3749 files changed, 299046 insertions, 73815 deletions
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..9b3aa8b
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1 @@
+BasedOnStyle: LLVM
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0a6c75..a68e7e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,7 +13,11 @@ set(CMAKE_MODULE_PATH
 set(LLVM_VERSION_MAJOR 3)
 set(LLVM_VERSION_MINOR 4)
 
-set(PACKAGE_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}svn")
+if (NOT PACKAGE_VERSION)
+  set(PACKAGE_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}svn")
+endif()
+
+option(LLVM_INSTALL_TOOLCHAIN_ONLY "Only include toolchain files in the 'install' target." OFF)
 
 option(LLVM_USE_FOLDERS "Enable solution folders in Visual Studio. Disable for Express versions." ON)
 if ( LLVM_USE_FOLDERS )
@@ -33,6 +37,25 @@ set(PACKAGE_NAME LLVM)
 set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
 set(PACKAGE_BUGREPORT "http://llvm.org/bugs/")
 
+# Configure CPack.
+set(CPACK_PACKAGE_INSTALL_DIRECTORY "LLVM")
+set(CPACK_PACKAGE_VENDOR "LLVM")
+set(CPACK_PACKAGE_VERSION_MAJOR ${LLVM_VERSION_MAJOR})
+set(CPACK_PACKAGE_VERSION_MINOR ${LLVM_VERSION_MINOR})
+set(CPACK_PACKAGE_VERSION ${PACKAGE_VERSION})
+set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.TXT")
+if(WIN32 AND NOT UNIX)
+  set(CPACK_PACKAGE_INSTALL_REGISTRY_KEY "LLVM")
+  set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}\\\\cmake\\\\nsis_logo.bmp")
+  set(CPACK_NSIS_MODIFY_PATH "ON")
+  set(CPACK_NSIS_ENABLE_UNINSTALL_BEFORE_INSTALL "ON")
+  set(CPACK_NSIS_EXTRA_INSTALL_COMMANDS
+    "ExecWait '$INSTDIR/tools/msbuild/install.bat'")
+  set(CPACK_NSIS_EXTRA_UNINSTALL_COMMANDS
+    "ExecWait '$INSTDIR/tools/msbuild/uninstall.bat'")
+endif()
+include(CPack)
+
 # Sanity check our source directory to make sure that we are not trying to
 # generate an in-tree build (unless on MSVC_IDE, where it is ok), and to make
 # sure that we don't have any stray generated files lying around in the tree
@@ -116,6 +139,11 @@ if(LLVM_ENABLE_BACKTRACES)
   set(ENABLE_BACKTRACES 1)
 endif()
 
+option(LLVM_ENABLE_CRASH_OVERRIDES "Enable crash overrides." ON)
+if(LLVM_ENABLE_CRASH_OVERRIDES)
+  set(ENABLE_CRASH_OVERRIDES 1)
+endif()
+
 option(LLVM_ENABLE_FFI "Use libffi to call external functions from the interpreter" OFF)
 set(FFI_LIBRARY_DIR "" CACHE PATH "Additional directory, where CMake should search for libffi.so")
 set(FFI_INCLUDE_DIR "" CACHE PATH "Additional directory, where CMake should search for ffi.h or ffi/ffi.h")
@@ -123,7 +151,7 @@ set(FFI_INCLUDE_DIR "" CACHE PATH "Additional directory, where CMake should sear
 set(LLVM_TARGET_ARCH "host"
   CACHE STRING "Set target to use for LLVM JIT or use \"host\" for automatic detection.")
 
-option(LLVM_ENABLE_CURSES "Use curses to detect terminal info if available." ON)
+option(LLVM_ENABLE_TERMINFO "Use terminfo database if available." ON)
 
 option(LLVM_ENABLE_THREADS "Use threads if available." ON)
 
@@ -209,18 +237,15 @@ if( WIN32 AND NOT CYGWIN )
 endif()
 
 # Define options to control the inclusion and default build behavior for
-# components which may not strictly be necessary (tools, runtime, examples, and
-# tests).
+# components which may not strictly be necessary (tools, examples, and tests).
 #
 # This is primarily to support building smaller or faster project files.
 option(LLVM_INCLUDE_TOOLS "Generate build targets for the LLVM tools." ON)
 option(LLVM_BUILD_TOOLS
   "Build the LLVM tools. If OFF, just generate build targets." ON)
 
-option(LLVM_INCLUDE_RUNTIME "Generate build targets for the LLVM runtimes" ON)
 option(LLVM_BUILD_RUNTIME
-  "Build the LLVM runtime libraries. If OFF, just generate build targets." ON)
-
+  "Build the LLVM runtime libraries." ON)
 option(LLVM_BUILD_EXAMPLES
   "Build the LLVM example programs. If OFF, just generate build targets." OFF)
 option(LLVM_INCLUDE_EXAMPLES "Generate build targets for the LLVM examples" ON)
@@ -229,6 +254,10 @@ option(LLVM_BUILD_TESTS
   "Build LLVM unit tests. If OFF, just generate build targets." OFF)
 option(LLVM_INCLUDE_TESTS "Generate build targets for the LLVM unit tests." ON)
 
+option (LLVM_BUILD_DOCS "Build the llvm documentation." OFF)
+option (LLVM_INCLUDE_DOCS "Generate build targets for llvm documentation." ON)
+option (LLVM_ENABLE_DOXYGEN "Use doxygen to generate llvm documentation." OFF)
+
 # All options referred to from HandleLLVMOptions have to be specified
 # BEFORE this include, otherwise options will not be correctly set on
 # first cmake run
@@ -243,7 +272,7 @@ set(TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}")
 include(HandleLLVMOptions)
 
 # Verify that we can find a Python 2 interpreter.  Python 3 is unsupported.
-set(Python_ADDITIONAL_VERSIONS 2.7 2.6 2.5 2.4)
+set(Python_ADDITIONAL_VERSIONS 2.7 2.6 2.5)
 include(FindPythonInterp)
 if( NOT PYTHONINTERP_FOUND )
   message(FATAL_ERROR
@@ -288,7 +317,7 @@ execute_process(
             --enable-optional-components "${LLVMOPTIONALCOMPONENTS}"
             --write-library-table ${LLVMCONFIGLIBRARYDEPENDENCIESINC}
             --write-cmake-fragment ${LLVMBUILDCMAKEFRAG}
-            ERROR_VARIABLE LLVMBUILDOUTPUT
+            OUTPUT_VARIABLE LLVMBUILDOUTPUT
             ERROR_VARIABLE LLVMBUILDERRORS
             OUTPUT_STRIP_TRAILING_WHITESPACE
             ERROR_STRIP_TRAILING_WHITESPACE
@@ -430,10 +459,6 @@ if( LLVM_INCLUDE_TOOLS )
   add_subdirectory(tools)
 endif()
 
-if( LLVM_INCLUDE_RUNTIME )
-  add_subdirectory(runtime)
-endif()
-
 if( LLVM_INCLUDE_EXAMPLES )
   add_subdirectory(examples)
 endif()
@@ -463,38 +488,36 @@ if( LLVM_INCLUDE_TESTS )
     )
 endif()
 
-add_subdirectory(cmake/modules)
-
-install(DIRECTORY include/
-  DESTINATION include
-  FILES_MATCHING
-  PATTERN "*.def"
-  PATTERN "*.h"
-  PATTERN "*.td"
-  PATTERN "*.inc"
-  PATTERN "LICENSE.TXT"
-  PATTERN ".svn" EXCLUDE
-  )
+if (LLVM_INCLUDE_DOCS)
+  add_subdirectory(docs)
+endif()
 
-install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/
-  DESTINATION include
-  FILES_MATCHING
-  PATTERN "*.def"
-  PATTERN "*.h"
-  PATTERN "*.gen"
-  PATTERN "*.inc"
-  # Exclude include/llvm/CMakeFiles/intrinsics_gen.dir, matched by "*.def"
-  PATTERN "CMakeFiles" EXCLUDE
-  PATTERN ".svn" EXCLUDE
-  )
+add_subdirectory(cmake/modules)
 
-# TODO: make and install documentation.
+if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+  install(DIRECTORY include/
+    DESTINATION include
+    FILES_MATCHING
+    PATTERN "*.def"
+    PATTERN "*.h"
+    PATTERN "*.td"
+    PATTERN "*.inc"
+    PATTERN "LICENSE.TXT"
+    PATTERN ".svn" EXCLUDE
+    )
 
-set(CPACK_PACKAGE_VENDOR "LLVM")
-set(CPACK_PACKAGE_VERSION_MAJOR ${LLVM_VERSION_MAJOR})
-set(CPACK_PACKAGE_VERSION_MINOR ${LLVM_VERSION_MINOR})
-add_version_info_from_vcs(CPACK_PACKAGE_VERSION_PATCH)
-include(CPack)
+  install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/
+    DESTINATION include
+    FILES_MATCHING
+    PATTERN "*.def"
+    PATTERN "*.h"
+    PATTERN "*.gen"
+    PATTERN "*.inc"
+    # Exclude include/llvm/CMakeFiles/intrinsics_gen.dir, matched by "*.def"
+    PATTERN "CMakeFiles" EXCLUDE
+    PATTERN ".svn" EXCLUDE
+    )
+endif()
 
 # Workaround for MSVS10 to avoid the Dialog Hell
 # FIXME: This could be removed with future version of CMake.
@@ -504,3 +527,4 @@ if(MSVC_VERSION EQUAL 1600)
     file(APPEND "${LLVM_SLN_FILENAME}" "\n# This should be regenerated!\n")
   endif()
 endif()
+
diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
index 369aba1..cffb91b 100644
--- a/CODE_OWNERS.TXT
+++ b/CODE_OWNERS.TXT
@@ -102,13 +102,17 @@ E: richard@xmos.com
 D: XCore Backend
 
 N: Chad Rosier
-E: mcrosier@apple.com
+E: mcrosier@codeaurora.org
 D: Fast-Isel
 
 N: Nadav Rotem
 E: nrotem@apple.com
 D: X86 Backend, Loop Vectorizer
 
+N: Daniel Sanders
+E: daniel.sanders@imgtec.com
+D: MIPS Backend (lib/Target/Mips/*)
+
 N: Richard Sandiford
 E: rsandifo@linux.vnet.ibm.com
 D: SystemZ Backend
@@ -139,5 +143,9 @@ E: atrick@apple.com
 D: IndVar Simplify, Loop Strength Reduction, Instruction Scheduling
 
 N: Bill Wendling
-E: wendling@apple.com
+E: isanbard@gmail.com
 D: libLTO, IR Linker
+
+N: Peter Zotov
+E: whitequark@whitequark.org
+D: OCaml bindings
diff --git a/CREDITS.TXT b/CREDITS.TXT
index 3fa187f..3c1dcb0 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -253,7 +253,8 @@ D: Release manager (1.7+)
 
 N: Sylvestre Ledru
 E: sylvestre@debian.org
-W: http://sylvesre.ledru.info/
+W: http://sylvestre.ledru.info/
+W: http://llvm.org/apt/
 D: Debian and Ubuntu packaging
 D: Continous integration with jenkins
 
@@ -365,7 +366,7 @@ I: arosenberg
 D: ARM calling conventions rewrite, hard float support
 
 N: Chad Rosier
-E: mcrosier@apple.com
+E: mcrosier@codeaurora.org
 D: ARM fast-isel improvements
 D: Performance monitoring
 
@@ -411,6 +412,11 @@ E: rspencer@reidspencer.com
 W: http://reidspencer.com/
 D: Lots of stuff, see: http://wiki.llvm.org/index.php/User:Reid
 
+N: Alp Toker
+E: alp@nuanti.com
+W: http://atoker.com/
+D: C++ frontend next generation standards implementation
+
 N: Craig Topper
 E: craig.topper@gmail.com
 D: X86 codegen and disassembler improvements. AVX2 support.
@@ -430,10 +436,10 @@ D: Thread Local Storage implementation
 
 N: Bill Wendling
 I: wendling
-E: wendling@apple.com
-D: Release manager
+E: isanbard@gmail.com
+D: Release manager, IR Linker, LTO
 D: Bunches of stuff
 
 N: Bob Wilson
 E: bob.wilson@acm.org
-D: Advanced SIMD (NEON) support in the ARM backend
+D: Advanced SIMD (NEON) support in the ARM backend.
diff --git a/LLVMBuild.txt b/LLVMBuild.txt
index e763fd2..9cee303 100644
--- a/LLVMBuild.txt
+++ b/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = bindings docs examples lib projects runtime tools utils
+subdirectories = bindings docs examples lib projects tools utils
 
 [component_0]
 type = Group
diff --git a/Makefile b/Makefile
index fee675d..e3672b7 100644
--- a/Makefile
+++ b/Makefile
@@ -15,7 +15,7 @@ LEVEL := .
 #   3. Build IR, which builds the Intrinsics.inc file used by libs.
 #   4. Build libs, which are needed by llvm-config.
 #   5. Build llvm-config, which determines inter-lib dependencies for tools.
-#   6. Build tools, runtime, docs.
+#   6. Build tools and docs.
 #
 # When cross-compiling, there are some things (tablegen) that need to
 # be build for the build system first.
@@ -31,7 +31,7 @@ ifeq ($(BUILD_DIRS_ONLY),1)
   OPTIONAL_DIRS := tools/clang/utils/TableGen
 else
   DIRS := lib/Support lib/TableGen utils lib/IR lib tools/llvm-shlib \
-          tools/llvm-config tools runtime docs unittests
+          tools/llvm-config tools docs unittests
   OPTIONAL_DIRS := projects bindings
 endif
 
@@ -52,17 +52,17 @@ ifneq ($(ENABLE_DOCS),1)
 endif
 
 ifeq ($(MAKECMDGOALS),libs-only)
-  DIRS := $(filter-out tools runtime docs, $(DIRS))
+  DIRS := $(filter-out tools docs, $(DIRS))
   OPTIONAL_DIRS :=
 endif
 
 ifeq ($(MAKECMDGOALS),install-libs)
-  DIRS := $(filter-out tools runtime docs, $(DIRS))
+  DIRS := $(filter-out tools docs, $(DIRS))
   OPTIONAL_DIRS := $(filter bindings, $(OPTIONAL_DIRS))
 endif
 
 ifeq ($(MAKECMDGOALS),tools-only)
-  DIRS := $(filter-out runtime docs, $(DIRS))
+  DIRS := $(filter-out docs, $(DIRS))
   OPTIONAL_DIRS :=
 endif
 
@@ -72,7 +72,7 @@ ifeq ($(MAKECMDGOALS),install-clang)
           tools/clang/tools/c-index-test \
           tools/clang/include/clang-c \
           tools/clang/runtime tools/clang/docs \
-          tools/lto runtime
+          tools/lto
   OPTIONAL_DIRS :=
   NO_INSTALL = 1
 endif
@@ -84,7 +84,7 @@ ifeq ($(MAKECMDGOALS),clang-only)
 endif
 
 ifeq ($(MAKECMDGOALS),unittests)
-  DIRS := $(filter-out tools runtime docs, $(DIRS)) utils unittests
+  DIRS := $(filter-out tools docs, $(DIRS)) utils unittests
   OPTIONAL_DIRS :=
 endif
 
diff --git a/Makefile.rules b/Makefile.rules
index 32b1ebc..68f6cf8 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -274,13 +274,9 @@ CPP.Defines :=
 ifeq ($(ENABLE_OPTIMIZED),1)
   BuildMode := Release
   # Don't use -fomit-frame-pointer on Darwin or FreeBSD.
-  ifneq ($(HOST_OS),FreeBSD)
-  ifneq ($(HOST_OS),GNU/kFreeBSD)
-  ifneq ($(HOST_OS),Darwin)
+  ifneq ($(HOST_OS), $(filter $(HOST_OS), Cygwin Darwin DragonFly FreeBSD GNU/kFreeBSD))
     OmitFramePointer := -fomit-frame-pointer
   endif
-  endif
-  endif
 
   CXX.Flags += $(OPTIMIZE_OPTION) $(OmitFramePointer)
   C.Flags   += $(OPTIMIZE_OPTION) $(OmitFramePointer)
@@ -554,9 +550,9 @@ ifeq ($(HOST_OS),Darwin)
   DARWIN_VERSION := `sw_vers -productVersion`
  endif
   # Strip a number like 10.4.7 to 10.4
-  DARWIN_VERSION := $(shell echo $(DARWIN_VERSION)| sed -E 's/(10.[0-9]).*/\1/')
+  DARWIN_VERSION := $(shell echo $(DARWIN_VERSION)| sed -E 's/(10.[0-9]+).*/\1/')
   # Get "4" out of 10.4 for later pieces in the makefile.
-  DARWIN_MAJVERS := $(shell echo $(DARWIN_VERSION)| sed -E 's/10.([0-9]).*/\1/')
+  DARWIN_MAJVERS := $(shell echo $(DARWIN_VERSION)| sed -E 's/10.([0-9]+).*/\1/')
 
   LoadableModuleOptions := -Wl,-flat_namespace -Wl,-undefined,suppress
   SharedLinkOptions := -dynamiclib
@@ -780,8 +776,10 @@ Sources += $(filter %.cpp %.c %.cc,$(BUILT_SOURCES))
 endif
 
 BaseNameSources := $(sort $(basename $(Sources)))
+SourceDirs := $(sort $(dir $(Sources)))
 
 ObjectsO  := $(BaseNameSources:%=$(ObjDir)/%.o)
+ObjectDirs := $(SourceDirs:%=$(ObjDir)/%)
 
 #----------------------------------------------------------
 # For Mingw MSYS bash and Python/w32:
@@ -818,10 +816,19 @@ $(DESTDIR)$(PROJ_bindir) $(DESTDIR)$(PROJ_libdir) $(DESTDIR)$(PROJ_includedir) $
 	$(Verb) $(MKDIR) $* > /dev/null
 	$(Verb) $(DOTDIR_TIMESTAMP_COMMAND) > $@
 
-.PRECIOUS: $(ObjDir)/.dir $(LibDir)/.dir $(ToolDir)/.dir $(ExmplDir)/.dir
+.PRECIOUS: $(LibDir)/.dir $(ToolDir)/.dir $(ExmplDir)/.dir
 .PRECIOUS: $(LLVMLibDir)/.dir $(LLVMToolDir)/.dir $(LLVMExmplDir)/.dir
 
 #---------------------------------------------------------
+# Collect the object directories (as there may be more
+# than one if the source code is spread across
+# subdirectories).
+#---------------------------------------------------------
+
+OBJECT_DIRS := $(ObjDir)/.dir $(ObjectDirs:%=%/.dir)
+.PRECIOUS: $(OBJECT_DIRS)
+
+#---------------------------------------------------------
 # Handle the DIRS options for sequential construction
 #---------------------------------------------------------
 
@@ -1294,7 +1301,7 @@ LD.Flags += -Wl,-exported_symbol,_main
 endif
 endif
 
-ifeq ($(HOST_OS), $(filter $(HOST_OS), Linux NetBSD FreeBSD GNU/kFreeBSD GNU))
+ifeq ($(HOST_OS), $(filter $(HOST_OS), DragonFly Linux NetBSD FreeBSD GNU/kFreeBSD GNU))
 ifneq ($(ARCH), Mips)
   LD.Flags += -Wl,--version-script=$(LLVM_SRC_ROOT)/autoconf/ExportMap.map
 endif
@@ -1436,6 +1443,8 @@ ifeq ($(HOST_OS),HP-UX)
   DISABLE_AUTO_DEPENDENCIES=1
 endif
 
+COMPILE_DEPS = $(OBJECT_DIRS) $(BUILT_SOURCES) $(PROJ_MAKEFILE)
+
 # Provide rule sets for when dependency generation is enabled
 ifndef DISABLE_AUTO_DEPENDENCIES
 
@@ -1451,27 +1460,27 @@ DEPEND_OPTIONS = -MMD -MP -MF "$(ObjDir)/$*.d.tmp" \
 DEPEND_MOVEFILE = then $(MV) -f "$(ObjDir)/$*.d.tmp" "$(ObjDir)/$*.d"; \
                   else $(RM) "$(ObjDir)/$*.d.tmp"; exit 1; fi
 
-$(ObjDir)/%.o: %.cpp $(ObjDir)/.dir $(BUILT_SOURCES) $(PROJ_MAKEFILE)
+$(ObjDir)/%.o: %.cpp $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.cpp for $(BuildMode) build" $(PIC_FLAG)
 	$(Verb) if $(Compile.CXX) $(DEPEND_OPTIONS) $< -o $(ObjDir)/$*.o ; \
 	        $(DEPEND_MOVEFILE)
 
-$(ObjDir)/%.o: %.mm $(ObjDir)/.dir $(BUILT_SOURCES) $(PROJ_MAKEFILE)
+$(ObjDir)/%.o: %.mm $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.mm for $(BuildMode) build" $(PIC_FLAG)
 	$(Verb) if $(Compile.CXX) $(DEPEND_OPTIONS) $< -o $(ObjDir)/$*.o ; \
 	        $(DEPEND_MOVEFILE)
 
-$(ObjDir)/%.o: %.cc $(ObjDir)/.dir $(BUILT_SOURCES) $(PROJ_MAKEFILE)
+$(ObjDir)/%.o: %.cc $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.cc for $(BuildMode) build" $(PIC_FLAG)
 	$(Verb) if $(Compile.CXX) $(DEPEND_OPTIONS) $< -o $(ObjDir)/$*.o ; \
 	        $(DEPEND_MOVEFILE)
 
-$(ObjDir)/%.o: %.c $(ObjDir)/.dir $(BUILT_SOURCES) $(PROJ_MAKEFILE)
+$(ObjDir)/%.o: %.c $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.c for $(BuildMode) build" $(PIC_FLAG)
 	$(Verb) if $(Compile.C) $(DEPEND_OPTIONS) $< -o $(ObjDir)/$*.o ; \
 	        $(DEPEND_MOVEFILE)
 
-$(ObjDir)/%.o: %.m $(ObjDir)/.dir $(BUILT_SOURCES) $(PROJ_MAKEFILE)
+$(ObjDir)/%.o: %.m $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.m for $(BuildMode) build" $(PIC_FLAG)
 	$(Verb) if $(Compile.C) $(DEPEND_OPTIONS) $< -o $(ObjDir)/$*.o ; \
 	        $(DEPEND_MOVEFILE)
@@ -1479,67 +1488,67 @@ $(ObjDir)/%.o: %.m $(ObjDir)/.dir $(BUILT_SOURCES) $(PROJ_MAKEFILE)
 # Provide alternate rule sets if dependencies are disabled
 else
 
-$(ObjDir)/%.o: %.cpp $(ObjDir)/.dir $(BUILT_SOURCES)
+$(ObjDir)/%.o: %.cpp $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.cpp for $(BuildMode) build" $(PIC_FLAG)
 	$(Compile.CXX) $< -o $@
 
-$(ObjDir)/%.o: %.mm $(ObjDir)/.dir $(BUILT_SOURCES)
+$(ObjDir)/%.o: %.mm $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.mm for $(BuildMode) build" $(PIC_FLAG)
 	$(Compile.CXX) $< -o $@
 
-$(ObjDir)/%.o: %.cc $(ObjDir)/.dir $(BUILT_SOURCES)
+$(ObjDir)/%.o: %.cc $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.cc for $(BuildMode) build" $(PIC_FLAG)
 	$(Compile.CXX) $< -o $@
 
-$(ObjDir)/%.o: %.c $(ObjDir)/.dir $(BUILT_SOURCES)
+$(ObjDir)/%.o: %.c $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.c for $(BuildMode) build" $(PIC_FLAG)
 	$(Compile.C) $< -o $@
 
-$(ObjDir)/%.o: %.m $(ObjDir)/.dir $(BUILT_SOURCES)
+$(ObjDir)/%.o: %.m $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.m for $(BuildMode) build" $(PIC_FLAG)
 	$(Compile.C) $< -o $@
 endif
 
 
 ## Rules for building preprocessed (.i/.ii) outputs.
-$(BuildMode)/%.ii: %.cpp $(ObjDir)/.dir $(BUILT_SOURCES)
+$(BuildMode)/%.ii: %.cpp $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.cpp for $(BuildMode) build to .ii file"
 	$(Verb) $(Preprocess.CXX) $< -o $@
 
-$(BuildMode)/%.ii: %.mm $(ObjDir)/.dir $(BUILT_SOURCES)
+$(BuildMode)/%.ii: %.mm $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.mm for $(BuildMode) build to .ii file"
 	$(Verb) $(Preprocess.CXX) $< -o $@
 
-$(BuildMode)/%.ii: %.cc $(ObjDir)/.dir $(BUILT_SOURCES)
+$(BuildMode)/%.ii: %.cc $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.cc for $(BuildMode) build to .ii file"
 	$(Verb) $(Preprocess.CXX) $< -o $@
 
-$(BuildMode)/%.i: %.c $(ObjDir)/.dir $(BUILT_SOURCES)
+$(BuildMode)/%.i: %.c $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.c for $(BuildMode) build to .i file"
 	$(Verb) $(Preprocess.C) $< -o $@
 
-$(BuildMode)/%.i: %.m $(ObjDir)/.dir $(BUILT_SOURCES)
+$(BuildMode)/%.i: %.m $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.m for $(BuildMode) build to .i file"
 	$(Verb) $(Preprocess.C) $< -o $@
 
 
-$(ObjDir)/%.s: %.cpp $(ObjDir)/.dir $(BUILT_SOURCES)
+$(ObjDir)/%.s: %.cpp $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.cpp to asm for $(BuildMode) build" $(PIC_FLAG)
 	$(Compile.CXX) $< -o $@ -S
 
-$(ObjDir)/%.s: %.mm $(ObjDir)/.dir $(BUILT_SOURCES)
+$(ObjDir)/%.s: %.mm $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.mm to asm for $(BuildMode) build" $(PIC_FLAG)
 	$(Compile.CXX) $< -o $@ -S
 
-$(ObjDir)/%.s: %.cc $(ObjDir)/.dir $(BUILT_SOURCES)
+$(ObjDir)/%.s: %.cc $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.cc to asm for $(BuildMode) build" $(PIC_FLAG)
 	$(Compile.CXX) $< -o $@ -S
 
-$(ObjDir)/%.s: %.c $(ObjDir)/.dir $(BUILT_SOURCES)
+$(ObjDir)/%.s: %.c $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.c to asm for $(BuildMode) build" $(PIC_FLAG)
 	$(Compile.C) $< -o $@ -S
 
-$(ObjDir)/%.s: %.m $(ObjDir)/.dir $(BUILT_SOURCES)
+$(ObjDir)/%.s: %.m $(COMPILE_DEPS)
 	$(Echo) "Compiling $*.m to asm for $(BuildMode) build" $(PIC_FLAG)
 	$(Compile.C) $< -o $@ -S
 
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 30481ea..a9d4915 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -31,7 +31,7 @@ dnl===
 dnl===-----------------------------------------------------------------------===
 dnl Initialize autoconf and define the package name, version number and
 dnl address for reporting bugs.
-AC_INIT([LLVM],[3.4svn],[http://llvm.org/bugs/])
+AC_INIT([LLVM],[3.4],[http://llvm.org/bugs/])
 AC_DEFINE([LLVM_VERSION_MAJOR], [3], [Major version of the LLVM API])
 AC_DEFINE([LLVM_VERSION_MINOR], [4], [Minor version of the LLVM API])
 
@@ -61,8 +61,8 @@ fi
 
 dnl Default to empty (i.e. assigning the null string to) CFLAGS and CXXFLAGS,
 dnl instead of the autoconf default (for example, '-g -O2' for CC=gcc).
-${CFLAGS=}
-${CXXFLAGS=}
+: ${CFLAGS=}
+: ${CXXFLAGS=}
 
 dnl We need to check for the compiler up here to avoid anything else
 dnl starting with a different one.
@@ -551,7 +551,12 @@ AC_ARG_ENABLE(clang-static-analyzer,
                              enableval="yes")
 case "$enableval" in
   yes) AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[1]) ;;
-  no)  AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[0]) ;;
+  no)  
+    if test ${clang_arcmt} != "no" ; then
+      AC_MSG_ERROR([Cannot enable clang ARC Migration Tool while disabling static analyzer.])
+    fi
+    AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[0]) 
+    ;;
   default) AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[1]);;
   *) AC_MSG_ERROR([Invalid setting for --enable-clang-static-analyzer. Use "yes" or "no"]) ;;
 esac
@@ -795,20 +800,37 @@ esac
 AC_DEFINE_UNQUOTED([ENABLE_TIMESTAMPS],$ENABLE_TIMESTAMPS,
                    [Define if timestamp information (e.g., __DATE__) is allowed])
 
-dnl Enable embedding timestamp information into build.
+dnl Enable support for showing backtraces.
+AC_ARG_ENABLE(backtraces, AS_HELP_STRING(
+  [--enable-backtraces],
+  [Enable embedding backtraces on crash (default is YES)]),
+  [case "$enableval" in
+    yes) llvm_cv_enable_backtraces="yes" ;;
+    no)  llvm_cv_enable_backtraces="no"  ;;
+    *) AC_MSG_ERROR([Invalid setting for --enable-backtraces. Use "yes" or "no"]) ;;
+  esac],
+  llvm_cv_enable_backtraces="yes")
+if test "$llvm_cv_enable_backtraces" = "yes" ; then
+  AC_DEFINE([ENABLE_BACKTRACES],[1],
+            [Define if you want backtraces on crash])
+fi
 
-AC_ARG_ENABLE(backtraces,
-  AS_HELP_STRING([--enable-backtraces],
-                 [Enable embedding backtraces on crash (default is YES)]),,
-                 enableval=default)
-case "$enableval" in
-  yes) AC_SUBST(ENABLE_BACKTRACES,[1]) ;;
-  no)  AC_SUBST(ENABLE_BACKTRACES,[0]) ;;
-  default) AC_SUBST(ENABLE_BACKTRACES,[1]) ;;
-  *) AC_MSG_ERROR([Invalid setting for --enable-backtraces. Use "yes" or "no"]) ;;
-esac
-AC_DEFINE_UNQUOTED([ENABLE_BACKTRACES],$ENABLE_BACKTRACES,
-                   [Define if you want backtraces on crash])
+dnl Enable installing platform specific signal handling overrides, for improved
+dnl CrashRecovery support or interaction with crash reporting software. This
+dnl support may be inappropriate for some clients embedding LLVM as a library.
+AC_ARG_ENABLE(crash-overrides, AS_HELP_STRING(
+  [--enable-crash-overrides],
+  [Enable crash handling overrides (default is YES)]),
+  [case "$enableval" in
+    yes) llvm_cv_enable_crash_overrides="yes" ;;
+    no)  llvm_cv_enable_crash_overrides="no"  ;;
+    *) AC_MSG_ERROR([Invalid setting for --enable-crash-overrides. Use "yes" or "no"]) ;;
+  esac],
+  llvm_cv_enable_crash_overrides="yes")
+if test "$llvm_cv_enable_crash_overrides" = "yes" ; then
+  AC_DEFINE([ENABLE_CRASH_OVERRIDES],[1],
+            [Define to enable crash handling overrides])
+fi
 
 dnl Allow specific targets to be specified for building (or not)
 TARGETS_TO_BUILD=""
@@ -1072,16 +1094,16 @@ AC_ARG_WITH(bug-report-url,
 AC_DEFINE_UNQUOTED(BUG_REPORT_URL,"$withval",
                    [Bug report URL.])
 
-dnl --enable-curses: check whether the user wants to control use of curses:
-AC_ARG_ENABLE(curses,AS_HELP_STRING(
-  [--enable-curses],
-  [Use curses for querying terminal infomation if available (default is YES)]),
+dnl --enable-terminfo: check whether the user wants to control use of terminfo:
+AC_ARG_ENABLE(terminfo,AS_HELP_STRING(
+  [--enable-terminfo],
+  [Query the terminfo database if available (default is YES)]),
   [case "$enableval" in
-    yes) llvm_cv_enable_curses="yes" ;;
-    no)  llvm_cv_enable_curses="no"  ;;
-    *) AC_MSG_ERROR([Invalid setting for --enable-curses. Use "yes" or "no"]) ;;
+    yes) llvm_cv_enable_terminfo="yes" ;;
+    no)  llvm_cv_enable_terminfo="no"  ;;
+    *) AC_MSG_ERROR([Invalid setting for --enable-terminfo. Use "yes" or "no"]) ;;
   esac],
-  llvm_cv_enable_curses="yes")
+  llvm_cv_enable_terminfo="yes")
 
 dnl --enable-libffi : check whether the user wants to turn off libffi:
 AC_ARG_ENABLE(libffi,AS_HELP_STRING(
@@ -1355,12 +1377,13 @@ else
 fi
 
 AC_MSG_CHECKING([for python >= 2.5])
-ac_python_version=`$PYTHON -c 'import sys; print sys.version.split()[[0]]'`
+ac_python_version=`$PYTHON -V 2>&1 | cut -d' ' -f2`
 ac_python_version_major=`echo $ac_python_version | cut -d'.' -f1`
 ac_python_version_minor=`echo $ac_python_version | cut -d'.' -f2`
 ac_python_version_patch=`echo $ac_python_version | cut -d'.' -f3`
-if   test "$ac_python_version_major" -eq "2" \
-   && test "$ac_python_version_minor" -ge "5" ; then
+if test "$ac_python_version_major" -gt "2" || \
+   (test "$ac_python_version_major" -eq "2" && \
+    test "$ac_python_version_minor" -ge "5") ; then
   AC_MSG_RESULT([$PYTHON ($ac_python_version)])
 else
   AC_MSG_RESULT([not found])
@@ -1377,6 +1400,7 @@ AC_CHECK_LIB(m,sin)
 if test "$llvm_cv_os_type" = "MingW" ; then
   AC_CHECK_LIB(imagehlp, main)
   AC_CHECK_LIB(psapi, main)
+  AC_CHECK_LIB(shell32, main)
 fi
 
 dnl dlopen() is required for plugin support.
@@ -1390,11 +1414,11 @@ dnl right libraries to link with.
 AC_SEARCH_LIBS(clock_gettime,rt)
 
 dnl The curses library is optional; used for querying terminal info
-if test "$llvm_cv_enable_curses" = "yes" ; then
+if test "$llvm_cv_enable_terminfo" = "yes" ; then
   dnl We need the has_color functionality in curses for it to be useful.
-  AC_SEARCH_LIBS(has_colors,curses ncurses ncursesw,
-                 AC_DEFINE([HAVE_CURSES],[1],
-                           [Define if curses provides the has_color() function on this platform.]))
+  AC_SEARCH_LIBS(setupterm,tinfo terminfo curses ncurses ncursesw,
+                 AC_DEFINE([HAVE_TERMINFO],[1],
+                           [Define if the setupterm() function is supported this platform.]))
 fi
 
 dnl libffi is optional; used to call external functions from the interpreter
@@ -1573,11 +1597,6 @@ else
   AC_SUBST(HAVE_LIBZ, 0)
 fi
 
-dnl Try to find a suitable curses header.
-if test "$llvm_cv_enable_curses" = "yes" ; then
-  AC_CHECK_HEADERS([curses.h ncurses.h ncursesw.h ncurses/curses.h ncursesw/curses.h])
-fi
-
 dnl Try to find ffi.h.
 if test "$llvm_cv_enable_libffi" = "yes" ; then
   AC_CHECK_HEADERS([ffi.h ffi/ffi.h])
@@ -1950,7 +1969,6 @@ AC_CONFIG_MAKEFILE(Makefile)
 AC_CONFIG_MAKEFILE(Makefile.common)
 AC_CONFIG_MAKEFILE(examples/Makefile)
 AC_CONFIG_MAKEFILE(lib/Makefile)
-AC_CONFIG_MAKEFILE(runtime/Makefile)
 AC_CONFIG_MAKEFILE(test/Makefile)
 AC_CONFIG_MAKEFILE(test/Makefile.tests)
 AC_CONFIG_MAKEFILE(unittests/Makefile)
diff --git a/autoconf/m4/ltdl.m4 b/autoconf/m4/ltdl.m4
index 0226f63..b3302fa 100644
--- a/autoconf/m4/ltdl.m4
+++ b/autoconf/m4/ltdl.m4
@@ -379,3 +379,19 @@ if test x"$libltdl_cv_need_uscore" = xyes; then
     [Define if dlsym() requires a leading underscore in symbol names.])
 fi
 ])# AC_LTDL_DLSYM_USCORE
+
+# AC_LTDL_FUNC_ARGZ
+# -----------------
+AC_DEFUN([AC_LTDL_FUNC_ARGZ],
+[AC_CHECK_HEADERS([argz.h])
+
+AC_CHECK_TYPES([error_t],
+  [],
+  [AC_DEFINE([error_t], [int],
+    [Define to a type to use for `error_t' if it is not otherwise available.])],
+  [#if HAVE_ARGZ_H
+#  include <argz.h>
+#endif])
+
+AC_CHECK_FUNCS([argz_append argz_create_sep argz_insert argz_next argz_stringify])
+])# AC_LTDL_FUNC_ARGZ
diff --git a/bindings/ocaml/Makefile b/bindings/ocaml/Makefile
index a89caef..44562fe8 100644
--- a/bindings/ocaml/Makefile
+++ b/bindings/ocaml/Makefile
@@ -8,7 +8,8 @@
 ##===----------------------------------------------------------------------===##
 
 LEVEL := ../..
-DIRS = llvm bitreader bitwriter analysis target executionengine transforms
+DIRS = llvm bitreader bitwriter irreader analysis target executionengine \
+       transforms linker backends
 ExtraMakefiles = $(PROJ_OBJ_DIR)/Makefile.ocaml
 
 ocamldoc:
diff --git a/bindings/ocaml/Makefile.ocaml b/bindings/ocaml/Makefile.ocaml
index a2a8b02..f8ed841 100644
--- a/bindings/ocaml/Makefile.ocaml
+++ b/bindings/ocaml/Makefile.ocaml
@@ -1,4 +1,4 @@
-##===- tools/ml/Makefile -----------------------------------*- Makefile -*-===##
+##===- bindings/ocaml/Makefile.ocaml -----------------------*- Makefile -*-===##
 # 
 #                     The LLVM Compiler Infrastructure
 #
@@ -7,10 +7,10 @@
 # 
 ##===----------------------------------------------------------------------===##
 # 
-# An ocaml library is a unique project type in the context of LLVM, so rules are
+# An OCaml library is a unique project type in the context of LLVM, so rules are
 # here rather than in Makefile.rules.
 # 
-# Reference materials on installing ocaml libraries:
+# Reference materials on installing OCaml libraries:
 # 
 #   https://fedoraproject.org/wiki/Packaging/OCaml
 #   http://pkg-ocaml-maint.alioth.debian.org/ocaml_packaging_policy.txt
@@ -23,6 +23,10 @@ include $(LEVEL)/Makefile.config
 CXX.Flags += -I"$(shell $(OCAMLC) -where)"
 C.Flags += -I"$(shell $(OCAMLC) -where)"
 
+ifeq ($(ENABLE_SHARED),1)
+LINK_COMPONENTS := all
+endif
+
 include $(LEVEL)/Makefile.common
 
 # Intentionally ignore PROJ_prefix here. We want the ocaml stdlib. However, the
@@ -38,6 +42,18 @@ UsedLibNames = $(shell $(LLVM_CONFIG) --libnames $(UsedComponents))
 endif
 endif
 
+# How do we link OCaml executables with LLVM?
+# 1) If this is a --enable-shared build, build stub libraries. This also allows
+#    to use LLVM from toplevels.
+# 2) If this is a --disable-shared build, embed ocamlc options for building
+#    a custom runtime and a static executable. It is not possible to use LLVM
+#    from toplevels.
+ifneq ($(ObjectsO),)
+ifeq ($(ENABLE_SHARED),1)
+OCAMLSTUBS := 1
+endif
+endif
+
 # Tools
 OCAMLCFLAGS += -I $(ObjDir) -I $(OcamlDir)
 ifndef IS_CLEANING_TARGET
@@ -60,14 +76,36 @@ endif
 
 Compile.CMI  := $(strip $(OCAMLC) -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
 Compile.CMO  := $(strip $(OCAMLC) -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
+Compile.CMX  := $(strip $(OCAMLOPT) -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
+
+ifdef OCAMLSTUBS
+# Avoid the need for LD_LIBRARY_PATH
+ifneq ($(HOST_OS), $(filter $(HOST_OS), Cygwin MingW))
+ifneq ($(HOST_OS),Darwin)
+OCAMLRPATH   := $(RPATH) -Wl,'$(SharedLibDir)'
+endif
+endif
+endif
+
+ifdef OCAMLSTUBS
+Archive.CMA  := $(strip $(OCAMLC) -a -dllib -l$(LIBRARYNAME) $(OCAMLDEBUGFLAG) \
+                                  -o)
+else
 Archive.CMA  := $(strip $(OCAMLC) -a -custom $(OCAMLAFLAGS) $(OCAMLDEBUGFLAG) \
                                   -o)
+endif
 
-Compile.CMX  := $(strip $(OCAMLOPT) -c $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG) -o)
+ifdef OCAMLSTUBS
+Archive.CMXA := $(strip $(OCAMLOPT) -a $(patsubst %,-cclib %, \
+                                    $(LLVMLibsOptions) -l$(LIBRARYNAME) \
+                                    -L$(SharedLibDir) $(OCAMLRPATH)) \
+                                    $(OCAMLDEBUGFLAG) -o)
+else
 Archive.CMXA := $(strip $(OCAMLOPT) -a $(OCAMLAFLAGS) $(OCAMLDEBUGFLAG) -o)
+endif
 
 ifdef OCAMLOPT
-Archive.EXE := $(strip $(OCAMLOPT) -cc $(CXX) $(OCAMLCFLAGS) $(UsedOcamLibs:%=%.cmxa) $(OCAMLDEBUGFLAG) -o)
+Archive.EXE := $(strip $(OCAMLOPT) -cc $(CXX) $(OCAMLCFLAGS) $(UsedOcamlLibs:%=%.cmxa) $(OCAMLDEBUGFLAG) -o)
 else
 Archive.EXE := $(strip $(OCAMLC) -cc $(CXX) $(OCAMLCFLAGS) $(OCAMLDEBUGFLAG:%=%.cma) -o)
 endif
@@ -113,6 +151,10 @@ OutputCMA  := $(LibraryCMA:$(ObjDir)/%.cma=$(OcamlDir)/%.cma)
 OutputCMXA := $(LibraryCMXA:$(ObjDir)/%.cmxa=$(OcamlDir)/%.cmxa)
 endif
 
+ifdef OCAMLSTUBS
+SharedLib := $(OcamlDir)/dll$(LIBRARYNAME)$(SHLIBEXT)
+endif
+
 ifdef TOOLNAME
 ifdef EXAMPLE_TOOL
 OutputEXE := $(ExmplDir)/$(strip $(TOOLNAME))$(EXEEXT)
@@ -130,6 +172,10 @@ DestCMA  := $(PROJ_libocamldir)/$(LIBRARYNAME).cma
 DestCMXA := $(PROJ_libocamldir)/$(LIBRARYNAME).cmxa
 endif
 
+ifdef OCAMLSTUBS
+DestSharedLib := $(PROJ_libocamldir)/dll$(LIBRARYNAME)$(SHLIBEXT)
+endif
+
 ##===- Dependencies -------------------------------------------------------===##
 # Copy the sources into the intermediate directory because older ocamlc doesn't
 # support -o except when linking (outputs are placed next to inputs).
@@ -187,6 +233,34 @@ uninstall-a::
 endif
 
 
+##===- Build stub library from C sources ----------------------------------===##
+
+ifdef SharedLib
+all-local:: $(SharedLib)
+clean-local:: clean-shared
+install-local:: install-shared
+uninstall-local:: uninstall-shared
+
+$(SharedLib): $(ObjectsO) $(OcamlDir)/.dir
+	$(Echo) "Building $(BuildMode) $(notdir $@)"
+	$(Verb) $(Link) $(SharedLinkOptions) $(OCAMLRPATH) $(LLVMLibsOptions) \
+			-o $@ $(ObjectsO)
+
+clean-shared::
+	-$(Verb) $(RM) -f $(SharedLib)
+
+install-shared:: $(SharedLib)
+	$(Echo) "Installing $(BuildMode) $(DestSharedLib)"
+	$(Verb) $(MKDIR) $(PROJ_libocamldir)
+	$(Verb) $(INSTALL) $(SharedLib) $(DestSharedLib)
+	$(Verb)
+
+uninstall-shared::
+	$(Echo) "Uninstalling $(DestSharedLib)"
+	-$(Verb) $(RM) -f $(DestSharedLib)
+endif
+
+
 ##===- Deposit dependent libraries adjacent to Ocaml libs -----------------===##
 
 all-local:: build-deplibs
@@ -391,6 +465,7 @@ printcamlvars::
 	$(Echo) "CAML_LIBDIR  : " '$(CAML_LIBDIR)'
 	$(Echo) "LibraryCMA   : " '$(LibraryCMA)'
 	$(Echo) "LibraryCMXA  : " '$(LibraryCMXA)'
+	$(Echo) "SharedLib    : " '$(SharedLib)'
 	$(Echo) "OcamlSources1: " '$(OcamlSources1)'
 	$(Echo) "OcamlSources2: " '$(OcamlSources2)'
 	$(Echo) "OcamlSources : " '$(OcamlSources)'
@@ -404,6 +479,7 @@ printcamlvars::
 	$(Echo) "DestA        : " '$(DestA)'
 	$(Echo) "DestCMA      : " '$(DestCMA)'
 	$(Echo) "DestCMXA     : " '$(DestCMXA)'
+	$(Echo) "DestSharedLib: " '$(DestSharedLib)'
 	$(Echo) "UsedLibs     : " '$(UsedLibs)'
 	$(Echo) "UsedLibNames : " '$(UsedLibNames)'
 
diff --git a/bindings/ocaml/analysis/analysis_ocaml.c b/bindings/ocaml/analysis/analysis_ocaml.c
index 9716705..91be2d3 100644
--- a/bindings/ocaml/analysis/analysis_ocaml.c
+++ b/bindings/ocaml/analysis/analysis_ocaml.c
@@ -1,4 +1,4 @@
-/*===-- analysis_ocaml.c - LLVM Ocaml Glue ----------------------*- C++ -*-===*\
+/*===-- analysis_ocaml.c - LLVM OCaml Glue ----------------------*- C++ -*-===*\
 |*                                                                            *|
 |*                     The LLVM Compiler Infrastructure                       *|
 |*                                                                            *|
@@ -7,7 +7,7 @@
 |*                                                                            *|
 |*===----------------------------------------------------------------------===*|
 |*                                                                            *|
-|* This file glues LLVM's ocaml interface to its C interface. These functions *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
 |* are by and large transparent wrappers to the corresponding C functions.    *|
 |*                                                                            *|
 |* Note that these functions intentionally take liberties with the CAMLparamX *|
diff --git a/bindings/ocaml/analysis/llvm_analysis.ml b/bindings/ocaml/analysis/llvm_analysis.ml
index fc4d203..21088ab 100644
--- a/bindings/ocaml/analysis/llvm_analysis.ml
+++ b/bindings/ocaml/analysis/llvm_analysis.ml
@@ -1,4 +1,4 @@
-(*===-- llvm_analysis.ml - LLVM Ocaml Interface -----------------*- C++ -*-===*
+(*===-- llvm_analysis.ml - LLVM OCaml Interface -----------------*- C++ -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
diff --git a/bindings/ocaml/analysis/llvm_analysis.mli b/bindings/ocaml/analysis/llvm_analysis.mli
index 793f482..1a0af02 100644
--- a/bindings/ocaml/analysis/llvm_analysis.mli
+++ b/bindings/ocaml/analysis/llvm_analysis.mli
@@ -1,4 +1,4 @@
-(*===-- llvm_analysis.mli - LLVM Ocaml Interface ----------------*- C++ -*-===*
+(*===-- llvm_analysis.mli - LLVM OCaml Interface ----------------*- C++ -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -9,7 +9,7 @@
 
 (** Intermediate representation analysis.
 
-    This interface provides an ocaml API for LLVM IR analyses, the classes in
+    This interface provides an OCaml API for LLVM IR analyses, the classes in
     the Analysis library. *)
 
 (** [verify_module m] returns [None] if the module [m] is valid, and
diff --git a/bindings/ocaml/backends/META.llvm_backend.in b/bindings/ocaml/backends/META.llvm_backend.in
new file mode 100644
index 0000000..0d4a6d6
--- /dev/null
+++ b/bindings/ocaml/backends/META.llvm_backend.in
@@ -0,0 +1,8 @@
+name = "llvm_@TARGET@"
+version = "@PACKAGE_VERSION@"
+description = "@TARGET@ Backend for LLVM"
+requires = "llvm"
+archive(byte) = "llvm_@TARGET@.cma"
+archive(native) = "llvm_@TARGET@.cmxa"
+directory = "."
+linkopts = "-ccopt -lstdc++"
+\ No newline at end of file
diff --git a/bindings/ocaml/backends/Makefile b/bindings/ocaml/backends/Makefile
new file mode 100644
index 0000000..ff39212
--- /dev/null
+++ b/bindings/ocaml/backends/Makefile
@@ -0,0 +1,61 @@
+##===- bindings/ocaml/backends/Makefile --------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# This is the master makefile for backend-specific bindings. It works by
+# creating a stub makefile for each configured target, e.g. Makefile.ARM, and
+# invoking it to compile the corresponding library, e.g. Llvm_ARM.
+#
+# This scheme allows to keep changes to Makefile.ocaml minimal.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../../..
+ExtraMakefiles = $(PROJ_OBJ_DIR)/Makefile.common
+
+include $(LEVEL)/Makefile.config
+include $(LEVEL)/Makefile.common
+
+all-local:: all-backends
+clean-local:: clean-backends
+install-local:: install-backends
+uninstall-local:: uninstall-backends
+
+stubs:
+	$(Verb) for i in $(TARGETS_TO_BUILD); do \
+		$(ECHO) "TARGET := $$i"           >  Makefile.$$i; \
+		$(ECHO) "include Makefile.common" >> Makefile.$$i; \
+	done
+
+all-backends: stubs
+	$(Verb) for i in $(TARGETS_TO_BUILD); do \
+		$(MAKE) -f Makefile.$$i all; \
+	done
+
+clean-backends: stubs
+	$(Verb) for i in $(TARGETS_TO_BUILD); do \
+		$(MAKE) -f Makefile.$$i clean; \
+		$(RM) -f Makefile.$$i; \
+	done
+
+install-backends: stubs
+	$(Verb) for i in $(TARGETS_TO_BUILD); do \
+		$(MAKE) -f Makefile.$$i install; \
+	done
+
+uninstall-backends: stubs
+	$(Verb) for i in $(TARGETS_TO_BUILD); do \
+		$(MAKE) -f Makefile.$$i uninstall; \
+	done
+
+ocamldoc: stubs
+	$(Verb) for i in $(TARGETS_TO_BUILD); do \
+		$(MAKE) -f Makefile.$$i ocamldoc; \
+	done
+
+.PHONY: all-backends clean-backends install-backends uninstall-backends ocamldoc
diff --git a/bindings/ocaml/backends/Makefile.common b/bindings/ocaml/backends/Makefile.common
new file mode 100644
index 0000000..be65dd0
--- /dev/null
+++ b/bindings/ocaml/backends/Makefile.common
@@ -0,0 +1,65 @@
+##===- bindings/ocaml/backends/Makefile.common -------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# This is the slave makefile for backend-specific bindings. This makefile should
+# be included after defining TARGET. It will then substitute @TARGET@ for
+# the value of TARGET in various *.in files and build an OCaml library in
+# a regular way.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../../..
+LIBRARYNAME := llvm_$(TARGET)
+UsedComponents := $(TARGET)
+UsedOcamlInterfaces := llvm
+
+include $(LEVEL)/Makefile.config
+
+SOURCES       := $(TARGET)_ocaml.c
+OcamlHeaders1 := $(PROJ_SRC_DIR)/llvm_$(TARGET).mli
+OcamlSources1 := $(PROJ_SRC_DIR)/llvm_$(TARGET).ml
+
+include ../Makefile.ocaml
+
+$(ObjDir)/llvm_$(TARGET).ml: $(PROJ_SRC_DIR)/llvm_backend.ml.in $(ObjDir)/.dir
+	$(Verb) $(SED) -e 's/@TARGET@/$(TARGET)/' $< > $@
+
+$(ObjDir)/llvm_$(TARGET).mli: $(PROJ_SRC_DIR)/llvm_backend.mli.in $(ObjDir)/.dir
+	$(Verb) $(SED) -e 's/@TARGET@/$(TARGET)/' $< > $@
+
+$(ObjDir)/$(TARGET)_ocaml.o: $(PROJ_SRC_DIR)/backend_ocaml.c $(ObjDir)/.dir
+	$(Echo) "Compiling $*.c for $(BuildMode) build" $(PIC_FLAG)
+	$(Verb) $(Compile.C) -DTARGET=$(TARGET) $< -o $@
+
+
+##===- OCamlFind Package --------------------------------------------------===##
+
+all-local:: copy-meta
+install-local:: install-meta
+uninstall-local:: uninstall-meta
+
+DestMETA := $(PROJ_libocamldir)/META.llvm_$(TARGET)
+
+# Easy way of generating META in the objdir
+copy-meta: $(OcamlDir)/META.llvm_$(TARGET)
+
+$(OcamlDir)/META.llvm_$(TARGET): META.llvm_backend.in
+	$(Verb) $(SED) -e 's/@TARGET@/$(TARGET)/' \
+								 -e 's/@PACKAGE_VERSION@/$(LLVMVersion)/' $< > $@
+
+install-meta:: $(OcamlDir)/META.llvm_$(TARGET)
+	$(Echo) "Install $(BuildMode) $(DestMETA)"
+	$(Verb) $(MKDIR) $(PROJ_libocamldir)
+	$(Verb) $(DataInstall) $< "$(DestMETA)"
+
+uninstall-meta::
+	$(Echo) "Uninstalling $(DestMETA)"
+	-$(Verb) $(RM) -f "$(DestMETA)"
+
+.PHONY: copy-meta install-meta uninstall-meta
diff --git a/bindings/ocaml/backends/backend_ocaml.c b/bindings/ocaml/backends/backend_ocaml.c
new file mode 100644
index 0000000..2d4ba85
--- /dev/null
+++ b/bindings/ocaml/backends/backend_ocaml.c
@@ -0,0 +1,37 @@
+/*===-- backend_ocaml.c - LLVM OCaml Glue -----------------------*- C++ -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
+|* are by and large transparent wrappers to the corresponding C functions.    *|
+|*                                                                            *|
+|* Note that these functions intentionally take liberties with the CAMLparamX *|
+|* macros, since most of the parameters are not GC heap objects.              *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c/Target.h"
+#include "caml/alloc.h"
+#include "caml/memory.h"
+
+// TODO: Figure out how to call these only for targets which support them.
+// LLVMInitialize ## target ## AsmPrinter();
+// LLVMInitialize ## target ## AsmParser();
+// LLVMInitialize ## target ## Disassembler();
+
+#define INITIALIZER1(target) \
+  CAMLprim value llvm_initialize_ ## target(value Unit) {  \
+    LLVMInitialize ## target ## TargetInfo();              \
+    LLVMInitialize ## target ## Target();                  \
+    LLVMInitialize ## target ## TargetMC();                \
+    return Val_unit;                                       \
+  }
+
+#define INITIALIZER(target) INITIALIZER1(target)
+
+INITIALIZER(TARGET)
diff --git a/bindings/ocaml/backends/llvm_backend.ml.in b/bindings/ocaml/backends/llvm_backend.ml.in
new file mode 100644
index 0000000..bd1e586
--- /dev/null
+++ b/bindings/ocaml/backends/llvm_backend.ml.in
@@ -0,0 +1,10 @@
+(*===-- llvm_backend.ml.in - LLVM OCaml Interface -------------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+external initialize : unit -> unit = "llvm_initialize_@TARGET@"
diff --git a/bindings/ocaml/backends/llvm_backend.mli.in b/bindings/ocaml/backends/llvm_backend.mli.in
new file mode 100644
index 0000000..9506789
--- /dev/null
+++ b/bindings/ocaml/backends/llvm_backend.mli.in
@@ -0,0 +1,19 @@
+(*===-- llvm_backend.mli.in - LLVM OCaml Interface ------------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+(** @TARGET@ Initialization.
+
+    This interface provides an OCaml API for initialization of
+    the @TARGET@ LLVM target. By referencing this module, you will cause
+    OCaml to load or link in the LLVM libraries corresponding to the target.
+    By calling [initialize], you will register components of this target
+    in the target registry, which is necessary in order to emit assembly,
+    object files, and so on. *)
+
+external initialize : unit -> unit = "llvm_initialize_@TARGET@"
+\ No newline at end of file
diff --git a/bindings/ocaml/bitreader/bitreader_ocaml.c b/bindings/ocaml/bitreader/bitreader_ocaml.c
index ef72ce2..0264e73 100644
--- a/bindings/ocaml/bitreader/bitreader_ocaml.c
+++ b/bindings/ocaml/bitreader/bitreader_ocaml.c
@@ -1,4 +1,4 @@
-/*===-- bitwriter_ocaml.c - LLVM Ocaml Glue ---------------------*- C++ -*-===*\
+/*===-- bitwriter_ocaml.c - LLVM OCaml Glue ---------------------*- C++ -*-===*\
 |*                                                                            *|
 |*                     The LLVM Compiler Infrastructure                       *|
 |*                                                                            *|
@@ -7,7 +7,7 @@
 |*                                                                            *|
 |*===----------------------------------------------------------------------===*|
 |*                                                                            *|
-|* This file glues LLVM's ocaml interface to its C interface. These functions *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
 |* are by and large transparent wrappers to the corresponding C functions.    *|
 |*                                                                            *|
 \*===----------------------------------------------------------------------===*/
diff --git a/bindings/ocaml/bitreader/llvm_bitreader.ml b/bindings/ocaml/bitreader/llvm_bitreader.ml
index 8b9d01d..865208c 100644
--- a/bindings/ocaml/bitreader/llvm_bitreader.ml
+++ b/bindings/ocaml/bitreader/llvm_bitreader.ml
@@ -1,4 +1,4 @@
-(*===-- llvm_bitreader.ml - LLVM Ocaml Interface ----------------*- C++ -*-===*
+(*===-- llvm_bitreader.ml - LLVM OCaml Interface ----------------*- C++ -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
diff --git a/bindings/ocaml/bitreader/llvm_bitreader.mli b/bindings/ocaml/bitreader/llvm_bitreader.mli
index 573de5e..ff377b9 100644
--- a/bindings/ocaml/bitreader/llvm_bitreader.mli
+++ b/bindings/ocaml/bitreader/llvm_bitreader.mli
@@ -1,4 +1,4 @@
-(*===-- llvm_bitreader.mli - LLVM Ocaml Interface ---------------*- C++ -*-===*
+(*===-- llvm_bitreader.mli - LLVM OCaml Interface ---------------*- C++ -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -9,7 +9,7 @@
 
 (** Bitcode reader.
 
-    This interface provides an ocaml API for the LLVM bitcode reader, the
+    This interface provides an OCaml API for the LLVM bitcode reader, the
     classes in the Bitreader library. *)
 
 exception Error of string
diff --git a/bindings/ocaml/bitwriter/bitwriter_ocaml.c b/bindings/ocaml/bitwriter/bitwriter_ocaml.c
index 53c93cb..a47f700 100644
--- a/bindings/ocaml/bitwriter/bitwriter_ocaml.c
+++ b/bindings/ocaml/bitwriter/bitwriter_ocaml.c
@@ -1,4 +1,4 @@
-/*===-- bitwriter_ocaml.c - LLVM Ocaml Glue ---------------------*- C++ -*-===*\
+/*===-- bitwriter_ocaml.c - LLVM OCaml Glue ---------------------*- C++ -*-===*\
 |*                                                                            *|
 |*                     The LLVM Compiler Infrastructure                       *|
 |*                                                                            *|
@@ -7,7 +7,7 @@
 |*                                                                            *|
 |*===----------------------------------------------------------------------===*|
 |*                                                                            *|
-|* This file glues LLVM's ocaml interface to its C interface. These functions *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
 |* are by and large transparent wrappers to the corresponding C functions.    *|
 |*                                                                            *|
 |* Note that these functions intentionally take liberties with the CAMLparamX *|
diff --git a/bindings/ocaml/bitwriter/llvm_bitwriter.ml b/bindings/ocaml/bitwriter/llvm_bitwriter.ml
index 3e69a3c..fac8553 100644
--- a/bindings/ocaml/bitwriter/llvm_bitwriter.ml
+++ b/bindings/ocaml/bitwriter/llvm_bitwriter.ml
@@ -1,4 +1,4 @@
-(*===-- llvm_bitwriter.ml - LLVM Ocaml Interface ----------------*- C++ -*-===*
+(*===-- llvm_bitwriter.ml - LLVM OCaml Interface ----------------*- C++ -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -7,7 +7,7 @@
  *
  *===----------------------------------------------------------------------===
  *
- * This interface provides an ocaml API for the LLVM intermediate
+ * This interface provides an OCaml API for the LLVM intermediate
  * representation, the classes in the VMCore library.
  *
  *===----------------------------------------------------------------------===*)
diff --git a/bindings/ocaml/bitwriter/llvm_bitwriter.mli b/bindings/ocaml/bitwriter/llvm_bitwriter.mli
index ea9a876..bb3e3b8 100644
--- a/bindings/ocaml/bitwriter/llvm_bitwriter.mli
+++ b/bindings/ocaml/bitwriter/llvm_bitwriter.mli
@@ -1,4 +1,4 @@
-(*===-- llvm_bitwriter.mli - LLVM Ocaml Interface ---------------*- C++ -*-===*
+(*===-- llvm_bitwriter.mli - LLVM OCaml Interface ---------------*- C++ -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -9,7 +9,7 @@
 
 (** Bitcode writer.
 
-    This interface provides an ocaml API for the LLVM bitcode writer, the
+    This interface provides an OCaml API for the LLVM bitcode writer, the
     classes in the Bitwriter library. *)
 
 (** [write_bitcode_file m path] writes the bitcode for module [m] to the file at
diff --git a/bindings/ocaml/executionengine/executionengine_ocaml.c b/bindings/ocaml/executionengine/executionengine_ocaml.c
index 02e0306..4b44a91 100644
--- a/bindings/ocaml/executionengine/executionengine_ocaml.c
+++ b/bindings/ocaml/executionengine/executionengine_ocaml.c
@@ -1,4 +1,4 @@
-/*===-- executionengine_ocaml.c - LLVM Ocaml Glue ---------------*- C++ -*-===*\
+/*===-- executionengine_ocaml.c - LLVM OCaml Glue ---------------*- C++ -*-===*\
 |*                                                                            *|
 |*                     The LLVM Compiler Infrastructure                       *|
 |*                                                                            *|
@@ -7,7 +7,7 @@
 |*                                                                            *|
 |*===----------------------------------------------------------------------===*|
 |*                                                                            *|
-|* This file glues LLVM's ocaml interface to its C interface. These functions *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
 |* are by and large transparent wrappers to the corresponding C functions.    *|
 |*                                                                            *|
 |* Note that these functions intentionally take liberties with the CAMLparamX *|
@@ -324,3 +324,18 @@ CAMLprim value llvm_ee_free_machine_code(LLVMValueRef F,
   return Val_unit;
 }
 
+extern value llvm_alloc_data_layout(LLVMTargetDataRef TargetData);
+
+/* ExecutionEngine.t -> Llvm_target.DataLayout.t */
+CAMLprim value llvm_ee_get_data_layout(LLVMExecutionEngineRef EE) {
+  value DataLayout;
+  LLVMTargetDataRef OrigDataLayout;
+  OrigDataLayout = LLVMGetExecutionEngineTargetData(EE);
+
+  char* TargetDataCStr;
+  TargetDataCStr = LLVMCopyStringRepOfTargetData(OrigDataLayout);
+  DataLayout = llvm_alloc_data_layout(LLVMCreateTargetData(TargetDataCStr));
+  LLVMDisposeMessage(TargetDataCStr);
+
+  return DataLayout;
+}
diff --git a/bindings/ocaml/executionengine/llvm_executionengine.ml b/bindings/ocaml/executionengine/llvm_executionengine.ml
index ddb53bb..a738df7 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.ml
+++ b/bindings/ocaml/executionengine/llvm_executionengine.ml
@@ -1,4 +1,4 @@
-(*===-- llvm_executionengine.ml - LLVM Ocaml Interface ----------*- C++ -*-===*
+(*===-- llvm_executionengine.ml - LLVM OCaml Interface ----------*- C++ -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -83,12 +83,11 @@ module ExecutionEngine = struct
   external free_machine_code: Llvm.llvalue -> t -> unit
     = "llvm_ee_free_machine_code"
 
-  external target_data: t -> Llvm_target.DataLayout.t
-    = "LLVMGetExecutionEngineTargetData"
+  external data_layout : t -> Llvm_target.DataLayout.t
+    = "llvm_ee_get_data_layout"
   
   (* The following are not bound. Patches are welcome.
   
-  get_target_data: t -> lltargetdata
   add_global_mapping: llvalue -> llgenericvalue -> t -> unit
   clear_all_global_mappings: t -> unit
   update_global_mapping: llvalue -> llgenericvalue -> t -> unit
diff --git a/bindings/ocaml/executionengine/llvm_executionengine.mli b/bindings/ocaml/executionengine/llvm_executionengine.mli
index 0b06078..16f0893 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.mli
+++ b/bindings/ocaml/executionengine/llvm_executionengine.mli
@@ -1,4 +1,4 @@
-(*===-- llvm_executionengine.mli - LLVM Ocaml Interface ---------*- C++ -*-===*
+(*===-- llvm_executionengine.mli - LLVM OCaml Interface ---------*- C++ -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -9,7 +9,7 @@
 
 (** JIT Interpreter.
 
-    This interface provides an ocaml API for LLVM execution engine (JIT/
+    This interface provides an OCaml API for LLVM execution engine (JIT/
     interpreter), the classes in the ExecutionEngine library. *)
 
 exception Error of string
@@ -43,7 +43,6 @@ module GenericValue: sig
       bitwidth [w]. See the field [llvm::GenericValue::IntVal]. *)
   val of_nativeint : Llvm.lltype -> nativeint -> t
 
-
   (** [of_int64 n w] boxes the int64 [i] in a generic value with the bitwidth
       [w]. See the field [llvm::GenericValue::IntVal]. *)
   val of_int64 : Llvm.lltype -> int64 -> t
@@ -110,7 +109,7 @@ module ExecutionEngine: sig
   (** [dispose ee] releases the memory used by the execution engine and must be
       invoked to avoid memory leaks. *)
   val dispose : t -> unit
-  
+
   (** [add_module m ee] adds the module [m] to the execution engine [ee]. *)
   val add_module : Llvm.llmodule -> t -> unit
   
@@ -119,19 +118,16 @@ module ExecutionEngine: sig
       [Error msg] if an error occurs. *)
   val remove_module : Llvm.llmodule -> t -> Llvm.llmodule
 
-  
   (** [find_function n ee] finds the function named [n] defined in any of the
       modules owned by the execution engine [ee]. Returns [None] if the function
       is not found and [Some f] otherwise. *)
   val find_function : string -> t -> Llvm.llvalue option
-
   
   (** [run_function f args ee] synchronously executes the function [f] with the
       arguments [args], which must be compatible with the parameter types. *)
   val run_function : Llvm.llvalue -> GenericValue.t array -> t ->
                      GenericValue.t
 
-  
   (** [run_static_ctors ee] executes the static constructors of each module in
       the execution engine [ee]. *)
   val run_static_ctors : t -> unit
@@ -147,17 +143,12 @@ module ExecutionEngine: sig
   val run_function_as_main : Llvm.llvalue -> string array ->
                                   (string * string) array -> t -> int
 
-  
   (** [free_machine_code f ee] releases the memory in the execution engine [ee]
       used to store the machine code for the function [f]. *)
   val free_machine_code : Llvm.llvalue -> t -> unit
 
-
-  (** [target_data ee] is the target data owned by the execution engine
-      [ee]. *)
-  val target_data : t -> Llvm_target.DataLayout.t
-
+  (** [data_layout ee] is the data layout of the execution engine [ee]. *)
+  val data_layout : t -> Llvm_target.DataLayout.t
 end
 
 val initialize_native_target : unit -> bool
-
diff --git a/bindings/ocaml/irreader/Makefile b/bindings/ocaml/irreader/Makefile
new file mode 100644
index 0000000..7665999
--- /dev/null
+++ b/bindings/ocaml/irreader/Makefile
@@ -0,0 +1,19 @@
+##===- bindings/ocaml/irreader/Makefile --------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# This is the makefile for the Objective Caml Llvm_irreader interface.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../../..
+LIBRARYNAME := llvm_irreader
+UsedComponents := irreader
+UsedOcamlInterfaces := llvm
+
+include ../Makefile.ocaml
diff --git a/bindings/ocaml/irreader/irreader_ocaml.c b/bindings/ocaml/irreader/irreader_ocaml.c
new file mode 100644
index 0000000..30c10c7
--- /dev/null
+++ b/bindings/ocaml/irreader/irreader_ocaml.c
@@ -0,0 +1,59 @@
+/*===-- irreader_ocaml.c - LLVM OCaml Glue ----------------------*- C++ -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
+|* are by and large transparent wrappers to the corresponding C functions.    *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c/IRReader.h"
+#include "caml/alloc.h"
+#include "caml/fail.h"
+#include "caml/memory.h"
+
+/* Can't use the recommended caml_named_value mechanism for backwards
+   compatibility reasons. This is largely equivalent. */
+static value llvm_irreader_error_exn;
+
+CAMLprim value llvm_register_irreader_exns(value Error) {
+  llvm_irreader_error_exn = Field(Error, 0);
+  register_global_root(&llvm_irreader_error_exn);
+  return Val_unit;
+}
+
+static void llvm_raise(value Prototype, char *Message) {
+  CAMLparam1(Prototype);
+  CAMLlocal1(CamlMessage);
+
+  CamlMessage = copy_string(Message);
+  LLVMDisposeMessage(Message);
+
+  raise_with_arg(Prototype, CamlMessage);
+  abort(); /* NOTREACHED */
+#ifdef CAMLnoreturn
+  CAMLnoreturn; /* Silences warnings, but is missing in some versions. */
+#endif
+}
+
+
+/*===-- Modules -----------------------------------------------------------===*/
+
+/* Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule */
+CAMLprim value llvm_parse_ir(LLVMContextRef C,
+                             LLVMMemoryBufferRef MemBuf) {
+  CAMLparam0();
+  CAMLlocal2(Variant, MessageVal);
+  LLVMModuleRef M;
+  char *Message;
+
+  if (LLVMParseIRInContext(C, MemBuf, &M, &Message))
+    llvm_raise(llvm_irreader_error_exn, Message);
+
+  CAMLreturn((value) M);
+}
diff --git a/bindings/ocaml/irreader/llvm_irreader.ml b/bindings/ocaml/irreader/llvm_irreader.ml
new file mode 100644
index 0000000..455b1fa
--- /dev/null
+++ b/bindings/ocaml/irreader/llvm_irreader.ml
@@ -0,0 +1,17 @@
+(*===-- llvm_irreader.ml - LLVM OCaml Interface ---------------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+
+exception Error of string
+
+external register_exns : exn -> unit = "llvm_register_irreader_exns"
+let _ = register_exns (Error "")
+
+external parse_ir : Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule
+                  = "llvm_parse_ir"
diff --git a/bindings/ocaml/irreader/llvm_irreader.mli b/bindings/ocaml/irreader/llvm_irreader.mli
new file mode 100644
index 0000000..2b11473
--- /dev/null
+++ b/bindings/ocaml/irreader/llvm_irreader.mli
@@ -0,0 +1,21 @@
+(*===-- llvm_irreader.mli - LLVM OCaml Interface --------------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+(** IR reader.
+
+    This interface provides an OCaml API for the LLVM assembly reader, the
+    classes in the IRReader library. *)
+
+exception Error of string
+
+(** [parse_ir context mb] parses the IR for a new module [m] from the
+    memory buffer [mb] in the context [context]. Returns [m] if successful, or
+    raises [Error msg] otherwise, where [msg] is a description of the error
+    encountered. See the function [llvm::ParseIR]. *)
+val parse_ir : Llvm.llcontext -> Llvm.llmemorybuffer -> Llvm.llmodule
diff --git a/bindings/ocaml/linker/Makefile b/bindings/ocaml/linker/Makefile
new file mode 100644
index 0000000..ca1c96c
--- /dev/null
+++ b/bindings/ocaml/linker/Makefile
@@ -0,0 +1,19 @@
+##===- bindings/ocaml/linker/Makefile ----------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# This is the makefile for the Objective Caml Llvm_target interface.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../../..
+LIBRARYNAME := llvm_linker
+UsedComponents := linker
+UsedOcamlInterfaces := llvm
+
+include ../Makefile.ocaml
diff --git a/bindings/ocaml/linker/linker_ocaml.c b/bindings/ocaml/linker/linker_ocaml.c
new file mode 100644
index 0000000..2491e3b
--- /dev/null
+++ b/bindings/ocaml/linker/linker_ocaml.c
@@ -0,0 +1,54 @@
+/*===-- linker_ocaml.c - LLVM Ocaml Glue ------------------------*- C++ -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
+|* are by and large transparent wrappers to the corresponding C functions.    *|
+|*                                                                            *|
+|* Note that these functions intentionally take liberties with the CAMLparamX *|
+|* macros, since most of the parameters are not GC heap objects.              *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c/Linker.h"
+#include "caml/alloc.h"
+#include "caml/memory.h"
+#include "caml/fail.h"
+
+static value llvm_linker_error_exn;
+
+CAMLprim value llvm_register_linker_exns(value Error) {
+  llvm_linker_error_exn = Field(Error, 0);
+  register_global_root(&llvm_linker_error_exn);
+  return Val_unit;
+}
+
+static void llvm_raise(value Prototype, char *Message) {
+  CAMLparam1(Prototype);
+  CAMLlocal1(CamlMessage);
+
+  CamlMessage = copy_string(Message);
+  LLVMDisposeMessage(Message);
+
+  raise_with_arg(Prototype, CamlMessage);
+  abort(); /* NOTREACHED */
+#ifdef CAMLnoreturn
+  CAMLnoreturn; /* Silences warnings, but is missing in some versions. */
+#endif
+}
+
+/* llmodule -> llmodule -> Mode.t -> unit
+   raises Error msg on error */
+CAMLprim value llvm_link_modules(LLVMModuleRef Dst, LLVMModuleRef Src, value Mode) {
+  char* Message;
+
+  if (LLVMLinkModules(Dst, Src, Int_val(Mode), &Message))
+    llvm_raise(llvm_linker_error_exn, Message);
+
+  return Val_unit;
+}
diff --git a/bindings/ocaml/linker/llvm_linker.ml b/bindings/ocaml/linker/llvm_linker.ml
new file mode 100644
index 0000000..2b73e2e
--- /dev/null
+++ b/bindings/ocaml/linker/llvm_linker.ml
@@ -0,0 +1,22 @@
+(*===-- llvm_linker.ml - LLVM OCaml Interface ------------------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+exception Error of string
+
+external register_exns : exn -> unit = "llvm_register_linker_exns"
+let _ = register_exns (Error "")
+
+module Mode = struct
+  type t =
+  | DestroySource
+  | PreserveSource
+end
+
+external link_modules : Llvm.llmodule -> Llvm.llmodule -> Mode.t -> unit
+                      = "llvm_link_modules"
+\ No newline at end of file
diff --git a/bindings/ocaml/linker/llvm_linker.mli b/bindings/ocaml/linker/llvm_linker.mli
new file mode 100644
index 0000000..4def7a8
--- /dev/null
+++ b/bindings/ocaml/linker/llvm_linker.mli
@@ -0,0 +1,26 @@
+(*===-- llvm_linker.mli - LLVM OCaml Interface -----------------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+(** Linker.
+
+    This interface provides an OCaml API for LLVM bitcode linker,
+    the classes in the Linker library. *)
+
+exception Error of string
+
+(** Linking mode. *)
+module Mode : sig
+  type t =
+  | DestroySource
+  | PreserveSource
+end
+
+(** [link_modules dst src mode] links [src] into [dst], raising [Error]
+    if the linking fails. *)
+val link_modules : Llvm.llmodule -> Llvm.llmodule -> Mode.t -> unit
+\ No newline at end of file
diff --git a/bindings/ocaml/llvm/META.llvm.in b/bindings/ocaml/llvm/META.llvm.in
index fdb3253..c241ea5 100644
--- a/bindings/ocaml/llvm/META.llvm.in
+++ b/bindings/ocaml/llvm/META.llvm.in
@@ -46,6 +46,14 @@ package "ipo" (
     archive(native) = "llvm_ipo.cmxa"
 )
 
+package "irreader" (
+    requires = "llvm"
+    version  = "@PACKAGE_VERSION@"
+    description = "IR assembly reader for LLVM"
+    archive(byte) = "llvm_irreader.cma"
+    archive(native) = "llvm_irreader.cmxa"
+)
+
 package "scalar_opts" (
     requires = "llvm"
     version = "@PACKAGE_VERSION@"
@@ -54,6 +62,22 @@ package "scalar_opts" (
     archive(native) = "llvm_scalar_opts.cmxa"
 )
 
+package "vectorize" (
+    requires = "llvm"
+    version = "@PACKAGE_VERSION@"
+    description = "Vector Transforms for LLVM"
+    archive(byte) = "llvm_vectorize.cma"
+    archive(native) = "llvm_vectorize.cmxa"
+)
+
+package "passmgr_builder" (
+    requires = "llvm"
+    version = "@PACKAGE_VERSION@"
+    description = "Pass Manager Builder for LLVM"
+    archive(byte) = "llvm_passmgr_builder.cma"
+    archive(native) = "llvm_passmgr_builder.cmxa"
+)
+
 package "target" (
     requires = "llvm"
     version  = "@PACKAGE_VERSION@"
@@ -61,3 +85,11 @@ package "target" (
     archive(byte) = "llvm_target.cma"
     archive(native) = "llvm_target.cmxa"
 )
+
+package "linker" (
+    requires = "llvm"
+    version  = "@PACKAGE_VERSION@"
+    description = "Intermediate Representation Linker for LLVM"
+    archive(byte) = "llvm_linker.cma"
+    archive(native) = "llvm_linker.cmxa"
+)
diff --git a/bindings/ocaml/llvm/Makefile b/bindings/ocaml/llvm/Makefile
index 203075a..850f564 100644
--- a/bindings/ocaml/llvm/Makefile
+++ b/bindings/ocaml/llvm/Makefile
@@ -14,7 +14,7 @@
 LEVEL := ../../..
 LIBRARYNAME := llvm
 UsedComponents := core
-UsedOcamLibs := llvm
+UsedOcamlLibs := llvm
 
 include ../Makefile.ocaml
 
diff --git a/bindings/ocaml/llvm/llvm.ml b/bindings/ocaml/llvm/llvm.ml
index b169b85..d36f360 100644
--- a/bindings/ocaml/llvm/llvm.ml
+++ b/bindings/ocaml/llvm/llvm.ml
@@ -34,6 +34,7 @@ module TypeKind = struct
   | Pointer
   | Vector
   | Metadata
+  | X86_mmx
 end
 
 module Linkage = struct
@@ -42,6 +43,7 @@ module Linkage = struct
   | Available_externally
   | Link_once
   | Link_once_odr
+  | Link_once_odr_auto_hide
   | Weak
   | Weak_odr
   | Appending
@@ -53,6 +55,7 @@ module Linkage = struct
   | Ghost
   | Common
   | Linker_private
+  | Linker_private_weak
 end
 
 module Visibility = struct
@@ -202,7 +205,48 @@ module Opcode  = struct
   | AtomicRMW
   | Resume
   | LandingPad
-  | Unwind
+end
+
+module LandingPadClauseTy = struct
+  type t =
+  | Catch
+  | Filter
+end
+
+module ThreadLocalMode = struct
+  type t =
+  | None
+  | GeneralDynamic
+  | LocalDynamic
+  | InitialExec
+  | LocalExec
+end
+
+module AtomicOrdering = struct
+  type t =
+  | NotAtomic
+  | Unordered
+  | Monotonic
+  | Invalid
+  | Acquire
+  | Release
+  | AcqiureRelease
+  | SequentiallyConsistent
+end
+
+module AtomicRMWBinOp = struct
+  type t =
+  | Xchg
+  | Add
+  | Sub
+  | And
+  | Nand
+  | Or
+  | Xor
+  | Max
+  | Min
+  | UMax
+  | UMin
 end
 
 module ValueKind = struct
@@ -216,6 +260,8 @@ module ValueKind = struct
   | BlockAddress
   | ConstantAggregateZero
   | ConstantArray
+  | ConstantDataArray
+  | ConstantDataVector
   | ConstantExpr
   | ConstantFP
   | ConstantInt
@@ -234,6 +280,13 @@ exception IoError of string
 external register_exns : exn -> unit = "llvm_register_core_exns"
 let _ = register_exns (IoError "")
 
+external install_fatal_error_handler : (string -> unit) -> unit
+                                     = "llvm_install_fatal_error_handler"
+external reset_fatal_error_handler : unit -> unit
+                                   = "llvm_reset_fatal_error_handler"
+external enable_pretty_stacktrace : unit -> unit
+                                  = "llvm_enable_pretty_stacktrace"
+
 type ('a, 'b) llpos =
 | At_end of 'a
 | Before of 'b
@@ -260,6 +313,8 @@ external data_layout: llmodule -> string
 external set_data_layout: string -> llmodule -> unit
                         = "llvm_set_data_layout"
 external dump_module : llmodule -> unit = "llvm_dump_module"
+external print_module : string -> llmodule -> unit = "llvm_print_module"
+external string_of_llmodule : llmodule -> string = "llvm_string_of_llmodule"
 external set_module_inline_asm : llmodule -> string -> unit
                                = "llvm_set_module_inline_asm"
 external module_context : llmodule -> llcontext = "LLVMGetModuleContext"
@@ -268,6 +323,8 @@ external module_context : llmodule -> llcontext = "LLVMGetModuleContext"
 external classify_type : lltype -> TypeKind.t = "llvm_classify_type"
 external type_context : lltype -> llcontext = "llvm_type_context"
 external type_is_sized : lltype -> bool = "llvm_type_is_sized"
+external dump_type : lltype -> unit = "llvm_dump_type"
+external string_of_lltype : lltype -> string = "llvm_string_of_lltype"
 
 (*--... Operations on integer types ........................................--*)
 external i1_type : llcontext -> lltype = "llvm_i1_type"
@@ -323,6 +380,7 @@ external vector_size : lltype -> int = "llvm_vector_size"
 (*--... Operations on other types ..........................................--*)
 external void_type : llcontext -> lltype = "llvm_void_type"
 external label_type : llcontext -> lltype = "llvm_label_type"
+external x86_mmx_type : llcontext -> lltype = "llvm_x86_mmx_type"
 external type_by_name : llmodule -> string -> lltype option = "llvm_type_by_name"
 
 external classify_value : llvalue -> ValueKind.t = "llvm_classify_value"
@@ -331,8 +389,9 @@ external type_of : llvalue -> lltype = "llvm_type_of"
 external value_name : llvalue -> string = "llvm_value_name"
 external set_value_name : string -> llvalue -> unit = "llvm_set_value_name"
 external dump_value : llvalue -> unit = "llvm_dump_value"
+external string_of_llvalue : llvalue -> string = "llvm_string_of_llvalue"
 external replace_all_uses_with : llvalue -> llvalue -> unit
-                               = "LLVMReplaceAllUsesWith"
+                               = "llvm_replace_all_uses_with"
 
 (*--... Operations on uses .................................................--*)
 external use_begin : llvalue -> lluse option = "llvm_use_begin"
@@ -391,7 +450,10 @@ external clear_metadata : llvalue -> int -> unit = "llvm_clear_metadata"
 external mdstring : llcontext -> string -> llvalue = "llvm_mdstring"
 external mdnode : llcontext -> llvalue array -> llvalue = "llvm_mdnode"
 external get_mdstring : llvalue -> string option = "llvm_get_mdstring"
-external get_named_metadata : llmodule -> string -> llvalue array = "llvm_get_namedmd"
+external get_named_metadata : llmodule -> string -> llvalue array
+                            = "llvm_get_namedmd"
+external add_named_metadata_operand : llmodule -> string -> llvalue -> unit
+                                    = "llvm_append_namedmd"
 
 (*--... Operations on scalar constants .....................................--*)
 external const_int : lltype -> int -> llvalue = "llvm_const_int"
@@ -477,7 +539,8 @@ external const_trunc_or_bitcast : llvalue -> lltype -> llvalue
                               = "LLVMConstTruncOrBitCast"
 external const_pointercast : llvalue -> lltype -> llvalue
                            = "LLVMConstPointerCast"
-external const_intcast : llvalue -> lltype -> llvalue = "LLVMConstIntCast"
+external const_intcast : llvalue -> lltype -> is_signed:bool -> llvalue
+                       = "llvm_const_intcast"
 external const_fpcast : llvalue -> lltype -> llvalue = "LLVMConstFPCast"
 external const_select : llvalue -> llvalue -> llvalue -> llvalue
                       = "LLVMConstSelect"
@@ -530,6 +593,14 @@ external set_initializer : llvalue -> llvalue -> unit = "llvm_set_initializer"
 external remove_initializer : llvalue -> unit = "llvm_remove_initializer"
 external is_thread_local : llvalue -> bool = "llvm_is_thread_local"
 external set_thread_local : bool -> llvalue -> unit = "llvm_set_thread_local"
+external thread_local_mode : llvalue -> ThreadLocalMode.t
+                           = "llvm_thread_local_mode"
+external set_thread_local_mode : ThreadLocalMode.t -> llvalue -> unit
+                               = "llvm_set_thread_local_mode"
+external is_externally_initialized : llvalue -> bool
+                                   = "llvm_is_externally_initialized"
+external set_externally_initialized : bool -> llvalue -> unit
+                                    = "llvm_set_externally_initialized"
 external global_begin : llmodule -> (llmodule, llvalue) llpos
                       = "llvm_global_begin"
 external global_succ : llvalue -> (llmodule, llvalue) llpos
@@ -725,6 +796,10 @@ let unpack_attr (a : int32) : Attribute.t list =
 let add_function_attr llval attr =
   llvm_add_function_attr llval (pack_attr attr)
 
+external add_target_dependent_function_attr
+    : llvalue -> string -> string -> unit
+    = "llvm_add_target_dependent_function_attr"
+
 let remove_function_attr llval attr =
   llvm_remove_function_attr llval (pack_attr attr)
 
@@ -803,6 +878,11 @@ external block_parent : llbasicblock -> llvalue = "LLVMGetBasicBlockParent"
 external basic_blocks : llvalue -> llbasicblock array = "llvm_basic_blocks"
 external entry_block : llvalue -> llbasicblock = "LLVMGetEntryBasicBlock"
 external delete_block : llbasicblock -> unit = "llvm_delete_block"
+external remove_block : llbasicblock -> unit = "llvm_remove_block"
+external move_block_before : llbasicblock -> llbasicblock -> unit
+                           = "llvm_move_block_before"
+external move_block_after : llbasicblock -> llbasicblock -> unit
+                          = "llvm_move_block_after"
 external append_block : llcontext -> string -> llvalue -> llbasicblock
                       = "llvm_append_block"
 external insert_block : llcontext -> string -> llbasicblock -> llbasicblock
@@ -872,8 +952,6 @@ external instr_pred : llvalue -> (llbasicblock, llvalue) llrev_pos
 external instr_opcode : llvalue -> Opcode.t = "llvm_instr_get_opcode"
 external icmp_predicate : llvalue -> Icmp.t option = "llvm_instr_icmp_predicate"
 
-external icmp_predicate : llvalue -> Icmp.t option = "llvm_instr_icmp_predicate"
-
 let rec iter_instrs_range f i e =
   if i = e then () else
   match i with
@@ -936,6 +1014,10 @@ let remove_instruction_param_attr llval i attr =
 external is_tail_call : llvalue -> bool = "llvm_is_tail_call"
 external set_tail_call : bool -> llvalue -> unit = "llvm_set_tail_call"
 
+(*--... Operations on load/store instructions (only) .......................--*)
+external is_volatile : llvalue -> bool = "llvm_is_volatile"
+external set_volatile : bool -> llvalue -> unit = "llvm_set_volatile"
+
 (*--... Operations on phi nodes ............................................--*)
 external add_incoming : (llvalue * llbasicblock) -> llvalue -> unit
                       = "llvm_add_incoming"
@@ -1078,6 +1160,11 @@ external build_load : llvalue -> string -> llbuilder -> llvalue
                     = "llvm_build_load"
 external build_store : llvalue -> llvalue -> llbuilder -> llvalue
                      = "llvm_build_store"
+external build_atomicrmw : AtomicRMWBinOp.t -> llvalue -> llvalue ->
+                           AtomicOrdering.t -> bool -> string -> llbuilder ->
+                           llvalue
+                         = "llvm_build_atomicrmw_bytecode"
+                           "llvm_build_atomicrmw_native"
 external build_gep : llvalue -> llvalue array -> string -> llbuilder -> llvalue
                    = "llvm_build_gep"
 external build_in_bounds_gep : llvalue -> llvalue array -> string ->
@@ -1167,6 +1254,9 @@ external build_ptrdiff : llvalue -> llvalue -> string -> llbuilder -> llvalue
 module MemoryBuffer = struct
   external of_file : string -> llmemorybuffer = "llvm_memorybuffer_of_file"
   external of_stdin : unit -> llmemorybuffer = "llvm_memorybuffer_of_stdin"
+  external of_string : ?name:string -> string -> llmemorybuffer
+                     = "llvm_memorybuffer_of_string"
+  external as_string : llmemorybuffer -> string = "llvm_memorybuffer_as_string"
   external dispose : llmemorybuffer -> unit = "llvm_memorybuffer_dispose"
 end
 
@@ -1187,54 +1277,3 @@ module PassManager = struct
   external finalize : [ `Function ] t -> bool = "llvm_passmanager_finalize"
   external dispose : [< any ] t -> unit = "llvm_passmanager_dispose"
 end
-
-
-(*===-- Non-Externs -------------------------------------------------------===*)
-(* These functions are built using the externals, so must be declared late.   *)
-
-let concat2 sep arr =
-  let s = ref "" in
-  if 0 < Array.length arr then begin
-    s := !s ^ arr.(0);
-    for i = 1 to (Array.length arr) - 1 do
-      s := !s ^ sep ^ arr.(i)
-    done
-  end;
-  !s
-
-let rec string_of_lltype ty =
-  (* FIXME: stop infinite recursion! :) *)
-  match classify_type ty with
-    TypeKind.Integer -> "i" ^ string_of_int (integer_bitwidth ty)
-  | TypeKind.Pointer ->
-      (let ety = element_type ty in
-      match classify_type ety with
-      | TypeKind.Struct ->
-          (match struct_name ety with
-          | None -> (string_of_lltype ety)
-          | Some s -> s) ^ "*"
-      | _ -> (string_of_lltype (element_type ty)) ^ "*")
-  | TypeKind.Struct ->
-      let s = "{ " ^ (concat2 ", " (
-                Array.map string_of_lltype (struct_element_types ty)
-              )) ^ " }" in
-      if is_packed ty
-        then "<" ^ s ^ ">"
-        else s
-  | TypeKind.Array -> "["   ^ (string_of_int (array_length ty)) ^
-                      " x " ^ (string_of_lltype (element_type ty)) ^ "]"
-  | TypeKind.Vector -> "<"   ^ (string_of_int (vector_size ty)) ^
-                       " x " ^ (string_of_lltype (element_type ty)) ^ ">"
-  | TypeKind.Function -> string_of_lltype (return_type ty) ^
-                         " (" ^ (concat2 ", " (
-                           Array.map string_of_lltype (param_types ty)
-                         )) ^ ")"
-  | TypeKind.Label -> "label"
-  | TypeKind.Ppc_fp128 -> "ppc_fp128"
-  | TypeKind.Fp128 -> "fp128"
-  | TypeKind.X86fp80 -> "x86_fp80"
-  | TypeKind.Double -> "double"
-  | TypeKind.Float -> "float"
-  | TypeKind.Half -> "half"
-  | TypeKind.Void -> "void"
-  | TypeKind.Metadata -> "metadata"
diff --git a/bindings/ocaml/llvm/llvm.mli b/bindings/ocaml/llvm/llvm.mli
index eb6c883..e996121 100644
--- a/bindings/ocaml/llvm/llvm.mli
+++ b/bindings/ocaml/llvm/llvm.mli
@@ -1,4 +1,4 @@
-(*===-- llvm/llvm.mli - LLVM Ocaml Interface -------------------------------===*
+(*===-- llvm/llvm.mli - LLVM OCaml Interface ------------------------------===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -9,7 +9,7 @@
 
 (** Core API.
 
-    This interface provides an ocaml API for the LLVM intermediate
+    This interface provides an OCaml API for the LLVM intermediate
     representation, the classes in the VMCore library. *)
 
 
@@ -67,6 +67,7 @@ module TypeKind : sig
   | Pointer
   | Vector
   | Metadata
+  | X86_mmx
 end
 
 (** The linkage of a global value, accessed with {!linkage} and
@@ -77,6 +78,7 @@ module Linkage : sig
   | Available_externally
   | Link_once
   | Link_once_odr
+  | Link_once_odr_auto_hide
   | Weak
   | Weak_odr
   | Appending
@@ -88,6 +90,7 @@ module Linkage : sig
   | Ghost
   | Common
   | Linker_private
+  | Linker_private_weak
 end
 
 (** The linker visibility of a global value, accessed with {!visibility} and
@@ -115,6 +118,8 @@ module CallConv : sig
                               convention from C. *)
 end
 
+(** The attribute kind of a function parameter, result or the function itself.
+    See [llvm::Attribute::AttrKind]. *)
 module Attribute : sig
   type t =
   | Zext
@@ -252,11 +257,61 @@ module Opcode : sig
   | AtomicRMW
   | Resume
   | LandingPad
-  | Unwind
+end
+
+(** The type of a clause of a [landingpad] instruction.
+    See [llvm::LandingPadInst::ClauseType]. *)
+module LandingPadClauseTy : sig
+  type t =
+  | Catch
+  | Filter
+end
+
+(** The thread local mode of a global value, accessed with {!thread_local_mode}
+    and {!set_thread_local_mode}.
+    See [llvm::GlobalVariable::ThreadLocalMode]. *)
+module ThreadLocalMode : sig
+  type t =
+  | None
+  | GeneralDynamic
+  | LocalDynamic
+  | InitialExec
+  | LocalExec
+end
+
+(** The ordering of an atomic [load], [store], [cmpxchg], [atomicrmw] or
+    [fence] instruction. See [llvm::AtomicOrdering]. *)
+module AtomicOrdering : sig
+  type t =
+  | NotAtomic
+  | Unordered
+  | Monotonic
+  | Invalid (* removed due to API changes *)
+  | Acquire
+  | Release
+  | AcqiureRelease
+  | SequentiallyConsistent
+end
+
+(** The opcode of an [atomicrmw] instruction.
+    See [llvm::AtomicRMWInst::BinOp]. *)
+module AtomicRMWBinOp : sig
+  type t =
+  | Xchg
+  | Add
+  | Sub
+  | And
+  | Nand
+  | Or
+  | Xor
+  | Max
+  | Min
+  | UMax
+  | UMin
 end
 
 (** The kind of an [llvalue], the result of [classify_value v].
- * See the various [LLVMIsA*] functions. *)
+    See the various [LLVMIsA*] functions. *)
 module ValueKind : sig
   type t =
   | NullValue
@@ -268,6 +323,8 @@ module ValueKind : sig
   | BlockAddress
   | ConstantAggregateZero
   | ConstantArray
+  | ConstantDataArray
+  | ConstantDataVector
   | ConstantExpr
   | ConstantFP
   | ConstantInt
@@ -281,6 +338,7 @@ module ValueKind : sig
   | Instruction of Opcode.t
 end
 
+
 (** {6 Iteration} *)
 
 (** [Before b] and [At_end a] specify positions from the start of the ['b] list
@@ -303,6 +361,21 @@ type ('a, 'b) llrev_pos =
 exception IoError of string
 
 
+(** {6 Global configuration} *)
+
+(** [enable_pretty_stacktraces ()] enables LLVM's built-in stack trace code.
+    This intercepts the OS's crash signals and prints which component of LLVM
+    you were in at the time of the crash. *)
+val enable_pretty_stacktrace : unit -> unit
+
+(** [install_fatal_error_handler f] installs [f] as LLVM's fatal error handler.
+    The handler will receive the reason for termination as a string. After
+    the handler has been executed, LLVM calls [exit(1)]. *)
+val install_fatal_error_handler : (string -> unit) -> unit
+
+(** [reset_fatal_error_handler ()] resets LLVM's fatal error handler. *)
+val reset_fatal_error_handler : unit -> unit
+
 (** {6 Contexts} *)
 
 (** [create_context ()] creates a context for storing the "global" state in
@@ -340,18 +413,15 @@ val dispose_module : llmodule -> unit
     [i686-apple-darwin8]. See the method [llvm::Module::getTargetTriple]. *)
 val target_triple: llmodule -> string
 
-
 (** [target_triple triple m] changes the target specifier for the module [m] to
     the string [triple]. See the method [llvm::Module::setTargetTriple]. *)
 val set_target_triple: string -> llmodule -> unit
 
-
 (** [data_layout m] is the data layout specifier for the module [m], something
     like [e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-...-a0:0:64-f80:128:128]. See the
     method [llvm::Module::getDataLayout]. *)
 val data_layout: llmodule -> string
 
-
 (** [set_data_layout s m] changes the data layout specifier for the module [m]
     to the string [s]. See the method [llvm::Module::setDataLayout]. *)
 val set_data_layout: string -> llmodule -> unit
@@ -360,14 +430,23 @@ val set_data_layout: string -> llmodule -> unit
     error. See the method [llvm::Module::dump]. *)
 val dump_module : llmodule -> unit
 
+(** [print_module f m] prints the .ll representation of the module [m]
+    to file [f]. See the method [llvm::Module::print]. *)
+val print_module : string -> llmodule -> unit
+
+(** [string_of_llmodule m] returns the .ll representation of the module [m]
+    as a string. See the method [llvm::Module::print]. *)
+val string_of_llmodule : llmodule -> string
+
 (** [set_module_inline_asm m asm] sets the inline assembler for the module. See
     the method [llvm::Module::setModuleInlineAsm]. *)
 val set_module_inline_asm : llmodule -> string -> unit
 
 (** [module_context m] returns the context of the specified module.
- * See the method [llvm::Module::getContext] *)
+    See the method [llvm::Module::getContext] *)
 val module_context : llmodule -> llcontext
 
+
 (** {6 Types} *)
 
 (** [classify_type ty] returns the {!TypeKind.t} corresponding to the type [ty].
@@ -375,17 +454,22 @@ val module_context : llmodule -> llcontext
 val classify_type : lltype -> TypeKind.t
 
 (** [type_is_sized ty] returns whether the type has a size or not.
- * If it doesn't then it is not safe to call the [DataLayout::] methods on it.
- * *)
+    If it doesn't then it is not safe to call the [DataLayout::] methods on it.
+    *)
 val type_is_sized : lltype -> bool
 
 (** [type_context ty] returns the {!llcontext} corresponding to the type [ty].
     See the method [llvm::Type::getContext]. *)
 val type_context : lltype -> llcontext
 
+(** [dump_type ty] prints the .ll representation of the type [ty] to standard
+    error. See the method [llvm::Type::dump]. *)
+val dump_type : lltype -> unit
+
 (** [string_of_lltype ty] returns a string describing the type [ty]. *)
 val string_of_lltype : lltype -> string
 
+
 (** {7 Operations on integer types} *)
 
 (** [i1_type c] returns an integer type of bitwidth 1 in the context [c]. See
@@ -453,7 +537,6 @@ val function_type : lltype -> lltype array -> lltype
     See the method [llvm::FunctionType::get]. *)
 val var_arg_function_type : lltype -> lltype array -> lltype
 
-
 (** [is_var_arg fty] returns [true] if [fty] is a varargs function type, [false]
     otherwise. See the method [llvm::FunctionType::isVarArg]. *)
 val is_var_arg : lltype -> bool
@@ -474,31 +557,29 @@ val param_types : lltype -> lltype array
     [llvm::StructType::get]. *)
 val struct_type : llcontext -> lltype array -> lltype
 
-
 (** [packed_struct_type context ys] returns the packed structure type in the
     context [context] containing in the types in the array [tys]. See the method
     [llvm::StructType::get]. *)
 val packed_struct_type : llcontext -> lltype array -> lltype
 
 (** [struct_name ty] returns the name of the named structure type [ty],
- * or None if the structure type is not named *)
+    or None if the structure type is not named *)
 val struct_name : lltype -> string option
 
 (** [named_struct_type context name] returns the named structure type [name]
- * in the context [context].
- * See the method [llvm::StructType::get]. *)
+    in the context [context].
+    See the method [llvm::StructType::get]. *)
 val named_struct_type : llcontext -> string -> lltype
 
 (** [struct_set_body ty elts ispacked] sets the body of the named struct [ty]
- * to the [elts] elements.
- * See the moethd [llvm::StructType::setBody]. *)
+    to the [elts] elements.
+    See the moethd [llvm::StructType::setBody]. *)
 val struct_set_body : lltype -> lltype array -> bool -> unit
 
 (** [struct_element_types sty] returns the constituent types of the struct type
     [sty]. See the method [llvm::StructType::getElementType]. *)
 val struct_element_types : lltype -> lltype array
 
-
 (** [is_packed sty] returns [true] if the structure type [sty] is packed,
     [false] otherwise. See the method [llvm::StructType::isPacked]. *)
 val is_packed : lltype -> bool
@@ -507,6 +588,7 @@ val is_packed : lltype -> bool
     [false] otherwise. See the method [llvm::StructType::isOpaque]. *)
 val is_opaque : lltype -> bool
 
+
 (** {7 Operations on pointer, vector, and array types} *)
 
 (** [array_type ty n] returns the array type containing [n] elements of type
@@ -523,7 +605,6 @@ val pointer_type : lltype -> lltype
     See the method [llvm::PointerType::get]. *)
 val qualified_pointer_type : lltype -> int -> lltype
 
-
 (** [vector_type ty n] returns the array type containing [n] elements of the
     primitive type [ty]. See the method [llvm::ArrayType::get]. *)
 val vector_type : lltype -> int -> lltype
@@ -555,17 +636,23 @@ val void_type : llcontext -> lltype
     [llvm::Type::LabelTy]. *)
 val label_type : llcontext -> lltype
 
+(** [x86_mmx_type c] returns the x86 64-bit MMX register type in the
+    context [c]. See [llvm::Type::X86_MMXTy]. *)
+val x86_mmx_type : llcontext -> lltype
+
 (** [type_by_name m name] returns the specified type from the current module
- * if it exists.
- * See the method [llvm::Module::getTypeByName] *)
+    if it exists.
+    See the method [llvm::Module::getTypeByName] *)
 val type_by_name : llmodule -> string -> lltype option
 
+
 (* {6 Values} *)
 
 (** [type_of v] returns the type of the value [v].
     See the method [llvm::Value::getType]. *)
 val type_of : llvalue -> lltype
 
+(** [classify_value v] returns the kind of the value [v]. *)
 val classify_value : llvalue -> ValueKind.t
 
 (** [value_name v] returns the name of the value [v]. For global values, this is
@@ -582,12 +669,14 @@ val set_value_name : string -> llvalue -> unit
     error. See the method [llvm::Value::dump]. *)
 val dump_value : llvalue -> unit
 
+(** [string_of_llvalue v] returns a string describing the value [v]. *)
+val string_of_llvalue : llvalue -> string
+
 (** [replace_all_uses_with old new] replaces all uses of the value [old]
- * with the value [new]. See the method [llvm::Value::replaceAllUsesWith]. *)
+    with the value [new]. See the method [llvm::Value::replaceAllUsesWith]. *)
 val replace_all_uses_with : llvalue -> llvalue -> unit
 
 
-
 (* {6 Uses} *)
 
 (** [use_begin v] returns the first position in the use list for the value [v].
@@ -635,6 +724,7 @@ val set_operand : llvalue -> int -> llvalue -> unit
     See the method [llvm::User::getNumOperands]. *)
 val num_operands : llvalue -> int
 
+
 (** {7 Operations on constants of (mostly) any type} *)
 
 (** [is_constant v] returns [true] if the value [v] is a constant, [false]
@@ -665,7 +755,11 @@ val is_null : llvalue -> bool
     otherwise. Similar to [llvm::isa<UndefValue>]. *)
 val is_undef : llvalue -> bool
 
+(** [constexpr_opcode v] returns an [Opcode.t] corresponding to constexpr
+    value [v], or [Opcode.Invalid] if [v] is not a constexpr. *)
 val constexpr_opcode : llvalue -> Opcode.t
+
+
 (** {7 Operations on instructions} *)
 
 (** [has_metadata i] returns whether or not the instruction [i] has any
@@ -699,14 +793,22 @@ val mdstring : llcontext -> string -> llvalue
 val mdnode : llcontext -> llvalue array -> llvalue
 
 (** [get_mdstring v] returns the MDString.
- * See the method [llvm::MDString::getString] *)
+    See the method [llvm::MDString::getString] *)
 val get_mdstring : llvalue -> string option
 
-(** [get_named_metadata m name] return all the MDNodes belonging to the named
- * metadata (if any).
- * See the method [llvm::NamedMDNode::getOperand]. *)
+(** [get_named_metadata m name] returns all the MDNodes belonging to the named
+    metadata (if any).
+    See the method [llvm::NamedMDNode::getOperand]. *)
 val get_named_metadata : llmodule -> string -> llvalue array
 
+(** [add_named_metadata_operand m name v] adds [v] as the last operand of
+    metadata named [name] in module [m]. If the metadata does not exist,
+    it is created.
+    See the methods [llvm::Module::getNamedMetadata()] and
+    [llvm::MDNode::addOperand()]. *)
+val add_named_metadata_operand : llmodule -> string -> llvalue -> unit
+
+
 (** {7 Operations on scalar constants} *)
 
 (** [const_int ty i] returns the integer constant of type [ty] and value [i].
@@ -718,15 +820,14 @@ val const_int : lltype -> int -> llvalue
 val const_of_int64 : lltype -> Int64.t -> bool -> llvalue
 
 (** [int64_of_const c] returns the int64 value of the [c] constant integer.
- * None is returned if this is not an integer constant, or bitwidth exceeds 64.
- * See the method [llvm::ConstantInt::getSExtValue].*)
+    None is returned if this is not an integer constant, or bitwidth exceeds 64.
+    See the method [llvm::ConstantInt::getSExtValue].*)
 val int64_of_const : llvalue -> Int64.t option
 
 (** [const_int_of_string ty s r] returns the integer constant of type [ty] and
- * value [s], with the radix [r]. See the method [llvm::ConstantInt::get]. *)
+    value [s], with the radix [r]. See the method [llvm::ConstantInt::get]. *)
 val const_int_of_string : lltype -> string -> int -> llvalue
 
-
 (** [const_float ty n] returns the floating point constant of type [ty] and
     value [n]. See the method [llvm::ConstantFP::get]. *)
 val const_float : lltype -> float -> llvalue
@@ -736,7 +837,6 @@ val const_float : lltype -> float -> llvalue
 val const_float_of_string : lltype -> string -> llvalue
 
 
-
 (** {7 Operations on composite constants} *)
 
 (** [const_string c s] returns the constant [i8] array with the values of the
@@ -777,7 +877,6 @@ val const_named_struct : lltype -> llvalue array -> llvalue
     [llvm::ConstantStruct::get]. *)
 val const_packed_struct : llcontext -> llvalue array -> llvalue
 
-
 (** [const_vector elts] returns the vector constant of type
     [vector_type (type_of elts.(0)) (Array.length elts)] and containing the
     values [elts]. See the method [llvm::ConstantVector::get]. *)
@@ -929,13 +1028,11 @@ val const_xor : llvalue -> llvalue -> llvalue
     See the method [llvm::ConstantExpr::getICmp]. *)
 val const_icmp : Icmp.t -> llvalue -> llvalue -> llvalue
 
-
 (** [const_fcmp pred c1 c2] returns the constant comparison of two floating
     point constants, [c1 pred c2].
     See the method [llvm::ConstantExpr::getFCmp]. *)
 val const_fcmp : Fcmp.t -> llvalue -> llvalue -> llvalue
 
-
 (** [const_shl c1 c2] returns the constant integer [c1] left-shifted by the
     constant integer [c2].
     See the method [llvm::ConstantExpr::getShl]. *)
@@ -961,7 +1058,6 @@ val const_gep : llvalue -> llvalue array -> llvalue
     See the method [llvm::ConstantExpr::getInBoundsGetElementPtr]. *)
 val const_in_bounds_gep : llvalue -> llvalue array -> llvalue
 
-
 (** [const_trunc c ty] returns the constant truncation of integer constant [c]
     to the smaller integer type [ty].
     See the method [llvm::ConstantExpr::getTrunc]. *)
@@ -1027,50 +1123,44 @@ val const_bitcast : llvalue -> lltype -> llvalue
     See the method [llvm::ConstantExpr::getZExtOrBitCast]. *)
 val const_zext_or_bitcast : llvalue -> lltype -> llvalue
 
-
 (** [const_sext_or_bitcast c ty] returns a constant sext or bitwise cast
     conversion of constant [c] to type [ty].
     See the method [llvm::ConstantExpr::getSExtOrBitCast]. *)
 val const_sext_or_bitcast : llvalue -> lltype -> llvalue
 
-
 (** [const_trunc_or_bitcast c ty] returns a constant trunc or bitwise cast
     conversion of constant [c] to type [ty].
     See the method [llvm::ConstantExpr::getTruncOrBitCast]. *)
 val const_trunc_or_bitcast : llvalue -> lltype -> llvalue
 
-
 (** [const_pointercast c ty] returns a constant bitcast or a pointer-to-int
     cast conversion of constant [c] to type [ty] of equal size.
     See the method [llvm::ConstantExpr::getPointerCast]. *)
 val const_pointercast : llvalue -> lltype -> llvalue
 
-
-(** [const_intcast c ty] returns a constant zext, bitcast, or trunc for integer
-    -> integer casts of constant [c] to type [ty].
-    See the method [llvm::ConstantExpr::getIntCast]. *)
-val const_intcast : llvalue -> lltype -> llvalue
-
+(** [const_intcast c ty ~is_signed] returns a constant sext/zext, bitcast,
+    or trunc for integer -> integer casts of constant [c] to type [ty].
+    When converting a narrower value to a wider one, whether sext or zext
+    will be used is controlled by [is_signed].
+    See the method [llvm::ConstantExpr::getIntegerCast]. *)
+val const_intcast : llvalue -> lltype -> is_signed:bool -> llvalue
 
 (** [const_fpcast c ty] returns a constant fpext, bitcast, or fptrunc for fp ->
     fp casts of constant [c] to type [ty].
     See the method [llvm::ConstantExpr::getFPCast]. *)
 val const_fpcast : llvalue -> lltype -> llvalue
 
-
 (** [const_select cond t f] returns the constant conditional which returns value
     [t] if the boolean constant [cond] is true and the value [f] otherwise.
     See the method [llvm::ConstantExpr::getSelect]. *)
 val const_select : llvalue -> llvalue -> llvalue -> llvalue
 
-
 (** [const_extractelement vec i] returns the constant [i]th element of
     constant vector [vec]. [i] must be a constant [i32] value unsigned less than
     the size of the vector.
     See the method [llvm::ConstantExpr::getExtractElement]. *)
 val const_extractelement : llvalue -> llvalue -> llvalue
 
-
 (** [const_insertelement vec v i] returns the constant vector with the same
     elements as constant vector [v] but the [i]th element replaced by the
     constant [v]. [v] must be a constant value with the type of the vector
@@ -1079,31 +1169,25 @@ val const_extractelement : llvalue -> llvalue -> llvalue
     See the method [llvm::ConstantExpr::getInsertElement]. *)
 val const_insertelement : llvalue -> llvalue -> llvalue -> llvalue
 
-
 (** [const_shufflevector a b mask] returns a constant [shufflevector].
     See the LLVM Language Reference for details on the [shufflevector]
     instruction.
     See the method [llvm::ConstantExpr::getShuffleVector]. *)
 val const_shufflevector : llvalue -> llvalue -> llvalue -> llvalue
 
-
 (** [const_extractvalue agg idxs] returns the constant [idxs]th value of
     constant aggregate [agg]. Each [idxs] must be less than the size of the
     aggregate.  See the method [llvm::ConstantExpr::getExtractValue]. *)
 val const_extractvalue : llvalue -> int array -> llvalue
 
-
 (** [const_insertvalue agg val idxs] inserts the value [val] in the specified
     indexs [idxs] in the aggegate [agg]. Each [idxs] must be less than the size
     of the aggregate. See the method [llvm::ConstantExpr::getInsertValue]. *)
 val const_insertvalue : llvalue -> llvalue -> int array -> llvalue
 
-
 (** [const_inline_asm ty asm con side align] inserts a inline assembly string.
     See the method [llvm::InlineAsm::get]. *)
-val const_inline_asm : lltype -> string -> string -> bool -> bool ->
-                            llvalue
-
+val const_inline_asm : lltype -> string -> string -> bool -> bool -> llvalue
 
 (** [block_address f bb] returns the address of the basic block [bb] in the
     function [f]. See the method [llvm::BasicBlock::get]. *)
@@ -1145,7 +1229,6 @@ val visibility : llvalue -> Visibility.t
     [v]. See the method [llvm::GlobalValue::setVisibility]. *)
 val set_visibility : Visibility.t -> llvalue -> unit
 
-
 (** [alignment g] returns the required alignment of the global value [g].
     See the method [llvm::GlobalValue::getAlignment]. *)
 val alignment : llvalue -> int
@@ -1163,15 +1246,12 @@ val set_alignment : int -> llvalue -> unit
     global differs, then a bitcast to [ty] is returned. *)
 val declare_global : lltype -> string -> llmodule -> llvalue
 
-
 (** [declare_qualified_global ty name addrspace m] returns a new global variable
     of type [ty] and with name [name] in module [m] in the address space
     [addrspace]. If such a global variable already exists, it is returned. If
     the type of the existing global differs, then a bitcast to [ty] is
     returned. *)
-val declare_qualified_global : lltype -> string -> int -> llmodule ->
-                                    llvalue
-
+val declare_qualified_global : lltype -> string -> int -> llmodule -> llvalue
 
 (** [define_global name init m] returns a new global with name [name] and
     initializer [init] in module [m] in the default address space (0). If the
@@ -1179,21 +1259,17 @@ val declare_qualified_global : lltype -> string -> int -> llmodule ->
     See the constructor of [llvm::GlobalVariable]. *)
 val define_global : string -> llvalue -> llmodule -> llvalue
 
-
 (** [define_qualified_global name init addrspace m] returns a new global with
     name [name] and initializer [init] in module [m] in the address space
     [addrspace]. If the named global already exists, it is renamed.
     See the constructor of [llvm::GlobalVariable]. *)
-val define_qualified_global : string -> llvalue -> int -> llmodule ->
-                                   llvalue
-
+val define_qualified_global : string -> llvalue -> int -> llmodule -> llvalue
 
 (** [lookup_global name m] returns [Some g] if a global variable with name
     [name] exists in module [m]. If no such global exists, returns [None].
     See the [llvm::GlobalVariable] constructor. *)
 val lookup_global : string -> llmodule -> llvalue option
 
-
 (** [delete_global gv] destroys the global variable [gv].
     See the method [llvm::GlobalVariable::eraseFromParent]. *)
 val delete_global : llvalue -> unit
@@ -1204,13 +1280,11 @@ val delete_global : llvalue -> unit
     See the method [llvm::Module::global_begin]. *)
 val global_begin : llmodule -> (llmodule, llvalue) llpos
 
-
 (** [global_succ gv] returns the global variable list position succeeding
     [Before gv].
     See the method [llvm::Module::global_iterator::operator++]. *)
 val global_succ : llvalue -> (llmodule, llvalue) llpos
 
-
 (** [iter_globals f m] applies function [f] to each of the global variables of
     module [m] in order. Tail recursive. *)
 val iter_globals : (llvalue -> unit) -> llmodule -> unit
@@ -1225,13 +1299,11 @@ val fold_left_globals : ('a -> llvalue -> 'a) -> 'a -> llmodule -> 'a
     See the method [llvm::Module::global_end]. *)
 val global_end : llmodule -> (llmodule, llvalue) llrev_pos
 
-
 (** [global_pred gv] returns the global variable list position preceding
     [After gv].
     See the method [llvm::Module::global_iterator::operator--]. *)
 val global_pred : llvalue -> (llmodule, llvalue) llrev_pos
 
-
 (** [rev_iter_globals f m] applies function [f] to each of the global variables
     of module [m] in reverse order. Tail recursive. *)
 val rev_iter_globals : (llvalue -> unit) -> llmodule -> unit
@@ -1250,7 +1322,6 @@ val is_global_constant : llvalue -> bool
     See the method [llvm::GlobalVariable::setConstant]. *)
 val set_global_constant : bool -> llvalue -> unit
 
-
 (** [global_initializer gv] returns the initializer for the global variable
     [gv]. See the method [llvm::GlobalVariable::getInitializer]. *)
 val global_initializer : llvalue -> llvalue
@@ -1275,6 +1346,26 @@ val is_thread_local : llvalue -> bool
     See the method [llvm::GlobalVariable::setThreadLocal]. *)
 val set_thread_local : bool -> llvalue -> unit
 
+(** [is_thread_local gv] returns the thread local mode of the global
+    variable [gv].
+    See the method [llvm::GlobalVariable::getThreadLocalMode]. *)
+val thread_local_mode : llvalue -> ThreadLocalMode.t
+
+(** [set_thread_local c gv] sets the thread local mode of the global
+    variable [gv].
+    See the method [llvm::GlobalVariable::setThreadLocalMode]. *)
+val set_thread_local_mode : ThreadLocalMode.t -> llvalue -> unit
+
+(** [is_externally_initialized gv] returns [true] if the global
+    variable [gv] is externally initialized and [false] otherwise.
+    See the method [llvm::GlobalVariable::isExternallyInitialized]. *)
+val is_externally_initialized : llvalue -> bool
+
+(** [set_externally_initialized c gv] sets the global variable [gv] to be
+    externally initialized if [c] is [true] and not otherwise.
+    See the method [llvm::GlobalVariable::setExternallyInitialized]. *)
+val set_externally_initialized : bool -> llvalue -> unit
+
 
 (** {7 Operations on aliases} *)
 
@@ -1284,7 +1375,6 @@ val set_thread_local : bool -> llvalue -> unit
 val add_alias : llmodule -> lltype -> llvalue -> string -> llvalue
 
 
-
 (** {7 Operations on functions} *)
 
 (** [declare_function name ty m] returns a new function of type [ty] and
@@ -1293,20 +1383,17 @@ val add_alias : llmodule -> lltype -> llvalue -> string -> llvalue
     to [ty] is returned. *)
 val declare_function : string -> lltype -> llmodule -> llvalue
 
-
 (** [define_function name ty m] creates a new function with name [name] and
     type [ty] in module [m]. If the named function already exists, it is
     renamed. An entry basic block is created in the function.
     See the constructor of [llvm::GlobalVariable]. *)
 val define_function : string -> lltype -> llmodule -> llvalue
 
-
 (** [lookup_function name m] returns [Some f] if a function with name
     [name] exists in module [m]. If no such function exists, returns [None].
     See the method [llvm::Module] constructor. *)
 val lookup_function : string -> llmodule -> llvalue option
 
-
 (** [delete_function f] destroys the function [f].
     See the method [llvm::Function::eraseFromParent]. *)
 val delete_function : llvalue -> unit
@@ -1317,13 +1404,11 @@ val delete_function : llvalue -> unit
     See the method [llvm::Module::begin]. *)
 val function_begin : llmodule -> (llmodule, llvalue) llpos
 
-
 (** [function_succ gv] returns the function list position succeeding
     [Before gv].
     See the method [llvm::Module::iterator::operator++]. *)
 val function_succ : llvalue -> (llmodule, llvalue) llpos
 
-
 (** [iter_functions f m] applies function [f] to each of the functions of module
     [m] in order. Tail recursive. *)
 val iter_functions : (llvalue -> unit) -> llmodule -> unit
@@ -1338,12 +1423,10 @@ val fold_left_functions : ('a -> llvalue -> 'a) -> 'a -> llmodule -> 'a
     See the method [llvm::Module::end]. *)
 val function_end : llmodule -> (llmodule, llvalue) llrev_pos
 
-
 (** [function_pred gv] returns the function list position preceding [After gv].
     See the method [llvm::Module::iterator::operator--]. *)
 val function_pred : llvalue -> (llmodule, llvalue) llrev_pos
 
-
 (** [rev_iter_functions f fn] applies function [f] to each of the functions of
     module [m] in reverse order. Tail recursive. *)
 val rev_iter_functions : (llvalue -> unit) -> llmodule -> unit
@@ -1365,7 +1448,6 @@ val function_call_conv : llvalue -> int
     See the method [llvm::Function::setCallingConv]. *)
 val set_function_call_conv : int -> llvalue -> unit
 
-
 (** [gc f] returns [Some name] if the function [f] has a garbage
     collection algorithm specified and [None] otherwise.
     See the method [llvm::Function::getGC]. *)
@@ -1379,14 +1461,19 @@ val set_gc : string option -> llvalue -> unit
     [f]. *)
 val add_function_attr : llvalue -> Attribute.t -> unit
 
+(** [add_target_dependent_function_attr f a] adds target-dependent attribute
+    [a] to function [f]. *)
+val add_target_dependent_function_attr : llvalue -> string -> string -> unit
+
 (** [function_attr f] returns the function attribute for the function [f].
- * See the method [llvm::Function::getAttributes] *)
+    See the method [llvm::Function::getAttributes] *)
 val function_attr : llvalue -> Attribute.t list
 
 (** [remove_function_attr f a] removes attribute [a] from the return type of
     function [f]. *)
 val remove_function_attr : llvalue -> Attribute.t -> unit
 
+
 (** {7 Operations on params} *)
 
 (** [params f] returns the parameters of function [f].
@@ -1398,8 +1485,8 @@ val params : llvalue -> llvalue array
 val param : llvalue -> int -> llvalue
 
 (** [param_attr p] returns the attributes of parameter [p].
- * See the methods [llvm::Function::getAttributes] and
- * [llvm::Attributes::getParamAttributes] *)
+    See the methods [llvm::Function::getAttributes] and
+    [llvm::Attributes::getParamAttributes] *)
 val param_attr : llvalue -> Attribute.t list
 
 (** [param_parent p] returns the parent function that owns the parameter.
@@ -1435,7 +1522,6 @@ val param_end : llvalue -> (llvalue, llvalue) llrev_pos
     See the method [llvm::Function::arg_iterator::operator--]. *)
 val param_pred : llvalue -> (llvalue, llvalue) llrev_pos
 
-
 (** [rev_iter_params f fn] applies function [f] to each of the parameters
     of function [fn] in reverse order. Tail recursive. *)
 val rev_iter_params : (llvalue -> unit) -> llvalue -> unit
@@ -1468,18 +1554,28 @@ val entry_block : llvalue -> llbasicblock
     See the method [llvm::BasicBlock::eraseFromParent]. *)
 val delete_block : llbasicblock -> unit
 
+(** [remove_block bb] removes the basic block [bb] from its parent function.
+    See the method [llvm::BasicBlock::removeFromParent]. *)
+val remove_block : llbasicblock -> unit
+
+(** [move_block_before pos bb] moves the basic block [bb] before [pos].
+    See the method [llvm::BasicBlock::moveBefore]. *)
+val move_block_before : llbasicblock -> llbasicblock -> unit
+
+(** [move_block_after pos bb] moves the basic block [bb] after [pos].
+    See the method [llvm::BasicBlock::moveAfter]. *)
+val move_block_after : llbasicblock -> llbasicblock -> unit
+
 (** [append_block c name f] creates a new basic block named [name] at the end of
     function [f] in the context [c].
     See the constructor of [llvm::BasicBlock]. *)
 val append_block : llcontext -> string -> llvalue -> llbasicblock
 
-
 (** [insert_block c name bb] creates a new basic block named [name] before the
     basic block [bb] in the context [c].
     See the constructor of [llvm::BasicBlock]. *)
 val insert_block : llcontext -> string -> llbasicblock -> llbasicblock
 
-
 (** [block_parent bb] returns the parent function that owns the basic block.
     See the method [llvm::BasicBlock::getParent]. *)
 val block_parent : llbasicblock -> llvalue
@@ -1490,13 +1586,11 @@ val block_parent : llbasicblock -> llvalue
     See the method [llvm::Function::begin]. *)
 val block_begin : llvalue -> (llvalue, llbasicblock) llpos
 
-
 (** [block_succ bb] returns the basic block list position succeeding
     [Before bb].
     See the method [llvm::Function::iterator::operator++]. *)
 val block_succ : llbasicblock -> (llvalue, llbasicblock) llpos
 
-
 (** [iter_blocks f fn] applies function [f] to each of the basic blocks
     of function [fn] in order. Tail recursive. *)
 val iter_blocks : (llbasicblock -> unit) -> llvalue -> unit
@@ -1511,11 +1605,11 @@ val fold_left_blocks : ('a -> llbasicblock -> 'a) -> 'a -> llvalue -> 'a
     See the method [llvm::Function::end]. *)
 val block_end : llvalue -> (llvalue, llbasicblock) llrev_pos
 
-
-(** [block_pred gv] returns the function list position preceding [After gv].
+(** [block_pred bb] returns the basic block list position preceding [After bb].
     See the method [llvm::Function::iterator::operator--]. *)
 val block_pred : llbasicblock -> (llvalue, llbasicblock) llrev_pos
 
+(** [block_terminator bb] returns the terminator of the basic block [bb]. *)
 val block_terminator : llbasicblock -> llvalue option
 
 (** [rev_iter_blocks f fn] applies function [f] to each of the basic blocks
@@ -1544,18 +1638,20 @@ val block_of_value : llvalue -> llbasicblock
     See the method [llvm::Instruction::getParent]. *)
 val instr_parent : llvalue -> llbasicblock
 
+(** [delete_instruction i] deletes the instruction [i].
+ * See the method [llvm::Instruction::eraseFromParent]. *)
+val delete_instruction : llvalue -> unit
+
 (** [instr_begin bb] returns the first position in the instruction list of the
     basic block [bb]. [instr_begin] and [instr_succ] can be used to iterate over
     the instruction list in order.
     See the method [llvm::BasicBlock::begin]. *)
 val instr_begin : llbasicblock -> (llbasicblock, llvalue) llpos
 
-
 (** [instr_succ i] returns the instruction list position succeeding [Before i].
     See the method [llvm::BasicBlock::iterator::operator++]. *)
 val instr_succ : llvalue -> (llbasicblock, llvalue) llpos
 
-
 (** [iter_instrs f bb] applies function [f] to each of the instructions of basic
     block [bb] in order. Tail recursive. *)
 val iter_instrs: (llvalue -> unit) -> llbasicblock -> unit
@@ -1570,20 +1666,23 @@ val fold_left_instrs: ('a -> llvalue -> 'a) -> 'a -> llbasicblock -> 'a
     See the method [llvm::BasicBlock::end]. *)
 val instr_end : llbasicblock -> (llbasicblock, llvalue) llrev_pos
 
-
 (** [instr_pred i] returns the instruction list position preceding [After i].
     See the method [llvm::BasicBlock::iterator::operator--]. *)
 val instr_pred : llvalue -> (llbasicblock, llvalue) llrev_pos
 
-
 (** [fold_right_instrs f bb init] is [f (... (f init fN) ...) f1] where
     [f1,...,fN] are the instructions of basic block [bb]. Tail recursive. *)
 val fold_right_instrs: (llvalue -> 'a -> 'a) -> llbasicblock -> 'a -> 'a
 
+(** [inst_opcode i] returns the [Opcode.t] corresponding to instruction [i],
+    or [Opcode.Invalid] if [i] is not an instruction. *)
 val instr_opcode : llvalue -> Opcode.t
 
+(** [icmp_predicate i] returns the [Icmp.t] corresponding to an [icmp]
+    instruction [i]. *)
 val icmp_predicate : llvalue -> Icmp.t option
 
+
 (** {7 Operations on call sites} *)
 
 (** [instruction_call_conv ci] is the calling convention for the call or invoke
@@ -1592,7 +1691,6 @@ val icmp_predicate : llvalue -> Icmp.t option
     [llvm::InvokeInst::getCallingConv]. *)
 val instruction_call_conv: llvalue -> int
 
-
 (** [set_instruction_call_conv cc ci] sets the calling convention for the call
     or invoke instruction [ci] to the integer [cc], which can be one of the
     values from the module {!CallConv}.
@@ -1600,7 +1698,6 @@ val instruction_call_conv: llvalue -> int
     and [llvm::InvokeInst::setCallingConv]. *)
 val set_instruction_call_conv: int -> llvalue -> unit
 
-
 (** [add_instruction_param_attr ci i a] adds attribute [a] to the [i]th
     parameter of the call or invoke instruction [ci]. [i]=0 denotes the return
     value. *)
@@ -1611,7 +1708,8 @@ val add_instruction_param_attr : llvalue -> int -> Attribute.t -> unit
     return value. *)
 val remove_instruction_param_attr : llvalue -> int -> Attribute.t -> unit
 
-(** {Operations on call instructions (only)} *)
+
+(** {7 Operations on call instructions (only)} *)
 
 (** [is_tail_call ci] is [true] if the call instruction [ci] is flagged as
     eligible for tail call optimization, [false] otherwise.
@@ -1623,20 +1721,33 @@ val is_tail_call : llvalue -> bool
     See the method [llvm::CallInst::setTailCall]. *)
 val set_tail_call : bool -> llvalue -> unit
 
+
+(** {7 Operations on load/store instructions (only)} *)
+
+(** [is_volatile i] is [true] if the load or store instruction [i] is marked
+    as volatile.
+    See the methods [llvm::LoadInst::isVolatile] and
+    [llvm::StoreInst::isVolatile]. *)
+val is_volatile : llvalue -> bool
+
+(** [set_volatile v i] marks the load or store instruction [i] as volatile
+    if [v] is [true], unmarks otherwise.
+    See the methods [llvm::LoadInst::setVolatile] and
+    [llvm::StoreInst::setVolatile]. *)
+val set_volatile : bool -> llvalue -> unit
+
+
 (** {7 Operations on phi nodes} *)
 
 (** [add_incoming (v, bb) pn] adds the value [v] to the phi node [pn] for use
     with branches from [bb]. See the method [llvm::PHINode::addIncoming]. *)
 val add_incoming : (llvalue * llbasicblock) -> llvalue -> unit
 
-
 (** [incoming pn] returns the list of value-block pairs for phi node [pn].
     See the method [llvm::PHINode::getIncomingValue]. *)
 val incoming : llvalue -> (llvalue * llbasicblock) list
 
-(** [delete_instruction i] deletes the instruction [i].
- * See the method [llvm::Instruction::eraseFromParent]. *)
-val delete_instruction : llvalue -> unit
+
 
 (** {6 Instruction builders} *)
 
@@ -1663,7 +1774,6 @@ val builder_at_end : llcontext -> llbasicblock -> llbuilder
     See the constructor for [llvm::LLVMBuilder]. *)
 val position_builder : (llbasicblock, llvalue) llpos -> llbuilder -> unit
 
-
 (** [position_before ins b] moves the instruction builder [b] to before the
     instruction [isn]. See the method [llvm::LLVMBuilder::SetInsertPoint]. *)
 val position_before : llvalue -> llbuilder -> unit
@@ -1691,18 +1801,15 @@ val insert_into_builder : llvalue -> string -> llbuilder -> unit
     See the method [llvm::IRBuilder::SetDebugLocation]. *)
 val set_current_debug_location : llbuilder -> llvalue -> unit
 
-
 (** [clear_current_debug_location b] clears the current debug location in the
     builder [b]. *)
 val clear_current_debug_location : llbuilder -> unit
 
-
 (** [current_debug_location b] returns the current debug location, or None
     if none is currently set.
     See the method [llvm::IRBuilder::GetDebugLocation]. *)
 val current_debug_location : llbuilder -> llvalue option
 
-
 (** [set_inst_debug_location b i] sets the current debug location of the builder
     [b] to the instruction [i].
     See the method [llvm::IRBuilder::SetInstDebugLocation]. *)
@@ -1729,7 +1836,6 @@ val build_ret : llvalue -> llbuilder -> llvalue
     See the method [llvm::LLVMBuilder::CreateAggregateRet]. *)
 val build_aggregate_ret : llvalue array -> llbuilder -> llvalue
 
-
 (** [build_br bb b] creates a
     [br %bb]
     instruction at the position specified by the instruction builder [b].
@@ -1771,8 +1877,8 @@ val build_free : llvalue -> llbuilder -> llvalue
 val add_case : llvalue -> llvalue -> llbasicblock -> unit
 
 (** [switch_default_dest sw] returns the default destination of the [switch]
- * instruction.
- * See the method [llvm:;SwitchInst::getDefaultDest]. **)
+    instruction.
+    See the method [llvm:;SwitchInst::getDefaultDest]. **)
 val switch_default_dest : llvalue -> llbasicblock
 
 (** [build_indirect_br addr count b] creates a
@@ -1782,13 +1888,11 @@ val switch_default_dest : llvalue -> llbasicblock
     See the method [llvm::LLVMBuilder::CreateIndirectBr]. *)
 val build_indirect_br : llvalue -> int -> llbuilder -> llvalue
 
-
 (** [add_destination br bb] adds the basic block [bb] as a possible branch
     location for the indirectbr instruction [br].
     See the method [llvm::IndirectBrInst::addDestination]. **)
 val add_destination : llvalue -> llbasicblock -> unit
 
-
 (** [build_invoke fn args tobb unwindbb name b] creates an
     [%name = invoke %fn(args) to %tobb unwind %unwindbb]
     instruction at the position specified by the instruction builder [b].
@@ -1811,9 +1915,9 @@ val set_cleanup : llvalue -> bool -> unit
     See the method [llvm::LandingPadInst::addClause]. *)
 val add_clause : llvalue -> llvalue -> unit
 
-(* [build_resume exn b] builds a [resume exn] instruction
- * at the position specified by the instruction builder [b].
- * See the method [llvm::LLVMBuilder::CreateResume] *)
+(** [build_resume exn b] builds a [resume exn] instruction
+    at the position specified by the instruction builder [b].
+    See the method [llvm::LLVMBuilder::CreateResume] *)
 val build_resume : llvalue -> llbuilder -> llvalue
 
 (** [build_unreachable b] creates an
@@ -1831,175 +1935,150 @@ val build_unreachable : llbuilder -> llvalue
     See the method [llvm::LLVMBuilder::CreateAdd]. *)
 val build_add : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_nsw_add x y name b] creates a
     [%name = nsw add %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateNSWAdd]. *)
 val build_nsw_add : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_nuw_add x y name b] creates a
     [%name = nuw add %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateNUWAdd]. *)
 val build_nuw_add : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_fadd x y name b] creates a
     [%name = fadd %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateFAdd]. *)
 val build_fadd : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_sub x y name b] creates a
     [%name = sub %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateSub]. *)
 val build_sub : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_nsw_sub x y name b] creates a
     [%name = nsw sub %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateNSWSub]. *)
 val build_nsw_sub : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_nuw_sub x y name b] creates a
     [%name = nuw sub %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateNUWSub]. *)
 val build_nuw_sub : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_fsub x y name b] creates a
     [%name = fsub %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateFSub]. *)
 val build_fsub : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_mul x y name b] creates a
     [%name = mul %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateMul]. *)
 val build_mul : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_nsw_mul x y name b] creates a
     [%name = nsw mul %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateNSWMul]. *)
 val build_nsw_mul : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_nuw_mul x y name b] creates a
     [%name = nuw mul %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateNUWMul]. *)
 val build_nuw_mul : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_fmul x y name b] creates a
     [%name = fmul %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateFMul]. *)
 val build_fmul : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_udiv x y name b] creates a
     [%name = udiv %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateUDiv]. *)
 val build_udiv : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_sdiv x y name b] creates a
     [%name = sdiv %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateSDiv]. *)
 val build_sdiv : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_exact_sdiv x y name b] creates a
     [%name = exact sdiv %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateExactSDiv]. *)
 val build_exact_sdiv : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_fdiv x y name b] creates a
     [%name = fdiv %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateFDiv]. *)
 val build_fdiv : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_urem x y name b] creates a
     [%name = urem %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateURem]. *)
 val build_urem : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_SRem x y name b] creates a
     [%name = srem %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateSRem]. *)
 val build_srem : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_frem x y name b] creates a
     [%name = frem %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateFRem]. *)
 val build_frem : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_shl x y name b] creates a
     [%name = shl %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateShl]. *)
 val build_shl : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_lshr x y name b] creates a
     [%name = lshr %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateLShr]. *)
 val build_lshr : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_ashr x y name b] creates a
     [%name = ashr %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateAShr]. *)
 val build_ashr : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_and x y name b] creates a
     [%name = and %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateAnd]. *)
 val build_and : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_or x y name b] creates a
     [%name = or %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateOr]. *)
 val build_or : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_xor x y name b] creates a
     [%name = xor %x, %y]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateXor]. *)
 val build_xor : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_neg x name b] creates a
     [%name = sub 0, %x]
     instruction at the position specified by the instruction builder [b].
@@ -2007,7 +2086,6 @@ val build_xor : llvalue -> llvalue -> string -> llbuilder -> llvalue
     See the method [llvm::LLVMBuilder::CreateNeg]. *)
 val build_neg : llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_nsw_neg x name b] creates a
     [%name = nsw sub 0, %x]
     instruction at the position specified by the instruction builder [b].
@@ -2015,7 +2093,6 @@ val build_neg : llvalue -> string -> llbuilder -> llvalue
     See the method [llvm::LLVMBuilder::CreateNeg]. *)
 val build_nsw_neg : llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_nuw_neg x name b] creates a
     [%name = nuw sub 0, %x]
     instruction at the position specified by the instruction builder [b].
@@ -2023,7 +2100,6 @@ val build_nsw_neg : llvalue -> string -> llbuilder -> llvalue
     See the method [llvm::LLVMBuilder::CreateNeg]. *)
 val build_nuw_neg : llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_fneg x name b] creates a
     [%name = fsub 0, %x]
     instruction at the position specified by the instruction builder [b].
@@ -2031,7 +2107,6 @@ val build_nuw_neg : llvalue -> string -> llbuilder -> llvalue
     See the method [llvm::LLVMBuilder::CreateFNeg]. *)
 val build_fneg : llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_xor x name b] creates a
     [%name = xor %x, -1]
     instruction at the position specified by the instruction builder [b].
@@ -2040,7 +2115,6 @@ val build_fneg : llvalue -> string -> llbuilder -> llvalue
 val build_not : llvalue -> string -> llbuilder -> llvalue
 
 
-
 (** {7 Memory} *)
 
 (** [build_alloca ty name b] creates a
@@ -2049,7 +2123,6 @@ val build_not : llvalue -> string -> llbuilder -> llvalue
     See the method [llvm::LLVMBuilder::CreateAlloca]. *)
 val build_alloca : lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_array_alloca ty n name b] creates a
     [%name = alloca %ty, %n]
     instruction at the position specified by the instruction builder [b].
@@ -2063,13 +2136,19 @@ val build_array_alloca : lltype -> llvalue -> string -> llbuilder ->
     See the method [llvm::LLVMBuilder::CreateLoad]. *)
 val build_load : llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_store v p b] creates a
     [store %v, %p]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateStore]. *)
 val build_store : llvalue -> llvalue -> llbuilder -> llvalue
 
+(** [build_atomicrmw op ptr val o st b] creates an [atomicrmw] instruction with
+    operation [op] performed on pointer [ptr] and value [val] with ordering [o]
+    and singlethread flag set to [st] at the position specified by
+    the instruction builder [b].
+    See the method [llvm::IRBuilder::CreateAtomicRMW]. *)
+val build_atomicrmw : AtomicRMWBinOp.t -> llvalue -> llvalue ->
+                      AtomicOrdering.t -> bool -> string -> llbuilder -> llvalue
 
 (** [build_gep p indices name b] creates a
     [%name = getelementptr %p, indices...]
@@ -2077,7 +2156,6 @@ val build_store : llvalue -> llvalue -> llbuilder -> llvalue
     See the method [llvm::LLVMBuilder::CreateGetElementPtr]. *)
 val build_gep : llvalue -> llvalue array -> string -> llbuilder -> llvalue
 
-
 (** [build_in_bounds_gep p indices name b] creates a
     [%name = gelementptr inbounds %p, indices...]
     instruction at the position specified by the instruction builder [b].
@@ -2097,7 +2175,6 @@ val build_struct_gep : llvalue -> int -> string -> llbuilder ->
     See the method [llvm::LLVMBuilder::CreateGlobalString]. *)
 val build_global_string : string -> string -> llbuilder -> llvalue
 
-
 (** [build_global_stringptr str name b] creates a series of instructions that
     adds a global string pointer at the position specified by the instruction
     builder [b].
@@ -2105,7 +2182,6 @@ val build_global_string : string -> string -> llbuilder -> llvalue
 val build_global_stringptr : string -> string -> llbuilder -> llvalue
 
 
-
 (** {7 Casts} *)
 
 (** [build_trunc v ty name b] creates a
@@ -2114,84 +2190,72 @@ val build_global_stringptr : string -> string -> llbuilder -> llvalue
     See the method [llvm::LLVMBuilder::CreateTrunc]. *)
 val build_trunc : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_zext v ty name b] creates a
     [%name = zext %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateZExt]. *)
 val build_zext : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_sext v ty name b] creates a
     [%name = sext %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateSExt]. *)
 val build_sext : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_fptoui v ty name b] creates a
     [%name = fptoui %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateFPToUI]. *)
 val build_fptoui : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_fptosi v ty name b] creates a
     [%name = fptosi %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateFPToSI]. *)
 val build_fptosi : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_uitofp v ty name b] creates a
     [%name = uitofp %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateUIToFP]. *)
 val build_uitofp : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_sitofp v ty name b] creates a
     [%name = sitofp %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateSIToFP]. *)
 val build_sitofp : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_fptrunc v ty name b] creates a
     [%name = fptrunc %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateFPTrunc]. *)
 val build_fptrunc : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_fpext v ty name b] creates a
     [%name = fpext %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateFPExt]. *)
 val build_fpext : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_ptrtoint v ty name b] creates a
     [%name = prtotint %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreatePtrToInt]. *)
 val build_ptrtoint : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_inttoptr v ty name b] creates a
     [%name = inttoptr %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateIntToPtr]. *)
 val build_inttoptr : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_bitcast v ty name b] creates a
     [%name = bitcast %p to %ty]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateBitCast]. *)
 val build_bitcast : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_zext_or_bitcast v ty name b] creates a zext or bitcast
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateZExtOrBitCast]. *)
@@ -2215,20 +2279,17 @@ val build_trunc_or_bitcast : llvalue -> lltype -> string -> llbuilder ->
     See the method [llvm::LLVMBuilder::CreatePointerCast]. *)
 val build_pointercast : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_intcast v ty name b] creates a zext, bitcast, or trunc
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateIntCast]. *)
 val build_intcast : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_fpcast v ty name b] creates a fpext, bitcast, or fptrunc
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateFPCast]. *)
 val build_fpcast : llvalue -> lltype -> string -> llbuilder -> llvalue
 
 
-
 (** {7 Comparisons} *)
 
 (** [build_icmp pred x y name b] creates a
@@ -2262,7 +2323,6 @@ val build_phi : (llvalue * llbasicblock) list -> string -> llbuilder ->
     See the method [llvm::LLVMBuilder::CreateCall]. *)
 val build_call : llvalue -> llvalue array -> string -> llbuilder -> llvalue
 
-
 (** [build_select cond thenv elsev name b] creates a
     [%name = select %cond, %thenv, %elsev]
     instruction at the position specified by the instruction builder [b].
@@ -2276,7 +2336,6 @@ val build_select : llvalue -> llvalue -> llvalue -> string -> llbuilder ->
     See the method [llvm::LLVMBuilder::CreateVAArg]. *)
 val build_va_arg : llvalue -> lltype -> string -> llbuilder -> llvalue
 
-
 (** [build_extractelement vec i name b] creates a
     [%name = extractelement %vec, %i]
     instruction at the position specified by the instruction builder [b].
@@ -2318,14 +2377,12 @@ val build_insertvalue : llvalue -> llvalue -> int -> string -> llbuilder ->
     See the method [llvm::LLVMBuilder::CreateIsNull]. *)
 val build_is_null : llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_is_not_null val name b] creates a
     [%name = icmp ne %val, null]
     instruction at the position specified by the instruction builder [b].
     See the method [llvm::LLVMBuilder::CreateIsNotNull]. *)
 val build_is_not_null : llvalue -> string -> llbuilder -> llvalue
 
-
 (** [build_ptrdiff lhs rhs name b] creates a series of instructions that measure
     the difference between two pointer values at the position specified by the
     instruction builder [b].
@@ -2333,7 +2390,6 @@ val build_is_not_null : llvalue -> string -> llbuilder -> llvalue
 val build_ptrdiff : llvalue -> llvalue -> string -> llbuilder -> llvalue
 
 
-
 (** {6 Memory buffers} *)
 
 module MemoryBuffer : sig
@@ -2342,9 +2398,16 @@ module MemoryBuffer : sig
       raised. *)
   val of_file : string -> llmemorybuffer
   
-  (** [stdin ()] is the memory buffer containing the contents of standard input.
+  (** [of_stdin ()] is the memory buffer containing the contents of standard input.
       If standard input is empty, then [IoError msg] is raised. *)
   val of_stdin : unit -> llmemorybuffer
+
+  (** [of_string ~name s] is the memory buffer containing the contents of string [s].
+      The name of memory buffer is set to [name] if it is provided. *)
+  val of_string : ?name:string -> string -> llmemorybuffer
+
+  (** [as_string mb] is the string containing the contents of memory buffer [mb]. *)
+  val as_string : llmemorybuffer -> string
   
   (** Disposes of a memory buffer. *)
   val dispose : llmemorybuffer -> unit
@@ -2371,14 +2434,12 @@ module PassManager : sig
       See the constructor of [llvm::FunctionPassManager]. *)
   val create_function : llmodule -> [ `Function ] t
 
-  
   (** [run_module m pm] initializes, executes on the module [m], and finalizes
       all of the passes scheduled in the pass manager [pm]. Returns [true] if
       any of the passes modified the module, [false] otherwise.
       See the [llvm::PassManager::run] method. *)
   val run_module : llmodule -> [ `Module ] t -> bool
 
-  
   (** [initialize fpm] initializes all of the function passes scheduled in the
       function pass manager [fpm]. Returns [true] if any of the passes modified
       the module, [false] otherwise.
@@ -2390,7 +2451,6 @@ module PassManager : sig
       of the passes modified [f], [false] otherwise.
       See the [llvm::FunctionPassManager::run] method. *)
   val run_function : llvalue -> [ `Function ] t -> bool
-
   
   (** [finalize fpm] finalizes all of the function passes scheduled in in the
       function pass manager [fpm]. Returns [true] if any of the passes
diff --git a/bindings/ocaml/llvm/llvm_ocaml.c b/bindings/ocaml/llvm/llvm_ocaml.c
index c984bd1..d5ebdcd 100644
--- a/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/bindings/ocaml/llvm/llvm_ocaml.c
@@ -1,4 +1,4 @@
-/*===-- llvm_ocaml.c - LLVM Ocaml Glue --------------------------*- C++ -*-===*\
+/*===-- llvm_ocaml.c - LLVM OCaml Glue --------------------------*- C++ -*-===*\
 |*                                                                            *|
 |*                     The LLVM Compiler Infrastructure                       *|
 |*                                                                            *|
@@ -7,7 +7,7 @@
 |*                                                                            *|
 |*===----------------------------------------------------------------------===*|
 |*                                                                            *|
-|* This file glues LLVM's ocaml interface to its C interface. These functions *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
 |* are by and large transparent wrappers to the corresponding C functions.    *|
 |*                                                                            *|
 |* Note that these functions intentionally take liberties with the CAMLparamX *|
@@ -33,6 +33,7 @@ static value llvm_ioerror_exn;
 CAMLprim value llvm_register_core_exns(value IoError) {
   llvm_ioerror_exn = Field(IoError, 0);
   register_global_root(&llvm_ioerror_exn);
+
   return Val_unit;
 }
 
@@ -50,6 +51,30 @@ static void llvm_raise(value Prototype, char *Message) {
 #endif
 }
 
+static value llvm_fatal_error_handler;
+
+static void llvm_fatal_error_trampoline(const char *Reason) {
+  callback(llvm_fatal_error_handler, copy_string(Reason));
+}
+
+CAMLprim value llvm_install_fatal_error_handler(value Handler) {
+  LLVMInstallFatalErrorHandler(llvm_fatal_error_trampoline);
+  llvm_fatal_error_handler = Handler;
+  caml_register_global_root(&llvm_fatal_error_handler);
+  return Val_unit;
+}
+
+CAMLprim value llvm_reset_fatal_error_handler(value Unit) {
+  caml_remove_global_root(&llvm_fatal_error_handler);
+  LLVMResetFatalErrorHandler();
+  return Val_unit;
+}
+
+CAMLprim value llvm_enable_pretty_stacktrace(value Unit) {
+  LLVMEnablePrettyStackTrace();
+  return Val_unit;
+}
+
 static value alloc_variant(int tag, void *Value) {
   value Iter = alloc_small(1, tag);
   Field(Iter, 0) = Val_op(Value);
@@ -158,6 +183,27 @@ CAMLprim value llvm_dump_module(LLVMModuleRef M) {
   return Val_unit;
 }
 
+/* string -> llmodule -> unit */
+CAMLprim value llvm_print_module(value Filename, LLVMModuleRef M) {
+  char* Message;
+  if(LLVMPrintModuleToFile(M, String_val(Filename), &Message)) {
+    llvm_raise(llvm_ioerror_exn, Message);
+  }
+
+  return Val_unit;
+}
+
+/* llmodule -> string */
+CAMLprim value llvm_string_of_llmodule(LLVMModuleRef M) {
+  char* ModuleCStr;
+  ModuleCStr = LLVMPrintModuleToString(M);
+
+  value ModuleStr = caml_copy_string(ModuleCStr);
+  LLVMDisposeMessage(ModuleCStr);
+
+  return ModuleStr;
+}
+
 /* llmodule -> string -> unit */
 CAMLprim value llvm_set_module_inline_asm(LLVMModuleRef M, value Asm) {
   LLVMSetModuleInlineAsm(M, String_val(Asm));
@@ -180,6 +226,23 @@ CAMLprim LLVMContextRef llvm_type_context(LLVMTypeRef Ty) {
   return LLVMGetTypeContext(Ty);
 }
 
+/* lltype -> unit */
+CAMLprim value llvm_dump_type(LLVMTypeRef Val) {
+  LLVMDumpType(Val);
+  return Val_unit;
+}
+
+/* lltype -> string */
+CAMLprim value llvm_string_of_lltype(LLVMTypeRef M) {
+  char* TypeCStr;
+  TypeCStr = LLVMPrintTypeToString(M);
+
+  value TypeStr = caml_copy_string(TypeCStr);
+  LLVMDisposeMessage(TypeCStr);
+
+  return TypeStr;
+}
+
 /*--... Operations on integer types ........................................--*/
 
 /* llcontext -> lltype */
@@ -244,11 +307,6 @@ CAMLprim LLVMTypeRef llvm_ppc_fp128_type(LLVMContextRef Context) {
   return LLVMPPCFP128TypeInContext(Context);
 }
 
-/* llcontext -> lltype */
-CAMLprim LLVMTypeRef llvm_x86mmx_type(LLVMContextRef Context) {
-  return LLVMX86MMXTypeInContext(Context);
-}
-
 /*--... Operations on function types .......................................--*/
 
 /* lltype -> lltype array -> lltype */
@@ -386,6 +444,11 @@ CAMLprim LLVMTypeRef llvm_label_type(LLVMContextRef Context) {
   return LLVMLabelTypeInContext(Context);
 }
 
+/* llcontext -> lltype */
+CAMLprim LLVMTypeRef llvm_x86_mmx_type(LLVMContextRef Context) {
+  return LLVMX86MMXTypeInContext(Context);
+}
+
 CAMLprim value llvm_type_by_name(LLVMModuleRef M, value Name)
 {
   CAMLparam1(Name);
@@ -416,6 +479,8 @@ enum ValueKind {
   BlockAddress,
   ConstantAggregateZero,
   ConstantArray,
+  ConstantDataArray,
+  ConstantDataVector,
   ConstantExpr,
   ConstantFP,
   ConstantInt,
@@ -441,6 +506,8 @@ CAMLprim value llvm_classify_value(LLVMValueRef Val) {
     DEFINE_CASE(Val, BlockAddress);
     DEFINE_CASE(Val, ConstantAggregateZero);
     DEFINE_CASE(Val, ConstantArray);
+    DEFINE_CASE(Val, ConstantDataArray);
+    DEFINE_CASE(Val, ConstantDataVector);
     DEFINE_CASE(Val, ConstantExpr);
     DEFINE_CASE(Val, ConstantFP);
     DEFINE_CASE(Val, ConstantInt);
@@ -485,6 +552,24 @@ CAMLprim value llvm_dump_value(LLVMValueRef Val) {
   return Val_unit;
 }
 
+/* llvalue -> string */
+CAMLprim value llvm_string_of_llvalue(LLVMValueRef M) {
+  char* ValueCStr;
+  ValueCStr = LLVMPrintValueToString(M);
+
+  value ValueStr = caml_copy_string(ValueCStr);
+  LLVMDisposeMessage(ValueCStr);
+
+  return ValueStr;
+}
+
+/* llvalue -> llvalue -> unit */
+CAMLprim value llvm_replace_all_uses_with(LLVMValueRef OldVal,
+                                          LLVMValueRef NewVal) {
+  LLVMReplaceAllUsesWith(OldVal, NewVal);
+  return Val_unit;
+}
+
 /*--... Operations on users ................................................--*/
 
 /* llvalue -> int -> llvalue */
@@ -590,14 +675,22 @@ CAMLprim value llvm_get_mdstring(LLVMValueRef V) {
   CAMLreturn(Val_int(0));
 }
 
-CAMLprim value llvm_get_namedmd(LLVMModuleRef M, value name)
+/* llmodule -> string -> llvalue array */
+CAMLprim value llvm_get_namedmd(LLVMModuleRef M, value Name)
 {
-  CAMLparam1(name);
+  CAMLparam1(Name);
   CAMLlocal1(Nodes);
-  Nodes = alloc(LLVMGetNamedMetadataNumOperands(M, String_val(name)), 0);
-  LLVMGetNamedMetadataOperands(M, String_val(name), (LLVMValueRef *) Nodes);
+  Nodes = alloc(LLVMGetNamedMetadataNumOperands(M, String_val(Name)), 0);
+  LLVMGetNamedMetadataOperands(M, String_val(Name), (LLVMValueRef *) Nodes);
   CAMLreturn(Nodes);
 }
+
+/* llmodule -> string -> llvalue -> unit */
+CAMLprim value llvm_append_namedmd(LLVMModuleRef M, value Name, LLVMValueRef Val) {
+  LLVMAddNamedMetadataOperand(M, String_val(Name), Val);
+  return Val_unit;
+}
+
 /*--... Operations on scalar constants .....................................--*/
 
 /* lltype -> int -> llvalue */
@@ -718,6 +811,12 @@ CAMLprim LLVMValueRef llvm_const_in_bounds_gep(LLVMValueRef ConstantVal,
                               Wosize_val(Indices));
 }
 
+/* llvalue -> lltype -> is_signed:bool -> llvalue */
+CAMLprim LLVMValueRef llvm_const_intcast(LLVMValueRef CV, LLVMTypeRef T,
+                                         value IsSigned) {
+  return LLVMConstIntCast(CV, T, Bool_val(IsSigned));
+}
+
 /* llvalue -> int array -> llvalue */
 CAMLprim LLVMValueRef llvm_const_extractvalue(LLVMValueRef Aggregate,
                                               value Indices) {
@@ -877,7 +976,8 @@ CAMLprim LLVMValueRef llvm_declare_qualified_global(LLVMTypeRef Ty, value Name,
                               LLVMPointerType(Ty, Int_val(AddressSpace)));
     return GlobalVar;
   }
-  return LLVMAddGlobal(M, Ty, String_val(Name));
+  return LLVMAddGlobalInAddressSpace(M, Ty, String_val(Name),
+                                     Int_val(AddressSpace));
 }
 
 /* string -> llmodule -> llvalue option */
@@ -945,6 +1045,30 @@ CAMLprim value llvm_set_thread_local(value IsThreadLocal,
   return Val_unit;
 }
 
+/* llvalue -> ThreadLocalMode.t */
+CAMLprim value llvm_thread_local_mode(LLVMValueRef GlobalVar) {
+  return Val_int(LLVMGetThreadLocalMode(GlobalVar));
+}
+
+/* ThreadLocalMode.t -> llvalue -> unit */
+CAMLprim value llvm_set_thread_local_mode(value ThreadLocalMode,
+                                          LLVMValueRef GlobalVar) {
+  LLVMSetThreadLocalMode(GlobalVar, Int_val(ThreadLocalMode));
+  return Val_unit;
+}
+
+/* llvalue -> bool */
+CAMLprim value llvm_is_externally_initialized(LLVMValueRef GlobalVar) {
+  return Val_bool(LLVMIsExternallyInitialized(GlobalVar));
+}
+
+/* bool -> llvalue -> unit */
+CAMLprim value llvm_set_externally_initialized(value IsExternallyInitialized,
+                                               LLVMValueRef GlobalVar) {
+  LLVMSetExternallyInitialized(GlobalVar, Bool_val(IsExternallyInitialized));
+  return Val_unit;
+}
+
 /* llvalue -> bool */
 CAMLprim value llvm_is_global_constant(LLVMValueRef GlobalVar) {
   return Val_bool(LLVMIsGlobalConstant(GlobalVar));
@@ -1051,6 +1175,13 @@ CAMLprim value llvm_add_function_attr(LLVMValueRef Arg, value PA) {
   return Val_unit;
 }
 
+/* llvalue -> string -> string -> unit */
+CAMLprim value llvm_add_target_dependent_function_attr(
+                  LLVMValueRef Arg, value A, value V) {
+  LLVMAddTargetDependentFunctionAttr(Arg, String_val(A), String_val(V));
+  return Val_unit;
+}
+
 /* llvalue -> int32 */
 CAMLprim value llvm_function_attr(LLVMValueRef Fn)
 {
@@ -1135,6 +1266,24 @@ CAMLprim value llvm_delete_block(LLVMBasicBlockRef BB) {
   return Val_unit;
 }
 
+/* llbasicblock -> unit */
+CAMLprim value llvm_remove_block(LLVMBasicBlockRef BB) {
+  LLVMRemoveBasicBlockFromParent(BB);
+  return Val_unit;
+}
+
+/* llbasicblock -> llbasicblock -> unit */
+CAMLprim value llvm_move_block_before(LLVMBasicBlockRef Pos, LLVMBasicBlockRef BB) {
+  LLVMMoveBasicBlockBefore(BB, Pos);
+  return Val_unit;
+}
+
+/* llbasicblock -> llbasicblock -> unit */
+CAMLprim value llvm_move_block_after(LLVMBasicBlockRef Pos, LLVMBasicBlockRef BB) {
+  LLVMMoveBasicBlockAfter(BB, Pos);
+  return Val_unit;
+}
+
 /* string -> llvalue -> llbasicblock */
 CAMLprim LLVMBasicBlockRef llvm_append_block(LLVMContextRef Context, value Name,
                                              LLVMValueRef Fn) {
@@ -1167,7 +1316,7 @@ CAMLprim value llvm_instr_get_opcode(LLVMValueRef Inst) {
   return Val_int(o);
 }
 
-/* llvalue -> ICmp.t */
+/* llvalue -> ICmp.t option */
 CAMLprim value llvm_instr_icmp_predicate(LLVMValueRef Val) {
   CAMLparam0();
   int x = LLVMGetICmpPredicate(Val);
@@ -1223,6 +1372,20 @@ CAMLprim value llvm_set_tail_call(value IsTailCall,
   return Val_unit;
 }
 
+/*--... Operations on load/store instructions (only)........................--*/
+
+/* llvalue -> bool */
+CAMLprim value llvm_is_volatile(LLVMValueRef MemoryInst) {
+  return Val_bool(LLVMGetVolatile(MemoryInst));
+}
+
+/* bool -> llvalue -> unit */
+CAMLprim value llvm_set_volatile(value IsVolatile,
+                                  LLVMValueRef MemoryInst) {
+  LLVMSetVolatile(MemoryInst, Bool_val(IsVolatile));
+  return Val_unit;
+}
+
 /*--... Operations on phi nodes ............................................--*/
 
 /* (llvalue * llbasicblock) -> llvalue -> unit */
@@ -1271,7 +1434,7 @@ static void llvm_finalize_builder(value B) {
 }
 
 static struct custom_operations builder_ops = {
-  (char *) "IRBuilder",
+  (char *) "LLVMIRBuilder",
   llvm_finalize_builder,
   custom_compare_default,
   custom_hash_default,
@@ -1694,6 +1857,24 @@ CAMLprim LLVMValueRef llvm_build_store(LLVMValueRef Value, LLVMValueRef Pointer,
   return LLVMBuildStore(Builder_val(B), Value, Pointer);
 }
 
+/* AtomicRMWBinOp.t -> llvalue -> llvalue -> AtomicOrdering.t ->
+   bool -> llbuilder -> llvalue */
+CAMLprim LLVMValueRef llvm_build_atomicrmw_native(value BinOp, LLVMValueRef Ptr,
+                                                  LLVMValueRef Val, value Ord,
+                                                  value ST, value Name, value B) {
+  LLVMValueRef Instr;
+  Instr = LLVMBuildAtomicRMW(Builder_val(B), Int_val(BinOp),
+                             Ptr, Val, Int_val(Ord), Bool_val(ST));
+  LLVMSetValueName(Instr, String_val(Name));
+  return Instr;
+}
+
+CAMLprim LLVMValueRef llvm_build_atomicrmw_bytecode(value *argv, int argn) {
+  return llvm_build_atomicrmw_native(argv[0], (LLVMValueRef) argv[1],
+                                     (LLVMValueRef) argv[2], argv[3],
+                                     argv[4], argv[5], argv[6]);
+}
+
 /* llvalue -> llvalue array -> string -> llbuilder -> llvalue */
 CAMLprim LLVMValueRef llvm_build_gep(LLVMValueRef Pointer, value Indices,
                                      value Name, value B) {
@@ -1960,7 +2141,6 @@ CAMLprim LLVMValueRef llvm_build_ptrdiff(LLVMValueRef LHS, LLVMValueRef RHS,
   return LLVMBuildPtrDiff(Builder_val(B), LHS, RHS, String_val(Name));
 }
 
-
 /*===-- Memory buffers ----------------------------------------------------===*/
 
 /* string -> llmemorybuffer
@@ -1989,6 +2169,30 @@ CAMLprim LLVMMemoryBufferRef llvm_memorybuffer_of_stdin(value Unit) {
   return MemBuf;
 }
 
+/* ?name:string -> string -> llmemorybuffer */
+CAMLprim LLVMMemoryBufferRef llvm_memorybuffer_of_string(value Name, value String) {
+  const char *NameCStr;
+  if(Name == Val_int(0))
+    NameCStr = "";
+  else
+    NameCStr = String_val(Field(Name, 0));
+
+  LLVMMemoryBufferRef MemBuf;
+  MemBuf = LLVMCreateMemoryBufferWithMemoryRangeCopy(
+                String_val(String), caml_string_length(String), NameCStr);
+
+  return MemBuf;
+}
+
+/* llmemorybuffer -> string */
+CAMLprim value llvm_memorybuffer_as_string(LLVMMemoryBufferRef MemBuf) {
+  value String = caml_alloc_string(LLVMGetBufferSize(MemBuf));
+  memcpy(String_val(String), LLVMGetBufferStart(MemBuf),
+         LLVMGetBufferSize(MemBuf));
+
+  return String;
+}
+
 /* llmemorybuffer -> unit */
 CAMLprim value llvm_memorybuffer_dispose(LLVMMemoryBufferRef MemBuf) {
   LLVMDisposeMemoryBuffer(MemBuf);
diff --git a/bindings/ocaml/target/llvm_target.ml b/bindings/ocaml/target/llvm_target.ml
index f4891e2..974bd49 100644
--- a/bindings/ocaml/target/llvm_target.ml
+++ b/bindings/ocaml/target/llvm_target.ml
@@ -1,4 +1,4 @@
-(*===-- llvm_target.ml - LLVM Ocaml Interface ------------------*- OCaml -*-===*
+(*===-- llvm_target.ml - LLVM OCaml Interface ------------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -13,30 +13,126 @@ module Endian = struct
   | Little
 end
 
+module CodeGenOptLevel = struct
+  type t =
+  | None
+  | Less
+  | Default
+  | Aggressive
+end
+
+module RelocMode = struct
+  type t =
+  | Default
+  | Static
+  | PIC
+  | DynamicNoPIC
+end
+
+module CodeModel = struct
+  type t =
+  | Default
+  | JITDefault
+  | Small
+  | Kernel
+  | Medium
+  | Large
+end
+
+module CodeGenFileType = struct
+  type t =
+  | AssemblyFile
+  | ObjectFile
+end
+
+exception Error of string
+
+external register_exns : exn -> unit = "llvm_register_target_exns"
+let _ = register_exns (Error "")
+
 module DataLayout = struct
   type t
 
-  external create : string -> t = "llvm_targetdata_create"
-  external add : t -> [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-               = "llvm_targetdata_add"
-  external as_string : t -> string = "llvm_targetdata_as_string"
-  external dispose : t -> unit = "llvm_targetdata_dispose"
-end
-
-external byte_order : DataLayout.t -> Endian.t = "llvm_byte_order"
-external pointer_size : DataLayout.t -> int = "llvm_pointer_size"
-external intptr_type : DataLayout.t -> Llvm.lltype = "LLVMIntPtrType"
-external size_in_bits : DataLayout.t -> Llvm.lltype -> Int64.t
-                      = "llvm_size_in_bits"
-external store_size : DataLayout.t -> Llvm.lltype -> Int64.t = "llvm_store_size"
-external abi_size : DataLayout.t -> Llvm.lltype -> Int64.t = "llvm_abi_size"
-external abi_align : DataLayout.t -> Llvm.lltype -> int = "llvm_abi_align"
-external stack_align : DataLayout.t -> Llvm.lltype -> int = "llvm_stack_align"
-external preferred_align : DataLayout.t -> Llvm.lltype -> int
-                         = "llvm_preferred_align"
-external preferred_align_of_global : DataLayout.t -> Llvm.llvalue -> int
-                                   = "llvm_preferred_align_of_global"
-external element_at_offset : DataLayout.t -> Llvm.lltype -> Int64.t -> int
-                           = "llvm_element_at_offset"
-external offset_of_element : DataLayout.t -> Llvm.lltype -> int -> Int64.t
-                           = "llvm_offset_of_element"
+  external of_string : string -> t = "llvm_datalayout_of_string"
+  external as_string : t -> string = "llvm_datalayout_as_string"
+  external add_to_pass_manager : [<Llvm.PassManager.any]
+                                 Llvm.PassManager.t -> t -> unit
+                               = "llvm_datalayout_add_to_pass_manager"
+  external byte_order : t -> Endian.t = "llvm_datalayout_byte_order"
+  external pointer_size : t -> int = "llvm_datalayout_pointer_size"
+  external intptr_type : Llvm.llcontext -> t -> Llvm.lltype
+                       = "llvm_datalayout_intptr_type"
+  external qualified_pointer_size : int -> t -> int
+                                  = "llvm_datalayout_qualified_pointer_size"
+  external qualified_intptr_type : Llvm.llcontext -> int -> t -> Llvm.lltype
+                                 = "llvm_datalayout_qualified_intptr_type"
+  external size_in_bits : Llvm.lltype -> t -> Int64.t
+                        = "llvm_datalayout_size_in_bits"
+  external store_size : Llvm.lltype -> t -> Int64.t
+                      = "llvm_datalayout_store_size"
+  external abi_size : Llvm.lltype -> t -> Int64.t
+                    = "llvm_datalayout_abi_size"
+  external abi_align : Llvm.lltype -> t -> int
+                     = "llvm_datalayout_abi_align"
+  external stack_align : Llvm.lltype -> t -> int
+                       = "llvm_datalayout_stack_align"
+  external preferred_align : Llvm.lltype -> t -> int
+                           = "llvm_datalayout_preferred_align"
+  external preferred_align_of_global : Llvm.llvalue -> t -> int
+                                   = "llvm_datalayout_preferred_align_of_global"
+  external element_at_offset : Llvm.lltype -> Int64.t -> t -> int
+                             = "llvm_datalayout_element_at_offset"
+  external offset_of_element : Llvm.lltype -> int -> t -> Int64.t
+                             = "llvm_datalayout_offset_of_element"
+end
+
+module Target = struct
+  type t
+
+  external default_triple : unit -> string = "llvm_target_default_triple"
+  external first : unit -> t option = "llvm_target_first"
+  external succ : t -> t option = "llvm_target_succ"
+  external by_name : string -> t option = "llvm_target_by_name"
+  external by_triple : string -> t = "llvm_target_by_triple"
+  external name : t -> string = "llvm_target_name"
+  external description : t -> string = "llvm_target_description"
+  external has_jit : t -> bool = "llvm_target_has_jit"
+  external has_target_machine : t -> bool = "llvm_target_has_target_machine"
+  external has_asm_backend : t -> bool = "llvm_target_has_asm_backend"
+
+  let all () =
+    let rec step elem lst =
+      match elem with
+      | Some target -> step (succ target) (target :: lst)
+      | None        -> lst
+    in
+    step (first ()) []
+end
+
+module TargetMachine = struct
+  type t
+
+  external create : triple:string -> ?cpu:string -> ?features:string ->
+                    ?level:CodeGenOptLevel.t -> ?reloc_mode:RelocMode.t ->
+                    ?code_model:CodeModel.t -> Target.t -> t
+                  = "llvm_create_targetmachine_bytecode"
+                    "llvm_create_targetmachine_native"
+  external target : t -> Target.t
+                  = "llvm_targetmachine_target"
+  external triple : t -> string
+                  = "llvm_targetmachine_triple"
+  external cpu : t -> string
+               = "llvm_targetmachine_cpu"
+  external features : t -> string
+                    = "llvm_targetmachine_features"
+  external data_layout : t -> DataLayout.t
+                       = "llvm_targetmachine_data_layout"
+  external set_verbose_asm : bool -> t -> unit
+                           = "llvm_targetmachine_set_verbose_asm"
+  external emit_to_file : Llvm.llmodule -> CodeGenFileType.t -> string ->
+                          t -> unit
+                        = "llvm_targetmachine_emit_to_file"
+  external emit_to_memory_buffer : Llvm.llmodule -> CodeGenFileType.t ->
+                                   t -> Llvm.llmemorybuffer
+                                 = "llvm_targetmachine_emit_to_memory_buffer"
+end
diff --git a/bindings/ocaml/target/llvm_target.mli b/bindings/ocaml/target/llvm_target.mli
index ab9c5e4..4f5e717 100644
--- a/bindings/ocaml/target/llvm_target.mli
+++ b/bindings/ocaml/target/llvm_target.mli
@@ -1,4 +1,4 @@
-(*===-- llvm_target.mli - LLVM Ocaml Interface -----------------*- OCaml -*-===*
+(*===-- llvm_target.mli - LLVM OCaml Interface -----------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -9,7 +9,7 @@
 
 (** Target Information.
 
-    This interface provides an ocaml API for LLVM target information,
+    This interface provides an OCaml API for LLVM target information,
     the classes in the Target library. *)
 
 module Endian : sig
@@ -18,78 +18,205 @@ module Endian : sig
   | Little
 end
 
+module CodeGenOptLevel : sig
+  type t =
+  | None
+  | Less
+  | Default
+  | Aggressive
+end
+
+module RelocMode : sig
+  type t =
+  | Default
+  | Static
+  | PIC
+  | DynamicNoPIC
+end
+
+module CodeModel : sig
+  type t =
+  | Default
+  | JITDefault
+  | Small
+  | Kernel
+  | Medium
+  | Large
+end
+
+module CodeGenFileType : sig
+  type t =
+  | AssemblyFile
+  | ObjectFile
+end
+
+(** {6 Exceptions} *)
+
+exception Error of string
+
+(** {6 Data Layout} *)
+
 module DataLayout : sig
   type t
 
-  (** [DataLayout.create rep] parses the target data string representation [rep].
-      See the constructor llvm::DataLayout::DataLayout. *)
-  external create : string -> t = "llvm_targetdata_create"
+  (** [of_string rep] parses the data layout string representation [rep].
+      See the constructor [llvm::DataLayout::DataLayout]. *)
+  val of_string : string -> t
+
+  (** [as_string dl] is the string representation of the data layout [dl].
+      See the method [llvm::DataLayout::getStringRepresentation]. *)
+  val as_string : t -> string
+
+  (** [add_to_pass_manager dl pm] adds the target data [dl] to
+      the pass manager [pm].
+      See the method [llvm::PassManagerBase::add]. *)
+  val add_to_pass_manager : [<Llvm.PassManager.any] Llvm.PassManager.t ->
+                            t -> unit
+
+  (** Returns the byte order of a target, either [Endian.Big] or
+      [Endian.Little].
+      See the method [llvm::DataLayout::isLittleEndian]. *)
+  val byte_order : t -> Endian.t
+
+  (** Returns the pointer size in bytes for a target.
+      See the method [llvm::DataLayout::getPointerSize]. *)
+  val pointer_size : t -> int
+
+  (** Returns the integer type that is the same size as a pointer on a target.
+      See the method [llvm::DataLayout::getIntPtrType]. *)
+  val intptr_type : Llvm.llcontext -> t -> Llvm.lltype
+
+  (** Returns the pointer size in bytes for a target in a given address space.
+      See the method [llvm::DataLayout::getPointerSize]. *)
+  val qualified_pointer_size : int -> t -> int
+
+  (** Returns the integer type that is the same size as a pointer on a target
+      in a given address space.
+      See the method [llvm::DataLayout::getIntPtrType]. *)
+  val qualified_intptr_type : Llvm.llcontext -> int -> t -> Llvm.lltype
+
+  (** Computes the size of a type in bits for a target.
+      See the method [llvm::DataLayout::getTypeSizeInBits]. *)
+  val size_in_bits : Llvm.lltype -> t -> Int64.t
+
+  (** Computes the storage size of a type in bytes for a target.
+      See the method [llvm::DataLayout::getTypeStoreSize]. *)
+  val store_size : Llvm.lltype -> t -> Int64.t
+
+  (** Computes the ABI size of a type in bytes for a target.
+      See the method [llvm::DataLayout::getTypeAllocSize]. *)
+  val abi_size : Llvm.lltype -> t -> Int64.t
+
+  (** Computes the ABI alignment of a type in bytes for a target.
+      See the method [llvm::DataLayout::getTypeABISize]. *)
+  val abi_align : Llvm.lltype -> t -> int
+
+  (** Computes the call frame alignment of a type in bytes for a target.
+      See the method [llvm::DataLayout::getTypeABISize]. *)
+  val stack_align : Llvm.lltype -> t -> int
+
+  (** Computes the preferred alignment of a type in bytes for a target.
+      See the method [llvm::DataLayout::getTypeABISize]. *)
+  val preferred_align : Llvm.lltype -> t -> int
+
+  (** Computes the preferred alignment of a global variable in bytes for
+      a target. See the method [llvm::DataLayout::getPreferredAlignment]. *)
+  val preferred_align_of_global : Llvm.llvalue -> t -> int
+
+  (** Computes the structure element that contains the byte offset for a target.
+      See the method [llvm::StructLayout::getElementContainingOffset]. *)
+  val element_at_offset : Llvm.lltype -> Int64.t -> t -> int
+
+  (** Computes the byte offset of the indexed struct element for a target.
+      See the method [llvm::StructLayout::getElementContainingOffset]. *)
+  val offset_of_element : Llvm.lltype -> int -> t -> Int64.t
+end
+
+(** {6 Target} *)
+
+module Target : sig
+  type t
+
+  (** [default_triple ()] returns the default target triple for current
+      platform. *)
+  val default_triple : unit -> string
 
-  (** [add_target_data td pm] adds the target data [td] to the pass manager [pm].
-      Does not take ownership of the target data.
-      See the method llvm::PassManagerBase::add. *)
-  external add : t -> [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
-               = "llvm_targetdata_add"
+  (** [first ()] returns the first target in the registered targets
+      list, or [None]. *)
+  val first : unit -> t option
 
-  (** [as_string td] is the string representation of the target data [td].
-      See the constructor llvm::DataLayout::DataLayout. *)
-  external as_string : t -> string = "llvm_targetdata_as_string"
+  (** [succ t] returns the next target after [t], or [None]
+      if [t] was the last target. *)
+  val succ : t -> t option
 
-  (** Deallocates a DataLayout.
-      See the destructor llvm::DataLayout::~DataLayout. *)
-  external dispose : t -> unit = "llvm_targetdata_dispose"
+  (** [all ()] returns a list of known targets. *)
+  val all : unit -> t list
+
+  (** [by_name name] returns [Some t] if a target [t] named [name] is
+      registered, or [None] otherwise. *)
+  val by_name : string -> t option
+
+  (** [by_triple triple] returns a target for a triple [triple], or raises
+      [Error] if [triple] does not correspond to a registered target. *)
+  val by_triple : string -> t
+
+  (** Returns the name of a target. See [llvm::Target::getName]. *)
+  val name : t -> string
+
+  (** Returns the description of a target.
+      See [llvm::Target::getDescription]. *)
+  val description : t -> string
+
+  (** Returns [true] if the target has a JIT. *)
+  val has_jit : t -> bool
+
+  (** Returns [true] if the target has a target machine associated. *)
+  val has_target_machine : t -> bool
+
+  (** Returns [true] if the target has an ASM backend (required for
+      emitting output). *)
+  val has_asm_backend : t -> bool
 end
 
-(** Returns the byte order of a target, either LLVMBigEndian or
-    LLVMLittleEndian.
-    See the method llvm::DataLayout::isLittleEndian. *)
-external byte_order : DataLayout.t -> Endian.t = "llvm_byte_order"
-
-(** Returns the pointer size in bytes for a target.
-    See the method llvm::DataLayout::getPointerSize. *)
-external pointer_size : DataLayout.t -> int = "llvm_pointer_size"
-
-(** Returns the integer type that is the same size as a pointer on a target.
-    See the method llvm::DataLayout::getIntPtrType. *)
-external intptr_type : DataLayout.t -> Llvm.lltype = "LLVMIntPtrType"
-
-(** Computes the size of a type in bytes for a target.
-    See the method llvm::DataLayout::getTypeSizeInBits. *)
-external size_in_bits : DataLayout.t -> Llvm.lltype -> Int64.t
-                      = "llvm_size_in_bits"
-
-(** Computes the storage size of a type in bytes for a target.
-    See the method llvm::DataLayout::getTypeStoreSize. *)
-external store_size : DataLayout.t -> Llvm.lltype -> Int64.t = "llvm_store_size"
-
-(** Computes the ABI size of a type in bytes for a target.
-    See the method llvm::DataLayout::getTypeAllocSize. *)
-external abi_size : DataLayout.t -> Llvm.lltype -> Int64.t = "llvm_abi_size"
-
-(** Computes the ABI alignment of a type in bytes for a target.
-    See the method llvm::DataLayout::getTypeABISize. *)
-external abi_align : DataLayout.t -> Llvm.lltype -> int = "llvm_abi_align"
-
-(** Computes the call frame alignment of a type in bytes for a target.
-    See the method llvm::DataLayout::getTypeABISize. *)
-external stack_align : DataLayout.t -> Llvm.lltype -> int = "llvm_stack_align"
-
-(** Computes the preferred alignment of a type in bytes for a target.
-    See the method llvm::DataLayout::getTypeABISize. *)
-external preferred_align : DataLayout.t -> Llvm.lltype -> int
-                         = "llvm_preferred_align"
-
-(** Computes the preferred alignment of a global variable in bytes for a target.
-    See the method llvm::DataLayout::getPreferredAlignment. *)
-external preferred_align_of_global : DataLayout.t -> Llvm.llvalue -> int
-                                   = "llvm_preferred_align_of_global"
-
-(** Computes the structure element that contains the byte offset for a target.
-    See the method llvm::StructLayout::getElementContainingOffset. *)
-external element_at_offset : DataLayout.t -> Llvm.lltype -> Int64.t -> int
-                           = "llvm_element_at_offset"
-
-(** Computes the byte offset of the indexed struct element for a target.
-    See the method llvm::StructLayout::getElementContainingOffset. *)
-external offset_of_element : DataLayout.t -> Llvm.lltype -> int -> Int64.t
-                           = "llvm_offset_of_element"
+(** {6 Target Machine} *)
+
+module TargetMachine : sig
+  type t
+
+  (** Creates a new target machine.
+      See [llvm::Target::createTargetMachine]. *)
+  val create : triple:string -> ?cpu:string -> ?features:string ->
+               ?level:CodeGenOptLevel.t -> ?reloc_mode:RelocMode.t ->
+               ?code_model:CodeModel.t -> Target.t -> t
+
+  (** Returns the Target used in a TargetMachine *)
+  val target : t -> Target.t
+
+  (** Returns the triple used while creating this target machine. See
+      [llvm::TargetMachine::getTriple]. *)
+  val triple : t -> string
+
+  (** Returns the CPU used while creating this target machine. See
+      [llvm::TargetMachine::getCPU]. *)
+  val cpu : t -> string
+
+  (** Returns the feature string used while creating this target machine. See
+      [llvm::TargetMachine::getFeatureString]. *)
+  val features : t -> string
+
+  (** Returns the data layout of this target machine. *)
+  val data_layout : t -> DataLayout.t
+
+  (** Sets the assembly verbosity of this target machine.
+      See [llvm::TargetMachine::setAsmVerbosity]. *)
+  val set_verbose_asm : bool -> t -> unit
+
+  (** Emits assembly or object data for the given module to the given
+      file or raise [Error]. *)
+  val emit_to_file : Llvm.llmodule -> CodeGenFileType.t -> string -> t -> unit
+
+  (** Emits assembly or object data for the given module to a fresh memory
+      buffer or raise [Error]. *)
+  val emit_to_memory_buffer : Llvm.llmodule -> CodeGenFileType.t -> t ->
+                              Llvm.llmemorybuffer
+end
diff --git a/bindings/ocaml/target/target_ocaml.c b/bindings/ocaml/target/target_ocaml.c
index 62fe789..9e8778a 100644
--- a/bindings/ocaml/target/target_ocaml.c
+++ b/bindings/ocaml/target/target_ocaml.c
@@ -1,4 +1,4 @@
-/*===-- target_ocaml.c - LLVM Ocaml Glue ------------------------*- C++ -*-===*\
+/*===-- target_ocaml.c - LLVM OCaml Glue ------------------------*- C++ -*-===*\
 |*                                                                            *|
 |*                     The LLVM Compiler Infrastructure                       *|
 |*                                                                            *|
@@ -7,7 +7,7 @@
 |*                                                                            *|
 |*===----------------------------------------------------------------------===*|
 |*                                                                            *|
-|* This file glues LLVM's ocaml interface to its C interface. These functions *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
 |* are by and large transparent wrappers to the corresponding C functions.    *|
 |*                                                                            *|
 |* Note that these functions intentionally take liberties with the CAMLparamX *|
@@ -16,87 +16,375 @@
 \*===----------------------------------------------------------------------===*/
 
 #include "llvm-c/Target.h"
+#include "llvm-c/TargetMachine.h"
 #include "caml/alloc.h"
+#include "caml/fail.h"
+#include "caml/memory.h"
+#include "caml/custom.h"
 
-/* string -> DataLayout.t */
-CAMLprim LLVMTargetDataRef llvm_targetdata_create(value StringRep) {
-  return LLVMCreateTargetData(String_val(StringRep));
-}
+/*===---- Exceptions ------------------------------------------------------===*/
+
+static value llvm_target_error_exn;
 
-/* DataLayout.t -> [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
-CAMLprim value llvm_targetdata_add(LLVMTargetDataRef TD, LLVMPassManagerRef PM){
-  LLVMAddTargetData(TD, PM);
+CAMLprim value llvm_register_target_exns(value Error) {
+  llvm_target_error_exn = Field(Error, 0);
+  register_global_root(&llvm_target_error_exn);
   return Val_unit;
 }
 
+static void llvm_raise(value Prototype, char *Message) {
+  CAMLparam1(Prototype);
+  CAMLlocal1(CamlMessage);
+
+  CamlMessage = copy_string(Message);
+  LLVMDisposeMessage(Message);
+
+  raise_with_arg(Prototype, CamlMessage);
+  abort(); /* NOTREACHED */
+#ifdef CAMLnoreturn
+  CAMLnoreturn; /* Silences warnings, but is missing in some versions. */
+#endif
+}
+
+static value llvm_string_of_message(char* Message) {
+  value String = caml_copy_string(Message);
+  LLVMDisposeMessage(Message);
+
+  return String;
+}
+
+/*===---- Data Layout -----------------------------------------------------===*/
+
+#define DataLayout_val(v)  (*(LLVMTargetDataRef *)(Data_custom_val(v)))
+
+static void llvm_finalize_data_layout(value DataLayout) {
+  LLVMDisposeTargetData(DataLayout_val(DataLayout));
+}
+
+static struct custom_operations llvm_data_layout_ops = {
+  (char *) "LLVMDataLayout",
+  llvm_finalize_data_layout,
+  custom_compare_default,
+  custom_hash_default,
+  custom_serialize_default,
+  custom_deserialize_default
+#ifdef custom_compare_ext_default
+  , custom_compare_ext_default
+#endif
+};
+
+value llvm_alloc_data_layout(LLVMTargetDataRef DataLayout) {
+  value V = alloc_custom(&llvm_data_layout_ops, sizeof(LLVMTargetDataRef),
+                         0, 1);
+  DataLayout_val(V) = DataLayout;
+  return V;
+}
+
+/* string -> DataLayout.t */
+CAMLprim value llvm_datalayout_of_string(value StringRep) {
+  return llvm_alloc_data_layout(LLVMCreateTargetData(String_val(StringRep)));
+}
+
 /* DataLayout.t -> string */
-CAMLprim value llvm_targetdata_as_string(LLVMTargetDataRef TD) {
-  char *StringRep = LLVMCopyStringRepOfTargetData(TD);
+CAMLprim value llvm_datalayout_as_string(value TD) {
+  char *StringRep = LLVMCopyStringRepOfTargetData(DataLayout_val(TD));
   value Copy = copy_string(StringRep);
   LLVMDisposeMessage(StringRep);
   return Copy;
 }
 
-/* DataLayout.t -> unit */
-CAMLprim value llvm_targetdata_dispose(LLVMTargetDataRef TD) {
-  LLVMDisposeTargetData(TD);
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> DataLayout.t -> unit */
+CAMLprim value llvm_datalayout_add_to_pass_manager(LLVMPassManagerRef PM,
+                                                   value DL) {
+  LLVMAddTargetData(DataLayout_val(DL), PM);
   return Val_unit;
 }
 
 /* DataLayout.t -> Endian.t */
-CAMLprim value llvm_byte_order(LLVMTargetDataRef TD) {
-  return Val_int(LLVMByteOrder(TD));
+CAMLprim value llvm_datalayout_byte_order(value DL) {
+  return Val_int(LLVMByteOrder(DataLayout_val(DL)));
 }
 
 /* DataLayout.t -> int */
-CAMLprim value llvm_pointer_size(LLVMTargetDataRef TD) {
-  return Val_int(LLVMPointerSize(TD));
+CAMLprim value llvm_datalayout_pointer_size(value DL) {
+  return Val_int(LLVMPointerSize(DataLayout_val(DL)));
+}
+
+/* Llvm.llcontext -> DataLayout.t -> Llvm.lltype */
+CAMLprim LLVMTypeRef llvm_datalayout_intptr_type(LLVMContextRef C, value DL) {
+  return LLVMIntPtrTypeInContext(C, DataLayout_val(DL));;
+}
+
+/* int -> DataLayout.t -> int */
+CAMLprim value llvm_datalayout_qualified_pointer_size(value AS, value DL) {
+  return Val_int(LLVMPointerSizeForAS(DataLayout_val(DL), Int_val(AS)));
+}
+
+/* Llvm.llcontext -> int -> DataLayout.t -> Llvm.lltype */
+CAMLprim LLVMTypeRef llvm_datalayout_qualified_intptr_type(LLVMContextRef C,
+                                                           value AS,
+                                                           value DL) {
+  return LLVMIntPtrTypeForASInContext(C, DataLayout_val(DL), Int_val(AS));
+}
+
+/* Llvm.lltype -> DataLayout.t -> Int64.t */
+CAMLprim value llvm_datalayout_size_in_bits(LLVMTypeRef Ty, value DL) {
+  return caml_copy_int64(LLVMSizeOfTypeInBits(DataLayout_val(DL), Ty));
+}
+
+/* Llvm.lltype -> DataLayout.t -> Int64.t */
+CAMLprim value llvm_datalayout_store_size(LLVMTypeRef Ty, value DL) {
+  return caml_copy_int64(LLVMStoreSizeOfType(DataLayout_val(DL), Ty));
+}
+
+/* Llvm.lltype -> DataLayout.t -> Int64.t */
+CAMLprim value llvm_datalayout_abi_size(LLVMTypeRef Ty, value DL) {
+  return caml_copy_int64(LLVMABISizeOfType(DataLayout_val(DL), Ty));
+}
+
+/* Llvm.lltype -> DataLayout.t -> int */
+CAMLprim value llvm_datalayout_abi_align(LLVMTypeRef Ty, value DL) {
+  return Val_int(LLVMABIAlignmentOfType(DataLayout_val(DL), Ty));
+}
+
+/* Llvm.lltype -> DataLayout.t -> int */
+CAMLprim value llvm_datalayout_stack_align(LLVMTypeRef Ty, value DL) {
+  return Val_int(LLVMCallFrameAlignmentOfType(DataLayout_val(DL), Ty));
+}
+
+/* Llvm.lltype -> DataLayout.t -> int */
+CAMLprim value llvm_datalayout_preferred_align(LLVMTypeRef Ty, value DL) {
+  return Val_int(LLVMPreferredAlignmentOfType(DataLayout_val(DL), Ty));
+}
+
+/* Llvm.llvalue -> DataLayout.t -> int */
+CAMLprim value llvm_datalayout_preferred_align_of_global(LLVMValueRef GlobalVar,
+                                                         value DL) {
+  return Val_int(LLVMPreferredAlignmentOfGlobal(DataLayout_val(DL), GlobalVar));
+}
+
+/* Llvm.lltype -> Int64.t -> DataLayout.t -> int */
+CAMLprim value llvm_datalayout_element_at_offset(LLVMTypeRef Ty, value Offset,
+                                                 value DL) {
+  return Val_int(LLVMElementAtOffset(DataLayout_val(DL), Ty,
+                                     Int64_val(Offset)));
+}
+
+/* Llvm.lltype -> int -> DataLayout.t -> Int64.t */
+CAMLprim value llvm_datalayout_offset_of_element(LLVMTypeRef Ty, value Index,
+                                                 value DL) {
+  return caml_copy_int64(LLVMOffsetOfElement(DataLayout_val(DL), Ty,
+                                             Int_val(Index)));
+}
+
+/*===---- Target ----------------------------------------------------------===*/
+
+static value llvm_target_option(LLVMTargetRef Target) {
+  if(Target != NULL) {
+    value Result = caml_alloc_small(1, 0);
+    Store_field(Result, 0, (value) Target);
+    return Result;
+  }
+
+  return Val_int(0);
+}
+
+/* unit -> string */
+CAMLprim value llvm_target_default_triple(value Unit) {
+  char *TripleCStr = LLVMGetDefaultTargetTriple();
+  value TripleStr = caml_copy_string(TripleCStr);
+  LLVMDisposeMessage(TripleCStr);
+
+  return TripleStr;
+}
+
+/* unit -> Target.t option */
+CAMLprim value llvm_target_first(value Unit) {
+  return llvm_target_option(LLVMGetFirstTarget());
+}
+
+/* Target.t -> Target.t option */
+CAMLprim value llvm_target_succ(LLVMTargetRef Target) {
+  return llvm_target_option(LLVMGetNextTarget(Target));
+}
+
+/* string -> Target.t option */
+CAMLprim value llvm_target_by_name(value Name) {
+  return llvm_target_option(LLVMGetTargetFromName(String_val(Name)));
+}
+
+/* string -> Target.t */
+CAMLprim LLVMTargetRef llvm_target_by_triple(value Triple) {
+  LLVMTargetRef T;
+  char *Error;
+
+  if(LLVMGetTargetFromTriple(String_val(Triple), &T, &Error))
+    llvm_raise(llvm_target_error_exn, Error);
+
+  return T;
+}
+
+/* Target.t -> string */
+CAMLprim value llvm_target_name(LLVMTargetRef Target) {
+  return caml_copy_string(LLVMGetTargetName(Target));
+}
+
+/* Target.t -> string */
+CAMLprim value llvm_target_description(LLVMTargetRef Target) {
+  return caml_copy_string(LLVMGetTargetDescription(Target));
+}
+
+/* Target.t -> bool */
+CAMLprim value llvm_target_has_jit(LLVMTargetRef Target) {
+  return Val_bool(LLVMTargetHasJIT(Target));
+}
+
+/* Target.t -> bool */
+CAMLprim value llvm_target_has_target_machine(LLVMTargetRef Target) {
+  return Val_bool(LLVMTargetHasTargetMachine(Target));
 }
 
-/* DataLayout.t -> Llvm.lltype -> Int64.t */
-CAMLprim value llvm_size_in_bits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
-  return caml_copy_int64(LLVMSizeOfTypeInBits(TD, Ty));
+/* Target.t -> bool */
+CAMLprim value llvm_target_has_asm_backend(LLVMTargetRef Target) {
+  return Val_bool(LLVMTargetHasAsmBackend(Target));
 }
 
-/* DataLayout.t -> Llvm.lltype -> Int64.t */
-CAMLprim value llvm_store_size(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
-  return caml_copy_int64(LLVMStoreSizeOfType(TD, Ty));
+/*===---- Target Machine --------------------------------------------------===*/
+
+#define TargetMachine_val(v)  (*(LLVMTargetMachineRef *)(Data_custom_val(v)))
+
+static void llvm_finalize_target_machine(value Machine) {
+  LLVMDisposeTargetMachine(TargetMachine_val(Machine));
+}
+
+static struct custom_operations llvm_target_machine_ops = {
+  (char *) "LLVMTargetMachine",
+  llvm_finalize_target_machine,
+  custom_compare_default,
+  custom_hash_default,
+  custom_serialize_default,
+  custom_deserialize_default
+#ifdef custom_compare_ext_default
+  , custom_compare_ext_default
+#endif
+};
+
+static value llvm_alloc_targetmachine(LLVMTargetMachineRef Machine) {
+  value V = alloc_custom(&llvm_target_machine_ops, sizeof(LLVMTargetMachineRef),
+                         0, 1);
+  TargetMachine_val(V) = Machine;
+  return V;
 }
 
-/* DataLayout.t -> Llvm.lltype -> Int64.t */
-CAMLprim value llvm_abi_size(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
-  return caml_copy_int64(LLVMABISizeOfType(TD, Ty));
+/* triple:string -> ?cpu:string -> ?features:string
+   ?level:CodeGenOptLevel.t -> ?reloc_mode:RelocMode.t
+   ?code_model:CodeModel.t -> Target.t -> TargetMachine.t */
+CAMLprim value llvm_create_targetmachine_native(value Triple, value CPU,
+                  value Features, value OptLevel, value RelocMode,
+                  value CodeModel, LLVMTargetRef Target) {
+  LLVMTargetMachineRef Machine;
+  const char *CPUStr = "", *FeaturesStr = "";
+  LLVMCodeGenOptLevel OptLevelEnum = LLVMCodeGenLevelDefault;
+  LLVMRelocMode RelocModeEnum = LLVMRelocDefault;
+  LLVMCodeModel CodeModelEnum = LLVMCodeModelDefault;
+
+  if(CPU != Val_int(0))
+    CPUStr = String_val(Field(CPU, 0));
+  if(Features != Val_int(0))
+    FeaturesStr = String_val(Field(Features, 0));
+  if(OptLevel != Val_int(0))
+    OptLevelEnum = Int_val(Field(OptLevel, 0));
+  if(RelocMode != Val_int(0))
+    RelocModeEnum = Int_val(Field(RelocMode, 0));
+  if(CodeModel != Val_int(0))
+    CodeModelEnum = Int_val(Field(CodeModel, 0));
+
+  Machine = LLVMCreateTargetMachine(Target, String_val(Triple), CPUStr,
+                FeaturesStr, OptLevelEnum, RelocModeEnum, CodeModelEnum);
+
+  return llvm_alloc_targetmachine(Machine);
 }
 
-/* DataLayout.t -> Llvm.lltype -> int */
-CAMLprim value llvm_abi_align(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
-  return Val_int(LLVMABIAlignmentOfType(TD, Ty));
+CAMLprim value llvm_create_targetmachine_bytecode(value *argv, int argn) {
+  return llvm_create_targetmachine_native(argv[0], argv[1], argv[2], argv[3],
+                                    argv[4], argv[5], (LLVMTargetRef) argv[6]);
 }
 
-/* DataLayout.t -> Llvm.lltype -> int */
-CAMLprim value llvm_stack_align(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
-  return Val_int(LLVMCallFrameAlignmentOfType(TD, Ty));
+/* TargetMachine.t -> Target.t */
+CAMLprim LLVMTargetRef llvm_targetmachine_target(value Machine) {
+  return LLVMGetTargetMachineTarget(TargetMachine_val(Machine));
 }
 
-/* DataLayout.t -> Llvm.lltype -> int */
-CAMLprim value llvm_preferred_align(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
-  return Val_int(LLVMPreferredAlignmentOfType(TD, Ty));
+/* TargetMachine.t -> string */
+CAMLprim value llvm_targetmachine_triple(value Machine) {
+  return llvm_string_of_message(LLVMGetTargetMachineTriple(
+                                TargetMachine_val(Machine)));
 }
 
-/* DataLayout.t -> Llvm.llvalue -> int */
-CAMLprim value llvm_preferred_align_of_global(LLVMTargetDataRef TD,
-                                              LLVMValueRef GlobalVar) {
-  return Val_int(LLVMPreferredAlignmentOfGlobal(TD, GlobalVar));
+/* TargetMachine.t -> string */
+CAMLprim value llvm_targetmachine_cpu(value Machine) {
+  return llvm_string_of_message(LLVMGetTargetMachineCPU(
+                                TargetMachine_val(Machine)));
 }
 
-/* DataLayout.t -> Llvm.lltype -> Int64.t -> int */
-CAMLprim value llvm_element_at_offset(LLVMTargetDataRef TD, LLVMTypeRef Ty,
-                                      value Offset) {
-  return Val_int(LLVMElementAtOffset(TD, Ty, Int_val(Offset)));
+/* TargetMachine.t -> string */
+CAMLprim value llvm_targetmachine_features(value Machine) {
+  return llvm_string_of_message(LLVMGetTargetMachineFeatureString(
+                                TargetMachine_val(Machine)));
 }
 
-/* DataLayout.t -> Llvm.lltype -> int -> Int64.t */
-CAMLprim value llvm_offset_of_element(LLVMTargetDataRef TD, LLVMTypeRef Ty,
-                                      value Index) {
-  return caml_copy_int64(LLVMOffsetOfElement(TD, Ty, Int_val(Index)));
+/* TargetMachine.t -> DataLayout.t */
+CAMLprim value llvm_targetmachine_data_layout(value Machine) {
+  CAMLparam1(Machine);
+  CAMLlocal1(DataLayout);
+
+  /* LLVMGetTargetMachineData returns a pointer owned by the TargetMachine,
+     so it is impossible to wrap it with llvm_alloc_target_data, which assumes
+     that OCaml owns the pointer. */
+  LLVMTargetDataRef OrigDataLayout;
+  OrigDataLayout = LLVMGetTargetMachineData(TargetMachine_val(Machine));
+
+  char* TargetDataCStr;
+  TargetDataCStr = LLVMCopyStringRepOfTargetData(OrigDataLayout);
+  DataLayout = llvm_alloc_data_layout(LLVMCreateTargetData(TargetDataCStr));
+  LLVMDisposeMessage(TargetDataCStr);
+
+  CAMLreturn(DataLayout);
+}
+
+/* TargetMachine.t -> bool -> unit */
+CAMLprim value llvm_targetmachine_set_verbose_asm(value Machine, value Verb) {
+  LLVMSetTargetMachineAsmVerbosity(TargetMachine_val(Machine), Bool_val(Verb));
+  return Val_unit;
+}
+
+/* Llvm.llmodule -> CodeGenFileType.t -> string -> TargetMachine.t -> unit */
+CAMLprim value llvm_targetmachine_emit_to_file(LLVMModuleRef Module,
+                            value FileType, value FileName, value Machine) {
+  char* ErrorMessage;
+
+  if(LLVMTargetMachineEmitToFile(TargetMachine_val(Machine), Module,
+                                 String_val(FileName), Int_val(FileType),
+                                 &ErrorMessage)) {
+    llvm_raise(llvm_target_error_exn, ErrorMessage);
+  }
+
+  return Val_unit;
+}
+
+/* Llvm.llmodule -> CodeGenFileType.t -> TargetMachine.t ->
+   Llvm.llmemorybuffer */
+CAMLprim LLVMMemoryBufferRef llvm_targetmachine_emit_to_memory_buffer(
+                                LLVMModuleRef Module, value FileType,
+                                value Machine) {
+  char* ErrorMessage;
+  LLVMMemoryBufferRef Buffer;
+
+  if(LLVMTargetMachineEmitToMemoryBuffer(TargetMachine_val(Machine), Module,
+                                         Int_val(FileType), &ErrorMessage,
+                                         &Buffer)) {
+    llvm_raise(llvm_target_error_exn, ErrorMessage);
+  }
+
+  return Buffer;
 }
diff --git a/bindings/ocaml/transforms/Makefile b/bindings/ocaml/transforms/Makefile
index 05fcd90..92c8396 100644
--- a/bindings/ocaml/transforms/Makefile
+++ b/bindings/ocaml/transforms/Makefile
@@ -8,7 +8,7 @@
 ##===----------------------------------------------------------------------===##
 
 LEVEL := ../../..
-DIRS = scalar ipo
+DIRS = scalar ipo vectorize passmgr_builder
 
 ocamldoc:
 	$(Verb) for i in $(DIRS) ; do \
diff --git a/bindings/ocaml/transforms/ipo/Makefile b/bindings/ocaml/transforms/ipo/Makefile
index 130d74c..ed67a7c 100644
--- a/bindings/ocaml/transforms/ipo/Makefile
+++ b/bindings/ocaml/transforms/ipo/Makefile
@@ -13,7 +13,6 @@
 
 LEVEL := ../../../..
 LIBRARYNAME := llvm_ipo
-DONT_BUILD_RELINKED := 1
 UsedComponents := ipo
 UsedOcamlInterfaces := llvm
 
diff --git a/bindings/ocaml/transforms/ipo/ipo_ocaml.c b/bindings/ocaml/transforms/ipo/ipo_ocaml.c
index 612015c..4ad8afb 100644
--- a/bindings/ocaml/transforms/ipo/ipo_ocaml.c
+++ b/bindings/ocaml/transforms/ipo/ipo_ocaml.c
@@ -1,4 +1,4 @@
-/*===-- ipo_ocaml.c - LLVM Ocaml Glue -------------------*- C++ -*-===*\
+/*===-- ipo_ocaml.c - LLVM OCaml Glue ---------------------------*- C++ -*-===*\
 |*                                                                            *|
 |*                     The LLVM Compiler Infrastructure                       *|
 |*                                                                            *|
@@ -7,7 +7,7 @@
 |*                                                                            *|
 |*===----------------------------------------------------------------------===*|
 |*                                                                            *|
-|* This file glues LLVM's ocaml interface to its C interface. These functions *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
 |* are by and large transparent wrappers to the corresponding C functions.    *|
 |*                                                                            *|
 |* Note that these functions intentionally take liberties with the CAMLparamX *|
@@ -50,6 +50,12 @@ CAMLprim value llvm_add_function_inlining(LLVMPassManagerRef PM) {
 }
 
 /* [`Module] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_always_inliner(LLVMPassManagerRef PM) {
+  LLVMAddAlwaysInlinerPass(PM);
+  return Val_unit;
+}
+
+/* [`Module] Llvm.PassManager.t -> unit */
 CAMLprim value llvm_add_always_inliner_pass(LLVMPassManagerRef PM) {
   LLVMAddAlwaysInlinerPass(PM);
   return Val_unit;
diff --git a/bindings/ocaml/transforms/ipo/llvm_ipo.ml b/bindings/ocaml/transforms/ipo/llvm_ipo.ml
index 1562d10..93f564a 100644
--- a/bindings/ocaml/transforms/ipo/llvm_ipo.ml
+++ b/bindings/ocaml/transforms/ipo/llvm_ipo.ml
@@ -1,4 +1,4 @@
-(*===-- llvm_ipo.mli - LLVM Ocaml Interface ------------*- OCaml -*-===*
+(*===-- llvm_ipo.ml - LLVM OCaml Interface --------------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -7,59 +7,31 @@
  *
  *===----------------------------------------------------------------------===*)
 
-(** IPO Transforms.
-
-    This interface provides an ocaml API for LLVM interprocedural optimizations, the
-    classes in the [LLVMIPO] library. *)
-
-(** See llvm::createAddArgumentPromotionPass *)
 external add_argument_promotion : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_argument_promotion"
-
-(** See llvm::createConstantMergePass function. *)
 external add_constant_merge : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_constant_merge"
-
-(**  See llvm::createDeadArgEliminationPass function. *)
 external add_dead_arg_elimination :
   [ | `Module ] Llvm.PassManager.t -> unit = "llvm_add_dead_arg_elimination"
-
-(**  See llvm::createFunctionAttrsPass function. *)
 external add_function_attrs : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_function_attrs"
-
-(**  See llvm::createFunctionInliningPass function. *)
 external add_function_inlining : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_function_inlining"
-
-(**  See llvm::createGlobalDCEPass function. *)
+external add_always_inliner : [ | `Module ] Llvm.PassManager.t -> unit =
+  "llvm_add_always_inliner"
 external add_global_dce : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_global_dce"
-
-(**  See llvm::createGlobalOptimizerPass function. *)
 external add_global_optimizer : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_global_optimizer"
-
-(**  See llvm::createIPConstantPropagationPass function. *)
 external add_ipc_propagation : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_ipc_propagation"
-
-(**  See llvm::createPruneEHPass function. *)
 external add_prune_eh : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_prune_eh"
-
-(**  See llvm::createIPSCCPPass function. *)
 external add_ipsccp : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_ipsccp"
-
-(**  See llvm::createInternalizePass function. *)
 external add_internalize : [ | `Module ] Llvm.PassManager.t -> bool -> unit =
   "llvm_add_internalize"
-
-(**  See llvm::createStripDeadPrototypesPass function. *)
 external add_strip_dead_prototypes :
   [ | `Module ] Llvm.PassManager.t -> unit = "llvm_add_strip_dead_prototypes"
-
-(**  See llvm::createStripSymbolsPass function. *)
 external add_strip_symbols : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_strip_symbols"
diff --git a/bindings/ocaml/transforms/ipo/llvm_ipo.mli b/bindings/ocaml/transforms/ipo/llvm_ipo.mli
index 636103d..1944c30 100644
--- a/bindings/ocaml/transforms/ipo/llvm_ipo.mli
+++ b/bindings/ocaml/transforms/ipo/llvm_ipo.mli
@@ -1,4 +1,4 @@
-(*===-- llvm_ipo.mli - LLVM Ocaml Interface ------------*- OCaml -*-===*
+(*===-- llvm_ipo.mli - LLVM OCaml Interface -------------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -9,13 +9,13 @@
 
 (** IPO Transforms.
 
-    This interface provides an ocaml API for LLVM interprocedural optimizations, the
+    This interface provides an OCaml API for LLVM interprocedural optimizations, the
     classes in the [LLVMIPO] library. *)
 
 (** See llvm::createAddArgumentPromotionPass *)
 external add_argument_promotion : [ | `Module ] Llvm.PassManager.t -> unit =
-
   "llvm_add_argument_promotion"
+
 (** See llvm::createConstantMergePass function. *)
 external add_constant_merge : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_constant_merge"
@@ -32,6 +32,10 @@ external add_function_attrs : [ | `Module ] Llvm.PassManager.t -> unit =
 external add_function_inlining : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_function_inlining"
 
+(**  See llvm::createAlwaysInlinerPass function. *)
+external add_always_inliner : [ | `Module ] Llvm.PassManager.t -> unit =
+  "llvm_add_always_inliner"
+
 (**  See llvm::createGlobalDCEPass function. *)
 external add_global_dce : [ | `Module ] Llvm.PassManager.t -> unit =
   "llvm_add_global_dce"
diff --git a/bindings/ocaml/transforms/passmgr_builder/Makefile b/bindings/ocaml/transforms/passmgr_builder/Makefile
new file mode 100644
index 0000000..54099db
--- /dev/null
+++ b/bindings/ocaml/transforms/passmgr_builder/Makefile
@@ -0,0 +1,19 @@
+##===- bindings/ocaml/transforms/passmgr_builder/Makefile --*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# This is the makefile for the Objective Caml Llvm_passmgr_builder interface.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../../../..
+LIBRARYNAME := llvm_passmgr_builder
+UsedComponents := ipo
+UsedOcamlInterfaces := llvm
+
+include ../../Makefile.ocaml
diff --git a/bindings/ocaml/transforms/passmgr_builder/llvm_passmgr_builder.ml b/bindings/ocaml/transforms/passmgr_builder/llvm_passmgr_builder.ml
new file mode 100644
index 0000000..60df446
--- /dev/null
+++ b/bindings/ocaml/transforms/passmgr_builder/llvm_passmgr_builder.ml
@@ -0,0 +1,32 @@
+(*===-- llvm_passmgr_builder.ml - LLVM OCaml Interface --------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+type t
+
+external create : unit -> t
+  = "llvm_pmbuilder_create"
+external set_opt_level : int -> t -> unit
+  = "llvm_pmbuilder_set_opt_level"
+external set_size_level : int -> t -> unit
+  = "llvm_pmbuilder_set_size_level"
+external set_disable_unit_at_a_time : bool -> t -> unit
+  = "llvm_pmbuilder_set_disable_unit_at_a_time"
+external set_disable_unroll_loops : bool -> t -> unit
+  = "llvm_pmbuilder_set_disable_unroll_loops"
+external use_inliner_with_threshold : int -> t -> unit
+  = "llvm_pmbuilder_use_inliner_with_threshold"
+external populate_function_pass_manager
+  : [ `Function ] Llvm.PassManager.t -> t -> unit
+  = "llvm_pmbuilder_populate_function_pass_manager"
+external populate_module_pass_manager
+  : [ `Module ] Llvm.PassManager.t -> t -> unit
+  = "llvm_pmbuilder_populate_module_pass_manager"
+external populate_lto_pass_manager
+  : [ `Module ] Llvm.PassManager.t -> internalize:bool -> run_inliner:bool -> t -> unit
+  = "llvm_pmbuilder_populate_lto_pass_manager"
+\ No newline at end of file
diff --git a/bindings/ocaml/transforms/passmgr_builder/llvm_passmgr_builder.mli b/bindings/ocaml/transforms/passmgr_builder/llvm_passmgr_builder.mli
new file mode 100644
index 0000000..66b0981
--- /dev/null
+++ b/bindings/ocaml/transforms/passmgr_builder/llvm_passmgr_builder.mli
@@ -0,0 +1,54 @@
+(*===-- llvm_passmgr_builder.mli - LLVM OCaml Interface -------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+(** Pass Manager Builder.
+
+    This interface provides an OCaml API for LLVM pass manager builder
+    from the [LLVMCore] library. *)
+
+type t
+
+(** See [llvm::PassManagerBuilder]. *)
+external create : unit -> t
+  = "llvm_pmbuilder_create"
+
+(** See [llvm::PassManagerBuilder::OptLevel]. *)
+external set_opt_level : int -> t -> unit
+  = "llvm_pmbuilder_set_opt_level"
+
+(** See [llvm::PassManagerBuilder::SizeLevel]. *)
+external set_size_level : int -> t -> unit
+  = "llvm_pmbuilder_set_size_level"
+
+(** See [llvm::PassManagerBuilder::DisableUnitAtATime]. *)
+external set_disable_unit_at_a_time : bool -> t -> unit
+  = "llvm_pmbuilder_set_disable_unit_at_a_time"
+
+(** See [llvm::PassManagerBuilder::DisableUnrollLoops]. *)
+external set_disable_unroll_loops : bool -> t -> unit
+  = "llvm_pmbuilder_set_disable_unroll_loops"
+
+(** See [llvm::PassManagerBuilder::Inliner]. *)
+external use_inliner_with_threshold : int -> t -> unit
+  = "llvm_pmbuilder_use_inliner_with_threshold"
+
+(** See [llvm::PassManagerBuilder::populateFunctionPassManager]. *)
+external populate_function_pass_manager
+  : [ `Function ] Llvm.PassManager.t -> t -> unit
+  = "llvm_pmbuilder_populate_function_pass_manager"
+
+(** See [llvm::PassManagerBuilder::populateModulePassManager]. *)
+external populate_module_pass_manager
+  : [ `Module ] Llvm.PassManager.t -> t -> unit
+  = "llvm_pmbuilder_populate_module_pass_manager"
+
+(** See [llvm::PassManagerBuilder::populateLTOPassManager]. *)
+external populate_lto_pass_manager
+  : [ `Module ] Llvm.PassManager.t -> internalize:bool -> run_inliner:bool -> t -> unit
+  = "llvm_pmbuilder_populate_lto_pass_manager"
+\ No newline at end of file
diff --git a/bindings/ocaml/transforms/passmgr_builder/passmgr_builder_ocaml.c b/bindings/ocaml/transforms/passmgr_builder/passmgr_builder_ocaml.c
new file mode 100644
index 0000000..a707856
--- /dev/null
+++ b/bindings/ocaml/transforms/passmgr_builder/passmgr_builder_ocaml.c
@@ -0,0 +1,113 @@
+/*===-- passmgr_builder_ocaml.c - LLVM OCaml Glue ---------------*- C++ -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
+|* are by and large transparent wrappers to the corresponding C functions.    *|
+|*                                                                            *|
+|* Note that these functions intentionally take liberties with the CAMLparamX *|
+|* macros, since most of the parameters are not GC heap objects.              *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c/Transforms/PassManagerBuilder.h"
+#include "caml/mlvalues.h"
+#include "caml/custom.h"
+#include "caml/misc.h"
+
+#define PMBuilder_val(v)  (*(LLVMPassManagerBuilderRef *)(Data_custom_val(v)))
+
+static void llvm_finalize_pmbuilder(value PMB) {
+  LLVMPassManagerBuilderDispose(PMBuilder_val(PMB));
+}
+
+static struct custom_operations pmbuilder_ops = {
+  (char *) "LLVMPassManagerBuilder",
+  llvm_finalize_pmbuilder,
+  custom_compare_default,
+  custom_hash_default,
+  custom_serialize_default,
+  custom_deserialize_default
+#ifdef custom_compare_ext_default
+  , custom_compare_ext_default
+#endif
+};
+
+static value alloc_pmbuilder(LLVMPassManagerBuilderRef Ref) {
+  value Val = alloc_custom(&pmbuilder_ops,
+                           sizeof(LLVMPassManagerBuilderRef), 0, 1);
+  PMBuilder_val(Val) = Ref;
+  return Val;
+}
+
+/* t -> unit */
+CAMLprim value llvm_pmbuilder_create(value Unit) {
+  return alloc_pmbuilder(LLVMPassManagerBuilderCreate());
+}
+
+/* int -> t -> unit */
+CAMLprim value llvm_pmbuilder_set_opt_level(value OptLevel, value PMB) {
+  LLVMPassManagerBuilderSetOptLevel(PMBuilder_val(PMB), Int_val(OptLevel));
+  return Val_unit;
+}
+
+/* int -> t -> unit */
+CAMLprim value llvm_pmbuilder_set_size_level(value SizeLevel, value PMB) {
+  LLVMPassManagerBuilderSetSizeLevel(PMBuilder_val(PMB), Int_val(SizeLevel));
+  return Val_unit;
+}
+
+/* int -> t -> unit */
+CAMLprim value llvm_pmbuilder_use_inliner_with_threshold(
+                      value Threshold, value PMB) {
+  LLVMPassManagerBuilderSetOptLevel(PMBuilder_val(PMB), Int_val(Threshold));
+  return Val_unit;
+}
+
+/* bool -> t -> unit */
+CAMLprim value llvm_pmbuilder_set_disable_unit_at_a_time(
+                      value DisableUnitAtATime, value PMB) {
+  LLVMPassManagerBuilderSetDisableUnitAtATime(
+                      PMBuilder_val(PMB), Bool_val(DisableUnitAtATime));
+  return Val_unit;
+}
+
+/* bool -> t -> unit */
+CAMLprim value llvm_pmbuilder_set_disable_unroll_loops(
+                      value DisableUnroll, value PMB) {
+  LLVMPassManagerBuilderSetDisableUnrollLoops(
+                      PMBuilder_val(PMB), Bool_val(DisableUnroll));
+  return Val_unit;
+}
+
+/* [ `Function ] Llvm.PassManager.t -> t -> unit */
+CAMLprim value llvm_pmbuilder_populate_function_pass_manager(
+                      LLVMPassManagerRef PM, value PMB) {
+  LLVMPassManagerBuilderPopulateFunctionPassManager(
+                      PMBuilder_val(PMB), PM);
+  return Val_unit;
+}
+
+/* [ `Module ] Llvm.PassManager.t -> t -> unit */
+CAMLprim value llvm_pmbuilder_populate_module_pass_manager(
+                      LLVMPassManagerRef PM, value PMB) {
+  LLVMPassManagerBuilderPopulateModulePassManager(
+                      PMBuilder_val(PMB), PM);
+  return Val_unit;
+}
+
+/* [ `Module ] Llvm.PassManager.t ->
+   internalize:bool -> run_inliner:bool -> t -> unit */
+CAMLprim value llvm_pmbuilder_populate_lto_pass_manager(
+                      LLVMPassManagerRef PM, value Internalize, value RunInliner,
+                      value PMB) {
+  LLVMPassManagerBuilderPopulateLTOPassManager(
+                      PMBuilder_val(PMB), PM,
+                      Bool_val(Internalize), Bool_val(RunInliner));
+  return Val_unit;
+}
diff --git a/bindings/ocaml/transforms/scalar/Makefile b/bindings/ocaml/transforms/scalar/Makefile
index cbaffa4..6e250f6 100644
--- a/bindings/ocaml/transforms/scalar/Makefile
+++ b/bindings/ocaml/transforms/scalar/Makefile
@@ -13,7 +13,6 @@
 
 LEVEL := ../../../..
 LIBRARYNAME := llvm_scalar_opts
-DONT_BUILD_RELINKED := 1
 UsedComponents := scalaropts
 UsedOcamlInterfaces := llvm
 
diff --git a/bindings/ocaml/transforms/scalar/llvm_scalar_opts.ml b/bindings/ocaml/transforms/scalar/llvm_scalar_opts.ml
index 93ab1de..958939d 100644
--- a/bindings/ocaml/transforms/scalar/llvm_scalar_opts.ml
+++ b/bindings/ocaml/transforms/scalar/llvm_scalar_opts.ml
@@ -1,4 +1,4 @@
-(*===-- llvm_scalar_opts.ml - LLVM Ocaml Interface -------------*- OCaml -*-===*
+(*===-- llvm_scalar_opts.ml - LLVM OCaml Interface -------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -109,3 +109,6 @@ external
 add_basic_alias_analysis : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
         = "llvm_add_basic_alias_analysis"
 
+external
+add_partially_inline_lib_calls : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+        = "llvm_add_partially_inline_lib_calls"
diff --git a/bindings/ocaml/transforms/scalar/llvm_scalar_opts.mli b/bindings/ocaml/transforms/scalar/llvm_scalar_opts.mli
index 121b376..d69abe2 100644
--- a/bindings/ocaml/transforms/scalar/llvm_scalar_opts.mli
+++ b/bindings/ocaml/transforms/scalar/llvm_scalar_opts.mli
@@ -1,4 +1,4 @@
-(*===-- llvm_scalar_opts.mli - LLVM Ocaml Interface ------------*- OCaml -*-===*
+(*===-- llvm_scalar_opts.mli - LLVM OCaml Interface ------------*- OCaml -*-===*
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -9,7 +9,7 @@
 
 (** Scalar Transforms.
 
-    This interface provides an ocaml API for LLVM scalar transforms, the
+    This interface provides an OCaml API for LLVM scalar transforms, the
     classes in the [LLVMScalarOpts] library. *)
 
 (** See the [llvm::createConstantPropogationPass] function. *)
@@ -162,3 +162,7 @@ external
 add_basic_alias_analysis : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
         = "llvm_add_basic_alias_analysis"
 
+(** See the [llvm::createPartiallyInlineLibCallsPass] function. *)
+external
+add_partially_inline_lib_calls : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+        = "llvm_add_partially_inline_lib_calls"
diff --git a/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c b/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
index 7047ec0..0a71bd7 100644
--- a/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
+++ b/bindings/ocaml/transforms/scalar/scalar_opts_ocaml.c
@@ -1,4 +1,4 @@
-/*===-- scalar_opts_ocaml.c - LLVM Ocaml Glue -------------------*- C++ -*-===*\
+/*===-- scalar_opts_ocaml.c - LLVM OCaml Glue -------------------*- C++ -*-===*\
 |*                                                                            *|
 |*                     The LLVM Compiler Infrastructure                       *|
 |*                                                                            *|
@@ -7,7 +7,7 @@
 |*                                                                            *|
 |*===----------------------------------------------------------------------===*|
 |*                                                                            *|
-|* This file glues LLVM's ocaml interface to its C interface. These functions *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
 |* are by and large transparent wrappers to the corresponding C functions.    *|
 |*                                                                            *|
 |* Note that these functions intentionally take liberties with the CAMLparamX *|
@@ -199,3 +199,9 @@ CAMLprim value llvm_add_basic_alias_analysis(LLVMPassManagerRef PM) {
   LLVMAddBasicAliasAnalysisPass(PM);
   return Val_unit;
 }
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_partially_inline_lib_calls(LLVMPassManagerRef PM) {
+  LLVMAddPartiallyInlineLibCallsPass(PM);
+  return Val_unit;
+}
diff --git a/bindings/ocaml/transforms/vectorize/Makefile b/bindings/ocaml/transforms/vectorize/Makefile
new file mode 100644
index 0000000..5a854d1
--- /dev/null
+++ b/bindings/ocaml/transforms/vectorize/Makefile
@@ -0,0 +1,19 @@
+##===- bindings/ocaml/transforms/vectorize/Makefile --------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+#
+# This is the makefile for the Objective Caml Llvm_vectorize_opts interface.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../../../..
+LIBRARYNAME := llvm_vectorize
+UsedComponents := vectorize
+UsedOcamlInterfaces := llvm
+
+include ../../Makefile.ocaml
diff --git a/bindings/ocaml/transforms/vectorize/llvm_vectorize.ml b/bindings/ocaml/transforms/vectorize/llvm_vectorize.ml
new file mode 100644
index 0000000..4fc53c6
--- /dev/null
+++ b/bindings/ocaml/transforms/vectorize/llvm_vectorize.ml
@@ -0,0 +1,15 @@
+(*===-- llvm_vectorize.ml - LLVM OCaml Interface --------------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+external add_bb_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+                          = "llvm_add_bb_vectorize"
+external add_loop_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+                            = "llvm_add_loop_vectorize"
+external add_slp_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+                           = "llvm_add_slp_vectorize"
diff --git a/bindings/ocaml/transforms/vectorize/llvm_vectorize.mli b/bindings/ocaml/transforms/vectorize/llvm_vectorize.mli
new file mode 100644
index 0000000..0253039
--- /dev/null
+++ b/bindings/ocaml/transforms/vectorize/llvm_vectorize.mli
@@ -0,0 +1,25 @@
+(*===-- llvm_vectorize.mli - LLVM OCaml Interface -------------*- OCaml -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*)
+
+(** Vectorize Transforms.
+
+    This interface provides an OCaml API for LLVM vectorize transforms, the
+    classes in the [LLVMVectorize] library. *)
+
+(** See the [llvm::createBBVectorizePass] function. *)
+external add_bb_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+                          = "llvm_add_bb_vectorize"
+
+(** See the [llvm::createLoopVectorizePass] function. *)
+external add_loop_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+                            = "llvm_add_loop_vectorize"
+
+(** See [llvm::createSLPVectorizerPass] function. *)
+external add_slp_vectorize : [<Llvm.PassManager.any] Llvm.PassManager.t -> unit
+                           = "llvm_add_slp_vectorize"
diff --git a/bindings/ocaml/transforms/vectorize/vectorize_ocaml.c b/bindings/ocaml/transforms/vectorize/vectorize_ocaml.c
new file mode 100644
index 0000000..1c81049
--- /dev/null
+++ b/bindings/ocaml/transforms/vectorize/vectorize_ocaml.c
@@ -0,0 +1,38 @@
+/*===-- vectorize_ocaml.c - LLVM OCaml Glue ---------------------*- C++ -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file glues LLVM's OCaml interface to its C interface. These functions *|
+|* are by and large transparent wrappers to the corresponding C functions.    *|
+|*                                                                            *|
+|* Note that these functions intentionally take liberties with the CAMLparamX *|
+|* macros, since most of the parameters are not GC heap objects.              *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c/Transforms/Vectorize.h"
+#include "caml/mlvalues.h"
+#include "caml/misc.h"
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_bb_vectorize(LLVMPassManagerRef PM) {
+  LLVMAddBBVectorizePass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_loop_vectorize(LLVMPassManagerRef PM) {
+  LLVMAddLoopVectorizePass(PM);
+  return Val_unit;
+}
+
+/* [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+CAMLprim value llvm_add_slp_vectorize(LLVMPassManagerRef PM) {
+  LLVMAddSLPVectorizePass(PM);
+  return Val_unit;
+}
diff --git a/bindings/python/llvm/bit_reader.py b/bindings/python/llvm/bit_reader.py
new file mode 100644
index 0000000..5bf5e22
--- /dev/null
+++ b/bindings/python/llvm/bit_reader.py
@@ -0,0 +1,31 @@
+
+from .common import LLVMObject
+from .common import c_object_p
+from .common import get_library
+from . import enumerations
+from .core import MemoryBuffer
+from .core import Module
+from .core import OpCode
+from ctypes import POINTER
+from ctypes import byref
+from ctypes import c_char_p
+from ctypes import cast
+__all__ = ['parse_bitcode']
+lib = get_library()
+
+def parse_bitcode(mem_buffer):
+    """Input is .core.MemoryBuffer"""
+    module = c_object_p()
+    out = c_char_p(None)
+    result = lib.LLVMParseBitcode(mem_buffer, byref(module), byref(out))
+    if result:
+        raise RuntimeError('LLVM Error: %s' % out.value)
+    m = Module(module)
+    m.take_ownership(mem_buffer)
+    return m
+
+def register_library(library):
+    library.LLVMParseBitcode.argtypes = [MemoryBuffer, POINTER(c_object_p), POINTER(c_char_p)]
+    library.LLVMParseBitcode.restype = bool
+
+register_library(lib)
diff --git a/bindings/python/llvm/core.py b/bindings/python/llvm/core.py
index 6756637..ebf57d4 100644
--- a/bindings/python/llvm/core.py
+++ b/bindings/python/llvm/core.py
@@ -16,10 +16,19 @@ from . import enumerations
 from ctypes import POINTER
 from ctypes import byref
 from ctypes import c_char_p
+from ctypes import c_uint
 
 __all__ = [
     "lib",
+    "OpCode",
     "MemoryBuffer",
+    "Module",
+    "Value",
+    "Function",
+    "BasicBlock",
+    "Instruction",
+    "Context",
+    "PassRegistry"
 ]
 
 lib = get_library()
@@ -83,16 +92,449 @@ class MemoryBuffer(LLVMObject):
 
         LLVMObject.__init__(self, memory, disposer=lib.LLVMDisposeMemoryBuffer)
 
+    def __len__(self):
+        return lib.LLVMGetBufferSize(self)
+
+class Value(LLVMObject):
+    
+    def __init__(self, value):
+        LLVMObject.__init__(self, value)
+
+    @property
+    def name(self):
+        return lib.LLVMGetValueName(self)
+
+    def dump(self):
+        lib.LLVMDumpValue(self)
+    
+    def get_operand(self, i):
+        return Value(lib.LLVMGetOperand(self, i))
+    
+    def set_operand(self, i, v):
+        return lib.LLVMSetOperand(self, i, v)
+    
+    def __len__(self):
+        return lib.LLVMGetNumOperands(self)
+
+class Module(LLVMObject):
+    """Represents the top-level structure of an llvm program in an opaque object."""
+
+    def __init__(self, module, name=None, context=None):
+        LLVMObject.__init__(self, module, disposer=lib.LLVMDisposeModule)
+
+    @classmethod
+    def CreateWithName(cls, module_id):
+        m = Module(lib.LLVMModuleCreateWithName(module_id))
+        c = Context.GetGlobalContext().take_ownership(m)
+        return m
+
+    @property
+    def datalayout(self):
+        return lib.LLVMGetDataLayout(self)
+
+    @datalayout.setter
+    def datalayout(self, new_data_layout):
+        """new_data_layout is a string."""
+        lib.LLVMSetDataLayout(self, new_data_layout)
+
+    @property
+    def target(self):
+        return lib.LLVMGetTarget(self)
+
+    @target.setter
+    def target(self, new_target):
+        """new_target is a string."""
+        lib.LLVMSetTarget(self, new_target)
+
+    def dump(self):
+        lib.LLVMDumpModule(self)
+
+    class __function_iterator(object):
+        def __init__(self, module, reverse=False):
+            self.module = module
+            self.reverse = reverse
+            if self.reverse:
+                self.function = self.module.last
+            else:
+                self.function = self.module.first
+        
+        def __iter__(self):
+            return self
+        
+        def next(self):
+            if not isinstance(self.function, Function):
+                raise StopIteration("")
+            result = self.function
+            if self.reverse:
+                self.function = self.function.prev
+            else:
+                self.function = self.function.next
+            return result
+    
+    def __iter__(self):
+        return Module.__function_iterator(self)
+
+    def __reversed__(self):
+        return Module.__function_iterator(self, reverse=True)
+
+    @property
+    def first(self):
+        return Function(lib.LLVMGetFirstFunction(self))
+
+    @property
+    def last(self):
+        return Function(lib.LLVMGetLastFunction(self))
+
+    def print_module_to_file(self, filename):
+        out = c_char_p(None)
+        # Result is inverted so 0 means everything was ok.
+        result = lib.LLVMPrintModuleToFile(self, filename, byref(out))        
+        if result:
+            raise RuntimeError("LLVM Error: %s" % out.value)
+
+class Function(Value):
+
+    def __init__(self, value):
+        Value.__init__(self, value)
+    
+    @property
+    def next(self):
+        f = lib.LLVMGetNextFunction(self)
+        return f and Function(f)
+    
+    @property
+    def prev(self):
+        f = lib.LLVMGetPreviousFunction(self)
+        return f and Function(f)
+    
+    @property
+    def first(self):
+        b = lib.LLVMGetFirstBasicBlock(self)
+        return b and BasicBlock(b)
+
+    @property
+    def last(self):
+        b = lib.LLVMGetLastBasicBlock(self)
+        return b and BasicBlock(b)
+
+    class __bb_iterator(object):
+        def __init__(self, function, reverse=False):
+            self.function = function
+            self.reverse = reverse
+            if self.reverse:
+                self.bb = function.last
+            else:
+                self.bb = function.first
+        
+        def __iter__(self):
+            return self
+        
+        def next(self):
+            if not isinstance(self.bb, BasicBlock):
+                raise StopIteration("")
+            result = self.bb
+            if self.reverse:
+                self.bb = self.bb.prev
+            else:
+                self.bb = self.bb.next
+            return result
+    
+    def __iter__(self):
+        return Function.__bb_iterator(self)
+
+    def __reversed__(self):
+        return Function.__bb_iterator(self, reverse=True)
+    
+    def __len__(self):
+        return lib.LLVMCountBasicBlocks(self)
+
+class BasicBlock(LLVMObject):
+    
+    def __init__(self, value):
+        LLVMObject.__init__(self, value)
+
+    @property
+    def next(self):
+        b = lib.LLVMGetNextBasicBlock(self)
+        return b and BasicBlock(b)
+
+    @property
+    def prev(self):
+        b = lib.LLVMGetPreviousBasicBlock(self)
+        return b and BasicBlock(b)
+    
+    @property
+    def first(self):
+        i = lib.LLVMGetFirstInstruction(self)
+        return i and Instruction(i)
+
+    @property
+    def last(self):
+        i = lib.LLVMGetLastInstruction(self)
+        return i and Instruction(i)
+
+    def __as_value(self):
+        return Value(lib.LLVMBasicBlockAsValue(self))
+    
+    @property
+    def name(self):
+        return lib.LLVMGetValueName(self.__as_value())
+
+    def dump(self):
+        lib.LLVMDumpValue(self.__as_value())
+
+    def get_operand(self, i):
+        return Value(lib.LLVMGetOperand(self.__as_value(),
+                                        i))
+    
+    def set_operand(self, i, v):
+        return lib.LLVMSetOperand(self.__as_value(),
+                                  i, v)
+    
+    def __len__(self):
+        return lib.LLVMGetNumOperands(self.__as_value())
+
+    class __inst_iterator(object):
+        def __init__(self, bb, reverse=False):            
+            self.bb = bb
+            self.reverse = reverse
+            if self.reverse:
+                self.inst = self.bb.last
+            else:
+                self.inst = self.bb.first
+        
+        def __iter__(self):
+            return self
+        
+        def next(self):
+            if not isinstance(self.inst, Instruction):
+                raise StopIteration("")
+            result = self.inst
+            if self.reverse:
+                self.inst = self.inst.prev
+            else:
+                self.inst = self.inst.next
+            return result
+    
+    def __iter__(self):
+        return BasicBlock.__inst_iterator(self)
+
+    def __reversed__(self):
+        return BasicBlock.__inst_iterator(self, reverse=True)
+
+
+class Instruction(Value):
+
+    def __init__(self, value):
+        Value.__init__(self, value)
+
+    @property
+    def next(self):
+        i = lib.LLVMGetNextInstruction(self)
+        return i and Instruction(i)
+
+    @property
+    def prev(self):
+        i = lib.LLVMGetPreviousInstruction(self)
+        return i and Instruction(i)
+
+    @property
+    def opcode(self):
+        return OpCode.from_value(lib.LLVMGetInstructionOpcode(self))
+
+class Context(LLVMObject):
+
+    def __init__(self, context=None):
+        if context is None:
+            context = lib.LLVMContextCreate()
+            LLVMObject.__init__(self, context, disposer=lib.LLVMContextDispose)
+        else:
+            LLVMObject.__init__(self, context)
+
+    @classmethod
+    def GetGlobalContext(cls):
+        return Context(lib.LLVMGetGlobalContext())
+
+class PassRegistry(LLVMObject):
+    """Represents an opaque pass registry object."""
+
+    def __init__(self):
+        LLVMObject.__init__(self,
+                            lib.LLVMGetGlobalPassRegistry())
+
 def register_library(library):
+    # Initialization/Shutdown declarations.
+    library.LLVMInitializeCore.argtypes = [PassRegistry]
+    library.LLVMInitializeCore.restype = None
+
+    library.LLVMInitializeTransformUtils.argtypes = [PassRegistry]
+    library.LLVMInitializeTransformUtils.restype = None
+
+    library.LLVMInitializeScalarOpts.argtypes = [PassRegistry]
+    library.LLVMInitializeScalarOpts.restype = None
+
+    library.LLVMInitializeObjCARCOpts.argtypes = [PassRegistry]
+    library.LLVMInitializeObjCARCOpts.restype = None
+
+    library.LLVMInitializeVectorization.argtypes = [PassRegistry]
+    library.LLVMInitializeVectorization.restype = None
+
+    library.LLVMInitializeInstCombine.argtypes = [PassRegistry]
+    library.LLVMInitializeInstCombine.restype = None
+
+    library.LLVMInitializeIPO.argtypes = [PassRegistry]
+    library.LLVMInitializeIPO.restype = None
+
+    library.LLVMInitializeInstrumentation.argtypes = [PassRegistry]
+    library.LLVMInitializeInstrumentation.restype = None
+
+    library.LLVMInitializeAnalysis.argtypes = [PassRegistry]
+    library.LLVMInitializeAnalysis.restype = None
+
+    library.LLVMInitializeIPA.argtypes = [PassRegistry]
+    library.LLVMInitializeIPA.restype = None
+
+    library.LLVMInitializeCodeGen.argtypes = [PassRegistry]
+    library.LLVMInitializeCodeGen.restype = None
+
+    library.LLVMInitializeTarget.argtypes = [PassRegistry]
+    library.LLVMInitializeTarget.restype = None
+
+    library.LLVMShutdown.argtypes = []
+    library.LLVMShutdown.restype = None
+
+    # Pass Registry declarations.
+    library.LLVMGetGlobalPassRegistry.argtypes = []
+    library.LLVMGetGlobalPassRegistry.restype = c_object_p
+
+    # Context declarations.
+    library.LLVMContextCreate.argtypes = []
+    library.LLVMContextCreate.restype = c_object_p
+
+    library.LLVMContextDispose.argtypes = [Context]
+    library.LLVMContextDispose.restype = None
+
+    library.LLVMGetGlobalContext.argtypes = []
+    library.LLVMGetGlobalContext.restype = c_object_p
+
+    # Memory buffer declarations
     library.LLVMCreateMemoryBufferWithContentsOfFile.argtypes = [c_char_p,
             POINTER(c_object_p), POINTER(c_char_p)]
     library.LLVMCreateMemoryBufferWithContentsOfFile.restype = bool
 
+    library.LLVMGetBufferSize.argtypes = [MemoryBuffer]
+
     library.LLVMDisposeMemoryBuffer.argtypes = [MemoryBuffer]
 
+    # Module declarations
+    library.LLVMModuleCreateWithName.argtypes = [c_char_p]
+    library.LLVMModuleCreateWithName.restype = c_object_p
+
+    library.LLVMDisposeModule.argtypes = [Module]
+    library.LLVMDisposeModule.restype = None
+
+    library.LLVMGetDataLayout.argtypes = [Module]
+    library.LLVMGetDataLayout.restype = c_char_p
+
+    library.LLVMSetDataLayout.argtypes = [Module, c_char_p]
+    library.LLVMSetDataLayout.restype = None
+
+    library.LLVMGetTarget.argtypes = [Module]
+    library.LLVMGetTarget.restype = c_char_p
+
+    library.LLVMSetTarget.argtypes = [Module, c_char_p]
+    library.LLVMSetTarget.restype = None
+
+    library.LLVMDumpModule.argtypes = [Module]
+    library.LLVMDumpModule.restype = None
+
+    library.LLVMPrintModuleToFile.argtypes = [Module, c_char_p,
+                                              POINTER(c_char_p)]
+    library.LLVMPrintModuleToFile.restype = bool
+
+    library.LLVMGetFirstFunction.argtypes = [Module]
+    library.LLVMGetFirstFunction.restype = c_object_p
+
+    library.LLVMGetLastFunction.argtypes = [Module]
+    library.LLVMGetLastFunction.restype = c_object_p
+
+    library.LLVMGetNextFunction.argtypes = [Function]
+    library.LLVMGetNextFunction.restype = c_object_p
+
+    library.LLVMGetPreviousFunction.argtypes = [Function]
+    library.LLVMGetPreviousFunction.restype = c_object_p
+
+    # Value declarations.
+    library.LLVMGetValueName.argtypes = [Value]
+    library.LLVMGetValueName.restype = c_char_p
+
+    library.LLVMDumpValue.argtypes = [Value]
+    library.LLVMDumpValue.restype = None
+
+    library.LLVMGetOperand.argtypes = [Value, c_uint]
+    library.LLVMGetOperand.restype = c_object_p
+
+    library.LLVMSetOperand.argtypes = [Value, Value, c_uint]
+    library.LLVMSetOperand.restype = None
+
+    library.LLVMGetNumOperands.argtypes = [Value]
+    library.LLVMGetNumOperands.restype = c_uint
+
+    # Basic Block Declarations.
+    library.LLVMGetFirstBasicBlock.argtypes = [Function]
+    library.LLVMGetFirstBasicBlock.restype = c_object_p
+
+    library.LLVMGetLastBasicBlock.argtypes = [Function]
+    library.LLVMGetLastBasicBlock.restype = c_object_p
+
+    library.LLVMGetNextBasicBlock.argtypes = [BasicBlock]
+    library.LLVMGetNextBasicBlock.restype = c_object_p
+
+    library.LLVMGetPreviousBasicBlock.argtypes = [BasicBlock]
+    library.LLVMGetPreviousBasicBlock.restype = c_object_p
+
+    library.LLVMGetFirstInstruction.argtypes = [BasicBlock]
+    library.LLVMGetFirstInstruction.restype = c_object_p
+
+    library.LLVMGetLastInstruction.argtypes = [BasicBlock]
+    library.LLVMGetLastInstruction.restype = c_object_p
+
+    library.LLVMBasicBlockAsValue.argtypes = [BasicBlock]
+    library.LLVMBasicBlockAsValue.restype = c_object_p
+
+    library.LLVMCountBasicBlocks.argtypes = [Function]
+    library.LLVMCountBasicBlocks.restype = c_uint
+
+    # Instruction Declarations.
+    library.LLVMGetNextInstruction.argtypes = [Instruction]
+    library.LLVMGetNextInstruction.restype = c_object_p
+
+    library.LLVMGetPreviousInstruction.argtypes = [Instruction]
+    library.LLVMGetPreviousInstruction.restype = c_object_p
+
+    library.LLVMGetInstructionOpcode.argtypes = [Instruction]
+    library.LLVMGetInstructionOpcode.restype = c_uint
+
 def register_enumerations():
     for name, value in enumerations.OpCodes:
         OpCode.register(name, value)
 
+def initialize_llvm():
+    c = Context.GetGlobalContext()
+    p = PassRegistry()
+    lib.LLVMInitializeCore(p)
+    lib.LLVMInitializeTransformUtils(p)
+    lib.LLVMInitializeScalarOpts(p)
+    lib.LLVMInitializeObjCARCOpts(p)
+    lib.LLVMInitializeVectorization(p)
+    lib.LLVMInitializeInstCombine(p)
+    lib.LLVMInitializeIPO(p)
+    lib.LLVMInitializeInstrumentation(p)
+    lib.LLVMInitializeAnalysis(p)
+    lib.LLVMInitializeIPA(p)
+    lib.LLVMInitializeCodeGen(p)
+    lib.LLVMInitializeTarget(p)
+
 register_library(lib)
 register_enumerations()
+initialize_llvm()
diff --git a/bindings/python/llvm/disassembler.py b/bindings/python/llvm/disassembler.py
index dcef9ac..f2df275 100644
--- a/bindings/python/llvm/disassembler.py
+++ b/bindings/python/llvm/disassembler.py
@@ -10,7 +10,6 @@
 from ctypes import CFUNCTYPE
 from ctypes import POINTER
 from ctypes import addressof
-from ctypes import byref
 from ctypes import c_byte
 from ctypes import c_char_p
 from ctypes import c_int
@@ -34,6 +33,29 @@ callbacks = {}
 # Constants for set_options
 Option_UseMarkup = 1
 
+
+
+_initialized = False
+_targets = ['AArch64', 'ARM', 'Hexagon', 'MSP430', 'Mips', 'NVPTX', 'PowerPC', 'R600', 'Sparc', 'SystemZ', 'X86', 'XCore']
+def _ensure_initialized():
+    global _initialized
+    if not _initialized:
+        # Here one would want to call the functions
+        # LLVMInitializeAll{TargetInfo,TargetMC,Disassembler}s, but
+        # unfortunately they are only defined as static inline
+        # functions in the header files of llvm-c, so they don't exist
+        # as symbols in the shared library.
+        # So until that is fixed use this hack to initialize them all
+        for tgt in _targets:
+            for initializer in ("TargetInfo", "TargetMC", "Disassembler"):
+                try:
+                    f = getattr(lib, "LLVMInitialize" + tgt + initializer)
+                except AttributeError:
+                    continue
+                f()
+        _initialized = True
+
+
 class Disassembler(LLVMObject):
     """Represents a disassembler instance.
 
@@ -48,9 +70,12 @@ class Disassembler(LLVMObject):
         The triple argument is the triple to create the disassembler for. This
         is something like 'i386-apple-darwin9'.
         """
+
+        _ensure_initialized()
+
         ptr = lib.LLVMCreateDisasm(c_char_p(triple), c_void_p(None), c_int(0),
                 callbacks['op_info'](0), callbacks['symbol_lookup'](0))
-        if not ptr.contents:
+        if not ptr:
             raise Exception('Could not obtain disassembler for triple: %s' %
                             triple)
 
diff --git a/bindings/python/llvm/tests/base.py b/bindings/python/llvm/tests/base.py
index ff9eb2f..194f1a4 100644
--- a/bindings/python/llvm/tests/base.py
+++ b/bindings/python/llvm/tests/base.py
@@ -30,3 +30,9 @@ class TestBase(unittest.TestCase):
 
         raise Exception('No suitable test binaries available!')
     get_test_binary.__test__ = False
+
+    def get_test_file(self):
+        return os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_file")
+
+    def get_test_bc(self):
+        return os.path.join(os.path.dirname(os.path.abspath(__file__)), "test.bc")
diff --git a/bindings/python/llvm/tests/test.bc b/bindings/python/llvm/tests/test.bc
new file mode 100644
index 0000000..8d3d28f
--- /dev/null
+++ b/bindings/python/llvm/tests/test.bc
diff --git a/bindings/python/llvm/tests/test_bitreader.py b/bindings/python/llvm/tests/test_bitreader.py
new file mode 100644
index 0000000..d585009
--- /dev/null
+++ b/bindings/python/llvm/tests/test_bitreader.py
@@ -0,0 +1,15 @@
+from .base import TestBase
+from ..core import OpCode
+from ..core import MemoryBuffer
+from ..core import PassRegistry
+from ..core import Context
+from ..core import Module
+from ..bit_reader import parse_bitcode
+
+class TestBitReader(TestBase):
+
+    def test_parse_bitcode(self):
+        source = self.get_test_bc()
+        m = parse_bitcode(MemoryBuffer(filename=source))
+        print m.target
+        print m.datalayout
diff --git a/bindings/python/llvm/tests/test_core.py b/bindings/python/llvm/tests/test_core.py
index 545abc8..63f84c8 100644
--- a/bindings/python/llvm/tests/test_core.py
+++ b/bindings/python/llvm/tests/test_core.py
@@ -1,6 +1,10 @@
 from .base import TestBase
 from ..core import OpCode
 from ..core import MemoryBuffer
+from ..core import PassRegistry
+from ..core import Context
+from ..core import Module
+from ..bit_reader import parse_bitcode
 
 class TestCore(TestBase):
     def test_opcode(self):
@@ -13,7 +17,7 @@ class TestCore(TestBase):
         self.assertEqual(op, OpCode.Ret)
 
     def test_memory_buffer_create_from_file(self):
-        source = self.get_test_binary()
+        source = self.get_test_file()
 
         MemoryBuffer(filename=source)
 
@@ -21,3 +25,106 @@ class TestCore(TestBase):
         with self.assertRaises(Exception):
             MemoryBuffer(filename="/hopefully/this/path/doesnt/exist")
 
+    def test_memory_buffer_len(self):
+        source = self.get_test_file()
+        m = MemoryBuffer(filename=source)
+        self.assertEqual(len(m), 50)
+
+    def test_create_passregistry(self):
+        PassRegistry()
+
+    def test_create_context(self):
+        Context.GetGlobalContext()
+
+    def test_create_module_with_name(self):
+        # Make sure we can not create a module without a LLVMModuleRef.
+        with self.assertRaises(TypeError):
+            m = Module()
+        m = Module.CreateWithName("test-module")
+
+    def test_module_getset_datalayout(self):
+        m = Module.CreateWithName("test-module")
+        dl = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+        m.datalayout = dl
+        self.assertEqual(m.datalayout, dl)
+
+    def test_module_getset_target(self):
+        m = Module.CreateWithName("test-module")
+        target = "thumbv7-apple-ios5.0.0"
+        m.target = target
+        self.assertEqual(m.target, target)
+
+    def test_module_print_module_to_file(self):
+        m = Module.CreateWithName("test")
+        dl = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+        m.datalayout = dl
+        target = "thumbv7-apple-ios5.0.0"
+        m.target = target
+        m.print_module_to_file("test2.ll")
+    
+    def test_module_function_iteration(self):
+        m = parse_bitcode(MemoryBuffer(filename=self.get_test_bc()))
+        i = 0
+        functions = ["f", "f2", "f3", "f4", "f5", "f6", "g1", "g2", "h1", "h2",
+                     "h3"]
+        # Forward
+        for f in m:
+            self.assertEqual(f.name, functions[i])
+            f.dump()
+            i += 1
+        # Backwards
+        for f in reversed(m):
+            i -= 1
+            self.assertEqual(f.name, functions[i])
+            f.dump()
+
+    def test_function_basicblock_iteration(self):
+        m = parse_bitcode(MemoryBuffer(filename=self.get_test_bc()))
+        i = 0
+        
+        bb_list = ['b1', 'b2', 'end']
+        
+        f = m.first
+        while f.name != "f6":
+            f = f.next
+        
+        # Forward
+        for bb in f:
+            self.assertEqual(bb.name, bb_list[i])
+            bb.dump()
+            i += 1
+        
+        # Backwards
+        for bb in reversed(f):
+            i -= 1
+            self.assertEqual(bb.name, bb_list[i])
+            bb.dump()
+
+    def test_basicblock_instruction_iteration(self):
+        m = parse_bitcode(MemoryBuffer(filename=self.get_test_bc()))
+        i = 0
+        
+        inst_list = [('arg1', OpCode.ExtractValue),
+                     ('arg2', OpCode.ExtractValue),
+                     ('', OpCode.Call),
+                     ('', OpCode.Ret)]
+        
+        bb = m.first.first
+
+        # Forward
+        for inst in bb:
+            self.assertEqual(inst.name, inst_list[i][0])
+            self.assertEqual(inst.opcode, inst_list[i][1])
+            for op in range(len(inst)):
+                o = inst.get_operand(op)
+                print o.name
+                o.dump()
+            inst.dump()
+            i += 1
+
+        # Backwards
+        for inst in reversed(bb):
+            i -= 1
+            self.assertEqual(inst.name, inst_list[i][0])
+            self.assertEqual(inst.opcode, inst_list[i][1])
+            inst.dump()
diff --git a/bindings/python/llvm/tests/test_disassembler.py b/bindings/python/llvm/tests/test_disassembler.py
index 46d12f7..e960dc0 100644
--- a/bindings/python/llvm/tests/test_disassembler.py
+++ b/bindings/python/llvm/tests/test_disassembler.py
@@ -16,6 +16,10 @@ class TestDisassembler(TestBase):
         self.assertEqual(count, 3)
         self.assertEqual(s, '\tjcxz\t-127')
 
+    def test_nonexistant_triple(self):
+        with self.assertRaisesRegexp(Exception, "Could not obtain disassembler for triple"):
+            Disassembler("nonexistant-triple-raises")
+
     def test_get_instructions(self):
         sequence = '\x67\xe3\x81\x01\xc7' # jcxz -127; addl %eax, %edi
 
diff --git a/bindings/python/llvm/tests/test_file b/bindings/python/llvm/tests/test_file
new file mode 100644
index 0000000..6c9b038
--- /dev/null
+++ b/bindings/python/llvm/tests/test_file
@@ -0,0 +1 @@
+I,"�cAG�xq��Ԑ�d����vl��\�L>�g>`����`��wɩ
+\ No newline at end of file
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 0567820..dc991a2 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -74,12 +74,6 @@ check_symbol_exists(FE_INEXACT "fenv.h" HAVE_DECL_FE_INEXACT)
 check_include_file(mach/mach.h HAVE_MACH_MACH_H)
 check_include_file(mach-o/dyld.h HAVE_MACH_O_DYLD_H)
 
-check_include_file(curses.h HAVE_CURSES_H)
-check_include_file(ncurses.h HAVE_NCURSES_H)
-check_include_file(ncursesw.h HAVE_NCURSESW_H)
-check_include_file(ncurses/curses.h HAVE_NCURSES_CURSES_H)
-check_include_file(ncursesw/curses.h HAVE_NCURSESW_CURSES_H)
-
 # library checks
 if( NOT PURE_WINDOWS )
   check_library_exists(pthread pthread_create "" HAVE_LIBPTHREAD)
@@ -103,18 +97,19 @@ if( NOT PURE_WINDOWS )
   else()
     set(HAVE_LIBZ 0)
   endif()
-  if(LLVM_ENABLE_CURSES)
-    check_library_exists(curses has_colors "" HAVE_CURSES)
-    if(NOT HAVE_CURSES)
-      check_library_exists(ncurses has_colors "" HAVE_NCURSES)
-      set(HAVE_CURSES ${HAVE_NCURSES})
-      if(NOT HAVE_CURSES)
-        check_library_exists(ncursesw has_colors "" HAVE_NCURSESW)
-        set(HAVE_CURSES ${HAVE_NCURSESW})
+  if(LLVM_ENABLE_TERMINFO)
+    set(HAVE_TERMINFO 0)
+    foreach(library tinfo terminfo curses ncurses ncursesw)
+      string(TOUPPER ${library} library_suffix)
+      check_library_exists(${library} setupterm "" HAVE_TERMINFO_${library_suffix})
+      if(HAVE_TERMINFO_${library_suffix})
+        set(HAVE_TERMINFO 1)
+        set(TERMINFO_LIBS "${library}")
+        break()
       endif()
-    endif()
+    endforeach()
   else()
-    set(HAVE_CURSES 0)
+    set(HAVE_TERMINFO 0)
   endif()
 endif()
 
@@ -165,6 +160,7 @@ check_symbol_exists(gettimeofday sys/time.h HAVE_GETTIMEOFDAY)
 check_symbol_exists(getrlimit "sys/types.h;sys/time.h;sys/resource.h" HAVE_GETRLIMIT)
 check_symbol_exists(posix_spawn spawn.h HAVE_POSIX_SPAWN)
 check_symbol_exists(pread unistd.h HAVE_PREAD)
+check_symbol_exists(realpath stdlib.h HAVE_REALPATH)
 check_symbol_exists(sbrk unistd.h HAVE_SBRK)
 check_symbol_exists(srand48 stdlib.h HAVE_RAND48_SRAND48)
 if( HAVE_RAND48_SRAND48 )
@@ -304,6 +300,20 @@ endif()
 find_package(LibXml2)
 if (LIBXML2_FOUND)
   set(CLANG_HAVE_LIBXML 1)
+  # When cross-compiling, liblzma is not detected as a dependency for libxml2,
+  # which makes linking c-index-test fail. But for native builds, all libraries
+  # are installed and checked by CMake before Makefiles are generated and everything
+  # works according to the plan. However, if a -llzma is added to native builds,
+  # an additional requirement on the static liblzma.a is required, but will not
+  # be checked by CMake, breaking native compilation.
+  # Since this is only pertinent to cross-compilations, and there's no way CMake
+  # can check for every foreign library on every OS, we add the dep and warn the dev.
+  if ( CMAKE_CROSSCOMPILING )
+    if (NOT PC_LIBXML_VERSION VERSION_LESS "2.8.0")
+      message(STATUS "Adding LZMA as a dep to XML2 for cross-compilation, make sure liblzma.a is available.")
+      set(LIBXML2_LIBRARIES ${LIBXML2_LIBRARIES} "-llzma")
+    endif ()
+  endif ()
 endif ()
 
 include(CheckCXXCompilerFlag)
@@ -405,6 +415,7 @@ endif ()
 if( MINGW )
   set(HAVE_LIBIMAGEHLP 1)
   set(HAVE_LIBPSAPI 1)
+  set(HAVE_LIBSHELL32 1)
   # TODO: Check existence of libraries.
   #   include(CheckLibraryExists)
   #   CHECK_LIBRARY_EXISTS(imagehlp ??? . HAVE_LIBIMAGEHLP)
@@ -465,3 +476,25 @@ if (LLVM_ENABLE_ZLIB )
 endif()
 
 set(LLVM_PREFIX ${CMAKE_INSTALL_PREFIX})
+
+if (LLVM_ENABLE_DOXYGEN)
+  message(STATUS "Doxygen enabled.")
+  find_package(Doxygen)
+
+  if (DOXYGEN_FOUND)
+    # If we find doxygen and we want to enable doxygen by default create a
+    # global aggregate doxygen target for generating llvm and any/all
+    # subprojects doxygen documentation.
+    if (LLVM_BUILD_DOCS)
+      add_custom_target(doxygen ALL)
+    endif()
+
+    option(LLVM_DOXYGEN_EXTERNAL_SEARCH "Enable doxygen external search." OFF)
+    if (LLVM_DOXYGEN_EXTERNAL_SEARCH)
+      set(LLVM_DOXYGEN_SEARCHENGINE_URL "" CACHE STRING "URL to use for external searhc.")
+      set(LLVM_DOXYGEN_SEARCH_MAPPINGS "" CACHE STRING "Doxygen Search Mappings")
+    endif()
+  endif()
+else()
+  message(STATUS "Doxygen disabled.")
+endif()
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 094969c..34e4017 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -12,6 +12,11 @@ macro(add_llvm_library name)
 
   if( BUILD_SHARED_LIBS )
     llvm_config( ${name} ${LLVM_LINK_COMPONENTS} )
+    if (MSVC)
+      set_target_properties(${name}
+        PROPERTIES
+        IMPORT_SUFFIX ".imp")
+    endif ()
   endif()
 
   # Ensure that the system libraries always comes last on the
@@ -21,9 +26,11 @@ macro(add_llvm_library name)
   if( EXCLUDE_FROM_ALL )
     set_target_properties( ${name} PROPERTIES EXCLUDE_FROM_ALL ON)
   else()
-    install(TARGETS ${name}
-      LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-      ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+    if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "LTO")
+      install(TARGETS ${name}
+        LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+        ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+    endif()
   endif()
   set_target_properties(${name} PROPERTIES FOLDER "Libraries")
 
@@ -65,9 +72,11 @@ ${name} ignored.")
     if( EXCLUDE_FROM_ALL )
       set_target_properties( ${name} PROPERTIES EXCLUDE_FROM_ALL ON)
     else()
-      install(TARGETS ${name}
-        LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-        ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+      if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+        install(TARGETS ${name}
+          LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+          ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+      endif()
     endif()
   endif()
 
@@ -91,14 +100,23 @@ macro(add_llvm_executable name)
 endmacro(add_llvm_executable name)
 
 
+set (LLVM_TOOLCHAIN_TOOLS
+  llvm-ar
+  llvm-objdump
+  )
+
 macro(add_llvm_tool name)
   set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LLVM_TOOLS_BINARY_DIR})
   if( NOT LLVM_BUILD_TOOLS )
     set(EXCLUDE_FROM_ALL ON)
   endif()
   add_llvm_executable(${name} ${ARGN})
-  if( LLVM_BUILD_TOOLS )
-    install(TARGETS ${name} RUNTIME DESTINATION bin)
+
+  list(FIND LLVM_TOOLCHAIN_TOOLS ${name} LLVM_IS_${name}_TOOLCHAIN_TOOL)
+  if (LLVM_IS_${name}_TOOLCHAIN_TOOL GREATER -1 OR NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+    if( LLVM_BUILD_TOOLS )
+      install(TARGETS ${name} RUNTIME DESTINATION bin)
+    endif()
   endif()
   set_target_properties(${name} PROPERTIES FOLDER "Tools")
 endmacro(add_llvm_tool name)
@@ -141,6 +159,7 @@ macro(add_llvm_external_project name)
   if("${add_llvm_external_dir}" STREQUAL "")
     set(add_llvm_external_dir ${name})
   endif()
+  list(APPEND LLVM_IMPLICIT_PROJECT_IGNORE "${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir}")
   string(REPLACE "-" "_" nameUNDERSCORE ${name})
   string(TOUPPER ${nameUNDERSCORE} nameUPPER)
   set(LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir}"
@@ -155,6 +174,34 @@ macro(add_llvm_external_project name)
   endif()
 endmacro(add_llvm_external_project)
 
+macro(add_llvm_tool_subdirectory name)
+  list(APPEND LLVM_IMPLICIT_PROJECT_IGNORE "${CMAKE_CURRENT_SOURCE_DIR}/${name}")
+  add_subdirectory(${name})
+endmacro(add_llvm_tool_subdirectory)
+
+macro(ignore_llvm_tool_subdirectory name)
+  list(APPEND LLVM_IMPLICIT_PROJECT_IGNORE "${CMAKE_CURRENT_SOURCE_DIR}/${name}")
+endmacro(ignore_llvm_tool_subdirectory)
+
+function(add_llvm_implicit_external_projects)
+  set(list_of_implicit_subdirs "")
+  file(GLOB sub-dirs "${CMAKE_CURRENT_SOURCE_DIR}/*")
+  foreach(dir ${sub-dirs})
+    if(IS_DIRECTORY "${dir}")
+      list(FIND LLVM_IMPLICIT_PROJECT_IGNORE "${dir}" tool_subdir_ignore)
+      if( tool_subdir_ignore EQUAL -1
+          AND EXISTS "${dir}/CMakeLists.txt")
+        get_filename_component(fn "${dir}" NAME)
+        list(APPEND list_of_implicit_subdirs "${fn}")
+      endif()
+    endif()
+  endforeach()
+
+  foreach(external_proj ${list_of_implicit_subdirs})
+    add_llvm_external_project("${external_proj}")
+  endforeach()
+endfunction(add_llvm_implicit_external_projects)
+
 # Generic support for adding a unittest.
 function(add_unittest test_suite test_name)
   set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@@ -249,6 +296,22 @@ function(configure_lit_site_cfg input output)
   set(HOST_OS ${CMAKE_SYSTEM_NAME})
   set(HOST_ARCH ${CMAKE_SYSTEM_PROCESSOR})
 
+  if (CLANG_ENABLE_ARCMT)
+    set(ENABLE_CLANG_ARCMT "1")
+  else()
+    set(ENABLE_CLANG_ARCMT "0")
+  endif()
+  if (CLANG_ENABLE_REWRITER)
+    set(ENABLE_CLANG_REWRITER "1")
+  else()
+    set(ENABLE_CLANG_REWRITER "0")
+  endif()
+  if (CLANG_ENABLE_STATIC_ANALYZER)
+    set(ENABLE_CLANG_STATIC_ANALYZER "1")
+  else()
+    set(ENABLE_CLANG_STATIC_ANALYZER "0")
+  endif()
+
   configure_file(${input} ${output} @ONLY)
 endfunction()
 
diff --git a/cmake/modules/CMakeLists.txt b/cmake/modules/CMakeLists.txt
index f51e9af..0253952 100644
--- a/cmake/modules/CMakeLists.txt
+++ b/cmake/modules/CMakeLists.txt
@@ -19,19 +19,21 @@ configure_file(
   ${llvm_cmake_builddir}/LLVMConfigVersion.cmake
   @ONLY)
 
-install(FILES
-  ${llvm_cmake_builddir}/LLVMConfig.cmake
-  ${llvm_cmake_builddir}/LLVMConfigVersion.cmake
-  LLVM-Config.cmake
-  DESTINATION share/llvm/cmake)
+if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+  install(FILES
+    ${llvm_cmake_builddir}/LLVMConfig.cmake
+    ${llvm_cmake_builddir}/LLVMConfigVersion.cmake
+    LLVM-Config.cmake
+    DESTINATION share/llvm/cmake)
 
-install(DIRECTORY .
-  DESTINATION share/llvm/cmake
-  FILES_MATCHING PATTERN *.cmake
-  PATTERN .svn EXCLUDE
-  PATTERN LLVMConfig.cmake EXCLUDE
-  PATTERN LLVMConfigVersion.cmake EXCLUDE
-  PATTERN LLVM-Config.cmake EXCLUDE
-  PATTERN GetHostTriple.cmake EXCLUDE
-  PATTERN VersionFromVCS.cmake EXCLUDE
-  PATTERN CheckAtomic.cmake EXCLUDE)
+  install(DIRECTORY .
+    DESTINATION share/llvm/cmake
+    FILES_MATCHING PATTERN *.cmake
+    PATTERN .svn EXCLUDE
+    PATTERN LLVMConfig.cmake EXCLUDE
+    PATTERN LLVMConfigVersion.cmake EXCLUDE
+    PATTERN LLVM-Config.cmake EXCLUDE
+    PATTERN GetHostTriple.cmake EXCLUDE
+    PATTERN VersionFromVCS.cmake EXCLUDE
+    PATTERN CheckAtomic.cmake EXCLUDE)
+endif()
diff --git a/cmake/modules/ChooseMSVCCRT.cmake b/cmake/modules/ChooseMSVCCRT.cmake
index 6a2f426..25ed9c7 100644
--- a/cmake/modules/ChooseMSVCCRT.cmake
+++ b/cmake/modules/ChooseMSVCCRT.cmake
@@ -71,7 +71,7 @@ variables (LLVM_USE_CRT_DEBUG, etc) instead.")
 	CACHE STRING "Specify VC++ CRT to use for ${build_type} configurations."
 	FORCE)
       set_property(CACHE LLVM_USE_CRT_${build}
-	PROPERTY STRINGS "";${${MSVC_CRT}})
+	PROPERTY STRINGS ;${${MSVC_CRT}})
     endif(NOT LLVM_USE_CRT_${build})
   endforeach(build_type)
 
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 58ffb8d..bb41a58 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -8,6 +8,8 @@ include(CheckCXXCompilerFlag)
 
 if( CMAKE_COMPILER_IS_GNUCXX )
   set(LLVM_COMPILER_IS_GCC_COMPATIBLE ON)
+elseif( MSVC )
+  set(LLVM_COMPILER_IS_GCC_COMPATIBLE OFF)
 elseif( "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" )
   set(LLVM_COMPILER_IS_GCC_COMPATIBLE ON)
 endif()
@@ -22,8 +24,13 @@ if( LLVM_ENABLE_ASSERTIONS )
   if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
     add_definitions( -UNDEBUG )
     # Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines.
-    string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " "
-      CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+    set(REGEXP_NDEBUG "(^| )[/-]D *NDEBUG($| )")
+    string (REGEX REPLACE "${REGEXP_NDEBUG}" " "
+      CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
+    string (REGEX REPLACE "${REGEXP_NDEBUG}" " "
+      CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
+    string (REGEX REPLACE "${REGEXP_NDEBUG}" " "
+      CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL}")
   endif()
 else()
   if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELEASE" )
@@ -151,6 +158,15 @@ endif()
 if( MSVC )
   include(ChooseMSVCCRT)
 
+  if( NOT (${CMAKE_VERSION} VERSION_LESS 2.8.11) )
+    # set stack reserved size to ~10MB
+    # CMake previously automatically set this value for MSVC builds, but the
+    # behavior was changed in CMake 2.8.11 (Issue 12437) to use the MSVC default
+    # value (1 MB) which is not enough for us in tasks such as parsing recursive
+    # C++ templates in Clang.
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /STACK:10000000")
+  endif()
+
   if( MSVC10 )
     # MSVC 10 will complain about headers in the STL not being exported, but
     # will not complain in MSVC 11.
@@ -182,6 +198,7 @@ if( MSVC )
     -wd4503 # Suppress ''identifier' : decorated name length exceeded, name was truncated'
     -wd4624 # Suppress ''derived class' : destructor could not be generated because a base class destructor is inaccessible'
     -wd4800 # Suppress ''type' : forcing value to bool 'true' or 'false' (performance warning)'
+    -wd4291 # Suppress ''declaration' : no matching operator delete found; memory will not be freed if initialization throws an exception'
     
     # Promoted warnings.
     -w14062 # Promote 'enumerator in switch of enum is not handled' to level 1 warning.
@@ -239,6 +256,10 @@ macro(append_common_sanitizer_flags)
       NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO")
     add_flag_if_supported("-gline-tables-only")
   endif()
+  # Use -O1 even in debug mode, otherwise sanitizers slowdown is too large.
+  if (uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
+    add_flag_if_supported("-O1")
+  endif()
 endmacro()
 
 # Turn on sanitizers if necessary.
diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake
index 3e2447a..e26fabd 100644
--- a/cmake/modules/LLVM-Config.cmake
+++ b/cmake/modules/LLVM-Config.cmake
@@ -2,7 +2,7 @@ function(get_system_libs return_var)
   # Returns in `return_var' a list of system libraries used by LLVM.
   if( NOT MSVC )
     if( MINGW )
-      set(system_libs ${system_libs} imagehlp psapi)
+      set(system_libs ${system_libs} imagehlp psapi shell32)
     elseif( CMAKE_HOST_UNIX )
       if( HAVE_LIBRT )
         set(system_libs ${system_libs} rt)
@@ -10,13 +10,9 @@ function(get_system_libs return_var)
       if( HAVE_LIBDL )
         set(system_libs ${system_libs} ${CMAKE_DL_LIBS})
       endif()
-      if(LLVM_ENABLE_CURSES)
-        if(HAVE_NCURSESW)
-          set(system_libs ${system_libs} ncursesw)
-        elseif(HAVE_NCURSES)
-          set(system_libs ${system_libs} ncurses)
-        elseif(HAVE_CURSES)
-          set(system_libs ${system_libs} curses)
+      if(LLVM_ENABLE_TERMINFO)
+        if(HAVE_TERMINFO)
+          set(system_libs ${system_libs} ${TERMINFO_LIBS})
         endif()
       endif()
       if( LLVM_ENABLE_THREADS AND HAVE_LIBPTHREAD )
diff --git a/cmake/modules/LLVMConfig.cmake.in b/cmake/modules/LLVMConfig.cmake.in
index ae2bc59..68fe296 100644
--- a/cmake/modules/LLVMConfig.cmake.in
+++ b/cmake/modules/LLVMConfig.cmake.in
@@ -20,7 +20,7 @@ set(TARGET_TRIPLE "@TARGET_TRIPLE@")
 
 set(LLVM_TOOLS_BINARY_DIR @LLVM_TOOLS_BINARY_DIR@)
 
-set(LLVM_ENABLE_CURSES @LLVM_ENABLE_CURSES@)
+set(LLVM_ENABLE_TERMINFO @LLVM_ENABLE_TERMINFO@)
 
 set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@)
 
@@ -30,6 +30,8 @@ set(LLVM_NATIVE_ARCH @LLVM_NATIVE_ARCH@)
 
 set(LLVM_ENABLE_PIC @LLVM_ENABLE_PIC@)
 
+set(HAVE_TERMINFO @HAVE_TERMINFO@)
+set(TERMINFO_LIBS @TERMINFO_LIBS@)
 set(HAVE_LIBDL @HAVE_LIBDL@)
 set(HAVE_LIBPTHREAD @HAVE_LIBPTHREAD@)
 set(HAVE_LIBZ @HAVE_LIBZ@)
diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake
index 7a1d4a1..c17e67e 100644
--- a/cmake/modules/TableGen.cmake
+++ b/cmake/modules/TableGen.cmake
@@ -85,6 +85,16 @@ macro(add_tablegen target project)
   add_llvm_utility(${target} ${ARGN})
   set(LLVM_LINK_COMPONENTS ${${target}_OLD_LLVM_LINK_COMPONENTS})
 
+  # For Xcode builds, symlink bin/<target> to bin/<Config>/<target> so that
+  # a separately-configured Clang project can still find llvm-tblgen.
+  if (XCODE)
+    add_custom_target(${target}-top ALL
+      ${CMAKE_COMMAND} -E create_symlink 
+        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${CMAKE_CFG_INTDIR}/${target}${CMAKE_EXECUTABLE_SUFFIX}
+        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX}
+      DEPENDS ${target})
+  endif ()
+
   set(${project}_TABLEGEN "${target}" CACHE
       STRING "Native TableGen executable. Saves building one when cross-compiling.")
 
@@ -118,7 +128,7 @@ macro(add_tablegen target project)
   endif()
 
   if( MINGW )
-    target_link_libraries(${target} imagehlp psapi)
+    target_link_libraries(${target} imagehlp psapi shell32)
     if(CMAKE_SIZEOF_VOID_P MATCHES "8")
       set_target_properties(${target} PROPERTIES LINK_FLAGS -Wl,--stack,16777216)
     endif(CMAKE_SIZEOF_VOID_P MATCHES "8")
@@ -127,5 +137,7 @@ macro(add_tablegen target project)
     target_link_libraries(${target} pthread)
   endif()
 
-  install(TARGETS ${target} RUNTIME DESTINATION bin)
+  if (${project} STREQUAL LLVM AND NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+    install(TARGETS ${target} RUNTIME DESTINATION bin)
+  endif()
 endmacro()
diff --git a/cmake/nsis_logo.bmp b/cmake/nsis_logo.bmp
new file mode 100644
index 0000000..5fcb3ce
--- /dev/null
+++ b/cmake/nsis_logo.bmp
diff --git a/configure b/configure
index a148711..72e9b13 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.60 for LLVM 3.4svn.
+# Generated by GNU Autoconf 2.60 for LLVM 3.4.
 #
 # Report bugs to <http://llvm.org/bugs/>.
 #
@@ -561,8 +561,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='LLVM'
 PACKAGE_TARNAME='llvm'
-PACKAGE_VERSION='3.4svn'
-PACKAGE_STRING='LLVM 3.4svn'
+PACKAGE_VERSION='3.4'
+PACKAGE_STRING='LLVM 3.4'
 PACKAGE_BUGREPORT='http://llvm.org/bugs/'
 
 ac_unique_file="lib/IR/Module.cpp"
@@ -709,7 +709,6 @@ ENABLE_PIC
 ENABLE_SHARED
 ENABLE_EMBED_STDCXX
 ENABLE_TIMESTAMPS
-ENABLE_BACKTRACES
 TARGETS_TO_BUILD
 LLVM_ENUM_TARGETS
 LLVM_ENUM_ASM_PRINTERS
@@ -1331,7 +1330,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures LLVM 3.4svn to adapt to many kinds of systems.
+\`configure' configures LLVM 3.4 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1397,7 +1396,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of LLVM 3.4svn:";;
+     short | recursive ) echo "Configuration of LLVM 3.4:";;
    esac
   cat <<\_ACEOF
 
@@ -1443,6 +1442,8 @@ Optional Features:
                           (default is YES)
   --enable-backtraces     Enable embedding backtraces on crash (default is
                           YES)
+  --enable-crash-overrides
+                          Enable crash handling overrides (default is YES)
   --enable-targets        Build specific host targets: all or
                           target1,target2,... Valid targets are: host, x86,
                           x86_64, sparc, powerpc, arm, aarch64, mips, hexagon,
@@ -1453,8 +1454,8 @@ Optional Features:
                           target1,target2,... (default=disable)
   --enable-bindings       Build specific language bindings:
                           all,auto,none,{binding-name} (default=auto)
-  --enable-curses         Use curses for querying terminal infomation if
-                          available (default is YES)
+  --enable-terminfo       Query the terminfo database if available (default is
+                          YES)
   --enable-libffi         Check for the presence of libffi (default is NO)
   --enable-ltdl-install   install libltdl
 
@@ -1563,7 +1564,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-LLVM configure 3.4svn
+LLVM configure 3.4
 generated by GNU Autoconf 2.60
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1579,7 +1580,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by LLVM $as_me 3.4svn, which was
+It was created by LLVM $as_me 3.4, which was
 generated by GNU Autoconf 2.60.  Invocation command line was
 
   $ $0 $@
@@ -1991,8 +1992,8 @@ echo "$as_me: error: Already configured in ${srcdir}" >&2;}
   fi
 fi
 
-${CFLAGS=}
-${CXXFLAGS=}
+: ${CFLAGS=}
+: ${CXXFLAGS=}
 
 ac_ext=c
 ac_cpp='$CPP $CPPFLAGS'
@@ -5224,8 +5225,15 @@ fi
 case "$enableval" in
   yes) ENABLE_CLANG_STATIC_ANALYZER=1
  ;;
-  no)  ENABLE_CLANG_STATIC_ANALYZER=0
- ;;
+  no)
+    if test ${clang_arcmt} != "no" ; then
+      { { echo "$as_me:$LINENO: error: Cannot enable clang ARC Migration Tool while disabling static analyzer." >&5
+echo "$as_me: error: Cannot enable clang ARC Migration Tool while disabling static analyzer." >&2;}
+   { (exit 1); exit 1; }; }
+    fi
+    ENABLE_CLANG_STATIC_ANALYZER=0
+
+    ;;
   default) ENABLE_CLANG_STATIC_ANALYZER=1
 ;;
   *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-clang-static-analyzer. Use \"yes\" or \"no\"" >&5
@@ -5625,30 +5633,47 @@ cat >>confdefs.h <<_ACEOF
 _ACEOF
 
 
-
 # Check whether --enable-backtraces was given.
 if test "${enable_backtraces+set}" = set; then
-  enableval=$enable_backtraces;
+  enableval=$enable_backtraces; case "$enableval" in
+    yes) llvm_cv_enable_backtraces="yes" ;;
+    no)  llvm_cv_enable_backtraces="no"  ;;
+    *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-backtraces. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-backtraces. Use \"yes\" or \"no\"" >&2;}
+   { (exit 1); exit 1; }; } ;;
+  esac
 else
-  enableval=default
+  llvm_cv_enable_backtraces="yes"
 fi
 
-case "$enableval" in
-  yes) ENABLE_BACKTRACES=1
- ;;
-  no)  ENABLE_BACKTRACES=0
- ;;
-  default) ENABLE_BACKTRACES=1
- ;;
-  *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-backtraces. Use \"yes\" or \"no\"" >&5
-echo "$as_me: error: Invalid setting for --enable-backtraces. Use \"yes\" or \"no\"" >&2;}
+if test "$llvm_cv_enable_backtraces" = "yes" ; then
+
+cat >>confdefs.h <<\_ACEOF
+#define ENABLE_BACKTRACES 1
+_ACEOF
+
+fi
+
+# Check whether --enable-crash-overrides was given.
+if test "${enable_crash_overrides+set}" = set; then
+  enableval=$enable_crash_overrides; case "$enableval" in
+    yes) llvm_cv_enable_crash_overrides="yes" ;;
+    no)  llvm_cv_enable_crash_overrides="no"  ;;
+    *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-crash-overrides. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-crash-overrides. Use \"yes\" or \"no\"" >&2;}
    { (exit 1); exit 1; }; } ;;
-esac
+  esac
+else
+  llvm_cv_enable_crash_overrides="yes"
+fi
 
-cat >>confdefs.h <<_ACEOF
-#define ENABLE_BACKTRACES $ENABLE_BACKTRACES
+if test "$llvm_cv_enable_crash_overrides" = "yes" ; then
+
+cat >>confdefs.h <<\_ACEOF
+#define ENABLE_CRASH_OVERRIDES 1
 _ACEOF
 
+fi
 
 TARGETS_TO_BUILD=""
 # Check whether --enable-targets was given.
@@ -6006,17 +6031,17 @@ cat >>confdefs.h <<_ACEOF
 _ACEOF
 
 
-# Check whether --enable-curses was given.
-if test "${enable_curses+set}" = set; then
-  enableval=$enable_curses; case "$enableval" in
-    yes) llvm_cv_enable_curses="yes" ;;
-    no)  llvm_cv_enable_curses="no"  ;;
-    *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-curses. Use \"yes\" or \"no\"" >&5
-echo "$as_me: error: Invalid setting for --enable-curses. Use \"yes\" or \"no\"" >&2;}
+# Check whether --enable-terminfo was given.
+if test "${enable_terminfo+set}" = set; then
+  enableval=$enable_terminfo; case "$enableval" in
+    yes) llvm_cv_enable_terminfo="yes" ;;
+    no)  llvm_cv_enable_terminfo="no"  ;;
+    *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-terminfo. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-terminfo. Use \"yes\" or \"no\"" >&2;}
    { (exit 1); exit 1; }; } ;;
   esac
 else
-  llvm_cv_enable_curses="yes"
+  llvm_cv_enable_terminfo="yes"
 fi
 
 
@@ -8800,9 +8825,7 @@ if test "${enable_ltdl_install+set}" = set; then
 fi
 
 
-
-
-if test x"${enable_ltdl_install-no}" != xno; then
+ if test x"${enable_ltdl_install-no}" != xno; then
   INSTALL_LTDL_TRUE=
   INSTALL_LTDL_FALSE='#'
 else
@@ -8810,9 +8833,7 @@ else
   INSTALL_LTDL_FALSE=
 fi
 
-
-
-if test x"${enable_ltdl_convenience-no}" != xno; then
+ if test x"${enable_ltdl_convenience-no}" != xno; then
   CONVENIENCE_LTDL_TRUE=
   CONVENIENCE_LTDL_FALSE='#'
 else
@@ -10561,7 +10582,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10564 "configure"
+#line 10585 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -11790,12 +11811,13 @@ fi
 
 { echo "$as_me:$LINENO: checking for python >= 2.5" >&5
 echo $ECHO_N "checking for python >= 2.5... $ECHO_C" >&6; }
-ac_python_version=`$PYTHON -c 'import sys; print sys.version.split()[0]'`
+ac_python_version=`$PYTHON -V 2>&1 | cut -d' ' -f2`
 ac_python_version_major=`echo $ac_python_version | cut -d'.' -f1`
 ac_python_version_minor=`echo $ac_python_version | cut -d'.' -f2`
 ac_python_version_patch=`echo $ac_python_version | cut -d'.' -f3`
-if   test "$ac_python_version_major" -eq "2" \
-   && test "$ac_python_version_minor" -ge "5" ; then
+if test "$ac_python_version_major" -gt "2" || \
+   (test "$ac_python_version_major" -eq "2" && \
+    test "$ac_python_version_minor" -ge "5") ; then
   { echo "$as_me:$LINENO: result: $PYTHON ($ac_python_version)" >&5
 echo "${ECHO_T}$PYTHON ($ac_python_version)" >&6; }
 else
@@ -12059,6 +12081,87 @@ _ACEOF
 
 fi
 
+
+{ echo "$as_me:$LINENO: checking for main in -lshell32" >&5
+echo $ECHO_N "checking for main in -lshell32... $ECHO_C" >&6; }
+if test "${ac_cv_lib_shell32_main+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lshell32  $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+
+int
+main ()
+{
+return main ();
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_lib_shell32_main=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_lib_shell32_main=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext \
+      conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_lib_shell32_main" >&5
+echo "${ECHO_T}$ac_cv_lib_shell32_main" >&6; }
+if test $ac_cv_lib_shell32_main = yes; then
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBSHELL32 1
+_ACEOF
+
+  LIBS="-lshell32 $LIBS"
+
+fi
+
 fi
 
 { echo "$as_me:$LINENO: checking for library containing dlopen" >&5
@@ -12268,10 +12371,10 @@ if test "$ac_res" != no; then
 fi
 
 
-if test "$llvm_cv_enable_curses" = "yes" ; then
-    { echo "$as_me:$LINENO: checking for library containing has_colors" >&5
-echo $ECHO_N "checking for library containing has_colors... $ECHO_C" >&6; }
-if test "${ac_cv_search_has_colors+set}" = set; then
+if test "$llvm_cv_enable_terminfo" = "yes" ; then
+    { echo "$as_me:$LINENO: checking for library containing setupterm" >&5
+echo $ECHO_N "checking for library containing setupterm... $ECHO_C" >&6; }
+if test "${ac_cv_search_setupterm+set}" = set; then
   echo $ECHO_N "(cached) $ECHO_C" >&6
 else
   ac_func_search_save_LIBS=$LIBS
@@ -12288,16 +12391,16 @@ cat >>conftest.$ac_ext <<_ACEOF
 #ifdef __cplusplus
 extern "C"
 #endif
-char has_colors ();
+char setupterm ();
 int
 main ()
 {
-return has_colors ();
+return setupterm ();
   ;
   return 0;
 }
 _ACEOF
-for ac_lib in '' curses ncurses ncursesw; do
+for ac_lib in '' tinfo terminfo curses ncurses ncursesw; do
   if test -z "$ac_lib"; then
     ac_res="none required"
   else
@@ -12338,7 +12441,7 @@ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
   ac_status=$?
   echo "$as_me:$LINENO: \$? = $ac_status" >&5
   (exit $ac_status); }; }; then
-  ac_cv_search_has_colors=$ac_res
+  ac_cv_search_setupterm=$ac_res
 else
   echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
@@ -12348,26 +12451,26 @@ fi
 
 rm -f core conftest.err conftest.$ac_objext \
       conftest$ac_exeext
-  if test "${ac_cv_search_has_colors+set}" = set; then
+  if test "${ac_cv_search_setupterm+set}" = set; then
   break
 fi
 done
-if test "${ac_cv_search_has_colors+set}" = set; then
+if test "${ac_cv_search_setupterm+set}" = set; then
   :
 else
-  ac_cv_search_has_colors=no
+  ac_cv_search_setupterm=no
 fi
 rm conftest.$ac_ext
 LIBS=$ac_func_search_save_LIBS
 fi
-{ echo "$as_me:$LINENO: result: $ac_cv_search_has_colors" >&5
-echo "${ECHO_T}$ac_cv_search_has_colors" >&6; }
-ac_res=$ac_cv_search_has_colors
+{ echo "$as_me:$LINENO: result: $ac_cv_search_setupterm" >&5
+echo "${ECHO_T}$ac_cv_search_setupterm" >&6; }
+ac_res=$ac_cv_search_setupterm
 if test "$ac_res" != no; then
   test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
 
 cat >>confdefs.h <<\_ACEOF
-#define HAVE_CURSES 1
+#define HAVE_TERMINFO 1
 _ACEOF
 
 fi
@@ -16543,182 +16646,6 @@ else
 
 fi
 
-if test "$llvm_cv_enable_curses" = "yes" ; then
-
-
-
-
-
-for ac_header in curses.h ncurses.h ncursesw.h ncurses/curses.h ncursesw/curses.h
-do
-as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh`
-if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
-  { echo "$as_me:$LINENO: checking for $ac_header" >&5
-echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; }
-if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-fi
-ac_res=`eval echo '${'$as_ac_Header'}'`
-	       { echo "$as_me:$LINENO: result: $ac_res" >&5
-echo "${ECHO_T}$ac_res" >&6; }
-else
-  # Is the header compilable?
-{ echo "$as_me:$LINENO: checking $ac_header usability" >&5
-echo $ECHO_N "checking $ac_header usability... $ECHO_C" >&6; }
-cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h.  */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h.  */
-$ac_includes_default
-#include <$ac_header>
-_ACEOF
-rm -f conftest.$ac_objext
-if { (ac_try="$ac_compile"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_compile") 2>conftest.er1
-  ac_status=$?
-  grep -v '^ *+' conftest.er1 >conftest.err
-  rm -f conftest.er1
-  cat conftest.err >&5
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); } &&
-	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
-  { (case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); }; } &&
-	 { ac_try='test -s conftest.$ac_objext'
-  { (case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); }; }; then
-  ac_header_compiler=yes
-else
-  echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-	ac_header_compiler=no
-fi
-
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-{ echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
-echo "${ECHO_T}$ac_header_compiler" >&6; }
-
-# Is the header present?
-{ echo "$as_me:$LINENO: checking $ac_header presence" >&5
-echo $ECHO_N "checking $ac_header presence... $ECHO_C" >&6; }
-cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h.  */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h.  */
-#include <$ac_header>
-_ACEOF
-if { (ac_try="$ac_cpp conftest.$ac_ext"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
-  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
-  ac_status=$?
-  grep -v '^ *+' conftest.er1 >conftest.err
-  rm -f conftest.er1
-  cat conftest.err >&5
-  echo "$as_me:$LINENO: \$? = $ac_status" >&5
-  (exit $ac_status); } >/dev/null; then
-  if test -s conftest.err; then
-    ac_cpp_err=$ac_c_preproc_warn_flag
-    ac_cpp_err=$ac_cpp_err$ac_c_werror_flag
-  else
-    ac_cpp_err=
-  fi
-else
-  ac_cpp_err=yes
-fi
-if test -z "$ac_cpp_err"; then
-  ac_header_preproc=yes
-else
-  echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-  ac_header_preproc=no
-fi
-
-rm -f conftest.err conftest.$ac_ext
-{ echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
-echo "${ECHO_T}$ac_header_preproc" >&6; }
-
-# So?  What about this header?
-case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in
-  yes:no: )
-    { echo "$as_me:$LINENO: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&5
-echo "$as_me: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the compiler's result" >&5
-echo "$as_me: WARNING: $ac_header: proceeding with the compiler's result" >&2;}
-    ac_header_preproc=yes
-    ;;
-  no:yes:* )
-    { echo "$as_me:$LINENO: WARNING: $ac_header: present but cannot be compiled" >&5
-echo "$as_me: WARNING: $ac_header: present but cannot be compiled" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header:     check for missing prerequisite headers?" >&5
-echo "$as_me: WARNING: $ac_header:     check for missing prerequisite headers?" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header: see the Autoconf documentation" >&5
-echo "$as_me: WARNING: $ac_header: see the Autoconf documentation" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header:     section \"Present But Cannot Be Compiled\"" >&5
-echo "$as_me: WARNING: $ac_header:     section \"Present But Cannot Be Compiled\"" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the preprocessor's result" >&5
-echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result" >&2;}
-    { echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
-echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
-    ( cat <<\_ASBOX
-## ------------------------------------ ##
-## Report this to http://llvm.org/bugs/ ##
-## ------------------------------------ ##
-_ASBOX
-     ) | sed "s/^/$as_me: WARNING:     /" >&2
-    ;;
-esac
-{ echo "$as_me:$LINENO: checking for $ac_header" >&5
-echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; }
-if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
-  echo $ECHO_N "(cached) $ECHO_C" >&6
-else
-  eval "$as_ac_Header=\$ac_header_preproc"
-fi
-ac_res=`eval echo '${'$as_ac_Header'}'`
-	       { echo "$as_me:$LINENO: result: $ac_res" >&5
-echo "${ECHO_T}$ac_res" >&6; }
-
-fi
-if test `eval echo '${'$as_ac_Header'}'` = yes; then
-  cat >>confdefs.h <<_ACEOF
-#define `echo "HAVE_$ac_header" | $as_tr_cpp` 1
-_ACEOF
-
-fi
-
-done
-
-fi
-
 if test "$llvm_cv_enable_libffi" = "yes" ; then
 
 
@@ -22396,9 +22323,6 @@ ac_config_commands="$ac_config_commands examples/Makefile"
 ac_config_commands="$ac_config_commands lib/Makefile"
 
 
-ac_config_commands="$ac_config_commands runtime/Makefile"
-
-
 ac_config_commands="$ac_config_commands test/Makefile"
 
 
@@ -22821,7 +22745,7 @@ exec 6>&1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by LLVM $as_me 3.4svn, which was
+This file was extended by LLVM $as_me 3.4, which was
 generated by GNU Autoconf 2.60.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22874,7 +22798,7 @@ Report bugs to <bug-autoconf@gnu.org>."
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF
 ac_cs_version="\\
-LLVM config.status 3.4svn
+LLVM config.status 3.4
 configured by $0, generated by GNU Autoconf 2.60,
   with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
 
@@ -23005,7 +22929,6 @@ do
     "Makefile.common") CONFIG_COMMANDS="$CONFIG_COMMANDS Makefile.common" ;;
     "examples/Makefile") CONFIG_COMMANDS="$CONFIG_COMMANDS examples/Makefile" ;;
     "lib/Makefile") CONFIG_COMMANDS="$CONFIG_COMMANDS lib/Makefile" ;;
-    "runtime/Makefile") CONFIG_COMMANDS="$CONFIG_COMMANDS runtime/Makefile" ;;
     "test/Makefile") CONFIG_COMMANDS="$CONFIG_COMMANDS test/Makefile" ;;
     "test/Makefile.tests") CONFIG_COMMANDS="$CONFIG_COMMANDS test/Makefile.tests" ;;
     "unittests/Makefile") CONFIG_COMMANDS="$CONFIG_COMMANDS unittests/Makefile" ;;
@@ -23224,7 +23147,6 @@ ENABLE_PIC!$ENABLE_PIC$ac_delim
 ENABLE_SHARED!$ENABLE_SHARED$ac_delim
 ENABLE_EMBED_STDCXX!$ENABLE_EMBED_STDCXX$ac_delim
 ENABLE_TIMESTAMPS!$ENABLE_TIMESTAMPS$ac_delim
-ENABLE_BACKTRACES!$ENABLE_BACKTRACES$ac_delim
 TARGETS_TO_BUILD!$TARGETS_TO_BUILD$ac_delim
 LLVM_ENUM_TARGETS!$LLVM_ENUM_TARGETS$ac_delim
 LLVM_ENUM_ASM_PRINTERS!$LLVM_ENUM_ASM_PRINTERS$ac_delim
@@ -23311,6 +23233,7 @@ LLVM_MANDIR!$LLVM_MANDIR$ac_delim
 LLVM_CONFIGTIME!$LLVM_CONFIGTIME$ac_delim
 BINDINGS_TO_BUILD!$BINDINGS_TO_BUILD$ac_delim
 ALL_BINDINGS!$ALL_BINDINGS$ac_delim
+OCAML_LIBDIR!$OCAML_LIBDIR$ac_delim
 _ACEOF
 
   if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then
@@ -23352,7 +23275,6 @@ _ACEOF
 ac_delim='%!_!# '
 for ac_last_try in false false false false false :; do
   cat >conf$$subs.sed <<_ACEOF
-OCAML_LIBDIR!$OCAML_LIBDIR$ac_delim
 ENABLE_VISIBILITY_INLINES_HIDDEN!$ENABLE_VISIBILITY_INLINES_HIDDEN$ac_delim
 RPATH!$RPATH$ac_delim
 RDYNAMIC!$RDYNAMIC$ac_delim
@@ -23361,7 +23283,7 @@ LIBOBJS!$LIBOBJS$ac_delim
 LTLIBOBJS!$LTLIBOBJS$ac_delim
 _ACEOF
 
-  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 7; then
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 6; then
     break
   elif $ac_last_try; then
     { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
@@ -23772,8 +23694,6 @@ echo "$as_me: executing $ac_file commands" >&6;}
    ${SHELL} ${llvm_src}/autoconf/install-sh -m 0644 -c ${srcdir}/examples/Makefile examples/Makefile ;;
     "lib/Makefile":C) ${llvm_src}/autoconf/mkinstalldirs `dirname lib/Makefile`
    ${SHELL} ${llvm_src}/autoconf/install-sh -m 0644 -c ${srcdir}/lib/Makefile lib/Makefile ;;
-    "runtime/Makefile":C) ${llvm_src}/autoconf/mkinstalldirs `dirname runtime/Makefile`
-   ${SHELL} ${llvm_src}/autoconf/install-sh -m 0644 -c ${srcdir}/runtime/Makefile runtime/Makefile ;;
     "test/Makefile":C) ${llvm_src}/autoconf/mkinstalldirs `dirname test/Makefile`
    ${SHELL} ${llvm_src}/autoconf/install-sh -m 0644 -c ${srcdir}/test/Makefile test/Makefile ;;
     "test/Makefile.tests":C) ${llvm_src}/autoconf/mkinstalldirs `dirname test/Makefile.tests`
diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst
index c83b6c1..d9d1df0 100644
--- a/docs/BitCodeFormat.rst
+++ b/docs/BitCodeFormat.rst
@@ -718,7 +718,7 @@ global variable. The operand fields are:
 MODULE_CODE_FUNCTION Record
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-``[FUNCTION, type, callingconv, isproto, linkage, paramattr, alignment, section, visibility, gc]``
+``[FUNCTION, type, callingconv, isproto, linkage, paramattr, alignment, section, visibility, gc, prefix]``
 
 The ``FUNCTION`` record (code 8) marks the declaration or definition of a
 function. The operand fields are:
@@ -757,6 +757,9 @@ function. The operand fields are:
 * *unnamed_addr*: If present and non-zero, indicates that the function has
   ``unnamed_addr``
 
+* *prefix*: If non-zero, the value index of the prefix data for this function,
+  plus 1.
+
 MODULE_CODE_ALIAS Record
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
new file mode 100644
index 0000000..8c49aa5
--- /dev/null
+++ b/docs/CMakeLists.txt
@@ -0,0 +1,51 @@
+
+if (DOXYGEN_FOUND)
+if (LLVM_ENABLE_DOXYGEN)
+  set(abs_top_srcdir ${LLVM_MAIN_SRC_DIR})
+  set(abs_top_builddir ${LLVM_BINARY_DIR})
+  
+  if (HAVE_DOT)
+    set(DOT ${LLVM_PATH_DOT})
+  endif()
+
+  if (LLVM_DOXYGEN_EXTERNAL_SEARCH)
+    set(enable_searchengine "YES")
+    set(searchengine_url "${LLVM_DOXYGEN_SEARCHENGINE_URL}")
+    set(enable_server_based_search "YES")
+    set(enable_external_search "YES")
+    set(extra_search_mappings "${LLVM_DOXYGEN_SEARCH_MAPPINGS}")
+  else()
+    set(enable_searchengine "NO")
+    set(searchengine_url "")
+    set(enable_server_based_search "NO")
+    set(enable_external_search "NO")
+    set(extra_search_mappings "")
+  endif()
+  
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/doxygen.cfg.in
+    ${CMAKE_CURRENT_BINARY_DIR}/doxygen.cfg @ONLY)
+
+  set(abs_top_srcdir)
+  set(abs_top_builddir)
+  set(DOT)
+  set(enable_searchengine)
+  set(searchengine_url)
+  set(enable_server_based_search)
+  set(enable_external_search)
+  set(extra_search_mappings)
+
+  add_custom_target(doxygen-llvm
+    COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/doxygen.cfg
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    COMMENT "Generating llvm doxygen documentation." VERBATIM)
+
+  if (LLVM_BUILD_DOCS)
+    add_dependencies(doxygen doxygen-llvm)
+  endif()
+
+  if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
+    install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/doxygen/html
+      DESTINATION docs/html)
+  endif()
+endif()
+endif()
diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst
index 1f2dc6c..c87a628 100644
--- a/docs/CodeGenerator.rst
+++ b/docs/CodeGenerator.rst
@@ -636,6 +636,18 @@ file (MCObjectStreamer).  MCAsmStreamer is a straight-forward implementation
 that prints out a directive for each method (e.g. ``EmitValue -> .byte``), but
 MCObjectStreamer implements a full assembler.
 
+For target specific directives, the MCStreamer has a MCTargetStreamer instance.
+Each target that needs it defines a class that inherits from it and is a lot
+like MCStreamer itself: It has one method per directive and two classes that
+inherit from it, a target object streamer and a target asm streamer. The target
+asm streamer just prints it (``emitFnStart -> .fnstrart``), and the object
+streamer implement the assembler logic for it.
+
+To make llvm use these classes, the target initialization must call
+TargetRegistry::RegisterAsmStreamer and TargetRegistry::RegisterMCObjectStreamer
+passing callbacks that allocate the corresponding target streamer and pass it
+to createAsmStreamer or to the appropriate object streamer constructor.
+
 The ``MCContext`` class
 -----------------------
 
@@ -1614,7 +1626,7 @@ Implementing a Native Assembler
 ===============================
 
 Though you're probably reading this because you want to write or maintain a
-compiler backend, LLVM also fully supports building a native assemblers too.
+compiler backend, LLVM also fully supports building a native assembler.
 We've tried hard to automate the generation of the assembler from the .td files
 (in particular the instruction syntax and encodings), which means that a large
 part of the manual and repetitive data entry can be factored and shared with the
diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 6be5fc3..5a60d60 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -30,11 +30,13 @@ OPTIONS
 
 .. option:: --check-prefix prefix
 
- FileCheck searches the contents of ``match-filename`` for patterns to match.
- By default, these patterns are prefixed with "``CHECK:``".  If you'd like to
- use a different prefix (e.g. because the same input file is checking multiple
- different tool or options), the :option:`--check-prefix` argument allows you
- to specify a specific prefix to match.
+ FileCheck searches the contents of ``match-filename`` for patterns to
+ match.  By default, these patterns are prefixed with "``CHECK:``".
+ If you'd like to use a different prefix (e.g. because the same input
+ file is checking multiple different tool or options), the
+ :option:`--check-prefix` argument allows you to specify one or more
+ prefixes to match. Multiple prefixes are useful for tests which might
+ change for different run options, but most lines remain the same.
 
 .. option:: --input-file filename
 
@@ -216,6 +218,19 @@ in the natural order:
     Bar b;
     // CHECK-DAG: @_ZTV3Bar =
 
+``CHECK-NOT:`` directives could be mixed with ``CHECK-DAG:`` directives to
+exclude strings between the surrounding ``CHECK-DAG:`` directives. As a result,
+the surrounding ``CHECK-DAG:`` directives cannot be reordered, i.e. all
+occurrences matching ``CHECK-DAG:`` before ``CHECK-NOT:`` must not fall behind
+occurrences matching ``CHECK-DAG:`` after ``CHECK-NOT:``. For example,
+
+.. code-block:: llvm
+
+   ; CHECK-DAG: BEFORE
+   ; CHECK-NOT: NOT
+   ; CHECK-DAG: AFTER
+
+This case will reject input strings where ``BEFORE`` occurs after ``AFTER``.
 
 With captured variables, ``CHECK-DAG:`` is able to match valid topological
 orderings of a DAG with edges from the definition of a variable to its use.
@@ -230,19 +245,34 @@ sequences from the instruction scheduler. For example,
 
 In this case, any order of that two ``add`` instructions will be allowed.
 
-``CHECK-NOT:`` directives could be mixed with ``CHECK-DAG:`` directives to
-exclude strings between the surrounding ``CHECK-DAG:`` directives. As a result,
-the surrounding ``CHECK-DAG:`` directives cannot be reordered, i.e. all
-occurrences matching ``CHECK-DAG:`` before ``CHECK-NOT:`` must not fall behind
-occurrences matching ``CHECK-DAG:`` after ``CHECK-NOT:``. For example,
+If you are defining `and` using variables in the same ``CHECK-DAG:`` block,
+be aware that the definition rule can match `after` its use.
+
+So, for instance, the code below will pass:
 
 .. code-block:: llvm
 
-   ; CHECK-DAG: BEFORE
-   ; CHECK-NOT: NOT
-   ; CHECK-DAG: AFTER
+  ; CHECK-DAG: vmov.32 [[REG2:d[0-9]+]][0]
+  ; CHECK-DAG: vmov.32 [[REG2]][1]
+  vmov.32 d0[1]
+  vmov.32 d0[0]
 
-This case will reject input strings where ``BEFORE`` occurs after ``AFTER``.
+While this other code, will not:
+
+.. code-block:: llvm
+
+  ; CHECK-DAG: vmov.32 [[REG2:d[0-9]+]][0]
+  ; CHECK-DAG: vmov.32 [[REG2]][1]
+  vmov.32 d1[1]
+  vmov.32 d0[0]
+
+While this can be very useful, it's also dangerous, because in the case of
+register sequence, you must have a strong order (read before write, copy before
+use, etc). If the definition your test is looking for doesn't match (because
+of a bug in the compiler), it may match further away from the use, and mask
+real bugs away.
+
+In those cases, to enforce the order, use a non-DAG directive between DAG-blocks.
 
 The "CHECK-LABEL:" directive
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst
index a4681fb..4d84be6 100644
--- a/docs/CommandGuide/lit.rst
+++ b/docs/CommandGuide/lit.rst
@@ -149,12 +149,11 @@ ADDITIONAL OPTIONS
 
 .. option:: --show-suites
 
- List the discovered test suites as part of the standard output.
+ List the discovered test suites and exit.
 
-.. option:: --repeat=N
+.. option:: --show-tests
 
- Run each test ``N`` times.  Currently this is primarily useful for timing
- tests, other results are not collated in any reasonable fashion.
+ List all of the the discovered tests and exit.
 
 EXIT STATUS
 -----------
diff --git a/docs/CommandGuide/llvm-extract.rst b/docs/CommandGuide/llvm-extract.rst
index d569e35..d0e9c1c 100644
--- a/docs/CommandGuide/llvm-extract.rst
+++ b/docs/CommandGuide/llvm-extract.rst
@@ -1,104 +1,79 @@
 llvm-extract - extract a function from an LLVM module
 =====================================================
 
-
 SYNOPSIS
 --------
 
-
-**llvm-extract** [*options*] **--func** *function-name* [*filename*]
-
+:program:`llvm-extract` [*options*] **--func** *function-name* [*filename*]
 
 DESCRIPTION
 -----------
 
-
-The **llvm-extract** command takes the name of a function and extracts it from
-the specified LLVM bitcode file.  It is primarily used as a debugging tool to
-reduce test cases from larger programs that are triggering a bug.
+The :program:`llvm-extract` command takes the name of a function and extracts
+it from the specified LLVM bitcode file.  It is primarily used as a debugging
+tool to reduce test cases from larger programs that are triggering a bug.
 
 In addition to extracting the bitcode of the specified function,
-**llvm-extract** will also remove unreachable global variables, prototypes, and
-unused types.
-
-The **llvm-extract** command reads its input from standard input if filename is
-omitted or if filename is -.  The output is always written to standard output,
-unless the **-o** option is specified (see below).
+:program:`llvm-extract` will also remove unreachable global variables,
+prototypes, and unused types.
 
+The :program:`llvm-extract` command reads its input from standard input if
+filename is omitted or if filename is ``-``.  The output is always written to
+standard output, unless the **-o** option is specified (see below).
 
 OPTIONS
 -------
 
-
-
 **-f**
 
- Enable binary output on terminals.  Normally, **llvm-extract** will refuse to
- write raw bitcode output if the output stream is a terminal. With this option,
- **llvm-extract** will write raw bitcode regardless of the output device.
-
-
+ Enable binary output on terminals.  Normally, :program:`llvm-extract` will
+ refuse to write raw bitcode output if the output stream is a terminal.  With
+ this option, :program:`llvm-extract` will write raw bitcode regardless of the
+ output device.
 
 **--func** *function-name*
 
- Extract the function named *function-name* from the LLVM bitcode. May be
+ Extract the function named *function-name* from the LLVM bitcode.  May be
  specified multiple times to extract multiple functions at once.
 
-
-
 **--rfunc** *function-regular-expr*
 
  Extract the function(s) matching *function-regular-expr* from the LLVM bitcode.
  All functions matching the regular expression will be extracted.  May be
  specified multiple times.
 
-
-
 **--glob** *global-name*
 
- Extract the global variable named *global-name* from the LLVM bitcode. May be
+ Extract the global variable named *global-name* from the LLVM bitcode.  May be
  specified multiple times to extract multiple global variables at once.
 
-
-
 **--rglob** *glob-regular-expr*
 
  Extract the global variable(s) matching *global-regular-expr* from the LLVM
- bitcode. All global variables matching the regular expression will be extracted.
- May be specified multiple times.
-
-
+ bitcode.  All global variables matching the regular expression will be
+ extracted.  May be specified multiple times.
 
 **-help**
 
  Print a summary of command line options.
 
-
-
 **-o** *filename*
 
  Specify the output filename.  If filename is "-" (the default), then
- **llvm-extract** sends its output to standard output.
-
-
+ :program:`llvm-extract` sends its output to standard output.
 
 **-S**
 
  Write output in LLVM intermediate language (instead of bitcode).
 
-
-
-
 EXIT STATUS
 -----------
 
-
-If **llvm-extract** succeeds, it will exit with 0.  Otherwise, if an error
+If :program:`llvm-extract` succeeds, it will exit with 0.  Otherwise, if an error
 occurs, it will exit with a non-zero value.
 
-
 SEE ALSO
 --------
 
+bugpoint
 
-bugpoint|bugpoint
diff --git a/docs/CommandGuide/llvm-nm.rst b/docs/CommandGuide/llvm-nm.rst
index cbc7af2..83d9fba 100644
--- a/docs/CommandGuide/llvm-nm.rst
+++ b/docs/CommandGuide/llvm-nm.rst
@@ -1,189 +1,146 @@
 llvm-nm - list LLVM bitcode and object file's symbol table
 ==========================================================
 
-
 SYNOPSIS
 --------
 
-
 :program:`llvm-nm` [*options*] [*filenames...*]
 
-
 DESCRIPTION
 -----------
 
-
 The :program:`llvm-nm` utility lists the names of symbols from the LLVM bitcode
 files, object files, or :program:`ar` archives containing them, named on the
-command line. Each symbol is listed along with some simple information about its
-provenance. If no file name is specified, or *-* is used as a file name,
+command line.  Each symbol is listed along with some simple information about
+its provenance.  If no file name is specified, or *-* is used as a file name,
 :program:`llvm-nm` will process a file on its standard input stream.
 
 :program:`llvm-nm`'s default output format is the traditional BSD :program:`nm`
-output format. Each such output record consists of an (optional) 8-digit
+output format.  Each such output record consists of an (optional) 8-digit
 hexadecimal address, followed by a type code character, followed by a name, for
-each symbol. One record is printed per line; fields are separated by spaces.
+each symbol.  One record is printed per line; fields are separated by spaces.
 When the address is omitted, it is replaced by 8 spaces.
 
 Type code characters currently supported, and their meanings, are as follows:
 
-
 U
 
  Named object is referenced but undefined in this bitcode file
 
-
-
 C
 
  Common (multiple definitions link together into one def)
 
-
-
 W
 
  Weak reference (multiple definitions link together into zero or one definitions)
 
-
-
 t
 
  Local function (text) object
 
-
-
 T
 
  Global function (text) object
 
-
-
 d
 
  Local data object
 
-
-
 D
 
  Global data object
 
-
-
 ?
 
  Something unrecognizable
 
-
-
 Because LLVM bitcode files typically contain objects that are not considered to
 have addresses until they are linked into an executable image or dynamically
 compiled "just-in-time", :program:`llvm-nm` does not print an address for any
-symbol in a LLVM bitcode file, even symbols which are defined in the bitcode
+symbol in an LLVM bitcode file, even symbols which are defined in the bitcode
 file.
 
-
 OPTIONS
 -------
 
-
 .. program:: llvm-nm
 
-
 .. option:: -B    (default)
 
- Use BSD output format. Alias for :option:`--format=bsd`.
-
+ Use BSD output format.  Alias for :option:`--format=bsd`.
 
 .. option:: -P
 
- Use POSIX.2 output format. Alias for :option:`--format=posix`.
-
+ Use POSIX.2 output format.  Alias for :option:`--format=posix`.
 
 .. option:: --debug-syms, -a
 
  Show all symbols, even debugger only.
 
-
 .. option:: --defined-only
 
  Print only symbols defined in this file (as opposed to
  symbols which may be referenced by objects in this file, but not
  defined in this file.)
 
-
 .. option:: --dynamic, -D
 
  Display dynamic symbols instead of normal symbols.
 
-
 .. option:: --extern-only, -g
 
  Print only symbols whose definitions are external; that is, accessible
  from other files.
 
-
 .. option:: --format=format, -f format
 
- Select an output format; *format* may be *sysv*, *posix*, or *bsd*. The default
+ Select an output format; *format* may be *sysv*, *posix*, or *bsd*.  The default
  is *bsd*.
 
-
 .. option:: -help
 
  Print a summary of command-line options and their meanings.
 
-
 .. option:: --no-sort, -p
 
  Shows symbols in order encountered.
 
-
 .. option:: --numeric-sort, -n, -v
 
  Sort symbols by address.
 
-
 .. option:: --print-file-name, -A, -o
 
  Precede each symbol with the file it came from.
 
-
 .. option:: --print-size, -S
 
  Show symbol size instead of address.
 
-
 .. option:: --size-sort
 
  Sort symbols by size.
 
-
 .. option:: --undefined-only, -u
 
  Print only symbols referenced but not defined in this file.
 
-
 BUGS
 ----
 
-
  * :program:`llvm-nm` cannot demangle C++ mangled names, like GNU :program:`nm`
    can.
 
  * :program:`llvm-nm` does not support the full set of arguments that GNU
    :program:`nm` does.
 
-
 EXIT STATUS
 -----------
 
-
 :program:`llvm-nm` exits with an exit code of zero.
 
-
 SEE ALSO
 --------
 
-
-llvm-dis|llvm-dis, ar(1), nm(1)
+llvm-dis, ar(1), nm(1)
diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst
index 6110d0b..7b02a78 100644
--- a/docs/CompilerWriterInfo.rst
+++ b/docs/CompilerWriterInfo.rst
@@ -39,7 +39,7 @@ Itanium (ia64)
 MIPS
 ----
 
-* `MIPS Processor Architecture <http://mips.com/content/Documentation/MIPSDocumentation/ProcessorArchitecture/doclibrary>`_
+* `MIPS Processor Architecture <http://imgtec.com/mips/mips-architectures.asp>`_
 
 PowerPC
 -------
@@ -124,6 +124,7 @@ Linux
 
 * `PowerPC 64-bit ELF ABI Supplement <http://www.linuxbase.org/spec/ELF/ppc64/>`_
 * `Procedure Call Standard for the AArch64 Architecture <http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055a/IHI0055A_aapcs64.pdf>`_
+* `ELF for the ARM Architecture <http://infocenter.arm.com/help/topic/com.arm.doc.ihi0044e/IHI0044E_aaelf.pdf>`_
 * `ELF for the ARM 64-bit Architecture (AArch64) <http://infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf>`_
 * `System z ELF ABI Supplement <http://legacy.redhat.com/pub/redhat/linux/7.1/es/os/s390x/doc/lzsabi0.pdf>`_
 
diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst
index 0655559..ea5a7d1 100644
--- a/docs/DeveloperPolicy.rst
+++ b/docs/DeveloperPolicy.rst
@@ -68,6 +68,9 @@ of bugs and enhancements occurring in LLVM.  We really appreciate people who are
 proactive at catching incoming bugs in their components and dealing with them
 promptly.
 
+Please be aware that all public LLVM mailing lists are public and archived, and
+that notices of confidentiality or non-disclosure cannot be respected.
+
 .. _patch:
 .. _one-off patches:
 
@@ -107,6 +110,10 @@ rather than ``Content-Disposition: attachment``. Apple Mail gamely displays such
 a file inline, making it difficult to work with for reviewers using that
 program.
 
+When submitting patches, please do not add confidentiality or non-disclosure
+notices to the patches themselves.  These notices conflict with the `LLVM
+License`_ and may result in your contribution being excluded.
+
 .. _code review:
 
 Code Reviews
@@ -128,7 +135,24 @@ software. We generally follow these policies:
    all necessary review-related changes.
 
 #. Code review can be an iterative process, which continues until the patch is
-   ready to be committed.
+   ready to be committed. Specifically, once a patch is sent out for review, it
+   needs an explicit "looks good" before it is submitted. Do not assume silent
+   approval, or request active objections to the patch with a deadline.
+
+Sometimes code reviews will take longer than you would hope for, especially for
+larger features. Accepted ways to speed up review times for your patches are:
+
+* Review other people's patches. If you help out, everybody will be more
+  willing to do the same for you; goodwill is our currency.
+* Ping the patch. If it is urgent, provide reasons why it is important to you to
+  get this patch landed and ping it every couple of days. If it is
+  not urgent, the common courtesy ping rate is one week. Remember that you're
+  asking for valuable time from other professional developers.
+* Ask for help on IRC. Developers on IRC will be able to either help you
+  directly, or tell you who might be a good reviewer.
+* Split your patch into multiple smaller patches that build on each other. The
+  smaller your patch, the higher the probability that somebody will take a quick
+  look at it.
 
 Developers should participate in code reviews as both reviewers and
 reviewees. If someone is kind enough to review your code, you should return the
diff --git a/docs/Extensions.rst b/docs/Extensions.rst
index 78ff874..e308dbc 100644
--- a/docs/Extensions.rst
+++ b/docs/Extensions.rst
@@ -14,6 +14,20 @@ Introduction
 This document describes extensions to tools and formats LLVM seeks compatibility
 with.
 
+General Assembly Syntax
+===========================
+
+C99-style Hexadecimal Floating-point Constants
+----------------------------------------------
+
+LLVM's assemblers allow floating-point constants to be written in C99's
+hexadecimal format instead of decimal if desired.
+
+.. code-block:: gas
+
+  .section .data
+  .float 0x1c2.2ap3
+
 Machine-specific Assembly Syntax
 ================================
 
@@ -91,3 +105,41 @@ Supported COMDAT types:
   .section .xdata$foo
   .linkonce associative .text$foo
     ...
+
+``.section`` Directive
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+MC supports passing the information in ``.linkonce`` at the end of
+``.section``. For example,  these two codes are equivalent
+
+.. code-block:: gas
+
+  .section secName, "dr", discard, "Symbol1"
+  .globl Symbol1
+  Symbol1:
+  .long 1
+
+.. code-block:: gas
+
+  .section secName, "dr"
+  .linkonce discard
+  .globl Symbol1
+  Symbol1:
+  .long 1
+
+Note that in the combined form the COMDAT symbol is explict. This
+extension exits to support multiple sections with the same name in
+different comdats:
+
+
+.. code-block:: gas
+
+  .section secName, "dr", discard, "Symbol1"
+  .globl Symbol1
+  Symbol1:
+  .long 1
+
+  .section secName, "dr", discard, "Symbol2"
+  .globl Symbol2
+  Symbol2:
+  .long 1
diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst
index b722277..73bc5ee 100644
--- a/docs/GarbageCollection.rst
+++ b/docs/GarbageCollection.rst
@@ -523,7 +523,7 @@ extension):
   $ cat sample.ll
   define void @f() gc "mygc" {
   entry:
-          ret void
+    ret void
   }
   $ llvm-as < sample.ll | llc -load=MyGC.so
 
@@ -896,21 +896,19 @@ in the JIT, nor using the object writers.
   namespace {
     class LLVM_LIBRARY_VISIBILITY MyGCPrinter : public GCMetadataPrinter {
     public:
-      virtual void beginAssembly(std::ostream &OS, AsmPrinter &AP,
-                                 const TargetAsmInfo &TAI);
+      virtual void beginAssembly(AsmPrinter &AP);
 
-      virtual void finishAssembly(std::ostream &OS, AsmPrinter &AP,
-                                  const TargetAsmInfo &TAI);
+      virtual void finishAssembly(AsmPrinter &AP);
     };
 
     GCMetadataPrinterRegistry::Add<MyGCPrinter>
     X("mygc", "My bespoke garbage collector.");
   }
 
-The collector should use ``AsmPrinter`` and ``TargetAsmInfo`` to print portable
-assembly code to the ``std::ostream``.  The collector itself contains the stack
-map for the entire module, and may access the ``GCFunctionInfo`` using its own
-``begin()`` and ``end()`` methods.  Here's a realistic example:
+The collector should use ``AsmPrinter`` to print portable assembly code.  The
+collector itself contains the stack map for the entire module, and may access
+the ``GCFunctionInfo`` using its own ``begin()`` and ``end()`` methods.  Here's
+a realistic example:
 
 .. code-block:: c++
 
@@ -920,85 +918,74 @@ map for the entire module, and may access the ``GCFunctionInfo`` using its own
   #include "llvm/Target/TargetAsmInfo.h"
   #include "llvm/Target/TargetMachine.h"
 
-  void MyGCPrinter::beginAssembly(std::ostream &OS, AsmPrinter &AP,
-                                  const TargetAsmInfo &TAI) {
+  void MyGCPrinter::beginAssembly(AsmPrinter &AP) {
     // Nothing to do.
   }
 
-  void MyGCPrinter::finishAssembly(std::ostream &OS, AsmPrinter &AP,
-                                   const TargetAsmInfo &TAI) {
-    // Set up for emitting addresses.
-    const char *AddressDirective;
-    int AddressAlignLog;
-    if (AP.TM.getDataLayout()->getPointerSize() == sizeof(int32_t)) {
-      AddressDirective = TAI.getData32bitsDirective();
-      AddressAlignLog = 2;
-    } else {
-      AddressDirective = TAI.getData64bitsDirective();
-      AddressAlignLog = 3;
-    }
+  void MyGCPrinter::finishAssembly(AsmPrinter &AP) {
+    MCStreamer &OS = AP.OutStreamer;
+    unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
 
     // Put this in the data section.
-    AP.SwitchToDataSection(TAI.getDataSection());
+    OS.SwitchSection(AP.getObjFileLowering().getDataSection());
 
     // For each function...
     for (iterator FI = begin(), FE = end(); FI != FE; ++FI) {
       GCFunctionInfo &MD = **FI;
 
-      // Emit this data structure:
+      // A compact GC layout. Emit this data structure:
       //
       // struct {
       //   int32_t PointCount;
-      //   struct {
-      //     void *SafePointAddress;
-      //     int32_t LiveCount;
-      //     int32_t LiveOffsets[LiveCount];
-      //   } Points[PointCount];
+      //   void *SafePointAddress[PointCount];
+      //   int32_t StackFrameSize; // in words
+      //   int32_t StackArity;
+      //   int32_t LiveCount;
+      //   int32_t LiveOffsets[LiveCount];
       // } __gcmap_<FUNCTIONNAME>;
 
       // Align to address width.
-      AP.EmitAlignment(AddressAlignLog);
-
-      // Emit the symbol by which the stack map entry can be found.
-      std::string Symbol;
-      Symbol += TAI.getGlobalPrefix();
-      Symbol += "__gcmap_";
-      Symbol += MD.getFunction().getName();
-      if (const char *GlobalDirective = TAI.getGlobalDirective())
-        OS << GlobalDirective << Symbol << "\n";
-      OS << TAI.getGlobalPrefix() << Symbol << ":\n";
+      AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
 
       // Emit PointCount.
+      OS.AddComment("safe point count");
       AP.EmitInt32(MD.size());
-      AP.EOL("safe point count");
 
       // And each safe point...
       for (GCFunctionInfo::iterator PI = MD.begin(),
-                                       PE = MD.end(); PI != PE; ++PI) {
-        // Align to address width.
-        AP.EmitAlignment(AddressAlignLog);
-
+                                    PE = MD.end(); PI != PE; ++PI) {
         // Emit the address of the safe point.
-        OS << AddressDirective
-           << TAI.getPrivateGlobalPrefix() << "label" << PI->Num;
-        AP.EOL("safe point address");
-
-        // Emit the stack frame size.
-        AP.EmitInt32(MD.getFrameSize());
-        AP.EOL("stack frame size");
-
-        // Emit the number of live roots in the function.
-        AP.EmitInt32(MD.live_size(PI));
-        AP.EOL("live root count");
-
-        // And for each live root...
-        for (GCFunctionInfo::live_iterator LI = MD.live_begin(PI),
-                                           LE = MD.live_end(PI);
-                                           LI != LE; ++LI) {
-          // Print its offset within the stack frame.
-          AP.EmitInt32(LI->StackOffset);
-          AP.EOL("stack offset");
-        }
+        OS.AddComment("safe point address");
+        MCSymbol *Label = PI->Label;
+        AP.EmitLabelPlusOffset(Label/*Hi*/, 0/*Offset*/, 4/*Size*/);
+      }
+
+      // Stack information never change in safe points! Only print info from the
+      // first call-site.
+      GCFunctionInfo::iterator PI = MD.begin();
+
+      // Emit the stack frame size.
+      OS.AddComment("stack frame size (in words)");
+      AP.EmitInt32(MD.getFrameSize() / IntPtrSize);
+
+      // Emit stack arity, i.e. the number of stacked arguments.
+      unsigned RegisteredArgs = IntPtrSize == 4 ? 5 : 6;
+      unsigned StackArity = MD.getFunction().arg_size() > RegisteredArgs ?
+                            MD.getFunction().arg_size() - RegisteredArgs : 0;
+      OS.AddComment("stack arity");
+      AP.EmitInt32(StackArity);
+
+      // Emit the number of live roots in the function.
+      OS.AddComment("live root count");
+      AP.EmitInt32(MD.live_size(PI));
+
+      // And for each live root...
+      for (GCFunctionInfo::live_iterator LI = MD.live_begin(PI),
+                                         LE = MD.live_end(PI);
+                                         LI != LE; ++LI) {
+        // Emit live root's offset within the stack frame.
+        OS.AddComment("stack index (offset / wordsize)");
+        AP.EmitInt32(LI->StackOffset);
       }
     }
   }
@@ -1026,4 +1013,3 @@ programming.
 
 [Henderson2002] `Accurate Garbage Collection in an Uncooperative Environment
 <http://citeseer.ist.psu.edu/henderson02accurate.html>`__
-
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index 40dfc45..2a6b637 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -217,9 +217,7 @@ uses the package and provides other details.
 +--------------------------------------------------------------+-----------------+---------------------------------------------+
 | `SVN <http://subversion.tigris.org/project_packages.html>`_  | >=1.3           | Subversion access to LLVM\ :sup:`2`         |
 +--------------------------------------------------------------+-----------------+---------------------------------------------+
-| `python <http://www.python.org/>`_                           | >=2.4           | Automated test suite\ :sup:`3`              |
-+--------------------------------------------------------------+-----------------+---------------------------------------------+
-| `perl <http://www.perl.com/download.csp>`_                   | >=5.6.0         | Utilities                                   |
+| `python <http://www.python.org/>`_                           | >=2.5           | Automated test suite\ :sup:`3`              |
 +--------------------------------------------------------------+-----------------+---------------------------------------------+
 | `GNU M4 <http://savannah.gnu.org/projects/m4>`_              | 1.4             | Macro processor for configuration\ :sup:`4` |
 +--------------------------------------------------------------+-----------------+---------------------------------------------+
@@ -459,15 +457,6 @@ The files are as follows, with *x.y* marking the version number:
 
   Source release for the LLVM test-suite.
 
-``llvm-gcc-4.2-x.y.source.tar.gz``
-
-  Source release of the llvm-gcc-4.2 front end.  See README.LLVM in the root
-  directory for build instructions.
-
-``llvm-gcc-4.2-x.y-platform.tar.gz``
-
-  Binary release of the llvm-gcc-4.2 front end for a specific platform.
-
 .. _checkout:
 
 Checkout LLVM from Subversion
@@ -490,6 +479,8 @@ you can checkout it from the '``tags``' directory (instead of '``trunk``'). The
 following releases are located in the following subdirectories of the '``tags``'
 directory:
 
+* Release 3.3: **RELEASE_33/final**
+* Release 3.2: **RELEASE_32/final**
 * Release 3.1: **RELEASE_31/final**
 * Release 3.0: **RELEASE_30/final**
 * Release 2.9: **RELEASE_29/final**
@@ -939,6 +930,10 @@ GCC compiler supports.
 The result of such a build is executables that are not runnable on on the build
 host (--build option) but can be executed on the compile host (--host option).
 
+Check :doc:`HowToCrossCompileLLVM` and `Clang docs on how to cross-compile in general
+<http://clang.llvm.org/docs/CrossCompilation.html>`_ for more information
+about cross-compiling.
+
 The Location of LLVM Object Files
 ---------------------------------
 
@@ -1312,7 +1307,7 @@ Example with clang
      Clang works just like GCC by default.  The standard -S and -c arguments
      work as usual (producing a native .s or .o file, respectively).
 
-#. Next, compile the C file into a LLVM bitcode file:
+#. Next, compile the C file into an LLVM bitcode file:
 
    .. code-block:: console
 
diff --git a/docs/GettingStartedVS.rst b/docs/GettingStartedVS.rst
index 9847c83..c46dc83 100644
--- a/docs/GettingStartedVS.rst
+++ b/docs/GettingStartedVS.rst
@@ -164,7 +164,7 @@ An Example Using the LLVM Tool Chain
         return 0;
       }
 
-2. Next, compile the C file into a LLVM bitcode file:
+2. Next, compile the C file into an LLVM bitcode file:
 
    .. code-block:: bat
 
diff --git a/docs/GoldPlugin.rst b/docs/GoldPlugin.rst
index 17bbeb8..28b202a 100644
--- a/docs/GoldPlugin.rst
+++ b/docs/GoldPlugin.rst
@@ -30,29 +30,22 @@ by running ``/usr/bin/ld -plugin``. If it complains "missing argument" then
 you have plugin support. If not, such as an "unknown option" error then you
 will either need to build gold or install a version with plugin support.
 
-* To build gold with plugin support:
+* Download, configure and build gold with plugin support:
 
   .. code-block:: bash
 
-     $ mkdir binutils
-     $ cd binutils
-     $ cvs -z 9 -d :pserver:anoncvs@sourceware.org:/cvs/src login
-     {enter "anoncvs" as the password}
-     $ cvs -z 9 -d :pserver:anoncvs@sourceware.org:/cvs/src co binutils
+     $ git clone --depth 1 git://sourceware.org/git/binutils-gdb.git binutils
      $ mkdir build
      $ cd build
-     $ ../src/configure --enable-gold --enable-plugins
+     $ ../binutils/configure --enable-gold --enable-plugins --disable-werror
      $ make all-gold
 
-  That should leave you with ``binutils/build/gold/ld-new`` which supports
-  the ``-plugin`` option. It also built would have
-  ``binutils/build/binutils/ar`` and ``nm-new`` which support plugins but
-  don't have a visible -plugin option, instead relying on the gold plugin
-  being present in ``../lib/bfd-plugins`` relative to where the binaries
-  are placed.
+  That should leave you with ``build/gold/ld-new`` which supports
+  the ``-plugin`` option. Running ``make`` will additionally build
+  ``build/binutils/ar`` and ``nm-new`` binaries supporting plugins.
 
 * Build the LLVMgold plugin: Configure LLVM with
-  ``--with-binutils-include=/path/to/binutils/src/include`` and run
+  ``--with-binutils-include=/path/to/binutils/include`` and run
   ``make``.
 
 Usage
@@ -66,17 +59,16 @@ look for the line where it runs ``collect2``. Replace that with
 ready to switch to using gold, backup your existing ``/usr/bin/ld``
 then replace it with ``ld-new``.
 
-You can produce bitcode files from ``clang`` using ``-emit-llvm`` or
-``-flto``, or the ``-O4`` flag which is synonymous with ``-O3 -flto``.
-
-Any of these flags will also cause ``clang`` to look for the gold plugin in
+You should produce bitcode files from ``clang`` with the option
+``-flto``. This flag will also cause ``clang`` to look for the gold plugin in
 the ``lib`` directory under its prefix and pass the ``-plugin`` option to
 ``ld``. It will not look for an alternate linker, which is why you need
 gold to be the installed system linker in your path.
 
-If you want ``ar`` and ``nm`` to work seamlessly as well, install
-``LLVMgold.so`` to ``/usr/lib/bfd-plugins``. If you built your own gold, be
-sure to install the ``ar`` and ``nm-new`` you built to ``/usr/bin``.
+``ar`` and ``nm`` also accept the ``-plugin`` option and it's possible to
+to install ``LLVMgold.so`` to ``/usr/lib/bfd-plugins`` for a seamless setup.
+If you built your own gold, be sure to install the ``ar`` and ``nm-new`` you
+built to ``/usr/bin``.
 
 
 Example of link time optimization
@@ -153,7 +145,6 @@ everything is in place for an easy to use LTO build of autotooled projects:
      export AR="$PREFIX/bin/ar"
      export NM="$PREFIX/bin/nm"
      export RANLIB=/bin/true #ranlib is not needed, and doesn't support .bc files in .a
-     export CFLAGS="-O4"
 
 * Or you can just set your path:
 
@@ -163,7 +154,6 @@ everything is in place for an easy to use LTO build of autotooled projects:
      export CC="clang -flto"
      export CXX="clang++ -flto"
      export RANLIB=/bin/true
-     export CFLAGS="-O4"
 * Configure and build the project as usual:
 
   .. code-block:: bash
diff --git a/docs/HistoricalNotes/2003-06-25-Reoptimizer1.txt b/docs/HistoricalNotes/2003-06-25-Reoptimizer1.txt
index a745784..521526f 100644
--- a/docs/HistoricalNotes/2003-06-25-Reoptimizer1.txt
+++ b/docs/HistoricalNotes/2003-06-25-Reoptimizer1.txt
@@ -132,6 +132,6 @@ is supposed to be cache-line-aligned, but it is not page-aligned.
 We generate instrumentation traces and optimized traces into separate
 trace caches. We keep the instrumented code around because you don't
 want to delete a trace when you still might have to return to it
-(i.e., return from a llvm_first_trigger() or countPath() call.)
+(i.e., return from an llvm_first_trigger() or countPath() call.)
 
 
diff --git a/docs/HowToBuildOnARM.rst b/docs/HowToBuildOnARM.rst
index 32ae39b..f2edaef 100644
--- a/docs/HowToBuildOnARM.rst
+++ b/docs/HowToBuildOnARM.rst
@@ -6,7 +6,11 @@ Introduction
 ============
 
 This document contains information about building/testing LLVM and
-Clang on ARM.
+Clang on an ARM machine.
+
+This document is *NOT* tailored to help you cross-compile LLVM/Clang
+to ARM on another architecture, for example an x86_64 machine. To find
+out more about cross-compiling, please check :doc:`HowToCrossCompileLLVM`.
 
 Notes On Building LLVM/Clang on ARM
 =====================================
@@ -17,19 +21,19 @@ on the ARMv6 and ARMv7 architectures and may be inapplicable to older chips.
 #. If you are building LLVM/Clang on an ARM board with 1G of memory or less,
    please use ``gold`` rather then GNU ``ld``.
    Building LLVM/Clang with ``--enable-optimized``
-   is prefered since it consumes less memory. Otherwise, the building
+   is preferred since it consumes less memory. Otherwise, the building
    process will very likely fail due to insufficient memory. In any
    case it is probably a good idea to set up a swap partition.
 
-#. If you want to run ``make
-   check-all`` after building LLVM/Clang, to avoid false alarms (eg, ARCMT
-   failure) please use at least the following configuration:
+#. If you want to run ``make check-all`` after building LLVM/Clang, to avoid
+   false alarms (e.g., ARCMT failure) please use at least the following
+   configuration:
 
    .. code-block:: bash
 
      $ ../$LLVM_SRC_DIR/configure --with-abi=aapcs-vfp
 
-#. The most popular linaro/ubuntu OS's for ARM boards, eg, the
+#. The most popular Linaro/Ubuntu OS's for ARM boards, e.g., the
    Pandaboard, have become hard-float platforms. The following set
    of configuration options appears to be a good choice for this
    platform:
@@ -41,3 +45,25 @@ on the ARMv6 and ARMv7 architectures and may be inapplicable to older chips.
      --target=armv7l-unknown-linux-gnueabihf --with-cpu=cortex-a9 \
      --with-float=hard --with-abi=aapcs-vfp --with-fpu=neon \
      --enable-targets=arm --enable-optimized --enable-assertions
+
+#. ARM development boards can be unstable and you may experience that cores
+   are disappearing, caches being flushed on every big.LITTLE switch, and
+   other similar issues.  To help ease the effect of this, set the Linux
+   scheduler to "performance" on **all** cores using this little script:
+
+   .. code-block:: bash
+
+      # The code below requires the package 'cpufrequtils' to be installed.
+      for ((cpu=0; cpu<`grep -c proc /proc/cpuinfo`; cpu++)); do
+          sudo cpufreq-set -c $cpu -g performance
+      done
+
+#. Running the build on SD cards is ok, but they are more prone to failures
+   than good quality USB sticks, and those are more prone to failures than
+   external hard-drives (those are also a lot faster). So, at least, you
+   should consider to buy a fast USB stick.  On systems with a fast eMMC,
+   that's a good option too.
+
+#. Make sure you have a decent power supply (dozens of dollars worth) that can
+   provide *at least* 4 amperes, this is especially important if you use USB
+   devices with your board.
diff --git a/docs/HowToCrossCompileLLVM.rst b/docs/HowToCrossCompileLLVM.rst
new file mode 100644
index 0000000..1072517
--- /dev/null
+++ b/docs/HowToCrossCompileLLVM.rst
@@ -0,0 +1,175 @@
+===================================================================
+How To Cross-Compile Clang/LLVM using Clang/LLVM
+===================================================================
+
+Introduction
+============
+
+This document contains information about building LLVM and
+Clang on host machine, targeting another platform.
+
+For more information on how to use Clang as a cross-compiler,
+please check http://clang.llvm.org/docs/CrossCompilation.html.
+
+TODO: Add MIPS and other platforms to this document.
+
+Cross-Compiling from x86_64 to ARM
+==================================
+
+In this use case, we'll be using CMake and Ninja, on a Debian-based Linux
+system, cross-compiling from an x86_64 host (most Intel and AMD chips
+nowadays) to a hard-float ARM target (most ARM targets nowadays).
+
+The packages you'll need are:
+
+ * ``cmake``
+ * ``ninja-build`` (from backports in Ubuntu)
+ * ``gcc-4.7-arm-linux-gnueabihf``
+ * ``gcc-4.7-multilib-arm-linux-gnueabihf``
+ * ``binutils-arm-linux-gnueabihf``
+ * ``libgcc1-armhf-cross``
+ * ``libsfgcc1-armhf-cross``
+ * ``libstdc++6-armhf-cross``
+ * ``libstdc++6-4.7-dev-armhf-cross``
+
+Configuring CMake
+-----------------
+
+For more information on how to configure CMake for LLVM/Clang,
+see :doc:`CMake`.
+
+The CMake options you need to add are:
+ * ``-DCMAKE_CROSSCOMPILING=True``
+ * ``-DCMAKE_INSTALL_PREFIX=<install-dir>``
+ * ``-DLLVM_TABLEGEN=<path-to-host-bin>/llvm-tblgen``
+ * ``-DCLANG_TABLEGEN=<path-to-host-bin>/clang-tblgen``
+ * ``-DLLVM_DEFAULT_TARGET_TRIPLE=arm-linux-gnueabihf``
+ * ``-DLLVM_TARGET_ARCH=ARM``
+ * ``-DLLVM_TARGETS_TO_BUILD=ARM``
+ * ``-DCMAKE_CXX_FLAGS='-target armv7a-linux-gnueabihf -mcpu=cortex-a9 -I/usr/arm-linux-gnueabihf/include/c++/4.7.2/arm-linux-gnueabihf/ -I/usr/arm-linux-gnueabihf/include/ -mfloat-abi=hard -ccc-gcc-name arm-linux-gnueabihf-gcc'``
+
+The TableGen options are required to compile it with the host compiler,
+so you'll need to compile LLVM (or at least ``llvm-tblgen``) to your host
+platform before you start. The CXX flags define the target, cpu (which
+defaults to ``fpu=VFP3`` with NEON), and forcing the hard-float ABI. If you're
+using Clang as a cross-compiler, you will *also* have to set ``-ccc-gcc-name``,
+to make sure it picks the correct linker.
+
+Most of the time, what you want is to have a native compiler to the
+platform itself, but not others. It might not even be feasible to
+produce x86 binaries from ARM targets, so there's no point in compiling
+all back-ends. For that reason, you should also set the
+``TARGETS_TO_BUILD`` to only build the ARM back-end.
+
+You must set the ``CMAKE_INSTALL_PREFIX``, otherwise a ``ninja install``
+will copy ARM binaries to your root filesystem, which is not what you
+want.
+
+Hacks
+-----
+
+There are some bugs in current LLVM, which require some fiddling before
+running CMake:
+
+#. If you're using Clang as the cross-compiler, there is a problem in
+   the LLVM ARM back-end that is producing absolute relocations on
+   position-independent code (``R_ARM_THM_MOVW_ABS_NC``), so for now, you
+   should disable PIC:
+
+   .. code-block:: bash
+
+      -DLLVM_ENABLE_PIC=False
+
+   This is not a problem, since Clang/LLVM libraries are statically
+   linked anyway, it shouldn't affect much.
+
+#. The ARM libraries won't be installed in your system, and possibly
+   not easily installable anyway, so you'll have to build/download
+   them separately. But the CMake prepare step, which checks for
+   dependencies, will check the *host* libraries, not the *target*
+   ones.
+
+   A quick way of getting the libraries is to download them from
+   a distribution repository, like Debian (http://packages.debian.org/wheezy/),
+   and download the missing libraries. Note that the ``libXXX``
+   will have the shared objects (``.so``) and the ``libXXX-dev`` will
+   give you the headers and the static (``.a``) library. Just in
+   case, download both.
+
+   The ones you need for ARM are: ``libtinfo``, ``zlib1g``,
+   ``libxml2`` and ``liblzma``. In the Debian repository you'll
+   find downloads for all architectures.
+
+   After you download and unpack all ``.deb`` packages, copy all
+   ``.so`` and ``.a`` to a directory, make the appropriate
+   symbolic links (if necessary), and add the relevant ``-L``
+   and ``-I`` paths to ``-DCMAKE_CXX_FLAGS`` above.
+
+
+Running CMake and Building
+--------------------------
+
+Finally, if you're using your platform compiler, run:
+
+   .. code-block:: bash
+
+     $ cmake -G Ninja <source-dir> <options above>
+
+If you're using Clang as the cross-compiler, run:
+
+   .. code-block:: bash
+
+     $ CC='clang' CXX='clang++' cmake -G Ninja <source-dir> <options above>
+
+If you have ``clang``/``clang++`` on the path, it should just work, and special
+Ninja files will be created in the build directory. I strongly suggest
+you to run ``cmake`` on a separate build directory, *not* inside the
+source tree.
+
+To build, simply type:
+
+   .. code-block:: bash
+
+     $ ninja
+
+It should automatically find out how many cores you have, what are
+the rules that needs building and will build the whole thing.
+
+You can't run ``ninja check-all`` on this tree because the created
+binaries are targeted to ARM, not x86_64.
+
+Installing and Using
+--------------------
+
+After the LLVM/Clang has built successfully, you should install it
+via:
+
+   .. code-block:: bash
+
+     $ ninja install
+
+which will create a sysroot on the install-dir. You can then tar
+that directory into a binary with the full triple name (for easy
+identification), like:
+
+   .. code-block:: bash
+
+     $ ln -sf <install-dir> arm-linux-gnueabihf-clang
+     $ tar zchf arm-linux-gnueabihf-clang.tar.gz arm-linux-gnueabihf-clang
+
+If you copy that tarball to your target board, you'll be able to use
+it for running the test-suite, for example. Follow the guidelines at
+http://llvm.org/docs/lnt/quickstart.html, unpack the tarball in the
+test directory, and use options:
+
+   .. code-block:: bash
+
+     $ ./sandbox/bin/python sandbox/bin/lnt runtest nt \
+         --sandbox sandbox \
+         --test-suite `pwd`/test-suite \
+         --cc `pwd`/arm-linux-gnueabihf-clang/bin/clang \
+         --cxx `pwd`/arm-linux-gnueabihf-clang/bin/clang++
+
+Remember to add the ``-jN`` options to ``lnt`` to the number of CPUs
+on your board. Also, the path to your clang has to be absolute, so
+you'll need the `pwd` trick above.
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index b69e2a3..810455c 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -128,7 +128,9 @@ lexical features of LLVM:
 #. Unnamed temporaries are created when the result of a computation is
    not assigned to a named value.
 #. Unnamed temporaries are numbered sequentially (using a per-function
-   incrementing counter, starting with 0).
+   incrementing counter, starting with 0). Note that basic blocks are
+   included in this numbering. For example, if the entry basic block is not
+   given a label name, then it will get number 0.
 
 It also shows a convention that we follow in this document. When
 demonstrating instructions, we will follow an instruction with a comment
@@ -267,13 +269,6 @@ linkage:
     ``linkonce_odr`` and ``weak_odr`` linkage types to indicate that the
     global will only be merged with equivalent globals. These linkage
     types are otherwise the same as their non-``odr`` versions.
-``linkonce_odr_auto_hide``
-    Similar to "``linkonce_odr``", but nothing in the translation unit
-    takes the address of this definition. For instance, functions that
-    had an inline definition, but the compiler decided not to inline it.
-    ``linkonce_odr_auto_hide`` may have only ``default`` visibility. The
-    symbols are removed by the linker from the final linked image
-    (executable or dynamic library).
 ``external``
     If none of the above identifiers are used, the global is externally
     visible, meaning that it participates in linkage and can be used to
@@ -305,9 +300,6 @@ declarations), they are accessible outside of the current module.
 It is illegal for a function *declaration* to have any linkage type
 other than ``external``, ``dllimport`` or ``extern_weak``.
 
-Aliases can have only ``external``, ``internal``, ``weak`` or
-``weak_odr`` linkages.
-
 .. _callingconv:
 
 Calling Conventions
@@ -448,9 +440,13 @@ Global Variables
 ----------------
 
 Global variables define regions of memory allocated at compilation time
-instead of run-time. Global variables may optionally be initialized, may
-have an explicit section to be placed in, and may have an optional
-explicit alignment specified.
+instead of run-time.
+
+Global variables definitions must be initialized, may have an explicit section
+to be placed in, and may have an optional explicit alignment specified.
+
+Global variables in other translation units can also be declared, in which
+case they don't have an initializer.
 
 A variable may be defined as ``thread_local``, which means that it will
 not be shared by threads (each thread will have a separated copy of the
@@ -532,6 +528,12 @@ with an initializer, section, and alignment:
 
     @G = addrspace(5) constant float 1.0, section "foo", align 4
 
+The following example just declares a global variable
+
+.. code-block:: llvm
+
+   @G = external global i32
+
 The following example defines a thread-local global with the
 ``initialexec`` TLS model:
 
@@ -552,27 +554,26 @@ an optional ``unnamed_addr`` attribute, a return type, an optional
 name, a (possibly empty) argument list (each with optional :ref:`parameter
 attributes <paramattrs>`), optional :ref:`function attributes <fnattrs>`,
 an optional section, an optional alignment, an optional :ref:`garbage
-collector name <gc>`, an opening curly brace, a list of basic blocks,
-and a closing curly brace.
+collector name <gc>`, an optional :ref:`prefix <prefixdata>`, an opening
+curly brace, a list of basic blocks, and a closing curly brace.
 
 LLVM function declarations consist of the "``declare``" keyword, an
 optional :ref:`linkage type <linkage>`, an optional :ref:`visibility
 style <visibility>`, an optional :ref:`calling convention <callingconv>`,
 an optional ``unnamed_addr`` attribute, a return type, an optional
 :ref:`parameter attribute <paramattrs>` for the return type, a function
-name, a possibly empty list of arguments, an optional alignment, and an
-optional :ref:`garbage collector name <gc>`.
-
-A function definition contains a list of basic blocks, forming the CFG
-(Control Flow Graph) for the function. Each basic block may optionally
-start with a label (giving the basic block a symbol table entry),
-contains a list of instructions, and ends with a
-:ref:`terminator <terminators>` instruction (such as a branch or function
-return). If explicit label is not provided, a block is assigned an
-implicit numbered label, using a next value from the same counter as used
-for unnamed temporaries (:ref:`see above<identifiers>`). For example, if a
-function entry block does not have explicit label, it will be assigned
-label "%0", then first unnamed temporary in that block will be "%1", etc.
+name, a possibly empty list of arguments, an optional alignment, an optional
+:ref:`garbage collector name <gc>` and an optional :ref:`prefix <prefixdata>`.
+
+A function definition contains a list of basic blocks, forming the CFG (Control
+Flow Graph) for the function. Each basic block may optionally start with a label
+(giving the basic block a symbol table entry), contains a list of instructions,
+and ends with a :ref:`terminator <terminators>` instruction (such as a branch or
+function return). If an explicit label is not provided, a block is assigned an
+implicit numbered label, using the next value from the same counter as used for
+unnamed temporaries (:ref:`see above<identifiers>`). For example, if a function
+entry block does not have an explicit label, it will be assigned label "%0",
+then the first unnamed temporary in that block will be "%1", etc.
 
 The first basic block in a function is special in two ways: it is
 immediately executed on entrance to the function, and it is not allowed
@@ -598,7 +599,7 @@ Syntax::
            [cconv] [ret attrs]
            <ResultType> @<FunctionName> ([argument list])
            [fn Attrs] [section "name"] [align N]
-           [gc] { ... }
+           [gc] [prefix Constant] { ... }
 
 .. _langref_aliases:
 
@@ -614,6 +615,12 @@ Syntax::
 
     @<Name> = alias [Linkage] [Visibility] <AliaseeTy> @<Aliasee>
 
+The linkage must be one of ``private``, ``linker_private``,
+``linker_private_weak``, ``internal``, ``linkonce``, ``weak``,
+``linkonce_odr``, ``weak_odr``, ``external``. Note that some system linkers
+might not correctly handle dropping a weak symbol that is aliased by a non weak
+alias.
+
 .. _namedmetadatastructure:
 
 Named Metadata
@@ -757,6 +764,55 @@ The compiler declares the supported values of *name*. Specifying a
 collector which will cause the compiler to alter its output in order to
 support the named garbage collection algorithm.
 
+.. _prefixdata:
+
+Prefix Data
+-----------
+
+Prefix data is data associated with a function which the code generator
+will emit immediately before the function body.  The purpose of this feature
+is to allow frontends to associate language-specific runtime metadata with
+specific functions and make it available through the function pointer while
+still allowing the function pointer to be called.  To access the data for a
+given function, a program may bitcast the function pointer to a pointer to
+the constant's type.  This implies that the IR symbol points to the start
+of the prefix data.
+
+To maintain the semantics of ordinary function calls, the prefix data must
+have a particular format.  Specifically, it must begin with a sequence of
+bytes which decode to a sequence of machine instructions, valid for the
+module's target, which transfer control to the point immediately succeeding
+the prefix data, without performing any other visible action.  This allows
+the inliner and other passes to reason about the semantics of the function
+definition without needing to reason about the prefix data.  Obviously this
+makes the format of the prefix data highly target dependent.
+
+Prefix data is laid out as if it were an initializer for a global variable
+of the prefix data's type.  No padding is automatically placed between the
+prefix data and the function body.  If padding is required, it must be part
+of the prefix data.
+
+A trivial example of valid prefix data for the x86 architecture is ``i8 144``,
+which encodes the ``nop`` instruction:
+
+.. code-block:: llvm
+
+    define void @f() prefix i8 144 { ... }
+
+Generally prefix data can be formed by encoding a relative branch instruction
+which skips the metadata, as in this example of valid prefix data for the
+x86_64 architecture, where the first two bytes encode ``jmp .+10``:
+
+.. code-block:: llvm
+
+    %0 = type <{ i8, i8, i8* }>
+
+    define void @f() prefix %0 <{ i8 235, i8 8, i8* @md}> { ... }
+
+A function may have prefix data but no body.  This has similar semantics
+to the ``available_externally`` linkage in that the data may be used by the
+optimizers but will not be emitted in the object file.
+
 .. _attrgrp:
 
 Attribute Groups
@@ -833,6 +889,11 @@ example:
     inlining this function is desirable (such as the "inline" keyword in
     C/C++). It is just a hint; it imposes no requirements on the
     inliner.
+``minsize``
+    This attribute suggests that optimization passes and code generator
+    passes make choices that keep the code size of this function as small
+    as possible and perform optimizations that may sacrifice runtime
+    performance in order to minimize the size of the generated code.
 ``naked``
     This attribute disables prologue / epilogue emission for the
     function. This can have very system-specific consequences.
@@ -874,10 +935,23 @@ example:
     This function attribute indicates that the function never returns
     with an unwind or exceptional control flow. If the function does
     unwind, its runtime behavior is undefined.
+``optnone``
+    This function attribute indicates that the function is not optimized
+    by any optimization or code generator passes with the
+    exception of interprocedural optimization passes.
+    This attribute cannot be used together with the ``alwaysinline``
+    attribute; this attribute is also incompatible
+    with the ``minsize`` attribute and the ``optsize`` attribute.
+
+    This attribute requires the ``noinline`` attribute to be specified on
+    the function as well, so the function is never inlined into any caller.
+    Only functions with the ``alwaysinline`` attribute are valid
+    candidates for inlining into the body of this function.
 ``optsize``
     This attribute suggests that optimization passes and code generator
     passes make choices that keep the code size of this function low,
-    and otherwise do optimizations specifically to reduce code size.
+    and otherwise do optimizations specifically to reduce code size as
+    long as they do not significantly impact runtime performance.
 ``readnone``
     On a function, this attribute indicates that the function computes its
     result (or decides to unwind an exception) based strictly on its arguments,
@@ -887,7 +961,7 @@ example:
     (including ``byval`` arguments) and never changes any state visible
     to callers. This means that it cannot unwind exceptions by calling
     the ``C++`` exception throwing methods.
-    
+
     On an argument, this attribute indicates that the function does not
     dereference that pointer argument, even though it may read or write the
     memory that the pointer points to if accessed through other pointers.
@@ -901,7 +975,7 @@ example:
     called with the same set of arguments and global state. It cannot
     unwind an exception by calling the ``C++`` exception throwing
     methods.
-    
+
     On an argument, this attribute indicates that the function does not write
     through this pointer argument, even though it may write to the memory that
     the pointer points to.
@@ -1109,6 +1183,30 @@ don't have to specify the string. This will disable some optimizations
 that require precise layout information, but this also prevents those
 optimizations from introducing target specificity into the IR.
 
+.. _langref_triple:
+
+Target Triple
+-------------
+
+A module may specify a target triple string that describes the target
+host. The syntax for the target triple is simply:
+
+.. code-block:: llvm
+
+    target triple = "x86_64-apple-macosx10.7.0"
+
+The *target triple* string consists of a series of identifiers delimited
+by the minus sign character ('-'). The canonical forms are:
+
+::
+
+    ARCHITECTURE-VENDOR-OPERATING_SYSTEM
+    ARCHITECTURE-VENDOR-OPERATING_SYSTEM-ENVIRONMENT
+
+This information is passed along to the backend so that it generates
+code for the proper architecture. It's possible to override this on the
+command line with the ``-mtriple`` command line option.
+
 .. _pointeraliasing:
 
 Pointer Aliasing Rules
@@ -1659,9 +1757,10 @@ Function Type
 Overview:
 """""""""
 
-The function type can be thought of as a function signature. It consists
-of a return type and a list of formal parameter types. The return type
-of a function type is a first class type or a void type.
+The function type can be thought of as a function signature. It consists of a
+return type and a list of formal parameter types. The return type of a function
+type is a void type or first class type --- except for :ref:`label <t_label>`
+and :ref:`metadata <t_metadata>` types.
 
 Syntax:
 """""""
@@ -1671,11 +1770,11 @@ Syntax:
       <returntype> (<parameter list>)
 
 ...where '``<parameter list>``' is a comma-separated list of type
-specifiers. Optionally, the parameter list may include a type ``...``,
-which indicates that the function takes a variable number of arguments.
-Variable argument functions can access their arguments with the
-:ref:`variable argument handling intrinsic <int_varargs>` functions.
-'``<returntype>``' is any type except :ref:`label <t_label>`.
+specifiers. Optionally, the parameter list may include a type ``...``, which
+indicates that the function takes a variable number of arguments.  Variable
+argument functions can access their arguments with the :ref:`variable argument
+handling intrinsic <int_varargs>` functions.  '``<returntype>``' is any type
+except :ref:`label <t_label>` and :ref:`metadata <t_metadata>`.
 
 Examples:
 """""""""
@@ -2286,6 +2385,10 @@ The following is the syntax for constant expressions:
     Convert a constant, CST, to another TYPE. The constraints of the
     operands are the same as those for the :ref:`bitcast
     instruction <i_bitcast>`.
+``addrspacecast (CST to TYPE)``
+    Convert a constant pointer or constant vector of pointer, CST, to another
+    TYPE in a different address space. The constraints of the operands are the
+    same as those for the :ref:`addrspacecast instruction <i_addrspacecast>`.
 ``getelementptr (CSTPTR, IDX0, IDX1, ...)``, ``getelementptr inbounds (CSTPTR, IDX0, IDX1, ...)``
     Perform the :ref:`getelementptr operation <i_getelementptr>` on
     constants. As with the :ref:`getelementptr <i_getelementptr>`
@@ -5630,9 +5733,9 @@ is always a *no-op cast* because no bits change with this
 conversion. The conversion is done as if the ``value`` had been stored
 to memory and read back as type ``ty2``. Pointer (or vector of
 pointers) types may only be converted to other pointer (or vector of
-pointers) types with this instruction if the pointer sizes are
-equal. To convert pointers to other types, use the :ref:`inttoptr
-<i_inttoptr>` or :ref:`ptrtoint <i_ptrtoint>` instructions first.
+pointers) types with the same address space through this instruction.
+To convert pointers to other types, use the :ref:`inttoptr <i_inttoptr>`
+or :ref:`ptrtoint <i_ptrtoint>` instructions first.
 
 Example:
 """"""""
@@ -5644,6 +5747,51 @@ Example:
       %Z = bitcast <2 x int> %V to i64;        ; yields i64: %V
       %Z = bitcast <2 x i32*> %V to <2 x i64*> ; yields <2 x i64*>
 
+.. _i_addrspacecast:
+
+'``addrspacecast .. to``' Instruction
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      <result> = addrspacecast <pty> <ptrval> to <pty2>       ; yields pty2
+
+Overview:
+"""""""""
+
+The '``addrspacecast``' instruction converts ``ptrval`` from ``pty`` in
+address space ``n`` to type ``pty2`` in address space ``m``.
+
+Arguments:
+""""""""""
+
+The '``addrspacecast``' instruction takes a pointer or vector of pointer value
+to cast and a pointer type to cast it to, which must have a different
+address space.
+
+Semantics:
+""""""""""
+
+The '``addrspacecast``' instruction converts the pointer value
+``ptrval`` to type ``pty2``. It can be a *no-op cast* or a complex
+value modification, depending on the target and the address space
+pair. Pointer conversions within the same address space must be
+performed with the ``bitcast`` instruction. Note that if the address space
+conversion is legal then both result and operand refer to the same memory
+location.
+
+Example:
+""""""""
+
+.. code-block:: llvm
+
+      %X = addrspacecast i32* %x to i32 addrspace(1)*    ; yields i32 addrspace(1)*:%x
+      %Y = addrspacecast i32 addrspace(1)* %y to i64 addrspace(2)*    ; yields i64 addrspace(2)*:%y
+      %Z = addrspacecast <4 x i32*> %z to <4 x float addrspace(3)*>   ; yields <4 x float addrspace(3)*>:%z
+
 .. _otherops:
 
 Other Operations
@@ -6293,7 +6441,7 @@ Syntax:
 
 ::
 
-      declare void %llvm.va_start(i8* <arglist>)
+      declare void @llvm.va_start(i8* <arglist>)
 
 Overview:
 """""""""
@@ -6805,7 +6953,7 @@ The '``llvm.memcpy.*``' intrinsics copy a block of memory from the
 source location to the destination location, which are not allowed to
 overlap. It copies "len" bytes of memory over. If the argument is known
 to be aligned to some boundary, this can be specified as the fourth
-argument, otherwise it should be set to 0 or 1.
+argument, otherwise it should be set to 0 or 1 (both meaning no alignment).
 
 '``llvm.memmove``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -6860,7 +7008,7 @@ The '``llvm.memmove.*``' intrinsics copy a block of memory from the
 source location to the destination location, which may overlap. It
 copies "len" bytes of memory over. If the argument is known to be
 aligned to some boundary, this can be specified as the fourth argument,
-otherwise it should be set to 0 or 1.
+otherwise it should be set to 0 or 1 (both meaning no alignment).
 
 '``llvm.memset.*``' Intrinsics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -6911,7 +7059,7 @@ Semantics:
 The '``llvm.memset.*``' intrinsics fill "len" bytes of memory starting
 at the destination location. If the argument is known to be aligned to
 some boundary, this can be specified as the fourth argument, otherwise
-it should be set to 0 or 1.
+it should be set to 0 or 1 (both meaning no alignment).
 
 '``llvm.sqrt.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -7347,6 +7495,42 @@ Semantics:
 This function returns the same values as the libm ``fabs`` functions
 would, and handles error conditions in the same way.
 
+'``llvm.copysign.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.copysign`` on any
+floating point or vector of floating point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.copysign.f32(float  %Mag, float  %Sgn)
+      declare double    @llvm.copysign.f64(double %Mag, double %Sgn)
+      declare x86_fp80  @llvm.copysign.f80(x86_fp80  %Mag, x86_fp80  %Sgn)
+      declare fp128     @llvm.copysign.f128(fp128 %Mag, fp128 %Sgn)
+      declare ppc_fp128 @llvm.copysign.ppcf128(ppc_fp128  %Mag, ppc_fp128  %Sgn)
+
+Overview:
+"""""""""
+
+The '``llvm.copysign.*``' intrinsics return a value with the magnitude of the
+first operand and the sign of the second operand.
+
+Arguments:
+""""""""""
+
+The arguments and return value are floating point numbers of the same
+type.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``copysign``
+functions would, and handles error conditions in the same way.
+
 '``llvm.floor.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -7526,6 +7710,42 @@ Semantics:
 This function returns the same values as the libm ``nearbyint``
 functions would, and handles error conditions in the same way.
 
+'``llvm.round.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.round`` on any
+floating point or vector of floating point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.round.f32(float  %Val)
+      declare double    @llvm.round.f64(double %Val)
+      declare x86_fp80  @llvm.round.f80(x86_fp80  %Val)
+      declare fp128     @llvm.round.f128(fp128 %Val)
+      declare ppc_fp128 @llvm.round.ppcf128(ppc_fp128  %Val)
+
+Overview:
+"""""""""
+
+The '``llvm.round.*``' intrinsics returns the operand rounded to the
+nearest integer.
+
+Arguments:
+""""""""""
+
+The argument and return value are floating point numbers of the same
+type.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``round``
+functions would, and handles error conditions in the same way.
+
 Bit Manipulation Intrinsics
 ---------------------------
 
@@ -8599,14 +8819,52 @@ enough space to hold the value of the guard.
 Semantics:
 """"""""""
 
-This intrinsic causes the prologue/epilogue inserter to force the
-position of the ``AllocaInst`` stack slot to be before local variables
-on the stack. This is to ensure that if a local variable on the stack is
-overwritten, it will destroy the value of the guard. When the function
-exits, the guard on the stack is checked against the original guard. If
-they are different, then the program aborts by calling the
+This intrinsic causes the prologue/epilogue inserter to force the position of
+the ``AllocaInst`` stack slot to be before local variables on the stack. This is
+to ensure that if a local variable on the stack is overwritten, it will destroy
+the value of the guard. When the function exits, the guard on the stack is
+checked against the original guard by ``llvm.stackprotectorcheck``. If they are
+different, then ``llvm.stackprotectorcheck`` causes the program to abort by
+calling the ``__stack_chk_fail()`` function.
+
+'``llvm.stackprotectorcheck``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare void @llvm.stackprotectorcheck(i8** <guard>)
+
+Overview:
+"""""""""
+
+The ``llvm.stackprotectorcheck`` intrinsic compares ``guard`` against an already
+created stack protector and if they are not equal calls the
 ``__stack_chk_fail()`` function.
 
+Arguments:
+""""""""""
+
+The ``llvm.stackprotectorcheck`` intrinsic requires one pointer argument, the
+the variable ``@__stack_chk_guard``.
+
+Semantics:
+""""""""""
+
+This intrinsic is provided to perform the stack protector check by comparing
+``guard`` with the stack slot created by ``llvm.stackprotector`` and if the
+values do not match call the ``__stack_chk_fail()`` function.
+
+The reason to provide this as an IR level intrinsic instead of implementing it
+via other IR operations is that in order to perform this operation at the IR
+level without an intrinsic, one would need to create additional basic blocks to
+handle the success/failure cases. This makes it difficult to stop the stack
+protector check from disrupting sibling tail calls in Codegen. With this
+intrinsic, we are able to generate the stack protector basic blocks late in
+codegen after the tail call decision has occurred.
+
 '``llvm.objectsize``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/docs/MCJIT-creation.png b/docs/MCJIT-creation.png
new file mode 100644
index 0000000..7abdb9d
--- /dev/null
+++ b/docs/MCJIT-creation.png
diff --git a/docs/MCJIT-dyld-load.png b/docs/MCJIT-dyld-load.png
new file mode 100644
index 0000000..1534190
--- /dev/null
+++ b/docs/MCJIT-dyld-load.png
diff --git a/docs/MCJIT-engine-builder.png b/docs/MCJIT-engine-builder.png
new file mode 100644
index 0000000..8fdd7a9
--- /dev/null
+++ b/docs/MCJIT-engine-builder.png
diff --git a/docs/MCJIT-load-object.png b/docs/MCJIT-load-object.png
new file mode 100644
index 0000000..533a52e
--- /dev/null
+++ b/docs/MCJIT-load-object.png
diff --git a/docs/MCJIT-load.png b/docs/MCJIT-load.png
new file mode 100644
index 0000000..9672a66
--- /dev/null
+++ b/docs/MCJIT-load.png
diff --git a/docs/MCJIT-resolve-relocations.png b/docs/MCJIT-resolve-relocations.png
new file mode 100644
index 0000000..fedeacc
--- /dev/null
+++ b/docs/MCJIT-resolve-relocations.png
diff --git a/docs/MCJITDesignAndImplementation.rst b/docs/MCJITDesignAndImplementation.rst
new file mode 100644
index 0000000..2cb6296
--- /dev/null
+++ b/docs/MCJITDesignAndImplementation.rst
@@ -0,0 +1,180 @@
+===============================
+MCJIT Design and Implementation
+===============================
+
+Introduction
+============
+
+This document describes the internal workings of the MCJIT execution
+engine and the RuntimeDyld component.  It is intended as a high level
+overview of the implementation, showing the flow and interactions of
+objects throughout the code generation and dynamic loading process.
+
+Engine Creation
+===============
+
+In most cases, an EngineBuilder object is used to create an instance of
+the MCJIT execution engine.  The EngineBuilder takes an llvm::Module
+object as an argument to its constructor.  The client may then set various
+options that we control the later be passed along to the MCJIT engine,
+including the selection of MCJIT as the engine type to be created.
+Of particular interest is the EngineBuilder::setMCJITMemoryManager
+function.  If the client does not explicitly create a memory manager at
+this time, a default memory manager (specifically SectionMemoryManager)
+will be created when the MCJIT engine is instantiated.
+
+Once the options have been set, a client calls EngineBuilder::create to
+create an instance of the MCJIT engine.  If the client does not use the
+form of this function that takes a TargetMachine as a parameter, a new
+TargetMachine will be created based on the target triple associated with
+the Module that was used to create the EngineBuilder.
+
+.. image:: MCJIT-engine-builder.png
+ 
+EngineBuilder::create will call the static MCJIT::createJIT function,
+passing in its pointers to the module, memory manager and target machine
+objects, all of which will subsequently be owned by the MCJIT object.
+
+The MCJIT class has a member variable, Dyld, which contains an instance of
+the RuntimeDyld wrapper class.  This member will be used for
+communications between MCJIT and the actual RuntimeDyldImpl object that
+gets created when an object is loaded.
+
+.. image:: MCJIT-creation.png
+ 
+Upon creation, MCJIT holds a pointer to the Module object that it received
+from EngineBuilder but it does not immediately generate code for this
+module.  Code generation is deferred until either the
+MCJIT::finalizeObject method is called explicitly or a function such as
+MCJIT::getPointerToFunction is called which requires the code to have been
+generated.
+
+Code Generation
+===============
+
+When code generation is triggered, as described above, MCJIT will first
+attempt to retrieve an object image from its ObjectCache member, if one
+has been set.  If a cached object image cannot be retrieved, MCJIT will
+call its emitObject method.  MCJIT::emitObject uses a local PassManager
+instance and creates a new ObjectBufferStream instance, both of which it
+passes to TargetManager::addPassesToEmitMC before calling PassManager::run
+on the Module with which it was created.
+
+.. image:: MCJIT-load.png
+ 
+The PassManager::run call causes the MC code generation mechanisms to emit
+a complete relocatable binary object image (either in either ELF or MachO
+format, depending on the target) into the ObjectBufferStream object, which
+is flushed to complete the process.  If an ObjectCache is being used, the
+image will be passed to the ObjectCache here.
+
+At this point, the ObjectBufferStream contains the raw object image.
+Before the code can be executed, the code and data sections from this
+image must be loaded into suitable memory, relocations must be applied and
+memory permission and code cache invalidation (if required) must be completed.
+
+Object Loading
+==============
+
+Once an object image has been obtained, either through code generation or
+having been retrieved from an ObjectCache, it is passed to RuntimeDyld to
+be loaded.  The RuntimeDyld wrapper class examines the object to determine
+its file format and creates an instance of either RuntimeDyldELF or
+RuntimeDyldMachO (both of which derive from the RuntimeDyldImpl base
+class) and calls the RuntimeDyldImpl::loadObject method to perform that
+actual loading.
+
+.. image:: MCJIT-dyld-load.png
+ 
+RuntimeDyldImpl::loadObject begins by creating an ObjectImage instance
+from the ObjectBuffer it received.  ObjectImage, which wraps the
+ObjectFile class, is a helper class which parses the binary object image
+and provides access to the information contained in the format-specific
+headers, including section, symbol and relocation information.
+
+RuntimeDyldImpl::loadObject then iterates through the symbols in the
+image.  Information about common symbols is collected for later use.  For
+each function or data symbol, the associated section is loaded into memory
+and the symbol is stored in a symbol table map data structure.  When the
+iteration is complete, a section is emitted for the common symbols.
+
+Next, RuntimeDyldImpl::loadObject iterates through the sections in the
+object image and for each section iterates through the relocations for
+that sections.  For each relocation, it calls the format-specific
+processRelocationRef method, which will examine the relocation and store
+it in one of two data structures, a section-based relocation list map and
+an external symbol relocation map.
+
+.. image:: MCJIT-load-object.png
+ 
+When RuntimeDyldImpl::loadObject returns, all of the code and data
+sections for the object will have been loaded into memory allocated by the
+memory manager and relocation information will have been prepared, but the
+relocations have not yet been applied and the generated code is still not
+ready to be executed.
+
+[Currently (as of August 2013) the MCJIT engine will immediately apply
+relocations when loadObject completes.  However, this shouldn't be
+happening.  Because the code may have been generated for a remote target,
+the client should be given a chance to re-map the section addresses before
+relocations are applied.  It is possible to apply relocations multiple
+times, but in the case where addresses are to be re-mapped, this first
+application is wasted effort.]
+
+Address Remapping
+=================
+
+At any time after initial code has been generated and before
+finalizeObject is called, the client can remap the address of sections in
+the object.  Typically this is done because the code was generated for an
+external process and is being mapped into that process' address space.
+The client remaps the section address by calling MCJIT::mapSectionAddress.
+This should happen before the section memory is copied to its new
+location.
+
+When MCJIT::mapSectionAddress is called, MCJIT passes the call on to
+RuntimeDyldImpl (via its Dyld member).  RuntimeDyldImpl stores the new
+address in an internal data structure but does not update the code at this
+time, since other sections are likely to change.
+
+When the client is finished remapping section addresses, it will call
+MCJIT::finalizeObject to complete the remapping process.
+
+Final Preparations
+==================
+
+When MCJIT::finalizeObject is called, MCJIT calls
+RuntimeDyld::resolveRelocations.  This function will attempt to locate any
+external symbols and then apply all relocations for the object.
+
+External symbols are resolved by calling the memory manager's
+getPointerToNamedFunction method.  The memory manager will return the
+address of the requested symbol in the target address space.  (Note, this
+may not be a valid pointer in the host process.)  RuntimeDyld will then
+iterate through the list of relocations it has stored which are associated
+with this symbol and invoke the resolveRelocation method which, through an
+format-specific implementation, will apply the relocation to the loaded
+section memory.
+
+Next, RuntimeDyld::resolveRelocations iterates through the list of
+sections and for each section iterates through a list of relocations that
+have been saved which reference that symbol and call resolveRelocation for
+each entry in this list.  The relocation list here is a list of
+relocations for which the symbol associated with the relocation is located
+in the section associated with the list.  Each of these locations will
+have a target location at which the relocation will be applied that is
+likely located in a different section.
+
+.. image:: MCJIT-resolve-relocations.png
+ 
+Once relocations have been applied as described above, MCJIT calls
+RuntimeDyld::getEHFrameSection, and if a non-zero result is returned
+passes the section data to the memory manager's registerEHFrames method.
+This allows the memory manager to call any desired target-specific
+functions, such as registering the EH frame information with a debugger.
+
+Finally, MCJIT calls the memory manager's finalizeMemory method.  In this
+method, the memory manager will invalidate the target code cache, if
+necessary, and apply final permissions to the memory pages it has
+allocated for code and data memory.
+
diff --git a/docs/Makefile b/docs/Makefile
index 122c4b8..d973af5 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -19,7 +19,12 @@ $(PROJ_OBJ_DIR)/doxygen.cfg: doxygen.cfg.in
 	  -e 's/@abs_top_srcdir@/../g' \
 	  -e 's/@DOT@/dot/g' \
 	  -e 's/@PACKAGE_VERSION@/mainline/' \
-	  -e 's/@abs_top_builddir@/../g' > $@
+	  -e 's/@abs_top_builddir@/../g' \
+	  -e 's/@enable_searchengine@/NO/g' \
+	  -e 's/@searchengine_url@//g' \
+	  -e 's/@enable_server_based_search@/NO/g' \
+	  -e 's/@enable_external_search@/NO/g' \
+	  -e 's/@extra_search_mappings@//g' > $@
 endif
 
 include $(LEVEL)/Makefile.common
@@ -77,9 +82,7 @@ doxygen: regendoc $(PROJ_OBJ_DIR)/doxygen.tar.gz
 
 regendoc:
 	$(Echo) Building doxygen documentation
-	$(Verb) if test -e $(PROJ_OBJ_DIR)/doxygen ; then \
-	  $(RM) -rf $(PROJ_OBJ_DIR)/doxygen ; \
-	fi
+	$(Verb) $(RM) -rf $(PROJ_OBJ_DIR)/doxygen
 	$(Verb) $(DOXYGEN) $(PROJ_OBJ_DIR)/doxygen.cfg
 
 $(PROJ_OBJ_DIR)/doxygen.tar.gz: $(DOXYFILES) $(PROJ_OBJ_DIR)/doxygen.cfg
@@ -113,9 +116,7 @@ ocamldoc: regen-ocamldoc
 
 regen-ocamldoc:
 	$(Echo) Building ocamldoc documentation
-	$(Verb) if test -e $(PROJ_OBJ_DIR)/ocamldoc ; then \
-		$(RM) -rf $(PROJ_OBJ_DIR)/ocamldoc ; \
-	fi
+	$(Verb) $(RM) -rf $(PROJ_OBJ_DIR)/ocamldoc
 	$(Verb) $(MAKE) -C $(LEVEL)/bindings/ocaml ocamldoc
 	$(Verb) $(MKDIR) $(PROJ_OBJ_DIR)/ocamldoc/html
 	$(Verb) \
diff --git a/docs/MakefileGuide.rst b/docs/MakefileGuide.rst
index 0bb4a3c..120c108 100644
--- a/docs/MakefileGuide.rst
+++ b/docs/MakefileGuide.rst
@@ -153,7 +153,7 @@ libraries are the default. For example:
 
   LIBRARYNAME = mylib
   SHARED_LIBRARY = 1
-  ARCHIVE_LIBRARY = 1
+  BUILD_ARCHIVE = 1
 
 says to build a library named ``mylib`` with both a shared library
 (``mylib.so``) and an archive library (``mylib.a``) version. The contents of all
@@ -168,9 +168,9 @@ openable with the ``dlopen`` function and searchable with the ``dlsym`` function
 (or your operating system's equivalents). While this isn't strictly necessary on
 Linux and a few other platforms, it is required on systems like HP-UX and
 Darwin. You should use ``LOADABLE_MODULE`` for any shared library that you
-intend to be loaded into an tool via the ``-load`` option.  `Pass documentation
-<writing-an-llvm-pass-makefile>`_ has an example of why you might want to do
-this.
+intend to be loaded into an tool via the ``-load`` option.  :ref:`Pass
+documentation <writing-an-llvm-pass-makefile>` has an example of why you might
+want to do this.
 
 Loadable Modules
 ^^^^^^^^^^^^^^^^
@@ -236,7 +236,7 @@ the ``-l`` option). In this case, only the symbols that are unresolved *at
 that point* will be resolved from the library, if they exist. Other
 (unreferenced) symbols will not be included when the ``.a`` syntax is used. Note
 that in order to use the ``.a`` suffix, the library in question must have been
-built with the ``ARCHIVE_LIBRARY`` option set.
+built with the ``BUILD_ARCHIVE`` option set.
 
 JIT Tools
 ^^^^^^^^^
diff --git a/docs/NVPTXUsage.rst b/docs/NVPTXUsage.rst
index 5451619..a9065ce 100644
--- a/docs/NVPTXUsage.rst
+++ b/docs/NVPTXUsage.rst
@@ -66,6 +66,8 @@ function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is not.
 When compiled, the PTX kernel functions are callable by host-side code.
 
 
+.. _address_spaces:
+
 Address Spaces
 --------------
 
@@ -103,6 +105,25 @@ space in LLVM, so the ``addrspace(N)`` annotation is *required* for global
 variables.
 
 
+Triples
+-------
+
+The NVPTX target uses the module triple to select between 32/64-bit code
+generation and the driver-compiler interface to use. The triple architecture
+can be one of ``nvptx`` (32-bit PTX) or ``nvptx64`` (64-bit PTX). The
+operating system should be one of ``cuda`` or ``nvcl``, which determines the
+interface used by the generated code to communicate with the driver.  Most
+users will want to use ``cuda`` as the operating system, which makes the
+generated PTX compatible with the CUDA Driver API.
+
+Example: 32-bit PTX for CUDA Driver API: ``nvptx-nvidia-cuda``
+
+Example: 64-bit PTX for CUDA Driver API: ``nvptx64-nvidia-cuda``
+
+
+
+.. _nvptx_intrinsics:
+
 NVPTX Intrinsics
 ================
 
@@ -238,6 +259,116 @@ For the full set of NVPTX intrinsics, please see the
 ``include/llvm/IR/IntrinsicsNVVM.td`` file in the LLVM source tree.
 
 
+.. _libdevice:
+
+Linking with Libdevice
+======================
+
+The CUDA Toolkit comes with an LLVM bitcode library called ``libdevice`` that
+implements many common mathematical functions. This library can be used as a
+high-performance math library for any compilers using the LLVM NVPTX target.
+The library can be found under ``nvvm/libdevice/`` in the CUDA Toolkit and
+there is a separate version for each compute architecture.
+
+For a list of all math functions implemented in libdevice, see
+`libdevice Users Guide <http://docs.nvidia.com/cuda/libdevice-users-guide/index.html>`_.
+
+To accomodate various math-related compiler flags that can affect code
+generation of libdevice code, the library code depends on a special LLVM IR
+pass (``NVVMReflect``) to handle conditional compilation within LLVM IR. This
+pass looks for calls to the ``@__nvvm_reflect`` function and replaces them
+with constants based on the defined reflection parameters. Such conditional
+code often follows a pattern:
+
+.. code-block:: c++
+
+  float my_function(float a) {
+    if (__nvvm_reflect("FASTMATH"))
+      return my_function_fast(a);
+    else
+      return my_function_precise(a);
+  }
+
+The default value for all unspecified reflection parameters is zero. 
+
+The ``NVVMReflect`` pass should be executed early in the optimization
+pipeline, immediately after the link stage. The ``internalize`` pass is also
+recommended to remove unused math functions from the resulting PTX. For an
+input IR module ``module.bc``, the following compilation flow is recommended:
+
+1. Save list of external functions in ``module.bc``
+2. Link ``module.bc`` with ``libdevice.compute_XX.YY.bc``
+3. Internalize all functions not in list from (1)
+4. Eliminate all unused internal functions
+5. Run ``NVVMReflect`` pass
+6. Run standard optimization pipeline
+
+.. note::
+
+  ``linkonce`` and ``linkonce_odr`` linkage types are not suitable for the
+  libdevice functions. It is possible to link two IR modules that have been
+  linked against libdevice using different reflection variables.
+
+Since the ``NVVMReflect`` pass replaces conditionals with constants, it will
+often leave behind dead code of the form:
+
+.. code-block:: llvm
+
+  entry:
+    ..
+    br i1 true, label %foo, label %bar
+  foo:
+    ..
+  bar:
+    ; Dead code
+    ..
+
+Therefore, it is recommended that ``NVVMReflect`` is executed early in the
+optimization pipeline before dead-code elimination.
+
+
+Reflection Parameters
+---------------------
+
+The libdevice library currently uses the following reflection parameters to
+control code generation:
+
+==================== ======================================================
+Flag                 Description
+==================== ======================================================
+``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero
+==================== ======================================================
+
+
+Invoking NVVMReflect
+--------------------
+
+To ensure that all dead code caused by the reflection pass is eliminated, it
+is recommended that the reflection pass is executed early in the LLVM IR
+optimization pipeline. The pass takes an optional mapping of reflection
+parameter name to an integer value. This mapping can be specified as either a
+command-line option to ``opt`` or as an LLVM ``StringMap<int>`` object when
+programmatically creating a pass pipeline.
+
+With ``opt``:
+
+.. code-block:: text
+
+  # opt -nvvm-reflect -nvvm-reflect-list=<var>=<value>,<var>=<value> module.bc -o module.reflect.bc
+
+
+With programmatic pass pipeline:
+
+.. code-block:: c++
+
+  extern ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping);
+
+  StringMap<int> ReflectParams;
+  ReflectParams["__CUDA_FTZ"] = 1;
+  Passes.add(createNVVMReflectPass(ReflectParams));
+
+
+
 Executing PTX
 =============
 
@@ -274,3 +405,576 @@ JIT compiling a PTX string to a device binary:
 
 For full examples of executing PTX assembly, please see the `CUDA Samples
 <https://developer.nvidia.com/cuda-downloads>`_ distribution.
+
+
+Common Issues
+=============
+
+ptxas complains of undefined function: __nvvm_reflect
+-----------------------------------------------------
+
+When linking with libdevice, the ``NVVMReflect`` pass must be used. See
+:ref:`libdevice` for more information.
+
+
+Tutorial: A Simple Compute Kernel
+=================================
+
+To start, let us take a look at a simple compute kernel written directly in
+LLVM IR. The kernel implements vector addition, where each thread computes one
+element of the output vector C from the input vectors A and B.  To make this
+easier, we also assume that only a single CTA (thread block) will be launched,
+and that it will be one dimensional.
+
+
+The Kernel
+----------
+
+.. code-block:: llvm
+
+  target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+  target triple = "nvptx64-nvidia-cuda"
+
+  ; Intrinsic to read X component of thread ID
+  declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
+
+  define void @kernel(float addrspace(1)* %A,
+                      float addrspace(1)* %B,
+                      float addrspace(1)* %C) {
+  entry:
+    ; What is my ID?
+    %id = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
+
+    ; Compute pointers into A, B, and C
+    %ptrA = getelementptr float addrspace(1)* %A, i32 %id
+    %ptrB = getelementptr float addrspace(1)* %B, i32 %id
+    %ptrC = getelementptr float addrspace(1)* %C, i32 %id
+
+    ; Read A, B
+    %valA = load float addrspace(1)* %ptrA, align 4
+    %valB = load float addrspace(1)* %ptrB, align 4
+
+    ; Compute C = A + B
+    %valC = fadd float %valA, %valB
+
+    ; Store back to C
+    store float %valC, float addrspace(1)* %ptrC, align 4
+
+    ret void
+  }
+
+  !nvvm.annotations = !{!0}
+  !0 = metadata !{void (float addrspace(1)*,
+                        float addrspace(1)*,
+                        float addrspace(1)*)* @kernel, metadata !"kernel", i32 1}
+
+
+We can use the LLVM ``llc`` tool to directly run the NVPTX code generator:
+
+.. code-block:: text
+
+  # llc -mcpu=sm_20 kernel.ll -o kernel.ptx
+
+
+.. note::
+
+  If you want to generate 32-bit code, change ``p:64:64:64`` to ``p:32:32:32``
+  in the module data layout string and use ``nvptx-nvidia-cuda`` as the
+  target triple.
+
+
+The output we get from ``llc`` (as of LLVM 3.4):
+
+.. code-block:: text
+
+  //
+  // Generated by LLVM NVPTX Back-End
+  //
+
+  .version 3.1
+  .target sm_20
+  .address_size 64
+
+    // .globl kernel
+                                          // @kernel
+  .visible .entry kernel(
+    .param .u64 kernel_param_0,
+    .param .u64 kernel_param_1,
+    .param .u64 kernel_param_2
+  )
+  {
+    .reg .f32   %f<4>;
+    .reg .s32   %r<2>;
+    .reg .s64   %rl<8>;
+
+  // BB#0:                                // %entry
+    ld.param.u64    %rl1, [kernel_param_0];
+    mov.u32         %r1, %tid.x;
+    mul.wide.s32    %rl2, %r1, 4;
+    add.s64         %rl3, %rl1, %rl2;
+    ld.param.u64    %rl4, [kernel_param_1];
+    add.s64         %rl5, %rl4, %rl2;
+    ld.param.u64    %rl6, [kernel_param_2];
+    add.s64         %rl7, %rl6, %rl2;
+    ld.global.f32   %f1, [%rl3];
+    ld.global.f32   %f2, [%rl5];
+    add.f32         %f3, %f1, %f2;
+    st.global.f32   [%rl7], %f3;
+    ret;
+  }
+
+
+Dissecting the Kernel
+---------------------
+
+Now let us dissect the LLVM IR that makes up this kernel. 
+
+Data Layout
+^^^^^^^^^^^
+
+The data layout string determines the size in bits of common data types, their
+ABI alignment, and their storage size.  For NVPTX, you should use one of the
+following:
+
+32-bit PTX:
+
+.. code-block:: llvm
+
+  target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+64-bit PTX:
+
+.. code-block:: llvm
+
+  target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+
+Target Intrinsics
+^^^^^^^^^^^^^^^^^
+
+In this example, we use the ``@llvm.nvvm.read.ptx.sreg.tid.x`` intrinsic to
+read the X component of the current thread's ID, which corresponds to a read
+of register ``%tid.x`` in PTX. The NVPTX back-end supports a large set of
+intrinsics.  A short list is shown below; please see
+``include/llvm/IR/IntrinsicsNVVM.td`` for the full list.
+
+
+================================================ ====================
+Intrinsic                                        CUDA Equivalent
+================================================ ====================
+``i32 @llvm.nvvm.read.ptx.sreg.tid.{x,y,z}``     threadIdx.{x,y,z}
+``i32 @llvm.nvvm.read.ptx.sreg.ctaid.{x,y,z}``   blockIdx.{x,y,z}
+``i32 @llvm.nvvm.read.ptx.sreg.ntid.{x,y,z}``    blockDim.{x,y,z}
+``i32 @llvm.nvvm.read.ptx.sreg.nctaid.{x,y,z}``  gridDim.{x,y,z}
+``void @llvm.cuda.syncthreads()``                __syncthreads()
+================================================ ====================
+
+
+Address Spaces
+^^^^^^^^^^^^^^
+
+You may have noticed that all of the pointer types in the LLVM IR example had
+an explicit address space specifier. What is address space 1? NVIDIA GPU
+devices (generally) have four types of memory:
+
+- Global: Large, off-chip memory
+- Shared: Small, on-chip memory shared among all threads in a CTA
+- Local: Per-thread, private memory
+- Constant: Read-only memory shared across all threads
+
+These different types of memory are represented in LLVM IR as address spaces.
+There is also a fifth address space used by the NVPTX code generator that
+corresponds to the "generic" address space.  This address space can represent
+addresses in any other address space (with a few exceptions).  This allows
+users to write IR functions that can load/store memory using the same
+instructions. Intrinsics are provided to convert pointers between the generic
+and non-generic address spaces.
+
+See :ref:`address_spaces` and :ref:`nvptx_intrinsics` for more information.
+
+
+Kernel Metadata
+^^^^^^^^^^^^^^^
+
+In PTX, a function can be either a `kernel` function (callable from the host
+program), or a `device` function (callable only from GPU code). You can think
+of `kernel` functions as entry-points in the GPU program. To mark an LLVM IR
+function as a `kernel` function, we make use of special LLVM metadata. The
+NVPTX back-end will look for a named metadata node called
+``nvvm.annotations``. This named metadata must contain a list of metadata that
+describe the IR. For our purposes, we need to declare a metadata node that
+assigns the "kernel" attribute to the LLVM IR function that should be emitted
+as a PTX `kernel` function. These metadata nodes take the form:
+
+.. code-block:: text
+
+  metadata !{<function ref>, metadata !"kernel", i32 1}
+
+For the previous example, we have:
+
+.. code-block:: llvm
+
+  !nvvm.annotations = !{!0}
+  !0 = metadata !{void (float addrspace(1)*,
+                        float addrspace(1)*,
+                        float addrspace(1)*)* @kernel, metadata !"kernel", i32 1}
+
+Here, we have a single metadata declaration in ``nvvm.annotations``. This
+metadata annotates our ``@kernel`` function with the ``kernel`` attribute.
+
+
+Running the Kernel
+------------------
+
+Generating PTX from LLVM IR is all well and good, but how do we execute it on
+a real GPU device? The CUDA Driver API provides a convenient mechanism for
+loading and JIT compiling PTX to a native GPU device, and launching a kernel.
+The API is similar to OpenCL.  A simple example showing how to load and
+execute our vector addition code is shown below. Note that for brevity this
+code does not perform much error checking!
+
+.. note::
+
+  You can also use the ``ptxas`` tool provided by the CUDA Toolkit to offline
+  compile PTX to machine code (SASS) for a specific GPU architecture. Such
+  binaries can be loaded by the CUDA Driver API in the same way as PTX. This
+  can be useful for reducing startup time by precompiling the PTX kernels.
+
+
+.. code-block:: c++
+
+  #include <iostream>
+  #include <fstream>
+  #include <cassert>
+  #include "cuda.h"
+
+
+  void checkCudaErrors(CUresult err) {
+    assert(err == CUDA_SUCCESS);
+  }
+
+  /// main - Program entry point
+  int main(int argc, char **argv) {
+    CUdevice    device;
+    CUmodule    cudaModule;
+    CUcontext   context;
+    CUfunction  function;
+    CUlinkState linker;
+    int         devCount;
+
+    // CUDA initialization
+    checkCudaErrors(cuInit(0));
+    checkCudaErrors(cuDeviceGetCount(&devCount));
+    checkCudaErrors(cuDeviceGet(&device, 0));
+
+    char name[128];
+    checkCudaErrors(cuDeviceGetName(name, 128, device));
+    std::cout << "Using CUDA Device [0]: " << name << "\n";
+
+    int devMajor, devMinor;
+    checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
+    std::cout << "Device Compute Capability: "
+              << devMajor << "." << devMinor << "\n";
+    if (devMajor < 2) {
+      std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
+      return 1;
+    }
+
+    std::ifstream t("kernel.ptx");
+    if (!t.is_open()) {
+      std::cerr << "kernel.ptx not found\n";
+      return 1;
+    }
+    std::string str((std::istreambuf_iterator<char>(t)),
+                      std::istreambuf_iterator<char>());
+
+    // Create driver context
+    checkCudaErrors(cuCtxCreate(&context, 0, device));
+
+    // Create module for object
+    checkCudaErrors(cuModuleLoadDataEx(&cudaModule, str.c_str(), 0, 0, 0));
+
+    // Get kernel function
+    checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel"));
+
+    // Device data
+    CUdeviceptr devBufferA;
+    CUdeviceptr devBufferB;
+    CUdeviceptr devBufferC;
+
+    checkCudaErrors(cuMemAlloc(&devBufferA, sizeof(float)*16));
+    checkCudaErrors(cuMemAlloc(&devBufferB, sizeof(float)*16));
+    checkCudaErrors(cuMemAlloc(&devBufferC, sizeof(float)*16));
+
+    float* hostA = new float[16];
+    float* hostB = new float[16];
+    float* hostC = new float[16];
+
+    // Populate input
+    for (unsigned i = 0; i != 16; ++i) {
+      hostA[i] = (float)i;
+      hostB[i] = (float)(2*i);
+      hostC[i] = 0.0f;
+    }
+
+    checkCudaErrors(cuMemcpyHtoD(devBufferA, &hostA[0], sizeof(float)*16));
+    checkCudaErrors(cuMemcpyHtoD(devBufferB, &hostB[0], sizeof(float)*16));
+
+
+    unsigned blockSizeX = 16;
+    unsigned blockSizeY = 1;
+    unsigned blockSizeZ = 1;
+    unsigned gridSizeX  = 1;
+    unsigned gridSizeY  = 1;
+    unsigned gridSizeZ  = 1;
+
+    // Kernel parameters
+    void *KernelParams[] = { &devBufferA, &devBufferB, &devBufferC };
+
+    std::cout << "Launching kernel\n";
+
+    // Kernel launch
+    checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
+                                   blockSizeX, blockSizeY, blockSizeZ,
+                                   0, NULL, KernelParams, NULL));
+
+    // Retrieve device data
+    checkCudaErrors(cuMemcpyDtoH(&hostC[0], devBufferC, sizeof(float)*16));
+
+
+    std::cout << "Results:\n";
+    for (unsigned i = 0; i != 16; ++i) {
+      std::cout << hostA[i] << " + " << hostB[i] << " = " << hostC[i] << "\n";
+    }
+
+
+    // Clean up after ourselves
+    delete [] hostA;
+    delete [] hostB;
+    delete [] hostC;
+
+    // Clean-up
+    checkCudaErrors(cuMemFree(devBufferA));
+    checkCudaErrors(cuMemFree(devBufferB));
+    checkCudaErrors(cuMemFree(devBufferC));
+    checkCudaErrors(cuModuleUnload(cudaModule));
+    checkCudaErrors(cuCtxDestroy(context));
+
+    return 0;
+  }
+
+
+You will need to link with the CUDA driver and specify the path to cuda.h.
+
+.. code-block:: text
+
+  # clang++ sample.cpp -o sample -O2 -g -I/usr/local/cuda-5.5/include -lcuda
+
+We don't need to specify a path to ``libcuda.so`` since this is installed in a
+system location by the driver, not the CUDA toolkit.
+
+If everything goes as planned, you should see the following output when
+running the compiled program:
+
+.. code-block:: text
+
+  Using CUDA Device [0]: GeForce GTX 680
+  Device Compute Capability: 3.0
+  Launching kernel
+  Results:
+  0 + 0 = 0
+  1 + 2 = 3
+  2 + 4 = 6
+  3 + 6 = 9
+  4 + 8 = 12
+  5 + 10 = 15
+  6 + 12 = 18
+  7 + 14 = 21
+  8 + 16 = 24
+  9 + 18 = 27
+  10 + 20 = 30
+  11 + 22 = 33
+  12 + 24 = 36
+  13 + 26 = 39
+  14 + 28 = 42
+  15 + 30 = 45
+
+.. note::
+
+  You will likely see a different device identifier based on your hardware
+
+
+Tutorial: Linking with Libdevice
+================================
+
+In this tutorial, we show a simple example of linking LLVM IR with the
+libdevice library. We will use the same kernel as the previous tutorial,
+except that we will compute ``C = pow(A, B)`` instead of ``C = A + B``.
+Libdevice provides an ``__nv_powf`` function that we will use.
+
+.. code-block:: llvm
+
+  target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+  target triple = "nvptx64-nvidia-cuda"
+
+  ; Intrinsic to read X component of thread ID
+  declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
+  ; libdevice function
+  declare float @__nv_powf(float, float)
+
+  define void @kernel(float addrspace(1)* %A,
+                      float addrspace(1)* %B,
+                      float addrspace(1)* %C) {
+  entry:
+    ; What is my ID?
+    %id = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
+
+    ; Compute pointers into A, B, and C
+    %ptrA = getelementptr float addrspace(1)* %A, i32 %id
+    %ptrB = getelementptr float addrspace(1)* %B, i32 %id
+    %ptrC = getelementptr float addrspace(1)* %C, i32 %id
+
+    ; Read A, B
+    %valA = load float addrspace(1)* %ptrA, align 4
+    %valB = load float addrspace(1)* %ptrB, align 4
+
+    ; Compute C = pow(A, B)
+    %valC = call float @__nv_exp2f(float %valA, float %valB)
+
+    ; Store back to C
+    store float %valC, float addrspace(1)* %ptrC, align 4
+
+    ret void
+  }
+
+  !nvvm.annotations = !{!0}
+  !0 = metadata !{void (float addrspace(1)*,
+                        float addrspace(1)*,
+                        float addrspace(1)*)* @kernel, metadata !"kernel", i32 1}%
+
+
+To compile this kernel, we perform the following steps:
+
+1. Link with libdevice
+2. Internalize all but the public kernel function
+3. Run ``NVVMReflect`` and set ``__CUDA_FTZ`` to 0
+4. Optimize the linked module
+5. Codegen the module
+
+
+These steps can be performed by the LLVM ``llvm-link``, ``opt``, and ``llc``
+tools. In a complete compiler, these steps can also be performed entirely
+programmatically by setting up an appropriate pass configuration (see
+:ref:`libdevice`).
+
+.. code-block:: text
+
+  # llvm-link t2.bc libdevice.compute_20.10.bc -o t2.linked.bc
+  # opt -internalize -internalize-public-api-list=kernel -nvvm-reflect-list=__CUDA_FTZ=0 -nvvm-reflect -O3 t2.linked.bc -o t2.opt.bc
+  # llc -mcpu=sm_20 t2.opt.bc -o t2.ptx
+
+.. note::
+
+  The ``-nvvm-reflect-list=_CUDA_FTZ=0`` is not strictly required, as any
+  undefined variables will default to zero. It is shown here for evaluation
+  purposes.
+
+
+This gives us the following PTX (excerpt):
+
+.. code-block:: text
+
+  //
+  // Generated by LLVM NVPTX Back-End
+  //
+
+  .version 3.1
+  .target sm_20
+  .address_size 64
+
+    // .globl kernel
+                                          // @kernel
+  .visible .entry kernel(
+    .param .u64 kernel_param_0,
+    .param .u64 kernel_param_1,
+    .param .u64 kernel_param_2
+  )
+  {
+    .reg .pred  %p<30>;
+    .reg .f32   %f<111>;
+    .reg .s32   %r<21>;
+    .reg .s64   %rl<8>;
+
+  // BB#0:                                // %entry
+    ld.param.u64  %rl2, [kernel_param_0];
+    mov.u32   %r3, %tid.x;
+    ld.param.u64  %rl3, [kernel_param_1];
+    mul.wide.s32  %rl4, %r3, 4;
+    add.s64   %rl5, %rl2, %rl4;
+    ld.param.u64  %rl6, [kernel_param_2];
+    add.s64   %rl7, %rl3, %rl4;
+    add.s64   %rl1, %rl6, %rl4;
+    ld.global.f32   %f1, [%rl5];
+    ld.global.f32   %f2, [%rl7];
+    setp.eq.f32 %p1, %f1, 0f3F800000;
+    setp.eq.f32 %p2, %f2, 0f00000000;
+    or.pred   %p3, %p1, %p2;
+    @%p3 bra  BB0_1;
+    bra.uni   BB0_2;
+  BB0_1:
+    mov.f32   %f110, 0f3F800000;
+    st.global.f32   [%rl1], %f110;
+    ret;
+  BB0_2:                                  // %__nv_isnanf.exit.i
+    abs.f32   %f4, %f1;
+    setp.gtu.f32  %p4, %f4, 0f7F800000;
+    @%p4 bra  BB0_4;
+  // BB#3:                                // %__nv_isnanf.exit5.i
+    abs.f32   %f5, %f2;
+    setp.le.f32 %p5, %f5, 0f7F800000;
+    @%p5 bra  BB0_5;
+  BB0_4:                                  // %.critedge1.i
+    add.f32   %f110, %f1, %f2;
+    st.global.f32   [%rl1], %f110;
+    ret;
+  BB0_5:                                  // %__nv_isinff.exit.i
+
+    ...
+
+  BB0_26:                                 // %__nv_truncf.exit.i.i.i.i.i
+    mul.f32   %f90, %f107, 0f3FB8AA3B;
+    cvt.rzi.f32.f32 %f91, %f90;
+    mov.f32   %f92, 0fBF317200;
+    fma.rn.f32  %f93, %f91, %f92, %f107;
+    mov.f32   %f94, 0fB5BFBE8E;
+    fma.rn.f32  %f95, %f91, %f94, %f93;
+    mul.f32   %f89, %f95, 0f3FB8AA3B;
+    // inline asm
+    ex2.approx.ftz.f32 %f88,%f89;
+    // inline asm
+    add.f32   %f96, %f91, 0f00000000;
+    ex2.approx.f32  %f97, %f96;
+    mul.f32   %f98, %f88, %f97;
+    setp.lt.f32 %p15, %f107, 0fC2D20000;
+    selp.f32  %f99, 0f00000000, %f98, %p15;
+    setp.gt.f32 %p16, %f107, 0f42D20000;
+    selp.f32  %f110, 0f7F800000, %f99, %p16;
+    setp.eq.f32 %p17, %f110, 0f7F800000;
+    @%p17 bra   BB0_28;
+  // BB#27:
+    fma.rn.f32  %f110, %f110, %f108, %f110;
+  BB0_28:                                 // %__internal_accurate_powf.exit.i
+    setp.lt.f32 %p18, %f1, 0f00000000;
+    setp.eq.f32 %p19, %f3, 0f3F800000;
+    and.pred    %p20, %p18, %p19;
+    @!%p20 bra  BB0_30;
+    bra.uni   BB0_29;
+  BB0_29:
+    mov.b32    %r9, %f110;
+    xor.b32   %r10, %r9, -2147483648;
+    mov.b32    %f110, %r10;
+  BB0_30:                                 // %__nv_powf.exit
+    st.global.f32   [%rl1], %f110;
+    ret;
+  }
+
diff --git a/docs/Passes.rst b/docs/Passes.rst
index d30c3ca..029e472 100644
--- a/docs/Passes.rst
+++ b/docs/Passes.rst
@@ -476,7 +476,7 @@ transformation obviously invalidates the CFG, but can update forward dominator
 -------------------------------------------------
 
 This pass munges the code in the input function to better prepare it for
-SelectionDAG-based code generation.  This works around limitations in it's
+SelectionDAG-based code generation.  This works around limitations in its
 basic-block-at-a-time approach.  It should eventually be removed.
 
 ``-constmerge``: Merge Duplicate Global Constants
diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst
index 0c6990e..6fdea1f 100644
--- a/docs/Phabricator.rst
+++ b/docs/Phabricator.rst
@@ -68,7 +68,7 @@ To upload a new patch:
 To submit an updated patch:
 
 * Click *Differential*.
-* Click *Create Revision*.
+* Click *Create Diff*.
 * Paste the updated diff.
 * Select the review you want to from the *Attach To* dropdown and click
   *Continue*.
diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst
index 2910a2a..99aa5c7 100644
--- a/docs/ProgrammersManual.rst
+++ b/docs/ProgrammersManual.rst
@@ -1172,7 +1172,7 @@ The drawback of SetVector is that it requires twice as much space as a normal
 set and has the sum of constant factors from the set-like container and the
 sequential container that it uses.  Use it **only** if you need to iterate over
 the elements in a deterministic order.  SetVector is also expensive to delete
-elements out of (linear time), unless you use it's "pop_back" method, which is
+elements out of (linear time), unless you use its "pop_back" method, which is
 faster.
 
 ``SetVector`` is an adapter class that defaults to using ``std::vector`` and a
@@ -2080,7 +2080,7 @@ the ``llvm_stop_multithreaded()`` call.  You can also use the
 
 Note that both of these calls must be made *in isolation*.  That is to say that
 no other LLVM API calls may be executing at any time during the execution of
-``llvm_start_multithreaded()`` or ``llvm_stop_multithreaded``.  It's is the
+``llvm_start_multithreaded()`` or ``llvm_stop_multithreaded``.  It is the
 client's responsibility to enforce this isolation.
 
 The return value of ``llvm_start_multithreaded()`` indicates the success or
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 7b143f6..94663c4 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -5,12 +5,6 @@ LLVM 3.4 Release Notes
 .. contents::
     :local:
 
-.. warning::
-   These are in-progress notes for the upcoming LLVM 3.4 release.  You may
-   prefer the `LLVM 3.3 Release Notes <http://llvm.org/releases/3.3/docs
-   /ReleaseNotes.html>`_.
-
-
 Introduction
 ============
 
@@ -34,12 +28,15 @@ page <http://llvm.org/releases/>`_.
 Non-comprehensive list of changes in this release
 =================================================
 
-.. NOTE
-   For small 1-3 sentence descriptions, just add an entry at the end of
-   this list. If your description won't fit comfortably in one bullet
-   point (e.g. maybe you would like to give an example of the
-   functionality, or simply have a lot to talk about), see the `NOTE` below
-   for adding a new subsection.
+* This is expected to be the last release of LLVM which compiles using a C++98
+  toolchain. We expect to start using some C++11 features in LLVM and other
+  sub-projects starting after this release. That said, we are committed to
+  supporting a reasonable set of modern C++ toolchains as the host compiler on
+  all of the platforms. This will at least include Visual Studio 2012 on
+  Windows, and Clang 3.1 or GCC 4.7.x on Mac and Linux. The final set of
+  compilers (and the C++11 features they support) is not set in stone, but we
+  wanted users of LLVM to have a heads up that the next release will involve
+  a substantial change in the host toolchain requirements.
 
 * The regression tests now fail if any command in a pipe fails. To disable it in
   a directory, just add ``config.pipefail = False`` to its ``lit.local.cfg``.
@@ -50,9 +47,9 @@ Non-comprehensive list of changes in this release
 
 * The R600 backend is not marked experimental anymore and is built by default.
 
-* APFloat::isNormal() was renamed to APFloat::isFiniteNonZero() and
-  APFloat::isIEEENormal() was renamed to APFloat::isNormal(). This ensures that
-  APFloat::isNormal() conforms to IEEE-754R-2008.
+* ``APFloat::isNormal()`` was renamed to ``APFloat::isFiniteNonZero()`` and
+  ``APFloat::isIEEENormal()`` was renamed to ``APFloat::isNormal()``. This
+  ensures that ``APFloat::isNormal()`` conforms to IEEE-754R-2008.
 
 * The library call simplification pass has been removed.  Its functionality
   has been integrated into the instruction combiner and function attribute
@@ -62,26 +59,74 @@ Non-comprehensive list of changes in this release
   or later instead. For more information, see the `Getting Started using Visual
   Studio <GettingStartedVS.html>`_ page.
 
-* The Loop Vectorizer that was previously enabled for -O3 is now enabled for
-  -Os and -O2.
+* The Loop Vectorizer that was previously enabled for ``-O3`` is now enabled
+  for ``-Os`` and ``-O2``.
 
 * The new SLP Vectorizer is now enabled by default.
 
-* llvm-ar now uses the new Object library and produces archives and
+* ``llvm-ar`` now uses the new Object library and produces archives and
   symbol tables in the gnu format.
 
-* ... next change ...
+* FileCheck now allows specifing ``-check-prefix`` multiple times. This
+  helps reduce duplicate check lines when using multiple RUN lines.
+
+* The bitcast instruction no longer allows casting between pointers
+   with different address spaces. To achieve this, use the new addrspacecast
+   instruction.
+
+* Different sized pointers for different address spaces should now
+  generally work. This is primarily useful for GPU targets.
+
+* OCaml bindings have been significantly extended to cover almost all of the
+  LLVM libraries.
+
+Mips Target
+-----------
+
+Support for the MIPS SIMD Architecture (MSA) has been added. MSA is supported
+through inline assembly, intrinsics with the prefix '``__builtin_msa``', and
+normal code generation.
 
-.. NOTE
-   If you would like to document a larger change, then you can add a
-   subsection about it right here. You can copy the following boilerplate
-   and un-indent it (the indentation causes it to be inside this comment).
+For more information on MSA (including documentation for the instruction set),
+see the `MIPS SIMD page at Imagination Technologies
+<http://imgtec.com/mips/mips-simd.asp>`_
 
-   Special New Feature
-   -------------------
+PowerPC Target
+--------------
 
-   Makes programs 10x faster by doing Special New Thing.
+Changes in the PowerPC backend include:
 
+* fast-isel support (for faster ``-O0`` code generation)
+* many improvements to the builtin assembler
+* support for generating unaligned (Altivec) vector loads
+* support for generating the fcpsgn instruction
+* generate ``frin`` for ``round()`` (not ``nearbyint()`` and ``rint()``, which
+  had been done only in fast-math mode)
+* improved instruction scheduling for embedded cores (such as the A2)
+* improved prologue/epilogue generation (especially in 32-bit mode)
+* support for dynamic stack alignment (and dynamic stack allocations with large alignments)
+* improved generation of counter-register-based loops
+* bug fixes
+
+SPARC Target
+------------
+
+The SPARC backend got many improvements, namely
+
+* experimental SPARC V9 backend
+* JIT support for SPARC
+* fp128 support
+* exception handling
+* TLS support
+* leaf functions optimization
+* bug fixes
+
+SystemZ/s390x Backend
+---------------------
+
+LLVM and clang can now optimize for zEnterprise z196 and zEnterprise EC12
+targets.  In clang these targets are selected using ``-march=z196`` and
+``-march=zEC12`` respectively.
 
 External Open Source Projects Using LLVM 3.4
 ============================================
@@ -90,6 +135,105 @@ An exciting aspect of LLVM is that it is used as an enabling technology for
 a lot of other language and tools projects. This section lists some of the
 projects that have already been updated to work with LLVM 3.4.
 
+DXR
+---
+
+`DXR <https://wiki.mozilla.org/DXR>`_ is Mozilla's code search and navigation
+tool, aimed at making sense of large projects like Firefox. It supports
+full-text and regex searches as well as structural queries like "Find all the
+callers of this function." Behind the scenes, it uses a custom trigram index,
+the re2 library, and structural data collected by a clang compiler plugin.
+
+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux/PPC64.
+Ports to other architectures like ARM and AArch64 are underway.
+
+LibBeauty
+---------
+
+The `LibBeauty <http://www.libbeauty.com>`_ decompiler and reverse
+engineering tool currently utilises the LLVM disassembler and the LLVM IR
+Builder. The current aim of the project is to take a x86_64 binary ``.o`` file
+as input, and produce an equivalent LLVM IR ``.bc`` or ``.ll`` file as
+output. Support for ARM binary ``.o`` file as input will be added later.
+
+Likely
+------
+
+`Likely <http://www.liblikely.org/>`_ is an open source domain specific
+language for image recognition.  Algorithms are just-in-time compiled using
+LLVM's MCJIT infrastructure to execute on single or multi-threaded CPUs as well
+as OpenCL SPIR or CUDA enabled GPUs. Likely exploits the observation that while
+image processing and statistical learning kernels must be written generically
+to handle any matrix datatype, at runtime they tend to be executed repeatedly
+on the same type.
+
+Portable Computing Language (pocl)
+----------------------------------
+
+In addition to producing an easily portable open source OpenCL
+implementation, another major goal of `pocl <http://portablecl.org/>`_
+is improving performance portability of OpenCL programs with
+compiler optimizations, reducing the need for target-dependent manual
+optimizations. An important part of pocl is a set of LLVM passes used to
+statically parallelize multiple work-items with the kernel compiler, even in
+the presence of work-group barriers. This enables static parallelization of
+the fine-grained static concurrency in the work groups in multiple ways. 
+
+Portable Native Client (PNaCl)
+------------------------------
+
+`Portable Native Client (PNaCl) <http://www.chromium.org/nativeclient/pnacl>`_
+is a Chrome initiative to bring the performance and low-level control of native
+code to modern web browsers, without sacrificing the security benefits and
+portability of web applications. PNaCl works by compiling native C and C++ code
+to an intermediate representation using the LLVM clang compiler. This
+intermediate representation is a subset of LLVM bytecode that is wrapped into a
+portable executable, which can be hosted on a web server like any other website
+asset. When the site is accessed, Chrome fetches and translates the portable
+executable into an architecture-specific machine code optimized directly for
+the underlying device. PNaCl lets developers compile their code once to run on
+any hardware platform and embed their PNaCl application in any website,
+enabling developers to directly leverage the power of the underlying CPU and
+GPU.
+
+TTA-based Co-design Environment (TCE)
+-------------------------------------
+
+`TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing new
+exposed datapath processors based on the Transport triggered architecture (TTA). 
+The toolset provides a complete co-design flow from C/C++
+programs down to synthesizable VHDL/Verilog and parallel program binaries.
+Processor customization points include the register files, function units,
+supported operations, and the interconnection network.
+
+TCE uses Clang and LLVM for C/C++/OpenCL C language support, target independent 
+optimizations and also for parts of code generation. It generates
+new LLVM-based code generators "on the fly" for the designed processors and
+loads them in to the compiler backend as runtime libraries to avoid
+per-target recompilation of larger parts of the compiler chain. 
+
+WebCL Validator
+---------------
+
+`WebCL Validator <https://github.com/KhronosGroup/webcl-validator>`_ implements
+validation for WebCL C language which is a subset of OpenCL ES 1.1. Validator
+checks the correctness of WebCL C, and implements memory protection for it as a
+source-2-source transformation. The transformation converts WebCL to memory
+protected OpenCL. The protected OpenCL cannot access any memory ranges which
+were not allocated for it, and its memory is always initialized to prevent
+information leakage from other programs.
+
 
 Additional Information
 ======================
@@ -103,4 +247,3 @@ going into the ``llvm/docs/`` directory in the LLVM tree.
 
 If you have any questions or comments about LLVM, please feel free to contact
 us via the `mailing lists <http://llvm.org/docs/#maillist>`_.
-
diff --git a/docs/ReleaseProcess.rst b/docs/ReleaseProcess.rst
index c4bbc91..0836b6e 100644
--- a/docs/ReleaseProcess.rst
+++ b/docs/ReleaseProcess.rst
@@ -52,16 +52,18 @@ The scripts are in the ``utils/release`` directory.
 test-release.sh
 ---------------
 
-This script will check-out, configure and compile LLVM+Clang (+ most add-ons, like ``compiler-rt``,
-``libcxx`` and ``clang-extra-tools``) in three stages, and will test the final stage.
-It'll have installed the final binaries on the Phase3/Releasei(+Asserts) directory, and
-that's the one you should use for the test-suite and other external tests.
+This script will check-out, configure and compile LLVM+Clang (+ most add-ons,
+like ``compiler-rt``, ``libcxx`` and ``clang-extra-tools``) in three stages, and
+will test the final stage.  It'll have installed the final binaries on the
+Phase3/Releasei(+Asserts) directory, and that's the one you should use for the
+test-suite and other external tests.
 
 To run the script on a specific release candidate run::
 
    ./test-release.sh \
-        -release 3.3 \
+        -release 3.4 \
         -rc 1 \
+        -triple x86_64-apple-darwin \
         -no-64bit \
         -test-asserts \
         -no-compare-files
diff --git a/docs/SourceLevelDebugging.rst b/docs/SourceLevelDebugging.rst
index ad03871..a1d8110 100644
--- a/docs/SourceLevelDebugging.rst
+++ b/docs/SourceLevelDebugging.rst
@@ -295,7 +295,7 @@ Subprogram descriptors
     i32,      ;; Index into a virtual function
     metadata, ;; indicates which base type contains the vtable pointer for the
               ;; derived class
-    i32,      ;; Flags - Artifical, Private, Protected, Explicit, Prototyped.
+    i32,      ;; Flags - Artificial, Private, Protected, Explicit, Prototyped.
     i1,       ;; isOptimized
     Function * , ;; Pointer to LLVM function
     metadata, ;; Lists function template parameters
@@ -346,7 +346,7 @@ Basic type descriptors
 
   !4 = metadata !{
     i32,      ;; Tag = 36 (DW_TAG_base_type)
-    metadata,;; Source directory (including trailing slash) & file pair (may be null)
+    metadata, ;; Source directory (including trailing slash) & file pair (may be null)
     metadata, ;; Reference to context
     metadata, ;; Name (may be "" for anonymous types)
     i32,      ;; Line number where defined (may be 0)
@@ -389,7 +389,7 @@ Derived type descriptors
 
   !5 = metadata !{
     i32,      ;; Tag (see below)
-    metadata,;; Source directory (including trailing slash) & file pair (may be null)
+    metadata, ;; Source directory (including trailing slash) & file pair (may be null)
     metadata, ;; Reference to context
     metadata, ;; Name (may be "" for anonymous types)
     i32,      ;; Line number where defined (may be 0)
@@ -452,7 +452,7 @@ Composite type descriptors
 
   !6 = metadata !{
     i32,      ;; Tag (see below)
-    metadata,;; Source directory (including trailing slash) & file pair (may be null)
+    metadata, ;; Source directory (including trailing slash) & file pair (may be null)
     metadata, ;; Reference to context
     metadata, ;; Name (may be "" for anonymous types)
     i32,      ;; Line number where defined (may be 0)
@@ -462,9 +462,10 @@ Composite type descriptors
     i32,      ;; Flags
     metadata, ;; Reference to type derived from
     metadata, ;; Reference to array of member descriptors
-    i32       ;; Runtime languages
+    i32,      ;; Runtime languages
     metadata, ;; Base type containing the vtable pointer for this type
-    metadata  ;; Template parameters
+    metadata, ;; Template parameters
+    metadata  ;; A unique identifier for type uniquing purpose (may be null)
   }
 
 These descriptors are used to define types that are composed of 0 or more
@@ -649,85 +650,86 @@ Compiled to LLVM, this function would be represented like this:
 
 .. code-block:: llvm
 
-  define void @_Z3foov() #0 {
+  define void @foo() #0 {
   entry:
-    %X = alloca i32, align 4                        ; [#uses=3 type=i32*]
-    %Y = alloca i32, align 4                        ; [#uses=2 type=i32*]
-    %Z = alloca i32, align 4                        ; [#uses=2 type=i32*]
-    call void @llvm.dbg.declare(metadata !{i32* %X}, metadata !8), !dbg !10
+   %X = alloca i32, align 4
+    %Y = alloca i32, align 4
+    %Z = alloca i32, align 4
+    call void @llvm.dbg.declare(metadata !{i32* %X}, metadata !10), !dbg !12
       ; [debug line = 2:7] [debug variable = X]
-    store i32 21, i32* %X, align 4, !dbg !11        ; [debug line = 2:13]
-    call void @llvm.dbg.declare(metadata !{i32* %Y}, metadata !12), !dbg !13
+    store i32 21, i32* %X, align 4, !dbg !12
+    call void @llvm.dbg.declare(metadata !{i32* %Y}, metadata !13), !dbg !14
       ; [debug line = 3:7] [debug variable = Y]
-    store i32 22, i32* %Y, align 4, !dbg !14        ; [debug line = 3:13]
+    store i32 22, i32* %Y, align 4, !dbg !14
     call void @llvm.dbg.declare(metadata !{i32* %Z}, metadata !15), !dbg !17
       ; [debug line = 5:9] [debug variable = Z]
-    store i32 23, i32* %Z, align 4, !dbg !18        ; [debug line = 5:15]
-    %0 = load i32* %X, align 4, !dbg !19            ; [#uses=1 type=i32] \
+    store i32 23, i32* %Z, align 4, !dbg !17
+    %0 = load i32* %X, align 4, !dbg !18
       [debug line = 6:5]
-    store i32 %0, i32* %Z, align 4, !dbg !19        ; [debug line = 6:5]
-    %1 = load i32* %Y, align 4, !dbg !20            ; [#uses=1 type=i32] \
+    store i32 %0, i32* %Z, align 4, !dbg !18
+    %1 = load i32* %Y, align 4, !dbg !19
       [debug line = 8:3]
-    store i32 %1, i32* %X, align 4, !dbg !20        ; [debug line = 8:3]
-    ret void, !dbg !21                              ; [debug line = 9:1]
+    store i32 %1, i32* %X, align 4, !dbg !19
+    ret void, !dbg !20
   }
 
-  ; [#uses=3]
   ; Function Attrs: nounwind readnone
   declare void @llvm.dbg.declare(metadata, metadata) #1
 
-  attributes #0 = { optsize zeroext "less-precise-fpmad"="false"
-    "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true"
-    "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false"
+  attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false"
+    "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"
+    "no-infs-fp-math"="false" "no-nans-fp-math"="false"
+    "stack-protector-buffer-size"="8" "unsafe-fp-math"="false"
     "use-soft-float"="false" }
   attributes #1 = { nounwind readnone }
 
   !llvm.dbg.cu = !{!0}
-  
+  !llvm.module.flags = !{!8}
+  !llvm.ident = !{!9}
+
   !0 = metadata !{i32 786449, metadata !1, i32 12,
-                  metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0,
-                  metadata !2, metadata !2, metadata !3, metadata !2,
-                  metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] \
+                  metadata !"clang version 3.4 (trunk 193128) (llvm/trunk 193139)",
+                  i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3,
+                  metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] \
                     [/private/tmp/foo.c] \
-                    [DW_LANG_C]
-  !1 = metadata !{metadata !"foo.c", metadata !"/private/tmp"}
+                    [DW_LANG_C99]
+  !1 = metadata !{metadata !"t.c", metadata !"/private/tmp"}
   !2 = metadata !{i32 0}
   !3 = metadata !{metadata !4}
   !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo",
-                  metadata !"foo", metadata !"_Z3foov", i32 1, metadata !6,
-                  i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false,
-                  void ()* @_Z3foov, null, null, metadata !2, i32 1}
+                  metadata !"foo", metadata !"", i32 1, metadata !6,
+                  i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false,
+                  void ()* @foo, null, null, metadata !2, i32 1}
                   ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-  !5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] \
-                    [/private/tmp/foo.c]
-  !6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0,
-                  i64 0, i32 0, null, metadata !7, i32 0, i32 0}
+  !5 = metadata !{i32 786473, metadata !1}  ; [ DW_TAG_file_type ] \
+                    [/private/tmp/t.c]
+  !6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0,
+                  i64 0, i32 0, null, metadata !7, i32 0, null, null, null}
                   ; [ DW_TAG_subroutine_type ] \
                     [line 0, size 0, align 0, offset 0] [from ]
   !7 = metadata !{null}
-  !8 = metadata !{i32 786688, metadata !4, metadata !"X", metadata !5, i32 2, \
-                  metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [X] \
-                    [line 2]
-  !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, \
-                  i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] \
-                    [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-  !10 = metadata !{i32 2, i32 7, metadata !4, null}
-  !11 = metadata !{i32 2, i32 13, metadata !4, null}
-  !12 = metadata !{i32 786688, metadata !4, metadata !"Y", metadata !5, i32 3, \
-                   metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Y] \
+  !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+  !9 = metadata !{metadata !"clang version 3.4 (trunk 193128) (llvm/trunk 193139)"}
+  !10 = metadata !{i32 786688, metadata !4, metadata !"X", metadata !5, i32 2,
+                   metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [X] \
+                     [line 2]
+  !11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32,
+                   i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] \
+                     [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+  !12 = metadata !{i32 2, i32 0, metadata !4, null}
+  !13 = metadata !{i32 786688, metadata !4, metadata !"Y", metadata !5, i32 3,
+                   metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Y] \
                      [line 3]
-  !13 = metadata !{i32 3, i32 7, metadata !4, null}
-  !14 = metadata !{i32 3, i32 13, metadata !4, null}
-  !15 = metadata !{i32 786688, metadata !16, metadata !"Z", metadata !5, i32 5, \
-                   metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Z] \
+  !14 = metadata !{i32 3, i32 0, metadata !4, null}
+  !15 = metadata !{i32 786688, metadata !16, metadata !"Z", metadata !5, i32 5,
+                   metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Z] \
                      [line 5]
-  !16 = metadata !{i32 786443, metadata !1, metadata !4, i32 4, i32 3, i32 0}
-                   ; [ DW_TAG_lexical_block ] [/private/tmp/foo.c]
-  !17 = metadata !{i32 5, i32 9, metadata !16, null}
-  !18 = metadata !{i32 5, i32 15, metadata !16, null}
-  !19 = metadata !{i32 6, i32 5, metadata !16, null}
-  !20 = metadata !{i32 8, i32 3, metadata !4, null}
-  !21 = metadata !{i32 9, i32 1, metadata !4, null}
+  !16 = metadata !{i32 786443, metadata !1, metadata !4, i32 4, i32 0, i32 0} \
+                   ; [ DW_TAG_lexical_block ] [/private/tmp/t.c]
+  !17 = metadata !{i32 5, i32 0, metadata !16, null}
+  !18 = metadata !{i32 6, i32 0, metadata !16, null}
+  !19 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+  !20 = metadata !{i32 9, i32 0, metadata !4, null}
 
 This example illustrates a few important details about LLVM debugging
 information.  In particular, it shows how the ``llvm.dbg.declare`` intrinsic and
@@ -737,23 +739,23 @@ variable definitions, and the code used to implement the function.
 
 .. code-block:: llvm
 
-  call void @llvm.dbg.declare(metadata !{i32* %X}, metadata !8), !dbg !10
+  call void @llvm.dbg.declare(metadata !{i32* %X}, metadata !10), !dbg !12
     ; [debug line = 2:7] [debug variable = X]
 
 The first intrinsic ``%llvm.dbg.declare`` encodes debugging information for the
-variable ``X``.  The metadata ``!dbg !10`` attached to the intrinsic provides
+variable ``X``.  The metadata ``!dbg !12`` attached to the intrinsic provides
 scope information for the variable ``X``.
 
 .. code-block:: llvm
 
-  !10 = metadata !{i32 2, i32 7, metadata !4, null}
+  !12 = metadata !{i32 2, i32 0, metadata !4, null}
   !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo",
-                  metadata !"foo", metadata !"_Z3foov", i32 1, metadata !6,
-                  i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false,
-                  void ()* @_Z3foov, null, null, metadata !2, i32 1}
-                  ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+                  metadata !"foo", metadata !"", i32 1, metadata !6,
+                  i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false,
+                  void ()* @foo, null, null, metadata !2, i32 1}
+                    ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
 
-Here ``!10`` is metadata providing location information.  It has four fields:
+Here ``!12`` is metadata providing location information.  It has four fields:
 line number, column number, scope, and original scope.  The original scope
 represents inline location if this instruction is inlined inside a caller, and
 is null otherwise.  In this example, scope is encoded by ``!4``, a
@@ -774,12 +776,12 @@ scope information for the variable ``Z``.
 
 .. code-block:: llvm
 
-  !16 = metadata !{i32 786443, metadata !1, metadata !4, i32 4, i32 3, i32 0}
-                   ; [ DW_TAG_lexical_block ] [/private/tmp/foo.c]
-  !17 = metadata !{i32 5, i32 9, metadata !16, null}
+  !16 = metadata !{i32 786443, metadata !1, metadata !4, i32 4, i32 0, i32 0}
+                   ; [ DW_TAG_lexical_block ] [/private/tmp/t.c]
+  !17 = metadata !{i32 5, i32 0, metadata !16, null}
 
 Here ``!15`` indicates that ``Z`` is declared at line number 5 and
-column number 9 inside of lexical scope ``!16``.  The lexical scope itself
+column number 0 inside of lexical scope ``!16``.  The lexical scope itself
 resides inside of subprogram ``!4`` described above.
 
 The scope information attached with each instruction provides a straightforward
diff --git a/docs/TestingGuide.rst b/docs/TestingGuide.rst
index 3cfbb21..c9a35cd 100644
--- a/docs/TestingGuide.rst
+++ b/docs/TestingGuide.rst
@@ -21,9 +21,9 @@ tests.
 Requirements
 ============
 
-In order to use the LLVM testing infrastructure, you will need all of
-the software required to build LLVM, as well as
-`Python <http://python.org>`_ 2.4 or later.
+In order to use the LLVM testing infrastructure, you will need all of the
+software required to build LLVM, as well as `Python <http://python.org>`_ 2.5 or
+later.
 
 LLVM testing infrastructure organization
 ========================================
@@ -120,12 +120,14 @@ can run the LLVM and Clang tests simultaneously using:
 
     % make check-all
 
-To run the tests with Valgrind (Memcheck by default), just append
-``VG=1`` to the commands above, e.g.:
+To run the tests with Valgrind (Memcheck by default), use the ``LIT_ARGS`` make
+variable to pass the required options to lit. For example, you can use:
 
 .. code-block:: bash
 
-    % make check VG=1
+    % make check LIT_ARGS="-v --vg --vg-leak"
+
+to enable testing with valgrind and with leak checking enabled.
 
 To run individual tests or subsets of tests, you can use the ``llvm-lit``
 script which is built as part of LLVM. For example, to run the
diff --git a/docs/WritingAnLLVMBackend.rst b/docs/WritingAnLLVMBackend.rst
index 73381b5..35a2d16 100644
--- a/docs/WritingAnLLVMBackend.rst
+++ b/docs/WritingAnLLVMBackend.rst
@@ -911,6 +911,9 @@ format instructions will bind the operands to the ``rd``, ``rs1``, and ``rs2``
 fields.  This results in the ``XNORrr`` instruction binding ``$dst``, ``$b``,
 and ``$c`` operands to the ``rd``, ``rs1``, and ``rs2`` fields respectively.
 
+Instruction Operand Name Mapping
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 TableGen will also generate a function called getNamedOperandIdx() which
 can be used to look up an operand's index in a MachineInstr based on its
 TableGen name.  Setting the UseNamedOperandTable bit in an instruction's
@@ -952,6 +955,59 @@ XXXInstrInfo.h:
     int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex);
   } // End namespace XXX
 
+Instruction Operand Types
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+TableGen will also generate an enumeration consisting of all named Operand
+types defined in the backend, in the llvm::XXX::OpTypes namespace.
+Some common immediate Operand types (for instance i8, i32, i64, f32, f64)
+are defined for all targets in ``include/llvm/Target/Target.td``, and are
+available in each Target's OpTypes enum.  Also, only named Operand types appear
+in the enumeration: anonymous types are ignored.
+For example, the X86 backend defines ``brtarget`` and ``brtarget8``, both
+instances of the TableGen ``Operand`` class, which represent branch target
+operands:
+
+.. code-block:: llvm
+
+  def brtarget : Operand<OtherVT>;
+  def brtarget8 : Operand<OtherVT>;
+
+This results in:
+
+.. code-block:: c++
+
+  namespace X86 {
+  namespace OpTypes {
+  enum OperandType {
+    ...
+    brtarget,
+    brtarget8,
+    ...
+    i32imm,
+    i64imm,
+    ...
+    OPERAND_TYPE_LIST_END
+  } // End namespace OpTypes
+  } // End namespace X86
+
+In typical TableGen fashion, to use the enum, you will need to define a
+preprocessor macro:
+
+.. code-block:: c++
+
+  #define GET_INSTRINFO_OPERAND_TYPES_ENUM // For OpTypes enum
+  #include "XXXGenInstrInfo.inc"
+
+
+Instruction Scheduling
+----------------------
+
+Instruction itineraries can be queried using MCDesc::getSchedClass(). The
+value can be named by an enumemation in llvm::XXX::Sched namespace generated
+by TableGen in XXXGenInstrInfo.inc. The name of the schedule classes are
+the same as provided in XXXSchedule.td plus a default NoItinerary class.
+
 Instruction Relation Mapping
 ----------------------------
 
diff --git a/docs/WritingAnLLVMPass.rst b/docs/WritingAnLLVMPass.rst
index 1114fa0..f9cb4fe 100644
--- a/docs/WritingAnLLVMPass.rst
+++ b/docs/WritingAnLLVMPass.rst
@@ -131,7 +131,7 @@ Next, we declare our pass itself:
 
   struct Hello : public FunctionPass {
 
-This declares a "``Hello``" class that is a subclass of `FunctionPass
+This declares a "``Hello``" class that is a subclass of :ref:`FunctionPass
 <writing-an-llvm-pass-FunctionPass>`.  The different builtin pass subclasses
 are described in detail :ref:`later <writing-an-llvm-pass-pass-classes>`, but
 for now, know that ``FunctionPass`` operates on a function at a time.
diff --git a/docs/YamlIO.rst b/docs/YamlIO.rst
index a5cb637..3ecd03a 100644
--- a/docs/YamlIO.rst
+++ b/docs/YamlIO.rst
@@ -408,7 +408,7 @@ some time format (e.g. 4-May-2012 10:30pm).  YAML I/O has a way to support
 custom formatting and parsing of scalar types by specializing ScalarTraits<> on
 your data type.  When writing, YAML I/O will provide the native type and
 your specialization must create a temporary llvm::StringRef.  When reading,
-YAML I/O will provide a llvm::StringRef of scalar and your specialization
+YAML I/O will provide an llvm::StringRef of scalar and your specialization
 must convert that to your native data type.  An outline of a custom scalar type
 looks like:
 
@@ -549,7 +549,7 @@ coordinates into polar when reading YAML.
     };
 
 When writing YAML, the local variable "keys" will be a stack allocated 
-instance of NormalizedPolar, constructed from the suppled polar object which
+instance of NormalizedPolar, constructed from the supplied polar object which
 initializes it x and y fields.  The mapRequired() methods then write out the x
 and y values as key/value pairs.  
 
@@ -633,6 +633,20 @@ This works for both reading and writing. For example:
     };
 
 
+Tags
+----
+
+The YAML syntax supports tags as a way to specify the type of a node before
+it is parsed. This allows dynamic types of nodes.  But the YAML I/O model uses
+static typing, so there are limits to how you can use tags with the YAML I/O
+model. Recently, we added support to YAML I/O for checking/setting the optional 
+tag on a map. Using this functionality it is even possbile to support differnt 
+mappings, as long as they are convertable.  
+
+To check a tag, inside your mapping() method you can use io.mapTag() to specify
+what the tag should be.  This will also add that tag when writing yaml.
+
+
 Sequence
 ========
 
@@ -646,7 +660,7 @@ llvm::yaml::SequenceTraits on T and implement two methods:
   template <>
   struct SequenceTraits<MySeq> {
     static size_t size(IO &io, MySeq &list) { ... }
-    static MySeqEl element(IO &io, MySeq &list, size_t index) { ... }
+    static MySeqEl &element(IO &io, MySeq &list, size_t index) { ... }
   };
 
 The size() method returns how many elements are currently in your sequence.
@@ -669,7 +683,7 @@ add "static const bool flow = true;".  For instance:
   template <>
   struct SequenceTraits<MyList> {
     static size_t size(IO &io, MyList &list) { ... }
-    static MyListEl element(IO &io, MyList &list, size_t index) { ... }
+    static MyListEl &element(IO &io, MyList &list, size_t index) { ... }
     
     // The existence of this member causes YAML I/O to use a flow sequence
     static const bool flow = true;
diff --git a/docs/doxygen.cfg.in b/docs/doxygen.cfg.in
index 20de077..0ed686b 100644
--- a/docs/doxygen.cfg.in
+++ b/docs/doxygen.cfg.in
@@ -1,3 +1,4 @@
+
 # Doxyfile 1.7.1
 
 # This file describes the settings to be used by the documentation system
@@ -1068,7 +1069,7 @@ FORMULA_TRANSPARENT    = YES
 # typically be disabled. For large projects the javascript based search engine
 # can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
 
-SEARCHENGINE           = NO
+SEARCHENGINE           = @enable_searchengine@
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
 # implemented using a PHP enabled web server instead of at the web client
@@ -1078,7 +1079,15 @@ SEARCHENGINE           = NO
 # full text search. The disadvances is that it is more difficult to setup
 # and does not have live searching capabilities.
 
-SERVER_BASED_SEARCH    = NO
+SERVER_BASED_SEARCH    = @enable_server_based_search@
+
+SEARCHENGINE_URL       = @searchengine_url@
+
+EXTERNAL_SEARCH        = @enable_external_search@
+
+EXTERNAL_SEARCH_ID     = llvm
+
+EXTRA_SEARCH_MAPPINGS  = @extra_search_mappings@
 
 #---------------------------------------------------------------------------
 # configuration options related to the LaTeX output
diff --git a/docs/index.rst b/docs/index.rst
index be72195..62766f1 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -66,6 +66,7 @@ representation.
 
    CMake
    HowToBuildOnARM
+   HowToCrossCompileLLVM
    CommandGuide/index
    GettingStarted
    GettingStartedVS
@@ -82,6 +83,7 @@ representation.
    Passes
    YamlIO
    GetElementPtr
+   MCJITDesignAndImplementation
 
 :doc:`GettingStarted`
    Discusses how to get up and running quickly with the LLVM infrastructure.
@@ -95,6 +97,9 @@ representation.
 :doc:`HowToBuildOnARM`
    Notes on building and testing LLVM/Clang on ARM.
 
+:doc:`HowToCrossCompileLLVM`
+   Notes on cross-building and testing LLVM/Clang.
+
 :doc:`GettingStartedVS`
    An addendum to the main Getting Started guide for those using Visual Studio
    on Windows.
@@ -285,6 +290,9 @@ For API clients and LLVM developers.
 :doc:`DebuggingJITedCode`
    How to debug JITed code with GDB.
 
+:doc:`MCJITDesignAndImplementation`
+   Describes the inner workings of MCJIT execution engine.
+
 :doc:`BranchWeightMetadata`
    Provides information about Branch Prediction Information.
 
@@ -315,6 +323,7 @@ Information about LLVM's development process.
    LLVMBuild
    HowToReleaseLLVM
    Packaging
+   ReleaseProcess
 
 :doc:`DeveloperPolicy`
    The LLVM project's policy towards developers and their contributions.
diff --git a/docs/tutorial/OCamlLangImpl2.rst b/docs/tutorial/OCamlLangImpl2.rst
index 83a22ab..905b306 100644
--- a/docs/tutorial/OCamlLangImpl2.rst
+++ b/docs/tutorial/OCamlLangImpl2.rst
@@ -339,6 +339,9 @@ expression:
             (* Eat the binop. *)
             Stream.junk stream;
 
+            (* Parse the primary expression after the binary operator *)
+            let rhs = parse_primary stream in
+
             (* Okay, we know this is a binop. *)
             let rhs =
               match Stream.peek stream with
diff --git a/docs/yaml2obj.rst b/docs/yaml2obj.rst
index b269806..2c55f02 100644
--- a/docs/yaml2obj.rst
+++ b/docs/yaml2obj.rst
@@ -38,7 +38,7 @@ Here's a sample COFF file.
       ComplexType: IMAGE_SYM_DTYPE_NULL # (0)
       StorageClass: IMAGE_SYM_CLASS_STATIC # (3)
       NumberOfAuxSymbols: 1
-      AuxillaryData:
+      AuxiliaryData:
         "\x24\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00" # |$.................|
 
     - Name: _main
diff --git a/examples/ExceptionDemo/ExceptionDemo.cpp b/examples/ExceptionDemo/ExceptionDemo.cpp
index 61b1750..d6954e8 100644
--- a/examples/ExceptionDemo/ExceptionDemo.cpp
+++ b/examples/ExceptionDemo/ExceptionDemo.cpp
@@ -1562,7 +1562,7 @@ llvm::Function *createUnwindExceptionTest(llvm::Module &module,
   return(outerCatchFunct);
 }
 
-
+namespace {
 /// Represents our foreign exceptions
 class OurCppRunException : public std::runtime_error {
 public:
@@ -1577,9 +1577,9 @@ public:
                                  std::runtime_error::operator=(toCopy)));
   }
 
-  ~OurCppRunException (void) throw () {}
+  virtual ~OurCppRunException (void) throw () {}
 };
-
+} // end anonymous namespace
 
 /// Throws foreign C++ exception.
 /// @param ignoreIt unused parameter that allows function to match implied
diff --git a/examples/Kaleidoscope/Chapter2/toy.cpp b/examples/Kaleidoscope/Chapter2/toy.cpp
index 23d45ae..cd90139 100644
--- a/examples/Kaleidoscope/Chapter2/toy.cpp
+++ b/examples/Kaleidoscope/Chapter2/toy.cpp
@@ -1,3 +1,4 @@
+#include <cctype>
 #include <cstdio>
 #include <cstdlib>
 #include <map>
@@ -74,7 +75,7 @@ static int gettok() {
 //===----------------------------------------------------------------------===//
 // Abstract Syntax Tree (aka Parse Tree)
 //===----------------------------------------------------------------------===//
-
+namespace {
 /// ExprAST - Base class for all expression nodes.
 class ExprAST {
 public:
@@ -126,6 +127,7 @@ class FunctionAST {
 public:
   FunctionAST(PrototypeAST *proto, ExprAST *body) {}
 };
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 // Parser
@@ -157,7 +159,6 @@ static int GetTokPrecedence() {
 /// Error* - These are little helper functions for error handling.
 ExprAST *Error(const char *Str) { fprintf(stderr, "Error: %s\n", Str);return 0;}
 PrototypeAST *ErrorP(const char *Str) { Error(Str); return 0; }
-FunctionAST *ErrorF(const char *Str) { Error(Str); return 0; }
 
 static ExprAST *ParseExpression();
 
diff --git a/examples/Kaleidoscope/Chapter3/toy.cpp b/examples/Kaleidoscope/Chapter3/toy.cpp
index 48cfbe6..a7b60ba 100644
--- a/examples/Kaleidoscope/Chapter3/toy.cpp
+++ b/examples/Kaleidoscope/Chapter3/toy.cpp
@@ -3,6 +3,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
@@ -79,7 +80,7 @@ static int gettok() {
 //===----------------------------------------------------------------------===//
 // Abstract Syntax Tree (aka Parse Tree)
 //===----------------------------------------------------------------------===//
-
+namespace {
 /// ExprAST - Base class for all expression nodes.
 class ExprAST {
 public:
@@ -146,6 +147,7 @@ public:
   
   Function *Codegen();
 };
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 // Parser
diff --git a/examples/Kaleidoscope/Chapter4/toy.cpp b/examples/Kaleidoscope/Chapter4/toy.cpp
index 971a7c6..27c3f4f 100644
--- a/examples/Kaleidoscope/Chapter4/toy.cpp
+++ b/examples/Kaleidoscope/Chapter4/toy.cpp
@@ -10,6 +10,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
@@ -86,7 +87,7 @@ static int gettok() {
 //===----------------------------------------------------------------------===//
 // Abstract Syntax Tree (aka Parse Tree)
 //===----------------------------------------------------------------------===//
-
+namespace {
 /// ExprAST - Base class for all expression nodes.
 class ExprAST {
 public:
@@ -153,6 +154,7 @@ public:
   
   Function *Codegen();
 };
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 // Parser
diff --git a/examples/Kaleidoscope/Chapter5/toy.cpp b/examples/Kaleidoscope/Chapter5/toy.cpp
index 5558d08..fe4cdfe 100644
--- a/examples/Kaleidoscope/Chapter5/toy.cpp
+++ b/examples/Kaleidoscope/Chapter5/toy.cpp
@@ -10,6 +10,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
@@ -95,7 +96,7 @@ static int gettok() {
 //===----------------------------------------------------------------------===//
 // Abstract Syntax Tree (aka Parse Tree)
 //===----------------------------------------------------------------------===//
-
+namespace {
 /// ExprAST - Base class for all expression nodes.
 class ExprAST {
 public:
@@ -182,6 +183,7 @@ public:
   
   Function *Codegen();
 };
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 // Parser
diff --git a/examples/Kaleidoscope/Chapter6/toy.cpp b/examples/Kaleidoscope/Chapter6/toy.cpp
index 52926eb..b7b347d 100644
--- a/examples/Kaleidoscope/Chapter6/toy.cpp
+++ b/examples/Kaleidoscope/Chapter6/toy.cpp
@@ -10,6 +10,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
@@ -100,7 +101,7 @@ static int gettok() {
 //===----------------------------------------------------------------------===//
 // Abstract Syntax Tree (aka Parse Tree)
 //===----------------------------------------------------------------------===//
-
+namespace {
 /// ExprAST - Base class for all expression nodes.
 class ExprAST {
 public:
@@ -210,6 +211,7 @@ public:
   
   Function *Codegen();
 };
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 // Parser
diff --git a/examples/Kaleidoscope/Chapter7/toy.cpp b/examples/Kaleidoscope/Chapter7/toy.cpp
index ba192d6..5c66f5c 100644
--- a/examples/Kaleidoscope/Chapter7/toy.cpp
+++ b/examples/Kaleidoscope/Chapter7/toy.cpp
@@ -10,6 +10,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
@@ -104,7 +105,7 @@ static int gettok() {
 //===----------------------------------------------------------------------===//
 // Abstract Syntax Tree (aka Parse Tree)
 //===----------------------------------------------------------------------===//
-
+namespace {
 /// ExprAST - Base class for all expression nodes.
 class ExprAST {
 public:
@@ -228,6 +229,7 @@ public:
   
   Function *Codegen();
 };
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 // Parser
diff --git a/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp b/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp
index 2539025..784781b 100644
--- a/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp
+++ b/examples/Kaleidoscope/MCJIT/cached/toy-jit.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
diff --git a/examples/Kaleidoscope/MCJIT/cached/toy.cpp b/examples/Kaleidoscope/MCJIT/cached/toy.cpp
index 6c8e38b..5bc49d7 100644
--- a/examples/Kaleidoscope/MCJIT/cached/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/cached/toy.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
diff --git a/examples/Kaleidoscope/MCJIT/complete/toy.cpp b/examples/Kaleidoscope/MCJIT/complete/toy.cpp
index da3f3b1..1f804d5 100644
--- a/examples/Kaleidoscope/MCJIT/complete/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/complete/toy.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
diff --git a/examples/Kaleidoscope/MCJIT/initial/toy.cpp b/examples/Kaleidoscope/MCJIT/initial/toy.cpp
index c682907..7e2f7df 100644
--- a/examples/Kaleidoscope/MCJIT/initial/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/initial/toy.cpp
@@ -11,6 +11,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
diff --git a/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp b/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp
index 8650019..c3d88c8 100644
--- a/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp
+++ b/examples/Kaleidoscope/MCJIT/lazy/toy-jit.cpp
@@ -12,6 +12,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
diff --git a/examples/Kaleidoscope/MCJIT/lazy/toy.cpp b/examples/Kaleidoscope/MCJIT/lazy/toy.cpp
index 0a8d80e..189331c 100644
--- a/examples/Kaleidoscope/MCJIT/lazy/toy.cpp
+++ b/examples/Kaleidoscope/MCJIT/lazy/toy.cpp
@@ -13,6 +13,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Transforms/Scalar.h"
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <string>
diff --git a/include/llvm-c/BitReader.h b/include/llvm-c/BitReader.h
index 5228035..7af209b 100644
--- a/include/llvm-c/BitReader.h
+++ b/include/llvm-c/BitReader.h
@@ -34,7 +34,7 @@ extern "C" {
 
 /* Builds a module from the bitcode in the specified memory buffer, returning a
    reference to the module via the OutModule parameter. Returns 0 on success.
-   Optionally returns a human-readable error message via OutMessage. */ 
+   Optionally returns a human-readable error message via OutMessage. */
 LLVMBool LLVMParseBitcode(LLVMMemoryBufferRef MemBuf,
                           LLVMModuleRef *OutModule, char **OutMessage);
 
@@ -44,7 +44,7 @@ LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef,
 
 /** Reads a module from the specified path, returning via the OutMP parameter
     a module provider which performs lazy deserialization. Returns 0 on success.
-    Optionally returns a human-readable error message via OutMessage. */ 
+    Optionally returns a human-readable error message via OutMessage. */
 LLVMBool LLVMGetBitcodeModuleInContext(LLVMContextRef ContextRef,
                                        LLVMMemoryBufferRef MemBuf,
                                        LLVMModuleRef *OutM,
diff --git a/include/llvm-c/BitWriter.h b/include/llvm-c/BitWriter.h
index ba5a677..f605e24 100644
--- a/include/llvm-c/BitWriter.h
+++ b/include/llvm-c/BitWriter.h
@@ -34,7 +34,7 @@ extern "C" {
 
 /*===-- Operations on modules ---------------------------------------------===*/
 
-/** Writes a module to the specified path. Returns 0 on success. */ 
+/** Writes a module to the specified path. Returns 0 on success. */
 int LLVMWriteBitcodeToFile(LLVMModuleRef M, const char *Path);
 
 /** Writes a module to an open file descriptor. Returns 0 on success. */
@@ -42,7 +42,7 @@ int LLVMWriteBitcodeToFD(LLVMModuleRef M, int FD, int ShouldClose,
                          int Unbuffered);
 
 /** Deprecated for LLVMWriteBitcodeToFD. Writes a module to an open file
-    descriptor. Returns 0 on success. Closes the Handle. */ 
+    descriptor. Returns 0 on success. Closes the Handle. */
 int LLVMWriteBitcodeToFileHandle(LLVMModuleRef M, int Handle);
 
 /**
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 409c0f1..9953d52 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -165,8 +165,9 @@ typedef enum {
        a temporary measure until the API/ABI impact to the C API is understood
        and the path forward agreed upon.
     LLVMAddressSafety = 1ULL << 32,
-    LLVMStackProtectStrongAttribute = 1ULL<<33
-    LLVMCold = 1ULL << 34
+    LLVMStackProtectStrongAttribute = 1ULL<<33,
+    LLVMCold = 1ULL << 34,
+    LLVMOptimizeNone = 1ULL << 35
     */
 } LLVMAttribute;
 
@@ -221,6 +222,7 @@ typedef enum {
   LLVMPtrToInt       = 39,
   LLVMIntToPtr       = 40,
   LLVMBitCast        = 41,
+  LLVMAddrSpaceCast  = 60,
 
   /* Other Operators */
   LLVMICmp           = 42,
@@ -273,7 +275,7 @@ typedef enum {
   LLVMLinkOnceAnyLinkage, /**< Keep one copy of function when linking (inline)*/
   LLVMLinkOnceODRLinkage, /**< Same, but only replaced by something
                             equivalent. */
-  LLVMLinkOnceODRAutoHideLinkage, /**< Like LinkOnceODR, but possibly hidden. */
+  LLVMLinkOnceODRAutoHideLinkage, /**< Obsolete */
   LLVMWeakAnyLinkage,     /**< Keep one copy of function when linking (weak) */
   LLVMWeakODRLinkage,     /**< Same, but only replaced by something
                             equivalent. */
@@ -300,6 +302,8 @@ typedef enum {
   LLVMCCallConv           = 0,
   LLVMFastCallConv        = 8,
   LLVMColdCallConv        = 9,
+  LLVMWebKitJSCallConv    = 12,
+  LLVMAnyRegCallConv      = 13,
   LLVMX86StdcallCallConv  = 64,
   LLVMX86FastcallCallConv = 65
 } LLVMCallConv;
@@ -353,26 +357,26 @@ typedef enum {
   LLVMAtomicOrderingNotAtomic = 0, /**< A load or store which is not atomic */
   LLVMAtomicOrderingUnordered = 1, /**< Lowest level of atomicity, guarantees
                                      somewhat sane results, lock free. */
-  LLVMAtomicOrderingMonotonic = 2, /**< guarantees that if you take all the 
-                                     operations affecting a specific address, 
+  LLVMAtomicOrderingMonotonic = 2, /**< guarantees that if you take all the
+                                     operations affecting a specific address,
                                      a consistent ordering exists */
-  LLVMAtomicOrderingAcquire = 4, /**< Acquire provides a barrier of the sort 
-                                   necessary to acquire a lock to access other 
+  LLVMAtomicOrderingAcquire = 4, /**< Acquire provides a barrier of the sort
+                                   necessary to acquire a lock to access other
                                    memory with normal loads and stores. */
-  LLVMAtomicOrderingRelease = 5, /**< Release is similar to Acquire, but with 
-                                   a barrier of the sort necessary to release 
+  LLVMAtomicOrderingRelease = 5, /**< Release is similar to Acquire, but with
+                                   a barrier of the sort necessary to release
                                    a lock. */
-  LLVMAtomicOrderingAcquireRelease = 6, /**< provides both an Acquire and a 
-                                          Release barrier (for fences and 
+  LLVMAtomicOrderingAcquireRelease = 6, /**< provides both an Acquire and a
+                                          Release barrier (for fences and
                                           operations which both read and write
                                            memory). */
-  LLVMAtomicOrderingSequentiallyConsistent = 7 /**< provides Acquire semantics 
-                                                 for loads and Release 
-                                                 semantics for stores. 
-                                                 Additionally, it guarantees 
-                                                 that a total ordering exists 
-                                                 between all 
-                                                 SequentiallyConsistent 
+  LLVMAtomicOrderingSequentiallyConsistent = 7 /**< provides Acquire semantics
+                                                 for loads and Release
+                                                 semantics for stores.
+                                                 Additionally, it guarantees
+                                                 that a total ordering exists
+                                                 between all
+                                                 SequentiallyConsistent
                                                  operations. */
 } LLVMAtomicOrdering;
 
@@ -385,16 +389,16 @@ typedef enum {
     LLVMAtomicRMWBinOpOr, /**< OR a value and return the old one */
     LLVMAtomicRMWBinOpXor, /**< Xor a value and return the old one */
     LLVMAtomicRMWBinOpMax, /**< Sets the value if it's greater than the
-                             original using a signed comparison and return 
+                             original using a signed comparison and return
                              the old one */
     LLVMAtomicRMWBinOpMin, /**< Sets the value if it's Smaller than the
-                             original using a signed comparison and return 
+                             original using a signed comparison and return
                              the old one */
     LLVMAtomicRMWBinOpUMax, /**< Sets the value if it's greater than the
-                             original using an unsigned comparison and return 
+                             original using an unsigned comparison and return
                              the old one */
     LLVMAtomicRMWBinOpUMin /**< Sets the value if it's greater than the
-                             original using an unsigned comparison  and return 
+                             original using an unsigned comparison  and return
                              the old one */
 } LLVMAtomicRMWBinOp;
 
@@ -407,7 +411,7 @@ void LLVMInitializeCore(LLVMPassRegistryRef R);
 /** Deallocate and destroy all ManagedStatic variables.
     @see llvm::llvm_shutdown
     @see ManagedStatic */
-void LLVMShutdown();
+void LLVMShutdown(void);
 
 
 /*===-- Error handling ----------------------------------------------------===*/
@@ -415,6 +419,29 @@ void LLVMShutdown();
 char *LLVMCreateMessage(const char *Message);
 void LLVMDisposeMessage(char *Message);
 
+typedef void (*LLVMFatalErrorHandler)(const char *Reason);
+
+/**
+ * Install a fatal error handler. By default, if LLVM detects a fatal error, it
+ * will call exit(1). This may not be appropriate in many contexts. For example,
+ * doing exit(1) will bypass many crash reporting/tracing system tools. This
+ * function allows you to install a callback that will be invoked prior to the
+ * call to exit(1).
+ */
+void LLVMInstallFatalErrorHandler(LLVMFatalErrorHandler Handler);
+
+/**
+ * Reset the fatal error handler. This resets LLVM's fatal error handling
+ * behavior to the default.
+ */
+void LLVMResetFatalErrorHandler(void);
+
+/**
+ * Enable LLVM's built-in stack trace code. This intercepts the OS's crash
+ * signals and prints which component of LLVM you were in at the time if the
+ * crash.
+ */
+void LLVMEnablePrettyStackTrace(void);
 
 /**
  * @defgroup LLVMCCoreContext Contexts
@@ -460,7 +487,7 @@ unsigned LLVMGetMDKindID(const char* Name, unsigned SLen);
 /**
  * @defgroup LLVMCCoreModule Modules
  *
- * Modules represent the top-level structure in a LLVM program. An LLVM
+ * Modules represent the top-level structure in an LLVM program. An LLVM
  * module is effectively a translation unit or a collection of
  * translation units merged together.
  *
@@ -540,6 +567,14 @@ LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
                                char **ErrorMessage);
 
 /**
+ * Return a string representation of the module. Use
+ * LLVMDisposeMessage to free the string.
+ *
+ * @see Module::print()
+ */
+char *LLVMPrintModuleToString(LLVMModuleRef M);
+
+/**
  * Set inline assembly for a module.
  *
  * @see Module::setModuleInlineAsm()
@@ -691,6 +726,21 @@ LLVMBool LLVMTypeIsSized(LLVMTypeRef Ty);
 LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty);
 
 /**
+ * Dump a representation of a type to stderr.
+ *
+ * @see llvm::Type::dump()
+ */
+void LLVMDumpType(LLVMTypeRef Val);
+
+/**
+ * Return a string representation of the type. Use
+ * LLVMDisposeMessage to free the string.
+ *
+ * @see llvm::Type::print()
+ */
+char *LLVMPrintTypeToString(LLVMTypeRef Val);
+
+/**
  * @defgroup LLVMCCoreTypeInt Integer Types
  *
  * Functions in this section operate on integer types.
@@ -1041,7 +1091,7 @@ LLVMTypeRef LLVMX86MMXType(void);
  * hierarchy of classes within this type. Depending on the instance
  * obtained, not all APIs are available.
  *
- * Callers can determine the type of a LLVMValueRef by calling the
+ * Callers can determine the type of an LLVMValueRef by calling the
  * LLVMIsA* family of functions (e.g. LLVMIsAArgument()). These
  * functions are defined by a macro, so it isn't obvious which are
  * available by looking at the Doxygen source code. Instead, look at the
@@ -1063,6 +1113,9 @@ LLVMTypeRef LLVMX86MMXType(void);
       macro(BlockAddress)                   \
       macro(ConstantAggregateZero)          \
       macro(ConstantArray)                  \
+      macro(ConstantDataSequential)         \
+        macro(ConstantDataArray)            \
+        macro(ConstantDataVector)           \
       macro(ConstantExpr)                   \
       macro(ConstantFP)                     \
       macro(ConstantInt)                    \
@@ -1107,6 +1160,7 @@ LLVMTypeRef LLVMX86MMXType(void);
       macro(UnaryInstruction)               \
         macro(AllocaInst)                   \
         macro(CastInst)                     \
+          macro(AddrSpaceCastInst)          \
           macro(BitCastInst)                \
           macro(FPExtInst)                  \
           macro(FPToSIInst)                 \
@@ -1162,6 +1216,14 @@ void LLVMSetValueName(LLVMValueRef Val, const char *Name);
 void LLVMDumpValue(LLVMValueRef Val);
 
 /**
+ * Return a string representation of the value. Use
+ * LLVMDisposeMessage to free the string.
+ *
+ * @see llvm::Value::print()
+ */
+char *LLVMPrintValueToString(LLVMValueRef Val);
+
+/**
  * Replace all uses of a value with another one.
  *
  * @see llvm::Value::replaceAllUsesWith()
@@ -1181,7 +1243,7 @@ LLVMBool LLVMIsUndef(LLVMValueRef Val);
 /**
  * Convert value instances between types.
  *
- * Internally, a LLVMValueRef is "pinned" to a specific type. This
+ * Internally, an LLVMValueRef is "pinned" to a specific type. This
  * series of functions allows you to cast an instance to a specific
  * type.
  *
@@ -1203,7 +1265,7 @@ LLVM_FOR_EACH_VALUE_SUBCLASS(LLVM_DECLARE_VALUE_CAST)
  * This module defines functions that allow you to inspect the uses of a
  * LLVMValueRef.
  *
- * It is possible to obtain a LLVMUseRef for any LLVMValueRef instance.
+ * It is possible to obtain an LLVMUseRef for any LLVMValueRef instance.
  * Each LLVMUseRef (which corresponds to a llvm::Use instance) holds a
  * llvm::User and llvm::Value.
  *
@@ -1570,6 +1632,7 @@ LLVMValueRef LLVMConstFPToSI(LLVMValueRef ConstantVal, LLVMTypeRef ToType);
 LLVMValueRef LLVMConstPtrToInt(LLVMValueRef ConstantVal, LLVMTypeRef ToType);
 LLVMValueRef LLVMConstIntToPtr(LLVMValueRef ConstantVal, LLVMTypeRef ToType);
 LLVMValueRef LLVMConstBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType);
+LLVMValueRef LLVMConstAddrSpaceCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType);
 LLVMValueRef LLVMConstZExtOrBitCast(LLVMValueRef ConstantVal,
                                     LLVMTypeRef ToType);
 LLVMValueRef LLVMConstSExtOrBitCast(LLVMValueRef ConstantVal,
@@ -1625,8 +1688,33 @@ const char *LLVMGetSection(LLVMValueRef Global);
 void LLVMSetSection(LLVMValueRef Global, const char *Section);
 LLVMVisibility LLVMGetVisibility(LLVMValueRef Global);
 void LLVMSetVisibility(LLVMValueRef Global, LLVMVisibility Viz);
-unsigned LLVMGetAlignment(LLVMValueRef Global);
-void LLVMSetAlignment(LLVMValueRef Global, unsigned Bytes);
+
+/**
+ * @defgroup LLVMCCoreValueWithAlignment Values with alignment
+ *
+ * Functions in this group only apply to values with alignment, i.e.
+ * global variables, load and store instructions.
+ */
+
+/**
+ * Obtain the preferred alignment of the value.
+ * @see llvm::LoadInst::getAlignment()
+ * @see llvm::StoreInst::getAlignment()
+ * @see llvm::GlobalValue::getAlignment()
+ */
+unsigned LLVMGetAlignment(LLVMValueRef V);
+
+/**
+ * Set the preferred alignment of the value.
+ * @see llvm::LoadInst::setAlignment()
+ * @see llvm::StoreInst::setAlignment()
+ * @see llvm::GlobalValue::setAlignment()
+ */
+void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes);
+
+/**
+  * @}
+  */
 
 /**
  * @defgroup LLVMCoreValueConstantGlobalVariable Global Variables
@@ -1806,7 +1894,7 @@ LLVMValueRef LLVMGetParam(LLVMValueRef Fn, unsigned Index);
 /**
  * Obtain the function to which this argument belongs.
  *
- * Unlike other functions in this group, this one takes a LLVMValueRef
+ * Unlike other functions in this group, this one takes an LLVMValueRef
  * that corresponds to a llvm::Attribute.
  *
  * The returned LLVMValueRef is the llvm::Function to which this
@@ -1831,7 +1919,7 @@ LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn);
 /**
  * Obtain the next parameter to a function.
  *
- * This takes a LLVMValueRef obtained from LLVMGetFirstParam() (which is
+ * This takes an LLVMValueRef obtained from LLVMGetFirstParam() (which is
  * actually a wrapped iterator) and obtains the next parameter from the
  * underlying iterator.
  */
@@ -1980,12 +2068,12 @@ void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest);
 LLVMValueRef LLVMBasicBlockAsValue(LLVMBasicBlockRef BB);
 
 /**
- * Determine whether a LLVMValueRef is itself a basic block.
+ * Determine whether an LLVMValueRef is itself a basic block.
  */
 LLVMBool LLVMValueIsBasicBlock(LLVMValueRef Val);
 
 /**
- * Convert a LLVMValueRef to a LLVMBasicBlockRef instance.
+ * Convert an LLVMValueRef to an LLVMBasicBlockRef instance.
  */
 LLVMBasicBlockRef LLVMValueAsBasicBlock(LLVMValueRef Val);
 
@@ -2142,7 +2230,7 @@ LLVMValueRef LLVMGetFirstInstruction(LLVMBasicBlockRef BB);
 /**
  * Obtain the last instruction in a basic block.
  *
- * The returned LLVMValueRef corresponds to a LLVM:Instruction.
+ * The returned LLVMValueRef corresponds to an LLVM:Instruction.
  */
 LLVMValueRef LLVMGetLastInstruction(LLVMBasicBlockRef BB);
 
@@ -2324,12 +2412,12 @@ void LLVMAddIncoming(LLVMValueRef PhiNode, LLVMValueRef *IncomingValues,
 unsigned LLVMCountIncoming(LLVMValueRef PhiNode);
 
 /**
- * Obtain an incoming value to a PHI node as a LLVMValueRef.
+ * Obtain an incoming value to a PHI node as an LLVMValueRef.
  */
 LLVMValueRef LLVMGetIncomingValue(LLVMValueRef PhiNode, unsigned Index);
 
 /**
- * Obtain an incoming value to a PHI node as a LLVMBasicBlockRef.
+ * Obtain an incoming value to a PHI node as an LLVMBasicBlockRef.
  */
 LLVMBasicBlockRef LLVMGetIncomingBlock(LLVMValueRef PhiNode, unsigned Index);
 
@@ -2520,6 +2608,8 @@ LLVMValueRef LLVMBuildIntToPtr(LLVMBuilderRef, LLVMValueRef Val,
                                LLVMTypeRef DestTy, const char *Name);
 LLVMValueRef LLVMBuildBitCast(LLVMBuilderRef, LLVMValueRef Val,
                               LLVMTypeRef DestTy, const char *Name);
+LLVMValueRef LLVMBuildAddrSpaceCast(LLVMBuilderRef, LLVMValueRef Val,
+                                    LLVMTypeRef DestTy, const char *Name);
 LLVMValueRef LLVMBuildZExtOrBitCast(LLVMBuilderRef, LLVMValueRef Val,
                                     LLVMTypeRef DestTy, const char *Name);
 LLVMValueRef LLVMBuildSExtOrBitCast(LLVMBuilderRef, LLVMValueRef Val,
@@ -2573,9 +2663,9 @@ LLVMValueRef LLVMBuildIsNotNull(LLVMBuilderRef, LLVMValueRef Val,
                                 const char *Name);
 LLVMValueRef LLVMBuildPtrDiff(LLVMBuilderRef, LLVMValueRef LHS,
                               LLVMValueRef RHS, const char *Name);
-LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,  
-                                LLVMValueRef PTR, LLVMValueRef Val, 
-                                LLVMAtomicOrdering ordering, 
+LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
+                                LLVMValueRef PTR, LLVMValueRef Val,
+                                LLVMAtomicOrdering ordering,
                                 LLVMBool singleThread);
 
 /**
@@ -2708,16 +2798,16 @@ void LLVMDisposePassManager(LLVMPassManagerRef PM);
     initialization succeeded. Must be executed in isolation from all
     other LLVM api calls.
     @see llvm::llvm_start_multithreaded */
-LLVMBool LLVMStartMultithreaded();
+LLVMBool LLVMStartMultithreaded(void);
 
 /** Deallocate structures necessary to make LLVM safe for multithreading.
     Must be executed in isolation from all other LLVM api calls.
     @see llvm::llvm_stop_multithreaded */
-void LLVMStopMultithreaded();
+void LLVMStopMultithreaded(void);
 
 /** Check whether LLVM is executing in thread-safe mode or not.
     @see llvm::llvm_is_multithreaded */
-LLVMBool LLVMIsMultithreaded();
+LLVMBool LLVMIsMultithreaded(void);
 
 /**
  * @}
diff --git a/include/llvm-c/Disassembler.h b/include/llvm-c/Disassembler.h
index df65a7b..79bcfcd 100644
--- a/include/llvm-c/Disassembler.h
+++ b/include/llvm-c/Disassembler.h
@@ -42,7 +42,7 @@ typedef void *LLVMDisasmContextRef;
  * instruction are specified by the Offset parameter and its byte widith is the
  * size parameter.  For instructions sets with fixed widths and one symbolic
  * operand per instruction, the Offset parameter will be zero and Size parameter
- * will be the instruction width.  The information is returned in TagBuf and is 
+ * will be the instruction width.  The information is returned in TagBuf and is
  * Triple specific with its specific information defined by the value of
  * TagType for that Triple.  If symbolic information is returned the function
  * returns 1, otherwise it returns 0.
@@ -58,7 +58,7 @@ typedef int (*LLVMOpInfoCallback)(void *DisInfo, uint64_t PC,
  * SubtractSymbol can be link edited independent of each other.  Many other
  * platforms only allow a relocatable expression of the form AddSymbol + Offset
  * to be encoded.
- * 
+ *
  * The LLVMOpInfoCallback() for the TagType value of 1 uses the struct
  * LLVMOpInfo1.  The value of the relocatable expression for the operand,
  * including any PC adjustment, is passed in to the call back in the Value
@@ -130,6 +130,17 @@ typedef const char *(*LLVMSymbolLookupCallback)(void *DisInfo,
 /* The output reference is to a cstring address in a literal pool. */
 #define LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr 3
 
+/* The output reference is to a Objective-C CoreFoundation string. */
+#define LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref 4
+/* The output reference is to a Objective-C message. */
+#define LLVMDisassembler_ReferenceType_Out_Objc_Message 5
+/* The output reference is to a Objective-C message ref. */
+#define LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref 6
+/* The output reference is to a Objective-C selector ref. */
+#define LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref 7
+/* The output reference is to a Objective-C class ref. */
+#define LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref 8
+
 #ifdef __cplusplus
 extern "C" {
 #endif /* !defined(__cplusplus) */
@@ -170,6 +181,10 @@ int LLVMSetDisasmOptions(LLVMDisasmContextRef DC, uint64_t Options);
 #define LLVMDisassembler_Option_PrintImmHex 2
 /* The option use the other assembler printer variant */
 #define LLVMDisassembler_Option_AsmPrinterVariant 4
+/* The option to set comment on instructions */
+#define LLVMDisassembler_Option_SetInstrComments 8
+  /* The option to print latency information alongside instructions */
+#define LLVMDisassembler_Option_PrintLatency 16
 
 /**
  * Dispose of a disassembler context.
diff --git a/include/llvm-c/ExecutionEngine.h b/include/llvm-c/ExecutionEngine.h
index 50fdb6b..3564312 100644
--- a/include/llvm-c/ExecutionEngine.h
+++ b/include/llvm-c/ExecutionEngine.h
@@ -171,6 +171,16 @@ void *LLVMGetPointerToGlobal(LLVMExecutionEngineRef EE, LLVMValueRef Global);
 
 /*===-- Operations on memory managers -------------------------------------===*/
 
+typedef uint8_t *(*LLVMMemoryManagerAllocateCodeSectionCallback)(
+  void *Opaque, uintptr_t Size, unsigned Alignment, unsigned SectionID,
+  const char *SectionName);
+typedef uint8_t *(*LLVMMemoryManagerAllocateDataSectionCallback)(
+  void *Opaque, uintptr_t Size, unsigned Alignment, unsigned SectionID,
+  const char *SectionName, LLVMBool IsReadOnly);
+typedef LLVMBool (*LLVMMemoryManagerFinalizeMemoryCallback)(
+  void *Opaque, char **ErrMsg);
+typedef void (*LLVMMemoryManagerDestroyCallback)(void *Opaque);
+
 /**
  * Create a simple custom MCJIT memory manager. This memory manager can
  * intercept allocations in a module-oblivious way. This will return NULL
@@ -184,14 +194,10 @@ void *LLVMGetPointerToGlobal(LLVMExecutionEngineRef EE, LLVMValueRef Global);
  */
 LLVMMCJITMemoryManagerRef LLVMCreateSimpleMCJITMemoryManager(
   void *Opaque,
-  uint8_t *(*AllocateCodeSection)(void *Opaque,
-                                  uintptr_t Size, unsigned Alignment,
-                                  unsigned SectionID),
-  uint8_t *(*AllocateDataSection)(void *Opaque,
-                                  uintptr_t Size, unsigned Alignment,
-                                  unsigned SectionID, LLVMBool IsReadOnly),
-  LLVMBool (*FinalizeMemory)(void *Opaque, char **ErrMsg),
-  void (*Destroy)(void *Opaque));
+  LLVMMemoryManagerAllocateCodeSectionCallback AllocateCodeSection,
+  LLVMMemoryManagerAllocateDataSectionCallback AllocateDataSection,
+  LLVMMemoryManagerFinalizeMemoryCallback FinalizeMemory,
+  LLVMMemoryManagerDestroyCallback Destroy);
 
 void LLVMDisposeMCJITMemoryManager(LLVMMCJITMemoryManagerRef MM);
 
@@ -200,7 +206,7 @@ void LLVMDisposeMCJITMemoryManager(LLVMMCJITMemoryManagerRef MM);
  */
 
 #ifdef __cplusplus
-}  
+}
 #endif /* defined(__cplusplus) */
 
 #endif
diff --git a/include/llvm-c/IRReader.h b/include/llvm-c/IRReader.h
new file mode 100644
index 0000000..d0a23be
--- /dev/null
+++ b/include/llvm-c/IRReader.h
@@ -0,0 +1,40 @@
+/*===-- llvm-c/IRReader.h - IR Reader C Interface -----------------*- C -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file defines the C interface to the IR Reader.                        *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_IRREADER_H
+#define LLVM_C_IRREADER_H
+
+#include "llvm-c/Core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Read LLVM IR from a memory buffer and convert it into an in-memory Module
+ * object. Returns 0 on success.
+ * Optionally returns a human-readable description of any errors that
+ * occured during parsing IR. OutMessage must be disposed with
+ * LLVMDisposeMessage.
+ *
+ * @see llvm::ParseIR()
+ */
+LLVMBool LLVMParseIRInContext(LLVMContextRef ContextRef,
+                              LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutM,
+                              char **OutMessage);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/llvm-c/LinkTimeOptimizer.h b/include/llvm-c/LinkTimeOptimizer.h
index 7a0fbf6..8bcf599 100644
--- a/include/llvm-c/LinkTimeOptimizer.h
+++ b/include/llvm-c/LinkTimeOptimizer.h
@@ -4,7 +4,7 @@
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
-// 
+//
 //===----------------------------------------------------------------------===//
 //
 // This header provides a C API to use the LLVM link time optimization
@@ -46,7 +46,7 @@ extern "C" {
     //  Added C-specific error codes
     LLVM_LTO_NULL_OBJECT
   } llvm_lto_status_t;
- 
+
   /// This provides C interface to initialize link time optimizer. This allows
   /// linker to use dlopen() interface to dynamically load LinkTimeOptimizer.
   /// extern "C" helps, because dlopen() interface uses name to find the symbol.
diff --git a/include/llvm-c/Object.h b/include/llvm-c/Object.h
index ecccfee..c271552 100644
--- a/include/llvm-c/Object.h
+++ b/include/llvm-c/Object.h
@@ -100,4 +100,3 @@ const char *LLVMGetRelocationValueString(LLVMRelocationIteratorRef RI);
 #endif /* defined(__cplusplus) */
 
 #endif
-
diff --git a/include/llvm-c/Support.h b/include/llvm-c/Support.h
new file mode 100644
index 0000000..7f03ede
--- /dev/null
+++ b/include/llvm-c/Support.h
@@ -0,0 +1,35 @@
+/*===-- llvm-c/Support.h - Support C Interface --------------------*- C -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file defines the C interface to the LLVM support library.             *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_SUPPORT_H
+#define LLVM_C_SUPPORT_H
+
+#include "llvm-c/Core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * This function permanently loads the dynamic library at the given path.
+ * It is safe to call this function multiple times for the same library.
+ *
+ * @see sys::DynamicLibrary::LoadLibraryPermanently()
+  */
+LLVMBool LLVMLoadLibraryPermanently(const char* Filename);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/llvm-c/Target.h b/include/llvm-c/Target.h
index 80fc3e5..b465b4b 100644
--- a/include/llvm-c/Target.h
+++ b/include/llvm-c/Target.h
@@ -22,6 +22,10 @@
 #include "llvm-c/Core.h"
 #include "llvm/Config/llvm-config.h"
 
+#if defined(_MSC_VER) && !defined(inline)
+#define inline __inline
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -37,14 +41,13 @@ enum LLVMByteOrdering { LLVMBigEndian, LLVMLittleEndian };
 
 typedef struct LLVMOpaqueTargetData *LLVMTargetDataRef;
 typedef struct LLVMOpaqueTargetLibraryInfotData *LLVMTargetLibraryInfoRef;
-typedef struct LLVMStructLayout *LLVMStructLayoutRef;
 
 /* Declare all of the target-initialization functions that are available. */
 #define LLVM_TARGET(TargetName) \
   void LLVMInitialize##TargetName##TargetInfo(void);
 #include "llvm/Config/Targets.def"
 #undef LLVM_TARGET  /* Explicit undef to make SWIG happier */
-  
+
 #define LLVM_TARGET(TargetName) void LLVMInitialize##TargetName##Target(void);
 #include "llvm/Config/Targets.def"
 #undef LLVM_TARGET  /* Explicit undef to make SWIG happier */
@@ -53,7 +56,7 @@ typedef struct LLVMStructLayout *LLVMStructLayoutRef;
   void LLVMInitialize##TargetName##TargetMC(void);
 #include "llvm/Config/Targets.def"
 #undef LLVM_TARGET  /* Explicit undef to make SWIG happier */
-  
+
 /* Declare all of the available assembly printer initialization functions. */
 #define LLVM_ASM_PRINTER(TargetName) \
   void LLVMInitialize##TargetName##AsmPrinter(void);
@@ -71,7 +74,7 @@ typedef struct LLVMStructLayout *LLVMStructLayoutRef;
   void LLVMInitialize##TargetName##Disassembler(void);
 #include "llvm/Config/Disassemblers.def"
 #undef LLVM_DISASSEMBLER  /* Explicit undef to make SWIG happier */
-  
+
 /** LLVMInitializeAllTargetInfos - The main program should call this function if
     it wants access to all available targets that LLVM is configured to
     support. */
@@ -98,7 +101,7 @@ static inline void LLVMInitializeAllTargetMCs(void) {
 #include "llvm/Config/Targets.def"
 #undef LLVM_TARGET  /* Explicit undef to make SWIG happier */
 }
-  
+
 /** LLVMInitializeAllAsmPrinters - The main program should call this function if
     it wants all asm printers that LLVM is configured to support, to make them
     available via the TargetRegistry. */
@@ -107,7 +110,7 @@ static inline void LLVMInitializeAllAsmPrinters(void) {
 #include "llvm/Config/AsmPrinters.def"
 #undef LLVM_ASM_PRINTER  /* Explicit undef to make SWIG happier */
 }
-  
+
 /** LLVMInitializeAllAsmParsers - The main program should call this function if
     it wants all asm parsers that LLVM is configured to support, to make them
     available via the TargetRegistry. */
@@ -116,7 +119,7 @@ static inline void LLVMInitializeAllAsmParsers(void) {
 #include "llvm/Config/AsmParsers.def"
 #undef LLVM_ASM_PARSER  /* Explicit undef to make SWIG happier */
 }
-  
+
 /** LLVMInitializeAllDisassemblers - The main program should call this function
     if it wants all disassemblers that LLVM is configured to support, to make
     them available via the TargetRegistry. */
@@ -126,9 +129,9 @@ static inline void LLVMInitializeAllDisassemblers(void) {
 #include "llvm/Config/Disassemblers.def"
 #undef LLVM_DISASSEMBLER  /* Explicit undef to make SWIG happier */
 }
-  
+
 /** LLVMInitializeNativeTarget - The main program should call this function to
-    initialize the native target corresponding to the host.  This is useful 
+    initialize the native target corresponding to the host.  This is useful
     for JIT applications to ensure that the target gets linked in correctly. */
 static inline LLVMBool LLVMInitializeNativeTarget(void) {
   /* If we have a native target, initialize it to ensure it is linked in. */
@@ -140,7 +143,43 @@ static inline LLVMBool LLVMInitializeNativeTarget(void) {
 #else
   return 1;
 #endif
-}  
+}
+
+/** LLVMInitializeNativeTargetAsmParser - The main program should call this
+    function to initialize the parser for the native target corresponding to the
+    host. */
+static inline LLVMBool LLVMInitializeNativeAsmParser(void) {
+#ifdef LLVM_NATIVE_ASMPARSER
+  LLVM_NATIVE_ASMPARSER();
+  return 0;
+#else
+  return 1;
+#endif
+}
+
+/** LLVMInitializeNativeTargetAsmPrinter - The main program should call this
+    function to initialize the printer for the native target corresponding to
+    the host. */
+static inline LLVMBool LLVMInitializeNativeAsmPrinter(void) {
+#ifdef LLVM_NATIVE_ASMPRINTER
+  LLVM_NATIVE_ASMPRINTER();
+  return 0;
+#else
+  return 1;
+#endif
+}
+
+/** LLVMInitializeNativeTargetDisassembler - The main program should call this
+    function to initialize the disassembler for the native target corresponding
+    to the host. */
+static inline LLVMBool LLVMInitializeNativeDisassembler(void) {
+#ifdef LLVM_NATIVE_DISASSEMBLER
+  LLVM_NATIVE_DISASSEMBLER();
+  return 0;
+#else
+  return 1;
+#endif
+}
 
 /*===-- Target Data -------------------------------------------------------===*/
 
@@ -151,83 +190,94 @@ LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep);
 /** Adds target data information to a pass manager. This does not take ownership
     of the target data.
     See the method llvm::PassManagerBase::add. */
-void LLVMAddTargetData(LLVMTargetDataRef, LLVMPassManagerRef);
+void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM);
 
 /** Adds target library information to a pass manager. This does not take
     ownership of the target library info.
     See the method llvm::PassManagerBase::add. */
-void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef, LLVMPassManagerRef);
+void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
+                              LLVMPassManagerRef PM);
 
 /** Converts target data to a target layout string. The string must be disposed
     with LLVMDisposeMessage.
     See the constructor llvm::DataLayout::DataLayout. */
-char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef);
+char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef TD);
 
 /** Returns the byte order of a target, either LLVMBigEndian or
     LLVMLittleEndian.
     See the method llvm::DataLayout::isLittleEndian. */
-enum LLVMByteOrdering LLVMByteOrder(LLVMTargetDataRef);
+enum LLVMByteOrdering LLVMByteOrder(LLVMTargetDataRef TD);
 
 /** Returns the pointer size in bytes for a target.
     See the method llvm::DataLayout::getPointerSize. */
-unsigned LLVMPointerSize(LLVMTargetDataRef);
+unsigned LLVMPointerSize(LLVMTargetDataRef TD);
 
 /** Returns the pointer size in bytes for a target for a specified
     address space.
     See the method llvm::DataLayout::getPointerSize. */
-unsigned LLVMPointerSizeForAS(LLVMTargetDataRef, unsigned AS);
+unsigned LLVMPointerSizeForAS(LLVMTargetDataRef TD, unsigned AS);
+
+/** Returns the integer type that is the same size as a pointer on a target.
+    See the method llvm::DataLayout::getIntPtrType. */
+LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD);
+
+/** Returns the integer type that is the same size as a pointer on a target.
+    This version allows the address space to be specified.
+    See the method llvm::DataLayout::getIntPtrType. */
+LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS);
 
 /** Returns the integer type that is the same size as a pointer on a target.
     See the method llvm::DataLayout::getIntPtrType. */
-LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef);
+LLVMTypeRef LLVMIntPtrTypeInContext(LLVMContextRef C, LLVMTargetDataRef TD);
 
 /** Returns the integer type that is the same size as a pointer on a target.
     This version allows the address space to be specified.
     See the method llvm::DataLayout::getIntPtrType. */
-LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef, unsigned AS);
+LLVMTypeRef LLVMIntPtrTypeForASInContext(LLVMContextRef C, LLVMTargetDataRef TD,
+                                         unsigned AS);
 
 /** Computes the size of a type in bytes for a target.
     See the method llvm::DataLayout::getTypeSizeInBits. */
-unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef, LLVMTypeRef);
+unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty);
 
 /** Computes the storage size of a type in bytes for a target.
     See the method llvm::DataLayout::getTypeStoreSize. */
-unsigned long long LLVMStoreSizeOfType(LLVMTargetDataRef, LLVMTypeRef);
+unsigned long long LLVMStoreSizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty);
 
 /** Computes the ABI size of a type in bytes for a target.
     See the method llvm::DataLayout::getTypeAllocSize. */
-unsigned long long LLVMABISizeOfType(LLVMTargetDataRef, LLVMTypeRef);
+unsigned long long LLVMABISizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty);
 
 /** Computes the ABI alignment of a type in bytes for a target.
     See the method llvm::DataLayout::getTypeABISize. */
-unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef, LLVMTypeRef);
+unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty);
 
 /** Computes the call frame alignment of a type in bytes for a target.
     See the method llvm::DataLayout::getTypeABISize. */
-unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef, LLVMTypeRef);
+unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty);
 
 /** Computes the preferred alignment of a type in bytes for a target.
     See the method llvm::DataLayout::getTypeABISize. */
-unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef, LLVMTypeRef);
+unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty);
 
 /** Computes the preferred alignment of a global variable in bytes for a target.
     See the method llvm::DataLayout::getPreferredAlignment. */
-unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef,
+unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef TD,
                                         LLVMValueRef GlobalVar);
 
 /** Computes the structure element that contains the byte offset for a target.
     See the method llvm::StructLayout::getElementContainingOffset. */
-unsigned LLVMElementAtOffset(LLVMTargetDataRef, LLVMTypeRef StructTy,
+unsigned LLVMElementAtOffset(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
                              unsigned long long Offset);
 
 /** Computes the byte offset of the indexed struct element for a target.
     See the method llvm::StructLayout::getElementContainingOffset. */
-unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef, LLVMTypeRef StructTy,
-                                       unsigned Element);
+unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef TD,
+                                       LLVMTypeRef StructTy, unsigned Element);
 
 /** Deallocates a TargetData.
     See the destructor llvm::DataLayout::~DataLayout. */
-void LLVMDisposeTargetData(LLVMTargetDataRef);
+void LLVMDisposeTargetData(LLVMTargetDataRef TD);
 
 /**
  * @}
diff --git a/include/llvm-c/TargetMachine.h b/include/llvm-c/TargetMachine.h
index 5e35595..e159411 100644
--- a/include/llvm-c/TargetMachine.h
+++ b/include/llvm-c/TargetMachine.h
@@ -57,11 +57,21 @@ typedef enum {
 } LLVMCodeGenFileType;
 
 /** Returns the first llvm::Target in the registered targets list. */
-LLVMTargetRef LLVMGetFirstTarget();
+LLVMTargetRef LLVMGetFirstTarget(void);
 /** Returns the next llvm::Target given a previous one (or null if there's none) */
 LLVMTargetRef LLVMGetNextTarget(LLVMTargetRef T);
 
 /*===-- Target ------------------------------------------------------------===*/
+/** Finds the target corresponding to the given name and stores it in \p T. 
+  Returns 0 on success. */
+LLVMTargetRef LLVMGetTargetFromName(const char *Name);
+
+/** Finds the target corresponding to the given triple and stores it in \p T.
+  Returns 0 on success. Optionally returns any error in ErrorMessage.
+  Use LLVMDisposeMessage to dispose the message. */
+LLVMBool LLVMGetTargetFromTriple(const char* Triple, LLVMTargetRef *T,
+                                 char **ErrorMessage);
+
 /** Returns the name of a target. See llvm::Target::getName */
 const char *LLVMGetTargetName(LLVMTargetRef T);
 
@@ -79,9 +89,9 @@ LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T);
 
 /*===-- Target Machine ----------------------------------------------------===*/
 /** Creates a new llvm::TargetMachine. See llvm::Target::createTargetMachine */
-LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, char *Triple,
-  char *CPU, char *Features, LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
-  LLVMCodeModel CodeModel);
+LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
+  const char *Triple, const char *CPU, const char *Features,
+  LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc, LLVMCodeModel CodeModel);
 
 /** Dispose the LLVMTargetMachineRef instance generated by
   LLVMCreateTargetMachine. */
@@ -108,6 +118,10 @@ char *LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T);
 /** Returns the llvm::DataLayout used for this llvm:TargetMachine. */
 LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T);
 
+/** Set the target machine's ASM verbosity. */
+void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
+                                      LLVMBool VerboseAsm);
+
 /** Emits an asm or object file for the given module to the filename. This
   wraps several c++ only classes (among them a file stream). Returns any
   error in ErrorMessage. Use LLVMDisposeMessage to dispose the message. */
@@ -117,6 +131,12 @@ LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
 /** Compile the LLVM IR stored in \p M and store the result in \p OutMemBuf. */
 LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T, LLVMModuleRef M,
   LLVMCodeGenFileType codegen, char** ErrorMessage, LLVMMemoryBufferRef *OutMemBuf);
+
+/*===-- Triple ------------------------------------------------------------===*/
+/** Get a triple for the host machine as a string. The result needs to be
+  disposed with LLVMDisposeMessage. */
+char* LLVMGetDefaultTargetTriple(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/llvm-c/Transforms/Scalar.h b/include/llvm-c/Transforms/Scalar.h
index a2c4d61..355e8dc 100644
--- a/include/llvm-c/Transforms/Scalar.h
+++ b/include/llvm-c/Transforms/Scalar.h
@@ -65,6 +65,9 @@ void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM);
 /** See llvm::createLoopRotatePass function. */
 void LLVMAddLoopRotatePass(LLVMPassManagerRef PM);
 
+/** See llvm::createLoopRerollPass function. */
+void LLVMAddLoopRerollPass(LLVMPassManagerRef PM);
+
 /** See llvm::createLoopUnrollPass function. */
 void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM);
 
@@ -74,6 +77,9 @@ void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM);
 /** See llvm::createMemCpyOptPass function. */
 void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM);
 
+/** See llvm::createPartiallyInlineLibCallsPass function. */
+void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM);
+
 /** See llvm::createPromoteMemoryToRegisterPass function. */
 void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM);
 
diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h
index 40110fd..89f54b7 100644
--- a/include/llvm-c/lto.h
+++ b/include/llvm-c/lto.h
@@ -16,9 +16,22 @@
 #ifndef LLVM_C_LTO_H
 #define LLVM_C_LTO_H
 
-#include <stdbool.h>
 #include <stddef.h>
-#include <unistd.h>
+#include <sys/types.h>
+
+#ifndef __cplusplus
+#if !defined(_MSC_VER)
+#include <stdbool.h>
+typedef bool lto_bool_t;
+#else
+/* MSVC in particular does not have anything like _Bool or bool in C, but we can
+   at least make sure the type is the same size.  The implementation side will
+   use C++ bool. */
+typedef unsigned char lto_bool_t;
+#endif
+#else
+typedef bool lto_bool_t;
+#endif
 
 /**
  * @defgroup LLVMCLTO LTO
@@ -27,7 +40,7 @@
  * @{
  */
 
-#define LTO_API_VERSION 4
+#define LTO_API_VERSION 5
 
 typedef enum {
     LTO_SYMBOL_ALIGNMENT_MASK              = 0x0000001F, /* log2 of alignment */
@@ -87,14 +100,14 @@ lto_get_error_message(void);
 /**
  * Checks if a file is a loadable object file.
  */
-extern bool
+extern lto_bool_t
 lto_module_is_object_file(const char* path);
 
 
 /**
  * Checks if a file is a loadable object compiled for requested target.
  */
-extern bool
+extern lto_bool_t
 lto_module_is_object_file_for_target(const char* path,
                                      const char* target_triple_prefix);
 
@@ -102,14 +115,14 @@ lto_module_is_object_file_for_target(const char* path,
 /**
  * Checks if a buffer is a loadable object file.
  */
-extern bool
+extern lto_bool_t
 lto_module_is_object_file_in_memory(const void* mem, size_t length);
 
 
 /**
  * Checks if a buffer is a loadable object compiled for requested target.
  */
-extern bool
+extern lto_bool_t
 lto_module_is_object_file_in_memory_for_target(const void* mem, size_t length,
                                               const char* target_triple_prefix);
 
@@ -208,7 +221,7 @@ lto_codegen_dispose(lto_code_gen_t);
  * Add an object module to the set of modules for which code will be generated.
  * Returns true on error (check lto_get_error_message() for details).
  */
-extern bool
+extern lto_bool_t
 lto_codegen_add_module(lto_code_gen_t cg, lto_module_t mod);
 
 
@@ -217,7 +230,7 @@ lto_codegen_add_module(lto_code_gen_t cg, lto_module_t mod);
  * Sets if debug info should be generated.
  * Returns true on error (check lto_get_error_message() for details).
  */
-extern bool
+extern lto_bool_t
 lto_codegen_set_debug_model(lto_code_gen_t cg, lto_debug_model);
 
 
@@ -225,7 +238,7 @@ lto_codegen_set_debug_model(lto_code_gen_t cg, lto_debug_model);
  * Sets which PIC code model to generated.
  * Returns true on error (check lto_get_error_message() for details).
  */
-extern bool
+extern lto_bool_t
 lto_codegen_set_pic_model(lto_code_gen_t cg, lto_codegen_model);
 
 
@@ -251,9 +264,8 @@ lto_codegen_set_assembler_args(lto_code_gen_t cg, const char **args,
                                int nargs);
 
 /**
- * Adds to a list of all global symbols that must exist in the final
- * generated code.  If a function is not listed, it might be
- * inlined into every usage and optimized away.
+ * Tells LTO optimization passes that this symbol must be preserved
+ * because it is referenced by native code or a command line option.
  */
 extern void
 lto_codegen_add_must_preserve_symbol(lto_code_gen_t cg, const char* symbol);
@@ -263,7 +275,7 @@ lto_codegen_add_must_preserve_symbol(lto_code_gen_t cg, const char* symbol);
  * merged contents of all modules added so far.
  * Returns true on error (check lto_get_error_message() for details).
  */
-extern bool
+extern lto_bool_t
 lto_codegen_write_merged_modules(lto_code_gen_t cg, const char* path);
 
 /**
@@ -281,7 +293,7 @@ lto_codegen_compile(lto_code_gen_t cg, size_t* length);
  * Generates code for all added modules into one native object file.
  * The name of the file is written to name. Returns true on error.
  */
-extern bool
+extern lto_bool_t
 lto_codegen_compile_to_file(lto_code_gen_t cg, const char** name);
 
 
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index 625a6db..d494ad2 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -765,7 +765,9 @@ public:
       return APInt(getBitWidth(), VAL & RHS.VAL);
     return AndSlowCase(RHS);
   }
-  APInt And(const APInt &RHS) const { return this->operator&(RHS); }
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT And(const APInt &RHS) const {
+    return this->operator&(RHS);
+  }
 
   /// \brief Bitwise OR operator.
   ///
@@ -785,7 +787,9 @@ public:
   /// calling operator|.
   ///
   /// \returns An APInt value representing the bitwise OR of *this and RHS.
-  APInt Or(const APInt &RHS) const { return this->operator|(RHS); }
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT Or(const APInt &RHS) const {
+    return this->operator|(RHS);
+  }
 
   /// \brief Bitwise XOR operator.
   ///
@@ -805,7 +809,9 @@ public:
   /// through the usage of operator^.
   ///
   /// \returns An APInt value representing the bitwise XOR of *this and RHS.
-  APInt Xor(const APInt &RHS) const { return this->operator^(RHS); }
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT Xor(const APInt &RHS) const {
+    return this->operator^(RHS);
+  }
 
   /// \brief Multiplication operator.
   ///
@@ -837,17 +843,17 @@ public:
   /// \brief Arithmetic right-shift function.
   ///
   /// Arithmetic right-shift this APInt by shiftAmt.
-  APInt ashr(unsigned shiftAmt) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT ashr(unsigned shiftAmt) const;
 
   /// \brief Logical right-shift function.
   ///
   /// Logical right-shift this APInt by shiftAmt.
-  APInt lshr(unsigned shiftAmt) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT lshr(unsigned shiftAmt) const;
 
   /// \brief Left-shift function.
   ///
   /// Left-shift this APInt by shiftAmt.
-  APInt shl(unsigned shiftAmt) const {
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT shl(unsigned shiftAmt) const {
     assert(shiftAmt <= BitWidth && "Invalid shift amount");
     if (isSingleWord()) {
       if (shiftAmt >= BitWidth)
@@ -858,31 +864,31 @@ public:
   }
 
   /// \brief Rotate left by rotateAmt.
-  APInt rotl(unsigned rotateAmt) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT rotl(unsigned rotateAmt) const;
 
   /// \brief Rotate right by rotateAmt.
-  APInt rotr(unsigned rotateAmt) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT rotr(unsigned rotateAmt) const;
 
   /// \brief Arithmetic right-shift function.
   ///
   /// Arithmetic right-shift this APInt by shiftAmt.
-  APInt ashr(const APInt &shiftAmt) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT ashr(const APInt &shiftAmt) const;
 
   /// \brief Logical right-shift function.
   ///
   /// Logical right-shift this APInt by shiftAmt.
-  APInt lshr(const APInt &shiftAmt) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT lshr(const APInt &shiftAmt) const;
 
   /// \brief Left-shift function.
   ///
   /// Left-shift this APInt by shiftAmt.
-  APInt shl(const APInt &shiftAmt) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT shl(const APInt &shiftAmt) const;
 
   /// \brief Rotate left by rotateAmt.
-  APInt rotl(const APInt &rotateAmt) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT rotl(const APInt &rotateAmt) const;
 
   /// \brief Rotate right by rotateAmt.
-  APInt rotr(const APInt &rotateAmt) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT rotr(const APInt &rotateAmt) const;
 
   /// \brief Unsigned division operation.
   ///
@@ -890,12 +896,12 @@ public:
   /// RHS are treated as unsigned quantities for purposes of this division.
   ///
   /// \returns a new APInt value containing the division result
-  APInt udiv(const APInt &RHS) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT udiv(const APInt &RHS) const;
 
   /// \brief Signed division function for APInt.
   ///
   /// Signed divide this APInt by APInt RHS.
-  APInt sdiv(const APInt &RHS) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT sdiv(const APInt &RHS) const;
 
   /// \brief Unsigned remainder operation.
   ///
@@ -906,12 +912,12 @@ public:
   /// is *this.
   ///
   /// \returns a new APInt value containing the remainder result
-  APInt urem(const APInt &RHS) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT urem(const APInt &RHS) const;
 
   /// \brief Function for signed remainder operation.
   ///
   /// Signed remainder operation on APInt.
-  APInt srem(const APInt &RHS) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT srem(const APInt &RHS) const;
 
   /// \brief Dual division/remainder interface.
   ///
@@ -1145,7 +1151,7 @@ public:
   ///
   /// Truncate the APInt to a specified width. It is an error to specify a width
   /// that is greater than or equal to the current width.
-  APInt trunc(unsigned width) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT trunc(unsigned width) const;
 
   /// \brief Sign extend to a new width.
   ///
@@ -1153,38 +1159,38 @@ public:
   /// bit is set, the fill on the left will be done with 1 bits, otherwise zero.
   /// It is an error to specify a width that is less than or equal to the
   /// current width.
-  APInt sext(unsigned width) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT sext(unsigned width) const;
 
   /// \brief Zero extend to a new width.
   ///
   /// This operation zero extends the APInt to a new width. The high order bits
   /// are filled with 0 bits.  It is an error to specify a width that is less
   /// than or equal to the current width.
-  APInt zext(unsigned width) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT zext(unsigned width) const;
 
   /// \brief Sign extend or truncate to width
   ///
   /// Make this APInt have the bit width given by \p width. The value is sign
   /// extended, truncated, or left alone to make it that width.
-  APInt sextOrTrunc(unsigned width) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT sextOrTrunc(unsigned width) const;
 
   /// \brief Zero extend or truncate to width
   ///
   /// Make this APInt have the bit width given by \p width. The value is zero
   /// extended, truncated, or left alone to make it that width.
-  APInt zextOrTrunc(unsigned width) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT zextOrTrunc(unsigned width) const;
 
   /// \brief Sign extend or truncate to width
   ///
   /// Make this APInt have the bit width given by \p width. The value is sign
   /// extended, or left alone to make it that width.
-  APInt sextOrSelf(unsigned width) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT sextOrSelf(unsigned width) const;
 
   /// \brief Zero extend or truncate to width
   ///
   /// Make this APInt have the bit width given by \p width. The value is zero
   /// extended, or left alone to make it that width.
-  APInt zextOrSelf(unsigned width) const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT zextOrSelf(unsigned width) const;
 
   /// @}
   /// \name Bit Manipulation Operators
@@ -1421,7 +1427,7 @@ public:
   std::string toString(unsigned Radix, bool Signed) const;
 
   /// \returns a byte-swapped representation of this APInt Value.
-  APInt byteSwap() const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT byteSwap() const;
 
   /// \brief Converts this APInt to a double value.
   double roundToDouble(bool isSigned) const;
@@ -1464,7 +1470,7 @@ public:
   ///
   /// The conversion does not do a translation from double to integer, it just
   /// re-interprets the bits of the double.
-  static APInt doubleToBits(double V) {
+  static APInt LLVM_ATTRIBUTE_UNUSED_RESULT doubleToBits(double V) {
     union {
       uint64_t I;
       double D;
@@ -1477,7 +1483,7 @@ public:
   ///
   /// The conversion does not do a translation from float to integer, it just
   /// re-interprets the bits of the float.
-  static APInt floatToBits(float V) {
+  static APInt LLVM_ATTRIBUTE_UNUSED_RESULT floatToBits(float V) {
     union {
       unsigned I;
       float F;
@@ -1507,12 +1513,12 @@ public:
   }
 
   /// \brief Compute the square root
-  APInt sqrt() const;
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT sqrt() const;
 
   /// \brief Get the absolute value;
   ///
   /// If *this is < 0 then return -(*this), otherwise *this;
-  APInt abs() const {
+  APInt LLVM_ATTRIBUTE_UNUSED_RESULT abs() const {
     if (isNegative())
       return -(*this);
     return *this;
diff --git a/include/llvm/ADT/APSInt.h b/include/llvm/ADT/APSInt.h
index 11be4c5..ad035a7 100644
--- a/include/llvm/ADT/APSInt.h
+++ b/include/llvm/ADT/APSInt.h
@@ -68,18 +68,18 @@ public:
   }
   using APInt::toString;
 
-  APSInt trunc(uint32_t width) const {
+  APSInt LLVM_ATTRIBUTE_UNUSED_RESULT trunc(uint32_t width) const {
     return APSInt(APInt::trunc(width), IsUnsigned);
   }
 
-  APSInt extend(uint32_t width) const {
+  APSInt LLVM_ATTRIBUTE_UNUSED_RESULT extend(uint32_t width) const {
     if (IsUnsigned)
       return APSInt(zext(width), IsUnsigned);
     else
       return APSInt(sext(width), IsUnsigned);
   }
 
-  APSInt extOrTrunc(uint32_t width) const {
+  APSInt LLVM_ATTRIBUTE_UNUSED_RESULT extOrTrunc(uint32_t width) const {
       if (IsUnsigned)
         return APSInt(zextOrTrunc(width), IsUnsigned);
       else
@@ -212,7 +212,7 @@ public:
     assert(IsUnsigned == RHS.IsUnsigned && "Signedness mismatch!");
     return APSInt(static_cast<const APInt&>(*this) & RHS, IsUnsigned);
   }
-  APSInt And(const APSInt& RHS) const {
+  APSInt LLVM_ATTRIBUTE_UNUSED_RESULT And(const APSInt& RHS) const {
     return this->operator&(RHS);
   }
 
@@ -220,7 +220,7 @@ public:
     assert(IsUnsigned == RHS.IsUnsigned && "Signedness mismatch!");
     return APSInt(static_cast<const APInt&>(*this) | RHS, IsUnsigned);
   }
-  APSInt Or(const APSInt& RHS) const {
+  APSInt LLVM_ATTRIBUTE_UNUSED_RESULT Or(const APSInt& RHS) const {
     return this->operator|(RHS);
   }
 
@@ -229,7 +229,7 @@ public:
     assert(IsUnsigned == RHS.IsUnsigned && "Signedness mismatch!");
     return APSInt(static_cast<const APInt&>(*this) ^ RHS, IsUnsigned);
   }
-  APSInt Xor(const APSInt& RHS) const {
+  APSInt LLVM_ATTRIBUTE_UNUSED_RESULT Xor(const APSInt& RHS) const {
     return this->operator^(RHS);
   }
 
diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h
index d4152ec..e5562c3 100644
--- a/include/llvm/ADT/ArrayRef.h
+++ b/include/llvm/ADT/ArrayRef.h
@@ -80,9 +80,16 @@ namespace llvm {
 
     /// Construct an ArrayRef from a C array.
     template <size_t N>
-    /*implicit*/ ArrayRef(const T (&Arr)[N])
+    /*implicit*/ LLVM_CONSTEXPR ArrayRef(const T (&Arr)[N])
       : Data(Arr), Length(N) {}
 
+#if LLVM_HAS_INITIALIZER_LISTS
+    /// Construct an ArrayRef from a std::initializer_list.
+    /*implicit*/ ArrayRef(const std::initializer_list<T> &Vec)
+    : Data(Vec.begin() == Vec.end() ? (T*)0 : Vec.begin()),
+      Length(Vec.size()) {}
+#endif
+
     /// @}
     /// @name Simple Operations
     /// @{
@@ -178,6 +185,8 @@ namespace llvm {
   public:
     typedef T *iterator;
 
+    typedef std::reverse_iterator<iterator> reverse_iterator;
+
     /// Construct an empty MutableArrayRef.
     /*implicit*/ MutableArrayRef() : ArrayRef<T>() {}
 
@@ -212,6 +221,9 @@ namespace llvm {
     iterator begin() const { return data(); }
     iterator end() const { return data() + this->size(); }
 
+    reverse_iterator rbegin() const { return reverse_iterator(end()); }
+    reverse_iterator rend() const { return reverse_iterator(begin()); }
+
     /// front - Get the first element.
     T &front() const {
       assert(!this->empty());
diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index d5aa864..ce322cc 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -64,7 +64,9 @@ public:
     return const_iterator(getBucketsEnd(), getBucketsEnd(), true);
   }
 
-  bool empty() const { return getNumEntries() == 0; }
+  bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const {
+    return getNumEntries() == 0;
+  }
   unsigned size() const { return getNumEntries(); }
 
   /// Grow the densemap so that it has at least Size buckets. Does not shrink
@@ -436,9 +438,8 @@ private:
       this->grow(NumBuckets * 2);
       LookupBucketFor(Key, TheBucket);
       NumBuckets = getNumBuckets();
-    }
-    if (NumBuckets-(NewNumEntries+getNumTombstones()) <= NumBuckets/8) {
-      this->grow(NumBuckets * 2);
+    } else if (NumBuckets-(NewNumEntries+getNumTombstones()) <= NumBuckets/8) {
+      this->grow(NumBuckets);
       LookupBucketFor(Key, TheBucket);
     }
     assert(TheBucket);
@@ -713,13 +714,13 @@ public:
     init(NumInitBuckets);
   }
 
-  SmallDenseMap(const SmallDenseMap &other) {
+  SmallDenseMap(const SmallDenseMap &other) : BaseT() {
     init(0);
     copyFrom(other);
   }
 
 #if LLVM_HAS_RVALUE_REFERENCES
-  SmallDenseMap(SmallDenseMap &&other) {
+  SmallDenseMap(SmallDenseMap &&other) : BaseT() {
     init(0);
     swap(other);
   }
diff --git a/include/llvm/ADT/ImmutableMap.h b/include/llvm/ADT/ImmutableMap.h
index a667479..8f8fb98 100644
--- a/include/llvm/ADT/ImmutableMap.h
+++ b/include/llvm/ADT/ImmutableMap.h
@@ -211,6 +211,7 @@ public:
     friend class ImmutableMap;
 
   public:
+    typedef ptrdiff_t difference_type;
     typedef typename ImmutableMap<KeyT,ValT,ValInfo>::value_type value_type;
     typedef typename ImmutableMap<KeyT,ValT,ValInfo>::value_type_ref reference;
     typedef typename iterator::value_type *pointer;
diff --git a/include/llvm/ADT/ImmutableSet.h b/include/llvm/ADT/ImmutableSet.h
index fbdf066..ad34969 100644
--- a/include/llvm/ADT/ImmutableSet.h
+++ b/include/llvm/ADT/ImmutableSet.h
@@ -851,6 +851,18 @@ PROFILE_INTEGER_INFO(unsigned long long)
 
 #undef PROFILE_INTEGER_INFO
 
+/// Profile traits for booleans.
+template <>
+struct ImutProfileInfo<bool> {
+  typedef const bool  value_type;
+  typedef const bool& value_type_ref;
+
+  static inline void Profile(FoldingSetNodeID& ID, value_type_ref X) {
+    ID.AddBoolean(X);
+  }
+};
+
+
 /// Generic profile trait for pointer types.  We treat pointers as
 /// references to unique objects.
 template <typename T>
@@ -1060,6 +1072,7 @@ public:
     friend class ImmutableSet<ValT,ValInfo>;
 
   public:
+    typedef ptrdiff_t difference_type;
     typedef typename ImmutableSet<ValT,ValInfo>::value_type value_type;
     typedef typename ImmutableSet<ValT,ValInfo>::value_type_ref reference;
     typedef typename iterator::value_type *pointer;
diff --git a/include/llvm/ADT/PointerIntPair.h b/include/llvm/ADT/PointerIntPair.h
index 0299a83..0cfd470 100644
--- a/include/llvm/ADT/PointerIntPair.h
+++ b/include/llvm/ADT/PointerIntPair.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_ADT_POINTERINTPAIR_H
 #define LLVM_ADT_POINTERINTPAIR_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/PointerLikeTypeTraits.h"
 #include <cassert>
 
@@ -40,7 +41,7 @@ template <typename PointerTy, unsigned IntBits, typename IntType=unsigned,
           typename PtrTraits = PointerLikeTypeTraits<PointerTy> >
 class PointerIntPair {
   intptr_t Value;
-  enum {
+  enum LLVM_ENUM_INT_TYPE(uintptr_t) {
     /// PointerBitMask - The bits that come from the pointer.
     PointerBitMask =
       ~(uintptr_t)(((intptr_t)1 << PtrTraits::NumLowBitsAvailable)-1),
diff --git a/include/llvm/ADT/PointerUnion.h b/include/llvm/ADT/PointerUnion.h
index c1a6d74..05d362f 100644
--- a/include/llvm/ADT/PointerUnion.h
+++ b/include/llvm/ADT/PointerUnion.h
@@ -72,7 +72,7 @@ namespace llvm {
   ///    printf("%d %d", P.is<int*>(), P.is<float*>());  // prints "1 0"
   ///    X = P.get<int*>();     // ok.
   ///    Y = P.get<float*>();   // runtime assertion failure.
-  ///    Z = P.get<double*>();  // runtime assertion failure (regardless of tag)
+  ///    Z = P.get<double*>();  // compile time failure.
   ///    P = (float*)0;
   ///    Y = P.get<float*>();   // ok.
   ///    X = P.get<int*>();     // runtime assertion failure.
@@ -177,10 +177,17 @@ namespace llvm {
   };
 
   template<typename PT1, typename PT2>
-  bool operator==(PointerUnion<PT1, PT2> lhs, PointerUnion<PT1, PT2> rhs) {
+  static bool operator==(PointerUnion<PT1, PT2> lhs,
+                         PointerUnion<PT1, PT2> rhs) {
     return lhs.getOpaqueValue() == rhs.getOpaqueValue();
   }
-  
+
+  template<typename PT1, typename PT2>
+  static bool operator!=(PointerUnion<PT1, PT2> lhs,
+                         PointerUnion<PT1, PT2> rhs) {
+    return lhs.getOpaqueValue() != rhs.getOpaqueValue();
+  }
+
   // Teach SmallPtrSet that PointerUnion is "basically a pointer", that has
   // # low bits available = min(PT1bits,PT2bits)-1.
   template<typename PT1, typename PT2>
diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index dacda36..3aa8183 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -217,6 +217,22 @@ inline tier<T1, T2> tie(T1& f, T2& s) {
   return tier<T1, T2>(f, s);
 }
 
+/// \brief Function object to check whether the first component of a std::pair
+/// compares less than the first component of another std::pair.
+struct less_first {
+  template <typename T> bool operator()(const T &lhs, const T &rhs) const {
+    return lhs.first < rhs.first;
+  }
+};
+
+/// \brief Function object to check whether the second component of a std::pair
+/// compares less than the second component of another std::pair.
+struct less_second {
+  template <typename T> bool operator()(const T &lhs, const T &rhs) const {
+    return lhs.second < rhs.second;
+  }
+};
+
 //===----------------------------------------------------------------------===//
 //     Extra additions for arrays
 //===----------------------------------------------------------------------===//
@@ -277,12 +293,16 @@ inline void array_pod_sort(IteratorTy Start, IteratorTy End) {
         get_array_pod_sort_comparator(*Start));
 }
 
-template<class IteratorTy>
-inline void array_pod_sort(IteratorTy Start, IteratorTy End,
-                                  int (*Compare)(const void*, const void*)) {
+template <class IteratorTy>
+inline void array_pod_sort(
+    IteratorTy Start, IteratorTy End,
+    int (*Compare)(
+        const typename std::iterator_traits<IteratorTy>::value_type *,
+        const typename std::iterator_traits<IteratorTy>::value_type *)) {
   // Don't dereference start iterator of empty sequence.
   if (Start == End) return;
-  qsort(&*Start, End-Start, sizeof(*Start), Compare);
+  qsort(&*Start, End - Start, sizeof(*Start),
+        reinterpret_cast<int (*)(const void *, const void *)>(Compare));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/ADT/SetVector.h b/include/llvm/ADT/SetVector.h
index d2f7286..5eda37c 100644
--- a/include/llvm/ADT/SetVector.h
+++ b/include/llvm/ADT/SetVector.h
@@ -170,7 +170,7 @@ public:
     vector_.pop_back();
   }
   
-  T pop_back_val() {
+  T LLVM_ATTRIBUTE_UNUSED_RESULT pop_back_val() {
     T Ret = back();
     pop_back();
     return Ret;
diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h
index 8c73041..bd0d883 100644
--- a/include/llvm/ADT/SmallPtrSet.h
+++ b/include/llvm/ADT/SmallPtrSet.h
@@ -71,7 +71,7 @@ protected:
   ~SmallPtrSetImpl();
 
 public:
-  bool empty() const { return size() == 0; }
+  bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const { return size() == 0; }
   unsigned size() const { return NumElements; }
 
   void clear() {
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index 7ba0a71..505aa8d 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -53,7 +53,7 @@ public:
     return size_t((char*)CapacityX - (char*)BeginX);
   }
 
-  bool empty() const { return BeginX == EndX; }
+  bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const { return BeginX == EndX; }
 };
 
 template <typename T, unsigned N> struct SmallVectorStorage;
@@ -427,7 +427,7 @@ public:
       this->grow(N);
   }
 
-  T pop_back_val() {
+  T LLVM_ATTRIBUTE_UNUSED_RESULT pop_back_val() {
 #if LLVM_HAS_RVALUE_REFERENCES
     T Result = ::std::move(this->back());
 #else
diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index d2887c5..56dbb5b 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_ADT_STRINGEXTRAS_H
 #define LLVM_ADT_STRINGEXTRAS_H
 
+#include <iterator>
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DataTypes.h"
 
@@ -159,6 +160,48 @@ static inline StringRef getOrdinalSuffix(unsigned Val) {
   }
 }
 
+template <typename IteratorT>
+inline std::string join_impl(IteratorT Begin, IteratorT End,
+                             StringRef Separator, std::input_iterator_tag) {
+  std::string S;
+  if (Begin == End)
+    return S;
+
+  S += (*Begin);
+  while (++Begin != End) {
+    S += Separator;
+    S += (*Begin);
+  }
+  return S;
+}
+
+template <typename IteratorT>
+inline std::string join_impl(IteratorT Begin, IteratorT End,
+                             StringRef Separator, std::forward_iterator_tag) {
+  std::string S;
+  if (Begin == End)
+    return S;
+
+  size_t Len = (std::distance(Begin, End) - 1) * Separator.size();
+  for (IteratorT I = Begin; I != End; ++I)
+    Len += (*Begin).size();
+  S.reserve(Len);
+  S += (*Begin);
+  while (++Begin != End) {
+    S += Separator;
+    S += (*Begin);
+  }
+  return S;
+}
+
+/// Joins the strings in the range [Begin, End), adding Separator between
+/// the elements.
+template <typename IteratorT>
+inline std::string join(IteratorT Begin, IteratorT End, StringRef Separator) {
+  typedef typename std::iterator_traits<IteratorT>::iterator_category tag;
+  return join_impl(Begin, End, Separator, tag());
+}
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h
index 3e1e24c..ec0c284 100644
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h
@@ -175,7 +175,7 @@ namespace llvm {
     /// transform one of the given strings into the other. If zero,
     /// the strings are identical.
     unsigned edit_distance(StringRef Other, bool AllowReplacements = true,
-                           unsigned MaxEditDistance = 0);
+                           unsigned MaxEditDistance = 0) const;
 
     /// str - Get the contents as an std::string.
     std::string str() const {
@@ -210,12 +210,18 @@ namespace llvm {
              compareMemory(Data, Prefix.Data, Prefix.Length) == 0;
     }
 
+    /// Check if this string starts with the given \p Prefix, ignoring case.
+    bool startswith_lower(StringRef Prefix) const;
+
     /// Check if this string ends with the given \p Suffix.
     bool endswith(StringRef Suffix) const {
       return Length >= Suffix.Length &&
         compareMemory(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
     }
 
+    /// Check if this string ends with the given \p Suffix, ignoring case.
+    bool endswith_lower(StringRef Suffix) const;
+
     /// @}
     /// @name String Searching
     /// @{
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index 8d968e8..84e0b29 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -14,6 +14,7 @@
 
 // Some system headers or GCC predefined macros conflict with identifiers in
 // this file.  Undefine them here.
+#undef NetBSD
 #undef mips
 #undef sparc
 
@@ -318,7 +319,12 @@ public:
     return getOS() == Triple::Cygwin || getOS() == Triple::MinGW32;
   }
 
-  /// isOSWindows - Is this a "Windows" OS.
+  /// \brief Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
+  bool isOSMSVCRT() const {
+    return getOS() == Triple::Win32 || getOS() == Triple::MinGW32;
+  }
+
+  /// \brief Tests whether the OS is Windows.
   bool isOSWindows() const {
     return getOS() == Triple::Win32 || isOSCygMing();
   }
@@ -328,6 +334,11 @@ public:
     return getOS() == Triple::NaCl;
   }
 
+  /// \brief Tests whether the OS is Linux.
+  bool isOSLinux() const {
+    return getOS() == Triple::Linux;
+  }
+
   /// \brief Tests whether the OS uses the ELF binary format.
   bool isOSBinFormatELF() const {
     return !isOSDarwin() && !isOSWindows();
diff --git a/include/llvm/ADT/ilist.h b/include/llvm/ADT/ilist.h
index 71dab2e..6aeaa91 100644
--- a/include/llvm/ADT/ilist.h
+++ b/include/llvm/ADT/ilist.h
@@ -382,7 +382,9 @@ public:
 
   // Miscellaneous inspection routines.
   size_type max_size() const { return size_type(-1); }
-  bool empty() const { return Head == 0 || Head == getTail(); }
+  bool LLVM_ATTRIBUTE_UNUSED_RESULT empty() const {
+    return Head == 0 || Head == getTail();
+  }
 
   // Front and back accessor functions...
   reference front() {
@@ -534,7 +536,7 @@ public:
   // Functionality derived from other functions defined above...
   //
 
-  size_type size() const {
+  size_type LLVM_ATTRIBUTE_UNUSED_RESULT size() const {
     if (Head == 0) return 0; // Don't require construction of sentinel if empty.
     return std::distance(begin(), end());
   }
diff --git a/include/llvm/ADT/polymorphic_ptr.h b/include/llvm/ADT/polymorphic_ptr.h
new file mode 100644
index 0000000..b8d8d71
--- /dev/null
+++ b/include/llvm/ADT/polymorphic_ptr.h
@@ -0,0 +1,117 @@
+//===- llvm/ADT/polymorphic_ptr.h - Smart copyable owned ptr ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides a polymorphic_ptr class template. See the class comments
+/// for details about this API, its intended use cases, etc.
+///
+/// The primary motivation here is to work around the necessity of copy
+/// semantics in C++98. This is typically used where any actual copies are
+/// incidental or unnecessary. As a consequence, it is expected to cease to be
+/// useful and be removed when we can directly rely on move-only types.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_POLYMORPHIC_PTR_H
+#define LLVM_ADT_POLYMORPHIC_PTR_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+/// \brief An owning, copyable polymorphic smart pointer.
+///
+/// This pointer exists to provide copyable owned smart pointer. Rather than
+/// shared ownership semantics, it has unique ownership semantics and deep copy
+/// semantics. It is copyable by requiring that the underlying type exposes
+/// a method which can produce a (heap allocated) clone.
+///
+/// Note that in almost all scenarios use of this could be avoided if we could
+/// build move-only containers of a std::unique_ptr, but until then this
+/// provides an effective way to place polymorphic objects in a container.
+template <typename T> class polymorphic_ptr {
+  T *ptr;
+
+public:
+  polymorphic_ptr(T *ptr = 0) : ptr(ptr) {}
+  polymorphic_ptr(const polymorphic_ptr &arg) : ptr(arg ? arg->clone() : 0) {}
+#if LLVM_HAS_RVALUE_REFERENCES
+  polymorphic_ptr(polymorphic_ptr &&arg) : ptr(arg.take()) {}
+#endif
+  ~polymorphic_ptr() { delete ptr; }
+
+  polymorphic_ptr &operator=(polymorphic_ptr arg) {
+    swap(arg);
+    return *this;
+  }
+  polymorphic_ptr &operator=(T *arg) {
+    if (arg != ptr) {
+      delete ptr;
+      ptr = arg;
+    }
+    return *this;
+  }
+
+  T &operator*() const { return *ptr; }
+  T *operator->() const { return ptr; }
+  LLVM_EXPLICIT operator bool() const { return ptr != 0; }
+  bool operator!() const { return ptr == 0; }
+
+  T *get() const { return ptr; }
+
+  T *take() {
+    T *tmp = ptr;
+    ptr = 0;
+    return tmp;
+  }
+
+  void swap(polymorphic_ptr &arg) {
+    T *tmp = ptr;
+    ptr = arg.ptr;
+    arg.ptr = tmp;
+  }
+};
+
+template <typename T>
+void swap(polymorphic_ptr<T> &lhs, polymorphic_ptr<T> &rhs) {
+  lhs.swap(rhs);
+}
+
+template <typename T, typename U>
+bool operator==(const polymorphic_ptr<T> &lhs, const polymorphic_ptr<U> &rhs) {
+  return lhs.get() == rhs.get();
+}
+
+template <typename T, typename U>
+bool operator!=(const polymorphic_ptr<T> &lhs, const polymorphic_ptr<U> &rhs) {
+  return lhs.get() != rhs.get();
+}
+
+template <typename T, typename U>
+bool operator==(const polymorphic_ptr<T> &lhs, U *rhs) {
+  return lhs.get() == rhs;
+}
+
+template <typename T, typename U>
+bool operator!=(const polymorphic_ptr<T> &lhs, U *rhs) {
+  return lhs.get() != rhs;
+}
+
+template <typename T, typename U>
+bool operator==(T *lhs, const polymorphic_ptr<U> &rhs) {
+  return lhs == rhs.get();
+}
+
+template <typename T, typename U>
+bool operator!=(T *lhs, const polymorphic_ptr<U> &rhs) {
+  return lhs != rhs.get();
+}
+
+}
+
+#endif
diff --git a/include/llvm/Analysis/BlockFrequencyImpl.h b/include/llvm/Analysis/BlockFrequencyImpl.h
index 5e14d6f..817a441 100644
--- a/include/llvm/Analysis/BlockFrequencyImpl.h
+++ b/include/llvm/Analysis/BlockFrequencyImpl.h
@@ -118,7 +118,7 @@ class BlockFrequencyImpl {
 
   /// isBackedge - Return if edge Src -> Dst is a reachable backedge.
   ///
-  bool isBackedge(BlockT *Src, BlockT *Dst) {
+  bool isBackedge(BlockT *Src, BlockT *Dst) const {
     unsigned a = RPO.lookup(Src);
     if (!a)
       return false;
diff --git a/include/llvm/Analysis/BlockFrequencyInfo.h b/include/llvm/Analysis/BlockFrequencyInfo.h
index 64bd15c..a123d0b 100644
--- a/include/llvm/Analysis/BlockFrequencyInfo.h
+++ b/include/llvm/Analysis/BlockFrequencyInfo.h
@@ -41,6 +41,8 @@ public:
 
   bool runOnFunction(Function &F);
   void print(raw_ostream &O, const Module *M) const;
+  const Function *getFunction() const;
+  void view() const;
 
   /// getblockFreq - Return block frequency. Return 0 if we don't have the
   /// information. Please note that initial frequency is equal to ENTRY_FREQ. It
diff --git a/include/llvm/Analysis/CFG.h b/include/llvm/Analysis/CFG.h
index 08fdc26..e5683c8 100644
--- a/include/llvm/Analysis/CFG.h
+++ b/include/llvm/Analysis/CFG.h
@@ -49,6 +49,9 @@ unsigned GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ);
 bool isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
                     bool AllowIdenticalEdges = false);
 
+/// \brief Determine whether instruction 'To' is reachable from 'From',
+/// returning true if uncertain.
+///
 /// Determine whether there is a path from From to To within a single function.
 /// Returns false only if we can prove that once 'From' has been executed then
 /// 'To' can not be executed. Conservatively returns true.
@@ -62,7 +65,18 @@ bool isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
 /// on branchy code but not loops, and LI is most useful on code with loops but
 /// does not help on branchy code outside loops.
 bool isPotentiallyReachable(const Instruction *From, const Instruction *To,
-                            DominatorTree *DT = 0, LoopInfo *LI = 0);
+                            const DominatorTree *DT = 0,
+                            const LoopInfo *LI = 0);
+
+/// \brief Determine whether block 'To' is reachable from 'From', returning
+/// true if uncertain.
+///
+/// Determine whether there is a path from From to To within a single function.
+/// Returns false only if we can prove that once 'From' has been reached then
+/// 'To' can not be executed. Conservatively returns true.
+bool isPotentiallyReachable(const BasicBlock *From, const BasicBlock *To,
+                            const DominatorTree *DT = 0,
+                            const LoopInfo *LI = 0);
 
 } // End llvm namespace
 
diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h
index fa596c3..39e90eb 100644
--- a/include/llvm/Analysis/CFGPrinter.h
+++ b/include/llvm/Analysis/CFGPrinter.h
@@ -44,8 +44,9 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
     return OS.str();
   }
 
-  static std::string getCompleteNodeLabel(const BasicBlock *Node, 
+  static std::string getCompleteNodeLabel(const BasicBlock *Node,
                                           const Function *) {
+    enum { MaxColumns = 80 };
     std::string Str;
     raw_string_ostream OS(Str);
 
@@ -59,16 +60,32 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
     if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
 
     // Process string output to make it nicer...
-    for (unsigned i = 0; i != OutStr.length(); ++i)
+    unsigned ColNum = 0;
+    unsigned LastSpace = 0;
+    for (unsigned i = 0; i != OutStr.length(); ++i) {
       if (OutStr[i] == '\n') {                            // Left justify
         OutStr[i] = '\\';
         OutStr.insert(OutStr.begin()+i+1, 'l');
+        ColNum = 0;
+        LastSpace = 0;
       } else if (OutStr[i] == ';') {                      // Delete comments!
         unsigned Idx = OutStr.find('\n', i+1);            // Find end of line
         OutStr.erase(OutStr.begin()+i, OutStr.begin()+Idx);
         --i;
+      } else if (ColNum == MaxColumns) {                  // Wrap lines.
+        if (LastSpace) {
+          OutStr.insert(LastSpace, "\\l...");
+          ColNum = i - LastSpace;
+          LastSpace = 0;
+          i += 3; // The loop will advance 'i' again.
+        }
+        // Else keep trying to find a space.
       }
-
+      else
+        ++ColNum;
+      if (OutStr[i] == ' ')
+        LastSpace = i;
+    }
     return OutStr;
   }
 
@@ -86,20 +103,20 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
     if (const BranchInst *BI = dyn_cast<BranchInst>(Node->getTerminator()))
       if (BI->isConditional())
         return (I == succ_begin(Node)) ? "T" : "F";
-    
+
     // Label source of switch edges with the associated value.
     if (const SwitchInst *SI = dyn_cast<SwitchInst>(Node->getTerminator())) {
       unsigned SuccNo = I.getSuccessorIndex();
 
       if (SuccNo == 0) return "def";
-      
+
       std::string Str;
       raw_string_ostream OS(Str);
       SwitchInst::ConstCaseIt Case =
-          SwitchInst::ConstCaseIt::fromSuccessorIndex(SI, SuccNo); 
+          SwitchInst::ConstCaseIt::fromSuccessorIndex(SI, SuccNo);
       OS << Case.getCaseValue()->getValue();
       return OS.str();
-    }    
+    }
     return "";
   }
 };
diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h
index 591484d..d00c2ed 100644
--- a/include/llvm/Analysis/CallGraph.h
+++ b/include/llvm/Analysis/CallGraph.h
@@ -69,13 +69,36 @@ class CallGraphNode;
 //===----------------------------------------------------------------------===//
 // CallGraph class definition
 //
-class CallGraph {
-protected:
+class CallGraph : public ModulePass {
   Module *Mod;              // The module this call graph represents
 
   typedef std::map<const Function *, CallGraphNode *> FunctionMapTy;
   FunctionMapTy FunctionMap;    // Map from a function to its node
 
+  // Root is root of the call graph, or the external node if a 'main' function
+  // couldn't be found.
+  //
+  CallGraphNode *Root;
+
+  // ExternalCallingNode - This node has edges to all external functions and
+  // those internal functions that have their address taken.
+  CallGraphNode *ExternalCallingNode;
+
+  // CallsExternalNode - This node has edges to it from all functions making
+  // indirect calls or calling an external function.
+  CallGraphNode *CallsExternalNode;
+
+  /// Replace the function represented by this node by another.
+  /// This does not rescan the body of the function, so it is suitable when
+  /// splicing the body of one function to another while also updating all
+  /// callers from the old function to the new.
+  ///
+  void spliceFunction(const Function *From, const Function *To);
+
+  // Add a function to the call graph, and link the node to all of the functions
+  // that it calls.
+  void addToCallGraph(Function *F);
+
 public:
   static char ID; // Class identification, replacement for typeinfo
   //===---------------------------------------------------------------------
@@ -107,15 +130,14 @@ public:
   }
 
   /// Returns the CallGraphNode which is used to represent undetermined calls
-  /// into the callgraph.  Override this if you want behavioral inheritance.
-  virtual CallGraphNode* getExternalCallingNode() const { return 0; }
-  virtual CallGraphNode* getCallsExternalNode()   const { return 0; }
+  /// into the callgraph.
+  CallGraphNode *getExternalCallingNode() const { return ExternalCallingNode; }
+  CallGraphNode *getCallsExternalNode() const { return CallsExternalNode; }
 
   /// Return the root/main method in the module, or some other root node, such
-  /// as the externalcallingnode.  Overload these if you behavioral
-  /// inheritance.
-  virtual CallGraphNode* getRoot() { return 0; }
-  virtual const CallGraphNode* getRoot() const { return 0; }
+  /// as the externalcallingnode.
+  CallGraphNode *getRoot() { return Root; }
+  const CallGraphNode *getRoot() const { return Root; }
 
   //===---------------------------------------------------------------------
   // Functions to keep a call graph up to date with a function that has been
@@ -129,41 +151,20 @@ public:
   /// do this is to dropAllReferences before calling this.
   ///
   Function *removeFunctionFromModule(CallGraphNode *CGN);
-  Function *removeFunctionFromModule(Function *F) {
-    return removeFunctionFromModule((*this)[F]);
-  }
 
   /// getOrInsertFunction - This method is identical to calling operator[], but
   /// it will insert a new CallGraphNode for the specified function if one does
   /// not already exist.
   CallGraphNode *getOrInsertFunction(const Function *F);
 
-  /// spliceFunction - Replace the function represented by this node by another.
-  /// This does not rescan the body of the function, so it is suitable when
-  /// splicing the body of one function to another while also updating all
-  /// callers from the old function to the new.
-  ///
-  void spliceFunction(const Function *From, const Function *To);
-
-  //===---------------------------------------------------------------------
-  // Pass infrastructure interface glue code.
-  //
-protected:
-  CallGraph() {}
-
-public:
-  virtual ~CallGraph() { destroy(); }
-
-  /// initialize - Call this method before calling other methods,
-  /// re/initializes the state of the CallGraph.
-  ///
-  void initialize(Module &M);
+  CallGraph();
+  virtual ~CallGraph() { releaseMemory(); }
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  virtual bool runOnModule(Module &M);
+  virtual void releaseMemory();
 
-  void print(raw_ostream &o, Module *) const;
+  void print(raw_ostream &o, const Module *) const;
   void dump() const;
-protected:
-  // destroy - Release memory for the call graph
-  virtual void destroy();
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/Analysis/DependenceAnalysis.h b/include/llvm/Analysis/DependenceAnalysis.h
index 8fce2f6..ea8cecf 100644
--- a/include/llvm/Analysis/DependenceAnalysis.h
+++ b/include/llvm/Analysis/DependenceAnalysis.h
@@ -908,6 +908,10 @@ namespace llvm {
     /// based on the current constraint.
     void updateDirection(Dependence::DVEntry &Level,
                          const Constraint &CurConstraint) const;
+
+    bool tryDelinearize(const SCEV *SrcSCEV, const SCEV *DstSCEV,
+                        SmallVectorImpl<Subscript> &Pair) const;
+
   public:
     static char ID; // Class identification, replacement for typeinfo
     DependenceAnalysis() : FunctionPass(ID) {
diff --git a/include/llvm/Analysis/Dominators.h b/include/llvm/Analysis/Dominators.h
index 81c04bb..3aa0beb 100644
--- a/include/llvm/Analysis/Dominators.h
+++ b/include/llvm/Analysis/Dominators.h
@@ -346,6 +346,20 @@ public:
   DomTreeNodeBase<NodeT> *getRootNode() { return RootNode; }
   const DomTreeNodeBase<NodeT> *getRootNode() const { return RootNode; }
 
+  /// Get all nodes dominated by R, including R itself. Return true on success.
+  void getDescendants(NodeT *R, SmallVectorImpl<NodeT *> &Result) const {
+    const DomTreeNodeBase<NodeT> *RN = getNode(R);
+    SmallVector<const DomTreeNodeBase<NodeT> *, 8> WL;
+    WL.push_back(RN);
+    Result.clear();
+
+    while (!WL.empty()) {
+      const DomTreeNodeBase<NodeT> *N = WL.pop_back_val();
+      Result.push_back(N->getBlock());
+      WL.append(N->begin(), N->end());
+    }
+  }
+
   /// properlyDominates - Returns true iff A dominates B and A != B.
   /// Note that this is not a constant time operation!
   ///
@@ -755,6 +769,12 @@ public:
     return DT->getRootNode();
   }
 
+  /// Get all nodes dominated by R, including R itself. Return true on success.
+  void getDescendants(BasicBlock *R,
+                     SmallVectorImpl<BasicBlock *> &Result) const {
+    DT->getDescendants(R, Result);
+  }
+
   /// compare - Return false if the other dominator tree matches this
   /// dominator tree. Otherwise return true.
   inline bool compare(DominatorTree &Other) const {
diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h
index 7b3eed7..62f5aca 100644
--- a/include/llvm/Analysis/LoopInfo.h
+++ b/include/llvm/Analysis/LoopInfo.h
@@ -69,6 +69,8 @@ class LoopBase {
   // Blocks - The list of blocks in this loop.  First entry is the header node.
   std::vector<BlockT*> Blocks;
 
+  SmallPtrSet<const BlockT*, 8> DenseBlockSet;
+
   LoopBase(const LoopBase<BlockT, LoopT> &) LLVM_DELETED_FUNCTION;
   const LoopBase<BlockT, LoopT>&
     operator=(const LoopBase<BlockT, LoopT> &) LLVM_DELETED_FUNCTION;
@@ -108,7 +110,7 @@ public:
   /// contains - Return true if the specified basic block is in this loop.
   ///
   bool contains(const BlockT *BB) const {
-    return std::find(block_begin(), block_end(), BB) != block_end();
+    return DenseBlockSet.count(BB);
   }
 
   /// contains - Return true if the specified instruction is in this loop.
@@ -134,7 +136,6 @@ public:
   /// getBlocks - Get a list of the basic blocks which make up this loop.
   ///
   const std::vector<BlockT*> &getBlocks() const { return Blocks; }
-  std::vector<BlockT*> &getBlocksVector() { return Blocks; }
   typedef typename std::vector<BlockT*>::const_iterator block_iterator;
   block_iterator block_begin() const { return Blocks.begin(); }
   block_iterator block_end() const { return Blocks.end(); }
@@ -271,6 +272,17 @@ public:
   /// transformations should use addBasicBlockToLoop.
   void addBlockEntry(BlockT *BB) {
     Blocks.push_back(BB);
+    DenseBlockSet.insert(BB);
+  }
+
+  /// reverseBlocks - interface to reverse Blocks[from, end of loop] in this loop
+  void reverseBlock(unsigned from) {
+    std::reverse(Blocks.begin() + from, Blocks.end());
+  }
+
+  /// reserveBlocks- interface to do reserve() for Blocks
+  void reserveBlocks(unsigned size) {
+    Blocks.reserve(size);
   }
 
   /// moveToHeader - This method is used to move BB (which must be part of this
@@ -293,6 +305,7 @@ public:
   /// the mapping in the LoopInfo class.
   void removeBlockFromLoop(BlockT *BB) {
     RemoveFromVector(Blocks, BB);
+    DenseBlockSet.erase(BB);
   }
 
   /// verifyLoop - Verify loop structure
@@ -307,6 +320,7 @@ protected:
   friend class LoopInfoBase<BlockT, LoopT>;
   explicit LoopBase(BlockT *BB) : ParentLoop(0) {
     Blocks.push_back(BB);
+    DenseBlockSet.insert(BB);
   }
 };
 
diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h
index 5485f3c..c98cb58 100644
--- a/include/llvm/Analysis/LoopInfoImpl.h
+++ b/include/llvm/Analysis/LoopInfoImpl.h
@@ -31,17 +31,12 @@ namespace llvm {
 template<class BlockT, class LoopT>
 void LoopBase<BlockT, LoopT>::
 getExitingBlocks(SmallVectorImpl<BlockT *> &ExitingBlocks) const {
-  // Sort the blocks vector so that we can use binary search to do quick
-  // lookups.
-  SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
-  std::sort(LoopBBs.begin(), LoopBBs.end());
-
   typedef GraphTraits<BlockT*> BlockTraits;
   for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI)
     for (typename BlockTraits::ChildIteratorType I =
            BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
          I != E; ++I)
-      if (!std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I)) {
+      if (!contains(*I)) {
         // Not in current loop? It must be an exit block.
         ExitingBlocks.push_back(*BI);
         break;
@@ -65,17 +60,12 @@ BlockT *LoopBase<BlockT, LoopT>::getExitingBlock() const {
 template<class BlockT, class LoopT>
 void LoopBase<BlockT, LoopT>::
 getExitBlocks(SmallVectorImpl<BlockT*> &ExitBlocks) const {
-  // Sort the blocks vector so that we can use binary search to do quick
-  // lookups.
-  SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
-  std::sort(LoopBBs.begin(), LoopBBs.end());
-
   typedef GraphTraits<BlockT*> BlockTraits;
   for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI)
     for (typename BlockTraits::ChildIteratorType I =
            BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
          I != E; ++I)
-      if (!std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I))
+      if (!contains(*I))
         // Not in current loop? It must be an exit block.
         ExitBlocks.push_back(*I);
 }
@@ -95,17 +85,12 @@ BlockT *LoopBase<BlockT, LoopT>::getExitBlock() const {
 template<class BlockT, class LoopT>
 void LoopBase<BlockT, LoopT>::
 getExitEdges(SmallVectorImpl<Edge> &ExitEdges) const {
-  // Sort the blocks vector so that we can use binary search to do quick
-  // lookups.
-  SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
-  array_pod_sort(LoopBBs.begin(), LoopBBs.end());
-
   typedef GraphTraits<BlockT*> BlockTraits;
   for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI)
     for (typename BlockTraits::ChildIteratorType I =
            BlockTraits::child_begin(*BI), E = BlockTraits::child_end(*BI);
          I != E; ++I)
-      if (!std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I))
+      if (!contains(*I))
         // Not in current loop? It must be an exit block.
         ExitEdges.push_back(Edge(*BI, *I));
 }
@@ -210,7 +195,7 @@ addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase<BlockT, LoopT> &LIB) {
 
   // Add the basic block to this loop and all parent loops...
   while (L) {
-    L->Blocks.push_back(NewBB);
+    L->addBlockEntry(NewBB);
     L = L->getParentLoop();
   }
 }
@@ -250,11 +235,6 @@ void LoopBase<BlockT, LoopT>::verifyLoop() const {
   // Keep track of the number of BBs visited.
   unsigned NumVisited = 0;
 
-  // Sort the blocks vector so that we can use binary search to do quick
-  // lookups.
-  SmallVector<BlockT*, 128> LoopBBs(block_begin(), block_end());
-  std::sort(LoopBBs.begin(), LoopBBs.end());
-
   // Check the individual blocks.
   for ( ; BI != BE; ++BI) {
     BlockT *BB = *BI;
@@ -266,7 +246,7 @@ void LoopBase<BlockT, LoopT>::verifyLoop() const {
     for (typename BlockTraits::ChildIteratorType SI =
            BlockTraits::child_begin(BB), SE = BlockTraits::child_end(BB);
          SI != SE; ++SI)
-      if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), *SI)) {
+      if (contains(*SI)) {
         HasInsideLoopSuccs = true;
         break;
       }
@@ -275,7 +255,7 @@ void LoopBase<BlockT, LoopT>::verifyLoop() const {
            InvBlockTraits::child_begin(BB), PE = InvBlockTraits::child_end(BB);
          PI != PE; ++PI) {
       BlockT *N = *PI;
-      if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), N))
+      if (contains(N))
         HasInsideLoopPreds = true;
       else
         OutsideLoopPreds.push_back(N);
@@ -309,7 +289,7 @@ void LoopBase<BlockT, LoopT>::verifyLoop() const {
     // Each block in each subloop should be contained within this loop.
     for (block_iterator BI = (*I)->block_begin(), BE = (*I)->block_end();
          BI != BE; ++BI) {
-        assert(std::binary_search(LoopBBs.begin(), LoopBBs.end(), *BI) &&
+        assert(contains(*BI) &&
                "Loop does not contain all the blocks of a subloop!");
     }
 
@@ -418,7 +398,7 @@ static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT*> Backedges,
     }
   }
   L->getSubLoopsVector().reserve(NumSubloops);
-  L->getBlocksVector().reserve(NumBlocks);
+  L->reserveBlocks(NumBlocks);
 }
 
 namespace {
@@ -489,15 +469,14 @@ void PopulateLoopsDFS<BlockT, LoopT>::insertIntoLoop(BlockT *Block) {
 
     // For convenience, Blocks and Subloops are inserted in postorder. Reverse
     // the lists, except for the loop header, which is always at the beginning.
-    std::reverse(Subloop->getBlocksVector().begin()+1,
-                 Subloop->getBlocksVector().end());
+    Subloop->reverseBlock(1);
     std::reverse(Subloop->getSubLoopsVector().begin(),
                  Subloop->getSubLoopsVector().end());
 
     Subloop = Subloop->getParentLoop();
   }
   for (; Subloop; Subloop = Subloop->getParentLoop())
-    Subloop->getBlocksVector().push_back(Block);
+    Subloop->addBlockEntry(Block);
 }
 
 /// Analyze LoopInfo discovers loops during a postorder DominatorTree traversal
diff --git a/include/llvm/Analysis/LoopPass.h b/include/llvm/Analysis/LoopPass.h
index 5767c19..5926610 100644
--- a/include/llvm/Analysis/LoopPass.h
+++ b/include/llvm/Analysis/LoopPass.h
@@ -16,8 +16,8 @@
 #define LLVM_ANALYSIS_LOOPPASS_H
 
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/LegacyPassManagers.h"
 #include "llvm/Pass.h"
-#include "llvm/PassManagers.h"
 #include <deque>
 
 namespace llvm {
diff --git a/include/llvm/Analysis/MemoryBuiltins.h b/include/llvm/Analysis/MemoryBuiltins.h
index 0d1ae8c..91224ad 100644
--- a/include/llvm/Analysis/MemoryBuiltins.h
+++ b/include/llvm/Analysis/MemoryBuiltins.h
@@ -64,6 +64,10 @@ bool isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
 bool isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
                      bool LookThroughBitCast = false);
 
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates memory and never returns null (such as operator new).
+bool isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+                         bool LookThroughBitCast = false);
 
 //===----------------------------------------------------------------------===//
 //  malloc Call Utility Functions.
@@ -81,7 +85,7 @@ static inline CallInst *extractMallocCall(Value *I,
 /// isArrayMalloc - Returns the corresponding CallInst if the instruction
 /// is a call to malloc whose array size can be determined and the array size
 /// is not constant 1.  Otherwise, return NULL.
-const CallInst *isArrayMalloc(const Value *I, const DataLayout *TD,
+const CallInst *isArrayMalloc(const Value *I, const DataLayout *DL,
                               const TargetLibraryInfo *TLI);
 
 /// getMallocType - Returns the PointerType resulting from the malloc call.
@@ -103,7 +107,7 @@ Type *getMallocAllocatedType(const CallInst *CI, const TargetLibraryInfo *TLI);
 /// then return that multiple.  For non-array mallocs, the multiple is
 /// constant 1.  Otherwise, return NULL for mallocs whose array size cannot be
 /// determined.
-Value *getMallocArraySize(CallInst *CI, const DataLayout *TD,
+Value *getMallocArraySize(CallInst *CI, const DataLayout *DL,
                           const TargetLibraryInfo *TLI,
                           bool LookThroughSExt = false);
 
@@ -143,7 +147,7 @@ static inline CallInst *isFreeCall(Value *I, const TargetLibraryInfo *TLI) {
 /// underlying object pointed to by Ptr.
 /// If RoundToAlign is true, then Size is rounded up to the aligment of allocas,
 /// byval arguments, and global variables.
-bool getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *TD,
+bool getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *DL,
                    const TargetLibraryInfo *TLI, bool RoundToAlign = false);
 
 
@@ -155,7 +159,7 @@ typedef std::pair<APInt, APInt> SizeOffsetType;
 class ObjectSizeOffsetVisitor
   : public InstVisitor<ObjectSizeOffsetVisitor, SizeOffsetType> {
 
-  const DataLayout *TD;
+  const DataLayout *DL;
   const TargetLibraryInfo *TLI;
   bool RoundToAlign;
   unsigned IntTyBits;
@@ -169,7 +173,7 @@ class ObjectSizeOffsetVisitor
   }
 
 public:
-  ObjectSizeOffsetVisitor(const DataLayout *TD, const TargetLibraryInfo *TLI,
+  ObjectSizeOffsetVisitor(const DataLayout *DL, const TargetLibraryInfo *TLI,
                           LLVMContext &Context, bool RoundToAlign = false);
 
   SizeOffsetType compute(Value *V);
@@ -216,7 +220,7 @@ class ObjectSizeOffsetEvaluator
   typedef DenseMap<const Value*, WeakEvalType> CacheMapTy;
   typedef SmallPtrSet<const Value*, 8> PtrSetTy;
 
-  const DataLayout *TD;
+  const DataLayout *DL;
   const TargetLibraryInfo *TLI;
   LLVMContext &Context;
   BuilderTy Builder;
@@ -224,6 +228,7 @@ class ObjectSizeOffsetEvaluator
   Value *Zero;
   CacheMapTy CacheMap;
   PtrSetTy SeenVals;
+  bool RoundToAlign;
 
   SizeOffsetEvalType unknown() {
     return std::make_pair((Value*)0, (Value*)0);
@@ -231,8 +236,8 @@ class ObjectSizeOffsetEvaluator
   SizeOffsetEvalType compute_(Value *V);
 
 public:
-  ObjectSizeOffsetEvaluator(const DataLayout *TD, const TargetLibraryInfo *TLI,
-                            LLVMContext &Context);
+  ObjectSizeOffsetEvaluator(const DataLayout *DL, const TargetLibraryInfo *TLI,
+                            LLVMContext &Context, bool RoundToAlign = false);
   SizeOffsetEvalType compute(Value *V);
 
   bool knownSize(SizeOffsetEvalType SizeOffset) {
diff --git a/include/llvm/Analysis/Passes.h b/include/llvm/Analysis/Passes.h
index ae11713..a5d098e 100644
--- a/include/llvm/Analysis/Passes.h
+++ b/include/llvm/Analysis/Passes.h
@@ -95,64 +95,6 @@ namespace llvm {
 
   //===--------------------------------------------------------------------===//
   //
-  // createProfileLoaderPass - This pass loads information from a profile dump
-  // file.
-  //
-  ModulePass *createProfileLoaderPass();
-  extern char &ProfileLoaderPassID;
-
-  //===--------------------------------------------------------------------===//
-  //
-  // createProfileMetadataLoaderPass - This pass loads information from a
-  // profile dump file and sets branch weight metadata.
-  //
-  ModulePass *createProfileMetadataLoaderPass();
-  extern char &ProfileMetadataLoaderPassID;
-
-  //===--------------------------------------------------------------------===//
-  //
-  // createNoProfileInfoPass - This pass implements the default "no profile".
-  //
-  ImmutablePass *createNoProfileInfoPass();
-
-  //===--------------------------------------------------------------------===//
-  //
-  // createProfileEstimatorPass - This pass estimates profiling information
-  // instead of loading it from a previous run.
-  //
-  FunctionPass *createProfileEstimatorPass();
-  extern char &ProfileEstimatorPassID;
-
-  //===--------------------------------------------------------------------===//
-  //
-  // createProfileVerifierPass - This pass verifies profiling information.
-  //
-  FunctionPass *createProfileVerifierPass();
-
-  //===--------------------------------------------------------------------===//
-  //
-  // createPathProfileLoaderPass - This pass loads information from a path
-  // profile dump file.
-  //
-  ModulePass *createPathProfileLoaderPass();
-  extern char &PathProfileLoaderPassID;
-
-  //===--------------------------------------------------------------------===//
-  //
-  // createNoPathProfileInfoPass - This pass implements the default
-  // "no path profile".
-  //
-  ImmutablePass *createNoPathProfileInfoPass();
-
-  //===--------------------------------------------------------------------===//
-  //
-  // createPathProfileVerifierPass - This pass verifies path profiling
-  // information.
-  //
-  ModulePass *createPathProfileVerifierPass();
-
-  //===--------------------------------------------------------------------===//
-  //
   // createDSAAPass - This pass implements simple context sensitive alias
   // analysis.
   //
@@ -194,6 +136,13 @@ namespace llvm {
 
   //===--------------------------------------------------------------------===//
   //
+  // createDelinearizationPass - This pass implements attempts to restore
+  // multidimensional array indices from linearized expressions.
+  //
+  FunctionPass *createDelinearizationPass();
+
+  //===--------------------------------------------------------------------===//
+  //
   // Minor pass prototypes, allowing us to expose them through bugpoint and
   // analyze.
   FunctionPass *createInstCountPass();
diff --git a/include/llvm/Analysis/PathNumbering.h b/include/llvm/Analysis/PathNumbering.h
deleted file mode 100644
index 400a37d..0000000
--- a/include/llvm/Analysis/PathNumbering.h
+++ /dev/null
@@ -1,304 +0,0 @@
-//===- PathNumbering.h ----------------------------------------*- C++ -*---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Ball-Larus path numbers uniquely identify paths through a directed acyclic
-// graph (DAG) [Ball96].  For a CFG backedges are removed and replaced by phony
-// edges to obtain a DAG, and thus the unique path numbers [Ball96].
-//
-// The purpose of this analysis is to enumerate the edges in a CFG in order
-// to obtain paths from path numbers in a convenient manner.  As described in
-// [Ball96] edges can be enumerated such that given a path number by following
-// the CFG and updating the path number, the path is obtained.
-//
-// [Ball96]
-//  T. Ball and J. R. Larus. "Efficient Path Profiling."
-//  International Symposium on Microarchitecture, pages 46-57, 1996.
-//  http://portal.acm.org/citation.cfm?id=243857
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_PATHNUMBERING_H
-#define LLVM_ANALYSIS_PATHNUMBERING_H
-
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include <map>
-#include <stack>
-#include <vector>
-
-namespace llvm {
-class BallLarusNode;
-class BallLarusEdge;
-class BallLarusDag;
-
-// typedefs for storage/ interators of various DAG components
-typedef std::vector<BallLarusNode*> BLNodeVector;
-typedef std::vector<BallLarusNode*>::iterator BLNodeIterator;
-typedef std::vector<BallLarusEdge*> BLEdgeVector;
-typedef std::vector<BallLarusEdge*>::iterator BLEdgeIterator;
-typedef std::map<BasicBlock*, BallLarusNode*> BLBlockNodeMap;
-typedef std::stack<BallLarusNode*> BLNodeStack;
-
-// Represents a basic block with information necessary for the BallLarus
-// algorithms.
-class BallLarusNode {
-public:
-  enum NodeColor { WHITE, GRAY, BLACK };
-
-  // Constructor: Initializes a new Node for the given BasicBlock
-  BallLarusNode(BasicBlock* BB) :
-    _basicBlock(BB), _numberPaths(0), _color(WHITE) {
-    static unsigned nextUID = 0;
-    _uid = nextUID++;
-  }
-
-  // Returns the basic block for the BallLarusNode
-  BasicBlock* getBlock();
-
-  // Get/set the number of paths to the exit starting at the node.
-  unsigned getNumberPaths();
-  void setNumberPaths(unsigned numberPaths);
-
-  // Get/set the NodeColor used in graph algorithms.
-  NodeColor getColor();
-  void setColor(NodeColor color);
-
-  // Iterator information for predecessor edges. Includes phony and
-  // backedges.
-  BLEdgeIterator predBegin();
-  BLEdgeIterator predEnd();
-  unsigned getNumberPredEdges();
-
-  // Iterator information for successor edges. Includes phony and
-  // backedges.
-  BLEdgeIterator succBegin();
-  BLEdgeIterator succEnd();
-  unsigned getNumberSuccEdges();
-
-  // Add an edge to the predecessor list.
-  void addPredEdge(BallLarusEdge* edge);
-
-  // Remove an edge from the predecessor list.
-  void removePredEdge(BallLarusEdge* edge);
-
-  // Add an edge to the successor list.
-  void addSuccEdge(BallLarusEdge* edge);
-
-  // Remove an edge from the successor list.
-  void removeSuccEdge(BallLarusEdge* edge);
-
-  // Returns the name of the BasicBlock being represented.  If BasicBlock
-  // is null then returns "<null>".  If BasicBlock has no name, then
-  // "<unnamed>" is returned.  Intended for use with debug output.
-  std::string getName();
-
-private:
-  // The corresponding underlying BB.
-  BasicBlock* _basicBlock;
-
-  // Holds the predecessor edges of this node.
-  BLEdgeVector _predEdges;
-
-  // Holds the successor edges of this node.
-  BLEdgeVector _succEdges;
-
-  // The number of paths from the node to the exit.
-  unsigned _numberPaths;
-
-  // 'Color' used by graph algorithms to mark the node.
-  NodeColor _color;
-
-  // Unique ID to ensure naming difference with dotgraphs
-  unsigned _uid;
-
-  // Removes an edge from an edgeVector.  Used by removePredEdge and
-  // removeSuccEdge.
-  void removeEdge(BLEdgeVector& v, BallLarusEdge* e);
-};
-
-// Represents an edge in the Dag.  For an edge, v -> w, v is the source, and
-// w is the target.
-class BallLarusEdge {
-public:
-  enum EdgeType { NORMAL, BACKEDGE, SPLITEDGE,
-    BACKEDGE_PHONY, SPLITEDGE_PHONY, CALLEDGE_PHONY };
-
-  // Constructor: Initializes an BallLarusEdge with a source and target.
-  BallLarusEdge(BallLarusNode* source, BallLarusNode* target,
-                                unsigned duplicateNumber)
-    : _source(source), _target(target), _weight(0), _edgeType(NORMAL),
-      _realEdge(NULL), _duplicateNumber(duplicateNumber) {}
-
-  // Returns the source/ target node of this edge.
-  BallLarusNode* getSource() const;
-  BallLarusNode* getTarget() const;
-
-  // Sets the type of the edge.
-  EdgeType getType() const;
-
-  // Gets the type of the edge.
-  void setType(EdgeType type);
-
-  // Returns the weight of this edge.  Used to decode path numbers to
-  // sequences of basic blocks.
-  unsigned getWeight();
-
-  // Sets the weight of the edge.  Used during path numbering.
-  void setWeight(unsigned weight);
-
-  // Gets/sets the phony edge originating at the root.
-  BallLarusEdge* getPhonyRoot();
-  void setPhonyRoot(BallLarusEdge* phonyRoot);
-
-  // Gets/sets the phony edge terminating at the exit.
-  BallLarusEdge* getPhonyExit();
-  void setPhonyExit(BallLarusEdge* phonyExit);
-
-  // Gets/sets the associated real edge if this is a phony edge.
-  BallLarusEdge* getRealEdge();
-  void setRealEdge(BallLarusEdge* realEdge);
-
-  // Returns the duplicate number of the edge.
-  unsigned getDuplicateNumber();
-
-protected:
-  // Source node for this edge.
-  BallLarusNode* _source;
-
-  // Target node for this edge.
-  BallLarusNode* _target;
-
-private:
-  // Edge weight cooresponding to path number increments before removing
-  // increments along a spanning tree. The sum over the edge weights gives
-  // the path number.
-  unsigned _weight;
-
-  // Type to represent for what this edge is intended
-  EdgeType _edgeType;
-
-  // For backedges and split-edges, the phony edge which is linked to the
-  // root node of the DAG. This contains a path number initialization.
-  BallLarusEdge* _phonyRoot;
-
-  // For backedges and split-edges, the phony edge which is linked to the
-  // exit node of the DAG. This contains a path counter increment, and
-  // potentially a path number increment.
-  BallLarusEdge* _phonyExit;
-
-  // If this is a phony edge, _realEdge is a link to the back or split
-  // edge. Otherwise, this is null.
-  BallLarusEdge* _realEdge;
-
-  // An ID to differentiate between those edges which have the same source
-  // and destination blocks.
-  unsigned _duplicateNumber;
-};
-
-// Represents the Ball Larus DAG for a given Function.  Can calculate
-// various properties required for instrumentation or analysis.  E.g. the
-// edge weights that determine the path number.
-class BallLarusDag {
-public:
-  // Initializes a BallLarusDag from the CFG of a given function.  Must
-  // call init() after creation, since some initialization requires
-  // virtual functions.
-  BallLarusDag(Function &F)
-    : _root(NULL), _exit(NULL), _function(F) {}
-
-  // Initialization that requires virtual functions which are not fully
-  // functional in the constructor.
-  void init();
-
-  // Frees all memory associated with the DAG.
-  virtual ~BallLarusDag();
-
-  // Calculate the path numbers by assigning edge increments as prescribed
-  // in Ball-Larus path profiling.
-  void calculatePathNumbers();
-
-  // Returns the number of paths for the DAG.
-  unsigned getNumberOfPaths();
-
-  // Returns the root (i.e. entry) node for the DAG.
-  BallLarusNode* getRoot();
-
-  // Returns the exit node for the DAG.
-  BallLarusNode* getExit();
-
-  // Returns the function for the DAG.
-  Function& getFunction();
-
-  // Clears the node colors.
-  void clearColors(BallLarusNode::NodeColor color);
-
-protected:
-  // All nodes in the DAG.
-  BLNodeVector _nodes;
-
-  // All edges in the DAG.
-  BLEdgeVector _edges;
-
-  // All backedges in the DAG.
-  BLEdgeVector _backEdges;
-
-  // Allows subclasses to determine which type of Node is created.
-  // Override this method to produce subclasses of BallLarusNode if
-  // necessary. The destructor of BallLarusDag will call free on each pointer
-  // created.
-  virtual BallLarusNode* createNode(BasicBlock* BB);
-
-  // Allows subclasses to determine which type of Edge is created.
-  // Override this method to produce subclasses of BallLarusEdge if
-  // necessary.  Parameters source and target will have been created by
-  // createNode and can be cast to the subclass of BallLarusNode*
-  // returned by createNode. The destructor of BallLarusDag will call free
-  // on each pointer created.
-  virtual BallLarusEdge* createEdge(BallLarusNode* source, BallLarusNode*
-                                    target, unsigned duplicateNumber);
-
-  // Proxy to node's constructor.  Updates the DAG state.
-  BallLarusNode* addNode(BasicBlock* BB);
-
-  // Proxy to edge's constructor.  Updates the DAG state.
-  BallLarusEdge* addEdge(BallLarusNode* source, BallLarusNode* target,
-                         unsigned duplicateNumber);
-
-private:
-  // The root (i.e. entry) node for this DAG.
-  BallLarusNode* _root;
-
-  // The exit node for this DAG.
-  BallLarusNode* _exit;
-
-  // The function represented by this DAG.
-  Function& _function;
-
-  // Processes one node and its imediate edges for building the DAG.
-  void buildNode(BLBlockNodeMap& inDag, std::stack<BallLarusNode*>& dfsStack);
-
-  // Process an edge in the CFG for DAG building.
-  void buildEdge(BLBlockNodeMap& inDag, std::stack<BallLarusNode*>& dfsStack,
-                 BallLarusNode* currentNode, BasicBlock* succBB,
-                 unsigned duplicateNumber);
-
-  // The weight on each edge is the increment required along any path that
-  // contains that edge.
-  void calculatePathNumbersFrom(BallLarusNode* node);
-
-  // Adds a backedge with its phony edges.  Updates the DAG state.
-  void addBackedge(BallLarusNode* source, BallLarusNode* target,
-                   unsigned duplicateCount);
-};
-} // end namespace llvm
-
-#endif
diff --git a/include/llvm/Analysis/PathProfileInfo.h b/include/llvm/Analysis/PathProfileInfo.h
deleted file mode 100644
index 4fce16e..0000000
--- a/include/llvm/Analysis/PathProfileInfo.h
+++ /dev/null
@@ -1,112 +0,0 @@
-//===- PathProfileInfo.h --------------------------------------*- C++ -*---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file outlines the interface used by optimizers to load path profiles.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_PATHPROFILEINFO_H
-#define LLVM_ANALYSIS_PATHPROFILEINFO_H
-
-#include "llvm/Analysis/PathNumbering.h"
-#include "llvm/IR/BasicBlock.h"
-
-namespace llvm {
-
-class ProfilePath;
-class ProfilePathEdge;
-class PathProfileInfo;
-
-typedef std::vector<ProfilePathEdge> ProfilePathEdgeVector;
-typedef std::vector<ProfilePathEdge>::iterator ProfilePathEdgeIterator;
-
-typedef std::vector<BasicBlock*> ProfilePathBlockVector;
-typedef std::vector<BasicBlock*>::iterator ProfilePathBlockIterator;
-
-typedef std::map<unsigned int,ProfilePath*> ProfilePathMap;
-typedef std::map<unsigned int,ProfilePath*>::iterator ProfilePathIterator;
-
-typedef std::map<Function*,unsigned int> FunctionPathCountMap;
-typedef std::map<Function*,ProfilePathMap> FunctionPathMap;
-typedef std::map<Function*,ProfilePathMap>::iterator FunctionPathIterator;
-
-class ProfilePathEdge {
-public:
-  ProfilePathEdge(BasicBlock* source, BasicBlock* target,
-                  unsigned duplicateNumber);
-
-  inline unsigned getDuplicateNumber() { return _duplicateNumber; }
-  inline BasicBlock* getSource() { return _source; }
-  inline BasicBlock* getTarget() { return _target; }
-
-protected:
-  BasicBlock* _source;
-  BasicBlock* _target;
-  unsigned _duplicateNumber;
-};
-
-class ProfilePath {
-public:
-  ProfilePath(unsigned int number, unsigned int count,
-              double countStdDev, PathProfileInfo* ppi);
-
-  double getFrequency() const;
-
-  inline unsigned int getNumber() const { return _number; }
-  inline unsigned int getCount() const { return _count; }
-  inline double getCountStdDev() const { return _countStdDev; }
-
-  ProfilePathEdgeVector* getPathEdges() const;
-  ProfilePathBlockVector* getPathBlocks() const;
-
-  BasicBlock* getFirstBlockInPath() const;
-
-private:
-  unsigned int _number;
-  unsigned int _count;
-  double _countStdDev;
-
-  // double pointer back to the profiling info
-  PathProfileInfo* _ppi;
-};
-
-// TODO: overload [] operator for getting path
-// Add: getFunctionCallCount()
-class PathProfileInfo {
-  public:
-  PathProfileInfo();
-  ~PathProfileInfo();
-
-  void setCurrentFunction(Function* F);
-  Function* getCurrentFunction() const;
-  BasicBlock* getCurrentFunctionEntry();
-
-  ProfilePath* getPath(unsigned int number);
-  unsigned int getPotentialPathCount();
-
-  ProfilePathIterator pathBegin();
-  ProfilePathIterator pathEnd();
-  unsigned int pathsRun();
-
-  static char ID; // Pass identification
-  std::string argList;
-
-protected:
-  FunctionPathMap _functionPaths;
-  FunctionPathCountMap _functionPathCounts;
-
-private:
-  BallLarusDag* _currentDag;
-  Function* _currentFunction;
-
-  friend class ProfilePath;
-};
-} // end namespace llvm
-
-#endif
diff --git a/include/llvm/Analysis/ProfileDataLoader.h b/include/llvm/Analysis/ProfileDataLoader.h
deleted file mode 100644
index 90097f7..0000000
--- a/include/llvm/Analysis/ProfileDataLoader.h
+++ /dev/null
@@ -1,140 +0,0 @@
-//===- ProfileDataLoader.h - Load & convert profile info ----*- C++ -*-===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The ProfileDataLoader class is used to load profiling data from a dump file.
-// The ProfileDataT<FType, BType> class is used to store the mapping of this
-// data to control flow edges.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_PROFILEDATALOADER_H
-#define LLVM_ANALYSIS_PROFILEDATALOADER_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <string>
-
-namespace llvm {
-
-class ModulePass;
-class Function;
-class BasicBlock;
-
-// Helper for dumping edges to dbgs().
-raw_ostream& operator<<(raw_ostream &O, std::pair<const BasicBlock *,
-                                                  const BasicBlock *> E);
-
-/// \brief The ProfileDataT<FType, BType> class is used to store the mapping of
-/// profiling data to control flow edges.
-///
-/// An edge is defined by its source and sink basic blocks.
-template<class FType, class BType>
-class ProfileDataT {
-public:
-  // The profiling information defines an Edge by its source and sink basic
-  // blocks.
-  typedef std::pair<const BType*, const BType*> Edge;
-
-private:
-  typedef DenseMap<Edge, unsigned> EdgeWeights;
-
-  /// \brief Count the number of times a transition between two blocks is
-  /// executed.
-  ///
-  /// As a special case, we also hold an edge from the null BasicBlock to the
-  /// entry block to indicate how many times the function was entered.
-  DenseMap<const FType*, EdgeWeights> EdgeInformation;
-
-public:
-  /// getFunction() - Returns the Function for an Edge.
-  static const FType *getFunction(Edge e) {
-    // e.first may be NULL
-    assert(((!e.first) || (e.first->getParent() == e.second->getParent()))
-           && "A ProfileData::Edge can not be between two functions");
-    assert(e.second && "A ProfileData::Edge must have a real sink");
-    return e.second->getParent();
-  }
-
-  /// getEdge() - Creates an Edge between two BasicBlocks.
-  static Edge getEdge(const BType *Src, const BType *Dest) {
-    return Edge(Src, Dest);
-  }
-
-  /// getEdgeWeight - Return the number of times that a given edge was
-  /// executed.
-  unsigned getEdgeWeight(Edge e) const {
-    const FType *f = getFunction(e);
-    assert((EdgeInformation.find(f) != EdgeInformation.end())
-           && "No profiling information for function");
-    EdgeWeights weights = EdgeInformation.find(f)->second;
-
-    assert((weights.find(e) != weights.end())
-           && "No profiling information for edge");
-    return weights.find(e)->second;
-  }
-
-  /// addEdgeWeight - Add 'weight' to the already stored execution count for
-  /// this edge.
-  void addEdgeWeight(Edge e, unsigned weight) {
-    EdgeInformation[getFunction(e)][e] += weight;
-  }
-};
-
-typedef ProfileDataT<Function, BasicBlock> ProfileData;
-//typedef ProfileDataT<MachineFunction, MachineBasicBlock> MachineProfileData;
-
-/// The ProfileDataLoader class is used to load raw profiling data from the
-/// dump file.
-class ProfileDataLoader {
-private:
-  /// The name of the file where the raw profiling data is stored.
-  const std::string &Filename;
-
-  /// A vector of the command line arguments used when the target program was
-  /// run to generate profiling data.  One entry per program run.
-  SmallVector<std::string, 1> CommandLines;
-
-  /// The raw values for how many times each edge was traversed, values from
-  /// multiple program runs are accumulated.
-  SmallVector<unsigned, 32> EdgeCounts;
-
-public:
-  /// ProfileDataLoader ctor - Read the specified profiling data file, exiting
-  /// the program if the file is invalid or broken.
-  ProfileDataLoader(const char *ToolName, const std::string &Filename);
-
-  /// A special value used to represent the weight of an edge which has not
-  /// been counted yet.
-  static const unsigned Uncounted;
-
-  /// getNumExecutions - Return the number of times the target program was run
-  /// to generate this profiling data.
-  unsigned getNumExecutions() const { return CommandLines.size(); }
-
-  /// getExecution - Return the command line parameters used to generate the
-  /// i'th set of profiling data.
-  const std::string &getExecution(unsigned i) const { return CommandLines[i]; }
-
-  const std::string &getFileName() const { return Filename; }
-
-  /// getRawEdgeCounts - Return the raw profiling data, this is just a list of
-  /// numbers with no mappings to edges.
-  ArrayRef<unsigned> getRawEdgeCounts() const { return EdgeCounts; }
-};
-
-/// createProfileMetadataLoaderPass - This function returns a Pass that loads
-/// the profiling information for the module from the specified filename.
-ModulePass *createProfileMetadataLoaderPass(const std::string &Filename);
-
-} // End llvm namespace
-
-#endif
diff --git a/include/llvm/Analysis/ProfileDataTypes.h b/include/llvm/Analysis/ProfileDataTypes.h
deleted file mode 100644
index 1be15e0..0000000
--- a/include/llvm/Analysis/ProfileDataTypes.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*===-- ProfileDataTypes.h - Profiling info shared constants --------------===*\
-|*
-|*                     The LLVM Compiler Infrastructure
-|*
-|* This file is distributed under the University of Illinois Open Source
-|* License. See LICENSE.TXT for details.
-|*
-|*===----------------------------------------------------------------------===*|
-|*
-|* This file defines constants shared by the various different profiling
-|* runtime libraries and the LLVM C++ profile metadata loader. It must be a
-|* C header because, at present, the profiling runtimes are written in C.
-|*
-\*===----------------------------------------------------------------------===*/
-
-#ifndef LLVM_ANALYSIS_PROFILEDATATYPES_H
-#define LLVM_ANALYSIS_PROFILEDATATYPES_H
-
-/* Included by libprofile. */
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/* TODO: Strip out unused entries once ProfileInfo etc has been removed. */
-enum ProfilingType {
-  ArgumentInfo  = 1,   /* The command line argument block */
-  FunctionInfo  = 2,   /* Function profiling information  */
-  BlockInfo     = 3,   /* Block profiling information     */
-  EdgeInfo      = 4,   /* Edge profiling information      */
-  PathInfo      = 5,   /* Path profiling information      */
-  BBTraceInfo   = 6,   /* Basic block trace information   */
-  OptEdgeInfo   = 7    /* Edge profiling information, optimal version */
-};
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* LLVM_ANALYSIS_PROFILEDATATYPES_H */
diff --git a/include/llvm/Analysis/ProfileInfo.h b/include/llvm/Analysis/ProfileInfo.h
deleted file mode 100644
index 5d17fa1..0000000
--- a/include/llvm/Analysis/ProfileInfo.h
+++ /dev/null
@@ -1,247 +0,0 @@
-//===- llvm/Analysis/ProfileInfo.h - Profile Info Interface -----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the generic ProfileInfo interface, which is used as the
-// common interface used by all clients of profiling information, and
-// implemented either by making static guestimations, or by actually reading in
-// profiling information gathered by running the program.
-//
-// Note that to be useful, all profile-based optimizations should preserve
-// ProfileInfo, which requires that they notify it when changes to the CFG are
-// made. (This is not implemented yet.)
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_PROFILEINFO_H
-#define LLVM_ANALYSIS_PROFILEINFO_H
-
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <map>
-#include <set>
-#include <string>
-
-namespace llvm {
-  class Pass;
-  class raw_ostream;
-
-  class BasicBlock;
-  class Function;
-  class MachineBasicBlock;
-  class MachineFunction;
-
-  // Helper for dumping edges to dbgs().
-  raw_ostream& operator<<(raw_ostream &O, std::pair<const BasicBlock *, const BasicBlock *> E);
-  raw_ostream& operator<<(raw_ostream &O, std::pair<const MachineBasicBlock *, const MachineBasicBlock *> E);
-
-  raw_ostream& operator<<(raw_ostream &O, const BasicBlock *BB);
-  raw_ostream& operator<<(raw_ostream &O, const MachineBasicBlock *MBB);
-
-  raw_ostream& operator<<(raw_ostream &O, const Function *F);
-  raw_ostream& operator<<(raw_ostream &O, const MachineFunction *MF);
-
-  /// ProfileInfo Class - This class holds and maintains profiling
-  /// information for some unit of code.
-  template<class FType, class BType>
-  class ProfileInfoT {
-  public:
-    // Types for handling profiling information.
-    typedef std::pair<const BType*, const BType*> Edge;
-    typedef std::pair<Edge, double> EdgeWeight;
-    typedef std::map<Edge, double> EdgeWeights;
-    typedef std::map<const BType*, double> BlockCounts;
-    typedef std::map<const BType*, const BType*> Path;
-
-  protected:
-    // EdgeInformation - Count the number of times a transition between two
-    // blocks is executed. As a special case, we also hold an edge from the
-    // null BasicBlock to the entry block to indicate how many times the
-    // function was entered.
-    std::map<const FType*, EdgeWeights> EdgeInformation;
-
-    // BlockInformation - Count the number of times a block is executed.
-    std::map<const FType*, BlockCounts> BlockInformation;
-
-    // FunctionInformation - Count the number of times a function is executed.
-    std::map<const FType*, double> FunctionInformation;
-
-    ProfileInfoT<MachineFunction, MachineBasicBlock> *MachineProfile;
-  public:
-    static char ID; // Class identification, replacement for typeinfo
-    ProfileInfoT();
-    ~ProfileInfoT();  // We want to be subclassed
-
-    // MissingValue - The value that is returned for execution counts in case
-    // no value is available.
-    static const double MissingValue;
-
-    // getFunction() - Returns the Function for an Edge, checking for validity.
-    static const FType* getFunction(Edge e) {
-      if (e.first)
-        return e.first->getParent();
-      if (e.second)
-        return e.second->getParent();
-      llvm_unreachable("Invalid ProfileInfo::Edge");
-    }
-
-    // getEdge() - Creates an Edge from two BasicBlocks.
-    static Edge getEdge(const BType *Src, const BType *Dest) {
-      return std::make_pair(Src, Dest);
-    }
-
-    //===------------------------------------------------------------------===//
-    /// Profile Information Queries
-    ///
-    double getExecutionCount(const FType *F);
-
-    double getExecutionCount(const BType *BB);
-
-    void setExecutionCount(const BType *BB, double w);
-
-    void addExecutionCount(const BType *BB, double w);
-
-    double getEdgeWeight(Edge e) const {
-      typename std::map<const FType*, EdgeWeights>::const_iterator J =
-        EdgeInformation.find(getFunction(e));
-      if (J == EdgeInformation.end()) return MissingValue;
-
-      typename EdgeWeights::const_iterator I = J->second.find(e);
-      if (I == J->second.end()) return MissingValue;
-
-      return I->second;
-    }
-
-    void setEdgeWeight(Edge e, double w) {
-      DEBUG_WITH_TYPE("profile-info",
-            dbgs() << "Creating Edge " << e
-                   << " (weight: " << format("%.20g",w) << ")\n");
-      EdgeInformation[getFunction(e)][e] = w;
-    }
-
-    void addEdgeWeight(Edge e, double w);
-
-    EdgeWeights &getEdgeWeights (const FType *F) {
-      return EdgeInformation[F];
-    }
-
-    //===------------------------------------------------------------------===//
-    /// Analysis Update Methods
-    ///
-    void removeBlock(const BType *BB);
-
-    void removeEdge(Edge e);
-
-    void replaceEdge(const Edge &, const Edge &);
-
-    enum GetPathMode {
-      GetPathToExit = 1,
-      GetPathToValue = 2,
-      GetPathToDest = 4,
-      GetPathWithNewEdges = 8
-    };
-
-    const BType *GetPath(const BType *Src, const BType *Dest,
-                              Path &P, unsigned Mode);
-
-    void divertFlow(const Edge &, const Edge &);
-
-    void splitEdge(const BType *FirstBB, const BType *SecondBB,
-                   const BType *NewBB, bool MergeIdenticalEdges = false);
-
-    void splitBlock(const BType *Old, const BType* New);
-
-    void splitBlock(const BType *BB, const BType* NewBB,
-                    BType *const *Preds, unsigned NumPreds);
-
-    void replaceAllUses(const BType *RmBB, const BType *DestBB);
-
-    void transfer(const FType *Old, const FType *New);
-
-    void repair(const FType *F);
-
-    void dump(FType *F = 0, bool real = true) {
-      dbgs() << "**** This is ProfileInfo " << this << " speaking:\n";
-      if (!real) {
-        typename std::set<const FType*> Functions;
-
-        dbgs() << "Functions: \n";
-        if (F) {
-          dbgs() << F << "@" << format("%p", F) << ": " << format("%.20g",getExecutionCount(F)) << "\n";
-          Functions.insert(F);
-        } else {
-          for (typename std::map<const FType*, double>::iterator fi = FunctionInformation.begin(),
-               fe = FunctionInformation.end(); fi != fe; ++fi) {
-            dbgs() << fi->first << "@" << format("%p",fi->first) << ": " << format("%.20g",fi->second) << "\n";
-            Functions.insert(fi->first);
-          }
-        }
-
-        for (typename std::set<const FType*>::iterator FI = Functions.begin(), FE = Functions.end();
-             FI != FE; ++FI) {
-          const FType *F = *FI;
-          typename std::map<const FType*, BlockCounts>::iterator bwi = BlockInformation.find(F);
-          dbgs() << "BasicBlocks for Function " << F << ":\n";
-          for (typename BlockCounts::const_iterator bi = bwi->second.begin(), be = bwi->second.end(); bi != be; ++bi) {
-            dbgs() << bi->first << "@" << format("%p", bi->first) << ": " << format("%.20g",bi->second) << "\n";
-          }
-        }
-
-        for (typename std::set<const FType*>::iterator FI = Functions.begin(), FE = Functions.end();
-             FI != FE; ++FI) {
-          typename std::map<const FType*, EdgeWeights>::iterator ei = EdgeInformation.find(*FI);
-          dbgs() << "Edges for Function " << ei->first << ":\n";
-          for (typename EdgeWeights::iterator ewi = ei->second.begin(), ewe = ei->second.end(); 
-               ewi != ewe; ++ewi) {
-            dbgs() << ewi->first << ": " << format("%.20g",ewi->second) << "\n";
-          }
-        }
-      } else {
-        assert(F && "No function given, this is not supported!");
-        dbgs() << "Functions: \n";
-        dbgs() << F << "@" << format("%p", F) << ": " << format("%.20g",getExecutionCount(F)) << "\n";
-
-        dbgs() << "BasicBlocks for Function " << F << ":\n";
-        for (typename FType::const_iterator BI = F->begin(), BE = F->end();
-             BI != BE; ++BI) {
-          const BType *BB = &(*BI);
-          dbgs() << BB << "@" << format("%p", BB) << ": " << format("%.20g",getExecutionCount(BB)) << "\n";
-        }
-      }
-      dbgs() << "**** ProfileInfo " << this << ", over and out.\n";
-    }
-
-    bool CalculateMissingEdge(const BType *BB, Edge &removed, bool assumeEmptyExit = false);
-
-    bool EstimateMissingEdges(const BType *BB);
-
-    ProfileInfoT<MachineFunction, MachineBasicBlock> *MI() {
-      if (MachineProfile == 0)
-        MachineProfile = new ProfileInfoT<MachineFunction, MachineBasicBlock>();
-      return MachineProfile;
-    }
-
-    bool hasMI() const {
-      return (MachineProfile != 0);
-    }
-  };
-
-  typedef ProfileInfoT<Function, BasicBlock> ProfileInfo;
-  typedef ProfileInfoT<MachineFunction, MachineBasicBlock> MachineProfileInfo;
-
-  /// createProfileLoaderPass - This function returns a Pass that loads the
-  /// profiling information for the module from the specified filename, making
-  /// it available to the optimizers.
-  Pass *createProfileLoaderPass(const std::string &Filename);
-
-} // End llvm namespace
-
-#endif
diff --git a/include/llvm/Analysis/ProfileInfoLoader.h b/include/llvm/Analysis/ProfileInfoLoader.h
deleted file mode 100644
index e0f49f3..0000000
--- a/include/llvm/Analysis/ProfileInfoLoader.h
+++ /dev/null
@@ -1,81 +0,0 @@
-//===- ProfileInfoLoader.h - Load & convert profile information -*- C++ -*-===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The ProfileInfoLoader class is used to load and represent profiling
-// information read in from the dump file.  If conversions between formats are
-// needed, it can also do this.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_PROFILEINFOLOADER_H
-#define LLVM_ANALYSIS_PROFILEINFOLOADER_H
-
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace llvm {
-
-class Module;
-class Function;
-class BasicBlock;
-
-class ProfileInfoLoader {
-  const std::string &Filename;
-  std::vector<std::string> CommandLines;
-  std::vector<unsigned>    FunctionCounts;
-  std::vector<unsigned>    BlockCounts;
-  std::vector<unsigned>    EdgeCounts;
-  std::vector<unsigned>    OptimalEdgeCounts;
-  std::vector<unsigned>    BBTrace;
-public:
-  // ProfileInfoLoader ctor - Read the specified profiling data file, exiting
-  // the program if the file is invalid or broken.
-  ProfileInfoLoader(const char *ToolName, const std::string &Filename);
-
-  static const unsigned Uncounted;
-
-  unsigned getNumExecutions() const { return CommandLines.size(); }
-  const std::string &getExecution(unsigned i) const { return CommandLines[i]; }
-
-  const std::string &getFileName() const { return Filename; }
-
-  // getRawFunctionCounts - This method is used by consumers of function
-  // counting information.
-  //
-  const std::vector<unsigned> &getRawFunctionCounts() const {
-    return FunctionCounts;
-  }
-
-  // getRawBlockCounts - This method is used by consumers of block counting
-  // information.
-  //
-  const std::vector<unsigned> &getRawBlockCounts() const {
-    return BlockCounts;
-  }
-
-  // getEdgeCounts - This method is used by consumers of edge counting
-  // information.
-  //
-  const std::vector<unsigned> &getRawEdgeCounts() const {
-    return EdgeCounts;
-  }
-
-  // getEdgeOptimalCounts - This method is used by consumers of optimal edge 
-  // counting information.
-  //
-  const std::vector<unsigned> &getRawOptimalEdgeCounts() const {
-    return OptimalEdgeCounts;
-  }
-
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/include/llvm/Analysis/ProfileInfoTypes.h b/include/llvm/Analysis/ProfileInfoTypes.h
deleted file mode 100644
index 45aab5b..0000000
--- a/include/llvm/Analysis/ProfileInfoTypes.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*===-- ProfileInfoTypes.h - Profiling info shared constants --------------===*\
-|*
-|*                     The LLVM Compiler Infrastructure
-|*
-|* This file is distributed under the University of Illinois Open Source
-|* License. See LICENSE.TXT for details.
-|*
-|*===----------------------------------------------------------------------===*|
-|*
-|* This file defines constants shared by the various different profiling
-|* runtime libraries and the LLVM C++ profile info loader. It must be a
-|* C header because, at present, the profiling runtimes are written in C.
-|*
-\*===----------------------------------------------------------------------===*/
-
-#ifndef LLVM_ANALYSIS_PROFILEINFOTYPES_H
-#define LLVM_ANALYSIS_PROFILEINFOTYPES_H
-
-/* Included by libprofile. */
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/* IDs to distinguish between those path counters stored in hashses vs arrays */
-enum ProfilingStorageType {
-  ProfilingArray = 1,
-  ProfilingHash = 2
-};
-
-#include "llvm/Analysis/ProfileDataTypes.h"
-
-/*
- * The header for tables that map path numbers to path counters.
- */
-typedef struct {
-  unsigned fnNumber; /* function number for these counters */
-  unsigned numEntries;   /* number of entries stored */
-} PathProfileHeader;
-
-/*
- * Describes an entry in a tagged table for path counters.
- */
-typedef struct {
-  unsigned pathNumber;
-  unsigned pathCounter;
-} PathProfileTableEntry;
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* LLVM_ANALYSIS_PROFILEINFOTYPES_H */
diff --git a/include/llvm/Analysis/RegionPass.h b/include/llvm/Analysis/RegionPass.h
index 6ae3191..3907ad9 100644
--- a/include/llvm/Analysis/RegionPass.h
+++ b/include/llvm/Analysis/RegionPass.h
@@ -18,8 +18,8 @@
 
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/LegacyPassManagers.h"
 #include "llvm/Pass.h"
-#include "llvm/PassManagers.h"
 #include <deque>
 
 namespace llvm {
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 3817e41..d7f6178 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -189,15 +189,16 @@ namespace llvm {
 
     /// Convenient NoWrapFlags manipulation that hides enum casts and is
     /// visible in the ScalarEvolution name space.
-    static SCEV::NoWrapFlags maskFlags(SCEV::NoWrapFlags Flags, int Mask) {
+    static SCEV::NoWrapFlags LLVM_ATTRIBUTE_UNUSED_RESULT
+    maskFlags(SCEV::NoWrapFlags Flags, int Mask) {
       return (SCEV::NoWrapFlags)(Flags & Mask);
     }
-    static SCEV::NoWrapFlags setFlags(SCEV::NoWrapFlags Flags,
-                                      SCEV::NoWrapFlags OnFlags) {
+    static SCEV::NoWrapFlags LLVM_ATTRIBUTE_UNUSED_RESULT
+    setFlags(SCEV::NoWrapFlags Flags, SCEV::NoWrapFlags OnFlags) {
       return (SCEV::NoWrapFlags)(Flags | OnFlags);
     }
-    static SCEV::NoWrapFlags clearFlags(SCEV::NoWrapFlags Flags,
-                                        SCEV::NoWrapFlags OffFlags) {
+    static SCEV::NoWrapFlags LLVM_ATTRIBUTE_UNUSED_RESULT
+    clearFlags(SCEV::NoWrapFlags Flags, SCEV::NoWrapFlags OffFlags) {
       return (SCEV::NoWrapFlags)(Flags & ~OffFlags);
     }
 
@@ -361,18 +362,18 @@ namespace llvm {
     /// that we attempt to compute getSCEVAtScope information for, which can
     /// be expensive in extreme cases.
     DenseMap<const SCEV *,
-             std::map<const Loop *, const SCEV *> > ValuesAtScopes;
+             SmallVector<std::pair<const Loop *, const SCEV *>, 2> > ValuesAtScopes;
 
     /// LoopDispositions - Memoized computeLoopDisposition results.
     DenseMap<const SCEV *,
-             std::map<const Loop *, LoopDisposition> > LoopDispositions;
+             SmallVector<std::pair<const Loop *, LoopDisposition>, 2> > LoopDispositions;
 
     /// computeLoopDisposition - Compute a LoopDisposition value.
     LoopDisposition computeLoopDisposition(const SCEV *S, const Loop *L);
 
     /// BlockDispositions - Memoized computeBlockDisposition results.
     DenseMap<const SCEV *,
-             std::map<const BasicBlock *, BlockDisposition> > BlockDispositions;
+             SmallVector<std::pair<const BasicBlock *, BlockDisposition>, 2> > BlockDispositions;
 
     /// computeBlockDisposition - Compute a BlockDisposition value.
     BlockDisposition computeBlockDisposition(const SCEV *S, const BasicBlock *BB);
@@ -426,14 +427,6 @@ namespace llvm {
     /// resolution.
     void ForgetSymbolicName(Instruction *I, const SCEV *SymName);
 
-    /// getBECount - Subtract the end and start values and divide by the step,
-    /// rounding up, to get the number of times the backedge is executed. Return
-    /// CouldNotCompute if an intermediate computation overflows.
-    const SCEV *getBECount(const SCEV *Start,
-                           const SCEV *End,
-                           const SCEV *Step,
-                           bool NoWrap);
-
     /// getBackedgeTakenInfo - Return the BackedgeTakenInfo for the given
     /// loop, lazily computing new values if the loop hasn't been analyzed
     /// yet.
@@ -498,6 +491,8 @@ namespace llvm {
     /// less-than is signed.
     ExitLimit HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
                                const Loop *L, bool isSigned, bool IsSubExpr);
+    ExitLimit HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
+                                  const Loop *L, bool isSigned, bool IsSubExpr);
 
     /// getPredecessorWithUniqueSuccessorForBB - Return a predecessor of BB
     /// (which may not be an immediate predecessor) which has exactly one
@@ -636,21 +631,15 @@ namespace llvm {
     const SCEV *getUnknown(Value *V);
     const SCEV *getCouldNotCompute();
 
-    /// getSizeOfExpr - Return an expression for sizeof on the given type.
-    ///
-    const SCEV *getSizeOfExpr(Type *AllocTy);
-
-    /// getAlignOfExpr - Return an expression for alignof on the given type.
+    /// getSizeOfExpr - Return an expression for sizeof AllocTy that is type
+    /// IntTy
     ///
-    const SCEV *getAlignOfExpr(Type *AllocTy);
+    const SCEV *getSizeOfExpr(Type *IntTy, Type *AllocTy);
 
-    /// getOffsetOfExpr - Return an expression for offsetof on the given field.
+    /// getOffsetOfExpr - Return an expression for offsetof on the given field
+    /// with type IntTy
     ///
-    const SCEV *getOffsetOfExpr(StructType *STy, unsigned FieldNo);
-
-    /// getOffsetOfExpr - Return an expression for offsetof on the given field.
-    ///
-    const SCEV *getOffsetOfExpr(Type *CTy, Constant *FieldNo);
+    const SCEV *getOffsetOfExpr(Type *IntTy, StructType *STy, unsigned FieldNo);
 
     /// getNegativeSCEV - Return the SCEV object corresponding to -V.
     ///
@@ -886,6 +875,24 @@ namespace llvm {
     virtual void verifyAnalysis() const;
 
   private:
+    /// Compute the backedge taken count knowing the interval difference, the
+    /// stride and presence of the equality in the comparison.
+    const SCEV *computeBECount(const SCEV *Delta, const SCEV *Stride,
+                               bool Equality);
+
+    /// Verify if an linear IV with positive stride can overflow when in a 
+    /// less-than comparison, knowing the invariant term of the comparison,
+    /// the stride and the knowledge of NSW/NUW flags on the recurrence.
+    bool doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
+                            bool IsSigned, bool NoWrap);
+
+    /// Verify if an linear IV with negative stride can overflow when in a 
+    /// greater-than comparison, knowing the invariant term of the comparison,
+    /// the stride and the knowledge of NSW/NUW flags on the recurrence.
+    bool doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
+                            bool IsSigned, bool NoWrap);
+
+  private:
     FoldingSet<SCEV> UniqueSCEVs;
     BumpPtrAllocator SCEVAllocator;
 
diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h
index 00779fc..4433be0 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpander.h
@@ -26,7 +26,7 @@ namespace llvm {
 
   /// Return true if the given expression is safe to expand in the sense that
   /// all materialized values are safe to speculate.
-  bool isSafeToExpand(const SCEV *S);
+  bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE);
 
   /// SCEVExpander - This class uses information about analyze scalars to
   /// rewrite expressions in canonical form.
@@ -252,8 +252,6 @@ namespace llvm {
 
     void rememberInstruction(Value *I);
 
-    void restoreInsertPoint(BasicBlock *BB, BasicBlock::iterator I);
-
     bool isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV, const Loop *L);
 
     bool isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV, const Loop *L);
diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h
index eac9113..9cd902a 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpressions.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h
@@ -351,8 +351,14 @@ namespace llvm {
     static inline bool classof(const SCEV *S) {
       return S->getSCEVType() == scAddRecExpr;
     }
-  };
 
+    /// Splits the SCEV into two vectors of SCEVs representing the subscripts
+    /// and sizes of an array access. Returns the remainder of the
+    /// delinearization that is the offset start of the array.
+    const SCEV *delinearize(ScalarEvolution &SE,
+                            SmallVectorImpl<const SCEV *> &Subscripts,
+                            SmallVectorImpl<const SCEV *> &Sizes) const;
+  };
 
   //===--------------------------------------------------------------------===//
   /// SCEVSMaxExpr - This class represents a signed maximum selection.
@@ -549,53 +555,60 @@ namespace llvm {
     T.visitAll(Root);
   }
 
-  /// The SCEVRewriter takes a scalar evolution expression and copies all its
-  /// components. The result after a rewrite is an identical SCEV.
-  struct SCEVRewriter
-    : public SCEVVisitor<SCEVRewriter, const SCEV*> {
+  typedef DenseMap<const Value*, Value*> ValueToValueMap;
+
+  /// The SCEVParameterRewriter takes a scalar evolution expression and updates
+  /// the SCEVUnknown components following the Map (Value -> Value).
+  struct SCEVParameterRewriter
+    : public SCEVVisitor<SCEVParameterRewriter, const SCEV*> {
   public:
-    SCEVRewriter(ScalarEvolution &S) : SE(S) {}
+    static const SCEV *rewrite(const SCEV *Scev, ScalarEvolution &SE,
+                               ValueToValueMap &Map) {
+      SCEVParameterRewriter Rewriter(SE, Map);
+      return Rewriter.visit(Scev);
+    }
 
-    virtual ~SCEVRewriter() {}
+    SCEVParameterRewriter(ScalarEvolution &S, ValueToValueMap &M)
+      : SE(S), Map(M) {}
 
-    virtual const SCEV *visitConstant(const SCEVConstant *Constant) {
+    const SCEV *visitConstant(const SCEVConstant *Constant) {
       return Constant;
     }
 
-    virtual const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) {
+    const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) {
       const SCEV *Operand = visit(Expr->getOperand());
       return SE.getTruncateExpr(Operand, Expr->getType());
     }
 
-    virtual const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+    const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
       const SCEV *Operand = visit(Expr->getOperand());
       return SE.getZeroExtendExpr(Operand, Expr->getType());
     }
 
-    virtual const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
+    const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
       const SCEV *Operand = visit(Expr->getOperand());
       return SE.getSignExtendExpr(Operand, Expr->getType());
     }
 
-    virtual const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+    const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
       SmallVector<const SCEV *, 2> Operands;
       for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
         Operands.push_back(visit(Expr->getOperand(i)));
       return SE.getAddExpr(Operands);
     }
 
-    virtual const SCEV *visitMulExpr(const SCEVMulExpr *Expr) {
+    const SCEV *visitMulExpr(const SCEVMulExpr *Expr) {
       SmallVector<const SCEV *, 2> Operands;
       for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
         Operands.push_back(visit(Expr->getOperand(i)));
       return SE.getMulExpr(Operands);
     }
 
-    virtual const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) {
+    const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) {
       return SE.getUDivExpr(visit(Expr->getLHS()), visit(Expr->getRHS()));
     }
 
-    virtual const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+    const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
       SmallVector<const SCEV *, 2> Operands;
       for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
         Operands.push_back(visit(Expr->getOperand(i)));
@@ -603,54 +616,33 @@ namespace llvm {
                               Expr->getNoWrapFlags());
     }
 
-    virtual const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) {
+    const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) {
       SmallVector<const SCEV *, 2> Operands;
       for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
         Operands.push_back(visit(Expr->getOperand(i)));
       return SE.getSMaxExpr(Operands);
     }
 
-    virtual const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) {
+    const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) {
       SmallVector<const SCEV *, 2> Operands;
       for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
         Operands.push_back(visit(Expr->getOperand(i)));
       return SE.getUMaxExpr(Operands);
     }
 
-    virtual const SCEV *visitUnknown(const SCEVUnknown *Expr) {
-      return Expr;
-    }
-
-    virtual const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
-      return Expr;
-    }
-
-  protected:
-    ScalarEvolution &SE;
-  };
-
-  typedef DenseMap<const Value*, Value*> ValueToValueMap;
-
-  /// The SCEVParameterRewriter takes a scalar evolution expression and updates
-  /// the SCEVUnknown components following the Map (Value -> Value).
-  struct SCEVParameterRewriter: public SCEVRewriter {
-  public:
-    static const SCEV *rewrite(const SCEV *Scev, ScalarEvolution &SE,
-                               ValueToValueMap &Map) {
-      SCEVParameterRewriter Rewriter(SE, Map);
-      return Rewriter.visit(Scev);
-    }
-    SCEVParameterRewriter(ScalarEvolution &S, ValueToValueMap &M)
-      : SCEVRewriter(S), Map(M) {}
-
-    virtual const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+    const SCEV *visitUnknown(const SCEVUnknown *Expr) {
       Value *V = Expr->getValue();
       if (Map.count(V))
         return SE.getUnknown(Map[V]);
       return Expr;
     }
 
+    const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
+      return Expr;
+    }
+
   private:
+    ScalarEvolution &SE;
     ValueToValueMap &Map;
   };
 
@@ -658,17 +650,56 @@ namespace llvm {
 
   /// The SCEVApplyRewriter takes a scalar evolution expression and applies
   /// the Map (Loop -> SCEV) to all AddRecExprs.
-  struct SCEVApplyRewriter: public SCEVRewriter {
+  struct SCEVApplyRewriter
+    : public SCEVVisitor<SCEVApplyRewriter, const SCEV*> {
   public:
     static const SCEV *rewrite(const SCEV *Scev, LoopToScevMapT &Map,
                                ScalarEvolution &SE) {
       SCEVApplyRewriter Rewriter(SE, Map);
       return Rewriter.visit(Scev);
     }
+
     SCEVApplyRewriter(ScalarEvolution &S, LoopToScevMapT &M)
-      : SCEVRewriter(S), Map(M) {}
+      : SE(S), Map(M) {}
+
+    const SCEV *visitConstant(const SCEVConstant *Constant) {
+      return Constant;
+    }
+
+    const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) {
+      const SCEV *Operand = visit(Expr->getOperand());
+      return SE.getTruncateExpr(Operand, Expr->getType());
+    }
+
+    const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+      const SCEV *Operand = visit(Expr->getOperand());
+      return SE.getZeroExtendExpr(Operand, Expr->getType());
+    }
+
+    const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
+      const SCEV *Operand = visit(Expr->getOperand());
+      return SE.getSignExtendExpr(Operand, Expr->getType());
+    }
+
+    const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+      SmallVector<const SCEV *, 2> Operands;
+      for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
+        Operands.push_back(visit(Expr->getOperand(i)));
+      return SE.getAddExpr(Operands);
+    }
+
+    const SCEV *visitMulExpr(const SCEVMulExpr *Expr) {
+      SmallVector<const SCEV *, 2> Operands;
+      for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
+        Operands.push_back(visit(Expr->getOperand(i)));
+      return SE.getMulExpr(Operands);
+    }
 
-    virtual const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+    const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) {
+      return SE.getUDivExpr(visit(Expr->getLHS()), visit(Expr->getRHS()));
+    }
+
+    const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
       SmallVector<const SCEV *, 2> Operands;
       for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
         Operands.push_back(visit(Expr->getOperand(i)));
@@ -683,7 +714,30 @@ namespace llvm {
       return Rec->evaluateAtIteration(Map[L], SE);
     }
 
+    const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) {
+      SmallVector<const SCEV *, 2> Operands;
+      for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
+        Operands.push_back(visit(Expr->getOperand(i)));
+      return SE.getSMaxExpr(Operands);
+    }
+
+    const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) {
+      SmallVector<const SCEV *, 2> Operands;
+      for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
+        Operands.push_back(visit(Expr->getOperand(i)));
+      return SE.getUMaxExpr(Operands);
+    }
+
+    const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+      return Expr;
+    }
+
+    const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
+      return Expr;
+    }
+
   private:
+    ScalarEvolution &SE;
     LoopToScevMapT &Map;
   };
 
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 21a3a12..4f47562 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -29,6 +29,7 @@
 namespace llvm {
 
 class GlobalValue;
+class Loop;
 class Type;
 class User;
 class Value;
@@ -191,6 +192,36 @@ public:
   /// incurs significant execution cost.
   virtual bool isLoweredToCall(const Function *F) const;
 
+  /// Parameters that control the generic loop unrolling transformation.
+  struct UnrollingPreferences {
+    /// The cost threshold for the unrolled loop, compared to
+    /// CodeMetrics.NumInsts aggregated over all basic blocks in the loop body.
+    /// The unrolling factor is set such that the unrolled loop body does not
+    /// exceed this cost. Set this to UINT_MAX to disable the loop body cost
+    /// restriction.
+    unsigned Threshold;
+    /// The cost threshold for the unrolled loop when optimizing for size (set
+    /// to UINT_MAX to disable).
+    unsigned OptSizeThreshold;
+    /// A forced unrolling factor (the number of concatenated bodies of the
+    /// original loop in the unrolled loop body). When set to 0, the unrolling
+    /// transformation will select an unrolling factor based on the current cost
+    /// threshold and other factors.
+    unsigned Count;
+    /// Allow partial unrolling (unrolling of loops to expand the size of the
+    /// loop body, not only to eliminate small constant-trip-count loops).
+    bool     Partial;
+    /// Allow runtime unrolling (unrolling of loops to expand the size of the
+    /// loop body even when the number of loop iterations is not known at compile
+    /// time).
+    bool     Runtime;
+  };
+
+  /// \brief Get target-customized preferences for the generic loop unrolling
+  /// transformation. The caller will initialize UP with the current
+  /// target-independent defaults.
+  virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
+
   /// @}
 
   /// \name Scalar Target Information
@@ -262,6 +293,10 @@ public:
   /// getPopcntSupport - Return hardware support for population count.
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
 
+  /// haveFastSqrt -- Return true if the hardware has a fast square-root
+  /// instruction.
+  virtual bool haveFastSqrt(Type *Ty) const;
+
   /// getIntImmCost - Return the expected cost of materializing the given
   /// integer immediate of the specified type.
   virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) const;
@@ -279,7 +314,7 @@ public:
     SK_ExtractSubvector ///< ExtractSubvector Index indicates start offset.
   };
 
-  /// \brief Additonal information about an operand's possible values.
+  /// \brief Additional information about an operand's possible values.
   enum OperandValueKind {
     OK_AnyValue,            // Operand can have any value.
     OK_UniformValue,        // Operand is uniform (splat of a value).
@@ -333,6 +368,22 @@ public:
                                    unsigned Alignment,
                                    unsigned AddressSpace) const;
 
+  /// \brief Calculate the cost of performing a vector reduction.
+  ///
+  /// This is the cost of reducing the vector value of type \p Ty to a scalar
+  /// value using the operation denoted by \p Opcode. The form of the reduction
+  /// can either be a pairwise reduction or a reduction that splits the vector
+  /// at every reduction level.
+  ///
+  /// Pairwise:
+  ///  (v0, v1, v2, v3)
+  ///  ((v0+v1), (v2, v3), undef, undef)
+  /// Split:
+  ///  (v0, v1, v2, v3)
+  ///  ((v0+v2), (v1+v3), undef, undef)
+  virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
+                                    bool IsPairwiseForm) const;
+
   /// \returns The cost of Intrinsic instructions.
   virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                          ArrayRef<Type *> Tys) const;
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index 3775ec9..0392f98 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@@ -25,6 +25,7 @@ namespace llvm {
   class DataLayout;
   class StringRef;
   class MDNode;
+  class TargetLibraryInfo;
 
   /// ComputeMaskedBits - Determine which of the bits specified in Mask are
   /// known to be either zero or one and return them in the KnownZero/KnownOne
@@ -186,7 +187,7 @@ namespace llvm {
   /// isKnownNonNull - Return true if this pointer couldn't possibly be null by
   /// its definition.  This returns true for allocas, non-extern-weak globals
   /// and byval arguments.
-  bool isKnownNonNull(const Value *V);
+  bool isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI = 0);
 
 } // end namespace llvm
 
diff --git a/include/llvm/AutoUpgrade.h b/include/llvm/AutoUpgrade.h
index ca3446e..c774782 100644
--- a/include/llvm/AutoUpgrade.h
+++ b/include/llvm/AutoUpgrade.h
@@ -15,10 +15,14 @@
 #define LLVM_AUTOUPGRADE_H
 
 namespace llvm {
+  class CallInst;
+  class Constant;
+  class Function;
+  class Instruction;
   class Module;
   class GlobalVariable;
-  class Function;
-  class CallInst;
+  class Type;
+  class Value;
 
   /// This is a more granular function that simply checks an intrinsic function
   /// for upgrading, and returns true if it requires upgrading. It may return
@@ -39,6 +43,24 @@ namespace llvm {
   /// This checks for global variables which should be upgraded. It returns true
   /// if it requires upgrading.
   bool UpgradeGlobalVariable(GlobalVariable *GV);
+
+  /// If the TBAA tag for the given instruction uses the scalar TBAA format,
+  /// we upgrade it to the struct-path aware TBAA format.
+  void UpgradeInstWithTBAATag(Instruction *I);
+
+  /// This is an auto-upgrade for bitcast between pointers with different
+  /// address spaces: the instruction is replaced by a pair ptrtoint+inttoptr.
+  Instruction *UpgradeBitCastInst(unsigned Opc, Value *V, Type *DestTy,
+                                  Instruction *&Temp);
+
+  /// This is an auto-upgrade for bitcast constant expression between pointers
+  /// with different address spaces: the instruction is replaced by a pair
+  /// ptrtoint+inttoptr.
+  Value *UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy);
+
+  /// Check the debug info version number, if it is out-dated, drop the debug
+  /// info. Return true if module is modified.
+  bool UpgradeDebugInfo(Module &M);
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h
index 463e3b9..b3d2466 100644
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h
@@ -194,7 +194,8 @@ namespace bitc {
     CAST_FPEXT    =  8,
     CAST_PTRTOINT =  9,
     CAST_INTTOPTR = 10,
-    CAST_BITCAST  = 11
+    CAST_BITCAST  = 11,
+    CAST_ADDRSPACECAST = 12
   };
 
   /// BinaryOpcodes - These are values used in the bitcode files to encode which
@@ -368,7 +369,8 @@ namespace bitc {
     ATTR_KIND_UW_TABLE = 33,
     ATTR_KIND_Z_EXT = 34,
     ATTR_KIND_BUILTIN = 35,
-    ATTR_KIND_COLD = 36
+    ATTR_KIND_COLD = 36,
+    ATTR_KIND_OPTIMIZE_NONE = 37
   };
 
 } // End bitc namespace
diff --git a/include/llvm/CodeGen/Analysis.h b/include/llvm/CodeGen/Analysis.h
index ce9ca0a..b2cc704 100644
--- a/include/llvm/CodeGen/Analysis.h
+++ b/include/llvm/CodeGen/Analysis.h
@@ -26,6 +26,7 @@ namespace llvm {
 
 class GlobalVariable;
 class TargetLowering;
+class TargetLoweringBase;
 class SDNode;
 class SDValue;
 class SelectionDAG;
@@ -88,6 +89,14 @@ ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred);
 /// This function only tests target-independent requirements.
 bool isInTailCallPosition(ImmutableCallSite CS, const TargetLowering &TLI);
 
+/// Test if given that the input instruction is in the tail call position if the
+/// return type or any attributes of the function will inhibit tail call
+/// optimization.
+bool returnTypeIsEligibleForTailCall(const Function *F,
+                                     const Instruction *I,
+                                     const ReturnInst *Ret,
+                                     const TargetLoweringBase &TLI);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index 677331b..4bda0f1 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -41,6 +41,7 @@ namespace llvm {
   class MCAsmInfo;
   class MCCFIInstruction;
   class MCContext;
+  class MCInstrInfo;
   class MCSection;
   class MCStreamer;
   class MCSymbol;
@@ -64,6 +65,7 @@ namespace llvm {
     ///
     const MCAsmInfo *MAI;
 
+    const MCInstrInfo *MII;
     /// OutContext - This is the context for the output file that we are
     /// streaming.  This owns all of the global MC-related objects for the
     /// generated translation unit.
@@ -143,6 +145,7 @@ namespace llvm {
     /// getCurrentSection() - Return the current section we are emitting to.
     const MCSection *getCurrentSection() const;
 
+    MCSymbol *getSymbol(const GlobalValue *GV) const;
 
     //===------------------------------------------------------------------===//
     // MachineFunctionPass Implementation.
@@ -284,6 +287,10 @@ namespace llvm {
     virtual bool
     isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const;
 
+    /// emitImplicitDef - Targets can override this to customize the output of
+    /// IMPLICIT_DEF instructions in verbose mode.
+    virtual void emitImplicitDef(const MachineInstr *MI) const;
+
     //===------------------------------------------------------------------===//
     // Symbol Lowering Routines.
     //===------------------------------------------------------------------===//
@@ -359,13 +366,15 @@ namespace llvm {
     /// where the size in bytes of the directive is specified by Size and Label
     /// specifies the label.  This implicitly uses .set if it is available.
     void EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
-                                   unsigned Size) const;
+                             unsigned Size,
+                             bool IsSectionRelative = false) const;
 
     /// EmitLabelReference - Emit something like ".long Label"
     /// where the size in bytes of the directive is specified by Size and Label
     /// specifies the label.
-    void EmitLabelReference(const MCSymbol *Label, unsigned Size) const {
-      EmitLabelPlusOffset(Label, 0, Size);
+    void EmitLabelReference(const MCSymbol *Label, unsigned Size,
+                            bool IsSectionRelative = false) const {
+      EmitLabelPlusOffset(Label, 0, Size, IsSectionRelative);
     }
 
     //===------------------------------------------------------------------===//
@@ -449,8 +458,7 @@ namespace llvm {
     /// return true if the operand is erroneous.
     virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                                        unsigned AsmVariant,
-                                       const char *ExtraCode,
-                                       raw_ostream &OS);
+                                       const char *ExtraCode, raw_ostream &OS);
 
   private:
     /// Private state for PrintSpecial()
@@ -462,7 +470,8 @@ namespace llvm {
 
     /// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
     void EmitInlineAsm(StringRef Str, const MDNode *LocMDNode = 0,
-                    InlineAsm::AsmDialect AsmDialect = InlineAsm::AD_ATT) const;
+                       InlineAsm::AsmDialect AsmDialect =
+                           InlineAsm::AD_ATT) const;
 
     /// EmitInlineAsm - This method formats and emits the specified machine
     /// instruction that is an inline asm.
@@ -477,12 +486,13 @@ namespace llvm {
     void EmitVisibility(MCSymbol *Sym, unsigned Visibility,
                         bool IsDefinition = true) const;
 
-    void EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const;
+    void EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const;
 
     void EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
-                            const MachineBasicBlock *MBB,
-                            unsigned uid) const;
+                            const MachineBasicBlock *MBB, unsigned uid) const;
     void EmitLLVMUsedList(const ConstantArray *InitList);
+    /// Emit llvm.ident metadata in an '.ident' directive.
+    void EmitModuleIdents(Module &M);
     void EmitXXStructorList(const Constant *List, bool isCtor);
     GCMetadataPrinter *GetOrCreateGCPrinter(GCStrategy *C);
   };
diff --git a/include/llvm/CodeGen/CalcSpillWeights.h b/include/llvm/CodeGen/CalcSpillWeights.h
index c8ec764..0d79b1d 100644
--- a/include/llvm/CodeGen/CalcSpillWeights.h
+++ b/include/llvm/CodeGen/CalcSpillWeights.h
@@ -21,7 +21,9 @@ namespace llvm {
   class MachineBlockFrequencyInfo;
   class MachineLoopInfo;
 
-  /// normalizeSpillWeight - The spill weight of a live interval is computed as:
+  /// \brief Normalize the spill weight of a live interval
+  ///
+  /// The spill weight of a live interval is computed as:
   ///
   ///   (sum(use freq) + sum(def freq)) / (K + size)
   ///
@@ -38,44 +40,38 @@ namespace llvm {
     return UseDefFreq / (Size + 25*SlotIndex::InstrDist);
   }
 
-  /// VirtRegAuxInfo - Calculate auxiliary information for a virtual
-  /// register such as its spill weight and allocation hint.
+  /// \brief Calculate auxiliary information for a virtual register such as its
+  /// spill weight and allocation hint.
   class VirtRegAuxInfo {
+  public:
+    typedef float (*NormalizingFn)(float, unsigned);
+
+  private:
     MachineFunction &MF;
     LiveIntervals &LIS;
     const MachineLoopInfo &Loops;
     const MachineBlockFrequencyInfo &MBFI;
     DenseMap<unsigned, float> Hint;
+    NormalizingFn normalize;
+
   public:
     VirtRegAuxInfo(MachineFunction &mf, LiveIntervals &lis,
                    const MachineLoopInfo &loops,
-                   const MachineBlockFrequencyInfo &mbfi)
-        : MF(mf), LIS(lis), Loops(loops), MBFI(mbfi) {}
+                   const MachineBlockFrequencyInfo &mbfi,
+                   NormalizingFn norm = normalizeSpillWeight)
+        : MF(mf), LIS(lis), Loops(loops), MBFI(mbfi), normalize(norm) {}
 
-    /// CalculateWeightAndHint - (re)compute li's spill weight and allocation
-    /// hint.
-    void CalculateWeightAndHint(LiveInterval &li);
+    /// \brief (re)compute li's spill weight and allocation hint.
+    void calculateSpillWeightAndHint(LiveInterval &li);
   };
 
-  /// CalculateSpillWeights - Compute spill weights for all virtual register
+  /// \brief Compute spill weights and allocation hints for all virtual register
   /// live intervals.
-  class CalculateSpillWeights : public MachineFunctionPass {
-  public:
-    static char ID;
-
-    CalculateSpillWeights() : MachineFunctionPass(ID) {
-      initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &au) const;
-
-    virtual bool runOnMachineFunction(MachineFunction &fn);
-
-  private:
-    /// Returns true if the given live interval is zero length.
-    bool isZeroLengthInterval(LiveInterval *li) const;
-  };
-
+  void calculateSpillWeightsAndHints(LiveIntervals &LIS, MachineFunction &MF,
+                                     const MachineLoopInfo &MLI,
+                                     const MachineBlockFrequencyInfo &MBFI,
+                                     VirtRegAuxInfo::NormalizingFn norm =
+                                         normalizeSpillWeight);
 }
 
 #endif // LLVM_CODEGEN_CALCSPILLWEIGHTS_H
diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h
index 0c21e76..bc8dce3 100644
--- a/include/llvm/CodeGen/CommandFlags.h
+++ b/include/llvm/CodeGen/CommandFlags.h
@@ -150,7 +150,7 @@ FloatABIForCalls("float-abi",
 
 cl::opt<llvm::FPOpFusion::FPOpFusionMode>
 FuseFPOps("fp-contract",
-          cl::desc("Enable aggresive formation of fused FP ops"),
+          cl::desc("Enable aggressive formation of fused FP ops"),
           cl::init(FPOpFusion::Standard),
           cl::values(
               clEnumValN(FPOpFusion::Fast, "fast",
diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h
index 0063474..1e0ef6b 100644
--- a/include/llvm/CodeGen/FastISel.h
+++ b/include/llvm/CodeGen/FastISel.h
@@ -358,6 +358,15 @@ protected:
     return 0;
   }
 
+  /// \brief Check if \c Add is an add that can be safely folded into \c GEP.
+  ///
+  /// \c Add can be folded into \c GEP if:
+  /// - \c Add is an add,
+  /// - \c Add's size matches \c GEP's,
+  /// - \c Add is in the same basic block as \c GEP, and
+  /// - \c Add has a constant operand.
+  bool canFoldAddIntoGEP(const User *GEP, const Value *Add);
+
 private:
   bool SelectBinaryOp(const User *I, unsigned ISDOpcode);
 
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index 9466b90..48a0523 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -419,6 +419,10 @@ namespace ISD {
     /// getNode().
     BITCAST,
 
+    /// ADDRSPACECAST - This operator converts between pointers of different
+    /// address spaces.
+    ADDRSPACECAST,
+
     /// CONVERT_RNDSAT - This operator is used to support various conversions
     /// between various types (float, signed, unsigned and vectors of those
     /// types) with rounding and saturation. NOTE: Avoid using this operator as
@@ -440,11 +444,11 @@ namespace ISD {
 
     /// FNEG, FABS, FSQRT, FSIN, FCOS, FPOWI, FPOW,
     /// FLOG, FLOG2, FLOG10, FEXP, FEXP2,
-    /// FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR - Perform various unary
+    /// FCEIL, FTRUNC, FRINT, FNEARBYINT, FROUND, FFLOOR - Perform various unary
     /// floating point operations. These are inspired by libm.
     FNEG, FABS, FSQRT, FSIN, FCOS, FPOWI, FPOW,
     FLOG, FLOG2, FLOG10, FEXP, FEXP2,
-    FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR,
+    FCEIL, FTRUNC, FRINT, FNEARBYINT, FROUND, FFLOOR,
     
     /// FSINCOS - Compute both fsin and fcos as a single operation.
     FSINCOS,
@@ -604,11 +608,17 @@ namespace ISD {
     ATOMIC_STORE,
 
     /// Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
+    /// For double-word atomic operations:
+    /// ValLo, ValHi, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmpLo, cmpHi,
+    ///                                          swapLo, swapHi)
     /// This corresponds to the cmpxchg instruction.
     ATOMIC_CMP_SWAP,
 
     /// Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt)
     /// Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amt)
+    /// For double-word atomic operations:
+    /// ValLo, ValHi, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amtLo, amtHi)
+    /// ValLo, ValHi, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amtLo, amtHi)
     /// These correspond to the atomicrmw instruction.
     ATOMIC_SWAP,
     ATOMIC_LOAD_ADD,
diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h
index efad0c6..3a9fef6 100644
--- a/include/llvm/CodeGen/LiveInterval.h
+++ b/include/llvm/CodeGen/LiveInterval.h
@@ -9,12 +9,12 @@
 //
 // This file implements the LiveRange and LiveInterval classes.  Given some
 // numbering of each the machine instructions an interval [i, j) is said to be a
-// live interval for register v if there is no instruction with number j' >= j
+// live range for register v if there is no instruction with number j' >= j
 // such that v is live at j' and there is no instruction with number i' < i such
-// that v is live at i'. In this implementation intervals can have holes,
-// i.e. an interval might look like [1,20), [50,65), [1000,1001).  Each
-// individual range is represented as an instance of LiveRange, and the whole
-// interval is represented as an instance of LiveInterval.
+// that v is live at i'. In this implementation ranges can have holes,
+// i.e. a range might look like [1,20), [50,65), [1000,1001).  Each
+// individual segment is represented as an instance of LiveRange::Segment,
+// and the whole range is represented as an instance of LiveRange.
 //
 //===----------------------------------------------------------------------===//
 
@@ -35,6 +35,7 @@ namespace llvm {
   class MachineRegisterInfo;
   class TargetRegisterInfo;
   class raw_ostream;
+  template <typename T, unsigned Small> class SmallPtrSet;
 
   /// VNInfo - Value Number Information.
   /// This class holds information about a machine level values, including
@@ -66,7 +67,7 @@ namespace llvm {
     }
 
     /// Returns true if this value is defined by a PHI instruction (or was,
-    /// PHI instrucions may have been eliminated).
+    /// PHI instructions may have been eliminated).
     /// PHI-defs begin at a block boundary, all other defs begin at register or
     /// EC slots.
     bool isPHIDef() const { return def.isBlock(); }
@@ -78,107 +79,136 @@ namespace llvm {
     void markUnused() { def = SlotIndex(); }
   };
 
-  /// LiveRange structure - This represents a simple register range in the
-  /// program, with an inclusive start point and an exclusive end point.
-  /// These ranges are rendered as [start,end).
-  struct LiveRange {
-    SlotIndex start;  // Start point of the interval (inclusive)
-    SlotIndex end;    // End point of the interval (exclusive)
-    VNInfo *valno;   // identifier for the value contained in this interval.
+  /// Result of a LiveRange query. This class hides the implementation details
+  /// of live ranges, and it should be used as the primary interface for
+  /// examining live ranges around instructions.
+  class LiveQueryResult {
+    VNInfo *const EarlyVal;
+    VNInfo *const LateVal;
+    const SlotIndex EndPoint;
+    const bool Kill;
 
-    LiveRange() : valno(0) {}
+  public:
+    LiveQueryResult(VNInfo *EarlyVal, VNInfo *LateVal, SlotIndex EndPoint,
+                    bool Kill)
+      : EarlyVal(EarlyVal), LateVal(LateVal), EndPoint(EndPoint), Kill(Kill)
+    {}
 
-    LiveRange(SlotIndex S, SlotIndex E, VNInfo *V)
-      : start(S), end(E), valno(V) {
-      assert(S < E && "Cannot create empty or backwards range");
+    /// Return the value that is live-in to the instruction. This is the value
+    /// that will be read by the instruction's use operands. Return NULL if no
+    /// value is live-in.
+    VNInfo *valueIn() const {
+      return EarlyVal;
     }
 
-    /// contains - Return true if the index is covered by this range.
-    ///
-    bool contains(SlotIndex I) const {
-      return start <= I && I < end;
+    /// Return true if the live-in value is killed by this instruction. This
+    /// means that either the live range ends at the instruction, or it changes
+    /// value.
+    bool isKill() const {
+      return Kill;
     }
 
-    /// containsRange - Return true if the given range, [S, E), is covered by
-    /// this range.
-    bool containsRange(SlotIndex S, SlotIndex E) const {
-      assert((S < E) && "Backwards interval?");
-      return (start <= S && S < end) && (start < E && E <= end);
+    /// Return true if this instruction has a dead def.
+    bool isDeadDef() const {
+      return EndPoint.isDead();
     }
 
-    bool operator<(const LiveRange &LR) const {
-      return start < LR.start || (start == LR.start && end < LR.end);
+    /// Return the value leaving the instruction, if any. This can be a
+    /// live-through value, or a live def. A dead def returns NULL.
+    VNInfo *valueOut() const {
+      return isDeadDef() ? 0 : LateVal;
     }
-    bool operator==(const LiveRange &LR) const {
-      return start == LR.start && end == LR.end;
+
+    /// Return the value defined by this instruction, if any. This includes
+    /// dead defs, it is the value created by the instruction's def operands.
+    VNInfo *valueDefined() const {
+      return EarlyVal == LateVal ? 0 : LateVal;
     }
 
-    void dump() const;
-    void print(raw_ostream &os) const;
+    /// Return the end point of the last live range segment to interact with
+    /// the instruction, if any.
+    ///
+    /// The end point is an invalid SlotIndex only if the live range doesn't
+    /// intersect the instruction at all.
+    ///
+    /// The end point may be at or past the end of the instruction's basic
+    /// block. That means the value was live out of the block.
+    SlotIndex endPoint() const {
+      return EndPoint;
+    }
   };
 
-  template <> struct isPodLike<LiveRange> { static const bool value = true; };
-
-  raw_ostream& operator<<(raw_ostream& os, const LiveRange &LR);
-
+  /// This class represents the liveness of a register, stack slot, etc.
+  /// It manages an ordered list of Segment objects.
+  /// The Segments are organized in a static single assignment form: At places
+  /// where a new value is defined or different values reach a CFG join a new
+  /// segment with a new value number is used.
+  class LiveRange {
+  public:
 
-  inline bool operator<(SlotIndex V, const LiveRange &LR) {
-    return V < LR.start;
-  }
+    /// This represents a simple continuous liveness interval for a value.
+    /// The start point is inclusive, the end point exclusive. These intervals
+    /// are rendered as [start,end).
+    struct Segment {
+      SlotIndex start;  // Start point of the interval (inclusive)
+      SlotIndex end;    // End point of the interval (exclusive)
+      VNInfo *valno;    // identifier for the value contained in this segment.
 
-  inline bool operator<(const LiveRange &LR, SlotIndex V) {
-    return LR.start < V;
-  }
+      Segment() : valno(0) {}
 
-  /// LiveInterval - This class represents some number of live ranges for a
-  /// register or value.  This class also contains a bit of register allocator
-  /// state.
-  class LiveInterval {
-  public:
+      Segment(SlotIndex S, SlotIndex E, VNInfo *V)
+        : start(S), end(E), valno(V) {
+        assert(S < E && "Cannot create empty or backwards segment");
+      }
 
-    typedef SmallVector<LiveRange,4> Ranges;
-    typedef SmallVector<VNInfo*,4> VNInfoList;
+      /// Return true if the index is covered by this segment.
+      bool contains(SlotIndex I) const {
+        return start <= I && I < end;
+      }
 
-    const unsigned reg;  // the register or stack slot of this interval.
-    float weight;        // weight of this interval
-    Ranges ranges;       // the ranges in which this register is live
-    VNInfoList valnos;   // value#'s
+      /// Return true if the given interval, [S, E), is covered by this segment.
+      bool containsInterval(SlotIndex S, SlotIndex E) const {
+        assert((S < E) && "Backwards interval?");
+        return (start <= S && S < end) && (start < E && E <= end);
+      }
 
-    struct InstrSlots {
-      enum {
-        LOAD  = 0,
-        USE   = 1,
-        DEF   = 2,
-        STORE = 3,
-        NUM   = 4
-      };
+      bool operator<(const Segment &Other) const {
+        return start < Other.start || (start == Other.start && end < Other.end);
+      }
+      bool operator==(const Segment &Other) const {
+        return start == Other.start && end == Other.end;
+      }
 
+      void dump() const;
     };
 
-    LiveInterval(unsigned Reg, float Weight)
-      : reg(Reg), weight(Weight) {}
+    typedef SmallVector<Segment,4> Segments;
+    typedef SmallVector<VNInfo*,4> VNInfoList;
+
+    Segments segments;   // the liveness segments
+    VNInfoList valnos;   // value#'s
 
-    typedef Ranges::iterator iterator;
-    iterator begin() { return ranges.begin(); }
-    iterator end()   { return ranges.end(); }
+    typedef Segments::iterator iterator;
+    iterator begin() { return segments.begin(); }
+    iterator end()   { return segments.end(); }
 
-    typedef Ranges::const_iterator const_iterator;
-    const_iterator begin() const { return ranges.begin(); }
-    const_iterator end() const  { return ranges.end(); }
+    typedef Segments::const_iterator const_iterator;
+    const_iterator begin() const { return segments.begin(); }
+    const_iterator end() const  { return segments.end(); }
 
     typedef VNInfoList::iterator vni_iterator;
     vni_iterator vni_begin() { return valnos.begin(); }
-    vni_iterator vni_end() { return valnos.end(); }
+    vni_iterator vni_end()   { return valnos.end(); }
 
     typedef VNInfoList::const_iterator const_vni_iterator;
     const_vni_iterator vni_begin() const { return valnos.begin(); }
-    const_vni_iterator vni_end() const { return valnos.end(); }
+    const_vni_iterator vni_end() const   { return valnos.end(); }
 
-    /// advanceTo - Advance the specified iterator to point to the LiveRange
+    /// advanceTo - Advance the specified iterator to point to the Segment
     /// containing the specified position, or end() if the position is past the
-    /// end of the interval.  If no LiveRange contains this position, but the
+    /// end of the range.  If no Segment contains this position, but the
     /// position is in a hole, this method returns an iterator pointing to the
-    /// LiveRange immediately after the hole.
+    /// Segment immediately after the hole.
     iterator advanceTo(iterator I, SlotIndex Pos) {
       assert(I != end());
       if (Pos >= endIndex())
@@ -187,22 +217,26 @@ namespace llvm {
       return I;
     }
 
-    /// find - Return an iterator pointing to the first range that ends after
+    /// find - Return an iterator pointing to the first segment that ends after
     /// Pos, or end(). This is the same as advanceTo(begin(), Pos), but faster
-    /// when searching large intervals.
+    /// when searching large ranges.
     ///
-    /// If Pos is contained in a LiveRange, that range is returned.
-    /// If Pos is in a hole, the following LiveRange is returned.
+    /// If Pos is contained in a Segment, that segment is returned.
+    /// If Pos is in a hole, the following Segment is returned.
     /// If Pos is beyond endIndex, end() is returned.
     iterator find(SlotIndex Pos);
 
     const_iterator find(SlotIndex Pos) const {
-      return const_cast<LiveInterval*>(this)->find(Pos);
+      return const_cast<LiveRange*>(this)->find(Pos);
     }
 
     void clear() {
       valnos.clear();
-      ranges.clear();
+      segments.clear();
+    }
+
+    size_t size() const {
+      return segments.size();
     }
 
     bool hasAtLeastOneValue() const { return !valnos.empty(); }
@@ -220,7 +254,7 @@ namespace llvm {
       return valnos[ValNo];
     }
 
-    /// containsValue - Returns true if VNI belongs to this interval.
+    /// containsValue - Returns true if VNI belongs to this range.
     bool containsValue(const VNInfo *VNI) const {
       return VNI && VNI->id < getNumValNums() && VNI == getValNumInfo(VNI->id);
     }
@@ -234,7 +268,7 @@ namespace llvm {
       return VNI;
     }
 
-    /// createDeadDef - Make sure the interval has a value defined at Def.
+    /// createDeadDef - Make sure the range has a value defined at Def.
     /// If one already exists, return it. Otherwise allocate a new value and
     /// add liveness for a dead def.
     VNInfo *createDeadDef(SlotIndex Def, VNInfo::Allocator &VNInfoAllocator);
@@ -251,42 +285,42 @@ namespace llvm {
 
     /// RenumberValues - Renumber all values in order of appearance and remove
     /// unused values.
-    void RenumberValues(LiveIntervals &lis);
+    void RenumberValues();
 
-    /// MergeValueNumberInto - This method is called when two value nubmers
+    /// MergeValueNumberInto - This method is called when two value numbers
     /// are found to be equivalent.  This eliminates V1, replacing all
-    /// LiveRanges with the V1 value number with the V2 value number.  This can
+    /// segments with the V1 value number with the V2 value number.  This can
     /// cause merging of V1/V2 values numbers and compaction of the value space.
     VNInfo* MergeValueNumberInto(VNInfo *V1, VNInfo *V2);
 
-    /// MergeValueInAsValue - Merge all of the live ranges of a specific val#
-    /// in RHS into this live interval as the specified value number.
-    /// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
-    /// current interval, it will replace the value numbers of the overlaped
-    /// live ranges with the specified value number.
-    void MergeRangesInAsValue(const LiveInterval &RHS, VNInfo *LHSValNo);
-
-    /// MergeValueInAsValue - Merge all of the live ranges of a specific val#
-    /// in RHS into this live interval as the specified value number.
-    /// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
-    /// current interval, but only if the overlapping LiveRanges have the
+    /// Merge all of the live segments of a specific val# in RHS into this live
+    /// range as the specified value number. The segments in RHS are allowed
+    /// to overlap with segments in the current range, it will replace the
+    /// value numbers of the overlaped live segments with the specified value
+    /// number.
+    void MergeSegmentsInAsValue(const LiveRange &RHS, VNInfo *LHSValNo);
+
+    /// MergeValueInAsValue - Merge all of the segments of a specific val#
+    /// in RHS into this live range as the specified value number.
+    /// The segments in RHS are allowed to overlap with segments in the
+    /// current range, but only if the overlapping segments have the
     /// specified value number.
-    void MergeValueInAsValue(const LiveInterval &RHS,
+    void MergeValueInAsValue(const LiveRange &RHS,
                              const VNInfo *RHSValNo, VNInfo *LHSValNo);
 
-    bool empty() const { return ranges.empty(); }
+    bool empty() const { return segments.empty(); }
 
-    /// beginIndex - Return the lowest numbered slot covered by interval.
+    /// beginIndex - Return the lowest numbered slot covered.
     SlotIndex beginIndex() const {
-      assert(!empty() && "Call to beginIndex() on empty interval.");
-      return ranges.front().start;
+      assert(!empty() && "Call to beginIndex() on empty range.");
+      return segments.front().start;
     }
 
-    /// endNumber - return the maximum point of the interval of the whole,
+    /// endNumber - return the maximum point of the range of the whole,
     /// exclusive.
     SlotIndex endIndex() const {
-      assert(!empty() && "Call to endIndex() on empty interval.");
-      return ranges.back().end;
+      assert(!empty() && "Call to endIndex() on empty range.");
+      return segments.back().end;
     }
 
     bool expiredAt(SlotIndex index) const {
@@ -298,31 +332,23 @@ namespace llvm {
       return r != end() && r->start <= index;
     }
 
-    /// killedAt - Return true if a live range ends at index. Note that the kill
-    /// point is not contained in the half-open live range. It is usually the
-    /// getDefIndex() slot following its last use.
-    bool killedAt(SlotIndex index) const {
-      const_iterator r = find(index.getRegSlot(true));
-      return r != end() && r->end == index;
-    }
-
-    /// getLiveRangeContaining - Return the live range that contains the
-    /// specified index, or null if there is none.
-    const LiveRange *getLiveRangeContaining(SlotIndex Idx) const {
-      const_iterator I = FindLiveRangeContaining(Idx);
+    /// Return the segment that contains the specified index, or null if there
+    /// is none.
+    const Segment *getSegmentContaining(SlotIndex Idx) const {
+      const_iterator I = FindSegmentContaining(Idx);
       return I == end() ? 0 : &*I;
     }
 
-    /// getLiveRangeContaining - Return the live range that contains the
-    /// specified index, or null if there is none.
-    LiveRange *getLiveRangeContaining(SlotIndex Idx) {
-      iterator I = FindLiveRangeContaining(Idx);
+    /// Return the live segment that contains the specified index, or null if
+    /// there is none.
+    Segment *getSegmentContaining(SlotIndex Idx) {
+      iterator I = FindSegmentContaining(Idx);
       return I == end() ? 0 : &*I;
     }
 
     /// getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
     VNInfo *getVNInfoAt(SlotIndex Idx) const {
-      const_iterator I = FindLiveRangeContaining(Idx);
+      const_iterator I = FindSegmentContaining(Idx);
       return I == end() ? 0 : I->valno;
     }
 
@@ -330,76 +356,68 @@ namespace llvm {
     /// necessarilly including Idx, or NULL. Use this to find the reaching def
     /// used by an instruction at this SlotIndex position.
     VNInfo *getVNInfoBefore(SlotIndex Idx) const {
-      const_iterator I = FindLiveRangeContaining(Idx.getPrevSlot());
+      const_iterator I = FindSegmentContaining(Idx.getPrevSlot());
       return I == end() ? 0 : I->valno;
     }
 
-    /// FindLiveRangeContaining - Return an iterator to the live range that
-    /// contains the specified index, or end() if there is none.
-    iterator FindLiveRangeContaining(SlotIndex Idx) {
+    /// Return an iterator to the segment that contains the specified index, or
+    /// end() if there is none.
+    iterator FindSegmentContaining(SlotIndex Idx) {
       iterator I = find(Idx);
       return I != end() && I->start <= Idx ? I : end();
     }
 
-    const_iterator FindLiveRangeContaining(SlotIndex Idx) const {
+    const_iterator FindSegmentContaining(SlotIndex Idx) const {
       const_iterator I = find(Idx);
       return I != end() && I->start <= Idx ? I : end();
     }
 
-    /// overlaps - Return true if the intersection of the two live intervals is
+    /// overlaps - Return true if the intersection of the two live ranges is
     /// not empty.
-    bool overlaps(const LiveInterval& other) const {
+    bool overlaps(const LiveRange &other) const {
       if (other.empty())
         return false;
       return overlapsFrom(other, other.begin());
     }
 
-    /// overlaps - Return true if the two intervals have overlapping segments
+    /// overlaps - Return true if the two ranges have overlapping segments
     /// that are not coalescable according to CP.
     ///
-    /// Overlapping segments where one interval is defined by a coalescable
+    /// Overlapping segments where one range is defined by a coalescable
     /// copy are allowed.
-    bool overlaps(const LiveInterval &Other, const CoalescerPair &CP,
+    bool overlaps(const LiveRange &Other, const CoalescerPair &CP,
                   const SlotIndexes&) const;
 
-    /// overlaps - Return true if the live interval overlaps a range specified
+    /// overlaps - Return true if the live range overlaps an interval specified
     /// by [Start, End).
     bool overlaps(SlotIndex Start, SlotIndex End) const;
 
-    /// overlapsFrom - Return true if the intersection of the two live intervals
+    /// overlapsFrom - Return true if the intersection of the two live ranges
     /// is not empty.  The specified iterator is a hint that we can begin
-    /// scanning the Other interval starting at I.
-    bool overlapsFrom(const LiveInterval& other, const_iterator I) const;
+    /// scanning the Other range starting at I.
+    bool overlapsFrom(const LiveRange &Other, const_iterator I) const;
 
-    /// addRange - Add the specified LiveRange to this interval, merging
-    /// intervals as appropriate.  This returns an iterator to the inserted live
-    /// range (which may have grown since it was inserted.
-    iterator addRange(LiveRange LR) {
-      return addRangeFrom(LR, ranges.begin());
+    /// Add the specified Segment to this range, merging segments as
+    /// appropriate.  This returns an iterator to the inserted segment (which
+    /// may have grown since it was inserted).
+    iterator addSegment(Segment S) {
+      return addSegmentFrom(S, segments.begin());
     }
 
-    /// extendInBlock - If this interval is live before Kill in the basic block
+    /// extendInBlock - If this range is live before Kill in the basic block
     /// that starts at StartIdx, extend it to be live up to Kill, and return
-    /// the value. If there is no live range before Kill, return NULL.
+    /// the value. If there is no segment before Kill, return NULL.
     VNInfo *extendInBlock(SlotIndex StartIdx, SlotIndex Kill);
 
-    /// join - Join two live intervals (this, and other) together.  This applies
-    /// mappings to the value numbers in the LHS/RHS intervals as specified.  If
-    /// the intervals are not joinable, this aborts.
-    void join(LiveInterval &Other,
+    /// join - Join two live ranges (this, and other) together.  This applies
+    /// mappings to the value numbers in the LHS/RHS ranges as specified.  If
+    /// the ranges are not joinable, this aborts.
+    void join(LiveRange &Other,
               const int *ValNoAssignments,
               const int *RHSValNoAssignments,
-              SmallVectorImpl<VNInfo *> &NewVNInfo,
-              MachineRegisterInfo *MRI);
+              SmallVectorImpl<VNInfo *> &NewVNInfo);
 
-    /// isInOneLiveRange - Return true if the range specified is entirely in the
-    /// a single LiveRange of the live interval.
-    bool isInOneLiveRange(SlotIndex Start, SlotIndex End) const {
-      const_iterator r = find(Start);
-      return r != end() && r->containsRange(Start, End);
-    }
-
-    /// True iff this live range is a single segment that lies between the
+    /// True iff this segment is a single segment that lies between the
     /// specified boundaries, exclusively. Vregs live across a backedge are not
     /// considered local. The boundaries are expected to lie within an extended
     /// basic block, so vregs that are not live out should contain no holes.
@@ -408,25 +426,63 @@ namespace llvm {
         endIndex() < End.getBoundaryIndex();
     }
 
-    /// removeRange - Remove the specified range from this interval.  Note that
-    /// the range must be a single LiveRange in its entirety.
-    void removeRange(SlotIndex Start, SlotIndex End,
-                     bool RemoveDeadValNo = false);
+    /// Remove the specified segment from this range.  Note that the segment
+    /// must be a single Segment in its entirety.
+    void removeSegment(SlotIndex Start, SlotIndex End,
+                       bool RemoveDeadValNo = false);
 
-    void removeRange(LiveRange LR, bool RemoveDeadValNo = false) {
-      removeRange(LR.start, LR.end, RemoveDeadValNo);
+    void removeSegment(Segment S, bool RemoveDeadValNo = false) {
+      removeSegment(S.start, S.end, RemoveDeadValNo);
     }
 
-    /// removeValNo - Remove all the ranges defined by the specified value#.
+    /// Query Liveness at Idx.
+    /// The sub-instruction slot of Idx doesn't matter, only the instruction
+    /// it refers to is considered.
+    LiveQueryResult Query(SlotIndex Idx) const {
+      // Find the segment that enters the instruction.
+      const_iterator I = find(Idx.getBaseIndex());
+      const_iterator E = end();
+      if (I == E)
+        return LiveQueryResult(0, 0, SlotIndex(), false);
+
+      // Is this an instruction live-in segment?
+      // If Idx is the start index of a basic block, include live-in segments
+      // that start at Idx.getBaseIndex().
+      VNInfo *EarlyVal = 0;
+      VNInfo *LateVal  = 0;
+      SlotIndex EndPoint;
+      bool Kill = false;
+      if (I->start <= Idx.getBaseIndex()) {
+        EarlyVal = I->valno;
+        EndPoint = I->end;
+        // Move to the potentially live-out segment.
+        if (SlotIndex::isSameInstr(Idx, I->end)) {
+          Kill = true;
+          if (++I == E)
+            return LiveQueryResult(EarlyVal, LateVal, EndPoint, Kill);
+        }
+        // Special case: A PHIDef value can have its def in the middle of a
+        // segment if the value happens to be live out of the layout
+        // predecessor.
+        // Such a value is not live-in.
+        if (EarlyVal->def == Idx.getBaseIndex())
+          EarlyVal = 0;
+      }
+      // I now points to the segment that may be live-through, or defined by
+      // this instr. Ignore segments starting after the current instr.
+      if (!SlotIndex::isEarlierInstr(Idx, I->start)) {
+        LateVal = I->valno;
+        EndPoint = I->end;
+      }
+      return LiveQueryResult(EarlyVal, LateVal, EndPoint, Kill);
+    }
+
+    /// removeValNo - Remove all the segments defined by the specified value#.
     /// Also remove the value# from value# list.
     void removeValNo(VNInfo *ValNo);
 
-    /// getSize - Returns the sum of sizes of all the LiveRange's.
-    ///
-    unsigned getSize() const;
-
-    /// Returns true if the live interval is zero length, i.e. no live ranges
-    /// span instructions. It doesn't pay to spill such an interval.
+    /// Returns true if the live range is zero length, i.e. no live segments
+    /// span instructions. It doesn't pay to spill such a range.
     bool isZeroLength(SlotIndexes *Indexes) const {
       for (const_iterator i = begin(), e = end(); i != e; ++i)
         if (Indexes->getNextNonNullIndex(i->start).getBaseIndex() <
@@ -435,27 +491,16 @@ namespace llvm {
       return true;
     }
 
-    /// isSpillable - Can this interval be spilled?
-    bool isSpillable() const {
-      return weight != HUGE_VALF;
-    }
-
-    /// markNotSpillable - Mark interval as not spillable
-    void markNotSpillable() {
-      weight = HUGE_VALF;
-    }
-
-    bool operator<(const LiveInterval& other) const {
+    bool operator<(const LiveRange& other) const {
       const SlotIndex &thisIndex = beginIndex();
       const SlotIndex &otherIndex = other.beginIndex();
-      return (thisIndex < otherIndex ||
-              (thisIndex == otherIndex && reg < other.reg));
+      return thisIndex < otherIndex;
     }
 
     void print(raw_ostream &OS) const;
     void dump() const;
 
-    /// \brief Walk the interval and assert if any invariants fail to hold.
+    /// \brief Walk the range and assert if any invariants fail to hold.
     ///
     /// Note that this is a no-op when asserts are disabled.
 #ifdef NDEBUG
@@ -466,11 +511,55 @@ namespace llvm {
 
   private:
 
-    Ranges::iterator addRangeFrom(LiveRange LR, Ranges::iterator From);
-    void extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd);
-    Ranges::iterator extendIntervalStartTo(Ranges::iterator I, SlotIndex NewStr);
+    iterator addSegmentFrom(Segment S, iterator From);
+    void extendSegmentEndTo(iterator I, SlotIndex NewEnd);
+    iterator extendSegmentStartTo(iterator I, SlotIndex NewStr);
     void markValNoForDeletion(VNInfo *V);
 
+  };
+
+  inline raw_ostream &operator<<(raw_ostream &OS, const LiveRange &LR) {
+    LR.print(OS);
+    return OS;
+  }
+
+  /// LiveInterval - This class represents the liveness of a register,
+  /// or stack slot.
+  class LiveInterval : public LiveRange {
+  public:
+    typedef LiveRange super;
+
+    const unsigned reg;  // the register or stack slot of this interval.
+    float weight;        // weight of this interval
+
+    LiveInterval(unsigned Reg, float Weight)
+      : reg(Reg), weight(Weight) {}
+
+    /// getSize - Returns the sum of sizes of all the LiveRange's.
+    ///
+    unsigned getSize() const;
+
+    /// isSpillable - Can this interval be spilled?
+    bool isSpillable() const {
+      return weight != llvm::huge_valf;
+    }
+
+    /// markNotSpillable - Mark interval as not spillable
+    void markNotSpillable() {
+      weight = llvm::huge_valf;
+    }
+
+    bool operator<(const LiveInterval& other) const {
+      const SlotIndex &thisIndex = beginIndex();
+      const SlotIndex &otherIndex = other.beginIndex();
+      return thisIndex < otherIndex ||
+              (thisIndex == otherIndex && reg < other.reg);
+    }
+
+    void print(raw_ostream &OS) const;
+    void dump() const;
+
+  private:
     LiveInterval& operator=(const LiveInterval& rhs) LLVM_DELETED_FUNCTION;
 
   };
@@ -480,54 +569,65 @@ namespace llvm {
     return OS;
   }
 
-  /// Helper class for performant LiveInterval bulk updates.
+  raw_ostream &operator<<(raw_ostream &OS, const LiveRange::Segment &S);
+
+  inline bool operator<(SlotIndex V, const LiveRange::Segment &S) {
+    return V < S.start;
+  }
+
+  inline bool operator<(const LiveRange::Segment &S, SlotIndex V) {
+    return S.start < V;
+  }
+
+  /// Helper class for performant LiveRange bulk updates.
   ///
-  /// Calling LiveInterval::addRange() repeatedly can be expensive on large
+  /// Calling LiveRange::addSegment() repeatedly can be expensive on large
   /// live ranges because segments after the insertion point may need to be
   /// shifted. The LiveRangeUpdater class can defer the shifting when adding
   /// many segments in order.
   ///
-  /// The LiveInterval will be in an invalid state until flush() is called.
+  /// The LiveRange will be in an invalid state until flush() is called.
   class LiveRangeUpdater {
-    LiveInterval *LI;
+    LiveRange *LR;
     SlotIndex LastStart;
-    LiveInterval::iterator WriteI;
-    LiveInterval::iterator ReadI;
-    SmallVector<LiveRange, 16> Spills;
+    LiveRange::iterator WriteI;
+    LiveRange::iterator ReadI;
+    SmallVector<LiveRange::Segment, 16> Spills;
     void mergeSpills();
 
   public:
-    /// Create a LiveRangeUpdater for adding segments to LI.
-    /// LI will temporarily be in an invalid state until flush() is called.
-    LiveRangeUpdater(LiveInterval *li = 0) : LI(li) {}
+    /// Create a LiveRangeUpdater for adding segments to LR.
+    /// LR will temporarily be in an invalid state until flush() is called.
+    LiveRangeUpdater(LiveRange *lr = 0) : LR(lr) {}
 
     ~LiveRangeUpdater() { flush(); }
 
-    /// Add a segment to LI and coalesce when possible, just like LI.addRange().
-    /// Segments should be added in increasing start order for best performance.
-    void add(LiveRange);
+    /// Add a segment to LR and coalesce when possible, just like
+    /// LR.addSegment(). Segments should be added in increasing start order for
+    /// best performance.
+    void add(LiveRange::Segment);
 
     void add(SlotIndex Start, SlotIndex End, VNInfo *VNI) {
-      add(LiveRange(Start, End, VNI));
+      add(LiveRange::Segment(Start, End, VNI));
     }
 
-    /// Return true if the LI is currently in an invalid state, and flush()
+    /// Return true if the LR is currently in an invalid state, and flush()
     /// needs to be called.
     bool isDirty() const { return LastStart.isValid(); }
 
-    /// Flush the updater state to LI so it is valid and contains all added
+    /// Flush the updater state to LR so it is valid and contains all added
     /// segments.
     void flush();
 
     /// Select a different destination live range.
-    void setDest(LiveInterval *li) {
-      if (LI != li && isDirty())
+    void setDest(LiveRange *lr) {
+      if (LR != lr && isDirty())
         flush();
-      LI = li;
+      LR = lr;
     }
 
     /// Get the current destination live range.
-    LiveInterval *getDest() const { return LI; }
+    LiveRange *getDest() const { return LR; }
 
     void dump() const;
     void print(raw_ostream&) const;
@@ -538,99 +638,6 @@ namespace llvm {
     return OS;
   }
 
-  /// LiveRangeQuery - Query information about a live range around a given
-  /// instruction. This class hides the implementation details of live ranges,
-  /// and it should be used as the primary interface for examining live ranges
-  /// around instructions.
-  ///
-  class LiveRangeQuery {
-    VNInfo *EarlyVal;
-    VNInfo *LateVal;
-    SlotIndex EndPoint;
-    bool Kill;
-
-  public:
-    /// Create a LiveRangeQuery for the given live range and instruction index.
-    /// The sub-instruction slot of Idx doesn't matter, only the instruction it
-    /// refers to is considered.
-    LiveRangeQuery(const LiveInterval &LI, SlotIndex Idx)
-      : EarlyVal(0), LateVal(0), Kill(false) {
-      // Find the segment that enters the instruction.
-      LiveInterval::const_iterator I = LI.find(Idx.getBaseIndex());
-      LiveInterval::const_iterator E = LI.end();
-      if (I == E)
-        return;
-      // Is this an instruction live-in segment?
-      // If Idx is the start index of a basic block, include live-in segments
-      // that start at Idx.getBaseIndex().
-      if (I->start <= Idx.getBaseIndex()) {
-        EarlyVal = I->valno;
-        EndPoint = I->end;
-        // Move to the potentially live-out segment.
-        if (SlotIndex::isSameInstr(Idx, I->end)) {
-          Kill = true;
-          if (++I == E)
-            return;
-        }
-        // Special case: A PHIDef value can have its def in the middle of a
-        // segment if the value happens to be live out of the layout
-        // predecessor.
-        // Such a value is not live-in.
-        if (EarlyVal->def == Idx.getBaseIndex())
-          EarlyVal = 0;
-      }
-      // I now points to the segment that may be live-through, or defined by
-      // this instr. Ignore segments starting after the current instr.
-      if (SlotIndex::isEarlierInstr(Idx, I->start))
-        return;
-      LateVal = I->valno;
-      EndPoint = I->end;
-    }
-
-    /// Return the value that is live-in to the instruction. This is the value
-    /// that will be read by the instruction's use operands. Return NULL if no
-    /// value is live-in.
-    VNInfo *valueIn() const {
-      return EarlyVal;
-    }
-
-    /// Return true if the live-in value is killed by this instruction. This
-    /// means that either the live range ends at the instruction, or it changes
-    /// value.
-    bool isKill() const {
-      return Kill;
-    }
-
-    /// Return true if this instruction has a dead def.
-    bool isDeadDef() const {
-      return EndPoint.isDead();
-    }
-
-    /// Return the value leaving the instruction, if any. This can be a
-    /// live-through value, or a live def. A dead def returns NULL.
-    VNInfo *valueOut() const {
-      return isDeadDef() ? 0 : LateVal;
-    }
-
-    /// Return the value defined by this instruction, if any. This includes
-    /// dead defs, it is the value created by the instruction's def operands.
-    VNInfo *valueDefined() const {
-      return EarlyVal == LateVal ? 0 : LateVal;
-    }
-
-    /// Return the end point of the last live range segment to interact with
-    /// the instruction, if any.
-    ///
-    /// The end point is an invalid SlotIndex only if the live range doesn't
-    /// intersect the instruction at all.
-    ///
-    /// The end point may be at or past the end of the instruction's basic
-    /// block. That means the value was live out of the block.
-    SlotIndex endPoint() const {
-      return EndPoint;
-    }
-  };
-
   /// ConnectedVNInfoEqClasses - Helper class that can divide VNInfos in a
   /// LiveInterval into equivalence clases of connected components. A
   /// LiveInterval that has multiple connected components can be broken into
diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h
index ffb07a5..d8437f0 100644
--- a/include/llvm/CodeGen/LiveIntervalAnalysis.h
+++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h
@@ -90,9 +90,9 @@ namespace llvm {
     /// block.
     SmallVector<std::pair<unsigned, unsigned>, 8> RegMaskBlocks;
 
-    /// RegUnitIntervals - Keep a live interval for each register unit as a way
-    /// of tracking fixed physreg interference.
-    SmallVector<LiveInterval*, 0> RegUnitIntervals;
+    /// Keeps a live range set for each register unit to track fixed physreg
+    /// interference.
+    SmallVector<LiveRange*, 0> RegUnitRanges;
 
   public:
     static char ID; // Pass identification, replacement for typeid
@@ -103,9 +103,10 @@ namespace llvm {
     static float getSpillWeight(bool isDef, bool isUse, BlockFrequency freq);
 
     LiveInterval &getInterval(unsigned Reg) {
-      LiveInterval *LI = VirtRegIntervals[Reg];
-      assert(LI && "Interval does not exist for virtual register");
-      return *LI;
+      if (hasInterval(Reg))
+        return *VirtRegIntervals[Reg];
+      else
+        return createAndComputeVirtRegInterval(Reg);
     }
 
     const LiveInterval &getInterval(unsigned Reg) const {
@@ -117,12 +118,17 @@ namespace llvm {
     }
 
     // Interval creation.
-    LiveInterval &getOrCreateInterval(unsigned Reg) {
-      if (!hasInterval(Reg)) {
-        VirtRegIntervals.grow(Reg);
-        VirtRegIntervals[Reg] = createInterval(Reg);
-      }
-      return getInterval(Reg);
+    LiveInterval &createEmptyInterval(unsigned Reg) {
+      assert(!hasInterval(Reg) && "Interval already exists!");
+      VirtRegIntervals.grow(Reg);
+      VirtRegIntervals[Reg] = createInterval(Reg);
+      return *VirtRegIntervals[Reg];
+    }
+
+    LiveInterval &createAndComputeVirtRegInterval(unsigned Reg) {
+      LiveInterval &LI = createEmptyInterval(Reg);
+      computeVirtRegInterval(LI);
+      return LI;
     }
 
     // Interval removal.
@@ -131,10 +137,10 @@ namespace llvm {
       VirtRegIntervals[Reg] = 0;
     }
 
-    /// addLiveRangeToEndOfBlock - Given a register and an instruction,
-    /// adds a live range from that instruction to the end of its MBB.
-    LiveRange addLiveRangeToEndOfBlock(unsigned reg,
-                                       MachineInstr* startInst);
+    /// Given a register and an instruction, adds a live segment from that
+    /// instruction to the end of its MBB.
+    LiveInterval::Segment addSegmentToEndOfBlock(unsigned reg,
+                                                 MachineInstr* startInst);
 
     /// shrinkToUses - After removing some uses of a register, shrink its live
     /// range to just the remaining uses. This method does not compute reaching
@@ -154,7 +160,7 @@ namespace llvm {
     /// extended to be live out of the basic block.
     ///
     /// See also LiveRangeCalc::extend().
-    void extendToIndices(LiveInterval *LI, ArrayRef<SlotIndex> Indices);
+    void extendToIndices(LiveRange &LR, ArrayRef<SlotIndex> Indices);
 
     /// pruneValue - If an LI value is live at Kill, prune its live range by
     /// removing any liveness reachable from Kill. Add live range end points to
@@ -200,14 +206,14 @@ namespace llvm {
       return Indexes->getMBBEndIdx(mbb);
     }
 
-    bool isLiveInToMBB(const LiveInterval &li,
+    bool isLiveInToMBB(const LiveRange &LR,
                        const MachineBasicBlock *mbb) const {
-      return li.liveAt(getMBBStartIdx(mbb));
+      return LR.liveAt(getMBBStartIdx(mbb));
     }
 
-    bool isLiveOutOfMBB(const LiveInterval &li,
+    bool isLiveOutOfMBB(const LiveRange &LR,
                         const MachineBasicBlock *mbb) const {
-      return li.liveAt(getMBBEndIdx(mbb).getPrevSlot());
+      return LR.liveAt(getMBBEndIdx(mbb).getPrevSlot());
     }
 
     MachineBasicBlock* getMBBFromIndex(SlotIndex index) const {
@@ -225,6 +231,12 @@ namespace llvm {
       return Indexes->insertMachineInstrInMaps(MI);
     }
 
+    void InsertMachineInstrRangeInMaps(MachineBasicBlock::iterator B,
+                                       MachineBasicBlock::iterator E) {
+      for (MachineBasicBlock::iterator I = B; I != E; ++I)
+        Indexes->insertMachineInstrInMaps(I);
+    }
+
     void RemoveMachineInstrFromMaps(MachineInstr *MI) {
       Indexes->removeMachineInstrFromMaps(MI);
     }
@@ -352,24 +364,24 @@ namespace llvm {
 
     /// getRegUnit - Return the live range for Unit.
     /// It will be computed if it doesn't exist.
-    LiveInterval &getRegUnit(unsigned Unit) {
-      LiveInterval *LI = RegUnitIntervals[Unit];
-      if (!LI) {
+    LiveRange &getRegUnit(unsigned Unit) {
+      LiveRange *LR = RegUnitRanges[Unit];
+      if (!LR) {
         // Compute missing ranges on demand.
-        RegUnitIntervals[Unit] = LI = new LiveInterval(Unit, HUGE_VALF);
-        computeRegUnitInterval(LI);
+        RegUnitRanges[Unit] = LR = new LiveRange();
+        computeRegUnitRange(*LR, Unit);
       }
-      return *LI;
+      return *LR;
     }
 
     /// getCachedRegUnit - Return the live range for Unit if it has already
     /// been computed, or NULL if it hasn't been computed yet.
-    LiveInterval *getCachedRegUnit(unsigned Unit) {
-      return RegUnitIntervals[Unit];
+    LiveRange *getCachedRegUnit(unsigned Unit) {
+      return RegUnitRanges[Unit];
     }
 
-    const LiveInterval *getCachedRegUnit(unsigned Unit) const {
-      return RegUnitIntervals[Unit];
+    const LiveRange *getCachedRegUnit(unsigned Unit) const {
+      return RegUnitRanges[Unit];
     }
 
   private:
@@ -385,8 +397,8 @@ namespace llvm {
     void dumpInstrs() const;
 
     void computeLiveInRegUnits();
-    void computeRegUnitInterval(LiveInterval*);
-    void computeVirtRegInterval(LiveInterval*);
+    void computeRegUnitRange(LiveRange&, unsigned Unit);
+    void computeVirtRegInterval(LiveInterval&);
 
     class HMEditor;
   };
diff --git a/include/llvm/CodeGen/LiveIntervalUnion.h b/include/llvm/CodeGen/LiveIntervalUnion.h
index 615b339..95933d1 100644
--- a/include/llvm/CodeGen/LiveIntervalUnion.h
+++ b/include/llvm/CodeGen/LiveIntervalUnion.h
@@ -32,7 +32,7 @@ typedef SparseBitVector<128> LiveVirtRegBitSet;
 
 /// Compare a live virtual register segment to a LiveIntervalUnion segment.
 inline bool
-overlap(const LiveRange &VRSeg,
+overlap(const LiveInterval::Segment &VRSeg,
         const IntervalMap<SlotIndex, LiveInterval*>::const_iterator &LUSeg) {
   return VRSeg.start < LUSeg.stop() && LUSeg.start() < VRSeg.end;
 }
diff --git a/include/llvm/CodeGen/LiveRangeEdit.h b/include/llvm/CodeGen/LiveRangeEdit.h
index a8749da..7edf67c 100644
--- a/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/include/llvm/CodeGen/LiveRangeEdit.h
@@ -22,6 +22,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
@@ -30,10 +31,9 @@ class AliasAnalysis;
 class LiveIntervals;
 class MachineBlockFrequencyInfo;
 class MachineLoopInfo;
-class MachineRegisterInfo;
 class VirtRegMap;
 
-class LiveRangeEdit {
+class LiveRangeEdit : private MachineRegisterInfo::Delegate {
 public:
   /// Callback methods for LiveRangeEdit owners.
   class Delegate {
@@ -58,7 +58,7 @@ public:
 
 private:
   LiveInterval *Parent;
-  SmallVectorImpl<LiveInterval*> &NewRegs;
+  SmallVectorImpl<unsigned> &NewRegs;
   MachineRegisterInfo &MRI;
   LiveIntervals &LIS;
   VirtRegMap *VRM;
@@ -97,6 +97,10 @@ private:
   /// Helper for eliminateDeadDefs.
   void eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink);
 
+  /// MachineRegisterInfo callback to notify when new virtual
+  /// registers are created.
+  void MRI_NoteNewVirtualRegister(unsigned VReg);
+
 public:
   /// Create a LiveRangeEdit for breaking down parent into smaller pieces.
   /// @param parent The register being spilled or split.
@@ -108,7 +112,7 @@ public:
   ///            function.  If NULL, no virtual register map updates will
   ///            be done.  This could be the case if called before Regalloc.
   LiveRangeEdit(LiveInterval *parent,
-                SmallVectorImpl<LiveInterval*> &newRegs,
+                SmallVectorImpl<unsigned> &newRegs,
                 MachineFunction &MF,
                 LiveIntervals &lis,
                 VirtRegMap *vrm,
@@ -118,7 +122,9 @@ public:
       TII(*MF.getTarget().getInstrInfo()),
       TheDelegate(delegate),
       FirstNew(newRegs.size()),
-      ScannedRemattable(false) {}
+      ScannedRemattable(false) { MRI.setDelegate(this); }
+
+  ~LiveRangeEdit() { MRI.resetDelegate(this); }
 
   LiveInterval &getParent() const {
    assert(Parent && "No parent LiveInterval");
@@ -127,23 +133,30 @@ public:
   unsigned getReg() const { return getParent().reg; }
 
   /// Iterator for accessing the new registers added by this edit.
-  typedef SmallVectorImpl<LiveInterval*>::const_iterator iterator;
+  typedef SmallVectorImpl<unsigned>::const_iterator iterator;
   iterator begin() const { return NewRegs.begin()+FirstNew; }
   iterator end() const { return NewRegs.end(); }
   unsigned size() const { return NewRegs.size()-FirstNew; }
   bool empty() const { return size() == 0; }
-  LiveInterval *get(unsigned idx) const { return NewRegs[idx+FirstNew]; }
+  unsigned get(unsigned idx) const { return NewRegs[idx+FirstNew]; }
 
-  ArrayRef<LiveInterval*> regs() const {
+  ArrayRef<unsigned> regs() const {
     return makeArrayRef(NewRegs).slice(FirstNew);
   }
 
+  /// createEmptyIntervalFrom - Create a new empty interval based on OldReg.
+  LiveInterval &createEmptyIntervalFrom(unsigned OldReg);
+
   /// createFrom - Create a new virtual register based on OldReg.
-  LiveInterval &createFrom(unsigned OldReg);
+  unsigned createFrom(unsigned OldReg);
 
   /// create - Create a new register with the same class and original slot as
   /// parent.
-  LiveInterval &create() {
+  LiveInterval &createEmptyInterval() {
+    return createEmptyIntervalFrom(getReg());
+  }
+
+  unsigned create() {
     return createFrom(getReg());
   }
 
diff --git a/include/llvm/CodeGen/LiveRegUnits.h b/include/llvm/CodeGen/LiveRegUnits.h
new file mode 100644
index 0000000..02b9c55
--- /dev/null
+++ b/include/llvm/CodeGen/LiveRegUnits.h
@@ -0,0 +1,88 @@
+//===-- llvm/CodeGen/LiveRegUnits.h - Live register unit set ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a Set of live register units. This can be used for ad
+// hoc liveness tracking after register allocation. You can start with the
+// live-ins/live-outs at the beginning/end of a block and update the information
+// while walking the instructions inside the block.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LIVEREGUNITS_H
+#define LLVM_CODEGEN_LIVEREGUNITS_H
+
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+
+namespace llvm {
+
+class MachineInstr;
+
+/// A set of live register units with functions to track liveness when walking
+/// backward/forward through a basic block.
+class LiveRegUnits {
+  SparseSet<unsigned> LiveUnits;
+
+  LiveRegUnits(const LiveRegUnits&) LLVM_DELETED_FUNCTION;
+  LiveRegUnits &operator=(const LiveRegUnits&) LLVM_DELETED_FUNCTION;
+public:
+  /// \brief Constructs a new empty LiveRegUnits set.
+  LiveRegUnits() {}
+
+  void init(const TargetRegisterInfo *TRI) {
+    LiveUnits.clear();
+    LiveUnits.setUniverse(TRI->getNumRegs());
+  }
+
+  void clear() { LiveUnits.clear(); }
+
+  bool empty() const { return LiveUnits.empty(); }
+
+  /// \brief Adds a register to the set.
+  void addReg(unsigned Reg, const MCRegisterInfo &MCRI) {
+    for (MCRegUnitIterator RUnits(Reg, &MCRI); RUnits.isValid(); ++RUnits)
+      LiveUnits.insert(*RUnits);
+  }
+
+  /// \brief Removes a register from the set.
+  void removeReg(unsigned Reg, const MCRegisterInfo &MCRI) {
+    for (MCRegUnitIterator RUnits(Reg, &MCRI); RUnits.isValid(); ++RUnits)
+      LiveUnits.erase(*RUnits);
+  }
+
+  /// \brief Removes registers clobbered by the regmask operand @p Op.
+  void removeRegsInMask(const MachineOperand &Op, const MCRegisterInfo &MCRI);
+
+  /// \brief Returns true if register @p Reg (or one of its super register) is
+  /// contained in the set.
+  bool contains(unsigned Reg, const MCRegisterInfo &MCRI) const {
+    for (MCRegUnitIterator RUnits(Reg, &MCRI); RUnits.isValid(); ++RUnits) {
+      if (LiveUnits.count(*RUnits))
+        return true;
+    }
+    return false;
+  }
+
+  /// \brief Simulates liveness when stepping backwards over an
+  /// instruction(bundle): Remove Defs, add uses.
+  void stepBackward(const MachineInstr &MI, const MCRegisterInfo &MCRI);
+
+  /// \brief Simulates liveness when stepping forward over an
+  /// instruction(bundle): Remove killed-uses, add defs.
+  void stepForward(const MachineInstr &MI, const MCRegisterInfo &MCRI);
+
+  /// \brief Adds all registers in the live-in list of block @p BB.
+  void addLiveIns(const MachineBasicBlock *MBB, const MCRegisterInfo &MCRI);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index d6f5883..7717809 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -410,8 +410,8 @@ public:
   /// branch to do so (e.g., a table jump).  True is a conservative answer.
   bool canFallThrough();
 
-  /// Returns a pointer to the first instructon in this block that is not a
-  /// PHINode instruction. When adding instruction to the beginning of the
+  /// Returns a pointer to the first instruction in this block that is not a
+  /// PHINode instruction. When adding instructions to the beginning of the
   /// basic block, they should be added before the returned value, not before
   /// the first instruction, which might be PHI.
   /// Returns end() is there's no non-PHI instruction.
@@ -733,6 +733,31 @@ template <> struct GraphTraits<Inverse<const MachineBasicBlock*> > {
   }
 };
 
+
+
+/// MachineInstrSpan provides an interface to get an iteration range
+/// containing the instruction it was initialized with, along with all
+/// those instructions inserted prior to or following that instruction
+/// at some point after the MachineInstrSpan is constructed.
+class MachineInstrSpan {
+  MachineBasicBlock &MBB;
+  MachineBasicBlock::iterator I, B, E;
+public:
+  MachineInstrSpan(MachineBasicBlock::iterator I)
+    : MBB(*I->getParent()),
+      I(I),
+      B(I == MBB.begin() ? MBB.end() : llvm::prior(I)),
+      E(llvm::next(I)) {}
+
+  MachineBasicBlock::iterator begin() {
+    return B == MBB.end() ? MBB.begin() : llvm::next(B);
+  }
+  MachineBasicBlock::iterator end() { return E; }
+  bool empty() { return begin() == end(); }
+
+  MachineBasicBlock::iterator getInitial() { return I; }
+};
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
index 98dd03b..c59948f 100644
--- a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
+++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
@@ -1,4 +1,4 @@
-//==- MachineBranchProbabilityInfo.h - Machine Branch Probability Analysis -==//
+//=- MachineBranchProbabilityInfo.h - Branch Probability Analysis -*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index cf5e7e2..cccab81 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -397,8 +397,8 @@ public:
     return isBranch(Type) & isBarrier(Type) & !isIndirectBranch(Type);
   }
 
-  // isPredicable - Return true if this instruction has a predicate operand that
-  // controls execution.  It may be set to 'always', or may be set to other
+  /// Return true if this instruction has a predicate operand that
+  /// controls execution.  It may be set to 'always', or may be set to other
   /// values.   There are various methods in TargetInstrInfo that can be used to
   /// control and modify the predicate in this instruction.
   bool isPredicable(QueryType Type = AllInBundle) const {
@@ -637,6 +637,13 @@ public:
   bool isEHLabel() const { return getOpcode() == TargetOpcode::EH_LABEL; }
   bool isGCLabel() const { return getOpcode() == TargetOpcode::GC_LABEL; }
   bool isDebugValue() const { return getOpcode() == TargetOpcode::DBG_VALUE; }
+  /// A DBG_VALUE is indirect iff the first operand is a register and
+  /// the second operand is an immediate.
+  bool isIndirectDebugValue() const {
+    return isDebugValue()
+      && getOperand(0).isReg()
+      && getOperand(1).isImm();
+  }
 
   bool isPHI() const { return getOpcode() == TargetOpcode::PHI; }
   bool isKill() const { return getOpcode() == TargetOpcode::KILL; }
@@ -886,13 +893,12 @@ public:
   /// Look for the operand that defines it and mark it as IsDead. If
   /// AddIfNotFound is true, add a implicit operand if it's not found. Returns
   /// true if the operand exists / is added.
-  bool addRegisterDead(unsigned IncomingReg, const TargetRegisterInfo *RegInfo,
+  bool addRegisterDead(unsigned Reg, const TargetRegisterInfo *RegInfo,
                        bool AddIfNotFound = false);
 
   /// addRegisterDefined - We have determined MI defines a register. Make sure
   /// there is an operand defining Reg.
-  void addRegisterDefined(unsigned IncomingReg,
-                          const TargetRegisterInfo *RegInfo = 0);
+  void addRegisterDefined(unsigned Reg, const TargetRegisterInfo *RegInfo = 0);
 
   /// setPhysRegsDeadExcept - Mark every physreg used by this instruction as
   /// dead except those in the UsedRegs list.
diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h
index 186017c..460c08c 100644
--- a/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/include/llvm/CodeGen/MachineModuleInfo.h
@@ -234,7 +234,7 @@ public:
   /// \brief Returns a reference to a list of cfi instructions in the current
   /// function's prologue.  Used to construct frame maps for debug and exception
   /// handling comsumers.
-  const std::vector<MCCFIInstruction> &getFrameInstructions() {
+  const std::vector<MCCFIInstruction> &getFrameInstructions() const {
     return FrameInstructions;
   }
 
diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h
index 57b28fe..40f3580 100644
--- a/include/llvm/CodeGen/MachineOperand.h
+++ b/include/llvm/CodeGen/MachineOperand.h
@@ -564,6 +564,8 @@ public:
                                   unsigned SubReg = 0,
                                   bool isDebug = false,
                                   bool isInternalRead = false) {
+    assert(!(isDead && !isDef) && "Dead flag on non-def");
+    assert(!(isKill && isDef) && "Kill flag on def");
     MachineOperand Op(MachineOperand::MO_Register);
     Op.IsDef = isDef;
     Op.IsImp = isImp;
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index 7a6dcd0..58ca907 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -22,12 +22,24 @@
 #include <vector>
 
 namespace llvm {
+class PSetIterator;
 
 /// MachineRegisterInfo - Keep track of information for virtual and physical
 /// registers, including vreg register classes, use/def chains for registers,
 /// etc.
 class MachineRegisterInfo {
+public:
+  class Delegate {
+    virtual void anchor();
+  public:
+    virtual void MRI_NoteNewVirtualRegister(unsigned Reg) = 0;
+
+    virtual ~Delegate() {}
+  };
+
+private:
   const TargetMachine &TM;
+  Delegate *TheDelegate;
 
   /// IsSSA - True when the machine function is in SSA form and virtual
   /// registers have a single def.
@@ -116,6 +128,23 @@ public:
     return TM.getRegisterInfo();
   }
 
+  void resetDelegate(Delegate *delegate) {
+    // Ensure another delegate does not take over unless the current
+    // delegate first unattaches itself. If we ever need to multicast
+    // notifications, we will need to change to using a list.
+    assert(TheDelegate == delegate &&
+           "Only the current delegate can perform reset!");
+    TheDelegate = 0;
+  }
+
+  void setDelegate(Delegate *delegate) {
+    assert(delegate && !TheDelegate &&
+           "Attempted to set delegate to null, or to change it without "
+           "first resetting it!");
+
+    TheDelegate = delegate;
+  }
+
   //===--------------------------------------------------------------------===//
   // Function State
   //===--------------------------------------------------------------------===//
@@ -299,6 +328,11 @@ public:
   /// a physreg.
   bool isConstantPhysReg(unsigned PhysReg, const MachineFunction &MF) const;
 
+  /// Get an iterator over the pressure sets affected by the given physical or
+  /// virtual register. If RegUnit is physical, it must be a register unit (from
+  /// MCRegUnitIterator).
+  PSetIterator getPressureSets(unsigned RegUnit) const;
+
   //===--------------------------------------------------------------------===//
   // Virtual Register Info
   //===--------------------------------------------------------------------===//
@@ -620,9 +654,49 @@ public:
       return Op->getParent();
     }
   };
+};
+
+/// Iterate over the pressure sets affected by the given physical or virtual
+/// register. If Reg is physical, it must be a register unit (from
+/// MCRegUnitIterator).
+class PSetIterator {
+  const int *PSet;
+  unsigned Weight;
+public:
+  PSetIterator(): PSet(0), Weight(0) {}
+  PSetIterator(unsigned RegUnit, const MachineRegisterInfo *MRI) {
+    const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+    if (TargetRegisterInfo::isVirtualRegister(RegUnit)) {
+      const TargetRegisterClass *RC = MRI->getRegClass(RegUnit);
+      PSet = TRI->getRegClassPressureSets(RC);
+      Weight = TRI->getRegClassWeight(RC).RegWeight;
+    }
+    else {
+      PSet = TRI->getRegUnitPressureSets(RegUnit);
+      Weight = TRI->getRegUnitWeight(RegUnit);
+    }
+    if (*PSet == -1)
+      PSet = 0;
+  }
+  bool isValid() const { return PSet; }
+
+  unsigned getWeight() const { return Weight; }
+
+  unsigned operator*() const { return *PSet; }
 
+  void operator++() {
+    assert(isValid() && "Invalid PSetIterator.");
+    ++PSet;
+    if (*PSet == -1)
+      PSet = 0;
+  }
 };
 
+inline PSetIterator MachineRegisterInfo::
+getPressureSets(unsigned RegUnit) const {
+  return PSetIterator(RegUnit, this);
+}
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/CodeGen/MachineRelocation.h b/include/llvm/CodeGen/MachineRelocation.h
index 244b466..e778457 100644
--- a/include/llvm/CodeGen/MachineRelocation.h
+++ b/include/llvm/CodeGen/MachineRelocation.h
@@ -57,7 +57,7 @@ class MachineRelocation {
   union {
     void *Result;           // If this has been resolved to a resolved pointer
     GlobalValue *GV;        // If this is a pointer to a GV or an indirect ref.
-    MachineBasicBlock *MBB; // If this is a pointer to a LLVM BB
+    MachineBasicBlock *MBB; // If this is a pointer to an LLVM BB
     const char *ExtSym;     // If this is a pointer to a named symbol
     unsigned Index;         // Constant pool / jump table index
     unsigned GOTIndex;      // Index in the GOT of this symbol/global
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index d110ea1..7782895 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -7,8 +7,48 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file provides a MachineSchedRegistry for registering alternative machine
-// schedulers. A Target may provide an alternative scheduler implementation by
+// This file provides an interface for customizing the standard MachineScheduler
+// pass. Note that the entire pass may be replaced as follows:
+//
+// <Target>TargetMachine::createPassConfig(PassManagerBase &PM) {
+//   PM.substitutePass(&MachineSchedulerID, &CustomSchedulerPassID);
+//   ...}
+//
+// The MachineScheduler pass is only responsible for choosing the regions to be
+// scheduled. Targets can override the DAG builder and scheduler without
+// replacing the pass as follows:
+//
+// ScheduleDAGInstrs *<Target>PassConfig::
+// createMachineScheduler(MachineSchedContext *C) {
+//   return new CustomMachineScheduler(C);
+// }
+//
+// The default scheduler, ScheduleDAGMI, builds the DAG and drives list
+// scheduling while updating the instruction stream, register pressure, and live
+// intervals. Most targets don't need to override the DAG builder and list
+// schedulier, but subtargets that require custom scheduling heuristics may
+// plugin an alternate MachineSchedStrategy. The strategy is responsible for
+// selecting the highest priority node from the list:
+//
+// ScheduleDAGInstrs *<Target>PassConfig::
+// createMachineScheduler(MachineSchedContext *C) {
+//   return new ScheduleDAGMI(C, CustomStrategy(C));
+// }
+//
+// The DAG builder can also be customized in a sense by adding DAG mutations
+// that will run after DAG building and before list scheduling. DAG mutations
+// can adjust dependencies based on target-specific knowledge or add weak edges
+// to aid heuristics:
+//
+// ScheduleDAGInstrs *<Target>PassConfig::
+// createMachineScheduler(MachineSchedContext *C) {
+//   ScheduleDAGMI *DAG = new ScheduleDAGMI(C, CustomStrategy(C));
+//   DAG->addMutation(new CustomDependencies(DAG->TII, DAG->TRI));
+//   return DAG;
+// }
+//
+// A target that supports alternative schedulers can use the
+// MachineSchedRegistry to allow command line selection. This can be done by
 // implementing the following boilerplate:
 //
 // static ScheduleDAGInstrs *createCustomMachineSched(MachineSchedContext *C) {
@@ -18,9 +58,19 @@
 // SchedCustomRegistry("custom", "Run my target's custom scheduler",
 //                     createCustomMachineSched);
 //
-// Inside <Target>PassConfig:
-//   enablePass(&MachineSchedulerID);
-//   MachineSchedRegistry::setDefault(createCustomMachineSched);
+//
+// Finally, subtargets that don't need to implement custom heuristics but would
+// like to configure the GenericScheduler's policy for a given scheduler region,
+// including scheduling direction and register pressure tracking policy, can do
+// this:
+//
+// void <SubTarget>Subtarget::
+// overrideSchedPolicy(MachineSchedPolicy &Policy,
+//                     MachineInstr *begin,
+//                     MachineInstr *end,
+//                     unsigned NumRegionInstrs) const {
+//   Policy.<Flag> = true;
+// }
 //
 //===----------------------------------------------------------------------===//
 
@@ -85,15 +135,6 @@ public:
   static MachineSchedRegistry *getList() {
     return (MachineSchedRegistry *)Registry.getList();
   }
-  static ScheduleDAGCtor getDefault() {
-    return (ScheduleDAGCtor)Registry.getDefault();
-  }
-  static void setDefault(ScheduleDAGCtor C) {
-    Registry.setDefault((MachinePassCtor)C);
-  }
-  static void setDefault(StringRef Name) {
-    Registry.setDefault(Name);
-  }
   static void setListener(MachinePassRegistryListener *L) {
     Registry.setListener(L);
   }
@@ -101,12 +142,41 @@ public:
 
 class ScheduleDAGMI;
 
+/// Define a generic scheduling policy for targets that don't provide their own
+/// MachineSchedStrategy. This can be overriden for each scheduling region
+/// before building the DAG.
+struct MachineSchedPolicy {
+  // Allow the scheduler to disable register pressure tracking.
+  bool ShouldTrackPressure;
+
+  // Allow the scheduler to force top-down or bottom-up scheduling. If neither
+  // is true, the scheduler runs in both directions and converges.
+  bool OnlyTopDown;
+  bool OnlyBottomUp;
+
+  MachineSchedPolicy():
+    ShouldTrackPressure(false), OnlyTopDown(false), OnlyBottomUp(false) {}
+};
+
 /// MachineSchedStrategy - Interface to the scheduling algorithm used by
 /// ScheduleDAGMI.
+///
+/// Initialization sequence:
+///   initPolicy -> shouldTrackPressure -> initialize(DAG) -> registerRoots
 class MachineSchedStrategy {
+  virtual void anchor();
 public:
   virtual ~MachineSchedStrategy() {}
 
+  /// Optionally override the per-region scheduling policy.
+  virtual void initPolicy(MachineBasicBlock::iterator Begin,
+                          MachineBasicBlock::iterator End,
+                          unsigned NumRegionInstrs) {}
+
+  /// Check if pressure tracking is needed before building the DAG and
+  /// initializing this strategy. Called after initPolicy.
+  virtual bool shouldTrackPressure() const { return true; }
+
   /// Initialize the strategy after building the DAG for a new region.
   virtual void initialize(ScheduleDAGMI *DAG) = 0;
 
@@ -193,6 +263,7 @@ public:
 
 /// Mutate the DAG as a postpass after normal DAG building.
 class ScheduleDAGMutation {
+  virtual void anchor();
 public:
   virtual ~ScheduleDAGMutation() {}
 
@@ -221,14 +292,20 @@ protected:
 
   MachineBasicBlock::iterator LiveRegionEnd;
 
-  /// Register pressure in this region computed by buildSchedGraph.
+  // Map each SU to its summary of pressure changes. This array is updated for
+  // liveness during bottom-up scheduling. Top-down scheduling may proceed but
+  // has no affect on the pressure diffs.
+  PressureDiffs SUPressureDiffs;
+
+  /// Register pressure in this region computed by initRegPressure.
+  bool ShouldTrackPressure;
   IntervalPressure RegPressure;
   RegPressureTracker RPTracker;
 
   /// List of pressure sets that exceed the target's pressure limit before
   /// scheduling, listed in increasing set ID order. Each pressure set is paired
   /// with its max pressure in the currently scheduled regions.
-  std::vector<PressureElement> RegionCriticalPSets;
+  std::vector<PressureChange> RegionCriticalPSets;
 
   /// The top of the unscheduled zone.
   MachineBasicBlock::iterator CurrentTop;
@@ -254,8 +331,9 @@ public:
   ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S):
     ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
     AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S), DFSResult(0),
-    Topo(SUnits, &ExitSU), RPTracker(RegPressure), CurrentTop(),
-    TopRPTracker(TopPressure), CurrentBottom(), BotRPTracker(BotPressure),
+    Topo(SUnits, &ExitSU), ShouldTrackPressure(false),
+    RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
+    CurrentBottom(), BotRPTracker(BotPressure),
     NextClusterPred(NULL), NextClusterSucc(NULL) {
 #ifndef NDEBUG
     NumInstrsScheduled = 0;
@@ -264,6 +342,9 @@ public:
 
   virtual ~ScheduleDAGMI();
 
+  /// \brief Return true if register pressure tracking is enabled.
+  bool isTrackingPressure() const { return ShouldTrackPressure; }
+
   /// Add a postprocessing step to the DAG builder.
   /// Mutations are applied in the order that they are added after normal DAG
   /// building and before MachineSchedStrategy initialization.
@@ -293,8 +374,7 @@ public:
   void enterRegion(MachineBasicBlock *bb,
                    MachineBasicBlock::iterator begin,
                    MachineBasicBlock::iterator end,
-                   unsigned endcount);
-
+                   unsigned regioninstrs) LLVM_OVERRIDE;
 
   /// Implement ScheduleDAGInstrs interface for scheduling a sequence of
   /// reorderable instructions.
@@ -315,10 +395,14 @@ public:
   /// Get register pressure for the entire scheduling region before scheduling.
   const IntervalPressure &getRegPressure() const { return RegPressure; }
 
-  const std::vector<PressureElement> &getRegionCriticalPSets() const {
+  const std::vector<PressureChange> &getRegionCriticalPSets() const {
     return RegionCriticalPSets;
   }
 
+  PressureDiff &getPressureDiff(const SUnit *SU) {
+    return SUPressureDiffs[SU->NodeNum];
+  }
+
   const SUnit *getNextClusterPred() const { return NextClusterPred; }
 
   const SUnit *getNextClusterSucc() const { return NextClusterSucc; }
@@ -332,6 +416,9 @@ public:
 
   BitVector &getScheduledTrees() { return ScheduledTrees; }
 
+  /// Compute the cyclic critical path through the DAG.
+  unsigned computeCyclicCriticalPath();
+
   void viewGraph(const Twine &Name, const Twine &Title) LLVM_OVERRIDE;
   void viewGraph() LLVM_OVERRIDE;
 
@@ -367,7 +454,10 @@ protected:
 
   void initRegPressure();
 
-  void updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure);
+  void updatePressureDiffs(ArrayRef<unsigned> LiveUses);
+
+  void updateScheduledPressure(const SUnit *SU,
+                               const std::vector<unsigned> &NewMaxPressure);
 
   bool checkSchedLimit();
 
diff --git a/include/llvm/CodeGen/PBQP/Graph.h b/include/llvm/CodeGen/PBQP/Graph.h
index b57abca..aca0a91 100644
--- a/include/llvm/CodeGen/PBQP/Graph.h
+++ b/include/llvm/CodeGen/PBQP/Graph.h
@@ -20,79 +20,63 @@
 #include "llvm/ADT/ilist_node.h"
 #include <list>
 #include <map>
+#include <set>
 
 namespace PBQP {
 
   /// PBQP Graph class.
   /// Instances of this class describe PBQP problems.
   class Graph {
-  private:
-
-    // ----- TYPEDEFS -----
-    class NodeEntry;
-    class EdgeEntry;
-
-    typedef llvm::ilist<NodeEntry> NodeList;
-    typedef llvm::ilist<EdgeEntry> EdgeList;
-
   public:
 
-    typedef NodeList::iterator NodeItr;
-    typedef NodeList::const_iterator ConstNodeItr;
-
-    typedef EdgeList::iterator EdgeItr;
-    typedef EdgeList::const_iterator ConstEdgeItr;
+    typedef unsigned NodeId;
+    typedef unsigned EdgeId;
 
   private:
 
-    typedef std::list<EdgeItr> AdjEdgeList;
-  
+    typedef std::set<NodeId> AdjEdgeList;
+
   public:
 
     typedef AdjEdgeList::iterator AdjEdgeItr;
 
   private:
 
-    class NodeEntry : public llvm::ilist_node<NodeEntry> {
-      friend struct llvm::ilist_sentinel_traits<NodeEntry>;
+    class NodeEntry {
     private:
-      Vector costs;      
+      Vector costs;
       AdjEdgeList adjEdges;
-      unsigned degree;
       void *data;
       NodeEntry() : costs(0, 0) {}
     public:
-      NodeEntry(const Vector &costs) : costs(costs), degree(0) {}
+      NodeEntry(const Vector &costs) : costs(costs), data(0) {}
       Vector& getCosts() { return costs; }
       const Vector& getCosts() const { return costs; }
-      unsigned getDegree() const { return degree; }
+      unsigned getDegree() const { return adjEdges.size(); }
       AdjEdgeItr edgesBegin() { return adjEdges.begin(); }
       AdjEdgeItr edgesEnd() { return adjEdges.end(); }
-      AdjEdgeItr addEdge(EdgeItr e) {
-        ++degree;
+      AdjEdgeItr addEdge(EdgeId e) {
         return adjEdges.insert(adjEdges.end(), e);
       }
       void removeEdge(AdjEdgeItr ae) {
-        --degree;
         adjEdges.erase(ae);
       }
       void setData(void *data) { this->data = data; }
       void* getData() { return data; }
     };
 
-    class EdgeEntry : public llvm::ilist_node<EdgeEntry> {
-      friend struct llvm::ilist_sentinel_traits<EdgeEntry>;
+    class EdgeEntry {
     private:
-      NodeItr node1, node2;
+      NodeId node1, node2;
       Matrix costs;
       AdjEdgeItr node1AEItr, node2AEItr;
       void *data;
-      EdgeEntry() : costs(0, 0, 0) {}
+      EdgeEntry() : costs(0, 0, 0), data(0) {}
     public:
-      EdgeEntry(NodeItr node1, NodeItr node2, const Matrix &costs)
+      EdgeEntry(NodeId node1, NodeId node2, const Matrix &costs)
         : node1(node1), node2(node2), costs(costs) {}
-      NodeItr getNode1() const { return node1; }
-      NodeItr getNode2() const { return node2; }
+      NodeId getNode1() const { return node1; }
+      NodeId getNode2() const { return node2; }
       Matrix& getCosts() { return costs; }
       const Matrix& getCosts() const { return costs; }
       void setNode1AEItr(AdjEdgeItr ae) { node1AEItr = ae; }
@@ -105,254 +89,305 @@ namespace PBQP {
 
     // ----- MEMBERS -----
 
-    NodeList nodes;
-    unsigned numNodes;
+    typedef std::vector<NodeEntry> NodeVector;
+    typedef std::vector<NodeId> FreeNodeVector;
+    NodeVector nodes;
+    FreeNodeVector freeNodes;
 
-    EdgeList edges;
-    unsigned numEdges;
+    typedef std::vector<EdgeEntry> EdgeVector;
+    typedef std::vector<EdgeId> FreeEdgeVector;
+    EdgeVector edges;
+    FreeEdgeVector freeEdges;
 
     // ----- INTERNAL METHODS -----
 
-    NodeEntry& getNode(NodeItr nItr) { return *nItr; }
-    const NodeEntry& getNode(ConstNodeItr nItr) const { return *nItr; }
-
-    EdgeEntry& getEdge(EdgeItr eItr) { return *eItr; }
-    const EdgeEntry& getEdge(ConstEdgeItr eItr) const { return *eItr; }
-
-    NodeItr addConstructedNode(const NodeEntry &n) {
-      ++numNodes;
-      return nodes.insert(nodes.end(), n);
+    NodeEntry& getNode(NodeId nId) { return nodes[nId]; }
+    const NodeEntry& getNode(NodeId nId) const { return nodes[nId]; }
+
+    EdgeEntry& getEdge(EdgeId eId) { return edges[eId]; }
+    const EdgeEntry& getEdge(EdgeId eId) const { return edges[eId]; }
+
+    NodeId addConstructedNode(const NodeEntry &n) {
+      NodeId nodeId = 0;
+      if (!freeNodes.empty()) {
+        nodeId = freeNodes.back();
+        freeNodes.pop_back();
+        nodes[nodeId] = n;
+      } else {
+        nodeId = nodes.size();
+        nodes.push_back(n);
+      }
+      return nodeId;
     }
 
-    EdgeItr addConstructedEdge(const EdgeEntry &e) {
-      assert(findEdge(e.getNode1(), e.getNode2()) == edges.end() &&
+    EdgeId addConstructedEdge(const EdgeEntry &e) {
+      assert(findEdge(e.getNode1(), e.getNode2()) == invalidEdgeId() &&
              "Attempt to add duplicate edge.");
-      ++numEdges;
-      EdgeItr edgeItr = edges.insert(edges.end(), e);
-      EdgeEntry &ne = getEdge(edgeItr);
+      EdgeId edgeId = 0;
+      if (!freeEdges.empty()) {
+        edgeId = freeEdges.back();
+        freeEdges.pop_back();
+        edges[edgeId] = e;
+      } else {
+        edgeId = edges.size();
+        edges.push_back(e);
+      }
+
+      EdgeEntry &ne = getEdge(edgeId);
       NodeEntry &n1 = getNode(ne.getNode1());
       NodeEntry &n2 = getNode(ne.getNode2());
+
       // Sanity check on matrix dimensions:
       assert((n1.getCosts().getLength() == ne.getCosts().getRows()) &&
              (n2.getCosts().getLength() == ne.getCosts().getCols()) &&
              "Edge cost dimensions do not match node costs dimensions.");
-      ne.setNode1AEItr(n1.addEdge(edgeItr));
-      ne.setNode2AEItr(n2.addEdge(edgeItr));
-      return edgeItr;
+
+      ne.setNode1AEItr(n1.addEdge(edgeId));
+      ne.setNode2AEItr(n2.addEdge(edgeId));
+      return edgeId;
     }
 
-    inline void copyFrom(const Graph &other);
+    Graph(const Graph &other) {}
+    void operator=(const Graph &other) {}
+
   public:
 
-    /// \brief Construct an empty PBQP graph.
-    Graph() : numNodes(0), numEdges(0) {}
+    class NodeItr {
+    public:
+      NodeItr(NodeId nodeId, const Graph &g)
+        : nodeId(nodeId), endNodeId(g.nodes.size()), freeNodes(g.freeNodes) {
+        this->nodeId = findNextInUse(nodeId); // Move to the first in-use nodeId
+      }
 
-    /// \brief Copy construct this graph from "other". Note: Does not copy node
-    ///        and edge data, only graph structure and costs.
-    /// @param other Source graph to copy from.
-    Graph(const Graph &other) : numNodes(0), numEdges(0) {
-      copyFrom(other);
-    }
+      bool operator==(const NodeItr& n) const { return nodeId == n.nodeId; }
+      bool operator!=(const NodeItr& n) const { return !(*this == n); }
+      NodeItr& operator++() { nodeId = findNextInUse(++nodeId); return *this; }
+      NodeId operator*() const { return nodeId; }
 
-    /// \brief Make this graph a copy of "other". Note: Does not copy node and
-    ///        edge data, only graph structure and costs.
-    /// @param other The graph to copy from.
-    /// @return A reference to this graph.
-    ///
-    /// This will clear the current graph, erasing any nodes and edges added,
-    /// before copying from other.
-    Graph& operator=(const Graph &other) {
-      clear();      
-      copyFrom(other);
-      return *this;
-    }
+    private:
+      NodeId findNextInUse(NodeId n) const {
+        while (n < endNodeId &&
+               std::find(freeNodes.begin(), freeNodes.end(), n) !=
+                 freeNodes.end()) {
+          ++n;
+        }
+        return n;
+      }
+
+      NodeId nodeId, endNodeId;
+      const FreeNodeVector& freeNodes;
+    };
+
+    class EdgeItr {
+    public:
+      EdgeItr(EdgeId edgeId, const Graph &g)
+        : edgeId(edgeId), endEdgeId(g.edges.size()), freeEdges(g.freeEdges) {
+        this->edgeId = findNextInUse(edgeId); // Move to the first in-use edgeId
+      }
+
+      bool operator==(const EdgeItr& n) const { return edgeId == n.edgeId; }
+      bool operator!=(const EdgeItr& n) const { return !(*this == n); }
+      EdgeItr& operator++() { edgeId = findNextInUse(++edgeId); return *this; }
+      EdgeId operator*() const { return edgeId; }
+
+    private:
+      EdgeId findNextInUse(EdgeId n) const {
+        while (n < endEdgeId &&
+               std::find(freeEdges.begin(), freeEdges.end(), n) !=
+                 freeEdges.end()) {
+          ++n;
+        }
+        return n;
+      }
+
+      EdgeId edgeId, endEdgeId;
+      const FreeEdgeVector& freeEdges;
+    };
+
+    /// \brief Construct an empty PBQP graph.
+    Graph() {}
 
     /// \brief Add a node with the given costs.
     /// @param costs Cost vector for the new node.
     /// @return Node iterator for the added node.
-    NodeItr addNode(const Vector &costs) {
+    NodeId addNode(const Vector &costs) {
       return addConstructedNode(NodeEntry(costs));
     }
 
     /// \brief Add an edge between the given nodes with the given costs.
-    /// @param n1Itr First node.
-    /// @param n2Itr Second node.
+    /// @param n1Id First node.
+    /// @param n2Id Second node.
     /// @return Edge iterator for the added edge.
-    EdgeItr addEdge(Graph::NodeItr n1Itr, Graph::NodeItr n2Itr,
-                    const Matrix &costs) {
-      assert(getNodeCosts(n1Itr).getLength() == costs.getRows() &&
-             getNodeCosts(n2Itr).getLength() == costs.getCols() &&
+    EdgeId addEdge(NodeId n1Id, NodeId n2Id, const Matrix &costs) {
+      assert(getNodeCosts(n1Id).getLength() == costs.getRows() &&
+             getNodeCosts(n2Id).getLength() == costs.getCols() &&
              "Matrix dimensions mismatch.");
-      return addConstructedEdge(EdgeEntry(n1Itr, n2Itr, costs)); 
+      return addConstructedEdge(EdgeEntry(n1Id, n2Id, costs));
     }
 
     /// \brief Get the number of nodes in the graph.
     /// @return Number of nodes in the graph.
-    unsigned getNumNodes() const { return numNodes; }
+    unsigned getNumNodes() const { return nodes.size() - freeNodes.size(); }
 
     /// \brief Get the number of edges in the graph.
     /// @return Number of edges in the graph.
-    unsigned getNumEdges() const { return numEdges; }
+    unsigned getNumEdges() const { return edges.size() - freeEdges.size(); }
 
     /// \brief Get a node's cost vector.
-    /// @param nItr Node iterator.
+    /// @param nId Node id.
     /// @return Node cost vector.
-    Vector& getNodeCosts(NodeItr nItr) { return getNode(nItr).getCosts(); }
+    Vector& getNodeCosts(NodeId nId) { return getNode(nId).getCosts(); }
 
     /// \brief Get a node's cost vector (const version).
-    /// @param nItr Node iterator.
+    /// @param nId Node id.
     /// @return Node cost vector.
-    const Vector& getNodeCosts(ConstNodeItr nItr) const {
-      return getNode(nItr).getCosts();
+    const Vector& getNodeCosts(NodeId nId) const {
+      return getNode(nId).getCosts();
     }
 
     /// \brief Set a node's data pointer.
-    /// @param nItr Node iterator.
+    /// @param nId Node id.
     /// @param data Pointer to node data.
     ///
     /// Typically used by a PBQP solver to attach data to aid in solution.
-    void setNodeData(NodeItr nItr, void *data) { getNode(nItr).setData(data); }
+    void setNodeData(NodeId nId, void *data) { getNode(nId).setData(data); }
 
     /// \brief Get the node's data pointer.
-    /// @param nItr Node iterator.
+    /// @param nId Node id.
     /// @return Pointer to node data.
-    void* getNodeData(NodeItr nItr) { return getNode(nItr).getData(); }
-    
+    void* getNodeData(NodeId nId) { return getNode(nId).getData(); }
+
     /// \brief Get an edge's cost matrix.
-    /// @param eItr Edge iterator.
+    /// @param eId Edge id.
     /// @return Edge cost matrix.
-    Matrix& getEdgeCosts(EdgeItr eItr) { return getEdge(eItr).getCosts(); }
+    Matrix& getEdgeCosts(EdgeId eId) { return getEdge(eId).getCosts(); }
 
     /// \brief Get an edge's cost matrix (const version).
-    /// @param eItr Edge iterator.
+    /// @param eId Edge id.
     /// @return Edge cost matrix.
-    const Matrix& getEdgeCosts(ConstEdgeItr eItr) const {
-      return getEdge(eItr).getCosts();
+    const Matrix& getEdgeCosts(EdgeId eId) const {
+      return getEdge(eId).getCosts();
     }
 
     /// \brief Set an edge's data pointer.
-    /// @param eItr Edge iterator.
+    /// @param eId Edge id.
     /// @param data Pointer to edge data.
     ///
     /// Typically used by a PBQP solver to attach data to aid in solution.
-    void setEdgeData(EdgeItr eItr, void *data) { getEdge(eItr).setData(data); }
+    void setEdgeData(EdgeId eId, void *data) { getEdge(eId).setData(data); }
 
     /// \brief Get an edge's data pointer.
-    /// @param eItr Edge iterator.
-    /// @return Pointer to edge data. 
-    void* getEdgeData(EdgeItr eItr) { return getEdge(eItr).getData(); }
+    /// @param eId Edge id.
+    /// @return Pointer to edge data.
+    void* getEdgeData(EdgeId eId) { return getEdge(eId).getData(); }
 
     /// \brief Get a node's degree.
-    /// @param nItr Node iterator.
+    /// @param nId Node id.
     /// @return The degree of the node.
-    unsigned getNodeDegree(NodeItr nItr) const {
-      return getNode(nItr).getDegree();
+    unsigned getNodeDegree(NodeId nId) const {
+      return getNode(nId).getDegree();
     }
 
     /// \brief Begin iterator for node set.
-    NodeItr nodesBegin() { return nodes.begin(); }
-
-    /// \brief Begin const iterator for node set.
-    ConstNodeItr nodesBegin() const { return nodes.begin(); }
+    NodeItr nodesBegin() const { return NodeItr(0, *this);  }
 
     /// \brief End iterator for node set.
-    NodeItr nodesEnd() { return nodes.end(); }
-
-    /// \brief End const iterator for node set.
-    ConstNodeItr nodesEnd() const { return nodes.end(); }
+    NodeItr nodesEnd() const { return NodeItr(nodes.size(), *this); }
 
     /// \brief Begin iterator for edge set.
-    EdgeItr edgesBegin() { return edges.begin(); }
+    EdgeItr edgesBegin() const { return EdgeItr(0, *this); }
 
     /// \brief End iterator for edge set.
-    EdgeItr edgesEnd() { return edges.end(); }
+    EdgeItr edgesEnd() const { return EdgeItr(edges.size(), *this); }
 
     /// \brief Get begin iterator for adjacent edge set.
-    /// @param nItr Node iterator.
+    /// @param nId Node id.
     /// @return Begin iterator for the set of edges connected to the given node.
-    AdjEdgeItr adjEdgesBegin(NodeItr nItr) {
-      return getNode(nItr).edgesBegin();
+    AdjEdgeItr adjEdgesBegin(NodeId nId) {
+      return getNode(nId).edgesBegin();
     }
 
     /// \brief Get end iterator for adjacent edge set.
-    /// @param nItr Node iterator.
+    /// @param nId Node id.
     /// @return End iterator for the set of edges connected to the given node.
-    AdjEdgeItr adjEdgesEnd(NodeItr nItr) {
-      return getNode(nItr).edgesEnd();
+    AdjEdgeItr adjEdgesEnd(NodeId nId) {
+      return getNode(nId).edgesEnd();
     }
 
     /// \brief Get the first node connected to this edge.
-    /// @param eItr Edge iterator.
-    /// @return The first node connected to the given edge. 
-    NodeItr getEdgeNode1(EdgeItr eItr) {
-      return getEdge(eItr).getNode1();
+    /// @param eId Edge id.
+    /// @return The first node connected to the given edge.
+    NodeId getEdgeNode1(EdgeId eId) {
+      return getEdge(eId).getNode1();
     }
 
     /// \brief Get the second node connected to this edge.
-    /// @param eItr Edge iterator.
-    /// @return The second node connected to the given edge. 
-    NodeItr getEdgeNode2(EdgeItr eItr) {
-      return getEdge(eItr).getNode2();
-    } 
+    /// @param eId Edge id.
+    /// @return The second node connected to the given edge.
+    NodeId getEdgeNode2(EdgeId eId) {
+      return getEdge(eId).getNode2();
+    }
 
     /// \brief Get the "other" node connected to this edge.
-    /// @param eItr Edge iterator.
-    /// @param nItr Node iterator for the "given" node.
-    /// @return The iterator for the "other" node connected to this edge. 
-    NodeItr getEdgeOtherNode(EdgeItr eItr, NodeItr nItr) {
-      EdgeEntry &e = getEdge(eItr);
-      if (e.getNode1() == nItr) {
+    /// @param eId Edge id.
+    /// @param nId Node id for the "given" node.
+    /// @return The iterator for the "other" node connected to this edge.
+    NodeId getEdgeOtherNode(EdgeId eId, NodeId nId) {
+      EdgeEntry &e = getEdge(eId);
+      if (e.getNode1() == nId) {
         return e.getNode2();
       } // else
       return e.getNode1();
     }
 
+    EdgeId invalidEdgeId() const {
+      return std::numeric_limits<EdgeId>::max();
+    }
+
     /// \brief Get the edge connecting two nodes.
-    /// @param n1Itr First node iterator.
-    /// @param n2Itr Second node iterator.
-    /// @return An iterator for edge (n1Itr, n2Itr) if such an edge exists,
-    ///         otherwise returns edgesEnd(). 
-    EdgeItr findEdge(NodeItr n1Itr, NodeItr n2Itr) {
-      for (AdjEdgeItr aeItr = adjEdgesBegin(n1Itr), aeEnd = adjEdgesEnd(n1Itr);
+    /// @param n1Id First node id.
+    /// @param n2Id Second node id.
+    /// @return An id for edge (n1Id, n2Id) if such an edge exists,
+    ///         otherwise returns an invalid edge id.
+    EdgeId findEdge(NodeId n1Id, NodeId n2Id) {
+      for (AdjEdgeItr aeItr = adjEdgesBegin(n1Id), aeEnd = adjEdgesEnd(n1Id);
          aeItr != aeEnd; ++aeItr) {
-        if ((getEdgeNode1(*aeItr) == n2Itr) ||
-            (getEdgeNode2(*aeItr) == n2Itr)) {
+        if ((getEdgeNode1(*aeItr) == n2Id) ||
+            (getEdgeNode2(*aeItr) == n2Id)) {
           return *aeItr;
         }
       }
-      return edges.end();
+      return invalidEdgeId();
     }
 
     /// \brief Remove a node from the graph.
-    /// @param nItr Node iterator.
-    void removeNode(NodeItr nItr) {
-      NodeEntry &n = getNode(nItr);
-      for (AdjEdgeItr itr = n.edgesBegin(), end = n.edgesEnd(); itr != end;) {
-        EdgeItr eItr = *itr;
-        ++itr;
-        removeEdge(eItr); 
+    /// @param nId Node id.
+    void removeNode(NodeId nId) {
+      NodeEntry &n = getNode(nId);
+      for (AdjEdgeItr itr = n.edgesBegin(), end = n.edgesEnd(); itr != end; ++itr) {
+        EdgeId eId = *itr;
+        removeEdge(eId);
       }
-      nodes.erase(nItr);
-      --numNodes;
+      freeNodes.push_back(nId);
     }
 
     /// \brief Remove an edge from the graph.
-    /// @param eItr Edge iterator.
-    void removeEdge(EdgeItr eItr) {
-      EdgeEntry &e = getEdge(eItr);
+    /// @param eId Edge id.
+    void removeEdge(EdgeId eId) {
+      EdgeEntry &e = getEdge(eId);
       NodeEntry &n1 = getNode(e.getNode1());
       NodeEntry &n2 = getNode(e.getNode2());
       n1.removeEdge(e.getNode1AEItr());
       n2.removeEdge(e.getNode2AEItr());
-      edges.erase(eItr);
-      --numEdges;
+      freeEdges.push_back(eId);
     }
 
     /// \brief Remove all nodes and edges from the graph.
     void clear() {
       nodes.clear();
+      freeNodes.clear();
       edges.clear();
-      numNodes = numEdges = 0;
+      freeEdges.clear();
     }
 
     /// \brief Dump a graph to an output stream.
@@ -362,7 +397,7 @@ namespace PBQP {
 
       for (NodeItr nodeItr = nodesBegin(), nodeEnd = nodesEnd();
            nodeItr != nodeEnd; ++nodeItr) {
-        const Vector& v = getNodeCosts(nodeItr);
+        const Vector& v = getNodeCosts(*nodeItr);
         os << "\n" << v.getLength() << "\n";
         assert(v.getLength() != 0 && "Empty vector in graph.");
         os << v[0];
@@ -374,10 +409,10 @@ namespace PBQP {
 
       for (EdgeItr edgeItr = edgesBegin(), edgeEnd = edgesEnd();
            edgeItr != edgeEnd; ++edgeItr) {
-        unsigned n1 = std::distance(nodesBegin(), getEdgeNode1(edgeItr));
-        unsigned n2 = std::distance(nodesBegin(), getEdgeNode2(edgeItr));
+        NodeId n1 = getEdgeNode1(*edgeItr);
+        NodeId n2 = getEdgeNode2(*edgeItr);
         assert(n1 != n2 && "PBQP graphs shound not have self-edges.");
-        const Matrix& m = getEdgeCosts(edgeItr);
+        const Matrix& m = getEdgeCosts(*edgeItr);
         os << "\n" << n1 << " " << n2 << "\n"
            << m.getRows() << " " << m.getCols() << "\n";
         assert(m.getRows() != 0 && "No rows in matrix.");
@@ -396,14 +431,14 @@ namespace PBQP {
     /// @param os Output stream to print on.
     template <typename OStream>
     void printDot(OStream &os) {
-    
+
       os << "graph {\n";
 
       for (NodeItr nodeItr = nodesBegin(), nodeEnd = nodesEnd();
            nodeItr != nodeEnd; ++nodeItr) {
 
         os << "  node" << nodeItr << " [ label=\""
-           << nodeItr << ": " << getNodeCosts(nodeItr) << "\" ]\n";
+           << nodeItr << ": " << getNodeCosts(*nodeItr) << "\" ]\n";
       }
 
       os << "  edge [ len=" << getNumNodes() << " ]\n";
@@ -411,11 +446,11 @@ namespace PBQP {
       for (EdgeItr edgeItr = edgesBegin(), edgeEnd = edgesEnd();
            edgeItr != edgeEnd; ++edgeItr) {
 
-        os << "  node" << getEdgeNode1(edgeItr)
-           << " -- node" << getEdgeNode2(edgeItr)
+        os << "  node" << getEdgeNode1(*edgeItr)
+           << " -- node" << getEdgeNode2(*edgeItr)
            << " [ label=\"";
 
-        const Matrix &edgeCosts = getEdgeCosts(edgeItr);
+        const Matrix &edgeCosts = getEdgeCosts(*edgeItr);
 
         for (unsigned i = 0; i < edgeCosts.getRows(); ++i) {
           os << edgeCosts.getRowAsVector(i) << "\\n";
@@ -427,39 +462,16 @@ namespace PBQP {
 
   };
 
-  class NodeItrComparator {
-  public:
-    bool operator()(Graph::NodeItr n1, Graph::NodeItr n2) const {
-      return &*n1 < &*n2;
-    }
-
-    bool operator()(Graph::ConstNodeItr n1, Graph::ConstNodeItr n2) const {
-      return &*n1 < &*n2;
-    }
-  };
-
-  class EdgeItrCompartor {
-  public:
-    bool operator()(Graph::EdgeItr e1, Graph::EdgeItr e2) const {
-      return &*e1 < &*e2;
-    }
-
-    bool operator()(Graph::ConstEdgeItr e1, Graph::ConstEdgeItr e2) const {
-      return &*e1 < &*e2;
-    }
-  };
-
-  void Graph::copyFrom(const Graph &other) {
-    std::map<Graph::ConstNodeItr, Graph::NodeItr,
-             NodeItrComparator> nodeMap;
+//  void Graph::copyFrom(const Graph &other) {
+//     std::map<Graph::ConstNodeItr, Graph::NodeItr,
+//              NodeItrComparator> nodeMap;
 
-     for (Graph::ConstNodeItr nItr = other.nodesBegin(),
-                             nEnd = other.nodesEnd();
-         nItr != nEnd; ++nItr) {
-      nodeMap[nItr] = addNode(other.getNodeCosts(nItr));
-    }
-      
-  }
+//      for (Graph::ConstNodeItr nItr = other.nodesBegin(),
+//                              nEnd = other.nodesEnd();
+//          nItr != nEnd; ++nItr) {
+//       nodeMap[nItr] = addNode(other.getNodeCosts(nItr));
+//     }
+//  }
 
 }
 
diff --git a/include/llvm/CodeGen/PBQP/HeuristicBase.h b/include/llvm/CodeGen/PBQP/HeuristicBase.h
index 0c1fcb7..8bcbb9e 100644
--- a/include/llvm/CodeGen/PBQP/HeuristicBase.h
+++ b/include/llvm/CodeGen/PBQP/HeuristicBase.h
@@ -27,7 +27,7 @@ namespace PBQP {
   ///   <li> void heuristicReduce() : Perform a single heuristic reduction.
   ///   <li> void preUpdateEdgeCosts(Graph::EdgeItr) : Handle the (imminent)
   ///        change to the cost matrix on the given edge (by R2).
-  ///   <li> void postUpdateEdgeCostts(Graph::EdgeItr) : Handle the new 
+  ///   <li> void postUpdateEdgeCostts(Graph::EdgeItr) : Handle the new
   ///        costs on the given edge.
   ///   <li> void handleAddEdge(Graph::EdgeItr) : Handle the addition of a new
   ///        edge into the PBQP graph (by R2).
@@ -39,7 +39,7 @@ namespace PBQP {
   ///
   /// These methods are implemented in this class for documentation purposes,
   /// but will assert if called.
-  /// 
+  ///
   /// Note that this class uses the curiously recursive template idiom to
   /// forward calls to the derived class. These methods need not be made
   /// virtual, and indeed probably shouldn't for performance reasons.
@@ -52,7 +52,7 @@ namespace PBQP {
   class HeuristicBase {
   private:
 
-    typedef std::list<Graph::NodeItr> OptimalList;
+    typedef std::list<Graph::NodeId> OptimalList;
 
     HeuristicSolverImpl<HImpl> &s;
     Graph &g;
@@ -62,9 +62,9 @@ namespace PBQP {
     HImpl& impl() { return static_cast<HImpl&>(*this); }
 
     // Add the given node to the optimal reductions list. Keep an iterator to
-    // its location for fast removal. 
-    void addToOptimalReductionList(Graph::NodeItr nItr) {
-      optimalList.insert(optimalList.end(), nItr);
+    // its location for fast removal.
+    void addToOptimalReductionList(Graph::NodeId nId) {
+      optimalList.insert(optimalList.end(), nId);
     }
 
   public:
@@ -94,7 +94,7 @@ namespace PBQP {
     /// behaviour.
     bool solverRunSimplify() const { return true; }
 
-    /// \brief Decide whether a node should be optimally or heuristically 
+    /// \brief Decide whether a node should be optimally or heuristically
     ///        reduced.
     /// @return Whether or not the given node should be listed for optimal
     ///         reduction (via R0, R1 or R2).
@@ -105,21 +105,21 @@ namespace PBQP {
     /// criteria. Note however that your criteria for selecting optimal nodes
     /// should be <i>at least</i> as strong as this. I.e. Nodes of degree 3 or
     /// higher should not be selected under any circumstances.
-    bool shouldOptimallyReduce(Graph::NodeItr nItr) {
-      if (g.getNodeDegree(nItr) < 3)
+    bool shouldOptimallyReduce(Graph::NodeId nId) {
+      if (g.getNodeDegree(nId) < 3)
         return true;
       // else
       return false;
     }
 
     /// \brief Add the given node to the list of nodes to be optimally reduced.
-    /// @param nItr Node iterator to be added.
+    /// @param nId Node id to be added.
     ///
     /// You probably don't want to over-ride this, except perhaps to record
     /// statistics before calling this implementation. HeuristicBase relies on
     /// its behaviour.
-    void addToOptimalReduceList(Graph::NodeItr nItr) {
-      optimalList.push_back(nItr);
+    void addToOptimalReduceList(Graph::NodeId nId) {
+      optimalList.push_back(nId);
     }
 
     /// \brief Initialise the heuristic.
@@ -132,10 +132,10 @@ namespace PBQP {
     void setup() {
       for (Graph::NodeItr nItr = g.nodesBegin(), nEnd = g.nodesEnd();
            nItr != nEnd; ++nItr) {
-        if (impl().shouldOptimallyReduce(nItr)) {
-          addToOptimalReduceList(nItr);
+        if (impl().shouldOptimallyReduce(*nItr)) {
+          addToOptimalReduceList(*nItr);
         } else {
-          impl().addToHeuristicReduceList(nItr);
+          impl().addToHeuristicReduceList(*nItr);
         }
       }
     }
@@ -150,13 +150,13 @@ namespace PBQP {
       if (optimalList.empty())
         return false;
 
-      Graph::NodeItr nItr = optimalList.front();
+      Graph::NodeId nId = optimalList.front();
       optimalList.pop_front();
 
-      switch (s.getSolverDegree(nItr)) {
-        case 0: s.applyR0(nItr); break;
-        case 1: s.applyR1(nItr); break;
-        case 2: s.applyR2(nItr); break;
+      switch (s.getSolverDegree(nId)) {
+        case 0: s.applyR0(nId); break;
+        case 1: s.applyR1(nId); break;
+        case 2: s.applyR2(nId); break;
         default: llvm_unreachable(
                         "Optimal reductions of degree > 2 nodes is invalid.");
       }
@@ -184,8 +184,8 @@ namespace PBQP {
     }
 
     /// \brief Add a node to the heuristic reduce list.
-    /// @param nItr Node iterator to add to the heuristic reduce list.
-    void addToHeuristicList(Graph::NodeItr nItr) {
+    /// @param nId Node id to add to the heuristic reduce list.
+    void addToHeuristicList(Graph::NodeId nId) {
       llvm_unreachable("Must be implemented in derived class.");
     }
 
@@ -199,31 +199,31 @@ namespace PBQP {
     }
 
     /// \brief Prepare a change in the costs on the given edge.
-    /// @param eItr Edge iterator.    
-    void preUpdateEdgeCosts(Graph::EdgeItr eItr) {
+    /// @param eId Edge id.
+    void preUpdateEdgeCosts(Graph::EdgeId eId) {
       llvm_unreachable("Must be implemented in derived class.");
     }
 
     /// \brief Handle the change in the costs on the given edge.
-    /// @param eItr Edge iterator.
-    void postUpdateEdgeCostts(Graph::EdgeItr eItr) {
+    /// @param eId Edge id.
+    void postUpdateEdgeCostts(Graph::EdgeId eId) {
       llvm_unreachable("Must be implemented in derived class.");
     }
 
     /// \brief Handle the addition of a new edge into the PBQP graph.
-    /// @param eItr Edge iterator for the added edge.
-    void handleAddEdge(Graph::EdgeItr eItr) {
+    /// @param eId Edge id for the added edge.
+    void handleAddEdge(Graph::EdgeId eId) {
       llvm_unreachable("Must be implemented in derived class.");
     }
 
     /// \brief Handle disconnection of an edge from a node.
-    /// @param eItr Edge iterator for edge being disconnected.
-    /// @param nItr Node iterator for the node being disconnected from.
+    /// @param eId Edge id for edge being disconnected.
+    /// @param nId Node id for the node being disconnected from.
     ///
     /// Edges are frequently removed due to the removal of a node. This
     /// method allows for the effect to be computed only for the remaining
     /// node in the graph.
-    void handleRemoveEdge(Graph::EdgeItr eItr, Graph::NodeItr nItr) {
+    void handleRemoveEdge(Graph::EdgeId eId, Graph::NodeId nId) {
       llvm_unreachable("Must be implemented in derived class.");
     }
 
diff --git a/include/llvm/CodeGen/PBQP/HeuristicSolver.h b/include/llvm/CodeGen/PBQP/HeuristicSolver.h
index 47e15b2..e26ca02 100644
--- a/include/llvm/CodeGen/PBQP/HeuristicSolver.h
+++ b/include/llvm/CodeGen/PBQP/HeuristicSolver.h
@@ -9,7 +9,7 @@
 //
 // Heuristic PBQP solver. This solver is able to perform optimal reductions for
 // nodes of degree 0, 1 or 2. For nodes of degree >2 a plugable heuristic is
-// used to select a node for reduction. 
+// used to select a node for reduction.
 //
 //===----------------------------------------------------------------------===//
 
@@ -40,10 +40,10 @@ namespace PBQP {
     typedef typename HImpl::NodeData HeuristicNodeData;
     typedef typename HImpl::EdgeData HeuristicEdgeData;
 
-    typedef std::list<Graph::EdgeItr> SolverEdges;
+    typedef std::list<Graph::EdgeId> SolverEdges;
 
   public:
-  
+
     /// \brief Iterator type for edges in the solver graph.
     typedef SolverEdges::iterator SolverEdgeItr;
 
@@ -55,9 +55,9 @@ namespace PBQP {
 
       HeuristicNodeData& getHeuristicData() { return hData; }
 
-      SolverEdgeItr addSolverEdge(Graph::EdgeItr eItr) {
+      SolverEdgeItr addSolverEdge(Graph::EdgeId eId) {
         ++solverDegree;
-        return solverEdges.insert(solverEdges.end(), eItr);
+        return solverEdges.insert(solverEdges.end(), eId);
       }
 
       void removeSolverEdge(SolverEdgeItr seItr) {
@@ -70,15 +70,15 @@ namespace PBQP {
       unsigned getSolverDegree() const { return solverDegree; }
       void clearSolverEdges() {
         solverDegree = 0;
-        solverEdges.clear(); 
+        solverEdges.clear();
       }
-      
+
     private:
       HeuristicNodeData hData;
       unsigned solverDegree;
       SolverEdges solverEdges;
     };
- 
+
     class EdgeData {
     public:
       HeuristicEdgeData& getHeuristicData() { return hData; }
@@ -104,7 +104,7 @@ namespace PBQP {
     Graph &g;
     HImpl h;
     Solution s;
-    std::vector<Graph::NodeItr> stack;
+    std::vector<Graph::NodeId> stack;
 
     typedef std::list<NodeData> NodeDataList;
     NodeDataList nodeDataList;
@@ -117,7 +117,7 @@ namespace PBQP {
     /// \brief Construct a heuristic solver implementation to solve the given
     ///        graph.
     /// @param g The graph representing the problem instance to be solved.
-    HeuristicSolverImpl(Graph &g) : g(g), h(*this) {}  
+    HeuristicSolverImpl(Graph &g) : g(g), h(*this) {}
 
     /// \brief Get the graph being solved by this solver.
     /// @return The graph representing the problem instance being solved by this
@@ -125,46 +125,46 @@ namespace PBQP {
     Graph& getGraph() { return g; }
 
     /// \brief Get the heuristic data attached to the given node.
-    /// @param nItr Node iterator.
+    /// @param nId Node id.
     /// @return The heuristic data attached to the given node.
-    HeuristicNodeData& getHeuristicNodeData(Graph::NodeItr nItr) {
-      return getSolverNodeData(nItr).getHeuristicData();
+    HeuristicNodeData& getHeuristicNodeData(Graph::NodeId nId) {
+      return getSolverNodeData(nId).getHeuristicData();
     }
 
     /// \brief Get the heuristic data attached to the given edge.
-    /// @param eItr Edge iterator.
+    /// @param eId Edge id.
     /// @return The heuristic data attached to the given node.
-    HeuristicEdgeData& getHeuristicEdgeData(Graph::EdgeItr eItr) {
-      return getSolverEdgeData(eItr).getHeuristicData();
+    HeuristicEdgeData& getHeuristicEdgeData(Graph::EdgeId eId) {
+      return getSolverEdgeData(eId).getHeuristicData();
     }
 
     /// \brief Begin iterator for the set of edges adjacent to the given node in
     ///        the solver graph.
-    /// @param nItr Node iterator.
+    /// @param nId Node id.
     /// @return Begin iterator for the set of edges adjacent to the given node
-    ///         in the solver graph. 
-    SolverEdgeItr solverEdgesBegin(Graph::NodeItr nItr) {
-      return getSolverNodeData(nItr).solverEdgesBegin();
+    ///         in the solver graph.
+    SolverEdgeItr solverEdgesBegin(Graph::NodeId nId) {
+      return getSolverNodeData(nId).solverEdgesBegin();
     }
 
     /// \brief End iterator for the set of edges adjacent to the given node in
     ///        the solver graph.
-    /// @param nItr Node iterator.
+    /// @param nId Node id.
     /// @return End iterator for the set of edges adjacent to the given node in
-    ///         the solver graph. 
-    SolverEdgeItr solverEdgesEnd(Graph::NodeItr nItr) {
-      return getSolverNodeData(nItr).solverEdgesEnd();
+    ///         the solver graph.
+    SolverEdgeItr solverEdgesEnd(Graph::NodeId nId) {
+      return getSolverNodeData(nId).solverEdgesEnd();
     }
 
     /// \brief Remove a node from the solver graph.
-    /// @param eItr Edge iterator for edge to be removed.
+    /// @param eId Edge id for edge to be removed.
     ///
     /// Does <i>not</i> notify the heuristic of the removal. That should be
     /// done manually if necessary.
-    void removeSolverEdge(Graph::EdgeItr eItr) {
-      EdgeData &eData = getSolverEdgeData(eItr);
-      NodeData &n1Data = getSolverNodeData(g.getEdgeNode1(eItr)),
-               &n2Data = getSolverNodeData(g.getEdgeNode2(eItr));
+    void removeSolverEdge(Graph::EdgeId eId) {
+      EdgeData &eData = getSolverEdgeData(eId);
+      NodeData &n1Data = getSolverNodeData(g.getEdgeNode1(eId)),
+               &n2Data = getSolverNodeData(g.getEdgeNode2(eId));
 
       n1Data.removeSolverEdge(eData.getN1SolverEdgeItr());
       n2Data.removeSolverEdge(eData.getN2SolverEdgeItr());
@@ -188,66 +188,66 @@ namespace PBQP {
     }
 
     /// \brief Add to the end of the stack.
-    /// @param nItr Node iterator to add to the reduction stack.
-    void pushToStack(Graph::NodeItr nItr) {
-      getSolverNodeData(nItr).clearSolverEdges();
-      stack.push_back(nItr);
+    /// @param nId Node id to add to the reduction stack.
+    void pushToStack(Graph::NodeId nId) {
+      getSolverNodeData(nId).clearSolverEdges();
+      stack.push_back(nId);
     }
 
     /// \brief Returns the solver degree of the given node.
-    /// @param nItr Node iterator for which degree is requested.
+    /// @param nId Node id for which degree is requested.
     /// @return Node degree in the <i>solver</i> graph (not the original graph).
-    unsigned getSolverDegree(Graph::NodeItr nItr) {
-      return  getSolverNodeData(nItr).getSolverDegree();
+    unsigned getSolverDegree(Graph::NodeId nId) {
+      return  getSolverNodeData(nId).getSolverDegree();
     }
 
     /// \brief Set the solution of the given node.
-    /// @param nItr Node iterator to set solution for.
+    /// @param nId Node id to set solution for.
     /// @param selection Selection for node.
-    void setSolution(const Graph::NodeItr &nItr, unsigned selection) {
-      s.setSelection(nItr, selection);
+    void setSolution(const Graph::NodeId &nId, unsigned selection) {
+      s.setSelection(nId, selection);
 
-      for (Graph::AdjEdgeItr aeItr = g.adjEdgesBegin(nItr),
-                             aeEnd = g.adjEdgesEnd(nItr);
+      for (Graph::AdjEdgeItr aeItr = g.adjEdgesBegin(nId),
+                             aeEnd = g.adjEdgesEnd(nId);
            aeItr != aeEnd; ++aeItr) {
-        Graph::EdgeItr eItr(*aeItr);
-        Graph::NodeItr anItr(g.getEdgeOtherNode(eItr, nItr));
-        getSolverNodeData(anItr).addSolverEdge(eItr);
+        Graph::EdgeId eId(*aeItr);
+        Graph::NodeId anId(g.getEdgeOtherNode(eId, nId));
+        getSolverNodeData(anId).addSolverEdge(eId);
       }
     }
 
     /// \brief Apply rule R0.
-    /// @param nItr Node iterator for node to apply R0 to.
+    /// @param nId Node id for node to apply R0 to.
     ///
     /// Node will be automatically pushed to the solver stack.
-    void applyR0(Graph::NodeItr nItr) {
-      assert(getSolverNodeData(nItr).getSolverDegree() == 0 &&
+    void applyR0(Graph::NodeId nId) {
+      assert(getSolverNodeData(nId).getSolverDegree() == 0 &&
              "R0 applied to node with degree != 0.");
 
       // Nothing to do. Just push the node onto the reduction stack.
-      pushToStack(nItr);
+      pushToStack(nId);
 
       s.recordR0();
     }
 
     /// \brief Apply rule R1.
-    /// @param xnItr Node iterator for node to apply R1 to.
+    /// @param xnId Node id for node to apply R1 to.
     ///
     /// Node will be automatically pushed to the solver stack.
-    void applyR1(Graph::NodeItr xnItr) {
-      NodeData &nd = getSolverNodeData(xnItr);
+    void applyR1(Graph::NodeId xnId) {
+      NodeData &nd = getSolverNodeData(xnId);
       assert(nd.getSolverDegree() == 1 &&
              "R1 applied to node with degree != 1.");
 
-      Graph::EdgeItr eItr = *nd.solverEdgesBegin();
+      Graph::EdgeId eId = *nd.solverEdgesBegin();
+
+      const Matrix &eCosts = g.getEdgeCosts(eId);
+      const Vector &xCosts = g.getNodeCosts(xnId);
 
-      const Matrix &eCosts = g.getEdgeCosts(eItr);
-      const Vector &xCosts = g.getNodeCosts(xnItr);
-      
       // Duplicate a little to avoid transposing matrices.
-      if (xnItr == g.getEdgeNode1(eItr)) {
-        Graph::NodeItr ynItr = g.getEdgeNode2(eItr);
-        Vector &yCosts = g.getNodeCosts(ynItr);
+      if (xnId == g.getEdgeNode1(eId)) {
+        Graph::NodeId ynId = g.getEdgeNode2(eId);
+        Vector &yCosts = g.getNodeCosts(ynId);
         for (unsigned j = 0; j < yCosts.getLength(); ++j) {
           PBQPNum min = eCosts[0][j] + xCosts[0];
           for (unsigned i = 1; i < xCosts.getLength(); ++i) {
@@ -257,10 +257,10 @@ namespace PBQP {
           }
           yCosts[j] += min;
         }
-        h.handleRemoveEdge(eItr, ynItr);
+        h.handleRemoveEdge(eId, ynId);
      } else {
-        Graph::NodeItr ynItr = g.getEdgeNode1(eItr);
-        Vector &yCosts = g.getNodeCosts(ynItr);
+        Graph::NodeId ynId = g.getEdgeNode1(eId);
+        Vector &yCosts = g.getNodeCosts(ynId);
         for (unsigned i = 0; i < yCosts.getLength(); ++i) {
           PBQPNum min = eCosts[i][0] + xCosts[0];
           for (unsigned j = 1; j < xCosts.getLength(); ++j) {
@@ -270,48 +270,48 @@ namespace PBQP {
           }
           yCosts[i] += min;
         }
-        h.handleRemoveEdge(eItr, ynItr);
+        h.handleRemoveEdge(eId, ynId);
       }
-      removeSolverEdge(eItr);
+      removeSolverEdge(eId);
       assert(nd.getSolverDegree() == 0 &&
              "Degree 1 with edge removed should be 0.");
-      pushToStack(xnItr);
+      pushToStack(xnId);
       s.recordR1();
     }
 
     /// \brief Apply rule R2.
-    /// @param xnItr Node iterator for node to apply R2 to.
+    /// @param xnId Node id for node to apply R2 to.
     ///
     /// Node will be automatically pushed to the solver stack.
-    void applyR2(Graph::NodeItr xnItr) {
-      assert(getSolverNodeData(xnItr).getSolverDegree() == 2 &&
+    void applyR2(Graph::NodeId xnId) {
+      assert(getSolverNodeData(xnId).getSolverDegree() == 2 &&
              "R2 applied to node with degree != 2.");
 
-      NodeData &nd = getSolverNodeData(xnItr);
-      const Vector &xCosts = g.getNodeCosts(xnItr);
+      NodeData &nd = getSolverNodeData(xnId);
+      const Vector &xCosts = g.getNodeCosts(xnId);
 
       SolverEdgeItr aeItr = nd.solverEdgesBegin();
-      Graph::EdgeItr yxeItr = *aeItr,
-                     zxeItr = *(++aeItr);
+      Graph::EdgeId yxeId = *aeItr,
+                    zxeId = *(++aeItr);
 
-      Graph::NodeItr ynItr = g.getEdgeOtherNode(yxeItr, xnItr),
-                     znItr = g.getEdgeOtherNode(zxeItr, xnItr);
+      Graph::NodeId ynId = g.getEdgeOtherNode(yxeId, xnId),
+                    znId = g.getEdgeOtherNode(zxeId, xnId);
 
-      bool flipEdge1 = (g.getEdgeNode1(yxeItr) == xnItr),
-           flipEdge2 = (g.getEdgeNode1(zxeItr) == xnItr);
+      bool flipEdge1 = (g.getEdgeNode1(yxeId) == xnId),
+           flipEdge2 = (g.getEdgeNode1(zxeId) == xnId);
 
       const Matrix *yxeCosts = flipEdge1 ?
-        new Matrix(g.getEdgeCosts(yxeItr).transpose()) :
-        &g.getEdgeCosts(yxeItr);
+        new Matrix(g.getEdgeCosts(yxeId).transpose()) :
+        &g.getEdgeCosts(yxeId);
 
       const Matrix *zxeCosts = flipEdge2 ?
-        new Matrix(g.getEdgeCosts(zxeItr).transpose()) :
-        &g.getEdgeCosts(zxeItr);
+        new Matrix(g.getEdgeCosts(zxeId).transpose()) :
+        &g.getEdgeCosts(zxeId);
 
       unsigned xLen = xCosts.getLength(),
                yLen = yxeCosts->getRows(),
                zLen = zxeCosts->getRows();
-               
+
       Matrix delta(yLen, zLen);
 
       for (unsigned i = 0; i < yLen; ++i) {
@@ -333,79 +333,79 @@ namespace PBQP {
       if (flipEdge2)
         delete zxeCosts;
 
-      Graph::EdgeItr yzeItr = g.findEdge(ynItr, znItr);
+      Graph::EdgeId yzeId = g.findEdge(ynId, znId);
       bool addedEdge = false;
 
-      if (yzeItr == g.edgesEnd()) {
-        yzeItr = g.addEdge(ynItr, znItr, delta);
+      if (yzeId == g.invalidEdgeId()) {
+        yzeId = g.addEdge(ynId, znId, delta);
         addedEdge = true;
       } else {
-        Matrix &yzeCosts = g.getEdgeCosts(yzeItr);
-        h.preUpdateEdgeCosts(yzeItr);
-        if (ynItr == g.getEdgeNode1(yzeItr)) {
+        Matrix &yzeCosts = g.getEdgeCosts(yzeId);
+        h.preUpdateEdgeCosts(yzeId);
+        if (ynId == g.getEdgeNode1(yzeId)) {
           yzeCosts += delta;
         } else {
           yzeCosts += delta.transpose();
         }
       }
 
-      bool nullCostEdge = tryNormaliseEdgeMatrix(yzeItr);
+      bool nullCostEdge = tryNormaliseEdgeMatrix(yzeId);
 
       if (!addedEdge) {
         // If we modified the edge costs let the heuristic know.
-        h.postUpdateEdgeCosts(yzeItr);
+        h.postUpdateEdgeCosts(yzeId);
       }
- 
+
       if (nullCostEdge) {
         // If this edge ended up null remove it.
         if (!addedEdge) {
           // We didn't just add it, so we need to notify the heuristic
           // and remove it from the solver.
-          h.handleRemoveEdge(yzeItr, ynItr);
-          h.handleRemoveEdge(yzeItr, znItr);
-          removeSolverEdge(yzeItr);
+          h.handleRemoveEdge(yzeId, ynId);
+          h.handleRemoveEdge(yzeId, znId);
+          removeSolverEdge(yzeId);
         }
-        g.removeEdge(yzeItr);
+        g.removeEdge(yzeId);
       } else if (addedEdge) {
         // If the edge was added, and non-null, finish setting it up, add it to
         // the solver & notify heuristic.
         edgeDataList.push_back(EdgeData());
-        g.setEdgeData(yzeItr, &edgeDataList.back());
-        addSolverEdge(yzeItr);
-        h.handleAddEdge(yzeItr);
+        g.setEdgeData(yzeId, &edgeDataList.back());
+        addSolverEdge(yzeId);
+        h.handleAddEdge(yzeId);
       }
 
-      h.handleRemoveEdge(yxeItr, ynItr);
-      removeSolverEdge(yxeItr);
-      h.handleRemoveEdge(zxeItr, znItr);
-      removeSolverEdge(zxeItr);
+      h.handleRemoveEdge(yxeId, ynId);
+      removeSolverEdge(yxeId);
+      h.handleRemoveEdge(zxeId, znId);
+      removeSolverEdge(zxeId);
 
-      pushToStack(xnItr);
+      pushToStack(xnId);
       s.recordR2();
     }
 
     /// \brief Record an application of the RN rule.
     ///
     /// For use by the HeuristicBase.
-    void recordRN() { s.recordRN(); } 
+    void recordRN() { s.recordRN(); }
 
   private:
 
-    NodeData& getSolverNodeData(Graph::NodeItr nItr) {
-      return *static_cast<NodeData*>(g.getNodeData(nItr));
+    NodeData& getSolverNodeData(Graph::NodeId nId) {
+      return *static_cast<NodeData*>(g.getNodeData(nId));
     }
 
-    EdgeData& getSolverEdgeData(Graph::EdgeItr eItr) {
-      return *static_cast<EdgeData*>(g.getEdgeData(eItr));
+    EdgeData& getSolverEdgeData(Graph::EdgeId eId) {
+      return *static_cast<EdgeData*>(g.getEdgeData(eId));
     }
 
-    void addSolverEdge(Graph::EdgeItr eItr) {
-      EdgeData &eData = getSolverEdgeData(eItr);
-      NodeData &n1Data = getSolverNodeData(g.getEdgeNode1(eItr)),
-               &n2Data = getSolverNodeData(g.getEdgeNode2(eItr));
+    void addSolverEdge(Graph::EdgeId eId) {
+      EdgeData &eData = getSolverEdgeData(eId);
+      NodeData &n1Data = getSolverNodeData(g.getEdgeNode1(eId)),
+               &n2Data = getSolverNodeData(g.getEdgeNode2(eId));
 
-      eData.setN1SolverEdgeItr(n1Data.addSolverEdge(eItr));
-      eData.setN2SolverEdgeItr(n2Data.addSolverEdge(eItr));
+      eData.setN1SolverEdgeItr(n1Data.addSolverEdge(eId));
+      eData.setN2SolverEdgeItr(n2Data.addSolverEdge(eId));
     }
 
     void setup() {
@@ -417,15 +417,15 @@ namespace PBQP {
       for (Graph::NodeItr nItr = g.nodesBegin(), nEnd = g.nodesEnd();
            nItr != nEnd; ++nItr) {
         nodeDataList.push_back(NodeData());
-        g.setNodeData(nItr, &nodeDataList.back());
+        g.setNodeData(*nItr, &nodeDataList.back());
       }
 
       // Create edge data objects.
       for (Graph::EdgeItr eItr = g.edgesBegin(), eEnd = g.edgesEnd();
            eItr != eEnd; ++eItr) {
         edgeDataList.push_back(EdgeData());
-        g.setEdgeData(eItr, &edgeDataList.back());
-        addSolverEdge(eItr);
+        g.setEdgeData(*eItr, &edgeDataList.back());
+        addSolverEdge(*eItr);
       }
     }
 
@@ -441,28 +441,30 @@ namespace PBQP {
       for (Graph::NodeItr nItr = g.nodesBegin(), nEnd = g.nodesEnd();
            nItr != nEnd; ++nItr) {
 
-        if (g.getNodeCosts(nItr).getLength() == 1) {
+        Graph::NodeId nId = *nItr;
+
+        if (g.getNodeCosts(nId).getLength() == 1) {
 
-          std::vector<Graph::EdgeItr> edgesToRemove;
+          std::vector<Graph::EdgeId> edgesToRemove;
 
-          for (Graph::AdjEdgeItr aeItr = g.adjEdgesBegin(nItr),
-                                 aeEnd = g.adjEdgesEnd(nItr);
+          for (Graph::AdjEdgeItr aeItr = g.adjEdgesBegin(nId),
+                                 aeEnd = g.adjEdgesEnd(nId);
                aeItr != aeEnd; ++aeItr) {
 
-            Graph::EdgeItr eItr = *aeItr;
+            Graph::EdgeId eId = *aeItr;
 
-            if (g.getEdgeNode1(eItr) == nItr) {
-              Graph::NodeItr otherNodeItr = g.getEdgeNode2(eItr);
-              g.getNodeCosts(otherNodeItr) +=
-                g.getEdgeCosts(eItr).getRowAsVector(0);
+            if (g.getEdgeNode1(eId) == nId) {
+              Graph::NodeId otherNodeId = g.getEdgeNode2(eId);
+              g.getNodeCosts(otherNodeId) +=
+                g.getEdgeCosts(eId).getRowAsVector(0);
             }
             else {
-              Graph::NodeItr otherNodeItr = g.getEdgeNode1(eItr);
-              g.getNodeCosts(otherNodeItr) +=
-                g.getEdgeCosts(eItr).getColAsVector(0);
+              Graph::NodeId otherNodeId = g.getEdgeNode1(eId);
+              g.getNodeCosts(otherNodeId) +=
+                g.getEdgeCosts(eId).getColAsVector(0);
             }
 
-            edgesToRemove.push_back(eItr);
+            edgesToRemove.push_back(eId);
           }
 
           if (!edgesToRemove.empty())
@@ -477,12 +479,12 @@ namespace PBQP {
     }
 
     void eliminateIndependentEdges() {
-      std::vector<Graph::EdgeItr> edgesToProcess;
+      std::vector<Graph::EdgeId> edgesToProcess;
       unsigned numEliminated = 0;
 
       for (Graph::EdgeItr eItr = g.edgesBegin(), eEnd = g.edgesEnd();
            eItr != eEnd; ++eItr) {
-        edgesToProcess.push_back(eItr);
+        edgesToProcess.push_back(*eItr);
       }
 
       while (!edgesToProcess.empty()) {
@@ -492,21 +494,21 @@ namespace PBQP {
       }
     }
 
-    bool tryToEliminateEdge(Graph::EdgeItr eItr) {
-      if (tryNormaliseEdgeMatrix(eItr)) {
-        g.removeEdge(eItr);
-        return true; 
+    bool tryToEliminateEdge(Graph::EdgeId eId) {
+      if (tryNormaliseEdgeMatrix(eId)) {
+        g.removeEdge(eId);
+        return true;
       }
       return false;
     }
 
-    bool tryNormaliseEdgeMatrix(Graph::EdgeItr &eItr) {
+    bool tryNormaliseEdgeMatrix(Graph::EdgeId &eId) {
 
       const PBQPNum infinity = std::numeric_limits<PBQPNum>::infinity();
 
-      Matrix &edgeCosts = g.getEdgeCosts(eItr);
-      Vector &uCosts = g.getNodeCosts(g.getEdgeNode1(eItr)),
-             &vCosts = g.getNodeCosts(g.getEdgeNode2(eItr));
+      Matrix &edgeCosts = g.getEdgeCosts(eId);
+      Vector &uCosts = g.getNodeCosts(g.getEdgeNode1(eId)),
+             &vCosts = g.getNodeCosts(g.getEdgeNode2(eId));
 
       for (unsigned r = 0; r < edgeCosts.getRows(); ++r) {
         PBQPNum rowMin = infinity;
@@ -554,34 +556,34 @@ namespace PBQP {
       }
     }
 
-    void computeSolution(Graph::NodeItr nItr) {
+    void computeSolution(Graph::NodeId nId) {
 
-      NodeData &nodeData = getSolverNodeData(nItr);
+      NodeData &nodeData = getSolverNodeData(nId);
 
-      Vector v(g.getNodeCosts(nItr));
+      Vector v(g.getNodeCosts(nId));
 
       // Solve based on existing solved edges.
       for (SolverEdgeItr solvedEdgeItr = nodeData.solverEdgesBegin(),
                          solvedEdgeEnd = nodeData.solverEdgesEnd();
            solvedEdgeItr != solvedEdgeEnd; ++solvedEdgeItr) {
 
-        Graph::EdgeItr eItr(*solvedEdgeItr);
-        Matrix &edgeCosts = g.getEdgeCosts(eItr);
+        Graph::EdgeId eId(*solvedEdgeItr);
+        Matrix &edgeCosts = g.getEdgeCosts(eId);
 
-        if (nItr == g.getEdgeNode1(eItr)) {
-          Graph::NodeItr adjNode(g.getEdgeNode2(eItr));
+        if (nId == g.getEdgeNode1(eId)) {
+          Graph::NodeId adjNode(g.getEdgeNode2(eId));
           unsigned adjSolution = s.getSelection(adjNode);
           v += edgeCosts.getColAsVector(adjSolution);
         }
         else {
-          Graph::NodeItr adjNode(g.getEdgeNode1(eItr));
+          Graph::NodeId adjNode(g.getEdgeNode1(eId));
           unsigned adjSolution = s.getSelection(adjNode);
           v += edgeCosts.getRowAsVector(adjSolution);
         }
 
       }
 
-      setSolution(nItr, v.minIndex());
+      setSolution(nId, v.minIndex());
     }
 
     void cleanup() {
diff --git a/include/llvm/CodeGen/PBQP/Heuristics/Briggs.h b/include/llvm/CodeGen/PBQP/Heuristics/Briggs.h
index 307d81e..c355c2c 100644
--- a/include/llvm/CodeGen/PBQP/Heuristics/Briggs.h
+++ b/include/llvm/CodeGen/PBQP/Heuristics/Briggs.h
@@ -27,7 +27,7 @@ namespace PBQP {
 
     /// \brief PBQP Heuristic which applies an allocability test based on
     ///        Briggs.
-    /// 
+    ///
     /// This heuristic assumes that the elements of cost vectors in the PBQP
     /// problem represent storage options, with the first being the spill
     /// option and subsequent elements representing legal registers for the
@@ -39,16 +39,16 @@ namespace PBQP {
     /// solver stack. If no nodes can be proven allocable then the node with
     /// the lowest estimated spill cost is selected and push to the solver stack
     /// instead.
-    /// 
-    /// This implementation is built on top of HeuristicBase.       
+    ///
+    /// This implementation is built on top of HeuristicBase.
     class Briggs : public HeuristicBase<Briggs> {
     private:
 
       class LinkDegreeComparator {
       public:
         LinkDegreeComparator(HeuristicSolverImpl<Briggs> &s) : s(&s) {}
-        bool operator()(Graph::NodeItr n1Itr, Graph::NodeItr n2Itr) const {
-          if (s->getSolverDegree(n1Itr) > s->getSolverDegree(n2Itr))
+        bool operator()(Graph::NodeId n1Id, Graph::NodeId n2Id) const {
+          if (s->getSolverDegree(n1Id) > s->getSolverDegree(n2Id))
             return true;
           return false;
         }
@@ -60,12 +60,12 @@ namespace PBQP {
       public:
         SpillCostComparator(HeuristicSolverImpl<Briggs> &s)
           : s(&s), g(&s.getGraph()) {}
-        bool operator()(Graph::NodeItr n1Itr, Graph::NodeItr n2Itr) const {
-          const PBQP::Vector &cv1 = g->getNodeCosts(n1Itr);
-          const PBQP::Vector &cv2 = g->getNodeCosts(n2Itr);
+        bool operator()(Graph::NodeId n1Id, Graph::NodeId n2Id) const {
+          const PBQP::Vector &cv1 = g->getNodeCosts(n1Id);
+          const PBQP::Vector &cv2 = g->getNodeCosts(n2Id);
 
-          PBQPNum cost1 = cv1[0] / s->getSolverDegree(n1Itr);
-          PBQPNum cost2 = cv2[0] / s->getSolverDegree(n2Itr);
+          PBQPNum cost1 = cv1[0] / s->getSolverDegree(n1Id);
+          PBQPNum cost2 = cv2[0] / s->getSolverDegree(n2Id);
 
           if (cost1 < cost2)
             return true;
@@ -77,10 +77,10 @@ namespace PBQP {
         Graph *g;
       };
 
-      typedef std::list<Graph::NodeItr> RNAllocableList;
+      typedef std::list<Graph::NodeId> RNAllocableList;
       typedef RNAllocableList::iterator RNAllocableListItr;
 
-      typedef std::list<Graph::NodeItr> RNUnallocableList;  
+      typedef std::list<Graph::NodeId> RNUnallocableList;
       typedef RNUnallocableList::iterator RNUnallocableListItr;
 
     public:
@@ -114,7 +114,7 @@ namespace PBQP {
 
       /// \brief Determine whether a node should be reduced using optimal
       ///        reduction.
-      /// @param nItr Node iterator to be considered.
+      /// @param nId Node id to be considered.
       /// @return True if the given node should be optimally reduced, false
       ///         otherwise.
       ///
@@ -123,8 +123,8 @@ namespace PBQP {
       /// infinite are checked for allocability first. Allocable nodes may be
       /// optimally reduced, but nodes whose allocability cannot be proven are
       /// selected for heuristic reduction instead.
-      bool shouldOptimallyReduce(Graph::NodeItr nItr) {
-        if (getSolver().getSolverDegree(nItr) < 3) {
+      bool shouldOptimallyReduce(Graph::NodeId nId) {
+        if (getSolver().getSolverDegree(nId) < 3) {
           return true;
         }
         // else
@@ -132,15 +132,15 @@ namespace PBQP {
       }
 
       /// \brief Add a node to the heuristic reduce list.
-      /// @param nItr Node iterator to add to the heuristic reduce list.
-      void addToHeuristicReduceList(Graph::NodeItr nItr) {
-        NodeData &nd = getHeuristicNodeData(nItr);
-        initializeNode(nItr);
+      /// @param nId Node id to add to the heuristic reduce list.
+      void addToHeuristicReduceList(Graph::NodeId nId) {
+        NodeData &nd = getHeuristicNodeData(nId);
+        initializeNode(nId);
         nd.isHeuristic = true;
         if (nd.isAllocable) {
-          nd.rnaItr = rnAllocableList.insert(rnAllocableList.end(), nItr);
+          nd.rnaItr = rnAllocableList.insert(rnAllocableList.end(), nId);
         } else {
-          nd.rnuItr = rnUnallocableList.insert(rnUnallocableList.end(), nItr);
+          nd.rnuItr = rnUnallocableList.insert(rnUnallocableList.end(), nId);
         }
       }
 
@@ -159,19 +159,19 @@ namespace PBQP {
           RNAllocableListItr rnaItr =
             min_element(rnAllocableList.begin(), rnAllocableList.end(),
                         LinkDegreeComparator(getSolver()));
-          Graph::NodeItr nItr = *rnaItr;
+          Graph::NodeId nId = *rnaItr;
           rnAllocableList.erase(rnaItr);
-          handleRemoveNode(nItr);
-          getSolver().pushToStack(nItr);
+          handleRemoveNode(nId);
+          getSolver().pushToStack(nId);
           return true;
         } else if (!rnUnallocableList.empty()) {
           RNUnallocableListItr rnuItr =
             min_element(rnUnallocableList.begin(), rnUnallocableList.end(),
                         SpillCostComparator(getSolver()));
-          Graph::NodeItr nItr = *rnuItr;
+          Graph::NodeId nId = *rnuItr;
           rnUnallocableList.erase(rnuItr);
-          handleRemoveNode(nItr);
-          getSolver().pushToStack(nItr);
+          handleRemoveNode(nId);
+          getSolver().pushToStack(nId);
           return true;
         }
         // else
@@ -179,43 +179,43 @@ namespace PBQP {
       }
 
       /// \brief Prepare a change in the costs on the given edge.
-      /// @param eItr Edge iterator.    
-      void preUpdateEdgeCosts(Graph::EdgeItr eItr) {
+      /// @param eId Edge id.
+      void preUpdateEdgeCosts(Graph::EdgeId eId) {
         Graph &g = getGraph();
-        Graph::NodeItr n1Itr = g.getEdgeNode1(eItr),
-                       n2Itr = g.getEdgeNode2(eItr);
-        NodeData &n1 = getHeuristicNodeData(n1Itr),
-                 &n2 = getHeuristicNodeData(n2Itr);
+        Graph::NodeId n1Id = g.getEdgeNode1(eId),
+                      n2Id = g.getEdgeNode2(eId);
+        NodeData &n1 = getHeuristicNodeData(n1Id),
+                 &n2 = getHeuristicNodeData(n2Id);
 
         if (n1.isHeuristic)
-          subtractEdgeContributions(eItr, getGraph().getEdgeNode1(eItr));
+          subtractEdgeContributions(eId, getGraph().getEdgeNode1(eId));
         if (n2.isHeuristic)
-          subtractEdgeContributions(eItr, getGraph().getEdgeNode2(eItr));
+          subtractEdgeContributions(eId, getGraph().getEdgeNode2(eId));
 
-        EdgeData &ed = getHeuristicEdgeData(eItr);
+        EdgeData &ed = getHeuristicEdgeData(eId);
         ed.isUpToDate = false;
       }
 
       /// \brief Handle the change in the costs on the given edge.
-      /// @param eItr Edge iterator.
-      void postUpdateEdgeCosts(Graph::EdgeItr eItr) {
+      /// @param eId Edge id.
+      void postUpdateEdgeCosts(Graph::EdgeId eId) {
         // This is effectively the same as adding a new edge now, since
         // we've factored out the costs of the old one.
-        handleAddEdge(eItr);
+        handleAddEdge(eId);
       }
 
       /// \brief Handle the addition of a new edge into the PBQP graph.
-      /// @param eItr Edge iterator for the added edge.
+      /// @param eId Edge id for the added edge.
       ///
       /// Updates allocability of any nodes connected by this edge which are
       /// being managed by the heuristic. If allocability changes they are
       /// moved to the appropriate list.
-      void handleAddEdge(Graph::EdgeItr eItr) {
+      void handleAddEdge(Graph::EdgeId eId) {
         Graph &g = getGraph();
-        Graph::NodeItr n1Itr = g.getEdgeNode1(eItr),
-                       n2Itr = g.getEdgeNode2(eItr);
-        NodeData &n1 = getHeuristicNodeData(n1Itr),
-                 &n2 = getHeuristicNodeData(n2Itr);
+        Graph::NodeId n1Id = g.getEdgeNode1(eId),
+                      n2Id = g.getEdgeNode2(eId);
+        NodeData &n1 = getHeuristicNodeData(n1Id),
+                 &n2 = getHeuristicNodeData(n2Id);
 
         // If neither node is managed by the heuristic there's nothing to be
         // done.
@@ -223,60 +223,60 @@ namespace PBQP {
           return;
 
         // Ok - we need to update at least one node.
-        computeEdgeContributions(eItr);
+        computeEdgeContributions(eId);
 
         // Update node 1 if it's managed by the heuristic.
         if (n1.isHeuristic) {
           bool n1WasAllocable = n1.isAllocable;
-          addEdgeContributions(eItr, n1Itr);
-          updateAllocability(n1Itr);
+          addEdgeContributions(eId, n1Id);
+          updateAllocability(n1Id);
           if (n1WasAllocable && !n1.isAllocable) {
             rnAllocableList.erase(n1.rnaItr);
             n1.rnuItr =
-              rnUnallocableList.insert(rnUnallocableList.end(), n1Itr);
+              rnUnallocableList.insert(rnUnallocableList.end(), n1Id);
           }
         }
 
         // Likewise for node 2.
         if (n2.isHeuristic) {
           bool n2WasAllocable = n2.isAllocable;
-          addEdgeContributions(eItr, n2Itr);
-          updateAllocability(n2Itr);
+          addEdgeContributions(eId, n2Id);
+          updateAllocability(n2Id);
           if (n2WasAllocable && !n2.isAllocable) {
             rnAllocableList.erase(n2.rnaItr);
             n2.rnuItr =
-              rnUnallocableList.insert(rnUnallocableList.end(), n2Itr);
+              rnUnallocableList.insert(rnUnallocableList.end(), n2Id);
           }
         }
       }
 
       /// \brief Handle disconnection of an edge from a node.
-      /// @param eItr Edge iterator for edge being disconnected.
-      /// @param nItr Node iterator for the node being disconnected from.
+      /// @param eId Edge id for edge being disconnected.
+      /// @param nId Node id for the node being disconnected from.
       ///
       /// Updates allocability of the given node and, if appropriate, moves the
       /// node to a new list.
-      void handleRemoveEdge(Graph::EdgeItr eItr, Graph::NodeItr nItr) {
-        NodeData &nd = getHeuristicNodeData(nItr);
+      void handleRemoveEdge(Graph::EdgeId eId, Graph::NodeId nId) {
+        NodeData &nd =getHeuristicNodeData(nId);
 
         // If the node is not managed by the heuristic there's nothing to be
         // done.
         if (!nd.isHeuristic)
           return;
 
-        EdgeData &ed = getHeuristicEdgeData(eItr);
+        EdgeData &ed = getHeuristicEdgeData(eId);
         (void)ed;
         assert(ed.isUpToDate && "Edge data is not up to date.");
 
         // Update node.
         bool ndWasAllocable = nd.isAllocable;
-        subtractEdgeContributions(eItr, nItr);
-        updateAllocability(nItr);
+        subtractEdgeContributions(eId, nId);
+        updateAllocability(nId);
 
         // If the node has gone optimal...
-        if (shouldOptimallyReduce(nItr)) {
+        if (shouldOptimallyReduce(nId)) {
           nd.isHeuristic = false;
-          addToOptimalReduceList(nItr);
+          addToOptimalReduceList(nId);
           if (ndWasAllocable) {
             rnAllocableList.erase(nd.rnaItr);
           } else {
@@ -287,36 +287,36 @@ namespace PBQP {
           // from "unallocable" to "allocable".
           if (!ndWasAllocable && nd.isAllocable) {
             rnUnallocableList.erase(nd.rnuItr);
-            nd.rnaItr = rnAllocableList.insert(rnAllocableList.end(), nItr);
+            nd.rnaItr = rnAllocableList.insert(rnAllocableList.end(), nId);
           }
         }
       }
 
     private:
 
-      NodeData& getHeuristicNodeData(Graph::NodeItr nItr) {
-        return getSolver().getHeuristicNodeData(nItr);
+      NodeData& getHeuristicNodeData(Graph::NodeId nId) {
+        return getSolver().getHeuristicNodeData(nId);
       }
 
-      EdgeData& getHeuristicEdgeData(Graph::EdgeItr eItr) {
-        return getSolver().getHeuristicEdgeData(eItr);
+      EdgeData& getHeuristicEdgeData(Graph::EdgeId eId) {
+        return getSolver().getHeuristicEdgeData(eId);
       }
 
       // Work out what this edge will contribute to the allocability of the
       // nodes connected to it.
-      void computeEdgeContributions(Graph::EdgeItr eItr) {
-        EdgeData &ed = getHeuristicEdgeData(eItr);
+      void computeEdgeContributions(Graph::EdgeId eId) {
+        EdgeData &ed = getHeuristicEdgeData(eId);
 
         if (ed.isUpToDate)
           return; // Edge data is already up to date.
 
-        Matrix &eCosts = getGraph().getEdgeCosts(eItr);
+        Matrix &eCosts = getGraph().getEdgeCosts(eId);
 
         unsigned numRegs = eCosts.getRows() - 1,
                  numReverseRegs = eCosts.getCols() - 1;
 
         std::vector<unsigned> rowInfCounts(numRegs, 0),
-                              colInfCounts(numReverseRegs, 0);        
+                              colInfCounts(numReverseRegs, 0);
 
         ed.worst = 0;
         ed.reverseWorst = 0;
@@ -348,19 +348,19 @@ namespace PBQP {
         ed.isUpToDate = true;
       }
 
-      // Add the contributions of the given edge to the given node's 
+      // Add the contributions of the given edge to the given node's
       // numDenied and safe members. No action is taken other than to update
       // these member values. Once updated these numbers can be used by clients
       // to update the node's allocability.
-      void addEdgeContributions(Graph::EdgeItr eItr, Graph::NodeItr nItr) {
-        EdgeData &ed = getHeuristicEdgeData(eItr);
+      void addEdgeContributions(Graph::EdgeId eId, Graph::NodeId nId) {
+        EdgeData &ed = getHeuristicEdgeData(eId);
 
         assert(ed.isUpToDate && "Using out-of-date edge numbers.");
 
-        NodeData &nd = getHeuristicNodeData(nItr);
-        unsigned numRegs = getGraph().getNodeCosts(nItr).getLength() - 1;
-        
-        bool nIsNode1 = nItr == getGraph().getEdgeNode1(eItr);
+        NodeData &nd = getHeuristicNodeData(nId);
+        unsigned numRegs = getGraph().getNodeCosts(nId).getLength() - 1;
+
+        bool nIsNode1 = nId == getGraph().getEdgeNode1(eId);
         EdgeData::UnsafeArray &unsafe =
           nIsNode1 ? ed.unsafe : ed.reverseUnsafe;
         nd.numDenied += nIsNode1 ? ed.worst : ed.reverseWorst;
@@ -375,25 +375,25 @@ namespace PBQP {
         }
       }
 
-      // Subtract the contributions of the given edge to the given node's 
+      // Subtract the contributions of the given edge to the given node's
       // numDenied and safe members. No action is taken other than to update
       // these member values. Once updated these numbers can be used by clients
       // to update the node's allocability.
-      void subtractEdgeContributions(Graph::EdgeItr eItr, Graph::NodeItr nItr) {
-        EdgeData &ed = getHeuristicEdgeData(eItr);
+      void subtractEdgeContributions(Graph::EdgeId eId, Graph::NodeId nId) {
+        EdgeData &ed = getHeuristicEdgeData(eId);
 
         assert(ed.isUpToDate && "Using out-of-date edge numbers.");
 
-        NodeData &nd = getHeuristicNodeData(nItr);
-        unsigned numRegs = getGraph().getNodeCosts(nItr).getLength() - 1;
-        
-        bool nIsNode1 = nItr == getGraph().getEdgeNode1(eItr);
+        NodeData &nd = getHeuristicNodeData(nId);
+        unsigned numRegs = getGraph().getNodeCosts(nId).getLength() - 1;
+
+        bool nIsNode1 = nId == getGraph().getEdgeNode1(eId);
         EdgeData::UnsafeArray &unsafe =
           nIsNode1 ? ed.unsafe : ed.reverseUnsafe;
         nd.numDenied -= nIsNode1 ? ed.worst : ed.reverseWorst;
 
         for (unsigned r = 0; r < numRegs; ++r) {
-          if (unsafe[r]) { 
+          if (unsafe[r]) {
             if (nd.unsafeDegrees[r] == 1) {
               ++nd.numSafe;
             }
@@ -402,22 +402,22 @@ namespace PBQP {
         }
       }
 
-      void updateAllocability(Graph::NodeItr nItr) {
-        NodeData &nd = getHeuristicNodeData(nItr);
-        unsigned numRegs = getGraph().getNodeCosts(nItr).getLength() - 1;
+      void updateAllocability(Graph::NodeId nId) {
+        NodeData &nd = getHeuristicNodeData(nId);
+        unsigned numRegs = getGraph().getNodeCosts(nId).getLength() - 1;
         nd.isAllocable = nd.numDenied < numRegs || nd.numSafe > 0;
       }
 
-      void initializeNode(Graph::NodeItr nItr) {
-        NodeData &nd = getHeuristicNodeData(nItr);
+      void initializeNode(Graph::NodeId nId) {
+        NodeData &nd = getHeuristicNodeData(nId);
 
         if (nd.isInitialized)
           return; // Node data is already up to date.
 
-        unsigned numRegs = getGraph().getNodeCosts(nItr).getLength() - 1;
+        unsigned numRegs = getGraph().getNodeCosts(nId).getLength() - 1;
 
         nd.numDenied = 0;
-        const Vector& nCosts = getGraph().getNodeCosts(nItr);
+        const Vector& nCosts = getGraph().getNodeCosts(nId);
         for (unsigned i = 1; i < nCosts.getLength(); ++i) {
           if (nCosts[i] == std::numeric_limits<PBQPNum>::infinity())
             ++nd.numDenied;
@@ -428,27 +428,27 @@ namespace PBQP {
 
         typedef HeuristicSolverImpl<Briggs>::SolverEdgeItr SolverEdgeItr;
 
-        for (SolverEdgeItr aeItr = getSolver().solverEdgesBegin(nItr),
-                           aeEnd = getSolver().solverEdgesEnd(nItr);
+        for (SolverEdgeItr aeItr = getSolver().solverEdgesBegin(nId),
+                           aeEnd = getSolver().solverEdgesEnd(nId);
              aeItr != aeEnd; ++aeItr) {
-          
-          Graph::EdgeItr eItr = *aeItr;
-          computeEdgeContributions(eItr);
-          addEdgeContributions(eItr, nItr);
+
+          Graph::EdgeId eId = *aeItr;
+          computeEdgeContributions(eId);
+          addEdgeContributions(eId, nId);
         }
 
-        updateAllocability(nItr);
+        updateAllocability(nId);
         nd.isInitialized = true;
       }
 
-      void handleRemoveNode(Graph::NodeItr xnItr) {
+      void handleRemoveNode(Graph::NodeId xnId) {
         typedef HeuristicSolverImpl<Briggs>::SolverEdgeItr SolverEdgeItr;
-        std::vector<Graph::EdgeItr> edgesToRemove;
-        for (SolverEdgeItr aeItr = getSolver().solverEdgesBegin(xnItr),
-                           aeEnd = getSolver().solverEdgesEnd(xnItr);
+        std::vector<Graph::EdgeId> edgesToRemove;
+        for (SolverEdgeItr aeItr = getSolver().solverEdgesBegin(xnId),
+                           aeEnd = getSolver().solverEdgesEnd(xnId);
              aeItr != aeEnd; ++aeItr) {
-          Graph::NodeItr ynItr = getGraph().getEdgeOtherNode(*aeItr, xnItr);
-          handleRemoveEdge(*aeItr, ynItr);
+          Graph::NodeId ynId = getGraph().getEdgeOtherNode(*aeItr, xnId);
+          handleRemoveEdge(*aeItr, ynId);
           edgesToRemove.push_back(*aeItr);
         }
         while (!edgesToRemove.empty()) {
diff --git a/include/llvm/CodeGen/PBQP/Solution.h b/include/llvm/CodeGen/PBQP/Solution.h
index b9f288b..091805d 100644
--- a/include/llvm/CodeGen/PBQP/Solution.h
+++ b/include/llvm/CodeGen/PBQP/Solution.h
@@ -26,8 +26,7 @@ namespace PBQP {
   class Solution {
   private:
 
-    typedef std::map<Graph::ConstNodeItr, unsigned,
-                     NodeItrComparator> SelectionsMap;
+    typedef std::map<Graph::NodeId, unsigned> SelectionsMap;
     SelectionsMap selections;
 
     unsigned r0Reductions, r1Reductions, r2Reductions, rNReductions;
@@ -71,17 +70,17 @@ namespace PBQP {
     unsigned numRNReductions() const { return rNReductions; }
 
     /// \brief Set the selection for a given node.
-    /// @param nItr Node iterator.
-    /// @param selection Selection for nItr.
-    void setSelection(Graph::NodeItr nItr, unsigned selection) {
-      selections[nItr] = selection;
+    /// @param nodeId Node id.
+    /// @param selection Selection for nodeId.
+    void setSelection(Graph::NodeId nodeId, unsigned selection) {
+      selections[nodeId] = selection;
     }
 
     /// \brief Get a node's selection.
-    /// @param nItr Node iterator.
-    /// @return The selection for nItr;
-    unsigned getSelection(Graph::ConstNodeItr nItr) const {
-      SelectionsMap::const_iterator sItr = selections.find(nItr);
+    /// @param nodeId Node id.
+    /// @return The selection for nodeId;
+    unsigned getSelection(Graph::NodeId nodeId) const {
+      SelectionsMap::const_iterator sItr = selections.find(nodeId);
       assert(sItr != selections.end() && "No selection for node.");
       return sItr->second;
     }
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 9b6f61e..ae4a2fa 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -21,19 +21,22 @@
 
 namespace llvm {
 
-  class FunctionPass;
-  class MachineFunctionPass;
-  class PassInfo;
-  class PassManagerBase;
-  class TargetLoweringBase;
-  class TargetLowering;
-  class TargetRegisterClass;
-  class raw_ostream;
-}
-
-namespace llvm {
-
+class FunctionPass;
+class MachineFunctionPass;
 class PassConfigImpl;
+class PassInfo;
+class ScheduleDAGInstrs;
+class TargetLowering;
+class TargetLoweringBase;
+class TargetRegisterClass;
+class raw_ostream;
+struct MachineSchedContext;
+
+// The old pass manager infrastructure is hidden in a legacy namespace now.
+namespace legacy {
+class PassManagerBase;
+}
+using legacy::PassManagerBase;
 
 /// Discriminated union of Pass ID types.
 ///
@@ -204,6 +207,20 @@ public:
   /// Fully developed targets will not generally override this.
   virtual void addMachinePasses();
 
+  /// createTargetScheduler - Create an instance of ScheduleDAGInstrs to be run
+  /// within the standard MachineScheduler pass for this function and target at
+  /// the current optimization level.
+  ///
+  /// This can also be used to plug a new MachineSchedStrategy into an instance
+  /// of the standard ScheduleDAGMI:
+  ///   return new ScheduleDAGMI(C, new MyStrategy(C))
+  ///
+  /// Return NULL to select the default (generic) machine scheduler.
+  virtual ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const {
+    return 0;
+  }
+
 protected:
   // Helper to verify the analysis is really immutable.
   void setOpt(bool &Opt, bool Val);
@@ -365,14 +382,6 @@ namespace llvm {
   /// these register allocator like this: AU.addRequiredID(PHIEliminationID);
   extern char &PHIEliminationID;
 
-  /// StrongPHIElimination - This pass eliminates machine instruction PHI
-  /// nodes by inserting copy instructions.  This destroys SSA information, but
-  /// is the desired input for some register allocators.  This pass is
-  /// "required" by these register allocator like this:
-  ///    AU.addRequiredID(PHIEliminationID);
-  ///  This pass is still in development
-  extern char &StrongPHIEliminationID;
-
   /// LiveIntervals - This analysis keeps track of the live ranges of virtual
   /// and physical registers.
   extern char &LiveIntervalsID;
diff --git a/include/llvm/CodeGen/PseudoSourceValue.h b/include/llvm/CodeGen/PseudoSourceValue.h
index df74d088..705086c 100644
--- a/include/llvm/CodeGen/PseudoSourceValue.h
+++ b/include/llvm/CodeGen/PseudoSourceValue.h
@@ -44,7 +44,7 @@ namespace llvm {
     virtual bool isAliased(const MachineFrameInfo *) const;
 
     /// mayAlias - Return true if the memory pointed to by this
-    /// PseudoSourceValue can ever alias a LLVM IR Value.
+    /// PseudoSourceValue can ever alias an LLVM IR Value.
     virtual bool mayAlias(const MachineFrameInfo *) const;
 
     /// classof - Methods for support type inquiry through isa, cast, and
diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h
index 6f2d139..7472e5a 100644
--- a/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/include/llvm/CodeGen/RegAllocPBQP.h
@@ -52,22 +52,22 @@ namespace llvm {
     /// PBQPBuilder you are unlikely to need this: Nodes and options for all
     /// vregs will already have been set up for you by the base class. 
     template <typename AllowedRegsItr>
-    void recordVReg(unsigned vreg, PBQP::Graph::NodeItr node,
+    void recordVReg(unsigned vreg, PBQP::Graph::NodeId nodeId,
                     AllowedRegsItr arBegin, AllowedRegsItr arEnd) {
-      assert(node2VReg.find(node) == node2VReg.end() && "Re-mapping node.");
+      assert(node2VReg.find(nodeId) == node2VReg.end() && "Re-mapping node.");
       assert(vreg2Node.find(vreg) == vreg2Node.end() && "Re-mapping vreg.");
       assert(allowedSets[vreg].empty() && "vreg already has pregs.");
 
-      node2VReg[node] = vreg;
-      vreg2Node[vreg] = node;
+      node2VReg[nodeId] = vreg;
+      vreg2Node[vreg] = nodeId;
       std::copy(arBegin, arEnd, std::back_inserter(allowedSets[vreg]));
     }
 
     /// Get the virtual register corresponding to the given PBQP node.
-    unsigned getVRegForNode(PBQP::Graph::ConstNodeItr node) const;
+    unsigned getVRegForNode(PBQP::Graph::NodeId nodeId) const;
 
     /// Get the PBQP node corresponding to the given virtual register.
-    PBQP::Graph::NodeItr getNodeForVReg(unsigned vreg) const;
+    PBQP::Graph::NodeId getNodeForVReg(unsigned vreg) const;
 
     /// Returns true if the given PBQP option represents a physical register,
     /// false otherwise.
@@ -92,9 +92,8 @@ namespace llvm {
 
   private:
 
-    typedef std::map<PBQP::Graph::ConstNodeItr, unsigned,
-                     PBQP::NodeItrComparator>  Node2VReg;
-    typedef DenseMap<unsigned, PBQP::Graph::NodeItr> VReg2Node;
+    typedef std::map<PBQP::Graph::NodeId, unsigned>  Node2VReg;
+    typedef DenseMap<unsigned, PBQP::Graph::NodeId> VReg2Node;
     typedef DenseMap<unsigned, AllowedSet> AllowedSetMap;
 
     PBQP::Graph graph;
diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
index 8a0a8f3..a801d1d 100644
--- a/include/llvm/CodeGen/RegisterPressure.h
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -22,7 +22,7 @@
 namespace llvm {
 
 class LiveIntervals;
-class LiveInterval;
+class LiveRange;
 class RegisterClassInfo;
 class MachineInstr;
 
@@ -89,20 +89,89 @@ struct RegionPressure : RegisterPressure {
   void openBottom(MachineBasicBlock::const_iterator PrevBottom);
 };
 
-/// An element of pressure difference that identifies the pressure set and
-/// amount of increase or decrease in units of pressure.
-struct PressureElement {
-  unsigned PSetID;
-  int UnitIncrease;
+/// Capture a change in pressure for a single pressure set. UnitInc may be
+/// expressed in terms of upward or downward pressure depending on the client
+/// and will be dynamically adjusted for current liveness.
+///
+/// Pressure increments are tiny, typically 1-2 units, and this is only for
+/// heuristics, so we don't check UnitInc overflow. Instead, we may have a
+/// higher level assert that pressure is consistent within a region. We also
+/// effectively ignore dead defs which don't affect heuristics much.
+class PressureChange {
+  uint16_t PSetID; // ID+1. 0=Invalid.
+  int16_t  UnitInc;
+public:
+  PressureChange(): PSetID(0), UnitInc(0) {}
+  PressureChange(unsigned id): PSetID(id+1), UnitInc(0) {
+    assert(id < UINT16_MAX && "PSetID overflow.");
+  }
+
+  bool isValid() const { return PSetID > 0; }
+
+  unsigned getPSet() const {
+    assert(isValid() && "invalid PressureChange");
+    return PSetID - 1;
+  }
+  // If PSetID is invalid, return UINT16_MAX to give it lowest priority.
+  unsigned getPSetOrMax() const { return (PSetID - 1) & UINT16_MAX; }
+
+  int getUnitInc() const { return UnitInc; }
 
-  PressureElement(): PSetID(~0U), UnitIncrease(0) {}
-  PressureElement(unsigned id, int inc): PSetID(id), UnitIncrease(inc) {}
+  void setUnitInc(int Inc) { UnitInc = Inc; }
 
-  bool isValid() const { return PSetID != ~0U; }
+  bool operator==(const PressureChange &RHS) const {
+    return PSetID == RHS.PSetID && UnitInc == RHS.UnitInc;
+  }
+};
 
-  // If signed PSetID is negative, it is invalid; convert it to INT_MAX to give
-  // it lowest priority.
-  int PSetRank() const { return PSetID & INT_MAX; }
+template <> struct isPodLike<PressureChange> {
+   static const bool value = true;
+};
+
+/// List of PressureChanges in order of increasing, unique PSetID.
+///
+/// Use a small fixed number, because we can fit more PressureChanges in an
+/// empty SmallVector than ever need to be tracked per register class. If more
+/// PSets are affected, then we only track the most constrained.
+class PressureDiff {
+  // The initial design was for MaxPSets=4, but that requires PSet partitions,
+  // which are not yet implemented. (PSet partitions are equivalent PSets given
+  // the register classes actually in use within the scheduling region.)
+  enum { MaxPSets = 16 };
+
+  PressureChange PressureChanges[MaxPSets];
+public:
+  typedef PressureChange* iterator;
+  typedef const PressureChange* const_iterator;
+  iterator begin() { return &PressureChanges[0]; }
+  iterator end() { return &PressureChanges[MaxPSets]; }
+  const_iterator begin() const { return &PressureChanges[0]; }
+  const_iterator end() const { return &PressureChanges[MaxPSets]; }
+
+  void addPressureChange(unsigned RegUnit, bool IsDec,
+                         const MachineRegisterInfo *MRI);
+};
+
+/// Array of PressureDiffs.
+class PressureDiffs {
+  PressureDiff *PDiffArray;
+  unsigned Size;
+  unsigned Max;
+public:
+  PressureDiffs(): PDiffArray(0), Size(0), Max(0) {}
+  ~PressureDiffs() { free(PDiffArray); }
+
+  void clear() { Size = 0; }
+
+  void init(unsigned N);
+
+  PressureDiff &operator[](unsigned Idx) {
+    assert(Idx < Size && "PressureDiff index out of bounds");
+    return PDiffArray[Idx];
+  }
+  const PressureDiff &operator[](unsigned Idx) const {
+    return const_cast<PressureDiffs*>(this)->operator[](Idx);
+  }
 };
 
 /// Store the effects of a change in pressure on things that MI scheduler cares
@@ -120,11 +189,19 @@ struct PressureElement {
 /// CurrentMax records the largest increase in the tracker's max pressure that
 /// exceeds the current limit for some pressure set determined by the client.
 struct RegPressureDelta {
-  PressureElement Excess;
-  PressureElement CriticalMax;
-  PressureElement CurrentMax;
+  PressureChange Excess;
+  PressureChange CriticalMax;
+  PressureChange CurrentMax;
 
   RegPressureDelta() {}
+
+  bool operator==(const RegPressureDelta &RHS) const {
+    return Excess == RHS.Excess && CriticalMax == RHS.CriticalMax
+      && CurrentMax == RHS.CurrentMax;
+  }
+  bool operator!=(const RegPressureDelta &RHS) const {
+    return !operator==(RHS);
+  }
 };
 
 /// \brief A set of live virtual registers and physical register units.
@@ -135,7 +212,7 @@ struct LiveRegSet {
   SparseSet<unsigned> PhysRegs;
   SparseSet<unsigned, VirtReg2IndexFunctor> VirtRegs;
 
-  bool contains(unsigned Reg) {
+  bool contains(unsigned Reg) const {
     if (TargetRegisterInfo::isVirtualRegister(Reg))
       return VirtRegs.count(Reg);
     return PhysRegs.count(Reg);
@@ -215,6 +292,8 @@ public:
     MF(0), TRI(0), RCI(0), LIS(0), MBB(0), P(rp), RequireIntervals(false),
     TrackUntiedDefs(false) {}
 
+  void reset();
+
   void init(const MachineFunction *mf, const RegisterClassInfo *rci,
             const LiveIntervals *lis, const MachineBasicBlock *mbb,
             MachineBasicBlock::const_iterator pos,
@@ -239,7 +318,7 @@ public:
   SlotIndex getCurrSlot() const;
 
   /// Recede across the previous instruction.
-  bool recede();
+  bool recede(SmallVectorImpl<unsigned> *LiveUses = 0, PressureDiff *PDiff = 0);
 
   /// Advance across the current instruction.
   bool advance();
@@ -282,31 +361,39 @@ public:
   /// limit based on the tracker's current pressure, and record the number of
   /// excess register units of that pressure set introduced by this instruction.
   void getMaxUpwardPressureDelta(const MachineInstr *MI,
+                                 PressureDiff *PDiff,
                                  RegPressureDelta &Delta,
-                                 ArrayRef<PressureElement> CriticalPSets,
+                                 ArrayRef<PressureChange> CriticalPSets,
                                  ArrayRef<unsigned> MaxPressureLimit);
 
+  void getUpwardPressureDelta(const MachineInstr *MI,
+                              /*const*/ PressureDiff &PDiff,
+                              RegPressureDelta &Delta,
+                              ArrayRef<PressureChange> CriticalPSets,
+                              ArrayRef<unsigned> MaxPressureLimit) const;
+
   /// Consider the pressure increase caused by traversing this instruction
   /// top-down. Find the pressure set with the most change beyond its pressure
   /// limit based on the tracker's current pressure, and record the number of
   /// excess register units of that pressure set introduced by this instruction.
   void getMaxDownwardPressureDelta(const MachineInstr *MI,
                                    RegPressureDelta &Delta,
-                                   ArrayRef<PressureElement> CriticalPSets,
+                                   ArrayRef<PressureChange> CriticalPSets,
                                    ArrayRef<unsigned> MaxPressureLimit);
 
   /// Find the pressure set with the most change beyond its pressure limit after
   /// traversing this instruction either upward or downward depending on the
   /// closed end of the current region.
-  void getMaxPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
-                           ArrayRef<PressureElement> CriticalPSets,
+  void getMaxPressureDelta(const MachineInstr *MI,
+                           RegPressureDelta &Delta,
+                           ArrayRef<PressureChange> CriticalPSets,
                            ArrayRef<unsigned> MaxPressureLimit) {
     if (isTopClosed())
       return getMaxDownwardPressureDelta(MI, Delta, CriticalPSets,
                                          MaxPressureLimit);
 
     assert(isBottomClosed() && "Uninitialized pressure tracker");
-    return getMaxUpwardPressureDelta(MI, Delta, CriticalPSets,
+    return getMaxUpwardPressureDelta(MI, 0, Delta, CriticalPSets,
                                      MaxPressureLimit);
   }
 
@@ -337,7 +424,7 @@ public:
   void dump() const;
 
 protected:
-  const LiveInterval *getInterval(unsigned Reg) const;
+  const LiveRange *getLiveRange(unsigned Reg) const;
 
   void increaseRegPressure(ArrayRef<unsigned> Regs);
   void decreaseRegPressure(ArrayRef<unsigned> Regs);
diff --git a/include/llvm/CodeGen/RuntimeLibcalls.h b/include/llvm/CodeGen/RuntimeLibcalls.h
index 41289a4..009b8a0 100644
--- a/include/llvm/CodeGen/RuntimeLibcalls.h
+++ b/include/llvm/CodeGen/RuntimeLibcalls.h
@@ -1,4 +1,4 @@
-//===-- CodeGen/RuntimeLibcall.h - Runtime Library Calls --------*- C++ -*-===//
+//===-- CodeGen/RuntimeLibcalls.h - Runtime Library Calls -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -188,6 +188,11 @@ namespace RTLIB {
     NEARBYINT_F80,
     NEARBYINT_F128,
     NEARBYINT_PPCF128,
+    ROUND_F32,
+    ROUND_F64,
+    ROUND_F80,
+    ROUND_F128,
+    ROUND_PPCF128,
     FLOOR_F32,
     FLOOR_F64,
     FLOOR_F80,
@@ -320,34 +325,65 @@ namespace RTLIB {
     SYNC_VAL_COMPARE_AND_SWAP_2,
     SYNC_VAL_COMPARE_AND_SWAP_4,
     SYNC_VAL_COMPARE_AND_SWAP_8,
+    SYNC_VAL_COMPARE_AND_SWAP_16,
     SYNC_LOCK_TEST_AND_SET_1,
     SYNC_LOCK_TEST_AND_SET_2,
     SYNC_LOCK_TEST_AND_SET_4,
     SYNC_LOCK_TEST_AND_SET_8,
+    SYNC_LOCK_TEST_AND_SET_16,
     SYNC_FETCH_AND_ADD_1,
     SYNC_FETCH_AND_ADD_2,
     SYNC_FETCH_AND_ADD_4,
     SYNC_FETCH_AND_ADD_8,
+    SYNC_FETCH_AND_ADD_16,
     SYNC_FETCH_AND_SUB_1,
     SYNC_FETCH_AND_SUB_2,
     SYNC_FETCH_AND_SUB_4,
     SYNC_FETCH_AND_SUB_8,
+    SYNC_FETCH_AND_SUB_16,
     SYNC_FETCH_AND_AND_1,
     SYNC_FETCH_AND_AND_2,
     SYNC_FETCH_AND_AND_4,
     SYNC_FETCH_AND_AND_8,
+    SYNC_FETCH_AND_AND_16,
     SYNC_FETCH_AND_OR_1,
     SYNC_FETCH_AND_OR_2,
     SYNC_FETCH_AND_OR_4,
     SYNC_FETCH_AND_OR_8,
+    SYNC_FETCH_AND_OR_16,
     SYNC_FETCH_AND_XOR_1,
     SYNC_FETCH_AND_XOR_2,
     SYNC_FETCH_AND_XOR_4,
     SYNC_FETCH_AND_XOR_8,
+    SYNC_FETCH_AND_XOR_16,
     SYNC_FETCH_AND_NAND_1,
     SYNC_FETCH_AND_NAND_2,
     SYNC_FETCH_AND_NAND_4,
     SYNC_FETCH_AND_NAND_8,
+    SYNC_FETCH_AND_NAND_16,
+    SYNC_FETCH_AND_MAX_1,
+    SYNC_FETCH_AND_MAX_2,
+    SYNC_FETCH_AND_MAX_4,
+    SYNC_FETCH_AND_MAX_8,
+    SYNC_FETCH_AND_MAX_16,
+    SYNC_FETCH_AND_UMAX_1,
+    SYNC_FETCH_AND_UMAX_2,
+    SYNC_FETCH_AND_UMAX_4,
+    SYNC_FETCH_AND_UMAX_8,
+    SYNC_FETCH_AND_UMAX_16,
+    SYNC_FETCH_AND_MIN_1,
+    SYNC_FETCH_AND_MIN_2,
+    SYNC_FETCH_AND_MIN_4,
+    SYNC_FETCH_AND_MIN_8,
+    SYNC_FETCH_AND_MIN_16,
+    SYNC_FETCH_AND_UMIN_1,
+    SYNC_FETCH_AND_UMIN_2,
+    SYNC_FETCH_AND_UMIN_4,
+    SYNC_FETCH_AND_UMIN_8,
+    SYNC_FETCH_AND_UMIN_16,
+
+    // Stack Protector Fail.
+    STACKPROTECTOR_CHECK_FAIL,
 
     UNKNOWN_LIBCALL
   };
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index 371ac6c..ccba1b0 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -248,7 +248,7 @@ namespace llvm {
   /// SUnit - Scheduling unit. This is a node in the scheduling DAG.
   class SUnit {
   private:
-    enum { BoundaryID = ~0u };
+    enum LLVM_ENUM_INT_TYPE(unsigned) { BoundaryID = ~0u };
 
     SDNode *Node;                       // Representative node.
     MachineInstr *Instr;                // Alternatively, a MachineInstr.
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 9ab1013..fe4f3c2 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -28,6 +28,7 @@ namespace llvm {
   class MachineDominatorTree;
   class LiveIntervals;
   class RegPressureTracker;
+  class PressureDiffs;
 
   /// An individual mapping from virtual register number to SUnit.
   struct VReg2SUnit {
@@ -56,7 +57,8 @@ namespace llvm {
   /// Use a SparseMultiSet to track physical registers. Storage is only
   /// allocated once for the pass. It can be cleared in constant time and reused
   /// without any frees.
-  typedef SparseMultiSet<PhysRegSUOper, llvm::identity<unsigned>, uint16_t> Reg2SUnitsMap;
+  typedef SparseMultiSet<PhysRegSUOper, llvm::identity<unsigned>, uint16_t>
+  Reg2SUnitsMap;
 
   /// Use SparseSet as a SparseMap by relying on the fact that it never
   /// compares ValueT's, only unsigned keys. This allows the set to be cleared
@@ -64,6 +66,11 @@ namespace llvm {
   /// require a destructor.
   typedef SparseSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2SUnitMap;
 
+  /// Track local uses of virtual registers. These uses are gathered by the DAG
+  /// builder and may be consulted by the scheduler to avoid iterating an entire
+  /// vreg use list.
+  typedef SparseMultiSet<VReg2SUnit, VirtReg2IndexFunctor> VReg2UseMap;
+
   /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of
   /// MachineInstrs.
   class ScheduleDAGInstrs : public ScheduleDAG {
@@ -81,10 +88,6 @@ namespace llvm {
     /// isPostRA flag indicates vregs cannot be present.
     bool IsPostRA;
 
-    /// UnitLatencies (misnamed) flag avoids computing def-use latencies, using
-    /// the def-side latency only.
-    bool UnitLatencies;
-
     /// The standard DAG builder does not normally include terminators as DAG
     /// nodes because it does not create the necessary dependencies to prevent
     /// reordering. A specialized scheduler can overide
@@ -104,17 +107,18 @@ namespace llvm {
     /// The end of the range to be scheduled.
     MachineBasicBlock::iterator RegionEnd;
 
-    /// The index in BB of RegionEnd.
-    ///
-    /// This is the instruction number from the top of the current block, not
-    /// the SlotIndex. It is only used by the AntiDepBreaker and should be
-    /// removed once that client is obsolete.
-    unsigned EndIndex;
+    /// Instructions in this region (distance(RegionBegin, RegionEnd)).
+    unsigned NumRegionInstrs;
 
     /// After calling BuildSchedGraph, each machine instruction in the current
     /// scheduling region is mapped to an SUnit.
     DenseMap<MachineInstr*, SUnit*> MISUnitMap;
 
+    /// After calling BuildSchedGraph, each vreg used in the scheduling region
+    /// is mapped to a set of SUnits. These include all local vreg uses, not
+    /// just the uses for a singly defined vreg.
+    VReg2UseMap VRegUses;
+
     /// State internal to DAG building.
     /// -------------------------------
 
@@ -125,7 +129,7 @@ namespace llvm {
     Reg2SUnitsMap Defs;
     Reg2SUnitsMap Uses;
 
-    /// Track the last instructon in this region defining each virtual register.
+    /// Track the last instruction in this region defining each virtual register.
     VReg2SUnitMap VRegDefs;
 
     /// PendingLoads - Remember where unknown loads are after the most recent
@@ -185,14 +189,15 @@ namespace llvm {
     virtual void enterRegion(MachineBasicBlock *bb,
                              MachineBasicBlock::iterator begin,
                              MachineBasicBlock::iterator end,
-                             unsigned endcount);
+                             unsigned regioninstrs);
 
     /// Notify that the scheduler has finished scheduling the current region.
     virtual void exitRegion();
 
     /// buildSchedGraph - Build SUnits from the MachineBasicBlock that we are
     /// input.
-    void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0);
+    void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0,
+                         PressureDiffs *PDiffs = 0);
 
     /// addSchedBarrierDeps - Add dependencies from instructions in the current
     /// list of instructions being scheduled to scheduling barrier. We want to
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 79e533e..82becca 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -38,6 +38,45 @@ class TargetLowering;
 class TargetSelectionDAGInfo;
 class TargetTransformInfo;
 
+class SDVTListNode : public FoldingSetNode {
+  friend struct FoldingSetTrait<SDVTListNode>;
+  /// FastID - A reference to an Interned FoldingSetNodeID for this node.
+  /// The Allocator in SelectionDAG holds the data.
+  /// SDVTList contains all types which are frequently accessed in SelectionDAG.
+  /// The size of this list is not expected big so it won't introduce memory penalty.
+  FoldingSetNodeIDRef FastID;
+  const EVT *VTs;
+  unsigned int NumVTs;
+  /// The hash value for SDVTList is fixed so cache it to avoid hash calculation
+  unsigned HashValue;
+public:
+  SDVTListNode(const FoldingSetNodeIDRef ID, const EVT *VT, unsigned int Num) :
+      FastID(ID), VTs(VT), NumVTs(Num) {
+    HashValue = ID.ComputeHash();
+  }
+  SDVTList getSDVTList() {
+    SDVTList result = {VTs, NumVTs};
+    return result;
+  }
+};
+
+// Specialize FoldingSetTrait for SDVTListNode
+// To avoid computing temp FoldingSetNodeID and hash value.
+template<> struct FoldingSetTrait<SDVTListNode> : DefaultFoldingSetTrait<SDVTListNode> {
+  static void Profile(const SDVTListNode &X, FoldingSetNodeID& ID) {
+    ID = X.FastID;
+  }
+  static bool Equals(const SDVTListNode &X, const FoldingSetNodeID &ID,
+                     unsigned IDHash, FoldingSetNodeID &TempID) {
+    if (X.HashValue != IDHash)
+      return false;
+    return ID == X.FastID;
+  }
+  static unsigned ComputeHash(const SDVTListNode &X, FoldingSetNodeID &TempID) {
+    return X.HashValue;
+  }
+};
+
 template<> struct ilist_traits<SDNode> : public ilist_default_traits<SDNode> {
 private:
   mutable ilist_half_node<SDNode> Sentinel;
@@ -131,6 +170,7 @@ class SelectionDAG {
   const TargetMachine &TM;
   const TargetSelectionDAGInfo &TSI;
   const TargetTransformInfo *TTI;
+  const TargetLowering *TLI;
   MachineFunction *MF;
   LLVMContext *Context;
   CodeGenOpt::Level OptLevel;
@@ -197,6 +237,13 @@ public:
     virtual void NodeUpdated(SDNode *N);
   };
 
+  /// NewNodesMustHaveLegalTypes - When true, additional steps are taken to
+  /// ensure that getConstant() and similar functions return DAG nodes that
+  /// have legal types. This is important after type legalization since
+  /// any illegally typed nodes generated after this point will not experience
+  /// type legalization.
+  bool NewNodesMustHaveLegalTypes;
+
 private:
   /// DAGUpdateListener is a friend so it can manipulate the listener stack.
   friend struct DAGUpdateListener;
@@ -222,7 +269,8 @@ public:
   /// init - Prepare this SelectionDAG to process code in the given
   /// MachineFunction.
   ///
-  void init(MachineFunction &mf, const TargetTransformInfo *TTI);
+  void init(MachineFunction &mf, const TargetTransformInfo *TTI,
+            const TargetLowering *TLI);
 
   /// clear - Clear state and free memory necessary to make this
   /// SelectionDAG ready to process a new block.
@@ -231,9 +279,7 @@ public:
 
   MachineFunction &getMachineFunction() const { return *MF; }
   const TargetMachine &getTarget() const { return TM; }
-  const TargetLowering &getTargetLoweringInfo() const {
-    return *TM.getTargetLowering();
-  }
+  const TargetLowering &getTargetLoweringInfo() const { return *TLI; }
   const TargetSelectionDAGInfo &getSelectionDAGInfo() const { return TSI; }
   const TargetTransformInfo *getTargetTransformInfo() const { return TTI; }
   LLVMContext *getContext() const {return Context; }
@@ -677,6 +723,13 @@ public:
                     AtomicOrdering Ordering,
                     SynchronizationScope SynchScope);
 
+  /// getAtomic - Gets a node for an atomic op, produces result and chain and
+  /// takes N operands.
+  SDValue getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT, SDVTList VTList,
+                    SDValue* Ops, unsigned NumOps, MachineMemOperand *MMO,
+                    AtomicOrdering Ordering,
+                    SynchronizationScope SynchScope);
+
   /// getMemIntrinsicNode - Creates a MemIntrinsicNode that may produce a
   /// result and takes a list of operands. Opcode may be INTRINSIC_VOID,
   /// INTRINSIC_W_CHAIN, or a target-specific opcode with a value not
@@ -708,11 +761,16 @@ public:
                   MachinePointerInfo PtrInfo, bool isVolatile,
                   bool isNonTemporal, bool isInvariant, unsigned Alignment,
                   const MDNode *TBAAInfo = 0, const MDNode *Ranges = 0);
+  SDValue getLoad(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr,
+                  MachineMemOperand *MMO);
   SDValue getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
                      SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo,
                      EVT MemVT, bool isVolatile,
                      bool isNonTemporal, unsigned Alignment,
                      const MDNode *TBAAInfo = 0);
+  SDValue getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
+                     SDValue Chain, SDValue Ptr, EVT MemVT,
+                     MachineMemOperand *MMO);
   SDValue getIndexedLoad(SDValue OrigLoad, SDLoc dl, SDValue Base,
                          SDValue Offset, ISD::MemIndexedMode AM);
   SDValue getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
@@ -751,6 +809,10 @@ public:
   /// getMDNode - Return an MDNodeSDNode which holds an MDNode.
   SDValue getMDNode(const MDNode *MD);
 
+  /// getAddrSpaceCast - Return an AddrSpaceCastSDNode.
+  SDValue getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr,
+                           unsigned SrcAS, unsigned DestAS);
+
   /// getShiftAmountOperand - Return the specified value casted to
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
@@ -1068,6 +1130,30 @@ public:
   /// it cannot be inferred.
   unsigned InferPtrAlignment(SDValue Ptr) const;
 
+  /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+  /// which is split (or expanded) into two not necessarily identical pieces.
+  std::pair<EVT, EVT> GetSplitDestVTs(const EVT &VT) const;
+
+  /// SplitVector - Split the vector with EXTRACT_SUBVECTOR using the provides
+  /// VTs and return the low/high part.
+  std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL,
+                                          const EVT &LoVT, const EVT &HiVT);
+
+  /// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
+  /// low/high part.
+  std::pair<SDValue, SDValue> SplitVector(const SDValue &N, const SDLoc &DL) {
+    EVT LoVT, HiVT;
+    llvm::tie(LoVT, HiVT) = GetSplitDestVTs(N.getValueType());
+    return SplitVector(N, DL, LoVT, HiVT);
+  }
+
+  /// SplitVectorOperand - Split the node's operand with EXTRACT_SUBVECTOR and
+  /// return the low/high part.
+  std::pair<SDValue, SDValue> SplitVectorOperand(const SDNode *N, unsigned OpNo)
+  {
+    return SplitVector(N->getOperand(OpNo), SDLoc(N));
+  }
+
 private:
   bool RemoveNodeFromCSEMaps(SDNode *N);
   void AddModifiedNodeToCSEMaps(SDNode *N);
@@ -1086,7 +1172,7 @@ private:
   void allnodes_clear();
 
   /// VTList - List of non-single value types.
-  std::vector<SDVTList> VTList;
+  FoldingSet<SDVTListNode> VTListMap;
 
   /// CondCodeNodes - Maps to auto-CSE operations.
   std::vector<CondCodeSDNode*> CondCodeNodes;
diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h
index 3d55d3a..b5ec8cb 100644
--- a/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/include/llvm/CodeGen/SelectionDAGISel.h
@@ -113,6 +113,8 @@ public:
     OPC_MoveChild,
     OPC_MoveParent,
     OPC_CheckSame,
+    OPC_CheckChild0Same, OPC_CheckChild1Same,
+    OPC_CheckChild2Same, OPC_CheckChild3Same,
     OPC_CheckPatternPredicate,
     OPC_CheckPredicate,
     OPC_CheckOpcode,
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 987f290..70c15e6 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -372,7 +372,7 @@ public:
   /// \<target\>ISD namespace).
   bool isTargetOpcode() const { return NodeType >= ISD::BUILTIN_OP_END; }
 
-  /// isTargetMemoryOpcode - Test if this node has a target-specific 
+  /// isTargetMemoryOpcode - Test if this node has a target-specific
   /// memory-referencing opcode (in the \<target\>ISD namespace and
   /// greater than FIRST_TARGET_MEMORY_OPCODE).
   bool isTargetMemoryOpcode() const {
@@ -520,7 +520,9 @@ public:
   /// isPredecessorOf - Return true if this node is a predecessor of N.
   /// NOTE: Implemented on top of hasPredecessor and every bit as
   /// expensive. Use carefully.
-  bool isPredecessorOf(const SDNode *N) const { return N->hasPredecessor(this); }
+  bool isPredecessorOf(const SDNode *N) const {
+    return N->hasPredecessor(this);
+  }
 
   /// hasPredecessor - Return true if N is a predecessor of this node.
   /// N is either an operand of this node, or can be reached by recursively
@@ -901,7 +903,8 @@ inline void SDUse::setNode(SDNode *N) {
 class UnarySDNode : public SDNode {
   SDUse Op;
 public:
-  UnarySDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SDValue X)
+  UnarySDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
+              SDValue X)
     : SDNode(Opc, Order, dl, VTs) {
     InitOperands(&Op, X);
   }
@@ -912,7 +915,8 @@ public:
 class BinarySDNode : public SDNode {
   SDUse Ops[2];
 public:
-  BinarySDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, SDValue X, SDValue Y)
+  BinarySDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
+               SDValue X, SDValue Y)
     : SDNode(Opc, Order, dl, VTs) {
     InitOperands(Ops, X, Y);
   }
@@ -938,13 +942,7 @@ public:
 class HandleSDNode : public SDNode {
   SDUse Op;
 public:
-  // FIXME: Remove the "noinline" attribute once <rdar://problem/5852746> is
-  // fixed.
-#if __GNUC__==4 && __GNUC_MINOR__==2 && defined(__APPLE__) && !defined(__llvm__)
-  explicit __attribute__((__noinline__)) HandleSDNode(SDValue X)
-#else
   explicit HandleSDNode(SDValue X)
-#endif
     : SDNode(ISD::HANDLENODE, 0, DebugLoc(), getSDVTList(MVT::Other)) {
     InitOperands(&Op, X);
   }
@@ -952,6 +950,23 @@ public:
   const SDValue &getValue() const { return Op; }
 };
 
+class AddrSpaceCastSDNode : public UnarySDNode {
+private:
+  unsigned SrcAddrSpace;
+  unsigned DestAddrSpace;
+
+public:
+  AddrSpaceCastSDNode(unsigned Order, DebugLoc dl, EVT VT, SDValue X,
+                      unsigned SrcAS, unsigned DestAS);
+
+  unsigned getSrcAddressSpace() const { return SrcAddrSpace; }
+  unsigned getDestAddressSpace() const { return DestAddrSpace; }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::ADDRSPACECAST;
+  }
+};
+
 /// Abstact virtual class for operations for memory operations
 class MemSDNode : public SDNode {
 private:
@@ -966,14 +981,15 @@ public:
   MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
             EVT MemoryVT, MachineMemOperand *MMO);
 
-  MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs, const SDValue *Ops,
+  MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
+            const SDValue *Ops,
             unsigned NumOps, EVT MemoryVT, MachineMemOperand *MMO);
 
   bool readMem() const { return MMO->isLoad(); }
   bool writeMem() const { return MMO->isStore(); }
 
   /// Returns alignment and volatility of the memory access
-  unsigned getOriginalAlignment() const { 
+  unsigned getOriginalAlignment() const {
     return MMO->getBaseAlignment();
   }
   unsigned getAlignment() const {
@@ -1090,7 +1106,8 @@ public:
   // Swp:    swap value
   // SrcVal: address to update as a Value (used for MemOperand)
   // Align:  alignment of memory
-  AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT,
+  AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL,
+               EVT MemVT,
                SDValue Chain, SDValue Ptr,
                SDValue Cmp, SDValue Swp, MachineMemOperand *MMO,
                AtomicOrdering Ordering, SynchronizationScope SynchScope)
@@ -1098,7 +1115,8 @@ public:
     InitAtomic(Ordering, SynchScope);
     InitOperands(Ops, Chain, Ptr, Cmp, Swp);
   }
-  AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT,
+  AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL,
+               EVT MemVT,
                SDValue Chain, SDValue Ptr,
                SDValue Val, MachineMemOperand *MMO,
                AtomicOrdering Ordering, SynchronizationScope SynchScope)
@@ -1106,7 +1124,8 @@ public:
     InitAtomic(Ordering, SynchScope);
     InitOperands(Ops, Chain, Ptr, Val);
   }
-  AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT,
+  AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL,
+               EVT MemVT,
                SDValue Chain, SDValue Ptr,
                MachineMemOperand *MMO,
                AtomicOrdering Ordering, SynchronizationScope SynchScope)
@@ -1114,6 +1133,16 @@ public:
     InitAtomic(Ordering, SynchScope);
     InitOperands(Ops, Chain, Ptr);
   }
+  AtomicSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTL, EVT MemVT,
+               SDValue* AllOps, SDUse *DynOps, unsigned NumOps,
+               MachineMemOperand *MMO,
+               AtomicOrdering Ordering, SynchronizationScope SynchScope)
+    : MemSDNode(Opc, Order, dl, VTL, MemVT, MMO) {
+    InitAtomic(Ordering, SynchScope);
+    assert((DynOps || NumOps <= array_lengthof(Ops)) &&
+           "Too many ops for internal storage!");
+    InitOperands(DynOps ? DynOps : Ops, AllOps, NumOps);
+  }
 
   const SDValue &getBasePtr() const { return getOperand(1); }
   const SDValue &getVal() const { return getOperand(2); }
@@ -1286,8 +1315,9 @@ class GlobalAddressSDNode : public SDNode {
   int64_t Offset;
   unsigned char TargetFlags;
   friend class SelectionDAG;
-  GlobalAddressSDNode(unsigned Opc, unsigned Order, DebugLoc DL, const GlobalValue *GA, EVT VT,
-                      int64_t o, unsigned char TargetFlags);
+  GlobalAddressSDNode(unsigned Opc, unsigned Order, DebugLoc DL,
+                      const GlobalValue *GA, EVT VT, int64_t o,
+                      unsigned char TargetFlags);
 public:
 
   const GlobalValue *getGlobal() const { return TheGlobal; }
@@ -1351,21 +1381,22 @@ class ConstantPoolSDNode : public SDNode {
   friend class SelectionDAG;
   ConstantPoolSDNode(bool isTarget, const Constant *c, EVT VT, int o,
                      unsigned Align, unsigned char TF)
-    : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0, DebugLoc(),
-             getSDVTList(VT)), Offset(o), Alignment(Align), TargetFlags(TF) {
+    : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0,
+             DebugLoc(), getSDVTList(VT)), Offset(o), Alignment(Align),
+             TargetFlags(TF) {
     assert(Offset >= 0 && "Offset is too large");
     Val.ConstVal = c;
   }
   ConstantPoolSDNode(bool isTarget, MachineConstantPoolValue *v,
                      EVT VT, int o, unsigned Align, unsigned char TF)
-    : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0, DebugLoc(),
-             getSDVTList(VT)), Offset(o), Alignment(Align), TargetFlags(TF) {
+    : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0,
+             DebugLoc(), getSDVTList(VT)), Offset(o), Alignment(Align),
+             TargetFlags(TF) {
     assert(Offset >= 0 && "Offset is too large");
     Val.MachineCPVal = v;
     Offset |= 1 << (sizeof(unsigned)*CHAR_BIT-1);
   }
 public:
-  
 
   bool isMachineConstantPoolEntry() const {
     return Offset < 0;
@@ -1427,8 +1458,8 @@ class BasicBlockSDNode : public SDNode {
   /// blocks out of order when they're jumped to, which makes it a bit
   /// harder.  Let's see if we need it first.
   explicit BasicBlockSDNode(MachineBasicBlock *mbb)
-    : SDNode(ISD::BasicBlock, 0, DebugLoc(), getSDVTList(MVT::Other)), MBB(mbb) {
-  }
+    : SDNode(ISD::BasicBlock, 0, DebugLoc(), getSDVTList(MVT::Other)), MBB(mbb)
+  {}
 public:
 
   MachineBasicBlock *getBasicBlock() const { return MBB; }
@@ -1481,22 +1512,22 @@ public:
     return N->getOpcode() == ISD::SRCVALUE;
   }
 };
-  
+
 class MDNodeSDNode : public SDNode {
   const MDNode *MD;
   friend class SelectionDAG;
   explicit MDNodeSDNode(const MDNode *md)
-  : SDNode(ISD::MDNODE_SDNODE, 0, DebugLoc(), getSDVTList(MVT::Other)), MD(md) {}
+  : SDNode(ISD::MDNODE_SDNODE, 0, DebugLoc(), getSDVTList(MVT::Other)), MD(md)
+  {}
 public:
-  
+
   const MDNode *getMD() const { return MD; }
-  
+
   static bool classof(const SDNode *N) {
     return N->getOpcode() == ISD::MDNODE_SDNODE;
   }
 };
 
-
 class RegisterSDNode : public SDNode {
   unsigned Reg;
   friend class SelectionDAG;
@@ -1568,7 +1599,7 @@ public:
 class ExternalSymbolSDNode : public SDNode {
   const char *Symbol;
   unsigned char TargetFlags;
-  
+
   friend class SelectionDAG;
   ExternalSymbolSDNode(bool isTarget, const char *Sym, unsigned char TF, EVT VT)
     : SDNode(isTarget ? ISD::TargetExternalSymbol : ISD::ExternalSymbol,
@@ -1600,14 +1631,15 @@ public:
     return N->getOpcode() == ISD::CONDCODE;
   }
 };
-  
+
 /// CvtRndSatSDNode - NOTE: avoid using this node as this may disappear in the
 /// future and most targets don't support it.
 class CvtRndSatSDNode : public SDNode {
   ISD::CvtCode CvtCode;
   friend class SelectionDAG;
-  explicit CvtRndSatSDNode(EVT VT, unsigned Order, DebugLoc dl, const SDValue *Ops,
-                           unsigned NumOps, ISD::CvtCode Code)
+  explicit CvtRndSatSDNode(EVT VT, unsigned Order, DebugLoc dl,
+                           const SDValue *Ops, unsigned NumOps,
+                           ISD::CvtCode Code)
     : SDNode(ISD::CONVERT_RNDSAT, Order, dl, getSDVTList(VT), Ops, NumOps),
       CvtCode(Code) {
     assert(NumOps == 5 && "wrong number of operations");
@@ -1649,9 +1681,10 @@ class LSBaseSDNode : public MemSDNode {
    */
   SDUse Ops[4];
 public:
-  LSBaseSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl, SDValue *Operands,
-               unsigned numOperands, SDVTList VTs, ISD::MemIndexedMode AM,
-               EVT MemVT, MachineMemOperand *MMO)
+  LSBaseSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl,
+               SDValue *Operands, unsigned numOperands,
+               SDVTList VTs, ISD::MemIndexedMode AM, EVT MemVT,
+               MachineMemOperand *MMO)
     : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
     SubclassData |= AM << 2;
     assert(getAddressingMode() == AM && "MemIndexedMode encoding error!");
@@ -1840,7 +1873,7 @@ template <> struct GraphTraits<SDNode*> {
 
 /// LargestSDNode - The largest SDNode class.
 ///
-typedef LoadSDNode LargestSDNode;
+typedef AtomicSDNode LargestSDNode;
 
 /// MostAlignedSDNode - The SDNode class with the greatest alignment
 /// requirement.
diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h
new file mode 100644
index 0000000..e90f22e
--- /dev/null
+++ b/include/llvm/CodeGen/StackMaps.h
@@ -0,0 +1,175 @@
+//===------------------- StackMaps.h - StackMaps ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_STACKMAPS
+#define LLVM_STACKMAPS
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+class AsmPrinter;
+class MCExpr;
+
+/// \brief MI-level patchpoint operands.
+///
+/// MI patchpoint operations take the form:
+/// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+///
+/// IR patchpoint intrinsics do not have the <cc> operand because calling
+/// convention is part of the subclass data.
+///
+/// SD patchpoint nodes do not have a def operand because it is part of the
+/// SDValue.
+///
+/// Patchpoints following the anyregcc convention are handled specially. For
+/// these, the stack map also records the location of the return value and
+/// arguments.
+class PatchPointOpers {
+public:
+  /// Enumerate the meta operands.
+  enum { IDPos, NBytesPos, TargetPos, NArgPos, CCPos, MetaEnd };
+private:
+  const MachineInstr *MI;
+  bool HasDef;
+  bool IsAnyReg;
+public:
+  explicit PatchPointOpers(const MachineInstr *MI);
+
+  bool isAnyReg() const { return IsAnyReg; }
+  bool hasDef() const { return HasDef; }
+
+  unsigned getMetaIdx(unsigned Pos = 0) const {
+    assert(Pos < MetaEnd && "Meta operand index out of range.");
+    return (HasDef ? 1 : 0) + Pos;
+  }
+
+  const MachineOperand &getMetaOper(unsigned Pos) {
+    return MI->getOperand(getMetaIdx(Pos));
+  }
+
+  unsigned getArgIdx() const { return getMetaIdx() + MetaEnd; }
+
+  /// Get the operand index of the variable list of non-argument operands.
+  /// These hold the "live state".
+  unsigned getVarIdx() const {
+    return getMetaIdx() + MetaEnd
+      + MI->getOperand(getMetaIdx(NArgPos)).getImm();
+  }
+
+  /// Get the index at which stack map locations will be recorded.
+  /// Arguments are not recorded unless the anyregcc convention is used.
+  unsigned getStackMapStartIdx() const {
+    if (IsAnyReg)
+      return getArgIdx();
+    return getVarIdx();
+  }
+
+  /// \brief Get the next scratch register operand index.
+  unsigned getNextScratchIdx(unsigned StartIdx = 0) const;
+};
+
+class StackMaps {
+public:
+  struct Location {
+    enum LocationType { Unprocessed, Register, Direct, Indirect, Constant,
+                        ConstantIndex };
+    LocationType LocType;
+    unsigned Size;
+    unsigned Reg;
+    int64_t Offset;
+    Location() : LocType(Unprocessed), Size(0), Reg(0), Offset(0) {}
+    Location(LocationType LocType, unsigned Size, unsigned Reg, int64_t Offset)
+      : LocType(LocType), Size(Size), Reg(Reg), Offset(Offset) {}
+  };
+
+  // Typedef a function pointer for functions that parse sequences of operands
+  // and return a Location, plus a new "next" operand iterator.
+  typedef std::pair<Location, MachineInstr::const_mop_iterator>
+    (*OperandParser)(MachineInstr::const_mop_iterator,
+                     MachineInstr::const_mop_iterator, const TargetMachine&);
+
+  // OpTypes are used to encode information about the following logical
+  // operand (which may consist of several MachineOperands) for the
+  // OpParser.
+  typedef enum { DirectMemRefOp, IndirectMemRefOp, ConstantOp } OpType;
+
+  StackMaps(AsmPrinter &AP, OperandParser OpParser)
+    : AP(AP), OpParser(OpParser) {}
+
+  /// \brief Generate a stackmap record for a stackmap instruction.
+  ///
+  /// MI must be a raw STACKMAP, not a PATCHPOINT.
+  void recordStackMap(const MachineInstr &MI);
+
+  /// \brief Generate a stackmap record for a patchpoint instruction.
+  void recordPatchPoint(const MachineInstr &MI);
+
+  /// If there is any stack map data, create a stack map section and serialize
+  /// the map info into it. This clears the stack map data structures
+  /// afterwards.
+  void serializeToStackMapSection();
+
+private:
+  typedef SmallVector<Location, 8> LocationVec;
+
+  struct CallsiteInfo {
+    const MCExpr *CSOffsetExpr;
+    unsigned ID;
+    LocationVec Locations;
+    CallsiteInfo() : CSOffsetExpr(0), ID(0) {}
+    CallsiteInfo(const MCExpr *CSOffsetExpr, unsigned ID,
+                 LocationVec Locations)
+      : CSOffsetExpr(CSOffsetExpr), ID(ID), Locations(Locations) {}
+  };
+
+  typedef std::vector<CallsiteInfo> CallsiteInfoList;
+
+  struct ConstantPool {
+  private:
+    typedef std::map<int64_t, size_t> ConstantsMap;
+    std::vector<int64_t> ConstantsList;
+    ConstantsMap ConstantIndexes;
+
+  public:
+    size_t getNumConstants() const { return ConstantsList.size(); }
+    int64_t getConstant(size_t Idx) const { return ConstantsList[Idx]; }
+    size_t getConstantIndex(int64_t ConstVal) {
+      size_t NextIdx = ConstantsList.size();
+      ConstantsMap::const_iterator I =
+        ConstantIndexes.insert(ConstantIndexes.end(),
+                               std::make_pair(ConstVal, NextIdx));
+      if (I->second == NextIdx)
+        ConstantsList.push_back(ConstVal);
+      return I->second;
+    }
+  };
+
+  AsmPrinter &AP;
+  OperandParser OpParser;
+  CallsiteInfoList CSInfos;
+  ConstantPool ConstPool;
+
+  /// This should be called by the MC lowering code _immediately_ before
+  /// lowering the MI to an MCInst. It records where the operands for the
+  /// instruction are stored, and outputs a label to record the offset of
+  /// the call from the start of the text section. In special cases (e.g. AnyReg
+  /// calling convention) the return register is also recorded if requested.
+  void recordStackMapOpers(const MachineInstr &MI, uint32_t ID,
+                           MachineInstr::const_mop_iterator MOI,
+                           MachineInstr::const_mop_iterator MOE,
+                           bool recordResult = false);
+};
+
+}
+
+#endif // LLVM_STACKMAPS
diff --git a/include/llvm/CodeGen/StackProtector.h b/include/llvm/CodeGen/StackProtector.h
new file mode 100644
index 0000000..d09a933
--- /dev/null
+++ b/include/llvm/CodeGen/StackProtector.h
@@ -0,0 +1,127 @@
+//===-- StackProtector.h - Stack Protector Insertion ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass inserts stack protectors into functions which need them. A variable
+// with a random value in it is stored onto the stack before the local variables
+// are allocated. Upon exiting the block, the stored value is checked. If it's
+// changed, then there was some sort of violation and the program aborts.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_STACKPROTECTOR_H
+#define LLVM_CODEGEN_STACKPROTECTOR_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+class DominatorTree;
+class Function;
+class Module;
+class PHINode;
+
+class StackProtector : public FunctionPass {
+public:
+  /// SSPLayoutKind.  Stack Smashing Protection (SSP) rules require that
+  /// vulnerable stack allocations are located close the stack protector.
+  enum SSPLayoutKind {
+    SSPLK_None,       ///< Did not trigger a stack protector.  No effect on data
+                      ///< layout.
+    SSPLK_LargeArray, ///< Array or nested array >= SSP-buffer-size.  Closest
+                      ///< to the stack protector.
+    SSPLK_SmallArray, ///< Array or nested array < SSP-buffer-size. 2nd closest
+                      ///< to the stack protector.
+    SSPLK_AddrOf      ///< The address of this allocation is exposed and
+                      ///< triggered protection.  3rd closest to the protector.
+  };
+
+  /// A mapping of AllocaInsts to their required SSP layout.
+  typedef ValueMap<const AllocaInst *, SSPLayoutKind> SSPLayoutMap;
+
+private:
+  const TargetMachine *TM;
+
+  /// TLI - Keep a pointer of a TargetLowering to consult for determining
+  /// target type sizes.
+  const TargetLoweringBase *TLI;
+  const Triple Trip;
+
+  Function *F;
+  Module *M;
+
+  DominatorTree *DT;
+
+  /// Layout - Mapping of allocations to the required SSPLayoutKind.
+  /// StackProtector analysis will update this map when determining if an
+  /// AllocaInst triggers a stack protector.
+  SSPLayoutMap Layout;
+
+  /// \brief The minimum size of buffers that will receive stack smashing
+  /// protection when -fstack-protection is used.
+  unsigned SSPBufferSize;
+
+  /// VisitedPHIs - The set of PHI nodes visited when determining
+  /// if a variable's reference has been taken.  This set
+  /// is maintained to ensure we don't visit the same PHI node multiple
+  /// times.
+  SmallPtrSet<const PHINode *, 16> VisitedPHIs;
+
+  /// InsertStackProtectors - Insert code into the prologue and epilogue of
+  /// the function.
+  ///
+  ///  - The prologue code loads and stores the stack guard onto the stack.
+  ///  - The epilogue checks the value stored in the prologue against the
+  ///    original value. It calls __stack_chk_fail if they differ.
+  bool InsertStackProtectors();
+
+  /// CreateFailBB - Create a basic block to jump to when the stack protector
+  /// check fails.
+  BasicBlock *CreateFailBB();
+
+  /// ContainsProtectableArray - Check whether the type either is an array or
+  /// contains an array of sufficient size so that we need stack protectors
+  /// for it.
+  /// \param [out] IsLarge is set to true if a protectable array is found and
+  /// it is "large" ( >= ssp-buffer-size).  In the case of a structure with
+  /// multiple arrays, this gets set if any of them is large.
+  bool ContainsProtectableArray(Type *Ty, bool &IsLarge, bool Strong = false,
+                                bool InStruct = false) const;
+
+  /// \brief Check whether a stack allocation has its address taken.
+  bool HasAddressTaken(const Instruction *AI);
+
+  /// RequiresStackProtector - Check whether or not this function needs a
+  /// stack protector based upon the stack protector level.
+  bool RequiresStackProtector();
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  StackProtector() : FunctionPass(ID), TM(0), TLI(0), SSPBufferSize(0) {
+    initializeStackProtectorPass(*PassRegistry::getPassRegistry());
+  }
+  StackProtector(const TargetMachine *TM)
+      : FunctionPass(ID), TM(TM), TLI(0), Trip(TM->getTargetTriple()),
+        SSPBufferSize(8) {
+    initializeStackProtectorPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addPreserved<DominatorTree>();
+  }
+
+  SSPLayoutKind getSSPLayout(const AllocaInst *AI) const;
+
+  virtual bool runOnFunction(Function &Fn);
+};
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_STACKPROTECTOR_H
diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h
index f2adcf8..8ef26b7 100644
--- a/include/llvm/CodeGen/TargetSchedule.h
+++ b/include/llvm/CodeGen/TargetSchedule.h
@@ -152,7 +152,13 @@ public:
   /// Compute and return the expected latency of this instruction independent of
   /// a particular use. computeOperandLatency is the prefered API, but this is
   /// occasionally useful to help estimate instruction cost.
-  unsigned computeInstrLatency(const MachineInstr *MI) const;
+  ///
+  /// If UseDefaultDefLatency is false and no new machine sched model is
+  /// present this method falls back to TII->getInstrLatency with an empty
+  /// instruction itinerary (this is so we preserve the previous behavior of the
+  /// if converter after moving it to TargetSchedModel).
+  unsigned computeInstrLatency(const MachineInstr *MI,
+                               bool UseDefaultDefLatency = true) const;
 
   /// \brief Output dependency latency of a pair of defs of the same register.
   ///
diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h
index b7b3d73..79f3233 100644
--- a/include/llvm/CodeGen/ValueTypes.h
+++ b/include/llvm/CodeGen/ValueTypes.h
@@ -27,9 +27,9 @@ namespace llvm {
   class LLVMContext;
   struct EVT;
 
-  /// MVT - Machine Value Type.  Every type that is supported natively by some
-  /// processor targeted by LLVM occurs here.  This means that any legal value
-  /// type can be represented by a MVT.
+  /// MVT - Machine Value Type. Every type that is supported natively by some
+  /// processor targeted by LLVM occurs here. This means that any legal value
+  /// type can be represented by an MVT.
   class MVT {
   public:
     enum SimpleValueType {
@@ -67,40 +67,45 @@ namespace llvm {
       v32i1          =  17,   // 32 x i1
       v64i1          =  18,   // 64 x i1
 
-      v2i8           =  19,   //  2 x i8
-      v4i8           =  20,   //  4 x i8
-      v8i8           =  21,   //  8 x i8
-      v16i8          =  22,   // 16 x i8
-      v32i8          =  23,   // 32 x i8
-      v64i8          =  24,   // 64 x i8
-      v1i16          =  25,   //  1 x i16
-      v2i16          =  26,   //  2 x i16
-      v4i16          =  27,   //  4 x i16
-      v8i16          =  28,   //  8 x i16
-      v16i16         =  29,   // 16 x i16
-      v32i16         =  30,   // 32 x i16
-      v1i32          =  31,   //  1 x i32
-      v2i32          =  32,   //  2 x i32
-      v4i32          =  33,   //  4 x i32
-      v8i32          =  34,   //  8 x i32
-      v16i32         =  35,   // 16 x i32
-      v1i64          =  36,   //  1 x i64
-      v2i64          =  37,   //  2 x i64
-      v4i64          =  38,   //  4 x i64
-      v8i64          =  39,   //  8 x i64
-      v16i64         =  40,   // 16 x i64
+      v1i8           =  19,   //  1 x i8
+      v2i8           =  20,   //  2 x i8
+      v4i8           =  21,   //  4 x i8
+      v8i8           =  22,   //  8 x i8
+      v16i8          =  23,   // 16 x i8
+      v32i8          =  24,   // 32 x i8
+      v64i8          =  25,   // 64 x i8
+      v1i16          =  26,   //  1 x i16
+      v2i16          =  27,   //  2 x i16
+      v4i16          =  28,   //  4 x i16
+      v8i16          =  29,   //  8 x i16
+      v16i16         =  30,   // 16 x i16
+      v32i16         =  31,   // 32 x i16
+      v1i32          =  32,   //  1 x i32
+      v2i32          =  33,   //  2 x i32
+      v4i32          =  34,   //  4 x i32
+      v8i32          =  35,   //  8 x i32
+      v16i32         =  36,   // 16 x i32
+      v1i64          =  37,   //  1 x i64
+      v2i64          =  38,   //  2 x i64
+      v4i64          =  39,   //  4 x i64
+      v8i64          =  40,   //  8 x i64
+      v16i64         =  41,   // 16 x i64
 
       FIRST_INTEGER_VECTOR_VALUETYPE = v2i1,
       LAST_INTEGER_VECTOR_VALUETYPE = v16i64,
 
-      v2f16          =  41,   //  2 x f16
-      v2f32          =  42,   //  2 x f32
-      v4f32          =  43,   //  4 x f32
-      v8f32          =  44,   //  8 x f32
-      v16f32         =  45,   // 16 x f32
-      v2f64          =  46,   //  2 x f64
-      v4f64          =  47,   //  4 x f64
-      v8f64          =  48,   //  8 x f64
+      v2f16          =  42,   //  2 x f16
+      v4f16          =  43,   //  4 x f16
+      v8f16          =  44,   //  8 x f16
+      v1f32          =  45,   //  1 x f32
+      v2f32          =  46,   //  2 x f32
+      v4f32          =  47,   //  4 x f32
+      v8f32          =  48,   //  8 x f32
+      v16f32         =  49,   // 16 x f32
+      v1f64          =  50,   //  1 x f64
+      v2f64          =  51,   //  2 x f64
+      v4f64          =  52,   //  4 x f64
+      v8f64          =  53,   //  8 x f64
 
       FIRST_FP_VECTOR_VALUETYPE = v2f16,
       LAST_FP_VECTOR_VALUETYPE = v8f64,
@@ -108,17 +113,17 @@ namespace llvm {
       FIRST_VECTOR_VALUETYPE = v2i1,
       LAST_VECTOR_VALUETYPE  = v8f64,
 
-      x86mmx         =  49,   // This is an X86 MMX value
+      x86mmx         =  54,   // This is an X86 MMX value
 
-      Glue           =  50,   // This glues nodes together during pre-RA sched
+      Glue           =  55,   // This glues nodes together during pre-RA sched
 
-      isVoid         =  51,   // This has no value
+      isVoid         =  56,   // This has no value
 
-      Untyped        =  52,   // This value takes a register, but has
+      Untyped        =  57,   // This value takes a register, but has
                               // unspecified type.  The register class
                               // will be determined by the opcode.
 
-      LAST_VALUETYPE =  53,   // This always remains at the end of the list.
+      LAST_VALUETYPE =  58,   // This always remains at the end of the list.
 
       // This is the current maximum for LAST_VALUETYPE.
       // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
@@ -203,7 +208,7 @@ namespace llvm {
     bool is64BitVector() const {
       return (SimpleTy == MVT::v8i8  || SimpleTy == MVT::v4i16 ||
               SimpleTy == MVT::v2i32 || SimpleTy == MVT::v1i64 ||
-              SimpleTy == MVT::v2f32);
+              SimpleTy == MVT::v1f64 || SimpleTy == MVT::v2f32);
     }
 
     /// is128BitVector - Return true if this is a 128-bit vector type.
@@ -265,6 +270,7 @@ namespace llvm {
       case v16i1 :
       case v32i1 :
       case v64i1: return i1;
+      case v1i8 :
       case v2i8 :
       case v4i8 :
       case v8i8 :
@@ -287,11 +293,15 @@ namespace llvm {
       case v4i64:
       case v8i64:
       case v16i64: return i64;
-      case v2f16: return f16;
+      case v2f16:
+      case v4f16:
+      case v8f16: return f16;
+      case v1f32:
       case v2f32:
       case v4f32:
       case v8f32:
       case v16f32: return f32;
+      case v1f64:
       case v2f64:
       case v4f64:
       case v8f64: return f64;
@@ -318,6 +328,7 @@ namespace llvm {
       case v8i16:
       case v8i32:
       case v8i64:
+      case v8f16:
       case v8f32:
       case v8f64: return 8;
       case v4i1:
@@ -325,6 +336,7 @@ namespace llvm {
       case v4i16:
       case v4i32:
       case v4i64:
+      case v4f16:
       case v4f32:
       case v4f64: return 4;
       case v2i1:
@@ -335,9 +347,12 @@ namespace llvm {
       case v2f16:
       case v2f32:
       case v2f64: return 2;
+      case v1i8:
       case v1i16:
       case v1i32:
-      case v1i64: return 1;
+      case v1i64:
+      case v1f32:
+      case v1f64: return 1;
       }
     }
 
@@ -360,6 +375,7 @@ namespace llvm {
       case v2i1:  return 2;
       case v4i1:  return 4;
       case i8  :
+      case v1i8:
       case v8i1: return 8;
       case i16 :
       case f16:
@@ -372,6 +388,7 @@ namespace llvm {
       case v4i8:
       case v2i16:
       case v2f16:
+      case v1f32:
       case v1i32: return 32;
       case x86mmx:
       case f64 :
@@ -381,7 +398,9 @@ namespace llvm {
       case v4i16:
       case v2i32:
       case v1i64:
-      case v2f32: return 64;
+      case v4f16:
+      case v2f32:
+      case v1f64: return 64;
       case f80 :  return 80;
       case f128:
       case ppcf128:
@@ -390,6 +409,7 @@ namespace llvm {
       case v8i16:
       case v4i32:
       case v2i64:
+      case v8f16:
       case v4f32:
       case v2f64: return 128;
       case v32i8:
@@ -490,6 +510,7 @@ namespace llvm {
         if (NumElements == 64) return MVT::v64i1;
         break;
       case MVT::i8:
+        if (NumElements == 1)  return MVT::v1i8;
         if (NumElements == 2)  return MVT::v2i8;
         if (NumElements == 4)  return MVT::v4i8;
         if (NumElements == 8)  return MVT::v8i8;
@@ -521,14 +542,18 @@ namespace llvm {
         break;
       case MVT::f16:
         if (NumElements == 2)  return MVT::v2f16;
+        if (NumElements == 4)  return MVT::v4f16;
+        if (NumElements == 8)  return MVT::v8f16;
         break;
       case MVT::f32:
+        if (NumElements == 1)  return MVT::v1f32;
         if (NumElements == 2)  return MVT::v2f32;
         if (NumElements == 4)  return MVT::v4f32;
         if (NumElements == 8)  return MVT::v8f32;
         if (NumElements == 16) return MVT::v16f32;
         break;
       case MVT::f64:
+        if (NumElements == 1)  return MVT::v1f64;
         if (NumElements == 2)  return MVT::v2f64;
         if (NumElements == 4)  return MVT::v4f64;
         if (NumElements == 8)  return MVT::v8f64;
diff --git a/include/llvm/CodeGen/ValueTypes.td b/include/llvm/CodeGen/ValueTypes.td
index da26985..b5fa0e8 100644
--- a/include/llvm/CodeGen/ValueTypes.td
+++ b/include/llvm/CodeGen/ValueTypes.td
@@ -26,7 +26,7 @@ def i16    : ValueType<16 ,  3>;   // 16-bit integer value
 def i32    : ValueType<32 ,  4>;   // 32-bit integer value
 def i64    : ValueType<64 ,  5>;   // 64-bit integer value
 def i128   : ValueType<128,  6>;   // 128-bit integer value
-def f16    : ValueType<16 ,  7>;   // 32-bit floating point value
+def f16    : ValueType<16 ,  7>;   // 16-bit floating point value
 def f32    : ValueType<32 ,  8>;   // 32-bit floating point value
 def f64    : ValueType<64 ,  9>;   // 64-bit floating point value
 def f80    : ValueType<80 , 10>;   // 80-bit floating point value
@@ -39,43 +39,48 @@ def v8i1   : ValueType<8 ,  15>;   //  8 x i1  vector value
 def v16i1  : ValueType<16,  16>;   // 16 x i1  vector value
 def v32i1  : ValueType<32 , 17>;   // 32 x i1  vector value
 def v64i1  : ValueType<64 , 18>;   // 64 x i1  vector value
-def v2i8   : ValueType<16 , 19>;   //  2 x i8  vector value
-def v4i8   : ValueType<32 , 20>;   //  4 x i8  vector value
-def v8i8   : ValueType<64 , 21>;   //  8 x i8  vector value
-def v16i8  : ValueType<128, 22>;   // 16 x i8  vector value
-def v32i8  : ValueType<256, 23>;   // 32 x i8 vector value
-def v64i8  : ValueType<512, 24>;   // 64 x i8 vector value
-def v1i16  : ValueType<16 , 25>;   //  1 x i16 vector value
-def v2i16  : ValueType<32 , 26>;   //  2 x i16 vector value
-def v4i16  : ValueType<64 , 27>;   //  4 x i16 vector value
-def v8i16  : ValueType<128, 28>;   //  8 x i16 vector value
-def v16i16 : ValueType<256, 29>;   // 16 x i16 vector value
-def v32i16 : ValueType<512, 30>;   // 32 x i16 vector value
-def v1i32  : ValueType<32 , 31>;   //  1 x i32 vector value
-def v2i32  : ValueType<64 , 32>;   //  2 x i32 vector value
-def v4i32  : ValueType<128, 33>;   //  4 x i32 vector value
-def v8i32  : ValueType<256, 34>;   //  8 x i32 vector value
-def v16i32 : ValueType<512, 35>;   // 16 x i32 vector value
-def v1i64  : ValueType<64 , 36>;   //  1 x i64 vector value
-def v2i64  : ValueType<128, 37>;   //  2 x i64 vector value
-def v4i64  : ValueType<256, 38>;   //  4 x i64 vector value
-def v8i64  : ValueType<512, 39>;   //  8 x i64 vector value
-def v16i64 : ValueType<1024,40>;   // 16 x i64 vector value
+def v1i8   : ValueType<16, 19>;    //  1 x i8  vector value
+def v2i8   : ValueType<16 , 20>;   //  2 x i8  vector value
+def v4i8   : ValueType<32 , 21>;   //  4 x i8  vector value
+def v8i8   : ValueType<64 , 22>;   //  8 x i8  vector value
+def v16i8  : ValueType<128, 23>;   // 16 x i8  vector value
+def v32i8  : ValueType<256, 24>;   // 32 x i8 vector value
+def v64i8  : ValueType<512, 25>;   // 64 x i8 vector value
+def v1i16  : ValueType<16 , 26>;   //  1 x i16 vector value
+def v2i16  : ValueType<32 , 27>;   //  2 x i16 vector value
+def v4i16  : ValueType<64 , 28>;   //  4 x i16 vector value
+def v8i16  : ValueType<128, 29>;   //  8 x i16 vector value
+def v16i16 : ValueType<256, 30>;   // 16 x i16 vector value
+def v32i16 : ValueType<512, 31>;   // 32 x i16 vector value
+def v1i32  : ValueType<32 , 32>;   //  1 x i32 vector value
+def v2i32  : ValueType<64 , 33>;   //  2 x i32 vector value
+def v4i32  : ValueType<128, 34>;   //  4 x i32 vector value
+def v8i32  : ValueType<256, 35>;   //  8 x i32 vector value
+def v16i32 : ValueType<512, 36>;   // 16 x i32 vector value
+def v1i64  : ValueType<64 , 37>;   //  1 x i64 vector value
+def v2i64  : ValueType<128, 38>;   //  2 x i64 vector value
+def v4i64  : ValueType<256, 39>;   //  4 x i64 vector value
+def v8i64  : ValueType<512, 40>;   //  8 x i64 vector value
+def v16i64 : ValueType<1024,41>;   // 16 x i64 vector value
 
-def v2f16  : ValueType<32 , 41>;   //  2 x f16 vector value
-def v2f32  : ValueType<64 , 42>;   //  2 x f32 vector value
-def v4f32  : ValueType<128, 43>;   //  4 x f32 vector value
-def v8f32  : ValueType<256, 44>;   //  8 x f32 vector value
-def v16f32 : ValueType<512, 45>;   // 16 x f32 vector value
-def v2f64  : ValueType<128, 46>;   //  2 x f64 vector value
-def v4f64  : ValueType<256, 47>;   //  4 x f64 vector value
-def v8f64  : ValueType<512, 48>;   //  8 x f64 vector value
+def v2f16  : ValueType<32 , 42>;   //  2 x f16 vector value
+def v4f16  : ValueType<64 , 43>;   //  4 x f16 vector value
+def v8f16  : ValueType<128, 44>;   //  8 x f16 vector value
+def v1f32  : ValueType<32 , 45>;   //  1 x f32 vector value
+def v2f32  : ValueType<64 , 46>;   //  2 x f32 vector value
+def v4f32  : ValueType<128, 47>;   //  4 x f32 vector value
+def v8f32  : ValueType<256, 48>;   //  8 x f32 vector value
+def v16f32 : ValueType<512, 49>;   // 16 x f32 vector value
+def v1f64  : ValueType<64, 50>;    //  1 x f64 vector value
+def v2f64  : ValueType<128, 51>;   //  2 x f64 vector value
+def v4f64  : ValueType<256, 52>;   //  4 x f64 vector value
+def v8f64  : ValueType<512, 53>;   //  8 x f64 vector value
 
 
-def x86mmx : ValueType<64 , 49>;   // X86 MMX value
-def FlagVT : ValueType<0  , 50>;   // Pre-RA sched glue
-def isVoid : ValueType<0  , 51>;   // Produces no value
-def untyped: ValueType<8  , 52>;   // Produces an untyped value
+def x86mmx : ValueType<64 , 54>;   // X86 MMX value
+def FlagVT : ValueType<0  , 55>;   // Pre-RA sched glue
+def isVoid : ValueType<0  , 56>;   // Produces no value
+def untyped: ValueType<8  , 57>;   // Produces an untyped value
 def MetadataVT: ValueType<0, 250>; // Metadata
 
 // Pseudo valuetype mapped to the current pointer size to any address space.
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index 4b0aa4b..5c72ad8 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -21,6 +21,9 @@
 /* Define if you want backtraces on crash */
 #cmakedefine ENABLE_BACKTRACES
 
+/* Define to enable crash overrides */
+#cmakedefine ENABLE_CRASH_OVERRIDES
+
 /* Define if position independent code is enabled */
 #cmakedefine ENABLE_PIC
 
@@ -48,12 +51,6 @@
 /* Define to 1 if you have the `closedir' function. */
 #cmakedefine HAVE_CLOSEDIR ${HAVE_CLOSEDIR}
 
-/* Define if curses provides the has_color() function on this platform. */
-#cmakedefine HAVE_CURSES
-
-/* Define to 1 if you have the <curses.h> header file. */
-#cmakedefine HAVE_CURSES_H
-
 /* Define to 1 if you have the <cxxabi.h> header file. */
 #cmakedefine HAVE_CXXABI_H ${HAVE_CXXABI_H}
 
@@ -132,13 +129,13 @@
 #cmakedefine HAVE_LOG10 ${HAVE_LOG10}
 
 /* Define to 1 if you have the `exp' function. */
-#cmakedefine HAVE_EXP ${HAVE_LOG}
+#cmakedefine HAVE_EXP ${HAVE_EXP}
 
 /* Define to 1 if you have the `exp2' function. */
-#cmakedefine HAVE_EXP2 ${HAVE_LOG2}
+#cmakedefine HAVE_EXP2 ${HAVE_EXP2}
 
 /* Define to 1 if you have the `exp10' function. */
-#cmakedefine HAVE_EXP10 ${HAVE_LOG10}
+#cmakedefine HAVE_EXP10 ${HAVE_EXP10}
 
 /* Define to 1 if you have the `fmodf' function. */
 #cmakedefine HAVE_FMODF ${HAVE_FMODF}
@@ -206,6 +203,9 @@
 /* Define to 1 if you have the `pthread' library (-lpthread). */
 #cmakedefine HAVE_LIBPTHREAD ${HAVE_LIBPTHREAD}
 
+/* Define to 1 if you have the `shell32' library (-lshell32). */
+#cmakedefine HAVE_LIBSHELL32 ${HAVE_LIBSHELL32}
+
 /* Define to 1 if you have the `udis86' library (-ludis86). */
 #undef HAVE_LIBUDIS86
 
@@ -262,18 +262,6 @@
 /* Define if mmap() can map files into memory */
 #undef HAVE_MMAP_FILE
 
-/* Define to 1 if you have the <ncursesw/curses.h> header file. */
-#cmakedefine HAVE_NCURSESW_CURSES_H
-
-/* Define to 1 if you have the <ncursesw.h> header file. */
-#cmakedefine HAVE_NCURSESW_H
-
-/* Define to 1 if you have the <ncurses/curses.h> header file. */
-#cmakedefine HAVE_NCURSES_CURSES_H
-
-/* Define to 1 if you have the <ncurses.h> header file. */
-#cmakedefine HAVE_NCURSES_H
-
 /* Define to 1 if you have the <ndir.h> header file, and it defines `DIR'. */
 #cmakedefine HAVE_NDIR_H ${HAVE_NDIR_H}
 
@@ -320,7 +308,7 @@
 #cmakedefine HAVE_READDIR ${HAVE_READDIR}
 
 /* Define to 1 if you have the `realpath' function. */
-#undef HAVE_REALPATH
+#cmakedefine HAVE_REALPATH ${HAVE_REALPATH}
 
 /* Define to 1 if you have the `rintf' function. */
 #undef HAVE_RINTF
@@ -420,6 +408,9 @@
 /* Define to 1 if you have <sys/wait.h> that is POSIX.1 compatible. */
 #cmakedefine HAVE_SYS_WAIT_H ${HAVE_SYS_WAIT_H}
 
+/* Define if the setupterm() function is supported this platform. */
+#cmakedefine HAVE_TERMINFO ${HAVE_TERMINFO}
+
 /* Define to 1 if you have the <termios.h> header file. */
 #cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H}
 
diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in
index 7bb1caa..dcec8f8 100644
--- a/include/llvm/Config/config.h.in
+++ b/include/llvm/Config/config.h.in
@@ -3,6 +3,9 @@
 #ifndef CONFIG_H
 #define CONFIG_H
 
+/* Define if building universal (internal helper macro) */
+#undef AC_APPLE_UNIVERSAL_BUILD
+
 /* Bug report URL. */
 #undef BUG_REPORT_URL
 
@@ -21,6 +24,9 @@
 /* Define if you want backtraces on crash */
 #undef ENABLE_BACKTRACES
 
+/* Define to enable crash handling overrides */
+#undef ENABLE_CRASH_OVERRIDES
+
 /* Define if position independent code is enabled */
 #undef ENABLE_PIC
 
@@ -69,12 +75,6 @@
 /* can use __crashreporter_info__ */
 #undef HAVE_CRASHREPORTER_INFO
 
-/* Define if curses provides the has_color() function on this platform. */
-#undef HAVE_CURSES
-
-/* Define to 1 if you have the <curses.h> header file. */
-#undef HAVE_CURSES_H
-
 /* Define to 1 if you have the <cxxabi.h> header file. */
 #undef HAVE_CXXABI_H
 
@@ -220,6 +220,9 @@
 /* Define to 1 if you have the `pthread' library (-lpthread). */
 #undef HAVE_LIBPTHREAD
 
+/* Define to 1 if you have the `shell32' library (-lshell32). */
+#undef HAVE_LIBSHELL32
+
 /* Define to 1 if you have the `udis86' library (-ludis86). */
 #undef HAVE_LIBUDIS86
 
@@ -288,18 +291,6 @@
 /* Define if mmap() can map files into memory */
 #undef HAVE_MMAP_FILE
 
-/* Define to 1 if you have the <ncursesw/curses.h> header file. */
-#undef HAVE_NCURSESW_CURSES_H
-
-/* Define to 1 if you have the <ncursesw.h> header file. */
-#undef HAVE_NCURSESW_H
-
-/* Define to 1 if you have the <ncurses/curses.h> header file. */
-#undef HAVE_NCURSES_CURSES_H
-
-/* Define to 1 if you have the <ncurses.h> header file. */
-#undef HAVE_NCURSES_H
-
 /* Define to 1 if you have the <ndir.h> header file, and it defines `DIR'. */
 #undef HAVE_NDIR_H
 
@@ -455,6 +446,9 @@
 /* Define to 1 if you have <sys/wait.h> that is POSIX.1 compatible. */
 #undef HAVE_SYS_WAIT_H
 
+/* Define if the setupterm() function is supported this platform. */
+#undef HAVE_TERMINFO
+
 /* Define to 1 if you have the <termios.h> header file. */
 #undef HAVE_TERMIOS_H
 
@@ -679,6 +673,9 @@
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
 
+/* Define to the home page for this package. */
+#undef PACKAGE_URL
+
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION
 
@@ -703,6 +700,18 @@
 /* Type of 1st arg on ELM Callback */
 #undef WIN32_ELMCB_PCSTR
 
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+   significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+#  define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+#  undef WORDS_BIGENDIAN
+# endif
+#endif
+
 /* Define to empty if `const' does not conform to ANSI C. */
 #undef const
 
diff --git a/include/llvm/DIBuilder.h b/include/llvm/DIBuilder.h
index a15d619..bac1679 100644
--- a/include/llvm/DIBuilder.h
+++ b/include/llvm/DIBuilder.h
@@ -17,7 +17,9 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo.h"
 #include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ValueHandle.h"
 
 namespace llvm {
   class BasicBlock;
@@ -65,7 +67,9 @@ namespace llvm {
     Function *ValueFn;       // llvm.dbg.value
 
     SmallVector<Value *, 4> AllEnumTypes;
-    SmallVector<Value *, 4> AllRetainTypes;
+    /// Use TrackingVH to collect RetainTypes, since they can be updated
+    /// later on.
+    SmallVector<TrackingVH<MDNode>, 4> AllRetainTypes;
     SmallVector<Value *, 4> AllSubprograms;
     SmallVector<Value *, 4> AllGVs;
     SmallVector<Value *, 4> AllImportedModules;
@@ -279,13 +283,15 @@ namespace llvm {
     ///                     DW_AT_containing_type. See DWARF documentation
     ///                     for more info.
     /// @param TemplateParms Template type parameters.
+    /// @param UniqueIdentifier A unique identifier for the class.
     DICompositeType createClassType(DIDescriptor Scope, StringRef Name,
                                     DIFile File, unsigned LineNumber,
                                     uint64_t SizeInBits, uint64_t AlignInBits,
                                     uint64_t OffsetInBits, unsigned Flags,
                                     DIType DerivedFrom, DIArray Elements,
-                                    MDNode *VTableHolder = 0,
-                                    MDNode *TemplateParms = 0);
+                                    DIType VTableHolder = DIType(),
+                                    MDNode *TemplateParms = 0,
+                                    StringRef UniqueIdentifier = StringRef());
 
     /// createStructType - Create debugging information entry for a struct.
     /// @param Scope        Scope in which this struct is defined.
@@ -297,12 +303,14 @@ namespace llvm {
     /// @param Flags        Flags to encode member attribute, e.g. private
     /// @param Elements     Struct elements.
     /// @param RunTimeLang  Optional parameter, Objective-C runtime version.
+    /// @param UniqueIdentifier A unique identifier for the struct.
     DICompositeType createStructType(DIDescriptor Scope, StringRef Name,
                                      DIFile File, unsigned LineNumber,
                                      uint64_t SizeInBits, uint64_t AlignInBits,
                                      unsigned Flags, DIType DerivedFrom,
                                      DIArray Elements, unsigned RunTimeLang = 0,
-                                     MDNode *VTableHolder = 0);
+                                     DIType VTableHolder = DIType(),
+                                     StringRef UniqueIdentifier = StringRef());
 
     /// createUnionType - Create debugging information entry for an union.
     /// @param Scope        Scope in which this union is defined.
@@ -314,10 +322,12 @@ namespace llvm {
     /// @param Flags        Flags to encode member attribute, e.g. private
     /// @param Elements     Union elements.
     /// @param RunTimeLang  Optional parameter, Objective-C runtime version.
+    /// @param UniqueIdentifier A unique identifier for the union.
     DICompositeType createUnionType(
         DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber,
         uint64_t SizeInBits, uint64_t AlignInBits, unsigned Flags,
-        DIArray Elements, unsigned RunTimeLang = 0);
+        DIArray Elements, unsigned RunTimeLang = 0,
+        StringRef UniqueIdentifier = StringRef());
 
     /// createTemplateTypeParameter - Create debugging information for template
     /// type parameter.
@@ -398,12 +408,11 @@ namespace llvm {
     /// @param AlignInBits    Member alignment.
     /// @param Elements       Enumeration elements.
     /// @param UnderlyingType Underlying type of a C++11/ObjC fixed enum.
+    /// @param UniqueIdentifier A unique identifier for the enum.
     DICompositeType createEnumerationType(DIDescriptor Scope, StringRef Name,
-                                          DIFile File, unsigned LineNumber,
-                                          uint64_t SizeInBits,
-                                          uint64_t AlignInBits,
-                                          DIArray Elements,
-                                          DIType UnderlyingType);
+        DIFile File, unsigned LineNumber, uint64_t SizeInBits,
+        uint64_t AlignInBits, DIArray Elements, DIType UnderlyingType,
+        StringRef UniqueIdentifier = StringRef());
 
     /// createSubroutineType - Create subroutine type.
     /// @param File           File in which this subroutine is defined.
@@ -419,9 +428,12 @@ namespace llvm {
     DIType createObjectPointerType(DIType Ty);
 
     /// createForwardDecl - Create a temporary forward-declared type.
-    DIType createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope,
-                             DIFile F, unsigned Line, unsigned RuntimeLang = 0,
-                             uint64_t SizeInBits = 0, uint64_t AlignInBits = 0);
+    DICompositeType createForwardDecl(unsigned Tag, StringRef Name,
+                                      DIDescriptor Scope, DIFile F,
+                                      unsigned Line, unsigned RuntimeLang = 0,
+                                      uint64_t SizeInBits = 0,
+                                      uint64_t AlignInBits = 0,
+                                      StringRef UniqueIdentifier = StringRef());
 
     /// retainType - Retain DIType in a module even if it is not referenced
     /// through debug info anchors.
@@ -495,7 +507,7 @@ namespace llvm {
     /// @param AlwaysPreserve Boolean. Set to true if debug info for this
     ///                       variable should be preserved in optimized build.
     /// @param Flags          Flags, e.g. artificial variable.
-    /// @param ArgNo       If this variable is an arugment then this argument's
+    /// @param ArgNo       If this variable is an argument then this argument's
     ///                    number. 1 indicates 1st argument.
     DIVariable createLocalVariable(unsigned Tag, DIDescriptor Scope,
                                    StringRef Name,
@@ -515,7 +527,7 @@ namespace llvm {
     /// @param LineNo      Line number.
     /// @param Ty          Variable Type
     /// @param Addr        An array of complex address operations.
-    /// @param ArgNo       If this variable is an arugment then this argument's
+    /// @param ArgNo       If this variable is an argument then this argument's
     ///                    number. 1 indicates 1st argument.
     DIVariable createComplexVariable(unsigned Tag, DIDescriptor Scope,
                                      StringRef Name, DIFile F, unsigned LineNo,
@@ -550,6 +562,20 @@ namespace llvm {
                                 MDNode *TParam = 0,
                                 MDNode *Decl = 0);
 
+    /// FIXME: this is added for dragonegg. Once we update dragonegg
+    /// to call resolve function, this will be removed.
+    DISubprogram createFunction(DIScopeRef Scope, StringRef Name,
+                                StringRef LinkageName,
+                                DIFile File, unsigned LineNo,
+                                DICompositeType Ty, bool isLocalToUnit,
+                                bool isDefinition,
+                                unsigned ScopeLine,
+                                unsigned Flags = 0,
+                                bool isOptimized = false,
+                                Function *Fn = 0,
+                                MDNode *TParam = 0,
+                                MDNode *Decl = 0);
+
     /// createMethod - Create a new descriptor for the specified C++ method.
     /// See comments in DISubprogram for descriptions of these fields.
     /// @param Scope         Function scope.
@@ -575,7 +601,7 @@ namespace llvm {
                               DICompositeType Ty, bool isLocalToUnit,
                               bool isDefinition,
                               unsigned Virtuality = 0, unsigned VTableIndex = 0,
-                              MDNode *VTableHolder = 0,
+                              DIType VTableHolder = DIType(),
                               unsigned Flags = 0,
                               bool isOptimized = false,
                               Function *Fn = 0,
diff --git a/include/llvm/DebugInfo.h b/include/llvm/DebugInfo.h
index f7e6434..768cf4e 100644
--- a/include/llvm/DebugInfo.h
+++ b/include/llvm/DebugInfo.h
@@ -17,780 +17,834 @@
 #ifndef LLVM_DEBUGINFO_H
 #define LLVM_DEBUGINFO_H
 
+#include "llvm/Support/Casting.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/Support/Dwarf.h"
 
 namespace llvm {
-  class BasicBlock;
-  class Constant;
-  class Function;
-  class GlobalVariable;
-  class Module;
-  class Type;
-  class Value;
-  class DbgDeclareInst;
-  class DbgValueInst;
-  class Instruction;
-  class MDNode;
-  class NamedMDNode;
-  class LLVMContext;
-  class raw_ostream;
-
-  class DIFile;
-  class DISubprogram;
-  class DILexicalBlock;
-  class DILexicalBlockFile;
-  class DIVariable;
-  class DIType;
-  class DIObjCProperty;
-
-  /// DIDescriptor - A thin wraper around MDNode to access encoded debug info.
-  /// This should not be stored in a container, because the underlying MDNode
-  /// may change in certain situations.
-  class DIDescriptor {
-  public:
-    enum {
-      FlagPrivate            = 1 << 0,
-      FlagProtected          = 1 << 1,
-      FlagFwdDecl            = 1 << 2,
-      FlagAppleBlock         = 1 << 3,
-      FlagBlockByrefStruct   = 1 << 4,
-      FlagVirtual            = 1 << 5,
-      FlagArtificial         = 1 << 6,
-      FlagExplicit           = 1 << 7,
-      FlagPrototyped         = 1 << 8,
-      FlagObjcClassComplete  = 1 << 9,
-      FlagObjectPointer      = 1 << 10,
-      FlagVector             = 1 << 11,
-      FlagStaticMember       = 1 << 12,
-      FlagIndirectVariable   = 1 << 13
-    };
-  protected:
-    const MDNode *DbgNode;
-
-    StringRef getStringField(unsigned Elt) const;
-    unsigned getUnsignedField(unsigned Elt) const {
-      return (unsigned)getUInt64Field(Elt);
-    }
-    uint64_t getUInt64Field(unsigned Elt) const;
-    int64_t getInt64Field(unsigned Elt) const;
-    DIDescriptor getDescriptorField(unsigned Elt) const;
-
-    template <typename DescTy>
-    DescTy getFieldAs(unsigned Elt) const {
-      return DescTy(getDescriptorField(Elt));
-    }
-
-    GlobalVariable *getGlobalVariableField(unsigned Elt) const;
-    Constant *getConstantField(unsigned Elt) const;
-    Function *getFunctionField(unsigned Elt) const;
-    void replaceFunctionField(unsigned Elt, Function *F);
-
-  public:
-    explicit DIDescriptor(const MDNode *N = 0) : DbgNode(N) {}
-
-    bool Verify() const;
-
-    operator MDNode *() const { return const_cast<MDNode*>(DbgNode); }
-    MDNode *operator ->() const { return const_cast<MDNode*>(DbgNode); }
-
-    // An explicit operator bool so that we can do testing of DI values
-    // easily.
-    // FIXME: This operator bool isn't actually protecting anything at the
-    // moment due to the conversion operator above making DIDescriptor nodes
-    // implicitly convertable to bool.
-    LLVM_EXPLICIT operator bool() const { return DbgNode != 0; }
-
-    bool operator==(DIDescriptor Other) const {
-      return DbgNode == Other.DbgNode;
-    }
-    bool operator!=(DIDescriptor Other) const {
-      return !operator==(Other);
-    }
-
-    unsigned getTag() const {
-      return getUnsignedField(0) & ~LLVMDebugVersionMask;
-    }
-
-    bool isDerivedType() const;
-    bool isCompositeType() const;
-    bool isBasicType() const;
-    bool isVariable() const;
-    bool isSubprogram() const;
-    bool isGlobalVariable() const;
-    bool isScope() const;
-    bool isFile() const;
-    bool isCompileUnit() const;
-    bool isNameSpace() const;
-    bool isLexicalBlockFile() const;
-    bool isLexicalBlock() const;
-    bool isSubrange() const;
-    bool isEnumerator() const;
-    bool isType() const;
-    bool isUnspecifiedParameter() const;
-    bool isTemplateTypeParameter() const;
-    bool isTemplateValueParameter() const;
-    bool isObjCProperty() const;
-    bool isImportedEntity() const;
-
-    /// print - print descriptor.
-    void print(raw_ostream &OS) const;
-
-    /// dump - print descriptor to dbgs() with a newline.
-    void dump() const;
+class BasicBlock;
+class Constant;
+class Function;
+class GlobalVariable;
+class Module;
+class Type;
+class Value;
+class DbgDeclareInst;
+class DbgValueInst;
+class Instruction;
+class MDNode;
+class MDString;
+class NamedMDNode;
+class LLVMContext;
+class raw_ostream;
+
+class DIFile;
+class DISubprogram;
+class DILexicalBlock;
+class DILexicalBlockFile;
+class DIVariable;
+class DIType;
+class DIScope;
+class DIObjCProperty;
+
+/// Maps from type identifier to the actual MDNode.
+typedef DenseMap<const MDString *, MDNode *> DITypeIdentifierMap;
+
+/// DIDescriptor - A thin wraper around MDNode to access encoded debug info.
+/// This should not be stored in a container, because the underlying MDNode
+/// may change in certain situations.
+class DIDescriptor {
+  // Befriends DIRef so DIRef can befriend the protected member
+  // function: getFieldAs<DIRef>.
+  template <typename T> friend class DIRef;
+
+public:
+  enum {
+    FlagPrivate = 1 << 0,
+    FlagProtected = 1 << 1,
+    FlagFwdDecl = 1 << 2,
+    FlagAppleBlock = 1 << 3,
+    FlagBlockByrefStruct = 1 << 4,
+    FlagVirtual = 1 << 5,
+    FlagArtificial = 1 << 6,
+    FlagExplicit = 1 << 7,
+    FlagPrototyped = 1 << 8,
+    FlagObjcClassComplete = 1 << 9,
+    FlagObjectPointer = 1 << 10,
+    FlagVector = 1 << 11,
+    FlagStaticMember = 1 << 12,
+    FlagIndirectVariable = 1 << 13
   };
 
-  /// DISubrange - This is used to represent ranges, for array bounds.
-  class DISubrange : public DIDescriptor {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DISubrange(const MDNode *N = 0) : DIDescriptor(N) {}
-
-    int64_t getLo() const { return getInt64Field(1); }
-    int64_t  getCount() const { return getInt64Field(2); }
-    bool Verify() const;
-  };
-
-  /// DIArray - This descriptor holds an array of descriptors.
-  class DIArray : public DIDescriptor {
-  public:
-    explicit DIArray(const MDNode *N = 0) : DIDescriptor(N) {}
-
-    unsigned getNumElements() const;
-    DIDescriptor getElement(unsigned Idx) const {
-      return getDescriptorField(Idx);
-    }
-  };
-
-  /// DIEnumerator - A wrapper for an enumerator (e.g. X and Y in 'enum {X,Y}').
-  /// FIXME: it seems strange that this doesn't have either a reference to the
-  /// type/precision or a file/line pair for location info.
-  class DIEnumerator : public DIDescriptor {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DIEnumerator(const MDNode *N = 0) : DIDescriptor(N) {}
-
-    StringRef getName() const        { return getStringField(1); }
-    int64_t getEnumValue() const      { return getInt64Field(2); }
-    bool Verify() const;
-  };
-
-  /// DIScope - A base class for various scopes.
-  class DIScope : public DIDescriptor {
-  protected:
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DIScope(const MDNode *N = 0) : DIDescriptor (N) {}
-
-    /// Gets the parent scope for this scope node or returns a
-    /// default constructed scope.
-    DIScope getContext() const;
-    StringRef getFilename() const;
-    StringRef getDirectory() const;
-  };
-
-  /// DIType - This is a wrapper for a type.
-  /// FIXME: Types should be factored much better so that CV qualifiers and
-  /// others do not require a huge and empty descriptor full of zeros.
-  class DIType : public DIScope {
-  protected:
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-
-  public:
-    DIType(const MDNode *N = 0) : DIScope(N) {}
-
-    /// Verify - Verify that a type descriptor is well formed.
-    bool Verify() const;
-
-    DIScope getContext() const          { return getFieldAs<DIScope>(2); }
-    StringRef getName() const           { return getStringField(3);     }
-    unsigned getLineNumber() const      { return getUnsignedField(4); }
-    uint64_t getSizeInBits() const      { return getUInt64Field(5); }
-    uint64_t getAlignInBits() const     { return getUInt64Field(6); }
-    // FIXME: Offset is only used for DW_TAG_member nodes.  Making every type
-    // carry this is just plain insane.
-    uint64_t getOffsetInBits() const    { return getUInt64Field(7); }
-    unsigned getFlags() const           { return getUnsignedField(8); }
-    bool isPrivate() const {
-      return (getFlags() & FlagPrivate) != 0;
-    }
-    bool isProtected() const {
-      return (getFlags() & FlagProtected) != 0;
-    }
-    bool isForwardDecl() const {
-      return (getFlags() & FlagFwdDecl) != 0;
-    }
-    // isAppleBlock - Return true if this is the Apple Blocks extension.
-    bool isAppleBlockExtension() const {
-      return (getFlags() & FlagAppleBlock) != 0;
-    }
-    bool isBlockByrefStruct() const {
-      return (getFlags() & FlagBlockByrefStruct) != 0;
-    }
-    bool isVirtual() const {
-      return (getFlags() & FlagVirtual) != 0;
-    }
-    bool isArtificial() const {
-      return (getFlags() & FlagArtificial) != 0;
-    }
-    bool isObjectPointer() const {
-      return (getFlags() & FlagObjectPointer) != 0;
-    }
-    bool isObjcClassComplete() const {
-      return (getFlags() & FlagObjcClassComplete) != 0;
-    }
-    bool isVector() const {
-      return (getFlags() & FlagVector) != 0;
-    }
-    bool isStaticMember() const {
-      return (getFlags() & FlagStaticMember) != 0;
-    }
-    bool isValid() const {
-      return DbgNode && isType();
-    }
-
-    /// isUnsignedDIType - Return true if type encoding is unsigned.
-    bool isUnsignedDIType();
-
-    /// replaceAllUsesWith - Replace all uses of debug info referenced by
-    /// this descriptor.
-    void replaceAllUsesWith(DIDescriptor &D);
-    void replaceAllUsesWith(MDNode *D);
-  };
-
-  /// DIBasicType - A basic type, like 'int' or 'float'.
-  class DIBasicType : public DIType {
-  public:
-    explicit DIBasicType(const MDNode *N = 0) : DIType(N) {}
-
-    unsigned getEncoding() const { return getUnsignedField(9); }
-
-    /// Verify - Verify that a basic type descriptor is well formed.
-    bool Verify() const;
-  };
-
-  /// DIDerivedType - A simple derived type, like a const qualified type,
-  /// a typedef, a pointer or reference, et cetera.  Or, a data member of
-  /// a class/struct/union.
-  class DIDerivedType : public DIType {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-
-  public:
-    explicit DIDerivedType(const MDNode *N = 0) : DIType(N) {}
-
-    DIType getTypeDerivedFrom() const { return getFieldAs<DIType>(9); }
-
-    /// getOriginalTypeSize - If this type is derived from a base type then
-    /// return base type size.
-    uint64_t getOriginalTypeSize() const;
-
-    /// getObjCProperty - Return property node, if this ivar is
-    /// associated with one.
-    MDNode *getObjCProperty() const;
-
-    DIType getClassType() const {
-      assert(getTag() == dwarf::DW_TAG_ptr_to_member_type);
-      return getFieldAs<DIType>(10);
-    }
-
-    Constant *getConstant() const {
-      assert((getTag() == dwarf::DW_TAG_member) && isStaticMember());
-      return getConstantField(10);
-    }
-
-    /// Verify - Verify that a derived type descriptor is well formed.
-    bool Verify() const;
-  };
-
-  /// DICompositeType - This descriptor holds a type that can refer to multiple
-  /// other types, like a function or struct.
-  /// DICompositeType is derived from DIDerivedType because some
-  /// composite types (such as enums) can be derived from basic types
-  // FIXME: Make this derive from DIType directly & just store the
-  // base type in a single DIType field.
-  class DICompositeType : public DIDerivedType {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DICompositeType(const MDNode *N = 0) : DIDerivedType(N) {}
-
-    DIArray getTypeArray() const { return getFieldAs<DIArray>(10); }
-    void setTypeArray(DIArray Elements, DIArray TParams = DIArray());
-    unsigned getRunTimeLang() const { return getUnsignedField(11); }
-    DICompositeType getContainingType() const {
-      return getFieldAs<DICompositeType>(12);
-    }
-    void setContainingType(DICompositeType ContainingType);
-    DIArray getTemplateParams() const { return getFieldAs<DIArray>(13); }
-
-    /// Verify - Verify that a composite type descriptor is well formed.
-    bool Verify() const;
-  };
-
-  /// DIFile - This is a wrapper for a file.
-  class DIFile : public DIScope {
-    friend class DIDescriptor;
-  public:
-    explicit DIFile(const MDNode *N = 0) : DIScope(N) {}
-    MDNode *getFileNode() const;
-    bool Verify() const;
-  };
-
-  /// DICompileUnit - A wrapper for a compile unit.
-  class DICompileUnit : public DIScope {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DICompileUnit(const MDNode *N = 0) : DIScope(N) {}
-
-    unsigned getLanguage() const { return getUnsignedField(2); }
-    StringRef getProducer() const { return getStringField(3); }
-
-    bool isOptimized() const { return getUnsignedField(4) != 0; }
-    StringRef getFlags() const { return getStringField(5); }
-    unsigned getRunTimeVersion() const { return getUnsignedField(6); }
-
-    DIArray getEnumTypes() const;
-    DIArray getRetainedTypes() const;
-    DIArray getSubprograms() const;
-    DIArray getGlobalVariables() const;
-    DIArray getImportedEntities() const;
-
-    StringRef getSplitDebugFilename() const { return getStringField(12); }
-
-    /// Verify - Verify that a compile unit is well formed.
-    bool Verify() const;
-  };
-
-  /// DISubprogram - This is a wrapper for a subprogram (e.g. a function).
-  class DISubprogram : public DIScope {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DISubprogram(const MDNode *N = 0) : DIScope(N) {}
-
-    DIScope getContext() const          { return getFieldAs<DIScope>(2); }
-    StringRef getName() const         { return getStringField(3); }
-    StringRef getDisplayName() const  { return getStringField(4); }
-    StringRef getLinkageName() const  { return getStringField(5); }
-    unsigned getLineNumber() const      { return getUnsignedField(6); }
-    DICompositeType getType() const { return getFieldAs<DICompositeType>(7); }
-
-    /// isLocalToUnit - Return true if this subprogram is local to the current
-    /// compile unit, like 'static' in C.
-    unsigned isLocalToUnit() const     { return getUnsignedField(8); }
-    unsigned isDefinition() const      { return getUnsignedField(9); }
-
-    unsigned getVirtuality() const { return getUnsignedField(10); }
-    unsigned getVirtualIndex() const { return getUnsignedField(11); }
-
-    DICompositeType getContainingType() const {
-      return getFieldAs<DICompositeType>(12);
-    }
-
-    unsigned getFlags() const {
-      return getUnsignedField(13);
-    }
-
-    unsigned isArtificial() const    {
-      return (getUnsignedField(13) & FlagArtificial) != 0;
-    }
-    /// isPrivate - Return true if this subprogram has "private"
-    /// access specifier.
-    bool isPrivate() const    {
-      return (getUnsignedField(13) & FlagPrivate) != 0;
-    }
-    /// isProtected - Return true if this subprogram has "protected"
-    /// access specifier.
-    bool isProtected() const    {
-      return (getUnsignedField(13) & FlagProtected) != 0;
-    }
-    /// isExplicit - Return true if this subprogram is marked as explicit.
-    bool isExplicit() const    {
-      return (getUnsignedField(13) & FlagExplicit) != 0;
-    }
-    /// isPrototyped - Return true if this subprogram is prototyped.
-    bool isPrototyped() const    {
-      return (getUnsignedField(13) & FlagPrototyped) != 0;
-    }
-
-    unsigned isOptimized() const;
-
-    /// Verify - Verify that a subprogram descriptor is well formed.
-    bool Verify() const;
-
-    /// describes - Return true if this subprogram provides debugging
-    /// information for the function F.
-    bool describes(const Function *F);
-
-    Function *getFunction() const { return getFunctionField(15); }
-    void replaceFunction(Function *F) { replaceFunctionField(15, F); }
-    DIArray getTemplateParams() const { return getFieldAs<DIArray>(16); }
-    DISubprogram getFunctionDeclaration() const {
-      return getFieldAs<DISubprogram>(17);
-    }
-    MDNode *getVariablesNodes() const;
-    DIArray getVariables() const;
-
-    /// getScopeLineNumber - Get the beginning of the scope of the
-    /// function, not necessarily where the name of the program
-    /// starts.
-    unsigned getScopeLineNumber() const { return getUnsignedField(19); }
-  };
-
-  /// DILexicalBlock - This is a wrapper for a lexical block.
-  class DILexicalBlock : public DIScope {
-  public:
-    explicit DILexicalBlock(const MDNode *N = 0) : DIScope(N) {}
-    DIScope getContext() const       { return getFieldAs<DIScope>(2);      }
-    unsigned getLineNumber() const   { return getUnsignedField(3);         }
-    unsigned getColumnNumber() const { return getUnsignedField(4);         }
-    bool Verify() const;
-  };
-
-  /// DILexicalBlockFile - This is a wrapper for a lexical block with
-  /// a filename change.
-  class DILexicalBlockFile : public DIScope {
-  public:
-    explicit DILexicalBlockFile(const MDNode *N = 0) : DIScope(N) {}
-    DIScope getContext() const {
-      if (getScope().isSubprogram())
-        return getScope();
-      return getScope().getContext();
-    }
-    unsigned getLineNumber() const { return getScope().getLineNumber(); }
-    unsigned getColumnNumber() const { return getScope().getColumnNumber(); }
-    DILexicalBlock getScope() const { return getFieldAs<DILexicalBlock>(2); }
-    bool Verify() const;
-  };
-
-  /// DINameSpace - A wrapper for a C++ style name space.
-  class DINameSpace : public DIScope {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DINameSpace(const MDNode *N = 0) : DIScope(N) {}
-    DIScope getContext() const     { return getFieldAs<DIScope>(2);      }
-    StringRef getName() const      { return getStringField(3);           }
-    unsigned getLineNumber() const { return getUnsignedField(4);         }
-    bool Verify() const;
-  };
-
-  /// DITemplateTypeParameter - This is a wrapper for template type parameter.
-  class DITemplateTypeParameter : public DIDescriptor {
-  public:
-    explicit DITemplateTypeParameter(const MDNode *N = 0) : DIDescriptor(N) {}
-
-    DIScope getContext() const       { return getFieldAs<DIScope>(1); }
-    StringRef getName() const        { return getStringField(2); }
-    DIType getType() const           { return getFieldAs<DIType>(3); }
-    StringRef getFilename() const    {
-      return getFieldAs<DIFile>(4).getFilename();
-    }
-    StringRef getDirectory() const   {
-      return getFieldAs<DIFile>(4).getDirectory();
-    }
-    unsigned getLineNumber() const   { return getUnsignedField(5); }
-    unsigned getColumnNumber() const { return getUnsignedField(6); }
-    bool Verify() const;
-  };
-
-  /// DITemplateValueParameter - This is a wrapper for template value parameter.
-  class DITemplateValueParameter : public DIDescriptor {
-  public:
-    explicit DITemplateValueParameter(const MDNode *N = 0) : DIDescriptor(N) {}
-
-    DIScope getContext() const       { return getFieldAs<DIScope>(1); }
-    StringRef getName() const        { return getStringField(2); }
-    DIType getType() const           { return getFieldAs<DIType>(3); }
-    Value *getValue() const;
-    StringRef getFilename() const    {
-      return getFieldAs<DIFile>(5).getFilename();
-    }
-    StringRef getDirectory() const   {
-      return getFieldAs<DIFile>(5).getDirectory();
-    }
-    unsigned getLineNumber() const   { return getUnsignedField(6); }
-    unsigned getColumnNumber() const { return getUnsignedField(7); }
-    bool Verify() const;
-  };
-
-  /// DIGlobalVariable - This is a wrapper for a global variable.
-  class DIGlobalVariable : public DIDescriptor {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DIGlobalVariable(const MDNode *N = 0) : DIDescriptor(N) {}
-
-    DIScope getContext() const          { return getFieldAs<DIScope>(2); }
-    StringRef getName() const         { return getStringField(3); }
-    StringRef getDisplayName() const  { return getStringField(4); }
-    StringRef getLinkageName() const  { return getStringField(5); }
-    StringRef getFilename() const {
-      return getFieldAs<DIFile>(6).getFilename();
-    }
-    StringRef getDirectory() const {
-      return getFieldAs<DIFile>(6).getDirectory();
-
-    }
-
-    unsigned getLineNumber() const      { return getUnsignedField(7); }
-    DIType getType() const              { return getFieldAs<DIType>(8); }
-    unsigned isLocalToUnit() const      { return getUnsignedField(9); }
-    unsigned isDefinition() const       { return getUnsignedField(10); }
-
-    GlobalVariable *getGlobal() const { return getGlobalVariableField(11); }
-    Constant *getConstant() const   { return getConstantField(11); }
-    DIDerivedType getStaticDataMemberDeclaration() const {
-      return getFieldAs<DIDerivedType>(12);
-    }
-
-    /// Verify - Verify that a global variable descriptor is well formed.
-    bool Verify() const;
-  };
-
-  /// DIVariable - This is a wrapper for a variable (e.g. parameter, local,
-  /// global etc).
-  class DIVariable : public DIDescriptor {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DIVariable(const MDNode *N = 0) : DIDescriptor(N) {}
-
-    DIScope getContext() const          { return getFieldAs<DIScope>(1); }
-    StringRef getName() const           { return getStringField(2);     }
-    DIFile getFile() const              { return getFieldAs<DIFile>(3); }
-    unsigned getLineNumber() const      {
-      return (getUnsignedField(4) << 8) >> 8;
-    }
-    unsigned getArgNumber() const       {
-      unsigned L = getUnsignedField(4);
-      return L >> 24;
-    }
-    DIType getType() const              { return getFieldAs<DIType>(5); }
-
-    /// isArtificial - Return true if this variable is marked as "artificial".
-    bool isArtificial() const    {
-      return (getUnsignedField(6) & FlagArtificial) != 0;
-    }
-
-    bool isObjectPointer() const {
-      return (getUnsignedField(6) & FlagObjectPointer) != 0;
-    }
-
-    /// \brief Return true if this variable is represented as a pointer.
-    bool isIndirect() const {
-      return (getUnsignedField(6) & FlagIndirectVariable) != 0;
-    }
-
-    /// getInlinedAt - If this variable is inlined then return inline location.
-    MDNode *getInlinedAt() const;
-
-    /// Verify - Verify that a variable descriptor is well formed.
-    bool Verify() const;
-
-    /// HasComplexAddr - Return true if the variable has a complex address.
-    bool hasComplexAddress() const {
-      return getNumAddrElements() > 0;
-    }
-
-    unsigned getNumAddrElements() const;
-
-    uint64_t getAddrElement(unsigned Idx) const {
-      return getUInt64Field(Idx+8);
-    }
-
-    /// isBlockByrefVariable - Return true if the variable was declared as
-    /// a "__block" variable (Apple Blocks).
-    bool isBlockByrefVariable() const {
-      return getType().isBlockByrefStruct();
-    }
-
-    /// isInlinedFnArgument - Return true if this variable provides debugging
-    /// information for an inlined function arguments.
-    bool isInlinedFnArgument(const Function *CurFn);
-
-    void printExtendedName(raw_ostream &OS) const;
-  };
-
-  /// DILocation - This object holds location information. This object
-  /// is not associated with any DWARF tag.
-  class DILocation : public DIDescriptor {
-  public:
-    explicit DILocation(const MDNode *N) : DIDescriptor(N) { }
-
-    unsigned getLineNumber() const     { return getUnsignedField(0); }
-    unsigned getColumnNumber() const   { return getUnsignedField(1); }
-    DIScope  getScope() const          { return getFieldAs<DIScope>(2); }
-    DILocation getOrigLocation() const { return getFieldAs<DILocation>(3); }
-    StringRef getFilename() const    { return getScope().getFilename(); }
-    StringRef getDirectory() const   { return getScope().getDirectory(); }
-    bool Verify() const;
-  };
-
-  class DIObjCProperty : public DIDescriptor {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DIObjCProperty(const MDNode *N) : DIDescriptor(N) { }
-
-    StringRef getObjCPropertyName() const { return getStringField(1); }
-    DIFile getFile() const { return getFieldAs<DIFile>(2); }
-    unsigned getLineNumber() const { return getUnsignedField(3); }
-
-    StringRef getObjCPropertyGetterName() const {
-      return getStringField(4);
-    }
-    StringRef getObjCPropertySetterName() const {
-      return getStringField(5);
-    }
-    bool isReadOnlyObjCProperty() const {
-      return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_readonly) != 0;
-    }
-    bool isReadWriteObjCProperty() const {
-      return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_readwrite) != 0;
-    }
-    bool isAssignObjCProperty() const {
-      return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_assign) != 0;
-    }
-    bool isRetainObjCProperty() const {
-      return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_retain) != 0;
-    }
-    bool isCopyObjCProperty() const {
-      return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_copy) != 0;
-    }
-    bool isNonAtomicObjCProperty() const {
-      return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_nonatomic) != 0;
-    }
-
-    DIType getType() const { return getFieldAs<DIType>(7); }
-
-    /// Verify - Verify that a derived type descriptor is well formed.
-    bool Verify() const;
-  };
-
-  /// \brief An imported module (C++ using directive or similar).
-  class DIImportedEntity : public DIDescriptor {
-    friend class DIDescriptor;
-    void printInternal(raw_ostream &OS) const;
-  public:
-    explicit DIImportedEntity(const MDNode *N) : DIDescriptor(N) { }
-    DIScope getContext() const { return getFieldAs<DIScope>(1); }
-    DIDescriptor getEntity() const { return getFieldAs<DIDescriptor>(2); }
-    unsigned getLineNumber() const { return getUnsignedField(3); }
-    StringRef getName() const { return getStringField(4); }
-    bool Verify() const;
-  };
-
-  /// getDISubprogram - Find subprogram that is enclosing this scope.
-  DISubprogram getDISubprogram(const MDNode *Scope);
-
-  /// getDICompositeType - Find underlying composite type.
-  DICompositeType getDICompositeType(DIType T);
-
-  /// isSubprogramContext - Return true if Context is either a subprogram
-  /// or another context nested inside a subprogram.
-  bool isSubprogramContext(const MDNode *Context);
-
-  /// getOrInsertFnSpecificMDNode - Return a NameMDNode that is suitable
-  /// to hold function specific information.
-  NamedMDNode *getOrInsertFnSpecificMDNode(Module &M, DISubprogram SP);
-
-  /// getFnSpecificMDNode - Return a NameMDNode, if available, that is
-  /// suitable to hold function specific information.
-  NamedMDNode *getFnSpecificMDNode(const Module &M, DISubprogram SP);
-
-  /// createInlinedVariable - Create a new inlined variable based on current
-  /// variable.
-  /// @param DV            Current Variable.
-  /// @param InlinedScope  Location at current variable is inlined.
-  DIVariable createInlinedVariable(MDNode *DV, MDNode *InlinedScope,
-                                   LLVMContext &VMContext);
-
-  /// cleanseInlinedVariable - Remove inlined scope from the variable.
-  DIVariable cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext);
-
-  /// DebugInfoFinder tries to list all debug info MDNodes used in a module. To
-  /// list debug info MDNodes used by an instruction, DebugInfoFinder uses
-  /// processDeclare, processValue and processLocation to handle DbgDeclareInst,
-  /// DbgValueInst and DbgLoc attached to instructions. processModule will go
-  /// through all DICompileUnits in llvm.dbg.cu and list debug info MDNodes
-  /// used by the CUs.
-  class DebugInfoFinder {
-  public:
-    /// processModule - Process entire module and collect debug info
-    /// anchors.
-    void processModule(const Module &M);
-
-    /// processDeclare - Process DbgDeclareInst.
-    void processDeclare(const DbgDeclareInst *DDI);
-    /// Process DbgValueInst.
-    void processValue(const DbgValueInst *DVI);
-    /// processLocation - Process DILocation.
-    void processLocation(DILocation Loc);
-
-    /// Clear all lists.
-    void reset();
-  private:
-    /// processType - Process DIType.
-    void processType(DIType DT);
-
-    /// processLexicalBlock - Process DILexicalBlock.
-    void processLexicalBlock(DILexicalBlock LB);
-
-    /// processSubprogram - Process DISubprogram.
-    void processSubprogram(DISubprogram SP);
-
-    void processScope(DIScope Scope);
-
-    /// addCompileUnit - Add compile unit into CUs.
-    bool addCompileUnit(DICompileUnit CU);
-
-    /// addGlobalVariable - Add global variable into GVs.
-    bool addGlobalVariable(DIGlobalVariable DIG);
-
-    // addSubprogram - Add subprogram into SPs.
-    bool addSubprogram(DISubprogram SP);
-
-    /// addType - Add type into Tys.
-    bool addType(DIType DT);
-
-    bool addScope(DIScope Scope);
-
-  public:
-    typedef SmallVectorImpl<MDNode *>::const_iterator iterator;
-    iterator compile_unit_begin()    const { return CUs.begin(); }
-    iterator compile_unit_end()      const { return CUs.end(); }
-    iterator subprogram_begin()      const { return SPs.begin(); }
-    iterator subprogram_end()        const { return SPs.end(); }
-    iterator global_variable_begin() const { return GVs.begin(); }
-    iterator global_variable_end()   const { return GVs.end(); }
-    iterator type_begin()            const { return TYs.begin(); }
-    iterator type_end()              const { return TYs.end(); }
-    iterator scope_begin()           const { return Scopes.begin(); }
-    iterator scope_end()             const { return Scopes.end(); }
-
-    unsigned compile_unit_count()    const { return CUs.size(); }
-    unsigned global_variable_count() const { return GVs.size(); }
-    unsigned subprogram_count()      const { return SPs.size(); }
-    unsigned type_count()            const { return TYs.size(); }
-    unsigned scope_count()           const { return Scopes.size(); }
-
-  private:
-    SmallVector<MDNode *, 8> CUs;  // Compile Units
-    SmallVector<MDNode *, 8> SPs;  // Subprograms
-    SmallVector<MDNode *, 8> GVs;  // Global Variables;
-    SmallVector<MDNode *, 8> TYs;  // Types
-    SmallVector<MDNode *, 8> Scopes; // Scopes
-    SmallPtrSet<MDNode *, 64> NodesSeen;
-  };
+protected:
+  const MDNode *DbgNode;
+
+  StringRef getStringField(unsigned Elt) const;
+  unsigned getUnsignedField(unsigned Elt) const {
+    return (unsigned)getUInt64Field(Elt);
+  }
+  uint64_t getUInt64Field(unsigned Elt) const;
+  int64_t getInt64Field(unsigned Elt) const;
+  DIDescriptor getDescriptorField(unsigned Elt) const;
+
+  template <typename DescTy> DescTy getFieldAs(unsigned Elt) const {
+    return DescTy(getDescriptorField(Elt));
+  }
+
+  GlobalVariable *getGlobalVariableField(unsigned Elt) const;
+  Constant *getConstantField(unsigned Elt) const;
+  Function *getFunctionField(unsigned Elt) const;
+  void replaceFunctionField(unsigned Elt, Function *F);
+
+public:
+  explicit DIDescriptor(const MDNode *N = 0) : DbgNode(N) {}
+
+  bool Verify() const;
+
+  operator MDNode *() const { return const_cast<MDNode *>(DbgNode); }
+  MDNode *operator->() const { return const_cast<MDNode *>(DbgNode); }
+
+  // An explicit operator bool so that we can do testing of DI values
+  // easily.
+  // FIXME: This operator bool isn't actually protecting anything at the
+  // moment due to the conversion operator above making DIDescriptor nodes
+  // implicitly convertable to bool.
+  LLVM_EXPLICIT operator bool() const { return DbgNode != 0; }
+
+  bool operator==(DIDescriptor Other) const { return DbgNode == Other.DbgNode; }
+  bool operator!=(DIDescriptor Other) const { return !operator==(Other); }
+
+  uint16_t getTag() const {
+    return getUnsignedField(0) & ~LLVMDebugVersionMask;
+  }
+
+  bool isDerivedType() const;
+  bool isCompositeType() const;
+  bool isBasicType() const;
+  bool isVariable() const;
+  bool isSubprogram() const;
+  bool isGlobalVariable() const;
+  bool isScope() const;
+  bool isFile() const;
+  bool isCompileUnit() const;
+  bool isNameSpace() const;
+  bool isLexicalBlockFile() const;
+  bool isLexicalBlock() const;
+  bool isSubrange() const;
+  bool isEnumerator() const;
+  bool isType() const;
+  bool isUnspecifiedParameter() const;
+  bool isTemplateTypeParameter() const;
+  bool isTemplateValueParameter() const;
+  bool isObjCProperty() const;
+  bool isImportedEntity() const;
+
+  /// print - print descriptor.
+  void print(raw_ostream &OS) const;
+
+  /// dump - print descriptor to dbgs() with a newline.
+  void dump() const;
+};
+
+/// DISubrange - This is used to represent ranges, for array bounds.
+class DISubrange : public DIDescriptor {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DISubrange(const MDNode *N = 0) : DIDescriptor(N) {}
+
+  int64_t getLo() const { return getInt64Field(1); }
+  int64_t getCount() const { return getInt64Field(2); }
+  bool Verify() const;
+};
+
+/// DIArray - This descriptor holds an array of descriptors.
+class DIArray : public DIDescriptor {
+public:
+  explicit DIArray(const MDNode *N = 0) : DIDescriptor(N) {}
+
+  unsigned getNumElements() const;
+  DIDescriptor getElement(unsigned Idx) const {
+    return getDescriptorField(Idx);
+  }
+};
+
+/// DIEnumerator - A wrapper for an enumerator (e.g. X and Y in 'enum {X,Y}').
+/// FIXME: it seems strange that this doesn't have either a reference to the
+/// type/precision or a file/line pair for location info.
+class DIEnumerator : public DIDescriptor {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DIEnumerator(const MDNode *N = 0) : DIDescriptor(N) {}
+
+  StringRef getName() const { return getStringField(1); }
+  int64_t getEnumValue() const { return getInt64Field(2); }
+  bool Verify() const;
+};
+
+template <typename T> class DIRef;
+typedef DIRef<DIScope> DIScopeRef;
+typedef DIRef<DIType> DITypeRef;
+
+/// DIScope - A base class for various scopes.
+class DIScope : public DIDescriptor {
+protected:
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DIScope(const MDNode *N = 0) : DIDescriptor(N) {}
+
+  /// Gets the parent scope for this scope node or returns a
+  /// default constructed scope.
+  DIScopeRef getContext() const;
+  /// If the scope node has a name, return that, else return an empty string.
+  StringRef getName() const;
+  StringRef getFilename() const;
+  StringRef getDirectory() const;
+
+  /// Generate a reference to this DIScope. Uses the type identifier instead
+  /// of the actual MDNode if possible, to help type uniquing.
+  DIScopeRef getRef() const;
+};
+
+/// Represents reference to a DIDescriptor, abstracts over direct and
+/// identifier-based metadata references.
+template <typename T> class DIRef {
+  template <typename DescTy>
+  friend DescTy DIDescriptor::getFieldAs(unsigned Elt) const;
+  friend DIScopeRef DIScope::getContext() const;
+  friend DIScopeRef DIScope::getRef() const;
+
+  /// Val can be either a MDNode or a MDString, in the latter,
+  /// MDString specifies the type identifier.
+  const Value *Val;
+  explicit DIRef(const Value *V);
+
+public:
+  T resolve(const DITypeIdentifierMap &Map) const;
+  StringRef getName() const;
+  operator Value *() const { return const_cast<Value *>(Val); }
+};
+
+template <typename T>
+T DIRef<T>::resolve(const DITypeIdentifierMap &Map) const {
+  if (!Val)
+    return T();
+
+  if (const MDNode *MD = dyn_cast<MDNode>(Val))
+    return T(MD);
+
+  const MDString *MS = cast<MDString>(Val);
+  // Find the corresponding MDNode.
+  DITypeIdentifierMap::const_iterator Iter = Map.find(MS);
+  assert(Iter != Map.end() && "Identifier not in the type map?");
+  assert(DIDescriptor(Iter->second).isType() &&
+         "MDNode in DITypeIdentifierMap should be a DIType.");
+  return T(Iter->second);
+}
+
+template <typename T> StringRef DIRef<T>::getName() const {
+  if (!Val)
+    return StringRef();
+
+  if (const MDNode *MD = dyn_cast<MDNode>(Val))
+    return T(MD).getName();
+
+  const MDString *MS = cast<MDString>(Val);
+  return MS->getString();
+}
+
+/// Specialize getFieldAs to handle fields that are references to DIScopes.
+template <> DIScopeRef DIDescriptor::getFieldAs<DIScopeRef>(unsigned Elt) const;
+/// Specialize DIRef constructor for DIScopeRef.
+template <> DIRef<DIScope>::DIRef(const Value *V);
+
+/// Specialize getFieldAs to handle fields that are references to DITypes.
+template <> DITypeRef DIDescriptor::getFieldAs<DITypeRef>(unsigned Elt) const;
+/// Specialize DIRef constructor for DITypeRef.
+template <> DIRef<DIType>::DIRef(const Value *V);
+
+/// DIType - This is a wrapper for a type.
+/// FIXME: Types should be factored much better so that CV qualifiers and
+/// others do not require a huge and empty descriptor full of zeros.
+class DIType : public DIScope {
+protected:
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DIType(const MDNode *N = 0) : DIScope(N) {}
+
+  /// Verify - Verify that a type descriptor is well formed.
+  bool Verify() const;
+
+  DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(2); }
+  StringRef getName() const { return getStringField(3); }
+  unsigned getLineNumber() const { return getUnsignedField(4); }
+  uint64_t getSizeInBits() const { return getUInt64Field(5); }
+  uint64_t getAlignInBits() const { return getUInt64Field(6); }
+  // FIXME: Offset is only used for DW_TAG_member nodes.  Making every type
+  // carry this is just plain insane.
+  uint64_t getOffsetInBits() const { return getUInt64Field(7); }
+  unsigned getFlags() const { return getUnsignedField(8); }
+  bool isPrivate() const { return (getFlags() & FlagPrivate) != 0; }
+  bool isProtected() const { return (getFlags() & FlagProtected) != 0; }
+  bool isForwardDecl() const { return (getFlags() & FlagFwdDecl) != 0; }
+  // isAppleBlock - Return true if this is the Apple Blocks extension.
+  bool isAppleBlockExtension() const {
+    return (getFlags() & FlagAppleBlock) != 0;
+  }
+  bool isBlockByrefStruct() const {
+    return (getFlags() & FlagBlockByrefStruct) != 0;
+  }
+  bool isVirtual() const { return (getFlags() & FlagVirtual) != 0; }
+  bool isArtificial() const { return (getFlags() & FlagArtificial) != 0; }
+  bool isObjectPointer() const { return (getFlags() & FlagObjectPointer) != 0; }
+  bool isObjcClassComplete() const {
+    return (getFlags() & FlagObjcClassComplete) != 0;
+  }
+  bool isVector() const { return (getFlags() & FlagVector) != 0; }
+  bool isStaticMember() const { return (getFlags() & FlagStaticMember) != 0; }
+  bool isValid() const { return DbgNode && isType(); }
+
+  /// replaceAllUsesWith - Replace all uses of debug info referenced by
+  /// this descriptor.
+  void replaceAllUsesWith(DIDescriptor &D);
+  void replaceAllUsesWith(MDNode *D);
+};
+
+/// DIBasicType - A basic type, like 'int' or 'float'.
+class DIBasicType : public DIType {
+public:
+  explicit DIBasicType(const MDNode *N = 0) : DIType(N) {}
+
+  unsigned getEncoding() const { return getUnsignedField(9); }
+
+  /// Verify - Verify that a basic type descriptor is well formed.
+  bool Verify() const;
+};
+
+/// DIDerivedType - A simple derived type, like a const qualified type,
+/// a typedef, a pointer or reference, et cetera.  Or, a data member of
+/// a class/struct/union.
+class DIDerivedType : public DIType {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DIDerivedType(const MDNode *N = 0) : DIType(N) {}
+
+  DITypeRef getTypeDerivedFrom() const { return getFieldAs<DITypeRef>(9); }
+
+  /// getObjCProperty - Return property node, if this ivar is
+  /// associated with one.
+  MDNode *getObjCProperty() const;
+
+  DITypeRef getClassType() const {
+    assert(getTag() == dwarf::DW_TAG_ptr_to_member_type);
+    return getFieldAs<DITypeRef>(10);
+  }
+
+  Constant *getConstant() const {
+    assert((getTag() == dwarf::DW_TAG_member) && isStaticMember());
+    return getConstantField(10);
+  }
+
+  /// Verify - Verify that a derived type descriptor is well formed.
+  bool Verify() const;
+};
+
+/// DICompositeType - This descriptor holds a type that can refer to multiple
+/// other types, like a function or struct.
+/// DICompositeType is derived from DIDerivedType because some
+/// composite types (such as enums) can be derived from basic types
+// FIXME: Make this derive from DIType directly & just store the
+// base type in a single DIType field.
+class DICompositeType : public DIDerivedType {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DICompositeType(const MDNode *N = 0) : DIDerivedType(N) {}
+
+  DIArray getTypeArray() const { return getFieldAs<DIArray>(10); }
+  void setTypeArray(DIArray Elements, DIArray TParams = DIArray());
+  void addMember(DIDescriptor D);
+  unsigned getRunTimeLang() const { return getUnsignedField(11); }
+  DITypeRef getContainingType() const { return getFieldAs<DITypeRef>(12); }
+  void setContainingType(DICompositeType ContainingType);
+  DIArray getTemplateParams() const { return getFieldAs<DIArray>(13); }
+  MDString *getIdentifier() const;
+
+  /// Verify - Verify that a composite type descriptor is well formed.
+  bool Verify() const;
+};
+
+/// DIFile - This is a wrapper for a file.
+class DIFile : public DIScope {
+  friend class DIDescriptor;
+
+public:
+  explicit DIFile(const MDNode *N = 0) : DIScope(N) {}
+  MDNode *getFileNode() const;
+  bool Verify() const;
+};
+
+/// DICompileUnit - A wrapper for a compile unit.
+class DICompileUnit : public DIScope {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DICompileUnit(const MDNode *N = 0) : DIScope(N) {}
+
+  unsigned getLanguage() const { return getUnsignedField(2); }
+  StringRef getProducer() const { return getStringField(3); }
+
+  bool isOptimized() const { return getUnsignedField(4) != 0; }
+  StringRef getFlags() const { return getStringField(5); }
+  unsigned getRunTimeVersion() const { return getUnsignedField(6); }
+
+  DIArray getEnumTypes() const;
+  DIArray getRetainedTypes() const;
+  DIArray getSubprograms() const;
+  DIArray getGlobalVariables() const;
+  DIArray getImportedEntities() const;
+
+  StringRef getSplitDebugFilename() const { return getStringField(12); }
+
+  /// Verify - Verify that a compile unit is well formed.
+  bool Verify() const;
+};
+
+/// DISubprogram - This is a wrapper for a subprogram (e.g. a function).
+class DISubprogram : public DIScope {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DISubprogram(const MDNode *N = 0) : DIScope(N) {}
+
+  DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(2); }
+  StringRef getName() const { return getStringField(3); }
+  StringRef getDisplayName() const { return getStringField(4); }
+  StringRef getLinkageName() const { return getStringField(5); }
+  unsigned getLineNumber() const { return getUnsignedField(6); }
+  DICompositeType getType() const { return getFieldAs<DICompositeType>(7); }
+
+  /// isLocalToUnit - Return true if this subprogram is local to the current
+  /// compile unit, like 'static' in C.
+  unsigned isLocalToUnit() const { return getUnsignedField(8); }
+  unsigned isDefinition() const { return getUnsignedField(9); }
+
+  unsigned getVirtuality() const { return getUnsignedField(10); }
+  unsigned getVirtualIndex() const { return getUnsignedField(11); }
+
+  DITypeRef getContainingType() const { return getFieldAs<DITypeRef>(12); }
+
+  unsigned getFlags() const { return getUnsignedField(13); }
+
+  unsigned isArtificial() const {
+    return (getUnsignedField(13) & FlagArtificial) != 0;
+  }
+  /// isPrivate - Return true if this subprogram has "private"
+  /// access specifier.
+  bool isPrivate() const { return (getUnsignedField(13) & FlagPrivate) != 0; }
+  /// isProtected - Return true if this subprogram has "protected"
+  /// access specifier.
+  bool isProtected() const {
+    return (getUnsignedField(13) & FlagProtected) != 0;
+  }
+  /// isExplicit - Return true if this subprogram is marked as explicit.
+  bool isExplicit() const { return (getUnsignedField(13) & FlagExplicit) != 0; }
+  /// isPrototyped - Return true if this subprogram is prototyped.
+  bool isPrototyped() const {
+    return (getUnsignedField(13) & FlagPrototyped) != 0;
+  }
+
+  unsigned isOptimized() const;
+
+  /// Verify - Verify that a subprogram descriptor is well formed.
+  bool Verify() const;
+
+  /// describes - Return true if this subprogram provides debugging
+  /// information for the function F.
+  bool describes(const Function *F);
+
+  Function *getFunction() const { return getFunctionField(15); }
+  void replaceFunction(Function *F) { replaceFunctionField(15, F); }
+  DIArray getTemplateParams() const { return getFieldAs<DIArray>(16); }
+  DISubprogram getFunctionDeclaration() const {
+    return getFieldAs<DISubprogram>(17);
+  }
+  MDNode *getVariablesNodes() const;
+  DIArray getVariables() const;
+
+  /// getScopeLineNumber - Get the beginning of the scope of the
+  /// function, not necessarily where the name of the program
+  /// starts.
+  unsigned getScopeLineNumber() const { return getUnsignedField(19); }
+};
+
+/// DILexicalBlock - This is a wrapper for a lexical block.
+class DILexicalBlock : public DIScope {
+public:
+  explicit DILexicalBlock(const MDNode *N = 0) : DIScope(N) {}
+  DIScope getContext() const { return getFieldAs<DIScope>(2); }
+  unsigned getLineNumber() const { return getUnsignedField(3); }
+  unsigned getColumnNumber() const { return getUnsignedField(4); }
+  bool Verify() const;
+};
+
+/// DILexicalBlockFile - This is a wrapper for a lexical block with
+/// a filename change.
+class DILexicalBlockFile : public DIScope {
+public:
+  explicit DILexicalBlockFile(const MDNode *N = 0) : DIScope(N) {}
+  DIScope getContext() const {
+    if (getScope().isSubprogram())
+      return getScope();
+    return getScope().getContext();
+  }
+  unsigned getLineNumber() const { return getScope().getLineNumber(); }
+  unsigned getColumnNumber() const { return getScope().getColumnNumber(); }
+  DILexicalBlock getScope() const { return getFieldAs<DILexicalBlock>(2); }
+  bool Verify() const;
+};
+
+/// DINameSpace - A wrapper for a C++ style name space.
+class DINameSpace : public DIScope {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DINameSpace(const MDNode *N = 0) : DIScope(N) {}
+  DIScope getContext() const { return getFieldAs<DIScope>(2); }
+  StringRef getName() const { return getStringField(3); }
+  unsigned getLineNumber() const { return getUnsignedField(4); }
+  bool Verify() const;
+};
+
+/// DITemplateTypeParameter - This is a wrapper for template type parameter.
+class DITemplateTypeParameter : public DIDescriptor {
+public:
+  explicit DITemplateTypeParameter(const MDNode *N = 0) : DIDescriptor(N) {}
+
+  DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(1); }
+  StringRef getName() const { return getStringField(2); }
+  DITypeRef getType() const { return getFieldAs<DITypeRef>(3); }
+  StringRef getFilename() const { return getFieldAs<DIFile>(4).getFilename(); }
+  StringRef getDirectory() const {
+    return getFieldAs<DIFile>(4).getDirectory();
+  }
+  unsigned getLineNumber() const { return getUnsignedField(5); }
+  unsigned getColumnNumber() const { return getUnsignedField(6); }
+  bool Verify() const;
+};
+
+/// DITemplateValueParameter - This is a wrapper for template value parameter.
+class DITemplateValueParameter : public DIDescriptor {
+public:
+  explicit DITemplateValueParameter(const MDNode *N = 0) : DIDescriptor(N) {}
+
+  DIScopeRef getContext() const { return getFieldAs<DIScopeRef>(1); }
+  StringRef getName() const { return getStringField(2); }
+  DITypeRef getType() const { return getFieldAs<DITypeRef>(3); }
+  Value *getValue() const;
+  StringRef getFilename() const { return getFieldAs<DIFile>(5).getFilename(); }
+  StringRef getDirectory() const {
+    return getFieldAs<DIFile>(5).getDirectory();
+  }
+  unsigned getLineNumber() const { return getUnsignedField(6); }
+  unsigned getColumnNumber() const { return getUnsignedField(7); }
+  bool Verify() const;
+};
+
+/// DIGlobalVariable - This is a wrapper for a global variable.
+class DIGlobalVariable : public DIDescriptor {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DIGlobalVariable(const MDNode *N = 0) : DIDescriptor(N) {}
+
+  DIScope getContext() const { return getFieldAs<DIScope>(2); }
+  StringRef getName() const { return getStringField(3); }
+  StringRef getDisplayName() const { return getStringField(4); }
+  StringRef getLinkageName() const { return getStringField(5); }
+  StringRef getFilename() const { return getFieldAs<DIFile>(6).getFilename(); }
+  StringRef getDirectory() const {
+    return getFieldAs<DIFile>(6).getDirectory();
+  }
+
+  unsigned getLineNumber() const { return getUnsignedField(7); }
+  DIType getType() const { return getFieldAs<DIType>(8); }
+  unsigned isLocalToUnit() const { return getUnsignedField(9); }
+  unsigned isDefinition() const { return getUnsignedField(10); }
+
+  GlobalVariable *getGlobal() const { return getGlobalVariableField(11); }
+  Constant *getConstant() const { return getConstantField(11); }
+  DIDerivedType getStaticDataMemberDeclaration() const {
+    return getFieldAs<DIDerivedType>(12);
+  }
+
+  /// Verify - Verify that a global variable descriptor is well formed.
+  bool Verify() const;
+};
+
+/// DIVariable - This is a wrapper for a variable (e.g. parameter, local,
+/// global etc).
+class DIVariable : public DIDescriptor {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DIVariable(const MDNode *N = 0) : DIDescriptor(N) {}
+
+  DIScope getContext() const { return getFieldAs<DIScope>(1); }
+  StringRef getName() const { return getStringField(2); }
+  DIFile getFile() const { return getFieldAs<DIFile>(3); }
+  unsigned getLineNumber() const { return (getUnsignedField(4) << 8) >> 8; }
+  unsigned getArgNumber() const {
+    unsigned L = getUnsignedField(4);
+    return L >> 24;
+  }
+  DIType getType() const { return getFieldAs<DIType>(5); }
+
+  /// isArtificial - Return true if this variable is marked as "artificial".
+  bool isArtificial() const {
+    return (getUnsignedField(6) & FlagArtificial) != 0;
+  }
+
+  bool isObjectPointer() const {
+    return (getUnsignedField(6) & FlagObjectPointer) != 0;
+  }
+
+  /// \brief Return true if this variable is represented as a pointer.
+  bool isIndirect() const {
+    return (getUnsignedField(6) & FlagIndirectVariable) != 0;
+  }
+
+  /// getInlinedAt - If this variable is inlined then return inline location.
+  MDNode *getInlinedAt() const;
+
+  /// Verify - Verify that a variable descriptor is well formed.
+  bool Verify() const;
+
+  /// HasComplexAddr - Return true if the variable has a complex address.
+  bool hasComplexAddress() const { return getNumAddrElements() > 0; }
+
+  unsigned getNumAddrElements() const;
+
+  uint64_t getAddrElement(unsigned Idx) const {
+    return getUInt64Field(Idx + 8);
+  }
+
+  /// isBlockByrefVariable - Return true if the variable was declared as
+  /// a "__block" variable (Apple Blocks).
+  bool isBlockByrefVariable() const { return getType().isBlockByrefStruct(); }
+
+  /// isInlinedFnArgument - Return true if this variable provides debugging
+  /// information for an inlined function arguments.
+  bool isInlinedFnArgument(const Function *CurFn);
+
+  void printExtendedName(raw_ostream &OS) const;
+};
+
+/// DILocation - This object holds location information. This object
+/// is not associated with any DWARF tag.
+class DILocation : public DIDescriptor {
+public:
+  explicit DILocation(const MDNode *N) : DIDescriptor(N) {}
+
+  unsigned getLineNumber() const { return getUnsignedField(0); }
+  unsigned getColumnNumber() const { return getUnsignedField(1); }
+  DIScope getScope() const { return getFieldAs<DIScope>(2); }
+  DILocation getOrigLocation() const { return getFieldAs<DILocation>(3); }
+  StringRef getFilename() const { return getScope().getFilename(); }
+  StringRef getDirectory() const { return getScope().getDirectory(); }
+  bool Verify() const;
+};
+
+class DIObjCProperty : public DIDescriptor {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DIObjCProperty(const MDNode *N) : DIDescriptor(N) {}
+
+  StringRef getObjCPropertyName() const { return getStringField(1); }
+  DIFile getFile() const { return getFieldAs<DIFile>(2); }
+  unsigned getLineNumber() const { return getUnsignedField(3); }
+
+  StringRef getObjCPropertyGetterName() const { return getStringField(4); }
+  StringRef getObjCPropertySetterName() const { return getStringField(5); }
+  bool isReadOnlyObjCProperty() const {
+    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_readonly) != 0;
+  }
+  bool isReadWriteObjCProperty() const {
+    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_readwrite) != 0;
+  }
+  bool isAssignObjCProperty() const {
+    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_assign) != 0;
+  }
+  bool isRetainObjCProperty() const {
+    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_retain) != 0;
+  }
+  bool isCopyObjCProperty() const {
+    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_copy) != 0;
+  }
+  bool isNonAtomicObjCProperty() const {
+    return (getUnsignedField(6) & dwarf::DW_APPLE_PROPERTY_nonatomic) != 0;
+  }
+
+  DIType getType() const { return getFieldAs<DIType>(7); }
+
+  /// Verify - Verify that a derived type descriptor is well formed.
+  bool Verify() const;
+};
+
+/// \brief An imported module (C++ using directive or similar).
+class DIImportedEntity : public DIDescriptor {
+  friend class DIDescriptor;
+  void printInternal(raw_ostream &OS) const;
+
+public:
+  explicit DIImportedEntity(const MDNode *N) : DIDescriptor(N) {}
+  DIScope getContext() const { return getFieldAs<DIScope>(1); }
+  DIDescriptor getEntity() const { return getFieldAs<DIDescriptor>(2); }
+  unsigned getLineNumber() const { return getUnsignedField(3); }
+  StringRef getName() const { return getStringField(4); }
+  bool Verify() const;
+};
+
+/// getDISubprogram - Find subprogram that is enclosing this scope.
+DISubprogram getDISubprogram(const MDNode *Scope);
+
+/// getDICompositeType - Find underlying composite type.
+DICompositeType getDICompositeType(DIType T);
+
+/// getOrInsertFnSpecificMDNode - Return a NameMDNode that is suitable
+/// to hold function specific information.
+NamedMDNode *getOrInsertFnSpecificMDNode(Module &M, DISubprogram SP);
+
+/// getFnSpecificMDNode - Return a NameMDNode, if available, that is
+/// suitable to hold function specific information.
+NamedMDNode *getFnSpecificMDNode(const Module &M, DISubprogram SP);
+
+/// createInlinedVariable - Create a new inlined variable based on current
+/// variable.
+/// @param DV            Current Variable.
+/// @param InlinedScope  Location at current variable is inlined.
+DIVariable createInlinedVariable(MDNode *DV, MDNode *InlinedScope,
+                                 LLVMContext &VMContext);
+
+/// cleanseInlinedVariable - Remove inlined scope from the variable.
+DIVariable cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext);
+
+/// Construct DITypeIdentifierMap by going through retained types of each CU.
+DITypeIdentifierMap generateDITypeIdentifierMap(const NamedMDNode *CU_Nodes);
+
+/// Strip debug info in the module if it exists.
+/// To do this, we remove all calls to the debugger intrinsics and any named
+/// metadata for debugging. We also remove debug locations for instructions.
+/// Return true if module is modified.
+bool StripDebugInfo(Module &M);
+
+/// Return Debug Info Metadata Version by checking module flags.
+unsigned getDebugMetadataVersionFromModule(const Module &M);
+
+/// DebugInfoFinder tries to list all debug info MDNodes used in a module. To
+/// list debug info MDNodes used by an instruction, DebugInfoFinder uses
+/// processDeclare, processValue and processLocation to handle DbgDeclareInst,
+/// DbgValueInst and DbgLoc attached to instructions. processModule will go
+/// through all DICompileUnits in llvm.dbg.cu and list debug info MDNodes
+/// used by the CUs.
+class DebugInfoFinder {
+public:
+  DebugInfoFinder() : TypeMapInitialized(false) {}
+
+  /// processModule - Process entire module and collect debug info
+  /// anchors.
+  void processModule(const Module &M);
+
+  /// processDeclare - Process DbgDeclareInst.
+  void processDeclare(const Module &M, const DbgDeclareInst *DDI);
+  /// Process DbgValueInst.
+  void processValue(const Module &M, const DbgValueInst *DVI);
+  /// processLocation - Process DILocation.
+  void processLocation(const Module &M, DILocation Loc);
+
+  /// Clear all lists.
+  void reset();
+
+private:
+  /// Initialize TypeIdentifierMap.
+  void InitializeTypeMap(const Module &M);
+
+  /// processType - Process DIType.
+  void processType(DIType DT);
+
+  /// processLexicalBlock - Process DILexicalBlock.
+  void processLexicalBlock(DILexicalBlock LB);
+
+  /// processSubprogram - Process DISubprogram.
+  void processSubprogram(DISubprogram SP);
+
+  void processScope(DIScope Scope);
+
+  /// addCompileUnit - Add compile unit into CUs.
+  bool addCompileUnit(DICompileUnit CU);
+
+  /// addGlobalVariable - Add global variable into GVs.
+  bool addGlobalVariable(DIGlobalVariable DIG);
+
+  // addSubprogram - Add subprogram into SPs.
+  bool addSubprogram(DISubprogram SP);
+
+  /// addType - Add type into Tys.
+  bool addType(DIType DT);
+
+  bool addScope(DIScope Scope);
+
+public:
+  typedef SmallVectorImpl<MDNode *>::const_iterator iterator;
+  iterator compile_unit_begin() const { return CUs.begin(); }
+  iterator compile_unit_end() const { return CUs.end(); }
+  iterator subprogram_begin() const { return SPs.begin(); }
+  iterator subprogram_end() const { return SPs.end(); }
+  iterator global_variable_begin() const { return GVs.begin(); }
+  iterator global_variable_end() const { return GVs.end(); }
+  iterator type_begin() const { return TYs.begin(); }
+  iterator type_end() const { return TYs.end(); }
+  iterator scope_begin() const { return Scopes.begin(); }
+  iterator scope_end() const { return Scopes.end(); }
+
+  unsigned compile_unit_count() const { return CUs.size(); }
+  unsigned global_variable_count() const { return GVs.size(); }
+  unsigned subprogram_count() const { return SPs.size(); }
+  unsigned type_count() const { return TYs.size(); }
+  unsigned scope_count() const { return Scopes.size(); }
+
+private:
+  SmallVector<MDNode *, 8> CUs;    // Compile Units
+  SmallVector<MDNode *, 8> SPs;    // Subprograms
+  SmallVector<MDNode *, 8> GVs;    // Global Variables;
+  SmallVector<MDNode *, 8> TYs;    // Types
+  SmallVector<MDNode *, 8> Scopes; // Scopes
+  SmallPtrSet<MDNode *, 64> NodesSeen;
+  DITypeIdentifierMap TypeIdentifierMap;
+  /// Specify if TypeIdentifierMap is initialized.
+  bool TypeMapInitialized;
+};
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h
index 4bb7c77..a1a4642 100644
--- a/include/llvm/DebugInfo/DIContext.h
+++ b/include/llvm/DebugInfo/DIContext.h
@@ -38,11 +38,10 @@ public:
   DILineInfo()
     : FileName("<invalid>"), FunctionName("<invalid>"),
       Line(0), Column(0) {}
-  DILineInfo(const SmallString<16> &fileName,
-             const SmallString<16> &functionName,
-             uint32_t line, uint32_t column)
-    : FileName(fileName), FunctionName(functionName),
-      Line(line), Column(column) {}
+  DILineInfo(StringRef fileName, StringRef functionName, uint32_t line,
+             uint32_t column)
+      : FileName(fileName), FunctionName(functionName), Line(line),
+        Column(column) {}
 
   const char *getFileName() { return FileName.c_str(); }
   const char *getFunctionName() { return FunctionName.c_str(); }
@@ -105,10 +104,14 @@ enum DIDumpType {
   DIDT_Frames,
   DIDT_Info,
   DIDT_InfoDwo,
+  DIDT_Types,
   DIDT_Line,
   DIDT_Loc,
   DIDT_Ranges,
   DIDT_Pubnames,
+  DIDT_Pubtypes,
+  DIDT_GnuPubnames,
+  DIDT_GnuPubtypes,
   DIDT_Str,
   DIDT_StrDwo,
   DIDT_StrOffsetsDwo
diff --git a/include/llvm/DebugInfo/DWARFFormValue.h b/include/llvm/DebugInfo/DWARFFormValue.h
index eaaccfb..533d259 100644
--- a/include/llvm/DebugInfo/DWARFFormValue.h
+++ b/include/llvm/DebugInfo/DWARFFormValue.h
@@ -10,15 +10,31 @@
 #ifndef LLVM_DEBUGINFO_DWARFFORMVALUE_H
 #define LLVM_DEBUGINFO_DWARFFORMVALUE_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/Support/DataExtractor.h"
 
 namespace llvm {
 
-class DWARFCompileUnit;
+class DWARFUnit;
 class raw_ostream;
 
 class DWARFFormValue {
 public:
+  enum FormClass {
+    FC_Unknown,
+    FC_Address,
+    FC_Block,
+    FC_Constant,
+    FC_String,
+    FC_Flag,
+    FC_Reference,
+    FC_Indirect,
+    FC_SectionOffset,
+    FC_Exprloc
+  };
+
+private:
   struct ValueType {
     ValueType() : data(NULL) {
       uval = 0;
@@ -32,49 +48,36 @@ public:
     const uint8_t* data;
   };
 
-  enum {
-    eValueTypeInvalid = 0,
-    eValueTypeUnsigned,
-    eValueTypeSigned,
-    eValueTypeCStr,
-    eValueTypeBlock
-  };
-
-private:
   uint16_t Form;   // Form for this value.
   ValueType Value; // Contains all data for the form.
 
 public:
-  DWARFFormValue(uint16_t form = 0) : Form(form) {}
+  DWARFFormValue(uint16_t Form = 0) : Form(Form) {}
   uint16_t getForm() const { return Form; }
-  const ValueType& value() const { return Value; }
-  void dump(raw_ostream &OS, const DWARFCompileUnit* cu) const;
+  bool isFormClass(FormClass FC) const;
+
+  void dump(raw_ostream &OS, const DWARFUnit *U) const;
   bool extractValue(DataExtractor data, uint32_t *offset_ptr,
-                    const DWARFCompileUnit *cu);
+                    const DWARFUnit *u);
   bool isInlinedCStr() const {
     return Value.data != NULL && Value.data == (const uint8_t*)Value.cstr;
   }
-  const uint8_t *BlockData() const;
-  uint64_t getReference(const DWARFCompileUnit* cu) const;
 
-  /// Resolve any compile unit specific references so that we don't need
-  /// the compile unit at a later time in order to work with the form
-  /// value.
-  bool resolveCompileUnitReferences(const DWARFCompileUnit* cu);
-  uint64_t getUnsigned() const { return Value.uval; }
-  int64_t getSigned() const { return Value.sval; }
-  const char *getAsCString(const DataExtractor *debug_str_data_ptr) const;
-  const char *getIndirectCString(const DataExtractor *,
-                                 const DataExtractor *) const;
-  uint64_t getIndirectAddress(const DataExtractor *,
-                              const DWARFCompileUnit *) const;
+  /// getAsFoo functions below return the extracted value as Foo if only
+  /// DWARFFormValue has form class is suitable for representing Foo.
+  Optional<uint64_t> getAsReference(const DWARFUnit *U) const;
+  Optional<uint64_t> getAsUnsignedConstant() const;
+  Optional<const char *> getAsCString(const DWARFUnit *U) const;
+  Optional<uint64_t> getAsAddress(const DWARFUnit *U) const;
+  Optional<uint64_t> getAsSectionOffset() const;
+
   bool skipValue(DataExtractor debug_info_data, uint32_t *offset_ptr,
-                 const DWARFCompileUnit *cu) const;
+                 const DWARFUnit *u) const;
   static bool skipValue(uint16_t form, DataExtractor debug_info_data,
-                        uint32_t *offset_ptr, const DWARFCompileUnit *cu);
-  static bool isBlockForm(uint16_t form);
-  static bool isDataForm(uint16_t form);
-  static const uint8_t *getFixedFormSizes(uint8_t AddrSize, uint16_t Version);
+                        uint32_t *offset_ptr, const DWARFUnit *u);
+
+  static ArrayRef<uint8_t> getFixedFormSizes(uint8_t AddrSize,
+                                             uint16_t Version);
 };
 
 }
diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index ceac298..233084d 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -16,7 +16,6 @@
 #define LLVM_EXECUTIONENGINE_EXECUTIONENGINE_H
 
 #include "llvm-c/ExecutionEngine.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/ValueMap.h"
@@ -153,17 +152,8 @@ protected:
   /// abort.
   void *(*LazyFunctionCreator)(const std::string &);
 
-  /// ExceptionTableRegister - If Exception Handling is set, the JIT will
-  /// register dwarf tables with this function.
-  typedef void (*EERegisterFn)(void*);
-  EERegisterFn ExceptionTableRegister;
-  EERegisterFn ExceptionTableDeregister;
-  /// This maps functions to their exception tables frames.
-  DenseMap<const Function*, void*> AllExceptionTables;
-
-
 public:
-  /// lock - This lock protects the ExecutionEngine, JIT, JITResolver and
+  /// lock - This lock protects the ExecutionEngine, MCJIT, JIT, JITResolver and
   /// JITEmitter classes.  It must be held while changing the internal state of
   /// any of those classes.
   sys::Mutex lock;
@@ -225,7 +215,7 @@ public:
   /// FindFunctionNamed - Search all of the active modules to find the one that
   /// defines FnName.  This is very slow operation and shouldn't be used for
   /// general code.
-  Function *FindFunctionNamed(const char *FnName);
+  virtual Function *FindFunctionNamed(const char *FnName);
 
   /// runFunction - Execute the specified function with the specified arguments,
   /// and return the result.
@@ -240,6 +230,11 @@ public:
   /// found, this function silently returns a null pointer. Otherwise,
   /// it prints a message to stderr and aborts.
   ///
+  /// This function is deprecated for the MCJIT execution engine.
+  ///
+  /// FIXME: the JIT and MCJIT interfaces should be disentangled or united 
+  /// again, if possible.
+  ///
   virtual void *getPointerToNamedFunction(const std::string &Name,
                                           bool AbortOnFailure = true) = 0;
 
@@ -252,6 +247,23 @@ public:
                      "EE!");
   }
 
+  /// generateCodeForModule - Run code generationen for the specified module and
+  /// load it into memory.
+  ///
+  /// When this function has completed, all code and data for the specified
+  /// module, and any module on which this module depends, will be generated
+  /// and loaded into memory, but relocations will not yet have been applied
+  /// and all memory will be readable and writable but not executable.
+  ///
+  /// This function is primarily useful when generating code for an external
+  /// target, allowing the client an opportunity to remap section addresses
+  /// before relocations are applied.  Clients that intend to execute code
+  /// locally can use the getFunctionAddress call, which will generate code
+  /// and apply final preparations all in one step.
+  ///
+  /// This method has no effect for the legacy JIT engine or the interpeter.
+  virtual void generateCodeForModule(Module *M) {}
+
   /// finalizeObject - ensure the module is fully processed and is usable.
   ///
   /// It is the user-level function for completing the process of making the
@@ -266,7 +278,7 @@ public:
   /// the static constructors or destructors for a program.
   ///
   /// \param isDtors - Run the destructors instead of constructors.
-  void runStaticConstructorsDestructors(bool isDtors);
+  virtual void runStaticConstructorsDestructors(bool isDtors);
 
   /// runStaticConstructorsDestructors - This method is used to execute all of
   /// the static constructors or destructors for a particular module.
@@ -307,10 +319,16 @@ public:
   /// getPointerToGlobalIfAvailable - This returns the address of the specified
   /// global value if it is has already been codegen'd, otherwise it returns
   /// null.
+  ///
+  /// This function is deprecated for the MCJIT execution engine.  It doesn't
+  /// seem to be needed in that case, but an equivalent can be added if it is.
   void *getPointerToGlobalIfAvailable(const GlobalValue *GV);
 
   /// getPointerToGlobal - This returns the address of the specified global
   /// value. This may involve code generation if it's a function.
+  ///
+  /// This function is deprecated for the MCJIT execution engine.  Use
+  /// getGlobalValueAddress instead.
   void *getPointerToGlobal(const GlobalValue *GV);
 
   /// getPointerToFunction - The different EE's represent function bodies in
@@ -318,22 +336,48 @@ public:
   /// pointer should look like.  When F is destroyed, the ExecutionEngine will
   /// remove its global mapping and free any machine code.  Be sure no threads
   /// are running inside F when that happens.
+  ///
+  /// This function is deprecated for the MCJIT execution engine.  Use
+  /// getFunctionAddress instead.
   virtual void *getPointerToFunction(Function *F) = 0;
 
   /// getPointerToBasicBlock - The different EE's represent basic blocks in
   /// different ways.  Return the representation for a blockaddress of the
   /// specified block.
+  ///
+  /// This function will not be implemented for the MCJIT execution engine.
   virtual void *getPointerToBasicBlock(BasicBlock *BB) = 0;
 
   /// getPointerToFunctionOrStub - If the specified function has been
   /// code-gen'd, return a pointer to the function.  If not, compile it, or use
   /// a stub to implement lazy compilation if available.  See
   /// getPointerToFunction for the requirements on destroying F.
+  ///
+  /// This function is deprecated for the MCJIT execution engine.  Use
+  /// getFunctionAddress instead.
   virtual void *getPointerToFunctionOrStub(Function *F) {
     // Default implementation, just codegen the function.
     return getPointerToFunction(F);
   }
 
+  /// getGlobalValueAddress - Return the address of the specified global
+  /// value. This may involve code generation.
+  ///
+  /// This function should not be called with the JIT or interpreter engines.
+  virtual uint64_t getGlobalValueAddress(const std::string &Name) {
+    // Default implementation for JIT and interpreter.  MCJIT will override this.
+    // JIT and interpreter clients should use getPointerToGlobal instead.
+    return 0;
+  }
+
+  /// getFunctionAddress - Return the address of the specified function.
+  /// This may involve code generation.
+  virtual uint64_t getFunctionAddress(const std::string &Name) {
+    // Default implementation for JIT and interpreter.  MCJIT will override this.
+    // JIT and interpreter clients should use getPointerToFunction instead.
+    return 0;
+  }
+
   // The JIT overrides a version that actually does this.
   virtual void runJITOnFunction(Function *, MachineCodeInfo * = 0) { }
 
@@ -366,6 +410,9 @@ public:
   /// getOrEmitGlobalVariable - Return the address of the specified global
   /// variable, possibly emitting it to memory if needed.  This is used by the
   /// Emitter.
+  ///
+  /// This function is deprecated for the MCJIT execution engine.  Use
+  /// getGlobalValueAddress instead.
   virtual void *getOrEmitGlobalVariable(const GlobalVariable *GV) {
     return getPointerToGlobal((const GlobalValue *)GV);
   }
@@ -435,41 +482,6 @@ public:
     LazyFunctionCreator = P;
   }
 
-  /// InstallExceptionTableRegister - The JIT will use the given function
-  /// to register the exception tables it generates.
-  void InstallExceptionTableRegister(EERegisterFn F) {
-    ExceptionTableRegister = F;
-  }
-  void InstallExceptionTableDeregister(EERegisterFn F) {
-    ExceptionTableDeregister = F;
-  }
-
-  /// RegisterTable - Registers the given pointer as an exception table.  It
-  /// uses the ExceptionTableRegister function.
-  void RegisterTable(const Function *fn, void* res) {
-    if (ExceptionTableRegister) {
-      ExceptionTableRegister(res);
-      AllExceptionTables[fn] = res;
-    }
-  }
-
-  /// DeregisterTable - Deregisters the exception frame previously registered
-  /// for the given function.
-  void DeregisterTable(const Function *Fn) {
-    if (ExceptionTableDeregister) {
-      DenseMap<const Function*, void*>::iterator frame =
-        AllExceptionTables.find(Fn);
-      if(frame != AllExceptionTables.end()) {
-        ExceptionTableDeregister(frame->second);
-        AllExceptionTables.erase(frame);
-      }
-    }
-  }
-
-  /// DeregisterAllTables - Deregisters all previously registered pointers to an
-  /// exception tables.  It uses the ExceptionTableoDeregister function.
-  void DeregisterAllTables();
-
 protected:
   explicit ExecutionEngine(Module *M);
 
diff --git a/include/llvm/ExecutionEngine/ObjectBuffer.h b/include/llvm/ExecutionEngine/ObjectBuffer.h
index 32de404..af2a926 100644
--- a/include/llvm/ExecutionEngine/ObjectBuffer.h
+++ b/include/llvm/ExecutionEngine/ObjectBuffer.h
@@ -30,6 +30,7 @@ namespace llvm {
 /// ObjectFile) as needed, but the MemoryBuffer instance returned does not own the
 /// actual memory it points to.
 class ObjectBuffer {
+  virtual void anchor();
 public:
   ObjectBuffer() {}
   ObjectBuffer(MemoryBuffer* Buf) : Buffer(Buf) {}
@@ -56,6 +57,7 @@ protected:
 /// while providing a common ObjectBuffer interface for access to the
 /// memory once the object has been generated.
 class ObjectBufferStream : public ObjectBuffer {
+  virtual void anchor();
 public:
   ObjectBufferStream() : OS(SV) {}
   virtual ~ObjectBufferStream() {}
diff --git a/include/llvm/ExecutionEngine/ObjectCache.h b/include/llvm/ExecutionEngine/ObjectCache.h
index aa200fb..d1849df 100644
--- a/include/llvm/ExecutionEngine/ObjectCache.h
+++ b/include/llvm/ExecutionEngine/ObjectCache.h
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_EXECUTIONENGINE_OBJECTCACHE_H
-#define LLVM_LIB_EXECUTIONENGINE_OBJECTCACHE_H
+#ifndef LLVM_EXECUTIONENGINE_OBJECTCACHE_H
+#define LLVM_EXECUTIONENGINE_OBJECTCACHE_H
 
 #include "llvm/Support/MemoryBuffer.h"
 
@@ -20,6 +20,7 @@ class Module;
 /// ExecutionEngine for the purpose of avoiding compilation for Modules that
 /// have already been compiled and an object file is available.
 class ObjectCache {
+  virtual void anchor();
 public:
   ObjectCache() { }
 
diff --git a/include/llvm/ExecutionEngine/ObjectImage.h b/include/llvm/ExecutionEngine/ObjectImage.h
index 9fddca7..076f4b1 100644
--- a/include/llvm/ExecutionEngine/ObjectImage.h
+++ b/include/llvm/ExecutionEngine/ObjectImage.h
@@ -25,6 +25,7 @@ namespace llvm {
 class ObjectImage {
   ObjectImage() LLVM_DELETED_FUNCTION;
   ObjectImage(const ObjectImage &other) LLVM_DELETED_FUNCTION;
+  virtual void anchor();
 
 protected:
   OwningPtr<ObjectBuffer> Buffer;
diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
index 99e2594..3ad2e50 100644
--- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
+++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h
@@ -21,6 +21,9 @@
 
 namespace llvm {
 
+class ExecutionEngine;
+class ObjectImage;
+
 // RuntimeDyld clients often want to handle the memory management of
 // what gets placed where. For JIT clients, this is the subset of
 // JITMemoryManager required for dynamic loading of binaries.
@@ -38,17 +41,30 @@ public:
   /// executable code. The SectionID is a unique identifier assigned by the JIT
   /// engine, and optionally recorded by the memory manager to access a loaded
   /// section.
-  virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
-                                       unsigned SectionID) = 0;
+  virtual uint8_t *allocateCodeSection(
+    uintptr_t Size, unsigned Alignment, unsigned SectionID,
+    StringRef SectionName) = 0;
 
   /// Allocate a memory block of (at least) the given size suitable for data.
   /// The SectionID is a unique identifier assigned by the JIT engine, and
   /// optionally recorded by the memory manager to access a loaded section.
-  virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                       unsigned SectionID, bool IsReadOnly) = 0;
+  virtual uint8_t *allocateDataSection(
+    uintptr_t Size, unsigned Alignment, unsigned SectionID,
+    StringRef SectionName, bool IsReadOnly) = 0;
 
   /// Register the EH frames with the runtime so that c++ exceptions work.
-  virtual void registerEHFrames(StringRef SectionData);
+  ///
+  /// \p Addr parameter provides the local address of the EH frame section
+  /// data, while \p LoadAddr provides the address of the data in the target
+  /// address space.  If the section has not been remapped (which will usually
+  /// be the case for local execution) these two values will be the same.
+  virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size);
+
+  virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size);
+
+  /// This method returns the address of the specified function or variable.
+  /// It is used to resolve symbols during module linking.
+  virtual uint64_t getSymbolAddress(const std::string &Name);
 
   /// This method returns the address of the specified function. As such it is
   /// only useful for resolving library symbols, not code generated symbols.
@@ -56,9 +72,25 @@ public:
   /// If \p AbortOnFailure is false and no function with the given name is
   /// found, this function returns a null pointer. Otherwise, it prints a
   /// message to stderr and aborts.
+  ///
+  /// This function is deprecated for memory managers to be used with
+  /// MCJIT or RuntimeDyld.  Use getSymbolAddress instead.
   virtual void *getPointerToNamedFunction(const std::string &Name,
                                           bool AbortOnFailure = true);
 
+  /// This method is called after an object has been loaded into memory but
+  /// before relocations are applied to the loaded sections.  The object load
+  /// may have been initiated by MCJIT to resolve an external symbol for another
+  /// object that is being finalized.  In that case, the object about which
+  /// the memory manager is being notified will be finalized immediately after
+  /// the memory manager returns from this call.
+  ///
+  /// Memory managers which are preparing code for execution in an external
+  /// address space can use this call to remap the section addresses for the
+  /// newly loaded object.
+  virtual void notifyObjectLoaded(ExecutionEngine *EE,
+                                  const ObjectImage *) {}
+
   /// This method is called when object loading is complete and section page
   /// permissions can be applied.  It is up to the memory manager implementation
   /// to decide whether or not to act on this method.  The memory manager will
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 1a57317..b832438 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -64,9 +64,16 @@ public:
   /// This is the address which will be used for relocation resolution.
   void mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress);
 
-  StringRef getErrorString();
+  /// Register any EH frame sections that have been loaded but not previously
+  /// registered with the memory manager.  Note, RuntimeDyld is responsible
+  /// for identifying the EH frame and calling the memory manager with the
+  /// EH frame section data.  However, the memory manager itself will handle
+  /// the actual target-specific EH frame registration.
+  void registerEHFrames();
+
+  void deregisterEHFrames();
 
-  StringRef getEHFrameSection();
+  StringRef getErrorString();
 };
 
 } // end namespace llvm
diff --git a/include/llvm/ExecutionEngine/SectionMemoryManager.h b/include/llvm/ExecutionEngine/SectionMemoryManager.h
index 6ee2a2a..fd6e41f 100644
--- a/include/llvm/ExecutionEngine/SectionMemoryManager.h
+++ b/include/llvm/ExecutionEngine/SectionMemoryManager.h
@@ -49,7 +49,8 @@ public:
   /// The value of \p Alignment must be a power of two.  If \p Alignment is zero
   /// a default alignment of 16 will be used.
   virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
-                                       unsigned SectionID);
+                                       unsigned SectionID,
+                                       StringRef SectionName);
 
   /// \brief Allocates a memory block of (at least) the given size suitable for
   /// executable code.
@@ -58,6 +59,7 @@ public:
   /// a default alignment of 16 will be used.
   virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
                                        unsigned SectionID,
+                                       StringRef SectionName,
                                        bool isReadOnly);
 
   /// \brief Update section-specific memory permissions and other attributes.
diff --git a/include/llvm/GVMaterializer.h b/include/llvm/GVMaterializer.h
index 1e5c426..8efe50a 100644
--- a/include/llvm/GVMaterializer.h
+++ b/include/llvm/GVMaterializer.h
@@ -18,7 +18,7 @@
 #ifndef LLVM_GVMATERIALIZER_H
 #define LLVM_GVMATERIALIZER_H
 
-#include <string>
+#include "llvm/Support/system_error.h"
 
 namespace llvm {
 
@@ -41,11 +41,9 @@ public:
   /// dematerialized back to whatever backing store this GVMaterializer uses.
   virtual bool isDematerializable(const GlobalValue *GV) const = 0;
 
-  /// Materialize - make sure the given GlobalValue is fully read.  If the
-  /// module is corrupt, this returns true and fills in the optional string with
-  /// information about the problem.  If successful, this returns false.
+  /// Materialize - make sure the given GlobalValue is fully read.
   ///
-  virtual bool Materialize(GlobalValue *GV, std::string *ErrInfo = 0) = 0;
+  virtual error_code Materialize(GlobalValue *GV) = 0;
 
   /// Dematerialize - If the given GlobalValue is read in, and if the
   /// GVMaterializer supports it, release the memory for the GV, and set it up
@@ -55,10 +53,8 @@ public:
   virtual void Dematerialize(GlobalValue *) {}
 
   /// MaterializeModule - make sure the entire Module has been completely read.
-  /// On error, this returns true and fills in the optional string with
-  /// information about the problem.  If successful, this returns false.
   ///
-  virtual bool MaterializeModule(Module *M, std::string *ErrInfo = 0) = 0;
+  virtual error_code MaterializeModule(Module *M) = 0;
 };
 
 } // End llvm namespace
diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h
index 2183758..c23ba0f 100644
--- a/include/llvm/IR/Attributes.h
+++ b/include/llvm/IR/Attributes.h
@@ -18,6 +18,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/FoldingSet.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/PointerLikeTypeTraits.h"
 #include <bitset>
 #include <cassert>
@@ -88,6 +89,7 @@ public:
     NoReturn,              ///< Mark the function as not returning
     NoUnwind,              ///< Function doesn't unwind stack
     OptimizeForSize,       ///< opt_size
+    OptimizeNone,          ///< Function must not be optimized.
     ReadNone,              ///< Function does not access memory
     ReadOnly,              ///< Function only reads from memory
     Returned,              ///< Return value is always equal to this argument
@@ -199,7 +201,7 @@ public:
 /// index `1'.
 class AttributeSet {
 public:
-  enum AttrIndex {
+  enum AttrIndex LLVM_ENUM_INT_TYPE(unsigned) {
     ReturnIndex = 0U,
     FunctionIndex = ~0U
   };
diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h
index 585fc5e..4437af2 100644
--- a/include/llvm/IR/CallingConv.h
+++ b/include/llvm/IR/CallingConv.h
@@ -51,6 +51,13 @@ namespace CallingConv {
     // (HiPE).
     HiPE = 11,
 
+    // WebKit JS - Calling convention for stack based JavaScript calls
+    WebKit_JS = 12,
+
+    // AnyReg - Calling convention for dynamic register based calls (e.g.
+    // stackmap and patchpoint intrinsics).
+    AnyReg = 13,
+
     // Target - This is the start of the target-specific calling conventions,
     // e.g. fastcall and thiscall on X86.
     FirstTargetCC = 64,
diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h
index 99aed0d..dac20c9 100644
--- a/include/llvm/IR/Constants.h
+++ b/include/llvm/IR/Constants.h
@@ -862,6 +862,7 @@ public:
   static Constant *getPtrToInt(Constant *C, Type *Ty);
   static Constant *getIntToPtr(Constant *C, Type *Ty);
   static Constant *getBitCast (Constant *C, Type *Ty);
+  static Constant *getAddrSpaceCast(Constant *C, Type *Ty);
 
   static Constant *getNSWNeg(Constant *C) { return getNeg(C, false, true); }
   static Constant *getNUWNeg(Constant *C) { return getNeg(C, true, false); }
@@ -942,12 +943,20 @@ public:
     Type *Ty ///< The type to trunc or bitcast C to
   );
 
-  /// @brief Create a BitCast or a PtrToInt cast constant expression
+  /// @brief Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant
+  /// expression.
   static Constant *getPointerCast(
     Constant *C,   ///< The pointer value to be casted (operand 0)
     Type *Ty ///< The type to which cast should be made
   );
 
+  /// @brief Create a BitCast or AddrSpaceCast for a pointer type depending on
+  /// the address space.
+  static Constant *getPointerBitCastOrAddrSpaceCast(
+    Constant *C,   ///< The constant to addrspacecast or bitcast
+    Type *Ty ///< The type to bitcast or addrspacecast C to
+  );
+
   /// @brief Create a ZExt, Bitcast or Trunc for integer -> integer casts
   static Constant *getIntegerCast(
     Constant *C,    ///< The integer constant to be casted
@@ -1079,8 +1088,8 @@ public:
   /// as this ConstantExpr. The instruction is not linked to any basic block.
   ///
   /// A better approach to this could be to have a constructor for Instruction
-  /// which would take a ConstantExpr parameter, but that would have spread 
-  /// implementation details of ConstantExpr outside of Constants.cpp, which 
+  /// which would take a ConstantExpr parameter, but that would have spread
+  /// implementation details of ConstantExpr outside of Constants.cpp, which
   /// would make it harder to remove ConstantExprs altogether.
   Instruction *getAsInstruction();
 
diff --git a/include/llvm/IR/DataLayout.h b/include/llvm/IR/DataLayout.h
index 269edeb..10630a2 100644
--- a/include/llvm/IR/DataLayout.h
+++ b/include/llvm/IR/DataLayout.h
@@ -369,6 +369,17 @@ public:
   /// least as big as Width bits.
   Type *getSmallestLegalIntType(LLVMContext &C, unsigned Width = 0) const;
 
+  /// getLargestLegalIntType - Return the largest legal integer type, or null if
+  /// none are set.
+  Type *getLargestLegalIntType(LLVMContext &C) const {
+    unsigned LargestSize = getLargestLegalIntTypeSize();
+    return (LargestSize == 0) ? 0 : Type::getIntNTy(C, LargestSize);
+  }
+
+  /// getLargestLegalIntType - Return the size of largest legal integer type
+  /// size, or 0 if none are set.
+  unsigned getLargestLegalIntTypeSize() const;
+
   /// getIndexedOffset - return the offset from the beginning of the type for
   /// the specified indices.  This is used to implement getelementptr.
   uint64_t getIndexedOffset(Type *Ty, ArrayRef<Value *> Indices) const;
@@ -451,7 +462,7 @@ inline uint64_t DataLayout::getTypeSizeInBits(Type *Ty) const {
   case Type::LabelTyID:
     return getPointerSizeInBits(0);
   case Type::PointerTyID:
-    return getPointerSizeInBits(cast<PointerType>(Ty)->getAddressSpace());
+    return getPointerSizeInBits(Ty->getPointerAddressSpace());
   case Type::ArrayTyID: {
     ArrayType *ATy = cast<ArrayType>(Ty);
     return ATy->getNumElements() *
@@ -461,7 +472,7 @@ inline uint64_t DataLayout::getTypeSizeInBits(Type *Ty) const {
     // Get the layout annotation... which is lazily created on demand.
     return getStructLayout(cast<StructType>(Ty))->getSizeInBits();
   case Type::IntegerTyID:
-    return cast<IntegerType>(Ty)->getBitWidth();
+    return Ty->getIntegerBitWidth();
   case Type::HalfTyID:
     return 16;
   case Type::FloatTyID:
diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index 0e51c6f..bba7ecd 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h
@@ -159,11 +159,11 @@ public:
   /// calling convention of this function.  The enum values for the known
   /// calling conventions are defined in CallingConv.h.
   CallingConv::ID getCallingConv() const {
-    return static_cast<CallingConv::ID>(getSubclassDataFromValue() >> 1);
+    return static_cast<CallingConv::ID>(getSubclassDataFromValue() >> 2);
   }
   void setCallingConv(CallingConv::ID CC) {
-    setValueSubclassData((getSubclassDataFromValue() & 1) |
-                         (static_cast<unsigned>(CC) << 1));
+    setValueSubclassData((getSubclassDataFromValue() & 3) |
+                         (static_cast<unsigned>(CC) << 2));
   }
 
   /// @brief Return the attribute list for this Function.
@@ -427,6 +427,13 @@ public:
   size_t arg_size() const;
   bool arg_empty() const;
 
+  bool hasPrefixData() const {
+    return getSubclassDataFromValue() & 2;
+  }
+
+  Constant *getPrefixData() const;
+  void setPrefixData(Constant *PrefixData);
+
   /// viewCFG - This function is meant for use from the debugger.  You can just
   /// say 'call F->viewCFG()' and a ghostview window should pop up from the
   /// program, displaying the CFG of the current function with the code for each
diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h
index 883814a..fec61a7 100644
--- a/include/llvm/IR/GlobalAlias.h
+++ b/include/llvm/IR/GlobalAlias.h
@@ -66,14 +66,25 @@ public:
   }
   /// getAliasedGlobal() - Aliasee can be either global or bitcast of
   /// global. This method retrives the global for both aliasee flavours.
-  const GlobalValue *getAliasedGlobal() const;
+  GlobalValue *getAliasedGlobal();
+  const GlobalValue *getAliasedGlobal() const {
+    return const_cast<GlobalAlias *>(this)->getAliasedGlobal();
+  }
 
   /// resolveAliasedGlobal() - This method tries to ultimately resolve the alias
   /// by going through the aliasing chain and trying to find the very last
   /// global. Returns NULL if a cycle was found. If stopOnWeak is false, then
   /// the whole chain aliasing chain is traversed, otherwise - only strong
   /// aliases.
-  const GlobalValue *resolveAliasedGlobal(bool stopOnWeak = true) const;
+  GlobalValue *resolveAliasedGlobal(bool stopOnWeak = true);
+  const GlobalValue *resolveAliasedGlobal(bool stopOnWeak = true) const {
+    return const_cast<GlobalAlias *>(this)->resolveAliasedGlobal(stopOnWeak);
+  }
+
+  static bool isValidLinkage(LinkageTypes L) {
+    return isExternalLinkage(L) || isLocalLinkage(L) ||
+      isWeakLinkage(L) || isLinkOnceLinkage(L);
+  }
 
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static inline bool classof(const Value *V) {
diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h
index 1dc99cf..4f20a31 100644
--- a/include/llvm/IR/GlobalValue.h
+++ b/include/llvm/IR/GlobalValue.h
@@ -35,7 +35,6 @@ public:
     AvailableExternallyLinkage, ///< Available for inspection, not emission.
     LinkOnceAnyLinkage, ///< Keep one copy of function when linking (inline)
     LinkOnceODRLinkage, ///< Same, but only replaced by something equivalent.
-    LinkOnceODRAutoHideLinkage, ///< Like LinkOnceODRLinkage but addr not taken.
     WeakAnyLinkage,     ///< Keep one copy of named function when linking (weak)
     WeakODRLinkage,     ///< Same, but only replaced by something equivalent.
     AppendingLinkage,   ///< Special purpose, only applies to global arrays
@@ -123,12 +122,7 @@ public:
     return Linkage == AvailableExternallyLinkage;
   }
   static bool isLinkOnceLinkage(LinkageTypes Linkage) {
-    return Linkage == LinkOnceAnyLinkage ||
-           Linkage == LinkOnceODRLinkage ||
-           Linkage == LinkOnceODRAutoHideLinkage;
-  }
-  static bool isLinkOnceODRAutoHideLinkage(LinkageTypes Linkage) {
-    return Linkage == LinkOnceODRAutoHideLinkage;
+    return Linkage == LinkOnceAnyLinkage || Linkage == LinkOnceODRLinkage;
   }
   static bool isWeakLinkage(LinkageTypes Linkage) {
     return Linkage == WeakAnyLinkage || Linkage == WeakODRLinkage;
@@ -192,7 +186,6 @@ public:
            Linkage == WeakODRLinkage ||
            Linkage == LinkOnceAnyLinkage ||
            Linkage == LinkOnceODRLinkage ||
-           Linkage == LinkOnceODRAutoHideLinkage ||
            Linkage == CommonLinkage ||
            Linkage == ExternalWeakLinkage ||
            Linkage == LinkerPrivateWeakLinkage;
@@ -205,9 +198,6 @@ public:
   bool hasLinkOnceLinkage() const {
     return isLinkOnceLinkage(Linkage);
   }
-  bool hasLinkOnceODRAutoHideLinkage() const {
-    return isLinkOnceODRAutoHideLinkage(Linkage);
-  }
   bool hasWeakLinkage() const {
     return isWeakLinkage(Linkage);
   }
diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h
index bfed507..660092d 100644
--- a/include/llvm/IR/GlobalVariable.h
+++ b/include/llvm/IR/GlobalVariable.h
@@ -84,9 +84,7 @@ public:
   /// Provide fast operand accessors
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
 
-  /// hasInitializer - Unless a global variable isExternal(), it has an
-  /// initializer.  The initializer for the global variable/constant is held by
-  /// Initializer if an initializer is specified.
+  /// Definitions have initializers, declarations don't.
   ///
   inline bool hasInitializer() const { return !isDeclaration(); }
 
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index e396e71..8d1432d 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -25,6 +25,7 @@
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/CBindingWrapping.h"
 #include "llvm/Support/ConstantFolder.h"
+#include "llvm/Support/ValueHandle.h"
 
 namespace llvm {
   class MDNode;
@@ -52,10 +53,13 @@ protected:
   BasicBlock *BB;
   BasicBlock::iterator InsertPt;
   LLVMContext &Context;
+
+  MDNode *DefaultFPMathTag;
+  FastMathFlags FMF;
 public:
 
-  IRBuilderBase(LLVMContext &context)
-    : Context(context) {
+  IRBuilderBase(LLVMContext &context, MDNode *FPMathTag = 0)
+    : Context(context), DefaultFPMathTag(FPMathTag), FMF() {
     ClearInsertionPoint();
   }
 
@@ -169,6 +173,68 @@ public:
       ClearInsertionPoint();
   }
 
+  /// \brief Get the floating point math metadata being used.
+  MDNode *getDefaultFPMathTag() const { return DefaultFPMathTag; }
+
+  /// \brief Get the flags to be applied to created floating point ops
+  FastMathFlags getFastMathFlags() const { return FMF; }
+
+  /// \brief Clear the fast-math flags.
+  void clearFastMathFlags() { FMF.clear(); }
+
+  /// \brief Set the floating point math metadata to be used.
+  void SetDefaultFPMathTag(MDNode *FPMathTag) { DefaultFPMathTag = FPMathTag; }
+
+  /// \brief Set the fast-math flags to be used with generated fp-math operators
+  void SetFastMathFlags(FastMathFlags NewFMF) { FMF = NewFMF; }
+
+  //===--------------------------------------------------------------------===//
+  // RAII helpers.
+  //===--------------------------------------------------------------------===//
+
+  // \brief RAII object that stores the current insertion point and restores it
+  // when the object is destroyed. This includes the debug location.
+  class InsertPointGuard {
+    IRBuilderBase &Builder;
+    AssertingVH<BasicBlock> Block;
+    BasicBlock::iterator Point;
+    DebugLoc DbgLoc;
+
+    InsertPointGuard(const InsertPointGuard &) LLVM_DELETED_FUNCTION;
+    InsertPointGuard &operator=(const InsertPointGuard &) LLVM_DELETED_FUNCTION;
+
+  public:
+    InsertPointGuard(IRBuilderBase &B)
+        : Builder(B), Block(B.GetInsertBlock()), Point(B.GetInsertPoint()),
+          DbgLoc(B.getCurrentDebugLocation()) {}
+
+    ~InsertPointGuard() {
+      Builder.restoreIP(InsertPoint(Block, Point));
+      Builder.SetCurrentDebugLocation(DbgLoc);
+    }
+  };
+
+  // \brief RAII object that stores the current fast math settings and restores
+  // them when the object is destroyed.
+  class FastMathFlagGuard {
+    IRBuilderBase &Builder;
+    FastMathFlags FMF;
+    MDNode *FPMathTag;
+
+    FastMathFlagGuard(const FastMathFlagGuard &) LLVM_DELETED_FUNCTION;
+    FastMathFlagGuard &operator=(
+        const FastMathFlagGuard &) LLVM_DELETED_FUNCTION;
+
+  public:
+    FastMathFlagGuard(IRBuilderBase &B)
+        : Builder(B), FMF(B.FMF), FPMathTag(B.DefaultFPMathTag) {}
+
+    ~FastMathFlagGuard() {
+      Builder.FMF = FMF;
+      Builder.DefaultFPMathTag = FPMathTag;
+    }
+  };
+
   //===--------------------------------------------------------------------===//
   // Miscellaneous creation methods.
   //===--------------------------------------------------------------------===//
@@ -354,76 +420,52 @@ template<bool preserveNames = true, typename T = ConstantFolder,
          typename Inserter = IRBuilderDefaultInserter<preserveNames> >
 class IRBuilder : public IRBuilderBase, public Inserter {
   T Folder;
-  MDNode *DefaultFPMathTag;
-  FastMathFlags FMF;
 public:
   IRBuilder(LLVMContext &C, const T &F, const Inserter &I = Inserter(),
             MDNode *FPMathTag = 0)
-    : IRBuilderBase(C), Inserter(I), Folder(F), DefaultFPMathTag(FPMathTag),
-      FMF() {
+    : IRBuilderBase(C, FPMathTag), Inserter(I), Folder(F) {
   }
 
   explicit IRBuilder(LLVMContext &C, MDNode *FPMathTag = 0)
-    : IRBuilderBase(C), Folder(), DefaultFPMathTag(FPMathTag), FMF() {
+    : IRBuilderBase(C, FPMathTag), Folder() {
   }
 
   explicit IRBuilder(BasicBlock *TheBB, const T &F, MDNode *FPMathTag = 0)
-    : IRBuilderBase(TheBB->getContext()), Folder(F),
-      DefaultFPMathTag(FPMathTag), FMF() {
+    : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder(F) {
     SetInsertPoint(TheBB);
   }
 
   explicit IRBuilder(BasicBlock *TheBB, MDNode *FPMathTag = 0)
-    : IRBuilderBase(TheBB->getContext()), Folder(),
-      DefaultFPMathTag(FPMathTag), FMF() {
+    : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder() {
     SetInsertPoint(TheBB);
   }
 
   explicit IRBuilder(Instruction *IP, MDNode *FPMathTag = 0)
-    : IRBuilderBase(IP->getContext()), Folder(), DefaultFPMathTag(FPMathTag),
-      FMF() {
+    : IRBuilderBase(IP->getContext(), FPMathTag), Folder() {
     SetInsertPoint(IP);
     SetCurrentDebugLocation(IP->getDebugLoc());
   }
 
   explicit IRBuilder(Use &U, MDNode *FPMathTag = 0)
-    : IRBuilderBase(U->getContext()), Folder(), DefaultFPMathTag(FPMathTag),
-      FMF() {
+    : IRBuilderBase(U->getContext(), FPMathTag), Folder() {
     SetInsertPoint(U);
     SetCurrentDebugLocation(cast<Instruction>(U.getUser())->getDebugLoc());
   }
 
   IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP, const T& F,
             MDNode *FPMathTag = 0)
-    : IRBuilderBase(TheBB->getContext()), Folder(F),
-      DefaultFPMathTag(FPMathTag), FMF() {
+    : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder(F) {
     SetInsertPoint(TheBB, IP);
   }
 
   IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP, MDNode *FPMathTag = 0)
-    : IRBuilderBase(TheBB->getContext()), Folder(),
-      DefaultFPMathTag(FPMathTag), FMF() {
+    : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder() {
     SetInsertPoint(TheBB, IP);
   }
 
   /// \brief Get the constant folder being used.
   const T &getFolder() { return Folder; }
 
-  /// \brief Get the floating point math metadata being used.
-  MDNode *getDefaultFPMathTag() const { return DefaultFPMathTag; }
-
-  /// \brief Get the flags to be applied to created floating point ops
-  FastMathFlags getFastMathFlags() const { return FMF; }
-
-  /// \brief Clear the fast-math flags.
-  void clearFastMathFlags() { FMF.clear(); }
-
-  /// \brief SetDefaultFPMathTag - Set the floating point math metadata to be used.
-  void SetDefaultFPMathTag(MDNode *FPMathTag) { DefaultFPMathTag = FPMathTag; }
-
-  /// \brief Set the fast-math flags to be used with generated fp-math operators
-  void SetFastMathFlags(FastMathFlags NewFMF) { FMF = NewFMF; }
-
   /// \brief Return true if this builder is configured to actually add the
   /// requested names to IR created through it.
   bool isNamePreserving() const { return preserveNames; }
@@ -1091,6 +1133,10 @@ public:
                        const Twine &Name = "") {
     return CreateCast(Instruction::BitCast, V, DestTy, Name);
   }
+  Value *CreateAddrSpaceCast(Value *V, Type *DestTy,
+                             const Twine &Name = "") {
+    return CreateCast(Instruction::AddrSpaceCast, V, DestTy, Name);
+  }
   Value *CreateZExtOrBitCast(Value *V, Type *DestTy,
                              const Twine &Name = "") {
     if (V->getType() == DestTy)
diff --git a/include/llvm/IR/InlineAsm.h b/include/llvm/IR/InlineAsm.h
index 33e4ab8..3398a83 100644
--- a/include/llvm/IR/InlineAsm.h
+++ b/include/llvm/IR/InlineAsm.h
@@ -197,7 +197,7 @@ public:
   // These are helper methods for dealing with flags in the INLINEASM SDNode
   // in the backend.
   
-  enum {
+  enum LLVM_ENUM_INT_TYPE(uint32_t) {
     // Fixed operands on an INLINEASM SDNode.
     Op_InputChain = 0,
     Op_AsmString = 1,
diff --git a/include/llvm/IR/Instruction.def b/include/llvm/IR/Instruction.def
index e59a052..d46314c 100644
--- a/include/llvm/IR/Instruction.def
+++ b/include/llvm/IR/Instruction.def
@@ -154,25 +154,26 @@ HANDLE_CAST_INST(41, FPExt   , FPExtInst   )  // Extend floating point
 HANDLE_CAST_INST(42, PtrToInt, PtrToIntInst)  // Pointer -> Integer
 HANDLE_CAST_INST(43, IntToPtr, IntToPtrInst)  // Integer -> Pointer
 HANDLE_CAST_INST(44, BitCast , BitCastInst )  // Type cast
-  LAST_CAST_INST(44)
+HANDLE_CAST_INST(45, AddrSpaceCast, AddrSpaceCastInst)  // addrspace cast
+  LAST_CAST_INST(45)
 
 // Other operators...
- FIRST_OTHER_INST(45)
-HANDLE_OTHER_INST(45, ICmp   , ICmpInst   )  // Integer comparison instruction
-HANDLE_OTHER_INST(46, FCmp   , FCmpInst   )  // Floating point comparison instr.
-HANDLE_OTHER_INST(47, PHI    , PHINode    )  // PHI node instruction
-HANDLE_OTHER_INST(48, Call   , CallInst   )  // Call a function
-HANDLE_OTHER_INST(49, Select , SelectInst )  // select instruction
-HANDLE_OTHER_INST(50, UserOp1, Instruction)  // May be used internally in a pass
-HANDLE_OTHER_INST(51, UserOp2, Instruction)  // Internal to passes only
-HANDLE_OTHER_INST(52, VAArg  , VAArgInst  )  // vaarg instruction
-HANDLE_OTHER_INST(53, ExtractElement, ExtractElementInst)// extract from vector
-HANDLE_OTHER_INST(54, InsertElement, InsertElementInst)  // insert into vector
-HANDLE_OTHER_INST(55, ShuffleVector, ShuffleVectorInst)  // shuffle two vectors.
-HANDLE_OTHER_INST(56, ExtractValue, ExtractValueInst)// extract from aggregate
-HANDLE_OTHER_INST(57, InsertValue, InsertValueInst)  // insert into aggregate
-HANDLE_OTHER_INST(58, LandingPad, LandingPadInst)  // Landing pad instruction.
-  LAST_OTHER_INST(58)
+ FIRST_OTHER_INST(46)
+HANDLE_OTHER_INST(46, ICmp   , ICmpInst   )  // Integer comparison instruction
+HANDLE_OTHER_INST(47, FCmp   , FCmpInst   )  // Floating point comparison instr.
+HANDLE_OTHER_INST(48, PHI    , PHINode    )  // PHI node instruction
+HANDLE_OTHER_INST(49, Call   , CallInst   )  // Call a function
+HANDLE_OTHER_INST(50, Select , SelectInst )  // select instruction
+HANDLE_OTHER_INST(51, UserOp1, Instruction)  // May be used internally in a pass
+HANDLE_OTHER_INST(52, UserOp2, Instruction)  // Internal to passes only
+HANDLE_OTHER_INST(53, VAArg  , VAArgInst  )  // vaarg instruction
+HANDLE_OTHER_INST(54, ExtractElement, ExtractElementInst)// extract from vector
+HANDLE_OTHER_INST(55, InsertElement, InsertElementInst)  // insert into vector
+HANDLE_OTHER_INST(56, ShuffleVector, ShuffleVectorInst)  // shuffle two vectors.
+HANDLE_OTHER_INST(57, ExtractValue, ExtractValueInst)// extract from aggregate
+HANDLE_OTHER_INST(58, InsertValue, InsertValueInst)  // insert into aggregate
+HANDLE_OTHER_INST(59, LandingPad, LandingPadInst)  // Landing pad instruction.
+  LAST_OTHER_INST(59)
 
 #undef  FIRST_TERM_INST
 #undef HANDLE_TERM_INST
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index e05c3a8..0843d8f 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -23,8 +23,6 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/IntegersSubset.h"
-#include "llvm/Support/IntegersSubsetMapping.h"
 #include <iterator>
 
 namespace llvm {
@@ -911,6 +909,18 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GetElementPtrInst, Value)
 /// must be identical types.
 /// \brief Represent an integer comparison operator.
 class ICmpInst: public CmpInst {
+  void AssertOK() {
+    assert(getPredicate() >= CmpInst::FIRST_ICMP_PREDICATE &&
+           getPredicate() <= CmpInst::LAST_ICMP_PREDICATE &&
+           "Invalid ICmp predicate value");
+    assert(getOperand(0)->getType() == getOperand(1)->getType() &&
+          "Both operands to ICmp instruction are not of the same type!");
+    // Check that the operands are the right type
+    assert((getOperand(0)->getType()->isIntOrIntVectorTy() ||
+            getOperand(0)->getType()->isPtrOrPtrVectorTy()) &&
+           "Invalid operand types for ICmp instruction");
+  }
+
 protected:
   /// \brief Clone an identical ICmpInst
   virtual ICmpInst *clone_impl() const;
@@ -925,15 +935,9 @@ public:
   ) : CmpInst(makeCmpResultType(LHS->getType()),
               Instruction::ICmp, pred, LHS, RHS, NameStr,
               InsertBefore) {
-    assert(pred >= CmpInst::FIRST_ICMP_PREDICATE &&
-           pred <= CmpInst::LAST_ICMP_PREDICATE &&
-           "Invalid ICmp predicate value");
-    assert(getOperand(0)->getType() == getOperand(1)->getType() &&
-          "Both operands to ICmp instruction are not of the same type!");
-    // Check that the operands are the right type
-    assert((getOperand(0)->getType()->isIntOrIntVectorTy() ||
-            getOperand(0)->getType()->getScalarType()->isPointerTy()) &&
-           "Invalid operand types for ICmp instruction");
+#ifndef NDEBUG
+  AssertOK();
+#endif
   }
 
   /// \brief Constructor with insert-at-end semantics.
@@ -946,15 +950,9 @@ public:
   ) : CmpInst(makeCmpResultType(LHS->getType()),
               Instruction::ICmp, pred, LHS, RHS, NameStr,
               &InsertAtEnd) {
-    assert(pred >= CmpInst::FIRST_ICMP_PREDICATE &&
-          pred <= CmpInst::LAST_ICMP_PREDICATE &&
-          "Invalid ICmp predicate value");
-    assert(getOperand(0)->getType() == getOperand(1)->getType() &&
-          "Both operands to ICmp instruction are not of the same type!");
-    // Check that the operands are the right type
-    assert((getOperand(0)->getType()->isIntOrIntVectorTy() ||
-            getOperand(0)->getType()->getScalarType()->isPointerTy()) &&
-           "Invalid operand types for ICmp instruction");
+#ifndef NDEBUG
+  AssertOK();
+#endif
   }
 
   /// \brief Constructor with no-insertion semantics
@@ -965,15 +963,9 @@ public:
     const Twine &NameStr = "" ///< Name of the instruction
   ) : CmpInst(makeCmpResultType(LHS->getType()),
               Instruction::ICmp, pred, LHS, RHS, NameStr) {
-    assert(pred >= CmpInst::FIRST_ICMP_PREDICATE &&
-           pred <= CmpInst::LAST_ICMP_PREDICATE &&
-           "Invalid ICmp predicate value");
-    assert(getOperand(0)->getType() == getOperand(1)->getType() &&
-          "Both operands to ICmp instruction are not of the same type!");
-    // Check that the operands are the right type
-    assert((getOperand(0)->getType()->isIntOrIntVectorTy() ||
-            getOperand(0)->getType()->getScalarType()->isPointerTy()) &&
-           "Invalid operand types for ICmp instruction");
+#ifndef NDEBUG
+  AssertOK();
+#endif
   }
 
   /// For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
@@ -2457,31 +2449,10 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value)
 class SwitchInst : public TerminatorInst {
   void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
   unsigned ReservedSpace;
-  // Operands format:
   // Operand[0]    = Value to switch on
   // Operand[1]    = Default basic block destination
   // Operand[2n  ] = Value to match
   // Operand[2n+1] = BasicBlock to go to on match
-
-  // Store case values separately from operands list. We needn't User-Use
-  // concept here, since it is just a case value, it will always constant,
-  // and case value couldn't reused with another instructions/values.
-  // Additionally:
-  // It allows us to use custom type for case values that is not inherited
-  // from Value. Since case value is a complex type that implements
-  // the subset of integers, we needn't extract sub-constants within
-  // slow getAggregateElement method.
-  // For case values we will use std::list to by two reasons:
-  // 1. It allows to add/remove cases without whole collection reallocation.
-  // 2. In most of cases we needn't random access.
-  // Currently case values are also stored in Operands List, but it will moved
-  // out in future commits.
-  typedef std::list<IntegersSubset> Subsets;
-  typedef Subsets::iterator SubsetsIt;
-  typedef Subsets::const_iterator SubsetsConstIt;
-
-  Subsets TheSubsets;
-
   SwitchInst(const SwitchInst &SI);
   void init(Value *Value, BasicBlock *Default, unsigned NumReserved);
   void growOperands();
@@ -2506,25 +2477,121 @@ protected:
   virtual SwitchInst *clone_impl() const;
 public:
 
-  // FIXME: Currently there are a lot of unclean template parameters,
-  // we need to make refactoring in future.
-  // All these parameters are used to implement both iterator and const_iterator
-  // without code duplication.
-  // SwitchInstTy may be "const SwitchInst" or "SwitchInst"
-  // ConstantIntTy may be "const ConstantInt" or "ConstantInt"
-  // SubsetsItTy may be SubsetsConstIt or SubsetsIt
-  // BasicBlockTy may be "const BasicBlock" or "BasicBlock"
-  template <class SwitchInstTy, class ConstantIntTy,
-            class SubsetsItTy, class BasicBlockTy>
-    class CaseIteratorT;
-
-  typedef CaseIteratorT<const SwitchInst, const ConstantInt,
-                        SubsetsConstIt, const BasicBlock> ConstCaseIt;
-  class CaseIt;
-
   // -2
   static const unsigned DefaultPseudoIndex = static_cast<unsigned>(~0L-1);
 
+  template <class SwitchInstTy, class ConstantIntTy, class BasicBlockTy>
+  class CaseIteratorT {
+  protected:
+
+    SwitchInstTy *SI;
+    unsigned Index;
+
+  public:
+
+    typedef CaseIteratorT<SwitchInstTy, ConstantIntTy, BasicBlockTy> Self;
+
+    /// Initializes case iterator for given SwitchInst and for given
+    /// case number.
+    CaseIteratorT(SwitchInstTy *SI, unsigned CaseNum) {
+      this->SI = SI;
+      Index = CaseNum;
+    }
+
+    /// Initializes case iterator for given SwitchInst and for given
+    /// TerminatorInst's successor index.
+    static Self fromSuccessorIndex(SwitchInstTy *SI, unsigned SuccessorIndex) {
+      assert(SuccessorIndex < SI->getNumSuccessors() &&
+             "Successor index # out of range!");
+      return SuccessorIndex != 0 ?
+             Self(SI, SuccessorIndex - 1) :
+             Self(SI, DefaultPseudoIndex);
+    }
+
+    /// Resolves case value for current case.
+    ConstantIntTy *getCaseValue() {
+      assert(Index < SI->getNumCases() && "Index out the number of cases.");
+      return reinterpret_cast<ConstantIntTy*>(SI->getOperand(2 + Index*2));
+    }
+
+    /// Resolves successor for current case.
+    BasicBlockTy *getCaseSuccessor() {
+      assert((Index < SI->getNumCases() ||
+              Index == DefaultPseudoIndex) &&
+             "Index out the number of cases.");
+      return SI->getSuccessor(getSuccessorIndex());
+    }
+
+    /// Returns number of current case.
+    unsigned getCaseIndex() const { return Index; }
+
+    /// Returns TerminatorInst's successor index for current case successor.
+    unsigned getSuccessorIndex() const {
+      assert((Index == DefaultPseudoIndex || Index < SI->getNumCases()) &&
+             "Index out the number of cases.");
+      return Index != DefaultPseudoIndex ? Index + 1 : 0;
+    }
+
+    Self operator++() {
+      // Check index correctness after increment.
+      // Note: Index == getNumCases() means end().
+      assert(Index+1 <= SI->getNumCases() && "Index out the number of cases.");
+      ++Index;
+      return *this;
+    }
+    Self operator++(int) {
+      Self tmp = *this;
+      ++(*this);
+      return tmp;
+    }
+    Self operator--() {
+      // Check index correctness after decrement.
+      // Note: Index == getNumCases() means end().
+      // Also allow "-1" iterator here. That will became valid after ++.
+      assert((Index == 0 || Index-1 <= SI->getNumCases()) &&
+             "Index out the number of cases.");
+      --Index;
+      return *this;
+    }
+    Self operator--(int) {
+      Self tmp = *this;
+      --(*this);
+      return tmp;
+    }
+    bool operator==(const Self& RHS) const {
+      assert(RHS.SI == SI && "Incompatible operators.");
+      return RHS.Index == Index;
+    }
+    bool operator!=(const Self& RHS) const {
+      assert(RHS.SI == SI && "Incompatible operators.");
+      return RHS.Index != Index;
+    }
+  };
+
+  typedef CaseIteratorT<const SwitchInst, const ConstantInt, const BasicBlock>
+    ConstCaseIt;
+
+  class CaseIt : public CaseIteratorT<SwitchInst, ConstantInt, BasicBlock> {
+
+    typedef CaseIteratorT<SwitchInst, ConstantInt, BasicBlock> ParentTy;
+
+  public:
+
+    CaseIt(const ParentTy& Src) : ParentTy(Src) {}
+    CaseIt(SwitchInst *SI, unsigned CaseNum) : ParentTy(SI, CaseNum) {}
+
+    /// Sets the new value for current case.
+    void setValue(ConstantInt *V) {
+      assert(Index < SI->getNumCases() && "Index out the number of cases.");
+      SI->setOperand(2 + Index*2, reinterpret_cast<Value*>(V));
+    }
+
+    /// Sets the new successor for current case.
+    void setSuccessor(BasicBlock *S) {
+      SI->setSuccessor(getSuccessorIndex(), S);
+    }
+  };
+
   static SwitchInst *Create(Value *Value, BasicBlock *Default,
                             unsigned NumCases, Instruction *InsertBefore = 0) {
     return new SwitchInst(Value, Default, NumCases, InsertBefore);
@@ -2560,23 +2627,23 @@ public:
   /// Returns a read/write iterator that points to the first
   /// case in SwitchInst.
   CaseIt case_begin() {
-    return CaseIt(this, 0, TheSubsets.begin());
+    return CaseIt(this, 0);
   }
   /// Returns a read-only iterator that points to the first
   /// case in the SwitchInst.
   ConstCaseIt case_begin() const {
-    return ConstCaseIt(this, 0, TheSubsets.begin());
+    return ConstCaseIt(this, 0);
   }
 
   /// Returns a read/write iterator that points one past the last
   /// in the SwitchInst.
   CaseIt case_end() {
-    return CaseIt(this, getNumCases(), TheSubsets.end());
+    return CaseIt(this, getNumCases());
   }
   /// Returns a read-only iterator that points one past the last
   /// in the SwitchInst.
   ConstCaseIt case_end() const {
-    return ConstCaseIt(this, getNumCases(), TheSubsets.end());
+    return ConstCaseIt(this, getNumCases());
   }
   /// Returns an iterator that points to the default case.
   /// Note: this iterator allows to resolve successor only. Attempt
@@ -2584,10 +2651,10 @@ public:
   /// Also note, that increment and decrement also causes an assertion and
   /// makes iterator invalid.
   CaseIt case_default() {
-    return CaseIt(this, DefaultPseudoIndex, TheSubsets.end());
+    return CaseIt(this, DefaultPseudoIndex);
   }
   ConstCaseIt case_default() const {
-    return ConstCaseIt(this, DefaultPseudoIndex, TheSubsets.end());
+    return ConstCaseIt(this, DefaultPseudoIndex);
   }
 
   /// findCaseValue - Search all of the case values for the specified constant.
@@ -2596,13 +2663,13 @@ public:
   /// that it is handled by the default handler.
   CaseIt findCaseValue(const ConstantInt *C) {
     for (CaseIt i = case_begin(), e = case_end(); i != e; ++i)
-      if (i.getCaseValueEx().isSatisfies(IntItem::fromConstantInt(C)))
+      if (i.getCaseValue() == C)
         return i;
     return case_default();
   }
   ConstCaseIt findCaseValue(const ConstantInt *C) const {
     for (ConstCaseIt i = case_begin(), e = case_end(); i != e; ++i)
-      if (i.getCaseValueEx().isSatisfies(IntItem::fromConstantInt(C)))
+      if (i.getCaseValue() == C)
         return i;
     return case_default();
   }
@@ -2628,19 +2695,13 @@ public:
   /// point to the added case.
   void addCase(ConstantInt *OnVal, BasicBlock *Dest);
 
-  /// addCase - Add an entry to the switch instruction.
-  /// Note:
-  /// This action invalidates case_end(). Old case_end() iterator will
-  /// point to the added case.
-  void addCase(IntegersSubset& OnVal, BasicBlock *Dest);
-
   /// removeCase - This method removes the specified case and its successor
   /// from the switch instruction. Note that this operation may reorder the
   /// remaining cases at index idx and above.
   /// Note:
   /// This action invalidates iterators for all cases following the one removed,
   /// including the case_end() iterator.
-  void removeCase(CaseIt& i);
+  void removeCase(CaseIt i);
 
   unsigned getNumSuccessors() const { return getNumOperands()/2; }
   BasicBlock *getSuccessor(unsigned idx) const {
@@ -2652,190 +2713,7 @@ public:
     setOperand(idx*2+1, (Value*)NewSucc);
   }
 
-  uint16_t hash() const {
-    uint32_t NumberOfCases = (uint32_t)getNumCases();
-    uint16_t Hash = (0xFFFF & NumberOfCases) ^ (NumberOfCases >> 16);
-    for (ConstCaseIt i = case_begin(), e = case_end();
-         i != e; ++i) {
-      uint32_t NumItems = (uint32_t)i.getCaseValueEx().getNumItems();
-      Hash = (Hash << 1) ^ (0xFFFF & NumItems) ^ (NumItems >> 16);
-    }
-    return Hash;
-  }
-
-  // Case iterators definition.
-
-  template <class SwitchInstTy, class ConstantIntTy,
-            class SubsetsItTy, class BasicBlockTy>
-  class CaseIteratorT {
-  protected:
-
-    SwitchInstTy *SI;
-    unsigned Index;
-    SubsetsItTy SubsetIt;
-
-    /// Initializes case iterator for given SwitchInst and for given
-    /// case number.
-    friend class SwitchInst;
-    CaseIteratorT(SwitchInstTy *SI, unsigned SuccessorIndex,
-                  SubsetsItTy CaseValueIt) {
-      this->SI = SI;
-      Index = SuccessorIndex;
-      this->SubsetIt = CaseValueIt;
-    }
-
-  public:
-    typedef typename SubsetsItTy::reference IntegersSubsetRef;
-    typedef CaseIteratorT<SwitchInstTy, ConstantIntTy,
-                          SubsetsItTy, BasicBlockTy> Self;
-
-    CaseIteratorT(SwitchInstTy *SI, unsigned CaseNum) {
-          this->SI = SI;
-          Index = CaseNum;
-          SubsetIt = SI->TheSubsets.begin();
-          std::advance(SubsetIt, CaseNum);
-        }
-
-
-    /// Initializes case iterator for given SwitchInst and for given
-    /// TerminatorInst's successor index.
-    static Self fromSuccessorIndex(SwitchInstTy *SI, unsigned SuccessorIndex) {
-      assert(SuccessorIndex < SI->getNumSuccessors() &&
-             "Successor index # out of range!");
-      return SuccessorIndex != 0 ?
-             Self(SI, SuccessorIndex - 1) :
-             Self(SI, DefaultPseudoIndex);
-    }
-
-    /// Resolves case value for current case.
-    ConstantIntTy *getCaseValue() {
-      assert(Index < SI->getNumCases() && "Index out the number of cases.");
-      IntegersSubsetRef CaseRanges = *SubsetIt;
-
-      // FIXME: Currently we work with ConstantInt based cases.
-      // So return CaseValue as ConstantInt.
-      return CaseRanges.getSingleNumber(0).toConstantInt();
-    }
-
-    /// Resolves case value for current case.
-    IntegersSubsetRef getCaseValueEx() {
-      assert(Index < SI->getNumCases() && "Index out the number of cases.");
-      return *SubsetIt;
-    }
-
-    /// Resolves successor for current case.
-    BasicBlockTy *getCaseSuccessor() {
-      assert((Index < SI->getNumCases() ||
-              Index == DefaultPseudoIndex) &&
-             "Index out the number of cases.");
-      return SI->getSuccessor(getSuccessorIndex());
-    }
-
-    /// Returns number of current case.
-    unsigned getCaseIndex() const { return Index; }
-
-    /// Returns TerminatorInst's successor index for current case successor.
-    unsigned getSuccessorIndex() const {
-      assert((Index == DefaultPseudoIndex || Index < SI->getNumCases()) &&
-             "Index out the number of cases.");
-      return Index != DefaultPseudoIndex ? Index + 1 : 0;
-    }
-
-    Self operator++() {
-      // Check index correctness after increment.
-      // Note: Index == getNumCases() means end().
-      assert(Index+1 <= SI->getNumCases() && "Index out the number of cases.");
-      ++Index;
-      if (Index == 0)
-        SubsetIt = SI->TheSubsets.begin();
-      else
-        ++SubsetIt;
-      return *this;
-    }
-    Self operator++(int) {
-      Self tmp = *this;
-      ++(*this);
-      return tmp;
-    }
-    Self operator--() {
-      // Check index correctness after decrement.
-      // Note: Index == getNumCases() means end().
-      // Also allow "-1" iterator here. That will became valid after ++.
-      unsigned NumCases = SI->getNumCases();
-      assert((Index == 0 || Index-1 <= NumCases) &&
-             "Index out the number of cases.");
-      --Index;
-      if (Index == NumCases) {
-        SubsetIt = SI->TheSubsets.end();
-        return *this;
-      }
-
-      if (Index != -1U)
-        --SubsetIt;
-
-      return *this;
-    }
-    Self operator--(int) {
-      Self tmp = *this;
-      --(*this);
-      return tmp;
-    }
-    bool operator==(const Self& RHS) const {
-      assert(RHS.SI == SI && "Incompatible operators.");
-      return RHS.Index == Index;
-    }
-    bool operator!=(const Self& RHS) const {
-      assert(RHS.SI == SI && "Incompatible operators.");
-      return RHS.Index != Index;
-    }
-  };
-
-  class CaseIt : public CaseIteratorT<SwitchInst, ConstantInt,
-                                      SubsetsIt, BasicBlock> {
-    typedef CaseIteratorT<SwitchInst, ConstantInt, SubsetsIt, BasicBlock>
-      ParentTy;
-
-  protected:
-    friend class SwitchInst;
-    CaseIt(SwitchInst *SI, unsigned CaseNum, SubsetsIt SubsetIt) :
-      ParentTy(SI, CaseNum, SubsetIt) {}
-
-    void updateCaseValueOperand(IntegersSubset& V) {
-      SI->setOperand(2 + Index*2, reinterpret_cast<Value*>((Constant*)V));
-    }
-
-  public:
-
-    CaseIt(SwitchInst *SI, unsigned CaseNum) : ParentTy(SI, CaseNum) {}
-
-    CaseIt(const ParentTy& Src) : ParentTy(Src) {}
-
-    /// Sets the new value for current case.
-    void setValue(ConstantInt *V) {
-      assert(Index < SI->getNumCases() && "Index out the number of cases.");
-      IntegersSubsetToBB Mapping;
-      // FIXME: Currently we work with ConstantInt based cases.
-      // So inititalize IntItem container directly from ConstantInt.
-      Mapping.add(IntItem::fromConstantInt(V));
-      *SubsetIt = Mapping.getCase();
-      updateCaseValueOperand(*SubsetIt);
-    }
-
-    /// Sets the new value for current case.
-    void setValueEx(IntegersSubset& V) {
-      assert(Index < SI->getNumCases() && "Index out the number of cases.");
-      *SubsetIt = V;
-      updateCaseValueOperand(*SubsetIt);
-    }
-
-    /// Sets the new successor for current case.
-    void setSuccessor(BasicBlock *S) {
-      SI->setSuccessor(getSuccessorIndex(), S);
-    }
-  };
-
   // Methods for support type inquiry through isa, cast, and dyn_cast:
-
   static inline bool classof(const Instruction *I) {
     return I->getOpcode() == Instruction::Switch;
   }
@@ -3034,7 +2912,7 @@ public:
   /// removeAttribute - removes the attribute from the list of attributes.
   void removeAttribute(unsigned i, Attribute attr);
 
-  /// \brief Determine whether this call has the NoAlias attribute.
+  /// \brief Determine whether this call has the given attribute.
   bool hasFnAttr(Attribute::AttrKind A) const {
     assert(A != Attribute::NoBuiltin &&
            "Use CallInst::isNoBuiltin() to check for Attribute::NoBuiltin");
@@ -3737,6 +3615,43 @@ public:
   }
 };
 
+//===----------------------------------------------------------------------===//
+//                          AddrSpaceCastInst Class
+//===----------------------------------------------------------------------===//
+
+/// \brief This class represents a conversion between pointers from
+/// one address space to another.
+class AddrSpaceCastInst : public CastInst {
+protected:
+  /// \brief Clone an identical AddrSpaceCastInst
+  virtual AddrSpaceCastInst *clone_impl() const;
+
+public:
+  /// \brief Constructor with insert-before-instruction semantics
+  AddrSpaceCastInst(
+    Value *S,                     ///< The value to be casted
+    Type *Ty,                     ///< The type to casted to
+    const Twine &NameStr = "",    ///< A name for the new instruction
+    Instruction *InsertBefore = 0 ///< Where to insert the new instruction
+  );
+
+  /// \brief Constructor with insert-at-end-of-block semantics
+  AddrSpaceCastInst(
+    Value *S,                     ///< The value to be casted
+    Type *Ty,                     ///< The type to casted to
+    const Twine &NameStr,         ///< A name for the new instruction
+    BasicBlock *InsertAtEnd       ///< The block to insert the instruction into
+  );
+
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static inline bool classof(const Instruction *I) {
+    return I->getOpcode() == AddrSpaceCast;
+  }
+  static inline bool classof(const Value *V) {
+    return isa<Instruction>(V) && classof(cast<Instruction>(V));
+  }
+};
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/IR/Intrinsics.h b/include/llvm/IR/Intrinsics.h
index c81d110..473e525 100644
--- a/include/llvm/IR/Intrinsics.h
+++ b/include/llvm/IR/Intrinsics.h
@@ -77,7 +77,7 @@ namespace Intrinsic {
   /// getIntrinsicInfoTableEntries.
   struct IITDescriptor {
     enum IITDescriptorKind {
-      Void, MMX, Metadata, Half, Float, Double,
+      Void, VarArg, MMX, Metadata, Half, Float, Double,
       Integer, Vector, Pointer, Struct,
       Argument, ExtendVecArgument, TruncVecArgument
     } Kind;
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 1a849c4..ded6cc1 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -140,6 +140,7 @@ def llvm_v8i1_ty       : LLVMType<v8i1>;     //  8 x i1
 def llvm_v16i1_ty      : LLVMType<v16i1>;    // 16 x i1
 def llvm_v32i1_ty      : LLVMType<v32i1>;    // 32 x i1
 def llvm_v64i1_ty      : LLVMType<v64i1>;    // 64 x i1
+def llvm_v1i8_ty       : LLVMType<v1i8>;     //  1 x i8
 def llvm_v2i8_ty       : LLVMType<v2i8>;     //  2 x i8
 def llvm_v4i8_ty       : LLVMType<v4i8>;     //  4 x i8
 def llvm_v8i8_ty       : LLVMType<v8i8>;     //  8 x i8
@@ -165,10 +166,15 @@ def llvm_v4i64_ty      : LLVMType<v4i64>;    //  4 x i64
 def llvm_v8i64_ty      : LLVMType<v8i64>;    //  8 x i64
 def llvm_v16i64_ty     : LLVMType<v16i64>;   // 16 x i64
 
+def llvm_v2f16_ty      : LLVMType<v2f16>;    //  2 x half (__fp16)
+def llvm_v4f16_ty      : LLVMType<v4f16>;    //  4 x half (__fp16)
+def llvm_v8f16_ty      : LLVMType<v8f16>;    //  8 x half (__fp16)
+def llvm_v1f32_ty      : LLVMType<v1f32>;    //  1 x float
 def llvm_v2f32_ty      : LLVMType<v2f32>;    //  2 x float
 def llvm_v4f32_ty      : LLVMType<v4f32>;    //  4 x float
 def llvm_v8f32_ty      : LLVMType<v8f32>;    //  8 x float
 def llvm_v16f32_ty     : LLVMType<v16f32>;   // 16 x float
+def llvm_v1f64_ty      : LLVMType<v1f64>;    //  1 x double
 def llvm_v2f64_ty      : LLVMType<v2f64>;    //  2 x double
 def llvm_v4f64_ty      : LLVMType<v4f64>;    //  4 x double
 def llvm_v8f64_ty      : LLVMType<v8f64>;    //  8 x double
@@ -258,6 +264,8 @@ def int_readcyclecounter : Intrinsic<[llvm_i64_ty]>;
 // Stack Protector Intrinsic - The stackprotector intrinsic writes the stack
 // guard to the correct place on the stack frame.
 def int_stackprotector : Intrinsic<[], [llvm_ptr_ty, llvm_ptrptr_ty], []>;
+def int_stackprotectorcheck : Intrinsic<[], [llvm_ptrptr_ty],
+                                        [IntrReadWriteArgMem]>;
 
 //===------------------- Standard C Library Intrinsics --------------------===//
 //
@@ -293,11 +301,14 @@ let Properties = [IntrReadMem] in {
   def int_exp  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_exp2 : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_fabs : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_copysign : Intrinsic<[llvm_anyfloat_ty],
+                               [LLVMMatchType<0>, LLVMMatchType<0>]>;
   def int_floor : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_ceil  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_trunc : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_rint  : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
   def int_nearbyint : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
+  def int_round : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>;
 }
 
 let Properties = [IntrNoMem] in {
@@ -317,7 +328,7 @@ def int_sigsetjmp  : Intrinsic<[llvm_i32_ty] , [llvm_ptr_ty, llvm_i32_ty]>;
 def int_siglongjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrNoReturn]>;
 
 // Internal interface for object size checking
-def int_objectsize : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i1_ty],
+def int_objectsize : Intrinsic<[llvm_anyint_ty], [llvm_anyptr_ty, llvm_i1_ty],
                                [IntrNoMem]>,
                                GCCBuiltin<"__builtin_object_size">;
 
@@ -444,6 +455,19 @@ def int_invariant_end   : Intrinsic<[],
                                      llvm_ptr_ty],
                                     [IntrReadWriteArgMem, NoCapture<2>]>;
 
+//===------------------------ Stackmap Intrinsics -------------------------===//
+//
+def int_experimental_stackmap : Intrinsic<[],
+                                  [llvm_i32_ty, llvm_i32_ty, llvm_vararg_ty]>;
+def int_experimental_patchpoint_void : Intrinsic<[],
+                                                 [llvm_i32_ty, llvm_i32_ty,
+                                                  llvm_ptr_ty, llvm_i32_ty,
+                                                  llvm_vararg_ty]>;
+def int_experimental_patchpoint_i64 : Intrinsic<[llvm_i64_ty],
+                                                [llvm_i32_ty, llvm_i32_ty,
+                                                 llvm_ptr_ty, llvm_i32_ty,
+                                                 llvm_vararg_ty]>;
+
 //===-------------------------- Other Intrinsics --------------------------===//
 //
 def int_flt_rounds : Intrinsic<[llvm_i32_ty]>,
diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index d7b1947..68af8c1 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -17,12 +17,47 @@
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
 
 // Vector Absolute Compare (Floating Point)
-def int_aarch64_neon_vacgeq : Intrinsic<[llvm_v2i64_ty],
-                                    [llvm_v2f64_ty, llvm_v2f64_ty],
-                                    [IntrNoMem]>;
-def int_aarch64_neon_vacgtq : Intrinsic<[llvm_v2i64_ty],
-                                    [llvm_v2f64_ty, llvm_v2f64_ty],
-                                    [IntrNoMem]>;
+def int_aarch64_neon_vacgeq :
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+def int_aarch64_neon_vacgtq :
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+// Vector saturating accumulate
+def int_aarch64_neon_suqadd : Neon_2Arg_Intrinsic;
+def int_aarch64_neon_usqadd : Neon_2Arg_Intrinsic;
+
+// Vector Bitwise reverse
+def int_aarch64_neon_rbit : Neon_1Arg_Intrinsic;
+
+// Vector extract and narrow
+def int_aarch64_neon_xtn : 
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
+// Vector floating-point convert
+def int_aarch64_neon_frintn : Neon_1Arg_Intrinsic;
+def int_aarch64_neon_fsqrt : Neon_1Arg_Intrinsic;
+def int_aarch64_neon_fcvtxn :
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_aarch64_neon_fcvtns : 
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_aarch64_neon_fcvtnu :
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_aarch64_neon_fcvtps :
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_aarch64_neon_fcvtpu :
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_aarch64_neon_fcvtms :
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_aarch64_neon_fcvtmu :
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_aarch64_neon_fcvtas :
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_aarch64_neon_fcvtau :
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_aarch64_neon_fcvtzs :
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_aarch64_neon_fcvtzu :
+  Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
 
 // Vector maxNum (Floating Point)
 def int_aarch64_neon_vmaxnm : Neon_2Arg_Intrinsic;
@@ -36,6 +71,318 @@ def int_aarch64_neon_vpmaxnm : Neon_2Arg_Intrinsic;
 // Vector Pairwise minNum (Floating Point)
 def int_aarch64_neon_vpminnm : Neon_2Arg_Intrinsic;
 
-// Vector Multiply Extended (Floating Point)
-def int_aarch64_neon_vmulx : Neon_2Arg_Intrinsic;
+// Vector Multiply Extended and Scalar Multiply Extended (Floating Point)
+def int_aarch64_neon_vmulx  :
+  Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>;
+
+class Neon_N2V_Intrinsic
+  : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty],
+              [IntrNoMem]>;
+class Neon_N3V_Intrinsic
+  : Intrinsic<[llvm_anyvector_ty],
+              [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+              [IntrNoMem]>;
+class Neon_N2V_Narrow_Intrinsic
+  : Intrinsic<[llvm_anyvector_ty],
+              [LLVMExtendedElementVectorType<0>, llvm_i32_ty],
+              [IntrNoMem]>;
+
+// Vector rounding shift right by immediate (Signed)
+def int_aarch64_neon_vsrshr : Neon_N2V_Intrinsic;
+def int_aarch64_neon_vurshr : Neon_N2V_Intrinsic;
+def int_aarch64_neon_vsqshlu : Neon_N2V_Intrinsic;
+
+def int_aarch64_neon_vsri : Neon_N3V_Intrinsic;
+def int_aarch64_neon_vsli : Neon_N3V_Intrinsic;
+
+def int_aarch64_neon_vsqshrun : Neon_N2V_Narrow_Intrinsic;
+def int_aarch64_neon_vrshrn : Neon_N2V_Narrow_Intrinsic;
+def int_aarch64_neon_vsqrshrun : Neon_N2V_Narrow_Intrinsic;
+def int_aarch64_neon_vsqshrn : Neon_N2V_Narrow_Intrinsic;
+def int_aarch64_neon_vuqshrn : Neon_N2V_Narrow_Intrinsic;
+def int_aarch64_neon_vsqrshrn : Neon_N2V_Narrow_Intrinsic;
+def int_aarch64_neon_vuqrshrn : Neon_N2V_Narrow_Intrinsic;
+
+// Vector across
+class Neon_Across_Intrinsic
+  : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
+class Neon_2Arg_Across_Float_Intrinsic
+  : Intrinsic<[llvm_anyvector_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+
+def int_aarch64_neon_saddlv : Neon_Across_Intrinsic;
+def int_aarch64_neon_uaddlv : Neon_Across_Intrinsic;
+def int_aarch64_neon_smaxv  : Neon_Across_Intrinsic;
+def int_aarch64_neon_umaxv  : Neon_Across_Intrinsic;
+def int_aarch64_neon_sminv  : Neon_Across_Intrinsic;
+def int_aarch64_neon_uminv  : Neon_Across_Intrinsic;
+def int_aarch64_neon_vaddv  : Neon_Across_Intrinsic;
+def int_aarch64_neon_vmaxv  : Neon_Across_Intrinsic;
+def int_aarch64_neon_vminv  : Neon_Across_Intrinsic;
+def int_aarch64_neon_vmaxnmv : Neon_Across_Intrinsic;
+def int_aarch64_neon_vminnmv : Neon_Across_Intrinsic;
+
+// Vector Table Lookup.
+def int_aarch64_neon_vtbl1 :
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>;
+
+def int_aarch64_neon_vtbl2 :
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<0>],
+            [IntrNoMem]>;
+
+def int_aarch64_neon_vtbl3 :
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>,
+            LLVMMatchType<0>], [IntrNoMem]>;
+
+def int_aarch64_neon_vtbl4 :
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>,
+            LLVMMatchType<1>, LLVMMatchType<0>], [IntrNoMem]>;
+
+// Vector Table Extension.
+// Some elements of the destination vector may not be updated, so the original
+// value of that vector is passed as the first argument.  The next 1-4
+// arguments after that are the table.
+def int_aarch64_neon_vtbx1 :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
+            [IntrNoMem]>;
+
+def int_aarch64_neon_vtbx2 :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>,
+            LLVMMatchType<0>], [IntrNoMem]>;
+
+def int_aarch64_neon_vtbx3 :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>,
+            LLVMMatchType<1>, LLVMMatchType<0>], [IntrNoMem]>;
+
+def int_aarch64_neon_vtbx4 :
+  Intrinsic<[llvm_anyvector_ty],
+            [LLVMMatchType<0>, llvm_anyvector_ty,  LLVMMatchType<1>,
+            LLVMMatchType<1>,  LLVMMatchType<1>, LLVMMatchType<0>],
+            [IntrNoMem]>;
+
+// Vector Load/store
+def int_aarch64_neon_vld1x2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                                        [llvm_ptr_ty, llvm_i32_ty],
+                                        [IntrReadArgMem]>;
+def int_aarch64_neon_vld1x3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                                         LLVMMatchType<0>],
+                                        [llvm_ptr_ty, llvm_i32_ty],
+                                        [IntrReadArgMem]>;
+def int_aarch64_neon_vld1x4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                                         LLVMMatchType<0>, LLVMMatchType<0>],
+                                        [llvm_ptr_ty, llvm_i32_ty],
+                                        [IntrReadArgMem]>;
+
+def int_aarch64_neon_vst1x2 : Intrinsic<[],
+                                        [llvm_ptr_ty, llvm_anyvector_ty,
+                                         LLVMMatchType<0>, llvm_i32_ty],
+                                        [IntrReadWriteArgMem]>;
+def int_aarch64_neon_vst1x3 : Intrinsic<[],
+                                        [llvm_ptr_ty, llvm_anyvector_ty,
+                                         LLVMMatchType<0>, LLVMMatchType<0>,
+                                         llvm_i32_ty], [IntrReadWriteArgMem]>;
+def int_aarch64_neon_vst1x4 : Intrinsic<[],
+                                        [llvm_ptr_ty, llvm_anyvector_ty,
+                                         LLVMMatchType<0>, LLVMMatchType<0>,
+                                         LLVMMatchType<0>, llvm_i32_ty],
+                                        [IntrReadWriteArgMem]>;
+
+// Scalar Add
+def int_aarch64_neon_vaddds :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
+def int_aarch64_neon_vadddu :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
+
+
+// Scalar Sub
+def int_aarch64_neon_vsubds :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
+def int_aarch64_neon_vsubdu :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
+
+
+// Scalar Shift
+// Scalar Shift Left
+def int_aarch64_neon_vshlds :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
+def int_aarch64_neon_vshldu :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
+
+// Scalar Saturating Shift Left
+def int_aarch64_neon_vqshls : Neon_2Arg_Intrinsic;
+def int_aarch64_neon_vqshlu : Neon_2Arg_Intrinsic;
+
+// Scalar Shift Rouding Left
+def int_aarch64_neon_vrshlds :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
+def int_aarch64_neon_vrshldu :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
+
+// Scalar Saturating Rounding Shift Left
+def int_aarch64_neon_vqrshls : Neon_2Arg_Intrinsic;
+def int_aarch64_neon_vqrshlu : Neon_2Arg_Intrinsic;
+
+// Scalar Reduce Pairwise Add.
+def int_aarch64_neon_vpadd :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v2i64_ty],[IntrNoMem]>;
+def int_aarch64_neon_vpfadd :
+  Intrinsic<[llvm_v1f32_ty], [llvm_v2f32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vpfaddq :
+  Intrinsic<[llvm_v1f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+// Scalar Reduce Pairwise Floating Point Max/Min.
+def int_aarch64_neon_vpmax :
+  Intrinsic<[llvm_v1f32_ty], [llvm_v2f32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vpmaxq :
+  Intrinsic<[llvm_v1f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+def int_aarch64_neon_vpmin :
+  Intrinsic<[llvm_v1f32_ty], [llvm_v2f32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vpminq :
+  Intrinsic<[llvm_v1f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+// Scalar Reduce Pairwise Floating Point Maxnm/Minnm.
+def int_aarch64_neon_vpfmaxnm :
+  Intrinsic<[llvm_v1f32_ty], [llvm_v2f32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vpfmaxnmq :
+  Intrinsic<[llvm_v1f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+def int_aarch64_neon_vpfminnm :
+  Intrinsic<[llvm_v1f32_ty], [llvm_v2f32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vpfminnmq :
+  Intrinsic<[llvm_v1f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+// Scalar Signed Integer Convert To Floating-point
+def int_aarch64_neon_vcvtf32_s32 :
+  Intrinsic<[llvm_float_ty], [llvm_v1i32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vcvtf64_s64 :
+  Intrinsic<[llvm_double_ty], [llvm_v1i64_ty], [IntrNoMem]>;
+
+// Scalar Unsigned Integer Convert To Floating-point
+def int_aarch64_neon_vcvtf32_u32 :
+  Intrinsic<[llvm_float_ty], [llvm_v1i32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vcvtf64_u64 :
+  Intrinsic<[llvm_double_ty], [llvm_v1i64_ty], [IntrNoMem]>;
+
+// Scalar Floating-point Reciprocal Exponent
+def int_aarch64_neon_vrecpx : Neon_1Arg_Intrinsic;
+
+class Neon_Cmp_Intrinsic
+  : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_anyvector_ty],
+              [IntrNoMem]>;
+
+// Scalar Compare Equal
+def int_aarch64_neon_vceq : Neon_Cmp_Intrinsic;
+
+// Scalar Compare Greater-Than or Equal
+def int_aarch64_neon_vcge : Neon_Cmp_Intrinsic;
+def int_aarch64_neon_vchs : Neon_Cmp_Intrinsic;
+
+// Scalar Compare Less-Than or Equal
+def int_aarch64_neon_vclez : Neon_Cmp_Intrinsic;
+
+// Scalar Compare Less-Than
+def int_aarch64_neon_vcltz : Neon_Cmp_Intrinsic;
+
+// Scalar Compare Greater-Than
+def int_aarch64_neon_vcgt : Neon_Cmp_Intrinsic;
+def int_aarch64_neon_vchi : Neon_Cmp_Intrinsic;
+
+// Scalar Compare Bitwise Test Bits
+def int_aarch64_neon_vtstd : Neon_Cmp_Intrinsic;
+
+// Scalar Floating-point Absolute Compare Greater Than Or Equal
+def int_aarch64_neon_vcage : Neon_Cmp_Intrinsic;
+ 
+// Scalar Floating-point Absolute Compare Greater Than
+def int_aarch64_neon_vcagt : Neon_Cmp_Intrinsic;
+
+// Scalar Signed Saturating Accumulated of Unsigned Value
+def int_aarch64_neon_vuqadd : Neon_2Arg_Intrinsic;
+
+// Scalar Unsigned Saturating Accumulated of Signed Value
+def int_aarch64_neon_vsqadd : Neon_2Arg_Intrinsic;
+
+// Scalar Absolute Value
+def int_aarch64_neon_vabs :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty], [IntrNoMem]>;
+
+// Scalar Absolute Difference
+def int_aarch64_neon_vabd : Neon_2Arg_Intrinsic;
+
+// Scalar Negate Value
+def int_aarch64_neon_vneg :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty], [IntrNoMem]>;
+
+// Signed Saturating Doubling Multiply-Add Long
+def int_aarch64_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
+
+// Signed Saturating Doubling Multiply-Subtract Long
+def int_aarch64_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;
+
+class Neon_2Arg_ShiftImm_Intrinsic
+  : Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+class Neon_3Arg_ShiftImm_Intrinsic
+  : Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_v1i64_ty, llvm_i32_ty],
+              [IntrNoMem]>;
+
+// Scalar Shift Right (Immediate)
+def int_aarch64_neon_vshrds_n : Neon_2Arg_ShiftImm_Intrinsic;
+def int_aarch64_neon_vshrdu_n : Neon_2Arg_ShiftImm_Intrinsic;
+
+// Scalar Shift Right and Accumulate (Immediate)
+def int_aarch64_neon_vsrads_n : Neon_3Arg_ShiftImm_Intrinsic;
+def int_aarch64_neon_vsradu_n : Neon_3Arg_ShiftImm_Intrinsic;
+
+// Scalar Rounding Shift Right and Accumulate (Immediate)
+def int_aarch64_neon_vrsrads_n : Neon_3Arg_ShiftImm_Intrinsic;
+def int_aarch64_neon_vrsradu_n : Neon_3Arg_ShiftImm_Intrinsic;
+
+// Scalar Shift Left (Immediate)
+def int_aarch64_neon_vshld_n : Neon_2Arg_ShiftImm_Intrinsic;
+
+// Scalar Saturating Shift Left (Immediate)
+def int_aarch64_neon_vqshls_n : Neon_N2V_Intrinsic;
+def int_aarch64_neon_vqshlu_n : Neon_N2V_Intrinsic;
+
+// Scalar Signed Saturating Shift Left Unsigned (Immediate)
+def int_aarch64_neon_vqshlus_n : Neon_N2V_Intrinsic;
+
+// Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
+def int_aarch64_neon_vcvtf32_n_s32 :
+  Intrinsic<[llvm_float_ty], [llvm_v1i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vcvtf64_n_s64 :
+  Intrinsic<[llvm_double_ty], [llvm_v1i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
+def int_aarch64_neon_vcvtf32_n_u32 :
+  Intrinsic<[llvm_float_ty], [llvm_v1i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vcvtf64_n_u64 :
+  Intrinsic<[llvm_double_ty], [llvm_v1i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+// Scalar Floating-point Convert To Signed Fixed-point (Immediate)
+def int_aarch64_neon_vcvts_n_s32_f32 :
+  Intrinsic<[llvm_v1i32_ty], [llvm_v1f32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vcvtd_n_s64_f64 :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1f64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
+def int_aarch64_neon_vcvts_n_u32_f32 :
+  Intrinsic<[llvm_v1i32_ty], [llvm_v1f32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_aarch64_neon_vcvtd_n_u64_f64 :
+  Intrinsic<[llvm_v1i64_ty], [llvm_v1f64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+class Neon_SHA_Intrinsic
+  : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v1i32_ty, llvm_v4i32_ty],
+              [IntrNoMem]>;
+
+def int_aarch64_neon_sha1c : Neon_SHA_Intrinsic;
+def int_aarch64_neon_sha1m : Neon_SHA_Intrinsic;
+def int_aarch64_neon_sha1p : Neon_SHA_Intrinsic;
 }
diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td
index 3c5d5ff..0b50d64 100644
--- a/include/llvm/IR/IntrinsicsARM.td
+++ b/include/llvm/IR/IntrinsicsARM.td
@@ -45,6 +45,11 @@ def int_arm_strexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
 def int_arm_ldrexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty]>;
 
 //===----------------------------------------------------------------------===//
+// Data barrier instructions
+def int_arm_dmb : GCCBuiltin<"__builtin_arm_dmb">, Intrinsic<[], [llvm_i32_ty]>;
+def int_arm_dsb : GCCBuiltin<"__builtin_arm_dsb">, Intrinsic<[], [llvm_i32_ty]>;
+
+//===----------------------------------------------------------------------===//
 // VFP
 
 def int_arm_get_fpscr : GCCBuiltin<"__builtin_arm_get_fpscr">,
@@ -92,6 +97,26 @@ def int_arm_mcrr2 : GCCBuiltin<"__builtin_arm_mcrr2">,
                   llvm_i32_ty, llvm_i32_ty], []>;
 
 //===----------------------------------------------------------------------===//
+// CRC32
+
+def int_arm_crc32b  : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+    [IntrNoMem]>;
+def int_arm_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+    [IntrNoMem]>;
+def int_arm_crc32h  : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+    [IntrNoMem]>;
+def int_arm_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+    [IntrNoMem]>;
+def int_arm_crc32w  : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+    [IntrNoMem]>;
+def int_arm_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+    [IntrNoMem]>;
+
+//===----------------------------------------------------------------------===//
+// HINT
+def int_arm_sevl : Intrinsic<[], []>;
+
+//===----------------------------------------------------------------------===//
 // Advanced SIMD (NEON)
 
 // The following classes do not correspond directly to GCC builtins.
@@ -163,7 +188,6 @@ let Properties = [IntrNoMem, Commutative] in {
   def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic;
   def int_arm_neon_vqadds : Neon_2Arg_Intrinsic;
   def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic;
-  def int_arm_neon_vaddhn : Neon_2Arg_Narrow_Intrinsic;
   def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic;
 
   // Vector Multiply.
@@ -175,10 +199,6 @@ let Properties = [IntrNoMem, Commutative] in {
   def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
   def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;
 
-  // Vector Multiply and Accumulate/Subtract.
-  def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
-  def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;
-
   // Vector Maximum.
   def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic;
   def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic;
@@ -201,7 +221,6 @@ def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic;
 def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic;
 def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic;
 def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic;
-def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic;
 def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
 
 // Vector Absolute Compare.
@@ -451,4 +470,21 @@ def int_arm_neon_vbsl : Intrinsic<[llvm_anyvector_ty],
                         [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                         [IntrNoMem]>;
 
+
+// Crypto instructions
+def int_arm_neon_aesd : Neon_2Arg_Intrinsic;
+def int_arm_neon_aese : Neon_2Arg_Intrinsic;
+def int_arm_neon_aesimc : Neon_1Arg_Intrinsic;
+def int_arm_neon_aesmc : Neon_1Arg_Intrinsic;
+def int_arm_neon_sha1h : Neon_1Arg_Intrinsic;
+def int_arm_neon_sha1su1 : Neon_2Arg_Intrinsic;
+def int_arm_neon_sha256su0 : Neon_2Arg_Intrinsic;
+def int_arm_neon_sha1c : Neon_3Arg_Intrinsic;
+def int_arm_neon_sha1m : Neon_3Arg_Intrinsic;
+def int_arm_neon_sha1p : Neon_3Arg_Intrinsic;
+def int_arm_neon_sha1su0: Neon_3Arg_Intrinsic;
+def int_arm_neon_sha256h: Neon_3Arg_Intrinsic;
+def int_arm_neon_sha256h2: Neon_3Arg_Intrinsic;
+def int_arm_neon_sha256su1: Neon_3Arg_Intrinsic;
+
 } // end TargetPrefix
diff --git a/include/llvm/IR/IntrinsicsMips.td b/include/llvm/IR/IntrinsicsMips.td
index a0987c8..42c5821 100644
--- a/include/llvm/IR/IntrinsicsMips.td
+++ b/include/llvm/IR/IntrinsicsMips.td
@@ -386,4 +386,1372 @@ def int_mips_subuh_qb: GCCBuiltin<"__builtin_mips_subuh_qb">,
   Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>;
 def int_mips_subuh_r_qb: GCCBuiltin<"__builtin_mips_subuh_r_qb">,
   Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>;
+
+//===----------------------------------------------------------------------===//
+// MIPS MSA
+
+//===----------------------------------------------------------------------===//
+// Addition/subtraction
+
+def int_mips_add_a_b : GCCBuiltin<"__builtin_msa_add_a_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_add_a_h : GCCBuiltin<"__builtin_msa_add_a_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_add_a_w : GCCBuiltin<"__builtin_msa_add_a_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_add_a_d : GCCBuiltin<"__builtin_msa_add_a_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+  [Commutative, IntrNoMem]>;
+
+def int_mips_adds_a_b : GCCBuiltin<"__builtin_msa_adds_a_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_adds_a_h : GCCBuiltin<"__builtin_msa_adds_a_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_adds_a_w : GCCBuiltin<"__builtin_msa_adds_a_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_adds_a_d : GCCBuiltin<"__builtin_msa_adds_a_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+  [Commutative, IntrNoMem]>;
+
+def int_mips_adds_s_b : GCCBuiltin<"__builtin_msa_adds_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_adds_s_h : GCCBuiltin<"__builtin_msa_adds_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_adds_s_w : GCCBuiltin<"__builtin_msa_adds_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_adds_s_d : GCCBuiltin<"__builtin_msa_adds_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+  [Commutative, IntrNoMem]>;
+
+def int_mips_adds_u_b : GCCBuiltin<"__builtin_msa_adds_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_adds_u_h : GCCBuiltin<"__builtin_msa_adds_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_adds_u_w : GCCBuiltin<"__builtin_msa_adds_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_adds_u_d : GCCBuiltin<"__builtin_msa_adds_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+  [Commutative, IntrNoMem]>;
+
+def int_mips_addv_b : GCCBuiltin<"__builtin_msa_addv_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_addv_h : GCCBuiltin<"__builtin_msa_addv_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_addv_w : GCCBuiltin<"__builtin_msa_addv_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_addv_d : GCCBuiltin<"__builtin_msa_addv_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+  [Commutative, IntrNoMem]>;
+
+def int_mips_addvi_b : GCCBuiltin<"__builtin_msa_addvi_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_addvi_h : GCCBuiltin<"__builtin_msa_addvi_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_addvi_w : GCCBuiltin<"__builtin_msa_addvi_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_addvi_d : GCCBuiltin<"__builtin_msa_addvi_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty],
+  [Commutative, IntrNoMem]>;
+
+def int_mips_and_v : GCCBuiltin<"__builtin_msa_and_v">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+
+def int_mips_andi_b : GCCBuiltin<"__builtin_msa_andi_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_asub_s_b : GCCBuiltin<"__builtin_msa_asub_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_asub_s_h : GCCBuiltin<"__builtin_msa_asub_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_asub_s_w : GCCBuiltin<"__builtin_msa_asub_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_asub_s_d : GCCBuiltin<"__builtin_msa_asub_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_asub_u_b : GCCBuiltin<"__builtin_msa_asub_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_asub_u_h : GCCBuiltin<"__builtin_msa_asub_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_asub_u_w : GCCBuiltin<"__builtin_msa_asub_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_asub_u_d : GCCBuiltin<"__builtin_msa_asub_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_ave_s_b : GCCBuiltin<"__builtin_msa_ave_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_ave_s_h : GCCBuiltin<"__builtin_msa_ave_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_ave_s_w : GCCBuiltin<"__builtin_msa_ave_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_ave_s_d : GCCBuiltin<"__builtin_msa_ave_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+  [Commutative, IntrNoMem]>;
+
+def int_mips_ave_u_b : GCCBuiltin<"__builtin_msa_ave_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_ave_u_h : GCCBuiltin<"__builtin_msa_ave_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_ave_u_w : GCCBuiltin<"__builtin_msa_ave_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_ave_u_d : GCCBuiltin<"__builtin_msa_ave_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+  [Commutative, IntrNoMem]>;
+
+def int_mips_aver_s_b : GCCBuiltin<"__builtin_msa_aver_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_aver_s_h : GCCBuiltin<"__builtin_msa_aver_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_aver_s_w : GCCBuiltin<"__builtin_msa_aver_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_aver_s_d : GCCBuiltin<"__builtin_msa_aver_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+  [Commutative, IntrNoMem]>;
+
+def int_mips_aver_u_b : GCCBuiltin<"__builtin_msa_aver_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_aver_u_h : GCCBuiltin<"__builtin_msa_aver_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_aver_u_w : GCCBuiltin<"__builtin_msa_aver_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
+  [Commutative, IntrNoMem]>;
+def int_mips_aver_u_d : GCCBuiltin<"__builtin_msa_aver_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
+  [Commutative, IntrNoMem]>;
+
+def int_mips_bclr_b : GCCBuiltin<"__builtin_msa_bclr_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_bclr_h : GCCBuiltin<"__builtin_msa_bclr_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_bclr_w : GCCBuiltin<"__builtin_msa_bclr_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_bclr_d : GCCBuiltin<"__builtin_msa_bclr_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_bclri_b : GCCBuiltin<"__builtin_msa_bclri_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_bclri_h : GCCBuiltin<"__builtin_msa_bclri_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_bclri_w : GCCBuiltin<"__builtin_msa_bclri_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_bclri_d : GCCBuiltin<"__builtin_msa_bclri_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_binsl_b : GCCBuiltin<"__builtin_msa_binsl_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+            [IntrNoMem]>;
+def int_mips_binsl_h : GCCBuiltin<"__builtin_msa_binsl_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+            [IntrNoMem]>;
+def int_mips_binsl_w : GCCBuiltin<"__builtin_msa_binsl_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+            [IntrNoMem]>;
+def int_mips_binsl_d : GCCBuiltin<"__builtin_msa_binsl_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
+            [IntrNoMem]>;
+
+def int_mips_binsli_b : GCCBuiltin<"__builtin_msa_binsli_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+def int_mips_binsli_h : GCCBuiltin<"__builtin_msa_binsli_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+def int_mips_binsli_w : GCCBuiltin<"__builtin_msa_binsli_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+def int_mips_binsli_d : GCCBuiltin<"__builtin_msa_binsli_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+
+def int_mips_binsr_b : GCCBuiltin<"__builtin_msa_binsr_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+            [IntrNoMem]>;
+def int_mips_binsr_h : GCCBuiltin<"__builtin_msa_binsr_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+            [IntrNoMem]>;
+def int_mips_binsr_w : GCCBuiltin<"__builtin_msa_binsr_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+            [IntrNoMem]>;
+def int_mips_binsr_d : GCCBuiltin<"__builtin_msa_binsr_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
+            [IntrNoMem]>;
+
+def int_mips_binsri_b : GCCBuiltin<"__builtin_msa_binsri_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+def int_mips_binsri_h : GCCBuiltin<"__builtin_msa_binsri_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+def int_mips_binsri_w : GCCBuiltin<"__builtin_msa_binsri_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+def int_mips_binsri_d : GCCBuiltin<"__builtin_msa_binsri_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+
+def int_mips_bmnz_v : GCCBuiltin<"__builtin_msa_bmnz_v">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+            [IntrNoMem]>;
+
+def int_mips_bmnzi_b : GCCBuiltin<"__builtin_msa_bmnzi_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+
+def int_mips_bmz_v : GCCBuiltin<"__builtin_msa_bmz_v">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+            [IntrNoMem]>;
+
+def int_mips_bmzi_b : GCCBuiltin<"__builtin_msa_bmzi_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+
+def int_mips_bneg_b : GCCBuiltin<"__builtin_msa_bneg_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_bneg_h : GCCBuiltin<"__builtin_msa_bneg_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_bneg_w : GCCBuiltin<"__builtin_msa_bneg_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_bneg_d : GCCBuiltin<"__builtin_msa_bneg_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_bnegi_b : GCCBuiltin<"__builtin_msa_bnegi_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_bnegi_h : GCCBuiltin<"__builtin_msa_bnegi_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_bnegi_w : GCCBuiltin<"__builtin_msa_bnegi_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_bnegi_d : GCCBuiltin<"__builtin_msa_bnegi_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_bnz_b : GCCBuiltin<"__builtin_msa_bnz_b">,
+  Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_bnz_h : GCCBuiltin<"__builtin_msa_bnz_h">,
+  Intrinsic<[llvm_i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_bnz_w : GCCBuiltin<"__builtin_msa_bnz_w">,
+  Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_bnz_d : GCCBuiltin<"__builtin_msa_bnz_d">,
+  Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_bnz_v : GCCBuiltin<"__builtin_msa_bnz_v">,
+  Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
+
+def int_mips_bsel_v : GCCBuiltin<"__builtin_msa_bsel_v">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+            [IntrNoMem]>;
+
+def int_mips_bseli_b : GCCBuiltin<"__builtin_msa_bseli_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+
+def int_mips_bset_b : GCCBuiltin<"__builtin_msa_bset_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_bset_h : GCCBuiltin<"__builtin_msa_bset_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_bset_w : GCCBuiltin<"__builtin_msa_bset_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_bset_d : GCCBuiltin<"__builtin_msa_bset_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_bseti_b : GCCBuiltin<"__builtin_msa_bseti_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_bseti_h : GCCBuiltin<"__builtin_msa_bseti_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_bseti_w : GCCBuiltin<"__builtin_msa_bseti_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_bseti_d : GCCBuiltin<"__builtin_msa_bseti_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_bz_b : GCCBuiltin<"__builtin_msa_bz_b">,
+  Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_bz_h : GCCBuiltin<"__builtin_msa_bz_h">,
+  Intrinsic<[llvm_i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_bz_w : GCCBuiltin<"__builtin_msa_bz_w">,
+  Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_bz_d : GCCBuiltin<"__builtin_msa_bz_d">,
+  Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_bz_v : GCCBuiltin<"__builtin_msa_bz_v">,
+  Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
+
+def int_mips_ceq_b : GCCBuiltin<"__builtin_msa_ceq_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_ceq_h : GCCBuiltin<"__builtin_msa_ceq_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_ceq_w : GCCBuiltin<"__builtin_msa_ceq_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_ceq_d : GCCBuiltin<"__builtin_msa_ceq_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_ceqi_b : GCCBuiltin<"__builtin_msa_ceqi_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_ceqi_h : GCCBuiltin<"__builtin_msa_ceqi_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_ceqi_w : GCCBuiltin<"__builtin_msa_ceqi_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_ceqi_d : GCCBuiltin<"__builtin_msa_ceqi_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_cfcmsa : GCCBuiltin<"__builtin_msa_cfcmsa">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
+
+def int_mips_cle_s_b : GCCBuiltin<"__builtin_msa_cle_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_cle_s_h : GCCBuiltin<"__builtin_msa_cle_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_cle_s_w : GCCBuiltin<"__builtin_msa_cle_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_cle_s_d : GCCBuiltin<"__builtin_msa_cle_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_cle_u_b : GCCBuiltin<"__builtin_msa_cle_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_cle_u_h : GCCBuiltin<"__builtin_msa_cle_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_cle_u_w : GCCBuiltin<"__builtin_msa_cle_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_cle_u_d : GCCBuiltin<"__builtin_msa_cle_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_clei_s_b : GCCBuiltin<"__builtin_msa_clei_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clei_s_h : GCCBuiltin<"__builtin_msa_clei_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clei_s_w : GCCBuiltin<"__builtin_msa_clei_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clei_s_d : GCCBuiltin<"__builtin_msa_clei_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_clei_u_b : GCCBuiltin<"__builtin_msa_clei_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clei_u_h : GCCBuiltin<"__builtin_msa_clei_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clei_u_w : GCCBuiltin<"__builtin_msa_clei_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clei_u_d : GCCBuiltin<"__builtin_msa_clei_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_clt_s_b : GCCBuiltin<"__builtin_msa_clt_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_clt_s_h : GCCBuiltin<"__builtin_msa_clt_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_clt_s_w : GCCBuiltin<"__builtin_msa_clt_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_clt_s_d : GCCBuiltin<"__builtin_msa_clt_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_clt_u_b : GCCBuiltin<"__builtin_msa_clt_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_clt_u_h : GCCBuiltin<"__builtin_msa_clt_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_clt_u_w : GCCBuiltin<"__builtin_msa_clt_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_clt_u_d : GCCBuiltin<"__builtin_msa_clt_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_clti_s_b : GCCBuiltin<"__builtin_msa_clti_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clti_s_h : GCCBuiltin<"__builtin_msa_clti_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clti_s_w : GCCBuiltin<"__builtin_msa_clti_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clti_s_d : GCCBuiltin<"__builtin_msa_clti_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_clti_u_b : GCCBuiltin<"__builtin_msa_clti_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clti_u_h : GCCBuiltin<"__builtin_msa_clti_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clti_u_w : GCCBuiltin<"__builtin_msa_clti_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_clti_u_d : GCCBuiltin<"__builtin_msa_clti_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_copy_s_b : GCCBuiltin<"__builtin_msa_copy_s_b">,
+  Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_copy_s_h : GCCBuiltin<"__builtin_msa_copy_s_h">,
+  Intrinsic<[llvm_i32_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_copy_s_w : GCCBuiltin<"__builtin_msa_copy_s_w">,
+  Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_copy_s_d : GCCBuiltin<"__builtin_msa_copy_s_d">,
+  Intrinsic<[llvm_i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_copy_u_b : GCCBuiltin<"__builtin_msa_copy_u_b">,
+  Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_copy_u_h : GCCBuiltin<"__builtin_msa_copy_u_h">,
+  Intrinsic<[llvm_i32_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_copy_u_w : GCCBuiltin<"__builtin_msa_copy_u_w">,
+  Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_copy_u_d : GCCBuiltin<"__builtin_msa_copy_u_d">,
+  Intrinsic<[llvm_i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_ctcmsa : GCCBuiltin<"__builtin_msa_ctcmsa">,
+  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], []>;
+
+def int_mips_div_s_b : GCCBuiltin<"__builtin_msa_div_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_div_s_h : GCCBuiltin<"__builtin_msa_div_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_div_s_w : GCCBuiltin<"__builtin_msa_div_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_div_s_d : GCCBuiltin<"__builtin_msa_div_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_div_u_b : GCCBuiltin<"__builtin_msa_div_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_div_u_h : GCCBuiltin<"__builtin_msa_div_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_div_u_w : GCCBuiltin<"__builtin_msa_div_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_div_u_d : GCCBuiltin<"__builtin_msa_div_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_dotp_s_h : GCCBuiltin<"__builtin_msa_dotp_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_dotp_s_w : GCCBuiltin<"__builtin_msa_dotp_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_dotp_s_d : GCCBuiltin<"__builtin_msa_dotp_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+
+def int_mips_dotp_u_h : GCCBuiltin<"__builtin_msa_dotp_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_dotp_u_w : GCCBuiltin<"__builtin_msa_dotp_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_dotp_u_d : GCCBuiltin<"__builtin_msa_dotp_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+
+def int_mips_dpadd_s_h : GCCBuiltin<"__builtin_msa_dpadd_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+  [IntrNoMem]>;
+def int_mips_dpadd_s_w : GCCBuiltin<"__builtin_msa_dpadd_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+  [IntrNoMem]>;
+def int_mips_dpadd_s_d : GCCBuiltin<"__builtin_msa_dpadd_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+  [IntrNoMem]>;
+
+def int_mips_dpadd_u_h : GCCBuiltin<"__builtin_msa_dpadd_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+  [IntrNoMem]>;
+def int_mips_dpadd_u_w : GCCBuiltin<"__builtin_msa_dpadd_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+  [IntrNoMem]>;
+def int_mips_dpadd_u_d : GCCBuiltin<"__builtin_msa_dpadd_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+  [IntrNoMem]>;
+
+def int_mips_dpsub_s_h : GCCBuiltin<"__builtin_msa_dpsub_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+  [IntrNoMem]>;
+def int_mips_dpsub_s_w : GCCBuiltin<"__builtin_msa_dpsub_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+  [IntrNoMem]>;
+def int_mips_dpsub_s_d : GCCBuiltin<"__builtin_msa_dpsub_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+  [IntrNoMem]>;
+
+def int_mips_dpsub_u_h : GCCBuiltin<"__builtin_msa_dpsub_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+  [IntrNoMem]>;
+def int_mips_dpsub_u_w : GCCBuiltin<"__builtin_msa_dpsub_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+  [IntrNoMem]>;
+def int_mips_dpsub_u_d : GCCBuiltin<"__builtin_msa_dpsub_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+  [IntrNoMem]>;
+
+def int_mips_fadd_w : GCCBuiltin<"__builtin_msa_fadd_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fadd_d : GCCBuiltin<"__builtin_msa_fadd_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fcaf_w : GCCBuiltin<"__builtin_msa_fcaf_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fcaf_d : GCCBuiltin<"__builtin_msa_fcaf_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fceq_w : GCCBuiltin<"__builtin_msa_fceq_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fceq_d : GCCBuiltin<"__builtin_msa_fceq_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fcle_w : GCCBuiltin<"__builtin_msa_fcle_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fcle_d : GCCBuiltin<"__builtin_msa_fcle_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fclt_w : GCCBuiltin<"__builtin_msa_fclt_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fclt_d : GCCBuiltin<"__builtin_msa_fclt_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fclass_w : GCCBuiltin<"__builtin_msa_fclass_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fclass_d : GCCBuiltin<"__builtin_msa_fclass_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fcne_w : GCCBuiltin<"__builtin_msa_fcne_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fcne_d : GCCBuiltin<"__builtin_msa_fcne_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fcor_w : GCCBuiltin<"__builtin_msa_fcor_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fcor_d : GCCBuiltin<"__builtin_msa_fcor_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fcueq_w : GCCBuiltin<"__builtin_msa_fcueq_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fcueq_d : GCCBuiltin<"__builtin_msa_fcueq_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fcule_w : GCCBuiltin<"__builtin_msa_fcule_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fcule_d : GCCBuiltin<"__builtin_msa_fcule_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fcult_w : GCCBuiltin<"__builtin_msa_fcult_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fcult_d : GCCBuiltin<"__builtin_msa_fcult_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fcun_w : GCCBuiltin<"__builtin_msa_fcun_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fcun_d : GCCBuiltin<"__builtin_msa_fcun_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fcune_w : GCCBuiltin<"__builtin_msa_fcune_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fcune_d : GCCBuiltin<"__builtin_msa_fcune_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fdiv_w : GCCBuiltin<"__builtin_msa_fdiv_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fdiv_d : GCCBuiltin<"__builtin_msa_fdiv_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fexdo_h : GCCBuiltin<"__builtin_msa_fexdo_h">,
+  Intrinsic<[llvm_v8f16_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fexdo_w : GCCBuiltin<"__builtin_msa_fexdo_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fexp2_w : GCCBuiltin<"__builtin_msa_fexp2_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_fexp2_d : GCCBuiltin<"__builtin_msa_fexp2_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_fexupl_w : GCCBuiltin<"__builtin_msa_fexupl_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v8f16_ty], [IntrNoMem]>;
+def int_mips_fexupl_d : GCCBuiltin<"__builtin_msa_fexupl_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+
+def int_mips_fexupr_w : GCCBuiltin<"__builtin_msa_fexupr_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v8f16_ty], [IntrNoMem]>;
+def int_mips_fexupr_d : GCCBuiltin<"__builtin_msa_fexupr_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+
+def int_mips_ffint_s_w : GCCBuiltin<"__builtin_msa_ffint_s_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_ffint_s_d : GCCBuiltin<"__builtin_msa_ffint_s_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_ffint_u_w : GCCBuiltin<"__builtin_msa_ffint_u_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_ffint_u_d : GCCBuiltin<"__builtin_msa_ffint_u_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_ffql_w : GCCBuiltin<"__builtin_msa_ffql_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_ffql_d : GCCBuiltin<"__builtin_msa_ffql_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+
+def int_mips_ffqr_w : GCCBuiltin<"__builtin_msa_ffqr_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_ffqr_d : GCCBuiltin<"__builtin_msa_ffqr_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+
+def int_mips_fill_b : GCCBuiltin<"__builtin_msa_fill_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_mips_fill_h : GCCBuiltin<"__builtin_msa_fill_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_mips_fill_w : GCCBuiltin<"__builtin_msa_fill_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_mips_fill_d : GCCBuiltin<"__builtin_msa_fill_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_i64_ty], [IntrNoMem]>;
+
+def int_mips_flog2_w : GCCBuiltin<"__builtin_msa_flog2_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_flog2_d : GCCBuiltin<"__builtin_msa_flog2_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fmadd_w : GCCBuiltin<"__builtin_msa_fmadd_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+            [IntrNoMem]>;
+def int_mips_fmadd_d : GCCBuiltin<"__builtin_msa_fmadd_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+            [IntrNoMem]>;
+
+def int_mips_fmax_w : GCCBuiltin<"__builtin_msa_fmax_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fmax_d : GCCBuiltin<"__builtin_msa_fmax_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fmax_a_w : GCCBuiltin<"__builtin_msa_fmax_a_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fmax_a_d : GCCBuiltin<"__builtin_msa_fmax_a_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fmin_w : GCCBuiltin<"__builtin_msa_fmin_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fmin_d : GCCBuiltin<"__builtin_msa_fmin_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fmin_a_w : GCCBuiltin<"__builtin_msa_fmin_a_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fmin_a_d : GCCBuiltin<"__builtin_msa_fmin_a_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fmsub_w : GCCBuiltin<"__builtin_msa_fmsub_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+            [IntrNoMem]>;
+def int_mips_fmsub_d : GCCBuiltin<"__builtin_msa_fmsub_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+            [IntrNoMem]>;
+
+def int_mips_fmul_w : GCCBuiltin<"__builtin_msa_fmul_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fmul_d : GCCBuiltin<"__builtin_msa_fmul_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_frint_w : GCCBuiltin<"__builtin_msa_frint_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_frint_d : GCCBuiltin<"__builtin_msa_frint_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_frcp_w : GCCBuiltin<"__builtin_msa_frcp_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_frcp_d : GCCBuiltin<"__builtin_msa_frcp_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_frsqrt_w : GCCBuiltin<"__builtin_msa_frsqrt_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_frsqrt_d : GCCBuiltin<"__builtin_msa_frsqrt_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsaf_w : GCCBuiltin<"__builtin_msa_fsaf_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsaf_d : GCCBuiltin<"__builtin_msa_fsaf_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fseq_w : GCCBuiltin<"__builtin_msa_fseq_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fseq_d : GCCBuiltin<"__builtin_msa_fseq_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsle_w : GCCBuiltin<"__builtin_msa_fsle_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsle_d : GCCBuiltin<"__builtin_msa_fsle_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fslt_w : GCCBuiltin<"__builtin_msa_fslt_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fslt_d : GCCBuiltin<"__builtin_msa_fslt_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsne_w : GCCBuiltin<"__builtin_msa_fsne_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsne_d : GCCBuiltin<"__builtin_msa_fsne_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsor_w : GCCBuiltin<"__builtin_msa_fsor_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsor_d : GCCBuiltin<"__builtin_msa_fsor_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsqrt_w : GCCBuiltin<"__builtin_msa_fsqrt_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsqrt_d : GCCBuiltin<"__builtin_msa_fsqrt_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsub_w : GCCBuiltin<"__builtin_msa_fsub_w">,
+  Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsub_d : GCCBuiltin<"__builtin_msa_fsub_d">,
+  Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsueq_w : GCCBuiltin<"__builtin_msa_fsueq_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsueq_d : GCCBuiltin<"__builtin_msa_fsueq_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsule_w : GCCBuiltin<"__builtin_msa_fsule_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsule_d : GCCBuiltin<"__builtin_msa_fsule_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsult_w : GCCBuiltin<"__builtin_msa_fsult_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsult_d : GCCBuiltin<"__builtin_msa_fsult_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsun_w : GCCBuiltin<"__builtin_msa_fsun_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsun_d : GCCBuiltin<"__builtin_msa_fsun_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_fsune_w : GCCBuiltin<"__builtin_msa_fsune_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_fsune_d : GCCBuiltin<"__builtin_msa_fsune_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_ftint_s_w : GCCBuiltin<"__builtin_msa_ftint_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_ftint_s_d : GCCBuiltin<"__builtin_msa_ftint_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_ftint_u_w : GCCBuiltin<"__builtin_msa_ftint_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_ftint_u_d : GCCBuiltin<"__builtin_msa_ftint_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_ftq_h : GCCBuiltin<"__builtin_msa_ftq_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_ftq_w : GCCBuiltin<"__builtin_msa_ftq_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_ftrunc_s_w : GCCBuiltin<"__builtin_msa_ftrunc_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_ftrunc_s_d : GCCBuiltin<"__builtin_msa_ftrunc_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_ftrunc_u_w : GCCBuiltin<"__builtin_msa_ftrunc_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+def int_mips_ftrunc_u_d : GCCBuiltin<"__builtin_msa_ftrunc_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+def int_mips_hadd_s_h : GCCBuiltin<"__builtin_msa_hadd_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_hadd_s_w : GCCBuiltin<"__builtin_msa_hadd_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_hadd_s_d : GCCBuiltin<"__builtin_msa_hadd_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+
+def int_mips_hadd_u_h : GCCBuiltin<"__builtin_msa_hadd_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_hadd_u_w : GCCBuiltin<"__builtin_msa_hadd_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_hadd_u_d : GCCBuiltin<"__builtin_msa_hadd_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+
+def int_mips_hsub_s_h : GCCBuiltin<"__builtin_msa_hsub_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_hsub_s_w : GCCBuiltin<"__builtin_msa_hsub_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_hsub_s_d : GCCBuiltin<"__builtin_msa_hsub_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+
+def int_mips_hsub_u_h : GCCBuiltin<"__builtin_msa_hsub_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_hsub_u_w : GCCBuiltin<"__builtin_msa_hsub_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_hsub_u_d : GCCBuiltin<"__builtin_msa_hsub_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+
+def int_mips_ilvev_b : GCCBuiltin<"__builtin_msa_ilvev_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_ilvev_h : GCCBuiltin<"__builtin_msa_ilvev_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_ilvev_w : GCCBuiltin<"__builtin_msa_ilvev_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_ilvev_d : GCCBuiltin<"__builtin_msa_ilvev_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_ilvl_b : GCCBuiltin<"__builtin_msa_ilvl_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_ilvl_h : GCCBuiltin<"__builtin_msa_ilvl_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_ilvl_w : GCCBuiltin<"__builtin_msa_ilvl_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_ilvl_d : GCCBuiltin<"__builtin_msa_ilvl_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_ilvod_b : GCCBuiltin<"__builtin_msa_ilvod_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_ilvod_h : GCCBuiltin<"__builtin_msa_ilvod_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_ilvod_w : GCCBuiltin<"__builtin_msa_ilvod_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_ilvod_d : GCCBuiltin<"__builtin_msa_ilvod_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_ilvr_b : GCCBuiltin<"__builtin_msa_ilvr_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_ilvr_h : GCCBuiltin<"__builtin_msa_ilvr_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_ilvr_w : GCCBuiltin<"__builtin_msa_ilvr_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_ilvr_d : GCCBuiltin<"__builtin_msa_ilvr_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_insert_b : GCCBuiltin<"__builtin_msa_insert_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem]>;
+def int_mips_insert_h : GCCBuiltin<"__builtin_msa_insert_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem]>;
+def int_mips_insert_w : GCCBuiltin<"__builtin_msa_insert_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_i32_ty],
+  [IntrNoMem]>;
+def int_mips_insert_d : GCCBuiltin<"__builtin_msa_insert_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty, llvm_i64_ty],
+  [IntrNoMem]>;
+
+def int_mips_insve_b : GCCBuiltin<"__builtin_msa_insve_b">,
+  Intrinsic<[llvm_v16i8_ty],
+            [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty],
+            [IntrNoMem]>;
+def int_mips_insve_h : GCCBuiltin<"__builtin_msa_insve_h">,
+  Intrinsic<[llvm_v8i16_ty],
+            [llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty],
+            [IntrNoMem]>;
+def int_mips_insve_w : GCCBuiltin<"__builtin_msa_insve_w">,
+  Intrinsic<[llvm_v4i32_ty],
+            [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty],
+            [IntrNoMem]>;
+def int_mips_insve_d : GCCBuiltin<"__builtin_msa_insve_d">,
+  Intrinsic<[llvm_v2i64_ty],
+            [llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty],
+            [IntrNoMem]>;
+
+def int_mips_ld_b : GCCBuiltin<"__builtin_msa_ld_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_i32_ty],
+  [IntrReadArgMem]>;
+def int_mips_ld_h : GCCBuiltin<"__builtin_msa_ld_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_ptr_ty, llvm_i32_ty],
+  [IntrReadArgMem]>;
+def int_mips_ld_w : GCCBuiltin<"__builtin_msa_ld_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+  [IntrReadArgMem]>;
+def int_mips_ld_d : GCCBuiltin<"__builtin_msa_ld_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_i32_ty],
+  [IntrReadArgMem]>;
+
+def int_mips_ldi_b : GCCBuiltin<"__builtin_msa_ldi_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_mips_ldi_h : GCCBuiltin<"__builtin_msa_ldi_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_mips_ldi_w : GCCBuiltin<"__builtin_msa_ldi_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_mips_ldi_d : GCCBuiltin<"__builtin_msa_ldi_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+// This instruction is part of the MSA spec but it does not share the
+// __builtin_msa prefix because it operates on the GPR registers.
+def int_mips_lsa : GCCBuiltin<"__builtin_mips_lsa">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+            [IntrNoMem]>;
+
+def int_mips_madd_q_h : GCCBuiltin<"__builtin_msa_madd_q_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+  [IntrNoMem]>;
+def int_mips_madd_q_w : GCCBuiltin<"__builtin_msa_madd_q_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+  [IntrNoMem]>;
+
+def int_mips_maddr_q_h : GCCBuiltin<"__builtin_msa_maddr_q_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+  [IntrNoMem]>;
+def int_mips_maddr_q_w : GCCBuiltin<"__builtin_msa_maddr_q_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+  [IntrNoMem]>;
+
+def int_mips_maddv_b : GCCBuiltin<"__builtin_msa_maddv_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+  [IntrNoMem]>;
+def int_mips_maddv_h : GCCBuiltin<"__builtin_msa_maddv_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+  [IntrNoMem]>;
+def int_mips_maddv_w : GCCBuiltin<"__builtin_msa_maddv_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+  [IntrNoMem]>;
+def int_mips_maddv_d : GCCBuiltin<"__builtin_msa_maddv_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
+  [IntrNoMem]>;
+
+def int_mips_max_a_b : GCCBuiltin<"__builtin_msa_max_a_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_max_a_h : GCCBuiltin<"__builtin_msa_max_a_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_max_a_w : GCCBuiltin<"__builtin_msa_max_a_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_max_a_d : GCCBuiltin<"__builtin_msa_max_a_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_max_s_b : GCCBuiltin<"__builtin_msa_max_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_max_s_h : GCCBuiltin<"__builtin_msa_max_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_max_s_w : GCCBuiltin<"__builtin_msa_max_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_max_s_d : GCCBuiltin<"__builtin_msa_max_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_max_u_b : GCCBuiltin<"__builtin_msa_max_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_max_u_h : GCCBuiltin<"__builtin_msa_max_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_max_u_w : GCCBuiltin<"__builtin_msa_max_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_max_u_d : GCCBuiltin<"__builtin_msa_max_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_maxi_s_b : GCCBuiltin<"__builtin_msa_maxi_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_maxi_s_h : GCCBuiltin<"__builtin_msa_maxi_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_maxi_s_w : GCCBuiltin<"__builtin_msa_maxi_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_maxi_s_d : GCCBuiltin<"__builtin_msa_maxi_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_maxi_u_b : GCCBuiltin<"__builtin_msa_maxi_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_maxi_u_h : GCCBuiltin<"__builtin_msa_maxi_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_maxi_u_w : GCCBuiltin<"__builtin_msa_maxi_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_maxi_u_d : GCCBuiltin<"__builtin_msa_maxi_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_min_a_b : GCCBuiltin<"__builtin_msa_min_a_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_min_a_h : GCCBuiltin<"__builtin_msa_min_a_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_min_a_w : GCCBuiltin<"__builtin_msa_min_a_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_min_a_d : GCCBuiltin<"__builtin_msa_min_a_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_min_s_b : GCCBuiltin<"__builtin_msa_min_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_min_s_h : GCCBuiltin<"__builtin_msa_min_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_min_s_w : GCCBuiltin<"__builtin_msa_min_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_min_s_d : GCCBuiltin<"__builtin_msa_min_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_min_u_b : GCCBuiltin<"__builtin_msa_min_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_min_u_h : GCCBuiltin<"__builtin_msa_min_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_min_u_w : GCCBuiltin<"__builtin_msa_min_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_min_u_d : GCCBuiltin<"__builtin_msa_min_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_mini_s_b : GCCBuiltin<"__builtin_msa_mini_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_mini_s_h : GCCBuiltin<"__builtin_msa_mini_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_mini_s_w : GCCBuiltin<"__builtin_msa_mini_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_mini_s_d : GCCBuiltin<"__builtin_msa_mini_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_mini_u_b : GCCBuiltin<"__builtin_msa_mini_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_mini_u_h : GCCBuiltin<"__builtin_msa_mini_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_mini_u_w : GCCBuiltin<"__builtin_msa_mini_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_mini_u_d : GCCBuiltin<"__builtin_msa_mini_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_mod_s_b : GCCBuiltin<"__builtin_msa_mod_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_mod_s_h : GCCBuiltin<"__builtin_msa_mod_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_mod_s_w : GCCBuiltin<"__builtin_msa_mod_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_mod_s_d : GCCBuiltin<"__builtin_msa_mod_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_mod_u_b : GCCBuiltin<"__builtin_msa_mod_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_mod_u_h : GCCBuiltin<"__builtin_msa_mod_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_mod_u_w : GCCBuiltin<"__builtin_msa_mod_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_mod_u_d : GCCBuiltin<"__builtin_msa_mod_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_move_v : GCCBuiltin<"__builtin_msa_move_v">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
+
+def int_mips_msub_q_h : GCCBuiltin<"__builtin_msa_msub_q_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+  [IntrNoMem]>;
+def int_mips_msub_q_w : GCCBuiltin<"__builtin_msa_msub_q_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+  [IntrNoMem]>;
+
+def int_mips_msubr_q_h : GCCBuiltin<"__builtin_msa_msubr_q_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+  [IntrNoMem]>;
+def int_mips_msubr_q_w : GCCBuiltin<"__builtin_msa_msubr_q_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+  [IntrNoMem]>;
+
+def int_mips_msubv_b : GCCBuiltin<"__builtin_msa_msubv_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+  [IntrNoMem]>;
+def int_mips_msubv_h : GCCBuiltin<"__builtin_msa_msubv_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+  [IntrNoMem]>;
+def int_mips_msubv_w : GCCBuiltin<"__builtin_msa_msubv_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+  [IntrNoMem]>;
+def int_mips_msubv_d : GCCBuiltin<"__builtin_msa_msubv_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
+  [IntrNoMem]>;
+
+def int_mips_mul_q_h : GCCBuiltin<"__builtin_msa_mul_q_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_mul_q_w : GCCBuiltin<"__builtin_msa_mul_q_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+
+def int_mips_mulr_q_h : GCCBuiltin<"__builtin_msa_mulr_q_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_mulr_q_w : GCCBuiltin<"__builtin_msa_mulr_q_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+
+def int_mips_mulv_b : GCCBuiltin<"__builtin_msa_mulv_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_mulv_h : GCCBuiltin<"__builtin_msa_mulv_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_mulv_w : GCCBuiltin<"__builtin_msa_mulv_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_mulv_d : GCCBuiltin<"__builtin_msa_mulv_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_nloc_b : GCCBuiltin<"__builtin_msa_nloc_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_nloc_h : GCCBuiltin<"__builtin_msa_nloc_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_nloc_w : GCCBuiltin<"__builtin_msa_nloc_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_nloc_d : GCCBuiltin<"__builtin_msa_nloc_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_nlzc_b : GCCBuiltin<"__builtin_msa_nlzc_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_nlzc_h : GCCBuiltin<"__builtin_msa_nlzc_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_nlzc_w : GCCBuiltin<"__builtin_msa_nlzc_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_nlzc_d : GCCBuiltin<"__builtin_msa_nlzc_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_nor_v : GCCBuiltin<"__builtin_msa_nor_v">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+
+def int_mips_nori_b : GCCBuiltin<"__builtin_msa_nori_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_or_v : GCCBuiltin<"__builtin_msa_or_v">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+
+def int_mips_ori_b : GCCBuiltin<"__builtin_msa_ori_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_pckev_b : GCCBuiltin<"__builtin_msa_pckev_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_pckev_h : GCCBuiltin<"__builtin_msa_pckev_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_pckev_w : GCCBuiltin<"__builtin_msa_pckev_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_pckev_d : GCCBuiltin<"__builtin_msa_pckev_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_pckod_b : GCCBuiltin<"__builtin_msa_pckod_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_pckod_h : GCCBuiltin<"__builtin_msa_pckod_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_pckod_w : GCCBuiltin<"__builtin_msa_pckod_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_pckod_d : GCCBuiltin<"__builtin_msa_pckod_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_pcnt_b : GCCBuiltin<"__builtin_msa_pcnt_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_pcnt_h : GCCBuiltin<"__builtin_msa_pcnt_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_pcnt_w : GCCBuiltin<"__builtin_msa_pcnt_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_pcnt_d : GCCBuiltin<"__builtin_msa_pcnt_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_sat_s_b : GCCBuiltin<"__builtin_msa_sat_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sat_s_h : GCCBuiltin<"__builtin_msa_sat_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sat_s_w : GCCBuiltin<"__builtin_msa_sat_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sat_s_d : GCCBuiltin<"__builtin_msa_sat_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_sat_u_b : GCCBuiltin<"__builtin_msa_sat_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sat_u_h : GCCBuiltin<"__builtin_msa_sat_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sat_u_w : GCCBuiltin<"__builtin_msa_sat_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sat_u_d : GCCBuiltin<"__builtin_msa_sat_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_shf_b : GCCBuiltin<"__builtin_msa_shf_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_shf_h : GCCBuiltin<"__builtin_msa_shf_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_shf_w : GCCBuiltin<"__builtin_msa_shf_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_sld_b : GCCBuiltin<"__builtin_msa_sld_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sld_h : GCCBuiltin<"__builtin_msa_sld_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sld_w : GCCBuiltin<"__builtin_msa_sld_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sld_d : GCCBuiltin<"__builtin_msa_sld_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_sldi_b : GCCBuiltin<"__builtin_msa_sldi_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sldi_h : GCCBuiltin<"__builtin_msa_sldi_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sldi_w : GCCBuiltin<"__builtin_msa_sldi_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_sldi_d : GCCBuiltin<"__builtin_msa_sldi_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_sll_b : GCCBuiltin<"__builtin_msa_sll_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_sll_h : GCCBuiltin<"__builtin_msa_sll_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_sll_w : GCCBuiltin<"__builtin_msa_sll_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_sll_d : GCCBuiltin<"__builtin_msa_sll_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_slli_b : GCCBuiltin<"__builtin_msa_slli_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_slli_h : GCCBuiltin<"__builtin_msa_slli_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_slli_w : GCCBuiltin<"__builtin_msa_slli_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_slli_d : GCCBuiltin<"__builtin_msa_slli_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_splat_b : GCCBuiltin<"__builtin_msa_splat_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_splat_h : GCCBuiltin<"__builtin_msa_splat_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_splat_w : GCCBuiltin<"__builtin_msa_splat_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_splat_d : GCCBuiltin<"__builtin_msa_splat_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_splati_b : GCCBuiltin<"__builtin_msa_splati_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_splati_h : GCCBuiltin<"__builtin_msa_splati_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_splati_w : GCCBuiltin<"__builtin_msa_splati_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_splati_d : GCCBuiltin<"__builtin_msa_splati_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_sra_b : GCCBuiltin<"__builtin_msa_sra_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_sra_h : GCCBuiltin<"__builtin_msa_sra_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_sra_w : GCCBuiltin<"__builtin_msa_sra_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_sra_d : GCCBuiltin<"__builtin_msa_sra_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_srai_b : GCCBuiltin<"__builtin_msa_srai_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srai_h : GCCBuiltin<"__builtin_msa_srai_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srai_w : GCCBuiltin<"__builtin_msa_srai_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srai_d : GCCBuiltin<"__builtin_msa_srai_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_srar_b : GCCBuiltin<"__builtin_msa_srar_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_srar_h : GCCBuiltin<"__builtin_msa_srar_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_srar_w : GCCBuiltin<"__builtin_msa_srar_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_srar_d : GCCBuiltin<"__builtin_msa_srar_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_srari_b : GCCBuiltin<"__builtin_msa_srari_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srari_h : GCCBuiltin<"__builtin_msa_srari_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srari_w : GCCBuiltin<"__builtin_msa_srari_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srari_d : GCCBuiltin<"__builtin_msa_srari_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_srl_b : GCCBuiltin<"__builtin_msa_srl_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_srl_h : GCCBuiltin<"__builtin_msa_srl_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_srl_w : GCCBuiltin<"__builtin_msa_srl_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_srl_d : GCCBuiltin<"__builtin_msa_srl_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_srli_b : GCCBuiltin<"__builtin_msa_srli_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srli_h : GCCBuiltin<"__builtin_msa_srli_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srli_w : GCCBuiltin<"__builtin_msa_srli_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srli_d : GCCBuiltin<"__builtin_msa_srli_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_srlr_b : GCCBuiltin<"__builtin_msa_srlr_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_srlr_h : GCCBuiltin<"__builtin_msa_srlr_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_srlr_w : GCCBuiltin<"__builtin_msa_srlr_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_srlr_d : GCCBuiltin<"__builtin_msa_srlr_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_srlri_b : GCCBuiltin<"__builtin_msa_srlri_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srlri_h : GCCBuiltin<"__builtin_msa_srlri_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srlri_w : GCCBuiltin<"__builtin_msa_srlri_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_srlri_d : GCCBuiltin<"__builtin_msa_srlri_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_st_b : GCCBuiltin<"__builtin_msa_st_b">,
+  Intrinsic<[], [llvm_v16i8_ty, llvm_ptr_ty, llvm_i32_ty],
+  [IntrReadWriteArgMem]>;
+def int_mips_st_h : GCCBuiltin<"__builtin_msa_st_h">,
+  Intrinsic<[], [llvm_v8i16_ty, llvm_ptr_ty, llvm_i32_ty],
+  [IntrReadWriteArgMem]>;
+def int_mips_st_w : GCCBuiltin<"__builtin_msa_st_w">,
+  Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty, llvm_i32_ty],
+  [IntrReadWriteArgMem]>;
+def int_mips_st_d : GCCBuiltin<"__builtin_msa_st_d">,
+  Intrinsic<[], [llvm_v2i64_ty, llvm_ptr_ty, llvm_i32_ty],
+  [IntrReadWriteArgMem]>;
+
+def int_mips_subs_s_b : GCCBuiltin<"__builtin_msa_subs_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_subs_s_h : GCCBuiltin<"__builtin_msa_subs_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_subs_s_w : GCCBuiltin<"__builtin_msa_subs_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_subs_s_d : GCCBuiltin<"__builtin_msa_subs_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_subs_u_b : GCCBuiltin<"__builtin_msa_subs_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_subs_u_h : GCCBuiltin<"__builtin_msa_subs_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_subs_u_w : GCCBuiltin<"__builtin_msa_subs_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_subs_u_d : GCCBuiltin<"__builtin_msa_subs_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_subsus_u_b : GCCBuiltin<"__builtin_msa_subsus_u_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_subsus_u_h : GCCBuiltin<"__builtin_msa_subsus_u_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_subsus_u_w : GCCBuiltin<"__builtin_msa_subsus_u_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_subsus_u_d : GCCBuiltin<"__builtin_msa_subsus_u_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_subsuu_s_b : GCCBuiltin<"__builtin_msa_subsuu_s_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_subsuu_s_h : GCCBuiltin<"__builtin_msa_subsuu_s_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_subsuu_s_w : GCCBuiltin<"__builtin_msa_subsuu_s_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_subsuu_s_d : GCCBuiltin<"__builtin_msa_subsuu_s_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_subv_b : GCCBuiltin<"__builtin_msa_subv_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+def int_mips_subv_h : GCCBuiltin<"__builtin_msa_subv_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+def int_mips_subv_w : GCCBuiltin<"__builtin_msa_subv_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+def int_mips_subv_d : GCCBuiltin<"__builtin_msa_subv_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+
+def int_mips_subvi_b : GCCBuiltin<"__builtin_msa_subvi_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_subvi_h : GCCBuiltin<"__builtin_msa_subvi_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_subvi_w : GCCBuiltin<"__builtin_msa_subvi_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_subvi_d : GCCBuiltin<"__builtin_msa_subvi_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_vshf_b : GCCBuiltin<"__builtin_msa_vshf_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
+            [IntrNoMem]>;
+def int_mips_vshf_h : GCCBuiltin<"__builtin_msa_vshf_h">,
+  Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
+            [IntrNoMem]>;
+def int_mips_vshf_w : GCCBuiltin<"__builtin_msa_vshf_w">,
+  Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+            [IntrNoMem]>;
+def int_mips_vshf_d : GCCBuiltin<"__builtin_msa_vshf_d">,
+  Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
+            [IntrNoMem]>;
+
+def int_mips_xor_v : GCCBuiltin<"__builtin_msa_xor_v">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+
+def int_mips_xori_b : GCCBuiltin<"__builtin_msa_xori_b">,
+  Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
 }
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index c7675c2..4c5718f 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -206,6 +206,7 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_sse_cvtsi642ss : GCCBuiltin<"__builtin_ia32_cvtsi642ss">,
               Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
                          llvm_i64_ty], [IntrNoMem]>;
+
   def int_x86_sse_cvtps2pi : GCCBuiltin<"__builtin_ia32_cvtps2pi">,
               Intrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
   def int_x86_sse_cvttps2pi: GCCBuiltin<"__builtin_ia32_cvttps2pi">,
@@ -936,9 +937,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse42_crc32_32_32      : GCCBuiltin<"__builtin_ia32_crc32si">,
           Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
                     [IntrNoMem]>;
-  def int_x86_sse42_crc32_64_8       :
-          Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i8_ty],
-                    [IntrNoMem]>;
   def int_x86_sse42_crc32_64_64      : GCCBuiltin<"__builtin_ia32_crc32di">,
           Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
                     [IntrNoMem]>;
@@ -1635,7 +1633,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               GCCBuiltin<"__builtin_ia32_vbroadcastss_ps256">,
               Intrinsic<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
   def int_x86_avx2_vbroadcasti128 :
-              GCCBuiltin<"__builtin_ia32_vbroadcastsi256">,
               Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
   def int_x86_avx2_pbroadcastb_128 :
               GCCBuiltin<"__builtin_ia32_pbroadcastb128">,
@@ -1867,6 +1864,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_vfmadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfmaddps512">,
+              Intrinsic<[llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_vfmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddpd512">,
+              Intrinsic<[llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_vfmsub_ss : GCCBuiltin<"__builtin_ia32_vfmsubss">,
               Intrinsic<[llvm_v4f32_ty],
                         [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
@@ -1891,6 +1896,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_vfmsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfmsubps512">,
+              Intrinsic<[llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_vfmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubpd512">,
+              Intrinsic<[llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_vfnmadd_ss : GCCBuiltin<"__builtin_ia32_vfnmaddss">,
               Intrinsic<[llvm_v4f32_ty],
                         [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
@@ -1915,6 +1928,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_vfnmadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfnmaddps512">,
+              Intrinsic<[llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_vfnmadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmaddpd512">,
+              Intrinsic<[llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_vfnmsub_ss : GCCBuiltin<"__builtin_ia32_vfnmsubss">,
               Intrinsic<[llvm_v4f32_ty],
                         [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
@@ -1939,6 +1960,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_vfnmsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfnmsubps512">,
+              Intrinsic<[llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_vfnmsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfnmsubpd512">,
+              Intrinsic<[llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">,
               Intrinsic<[llvm_v4f32_ty],
                         [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
@@ -1957,6 +1986,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_vfmaddsub_ps_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubps512">,
+              Intrinsic<[llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_vfmaddsub_pd_512 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd512">,
+              Intrinsic<[llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [IntrNoMem]>;
   def int_x86_fma_vfmsubadd_ps : GCCBuiltin<"__builtin_ia32_vfmsubaddps">,
               Intrinsic<[llvm_v4f32_ty],
                         [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
@@ -1975,6 +2012,14 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_v4f64_ty],
                         [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
                         [IntrNoMem]>;
+  def int_x86_fma_vfmsubadd_ps_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddps512">,
+              Intrinsic<[llvm_v16f32_ty],
+                        [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                        [IntrNoMem]>;
+  def int_x86_fma_vfmsubadd_pd_512 : GCCBuiltin<"__builtin_ia32_vfmsubaddpd512">,
+              Intrinsic<[llvm_v8f64_ty],
+                        [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                        [IntrNoMem]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -2550,6 +2595,16 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 }
 
 //===----------------------------------------------------------------------===//
+// TBM
+
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_tbm_bextri_u32 : GCCBuiltin<"__builtin_ia32_bextri_u32">,
+        Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_tbm_bextri_u64 : GCCBuiltin<"__builtin_ia32_bextri_u64">,
+        Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
+}
+
+//===----------------------------------------------------------------------===//
 // RDRAND intrinsics - Return a random value and whether it is valid.
 // RDSEED intrinsics - Return a NIST SP800-90B & C compliant random value and
 // whether it is valid.
@@ -2578,8 +2633,11 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_xtest : GCCBuiltin<"__builtin_ia32_xtest">,
               Intrinsic<[llvm_i32_ty], [], []>;
 }
-// AVX-512
 
+//===----------------------------------------------------------------------===//
+// AVX512
+
+// Mask ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   // Mask instructions
   // 16-bit mask
@@ -2617,3 +2675,451 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
               Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
                         [IntrNoMem]>;
 }
+
+// Conversion ops
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx512_cvtss2usi : GCCBuiltin<"__builtin_ia32_cvtss2usi">,
+              Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvtss2usi64 : GCCBuiltin<"__builtin_ia32_cvtss2usi64">,
+              Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvttss2usi : GCCBuiltin<"__builtin_ia32_cvttss2usi">,
+              Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvttss2usi64 : GCCBuiltin<"__builtin_ia32_cvttss2usi64">,
+              Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvtusi2ss : GCCBuiltin<"__builtin_ia32_cvtusi2ss">,
+              Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvtusi642ss : GCCBuiltin<"__builtin_ia32_cvtusi642ss">,
+              Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
+                         llvm_i64_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_cvtsd2usi : GCCBuiltin<"__builtin_ia32_cvtsd2usi">,
+              Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvtsd2usi64 : GCCBuiltin<"__builtin_ia32_cvtsd2usi64">,
+              Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvttsd2usi : GCCBuiltin<"__builtin_ia32_cvttsd2usi">,
+              Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvttsd2usi64 : GCCBuiltin<"__builtin_ia32_cvttsd2usi64">,
+              Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvtusi2sd : GCCBuiltin<"__builtin_ia32_cvtusi2sd">,
+              Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvtusi642sd : GCCBuiltin<"__builtin_ia32_cvtusi642sd">,
+              Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
+                         llvm_i64_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_vcvtph2ps_512 : GCCBuiltin<"__builtin_ia32_vcvtph2ps512">,
+              Intrinsic<[llvm_v16f32_ty], [llvm_v16i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_vcvtps2ph_512 : GCCBuiltin<"__builtin_ia32_vcvtps2ph512">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty, llvm_i32_ty],
+                        [IntrNoMem]>;
+}
+
+// Vector convert
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx512_cvt_ps2dq_512 : GCCBuiltin<"__builtin_ia32_cvtps2dq512">,
+        Intrinsic<[llvm_v16i32_ty], [llvm_v16f32_ty], [IntrNoMem]>;
+  def int_x86_avx512_cvtdq2_ps_512 : GCCBuiltin<"__builtin_ia32_cvtdq2ps512">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty], [IntrNoMem]>;
+}
+
+// Vector load with broadcast
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx512_vbroadcast_ss_512 :
+        GCCBuiltin<"__builtin_ia32_vbroadcastss512">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_vbroadcast_ss_ps_512 :
+              GCCBuiltin<"__builtin_ia32_vbroadcastss_ps512">,
+              Intrinsic<[llvm_v16f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_vbroadcast_sd_512 :
+        GCCBuiltin<"__builtin_ia32_vbroadcastsd512">,
+        Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
+  def int_x86_avx512_vbroadcast_sd_pd_512 :
+              GCCBuiltin<"__builtin_ia32_vbroadcastsd_pd512">,
+              Intrinsic<[llvm_v8f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_pbroadcastd_512 :
+         GCCBuiltin<"__builtin_ia32_pbroadcastd512">,
+         Intrinsic<[llvm_v16i32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_pbroadcastd_i32_512 :
+         Intrinsic<[llvm_v16i32_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_pbroadcastq_512 :
+         GCCBuiltin<"__builtin_ia32_pbroadcastq512">,
+         Intrinsic<[llvm_v8i64_ty], [llvm_v2i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_pbroadcastq_i64_512 :
+         Intrinsic<[llvm_v8i64_ty], [llvm_i64_ty], [IntrNoMem]>;
+}
+
+// Vector sign and zero extend
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx512_pmovzxbq : GCCBuiltin<"__builtin_ia32_pmovzxbq512">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v16i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_pmovzxwd : GCCBuiltin<"__builtin_ia32_pmovzxwd512">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i16_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_pmovzxbd : GCCBuiltin<"__builtin_ia32_pmovzxbd512">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_pmovzxwq : GCCBuiltin<"__builtin_ia32_pmovzxwq512">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i16_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_pmovzxdq : GCCBuiltin<"__builtin_ia32_pmovzxdq512">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i32_ty],
+                        [IntrNoMem]>;
+}
+
+// Arithmetic ops
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx512_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512">,
+      Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty,
+                llvm_v16f32_ty], [IntrNoMem]>;
+  def int_x86_avx512_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512">,
+      Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty,
+                llvm_v8f64_ty], [IntrNoMem]>;
+  def int_x86_avx512_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512">,
+      Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty,
+                llvm_v16f32_ty], [IntrNoMem]>;
+  def int_x86_avx512_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512">,
+      Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty,
+                llvm_v8f64_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_pmaxu_d : GCCBuiltin<"__builtin_ia32_pmaxud512">,
+      Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                llvm_v16i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_pmaxu_q : GCCBuiltin<"__builtin_ia32_pmaxuq512">,
+      Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                llvm_v8i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_pmaxs_d : GCCBuiltin<"__builtin_ia32_pmaxsd512">,
+      Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                llvm_v16i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_pmaxs_q : GCCBuiltin<"__builtin_ia32_pmaxsq512">,
+      Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                llvm_v8i64_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_pminu_d : GCCBuiltin<"__builtin_ia32_pminud512">,
+      Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                llvm_v16i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_pminu_q : GCCBuiltin<"__builtin_ia32_pminuq512">,
+      Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                llvm_v8i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_pmins_d : GCCBuiltin<"__builtin_ia32_pminsd512">,
+      Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                llvm_v16i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_pmins_q : GCCBuiltin<"__builtin_ia32_pminsq512">,
+      Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                llvm_v8i64_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_rndscale_ss        : GCCBuiltin<"__builtin_ia32_rndscaless">,
+              Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_rndscale_sd        : GCCBuiltin<"__builtin_ia32_rndscalesd">,
+              Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_sqrt_ss        : GCCBuiltin<"__builtin_ia32_sqrtrndss">,
+              Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_sqrt_sd        : GCCBuiltin<"__builtin_ia32_sqrtrndsd">,
+              Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty],
+                        [IntrNoMem]>;
+
+  def int_x86_avx512_rndscale_ps_512        : GCCBuiltin<"__builtin_ia32_rndscaleps512">,
+              Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_rndscale_pd_512        : GCCBuiltin<"__builtin_ia32_rndscalepd512">,
+              Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+                         
+  def int_x86_avx512_sqrt_pd_512 : GCCBuiltin<"__builtin_ia32_sqrtpd512">,
+        Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty], [IntrNoMem]>;
+  def int_x86_avx512_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_rcp14_ps_512 : GCCBuiltin<"__builtin_ia32_rcp14ps512">,
+            Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rcp14_pd_512 : GCCBuiltin<"__builtin_ia32_rcp14pd512">,
+            Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rcp14_ss : GCCBuiltin<"__builtin_ia32_rcp14ss">,
+            Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rcp14_sd : GCCBuiltin<"__builtin_ia32_rcp14sd">,
+            Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rsqrt14_ps_512 : GCCBuiltin<"__builtin_ia32_rsqrt14ps512">,
+            Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rsqrt14_pd_512 : GCCBuiltin<"__builtin_ia32_rsqrt14pd512">,
+            Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss">,
+            Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rsqrt14_sd : GCCBuiltin<"__builtin_ia32_rsqrt14sd">,
+            Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
+                      [IntrNoMem]>;
+
+  def int_x86_avx512_rcp28_ps_512 : GCCBuiltin<"__builtin_ia32_rcp28ps512">,
+            Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rcp28_pd_512 : GCCBuiltin<"__builtin_ia32_rcp28pd512">,
+            Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss">,
+            Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rcp28_sd : GCCBuiltin<"__builtin_ia32_rcp28sd">,
+            Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rsqrt28_ps_512 : GCCBuiltin<"__builtin_ia32_rsqrt28ps512">,
+            Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rsqrt28_pd_512 : GCCBuiltin<"__builtin_ia32_rsqrt28pd512">,
+            Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rsqrt28_ss : GCCBuiltin<"__builtin_ia32_rsqrt28ss">,
+            Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_rsqrt28_sd : GCCBuiltin<"__builtin_ia32_rsqrt28sd">,
+            Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
+                      [IntrNoMem]>;
+}
+
+// Integer shift ops.
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx512_psll_dq : GCCBuiltin<"__builtin_ia32_pslldqi512">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_psrl_dq : GCCBuiltin<"__builtin_ia32_psrldqi512">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_psll_dq_bs : GCCBuiltin<"__builtin_ia32_pslldqi512_byteshift">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_psrl_dq_bs : GCCBuiltin<"__builtin_ia32_psrldqi512_byteshift">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+}
+
+// Gather and Scatter ops
+let TargetPrefix = "x86" in {
+  def int_x86_avx512_gather_dpd_mask_512  : GCCBuiltin<"__builtin_ia32_mask_gatherdpd512">,
+          Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i8_ty,
+                     llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty],
+                    [IntrReadMem]>;
+  def int_x86_avx512_gather_dps_mask_512  : GCCBuiltin<"__builtin_ia32_mask_gatherdps512">,
+          Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i16_ty,
+                     llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty],
+                    [IntrReadMem]>;
+  def int_x86_avx512_gather_qpd_mask_512  : GCCBuiltin<"__builtin_ia32_mask_gatherqpd512">,
+          Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i8_ty,
+                     llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
+                    [IntrReadMem]>;
+  def int_x86_avx512_gather_qps_mask_512  : GCCBuiltin<"__builtin_ia32_mask_gatherqps512">,
+          Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i8_ty,
+                     llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
+                    [IntrReadMem]>;
+
+  def int_x86_avx512_gather_dpd_512  : GCCBuiltin<"__builtin_ia32_gatherdpd512">,
+          Intrinsic<[llvm_v8f64_ty], [llvm_v8i32_ty, llvm_ptr_ty,
+                     llvm_i32_ty],
+                    [IntrReadMem]>;
+  def int_x86_avx512_gather_dps_512  : GCCBuiltin<"__builtin_ia32_gatherdps512">,
+          Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
+                     llvm_i32_ty],
+                    [IntrReadMem]>;
+  def int_x86_avx512_gather_qpd_512  : GCCBuiltin<"__builtin_ia32_gatherqpd512">,
+          Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
+                     llvm_i32_ty],
+                    [IntrReadArgMem]>;
+  def int_x86_avx512_gather_qps_512  : GCCBuiltin<"__builtin_ia32_gatherqps512">,
+          Intrinsic<[llvm_v8f32_ty], [llvm_v8i64_ty, llvm_ptr_ty, 
+                     llvm_i32_ty],
+                    [IntrReadMem]>;
+
+  def int_x86_avx512_gather_dpq_mask_512  : GCCBuiltin<"__builtin_ia32_mask_gatherdpq512">,
+          Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i8_ty,
+                     llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty],
+                    [IntrReadMem]>;
+  def int_x86_avx512_gather_dpi_mask_512  : GCCBuiltin<"__builtin_ia32_mask_gatherdpi512">,
+          Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_i16_ty,
+                     llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty],
+                    [IntrReadArgMem]>;
+  def int_x86_avx512_gather_qpq_mask_512  : GCCBuiltin<"__builtin_ia32_mask_gatherqpq512">,
+          Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i8_ty,
+                     llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
+                    [IntrReadArgMem]>;
+  def int_x86_avx512_gather_qpi_mask_512  : GCCBuiltin<"__builtin_ia32_mask_gatherqpi512">,
+          Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i8_ty,
+                     llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
+                    [IntrReadMem]>;
+
+  def int_x86_avx512_gather_dpq_512  : GCCBuiltin<"__builtin_ia32_gatherdpq512">,
+          Intrinsic<[llvm_v8i64_ty], [llvm_v8i32_ty, llvm_ptr_ty,
+                     llvm_i32_ty],
+                    [IntrReadArgMem]>;
+  def int_x86_avx512_gather_dpi_512  : GCCBuiltin<"__builtin_ia32_gatherdpi512">,
+          Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty, 
+                     llvm_i32_ty],
+                    [IntrReadArgMem]>;
+  def int_x86_avx512_gather_qpq_512  : GCCBuiltin<"__builtin_ia32_gatherqpq512">,
+          Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
+                     llvm_i32_ty],
+                    [IntrReadArgMem]>;
+  def int_x86_avx512_gather_qpi_512  : GCCBuiltin<"__builtin_ia32_gatherqpi512">,
+          Intrinsic<[llvm_v8i32_ty], [llvm_v8i64_ty, llvm_ptr_ty,
+                     llvm_i32_ty],
+                    [IntrReadArgMem]>;
+// scatter
+  def int_x86_avx512_scatter_dpd_mask_512  : GCCBuiltin<"__builtin_ia32_mask_scatterdpd512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
+                        llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+  def int_x86_avx512_scatter_dps_mask_512  : GCCBuiltin<"__builtin_ia32_mask_scatterdps512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty,
+                       llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+  def int_x86_avx512_scatter_qpd_mask_512  : GCCBuiltin<"__builtin_ia32_mask_scatterqpd512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
+                     llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+  def int_x86_avx512_scatter_qps_mask_512  : GCCBuiltin<"__builtin_ia32_mask_scatterqps512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
+                     llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_scatter_dpd_512  : GCCBuiltin<"__builtin_ia32_scatterdpd512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f64_ty, 
+                         llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+  def int_x86_avx512_scatter_dps_512  : GCCBuiltin<"__builtin_ia32_scatterdps512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_v16f32_ty, 
+                         llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+  def int_x86_avx512_scatter_qpd_512  : GCCBuiltin<"__builtin_ia32_scatterqpd512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8f64_ty, 
+                         llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+  def int_x86_avx512_scatter_qps_512  : GCCBuiltin<"__builtin_ia32_scatterqps512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8f32_ty, 
+                         llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_scatter_dpq_mask_512  : GCCBuiltin<"__builtin_ia32_mask_scatterdpq512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, 
+                         llvm_v8i64_ty, llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+  def int_x86_avx512_scatter_dpi_mask_512  : GCCBuiltin<"__builtin_ia32_mask_scatterdpi512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty,
+                     llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+  def int_x86_avx512_scatter_qpq_mask_512  : GCCBuiltin<"__builtin_ia32_mask_scatterqpq512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
+                     llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+  def int_x86_avx512_scatter_qpi_mask_512  : GCCBuiltin<"__builtin_ia32_mask_scatterqpi512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
+                     llvm_v8i64_ty, llvm_v8i32_ty, llvm_i32_ty],
+                    [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_scatter_dpq_512  : GCCBuiltin<"__builtin_ia32_scatterdpq512">,
+          Intrinsic<[], [llvm_ptr_ty,
+                         llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
+                    []>;
+  def int_x86_avx512_scatter_dpi_512  : GCCBuiltin<"__builtin_ia32_scatterdpi512">,
+          Intrinsic<[], [llvm_ptr_ty,
+                     llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
+                    []>;
+  def int_x86_avx512_scatter_qpq_512  : GCCBuiltin<"__builtin_ia32_scatterqpq512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i64_ty,
+                         llvm_i32_ty],
+                    []>;
+  def int_x86_avx512_scatter_qpi_512  : GCCBuiltin<"__builtin_ia32_scatterqpi512">,
+          Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i32_ty, 
+                         llvm_i32_ty],
+                    []>;
+}
+
+// AVX-512 conflict detection
+let TargetPrefix = "x86" in {
+  def int_x86_avx512_conflict_d_512 : GCCBuiltin<"__builtin_ia32_conflictd512">,
+          Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty],
+          []>;
+  def int_x86_avx512_conflict_d_mask_512 :
+          GCCBuiltin<"__builtin_ia32_mask_conflictd512">,
+          Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                    llvm_v16i1_ty, llvm_v16i32_ty],
+                    []>;
+  def int_x86_avx512_conflict_d_maskz_512:
+          GCCBuiltin<"__builtin_ia32_maskz_conflictd512">,
+          Intrinsic<[llvm_v16i32_ty], [llvm_v16i1_ty, llvm_v16i32_ty],
+                    []>;
+
+  def int_x86_avx512_conflict_q_512 : GCCBuiltin<"__builtin_ia32_conflictq512">,
+          Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty],
+          []>;
+  def int_x86_avx512_conflict_q_mask_512 :
+          GCCBuiltin<"__builtin_ia32_mask_conflictq512">,
+          Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
+                    llvm_v8i1_ty, llvm_v8i64_ty],
+                    []>;
+  def int_x86_avx512_conflict_q_maskz_512:
+          GCCBuiltin<"__builtin_ia32_maskz_conflictq512">,
+          Intrinsic<[llvm_v8i64_ty], [llvm_v8i1_ty, llvm_v8i64_ty],
+                    []>;
+}
+
+// Vector blend
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx512_mskblend_ps_512 : GCCBuiltin<"__builtin_ia32_mskblendps512">,
+        Intrinsic<[llvm_v16f32_ty],
+                  [llvm_v16i1_ty, llvm_v16f32_ty, llvm_v16f32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mskblend_pd_512 : GCCBuiltin<"__builtin_ia32_mskblendpd512">,
+        Intrinsic<[llvm_v8f64_ty],
+                  [llvm_v8i1_ty, llvm_v8f64_ty, llvm_v8f64_ty],
+                  [IntrNoMem]>;
+
+  def int_x86_avx512_mskblend_d_512 : GCCBuiltin<"__builtin_ia32_mskblendd512">,
+        Intrinsic<[llvm_v16i32_ty],
+                  [llvm_v16i1_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                  [IntrNoMem]>;
+  def int_x86_avx512_mskblend_q_512 : GCCBuiltin<"__builtin_ia32_mskblendq512">,
+        Intrinsic<[llvm_v8i64_ty],
+                  [llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8i64_ty],
+                  [IntrNoMem]>;
+}
+
+// Misc.
+let TargetPrefix = "x86" in {
+  def int_x86_avx512_cmpeq_pi_512 : GCCBuiltin<"__builtin_ia32_cmpeqpi512">,
+            Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
+                      [IntrNoMem]>;
+  def int_x86_avx512_and_pi : GCCBuiltin<"__builtin_ia32_andpi512">,
+            Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
+         [IntrNoMem]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SHA intrinsics
+let TargetPrefix = "x86" in {
+  def int_x86_sha1rnds4 : GCCBuiltin<"__builtin_ia32_sha1rnds4">,
+        Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
+                  [IntrNoMem]>;
+  def int_x86_sha1nexte : GCCBuiltin<"__builtin_ia32_sha1nexte">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sha1msg1 : GCCBuiltin<"__builtin_ia32_sha1msg1">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sha1msg2 : GCCBuiltin<"__builtin_ia32_sha1msg2">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sha256rnds2 : GCCBuiltin<"__builtin_ia32_sha256rnds2">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                [IntrNoMem]>;
+  def int_x86_sha256msg1 : GCCBuiltin<"__builtin_ia32_sha256msg1">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sha256msg2 : GCCBuiltin<"__builtin_ia32_sha256msg2">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+}
diff --git a/include/llvm/IR/LegacyPassManager.h b/include/llvm/IR/LegacyPassManager.h
new file mode 100644
index 0000000..fa1436e
--- /dev/null
+++ b/include/llvm/IR/LegacyPassManager.h
@@ -0,0 +1,111 @@
+//===- LegacyPassManager.h - Legacy Container for Passes --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the legacy PassManager class.  This class is used to hold,
+// maintain, and optimize execution of Passes.  The PassManager class ensures
+// that analysis results are available before a pass runs, and that Pass's are
+// destroyed when the PassManager is destroyed.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_LEGACYPASSMANAGER_H
+#define LLVM_IR_LEGACYPASSMANAGER_H
+
+#include "llvm/Pass.h"
+#include "llvm/Support/CBindingWrapping.h"
+
+namespace llvm {
+
+class Pass;
+class Module;
+
+namespace legacy {
+
+class PassManagerImpl;
+class FunctionPassManagerImpl;
+
+/// PassManagerBase - An abstract interface to allow code to add passes to
+/// a pass manager without having to hard-code what kind of pass manager
+/// it is.
+class PassManagerBase {
+public:
+  virtual ~PassManagerBase();
+
+  /// add - Add a pass to the queue of passes to run.  This passes ownership of
+  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+  /// will be destroyed as well, so there is no need to delete the pass.  This
+  /// implies that all passes MUST be allocated with 'new'.
+  virtual void add(Pass *P) = 0;
+};
+
+/// PassManager manages ModulePassManagers
+class PassManager : public PassManagerBase {
+public:
+
+  PassManager();
+  ~PassManager();
+
+  /// add - Add a pass to the queue of passes to run.  This passes ownership of
+  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+  /// will be destroyed as well, so there is no need to delete the pass.  This
+  /// implies that all passes MUST be allocated with 'new'.
+  void add(Pass *P);
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool run(Module &M);
+
+private:
+  /// PassManagerImpl_New is the actual class. PassManager is just the
+  /// wraper to publish simple pass manager interface
+  PassManagerImpl *PM;
+};
+
+/// FunctionPassManager manages FunctionPasses and BasicBlockPassManagers.
+class FunctionPassManager : public PassManagerBase {
+public:
+  /// FunctionPassManager ctor - This initializes the pass manager.  It needs,
+  /// but does not take ownership of, the specified Module.
+  explicit FunctionPassManager(Module *M);
+  ~FunctionPassManager();
+
+  /// add - Add a pass to the queue of passes to run.  This passes
+  /// ownership of the Pass to the PassManager.  When the
+  /// PassManager_X is destroyed, the pass will be destroyed as well, so
+  /// there is no need to delete the pass.
+  /// This implies that all passes MUST be allocated with 'new'.
+  void add(Pass *P);
+
+  /// run - Execute all of the passes scheduled for execution.  Keep
+  /// track of whether any of the passes modifies the function, and if
+  /// so, return true.
+  ///
+  bool run(Function &F);
+
+  /// doInitialization - Run all of the initializers for the function passes.
+  ///
+  bool doInitialization();
+
+  /// doFinalization - Run all of the finalizers for the function passes.
+  ///
+  bool doFinalization();
+
+private:
+  FunctionPassManagerImpl *FPM;
+  Module *M;
+};
+
+} // End legacy namespace
+
+// Create wrappers for C Binding types (see CBindingWrapping.h).
+DEFINE_STDCXX_CONVERSION_FUNCTIONS(legacy::PassManagerBase, LLVMPassManagerRef)
+
+} // End llvm namespace
+
+#endif
diff --git a/include/llvm/IR/LegacyPassManagers.h b/include/llvm/IR/LegacyPassManagers.h
new file mode 100644
index 0000000..d256a3e
--- /dev/null
+++ b/include/llvm/IR/LegacyPassManagers.h
@@ -0,0 +1,470 @@
+//===- LegacyPassManagers.h - Legacy Pass Infrastructure --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the LLVM Pass Manager infrastructure.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PASSMANAGERS_H
+#define LLVM_PASSMANAGERS_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Pass.h"
+#include <map>
+#include <vector>
+
+//===----------------------------------------------------------------------===//
+// Overview:
+// The Pass Manager Infrastructure manages passes. It's responsibilities are:
+//
+//   o Manage optimization pass execution order
+//   o Make required Analysis information available before pass P is run
+//   o Release memory occupied by dead passes
+//   o If Analysis information is dirtied by a pass then regenerate Analysis
+//     information before it is consumed by another pass.
+//
+// Pass Manager Infrastructure uses multiple pass managers.  They are
+// PassManager, FunctionPassManager, MPPassManager, FPPassManager, BBPassManager.
+// This class hierarchy uses multiple inheritance but pass managers do not
+// derive from another pass manager.
+//
+// PassManager and FunctionPassManager are two top-level pass manager that
+// represents the external interface of this entire pass manager infrastucture.
+//
+// Important classes :
+//
+// [o] class PMTopLevelManager;
+//
+// Two top level managers, PassManager and FunctionPassManager, derive from
+// PMTopLevelManager. PMTopLevelManager manages information used by top level
+// managers such as last user info.
+//
+// [o] class PMDataManager;
+//
+// PMDataManager manages information, e.g. list of available analysis info,
+// used by a pass manager to manage execution order of passes. It also provides
+// a place to implement common pass manager APIs. All pass managers derive from
+// PMDataManager.
+//
+// [o] class BBPassManager : public FunctionPass, public PMDataManager;
+//
+// BBPassManager manages BasicBlockPasses.
+//
+// [o] class FunctionPassManager;
+//
+// This is a external interface used by JIT to manage FunctionPasses. This
+// interface relies on FunctionPassManagerImpl to do all the tasks.
+//
+// [o] class FunctionPassManagerImpl : public ModulePass, PMDataManager,
+//                                     public PMTopLevelManager;
+//
+// FunctionPassManagerImpl is a top level manager. It manages FPPassManagers
+//
+// [o] class FPPassManager : public ModulePass, public PMDataManager;
+//
+// FPPassManager manages FunctionPasses and BBPassManagers
+//
+// [o] class MPPassManager : public Pass, public PMDataManager;
+//
+// MPPassManager manages ModulePasses and FPPassManagers
+//
+// [o] class PassManager;
+//
+// This is a external interface used by various tools to manages passes. It
+// relies on PassManagerImpl to do all the tasks.
+//
+// [o] class PassManagerImpl : public Pass, public PMDataManager,
+//                             public PMTopLevelManager
+//
+// PassManagerImpl is a top level pass manager responsible for managing
+// MPPassManagers.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/PrettyStackTrace.h"
+
+namespace llvm {
+  class Module;
+  class Pass;
+  class StringRef;
+  class Value;
+  class Timer;
+  class PMDataManager;
+
+// enums for debugging strings
+enum PassDebuggingString {
+  EXECUTION_MSG, // "Executing Pass '"
+  MODIFICATION_MSG, // "' Made Modification '"
+  FREEING_MSG, // " Freeing Pass '"
+  ON_BASICBLOCK_MSG, // "'  on BasicBlock '" + PassName + "'...\n"
+  ON_FUNCTION_MSG, // "' on Function '" + FunctionName + "'...\n"
+  ON_MODULE_MSG, // "' on Module '" + ModuleName + "'...\n"
+  ON_REGION_MSG, // " 'on Region ...\n'"
+  ON_LOOP_MSG, // " 'on Loop ...\n'"
+  ON_CG_MSG // "' on Call Graph ...\n'"
+};
+
+/// PassManagerPrettyStackEntry - This is used to print informative information
+/// about what pass is running when/if a stack trace is generated.
+class PassManagerPrettyStackEntry : public PrettyStackTraceEntry {
+  Pass *P;
+  Value *V;
+  Module *M;
+public:
+  explicit PassManagerPrettyStackEntry(Pass *p)
+    : P(p), V(0), M(0) {}  // When P is releaseMemory'd.
+  PassManagerPrettyStackEntry(Pass *p, Value &v)
+    : P(p), V(&v), M(0) {} // When P is run on V
+  PassManagerPrettyStackEntry(Pass *p, Module &m)
+    : P(p), V(0), M(&m) {} // When P is run on M
+
+  /// print - Emit information about this stack frame to OS.
+  virtual void print(raw_ostream &OS) const;
+};
+
+
+//===----------------------------------------------------------------------===//
+// PMStack
+//
+/// PMStack - This class implements a stack data structure of PMDataManager
+/// pointers.
+///
+/// Top level pass managers (see PassManager.cpp) maintain active Pass Managers
+/// using PMStack. Each Pass implements assignPassManager() to connect itself
+/// with appropriate manager. assignPassManager() walks PMStack to find
+/// suitable manager.
+class PMStack {
+public:
+  typedef std::vector<PMDataManager *>::const_reverse_iterator iterator;
+  iterator begin() const { return S.rbegin(); }
+  iterator end() const { return S.rend(); }
+
+  void pop();
+  PMDataManager *top() const { return S.back(); }
+  void push(PMDataManager *PM);
+  bool empty() const { return S.empty(); }
+
+  void dump() const;
+
+private:
+  std::vector<PMDataManager *> S;
+};
+
+
+//===----------------------------------------------------------------------===//
+// PMTopLevelManager
+//
+/// PMTopLevelManager manages LastUser info and collects common APIs used by
+/// top level pass managers.
+class PMTopLevelManager {
+protected:
+  explicit PMTopLevelManager(PMDataManager *PMDM);
+
+  unsigned getNumContainedManagers() const {
+    return (unsigned)PassManagers.size();
+  }
+
+  void initializeAllAnalysisInfo();
+
+private:
+  virtual PMDataManager *getAsPMDataManager() = 0;
+  virtual PassManagerType getTopLevelPassManagerType() = 0;
+
+public:
+  /// Schedule pass P for execution. Make sure that passes required by
+  /// P are run before P is run. Update analysis info maintained by
+  /// the manager. Remove dead passes. This is a recursive function.
+  void schedulePass(Pass *P);
+
+  /// Set pass P as the last user of the given analysis passes.
+  void setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P);
+
+  /// Collect passes whose last user is P
+  void collectLastUses(SmallVectorImpl<Pass *> &LastUses, Pass *P);
+
+  /// Find the pass that implements Analysis AID. Search immutable
+  /// passes and all pass managers. If desired pass is not found
+  /// then return NULL.
+  Pass *findAnalysisPass(AnalysisID AID);
+
+  /// Find analysis usage information for the pass P.
+  AnalysisUsage *findAnalysisUsage(Pass *P);
+
+  virtual ~PMTopLevelManager();
+
+  /// Add immutable pass and initialize it.
+  inline void addImmutablePass(ImmutablePass *P) {
+    P->initializePass();
+    ImmutablePasses.push_back(P);
+  }
+
+  inline SmallVectorImpl<ImmutablePass *>& getImmutablePasses() {
+    return ImmutablePasses;
+  }
+
+  void addPassManager(PMDataManager *Manager) {
+    PassManagers.push_back(Manager);
+  }
+
+  // Add Manager into the list of managers that are not directly
+  // maintained by this top level pass manager
+  inline void addIndirectPassManager(PMDataManager *Manager) {
+    IndirectPassManagers.push_back(Manager);
+  }
+
+  // Print passes managed by this top level manager.
+  void dumpPasses() const;
+  void dumpArguments() const;
+
+  // Active Pass Managers
+  PMStack activeStack;
+
+protected:
+
+  /// Collection of pass managers
+  SmallVector<PMDataManager *, 8> PassManagers;
+
+private:
+
+  /// Collection of pass managers that are not directly maintained
+  /// by this pass manager
+  SmallVector<PMDataManager *, 8> IndirectPassManagers;
+
+  // Map to keep track of last user of the analysis pass.
+  // LastUser->second is the last user of Lastuser->first.
+  DenseMap<Pass *, Pass *> LastUser;
+
+  // Map to keep track of passes that are last used by a pass.
+  // This inverse map is initialized at PM->run() based on
+  // LastUser map.
+  DenseMap<Pass *, SmallPtrSet<Pass *, 8> > InversedLastUser;
+
+  /// Immutable passes are managed by top level manager.
+  SmallVector<ImmutablePass *, 8> ImmutablePasses;
+
+  DenseMap<Pass *, AnalysisUsage *> AnUsageMap;
+};
+
+
+
+//===----------------------------------------------------------------------===//
+// PMDataManager
+
+/// PMDataManager provides the common place to manage the analysis data
+/// used by pass managers.
+class PMDataManager {
+public:
+
+  explicit PMDataManager() : TPM(NULL), Depth(0) {
+    initializeAnalysisInfo();
+  }
+
+  virtual ~PMDataManager();
+
+  virtual Pass *getAsPass() = 0;
+
+  /// Augment AvailableAnalysis by adding analysis made available by pass P.
+  void recordAvailableAnalysis(Pass *P);
+
+  /// verifyPreservedAnalysis -- Verify analysis presreved by pass P.
+  void verifyPreservedAnalysis(Pass *P);
+
+  /// Remove Analysis that is not preserved by the pass
+  void removeNotPreservedAnalysis(Pass *P);
+
+  /// Remove dead passes used by P.
+  void removeDeadPasses(Pass *P, StringRef Msg,
+                        enum PassDebuggingString);
+
+  /// Remove P.
+  void freePass(Pass *P, StringRef Msg,
+                enum PassDebuggingString);
+
+  /// Add pass P into the PassVector. Update
+  /// AvailableAnalysis appropriately if ProcessAnalysis is true.
+  void add(Pass *P, bool ProcessAnalysis = true);
+
+  /// Add RequiredPass into list of lower level passes required by pass P.
+  /// RequiredPass is run on the fly by Pass Manager when P requests it
+  /// through getAnalysis interface.
+  virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
+
+  virtual Pass *getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F);
+
+  /// Initialize available analysis information.
+  void initializeAnalysisInfo() {
+    AvailableAnalysis.clear();
+    for (unsigned i = 0; i < PMT_Last; ++i)
+      InheritedAnalysis[i] = NULL;
+  }
+
+  // Return true if P preserves high level analysis used by other
+  // passes that are managed by this manager.
+  bool preserveHigherLevelAnalysis(Pass *P);
+
+
+  /// Populate RequiredPasses with analysis pass that are required by
+  /// pass P and are available. Populate ReqPassNotAvailable with analysis
+  /// pass that are required by pass P but are not available.
+  void collectRequiredAnalysis(SmallVectorImpl<Pass *> &RequiredPasses,
+                               SmallVectorImpl<AnalysisID> &ReqPassNotAvailable,
+                               Pass *P);
+
+  /// All Required analyses should be available to the pass as it runs!  Here
+  /// we fill in the AnalysisImpls member of the pass so that it can
+  /// successfully use the getAnalysis() method to retrieve the
+  /// implementations it needs.
+  void initializeAnalysisImpl(Pass *P);
+
+  /// Find the pass that implements Analysis AID. If desired pass is not found
+  /// then return NULL.
+  Pass *findAnalysisPass(AnalysisID AID, bool Direction);
+
+  // Access toplevel manager
+  PMTopLevelManager *getTopLevelManager() { return TPM; }
+  void setTopLevelManager(PMTopLevelManager *T) { TPM = T; }
+
+  unsigned getDepth() const { return Depth; }
+  void setDepth(unsigned newDepth) { Depth = newDepth; }
+
+  // Print routines used by debug-pass
+  void dumpLastUses(Pass *P, unsigned Offset) const;
+  void dumpPassArguments() const;
+  void dumpPassInfo(Pass *P, enum PassDebuggingString S1,
+                    enum PassDebuggingString S2, StringRef Msg);
+  void dumpRequiredSet(const Pass *P) const;
+  void dumpPreservedSet(const Pass *P) const;
+
+  unsigned getNumContainedPasses() const {
+    return (unsigned)PassVector.size();
+  }
+
+  virtual PassManagerType getPassManagerType() const {
+    assert ( 0 && "Invalid use of getPassManagerType");
+    return PMT_Unknown;
+  }
+
+  DenseMap<AnalysisID, Pass*> *getAvailableAnalysis() {
+    return &AvailableAnalysis;
+  }
+
+  // Collect AvailableAnalysis from all the active Pass Managers.
+  void populateInheritedAnalysis(PMStack &PMS) {
+    unsigned Index = 0;
+    for (PMStack::iterator I = PMS.begin(), E = PMS.end();
+         I != E; ++I)
+      InheritedAnalysis[Index++] = (*I)->getAvailableAnalysis();
+  }
+
+protected:
+
+  // Top level manager.
+  PMTopLevelManager *TPM;
+
+  // Collection of pass that are managed by this manager
+  SmallVector<Pass *, 16> PassVector;
+
+  // Collection of Analysis provided by Parent pass manager and
+  // used by current pass manager. At at time there can not be more
+  // then PMT_Last active pass mangers.
+  DenseMap<AnalysisID, Pass *> *InheritedAnalysis[PMT_Last];
+
+  /// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions
+  /// or higher is specified.
+  bool isPassDebuggingExecutionsOrMore() const;
+
+private:
+  void dumpAnalysisUsage(StringRef Msg, const Pass *P,
+                         const AnalysisUsage::VectorType &Set) const;
+
+  // Set of available Analysis. This information is used while scheduling
+  // pass. If a pass requires an analysis which is not available then
+  // the required analysis pass is scheduled to run before the pass itself is
+  // scheduled to run.
+  DenseMap<AnalysisID, Pass*> AvailableAnalysis;
+
+  // Collection of higher level analysis used by the pass managed by
+  // this manager.
+  SmallVector<Pass *, 8> HigherLevelAnalysis;
+
+  unsigned Depth;
+};
+
+//===----------------------------------------------------------------------===//
+// FPPassManager
+//
+/// FPPassManager manages BBPassManagers and FunctionPasses.
+/// It batches all function passes and basic block pass managers together and
+/// sequence them to process one function at a time before processing next
+/// function.
+class FPPassManager : public ModulePass, public PMDataManager {
+public:
+  static char ID;
+  explicit FPPassManager()
+  : ModulePass(ID), PMDataManager() { }
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool runOnFunction(Function &F);
+  bool runOnModule(Module &M);
+
+  /// cleanup - After running all passes, clean up pass manager cache.
+  void cleanup();
+
+  /// doInitialization - Overrides ModulePass doInitialization for global
+  /// initialization tasks
+  ///
+  using ModulePass::doInitialization;
+
+  /// doInitialization - Run all of the initializers for the function passes.
+  ///
+  bool doInitialization(Module &M);
+
+  /// doFinalization - Overrides ModulePass doFinalization for global
+  /// finalization tasks
+  /// 
+  using ModulePass::doFinalization;
+  
+  /// doFinalization - Run all of the finalizers for the function passes.
+  ///
+  bool doFinalization(Module &M);
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  // Print passes managed by this manager
+  void dumpPassStructure(unsigned Offset);
+
+  virtual const char *getPassName() const {
+    return "Function Pass Manager";
+  }
+
+  FunctionPass *getContainedPass(unsigned N) {
+    assert ( N < PassVector.size() && "Pass number out of range!");
+    FunctionPass *FP = static_cast<FunctionPass *>(PassVector[N]);
+    return FP;
+  }
+
+  virtual PassManagerType getPassManagerType() const {
+    return PMT_FunctionPassManager;
+  }
+};
+
+Timer *getPassTimer(Pass *);
+
+}
+
+#endif
diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h
index acd84d7..9659c2e 100644
--- a/include/llvm/IR/Metadata.h
+++ b/include/llvm/IR/Metadata.h
@@ -28,6 +28,10 @@ template<typename ValueSubClass, typename ItemParentClass>
   class SymbolTableListTraits;
 
 
+enum LLVMConstants LLVM_ENUM_INT_TYPE(uint32_t) {
+  DEBUG_METADATA_VERSION = 1  // Current debug info version number.
+};
+
 //===----------------------------------------------------------------------===//
 /// MDString - a single uniqued string.
 /// These are used to efficiently contain a byte sequence for metadata.
@@ -161,6 +165,9 @@ public:
     return V->getValueID() == MDNodeVal;
   }
 
+  /// Check whether MDNode is a vtable access.
+  bool isTBAAVtableAccess() const;
+
   /// Methods for metadata merging.
   static MDNode *getMostGenericTBAA(MDNode *A, MDNode *B);
   static MDNode *getMostGenericFPMath(MDNode *A, MDNode *B);
diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h
index 3dbc5ff..b30a9a3 100644
--- a/include/llvm/IR/Module.h
+++ b/include/llvm/IR/Module.h
@@ -391,7 +391,7 @@ public:
 /// @name Named Metadata Accessors
 /// @{
 
-  /// getNamedMetadata - Return the NamedMDNode in the module with the
+  /// getNamedMetadata - Return the first NamedMDNode in the module with the
   /// specified name. This method returns null if a NamedMDNode with the
   /// specified name is not found.
   NamedMDNode *getNamedMetadata(const Twine &Name) const;
diff --git a/include/llvm/IR/Operator.h b/include/llvm/IR/Operator.h
index 13ab72c..5b9bee7 100644
--- a/include/llvm/IR/Operator.h
+++ b/include/llvm/IR/Operator.h
@@ -439,8 +439,8 @@ public:
   /// offset of this GEP if the GEP is in fact constant. If the GEP is not
   /// all-constant, it returns false and the value of the offset APInt is
   /// undefined (it is *not* preserved!). The APInt passed into this routine
-  /// must be at least as wide as the IntPtr type for the address space of
-  /// the base GEP pointer.
+  /// must be at exactly as wide as the IntPtr type for the address space of the
+  /// base GEP pointer.
   bool accumulateConstantOffset(const DataLayout &DL, APInt &Offset) const {
     assert(Offset.getBitWidth() ==
            DL.getPointerSizeInBits(getPointerAddressSpace()) &&
diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h
new file mode 100644
index 0000000..833547a
--- /dev/null
+++ b/include/llvm/IR/PassManager.h
@@ -0,0 +1,383 @@
+//===- PassManager.h - Pass management infrastructure -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This header defines various interfaces for pass management in LLVM. There
+/// is no "pass" interface in LLVM per se. Instead, an instance of any class
+/// which supports a method to 'run' it over a unit of IR can be used as
+/// a pass. A pass manager is generally a tool to collect a sequence of passes
+/// which run over a particular IR construct, and run each of them in sequence
+/// over each such construct in the containing IR construct. As there is no
+/// containing IR construct for a Module, a manager for passes over modules
+/// forms the base case which runs its managed passes in sequence over the
+/// single module provided.
+///
+/// The core IR library provides managers for running passes over
+/// modules and functions.
+///
+/// * FunctionPassManager can run over a Module, runs each pass over
+///   a Function.
+/// * ModulePassManager must be directly run, runs each pass over the Module.
+///
+/// Note that the implementations of the pass managers use concept-based
+/// polymorphism as outlined in the "Value Semantics and Concept-based
+/// Polymorphism" talk (or its abbreviated sibling "Inheritance Is The Base
+/// Class of Evil") by Sean Parent:
+/// * http://github.com/sean-parent/sean-parent.github.com/wiki/Papers-and-Presentations
+/// * http://www.youtube.com/watch?v=_BpMYeUFXv8
+/// * http://channel9.msdn.com/Events/GoingNative/2013/Inheritance-Is-The-Base-Class-of-Evil
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/polymorphic_ptr.h"
+#include "llvm/Support/type_traits.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include <list>
+#include <vector>
+
+namespace llvm {
+
+class Module;
+class Function;
+
+/// \brief Implementation details of the pass manager interfaces.
+namespace detail {
+
+/// \brief Template for the abstract base class used to dispatch
+/// polymorphically over pass objects.
+template <typename T> struct PassConcept {
+  // Boiler plate necessary for the container of derived classes.
+  virtual ~PassConcept() {}
+  virtual PassConcept *clone() = 0;
+
+  /// \brief The polymorphic API which runs the pass over a given IR entity.
+  virtual bool run(T Arg) = 0;
+};
+
+/// \brief A template wrapper used to implement the polymorphic API.
+///
+/// Can be instantiated for any object which provides a \c run method
+/// accepting a \c T. It requires the pass to be a copyable
+/// object.
+template <typename T, typename PassT> struct PassModel : PassConcept<T> {
+  PassModel(PassT Pass) : Pass(llvm_move(Pass)) {}
+  virtual PassModel *clone() { return new PassModel(Pass); }
+  virtual bool run(T Arg) { return Pass.run(Arg); }
+  PassT Pass;
+};
+
+}
+
+class AnalysisManager;
+
+class ModulePassManager {
+public:
+  ModulePassManager(Module *M, AnalysisManager *AM = 0) : M(M), AM(AM) {}
+
+  template <typename ModulePassT> void addPass(ModulePassT Pass) {
+    Passes.push_back(new ModulePassModel<ModulePassT>(llvm_move(Pass)));
+  }
+
+  void run();
+
+private:
+  // Pull in the concept type and model template specialized for modules.
+  typedef detail::PassConcept<Module *> ModulePassConcept;
+  template <typename PassT>
+  struct ModulePassModel : detail::PassModel<Module *, PassT> {
+    ModulePassModel(PassT Pass) : detail::PassModel<Module *, PassT>(Pass) {}
+  };
+
+  Module *M;
+  AnalysisManager *AM;
+  std::vector<polymorphic_ptr<ModulePassConcept> > Passes;
+};
+
+class FunctionPassManager {
+public:
+  FunctionPassManager(AnalysisManager *AM = 0) : AM(AM) {}
+
+  template <typename FunctionPassT> void addPass(FunctionPassT Pass) {
+    Passes.push_back(new FunctionPassModel<FunctionPassT>(llvm_move(Pass)));
+  }
+
+  bool run(Module *M);
+
+private:
+  // Pull in the concept type and model template specialized for functions.
+  typedef detail::PassConcept<Function *> FunctionPassConcept;
+  template <typename PassT>
+  struct FunctionPassModel : detail::PassModel<Function *, PassT> {
+    FunctionPassModel(PassT Pass)
+        : detail::PassModel<Function *, PassT>(Pass) {}
+  };
+
+  AnalysisManager *AM;
+  std::vector<polymorphic_ptr<FunctionPassConcept> > Passes;
+};
+
+
+/// \brief An analysis manager to coordinate and cache analyses run over
+/// a module.
+///
+/// The analysis manager is typically used by passes in a pass pipeline
+/// (consisting potentially of several individual pass managers) over a module
+/// of IR. It provides registration of available analyses, declaring
+/// requirements on support for specific analyses, running of an specific
+/// analysis over a specific unit of IR to compute an analysis result, and
+/// caching of the analysis results to reuse them across multiple passes.
+///
+/// It is the responsibility of callers to use the invalidation API to
+/// invalidate analysis results when the IR they correspond to changes. The
+/// \c ModulePassManager and \c FunctionPassManager do this automatically.
+class AnalysisManager {
+public:
+  AnalysisManager(Module *M) : M(M) {}
+
+  /// \brief Get the result of an analysis pass for this module.
+  ///
+  /// If there is not a valid cached result in the manager already, this will
+  /// re-run the analysis to produce a valid result.
+  ///
+  /// The module passed in must be the same module as the analysis manager was
+  /// constructed around.
+  template <typename PassT>
+  const typename PassT::Result &getResult(Module *M) {
+    assert(ModuleAnalysisPasses.count(PassT::ID()) &&
+           "This analysis pass was not registered prior to being queried");
+
+    const AnalysisResultConcept<Module> &ResultConcept =
+        getResultImpl(PassT::ID(), M);
+    typedef AnalysisResultModel<Module, typename PassT::Result> ResultModelT;
+    return static_cast<const ResultModelT &>(ResultConcept).Result;
+  }
+
+  /// \brief Get the result of an analysis pass for a function.
+  ///
+  /// If there is not a valid cached result in the manager already, this will
+  /// re-run the analysis to produce a valid result.
+  template <typename PassT>
+  const typename PassT::Result &getResult(Function *F) {
+    assert(FunctionAnalysisPasses.count(PassT::ID()) &&
+           "This analysis pass was not registered prior to being queried");
+
+    const AnalysisResultConcept<Function> &ResultConcept =
+        getResultImpl(PassT::ID(), F);
+    typedef AnalysisResultModel<Function, typename PassT::Result> ResultModelT;
+    return static_cast<const ResultModelT &>(ResultConcept).Result;
+  }
+
+  /// \brief Register an analysis pass with the manager.
+  ///
+  /// This provides an initialized and set-up analysis pass to the
+  /// analysis
+  /// manager. Whomever is setting up analysis passes must use this to
+  /// populate
+  /// the manager with all of the analysis passes available.
+  template <typename PassT> void registerAnalysisPass(PassT Pass) {
+    registerAnalysisPassImpl<PassT>(llvm_move(Pass));
+  }
+
+  /// \brief Invalidate a specific analysis pass for an IR module.
+  ///
+  /// Note that the analysis result can disregard invalidation.
+  template <typename PassT> void invalidate(Module *M) {
+    invalidateImpl(PassT::ID(), M);
+  }
+
+  /// \brief Invalidate a specific analysis pass for an IR function.
+  ///
+  /// Note that the analysis result can disregard invalidation.
+  template <typename PassT> void invalidate(Function *F) {
+    invalidateImpl(PassT::ID(), F);
+  }
+
+  /// \brief Invalidate analyses cached for an IR Module.
+  ///
+  /// Note that specific analysis results can disregard invalidation by
+  /// overriding their invalidate method.
+  ///
+  /// The module must be the module this analysis manager was constructed
+  /// around.
+  void invalidateAll(Module *M);
+
+  /// \brief Invalidate analyses cached for an IR Function.
+  ///
+  /// Note that specific analysis results can disregard invalidation by
+  /// overriding the invalidate method.
+  void invalidateAll(Function *F);
+
+private:
+  /// \brief Abstract concept of an analysis result.
+  ///
+  /// This concept is parameterized over the IR unit that this result pertains
+  /// to.
+  template <typename IRUnitT> struct AnalysisResultConcept {
+    virtual ~AnalysisResultConcept() {}
+    virtual AnalysisResultConcept *clone() = 0;
+
+    /// \brief Method to try and mark a result as invalid.
+    ///
+    /// When the outer \c AnalysisManager detects a change in some underlying
+    /// unit of the IR, it will call this method on all of the results cached.
+    ///
+    /// \returns true if the result should indeed be invalidated (the default).
+    virtual bool invalidate(IRUnitT *IR) = 0;
+  };
+
+  /// \brief Wrapper to model the analysis result concept.
+  ///
+  /// Can wrap any type which implements a suitable invalidate member and model
+  /// the AnalysisResultConcept for the AnalysisManager.
+  template <typename IRUnitT, typename ResultT>
+  struct AnalysisResultModel : AnalysisResultConcept<IRUnitT> {
+    AnalysisResultModel(ResultT Result) : Result(llvm_move(Result)) {}
+    virtual AnalysisResultModel *clone() {
+      return new AnalysisResultModel(Result);
+    }
+
+    /// \brief The model delegates to the \c ResultT method.
+    virtual bool invalidate(IRUnitT *IR) { return Result.invalidate(IR); }
+
+    ResultT Result;
+  };
+
+  /// \brief Abstract concept of an analysis pass.
+  ///
+  /// This concept is parameterized over the IR unit that it can run over and
+  /// produce an analysis result.
+  template <typename IRUnitT> struct AnalysisPassConcept {
+    virtual ~AnalysisPassConcept() {}
+    virtual AnalysisPassConcept *clone() = 0;
+
+    /// \brief Method to run this analysis over a unit of IR.
+    /// \returns The analysis result object to be queried by users, the caller
+    /// takes ownership.
+    virtual AnalysisResultConcept<IRUnitT> *run(IRUnitT *IR) = 0;
+  };
+
+  /// \brief Wrapper to model the analysis pass concept.
+  ///
+  /// Can wrap any type which implements a suitable \c run method. The method
+  /// must accept the IRUnitT as an argument and produce an object which can be
+  /// wrapped in a \c AnalysisResultModel.
+  template <typename PassT>
+  struct AnalysisPassModel : AnalysisPassConcept<typename PassT::IRUnitT> {
+    AnalysisPassModel(PassT Pass) : Pass(llvm_move(Pass)) {}
+    virtual AnalysisPassModel *clone() { return new AnalysisPassModel(Pass); }
+
+    // FIXME: Replace PassT::IRUnitT with type traits when we use C++11.
+    typedef typename PassT::IRUnitT IRUnitT;
+
+    // FIXME: Replace PassT::Result with type traits when we use C++11.
+    typedef AnalysisResultModel<IRUnitT, typename PassT::Result> ResultModelT;
+
+    /// \brief The model delegates to the \c PassT::run method.
+    ///
+    /// The return is wrapped in an \c AnalysisResultModel.
+    virtual ResultModelT *run(IRUnitT *IR) {
+      return new ResultModelT(Pass.run(IR));
+    }
+
+    PassT Pass;
+  };
+
+
+  /// \brief Get a module pass result, running the pass if necessary.
+  const AnalysisResultConcept<Module> &getResultImpl(void *PassID, Module *M);
+
+  /// \brief Get a function pass result, running the pass if necessary.
+  const AnalysisResultConcept<Function> &getResultImpl(void *PassID,
+                                                       Function *F);
+
+  /// \brief Invalidate a module pass result.
+  void invalidateImpl(void *PassID, Module *M);
+
+  /// \brief Invalidate a function pass result.
+  void invalidateImpl(void *PassID, Function *F);
+
+
+  /// \brief Module pass specific implementation of registration.
+  template <typename PassT>
+  typename enable_if<is_same<typename PassT::IRUnitT, Module> >::type
+  registerAnalysisPassImpl(PassT Pass) {
+    assert(!ModuleAnalysisPasses.count(PassT::ID()) &&
+           "Registered the same analysis pass twice!");
+    ModuleAnalysisPasses[PassT::ID()] =
+        new AnalysisPassModel<PassT>(llvm_move(Pass));
+  }
+
+  /// \brief Function pass specific implementation of registration.
+  template <typename PassT>
+  typename enable_if<is_same<typename PassT::IRUnitT, Function> >::type
+  registerAnalysisPassImpl(PassT Pass) {
+    assert(!FunctionAnalysisPasses.count(PassT::ID()) &&
+           "Registered the same analysis pass twice!");
+    FunctionAnalysisPasses[PassT::ID()] =
+        new AnalysisPassModel<PassT>(llvm_move(Pass));
+  }
+
+
+  /// \brief Map type from module analysis pass ID to pass concept pointer.
+  typedef DenseMap<void *, polymorphic_ptr<AnalysisPassConcept<Module> > >
+  ModuleAnalysisPassMapT;
+
+  /// \brief Collection of module analysis passes, indexed by ID.
+  ModuleAnalysisPassMapT ModuleAnalysisPasses;
+
+  /// \brief Map type from module analysis pass ID to pass result concept pointer.
+  typedef DenseMap<void *, polymorphic_ptr<AnalysisResultConcept<Module> > >
+  ModuleAnalysisResultMapT;
+
+  /// \brief Cache of computed module analysis results for this module.
+  ModuleAnalysisResultMapT ModuleAnalysisResults;
+
+
+  /// \brief Map type from function analysis pass ID to pass concept pointer.
+  typedef DenseMap<void *, polymorphic_ptr<AnalysisPassConcept<Function> > >
+  FunctionAnalysisPassMapT;
+
+  /// \brief Collection of function analysis passes, indexed by ID.
+  FunctionAnalysisPassMapT FunctionAnalysisPasses;
+
+  /// \brief List of function analysis pass IDs and associated concept pointers.
+  ///
+  /// Requires iterators to be valid across appending new entries and arbitrary
+  /// erases. Provides both the pass ID and concept pointer such that it is
+  /// half of a bijection and provides storage for the actual result concept.
+  typedef std::list<
+      std::pair<void *, polymorphic_ptr<AnalysisResultConcept<Function> > > >
+  FunctionAnalysisResultListT;
+
+  /// \brief Map type from function pointer to our custom list type.
+  typedef DenseMap<Function *, FunctionAnalysisResultListT> FunctionAnalysisResultListMapT;
+
+  /// \brief Map from function to a list of function analysis results.
+  ///
+  /// Provides linear time removal of all analysis results for a function and
+  /// the ultimate storage for a particular cached analysis result.
+  FunctionAnalysisResultListMapT FunctionAnalysisResultLists;
+
+  /// \brief Map type from a pair of analysis ID and function pointer to an
+  /// iterator into a particular result list.
+  typedef DenseMap<std::pair<void *, Function *>,
+                   FunctionAnalysisResultListT::iterator>
+  FunctionAnalysisResultMapT;
+
+  /// \brief Map from an analysis ID and function to a particular cached
+  /// analysis result.
+  FunctionAnalysisResultMapT FunctionAnalysisResults;
+
+  /// \brief Module handle for the \c AnalysisManager.
+  Module *M;
+};
+
+}
diff --git a/include/llvm/IR/Type.h b/include/llvm/IR/Type.h
index 1bf8789..3cfb84e 100644
--- a/include/llvm/IR/Type.h
+++ b/include/llvm/IR/Type.h
@@ -324,6 +324,14 @@ public:
   subtype_iterator subtype_begin() const { return ContainedTys; }
   subtype_iterator subtype_end() const { return &ContainedTys[NumContainedTys];}
 
+  typedef std::reverse_iterator<subtype_iterator> subtype_reverse_iterator;
+  subtype_reverse_iterator subtype_rbegin() const {
+    return subtype_reverse_iterator(subtype_end());
+  }
+  subtype_reverse_iterator subtype_rend() const {
+    return subtype_reverse_iterator(subtype_begin());
+  }
+
   /// getContainedType - This method is used to implement the type iterator
   /// (defined a the end of the file).  For derived types, this returns the
   /// types 'contained' in the derived type.
diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h
index 5fba3d5..e1361fe 100644
--- a/include/llvm/IR/Value.h
+++ b/include/llvm/IR/Value.h
@@ -22,26 +22,29 @@
 
 namespace llvm {
 
-class Constant;
+class APInt;
 class Argument;
-class Instruction;
+class AssemblyAnnotationWriter;
 class BasicBlock;
-class GlobalValue;
+class Constant;
+class DataLayout;
 class Function;
-class GlobalVariable;
 class GlobalAlias;
+class GlobalValue;
+class GlobalVariable;
 class InlineAsm;
-class ValueSymbolTable;
-template<typename ValueTy> class StringMapEntry;
-typedef StringMapEntry<Value*> ValueName;
-class raw_ostream;
-class AssemblyAnnotationWriter;
-class ValueHandleBase;
+class Instruction;
 class LLVMContext;
-class Twine;
 class MDNode;
-class Type;
 class StringRef;
+class Twine;
+class Type;
+class ValueHandleBase;
+class ValueSymbolTable;
+class raw_ostream;
+
+template<typename ValueTy> class StringMapEntry;
+typedef StringMapEntry<Value*> ValueName;
 
 //===----------------------------------------------------------------------===//
 //                                 Value Class
@@ -260,37 +263,53 @@ public:
   /// this value.
   bool hasValueHandle() const { return HasValueHandle; }
 
-  /// \brief This method strips off any unneeded pointer casts,
-  /// all-zero GEPs and aliases from the specified value, returning the original
-  /// uncasted value. If this is called on a non-pointer value, it returns
-  /// 'this'.
+  /// \brief Strips off any unneeded pointer casts, all-zero GEPs and aliases
+  /// from the specified value, returning the original uncasted value.
+  ///
+  /// If this is called on a non-pointer value, it returns 'this'.
   Value *stripPointerCasts();
   const Value *stripPointerCasts() const {
     return const_cast<Value*>(this)->stripPointerCasts();
   }
 
-  /// \brief This method strips off any unneeded pointer casts and
-  /// all-zero GEPs from the specified value, returning the original
-  /// uncasted value. If this is called on a non-pointer value, it returns
-  /// 'this'.
+  /// \brief Strips off any unneeded pointer casts and all-zero GEPs from the
+  /// specified value, returning the original uncasted value.
+  ///
+  /// If this is called on a non-pointer value, it returns 'this'.
   Value *stripPointerCastsNoFollowAliases();
   const Value *stripPointerCastsNoFollowAliases() const {
     return const_cast<Value*>(this)->stripPointerCastsNoFollowAliases();
   }
 
-  /// stripInBoundsConstantOffsets - This method strips off unneeded pointer casts and
-  /// all-constant GEPs from the specified value, returning the original
-  /// pointer value. If this is called on a non-pointer value, it returns
-  /// 'this'.
+  /// \brief Strips off unneeded pointer casts and all-constant GEPs from the
+  /// specified value, returning the original pointer value.
+  ///
+  /// If this is called on a non-pointer value, it returns 'this'.
   Value *stripInBoundsConstantOffsets();
   const Value *stripInBoundsConstantOffsets() const {
     return const_cast<Value*>(this)->stripInBoundsConstantOffsets();
   }
 
-  /// stripInBoundsOffsets - This method strips off unneeded pointer casts and
-  /// any in-bounds Offsets from the specified value, returning the original
-  /// pointer value. If this is called on a non-pointer value, it returns
-  /// 'this'.
+  /// \brief Strips like \c stripInBoundsConstantOffsets but also accumulates
+  /// the constant offset stripped.
+  ///
+  /// Stores the resulting constant offset stripped into the APInt provided.
+  /// The provided APInt will be extended or truncated as needed to be the
+  /// correct bitwidth for an offset of this pointer type.
+  ///
+  /// If this is called on a non-pointer value, it returns 'this'.
+  Value *stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
+                                                   APInt &Offset);
+  const Value *stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
+                                                         APInt &Offset) const {
+    return const_cast<Value *>(this)
+        ->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+  }
+
+  /// \brief Strips off unneeded pointer casts and any in-bounds offsets from
+  /// the specified value, returning the original pointer value.
+  ///
+  /// If this is called on a non-pointer value, it returns 'this'.
   Value *stripInBoundsOffsets();
   const Value *stripInBoundsOffsets() const {
     return const_cast<Value*>(this)->stripInBoundsOffsets();
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index d7d18d0..aefb3c0 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -70,13 +70,13 @@ void initializeAliasDebuggerPass(PassRegistry&);
 void initializeAliasSetPrinterPass(PassRegistry&);
 void initializeAlwaysInlinerPass(PassRegistry&);
 void initializeArgPromotionPass(PassRegistry&);
+void initializeSampleProfileLoaderPass(PassRegistry&);
 void initializeBarrierNoopPass(PassRegistry&);
 void initializeBasicAliasAnalysisPass(PassRegistry&);
-void initializeBasicCallGraphPass(PassRegistry&);
+void initializeCallGraphPass(PassRegistry&);
 void initializeBasicTTIPass(PassRegistry&);
 void initializeBlockExtractorPassPass(PassRegistry&);
 void initializeBlockFrequencyInfoPass(PassRegistry&);
-void initializeBlockPlacementPass(PassRegistry&);
 void initializeBoundsCheckingPass(PassRegistry&);
 void initializeBranchFolderPassPass(PassRegistry&);
 void initializeBranchProbabilityInfoPass(PassRegistry&);
@@ -90,8 +90,6 @@ void initializeCFGSimplifyPassPass(PassRegistry&);
 void initializeFlattenCFGPassPass(PassRegistry&);
 void initializeStructurizeCFGPass(PassRegistry&);
 void initializeCFGViewerPass(PassRegistry&);
-void initializeCalculateSpillWeightsPass(PassRegistry&);
-void initializeCallGraphAnalysisGroup(PassRegistry&);
 void initializeCodeGenPreparePass(PassRegistry&);
 void initializeConstantMergePass(PassRegistry&);
 void initializeConstantPropagationPass(PassRegistry&);
@@ -105,6 +103,7 @@ void initializeDSEPass(PassRegistry&);
 void initializeDebugIRPass(PassRegistry&);
 void initializeDeadInstEliminationPass(PassRegistry&);
 void initializeDeadMachineInstructionElimPass(PassRegistry&);
+void initializeDelinearizationPass(PassRegistry &);
 void initializeDependenceAnalysisPass(PassRegistry&);
 void initializeDomOnlyPrinterPass(PassRegistry&);
 void initializeDomOnlyViewerPass(PassRegistry&);
@@ -114,14 +113,13 @@ void initializeDominanceFrontierPass(PassRegistry&);
 void initializeDominatorTreePass(PassRegistry&);
 void initializeEarlyIfConverterPass(PassRegistry&);
 void initializeEdgeBundlesPass(PassRegistry&);
-void initializeEdgeProfilerPass(PassRegistry&);
 void initializeExpandPostRAPass(PassRegistry&);
-void initializePathProfilerPass(PassRegistry&);
 void initializeGCOVProfilerPass(PassRegistry&);
 void initializeAddressSanitizerPass(PassRegistry&);
 void initializeAddressSanitizerModulePass(PassRegistry&);
 void initializeMemorySanitizerPass(PassRegistry&);
 void initializeThreadSanitizerPass(PassRegistry&);
+void initializeDataFlowSanitizerPass(PassRegistry&);
 void initializeEarlyCSEPass(PassRegistry&);
 void initializeExpandISelPseudosPass(PassRegistry&);
 void initializeFindUsedTypesPass(PassRegistry&);
@@ -155,8 +153,6 @@ void initializeLiveRegMatrixPass(PassRegistry&);
 void initializeLiveStacksPass(PassRegistry&);
 void initializeLiveVariablesPass(PassRegistry&);
 void initializeLoaderPassPass(PassRegistry&);
-void initializeProfileMetadataLoaderPassPass(PassRegistry&);
-void initializePathProfileLoaderPassPass(PassRegistry&);
 void initializeLocalStackSlotPassPass(PassRegistry&);
 void initializeLoopDeletionPass(PassRegistry&);
 void initializeLoopExtractorPass(PassRegistry&);
@@ -166,6 +162,7 @@ void initializeLoopRotatePass(PassRegistry&);
 void initializeLoopSimplifyPass(PassRegistry&);
 void initializeLoopStrengthReducePass(PassRegistry&);
 void initializeGlobalMergePass(PassRegistry&);
+void initializeLoopRerollPass(PassRegistry&);
 void initializeLoopUnrollPass(PassRegistry&);
 void initializeLoopUnswitchPass(PassRegistry&);
 void initializeLoopIdiomRecognizePass(PassRegistry&);
@@ -195,15 +192,13 @@ void initializeMetaRenamerPass(PassRegistry&);
 void initializeMergeFunctionsPass(PassRegistry&);
 void initializeModuleDebugInfoPrinterPass(PassRegistry&);
 void initializeNoAAPass(PassRegistry&);
-void initializeNoProfileInfoPass(PassRegistry&);
-void initializeNoPathProfileInfoPass(PassRegistry&);
 void initializeObjCARCAliasAnalysisPass(PassRegistry&);
 void initializeObjCARCAPElimPass(PassRegistry&);
 void initializeObjCARCExpandPass(PassRegistry&);
 void initializeObjCARCContractPass(PassRegistry&);
 void initializeObjCARCOptPass(PassRegistry&);
-void initializeOptimalEdgeProfilerPass(PassRegistry&);
 void initializeOptimizePHIsPass(PassRegistry&);
+void initializePartiallyInlineLibCallsPass(PassRegistry&);
 void initializePEIPass(PassRegistry&);
 void initializePHIEliminationPass(PassRegistry&);
 void initializePartialInlinerPass(PassRegistry&);
@@ -219,11 +214,6 @@ void initializePrintFunctionPassPass(PassRegistry&);
 void initializePrintModulePassPass(PassRegistry&);
 void initializePrintBasicBlockPassPass(PassRegistry&);
 void initializeProcessImplicitDefsPass(PassRegistry&);
-void initializeProfileEstimatorPassPass(PassRegistry&);
-void initializeProfileInfoAnalysisGroup(PassRegistry&);
-void initializePathProfileInfoAnalysisGroup(PassRegistry&);
-void initializePathProfileVerifierPass(PassRegistry&);
-void initializeProfileVerifierPassPass(PassRegistry&);
 void initializePromotePassPass(PassRegistry&);
 void initializePruneEHPass(PassRegistry&);
 void initializeReassociatePass(PassRegistry&);
@@ -253,7 +243,6 @@ void initializeStripDeadPrototypesPassPass(PassRegistry&);
 void initializeStripDebugDeclarePass(PassRegistry&);
 void initializeStripNonDebugSymbolsPass(PassRegistry&);
 void initializeStripSymbolsPass(PassRegistry&);
-void initializeStrongPHIEliminationPass(PassRegistry&);
 void initializeTailCallElimPass(PassRegistry&);
 void initializeTailDuplicatePassPass(PassRegistry&);
 void initializeTargetPassConfigPass(PassRegistry&);
diff --git a/include/llvm/InstVisitor.h b/include/llvm/InstVisitor.h
index 2911703..de7206d 100644
--- a/include/llvm/InstVisitor.h
+++ b/include/llvm/InstVisitor.h
@@ -191,6 +191,7 @@ public:
   RetTy visitPtrToIntInst(PtrToIntInst &I)        { DELEGATE(CastInst);}
   RetTy visitIntToPtrInst(IntToPtrInst &I)        { DELEGATE(CastInst);}
   RetTy visitBitCastInst(BitCastInst &I)          { DELEGATE(CastInst);}
+  RetTy visitAddrSpaceCastInst(AddrSpaceCastInst &I) { DELEGATE(CastInst);}
   RetTy visitSelectInst(SelectInst &I)            { DELEGATE(Instruction);}
   RetTy visitVAArgInst(VAArgInst   &I)            { DELEGATE(UnaryInstruction);}
   RetTy visitExtractElementInst(ExtractElementInst &I) { DELEGATE(Instruction);}
diff --git a/include/llvm/LTO/LTOCodeGenerator.h b/include/llvm/LTO/LTOCodeGenerator.h
new file mode 100644
index 0000000..c478bd9
--- /dev/null
+++ b/include/llvm/LTO/LTOCodeGenerator.h
@@ -0,0 +1,153 @@
+//===-LTOCodeGenerator.h - LLVM Link Time Optimizer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the LTOCodeGenerator class.
+//
+//   LTO compilation consists of three phases: Pre-IPO, IPO and Post-IPO. 
+//
+//   The Pre-IPO phase compiles source code into bitcode file. The resulting
+// bitcode files, along with object files and libraries, will be fed to the
+// linker to through the IPO and Post-IPO phases. By using obj-file extension,
+// the resulting bitcode file disguises itself as an object file, and therefore
+// obviates the need of writing a special set of the make-rules only for LTO
+// compilation.
+//
+//   The IPO phase perform inter-procedural analyses and optimizations, and
+// the Post-IPO consists two sub-phases: intra-procedural scalar optimizations
+// (SOPT), and intra-procedural target-dependent code generator (CG).
+// 
+//   As of this writing, we don't separate IPO and the Post-IPO SOPT. They
+// are intermingled together, and are driven by a single pass manager (see
+// PassManagerBuilder::populateLTOPassManager()).
+// 
+//   The "LTOCodeGenerator" is the driver for the IPO and Post-IPO stages. 
+// The "CodeGenerator" here is bit confusing. Don't confuse the "CodeGenerator"
+// with the machine specific code generator.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LTO_CODE_GENERATOR_H
+#define LTO_CODE_GENERATOR_H
+
+#include "llvm-c/lto.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Linker.h"
+#include "llvm/Target/TargetOptions.h"
+#include <string>
+#include <vector>
+
+namespace llvm {
+  class LLVMContext;
+  class GlobalValue;
+  class Mangler;
+  class MemoryBuffer;
+  class TargetLibraryInfo;
+  class TargetMachine;
+  class raw_ostream;
+}
+
+//===----------------------------------------------------------------------===//
+/// LTOCodeGenerator - C++ class which implements the opaque lto_code_gen_t
+/// type.
+///
+struct LTOCodeGenerator {
+  static const char *getVersionString();
+
+  LTOCodeGenerator();
+  ~LTOCodeGenerator();
+
+  // Merge given module, return true on success.
+  bool addModule(struct LTOModule*, std::string &errMsg);
+
+  void setTargetOptions(llvm::TargetOptions options);
+  void setDebugInfo(lto_debug_model);
+  void setCodePICModel(lto_codegen_model);
+
+  void setCpu(const char *mCpu) { MCpu = mCpu; }
+
+  void addMustPreserveSymbol(const char *sym) { MustPreserveSymbols[sym] = 1; }
+
+  // To pass options to the driver and optimization passes. These options are
+  // not necessarily for debugging purpose (The function name is misleading).
+  // This function should be called before LTOCodeGenerator::compilexxx(),
+  // and LTOCodeGenerator::writeMergedModules().
+  //
+  void setCodeGenDebugOptions(const char *opts);
+  
+  // Parse the options set in setCodeGenDebugOptions. Like
+  // setCodeGenDebugOptions, this must be called before
+  // LTOCodeGenerator::compilexxx() and LTOCodeGenerator::writeMergedModules()
+  void parseCodeGenDebugOptions();
+
+  // Write the merged module to the file specified by the given path.
+  // Return true on success.
+  bool writeMergedModules(const char *path, std::string &errMsg);
+
+  // Compile the merged module into a *single* object file; the path to object
+  // file is returned to the caller via argument "name". Return true on
+  // success.
+  //
+  // NOTE that it is up to the linker to remove the intermediate object file.
+  //  Do not try to remove the object file in LTOCodeGenerator's destructor
+  //  as we don't who (LTOCodeGenerator or the obj file) will last longer.
+  // 
+  bool compile_to_file(const char **name,
+                       bool disableOpt,
+                       bool disableInline,
+                       bool disableGVNLoadPRE,
+                       std::string &errMsg);
+
+  // As with compile_to_file(), this function compiles the merged module into
+  // single object file. Instead of returning the object-file-path to the caller
+  // (linker), it brings the object to a buffer, and return the buffer to the
+  // caller. This function should delete intermediate object file once its content
+  // is brought to memory. Return NULL if the compilation was not successful. 
+  //
+  const void *compile(size_t *length,
+                      bool disableOpt,
+                      bool disableInline,
+                      bool disableGVNLoadPRE,
+                      std::string &errMsg);
+
+private:
+  void initializeLTOPasses();
+
+  bool generateObjectFile(llvm::raw_ostream &out,
+                          bool disableOpt,
+                          bool disableInline,
+                          bool disableGVNLoadPRE,
+                          std::string &errMsg);
+  void applyScopeRestrictions();
+  void applyRestriction(llvm::GlobalValue &GV,
+                        const llvm::ArrayRef<llvm::StringRef> &Libcalls,
+                        std::vector<const char*> &MustPreserveList,
+                        llvm::SmallPtrSet<llvm::GlobalValue*, 8> &AsmUsed,
+                        llvm::Mangler &Mangler);
+  bool determineTarget(std::string &errMsg);
+
+  typedef llvm::StringMap<uint8_t> StringSet;
+
+  llvm::LLVMContext &Context;
+  llvm::Linker Linker;
+  llvm::TargetMachine *TargetMach;
+  bool EmitDwarfDebugInfo;
+  bool ScopeRestrictionsDone;
+  lto_codegen_model CodeModel;
+  StringSet MustPreserveSymbols;
+  StringSet AsmUndefinedRefs;
+  llvm::MemoryBuffer *NativeObjectFile;
+  std::vector<char *> CodegenOptions;
+  std::string MCpu;
+  std::string NativeObjectPath;
+  llvm::TargetOptions Options;
+};
+
+#endif // LTO_CODE_GENERATOR_H
diff --git a/include/llvm/LTO/LTOModule.h b/include/llvm/LTO/LTOModule.h
new file mode 100644
index 0000000..f4693c8
--- /dev/null
+++ b/include/llvm/LTO/LTOModule.h
@@ -0,0 +1,196 @@
+//===-LTOModule.h - LLVM Link Time Optimizer ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the LTOModule class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LTO_MODULE_H
+#define LTO_MODULE_H
+
+#include "llvm-c/lto.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetMachine.h"
+#include <string>
+#include <vector>
+
+// Forward references to llvm classes.
+namespace llvm {
+  class Function;
+  class GlobalValue;
+  class MemoryBuffer;
+  class TargetOptions;
+  class Value;
+}
+
+//===----------------------------------------------------------------------===//
+/// LTOModule - C++ class which implements the opaque lto_module_t type.
+///
+struct LTOModule {
+private:
+  typedef llvm::StringMap<uint8_t> StringSet;
+
+  struct NameAndAttributes {
+    const char        *name;
+    uint32_t           attributes;
+    bool               isFunction;
+    const llvm::GlobalValue *symbol;
+  };
+
+  llvm::OwningPtr<llvm::Module>           _module;
+  llvm::OwningPtr<llvm::TargetMachine>    _target;
+  std::vector<NameAndAttributes>          _symbols;
+
+  // _defines and _undefines only needed to disambiguate tentative definitions
+  StringSet                               _defines;
+  llvm::StringMap<NameAndAttributes>      _undefines;
+  std::vector<const char*>                _asm_undefines;
+  llvm::MCContext                         _context;
+
+  // Use mangler to add GlobalPrefix to names to match linker names.
+  llvm::Mangler                           _mangler;
+
+  LTOModule(llvm::Module *m, llvm::TargetMachine *t);
+public:
+  /// isBitcodeFile - Returns 'true' if the file or memory contents is LLVM
+  /// bitcode.
+  static bool isBitcodeFile(const void *mem, size_t length);
+  static bool isBitcodeFile(const char *path);
+
+  /// isBitcodeFileForTarget - Returns 'true' if the file or memory contents
+  /// is LLVM bitcode for the specified triple.
+  static bool isBitcodeFileForTarget(const void *mem,
+                                     size_t length,
+                                     const char *triplePrefix);
+  static bool isBitcodeFileForTarget(const char *path,
+                                     const char *triplePrefix);
+
+  /// makeLTOModule - Create an LTOModule. N.B. These methods take ownership
+  /// of the buffer. The caller must have initialized the Targets, the
+  /// TargetMCs, the AsmPrinters, and the AsmParsers by calling:
+  ///
+  /// InitializeAllTargets();
+  /// InitializeAllTargetMCs();
+  /// InitializeAllAsmPrinters();
+  /// InitializeAllAsmParsers();
+  static LTOModule *makeLTOModule(const char* path,
+                                  llvm::TargetOptions options,
+                                  std::string &errMsg);
+  static LTOModule *makeLTOModule(int fd, const char *path,
+                                  size_t size, llvm::TargetOptions options,
+                                  std::string &errMsg);
+  static LTOModule *makeLTOModule(int fd, const char *path,
+                                  size_t map_size,
+                                  off_t offset, llvm::TargetOptions options,
+                                  std::string& errMsg);
+  static LTOModule *makeLTOModule(const void *mem, size_t length,
+                                  llvm::TargetOptions options,
+                                  std::string &errMsg);
+
+  /// getTargetTriple - Return the Module's target triple.
+  const char *getTargetTriple() {
+    return _module->getTargetTriple().c_str();
+  }
+
+  /// setTargetTriple - Set the Module's target triple.
+  void setTargetTriple(const char *triple) {
+    _module->setTargetTriple(triple);
+  }
+
+  /// getSymbolCount - Get the number of symbols
+  uint32_t getSymbolCount() {
+    return _symbols.size();
+  }
+
+  /// getSymbolAttributes - Get the attributes for a symbol at the specified
+  /// index.
+  lto_symbol_attributes getSymbolAttributes(uint32_t index) {
+    if (index < _symbols.size())
+      return lto_symbol_attributes(_symbols[index].attributes);
+    return lto_symbol_attributes(0);
+  }
+
+  /// getSymbolName - Get the name of the symbol at the specified index.
+  const char *getSymbolName(uint32_t index) {
+    if (index < _symbols.size())
+      return _symbols[index].name;
+    return NULL;
+  }
+
+  /// getLLVVMModule - Return the Module.
+  llvm::Module *getLLVVMModule() { return _module.get(); }
+
+  /// getAsmUndefinedRefs -
+  const std::vector<const char*> &getAsmUndefinedRefs() {
+    return _asm_undefines;
+  }
+
+private:
+  /// parseSymbols - Parse the symbols from the module and model-level ASM and
+  /// add them to either the defined or undefined lists.
+  bool parseSymbols(std::string &errMsg);
+
+  /// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet
+  /// to a list to be resolved later.
+  void addPotentialUndefinedSymbol(const llvm::GlobalValue *dcl, bool isFunc);
+
+  /// addDefinedSymbol - Add a defined symbol to the list.
+  void addDefinedSymbol(const llvm::GlobalValue *def, bool isFunction);
+
+  /// addDefinedFunctionSymbol - Add a function symbol as defined to the list.
+  void addDefinedFunctionSymbol(const llvm::Function *f);
+
+  /// addDefinedDataSymbol - Add a data symbol as defined to the list.
+  void addDefinedDataSymbol(const llvm::GlobalValue *v);
+
+  /// addAsmGlobalSymbols - Add global symbols from module-level ASM to the
+  /// defined or undefined lists.
+  bool addAsmGlobalSymbols(std::string &errMsg);
+
+  /// addAsmGlobalSymbol - Add a global symbol from module-level ASM to the
+  /// defined list.
+  void addAsmGlobalSymbol(const char *, lto_symbol_attributes scope);
+
+  /// addAsmGlobalSymbolUndef - Add a global symbol from module-level ASM to
+  /// the undefined list.
+  void addAsmGlobalSymbolUndef(const char *);
+
+  /// addObjCClass - Parse i386/ppc ObjC class data structure.
+  void addObjCClass(const llvm::GlobalVariable *clgv);
+
+  /// addObjCCategory - Parse i386/ppc ObjC category data structure.
+  void addObjCCategory(const llvm::GlobalVariable *clgv);
+
+  /// addObjCClassRef - Parse i386/ppc ObjC class list data structure.
+  void addObjCClassRef(const llvm::GlobalVariable *clgv);
+
+  /// objcClassNameFromExpression - Get string that the data pointer points
+  /// to.
+  bool objcClassNameFromExpression(const llvm::Constant* c, std::string &name);
+
+  /// isTargetMatch - Returns 'true' if the memory buffer is for the specified
+  /// target triple.
+  static bool isTargetMatch(llvm::MemoryBuffer *memBuffer,
+                            const char *triplePrefix);
+
+  /// makeLTOModule - Create an LTOModule (private version). N.B. This
+  /// method takes ownership of the buffer.
+  static LTOModule *makeLTOModule(llvm::MemoryBuffer *buffer,
+                                  llvm::TargetOptions options,
+                                  std::string &errMsg);
+
+  /// makeBuffer - Create a MemoryBuffer from a memory range.
+  static llvm::MemoryBuffer *makeBuffer(const void *mem, size_t length);
+};
+
+#endif // LTO_MODULE_H
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index ec39829..8183fa2 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -56,7 +56,6 @@ namespace {
       (void) llvm::createLibCallAliasAnalysisPass(0);
       (void) llvm::createScalarEvolutionAliasAnalysisPass();
       (void) llvm::createTypeBasedAliasAnalysisPass();
-      (void) llvm::createBlockPlacementPass();
       (void) llvm::createBoundsCheckingPass();
       (void) llvm::createBreakCriticalEdgesPass();
       (void) llvm::createCallGraphPrinterPass();
@@ -75,9 +74,6 @@ namespace {
       (void) llvm::createDomPrinterPass();
       (void) llvm::createDomOnlyViewerPass();
       (void) llvm::createDomViewerPass();
-      (void) llvm::createEdgeProfilerPass();
-      (void) llvm::createOptimalEdgeProfilerPass();
-      (void) llvm::createPathProfilerPass();
       (void) llvm::createGCOVProfilerPass();
       (void) llvm::createFunctionInliningPass();
       (void) llvm::createAlwaysInlinerPass();
@@ -95,6 +91,7 @@ namespace {
       (void) llvm::createLoopExtractorPass();
       (void) llvm::createLoopSimplifyPass();
       (void) llvm::createLoopStrengthReducePass();
+      (void) llvm::createLoopRerollPass();
       (void) llvm::createLoopUnrollPass();
       (void) llvm::createLoopUnswitchPass();
       (void) llvm::createLoopIdiomPass();
@@ -103,18 +100,11 @@ namespace {
       (void) llvm::createLowerInvokePass();
       (void) llvm::createLowerSwitchPass();
       (void) llvm::createNoAAPass();
-      (void) llvm::createNoProfileInfoPass();
       (void) llvm::createObjCARCAliasAnalysisPass();
       (void) llvm::createObjCARCAPElimPass();
       (void) llvm::createObjCARCExpandPass();
       (void) llvm::createObjCARCContractPass();
       (void) llvm::createObjCARCOptPass();
-      (void) llvm::createProfileEstimatorPass();
-      (void) llvm::createProfileVerifierPass();
-      (void) llvm::createPathProfileVerifierPass();
-      (void) llvm::createProfileLoaderPass();
-      (void) llvm::createProfileMetadataLoaderPass();
-      (void) llvm::createPathProfileLoaderPass();
       (void) llvm::createPromoteMemoryToRegisterPass();
       (void) llvm::createDemoteRegisterToMemoryPass();
       (void) llvm::createPruneEHPass();
@@ -163,6 +153,7 @@ namespace {
       (void) llvm::createLoopVectorizePass();
       (void) llvm::createSLPVectorizerPass();
       (void) llvm::createBBVectorizePass();
+      (void) llvm::createPartiallyInlineLibCallsPass();
 
       (void)new llvm::IntervalPartition();
       (void)new llvm::FindUsedTypes();
diff --git a/include/llvm/Linker.h b/include/llvm/Linker.h
index 3667b85..4f37459 100644
--- a/include/llvm/Linker.h
+++ b/include/llvm/Linker.h
@@ -32,7 +32,9 @@ class Linker {
 
     Linker(Module *M);
     ~Linker();
+
     Module *getModule() const { return Composite; }
+    void deleteModule();
 
     /// \brief Link \p Src into the composite. The source is destroyed if
     /// \p Mode is DestroySource and preserved if it is PreserveSource.
diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h
index 9a6b703..f946f41 100644
--- a/include/llvm/MC/MCAsmBackend.h
+++ b/include/llvm/MC/MCAsmBackend.h
@@ -10,7 +10,9 @@
 #ifndef LLVM_MC_MCASMBACKEND_H
 #define LLVM_MC_MCASMBACKEND_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCFixup.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -32,6 +34,7 @@ class raw_ostream;
 class MCAsmBackend {
   MCAsmBackend(const MCAsmBackend &) LLVM_DELETED_FUNCTION;
   void operator=(const MCAsmBackend &) LLVM_DELETED_FUNCTION;
+
 protected: // Can only create subclasses.
   MCAsmBackend();
 
@@ -42,7 +45,7 @@ public:
   virtual ~MCAsmBackend();
 
   /// lifetime management
-  virtual void reset() { }
+  virtual void reset() {}
 
   /// createObjectWriter - Create a new MCObjectWriter instance for use by the
   /// assembler backend to emit the final object file.
@@ -50,7 +53,7 @@ public:
 
   /// createELFObjectTargetWriter - Create a new ELFObjectTargetWriter to enable
   /// non-standard ELFObjectWriters.
-  virtual  MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
+  virtual MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
     llvm_unreachable("createELFObjectTargetWriter is not supported by asm "
                      "backend");
   }
@@ -71,9 +74,7 @@ public:
 
   /// hasDataInCodeSupport - Check whether this target implements data-in-code
   /// markers. If not, data region directives will be ignored.
-  bool hasDataInCodeSupport() const {
-    return HasDataInCodeSupport;
-  }
+  bool hasDataInCodeSupport() const { return HasDataInCodeSupport; }
 
   /// doesSectionRequireSymbols - Check whether the given section requires that
   /// all symbols (even temporaries) have symbol table entries.
@@ -128,8 +129,7 @@ public:
 
   /// fixupNeedsRelaxation - Target specific predicate for whether a given
   /// fixup requires the associated instruction to be relaxed.
-  virtual bool fixupNeedsRelaxation(const MCFixup &Fixup,
-                                    uint64_t Value,
+  virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
                                     const MCRelaxableFragment *DF,
                                     const MCAsmLayout &Layout) const = 0;
 
@@ -160,6 +160,12 @@ public:
   /// handleAssemblerFlag - Handle any target-specific assembler flags.
   /// By default, do nothing.
   virtual void handleAssemblerFlag(MCAssemblerFlag Flag) {}
+
+  /// \brief Generate the compact unwind encoding for the CFI instructions.
+  virtual uint32_t
+  generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction>) const {
+    return 0;
+  }
 };
 
 } // End llvm namespace
diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index e83b624..7a99394 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h
@@ -93,9 +93,9 @@ namespace llvm {
     /// this value.  Factored out in .debug_frame and .debug_line.
     unsigned MinInstAlignment;                  // Defaults to 1.
 
-    /// PCSymbol - The symbol used to represent the current PC.  Used in PC
-    /// relative expressions.
-    const char *PCSymbol;                    // Defaults to "$".
+    /// DollarIsPC - The '$' token, when not referencing an identifier or
+    /// constant, refers to the current PC.
+    bool DollarIsPC;                         // Defaults to false.
 
     /// SeparatorString - This string, if specified, is used to separate
     /// instructions from each other when on the same line.
@@ -144,21 +144,9 @@ namespace llvm {
     /// AssemblerDialect - Which dialect of an assembler variant to use.
     unsigned AssemblerDialect;               // Defaults to 0
 
-    /// AllowQuotesInName - This is true if the assembler allows for complex
-    /// symbol names to be surrounded in quotes.  This defaults to false.
-    bool AllowQuotesInName;
-
-    /// AllowNameToStartWithDigit - This is true if the assembler allows symbol
-    /// names to start with a digit (e.g., "0x0021").  This defaults to false.
-    bool AllowNameToStartWithDigit;
-
-    /// AllowPeriodsInName - This is true if the assembler allows periods in
-    /// symbol names.  This defaults to true.
-    bool AllowPeriodsInName;
-
-    /// AllowUTF8 - This is true if the assembler accepts UTF-8 input.
-    // FIXME: Make this a more general encoding setting?
-    bool AllowUTF8;
+    /// \brief This is true if the assembler allows @ characters in symbol
+    /// names. Defaults to false.
+    bool AllowAtInName;
 
     /// UseDataRegionDirectives - This is true if data region markers should
     /// be printed as ".data_region/.end_data_region" directives. If false,
@@ -240,11 +228,6 @@ namespace llvm {
     ///
     const char *GlobalDirective;             // Defaults to NULL.
 
-    /// ExternDirective - This is the directive used to declare external
-    /// globals.
-    ///
-    const char *ExternDirective;             // Defaults to NULL.
-
     /// HasSetDirective - True if the assembler supports the .set directive.
     bool HasSetDirective;                    // Defaults to true.
 
@@ -271,14 +254,14 @@ namespace llvm {
     /// .file directive, this is true for ELF targets.
     bool HasSingleParameterDotFile;          // Defaults to true.
 
+    /// hasIdentDirective - True if the target has a .ident directive, this is
+    /// true for ELF targets.
+    bool HasIdentDirective;                  // Defaults to false.
+
     /// HasNoDeadStrip - True if this target supports the MachO .no_dead_strip
     /// directive.
     bool HasNoDeadStrip;                     // Defaults to false.
 
-    /// HasSymbolResolver - True if this target supports the MachO
-    /// .symbol_resolver directive.
-    bool HasSymbolResolver;                     // Defaults to false.
-
     /// WeakRefDirective - This directive, if non-null, is used to declare a
     /// global as being a weak undefined symbol.
     const char *WeakRefDirective;            // Defaults to NULL.
@@ -316,10 +299,6 @@ namespace llvm {
     /// SupportsExceptionHandling - True if target supports exception handling.
     ExceptionHandling::ExceptionsType ExceptionsType; // Defaults to None
 
-    /// DwarfUsesInlineInfoSection - True if DwarfDebugInlineSection is used to
-    /// encode inline subroutine information.
-    bool DwarfUsesInlineInfoSection;         // Defaults to false.
-
     /// DwarfUsesRelocationsAcrossSections - True if Dwarf2 output generally
     /// uses relocations for references to other .debug_* sections.
     bool DwarfUsesRelocationsAcrossSections;
@@ -429,8 +408,8 @@ namespace llvm {
     unsigned getMinInstAlignment() const {
       return MinInstAlignment;
     }
-    const char *getPCSymbol() const {
-      return PCSymbol;
+    bool getDollarIsPC() const {
+      return DollarIsPC;
     }
     const char *getSeparatorString() const {
       return SeparatorString;
@@ -476,17 +455,8 @@ namespace llvm {
     unsigned getAssemblerDialect() const {
       return AssemblerDialect;
     }
-    bool doesAllowQuotesInName() const {
-      return AllowQuotesInName;
-    }
-    bool doesAllowNameToStartWithDigit() const {
-      return AllowNameToStartWithDigit;
-    }
-    bool doesAllowPeriodsInName() const {
-      return AllowPeriodsInName;
-    }
-    bool doesAllowUTF8() const {
-      return AllowUTF8;
+    bool doesAllowAtInName() const {
+      return AllowAtInName;
     }
     bool doesSupportDataRegionDirectives() const {
       return UseDataRegionDirectives;
@@ -512,9 +482,6 @@ namespace llvm {
     const char *getGlobalDirective() const {
       return GlobalDirective;
     }
-    const char *getExternDirective() const {
-      return ExternDirective;
-    }
     bool hasSetDirective() const { return HasSetDirective; }
     bool hasAggressiveSymbolFolding() const {
       return HasAggressiveSymbolFolding;
@@ -527,8 +494,8 @@ namespace llvm {
     }
     bool hasDotTypeDotSizeDirective() const {return HasDotTypeDotSizeDirective;}
     bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; }
+    bool hasIdentDirective() const { return HasIdentDirective; }
     bool hasNoDeadStrip() const { return HasNoDeadStrip; }
-    bool hasSymbolResolver() const { return HasSymbolResolver; }
     const char *getWeakRefDirective() const { return WeakRefDirective; }
     const char *getWeakDefDirective() const { return WeakDefDirective; }
     const char *getLinkOnceDirective() const { return LinkOnceDirective; }
@@ -558,9 +525,6 @@ namespace llvm {
          ExceptionsType == ExceptionHandling::ARM ||
          ExceptionsType == ExceptionHandling::Win64);
     }
-    bool doesDwarfUseInlineInfoSection() const {
-      return DwarfUsesInlineInfoSection;
-    }
     bool doesDwarfUseRelocationsAcrossSections() const {
       return DwarfUsesRelocationsAcrossSections;
     }
diff --git a/include/llvm/MC/MCAsmInfoELF.h b/include/llvm/MC/MCAsmInfoELF.h
new file mode 100644
index 0000000..27fea84
--- /dev/null
+++ b/include/llvm/MC/MCAsmInfoELF.h
@@ -0,0 +1,23 @@
+//===-- llvm/MC/MCAsmInfoELF.h - ELF Asm info -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCASMINFOELF_H
+#define LLVM_MC_MCASMINFOELF_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class MCAsmInfoELF : public MCAsmInfo {
+  virtual void anchor();
+protected:
+  MCAsmInfoELF();
+};
+}
+
+#endif
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index 2c8c602..0cf2b1d 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/DataTypes.h"
+#include <algorithm>
 #include <vector> // FIXME: Shouldn't be needed.
 
 namespace mcld {
@@ -824,6 +825,9 @@ public:
   typedef SymbolDataListType::const_iterator const_symbol_iterator;
   typedef SymbolDataListType::iterator symbol_iterator;
 
+  typedef std::vector<std::string> FileNameVectorType;
+  typedef FileNameVectorType::const_iterator const_file_name_iterator;
+
   typedef std::vector<IndirectSymbolData>::const_iterator
     const_indirect_symbol_iterator;
   typedef std::vector<IndirectSymbolData>::iterator indirect_symbol_iterator;
@@ -867,6 +871,9 @@ private:
   /// The list of linker options to propagate into the object file.
   std::vector<std::vector<std::string> > LinkerOptions;
 
+  /// List of declared file names
+  FileNameVectorType FileNames;
+
   /// The set of function symbols for which a .thumb_func directive has
   /// been seen.
   //
@@ -1162,6 +1169,20 @@ public:
     return *Entry;
   }
 
+  const_file_name_iterator file_names_begin() const {
+    return FileNames.begin();
+  }
+
+  const_file_name_iterator file_names_end() const {
+    return FileNames.end();
+  }
+
+  void addFileName(StringRef FileName) {
+    if (std::find(file_names_begin(), file_names_end(), FileName) ==
+        file_names_end())
+      FileNames.push_back(FileName);
+  }
+
   /// @}
 
   void dump();
diff --git a/include/llvm/MC/MCAtom.h b/include/llvm/MC/MCAtom.h
index 6a93798..eab32d6 100644
--- a/include/llvm/MC/MCAtom.h
+++ b/include/llvm/MC/MCAtom.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_MC_MCATOM_H
 #define LLVM_MC_MCATOM_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/DataTypes.h"
 #include <vector>
@@ -28,9 +29,10 @@ class MCAtom;
 class MCTextAtom;
 class MCDataAtom;
 
-/// MCAtom - Represents a contiguous range of either instructions (a TextAtom)
+/// \brief Represents a contiguous range of either instructions (a TextAtom)
 /// or data (a DataAtom).  Address ranges are expressed as _closed_ intervals.
 class MCAtom {
+  virtual void anchor();
 public:
   virtual ~MCAtom() {}
 
@@ -138,7 +140,7 @@ public:
 
   const MCDecodedInst &back() const { return Insts.back(); }
   const MCDecodedInst &at(size_t n) const { return Insts.at(n); }
-  uint64_t size() const { return Insts.size(); }
+  size_t size() const { return Insts.size(); }
   /// @}
 
   /// \name Atom type specific split/truncate logic.
@@ -172,6 +174,9 @@ public:
   /// Append a data entry, expanding the atom if necessary.
   void addData(const MCData &D);
 
+  /// Get a reference to the data in this atom.
+  ArrayRef<MCData> getData() const { return Data; }
+
   /// \name Atom type specific split/truncate logic.
   /// @{
   MCDataAtom *split(uint64_t SplitPt) LLVM_OVERRIDE;
@@ -184,7 +189,9 @@ private:
   friend class MCModule;
   // Private constructor - only callable by MCModule
   MCDataAtom(MCModule *P, uint64_t Begin, uint64_t End)
-    : MCAtom(DataAtom, P, Begin, End), Data(End - Begin) {}
+    : MCAtom(DataAtom, P, Begin, End) {
+    Data.reserve(End + 1 - Begin);
+  }
 };
 
 }
diff --git a/include/llvm/MC/MCCodeGenInfo.h b/include/llvm/MC/MCCodeGenInfo.h
index d1765e1..84ce934 100644
--- a/include/llvm/MC/MCCodeGenInfo.h
+++ b/include/llvm/MC/MCCodeGenInfo.h
@@ -42,6 +42,9 @@ namespace llvm {
     CodeModel::Model getCodeModel() const { return CMModel; }
 
     CodeGenOpt::Level getOptLevel() const { return OptLevel; }
+
+    // Allow overriding OptLevel on a per-function basis.
+    void setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; }
   };
 } // namespace llvm
 
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index c012ed0..c8b6626 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -258,9 +258,15 @@ namespace llvm {
 
     const MCSectionCOFF *getCOFFSection(StringRef Section,
                                         unsigned Characteristics,
-                                        SectionKind Kind, int Selection = 0,
+                                        SectionKind Kind,
+                                        StringRef COMDATSymName,
+                                        int Selection,
                                         const MCSectionCOFF *Assoc = 0);
 
+    const MCSectionCOFF *getCOFFSection(StringRef Section,
+                                        unsigned Characteristics,
+                                        SectionKind Kind);
+
     const MCSectionCOFF *getCOFFSection(StringRef Section);
 
     /// @}
@@ -406,7 +412,7 @@ namespace llvm {
     void Deallocate(void *Ptr) {
     }
 
-    // Unrecoverable error has occured. Display the best diagnostic we can
+    // Unrecoverable error has occurred. Display the best diagnostic we can
     // and bail via exit(1). For now, most MC backend errors are unrecoverable.
     // FIXME: We should really do something about that.
     LLVM_ATTRIBUTE_NORETURN void FatalError(SMLoc L, const Twine &Msg);
diff --git a/include/llvm/MC/MCDisassembler.h b/include/llvm/MC/MCDisassembler.h
index e82b0c1..83f26ef 100644
--- a/include/llvm/MC/MCDisassembler.h
+++ b/include/llvm/MC/MCDisassembler.h
@@ -131,6 +131,8 @@ public:
   void *getDisInfoBlock() const { return DisInfo; }
   MCContext *getMCContext() const { return Ctx; }
 
+  const MCSubtargetInfo& getSubtargetInfo() const { return STI; }
+
   // Marked mutable because we cache it inside the disassembler, rather than
   // having to pass it around as an argument through all the autogenerated code.
   mutable raw_ostream *CommentStream;
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index 3ece262..65b920b 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -23,400 +23,445 @@
 #include <vector>
 
 namespace llvm {
-  class MCContext;
-  class MCSection;
-  class MCStreamer;
-  class MCSymbol;
-  class SourceMgr;
-  class SMLoc;
-
-  /// MCDwarfFile - Instances of this class represent the name of the dwarf
-  /// .file directive and its associated dwarf file number in the MC file,
-  /// and MCDwarfFile's are created and unique'd by the MCContext class where
-  /// the file number for each is its index into the vector of DwarfFiles (note
-  /// index 0 is not used and not a valid dwarf file number).
-  class MCDwarfFile {
-    // Name - the base name of the file without its directory path.
-    // The StringRef references memory allocated in the MCContext.
-    StringRef Name;
-
-    // DirIndex - the index into the list of directory names for this file name.
-    unsigned DirIndex;
-
-  private:  // MCContext creates and uniques these.
-    friend class MCContext;
-    MCDwarfFile(StringRef name, unsigned dirIndex)
+class MCAsmBackend;
+class MCContext;
+class MCSection;
+class MCStreamer;
+class MCSymbol;
+class SourceMgr;
+class SMLoc;
+
+/// MCDwarfFile - Instances of this class represent the name of the dwarf
+/// .file directive and its associated dwarf file number in the MC file,
+/// and MCDwarfFile's are created and unique'd by the MCContext class where
+/// the file number for each is its index into the vector of DwarfFiles (note
+/// index 0 is not used and not a valid dwarf file number).
+class MCDwarfFile {
+  // Name - the base name of the file without its directory path.
+  // The StringRef references memory allocated in the MCContext.
+  StringRef Name;
+
+  // DirIndex - the index into the list of directory names for this file name.
+  unsigned DirIndex;
+
+private: // MCContext creates and uniques these.
+  friend class MCContext;
+  MCDwarfFile(StringRef name, unsigned dirIndex)
       : Name(name), DirIndex(dirIndex) {}
 
-    MCDwarfFile(const MCDwarfFile&) LLVM_DELETED_FUNCTION;
-    void operator=(const MCDwarfFile&) LLVM_DELETED_FUNCTION;
-  public:
-    /// getName - Get the base name of this MCDwarfFile.
-    StringRef getName() const { return Name; }
-
-    /// getDirIndex - Get the dirIndex of this MCDwarfFile.
-    unsigned getDirIndex() const { return DirIndex; }
-
-
-    /// print - Print the value to the stream \p OS.
-    void print(raw_ostream &OS) const;
-
-    /// dump - Print the value to stderr.
-    void dump() const;
-  };
-
-  inline raw_ostream &operator<<(raw_ostream &OS, const MCDwarfFile &DwarfFile){
-    DwarfFile.print(OS);
-    return OS;
-  }
-
-  /// MCDwarfLoc - Instances of this class represent the information from a
-  /// dwarf .loc directive.
-  class MCDwarfLoc {
-    // FileNum - the file number.
-    unsigned FileNum;
-    // Line - the line number.
-    unsigned Line;
-    // Column - the column position.
-    unsigned Column;
-    // Flags (see #define's below)
-    unsigned Flags;
-    // Isa
-    unsigned Isa;
-    // Discriminator
-    unsigned Discriminator;
+  MCDwarfFile(const MCDwarfFile &) LLVM_DELETED_FUNCTION;
+  void operator=(const MCDwarfFile &) LLVM_DELETED_FUNCTION;
+
+public:
+  /// getName - Get the base name of this MCDwarfFile.
+  StringRef getName() const { return Name; }
+
+  /// getDirIndex - Get the dirIndex of this MCDwarfFile.
+  unsigned getDirIndex() const { return DirIndex; }
+
+  /// print - Print the value to the stream \p OS.
+  void print(raw_ostream &OS) const;
+
+  /// dump - Print the value to stderr.
+  void dump() const;
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const MCDwarfFile &DwarfFile) {
+  DwarfFile.print(OS);
+  return OS;
+}
+
+/// MCDwarfLoc - Instances of this class represent the information from a
+/// dwarf .loc directive.
+class MCDwarfLoc {
+  // FileNum - the file number.
+  unsigned FileNum;
+  // Line - the line number.
+  unsigned Line;
+  // Column - the column position.
+  unsigned Column;
+  // Flags (see #define's below)
+  unsigned Flags;
+  // Isa
+  unsigned Isa;
+  // Discriminator
+  unsigned Discriminator;
 
 // Flag that indicates the initial value of the is_stmt_start flag.
-#define DWARF2_LINE_DEFAULT_IS_STMT     1
+#define DWARF2_LINE_DEFAULT_IS_STMT 1
 
-#define DWARF2_FLAG_IS_STMT        (1 << 0)
-#define DWARF2_FLAG_BASIC_BLOCK    (1 << 1)
-#define DWARF2_FLAG_PROLOGUE_END   (1 << 2)
+#define DWARF2_FLAG_IS_STMT (1 << 0)
+#define DWARF2_FLAG_BASIC_BLOCK (1 << 1)
+#define DWARF2_FLAG_PROLOGUE_END (1 << 2)
 #define DWARF2_FLAG_EPILOGUE_BEGIN (1 << 3)
 
-  private:  // MCContext manages these
-    friend class MCContext;
-    friend class MCLineEntry;
-    MCDwarfLoc(unsigned fileNum, unsigned line, unsigned column, unsigned flags,
-               unsigned isa, unsigned discriminator)
+private: // MCContext manages these
+  friend class MCContext;
+  friend class MCLineEntry;
+  MCDwarfLoc(unsigned fileNum, unsigned line, unsigned column, unsigned flags,
+             unsigned isa, unsigned discriminator)
       : FileNum(fileNum), Line(line), Column(column), Flags(flags), Isa(isa),
         Discriminator(discriminator) {}
 
-    // Allow the default copy constructor and assignment operator to be used
-    // for an MCDwarfLoc object.
+  // Allow the default copy constructor and assignment operator to be used
+  // for an MCDwarfLoc object.
 
-  public:
-    /// getFileNum - Get the FileNum of this MCDwarfLoc.
-    unsigned getFileNum() const { return FileNum; }
+public:
+  /// getFileNum - Get the FileNum of this MCDwarfLoc.
+  unsigned getFileNum() const { return FileNum; }
 
-    /// getLine - Get the Line of this MCDwarfLoc.
-    unsigned getLine() const { return Line; }
+  /// getLine - Get the Line of this MCDwarfLoc.
+  unsigned getLine() const { return Line; }
 
-    /// getColumn - Get the Column of this MCDwarfLoc.
-    unsigned getColumn() const { return Column; }
+  /// getColumn - Get the Column of this MCDwarfLoc.
+  unsigned getColumn() const { return Column; }
 
-    /// getFlags - Get the Flags of this MCDwarfLoc.
-    unsigned getFlags() const { return Flags; }
+  /// getFlags - Get the Flags of this MCDwarfLoc.
+  unsigned getFlags() const { return Flags; }
 
-    /// getIsa - Get the Isa of this MCDwarfLoc.
-    unsigned getIsa() const { return Isa; }
+  /// getIsa - Get the Isa of this MCDwarfLoc.
+  unsigned getIsa() const { return Isa; }
 
-    /// getDiscriminator - Get the Discriminator of this MCDwarfLoc.
-    unsigned getDiscriminator() const { return Discriminator; }
+  /// getDiscriminator - Get the Discriminator of this MCDwarfLoc.
+  unsigned getDiscriminator() const { return Discriminator; }
 
-    /// setFileNum - Set the FileNum of this MCDwarfLoc.
-    void setFileNum(unsigned fileNum) { FileNum = fileNum; }
+  /// setFileNum - Set the FileNum of this MCDwarfLoc.
+  void setFileNum(unsigned fileNum) { FileNum = fileNum; }
 
-    /// setLine - Set the Line of this MCDwarfLoc.
-    void setLine(unsigned line) { Line = line; }
+  /// setLine - Set the Line of this MCDwarfLoc.
+  void setLine(unsigned line) { Line = line; }
 
-    /// setColumn - Set the Column of this MCDwarfLoc.
-    void setColumn(unsigned column) { Column = column; }
+  /// setColumn - Set the Column of this MCDwarfLoc.
+  void setColumn(unsigned column) { Column = column; }
 
-    /// setFlags - Set the Flags of this MCDwarfLoc.
-    void setFlags(unsigned flags) { Flags = flags; }
+  /// setFlags - Set the Flags of this MCDwarfLoc.
+  void setFlags(unsigned flags) { Flags = flags; }
 
-    /// setIsa - Set the Isa of this MCDwarfLoc.
-    void setIsa(unsigned isa) { Isa = isa; }
+  /// setIsa - Set the Isa of this MCDwarfLoc.
+  void setIsa(unsigned isa) { Isa = isa; }
 
-    /// setDiscriminator - Set the Discriminator of this MCDwarfLoc.
-    void setDiscriminator(unsigned discriminator) {
-      Discriminator = discriminator;
-    }
-  };
+  /// setDiscriminator - Set the Discriminator of this MCDwarfLoc.
+  void setDiscriminator(unsigned discriminator) {
+    Discriminator = discriminator;
+  }
+};
+
+/// MCLineEntry - Instances of this class represent the line information for
+/// the dwarf line table entries.  Which is created after a machine
+/// instruction is assembled and uses an address from a temporary label
+/// created at the current address in the current section and the info from
+/// the last .loc directive seen as stored in the context.
+class MCLineEntry : public MCDwarfLoc {
+  MCSymbol *Label;
+
+private:
+  // Allow the default copy constructor and assignment operator to be used
+  // for an MCLineEntry object.
+
+public:
+  // Constructor to create an MCLineEntry given a symbol and the dwarf loc.
+  MCLineEntry(MCSymbol *label, const MCDwarfLoc loc)
+      : MCDwarfLoc(loc), Label(label) {}
+
+  MCSymbol *getLabel() const { return Label; }
+
+  // This is called when an instruction is assembled into the specified
+  // section and if there is information from the last .loc directive that
+  // has yet to have a line entry made for it is made.
+  static void Make(MCStreamer *MCOS, const MCSection *Section);
+};
+
+/// MCLineSection - Instances of this class represent the line information
+/// for a section where machine instructions have been assembled after seeing
+/// .loc directives.  This is the information used to build the dwarf line
+/// table for a section.
+class MCLineSection {
+
+private:
+  MCLineSection(const MCLineSection &) LLVM_DELETED_FUNCTION;
+  void operator=(const MCLineSection &) LLVM_DELETED_FUNCTION;
+
+public:
+  // Constructor to create an MCLineSection with an empty MCLineEntries
+  // vector.
+  MCLineSection() {}
+
+  // addLineEntry - adds an entry to this MCLineSection's line entries
+  void addLineEntry(const MCLineEntry &LineEntry, unsigned CUID) {
+    MCLineDivisions[CUID].push_back(LineEntry);
+  }
 
-  /// MCLineEntry - Instances of this class represent the line information for
-  /// the dwarf line table entries.  Which is created after a machine
-  /// instruction is assembled and uses an address from a temporary label
-  /// created at the current address in the current section and the info from
-  /// the last .loc directive seen as stored in the context.
-  class MCLineEntry : public MCDwarfLoc {
-    MCSymbol *Label;
-
-  private:
-    // Allow the default copy constructor and assignment operator to be used
-    // for an MCLineEntry object.
-
-  public:
-    // Constructor to create an MCLineEntry given a symbol and the dwarf loc.
-    MCLineEntry(MCSymbol *label, const MCDwarfLoc loc) : MCDwarfLoc(loc),
-                Label(label) {}
-
-    MCSymbol *getLabel() const { return Label; }
-
-    // This is called when an instruction is assembled into the specified
-    // section and if there is information from the last .loc directive that
-    // has yet to have a line entry made for it is made.
-    static void Make(MCStreamer *MCOS, const MCSection *Section);
-  };
+  typedef std::vector<MCLineEntry> MCLineEntryCollection;
+  typedef MCLineEntryCollection::iterator iterator;
+  typedef MCLineEntryCollection::const_iterator const_iterator;
+  typedef std::map<unsigned, MCLineEntryCollection> MCLineDivisionMap;
+
+private:
+  // A collection of MCLineEntry for each Compile Unit ID.
+  MCLineDivisionMap MCLineDivisions;
 
-  /// MCLineSection - Instances of this class represent the line information
-  /// for a section where machine instructions have been assembled after seeing
-  /// .loc directives.  This is the information used to build the dwarf line
-  /// table for a section.
-  class MCLineSection {
-
-  private:
-    MCLineSection(const MCLineSection&) LLVM_DELETED_FUNCTION;
-    void operator=(const MCLineSection&) LLVM_DELETED_FUNCTION;
-
-  public:
-    // Constructor to create an MCLineSection with an empty MCLineEntries
-    // vector.
-    MCLineSection() {}
-
-    // addLineEntry - adds an entry to this MCLineSection's line entries
-    void addLineEntry(const MCLineEntry &LineEntry, unsigned CUID) {
-      MCLineDivisions[CUID].push_back(LineEntry);
-    }
-
-    typedef std::vector<MCLineEntry> MCLineEntryCollection;
-    typedef MCLineEntryCollection::iterator iterator;
-    typedef MCLineEntryCollection::const_iterator const_iterator;
-    typedef std::map<unsigned, MCLineEntryCollection> MCLineDivisionMap;
-
-  private:
-    // A collection of MCLineEntry for each Compile Unit ID.
-    MCLineDivisionMap MCLineDivisions;
-
-  public:
-    // Returns whether MCLineSection contains entries for a given Compile
-    // Unit ID.
-    bool containEntriesForID(unsigned CUID) const {
-      return MCLineDivisions.count(CUID);
-    }
-    // Returns the collection of MCLineEntry for a given Compile Unit ID.
-    const MCLineEntryCollection &getMCLineEntries(unsigned CUID) const {
-      MCLineDivisionMap::const_iterator CIter = MCLineDivisions.find(CUID);
-      assert(CIter != MCLineDivisions.end());
-      return CIter->second;
-    }
+public:
+  // Returns whether MCLineSection contains entries for a given Compile
+  // Unit ID.
+  bool containEntriesForID(unsigned CUID) const {
+    return MCLineDivisions.count(CUID);
+  }
+  // Returns the collection of MCLineEntry for a given Compile Unit ID.
+  const MCLineEntryCollection &getMCLineEntries(unsigned CUID) const {
+    MCLineDivisionMap::const_iterator CIter = MCLineDivisions.find(CUID);
+    assert(CIter != MCLineDivisions.end());
+    return CIter->second;
+  }
+};
+
+class MCDwarfFileTable {
+public:
+  //
+  // This emits the Dwarf file and the line tables for all Compile Units.
+  //
+  static const MCSymbol *Emit(MCStreamer *MCOS);
+  //
+  // This emits the Dwarf file and the line tables for a given Compile Unit.
+  //
+  static const MCSymbol *EmitCU(MCStreamer *MCOS, unsigned ID);
+};
+
+class MCDwarfLineAddr {
+public:
+  /// Utility function to encode a Dwarf pair of LineDelta and AddrDeltas.
+  static void Encode(MCContext &Context, int64_t LineDelta, uint64_t AddrDelta,
+                     raw_ostream &OS);
+
+  /// Utility function to emit the encoding to a streamer.
+  static void Emit(MCStreamer *MCOS, int64_t LineDelta, uint64_t AddrDelta);
+};
+
+class MCGenDwarfInfo {
+public:
+  //
+  // When generating dwarf for assembly source files this emits the Dwarf
+  // sections.
+  //
+  static void Emit(MCStreamer *MCOS, const MCSymbol *LineSectionSymbol);
+};
+
+// When generating dwarf for assembly source files this is the info that is
+// needed to be gathered for each symbol that will have a dwarf label.
+class MCGenDwarfLabelEntry {
+private:
+  // Name of the symbol without a leading underbar, if any.
+  StringRef Name;
+  // The dwarf file number this symbol is in.
+  unsigned FileNumber;
+  // The line number this symbol is at.
+  unsigned LineNumber;
+  // The low_pc for the dwarf label is taken from this symbol.
+  MCSymbol *Label;
+
+public:
+  MCGenDwarfLabelEntry(StringRef name, unsigned fileNumber, unsigned lineNumber,
+                       MCSymbol *label)
+      : Name(name), FileNumber(fileNumber), LineNumber(lineNumber),
+        Label(label) {}
+
+  StringRef getName() const { return Name; }
+  unsigned getFileNumber() const { return FileNumber; }
+  unsigned getLineNumber() const { return LineNumber; }
+  MCSymbol *getLabel() const { return Label; }
+
+  // This is called when label is created when we are generating dwarf for
+  // assembly source files.
+  static void Make(MCSymbol *Symbol, MCStreamer *MCOS, SourceMgr &SrcMgr,
+                   SMLoc &Loc);
+};
+
+class MCCFIInstruction {
+public:
+  enum OpType {
+    OpSameValue,
+    OpRememberState,
+    OpRestoreState,
+    OpOffset,
+    OpDefCfaRegister,
+    OpDefCfaOffset,
+    OpDefCfa,
+    OpRelOffset,
+    OpAdjustCfaOffset,
+    OpEscape,
+    OpRestore,
+    OpUndefined,
+    OpRegister,
+    OpWindowSave
   };
 
-  class MCDwarfFileTable {
-  public:
-    //
-    // This emits the Dwarf file and the line tables for all Compile Units.
-    //
-    static const MCSymbol *Emit(MCStreamer *MCOS);
-    //
-    // This emits the Dwarf file and the line tables for a given Compile Unit.
-    //
-    static const MCSymbol *EmitCU(MCStreamer *MCOS, unsigned ID);
+private:
+  OpType Operation;
+  MCSymbol *Label;
+  unsigned Register;
+  union {
+    int Offset;
+    unsigned Register2;
   };
+  std::vector<char> Values;
 
-  class MCDwarfLineAddr {
-  public:
-    /// Utility function to encode a Dwarf pair of LineDelta and AddrDeltas.
-    static void Encode(MCContext &Context, int64_t LineDelta,
-                       uint64_t AddrDelta, raw_ostream &OS);
+  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, StringRef V)
+      : Operation(Op), Label(L), Register(R), Offset(O),
+        Values(V.begin(), V.end()) {
+    assert(Op != OpRegister);
+  }
 
-    /// Utility function to emit the encoding to a streamer.
-    static void Emit(MCStreamer *MCOS,
-                     int64_t LineDelta,uint64_t AddrDelta);
-  };
+  MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R1, unsigned R2)
+      : Operation(Op), Label(L), Register(R1), Register2(R2) {
+    assert(Op == OpRegister);
+  }
 
-  class MCGenDwarfInfo {
-  public:
-    //
-    // When generating dwarf for assembly source files this emits the Dwarf
-    // sections.
-    //
-    static void Emit(MCStreamer *MCOS, const MCSymbol *LineSectionSymbol);
-  };
+public:
+  /// \brief .cfi_def_cfa defines a rule for computing CFA as: take address from
+  /// Register and add Offset to it.
+  static MCCFIInstruction createDefCfa(MCSymbol *L, unsigned Register,
+                                       int Offset) {
+    return MCCFIInstruction(OpDefCfa, L, Register, -Offset, "");
+  }
 
-  // When generating dwarf for assembly source files this is the info that is
-  // needed to be gathered for each symbol that will have a dwarf label.
-  class MCGenDwarfLabelEntry {
-  private:
-    // Name of the symbol without a leading underbar, if any.
-    StringRef Name;
-    // The dwarf file number this symbol is in.
-    unsigned FileNumber;
-    // The line number this symbol is at.
-    unsigned LineNumber;
-    // The low_pc for the dwarf label is taken from this symbol.
-    MCSymbol *Label;
-
-  public:
-    MCGenDwarfLabelEntry(StringRef name, unsigned fileNumber,
-                         unsigned lineNumber, MCSymbol *label) :
-      Name(name), FileNumber(fileNumber), LineNumber(lineNumber), Label(label){}
-
-    StringRef getName() const { return Name; }
-    unsigned getFileNumber() const { return FileNumber; }
-    unsigned getLineNumber() const { return LineNumber; }
-    MCSymbol *getLabel() const { return Label; }
-
-    // This is called when label is created when we are generating dwarf for
-    // assembly source files.
-    static void Make(MCSymbol *Symbol, MCStreamer *MCOS, SourceMgr &SrcMgr,
-                     SMLoc &Loc);
-  };
+  /// \brief .cfi_def_cfa_register modifies a rule for computing CFA. From now
+  /// on Register will be used instead of the old one. Offset remains the same.
+  static MCCFIInstruction createDefCfaRegister(MCSymbol *L, unsigned Register) {
+    return MCCFIInstruction(OpDefCfaRegister, L, Register, 0, "");
+  }
 
-  class MCCFIInstruction {
-  public:
-    enum OpType { OpSameValue, OpRememberState, OpRestoreState, OpOffset,
-                  OpDefCfaRegister, OpDefCfaOffset, OpDefCfa, OpRelOffset,
-                  OpAdjustCfaOffset, OpEscape, OpRestore, OpUndefined,
-                  OpRegister };
-  private:
-    OpType Operation;
-    MCSymbol *Label;
-    unsigned Register;
-    union {
-      int Offset;
-      unsigned Register2;
-    };
-    std::vector<char> Values;
-
-    MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R, int O, StringRef V) :
-      Operation(Op), Label(L), Register(R), Offset(O),
-      Values(V.begin(), V.end()) {
-      assert(Op != OpRegister);
-    }
-
-    MCCFIInstruction(OpType Op, MCSymbol *L, unsigned R1, unsigned R2) :
-      Operation(Op), Label(L), Register(R1), Register2(R2) {
-      assert(Op == OpRegister);
-    }
-
-  public:
-    static MCCFIInstruction
-    createOffset(MCSymbol *L, unsigned Register, int Offset) {
-      return MCCFIInstruction(OpOffset, L, Register, Offset, "");
-    }
-
-    static MCCFIInstruction
-    createDefCfaRegister(MCSymbol *L, unsigned Register) {
-      return MCCFIInstruction(OpDefCfaRegister, L, Register, 0, "");
-    }
-
-    static MCCFIInstruction createDefCfaOffset(MCSymbol *L, int Offset) {
-      return MCCFIInstruction(OpDefCfaOffset, L, 0, -Offset, "");
-    }
-
-    static MCCFIInstruction
-    createDefCfa(MCSymbol *L, unsigned Register, int Offset) {
-      return MCCFIInstruction(OpDefCfa, L, Register, -Offset, "");
-    }
-
-    static MCCFIInstruction createUndefined(MCSymbol *L, unsigned Register) {
-      return MCCFIInstruction(OpUndefined, L, Register, 0, "");
-    }
-
-    static MCCFIInstruction createRestore(MCSymbol *L, unsigned Register) {
-      return MCCFIInstruction(OpRestore, L, Register, 0, "");
-    }
-
-    static MCCFIInstruction createSameValue(MCSymbol *L, unsigned Register) {
-      return MCCFIInstruction(OpSameValue, L, Register, 0, "");
-    }
-
-    static MCCFIInstruction createRestoreState(MCSymbol *L) {
-      return MCCFIInstruction(OpRestoreState, L, 0, 0, "");
-    }
-
-    static MCCFIInstruction createRememberState(MCSymbol *L) {
-      return MCCFIInstruction(OpRememberState, L, 0, 0, "");
-    }
-
-    static MCCFIInstruction
-    createRelOffset(MCSymbol *L, unsigned Register, int Offset) {
-      return MCCFIInstruction(OpRelOffset, L, Register, Offset, "");
-    }
-
-    static MCCFIInstruction
-    createAdjustCfaOffset(MCSymbol *L, int Adjustment) {
-      return MCCFIInstruction(OpAdjustCfaOffset, L, 0, Adjustment, "");
-    }
-
-    static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals) {
-      return MCCFIInstruction(OpEscape, L, 0, 0, Vals);
-    }
-
-   static MCCFIInstruction
-   createRegister(MCSymbol *L, unsigned Register1, unsigned Register2) {
-      return MCCFIInstruction(OpRegister, L, Register1, Register2);
-    }
-
-    OpType getOperation() const { return Operation; }
-    MCSymbol *getLabel() const { return Label; }
-
-    unsigned getRegister() const {
-      assert(Operation == OpDefCfa || Operation == OpOffset ||
-             Operation == OpRestore || Operation == OpUndefined ||
-             Operation == OpSameValue || Operation == OpDefCfaRegister ||
-             Operation == OpRelOffset || Operation == OpRegister);
-      return Register;
-    }
-
-    unsigned getRegister2() const {
-      assert(Operation == OpRegister);
-      return Register2;
-    }
-
-    int getOffset() const {
-      assert(Operation == OpDefCfa || Operation == OpOffset ||
-             Operation == OpRelOffset || Operation == OpDefCfaOffset ||
-             Operation == OpAdjustCfaOffset);
-      return Offset;
-    }
-
-    const StringRef getValues() const {
-      assert(Operation == OpEscape);
-      return StringRef(&Values[0], Values.size());
-    }
-  };
+  /// \brief .cfi_def_cfa_offset modifies a rule for computing CFA. Register
+  /// remains the same, but offset is new. Note that it is the absolute offset
+  /// that will be added to a defined register to the compute CFA address.
+  static MCCFIInstruction createDefCfaOffset(MCSymbol *L, int Offset) {
+    return MCCFIInstruction(OpDefCfaOffset, L, 0, -Offset, "");
+  }
 
-  struct MCDwarfFrameInfo {
-    MCDwarfFrameInfo() : Begin(0), End(0), Personality(0), Lsda(0),
-                         Function(0), Instructions(), PersonalityEncoding(),
-                         LsdaEncoding(0), CompactUnwindEncoding(0),
-                         IsSignalFrame(false) {}
-    MCSymbol *Begin;
-    MCSymbol *End;
-    const MCSymbol *Personality;
-    const MCSymbol *Lsda;
-    const MCSymbol *Function;
-    std::vector<MCCFIInstruction> Instructions;
-    unsigned PersonalityEncoding;
-    unsigned LsdaEncoding;
-    uint32_t CompactUnwindEncoding;
-    bool IsSignalFrame;
-  };
+  /// \brief .cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but
+  /// Offset is a relative value that is added/subtracted from the previous
+  /// offset.
+  static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int Adjustment) {
+    return MCCFIInstruction(OpAdjustCfaOffset, L, 0, Adjustment, "");
+  }
 
-  class MCDwarfFrameEmitter {
-  public:
-    //
-    // This emits the frame info section.
-    //
-    static void Emit(MCStreamer &streamer, bool usingCFI,
-                     bool isEH);
-    static void EmitAdvanceLoc(MCStreamer &Streamer, uint64_t AddrDelta);
-    static void EncodeAdvanceLoc(MCContext &Context, uint64_t AddrDelta,
-                                 raw_ostream &OS);
-  };
+  /// \brief .cfi_offset Previous value of Register is saved at offset Offset
+  /// from CFA.
+  static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register,
+                                       int Offset) {
+    return MCCFIInstruction(OpOffset, L, Register, Offset, "");
+  }
+
+  /// \brief .cfi_rel_offset Previous value of Register is saved at offset
+  /// Offset from the current CFA register. This is transformed to .cfi_offset
+  /// using the known displacement of the CFA register from the CFA.
+  static MCCFIInstruction createRelOffset(MCSymbol *L, unsigned Register,
+                                          int Offset) {
+    return MCCFIInstruction(OpRelOffset, L, Register, Offset, "");
+  }
+
+  /// \brief .cfi_register Previous value of Register1 is saved in
+  /// register Register2.
+  static MCCFIInstruction createRegister(MCSymbol *L, unsigned Register1,
+                                         unsigned Register2) {
+    return MCCFIInstruction(OpRegister, L, Register1, Register2);
+  }
+
+  /// \brief .cfi_window_save SPARC register window is saved.
+  static MCCFIInstruction createWindowSave(MCSymbol *L) {
+    return MCCFIInstruction(OpWindowSave, L, 0, 0, "");
+  }
+
+  /// \brief .cfi_restore says that the rule for Register is now the same as it
+  /// was at the beginning of the function, after all initial instructions added
+  /// by .cfi_startproc were executed.
+  static MCCFIInstruction createRestore(MCSymbol *L, unsigned Register) {
+    return MCCFIInstruction(OpRestore, L, Register, 0, "");
+  }
+
+  /// \brief .cfi_undefined From now on the previous value of Register can't be
+  /// restored anymore.
+  static MCCFIInstruction createUndefined(MCSymbol *L, unsigned Register) {
+    return MCCFIInstruction(OpUndefined, L, Register, 0, "");
+  }
+
+  /// \brief .cfi_same_value Current value of Register is the same as in the
+  /// previous frame. I.e., no restoration is needed.
+  static MCCFIInstruction createSameValue(MCSymbol *L, unsigned Register) {
+    return MCCFIInstruction(OpSameValue, L, Register, 0, "");
+  }
+
+  /// \brief .cfi_remember_state Save all current rules for all registers.
+  static MCCFIInstruction createRememberState(MCSymbol *L) {
+    return MCCFIInstruction(OpRememberState, L, 0, 0, "");
+  }
+
+  /// \brief .cfi_restore_state Restore the previously saved state.
+  static MCCFIInstruction createRestoreState(MCSymbol *L) {
+    return MCCFIInstruction(OpRestoreState, L, 0, 0, "");
+  }
+
+  /// \brief .cfi_escape Allows the user to add arbitrary bytes to the unwind
+  /// info.
+  static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals) {
+    return MCCFIInstruction(OpEscape, L, 0, 0, Vals);
+  }
+
+  OpType getOperation() const { return Operation; }
+  MCSymbol *getLabel() const { return Label; }
+
+  unsigned getRegister() const {
+    assert(Operation == OpDefCfa || Operation == OpOffset ||
+           Operation == OpRestore || Operation == OpUndefined ||
+           Operation == OpSameValue || Operation == OpDefCfaRegister ||
+           Operation == OpRelOffset || Operation == OpRegister);
+    return Register;
+  }
+
+  unsigned getRegister2() const {
+    assert(Operation == OpRegister);
+    return Register2;
+  }
+
+  int getOffset() const {
+    assert(Operation == OpDefCfa || Operation == OpOffset ||
+           Operation == OpRelOffset || Operation == OpDefCfaOffset ||
+           Operation == OpAdjustCfaOffset);
+    return Offset;
+  }
+
+  const StringRef getValues() const {
+    assert(Operation == OpEscape);
+    return StringRef(&Values[0], Values.size());
+  }
+};
+
+struct MCDwarfFrameInfo {
+  MCDwarfFrameInfo()
+      : Begin(0), End(0), Personality(0), Lsda(0), Function(0), Instructions(),
+        PersonalityEncoding(), LsdaEncoding(0), CompactUnwindEncoding(0),
+        IsSignalFrame(false) {}
+  MCSymbol *Begin;
+  MCSymbol *End;
+  const MCSymbol *Personality;
+  const MCSymbol *Lsda;
+  const MCSymbol *Function;
+  std::vector<MCCFIInstruction> Instructions;
+  unsigned PersonalityEncoding;
+  unsigned LsdaEncoding;
+  uint32_t CompactUnwindEncoding;
+  bool IsSignalFrame;
+};
+
+class MCDwarfFrameEmitter {
+public:
+  //
+  // This emits the frame info section.
+  //
+  static void Emit(MCStreamer &streamer, MCAsmBackend *MAB,
+                   bool usingCFI, bool isEH);
+  static void EmitAdvanceLoc(MCStreamer &Streamer, uint64_t AddrDelta);
+  static void EncodeAdvanceLoc(MCContext &Context, uint64_t AddrDelta,
+                               raw_ostream &OS);
+};
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/MC/MCELFStreamer.h b/include/llvm/MC/MCELFStreamer.h
index 7565c15..4e24dcf 100644
--- a/include/llvm/MC/MCELFStreamer.h
+++ b/include/llvm/MC/MCELFStreamer.h
@@ -28,20 +28,17 @@ class MCSymbolData;
 class raw_ostream;
 
 class MCELFStreamer : public MCObjectStreamer {
-protected:
-  MCELFStreamer(StreamerKind Kind, MCContext &Context, MCAsmBackend &TAB,
-                raw_ostream &OS, MCCodeEmitter *Emitter)
-      : MCObjectStreamer(Kind, Context, TAB, OS, Emitter) {}
-
 public:
-  MCELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
-                MCCodeEmitter *Emitter)
-      : MCObjectStreamer(SK_ELFStreamer, Context, TAB, OS, Emitter) {}
+  MCELFStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
+                MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter)
+      : MCObjectStreamer(Context, TargetStreamer, TAB, OS, Emitter), 
+                         SeenIdent(false) {}
 
-  MCELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
-                MCCodeEmitter *Emitter, MCAssembler *Assembler)
-      : MCObjectStreamer(SK_ELFStreamer, Context, TAB, OS, Emitter,
-                         Assembler) {}
+  MCELFStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
+                MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter,
+                MCAssembler *Assembler)
+      : MCObjectStreamer(Context, TargetStreamer, TAB, OS, Emitter, Assembler), 
+                         SeenIdent(false) {}
 
   virtual ~MCELFStreamer();
 
@@ -57,7 +54,7 @@ public:
   virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
   virtual void EmitThumbFunc(MCSymbol *Func);
   virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol);
-  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+  virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
   virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
   virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                 unsigned ByteAlignment);
@@ -81,16 +78,13 @@ public:
 
   virtual void EmitFileDirective(StringRef Filename);
 
-  virtual void EmitTCEntry(const MCSymbol &S);
+  virtual void EmitIdent(StringRef IdentString);
 
   virtual void EmitValueToAlignment(unsigned, int64_t, unsigned, unsigned);
 
-  virtual void FinishImpl();
-  /// @}
+  virtual void Flush();
 
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_ELFStreamer || S->getKind() == SK_ARMELFStreamer;
-  }
+  virtual void FinishImpl();
 
 private:
   virtual void EmitInstToFragment(const MCInst &Inst);
@@ -102,6 +96,8 @@ private:
 
   void fixSymbolsInTLSFixups(const MCExpr *expr);
 
+  bool SeenIdent;
+
   struct LocalCommon {
     MCSymbolData *SD;
     uint64_t Size;
@@ -120,6 +116,11 @@ private:
   void SetSectionBss();
 };
 
+MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    bool RelaxAll, bool NoExecStack,
+                                    bool IsThumb);
+
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/MC/MCELFSymbolFlags.h b/include/llvm/MC/MCELFSymbolFlags.h
index 2225ea0..d0e1dac 100644
--- a/include/llvm/MC/MCELFSymbolFlags.h
+++ b/include/llvm/MC/MCELFSymbolFlags.h
@@ -27,7 +27,7 @@ namespace llvm {
     ELF_Other_Shift = 10 // Shift value for other flags.
   };
 
-  enum SymbolFlags {
+  enum ELFSymbolFlags {
     ELF_STB_Local     = (ELF::STB_LOCAL     << ELF_STB_Shift),
       ELF_STB_Global    = (ELF::STB_GLOBAL    << ELF_STB_Shift),
       ELF_STB_Weak      = (ELF::STB_WEAK      << ELF_STB_Shift),
diff --git a/include/llvm/MC/MCFunction.h b/include/llvm/MC/MCFunction.h
index b85011e..22c9192 100644
--- a/include/llvm/MC/MCFunction.h
+++ b/include/llvm/MC/MCFunction.h
@@ -70,25 +70,33 @@ public:
 
   void addPredecessor(const MCBasicBlock *MCBB);
   bool isPredecessor(const MCBasicBlock *MCBB) const;
+
+  /// \brief Split block, mirrorring NewAtom = Insts->split(..).
+  /// This moves all successors to \p SplitBB, and
+  /// adds a fallthrough to it.
+  /// \p SplitBB The result of splitting Insts, a basic block directly following
+  /// this basic block.
+  void splitBasicBlock(MCBasicBlock *SplitBB);
   /// @}
 };
 
 /// \brief Represents a function in machine code, containing MCBasicBlocks.
-/// MCFunctions are created using MCModule::createFunction.
+/// MCFunctions are created by MCModule.
 class MCFunction {
   MCFunction           (const MCFunction&) LLVM_DELETED_FUNCTION;
   MCFunction& operator=(const MCFunction&) LLVM_DELETED_FUNCTION;
 
   std::string Name;
+  MCModule *ParentModule;
   typedef std::vector<MCBasicBlock*> BasicBlockListTy;
   BasicBlockListTy Blocks;
 
   // MCModule owns the function.
   friend class MCModule;
-  MCFunction(StringRef Name);
-public:
+  MCFunction(StringRef Name, MCModule *Parent);
   ~MCFunction();
 
+public:
   /// \brief Create an MCBasicBlock backed by Insts and add it to this function.
   /// \param Insts Sequence of straight-line code backing the basic block.
   /// \returns The newly created basic block.
@@ -96,13 +104,21 @@ public:
 
   StringRef getName() const { return Name; }
 
-  /// \name Access to the function's basic blocks. No ordering is enforced.
+  /// \name Get the owning MC Module.
+  /// @{
+  const MCModule *getParent() const { return ParentModule; }
+        MCModule *getParent()       { return ParentModule; }
+  /// @}
+
+  /// \name Access to the function's basic blocks. No ordering is enforced,
+  /// except that the first block is the entry block.
   /// @{
   /// \brief Get the entry point basic block.
   const MCBasicBlock *getEntryBlock() const { return front(); }
         MCBasicBlock *getEntryBlock()       { return front(); }
 
-  // NOTE: Dereferencing iterators gives pointers, so maybe a list is best here.
+  bool empty() const { return Blocks.empty(); }
+
   typedef BasicBlockListTy::const_iterator const_iterator;
   typedef BasicBlockListTy::      iterator       iterator;
   const_iterator begin() const { return Blocks.begin(); }
@@ -114,6 +130,10 @@ public:
         MCBasicBlock* front()       { return Blocks.front(); }
   const MCBasicBlock*  back() const { return Blocks.back(); }
         MCBasicBlock*  back()       { return Blocks.back(); }
+
+  /// \brief Find the basic block, if any, that starts at \p StartAddr.
+  const MCBasicBlock *find(uint64_t StartAddr) const;
+        MCBasicBlock *find(uint64_t StartAddr);
   /// @}
 };
 
diff --git a/include/llvm/MC/MCInstPrinter.h b/include/llvm/MC/MCInstPrinter.h
index cb7225f..b4258be 100644
--- a/include/llvm/MC/MCInstPrinter.h
+++ b/include/llvm/MC/MCInstPrinter.h
@@ -41,7 +41,7 @@ protected:
   const MCRegisterInfo &MRI;
 
   /// The current set of available features.
-  unsigned AvailableFeatures;
+  uint64_t AvailableFeatures;
 
   /// True if we are printing marked up assembly.
   bool UseMarkup;
@@ -77,8 +77,8 @@ public:
   /// printRegName - Print the assembler register name.
   virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
 
-  unsigned getAvailableFeatures() const { return AvailableFeatures; }
-  void setAvailableFeatures(unsigned Value) { AvailableFeatures = Value; }
+  uint64_t getAvailableFeatures() const { return AvailableFeatures; }
+  void setAvailableFeatures(uint64_t Value) { AvailableFeatures = Value; }
 
   bool getUseMarkup() const { return UseMarkup; }
   void setUseMarkup(bool Value) { UseMarkup = Value; }
diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h
index 310f706..214b593 100644
--- a/include/llvm/MC/MCInstrDesc.h
+++ b/include/llvm/MC/MCInstrDesc.h
@@ -17,6 +17,7 @@
 
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/DataTypes.h"
 
 namespace llvm {
@@ -145,6 +146,10 @@ public:
   const uint16_t *ImplicitUses;  // Registers implicitly read by this instr
   const uint16_t *ImplicitDefs;  // Registers implicitly defined by this instr
   const MCOperandInfo *OpInfo;   // 'NumOperands' entries about operands
+  uint64_t DeprecatedFeatureMask;// Feature bits that this is deprecated on, if any
+  // A complex method to determine is a certain is deprecated or not, and return
+  // the reason for deprecation.
+  bool (*ComplexDeprecationInfo)(MCInst &, MCSubtargetInfo &, std::string &);
 
   /// \brief Returns the value of the specific constraint if
   /// it is set. Returns -1 if it is not set.
@@ -158,6 +163,20 @@ public:
     return -1;
   }
 
+  /// \brief Returns true if a certain instruction is deprecated and if so
+  /// returns the reason in \p Info.
+  bool getDeprecatedInfo(MCInst &MI, MCSubtargetInfo &STI,
+                         std::string &Info) const {
+    if (ComplexDeprecationInfo)
+      return ComplexDeprecationInfo(MI, STI, Info);
+    if ((DeprecatedFeatureMask & STI.getFeatureBits()) != 0) {
+      // FIXME: it would be nice to include the subtarget feature here.
+      Info = "deprecated";
+      return true;
+    }
+    return false;
+  }
+
   /// \brief Return the opcode number for this descriptor.
   unsigned getOpcode() const {
     return Opcode;
diff --git a/include/llvm/MC/MCMachOSymbolFlags.h b/include/llvm/MC/MCMachOSymbolFlags.h
index 696436d..71f01fa 100644
--- a/include/llvm/MC/MCMachOSymbolFlags.h
+++ b/include/llvm/MC/MCMachOSymbolFlags.h
@@ -19,9 +19,9 @@
 // the correct relocation information.
 
 namespace llvm {
-  /// SymbolFlags - We store the value for the 'desc' symbol field in the lowest
-  /// 16 bits of the implementation defined flags.
-  enum SymbolFlags { // See <mach-o/nlist.h>.
+  /// MachOSymbolFlags - We store the value for the 'desc' symbol field in the
+  /// lowest 16 bits of the implementation defined flags.
+  enum MachOSymbolFlags { // See <mach-o/nlist.h>.
     SF_DescFlagsMask                        = 0xFFFF,
 
     // Reference type flags.
diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h
index 3c9a588..3ba6e65 100644
--- a/include/llvm/MC/MCMachObjectWriter.h
+++ b/include/llvm/MC/MCMachObjectWriter.h
@@ -15,8 +15,8 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectWriter.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/DataTypes.h"
+#include "llvm/Support/MachO.h"
 #include <vector>
 
 namespace llvm {
@@ -98,7 +98,7 @@ class MachObjectWriter : public MCObjectWriter {
   /// @{
 
   llvm::DenseMap<const MCSectionData*,
-                 std::vector<object::macho::RelocationEntry> > Relocations;
+                 std::vector<MachO::any_relocation_info> > Relocations;
   llvm::DenseMap<const MCSectionData*, unsigned> IndirectSymBase;
 
   /// @}
@@ -155,9 +155,8 @@ public:
 
   bool is64Bit() const { return TargetObjectWriter->is64Bit(); }
   bool isARM() const {
-    uint32_t CPUType = TargetObjectWriter->getCPUType() &
-      ~object::mach::CTFM_ArchMask;
-    return CPUType == object::mach::CTM_ARM;
+    uint32_t CPUType = TargetObjectWriter->getCPUType() & ~MachO::CPU_ARCH_MASK;
+    return CPUType == MachO::CPU_TYPE_ARM;
   }
 
   /// @}
@@ -213,7 +212,7 @@ public:
   //    these through in many cases.
 
   void addRelocation(const MCSectionData *SD,
-                     object::macho::RelocationEntry &MRE) {
+                     MachO::any_relocation_info &MRE) {
     Relocations[SD].push_back(MRE);
   }
 
diff --git a/include/llvm/MC/MCModule.h b/include/llvm/MC/MCModule.h
index 02f8ca0..63635c7 100644
--- a/include/llvm/MC/MCModule.h
+++ b/include/llvm/MC/MCModule.h
@@ -23,6 +23,7 @@
 namespace llvm {
 
 class MCAtom;
+class MCBasicBlock;
 class MCDataAtom;
 class MCFunction;
 class MCObjectDisassembler;
@@ -42,7 +43,9 @@ class MCModule {
   typedef std::vector<MCAtom*> AtomListTy;
   AtomListTy Atoms;
 
+  // For access to map/remap.
   friend class MCAtom;
+
   /// \brief Remap \p Atom to the given range, and update its Begin/End fields.
   /// \param Atom An atom belonging to this module.
   /// An atom should always use this method to update its bounds, because this
@@ -53,20 +56,38 @@ class MCModule {
   void map(MCAtom *NewAtom);
   /// @}
 
+  /// \name Basic block tracking
+  /// @{
+  typedef std::vector<MCBasicBlock*> BBsByAtomTy;
+  BBsByAtomTy BBsByAtom;
+
+  // For access to basic block > atom tracking.
+  friend class MCBasicBlock;
+  friend class MCTextAtom;
+
+  /// \brief Keep track of \p BBBackedByAtom as being backed by \p Atom.
+  /// This is used to update succs/preds when \p Atom is split.
+  void trackBBForAtom(const MCTextAtom *Atom, MCBasicBlock *BBBackedByAtom);
+  void splitBasicBlocksForAtom(const MCTextAtom *TA, const MCTextAtom *NewTA);
+  /// @}
+
   /// \name Function tracking
   /// @{
   typedef std::vector<MCFunction*> FunctionListTy;
   FunctionListTy Functions;
   /// @}
 
+  /// The address of the entrypoint function.
+  uint64_t Entrypoint;
+
   MCModule           (const MCModule &) LLVM_DELETED_FUNCTION;
   MCModule& operator=(const MCModule &) LLVM_DELETED_FUNCTION;
 
   // MCObjectDisassembler creates MCModules.
   friend class MCObjectDisassembler;
-  MCModule() : Atoms() { }
 
 public:
+  MCModule() : Entrypoint(0) { }
   ~MCModule();
 
   /// \name Create a new MCAtom covering the specified offset range.
@@ -79,6 +100,8 @@ public:
   /// @{
   const MCAtom *findAtomContaining(uint64_t Addr) const;
         MCAtom *findAtomContaining(uint64_t Addr);
+  const MCAtom *findFirstAtomAfter(uint64_t Addr) const;
+        MCAtom *findFirstAtomAfter(uint64_t Addr);
 
   typedef AtomListTy::const_iterator const_atom_iterator;
   typedef AtomListTy::      iterator       atom_iterator;
@@ -88,8 +111,8 @@ public:
         atom_iterator atom_end()         { return Atoms.end(); }
   /// @}
 
-  /// \name Create a new MCFunction.
-  MCFunction *createFunction(const StringRef &Name);
+  /// \brief Create a new MCFunction.
+  MCFunction *createFunction(StringRef Name);
 
   /// \name Access to the owned function list.
   /// @{
@@ -100,6 +123,9 @@ public:
   const_func_iterator func_end()   const { return Functions.end(); }
         func_iterator func_end()         { return Functions.end(); }
   /// @}
+
+  /// \brief Get the address of the entrypoint function, or 0 if there is none.
+  uint64_t getEntrypoint() const { return Entrypoint; }
 };
 
 }
diff --git a/include/llvm/MC/MCModuleYAML.h b/include/llvm/MC/MCModuleYAML.h
new file mode 100644
index 0000000..281e3d8
--- /dev/null
+++ b/include/llvm/MC/MCModuleYAML.h
@@ -0,0 +1,41 @@
+//===- MCModuleYAML.h - MCModule YAMLIO implementation ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares classes for handling the YAML representation
+/// of MCModule.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCMODULEYAML_H
+#define LLVM_MC_MCMODULEYAML_H
+
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCModule.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCInstrInfo;
+class MCRegisterInfo;
+
+/// \brief Dump a YAML representation of the MCModule \p MCM to \p OS.
+/// \returns The empty string on success, an error message on failure.
+StringRef mcmodule2yaml(raw_ostream &OS, const MCModule &MCM,
+                        const MCInstrInfo &MII, const MCRegisterInfo &MRI);
+
+/// \brief Creates a new module and returns it in \p MCM.
+/// \returns The empty string on success, an error message on failure.
+StringRef yaml2mcmodule(OwningPtr<MCModule> &MCM, StringRef YamlContent,
+                        const MCInstrInfo &MII, const MCRegisterInfo &MRI);
+
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/MC/MCObjectDisassembler.h b/include/llvm/MC/MCObjectDisassembler.h
index 749a54e..0d87d33 100644
--- a/include/llvm/MC/MCObjectDisassembler.h
+++ b/include/llvm/MC/MCObjectDisassembler.h
@@ -15,10 +15,18 @@
 #ifndef LLVM_MC_MCOBJECTDISASSEMBLER_H
 #define LLVM_MC_MCOBJECTDISASSEMBLER_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/MemoryObject.h"
+#include <vector>
+
 namespace llvm {
 
 namespace object {
   class ObjectFile;
+  class MachOObjectFile;
 }
 
 class MCBasicBlock;
@@ -26,6 +34,7 @@ class MCDisassembler;
 class MCFunction;
 class MCInstrAnalysis;
 class MCModule;
+class MCObjectSymbolizer;
 
 /// \brief Disassemble an ObjectFile to an MCModule and MCFunctions.
 /// This class builds on MCDisassembler to disassemble whole sections, creating
@@ -33,14 +42,11 @@ class MCModule;
 /// It can also be used to create a control flow graph consisting of MCFunctions
 /// and MCBasicBlocks.
 class MCObjectDisassembler {
-  const object::ObjectFile &Obj;
-  const MCDisassembler &Dis;
-  const MCInstrAnalysis &MIA;
-
 public:
   MCObjectDisassembler(const object::ObjectFile &Obj,
                        const MCDisassembler &Dis,
                        const MCInstrAnalysis &MIA);
+  virtual ~MCObjectDisassembler() {}
 
   /// \brief Build an MCModule, creating atoms and optionally functions.
   /// \param withCFG Also build a CFG by adding MCFunctions to the Module.
@@ -50,6 +56,72 @@ public:
   /// block atoms, which then each back an MCBasicBlock.
   MCModule *buildModule(bool withCFG = false);
 
+  MCModule *buildEmptyModule();
+
+  typedef std::vector<uint64_t> AddressSetTy;
+  /// \name Create a new MCFunction.
+  MCFunction *createFunction(MCModule *Module, uint64_t BeginAddr,
+                             AddressSetTy &CallTargets,
+                             AddressSetTy &TailCallTargets);
+
+  /// \brief Set the region on which to fallback if disassembly was requested
+  /// somewhere not accessible in the object file.
+  /// This is used for dynamic disassembly (see RawMemoryObject).
+  void setFallbackRegion(OwningPtr<MemoryObject> &Region) {
+    FallbackRegion.reset(Region.take());
+  }
+
+  /// \brief Set the symbolizer to use to get information on external functions.
+  /// Note that this isn't used to do instruction-level symbolization (that is,
+  /// plugged into MCDisassembler), but to symbolize function call targets.
+  void setSymbolizer(MCObjectSymbolizer *ObjectSymbolizer) {
+    MOS = ObjectSymbolizer;
+  }
+
+  /// \brief Get the effective address of the entrypoint, or 0 if there is none.
+  virtual uint64_t getEntrypoint();
+
+  /// \name Get the addresses of static constructors/destructors in the object.
+  /// The caller is expected to know how to interpret the addresses;
+  /// for example, Mach-O init functions expect 5 arguments, not for ELF.
+  /// The addresses are original object file load addresses, not effective.
+  /// @{
+  virtual ArrayRef<uint64_t> getStaticInitFunctions();
+  virtual ArrayRef<uint64_t> getStaticExitFunctions();
+  /// @}
+
+  /// \name Translation between effective and objectfile load address.
+  /// @{
+  /// \brief Compute the effective load address, from an objectfile virtual
+  /// address. This is implemented in a format-specific way, to take into
+  /// account things like PIE/ASLR when doing dynamic disassembly.
+  /// For example, on Mach-O this would be done by adding the VM addr slide,
+  /// on glibc ELF by keeping a map between segment load addresses, filled
+  /// using dl_iterate_phdr, etc..
+  /// In most static situations and in the default impl., this returns \p Addr.
+  virtual uint64_t getEffectiveLoadAddr(uint64_t Addr);
+
+  /// \brief Compute the original load address, as specified in the objectfile.
+  /// This is the inverse of getEffectiveLoadAddr.
+  virtual uint64_t getOriginalLoadAddr(uint64_t EffectiveAddr);
+  /// @}
+
+protected:
+  const object::ObjectFile &Obj;
+  const MCDisassembler &Dis;
+  const MCInstrAnalysis &MIA;
+  MCObjectSymbolizer *MOS;
+
+  /// \brief The fallback memory region, outside the object file.
+  OwningPtr<MemoryObject> FallbackRegion;
+
+  /// \brief Return a memory region suitable for reading starting at \p Addr.
+  /// In most cases, this returns a StringRefMemoryObject backed by the
+  /// containing section. When no section was found, this returns the
+  /// FallbackRegion, if it is suitable.
+  /// If it is not, or if there is no fallback region, this returns 0.
+  MemoryObject *getRegionFor(uint64_t Addr);
+
 private:
   /// \brief Fill \p Module by creating an atom for each section.
   /// This could be made much smarter, using information like symbols, but also
@@ -62,6 +134,40 @@ private:
   /// When the CFG is built, contiguous instructions that were previously in a
   /// single MCTextAtom will be split in multiple basic block atoms.
   void buildCFG(MCModule *Module);
+
+  MCBasicBlock *getBBAt(MCModule *Module, MCFunction *MCFN, uint64_t BeginAddr,
+                        AddressSetTy &CallTargets,
+                        AddressSetTy &TailCallTargets);
+};
+
+class MCMachOObjectDisassembler : public MCObjectDisassembler {
+  const object::MachOObjectFile &MOOF;
+
+  uint64_t VMAddrSlide;
+  uint64_t HeaderLoadAddress;
+
+  // __DATA;__mod_init_func support.
+  llvm::StringRef ModInitContents;
+  // __DATA;__mod_exit_func support.
+  llvm::StringRef ModExitContents;
+
+public:
+  /// \brief Construct a Mach-O specific object disassembler.
+  /// \param VMAddrSlide The virtual address slide applied by dyld.
+  /// \param HeaderLoadAddress The load address of the mach_header for this
+  /// object.
+  MCMachOObjectDisassembler(const object::MachOObjectFile &MOOF,
+                            const MCDisassembler &Dis,
+                            const MCInstrAnalysis &MIA, uint64_t VMAddrSlide,
+                            uint64_t HeaderLoadAddress);
+
+protected:
+  uint64_t getEffectiveLoadAddr(uint64_t Addr) LLVM_OVERRIDE;
+  uint64_t getOriginalLoadAddr(uint64_t EffectiveAddr) LLVM_OVERRIDE;
+  uint64_t getEntrypoint() LLVM_OVERRIDE;
+
+  ArrayRef<uint64_t> getStaticInitFunctions() LLVM_OVERRIDE;
+  ArrayRef<uint64_t> getStaticExitFunctions() LLVM_OVERRIDE;
 };
 
 }
diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index a5853b6..c48dcb0 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -125,6 +125,10 @@ protected:
   const MCSection *DwarfStrOffDWOSection;
   const MCSection *DwarfAddrSection;
 
+  /// Sections for newer gnu pubnames and pubtypes.
+  const MCSection *DwarfGnuPubNamesSection;
+  const MCSection *DwarfGnuPubTypesSection;
+
   // Extra TLS Variable Data section.  If the target needs to put additional
   // information for a TLS variable, it'll go here.
   const MCSection *TLSExtraDataSection;
@@ -138,6 +142,8 @@ protected:
   /// ELF and MachO only.
   const MCSection *TLSBSSSection;         // Defaults to ".tbss".
 
+  /// StackMap section.
+  const MCSection *StackMapSection;
 
   /// EHFrameSection - EH frame section. It is initialized on demand so it
   /// can be overwritten (with uniquing).
@@ -223,6 +229,12 @@ public:
   const MCSection *getDwarfFrameSection() const { return DwarfFrameSection; }
   const MCSection *getDwarfPubNamesSection() const{return DwarfPubNamesSection;}
   const MCSection *getDwarfPubTypesSection() const{return DwarfPubTypesSection;}
+  const MCSection *getDwarfGnuPubNamesSection() const {
+    return DwarfGnuPubNamesSection;
+  }
+  const MCSection *getDwarfGnuPubTypesSection() const {
+    return DwarfGnuPubTypesSection;
+  }
   const MCSection *getDwarfDebugInlineSection() const {
     return DwarfDebugInlineSection;
   }
@@ -275,6 +287,8 @@ public:
   const MCSection *getTLSDataSection() const { return TLSDataSection; }
   const MCSection *getTLSBSSSection() const { return TLSBSSSection; }
 
+  const MCSection *getStackMapSection() const { return StackMapSection; }
+
   /// ELF specific sections.
   ///
   const MCSection *getDataRelSection() const { return DataRelSection; }
diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h
index 0affeee..5667544 100644
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h
@@ -40,10 +40,11 @@ class MCObjectStreamer : public MCStreamer {
   virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame);
 
 protected:
-  MCObjectStreamer(StreamerKind Kind, MCContext &Context, MCAsmBackend &TAB,
-                   raw_ostream &_OS, MCCodeEmitter *_Emitter);
-  MCObjectStreamer(StreamerKind Kind, MCContext &Context, MCAsmBackend &TAB,
-                   raw_ostream &_OS, MCCodeEmitter *_Emitter,
+  MCObjectStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
+                   MCAsmBackend &TAB, raw_ostream &_OS,
+                   MCCodeEmitter *_Emitter);
+  MCObjectStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
+                   MCAsmBackend &TAB, raw_ostream &_OS, MCCodeEmitter *_Emitter,
                    MCAssembler *_Assembler);
   ~MCObjectStreamer();
 
@@ -116,12 +117,6 @@ public:
   virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue);
   virtual void EmitZeros(uint64_t NumBytes);
   virtual void FinishImpl();
-
-  /// @}
-
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() >= SK_ELFStreamer && S->getKind() <= SK_WinCOFFStreamer;
-  }
 };
 
 } // end namespace llvm
diff --git a/include/llvm/MC/MCObjectSymbolizer.h b/include/llvm/MC/MCObjectSymbolizer.h
index 555cf51..64b932e 100644
--- a/include/llvm/MC/MCObjectSymbolizer.h
+++ b/include/llvm/MC/MCObjectSymbolizer.h
@@ -32,22 +32,14 @@ class MCObjectSymbolizer : public MCSymbolizer {
 protected:
   const object::ObjectFile *Obj;
 
-  typedef DenseMap<uint64_t, object::RelocationRef> AddrToRelocMap;
-  typedef std::vector<object::SectionRef> SortedSectionList;
-  SortedSectionList SortedSections;
-
   // Map a load address to the first relocation that applies there. As far as I
   // know, if there are several relocations at the exact same address, they are
   // related and the others can be determined from the first that was found in
   // the relocation table. For instance, on x86-64 mach-o, a SUBTRACTOR
   // relocation (referencing the minuend symbol) is followed by an UNSIGNED
   // relocation (referencing the subtrahend symbol).
-  AddrToRelocMap AddrToReloc;
-
-  // Helpers around SortedSections.
-  SortedSectionList::const_iterator findSectionContaining(uint64_t Addr) const;
-  void insertSection(object::SectionRef Section);
-
+  const object::RelocationRef *findRelocationAt(uint64_t Addr);
+  const object::SectionRef *findSectionContaining(uint64_t Addr);
 
   MCObjectSymbolizer(MCContext &Ctx, OwningPtr<MCRelocationInfo> &RelInfo,
                      const object::ObjectFile *Obj);
@@ -56,18 +48,32 @@ public:
   /// \name Overridden MCSymbolizer methods:
   /// @{
   bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &cStream,
-                                int64_t Value,
-                                uint64_t Address, bool IsBranch,
-                                uint64_t Offset, uint64_t InstSize);
+                                int64_t Value, uint64_t Address,
+                                bool IsBranch, uint64_t Offset,
+                                uint64_t InstSize);
 
   void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
                                        int64_t Value, uint64_t Address);
   /// @}
 
+  /// \brief Look for an external function symbol at \p Addr.
+  /// (References through the ELF PLT, Mach-O stubs, and similar).
+  /// \returns An MCExpr representing the external symbol, or 0 if not found.
+  virtual StringRef findExternalFunctionAt(uint64_t Addr);
+
   /// \brief Create an object symbolizer for \p Obj.
   static MCObjectSymbolizer *
     createObjectSymbolizer(MCContext &Ctx, OwningPtr<MCRelocationInfo> &RelInfo,
                            const object::ObjectFile *Obj);
+
+private:
+  typedef DenseMap<uint64_t, object::RelocationRef> AddrToRelocMap;
+  typedef std::vector<object::SectionRef> SortedSectionList;
+  SortedSectionList SortedSections;
+  AddrToRelocMap AddrToReloc;
+
+  void buildSectionList();
+  void buildRelocationByAddrMap();
 };
 
 }
diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h
index 0dab314..1b3ab57 100644
--- a/include/llvm/MC/MCParser/AsmLexer.h
+++ b/include/llvm/MC/MCParser/AsmLexer.h
@@ -63,6 +63,7 @@ private:
   AsmToken LexSingleQuote();
   AsmToken LexQuote();
   AsmToken LexFloatLiteral();
+  AsmToken LexHexFloatLiteral(bool NoIntDigits);
 };
 
 } // end namespace llvm
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 673cdf6..6881e1d 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -124,7 +124,7 @@ struct MCSchedClassDesc {
 /// microarchitecture to the scheduler in the form of properties. It also
 /// optionally refers to scheduler resource tables and itinerary
 /// tables. Scheduler resource tables model the latency and cost for each
-/// instruction type. Itinerary tables are an independant mechanism that
+/// instruction type. Itinerary tables are an independent mechanism that
 /// provides a detailed reservation table describing each cycle of instruction
 /// execution. Subtargets may define any or all of the above categories of data
 /// depending on the type of CPU and selected scheduler.
@@ -174,6 +174,8 @@ public:
   unsigned MispredictPenalty;
   static const unsigned DefaultMispredictPenalty = 10;
 
+  bool CompleteModel;
+
 private:
   unsigned ProcID;
   const MCProcResourceDesc *ProcResourceTable;
@@ -194,6 +196,7 @@ public:
                   LoadLatency(DefaultLoadLatency),
                   HighLatency(DefaultHighLatency),
                   MispredictPenalty(DefaultMispredictPenalty),
+                  CompleteModel(true),
                   ProcID(0), ProcResourceTable(0), SchedClassTable(0),
                   NumProcResourceKinds(0), NumSchedClasses(0),
                   InstrItineraries(0) {
@@ -203,19 +206,23 @@ public:
 
   // Table-gen driven ctor.
   MCSchedModel(unsigned iw, int mbs, unsigned ll, unsigned hl,
-               unsigned mp, unsigned pi, const MCProcResourceDesc *pr,
+               unsigned mp, bool cm, unsigned pi, const MCProcResourceDesc *pr,
                const MCSchedClassDesc *sc, unsigned npr, unsigned nsc,
                const InstrItinerary *ii):
     IssueWidth(iw), MicroOpBufferSize(mbs), LoadLatency(ll), HighLatency(hl),
-    MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr),
-    SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc),
-    InstrItineraries(ii) {}
+    MispredictPenalty(mp), CompleteModel(cm), ProcID(pi),
+    ProcResourceTable(pr), SchedClassTable(sc), NumProcResourceKinds(npr),
+    NumSchedClasses(nsc), InstrItineraries(ii) {}
 
   unsigned getProcessorID() const { return ProcID; }
 
   /// Does this machine model include instruction-level scheduling.
   bool hasInstrSchedModel() const { return SchedClassTable; }
 
+  /// Return true if this machine model data for all instructions with a
+  /// scheduling class (itinerary class or SchedRW list).
+  bool isComplete() const { return CompleteModel; }
+
   unsigned getNumProcResourceKinds() const {
     return NumProcResourceKinds;
   }
diff --git a/include/llvm/MC/MCSectionCOFF.h b/include/llvm/MC/MCSectionCOFF.h
index 754e829..45c84ae 100644
--- a/include/llvm/MC/MCSectionCOFF.h
+++ b/include/llvm/MC/MCSectionCOFF.h
@@ -19,6 +19,7 @@
 #include "llvm/Support/COFF.h"
 
 namespace llvm {
+class MCSymbol;
 
 /// MCSectionCOFF - This represents a section on Windows
   class MCSectionCOFF : public MCSection {
@@ -32,6 +33,11 @@ namespace llvm {
     /// drawn from the enums below.
     mutable unsigned Characteristics;
 
+    /// The COMDAT symbol of this section. Only valid if this is a COMDAT
+    /// section. Two COMDAT sections are merged if they have the same
+    /// COMDAT symbol.
+    const MCSymbol *COMDATSymbol;
+
     /// Selection - This is the Selection field for the section symbol, if
     /// it is a COMDAT section (Characteristics & IMAGE_SCN_LNK_COMDAT) != 0
     mutable int Selection;
@@ -44,9 +50,11 @@ namespace llvm {
   private:
     friend class MCContext;
     MCSectionCOFF(StringRef Section, unsigned Characteristics,
-                  int Selection, const MCSectionCOFF *Assoc, SectionKind K)
-      : MCSection(SV_COFF, K), SectionName(Section),
-        Characteristics(Characteristics), Selection(Selection), Assoc(Assoc) {
+                  const MCSymbol *COMDATSymbol, int Selection,
+                  const MCSectionCOFF *Assoc, SectionKind K)
+        : MCSection(SV_COFF, K), SectionName(Section),
+          Characteristics(Characteristics), COMDATSymbol(COMDATSymbol),
+          Selection(Selection), Assoc(Assoc) {
       assert ((Characteristics & 0x00F00000) == 0 &&
         "alignment must not be set upon section creation");
       assert ((Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) ==
diff --git a/include/llvm/MC/MCSectionMachO.h b/include/llvm/MC/MCSectionMachO.h
index b68bd85..d7e88ea 100644
--- a/include/llvm/MC/MCSectionMachO.h
+++ b/include/llvm/MC/MCSectionMachO.h
@@ -41,7 +41,7 @@ public:
 
   /// These are the section type and attributes fields.  A MachO section can
   /// have only one Type, but can have any of the attributes specified.
-  enum {
+  enum LLVM_ENUM_INT_TYPE(uint32_t) {
     // TypeAndAttributes bitmasks.
     SECTION_TYPE       = 0x000000FFU,
     SECTION_ATTRIBUTES = 0xFFFFFF00U,
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 970c4ed..0b1fe6e 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -24,675 +24,716 @@
 #include <string>
 
 namespace llvm {
-  class MCAsmBackend;
-  class MCCodeEmitter;
-  class MCContext;
-  class MCExpr;
-  class MCInst;
-  class MCInstPrinter;
-  class MCSection;
-  class MCSymbol;
-  class StringRef;
-  class Twine;
-  class raw_ostream;
-  class formatted_raw_ostream;
-
-  typedef std::pair<const MCSection *, const MCExpr *> MCSectionSubPair;
-
-  /// MCStreamer - Streaming machine code generation interface.  This interface
-  /// is intended to provide a programatic interface that is very similar to the
-  /// level that an assembler .s file provides.  It has callbacks to emit bytes,
-  /// handle directives, etc.  The implementation of this interface retains
-  /// state to know what the current section is etc.
-  ///
-  /// There are multiple implementations of this interface: one for writing out
-  /// a .s file, and implementations that write out .o files of various formats.
-  ///
-  class MCStreamer {
-  public:
-    enum StreamerKind {
-      SK_AsmStreamer,
-      SK_NullStreamer,
-      SK_RecordStreamer,
-
-      // MCObjectStreamer subclasses.
-      SK_ELFStreamer,
-      SK_ARMELFStreamer,
-      SK_MachOStreamer,
-      SK_PureStreamer,
-      SK_MipsELFStreamer,
-      SK_WinCOFFStreamer
-    };
-
-  private:
-    const StreamerKind Kind;
-    MCContext &Context;
-
-    MCStreamer(const MCStreamer&) LLVM_DELETED_FUNCTION;
-    MCStreamer &operator=(const MCStreamer&) LLVM_DELETED_FUNCTION;
-
-    bool EmitEHFrame;
-    bool EmitDebugFrame;
-
-    std::vector<MCDwarfFrameInfo> FrameInfos;
-    MCDwarfFrameInfo *getCurrentFrameInfo();
-    MCSymbol *EmitCFICommon();
-    void EnsureValidFrame();
-
-    std::vector<MCWin64EHUnwindInfo *> W64UnwindInfos;
-    MCWin64EHUnwindInfo *CurrentW64UnwindInfo;
-    void setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame);
-    void EnsureValidW64UnwindInfo();
-
-    MCSymbol* LastSymbol;
-
-    /// SectionStack - This is stack of current and previous section
-    /// values saved by PushSection.
-    SmallVector<std::pair<MCSectionSubPair, MCSectionSubPair>, 4> SectionStack;
-
-    bool AutoInitSections;
-
-  protected:
-    MCStreamer(StreamerKind Kind, MCContext &Ctx);
-
-    const MCExpr *BuildSymbolDiff(MCContext &Context, const MCSymbol *A,
-                                  const MCSymbol *B);
-
-    const MCExpr *ForceExpAbs(const MCExpr* Expr);
-
-    void RecordProcStart(MCDwarfFrameInfo &Frame);
-    virtual void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame);
-    void RecordProcEnd(MCDwarfFrameInfo &Frame);
-    virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &CurFrame);
-    void EmitFrames(bool usingCFI);
-
-    MCWin64EHUnwindInfo *getCurrentW64UnwindInfo(){return CurrentW64UnwindInfo;}
-    void EmitW64Tables();
-
-  public:
-    virtual ~MCStreamer();
-
-    StreamerKind getKind() const { return Kind; }
-
-    /// State management
-    ///
-    virtual void reset();
-
-    MCContext &getContext() const { return Context; }
-
-    unsigned getNumFrameInfos() {
-      return FrameInfos.size();
-    }
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCExpr;
+class MCInst;
+class MCInstPrinter;
+class MCSection;
+class MCStreamer;
+class MCSymbol;
+class StringRef;
+class Twine;
+class raw_ostream;
+class formatted_raw_ostream;
+
+typedef std::pair<const MCSection *, const MCExpr *> MCSectionSubPair;
+
+/// Target specific streamer interface. This is used so that targets can
+/// implement support for target specific assembly directives.
+///
+/// If target foo wants to use this, it should implement 3 classes:
+/// * FooTargetStreamer : public MCTargetStreamer
+/// * FooTargetAsmSreamer : public FooTargetStreamer
+/// * FooTargetELFStreamer : public FooTargetStreamer
+///
+/// FooTargetStreamer should have a pure virtual method for each directive. For
+/// example, for a ".bar symbol_name" directive, it should have
+/// virtual emitBar(const MCSymbol &Symbol) = 0;
+///
+/// The FooTargetAsmSreamer and FooTargetELFStreamer classes implement the
+/// method. The assembly streamer just prints ".bar symbol_name". The object
+/// streamer does whatever is needed to implement .bar in the object file.
+///
+/// In the assembly printer and parser the target streamer can be used by
+/// calling getTargetStreamer and casting it to FooTargetStreamer:
+///
+/// MCTargetStreamer &TS = OutStreamer.getTargetStreamer();
+/// FooTargetStreamer &ATS = static_cast<FooTargetStreamer &>(TS);
+///
+/// The base classes FooTargetAsmSreamer and FooTargetELFStreamer should *never*
+/// be treated differently. Callers should always talk to a FooTargetStreamer.
+class MCTargetStreamer {
+protected:
+  MCStreamer *Streamer;
+
+public:
+  virtual ~MCTargetStreamer();
+  void setStreamer(MCStreamer *S) { Streamer = S; }
+};
+
+// FIXME: declared here because it is used from
+// lib/CodeGen/AsmPrinter/ARMException.cpp.
+class ARMTargetStreamer : public MCTargetStreamer {
+  virtual void anchor();
+public:
+  virtual void emitFnStart() = 0;
+  virtual void emitFnEnd() = 0;
+  virtual void emitCantUnwind() = 0;
+  virtual void emitPersonality(const MCSymbol *Personality) = 0;
+  virtual void emitHandlerData() = 0;
+  virtual void emitSetFP(unsigned FpReg, unsigned SpReg,
+                         int64_t Offset = 0) = 0;
+  virtual void emitPad(int64_t Offset) = 0;
+  virtual void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                           bool isVector) = 0;
+
+  virtual void switchVendor(StringRef Vendor) = 0;
+  virtual void emitAttribute(unsigned Attribute, unsigned Value) = 0;
+  virtual void emitTextAttribute(unsigned Attribute, StringRef String) = 0;
+  virtual void emitFPU(unsigned FPU) = 0;
+  virtual void finishAttributeSection() = 0;
+};
+
+/// MCStreamer - Streaming machine code generation interface.  This interface
+/// is intended to provide a programatic interface that is very similar to the
+/// level that an assembler .s file provides.  It has callbacks to emit bytes,
+/// handle directives, etc.  The implementation of this interface retains
+/// state to know what the current section is etc.
+///
+/// There are multiple implementations of this interface: one for writing out
+/// a .s file, and implementations that write out .o files of various formats.
+///
+class MCStreamer {
+  MCContext &Context;
+  OwningPtr<MCTargetStreamer> TargetStreamer;
+
+  MCStreamer(const MCStreamer &) LLVM_DELETED_FUNCTION;
+  MCStreamer &operator=(const MCStreamer &) LLVM_DELETED_FUNCTION;
+
+  bool EmitEHFrame;
+  bool EmitDebugFrame;
+
+  std::vector<MCDwarfFrameInfo> FrameInfos;
+  MCDwarfFrameInfo *getCurrentFrameInfo();
+  MCSymbol *EmitCFICommon();
+  void EnsureValidFrame();
+
+  std::vector<MCWin64EHUnwindInfo *> W64UnwindInfos;
+  MCWin64EHUnwindInfo *CurrentW64UnwindInfo;
+  void setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame);
+  void EnsureValidW64UnwindInfo();
+
+  MCSymbol *LastSymbol;
+
+  // SymbolOrdering - Tracks an index to represent the order
+  // a symbol was emitted in. Zero means we did not emit that symbol.
+  DenseMap<const MCSymbol *, unsigned> SymbolOrdering;
+
+  /// SectionStack - This is stack of current and previous section
+  /// values saved by PushSection.
+  SmallVector<std::pair<MCSectionSubPair, MCSectionSubPair>, 4> SectionStack;
+
+  bool AutoInitSections;
+
+protected:
+  MCStreamer(MCContext &Ctx, MCTargetStreamer *TargetStreamer);
+
+  const MCExpr *BuildSymbolDiff(MCContext &Context, const MCSymbol *A,
+                                const MCSymbol *B);
+
+  const MCExpr *ForceExpAbs(const MCExpr *Expr);
+
+  void RecordProcStart(MCDwarfFrameInfo &Frame);
+  virtual void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame);
+  void RecordProcEnd(MCDwarfFrameInfo &Frame);
+  virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &CurFrame);
+  void EmitFrames(MCAsmBackend *MAB, bool usingCFI);
+
+  MCWin64EHUnwindInfo *getCurrentW64UnwindInfo() {
+    return CurrentW64UnwindInfo;
+  }
+  void EmitW64Tables();
+
+  virtual void EmitRawTextImpl(StringRef String);
+
+public:
+  virtual ~MCStreamer();
+
+  /// State management
+  ///
+  virtual void reset();
 
-    const MCDwarfFrameInfo &getFrameInfo(unsigned i) {
-      return FrameInfos[i];
-    }
+  MCContext &getContext() const { return Context; }
 
-    ArrayRef<MCDwarfFrameInfo> getFrameInfos() {
-      return FrameInfos;
-    }
+  MCTargetStreamer &getTargetStreamer() {
+    assert(TargetStreamer);
+    return *TargetStreamer;
+  }
 
-    unsigned getNumW64UnwindInfos() {
-      return W64UnwindInfos.size();
-    }
+  unsigned getNumFrameInfos() { return FrameInfos.size(); }
 
-    MCWin64EHUnwindInfo &getW64UnwindInfo(unsigned i) {
-      return *W64UnwindInfos[i];
-    }
+  const MCDwarfFrameInfo &getFrameInfo(unsigned i) { return FrameInfos[i]; }
 
-    /// @name Assembly File Formatting.
-    /// @{
-
-    /// isVerboseAsm - Return true if this streamer supports verbose assembly
-    /// and if it is enabled.
-    virtual bool isVerboseAsm() const { return false; }
-
-    /// hasRawTextSupport - Return true if this asm streamer supports emitting
-    /// unformatted text to the .s file with EmitRawText.
-    virtual bool hasRawTextSupport() const { return false; }
-
-    /// AddComment - Add a comment that can be emitted to the generated .s
-    /// file if applicable as a QoI issue to make the output of the compiler
-    /// more readable.  This only affects the MCAsmStreamer, and only when
-    /// verbose assembly output is enabled.
-    ///
-    /// If the comment includes embedded \n's, they will each get the comment
-    /// prefix as appropriate.  The added comment should not end with a \n.
-    virtual void AddComment(const Twine &T) {}
-
-    /// GetCommentOS - Return a raw_ostream that comments can be written to.
-    /// Unlike AddComment, you are required to terminate comments with \n if you
-    /// use this method.
-    virtual raw_ostream &GetCommentOS();
-
-    /// AddBlankLine - Emit a blank line to a .s file to pretty it up.
-    virtual void AddBlankLine() {}
-
-    /// @}
-
-    /// @name Symbol & Section Management
-    /// @{
-
-    /// getCurrentSection - Return the current section that the streamer is
-    /// emitting code to.
-    MCSectionSubPair getCurrentSection() const {
-      if (!SectionStack.empty())
-        return SectionStack.back().first;
-      return MCSectionSubPair();
-    }
+  ArrayRef<MCDwarfFrameInfo> getFrameInfos() const { return FrameInfos; }
 
-    /// getPreviousSection - Return the previous section that the streamer is
-    /// emitting code to.
-    MCSectionSubPair getPreviousSection() const {
-      if (!SectionStack.empty())
-        return SectionStack.back().second;
-      return MCSectionSubPair();
-    }
+  unsigned getNumW64UnwindInfos() { return W64UnwindInfos.size(); }
 
-    /// ChangeSection - Update streamer for a new active section.
-    ///
-    /// This is called by PopSection and SwitchSection, if the current
-    /// section changes.
-    virtual void ChangeSection(const MCSection *, const MCExpr *) = 0;
-
-    /// pushSection - Save the current and previous section on the
-    /// section stack.
-    void PushSection() {
-      SectionStack.push_back(std::make_pair(getCurrentSection(),
-                                            getPreviousSection()));
-    }
+  MCWin64EHUnwindInfo &getW64UnwindInfo(unsigned i) {
+    return *W64UnwindInfos[i];
+  }
 
-    /// popSection - Restore the current and previous section from
-    /// the section stack.  Calls ChangeSection as needed.
-    ///
-    /// Returns false if the stack was empty.
-    bool PopSection() {
-      if (SectionStack.size() <= 1)
-        return false;
-      MCSectionSubPair oldSection = SectionStack.pop_back_val().first;
-      MCSectionSubPair curSection = SectionStack.back().first;
-
-      if (oldSection != curSection)
-        ChangeSection(curSection.first, curSection.second);
-      return true;
-    }
+  void generateCompactUnwindEncodings(MCAsmBackend *MAB);
 
-    bool SubSection(const MCExpr *Subsection) {
-      if (SectionStack.empty())
-        return false;
+  /// @name Assembly File Formatting.
+  /// @{
 
-      SwitchSection(SectionStack.back().first.first, Subsection);
-      return true;
-    }
+  /// isVerboseAsm - Return true if this streamer supports verbose assembly
+  /// and if it is enabled.
+  virtual bool isVerboseAsm() const { return false; }
 
-    /// SwitchSection - Set the current section where code is being emitted to
-    /// @p Section.  This is required to update CurSection.
-    ///
-    /// This corresponds to assembler directives like .section, .text, etc.
-    void SwitchSection(const MCSection *Section, const MCExpr *Subsection = 0) {
-      assert(Section && "Cannot switch to a null section!");
-      MCSectionSubPair curSection = SectionStack.back().first;
-      SectionStack.back().second = curSection;
-      if (MCSectionSubPair(Section, Subsection) != curSection) {
-        SectionStack.back().first = MCSectionSubPair(Section, Subsection);
-        ChangeSection(Section, Subsection);
-      }
-    }
+  /// hasRawTextSupport - Return true if this asm streamer supports emitting
+  /// unformatted text to the .s file with EmitRawText.
+  virtual bool hasRawTextSupport() const { return false; }
 
-    /// SwitchSectionNoChange - Set the current section where code is being
-    /// emitted to @p Section.  This is required to update CurSection. This
-    /// version does not call ChangeSection.
-    void SwitchSectionNoChange(const MCSection *Section,
-                               const MCExpr *Subsection = 0) {
-      assert(Section && "Cannot switch to a null section!");
-      MCSectionSubPair curSection = SectionStack.back().first;
-      SectionStack.back().second = curSection;
-      if (MCSectionSubPair(Section, Subsection) != curSection)
-        SectionStack.back().first = MCSectionSubPair(Section, Subsection);
+  /// AddComment - Add a comment that can be emitted to the generated .s
+  /// file if applicable as a QoI issue to make the output of the compiler
+  /// more readable.  This only affects the MCAsmStreamer, and only when
+  /// verbose assembly output is enabled.
+  ///
+  /// If the comment includes embedded \n's, they will each get the comment
+  /// prefix as appropriate.  The added comment should not end with a \n.
+  virtual void AddComment(const Twine &T) {}
+
+  /// GetCommentOS - Return a raw_ostream that comments can be written to.
+  /// Unlike AddComment, you are required to terminate comments with \n if you
+  /// use this method.
+  virtual raw_ostream &GetCommentOS();
+
+  /// AddBlankLine - Emit a blank line to a .s file to pretty it up.
+  virtual void AddBlankLine() {}
+
+  /// @}
+
+  /// @name Symbol & Section Management
+  /// @{
+
+  /// getCurrentSection - Return the current section that the streamer is
+  /// emitting code to.
+  MCSectionSubPair getCurrentSection() const {
+    if (!SectionStack.empty())
+      return SectionStack.back().first;
+    return MCSectionSubPair();
+  }
+
+  /// getPreviousSection - Return the previous section that the streamer is
+  /// emitting code to.
+  MCSectionSubPair getPreviousSection() const {
+    if (!SectionStack.empty())
+      return SectionStack.back().second;
+    return MCSectionSubPair();
+  }
+
+  /// GetSymbolOrder - Returns an index to represent the order
+  /// a symbol was emitted in. (zero if we did not emit that symbol)
+  unsigned GetSymbolOrder(const MCSymbol *Sym) const {
+    return SymbolOrdering.lookup(Sym);
+  }
+
+  /// ChangeSection - Update streamer for a new active section.
+  ///
+  /// This is called by PopSection and SwitchSection, if the current
+  /// section changes.
+  virtual void ChangeSection(const MCSection *, const MCExpr *) = 0;
+
+  /// pushSection - Save the current and previous section on the
+  /// section stack.
+  void PushSection() {
+    SectionStack.push_back(
+        std::make_pair(getCurrentSection(), getPreviousSection()));
+  }
+
+  /// popSection - Restore the current and previous section from
+  /// the section stack.  Calls ChangeSection as needed.
+  ///
+  /// Returns false if the stack was empty.
+  bool PopSection() {
+    if (SectionStack.size() <= 1)
+      return false;
+    MCSectionSubPair oldSection = SectionStack.pop_back_val().first;
+    MCSectionSubPair curSection = SectionStack.back().first;
+
+    if (oldSection != curSection)
+      ChangeSection(curSection.first, curSection.second);
+    return true;
+  }
+
+  bool SubSection(const MCExpr *Subsection) {
+    if (SectionStack.empty())
+      return false;
+
+    SwitchSection(SectionStack.back().first.first, Subsection);
+    return true;
+  }
+
+  /// SwitchSection - Set the current section where code is being emitted to
+  /// @p Section.  This is required to update CurSection.
+  ///
+  /// This corresponds to assembler directives like .section, .text, etc.
+  void SwitchSection(const MCSection *Section, const MCExpr *Subsection = 0) {
+    assert(Section && "Cannot switch to a null section!");
+    MCSectionSubPair curSection = SectionStack.back().first;
+    SectionStack.back().second = curSection;
+    if (MCSectionSubPair(Section, Subsection) != curSection) {
+      SectionStack.back().first = MCSectionSubPair(Section, Subsection);
+      ChangeSection(Section, Subsection);
     }
+  }
+
+  /// SwitchSectionNoChange - Set the current section where code is being
+  /// emitted to @p Section.  This is required to update CurSection. This
+  /// version does not call ChangeSection.
+  void SwitchSectionNoChange(const MCSection *Section,
+                             const MCExpr *Subsection = 0) {
+    assert(Section && "Cannot switch to a null section!");
+    MCSectionSubPair curSection = SectionStack.back().first;
+    SectionStack.back().second = curSection;
+    if (MCSectionSubPair(Section, Subsection) != curSection)
+      SectionStack.back().first = MCSectionSubPair(Section, Subsection);
+  }
+
+  /// Initialize the streamer.
+  void InitStreamer() {
+    if (AutoInitSections)
+      InitSections();
+  }
+
+  /// Tell this MCStreamer to call InitSections upon initialization.
+  void setAutoInitSections(bool AutoInitSections) {
+    this->AutoInitSections = AutoInitSections;
+  }
+
+  /// InitSections - Create the default sections and set the initial one.
+  virtual void InitSections() = 0;
+
+  /// InitToTextSection - Create a text section and switch the streamer to it.
+  virtual void InitToTextSection() = 0;
+
+  /// AssignSection - Sets the symbol's section.
+  ///
+  /// Each emitted symbol will be tracked in the ordering table,
+  /// so we can sort on them later.
+  void AssignSection(MCSymbol *Symbol, const MCSection *Section);
 
-    /// Initialize the streamer.
-    void InitStreamer() {
-      if (AutoInitSections)
-        InitSections();
-    }
+  /// EmitLabel - Emit a label for @p Symbol into the current section.
+  ///
+  /// This corresponds to an assembler statement such as:
+  ///   foo:
+  ///
+  /// @param Symbol - The symbol to emit. A given symbol should only be
+  /// emitted as a label once, and symbols emitted as a label should never be
+  /// used in an assignment.
+  virtual void EmitLabel(MCSymbol *Symbol);
 
-    /// Tell this MCStreamer to call InitSections upon initialization.
-    void setAutoInitSections(bool AutoInitSections) {
-      this->AutoInitSections = AutoInitSections;
-    }
+  virtual void EmitDebugLabel(MCSymbol *Symbol);
 
-    /// InitSections - Create the default sections and set the initial one.
-    virtual void InitSections() = 0;
-
-    /// InitToTextSection - Create a text section and switch the streamer to it.
-    virtual void InitToTextSection() = 0;
-
-    /// EmitLabel - Emit a label for @p Symbol into the current section.
-    ///
-    /// This corresponds to an assembler statement such as:
-    ///   foo:
-    ///
-    /// @param Symbol - The symbol to emit. A given symbol should only be
-    /// emitted as a label once, and symbols emitted as a label should never be
-    /// used in an assignment.
-    virtual void EmitLabel(MCSymbol *Symbol);
-
-    virtual void EmitDebugLabel(MCSymbol *Symbol);
-
-    virtual void EmitEHSymAttributes(const MCSymbol *Symbol,
-                                     MCSymbol *EHSymbol);
-
-    /// EmitAssemblerFlag - Note in the output the specified @p Flag.
-    virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) = 0;
-
-    /// EmitLinkerOptions - Emit the given list @p Options of strings as linker
-    /// options into the output.
-    virtual void EmitLinkerOptions(ArrayRef<std::string> Kind) {}
-
-    /// EmitDataRegion - Note in the output the specified region @p Kind.
-    virtual void EmitDataRegion(MCDataRegionType Kind) {}
-
-    /// EmitThumbFunc - Note in the output that the specified @p Func is
-    /// a Thumb mode function (ARM target only).
-    virtual void EmitThumbFunc(MCSymbol *Func) = 0;
-
-    /// getOrCreateSymbolData - Get symbol data for given symbol.
-    virtual MCSymbolData &getOrCreateSymbolData(MCSymbol *Symbol);
-
-    /// EmitAssignment - Emit an assignment of @p Value to @p Symbol.
-    ///
-    /// This corresponds to an assembler statement such as:
-    ///  symbol = value
-    ///
-    /// The assignment generates no code, but has the side effect of binding the
-    /// value in the current context. For the assembly streamer, this prints the
-    /// binding into the .s file.
-    ///
-    /// @param Symbol - The symbol being assigned to.
-    /// @param Value - The value for the symbol.
-    virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) = 0;
-
-    /// EmitWeakReference - Emit an weak reference from @p Alias to @p Symbol.
-    ///
-    /// This corresponds to an assembler statement such as:
-    ///  .weakref alias, symbol
-    ///
-    /// @param Alias - The alias that is being created.
-    /// @param Symbol - The symbol being aliased.
-    virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) = 0;
-
-    /// EmitSymbolAttribute - Add the given @p Attribute to @p Symbol.
-    virtual void EmitSymbolAttribute(MCSymbol *Symbol,
-                                     MCSymbolAttr Attribute) = 0;
-
-    /// EmitSymbolDesc - Set the @p DescValue for the @p Symbol.
-    ///
-    /// @param Symbol - The symbol to have its n_desc field set.
-    /// @param DescValue - The value to set into the n_desc field.
-    virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) = 0;
-
-    /// BeginCOFFSymbolDef - Start emitting COFF symbol definition
-    ///
-    /// @param Symbol - The symbol to have its External & Type fields set.
-    virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) = 0;
-
-    /// EmitCOFFSymbolStorageClass - Emit the storage class of the symbol.
-    ///
-    /// @param StorageClass - The storage class the symbol should have.
-    virtual void EmitCOFFSymbolStorageClass(int StorageClass) = 0;
-
-    /// EmitCOFFSymbolType - Emit the type of the symbol.
-    ///
-    /// @param Type - A COFF type identifier (see COFF::SymbolType in X86COFF.h)
-    virtual void EmitCOFFSymbolType(int Type) = 0;
-
-    /// EndCOFFSymbolDef - Marks the end of the symbol definition.
-    virtual void EndCOFFSymbolDef() = 0;
-
-    /// EmitCOFFSecRel32 - Emits a COFF section relative relocation.
-    ///
-    /// @param Symbol - Symbol the section relative realocation should point to.
-    virtual void EmitCOFFSecRel32(MCSymbol const *Symbol);
-
-    /// EmitELFSize - Emit an ELF .size directive.
-    ///
-    /// This corresponds to an assembler statement such as:
-    ///  .size symbol, expression
-    ///
-    virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) = 0;
-
-    /// EmitCommonSymbol - Emit a common symbol.
-    ///
-    /// @param Symbol - The common symbol to emit.
-    /// @param Size - The size of the common symbol.
-    /// @param ByteAlignment - The alignment of the symbol if
-    /// non-zero. This must be a power of 2.
-    virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
-                                  unsigned ByteAlignment) = 0;
-
-    /// EmitLocalCommonSymbol - Emit a local common (.lcomm) symbol.
-    ///
-    /// @param Symbol - The common symbol to emit.
-    /// @param Size - The size of the common symbol.
-    /// @param ByteAlignment - The alignment of the common symbol in bytes.
-    virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
-                                       unsigned ByteAlignment) = 0;
-
-    /// EmitZerofill - Emit the zerofill section and an optional symbol.
-    ///
-    /// @param Section - The zerofill section to create and or to put the symbol
-    /// @param Symbol - The zerofill symbol to emit, if non-NULL.
-    /// @param Size - The size of the zerofill symbol.
-    /// @param ByteAlignment - The alignment of the zerofill symbol if
-    /// non-zero. This must be a power of 2 on some targets.
-    virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
-                              uint64_t Size = 0,unsigned ByteAlignment = 0) = 0;
-
-    /// EmitTBSSSymbol - Emit a thread local bss (.tbss) symbol.
-    ///
-    /// @param Section - The thread local common section.
-    /// @param Symbol - The thread local common symbol to emit.
-    /// @param Size - The size of the symbol.
-    /// @param ByteAlignment - The alignment of the thread local common symbol
-    /// if non-zero.  This must be a power of 2 on some targets.
-    virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
-                                uint64_t Size, unsigned ByteAlignment = 0) = 0;
-
-    /// @}
-    /// @name Generating Data
-    /// @{
-
-    /// EmitBytes - Emit the bytes in \p Data into the output.
-    ///
-    /// This is used to implement assembler directives such as .byte, .ascii,
-    /// etc.
-    virtual void EmitBytes(StringRef Data) = 0;
-
-    /// EmitValue - Emit the expression @p Value into the output as a native
-    /// integer of the given @p Size bytes.
-    ///
-    /// This is used to implement assembler directives such as .word, .quad,
-    /// etc.
-    ///
-    /// @param Value - The value to emit.
-    /// @param Size - The size of the integer (in bytes) to emit. This must
-    /// match a native machine width.
-    virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) = 0;
-
-    void EmitValue(const MCExpr *Value, unsigned Size);
-
-    /// EmitIntValue - Special case of EmitValue that avoids the client having
-    /// to pass in a MCExpr for constant integers.
-    virtual void EmitIntValue(uint64_t Value, unsigned Size);
-
-    /// EmitAbsValue - Emit the Value, but try to avoid relocations. On MachO
-    /// this is done by producing
-    /// foo = value
-    /// .long foo
-    void EmitAbsValue(const MCExpr *Value, unsigned Size);
-
-    virtual void EmitULEB128Value(const MCExpr *Value) = 0;
-
-    virtual void EmitSLEB128Value(const MCExpr *Value) = 0;
-
-    /// EmitULEB128Value - Special case of EmitULEB128Value that avoids the
-    /// client having to pass in a MCExpr for constant integers.
-    void EmitULEB128IntValue(uint64_t Value, unsigned Padding = 0);
-
-    /// EmitSLEB128Value - Special case of EmitSLEB128Value that avoids the
-    /// client having to pass in a MCExpr for constant integers.
-    void EmitSLEB128IntValue(int64_t Value);
-
-    /// EmitSymbolValue - Special case of EmitValue that avoids the client
-    /// having to pass in a MCExpr for MCSymbols.
-    void EmitSymbolValue(const MCSymbol *Sym, unsigned Size);
-
-    /// EmitGPRel64Value - Emit the expression @p Value into the output as a
-    /// gprel64 (64-bit GP relative) value.
-    ///
-    /// This is used to implement assembler directives such as .gpdword on
-    /// targets that support them.
-    virtual void EmitGPRel64Value(const MCExpr *Value);
-
-    /// EmitGPRel32Value - Emit the expression @p Value into the output as a
-    /// gprel32 (32-bit GP relative) value.
-    ///
-    /// This is used to implement assembler directives such as .gprel32 on
-    /// targets that support them.
-    virtual void EmitGPRel32Value(const MCExpr *Value);
-
-    /// EmitFill - Emit NumBytes bytes worth of the value specified by
-    /// FillValue.  This implements directives such as '.space'.
-    virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue);
-
-    /// \brief Emit NumBytes worth of zeros.
-    /// This function properly handles data in virtual sections.
-    virtual void EmitZeros(uint64_t NumBytes);
-
-    /// EmitValueToAlignment - Emit some number of copies of @p Value until
-    /// the byte alignment @p ByteAlignment is reached.
-    ///
-    /// If the number of bytes need to emit for the alignment is not a multiple
-    /// of @p ValueSize, then the contents of the emitted fill bytes is
-    /// undefined.
-    ///
-    /// This used to implement the .align assembler directive.
-    ///
-    /// @param ByteAlignment - The alignment to reach. This must be a power of
-    /// two on some targets.
-    /// @param Value - The value to use when filling bytes.
-    /// @param ValueSize - The size of the integer (in bytes) to emit for
-    /// @p Value. This must match a native machine width.
-    /// @param MaxBytesToEmit - The maximum numbers of bytes to emit, or 0. If
-    /// the alignment cannot be reached in this many bytes, no bytes are
-    /// emitted.
-    virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
-                                      unsigned ValueSize = 1,
-                                      unsigned MaxBytesToEmit = 0) = 0;
-
-    /// EmitCodeAlignment - Emit nops until the byte alignment @p ByteAlignment
-    /// is reached.
-    ///
-    /// This used to align code where the alignment bytes may be executed.  This
-    /// can emit different bytes for different sizes to optimize execution.
-    ///
-    /// @param ByteAlignment - The alignment to reach. This must be a power of
-    /// two on some targets.
-    /// @param MaxBytesToEmit - The maximum numbers of bytes to emit, or 0. If
-    /// the alignment cannot be reached in this many bytes, no bytes are
-    /// emitted.
-    virtual void EmitCodeAlignment(unsigned ByteAlignment,
-                                   unsigned MaxBytesToEmit = 0) = 0;
-
-    /// EmitValueToOffset - Emit some number of copies of @p Value until the
-    /// byte offset @p Offset is reached.
-    ///
-    /// This is used to implement assembler directives such as .org.
-    ///
-    /// @param Offset - The offset to reach. This may be an expression, but the
-    /// expression must be associated with the current section.
-    /// @param Value - The value to use when filling bytes.
-    /// @return false on success, true if the offset was invalid.
-    virtual bool EmitValueToOffset(const MCExpr *Offset,
-                                   unsigned char Value = 0) = 0;
-
-    /// @}
-
-    /// EmitFileDirective - Switch to a new logical file.  This is used to
-    /// implement the '.file "foo.c"' assembler directive.
-    virtual void EmitFileDirective(StringRef Filename) = 0;
-
-    /// EmitDwarfFileDirective - Associate a filename with a specified logical
-    /// file number.  This implements the DWARF2 '.file 4 "foo.c"' assembler
-    /// directive.
-    virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
-                                        StringRef Filename, unsigned CUID = 0);
-
-    /// EmitDwarfLocDirective - This implements the DWARF2
-    // '.loc fileno lineno ...' assembler directive.
-    virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
-                                       unsigned Column, unsigned Flags,
-                                       unsigned Isa,
-                                       unsigned Discriminator,
-                                       StringRef FileName);
-
-    virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
-                                          const MCSymbol *LastLabel,
-                                          const MCSymbol *Label,
-                                          unsigned PointerSize) = 0;
-
-    virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
-                                           const MCSymbol *Label) {
-    }
+  virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol);
 
-    void EmitDwarfSetLineAddr(int64_t LineDelta, const MCSymbol *Label,
-                              int PointerSize);
-
-    virtual void EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding);
-    virtual void EmitCFISections(bool EH, bool Debug);
-    void EmitCFIStartProc();
-    void EmitCFIEndProc();
-    virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
-    virtual void EmitCFIDefCfaOffset(int64_t Offset);
-    virtual void EmitCFIDefCfaRegister(int64_t Register);
-    virtual void EmitCFIOffset(int64_t Register, int64_t Offset);
-    virtual void EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding);
-    virtual void EmitCFILsda(const MCSymbol *Sym, unsigned Encoding);
-    virtual void EmitCFIRememberState();
-    virtual void EmitCFIRestoreState();
-    virtual void EmitCFISameValue(int64_t Register);
-    virtual void EmitCFIRestore(int64_t Register);
-    virtual void EmitCFIRelOffset(int64_t Register, int64_t Offset);
-    virtual void EmitCFIAdjustCfaOffset(int64_t Adjustment);
-    virtual void EmitCFIEscape(StringRef Values);
-    virtual void EmitCFISignalFrame();
-    virtual void EmitCFIUndefined(int64_t Register);
-    virtual void EmitCFIRegister(int64_t Register1, int64_t Register2);
-
-    virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
-    virtual void EmitWin64EHEndProc();
-    virtual void EmitWin64EHStartChained();
-    virtual void EmitWin64EHEndChained();
-    virtual void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
-                                    bool Except);
-    virtual void EmitWin64EHHandlerData();
-    virtual void EmitWin64EHPushReg(unsigned Register);
-    virtual void EmitWin64EHSetFrame(unsigned Register, unsigned Offset);
-    virtual void EmitWin64EHAllocStack(unsigned Size);
-    virtual void EmitWin64EHSaveReg(unsigned Register, unsigned Offset);
-    virtual void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset);
-    virtual void EmitWin64EHPushFrame(bool Code);
-    virtual void EmitWin64EHEndProlog();
-
-    /// EmitInstruction - Emit the given @p Instruction into the current
-    /// section.
-    virtual void EmitInstruction(const MCInst &Inst) = 0;
-
-    /// \brief Set the bundle alignment mode from now on in the section.
-    /// The argument is the power of 2 to which the alignment is set. The
-    /// value 0 means turn the bundle alignment off.
-    virtual void EmitBundleAlignMode(unsigned AlignPow2) = 0;
-
-    /// \brief The following instructions are a bundle-locked group.
-    ///
-    /// \param AlignToEnd - If true, the bundle-locked group will be aligned to
-    ///                     the end of a bundle.
-    virtual void EmitBundleLock(bool AlignToEnd) = 0;
-
-    /// \brief Ends a bundle-locked group.
-    virtual void EmitBundleUnlock() = 0;
-
-    /// EmitRawText - If this file is backed by a assembly streamer, this dumps
-    /// the specified string in the output .s file.  This capability is
-    /// indicated by the hasRawTextSupport() predicate.  By default this aborts.
-    virtual void EmitRawText(StringRef String);
-    void EmitRawText(const Twine &String);
-
-    /// ARM-related methods.
-    /// FIXME: Eventually we should have some "target MC streamer" and move
-    /// these methods there.
-    virtual void EmitFnStart();
-    virtual void EmitFnEnd();
-    virtual void EmitCantUnwind();
-    virtual void EmitPersonality(const MCSymbol *Personality);
-    virtual void EmitHandlerData();
-    virtual void EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
-    virtual void EmitPad(int64_t Offset);
-    virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
-                             bool isVector);
-
-    /// PPC-related methods.
-    /// FIXME: Eventually replace it with some "target MC streamer" and move
-    /// these methods there.
-    virtual void EmitTCEntry(const MCSymbol &S);
-
-    /// FinishImpl - Streamer specific finalization.
-    virtual void FinishImpl() = 0;
-    /// Finish - Finish emission of machine code.
-    void Finish();
-  };
-
-  /// createNullStreamer - Create a dummy machine code streamer, which does
-  /// nothing. This is useful for timing the assembler front end.
-  MCStreamer *createNullStreamer(MCContext &Ctx);
-
-  /// createAsmStreamer - Create a machine code streamer which will print out
-  /// assembly for the native target, suitable for compiling with a native
-  /// assembler.
-  ///
-  /// \param InstPrint - If given, the instruction printer to use. If not given
-  /// the MCInst representation will be printed.  This method takes ownership of
-  /// InstPrint.
-  ///
-  /// \param CE - If given, a code emitter to use to show the instruction
-  /// encoding inline with the assembly. This method takes ownership of \p CE.
-  ///
-  /// \param TAB - If given, a target asm backend to use to show the fixup
-  /// information in conjunction with encoding information. This method takes
-  /// ownership of \p TAB.
-  ///
-  /// \param ShowInst - Whether to show the MCInst representation inline with
-  /// the assembly.
-  MCStreamer *createAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                                bool isVerboseAsm,
-                                bool useLoc,
-                                bool useCFI,
-                                bool useDwarfDirectory,
-                                MCInstPrinter *InstPrint = 0,
-                                MCCodeEmitter *CE = 0,
-                                MCAsmBackend *TAB = 0,
-                                bool ShowInst = false);
-
-  /// createMachOStreamer - Create a machine code streamer which will generate
-  /// Mach-O format object files.
-  ///
-  /// Takes ownership of \p TAB and \p CE.
-  MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
-                                  raw_ostream &OS, MCCodeEmitter *CE,
-                                  bool RelaxAll = false);
+  /// EmitAssemblerFlag - Note in the output the specified @p Flag.
+  virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) = 0;
+
+  /// EmitLinkerOptions - Emit the given list @p Options of strings as linker
+  /// options into the output.
+  virtual void EmitLinkerOptions(ArrayRef<std::string> Kind) {}
+
+  /// EmitDataRegion - Note in the output the specified region @p Kind.
+  virtual void EmitDataRegion(MCDataRegionType Kind) {}
+
+  /// EmitThumbFunc - Note in the output that the specified @p Func is
+  /// a Thumb mode function (ARM target only).
+  virtual void EmitThumbFunc(MCSymbol *Func) = 0;
 
-  /// createWinCOFFStreamer - Create a machine code streamer which will
-  /// generate Microsoft COFF format object files.
+  /// getOrCreateSymbolData - Get symbol data for given symbol.
+  virtual MCSymbolData &getOrCreateSymbolData(MCSymbol *Symbol);
+
+  /// EmitAssignment - Emit an assignment of @p Value to @p Symbol.
+  ///
+  /// This corresponds to an assembler statement such as:
+  ///  symbol = value
+  ///
+  /// The assignment generates no code, but has the side effect of binding the
+  /// value in the current context. For the assembly streamer, this prints the
+  /// binding into the .s file.
   ///
-  /// Takes ownership of \p TAB and \p CE.
-  MCStreamer *createWinCOFFStreamer(MCContext &Ctx,
-                                    MCAsmBackend &TAB,
-                                    MCCodeEmitter &CE, raw_ostream &OS,
-                                    bool RelaxAll = false);
+  /// @param Symbol - The symbol being assigned to.
+  /// @param Value - The value for the symbol.
+  virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) = 0;
 
-  /// createELFStreamer - Create a machine code streamer which will generate
-  /// ELF format object files.
-  MCStreamer *createELFStreamer(MCContext &Ctx, MCAsmBackend &TAB,
-                                raw_ostream &OS, MCCodeEmitter *CE,
-                                bool RelaxAll, bool NoExecStack);
+  /// EmitWeakReference - Emit an weak reference from @p Alias to @p Symbol.
+  ///
+  /// This corresponds to an assembler statement such as:
+  ///  .weakref alias, symbol
+  ///
+  /// @param Alias - The alias that is being created.
+  /// @param Symbol - The symbol being aliased.
+  virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) = 0;
+
+  /// EmitSymbolAttribute - Add the given @p Attribute to @p Symbol.
+  virtual bool EmitSymbolAttribute(MCSymbol *Symbol,
+                                   MCSymbolAttr Attribute) = 0;
+
+  /// EmitSymbolDesc - Set the @p DescValue for the @p Symbol.
+  ///
+  /// @param Symbol - The symbol to have its n_desc field set.
+  /// @param DescValue - The value to set into the n_desc field.
+  virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) = 0;
+
+  /// BeginCOFFSymbolDef - Start emitting COFF symbol definition
+  ///
+  /// @param Symbol - The symbol to have its External & Type fields set.
+  virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) = 0;
+
+  /// EmitCOFFSymbolStorageClass - Emit the storage class of the symbol.
+  ///
+  /// @param StorageClass - The storage class the symbol should have.
+  virtual void EmitCOFFSymbolStorageClass(int StorageClass) = 0;
 
-  /// createPureStreamer - Create a machine code streamer which will generate
-  /// "pure" MC object files, for use with MC-JIT and testing tools.
+  /// EmitCOFFSymbolType - Emit the type of the symbol.
   ///
-  /// Takes ownership of \p TAB and \p CE.
-  MCStreamer *createPureStreamer(MCContext &Ctx, MCAsmBackend &TAB,
-                                 raw_ostream &OS, MCCodeEmitter *CE);
+  /// @param Type - A COFF type identifier (see COFF::SymbolType in X86COFF.h)
+  virtual void EmitCOFFSymbolType(int Type) = 0;
+
+  /// EndCOFFSymbolDef - Marks the end of the symbol definition.
+  virtual void EndCOFFSymbolDef() = 0;
+
+  /// EmitCOFFSecRel32 - Emits a COFF section relative relocation.
+  ///
+  /// @param Symbol - Symbol the section relative realocation should point to.
+  virtual void EmitCOFFSecRel32(MCSymbol const *Symbol);
+
+  /// EmitELFSize - Emit an ELF .size directive.
+  ///
+  /// This corresponds to an assembler statement such as:
+  ///  .size symbol, expression
+  ///
+  virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) = 0;
+
+  /// EmitCommonSymbol - Emit a common symbol.
+  ///
+  /// @param Symbol - The common symbol to emit.
+  /// @param Size - The size of the common symbol.
+  /// @param ByteAlignment - The alignment of the symbol if
+  /// non-zero. This must be a power of 2.
+  virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                unsigned ByteAlignment) = 0;
+
+  /// EmitLocalCommonSymbol - Emit a local common (.lcomm) symbol.
+  ///
+  /// @param Symbol - The common symbol to emit.
+  /// @param Size - The size of the common symbol.
+  /// @param ByteAlignment - The alignment of the common symbol in bytes.
+  virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                     unsigned ByteAlignment) = 0;
+
+  /// EmitZerofill - Emit the zerofill section and an optional symbol.
+  ///
+  /// @param Section - The zerofill section to create and or to put the symbol
+  /// @param Symbol - The zerofill symbol to emit, if non-NULL.
+  /// @param Size - The size of the zerofill symbol.
+  /// @param ByteAlignment - The alignment of the zerofill symbol if
+  /// non-zero. This must be a power of 2 on some targets.
+  virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
+                            uint64_t Size = 0, unsigned ByteAlignment = 0) = 0;
+
+  /// EmitTBSSSymbol - Emit a thread local bss (.tbss) symbol.
+  ///
+  /// @param Section - The thread local common section.
+  /// @param Symbol - The thread local common symbol to emit.
+  /// @param Size - The size of the symbol.
+  /// @param ByteAlignment - The alignment of the thread local common symbol
+  /// if non-zero.  This must be a power of 2 on some targets.
+  virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                              uint64_t Size, unsigned ByteAlignment = 0) = 0;
+
+  /// @}
+  /// @name Generating Data
+  /// @{
+
+  /// EmitBytes - Emit the bytes in \p Data into the output.
+  ///
+  /// This is used to implement assembler directives such as .byte, .ascii,
+  /// etc.
+  virtual void EmitBytes(StringRef Data) = 0;
+
+  /// EmitValue - Emit the expression @p Value into the output as a native
+  /// integer of the given @p Size bytes.
+  ///
+  /// This is used to implement assembler directives such as .word, .quad,
+  /// etc.
+  ///
+  /// @param Value - The value to emit.
+  /// @param Size - The size of the integer (in bytes) to emit. This must
+  /// match a native machine width.
+  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) = 0;
+
+  void EmitValue(const MCExpr *Value, unsigned Size);
+
+  /// EmitIntValue - Special case of EmitValue that avoids the client having
+  /// to pass in a MCExpr for constant integers.
+  virtual void EmitIntValue(uint64_t Value, unsigned Size);
+
+  /// EmitAbsValue - Emit the Value, but try to avoid relocations. On MachO
+  /// this is done by producing
+  /// foo = value
+  /// .long foo
+  void EmitAbsValue(const MCExpr *Value, unsigned Size);
+
+  virtual void EmitULEB128Value(const MCExpr *Value) = 0;
+
+  virtual void EmitSLEB128Value(const MCExpr *Value) = 0;
+
+  /// EmitULEB128Value - Special case of EmitULEB128Value that avoids the
+  /// client having to pass in a MCExpr for constant integers.
+  void EmitULEB128IntValue(uint64_t Value, unsigned Padding = 0);
+
+  /// EmitSLEB128Value - Special case of EmitSLEB128Value that avoids the
+  /// client having to pass in a MCExpr for constant integers.
+  void EmitSLEB128IntValue(int64_t Value);
+
+  /// EmitSymbolValue - Special case of EmitValue that avoids the client
+  /// having to pass in a MCExpr for MCSymbols.
+  void EmitSymbolValue(const MCSymbol *Sym, unsigned Size);
+
+  /// EmitGPRel64Value - Emit the expression @p Value into the output as a
+  /// gprel64 (64-bit GP relative) value.
+  ///
+  /// This is used to implement assembler directives such as .gpdword on
+  /// targets that support them.
+  virtual void EmitGPRel64Value(const MCExpr *Value);
+
+  /// EmitGPRel32Value - Emit the expression @p Value into the output as a
+  /// gprel32 (32-bit GP relative) value.
+  ///
+  /// This is used to implement assembler directives such as .gprel32 on
+  /// targets that support them.
+  virtual void EmitGPRel32Value(const MCExpr *Value);
+
+  /// EmitFill - Emit NumBytes bytes worth of the value specified by
+  /// FillValue.  This implements directives such as '.space'.
+  virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue);
+
+  /// \brief Emit NumBytes worth of zeros.
+  /// This function properly handles data in virtual sections.
+  virtual void EmitZeros(uint64_t NumBytes);
+
+  /// EmitValueToAlignment - Emit some number of copies of @p Value until
+  /// the byte alignment @p ByteAlignment is reached.
+  ///
+  /// If the number of bytes need to emit for the alignment is not a multiple
+  /// of @p ValueSize, then the contents of the emitted fill bytes is
+  /// undefined.
+  ///
+  /// This used to implement the .align assembler directive.
+  ///
+  /// @param ByteAlignment - The alignment to reach. This must be a power of
+  /// two on some targets.
+  /// @param Value - The value to use when filling bytes.
+  /// @param ValueSize - The size of the integer (in bytes) to emit for
+  /// @p Value. This must match a native machine width.
+  /// @param MaxBytesToEmit - The maximum numbers of bytes to emit, or 0. If
+  /// the alignment cannot be reached in this many bytes, no bytes are
+  /// emitted.
+  virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
+                                    unsigned ValueSize = 1,
+                                    unsigned MaxBytesToEmit = 0) = 0;
+
+  /// EmitCodeAlignment - Emit nops until the byte alignment @p ByteAlignment
+  /// is reached.
+  ///
+  /// This used to align code where the alignment bytes may be executed.  This
+  /// can emit different bytes for different sizes to optimize execution.
+  ///
+  /// @param ByteAlignment - The alignment to reach. This must be a power of
+  /// two on some targets.
+  /// @param MaxBytesToEmit - The maximum numbers of bytes to emit, or 0. If
+  /// the alignment cannot be reached in this many bytes, no bytes are
+  /// emitted.
+  virtual void EmitCodeAlignment(unsigned ByteAlignment,
+                                 unsigned MaxBytesToEmit = 0) = 0;
+
+  /// EmitValueToOffset - Emit some number of copies of @p Value until the
+  /// byte offset @p Offset is reached.
+  ///
+  /// This is used to implement assembler directives such as .org.
+  ///
+  /// @param Offset - The offset to reach. This may be an expression, but the
+  /// expression must be associated with the current section.
+  /// @param Value - The value to use when filling bytes.
+  /// @return false on success, true if the offset was invalid.
+  virtual bool EmitValueToOffset(const MCExpr *Offset,
+                                 unsigned char Value = 0) = 0;
+
+  /// @}
+
+  /// EmitFileDirective - Switch to a new logical file.  This is used to
+  /// implement the '.file "foo.c"' assembler directive.
+  virtual void EmitFileDirective(StringRef Filename) = 0;
+
+  /// Emit the "identifiers" directive.  This implements the
+  /// '.ident "version foo"' assembler directive.
+  virtual void EmitIdent(StringRef IdentString) {}
+
+  /// EmitDwarfFileDirective - Associate a filename with a specified logical
+  /// file number.  This implements the DWARF2 '.file 4 "foo.c"' assembler
+  /// directive.
+  virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
+                                      StringRef Filename, unsigned CUID = 0);
+
+  /// EmitDwarfLocDirective - This implements the DWARF2
+  // '.loc fileno lineno ...' assembler directive.
+  virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
+                                     unsigned Column, unsigned Flags,
+                                     unsigned Isa, unsigned Discriminator,
+                                     StringRef FileName);
+
+  virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+                                        const MCSymbol *LastLabel,
+                                        const MCSymbol *Label,
+                                        unsigned PointerSize) = 0;
+
+  virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
+                                         const MCSymbol *Label) {}
+
+  void EmitDwarfSetLineAddr(int64_t LineDelta, const MCSymbol *Label,
+                            int PointerSize);
+
+  virtual void EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding);
+  virtual void EmitCFISections(bool EH, bool Debug);
+  void EmitCFIStartProc();
+  void EmitCFIEndProc();
+  virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
+  virtual void EmitCFIDefCfaOffset(int64_t Offset);
+  virtual void EmitCFIDefCfaRegister(int64_t Register);
+  virtual void EmitCFIOffset(int64_t Register, int64_t Offset);
+  virtual void EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding);
+  virtual void EmitCFILsda(const MCSymbol *Sym, unsigned Encoding);
+  virtual void EmitCFIRememberState();
+  virtual void EmitCFIRestoreState();
+  virtual void EmitCFISameValue(int64_t Register);
+  virtual void EmitCFIRestore(int64_t Register);
+  virtual void EmitCFIRelOffset(int64_t Register, int64_t Offset);
+  virtual void EmitCFIAdjustCfaOffset(int64_t Adjustment);
+  virtual void EmitCFIEscape(StringRef Values);
+  virtual void EmitCFISignalFrame();
+  virtual void EmitCFIUndefined(int64_t Register);
+  virtual void EmitCFIRegister(int64_t Register1, int64_t Register2);
+  virtual void EmitCFIWindowSave();
+
+  virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
+  virtual void EmitWin64EHEndProc();
+  virtual void EmitWin64EHStartChained();
+  virtual void EmitWin64EHEndChained();
+  virtual void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                  bool Except);
+  virtual void EmitWin64EHHandlerData();
+  virtual void EmitWin64EHPushReg(unsigned Register);
+  virtual void EmitWin64EHSetFrame(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHAllocStack(unsigned Size);
+  virtual void EmitWin64EHSaveReg(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHPushFrame(bool Code);
+  virtual void EmitWin64EHEndProlog();
+
+  /// EmitInstruction - Emit the given @p Instruction into the current
+  /// section.
+  virtual void EmitInstruction(const MCInst &Inst) = 0;
+
+  /// \brief Set the bundle alignment mode from now on in the section.
+  /// The argument is the power of 2 to which the alignment is set. The
+  /// value 0 means turn the bundle alignment off.
+  virtual void EmitBundleAlignMode(unsigned AlignPow2) = 0;
+
+  /// \brief The following instructions are a bundle-locked group.
+  ///
+  /// \param AlignToEnd - If true, the bundle-locked group will be aligned to
+  ///                     the end of a bundle.
+  virtual void EmitBundleLock(bool AlignToEnd) = 0;
+
+  /// \brief Ends a bundle-locked group.
+  virtual void EmitBundleUnlock() = 0;
+
+  /// EmitRawText - If this file is backed by a assembly streamer, this dumps
+  /// the specified string in the output .s file.  This capability is
+  /// indicated by the hasRawTextSupport() predicate.  By default this aborts.
+  void EmitRawText(const Twine &String);
+
+  /// Flush - Causes any cached state to be written out.
+  virtual void Flush() {}
+
+  /// FinishImpl - Streamer specific finalization.
+  virtual void FinishImpl() = 0;
+  /// Finish - Finish emission of machine code.
+  void Finish();
+};
+
+/// createNullStreamer - Create a dummy machine code streamer, which does
+/// nothing. This is useful for timing the assembler front end.
+MCStreamer *createNullStreamer(MCContext &Ctx);
+
+/// createAsmStreamer - Create a machine code streamer which will print out
+/// assembly for the native target, suitable for compiling with a native
+/// assembler.
+///
+/// \param InstPrint - If given, the instruction printer to use. If not given
+/// the MCInst representation will be printed.  This method takes ownership of
+/// InstPrint.
+///
+/// \param CE - If given, a code emitter to use to show the instruction
+/// encoding inline with the assembly. This method takes ownership of \p CE.
+///
+/// \param TAB - If given, a target asm backend to use to show the fixup
+/// information in conjunction with encoding information. This method takes
+/// ownership of \p TAB.
+///
+/// \param ShowInst - Whether to show the MCInst representation inline with
+/// the assembly.
+MCStreamer *createAsmStreamer(MCContext &Ctx, MCTargetStreamer *TargetStreamer,
+                              formatted_raw_ostream &OS, bool isVerboseAsm,
+                              bool useLoc, bool useCFI, bool useDwarfDirectory,
+                              MCInstPrinter *InstPrint = 0,
+                              MCCodeEmitter *CE = 0, MCAsmBackend *TAB = 0,
+                              bool ShowInst = false);
+
+/// createMachOStreamer - Create a machine code streamer which will generate
+/// Mach-O format object files.
+///
+/// Takes ownership of \p TAB and \p CE.
+MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
+                                raw_ostream &OS, MCCodeEmitter *CE,
+                                bool RelaxAll = false);
+
+/// createWinCOFFStreamer - Create a machine code streamer which will
+/// generate Microsoft COFF format object files.
+///
+/// Takes ownership of \p TAB and \p CE.
+MCStreamer *createWinCOFFStreamer(MCContext &Ctx, MCAsmBackend &TAB,
+                                  MCCodeEmitter &CE, raw_ostream &OS,
+                                  bool RelaxAll = false);
+
+/// createELFStreamer - Create a machine code streamer which will generate
+/// ELF format object files.
+MCStreamer *createELFStreamer(MCContext &Ctx, MCTargetStreamer *TargetStreamer,
+                              MCAsmBackend &TAB, raw_ostream &OS,
+                              MCCodeEmitter *CE, bool RelaxAll,
+                              bool NoExecStack);
+
+/// createPureStreamer - Create a machine code streamer which will generate
+/// "pure" MC object files, for use with MC-JIT and testing tools.
+///
+/// Takes ownership of \p TAB and \p CE.
+MCStreamer *createPureStreamer(MCContext &Ctx, MCAsmBackend &TAB,
+                               raw_ostream &OS, MCCodeEmitter *CE);
 
 } // end namespace llvm
 
diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h
index 346fb2d..01e8236 100644
--- a/include/llvm/MC/MCSubtargetInfo.h
+++ b/include/llvm/MC/MCSubtargetInfo.h
@@ -72,6 +72,9 @@ public:
   /// feature string). Recompute feature bits and scheduling model.
   void InitMCProcessorInfo(StringRef CPU, StringRef FS);
 
+  /// InitCPUSchedModel - Recompute scheduling model based on CPU.
+  void InitCPUSchedModel(StringRef CPU);
+
   /// ToggleFeature - Toggle a feature and returns the re-computed feature
   /// bits. This version does not change the implied bits.
   uint64_t ToggleFeature(uint64_t FB);
diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h
index ebcbd5b..d132a73 100644
--- a/include/llvm/MC/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCTargetAsmParser.h
@@ -11,6 +11,7 @@
 #define LLVM_MC_TARGETPARSER_H
 
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCExpr.h"
 
 namespace llvm {
 class MCStreamer;
@@ -174,6 +175,14 @@ public:
 
   virtual void convertToMapAndConstraints(unsigned Kind,
                       const SmallVectorImpl<MCParsedAsmOperand*> &Operands) = 0;
+
+  virtual const MCExpr *applyModifierToExpr(const MCExpr *E,
+                                            MCSymbolRefExpr::VariantKind,
+                                            MCContext &Ctx) {
+    return 0;
+  }
+
+  virtual void onLabelParsed(MCSymbol *Symbol) { };
 };
 
 } // End llvm namespace
diff --git a/include/llvm/MC/MCWinCOFFObjectWriter.h b/include/llvm/MC/MCWinCOFFObjectWriter.h
index f13e7d5..213481c 100644
--- a/include/llvm/MC/MCWinCOFFObjectWriter.h
+++ b/include/llvm/MC/MCWinCOFFObjectWriter.h
@@ -17,6 +17,7 @@ namespace llvm {
   class raw_ostream;
 
   class MCWinCOFFObjectTargetWriter {
+    virtual void anchor();
     const unsigned Machine;
 
   protected:
diff --git a/include/llvm/MC/MachineLocation.h b/include/llvm/MC/MachineLocation.h
index c4a9660..b3fbee7 100644
--- a/include/llvm/MC/MachineLocation.h
+++ b/include/llvm/MC/MachineLocation.h
@@ -16,6 +16,9 @@
 #ifndef LLVM_MC_MACHINELOCATION_H
 #define LLVM_MC_MACHINELOCATION_H
 
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+
 namespace llvm {
   class MCSymbol;
 
@@ -25,7 +28,7 @@ private:
   unsigned Register;                    // gcc/gdb register number.
   int Offset;                           // Displacement if not register.
 public:
-  enum {
+  enum LLVM_ENUM_INT_TYPE(uint32_t) {
     // The target register number for an abstract frame pointer. The value is
     // an arbitrary value that doesn't collide with any real target register.
     VirtualFP = ~0U
@@ -44,7 +47,8 @@ public:
         Offset == Other.Offset;
   }
 
-  // Accessors
+  // Accessors.
+  /// \return true iff this is a register-indirect location.
   bool isIndirect()      const { return !IsRegister; }
   bool isReg()           const { return IsRegister; }
   unsigned getReg()      const { return Register; }
diff --git a/include/llvm/MC/SubtargetFeature.h b/include/llvm/MC/SubtargetFeature.h
index 8862c8b..d0735cc 100644
--- a/include/llvm/MC/SubtargetFeature.h
+++ b/include/llvm/MC/SubtargetFeature.h
@@ -37,9 +37,9 @@ struct SubtargetFeatureKV {
   uint64_t Value;                       // K-V integer value
   uint64_t Implies;                     // K-V bit mask
 
-  // Compare routine for std binary search
-  bool operator<(const SubtargetFeatureKV &S) const {
-    return strcmp(Key, S.Key) < 0;
+  // Compare routine for std::lower_bound
+  bool operator<(StringRef S) const {
+    return StringRef(Key) < S;
   }
 };
 
@@ -52,9 +52,9 @@ struct SubtargetInfoKV {
   const char *Key;                      // K-V key string
   const void *Value;                    // K-V pointer value
 
-  // Compare routine for std binary search
-  bool operator<(const SubtargetInfoKV &S) const {
-    return strcmp(Key, S.Key) < 0;
+  // Compare routine for std::lower_bound
+  bool operator<(StringRef S) const {
+    return StringRef(Key) < S;
   }
 };
 
@@ -99,8 +99,7 @@ public:
   // Dump feature info.
   void dump() const;
 
-  /// Retrieve a formatted string of the default features for the specified
-  /// target triple.
+  /// Adds the default features for the specified target triple.
   void getDefaultSubtargetFeatures(const Triple& Triple);
 };
 
diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h
index 2190067..e05ae6c 100644
--- a/include/llvm/Object/COFF.h
+++ b/include/llvm/Object/COFF.h
@@ -23,6 +23,8 @@ namespace llvm {
   class ArrayRef;
 
 namespace object {
+class ImportDirectoryEntryRef;
+typedef content_iterator<ImportDirectoryEntryRef> import_directory_iterator;
 
 /// The DOS compatible header at the front of all PE/COFF executables.
 struct dos_header {
@@ -55,6 +57,8 @@ struct coff_file_header {
   support::ulittle32_t NumberOfSymbols;
   support::ulittle16_t SizeOfOptionalHeader;
   support::ulittle16_t Characteristics;
+
+  bool isImportLibrary() const { return NumberOfSections == 0xffff; }
 };
 
 /// The 32-bit PE header that follows the COFF header.
@@ -137,6 +141,22 @@ struct import_directory_table_entry {
   support::ulittle32_t ImportAddressTableRVA;
 };
 
+struct import_lookup_table_entry32 {
+  support::ulittle32_t data;
+
+  bool isOrdinal() const { return data & 0x80000000; }
+
+  uint16_t getOrdinal() const {
+    assert(isOrdinal() && "ILT entry is not an ordinal!");
+    return data & 0xFFFF;
+  }
+
+  uint32_t getHintNameRVA() const {
+    assert(!isOrdinal() && "ILT entry is not a Hint/Name RVA!");
+    return data;
+  }
+};
+
 struct coff_symbol {
   struct StringTableOffset {
     support::ulittle32_t Zeroes;
@@ -184,6 +204,12 @@ struct coff_relocation {
   support::ulittle16_t Type;
 };
 
+struct coff_aux_weak_external {
+  support::ulittle32_t TagIndex;
+  support::ulittle32_t Characteristics;
+  char Unused[10];
+};
+
 struct coff_aux_section_definition {
   support::ulittle32_t Length;
   support::ulittle16_t NumberOfRelocations;
@@ -196,6 +222,7 @@ struct coff_aux_section_definition {
 
 class COFFObjectFile : public ObjectFile {
 private:
+  friend class ImportDirectoryEntryRef;
   const coff_file_header *COFFHeader;
   const pe32_header      *PE32Header;
   const data_directory   *DataDirectory;
@@ -203,6 +230,8 @@ private:
   const coff_symbol      *SymbolTable;
   const char             *StringTable;
         uint32_t          StringTableSize;
+  const import_directory_table_entry *ImportDirectory;
+        uint32_t          NumberOfImportDirectory;
 
         error_code        getString(uint32_t offset, StringRef &Res) const;
 
@@ -210,13 +239,15 @@ private:
   const coff_section     *toSec(DataRefImpl Sec) const;
   const coff_relocation  *toRel(DataRefImpl Rel) const;
 
+        error_code        initSymbolTablePtr();
+        error_code        initImportTablePtr();
+
 protected:
   virtual error_code getSymbolNext(DataRefImpl Symb, SymbolRef &Res) const;
   virtual error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const;
   virtual error_code getSymbolFileOffset(DataRefImpl Symb, uint64_t &Res) const;
   virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const;
   virtual error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const;
-  virtual error_code getSymbolNMTypeChar(DataRefImpl Symb, char &Res) const;
   virtual error_code getSymbolFlags(DataRefImpl Symb, uint32_t &Res) const;
   virtual error_code getSymbolType(DataRefImpl Symb, SymbolRef::Type &Res) const;
   virtual error_code getSymbolSection(DataRefImpl Symb,
@@ -239,8 +270,8 @@ protected:
                                                    bool &Res) const;
   virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
                                            bool &Result) const;
-  virtual relocation_iterator getSectionRelBegin(DataRefImpl Sec) const;
-  virtual relocation_iterator getSectionRelEnd(DataRefImpl Sec) const;
+  virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const;
+  virtual relocation_iterator section_rel_end(DataRefImpl Sec) const;
 
   virtual error_code getRelocationNext(DataRefImpl Rel,
                                        RelocationRef &Res) const;
@@ -281,6 +312,9 @@ public:
   virtual unsigned getArch() const;
   virtual StringRef getLoadName() const;
 
+  import_directory_iterator import_directory_begin() const;
+  import_directory_iterator import_directory_end() const;
+
   error_code getHeader(const coff_file_header *&Res) const;
   error_code getCOFFHeader(const coff_file_header *&Res) const;
   error_code getPE32Header(const pe32_header *&Res) const;
@@ -301,12 +335,37 @@ public:
   error_code getSectionContents(const coff_section *Sec,
                                 ArrayRef<uint8_t> &Res) const;
 
+  error_code getRvaPtr(uint32_t Rva, uintptr_t &Res) const;
+  error_code getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const;
+
   static inline bool classof(const Binary *v) {
     return v->isCOFF();
   }
 };
 
-}
-}
+// The iterator for the import directory table.
+class ImportDirectoryEntryRef {
+public:
+  ImportDirectoryEntryRef() : OwningObject(0) {}
+  ImportDirectoryEntryRef(DataRefImpl ImportDirectory,
+                          const COFFObjectFile *Owner)
+      : ImportDirectoryPimpl(ImportDirectory), OwningObject(Owner) {}
+
+  bool operator==(const ImportDirectoryEntryRef &Other) const;
+  error_code getNext(ImportDirectoryEntryRef &Result) const;
+  error_code getName(StringRef &Result) const;
+
+  error_code
+  getImportTableEntry(const import_directory_table_entry *&Result) const;
+
+  error_code
+  getImportLookupEntry(const import_lookup_table_entry32 *&Result) const;
+
+private:
+  DataRefImpl ImportDirectoryPimpl;
+  const COFFObjectFile *OwningObject;
+};
+} // end namespace object
+} // end namespace llvm
 
 #endif
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index 2063809..6c000bb 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -7,23 +7,26 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file declares the ELFObjectFile template class.
+// This file declares the ELFFile template class.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_OBJECT_ELF_H
 #define LLVM_OBJECT_ELF_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/Error.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
@@ -35,22 +38,9 @@
 namespace llvm {
 namespace object {
 
-using support::endianness;
+StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type);
 
-template<endianness target_endianness, std::size_t max_alignment, bool is64Bits>
-struct ELFType {
-  static const endianness TargetEndianness = target_endianness;
-  static const std::size_t MaxAlignment = max_alignment;
-  static const bool Is64Bits = is64Bits;
-};
-
-template<typename T, int max_align>
-struct MaximumAlignment {
-  enum {value = AlignOf<T>::Alignment > max_align ? max_align
-                                                  : AlignOf<T>::Alignment};
-};
-
-// Subclasses of ELFObjectFile may need this for template instantiation
+// Subclasses of ELFFile may need this for template instantiation
 inline std::pair<unsigned char, unsigned char>
 getElfArchType(MemoryBuffer *Object) {
   if (Object->getBufferSize() < ELF::EI_NIDENT)
@@ -59,442 +49,15 @@ getElfArchType(MemoryBuffer *Object) {
                         (uint8_t) Object->getBufferStart()[ELF::EI_DATA]);
 }
 
-// Templates to choose Elf_Addr and Elf_Off depending on is64Bits.
-template<endianness target_endianness, std::size_t max_alignment>
-struct ELFDataTypeTypedefHelperCommon {
-  typedef support::detail::packed_endian_specific_integral
-    <uint16_t, target_endianness,
-     MaximumAlignment<uint16_t, max_alignment>::value> Elf_Half;
-  typedef support::detail::packed_endian_specific_integral
-    <uint32_t, target_endianness,
-     MaximumAlignment<uint32_t, max_alignment>::value> Elf_Word;
-  typedef support::detail::packed_endian_specific_integral
-    <int32_t, target_endianness,
-     MaximumAlignment<int32_t, max_alignment>::value> Elf_Sword;
-  typedef support::detail::packed_endian_specific_integral
-    <uint64_t, target_endianness,
-     MaximumAlignment<uint64_t, max_alignment>::value> Elf_Xword;
-  typedef support::detail::packed_endian_specific_integral
-    <int64_t, target_endianness,
-     MaximumAlignment<int64_t, max_alignment>::value> Elf_Sxword;
-};
-
-template<class ELFT>
-struct ELFDataTypeTypedefHelper;
-
-/// ELF 32bit types.
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct ELFDataTypeTypedefHelper<ELFType<TargetEndianness, MaxAlign, false> >
-  : ELFDataTypeTypedefHelperCommon<TargetEndianness, MaxAlign> {
-  typedef uint32_t value_type;
-  typedef support::detail::packed_endian_specific_integral
-    <value_type, TargetEndianness,
-     MaximumAlignment<value_type, MaxAlign>::value> Elf_Addr;
-  typedef support::detail::packed_endian_specific_integral
-    <value_type, TargetEndianness,
-     MaximumAlignment<value_type, MaxAlign>::value> Elf_Off;
-};
-
-/// ELF 64bit types.
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct ELFDataTypeTypedefHelper<ELFType<TargetEndianness, MaxAlign, true> >
-  : ELFDataTypeTypedefHelperCommon<TargetEndianness, MaxAlign> {
-  typedef uint64_t value_type;
-  typedef support::detail::packed_endian_specific_integral
-    <value_type, TargetEndianness,
-     MaximumAlignment<value_type, MaxAlign>::value> Elf_Addr;
-  typedef support::detail::packed_endian_specific_integral
-    <value_type, TargetEndianness,
-     MaximumAlignment<value_type, MaxAlign>::value> Elf_Off;
-};
-
-// I really don't like doing this, but the alternative is copypasta.
-#define LLVM_ELF_IMPORT_TYPES(E, M, W)                                         \
-typedef typename ELFDataTypeTypedefHelper<ELFType<E,M,W> >::Elf_Addr Elf_Addr; \
-typedef typename ELFDataTypeTypedefHelper<ELFType<E,M,W> >::Elf_Off Elf_Off;   \
-typedef typename ELFDataTypeTypedefHelper<ELFType<E,M,W> >::Elf_Half Elf_Half; \
-typedef typename ELFDataTypeTypedefHelper<ELFType<E,M,W> >::Elf_Word Elf_Word; \
-typedef typename                                                               \
-  ELFDataTypeTypedefHelper<ELFType<E,M,W> >::Elf_Sword Elf_Sword;              \
-typedef typename                                                               \
-  ELFDataTypeTypedefHelper<ELFType<E,M,W> >::Elf_Xword Elf_Xword;              \
-typedef typename                                                               \
-  ELFDataTypeTypedefHelper<ELFType<E,M,W> >::Elf_Sxword Elf_Sxword;
-
-#define LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)                                       \
-  LLVM_ELF_IMPORT_TYPES(ELFT::TargetEndianness, ELFT::MaxAlignment,            \
-  ELFT::Is64Bits)
-
-// Section header.
-template<class ELFT>
-struct Elf_Shdr_Base;
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Shdr_Base<ELFType<TargetEndianness, MaxAlign, false> > {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
-  Elf_Word sh_name;     // Section name (index into string table)
-  Elf_Word sh_type;     // Section type (SHT_*)
-  Elf_Word sh_flags;    // Section flags (SHF_*)
-  Elf_Addr sh_addr;     // Address where section is to be loaded
-  Elf_Off  sh_offset;   // File offset of section data, in bytes
-  Elf_Word sh_size;     // Size of section, in bytes
-  Elf_Word sh_link;     // Section type-specific header table index link
-  Elf_Word sh_info;     // Section type-specific extra information
-  Elf_Word sh_addralign;// Section address alignment
-  Elf_Word sh_entsize;  // Size of records contained within the section
-};
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Shdr_Base<ELFType<TargetEndianness, MaxAlign, true> > {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
-  Elf_Word  sh_name;     // Section name (index into string table)
-  Elf_Word  sh_type;     // Section type (SHT_*)
-  Elf_Xword sh_flags;    // Section flags (SHF_*)
-  Elf_Addr  sh_addr;     // Address where section is to be loaded
-  Elf_Off   sh_offset;   // File offset of section data, in bytes
-  Elf_Xword sh_size;     // Size of section, in bytes
-  Elf_Word  sh_link;     // Section type-specific header table index link
-  Elf_Word  sh_info;     // Section type-specific extra information
-  Elf_Xword sh_addralign;// Section address alignment
-  Elf_Xword sh_entsize;  // Size of records contained within the section
-};
-
-template<class ELFT>
-struct Elf_Shdr_Impl : Elf_Shdr_Base<ELFT> {
-  using Elf_Shdr_Base<ELFT>::sh_entsize;
-  using Elf_Shdr_Base<ELFT>::sh_size;
-
-  /// @brief Get the number of entities this section contains if it has any.
-  unsigned getEntityCount() const {
-    if (sh_entsize == 0)
-      return 0;
-    return sh_size / sh_entsize;
-  }
-};
-
-template<class ELFT>
-struct Elf_Sym_Base;
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Sym_Base<ELFType<TargetEndianness, MaxAlign, false> > {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
-  Elf_Word      st_name;  // Symbol name (index into string table)
-  Elf_Addr      st_value; // Value or address associated with the symbol
-  Elf_Word      st_size;  // Size of the symbol
-  unsigned char st_info;  // Symbol's type and binding attributes
-  unsigned char st_other; // Must be zero; reserved
-  Elf_Half      st_shndx; // Which section (header table index) it's defined in
-};
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Sym_Base<ELFType<TargetEndianness, MaxAlign, true> > {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
-  Elf_Word      st_name;  // Symbol name (index into string table)
-  unsigned char st_info;  // Symbol's type and binding attributes
-  unsigned char st_other; // Must be zero; reserved
-  Elf_Half      st_shndx; // Which section (header table index) it's defined in
-  Elf_Addr      st_value; // Value or address associated with the symbol
-  Elf_Xword     st_size;  // Size of the symbol
-};
-
-template<class ELFT>
-struct Elf_Sym_Impl : Elf_Sym_Base<ELFT> {
-  using Elf_Sym_Base<ELFT>::st_info;
-
-  // These accessors and mutators correspond to the ELF32_ST_BIND,
-  // ELF32_ST_TYPE, and ELF32_ST_INFO macros defined in the ELF specification:
-  unsigned char getBinding() const { return st_info >> 4; }
-  unsigned char getType() const { return st_info & 0x0f; }
-  void setBinding(unsigned char b) { setBindingAndType(b, getType()); }
-  void setType(unsigned char t) { setBindingAndType(getBinding(), t); }
-  void setBindingAndType(unsigned char b, unsigned char t) {
-    st_info = (b << 4) + (t & 0x0f);
-  }
-};
-
-/// Elf_Versym: This is the structure of entries in the SHT_GNU_versym section
-/// (.gnu.version). This structure is identical for ELF32 and ELF64.
-template<class ELFT>
-struct Elf_Versym_Impl {
-  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
-  Elf_Half vs_index;   // Version index with flags (e.g. VERSYM_HIDDEN)
-};
-
-template<class ELFT>
-struct Elf_Verdaux_Impl;
-
-/// Elf_Verdef: This is the structure of entries in the SHT_GNU_verdef section
-/// (.gnu.version_d). This structure is identical for ELF32 and ELF64.
-template<class ELFT>
-struct Elf_Verdef_Impl {
-  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
-  typedef Elf_Verdaux_Impl<ELFT> Elf_Verdaux;
-  Elf_Half vd_version; // Version of this structure (e.g. VER_DEF_CURRENT)
-  Elf_Half vd_flags;   // Bitwise flags (VER_DEF_*)
-  Elf_Half vd_ndx;     // Version index, used in .gnu.version entries
-  Elf_Half vd_cnt;     // Number of Verdaux entries
-  Elf_Word vd_hash;    // Hash of name
-  Elf_Word vd_aux;     // Offset to the first Verdaux entry (in bytes)
-  Elf_Word vd_next;    // Offset to the next Verdef entry (in bytes)
-
-  /// Get the first Verdaux entry for this Verdef.
-  const Elf_Verdaux *getAux() const {
-    return reinterpret_cast<const Elf_Verdaux*>((const char*)this + vd_aux);
-  }
-};
-
-/// Elf_Verdaux: This is the structure of auxiliary data in the SHT_GNU_verdef
-/// section (.gnu.version_d). This structure is identical for ELF32 and ELF64.
-template<class ELFT>
-struct Elf_Verdaux_Impl {
-  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
-  Elf_Word vda_name; // Version name (offset in string table)
-  Elf_Word vda_next; // Offset to next Verdaux entry (in bytes)
-};
-
-/// Elf_Verneed: This is the structure of entries in the SHT_GNU_verneed
-/// section (.gnu.version_r). This structure is identical for ELF32 and ELF64.
-template<class ELFT>
-struct Elf_Verneed_Impl {
-  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
-  Elf_Half vn_version; // Version of this structure (e.g. VER_NEED_CURRENT)
-  Elf_Half vn_cnt;     // Number of associated Vernaux entries
-  Elf_Word vn_file;    // Library name (string table offset)
-  Elf_Word vn_aux;     // Offset to first Vernaux entry (in bytes)
-  Elf_Word vn_next;    // Offset to next Verneed entry (in bytes)
-};
-
-/// Elf_Vernaux: This is the structure of auxiliary data in SHT_GNU_verneed
-/// section (.gnu.version_r). This structure is identical for ELF32 and ELF64.
-template<class ELFT>
-struct Elf_Vernaux_Impl {
-  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
-  Elf_Word vna_hash;  // Hash of dependency name
-  Elf_Half vna_flags; // Bitwise Flags (VER_FLAG_*)
-  Elf_Half vna_other; // Version index, used in .gnu.version entries
-  Elf_Word vna_name;  // Dependency name
-  Elf_Word vna_next;  // Offset to next Vernaux entry (in bytes)
-};
-
-/// Elf_Dyn_Base: This structure matches the form of entries in the dynamic
-///               table section (.dynamic) look like.
-template<class ELFT>
-struct Elf_Dyn_Base;
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Dyn_Base<ELFType<TargetEndianness, MaxAlign, false> > {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
-  Elf_Sword d_tag;
-  union {
-    Elf_Word d_val;
-    Elf_Addr d_ptr;
-  } d_un;
-};
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Dyn_Base<ELFType<TargetEndianness, MaxAlign, true> > {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
-  Elf_Sxword d_tag;
-  union {
-    Elf_Xword d_val;
-    Elf_Addr d_ptr;
-  } d_un;
-};
-
-/// Elf_Dyn_Impl: This inherits from Elf_Dyn_Base, adding getters and setters.
-template<class ELFT>
-struct Elf_Dyn_Impl : Elf_Dyn_Base<ELFT> {
-  using Elf_Dyn_Base<ELFT>::d_tag;
-  using Elf_Dyn_Base<ELFT>::d_un;
-  int64_t getTag() const { return d_tag; }
-  uint64_t getVal() const { return d_un.d_val; }
-  uint64_t getPtr() const { return d_un.ptr; }
-};
-
-// Elf_Rel: Elf Relocation
-template<class ELFT, bool isRela>
-struct Elf_Rel_Base;
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, false>, false> {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
-  Elf_Addr r_offset;     // Location (file byte offset, or program virtual addr)
-  Elf_Word r_info;       // Symbol table index and type of relocation to apply
-
-  uint32_t getRInfo(bool isMips64EL) const {
-    assert(!isMips64EL);
-    return r_info;
-  }
-  void setRInfo(uint32_t R) {
-    r_info = R;
-  }
-};
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, true>, false> {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
-  Elf_Addr  r_offset; // Location (file byte offset, or program virtual addr)
-  Elf_Xword r_info;   // Symbol table index and type of relocation to apply
-
-  uint64_t getRInfo(bool isMips64EL) const {
-    uint64_t t = r_info;
-    if (!isMips64EL)
-      return t;
-    // Mips64 little endian has a "special" encoding of r_info. Instead of one
-    // 64 bit little endian number, it is a little endian 32 bit number followed
-    // by a 32 bit big endian number.
-    return (t << 32) | ((t >> 8) & 0xff000000) | ((t >> 24) & 0x00ff0000) |
-      ((t >> 40) & 0x0000ff00) | ((t >> 56) & 0x000000ff);
-  }
-  void setRInfo(uint64_t R) {
-    // FIXME: Add mips64el support.
-    r_info = R;
-  }
-};
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, false>, true> {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
-  Elf_Addr  r_offset; // Location (file byte offset, or program virtual addr)
-  Elf_Word  r_info;   // Symbol table index and type of relocation to apply
-  Elf_Sword r_addend; // Compute value for relocatable field by adding this
-
-  uint32_t getRInfo(bool isMips64EL) const {
-    assert(!isMips64EL);
-    return r_info;
-  }
-  void setRInfo(uint32_t R) {
-    r_info = R;
-  }
-};
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, true>, true> {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
-  Elf_Addr   r_offset; // Location (file byte offset, or program virtual addr)
-  Elf_Xword  r_info;   // Symbol table index and type of relocation to apply
-  Elf_Sxword r_addend; // Compute value for relocatable field by adding this.
-
-  uint64_t getRInfo(bool isMips64EL) const {
-    // Mips64 little endian has a "special" encoding of r_info. Instead of one
-    // 64 bit little endian number, it is a little endian 32 bit number followed
-    // by a 32 bit big endian number.
-    uint64_t t = r_info;
-    if (!isMips64EL)
-      return t;
-    return (t << 32) | ((t >> 8) & 0xff000000) | ((t >> 24) & 0x00ff0000) |
-      ((t >> 40) & 0x0000ff00) | ((t >> 56) & 0x000000ff);
-  }
-  void setRInfo(uint64_t R) {
-    // FIXME: Add mips64el support.
-    r_info = R;
-  }
-};
-
-template<class ELFT, bool isRela>
-struct Elf_Rel_Impl;
-
-template<endianness TargetEndianness, std::size_t MaxAlign, bool isRela>
-struct Elf_Rel_Impl<ELFType<TargetEndianness, MaxAlign, true>, isRela>
-       : Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, true>, isRela> {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
-
-  // These accessors and mutators correspond to the ELF64_R_SYM, ELF64_R_TYPE,
-  // and ELF64_R_INFO macros defined in the ELF specification:
-  uint32_t getSymbol(bool isMips64EL) const {
-    return (uint32_t) (this->getRInfo(isMips64EL) >> 32);
-  }
-  uint32_t getType(bool isMips64EL) const {
-    return (uint32_t) (this->getRInfo(isMips64EL) & 0xffffffffL);
-  }
-  void setSymbol(uint32_t s) { setSymbolAndType(s, getType()); }
-  void setType(uint32_t t) { setSymbolAndType(getSymbol(), t); }
-  void setSymbolAndType(uint32_t s, uint32_t t) {
-    this->setRInfo(((uint64_t)s << 32) + (t&0xffffffffL));
-  }
-};
-
-template<endianness TargetEndianness, std::size_t MaxAlign, bool isRela>
-struct Elf_Rel_Impl<ELFType<TargetEndianness, MaxAlign, false>, isRela>
-       : Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, false>, isRela> {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
-
-  // These accessors and mutators correspond to the ELF32_R_SYM, ELF32_R_TYPE,
-  // and ELF32_R_INFO macros defined in the ELF specification:
-  uint32_t getSymbol(bool isMips64EL) const {
-    return this->getRInfo(isMips64EL) >> 8;
-  }
-  unsigned char getType(bool isMips64EL) const {
-    return (unsigned char) (this->getRInfo(isMips64EL) & 0x0ff);
-  }
-  void setSymbol(uint32_t s) { setSymbolAndType(s, getType()); }
-  void setType(unsigned char t) { setSymbolAndType(getSymbol(), t); }
-  void setSymbolAndType(uint32_t s, unsigned char t) {
-    this->setRInfo((s << 8) + t);
-  }
-};
-
-template<class ELFT>
-struct Elf_Ehdr_Impl {
-  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
-  unsigned char e_ident[ELF::EI_NIDENT]; // ELF Identification bytes
-  Elf_Half e_type;     // Type of file (see ET_*)
-  Elf_Half e_machine;  // Required architecture for this file (see EM_*)
-  Elf_Word e_version;  // Must be equal to 1
-  Elf_Addr e_entry;    // Address to jump to in order to start program
-  Elf_Off  e_phoff;    // Program header table's file offset, in bytes
-  Elf_Off  e_shoff;    // Section header table's file offset, in bytes
-  Elf_Word e_flags;    // Processor-specific flags
-  Elf_Half e_ehsize;   // Size of ELF header, in bytes
-  Elf_Half e_phentsize;// Size of an entry in the program header table
-  Elf_Half e_phnum;    // Number of entries in the program header table
-  Elf_Half e_shentsize;// Size of an entry in the section header table
-  Elf_Half e_shnum;    // Number of entries in the section header table
-  Elf_Half e_shstrndx; // Section header table index of section name
-                                 // string table
-  bool checkMagic() const {
-    return (memcmp(e_ident, ELF::ElfMagic, strlen(ELF::ElfMagic))) == 0;
-  }
-   unsigned char getFileClass() const { return e_ident[ELF::EI_CLASS]; }
-   unsigned char getDataEncoding() const { return e_ident[ELF::EI_DATA]; }
-};
-
-template<class ELFT>
-struct Elf_Phdr_Impl;
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Phdr_Impl<ELFType<TargetEndianness, MaxAlign, false> > {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
-  Elf_Word p_type;   // Type of segment
-  Elf_Off  p_offset; // FileOffset where segment is located, in bytes
-  Elf_Addr p_vaddr;  // Virtual Address of beginning of segment
-  Elf_Addr p_paddr;  // Physical address of beginning of segment (OS-specific)
-  Elf_Word p_filesz; // Num. of bytes in file image of segment (may be zero)
-  Elf_Word p_memsz;  // Num. of bytes in mem image of segment (may be zero)
-  Elf_Word p_flags;  // Segment flags
-  Elf_Word p_align;  // Segment alignment constraint
-};
-
-template<endianness TargetEndianness, std::size_t MaxAlign>
-struct Elf_Phdr_Impl<ELFType<TargetEndianness, MaxAlign, true> > {
-  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
-  Elf_Word p_type;   // Type of segment
-  Elf_Word p_flags;  // Segment flags
-  Elf_Off  p_offset; // FileOffset where segment is located, in bytes
-  Elf_Addr p_vaddr;  // Virtual Address of beginning of segment
-  Elf_Addr p_paddr;  // Physical address of beginning of segment (OS-specific)
-  Elf_Xword p_filesz; // Num. of bytes in file image of segment (may be zero)
-  Elf_Xword p_memsz;  // Num. of bytes in mem image of segment (may be zero)
-  Elf_Xword p_align;  // Segment alignment constraint
-};
-
-template<class ELFT>
-class ELFObjectFile : public ObjectFile {
+template <class ELFT>
+class ELFFile {
+public:
   LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+  typedef typename conditional<ELFT::Is64Bits,
+                               uint64_t, uint32_t>::type uintX_t;
 
-public:
   /// \brief Iterate over constant sized entities.
-  template<class EntT>
+  template <class EntT>
   class ELFEntityIterator {
   public:
     typedef ptrdiff_t difference_type;
@@ -505,9 +68,8 @@ public:
 
     /// \brief Default construct iterator.
     ELFEntityIterator() : EntitySize(0), Current(0) {}
-    ELFEntityIterator(uint64_t EntSize, const char *Start)
-      : EntitySize(EntSize)
-      , Current(Start) {}
+    ELFEntityIterator(uintX_t EntSize, const char *Start)
+        : EntitySize(EntSize), Current(Start) {}
 
     reference operator *() {
       assert(Current && "Attempted to dereference an invalid iterator!");
@@ -547,14 +109,16 @@ public:
 
     difference_type operator -(const ELFEntityIterator &Other) const {
       assert(EntitySize == Other.EntitySize &&
-             "Subtracting iterators of different EntitiySize!");
+             "Subtracting iterators of different EntitySize!");
       return (Current - Other.Current) / EntitySize;
     }
 
     const char *get() const { return Current; }
 
+    uintX_t getEntSize() const { return EntitySize; }
+
   private:
-    uint64_t EntitySize;
+    uintX_t EntitySize;
     const char *Current;
   };
 
@@ -570,40 +134,141 @@ public:
   typedef Elf_Verneed_Impl<ELFT> Elf_Verneed;
   typedef Elf_Vernaux_Impl<ELFT> Elf_Vernaux;
   typedef Elf_Versym_Impl<ELFT> Elf_Versym;
-  typedef ELFEntityIterator<const Elf_Dyn> Elf_Dyn_iterator;
-  typedef ELFEntityIterator<const Elf_Sym> Elf_Sym_iterator;
+  typedef ELFEntityIterator<const Elf_Dyn> Elf_Dyn_Iter;
   typedef ELFEntityIterator<const Elf_Rela> Elf_Rela_Iter;
   typedef ELFEntityIterator<const Elf_Rel> Elf_Rel_Iter;
+  typedef ELFEntityIterator<const Elf_Shdr> Elf_Shdr_Iter;
 
-protected:
-  // This flag is used for classof, to distinguish ELFObjectFile from
-  // its subclass. If more subclasses will be created, this flag will
-  // have to become an enum.
-  bool isDyldELFObject;
+  /// \brief Archive files are 2 byte aligned, so we need this for
+  ///     PointerIntPair to work.
+  template <typename T>
+  class ArchivePointerTypeTraits {
+  public:
+    static inline const void *getAsVoidPointer(T *P) { return P; }
+    static inline T *getFromVoidPointer(const void *P) {
+      return static_cast<T *>(P);
+    }
+    enum { NumLowBitsAvailable = 1 };
+  };
+
+  class Elf_Sym_Iter {
+  public:
+    typedef ptrdiff_t difference_type;
+    typedef const Elf_Sym value_type;
+    typedef std::random_access_iterator_tag iterator_category;
+    typedef value_type &reference;
+    typedef value_type *pointer;
+
+    /// \brief Default construct iterator.
+    Elf_Sym_Iter() : EntitySize(0), Current(0, false) {}
+    Elf_Sym_Iter(uintX_t EntSize, const char *Start, bool IsDynamic)
+        : EntitySize(EntSize), Current(Start, IsDynamic) {}
+
+    reference operator*() {
+      assert(Current.getPointer() &&
+             "Attempted to dereference an invalid iterator!");
+      return *reinterpret_cast<pointer>(Current.getPointer());
+    }
+
+    pointer operator->() {
+      assert(Current.getPointer() &&
+             "Attempted to dereference an invalid iterator!");
+      return reinterpret_cast<pointer>(Current.getPointer());
+    }
+
+    bool operator==(const Elf_Sym_Iter &Other) {
+      return Current == Other.Current;
+    }
+
+    bool operator!=(const Elf_Sym_Iter &Other) { return !(*this == Other); }
+
+    Elf_Sym_Iter &operator++() {
+      assert(Current.getPointer() &&
+             "Attempted to increment an invalid iterator!");
+      Current.setPointer(Current.getPointer() + EntitySize);
+      return *this;
+    }
+
+    Elf_Sym_Iter operator++(int) {
+      Elf_Sym_Iter Tmp = *this;
+      ++*this;
+      return Tmp;
+    }
+
+    Elf_Sym_Iter operator+(difference_type Dist) {
+      assert(Current.getPointer() &&
+             "Attempted to increment an invalid iterator!");
+      Current.setPointer(Current.getPointer() + EntitySize * Dist);
+      return *this;
+    }
+
+    Elf_Sym_Iter &operator=(const Elf_Sym_Iter &Other) {
+      EntitySize = Other.EntitySize;
+      Current = Other.Current;
+      return *this;
+    }
+
+    difference_type operator-(const Elf_Sym_Iter &Other) const {
+      assert(EntitySize == Other.EntitySize &&
+             "Subtracting iterators of different EntitySize!");
+      return (Current.getPointer() - Other.Current.getPointer()) / EntitySize;
+    }
+
+    const char *get() const { return Current.getPointer(); }
+
+    bool isDynamic() const { return Current.getInt(); }
+
+    uintX_t getEntSize() const { return EntitySize; }
+
+  private:
+    uintX_t EntitySize;
+    PointerIntPair<const char *, 1, bool,
+                   ArchivePointerTypeTraits<const char> > Current;
+  };
 
 private:
+  typedef SmallVector<const Elf_Shdr *, 2> Sections_t;
+  typedef DenseMap<unsigned, unsigned> IndexMap_t;
+
+  MemoryBuffer *Buf;
+
+  const uint8_t *base() const {
+    return reinterpret_cast<const uint8_t *>(Buf->getBufferStart());
+  }
+
   const Elf_Ehdr *Header;
   const Elf_Shdr *SectionHeaderTable;
   const Elf_Shdr *dot_shstrtab_sec; // Section header string table.
   const Elf_Shdr *dot_strtab_sec;   // Symbol header string table.
-  const Elf_Shdr *dot_dynstr_sec;   // Dynamic symbol string table.
+  const Elf_Shdr *dot_symtab_sec;   // Symbol table section.
 
-  int SymbolTableIndex;
-  int DynamicSymbolTableIndex;
-  DenseMap<const Elf_Sym*, ELF::Elf64_Word> ExtendedSymbolTable;
+  const Elf_Shdr *SymbolTableSectionHeaderIndex;
+  DenseMap<const Elf_Sym *, ELF::Elf64_Word> ExtendedSymbolTable;
 
-  const Elf_Shdr *dot_dynamic_sec;       // .dynamic
   const Elf_Shdr *dot_gnu_version_sec;   // .gnu.version
   const Elf_Shdr *dot_gnu_version_r_sec; // .gnu.version_r
   const Elf_Shdr *dot_gnu_version_d_sec; // .gnu.version_d
 
+  /// \brief Represents a region described by entries in the .dynamic table.
+  struct DynRegionInfo {
+    DynRegionInfo() : Addr(0), Size(0), EntSize(0) {}
+    /// \brief Address in current address space.
+    const void *Addr;
+    /// \brief Size in bytes of the region.
+    uintX_t Size;
+    /// \brief Size of each entity in the region.
+    uintX_t EntSize;
+  };
+
+  DynRegionInfo DynamicRegion;
+  DynRegionInfo DynHashRegion;
+  DynRegionInfo DynStrRegion;
+  DynRegionInfo DynSymRegion;
+
   // Pointer to SONAME entry in dynamic string table
   // This is set the first time getLoadName is called.
   mutable const char *dt_soname;
 
-private:
-  uint64_t getROffset(DataRefImpl Rel) const;
-
   // Records for each version index the corresponding Verdef or Vernaux entry.
   // This is filled the first time LoadVersionMap() is called.
   class VersionMapEntry : public PointerIntPair<const void*, 1> {
@@ -630,99 +295,29 @@ private:
   void LoadVersionNeeds(const Elf_Shdr *ec) const;
   void LoadVersionMap() const;
 
-  /// @brief Get the relocation section that contains \a Rel.
-  const Elf_Shdr *getRelSection(DataRefImpl Rel) const {
-    return getSection(Rel.d.a);
-  }
-
 public:
-  bool            isRelocationHasAddend(DataRefImpl Rel) const;
   template<typename T>
   const T        *getEntry(uint32_t Section, uint32_t Entry) const;
-  template<typename T>
-  const T        *getEntry(const Elf_Shdr *Section, uint32_t Entry) const;
-  const Elf_Shdr *getSection(DataRefImpl index) const;
-  const Elf_Shdr *getSection(uint32_t index) const;
-  const Elf_Rel  *getRel(DataRefImpl Rel) const;
-  const Elf_Rela *getRela(DataRefImpl Rela) const;
+  template <typename T>
+  const T *getEntry(const Elf_Shdr *Section, uint32_t Entry) const;
   const char     *getString(uint32_t section, uint32_t offset) const;
   const char     *getString(const Elf_Shdr *section, uint32_t offset) const;
-  error_code      getSymbolVersion(const Elf_Shdr *section,
-                                   const Elf_Sym *Symb,
-                                   StringRef &Version,
-                                   bool &IsDefault) const;
+  const char *getDynamicString(uintX_t Offset) const;
+  ErrorOr<StringRef> getSymbolVersion(const Elf_Shdr *section,
+                                      const Elf_Sym *Symb,
+                                      bool &IsDefault) const;
   void VerifyStrTab(const Elf_Shdr *sh) const;
 
-protected:
-  const Elf_Sym *getSymbol(DataRefImpl Symb) const; // FIXME: Should be private?
-  void            validateSymbol(DataRefImpl Symb) const;
-  StringRef       getRelocationTypeName(uint32_t Type) const;
+  StringRef getRelocationTypeName(uint32_t Type) const;
+  void getRelocationTypeName(uint32_t Type,
+                             SmallVectorImpl<char> &Result) const;
 
-public:
-  error_code      getSymbolName(const Elf_Shdr *section,
-                                const Elf_Sym *Symb,
-                                StringRef &Res) const;
-  error_code      getSectionName(const Elf_Shdr *section,
-                                 StringRef &Res) const;
-  const Elf_Dyn  *getDyn(DataRefImpl DynData) const;
-  error_code getSymbolVersion(SymbolRef Symb, StringRef &Version,
-                              bool &IsDefault) const;
-  uint64_t getSymbolIndex(const Elf_Sym *sym) const;
-  error_code getRelocationAddend(DataRefImpl Rel, int64_t &Res) const;
-protected:
-  virtual error_code getSymbolNext(DataRefImpl Symb, SymbolRef &Res) const;
-  virtual error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const;
-  virtual error_code getSymbolFileOffset(DataRefImpl Symb, uint64_t &Res) const;
-  virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const;
-  virtual error_code getSymbolAlignment(DataRefImpl Symb, uint32_t &Res) const;
-  virtual error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const;
-  virtual error_code getSymbolNMTypeChar(DataRefImpl Symb, char &Res) const;
-  virtual error_code getSymbolFlags(DataRefImpl Symb, uint32_t &Res) const;
-  virtual error_code getSymbolType(DataRefImpl Symb,
-                                   SymbolRef::Type &Res) const;
-  virtual error_code getSymbolSection(DataRefImpl Symb,
-                                      section_iterator &Res) const;
-  virtual error_code getSymbolValue(DataRefImpl Symb, uint64_t &Val) const;
-
-  virtual error_code getLibraryNext(DataRefImpl Data, LibraryRef &Result) const;
-  virtual error_code getLibraryPath(DataRefImpl Data, StringRef &Res) const;
-
-  virtual error_code getSectionNext(DataRefImpl Sec, SectionRef &Res) const;
-  virtual error_code getSectionName(DataRefImpl Sec, StringRef &Res) const;
-  virtual error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const;
-  virtual error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const;
-  virtual error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const;
-  virtual error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const;
-  virtual error_code isSectionText(DataRefImpl Sec, bool &Res) const;
-  virtual error_code isSectionData(DataRefImpl Sec, bool &Res) const;
-  virtual error_code isSectionBSS(DataRefImpl Sec, bool &Res) const;
-  virtual error_code isSectionRequiredForExecution(DataRefImpl Sec,
-                                                   bool &Res) const;
-  virtual error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const;
-  virtual error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const;
-  virtual error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const;
-  virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
-                                           bool &Result) const;
-  virtual relocation_iterator getSectionRelBegin(DataRefImpl Sec) const;
-  virtual relocation_iterator getSectionRelEnd(DataRefImpl Sec) const;
-  virtual section_iterator getRelocatedSection(DataRefImpl Sec) const;
-
-  virtual error_code getRelocationNext(DataRefImpl Rel,
-                                       RelocationRef &Res) const;
-  virtual error_code getRelocationAddress(DataRefImpl Rel,
-                                          uint64_t &Res) const;
-  virtual error_code getRelocationOffset(DataRefImpl Rel,
-                                         uint64_t &Res) const;
-  virtual symbol_iterator getRelocationSymbol(DataRefImpl Rel) const;
-  virtual error_code getRelocationType(DataRefImpl Rel,
-                                       uint64_t &Res) const;
-  virtual error_code getRelocationTypeName(DataRefImpl Rel,
-                                           SmallVectorImpl<char> &Result) const;
-  virtual error_code getRelocationValueString(DataRefImpl Rel,
-                                           SmallVectorImpl<char> &Result) const;
+  /// \brief Get the symbol table section and symbol for a given relocation.
+  template <class RelT>
+  std::pair<const Elf_Shdr *, const Elf_Sym *>
+  getRelocationSymbol(const Elf_Shdr *RelSec, const RelT *Rel) const;
 
-public:
-  ELFObjectFile(MemoryBuffer *Object, error_code &ec);
+  ELFFile(MemoryBuffer *Object, error_code &ec);
 
   bool isMips64EL() const {
     return Header->e_machine == ELF::EM_MIPS &&
@@ -730,65 +325,51 @@ public:
       Header->getDataEncoding() == ELF::ELFDATA2LSB;
   }
 
-  virtual symbol_iterator begin_symbols() const;
-  virtual symbol_iterator end_symbols() const;
-
-  virtual symbol_iterator begin_dynamic_symbols() const;
-  virtual symbol_iterator end_dynamic_symbols() const;
+  Elf_Shdr_Iter begin_sections() const;
+  Elf_Shdr_Iter end_sections() const;
 
-  virtual section_iterator begin_sections() const;
-  virtual section_iterator end_sections() const;
+  Elf_Sym_Iter begin_symbols() const;
+  Elf_Sym_Iter end_symbols() const;
 
-  virtual library_iterator begin_libraries_needed() const;
-  virtual library_iterator end_libraries_needed() const;
-
-  const Elf_Shdr *getDynamicSymbolTableSectionHeader() const {
-    return getSection(DynamicSymbolTableIndex);
-  }
-
-  const Elf_Shdr *getDynamicStringTableSectionHeader() const {
-    return dot_dynstr_sec;
-  }
-
-  Elf_Dyn_iterator begin_dynamic_table() const;
+  Elf_Dyn_Iter begin_dynamic_table() const;
   /// \param NULLEnd use one past the first DT_NULL entry as the end instead of
   /// the section size.
-  Elf_Dyn_iterator end_dynamic_table(bool NULLEnd = false) const;
-
-  Elf_Sym_iterator begin_elf_dynamic_symbols() const {
-    const Elf_Shdr *DynSymtab = getDynamicSymbolTableSectionHeader();
-    if (DynSymtab)
-      return Elf_Sym_iterator(DynSymtab->sh_entsize,
-                              (const char *)base() + DynSymtab->sh_offset);
-    return Elf_Sym_iterator(0, 0);
+  Elf_Dyn_Iter end_dynamic_table(bool NULLEnd = false) const;
+
+  Elf_Sym_Iter begin_dynamic_symbols() const {
+    if (DynSymRegion.Addr)
+      return Elf_Sym_Iter(DynSymRegion.EntSize, (const char *)DynSymRegion.Addr,
+                          true);
+    return Elf_Sym_Iter(0, 0, true);
   }
 
-  Elf_Sym_iterator end_elf_dynamic_symbols() const {
-    const Elf_Shdr *DynSymtab = getDynamicSymbolTableSectionHeader();
-    if (DynSymtab)
-      return Elf_Sym_iterator(DynSymtab->sh_entsize, (const char *)base() +
-                              DynSymtab->sh_offset + DynSymtab->sh_size);
-    return Elf_Sym_iterator(0, 0);
+  Elf_Sym_Iter end_dynamic_symbols() const {
+    if (DynSymRegion.Addr)
+      return Elf_Sym_Iter(DynSymRegion.EntSize,
+                          (const char *)DynSymRegion.Addr + DynSymRegion.Size,
+                          true);
+    return Elf_Sym_Iter(0, 0, true);
   }
 
-  Elf_Rela_Iter beginELFRela(const Elf_Shdr *sec) const {
+  Elf_Rela_Iter begin_rela(const Elf_Shdr *sec) const {
     return Elf_Rela_Iter(sec->sh_entsize,
                          (const char *)(base() + sec->sh_offset));
   }
 
-  Elf_Rela_Iter endELFRela(const Elf_Shdr *sec) const {
-    return Elf_Rela_Iter(sec->sh_entsize, (const char *)
-                         (base() + sec->sh_offset + sec->sh_size));
+  Elf_Rela_Iter end_rela(const Elf_Shdr *sec) const {
+    return Elf_Rela_Iter(
+        sec->sh_entsize,
+        (const char *)(base() + sec->sh_offset + sec->sh_size));
   }
 
-  Elf_Rel_Iter beginELFRel(const Elf_Shdr *sec) const {
+  Elf_Rel_Iter begin_rel(const Elf_Shdr *sec) const {
     return Elf_Rel_Iter(sec->sh_entsize,
                         (const char *)(base() + sec->sh_offset));
   }
 
-  Elf_Rel_Iter endELFRel(const Elf_Shdr *sec) const {
-    return Elf_Rel_Iter(sec->sh_entsize, (const char *)
-                        (base() + sec->sh_offset + sec->sh_size));
+  Elf_Rel_Iter end_rel(const Elf_Shdr *sec) const {
+    return Elf_Rel_Iter(sec->sh_entsize,
+                        (const char *)(base() + sec->sh_offset + sec->sh_size));
   }
 
   /// \brief Iterate over program header table.
@@ -806,43 +387,42 @@ public:
                            (Header->e_phnum * Header->e_phentsize));
   }
 
-  virtual uint8_t getBytesInAddress() const;
-  virtual StringRef getFileFormatName() const;
-  virtual StringRef getObjectType() const { return "ELF"; }
-  virtual unsigned getArch() const;
-  virtual StringRef getLoadName() const;
-  virtual error_code getSectionContents(const Elf_Shdr *sec,
-                                        StringRef &Res) const;
-
   uint64_t getNumSections() const;
-  uint64_t getStringTableIndex() const;
+  uintX_t getStringTableIndex() const;
   ELF::Elf64_Word getSymbolTableIndex(const Elf_Sym *symb) const;
-  const Elf_Ehdr *getElfHeader() const;
+  const Elf_Ehdr *getHeader() const { return Header; }
   const Elf_Shdr *getSection(const Elf_Sym *symb) const;
-  const Elf_Shdr *getElfSection(section_iterator &It) const;
-  const Elf_Sym *getElfSymbol(symbol_iterator &It) const;
-  const Elf_Sym *getElfSymbol(uint32_t index) const;
-
-  // Methods for type inquiry through isa, cast, and dyn_cast
-  bool isDyldType() const { return isDyldELFObject; }
-  static inline bool classof(const Binary *v) {
-    return v->getType() == getELFType(ELFT::TargetEndianness == support::little,
-                                      ELFT::Is64Bits);
-  }
+  const Elf_Shdr *getSection(uint32_t Index) const;
+  const Elf_Sym *getSymbol(uint32_t index) const;
+
+  ErrorOr<StringRef> getSymbolName(Elf_Sym_Iter Sym) const;
+
+  /// \brief Get the name of \p Symb.
+  /// \param SymTab The symbol table section \p Symb is contained in.
+  /// \param Symb The symbol to get the name of.
+  ///
+  /// \p SymTab is used to lookup the string table to use to get the symbol's
+  /// name.
+  ErrorOr<StringRef> getSymbolName(const Elf_Shdr *SymTab,
+                                   const Elf_Sym *Symb) const;
+  ErrorOr<StringRef> getSectionName(const Elf_Shdr *Section) const;
+  uint64_t getSymbolIndex(const Elf_Sym *sym) const;
+  ErrorOr<ArrayRef<uint8_t> > getSectionContents(const Elf_Shdr *Sec) const;
+  StringRef getLoadName() const;
 };
 
 // Use an alignment of 2 for the typedefs since that is the worst case for
 // ELF files in archives.
-typedef ELFObjectFile<ELFType<support::little, 2, false> > ELF32LEObjectFile;
-typedef ELFObjectFile<ELFType<support::little, 2, true> > ELF64LEObjectFile;
-typedef ELFObjectFile<ELFType<support::big, 2, false> > ELF32BEObjectFile;
-typedef ELFObjectFile<ELFType<support::big, 2, true> > ELF64BEObjectFile;
+typedef ELFFile<ELFType<support::little, 2, false> > ELF32LEFile;
+typedef ELFFile<ELFType<support::little, 2, true> > ELF64LEFile;
+typedef ELFFile<ELFType<support::big, 2, false> > ELF32BEFile;
+typedef ELFFile<ELFType<support::big, 2, true> > ELF64BEFile;
 
 // Iterate through the version definitions, and place each Elf_Verdef
 // in the VersionMap according to its index.
-template<class ELFT>
-void ELFObjectFile<ELFT>::LoadVersionDefs(const Elf_Shdr *sec) const {
-  unsigned vd_size = sec->sh_size; // Size of section in bytes
+template <class ELFT>
+void ELFFile<ELFT>::LoadVersionDefs(const Elf_Shdr *sec) const {
+  unsigned vd_size = sec->sh_size;  // Size of section in bytes
   unsigned vd_count = sec->sh_info; // Number of Verdef entries
   const char *sec_start = (const char*)base() + sec->sh_offset;
   const char *sec_end = sec_start + vd_size;
@@ -857,7 +437,7 @@ void ELFObjectFile<ELFT>::LoadVersionDefs(const Elf_Shdr *sec) const {
       report_fatal_error("Unexpected verdef version");
     size_t index = vd->vd_ndx & ELF::VERSYM_VERSION;
     if (index >= VersionMap.size())
-      VersionMap.resize(index+1);
+      VersionMap.resize(index + 1);
     VersionMap[index] = VersionMapEntry(vd);
     p += vd->vd_next;
   }
@@ -865,11 +445,11 @@ void ELFObjectFile<ELFT>::LoadVersionDefs(const Elf_Shdr *sec) const {
 
 // Iterate through the versions needed section, and place each Elf_Vernaux
 // in the VersionMap according to its index.
-template<class ELFT>
-void ELFObjectFile<ELFT>::LoadVersionNeeds(const Elf_Shdr *sec) const {
-  unsigned vn_size = sec->sh_size; // Size of section in bytes
+template <class ELFT>
+void ELFFile<ELFT>::LoadVersionNeeds(const Elf_Shdr *sec) const {
+  unsigned vn_size = sec->sh_size;  // Size of section in bytes
   unsigned vn_count = sec->sh_info; // Number of Verneed entries
-  const char *sec_start = (const char*)base() + sec->sh_offset;
+  const char *sec_start = (const char *)base() + sec->sh_offset;
   const char *sec_end = sec_start + vn_size;
   // The first Verneed entry is at the start of the section.
   const char *p = sec_start;
@@ -889,7 +469,7 @@ void ELFObjectFile<ELFT>::LoadVersionNeeds(const Elf_Shdr *sec) const {
       const Elf_Vernaux *vna = reinterpret_cast<const Elf_Vernaux *>(paux);
       size_t index = vna->vna_other & ELF::VERSYM_VERSION;
       if (index >= VersionMap.size())
-        VersionMap.resize(index+1);
+        VersionMap.resize(index + 1);
       VersionMap[index] = VersionMapEntry(vna);
       paux += vna->vna_next;
     }
@@ -897,11 +477,10 @@ void ELFObjectFile<ELFT>::LoadVersionNeeds(const Elf_Shdr *sec) const {
   }
 }
 
-template<class ELFT>
-void ELFObjectFile<ELFT>::LoadVersionMap() const {
+template <class ELFT>
+void ELFFile<ELFT>::LoadVersionMap() const {
   // If there is no dynamic symtab or version table, there is nothing to do.
-  if (getDynamicStringTableSectionHeader() == NULL ||
-      dot_gnu_version_sec == NULL)
+  if (DynSymRegion.Addr == NULL || dot_gnu_version_sec == NULL)
     return;
 
   // Has the VersionMap already been loaded?
@@ -920,64 +499,16 @@ void ELFObjectFile<ELFT>::LoadVersionMap() const {
     LoadVersionNeeds(dot_gnu_version_r_sec);
 }
 
-template<class ELFT>
-void ELFObjectFile<ELFT>::validateSymbol(DataRefImpl Symb) const {
-#ifndef NDEBUG
-  const Elf_Sym  *symb = getSymbol(Symb);
-  const Elf_Shdr *SymbolTableSection = getSection(Symb.d.b);
-  // FIXME: We really need to do proper error handling in the case of an invalid
-  //        input file. Because we don't use exceptions, I think we'll just pass
-  //        an error object around.
-  if (!(  symb
-        && SymbolTableSection
-        && symb >= (const Elf_Sym*)(base()
-                   + SymbolTableSection->sh_offset)
-        && symb <  (const Elf_Sym*)(base()
-                   + SymbolTableSection->sh_offset
-                   + SymbolTableSection->sh_size)))
-    // FIXME: Proper error handling.
-    report_fatal_error("Symb must point to a valid symbol!");
-#endif
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolNext(DataRefImpl Symb,
-                                              SymbolRef &Result) const {
-  validateSymbol(Symb);
-  ++Symb.d.a;
-  Result = SymbolRef(Symb, this);
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolName(DataRefImpl Symb,
-                                              StringRef &Result) const {
-  validateSymbol(Symb);
-  const Elf_Sym *symb = getSymbol(Symb);
-  return getSymbolName(getSection(Symb.d.b), symb, Result);
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolVersion(SymbolRef SymRef,
-                                                 StringRef &Version,
-                                                 bool &IsDefault) const {
-  DataRefImpl Symb = SymRef.getRawDataRefImpl();
-  validateSymbol(Symb);
-  const Elf_Sym *symb = getSymbol(Symb);
-  return getSymbolVersion(getSection(Symb.d.b), symb, Version, IsDefault);
-}
-
-template<class ELFT>
-ELF::Elf64_Word ELFObjectFile<ELFT>
-                             ::getSymbolTableIndex(const Elf_Sym *symb) const {
+template <class ELFT>
+ELF::Elf64_Word ELFFile<ELFT>::getSymbolTableIndex(const Elf_Sym *symb) const {
   if (symb->st_shndx == ELF::SHN_XINDEX)
     return ExtendedSymbolTable.lookup(symb);
   return symb->st_shndx;
 }
 
-template<class ELFT>
-const typename ELFObjectFile<ELFT>::Elf_Shdr *
-ELFObjectFile<ELFT>::getSection(const Elf_Sym *symb) const {
+template <class ELFT>
+const typename ELFFile<ELFT>::Elf_Shdr *
+ELFFile<ELFT>::getSection(const Elf_Sym *symb) const {
   if (symb->st_shndx == ELF::SHN_XINDEX)
     return getSection(ExtendedSymbolTable.lookup(symb));
   if (symb->st_shndx >= ELF::SHN_LORESERVE)
@@ -985,1282 +516,36 @@ ELFObjectFile<ELFT>::getSection(const Elf_Sym *symb) const {
   return getSection(symb->st_shndx);
 }
 
-template<class ELFT>
-const typename ELFObjectFile<ELFT>::Elf_Ehdr *
-ELFObjectFile<ELFT>::getElfHeader() const {
-  return Header;
-}
-
-template<class ELFT>
-const typename ELFObjectFile<ELFT>::Elf_Shdr *
-ELFObjectFile<ELFT>::getElfSection(section_iterator &It) const {
-  llvm::object::DataRefImpl ShdrRef = It->getRawDataRefImpl();
-  return reinterpret_cast<const Elf_Shdr *>(ShdrRef.p);
-}
-
-template<class ELFT>
-const typename ELFObjectFile<ELFT>::Elf_Sym *
-ELFObjectFile<ELFT>::getElfSymbol(symbol_iterator &It) const {
-  return getSymbol(It->getRawDataRefImpl());
-}
-
-template<class ELFT>
-const typename ELFObjectFile<ELFT>::Elf_Sym *
-ELFObjectFile<ELFT>::getElfSymbol(uint32_t index) const {
-  DataRefImpl SymbolData;
-  SymbolData.d.a = index;
-  SymbolData.d.b = SymbolTableIndex;
-  return getSymbol(SymbolData);
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolFileOffset(DataRefImpl Symb,
-                                                    uint64_t &Result) const {
-  validateSymbol(Symb);
-  const Elf_Sym  *symb = getSymbol(Symb);
-  const Elf_Shdr *Section;
-  switch (getSymbolTableIndex(symb)) {
-  case ELF::SHN_COMMON:
-   // Unintialized symbols have no offset in the object file
-  case ELF::SHN_UNDEF:
-    Result = UnknownAddressOrSize;
-    return object_error::success;
-  case ELF::SHN_ABS:
-    Result = symb->st_value;
-    return object_error::success;
-  default: Section = getSection(symb);
-  }
-
-  switch (symb->getType()) {
-  case ELF::STT_SECTION:
-    Result = Section ? Section->sh_offset : UnknownAddressOrSize;
-    return object_error::success;
-  case ELF::STT_FUNC:
-  case ELF::STT_OBJECT:
-  case ELF::STT_NOTYPE:
-    Result = symb->st_value +
-             (Section ? Section->sh_offset : 0);
-    return object_error::success;
-  default:
-    Result = UnknownAddressOrSize;
-    return object_error::success;
-  }
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb,
-                                                 uint64_t &Result) const {
-  validateSymbol(Symb);
-  const Elf_Sym  *symb = getSymbol(Symb);
-  const Elf_Shdr *Section;
-  switch (getSymbolTableIndex(symb)) {
-  case ELF::SHN_COMMON:
-  case ELF::SHN_UNDEF:
-    Result = UnknownAddressOrSize;
-    return object_error::success;
-  case ELF::SHN_ABS:
-    Result = symb->st_value;
-    return object_error::success;
-  default: Section = getSection(symb);
-  }
-
-  switch (symb->getType()) {
-  case ELF::STT_SECTION:
-    Result = Section ? Section->sh_addr : UnknownAddressOrSize;
-    return object_error::success;
-  case ELF::STT_FUNC:
-  case ELF::STT_OBJECT:
-  case ELF::STT_NOTYPE:
-    bool IsRelocatable;
-    switch(Header->e_type) {
-    case ELF::ET_EXEC:
-    case ELF::ET_DYN:
-      IsRelocatable = false;
-      break;
-    default:
-      IsRelocatable = true;
-    }
-    Result = symb->st_value;
-
-    // Clear the ARM/Thumb indicator flag.
-    if (Header->e_machine == ELF::EM_ARM)
-      Result &= ~1;
-
-    if (IsRelocatable && Section != 0)
-      Result += Section->sh_addr;
-    return object_error::success;
-  default:
-    Result = UnknownAddressOrSize;
-    return object_error::success;
-  }
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolAlignment(DataRefImpl Symb,
-                                                   uint32_t &Res) const {
-  uint32_t flags;
-  getSymbolFlags(Symb, flags);
-  if (flags & SymbolRef::SF_Common) {
-    uint64_t Value;
-    getSymbolValue(Symb, Value);
-    Res = Value;
-  } else {
-    Res = 0;
-  }
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolSize(DataRefImpl Symb,
-                                              uint64_t &Result) const {
-  validateSymbol(Symb);
-  const Elf_Sym  *symb = getSymbol(Symb);
-  if (symb->st_size == 0)
-    Result = UnknownAddressOrSize;
-  Result = symb->st_size;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolNMTypeChar(DataRefImpl Symb,
-                                                    char &Result) const {
-  validateSymbol(Symb);
-  const Elf_Sym  *symb = getSymbol(Symb);
-  const Elf_Shdr *Section = getSection(symb);
-
-  char ret = '?';
-
-  if (Section) {
-    switch (Section->sh_type) {
-    case ELF::SHT_PROGBITS:
-    case ELF::SHT_DYNAMIC:
-      switch (Section->sh_flags) {
-      case (ELF::SHF_ALLOC | ELF::SHF_EXECINSTR):
-        ret = 't'; break;
-      case (ELF::SHF_ALLOC | ELF::SHF_WRITE):
-        ret = 'd'; break;
-      case ELF::SHF_ALLOC:
-      case (ELF::SHF_ALLOC | ELF::SHF_MERGE):
-      case (ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS):
-        ret = 'r'; break;
-      }
-      break;
-    case ELF::SHT_NOBITS: ret = 'b';
-    }
-  }
-
-  switch (getSymbolTableIndex(symb)) {
-  case ELF::SHN_UNDEF:
-    if (ret == '?')
-      ret = 'U';
-    break;
-  case ELF::SHN_ABS: ret = 'a'; break;
-  case ELF::SHN_COMMON: ret = 'c'; break;
-  }
-
-  switch (symb->getBinding()) {
-  case ELF::STB_GLOBAL: ret = ::toupper(ret); break;
-  case ELF::STB_WEAK:
-    if (getSymbolTableIndex(symb) == ELF::SHN_UNDEF)
-      ret = 'w';
-    else
-      if (symb->getType() == ELF::STT_OBJECT)
-        ret = 'V';
-      else
-        ret = 'W';
-  }
-
-  if (ret == '?' && symb->getType() == ELF::STT_SECTION) {
-    StringRef name;
-    if (error_code ec = getSymbolName(Symb, name))
-      return ec;
-    Result = StringSwitch<char>(name)
-      .StartsWith(".debug", 'N')
-      .StartsWith(".note", 'n')
-      .Default('?');
-    return object_error::success;
-  }
-
-  Result = ret;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolType(DataRefImpl Symb,
-                                              SymbolRef::Type &Result) const {
-  validateSymbol(Symb);
-  const Elf_Sym  *symb = getSymbol(Symb);
-
-  switch (symb->getType()) {
-  case ELF::STT_NOTYPE:
-    Result = SymbolRef::ST_Unknown;
-    break;
-  case ELF::STT_SECTION:
-    Result = SymbolRef::ST_Debug;
-    break;
-  case ELF::STT_FILE:
-    Result = SymbolRef::ST_File;
-    break;
-  case ELF::STT_FUNC:
-    Result = SymbolRef::ST_Function;
-    break;
-  case ELF::STT_OBJECT:
-  case ELF::STT_COMMON:
-  case ELF::STT_TLS:
-    Result = SymbolRef::ST_Data;
-    break;
-  default:
-    Result = SymbolRef::ST_Other;
-    break;
-  }
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolFlags(DataRefImpl Symb,
-                                               uint32_t &Result) const {
-  validateSymbol(Symb);
-  const Elf_Sym  *symb = getSymbol(Symb);
-
-  Result = SymbolRef::SF_None;
-
-  if (symb->getBinding() != ELF::STB_LOCAL)
-    Result |= SymbolRef::SF_Global;
-
-  if (symb->getBinding() == ELF::STB_WEAK)
-    Result |= SymbolRef::SF_Weak;
-
-  if (symb->st_shndx == ELF::SHN_ABS)
-    Result |= SymbolRef::SF_Absolute;
-
-  if (symb->getType() == ELF::STT_FILE ||
-      symb->getType() == ELF::STT_SECTION ||
-      Symb == begin_symbols()->getRawDataRefImpl())
-    Result |= SymbolRef::SF_FormatSpecific;
-
-  if (getSymbolTableIndex(symb) == ELF::SHN_UNDEF)
-    Result |= SymbolRef::SF_Undefined;
-
-  if (symb->getType() == ELF::STT_COMMON ||
-      getSymbolTableIndex(symb) == ELF::SHN_COMMON)
-    Result |= SymbolRef::SF_Common;
-
-  if (symb->getType() == ELF::STT_TLS)
-    Result |= SymbolRef::SF_ThreadLocal;
-
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolSection(DataRefImpl Symb,
-                                                 section_iterator &Res) const {
-  validateSymbol(Symb);
-  const Elf_Sym  *symb = getSymbol(Symb);
-  const Elf_Shdr *sec = getSection(symb);
-  if (!sec)
-    Res = end_sections();
-  else {
-    DataRefImpl Sec;
-    Sec.p = reinterpret_cast<intptr_t>(sec);
-    Res = section_iterator(SectionRef(Sec, this));
-  }
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolValue(DataRefImpl Symb,
-                                               uint64_t &Val) const {
-  validateSymbol(Symb);
-  const Elf_Sym *symb = getSymbol(Symb);
-  Val = symb->st_value;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionNext(DataRefImpl Sec,
-                                               SectionRef &Result) const {
-  const uint8_t *sec = reinterpret_cast<const uint8_t *>(Sec.p);
-  sec += Header->e_shentsize;
-  Sec.p = reinterpret_cast<intptr_t>(sec);
-  Result = SectionRef(Sec, this);
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionName(DataRefImpl Sec,
-                                               StringRef &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  Result = StringRef(getString(dot_shstrtab_sec, sec->sh_name));
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionAddress(DataRefImpl Sec,
-                                                  uint64_t &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  Result = sec->sh_addr;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionSize(DataRefImpl Sec,
-                                               uint64_t &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  Result = sec->sh_size;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionContents(DataRefImpl Sec,
-                                                   StringRef &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  const char *start = (const char*)base() + sec->sh_offset;
-  Result = StringRef(start, sec->sh_size);
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionContents(const Elf_Shdr *Sec,
-                                                   StringRef &Result) const {
-  const char *start = (const char*)base() + Sec->sh_offset;
-  Result = StringRef(start, Sec->sh_size);
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionAlignment(DataRefImpl Sec,
-                                                    uint64_t &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  Result = sec->sh_addralign;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionText(DataRefImpl Sec,
-                                              bool &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  if (sec->sh_flags & ELF::SHF_EXECINSTR)
-    Result = true;
-  else
-    Result = false;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionData(DataRefImpl Sec,
-                                              bool &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  if (sec->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE)
-      && sec->sh_type == ELF::SHT_PROGBITS)
-    Result = true;
-  else
-    Result = false;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionBSS(DataRefImpl Sec,
-                                             bool &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  if (sec->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE)
-      && sec->sh_type == ELF::SHT_NOBITS)
-    Result = true;
-  else
-    Result = false;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionRequiredForExecution(
-    DataRefImpl Sec, bool &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  if (sec->sh_flags & ELF::SHF_ALLOC)
-    Result = true;
-  else
-    Result = false;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec,
-                                                 bool &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  if (sec->sh_type == ELF::SHT_NOBITS)
-    Result = true;
-  else
-    Result = false;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionZeroInit(DataRefImpl Sec,
-                                                  bool &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  // For ELF, all zero-init sections are virtual (that is, they occupy no space
-  //   in the object image) and vice versa.
-  Result = sec->sh_type == ELF::SHT_NOBITS;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::isSectionReadOnlyData(DataRefImpl Sec,
-                                                      bool &Result) const {
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  if (sec->sh_flags & ELF::SHF_WRITE || sec->sh_flags & ELF::SHF_EXECINSTR)
-    Result = false;
-  else
-    Result = true;
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::sectionContainsSymbol(DataRefImpl Sec,
-                                                      DataRefImpl Symb,
-                                                      bool &Result) const {
-  validateSymbol(Symb);
-
-  const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  const Elf_Sym  *symb = getSymbol(Symb);
-
-  unsigned shndx = symb->st_shndx;
-  bool Reserved = shndx >= ELF::SHN_LORESERVE
-               && shndx <= ELF::SHN_HIRESERVE;
-
-  Result = !Reserved && (sec == getSection(symb->st_shndx));
-  return object_error::success;
-}
-
-template<class ELFT>
-relocation_iterator
-ELFObjectFile<ELFT>::getSectionRelBegin(DataRefImpl Sec) const {
-  DataRefImpl RelData;
-  uintptr_t SHT = reinterpret_cast<uintptr_t>(SectionHeaderTable);
-  RelData.d.a = (Sec.p - SHT) / Header->e_shentsize;
-  RelData.d.b = 0;
-  return relocation_iterator(RelocationRef(RelData, this));
-}
-
-template<class ELFT>
-relocation_iterator
-ELFObjectFile<ELFT>::getSectionRelEnd(DataRefImpl Sec) const {
-  DataRefImpl RelData;
-  uintptr_t SHT = reinterpret_cast<uintptr_t>(SectionHeaderTable);
-  const Elf_Shdr *S = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  RelData.d.a = (Sec.p - SHT) / Header->e_shentsize;
-  if (S->sh_type != ELF::SHT_RELA && S->sh_type != ELF::SHT_REL)
-    RelData.d.b = 0;
-  else
-    RelData.d.b = S->sh_size / S->sh_entsize;
-
-  return relocation_iterator(RelocationRef(RelData, this));
-}
-
 template <class ELFT>
-section_iterator
-ELFObjectFile<ELFT>::getRelocatedSection(DataRefImpl Sec) const {
-  if (Header->e_type != ELF::ET_REL)
-    return end_sections();
-
-  const Elf_Shdr *S = reinterpret_cast<const Elf_Shdr *>(Sec.p);
-  unsigned sh_type = S->sh_type;
-  if (sh_type != ELF::SHT_RELA && sh_type != ELF::SHT_REL)
-    return end_sections();
-
-  assert(S->sh_info != 0);
-  const Elf_Shdr *R = getSection(S->sh_info);
-  DataRefImpl D;
-  D.p = reinterpret_cast<uintptr_t>(R);
-  return section_iterator(SectionRef(D, this));
-}
-
-// Relocations
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationNext(DataRefImpl Rel,
-                                                  RelocationRef &Result) const {
-  ++Rel.d.b;
-  Result = RelocationRef(Rel, this);
-  return object_error::success;
+const typename ELFFile<ELFT>::Elf_Sym *
+ELFFile<ELFT>::getSymbol(uint32_t Index) const {
+  return &*(begin_symbols() + Index);
 }
 
 template <class ELFT>
-symbol_iterator
-ELFObjectFile<ELFT>::getRelocationSymbol(DataRefImpl Rel) const {
-  uint32_t symbolIdx;
-  const Elf_Shdr *sec = getRelSection(Rel);
-  switch (sec->sh_type) {
-    default :
-      report_fatal_error("Invalid section type in Rel!");
-    case ELF::SHT_REL : {
-      symbolIdx = getRel(Rel)->getSymbol(isMips64EL());
-      break;
-    }
-    case ELF::SHT_RELA : {
-      symbolIdx = getRela(Rel)->getSymbol(isMips64EL());
-      break;
-    }
-  }
-  if (!symbolIdx)
-    return end_symbols();
-
-  DataRefImpl SymbolData;
-  SymbolData.d.a = symbolIdx;
-  SymbolData.d.b = sec->sh_link;
-  return symbol_iterator(SymbolRef(SymbolData, this));
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationAddress(DataRefImpl Rel,
-                                                     uint64_t &Result) const {
-  assert((Header->e_type == ELF::ET_EXEC || Header->e_type == ELF::ET_DYN) &&
-         "Only executable and shared objects files have addresses");
-  Result = getROffset(Rel);
-  return object_error::success;
+ErrorOr<ArrayRef<uint8_t> >
+ELFFile<ELFT>::getSectionContents(const Elf_Shdr *Sec) const {
+  if (Sec->sh_offset + Sec->sh_size > Buf->getBufferSize())
+    return object_error::parse_failed;
+  const uint8_t *Start = base() + Sec->sh_offset;
+  return ArrayRef<uint8_t>(Start, Sec->sh_size);
 }
 
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationOffset(DataRefImpl Rel,
-                                                    uint64_t &Result) const {
-  assert(Header->e_type == ELF::ET_REL &&
-         "Only relocatable object files have relocation offsets");
-  Result = getROffset(Rel);
-  return object_error::success;
-}
-
-template<class ELFT>
-uint64_t ELFObjectFile<ELFT>::getROffset(DataRefImpl Rel) const {
-  const Elf_Shdr *sec = getRelSection(Rel);
-  switch (sec->sh_type) {
-  default:
-    report_fatal_error("Invalid section type in Rel!");
-  case ELF::SHT_REL:
-    return getRel(Rel)->r_offset;
-  case ELF::SHT_RELA:
-    return getRela(Rel)->r_offset;
-  }
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationType(DataRefImpl Rel,
-                                                  uint64_t &Result) const {
-  const Elf_Shdr *sec = getRelSection(Rel);
-  switch (sec->sh_type) {
-    default :
-      report_fatal_error("Invalid section type in Rel!");
-    case ELF::SHT_REL : {
-      Result = getRel(Rel)->getType(isMips64EL());
-      break;
-    }
-    case ELF::SHT_RELA : {
-      Result = getRela(Rel)->getType(isMips64EL());
-      break;
-    }
-  }
-  return object_error::success;
-}
-
-#define LLVM_ELF_SWITCH_RELOC_TYPE_NAME(enum) \
-  case ELF::enum: Res = #enum; break;
-
-template<class ELFT>
-StringRef ELFObjectFile<ELFT>::getRelocationTypeName(uint32_t Type) const {
-  StringRef Res = "Unknown";
-  switch (Header->e_machine) {
-  case ELF::EM_X86_64:
-    switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_JUMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32S);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPMOD64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSGD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSLD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTTPOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPLT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLTOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32_TLSDESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_IRELATIVE);
-    default: break;
-    }
-    break;
-  case ELF::EM_386:
-    switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_JUMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTPC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32PLT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTIE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_PUSH);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_POP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_PUSH);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_POP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDO_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPMOD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTDESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_IRELATIVE);
-    default: break;
-    }
-    break;
-  case ELF::EM_MIPS:
-    switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_REL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_26);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_LITERAL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GPREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_UNUSED1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_UNUSED2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SHIFT5);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SHIFT6);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_DISP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_PAGE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_OFST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SUB);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_INSERT_A);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_INSERT_B);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_DELETE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HIGHER);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HIGHEST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SCN_DISP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_REL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_ADD_IMMEDIATE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PJUMP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_RELGOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JALR);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPMOD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPMOD64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_GD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_LDM);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_GOTTPREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JUMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NUM);
-    default: break;
-    }
-    break;
-  case ELF::EM_AARCH64:
-    switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G3);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD_PREL_LO19);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_LO21);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_PG_HI21);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADD_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST8_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TSTBR14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CONDBR19);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_JUMP26);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CALL26);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST16_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST32_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST64_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST128_ABS_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_GOT_PAGE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD64_GOT_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_HI12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_HI12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADR_PAGE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_LD64_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADD_LO12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_CALL);
-    default: break;
-    }
-    break;
-  case ELF::EM_ARM:
-    switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PC24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ABS5);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BREL_ADJ);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_SWI8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_XPC25);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_XPC22);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPMOD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_TPOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_PREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_ABS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_7_0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_15_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_23_15);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SBREL_11_0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_19_12_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_27_20_CK);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL31);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_V4BX);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PREL31);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_ABS_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_ABS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_PREL_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_PREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_ABS_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_ABS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_PREL_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_PREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP19);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP6);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ALU_PREL_11_0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32_NOI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32_NOI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_BREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL_NC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_BREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GOTDESC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESCSEQ);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_CALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32_ABS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_ABS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_PREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTRELAX);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTENTRY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTINHERIT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP11);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDM32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE12GP);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_3);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_4);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_5);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_6);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_7);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_9);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_10);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_11);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_13);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_15);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ME_TOO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ32);
-    default: break;
-    }
-    break;
-  case ELF::EM_HEXAGON:
-    switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_0);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_1);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_2);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_3);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B32_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_12_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_10_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_9_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_7_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_JMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_PLT_B22_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPMOD_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_PLT_B22_PCREL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_LO16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_HI16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_PCREL_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_11_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32_6_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16_X);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_11_X);
-    default: break;
-    }
-    break;
-  case ELF::EM_PPC:
-    switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14_BRTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14_BRNTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14_BRTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14_BRNTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPMOD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLSGD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLSLD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_HA);
-    default: break;
-    }
-    break;
-  case ELF::EM_PPC64:
-    switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14_BRTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14_BRNTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL24);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14_BRTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14_BRNTAKEN);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHER);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHERA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHEST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHESTA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPMOD64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_HA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHER);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHERA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHEST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHESTA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_LO_DS);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHER);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHERA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHEST);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHESTA);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLSGD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLSLD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_LO);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_HI);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_HA);
-    default: break;
-    }
-    break;
-  case ELF::EM_S390:
-    switch (Type) {
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_NONE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_8);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_COPY);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GLOB_DAT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_JMP_SLOT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_RELATIVE);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPC);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC16DBL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT16DBL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC32DBL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT32DBL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPCDBL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTENT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLTENT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF16);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LOAD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GDCALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDCALL);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GD32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GD64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE12);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDM32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDM64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IE64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IEENT);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LE32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LE64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDO32);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDO64);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_DTPMOD);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_DTPOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_TPOFF);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_20);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT20);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT20);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE20);
-      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_IRELATIVE);
-    default: break;
-    }
-    break;
-  default: break;
-  }
-  return Res;
+template <class ELFT>
+StringRef ELFFile<ELFT>::getRelocationTypeName(uint32_t Type) const {
+  return getELFRelocationTypeName(Header->e_machine, Type);
 }
 
-#undef LLVM_ELF_SWITCH_RELOC_TYPE_NAME
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationTypeName(
-    DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
-  const Elf_Shdr *sec = getRelSection(Rel);
-  uint32_t type;
-  switch (sec->sh_type) {
-    default :
-      return object_error::parse_failed;
-    case ELF::SHT_REL : {
-      type = getRel(Rel)->getType(isMips64EL());
-      break;
-    }
-    case ELF::SHT_RELA : {
-      type = getRela(Rel)->getType(isMips64EL());
-      break;
-    }
-  }
-
+template <class ELFT>
+void ELFFile<ELFT>::getRelocationTypeName(uint32_t Type,
+                                          SmallVectorImpl<char> &Result) const {
   if (!isMips64EL()) {
-    StringRef Name = getRelocationTypeName(type);
+    StringRef Name = getRelocationTypeName(Type);
     Result.append(Name.begin(), Name.end());
   } else {
-    uint8_t Type1 = (type >>  0) & 0xFF;
-    uint8_t Type2 = (type >>  8) & 0xFF;
-    uint8_t Type3 = (type >> 16) & 0xFF;
+    uint8_t Type1 = (Type >> 0) & 0xFF;
+    uint8_t Type2 = (Type >> 8) & 0xFF;
+    uint8_t Type3 = (Type >> 16) & 0xFF;
 
     // Concat all three relocation type names.
     StringRef Name = getRelocationTypeName(Type1);
@@ -2274,135 +559,63 @@ error_code ELFObjectFile<ELFT>::getRelocationTypeName(
     Result.append(1, '/');
     Result.append(Name.begin(), Name.end());
   }
-
-  return object_error::success;
 }
 
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationAddend(
-    DataRefImpl Rel, int64_t &Result) const {
-  const Elf_Shdr *sec = getRelSection(Rel);
-  switch (sec->sh_type) {
-    default :
-      report_fatal_error("Invalid section type in Rel!");
-    case ELF::SHT_REL : {
-      Result = 0;
-      return object_error::success;
-    }
-    case ELF::SHT_RELA : {
-      Result = getRela(Rel)->r_addend;
-      return object_error::success;
-    }
-  }
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getRelocationValueString(
-    DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
-  const Elf_Shdr *sec = getRelSection(Rel);
-  uint8_t type;
-  StringRef res;
-  int64_t addend = 0;
-  uint16_t symbol_index = 0;
-  switch (sec->sh_type) {
-    default:
-      return object_error::parse_failed;
-    case ELF::SHT_REL: {
-      type = getRel(Rel)->getType(isMips64EL());
-      symbol_index = getRel(Rel)->getSymbol(isMips64EL());
-      // TODO: Read implicit addend from section data.
-      break;
-    }
-    case ELF::SHT_RELA: {
-      type = getRela(Rel)->getType(isMips64EL());
-      symbol_index = getRela(Rel)->getSymbol(isMips64EL());
-      addend = getRela(Rel)->r_addend;
-      break;
-    }
-  }
-  const Elf_Sym *symb = getEntry<Elf_Sym>(sec->sh_link, symbol_index);
-  StringRef symname;
-  if (error_code ec = getSymbolName(getSection(sec->sh_link), symb, symname))
-    return ec;
-  switch (Header->e_machine) {
-  case ELF::EM_X86_64:
-    switch (type) {
-    case ELF::R_X86_64_PC8:
-    case ELF::R_X86_64_PC16:
-    case ELF::R_X86_64_PC32: {
-        std::string fmtbuf;
-        raw_string_ostream fmt(fmtbuf);
-        fmt << symname << (addend < 0 ? "" : "+") << addend << "-P";
-        fmt.flush();
-        Result.append(fmtbuf.begin(), fmtbuf.end());
-      }
-      break;
-    case ELF::R_X86_64_8:
-    case ELF::R_X86_64_16:
-    case ELF::R_X86_64_32:
-    case ELF::R_X86_64_32S:
-    case ELF::R_X86_64_64: {
-        std::string fmtbuf;
-        raw_string_ostream fmt(fmtbuf);
-        fmt << symname << (addend < 0 ? "" : "+") << addend;
-        fmt.flush();
-        Result.append(fmtbuf.begin(), fmtbuf.end());
-      }
-      break;
-    default:
-      res = "Unknown";
-    }
-    break;
-  case ELF::EM_AARCH64: {
-    std::string fmtbuf;
-    raw_string_ostream fmt(fmtbuf);
-    fmt << symname;
-    if (addend != 0)
-      fmt << (addend < 0 ? "" : "+") << addend;
-    fmt.flush();
-    Result.append(fmtbuf.begin(), fmtbuf.end());
-    break;
-  }
-  case ELF::EM_ARM:
-  case ELF::EM_HEXAGON:
-    res = symname;
-    break;
-  default:
-    res = "Unknown";
-  }
-  if (Result.empty())
-    Result.append(res.begin(), res.end());
-  return object_error::success;
+template <class ELFT>
+template <class RelT>
+std::pair<const typename ELFFile<ELFT>::Elf_Shdr *,
+          const typename ELFFile<ELFT>::Elf_Sym *>
+ELFFile<ELFT>::getRelocationSymbol(const Elf_Shdr *Sec, const RelT *Rel) const {
+  if (!Sec->sh_link)
+    return std::make_pair((const Elf_Shdr *)0, (const Elf_Sym *)0);
+  const Elf_Shdr *SymTable = getSection(Sec->sh_link);
+  return std::make_pair(
+      SymTable, getEntry<Elf_Sym>(SymTable, Rel->getSymbol(isMips64EL())));
 }
 
 // Verify that the last byte in the string table in a null.
-template<class ELFT>
-void ELFObjectFile<ELFT>::VerifyStrTab(const Elf_Shdr *sh) const {
-  const char *strtab = (const char*)base() + sh->sh_offset;
+template <class ELFT>
+void ELFFile<ELFT>::VerifyStrTab(const Elf_Shdr *sh) const {
+  const char *strtab = (const char *)base() + sh->sh_offset;
   if (strtab[sh->sh_size - 1] != 0)
     // FIXME: Proper error handling.
     report_fatal_error("String table must end with a null terminator!");
 }
 
-template<class ELFT>
-ELFObjectFile<ELFT>::ELFObjectFile(MemoryBuffer *Object, error_code &ec)
-  : ObjectFile(getELFType(
-      static_cast<endianness>(ELFT::TargetEndianness) == support::little,
-      ELFT::Is64Bits),
-      Object)
-  , isDyldELFObject(false)
-  , SectionHeaderTable(0)
-  , dot_shstrtab_sec(0)
-  , dot_strtab_sec(0)
-  , dot_dynstr_sec(0)
-  , dot_dynamic_sec(0)
-  , dot_gnu_version_sec(0)
-  , dot_gnu_version_r_sec(0)
-  , dot_gnu_version_d_sec(0)
-  , dt_soname(0)
- {
-
-  const uint64_t FileSize = Data->getBufferSize();
+template <class ELFT>
+uint64_t ELFFile<ELFT>::getNumSections() const {
+  assert(Header && "Header not initialized!");
+  if (Header->e_shnum == ELF::SHN_UNDEF) {
+    assert(SectionHeaderTable && "SectionHeaderTable not initialized!");
+    return SectionHeaderTable->sh_size;
+  }
+  return Header->e_shnum;
+}
+
+template <class ELFT>
+typename ELFFile<ELFT>::uintX_t ELFFile<ELFT>::getStringTableIndex() const {
+  if (Header->e_shnum == ELF::SHN_UNDEF) {
+    if (Header->e_shstrndx == ELF::SHN_HIRESERVE)
+      return SectionHeaderTable->sh_link;
+    if (Header->e_shstrndx >= getNumSections())
+      return 0;
+  }
+  return Header->e_shstrndx;
+}
+
+template <class ELFT>
+ELFFile<ELFT>::ELFFile(MemoryBuffer *Object, error_code &ec)
+    : Buf(Object),
+      SectionHeaderTable(0),
+      dot_shstrtab_sec(0),
+      dot_strtab_sec(0),
+      dot_symtab_sec(0),
+      SymbolTableSectionHeaderIndex(0),
+      dot_gnu_version_sec(0),
+      dot_gnu_version_r_sec(0),
+      dot_gnu_version_d_sec(0),
+      dt_soname(0) {
+  const uint64_t FileSize = Buf->getBufferSize();
 
   if (sizeof(Elf_Ehdr) > FileSize)
     // FIXME: Proper error handling.
@@ -2428,68 +641,64 @@ ELFObjectFile<ELFT>::ELFObjectFile(MemoryBuffer *Object, error_code &ec)
     // FIXME: Proper error handling.
     report_fatal_error("Section table goes past end of file!");
 
-  // To find the symbol tables we walk the section table to find SHT_SYMTAB.
-  const Elf_Shdr* SymbolTableSectionHeaderIndex = 0;
-  const Elf_Shdr* sh = SectionHeaderTable;
+  // Scan sections for special sections.
 
-  SymbolTableIndex = -1;
-  DynamicSymbolTableIndex = -1;
-
-  for (uint64_t i = 0, e = getNumSections(); i != e; ++i) {
-    switch (sh->sh_type) {
-    case ELF::SHT_SYMTAB_SHNDX: {
+  for (Elf_Shdr_Iter SecI = begin_sections(), SecE = end_sections();
+       SecI != SecE; ++SecI) {
+    switch (SecI->sh_type) {
+    case ELF::SHT_SYMTAB_SHNDX:
       if (SymbolTableSectionHeaderIndex)
         // FIXME: Proper error handling.
         report_fatal_error("More than one .symtab_shndx!");
-      SymbolTableSectionHeaderIndex = sh;
+      SymbolTableSectionHeaderIndex = &*SecI;
       break;
-    }
-    case ELF::SHT_SYMTAB: {
-      if (SymbolTableIndex != -1)
-        report_fatal_error("More than one SHT_SYMTAB!");
-      SymbolTableIndex = i;
+    case ELF::SHT_SYMTAB:
+      if (dot_symtab_sec)
+        // FIXME: Proper error handling.
+        report_fatal_error("More than one .symtab!");
+      dot_symtab_sec = &*SecI;
+      dot_strtab_sec = getSection(SecI->sh_link);
       break;
-    }
     case ELF::SHT_DYNSYM: {
-      if (DynamicSymbolTableIndex != -1)
+      if (DynSymRegion.Addr)
         // FIXME: Proper error handling.
-        report_fatal_error("More than one SHT_DYNSYM!");
-      DynamicSymbolTableIndex = i;
+        report_fatal_error("More than one .dynsym!");
+      DynSymRegion.Addr = base() + SecI->sh_offset;
+      DynSymRegion.Size = SecI->sh_size;
+      DynSymRegion.EntSize = SecI->sh_entsize;
+      const Elf_Shdr *DynStr = getSection(SecI->sh_link);
+      DynStrRegion.Addr = base() + DynStr->sh_offset;
+      DynStrRegion.Size = DynStr->sh_size;
+      DynStrRegion.EntSize = DynStr->sh_entsize;
       break;
     }
-    case ELF::SHT_REL:
-    case ELF::SHT_RELA:
-      break;
-    case ELF::SHT_DYNAMIC: {
-      if (dot_dynamic_sec != NULL)
+    case ELF::SHT_DYNAMIC:
+      if (DynamicRegion.Addr)
         // FIXME: Proper error handling.
         report_fatal_error("More than one .dynamic!");
-      dot_dynamic_sec = sh;
+      DynamicRegion.Addr = base() + SecI->sh_offset;
+      DynamicRegion.Size = SecI->sh_size;
+      DynamicRegion.EntSize = SecI->sh_entsize;
       break;
-    }
-    case ELF::SHT_GNU_versym: {
+    case ELF::SHT_GNU_versym:
       if (dot_gnu_version_sec != NULL)
         // FIXME: Proper error handling.
         report_fatal_error("More than one .gnu.version section!");
-      dot_gnu_version_sec = sh;
+      dot_gnu_version_sec = &*SecI;
       break;
-    }
-    case ELF::SHT_GNU_verdef: {
+    case ELF::SHT_GNU_verdef:
       if (dot_gnu_version_d_sec != NULL)
         // FIXME: Proper error handling.
         report_fatal_error("More than one .gnu.version_d section!");
-      dot_gnu_version_d_sec = sh;
+      dot_gnu_version_d_sec = &*SecI;
       break;
-    }
-    case ELF::SHT_GNU_verneed: {
+    case ELF::SHT_GNU_verneed:
       if (dot_gnu_version_r_sec != NULL)
         // FIXME: Proper error handling.
         report_fatal_error("More than one .gnu.version_r section!");
-      dot_gnu_version_r_sec = sh;
+      dot_gnu_version_r_sec = &*SecI;
       break;
     }
-    }
-    ++sh;
   }
 
   // Get string table sections.
@@ -2499,173 +708,117 @@ ELFObjectFile<ELFT>::ELFObjectFile(MemoryBuffer *Object, error_code &ec)
     VerifyStrTab(dot_shstrtab_sec);
   }
 
-  // Merge this into the above loop.
-  for (const char *i = reinterpret_cast<const char *>(SectionHeaderTable),
-                  *e = i + getNumSections() * Header->e_shentsize;
-                   i != e; i += Header->e_shentsize) {
-    const Elf_Shdr *sh = reinterpret_cast<const Elf_Shdr*>(i);
-    if (sh->sh_type == ELF::SHT_STRTAB) {
-      StringRef SectionName(getString(dot_shstrtab_sec, sh->sh_name));
-      if (SectionName == ".strtab") {
-        if (dot_strtab_sec != 0)
-          // FIXME: Proper error handling.
-          report_fatal_error("Already found section named .strtab!");
-        dot_strtab_sec = sh;
-        VerifyStrTab(dot_strtab_sec);
-      } else if (SectionName == ".dynstr") {
-        if (dot_dynstr_sec != 0)
-          // FIXME: Proper error handling.
-          report_fatal_error("Already found section named .dynstr!");
-        dot_dynstr_sec = sh;
-        VerifyStrTab(dot_dynstr_sec);
-      }
-    }
-  }
-
   // Build symbol name side-mapping if there is one.
   if (SymbolTableSectionHeaderIndex) {
     const Elf_Word *ShndxTable = reinterpret_cast<const Elf_Word*>(base() +
                                       SymbolTableSectionHeaderIndex->sh_offset);
-    error_code ec;
-    for (symbol_iterator si = begin_symbols(),
-                         se = end_symbols(); si != se; si.increment(ec)) {
-      if (ec)
-        report_fatal_error("Fewer extended symbol table entries than symbols!");
+    for (Elf_Sym_Iter SI = begin_symbols(), SE = end_symbols(); SI != SE;
+         ++SI) {
       if (*ShndxTable != ELF::SHN_UNDEF)
-        ExtendedSymbolTable[getSymbol(si->getRawDataRefImpl())] = *ShndxTable;
+        ExtendedSymbolTable[&*SI] = *ShndxTable;
       ++ShndxTable;
     }
   }
+
+  // Scan program headers.
+  for (Elf_Phdr_Iter PhdrI = begin_program_headers(),
+                     PhdrE = end_program_headers();
+       PhdrI != PhdrE; ++PhdrI) {
+    if (PhdrI->p_type == ELF::PT_DYNAMIC) {
+      DynamicRegion.Addr = base() + PhdrI->p_offset;
+      DynamicRegion.Size = PhdrI->p_filesz;
+      DynamicRegion.EntSize = sizeof(Elf_Dyn);
+      break;
+    }
+  }
+
+  ec = error_code::success();
 }
 
 // Get the symbol table index in the symtab section given a symbol
-template<class ELFT>
-uint64_t ELFObjectFile<ELFT>::getSymbolIndex(const Elf_Sym *Sym) const {
-  const Elf_Shdr *SymTab = getSection(SymbolTableIndex);
+template <class ELFT>
+uint64_t ELFFile<ELFT>::getSymbolIndex(const Elf_Sym *Sym) const {
   uintptr_t SymLoc = uintptr_t(Sym);
-  uintptr_t SymTabLoc = uintptr_t(base() + SymTab->sh_offset);
+  uintptr_t SymTabLoc = uintptr_t(base() + dot_symtab_sec->sh_offset);
   assert(SymLoc > SymTabLoc && "Symbol not in symbol table!");
   uint64_t SymOffset = SymLoc - SymTabLoc;
-  assert(SymOffset % SymTab->sh_entsize == 0 &&
+  assert(SymOffset % dot_symtab_sec->sh_entsize == 0 &&
          "Symbol not multiple of symbol size!");
-  return SymOffset / SymTab->sh_entsize;
+  return SymOffset / dot_symtab_sec->sh_entsize;
 }
 
-template<class ELFT>
-symbol_iterator ELFObjectFile<ELFT>::begin_symbols() const {
-  DataRefImpl SymbolData;
-  if (SymbolTableIndex == -1) {
-    SymbolData.d.a = 0;
-    SymbolData.d.b = 0;
-  } else {
-    SymbolData.d.a = 0;
-    SymbolData.d.b = SymbolTableIndex;
-  }
-  return symbol_iterator(SymbolRef(SymbolData, this));
+template <class ELFT>
+typename ELFFile<ELFT>::Elf_Shdr_Iter ELFFile<ELFT>::begin_sections() const {
+  return Elf_Shdr_Iter(Header->e_shentsize,
+                       (const char *)base() + Header->e_shoff);
 }
 
-template<class ELFT>
-symbol_iterator ELFObjectFile<ELFT>::end_symbols() const {
-  DataRefImpl SymbolData;
-  if (SymbolTableIndex == -1) {
-    SymbolData.d.a = 0;
-    SymbolData.d.b = 0;
-  } else {
-    const Elf_Shdr *SymbolTableSection = getSection(SymbolTableIndex);
-    SymbolData.d.a = SymbolTableSection->getEntityCount();
-    SymbolData.d.b = SymbolTableIndex;
-  }
-  return symbol_iterator(SymbolRef(SymbolData, this));
+template <class ELFT>
+typename ELFFile<ELFT>::Elf_Shdr_Iter ELFFile<ELFT>::end_sections() const {
+  return Elf_Shdr_Iter(Header->e_shentsize,
+                       (const char *)base() + Header->e_shoff +
+                           (getNumSections() * Header->e_shentsize));
 }
 
-template<class ELFT>
-symbol_iterator ELFObjectFile<ELFT>::begin_dynamic_symbols() const {
-  DataRefImpl SymbolData;
-  if (DynamicSymbolTableIndex == -1) {
-    SymbolData.d.a = 0;
-    SymbolData.d.b = 0;
-  } else {
-    SymbolData.d.a = 0;
-    SymbolData.d.b = DynamicSymbolTableIndex;
-  }
-  return symbol_iterator(SymbolRef(SymbolData, this));
+template <class ELFT>
+typename ELFFile<ELFT>::Elf_Sym_Iter ELFFile<ELFT>::begin_symbols() const {
+  if (!dot_symtab_sec)
+    return Elf_Sym_Iter(0, 0, false);
+  return Elf_Sym_Iter(dot_symtab_sec->sh_entsize,
+                      (const char *)base() + dot_symtab_sec->sh_offset, false);
 }
 
-template<class ELFT>
-symbol_iterator ELFObjectFile<ELFT>::end_dynamic_symbols() const {
-  DataRefImpl SymbolData;
-  if (DynamicSymbolTableIndex == -1) {
-    SymbolData.d.a = 0;
-    SymbolData.d.b = 0;
-  } else {
-    const Elf_Shdr *SymbolTableSection = getSection(DynamicSymbolTableIndex);
-    SymbolData.d.a = SymbolTableSection->getEntityCount();
-    SymbolData.d.b = DynamicSymbolTableIndex;
-  }
-  return symbol_iterator(SymbolRef(SymbolData, this));
+template <class ELFT>
+typename ELFFile<ELFT>::Elf_Sym_Iter ELFFile<ELFT>::end_symbols() const {
+  if (!dot_symtab_sec)
+    return Elf_Sym_Iter(0, 0, false);
+  return Elf_Sym_Iter(dot_symtab_sec->sh_entsize,
+                      (const char *)base() + dot_symtab_sec->sh_offset +
+                          dot_symtab_sec->sh_size,
+                      false);
 }
 
-template<class ELFT>
-section_iterator ELFObjectFile<ELFT>::begin_sections() const {
-  DataRefImpl ret;
-  ret.p = reinterpret_cast<intptr_t>(base() + Header->e_shoff);
-  return section_iterator(SectionRef(ret, this));
+template <class ELFT>
+typename ELFFile<ELFT>::Elf_Dyn_Iter
+ELFFile<ELFT>::begin_dynamic_table() const {
+  if (DynamicRegion.Addr)
+    return Elf_Dyn_Iter(DynamicRegion.EntSize,
+                        (const char *)DynamicRegion.Addr);
+  return Elf_Dyn_Iter(0, 0);
 }
 
-template<class ELFT>
-section_iterator ELFObjectFile<ELFT>::end_sections() const {
-  DataRefImpl ret;
-  ret.p = reinterpret_cast<intptr_t>(base()
-                                     + Header->e_shoff
-                                     + (Header->e_shentsize*getNumSections()));
-  return section_iterator(SectionRef(ret, this));
-}
+template <class ELFT>
+typename ELFFile<ELFT>::Elf_Dyn_Iter
+ELFFile<ELFT>::end_dynamic_table(bool NULLEnd) const {
+  if (!DynamicRegion.Addr)
+    return Elf_Dyn_Iter(0, 0);
+  Elf_Dyn_Iter Ret(DynamicRegion.EntSize,
+                    (const char *)DynamicRegion.Addr + DynamicRegion.Size);
 
-template<class ELFT>
-typename ELFObjectFile<ELFT>::Elf_Dyn_iterator
-ELFObjectFile<ELFT>::begin_dynamic_table() const {
-  if (dot_dynamic_sec)
-    return Elf_Dyn_iterator(dot_dynamic_sec->sh_entsize,
-                            (const char *)base() + dot_dynamic_sec->sh_offset);
-  return Elf_Dyn_iterator(0, 0);
-}
+  if (NULLEnd) {
+    Elf_Dyn_Iter Start = begin_dynamic_table();
+    while (Start != Ret && Start->getTag() != ELF::DT_NULL)
+      ++Start;
 
-template<class ELFT>
-typename ELFObjectFile<ELFT>::Elf_Dyn_iterator
-ELFObjectFile<ELFT>::end_dynamic_table(bool NULLEnd) const {
-  if (dot_dynamic_sec) {
-    Elf_Dyn_iterator Ret(dot_dynamic_sec->sh_entsize,
-                         (const char *)base() + dot_dynamic_sec->sh_offset +
-                         dot_dynamic_sec->sh_size);
-
-    if (NULLEnd) {
-      Elf_Dyn_iterator Start = begin_dynamic_table();
-      while (Start != Ret && Start->getTag() != ELF::DT_NULL)
-        ++Start;
-
-      // Include the DT_NULL.
-      if (Start != Ret)
-        ++Start;
-      Ret = Start;
-    }
-    return Ret;
+    // Include the DT_NULL.
+    if (Start != Ret)
+      ++Start;
+    Ret = Start;
   }
-  return Elf_Dyn_iterator(0, 0);
+  return Ret;
 }
 
-template<class ELFT>
-StringRef ELFObjectFile<ELFT>::getLoadName() const {
+template <class ELFT>
+StringRef ELFFile<ELFT>::getLoadName() const {
   if (!dt_soname) {
     // Find the DT_SONAME entry
-    Elf_Dyn_iterator it = begin_dynamic_table();
-    Elf_Dyn_iterator ie = end_dynamic_table();
+    Elf_Dyn_Iter it = begin_dynamic_table();
+    Elf_Dyn_Iter ie = end_dynamic_table();
     while (it != ie && it->getTag() != ELF::DT_SONAME)
       ++it;
 
     if (it != ie) {
-      if (dot_dynstr_sec == NULL)
-        report_fatal_error("Dynamic string table is missing");
-      dt_soname = getString(dot_dynstr_sec, it->getVal());
+      dt_soname = getDynamicString(it->getVal());
     } else {
       dt_soname = "";
     }
@@ -2673,210 +826,23 @@ StringRef ELFObjectFile<ELFT>::getLoadName() const {
   return dt_soname;
 }
 
-template<class ELFT>
-library_iterator ELFObjectFile<ELFT>::begin_libraries_needed() const {
-  // Find the first DT_NEEDED entry
-  Elf_Dyn_iterator i = begin_dynamic_table();
-  Elf_Dyn_iterator e = end_dynamic_table();
-  while (i != e && i->getTag() != ELF::DT_NEEDED)
-    ++i;
-
-  DataRefImpl DRI;
-  DRI.p = reinterpret_cast<uintptr_t>(i.get());
-  return library_iterator(LibraryRef(DRI, this));
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getLibraryNext(DataRefImpl Data,
-                                               LibraryRef &Result) const {
-  // Use the same DataRefImpl format as DynRef.
-  Elf_Dyn_iterator i = Elf_Dyn_iterator(dot_dynamic_sec->sh_entsize,
-                                        reinterpret_cast<const char *>(Data.p));
-  Elf_Dyn_iterator e = end_dynamic_table();
-
-  // Skip the current dynamic table entry and find the next DT_NEEDED entry.
-  do
-    ++i;
-  while (i != e && i->getTag() != ELF::DT_NEEDED);
-
-  DataRefImpl DRI;
-  DRI.p = reinterpret_cast<uintptr_t>(i.get());
-  Result = LibraryRef(DRI, this);
-  return object_error::success;
-}
-
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getLibraryPath(DataRefImpl Data,
-                                               StringRef &Res) const {
-  Elf_Dyn_iterator i = Elf_Dyn_iterator(dot_dynamic_sec->sh_entsize,
-                                        reinterpret_cast<const char *>(Data.p));
-  if (i == end_dynamic_table())
-    report_fatal_error("getLibraryPath() called on iterator end");
-
-  if (i->getTag() != ELF::DT_NEEDED)
-    report_fatal_error("Invalid library_iterator");
-
-  // This uses .dynstr to lookup the name of the DT_NEEDED entry.
-  // THis works as long as DT_STRTAB == .dynstr. This is true most of
-  // the time, but the specification allows exceptions.
-  // TODO: This should really use DT_STRTAB instead. Doing this requires
-  // reading the program headers.
-  if (dot_dynstr_sec == NULL)
-    report_fatal_error("Dynamic string table is missing");
-  Res = getString(dot_dynstr_sec, i->getVal());
-  return object_error::success;
-}
-
-template<class ELFT>
-library_iterator ELFObjectFile<ELFT>::end_libraries_needed() const {
-  Elf_Dyn_iterator e = end_dynamic_table();
-  DataRefImpl DRI;
-  DRI.p = reinterpret_cast<uintptr_t>(e.get());
-  return library_iterator(LibraryRef(DRI, this));
-}
-
-template<class ELFT>
-uint8_t ELFObjectFile<ELFT>::getBytesInAddress() const {
-  return ELFT::Is64Bits ? 8 : 4;
-}
-
-template<class ELFT>
-StringRef ELFObjectFile<ELFT>::getFileFormatName() const {
-  switch(Header->e_ident[ELF::EI_CLASS]) {
-  case ELF::ELFCLASS32:
-    switch(Header->e_machine) {
-    case ELF::EM_386:
-      return "ELF32-i386";
-    case ELF::EM_X86_64:
-      return "ELF32-x86-64";
-    case ELF::EM_ARM:
-      return "ELF32-arm";
-    case ELF::EM_HEXAGON:
-      return "ELF32-hexagon";
-    case ELF::EM_MIPS:
-      return "ELF32-mips";
-    case ELF::EM_PPC:
-      return "ELF32-ppc";
-    default:
-      return "ELF32-unknown";
-    }
-  case ELF::ELFCLASS64:
-    switch(Header->e_machine) {
-    case ELF::EM_386:
-      return "ELF64-i386";
-    case ELF::EM_X86_64:
-      return "ELF64-x86-64";
-    case ELF::EM_AARCH64:
-      return "ELF64-aarch64";
-    case ELF::EM_PPC64:
-      return "ELF64-ppc64";
-    case ELF::EM_S390:
-      return "ELF64-s390";
-    default:
-      return "ELF64-unknown";
-    }
-  default:
-    // FIXME: Proper error handling.
-    report_fatal_error("Invalid ELFCLASS!");
-  }
-}
-
-template<class ELFT>
-unsigned ELFObjectFile<ELFT>::getArch() const {
-  switch(Header->e_machine) {
-  case ELF::EM_386:
-    return Triple::x86;
-  case ELF::EM_X86_64:
-    return Triple::x86_64;
-  case ELF::EM_AARCH64:
-    return Triple::aarch64;
-  case ELF::EM_ARM:
-    return Triple::arm;
-  case ELF::EM_HEXAGON:
-    return Triple::hexagon;
-  case ELF::EM_MIPS:
-    return (ELFT::TargetEndianness == support::little) ?
-           Triple::mipsel : Triple::mips;
-  case ELF::EM_PPC64:
-    return (ELFT::TargetEndianness == support::little) ?
-           Triple::ppc64le : Triple::ppc64;
-  case ELF::EM_S390:
-    return Triple::systemz;
-  default:
-    return Triple::UnknownArch;
-  }
-}
-
-template<class ELFT>
-uint64_t ELFObjectFile<ELFT>::getNumSections() const {
-  assert(Header && "Header not initialized!");
-  if (Header->e_shnum == ELF::SHN_UNDEF) {
-    assert(SectionHeaderTable && "SectionHeaderTable not initialized!");
-    return SectionHeaderTable->sh_size;
-  }
-  return Header->e_shnum;
-}
-
-template<class ELFT>
-uint64_t
-ELFObjectFile<ELFT>::getStringTableIndex() const {
-  if (Header->e_shnum == ELF::SHN_UNDEF) {
-    if (Header->e_shstrndx == ELF::SHN_HIRESERVE)
-      return SectionHeaderTable->sh_link;
-    if (Header->e_shstrndx >= getNumSections())
-      return 0;
-  }
-  return Header->e_shstrndx;
-}
-
-template<class ELFT>
-template<typename T>
-inline const T *
-ELFObjectFile<ELFT>::getEntry(uint32_t Section, uint32_t Entry) const {
+template <class ELFT>
+template <typename T>
+const T *ELFFile<ELFT>::getEntry(uint32_t Section, uint32_t Entry) const {
   return getEntry<T>(getSection(Section), Entry);
 }
 
-template<class ELFT>
-template<typename T>
-inline const T *
-ELFObjectFile<ELFT>::getEntry(const Elf_Shdr * Section, uint32_t Entry) const {
-  return reinterpret_cast<const T *>(
-           base()
-           + Section->sh_offset
-           + (Entry * Section->sh_entsize));
-}
-
-template<class ELFT>
-const typename ELFObjectFile<ELFT>::Elf_Sym *
-ELFObjectFile<ELFT>::getSymbol(DataRefImpl Symb) const {
-  return getEntry<Elf_Sym>(Symb.d.b, Symb.d.a);
-}
-
-template<class ELFT>
-const typename ELFObjectFile<ELFT>::Elf_Rel *
-ELFObjectFile<ELFT>::getRel(DataRefImpl Rel) const {
-  return getEntry<Elf_Rel>(Rel.d.a, Rel.d.b);
-}
-
-template<class ELFT>
-const typename ELFObjectFile<ELFT>::Elf_Rela *
-ELFObjectFile<ELFT>::getRela(DataRefImpl Rela) const {
-  return getEntry<Elf_Rela>(Rela.d.a, Rela.d.b);
-}
-
-template<class ELFT>
-const typename ELFObjectFile<ELFT>::Elf_Shdr *
-ELFObjectFile<ELFT>::getSection(DataRefImpl Symb) const {
-  const Elf_Shdr *sec = getSection(Symb.d.b);
-  if (sec->sh_type != ELF::SHT_SYMTAB || sec->sh_type != ELF::SHT_DYNSYM)
-    // FIXME: Proper error handling.
-    report_fatal_error("Invalid symbol table section!");
-  return sec;
+template <class ELFT>
+template <typename T>
+const T *ELFFile<ELFT>::getEntry(const Elf_Shdr *Section,
+                                 uint32_t Entry) const {
+  return reinterpret_cast<const T *>(base() + Section->sh_offset +
+                                     (Entry * Section->sh_entsize));
 }
 
-template<class ELFT>
-const typename ELFObjectFile<ELFT>::Elf_Shdr *
-ELFObjectFile<ELFT>::getSection(uint32_t index) const {
+template <class ELFT>
+const typename ELFFile<ELFT>::Elf_Shdr *
+ELFFile<ELFT>::getSection(uint32_t index) const {
   if (index == 0)
     return 0;
   if (!SectionHeaderTable || index >= getNumSections())
@@ -2888,15 +854,15 @@ ELFObjectFile<ELFT>::getSection(uint32_t index) const {
          + (index * Header->e_shentsize));
 }
 
-template<class ELFT>
-const char *ELFObjectFile<ELFT>::getString(uint32_t section,
-                                           ELF::Elf32_Word offset) const {
+template <class ELFT>
+const char *ELFFile<ELFT>::getString(uint32_t section,
+                                     ELF::Elf32_Word offset) const {
   return getString(getSection(section), offset);
 }
 
-template<class ELFT>
-const char *ELFObjectFile<ELFT>::getString(const Elf_Shdr *section,
-                                           ELF::Elf32_Word offset) const {
+template <class ELFT>
+const char *ELFFile<ELFT>::getString(const Elf_Shdr *section,
+                                     ELF::Elf32_Word offset) const {
   assert(section && section->sh_type == ELF::SHT_STRTAB && "Invalid section!");
   if (offset >= section->sh_size)
     // FIXME: Proper error handling.
@@ -2904,56 +870,63 @@ const char *ELFObjectFile<ELFT>::getString(const Elf_Shdr *section,
   return (const char *)base() + section->sh_offset + offset;
 }
 
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolName(const Elf_Shdr *section,
-                                              const Elf_Sym *symb,
-                                              StringRef &Result) const {
-  if (symb->st_name == 0) {
-    const Elf_Shdr *section = getSection(symb);
-    if (!section)
-      Result = "";
-    else
-      Result = getString(dot_shstrtab_sec, section->sh_name);
-    return object_error::success;
-  }
+template <class ELFT>
+const char *ELFFile<ELFT>::getDynamicString(uintX_t Offset) const {
+  if (!DynStrRegion.Addr || Offset >= DynStrRegion.Size)
+    return 0;
+  return (const char *)DynStrRegion.Addr + Offset;
+}
 
-  if (DynamicSymbolTableIndex != -1 &&
-      section == getSection(DynamicSymbolTableIndex)) {
-    // Symbol is in .dynsym, use .dynstr string table
-    Result = getString(dot_dynstr_sec, symb->st_name);
-  } else {
-    // Use the default symbol table name section.
-    Result = getString(dot_strtab_sec, symb->st_name);
+template <class ELFT>
+ErrorOr<StringRef> ELFFile<ELFT>::getSymbolName(Elf_Sym_Iter Sym) const {
+  if (!Sym.isDynamic())
+    return getSymbolName(dot_symtab_sec, &*Sym);
+
+  if (!DynStrRegion.Addr || Sym->st_name >= DynStrRegion.Size)
+    return object_error::parse_failed;
+  return StringRef(getDynamicString(Sym->st_name));
+}
+
+template <class ELFT>
+ErrorOr<StringRef> ELFFile<ELFT>::getSymbolName(const Elf_Shdr *Section,
+                                                const Elf_Sym *Symb) const {
+  if (Symb->st_name == 0) {
+    const Elf_Shdr *ContainingSec = getSection(Symb);
+    if (ContainingSec)
+      return getSectionName(ContainingSec);
   }
-  return object_error::success;
+
+  const Elf_Shdr *StrTab = getSection(Section->sh_link);
+  if (Symb->st_name >= StrTab->sh_size)
+    return object_error::parse_failed;
+  return StringRef(getString(StrTab, Symb->st_name));
 }
 
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSectionName(const Elf_Shdr *section,
-                                               StringRef &Result) const {
-  Result = StringRef(getString(dot_shstrtab_sec, section->sh_name));
-  return object_error::success;
+template <class ELFT>
+ErrorOr<StringRef>
+ELFFile<ELFT>::getSectionName(const Elf_Shdr *Section) const {
+  if (Section->sh_name >= dot_shstrtab_sec->sh_size)
+    return object_error::parse_failed;
+  return StringRef(getString(dot_shstrtab_sec, Section->sh_name));
 }
 
-template<class ELFT>
-error_code ELFObjectFile<ELFT>::getSymbolVersion(const Elf_Shdr *section,
-                                                 const Elf_Sym *symb,
-                                                 StringRef &Version,
-                                                 bool &IsDefault) const {
+template <class ELFT>
+ErrorOr<StringRef> ELFFile<ELFT>::getSymbolVersion(const Elf_Shdr *section,
+                                                   const Elf_Sym *symb,
+                                                   bool &IsDefault) const {
   // Handle non-dynamic symbols.
-  if (section != getSection(DynamicSymbolTableIndex)) {
+  if (section != DynSymRegion.Addr && section != 0) {
     // Non-dynamic symbols can have versions in their names
     // A name of the form 'foo@V1' indicates version 'V1', non-default.
     // A name of the form 'foo@@V2' indicates version 'V2', default version.
-    StringRef Name;
-    error_code ec = getSymbolName(section, symb, Name);
-    if (ec != object_error::success)
-      return ec;
+    ErrorOr<StringRef> SymName = getSymbolName(section, symb);
+    if (!SymName)
+      return SymName;
+    StringRef Name = *SymName;
     size_t atpos = Name.find('@');
     if (atpos == StringRef::npos) {
-      Version = "";
       IsDefault = false;
-      return object_error::success;
+      return StringRef("");
     }
     ++atpos;
     if (atpos < Name.size() && Name[atpos] == '@') {
@@ -2962,21 +935,19 @@ error_code ELFObjectFile<ELFT>::getSymbolVersion(const Elf_Shdr *section,
     } else {
       IsDefault = false;
     }
-    Version = Name.substr(atpos);
-    return object_error::success;
+    return Name.substr(atpos);
   }
 
   // This is a dynamic symbol. Look in the GNU symbol version table.
   if (dot_gnu_version_sec == NULL) {
     // No version table.
-    Version = "";
     IsDefault = false;
-    return object_error::success;
+    return StringRef("");
   }
 
   // Determine the position in the symbol table of this entry.
-  const char *sec_start = (const char*)base() + section->sh_offset;
-  size_t entry_index = ((const char*)symb - sec_start)/section->sh_entsize;
+  size_t entry_index = ((const char *)symb - (const char *)DynSymRegion.Addr) /
+                       DynSymRegion.EntSize;
 
   // Get the corresponding version index entry
   const Elf_Versym *vs = getEntry<Elf_Versym>(dot_gnu_version_sec, entry_index);
@@ -2985,16 +956,14 @@ error_code ELFObjectFile<ELFT>::getSymbolVersion(const Elf_Shdr *section,
   // Special markers for unversioned symbols.
   if (version_index == ELF::VER_NDX_LOCAL ||
       version_index == ELF::VER_NDX_GLOBAL) {
-    Version = "";
     IsDefault = false;
-    return object_error::success;
+    return StringRef("");
   }
 
   // Lookup this symbol in the version table
   LoadVersionMap();
   if (version_index >= VersionMap.size() || VersionMap[version_index].isNull())
-    report_fatal_error("Symbol has version index without corresponding "
-                       "define or reference entry");
+    return object_error::parse_failed;
   const VersionMapEntry &entry = VersionMap[version_index];
 
   // Get the version name string
@@ -3005,7 +974,6 @@ error_code ELFObjectFile<ELFT>::getSymbolVersion(const Elf_Shdr *section,
   } else {
     name_offset = entry.getVernaux()->vna_name;
   }
-  Version = getString(dot_dynstr_sec, name_offset);
 
   // Set IsDefault
   if (entry.isVerdef()) {
@@ -3014,57 +982,9 @@ error_code ELFObjectFile<ELFT>::getSymbolVersion(const Elf_Shdr *section,
     IsDefault = false;
   }
 
-  return object_error::success;
-}
-
-/// FIXME: Maybe we should have a base ElfObjectFile that is not a template
-/// and make these member functions?
-static inline error_code getELFRelocationAddend(const RelocationRef R,
-                                                int64_t &Addend) {
-  const ObjectFile *Obj = R.getObjectFile();
-  DataRefImpl DRI = R.getRawDataRefImpl();
-  // Little-endian 32-bit
-  if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
-    return ELFObj->getRelocationAddend(DRI, Addend);
-
-  // Big-endian 32-bit
-  if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
-    return ELFObj->getRelocationAddend(DRI, Addend);
-
-  // Little-endian 64-bit
-  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
-    return ELFObj->getRelocationAddend(DRI, Addend);
-
-  // Big-endian 64-bit
-  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
-    return ELFObj->getRelocationAddend(DRI, Addend);
-
-  llvm_unreachable("Object passed to getELFRelocationAddend() is not ELF");
-}
-
-/// This is a generic interface for retrieving GNU symbol version
-/// information from an ELFObjectFile.
-static inline error_code GetELFSymbolVersion(const ObjectFile *Obj,
-                                             const SymbolRef &Sym,
-                                             StringRef &Version,
-                                             bool &IsDefault) {
-  // Little-endian 32-bit
-  if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
-    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
-
-  // Big-endian 32-bit
-  if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
-    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
-
-  // Little-endian 64-bit
-  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
-    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
-
-  // Big-endian 64-bit
-  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
-    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
-
-  llvm_unreachable("Object passed to GetELFSymbolVersion() is not ELF");
+  if (name_offset >= DynStrRegion.Size)
+    return object_error::parse_failed;
+  return StringRef(getDynamicString(name_offset));
 }
 
 /// This function returns the hash value for a symbol in the .dynsym section
@@ -3081,8 +1001,7 @@ static inline unsigned elf_hash(StringRef &symbolName) {
   }
   return h;
 }
-
-}
-}
+} // end namespace object
+} // end namespace llvm
 
 #endif
diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
new file mode 100644
index 0000000..962a3e2
--- /dev/null
+++ b/include/llvm/Object/ELFObjectFile.h
@@ -0,0 +1,1027 @@
+//===- ELFObjectFile.h - ELF object file implementation ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ELFObjectFile template class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECT_ELF_OBJECT_FILE_H
+#define LLVM_OBJECT_ELF_OBJECT_FILE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cctype>
+#include <limits>
+#include <utility>
+
+namespace llvm {
+namespace object {
+
+template <class ELFT>
+class ELFObjectFile : public ObjectFile {
+public:
+  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+
+  typedef typename ELFFile<ELFT>::uintX_t uintX_t;
+
+  typedef typename ELFFile<ELFT>::Elf_Sym Elf_Sym;
+  typedef typename ELFFile<ELFT>::Elf_Shdr Elf_Shdr;
+  typedef typename ELFFile<ELFT>::Elf_Rel Elf_Rel;
+  typedef typename ELFFile<ELFT>::Elf_Rela Elf_Rela;
+  typedef typename ELFFile<ELFT>::Elf_Dyn Elf_Dyn;
+
+  typedef typename ELFFile<ELFT>::Elf_Sym_Iter Elf_Sym_Iter;
+  typedef typename ELFFile<ELFT>::Elf_Shdr_Iter Elf_Shdr_Iter;
+  typedef typename ELFFile<ELFT>::Elf_Dyn_Iter Elf_Dyn_Iter;
+
+protected:
+  ELFFile<ELFT> EF;
+
+  virtual error_code getSymbolNext(DataRefImpl Symb, SymbolRef &Res) const;
+  virtual error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const;
+  virtual error_code getSymbolFileOffset(DataRefImpl Symb, uint64_t &Res) const;
+  virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const;
+  virtual error_code getSymbolAlignment(DataRefImpl Symb, uint32_t &Res) const;
+  virtual error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const;
+  virtual error_code getSymbolFlags(DataRefImpl Symb, uint32_t &Res) const;
+  virtual error_code getSymbolType(DataRefImpl Symb,
+                                   SymbolRef::Type &Res) const;
+  virtual error_code getSymbolSection(DataRefImpl Symb,
+                                      section_iterator &Res) const;
+  virtual error_code getSymbolValue(DataRefImpl Symb, uint64_t &Val) const;
+
+  virtual error_code getLibraryNext(DataRefImpl Data, LibraryRef &Result) const;
+  virtual error_code getLibraryPath(DataRefImpl Data, StringRef &Res) const;
+
+  virtual error_code getSectionNext(DataRefImpl Sec, SectionRef &Res) const;
+  virtual error_code getSectionName(DataRefImpl Sec, StringRef &Res) const;
+  virtual error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const;
+  virtual error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const;
+  virtual error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const;
+  virtual error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const;
+  virtual error_code isSectionText(DataRefImpl Sec, bool &Res) const;
+  virtual error_code isSectionData(DataRefImpl Sec, bool &Res) const;
+  virtual error_code isSectionBSS(DataRefImpl Sec, bool &Res) const;
+  virtual error_code isSectionRequiredForExecution(DataRefImpl Sec,
+                                                   bool &Res) const;
+  virtual error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const;
+  virtual error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const;
+  virtual error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const;
+  virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
+                                           bool &Result) const;
+  virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const;
+  virtual relocation_iterator section_rel_end(DataRefImpl Sec) const;
+  virtual section_iterator getRelocatedSection(DataRefImpl Sec) const;
+
+  virtual error_code getRelocationNext(DataRefImpl Rel,
+                                       RelocationRef &Res) const;
+  virtual error_code getRelocationAddress(DataRefImpl Rel, uint64_t &Res) const;
+  virtual error_code getRelocationOffset(DataRefImpl Rel, uint64_t &Res) const;
+  virtual symbol_iterator getRelocationSymbol(DataRefImpl Rel) const;
+  virtual error_code getRelocationType(DataRefImpl Rel, uint64_t &Res) const;
+  virtual error_code getRelocationTypeName(DataRefImpl Rel,
+                                           SmallVectorImpl<char> &Result) const;
+  virtual error_code
+  getRelocationValueString(DataRefImpl Rel,
+                           SmallVectorImpl<char> &Result) const;
+
+  uint64_t getROffset(DataRefImpl Rel) const;
+  StringRef getRelocationTypeName(uint32_t Type) const;
+
+  /// \brief Get the relocation section that contains \a Rel.
+  const Elf_Shdr *getRelSection(DataRefImpl Rel) const {
+    return EF.getSection(Rel.d.a);
+  }
+
+  const Elf_Rel *getRel(DataRefImpl Rel) const;
+  const Elf_Rela *getRela(DataRefImpl Rela) const;
+
+  Elf_Sym_Iter toELFSymIter(DataRefImpl Symb) const {
+    bool IsDynamic = Symb.p & 1;
+    if (IsDynamic)
+      return Elf_Sym_Iter(
+          EF.begin_dynamic_symbols().getEntSize(),
+          reinterpret_cast<const char *>(Symb.p & ~uintptr_t(1)), IsDynamic);
+    return Elf_Sym_Iter(EF.begin_symbols().getEntSize(),
+                        reinterpret_cast<const char *>(Symb.p), IsDynamic);
+  }
+
+  DataRefImpl toDRI(Elf_Sym_Iter Symb) const {
+    DataRefImpl DRI;
+    DRI.p = reinterpret_cast<uintptr_t>(Symb.get()) |
+      static_cast<uintptr_t>(Symb.isDynamic());
+    return DRI;
+  }
+
+  Elf_Shdr_Iter toELFShdrIter(DataRefImpl Sec) const {
+    return Elf_Shdr_Iter(EF.getHeader()->e_shentsize,
+                         reinterpret_cast<const char *>(Sec.p));
+  }
+
+  DataRefImpl toDRI(Elf_Shdr_Iter Sec) const {
+    DataRefImpl DRI;
+    DRI.p = reinterpret_cast<uintptr_t>(Sec.get());
+    return DRI;
+  }
+
+  DataRefImpl toDRI(const Elf_Shdr *Sec) const {
+    DataRefImpl DRI;
+    DRI.p = reinterpret_cast<uintptr_t>(Sec);
+    return DRI;
+  }
+
+  Elf_Dyn_Iter toELFDynIter(DataRefImpl Dyn) const {
+    return Elf_Dyn_Iter(EF.begin_dynamic_table().getEntSize(),
+                        reinterpret_cast<const char *>(Dyn.p));
+  }
+
+  DataRefImpl toDRI(Elf_Dyn_Iter Dyn) const {
+    DataRefImpl DRI;
+    DRI.p = reinterpret_cast<uintptr_t>(Dyn.get());
+    return DRI;
+  }
+
+  // This flag is used for classof, to distinguish ELFObjectFile from
+  // its subclass. If more subclasses will be created, this flag will
+  // have to become an enum.
+  bool isDyldELFObject;
+
+public:
+  ELFObjectFile(MemoryBuffer *Object, error_code &ec);
+
+  const Elf_Sym *getSymbol(DataRefImpl Symb) const;
+
+  virtual symbol_iterator begin_symbols() const;
+  virtual symbol_iterator end_symbols() const;
+
+  virtual symbol_iterator begin_dynamic_symbols() const;
+  virtual symbol_iterator end_dynamic_symbols() const;
+
+  virtual section_iterator begin_sections() const;
+  virtual section_iterator end_sections() const;
+
+  virtual library_iterator begin_libraries_needed() const;
+  virtual library_iterator end_libraries_needed() const;
+
+  error_code getRelocationAddend(DataRefImpl Rel, int64_t &Res) const;
+  error_code getSymbolVersion(SymbolRef Symb, StringRef &Version,
+                              bool &IsDefault) const;
+
+  virtual uint8_t getBytesInAddress() const;
+  virtual StringRef getFileFormatName() const;
+  virtual StringRef getObjectType() const { return "ELF"; }
+  virtual unsigned getArch() const;
+  virtual StringRef getLoadName() const;
+
+  const ELFFile<ELFT> *getELFFile() const { return &EF; }
+
+  bool isDyldType() const { return isDyldELFObject; }
+  static inline bool classof(const Binary *v) {
+    return v->getType() == getELFType(ELFT::TargetEndianness == support::little,
+                                      ELFT::Is64Bits);
+  }
+};
+
+// Use an alignment of 2 for the typedefs since that is the worst case for
+// ELF files in archives.
+typedef ELFObjectFile<ELFType<support::little, 2, false> > ELF32LEObjectFile;
+typedef ELFObjectFile<ELFType<support::little, 2, true> > ELF64LEObjectFile;
+typedef ELFObjectFile<ELFType<support::big, 2, false> > ELF32BEObjectFile;
+typedef ELFObjectFile<ELFType<support::big, 2, true> > ELF64BEObjectFile;
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolNext(DataRefImpl Symb,
+                                              SymbolRef &Result) const {
+  Result = SymbolRef(toDRI(++toELFSymIter(Symb)), this);
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolName(DataRefImpl Symb,
+                                              StringRef &Result) const {
+  ErrorOr<StringRef> Name = EF.getSymbolName(toELFSymIter(Symb));
+  if (!Name)
+    return Name;
+  Result = *Name;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolVersion(SymbolRef SymRef,
+                                                 StringRef &Version,
+                                                 bool &IsDefault) const {
+  DataRefImpl Symb = SymRef.getRawDataRefImpl();
+  const Elf_Sym *symb = getSymbol(Symb);
+  ErrorOr<StringRef> Ver =
+      EF.getSymbolVersion(EF.getSection(Symb.d.b), symb, IsDefault);
+  if (!Ver)
+    return Ver;
+  Version = *Ver;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolFileOffset(DataRefImpl Symb,
+                                                    uint64_t &Result) const {
+  const Elf_Sym *ESym = getSymbol(Symb);
+  const Elf_Shdr *ESec;
+  switch (EF.getSymbolTableIndex(ESym)) {
+  case ELF::SHN_COMMON:
+  // Unintialized symbols have no offset in the object file
+  case ELF::SHN_UNDEF:
+    Result = UnknownAddressOrSize;
+    return object_error::success;
+  case ELF::SHN_ABS:
+    Result = ESym->st_value;
+    return object_error::success;
+  default:
+    ESec = EF.getSection(ESym);
+  }
+
+  switch (ESym->getType()) {
+  case ELF::STT_SECTION:
+    Result = ESec ? ESec->sh_offset : UnknownAddressOrSize;
+    return object_error::success;
+  case ELF::STT_FUNC:
+  case ELF::STT_OBJECT:
+  case ELF::STT_NOTYPE:
+    Result = ESym->st_value + (ESec ? ESec->sh_offset : 0);
+    return object_error::success;
+  default:
+    Result = UnknownAddressOrSize;
+    return object_error::success;
+  }
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolAddress(DataRefImpl Symb,
+                                                 uint64_t &Result) const {
+  const Elf_Sym *ESym = getSymbol(Symb);
+  const Elf_Shdr *ESec;
+  switch (EF.getSymbolTableIndex(ESym)) {
+  case ELF::SHN_COMMON:
+  case ELF::SHN_UNDEF:
+    Result = UnknownAddressOrSize;
+    return object_error::success;
+  case ELF::SHN_ABS:
+    Result = ESym->st_value;
+    return object_error::success;
+  default:
+    ESec = EF.getSection(ESym);
+  }
+
+  switch (ESym->getType()) {
+  case ELF::STT_SECTION:
+    Result = ESec ? ESec->sh_addr : UnknownAddressOrSize;
+    return object_error::success;
+  case ELF::STT_FUNC:
+  case ELF::STT_OBJECT:
+  case ELF::STT_NOTYPE:
+    bool IsRelocatable;
+    switch (EF.getHeader()->e_type) {
+    case ELF::ET_EXEC:
+    case ELF::ET_DYN:
+      IsRelocatable = false;
+      break;
+    default:
+      IsRelocatable = true;
+    }
+    Result = ESym->st_value;
+
+    // Clear the ARM/Thumb indicator flag.
+    if (EF.getHeader()->e_machine == ELF::EM_ARM)
+      Result &= ~1;
+
+    if (IsRelocatable && ESec != 0)
+      Result += ESec->sh_addr;
+    return object_error::success;
+  default:
+    Result = UnknownAddressOrSize;
+    return object_error::success;
+  }
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolAlignment(DataRefImpl Symb,
+                                                   uint32_t &Res) const {
+  Elf_Sym_Iter Sym = toELFSymIter(Symb);
+  if (Sym->st_shndx == ELF::SHN_COMMON)
+    Res = Sym->st_value;
+  else
+    Res = 0;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolSize(DataRefImpl Symb,
+                                              uint64_t &Result) const {
+  Result = toELFSymIter(Symb)->st_size;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolType(DataRefImpl Symb,
+                                              SymbolRef::Type &Result) const {
+  const Elf_Sym *ESym = getSymbol(Symb);
+
+  switch (ESym->getType()) {
+  case ELF::STT_NOTYPE:
+    Result = SymbolRef::ST_Unknown;
+    break;
+  case ELF::STT_SECTION:
+    Result = SymbolRef::ST_Debug;
+    break;
+  case ELF::STT_FILE:
+    Result = SymbolRef::ST_File;
+    break;
+  case ELF::STT_FUNC:
+    Result = SymbolRef::ST_Function;
+    break;
+  case ELF::STT_OBJECT:
+  case ELF::STT_COMMON:
+  case ELF::STT_TLS:
+    Result = SymbolRef::ST_Data;
+    break;
+  default:
+    Result = SymbolRef::ST_Other;
+    break;
+  }
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolFlags(DataRefImpl Symb,
+                                               uint32_t &Result) const {
+  const Elf_Sym *ESym = getSymbol(Symb);
+
+  Result = SymbolRef::SF_None;
+
+  if (ESym->getBinding() != ELF::STB_LOCAL)
+    Result |= SymbolRef::SF_Global;
+
+  if (ESym->getBinding() == ELF::STB_WEAK)
+    Result |= SymbolRef::SF_Weak;
+
+  if (ESym->st_shndx == ELF::SHN_ABS)
+    Result |= SymbolRef::SF_Absolute;
+
+  if (ESym->getType() == ELF::STT_FILE || ESym->getType() == ELF::STT_SECTION ||
+      ESym == &*EF.begin_symbols())
+    Result |= SymbolRef::SF_FormatSpecific;
+
+  if (EF.getSymbolTableIndex(ESym) == ELF::SHN_UNDEF)
+    Result |= SymbolRef::SF_Undefined;
+
+  if (ESym->getType() == ELF::STT_COMMON ||
+      EF.getSymbolTableIndex(ESym) == ELF::SHN_COMMON)
+    Result |= SymbolRef::SF_Common;
+
+  if (ESym->getType() == ELF::STT_TLS)
+    Result |= SymbolRef::SF_ThreadLocal;
+
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolSection(DataRefImpl Symb,
+                                                 section_iterator &Res) const {
+  const Elf_Sym *ESym = getSymbol(Symb);
+  const Elf_Shdr *ESec = EF.getSection(ESym);
+  if (!ESec)
+    Res = end_sections();
+  else {
+    DataRefImpl Sec;
+    Sec.p = reinterpret_cast<intptr_t>(ESec);
+    Res = section_iterator(SectionRef(Sec, this));
+  }
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSymbolValue(DataRefImpl Symb,
+                                               uint64_t &Val) const {
+  const Elf_Sym *ESym = getSymbol(Symb);
+  Val = ESym->st_value;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSectionNext(DataRefImpl Sec,
+                                               SectionRef &Result) const {
+  Result = SectionRef(toDRI(++toELFShdrIter(Sec)), this);
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSectionName(DataRefImpl Sec,
+                                               StringRef &Result) const {
+  ErrorOr<StringRef> Name = EF.getSectionName(&*toELFShdrIter(Sec));
+  if (!Name)
+    return Name;
+  Result = *Name;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSectionAddress(DataRefImpl Sec,
+                                                  uint64_t &Result) const {
+  Result = toELFShdrIter(Sec)->sh_addr;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSectionSize(DataRefImpl Sec,
+                                               uint64_t &Result) const {
+  Result = toELFShdrIter(Sec)->sh_size;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSectionContents(DataRefImpl Sec,
+                                                   StringRef &Result) const {
+  Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
+  Result = StringRef((const char *)base() + EShdr->sh_offset, EShdr->sh_size);
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getSectionAlignment(DataRefImpl Sec,
+                                                    uint64_t &Result) const {
+  Result = toELFShdrIter(Sec)->sh_addralign;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::isSectionText(DataRefImpl Sec,
+                                              bool &Result) const {
+  Result = toELFShdrIter(Sec)->sh_flags & ELF::SHF_EXECINSTR;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::isSectionData(DataRefImpl Sec,
+                                              bool &Result) const {
+  Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
+  Result = EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) &&
+           EShdr->sh_type == ELF::SHT_PROGBITS;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::isSectionBSS(DataRefImpl Sec,
+                                             bool &Result) const {
+  Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
+  Result = EShdr->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE) &&
+           EShdr->sh_type == ELF::SHT_NOBITS;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code
+ELFObjectFile<ELFT>::isSectionRequiredForExecution(DataRefImpl Sec,
+                                                   bool &Result) const {
+  Result = toELFShdrIter(Sec)->sh_flags & ELF::SHF_ALLOC;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::isSectionVirtual(DataRefImpl Sec,
+                                                 bool &Result) const {
+  Result = toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::isSectionZeroInit(DataRefImpl Sec,
+                                                  bool &Result) const {
+  Result = toELFShdrIter(Sec)->sh_type == ELF::SHT_NOBITS;
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::isSectionReadOnlyData(DataRefImpl Sec,
+                                                      bool &Result) const {
+  Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
+  Result = !(EShdr->sh_flags & (ELF::SHF_WRITE | ELF::SHF_EXECINSTR));
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::sectionContainsSymbol(DataRefImpl Sec,
+                                                      DataRefImpl Symb,
+                                                      bool &Result) const {
+  Elf_Sym_Iter ESym = toELFSymIter(Symb);
+
+  uintX_t Index = ESym->st_shndx;
+  bool Reserved = Index >= ELF::SHN_LORESERVE && Index <= ELF::SHN_HIRESERVE;
+
+  Result = !Reserved && (&*toELFShdrIter(Sec) == EF.getSection(ESym->st_shndx));
+  return object_error::success;
+}
+
+template <class ELFT>
+relocation_iterator
+ELFObjectFile<ELFT>::section_rel_begin(DataRefImpl Sec) const {
+  DataRefImpl RelData;
+  uintptr_t SHT = reinterpret_cast<uintptr_t>(EF.begin_sections().get());
+  RelData.d.a = (Sec.p - SHT) / EF.getHeader()->e_shentsize;
+  RelData.d.b = 0;
+  return relocation_iterator(RelocationRef(RelData, this));
+}
+
+template <class ELFT>
+relocation_iterator
+ELFObjectFile<ELFT>::section_rel_end(DataRefImpl Sec) const {
+  DataRefImpl RelData;
+  uintptr_t SHT = reinterpret_cast<uintptr_t>(EF.begin_sections().get());
+  const Elf_Shdr *S = reinterpret_cast<const Elf_Shdr *>(Sec.p);
+  RelData.d.a = (Sec.p - SHT) / EF.getHeader()->e_shentsize;
+  if (S->sh_type != ELF::SHT_RELA && S->sh_type != ELF::SHT_REL)
+    RelData.d.b = 0;
+  else
+    RelData.d.b = S->sh_size / S->sh_entsize;
+
+  return relocation_iterator(RelocationRef(RelData, this));
+}
+
+template <class ELFT>
+section_iterator
+ELFObjectFile<ELFT>::getRelocatedSection(DataRefImpl Sec) const {
+  if (EF.getHeader()->e_type != ELF::ET_REL)
+    return end_sections();
+
+  Elf_Shdr_Iter EShdr = toELFShdrIter(Sec);
+  uintX_t Type = EShdr->sh_type;
+  if (Type != ELF::SHT_REL && Type != ELF::SHT_RELA)
+    return end_sections();
+
+  const Elf_Shdr *R = EF.getSection(EShdr->sh_info);
+  return section_iterator(SectionRef(toDRI(R), this));
+}
+
+// Relocations
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getRelocationNext(DataRefImpl Rel,
+                                                  RelocationRef &Result) const {
+  ++Rel.d.b;
+  Result = RelocationRef(Rel, this);
+  return object_error::success;
+}
+
+template <class ELFT>
+symbol_iterator
+ELFObjectFile<ELFT>::getRelocationSymbol(DataRefImpl Rel) const {
+  uint32_t symbolIdx;
+  const Elf_Shdr *sec = getRelSection(Rel);
+  switch (sec->sh_type) {
+  default:
+    report_fatal_error("Invalid section type in Rel!");
+  case ELF::SHT_REL: {
+    symbolIdx = getRel(Rel)->getSymbol(EF.isMips64EL());
+    break;
+  }
+  case ELF::SHT_RELA: {
+    symbolIdx = getRela(Rel)->getSymbol(EF.isMips64EL());
+    break;
+  }
+  }
+  if (!symbolIdx)
+    return end_symbols();
+
+  const Elf_Shdr *SymSec = EF.getSection(sec->sh_link);
+
+  DataRefImpl SymbolData;
+  switch (SymSec->sh_type) {
+  default:
+    report_fatal_error("Invalid symbol table section type!");
+  case ELF::SHT_SYMTAB:
+    SymbolData = toDRI(EF.begin_symbols() + symbolIdx);
+    break;
+  case ELF::SHT_DYNSYM:
+    SymbolData = toDRI(EF.begin_dynamic_symbols() + symbolIdx);
+    break;
+  }
+
+  return symbol_iterator(SymbolRef(SymbolData, this));
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getRelocationAddress(DataRefImpl Rel,
+                                                     uint64_t &Result) const {
+  Result = getROffset(Rel);
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getRelocationOffset(DataRefImpl Rel,
+                                                    uint64_t &Result) const {
+  Result = getROffset(Rel);
+  return object_error::success;
+}
+
+template <class ELFT>
+uint64_t ELFObjectFile<ELFT>::getROffset(DataRefImpl Rel) const {
+  const Elf_Shdr *sec = getRelSection(Rel);
+  switch (sec->sh_type) {
+  default:
+    report_fatal_error("Invalid section type in Rel!");
+  case ELF::SHT_REL:
+    return getRel(Rel)->r_offset;
+  case ELF::SHT_RELA:
+    return getRela(Rel)->r_offset;
+  }
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getRelocationType(DataRefImpl Rel,
+                                                  uint64_t &Result) const {
+  const Elf_Shdr *sec = getRelSection(Rel);
+  switch (sec->sh_type) {
+  default:
+    report_fatal_error("Invalid section type in Rel!");
+  case ELF::SHT_REL: {
+    Result = getRel(Rel)->getType(EF.isMips64EL());
+    break;
+  }
+  case ELF::SHT_RELA: {
+    Result = getRela(Rel)->getType(EF.isMips64EL());
+    break;
+  }
+  }
+  return object_error::success;
+}
+
+template <class ELFT>
+StringRef ELFObjectFile<ELFT>::getRelocationTypeName(uint32_t Type) const {
+  return getELFRelocationTypeName(EF.getHeader()->e_machine, Type);
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getRelocationTypeName(
+    DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
+  const Elf_Shdr *sec = getRelSection(Rel);
+  uint32_t type;
+  switch (sec->sh_type) {
+  default:
+    return object_error::parse_failed;
+  case ELF::SHT_REL: {
+    type = getRel(Rel)->getType(EF.isMips64EL());
+    break;
+  }
+  case ELF::SHT_RELA: {
+    type = getRela(Rel)->getType(EF.isMips64EL());
+    break;
+  }
+  }
+
+  EF.getRelocationTypeName(type, Result);
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getRelocationAddend(DataRefImpl Rel,
+                                                    int64_t &Result) const {
+  const Elf_Shdr *sec = getRelSection(Rel);
+  switch (sec->sh_type) {
+  default:
+    report_fatal_error("Invalid section type in Rel!");
+  case ELF::SHT_REL: {
+    Result = 0;
+    return object_error::success;
+  }
+  case ELF::SHT_RELA: {
+    Result = getRela(Rel)->r_addend;
+    return object_error::success;
+  }
+  }
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getRelocationValueString(
+    DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
+  const Elf_Shdr *sec = getRelSection(Rel);
+  uint8_t type;
+  StringRef res;
+  int64_t addend = 0;
+  uint16_t symbol_index = 0;
+  switch (sec->sh_type) {
+  default:
+    return object_error::parse_failed;
+  case ELF::SHT_REL: {
+    type = getRel(Rel)->getType(EF.isMips64EL());
+    symbol_index = getRel(Rel)->getSymbol(EF.isMips64EL());
+    // TODO: Read implicit addend from section data.
+    break;
+  }
+  case ELF::SHT_RELA: {
+    type = getRela(Rel)->getType(EF.isMips64EL());
+    symbol_index = getRela(Rel)->getSymbol(EF.isMips64EL());
+    addend = getRela(Rel)->r_addend;
+    break;
+  }
+  }
+  const Elf_Sym *symb =
+      EF.template getEntry<Elf_Sym>(sec->sh_link, symbol_index);
+  ErrorOr<StringRef> SymName =
+      EF.getSymbolName(EF.getSection(sec->sh_link), symb);
+  if (!SymName)
+    return SymName;
+  switch (EF.getHeader()->e_machine) {
+  case ELF::EM_X86_64:
+    switch (type) {
+    case ELF::R_X86_64_PC8:
+    case ELF::R_X86_64_PC16:
+    case ELF::R_X86_64_PC32: {
+      std::string fmtbuf;
+      raw_string_ostream fmt(fmtbuf);
+      fmt << *SymName << (addend < 0 ? "" : "+") << addend << "-P";
+      fmt.flush();
+      Result.append(fmtbuf.begin(), fmtbuf.end());
+    } break;
+    case ELF::R_X86_64_8:
+    case ELF::R_X86_64_16:
+    case ELF::R_X86_64_32:
+    case ELF::R_X86_64_32S:
+    case ELF::R_X86_64_64: {
+      std::string fmtbuf;
+      raw_string_ostream fmt(fmtbuf);
+      fmt << *SymName << (addend < 0 ? "" : "+") << addend;
+      fmt.flush();
+      Result.append(fmtbuf.begin(), fmtbuf.end());
+    } break;
+    default:
+      res = "Unknown";
+    }
+    break;
+  case ELF::EM_AARCH64: {
+    std::string fmtbuf;
+    raw_string_ostream fmt(fmtbuf);
+    fmt << *SymName;
+    if (addend != 0)
+      fmt << (addend < 0 ? "" : "+") << addend;
+    fmt.flush();
+    Result.append(fmtbuf.begin(), fmtbuf.end());
+    break;
+  }
+  case ELF::EM_ARM:
+  case ELF::EM_HEXAGON:
+    res = *SymName;
+    break;
+  default:
+    res = "Unknown";
+  }
+  if (Result.empty())
+    Result.append(res.begin(), res.end());
+  return object_error::success;
+}
+
+template <class ELFT>
+const typename ELFFile<ELFT>::Elf_Sym *
+ELFObjectFile<ELFT>::getSymbol(DataRefImpl Symb) const {
+  return &*toELFSymIter(Symb);
+}
+
+template <class ELFT>
+const typename ELFObjectFile<ELFT>::Elf_Rel *
+ELFObjectFile<ELFT>::getRel(DataRefImpl Rel) const {
+  return EF.template getEntry<Elf_Rel>(Rel.d.a, Rel.d.b);
+}
+
+template <class ELFT>
+const typename ELFObjectFile<ELFT>::Elf_Rela *
+ELFObjectFile<ELFT>::getRela(DataRefImpl Rela) const {
+  return EF.template getEntry<Elf_Rela>(Rela.d.a, Rela.d.b);
+}
+
+template <class ELFT>
+ELFObjectFile<ELFT>::ELFObjectFile(MemoryBuffer *Object, error_code &ec)
+    : ObjectFile(getELFType(static_cast<endianness>(ELFT::TargetEndianness) ==
+                                support::little,
+                            ELFT::Is64Bits),
+                 Object),
+      EF(Object, ec) {}
+
+template <class ELFT>
+symbol_iterator ELFObjectFile<ELFT>::begin_symbols() const {
+  return symbol_iterator(SymbolRef(toDRI(EF.begin_symbols()), this));
+}
+
+template <class ELFT>
+symbol_iterator ELFObjectFile<ELFT>::end_symbols() const {
+  return symbol_iterator(SymbolRef(toDRI(EF.end_symbols()), this));
+}
+
+template <class ELFT>
+symbol_iterator ELFObjectFile<ELFT>::begin_dynamic_symbols() const {
+  return symbol_iterator(SymbolRef(toDRI(EF.begin_dynamic_symbols()), this));
+}
+
+template <class ELFT>
+symbol_iterator ELFObjectFile<ELFT>::end_dynamic_symbols() const {
+  return symbol_iterator(SymbolRef(toDRI(EF.end_dynamic_symbols()), this));
+}
+
+template <class ELFT>
+section_iterator ELFObjectFile<ELFT>::begin_sections() const {
+  return section_iterator(SectionRef(toDRI(EF.begin_sections()), this));
+}
+
+template <class ELFT>
+section_iterator ELFObjectFile<ELFT>::end_sections() const {
+  return section_iterator(SectionRef(toDRI(EF.end_sections()), this));
+}
+
+template <class ELFT>
+StringRef ELFObjectFile<ELFT>::getLoadName() const {
+  Elf_Dyn_Iter DI = EF.begin_dynamic_table();
+  Elf_Dyn_Iter DE = EF.end_dynamic_table();
+
+  while (DI != DE && DI->getTag() != ELF::DT_SONAME)
+    ++DI;
+
+  if (DI != DE)
+    return EF.getDynamicString(DI->getVal());
+  return "";
+}
+
+template <class ELFT>
+library_iterator ELFObjectFile<ELFT>::begin_libraries_needed() const {
+  Elf_Dyn_Iter DI = EF.begin_dynamic_table();
+  Elf_Dyn_Iter DE = EF.end_dynamic_table();
+
+  while (DI != DE && DI->getTag() != ELF::DT_SONAME)
+    ++DI;
+
+  return library_iterator(LibraryRef(toDRI(DI), this));
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getLibraryNext(DataRefImpl Data,
+                                               LibraryRef &Result) const {
+  Elf_Dyn_Iter DI = toELFDynIter(Data);
+  Elf_Dyn_Iter DE = EF.end_dynamic_table();
+
+  // Skip to the next DT_NEEDED entry.
+  do
+    ++DI;
+  while (DI != DE && DI->getTag() != ELF::DT_NEEDED);
+
+  Result = LibraryRef(toDRI(DI), this);
+  return object_error::success;
+}
+
+template <class ELFT>
+error_code ELFObjectFile<ELFT>::getLibraryPath(DataRefImpl Data,
+                                               StringRef &Res) const {
+  Res = EF.getDynamicString(toELFDynIter(Data)->getVal());
+  return object_error::success;
+}
+
+template <class ELFT>
+library_iterator ELFObjectFile<ELFT>::end_libraries_needed() const {
+  return library_iterator(LibraryRef(toDRI(EF.end_dynamic_table()), this));
+}
+
+template <class ELFT>
+uint8_t ELFObjectFile<ELFT>::getBytesInAddress() const {
+  return ELFT::Is64Bits ? 8 : 4;
+}
+
+template <class ELFT>
+StringRef ELFObjectFile<ELFT>::getFileFormatName() const {
+  switch (EF.getHeader()->e_ident[ELF::EI_CLASS]) {
+  case ELF::ELFCLASS32:
+    switch (EF.getHeader()->e_machine) {
+    case ELF::EM_386:
+      return "ELF32-i386";
+    case ELF::EM_X86_64:
+      return "ELF32-x86-64";
+    case ELF::EM_ARM:
+      return "ELF32-arm";
+    case ELF::EM_HEXAGON:
+      return "ELF32-hexagon";
+    case ELF::EM_MIPS:
+      return "ELF32-mips";
+    case ELF::EM_PPC:
+      return "ELF32-ppc";
+    default:
+      return "ELF32-unknown";
+    }
+  case ELF::ELFCLASS64:
+    switch (EF.getHeader()->e_machine) {
+    case ELF::EM_386:
+      return "ELF64-i386";
+    case ELF::EM_X86_64:
+      return "ELF64-x86-64";
+    case ELF::EM_AARCH64:
+      return "ELF64-aarch64";
+    case ELF::EM_PPC64:
+      return "ELF64-ppc64";
+    case ELF::EM_S390:
+      return "ELF64-s390";
+    default:
+      return "ELF64-unknown";
+    }
+  default:
+    // FIXME: Proper error handling.
+    report_fatal_error("Invalid ELFCLASS!");
+  }
+}
+
+template <class ELFT>
+unsigned ELFObjectFile<ELFT>::getArch() const {
+  switch (EF.getHeader()->e_machine) {
+  case ELF::EM_386:
+    return Triple::x86;
+  case ELF::EM_X86_64:
+    return Triple::x86_64;
+  case ELF::EM_AARCH64:
+    return Triple::aarch64;
+  case ELF::EM_ARM:
+    return Triple::arm;
+  case ELF::EM_HEXAGON:
+    return Triple::hexagon;
+  case ELF::EM_MIPS:
+    return (ELFT::TargetEndianness == support::little) ? Triple::mipsel
+                                                       : Triple::mips;
+  case ELF::EM_PPC64:
+    return (ELFT::TargetEndianness == support::little) ? Triple::ppc64le
+                                                       : Triple::ppc64;
+  case ELF::EM_S390:
+    return Triple::systemz;
+  default:
+    return Triple::UnknownArch;
+  }
+}
+
+/// FIXME: Maybe we should have a base ElfObjectFile that is not a template
+/// and make these member functions?
+static inline error_code getELFRelocationAddend(const RelocationRef R,
+                                                int64_t &Addend) {
+  const ObjectFile *Obj = R.getObjectFile();
+  DataRefImpl DRI = R.getRawDataRefImpl();
+  // Little-endian 32-bit
+  if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
+    return ELFObj->getRelocationAddend(DRI, Addend);
+
+  // Big-endian 32-bit
+  if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
+    return ELFObj->getRelocationAddend(DRI, Addend);
+
+  // Little-endian 64-bit
+  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
+    return ELFObj->getRelocationAddend(DRI, Addend);
+
+  // Big-endian 64-bit
+  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
+    return ELFObj->getRelocationAddend(DRI, Addend);
+
+  llvm_unreachable("Object passed to getELFRelocationAddend() is not ELF");
+}
+
+/// This is a generic interface for retrieving GNU symbol version
+/// information from an ELFObjectFile.
+static inline error_code GetELFSymbolVersion(const ObjectFile *Obj,
+                                             const SymbolRef &Sym,
+                                             StringRef &Version,
+                                             bool &IsDefault) {
+  // Little-endian 32-bit
+  if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
+    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
+
+  // Big-endian 32-bit
+  if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
+    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
+
+  // Little-endian 64-bit
+  if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
+    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
+
+  // Big-endian 64-bit
+  if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
+    return ELFObj->getSymbolVersion(Sym, Version, IsDefault);
+
+  llvm_unreachable("Object passed to GetELFSymbolVersion() is not ELF");
+}
+}
+}
+
+#endif
diff --git a/include/llvm/Object/ELFTypes.h b/include/llvm/Object/ELFTypes.h
new file mode 100644
index 0000000..84b6031
--- /dev/null
+++ b/include/llvm/Object/ELFTypes.h
@@ -0,0 +1,463 @@
+//===- ELFTypes.h - Endian specific types for ELF ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECT_ELF_TYPES_H
+#define LLVM_OBJECT_ELF_TYPES_H
+
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/Endian.h"
+
+namespace llvm {
+namespace object {
+
+using support::endianness;
+
+template <endianness target_endianness, std::size_t max_alignment,
+          bool is64Bits>
+struct ELFType {
+  static const endianness TargetEndianness = target_endianness;
+  static const std::size_t MaxAlignment = max_alignment;
+  static const bool Is64Bits = is64Bits;
+};
+
+template <typename T, int max_align> struct MaximumAlignment {
+  enum { value = AlignOf<T>::Alignment > max_align ? max_align
+                                                   : AlignOf<T>::Alignment
+  };
+};
+
+// Templates to choose Elf_Addr and Elf_Off depending on is64Bits.
+template <endianness target_endianness, std::size_t max_alignment>
+struct ELFDataTypeTypedefHelperCommon {
+  typedef support::detail::packed_endian_specific_integral<
+      uint16_t, target_endianness,
+      MaximumAlignment<uint16_t, max_alignment>::value> Elf_Half;
+  typedef support::detail::packed_endian_specific_integral<
+      uint32_t, target_endianness,
+      MaximumAlignment<uint32_t, max_alignment>::value> Elf_Word;
+  typedef support::detail::packed_endian_specific_integral<
+      int32_t, target_endianness,
+      MaximumAlignment<int32_t, max_alignment>::value> Elf_Sword;
+  typedef support::detail::packed_endian_specific_integral<
+      uint64_t, target_endianness,
+      MaximumAlignment<uint64_t, max_alignment>::value> Elf_Xword;
+  typedef support::detail::packed_endian_specific_integral<
+      int64_t, target_endianness,
+      MaximumAlignment<int64_t, max_alignment>::value> Elf_Sxword;
+};
+
+template <class ELFT> struct ELFDataTypeTypedefHelper;
+
+/// ELF 32bit types.
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct ELFDataTypeTypedefHelper<ELFType<TargetEndianness, MaxAlign, false> >
+    : ELFDataTypeTypedefHelperCommon<TargetEndianness, MaxAlign> {
+  typedef uint32_t value_type;
+  typedef support::detail::packed_endian_specific_integral<
+      value_type, TargetEndianness,
+      MaximumAlignment<value_type, MaxAlign>::value> Elf_Addr;
+  typedef support::detail::packed_endian_specific_integral<
+      value_type, TargetEndianness,
+      MaximumAlignment<value_type, MaxAlign>::value> Elf_Off;
+};
+
+/// ELF 64bit types.
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct ELFDataTypeTypedefHelper<ELFType<TargetEndianness, MaxAlign, true> >
+    : ELFDataTypeTypedefHelperCommon<TargetEndianness, MaxAlign> {
+  typedef uint64_t value_type;
+  typedef support::detail::packed_endian_specific_integral<
+      value_type, TargetEndianness,
+      MaximumAlignment<value_type, MaxAlign>::value> Elf_Addr;
+  typedef support::detail::packed_endian_specific_integral<
+      value_type, TargetEndianness,
+      MaximumAlignment<value_type, MaxAlign>::value> Elf_Off;
+};
+
+// I really don't like doing this, but the alternative is copypasta.
+#define LLVM_ELF_IMPORT_TYPES(E, M, W)                                         \
+typedef typename ELFDataTypeTypedefHelper<ELFType<E, M, W> >::Elf_Addr         \
+    Elf_Addr;                                                                  \
+typedef typename ELFDataTypeTypedefHelper<ELFType<E, M, W> >::Elf_Off          \
+    Elf_Off;                                                                   \
+typedef typename ELFDataTypeTypedefHelper<ELFType<E, M, W> >::Elf_Half         \
+    Elf_Half;                                                                  \
+typedef typename ELFDataTypeTypedefHelper<ELFType<E, M, W> >::Elf_Word         \
+    Elf_Word;                                                                  \
+typedef typename ELFDataTypeTypedefHelper<ELFType<E, M, W> >::Elf_Sword        \
+    Elf_Sword;                                                                 \
+typedef typename ELFDataTypeTypedefHelper<ELFType<E, M, W> >::Elf_Xword        \
+    Elf_Xword;                                                                 \
+typedef typename ELFDataTypeTypedefHelper<ELFType<E, M, W> >::Elf_Sxword       \
+    Elf_Sxword;
+
+#define LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)                                       \
+  LLVM_ELF_IMPORT_TYPES(ELFT::TargetEndianness, ELFT::MaxAlignment,            \
+                        ELFT::Is64Bits)
+
+// Section header.
+template <class ELFT> struct Elf_Shdr_Base;
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Shdr_Base<ELFType<TargetEndianness, MaxAlign, false> > {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
+  Elf_Word sh_name;      // Section name (index into string table)
+  Elf_Word sh_type;      // Section type (SHT_*)
+  Elf_Word sh_flags;     // Section flags (SHF_*)
+  Elf_Addr sh_addr;      // Address where section is to be loaded
+  Elf_Off sh_offset;     // File offset of section data, in bytes
+  Elf_Word sh_size;      // Size of section, in bytes
+  Elf_Word sh_link;      // Section type-specific header table index link
+  Elf_Word sh_info;      // Section type-specific extra information
+  Elf_Word sh_addralign; // Section address alignment
+  Elf_Word sh_entsize;   // Size of records contained within the section
+};
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Shdr_Base<ELFType<TargetEndianness, MaxAlign, true> > {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
+  Elf_Word sh_name;       // Section name (index into string table)
+  Elf_Word sh_type;       // Section type (SHT_*)
+  Elf_Xword sh_flags;     // Section flags (SHF_*)
+  Elf_Addr sh_addr;       // Address where section is to be loaded
+  Elf_Off sh_offset;      // File offset of section data, in bytes
+  Elf_Xword sh_size;      // Size of section, in bytes
+  Elf_Word sh_link;       // Section type-specific header table index link
+  Elf_Word sh_info;       // Section type-specific extra information
+  Elf_Xword sh_addralign; // Section address alignment
+  Elf_Xword sh_entsize;   // Size of records contained within the section
+};
+
+template <class ELFT>
+struct Elf_Shdr_Impl : Elf_Shdr_Base<ELFT> {
+  using Elf_Shdr_Base<ELFT>::sh_entsize;
+  using Elf_Shdr_Base<ELFT>::sh_size;
+
+  /// @brief Get the number of entities this section contains if it has any.
+  unsigned getEntityCount() const {
+    if (sh_entsize == 0)
+      return 0;
+    return sh_size / sh_entsize;
+  }
+};
+
+template <class ELFT> struct Elf_Sym_Base;
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Sym_Base<ELFType<TargetEndianness, MaxAlign, false> > {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
+  Elf_Word st_name;       // Symbol name (index into string table)
+  Elf_Addr st_value;      // Value or address associated with the symbol
+  Elf_Word st_size;       // Size of the symbol
+  unsigned char st_info;  // Symbol's type and binding attributes
+  unsigned char st_other; // Must be zero; reserved
+  Elf_Half st_shndx;      // Which section (header table index) it's defined in
+};
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Sym_Base<ELFType<TargetEndianness, MaxAlign, true> > {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
+  Elf_Word st_name;       // Symbol name (index into string table)
+  unsigned char st_info;  // Symbol's type and binding attributes
+  unsigned char st_other; // Must be zero; reserved
+  Elf_Half st_shndx;      // Which section (header table index) it's defined in
+  Elf_Addr st_value;      // Value or address associated with the symbol
+  Elf_Xword st_size;      // Size of the symbol
+};
+
+template <class ELFT>
+struct Elf_Sym_Impl : Elf_Sym_Base<ELFT> {
+  using Elf_Sym_Base<ELFT>::st_info;
+
+  // These accessors and mutators correspond to the ELF32_ST_BIND,
+  // ELF32_ST_TYPE, and ELF32_ST_INFO macros defined in the ELF specification:
+  unsigned char getBinding() const { return st_info >> 4; }
+  unsigned char getType() const { return st_info & 0x0f; }
+  void setBinding(unsigned char b) { setBindingAndType(b, getType()); }
+  void setType(unsigned char t) { setBindingAndType(getBinding(), t); }
+  void setBindingAndType(unsigned char b, unsigned char t) {
+    st_info = (b << 4) + (t & 0x0f);
+  }
+};
+
+/// Elf_Versym: This is the structure of entries in the SHT_GNU_versym section
+/// (.gnu.version). This structure is identical for ELF32 and ELF64.
+template <class ELFT>
+struct Elf_Versym_Impl {
+  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+  Elf_Half vs_index; // Version index with flags (e.g. VERSYM_HIDDEN)
+};
+
+template <class ELFT> struct Elf_Verdaux_Impl;
+
+/// Elf_Verdef: This is the structure of entries in the SHT_GNU_verdef section
+/// (.gnu.version_d). This structure is identical for ELF32 and ELF64.
+template <class ELFT>
+struct Elf_Verdef_Impl {
+  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+  typedef Elf_Verdaux_Impl<ELFT> Elf_Verdaux;
+  Elf_Half vd_version; // Version of this structure (e.g. VER_DEF_CURRENT)
+  Elf_Half vd_flags;   // Bitwise flags (VER_DEF_*)
+  Elf_Half vd_ndx;     // Version index, used in .gnu.version entries
+  Elf_Half vd_cnt;     // Number of Verdaux entries
+  Elf_Word vd_hash;    // Hash of name
+  Elf_Word vd_aux;     // Offset to the first Verdaux entry (in bytes)
+  Elf_Word vd_next;    // Offset to the next Verdef entry (in bytes)
+
+  /// Get the first Verdaux entry for this Verdef.
+  const Elf_Verdaux *getAux() const {
+    return reinterpret_cast<const Elf_Verdaux *>((const char *)this + vd_aux);
+  }
+};
+
+/// Elf_Verdaux: This is the structure of auxiliary data in the SHT_GNU_verdef
+/// section (.gnu.version_d). This structure is identical for ELF32 and ELF64.
+template <class ELFT>
+struct Elf_Verdaux_Impl {
+  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+  Elf_Word vda_name; // Version name (offset in string table)
+  Elf_Word vda_next; // Offset to next Verdaux entry (in bytes)
+};
+
+/// Elf_Verneed: This is the structure of entries in the SHT_GNU_verneed
+/// section (.gnu.version_r). This structure is identical for ELF32 and ELF64.
+template <class ELFT>
+struct Elf_Verneed_Impl {
+  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+  Elf_Half vn_version; // Version of this structure (e.g. VER_NEED_CURRENT)
+  Elf_Half vn_cnt;     // Number of associated Vernaux entries
+  Elf_Word vn_file;    // Library name (string table offset)
+  Elf_Word vn_aux;     // Offset to first Vernaux entry (in bytes)
+  Elf_Word vn_next;    // Offset to next Verneed entry (in bytes)
+};
+
+/// Elf_Vernaux: This is the structure of auxiliary data in SHT_GNU_verneed
+/// section (.gnu.version_r). This structure is identical for ELF32 and ELF64.
+template <class ELFT>
+struct Elf_Vernaux_Impl {
+  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+  Elf_Word vna_hash;  // Hash of dependency name
+  Elf_Half vna_flags; // Bitwise Flags (VER_FLAG_*)
+  Elf_Half vna_other; // Version index, used in .gnu.version entries
+  Elf_Word vna_name;  // Dependency name
+  Elf_Word vna_next;  // Offset to next Vernaux entry (in bytes)
+};
+
+/// Elf_Dyn_Base: This structure matches the form of entries in the dynamic
+///               table section (.dynamic) look like.
+template <class ELFT> struct Elf_Dyn_Base;
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Dyn_Base<ELFType<TargetEndianness, MaxAlign, false> > {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
+  Elf_Sword d_tag;
+  union {
+    Elf_Word d_val;
+    Elf_Addr d_ptr;
+  } d_un;
+};
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Dyn_Base<ELFType<TargetEndianness, MaxAlign, true> > {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
+  Elf_Sxword d_tag;
+  union {
+    Elf_Xword d_val;
+    Elf_Addr d_ptr;
+  } d_un;
+};
+
+/// Elf_Dyn_Impl: This inherits from Elf_Dyn_Base, adding getters and setters.
+template <class ELFT>
+struct Elf_Dyn_Impl : Elf_Dyn_Base<ELFT> {
+  using Elf_Dyn_Base<ELFT>::d_tag;
+  using Elf_Dyn_Base<ELFT>::d_un;
+  int64_t getTag() const { return d_tag; }
+  uint64_t getVal() const { return d_un.d_val; }
+  uint64_t getPtr() const { return d_un.ptr; }
+};
+
+// Elf_Rel: Elf Relocation
+template <class ELFT, bool isRela> struct Elf_Rel_Base;
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, false>, false> {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
+  Elf_Addr r_offset; // Location (file byte offset, or program virtual addr)
+  Elf_Word r_info;   // Symbol table index and type of relocation to apply
+
+  uint32_t getRInfo(bool isMips64EL) const {
+    assert(!isMips64EL);
+    return r_info;
+  }
+  void setRInfo(uint32_t R) { r_info = R; }
+};
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, true>, false> {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
+  Elf_Addr r_offset; // Location (file byte offset, or program virtual addr)
+  Elf_Xword r_info;  // Symbol table index and type of relocation to apply
+
+  uint64_t getRInfo(bool isMips64EL) const {
+    uint64_t t = r_info;
+    if (!isMips64EL)
+      return t;
+    // Mips64 little endian has a "special" encoding of r_info. Instead of one
+    // 64 bit little endian number, it is a little endian 32 bit number followed
+    // by a 32 bit big endian number.
+    return (t << 32) | ((t >> 8) & 0xff000000) | ((t >> 24) & 0x00ff0000) |
+           ((t >> 40) & 0x0000ff00) | ((t >> 56) & 0x000000ff);
+  }
+  void setRInfo(uint64_t R) {
+    // FIXME: Add mips64el support.
+    r_info = R;
+  }
+};
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, false>, true> {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
+  Elf_Addr r_offset;  // Location (file byte offset, or program virtual addr)
+  Elf_Word r_info;    // Symbol table index and type of relocation to apply
+  Elf_Sword r_addend; // Compute value for relocatable field by adding this
+
+  uint32_t getRInfo(bool isMips64EL) const {
+    assert(!isMips64EL);
+    return r_info;
+  }
+  void setRInfo(uint32_t R) { r_info = R; }
+};
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Rel_Base<ELFType<TargetEndianness, MaxAlign, true>, true> {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
+  Elf_Addr r_offset;   // Location (file byte offset, or program virtual addr)
+  Elf_Xword r_info;    // Symbol table index and type of relocation to apply
+  Elf_Sxword r_addend; // Compute value for relocatable field by adding this.
+
+  uint64_t getRInfo(bool isMips64EL) const {
+    // Mips64 little endian has a "special" encoding of r_info. Instead of one
+    // 64 bit little endian number, it is a little endian 32 bit number followed
+    // by a 32 bit big endian number.
+    uint64_t t = r_info;
+    if (!isMips64EL)
+      return t;
+    return (t << 32) | ((t >> 8) & 0xff000000) | ((t >> 24) & 0x00ff0000) |
+           ((t >> 40) & 0x0000ff00) | ((t >> 56) & 0x000000ff);
+  }
+  void setRInfo(uint64_t R) {
+    // FIXME: Add mips64el support.
+    r_info = R;
+  }
+};
+
+template <class ELFT, bool isRela> struct Elf_Rel_Impl;
+
+template <endianness TargetEndianness, std::size_t MaxAlign, bool isRela>
+struct Elf_Rel_Impl<ELFType<TargetEndianness, MaxAlign, true>,
+                    isRela> : Elf_Rel_Base<
+    ELFType<TargetEndianness, MaxAlign, true>, isRela> {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
+
+  // These accessors and mutators correspond to the ELF64_R_SYM, ELF64_R_TYPE,
+  // and ELF64_R_INFO macros defined in the ELF specification:
+  uint32_t getSymbol(bool isMips64EL) const {
+    return (uint32_t)(this->getRInfo(isMips64EL) >> 32);
+  }
+  uint32_t getType(bool isMips64EL) const {
+    return (uint32_t)(this->getRInfo(isMips64EL) & 0xffffffffL);
+  }
+  void setSymbol(uint32_t s) { setSymbolAndType(s, getType()); }
+  void setType(uint32_t t) { setSymbolAndType(getSymbol(), t); }
+  void setSymbolAndType(uint32_t s, uint32_t t) {
+    this->setRInfo(((uint64_t)s << 32) + (t & 0xffffffffL));
+  }
+};
+
+template <endianness TargetEndianness, std::size_t MaxAlign, bool isRela>
+struct Elf_Rel_Impl<ELFType<TargetEndianness, MaxAlign, false>,
+                    isRela> : Elf_Rel_Base<
+    ELFType<TargetEndianness, MaxAlign, false>, isRela> {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
+
+  // These accessors and mutators correspond to the ELF32_R_SYM, ELF32_R_TYPE,
+  // and ELF32_R_INFO macros defined in the ELF specification:
+  uint32_t getSymbol(bool isMips64EL) const {
+    return this->getRInfo(isMips64EL) >> 8;
+  }
+  unsigned char getType(bool isMips64EL) const {
+    return (unsigned char)(this->getRInfo(isMips64EL) & 0x0ff);
+  }
+  void setSymbol(uint32_t s) { setSymbolAndType(s, getType()); }
+  void setType(unsigned char t) { setSymbolAndType(getSymbol(), t); }
+  void setSymbolAndType(uint32_t s, unsigned char t) {
+    this->setRInfo((s << 8) + t);
+  }
+};
+
+template <class ELFT>
+struct Elf_Ehdr_Impl {
+  LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
+  unsigned char e_ident[ELF::EI_NIDENT]; // ELF Identification bytes
+  Elf_Half e_type;                       // Type of file (see ET_*)
+  Elf_Half e_machine;   // Required architecture for this file (see EM_*)
+  Elf_Word e_version;   // Must be equal to 1
+  Elf_Addr e_entry;     // Address to jump to in order to start program
+  Elf_Off e_phoff;      // Program header table's file offset, in bytes
+  Elf_Off e_shoff;      // Section header table's file offset, in bytes
+  Elf_Word e_flags;     // Processor-specific flags
+  Elf_Half e_ehsize;    // Size of ELF header, in bytes
+  Elf_Half e_phentsize; // Size of an entry in the program header table
+  Elf_Half e_phnum;     // Number of entries in the program header table
+  Elf_Half e_shentsize; // Size of an entry in the section header table
+  Elf_Half e_shnum;     // Number of entries in the section header table
+  Elf_Half e_shstrndx;  // Section header table index of section name
+                        // string table
+  bool checkMagic() const {
+    return (memcmp(e_ident, ELF::ElfMagic, strlen(ELF::ElfMagic))) == 0;
+  }
+  unsigned char getFileClass() const { return e_ident[ELF::EI_CLASS]; }
+  unsigned char getDataEncoding() const { return e_ident[ELF::EI_DATA]; }
+};
+
+template <class ELFT> struct Elf_Phdr_Impl;
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Phdr_Impl<ELFType<TargetEndianness, MaxAlign, false> > {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, false)
+  Elf_Word p_type;   // Type of segment
+  Elf_Off p_offset;  // FileOffset where segment is located, in bytes
+  Elf_Addr p_vaddr;  // Virtual Address of beginning of segment
+  Elf_Addr p_paddr;  // Physical address of beginning of segment (OS-specific)
+  Elf_Word p_filesz; // Num. of bytes in file image of segment (may be zero)
+  Elf_Word p_memsz;  // Num. of bytes in mem image of segment (may be zero)
+  Elf_Word p_flags;  // Segment flags
+  Elf_Word p_align;  // Segment alignment constraint
+};
+
+template <endianness TargetEndianness, std::size_t MaxAlign>
+struct Elf_Phdr_Impl<ELFType<TargetEndianness, MaxAlign, true> > {
+  LLVM_ELF_IMPORT_TYPES(TargetEndianness, MaxAlign, true)
+  Elf_Word p_type;    // Type of segment
+  Elf_Word p_flags;   // Segment flags
+  Elf_Off p_offset;   // FileOffset where segment is located, in bytes
+  Elf_Addr p_vaddr;   // Virtual Address of beginning of segment
+  Elf_Addr p_paddr;   // Physical address of beginning of segment (OS-specific)
+  Elf_Xword p_filesz; // Num. of bytes in file image of segment (may be zero)
+  Elf_Xword p_memsz;  // Num. of bytes in mem image of segment (may be zero)
+  Elf_Xword p_align;  // Segment alignment constraint
+};
+
+} // end namespace object.
+} // end namespace llvm.
+
+#endif
diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h
index 50435d6..100613a 100644
--- a/include/llvm/Object/MachO.h
+++ b/include/llvm/Object/MachO.h
@@ -18,10 +18,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/MachO.h"
-#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
 namespace object {
@@ -55,7 +53,7 @@ class MachOObjectFile : public ObjectFile {
 public:
   struct LoadCommandInfo {
     const char *Ptr;      // Where in memory the load command is.
-    macho::LoadCommand C; // The command itself.
+    MachO::load_command C; // The command itself.
   };
 
   MachOObjectFile(MemoryBuffer *Object, bool IsLittleEndian, bool Is64Bits,
@@ -69,7 +67,6 @@ public:
   virtual error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const;
   virtual error_code getSymbolType(DataRefImpl Symb,
                                    SymbolRef::Type &Res) const;
-  virtual error_code getSymbolNMTypeChar(DataRefImpl Symb, char &Res) const;
   virtual error_code getSymbolFlags(DataRefImpl Symb, uint32_t &Res) const;
   virtual error_code getSymbolSection(DataRefImpl Symb,
                                       section_iterator &Res) const;
@@ -91,8 +88,8 @@ public:
   virtual error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const;
   virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
                                            bool &Result) const;
-  virtual relocation_iterator getSectionRelBegin(DataRefImpl Sec) const;
-  virtual relocation_iterator getSectionRelEnd(DataRefImpl Sec) const;
+  virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const;
+  virtual relocation_iterator section_rel_end(DataRefImpl Sec) const;
 
   virtual error_code getRelocationNext(DataRefImpl Rel,
                                        RelocationRef &Res) const;
@@ -131,8 +128,8 @@ public:
 
   virtual StringRef getLoadName() const;
 
-  relocation_iterator getSectionRelBegin(unsigned Index) const;
-  relocation_iterator getSectionRelEnd(unsigned Index) const;
+  relocation_iterator section_rel_begin(unsigned Index) const;
+  relocation_iterator section_rel_end(unsigned Index) const;
 
   dice_iterator begin_dices() const;
   dice_iterator end_dices() const;
@@ -148,50 +145,53 @@ public:
   ArrayRef<char> getSectionRawFinalSegmentName(DataRefImpl Sec) const;
 
   // MachO specific Info about relocations.
-  bool isRelocationScattered(const macho::RelocationEntry &RE) const;
-  unsigned getPlainRelocationSymbolNum(const macho::RelocationEntry &RE) const;
-  bool getPlainRelocationExternal(const macho::RelocationEntry &RE) const;
-  bool getScatteredRelocationScattered(const macho::RelocationEntry &RE) const;
-  uint32_t getScatteredRelocationValue(const macho::RelocationEntry &RE) const;
-  unsigned getAnyRelocationAddress(const macho::RelocationEntry &RE) const;
-  unsigned getAnyRelocationPCRel(const macho::RelocationEntry &RE) const;
-  unsigned getAnyRelocationLength(const macho::RelocationEntry &RE) const;
-  unsigned getAnyRelocationType(const macho::RelocationEntry &RE) const;
-  SectionRef getRelocationSection(const macho::RelocationEntry &RE) const;
+  bool isRelocationScattered(const MachO::any_relocation_info &RE) const;
+  unsigned getPlainRelocationSymbolNum(
+                                    const MachO::any_relocation_info &RE) const;
+  bool getPlainRelocationExternal(const MachO::any_relocation_info &RE) const;
+  bool getScatteredRelocationScattered(
+                                    const MachO::any_relocation_info &RE) const;
+  uint32_t getScatteredRelocationValue(
+                                    const MachO::any_relocation_info &RE) const;
+  unsigned getAnyRelocationAddress(const MachO::any_relocation_info &RE) const;
+  unsigned getAnyRelocationPCRel(const MachO::any_relocation_info &RE) const;
+  unsigned getAnyRelocationLength(const MachO::any_relocation_info &RE) const;
+  unsigned getAnyRelocationType(const MachO::any_relocation_info &RE) const;
+  SectionRef getRelocationSection(const MachO::any_relocation_info &RE) const;
 
   // Walk load commands.
   LoadCommandInfo getFirstLoadCommandInfo() const;
   LoadCommandInfo getNextLoadCommandInfo(const LoadCommandInfo &L) const;
 
   // MachO specific structures.
-  macho::Section getSection(DataRefImpl DRI) const;
-  macho::Section64 getSection64(DataRefImpl DRI) const;
-  macho::Section getSection(const LoadCommandInfo &L, unsigned Index) const;
-  macho::Section64 getSection64(const LoadCommandInfo &L, unsigned Index) const;
-  macho::SymbolTableEntry getSymbolTableEntry(DataRefImpl DRI) const;
-  macho::Symbol64TableEntry getSymbol64TableEntry(DataRefImpl DRI) const;
-
-  macho::LinkeditDataLoadCommand
+  MachO::section getSection(DataRefImpl DRI) const;
+  MachO::section_64 getSection64(DataRefImpl DRI) const;
+  MachO::section getSection(const LoadCommandInfo &L, unsigned Index) const;
+  MachO::section_64 getSection64(const LoadCommandInfo &L,unsigned Index) const;
+  MachO::nlist getSymbolTableEntry(DataRefImpl DRI) const;
+  MachO::nlist_64 getSymbol64TableEntry(DataRefImpl DRI) const;
+
+  MachO::linkedit_data_command
   getLinkeditDataLoadCommand(const LoadCommandInfo &L) const;
-  macho::SegmentLoadCommand
+  MachO::segment_command
   getSegmentLoadCommand(const LoadCommandInfo &L) const;
-  macho::Segment64LoadCommand
+  MachO::segment_command_64
   getSegment64LoadCommand(const LoadCommandInfo &L) const;
-  macho::LinkerOptionsLoadCommand
+  MachO::linker_options_command
   getLinkerOptionsLoadCommand(const LoadCommandInfo &L) const;
 
-  macho::RelocationEntry getRelocation(DataRefImpl Rel) const;
-  macho::DataInCodeTableEntry getDice(DataRefImpl Rel) const;
-  macho::Header getHeader() const;
-  macho::Header64Ext getHeader64Ext() const;
-  macho::IndirectSymbolTableEntry
-  getIndirectSymbolTableEntry(const macho::DysymtabLoadCommand &DLC,
+  MachO::any_relocation_info getRelocation(DataRefImpl Rel) const;
+  MachO::data_in_code_entry getDice(DataRefImpl Rel) const;
+  MachO::mach_header getHeader() const;
+  MachO::mach_header_64 getHeader64() const;
+  uint32_t
+  getIndirectSymbolTableEntry(const MachO::dysymtab_command &DLC,
                               unsigned Index) const;
-  macho::DataInCodeTableEntry getDataInCodeTableEntry(uint32_t DataOffset,
-                                                      unsigned Index) const;
-  macho::SymtabLoadCommand getSymtabLoadCommand() const;
-  macho::DysymtabLoadCommand getDysymtabLoadCommand() const;
-  macho::LinkeditDataLoadCommand getDataInCodeLoadCommand() const;
+  MachO::data_in_code_entry getDataInCodeTableEntry(uint32_t DataOffset,
+                                                    unsigned Index) const;
+  MachO::symtab_command getSymtabLoadCommand() const;
+  MachO::dysymtab_command getDysymtabLoadCommand() const;
+  MachO::linkedit_data_command getDataInCodeLoadCommand() const;
 
   StringRef getStringTableData() const;
   bool is64Bit() const;
@@ -225,8 +225,8 @@ inline bool DiceRef::operator<(const DiceRef &Other) const {
 
 inline error_code DiceRef::getNext(DiceRef &Result) const {
   DataRefImpl Rel = DicePimpl;
-  const macho::DataInCodeTableEntry *P =
-    reinterpret_cast<const macho::DataInCodeTableEntry *>(Rel.p);
+  const MachO::data_in_code_entry *P =
+    reinterpret_cast<const MachO::data_in_code_entry *>(Rel.p);
   Rel.p = reinterpret_cast<uintptr_t>(P + 1);
   Result = DiceRef(Rel, OwningObject);
   return object_error::success;
@@ -239,24 +239,24 @@ inline error_code DiceRef::getNext(DiceRef &Result) const {
 inline error_code DiceRef::getOffset(uint32_t &Result) const {
   const MachOObjectFile *MachOOF =
     static_cast<const MachOObjectFile *>(OwningObject);
-  macho::DataInCodeTableEntry Dice = MachOOF->getDice(DicePimpl);
-  Result = Dice.Offset;
+  MachO::data_in_code_entry Dice = MachOOF->getDice(DicePimpl);
+  Result = Dice.offset;
   return object_error::success;
 }
 
 inline error_code DiceRef::getLength(uint16_t &Result) const {
   const MachOObjectFile *MachOOF =
     static_cast<const MachOObjectFile *>(OwningObject);
-  macho::DataInCodeTableEntry Dice = MachOOF->getDice(DicePimpl);
-  Result = Dice.Length;
+  MachO::data_in_code_entry Dice = MachOOF->getDice(DicePimpl);
+  Result = Dice.length;
   return object_error::success;
 }
 
 inline error_code DiceRef::getKind(uint16_t &Result) const {
   const MachOObjectFile *MachOOF =
     static_cast<const MachOObjectFile *>(OwningObject);
-  macho::DataInCodeTableEntry Dice = MachOOF->getDice(DicePimpl);
-  Result = Dice.Kind;
+  MachO::data_in_code_entry Dice = MachOOF->getDice(DicePimpl);
+  Result = Dice.kind;
   return object_error::success;
 }
 
diff --git a/include/llvm/Object/MachOFormat.h b/include/llvm/Object/MachOFormat.h
deleted file mode 100644
index 96ee8a7..0000000
--- a/include/llvm/Object/MachOFormat.h
+++ /dev/null
@@ -1,433 +0,0 @@
-//===- MachOFormat.h - Mach-O Format Structures And Constants ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares various structures and constants which are platform
-// independent and can be shared by any client which wishes to interact with
-// Mach object files.
-//
-// The definitions here are purposely chosen to match the LLVM style as opposed
-// to following the platform specific definition of the format.
-//
-// On a Mach system, see the <mach-o/...> includes for more information, in
-// particular <mach-o/loader.h>.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OBJECT_MACHOFORMAT_H
-#define LLVM_OBJECT_MACHOFORMAT_H
-
-#include "llvm/Support/DataTypes.h"
-
-namespace llvm {
-namespace object {
-
-/// General Mach platform information.
-namespace mach {
-  /// @name CPU Type and Subtype Information
-  /// {
-
-  /// \brief Capability bits used in CPU type encoding.
-  enum CPUTypeFlagsMask {
-    CTFM_ArchMask =  0xFF000000,
-    CTFM_ArchABI64 = 0x01000000
-  };
-
-  /// \brief Machine type IDs used in CPU type encoding.
-  enum CPUTypeMachine {
-    CTM_i386      = 7,
-    CTM_x86_64    = CTM_i386 | CTFM_ArchABI64,
-    CTM_ARM       = 12,
-    CTM_SPARC     = 14,
-    CTM_PowerPC   = 18,
-    CTM_PowerPC64 = CTM_PowerPC | CTFM_ArchABI64
-  };
-
-  /// \brief Capability bits used in CPU subtype encoding.
-  enum CPUSubtypeFlagsMask {
-    CSFM_SubtypeMask =  0xFF000000,
-    CSFM_SubtypeLib64 = 0x80000000
-  };
-
-  /// \brief ARM Machine Subtypes.
-  enum CPUSubtypeARM {
-    CSARM_ALL    = 0,
-    CSARM_V4T    = 5,
-    CSARM_V6     = 6,
-    CSARM_V5TEJ  = 7,
-    CSARM_XSCALE = 8,
-    CSARM_V7     = 9,
-    CSARM_V7F    = 10,
-    CSARM_V7S    = 11,
-    CSARM_V7K    = 12,
-    CSARM_V6M    = 14,
-    CSARM_V7M    = 15,
-    CSARM_V7EM   = 16
-  };
-
-  /// \brief PowerPC Machine Subtypes.
-  enum CPUSubtypePowerPC {
-    CSPPC_ALL = 0
-  };
-
-  /// \brief SPARC Machine Subtypes.
-  enum CPUSubtypeSPARC {
-    CSSPARC_ALL = 0
-  };
-
-  /// \brief x86 Machine Subtypes.
-  enum CPUSubtypeX86 {
-    CSX86_ALL = 3
-  };
-
-  /// @}
-
-} // end namespace mach
-
-/// Format information for Mach object files.
-namespace macho {
-  /// \brief Constants for structure sizes.
-  enum StructureSizes {
-    Header32Size = 28,
-    Header64Size = 32,
-    FatHeaderSize = 8,
-    FatArchHeaderSize = 20,
-    SegmentLoadCommand32Size = 56,
-    SegmentLoadCommand64Size = 72,
-    Section32Size = 68,
-    Section64Size = 80,
-    SymtabLoadCommandSize = 24,
-    DysymtabLoadCommandSize = 80,
-    Nlist32Size = 12,
-    Nlist64Size = 16,
-    RelocationInfoSize = 8,
-    LinkeditLoadCommandSize = 16
-  };
-
-  /// \brief Constants for header magic field.
-  enum HeaderMagic {
-    HM_Object32 = 0xFEEDFACE,  ///< 32-bit mach object file
-    HM_Object64 = 0xFEEDFACF,  ///< 64-bit mach object file
-    HM_Universal = 0xCAFEBABE  ///< Universal object file
-  };
-
-  /// \brief Header common to all Mach object files.
-  struct Header {
-    uint32_t Magic;
-    uint32_t CPUType;
-    uint32_t CPUSubtype;
-    uint32_t FileType;
-    uint32_t NumLoadCommands;
-    uint32_t SizeOfLoadCommands;
-    uint32_t Flags;
-  };
-
-  /// \brief Extended header for 64-bit object files.
-  struct Header64Ext {
-    uint32_t Reserved;
-  };
-
-  /// \brief Header for universal object files.
-  struct FatHeader {
-    uint32_t Magic;
-    uint32_t NumFatArch;
-  };
-
-  /// \brief Header for a single-architecture object file in a
-  /// universal binary.
-  struct FatArchHeader {
-    uint32_t CPUType;
-    uint32_t CPUSubtype;
-    uint32_t Offset;
-    uint32_t Size;
-    uint32_t Align;
-  };
-
-  // See <mach-o/loader.h>.
-  enum HeaderFileType {
-    HFT_Object = 0x1
-  };
-
-  enum HeaderFlags {
-    HF_SubsectionsViaSymbols = 0x2000
-  };
-
-  enum LoadCommandType {
-    LCT_Segment = 0x1,
-    LCT_Symtab = 0x2,
-    LCT_Dysymtab = 0xb,
-    LCT_Segment64 = 0x19,
-    LCT_UUID = 0x1b,
-    LCT_CodeSignature = 0x1d,
-    LCT_SegmentSplitInfo = 0x1e,
-    LCT_FunctionStarts = 0x26,
-    LCT_DataInCode = 0x29,
-    LCT_LinkerOptions = 0x2D
-  };
-
-  /// \brief Load command structure.
-  struct LoadCommand {
-    uint32_t Type;
-    uint32_t Size;
-  };
-
-  /// @name Load Command Structures
-  /// @{
-
-  struct SegmentLoadCommand {
-    uint32_t Type;
-    uint32_t Size;
-    char Name[16];
-    uint32_t VMAddress;
-    uint32_t VMSize;
-    uint32_t FileOffset;
-    uint32_t FileSize;
-    uint32_t MaxVMProtection;
-    uint32_t InitialVMProtection;
-    uint32_t NumSections;
-    uint32_t Flags;
-  };
-
-  struct Segment64LoadCommand {
-    uint32_t Type;
-    uint32_t Size;
-    char Name[16];
-    uint64_t VMAddress;
-    uint64_t VMSize;
-    uint64_t FileOffset;
-    uint64_t FileSize;
-    uint32_t MaxVMProtection;
-    uint32_t InitialVMProtection;
-    uint32_t NumSections;
-    uint32_t Flags;
-  };
-
-  struct SymtabLoadCommand {
-    uint32_t Type;
-    uint32_t Size;
-    uint32_t SymbolTableOffset;
-    uint32_t NumSymbolTableEntries;
-    uint32_t StringTableOffset;
-    uint32_t StringTableSize;
-  };
-
-  struct DysymtabLoadCommand {
-    uint32_t Type;
-    uint32_t Size;
-
-    uint32_t LocalSymbolsIndex;
-    uint32_t NumLocalSymbols;
-
-    uint32_t ExternalSymbolsIndex;
-    uint32_t NumExternalSymbols;
-
-    uint32_t UndefinedSymbolsIndex;
-    uint32_t NumUndefinedSymbols;
-
-    uint32_t TOCOffset;
-    uint32_t NumTOCEntries;
-
-    uint32_t ModuleTableOffset;
-    uint32_t NumModuleTableEntries;
-
-    uint32_t ReferenceSymbolTableOffset;
-    uint32_t NumReferencedSymbolTableEntries;
-
-    uint32_t IndirectSymbolTableOffset;
-    uint32_t NumIndirectSymbolTableEntries;
-
-    uint32_t ExternalRelocationTableOffset;
-    uint32_t NumExternalRelocationTableEntries;
-
-    uint32_t LocalRelocationTableOffset;
-    uint32_t NumLocalRelocationTableEntries;
-  };
-
-  struct LinkeditDataLoadCommand {
-    uint32_t Type;
-    uint32_t Size;
-    uint32_t DataOffset;
-    uint32_t DataSize;
-  };
-
-  struct LinkerOptionsLoadCommand {
-    uint32_t Type;
-    uint32_t Size;
-    uint32_t Count;
-    // Load command is followed by Count number of zero-terminated UTF8 strings,
-    // and then zero-filled to be 4-byte aligned.
-  };
-
-  /// @}
-  /// @name Section Data
-  /// @{
-
-  enum SectionFlags {
-    SF_PureInstructions = 0x80000000
-  };
-
-  struct Section {
-    char Name[16];
-    char SegmentName[16];
-    uint32_t Address;
-    uint32_t Size;
-    uint32_t Offset;
-    uint32_t Align;
-    uint32_t RelocationTableOffset;
-    uint32_t NumRelocationTableEntries;
-    uint32_t Flags;
-    uint32_t Reserved1;
-    uint32_t Reserved2;
-  };
-  struct Section64 {
-    char Name[16];
-    char SegmentName[16];
-    uint64_t Address;
-    uint64_t Size;
-    uint32_t Offset;
-    uint32_t Align;
-    uint32_t RelocationTableOffset;
-    uint32_t NumRelocationTableEntries;
-    uint32_t Flags;
-    uint32_t Reserved1;
-    uint32_t Reserved2;
-    uint32_t Reserved3;
-  };
-
-  /// @}
-  /// @name Symbol Table Entries
-  /// @{
-
-  struct SymbolTableEntry {
-    uint32_t StringIndex;
-    uint8_t Type;
-    uint8_t SectionIndex;
-    uint16_t Flags;
-    uint32_t Value;
-  };
-  // Despite containing a uint64_t, this structure is only 4-byte aligned within
-  // a MachO file.
-#pragma pack(push)
-#pragma pack(4)
-  struct Symbol64TableEntry {
-    uint32_t StringIndex;
-    uint8_t Type;
-    uint8_t SectionIndex;
-    uint16_t Flags;
-    uint64_t Value;
-  };
-#pragma pack(pop)
-
-  /// @}
-  /// @name Data-in-code Table Entry
-  /// @{
-
-  // See <mach-o/loader.h>.
-  enum DataRegionType { Data = 1, JumpTable8, JumpTable16, JumpTable32 };
-  struct DataInCodeTableEntry {
-    uint32_t Offset;  /* from mach_header to start of data region */
-    uint16_t Length;  /* number of bytes in data region */
-    uint16_t Kind;    /* a DataRegionType value  */
-  };
-
-  /// @}
-  /// @name Indirect Symbol Table
-  /// @{
-
-  struct IndirectSymbolTableEntry {
-    uint32_t Index;
-  };
-
-  /// @}
-  /// @name Relocation Data
-  /// @{
-
-  struct RelocationEntry {
-    uint32_t Word0;
-    uint32_t Word1;
-  };
-
-  /// @}
-
-  // See <mach-o/nlist.h>.
-  enum SymbolTypeType {
-    STT_Undefined = 0x00,
-    STT_Absolute  = 0x02,
-    STT_Section   = 0x0e
-  };
-
-  enum SymbolTypeFlags {
-    // If any of these bits are set, then the entry is a stab entry number (see
-    // <mach-o/stab.h>. Otherwise the other masks apply.
-    STF_StabsEntryMask = 0xe0,
-
-    STF_TypeMask       = 0x0e,
-    STF_External       = 0x01,
-    STF_PrivateExtern  = 0x10
-  };
-
-  /// IndirectSymbolFlags - Flags for encoding special values in the indirect
-  /// symbol entry.
-  enum IndirectSymbolFlags {
-    ISF_Local    = 0x80000000,
-    ISF_Absolute = 0x40000000
-  };
-
-  /// RelocationFlags - Special flags for addresses.
-  enum RelocationFlags {
-    RF_Scattered = 0x80000000
-  };
-
-  /// Common relocation info types.
-  enum RelocationInfoType {
-    RIT_Vanilla             = 0,
-    RIT_Pair                = 1,
-    RIT_Difference          = 2
-  };
-
-  /// Generic relocation info types, which are shared by some (but not all)
-  /// platforms.
-  enum RelocationInfoType_Generic {
-    RIT_Generic_PreboundLazyPointer = 3,
-    RIT_Generic_LocalDifference     = 4,
-    RIT_Generic_TLV                 = 5
-  };
-
-  /// X86_64 uses its own relocation types.
-  enum RelocationInfoTypeX86_64 {
-    // Note that x86_64 doesn't even share the common relocation types.
-    RIT_X86_64_Unsigned   = 0,
-    RIT_X86_64_Signed     = 1,
-    RIT_X86_64_Branch     = 2,
-    RIT_X86_64_GOTLoad    = 3,
-    RIT_X86_64_GOT        = 4,
-    RIT_X86_64_Subtractor = 5,
-    RIT_X86_64_Signed1    = 6,
-    RIT_X86_64_Signed2    = 7,
-    RIT_X86_64_Signed4    = 8,
-    RIT_X86_64_TLV        = 9
-  };
-
-  /// ARM uses its own relocation types.
-  enum RelocationInfoTypeARM {
-    RIT_ARM_LocalDifference = 3,
-    RIT_ARM_PreboundLazyPointer = 4,
-    RIT_ARM_Branch24Bit = 5,
-    RIT_ARM_ThumbBranch22Bit = 6,
-    RIT_ARM_ThumbBranch32Bit = 7,
-    RIT_ARM_Half = 8,
-    RIT_ARM_HalfDifference = 9
-
-  };
-
-} // end namespace macho
-
-} // end namespace object
-} // end namespace llvm
-
-#endif
diff --git a/include/llvm/Object/MachOUniversal.h b/include/llvm/Object/MachOUniversal.h
index 5743282..c5d1359 100644
--- a/include/llvm/Object/MachOUniversal.h
+++ b/include/llvm/Object/MachOUniversal.h
@@ -18,7 +18,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Object/Binary.h"
-#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/MachO.h"
 
 namespace llvm {
 namespace object {
@@ -35,7 +35,7 @@ public:
     /// \brief Index of object in the universal binary.
     uint32_t Index;
     /// \brief Descriptor of the object.
-    macho::FatArchHeader Header;
+    MachO::fat_arch Header;
 
   public:
     ObjectForArch(const MachOUniversalBinary *Parent, uint32_t Index);
@@ -50,7 +50,7 @@ public:
     }
 
     ObjectForArch getNext() const { return ObjectForArch(Parent, Index + 1); }
-    uint32_t getCPUType() const { return Header.CPUType; }
+    uint32_t getCPUType() const { return Header.cputype; }
 
     error_code getAsObjectFile(OwningPtr<ObjectFile> &Result) const;
   };
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index f434d63..9aea639 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -192,7 +192,7 @@ public:
     ST_Other
   };
 
-  enum Flags {
+  enum Flags LLVM_ENUM_INT_TYPE(unsigned) {
     SF_None            = 0,
     SF_Undefined       = 1U << 0,  // Symbol is defined in another object file
     SF_Global          = 1U << 1,  // Global symbol
@@ -221,10 +221,6 @@ public:
   error_code getSize(uint64_t &Result) const;
   error_code getType(SymbolRef::Type &Result) const;
 
-  /// Returns the ascii char that should be displayed in a symbol table dump via
-  /// nm for this symbol.
-  error_code getNMTypeChar(char &Result) const;
-
   /// Get symbol flags (bitwise OR of SymbolRef::Flags)
   error_code getFlags(uint32_t &Result) const;
 
@@ -296,7 +292,6 @@ protected:
   virtual error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const = 0;
   virtual error_code getSymbolType(DataRefImpl Symb,
                                    SymbolRef::Type &Res) const = 0;
-  virtual error_code getSymbolNMTypeChar(DataRefImpl Symb, char &Res) const = 0;
   virtual error_code getSymbolFlags(DataRefImpl Symb,
                                     uint32_t &Res) const = 0;
   virtual error_code getSymbolSection(DataRefImpl Symb,
@@ -322,8 +317,8 @@ protected:
   virtual error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const =0;
   virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
                                            bool &Result) const = 0;
-  virtual relocation_iterator getSectionRelBegin(DataRefImpl Sec) const = 0;
-  virtual relocation_iterator getSectionRelEnd(DataRefImpl Sec) const = 0;
+  virtual relocation_iterator section_rel_begin(DataRefImpl Sec) const = 0;
+  virtual relocation_iterator section_rel_end(DataRefImpl Sec) const = 0;
   virtual section_iterator getRelocatedSection(DataRefImpl Sec) const;
 
   // Same as above for RelocationRef.
@@ -431,10 +426,6 @@ inline error_code SymbolRef::getSize(uint64_t &Result) const {
   return OwningObject->getSymbolSize(SymbolPimpl, Result);
 }
 
-inline error_code SymbolRef::getNMTypeChar(char &Result) const {
-  return OwningObject->getSymbolNMTypeChar(SymbolPimpl, Result);
-}
-
 inline error_code SymbolRef::getFlags(uint32_t &Result) const {
   return OwningObject->getSymbolFlags(SymbolPimpl, Result);
 }
@@ -528,11 +519,11 @@ inline error_code SectionRef::containsSymbol(SymbolRef S, bool &Result) const {
 }
 
 inline relocation_iterator SectionRef::begin_relocations() const {
-  return OwningObject->getSectionRelBegin(SectionPimpl);
+  return OwningObject->section_rel_begin(SectionPimpl);
 }
 
 inline relocation_iterator SectionRef::end_relocations() const {
-  return OwningObject->getSectionRelEnd(SectionPimpl);
+  return OwningObject->section_rel_end(SectionPimpl);
 }
 
 inline section_iterator SectionRef::getRelocatedSection() const {
diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h
index b1eb4e6..97912fe 100644
--- a/include/llvm/Object/RelocVisitor.h
+++ b/include/llvm/Object/RelocVisitor.h
@@ -18,7 +18,7 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Object/ObjectFile.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/include/llvm/Option/OptParser.td b/include/llvm/Option/OptParser.td
index 32cc2c0..963389f 100644
--- a/include/llvm/Option/OptParser.td
+++ b/include/llvm/Option/OptParser.td
@@ -44,6 +44,8 @@ def KIND_JOINED_OR_SEPARATE : OptionKind<"JoinedOrSeparate">;
 // An option which is both joined to its (first) value, and followed by its
 // (second) value.
 def KIND_JOINED_AND_SEPARATE : OptionKind<"JoinedAndSeparate">;
+// An option which consumes all remaining arguments if there are any.
+def KIND_REMAINING_ARGS : OptionKind<"RemainingArgs">;
 
 // Define the option flags.
 
diff --git a/include/llvm/Option/OptTable.h b/include/llvm/Option/OptTable.h
index a5b59ce..5035940 100644
--- a/include/llvm/Option/OptTable.h
+++ b/include/llvm/Option/OptTable.h
@@ -51,6 +51,7 @@ private:
   /// \brief The static option information table.
   const Info *OptionInfos;
   unsigned NumOptionInfos;
+  bool IgnoreCase;
 
   unsigned TheInputOptionID;
   unsigned TheUnknownOptionID;
@@ -72,7 +73,8 @@ private:
   }
 
 protected:
-  OptTable(const Info *_OptionInfos, unsigned _NumOptionInfos);
+  OptTable(const Info *_OptionInfos, unsigned _NumOptionInfos,
+           bool _IgnoreCase = false);
 public:
   ~OptTable();
 
diff --git a/include/llvm/Option/Option.h b/include/llvm/Option/Option.h
index 47fd817..03d4774 100644
--- a/include/llvm/Option/Option.h
+++ b/include/llvm/Option/Option.h
@@ -50,6 +50,7 @@ public:
     FlagClass,
     JoinedClass,
     SeparateClass,
+    RemainingArgsClass,
     CommaJoinedClass,
     MultiArgClass,
     JoinedOrSeparateClass,
@@ -149,6 +150,7 @@ public:
     case SeparateClass:
     case MultiArgClass:
     case JoinedOrSeparateClass:
+    case RemainingArgsClass:
       return RenderSeparateStyle;
     }
     llvm_unreachable("Unexpected kind!");
diff --git a/include/llvm/PassManager.h b/include/llvm/PassManager.h
index b6a8186..2a191b3 100644
--- a/include/llvm/PassManager.h
+++ b/include/llvm/PassManager.h
@@ -7,101 +7,33 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the PassManager class.  This class is used to hold,
-// maintain, and optimize execution of Passes.  The PassManager class ensures
-// that analysis results are available before a pass runs, and that Pass's are
-// destroyed when the PassManager is destroyed.
+// This is a legacy redirect header for the old PassManager. It is intended to
+// be used by clients that have not been converted to be aware of the new pass
+// management infrastructure being built for LLVM, which is every client
+// initially. Eventually this header (and the legacy management layer) will go
+// away, but we want to minimize changes to out-of-tree users of LLVM in the
+// interim.
+//
+// Note that this header *must not* be included into the same file as the new
+// pass management infrastructure is included. Things will break spectacularly.
+// If you are starting that conversion, you should switch to explicitly
+// including LegacyPassManager.h and using the legacy namespace.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_PASSMANAGER_H
 #define LLVM_PASSMANAGER_H
 
-#include "llvm/Pass.h"
-#include "llvm/Support/CBindingWrapping.h"
+#include "llvm/IR/LegacyPassManager.h"
 
 namespace llvm {
 
-class Pass;
-class Module;
-
-class PassManagerImpl;
-class FunctionPassManagerImpl;
-
-/// PassManagerBase - An abstract interface to allow code to add passes to
-/// a pass manager without having to hard-code what kind of pass manager
-/// it is.
-class PassManagerBase {
-public:
-  virtual ~PassManagerBase();
-
-  /// add - Add a pass to the queue of passes to run.  This passes ownership of
-  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-  /// will be destroyed as well, so there is no need to delete the pass.  This
-  /// implies that all passes MUST be allocated with 'new'.
-  virtual void add(Pass *P) = 0;
-};
-
-/// PassManager manages ModulePassManagers
-class PassManager : public PassManagerBase {
-public:
-
-  PassManager();
-  ~PassManager();
-
-  /// add - Add a pass to the queue of passes to run.  This passes ownership of
-  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-  /// will be destroyed as well, so there is no need to delete the pass.  This
-  /// implies that all passes MUST be allocated with 'new'.
-  void add(Pass *P);
-
-  /// run - Execute all of the passes scheduled for execution.  Keep track of
-  /// whether any of the passes modifies the module, and if so, return true.
-  bool run(Module &M);
-
-private:
-  /// PassManagerImpl_New is the actual class. PassManager is just the
-  /// wraper to publish simple pass manager interface
-  PassManagerImpl *PM;
-};
-
-/// FunctionPassManager manages FunctionPasses and BasicBlockPassManagers.
-class FunctionPassManager : public PassManagerBase {
-public:
-  /// FunctionPassManager ctor - This initializes the pass manager.  It needs,
-  /// but does not take ownership of, the specified Module.
-  explicit FunctionPassManager(Module *M);
-  ~FunctionPassManager();
-
-  /// add - Add a pass to the queue of passes to run.  This passes
-  /// ownership of the Pass to the PassManager.  When the
-  /// PassManager_X is destroyed, the pass will be destroyed as well, so
-  /// there is no need to delete the pass.
-  /// This implies that all passes MUST be allocated with 'new'.
-  void add(Pass *P);
-
-  /// run - Execute all of the passes scheduled for execution.  Keep
-  /// track of whether any of the passes modifies the function, and if
-  /// so, return true.
-  ///
-  bool run(Function &F);
-
-  /// doInitialization - Run all of the initializers for the function passes.
-  ///
-  bool doInitialization();
-
-  /// doFinalization - Run all of the finalizers for the function passes.
-  ///
-  bool doFinalization();
-
-private:
-  FunctionPassManagerImpl *FPM;
-  Module *M;
-};
-
-// Create wrappers for C Binding types (see CBindingWrapping.h).
-DEFINE_STDCXX_CONVERSION_FUNCTIONS(PassManagerBase, LLVMPassManagerRef)
+// Pull these into the llvm namespace so that existing code that expects it
+// there can find it.
+using legacy::PassManagerBase;
+using legacy::PassManager;
+using legacy::FunctionPassManager;
 
-} // End llvm namespace
+}
 
 #endif
diff --git a/include/llvm/PassManagers.h b/include/llvm/PassManagers.h
deleted file mode 100644
index 7afb0a0..0000000
--- a/include/llvm/PassManagers.h
+++ /dev/null
@@ -1,470 +0,0 @@
-//===- llvm/PassManagers.h - Pass Infrastructure classes  -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the LLVM Pass Manager infrastructure.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_PASSMANAGERS_H
-#define LLVM_PASSMANAGERS_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Pass.h"
-#include <map>
-#include <vector>
-
-//===----------------------------------------------------------------------===//
-// Overview:
-// The Pass Manager Infrastructure manages passes. It's responsibilities are:
-//
-//   o Manage optimization pass execution order
-//   o Make required Analysis information available before pass P is run
-//   o Release memory occupied by dead passes
-//   o If Analysis information is dirtied by a pass then regenerate Analysis
-//     information before it is consumed by another pass.
-//
-// Pass Manager Infrastructure uses multiple pass managers.  They are
-// PassManager, FunctionPassManager, MPPassManager, FPPassManager, BBPassManager.
-// This class hierarchy uses multiple inheritance but pass managers do not
-// derive from another pass manager.
-//
-// PassManager and FunctionPassManager are two top-level pass manager that
-// represents the external interface of this entire pass manager infrastucture.
-//
-// Important classes :
-//
-// [o] class PMTopLevelManager;
-//
-// Two top level managers, PassManager and FunctionPassManager, derive from
-// PMTopLevelManager. PMTopLevelManager manages information used by top level
-// managers such as last user info.
-//
-// [o] class PMDataManager;
-//
-// PMDataManager manages information, e.g. list of available analysis info,
-// used by a pass manager to manage execution order of passes. It also provides
-// a place to implement common pass manager APIs. All pass managers derive from
-// PMDataManager.
-//
-// [o] class BBPassManager : public FunctionPass, public PMDataManager;
-//
-// BBPassManager manages BasicBlockPasses.
-//
-// [o] class FunctionPassManager;
-//
-// This is a external interface used by JIT to manage FunctionPasses. This
-// interface relies on FunctionPassManagerImpl to do all the tasks.
-//
-// [o] class FunctionPassManagerImpl : public ModulePass, PMDataManager,
-//                                     public PMTopLevelManager;
-//
-// FunctionPassManagerImpl is a top level manager. It manages FPPassManagers
-//
-// [o] class FPPassManager : public ModulePass, public PMDataManager;
-//
-// FPPassManager manages FunctionPasses and BBPassManagers
-//
-// [o] class MPPassManager : public Pass, public PMDataManager;
-//
-// MPPassManager manages ModulePasses and FPPassManagers
-//
-// [o] class PassManager;
-//
-// This is a external interface used by various tools to manages passes. It
-// relies on PassManagerImpl to do all the tasks.
-//
-// [o] class PassManagerImpl : public Pass, public PMDataManager,
-//                             public PMTopLevelManager
-//
-// PassManagerImpl is a top level pass manager responsible for managing
-// MPPassManagers.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/PrettyStackTrace.h"
-
-namespace llvm {
-  class Module;
-  class Pass;
-  class StringRef;
-  class Value;
-  class Timer;
-  class PMDataManager;
-
-// enums for debugging strings
-enum PassDebuggingString {
-  EXECUTION_MSG, // "Executing Pass '"
-  MODIFICATION_MSG, // "' Made Modification '"
-  FREEING_MSG, // " Freeing Pass '"
-  ON_BASICBLOCK_MSG, // "'  on BasicBlock '" + PassName + "'...\n"
-  ON_FUNCTION_MSG, // "' on Function '" + FunctionName + "'...\n"
-  ON_MODULE_MSG, // "' on Module '" + ModuleName + "'...\n"
-  ON_REGION_MSG, // " 'on Region ...\n'"
-  ON_LOOP_MSG, // " 'on Loop ...\n'"
-  ON_CG_MSG // "' on Call Graph ...\n'"
-};
-
-/// PassManagerPrettyStackEntry - This is used to print informative information
-/// about what pass is running when/if a stack trace is generated.
-class PassManagerPrettyStackEntry : public PrettyStackTraceEntry {
-  Pass *P;
-  Value *V;
-  Module *M;
-public:
-  explicit PassManagerPrettyStackEntry(Pass *p)
-    : P(p), V(0), M(0) {}  // When P is releaseMemory'd.
-  PassManagerPrettyStackEntry(Pass *p, Value &v)
-    : P(p), V(&v), M(0) {} // When P is run on V
-  PassManagerPrettyStackEntry(Pass *p, Module &m)
-    : P(p), V(0), M(&m) {} // When P is run on M
-
-  /// print - Emit information about this stack frame to OS.
-  virtual void print(raw_ostream &OS) const;
-};
-
-
-//===----------------------------------------------------------------------===//
-// PMStack
-//
-/// PMStack - This class implements a stack data structure of PMDataManager
-/// pointers.
-///
-/// Top level pass managers (see PassManager.cpp) maintain active Pass Managers
-/// using PMStack. Each Pass implements assignPassManager() to connect itself
-/// with appropriate manager. assignPassManager() walks PMStack to find
-/// suitable manager.
-class PMStack {
-public:
-  typedef std::vector<PMDataManager *>::const_reverse_iterator iterator;
-  iterator begin() const { return S.rbegin(); }
-  iterator end() const { return S.rend(); }
-
-  void pop();
-  PMDataManager *top() const { return S.back(); }
-  void push(PMDataManager *PM);
-  bool empty() const { return S.empty(); }
-
-  void dump() const;
-
-private:
-  std::vector<PMDataManager *> S;
-};
-
-
-//===----------------------------------------------------------------------===//
-// PMTopLevelManager
-//
-/// PMTopLevelManager manages LastUser info and collects common APIs used by
-/// top level pass managers.
-class PMTopLevelManager {
-protected:
-  explicit PMTopLevelManager(PMDataManager *PMDM);
-
-  unsigned getNumContainedManagers() const {
-    return (unsigned)PassManagers.size();
-  }
-
-  void initializeAllAnalysisInfo();
-
-private:
-  virtual PMDataManager *getAsPMDataManager() = 0;
-  virtual PassManagerType getTopLevelPassManagerType() = 0;
-
-public:
-  /// Schedule pass P for execution. Make sure that passes required by
-  /// P are run before P is run. Update analysis info maintained by
-  /// the manager. Remove dead passes. This is a recursive function.
-  void schedulePass(Pass *P);
-
-  /// Set pass P as the last user of the given analysis passes.
-  void setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P);
-
-  /// Collect passes whose last user is P
-  void collectLastUses(SmallVectorImpl<Pass *> &LastUses, Pass *P);
-
-  /// Find the pass that implements Analysis AID. Search immutable
-  /// passes and all pass managers. If desired pass is not found
-  /// then return NULL.
-  Pass *findAnalysisPass(AnalysisID AID);
-
-  /// Find analysis usage information for the pass P.
-  AnalysisUsage *findAnalysisUsage(Pass *P);
-
-  virtual ~PMTopLevelManager();
-
-  /// Add immutable pass and initialize it.
-  inline void addImmutablePass(ImmutablePass *P) {
-    P->initializePass();
-    ImmutablePasses.push_back(P);
-  }
-
-  inline SmallVectorImpl<ImmutablePass *>& getImmutablePasses() {
-    return ImmutablePasses;
-  }
-
-  void addPassManager(PMDataManager *Manager) {
-    PassManagers.push_back(Manager);
-  }
-
-  // Add Manager into the list of managers that are not directly
-  // maintained by this top level pass manager
-  inline void addIndirectPassManager(PMDataManager *Manager) {
-    IndirectPassManagers.push_back(Manager);
-  }
-
-  // Print passes managed by this top level manager.
-  void dumpPasses() const;
-  void dumpArguments() const;
-
-  // Active Pass Managers
-  PMStack activeStack;
-
-protected:
-
-  /// Collection of pass managers
-  SmallVector<PMDataManager *, 8> PassManagers;
-
-private:
-
-  /// Collection of pass managers that are not directly maintained
-  /// by this pass manager
-  SmallVector<PMDataManager *, 8> IndirectPassManagers;
-
-  // Map to keep track of last user of the analysis pass.
-  // LastUser->second is the last user of Lastuser->first.
-  DenseMap<Pass *, Pass *> LastUser;
-
-  // Map to keep track of passes that are last used by a pass.
-  // This inverse map is initialized at PM->run() based on
-  // LastUser map.
-  DenseMap<Pass *, SmallPtrSet<Pass *, 8> > InversedLastUser;
-
-  /// Immutable passes are managed by top level manager.
-  SmallVector<ImmutablePass *, 8> ImmutablePasses;
-
-  DenseMap<Pass *, AnalysisUsage *> AnUsageMap;
-};
-
-
-
-//===----------------------------------------------------------------------===//
-// PMDataManager
-
-/// PMDataManager provides the common place to manage the analysis data
-/// used by pass managers.
-class PMDataManager {
-public:
-
-  explicit PMDataManager() : TPM(NULL), Depth(0) {
-    initializeAnalysisInfo();
-  }
-
-  virtual ~PMDataManager();
-
-  virtual Pass *getAsPass() = 0;
-
-  /// Augment AvailableAnalysis by adding analysis made available by pass P.
-  void recordAvailableAnalysis(Pass *P);
-
-  /// verifyPreservedAnalysis -- Verify analysis presreved by pass P.
-  void verifyPreservedAnalysis(Pass *P);
-
-  /// Remove Analysis that is not preserved by the pass
-  void removeNotPreservedAnalysis(Pass *P);
-
-  /// Remove dead passes used by P.
-  void removeDeadPasses(Pass *P, StringRef Msg,
-                        enum PassDebuggingString);
-
-  /// Remove P.
-  void freePass(Pass *P, StringRef Msg,
-                enum PassDebuggingString);
-
-  /// Add pass P into the PassVector. Update
-  /// AvailableAnalysis appropriately if ProcessAnalysis is true.
-  void add(Pass *P, bool ProcessAnalysis = true);
-
-  /// Add RequiredPass into list of lower level passes required by pass P.
-  /// RequiredPass is run on the fly by Pass Manager when P requests it
-  /// through getAnalysis interface.
-  virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
-
-  virtual Pass *getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F);
-
-  /// Initialize available analysis information.
-  void initializeAnalysisInfo() {
-    AvailableAnalysis.clear();
-    for (unsigned i = 0; i < PMT_Last; ++i)
-      InheritedAnalysis[i] = NULL;
-  }
-
-  // Return true if P preserves high level analysis used by other
-  // passes that are managed by this manager.
-  bool preserveHigherLevelAnalysis(Pass *P);
-
-
-  /// Populate RequiredPasses with analysis pass that are required by
-  /// pass P and are available. Populate ReqPassNotAvailable with analysis
-  /// pass that are required by pass P but are not available.
-  void collectRequiredAnalysis(SmallVectorImpl<Pass *> &RequiredPasses,
-                               SmallVectorImpl<AnalysisID> &ReqPassNotAvailable,
-                               Pass *P);
-
-  /// All Required analyses should be available to the pass as it runs!  Here
-  /// we fill in the AnalysisImpls member of the pass so that it can
-  /// successfully use the getAnalysis() method to retrieve the
-  /// implementations it needs.
-  void initializeAnalysisImpl(Pass *P);
-
-  /// Find the pass that implements Analysis AID. If desired pass is not found
-  /// then return NULL.
-  Pass *findAnalysisPass(AnalysisID AID, bool Direction);
-
-  // Access toplevel manager
-  PMTopLevelManager *getTopLevelManager() { return TPM; }
-  void setTopLevelManager(PMTopLevelManager *T) { TPM = T; }
-
-  unsigned getDepth() const { return Depth; }
-  void setDepth(unsigned newDepth) { Depth = newDepth; }
-
-  // Print routines used by debug-pass
-  void dumpLastUses(Pass *P, unsigned Offset) const;
-  void dumpPassArguments() const;
-  void dumpPassInfo(Pass *P, enum PassDebuggingString S1,
-                    enum PassDebuggingString S2, StringRef Msg);
-  void dumpRequiredSet(const Pass *P) const;
-  void dumpPreservedSet(const Pass *P) const;
-
-  unsigned getNumContainedPasses() const {
-    return (unsigned)PassVector.size();
-  }
-
-  virtual PassManagerType getPassManagerType() const {
-    assert ( 0 && "Invalid use of getPassManagerType");
-    return PMT_Unknown;
-  }
-
-  DenseMap<AnalysisID, Pass*> *getAvailableAnalysis() {
-    return &AvailableAnalysis;
-  }
-
-  // Collect AvailableAnalysis from all the active Pass Managers.
-  void populateInheritedAnalysis(PMStack &PMS) {
-    unsigned Index = 0;
-    for (PMStack::iterator I = PMS.begin(), E = PMS.end();
-         I != E; ++I)
-      InheritedAnalysis[Index++] = (*I)->getAvailableAnalysis();
-  }
-
-protected:
-
-  // Top level manager.
-  PMTopLevelManager *TPM;
-
-  // Collection of pass that are managed by this manager
-  SmallVector<Pass *, 16> PassVector;
-
-  // Collection of Analysis provided by Parent pass manager and
-  // used by current pass manager. At at time there can not be more
-  // then PMT_Last active pass mangers.
-  DenseMap<AnalysisID, Pass *> *InheritedAnalysis[PMT_Last];
-
-  /// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions
-  /// or higher is specified.
-  bool isPassDebuggingExecutionsOrMore() const;
-
-private:
-  void dumpAnalysisUsage(StringRef Msg, const Pass *P,
-                         const AnalysisUsage::VectorType &Set) const;
-
-  // Set of available Analysis. This information is used while scheduling
-  // pass. If a pass requires an analysis which is not available then
-  // the required analysis pass is scheduled to run before the pass itself is
-  // scheduled to run.
-  DenseMap<AnalysisID, Pass*> AvailableAnalysis;
-
-  // Collection of higher level analysis used by the pass managed by
-  // this manager.
-  SmallVector<Pass *, 8> HigherLevelAnalysis;
-
-  unsigned Depth;
-};
-
-//===----------------------------------------------------------------------===//
-// FPPassManager
-//
-/// FPPassManager manages BBPassManagers and FunctionPasses.
-/// It batches all function passes and basic block pass managers together and
-/// sequence them to process one function at a time before processing next
-/// function.
-class FPPassManager : public ModulePass, public PMDataManager {
-public:
-  static char ID;
-  explicit FPPassManager()
-  : ModulePass(ID), PMDataManager() { }
-
-  /// run - Execute all of the passes scheduled for execution.  Keep track of
-  /// whether any of the passes modifies the module, and if so, return true.
-  bool runOnFunction(Function &F);
-  bool runOnModule(Module &M);
-
-  /// cleanup - After running all passes, clean up pass manager cache.
-  void cleanup();
-
-  /// doInitialization - Overrides ModulePass doInitialization for global
-  /// initialization tasks
-  ///
-  using ModulePass::doInitialization;
-
-  /// doInitialization - Run all of the initializers for the function passes.
-  ///
-  bool doInitialization(Module &M);
-
-  /// doFinalization - Overrides ModulePass doFinalization for global
-  /// finalization tasks
-  /// 
-  using ModulePass::doFinalization;
-  
-  /// doFinalization - Run all of the finalizers for the function passes.
-  ///
-  bool doFinalization(Module &M);
-
-  virtual PMDataManager *getAsPMDataManager() { return this; }
-  virtual Pass *getAsPass() { return this; }
-
-  /// Pass Manager itself does not invalidate any analysis info.
-  void getAnalysisUsage(AnalysisUsage &Info) const {
-    Info.setPreservesAll();
-  }
-
-  // Print passes managed by this manager
-  void dumpPassStructure(unsigned Offset);
-
-  virtual const char *getPassName() const {
-    return "Function Pass Manager";
-  }
-
-  FunctionPass *getContainedPass(unsigned N) {
-    assert ( N < PassVector.size() && "Pass number out of range!");
-    FunctionPass *FP = static_cast<FunctionPass *>(PassVector[N]);
-    return FP;
-  }
-
-  virtual PassManagerType getPassManagerType() const {
-    return PMT_FunctionPassManager;
-  }
-};
-
-Timer *getPassTimer(Pass *);
-
-}
-
-#endif
diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h
index 3243fd9..397f50f 100644
--- a/include/llvm/Support/Allocator.h
+++ b/include/llvm/Support/Allocator.h
@@ -99,6 +99,9 @@ class BumpPtrAllocator {
   /// allocate a separate slab.
   size_t SizeThreshold;
 
+  /// \brief the default allocator used if one is not provided
+  MallocSlabAllocator DefaultSlabAllocator;
+
   /// Allocator - The underlying allocator we use to get slabs of memory.  This
   /// defaults to MallocSlabAllocator, which wraps malloc, but it could be
   /// changed to use a custom allocator.
@@ -133,12 +136,10 @@ class BumpPtrAllocator {
   /// one.
   void DeallocateSlabs(MemSlab *Slab);
 
-  static MallocSlabAllocator DefaultSlabAllocator;
-
   template<typename T> friend class SpecificBumpPtrAllocator;
 public:
-  BumpPtrAllocator(size_t size = 4096, size_t threshold = 4096,
-                   SlabAllocator &allocator = DefaultSlabAllocator);
+  BumpPtrAllocator(size_t size = 4096, size_t threshold = 4096);
+  BumpPtrAllocator(size_t size, size_t threshold, SlabAllocator &allocator);
   ~BumpPtrAllocator();
 
   /// Reset - Deallocate all but the current slab and reset the current pointer
@@ -189,8 +190,10 @@ template <typename T>
 class SpecificBumpPtrAllocator {
   BumpPtrAllocator Allocator;
 public:
-  SpecificBumpPtrAllocator(size_t size = 4096, size_t threshold = 4096,
-              SlabAllocator &allocator = BumpPtrAllocator::DefaultSlabAllocator)
+  SpecificBumpPtrAllocator(size_t size = 4096, size_t threshold = 4096)
+    : Allocator(size, threshold) {}
+  SpecificBumpPtrAllocator(size_t size, size_t threshold,
+                           SlabAllocator &allocator)
     : Allocator(size, threshold, allocator) {}
 
   ~SpecificBumpPtrAllocator() {
diff --git a/include/llvm/Support/BlockFrequency.h b/include/llvm/Support/BlockFrequency.h
index 666b3c8..21879e7 100644
--- a/include/llvm/Support/BlockFrequency.h
+++ b/include/llvm/Support/BlockFrequency.h
@@ -27,8 +27,10 @@ class BlockFrequency {
   uint64_t Frequency;
   static const int64_t ENTRY_FREQ = 1 << 14;
 
-  // Scale frequency by N/D, saturating on overflow.
-  void scale(uint32_t N, uint32_t D);
+  /// \brief Scale the given BlockFrequency by N/D. Return the remainder from
+  /// the division by D. Upon overflow, the routine will saturate and
+  /// additionally will return the remainder set to D.
+  uint32_t scale(uint32_t N, uint32_t D);
 
 public:
   BlockFrequency(uint64_t Freq = 0) : Frequency(Freq) { }
@@ -57,6 +59,10 @@ public:
   BlockFrequency &operator+=(const BlockFrequency &Freq);
   const BlockFrequency operator+(const BlockFrequency &Freq) const;
 
+  /// \brief Scale the given BlockFrequency by N/D. Return the remainder from
+  /// the division by D. Upon overflow, the routine will saturate.
+  uint32_t scale(const BranchProbability &Prob);
+
   bool operator<(const BlockFrequency &RHS) const {
     return Frequency < RHS.Frequency;
   }
diff --git a/include/llvm/Support/COFF.h b/include/llvm/Support/COFF.h
index a39a3f9..9cc3989 100644
--- a/include/llvm/Support/COFF.h
+++ b/include/llvm/Support/COFF.h
@@ -222,7 +222,7 @@ namespace COFF {
     uint32_t Characteristics;
   };
 
-  enum SectionCharacteristics {
+  enum SectionCharacteristics LLVM_ENUM_INT_TYPE(uint32_t) {
     SC_Invalid = 0xffffffff,
 
     IMAGE_SCN_TYPE_NO_PAD            = 0x00000008,
@@ -511,7 +511,9 @@ namespace COFF {
     IMAGE_SUBSYSTEM_NATIVE = 1, ///< Device drivers and native Windows processes
     IMAGE_SUBSYSTEM_WINDOWS_GUI = 2, ///< The Windows GUI subsystem.
     IMAGE_SUBSYSTEM_WINDOWS_CUI = 3, ///< The Windows character subsystem.
+    IMAGE_SUBSYSTEM_OS2_CUI = 5, ///< The OS/2 character subsytem.
     IMAGE_SUBSYSTEM_POSIX_CUI = 7, ///< The POSIX character subsystem.
+    IMAGE_SUBSYSTEM_NATIVE_WINDOWS = 8, ///< Native Windows 9x driver.
     IMAGE_SUBSYSTEM_WINDOWS_CE_GUI = 9, ///< Windows CE.
     IMAGE_SUBSYSTEM_EFI_APPLICATION = 10, ///< An EFI application.
     IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER = 11, ///< An EFI driver with boot
@@ -519,7 +521,8 @@ namespace COFF {
     IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER = 12, ///< An EFI driver with run-time
                                              ///  services.
     IMAGE_SUBSYSTEM_EFI_ROM = 13, ///< An EFI ROM image.
-    IMAGE_SUBSYSTEM_XBOX = 14 ///< XBOX.
+    IMAGE_SUBSYSTEM_XBOX = 14, ///< XBOX.
+    IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION = 16 ///< A BCD application.
   };
 
   enum DLLCharacteristics {
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index c0bfbae..4efb6a6 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -350,6 +350,9 @@ struct cat {
 struct GenericOptionValue {
   virtual ~GenericOptionValue() {}
   virtual bool compare(const GenericOptionValue &V) const = 0;
+
+private:
+  virtual void anchor();
 };
 
 template<class DataType> struct OptionValue;
@@ -1752,6 +1755,7 @@ void getRegisteredOptions(StringMap<Option*> &Map);
 /// \brief Saves strings in the inheritor's stable storage and returns a stable
 /// raw character pointer.
 class StringSaver {
+  virtual void anchor();
 public:
   virtual const char *SaveString(const char *Str) = 0;
   virtual ~StringSaver() {};  // Pacify -Wnon-virtual-dtor.
diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index feac934..860f43e 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h
@@ -179,6 +179,12 @@
 #define LLVM_ATTRIBUTE_USED
 #endif
 
+#if __has_attribute(warn_unused_result) || __GNUC_PREREQ(3, 4)
+#define LLVM_ATTRIBUTE_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#else
+#define LLVM_ATTRIBUTE_UNUSED_RESULT
+#endif
+
 // Some compilers warn about unused functions. When a function is sometimes
 // used or not depending on build settings (e.g. a function only called from
 // within "assert"), this attribute can be used to suppress such warnings.
@@ -385,4 +391,24 @@
 # define LLVM_STATIC_ASSERT(expr, msg)
 #endif
 
+/// \macro LLVM_ENUM_INT_TYPE
+/// \brief Expands to colon followed by the given integral type on compilers
+/// which support C++11 strong enums.  This can be used to make enums unsigned
+/// with MSVC.
+#if __has_feature(cxx_strong_enums)
+# define LLVM_ENUM_INT_TYPE(intty) : intty
+#elif defined(_MSC_VER) && _MSC_VER >= 1600  // Added in MSVC 2010.
+# define LLVM_ENUM_INT_TYPE(intty) : intty
+#else
+# define LLVM_ENUM_INT_TYPE(intty)
+#endif
+
+/// \brief Does the compiler support generalized initializers (using braced
+/// lists and std::initializer_list).
+#if __has_feature(cxx_generalized_initializers)
+#define LLVM_HAS_INITIALIZER_LISTS 1
+#else
+#define LLVM_HAS_INITIALIZER_LISTS 0
+#endif
+
 #endif
diff --git a/include/llvm/Support/Compression.h b/include/llvm/Support/Compression.h
index 9b1142d..bef9146 100644
--- a/include/llvm/Support/Compression.h
+++ b/include/llvm/Support/Compression.h
@@ -50,6 +50,8 @@ Status uncompress(StringRef InputBuffer,
                   OwningPtr<MemoryBuffer> &UncompressedBuffer,
                   size_t UncompressedSize);
 
+uint32_t crc32(StringRef Buffer);
+
 }  // End of namespace zlib
 
 } // End of namespace llvm
diff --git a/include/llvm/Support/Debug.h b/include/llvm/Support/Debug.h
index 896fe84..2702408 100644
--- a/include/llvm/Support/Debug.h
+++ b/include/llvm/Support/Debug.h
@@ -26,9 +26,9 @@
 #ifndef LLVM_SUPPORT_DEBUG_H
 #define LLVM_SUPPORT_DEBUG_H
 
-namespace llvm {
+#include "llvm/Support/raw_ostream.h"
 
-class raw_ostream;
+namespace llvm {
 
 /// DEBUG_TYPE macro - Files can specify a DEBUG_TYPE as a string, which causes
 /// all of their DEBUG statements to be activatable with -debug-only=thatstring.
diff --git a/include/llvm/Support/DebugLoc.h b/include/llvm/Support/DebugLoc.h
index f35d407..05f31d7 100644
--- a/include/llvm/Support/DebugLoc.h
+++ b/include/llvm/Support/DebugLoc.h
@@ -15,6 +15,8 @@
 #ifndef LLVM_SUPPORT_DEBUGLOC_H
 #define LLVM_SUPPORT_DEBUGLOC_H
 
+#include "llvm/Support/DataTypes.h"
+
 namespace llvm {
   template <typename T> struct DenseMapInfo;
   class MDNode;
@@ -45,7 +47,7 @@ namespace llvm {
     /// LineCol - This 32-bit value encodes the line and column number for the
     /// location, encoded as 24-bits for line and 8 bits for col.  A value of 0
     /// for either means unknown.
-    unsigned LineCol;
+    uint32_t LineCol;
 
     /// ScopeIdx - This is an opaque ID# for Scope/InlinedAt information,
     /// decoded by LLVMContext.  0 is unknown.
diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h
index d0e2322..23bbd1c 100644
--- a/include/llvm/Support/Dwarf.h
+++ b/include/llvm/Support/Dwarf.h
@@ -16,60 +16,59 @@
 #ifndef LLVM_SUPPORT_DWARF_H
 #define LLVM_SUPPORT_DWARF_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
 
-
 namespace llvm {
 
 //===----------------------------------------------------------------------===//
 // Debug info constants.
 
-enum {
-  LLVMDebugVersion = (12 << 16),        // Current version of debug information.
-  LLVMDebugVersion11 = (11 << 16),      // Constant for version 11.
-  LLVMDebugVersion10 = (10 << 16),      // Constant for version 10.
-  LLVMDebugVersion9 = (9 << 16),        // Constant for version 9.
-  LLVMDebugVersion8 = (8 << 16),        // Constant for version 8.
-  LLVMDebugVersion7 = (7 << 16),        // Constant for version 7.
-  LLVMDebugVersion6 = (6 << 16),        // Constant for version 6.
-  LLVMDebugVersion5 = (5 << 16),        // Constant for version 5.
-  LLVMDebugVersion4 = (4 << 16),        // Constant for version 4.
-  LLVMDebugVersionMask = 0xffff0000     // Mask for version number.
+enum LLVM_ENUM_INT_TYPE(uint32_t) {
+  LLVMDebugVersion = (12 << 16),    // Current version of debug information.
+  LLVMDebugVersion11 = (11 << 16),  // Constant for version 11.
+  LLVMDebugVersion10 = (10 << 16),  // Constant for version 10.
+  LLVMDebugVersion9 = (9 << 16),    // Constant for version 9.
+  LLVMDebugVersion8 = (8 << 16),    // Constant for version 8.
+  LLVMDebugVersion7 = (7 << 16),    // Constant for version 7.
+  LLVMDebugVersion6 = (6 << 16),    // Constant for version 6.
+  LLVMDebugVersion5 = (5 << 16),    // Constant for version 5.
+  LLVMDebugVersion4 = (4 << 16),    // Constant for version 4.
+  LLVMDebugVersionMask = 0xffff0000 // Mask for version number.
 };
 
 namespace dwarf {
 
 //===----------------------------------------------------------------------===//
 // Dwarf constants as gleaned from the DWARF Debugging Information Format V.4
-// reference manual http://dwarf.freestandards.org .
+// reference manual http://dwarf.freestandards.org.
 //
 
 // Do not mix the following two enumerations sets.  DW_TAG_invalid changes the
 // enumeration base type.
 
-enum llvm_dwarf_constants {
+enum LLVMConstants LLVM_ENUM_INT_TYPE(uint32_t) {
   // llvm mock tags
-  DW_TAG_invalid = ~0U,                 // Tag for invalid results.
+  DW_TAG_invalid = ~0U, // Tag for invalid results.
 
-  DW_TAG_auto_variable = 0x100,         // Tag for local (auto) variables.
-  DW_TAG_arg_variable = 0x101,          // Tag for argument variables.
+  DW_TAG_auto_variable = 0x100, // Tag for local (auto) variables.
+  DW_TAG_arg_variable = 0x101,  // Tag for argument variables.
 
-  DW_TAG_user_base = 0x1000,            // Recommended base for user tags.
+  DW_TAG_user_base = 0x1000, // Recommended base for user tags.
 
-  DW_CIE_VERSION = 1                    // Common frame information version.
+  DWARF_VERSION = 4,       // Default dwarf version we output.
+  DW_CIE_VERSION = 1,      // Common frame information version.
+  DW_PUBTYPES_VERSION = 2, // Section version number for .debug_pubtypes.
+  DW_PUBNAMES_VERSION = 2, // Section version number for .debug_pubnames.
+  DW_ARANGES_VERSION = 2   // Section version number for .debug_aranges.
 };
 
-
 // Special ID values that distinguish a CIE from a FDE in DWARF CFI.
 // Not inside an enum because a 64-bit value is needed.
 const uint32_t DW_CIE_ID = UINT32_MAX;
 const uint64_t DW64_CIE_ID = UINT64_MAX;
 
-
-enum dwarf_constants {
-  DWARF_VERSION = 2,
-
-  // Tags
+enum Tag LLVM_ENUM_INT_TYPE(uint16_t) {
   DW_TAG_array_type = 0x01,
   DW_TAG_class_type = 0x02,
   DW_TAG_entry_point = 0x03,
@@ -139,12 +138,38 @@ enum dwarf_constants {
   DW_TAG_GNU_formal_parameter_pack = 0x4108,
   DW_TAG_lo_user = 0x4080,
   DW_TAG_APPLE_property = 0x4200,
-  DW_TAG_hi_user = 0xffff,
-
-  // Children flag
-  DW_CHILDREN_no = 0x00,
-  DW_CHILDREN_yes = 0x01,
+  DW_TAG_hi_user = 0xffff
+};
 
+inline bool isType(Tag T) {
+  switch (T) {
+  case DW_TAG_array_type:
+  case DW_TAG_class_type:
+  case DW_TAG_interface_type:
+  case DW_TAG_enumeration_type:
+  case DW_TAG_pointer_type:
+  case DW_TAG_reference_type:
+  case DW_TAG_rvalue_reference_type:
+  case DW_TAG_string_type:
+  case DW_TAG_structure_type:
+  case DW_TAG_subroutine_type:
+  case DW_TAG_union_type:
+  case DW_TAG_ptr_to_member_type:
+  case DW_TAG_set_type:
+  case DW_TAG_subrange_type:
+  case DW_TAG_base_type:
+  case DW_TAG_const_type:
+  case DW_TAG_file_type:
+  case DW_TAG_packed_type:
+  case DW_TAG_volatile_type:
+  case DW_TAG_typedef:
+    return true;
+  default:
+    return false;
+  }
+}
+
+enum Attribute LLVM_ENUM_INT_TYPE(uint16_t) {
   // Attributes
   DW_AT_sibling = 0x01,
   DW_AT_location = 0x02,
@@ -295,8 +320,10 @@ enum dwarf_constants {
   DW_AT_APPLE_property_setter = 0x3fea,
   DW_AT_APPLE_property_attribute = 0x3feb,
   DW_AT_APPLE_objc_complete_type = 0x3fec,
-  DW_AT_APPLE_property = 0x3fed,
+  DW_AT_APPLE_property = 0x3fed
+};
 
+enum Form LLVM_ENUM_INT_TYPE(uint16_t) {
   // Attribute form encodings
   DW_FORM_addr = 0x01,
   DW_FORM_block2 = 0x03,
@@ -326,8 +353,10 @@ enum dwarf_constants {
 
   // Extensions for Fission proposal
   DW_FORM_GNU_addr_index = 0x1f01,
-  DW_FORM_GNU_str_index = 0x1f02,
+  DW_FORM_GNU_str_index = 0x1f02
+};
 
+enum LocationAtom {
   // Operation encodings
   DW_OP_addr = 0x03,
   DW_OP_deref = 0x06,
@@ -486,10 +515,15 @@ enum dwarf_constants {
   DW_OP_lo_user = 0xe0,
   DW_OP_hi_user = 0xff,
 
+  // Extensions for GNU-style thread-local storage.
+  DW_OP_GNU_push_tls_address = 0xe0,
+
   // Extensions for Fission proposal.
   DW_OP_GNU_addr_index = 0xfb,
-  DW_OP_GNU_const_index = 0xfc,
+  DW_OP_GNU_const_index = 0xfc
+};
 
+enum TypeKind {
   // Encoding attribute values
   DW_ATE_address = 0x01,
   DW_ATE_boolean = 0x02,
@@ -508,37 +542,49 @@ enum dwarf_constants {
   DW_ATE_decimal_float = 0x0f,
   DW_ATE_UTF = 0x10,
   DW_ATE_lo_user = 0x80,
-  DW_ATE_hi_user = 0xff,
+  DW_ATE_hi_user = 0xff
+};
 
+enum DecimalSignEncoding {
   // Decimal sign attribute values
   DW_DS_unsigned = 0x01,
   DW_DS_leading_overpunch = 0x02,
   DW_DS_trailing_overpunch = 0x03,
   DW_DS_leading_separate = 0x04,
-  DW_DS_trailing_separate = 0x05,
+  DW_DS_trailing_separate = 0x05
+};
 
+enum EndianityEncoding {
   // Endianity attribute values
   DW_END_default = 0x00,
   DW_END_big = 0x01,
   DW_END_little = 0x02,
   DW_END_lo_user = 0x40,
-  DW_END_hi_user = 0xff,
+  DW_END_hi_user = 0xff
+};
 
+enum AccessAttribute {
   // Accessibility codes
   DW_ACCESS_public = 0x01,
   DW_ACCESS_protected = 0x02,
-  DW_ACCESS_private = 0x03,
+  DW_ACCESS_private = 0x03
+};
 
+enum VisibilityAttribute {
   // Visibility codes
   DW_VIS_local = 0x01,
   DW_VIS_exported = 0x02,
-  DW_VIS_qualified = 0x03,
+  DW_VIS_qualified = 0x03
+};
 
+enum VirtualityAttribute {
   // Virtuality codes
   DW_VIRTUALITY_none = 0x00,
   DW_VIRTUALITY_virtual = 0x01,
-  DW_VIRTUALITY_pure_virtual = 0x02,
+  DW_VIRTUALITY_pure_virtual = 0x02
+};
 
+enum SourceLanguage {
   // Language names
   DW_LANG_C89 = 0x0001,
   DW_LANG_C = 0x0002,
@@ -562,35 +608,47 @@ enum dwarf_constants {
   DW_LANG_Python = 0x0014,
   DW_LANG_lo_user = 0x8000,
   DW_LANG_Mips_Assembler = 0x8001,
-  DW_LANG_hi_user = 0xffff,
+  DW_LANG_hi_user = 0xffff
+};
 
+enum CaseSensitivity {
   // Identifier case codes
   DW_ID_case_sensitive = 0x00,
   DW_ID_up_case = 0x01,
   DW_ID_down_case = 0x02,
-  DW_ID_case_insensitive = 0x03,
+  DW_ID_case_insensitive = 0x03
+};
 
+enum CallingConvention {
   // Calling convention codes
   DW_CC_normal = 0x01,
   DW_CC_program = 0x02,
   DW_CC_nocall = 0x03,
   DW_CC_lo_user = 0x40,
-  DW_CC_hi_user = 0xff,
+  DW_CC_hi_user = 0xff
+};
 
+enum InlineAttribute {
   // Inline codes
   DW_INL_not_inlined = 0x00,
   DW_INL_inlined = 0x01,
   DW_INL_declared_not_inlined = 0x02,
-  DW_INL_declared_inlined = 0x03,
+  DW_INL_declared_inlined = 0x03
+};
 
+enum ArrayDimensionOrdering {
   // Array ordering
   DW_ORD_row_major = 0x00,
-  DW_ORD_col_major = 0x01,
+  DW_ORD_col_major = 0x01
+};
 
+enum DiscriminantList {
   // Discriminant descriptor values
   DW_DSC_label = 0x00,
-  DW_DSC_range = 0x01,
+  DW_DSC_range = 0x01
+};
 
+enum LineNumberOps {
   // Line Number Standard Opcode Encodings
   DW_LNS_extended_op = 0x00,
   DW_LNS_copy = 0x01,
@@ -604,23 +662,29 @@ enum dwarf_constants {
   DW_LNS_fixed_advance_pc = 0x09,
   DW_LNS_set_prologue_end = 0x0a,
   DW_LNS_set_epilogue_begin = 0x0b,
-  DW_LNS_set_isa = 0x0c,
+  DW_LNS_set_isa = 0x0c
+};
 
+enum LineNumberExtendedOps {
   // Line Number Extended Opcode Encodings
   DW_LNE_end_sequence = 0x01,
   DW_LNE_set_address = 0x02,
   DW_LNE_define_file = 0x03,
   DW_LNE_set_discriminator = 0x04,
   DW_LNE_lo_user = 0x80,
-  DW_LNE_hi_user = 0xff,
+  DW_LNE_hi_user = 0xff
+};
 
+enum MacinfoRecordType {
   // Macinfo Type Encodings
   DW_MACINFO_define = 0x01,
   DW_MACINFO_undef = 0x02,
   DW_MACINFO_start_file = 0x03,
   DW_MACINFO_end_file = 0x04,
-  DW_MACINFO_vendor_ext = 0xff,
+  DW_MACINFO_vendor_ext = 0xff
+};
 
+enum CallFrameInfo {
   // Call frame instruction encodings
   DW_CFA_extended = 0x00,
   DW_CFA_nop = 0x00,
@@ -653,7 +717,13 @@ enum dwarf_constants {
   DW_CFA_GNU_window_save = 0x2d,
   DW_CFA_GNU_args_size = 0x2e,
   DW_CFA_lo_user = 0x1c,
-  DW_CFA_hi_user = 0x3f,
+  DW_CFA_hi_user = 0x3f
+};
+
+enum Constants {
+  // Children flag
+  DW_CHILDREN_no = 0x00,
+  DW_CHILDREN_yes = 0x01,
 
   DW_EH_PE_absptr = 0x00,
   DW_EH_PE_omit = 0xff,
@@ -671,8 +741,10 @@ enum dwarf_constants {
   DW_EH_PE_datarel = 0x30,
   DW_EH_PE_funcrel = 0x40,
   DW_EH_PE_aligned = 0x50,
-  DW_EH_PE_indirect = 0x80,
+  DW_EH_PE_indirect = 0x80
+};
 
+enum ApplePropertyAttributes {
   // Apple Objective-C Property Attributes
   DW_APPLE_PROPERTY_readonly = 0x01,
   DW_APPLE_PROPERTY_readwrite = 0x02,
@@ -765,6 +837,84 @@ const char *MacinfoString(unsigned Encoding);
 /// CallFrameString - Return the string for the specified call frame instruction
 /// encodings.
 const char *CallFrameString(unsigned Encoding);
+
+// Constants for the DWARF5 Accelerator Table Proposal
+enum AcceleratorTable {
+  // Data layout descriptors.
+  DW_ATOM_null = 0u,       // Marker as the end of a list of atoms.
+  DW_ATOM_die_offset = 1u, // DIE offset in the debug_info section.
+  DW_ATOM_cu_offset = 2u, // Offset of the compile unit header that contains the
+                          // item in question.
+  DW_ATOM_die_tag = 3u,   // A tag entry.
+  DW_ATOM_type_flags = 4u, // Set of flags for a type.
+
+  // DW_ATOM_type_flags values.
+
+  // Always set for C++, only set for ObjC if this is the @implementation for a
+  // class.
+  DW_FLAG_type_implementation = 2u,
+
+  // Hash functions.
+
+  // Daniel J. Bernstein hash.
+  DW_hash_function_djb = 0u
+};
+
+/// AtomTypeString - Return the string for the specified Atom type.
+const char *AtomTypeString(unsigned Atom);
+
+// Constants for the GNU pubnames/pubtypes extensions supporting gdb index.
+enum GDBIndexEntryKind {
+  GIEK_NONE,
+  GIEK_TYPE,
+  GIEK_VARIABLE,
+  GIEK_FUNCTION,
+  GIEK_OTHER,
+  GIEK_UNUSED5,
+  GIEK_UNUSED6,
+  GIEK_UNUSED7
+};
+
+const char *GDBIndexEntryKindString(GDBIndexEntryKind Kind);
+
+enum GDBIndexEntryLinkage {
+  GIEL_EXTERNAL,
+  GIEL_STATIC
+};
+
+const char *GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage);
+
+/// The gnu_pub* kind looks like:
+///
+/// 0-3  reserved
+/// 4-6  symbol kind
+/// 7    0 == global, 1 == static
+///
+/// A gdb_index descriptor includes the above kind, shifted 24 bits up with the
+/// offset of the cu within the debug_info section stored in those 24 bits.
+struct PubIndexEntryDescriptor {
+  GDBIndexEntryKind Kind;
+  GDBIndexEntryLinkage Linkage;
+  PubIndexEntryDescriptor(GDBIndexEntryKind Kind, GDBIndexEntryLinkage Linkage)
+      : Kind(Kind), Linkage(Linkage) {}
+  /* implicit */ PubIndexEntryDescriptor(GDBIndexEntryKind Kind)
+      : Kind(Kind), Linkage(GIEL_EXTERNAL) {}
+  explicit PubIndexEntryDescriptor(uint8_t Value)
+      : Kind(static_cast<GDBIndexEntryKind>((Value & KIND_MASK) >>
+                                            KIND_OFFSET)),
+        Linkage(static_cast<GDBIndexEntryLinkage>((Value & LINKAGE_MASK) >>
+                                                  LINKAGE_OFFSET)) {}
+  uint8_t toBits() { return Kind << KIND_OFFSET | Linkage << LINKAGE_OFFSET; }
+
+private:
+  enum {
+    KIND_OFFSET = 4,
+    KIND_MASK = 7 << KIND_OFFSET,
+    LINKAGE_OFFSET = 7,
+    LINKAGE_MASK = 1 << LINKAGE_OFFSET
+  };
+};
+
 } // End of namespace dwarf
 
 } // End of namespace llvm
diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h
index 54da31c..2868f35 100644
--- a/include/llvm/Support/ELF.h
+++ b/include/llvm/Support/ELF.h
@@ -20,6 +20,7 @@
 #ifndef LLVM_SUPPORT_ELF_H
 #define LLVM_SUPPORT_ELF_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
 #include <cstring>
 
@@ -650,7 +651,7 @@ enum {
 };
 
 // ARM Specific e_flags
-enum {
+enum LLVM_ENUM_INT_TYPE(unsigned) {
   EF_ARM_SOFT_FLOAT =     0x00000200U,
   EF_ARM_VFP_FLOAT =      0x00000400U,
   EF_ARM_EABI_UNKNOWN =   0x00000000U,
@@ -800,7 +801,7 @@ enum {
 };
 
 // Mips Specific e_flags
-enum {
+enum LLVM_ENUM_INT_TYPE(unsigned) {
   EF_MIPS_NOREORDER = 0x00000001, // Don't reorder instructions
   EF_MIPS_PIC       = 0x00000002, // Position independent code
   EF_MIPS_CPIC      = 0x00000004, // Call object with Position independent code
@@ -880,6 +881,19 @@ enum {
   R_MIPS_GLOB_DAT          = 51,
   R_MIPS_COPY              = 126,
   R_MIPS_JUMP_SLOT         = 127,
+  R_MICROMIPS_26_S1        = 133,
+  R_MICROMIPS_HI16         = 134,
+  R_MICROMIPS_LO16         = 135,
+  R_MICROMIPS_GOT16        = 138,
+  R_MICROMIPS_PC16_S1      = 141,
+  R_MICROMIPS_CALL16       = 142,
+  R_MICROMIPS_GOT_DISP     = 145,
+  R_MICROMIPS_GOT_PAGE     = 146,
+  R_MICROMIPS_GOT_OFST     = 147,
+  R_MICROMIPS_TLS_DTPREL_HI16 = 164,
+  R_MICROMIPS_TLS_DTPREL_LO16 = 165,
+  R_MICROMIPS_TLS_TPREL_HI16  = 169,
+  R_MICROMIPS_TLS_TPREL_LO16  = 170,
   R_MIPS_NUM               = 218
 };
 
@@ -1116,7 +1130,7 @@ enum {
 };
 
 // Section types.
-enum {
+enum LLVM_ENUM_INT_TYPE(unsigned) {
   SHT_NULL          = 0,  // No associated section (inactive entry).
   SHT_PROGBITS      = 1,  // Program-defined contents.
   SHT_SYMTAB        = 2,  // Symbol table.
@@ -1164,7 +1178,7 @@ enum {
 };
 
 // Section flags.
-enum {
+enum LLVM_ENUM_INT_TYPE(unsigned) {
   // Section data should be writable during execution.
   SHF_WRITE = 0x1,
 
@@ -1196,6 +1210,9 @@ enum {
   // This section holds Thread-Local Storage.
   SHF_TLS = 0x400U,
 
+  // This section is excluded from the final executable or shared library.
+  SHF_EXCLUDE = 0x80000000U,
+
   // Start of target-specific flags.
 
   /// XCORE_SHF_CP_SECTION - All sections with the "c" flag are grouped
@@ -1226,12 +1243,34 @@ enum {
   // for faster accesses
   SHF_HEX_GPREL = 0x10000000,
 
-  // Do not strip this section. FIXME: We need target specific SHF_ enums.
-  SHF_MIPS_NOSTRIP = 0x8000000
+  // Section contains text/data which may be replicated in other sections.
+  // Linker must retain only one copy.
+  SHF_MIPS_NODUPES = 0x01000000,
+
+  // Linker must generate implicit hidden weak names.
+  SHF_MIPS_NAMES   = 0x02000000,
+
+  // Section data local to process.
+  SHF_MIPS_LOCAL   = 0x04000000,
+
+  // Do not strip this section.
+  SHF_MIPS_NOSTRIP = 0x08000000,
+
+  // Section must be part of global data area.
+  SHF_MIPS_GPREL   = 0x10000000,
+
+  // This section should be merged.
+  SHF_MIPS_MERGE   = 0x20000000,
+
+  // Address size to be inferred from section entry size.
+  SHF_MIPS_ADDR    = 0x40000000,
+
+  // Section data is string data by default.
+  SHF_MIPS_STRING  = 0x80000000
 };
 
 // Section Group Flags
-enum {
+enum LLVM_ENUM_INT_TYPE(unsigned) {
   GRP_COMDAT = 0x1,
   GRP_MASKOS = 0x0ff00000,
   GRP_MASKPROC = 0xf0000000
@@ -1444,11 +1483,16 @@ enum {
   PT_ARM_ARCHEXT = 0x70000000, // Platform architecture compatibility info
   // These all contain stack unwind tables.
   PT_ARM_EXIDX   = 0x70000001,
-  PT_ARM_UNWIND  = 0x70000001
+  PT_ARM_UNWIND  = 0x70000001,
+
+  // MIPS program header types.
+  PT_MIPS_REGINFO  = 0x70000000,  // Register usage information.
+  PT_MIPS_RTPROC   = 0x70000001,  // Runtime procedure table.
+  PT_MIPS_OPTIONS  = 0x70000002   // Options segment.
 };
 
 // Segment flag bits.
-enum {
+enum LLVM_ENUM_INT_TYPE(unsigned) {
   PF_X        = 1,         // Execute
   PF_W        = 2,         // Write
   PF_R        = 4,         // Read
@@ -1526,6 +1570,7 @@ enum {
   DT_RELCOUNT     = 0x6FFFFFFA, // ELF32_Rel count.
 
   DT_FLAGS_1      = 0X6FFFFFFB, // Flags_1.
+  DT_VERSYM       = 0x6FFFFFF0, // The address of .gnu.version section.
   DT_VERDEF       = 0X6FFFFFFC, // The address of the version definition table.
   DT_VERDEFNUM    = 0X6FFFFFFD, // The number of entries in DT_VERDEF.
   DT_VERNEED      = 0X6FFFFFFE, // The address of the version Dependency table.
diff --git a/include/llvm/Support/ErrorOr.h b/include/llvm/Support/ErrorOr.h
index f3ac305..d5b11cb 100644
--- a/include/llvm/Support/ErrorOr.h
+++ b/include/llvm/Support/ErrorOr.h
@@ -27,38 +27,6 @@
 #endif
 
 namespace llvm {
-struct ErrorHolderBase {
-  error_code Error;
-  uint16_t RefCount;
-  bool HasUserData;
-
-  ErrorHolderBase() : RefCount(1) {}
-
-  void aquire() {
-    ++RefCount;
-  }
-
-  void release() {
-    if (--RefCount == 0)
-      delete this;
-  }
-
-protected:
-  virtual ~ErrorHolderBase() {}
-};
-
-template<class T>
-struct ErrorHolder : ErrorHolderBase {
-#if LLVM_HAS_RVALUE_REFERENCES
-  ErrorHolder(T &&UD) : UserData(llvm_move(UD)) {}
-#else
-  ErrorHolder(T &UD) : UserData(UD) {}
-#endif
-  T UserData;
-};
-
-template<class Tp> struct ErrorOrUserDataTraits : llvm::false_type {};
-
 #if LLVM_HAS_CXX11_TYPETRAITS && LLVM_HAS_RVALUE_REFERENCES
 template<class T, class V>
 typename std::enable_if< std::is_constructible<T, V>::value
@@ -111,44 +79,6 @@ public:
 ///   buffer->write("adena");
 /// \endcode
 ///
-/// ErrorOr<T> also supports user defined data for specific error_codes. To use
-/// this feature you must first add a template specialization of
-/// ErrorOrUserDataTraits derived from std::true_type for your type in the lld
-/// namespace. This specialization must have a static error_code error()
-/// function that returns the error_code this data is used with.
-///
-/// getError<UserData>() may be called to get either the stored user data, or
-/// a default constructed UserData if none was stored.
-///
-/// Example:
-/// \code
-///   struct InvalidArgError {
-///     InvalidArgError() {}
-///     InvalidArgError(std::string S) : ArgName(S) {}
-///     std::string ArgName;
-///   };
-///
-///   namespace llvm {
-///   template<>
-///   struct ErrorOrUserDataTraits<InvalidArgError> : std::true_type {
-///     static error_code error() {
-///       return make_error_code(errc::invalid_argument);
-///     }
-///   };
-///   } // end namespace llvm
-///
-///   using namespace llvm;
-///
-///   ErrorOr<int> foo() {
-///     return InvalidArgError("adena");
-///   }
-///
-///   int main() {
-///     auto a = foo();
-///     if (!a && error_code(a) == errc::invalid_argument)
-///       llvm::errs() << a.getError<InvalidArgError>().ArgName << "\n";
-///   }
-/// \endcode
 ///
 /// An implicit conversion to bool provides a way to check if there was an
 /// error. The unary * and -> operators provide pointer like access to the
@@ -178,43 +108,28 @@ private:
   typedef typename remove_reference<T>::type *pointer;
 
 public:
-  ErrorOr() : IsValid(false) {}
-
   template <class E>
   ErrorOr(E ErrorCode, typename enable_if_c<is_error_code_enum<E>::value ||
                                             is_error_condition_enum<E>::value,
                                             void *>::type = 0)
-      : HasError(true), IsValid(true) {
-    Error = new ErrorHolderBase;
-    Error->Error = make_error_code(ErrorCode);
-    Error->HasUserData = false;
-  }
-
-  ErrorOr(llvm::error_code EC) : HasError(true), IsValid(true) {
-    Error = new ErrorHolderBase;
-    Error->Error = EC;
-    Error->HasUserData = false;
+      : HasError(true) {
+    new (getError()) error_code(make_error_code(ErrorCode));
   }
 
-  template<class UserDataT>
-  ErrorOr(UserDataT UD, typename
-          enable_if_c<ErrorOrUserDataTraits<UserDataT>::value>::type* = 0)
-    : HasError(true), IsValid(true) {
-    Error = new ErrorHolder<UserDataT>(llvm_move(UD));
-    Error->Error = ErrorOrUserDataTraits<UserDataT>::error();
-    Error->HasUserData = true;
+  ErrorOr(llvm::error_code EC) : HasError(true) {
+    new (getError()) error_code(EC);
   }
 
-  ErrorOr(T Val) : HasError(false), IsValid(true) {
+  ErrorOr(T Val) : HasError(false) {
     new (get()) storage_type(moveIfMoveConstructible<storage_type>(Val));
   }
 
-  ErrorOr(const ErrorOr &Other) : IsValid(false) {
+  ErrorOr(const ErrorOr &Other) {
     copyConstruct(Other);
   }
 
   template <class OtherT>
-  ErrorOr(const ErrorOr<OtherT> &Other) : IsValid(false) {
+  ErrorOr(const ErrorOr<OtherT> &Other) {
     copyConstruct(Other);
   }
 
@@ -230,12 +145,12 @@ public:
   }
 
 #if LLVM_HAS_RVALUE_REFERENCES
-  ErrorOr(ErrorOr &&Other) : IsValid(false) {
+  ErrorOr(ErrorOr &&Other) {
     moveConstruct(std::move(Other));
   }
 
   template <class OtherT>
-  ErrorOr(ErrorOr<OtherT> &&Other) : IsValid(false) {
+  ErrorOr(ErrorOr<OtherT> &&Other) {
     moveConstruct(std::move(Other));
   }
 
@@ -252,37 +167,20 @@ public:
 #endif
 
   ~ErrorOr() {
-    if (!IsValid)
-      return;
-    if (HasError)
-      Error->release();
-    else
+    if (!HasError)
       get()->~storage_type();
   }
 
-  template<class ET>
-  ET getError() const {
-    assert(IsValid && "Cannot get the error of a default constructed ErrorOr!");
-    assert(HasError && "Cannot get an error if none exists!");
-    assert(ErrorOrUserDataTraits<ET>::error() == Error->Error &&
-           "Incorrect user error data type for error!");
-    if (!Error->HasUserData)
-      return ET();
-    return reinterpret_cast<const ErrorHolder<ET>*>(Error)->UserData;
-  }
-
   typedef void (*unspecified_bool_type)();
   static void unspecified_bool_true() {}
 
   /// \brief Return false if there is an error.
   operator unspecified_bool_type() const {
-    assert(IsValid && "Can't do anything on a default constructed ErrorOr!");
     return HasError ? 0 : unspecified_bool_true;
   }
 
   operator llvm::error_code() const {
-    assert(IsValid && "Can't do anything on a default constructed ErrorOr!");
-    return HasError ? Error->Error : llvm::error_code::success();
+    return HasError ? *getError() : llvm::error_code::success();
   }
 
   pointer operator ->() {
@@ -296,19 +194,14 @@ public:
 private:
   template <class OtherT>
   void copyConstruct(const ErrorOr<OtherT> &Other) {
-    // Construct an invalid ErrorOr if other is invalid.
-    if (!Other.IsValid)
-      return;
-    IsValid = true;
     if (!Other.HasError) {
       // Get the other value.
       HasError = false;
       new (get()) storage_type(*Other.get());
     } else {
       // Get other's error.
-      Error = Other.Error;
       HasError = true;
-      Error->aquire();
+      new (getError()) error_code(Other);
     }
   }
 
@@ -334,22 +227,14 @@ private:
 #if LLVM_HAS_RVALUE_REFERENCES
   template <class OtherT>
   void moveConstruct(ErrorOr<OtherT> &&Other) {
-    // Construct an invalid ErrorOr if other is invalid.
-    if (!Other.IsValid)
-      return;
-    IsValid = true;
     if (!Other.HasError) {
       // Get the other value.
       HasError = false;
       new (get()) storage_type(std::move(*Other.get()));
-      // Tell other not to do any destruction.
-      Other.IsValid = false;
     } else {
       // Get other's error.
-      Error = Other.Error;
       HasError = true;
-      // Tell other not to do any destruction.
-      Other.IsValid = false;
+      new (getError()) error_code(Other);
     }
   }
 
@@ -372,135 +257,30 @@ private:
   }
 
   storage_type *get() {
-    assert(IsValid && "Can't do anything on a default constructed ErrorOr!");
     assert(!HasError && "Cannot get value when an error exists!");
     return reinterpret_cast<storage_type*>(TStorage.buffer);
   }
 
   const storage_type *get() const {
-    assert(IsValid && "Can't do anything on a default constructed ErrorOr!");
     assert(!HasError && "Cannot get value when an error exists!");
     return reinterpret_cast<const storage_type*>(TStorage.buffer);
   }
 
-  union {
-    AlignedCharArrayUnion<storage_type> TStorage;
-    ErrorHolderBase *Error;
-  };
-  bool HasError : 1;
-  bool IsValid : 1;
-};
-
-// ErrorOr specialization for void.
-template <>
-class ErrorOr<void> {
-public:
-  ErrorOr() : Error(0, 0) {}
-
-  template <class E>
-  ErrorOr(E ErrorCode, typename enable_if_c<is_error_code_enum<E>::value ||
-                                            is_error_condition_enum<E>::value,
-                                            void *> ::type = 0)
-      : Error(0, 0) {
-    error_code EC = make_error_code(ErrorCode);
-    if (EC == errc::success) {
-      Error.setInt(1);
-      return;
-    }
-    ErrorHolderBase *EHB = new ErrorHolderBase;
-    EHB->Error = EC;
-    EHB->HasUserData = false;
-    Error.setPointer(EHB);
-  }
-
-  ErrorOr(llvm::error_code EC) : Error(0, 0) {
-    if (EC == errc::success) {
-      Error.setInt(1);
-      return;
-    }
-    ErrorHolderBase *E = new ErrorHolderBase;
-    E->Error = EC;
-    E->HasUserData = false;
-    Error.setPointer(E);
-  }
-
-  template<class UserDataT>
-  ErrorOr(UserDataT UD, typename
-          enable_if_c<ErrorOrUserDataTraits<UserDataT>::value>::type* = 0)
-      : Error(0, 0) {
-    ErrorHolderBase *E = new ErrorHolder<UserDataT>(llvm_move(UD));
-    E->Error = ErrorOrUserDataTraits<UserDataT>::error();
-    E->HasUserData = true;
-    Error.setPointer(E);
+  error_code *getError() {
+    assert(HasError && "Cannot get error when a value exists!");
+    return reinterpret_cast<error_code*>(ErrorStorage.buffer);
   }
 
-  ErrorOr(const ErrorOr &Other) : Error(0, 0) {
-    Error = Other.Error;
-    if (Other.Error.getPointer()->Error) {
-      Error.getPointer()->aquire();
-    }
+  const error_code *getError() const {
+    return const_cast<ErrorOr<T> *>(this)->getError();
   }
 
-  ErrorOr &operator =(const ErrorOr &Other) {
-    if (this == &Other)
-      return *this;
-
-    this->~ErrorOr();
-    new (this) ErrorOr(Other);
 
-    return *this;
-  }
-
-#if LLVM_HAS_RVALUE_REFERENCES
-  ErrorOr(ErrorOr &&Other) : Error(0) {
-    // Get other's error.
-    Error = Other.Error;
-    // Tell other not to do any destruction.
-    Other.Error.setPointer(0);
-  }
-
-  ErrorOr &operator =(ErrorOr &&Other) {
-    if (this == &Other)
-      return *this;
-
-    this->~ErrorOr();
-    new (this) ErrorOr(std::move(Other));
-
-    return *this;
-  }
-#endif
-
-  ~ErrorOr() {
-    if (Error.getPointer())
-      Error.getPointer()->release();
-  }
-
-  template<class ET>
-  ET getError() const {
-    assert(ErrorOrUserDataTraits<ET>::error() == *this &&
-           "Incorrect user error data type for error!");
-    if (!Error.getPointer()->HasUserData)
-      return ET();
-    return reinterpret_cast<const ErrorHolder<ET> *>(
-        Error.getPointer())->UserData;
-  }
-
-  typedef void (*unspecified_bool_type)();
-  static void unspecified_bool_true() {}
-
-  /// \brief Return false if there is an error.
-  operator unspecified_bool_type() const {
-    return Error.getInt() ? unspecified_bool_true : 0;
-  }
-
-  operator llvm::error_code() const {
-    return Error.getInt() ? make_error_code(errc::success)
-                          : Error.getPointer()->Error;
-  }
-
-private:
-  // If the bit is 1, the error is success.
-  llvm::PointerIntPair<ErrorHolderBase *, 1> Error;
+  union {
+    AlignedCharArrayUnion<storage_type> TStorage;
+    AlignedCharArrayUnion<error_code> ErrorStorage;
+  };
+  bool HasError : 1;
 };
 
 template<class T, class E>
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index c130b47..d301f84 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -238,7 +238,9 @@ struct file_magic {
     macho_dsym_companion,     ///< Mach-O dSYM companion file
     macho_universal_binary,   ///< Mach-O universal binary
     coff_object,              ///< COFF object file
-    pecoff_executable         ///< PECOFF executable file
+    coff_import_library,      ///< COFF import library
+    pecoff_executable,        ///< PECOFF executable file
+    windows_resource          ///< Windows compiled resource file (.rc)
   };
 
   bool is_object() const {
@@ -638,17 +640,6 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD, OpenFlags Flags,
 
 error_code openFileForRead(const Twine &Name, int &ResultFD);
 
-/// @brief Canonicalize path.
-///
-/// Sets result to the file system's idea of what path is. The result is always
-/// absolute and has the same capitalization as the file system.
-///
-/// @param path Input path.
-/// @param result Set to the canonicalized version of \a path.
-/// @returns errc::success if result has been successfully set, otherwise a
-///          platform specific error_code.
-error_code canonicalize(const Twine &path, SmallVectorImpl<char> &result);
-
 /// @brief Are \a path's first bytes \a magic?
 ///
 /// @param path Input path.
@@ -731,7 +722,7 @@ public:
   ///               should begin. Must be a multiple of
   ///               mapped_file_region::alignment().
   /// \param ec This is set to errc::success if the map was constructed
-  ///           sucessfully. Otherwise it is set to a platform dependent error.
+  ///           successfully. Otherwise it is set to a platform dependent error.
   mapped_file_region(const Twine &path,
                      mapmode mode,
                      uint64_t length,
@@ -869,7 +860,7 @@ public:
   }
 
   /// Construct end iterator.
-  directory_iterator() : State(new detail::DirIterState) {}
+  directory_iterator() : State(0) {}
 
   // No operator++ because we need error_code.
   directory_iterator &increment(error_code &ec) {
@@ -881,6 +872,12 @@ public:
   const directory_entry *operator->() const { return &State->CurrentEntry; }
 
   bool operator==(const directory_iterator &RHS) const {
+    if (State == RHS.State)
+      return true;
+    if (RHS.State == 0)
+      return State->CurrentEntry == directory_entry();
+    if (State == 0)
+      return RHS.State->CurrentEntry == directory_entry();
     return State->CurrentEntry == RHS.State->CurrentEntry;
   }
 
@@ -920,7 +917,7 @@ public:
   }
   // No operator++ because we need error_code.
   recursive_directory_iterator &increment(error_code &ec) {
-    static const directory_iterator end_itr;
+    const directory_iterator end_itr;
 
     if (State->HasNoPushRequest)
       State->HasNoPushRequest = false;
@@ -964,10 +961,10 @@ public:
   // modifiers
   /// Goes up one level if Level > 0.
   void pop() {
-    assert(State && "Cannot pop and end itertor!");
+    assert(State && "Cannot pop an end iterator!");
     assert(State->Level > 0 && "Cannot pop an iterator with level < 1");
 
-    static const directory_iterator end_itr;
+    const directory_iterator end_itr;
     error_code ec;
     do {
       if (ec)
diff --git a/include/llvm/Support/FormattedStream.h b/include/llvm/Support/FormattedStream.h
index 18ee048..df1f218 100644
--- a/include/llvm/Support/FormattedStream.h
+++ b/include/llvm/Support/FormattedStream.h
@@ -129,6 +129,27 @@ public:
 
   /// getLine - Return the line number
   unsigned getLine() { return Position.second; }
+  
+  raw_ostream &resetColor() {
+    TheStream->resetColor();
+    return *this;
+  }
+  
+  raw_ostream &reverseColor() {
+    TheStream->reverseColor();
+    return *this;
+  }
+  
+  raw_ostream &changeColor(enum Colors Color,
+                           bool Bold,
+                           bool BG) {
+    TheStream->changeColor(Color, Bold, BG);
+    return *this;
+  }
+  
+  bool is_displayed() const {
+    return TheStream->is_displayed();
+  }
 
 private:
   void releaseStream() {
diff --git a/include/llvm/Support/GCOV.h b/include/llvm/Support/GCOV.h
index f1040f5..0aa716a 100644
--- a/include/llvm/Support/GCOV.h
+++ b/include/llvm/Support/GCOV.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_SUPPORT_GCOV_H
 #define LLVM_SUPPORT_GCOV_H
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -24,7 +25,6 @@ namespace llvm {
 
 class GCOVFunction;
 class GCOVBlock;
-class GCOVLines;
 class FileInfo;
 
 namespace GCOV {
@@ -125,30 +125,65 @@ public:
     return true;
   }
 
-  uint32_t readInt() {
-    uint32_t Result;
+  /// readObjectTag - If cursor points to an object summary tag then increment
+  /// the cursor and return true otherwise return false.
+  bool readObjectTag() {
+    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
+    if (Tag.empty() ||
+        Tag[0] != '\0' || Tag[1] != '\0' ||
+        Tag[2] != '\0' || Tag[3] != '\xa1') {
+      return false;
+    }
+    Cursor += 4;
+    return true;
+  }
+
+  /// readProgramTag - If cursor points to a program summary tag then increment
+  /// the cursor and return true otherwise return false.
+  bool readProgramTag() {
+    StringRef Tag = Buffer->getBuffer().slice(Cursor, Cursor+4);
+    if (Tag.empty() ||
+        Tag[0] != '\0' || Tag[1] != '\0' ||
+        Tag[2] != '\0' || Tag[3] != '\xa3') {
+      return false;
+    }
+    Cursor += 4;
+    return true;
+  }
+
+  bool readInt(uint32_t &Val) {
+    if (Buffer->getBuffer().size() < Cursor+4) {
+      errs() << "Unexpected end of memory buffer: " << Cursor+4 << ".\n";
+      return false;
+    }
     StringRef Str = Buffer->getBuffer().slice(Cursor, Cursor+4);
-    assert (Str.empty() == false && "Unexpected memory buffer end!");
     Cursor += 4;
-    Result = *(const uint32_t *)(Str.data());
-    return Result;
+    Val = *(const uint32_t *)(Str.data());
+    return true;
   }
 
-  uint64_t readInt64() {
-    uint64_t Lo = readInt();
-    uint64_t Hi = readInt();
-    uint64_t Result = Lo | (Hi << 32);
-    return Result;
+  bool readInt64(uint64_t &Val) {
+    uint32_t Lo, Hi;
+    if (!readInt(Lo) || !readInt(Hi)) return false;
+    Val = ((uint64_t)Hi << 32) | Lo;
+    return true;
   }
 
-  StringRef readString() {
-    uint32_t Len = readInt() * 4;
-    StringRef Str = Buffer->getBuffer().slice(Cursor, Cursor+Len);
+  bool readString(StringRef &Str) {
+    uint32_t Len;
+    if (!readInt(Len)) return false;
+    Len *= 4;
+    if (Buffer->getBuffer().size() < Cursor+Len) {
+      errs() << "Unexpected end of memory buffer: " << Cursor+Len << ".\n";
+      return false;
+    }
+    Str = Buffer->getBuffer().slice(Cursor, Cursor+Len).split('\0').first;
     Cursor += Len;
-    return Str;
+    return true;
   }
 
   uint64_t getCursor() const { return Cursor; }
+  void advanceCursor(uint32_t n) { Cursor += n*4; }
 private:
   MemoryBuffer *Buffer;
   uint64_t Cursor;
@@ -158,13 +193,15 @@ private:
 /// (.gcno and .gcda).
 class GCOVFile {
 public:
-  GCOVFile() {}
+  GCOVFile() : Functions(), RunCount(0), ProgramCount(0) {}
   ~GCOVFile();
   bool read(GCOVBuffer &Buffer);
   void dump();
   void collectLineCounts(FileInfo &FI);
 private:
   SmallVector<GCOVFunction *, 16> Functions;
+  uint32_t RunCount;
+  uint32_t ProgramCount;
 };
 
 /// GCOVFunction - Collects function information.
@@ -173,6 +210,7 @@ public:
   GCOVFunction() : Ident(0), LineNumber(0) {}
   ~GCOVFunction();
   bool read(GCOVBuffer &Buffer, GCOV::GCOVFormat Format);
+  StringRef getFilename() const { return Filename; }
   void dump();
   void collectLineCounts(FileInfo &FI);
 private:
@@ -186,39 +224,36 @@ private:
 /// GCOVBlock - Collects block information.
 class GCOVBlock {
 public:
-  GCOVBlock(uint32_t N) : Number(N), Counter(0) {}
+  GCOVBlock(GCOVFunction &P, uint32_t N) :
+    Parent(P), Number(N), Counter(0), Edges(), Lines() {}
   ~GCOVBlock();
   void addEdge(uint32_t N) { Edges.push_back(N); }
-  void addLine(StringRef Filename, uint32_t LineNo);
-  void addCount(uint64_t N) { Counter = N; }
+  void addLine(uint32_t N) { Lines.push_back(N); }
+  void addCount(uint64_t N) { Counter += N; }
+  size_t getNumEdges() { return Edges.size(); }
   void dump();
   void collectLineCounts(FileInfo &FI);
 private:
+  GCOVFunction &Parent;
   uint32_t Number;
   uint64_t Counter;
   SmallVector<uint32_t, 16> Edges;
-  StringMap<GCOVLines *> Lines;
+  SmallVector<uint32_t, 16> Lines;
 };
 
-/// GCOVLines - A wrapper around a vector of int to keep track of line nos.
-class GCOVLines {
-public:
-  ~GCOVLines() { Lines.clear(); }
-  void add(uint32_t N) { Lines.push_back(N); }
-  void collectLineCounts(FileInfo &FI, StringRef Filename, uint32_t Count);
-  void dump();
-
-private:
-  SmallVector<uint32_t, 4> Lines;
-};
-
-typedef SmallVector<uint32_t, 16> LineCounts;
+typedef DenseMap<uint32_t, uint64_t> LineCounts;
 class FileInfo {
 public:
-  void addLineCount(StringRef Filename, uint32_t Line, uint32_t Count);
-  void print();
+  void addLineCount(StringRef Filename, uint32_t Line, uint64_t Count) {
+    LineInfo[Filename][Line-1] += Count;
+  }
+  void setRunCount(uint32_t Runs) { RunCount = Runs; }
+  void setProgramCount(uint32_t Programs) { ProgramCount = Programs; }
+  void print(raw_fd_ostream &OS, StringRef gcnoFile, StringRef gcdaFile);
 private:
   StringMap<LineCounts> LineInfo;
+  uint32_t RunCount;
+  uint32_t ProgramCount;
 };
 
 }
diff --git a/include/llvm/Support/IntegersSubset.h b/include/llvm/Support/IntegersSubset.h
deleted file mode 100644
index 64b79ee..0000000
--- a/include/llvm/Support/IntegersSubset.h
+++ /dev/null
@@ -1,540 +0,0 @@
-//===-- llvm/IntegersSubset.h - The subset of integers ----------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// @file
-/// This file contains class that implements constant set of ranges:
-/// [<Low0,High0>,...,<LowN,HighN>]. Initially, this class was created for
-/// SwitchInst and was used for case value representation that may contain
-/// multiple ranges for a single successor.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SUPPORT_INTEGERSSUBSET_H
-#define LLVM_SUPPORT_INTEGERSSUBSET_H
-
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/LLVMContext.h"
-#include <list>
-
-namespace llvm {
-
-  // The IntItem is a wrapper for APInt.
-  // 1. It determines sign of integer, it allows to use
-  //    comparison operators >,<,>=,<=, and as result we got shorter and cleaner
-  //    constructions.
-  // 2. It helps to implement PR1255 (case ranges) as a series of small patches.
-  // 3. Currently we can interpret IntItem both as ConstantInt and as APInt.
-  //    It allows to provide SwitchInst methods that works with ConstantInt for
-  //    non-updated passes. And it allows to use APInt interface for new methods.
-  // 4. IntItem can be easily replaced with APInt.
-
-  // The set of macros that allows to propagate APInt operators to the IntItem.
-
-#define INT_ITEM_DEFINE_COMPARISON(op,func) \
-  bool operator op (const APInt& RHS) const { \
-    return getAPIntValue().func(RHS); \
-  }
-
-#define INT_ITEM_DEFINE_UNARY_OP(op) \
-  IntItem operator op () const { \
-    APInt res = op(getAPIntValue()); \
-    Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res); \
-    return IntItem(cast<ConstantInt>(NewVal)); \
-  }
-
-#define INT_ITEM_DEFINE_BINARY_OP(op) \
-  IntItem operator op (const APInt& RHS) const { \
-    APInt res = getAPIntValue() op RHS; \
-    Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res); \
-    return IntItem(cast<ConstantInt>(NewVal)); \
-  }
-
-#define INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(op) \
-  IntItem& operator op (const APInt& RHS) {\
-    APInt res = getAPIntValue();\
-    res op RHS; \
-    Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res); \
-    ConstantIntVal = cast<ConstantInt>(NewVal); \
-    return *this; \
-  }
-
-#define INT_ITEM_DEFINE_PREINCDEC(op) \
-    IntItem& operator op () { \
-      APInt res = getAPIntValue(); \
-      op(res); \
-      Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res); \
-      ConstantIntVal = cast<ConstantInt>(NewVal); \
-      return *this; \
-    }
-
-#define INT_ITEM_DEFINE_POSTINCDEC(op) \
-    IntItem& operator op (int) { \
-      APInt res = getAPIntValue();\
-      op(res); \
-      Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res); \
-      OldConstantIntVal = ConstantIntVal; \
-      ConstantIntVal = cast<ConstantInt>(NewVal); \
-      return IntItem(OldConstantIntVal); \
-    }
-
-#define INT_ITEM_DEFINE_OP_STANDARD_INT(RetTy, op, IntTy) \
-  RetTy operator op (IntTy RHS) const { \
-    return (*this) op APInt(getAPIntValue().getBitWidth(), RHS); \
-  }
-
-class IntItem {
-  ConstantInt *ConstantIntVal;
-  const APInt* APIntVal;
-  IntItem(const ConstantInt *V) :
-    ConstantIntVal(const_cast<ConstantInt*>(V)),
-    APIntVal(&ConstantIntVal->getValue()){}
-  const APInt& getAPIntValue() const {
-    return *APIntVal;
-  }
-public:
-
-  IntItem() {}
-
-  operator const APInt&() const {
-    return getAPIntValue();
-  }
-
-  // Propagate APInt operators.
-  // Note, that
-  // /,/=,>>,>>= are not implemented in APInt.
-  // <<= is implemented for unsigned RHS, but not implemented for APInt RHS.
-
-  INT_ITEM_DEFINE_COMPARISON(<, ult)
-  INT_ITEM_DEFINE_COMPARISON(>, ugt)
-  INT_ITEM_DEFINE_COMPARISON(<=, ule)
-  INT_ITEM_DEFINE_COMPARISON(>=, uge)
-
-  INT_ITEM_DEFINE_COMPARISON(==, eq)
-  INT_ITEM_DEFINE_OP_STANDARD_INT(bool,==,uint64_t)
-
-  INT_ITEM_DEFINE_COMPARISON(!=, ne)
-  INT_ITEM_DEFINE_OP_STANDARD_INT(bool,!=,uint64_t)
-
-  INT_ITEM_DEFINE_BINARY_OP(*)
-  INT_ITEM_DEFINE_BINARY_OP(+)
-  INT_ITEM_DEFINE_OP_STANDARD_INT(IntItem,+,uint64_t)
-  INT_ITEM_DEFINE_BINARY_OP(-)
-  INT_ITEM_DEFINE_OP_STANDARD_INT(IntItem,-,uint64_t)
-  INT_ITEM_DEFINE_BINARY_OP(<<)
-  INT_ITEM_DEFINE_OP_STANDARD_INT(IntItem,<<,unsigned)
-  INT_ITEM_DEFINE_BINARY_OP(&)
-  INT_ITEM_DEFINE_BINARY_OP(^)
-  INT_ITEM_DEFINE_BINARY_OP(|)
-
-  INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(*=)
-  INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(+=)
-  INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(-=)
-  INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(&=)
-  INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(^=)
-  INT_ITEM_DEFINE_ASSIGNMENT_BY_OP(|=)
-
-  // Special case for <<=
-  IntItem& operator <<= (unsigned RHS) {
-    APInt res = getAPIntValue();
-    res <<= RHS;
-    Constant *NewVal = ConstantInt::get(ConstantIntVal->getContext(), res);
-    ConstantIntVal = cast<ConstantInt>(NewVal);
-    return *this;
-  }
-
-  INT_ITEM_DEFINE_UNARY_OP(-)
-  INT_ITEM_DEFINE_UNARY_OP(~)
-
-  INT_ITEM_DEFINE_PREINCDEC(++)
-  INT_ITEM_DEFINE_PREINCDEC(--)
-
-  // The set of workarounds, since currently we use ConstantInt implemented
-  // integer.
-
-  static IntItem fromConstantInt(const ConstantInt *V) {
-    return IntItem(V);
-  }
-  static IntItem fromType(Type* Ty, const APInt& V) {
-    ConstantInt *C = cast<ConstantInt>(ConstantInt::get(Ty, V));
-    return fromConstantInt(C);
-  }
-  static IntItem withImplLikeThis(const IntItem& LikeThis, const APInt& V) {
-    ConstantInt *C = cast<ConstantInt>(ConstantInt::get(
-        LikeThis.ConstantIntVal->getContext(), V));
-    return fromConstantInt(C);
-  }
-  ConstantInt *toConstantInt() const {
-    return ConstantIntVal;
-  }
-};
-
-template<class IntType>
-class IntRange {
-protected:
-    IntType Low;
-    IntType High;
-    bool IsEmpty : 1;
-    bool IsSingleNumber : 1;
-
-public:
-    typedef IntRange<IntType> self;
-    typedef std::pair<self, self> SubRes;
-
-    IntRange() : IsEmpty(true) {}
-    IntRange(const self &RHS) :
-      Low(RHS.Low), High(RHS.High),
-      IsEmpty(RHS.IsEmpty), IsSingleNumber(RHS.IsSingleNumber) {}
-    IntRange(const IntType &C) :
-      Low(C), High(C), IsEmpty(false), IsSingleNumber(true) {}
-
-    IntRange(const IntType &L, const IntType &H) : Low(L), High(H),
-      IsEmpty(false), IsSingleNumber(Low == High) {}
-
-    bool isEmpty() const { return IsEmpty; }
-    bool isSingleNumber() const { return IsSingleNumber; }
-
-    const IntType& getLow() const {
-      assert(!IsEmpty && "Range is empty.");
-      return Low;
-    }
-    const IntType& getHigh() const {
-      assert(!IsEmpty && "Range is empty.");
-      return High;
-    }
-
-    bool operator<(const self &RHS) const {
-      assert(!IsEmpty && "Left range is empty.");
-      assert(!RHS.IsEmpty && "Right range is empty.");
-      if (Low == RHS.Low) {
-        if (High > RHS.High)
-          return true;
-        return false;
-      }
-      if (Low < RHS.Low)
-        return true;
-      return false;
-    }
-
-    bool operator==(const self &RHS) const {
-      assert(!IsEmpty && "Left range is empty.");
-      assert(!RHS.IsEmpty && "Right range is empty.");
-      return Low == RHS.Low && High == RHS.High;
-    }
-
-    bool operator!=(const self &RHS) const {
-      return !operator ==(RHS);
-    }
-
-    static bool LessBySize(const self &LHS, const self &RHS) {
-      return (LHS.High - LHS.Low) < (RHS.High - RHS.Low);
-    }
-
-    bool isInRange(const IntType &IntVal) const {
-      assert(!IsEmpty && "Range is empty.");
-      return IntVal >= Low && IntVal <= High;
-    }
-
-    SubRes sub(const self &RHS) const {
-      SubRes Res;
-
-      // RHS is either more global and includes this range or
-      // if it doesn't intersected with this range.
-      if (!isInRange(RHS.Low) && !isInRange(RHS.High)) {
-
-        // If RHS more global (it is enough to check
-        // only one border in this case.
-        if (RHS.isInRange(Low))
-          return std::make_pair(self(Low, High), self());
-
-        return Res;
-      }
-
-      if (Low < RHS.Low) {
-        Res.first.Low = Low;
-        IntType NewHigh = RHS.Low;
-        --NewHigh;
-        Res.first.High = NewHigh;
-      }
-      if (High > RHS.High) {
-        IntType NewLow = RHS.High;
-        ++NewLow;
-        Res.second.Low = NewLow;
-        Res.second.High = High;
-      }
-      return Res;
-    }
-  };
-
-//===----------------------------------------------------------------------===//
-/// IntegersSubsetGeneric - class that implements the subset of integers. It
-/// consists from ranges and single numbers.
-template <class IntTy>
-class IntegersSubsetGeneric {
-public:
-  // Use Chris Lattner idea, that was initially described here:
-  // http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20120213/136954.html
-  // In short, for more compact memory consumption we can store flat
-  // numbers collection, and define range as pair of indices.
-  // In that case we can safe some memory on 32 bit machines.
-  typedef std::vector<IntTy> FlatCollectionTy;
-  typedef std::pair<IntTy*, IntTy*> RangeLinkTy;
-  typedef std::vector<RangeLinkTy> RangeLinksTy;
-  typedef typename RangeLinksTy::const_iterator RangeLinksConstIt;
-
-  typedef IntegersSubsetGeneric<IntTy> self;
-
-protected:
-
-  FlatCollectionTy FlatCollection;
-  RangeLinksTy RangeLinks;
-
-  bool IsSingleNumber;
-  bool IsSingleNumbersOnly;
-
-public:
-
-  template<class RangesCollectionTy>
-  explicit IntegersSubsetGeneric(const RangesCollectionTy& Links) {
-    assert(Links.size() && "Empty ranges are not allowed.");
-
-    // In case of big set of single numbers consumes additional RAM space,
-    // but allows to avoid additional reallocation.
-    FlatCollection.reserve(Links.size() * 2);
-    RangeLinks.reserve(Links.size());
-    IsSingleNumbersOnly = true;
-    for (typename RangesCollectionTy::const_iterator i = Links.begin(),
-         e = Links.end(); i != e; ++i) {
-      RangeLinkTy RangeLink;
-      FlatCollection.push_back(i->getLow());
-      RangeLink.first = &FlatCollection.back();
-      if (i->getLow() != i->getHigh()) {
-        FlatCollection.push_back(i->getHigh());
-        IsSingleNumbersOnly = false;
-      }
-      RangeLink.second = &FlatCollection.back();
-      RangeLinks.push_back(RangeLink);
-    }
-    IsSingleNumber = IsSingleNumbersOnly && RangeLinks.size() == 1;
-  }
-
-  IntegersSubsetGeneric(const self& RHS) {
-    *this = RHS;
-  }
-
-  self& operator=(const self& RHS) {
-    FlatCollection.clear();
-    RangeLinks.clear();
-    FlatCollection.reserve(RHS.RangeLinks.size() * 2);
-    RangeLinks.reserve(RHS.RangeLinks.size());
-    for (RangeLinksConstIt i = RHS.RangeLinks.begin(), e = RHS.RangeLinks.end();
-         i != e; ++i) {
-      RangeLinkTy RangeLink;
-      FlatCollection.push_back(*(i->first));
-      RangeLink.first = &FlatCollection.back();
-      if (i->first != i->second)
-        FlatCollection.push_back(*(i->second));
-      RangeLink.second = &FlatCollection.back();
-      RangeLinks.push_back(RangeLink);
-    }
-    IsSingleNumber = RHS.IsSingleNumber;
-    IsSingleNumbersOnly = RHS.IsSingleNumbersOnly;
-    return *this;
-  }
-
-  typedef IntRange<IntTy> Range;
-
-  /// Checks is the given constant satisfies this case. Returns
-  /// true if it equals to one of contained values or belongs to the one of
-  /// contained ranges.
-  bool isSatisfies(const IntTy &CheckingVal) const {
-    if (IsSingleNumber)
-      return FlatCollection.front() == CheckingVal;
-    if (IsSingleNumbersOnly)
-      return std::find(FlatCollection.begin(),
-                       FlatCollection.end(),
-                       CheckingVal) != FlatCollection.end();
-
-    for (size_t i = 0, e = getNumItems(); i < e; ++i) {
-      if (RangeLinks[i].first == RangeLinks[i].second) {
-        if (*RangeLinks[i].first == CheckingVal)
-          return true;
-      } else if (*RangeLinks[i].first <= CheckingVal &&
-                 *RangeLinks[i].second >= CheckingVal)
-        return true;
-    }
-    return false;
-  }
-
-  /// Returns set's item with given index.
-  Range getItem(unsigned idx) const {
-    const RangeLinkTy &Link = RangeLinks[idx];
-    if (Link.first != Link.second)
-      return Range(*Link.first, *Link.second);
-    else
-      return Range(*Link.first);
-  }
-
-  /// Return number of items (ranges) stored in set.
-  size_t getNumItems() const {
-    return RangeLinks.size();
-  }
-
-  /// Returns true if whole subset contains single element.
-  bool isSingleNumber() const {
-    return IsSingleNumber;
-  }
-
-  /// Returns true if whole subset contains only single numbers, no ranges.
-  bool isSingleNumbersOnly() const {
-    return IsSingleNumbersOnly;
-  }
-
-  /// Does the same like getItem(idx).isSingleNumber(), but
-  /// works faster, since we avoid creation of temporary range object.
-  bool isSingleNumber(unsigned idx) const {
-    return RangeLinks[idx].first == RangeLinks[idx].second;
-  }
-
-  /// Returns set the size, that equals number of all values + sizes of all
-  /// ranges.
-  /// Ranges set is considered as flat numbers collection.
-  /// E.g.: for range [<0>, <1>, <4,8>] the size will 7;
-  ///       for range [<0>, <1>, <5>] the size will 3
-  unsigned getSize() const {
-    APInt sz(((const APInt&)getItem(0).getLow()).getBitWidth(), 0);
-    for (size_t i = 0, e = getNumItems(); i != e; ++i) {
-      const APInt Low = getItem(i).getLow();
-      const APInt High = getItem(i).getHigh();
-      APInt S = High - Low + 1;
-      sz += S;
-    }
-    return sz.getZExtValue();
-  }
-
-  /// Allows to access single value even if it belongs to some range.
-  /// Ranges set is considered as flat numbers collection.
-  /// [<1>, <4,8>] is considered as [1,4,5,6,7,8]
-  /// For range [<1>, <4,8>] getSingleValue(3) returns 6.
-  APInt getSingleValue(unsigned idx) const {
-    APInt sz(((const APInt&)getItem(0).getLow()).getBitWidth(), 0);
-    for (unsigned i = 0, e = getNumItems(); i != e; ++i) {
-      const APInt Low = getItem(i).getLow();
-      const APInt High = getItem(i).getHigh();
-      APInt S = High - Low + 1;
-      APInt oldSz = sz;
-      sz += S;
-      if (sz.ugt(idx)) {
-        APInt Res = Low;
-        APInt Offset(oldSz.getBitWidth(), idx);
-        Offset -= oldSz;
-        Res += Offset;
-        return Res;
-      }
-    }
-    assert(0 && "Index exceeds high border.");
-    return sz;
-  }
-
-  /// Does the same as getSingleValue, but works only if subset contains
-  /// single numbers only.
-  const IntTy& getSingleNumber(unsigned idx) const {
-    assert(IsSingleNumbersOnly && "This method works properly if subset "
-                                  "contains single numbers only.");
-    return FlatCollection[idx];
-  }
-};
-
-//===----------------------------------------------------------------------===//
-/// IntegersSubset - currently is extension of IntegersSubsetGeneric
-/// that also supports conversion to/from Constant* object.
-class IntegersSubset : public IntegersSubsetGeneric<IntItem> {
-
-  typedef IntegersSubsetGeneric<IntItem> ParentTy;
-
-  Constant *Holder;
-
-  static unsigned getNumItemsFromConstant(Constant *C) {
-    return cast<ArrayType>(C->getType())->getNumElements();
-  }
-
-  static Range getItemFromConstant(Constant *C, unsigned idx) {
-    const Constant *CV = C->getAggregateElement(idx);
-
-    unsigned NumEls = cast<VectorType>(CV->getType())->getNumElements();
-    switch (NumEls) {
-    case 1:
-      return Range(IntItem::fromConstantInt(
-                     cast<ConstantInt>(CV->getAggregateElement(0U))),
-                   IntItem::fromConstantInt(cast<ConstantInt>(
-                     cast<ConstantInt>(CV->getAggregateElement(0U)))));
-    case 2:
-      return Range(IntItem::fromConstantInt(
-                     cast<ConstantInt>(CV->getAggregateElement(0U))),
-                   IntItem::fromConstantInt(
-                   cast<ConstantInt>(CV->getAggregateElement(1))));
-    default:
-      assert(0 && "Only pairs and single numbers are allowed here.");
-      return Range();
-    }
-  }
-
-  std::vector<Range> rangesFromConstant(Constant *C) {
-    unsigned NumItems = getNumItemsFromConstant(C);
-    std::vector<Range> r;
-    r.reserve(NumItems);
-    for (unsigned i = 0, e = NumItems; i != e; ++i)
-      r.push_back(getItemFromConstant(C, i));
-    return r;
-  }
-
-public:
-
-  explicit IntegersSubset(Constant *C) : ParentTy(rangesFromConstant(C)),
-                          Holder(C) {}
-
-  IntegersSubset(const IntegersSubset& RHS) :
-    ParentTy(*(const ParentTy *)&RHS), // FIXME: tweak for msvc.
-    Holder(RHS.Holder) {}
-
-  template<class RangesCollectionTy>
-  explicit IntegersSubset(const RangesCollectionTy& Src) : ParentTy(Src) {
-    std::vector<Constant*> Elts;
-    Elts.reserve(Src.size());
-    for (typename RangesCollectionTy::const_iterator i = Src.begin(),
-         e = Src.end(); i != e; ++i) {
-      const Range &R = *i;
-      std::vector<Constant*> r;
-      if (R.isSingleNumber()) {
-        r.reserve(2);
-        // FIXME: Since currently we have ConstantInt based numbers
-        // use hack-conversion of IntItem to ConstantInt
-        r.push_back(R.getLow().toConstantInt());
-        r.push_back(R.getHigh().toConstantInt());
-      } else {
-        r.reserve(1);
-        r.push_back(R.getLow().toConstantInt());
-      }
-      Constant *CV = ConstantVector::get(r);
-      Elts.push_back(CV);
-    }
-    ArrayType *ArrTy =
-        ArrayType::get(Elts.front()->getType(), (uint64_t)Elts.size());
-    Holder = ConstantArray::get(ArrTy, Elts);
-  }
-
-  operator Constant*() { return Holder; }
-  operator const Constant*() const { return Holder; }
-  Constant *operator->() { return Holder; }
-  const Constant *operator->() const { return Holder; }
-};
-
-}
-
-#endif /* CLLVM_SUPPORT_INTEGERSSUBSET_H */
diff --git a/include/llvm/Support/IntegersSubsetMapping.h b/include/llvm/Support/IntegersSubsetMapping.h
deleted file mode 100644
index 641ce78..0000000
--- a/include/llvm/Support/IntegersSubsetMapping.h
+++ /dev/null
@@ -1,588 +0,0 @@
-//===- IntegersSubsetMapping.h - Mapping subset ==> Successor ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// @file
-/// IntegersSubsetMapping is mapping from A to B, where
-/// Items in A is subsets of integers,
-/// Items in B some pointers (Successors).
-/// If user which to add another subset for successor that is already
-/// exists in mapping, IntegersSubsetMapping merges existing subset with
-/// added one.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SUPPORT_INTEGERSSUBSETMAPPING_H
-#define LLVM_SUPPORT_INTEGERSSUBSETMAPPING_H
-
-#include "llvm/Support/IntegersSubset.h"
-#include <list>
-#include <map>
-#include <vector>
-
-namespace llvm {
-
-template <class SuccessorClass,
-          class IntegersSubsetTy = IntegersSubset,
-          class IntTy = IntItem>
-class IntegersSubsetMapping {
-  // FIXME: To much similar iterators typedefs, similar names. 
-  //        - Rename RangeIterator to the cluster iterator.
-  //        - Remove unused "add" methods.
-  //        - Class contents needs cleaning.
-public:
-  
-  typedef IntRange<IntTy> RangeTy;
-  
-  struct RangeEx : public RangeTy {
-    RangeEx() : Weight(1) {}
-    RangeEx(const RangeTy &R) : RangeTy(R), Weight(1) {}
-    RangeEx(const RangeTy &R, unsigned W) : RangeTy(R), Weight(W) {}
-    RangeEx(const IntTy &C) : RangeTy(C), Weight(1) {}
-    RangeEx(const IntTy &L, const IntTy &H) : RangeTy(L, H), Weight(1) {}
-    RangeEx(const IntTy &L, const IntTy &H, unsigned W) :
-      RangeTy(L, H), Weight(W) {}
-    unsigned Weight;
-  };
-
-  typedef std::pair<RangeEx, SuccessorClass*> Cluster;
-
-  typedef std::list<RangeTy> RangesCollection;
-  typedef typename RangesCollection::iterator RangesCollectionIt;
-  typedef typename RangesCollection::const_iterator RangesCollectionConstIt;
-  typedef IntegersSubsetMapping<SuccessorClass, IntegersSubsetTy, IntTy> self;
-  
-protected:
-
-  typedef std::list<Cluster> CaseItems;
-  typedef typename CaseItems::iterator CaseItemIt;
-  typedef typename CaseItems::const_iterator CaseItemConstIt;
-  
-  // TODO: Change unclean CRS prefixes to SubsetMap for example.
-  typedef std::map<SuccessorClass*, RangesCollection > CRSMap;
-  typedef typename CRSMap::iterator CRSMapIt;
-
-  struct ClustersCmp {
-    bool operator()(const Cluster &C1, const Cluster &C2) {
-      return C1.first < C2.first;
-    }
-  };
-  
-  CaseItems Items;
-  bool Sorted;
-  
-  bool isIntersected(CaseItemIt& LItem, CaseItemIt& RItem) {
-    return LItem->first.getHigh() >= RItem->first.getLow();
-  }
-
-  bool isJoinable(CaseItemIt& LItem, CaseItemIt& RItem) {
-    if (LItem->second != RItem->second) {
-      assert(!isIntersected(LItem, RItem) &&
-             "Intersected items with different successors!");
-      return false;
-    }
-    APInt RLow = RItem->first.getLow();
-    if (RLow != APInt::getNullValue(RLow.getBitWidth()))
-      --RLow;
-    return LItem->first.getHigh() >= RLow;
-  }
-  
-  void sort() {
-    if (!Sorted) {
-      std::vector<Cluster> clustersVector;
-      clustersVector.reserve(Items.size());
-      clustersVector.insert(clustersVector.begin(), Items.begin(), Items.end());
-      std::sort(clustersVector.begin(), clustersVector.end(), ClustersCmp());
-      Items.clear();
-      Items.insert(Items.begin(), clustersVector.begin(), clustersVector.end());
-      Sorted = true;
-    }
-  }
-  
-  enum DiffProcessState {
-    L_OPENED,
-    INTERSECT_OPENED,
-    R_OPENED,
-    ALL_IS_CLOSED
-  };
-  
-  class DiffStateMachine {
-    
-    DiffProcessState State;
-    IntTy OpenPt;
-    SuccessorClass *CurrentLSuccessor;
-    SuccessorClass *CurrentRSuccessor;
-    
-    self *LeftMapping;
-    self *IntersectionMapping;
-    self *RightMapping;
-
-  public:
-    
-    typedef
-      IntegersSubsetMapping<SuccessorClass, IntegersSubsetTy, IntTy> MappingTy;
-    
-    DiffStateMachine(MappingTy *L,
-                                 MappingTy *Intersection,
-                                 MappingTy *R) :
-                                 State(ALL_IS_CLOSED),
-                                 LeftMapping(L),
-                                 IntersectionMapping(Intersection),
-                                 RightMapping(R)
-                                 {}
-    
-    void onLOpen(const IntTy &Pt, SuccessorClass *S) {
-      switch (State) {
-      case R_OPENED:
-        if (Pt > OpenPt/*Don't add empty ranges.*/ && RightMapping) 
-          RightMapping->add(OpenPt, Pt-1, CurrentRSuccessor);
-        State = INTERSECT_OPENED;
-        break;
-      case ALL_IS_CLOSED:
-        State = L_OPENED;
-        break;
-      default:
-        assert(0 && "Got unexpected point.");
-        break;
-      }
-      CurrentLSuccessor = S;
-      OpenPt = Pt;
-    }
-    
-    void onLClose(const IntTy &Pt) {
-      switch (State) {
-      case L_OPENED:
-        assert(Pt >= OpenPt &&
-               "Subset is not sorted or contains overlapped ranges");
-        if (LeftMapping)
-          LeftMapping->add(OpenPt, Pt, CurrentLSuccessor);
-        State = ALL_IS_CLOSED;
-        break;
-      case INTERSECT_OPENED:
-        if (IntersectionMapping)
-          IntersectionMapping->add(OpenPt, Pt, CurrentLSuccessor);
-        OpenPt = Pt + 1;
-        State = R_OPENED;
-        break;
-      default:
-        assert(0 && "Got unexpected point.");
-        break;
-      }
-    }
-    
-    void onROpen(const IntTy &Pt, SuccessorClass *S) {
-      switch (State) {
-      case L_OPENED:
-        if (Pt > OpenPt && LeftMapping)
-          LeftMapping->add(OpenPt, Pt-1, CurrentLSuccessor);
-        State = INTERSECT_OPENED;
-        break;
-      case ALL_IS_CLOSED:
-        State = R_OPENED;
-        break;
-      default:
-        assert(0 && "Got unexpected point.");
-        break;
-      }
-      CurrentRSuccessor = S;
-      OpenPt = Pt;      
-    }
-    
-    void onRClose(const IntTy &Pt) {
-      switch (State) {
-      case R_OPENED:
-        assert(Pt >= OpenPt &&
-               "Subset is not sorted or contains overlapped ranges");
-        if (RightMapping)
-          RightMapping->add(OpenPt, Pt, CurrentRSuccessor);
-        State = ALL_IS_CLOSED;
-        break;
-      case INTERSECT_OPENED:
-        if (IntersectionMapping)
-          IntersectionMapping->add(OpenPt, Pt, CurrentLSuccessor);
-        OpenPt = Pt + 1;
-        State = L_OPENED;
-        break;
-      default:
-        assert(0 && "Got unexpected point.");
-        break;
-      }
-    }
-    
-    void onLROpen(const IntTy &Pt,
-                  SuccessorClass *LS,
-                  SuccessorClass *RS) {
-      switch (State) {
-      case ALL_IS_CLOSED:
-        State = INTERSECT_OPENED;
-        break;
-      default:
-        assert(0 && "Got unexpected point.");
-        break;
-      }
-      CurrentLSuccessor = LS;
-      CurrentRSuccessor = RS;
-      OpenPt = Pt;        
-    }
-    
-    void onLRClose(const IntTy &Pt) {
-      switch (State) {
-      case INTERSECT_OPENED:
-        if (IntersectionMapping)
-          IntersectionMapping->add(OpenPt, Pt, CurrentLSuccessor);
-        State = ALL_IS_CLOSED;
-        break;
-      default:
-        assert(0 && "Got unexpected point.");
-        break;        
-      }
-    }
-    
-    bool isLOpened() { return State == L_OPENED; }
-    bool isROpened() { return State == R_OPENED; }
-  };
-
-public:
-  
-  // Don't public CaseItems itself. Don't allow edit the Items directly. 
-  // Just present the user way to iterate over the internal collection
-  // sharing iterator, begin() and end(). Editing should be controlled by
-  // factory.
-  typedef CaseItemIt RangeIterator;
-  
-  typedef std::pair<SuccessorClass*, IntegersSubsetTy> Case;
-  typedef std::list<Case> Cases;
-  typedef typename Cases::iterator CasesIt;
-  
-  IntegersSubsetMapping() {
-    Sorted = false;
-  }
-  
-  bool verify() {
-    RangeIterator DummyErrItem;
-    return verify(DummyErrItem);
-  }
-  
-  bool verify(RangeIterator& errItem) {
-    if (Items.empty())
-      return true;
-    sort();
-    for (CaseItemIt j = Items.begin(), i = j++, e = Items.end();
-         j != e; i = j++) {
-      if (isIntersected(i, j) && i->second != j->second) {
-        errItem = j;
-        return false;
-      }
-    }
-    return true;
-  }
-
-  bool isOverlapped(self &RHS) {
-    if (Items.empty() || RHS.empty())
-      return true;
-    
-    for (CaseItemIt L = Items.begin(), R = RHS.Items.begin(),
-         el = Items.end(), er = RHS.Items.end(); L != el && R != er;) {
-      
-      const RangeTy &LRange = L->first;
-      const RangeTy &RRange = R->first;
-      
-      if (LRange.getLow() > RRange.getLow()) {
-        if (RRange.isSingleNumber() || LRange.getLow() > RRange.getHigh())
-          ++R;
-        else
-          return true;
-      } else if (LRange.getLow() < RRange.getLow()) {
-        if (LRange.isSingleNumber() || LRange.getHigh() < RRange.getLow())
-          ++L;
-        else
-          return true;
-      } else // iRange.getLow() == jRange.getLow() 
-        return true;
-    }
-    return false;
-  }
-   
-  
-  void optimize() {
-    if (Items.size() < 2)
-      return;
-    sort();
-    CaseItems OldItems = Items;
-    Items.clear();
-    const IntTy *Low = &OldItems.begin()->first.getLow();
-    const IntTy *High = &OldItems.begin()->first.getHigh();
-    unsigned Weight = OldItems.begin()->first.Weight;
-    SuccessorClass *Successor = OldItems.begin()->second;
-    for (CaseItemIt j = OldItems.begin(), i = j++, e = OldItems.end();
-         j != e; i = j++) {
-      if (isJoinable(i, j)) {
-        const IntTy *CurHigh = &j->first.getHigh();
-        Weight += j->first.Weight;
-        if (*CurHigh > *High)
-          High = CurHigh;
-      } else {
-        RangeEx R(*Low, *High, Weight);
-        add(R, Successor);
-        Low = &j->first.getLow();
-        High = &j->first.getHigh(); 
-        Weight = j->first.Weight;
-        Successor = j->second;
-      }
-    }
-    RangeEx R(*Low, *High, Weight);
-    add(R, Successor);
-    // We recollected the Items, but we kept it sorted.
-    Sorted = true;
-  }
-  
-  /// Adds a constant value.
-  void add(const IntTy &C, SuccessorClass *S = 0) {
-    RangeTy R(C);
-    add(R, S);
-  }
-  
-  /// Adds a range.
-  void add(const IntTy &Low, const IntTy &High, SuccessorClass *S = 0) {
-    RangeTy R(Low, High);
-    add(R, S);
-  }
-  void add(const RangeTy &R, SuccessorClass *S = 0) {
-    RangeEx REx = R;
-    add(REx, S);
-  }   
-  void add(const RangeEx &R, SuccessorClass *S = 0) {
-    Items.push_back(std::make_pair(R, S));
-    Sorted = false;
-  }  
-  
-  /// Adds all ranges and values from given ranges set to the current
-  /// mapping.
-  void add(const IntegersSubsetTy &CRS, SuccessorClass *S = 0,
-           unsigned Weight = 0) {
-    unsigned ItemWeight = 1;
-    if (Weight)
-      // Weight is associated with CRS, for now we perform a division to
-      // get the weight for each item.
-      ItemWeight = Weight / CRS.getNumItems();
-    for (unsigned i = 0, e = CRS.getNumItems(); i < e; ++i) {
-      RangeTy R = CRS.getItem(i);
-      RangeEx REx(R, ItemWeight);
-      add(REx, S);
-    }
-  }
-  
-  void add(self& RHS) {
-    Items.insert(Items.end(), RHS.Items.begin(), RHS.Items.end());
-  }
-  
-  void add(self& RHS, SuccessorClass *S) {
-    for (CaseItemIt i = RHS.Items.begin(), e = RHS.Items.end(); i != e; ++i)
-      add(i->first, S);
-  }  
-  
-  void add(const RangesCollection& RHS, SuccessorClass *S = 0) {
-    for (RangesCollectionConstIt i = RHS.begin(), e = RHS.end(); i != e; ++i)
-      add(*i, S);
-  }  
-  
-  /// Removes items from set.
-  void removeItem(RangeIterator i) { Items.erase(i); }
-  
-  /// Moves whole case from current mapping to the NewMapping object.
-  void detachCase(self& NewMapping, SuccessorClass *Succ) {
-    for (CaseItemIt i = Items.begin(); i != Items.end();)
-      if (i->second == Succ) {
-        NewMapping.add(i->first, i->second);
-        Items.erase(i++);
-      } else
-        ++i;
-  }
-  
-  /// Removes all clusters for given successor.
-  void removeCase(SuccessorClass *Succ) {
-    for (CaseItemIt i = Items.begin(); i != Items.end();)
-      if (i->second == Succ) {
-        Items.erase(i++);
-      } else
-        ++i;
-  }  
-  
-  /// Find successor that satisfies given value.
-  SuccessorClass *findSuccessor(const IntTy& Val) {
-    for (CaseItemIt i = Items.begin(); i != Items.end(); ++i) {
-      if (i->first.isInRange(Val))
-        return i->second;
-    }
-    return 0;
-  }  
-  
-  /// Calculates the difference between this mapping and RHS.
-  /// THIS without RHS is placed into LExclude,
-  /// RHS without THIS is placed into RExclude,
-  /// THIS intersect RHS is placed into Intersection.
-  void diff(self *LExclude, self *Intersection, self *RExclude,
-                             const self& RHS) {
-    
-    DiffStateMachine Machine(LExclude, Intersection, RExclude);
-    
-    CaseItemConstIt L = Items.begin(), R = RHS.Items.begin();
-    while (L != Items.end() && R != RHS.Items.end()) {
-      const Cluster &LCluster = *L;
-      const RangeEx &LRange = LCluster.first;
-      const Cluster &RCluster = *R;
-      const RangeEx &RRange = RCluster.first;
-      
-      if (LRange.getHigh() < RRange.getLow()) {
-        Machine.onLOpen(LRange.getLow(), LCluster.second);
-        Machine.onLClose(LRange.getHigh());
-        ++L;
-        continue;
-      }
-      
-      if (LRange.getLow() > RRange.getHigh()) {
-        Machine.onROpen(RRange.getLow(), RCluster.second);
-        Machine.onRClose(RRange.getHigh());
-        ++R;
-        continue;
-      }
-
-      if (LRange.getLow() < RRange.getLow()) {
-        // May be opened in previous iteration.
-        if (!Machine.isLOpened())
-          Machine.onLOpen(LRange.getLow(), LCluster.second);
-        Machine.onROpen(RRange.getLow(), RCluster.second);
-      }
-      else if (RRange.getLow() < LRange.getLow()) {
-        if (!Machine.isROpened())
-          Machine.onROpen(RRange.getLow(), RCluster.second);
-        Machine.onLOpen(LRange.getLow(), LCluster.second);
-      }
-      else
-        Machine.onLROpen(LRange.getLow(), LCluster.second, RCluster.second);
-      
-      if (LRange.getHigh() < RRange.getHigh()) {
-        Machine.onLClose(LRange.getHigh());
-        ++L;
-        while(L != Items.end() && L->first.getHigh() < RRange.getHigh()) {
-          Machine.onLOpen(L->first.getLow(), L->second);
-          Machine.onLClose(L->first.getHigh());
-          ++L;
-        }
-      }
-      else if (RRange.getHigh() < LRange.getHigh()) {
-        Machine.onRClose(RRange.getHigh());
-        ++R;
-        while(R != RHS.Items.end() && R->first.getHigh() < LRange.getHigh()) {
-          Machine.onROpen(R->first.getLow(), R->second);
-          Machine.onRClose(R->first.getHigh());
-          ++R;
-        }
-      }
-      else {
-        Machine.onLRClose(LRange.getHigh());
-        ++L;
-        ++R;
-      }
-    }
-    
-    if (L != Items.end()) {
-      if (Machine.isLOpened()) {
-        Machine.onLClose(L->first.getHigh());
-        ++L;
-      }
-      if (LExclude)
-        while (L != Items.end()) {
-          LExclude->add(L->first, L->second);
-          ++L;
-        }
-    } else if (R != RHS.Items.end()) {
-      if (Machine.isROpened()) {
-        Machine.onRClose(R->first.getHigh());
-        ++R;
-      }
-      if (RExclude)
-        while (R != RHS.Items.end()) {
-          RExclude->add(R->first, R->second);
-          ++R;
-        }
-    }
-  }  
-  
-  /// Builds the finalized case objects.
-  void getCases(Cases& TheCases, bool PreventMerging = false) {
-    //FIXME: PreventMerging is a temporary parameter.
-    //Currently a set of passes is still knows nothing about
-    //switches with case ranges, and if these passes meet switch
-    //with complex case that crashs the application.
-    if (PreventMerging) {
-      for (RangeIterator i = this->begin(); i != this->end(); ++i) {
-        RangesCollection SingleRange;
-        SingleRange.push_back(i->first);
-        TheCases.push_back(std::make_pair(i->second,
-                                          IntegersSubsetTy(SingleRange)));
-      }
-      return;
-    }
-    CRSMap TheCRSMap;
-    for (RangeIterator i = this->begin(); i != this->end(); ++i)
-      TheCRSMap[i->second].push_back(i->first);
-    for (CRSMapIt i = TheCRSMap.begin(), e = TheCRSMap.end(); i != e; ++i)
-      TheCases.push_back(std::make_pair(i->first, IntegersSubsetTy(i->second)));
-  }
-  
-  /// Builds the finalized case objects ignoring successor values, as though
-  /// all ranges belongs to the same successor.
-  IntegersSubsetTy getCase() {
-    RangesCollection Ranges;
-    for (RangeIterator i = this->begin(); i != this->end(); ++i)
-      Ranges.push_back(i->first);
-    return IntegersSubsetTy(Ranges);
-  }  
-  
-  /// Returns pointer to value of case if it is single-numbered or 0
-  /// in another case.
-  const IntTy* getCaseSingleNumber(SuccessorClass *Succ) {
-    const IntTy* Res = 0;
-    for (CaseItemIt i = Items.begin(); i != Items.end(); ++i)
-      if (i->second == Succ) {
-        if (!i->first.isSingleNumber())
-          return 0;
-        if (Res)
-          return 0;
-        else 
-          Res = &(i->first.getLow());
-      }
-    return Res;
-  }  
-  
-  /// Returns true if there is no ranges and values inside.
-  bool empty() const { return Items.empty(); }
-  
-  void clear() {
-    Items.clear();
-    // Don't reset Sorted flag:
-    // 1. For empty mapping it matters nothing.
-    // 2. After first item will added Sorted flag will cleared.
-  }  
-  
-  // Returns number of clusters
-  unsigned size() const {
-    return Items.size();
-  }
-  
-  RangeIterator begin() { return Items.begin(); }
-  RangeIterator end() { return Items.end(); }
-};
-
-class BasicBlock;
-typedef IntegersSubsetMapping<BasicBlock> IntegersSubsetToBB;
-
-}
-
-#endif /* LLVM_SUPPORT_INTEGERSSUBSETMAPPING_CRSBUILDER_H */
diff --git a/include/llvm/Support/MachO.h b/include/llvm/Support/MachO.h
index 7f28c3f..897a611 100644
--- a/include/llvm/Support/MachO.h
+++ b/include/llvm/Support/MachO.h
@@ -14,273 +14,419 @@
 #ifndef LLVM_SUPPORT_MACHO_H
 #define LLVM_SUPPORT_MACHO_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Host.h"
 
-// NOTE: The enums in this file are intentially named to be different than those
-// in the headers in /usr/include/mach (on darwin systems) to avoid conflicts
-// with those macros.
 namespace llvm {
   namespace MachO {
     // Enums from <mach-o/loader.h>
-    enum {
+    enum LLVM_ENUM_INT_TYPE(uint32_t) {
       // Constants for the "magic" field in llvm::MachO::mach_header and
       // llvm::MachO::mach_header_64
-      HeaderMagic32         = 0xFEEDFACEu, // MH_MAGIC
-      HeaderMagic32Swapped  = 0xCEFAEDFEu, // MH_CIGAM
-      HeaderMagic64         = 0xFEEDFACFu, // MH_MAGIC_64
-      HeaderMagic64Swapped  = 0xCFFAEDFEu, // MH_CIGAM_64
-      UniversalMagic        = 0xCAFEBABEu, // FAT_MAGIC
-      UniversalMagicSwapped = 0xBEBAFECAu, // FAT_CIGAM
+      MH_MAGIC    = 0xFEEDFACEu,
+      MH_CIGAM    = 0xCEFAEDFEu,
+      MH_MAGIC_64 = 0xFEEDFACFu,
+      MH_CIGAM_64 = 0xCFFAEDFEu,
+      FAT_MAGIC   = 0xCAFEBABEu,
+      FAT_CIGAM   = 0xBEBAFECAu
+    };
 
+    enum HeaderFileType {
       // Constants for the "filetype" field in llvm::MachO::mach_header and
       // llvm::MachO::mach_header_64
-      HeaderFileTypeObject              = 0x1u, // MH_OBJECT
-      HeaderFileTypeExecutable          = 0x2u, // MH_EXECUTE
-      HeaderFileTypeFixedVMShlib        = 0x3u, // MH_FVMLIB
-      HeaderFileTypeCore                = 0x4u, // MH_CORE
-      HeaderFileTypePreloadedExecutable = 0x5u, // MH_PRELOAD
-      HeaderFileTypeDynamicShlib        = 0x6u, // MH_DYLIB
-      HeaderFileTypeDynamicLinkEditor   = 0x7u, // MH_DYLINKER
-      HeaderFileTypeBundle              = 0x8u, // MH_BUNDLE
-      HeaderFileTypeDynamicShlibStub    = 0x9u, // MH_DYLIB_STUB
-      HeaderFileTypeDSYM                = 0xAu, // MH_DSYM
-      HeaderFileTypeKextBundle          = 0xBu, // MH_KEXT_BUNDLE
+      MH_OBJECT      = 0x1u,
+      MH_EXECUTE     = 0x2u,
+      MH_FVMLIB      = 0x3u,
+      MH_CORE        = 0x4u,
+      MH_PRELOAD     = 0x5u,
+      MH_DYLIB       = 0x6u,
+      MH_DYLINKER    = 0x7u,
+      MH_BUNDLE      = 0x8u,
+      MH_DYLIB_STUB  = 0x9u,
+      MH_DSYM        = 0xAu,
+      MH_KEXT_BUNDLE = 0xBu
+    };
 
+    enum {
       // Constant bits for the "flags" field in llvm::MachO::mach_header and
       // llvm::MachO::mach_header_64
-      HeaderFlagBitNoUndefinedSymbols     = 0x00000001u, // MH_NOUNDEFS
-      HeaderFlagBitIsIncrementalLinkObject= 0x00000002u, // MH_INCRLINK
-      HeaderFlagBitIsDynamicLinkObject    = 0x00000004u, // MH_DYLDLINK
-      HeaderFlagBitBindAtLoad             = 0x00000008u, // MH_BINDATLOAD
-      HeaderFlagBitPrebound               = 0x00000010u, // MH_PREBOUND
-      HeaderFlagBitSplitSegments          = 0x00000020u, // MH_SPLIT_SEGS
-      HeaderFlagBitLazyInit               = 0x00000040u, // MH_LAZY_INIT
-      HeaderFlagBitTwoLevelNamespace      = 0x00000080u, // MH_TWOLEVEL
-      HeaderFlagBitForceFlatNamespace     = 0x00000100u, // MH_FORCE_FLAT
-      HeaderFlagBitNoMultipleDefintions   = 0x00000200u, // MH_NOMULTIDEFS
-      HeaderFlagBitNoFixPrebinding        = 0x00000400u, // MH_NOFIXPREBINDING
-      HeaderFlagBitPrebindable            = 0x00000800u, // MH_PREBINDABLE
-      HeaderFlagBitAllModulesBound        = 0x00001000u, // MH_ALLMODSBOUND
-      HeaderFlagBitSubsectionsViaSymbols  = 0x00002000u, // MH_SUBSECTIONS_VIA_SYMBOLS
-      HeaderFlagBitCanonical              = 0x00004000u, // MH_CANONICAL
-      HeaderFlagBitWeakDefines            = 0x00008000u, // MH_WEAK_DEFINES
-      HeaderFlagBitBindsToWeak            = 0x00010000u, // MH_BINDS_TO_WEAK
-      HeaderFlagBitAllowStackExecution    = 0x00020000u, // MH_ALLOW_STACK_EXECUTION
-      HeaderFlagBitRootSafe               = 0x00040000u, // MH_ROOT_SAFE
-      HeaderFlagBitSetUIDSafe             = 0x00080000u, // MH_SETUID_SAFE
-      HeaderFlagBitNoReexportedDylibs     = 0x00100000u, // MH_NO_REEXPORTED_DYLIBS
-      HeaderFlagBitPIE                    = 0x00200000u, // MH_PIE
-      HeaderFlagBitDeadStrippableDylib    = 0x00400000u, // MH_DEAD_STRIPPABLE_DYLIB
-
+      MH_NOUNDEFS                = 0x00000001u,
+      MH_INCRLINK                = 0x00000002u,
+      MH_DYLDLINK                = 0x00000004u,
+      MH_BINDATLOAD              = 0x00000008u,
+      MH_PREBOUND                = 0x00000010u,
+      MH_SPLIT_SEGS              = 0x00000020u,
+      MH_LAZY_INIT               = 0x00000040u,
+      MH_TWOLEVEL                = 0x00000080u,
+      MH_FORCE_FLAT              = 0x00000100u,
+      MH_NOMULTIDEFS             = 0x00000200u,
+      MH_NOFIXPREBINDING         = 0x00000400u,
+      MH_PREBINDABLE             = 0x00000800u,
+      MH_ALLMODSBOUND            = 0x00001000u,
+      MH_SUBSECTIONS_VIA_SYMBOLS = 0x00002000u,
+      MH_CANONICAL               = 0x00004000u,
+      MH_WEAK_DEFINES            = 0x00008000u,
+      MH_BINDS_TO_WEAK           = 0x00010000u,
+      MH_ALLOW_STACK_EXECUTION   = 0x00020000u,
+      MH_ROOT_SAFE               = 0x00040000u,
+      MH_SETUID_SAFE             = 0x00080000u,
+      MH_NO_REEXPORTED_DYLIBS    = 0x00100000u,
+      MH_PIE                     = 0x00200000u,
+      MH_DEAD_STRIPPABLE_DYLIB   = 0x00400000u
+    };
+
+    enum LLVM_ENUM_INT_TYPE(uint32_t) {
+      // Flags for the "cmd" field in llvm::MachO::load_command
+      LC_REQ_DYLD    = 0x80000000u
+    };
+
+    enum LoadCommandType LLVM_ENUM_INT_TYPE(uint32_t) {
       // Constants for the "cmd" field in llvm::MachO::load_command
-      LoadCommandDynamicLinkerRequired    = 0x80000000u, // LC_REQ_DYLD
-      LoadCommandSegment32                = 0x00000001u, // LC_SEGMENT
-      LoadCommandSymtab                   = 0x00000002u, // LC_SYMTAB
-      LoadCommandSymSeg                   = 0x00000003u, // LC_SYMSEG
-      LoadCommandThread                   = 0x00000004u, // LC_THREAD
-      LoadCommandUnixThread               = 0x00000005u, // LC_UNIXTHREAD
-      LoadCommandFixedVMShlibLoad         = 0x00000006u, // LC_LOADFVMLIB
-      LoadCommandFixedVMShlibIdent        = 0x00000007u, // LC_IDFVMLIB
-      LoadCommandIdent                    = 0x00000008u, // LC_IDENT
-      LoadCommandFixedVMFileInclusion     = 0x00000009u, // LC_FVMFILE
-      LoadCommandPrePage                  = 0x0000000Au, // LC_PREPAGE
-      LoadCommandDynamicSymtabInfo        = 0x0000000Bu, // LC_DYSYMTAB
-      LoadCommandDylibLoad                = 0x0000000Cu, // LC_LOAD_DYLIB
-      LoadCommandDylibIdent               = 0x0000000Du, // LC_ID_DYLIB
-      LoadCommandDynamicLinkerLoad        = 0x0000000Eu, // LC_LOAD_DYLINKER
-      LoadCommandDynamicLinkerIdent       = 0x0000000Fu, // LC_ID_DYLINKER
-      LoadCommandDylibPrebound            = 0x00000010u, // LC_PREBOUND_DYLIB
-      LoadCommandRoutines32               = 0x00000011u, // LC_ROUTINES
-      LoadCommandSubFramework             = 0x00000012u, // LC_SUB_FRAMEWORK
-      LoadCommandSubUmbrella              = 0x00000013u, // LC_SUB_UMBRELLA
-      LoadCommandSubClient                = 0x00000014u, // LC_SUB_CLIENT
-      LoadCommandSubLibrary               = 0x00000015u, // LC_SUB_LIBRARY
-      LoadCommandTwoLevelHints            = 0x00000016u, // LC_TWOLEVEL_HINTS
-      LoadCommandPreBindChecksum          = 0x00000017u, // LC_PREBIND_CKSUM
-      LoadCommandDylibLoadWeak            = 0x80000018u, // LC_LOAD_WEAK_DYLIB
-      LoadCommandSegment64                = 0x00000019u, // LC_SEGMENT_64
-      LoadCommandRoutines64               = 0x0000001Au, // LC_ROUTINES_64
-      LoadCommandUUID                     = 0x0000001Bu, // LC_UUID
-      LoadCommandRunpath                  = 0x8000001Cu, // LC_RPATH
-      LoadCommandCodeSignature            = 0x0000001Du, // LC_CODE_SIGNATURE
-      LoadCommandSegmentSplitInfo         = 0x0000001Eu, // LC_SEGMENT_SPLIT_INFO
-      LoadCommandDylibReexport            = 0x8000001Fu, // LC_REEXPORT_DYLIB
-      LoadCommandDylibLazyLoad            = 0x00000020u, // LC_LAZY_LOAD_DYLIB
-      LoadCommandEncryptionInfo           = 0x00000021u, // LC_ENCRYPTION_INFO
-      LoadCommandDynamicLinkerInfo        = 0x00000022u, // LC_DYLD_INFO
-      LoadCommandDynamicLinkerInfoOnly    = 0x80000022u, // LC_DYLD_INFO_ONLY
-      LoadCommandDylibLoadUpward          = 0x80000023u, // LC_LOAD_UPWARD_DYLIB
-      LoadCommandVersionMinMacOSX         = 0x00000024u, // LC_VERSION_MIN_MACOSX
-      LoadCommandVersionMinIPhoneOS       = 0x00000025u, // LC_VERSION_MIN_IPHONEOS
-      LoadCommandFunctionStarts           = 0x00000026u, // LC_FUNCTION_STARTS
-      LoadCommandDyldEnvironment          = 0x00000027u, // LC_DYLD_ENVIRONMENT
-      LoadCommandMain                     = 0x80000028u, // LC_MAIN
-      LoadCommandDataInCode               = 0x00000029u, // LC_DATA_IN_CODE
-      LoadCommandSourceVersion            = 0x0000002Au, // LC_SOURCE_VERSION
-      LoadCommandCodeSignDRs              = 0x0000002Bu, // LC_DYLIB_CODE_SIGN_DRS
-
+      LC_SEGMENT              = 0x00000001u,
+      LC_SYMTAB               = 0x00000002u,
+      LC_SYMSEG               = 0x00000003u,
+      LC_THREAD               = 0x00000004u,
+      LC_UNIXTHREAD           = 0x00000005u,
+      LC_LOADFVMLIB           = 0x00000006u,
+      LC_IDFVMLIB             = 0x00000007u,
+      LC_IDENT                = 0x00000008u,
+      LC_FVMFILE              = 0x00000009u,
+      LC_PREPAGE              = 0x0000000Au,
+      LC_DYSYMTAB             = 0x0000000Bu,
+      LC_LOAD_DYLIB           = 0x0000000Cu,
+      LC_ID_DYLIB             = 0x0000000Du,
+      LC_LOAD_DYLINKER        = 0x0000000Eu,
+      LC_ID_DYLINKER          = 0x0000000Fu,
+      LC_PREBOUND_DYLIB       = 0x00000010u,
+      LC_ROUTINES             = 0x00000011u,
+      LC_SUB_FRAMEWORK        = 0x00000012u,
+      LC_SUB_UMBRELLA         = 0x00000013u,
+      LC_SUB_CLIENT           = 0x00000014u,
+      LC_SUB_LIBRARY          = 0x00000015u,
+      LC_TWOLEVEL_HINTS       = 0x00000016u,
+      LC_PREBIND_CKSUM        = 0x00000017u,
+      LC_LOAD_WEAK_DYLIB      = 0x80000018u,
+      LC_SEGMENT_64           = 0x00000019u,
+      LC_ROUTINES_64          = 0x0000001Au,
+      LC_UUID                 = 0x0000001Bu,
+      LC_RPATH                = 0x8000001Cu,
+      LC_CODE_SIGNATURE       = 0x0000001Du,
+      LC_SEGMENT_SPLIT_INFO   = 0x0000001Eu,
+      LC_REEXPORT_DYLIB       = 0x8000001Fu,
+      LC_LAZY_LOAD_DYLIB      = 0x00000020u,
+      LC_ENCRYPTION_INFO      = 0x00000021u,
+      LC_DYLD_INFO            = 0x00000022u,
+      LC_DYLD_INFO_ONLY       = 0x80000022u,
+      LC_LOAD_UPWARD_DYLIB    = 0x80000023u,
+      LC_VERSION_MIN_MACOSX   = 0x00000024u,
+      LC_VERSION_MIN_IPHONEOS = 0x00000025u,
+      LC_FUNCTION_STARTS      = 0x00000026u,
+      LC_DYLD_ENVIRONMENT     = 0x00000027u,
+      LC_MAIN                 = 0x80000028u,
+      LC_DATA_IN_CODE         = 0x00000029u,
+      LC_SOURCE_VERSION       = 0x0000002Au,
+      LC_DYLIB_CODE_SIGN_DRS  = 0x0000002Bu,
+      //                        0x0000002Cu,
+      LC_LINKER_OPTIONS       = 0x0000002Du
+    };
+
+    enum LLVM_ENUM_INT_TYPE(uint32_t) {
       // Constant bits for the "flags" field in llvm::MachO::segment_command
-      SegmentCommandFlagBitHighVM             = 0x1u, // SG_HIGHVM
-      SegmentCommandFlagBitFixedVMLibrary     = 0x2u, // SG_FVMLIB
-      SegmentCommandFlagBitNoRelocations      = 0x4u, // SG_NORELOC
-      SegmentCommandFlagBitProtectedVersion1  = 0x8u, // SG_PROTECTED_VERSION_1
+      SG_HIGHVM              = 0x1u,
+      SG_FVMLIB              = 0x2u,
+      SG_NORELOC             = 0x4u,
+      SG_PROTECTED_VERSION_1 = 0x8u,
 
 
       // Constant masks for the "flags" field in llvm::MachO::section and
       // llvm::MachO::section_64
-      SectionFlagMaskSectionType      = 0x000000ffu, // SECTION_TYPE
-      SectionFlagMaskAllAttributes    = 0xffffff00u, // SECTION_ATTRIBUTES
-      SectionFlagMaskUserAttributes   = 0xff000000u, // SECTION_ATTRIBUTES_USR
-      SectionFlagMaskSystemAttributes = 0x00ffff00u, // SECTION_ATTRIBUTES_SYS
+      SECTION_TYPE           = 0x000000ffu, // SECTION_TYPE
+      SECTION_ATTRIBUTES     = 0xffffff00u, // SECTION_ATTRIBUTES
+      SECTION_ATTRIBUTES_USR = 0xff000000u, // SECTION_ATTRIBUTES_USR
+      SECTION_ATTRIBUTES_SYS = 0x00ffff00u  // SECTION_ATTRIBUTES_SYS
+    };
 
+    enum SectionType {
       // Constant masks for the "flags[7:0]" field in llvm::MachO::section and
       // llvm::MachO::section_64 (mask "flags" with SECTION_TYPE)
-      SectionTypeRegular                    = 0x00u, // S_REGULAR
-      SectionTypeZeroFill                   = 0x01u, // S_ZEROFILL
-      SectionTypeCStringLiterals            = 0x02u, // S_CSTRING_LITERALS
-      SectionType4ByteLiterals              = 0x03u, // S_4BYTE_LITERALS
-      SectionType8ByteLiterals              = 0x04u, // S_8BYTE_LITERALS
-      SectionTypeLiteralPointers            = 0x05u, // S_LITERAL_POINTERS
-      SectionTypeNonLazySymbolPointers      = 0x06u, // S_NON_LAZY_SYMBOL_POINTERS
-      SectionTypeLazySymbolPointers         = 0x07u, // S_LAZY_SYMBOL_POINTERS
-      SectionTypeSymbolStubs                = 0x08u, // S_SYMBOL_STUBS
-      SectionTypeModuleInitFunctionPointers = 0x09u, // S_MOD_INIT_FUNC_POINTERS
-      SectionTypeModuleTermFunctionPointers = 0x0au, // S_MOD_TERM_FUNC_POINTERS
-      SectionTypeCoalesced                  = 0x0bu, // S_COALESCED
-      SectionTypeZeroFillLarge              = 0x0cu, // S_GB_ZEROFILL
-      SectionTypeInterposing                = 0x0du, // S_INTERPOSING
-      SectionType16ByteLiterals             = 0x0eu, // S_16BYTE_LITERALS
-      SectionTypeDTraceObjectFormat         = 0x0fu, // S_DTRACE_DOF
-      SectionTypeLazyDylibSymbolPointers    = 0x10u, // S_LAZY_DYLIB_SYMBOL_POINTERS
-
+      S_REGULAR                             = 0x00u,
+      S_ZEROFILL                            = 0x01u,
+      S_CSTRING_LITERALS                    = 0x02u,
+      S_4BYTE_LITERALS                      = 0x03u,
+      S_8BYTE_LITERALS                      = 0x04u,
+      S_LITERAL_POINTERS                    = 0x05u,
+      S_NON_LAZY_SYMBOL_POINTERS            = 0x06u,
+      S_LAZY_SYMBOL_POINTERS                = 0x07u,
+      S_SYMBOL_STUBS                        = 0x08u,
+      S_MOD_INIT_FUNC_POINTERS              = 0x09u,
+      S_MOD_TERM_FUNC_POINTERS              = 0x0au,
+      S_COALESCED                           = 0x0bu,
+      S_GB_ZEROFILL                         = 0x0cu,
+      S_INTERPOSING                         = 0x0du,
+      S_16BYTE_LITERALS                     = 0x0eu,
+      S_DTRACE_DOF                          = 0x0fu,
+      S_LAZY_DYLIB_SYMBOL_POINTERS          = 0x10u,
+      S_THREAD_LOCAL_REGULAR                = 0x11u,
+      S_THREAD_LOCAL_ZEROFILL               = 0x12u,
+      S_THREAD_LOCAL_VARIABLES              = 0x13u,
+      S_THREAD_LOCAL_VARIABLE_POINTERS      = 0x14u,
+      S_THREAD_LOCAL_INIT_FUNCTION_POINTERS = 0x15u
+    };
+
+    enum LLVM_ENUM_INT_TYPE(uint32_t) {
       // Constant masks for the "flags[31:24]" field in llvm::MachO::section and
       // llvm::MachO::section_64 (mask "flags" with SECTION_ATTRIBUTES_USR)
-      SectionAttrUserPureInstructions       = 0x80000000u, // S_ATTR_PURE_INSTRUCTIONS
-      SectionAttrUserNoTableOfContents      = 0x40000000u, // S_ATTR_NO_TOC
-      SectionAttrUserCanStripStaticSymbols  = 0x20000000u, // S_ATTR_STRIP_STATIC_SYMS
-      SectionAttrUserNoDeadStrip            = 0x10000000u, // S_ATTR_NO_DEAD_STRIP
-      SectionAttrUserLiveSupport            = 0x08000000u, // S_ATTR_LIVE_SUPPORT
-      SectionAttrUserSelfModifyingCode      = 0x04000000u, // S_ATTR_SELF_MODIFYING_CODE
-      SectionAttrUserDebug                  = 0x02000000u, // S_ATTR_DEBUG
+      S_ATTR_PURE_INSTRUCTIONS   = 0x80000000u,
+      S_ATTR_NO_TOC              = 0x40000000u,
+      S_ATTR_STRIP_STATIC_SYMS   = 0x20000000u,
+      S_ATTR_NO_DEAD_STRIP       = 0x10000000u,
+      S_ATTR_LIVE_SUPPORT        = 0x08000000u,
+      S_ATTR_SELF_MODIFYING_CODE = 0x04000000u,
+      S_ATTR_DEBUG               = 0x02000000u,
 
       // Constant masks for the "flags[23:8]" field in llvm::MachO::section and
       // llvm::MachO::section_64 (mask "flags" with SECTION_ATTRIBUTES_SYS)
-      SectionAttrSytemSomeInstructions      = 0x00000400u, // S_ATTR_SOME_INSTRUCTIONS
-      SectionAttrSytemHasExternalRelocations= 0x00000200u, // S_ATTR_EXT_RELOC
-      SectionAttrSytemHasLocalRelocations   = 0x00000100u, // S_ATTR_LOC_RELOC
-
-      IndirectSymbolLocal                   = 0x80000000u, // INDIRECT_SYMBOL_LOCAL
-      IndirectSymbolAbsolute                = 0x40000000u, // INDIRECT_SYMBOL_ABS
-
-      RebaseTypePointer                     = 1u, // REBASE_TYPE_POINTER
-      RebaseTypeTextAbsolute32              = 2u, // REBASE_TYPE_TEXT_ABSOLUTE32
-      RebaseTypeTextPCRelative32            = 3u, // REBASE_TYPE_TEXT_PCREL32
-
-      RebaseOpcodeMask                          = 0xF0u, // REBASE_OPCODE_MASK
-      RebaseImmediateMask                       = 0x0Fu, // REBASE_IMMEDIATE_MASK
-      RebaseOpcodeDone                          = 0x00u, // REBASE_OPCODE_DONE
-      RebaseOpcodeSetTypeImmediate              = 0x10u, // REBASE_OPCODE_SET_TYPE_IMM
-      RebaseOpcodeSetSegmentAndOffsetULEB       = 0x20u, // REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
-      RebaseOpcodeAddAddressULEB                = 0x30u, // REBASE_OPCODE_ADD_ADDR_ULEB
-      RebaseOpcodeAddAddressImmediateScaled     = 0x40u, // REBASE_OPCODE_ADD_ADDR_IMM_SCALED
-      RebaseOpcodeDoRebaseImmediateTimes        = 0x50u, // REBASE_OPCODE_DO_REBASE_IMM_TIMES
-      RebaseOpcodeDoRebaseULEBTimes             = 0x60u, // REBASE_OPCODE_DO_REBASE_ULEB_TIMES
-      RebaseOpcodeDoRebaseAddAddressULEB        = 0x70u, // REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB
-      RebaseOpcodeDoRebaseULEBTimesSkippingULEB = 0x80u, // REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB
-
-
-      BindTypePointer           = 1u, // BIND_TYPE_POINTER
-      BindTypeTextAbsolute32    = 2u, // BIND_TYPE_TEXT_ABSOLUTE32
-      BindTypeTextPCRelative32  = 3u, // BIND_TYPE_TEXT_PCREL32
-
-      BindSpecialDylibSelf            =  0u, // BIND_SPECIAL_DYLIB_SELF
-      BindSpecialDylibMainExecutable  = -1u, // BIND_SPECIAL_DYLIB_MAIN_EXECUTABLE
-      BindSpecialDylibFlatLookup      = -2u, // BIND_SPECIAL_DYLIB_FLAT_LOOKUP
-
-      BindSymbolFlagsWeakImport         = 0x1u, // BIND_SYMBOL_FLAGS_WEAK_IMPORT
-      BindSymbolFlagsNonWeakDefinition  = 0x8u, // BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION
-
-      BindOpcodeMask                            = 0xF0u, // BIND_OPCODE_MASK
-      BindImmediateMask                         = 0x0Fu, // BIND_IMMEDIATE_MASK
-      BindOpcodeDone                            = 0x00u, // BIND_OPCODE_DONE
-      BindOpcodeSetDylibOrdinalImmediate        = 0x10u, // BIND_OPCODE_SET_DYLIB_ORDINAL_IMM
-      BindOpcodeSetDylibOrdinalULEB             = 0x20u, // BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB
-      BindOpcodeSetDylibSpecialImmediate        = 0x30u, // BIND_OPCODE_SET_DYLIB_SPECIAL_IMM
-      BindOpcodeSetSymbolTrailingFlagsImmediate = 0x40u, // BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM
-      BindOpcodeSetTypeImmediate                = 0x50u, // BIND_OPCODE_SET_TYPE_IMM
-      BindOpcodeSetAppendSLEB                   = 0x60u, // BIND_OPCODE_SET_ADDEND_SLEB
-      BindOpcodeSetSegmentAndOffsetULEB         = 0x70u, // BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB
-      BindOpcodeAddAddressULEB                  = 0x80u, // BIND_OPCODE_ADD_ADDR_ULEB
-      BindOpcodeDoBind                          = 0x90u, // BIND_OPCODE_DO_BIND
-      BindOpcodeDoBindAddAddressULEB            = 0xA0u, // BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB
-      BindOpcodeDoBindAddAddressImmediateScaled = 0xB0u, // BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED
-      BindOpcodeDoBindULEBTimesSkippingULEB     = 0xC0u, // BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB
-
-      ExportSymbolFlagsKindMask           = 0x03u, // EXPORT_SYMBOL_FLAGS_KIND_MASK
-      ExportSymbolFlagsKindRegular        = 0x00u, // EXPORT_SYMBOL_FLAGS_KIND_REGULAR
-      ExportSymbolFlagsKindThreadLocal    = 0x01u, // EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL
-      ExportSymbolFlagsWeakDefinition     = 0x04u, // EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION
-      ExportSymbolFlagsIndirectDefinition = 0x08u, // EXPORT_SYMBOL_FLAGS_INDIRECT_DEFINITION
-      ExportSymbolFlagsHasSpecializations = 0x10u, // EXPORT_SYMBOL_FLAGS_HAS_SPECIALIZATIONS
+      S_ATTR_SOME_INSTRUCTIONS   = 0x00000400u,
+      S_ATTR_EXT_RELOC           = 0x00000200u,
+      S_ATTR_LOC_RELOC           = 0x00000100u,
+
+      // Constant masks for the value of an indirect symbol in an indirect
+      // symbol table
+      INDIRECT_SYMBOL_LOCAL = 0x80000000u,
+      INDIRECT_SYMBOL_ABS   = 0x40000000u
+    };
+
+    enum DataRegionType {
+      // Constants for the "kind" field in a data_in_code_entry structure
+      DICE_KIND_DATA             = 1u,
+      DICE_KIND_JUMP_TABLE8      = 2u,
+      DICE_KIND_JUMP_TABLE16     = 3u,
+      DICE_KIND_JUMP_TABLE32     = 4u,
+      DICE_KIND_ABS_JUMP_TABLE32 = 5u
+    };
+
+    enum RebaseType {
+      REBASE_TYPE_POINTER         = 1u,
+      REBASE_TYPE_TEXT_ABSOLUTE32 = 2u,
+      REBASE_TYPE_TEXT_PCREL32    = 3u
+    };
+
+    enum {
+      REBASE_OPCODE_MASK    = 0xF0u,
+      REBASE_IMMEDIATE_MASK = 0x0Fu
+    };
+
+    enum RebaseOpcode {
+      REBASE_OPCODE_DONE                               = 0x00u,
+      REBASE_OPCODE_SET_TYPE_IMM                       = 0x10u,
+      REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB        = 0x20u,
+      REBASE_OPCODE_ADD_ADDR_ULEB                      = 0x30u,
+      REBASE_OPCODE_ADD_ADDR_IMM_SCALED                = 0x40u,
+      REBASE_OPCODE_DO_REBASE_IMM_TIMES                = 0x50u,
+      REBASE_OPCODE_DO_REBASE_ULEB_TIMES               = 0x60u,
+      REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB            = 0x70u,
+      REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB = 0x80u
+    };
+
+    enum BindType {
+      BIND_TYPE_POINTER         = 1u,
+      BIND_TYPE_TEXT_ABSOLUTE32 = 2u,
+      BIND_TYPE_TEXT_PCREL32    = 3u
+    };
 
+    enum BindSpecialDylib {
+      BIND_SPECIAL_DYLIB_SELF            =  0,
+      BIND_SPECIAL_DYLIB_MAIN_EXECUTABLE = -1,
+      BIND_SPECIAL_DYLIB_FLAT_LOOKUP     = -2
+    };
+
+    enum {
+      BIND_SYMBOL_FLAGS_WEAK_IMPORT         = 0x1u,
+      BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION = 0x8u,
 
+      BIND_OPCODE_MASK                      = 0xF0u,
+      BIND_IMMEDIATE_MASK                   = 0x0Fu
+    };
+
+    enum BindOpcode {
+      BIND_OPCODE_DONE                             = 0x00u,
+      BIND_OPCODE_SET_DYLIB_ORDINAL_IMM            = 0x10u,
+      BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB           = 0x20u,
+      BIND_OPCODE_SET_DYLIB_SPECIAL_IMM            = 0x30u,
+      BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM    = 0x40u,
+      BIND_OPCODE_SET_TYPE_IMM                     = 0x50u,
+      BIND_OPCODE_SET_ADDEND_SLEB                  = 0x60u,
+      BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB      = 0x70u,
+      BIND_OPCODE_ADD_ADDR_ULEB                    = 0x80u,
+      BIND_OPCODE_DO_BIND                          = 0x90u,
+      BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB            = 0xA0u,
+      BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED      = 0xB0u,
+      BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB = 0xC0u
+    };
+
+    enum {
+      EXPORT_SYMBOL_FLAGS_KIND_MASK           = 0x03u,
+      EXPORT_SYMBOL_FLAGS_WEAK_DEFINITION     = 0x04u,
+      EXPORT_SYMBOL_FLAGS_REEXPORT            = 0x08u,
+      EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER   = 0x10u
+    };
+
+    enum ExportSymbolKind {
+      EXPORT_SYMBOL_FLAGS_KIND_REGULAR        = 0x00u,
+      EXPORT_SYMBOL_FLAGS_KIND_THREAD_LOCAL   = 0x01u
+    };
+
+
+    enum {
       // Constant masks for the "n_type" field in llvm::MachO::nlist and
       // llvm::MachO::nlist_64
-      NlistMaskStab             = 0xe0, // N_STAB
-      NlistMaskPrivateExternal  = 0x10, // N_PEXT
-      NlistMaskType             = 0x0e, // N_TYPE
-      NlistMaskExternal         = 0x01, // N_EXT
+      N_STAB = 0xe0,
+      N_PEXT = 0x10,
+      N_TYPE = 0x0e,
+      N_EXT  = 0x01
+    };
 
+    enum NListType {
       // Constants for the "n_type & N_TYPE" llvm::MachO::nlist and
       // llvm::MachO::nlist_64
-      NListTypeUndefined          = 0x0u, // N_UNDF
-      NListTypeAbsolute           = 0x2u, // N_ABS
-      NListTypeSection            = 0xeu, // N_SECT
-      NListTypePreboundUndefined  = 0xcu, // N_PBUD
-      NListTypeIndirect           = 0xau, // N_INDR
+      N_UNDF = 0x0u,
+      N_ABS  = 0x2u,
+      N_SECT = 0xeu,
+      N_PBUD = 0xcu,
+      N_INDR = 0xau
+    };
 
-      // Constant masks for the "n_sect" field in llvm::MachO::nlist and
+    enum SectionOrdinal {
+      // Constants for the "n_sect" field in llvm::MachO::nlist and
       // llvm::MachO::nlist_64
-      NListSectionNoSection     = 0u, // NO_SECT
-      NListSectionMaxSection    = 0xffu, // MAX_SECT
+      NO_SECT  = 0u,
+      MAX_SECT = 0xffu
+    };
 
-      NListDescWeakRef          = 0x40u,
-      NListDescWeakDef          = 0x80u,
+    enum {
+      // Constant masks for the "n_desc" field in llvm::MachO::nlist and
+      // llvm::MachO::nlist_64
+      N_ARM_THUMB_DEF   = 0x0008u,
+      N_NO_DEAD_STRIP   = 0x0020u,
+      N_WEAK_REF        = 0x0040u,
+      N_WEAK_DEF        = 0x0080u,
+      N_SYMBOL_RESOLVER = 0x0100u
+    };
 
+    enum StabType {
       // Constant values for the "n_type" field in llvm::MachO::nlist and
       // llvm::MachO::nlist_64 when "(n_type & NlistMaskStab) != 0"
-      StabGlobalSymbol          = 0x20u,  // N_GSYM
-      StabFunctionName          = 0x22u,  // N_FNAME
-      StabFunction              = 0x24u,  // N_FUN
-      StabStaticSymbol          = 0x26u,  // N_STSYM
-      StabLocalCommon           = 0x28u,  // N_LCSYM
-      StabBeginSymbol           = 0x2Eu,  // N_BNSYM
-      StabSourceFileOptions     = 0x3Cu,  // N_OPT
-      StabRegisterSymbol        = 0x40u,  // N_RSYM
-      StabSourceLine            = 0x44u,  // N_SLINE
-      StabEndSymbol             = 0x4Eu,  // N_ENSYM
-      StabStructureType         = 0x60u,  // N_SSYM
-      StabSourceFileName        = 0x64u,  // N_SO
-      StabObjectFileName        = 0x66u,  // N_OSO
-      StabLocalSymbol           = 0x80u,  // N_LSYM
-      StabBeginIncludeFileName  = 0x82u,  // N_BINCL
-      StabIncludeFileName       = 0x84u,  // N_SOL
-      StabCompilerParameters    = 0x86u,  // N_PARAMS
-      StabCompilerVersion       = 0x88u,  // N_VERSION
-      StabCompilerOptLevel      = 0x8Au,  // N_OLEVEL
-      StabParameter             = 0xA0u,  // N_PSYM
-      StabEndIncludeFile        = 0xA2u,  // N_EINCL
-      StabAlternateEntry        = 0xA4u,  // N_ENTRY
-      StabLeftBracket           = 0xC0u,  // N_LBRAC
-      StabDeletedIncludeFile    = 0xC2u,  // N_EXCL
-      StabRightBracket          = 0xE0u,  // N_RBRAC
-      StabBeginCommon           = 0xE2u,  // N_BCOMM
-      StabEndCommon             = 0xE4u,  // N_ECOMM
-      StabEndCommonLocal        = 0xE8u,  // N_ECOML
-      StabLength                = 0xFEu   // N_LENG
-
+      N_GSYM    = 0x20u,
+      N_FNAME   = 0x22u,
+      N_FUN     = 0x24u,
+      N_STSYM   = 0x26u,
+      N_LCSYM   = 0x28u,
+      N_BNSYM   = 0x2Eu,
+      N_OPT     = 0x3Cu,
+      N_RSYM    = 0x40u,
+      N_SLINE   = 0x44u,
+      N_ENSYM   = 0x4Eu,
+      N_SSYM    = 0x60u,
+      N_SO      = 0x64u,
+      N_OSO     = 0x66u,
+      N_LSYM    = 0x80u,
+      N_BINCL   = 0x82u,
+      N_SOL     = 0x84u,
+      N_PARAMS  = 0x86u,
+      N_VERSION = 0x88u,
+      N_OLEVEL  = 0x8Au,
+      N_PSYM    = 0xA0u,
+      N_EINCL   = 0xA2u,
+      N_ENTRY   = 0xA4u,
+      N_LBRAC   = 0xC0u,
+      N_EXCL    = 0xC2u,
+      N_RBRAC   = 0xE0u,
+      N_BCOMM   = 0xE2u,
+      N_ECOMM   = 0xE4u,
+      N_ECOML   = 0xE8u,
+      N_LENG    = 0xFEu
+    };
+
+    enum LLVM_ENUM_INT_TYPE(uint32_t) {
+      // Constant values for the r_symbolnum field in an
+      // llvm::MachO::relocation_info structure when r_extern is 0.
+      R_ABS = 0,
+
+      // Constant bits for the r_address field in an
+      // llvm::MachO::relocation_info structure.
+      R_SCATTERED = 0x80000000
+    };
+
+    enum RelocationInfoType {
+      // Constant values for the r_type field in an
+      // llvm::MachO::relocation_info or llvm::MachO::scattered_relocation_info
+      // structure.
+      GENERIC_RELOC_VANILLA        = 0,
+      GENERIC_RELOC_PAIR           = 1,
+      GENERIC_RELOC_SECTDIFF       = 2,
+      GENERIC_RELOC_PB_LA_PTR      = 3,
+      GENERIC_RELOC_LOCAL_SECTDIFF = 4,
+      GENERIC_RELOC_TLV            = 5,
+
+      // Constant values for the r_type field in a PowerPC architecture
+      // llvm::MachO::relocation_info or llvm::MachO::scattered_relocation_info
+      // structure.
+      PPC_RELOC_VANILLA            = GENERIC_RELOC_VANILLA,
+      PPC_RELOC_PAIR               = GENERIC_RELOC_PAIR,
+      PPC_RELOC_BR14               = 2,
+      PPC_RELOC_BR24               = 3,
+      PPC_RELOC_HI16               = 4,
+      PPC_RELOC_LO16               = 5,
+      PPC_RELOC_HA16               = 6,
+      PPC_RELOC_LO14               = 7,
+      PPC_RELOC_SECTDIFF           = 8,
+      PPC_RELOC_PB_LA_PTR          = 9,
+      PPC_RELOC_HI16_SECTDIFF      = 10,
+      PPC_RELOC_LO16_SECTDIFF      = 11,
+      PPC_RELOC_HA16_SECTDIFF      = 12,
+      PPC_RELOC_JBSR               = 13,
+      PPC_RELOC_LO14_SECTDIFF      = 14,
+      PPC_RELOC_LOCAL_SECTDIFF     = 15,
+
+      // Constant values for the r_type field in an ARM architecture
+      // llvm::MachO::relocation_info or llvm::MachO::scattered_relocation_info
+      // structure.
+      ARM_RELOC_VANILLA            = GENERIC_RELOC_VANILLA,
+      ARM_RELOC_PAIR               = GENERIC_RELOC_PAIR,
+      ARM_RELOC_SECTDIFF           = GENERIC_RELOC_SECTDIFF,
+      ARM_RELOC_LOCAL_SECTDIFF     = 3,
+      ARM_RELOC_PB_LA_PTR          = 4,
+      ARM_RELOC_BR24               = 5,
+      ARM_THUMB_RELOC_BR22         = 6,
+      ARM_THUMB_32BIT_BRANCH       = 7, // obsolete
+      ARM_RELOC_HALF               = 8,
+      ARM_RELOC_HALF_SECTDIFF      = 9,
+
+      // Constant values for the r_type field in an x86_64 architecture
+      // llvm::MachO::relocation_info or llvm::MachO::scattered_relocation_info
+      // structure
+      X86_64_RELOC_UNSIGNED        = 0,
+      X86_64_RELOC_SIGNED          = 1,
+      X86_64_RELOC_BRANCH          = 2,
+      X86_64_RELOC_GOT_LOAD        = 3,
+      X86_64_RELOC_GOT             = 4,
+      X86_64_RELOC_SUBTRACTOR      = 5,
+      X86_64_RELOC_SIGNED_1        = 6,
+      X86_64_RELOC_SIGNED_2        = 7,
+      X86_64_RELOC_SIGNED_4        = 8,
+      X86_64_RELOC_TLV             = 9
+    };
+
+    // Values for segment_command.initprot.
+    // From <mach/vm_prot.h>
+    enum {
+      VM_PROT_READ    = 0x1,
+      VM_PROT_WRITE   = 0x2,
+      VM_PROT_EXECUTE = 0x4
     };
 
+
     // Structs from <mach-o/loader.h>
 
     struct mach_header {
@@ -572,6 +718,18 @@ namespace llvm {
       uint32_t datasize;
     };
 
+    struct data_in_code_entry {
+      uint32_t offset;
+      uint16_t length;
+      uint16_t kind;
+    };
+
+    struct source_version_command {
+      uint32_t cmd;
+      uint32_t cmdsize;
+      uint64_t version;
+    };
+
     struct encryption_info_command {
       uint32_t cmd;
       uint32_t cmdsize;
@@ -602,6 +760,12 @@ namespace llvm {
       uint32_t export_size;
     };
 
+    struct linker_options_command {
+      uint32_t cmd;
+      uint32_t cmdsize;
+      uint32_t count;
+    };
+
     struct symseg_command {
       uint32_t cmd;
       uint32_t cmdsize;
@@ -621,6 +785,31 @@ namespace llvm {
       uint32_t header_addr;
     };
 
+    struct tlv_descriptor_32 {
+      uint32_t thunk;
+      uint32_t key;
+      uint32_t offset;
+    };
+
+    struct tlv_descriptor_64 {
+      uint64_t thunk;
+      uint64_t key;
+      uint64_t offset;
+    };
+
+    struct tlv_descriptor {
+      uintptr_t thunk;
+      uintptr_t key;
+      uintptr_t offset;
+    };
+
+    struct entry_point_command {
+      uint32_t cmd;
+      uint32_t cmdsize;
+      uint64_t entryoff;
+      uint64_t stacksize;
+    };
+
 
     // Structs from <mach-o/fat.h>
     struct fat_header {
@@ -636,7 +825,39 @@ namespace llvm {
       uint32_t align;
     };
 
-    // Structs from <mach-o/fat.h>
+    // Structs from <mach-o/reloc.h>
+    struct relocation_info {
+      int32_t r_address;
+      uint32_t r_symbolnum:24,
+               r_pcrel:1,
+               r_length:2,
+               r_extern:1,
+               r_type:4;
+    };
+
+    struct scattered_relocation_info {
+#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)
+      uint32_t r_scattered:1,
+               r_pcrel:1,
+               r_length:2,
+               r_type:4,
+               r_address:24;
+#else
+      uint32_t r_address:24,
+               r_type:4,
+               r_length:2,
+               r_pcrel:1,
+               r_scattered:1;
+#endif
+      int32_t r_value;
+    };
+
+    // Structs NOT from <mach-o/reloc.h>, but that make LLVM's life easier
+    struct any_relocation_info {
+      uint32_t r_word0, r_word1;
+    };
+
+    // Structs from <mach-o/nlist.h>
     struct nlist {
       uint32_t n_strx;
       uint8_t n_type;
@@ -655,58 +876,132 @@ namespace llvm {
 
     // Get/Set functions from <mach-o/nlist.h>
 
-    static inline uint16_t GET_LIBRARY_ORDINAL(uint16_t n_desc)
-    {
+    static inline uint16_t GET_LIBRARY_ORDINAL(uint16_t n_desc) {
       return (((n_desc) >> 8u) & 0xffu);
     }
 
-    static inline void SET_LIBRARY_ORDINAL(uint16_t &n_desc, uint8_t ordinal)
-    {
+    static inline void SET_LIBRARY_ORDINAL(uint16_t &n_desc, uint8_t ordinal) {
       n_desc = (((n_desc) & 0x00ff) | (((ordinal) & 0xff) << 8));
     }
 
-    static inline uint8_t GET_COMM_ALIGN (uint16_t n_desc)
-    {
+    static inline uint8_t GET_COMM_ALIGN (uint16_t n_desc) {
       return (n_desc >> 8u) & 0x0fu;
     }
 
-    static inline void SET_COMM_ALIGN (uint16_t &n_desc, uint8_t align)
-    {
+    static inline void SET_COMM_ALIGN (uint16_t &n_desc, uint8_t align) {
       n_desc = ((n_desc & 0xf0ffu) | ((align & 0x0fu) << 8u));
     }
 
     // Enums from <mach/machine.h>
-    enum {
+    enum LLVM_ENUM_INT_TYPE(uint32_t) {
       // Capability bits used in the definition of cpu_type.
-      CPUArchMask = 0xff000000,   // Mask for architecture bits
-      CPUArchABI64 = 0x01000000,  // 64 bit ABI
-
-      // Constants for the cputype field.
-      CPUTypeI386      = 7,
-      CPUTypeX86_64    = CPUTypeI386 | CPUArchABI64,
-      CPUTypeARM       = 12,
-      CPUTypeSPARC     = 14,
-      CPUTypePowerPC   = 18,
-      CPUTypePowerPC64 = CPUTypePowerPC | CPUArchABI64,
-
-
-      // Constants for the cpusubtype field.
-
-      // X86
-      CPUSubType_I386_ALL    = 3,
-      CPUSubType_X86_64_ALL  = 3,
-
-      // ARM
-      CPUSubType_ARM_ALL     = 0,
-      CPUSubType_ARM_V4T     = 5,
-      CPUSubType_ARM_V5      = 7,
-      CPUSubType_ARM_V6      = 6,
-      CPUSubType_ARM_V7      = 9,
-
-      // PowerPC
-      CPUSubType_POWERPC_ALL = 0,
-
-      CPUSubType_SPARC_ALL   = 0
+      CPU_ARCH_MASK  = 0xff000000,   // Mask for architecture bits
+      CPU_ARCH_ABI64 = 0x01000000    // 64 bit ABI
+    };
+
+    // Constants for the cputype field.
+    enum CPUType {
+      CPU_TYPE_ANY       = -1,
+      CPU_TYPE_X86       = 7,
+      CPU_TYPE_I386      = CPU_TYPE_X86,
+      CPU_TYPE_X86_64    = CPU_TYPE_X86 | CPU_ARCH_ABI64,
+   /* CPU_TYPE_MIPS      = 8, */
+      CPU_TYPE_MC98000   = 10, // Old Motorola PowerPC
+      CPU_TYPE_ARM       = 12,
+      CPU_TYPE_SPARC     = 14,
+      CPU_TYPE_POWERPC   = 18,
+      CPU_TYPE_POWERPC64 = CPU_TYPE_POWERPC | CPU_ARCH_ABI64
+    };
+
+    enum LLVM_ENUM_INT_TYPE(uint32_t) {
+      // Capability bits used in the definition of cpusubtype.
+      CPU_SUB_TYPE_MASK  = 0xff000000,   // Mask for architecture bits
+      CPU_SUB_TYPE_LIB64 = 0x80000000,   // 64 bit libraries
+
+      // Special CPU subtype constants.
+      CPU_SUBTYPE_MULTIPLE = ~0u
+    };
+
+    // Constants for the cpusubtype field.
+    enum CPUSubTypeX86 {
+      CPU_SUBTYPE_I386_ALL       = 3,
+      CPU_SUBTYPE_386            = 3,
+      CPU_SUBTYPE_486            = 4,
+      CPU_SUBTYPE_486SX          = 0x84,
+      CPU_SUBTYPE_586            = 5,
+      CPU_SUBTYPE_PENT           = CPU_SUBTYPE_586,
+      CPU_SUBTYPE_PENTPRO        = 0x16,
+      CPU_SUBTYPE_PENTII_M3      = 0x36,
+      CPU_SUBTYPE_PENTII_M5      = 0x56,
+      CPU_SUBTYPE_CELERON        = 0x67,
+      CPU_SUBTYPE_CELERON_MOBILE = 0x77,
+      CPU_SUBTYPE_PENTIUM_3      = 0x08,
+      CPU_SUBTYPE_PENTIUM_3_M    = 0x18,
+      CPU_SUBTYPE_PENTIUM_3_XEON = 0x28,
+      CPU_SUBTYPE_PENTIUM_M      = 0x09,
+      CPU_SUBTYPE_PENTIUM_4      = 0x0a,
+      CPU_SUBTYPE_PENTIUM_4_M    = 0x1a,
+      CPU_SUBTYPE_ITANIUM        = 0x0b,
+      CPU_SUBTYPE_ITANIUM_2      = 0x1b,
+      CPU_SUBTYPE_XEON           = 0x0c,
+      CPU_SUBTYPE_XEON_MP        = 0x1c,
+
+      CPU_SUBTYPE_X86_ALL     = 3,
+      CPU_SUBTYPE_X86_64_ALL  = 3,
+      CPU_SUBTYPE_X86_ARCH1   = 4,
+      CPU_SUBTYPE_X86_64_H    = 8
+    };
+    static inline int CPU_SUBTYPE_INTEL(int Family, int Model) {
+      return Family | (Model << 4);
+    }
+    static inline int CPU_SUBTYPE_INTEL_FAMILY(CPUSubTypeX86 ST) {
+      return ((int)ST) & 0x0f;
+    }
+    static inline int CPU_SUBTYPE_INTEL_MODEL(CPUSubTypeX86 ST) {
+      return ((int)ST) >> 4;
+    }
+    enum {
+      CPU_SUBTYPE_INTEL_FAMILY_MAX = 15,
+      CPU_SUBTYPE_INTEL_MODEL_ALL  = 0
+    };
+
+    enum CPUSubTypeARM {
+      CPU_SUBTYPE_ARM_ALL     = 0,
+      CPU_SUBTYPE_ARM_V4T     = 5,
+      CPU_SUBTYPE_ARM_V6      = 6,
+      CPU_SUBTYPE_ARM_V5      = 7,
+      CPU_SUBTYPE_ARM_V5TEJ   = 7,
+      CPU_SUBTYPE_ARM_XSCALE  = 8,
+      CPU_SUBTYPE_ARM_V7      = 9,
+      CPU_SUBTYPE_ARM_V7F     = 10,
+      CPU_SUBTYPE_ARM_V7S     = 11,
+      CPU_SUBTYPE_ARM_V7K     = 12,
+      CPU_SUBTYPE_ARM_V6M     = 14,
+      CPU_SUBTYPE_ARM_V7M     = 15,
+      CPU_SUBTYPE_ARM_V7EM    = 16
+    };
+
+    enum CPUSubTypeSPARC {
+      CPU_SUBTYPE_SPARC_ALL   = 0
+    };
+
+    enum CPUSubTypePowerPC {
+      CPU_SUBTYPE_POWERPC_ALL   = 0,
+      CPU_SUBTYPE_POWERPC_601   = 1,
+      CPU_SUBTYPE_POWERPC_602   = 2,
+      CPU_SUBTYPE_POWERPC_603   = 3,
+      CPU_SUBTYPE_POWERPC_603e  = 4,
+      CPU_SUBTYPE_POWERPC_603ev = 5,
+      CPU_SUBTYPE_POWERPC_604   = 6,
+      CPU_SUBTYPE_POWERPC_604e  = 7,
+      CPU_SUBTYPE_POWERPC_620   = 8,
+      CPU_SUBTYPE_POWERPC_750   = 9,
+      CPU_SUBTYPE_POWERPC_7400  = 10,
+      CPU_SUBTYPE_POWERPC_7450  = 11,
+      CPU_SUBTYPE_POWERPC_970   = 100,
+
+      CPU_SUBTYPE_MC980000_ALL  = CPU_SUBTYPE_POWERPC_ALL,
+      CPU_SUBTYPE_MC98601       = CPU_SUBTYPE_POWERPC_601
     };
   } // end namespace MachO
 } // end namespace llvm
diff --git a/include/llvm/Support/ManagedStatic.h b/include/llvm/Support/ManagedStatic.h
index 4171d1b..5587618 100644
--- a/include/llvm/Support/ManagedStatic.h
+++ b/include/llvm/Support/ManagedStatic.h
@@ -99,7 +99,6 @@ public:
 /// llvm_shutdown - Deallocate and destroy all ManagedStatic variables.
 void llvm_shutdown();
 
-
 /// llvm_shutdown_obj - This is a simple helper class that calls
 /// llvm_shutdown() when it is destroyed.
 struct llvm_shutdown_obj {
diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h
index 00c6ad7..ff41608 100644
--- a/include/llvm/Support/MathExtras.h
+++ b/include/llvm/Support/MathExtras.h
@@ -21,7 +21,8 @@
 #include <cstring>
 
 #ifdef _MSC_VER
-# include <intrin.h>
+#include <intrin.h>
+#include <limits>
 #endif
 
 namespace llvm {
@@ -603,6 +604,13 @@ inline int64_t SignExtend64(uint64_t X, unsigned B) {
   return int64_t(X << (64 - B)) >> (64 - B);
 }
 
+#if defined(_MSC_VER)
+  // Visual Studio defines the HUGE_VAL class of macros using purposeful
+  // constant arithmetic overflow, which it then warns on when encountered.
+  const float huge_valf = std::numeric_limits<float>::infinity();
+#else
+  const float huge_valf = HUGE_VALF;
+#endif
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index 4f28da4..ff22fb6 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_SUPPORT_MEMORYBUFFER_H
 #define LLVM_SUPPORT_MEMORYBUFFER_H
 
-#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Support/CBindingWrapping.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
@@ -66,11 +66,7 @@ public:
   /// MemoryBuffer if successful, otherwise returning null.  If FileSize is
   /// specified, this means that the client knows that the file exists and that
   /// it has the specified size.
-  static error_code getFile(StringRef Filename, OwningPtr<MemoryBuffer> &result,
-                            int64_t FileSize = -1,
-                            bool RequiresNullTerminator = true);
-  static error_code getFile(const char *Filename,
-                            OwningPtr<MemoryBuffer> &result,
+  static error_code getFile(Twine Filename, OwningPtr<MemoryBuffer> &result,
                             int64_t FileSize = -1,
                             bool RequiresNullTerminator = true);
 
diff --git a/include/llvm/Support/PassNameParser.h b/include/llvm/Support/PassNameParser.h
index a73dc8f..c0914b1 100644
--- a/include/llvm/Support/PassNameParser.h
+++ b/include/llvm/Support/PassNameParser.h
@@ -86,10 +86,9 @@ public:
 
 private:
   // ValLessThan - Provide a sorting comparator for Values elements...
-  static int ValLessThan(const void *VT1, const void *VT2) {
-    typedef PassNameParser::OptionInfo ValType;
-    return std::strcmp(static_cast<const ValType *>(VT1)->Name,
-                       static_cast<const ValType *>(VT2)->Name);
+  static int ValLessThan(const PassNameParser::OptionInfo *VT1,
+                         const PassNameParser::OptionInfo *VT2) {
+    return std::strcmp(VT1->Name, VT2->Name);
   }
 };
 
diff --git a/include/llvm/Support/Path.h b/include/llvm/Support/Path.h
index f9a65e5..b2afe1b 100644
--- a/include/llvm/Support/Path.h
+++ b/include/llvm/Support/Path.h
@@ -173,6 +173,13 @@ void append(SmallVectorImpl<char> &path,
 /// @param result Holds the result of the transformation.
 void native(const Twine &path, SmallVectorImpl<char> &result);
 
+/// Convert path to the native form in place. This is used to give paths to
+/// users and operating system calls in the platform's normal way. For example,
+/// on Windows all '/' are converted to '\'.
+///
+/// @param path A path that is transformed to native format.
+void native(SmallVectorImpl<char> &path);
+
 /// @}
 /// @name Lexical Observers
 /// @{
diff --git a/include/llvm/Support/PatternMatch.h b/include/llvm/Support/PatternMatch.h
index b1732b2..240bb81 100644
--- a/include/llvm/Support/PatternMatch.h
+++ b/include/llvm/Support/PatternMatch.h
@@ -1041,7 +1041,7 @@ inline Argument_match<Opnd_t> m_Argument(const Opnd_t &Op) {
 /// Intrinsic matchers.
 struct IntrinsicID_match {
   unsigned ID;
-  IntrinsicID_match(unsigned IntrID) : ID(IntrID) { }
+  IntrinsicID_match(Intrinsic::ID IntrID) : ID(IntrID) { }
 
   template<typename OpTy>
   bool match(OpTy *V) {
@@ -1080,29 +1080,29 @@ struct m_Intrinsic_Ty<T0, T1, T2, T3> {
 
 /// Match intrinsic calls like this:
 ///   m_Intrinsic<Intrinsic::fabs>(m_Value(X))
-template <unsigned IntrID>
+template <Intrinsic::ID IntrID>
 inline IntrinsicID_match
 m_Intrinsic() { return IntrinsicID_match(IntrID); }
 
-template<unsigned IntrID, typename T0>
+template<Intrinsic::ID IntrID, typename T0>
 inline typename m_Intrinsic_Ty<T0>::Ty
 m_Intrinsic(const T0 &Op0) {
   return m_CombineAnd(m_Intrinsic<IntrID>(), m_Argument<0>(Op0));
 }
 
-template<unsigned IntrID, typename T0, typename T1>
+template<Intrinsic::ID IntrID, typename T0, typename T1>
 inline typename m_Intrinsic_Ty<T0, T1>::Ty
 m_Intrinsic(const T0 &Op0, const T1 &Op1) {
   return m_CombineAnd(m_Intrinsic<IntrID>(Op0), m_Argument<1>(Op1));
 }
 
-template<unsigned IntrID, typename T0, typename T1, typename T2>
+template<Intrinsic::ID IntrID, typename T0, typename T1, typename T2>
 inline typename m_Intrinsic_Ty<T0, T1, T2>::Ty
 m_Intrinsic(const T0 &Op0, const T1 &Op1, const T2 &Op2) {
   return m_CombineAnd(m_Intrinsic<IntrID>(Op0, Op1), m_Argument<2>(Op2));
 }
 
-template<unsigned IntrID, typename T0, typename T1, typename T2, typename T3>
+template<Intrinsic::ID IntrID, typename T0, typename T1, typename T2, typename T3>
 inline typename m_Intrinsic_Ty<T0, T1, T2, T3>::Ty
 m_Intrinsic(const T0 &Op0, const T1 &Op1, const T2 &Op2, const T3 &Op3) {
   return m_CombineAnd(m_Intrinsic<IntrID>(Op0, Op1, Op2), m_Argument<3>(Op3));
diff --git a/include/llvm/Support/PrettyStackTrace.h b/include/llvm/Support/PrettyStackTrace.h
index 2122e06..4f68fca 100644
--- a/include/llvm/Support/PrettyStackTrace.h
+++ b/include/llvm/Support/PrettyStackTrace.h
@@ -21,11 +21,7 @@
 namespace llvm {
   class raw_ostream;
 
-  /// DisablePrettyStackTrace - Set this to true to disable this module. This
-  /// might be necessary if the host application installs its own signal
-  /// handlers which conflict with the ones installed by this module.
-  /// Defaults to false.
-  extern bool DisablePrettyStackTrace;
+  void EnablePrettyStackTrace();
 
   /// PrettyStackTraceEntry - This class is used to represent a frame of the
   /// "pretty" stack trace that is dumped when a program crashes. You can define
@@ -64,7 +60,9 @@ namespace llvm {
     const char *const *ArgV;
   public:
     PrettyStackTraceProgram(int argc, const char * const*argv)
-      : ArgC(argc), ArgV(argv) {}
+      : ArgC(argc), ArgV(argv) {
+      EnablePrettyStackTrace();
+    }
     virtual void print(raw_ostream &OS) const LLVM_OVERRIDE;
   };
 
diff --git a/include/llvm/Support/Process.h b/include/llvm/Support/Process.h
index 0baf7b9..2172036 100644
--- a/include/llvm/Support/Process.h
+++ b/include/llvm/Support/Process.h
@@ -25,11 +25,17 @@
 #ifndef LLVM_SUPPORT_PROCESS_H
 #define LLVM_SUPPORT_PROCESS_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/system_error.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/TimeValue.h"
 
 namespace llvm {
+class StringRef;
+
 namespace sys {
 
 class self_process;
@@ -155,22 +161,24 @@ public:
   static void GetTimeUsage(TimeValue &elapsed, TimeValue &user_time,
                            TimeValue &sys_time);
 
-  /// This static function will return the process' current user id number.
-  /// Not all operating systems support this feature. Where it is not
-  /// supported, the function should return 65536 as the value.
-  static int GetCurrentUserId();
-
-  /// This static function will return the process' current group id number.
-  /// Not all operating systems support this feature. Where it is not
-  /// supported, the function should return 65536 as the value.
-  static int GetCurrentGroupId();
-
   /// This function makes the necessary calls to the operating system to
   /// prevent core files or any other kind of large memory dumps that can
   /// occur when a program fails.
   /// @brief Prevent core file generation.
   static void PreventCoreFiles();
 
+  // This function returns the environment variable \arg name's value as a UTF-8
+  // string. \arg Name is assumed to be in UTF-8 encoding too.
+  static Optional<std::string> GetEnv(StringRef name);
+
+  /// This function returns a SmallVector containing the arguments passed from
+  /// the operating system to the program.  This function expects to be handed
+  /// the vector passed in from main.
+  static error_code
+  GetArgumentVector(SmallVectorImpl<const char *> &Args,
+                    ArrayRef<const char *> ArgsFromMain,
+                    SpecificBumpPtrAllocator<char> &ArgAllocator);
+
   /// This function determines if the standard input is connected directly
   /// to a user's input (keyboard probably), rather than coming from a file
   /// or pipe.
@@ -219,6 +227,12 @@ public:
   /// terminal, this function returns false.
   static bool StandardErrHasColors();
 
+  /// Enables or disables whether ANSI escape sequences are used to output
+  /// colors. This only has an effect on Windows.
+  /// Note: Setting this option is not thread-safe and should only be done
+  /// during initialization.
+  static void UseANSIEscapeCodes(bool enable);
+
   /// Whether changing colors requires the output to be flushed.
   /// This is needed on systems that don't support escape sequences for
   /// changing colors.
diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h
index 5134351..00571a4 100644
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h
@@ -30,6 +30,28 @@ namespace sys {
   const char EnvPathSeparator = ';';
 #endif
 
+/// @brief This struct encapsulates information about a process.
+struct ProcessInfo {
+#if defined(LLVM_ON_UNIX)
+  typedef pid_t ProcessId;
+#elif defined(LLVM_ON_WIN32)
+  typedef unsigned long ProcessId; // Must match the type of DWORD on Windows.
+  typedef void * HANDLE; // Must match the type of HANDLE on Windows.
+  /// The handle to the process (available on Windows only).
+  HANDLE ProcessHandle;
+#else
+#error "ProcessInfo is not defined for this platform!"
+#endif
+
+  /// The process identifier.
+  ProcessId Pid;
+
+  /// The return code, set after execution.
+  int ReturnCode;
+
+  ProcessInfo();
+};
+
   /// This static constructor (factory) will attempt to locate a program in
   /// the operating system's file system using some pre-determined set of
   /// locations to search (e.g. the PATH on Unix). Paths with slashes are
@@ -87,15 +109,41 @@ namespace sys {
       ///< program.
       bool *ExecutionFailed = 0);
 
-  /// Similar to ExecuteAndWait, but return immediately.
-  void ExecuteNoWait(StringRef Program, const char **args, const char **env = 0,
-                     const StringRef **redirects = 0, unsigned memoryLimit = 0,
-                     std::string *ErrMsg = 0);
+  /// Similar to ExecuteAndWait, but returns immediately.
+  /// @returns The \see ProcessInfo of the newly launced process.
+  /// \note On Microsoft Windows systems, users will need to either call \see
+  /// Wait until the process finished execution or win32 CloseHandle() API on
+  /// ProcessInfo.ProcessHandle to avoid memory leaks.
+  ProcessInfo
+  ExecuteNoWait(StringRef Program, const char **args, const char **env = 0,
+                const StringRef **redirects = 0, unsigned memoryLimit = 0,
+                std::string *ErrMsg = 0, bool *ExecutionFailed = 0);
 
-  // Return true if the given arguments fit within system-specific
-  // argument length limits.
+  /// Return true if the given arguments fit within system-specific
+  /// argument length limits.
   bool argumentsFitWithinSystemLimits(ArrayRef<const char*> Args);
-}
+
+  /// This function waits for the process specified by \p PI to finish.
+  /// \returns A \see ProcessInfo struct with Pid set to:
+  /// \li The process id of the child process if the child process has changed
+  /// state.
+  /// \li 0 if the child process has not changed state.
+  /// \note Users of this function should always check the ReturnCode member of
+  /// the \see ProcessInfo returned from this function.
+  ProcessInfo Wait(
+      const ProcessInfo &PI, ///< The child process that should be waited on.
+      unsigned SecondsToWait, ///< If non-zero, this specifies the amount of
+      ///< time to wait for the child process to exit. If the time expires, the
+      ///< child is killed and this function returns. If zero, this function
+      ///< will perform a non-blocking wait on the child process.
+      bool WaitUntilTerminates, ///< If true, ignores \p SecondsToWait and waits
+      ///< until child has terminated.
+      std::string *ErrMsg = 0 ///< If non-zero, provides a pointer to a string
+      ///< instance in which error messages will be returned. If the string
+      ///< is non-empty upon return an error occurred while invoking the
+      ///< program.
+      );
+  }
 }
 
 #endif
diff --git a/include/llvm/Support/RecyclingAllocator.h b/include/llvm/Support/RecyclingAllocator.h
index f67503f..001d1cf 100644
--- a/include/llvm/Support/RecyclingAllocator.h
+++ b/include/llvm/Support/RecyclingAllocator.h
@@ -60,9 +60,10 @@ public:
 }
 
 template<class AllocatorType, class T, size_t Size, size_t Align>
-inline void *operator new(size_t,
+inline void *operator new(size_t size,
                           llvm::RecyclingAllocator<AllocatorType,
                                                    T, Size, Align> &Allocator) {
+  assert(size <= Size && "allocation size exceeded");
   return Allocator.Allocate();
 }
 
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 5b33d42..dd48974 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -144,11 +144,17 @@ public:
   ///
   /// @param ShowColors - Display colored messages if output is a terminal and
   /// the default error handler is used.
-  void PrintMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg,
+  void PrintMessage(raw_ostream &OS, SMLoc Loc, DiagKind Kind,
+                    const Twine &Msg,
                     ArrayRef<SMRange> Ranges = None,
                     ArrayRef<SMFixIt> FixIts = None,
                     bool ShowColors = true) const;
 
+  /// Emits a diagnostic to llvm::errs().
+  void PrintMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg,
+                    ArrayRef<SMRange> Ranges = None,
+                    ArrayRef<SMFixIt> FixIts = None,
+                    bool ShowColors = true) const;
 
   /// GetMessage - Return an SMDiagnostic at the specified location with the
   /// specified string.
diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h
index 109b3a2..9ecee3b 100644
--- a/include/llvm/Support/TargetRegistry.h
+++ b/include/llvm/Support/TargetRegistry.h
@@ -46,18 +46,18 @@ namespace llvm {
   class MCRelocationInfo;
   class MCTargetAsmParser;
   class TargetMachine;
+  class MCTargetStreamer;
   class TargetOptions;
   class raw_ostream;
   class formatted_raw_ostream;
 
-  MCStreamer *createAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                                bool isVerboseAsm,
+  MCStreamer *createAsmStreamer(MCContext &Ctx,
+                                MCTargetStreamer *TargetStreamer,
+                                formatted_raw_ostream &OS, bool isVerboseAsm,
                                 bool useLoc, bool useCFI,
                                 bool useDwarfDirectory,
-                                MCInstPrinter *InstPrint,
-                                MCCodeEmitter *CE,
-                                MCAsmBackend *TAB,
-                                bool ShowInst);
+                                MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                                MCAsmBackend *TAB, bool ShowInst);
 
   MCRelocationInfo *createMCRelocationInfo(StringRef TT, MCContext &Ctx);
 
@@ -104,10 +104,12 @@ namespace llvm {
     typedef AsmPrinter *(*AsmPrinterCtorTy)(TargetMachine &TM,
                                             MCStreamer &Streamer);
     typedef MCAsmBackend *(*MCAsmBackendCtorTy)(const Target &T,
+                                                const MCRegisterInfo &MRI,
                                                 StringRef TT,
                                                 StringRef CPU);
     typedef MCTargetAsmParser *(*MCAsmParserCtorTy)(MCSubtargetInfo &STI,
-                                                    MCAsmParser &P);
+                                                    MCAsmParser &P,
+                                                    const MCInstrInfo &MII);
     typedef MCDisassembler *(*MCDisassemblerCtorTy)(const Target &T,
                                                     const MCSubtargetInfo &STI);
     typedef MCInstPrinter *(*MCInstPrinterCtorTy)(const Target &T,
@@ -234,9 +236,9 @@ namespace llvm {
     MCSymbolizerCtorTy MCSymbolizerCtorFn;
 
   public:
-    Target() : AsmStreamerCtorFn(llvm::createAsmStreamer),
-               MCRelocationInfoCtorFn(llvm::createMCRelocationInfo),
-               MCSymbolizerCtorFn(llvm::createMCSymbolizer) {}
+    Target()
+        : AsmStreamerCtorFn(0), MCRelocationInfoCtorFn(0),
+          MCSymbolizerCtorFn(0) {}
 
     /// @name Target Information
     /// @{
@@ -263,27 +265,6 @@ namespace llvm {
     /// hasMCAsmBackend - Check if this target supports .o generation.
     bool hasMCAsmBackend() const { return MCAsmBackendCtorFn != 0; }
 
-    /// hasAsmParser - Check if this target supports .s parsing.
-    bool hasMCAsmParser() const { return MCAsmParserCtorFn != 0; }
-
-    /// hasAsmPrinter - Check if this target supports .s printing.
-    bool hasAsmPrinter() const { return AsmPrinterCtorFn != 0; }
-
-    /// hasMCDisassembler - Check if this target has a disassembler.
-    bool hasMCDisassembler() const { return MCDisassemblerCtorFn != 0; }
-
-    /// hasMCInstPrinter - Check if this target has an instruction printer.
-    bool hasMCInstPrinter() const { return MCInstPrinterCtorFn != 0; }
-
-    /// hasMCCodeEmitter - Check if this target supports instruction encoding.
-    bool hasMCCodeEmitter() const { return MCCodeEmitterCtorFn != 0; }
-
-    /// hasMCObjectStreamer - Check if this target supports streaming to files.
-    bool hasMCObjectStreamer() const { return MCObjectStreamerCtorFn != 0; }
-
-    /// hasAsmStreamer - Check if this target supports streaming to files.
-    bool hasAsmStreamer() const { return AsmStreamerCtorFn != 0; }
-
     /// @}
     /// @name Feature Constructors
     /// @{
@@ -373,10 +354,11 @@ namespace llvm {
     /// createMCAsmBackend - Create a target specific assembly parser.
     ///
     /// \param Triple The target triple string.
-    MCAsmBackend *createMCAsmBackend(StringRef Triple, StringRef CPU) const {
+    MCAsmBackend *createMCAsmBackend(const MCRegisterInfo &MRI,
+                                     StringRef Triple, StringRef CPU) const {
       if (!MCAsmBackendCtorFn)
         return 0;
-      return MCAsmBackendCtorFn(*this, Triple, CPU);
+      return MCAsmBackendCtorFn(*this, MRI, Triple, CPU);
     }
 
     /// createMCAsmParser - Create a target specific assembly parser.
@@ -384,10 +366,11 @@ namespace llvm {
     /// \param Parser The target independent parser implementation to use for
     /// parsing and lexing.
     MCTargetAsmParser *createMCAsmParser(MCSubtargetInfo &STI,
-                                         MCAsmParser &Parser) const {
+                                         MCAsmParser &Parser,
+                                         const MCInstrInfo &MII) const {
       if (!MCAsmParserCtorFn)
         return 0;
-      return MCAsmParserCtorFn(STI, Parser);
+      return MCAsmParserCtorFn(STI, Parser, MII);
     }
 
     /// createAsmPrinter - Create a target specific assembly printer pass.  This
@@ -457,9 +440,13 @@ namespace llvm {
                                   MCCodeEmitter *CE,
                                   MCAsmBackend *TAB,
                                   bool ShowInst) const {
-      // AsmStreamerCtorFn is default to llvm::createAsmStreamer
-      return AsmStreamerCtorFn(Ctx, OS, isVerboseAsm, useLoc, useCFI,
-                               useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
+      if (AsmStreamerCtorFn)
+        return AsmStreamerCtorFn(Ctx, OS, isVerboseAsm, useLoc, useCFI,
+                                 useDwarfDirectory, InstPrint, CE, TAB,
+                                 ShowInst);
+      return llvm::createAsmStreamer(Ctx, 0, OS, isVerboseAsm, useLoc, useCFI,
+                                     useDwarfDirectory, InstPrint, CE, TAB,
+                                     ShowInst);
     }
 
     /// createMCRelocationInfo - Create a target specific MCRelocationInfo.
@@ -468,7 +455,10 @@ namespace llvm {
     /// \param Ctx The target context.
     MCRelocationInfo *
       createMCRelocationInfo(StringRef TT, MCContext &Ctx) const {
-      return MCRelocationInfoCtorFn(TT, Ctx);
+      MCRelocationInfoCtorTy Fn = MCRelocationInfoCtorFn
+                                      ? MCRelocationInfoCtorFn
+                                      : llvm::createMCRelocationInfo;
+      return Fn(TT, Ctx);
     }
 
     /// createMCSymbolizer - Create a target specific MCSymbolizer.
@@ -485,8 +475,9 @@ namespace llvm {
                        LLVMSymbolLookupCallback SymbolLookUp,
                        void *DisInfo,
                        MCContext *Ctx, MCRelocationInfo *RelInfo) const {
-      return MCSymbolizerCtorFn(TT, GetOpInfo, SymbolLookUp, DisInfo,
-                                Ctx, RelInfo);
+      MCSymbolizerCtorTy Fn =
+          MCSymbolizerCtorFn ? MCSymbolizerCtorFn : llvm::createMCSymbolizer;
+      return Fn(TT, GetOpInfo, SymbolLookUp, DisInfo, Ctx, RelInfo);
     }
 
     /// @}
@@ -607,9 +598,7 @@ namespace llvm {
     /// @param T - The target being registered.
     /// @param Fn - A function to construct a MCAsmInfo for the target.
     static void RegisterMCAsmInfo(Target &T, Target::MCAsmInfoCtorFnTy Fn) {
-      // Ignore duplicate registration.
-      if (!T.MCAsmInfoCtorFn)
-        T.MCAsmInfoCtorFn = Fn;
+      T.MCAsmInfoCtorFn = Fn;
     }
 
     /// RegisterMCCodeGenInfo - Register a MCCodeGenInfo implementation for the
@@ -623,9 +612,7 @@ namespace llvm {
     /// @param Fn - A function to construct a MCCodeGenInfo for the target.
     static void RegisterMCCodeGenInfo(Target &T,
                                      Target::MCCodeGenInfoCtorFnTy Fn) {
-      // Ignore duplicate registration.
-      if (!T.MCCodeGenInfoCtorFn)
-        T.MCCodeGenInfoCtorFn = Fn;
+      T.MCCodeGenInfoCtorFn = Fn;
     }
 
     /// RegisterMCInstrInfo - Register a MCInstrInfo implementation for the
@@ -638,18 +625,14 @@ namespace llvm {
     /// @param T - The target being registered.
     /// @param Fn - A function to construct a MCInstrInfo for the target.
     static void RegisterMCInstrInfo(Target &T, Target::MCInstrInfoCtorFnTy Fn) {
-      // Ignore duplicate registration.
-      if (!T.MCInstrInfoCtorFn)
-        T.MCInstrInfoCtorFn = Fn;
+      T.MCInstrInfoCtorFn = Fn;
     }
 
     /// RegisterMCInstrAnalysis - Register a MCInstrAnalysis implementation for
     /// the given target.
     static void RegisterMCInstrAnalysis(Target &T,
                                         Target::MCInstrAnalysisCtorFnTy Fn) {
-      // Ignore duplicate registration.
-      if (!T.MCInstrAnalysisCtorFn)
-        T.MCInstrAnalysisCtorFn = Fn;
+      T.MCInstrAnalysisCtorFn = Fn;
     }
 
     /// RegisterMCRegInfo - Register a MCRegisterInfo implementation for the
@@ -662,9 +645,7 @@ namespace llvm {
     /// @param T - The target being registered.
     /// @param Fn - A function to construct a MCRegisterInfo for the target.
     static void RegisterMCRegInfo(Target &T, Target::MCRegInfoCtorFnTy Fn) {
-      // Ignore duplicate registration.
-      if (!T.MCRegInfoCtorFn)
-        T.MCRegInfoCtorFn = Fn;
+      T.MCRegInfoCtorFn = Fn;
     }
 
     /// RegisterMCSubtargetInfo - Register a MCSubtargetInfo implementation for
@@ -678,9 +659,7 @@ namespace llvm {
     /// @param Fn - A function to construct a MCSubtargetInfo for the target.
     static void RegisterMCSubtargetInfo(Target &T,
                                         Target::MCSubtargetInfoCtorFnTy Fn) {
-      // Ignore duplicate registration.
-      if (!T.MCSubtargetInfoCtorFn)
-        T.MCSubtargetInfoCtorFn = Fn;
+      T.MCSubtargetInfoCtorFn = Fn;
     }
 
     /// RegisterTargetMachine - Register a TargetMachine implementation for the
@@ -694,9 +673,7 @@ namespace llvm {
     /// @param Fn - A function to construct a TargetMachine for the target.
     static void RegisterTargetMachine(Target &T,
                                       Target::TargetMachineCtorTy Fn) {
-      // Ignore duplicate registration.
-      if (!T.TargetMachineCtorFn)
-        T.TargetMachineCtorFn = Fn;
+      T.TargetMachineCtorFn = Fn;
     }
 
     /// RegisterMCAsmBackend - Register a MCAsmBackend implementation for the
@@ -709,8 +686,7 @@ namespace llvm {
     /// @param T - The target being registered.
     /// @param Fn - A function to construct an AsmBackend for the target.
     static void RegisterMCAsmBackend(Target &T, Target::MCAsmBackendCtorTy Fn) {
-      if (!T.MCAsmBackendCtorFn)
-        T.MCAsmBackendCtorFn = Fn;
+      T.MCAsmBackendCtorFn = Fn;
     }
 
     /// RegisterMCAsmParser - Register a MCTargetAsmParser implementation for
@@ -723,8 +699,7 @@ namespace llvm {
     /// @param T - The target being registered.
     /// @param Fn - A function to construct an MCTargetAsmParser for the target.
     static void RegisterMCAsmParser(Target &T, Target::MCAsmParserCtorTy Fn) {
-      if (!T.MCAsmParserCtorFn)
-        T.MCAsmParserCtorFn = Fn;
+      T.MCAsmParserCtorFn = Fn;
     }
 
     /// RegisterAsmPrinter - Register an AsmPrinter implementation for the given
@@ -737,9 +712,7 @@ namespace llvm {
     /// @param T - The target being registered.
     /// @param Fn - A function to construct an AsmPrinter for the target.
     static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn) {
-      // Ignore duplicate registration.
-      if (!T.AsmPrinterCtorFn)
-        T.AsmPrinterCtorFn = Fn;
+      T.AsmPrinterCtorFn = Fn;
     }
 
     /// RegisterMCDisassembler - Register a MCDisassembler implementation for
@@ -753,8 +726,7 @@ namespace llvm {
     /// @param Fn - A function to construct an MCDisassembler for the target.
     static void RegisterMCDisassembler(Target &T,
                                        Target::MCDisassemblerCtorTy Fn) {
-      if (!T.MCDisassemblerCtorFn)
-        T.MCDisassemblerCtorFn = Fn;
+      T.MCDisassemblerCtorFn = Fn;
     }
 
     /// RegisterMCInstPrinter - Register a MCInstPrinter implementation for the
@@ -768,8 +740,7 @@ namespace llvm {
     /// @param Fn - A function to construct an MCInstPrinter for the target.
     static void RegisterMCInstPrinter(Target &T,
                                       Target::MCInstPrinterCtorTy Fn) {
-      if (!T.MCInstPrinterCtorFn)
-        T.MCInstPrinterCtorFn = Fn;
+      T.MCInstPrinterCtorFn = Fn;
     }
 
     /// RegisterMCCodeEmitter - Register a MCCodeEmitter implementation for the
@@ -783,8 +754,7 @@ namespace llvm {
     /// @param Fn - A function to construct an MCCodeEmitter for the target.
     static void RegisterMCCodeEmitter(Target &T,
                                       Target::MCCodeEmitterCtorTy Fn) {
-      if (!T.MCCodeEmitterCtorFn)
-        T.MCCodeEmitterCtorFn = Fn;
+      T.MCCodeEmitterCtorFn = Fn;
     }
 
     /// RegisterMCObjectStreamer - Register a object code MCStreamer
@@ -798,8 +768,7 @@ namespace llvm {
     /// @param Fn - A function to construct an MCStreamer for the target.
     static void RegisterMCObjectStreamer(Target &T,
                                          Target::MCObjectStreamerCtorTy Fn) {
-      if (!T.MCObjectStreamerCtorFn)
-        T.MCObjectStreamerCtorFn = Fn;
+      T.MCObjectStreamerCtorFn = Fn;
     }
 
     /// RegisterAsmStreamer - Register an assembly MCStreamer implementation
@@ -812,8 +781,7 @@ namespace llvm {
     /// @param T - The target being registered.
     /// @param Fn - A function to construct an MCStreamer for the target.
     static void RegisterAsmStreamer(Target &T, Target::AsmStreamerCtorTy Fn) {
-      if (T.AsmStreamerCtorFn == createAsmStreamer)
-        T.AsmStreamerCtorFn = Fn;
+      T.AsmStreamerCtorFn = Fn;
     }
 
     /// RegisterMCRelocationInfo - Register an MCRelocationInfo
@@ -827,8 +795,7 @@ namespace llvm {
     /// @param Fn - A function to construct an MCRelocationInfo for the target.
     static void RegisterMCRelocationInfo(Target &T,
                                          Target::MCRelocationInfoCtorTy Fn) {
-      if (T.MCRelocationInfoCtorFn == llvm::createMCRelocationInfo)
-        T.MCRelocationInfoCtorFn = Fn;
+      T.MCRelocationInfoCtorFn = Fn;
     }
 
     /// RegisterMCSymbolizer - Register an MCSymbolizer
@@ -842,8 +809,7 @@ namespace llvm {
     /// @param Fn - A function to construct an MCSymbolizer for the target.
     static void RegisterMCSymbolizer(Target &T,
                                      Target::MCSymbolizerCtorTy Fn) {
-      if (T.MCSymbolizerCtorFn == llvm::createMCSymbolizer)
-        T.MCSymbolizerCtorFn = Fn;
+      T.MCSymbolizerCtorFn = Fn;
     }
 
     /// @}
@@ -1118,9 +1084,10 @@ namespace llvm {
     }
 
   private:
-    static MCAsmBackend *Allocator(const Target &T, StringRef Triple,
-                                   StringRef CPU) {
-      return new MCAsmBackendImpl(T, Triple, CPU);
+    static MCAsmBackend *Allocator(const Target &T,
+                                   const MCRegisterInfo &MRI,
+                                   StringRef Triple, StringRef CPU) {
+      return new MCAsmBackendImpl(T, MRI, Triple, CPU);
     }
   };
 
@@ -1139,8 +1106,9 @@ namespace llvm {
     }
 
   private:
-    static MCTargetAsmParser *Allocator(MCSubtargetInfo &STI, MCAsmParser &P) {
-      return new MCAsmParserImpl(STI, P);
+    static MCTargetAsmParser *Allocator(MCSubtargetInfo &STI, MCAsmParser &P,
+                                        const MCInstrInfo &MII) {
+      return new MCAsmParserImpl(STI, P, MII);
     }
   };
 
diff --git a/include/llvm/Support/Unicode.h b/include/llvm/Support/Unicode.h
new file mode 100644
index 0000000..e6a52c4
--- /dev/null
+++ b/include/llvm/Support/Unicode.h
@@ -0,0 +1,62 @@
+//===- llvm/Support/Unicode.h - Unicode character properties  -*- C++ -*-=====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines functions that allow querying certain properties of Unicode
+// characters.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+namespace sys {
+namespace unicode {
+
+enum ColumnWidthErrors {
+  ErrorInvalidUTF8 = -2,
+  ErrorNonPrintableCharacter = -1
+};
+
+/// Determines if a character is likely to be displayed correctly on the
+/// terminal. Exact implementation would have to depend on the specific
+/// terminal, so we define the semantic that should be suitable for generic case
+/// of a terminal capable to output Unicode characters.
+///
+/// All characters from the Unicode code point range are considered printable
+/// except for:
+///   * C0 and C1 control character ranges;
+///   * default ignorable code points as per 5.21 of
+///     http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
+///     except for U+00AD SOFT HYPHEN, as it's actually displayed on most
+///     terminals;
+///   * format characters (category = Cf);
+///   * surrogates (category = Cs);
+///   * unassigned characters (category = Cn).
+/// \return true if the character is considered printable.
+bool isPrintable(int UCS);
+
+/// Gets the number of positions the UTF8-encoded \p Text is likely to occupy
+/// when output on a terminal ("character width"). This depends on the
+/// implementation of the terminal, and there's no standard definition of
+/// character width.
+///
+/// The implementation defines it in a way that is expected to be compatible
+/// with a generic Unicode-capable terminal.
+///
+/// \return Character width:
+///   * ErrorNonPrintableCharacter (-1) if \p Text contains non-printable
+///     characters (as identified by isPrintable);
+///   * 0 for each non-spacing and enclosing combining mark;
+///   * 2 for each CJK character excluding halfwidth forms;
+///   * 1 for each of the remaining characters.
+int columnWidthUTF8(StringRef Text);
+
+} // namespace unicode
+} // namespace sys
+} // namespace llvm
diff --git a/include/llvm/Support/UnicodeCharRanges.h b/include/llvm/Support/UnicodeCharRanges.h
index 4a4d988..86faa38 100644
--- a/include/llvm/Support/UnicodeCharRanges.h
+++ b/include/llvm/Support/UnicodeCharRanges.h
@@ -17,82 +17,80 @@
 #include "llvm/Support/MutexGuard.h"
 #include "llvm/Support/raw_ostream.h"
 
-namespace {
+#include <algorithm>
 
+namespace llvm {
+namespace sys {
+
+/// \brief Represents a closed range of Unicode code points [Lower, Upper].
 struct UnicodeCharRange {
   uint32_t Lower;
   uint32_t Upper;
 };
-typedef llvm::ArrayRef<UnicodeCharRange> UnicodeCharSet;
 
-/// Returns true if each of the ranges in \p CharSet is a proper closed range
-/// [min, max], and if the ranges themselves are ordered and non-overlapping.
-static inline bool isValidCharSet(UnicodeCharSet CharSet) {
-#ifndef NDEBUG
-  static llvm::SmallPtrSet<const UnicodeCharRange *, 16> Validated;
-  static llvm::sys::Mutex ValidationMutex;
+inline bool operator<(uint32_t Value, UnicodeCharRange Range) {
+  return Value < Range.Lower;
+}
+inline bool operator<(UnicodeCharRange Range, uint32_t Value) {
+  return Range.Upper < Value;
+}
 
-  // Check the validation cache.
-  {
-    llvm::MutexGuard Guard(ValidationMutex);
-    if (Validated.count(CharSet.data()))
-      return true;
-  }
+/// \brief Holds a reference to an ordered array of UnicodeCharRange and allows
+/// to quickly check if a code point is contained in the set represented by this
+/// array.
+class UnicodeCharSet {
+public:
+  typedef llvm::ArrayRef<UnicodeCharRange> CharRanges;
 
-  // Walk through the ranges.
-  uint32_t Prev = 0;
-  for (UnicodeCharSet::iterator I = CharSet.begin(), E = CharSet.end();
-       I != E; ++I) {
-    if (I != CharSet.begin() && Prev >= I->Lower) {
-      DEBUG(llvm::dbgs() << "Upper bound 0x");
-      DEBUG(llvm::dbgs().write_hex(Prev));
-      DEBUG(llvm::dbgs() << " should be less than succeeding lower bound 0x");
-      DEBUG(llvm::dbgs().write_hex(I->Lower) << "\n");
-      return false;
-    }
-    if (I->Upper < I->Lower) {
-      DEBUG(llvm::dbgs() << "Upper bound 0x");
-      DEBUG(llvm::dbgs().write_hex(I->Lower));
-      DEBUG(llvm::dbgs() << " should not be less than lower bound 0x");
-      DEBUG(llvm::dbgs().write_hex(I->Upper) << "\n");
-      return false;
-    }
-    Prev = I->Upper;
+  /// \brief Constructs a UnicodeCharSet instance from an array of
+  /// UnicodeCharRanges.
+  ///
+  /// Array pointed by \p Ranges should have the lifetime at least as long as
+  /// the UnicodeCharSet instance, and should not change. Array is validated by
+  /// the constructor, so it makes sense to create as few UnicodeCharSet
+  /// instances per each array of ranges, as possible.
+  UnicodeCharSet(CharRanges Ranges) : Ranges(Ranges) {
+    assert(rangesAreValid());
   }
 
-  // Update the validation cache.
-  {
-    llvm::MutexGuard Guard(ValidationMutex);
-    Validated.insert(CharSet.data());
+  /// \brief Returns true if the character set contains the Unicode code point
+  /// \p C.
+  bool contains(uint32_t C) const {
+    return std::binary_search(Ranges.begin(), Ranges.end(), C);
   }
-#endif
-  return true;
-}
-
-} // namespace
 
+private:
+  /// \brief Returns true if each of the ranges is a proper closed range
+  /// [min, max], and if the ranges themselves are ordered and non-overlapping.
+  bool rangesAreValid() const {
+    uint32_t Prev = 0;
+    for (CharRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
+         I != E; ++I) {
+      if (I != Ranges.begin() && Prev >= I->Lower) {
+        DEBUG(llvm::dbgs() << "Upper bound 0x");
+        DEBUG(llvm::dbgs().write_hex(Prev));
+        DEBUG(llvm::dbgs() << " should be less than succeeding lower bound 0x");
+        DEBUG(llvm::dbgs().write_hex(I->Lower) << "\n");
+        return false;
+      }
+      if (I->Upper < I->Lower) {
+        DEBUG(llvm::dbgs() << "Upper bound 0x");
+        DEBUG(llvm::dbgs().write_hex(I->Lower));
+        DEBUG(llvm::dbgs() << " should not be less than lower bound 0x");
+        DEBUG(llvm::dbgs().write_hex(I->Upper) << "\n");
+        return false;
+      }
+      Prev = I->Upper;
+    }
 
-/// Returns true if the Unicode code point \p C is within the set of
-/// characters specified by \p CharSet.
-LLVM_READONLY static inline bool isCharInSet(uint32_t C,
-                                             UnicodeCharSet CharSet) {
-  assert(isValidCharSet(CharSet));
+    return true;
+  }
 
-  size_t LowPoint = 0;
-  size_t HighPoint = CharSet.size();
+  const CharRanges Ranges;
+};
 
-  // Binary search the set of char ranges.
-  while (HighPoint != LowPoint) {
-    size_t MidPoint = (HighPoint + LowPoint) / 2;
-    if (C < CharSet[MidPoint].Lower)
-      HighPoint = MidPoint;
-    else if (C > CharSet[MidPoint].Upper)
-      LowPoint = MidPoint + 1;
-    else
-      return true;
-  }
+} // namespace sys
+} // namespace llvm
 
-  return false;
-}
 
 #endif // LLVM_SUPPORT_UNICODECHARRANGES_H
diff --git a/include/llvm/Support/Valgrind.h b/include/llvm/Support/Valgrind.h
index a1397db..7ae40af 100644
--- a/include/llvm/Support/Valgrind.h
+++ b/include/llvm/Support/Valgrind.h
@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_SYSTEM_VALGRIND_H
-#define LLVM_SYSTEM_VALGRIND_H
+#ifndef LLVM_SUPPORT_VALGRIND_H
+#define LLVM_SUPPORT_VALGRIND_H
 
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Compiler.h"
diff --git a/include/llvm/Support/ValueHandle.h b/include/llvm/Support/ValueHandle.h
index b49341c..bc02ba3 100644
--- a/include/llvm/Support/ValueHandle.h
+++ b/include/llvm/Support/ValueHandle.h
@@ -339,6 +339,7 @@ public:
 /// rearrange itself when the pointer changes).  Unlike ValueHandleBase, this
 /// class has a vtable and a virtual destructor.
 class CallbackVH : public ValueHandleBase {
+  virtual void anchor();
 protected:
   CallbackVH(const CallbackVH &RHS)
     : ValueHandleBase(Callback, RHS) {}
@@ -365,13 +366,13 @@ public:
   ///
   /// All implementations must remove the reference from this object to the
   /// Value that's being destroyed.
-  virtual void deleted();
+  virtual void deleted() { setValPtr(NULL); }
 
   /// Called when this->getValPtr()->replaceAllUsesWith(new_value) is called,
   /// _before_ any of the uses have actually been replaced.  If WeakVH were
   /// implemented as a CallbackVH, it would use this method to call
   /// setValPtr(new_value).  AssertingVH would do nothing in this method.
-  virtual void allUsesReplacedWith(Value *);
+  virtual void allUsesReplacedWith(Value *) {}
 };
 
 } // End llvm namespace
diff --git a/include/llvm/Support/YAMLParser.h b/include/llvm/Support/YAMLParser.h
index 338bb4b..7020449 100644
--- a/include/llvm/Support/YAMLParser.h
+++ b/include/llvm/Support/YAMLParser.h
@@ -43,6 +43,8 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/SMLoc.h"
+
+#include <map>
 #include <limits>
 #include <utility>
 
@@ -99,13 +101,11 @@ private:
   OwningPtr<Document> CurrentDoc;
 
   friend class Document;
-
-  /// @brief Validate a %YAML x.x directive.
-  void handleYAMLDirective(const Token &);
 };
 
 /// @brief Abstract base class for all Nodes.
 class Node {
+   virtual void anchor();
 public:
   enum NodeKind {
     NK_Null,
@@ -116,12 +116,21 @@ public:
     NK_Alias
   };
 
-  Node(unsigned int Type, OwningPtr<Document>&, StringRef Anchor);
+  Node(unsigned int Type, OwningPtr<Document> &, StringRef Anchor,
+       StringRef Tag);
 
   /// @brief Get the value of the anchor attached to this node. If it does not
   ///        have one, getAnchor().size() will be 0.
   StringRef getAnchor() const { return Anchor; }
 
+  /// \brief Get the tag as it was written in the document. This does not
+  ///   perform tag resolution.
+  StringRef getRawTag() const { return Tag; }
+
+  /// \brief Get the verbatium tag for a given Node. This performs tag resoluton
+  ///   and substitution.
+  std::string getVerbatimTag() const;
+
   SMRange getSourceRange() const { return SourceRange; }
   void setSourceRange(SMRange SR) { SourceRange = SR; }
 
@@ -158,6 +167,8 @@ protected:
 private:
   unsigned int TypeID;
   StringRef Anchor;
+  /// \brief The tag as typed in the document.
+  StringRef Tag;
 };
 
 /// @brief A null value.
@@ -165,8 +176,10 @@ private:
 /// Example:
 ///   !!null null
 class NullNode : public Node {
+  virtual void anchor();
 public:
-  NullNode(OwningPtr<Document> &D) : Node(NK_Null, D, StringRef()) {}
+  NullNode(OwningPtr<Document> &D)
+      : Node(NK_Null, D, StringRef(), StringRef()) {}
 
   static inline bool classof(const Node *N) {
     return N->getType() == NK_Null;
@@ -179,10 +192,11 @@ public:
 /// Example:
 ///   Adena
 class ScalarNode : public Node {
+  virtual void anchor();
 public:
-  ScalarNode(OwningPtr<Document> &D, StringRef Anchor, StringRef Val)
-    : Node(NK_Scalar, D, Anchor)
-    , Value(Val) {
+  ScalarNode(OwningPtr<Document> &D, StringRef Anchor, StringRef Tag,
+             StringRef Val)
+      : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
     SMLoc Start = SMLoc::getFromPointer(Val.begin());
     SMLoc End = SMLoc::getFromPointer(Val.end());
     SourceRange = SMRange(Start, End);
@@ -220,9 +234,10 @@ private:
 /// Example:
 ///   Section: .text
 class KeyValueNode : public Node {
+  virtual void anchor();
 public:
   KeyValueNode(OwningPtr<Document> &D)
-    : Node(NK_KeyValue, D, StringRef())
+    : Node(NK_KeyValue, D, StringRef(), StringRef())
     , Key(0)
     , Value(0)
   {}
@@ -331,6 +346,7 @@ void skip(CollectionType &C) {
 ///   Name: _main
 ///   Scope: Global
 class MappingNode : public Node {
+  virtual void anchor();
 public:
   enum MappingType {
     MT_Block,
@@ -338,13 +354,10 @@ public:
     MT_Inline ///< An inline mapping node is used for "[key: value]".
   };
 
-  MappingNode(OwningPtr<Document> &D, StringRef Anchor, MappingType MT)
-    : Node(NK_Mapping, D, Anchor)
-    , Type(MT)
-    , IsAtBeginning(true)
-    , IsAtEnd(false)
-    , CurrentEntry(0)
-  {}
+  MappingNode(OwningPtr<Document> &D, StringRef Anchor, StringRef Tag,
+              MappingType MT)
+      : Node(NK_Mapping, D, Anchor, Tag), Type(MT), IsAtBeginning(true),
+        IsAtEnd(false), CurrentEntry(0) {}
 
   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
   typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator;
@@ -383,6 +396,7 @@ private:
 ///   - Hello
 ///   - World
 class SequenceNode : public Node {
+  virtual void anchor();
 public:
   enum SequenceType {
     ST_Block,
@@ -397,14 +411,12 @@ public:
     ST_Indentless
   };
 
-  SequenceNode(OwningPtr<Document> &D, StringRef Anchor, SequenceType ST)
-    : Node(NK_Sequence, D, Anchor)
-    , SeqType(ST)
-    , IsAtBeginning(true)
-    , IsAtEnd(false)
-    , WasPreviousTokenFlowEntry(true) // Start with an imaginary ','.
-    , CurrentEntry(0)
-  {}
+  SequenceNode(OwningPtr<Document> &D, StringRef Anchor, StringRef Tag,
+               SequenceType ST)
+      : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST), IsAtBeginning(true),
+        IsAtEnd(false),
+        WasPreviousTokenFlowEntry(true), // Start with an imaginary ','.
+        CurrentEntry(0) {}
 
   friend class basic_collection_iterator<SequenceNode, Node>;
   typedef basic_collection_iterator<SequenceNode, Node> iterator;
@@ -440,9 +452,10 @@ private:
 /// Example:
 ///   *AnchorName
 class AliasNode : public Node {
+  virtual void anchor();
 public:
   AliasNode(OwningPtr<Document> &D, StringRef Val)
-    : Node(NK_Alias, D, StringRef()), Name(Val) {}
+    : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
 
   StringRef getName() const { return Name; }
   Node *getTarget();
@@ -475,6 +488,10 @@ public:
     return Root = parseBlockNode();
   }
 
+  const std::map<StringRef, StringRef> &getTagMap() const {
+    return TagMap;
+  }
+
 private:
   friend class Node;
   friend class document_iterator;
@@ -490,18 +507,23 @@ private:
   ///        document.
   Node *Root;
 
+  /// \brief Maps tag prefixes to their expansion.
+  std::map<StringRef, StringRef> TagMap;
+
   Token &peekNext();
   Token getNext();
   void setError(const Twine &Message, Token &Location) const;
   bool failed() const;
 
-  void handleTagDirective(const Token &Tag) {
-    // TODO: Track tags.
-  }
-
   /// @brief Parse %BLAH directives and return true if any were encountered.
   bool parseDirectives();
 
+  /// \brief Parse %YAML
+  void parseYAMLDirective();
+
+  /// \brief Parse %TAG
+  void parseTAGDirective();
+
   /// @brief Consume the next token and error if it is not \a TK.
   bool expectToken(int TK);
 };
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 0f57f44..c19eb23 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/YAMLParser.h"
@@ -317,18 +318,20 @@ public:
   IO(void *Ctxt=NULL);
   virtual ~IO();
 
-  virtual bool outputting() = 0;
+  virtual bool outputting() const = 0;
 
   virtual unsigned beginSequence() = 0;
   virtual bool preflightElement(unsigned, void *&) = 0;
   virtual void postflightElement(void*) = 0;
   virtual void endSequence() = 0;
+  virtual bool canElideEmptySequence() = 0;
 
   virtual unsigned beginFlowSequence() = 0;
   virtual bool preflightFlowElement(unsigned, void *&) = 0;
   virtual void postflightFlowElement(void*) = 0;
   virtual void endFlowSequence() = 0;
 
+  virtual bool mapTag(StringRef Tag, bool Default=false) = 0;
   virtual void beginMapping() = 0;
   virtual void endMapping() = 0;
   virtual bool preflightKey(const char*, bool, bool, bool &, void *&) = 0;
@@ -388,7 +391,7 @@ public:
   typename llvm::enable_if_c<has_SequenceTraits<T>::value,void>::type
   mapOptional(const char* Key, T& Val) {
     // omit key/value instead of outputting empty sequence
-    if ( this->outputting() && !(Val.begin() != Val.end()) )
+    if ( this->canElideEmptySequence() && !(Val.begin() != Val.end()) )
       return;
     this->processKey(Key, Val, false);
   }
@@ -403,8 +406,7 @@ public:
   void mapOptional(const char* Key, T& Val, const T& Default) {
     this->processKeyWithDefault(Key, Val, Default, false);
   }
-
-
+  
 private:
   template <typename T>
   void processKeyWithDefault(const char *Key, T &Val, const T& DefaultValue,
@@ -683,18 +685,23 @@ private:
 ///
 class Input : public IO {
 public:
-  // Construct a yaml Input object from a StringRef and optional user-data.
-  Input(StringRef InputContent, void *Ctxt=NULL);
+  // Construct a yaml Input object from a StringRef and optional
+  // user-data. The DiagHandler can be specified to provide
+  // alternative error reporting.
+  Input(StringRef InputContent,
+        void *Ctxt = NULL,
+        SourceMgr::DiagHandlerTy DiagHandler = NULL,
+        void *DiagHandlerCtxt = NULL);
   ~Input();
-  
+
   // Check if there was an syntax or semantic error during parsing.
   llvm::error_code error();
 
-  // To set alternate error reporting.
-  void setDiagHandler(llvm::SourceMgr::DiagHandlerTy Handler, void *Ctxt = 0);
+  static bool classof(const IO *io) { return !io->outputting(); }
 
 private:
-  virtual bool outputting();
+  virtual bool outputting() const;
+  virtual bool mapTag(StringRef, bool);
   virtual void beginMapping();
   virtual void endMapping();
   virtual bool preflightKey(const char *, bool, bool, bool &, void *&);
@@ -715,8 +722,10 @@ private:
   virtual void endBitSetScalar();
   virtual void scalarString(StringRef &);
   virtual void setError(const Twine &message);
+  virtual bool canElideEmptySequence();
 
   class HNode {
+    virtual void anchor();
   public:
     HNode(Node *n) : _node(n) { }
     virtual ~HNode() { }
@@ -726,9 +735,9 @@ private:
   };
 
   class EmptyHNode : public HNode {
+    virtual void anchor();
   public:
     EmptyHNode(Node *n) : HNode(n) { }
-    virtual ~EmptyHNode() {}
     static inline bool classof(const HNode *n) {
       return NullNode::classof(n->_node);
     }
@@ -736,9 +745,9 @@ private:
   };
 
   class ScalarHNode : public HNode {
+    virtual void anchor();
   public:
     ScalarHNode(Node *n, StringRef s) : HNode(n), _value(s) { }
-    virtual ~ScalarHNode() { }
 
     StringRef value() const { return _value; }
 
@@ -816,7 +825,10 @@ public:
   Output(llvm::raw_ostream &, void *Ctxt=NULL);
   virtual ~Output();
 
-  virtual bool outputting();
+  static bool classof(const IO *io) { return io->outputting(); }
+  
+  virtual bool outputting() const;
+  virtual bool mapTag(StringRef, bool);
   virtual void beginMapping();
   virtual void endMapping();
   virtual bool preflightKey(const char *key, bool, bool, bool &, void *&);
@@ -837,7 +849,7 @@ public:
   virtual void endBitSetScalar();
   virtual void scalarString(StringRef &);
   virtual void setError(const Twine &message);
-
+  virtual bool canElideEmptySequence();
 public:
   // These are only used by operator<<. They could be private
   // if that templated operator could be made a friend.
@@ -959,8 +971,8 @@ template <typename T>
 inline
 typename llvm::enable_if_c<has_SequenceTraits<T>::value,Input &>::type
 operator>>(Input &yin, T &docSeq) {
-  yin.setCurrentDocument();
-  yamlize(yin, docSeq, true);
+  if (yin.setCurrentDocument())
+    yamlize(yin, docSeq, true);
   return yin;
 }
 
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index e17cddd..50352bd 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -694,7 +694,7 @@ public:
 };
 
 
-/// IntInit - 7 - Represent an initalization by a literal integer value.
+/// IntInit - 7 - Represent an initialization by a literal integer value.
 ///
 class IntInit : public TypedInit {
   int64_t Value;
diff --git a/utils/TableGen/StringToOffsetTable.h b/include/llvm/TableGen/StringToOffsetTable.h
index d94d3a2..d94d3a2 100644
--- a/utils/TableGen/StringToOffsetTable.h
+++ b/include/llvm/TableGen/StringToOffsetTable.h
diff --git a/include/llvm/TableGen/TableGenBackend.h b/include/llvm/TableGen/TableGenBackend.h
index bedf7fb..3e4f3cc 100644
--- a/include/llvm/TableGen/TableGenBackend.h
+++ b/include/llvm/TableGen/TableGenBackend.h
@@ -20,7 +20,7 @@ namespace llvm {
 
 class raw_ostream;
 
-/// emitSourceFileHeader - Output a LLVM style file header to the specified
+/// emitSourceFileHeader - Output an LLVM style file header to the specified
 /// raw_ostream.
 void emitSourceFileHeader(StringRef Desc, raw_ostream &OS);
 
diff --git a/include/llvm/Target/CostTable.h b/include/llvm/Target/CostTable.h
index a974b56..34f6041 100644
--- a/include/llvm/Target/CostTable.h
+++ b/include/llvm/Target/CostTable.h
@@ -25,18 +25,25 @@ struct CostTblEntry {
   unsigned Cost;
 };
 
-/// Find in cost table, TypeTy must be comparable by ==
-template <class TypeTy>
-int CostTableLookup(const CostTblEntry<TypeTy> *Tbl,
-                    unsigned len, int ISD, TypeTy Ty) {
+/// Find in cost table, TypeTy must be comparable to CompareTy by ==
+template <class TypeTy, class CompareTy>
+int CostTableLookup(const CostTblEntry<TypeTy> *Tbl, unsigned len, int ISD,
+                    CompareTy Ty) {
   for (unsigned int i = 0; i < len; ++i)
-    if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty)
+    if (ISD == Tbl[i].ISD && Ty == Tbl[i].Type)
       return i;
 
   // Could not find an entry.
   return -1;
 }
 
+/// Find in cost table, TypeTy must be comparable to CompareTy by ==
+template <class TypeTy, class CompareTy, unsigned N>
+int CostTableLookup(const CostTblEntry<TypeTy>(&Tbl)[N], int ISD,
+                    CompareTy Ty) {
+  return CostTableLookup(Tbl, N, ISD, Ty);
+}
+
 /// Type Conversion Cost Table
 template <class TypeTy>
 struct TypeConversionCostTblEntry {
@@ -46,18 +53,28 @@ struct TypeConversionCostTblEntry {
   unsigned Cost;
 };
 
-/// Find in type conversion cost table, TypeTy must be comparable by ==
-template <class TypeTy>
+/// Find in type conversion cost table, TypeTy must be comparable to CompareTy
+/// by ==
+template <class TypeTy, class CompareTy>
 int ConvertCostTableLookup(const TypeConversionCostTblEntry<TypeTy> *Tbl,
-                           unsigned len, int ISD, TypeTy Dst, TypeTy Src) {
+                           unsigned len, int ISD, CompareTy Dst,
+                           CompareTy Src) {
   for (unsigned int i = 0; i < len; ++i)
-    if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst)
+    if (ISD == Tbl[i].ISD && Src == Tbl[i].Src && Dst == Tbl[i].Dst)
       return i;
 
   // Could not find an entry.
   return -1;
 }
 
+/// Find in type conversion cost table, TypeTy must be comparable to CompareTy
+/// by ==
+template <class TypeTy, class CompareTy, unsigned N>
+int ConvertCostTableLookup(const TypeConversionCostTblEntry<TypeTy>(&Tbl)[N],
+                           int ISD, CompareTy Dst, CompareTy Src) {
+  return ConvertCostTableLookup(Tbl, N, ISD, Dst, Src);
+}
+
 } // namespace llvm
 
 
diff --git a/include/llvm/Target/Mangler.h b/include/llvm/Target/Mangler.h
index e925cd5..eee7bf6 100644
--- a/include/llvm/Target/Mangler.h
+++ b/include/llvm/Target/Mangler.h
@@ -20,7 +20,6 @@ namespace llvm {
 
 class GlobalValue;
 class MCContext;
-class MCSymbol;
 template <typename T> class SmallVectorImpl;
 class TargetMachine;
 class Twine;
@@ -34,7 +33,6 @@ public:
   };
 
 private:
-  MCContext &Context;
   const TargetMachine *TM;
 
   /// AnonGlobalIDs - We need to give global values the same name every time
@@ -48,12 +46,7 @@ private:
   unsigned NextAnonGlobalID;
 
 public:
-  Mangler(MCContext &Context, const TargetMachine *TM)
-    : Context(Context), TM(TM), NextAnonGlobalID(1) {}
-
-  /// getSymbol - Return the MCSymbol for the specified global value.  This
-  /// symbol is the main label that is the address of the global.
-  MCSymbol *getSymbol(const GlobalValue *GV);
+  Mangler(const TargetMachine *TM) : TM(TM), NextAnonGlobalID(1) {}
 
   /// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
   /// and the specified global variable's name.  If the global variable doesn't
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 89ca529..3f6eae6 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -632,6 +632,11 @@ def f64imm : Operand<f64>;
 ///
 def zero_reg;
 
+/// All operands which the MC layer classifies as predicates should inherit from
+/// this class in some manner. This is already handled for the most commonly
+/// used PredicateOperand, but may be useful in other circumstances.
+class PredicateOp;
+
 /// OperandWithDefaultOps - This Operand class can be used as the parent class
 /// for an Operand that needs to be initialized with a default value if
 /// no value is supplied in a pattern.  This class can be used to simplify the
@@ -647,7 +652,7 @@ class OperandWithDefaultOps<ValueType ty, dag defaultops>
 /// AlwaysVal specifies the value of this predicate when set to "always
 /// execute".
 class PredicateOperand<ValueType ty, dag OpTypes, dag AlwaysVal>
-  : OperandWithDefaultOps<ty, AlwaysVal> {
+  : OperandWithDefaultOps<ty, AlwaysVal>, PredicateOp {
   let MIOperandInfo = OpTypes;
 }
 
@@ -795,6 +800,19 @@ def LIFETIME_END : Instruction {
   let AsmString = "LIFETIME_END";
   let neverHasSideEffects = 1;
 }
+def STACKMAP : Instruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins i32imm:$id, i32imm:$nbytes, variable_ops);
+  let isCall = 1;
+  let mayLoad = 1;
+}
+def PATCHPOINT : Instruction {
+  let OutOperandList = (outs unknown:$dst);
+  let InOperandList = (ins i32imm:$id, i32imm:$nbytes, unknown:$callee,
+                       i32imm:$nargs, i32imm:$cc, variable_ops);
+  let isCall = 1;
+  let mayLoad = 1;
+}
 }
 
 //===----------------------------------------------------------------------===//
@@ -1005,6 +1023,17 @@ class SubtargetFeature<string n, string a,  string v, string d,
   list<SubtargetFeature> Implies = i;
 }
 
+/// Specifies a Subtarget feature that this instruction is deprecated on.
+class Deprecated<SubtargetFeature dep> {
+  SubtargetFeature DeprecatedFeatureMask = dep;
+}
+
+/// A custom predicate used to determine if an instruction is
+/// deprecated or not.
+class ComplexDeprecationPredicate<string dep> {
+  string ComplexDeprecationPredicate = dep;
+}
+
 //===----------------------------------------------------------------------===//
 // Processor chip sets - These values represent each of the chip sets supported
 // by the scheduler.  Each Processor definition requires corresponding
diff --git a/include/llvm/Target/TargetCallingConv.h b/include/llvm/Target/TargetCallingConv.h
index 1fd0bd9..9cc52a5 100644
--- a/include/llvm/Target/TargetCallingConv.h
+++ b/include/llvm/Target/TargetCallingConv.h
@@ -113,6 +113,7 @@ namespace ISD {
   struct InputArg {
     ArgFlagsTy Flags;
     MVT VT;
+    EVT ArgVT;
     bool Used;
 
     /// Index original Function's argument.
@@ -124,10 +125,11 @@ namespace ISD {
     unsigned PartOffset;
 
     InputArg() : VT(MVT::Other), Used(false) {}
-    InputArg(ArgFlagsTy flags, EVT vt, bool used,
+    InputArg(ArgFlagsTy flags, EVT vt, EVT argvt, bool used,
              unsigned origIdx, unsigned partOffs)
       : Flags(flags), Used(used), OrigArgIndex(origIdx), PartOffset(partOffs) {
       VT = vt.getSimpleVT();
+      ArgVT = argvt;
     }
   };
 
@@ -138,6 +140,7 @@ namespace ISD {
   struct OutputArg {
     ArgFlagsTy Flags;
     MVT VT;
+    EVT ArgVT;
 
     /// IsFixed - Is this a "fixed" value, ie not passed through a vararg "...".
     bool IsFixed;
@@ -151,11 +154,12 @@ namespace ISD {
     unsigned PartOffset;
 
     OutputArg() : IsFixed(false) {}
-    OutputArg(ArgFlagsTy flags, EVT vt, bool isfixed,
+    OutputArg(ArgFlagsTy flags, EVT vt, EVT argvt, bool isfixed,
               unsigned origIdx, unsigned partOffs)
       : Flags(flags), IsFixed(isfixed), OrigArgIndex(origIdx),
         PartOffset(partOffs) {
       VT = vt.getSimpleVT();
+      ArgVT = argvt;
     }
   };
 }
diff --git a/include/llvm/Target/TargetCallingConv.td b/include/llvm/Target/TargetCallingConv.td
index a53ed29..c1bef28 100644
--- a/include/llvm/Target/TargetCallingConv.td
+++ b/include/llvm/Target/TargetCallingConv.td
@@ -143,4 +143,10 @@ class CallingConv<list<CCAction> actions> {
 /// returning from getCallPreservedMask().
 class CalleeSavedRegs<dag saves> {
   dag SaveList = saves;
+
+  // Registers that are also preserved across function calls, but should not be
+  // included in the generated FOO_SaveList array. These registers will be
+  // included in the FOO_RegMask bit mask. This can be used for registers that
+  // are saved automatically, like the SPARC register windows.
+  dag OtherPreserved;
 }
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index f746daf..d4e14f6 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -181,6 +181,23 @@ public:
     return false;
   }
 
+  /// Compute the size in bytes and offset within a stack slot of a spilled
+  /// register or subregister.
+  ///
+  /// \param [out] Size in bytes of the spilled value.
+  /// \param [out] Offset in bytes within the stack slot.
+  /// \returns true if both Size and Offset are successfully computed.
+  ///
+  /// Not all subregisters have computable spill slots. For example,
+  /// subregisters registers may not be byte-sized, and a pair of discontiguous
+  /// subregisters has no single offset.
+  ///
+  /// Targets with nontrivial bigendian implementations may need to override
+  /// this, particularly to support spilled vector registers.
+  virtual bool getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx,
+                                 unsigned &Size, unsigned &Offset,
+                                 const TargetMachine *TM) const;
+
   /// reMaterialize - Re-issue the specified 'original' instruction at the
   /// specific location targeting a new destination register.
   /// The register in Orig->getOperand(0).getReg() will be substituted by
@@ -615,6 +632,8 @@ public:
     return false;
   }
 
+  virtual bool enableClusterLoads() const { return false; }
+
   virtual bool shouldClusterLoads(MachineInstr *FirstLdSt,
                                   MachineInstr *SecondLdSt,
                                   unsigned NumLoads) const {
@@ -821,6 +840,8 @@ public:
                                    const MachineInstr *MI,
                                    unsigned *PredCost = 0) const;
 
+  virtual unsigned getPredicationCost(const MachineInstr *MI) const;
+
   virtual int getInstrLatency(const InstrItineraryData *ItinData,
                               SDNode *Node) const;
 
@@ -938,6 +959,26 @@ public:
     return 0;
   }
 
+  /// \brief Return the minimum clearance before an instruction that reads an
+  /// unused register.
+  ///
+  /// For example, AVX instructions may copy part of an register operand into
+  /// the unused high bits of the destination register.
+  ///
+  /// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+  ///
+  /// In the code above, vcvtsi2sdq copies %xmm0[127:64] into %xmm14 creating a
+  /// false dependence on any previous write to %xmm0.
+  ///
+  /// This hook works similarly to getPartialRegUpdateClearance, except that it
+  /// does not take an operand index. Instead sets \p OpNum to the index of the
+  /// unused register.
+  virtual unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+                                        const TargetRegisterInfo *TRI) const {
+    // The default implementation returns 0 for no undef register dependency.
+    return 0;
+  }
+
   /// breakPartialRegDependency - Insert a dependency-breaking instruction
   /// before MI to eliminate an unwanted dependency on OpNum.
   ///
diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h
index 8c1f223..46eaef2 100644
--- a/include/llvm/Target/TargetLibraryInfo.h
+++ b/include/llvm/Target/TargetLibraryInfo.h
@@ -46,6 +46,10 @@ namespace llvm {
       Znwm,
       /// void *new(unsigned long, nothrow);
       ZnwmRKSt9nothrow_t,
+      /// double __cospi(double x);
+      cospi,
+      /// float __cospif(float x);
+      cospif,
       /// int __cxa_atexit(void (*f)(void *), void *p, void *d);
       cxa_atexit,
       /// void __cxa_guard_abort(guard_t *guard);
@@ -61,6 +65,14 @@ namespace llvm {
       dunder_isoc99_sscanf,
       /// void *__memcpy_chk(void *s1, const void *s2, size_t n, size_t s1size);
       memcpy_chk,
+      /// double __sincospi_stret(double x);
+      sincospi_stret,
+      /// float __sincospi_stretf(float x);
+      sincospi_stretf,
+      /// double __sinpi(double x);
+      sinpi,
+      /// float __sinpif(float x);
+      sinpif,
       /// double __sqrt_finite(double x);
       sqrt_finite,
       /// float __sqrt_finite(float x);
@@ -695,10 +707,13 @@ public:
     case LibFunc::nearbyint: case LibFunc::nearbyintf: case LibFunc::nearbyintl:
     case LibFunc::ceil:      case LibFunc::ceilf:      case LibFunc::ceill:
     case LibFunc::rint:      case LibFunc::rintf:      case LibFunc::rintl:
+    case LibFunc::round:     case LibFunc::roundf:     case LibFunc::roundl:
     case LibFunc::trunc:     case LibFunc::truncf:     case LibFunc::truncl:
     case LibFunc::log2:      case LibFunc::log2f:      case LibFunc::log2l:
     case LibFunc::exp2:      case LibFunc::exp2f:      case LibFunc::exp2l:
-    case LibFunc::memcmp:
+    case LibFunc::memcmp:    case LibFunc::strcmp:     case LibFunc::strcpy:
+    case LibFunc::stpcpy:    case LibFunc::strlen:     case LibFunc::strnlen:
+    case LibFunc::memchr:
       return true;
     }
     return false;
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index c3fa3cc..5ab04f7 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -148,10 +148,13 @@ public:
 
   bool isBigEndian() const { return !IsLittleEndian; }
   bool isLittleEndian() const { return IsLittleEndian; }
-  // Return the pointer type for the given address space, defaults to
-  // the pointer type from the data layout.
-  // FIXME: The default needs to be removed once all the code is updated.
-  virtual MVT getPointerTy(uint32_t /*AS*/ = 0) const { return PointerTy; }
+
+  /// Return the pointer type for the given address space, defaults to
+  /// the pointer type from the data layout.
+  /// FIXME: The default needs to be removed once all the code is updated.
+  virtual MVT getPointerTy(uint32_t /*AS*/ = 0) const;
+  unsigned getPointerSizeInBits(uint32_t AS = 0) const;
+  unsigned getPointerTypeSizeInBits(Type *Ty) const;
   virtual MVT getScalarShiftAmountTy(EVT LHSTy) const;
 
   EVT getShiftAmountTy(EVT LHSTy) const;
@@ -201,6 +204,17 @@ public:
     return PredictableSelectIsExpensive;
   }
 
+  /// isLoadBitCastBeneficial() - Return true if the following transform
+  /// is beneficial.
+  /// fold (conv (load x)) -> (load (conv*)x)
+  /// On architectures that don't natively support some vector loads efficiently,
+  /// casting the load to a smaller vector of larger types and loading
+  /// is more efficient, however, this can be undone by optimizations in
+  /// dag combiner.
+  virtual bool isLoadBitCastBeneficial(EVT /* Load */, EVT /* Bitcast */) const {
+    return true;
+  }
+
   /// Return the ValueType of the result of SETCC operations.  Also used to
   /// obtain the target's preferred type for the condition operand of SELECT and
   /// BRCOND nodes.  In the case of BRCOND the argument passed is MVT::Other
@@ -518,13 +532,12 @@ public:
   LegalizeAction
   getCondCodeAction(ISD::CondCode CC, MVT VT) const {
     assert((unsigned)CC < array_lengthof(CondCodeActions) &&
-           (unsigned)VT.SimpleTy < sizeof(CondCodeActions[0])*4 &&
+           ((unsigned)VT.SimpleTy >> 4) < array_lengthof(CondCodeActions[0]) &&
            "Table isn't big enough!");
-    /// The lower 5 bits of the SimpleTy index into Nth 2bit set from the 64bit
-    /// value and the upper 27 bits index into the second dimension of the
-    /// array to select what 64bit value to use.
-    LegalizeAction Action = (LegalizeAction)
-      ((CondCodeActions[CC][VT.SimpleTy >> 5] >> (2*(VT.SimpleTy & 0x1F))) & 3);
+    // See setCondCodeAction for how this is encoded.
+    uint32_t Shift = 2 * (VT.SimpleTy & 0xF);
+    uint32_t Value = CondCodeActions[CC][VT.SimpleTy >> 4];
+    LegalizeAction Action = (LegalizeAction) ((Value >> Shift) & 0x3);
     assert(Action != Promote && "Can't promote condition code!");
     return Action;
   }
@@ -568,14 +581,18 @@ public:
   /// otherwise it will assert.
   EVT getValueType(Type *Ty, bool AllowUnknown = false) const {
     // Lower scalar pointers to native pointer types.
-    if (Ty->isPointerTy()) return PointerTy;
+    if (PointerType *PTy = dyn_cast<PointerType>(Ty))
+      return getPointerTy(PTy->getAddressSpace());
 
     if (Ty->isVectorTy()) {
       VectorType *VTy = cast<VectorType>(Ty);
       Type *Elm = VTy->getElementType();
       // Lower vectors of pointers to native pointer types.
-      if (Elm->isPointerTy()) 
-        Elm = EVT(PointerTy).getTypeForEVT(Ty->getContext());
+      if (PointerType *PT = dyn_cast<PointerType>(Elm)) {
+        EVT PointerTy(getPointerTy(PT->getAddressSpace()));
+        Elm = PointerTy.getTypeForEVT(Ty->getContext());
+      }
+
       return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(Elm, false),
                        VTy->getNumElements());
     }
@@ -821,6 +838,11 @@ public:
     return 0;
   }
 
+  /// Returns true if a cast between SrcAS and DestAS is a noop.
+  virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   /// \name Helpers for TargetTransformInfo implementations
   /// @{
@@ -1014,13 +1036,12 @@ protected:
     assert(VT < MVT::LAST_VALUETYPE &&
            (unsigned)CC < array_lengthof(CondCodeActions) &&
            "Table isn't big enough!");
-    /// The lower 5 bits of the SimpleTy index into Nth 2bit set from the 64bit
-    /// value and the upper 27 bits index into the second dimension of the
-    /// array to select what 64bit value to use.
-    CondCodeActions[(unsigned)CC][VT.SimpleTy >> 5]
-      &= ~(uint64_t(3UL)  << (VT.SimpleTy & 0x1F)*2);
-    CondCodeActions[(unsigned)CC][VT.SimpleTy >> 5]
-      |= (uint64_t)Action << (VT.SimpleTy & 0x1F)*2;
+    /// The lower 5 bits of the SimpleTy index into Nth 2bit set from the 32-bit
+    /// value and the upper 27 bits index into the second dimension of the array
+    /// to select what 32-bit value to use.
+    uint32_t Shift = 2 * (VT.SimpleTy & 0xF);
+    CondCodeActions[CC][VT.SimpleTy >> 4] &= ~((uint32_t)0x3 << Shift);
+    CondCodeActions[CC][VT.SimpleTy >> 4] |= (uint32_t)Action << Shift;
   }
 
   /// If Opc/OrigVT is specified as being promoted, the promotion code defaults
@@ -1181,6 +1202,37 @@ public:
     return false;
   }
 
+  /// Return true if the target supplies and combines to a paired load
+  /// two loaded values of type LoadedType next to each other in memory.
+  /// RequiredAlignment gives the minimal alignment constraints that must be met
+  /// to be able to select this paired load.
+  ///
+  /// This information is *not* used to generate actual paired loads, but it is
+  /// used to generate a sequence of loads that is easier to combine into a
+  /// paired load.
+  /// For instance, something like this:
+  /// a = load i64* addr
+  /// b = trunc i64 a to i32
+  /// c = lshr i64 a, 32
+  /// d = trunc i64 c to i32
+  /// will be optimized into:
+  /// b = load i32* addr1
+  /// d = load i32* addr2
+  /// Where addr1 = addr2 +/- sizeof(i32).
+  ///
+  /// In other words, unless the target performs a post-isel load combining,
+  /// this information should not be provided because it will generate more
+  /// loads.
+  virtual bool hasPairedLoad(Type * /*LoadedType*/,
+                             unsigned & /*RequiredAligment*/) const {
+    return false;
+  }
+
+  virtual bool hasPairedLoad(EVT /*LoadedType*/,
+                             unsigned & /*RequiredAligment*/) const {
+    return false;
+  }
+
   /// Return true if zero-extending the specific node Val to type VT2 is free
   /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
   /// because it's folded such as X86 zero-extending loads).
@@ -1262,10 +1314,6 @@ private:
   const DataLayout *TD;
   const TargetLoweringObjectFile &TLOF;
 
-  /// The type to use for pointers for the default address space, usually i32 or
-  /// i64.
-  MVT PointerTy;
-
   /// True if this is a little endian target.
   bool IsLittleEndian;
 
@@ -1414,9 +1462,9 @@ private:
   /// indicates how instruction selection should deal with the condition code.
   ///
   /// Because each CC action takes up 2 bits, we need to have the array size be
-  /// large enough to fit all of the value types. This can be done by dividing
-  /// the MVT::LAST_VALUETYPE by 32 and adding one.
-  uint64_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE / 32) + 1];
+  /// large enough to fit all of the value types. This can be done by rounding
+  /// up the MVT::LAST_VALUETYPE value to the next multiple of 16.
+  uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE + 15) / 16];
 
   ValueTypeActionImpl ValueTypeActions;
 
@@ -1471,10 +1519,12 @@ public:
     if (NumElts == 1)
       return LegalizeKind(TypeScalarizeVector, EltVT);
 
-    // Try to widen vector elements until a legal type is found.
+    // Try to widen vector elements until the element type is a power of two and
+    // promote it to a legal type later on, for example:
+    // <3 x i8> -> <4 x i8> -> <4 x i32>
     if (EltVT.isInteger()) {
       // Vectors with a number of elements that is not a power of two are always
-      // widened, for example <3 x float> -> <4 x float>.
+      // widened, for example <3 x i8> -> <4 x i8>.
       if (!VT.isPow2VectorType()) {
         NumElts = (unsigned)NextPowerOf2(NumElts);
         EVT NVT = EVT::getVectorVT(Context, EltVT, NumElts);
@@ -1503,7 +1553,8 @@ public:
 
         // Stop trying when getting a non-simple element type.
         // Note that vector elements may be greater than legal vector element
-        // types. Example: X86 XMM registers hold 64bit element on 32bit systems.
+        // types. Example: X86 XMM registers hold 64bit element on 32bit
+        // systems.
         if (!EltVT.isSimple()) break;
 
         // Build a new vector type and check if it is legal.
@@ -1664,7 +1715,8 @@ public:
   /// by reference if this node can be combined with a load / store to form a
   /// post-indexed load / store.
   virtual bool getPostIndexedAddressParts(SDNode * /*N*/, SDNode * /*Op*/,
-                                          SDValue &/*Base*/, SDValue &/*Offset*/,
+                                          SDValue &/*Base*/,
+                                          SDValue &/*Offset*/,
                                           ISD::MemIndexedMode &/*AM*/,
                                           SelectionDAG &/*DAG*/) const {
     return false;
@@ -1702,9 +1754,12 @@ public:
                            SDValue &NewLHS, SDValue &NewRHS,
                            ISD::CondCode &CCCode, SDLoc DL) const;
 
-  SDValue makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
-                      const SDValue *Ops, unsigned NumOps,
-                      bool isSigned, SDLoc dl) const;
+  /// Returns a pair of (return value, chain).
+  std::pair<SDValue, SDValue> makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC,
+                                          EVT RetVT, const SDValue *Ops,
+                                          unsigned NumOps, bool isSigned,
+                                          SDLoc dl, bool doesNotReturn = false,
+                                          bool isReturnValueUsed = true) const;
 
   //===--------------------------------------------------------------------===//
   // TargetLowering Optimization Methods
@@ -1882,6 +1937,8 @@ public:
     ArgListEntry() : isSExt(false), isZExt(false), isInReg(false),
       isSRet(false), isNest(false), isByVal(false), isReturned(false),
       Alignment(0) { }
+
+    void setAttributes(ImmutableCallSite *CS, unsigned AttrIdx);
   };
   typedef std::vector<ArgListEntry> ArgListTy;
 
@@ -2015,6 +2072,12 @@ public:
     return VT.bitsLT(MinVT) ? MinVT : VT;
   }
 
+  /// Returns a 0 terminated array of registers that can be safely used as
+  /// scratch registers.
+  virtual const uint16_t *getScratchRegisters(CallingConv::ID CC) const {
+    return NULL;
+  }
+
   /// This callback is invoked by the type legalizer to legalize nodes with an
   /// illegal operand type but legal result types.  It replaces the
   /// LowerOperation callback in the type Legalizer.  The reason we can not do
@@ -2211,12 +2274,12 @@ public:
   // Instruction Emitting Hooks
   //
 
-  // This method should be implemented by targets that mark instructions with
-  // the 'usesCustomInserter' flag.  These instructions are special in various
-  // ways, which require special support to insert.  The specified MachineInstr
-  // is created but not inserted into any basic blocks, and this method is
-  // called to expand it into a sequence of instructions, potentially also
-  // creating new basic blocks and control flow.
+  /// This method should be implemented by targets that mark instructions with
+  /// the 'usesCustomInserter' flag.  These instructions are special in various
+  /// ways, which require special support to insert.  The specified MachineInstr
+  /// is created but not inserted into any basic blocks, and this method is
+  /// called to expand it into a sequence of instructions, potentially also
+  /// creating new basic blocks and control flow.
   virtual MachineBasicBlock *
     EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
 
diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h
index 7f15b74..284b6bb 100644
--- a/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/include/llvm/Target/TargetLoweringObjectFile.h
@@ -117,6 +117,10 @@ public:
                           MachineModuleInfo *MMI, unsigned Encoding,
                           MCStreamer &Streamer) const;
 
+  /// Return the MCSymbol for the specified global value.  This symbol is the
+  /// main label that is the address of the global
+  MCSymbol *getSymbol(Mangler &M, const GlobalValue *GV) const;
+
   // getCFIPersonalitySymbol - The symbol that gets passed to .cfi_personality.
   virtual MCSymbol *
   getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index fd7228a..11b0f5f 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -29,7 +29,6 @@ class GlobalValue;
 class MCAsmInfo;
 class MCCodeGenInfo;
 class MCContext;
-class PassManagerBase;
 class Target;
 class DataLayout;
 class TargetLibraryInfo;
@@ -47,6 +46,12 @@ class VectorTargetTransformInfo;
 class formatted_raw_ostream;
 class raw_ostream;
 
+// The old pass manager infrastructure is hidden in a legacy namespace now.
+namespace legacy {
+class PassManagerBase;
+}
+using legacy::PassManagerBase;
+
 //===----------------------------------------------------------------------===//
 ///
 /// TargetMachine - Primary interface to the complete machine description for
@@ -70,7 +75,8 @@ protected: // Can only create subclasses.
   std::string TargetFS;
 
   /// CodeGenInfo - Low level target information such as relocation model.
-  const MCCodeGenInfo *CodeGenInfo;
+  /// Non-const to allow resetting optimization level per-function.
+  MCCodeGenInfo *CodeGenInfo;
 
   /// AsmInfo - Contains target specific asm information.
   ///
@@ -208,6 +214,9 @@ public:
   /// Default, or Aggressive.
   CodeGenOpt::Level getOptLevel() const;
 
+  /// \brief Overrides the optimization level.
+  void setOptLevel(CodeGenOpt::Level Level) const;
+
   void setFastISel(bool Enable) { Options.EnableFastISel = Enable; }
 
   bool shouldPrintMachineCode() const { return Options.PrintMachineCode; }
diff --git a/include/llvm/Target/TargetOpcodes.h b/include/llvm/Target/TargetOpcodes.h
index 516e070..bd74cb9 100644
--- a/include/llvm/Target/TargetOpcodes.h
+++ b/include/llvm/Target/TargetOpcodes.h
@@ -69,8 +69,9 @@ namespace TargetOpcode {
     DBG_VALUE = 11,
 
     /// REG_SEQUENCE - This variadic instruction is used to form a register that
-    /// represent a consecutive sequence of sub-registers. It's used as register
-    /// coalescing / allocation aid and must be eliminated before code emission.
+    /// represents a consecutive sequence of sub-registers. It's used as a
+    /// register coalescing / allocation aid and must be eliminated before code
+    /// emission.
     // In SDNode form, the first operand encodes the register class created by
     // the REG_SEQUENCE, while each subsequent pair names a vreg + subreg index
     // pair.  Once it has been lowered to a MachineInstr, the regclass operand
@@ -91,7 +92,19 @@ namespace TargetOpcode {
 
     /// Lifetime markers.
     LIFETIME_START = 15,
-    LIFETIME_END = 16
+    LIFETIME_END = 16,
+
+    /// A Stackmap instruction captures the location of live variables at its
+    /// position in the instruction stream. It is followed by a shadow of bytes
+    /// that must lie within the function and not contain another stackmap.
+    STACKMAP = 17,
+
+    /// Patchable call instruction - this instruction represents a call to a
+    /// constant address, followed by a series of NOPs. It is intended to
+    /// support optimizations for dynamic languages (such as javascript) that
+    /// rewrite calls to runtimes with more efficient code sequences.
+    /// This also implies a stack map.
+    PATCHPOINT = 18
   };
 } // end namespace TargetOpcode
 } // end namespace llvm
diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
index 82fccde..958bea6 100644
--- a/include/llvm/Target/TargetRegisterInfo.h
+++ b/include/llvm/Target/TargetRegisterInfo.h
@@ -901,6 +901,7 @@ static inline raw_ostream &operator<<(raw_ostream &OS, const PrintReg &PR) {
 /// Usage: OS << PrintRegUnit(Unit, TRI) << '\n';
 ///
 class PrintRegUnit {
+protected:
   const TargetRegisterInfo *TRI;
   unsigned Unit;
 public:
@@ -914,6 +915,21 @@ static inline raw_ostream &operator<<(raw_ostream &OS, const PrintRegUnit &PR) {
   return OS;
 }
 
+/// PrintVRegOrUnit - It is often convenient to track virtual registers and
+/// physical register units in the same list.
+class PrintVRegOrUnit : protected PrintRegUnit {
+public:
+  PrintVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *tri)
+    : PrintRegUnit(VRegOrUnit, tri) {}
+  void print(raw_ostream&) const;
+};
+
+static inline raw_ostream &operator<<(raw_ostream &OS,
+                                      const PrintVRegOrUnit &PR) {
+  PR.print(OS);
+  return OS;
+}
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index 575cb83..9d4858a 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -76,7 +76,7 @@ def instregex;
 // See MCSchedule.h for detailed comments.
 class SchedMachineModel {
   int IssueWidth = -1; // Max micro-ops that may be scheduled per cycle.
-  int MinLatency = -1; // Determines which instrucions are allowed in a group.
+  int MinLatency = -1; // Determines which instructions are allowed in a group.
                        // (-1) inorder (0) ooo, (1): inorder +var latencies.
   int MicroOpBufferSize = -1; // Max micro-ops that can be buffered.
   int LoadLatency = -1; // Cycles for loads to access the cache.
@@ -86,6 +86,15 @@ class SchedMachineModel {
   // Per-cycle resources tables.
   ProcessorItineraries Itineraries = NoItineraries;
 
+  // Subtargets that define a model for only a subset of instructions
+  // that have a scheduling class (itinerary class or SchedRW list)
+  // and may actually be generated for that subtarget must clear this
+  // bit. Otherwise, the scheduler considers an unmodelled opcode to
+  // be an error. This should only be set during initial bringup,
+  // or there will be no way to catch simple errors in the model
+  // resulting from changes to the instruction definitions.
+  bit CompleteModel = 1;
+
   bit NoModel = 0; // Special tag to indicate missing machine model.
 }
 
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index befab43..d94bdc6 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -383,6 +383,7 @@ def ftrunc     : SDNode<"ISD::FTRUNC"     , SDTFPUnaryOp>;
 def fceil      : SDNode<"ISD::FCEIL"      , SDTFPUnaryOp>;
 def ffloor     : SDNode<"ISD::FFLOOR"     , SDTFPUnaryOp>;
 def fnearbyint : SDNode<"ISD::FNEARBYINT" , SDTFPUnaryOp>;
+def frnd       : SDNode<"ISD::FROUND"     , SDTFPUnaryOp>;
 
 def fround     : SDNode<"ISD::FP_ROUND"   , SDTFPRoundOp>;
 def fextend    : SDNode<"ISD::FP_EXTEND"  , SDTFPExtendOp>;
@@ -464,6 +465,8 @@ def vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
     SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>, []>;
 def vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",
     SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
+def concat_vectors : SDNode<"ISD::CONCAT_VECTORS",
+    SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1, 2>]>,[]>;
 
 // This operator does not do subvector type checking.  The ARM
 // backend, at least, needs it.
diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h
index fe2fba4..3474ee4 100644
--- a/include/llvm/Target/TargetSelectionDAGInfo.h
+++ b/include/llvm/Target/TargetSelectionDAGInfo.h
@@ -94,6 +94,74 @@ public:
                           MachinePointerInfo DstPtrInfo) const {
     return SDValue();
   }
+
+  /// EmitTargetCodeForMemcmp - Emit target-specific code that performs a
+  /// memcmp, in cases where that is faster than a libcall.  The first
+  /// returned SDValue is the result of the memcmp and the second is
+  /// the chain.  Both SDValues can be null if a normal libcall should
+  /// be used.
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc dl,
+                          SDValue Chain,
+                          SDValue Op1, SDValue Op2,
+                          SDValue Op3, MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const {
+    return std::make_pair(SDValue(), SDValue());
+  }
+
+  /// EmitTargetCodeForMemchr - Emit target-specific code that performs a
+  /// memchr, in cases where that is faster than a libcall.  The first
+  /// returned SDValue is the result of the memchr and the second is
+  /// the chain.  Both SDValues can be null if a normal libcall should
+  /// be used.
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
+                          SDValue Src, SDValue Char, SDValue Length,
+                          MachinePointerInfo SrcPtrInfo) const {
+    return std::make_pair(SDValue(), SDValue());
+  }
+
+  /// EmitTargetCodeForStrcpy - Emit target-specific code that performs a
+  /// strcpy or stpcpy, in cases where that is faster than a libcall.
+  /// The first returned SDValue is the result of the copy (the start
+  /// of the destination string for strcpy, a pointer to the null terminator
+  /// for stpcpy) and the second is the chain.  Both SDValues can be null
+  /// if a normal libcall should be used.
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Dest, SDValue Src,
+                          MachinePointerInfo DestPtrInfo,
+                          MachinePointerInfo SrcPtrInfo,
+                          bool isStpcpy) const {
+    return std::make_pair(SDValue(), SDValue());
+  }
+
+  /// EmitTargetCodeForStrcmp - Emit target-specific code that performs a
+  /// strcmp, in cases where that is faster than a libcall.  The first
+  /// returned SDValue is the result of the strcmp and the second is
+  /// the chain.  Both SDValues can be null if a normal libcall should
+  /// be used.
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc dl,
+                          SDValue Chain,
+                          SDValue Op1, SDValue Op2,
+                          MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const {
+    return std::make_pair(SDValue(), SDValue());
+  }
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src, MachinePointerInfo SrcPtrInfo) const {
+    return std::make_pair(SDValue(), SDValue());
+  }
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                           SDValue Src, SDValue MaxLength,
+                           MachinePointerInfo SrcPtrInfo) const {
+    return std::make_pair(SDValue(), SDValue());
+  }
 };
 
 } // end llvm namespace
diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h
index b2d405d..1b2e06a 100644
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h
@@ -25,6 +25,7 @@ class SDep;
 class SUnit;
 class TargetRegisterClass;
 class TargetSchedModel;
+struct MachineSchedPolicy;
 template <typename T> class SmallVectorImpl;
 
 //===----------------------------------------------------------------------===//
@@ -55,6 +56,9 @@ public:
     return 0;
   }
 
+  /// \brief Temporary API to test migration to MI scheduler.
+  bool useMachineScheduler() const;
+
   /// \brief True if the subtarget should run MachineScheduler after aggressive
   /// coalescing.
   ///
@@ -62,6 +66,16 @@ public:
   /// scheduler. It does not yet disable the postRA scheduler.
   virtual bool enableMachineScheduler() const;
 
+  /// \brief Override generic scheduling policy within a region.
+  ///
+  /// This is a convenient way for targets that don't provide any custom
+  /// scheduling heuristics (no custom MachineSchedStrategy) to make
+  /// changes to the generic scheduling policy.
+  virtual void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                   MachineInstr *begin,
+                                   MachineInstr *end,
+                                   unsigned NumRegionInstrs) const {}
+
   // enablePostRAScheduler - If the target can benefit from post-regalloc
   // scheduling and the specified optimization level meets the requirement
   // return true to enable post-register-allocation scheduling. In
@@ -75,6 +89,10 @@ public:
   virtual void adjustSchedDependency(SUnit *def, SUnit *use,
                                      SDep& dep) const { }
 
+  /// \brief Enable use of alias analysis during code generation (during MI
+  /// scheduling, DAGCombine, etc.).
+  virtual bool useAA() const;
+
   /// \brief Reset the features for the subtarget.
   virtual void resetSubtargetFeatures(const MachineFunction *MF) { }
 };
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index e6eb8d3..7f51c51 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -104,12 +104,16 @@ Pass *createPruneEHPass();
 
 //===----------------------------------------------------------------------===//
 /// createInternalizePass - This pass loops over all of the functions in the
-/// input module, internalizing all globals (functions and variables) not in the
-/// given exportList.
+/// input module, internalizing all globals (functions and variables) it can.
+////
+/// The symbols in \p ExportList are never internalized.
+///
+/// The symbol in DSOList are internalized if it is safe to drop them from
+/// the symbol table.
 ///
 /// Note that commandline options that are used with the above function are not
 /// used now!
-ModulePass *createInternalizePass(ArrayRef<const char *> exportList);
+ModulePass *createInternalizePass(ArrayRef<const char *> ExportList);
 /// createInternalizePass - Same as above, but with an empty exportList.
 ModulePass *createInternalizePass();
 
diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 75631b3..2788774 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -18,10 +18,16 @@
 #include <vector>
 
 namespace llvm {
-  class TargetLibraryInfo;
-  class PassManagerBase;
-  class Pass;
-  class FunctionPassManager;
+class TargetLibraryInfo;
+class Pass;
+
+// The old pass manager infrastructure is hidden in a legacy namespace now.
+namespace legacy {
+class PassManagerBase;
+class FunctionPassManager;
+}
+using legacy::PassManagerBase;
+using legacy::FunctionPassManager;
 
 /// PassManagerBuilder - This class is used to set up a standard optimization
 /// sequence for languages like C and C++, allowing some APIs to customize the
@@ -106,6 +112,7 @@ public:
   bool SLPVectorize;
   bool LoopVectorize;
   bool LateVectorize;
+  bool RerollLoops;
 
 private:
   /// ExtensionList - This is list of all of the extensions that are registered.
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index d1b6fe1..8a1b34e 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -16,20 +16,25 @@
 
 #include "llvm/ADT/StringRef.h"
 
+#if defined(__GNUC__) && defined(__linux__)
+inline void *getDFSanArgTLSPtrForJIT() {
+  extern __thread __attribute__((tls_model("initial-exec")))
+    void *__dfsan_arg_tls;
+  return (void *)&__dfsan_arg_tls;
+}
+
+inline void *getDFSanRetValTLSPtrForJIT() {
+  extern __thread __attribute__((tls_model("initial-exec")))
+    void *__dfsan_retval_tls;
+  return (void *)&__dfsan_retval_tls;
+}
+#endif
+
 namespace llvm {
 
 class ModulePass;
 class FunctionPass;
 
-// Insert edge profiling instrumentation
-ModulePass *createEdgeProfilerPass();
-
-// Insert optimal edge profiling instrumentation
-ModulePass *createOptimalEdgeProfilerPass();
-
-// Insert path profiling instrumentation
-ModulePass *createPathProfilerPass();
-
 // Insert GCOV profiling instrumentation
 struct GCOVOptions {
   static GCOVOptions getDefault();
@@ -74,6 +79,19 @@ FunctionPass *createMemorySanitizerPass(bool TrackOrigins = false,
 // Insert ThreadSanitizer (race detection) instrumentation
 FunctionPass *createThreadSanitizerPass(StringRef BlacklistFile = StringRef());
 
+// Insert DataFlowSanitizer (dynamic data flow analysis) instrumentation
+ModulePass *createDataFlowSanitizerPass(StringRef ABIListFile = StringRef(),
+                                        void *(*getArgTLS)() = 0,
+                                        void *(*getRetValTLS)() = 0);
+
+#if defined(__GNUC__) && defined(__linux__)
+inline ModulePass *createDataFlowSanitizerPassForJIT(StringRef ABIListFile =
+                                                         StringRef()) {
+  return createDataFlowSanitizerPass(ABIListFile, getDFSanArgTLSPtrForJIT,
+                                     getDFSanRetValTLSPtrForJIT);
+}
+#endif
+
 // BoundsChecking - This pass instruments the code to perform run-time bounds
 // checking on loads, stores, and other memory intrinsics.
 FunctionPass *createBoundsCheckingPass();
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 037ab6b..1521c4c 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -15,6 +15,8 @@
 #ifndef LLVM_TRANSFORMS_SCALAR_H
 #define LLVM_TRANSFORMS_SCALAR_H
 
+#include "llvm/ADT/StringRef.h"
+
 namespace llvm {
 
 class FunctionPass;
@@ -138,7 +140,14 @@ Pass *createLoopInstSimplifyPass();
 //
 // LoopUnroll - This pass is a simple loop unrolling pass.
 //
-Pass *createLoopUnrollPass(int Threshold = -1, int Count = -1, int AllowPartial = -1);
+Pass *createLoopUnrollPass(int Threshold = -1, int Count = -1,
+                           int AllowPartial = -1, int Runtime = -1);
+
+//===----------------------------------------------------------------------===//
+//
+// LoopReroll - This pass is a simple loop rerolling pass.
+//
+Pass *createLoopRerollPass();
 
 //===----------------------------------------------------------------------===//
 //
@@ -267,13 +276,6 @@ extern char &LowerInvokePassID;
 
 //===----------------------------------------------------------------------===//
 //
-// BlockPlacement - This pass reorders basic blocks in order to increase the
-// number of fall-through conditional branches.
-//
-FunctionPass *createBlockPlacementPass();
-
-//===----------------------------------------------------------------------===//
-//
 // LCSSA - This pass inserts phi nodes at loop boundaries to simplify other loop
 // optimizations.
 //
@@ -354,6 +356,20 @@ extern char &InstructionSimplifierID;
 FunctionPass *createLowerExpectIntrinsicPass();
 
 
+//===----------------------------------------------------------------------===//
+//
+// PartiallyInlineLibCalls - Tries to inline the fast path of library
+// calls such as sqrt.
+//
+FunctionPass *createPartiallyInlineLibCallsPass();
+
+//===----------------------------------------------------------------------===//
+//
+// SampleProfilePass - Loads sample profile data from disk and generates
+// IR metadata to reflect the profile.
+FunctionPass *createSampleProfileLoaderPass();
+FunctionPass *createSampleProfileLoaderPass(StringRef Name);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Transforms/Utils/GlobalStatus.h b/include/llvm/Transforms/Utils/GlobalStatus.h
new file mode 100644
index 0000000..c366095
--- /dev/null
+++ b/include/llvm/Transforms/Utils/GlobalStatus.h
@@ -0,0 +1,82 @@
+//===- GlobalStatus.h - Compute status info for globals ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_GLOBALSTATUS_H
+#define LLVM_TRANSFORMS_UTILS_GLOBALSTATUS_H
+
+#include "llvm/IR/Instructions.h"
+
+namespace llvm {
+class Value;
+class Function;
+
+/// It is safe to destroy a constant iff it is only used by constants itself.
+/// Note that constants cannot be cyclic, so this test is pretty easy to
+/// implement recursively.
+///
+bool isSafeToDestroyConstant(const Constant *C);
+
+/// As we analyze each global, keep track of some information about it.  If we
+/// find out that the address of the global is taken, none of this info will be
+/// accurate.
+struct GlobalStatus {
+  /// True if the global's address is used in a comparison.
+  bool IsCompared;
+
+  /// True if the global is ever loaded.  If the global isn't ever loaded it
+  /// can be deleted.
+  bool IsLoaded;
+
+  /// Keep track of what stores to the global look like.
+  enum StoredType {
+    /// There is no store to this global.  It can thus be marked constant.
+    NotStored,
+
+    /// This global is stored to, but the only thing stored is the constant it
+    /// was initialized with. This is only tracked for scalar globals.
+    InitializerStored,
+
+    /// This global is stored to, but only its initializer and one other value
+    /// is ever stored to it.  If this global isStoredOnce, we track the value
+    /// stored to it in StoredOnceValue below.  This is only tracked for scalar
+    /// globals.
+    StoredOnce,
+
+    /// This global is stored to by multiple values or something else that we
+    /// cannot track.
+    Stored
+  } StoredType;
+
+  /// If only one value (besides the initializer constant) is ever stored to
+  /// this global, keep track of what value it is.
+  Value *StoredOnceValue;
+
+  /// These start out null/false.  When the first accessing function is noticed,
+  /// it is recorded. When a second different accessing function is noticed,
+  /// HasMultipleAccessingFunctions is set to true.
+  const Function *AccessingFunction;
+  bool HasMultipleAccessingFunctions;
+
+  /// Set to true if this global has a user that is not an instruction (e.g. a
+  /// constant expr or GV initializer).
+  bool HasNonInstructionUser;
+
+  /// Set to the strongest atomic ordering requirement.
+  AtomicOrdering Ordering;
+
+  /// Look at all uses of the global and fill in the GlobalStatus structure.  If
+  /// the global has its address taken, return true to indicate we can't do
+  /// anything with it.
+  static bool analyzeGlobal(const Value *V, GlobalStatus &GS);
+
+  GlobalStatus();
+};
+}
+
+#endif
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index 65755d0..5586c15 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -203,12 +203,17 @@ Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &TD, User *GEP,
        ++i, ++GTI) {
     Value *Op = *i;
     uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask;
-    if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) {
-      if (OpC->isZero()) continue;
+    if (Constant *OpC = dyn_cast<Constant>(Op)) {
+      if (OpC->isZeroValue())
+        continue;
 
       // Handle a struct index, which adds its field offset to the pointer.
       if (StructType *STy = dyn_cast<StructType>(*GTI)) {
-        Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+        if (OpC->getType()->isVectorTy())
+          OpC = OpC->getSplatValue();
+
+        uint64_t OpValue = cast<ConstantInt>(OpC)->getZExtValue();
+        Size = TD.getStructLayout(STy)->getElementOffset(OpValue);
 
         if (Size)
           Result = Builder->CreateAdd(Result, ConstantInt::get(IntPtrTy, Size),
diff --git a/include/llvm/Transforms/Utils/PromoteMemToReg.h b/include/llvm/Transforms/Utils/PromoteMemToReg.h
index 2f28f33..22f46e5 100644
--- a/include/llvm/Transforms/Utils/PromoteMemToReg.h
+++ b/include/llvm/Transforms/Utils/PromoteMemToReg.h
@@ -20,7 +20,6 @@
 namespace llvm {
 
 class AllocaInst;
-class DataLayout;
 class DominatorTree;
 class AliasSetTracker;
 
@@ -30,7 +29,7 @@ class AliasSetTracker;
 /// (transitively) using this alloca. This also enforces that there is only
 /// ever one layer of bitcasts or GEPs between the alloca and the lifetime
 /// markers.
-bool isAllocaPromotable(const AllocaInst *AI, const DataLayout *DL);
+bool isAllocaPromotable(const AllocaInst *AI);
 
 /// \brief Promote the specified list of alloca instructions into scalar
 /// registers, inserting PHI nodes as appropriate.
@@ -42,7 +41,7 @@ bool isAllocaPromotable(const AllocaInst *AI, const DataLayout *DL);
 /// If AST is specified, the specified tracker is updated to reflect changes
 /// made to the IR.
 void PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                     const DataLayout *DL, AliasSetTracker *AST = 0);
+                     AliasSetTracker *AST = 0);
 
 } // End llvm namespace
 
diff --git a/include/llvm/Transforms/Utils/SpecialCaseList.h b/include/llvm/Transforms/Utils/SpecialCaseList.h
index 787ddb0..34c28fc 100644
--- a/include/llvm/Transforms/Utils/SpecialCaseList.h
+++ b/include/llvm/Transforms/Utils/SpecialCaseList.h
@@ -49,6 +49,7 @@
 
 namespace llvm {
 class Function;
+class GlobalAlias;
 class GlobalVariable;
 class MemoryBuffer;
 class Module;
@@ -57,8 +58,17 @@ class StringRef;
 
 class SpecialCaseList {
  public:
-  SpecialCaseList(const StringRef Path);
-  SpecialCaseList(const MemoryBuffer *MB);
+  /// Parses the special case list from a file. If Path is empty, returns
+  /// an empty special case list. On failure, returns 0 and writes an error
+  /// message to string.
+  static SpecialCaseList *create(const StringRef Path, std::string &Error);
+  /// Parses the special case list from a memory buffer. On failure, returns
+  /// 0 and writes an error message to string.
+  static SpecialCaseList *create(const MemoryBuffer *MB, std::string &Error);
+  /// Parses the special case list from a file. On failure, reports a fatal
+  /// error.
+  static SpecialCaseList *createOrDie(const StringRef Path);
+
   ~SpecialCaseList();
 
   /// Returns whether either this function or its source file are listed in the
@@ -70,31 +80,29 @@ class SpecialCaseList {
   bool isIn(const GlobalVariable &G,
             const StringRef Category = StringRef()) const;
 
+  /// Returns whether this global alias is listed in the given category, which
+  /// may be omitted to search the empty category.
+  ///
+  /// If GA aliases a function, the alias's name is matched as a function name
+  /// would be.  Similarly, aliases of globals are matched like globals.
+  bool isIn(const GlobalAlias &GA,
+            const StringRef Category = StringRef()) const;
+
   /// Returns whether this module is listed in the given category, which may be
   /// omitted to search the empty category.
   bool isIn(const Module &M, const StringRef Category = StringRef()) const;
 
-  /// Returns whether either this function or its source file are listed in any
-  /// category.  Category will contain the name of an arbitrary category in
-  /// which this function is listed.
-  bool findCategory(const Function &F, StringRef &Category) const;
-
-  /// Returns whether this global, its type or its source file are listed in any
-  /// category.  Category will contain the name of an arbitrary category in
-  /// which this global is listed.
-  bool findCategory(const GlobalVariable &G, StringRef &Category) const;
-
-  /// Returns whether this module is listed in any category.  Category will
-  /// contain the name of an arbitrary category in which this module is listed.
-  bool findCategory(const Module &M, StringRef &Category) const;
-
  private:
+  SpecialCaseList(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
+  SpecialCaseList &operator=(SpecialCaseList const &) LLVM_DELETED_FUNCTION;
+
   struct Entry;
   StringMap<StringMap<Entry> > Entries;
 
-  void init(const MemoryBuffer *MB);
-  bool findCategory(const StringRef Section, const StringRef Query,
-                    StringRef &Category) const;
+  SpecialCaseList();
+  /// Parses just-constructed SpecialCaseList entries from a memory buffer.
+  bool parse(const MemoryBuffer *MB, std::string &Error);
+
   bool inSectionCategory(const StringRef Section, const StringRef Query,
                          const StringRef Category) const;
 };
diff --git a/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h b/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h
index 54506cf..933c85c 100644
--- a/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h
+++ b/include/llvm/Transforms/Utils/UnifyFunctionExitNodes.h
@@ -34,7 +34,7 @@ public:
   // We can preserve non-critical-edgeness when we unify function exit nodes
   virtual void getAnalysisUsage(AnalysisUsage &AU) const;
 
-  // getReturn|Unwind|UnreachableBlock - Return the new single (or nonexistant)
+  // getReturn|Unwind|UnreachableBlock - Return the new single (or nonexistent)
   // return, unwind, or unreachable  basic blocks in the CFG.
   //
   BasicBlock *getReturnBlock() const { return ReturnBlock; }
diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h
index 8d0db16..823c5fb 100644
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@@ -114,7 +114,7 @@ createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig());
 //
 // LoopVectorize - Create a loop vectorization pass.
 //
-Pass *createLoopVectorizePass();
+Pass *createLoopVectorizePass(bool NoUnrolling = false);
 
 //===----------------------------------------------------------------------===//
 //
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 5910526..2289c12 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -299,7 +299,6 @@ bool AliasSetTracker::add(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo) {
 bool AliasSetTracker::add(LoadInst *LI) {
   if (LI->getOrdering() > Monotonic) return addUnknown(LI);
   AliasSet::AccessType ATy = AliasSet::Refs;
-  if (!LI->isUnordered()) ATy = AliasSet::ModRef;
   bool NewPtr;
   AliasSet &AS = addPointer(LI->getOperand(0),
                             AA.getTypeStoreSize(LI->getType()),
@@ -312,7 +311,6 @@ bool AliasSetTracker::add(LoadInst *LI) {
 bool AliasSetTracker::add(StoreInst *SI) {
   if (SI->getOrdering() > Monotonic) return addUnknown(SI);
   AliasSet::AccessType ATy = AliasSet::Mods;
-  if (!SI->isUnordered()) ATy = AliasSet::ModRef;
   bool NewPtr;
   Value *Val = SI->getOperand(0);
   AliasSet &AS = addPointer(SI->getOperand(1),
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 349c417..98f2a55 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -34,6 +34,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeCFGOnlyViewerPass(Registry);
   initializeCFGOnlyPrinterPass(Registry);
   initializeDependenceAnalysisPass(Registry);
+  initializeDelinearizationPass(Registry);
   initializeDominanceFrontierPass(Registry);
   initializeDomViewerPass(Registry);
   initializeDomPrinterPass(Registry);
@@ -54,16 +55,6 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeMemoryDependenceAnalysisPass(Registry);
   initializeModuleDebugInfoPrinterPass(Registry);
   initializePostDominatorTreePass(Registry);
-  initializeProfileEstimatorPassPass(Registry);
-  initializeNoProfileInfoPass(Registry);
-  initializeNoPathProfileInfoPass(Registry);
-  initializeProfileInfoAnalysisGroup(Registry);
-  initializePathProfileInfoAnalysisGroup(Registry);
-  initializeLoaderPassPass(Registry);
-  initializePathProfileLoaderPassPass(Registry);
-  initializeProfileVerifierPassPass(Registry);
-  initializePathProfileVerifierPass(Registry);
-  initializeProfileMetadataLoaderPassPass(Registry);
   initializeRegionInfoPass(Registry);
   initializeRegionViewerPass(Registry);
   initializeRegionPrinterPass(Registry);
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 9fe1362..b2c2011 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -122,7 +122,7 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size,
   //      question (in this case rewind to p), or
   //    - just give up. It is up to caller to make sure the pointer is pointing
   //      to the base address the object.
-  // 
+  //
   // We go for 2nd option for simplicity.
   if (!isIdentifiedObject(V))
     return false;
@@ -130,7 +130,7 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size,
   // This function needs to use the aligned object size because we allow
   // reads a bit past the end given sufficient alignment.
   uint64_t ObjectSize = getObjectSize(V, TD, TLI, /*RoundToAlign*/true);
-  
+
   return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize < Size;
 }
 
@@ -163,7 +163,7 @@ namespace {
     EK_SignExt,
     EK_ZeroExt
   };
-  
+
   struct VariableGEPIndex {
     const Value *V;
     ExtensionKind Extension;
@@ -200,7 +200,7 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
     Offset = 0;
     return V;
   }
-  
+
   if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) {
     if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
       switch (BOp->getOpcode()) {
@@ -231,7 +231,7 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
       }
     }
   }
-  
+
   // Since GEP indices are sign extended anyway, we don't care about the high
   // bits of a sign or zero extended value - just scales and offsets.  The
   // extensions have to be consistent though.
@@ -248,10 +248,10 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
                                         TD, Depth+1);
     Scale = Scale.zext(OldWidth);
     Offset = Offset.zext(OldWidth);
-    
+
     return Result;
   }
-  
+
   Scale = 1;
   Offset = 0;
   return V;
@@ -276,7 +276,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
                        const DataLayout *TD) {
   // Limit recursion depth to limit compile time in crazy cases.
   unsigned MaxLookup = 6;
-  
+
   BaseOffs = 0;
   do {
     // See if this is a bitcast or GEP.
@@ -291,7 +291,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
       }
       return V;
     }
-    
+
     if (Op->getOpcode() == Instruction::BitCast) {
       V = Op->getOperand(0);
       continue;
@@ -308,15 +308,14 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
           V = Simplified;
           continue;
         }
-    
+
       return V;
     }
-    
+
     // Don't attempt to analyze GEPs over unsized objects.
-    if (!cast<PointerType>(GEPOp->getOperand(0)->getType())
-        ->getElementType()->isSized())
+    if (!GEPOp->getOperand(0)->getType()->getPointerElementType()->isSized())
       return V;
-    
+
     // If we are lacking DataLayout information, we can't compute the offets of
     // elements computed by GEPs.  However, we can handle bitcast equivalent
     // GEPs.
@@ -326,7 +325,8 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
       V = GEPOp->getOperand(0);
       continue;
     }
-    
+
+    unsigned AS = GEPOp->getPointerAddressSpace();
     // Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
     gep_type_iterator GTI = gep_type_begin(GEPOp);
     for (User::const_op_iterator I = GEPOp->op_begin()+1,
@@ -337,38 +337,37 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
         // For a struct, add the member offset.
         unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
         if (FieldNo == 0) continue;
-        
+
         BaseOffs += TD->getStructLayout(STy)->getElementOffset(FieldNo);
         continue;
       }
-      
+
       // For an array/pointer, add the element offset, explicitly scaled.
       if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) {
         if (CIdx->isZero()) continue;
         BaseOffs += TD->getTypeAllocSize(*GTI)*CIdx->getSExtValue();
         continue;
       }
-      
+
       uint64_t Scale = TD->getTypeAllocSize(*GTI);
       ExtensionKind Extension = EK_NotExtended;
-      
+
       // If the integer type is smaller than the pointer size, it is implicitly
       // sign extended to pointer size.
-      unsigned Width = cast<IntegerType>(Index->getType())->getBitWidth();
-      if (TD->getPointerSizeInBits() > Width)
+      unsigned Width = Index->getType()->getIntegerBitWidth();
+      if (TD->getPointerSizeInBits(AS) > Width)
         Extension = EK_SignExt;
-      
+
       // Use GetLinearExpression to decompose the index into a C1*V+C2 form.
       APInt IndexScale(Width, 0), IndexOffset(Width, 0);
       Index = GetLinearExpression(Index, IndexScale, IndexOffset, Extension,
                                   *TD, 0);
-      
+
       // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
       // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
       BaseOffs += IndexOffset.getSExtValue()*Scale;
       Scale *= IndexScale.getSExtValue();
-      
-      
+
       // If we already had an occurrence of this index variable, merge this
       // scale into it.  For example, we want to handle:
       //   A[x][x] -> x*16 + x*4 -> x*20
@@ -381,25 +380,25 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
           break;
         }
       }
-      
+
       // Make sure that we have a scale that makes sense for this target's
       // pointer size.
-      if (unsigned ShiftBits = 64-TD->getPointerSizeInBits()) {
+      if (unsigned ShiftBits = 64 - TD->getPointerSizeInBits(AS)) {
         Scale <<= ShiftBits;
         Scale = (int64_t)Scale >> ShiftBits;
       }
-      
+
       if (Scale) {
         VariableGEPIndex Entry = {Index, Extension,
                                   static_cast<int64_t>(Scale)};
         VarIndices.push_back(Entry);
       }
     }
-    
+
     // Analyze the base pointer next.
     V = GEPOp->getOperand(0);
   } while (--MaxLookup);
-  
+
   // If the chain of expressions is too deep, just return early.
   return V;
 }
@@ -407,7 +406,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
 /// GetIndexDifference - Dest and Src are the variable indices from two
 /// decomposed GetElementPtr instructions GEP1 and GEP2 which have common base
 /// pointers.  Subtract the GEP2 indices from GEP1 to find the symbolic
-/// difference between the two pointers. 
+/// difference between the two pointers.
 static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
                                const SmallVectorImpl<VariableGEPIndex> &Src) {
   if (Src.empty()) return;
@@ -416,12 +415,12 @@ static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
     const Value *V = Src[i].V;
     ExtensionKind Extension = Src[i].Extension;
     int64_t Scale = Src[i].Scale;
-    
+
     // Find V in Dest.  This is N^2, but pointer indices almost never have more
     // than a few variable indexes.
     for (unsigned j = 0, e = Dest.size(); j != e; ++j) {
       if (Dest[j].V != V || Dest[j].Extension != Extension) continue;
-      
+
       // If we found it, subtract off Scale V's from the entry in Dest.  If it
       // goes to zero, remove the entry.
       if (Dest[j].Scale != Scale)
@@ -431,7 +430,7 @@ static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
       Scale = 0;
       break;
     }
-    
+
     // If we didn't consume this entry, add it to the end of the Dest list.
     if (Scale) {
       VariableGEPIndex Entry = { V, Extension, -Scale };
@@ -526,7 +525,7 @@ namespace {
         return (AliasAnalysis*)this;
       return this;
     }
-    
+
   private:
     // AliasCache - Track alias queries to guard against recursion.
     typedef std::pair<Location, Location> LocPair;
@@ -696,7 +695,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
          "AliasAnalysis query involving multiple functions!");
 
   const Value *Object = GetUnderlyingObject(Loc.Ptr, TD);
-  
+
   // If this is a tail call and Loc.Ptr points to a stack location, we know that
   // the tail call cannot access or modify the local stack.
   // We cannot exclude byval arguments here; these belong to the caller of
@@ -706,7 +705,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
     if (const CallInst *CI = dyn_cast<CallInst>(CS.getInstruction()))
       if (CI->isTailCall())
         return NoModRef;
-  
+
   // If the pointer is to a locally allocated object that does not escape,
   // then the call can not mod/ref the pointer unless the call takes the pointer
   // as an argument, and itself doesn't capture it.
@@ -722,7 +721,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
       if (!(*CI)->getType()->isPointerTy() ||
           (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo)))
         continue;
-      
+
       // If this is a no-capture pointer argument, see if we can tell that it
       // is impossible to alias the pointer we're checking.  If not, we have to
       // assume that the call could touch the pointer, even though it doesn't
@@ -732,7 +731,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
         break;
       }
     }
-    
+
     if (!PassedAsArg)
       return NoModRef;
   }
@@ -821,7 +820,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
     }
 
   // We can bound the aliasing properties of memset_pattern16 just as we can
-  // for memcpy/memset.  This is particularly important because the 
+  // for memcpy/memset.  This is particularly important because the
   // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
   // whenever possible.
   else if (TLI.has(LibFunc::memset_pattern16) &&
@@ -925,22 +924,22 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
         GEP1VariableIndices.clear();
       }
     }
-    
+
     // If we get a No or May, then return it immediately, no amount of analysis
     // will improve this situation.
     if (BaseAlias != MustAlias) return BaseAlias;
-    
+
     // Otherwise, we have a MustAlias.  Since the base pointers alias each other
     // exactly, see if the computed offset from the common pointer tells us
     // about the relation of the resulting pointer.
     const Value *GEP1BasePtr =
       DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
-    
+
     int64_t GEP2BaseOffset;
     SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
     const Value *GEP2BasePtr =
       DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, TD);
-    
+
     // DecomposeGEPExpression and GetUnderlyingObject should return the
     // same result except when DecomposeGEPExpression has no DataLayout.
     if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
@@ -948,12 +947,12 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
              "DecomposeGEPExpression and GetUnderlyingObject disagree!");
       return MayAlias;
     }
-    
+
     // Subtract the GEP2 pointer from the GEP1 pointer to find out their
     // symbolic difference.
     GEP1BaseOffset -= GEP2BaseOffset;
     GetIndexDifference(GEP1VariableIndices, GEP2VariableIndices);
-    
+
   } else {
     // Check to see if these two pointers are related by the getelementptr
     // instruction.  If one pointer is a GEP with a non-zero index of the other
@@ -975,7 +974,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
 
     const Value *GEP1BasePtr =
       DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
-    
+
     // DecomposeGEPExpression and GetUnderlyingObject should return the
     // same result except when DecomposeGEPExpression has no DataLayout.
     if (GEP1BasePtr != UnderlyingV1) {
@@ -984,7 +983,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
       return MayAlias;
     }
   }
-  
+
   // In the two GEP Case, if there is no difference in the offsets of the
   // computed pointers, the resultant pointers are a must alias.  This
   // hapens when we have two lexically identical GEP's (for example).
@@ -1226,7 +1225,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
     if ((isa<ConstantPointerNull>(O2) && isKnownNonNull(O1)) ||
         (isa<ConstantPointerNull>(O1) && isKnownNonNull(O2)))
       return NoAlias;
-  
+
     // If one pointer is the result of a call/invoke or load and the other is a
     // non-escaping local object within the same function, then we know the
     // object couldn't escape to a point where the call could return it.
@@ -1248,7 +1247,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
     if ((V1Size != UnknownSize && isObjectSmallerThan(O2, V1Size, *TD, *TLI)) ||
         (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD, *TLI)))
       return NoAlias;
-  
+
   // Check the cache before climbing up use-def chains. This also terminates
   // otherwise infinitely recursive queries.
   LocPair Locs(Location(V1, V1Size, V1TBAAInfo),
diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp
index 8469556..62f3ab1 100644
--- a/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/lib/Analysis/BlockFrequencyInfo.cpp
@@ -1,4 +1,4 @@
-//=======-------- BlockFrequencyInfo.cpp - Block Frequency Analysis -------=======//
+//=======-------- BlockFrequencyInfo.cpp - Block Frequency Analysis -------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,14 +17,97 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
 
 using namespace llvm;
 
-INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis",
-                      true, true)
+#ifndef NDEBUG
+enum GVDAGType {
+  GVDT_None,
+  GVDT_Fraction,
+  GVDT_Integer
+};
+
+static cl::opt<GVDAGType>
+ViewBlockFreqPropagationDAG("view-block-freq-propagation-dags", cl::Hidden,
+          cl::desc("Pop up a window to show a dag displaying how block "
+                   "frequencies propagation through the CFG."),
+          cl::values(
+            clEnumValN(GVDT_None, "none",
+                       "do not display graphs."),
+            clEnumValN(GVDT_Fraction, "fraction", "display a graph using the "
+                       "fractional block frequency representation."),
+            clEnumValN(GVDT_Integer, "integer", "display a graph using the raw "
+                       "integer fractional block frequency representation."),
+            clEnumValEnd));
+
+namespace llvm {
+
+template <>
+struct GraphTraits<BlockFrequencyInfo *> {
+  typedef const BasicBlock NodeType;
+  typedef succ_const_iterator ChildIteratorType;
+  typedef Function::const_iterator nodes_iterator;
+
+  static inline const NodeType *getEntryNode(const BlockFrequencyInfo *G) {
+    return G->getFunction()->begin();
+  }
+  static ChildIteratorType child_begin(const NodeType *N) {
+    return succ_begin(N);
+  }
+  static ChildIteratorType child_end(const NodeType *N) {
+    return succ_end(N);
+  }
+  static nodes_iterator nodes_begin(const BlockFrequencyInfo *G) {
+    return G->getFunction()->begin();
+  }
+  static nodes_iterator nodes_end(const BlockFrequencyInfo *G) {
+    return G->getFunction()->end();
+  }
+};
+
+template<>
+struct DOTGraphTraits<BlockFrequencyInfo*> : public DefaultDOTGraphTraits {
+  explicit DOTGraphTraits(bool isSimple=false) :
+    DefaultDOTGraphTraits(isSimple) {}
+
+  static std::string getGraphName(const BlockFrequencyInfo *G) {
+    return G->getFunction()->getName();
+  }
+
+  std::string getNodeLabel(const BasicBlock *Node,
+                           const BlockFrequencyInfo *Graph) {
+    std::string Result;
+    raw_string_ostream OS(Result);
+
+    OS << Node->getName().str() << ":";
+    switch (ViewBlockFreqPropagationDAG) {
+    case GVDT_Fraction:
+      Graph->getBlockFreq(Node).print(OS);
+      break;
+    case GVDT_Integer:
+      OS << Graph->getBlockFreq(Node).getFrequency();
+      break;
+    case GVDT_None:
+      llvm_unreachable("If we are not supposed to render a graph we should "
+                       "never reach this point.");
+    }
+
+    return Result;
+  }
+};
+
+} // end namespace llvm
+#endif
+
+INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq",
+                      "Block Frequency Analysis", true, true)
 INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo)
-INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis",
-                    true, true)
+INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq",
+                    "Block Frequency Analysis", true, true)
 
 char BlockFrequencyInfo::ID = 0;
 
@@ -46,6 +129,10 @@ void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
 bool BlockFrequencyInfo::runOnFunction(Function &F) {
   BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>();
   BFI->doFunction(&F, &BPI);
+#ifndef NDEBUG
+  if (ViewBlockFreqPropagationDAG != GVDT_None)
+    view();
+#endif
   return false;
 }
 
@@ -56,3 +143,19 @@ void BlockFrequencyInfo::print(raw_ostream &O, const Module *) const {
 BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const {
   return BFI->getBlockFreq(BB);
 }
+
+/// Pop up a ghostview window with the current block frequency propagation
+/// rendered using dot.
+void BlockFrequencyInfo::view() const {
+// This code is only for debugging.
+#ifndef NDEBUG
+  ViewGraph(const_cast<BlockFrequencyInfo *>(this), "BlockFrequencyDAGs");
+#else
+  errs() << "BlockFrequencyInfo::view is only available in debug builds on "
+            "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+const Function *BlockFrequencyInfo::getFunction() const {
+  return BFI->Fn;
+}
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 7cdf828..86560ca 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -398,10 +398,24 @@ bool BranchProbabilityInfo::calcZeroHeuristics(BasicBlock *BB) {
     // InstCombine canonicalizes X <= 0 into X < 1.
     // X <= 0   ->  Unlikely
     isProb = false;
-  } else if (CV->isAllOnesValue() && CI->getPredicate() == CmpInst::ICMP_SGT) {
-    // InstCombine canonicalizes X >= 0 into X > -1.
-    // X >= 0   ->  Likely
-    isProb = true;
+  } else if (CV->isAllOnesValue()) {
+    switch (CI->getPredicate()) {
+    case CmpInst::ICMP_EQ:
+      // X == -1  ->  Unlikely
+      isProb = false;
+      break;
+    case CmpInst::ICMP_NE:
+      // X != -1  ->  Likely
+      isProb = true;
+      break;
+    case CmpInst::ICMP_SGT:
+      // InstCombine canonicalizes X >= 0 into X > -1.
+      // X >= 0   ->  Likely
+      isProb = true;
+      break;
+    default:
+      return false;
+    }
   } else {
     return false;
   }
diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp
index a5ed21a..c3f32d3 100644
--- a/lib/Analysis/CFG.cpp
+++ b/lib/Analysis/CFG.cpp
@@ -116,7 +116,7 @@ bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
 
 // LoopInfo contains a mapping from basic block to the innermost loop. Find
 // the outermost loop in the loop nest that contains BB.
-static const Loop *getOutermostLoop(LoopInfo *LI, const BasicBlock *BB) {
+static const Loop *getOutermostLoop(const LoopInfo *LI, const BasicBlock *BB) {
   const Loop *L = LI->getLoopFor(BB);
   if (L) {
     while (const Loop *Parent = L->getParentLoop())
@@ -126,60 +126,17 @@ static const Loop *getOutermostLoop(LoopInfo *LI, const BasicBlock *BB) {
 }
 
 // True if there is a loop which contains both BB1 and BB2.
-static bool loopContainsBoth(LoopInfo *LI,
+static bool loopContainsBoth(const LoopInfo *LI,
                              const BasicBlock *BB1, const BasicBlock *BB2) {
   const Loop *L1 = getOutermostLoop(LI, BB1);
   const Loop *L2 = getOutermostLoop(LI, BB2);
   return L1 != NULL && L1 == L2;
 }
 
-static bool isPotentiallyReachableSameBlock(const Instruction *A,
-                                            const Instruction *B,
-                                            LoopInfo *LI) {
-  // The same block case is special because it's the only time we're looking
-  // within a single block to see which comes first. Once we start looking at
-  // multiple blocks, the first instruction of the block is reachable, so we
-  // only need to determine reachability between whole blocks.
-
-  const BasicBlock *BB = A->getParent();
-  // If the block is in a loop then we can reach any instruction in the block
-  // from any other instruction in the block by going around the backedge.
-  // Check whether we're in a loop (or aren't sure).
-
-  // Can't be in a loop if it's the entry block -- the entry block may not
-  // have predecessors.
-  bool HasLoop = BB != &BB->getParent()->getEntryBlock();
-
-  // Can't be in a loop if LoopInfo doesn't know about it.
-  if (LI && HasLoop) {
-    HasLoop = LI->getLoopFor(BB) != 0;
-  }
-  if (HasLoop)
-    return true;
-
-  // Linear scan, start at 'A', see whether we hit 'B' or the end first.
-  for (BasicBlock::const_iterator I = A, E = BB->end(); I != E; ++I) {
-    if (&*I == B)
-      return true;
-  }
-  return false;
-}
-
-bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B,
-                                  DominatorTree *DT, LoopInfo *LI) {
-  assert(A->getParent()->getParent() == B->getParent()->getParent() &&
-         "This analysis is function-local!");
-
-  const BasicBlock *StopBB = B->getParent();
-
-  if (A->getParent() == B->getParent())
-    return isPotentiallyReachableSameBlock(A, B, LI);
-
-  if (A->getParent() == &A->getParent()->getParent()->getEntryBlock())
-    return true;
-  if (B->getParent() == &A->getParent()->getParent()->getEntryBlock())
-    return false;
-
+static bool isPotentiallyReachableInner(SmallVectorImpl<BasicBlock *> &Worklist,
+                                        BasicBlock *StopBB,
+                                        const DominatorTree *DT,
+                                        const LoopInfo *LI) {
   // When the stop block is unreachable, it's dominated from everywhere,
   // regardless of whether there's a path between the two blocks.
   if (DT && !DT->isReachableFromEntry(StopBB))
@@ -188,11 +145,7 @@ bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B,
   // Limit the number of blocks we visit. The goal is to avoid run-away compile
   // times on large CFGs without hampering sensible code. Arbitrarily chosen.
   unsigned Limit = 32;
-
   SmallSet<const BasicBlock*, 64> Visited;
-  SmallVector<BasicBlock*, 32> Worklist;
-  Worklist.push_back(const_cast<BasicBlock*>(A->getParent()));
-
   do {
     BasicBlock *BB = Worklist.pop_back_val();
     if (!Visited.insert(BB))
@@ -221,7 +174,72 @@ bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B,
     }
   } while (!Worklist.empty());
 
-  // We have exhaustived all possible paths and are certain that 'To' can not
-  // be reached from 'From'.
+  // We have exhausted all possible paths and are certain that 'To' can not be
+  // reached from 'From'.
   return false;
 }
+
+bool llvm::isPotentiallyReachable(const BasicBlock *A, const BasicBlock *B,
+                                  const DominatorTree *DT, const LoopInfo *LI) {
+  assert(A->getParent() == B->getParent() &&
+         "This analysis is function-local!");
+
+  SmallVector<BasicBlock*, 32> Worklist;
+  Worklist.push_back(const_cast<BasicBlock*>(A));
+
+  return isPotentiallyReachableInner(Worklist, const_cast<BasicBlock*>(B),
+                                     DT, LI);
+}
+
+bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B,
+                                  const DominatorTree *DT, const LoopInfo *LI) {
+  assert(A->getParent()->getParent() == B->getParent()->getParent() &&
+         "This analysis is function-local!");
+
+  SmallVector<BasicBlock*, 32> Worklist;
+
+  if (A->getParent() == B->getParent()) {
+    // The same block case is special because it's the only time we're looking
+    // within a single block to see which instruction comes first. Once we
+    // start looking at multiple blocks, the first instruction of the block is
+    // reachable, so we only need to determine reachability between whole
+    // blocks.
+    BasicBlock *BB = const_cast<BasicBlock *>(A->getParent());
+
+    // If the block is in a loop then we can reach any instruction in the block
+    // from any other instruction in the block by going around a backedge.
+    if (LI && LI->getLoopFor(BB) != 0)
+      return true;
+
+    // Linear scan, start at 'A', see whether we hit 'B' or the end first.
+    for (BasicBlock::const_iterator I = A, E = BB->end(); I != E; ++I) {
+      if (&*I == B)
+        return true;
+    }
+
+    // Can't be in a loop if it's the entry block -- the entry block may not
+    // have predecessors.
+    if (BB == &BB->getParent()->getEntryBlock())
+      return false;
+
+    // Otherwise, continue doing the normal per-BB CFG walk.
+    for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
+      Worklist.push_back(*I);
+
+    if (Worklist.empty()) {
+      // We've proven that there's no path!
+      return false;
+    }
+  } else {
+    Worklist.push_back(const_cast<BasicBlock*>(A->getParent()));
+  }
+
+  if (A->getParent() == &A->getParent()->getParent()->getEntryBlock())
+    return true;
+  if (B->getParent() == &A->getParent()->getParent()->getEntryBlock())
+    return false;
+
+  return isPotentiallyReachableInner(Worklist,
+                                     const_cast<BasicBlock*>(B->getParent()),
+                                     DT, LI);
+}
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 94ded34..3624aac 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -14,6 +14,7 @@ add_llvm_library(LLVMAnalysis
   CostModel.cpp
   CodeMetrics.cpp
   ConstantFolding.cpp
+  Delinearization.cpp
   DependenceAnalysis.cpp
   DomPrinter.cpp
   DominanceFrontier.cpp
@@ -35,17 +36,7 @@ add_llvm_library(LLVMAnalysis
   ModuleDebugInfoPrinter.cpp
   NoAliasAnalysis.cpp
   PHITransAddr.cpp
-  PathNumbering.cpp
-  PathProfileInfo.cpp
-  PathProfileVerifier.cpp
   PostDominators.cpp
-  ProfileEstimatorPass.cpp
-  ProfileInfo.cpp
-  ProfileInfoLoader.cpp
-  ProfileInfoLoaderPass.cpp
-  ProfileVerifierPass.cpp
-  ProfileDataLoader.cpp
-  ProfileDataLoaderPass.cpp
   PtrUseVisitor.cpp
   RegionInfo.cpp
   RegionPass.cpp
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index 9eb76a8..79fab1b 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -146,8 +146,14 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
     case Instruction::PHI:
     case Instruction::Select:
       // The original value is not captured via this if the new value isn't.
+      Count = 0;
       for (Instruction::use_iterator UI = I->use_begin(), UE = I->use_end();
            UI != UE; ++UI) {
+        // If there are lots of uses, conservatively say that the value
+        // is captured to avoid taking too much compile time.
+        if (Count++ >= Threshold)
+          return Tracker->tooManyUses();
+
         Use *U = &UI.getUse();
         if (Visited.insert(U))
           if (Tracker->shouldExplore(U))
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index bc0dffc..3d32232 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -224,7 +224,8 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
                                        APInt &Offset, const DataLayout &TD) {
   // Trivial case, constant is the global.
   if ((GV = dyn_cast<GlobalValue>(C))) {
-    Offset.clearAllBits();
+    unsigned BitWidth = TD.getPointerTypeSizeInBits(GV->getType());
+    Offset = APInt(BitWidth, 0);
     return true;
   }
 
@@ -238,16 +239,23 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
     return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD);
 
   // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5)
-  if (GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) {
-    // If the base isn't a global+constant, we aren't either.
-    if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD))
-      return false;
+  GEPOperator *GEP = dyn_cast<GEPOperator>(CE);
+  if (!GEP)
+    return false;
 
-    // Otherwise, add any offset that our operands provide.
-    return GEP->accumulateConstantOffset(TD, Offset);
-  }
+  unsigned BitWidth = TD.getPointerTypeSizeInBits(GEP->getType());
+  APInt TmpOffset(BitWidth, 0);
 
-  return false;
+  // If the base isn't a global+constant, we aren't either.
+  if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, TmpOffset, TD))
+    return false;
+
+  // Otherwise, add any offset that our operands provide.
+  if (!GEP->accumulateConstantOffset(TD, TmpOffset))
+    return false;
+
+  Offset = TmpOffset;
+  return true;
 }
 
 /// ReadDataFromGlobal - Recursive helper to read bits out of global.  C is the
@@ -324,12 +332,12 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
       // If we read all of the bytes we needed from this element we're done.
       uint64_t NextEltOffset = SL->getElementOffset(Index);
 
-      if (BytesLeft <= NextEltOffset-CurEltOffset-ByteOffset)
+      if (BytesLeft <= NextEltOffset - CurEltOffset - ByteOffset)
         return true;
 
       // Move to the next element of the struct.
-      CurPtr += NextEltOffset-CurEltOffset-ByteOffset;
-      BytesLeft -= NextEltOffset-CurEltOffset-ByteOffset;
+      CurPtr += NextEltOffset - CurEltOffset - ByteOffset;
+      BytesLeft -= NextEltOffset - CurEltOffset - ByteOffset;
       ByteOffset = 0;
       CurEltOffset = NextEltOffset;
     }
@@ -338,7 +346,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
 
   if (isa<ConstantArray>(C) || isa<ConstantVector>(C) ||
       isa<ConstantDataSequential>(C)) {
-    Type *EltTy = cast<SequentialType>(C->getType())->getElementType();
+    Type *EltTy = C->getType()->getSequentialElementType();
     uint64_t EltSize = TD.getTypeAllocSize(EltTy);
     uint64_t Index = ByteOffset / EltSize;
     uint64_t Offset = ByteOffset - Index * EltSize;
@@ -346,7 +354,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
     if (ArrayType *AT = dyn_cast<ArrayType>(C->getType()))
       NumElts = AT->getNumElements();
     else
-      NumElts = cast<VectorType>(C->getType())->getNumElements();
+      NumElts = C->getType()->getVectorNumElements();
 
     for (; Index != NumElts; ++Index) {
       if (!ReadDataFromGlobal(C->getAggregateElement(Index), Offset, CurPtr,
@@ -367,9 +375,10 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
 
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
     if (CE->getOpcode() == Instruction::IntToPtr &&
-        CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getContext()))
+        CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getType())) {
       return ReadDataFromGlobal(CE->getOperand(0), ByteOffset, CurPtr,
                                 BytesLeft, TD);
+    }
   }
 
   // Otherwise, unknown initializer type.
@@ -378,26 +387,29 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
 
 static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
                                                  const DataLayout &TD) {
-  Type *LoadTy = cast<PointerType>(C->getType())->getElementType();
+  PointerType *PTy = cast<PointerType>(C->getType());
+  Type *LoadTy = PTy->getElementType();
   IntegerType *IntType = dyn_cast<IntegerType>(LoadTy);
 
   // If this isn't an integer load we can't fold it directly.
   if (!IntType) {
+    unsigned AS = PTy->getAddressSpace();
+
     // If this is a float/double load, we can try folding it as an int32/64 load
     // and then bitcast the result.  This can be useful for union cases.  Note
     // that address spaces don't matter here since we're not going to result in
     // an actual new load.
     Type *MapTy;
     if (LoadTy->isHalfTy())
-      MapTy = Type::getInt16PtrTy(C->getContext());
+      MapTy = Type::getInt16PtrTy(C->getContext(), AS);
     else if (LoadTy->isFloatTy())
-      MapTy = Type::getInt32PtrTy(C->getContext());
+      MapTy = Type::getInt32PtrTy(C->getContext(), AS);
     else if (LoadTy->isDoubleTy())
-      MapTy = Type::getInt64PtrTy(C->getContext());
+      MapTy = Type::getInt64PtrTy(C->getContext(), AS);
     else if (LoadTy->isVectorTy()) {
-      MapTy = IntegerType::get(C->getContext(),
-                               TD.getTypeAllocSizeInBits(LoadTy));
-      MapTy = PointerType::getUnqual(MapTy);
+      MapTy = PointerType::getIntNPtrTy(C->getContext(),
+                                        TD.getTypeAllocSizeInBits(LoadTy),
+                                        AS);
     } else
       return 0;
 
@@ -408,10 +420,11 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
   }
 
   unsigned BytesLoaded = (IntType->getBitWidth() + 7) / 8;
-  if (BytesLoaded > 32 || BytesLoaded == 0) return 0;
+  if (BytesLoaded > 32 || BytesLoaded == 0)
+    return 0;
 
   GlobalValue *GVal;
-  APInt Offset(TD.getPointerSizeInBits(), 0);
+  APInt Offset;
   if (!IsConstantOffsetFromGlobal(C, GVal, Offset, TD))
     return 0;
 
@@ -422,7 +435,8 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
 
   // If we're loading off the beginning of the global, some bytes may be valid,
   // but we don't try to handle this.
-  if (Offset.isNegative()) return 0;
+  if (Offset.isNegative())
+    return 0;
 
   // If we're not accessing anything in this constant, the result is undefined.
   if (Offset.getZExtValue() >=
@@ -439,7 +453,7 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
     ResultVal = RawBytes[BytesLoaded - 1];
     for (unsigned i = 1; i != BytesLoaded; ++i) {
       ResultVal <<= 8;
-      ResultVal |= RawBytes[BytesLoaded-1-i];
+      ResultVal |= RawBytes[BytesLoaded - 1 - i];
     }
   } else {
     ResultVal = RawBytes[0];
@@ -464,14 +478,17 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
 
   // If the loaded value isn't a constant expr, we can't handle it.
   ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
-  if (!CE) return 0;
+  if (!CE)
+    return 0;
 
   if (CE->getOpcode() == Instruction::GetElementPtr) {
-    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
-      if (GV->isConstant() && GV->hasDefinitiveInitializer())
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
+      if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
         if (Constant *V =
              ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
           return V;
+      }
+    }
   }
 
   // Instead of loading constant c string, use corresponding integer value
@@ -576,13 +593,13 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
   // constant.  This happens frequently when iterating over a global array.
   if (Opc == Instruction::Sub && DL) {
     GlobalValue *GV1, *GV2;
-    unsigned PtrSize = DL->getPointerSizeInBits();
-    unsigned OpSize = DL->getTypeSizeInBits(Op0->getType());
-    APInt Offs1(PtrSize, 0), Offs2(PtrSize, 0);
+    APInt Offs1, Offs2;
 
     if (IsConstantOffsetFromGlobal(Op0, GV1, Offs1, *DL))
       if (IsConstantOffsetFromGlobal(Op1, GV2, Offs2, *DL) &&
           GV1 == GV2) {
+        unsigned OpSize = DL->getTypeSizeInBits(Op0->getType());
+
         // (&GV+C1) - (&GV+C2) -> C1-C2, pointer arithmetic cannot overflow.
         // PtrToInt may change the bitwidth so we have convert to the right size
         // first.
@@ -600,15 +617,18 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
 static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
                                 Type *ResultTy, const DataLayout *TD,
                                 const TargetLibraryInfo *TLI) {
-  if (!TD) return 0;
-  Type *IntPtrTy = TD->getIntPtrType(ResultTy->getContext());
+  if (!TD)
+    return 0;
+
+  Type *IntPtrTy = TD->getIntPtrType(ResultTy);
 
   bool Any = false;
   SmallVector<Constant*, 32> NewIdxs;
   for (unsigned i = 1, e = Ops.size(); i != e; ++i) {
     if ((i == 1 ||
-         !isa<StructType>(GetElementPtrInst::getIndexedType(Ops[0]->getType(),
-                                                        Ops.slice(1, i-1)))) &&
+         !isa<StructType>(GetElementPtrInst::getIndexedType(
+                            Ops[0]->getType(),
+                            Ops.slice(1, i - 1)))) &&
         Ops[i]->getType() != IntPtrTy) {
       Any = true;
       NewIdxs.push_back(ConstantExpr::getCast(CastInst::getCastOpcode(Ops[i],
@@ -619,13 +639,16 @@ static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
     } else
       NewIdxs.push_back(Ops[i]);
   }
-  if (!Any) return 0;
 
-  Constant *C =
-    ConstantExpr::getGetElementPtr(Ops[0], NewIdxs);
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+  if (!Any)
+    return 0;
+
+  Constant *C = ConstantExpr::getGetElementPtr(Ops[0], NewIdxs);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
     if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
       C = Folded;
+  }
+
   return C;
 }
 
@@ -640,7 +663,7 @@ static Constant* StripPtrCastKeepAS(Constant* Ptr) {
   if (NewPtrTy->getAddressSpace() != OldPtrTy->getAddressSpace()) {
     NewPtrTy = NewPtrTy->getElementType()->getPointerTo(
       OldPtrTy->getAddressSpace());
-    Ptr = ConstantExpr::getBitCast(Ptr, NewPtrTy);
+    Ptr = ConstantExpr::getPointerCast(Ptr, NewPtrTy);
   }
   return Ptr;
 }
@@ -651,11 +674,12 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
                                          Type *ResultTy, const DataLayout *TD,
                                          const TargetLibraryInfo *TLI) {
   Constant *Ptr = Ops[0];
-  if (!TD || !cast<PointerType>(Ptr->getType())->getElementType()->isSized() ||
+  if (!TD || !Ptr->getType()->getPointerElementType()->isSized() ||
       !Ptr->getType()->isPointerTy())
     return 0;
 
-  Type *IntPtrTy = TD->getIntPtrType(Ptr->getContext());
+  Type *IntPtrTy = TD->getIntPtrType(Ptr->getType());
+  Type *ResultElementTy = ResultTy->getPointerElementType();
 
   // If this is a constant expr gep that is effectively computing an
   // "offsetof", fold it into 'cast int Size to T*' instead of 'gep 0, 0, 12'
@@ -664,8 +688,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
 
       // If this is "gep i8* Ptr, (sub 0, V)", fold this as:
       // "inttoptr (sub (ptrtoint Ptr), V)"
-      if (Ops.size() == 2 &&
-          cast<PointerType>(ResultTy)->getElementType()->isIntegerTy(8)) {
+      if (Ops.size() == 2 && ResultElementTy->isIntegerTy(8)) {
         ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[1]);
         assert((CE == 0 || CE->getType() == IntPtrTy) &&
                "CastGEPIndices didn't canonicalize index types!");
@@ -692,7 +715,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
 
   // If this is a GEP of a GEP, fold it all into a single GEP.
   while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
-    SmallVector<Value *, 4> NestedOps(GEP->op_begin()+1, GEP->op_end());
+    SmallVector<Value *, 4> NestedOps(GEP->op_begin() + 1, GEP->op_end());
 
     // Do not try the incorporate the sub-GEP if some index is not a number.
     bool AllConstantInt = true;
@@ -713,12 +736,15 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
   // If the base value for this address is a literal integer value, fold the
   // getelementptr to the resulting integer value casted to the pointer type.
   APInt BasePtr(BitWidth, 0);
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
-    if (CE->getOpcode() == Instruction::IntToPtr)
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+    if (CE->getOpcode() == Instruction::IntToPtr) {
       if (ConstantInt *Base = dyn_cast<ConstantInt>(CE->getOperand(0)))
         BasePtr = Base->getValue().zextOrTrunc(BitWidth);
+    }
+  }
+
   if (Ptr->isNullValue() || BasePtr != 0) {
-    Constant *C = ConstantInt::get(Ptr->getContext(), Offset+BasePtr);
+    Constant *C = ConstantInt::get(Ptr->getContext(), Offset + BasePtr);
     return ConstantExpr::getIntToPtr(C, ResultTy);
   }
 
@@ -728,7 +754,8 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
   // Also, this helps GlobalOpt do SROA on GlobalVariables.
   Type *Ty = Ptr->getType();
   assert(Ty->isPointerTy() && "Forming regular GEP of non-pointer type");
-  SmallVector<Constant*, 32> NewIdxs;
+  SmallVector<Constant *, 32> NewIdxs;
+
   do {
     if (SequentialType *ATy = dyn_cast<SequentialType>(Ty)) {
       if (ATy->isPointerTy()) {
@@ -743,7 +770,6 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
 
       // Determine which element of the array the offset points into.
       APInt ElemSize(BitWidth, TD->getTypeAllocSize(ATy->getElementType()));
-      IntegerType *IntPtrTy = TD->getIntPtrType(Ty->getContext());
       if (ElemSize == 0)
         // The element size is 0. This may be [0 x Ty]*, so just use a zero
         // index for this level and proceed to the next level to see if it can
@@ -778,7 +804,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
       // We've reached some non-indexable type.
       break;
     }
-  } while (Ty != cast<PointerType>(ResultTy)->getElementType());
+  } while (Ty != ResultElementTy);
 
   // If we haven't used up the entire offset by descending the static
   // type, then the offset is pointing into the middle of an indivisible
@@ -787,14 +813,13 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
     return 0;
 
   // Create a GEP.
-  Constant *C =
-    ConstantExpr::getGetElementPtr(Ptr, NewIdxs);
-  assert(cast<PointerType>(C->getType())->getElementType() == Ty &&
+  Constant *C = ConstantExpr::getGetElementPtr(Ptr, NewIdxs);
+  assert(C->getType()->getPointerElementType() == Ty &&
          "Computed GetElementPtr has unexpected type!");
 
   // If we ended up indexing a member with a type that doesn't match
   // the type of what the original indices indexed, add a cast.
-  if (Ty != cast<PointerType>(ResultTy)->getElementType())
+  if (Ty != ResultElementTy)
     C = FoldBitCast(C, ResultTy, *TD);
 
   return C;
@@ -867,16 +892,18 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I,
   if (const LoadInst *LI = dyn_cast<LoadInst>(I))
     return ConstantFoldLoadInst(LI, TD);
 
-  if (InsertValueInst *IVI = dyn_cast<InsertValueInst>(I))
+  if (InsertValueInst *IVI = dyn_cast<InsertValueInst>(I)) {
     return ConstantExpr::getInsertValue(
                                 cast<Constant>(IVI->getAggregateOperand()),
                                 cast<Constant>(IVI->getInsertedValueOperand()),
                                 IVI->getIndices());
+  }
 
-  if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I))
+  if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I)) {
     return ConstantExpr::getExtractValue(
                                     cast<Constant>(EVI->getAggregateOperand()),
                                     EVI->getIndices());
+  }
 
   return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, TD, TLI);
 }
@@ -930,9 +957,10 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
                                          const TargetLibraryInfo *TLI) {
   // Handle easy binops first.
   if (Instruction::isBinaryOp(Opcode)) {
-    if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1]))
+    if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1])) {
       if (Constant *C = SymbolicallyEvaluateBinop(Opcode, Ops[0], Ops[1], TD))
         return C;
+    }
 
     return ConstantExpr::get(Opcode, Ops[0], Ops[1]);
   }
@@ -953,10 +981,11 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
       if (TD && CE->getOpcode() == Instruction::IntToPtr) {
         Constant *Input = CE->getOperand(0);
         unsigned InWidth = Input->getType()->getScalarSizeInBits();
-        if (TD->getPointerSizeInBits() < InWidth) {
+        unsigned PtrWidth = TD->getPointerTypeSizeInBits(CE->getType());
+        if (PtrWidth < InWidth) {
           Constant *Mask =
-            ConstantInt::get(CE->getContext(), APInt::getLowBitsSet(InWidth,
-                                                  TD->getPointerSizeInBits()));
+            ConstantInt::get(CE->getContext(),
+                             APInt::getLowBitsSet(InWidth, PtrWidth));
           Input = ConstantExpr::getAnd(Input, Mask);
         }
         // Do a zext or trunc to get to the dest size.
@@ -966,13 +995,22 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
     return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
   case Instruction::IntToPtr:
     // If the input is a ptrtoint, turn the pair into a ptr to ptr bitcast if
-    // the int size is >= the ptr size.  This requires knowing the width of a
-    // pointer, so it can't be done in ConstantExpr::getCast.
-    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0]))
-      if (TD &&
-          TD->getPointerSizeInBits() <= CE->getType()->getScalarSizeInBits() &&
-          CE->getOpcode() == Instruction::PtrToInt)
-        return FoldBitCast(CE->getOperand(0), DestTy, *TD);
+    // the int size is >= the ptr size and the address spaces are the same.
+    // This requires knowing the width of a pointer, so it can't be done in
+    // ConstantExpr::getCast.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0])) {
+      if (TD && CE->getOpcode() == Instruction::PtrToInt) {
+        Constant *SrcPtr = CE->getOperand(0);
+        unsigned SrcPtrSize = TD->getPointerTypeSizeInBits(SrcPtr->getType());
+        unsigned MidIntSize = CE->getType()->getScalarSizeInBits();
+
+        if (MidIntSize >= SrcPtrSize) {
+          unsigned SrcAS = SrcPtr->getType()->getPointerAddressSpace();
+          if (SrcAS == DestTy->getPointerAddressSpace())
+            return FoldBitCast(CE->getOperand(0), DestTy, *TD);
+        }
+      }
+    }
 
     return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
   case Instruction::Trunc:
@@ -984,6 +1022,7 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
   case Instruction::SIToFP:
   case Instruction::FPToUI:
   case Instruction::FPToSI:
+  case Instruction::AddrSpaceCast:
       return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
   case Instruction::BitCast:
     if (TD)
@@ -1024,8 +1063,8 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
   // around to know if bit truncation is happening.
   if (ConstantExpr *CE0 = dyn_cast<ConstantExpr>(Ops0)) {
     if (TD && Ops1->isNullValue()) {
-      Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
       if (CE0->getOpcode() == Instruction::IntToPtr) {
+        Type *IntPtrTy = TD->getIntPtrType(CE0->getType());
         // Convert the integer value to the right size to ensure we get the
         // proper extension or truncation.
         Constant *C = ConstantExpr::getIntegerCast(CE0->getOperand(0),
@@ -1036,19 +1075,21 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
 
       // Only do this transformation if the int is intptrty in size, otherwise
       // there is a truncation or extension that we aren't modeling.
-      if (CE0->getOpcode() == Instruction::PtrToInt &&
-          CE0->getType() == IntPtrTy) {
-        Constant *C = CE0->getOperand(0);
-        Constant *Null = Constant::getNullValue(C->getType());
-        return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI);
+      if (CE0->getOpcode() == Instruction::PtrToInt) {
+        Type *IntPtrTy = TD->getIntPtrType(CE0->getOperand(0)->getType());
+        if (CE0->getType() == IntPtrTy) {
+          Constant *C = CE0->getOperand(0);
+          Constant *Null = Constant::getNullValue(C->getType());
+          return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI);
+        }
       }
     }
 
     if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(Ops1)) {
       if (TD && CE0->getOpcode() == CE1->getOpcode()) {
-        Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
-
         if (CE0->getOpcode() == Instruction::IntToPtr) {
+          Type *IntPtrTy = TD->getIntPtrType(CE0->getType());
+
           // Convert the integer value to the right size to ensure we get the
           // proper extension or truncation.
           Constant *C0 = ConstantExpr::getIntegerCast(CE0->getOperand(0),
@@ -1060,11 +1101,17 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
 
         // Only do this transformation if the int is intptrty in size, otherwise
         // there is a truncation or extension that we aren't modeling.
-        if ((CE0->getOpcode() == Instruction::PtrToInt &&
-             CE0->getType() == IntPtrTy &&
-             CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType()))
-          return ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(0),
-                                                 CE1->getOperand(0), TD, TLI);
+        if (CE0->getOpcode() == Instruction::PtrToInt) {
+          Type *IntPtrTy = TD->getIntPtrType(CE0->getOperand(0)->getType());
+          if (CE0->getType() == IntPtrTy &&
+              CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType()) {
+            return ConstantFoldCompareInstOperands(Predicate,
+                                                   CE0->getOperand(0),
+                                                   CE1->getOperand(0),
+                                                   TD,
+                                                   TLI);
+          }
+        }
       }
     }
 
@@ -1101,7 +1148,8 @@ Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
   // addressing.
   for (unsigned i = 2, e = CE->getNumOperands(); i != e; ++i) {
     C = C->getAggregateElement(CE->getOperand(i));
-    if (C == 0) return 0;
+    if (C == 0)
+      return 0;
   }
   return C;
 }
@@ -1116,7 +1164,8 @@ Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
   // addressing.
   for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
     C = C->getAggregateElement(Indices[i]);
-    if (C == 0) return 0;
+    if (C == 0)
+      return 0;
   }
   return C;
 }
@@ -1128,8 +1177,7 @@ Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
 
 /// canConstantFoldCallTo - Return true if its even possible to fold a call to
 /// the specified function.
-bool
-llvm::canConstantFoldCallTo(const Function *F) {
+bool llvm::canConstantFoldCallTo(const Function *F) {
   switch (F->getIntrinsicID()) {
   case Intrinsic::fabs:
   case Intrinsic::log:
@@ -1167,7 +1215,8 @@ llvm::canConstantFoldCallTo(const Function *F) {
   case 0: break;
   }
 
-  if (!F->hasName()) return false;
+  if (!F->hasName())
+    return false;
   StringRef Name = F->getName();
 
   // In these cases, the check of the length is required.  We don't want to
@@ -1250,7 +1299,7 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
 static Constant *ConstantFoldConvertToInt(const APFloat &Val,
                                           bool roundTowardZero, Type *Ty) {
   // All of these conversion intrinsics form an integer of at most 64bits.
-  unsigned ResultWidth = cast<IntegerType>(Ty)->getBitWidth();
+  unsigned ResultWidth = Ty->getIntegerBitWidth();
   assert(ResultWidth <= 64 &&
          "Can only constant fold conversions to 64 and 32 bit ints");
 
@@ -1271,7 +1320,8 @@ static Constant *ConstantFoldConvertToInt(const APFloat &Val,
 Constant *
 llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
                        const TargetLibraryInfo *TLI) {
-  if (!F->hasName()) return 0;
+  if (!F->hasName())
+    return 0;
   StringRef Name = F->getName();
 
   Type *Ty = F->getReturnType();
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 927508e..f943258 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -19,6 +19,7 @@
 
 #define CM_NAME "cost-model"
 #define DEBUG_TYPE CM_NAME
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
@@ -26,10 +27,15 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+static cl::opt<bool> EnableReduxCost("costmodel-reduxcost", cl::init(false),
+                                     cl::Hidden,
+                                     cl::desc("Recognize reduction patterns."));
+
 namespace {
   class CostModelAnalysis : public FunctionPass {
 
@@ -105,6 +111,260 @@ static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
   return OpInfo;
 }
 
+static bool matchMask(SmallVectorImpl<int> &M1, SmallVectorImpl<int> &M2) {
+  if (M1.size() != M2.size())
+    return false;
+
+  for (unsigned i = 0, e = M1.size(); i != e; ++i)
+    if (M1[i] != M2[i])
+      return false;
+
+  return true;
+}
+
+static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft,
+                                     unsigned Level) {
+  // We don't need a shuffle if we just want to have element 0 in position 0 of
+  // the vector.
+  if (!SI && Level == 0 && IsLeft)
+    return true;
+  else if (!SI)
+    return false;
+
+  SmallVector<int, 32> Mask(SI->getType()->getVectorNumElements(), -1);
+
+  // Build a mask of 0, 2, ... (left) or 1, 3, ... (right) depending on whether
+  // we look at the left or right side.
+  for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2)
+    Mask[i] = val;
+
+  SmallVector<int, 16> ActualMask = SI->getShuffleMask();
+  if (!matchMask(Mask, ActualMask))
+    return false;
+
+  return true;
+}
+
+static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp,
+                                          unsigned Level, unsigned NumLevels) {
+  // Match one level of pairwise operations.
+  // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+  //       <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+  // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+  //       <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+  if (BinOp == 0)
+    return false;
+
+  assert(BinOp->getType()->isVectorTy() && "Expecting a vector type");
+
+  unsigned Opcode = BinOp->getOpcode();
+  Value *L = BinOp->getOperand(0);
+  Value *R = BinOp->getOperand(1);
+
+  ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(L);
+  if (!LS && Level)
+    return false;
+  ShuffleVectorInst *RS = dyn_cast<ShuffleVectorInst>(R);
+  if (!RS && Level)
+    return false;
+
+  // On level 0 we can omit one shufflevector instruction.
+  if (!Level && !RS && !LS)
+    return false;
+
+  // Shuffle inputs must match.
+  Value *NextLevelOpL = LS ? LS->getOperand(0) : 0;
+  Value *NextLevelOpR = RS ? RS->getOperand(0) : 0;
+  Value *NextLevelOp = 0;
+  if (NextLevelOpR && NextLevelOpL) {
+    // If we have two shuffles their operands must match.
+    if (NextLevelOpL != NextLevelOpR)
+      return false;
+
+    NextLevelOp = NextLevelOpL;
+  } else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) {
+    // On the first level we can omit the shufflevector <0, undef,...>. So the
+    // input to the other shufflevector <1, undef> must match with one of the
+    // inputs to the current binary operation.
+    // Example:
+    //  %NextLevelOpL = shufflevector %R, <1, undef ...>
+    //  %BinOp        = fadd          %NextLevelOpL, %R
+    if (NextLevelOpL && NextLevelOpL != R)
+      return false;
+    else if (NextLevelOpR && NextLevelOpR != L)
+      return false;
+
+    NextLevelOp = NextLevelOpL ? R : L;
+  } else
+    return false;
+
+  // Check that the next levels binary operation exists and matches with the
+  // current one.
+  BinaryOperator *NextLevelBinOp = 0;
+  if (Level + 1 != NumLevels) {
+    if (!(NextLevelBinOp = dyn_cast<BinaryOperator>(NextLevelOp)))
+      return false;
+    else if (NextLevelBinOp->getOpcode() != Opcode)
+      return false;
+  }
+
+  // Shuffle mask for pairwise operation must match.
+  if (matchPairwiseShuffleMask(LS, true, Level)) {
+    if (!matchPairwiseShuffleMask(RS, false, Level))
+      return false;
+  } else if (matchPairwiseShuffleMask(RS, true, Level)) {
+    if (!matchPairwiseShuffleMask(LS, false, Level))
+      return false;
+  } else
+    return false;
+
+  if (++Level == NumLevels)
+    return true;
+
+  // Match next level.
+  return matchPairwiseReductionAtLevel(NextLevelBinOp, Level, NumLevels);
+}
+
+static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
+                                   unsigned &Opcode, Type *&Ty) {
+  if (!EnableReduxCost)
+    return false;
+
+  // Need to extract the first element.
+  ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
+  unsigned Idx = ~0u;
+  if (CI)
+    Idx = CI->getZExtValue();
+  if (Idx != 0)
+    return false;
+
+  BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
+  if (!RdxStart)
+    return false;
+
+  Type *VecTy = ReduxRoot->getOperand(0)->getType();
+  unsigned NumVecElems = VecTy->getVectorNumElements();
+  if (!isPowerOf2_32(NumVecElems))
+    return false;
+
+  // We look for a sequence of shuffle,shuffle,add triples like the following
+  // that builds a pairwise reduction tree.
+  //
+  //  (X0, X1, X2, X3)
+  //   (X0 + X1, X2 + X3, undef, undef)
+  //    ((X0 + X1) + (X2 + X3), undef, undef, undef)
+  //
+  // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+  //       <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+  // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+  //       <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+  // %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+  //       <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  // %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+  //       <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  // %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+  // %r = extractelement <4 x float> %bin.rdx8, i32 0
+  if (!matchPairwiseReductionAtLevel(RdxStart, 0,  Log2_32(NumVecElems)))
+    return false;
+
+  Opcode = RdxStart->getOpcode();
+  Ty = VecTy;
+
+  return true;
+}
+
+static std::pair<Value *, ShuffleVectorInst *>
+getShuffleAndOtherOprd(BinaryOperator *B) {
+
+  Value *L = B->getOperand(0);
+  Value *R = B->getOperand(1);
+  ShuffleVectorInst *S = 0;
+
+  if ((S = dyn_cast<ShuffleVectorInst>(L)))
+    return std::make_pair(R, S);
+
+  S = dyn_cast<ShuffleVectorInst>(R);
+  return std::make_pair(L, S);
+}
+
+static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
+                                          unsigned &Opcode, Type *&Ty) {
+  if (!EnableReduxCost)
+    return false;
+
+  // Need to extract the first element.
+  ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
+  unsigned Idx = ~0u;
+  if (CI)
+    Idx = CI->getZExtValue();
+  if (Idx != 0)
+    return false;
+
+  BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
+  if (!RdxStart)
+    return false;
+  unsigned RdxOpcode = RdxStart->getOpcode();
+
+  Type *VecTy = ReduxRoot->getOperand(0)->getType();
+  unsigned NumVecElems = VecTy->getVectorNumElements();
+  if (!isPowerOf2_32(NumVecElems))
+    return false;
+
+  // We look for a sequence of shuffles and adds like the following matching one
+  // fadd, shuffle vector pair at a time.
+  //
+  // %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef,
+  //                           <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  // %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+  // %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
+  //                          <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  // %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+  // %r = extractelement <4 x float> %bin.rdx8, i32 0
+
+  unsigned MaskStart = 1;
+  Value *RdxOp = RdxStart;
+  SmallVector<int, 32> ShuffleMask(NumVecElems, 0);
+  unsigned NumVecElemsRemain = NumVecElems;
+  while (NumVecElemsRemain - 1) {
+    // Check for the right reduction operation.
+    BinaryOperator *BinOp;
+    if (!(BinOp = dyn_cast<BinaryOperator>(RdxOp)))
+      return false;
+    if (BinOp->getOpcode() != RdxOpcode)
+      return false;
+
+    Value *NextRdxOp;
+    ShuffleVectorInst *Shuffle;
+    tie(NextRdxOp, Shuffle) = getShuffleAndOtherOprd(BinOp);
+
+    // Check the current reduction operation and the shuffle use the same value.
+    if (Shuffle == 0)
+      return false;
+    if (Shuffle->getOperand(0) != NextRdxOp)
+      return false;
+
+    // Check that shuffle masks matches.
+    for (unsigned j = 0; j != MaskStart; ++j)
+      ShuffleMask[j] = MaskStart + j;
+    // Fill the rest of the mask with -1 for undef.
+    std::fill(&ShuffleMask[MaskStart], ShuffleMask.end(), -1);
+
+    SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
+    if (!matchMask(ShuffleMask, Mask))
+      return false;
+
+    RdxOp = NextRdxOp;
+    NumVecElemsRemain /= 2;
+    MaskStart *= 2;
+  }
+
+  Opcode = RdxOpcode;
+  Ty = VecTy;
+  return true;
+}
+
 unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
   if (!TTI)
     return -1;
@@ -189,6 +449,17 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
     unsigned Idx = -1;
     if (CI)
       Idx = CI->getZExtValue();
+
+    // Try to match a reduction sequence (series of shufflevector and vector
+    // adds followed by a extractelement).
+    unsigned ReduxOpCode;
+    Type *ReduxType;
+
+    if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType))
+      return TTI->getReductionCost(ReduxOpCode, ReduxType, false);
+    else if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType))
+      return TTI->getReductionCost(ReduxOpCode, ReduxType, true);
+
     return TTI->getVectorInstrCost(I->getOpcode(),
                                    EEI->getOperand(0)->getType(), Idx);
   }
diff --git a/lib/Analysis/Delinearization.cpp b/lib/Analysis/Delinearization.cpp
new file mode 100644
index 0000000..3ed0609
--- /dev/null
+++ b/lib/Analysis/Delinearization.cpp
@@ -0,0 +1,133 @@
+//===---- Delinearization.cpp - MultiDimensional Index Delinearization ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements an analysis pass that tries to delinearize all GEP
+// instructions in all loops using the SCEV analysis functionality. This pass is
+// only used for testing purposes: if your pass needs delinearization, please
+// use the on-demand SCEVAddRecExpr::delinearize() function.
+//
+//===----------------------------------------------------------------------===//
+
+#define DL_NAME "delinearize"
+#define DEBUG_TYPE DL_NAME
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class Delinearization : public FunctionPass {
+  Delinearization(const Delinearization &); // do not implement
+protected:
+  Function *F;
+  LoopInfo *LI;
+  ScalarEvolution *SE;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  Delinearization() : FunctionPass(ID) {
+    initializeDelinearizationPass(*PassRegistry::getPassRegistry());
+  }
+  virtual bool runOnFunction(Function &F);
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  virtual void print(raw_ostream &O, const Module *M = 0) const;
+};
+
+} // end anonymous namespace
+
+void Delinearization::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<LoopInfo>();
+  AU.addRequired<ScalarEvolution>();
+}
+
+bool Delinearization::runOnFunction(Function &F) {
+  this->F = &F;
+  SE = &getAnalysis<ScalarEvolution>();
+  LI = &getAnalysis<LoopInfo>();
+  return false;
+}
+
+static Value *getPointerOperand(Instruction &Inst) {
+  if (LoadInst *Load = dyn_cast<LoadInst>(&Inst))
+    return Load->getPointerOperand();
+  else if (StoreInst *Store = dyn_cast<StoreInst>(&Inst))
+    return Store->getPointerOperand();
+  else if (GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(&Inst))
+    return Gep->getPointerOperand();
+  return NULL;
+}
+
+void Delinearization::print(raw_ostream &O, const Module *) const {
+  O << "Delinearization on function " << F->getName() << ":\n";
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+    Instruction *Inst = &(*I);
+
+    // Only analyze loads and stores.
+    if (!isa<StoreInst>(Inst) && !isa<LoadInst>(Inst) &&
+        !isa<GetElementPtrInst>(Inst))
+      continue;
+
+    const BasicBlock *BB = Inst->getParent();
+    // Delinearize the memory access as analyzed in all the surrounding loops.
+    // Do not analyze memory accesses outside loops.
+    for (Loop *L = LI->getLoopFor(BB); L != NULL; L = L->getParentLoop()) {
+      const SCEV *AccessFn = SE->getSCEVAtScope(getPointerOperand(*Inst), L);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(AccessFn);
+
+      // Do not try to delinearize memory accesses that are not AddRecs.
+      if (!AR)
+        break;
+
+      O << "AddRec: " << *AR << "\n";
+
+      SmallVector<const SCEV *, 3> Subscripts, Sizes;
+      const SCEV *Res = AR->delinearize(*SE, Subscripts, Sizes);
+      int Size = Subscripts.size();
+      if (Res == AR || Size == 0) {
+        O << "failed to delinearize\n";
+        continue;
+      }
+      O << "Base offset: " << *Res << "\n";
+      O << "ArrayDecl[UnknownSize]";
+      for (int i = 0; i < Size - 1; i++)
+        O << "[" << *Sizes[i] << "]";
+      O << " with elements of " << *Sizes[Size - 1] << " bytes.\n";
+
+      O << "ArrayRef";
+      for (int i = 0; i < Size; i++)
+        O << "[" << *Subscripts[i] << "]";
+      O << "\n";
+    }
+  }
+}
+
+char Delinearization::ID = 0;
+static const char delinearization_name[] = "Delinearization";
+INITIALIZE_PASS_BEGIN(Delinearization, DL_NAME, delinearization_name, true,
+                      true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(Delinearization, DL_NAME, delinearization_name, true, true)
+
+FunctionPass *llvm::createDelinearizationPass() { return new Delinearization; }
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
index a0f1a69..3b3e2ef 100644
--- a/lib/Analysis/DependenceAnalysis.cpp
+++ b/lib/Analysis/DependenceAnalysis.cpp
@@ -24,11 +24,11 @@
 // Both of these are conservative weaknesses;
 // that is, not a source of correctness problems.
 //
-// The implementation depends on the GEP instruction to
-// differentiate subscripts. Since Clang linearizes subscripts
-// for most arrays, we give up some precision (though the existing MIV tests
-// will help). We trust that the GEP instruction will eventually be extended.
-// In the meantime, we should explore Maslov's ideas about delinearization.
+// The implementation depends on the GEP instruction to differentiate
+// subscripts. Since Clang linearizes some array subscripts, the dependence
+// analysis is using SCEV->delinearize to recover the representation of multiple
+// subscripts, and thus avoid the more expensive and less precise MIV tests. The
+// delinearization is controlled by the flag -da-delinearize.
 //
 // We should pay some careful attention to the possibility of integer overflow
 // in the implementation of the various tests. This could happen with Add,
@@ -61,6 +61,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/InstIterator.h"
@@ -104,6 +105,10 @@ STATISTIC(BanerjeeApplications, "Banerjee applications");
 STATISTIC(BanerjeeIndependence, "Banerjee independence");
 STATISTIC(BanerjeeSuccesses, "Banerjee successes");
 
+static cl::opt<bool>
+Delinearize("da-delinearize", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+            cl::desc("Try to delinearize array references."));
+
 //===----------------------------------------------------------------------===//
 // basics
 
@@ -3171,6 +3176,55 @@ void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level,
     llvm_unreachable("constraint has unexpected kind");
 }
 
+/// Check if we can delinearize the subscripts. If the SCEVs representing the
+/// source and destination array references are recurrences on a nested loop,
+/// this function flattens the nested recurrences into seperate recurrences
+/// for each loop level.
+bool
+DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV, const SCEV *DstSCEV,
+                                   SmallVectorImpl<Subscript> &Pair) const {
+  const SCEVAddRecExpr *SrcAR = dyn_cast<SCEVAddRecExpr>(SrcSCEV);
+  const SCEVAddRecExpr *DstAR = dyn_cast<SCEVAddRecExpr>(DstSCEV);
+  if (!SrcAR || !DstAR || !SrcAR->isAffine() || !DstAR->isAffine())
+    return false;
+
+  SmallVector<const SCEV *, 4> SrcSubscripts, DstSubscripts, SrcSizes, DstSizes;
+  SrcAR->delinearize(*SE, SrcSubscripts, SrcSizes);
+  DstAR->delinearize(*SE, DstSubscripts, DstSizes);
+
+  int size = SrcSubscripts.size();
+  int dstSize = DstSubscripts.size();
+  if (size != dstSize || size < 2)
+    return false;
+
+#ifndef NDEBUG
+  DEBUG(errs() << "\nSrcSubscripts: ");
+  for (int i = 0; i < size; i++)
+    DEBUG(errs() << *SrcSubscripts[i]);
+  DEBUG(errs() << "\nDstSubscripts: ");
+  for (int i = 0; i < size; i++)
+    DEBUG(errs() << *DstSubscripts[i]);
+#endif
+
+  // The delinearization transforms a single-subscript MIV dependence test into
+  // a multi-subscript SIV dependence test that is easier to compute. So we
+  // resize Pair to contain as many pairs of subscripts as the delinearization
+  // has found, and then initialize the pairs following the delinearization.
+  Pair.resize(size);
+  for (int i = 0; i < size; ++i) {
+    Pair[i].Src = SrcSubscripts[i];
+    Pair[i].Dst = DstSubscripts[i];
+
+    // FIXME: we should record the bounds SrcSizes[i] and DstSizes[i] that the
+    // delinearization has found, and add these constraints to the dependence
+    // check to avoid memory accesses overflow from one dimension into another.
+    // This is related to the problem of determining the existence of data
+    // dependences in array accesses using a different number of subscripts: in
+    // C one can access an array A[100][100]; as A[0][9999], *A[9999], etc.
+  }
+
+  return true;
+}
 
 //===----------------------------------------------------------------------===//
 
@@ -3280,6 +3334,12 @@ Dependence *DependenceAnalysis::depends(Instruction *Src,
     Pair[0].Dst = DstSCEV;
   }
 
+  if (Delinearize && Pairs == 1 && CommonLevels > 1 &&
+      tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair)) {
+    DEBUG(dbgs() << "    delinerized GEP\n");
+    Pairs = Pair.size();
+  }
+
   for (unsigned P = 0; P < Pairs; ++P) {
     Pair[P].Loops.resize(MaxLevels + 1);
     Pair[P].GroupLoops.resize(MaxLevels + 1);
@@ -3698,6 +3758,12 @@ const  SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep,
     Pair[0].Dst = DstSCEV;
   }
 
+  if (Delinearize && Pairs == 1 && CommonLevels > 1 &&
+      tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair)) {
+    DEBUG(dbgs() << "    delinerized GEP\n");
+    Pairs = Pair.size();
+  }
+
   for (unsigned P = 0; P < Pairs; ++P) {
     Pair[P].Loops.resize(MaxLevels + 1);
     Pair[P].GroupLoops.resize(MaxLevels + 1);
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index 7620fd9..f042964 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -6,11 +6,6 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file implements the CallGraph class and provides the BasicCallGraph
-// default implementation.
-//
-//===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Instructions.h"
@@ -21,168 +16,92 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-namespace {
+CallGraph::CallGraph()
+    : ModulePass(ID), Root(0), ExternalCallingNode(0), CallsExternalNode(0) {
+  initializeCallGraphPass(*PassRegistry::getPassRegistry());
+}
 
-//===----------------------------------------------------------------------===//
-// BasicCallGraph class definition
-//
-class BasicCallGraph : public ModulePass, public CallGraph {
-  // Root is root of the call graph, or the external node if a 'main' function
-  // couldn't be found.
-  //
-  CallGraphNode *Root;
-
-  // ExternalCallingNode - This node has edges to all external functions and
-  // those internal functions that have their address taken.
-  CallGraphNode *ExternalCallingNode;
-
-  // CallsExternalNode - This node has edges to it from all functions making
-  // indirect calls or calling an external function.
-  CallGraphNode *CallsExternalNode;
-
-public:
-  static char ID; // Class identification, replacement for typeinfo
-  BasicCallGraph() : ModulePass(ID), Root(0), 
-    ExternalCallingNode(0), CallsExternalNode(0) {
-      initializeBasicCallGraphPass(*PassRegistry::getPassRegistry());
-    }
+void CallGraph::addToCallGraph(Function *F) {
+  CallGraphNode *Node = getOrInsertFunction(F);
 
-  // runOnModule - Compute the call graph for the specified module.
-  virtual bool runOnModule(Module &M) {
-    CallGraph::initialize(M);
-    
-    ExternalCallingNode = getOrInsertFunction(0);
-    CallsExternalNode = new CallGraphNode(0);
-    Root = 0;
-  
-    // Add every function to the call graph.
-    for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
-      addToCallGraph(I);
-  
-    // If we didn't find a main function, use the external call graph node
-    if (Root == 0) Root = ExternalCallingNode;
-    
-    return false;
-  }
-
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    AU.setPreservesAll();
-  }
+  // If this function has external linkage, anything could call it.
+  if (!F->hasLocalLinkage()) {
+    ExternalCallingNode->addCalledFunction(CallSite(), Node);
 
-  virtual void print(raw_ostream &OS, const Module *) const {
-    OS << "CallGraph Root is: ";
-    if (Function *F = getRoot()->getFunction())
-      OS << F->getName() << "\n";
-    else {
-      OS << "<<null function: 0x" << getRoot() << ">>\n";
+    // Found the entry point?
+    if (F->getName() == "main") {
+      if (Root) // Found multiple external mains?  Don't pick one.
+        Root = ExternalCallingNode;
+      else
+        Root = Node; // Found a main, keep track of it!
     }
-    
-    CallGraph::print(OS, 0);
   }
 
-  virtual void releaseMemory() {
-    destroy();
-  }
-  
-  /// getAdjustedAnalysisPointer - This method is used when a pass implements
-  /// an analysis interface through multiple inheritance.  If needed, it should
-  /// override this to adjust the this pointer as needed for the specified pass
-  /// info.
-  virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-    if (PI == &CallGraph::ID)
-      return (CallGraph*)this;
-    return this;
-  }
-  
-  CallGraphNode* getExternalCallingNode() const { return ExternalCallingNode; }
-  CallGraphNode* getCallsExternalNode()   const { return CallsExternalNode; }
-
-  // getRoot - Return the root of the call graph, which is either main, or if
-  // main cannot be found, the external node.
-  //
-  CallGraphNode *getRoot()             { return Root; }
-  const CallGraphNode *getRoot() const { return Root; }
-
-private:
-  //===---------------------------------------------------------------------
-  // Implementation of CallGraph construction
-  //
-
-  // addToCallGraph - Add a function to the call graph, and link the node to all
-  // of the functions that it calls.
-  //
-  void addToCallGraph(Function *F) {
-    CallGraphNode *Node = getOrInsertFunction(F);
-
-    // If this function has external linkage, anything could call it.
-    if (!F->hasLocalLinkage()) {
-      ExternalCallingNode->addCalledFunction(CallSite(), Node);
-
-      // Found the entry point?
-      if (F->getName() == "main") {
-        if (Root)    // Found multiple external mains?  Don't pick one.
-          Root = ExternalCallingNode;
-        else
-          Root = Node;          // Found a main, keep track of it!
+  // If this function has its address taken, anything could call it.
+  if (F->hasAddressTaken())
+    ExternalCallingNode->addCalledFunction(CallSite(), Node);
+
+  // If this function is not defined in this translation unit, it could call
+  // anything.
+  if (F->isDeclaration() && !F->isIntrinsic())
+    Node->addCalledFunction(CallSite(), CallsExternalNode);
+
+  // Look for calls by this function.
+  for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB)
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
+         ++II) {
+      CallSite CS(cast<Value>(II));
+      if (CS) {
+        const Function *Callee = CS.getCalledFunction();
+        if (!Callee)
+          // Indirect calls of intrinsics are not allowed so no need to check.
+          Node->addCalledFunction(CS, CallsExternalNode);
+        else if (!Callee->isIntrinsic())
+          Node->addCalledFunction(CS, getOrInsertFunction(Callee));
       }
     }
+}
 
-    // If this function has its address taken, anything could call it.
-    if (F->hasAddressTaken())
-      ExternalCallingNode->addCalledFunction(CallSite(), Node);
-
-    // If this function is not defined in this translation unit, it could call
-    // anything.
-    if (F->isDeclaration() && !F->isIntrinsic())
-      Node->addCalledFunction(CallSite(), CallsExternalNode);
-
-    // Look for calls by this function.
-    for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB)
-      for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
-           II != IE; ++II) {
-        CallSite CS(cast<Value>(II));
-        if (CS) {
-          const Function *Callee = CS.getCalledFunction();
-          if (!Callee)
-            // Indirect calls of intrinsics are not allowed so no need to check.
-            Node->addCalledFunction(CS, CallsExternalNode);
-          else if (!Callee->isIntrinsic())
-            Node->addCalledFunction(CS, getOrInsertFunction(Callee));
-        }
-      }
-  }
+void CallGraph::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+}
 
-  //
-  // destroy - Release memory for the call graph
-  virtual void destroy() {
-    /// CallsExternalNode is not in the function map, delete it explicitly.
-    if (CallsExternalNode) {
-      CallsExternalNode->allReferencesDropped();
-      delete CallsExternalNode;
-      CallsExternalNode = 0;
-    }
-    CallGraph::destroy();
-  }
-};
+bool CallGraph::runOnModule(Module &M) {
+  Mod = &M;
 
-} //End anonymous namespace
+  ExternalCallingNode = getOrInsertFunction(0);
+  assert(!CallsExternalNode);
+  CallsExternalNode = new CallGraphNode(0);
+  Root = 0;
 
-INITIALIZE_ANALYSIS_GROUP(CallGraph, "Call Graph", BasicCallGraph)
-INITIALIZE_AG_PASS(BasicCallGraph, CallGraph, "basiccg",
-                   "Basic CallGraph Construction", false, true, true)
+  // Add every function to the call graph.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    addToCallGraph(I);
 
-char CallGraph::ID = 0;
-char BasicCallGraph::ID = 0;
+  // If we didn't find a main function, use the external call graph node
+  if (Root == 0)
+    Root = ExternalCallingNode;
 
-void CallGraph::initialize(Module &M) {
-  Mod = &M;
+  return false;
 }
 
-void CallGraph::destroy() {
-  if (FunctionMap.empty()) return;
-  
-  // Reset all node's use counts to zero before deleting them to prevent an
-  // assertion from firing.
+INITIALIZE_PASS(CallGraph, "basiccg", "CallGraph Construction", false, true)
+
+char CallGraph::ID = 0;
+
+void CallGraph::releaseMemory() {
+  /// CallsExternalNode is not in the function map, delete it explicitly.
+  if (CallsExternalNode) {
+    CallsExternalNode->allReferencesDropped();
+    delete CallsExternalNode;
+    CallsExternalNode = 0;
+  }
+
+  if (FunctionMap.empty())
+    return;
+
+// Reset all node's use counts to zero before deleting them to prevent an
+// assertion from firing.
 #ifndef NDEBUG
   for (FunctionMapTy::iterator I = FunctionMap.begin(), E = FunctionMap.end();
        I != E; ++I)
@@ -195,7 +114,14 @@ void CallGraph::destroy() {
   FunctionMap.clear();
 }
 
-void CallGraph::print(raw_ostream &OS, Module*) const {
+void CallGraph::print(raw_ostream &OS, const Module*) const {
+  OS << "CallGraph Root is: ";
+  if (Function *F = Root->getFunction())
+    OS << F->getName() << "\n";
+  else {
+    OS << "<<null function: 0x" << Root << ">>\n";
+  }
+
   for (CallGraph::const_iterator I = begin(), E = end(); I != E; ++I)
     I->second->print(OS);
 }
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index a0d788f..182beca 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -22,7 +22,7 @@
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/PassManagers.h"
+#include "llvm/IR/LegacyPassManagers.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Timer.h"
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index 92d0d23..7ec4644 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -189,7 +189,7 @@ char GlobalsModRef::ID = 0;
 INITIALIZE_AG_PASS_BEGIN(GlobalsModRef, AliasAnalysis,
                 "globalsmodref-aa", "Simple mod/ref analysis for globals",    
                 false, true, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_AG_PASS_END(GlobalsModRef, AliasAnalysis,
                 "globalsmodref-aa", "Simple mod/ref analysis for globals",    
                 false, true, false)
diff --git a/lib/Analysis/IPA/IPA.cpp b/lib/Analysis/IPA/IPA.cpp
index 1c1816d..47357cf 100644
--- a/lib/Analysis/IPA/IPA.cpp
+++ b/lib/Analysis/IPA/IPA.cpp
@@ -19,8 +19,7 @@ using namespace llvm;
 
 /// initializeIPA - Initialize all passes linked into the IPA library.
 void llvm::initializeIPA(PassRegistry &Registry) {
-  initializeBasicCallGraphPass(Registry);
-  initializeCallGraphAnalysisGroup(Registry);
+  initializeCallGraphPass(Registry);
   initializeCallGraphPrinterPass(Registry);
   initializeCallGraphViewerPass(Registry);
   initializeFindUsedTypesPass(Registry);
diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp
index 37d73a8..3bc796e 100644
--- a/lib/Analysis/IPA/InlineCost.cpp
+++ b/lib/Analysis/IPA/InlineCost.cpp
@@ -59,6 +59,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool ExposesReturnsTwice;
   bool HasDynamicAlloca;
   bool ContainsNoDuplicateCall;
+  bool HasReturn;
+  bool HasIndirectBr;
 
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize;
@@ -132,6 +134,12 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool visitExtractValue(ExtractValueInst &I);
   bool visitInsertValue(InsertValueInst &I);
   bool visitCallSite(CallSite CS);
+  bool visitReturnInst(ReturnInst &RI);
+  bool visitBranchInst(BranchInst &BI);
+  bool visitSwitchInst(SwitchInst &SI);
+  bool visitIndirectBrInst(IndirectBrInst &IBI);
+  bool visitResumeInst(ResumeInst &RI);
+  bool visitUnreachableInst(UnreachableInst &I);
 
 public:
   CallAnalyzer(const DataLayout *TD, const TargetTransformInfo &TTI,
@@ -139,12 +147,13 @@ public:
       : TD(TD), TTI(TTI), F(Callee), Threshold(Threshold), Cost(0),
         IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
-        ContainsNoDuplicateCall(false), AllocatedSize(0), NumInstructions(0),
-        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
-        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
-        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
-        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
-        SROACostSavings(0), SROACostSavingsLost(0) {}
+        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
+        AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
+        FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
+        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+        NumInstructionsSimplified(0), SROACostSavings(0),
+        SROACostSavingsLost(0) {}
 
   bool analyzeCall(CallSite CS);
 
@@ -704,7 +713,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
 }
 
 bool CallAnalyzer::visitCallSite(CallSite CS) {
-  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->canReturnTwice() &&
+  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
       !F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                       Attribute::ReturnsTwice)) {
     // This aborts the entire analysis.
@@ -785,6 +794,60 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
   return Base::visitCallSite(CS);
 }
 
+bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
+  // At least one return instruction will be free after inlining.
+  bool Free = !HasReturn;
+  HasReturn = true;
+  return Free;
+}
+
+bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
+  // We model unconditional branches as essentially free -- they really
+  // shouldn't exist at all, but handling them makes the behavior of the
+  // inliner more regular and predictable. Interestingly, conditional branches
+  // which will fold away are also free.
+  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
+         dyn_cast_or_null<ConstantInt>(
+             SimplifiedValues.lookup(BI.getCondition()));
+}
+
+bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
+  // We model unconditional switches as free, see the comments on handling
+  // branches.
+  return isa<ConstantInt>(SI.getCondition()) ||
+         dyn_cast_or_null<ConstantInt>(
+             SimplifiedValues.lookup(SI.getCondition()));
+}
+
+bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
+  // We never want to inline functions that contain an indirectbr.  This is
+  // incorrect because all the blockaddress's (in static global initializers
+  // for example) would be referring to the original function, and this
+  // indirect jump would jump from the inlined copy of the function into the
+  // original function which is extremely undefined behavior.
+  // FIXME: This logic isn't really right; we can safely inline functions with
+  // indirectbr's as long as no other function or global references the
+  // blockaddress of a block within the current function.  And as a QOI issue,
+  // if someone is using a blockaddress without an indirectbr, and that
+  // reference somehow ends up in another function or global, we probably don't
+  // want to inline this function.
+  HasIndirectBr = true;
+  return false;
+}
+
+bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
+  // FIXME: It's not clear that a single instruction is an accurate model for
+  // the inline cost of a resume instruction.
+  return false;
+}
+
+bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
+  // FIXME: It might be reasonably to discount the cost of instructions leading
+  // to unreachable as they have the lowest possible impact on both runtime and
+  // code size.
+  return true; // No actual code is needed for unreachable.
+}
+
 bool CallAnalyzer::visitInstruction(Instruction &I) {
   // Some instructions are free. All of the free intrinsics can also be
   // handled by SROA, etc.
@@ -808,8 +871,7 @@ bool CallAnalyzer::visitInstruction(Instruction &I) {
 /// construct has been detected. It returns false if inlining is no longer
 /// viable, and true if inlining remains viable.
 bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
-  for (BasicBlock::iterator I = BB->begin(), E = llvm::prior(BB->end());
-       I != E; ++I) {
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
     ++NumInstructions;
     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
       ++NumVectorInstructions;
@@ -825,7 +887,8 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
       Cost += InlineConstants::InstrCost;
 
     // If the visit this instruction detected an uninlinable pattern, abort.
-    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
+        HasIndirectBr)
       return false;
 
     // If the caller is a recursive function then we don't want to inline
@@ -989,10 +1052,6 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
     }
   }
 
-  // Track whether we've seen a return instruction. The first return
-  // instruction is free, as at least one will usually disappear in inlining.
-  bool HasReturn = false;
-
   // Populate our simplified values by mapping from function arguments to call
   // arguments with known important simplifications.
   CallSite::arg_iterator CAI = CS.arg_begin();
@@ -1039,33 +1098,11 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
     if (BB->empty())
       continue;
 
-    // Handle the terminator cost here where we can track returns and other
-    // function-wide constructs.
-    TerminatorInst *TI = BB->getTerminator();
-
-    // We never want to inline functions that contain an indirectbr.  This is
-    // incorrect because all the blockaddress's (in static global initializers
-    // for example) would be referring to the original function, and this
-    // indirect jump would jump from the inlined copy of the function into the 
-    // original function which is extremely undefined behavior.
-    // FIXME: This logic isn't really right; we can safely inline functions
-    // with indirectbr's as long as no other function or global references the
-    // blockaddress of a block within the current function.  And as a QOI issue,
-    // if someone is using a blockaddress without an indirectbr, and that
-    // reference somehow ends up in another function or global, we probably
-    // don't want to inline this function.
-    if (isa<IndirectBrInst>(TI))
-      return false;
-
-    if (!HasReturn && isa<ReturnInst>(TI))
-      HasReturn = true;
-    else
-      Cost += InlineConstants::InstrCost;
-
     // Analyze the cost of this block. If we blow through the threshold, this
     // returns false, and we can bail on out.
     if (!analyzeBlock(BB)) {
-      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
+          HasIndirectBr)
         return false;
 
       // If the caller is a recursive function then we don't want to inline
@@ -1078,6 +1115,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
       break;
     }
 
+    TerminatorInst *TI = BB->getTerminator();
+
     // Add in the live successors by first checking whether we have terminator
     // that may be simplified based on the values simplified by this call.
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -1171,6 +1210,22 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
   return getInlineCost(CS, CS.getCalledFunction(), Threshold);
 }
 
+/// \brief Test that two functions either have or have not the given attribute
+///        at the same time.
+static bool attributeMatches(Function *F1, Function *F2,
+                             Attribute::AttrKind Attr) {
+  return F1->hasFnAttribute(Attr) == F2->hasFnAttribute(Attr);
+}
+
+/// \brief Test that there are no attribute conflicts between Caller and Callee
+///        that prevent inlining.
+static bool functionsHaveCompatibleAttributes(Function *Caller,
+                                              Function *Callee) {
+  return attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeThread);
+}
+
 InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
                                              int Threshold) {
   // Cannot inline indirect calls.
@@ -1179,20 +1234,26 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
 
   // Calls to functions with always-inline attributes should be inlined
   // whenever possible.
-  if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::AlwaysInline)) {
+  if (Callee->hasFnAttribute(Attribute::AlwaysInline)) {
     if (isInlineViable(*Callee))
       return llvm::InlineCost::getAlways();
     return llvm::InlineCost::getNever();
   }
 
+  // Never inline functions with conflicting attributes (unless callee has
+  // always-inline attribute).
+  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee))
+    return llvm::InlineCost::getNever();
+
+  // Don't inline this call if the caller has the optnone attribute.
+  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
+    return llvm::InlineCost::getNever();
+
   // Don't inline functions which can be redefined at link-time to mean
   // something else.  Don't inline functions marked noinline or call sites
   // marked noinline.
   if (Callee->mayBeOverridden() ||
-      Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::NoInline) ||
-      CS.isNoInline())
+      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
     return llvm::InlineCost::getNever();
 
   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index b275dfe..b867af1 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -668,7 +668,8 @@ Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
 /// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc.
 /// folding.
 static Constant *stripAndComputeConstantOffsets(const DataLayout *TD,
-                                                Value *&V) {
+                                                Value *&V,
+                                                bool AllowNonInbounds = false) {
   assert(V->getType()->getScalarType()->isPointerTy());
 
   // Without DataLayout, just be conservative for now. Theoretically, more could
@@ -685,7 +686,8 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout *TD,
   Visited.insert(V);
   do {
     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
-      if (!GEP->isInBounds() || !GEP->accumulateConstantOffset(*TD, Offset))
+      if ((!AllowNonInbounds && !GEP->isInBounds()) ||
+          !GEP->accumulateConstantOffset(*TD, Offset))
         break;
       V = GEP->getPointerOperand();
     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
@@ -1737,7 +1739,7 @@ static Constant *computePointerICmp(const DataLayout *TD,
   RHS = RHS->stripPointerCasts();
 
   // A non-null pointer is not equal to a null pointer.
-  if (llvm::isKnownNonNull(LHS) && isa<ConstantPointerNull>(RHS) &&
+  if (llvm::isKnownNonNull(LHS, TLI) && isa<ConstantPointerNull>(RHS) &&
       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE))
     return ConstantInt::get(GetCompareTy(LHS),
                             !CmpInst::isTrueWhenEqual(Pred));
@@ -1837,6 +1839,17 @@ static Constant *computePointerICmp(const DataLayout *TD,
         return ConstantInt::get(GetCompareTy(LHS),
                                 !CmpInst::isTrueWhenEqual(Pred));
     }
+
+    // Even if an non-inbounds GEP occurs along the path we can still optimize
+    // equality comparisons concerning the result. We avoid walking the whole
+    // chain again by starting where the last calls to
+    // stripAndComputeConstantOffsets left off and accumulate the offsets.
+    Constant *LHSNoBound = stripAndComputeConstantOffsets(TD, LHS, true);
+    Constant *RHSNoBound = stripAndComputeConstantOffsets(TD, RHS, true);
+    if (LHS == RHS)
+      return ConstantExpr::getICmp(Pred,
+                                   ConstantExpr::getAdd(LHSOffset, LHSNoBound),
+                                   ConstantExpr::getAdd(RHSOffset, RHSNoBound));
   }
 
   // Otherwise, fail.
@@ -2946,6 +2959,7 @@ static bool IsIdempotent(Intrinsic::ID ID) {
   case Intrinsic::trunc:
   case Intrinsic::rint:
   case Intrinsic::nearbyint:
+  case Intrinsic::round:
     return true;
   }
 }
diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp
index 9393508..ec17f47 100644
--- a/lib/Analysis/Lint.cpp
+++ b/lib/Analysis/Lint.cpp
@@ -207,7 +207,7 @@ void Lint::visitCallSite(CallSite CS) {
             &I);
 
     FunctionType *FT = F->getFunctionType();
-    unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
+    unsigned NumActualArgs = CS.arg_size();
 
     Assert1(FT->isVarArg() ?
               FT->getNumParams() <= NumActualArgs :
@@ -504,14 +504,42 @@ void Lint::visitShl(BinaryOperator &I) {
             "Undefined result: Shift count out of range", &I);
 }
 
-static bool isZero(Value *V, DataLayout *TD) {
+static bool isZero(Value *V, DataLayout *DL) {
   // Assume undef could be zero.
-  if (isa<UndefValue>(V)) return true;
+  if (isa<UndefValue>(V))
+    return true;
+
+  VectorType *VecTy = dyn_cast<VectorType>(V->getType());
+  if (!VecTy) {
+    unsigned BitWidth = V->getType()->getIntegerBitWidth();
+    APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+    ComputeMaskedBits(V, KnownZero, KnownOne, DL);
+    return KnownZero.isAllOnesValue();
+  }
+
+  // Per-component check doesn't work with zeroinitializer
+  Constant *C = dyn_cast<Constant>(V);
+  if (!C)
+    return false;
+
+  if (C->isZeroValue())
+    return true;
+
+  // For a vector, KnownZero will only be true if all values are zero, so check
+  // this per component
+  unsigned BitWidth = VecTy->getElementType()->getIntegerBitWidth();
+  for (unsigned I = 0, N = VecTy->getNumElements(); I != N; ++I) {
+    Constant *Elem = C->getAggregateElement(I);
+    if (isa<UndefValue>(Elem))
+      return true;
+
+    APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+    ComputeMaskedBits(Elem, KnownZero, KnownOne, DL);
+    if (KnownZero.isAllOnesValue())
+      return true;
+  }
 
-  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
-  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-  ComputeMaskedBits(V, KnownZero, KnownOne, TD);
-  return KnownZero.isAllOnesValue();
+  return false;
 }
 
 void Lint::visitSDiv(BinaryOperator &I) {
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 142ebed..e369633 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -177,10 +177,6 @@ PHINode *Loop::getCanonicalInductionVariable() const {
 
 /// isLCSSAForm - Return true if the Loop is in LCSSA form
 bool Loop::isLCSSAForm(DominatorTree &DT) const {
-  // Sort the blocks vector so that we can use binary search to do quick
-  // lookups.
-  SmallPtrSet<BasicBlock*, 16> LoopBBs(block_begin(), block_end());
-
   for (block_iterator BI = block_begin(), E = block_end(); BI != E; ++BI) {
     BasicBlock *BB = *BI;
     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;++I)
@@ -196,7 +192,7 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const {
         // block they are defined in.  Also, blocks not reachable from the
         // entry are special; uses in them don't need to go through PHIs.
         if (UserBB != BB &&
-            !LoopBBs.count(UserBB) &&
+            !contains(UserBB) &&
             DT.isReachableFromEntry(UserBB))
           return false;
       }
@@ -220,12 +216,12 @@ bool Loop::isSafeToClone() const {
   // Return false if any loop blocks contain indirectbrs, or there are any calls
   // to noduplicate functions.
   for (Loop::block_iterator I = block_begin(), E = block_end(); I != E; ++I) {
-    if (isa<IndirectBrInst>((*I)->getTerminator())) {
+    if (isa<IndirectBrInst>((*I)->getTerminator()))
       return false;
-    } else if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) {
+
+    if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator()))
       if (II->hasFnAttr(Attribute::NoDuplicate))
         return false;
-    }
 
     for (BasicBlock::iterator BI = (*I)->begin(), BE = (*I)->end(); BI != BE; ++BI) {
       if (const CallInst *CI = dyn_cast<CallInst>(BI)) {
@@ -309,15 +305,15 @@ bool Loop::isAnnotatedParallel() const {
       if (!II->mayReadOrWriteMemory())
         continue;
 
-      if (!II->getMetadata("llvm.mem.parallel_loop_access"))
-        return false;
-
       // The memory instruction can refer to the loop identifier metadata
       // directly or indirectly through another list metadata (in case of
       // nested parallel loops). The loop identifier metadata refers to
       // itself so we can check both cases with the same routine.
-      MDNode *loopIdMD =
-          dyn_cast<MDNode>(II->getMetadata("llvm.mem.parallel_loop_access"));
+      MDNode *loopIdMD = II->getMetadata("llvm.mem.parallel_loop_access");
+
+      if (!loopIdMD)
+        return false;
+
       bool loopIdMDFound = false;
       for (unsigned i = 0, e = loopIdMD->getNumOperands(); i < e; ++i) {
         if (loopIdMD->getOperand(i) == desiredLoopIdMetadata) {
@@ -337,9 +333,6 @@ bool Loop::isAnnotatedParallel() const {
 /// hasDedicatedExits - Return true if no exit block for the loop
 /// has a predecessor that is outside the loop.
 bool Loop::hasDedicatedExits() const {
-  // Sort the blocks vector so that we can use binary search to do quick
-  // lookups.
-  SmallPtrSet<BasicBlock *, 16> LoopBBs(block_begin(), block_end());
   // Each predecessor of each exit block of a normal loop is contained
   // within the loop.
   SmallVector<BasicBlock *, 4> ExitBlocks;
@@ -347,7 +340,7 @@ bool Loop::hasDedicatedExits() const {
   for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
     for (pred_iterator PI = pred_begin(ExitBlocks[i]),
          PE = pred_end(ExitBlocks[i]); PI != PE; ++PI)
-      if (!LoopBBs.count(*PI))
+      if (!contains(*PI))
         return false;
   // All the requirements are met.
   return true;
@@ -362,11 +355,6 @@ Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const {
   assert(hasDedicatedExits() &&
          "getUniqueExitBlocks assumes the loop has canonical form exits!");
 
-  // Sort the blocks vector so that we can use binary search to do quick
-  // lookups.
-  SmallVector<BasicBlock *, 128> LoopBBs(block_begin(), block_end());
-  std::sort(LoopBBs.begin(), LoopBBs.end());
-
   SmallVector<BasicBlock *, 32> switchExitBlocks;
 
   for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI) {
@@ -376,7 +364,7 @@ Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const {
 
     for (succ_iterator I = succ_begin(*BI), E = succ_end(*BI); I != E; ++I) {
       // If block is inside the loop then it is not a exit block.
-      if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I))
+      if (contains(*I))
         continue;
 
       pred_iterator PI = pred_begin(*I);
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index 0f0a1c9..1db0f63 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -31,12 +31,13 @@
 using namespace llvm;
 
 enum AllocType {
-  MallocLike         = 1<<0, // allocates
-  CallocLike         = 1<<1, // allocates + bzero
-  ReallocLike        = 1<<2, // reallocates
-  StrDupLike         = 1<<3,
+  OpNewLike          = 1<<0, // allocates; never returns null
+  MallocLike         = 1<<1 | OpNewLike, // allocates; may return null
+  CallocLike         = 1<<2, // allocates + bzero
+  ReallocLike        = 1<<3, // reallocates
+  StrDupLike         = 1<<4,
   AllocLike          = MallocLike | CallocLike | StrDupLike,
-  AnyAlloc           = MallocLike | CallocLike | ReallocLike | StrDupLike
+  AnyAlloc           = AllocLike | ReallocLike
 };
 
 struct AllocFnsTy {
@@ -52,20 +53,20 @@ struct AllocFnsTy {
 static const AllocFnsTy AllocationFnData[] = {
   {LibFunc::malloc,              MallocLike,  1, 0,  -1},
   {LibFunc::valloc,              MallocLike,  1, 0,  -1},
-  {LibFunc::Znwj,                MallocLike,  1, 0,  -1}, // new(unsigned int)
+  {LibFunc::Znwj,                OpNewLike,   1, 0,  -1}, // new(unsigned int)
   {LibFunc::ZnwjRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new(unsigned int, nothrow)
-  {LibFunc::Znwm,                MallocLike,  1, 0,  -1}, // new(unsigned long)
+  {LibFunc::Znwm,                OpNewLike,   1, 0,  -1}, // new(unsigned long)
   {LibFunc::ZnwmRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new(unsigned long, nothrow)
-  {LibFunc::Znaj,                MallocLike,  1, 0,  -1}, // new[](unsigned int)
+  {LibFunc::Znaj,                OpNewLike,   1, 0,  -1}, // new[](unsigned int)
   {LibFunc::ZnajRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new[](unsigned int, nothrow)
-  {LibFunc::Znam,                MallocLike,  1, 0,  -1}, // new[](unsigned long)
+  {LibFunc::Znam,                OpNewLike,   1, 0,  -1}, // new[](unsigned long)
   {LibFunc::ZnamRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new[](unsigned long, nothrow)
-  {LibFunc::posix_memalign,      MallocLike,  3, 2,  -1},
   {LibFunc::calloc,              CallocLike,  2, 0,   1},
   {LibFunc::realloc,             ReallocLike, 2, 1,  -1},
   {LibFunc::reallocf,            ReallocLike, 2, 1,  -1},
   {LibFunc::strdup,              StrDupLike,  1, -1, -1},
   {LibFunc::strndup,             StrDupLike,  2, 1,  -1}
+  // TODO: Handle "int posix_memalign(void **, size_t, size_t)"
 };
 
 
@@ -117,7 +118,7 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
     return 0;
 
   const AllocFnsTy *FnData = &AllocationFnData[i];
-  if ((FnData->AllocTy & AllocTy) == 0)
+  if ((FnData->AllocTy & AllocTy) != FnData->AllocTy)
     return 0;
 
   // Check function prototype.
@@ -189,6 +190,13 @@ bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
   return getAllocationData(V, ReallocLike, TLI, LookThroughBitCast);
 }
 
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates memory and never returns null (such as operator new).
+bool llvm::isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+                               bool LookThroughBitCast) {
+  return getAllocationData(V, OpNewLike, TLI, LookThroughBitCast);
+}
+
 /// extractMallocCall - Returns the corresponding CallInst if the instruction
 /// is a malloc call.  Since CallInst::CreateMalloc() only creates calls, we
 /// ignore InvokeInst here.
@@ -197,7 +205,7 @@ const CallInst *llvm::extractMallocCall(const Value *I,
   return isMallocLikeFn(I, TLI) ? dyn_cast<CallInst>(I) : 0;
 }
 
-static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
+static Value *computeArraySize(const CallInst *CI, const DataLayout *DL,
                                const TargetLibraryInfo *TLI,
                                bool LookThroughSExt = false) {
   if (!CI)
@@ -205,12 +213,12 @@ static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
 
   // The size of the malloc's result type must be known to determine array size.
   Type *T = getMallocAllocatedType(CI, TLI);
-  if (!T || !T->isSized() || !TD)
+  if (!T || !T->isSized() || !DL)
     return 0;
 
-  unsigned ElementSize = TD->getTypeAllocSize(T);
+  unsigned ElementSize = DL->getTypeAllocSize(T);
   if (StructType *ST = dyn_cast<StructType>(T))
-    ElementSize = TD->getStructLayout(ST)->getSizeInBytes();
+    ElementSize = DL->getStructLayout(ST)->getSizeInBytes();
 
   // If malloc call's arg can be determined to be a multiple of ElementSize,
   // return the multiple.  Otherwise, return NULL.
@@ -227,10 +235,10 @@ static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
 /// is a call to malloc whose array size can be determined and the array size
 /// is not constant 1.  Otherwise, return NULL.
 const CallInst *llvm::isArrayMalloc(const Value *I,
-                                    const DataLayout *TD,
+                                    const DataLayout *DL,
                                     const TargetLibraryInfo *TLI) {
   const CallInst *CI = extractMallocCall(I, TLI);
-  Value *ArraySize = computeArraySize(CI, TD, TLI);
+  Value *ArraySize = computeArraySize(CI, DL, TLI);
 
   if (ConstantInt *ConstSize = dyn_cast_or_null<ConstantInt>(ArraySize))
     if (ConstSize->isOne())
@@ -288,11 +296,11 @@ Type *llvm::getMallocAllocatedType(const CallInst *CI,
 /// then return that multiple.  For non-array mallocs, the multiple is
 /// constant 1.  Otherwise, return NULL for mallocs whose array size cannot be
 /// determined.
-Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout *TD,
+Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout *DL,
                                 const TargetLibraryInfo *TLI,
                                 bool LookThroughSExt) {
   assert(isMallocLikeFn(CI, TLI) && "getMallocArraySize and not malloc call");
-  return computeArraySize(CI, TD, TLI, LookThroughSExt);
+  return computeArraySize(CI, DL, TLI, LookThroughSExt);
 }
 
 
@@ -354,12 +362,12 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
 /// object size in Size if successful, and false otherwise.
 /// If RoundToAlign is true, then Size is rounded up to the aligment of allocas,
 /// byval arguments, and global variables.
-bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *TD,
+bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *DL,
                          const TargetLibraryInfo *TLI, bool RoundToAlign) {
-  if (!TD)
+  if (!DL)
     return false;
 
-  ObjectSizeOffsetVisitor Visitor(TD, TLI, Ptr->getContext(), RoundToAlign);
+  ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(), RoundToAlign);
   SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
   if (!Visitor.bothKnown(Data))
     return false;
@@ -386,12 +394,12 @@ APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
   return Size;
 }
 
-ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout *TD,
+ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout *DL,
                                                  const TargetLibraryInfo *TLI,
                                                  LLVMContext &Context,
                                                  bool RoundToAlign)
-: TD(TD), TLI(TLI), RoundToAlign(RoundToAlign) {
-  IntegerType *IntTy = TD->getIntPtrType(Context);
+: DL(DL), TLI(TLI), RoundToAlign(RoundToAlign) {
+  IntegerType *IntTy = DL->getIntPtrType(Context);
   IntTyBits = IntTy->getBitWidth();
   Zero = APInt::getNullValue(IntTyBits);
 }
@@ -434,7 +442,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
   if (!I.getAllocatedType()->isSized())
     return unknown();
 
-  APInt Size(IntTyBits, TD->getTypeAllocSize(I.getAllocatedType()));
+  APInt Size(IntTyBits, DL->getTypeAllocSize(I.getAllocatedType()));
   if (!I.isArrayAllocation())
     return std::make_pair(align(Size, I.getAlignment()), Zero);
 
@@ -453,7 +461,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
     return unknown();
   }
   PointerType *PT = cast<PointerType>(A.getType());
-  APInt Size(IntTyBits, TD->getTypeAllocSize(PT->getElementType()));
+  APInt Size(IntTyBits, DL->getTypeAllocSize(PT->getElementType()));
   return std::make_pair(align(Size, A.getParamAlignment()), Zero);
 }
 
@@ -526,7 +534,7 @@ ObjectSizeOffsetVisitor::visitExtractValueInst(ExtractValueInst&) {
 SizeOffsetType ObjectSizeOffsetVisitor::visitGEPOperator(GEPOperator &GEP) {
   SizeOffsetType PtrData = compute(GEP.getPointerOperand());
   APInt Offset(IntTyBits, 0);
-  if (!bothKnown(PtrData) || !GEP.accumulateConstantOffset(*TD, Offset))
+  if (!bothKnown(PtrData) || !GEP.accumulateConstantOffset(*DL, Offset))
     return unknown();
 
   return std::make_pair(PtrData.first, PtrData.second + Offset);
@@ -542,7 +550,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalVariable(GlobalVariable &GV){
   if (!GV.hasDefinitiveInitializer())
     return unknown();
 
-  APInt Size(IntTyBits, TD->getTypeAllocSize(GV.getType()->getElementType()));
+  APInt Size(IntTyBits, DL->getTypeAllocSize(GV.getType()->getElementType()));
   return std::make_pair(align(Size, GV.getAlignment()), Zero);
 }
 
@@ -578,12 +586,13 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitInstruction(Instruction &I) {
   return unknown();
 }
 
-
-ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const DataLayout *TD,
-                                                   const TargetLibraryInfo *TLI,
-                                                     LLVMContext &Context)
-: TD(TD), TLI(TLI), Context(Context), Builder(Context, TargetFolder(TD)) {
-  IntTy = TD->getIntPtrType(Context);
+ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const DataLayout *DL,
+                                                     const TargetLibraryInfo *TLI,
+                                                     LLVMContext &Context,
+                                                     bool RoundToAlign)
+: DL(DL), TLI(TLI), Context(Context), Builder(Context, TargetFolder(DL)),
+  RoundToAlign(RoundToAlign) {
+  IntTy = DL->getIntPtrType(Context);
   Zero = ConstantInt::get(IntTy, 0);
 }
 
@@ -607,7 +616,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) {
 }
 
 SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
-  ObjectSizeOffsetVisitor Visitor(TD, TLI, Context);
+  ObjectSizeOffsetVisitor Visitor(DL, TLI, Context, RoundToAlign);
   SizeOffsetType Const = Visitor.compute(V);
   if (Visitor.bothKnown(Const))
     return std::make_pair(ConstantInt::get(Context, Const.first),
@@ -626,13 +635,15 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
   if (Instruction *I = dyn_cast<Instruction>(V))
     Builder.SetInsertPoint(I);
 
-  // record the pointers that were handled in this run, so that they can be
-  // cleaned later if something fails
-  SeenVals.insert(V);
-
   // now compute the size and offset
   SizeOffsetEvalType Result;
-  if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+
+  // Record the pointers that were handled in this run, so that they can be
+  // cleaned later if something fails. We also use this set to break cycles that
+  // can occur in dead code.
+  if (!SeenVals.insert(V)) {
+    Result = unknown();
+  } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
     Result = visitGEPOperator(*GEP);
   } else if (Instruction *I = dyn_cast<Instruction>(V)) {
     Result = visit(*I);
@@ -665,7 +676,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) {
   assert(I.isArrayAllocation());
   Value *ArraySize = I.getArraySize();
   Value *Size = ConstantInt::get(ArraySize->getType(),
-                                 TD->getTypeAllocSize(I.getAllocatedType()));
+                                 DL->getTypeAllocSize(I.getAllocatedType()));
   Size = Builder.CreateMul(Size, ArraySize);
   return std::make_pair(Size, Zero);
 }
@@ -717,7 +728,7 @@ ObjectSizeOffsetEvaluator::visitGEPOperator(GEPOperator &GEP) {
   if (!bothKnown(PtrData))
     return unknown();
 
-  Value *Offset = EmitGEPOffset(&Builder, *TD, &GEP, /*NoAssumptions=*/true);
+  Value *Offset = EmitGEPOffset(&Builder, *DL, &GEP, /*NoAssumptions=*/true);
   Offset = Builder.CreateAdd(PtrData.second, Offset);
   return std::make_pair(PtrData.first, Offset);
 }
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index fe1c874..84ff2ee 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -371,18 +371,19 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
 
   // Walk backwards through the basic block, looking for dependencies.
   while (ScanIt != BB->begin()) {
+    Instruction *Inst = --ScanIt;
+
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+      // Debug intrinsics don't (and can't) cause dependencies.
+      if (isa<DbgInfoIntrinsic>(II)) continue;
+
     // Limit the amount of scanning we do so we don't end up with quadratic
     // running time on extreme testcases.
     --Limit;
     if (!Limit)
       return MemDepResult::getUnknown();
 
-    Instruction *Inst = --ScanIt;
-
     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
-      // Debug intrinsics don't (and can't) cause dependences.
-      if (isa<DbgInfoIntrinsic>(II)) continue;
-
       // If we reach a lifetime begin or end marker, then the query ends here
       // because the value is undefined.
       if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
diff --git a/lib/Analysis/PathNumbering.cpp b/lib/Analysis/PathNumbering.cpp
deleted file mode 100644
index 30d213b..0000000
--- a/lib/Analysis/PathNumbering.cpp
+++ /dev/null
@@ -1,521 +0,0 @@
-//===- PathNumbering.cpp --------------------------------------*- C++ -*---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Ball-Larus path numbers uniquely identify paths through a directed acyclic
-// graph (DAG) [Ball96].  For a CFG backedges are removed and replaced by phony
-// edges to obtain a DAG, and thus the unique path numbers [Ball96].
-//
-// The purpose of this analysis is to enumerate the edges in a CFG in order
-// to obtain paths from path numbers in a convenient manner.  As described in
-// [Ball96] edges can be enumerated such that given a path number by following
-// the CFG and updating the path number, the path is obtained.
-//
-// [Ball96]
-//  T. Ball and J. R. Larus. "Efficient Path Profiling."
-//  International Symposium on Microarchitecture, pages 46-57, 1996.
-//  http://portal.acm.org/citation.cfm?id=243857
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "ball-larus-numbering"
-
-#include "llvm/Analysis/PathNumbering.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/TypeBuilder.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <queue>
-#include <sstream>
-#include <stack>
-#include <string>
-#include <utility>
-
-using namespace llvm;
-
-// Are we enabling early termination
-static cl::opt<bool> ProcessEarlyTermination(
-  "path-profile-early-termination", cl::Hidden,
-  cl::desc("In path profiling, insert extra instrumentation to account for "
-           "unexpected function termination."));
-
-// Returns the basic block for the BallLarusNode
-BasicBlock* BallLarusNode::getBlock() {
-  return(_basicBlock);
-}
-
-// Returns the number of paths to the exit starting at the node.
-unsigned BallLarusNode::getNumberPaths() {
-  return(_numberPaths);
-}
-
-// Sets the number of paths to the exit starting at the node.
-void BallLarusNode::setNumberPaths(unsigned numberPaths) {
-  _numberPaths = numberPaths;
-}
-
-// Gets the NodeColor used in graph algorithms.
-BallLarusNode::NodeColor BallLarusNode::getColor() {
-  return(_color);
-}
-
-// Sets the NodeColor used in graph algorithms.
-void BallLarusNode::setColor(BallLarusNode::NodeColor color) {
-  _color = color;
-}
-
-// Returns an iterator over predecessor edges. Includes phony and
-// backedges.
-BLEdgeIterator BallLarusNode::predBegin() {
-  return(_predEdges.begin());
-}
-
-// Returns the end sentinel for the predecessor iterator.
-BLEdgeIterator BallLarusNode::predEnd() {
-  return(_predEdges.end());
-}
-
-// Returns the number of predecessor edges.  Includes phony and
-// backedges.
-unsigned BallLarusNode::getNumberPredEdges() {
-  return(_predEdges.size());
-}
-
-// Returns an iterator over successor edges. Includes phony and
-// backedges.
-BLEdgeIterator BallLarusNode::succBegin() {
-  return(_succEdges.begin());
-}
-
-// Returns the end sentinel for the successor iterator.
-BLEdgeIterator BallLarusNode::succEnd() {
-  return(_succEdges.end());
-}
-
-// Returns the number of successor edges.  Includes phony and
-// backedges.
-unsigned BallLarusNode::getNumberSuccEdges() {
-  return(_succEdges.size());
-}
-
-// Add an edge to the predecessor list.
-void BallLarusNode::addPredEdge(BallLarusEdge* edge) {
-  _predEdges.push_back(edge);
-}
-
-// Remove an edge from the predecessor list.
-void BallLarusNode::removePredEdge(BallLarusEdge* edge) {
-  removeEdge(_predEdges, edge);
-}
-
-// Add an edge to the successor list.
-void BallLarusNode::addSuccEdge(BallLarusEdge* edge) {
-  _succEdges.push_back(edge);
-}
-
-// Remove an edge from the successor list.
-void BallLarusNode::removeSuccEdge(BallLarusEdge* edge) {
-  removeEdge(_succEdges, edge);
-}
-
-// Returns the name of the BasicBlock being represented.  If BasicBlock
-// is null then returns "<null>".  If BasicBlock has no name, then
-// "<unnamed>" is returned.  Intended for use with debug output.
-std::string BallLarusNode::getName() {
-  std::stringstream name;
-
-  if(getBlock() != NULL) {
-    if(getBlock()->hasName()) {
-      std::string tempName(getBlock()->getName());
-      name << tempName.c_str() << " (" << _uid << ")";
-    } else
-      name << "<unnamed> (" << _uid << ")";
-  } else
-    name << "<null> (" << _uid << ")";
-
-  return name.str();
-}
-
-// Removes an edge from an edgeVector.  Used by removePredEdge and
-// removeSuccEdge.
-void BallLarusNode::removeEdge(BLEdgeVector& v, BallLarusEdge* e) {
-  // TODO: Avoid linear scan by using a set instead
-  for(BLEdgeIterator i = v.begin(),
-        end = v.end();
-      i != end;
-      ++i) {
-    if((*i) == e) {
-      v.erase(i);
-      break;
-    }
-  }
-}
-
-// Returns the source node of this edge.
-BallLarusNode* BallLarusEdge::getSource() const {
-  return(_source);
-}
-
-// Returns the target node of this edge.
-BallLarusNode* BallLarusEdge::getTarget() const {
-  return(_target);
-}
-
-// Sets the type of the edge.
-BallLarusEdge::EdgeType BallLarusEdge::getType() const {
-  return _edgeType;
-}
-
-// Gets the type of the edge.
-void BallLarusEdge::setType(EdgeType type) {
-  _edgeType = type;
-}
-
-// Returns the weight of this edge.  Used to decode path numbers to sequences
-// of basic blocks.
-unsigned BallLarusEdge::getWeight() {
-  return(_weight);
-}
-
-// Sets the weight of the edge.  Used during path numbering.
-void BallLarusEdge::setWeight(unsigned weight) {
-  _weight = weight;
-}
-
-// Gets the phony edge originating at the root.
-BallLarusEdge* BallLarusEdge::getPhonyRoot() {
-  return _phonyRoot;
-}
-
-// Sets the phony edge originating at the root.
-void BallLarusEdge::setPhonyRoot(BallLarusEdge* phonyRoot) {
-  _phonyRoot = phonyRoot;
-}
-
-// Gets the phony edge terminating at the exit.
-BallLarusEdge* BallLarusEdge::getPhonyExit() {
-  return _phonyExit;
-}
-
-// Sets the phony edge terminating at the exit.
-void BallLarusEdge::setPhonyExit(BallLarusEdge* phonyExit) {
-  _phonyExit = phonyExit;
-}
-
-// Gets the associated real edge if this is a phony edge.
-BallLarusEdge* BallLarusEdge::getRealEdge() {
-  return _realEdge;
-}
-
-// Sets the associated real edge if this is a phony edge.
-void BallLarusEdge::setRealEdge(BallLarusEdge* realEdge) {
-  _realEdge = realEdge;
-}
-
-// Returns the duplicate number of the edge.
-unsigned BallLarusEdge::getDuplicateNumber() {
-  return(_duplicateNumber);
-}
-
-// Initialization that requires virtual functions which are not fully
-// functional in the constructor.
-void BallLarusDag::init() {
-  BLBlockNodeMap inDag;
-  std::stack<BallLarusNode*> dfsStack;
-
-  _root = addNode(&(_function.getEntryBlock()));
-  _exit = addNode(NULL);
-
-  // start search from root
-  dfsStack.push(getRoot());
-
-  // dfs to add each bb into the dag
-  while(dfsStack.size())
-    buildNode(inDag, dfsStack);
-
-  // put in the final edge
-  addEdge(getExit(),getRoot(),0);
-}
-
-// Frees all memory associated with the DAG.
-BallLarusDag::~BallLarusDag() {
-  for(BLEdgeIterator edge = _edges.begin(), end = _edges.end(); edge != end;
-      ++edge)
-    delete (*edge);
-
-  for(BLNodeIterator node = _nodes.begin(), end = _nodes.end(); node != end;
-      ++node)
-    delete (*node);
-}
-
-// Calculate the path numbers by assigning edge increments as prescribed
-// in Ball-Larus path profiling.
-void BallLarusDag::calculatePathNumbers() {
-  BallLarusNode* node;
-  std::queue<BallLarusNode*> bfsQueue;
-  bfsQueue.push(getExit());
-
-  while(bfsQueue.size() > 0) {
-    node = bfsQueue.front();
-
-    DEBUG(dbgs() << "calculatePathNumbers on " << node->getName() << "\n");
-
-    bfsQueue.pop();
-    unsigned prevPathNumber = node->getNumberPaths();
-    calculatePathNumbersFrom(node);
-
-    // Check for DAG splitting
-    if( node->getNumberPaths() > 100000000 && node != getRoot() ) {
-      // Add new phony edge from the split-node to the DAG's exit
-      BallLarusEdge* exitEdge = addEdge(node, getExit(), 0);
-      exitEdge->setType(BallLarusEdge::SPLITEDGE_PHONY);
-
-      // Counters to handle the possibility of a multi-graph
-      BasicBlock* oldTarget = 0;
-      unsigned duplicateNumber = 0;
-
-      // Iterate through each successor edge, adding phony edges
-      for( BLEdgeIterator succ = node->succBegin(), end = node->succEnd();
-           succ != end; oldTarget = (*succ)->getTarget()->getBlock(), succ++ ) {
-
-        if( (*succ)->getType() == BallLarusEdge::NORMAL ) {
-          // is this edge a duplicate?
-          if( oldTarget != (*succ)->getTarget()->getBlock() )
-            duplicateNumber = 0;
-
-          // create the new phony edge: root -> succ
-          BallLarusEdge* rootEdge =
-            addEdge(getRoot(), (*succ)->getTarget(), duplicateNumber++);
-          rootEdge->setType(BallLarusEdge::SPLITEDGE_PHONY);
-          rootEdge->setRealEdge(*succ);
-
-          // split on this edge and reference it's exit/root phony edges
-          (*succ)->setType(BallLarusEdge::SPLITEDGE);
-          (*succ)->setPhonyRoot(rootEdge);
-          (*succ)->setPhonyExit(exitEdge);
-          (*succ)->setWeight(0);
-        }
-      }
-
-      calculatePathNumbersFrom(node);
-    }
-
-    DEBUG(dbgs() << "prev, new number paths " << prevPathNumber << ", "
-          << node->getNumberPaths() << ".\n");
-
-    if(prevPathNumber == 0 && node->getNumberPaths() != 0) {
-      DEBUG(dbgs() << "node ready : " << node->getName() << "\n");
-      for(BLEdgeIterator pred = node->predBegin(), end = node->predEnd();
-          pred != end; pred++) {
-        if( (*pred)->getType() == BallLarusEdge::BACKEDGE ||
-            (*pred)->getType() == BallLarusEdge::SPLITEDGE )
-          continue;
-
-        BallLarusNode* nextNode = (*pred)->getSource();
-        // not yet visited?
-        if(nextNode->getNumberPaths() == 0)
-          bfsQueue.push(nextNode);
-      }
-    }
-  }
-
-  DEBUG(dbgs() << "\tNumber of paths: " << getRoot()->getNumberPaths() << "\n");
-}
-
-// Returns the number of paths for the Dag.
-unsigned BallLarusDag::getNumberOfPaths() {
-  return(getRoot()->getNumberPaths());
-}
-
-// Returns the root (i.e. entry) node for the DAG.
-BallLarusNode* BallLarusDag::getRoot() {
-  return _root;
-}
-
-// Returns the exit node for the DAG.
-BallLarusNode* BallLarusDag::getExit() {
-  return _exit;
-}
-
-// Returns the function for the DAG.
-Function& BallLarusDag::getFunction() {
-  return(_function);
-}
-
-// Clears the node colors.
-void BallLarusDag::clearColors(BallLarusNode::NodeColor color) {
-  for (BLNodeIterator nodeIt = _nodes.begin(); nodeIt != _nodes.end(); nodeIt++)
-    (*nodeIt)->setColor(color);
-}
-
-// Processes one node and its imediate edges for building the DAG.
-void BallLarusDag::buildNode(BLBlockNodeMap& inDag, BLNodeStack& dfsStack) {
-  BallLarusNode* currentNode = dfsStack.top();
-  BasicBlock* currentBlock = currentNode->getBlock();
-
-  if(currentNode->getColor() != BallLarusNode::WHITE) {
-    // we have already visited this node
-    dfsStack.pop();
-    currentNode->setColor(BallLarusNode::BLACK);
-  } else {
-    // are there any external procedure calls?
-    if( ProcessEarlyTermination ) {
-      for( BasicBlock::iterator bbCurrent = currentNode->getBlock()->begin(),
-             bbEnd = currentNode->getBlock()->end(); bbCurrent != bbEnd;
-           bbCurrent++ ) {
-        Instruction& instr = *bbCurrent;
-        if( instr.getOpcode() == Instruction::Call ) {
-          BallLarusEdge* callEdge = addEdge(currentNode, getExit(), 0);
-          callEdge->setType(BallLarusEdge::CALLEDGE_PHONY);
-          break;
-        }
-      }
-    }
-
-    TerminatorInst* terminator = currentNode->getBlock()->getTerminator();
-    if(isa<ReturnInst>(terminator) || isa<UnreachableInst>(terminator) ||
-       isa<ResumeInst>(terminator))
-      addEdge(currentNode, getExit(),0);
-
-    currentNode->setColor(BallLarusNode::GRAY);
-    inDag[currentBlock] = currentNode;
-
-    BasicBlock* oldSuccessor = 0;
-    unsigned duplicateNumber = 0;
-
-    // iterate through this node's successors
-    for(succ_iterator successor = succ_begin(currentBlock),
-          succEnd = succ_end(currentBlock); successor != succEnd;
-        oldSuccessor = *successor, ++successor ) {
-      BasicBlock* succBB = *successor;
-
-      // is this edge a duplicate?
-      if (oldSuccessor == succBB)
-        duplicateNumber++;
-      else
-        duplicateNumber = 0;
-
-      buildEdge(inDag, dfsStack, currentNode, succBB, duplicateNumber);
-    }
-  }
-}
-
-// Process an edge in the CFG for DAG building.
-void BallLarusDag::buildEdge(BLBlockNodeMap& inDag, std::stack<BallLarusNode*>&
-                             dfsStack, BallLarusNode* currentNode,
-                             BasicBlock* succBB, unsigned duplicateCount) {
-  BallLarusNode* succNode = inDag[succBB];
-
-  if(succNode && succNode->getColor() == BallLarusNode::BLACK) {
-    // visited node and forward edge
-    addEdge(currentNode, succNode, duplicateCount);
-  } else if(succNode && succNode->getColor() == BallLarusNode::GRAY) {
-    // visited node and back edge
-    DEBUG(dbgs() << "Backedge detected.\n");
-    addBackedge(currentNode, succNode, duplicateCount);
-  } else {
-    BallLarusNode* childNode;
-    // not visited node and forward edge
-    if(succNode) // an unvisited node that is child of a gray node
-      childNode = succNode;
-    else { // an unvisited node that is a child of a an unvisted node
-      childNode = addNode(succBB);
-      inDag[succBB] = childNode;
-    }
-    addEdge(currentNode, childNode, duplicateCount);
-    dfsStack.push(childNode);
-  }
-}
-
-// The weight on each edge is the increment required along any path that
-// contains that edge.
-void BallLarusDag::calculatePathNumbersFrom(BallLarusNode* node) {
-  if(node == getExit())
-    // The Exit node must be base case
-    node->setNumberPaths(1);
-  else {
-    unsigned sumPaths = 0;
-    BallLarusNode* succNode;
-
-    for(BLEdgeIterator succ = node->succBegin(), end = node->succEnd();
-        succ != end; succ++) {
-      if( (*succ)->getType() == BallLarusEdge::BACKEDGE ||
-          (*succ)->getType() == BallLarusEdge::SPLITEDGE )
-        continue;
-
-      (*succ)->setWeight(sumPaths);
-      succNode = (*succ)->getTarget();
-
-      if( !succNode->getNumberPaths() )
-        return;
-      sumPaths += succNode->getNumberPaths();
-    }
-
-    node->setNumberPaths(sumPaths);
-  }
-}
-
-// Allows subclasses to determine which type of Node is created.
-// Override this method to produce subclasses of BallLarusNode if
-// necessary. The destructor of BallLarusDag will call free on each
-// pointer created.
-BallLarusNode* BallLarusDag::createNode(BasicBlock* BB) {
-  return( new BallLarusNode(BB) );
-}
-
-// Allows subclasses to determine which type of Edge is created.
-// Override this method to produce subclasses of BallLarusEdge if
-// necessary. The destructor of BallLarusDag will call free on each
-// pointer created.
-BallLarusEdge* BallLarusDag::createEdge(BallLarusNode* source,
-                                        BallLarusNode* target,
-                                        unsigned duplicateCount) {
-  return( new BallLarusEdge(source, target, duplicateCount) );
-}
-
-// Proxy to node's constructor.  Updates the DAG state.
-BallLarusNode* BallLarusDag::addNode(BasicBlock* BB) {
-  BallLarusNode* newNode = createNode(BB);
-  _nodes.push_back(newNode);
-  return( newNode );
-}
-
-// Proxy to edge's constructor. Updates the DAG state.
-BallLarusEdge* BallLarusDag::addEdge(BallLarusNode* source,
-                                     BallLarusNode* target,
-                                     unsigned duplicateCount) {
-  BallLarusEdge* newEdge = createEdge(source, target, duplicateCount);
-  _edges.push_back(newEdge);
-  source->addSuccEdge(newEdge);
-  target->addPredEdge(newEdge);
-  return(newEdge);
-}
-
-// Adds a backedge with its phony edges. Updates the DAG state.
-void BallLarusDag::addBackedge(BallLarusNode* source, BallLarusNode* target,
-                               unsigned duplicateCount) {
-  BallLarusEdge* childEdge = addEdge(source, target, duplicateCount);
-  childEdge->setType(BallLarusEdge::BACKEDGE);
-
-  childEdge->setPhonyRoot(addEdge(getRoot(), target,0));
-  childEdge->setPhonyExit(addEdge(source, getExit(),0));
-
-  childEdge->getPhonyRoot()->setRealEdge(childEdge);
-  childEdge->getPhonyRoot()->setType(BallLarusEdge::BACKEDGE_PHONY);
-
-  childEdge->getPhonyExit()->setRealEdge(childEdge);
-  childEdge->getPhonyExit()->setType(BallLarusEdge::BACKEDGE_PHONY);
-  _backEdges.push_back(childEdge);
-}
diff --git a/lib/Analysis/PathProfileInfo.cpp b/lib/Analysis/PathProfileInfo.cpp
deleted file mode 100644
index bc53221..0000000
--- a/lib/Analysis/PathProfileInfo.cpp
+++ /dev/null
@@ -1,433 +0,0 @@
-//===- PathProfileInfo.cpp ------------------------------------*- C++ -*---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interface used by optimizers to load path profiles,
-// and provides a loader pass which reads a path profile file.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "path-profile-info"
-
-#include "llvm/Analysis/PathProfileInfo.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdio>
-
-using namespace llvm;
-
-// command line option for loading path profiles
-static cl::opt<std::string>
-PathProfileInfoFilename("path-profile-loader-file", cl::init("llvmprof.out"),
-  cl::value_desc("filename"),
-  cl::desc("Path profile file loaded by -path-profile-loader"), cl::Hidden);
-
-namespace {
-  class PathProfileLoaderPass : public ModulePass, public PathProfileInfo {
-  public:
-    PathProfileLoaderPass() : ModulePass(ID) { }
-    ~PathProfileLoaderPass();
-
-    // this pass doesn't change anything (only loads information)
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-    }
-
-    // the full name of the loader pass
-    virtual const char* getPassName() const {
-      return "Path Profiling Information Loader";
-    }
-
-    // required since this pass implements multiple inheritance
-                virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-      if (PI == &PathProfileInfo::ID)
-        return (PathProfileInfo*)this;
-      return this;
-    }
-
-    // entry point to run the pass
-    bool runOnModule(Module &M);
-
-    // pass identification
-    static char ID;
-
-  private:
-    // make a reference table to refer to function by number
-    void buildFunctionRefs(Module &M);
-
-    // process argument info of a program from the input file
-    void handleArgumentInfo();
-
-    // process path number information from the input file
-    void handlePathInfo();
-
-    // array of references to the functions in the module
-    std::vector<Function*> _functions;
-
-    // path profile file handle
-    FILE* _file;
-
-    // path profile file name
-    std::string _filename;
-  };
-}
-
-// register PathLoader
-char PathProfileLoaderPass::ID = 0;
-
-INITIALIZE_ANALYSIS_GROUP(PathProfileInfo, "Path Profile Information",
-                          NoPathProfileInfo)
-INITIALIZE_AG_PASS(PathProfileLoaderPass, PathProfileInfo,
-                   "path-profile-loader",
-                   "Load path profile information from file",
-                   false, true, false)
-
-char &llvm::PathProfileLoaderPassID = PathProfileLoaderPass::ID;
-
-// link PathLoader as a pass, and make it available as an optimisation
-ModulePass *llvm::createPathProfileLoaderPass() {
-  return new PathProfileLoaderPass;
-}
-
-// ----------------------------------------------------------------------------
-// PathEdge implementation
-//
-ProfilePathEdge::ProfilePathEdge (BasicBlock* source, BasicBlock* target,
-                                  unsigned duplicateNumber)
-  : _source(source), _target(target), _duplicateNumber(duplicateNumber) {}
-
-// ----------------------------------------------------------------------------
-// Path implementation
-//
-
-ProfilePath::ProfilePath (unsigned int number, unsigned int count,
-                          double countStdDev,   PathProfileInfo* ppi)
-  : _number(number) , _count(count), _countStdDev(countStdDev), _ppi(ppi) {}
-
-double ProfilePath::getFrequency() const {
-  return 100 * double(_count) /
-    double(_ppi->_functionPathCounts[_ppi->_currentFunction]);
-}
-
-static BallLarusEdge* getNextEdge (BallLarusNode* node,
-                                   unsigned int pathNumber) {
-  BallLarusEdge* best = 0;
-
-  for( BLEdgeIterator next = node->succBegin(),
-         end = node->succEnd(); next != end; next++ ) {
-    if( (*next)->getType() != BallLarusEdge::BACKEDGE && // no backedges
-        (*next)->getType() != BallLarusEdge::SPLITEDGE && // no split edges
-        (*next)->getWeight() <= pathNumber && // weight must be <= pathNumber
-        (!best || (best->getWeight() < (*next)->getWeight())) ) // best one?
-      best = *next;
-  }
-
-  return best;
-}
-
-ProfilePathEdgeVector* ProfilePath::getPathEdges() const {
-  BallLarusNode* currentNode = _ppi->_currentDag->getRoot ();
-  unsigned int increment = _number;
-  ProfilePathEdgeVector* pev = new ProfilePathEdgeVector;
-
-  while (currentNode != _ppi->_currentDag->getExit()) {
-    BallLarusEdge* next = getNextEdge(currentNode, increment);
-
-    increment -= next->getWeight();
-
-    if( next->getType() != BallLarusEdge::BACKEDGE_PHONY &&
-        next->getType() != BallLarusEdge::SPLITEDGE_PHONY &&
-        next->getTarget() != _ppi->_currentDag->getExit() )
-      pev->push_back(ProfilePathEdge(
-                       next->getSource()->getBlock(),
-                       next->getTarget()->getBlock(),
-                       next->getDuplicateNumber()));
-
-    if( next->getType() == BallLarusEdge::BACKEDGE_PHONY &&
-        next->getTarget() == _ppi->_currentDag->getExit() )
-      pev->push_back(ProfilePathEdge(
-                       next->getRealEdge()->getSource()->getBlock(),
-                       next->getRealEdge()->getTarget()->getBlock(),
-                       next->getDuplicateNumber()));
-
-    if( next->getType() == BallLarusEdge::SPLITEDGE_PHONY &&
-        next->getSource() == _ppi->_currentDag->getRoot() )
-      pev->push_back(ProfilePathEdge(
-                       next->getRealEdge()->getSource()->getBlock(),
-                       next->getRealEdge()->getTarget()->getBlock(),
-                       next->getDuplicateNumber()));
-
-    // set the new node
-    currentNode = next->getTarget();
-  }
-
-  return pev;
-}
-
-ProfilePathBlockVector* ProfilePath::getPathBlocks() const {
-  BallLarusNode* currentNode = _ppi->_currentDag->getRoot ();
-  unsigned int increment = _number;
-  ProfilePathBlockVector* pbv = new ProfilePathBlockVector;
-
-  while (currentNode != _ppi->_currentDag->getExit()) {
-    BallLarusEdge* next = getNextEdge(currentNode, increment);
-    increment -= next->getWeight();
-
-    // add block to the block list if it is a real edge
-    if( next->getType() == BallLarusEdge::NORMAL)
-      pbv->push_back (currentNode->getBlock());
-    // make the back edge the last edge since we are at the end
-    else if( next->getTarget() == _ppi->_currentDag->getExit() ) {
-      pbv->push_back (currentNode->getBlock());
-      pbv->push_back (next->getRealEdge()->getTarget()->getBlock());
-    }
-
-    // set the new node
-    currentNode = next->getTarget();
-  }
-
-  return pbv;
-}
-
-BasicBlock* ProfilePath::getFirstBlockInPath() const {
-  BallLarusNode* root = _ppi->_currentDag->getRoot();
-  BallLarusEdge* edge = getNextEdge(root, _number);
-
-  if( edge && (edge->getType() == BallLarusEdge::BACKEDGE_PHONY ||
-               edge->getType() == BallLarusEdge::SPLITEDGE_PHONY) )
-    return edge->getTarget()->getBlock();
-
-  return root->getBlock();
-}
-
-// ----------------------------------------------------------------------------
-// PathProfileInfo implementation
-//
-
-// Pass identification
-char llvm::PathProfileInfo::ID = 0;
-
-PathProfileInfo::PathProfileInfo () : _currentDag(0) , _currentFunction(0) {
-}
-
-PathProfileInfo::~PathProfileInfo() {
-  if (_currentDag)
-    delete _currentDag;
-}
-
-// set the function for which paths are currently begin processed
-void PathProfileInfo::setCurrentFunction(Function* F) {
-  // Make sure it exists
-  if (!F) return;
-
-  if (_currentDag)
-    delete _currentDag;
-
-  _currentFunction = F;
-  _currentDag = new BallLarusDag(*F);
-  _currentDag->init();
-  _currentDag->calculatePathNumbers();
-}
-
-// get the function for which paths are currently being processed
-Function* PathProfileInfo::getCurrentFunction() const {
-  return _currentFunction;
-}
-
-// get the entry block of the function
-BasicBlock* PathProfileInfo::getCurrentFunctionEntry() {
-  return _currentDag->getRoot()->getBlock();
-}
-
-// return the path based on its number
-ProfilePath* PathProfileInfo::getPath(unsigned int number) {
-  return _functionPaths[_currentFunction][number];
-}
-
-// return the number of paths which a function may potentially execute
-unsigned int PathProfileInfo::getPotentialPathCount() {
-  return _currentDag ? _currentDag->getNumberOfPaths() : 0;
-}
-
-// return an iterator for the beginning of a functions executed paths
-ProfilePathIterator PathProfileInfo::pathBegin() {
-  return _functionPaths[_currentFunction].begin();
-}
-
-// return an iterator for the end of a functions executed paths
-ProfilePathIterator PathProfileInfo::pathEnd() {
-  return _functionPaths[_currentFunction].end();
-}
-
-// returns the total number of paths run in the function
-unsigned int PathProfileInfo::pathsRun() {
-  return _currentFunction ? _functionPaths[_currentFunction].size() : 0;
-}
-
-// ----------------------------------------------------------------------------
-// PathLoader implementation
-//
-
-// remove all generated paths
-PathProfileLoaderPass::~PathProfileLoaderPass() {
-  for( FunctionPathIterator funcNext = _functionPaths.begin(),
-         funcEnd = _functionPaths.end(); funcNext != funcEnd; funcNext++)
-    for( ProfilePathIterator pathNext = funcNext->second.begin(),
-           pathEnd = funcNext->second.end(); pathNext != pathEnd; pathNext++)
-      delete pathNext->second;
-}
-
-// entry point of the pass; this loads and parses a file
-bool PathProfileLoaderPass::runOnModule(Module &M) {
-  // get the filename and setup the module's function references
-  _filename = PathProfileInfoFilename;
-  buildFunctionRefs (M);
-
-  if (!(_file = fopen(_filename.c_str(), "rb"))) {
-    errs () << "error: input '" << _filename << "' file does not exist.\n";
-    return false;
-  }
-
-  ProfilingType profType;
-
-  while( fread(&profType, sizeof(ProfilingType), 1, _file) ) {
-    switch (profType) {
-    case ArgumentInfo:
-      handleArgumentInfo ();
-      break;
-    case PathInfo:
-      handlePathInfo ();
-      break;
-    default:
-      errs () << "error: bad path profiling file syntax, " << profType << "\n";
-      fclose (_file);
-      return false;
-    }
-  }
-
-  fclose (_file);
-
-  return true;
-}
-
-// create a reference table for functions defined in the path profile file
-void PathProfileLoaderPass::buildFunctionRefs (Module &M) {
-  _functions.push_back(0); // make the 0 index a null pointer
-
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) {
-    if (F->isDeclaration())
-      continue;
-    _functions.push_back(F);
-  }
-}
-
-// handle command like argument infor in the output file
-void PathProfileLoaderPass::handleArgumentInfo() {
-  // get the argument list's length
-  unsigned savedArgsLength;
-  if( fread(&savedArgsLength, sizeof(unsigned), 1, _file) != 1 ) {
-    errs() << "warning: argument info header/data mismatch\n";
-    return;
-  }
-
-  // allocate a buffer, and get the arguments
-  char* args = new char[savedArgsLength+1];
-  if( fread(args, 1, savedArgsLength, _file) != savedArgsLength )
-    errs() << "warning: argument info header/data mismatch\n";
-
-  args[savedArgsLength] = '\0';
-  argList = std::string(args);
-  delete [] args; // cleanup dynamic string
-
-  // byte alignment
-  if (savedArgsLength & 3)
-    fseek(_file, 4-(savedArgsLength&3), SEEK_CUR);
-}
-
-// Handle path profile information in the output file
-void PathProfileLoaderPass::handlePathInfo () {
-  // get the number of functions in this profile
-  unsigned functionCount;
-  if( fread(&functionCount, sizeof(functionCount), 1, _file) != 1 ) {
-    errs() << "warning: path info header/data mismatch\n";
-    return;
-  }
-
-  // gather path information for each function
-  for (unsigned i = 0; i < functionCount; i++) {
-    PathProfileHeader pathHeader;
-    if( fread(&pathHeader, sizeof(pathHeader), 1, _file) != 1 ) {
-      errs() << "warning: bad header for path function info\n";
-      break;
-    }
-
-    Function* f = _functions[pathHeader.fnNumber];
-
-    // dynamically allocate a table to store path numbers
-    PathProfileTableEntry* pathTable =
-      new PathProfileTableEntry[pathHeader.numEntries];
-
-    if( fread(pathTable, sizeof(PathProfileTableEntry),
-              pathHeader.numEntries, _file) != pathHeader.numEntries) {
-      delete [] pathTable;
-      errs() << "warning: path function info header/data mismatch\n";
-      return;
-    }
-
-    // Build a new path for the current function
-    unsigned int totalPaths = 0;
-    for (unsigned int j = 0; j < pathHeader.numEntries; j++) {
-      totalPaths += pathTable[j].pathCounter;
-      _functionPaths[f][pathTable[j].pathNumber]
-        = new ProfilePath(pathTable[j].pathNumber, pathTable[j].pathCounter,
-                          0, this);
-    }
-
-    _functionPathCounts[f] = totalPaths;
-
-    delete [] pathTable;
-  }
-}
-
-//===----------------------------------------------------------------------===//
-//  NoProfile PathProfileInfo implementation
-//
-
-namespace {
-  struct NoPathProfileInfo : public ImmutablePass, public PathProfileInfo {
-    static char ID; // Class identification, replacement for typeinfo
-    NoPathProfileInfo() : ImmutablePass(ID) {
-      initializeNoPathProfileInfoPass(*PassRegistry::getPassRegistry());
-    }
-
-    /// getAdjustedAnalysisPointer - This method is used when a pass implements
-    /// an analysis interface through multiple inheritance.  If needed, it
-    /// should override this to adjust the this pointer as needed for the
-    /// specified pass info.
-    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-      if (PI == &PathProfileInfo::ID)
-        return (PathProfileInfo*)this;
-      return this;
-    }
-
-    virtual const char *getPassName() const {
-      return "NoPathProfileInfo";
-    }
-  };
-}  // End of anonymous namespace
-
-char NoPathProfileInfo::ID = 0;
-// Register this pass...
-INITIALIZE_AG_PASS(NoPathProfileInfo, PathProfileInfo, "no-path-profile",
-                   "No Path Profile Information", false, true, true)
-
-ImmutablePass *llvm::createNoPathProfileInfoPass() { return new NoPathProfileInfo(); }
diff --git a/lib/Analysis/PathProfileVerifier.cpp b/lib/Analysis/PathProfileVerifier.cpp
deleted file mode 100644
index 48d7d05..0000000
--- a/lib/Analysis/PathProfileVerifier.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-//===- PathProfileVerifier.cpp --------------------------------*- C++ -*---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This verifier derives an edge profile file from current path profile
-// information
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "path-profile-verifier"
-
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/PathProfileInfo.h"
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <stdio.h>
-
-using namespace llvm;
-
-namespace {
-  class PathProfileVerifier : public ModulePass {
-  private:
-    bool runOnModule(Module &M);
-
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    PathProfileVerifier() : ModulePass(ID) {
-      initializePathProfileVerifierPass(*PassRegistry::getPassRegistry());
-    }
-
-
-    virtual const char *getPassName() const {
-      return "Path Profiler Verifier";
-    }
-
-    // The verifier requires the path profile and edge profile.
-    virtual void getAnalysisUsage(AnalysisUsage& AU) const;
-  };
-}
-
-static cl::opt<std::string>
-EdgeProfileFilename("path-profile-verifier-file",
-  cl::init("edgefrompath.llvmprof.out"),
-  cl::value_desc("filename"),
-  cl::desc("Edge profile file generated by -path-profile-verifier"),
-  cl::Hidden);
-
-char PathProfileVerifier::ID = 0;
-INITIALIZE_PASS(PathProfileVerifier, "path-profile-verifier",
-                "Compare the path profile derived edge profile against the "
-                "edge profile.", true, true)
-
-ModulePass *llvm::createPathProfileVerifierPass() {
-  return new PathProfileVerifier();
-}
-
-// The verifier requires the path profile and edge profile.
-void PathProfileVerifier::getAnalysisUsage(AnalysisUsage& AU) const {
-  AU.addRequired<PathProfileInfo>();
-  AU.addPreserved<PathProfileInfo>();
-}
-
-typedef std::map<unsigned, unsigned> DuplicateToIndexMap;
-typedef std::map<BasicBlock*,DuplicateToIndexMap> BlockToDuplicateMap;
-typedef std::map<BasicBlock*,BlockToDuplicateMap> NestedBlockToIndexMap;
-
-// the verifier iterates through each path to gather the total
-// number of edge frequencies
-bool PathProfileVerifier::runOnModule (Module &M) {
-  PathProfileInfo& pathProfileInfo = getAnalysis<PathProfileInfo>();
-
-  // setup a data structure to map path edges which index an
-  // array of edge counters
-  NestedBlockToIndexMap arrayMap;
-  unsigned i = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-
-    arrayMap[(BasicBlock*)0][F->begin()][0] = i++;
-
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      TerminatorInst *TI = BB->getTerminator();
-
-      unsigned duplicate = 0;
-      BasicBlock* prev = 0;
-      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e;
-           prev = TI->getSuccessor(s), ++s) {
-        if (prev == TI->getSuccessor(s))
-          duplicate++;
-        else duplicate = 0;
-
-        arrayMap[BB][TI->getSuccessor(s)][duplicate] = i++;
-      }
-    }
-  }
-
-  std::vector<unsigned> edgeArray(i);
-
-  // iterate through each path and increment the edge counters as needed
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-
-    pathProfileInfo.setCurrentFunction(F);
-
-    DEBUG(dbgs() << "function '" << F->getName() << "' ran "
-          << pathProfileInfo.pathsRun()
-          << "/" << pathProfileInfo.getPotentialPathCount()
-          << " potential paths\n");
-
-    for( ProfilePathIterator nextPath = pathProfileInfo.pathBegin(),
-           endPath = pathProfileInfo.pathEnd();
-         nextPath != endPath; nextPath++ ) {
-      ProfilePath* currentPath = nextPath->second;
-
-      ProfilePathEdgeVector* pev = currentPath->getPathEdges();
-      DEBUG(dbgs () << "path #" << currentPath->getNumber() << ": "
-            << currentPath->getCount() << "\n");
-      // setup the entry edge (normally path profiling doesn't care about this)
-      if (currentPath->getFirstBlockInPath() == &F->getEntryBlock())
-        edgeArray[arrayMap[(BasicBlock*)0][currentPath->getFirstBlockInPath()][0]]
-          += currentPath->getCount();
-
-      for( ProfilePathEdgeIterator nextEdge = pev->begin(),
-             endEdge = pev->end(); nextEdge != endEdge; nextEdge++ ) {
-        if (nextEdge != pev->begin())
-          DEBUG(dbgs() << " :: ");
-
-        BasicBlock* source = nextEdge->getSource();
-        BasicBlock* target = nextEdge->getTarget();
-        unsigned duplicateNumber = nextEdge->getDuplicateNumber();
-        DEBUG(dbgs() << source->getName() << " --{" << duplicateNumber
-                     << "}--> " << target->getName());
-
-        // Ensure all the referenced edges exist
-        // TODO: make this a separate function
-        if( !arrayMap.count(source) ) {
-          errs() << "  error [" << F->getName() << "()]: source '"
-                 << source->getName()
-                 << "' does not exist in the array map.\n";
-        } else if( !arrayMap[source].count(target) ) {
-          errs() << "  error [" << F->getName() << "()]: target '"
-                 << target->getName()
-                 << "' does not exist in the array map.\n";
-        } else if( !arrayMap[source][target].count(duplicateNumber) ) {
-          errs() << "  error [" << F->getName() << "()]: edge "
-                 << source->getName() << " -> " << target->getName()
-                 << " duplicate number " << duplicateNumber
-                 << " does not exist in the array map.\n";
-        } else {
-          edgeArray[arrayMap[source][target][duplicateNumber]]
-            += currentPath->getCount();
-        }
-      }
-
-      DEBUG(errs() << "\n");
-
-      delete pev;
-    }
-  }
-
-  std::string errorInfo;
-  std::string filename = EdgeProfileFilename;
-
-  // Open a handle to the file
-  FILE* edgeFile = fopen(filename.c_str(),"wb");
-
-  if (!edgeFile) {
-    errs() << "error: unable to open file '" << filename << "' for output.\n";
-    return false;
-  }
-
-  errs() << "Generating edge profile '" << filename << "' ...\n";
-
-  // write argument info
-  unsigned type = ArgumentInfo;
-  unsigned num = pathProfileInfo.argList.size();
-  int zeros = 0;
-
-  fwrite(&type,sizeof(unsigned),1,edgeFile);
-  fwrite(&num,sizeof(unsigned),1,edgeFile);
-  fwrite(pathProfileInfo.argList.c_str(),1,num,edgeFile);
-  if (num&3)
-    fwrite(&zeros, 1, 4-(num&3), edgeFile);
-
-  type = EdgeInfo;
-  num = edgeArray.size();
-  fwrite(&type,sizeof(unsigned),1,edgeFile);
-  fwrite(&num,sizeof(unsigned),1,edgeFile);
-
-  // write each edge to the file
-  for( std::vector<unsigned>::iterator s = edgeArray.begin(),
-         e = edgeArray.end(); s != e; s++)
-    fwrite(&*s, sizeof (unsigned), 1, edgeFile);
-
-  fclose (edgeFile);
-
-  return true;
-}
diff --git a/lib/Analysis/ProfileDataLoader.cpp b/lib/Analysis/ProfileDataLoader.cpp
deleted file mode 100644
index 3d0a1e2..0000000
--- a/lib/Analysis/ProfileDataLoader.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-//===- ProfileDataLoader.cpp - Load profile information from disk ---------===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The ProfileDataLoader class is used to load raw profiling data from the dump
-// file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/ProfileDataLoader.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/Analysis/ProfileDataTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
-#include <cstdio>
-#include <cstdlib>
-using namespace llvm;
-
-raw_ostream &llvm::operator<<(raw_ostream &O, std::pair<const BasicBlock *,
-                                                        const BasicBlock *> E) {
-  O << "(";
-
-  if (E.first)
-    O << E.first->getName();
-  else
-    O << "0";
-
-  O << ",";
-
-  if (E.second)
-    O << E.second->getName();
-  else
-    O << "0";
-
-  return O << ")";
-}
-
-/// AddCounts - Add 'A' and 'B', accounting for the fact that the value of one
-/// (or both) may not be defined.
-static unsigned AddCounts(unsigned A, unsigned B) {
-  // If either value is undefined, use the other.
-  // Undefined + undefined = undefined.
-  if (A == ProfileDataLoader::Uncounted) return B;
-  if (B == ProfileDataLoader::Uncounted) return A;
-
-  return A + B;
-}
-
-/// ReadProfilingData - Load 'NumEntries' items of type 'T' from file 'F'
-template <typename T>
-static void ReadProfilingData(const char *ToolName, FILE *F,
-                              T *Data, size_t NumEntries) {
-  // Read in the block of data...
-  if (fread(Data, sizeof(T), NumEntries, F) != NumEntries)
-    report_fatal_error(Twine(ToolName) + ": Profiling data truncated");
-}
-
-/// ReadProfilingNumEntries - Read how many entries are in this profiling data
-/// packet.
-static unsigned ReadProfilingNumEntries(const char *ToolName, FILE *F,
-                                        bool ShouldByteSwap) {
-  unsigned Entry;
-  ReadProfilingData<unsigned>(ToolName, F, &Entry, 1);
-  return ShouldByteSwap ? ByteSwap_32(Entry) : Entry;
-}
-
-/// ReadProfilingBlock - Read the number of entries in the next profiling data
-/// packet and then accumulate the entries into 'Data'.
-static void ReadProfilingBlock(const char *ToolName, FILE *F,
-                               bool ShouldByteSwap,
-                               SmallVectorImpl<unsigned> &Data) {
-  // Read the number of entries...
-  unsigned NumEntries = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
-
-  // Read in the data.
-  SmallVector<unsigned, 8> TempSpace(NumEntries);
-  ReadProfilingData<unsigned>(ToolName, F, TempSpace.data(), NumEntries);
-
-  // Make sure we have enough space ...
-  if (Data.size() < NumEntries)
-    Data.resize(NumEntries, ProfileDataLoader::Uncounted);
-
-  // Accumulate the data we just read into the existing data.
-  for (unsigned i = 0; i < NumEntries; ++i) {
-    unsigned Entry = ShouldByteSwap ? ByteSwap_32(TempSpace[i]) : TempSpace[i];
-    Data[i] = AddCounts(Entry, Data[i]);
-  }
-}
-
-/// ReadProfilingArgBlock - Read the command line arguments that the progam was
-/// run with when the current profiling data packet(s) were generated.
-static void ReadProfilingArgBlock(const char *ToolName, FILE *F,
-                                  bool ShouldByteSwap,
-                                  SmallVectorImpl<std::string> &CommandLines) {
-  // Read the number of bytes ...
-  unsigned ArgLength = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
-
-  // Read in the arguments (if there are any to read).  Round up the length to
-  // the nearest 4-byte multiple.
-  SmallVector<char, 8> Args(ArgLength+4);
-  if (ArgLength)
-    ReadProfilingData<char>(ToolName, F, Args.data(), (ArgLength+3) & ~3);
-
-  // Store the arguments.
-  CommandLines.push_back(std::string(&Args[0], &Args[ArgLength]));
-}
-
-const unsigned ProfileDataLoader::Uncounted = ~0U;
-
-/// ProfileDataLoader ctor - Read the specified profiling data file, reporting
-/// a fatal error if the file is invalid or broken.
-ProfileDataLoader::ProfileDataLoader(const char *ToolName,
-                                     const std::string &Filename)
-  : Filename(Filename) {
-  FILE *F = fopen(Filename.c_str(), "rb");
-  if (F == 0)
-    report_fatal_error(Twine(ToolName) + ": Error opening '" +
-                       Filename + "': ");
-
-  // Keep reading packets until we run out of them.
-  unsigned PacketType;
-  while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
-    // If the low eight bits of the packet are zero, we must be dealing with an
-    // endianness mismatch.  Byteswap all words read from the profiling
-    // information.  This can happen when the compiler host and target have
-    // different endianness.
-    bool ShouldByteSwap = (char)PacketType == 0;
-    PacketType = ShouldByteSwap ? ByteSwap_32(PacketType) : PacketType;
-
-    switch (PacketType) {
-      case ArgumentInfo:
-        ReadProfilingArgBlock(ToolName, F, ShouldByteSwap, CommandLines);
-        break;
-
-      case EdgeInfo:
-        ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
-        break;
-
-      default:
-        report_fatal_error(std::string(ToolName)
-                           + ": Unknown profiling packet type");
-        break;
-    }
-  }
-
-  fclose(F);
-}
diff --git a/lib/Analysis/ProfileDataLoaderPass.cpp b/lib/Analysis/ProfileDataLoaderPass.cpp
deleted file mode 100644
index 2ee0093..0000000
--- a/lib/Analysis/ProfileDataLoaderPass.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-//===- ProfileDataLoaderPass.cpp - Set branch weight metadata from prof ---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass loads profiling data from a dump file and sets branch weight
-// metadata.
-//
-// TODO: Replace all "profile-metadata-loader" strings with "profile-loader"
-// once ProfileInfo etc. has been removed.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-metadata-loader"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ProfileDataLoader.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(NumEdgesRead, "The # of edges read.");
-STATISTIC(NumTermsAnnotated, "The # of terminator instructions annotated.");
-
-static cl::opt<std::string>
-ProfileMetadataFilename("profile-file", cl::init("llvmprof.out"),
-                  cl::value_desc("filename"),
-                  cl::desc("Profile file loaded by -profile-metadata-loader"));
-
-namespace {
-  /// This pass loads profiling data from a dump file and sets branch weight
-  /// metadata.
-  class ProfileMetadataLoaderPass : public ModulePass {
-    std::string Filename;
-  public:
-    static char ID; // Class identification, replacement for typeinfo
-    explicit ProfileMetadataLoaderPass(const std::string &filename = "")
-        : ModulePass(ID), Filename(filename) {
-      initializeProfileMetadataLoaderPassPass(*PassRegistry::getPassRegistry());
-      if (filename.empty()) Filename = ProfileMetadataFilename;
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-    }
-
-    virtual const char *getPassName() const {
-      return "Profile loader";
-    }
-
-    virtual void readEdge(unsigned, ProfileData&, ProfileData::Edge,
-                          ArrayRef<unsigned>);
-    virtual unsigned matchEdges(Module&, ProfileData&, ArrayRef<unsigned>);
-    virtual void setBranchWeightMetadata(Module&, ProfileData&);
-
-    virtual bool runOnModule(Module &M);
-  };
-}  // End of anonymous namespace
-
-char ProfileMetadataLoaderPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ProfileMetadataLoaderPass, "profile-metadata-loader",
-              "Load profile information from llvmprof.out", false, true)
-INITIALIZE_PASS_END(ProfileMetadataLoaderPass, "profile-metadata-loader",
-              "Load profile information from llvmprof.out", false, true)
-
-char &llvm::ProfileMetadataLoaderPassID = ProfileMetadataLoaderPass::ID;
-
-/// createProfileMetadataLoaderPass - This function returns a Pass that loads
-/// the profiling information for the module from the specified filename,
-/// making it available to the optimizers.
-ModulePass *llvm::createProfileMetadataLoaderPass() { 
-    return new ProfileMetadataLoaderPass();
-}
-ModulePass *llvm::createProfileMetadataLoaderPass(const std::string &Filename) {
-  return new ProfileMetadataLoaderPass(Filename);
-}
-
-/// readEdge - Take the value from a profile counter and assign it to an edge.
-void ProfileMetadataLoaderPass::readEdge(unsigned ReadCount,
-                                         ProfileData &PB, ProfileData::Edge e,
-                                         ArrayRef<unsigned> Counters) {
-  if (ReadCount >= Counters.size()) return;
-
-  unsigned weight = Counters[ReadCount];
-  assert(weight != ProfileDataLoader::Uncounted);
-  PB.addEdgeWeight(e, weight);
-
-  DEBUG(dbgs() << "-- Read Edge Counter for " << e
-               << " (# "<< (ReadCount) << "): "
-               << PB.getEdgeWeight(e) << "\n");
-}
-
-/// matchEdges - Link every profile counter with an edge.
-unsigned ProfileMetadataLoaderPass::matchEdges(Module &M, ProfileData &PB,
-                                               ArrayRef<unsigned> Counters) {
-  if (Counters.size() == 0) return 0;
-
-  unsigned ReadCount = 0;
-
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    DEBUG(dbgs() << "Loading edges in '" << F->getName() << "'\n");
-    readEdge(ReadCount++, PB, PB.getEdge(0, &F->getEntryBlock()), Counters);
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      TerminatorInst *TI = BB->getTerminator();
-      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-        readEdge(ReadCount++, PB, PB.getEdge(BB,TI->getSuccessor(s)),
-                 Counters);
-      }
-    }
-  }
-
-  return ReadCount;
-}
-
-/// setBranchWeightMetadata - Translate the counter values associated with each
-/// edge into branch weights for each conditional branch (a branch with 2 or
-/// more desinations).
-void ProfileMetadataLoaderPass::setBranchWeightMetadata(Module &M,
-                                                        ProfileData &PB) {
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    DEBUG(dbgs() << "Setting branch metadata in '" << F->getName() << "'\n");
-
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      TerminatorInst *TI = BB->getTerminator();
-      unsigned NumSuccessors = TI->getNumSuccessors();
-
-      // If there is only one successor then we can not set a branch
-      // probability as the target is certain.
-      if (NumSuccessors < 2) continue;
-
-      // Load the weights of all edges leading from this terminator.
-      DEBUG(dbgs() << "-- Terminator with " << NumSuccessors
-                   << " successors:\n");
-      SmallVector<uint32_t, 4> Weights(NumSuccessors);
-      for (unsigned s = 0 ; s < NumSuccessors ; ++s) {
-          ProfileData::Edge edge = PB.getEdge(BB, TI->getSuccessor(s));
-          Weights[s] = (uint32_t)PB.getEdgeWeight(edge);
-          DEBUG(dbgs() << "---- Edge '" << edge << "' has weight "
-                       << Weights[s] << "\n");
-      }
-
-      // Set branch weight metadata.  This will set branch probabilities of
-      // 100%/0% if that is true of the dynamic execution.
-      // BranchProbabilityInfo can account for this when it loads this metadata
-      // (it gives the unexectuted branch a weight of 1 for the purposes of
-      // probability calculations).
-      MDBuilder MDB(TI->getContext());
-      MDNode *Node = MDB.createBranchWeights(Weights);
-      TI->setMetadata(LLVMContext::MD_prof, Node);
-      NumTermsAnnotated++;
-    }
-  }
-}
-
-bool ProfileMetadataLoaderPass::runOnModule(Module &M) {
-  ProfileDataLoader PDL("profile-data-loader", Filename);
-  ProfileData PB;
-
-  ArrayRef<unsigned> Counters = PDL.getRawEdgeCounts();
-
-  unsigned ReadCount = matchEdges(M, PB, Counters);
-
-  if (ReadCount != Counters.size()) {
-    errs() << "WARNING: profile information is inconsistent with "
-           << "the current program!\n";
-  }
-  NumEdgesRead = ReadCount;
-
-  setBranchWeightMetadata(M, PB);
-
-  return ReadCount > 0;
-}
diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp
deleted file mode 100644
index 365b64c..0000000
--- a/lib/Analysis/ProfileEstimatorPass.cpp
+++ /dev/null
@@ -1,426 +0,0 @@
-//===- ProfileEstimatorPass.cpp - LLVM Pass to estimate profile info ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a concrete implementation of profiling information that
-// estimates the profiling information in a very crude and unimaginative way.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-estimator"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-static cl::opt<double>
-LoopWeight(
-    "profile-estimator-loop-weight", cl::init(10),
-    cl::value_desc("loop-weight"),
-    cl::desc("Number of loop executions used for profile-estimator")
-);
-
-namespace {
-  class ProfileEstimatorPass : public FunctionPass, public ProfileInfo {
-    double ExecCount;
-    LoopInfo *LI;
-    std::set<BasicBlock*>  BBToVisit;
-    std::map<Loop*,double> LoopExitWeights;
-    std::map<Edge,double>  MinimalWeight;
-  public:
-    static char ID; // Class identification, replacement for typeinfo
-    explicit ProfileEstimatorPass(const double execcount = 0)
-        : FunctionPass(ID), ExecCount(execcount) {
-      initializeProfileEstimatorPassPass(*PassRegistry::getPassRegistry());
-      if (execcount == 0) ExecCount = LoopWeight;
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-      AU.addRequired<LoopInfo>();
-    }
-
-    virtual const char *getPassName() const {
-      return "Profiling information estimator";
-    }
-
-    /// run - Estimate the profile information from the specified file.
-    virtual bool runOnFunction(Function &F);
-
-    /// getAdjustedAnalysisPointer - This method is used when a pass implements
-    /// an analysis interface through multiple inheritance.  If needed, it
-    /// should override this to adjust the this pointer as needed for the
-    /// specified pass info.
-    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-      if (PI == &ProfileInfo::ID)
-        return (ProfileInfo*)this;
-      return this;
-    }
-    
-    virtual void recurseBasicBlock(BasicBlock *BB);
-
-    void inline printEdgeWeight(Edge);
-  };
-}  // End of anonymous namespace
-
-char ProfileEstimatorPass::ID = 0;
-INITIALIZE_AG_PASS_BEGIN(ProfileEstimatorPass, ProfileInfo, "profile-estimator",
-                "Estimate profiling information", false, true, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
-INITIALIZE_AG_PASS_END(ProfileEstimatorPass, ProfileInfo, "profile-estimator",
-                "Estimate profiling information", false, true, false)
-
-namespace llvm {
-  char &ProfileEstimatorPassID = ProfileEstimatorPass::ID;
-
-  FunctionPass *createProfileEstimatorPass() {
-    return new ProfileEstimatorPass();
-  }
-
-  /// createProfileEstimatorPass - This function returns a Pass that estimates
-  /// profiling information using the given loop execution count.
-  Pass *createProfileEstimatorPass(const unsigned execcount) {
-    return new ProfileEstimatorPass(execcount);
-  }
-}
-
-static double ignoreMissing(double w) {
-  if (w == ProfileInfo::MissingValue) return 0;
-  return w;
-}
-
-static void inline printEdgeError(ProfileInfo::Edge e, const char *M) {
-  DEBUG(dbgs() << "-- Edge " << e << " is not calculated, " << M << "\n");
-}
-
-void inline ProfileEstimatorPass::printEdgeWeight(Edge E) {
-  DEBUG(dbgs() << "-- Weight of Edge " << E << ":"
-               << format("%20.20g", getEdgeWeight(E)) << "\n");
-}
-
-// recurseBasicBlock() - This calculates the ProfileInfo estimation for a
-// single block and then recurses into the successors.
-// The algorithm preserves the flow condition, meaning that the sum of the
-// weight of the incoming edges must be equal the block weight which must in
-// turn be equal to the sume of the weights of the outgoing edges.
-// Since the flow of an block is deterimined from the current state of the
-// flow, once an edge has a flow assigned this flow is never changed again,
-// otherwise it would be possible to violate the flow condition in another
-// block.
-void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) {
-
-  // Break the recursion if this BasicBlock was already visited.
-  if (BBToVisit.find(BB) == BBToVisit.end()) return;
-
-  // Read the LoopInfo for this block.
-  bool  BBisHeader = LI->isLoopHeader(BB);
-  Loop* BBLoop     = LI->getLoopFor(BB);
-
-  // To get the block weight, read all incoming edges.
-  double BBWeight = 0;
-  std::set<BasicBlock*> ProcessedPreds;
-  for ( pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-        bbi != bbe; ++bbi ) {
-    // If this block was not considered already, add weight.
-    Edge edge = getEdge(*bbi,BB);
-    double w = getEdgeWeight(edge);
-    if (ProcessedPreds.insert(*bbi).second) {
-      BBWeight += ignoreMissing(w);
-    }
-    // If this block is a loop header and the predecessor is contained in this
-    // loop, thus the edge is a backedge, continue and do not check if the
-    // value is valid.
-    if (BBisHeader && BBLoop->contains(*bbi)) {
-      printEdgeError(edge, "but is backedge, continuing");
-      continue;
-    }
-    // If the edges value is missing (and this is no loop header, and this is
-    // no backedge) return, this block is currently non estimatable.
-    if (w == MissingValue) {
-      printEdgeError(edge, "returning");
-      return;
-    }
-  }
-  if (getExecutionCount(BB) != MissingValue) {
-    BBWeight = getExecutionCount(BB);
-  }
-
-  // Fetch all necessary information for current block.
-  SmallVector<Edge, 8> ExitEdges;
-  SmallVector<Edge, 8> Edges;
-  if (BBLoop) {
-    BBLoop->getExitEdges(ExitEdges);
-  }
-
-  // If this is a loop header, consider the following:
-  // Exactly the flow that is entering this block, must exit this block too. So
-  // do the following: 
-  // *) get all the exit edges, read the flow that is already leaving this
-  // loop, remember the edges that do not have any flow on them right now.
-  // (The edges that have already flow on them are most likely exiting edges of
-  // other loops, do not touch those flows because the previously caclulated
-  // loopheaders would not be exact anymore.)
-  // *) In case there is not a single exiting edge left, create one at the loop
-  // latch to prevent the flow from building up in the loop.
-  // *) Take the flow that is not leaving the loop already and distribute it on
-  // the remaining exiting edges.
-  // (This ensures that all flow that enters the loop also leaves it.)
-  // *) Increase the flow into the loop by increasing the weight of this block.
-  // There is at least one incoming backedge that will bring us this flow later
-  // on. (So that the flow condition in this node is valid again.)
-  if (BBisHeader) {
-    double incoming = BBWeight;
-    // Subtract the flow leaving the loop.
-    std::set<Edge> ProcessedExits;
-    for (SmallVectorImpl<Edge>::iterator ei = ExitEdges.begin(),
-         ee = ExitEdges.end(); ei != ee; ++ei) {
-      if (ProcessedExits.insert(*ei).second) {
-        double w = getEdgeWeight(*ei);
-        if (w == MissingValue) {
-          Edges.push_back(*ei);
-          // Check if there is a necessary minimal weight, if yes, subtract it 
-          // from weight.
-          if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
-            incoming -= MinimalWeight[*ei];
-            DEBUG(dbgs() << "Reserving " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
-          }
-        } else {
-          incoming -= w;
-        }
-      }
-    }
-    // If no exit edges, create one:
-    if (Edges.size() == 0) {
-      BasicBlock *Latch = BBLoop->getLoopLatch();
-      if (Latch) {
-        Edge edge = getEdge(Latch,0);
-        EdgeInformation[BB->getParent()][edge] = BBWeight;
-        printEdgeWeight(edge);
-        edge = getEdge(Latch, BB);
-        EdgeInformation[BB->getParent()][edge] = BBWeight * ExecCount;
-        printEdgeWeight(edge);
-      }
-    }
-
-    // Distribute remaining weight to the exting edges. To prevent fractions
-    // from building up and provoking precision problems the weight which is to
-    // be distributed is split and the rounded, the last edge gets a somewhat
-    // bigger value, but we are close enough for an estimation.
-    double fraction = floor(incoming/Edges.size());
-    for (SmallVectorImpl<Edge>::iterator ei = Edges.begin(), ee = Edges.end();
-         ei != ee; ++ei) {
-      double w = 0;
-      if (ei != (ee-1)) {
-        w = fraction;
-        incoming -= fraction;
-      } else {
-        w = incoming;
-      }
-      EdgeInformation[BB->getParent()][*ei] += w;
-      // Read necessary minimal weight.
-      if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
-        EdgeInformation[BB->getParent()][*ei] += MinimalWeight[*ei];
-        DEBUG(dbgs() << "Additionally " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
-      }
-      printEdgeWeight(*ei);
-      
-      // Add minimal weight to paths to all exit edges, this is used to ensure
-      // that enough flow is reaching this edges.
-      Path p;
-      const BasicBlock *Dest = GetPath(BB, (*ei).first, p, GetPathToDest);
-      while (Dest != BB) {
-        const BasicBlock *Parent = p.find(Dest)->second;
-        Edge e = getEdge(Parent, Dest);
-        if (MinimalWeight.find(e) == MinimalWeight.end()) {
-          MinimalWeight[e] = 0;
-        }
-        MinimalWeight[e] += w;
-        DEBUG(dbgs() << "Minimal Weight for " << e << ": " << format("%.20g",MinimalWeight[e]) << "\n");
-        Dest = Parent;
-      }
-    }
-    // Increase flow into the loop.
-    BBWeight *= (ExecCount+1);
-  }
-
-  BlockInformation[BB->getParent()][BB] = BBWeight;
-  // Up until now we considered only the loop exiting edges, now we have a
-  // definite block weight and must distribute this onto the outgoing edges.
-  // Since there may be already flow attached to some of the edges, read this
-  // flow first and remember the edges that have still now flow attached.
-  Edges.clear();
-  std::set<BasicBlock*> ProcessedSuccs;
-
-  succ_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-  // Also check for (BB,0) edges that may already contain some flow. (But only
-  // in case there are no successors.)
-  if (bbi == bbe) {
-    Edge edge = getEdge(BB,0);
-    EdgeInformation[BB->getParent()][edge] = BBWeight;
-    printEdgeWeight(edge);
-  }
-  for ( ; bbi != bbe; ++bbi ) {
-    if (ProcessedSuccs.insert(*bbi).second) {
-      Edge edge = getEdge(BB,*bbi);
-      double w = getEdgeWeight(edge);
-      if (w != MissingValue) {
-        BBWeight -= getEdgeWeight(edge);
-      } else {
-        Edges.push_back(edge);
-        // If minimal weight is necessary, reserve weight by subtracting weight
-        // from block weight, this is readded later on.
-        if (MinimalWeight.find(edge) != MinimalWeight.end()) {
-          BBWeight -= MinimalWeight[edge];
-          DEBUG(dbgs() << "Reserving " << format("%.20g",MinimalWeight[edge]) << " at " << edge << "\n");
-        }
-      }
-    }
-  }
-
-  double fraction = Edges.size() ? floor(BBWeight/Edges.size()) : 0.0;
-  // Finally we know what flow is still not leaving the block, distribute this
-  // flow onto the empty edges.
-  for (SmallVectorImpl<Edge>::iterator ei = Edges.begin(), ee = Edges.end();
-       ei != ee; ++ei) {
-    if (ei != (ee-1)) {
-      EdgeInformation[BB->getParent()][*ei] += fraction;
-      BBWeight -= fraction;
-    } else {
-      EdgeInformation[BB->getParent()][*ei] += BBWeight;
-    }
-    // Readd minial necessary weight.
-    if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
-      EdgeInformation[BB->getParent()][*ei] += MinimalWeight[*ei];
-      DEBUG(dbgs() << "Additionally " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
-    }
-    printEdgeWeight(*ei);
-  }
-
-  // This block is visited, mark this before the recursion.
-  BBToVisit.erase(BB);
-
-  // Recurse into successors.
-  for (succ_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-       bbi != bbe; ++bbi) {
-    recurseBasicBlock(*bbi);
-  }
-}
-
-bool ProfileEstimatorPass::runOnFunction(Function &F) {
-  if (F.isDeclaration()) return false;
-
-  // Fetch LoopInfo and clear ProfileInfo for this function.
-  LI = &getAnalysis<LoopInfo>();
-  FunctionInformation.erase(&F);
-  BlockInformation[&F].clear();
-  EdgeInformation[&F].clear();
-  BBToVisit.clear();
-
-  // Mark all blocks as to visit.
-  for (Function::iterator bi = F.begin(), be = F.end(); bi != be; ++bi)
-    BBToVisit.insert(bi);
-
-  // Clear Minimal Edges.
-  MinimalWeight.clear();
-
-  DEBUG(dbgs() << "Working on function " << F.getName() << "\n");
-
-  // Since the entry block is the first one and has no predecessors, the edge
-  // (0,entry) is inserted with the starting weight of 1.
-  BasicBlock *entry = &F.getEntryBlock();
-  BlockInformation[&F][entry] = pow(2.0, 32.0);
-  Edge edge = getEdge(0,entry);
-  EdgeInformation[&F][edge] = BlockInformation[&F][entry];
-  printEdgeWeight(edge);
-
-  // Since recurseBasicBlock() maybe returns with a block which was not fully
-  // estimated, use recurseBasicBlock() until everything is calculated.
-  bool cleanup = false;
-  recurseBasicBlock(entry);
-  while (BBToVisit.size() > 0 && !cleanup) {
-    // Remember number of open blocks, this is later used to check if progress
-    // was made.
-    unsigned size = BBToVisit.size();
-
-    // Try to calculate all blocks in turn.
-    for (std::set<BasicBlock*>::iterator bi = BBToVisit.begin(),
-         be = BBToVisit.end(); bi != be; ++bi) {
-      recurseBasicBlock(*bi);
-      // If at least one block was finished, break because iterator may be
-      // invalid.
-      if (BBToVisit.size() < size) break;
-    }
-
-    // If there was not a single block resolved, make some assumptions.
-    if (BBToVisit.size() == size) {
-      bool found = false;
-      for (std::set<BasicBlock*>::iterator BBI = BBToVisit.begin(), BBE = BBToVisit.end(); 
-           (BBI != BBE) && (!found); ++BBI) {
-        BasicBlock *BB = *BBI;
-        // Try each predecessor if it can be assumend.
-        for (pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-             (bbi != bbe) && (!found); ++bbi) {
-          Edge e = getEdge(*bbi,BB);
-          double w = getEdgeWeight(e);
-          // Check that edge from predecessor is still free.
-          if (w == MissingValue) {
-            // Check if there is a circle from this block to predecessor.
-            Path P;
-            const BasicBlock *Dest = GetPath(BB, *bbi, P, GetPathToDest);
-            if (Dest != *bbi) {
-              // If there is no circle, just set edge weight to 0
-              EdgeInformation[&F][e] = 0;
-              DEBUG(dbgs() << "Assuming edge weight: ");
-              printEdgeWeight(e);
-              found = true;
-            }
-          }
-        }
-      }
-      if (!found) {
-        cleanup = true;
-        DEBUG(dbgs() << "No assumption possible in Fuction "<<F.getName()<<", setting all to zero\n");
-      }
-    }
-  }
-  // In case there was no safe way to assume edges, set as a last measure, 
-  // set _everything_ to zero.
-  if (cleanup) {
-    FunctionInformation[&F] = 0;
-    BlockInformation[&F].clear();
-    EdgeInformation[&F].clear();
-    for (Function::const_iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
-      const BasicBlock *BB = &(*FI);
-      BlockInformation[&F][BB] = 0;
-      const_pred_iterator predi = pred_begin(BB), prede = pred_end(BB);
-      if (predi == prede) {
-        Edge e = getEdge(0,BB);
-        setEdgeWeight(e,0);
-      }
-      for (;predi != prede; ++predi) {
-        Edge e = getEdge(*predi,BB);
-        setEdgeWeight(e,0);
-      }
-      succ_const_iterator succi = succ_begin(BB), succe = succ_end(BB);
-      if (succi == succe) {
-        Edge e = getEdge(BB,0);
-        setEdgeWeight(e,0);
-      }
-      for (;succi != succe; ++succi) {
-        Edge e = getEdge(*succi,BB);
-        setEdgeWeight(e,0);
-      }
-    }
-  }
-
-  return false;
-}
diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp
deleted file mode 100644
index 9626a48..0000000
--- a/lib/Analysis/ProfileInfo.cpp
+++ /dev/null
@@ -1,1079 +0,0 @@
-//===- ProfileInfo.cpp - Profile Info Interface ---------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the abstract ProfileInfo interface, and the default
-// "no profile" implementation.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-info"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include <limits>
-#include <queue>
-#include <set>
-using namespace llvm;
-
-namespace llvm {
-  template<> char ProfileInfoT<Function,BasicBlock>::ID = 0;
-}
-
-// Register the ProfileInfo interface, providing a nice name to refer to.
-INITIALIZE_ANALYSIS_GROUP(ProfileInfo, "Profile Information", NoProfileInfo)
-
-namespace llvm {
-
-template <>
-ProfileInfoT<MachineFunction, MachineBasicBlock>::ProfileInfoT() {}
-template <>
-ProfileInfoT<MachineFunction, MachineBasicBlock>::~ProfileInfoT() {}
-
-template <>
-ProfileInfoT<Function, BasicBlock>::ProfileInfoT() {
-  MachineProfile = 0;
-}
-template <>
-ProfileInfoT<Function, BasicBlock>::~ProfileInfoT() {
-  if (MachineProfile) delete MachineProfile;
-}
-
-template<>
-char ProfileInfoT<MachineFunction, MachineBasicBlock>::ID = 0;
-
-template<>
-const double ProfileInfoT<Function,BasicBlock>::MissingValue = -1;
-
-template<> const
-double ProfileInfoT<MachineFunction, MachineBasicBlock>::MissingValue = -1;
-
-template<> double
-ProfileInfoT<Function,BasicBlock>::getExecutionCount(const BasicBlock *BB) {
-  std::map<const Function*, BlockCounts>::iterator J =
-    BlockInformation.find(BB->getParent());
-  if (J != BlockInformation.end()) {
-    BlockCounts::iterator I = J->second.find(BB);
-    if (I != J->second.end())
-      return I->second;
-  }
-
-  double Count = MissingValue;
-
-  const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
-
-  // Are there zero predecessors of this block?
-  if (PI == PE) {
-    Edge e = getEdge(0, BB);
-    Count = getEdgeWeight(e);
-  } else {
-    // Otherwise, if there are predecessors, the execution count of this block is
-    // the sum of the edge frequencies from the incoming edges.
-    std::set<const BasicBlock*> ProcessedPreds;
-    Count = 0;
-    for (; PI != PE; ++PI) {
-      const BasicBlock *P = *PI;
-      if (ProcessedPreds.insert(P).second) {
-        double w = getEdgeWeight(getEdge(P, BB));
-        if (w == MissingValue) {
-          Count = MissingValue;
-          break;
-        }
-        Count += w;
-      }
-    }
-  }
-
-  // If the predecessors did not suffice to get block weight, try successors.
-  if (Count == MissingValue) {
-
-    succ_const_iterator SI = succ_begin(BB), SE = succ_end(BB);
-
-    // Are there zero successors of this block?
-    if (SI == SE) {
-      Edge e = getEdge(BB,0);
-      Count = getEdgeWeight(e);
-    } else {
-      std::set<const BasicBlock*> ProcessedSuccs;
-      Count = 0;
-      for (; SI != SE; ++SI)
-        if (ProcessedSuccs.insert(*SI).second) {
-          double w = getEdgeWeight(getEdge(BB, *SI));
-          if (w == MissingValue) {
-            Count = MissingValue;
-            break;
-          }
-          Count += w;
-        }
-    }
-  }
-
-  if (Count != MissingValue) BlockInformation[BB->getParent()][BB] = Count;
-  return Count;
-}
-
-template<>
-double ProfileInfoT<MachineFunction, MachineBasicBlock>::
-        getExecutionCount(const MachineBasicBlock *MBB) {
-  std::map<const MachineFunction*, BlockCounts>::iterator J =
-    BlockInformation.find(MBB->getParent());
-  if (J != BlockInformation.end()) {
-    BlockCounts::iterator I = J->second.find(MBB);
-    if (I != J->second.end())
-      return I->second;
-  }
-
-  return MissingValue;
-}
-
-template<>
-double ProfileInfoT<Function,BasicBlock>::getExecutionCount(const Function *F) {
-  std::map<const Function*, double>::iterator J =
-    FunctionInformation.find(F);
-  if (J != FunctionInformation.end())
-    return J->second;
-
-  // isDeclaration() is checked here and not at start of function to allow
-  // functions without a body still to have a execution count.
-  if (F->isDeclaration()) return MissingValue;
-
-  double Count = getExecutionCount(&F->getEntryBlock());
-  if (Count != MissingValue) FunctionInformation[F] = Count;
-  return Count;
-}
-
-template<>
-double ProfileInfoT<MachineFunction, MachineBasicBlock>::
-        getExecutionCount(const MachineFunction *MF) {
-  std::map<const MachineFunction*, double>::iterator J =
-    FunctionInformation.find(MF);
-  if (J != FunctionInformation.end())
-    return J->second;
-
-  double Count = getExecutionCount(&MF->front());
-  if (Count != MissingValue) FunctionInformation[MF] = Count;
-  return Count;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
-        setExecutionCount(const BasicBlock *BB, double w) {
-  DEBUG(dbgs() << "Creating Block " << BB->getName() 
-               << " (weight: " << format("%.20g",w) << ")\n");
-  BlockInformation[BB->getParent()][BB] = w;
-}
-
-template<>
-void ProfileInfoT<MachineFunction, MachineBasicBlock>::
-        setExecutionCount(const MachineBasicBlock *MBB, double w) {
-  DEBUG(dbgs() << "Creating Block " << MBB->getBasicBlock()->getName()
-               << " (weight: " << format("%.20g",w) << ")\n");
-  BlockInformation[MBB->getParent()][MBB] = w;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::addEdgeWeight(Edge e, double w) {
-  double oldw = getEdgeWeight(e);
-  assert (oldw != MissingValue && "Adding weight to Edge with no previous weight");
-  DEBUG(dbgs() << "Adding to Edge " << e
-               << " (new weight: " << format("%.20g",oldw + w) << ")\n");
-  EdgeInformation[getFunction(e)][e] = oldw + w;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
-        addExecutionCount(const BasicBlock *BB, double w) {
-  double oldw = getExecutionCount(BB);
-  assert (oldw != MissingValue && "Adding weight to Block with no previous weight");
-  DEBUG(dbgs() << "Adding to Block " << BB->getName()
-               << " (new weight: " << format("%.20g",oldw + w) << ")\n");
-  BlockInformation[BB->getParent()][BB] = oldw + w;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::removeBlock(const BasicBlock *BB) {
-  std::map<const Function*, BlockCounts>::iterator J =
-    BlockInformation.find(BB->getParent());
-  if (J == BlockInformation.end()) return;
-
-  DEBUG(dbgs() << "Deleting " << BB->getName() << "\n");
-  J->second.erase(BB);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::removeEdge(Edge e) {
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(getFunction(e));
-  if (J == EdgeInformation.end()) return;
-
-  DEBUG(dbgs() << "Deleting" << e << "\n");
-  J->second.erase(e);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
-        replaceEdge(const Edge &oldedge, const Edge &newedge) {
-  double w;
-  if ((w = getEdgeWeight(newedge)) == MissingValue) {
-    w = getEdgeWeight(oldedge);
-    DEBUG(dbgs() << "Replacing " << oldedge << " with " << newedge  << "\n");
-  } else {
-    w += getEdgeWeight(oldedge);
-    DEBUG(dbgs() << "Adding " << oldedge << " to " << newedge  << "\n");
-  }
-  setEdgeWeight(newedge,w);
-  removeEdge(oldedge);
-}
-
-template<>
-const BasicBlock *ProfileInfoT<Function,BasicBlock>::
-        GetPath(const BasicBlock *Src, const BasicBlock *Dest,
-                Path &P, unsigned Mode) {
-  const BasicBlock *BB = 0;
-  bool hasFoundPath = false;
-
-  std::queue<const BasicBlock *> BFS;
-  BFS.push(Src);
-
-  while(BFS.size() && !hasFoundPath) {
-    BB = BFS.front();
-    BFS.pop();
-
-    succ_const_iterator Succ = succ_begin(BB), End = succ_end(BB);
-    if (Succ == End) {
-      P[(const BasicBlock*)0] = BB;
-      if (Mode & GetPathToExit) {
-        hasFoundPath = true;
-        BB = 0;
-      }
-    }
-    for(;Succ != End; ++Succ) {
-      if (P.find(*Succ) != P.end()) continue;
-      Edge e = getEdge(BB,*Succ);
-      if ((Mode & GetPathWithNewEdges) && (getEdgeWeight(e) != MissingValue)) continue;
-      P[*Succ] = BB;
-      BFS.push(*Succ);
-      if ((Mode & GetPathToDest) && *Succ == Dest) {
-        hasFoundPath = true;
-        BB = *Succ;
-        break;
-      }
-      if ((Mode & GetPathToValue) && (getExecutionCount(*Succ) != MissingValue)) {
-        hasFoundPath = true;
-        BB = *Succ;
-        break;
-      }
-    }
-  }
-
-  return BB;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
-        divertFlow(const Edge &oldedge, const Edge &newedge) {
-  DEBUG(dbgs() << "Diverting " << oldedge << " via " << newedge );
-
-  // First check if the old edge was taken, if not, just delete it...
-  if (getEdgeWeight(oldedge) == 0) {
-    removeEdge(oldedge);
-    return;
-  }
-
-  Path P;
-  P[newedge.first] = 0;
-  P[newedge.second] = newedge.first;
-  const BasicBlock *BB = GetPath(newedge.second,oldedge.second,P,GetPathToExit | GetPathToDest);
-
-  double w = getEdgeWeight (oldedge);
-  DEBUG(dbgs() << ", Weight: " << format("%.20g",w) << "\n");
-  do {
-    const BasicBlock *Parent = P.find(BB)->second;
-    Edge e = getEdge(Parent,BB);
-    double oldw = getEdgeWeight(e);
-    double oldc = getExecutionCount(e.first);
-    setEdgeWeight(e, w+oldw);
-    if (Parent != oldedge.first) {
-      setExecutionCount(e.first, w+oldc);
-    }
-    BB = Parent;
-  } while (BB != newedge.first);
-  removeEdge(oldedge);
-}
-
-/// Replaces all occurrences of RmBB in the ProfilingInfo with DestBB.
-/// This checks all edges of the function the blocks reside in and replaces the
-/// occurrences of RmBB with DestBB.
-template<>
-void ProfileInfoT<Function,BasicBlock>::
-        replaceAllUses(const BasicBlock *RmBB, const BasicBlock *DestBB) {
-  DEBUG(dbgs() << "Replacing " << RmBB->getName()
-               << " with " << DestBB->getName() << "\n");
-  const Function *F = DestBB->getParent();
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(F);
-  if (J == EdgeInformation.end()) return;
-
-  Edge e, newedge;
-  bool erasededge = false;
-  EdgeWeights::iterator I = J->second.begin(), E = J->second.end();
-  while(I != E) {
-    e = (I++)->first;
-    bool foundedge = false; bool eraseedge = false;
-    if (e.first == RmBB) {
-      if (e.second == DestBB) {
-        eraseedge = true;
-      } else {
-        newedge = getEdge(DestBB, e.second);
-        foundedge = true;
-      }
-    }
-    if (e.second == RmBB) {
-      if (e.first == DestBB) {
-        eraseedge = true;
-      } else {
-        newedge = getEdge(e.first, DestBB);
-        foundedge = true;
-      }
-    }
-    if (foundedge) {
-      replaceEdge(e, newedge);
-    }
-    if (eraseedge) {
-      if (erasededge) {
-        Edge newedge = getEdge(DestBB, DestBB);
-        replaceEdge(e, newedge);
-      } else {
-        removeEdge(e);
-        erasededge = true;
-      }
-    }
-  }
-}
-
-/// Splits an edge in the ProfileInfo and redirects flow over NewBB.
-/// Since its possible that there is more than one edge in the CFG from FristBB
-/// to SecondBB its necessary to redirect the flow proporionally.
-template<>
-void ProfileInfoT<Function,BasicBlock>::splitEdge(const BasicBlock *FirstBB,
-                                                  const BasicBlock *SecondBB,
-                                                  const BasicBlock *NewBB,
-                                                  bool MergeIdenticalEdges) {
-  const Function *F = FirstBB->getParent();
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(F);
-  if (J == EdgeInformation.end()) return;
-
-  // Generate edges and read current weight.
-  Edge e  = getEdge(FirstBB, SecondBB);
-  Edge n1 = getEdge(FirstBB, NewBB);
-  Edge n2 = getEdge(NewBB, SecondBB);
-  EdgeWeights &ECs = J->second;
-  double w = ECs[e];
-
-  int succ_count = 0;
-  if (!MergeIdenticalEdges) {
-    // First count the edges from FristBB to SecondBB, if there is more than
-    // one, only slice out a proporional part for NewBB.
-    for(succ_const_iterator BBI = succ_begin(FirstBB), BBE = succ_end(FirstBB);
-        BBI != BBE; ++BBI) {
-      if (*BBI == SecondBB) succ_count++;  
-    }
-    // When the NewBB is completely new, increment the count by one so that
-    // the counts are properly distributed.
-    if (getExecutionCount(NewBB) == ProfileInfo::MissingValue) succ_count++;
-  } else {
-    // When the edges are merged anyway, then redirect all flow.
-    succ_count = 1;
-  }
-
-  // We know now how many edges there are from FirstBB to SecondBB, reroute a
-  // proportional part of the edge weight over NewBB.
-  double neww = floor(w / succ_count);
-  ECs[n1] += neww;
-  ECs[n2] += neww;
-  BlockInformation[F][NewBB] += neww;
-  if (succ_count == 1) {
-    ECs.erase(e);
-  } else {
-    ECs[e] -= neww;
-  }
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::splitBlock(const BasicBlock *Old,
-                                                   const BasicBlock* New) {
-  const Function *F = Old->getParent();
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(F);
-  if (J == EdgeInformation.end()) return;
-
-  DEBUG(dbgs() << "Splitting " << Old->getName() << " to " << New->getName() << "\n");
-
-  std::set<Edge> Edges;
-  for (EdgeWeights::iterator ewi = J->second.begin(), ewe = J->second.end(); 
-       ewi != ewe; ++ewi) {
-    Edge old = ewi->first;
-    if (old.first == Old) {
-      Edges.insert(old);
-    }
-  }
-  for (std::set<Edge>::iterator EI = Edges.begin(), EE = Edges.end(); 
-       EI != EE; ++EI) {
-    Edge newedge = getEdge(New, EI->second);
-    replaceEdge(*EI, newedge);
-  }
-
-  double w = getExecutionCount(Old);
-  setEdgeWeight(getEdge(Old, New), w);
-  setExecutionCount(New, w);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::splitBlock(const BasicBlock *BB,
-                                                   const BasicBlock* NewBB,
-                                                   BasicBlock *const *Preds,
-                                                   unsigned NumPreds) {
-  const Function *F = BB->getParent();
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(F);
-  if (J == EdgeInformation.end()) return;
-
-  DEBUG(dbgs() << "Splitting " << NumPreds << " Edges from " << BB->getName() 
-               << " to " << NewBB->getName() << "\n");
-
-  // Collect weight that was redirected over NewBB.
-  double newweight = 0;
-  
-  std::set<const BasicBlock *> ProcessedPreds;
-  // For all requestes Predecessors.
-  for (unsigned pred = 0; pred < NumPreds; ++pred) {
-    const BasicBlock * Pred = Preds[pred];
-    if (ProcessedPreds.insert(Pred).second) {
-      // Create edges and read old weight.
-      Edge oldedge = getEdge(Pred, BB);
-      Edge newedge = getEdge(Pred, NewBB);
-
-      // Remember how much weight was redirected.
-      newweight += getEdgeWeight(oldedge);
-    
-      replaceEdge(oldedge,newedge);
-    }
-  }
-
-  Edge newedge = getEdge(NewBB,BB);
-  setEdgeWeight(newedge, newweight);
-  setExecutionCount(NewBB, newweight);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::transfer(const Function *Old,
-                                                 const Function *New) {
-  DEBUG(dbgs() << "Replacing Function " << Old->getName() << " with "
-               << New->getName() << "\n");
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(Old);
-  if(J != EdgeInformation.end()) {
-    EdgeInformation[New] = J->second;
-  }
-  EdgeInformation.erase(Old);
-  BlockInformation.erase(Old);
-  FunctionInformation.erase(Old);
-}
-
-static double readEdgeOrRemember(ProfileInfo::Edge edge, double w,
-                                 ProfileInfo::Edge &tocalc, unsigned &uncalc) {
-  if (w == ProfileInfo::MissingValue) {
-    tocalc = edge;
-    uncalc++;
-    return 0;
-  } else {
-    return w;
-  }
-}
-
-template<>
-bool ProfileInfoT<Function,BasicBlock>::
-        CalculateMissingEdge(const BasicBlock *BB, Edge &removed,
-                             bool assumeEmptySelf) {
-  Edge edgetocalc;
-  unsigned uncalculated = 0;
-
-  // collect weights of all incoming and outgoing edges, rememer edges that
-  // have no value
-  double incount = 0;
-  SmallSet<const BasicBlock*,8> pred_visited;
-  const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-  if (bbi==bbe) {
-    Edge e = getEdge(0,BB);
-    incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated);
-  }
-  for (;bbi != bbe; ++bbi) {
-    if (pred_visited.insert(*bbi)) {
-      Edge e = getEdge(*bbi,BB);
-      incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated);
-    }
-  }
-
-  double outcount = 0;
-  SmallSet<const BasicBlock*,8> succ_visited;
-  succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
-  if (sbbi==sbbe) {
-    Edge e = getEdge(BB,0);
-    if (getEdgeWeight(e) == MissingValue) {
-      double w = getExecutionCount(BB);
-      if (w != MissingValue) {
-        setEdgeWeight(e,w);
-        removed = e;
-      }
-    }
-    outcount += readEdgeOrRemember(e, getEdgeWeight(e), edgetocalc, uncalculated);
-  }
-  for (;sbbi != sbbe; ++sbbi) {
-    if (succ_visited.insert(*sbbi)) {
-      Edge e = getEdge(BB,*sbbi);
-      outcount += readEdgeOrRemember(e, getEdgeWeight(e), edgetocalc, uncalculated);
-    }
-  }
-
-  // if exactly one edge weight was missing, calculate it and remove it from
-  // spanning tree
-  if (uncalculated == 0 ) {
-    return true;
-  } else
-  if (uncalculated == 1) {
-    if (incount < outcount) {
-      EdgeInformation[BB->getParent()][edgetocalc] = outcount-incount;
-    } else {
-      EdgeInformation[BB->getParent()][edgetocalc] = incount-outcount;
-    }
-    DEBUG(dbgs() << "--Calc Edge Counter for " << edgetocalc << ": "
-                 << format("%.20g", getEdgeWeight(edgetocalc)) << "\n");
-    removed = edgetocalc;
-    return true;
-  } else 
-  if (uncalculated == 2 && assumeEmptySelf && edgetocalc.first == edgetocalc.second && incount == outcount) {
-    setEdgeWeight(edgetocalc, incount * 10);
-    removed = edgetocalc;
-    return true;
-  } else {
-    return false;
-  }
-}
-
-static void readEdge(ProfileInfo *PI, ProfileInfo::Edge e, double &calcw, std::set<ProfileInfo::Edge> &misscount) {
-  double w = PI->getEdgeWeight(e);
-  if (w != ProfileInfo::MissingValue) {
-    calcw += w;
-  } else {
-    misscount.insert(e);
-  }
-}
-
-template<>
-bool ProfileInfoT<Function,BasicBlock>::EstimateMissingEdges(const BasicBlock *BB) {
-  double inWeight = 0;
-  std::set<Edge> inMissing;
-  std::set<const BasicBlock*> ProcessedPreds;
-  const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-  if (bbi == bbe) {
-    readEdge(this,getEdge(0,BB),inWeight,inMissing);
-  }
-  for( ; bbi != bbe; ++bbi ) {
-    if (ProcessedPreds.insert(*bbi).second) {
-      readEdge(this,getEdge(*bbi,BB),inWeight,inMissing);
-    }
-  }
-
-  double outWeight = 0;
-  std::set<Edge> outMissing;
-  std::set<const BasicBlock*> ProcessedSuccs;
-  succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
-  if (sbbi == sbbe)
-    readEdge(this,getEdge(BB,0),outWeight,outMissing);
-  for ( ; sbbi != sbbe; ++sbbi ) {
-    if (ProcessedSuccs.insert(*sbbi).second) {
-      readEdge(this,getEdge(BB,*sbbi),outWeight,outMissing);
-    }
-  }
-
-  double share;
-  std::set<Edge>::iterator ei,ee;
-  if (inMissing.size() == 0 && outMissing.size() > 0) {
-    ei = outMissing.begin();
-    ee = outMissing.end();
-    share = inWeight/outMissing.size();
-    setExecutionCount(BB,inWeight);
-  } else
-  if (inMissing.size() > 0 && outMissing.size() == 0 && outWeight == 0) {
-    ei = inMissing.begin();
-    ee = inMissing.end();
-    share = 0;
-    setExecutionCount(BB,0);
-  } else
-  if (inMissing.size() == 0 && outMissing.size() == 0) {
-    setExecutionCount(BB,outWeight);
-    return true;
-  } else {
-    return false;
-  }
-  for ( ; ei != ee; ++ei ) {
-    setEdgeWeight(*ei,share);
-  }
-  return true;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::repair(const Function *F) {
-//  if (getExecutionCount(&(F->getEntryBlock())) == 0) {
-//    for (Function::const_iterator FI = F->begin(), FE = F->end();
-//         FI != FE; ++FI) {
-//      const BasicBlock* BB = &(*FI);
-//      {
-//        const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-//        if (NBB == End) {
-//          setEdgeWeight(getEdge(0,BB),0);
-//        }
-//        for(;NBB != End; ++NBB) {
-//          setEdgeWeight(getEdge(*NBB,BB),0);
-//        }
-//      }
-//      {
-//        succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-//        if (NBB == End) {
-//          setEdgeWeight(getEdge(0,BB),0);
-//        }
-//        for(;NBB != End; ++NBB) {
-//          setEdgeWeight(getEdge(*NBB,BB),0);
-//        }
-//      }
-//    }
-//    return;
-//  }
-  // The set of BasicBlocks that are still unvisited.
-  std::set<const BasicBlock*> Unvisited;
-
-  // The set of return edges (Edges with no successors).
-  std::set<Edge> ReturnEdges;
-  double ReturnWeight = 0;
-  
-  // First iterate over the whole function and collect:
-  // 1) The blocks in this function in the Unvisited set.
-  // 2) The return edges in the ReturnEdges set.
-  // 3) The flow that is leaving the function already via return edges.
-
-  // Data structure for searching the function.
-  std::queue<const BasicBlock *> BFS;
-  const BasicBlock *BB = &(F->getEntryBlock());
-  BFS.push(BB);
-  Unvisited.insert(BB);
-
-  while (BFS.size()) {
-    BB = BFS.front(); BFS.pop();
-    succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-    if (NBB == End) {
-      Edge e = getEdge(BB,0);
-      double w = getEdgeWeight(e);
-      if (w == MissingValue) {
-        // If the return edge has no value, try to read value from block.
-        double bw = getExecutionCount(BB);
-        if (bw != MissingValue) {
-          setEdgeWeight(e,bw);
-          ReturnWeight += bw;
-        } else {
-          // If both return edge and block provide no value, collect edge.
-          ReturnEdges.insert(e);
-        }
-      } else {
-        // If the return edge has a proper value, collect it.
-        ReturnWeight += w;
-      }
-    }
-    for (;NBB != End; ++NBB) {
-      if (Unvisited.insert(*NBB).second) {
-        BFS.push(*NBB);
-      }
-    }
-  }
-
-  while (Unvisited.size() > 0) {
-    unsigned oldUnvisitedCount = Unvisited.size();
-    bool FoundPath = false;
-
-    // If there is only one edge left, calculate it.
-    if (ReturnEdges.size() == 1) {
-      ReturnWeight = getExecutionCount(&(F->getEntryBlock())) - ReturnWeight;
-
-      Edge e = *ReturnEdges.begin();
-      setEdgeWeight(e,ReturnWeight);
-      setExecutionCount(e.first,ReturnWeight);
-
-      Unvisited.erase(e.first);
-      ReturnEdges.erase(e);
-      continue;
-    }
-
-    // Calculate all blocks where only one edge is missing, this may also
-    // resolve furhter return edges.
-    std::set<const BasicBlock *>::iterator FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE) {
-      const BasicBlock *BB = *FI; ++FI;
-      Edge e;
-      if(CalculateMissingEdge(BB,e,true)) {
-        if (BlockInformation[F].find(BB) == BlockInformation[F].end()) {
-          setExecutionCount(BB,getExecutionCount(BB));
-        }
-        Unvisited.erase(BB);
-        if (e.first != 0 && e.second == 0) {
-          ReturnEdges.erase(e);
-          ReturnWeight += getEdgeWeight(e);
-        }
-      }
-    }
-    if (oldUnvisitedCount > Unvisited.size()) continue;
-
-    // Estimate edge weights by dividing the flow proportionally.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE) {
-      const BasicBlock *BB = *FI; ++FI;
-      const BasicBlock *Dest = 0;
-      bool AllEdgesHaveSameReturn = true;
-      // Check each Successor, these must all end up in the same or an empty
-      // return block otherwise its dangerous to do an estimation on them.
-      for (succ_const_iterator Succ = succ_begin(BB), End = succ_end(BB);
-           Succ != End; ++Succ) {
-        Path P;
-        GetPath(*Succ, 0, P, GetPathToExit);
-        if (Dest && Dest != P[(const BasicBlock*)0]) {
-          AllEdgesHaveSameReturn = false;
-        }
-        Dest = P[(const BasicBlock*)0];
-      }
-      if (AllEdgesHaveSameReturn) {
-        if(EstimateMissingEdges(BB)) {
-          Unvisited.erase(BB);
-          break;
-        }
-      }
-    }
-    if (oldUnvisitedCount > Unvisited.size()) continue;
-
-    // Check if there is a path to an block that has a known value and redirect
-    // flow accordingly.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE && !FoundPath) {
-      // Fetch path.
-      const BasicBlock *BB = *FI; ++FI;
-      Path P;
-      const BasicBlock *Dest = GetPath(BB, 0, P, GetPathToValue);
-
-      // Calculate incoming flow.
-      double iw = 0; unsigned inmissing = 0; unsigned incount = 0; unsigned invalid = 0;
-      std::set<const BasicBlock *> Processed;
-      for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-           NBB != End; ++NBB) {
-        if (Processed.insert(*NBB).second) {
-          Edge e = getEdge(*NBB, BB);
-          double ew = getEdgeWeight(e);
-          if (ew != MissingValue) {
-            iw += ew;
-            invalid++;
-          } else {
-            // If the path contains the successor, this means its a backedge,
-            // do not count as missing.
-            if (P.find(*NBB) == P.end())
-              inmissing++;
-          }
-          incount++;
-        }
-      }
-      if (inmissing == incount) continue;
-      if (invalid == 0) continue;
-
-      // Subtract (already) outgoing flow.
-      Processed.clear();
-      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-           NBB != End; ++NBB) {
-        if (Processed.insert(*NBB).second) {
-          Edge e = getEdge(BB, *NBB);
-          double ew = getEdgeWeight(e);
-          if (ew != MissingValue) {
-            iw -= ew;
-          }
-        }
-      }
-      if (iw < 0) continue;
-
-      // Check the receiving end of the path if it can handle the flow.
-      double ow = getExecutionCount(Dest);
-      Processed.clear();
-      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-           NBB != End; ++NBB) {
-        if (Processed.insert(*NBB).second) {
-          Edge e = getEdge(BB, *NBB);
-          double ew = getEdgeWeight(e);
-          if (ew != MissingValue) {
-            ow -= ew;
-          }
-        }
-      }
-      if (ow < 0) continue;
-
-      // Determine how much flow shall be used.
-      double ew = getEdgeWeight(getEdge(P[Dest],Dest));
-      if (ew != MissingValue) {
-        ew = ew<ow?ew:ow;
-        ew = ew<iw?ew:iw;
-      } else {
-        if (inmissing == 0)
-          ew = iw;
-      }
-
-      // Create flow.
-      if (ew != MissingValue) {
-        do {
-          Edge e = getEdge(P[Dest],Dest);
-          if (getEdgeWeight(e) == MissingValue) {
-            setEdgeWeight(e,ew);
-            FoundPath = true;
-          }
-          Dest = P[Dest];
-        } while (Dest != BB);
-      }
-    }
-    if (FoundPath) continue;
-
-    // Calculate a block with self loop.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE && !FoundPath) {
-      const BasicBlock *BB = *FI; ++FI;
-      bool SelfEdgeFound = false;
-      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-           NBB != End; ++NBB) {
-        if (*NBB == BB) {
-          SelfEdgeFound = true;
-          break;
-        }
-      }
-      if (SelfEdgeFound) {
-        Edge e = getEdge(BB,BB);
-        if (getEdgeWeight(e) == MissingValue) {
-          double iw = 0;
-          std::set<const BasicBlock *> Processed;
-          for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-               NBB != End; ++NBB) {
-            if (Processed.insert(*NBB).second) {
-              Edge e = getEdge(*NBB, BB);
-              double ew = getEdgeWeight(e);
-              if (ew != MissingValue) {
-                iw += ew;
-              }
-            }
-          }
-          setEdgeWeight(e,iw * 10);
-          FoundPath = true;
-        }
-      }
-    }
-    if (FoundPath) continue;
-
-    // Determine backedges, set them to zero.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE && !FoundPath) {
-      const BasicBlock *BB = *FI; ++FI;
-      const BasicBlock *Dest = 0;
-      Path P;
-      bool BackEdgeFound = false;
-      for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-           NBB != End; ++NBB) {
-        Dest = GetPath(BB, *NBB, P, GetPathToDest | GetPathWithNewEdges);
-        if (Dest == *NBB) {
-          BackEdgeFound = true;
-          break;
-        }
-      }
-      if (BackEdgeFound) {
-        Edge e = getEdge(Dest,BB);
-        double w = getEdgeWeight(e);
-        if (w == MissingValue) {
-          setEdgeWeight(e,0);
-          FoundPath = true;
-        }
-        do {
-          Edge e = getEdge(P[Dest], Dest);
-          double w = getEdgeWeight(e);
-          if (w == MissingValue) {
-            setEdgeWeight(e,0);
-            FoundPath = true;
-          }
-          Dest = P[Dest];
-        } while (Dest != BB);
-      }
-    }
-    if (FoundPath) continue;
-
-    // Channel flow to return block.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE && !FoundPath) {
-      const BasicBlock *BB = *FI; ++FI;
-
-      Path P;
-      const BasicBlock *Dest = GetPath(BB, 0, P, GetPathToExit | GetPathWithNewEdges);
-      Dest = P[(const BasicBlock*)0];
-      if (!Dest) continue;
-
-      if (getEdgeWeight(getEdge(Dest,0)) == MissingValue) {
-        // Calculate incoming flow.
-        double iw = 0;
-        std::set<const BasicBlock *> Processed;
-        for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-             NBB != End; ++NBB) {
-          if (Processed.insert(*NBB).second) {
-            Edge e = getEdge(*NBB, BB);
-            double ew = getEdgeWeight(e);
-            if (ew != MissingValue) {
-              iw += ew;
-            }
-          }
-        }
-        do {
-          Edge e = getEdge(P[Dest], Dest);
-          double w = getEdgeWeight(e);
-          if (w == MissingValue) {
-            setEdgeWeight(e,iw);
-            FoundPath = true;
-          } else {
-            assert(0 && "Edge should not have value already!");
-          }
-          Dest = P[Dest];
-        } while (Dest != BB);
-      }
-    }
-    if (FoundPath) continue;
-
-    // Speculatively set edges to zero.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE && !FoundPath) {
-      const BasicBlock *BB = *FI; ++FI;
-
-      for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-           NBB != End; ++NBB) {
-        Edge e = getEdge(*NBB,BB);
-        double w = getEdgeWeight(e);
-        if (w == MissingValue) {
-          setEdgeWeight(e,0);
-          FoundPath = true;
-          break;
-        }
-      }
-    }
-    if (FoundPath) continue;
-
-    errs() << "{";
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE) {
-      const BasicBlock *BB = *FI; ++FI;
-      dbgs() << BB->getName();
-      if (FI != FE)
-        dbgs() << ",";
-    }
-    errs() << "}";
-
-    errs() << "ASSERT: could not repair function";
-    assert(0 && "could not repair function");
-  }
-
-  EdgeWeights J = EdgeInformation[F];
-  for (EdgeWeights::iterator EI = J.begin(), EE = J.end(); EI != EE; ++EI) {
-    Edge e = EI->first;
-
-    bool SuccFound = false;
-    if (e.first != 0) {
-      succ_const_iterator NBB = succ_begin(e.first), End = succ_end(e.first);
-      if (NBB == End) {
-        if (0 == e.second) {
-          SuccFound = true;
-        }
-      }
-      for (;NBB != End; ++NBB) {
-        if (*NBB == e.second) {
-          SuccFound = true;
-          break;
-        }
-      }
-      if (!SuccFound) {
-        removeEdge(e);
-      }
-    }
-  }
-}
-
-raw_ostream& operator<<(raw_ostream &O, const MachineFunction *MF) {
-  return O << MF->getFunction()->getName() << "(MF)";
-}
-
-raw_ostream& operator<<(raw_ostream &O, const MachineBasicBlock *MBB) {
-  return O << MBB->getBasicBlock()->getName() << "(MB)";
-}
-
-raw_ostream& operator<<(raw_ostream &O, std::pair<const MachineBasicBlock *, const MachineBasicBlock *> E) {
-  O << "(";
-
-  if (E.first)
-    O << E.first;
-  else
-    O << "0";
-
-  O << ",";
-
-  if (E.second)
-    O << E.second;
-  else
-    O << "0";
-
-  return O << ")";
-}
-
-} // namespace llvm
-
-//===----------------------------------------------------------------------===//
-//  NoProfile ProfileInfo implementation
-//
-
-namespace {
-  struct NoProfileInfo : public ImmutablePass, public ProfileInfo {
-    static char ID; // Class identification, replacement for typeinfo
-    NoProfileInfo() : ImmutablePass(ID) {
-      initializeNoProfileInfoPass(*PassRegistry::getPassRegistry());
-    }
-    
-    /// getAdjustedAnalysisPointer - This method is used when a pass implements
-    /// an analysis interface through multiple inheritance.  If needed, it
-    /// should override this to adjust the this pointer as needed for the
-    /// specified pass info.
-    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-      if (PI == &ProfileInfo::ID)
-        return (ProfileInfo*)this;
-      return this;
-    }
-    
-    virtual const char *getPassName() const {
-      return "NoProfileInfo";
-    }
-  };
-}  // End of anonymous namespace
-
-char NoProfileInfo::ID = 0;
-// Register this pass...
-INITIALIZE_AG_PASS(NoProfileInfo, ProfileInfo, "no-profile",
-                   "No Profile Information", false, true, true)
-
-ImmutablePass *llvm::createNoProfileInfoPass() { return new NoProfileInfo(); }
diff --git a/lib/Analysis/ProfileInfoLoader.cpp b/lib/Analysis/ProfileInfoLoader.cpp
deleted file mode 100644
index f1f3e94..0000000
--- a/lib/Analysis/ProfileInfoLoader.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-//===- ProfileInfoLoad.cpp - Load profile information from disk -----------===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The ProfileInfoLoader class is used to load and represent profiling
-// information read in from the dump file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdio>
-#include <cstdlib>
-using namespace llvm;
-
-// ByteSwap - Byteswap 'Var' if 'Really' is true.
-//
-static inline unsigned ByteSwap(unsigned Var, bool Really) {
-  if (!Really) return Var;
-  return ((Var & (255U<< 0U)) << 24U) |
-         ((Var & (255U<< 8U)) <<  8U) |
-         ((Var & (255U<<16U)) >>  8U) |
-         ((Var & (255U<<24U)) >> 24U);
-}
-
-static unsigned AddCounts(unsigned A, unsigned B) {
-  // If either value is undefined, use the other.
-  if (A == ProfileInfoLoader::Uncounted) return B;
-  if (B == ProfileInfoLoader::Uncounted) return A;
-  return A + B;
-}
-
-static void ReadProfilingBlock(const char *ToolName, FILE *F,
-                               bool ShouldByteSwap,
-                               std::vector<unsigned> &Data) {
-  // Read the number of entries...
-  unsigned NumEntries;
-  if (fread(&NumEntries, sizeof(unsigned), 1, F) != 1) {
-    errs() << ToolName << ": data packet truncated!\n";
-    perror(0);
-    exit(1);
-  }
-  NumEntries = ByteSwap(NumEntries, ShouldByteSwap);
-
-  // Read the counts...
-  std::vector<unsigned> TempSpace(NumEntries);
-
-  // Read in the block of data...
-  if (fread(&TempSpace[0], sizeof(unsigned)*NumEntries, 1, F) != 1) {
-    errs() << ToolName << ": data packet truncated!\n";
-    perror(0);
-    exit(1);
-  }
-
-  // Make sure we have enough space... The space is initialised to -1 to
-  // facitiltate the loading of missing values for OptimalEdgeProfiling.
-  if (Data.size() < NumEntries)
-    Data.resize(NumEntries, ProfileInfoLoader::Uncounted);
-
-  // Accumulate the data we just read into the data.
-  if (!ShouldByteSwap) {
-    for (unsigned i = 0; i != NumEntries; ++i) {
-      Data[i] = AddCounts(TempSpace[i], Data[i]);
-    }
-  } else {
-    for (unsigned i = 0; i != NumEntries; ++i) {
-      Data[i] = AddCounts(ByteSwap(TempSpace[i], true), Data[i]);
-    }
-  }
-}
-
-const unsigned ProfileInfoLoader::Uncounted = ~0U;
-
-// ProfileInfoLoader ctor - Read the specified profiling data file, exiting the
-// program if the file is invalid or broken.
-//
-ProfileInfoLoader::ProfileInfoLoader(const char *ToolName,
-                                     const std::string &Filename)
-  : Filename(Filename) {
-  FILE *F = fopen(Filename.c_str(), "rb");
-  if (F == 0) {
-    errs() << ToolName << ": Error opening '" << Filename << "': ";
-    perror(0);
-    exit(1);
-  }
-
-  // Keep reading packets until we run out of them.
-  unsigned PacketType;
-  while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
-    // If the low eight bits of the packet are zero, we must be dealing with an
-    // endianness mismatch.  Byteswap all words read from the profiling
-    // information.
-    bool ShouldByteSwap = (char)PacketType == 0;
-    PacketType = ByteSwap(PacketType, ShouldByteSwap);
-
-    switch (PacketType) {
-    case ArgumentInfo: {
-      unsigned ArgLength;
-      if (fread(&ArgLength, sizeof(unsigned), 1, F) != 1) {
-        errs() << ToolName << ": arguments packet truncated!\n";
-        perror(0);
-        exit(1);
-      }
-      ArgLength = ByteSwap(ArgLength, ShouldByteSwap);
-
-      // Read in the arguments...
-      std::vector<char> Chars(ArgLength+4);
-
-      if (ArgLength)
-        if (fread(&Chars[0], (ArgLength+3) & ~3, 1, F) != 1) {
-          errs() << ToolName << ": arguments packet truncated!\n";
-          perror(0);
-          exit(1);
-        }
-      CommandLines.push_back(std::string(&Chars[0], &Chars[ArgLength]));
-      break;
-    }
-
-    case FunctionInfo:
-      ReadProfilingBlock(ToolName, F, ShouldByteSwap, FunctionCounts);
-      break;
-
-    case BlockInfo:
-      ReadProfilingBlock(ToolName, F, ShouldByteSwap, BlockCounts);
-      break;
-
-    case EdgeInfo:
-      ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
-      break;
-
-    case OptEdgeInfo:
-      ReadProfilingBlock(ToolName, F, ShouldByteSwap, OptimalEdgeCounts);
-      break;
-
-    case BBTraceInfo:
-      ReadProfilingBlock(ToolName, F, ShouldByteSwap, BBTrace);
-      break;
-
-    default:
-      errs() << ToolName << ": Unknown packet type #" << PacketType << "!\n";
-      exit(1);
-    }
-  }
-
-  fclose(F);
-}
-
diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp
deleted file mode 100644
index 346f8d6..0000000
--- a/lib/Analysis/ProfileInfoLoaderPass.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-//===- ProfileInfoLoaderPass.cpp - LLVM Pass to load profile info ---------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a concrete implementation of profiling information that
-// loads the information from a profile dump file.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-loader"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <set>
-using namespace llvm;
-
-STATISTIC(NumEdgesRead, "The # of edges read.");
-
-static cl::opt<std::string>
-ProfileInfoFilename("profile-info-file", cl::init("llvmprof.out"),
-                    cl::value_desc("filename"),
-                    cl::desc("Profile file loaded by -profile-loader"));
-
-namespace {
-  class LoaderPass : public ModulePass, public ProfileInfo {
-    std::string Filename;
-    std::set<Edge> SpanningTree;
-    std::set<const BasicBlock*> BBisUnvisited;
-    unsigned ReadCount;
-  public:
-    static char ID; // Class identification, replacement for typeinfo
-    explicit LoaderPass(const std::string &filename = "")
-      : ModulePass(ID), Filename(filename) {
-      initializeLoaderPassPass(*PassRegistry::getPassRegistry());
-      if (filename.empty()) Filename = ProfileInfoFilename;
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-    }
-
-    virtual const char *getPassName() const {
-      return "Profiling information loader";
-    }
-
-    // recurseBasicBlock() - Calculates the edge weights for as much basic
-    // blocks as possbile.
-    virtual void recurseBasicBlock(const BasicBlock *BB);
-    virtual void readEdgeOrRemember(Edge, Edge&, unsigned &, double &);
-    virtual void readEdge(ProfileInfo::Edge, std::vector<unsigned>&);
-
-    /// getAdjustedAnalysisPointer - This method is used when a pass implements
-    /// an analysis interface through multiple inheritance.  If needed, it
-    /// should override this to adjust the this pointer as needed for the
-    /// specified pass info.
-    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-      if (PI == &ProfileInfo::ID)
-        return (ProfileInfo*)this;
-      return this;
-    }
-    
-    /// run - Load the profile information from the specified file.
-    virtual bool runOnModule(Module &M);
-  };
-}  // End of anonymous namespace
-
-char LoaderPass::ID = 0;
-INITIALIZE_AG_PASS(LoaderPass, ProfileInfo, "profile-loader",
-              "Load profile information from llvmprof.out", false, true, false)
-
-char &llvm::ProfileLoaderPassID = LoaderPass::ID;
-
-ModulePass *llvm::createProfileLoaderPass() { return new LoaderPass(); }
-
-/// createProfileLoaderPass - This function returns a Pass that loads the
-/// profiling information for the module from the specified filename, making it
-/// available to the optimizers.
-Pass *llvm::createProfileLoaderPass(const std::string &Filename) {
-  return new LoaderPass(Filename);
-}
-
-void LoaderPass::readEdgeOrRemember(Edge edge, Edge &tocalc, 
-                                    unsigned &uncalc, double &count) {
-  double w;
-  if ((w = getEdgeWeight(edge)) == MissingValue) {
-    tocalc = edge;
-    uncalc++;
-  } else {
-    count+=w;
-  }
-}
-
-// recurseBasicBlock - Visits all neighbours of a block and then tries to
-// calculate the missing edge values.
-void LoaderPass::recurseBasicBlock(const BasicBlock *BB) {
-
-  // break recursion if already visited
-  if (BBisUnvisited.find(BB) == BBisUnvisited.end()) return;
-  BBisUnvisited.erase(BB);
-  if (!BB) return;
-
-  for (succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-       bbi != bbe; ++bbi) {
-    recurseBasicBlock(*bbi);
-  }
-  for (const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-       bbi != bbe; ++bbi) {
-    recurseBasicBlock(*bbi);
-  }
-
-  Edge tocalc;
-  if (CalculateMissingEdge(BB, tocalc)) {
-    SpanningTree.erase(tocalc);
-  }
-}
-
-void LoaderPass::readEdge(ProfileInfo::Edge e,
-                          std::vector<unsigned> &ECs) {
-  if (ReadCount < ECs.size()) {
-    double weight = ECs[ReadCount++];
-    if (weight != ProfileInfoLoader::Uncounted) {
-      // Here the data realm changes from the unsigned of the file to the
-      // double of the ProfileInfo. This conversion is save because we know
-      // that everything thats representable in unsinged is also representable
-      // in double.
-      EdgeInformation[getFunction(e)][e] += (double)weight;
-
-      DEBUG(dbgs() << "--Read Edge Counter for " << e
-                   << " (# "<< (ReadCount-1) << "): "
-                   << (unsigned)getEdgeWeight(e) << "\n");
-    } else {
-      // This happens only if reading optimal profiling information, not when
-      // reading regular profiling information.
-      SpanningTree.insert(e);
-    }
-  }
-}
-
-bool LoaderPass::runOnModule(Module &M) {
-  ProfileInfoLoader PIL("profile-loader", Filename);
-
-  EdgeInformation.clear();
-  std::vector<unsigned> Counters = PIL.getRawEdgeCounts();
-  if (Counters.size() > 0) {
-    ReadCount = 0;
-    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-      if (F->isDeclaration()) continue;
-      DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-      readEdge(getEdge(0,&F->getEntryBlock()), Counters);
-      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-        TerminatorInst *TI = BB->getTerminator();
-        for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-          readEdge(getEdge(BB,TI->getSuccessor(s)), Counters);
-        }
-      }
-    }
-    if (ReadCount != Counters.size()) {
-      errs() << "WARNING: profile information is inconsistent with "
-             << "the current program!\n";
-    }
-    NumEdgesRead = ReadCount;
-  }
-
-  Counters = PIL.getRawOptimalEdgeCounts();
-  if (Counters.size() > 0) {
-    ReadCount = 0;
-    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-      if (F->isDeclaration()) continue;
-      DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-      readEdge(getEdge(0,&F->getEntryBlock()), Counters);
-      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-        TerminatorInst *TI = BB->getTerminator();
-        if (TI->getNumSuccessors() == 0) {
-          readEdge(getEdge(BB,0), Counters);
-        }
-        for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-          readEdge(getEdge(BB,TI->getSuccessor(s)), Counters);
-        }
-      }
-      while (SpanningTree.size() > 0) {
-
-        unsigned size = SpanningTree.size();
-
-        BBisUnvisited.clear();
-        for (std::set<Edge>::iterator ei = SpanningTree.begin(),
-             ee = SpanningTree.end(); ei != ee; ++ei) {
-          BBisUnvisited.insert(ei->first);
-          BBisUnvisited.insert(ei->second);
-        }
-        while (BBisUnvisited.size() > 0) {
-          recurseBasicBlock(*BBisUnvisited.begin());
-        }
-
-        if (SpanningTree.size() == size) {
-          DEBUG(dbgs()<<"{");
-          for (std::set<Edge>::iterator ei = SpanningTree.begin(),
-               ee = SpanningTree.end(); ei != ee; ++ei) {
-            DEBUG(dbgs()<< *ei <<",");
-          }
-          assert(0 && "No edge calculated!");
-        }
-
-      }
-    }
-    if (ReadCount != Counters.size()) {
-      errs() << "WARNING: profile information is inconsistent with "
-             << "the current program!\n";
-    }
-    NumEdgesRead = ReadCount;
-  }
-
-  BlockInformation.clear();
-  Counters = PIL.getRawBlockCounts();
-  if (Counters.size() > 0) {
-    ReadCount = 0;
-    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-      if (F->isDeclaration()) continue;
-      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
-        if (ReadCount < Counters.size())
-          // Here the data realm changes from the unsigned of the file to the
-          // double of the ProfileInfo. This conversion is save because we know
-          // that everything thats representable in unsinged is also
-          // representable in double.
-          BlockInformation[F][BB] = (double)Counters[ReadCount++];
-    }
-    if (ReadCount != Counters.size()) {
-      errs() << "WARNING: profile information is inconsistent with "
-             << "the current program!\n";
-    }
-  }
-
-  FunctionInformation.clear();
-  Counters = PIL.getRawFunctionCounts();
-  if (Counters.size() > 0) {
-    ReadCount = 0;
-    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-      if (F->isDeclaration()) continue;
-      if (ReadCount < Counters.size())
-        // Here the data realm changes from the unsigned of the file to the
-        // double of the ProfileInfo. This conversion is save because we know
-        // that everything thats representable in unsinged is also
-        // representable in double.
-        FunctionInformation[F] = (double)Counters[ReadCount++];
-    }
-    if (ReadCount != Counters.size()) {
-      errs() << "WARNING: profile information is inconsistent with "
-             << "the current program!\n";
-    }
-  }
-
-  return false;
-}
diff --git a/lib/Analysis/ProfileVerifierPass.cpp b/lib/Analysis/ProfileVerifierPass.cpp
deleted file mode 100644
index c8896de..0000000
--- a/lib/Analysis/ProfileVerifierPass.cpp
+++ /dev/null
@@ -1,383 +0,0 @@
-//===- ProfileVerifierPass.cpp - LLVM Pass to estimate profile info -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a pass that checks profiling information for 
-// plausibility.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-verifier"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/InstIterator.h"
-#include "llvm/Support/raw_ostream.h"
-#include <set>
-using namespace llvm;
-
-static cl::opt<bool,false>
-ProfileVerifierDisableAssertions("profile-verifier-noassert",
-     cl::desc("Disable assertions"));
-
-namespace {
-  template<class FType, class BType>
-  class ProfileVerifierPassT : public FunctionPass {
-
-    struct DetailedBlockInfo {
-      const BType *BB;
-      double      BBWeight;
-      double      inWeight;
-      int         inCount;
-      double      outWeight;
-      int         outCount;
-    };
-
-    ProfileInfoT<FType, BType> *PI;
-    std::set<const BType*> BBisVisited;
-    std::set<const FType*>   FisVisited;
-    bool DisableAssertions;
-
-    // When debugging is enabled, the verifier prints a whole slew of debug
-    // information, otherwise its just the assert. These are all the helper
-    // functions.
-    bool PrintedDebugTree;
-    std::set<const BType*> BBisPrinted;
-    void debugEntry(DetailedBlockInfo*);
-    void printDebugInfo(const BType *BB);
-
-  public:
-    static char ID; // Class identification, replacement for typeinfo
-
-    explicit ProfileVerifierPassT () : FunctionPass(ID) {
-      initializeProfileVerifierPassPass(*PassRegistry::getPassRegistry());
-      DisableAssertions = ProfileVerifierDisableAssertions;
-    }
-    explicit ProfileVerifierPassT (bool da) : FunctionPass(ID), 
-                                              DisableAssertions(da) {
-      initializeProfileVerifierPassPass(*PassRegistry::getPassRegistry());
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-      AU.addRequired<ProfileInfoT<FType, BType> >();
-    }
-
-    const char *getPassName() const {
-      return "Profiling information verifier";
-    }
-
-    /// run - Verify the profile information.
-    bool runOnFunction(FType &F);
-    void recurseBasicBlock(const BType*);
-
-    bool   exitReachable(const FType*);
-    double ReadOrAssert(typename ProfileInfoT<FType, BType>::Edge);
-    void   CheckValue(bool, const char*, DetailedBlockInfo*);
-  };
-
-  typedef ProfileVerifierPassT<Function, BasicBlock> ProfileVerifierPass;
-
-  template<class FType, class BType>
-  void ProfileVerifierPassT<FType, BType>::printDebugInfo(const BType *BB) {
-
-    if (BBisPrinted.find(BB) != BBisPrinted.end()) return;
-
-    double BBWeight = PI->getExecutionCount(BB);
-    if (BBWeight == ProfileInfoT<FType, BType>::MissingValue) { BBWeight = 0; }
-    double inWeight = 0;
-    int inCount = 0;
-    std::set<const BType*> ProcessedPreds;
-    for (const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-         bbi != bbe; ++bbi ) {
-      if (ProcessedPreds.insert(*bbi).second) {
-        typename ProfileInfoT<FType, BType>::Edge E = PI->getEdge(*bbi,BB);
-        double EdgeWeight = PI->getEdgeWeight(E);
-        if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { EdgeWeight = 0; }
-        dbgs() << "calculated in-edge " << E << ": " 
-               << format("%20.20g",EdgeWeight) << "\n";
-        inWeight += EdgeWeight;
-        inCount++;
-      }
-    }
-    double outWeight = 0;
-    int outCount = 0;
-    std::set<const BType*> ProcessedSuccs;
-    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-          bbi != bbe; ++bbi ) {
-      if (ProcessedSuccs.insert(*bbi).second) {
-        typename ProfileInfoT<FType, BType>::Edge E = PI->getEdge(BB,*bbi);
-        double EdgeWeight = PI->getEdgeWeight(E);
-        if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { EdgeWeight = 0; }
-        dbgs() << "calculated out-edge " << E << ": " 
-               << format("%20.20g",EdgeWeight) << "\n";
-        outWeight += EdgeWeight;
-        outCount++;
-      }
-    }
-    dbgs() << "Block " << BB->getName()                   << " in "
-           << BB->getParent()->getName()                  << ":"
-           << "BBWeight="  << format("%20.20g",BBWeight)  << ","
-           << "inWeight="  << format("%20.20g",inWeight)  << ","
-           << "inCount="   << inCount                     << ","
-           << "outWeight=" << format("%20.20g",outWeight) << ","
-           << "outCount"   << outCount                    << "\n";
-
-    // mark as visited and recurse into subnodes
-    BBisPrinted.insert(BB);
-    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB); 
-          bbi != bbe; ++bbi ) {
-      printDebugInfo(*bbi);
-    }
-  }
-
-  template<class FType, class BType>
-  void ProfileVerifierPassT<FType, BType>::debugEntry (DetailedBlockInfo *DI) {
-    dbgs() << "TROUBLE: Block " << DI->BB->getName()          << " in "
-           << DI->BB->getParent()->getName()                  << ":"
-           << "BBWeight="  << format("%20.20g",DI->BBWeight)  << ","
-           << "inWeight="  << format("%20.20g",DI->inWeight)  << ","
-           << "inCount="   << DI->inCount                     << ","
-           << "outWeight=" << format("%20.20g",DI->outWeight) << ","
-           << "outCount="  << DI->outCount                    << "\n";
-    if (!PrintedDebugTree) {
-      PrintedDebugTree = true;
-      printDebugInfo(&(DI->BB->getParent()->getEntryBlock()));
-    }
-  }
-
-  // This compares A and B for equality.
-  static bool Equals(double A, double B) {
-    return A == B;
-  }
-
-  // This checks if the function "exit" is reachable from an given function
-  // via calls, this is necessary to check if a profile is valid despite the
-  // counts not fitting exactly.
-  template<class FType, class BType>
-  bool ProfileVerifierPassT<FType, BType>::exitReachable(const FType *F) {
-    if (!F) return false;
-
-    if (FisVisited.count(F)) return false;
-
-    FType *Exit = F->getParent()->getFunction("exit");
-    if (Exit == F) {
-      return true;
-    }
-
-    FisVisited.insert(F);
-    bool exits = false;
-    for (const_inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
-      if (const CallInst *CI = dyn_cast<CallInst>(&*I)) {
-        FType *F = CI->getCalledFunction();
-        if (F) {
-          exits |= exitReachable(F);
-        } else {
-          // This is a call to a pointer, all bets are off...
-          exits = true;
-        }
-        if (exits) break;
-      }
-    }
-    return exits;
-  }
-
-  #define ASSERTMESSAGE(M) \
-    { dbgs() << "ASSERT:" << (M) << "\n"; \
-      if (!DisableAssertions) assert(0 && (M)); }
-
-  template<class FType, class BType>
-  double ProfileVerifierPassT<FType, BType>::ReadOrAssert(typename ProfileInfoT<FType, BType>::Edge E) {
-    double EdgeWeight = PI->getEdgeWeight(E);
-    if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) {
-      dbgs() << "Edge " << E << " in Function " 
-             << ProfileInfoT<FType, BType>::getFunction(E)->getName() << ": ";
-      ASSERTMESSAGE("Edge has missing value");
-      return 0;
-    } else {
-      if (EdgeWeight < 0) {
-        dbgs() << "Edge " << E << " in Function " 
-               << ProfileInfoT<FType, BType>::getFunction(E)->getName() << ": ";
-        ASSERTMESSAGE("Edge has negative value");
-      }
-      return EdgeWeight;
-    }
-  }
-
-  template<class FType, class BType>
-  void ProfileVerifierPassT<FType, BType>::CheckValue(bool Error, 
-                                                      const char *Message,
-                                                      DetailedBlockInfo *DI) {
-    if (Error) {
-      DEBUG(debugEntry(DI));
-      dbgs() << "Block " << DI->BB->getName() << " in Function "
-             << DI->BB->getParent()->getName() << ": ";
-      ASSERTMESSAGE(Message);
-    }
-    return;
-  }
-
-  // This calculates the Information for a block and then recurses into the
-  // successors.
-  template<class FType, class BType>
-  void ProfileVerifierPassT<FType, BType>::recurseBasicBlock(const BType *BB) {
-
-    // Break the recursion by remembering all visited blocks.
-    if (BBisVisited.find(BB) != BBisVisited.end()) return;
-
-    // Use a data structure to store all the information, this can then be handed
-    // to debug printers.
-    DetailedBlockInfo DI;
-    DI.BB = BB;
-    DI.outCount = DI.inCount = 0;
-    DI.inWeight = DI.outWeight = 0;
-
-    // Read predecessors.
-    std::set<const BType*> ProcessedPreds;
-    const_pred_iterator bpi = pred_begin(BB), bpe = pred_end(BB);
-    // If there are none, check for (0,BB) edge.
-    if (bpi == bpe) {
-      DI.inWeight += ReadOrAssert(PI->getEdge(0,BB));
-      DI.inCount++;
-    }
-    for (;bpi != bpe; ++bpi) {
-      if (ProcessedPreds.insert(*bpi).second) {
-        DI.inWeight += ReadOrAssert(PI->getEdge(*bpi,BB));
-        DI.inCount++;
-      }
-    }
-
-    // Read successors.
-    std::set<const BType*> ProcessedSuccs;
-    succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-    // If there is an (0,BB) edge, consider it too. (This is done not only when
-    // there are no successors, but every time; not every function contains
-    // return blocks with no successors (think loop latch as return block)).
-    double w = PI->getEdgeWeight(PI->getEdge(BB,0));
-    if (w != ProfileInfoT<FType, BType>::MissingValue) {
-      DI.outWeight += w;
-      DI.outCount++;
-    }
-    for (;bbi != bbe; ++bbi) {
-      if (ProcessedSuccs.insert(*bbi).second) {
-        DI.outWeight += ReadOrAssert(PI->getEdge(BB,*bbi));
-        DI.outCount++;
-      }
-    }
-
-    // Read block weight.
-    DI.BBWeight = PI->getExecutionCount(BB);
-    CheckValue(DI.BBWeight == ProfileInfoT<FType, BType>::MissingValue,
-               "BasicBlock has missing value", &DI);
-    CheckValue(DI.BBWeight < 0,
-               "BasicBlock has negative value", &DI);
-
-    // Check if this block is a setjmp target.
-    bool isSetJmpTarget = false;
-    if (DI.outWeight > DI.inWeight) {
-      for (typename BType::const_iterator i = BB->begin(), ie = BB->end();
-           i != ie; ++i) {
-        if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
-          FType *F = CI->getCalledFunction();
-          if (F && (F->getName() == "_setjmp")) {
-            isSetJmpTarget = true; break;
-          }
-        }
-      }
-    }
-    // Check if this block is eventually reaching exit.
-    bool isExitReachable = false;
-    if (DI.inWeight > DI.outWeight) {
-      for (typename BType::const_iterator i = BB->begin(), ie = BB->end();
-           i != ie; ++i) {
-        if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
-          FType *F = CI->getCalledFunction();
-          if (F) {
-            FisVisited.clear();
-            isExitReachable |= exitReachable(F);
-          } else {
-            // This is a call to a pointer, all bets are off...
-            isExitReachable = true;
-          }
-          if (isExitReachable) break;
-        }
-      }
-    }
-
-    if (DI.inCount > 0 && DI.outCount == 0) {
-       // If this is a block with no successors.
-      if (!isSetJmpTarget) {
-        CheckValue(!Equals(DI.inWeight,DI.BBWeight), 
-                   "inWeight and BBWeight do not match", &DI);
-      }
-    } else if (DI.inCount == 0 && DI.outCount > 0) {
-      // If this is a block with no predecessors.
-      if (!isExitReachable)
-        CheckValue(!Equals(DI.BBWeight,DI.outWeight), 
-                   "BBWeight and outWeight do not match", &DI);
-    } else {
-      // If this block has successors and predecessors.
-      if (DI.inWeight > DI.outWeight && !isExitReachable)
-        CheckValue(!Equals(DI.inWeight,DI.outWeight), 
-                   "inWeight and outWeight do not match", &DI);
-      if (DI.inWeight < DI.outWeight && !isSetJmpTarget)
-        CheckValue(!Equals(DI.inWeight,DI.outWeight), 
-                   "inWeight and outWeight do not match", &DI);
-    }
-
-
-    // Mark this block as visited, rescurse into successors.
-    BBisVisited.insert(BB);
-    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB); 
-          bbi != bbe; ++bbi ) {
-      recurseBasicBlock(*bbi);
-    }
-  }
-
-  template<class FType, class BType>
-  bool ProfileVerifierPassT<FType, BType>::runOnFunction(FType &F) {
-    PI = getAnalysisIfAvailable<ProfileInfoT<FType, BType> >();
-    if (!PI)
-      ASSERTMESSAGE("No ProfileInfo available");
-
-    // Prepare global variables.
-    PrintedDebugTree = false;
-    BBisVisited.clear();
-
-    // Fetch entry block and recurse into it.
-    const BType *entry = &F.getEntryBlock();
-    recurseBasicBlock(entry);
-
-    if (PI->getExecutionCount(&F) != PI->getExecutionCount(entry))
-      ASSERTMESSAGE("Function count and entry block count do not match");
-
-    return false;
-  }
-
-  template<class FType, class BType>
-  char ProfileVerifierPassT<FType, BType>::ID = 0;
-}
-
-INITIALIZE_PASS_BEGIN(ProfileVerifierPass, "profile-verifier",
-                "Verify profiling information", false, true)
-INITIALIZE_AG_DEPENDENCY(ProfileInfo)
-INITIALIZE_PASS_END(ProfileVerifierPass, "profile-verifier",
-                "Verify profiling information", false, true)
-
-namespace llvm {
-  FunctionPass *createProfileVerifierPass() {
-    return new ProfileVerifierPass(ProfileVerifierDisableAssertions); 
-  }
-}
-
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index 8577025..5635688 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -9,6 +9,7 @@
 // Detects single entry single exit regions in the control flow graph.
 //===----------------------------------------------------------------------===//
 
+#define DEBUG_TYPE "region"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Statistic.h"
@@ -17,12 +18,9 @@
 #include "llvm/Assembly/Writer.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
-
-#define DEBUG_TYPE "region"
 #include "llvm/Support/Debug.h"
-
-#include <set>
 #include <algorithm>
+#include <set>
 
 using namespace llvm;
 
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index f5d095b..0a02f4e 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -2590,55 +2590,39 @@ const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS,
   return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
 }
 
-const SCEV *ScalarEvolution::getSizeOfExpr(Type *AllocTy) {
+const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) {
   // If we have DataLayout, we can bypass creating a target-independent
   // constant expression and then folding it back into a ConstantInt.
   // This is just a compile-time optimization.
   if (TD)
-    return getConstant(TD->getIntPtrType(getContext()),
-                       TD->getTypeAllocSize(AllocTy));
+    return getConstant(IntTy, TD->getTypeAllocSize(AllocTy));
 
   Constant *C = ConstantExpr::getSizeOf(AllocTy);
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
     if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
       C = Folded;
   Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
+  assert(Ty == IntTy && "Effective SCEV type doesn't match");
   return getTruncateOrZeroExtend(getSCEV(C), Ty);
 }
 
-const SCEV *ScalarEvolution::getAlignOfExpr(Type *AllocTy) {
-  Constant *C = ConstantExpr::getAlignOf(AllocTy);
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
-    if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
-      C = Folded;
-  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
-  return getTruncateOrZeroExtend(getSCEV(C), Ty);
-}
-
-const SCEV *ScalarEvolution::getOffsetOfExpr(StructType *STy,
+const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy,
+                                             StructType *STy,
                                              unsigned FieldNo) {
   // If we have DataLayout, we can bypass creating a target-independent
   // constant expression and then folding it back into a ConstantInt.
   // This is just a compile-time optimization.
-  if (TD)
-    return getConstant(TD->getIntPtrType(getContext()),
+  if (TD) {
+    return getConstant(IntTy,
                        TD->getStructLayout(STy)->getElementOffset(FieldNo));
+  }
 
   Constant *C = ConstantExpr::getOffsetOf(STy, FieldNo);
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
     if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
       C = Folded;
-  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(STy));
-  return getTruncateOrZeroExtend(getSCEV(C), Ty);
-}
 
-const SCEV *ScalarEvolution::getOffsetOfExpr(Type *CTy,
-                                             Constant *FieldNo) {
-  Constant *C = ConstantExpr::getOffsetOf(CTy, FieldNo);
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
-    if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
-      C = Folded;
-  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(CTy));
+  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(STy));
   return getTruncateOrZeroExtend(getSCEV(C), Ty);
 }
 
@@ -2703,12 +2687,15 @@ uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const {
 Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
   assert(isSCEVable(Ty) && "Type is not SCEVable!");
 
-  if (Ty->isIntegerTy())
+  if (Ty->isIntegerTy()) {
     return Ty;
+  }
 
   // The only other support type is pointer.
   assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!");
-  if (TD) return TD->getIntPtrType(getContext());
+
+  if (TD)
+    return TD->getIntPtrType(Ty);
 
   // Without DataLayout, conservatively assume pointers are 64-bit.
   return Type::getInt64Ty(getContext());
@@ -3101,15 +3088,26 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
                   Flags = setFlags(Flags, SCEV::FlagNUW);
                 if (OBO->hasNoSignedWrap())
                   Flags = setFlags(Flags, SCEV::FlagNSW);
-              } else if (const GEPOperator *GEP =
-                         dyn_cast<GEPOperator>(BEValueV)) {
+              } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) {
                 // If the increment is an inbounds GEP, then we know the address
                 // space cannot be wrapped around. We cannot make any guarantee
                 // about signed or unsigned overflow because pointers are
                 // unsigned but we may have a negative index from the base
-                // pointer.
-                if (GEP->isInBounds())
+                // pointer. We can guarantee that no unsigned wrap occurs if the
+                // indices form a positive value.
+                if (GEP->isInBounds()) {
                   Flags = setFlags(Flags, SCEV::FlagNW);
+
+                  const SCEV *Ptr = getSCEV(GEP->getPointerOperand());
+                  if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr)))
+                    Flags = setFlags(Flags, SCEV::FlagNUW);
+                }
+              } else if (const SubOperator *OBO =
+                           dyn_cast<SubOperator>(BEValueV)) {
+                if (OBO->hasNoUnsignedWrap())
+                  Flags = setFlags(Flags, SCEV::FlagNUW);
+                if (OBO->hasNoSignedWrap())
+                  Flags = setFlags(Flags, SCEV::FlagNSW);
               }
 
               const SCEV *StartVal = getSCEV(StartValueV);
@@ -3177,18 +3175,18 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
 /// operations. This allows them to be analyzed by regular SCEV code.
 ///
 const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
+  Type *IntPtrTy = getEffectiveSCEVType(GEP->getType());
+  Value *Base = GEP->getOperand(0);
+  // Don't attempt to analyze GEPs over unsized objects.
+  if (!Base->getType()->getPointerElementType()->isSized())
+    return getUnknown(GEP);
 
   // Don't blindly transfer the inbounds flag from the GEP instruction to the
   // Add expression, because the Instruction may be guarded by control flow
   // and the no-overflow bits may not be valid for the expression in any
   // context.
-  bool isInBounds = GEP->isInBounds();
+  SCEV::NoWrapFlags Wrap = GEP->isInBounds() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
 
-  Type *IntPtrTy = getEffectiveSCEVType(GEP->getType());
-  Value *Base = GEP->getOperand(0);
-  // Don't attempt to analyze GEPs over unsized objects.
-  if (!cast<PointerType>(Base->getType())->getElementType()->isSized())
-    return getUnknown(GEP);
   const SCEV *TotalOffset = getConstant(IntPtrTy, 0);
   gep_type_iterator GTI = gep_type_begin(GEP);
   for (GetElementPtrInst::op_iterator I = llvm::next(GEP->op_begin()),
@@ -3199,21 +3197,19 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
     if (StructType *STy = dyn_cast<StructType>(*GTI++)) {
       // For a struct, add the member offset.
       unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
-      const SCEV *FieldOffset = getOffsetOfExpr(STy, FieldNo);
+      const SCEV *FieldOffset = getOffsetOfExpr(IntPtrTy, STy, FieldNo);
 
       // Add the field offset to the running total offset.
       TotalOffset = getAddExpr(TotalOffset, FieldOffset);
     } else {
       // For an array, add the element offset, explicitly scaled.
-      const SCEV *ElementSize = getSizeOfExpr(*GTI);
+      const SCEV *ElementSize = getSizeOfExpr(IntPtrTy, *GTI);
       const SCEV *IndexS = getSCEV(Index);
       // Getelementptr indices are signed.
       IndexS = getTruncateOrSignExtend(IndexS, IntPtrTy);
 
       // Multiply the index by the element size to compute the element offset.
-      const SCEV *LocalOffset = getMulExpr(IndexS, ElementSize,
-                                           isInBounds ? SCEV::FlagNSW :
-                                           SCEV::FlagAnyWrap);
+      const SCEV *LocalOffset = getMulExpr(IndexS, ElementSize, Wrap);
 
       // Add the element offset to the running total offset.
       TotalOffset = getAddExpr(TotalOffset, LocalOffset);
@@ -3224,8 +3220,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
   const SCEV *BaseS = getSCEV(Base);
 
   // Add the total offset from all the GEP indices to the base.
-  return getAddExpr(BaseS, TotalOffset,
-                    isInBounds ? SCEV::FlagNSW : SCEV::FlagAnyWrap);
+  return getAddExpr(BaseS, TotalOffset, Wrap);
 }
 
 /// GetMinTrailingZeros - Determine the minimum number of zero bits that S is
@@ -4616,25 +4611,17 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L,
     if (EL.hasAnyInfo()) return EL;
     break;
   }
-  case ICmpInst::ICMP_SLT: {
-    ExitLimit EL = HowManyLessThans(LHS, RHS, L, true, IsSubExpr);
-    if (EL.hasAnyInfo()) return EL;
-    break;
-  }
-  case ICmpInst::ICMP_SGT: {
-    ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
-                                    getNotSCEV(RHS), L, true, IsSubExpr);
-    if (EL.hasAnyInfo()) return EL;
-    break;
-  }
-  case ICmpInst::ICMP_ULT: {
-    ExitLimit EL = HowManyLessThans(LHS, RHS, L, false, IsSubExpr);
+  case ICmpInst::ICMP_SLT:
+  case ICmpInst::ICMP_ULT: {                    // while (X < Y)
+    bool IsSigned = Cond == ICmpInst::ICMP_SLT;
+    ExitLimit EL = HowManyLessThans(LHS, RHS, L, IsSigned, IsSubExpr);
     if (EL.hasAnyInfo()) return EL;
     break;
   }
-  case ICmpInst::ICMP_UGT: {
-    ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
-                                    getNotSCEV(RHS), L, false, IsSubExpr);
+  case ICmpInst::ICMP_SGT:
+  case ICmpInst::ICMP_UGT: {                    // while (X > Y)
+    bool IsSigned = Cond == ICmpInst::ICMP_SGT;
+    ExitLimit EL = HowManyGreaterThans(LHS, RHS, L, IsSigned, IsSubExpr);
     if (EL.hasAnyInfo()) return EL;
     break;
   }
@@ -5072,15 +5059,21 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L,
 /// original value V is returned.
 const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
   // Check to see if we've folded this expression at this loop before.
-  std::map<const Loop *, const SCEV *> &Values = ValuesAtScopes[V];
-  std::pair<std::map<const Loop *, const SCEV *>::iterator, bool> Pair =
-    Values.insert(std::make_pair(L, static_cast<const SCEV *>(0)));
-  if (!Pair.second)
-    return Pair.first->second ? Pair.first->second : V;
-
+  SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values = ValuesAtScopes[V];
+  for (unsigned u = 0; u < Values.size(); u++) {
+    if (Values[u].first == L)
+      return Values[u].second ? Values[u].second : V;
+  }
+  Values.push_back(std::make_pair(L, static_cast<const SCEV *>(0)));
   // Otherwise compute it.
   const SCEV *C = computeSCEVAtScope(V, L);
-  ValuesAtScopes[V][L] = C;
+  SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values2 = ValuesAtScopes[V];
+  for (unsigned u = Values2.size(); u > 0; u--) {
+    if (Values2[u - 1].first == L) {
+      Values2[u - 1].second = C;
+      break;
+    }
+  }
   return C;
 }
 
@@ -5119,18 +5112,23 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
     case scAddExpr: {
       const SCEVAddExpr *SA = cast<SCEVAddExpr>(V);
       if (Constant *C = BuildConstantFromSCEV(SA->getOperand(0))) {
-        if (C->getType()->isPointerTy())
-          C = ConstantExpr::getBitCast(C, Type::getInt8PtrTy(C->getContext()));
+        if (PointerType *PTy = dyn_cast<PointerType>(C->getType())) {
+          unsigned AS = PTy->getAddressSpace();
+          Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS);
+          C = ConstantExpr::getBitCast(C, DestPtrTy);
+        }
         for (unsigned i = 1, e = SA->getNumOperands(); i != e; ++i) {
           Constant *C2 = BuildConstantFromSCEV(SA->getOperand(i));
           if (!C2) return 0;
 
           // First pointer!
           if (!C->getType()->isPointerTy() && C2->getType()->isPointerTy()) {
+            unsigned AS = C2->getType()->getPointerAddressSpace();
             std::swap(C, C2);
+            Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS);
             // The offsets have been converted to bytes.  We can add bytes to an
             // i8* by GEP with the byte count in the first index.
-            C = ConstantExpr::getBitCast(C,Type::getInt8PtrTy(C->getContext()));
+            C = ConstantExpr::getBitCast(C, DestPtrTy);
           }
 
           // Don't bother trying to sum two pointers. We probably can't
@@ -5138,8 +5136,8 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
           if (C2->getType()->isPointerTy())
             return 0;
 
-          if (C->getType()->isPointerTy()) {
-            if (cast<PointerType>(C->getType())->getElementType()->isStructTy())
+          if (PointerType *PTy = dyn_cast<PointerType>(C->getType())) {
+            if (PTy->getElementType()->isStructTy())
               C2 = ConstantExpr::getIntegerCast(
                   C2, Type::getInt32Ty(C->getContext()), true);
             C = ConstantExpr::getGetElementPtr(C, C2);
@@ -6336,45 +6334,72 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
   return false;
 }
 
-/// getBECount - Subtract the end and start values and divide by the step,
-/// rounding up, to get the number of times the backedge is executed. Return
-/// CouldNotCompute if an intermediate computation overflows.
-const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
-                                        const SCEV *End,
-                                        const SCEV *Step,
-                                        bool NoWrap) {
-  assert(!isKnownNegative(Step) &&
-         "This code doesn't handle negative strides yet!");
-
-  Type *Ty = Start->getType();
-
-  // When Start == End, we have an exact BECount == 0. Short-circuit this case
-  // here because SCEV may not be able to determine that the unsigned division
-  // after rounding is zero.
-  if (Start == End)
-    return getConstant(Ty, 0);
-
-  const SCEV *NegOne = getConstant(Ty, (uint64_t)-1);
-  const SCEV *Diff = getMinusSCEV(End, Start);
-  const SCEV *RoundUp = getAddExpr(Step, NegOne);
-
-  // Add an adjustment to the difference between End and Start so that
-  // the division will effectively round up.
-  const SCEV *Add = getAddExpr(Diff, RoundUp);
-
-  if (!NoWrap) {
-    // Check Add for unsigned overflow.
-    // TODO: More sophisticated things could be done here.
-    Type *WideTy = IntegerType::get(getContext(),
-                                          getTypeSizeInBits(Ty) + 1);
-    const SCEV *EDiff = getZeroExtendExpr(Diff, WideTy);
-    const SCEV *ERoundUp = getZeroExtendExpr(RoundUp, WideTy);
-    const SCEV *OperandExtendedAdd = getAddExpr(EDiff, ERoundUp);
-    if (getZeroExtendExpr(Add, WideTy) != OperandExtendedAdd)
-      return getCouldNotCompute();
+// Verify if an linear IV with positive stride can overflow when in a 
+// less-than comparison, knowing the invariant term of the comparison, the 
+// stride and the knowledge of NSW/NUW flags on the recurrence.
+bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
+                                         bool IsSigned, bool NoWrap) {
+  if (NoWrap) return false;
+
+  unsigned BitWidth = getTypeSizeInBits(RHS->getType());
+  const SCEV *One = getConstant(Stride->getType(), 1);
+
+  if (IsSigned) {
+    APInt MaxRHS = getSignedRange(RHS).getSignedMax();
+    APInt MaxValue = APInt::getSignedMaxValue(BitWidth);
+    APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One))
+                                .getSignedMax();
+
+    // SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow!
+    return (MaxValue - MaxStrideMinusOne).slt(MaxRHS);
+  }
+
+  APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax();
+  APInt MaxValue = APInt::getMaxValue(BitWidth);
+  APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One))
+                              .getUnsignedMax();
+
+  // UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow!
+  return (MaxValue - MaxStrideMinusOne).ult(MaxRHS);
+}
+
+// Verify if an linear IV with negative stride can overflow when in a 
+// greater-than comparison, knowing the invariant term of the comparison,
+// the stride and the knowledge of NSW/NUW flags on the recurrence.
+bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
+                                         bool IsSigned, bool NoWrap) {
+  if (NoWrap) return false;
+
+  unsigned BitWidth = getTypeSizeInBits(RHS->getType());
+  const SCEV *One = getConstant(Stride->getType(), 1);
+
+  if (IsSigned) {
+    APInt MinRHS = getSignedRange(RHS).getSignedMin();
+    APInt MinValue = APInt::getSignedMinValue(BitWidth);
+    APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One))
+                               .getSignedMax();
+
+    // SMinRHS - SMaxStrideMinusOne < SMinValue => overflow!
+    return (MinValue + MaxStrideMinusOne).sgt(MinRHS);
   }
 
-  return getUDivExpr(Add, Step);
+  APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin();
+  APInt MinValue = APInt::getMinValue(BitWidth);
+  APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One))
+                            .getUnsignedMax();
+
+  // UMinRHS - UMaxStrideMinusOne < UMinValue => overflow!
+  return (MinValue + MaxStrideMinusOne).ugt(MinRHS);
+}
+
+// Compute the backedge taken count knowing the interval difference, the
+// stride and presence of the equality in the comparison.
+const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step, 
+                                            bool Equality) {
+  const SCEV *One = getConstant(Step->getType(), 1);
+  Delta = Equality ? getAddExpr(Delta, Step)
+                   : getAddExpr(Delta, getMinusSCEV(Step, One));
+  return getUDivExpr(Delta, Step);
 }
 
 /// HowManyLessThans - Return the number of times a backedge containing the
@@ -6386,119 +6411,144 @@ const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
 /// a subexpression that cannot overflow before evaluating true.
 ScalarEvolution::ExitLimit
 ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
-                                  const Loop *L, bool isSigned,
+                                  const Loop *L, bool IsSigned,
                                   bool IsSubExpr) {
-  // Only handle:  "ADDREC < LoopInvariant".
-  if (!isLoopInvariant(RHS, L)) return getCouldNotCompute();
+  // We handle only IV < Invariant
+  if (!isLoopInvariant(RHS, L))
+    return getCouldNotCompute();
 
-  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(LHS);
-  if (!AddRec || AddRec->getLoop() != L)
+  const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
+
+  // Avoid weird loops
+  if (!IV || IV->getLoop() != L || !IV->isAffine())
     return getCouldNotCompute();
 
-  // Check to see if we have a flag which makes analysis easy.
-  bool NoWrap = false;
-  if (!IsSubExpr) {
-    NoWrap = AddRec->getNoWrapFlags(
-      (SCEV::NoWrapFlags)(((isSigned ? SCEV::FlagNSW : SCEV::FlagNUW))
-                          | SCEV::FlagNW));
-  }
-  if (AddRec->isAffine()) {
-    unsigned BitWidth = getTypeSizeInBits(AddRec->getType());
-    const SCEV *Step = AddRec->getStepRecurrence(*this);
+  bool NoWrap = !IsSubExpr &&
+                IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW);
 
-    if (Step->isZero())
-      return getCouldNotCompute();
-    if (Step->isOne()) {
-      // With unit stride, the iteration never steps past the limit value.
-    } else if (isKnownPositive(Step)) {
-      // Test whether a positive iteration can step past the limit
-      // value and past the maximum value for its type in a single step.
-      // Note that it's not sufficient to check NoWrap here, because even
-      // though the value after a wrap is undefined, it's not undefined
-      // behavior, so if wrap does occur, the loop could either terminate or
-      // loop infinitely, but in either case, the loop is guaranteed to
-      // iterate at least until the iteration where the wrapping occurs.
-      const SCEV *One = getConstant(Step->getType(), 1);
-      if (isSigned) {
-        APInt Max = APInt::getSignedMaxValue(BitWidth);
-        if ((Max - getSignedRange(getMinusSCEV(Step, One)).getSignedMax())
-              .slt(getSignedRange(RHS).getSignedMax()))
-          return getCouldNotCompute();
-      } else {
-        APInt Max = APInt::getMaxValue(BitWidth);
-        if ((Max - getUnsignedRange(getMinusSCEV(Step, One)).getUnsignedMax())
-              .ult(getUnsignedRange(RHS).getUnsignedMax()))
-          return getCouldNotCompute();
-      }
-    } else
-      // TODO: Handle negative strides here and below.
-      return getCouldNotCompute();
+  const SCEV *Stride = IV->getStepRecurrence(*this);
 
-    // We know the LHS is of the form {n,+,s} and the RHS is some loop-invariant
-    // m.  So, we count the number of iterations in which {n,+,s} < m is true.
-    // Note that we cannot simply return max(m-n,0)/s because it's not safe to
-    // treat m-n as signed nor unsigned due to overflow possibility.
-
-    // First, we get the value of the LHS in the first iteration: n
-    const SCEV *Start = AddRec->getOperand(0);
-
-    // Determine the minimum constant start value.
-    const SCEV *MinStart = getConstant(isSigned ?
-      getSignedRange(Start).getSignedMin() :
-      getUnsignedRange(Start).getUnsignedMin());
-
-    // If we know that the condition is true in order to enter the loop,
-    // then we know that it will run exactly (m-n)/s times. Otherwise, we
-    // only know that it will execute (max(m,n)-n)/s times. In both cases,
-    // the division must round up.
-    const SCEV *End = RHS;
-    if (!isLoopEntryGuardedByCond(L,
-                                  isSigned ? ICmpInst::ICMP_SLT :
-                                             ICmpInst::ICMP_ULT,
-                                  getMinusSCEV(Start, Step), RHS))
-      End = isSigned ? getSMaxExpr(RHS, Start)
-                     : getUMaxExpr(RHS, Start);
-
-    // Determine the maximum constant end value.
-    const SCEV *MaxEnd = getConstant(isSigned ?
-      getSignedRange(End).getSignedMax() :
-      getUnsignedRange(End).getUnsignedMax());
-
-    // If MaxEnd is within a step of the maximum integer value in its type,
-    // adjust it down to the minimum value which would produce the same effect.
-    // This allows the subsequent ceiling division of (N+(step-1))/step to
-    // compute the correct value.
-    const SCEV *StepMinusOne = getMinusSCEV(Step,
-                                            getConstant(Step->getType(), 1));
-    MaxEnd = isSigned ?
-      getSMinExpr(MaxEnd,
-                  getMinusSCEV(getConstant(APInt::getSignedMaxValue(BitWidth)),
-                               StepMinusOne)) :
-      getUMinExpr(MaxEnd,
-                  getMinusSCEV(getConstant(APInt::getMaxValue(BitWidth)),
-                               StepMinusOne));
-
-    // Finally, we subtract these two values and divide, rounding up, to get
-    // the number of times the backedge is executed.
-    const SCEV *BECount = getBECount(Start, End, Step, NoWrap);
-
-    // The maximum backedge count is similar, except using the minimum start
-    // value and the maximum end value.
-    // If we already have an exact constant BECount, use it instead.
-    const SCEV *MaxBECount = isa<SCEVConstant>(BECount) ? BECount
-      : getBECount(MinStart, MaxEnd, Step, NoWrap);
-
-    // If the stride is nonconstant, and NoWrap == true, then
-    // getBECount(MinStart, MaxEnd) may not compute. This would result in an
-    // exact BECount and invalid MaxBECount, which should be avoided to catch
-    // more optimization opportunities.
-    if (isa<SCEVCouldNotCompute>(MaxBECount))
-      MaxBECount = BECount;
-
-    return ExitLimit(BECount, MaxBECount);
-  }
+  // Avoid negative or zero stride values
+  if (!isKnownPositive(Stride))
+    return getCouldNotCompute();
 
-  return getCouldNotCompute();
+  // Avoid proven overflow cases: this will ensure that the backedge taken count
+  // will not generate any unsigned overflow. Relaxed no-overflow conditions
+  // exploit NoWrapFlags, allowing to optimize in presence of undefined 
+  // behaviors like the case of C language.
+  if (!Stride->isOne() && doesIVOverflowOnLT(RHS, Stride, IsSigned, NoWrap))
+    return getCouldNotCompute();
+
+  ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SLT
+                                      : ICmpInst::ICMP_ULT;
+  const SCEV *Start = IV->getStart();
+  const SCEV *End = RHS;
+  if (!isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(Start, Stride), RHS))
+    End = IsSigned ? getSMaxExpr(RHS, Start)
+                   : getUMaxExpr(RHS, Start);
+
+  const SCEV *BECount = computeBECount(getMinusSCEV(End, Start), Stride, false);
+
+  APInt MinStart = IsSigned ? getSignedRange(Start).getSignedMin()
+                            : getUnsignedRange(Start).getUnsignedMin();
+
+  APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin()
+                             : getUnsignedRange(Stride).getUnsignedMin();
+
+  unsigned BitWidth = getTypeSizeInBits(LHS->getType());
+  APInt Limit = IsSigned ? APInt::getSignedMaxValue(BitWidth) - (MinStride - 1)
+                         : APInt::getMaxValue(BitWidth) - (MinStride - 1);
+
+  // Although End can be a MAX expression we estimate MaxEnd considering only
+  // the case End = RHS. This is safe because in the other case (End - Start)
+  // is zero, leading to a zero maximum backedge taken count.
+  APInt MaxEnd =
+    IsSigned ? APIntOps::smin(getSignedRange(RHS).getSignedMax(), Limit)
+             : APIntOps::umin(getUnsignedRange(RHS).getUnsignedMax(), Limit);
+
+  const SCEV *MaxBECount = getCouldNotCompute();
+  if (isa<SCEVConstant>(BECount))
+    MaxBECount = BECount;
+  else
+    MaxBECount = computeBECount(getConstant(MaxEnd - MinStart),
+                                getConstant(MinStride), false);
+
+  if (isa<SCEVCouldNotCompute>(MaxBECount))
+    MaxBECount = BECount;
+
+  return ExitLimit(BECount, MaxBECount);
+}
+
+ScalarEvolution::ExitLimit
+ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
+                                     const Loop *L, bool IsSigned,
+                                     bool IsSubExpr) {
+  // We handle only IV > Invariant
+  if (!isLoopInvariant(RHS, L))
+    return getCouldNotCompute();
+
+  const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
+
+  // Avoid weird loops
+  if (!IV || IV->getLoop() != L || !IV->isAffine())
+    return getCouldNotCompute();
+
+  bool NoWrap = !IsSubExpr &&
+                IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW);
+
+  const SCEV *Stride = getNegativeSCEV(IV->getStepRecurrence(*this));
+
+  // Avoid negative or zero stride values
+  if (!isKnownPositive(Stride))
+    return getCouldNotCompute();
+
+  // Avoid proven overflow cases: this will ensure that the backedge taken count
+  // will not generate any unsigned overflow. Relaxed no-overflow conditions
+  // exploit NoWrapFlags, allowing to optimize in presence of undefined 
+  // behaviors like the case of C language.
+  if (!Stride->isOne() && doesIVOverflowOnGT(RHS, Stride, IsSigned, NoWrap))
+    return getCouldNotCompute();
+
+  ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SGT
+                                      : ICmpInst::ICMP_UGT;
+
+  const SCEV *Start = IV->getStart();
+  const SCEV *End = RHS;
+  if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS))
+    End = IsSigned ? getSMinExpr(RHS, Start)
+                   : getUMinExpr(RHS, Start);
+
+  const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride, false);
+
+  APInt MaxStart = IsSigned ? getSignedRange(Start).getSignedMax()
+                            : getUnsignedRange(Start).getUnsignedMax();
+
+  APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin()
+                             : getUnsignedRange(Stride).getUnsignedMin();
+
+  unsigned BitWidth = getTypeSizeInBits(LHS->getType());
+  APInt Limit = IsSigned ? APInt::getSignedMinValue(BitWidth) + (MinStride - 1)
+                         : APInt::getMinValue(BitWidth) + (MinStride - 1);
+
+  // Although End can be a MIN expression we estimate MinEnd considering only
+  // the case End = RHS. This is safe because in the other case (Start - End)
+  // is zero, leading to a zero maximum backedge taken count.
+  APInt MinEnd =
+    IsSigned ? APIntOps::smax(getSignedRange(RHS).getSignedMin(), Limit)
+             : APIntOps::umax(getUnsignedRange(RHS).getUnsignedMin(), Limit);
+
+
+  const SCEV *MaxBECount = getCouldNotCompute();
+  if (isa<SCEVConstant>(BECount))
+    MaxBECount = BECount;
+  else
+    MaxBECount = computeBECount(getConstant(MaxStart - MinEnd), 
+                                getConstant(MinStride), false);
+
+  if (isa<SCEVCouldNotCompute>(MaxBECount))
+    MaxBECount = BECount;
+
+  return ExitLimit(BECount, MaxBECount);
 }
 
 /// getNumIterationsInRange - Return the number of iterations of this loop that
@@ -6627,7 +6677,534 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
   return SE.getCouldNotCompute();
 }
 
+static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) {
+  APInt A = C1->getValue()->getValue().abs();
+  APInt B = C2->getValue()->getValue().abs();
+  uint32_t ABW = A.getBitWidth();
+  uint32_t BBW = B.getBitWidth();
+
+  if (ABW > BBW)
+    B = B.zext(ABW);
+  else if (ABW < BBW)
+    A = A.zext(BBW);
+
+  return APIntOps::GreatestCommonDivisor(A, B);
+}
+
+static const APInt srem(const SCEVConstant *C1, const SCEVConstant *C2) {
+  APInt A = C1->getValue()->getValue();
+  APInt B = C2->getValue()->getValue();
+  uint32_t ABW = A.getBitWidth();
+  uint32_t BBW = B.getBitWidth();
+
+  if (ABW > BBW)
+    B = B.sext(ABW);
+  else if (ABW < BBW)
+    A = A.sext(BBW);
+
+  return APIntOps::srem(A, B);
+}
+
+static const APInt sdiv(const SCEVConstant *C1, const SCEVConstant *C2) {
+  APInt A = C1->getValue()->getValue();
+  APInt B = C2->getValue()->getValue();
+  uint32_t ABW = A.getBitWidth();
+  uint32_t BBW = B.getBitWidth();
+
+  if (ABW > BBW)
+    B = B.sext(ABW);
+  else if (ABW < BBW)
+    A = A.sext(BBW);
+
+  return APIntOps::sdiv(A, B);
+}
+
+namespace {
+struct SCEVGCD : public SCEVVisitor<SCEVGCD, const SCEV *> {
+public:
+  // Pattern match Step into Start. When Step is a multiply expression, find
+  // the largest subexpression of Step that appears in Start. When Start is an
+  // add expression, try to match Step in the subexpressions of Start, non
+  // matching subexpressions are returned under Remainder.
+  static const SCEV *findGCD(ScalarEvolution &SE, const SCEV *Start,
+                             const SCEV *Step, const SCEV **Remainder) {
+    assert(Remainder && "Remainder should not be NULL");
+    SCEVGCD R(SE, Step, SE.getConstant(Step->getType(), 0));
+    const SCEV *Res = R.visit(Start);
+    *Remainder = R.Remainder;
+    return Res;
+  }
+
+  SCEVGCD(ScalarEvolution &S, const SCEV *G, const SCEV *R)
+      : SE(S), GCD(G), Remainder(R) {
+    Zero = SE.getConstant(GCD->getType(), 0);
+    One = SE.getConstant(GCD->getType(), 1);
+  }
+
+  const SCEV *visitConstant(const SCEVConstant *Constant) {
+    if (GCD == Constant || Constant == Zero)
+      return GCD;
+
+    if (const SCEVConstant *CGCD = dyn_cast<SCEVConstant>(GCD)) {
+      const SCEV *Res = SE.getConstant(gcd(Constant, CGCD));
+      if (Res != One)
+        return Res;
+
+      Remainder = SE.getConstant(srem(Constant, CGCD));
+      Constant = cast<SCEVConstant>(SE.getMinusSCEV(Constant, Remainder));
+      Res = SE.getConstant(gcd(Constant, CGCD));
+      return Res;
+    }
+
+    // When GCD is not a constant, it could be that the GCD is an Add, Mul,
+    // AddRec, etc., in which case we want to find out how many times the
+    // Constant divides the GCD: we then return that as the new GCD.
+    const SCEV *Rem = Zero;
+    const SCEV *Res = findGCD(SE, GCD, Constant, &Rem);
+
+    if (Res == One || Rem != Zero) {
+      Remainder = Constant;
+      return One;
+    }
+
+    assert(isa<SCEVConstant>(Res) && "Res should be a constant");
+    Remainder = SE.getConstant(srem(Constant, cast<SCEVConstant>(Res)));
+    return Res;
+  }
+
+  const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+    if (GCD == Expr)
+      return GCD;
+
+    for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+      const SCEV *Rem = Zero;
+      const SCEV *Res = findGCD(SE, Expr->getOperand(e - 1 - i), GCD, &Rem);
+
+      // FIXME: There may be ambiguous situations: for instance,
+      // GCD(-4 + (3 * %m), 2 * %m) where 2 divides -4 and %m divides (3 * %m).
+      // The order in which the AddExpr is traversed computes a different GCD
+      // and Remainder.
+      if (Res != One)
+        GCD = Res;
+      if (Rem != Zero)
+        Remainder = SE.getAddExpr(Remainder, Rem);
+    }
+
+    return GCD;
+  }
+
+  const SCEV *visitMulExpr(const SCEVMulExpr *Expr) {
+    if (GCD == Expr)
+      return GCD;
+
+    for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+      if (Expr->getOperand(i) == GCD)
+        return GCD;
+    }
+
+    // If we have not returned yet, it means that GCD is not part of Expr.
+    const SCEV *PartialGCD = One;
+    for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+      const SCEV *Rem = Zero;
+      const SCEV *Res = findGCD(SE, Expr->getOperand(i), GCD, &Rem);
+      if (Rem != Zero)
+        // GCD does not divide Expr->getOperand(i).
+        continue;
+
+      if (Res == GCD)
+        return GCD;
+      PartialGCD = SE.getMulExpr(PartialGCD, Res);
+      if (PartialGCD == GCD)
+        return GCD;
+    }
+
+    if (PartialGCD != One)
+      return PartialGCD;
+
+    Remainder = Expr;
+    const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(GCD);
+    if (!Mul)
+      return PartialGCD;
+
+    // When the GCD is a multiply expression, try to decompose it:
+    // this occurs when Step does not divide the Start expression
+    // as in: {(-4 + (3 * %m)),+,(2 * %m)}
+    for (int i = 0, e = Mul->getNumOperands(); i < e; ++i) {
+      const SCEV *Rem = Zero;
+      const SCEV *Res = findGCD(SE, Expr, Mul->getOperand(i), &Rem);
+      if (Rem == Zero) {
+        Remainder = Rem;
+        return Res;
+      }
+    }
+
+    return PartialGCD;
+  }
+
+  const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+    if (GCD == Expr)
+      return GCD;
+
+    if (!Expr->isAffine()) {
+      Remainder = Expr;
+      return GCD;
+    }
+
+    const SCEV *Rem = Zero;
+    const SCEV *Res = findGCD(SE, Expr->getOperand(0), GCD, &Rem);
+    if (Rem != Zero)
+      Remainder = SE.getAddExpr(Remainder, Rem);
+
+    Rem = Zero;
+    Res = findGCD(SE, Expr->getOperand(1), Res, &Rem);
+    if (Rem != Zero) {
+      Remainder = Expr;
+      return GCD;
+    }
+
+    return Res;
+  }
+
+  const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
+    return One;
+  }
+
+private:
+  ScalarEvolution &SE;
+  const SCEV *GCD, *Remainder, *Zero, *One;
+};
+
+struct SCEVDivision : public SCEVVisitor<SCEVDivision, const SCEV *> {
+public:
+  // Remove from Start all multiples of Step.
+  static const SCEV *divide(ScalarEvolution &SE, const SCEV *Start,
+                            const SCEV *Step) {
+    SCEVDivision D(SE, Step);
+    const SCEV *Rem = D.Zero;
+    (void)Rem;
+    // The division is guaranteed to succeed: Step should divide Start with no
+    // remainder.
+    assert(Step == SCEVGCD::findGCD(SE, Start, Step, &Rem) && Rem == D.Zero &&
+           "Step should divide Start with no remainder.");
+    return D.visit(Start);
+  }
+
+  SCEVDivision(ScalarEvolution &S, const SCEV *G) : SE(S), GCD(G) {
+    Zero = SE.getConstant(GCD->getType(), 0);
+    One = SE.getConstant(GCD->getType(), 1);
+  }
+
+  const SCEV *visitConstant(const SCEVConstant *Constant) {
+    if (GCD == Constant)
+      return One;
+
+    if (const SCEVConstant *CGCD = dyn_cast<SCEVConstant>(GCD))
+      return SE.getConstant(sdiv(Constant, CGCD));
+    return Constant;
+  }
+
+  const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+
+    SmallVector<const SCEV *, 2> Operands;
+    for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
+      Operands.push_back(divide(SE, Expr->getOperand(i), GCD));
+
+    if (Operands.size() == 1)
+      return Operands[0];
+    return SE.getAddExpr(Operands);
+  }
+
+  const SCEV *visitMulExpr(const SCEVMulExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+
+    bool FoundGCDTerm = false;
+    for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
+      if (Expr->getOperand(i) == GCD)
+        FoundGCDTerm = true;
+
+    SmallVector<const SCEV *, 2> Operands;
+    if (FoundGCDTerm) {
+      FoundGCDTerm = false;
+      for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+        if (FoundGCDTerm)
+          Operands.push_back(Expr->getOperand(i));
+        else if (Expr->getOperand(i) == GCD)
+          FoundGCDTerm = true;
+        else
+          Operands.push_back(Expr->getOperand(i));
+      }
+    } else {
+      FoundGCDTerm = false;
+      const SCEV *PartialGCD = One;
+      for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+        if (PartialGCD == GCD) {
+          Operands.push_back(Expr->getOperand(i));
+          continue;
+        }
+
+        const SCEV *Rem = Zero;
+        const SCEV *Res = SCEVGCD::findGCD(SE, Expr->getOperand(i), GCD, &Rem);
+        if (Rem == Zero) {
+          PartialGCD = SE.getMulExpr(PartialGCD, Res);
+          Operands.push_back(divide(SE, Expr->getOperand(i), GCD));
+        } else {
+          Operands.push_back(Expr->getOperand(i));
+        }
+      }
+    }
+
+    if (Operands.size() == 1)
+      return Operands[0];
+    return SE.getMulExpr(Operands);
+  }
+
+  const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+
+    assert(Expr->isAffine() && "Expr should be affine");
+
+    const SCEV *Start = divide(SE, Expr->getStart(), GCD);
+    const SCEV *Step = divide(SE, Expr->getStepRecurrence(SE), GCD);
+
+    return SE.getAddRecExpr(Start, Step, Expr->getLoop(),
+                            Expr->getNoWrapFlags());
+  }
+
+  const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
+    return Expr;
+  }
+
+private:
+  ScalarEvolution &SE;
+  const SCEV *GCD, *Zero, *One;
+};
+}
+
+/// Splits the SCEV into two vectors of SCEVs representing the subscripts and
+/// sizes of an array access. Returns the remainder of the delinearization that
+/// is the offset start of the array.  The SCEV->delinearize algorithm computes
+/// the multiples of SCEV coefficients: that is a pattern matching of sub
+/// expressions in the stride and base of a SCEV corresponding to the
+/// computation of a GCD (greatest common divisor) of base and stride.  When
+/// SCEV->delinearize fails, it returns the SCEV unchanged.
+///
+/// For example: when analyzing the memory access A[i][j][k] in this loop nest
+///
+///  void foo(long n, long m, long o, double A[n][m][o]) {
+///
+///    for (long i = 0; i < n; i++)
+///      for (long j = 0; j < m; j++)
+///        for (long k = 0; k < o; k++)
+///          A[i][j][k] = 1.0;
+///  }
+///
+/// the delinearization input is the following AddRec SCEV:
+///
+///  AddRec: {{{%A,+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
+///
+/// From this SCEV, we are able to say that the base offset of the access is %A
+/// because it appears as an offset that does not divide any of the strides in
+/// the loops:
+///
+///  CHECK: Base offset: %A
+///
+/// and then SCEV->delinearize determines the size of some of the dimensions of
+/// the array as these are the multiples by which the strides are happening:
+///
+///  CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes.
+///
+/// Note that the outermost dimension remains of UnknownSize because there are
+/// no strides that would help identifying the size of the last dimension: when
+/// the array has been statically allocated, one could compute the size of that
+/// dimension by dividing the overall size of the array by the size of the known
+/// dimensions: %m * %o * 8.
+///
+/// Finally delinearize provides the access functions for the array reference
+/// that does correspond to A[i][j][k] of the above C testcase:
+///
+///  CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>]
+///
+/// The testcases are checking the output of a function pass:
+/// DelinearizationPass that walks through all loads and stores of a function
+/// asking for the SCEV of the memory access with respect to all enclosing
+/// loops, calling SCEV->delinearize on that and printing the results.
 
+const SCEV *
+SCEVAddRecExpr::delinearize(ScalarEvolution &SE,
+                            SmallVectorImpl<const SCEV *> &Subscripts,
+                            SmallVectorImpl<const SCEV *> &Sizes) const {
+  // Early exit in case this SCEV is not an affine multivariate function.
+  if (!this->isAffine())
+    return this;
+
+  const SCEV *Start = this->getStart();
+  const SCEV *Step = this->getStepRecurrence(SE);
+
+  // Build the SCEV representation of the cannonical induction variable in the
+  // loop of this SCEV.
+  const SCEV *Zero = SE.getConstant(this->getType(), 0);
+  const SCEV *One = SE.getConstant(this->getType(), 1);
+  const SCEV *IV =
+      SE.getAddRecExpr(Zero, One, this->getLoop(), this->getNoWrapFlags());
+
+  DEBUG(dbgs() << "(delinearize: " << *this << "\n");
+
+  // Currently we fail to delinearize when the stride of this SCEV is 1. We
+  // could decide to not fail in this case: we could just return 1 for the size
+  // of the subscript, and this same SCEV for the access function.
+  if (Step == One) {
+    DEBUG(dbgs() << "failed to delinearize " << *this << "\n)\n");
+    return this;
+  }
+
+  // Find the GCD and Remainder of the Start and Step coefficients of this SCEV.
+  const SCEV *Remainder = NULL;
+  const SCEV *GCD = SCEVGCD::findGCD(SE, Start, Step, &Remainder);
+
+  DEBUG(dbgs() << "GCD: " << *GCD << "\n");
+  DEBUG(dbgs() << "Remainder: " << *Remainder << "\n");
+
+  // Same remark as above: we currently fail the delinearization, although we
+  // can very well handle this special case.
+  if (GCD == One) {
+    DEBUG(dbgs() << "failed to delinearize " << *this << "\n)\n");
+    return this;
+  }
+
+  // As findGCD computed Remainder, GCD divides "Start - Remainder." The
+  // Quotient is then this SCEV without Remainder, scaled down by the GCD.  The
+  // Quotient is what will be used in the next subscript delinearization.
+  const SCEV *Quotient =
+      SCEVDivision::divide(SE, SE.getMinusSCEV(Start, Remainder), GCD);
+  DEBUG(dbgs() << "Quotient: " << *Quotient << "\n");
+
+  const SCEV *Rem;
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Quotient))
+    // Recursively call delinearize on the Quotient until there are no more
+    // multiples that can be recognized.
+    Rem = AR->delinearize(SE, Subscripts, Sizes);
+  else
+    Rem = Quotient;
+
+  // Scale up the cannonical induction variable IV by whatever remains from the
+  // Step after division by the GCD: the GCD is the size of all the sub-array.
+  if (Step != GCD) {
+    Step = SCEVDivision::divide(SE, Step, GCD);
+    IV = SE.getMulExpr(IV, Step);
+  }
+  // The access function in the current subscript is computed as the cannonical
+  // induction variable IV (potentially scaled up by the step) and offset by
+  // Rem, the offset of delinearization in the sub-array.
+  const SCEV *Index = SE.getAddExpr(IV, Rem);
+
+  // Record the access function and the size of the current subscript.
+  Subscripts.push_back(Index);
+  Sizes.push_back(GCD);
+
+#ifndef NDEBUG
+  int Size = Sizes.size();
+  DEBUG(dbgs() << "succeeded to delinearize " << *this << "\n");
+  DEBUG(dbgs() << "ArrayDecl[UnknownSize]");
+  for (int i = 0; i < Size - 1; i++)
+    DEBUG(dbgs() << "[" << *Sizes[i] << "]");
+  DEBUG(dbgs() << " with elements of " << *Sizes[Size - 1] << " bytes.\n");
+
+  DEBUG(dbgs() << "ArrayRef");
+  for (int i = 0; i < Size; i++)
+    DEBUG(dbgs() << "[" << *Subscripts[i] << "]");
+  DEBUG(dbgs() << "\n)\n");
+#endif
+
+  return Remainder;
+}
 
 //===----------------------------------------------------------------------===//
 //                   SCEVCallbackVH Class Implementation
@@ -6683,7 +7260,7 @@ ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se)
 //===----------------------------------------------------------------------===//
 
 ScalarEvolution::ScalarEvolution()
-  : FunctionPass(ID), FirstUnknown(0) {
+  : FunctionPass(ID), ValuesAtScopes(64), LoopDispositions(64), BlockDispositions(64), FirstUnknown(0) {
   initializeScalarEvolutionPass(*PassRegistry::getPassRegistry());
 }
 
@@ -6821,14 +7398,21 @@ void ScalarEvolution::print(raw_ostream &OS, const Module *) const {
 
 ScalarEvolution::LoopDisposition
 ScalarEvolution::getLoopDisposition(const SCEV *S, const Loop *L) {
-  std::map<const Loop *, LoopDisposition> &Values = LoopDispositions[S];
-  std::pair<std::map<const Loop *, LoopDisposition>::iterator, bool> Pair =
-    Values.insert(std::make_pair(L, LoopVariant));
-  if (!Pair.second)
-    return Pair.first->second;
-
+  SmallVector<std::pair<const Loop *, LoopDisposition>, 2> &Values = LoopDispositions[S];
+  for (unsigned u = 0; u < Values.size(); u++) {
+    if (Values[u].first == L)
+      return Values[u].second;
+  }
+  Values.push_back(std::make_pair(L, LoopVariant));
   LoopDisposition D = computeLoopDisposition(S, L);
-  return LoopDispositions[S][L] = D;
+  SmallVector<std::pair<const Loop *, LoopDisposition>, 2> &Values2 = LoopDispositions[S];
+  for (unsigned u = Values2.size(); u > 0; u--) {
+    if (Values2[u - 1].first == L) {
+      Values2[u - 1].second = D;
+      break;
+    }
+  }
+  return D;
 }
 
 ScalarEvolution::LoopDisposition
@@ -6920,14 +7504,21 @@ bool ScalarEvolution::hasComputableLoopEvolution(const SCEV *S, const Loop *L) {
 
 ScalarEvolution::BlockDisposition
 ScalarEvolution::getBlockDisposition(const SCEV *S, const BasicBlock *BB) {
-  std::map<const BasicBlock *, BlockDisposition> &Values = BlockDispositions[S];
-  std::pair<std::map<const BasicBlock *, BlockDisposition>::iterator, bool>
-    Pair = Values.insert(std::make_pair(BB, DoesNotDominateBlock));
-  if (!Pair.second)
-    return Pair.first->second;
-
+  SmallVector<std::pair<const BasicBlock *, BlockDisposition>, 2> &Values = BlockDispositions[S];
+  for (unsigned u = 0; u < Values.size(); u++) {
+    if (Values[u].first == BB)
+      return Values[u].second;
+  }
+  Values.push_back(std::make_pair(BB, DoesNotDominateBlock));
   BlockDisposition D = computeBlockDisposition(S, BB);
-  return BlockDispositions[S][BB] = D;
+  SmallVector<std::pair<const BasicBlock *, BlockDisposition>, 2> &Values2 = BlockDispositions[S];
+  for (unsigned u = Values2.size(); u > 0; u--) {
+    if (Values2[u - 1].first == BB) {
+      Values2[u - 1].second = D;
+      break;
+    }
+  }
+  return D;
 }
 
 ScalarEvolution::BlockDisposition
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index c434b40..86a557b 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -176,8 +177,8 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
   }
 
   // Save the original insertion point so we can restore it when we're done.
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  DebugLoc Loc = Builder.GetInsertPoint()->getDebugLoc();
+  BuilderType::InsertPointGuard Guard(Builder);
 
   // Move the insertion point out of as many loops as we can.
   while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
@@ -191,13 +192,9 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
 
   // If we haven't found this binop, insert it.
   Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS));
-  BO->setDebugLoc(SaveInsertPt->getDebugLoc());
+  BO->setDebugLoc(Loc);
   rememberInstruction(BO);
 
-  // Restore the original insert point.
-  if (SaveInsertBB)
-    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
-
   return BO;
 }
 
@@ -406,6 +403,10 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
   // without the other.
   SplitAddRecs(Ops, Ty, SE);
 
+  Type *IntPtrTy = SE.TD
+                 ? SE.TD->getIntPtrType(PTy)
+                 : Type::getInt64Ty(PTy->getContext());
+
   // Descend down the pointer's type and attempt to convert the other
   // operands into GEP indices, at each level. The first index in a GEP
   // indexes into the array implied by the pointer operand; the rest of
@@ -416,7 +417,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
     // array indexing.
     SmallVector<const SCEV *, 8> ScaledOps;
     if (ElTy->isSized()) {
-      const SCEV *ElSize = SE.getSizeOfExpr(ElTy);
+      const SCEV *ElSize = SE.getSizeOfExpr(IntPtrTy, ElTy);
       if (!ElSize->isZero()) {
         SmallVector<const SCEV *, 8> NewOps;
         for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
@@ -548,8 +549,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
     }
 
     // Save the original insertion point so we can restore it when we're done.
-    BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-    BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+    BuilderType::InsertPointGuard Guard(Builder);
 
     // Move the insertion point out of as many loops as we can.
     while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
@@ -565,16 +565,11 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
     Value *GEP = Builder.CreateGEP(V, Idx, "uglygep");
     rememberInstruction(GEP);
 
-    // Restore the original insert point.
-    if (SaveInsertBB)
-      restoreInsertPoint(SaveInsertBB, SaveInsertPt);
-
     return GEP;
   }
 
   // Save the original insertion point so we can restore it when we're done.
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  BuilderType::InsertPoint SaveInsertPt = Builder.saveIP();
 
   // Move the insertion point out of as many loops as we can.
   while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
@@ -610,8 +605,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
   rememberInstruction(GEP);
 
   // Restore the original insert point.
-  if (SaveInsertBB)
-    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+  Builder.restoreIP(SaveInsertPt);
 
   return expand(SE.getAddExpr(Ops));
 }
@@ -1076,8 +1070,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
   }
 
   // Save the original insertion point so we can restore it when we're done.
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  BuilderType::InsertPointGuard Guard(Builder);
 
   // Another AddRec may need to be recursively expanded below. For example, if
   // this AddRec is quadratic, the StepV may itself be an AddRec in this
@@ -1144,10 +1137,6 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
     PN->addIncoming(IncV, Pred);
   }
 
-  // Restore the original insert point.
-  if (SaveInsertBB)
-    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
-
   // After expanding subexpressions, restore the PostIncLoops set so the caller
   // can ensure that IVIncrement dominates the current uses.
   PostIncLoops = SavedPostIncLoops;
@@ -1232,19 +1221,19 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
         !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
       if (useSubtract)
         Step = SE.getNegativeSCEV(Step);
-      // Expand the step somewhere that dominates the loop header.
-      BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-      BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
-      Value *StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin());
-      // Restore the insertion point to the place where the caller has
-      // determined dominates all uses.
-      restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+      Value *StepV;
+      {
+        // Expand the step somewhere that dominates the loop header.
+        BuilderType::InsertPointGuard Guard(Builder);
+        StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin());
+      }
       Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
     }
   }
 
   // Re-apply any non-loop-dominating scale.
   if (PostLoopScale) {
+    assert(S->isAffine() && "Can't linearly scale non-affine recurrences.");
     Result = InsertNoopCastOfTo(Result, IntTy);
     Result = Builder.CreateMul(Result,
                                expandCodeFor(PostLoopScale, IntTy));
@@ -1289,16 +1278,14 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
       NewOps[i] = SE.getAnyExtendExpr(S->op_begin()[i], CanonicalIV->getType());
     Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(),
                                        S->getNoWrapFlags(SCEV::FlagNW)));
-    BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-    BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
     BasicBlock::iterator NewInsertPt =
       llvm::next(BasicBlock::iterator(cast<Instruction>(V)));
+    BuilderType::InsertPointGuard Guard(Builder);
     while (isa<PHINode>(NewInsertPt) || isa<DbgInfoIntrinsic>(NewInsertPt) ||
            isa<LandingPadInst>(NewInsertPt))
       ++NewInsertPt;
     V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), 0,
                       NewInsertPt);
-    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
     return V;
   }
 
@@ -1342,9 +1329,13 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
                                   Header->begin());
     rememberInstruction(CanonicalIV);
 
+    SmallSet<BasicBlock *, 4> PredSeen;
     Constant *One = ConstantInt::get(Ty, 1);
     for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
       BasicBlock *HP = *HPI;
+      if (!PredSeen.insert(HP))
+        continue;
+
       if (L->contains(HP)) {
         // Insert a unit add instruction right before the terminator
         // corresponding to the back-edge.
@@ -1527,8 +1518,7 @@ Value *SCEVExpander::expand(const SCEV *S) {
   if (I != InsertedExpressions.end())
     return I->second;
 
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  BuilderType::InsertPointGuard Guard(Builder);
   Builder.SetInsertPoint(InsertPt->getParent(), InsertPt);
 
   // Expand the expression into instructions.
@@ -1541,8 +1531,6 @@ Value *SCEVExpander::expand(const SCEV *S) {
   // a postinc expansion, it could be reused by a non postinc user, but only if
   // its insertion point was already at the head of the loop.
   InsertedExpressions[std::make_pair(S, InsertPt)] = V;
-
-  restoreInsertPoint(SaveInsertBB, SaveInsertPt);
   return V;
 }
 
@@ -1553,10 +1541,6 @@ void SCEVExpander::rememberInstruction(Value *I) {
     InsertedValues.insert(I);
 }
 
-void SCEVExpander::restoreInsertPoint(BasicBlock *BB, BasicBlock::iterator I) {
-  Builder.SetInsertPoint(BB, I);
-}
-
 /// getOrInsertCanonicalInductionVariable - This method returns the
 /// canonical induction variable of the specified type for the specified
 /// loop (inserting one if there is none).  A canonical induction variable
@@ -1572,11 +1556,8 @@ SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
                                    SE.getConstant(Ty, 1), L, SCEV::FlagAnyWrap);
 
   // Emit code for it.
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  BuilderType::InsertPointGuard Guard(Builder);
   PHINode *V = cast<PHINode>(expandCodeFor(H, 0, L->getHeader()->begin()));
-  if (SaveInsertBB)
-    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
 
   return V;
 }
@@ -1724,28 +1705,43 @@ namespace {
 // Currently, we only allow division by a nonzero constant here. If this is
 // inadequate, we could easily allow division by SCEVUnknown by using
 // ValueTracking to check isKnownNonZero().
+//
+// We cannot generally expand recurrences unless the step dominates the loop
+// header. The expander handles the special case of affine recurrences by
+// scaling the recurrence outside the loop, but this technique isn't generally
+// applicable. Expanding a nested recurrence outside a loop requires computing
+// binomial coefficients. This could be done, but the recurrence has to be in a
+// perfectly reduced form, which can't be guaranteed.
 struct SCEVFindUnsafe {
+  ScalarEvolution &SE;
   bool IsUnsafe;
 
-  SCEVFindUnsafe(): IsUnsafe(false) {}
+  SCEVFindUnsafe(ScalarEvolution &se): SE(se), IsUnsafe(false) {}
 
   bool follow(const SCEV *S) {
-    const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S);
-    if (!D)
-      return true;
-    const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
-    if (SC && !SC->getValue()->isZero())
-      return true;
-    IsUnsafe = true;
-    return false;
+    if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+      const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
+      if (!SC || SC->getValue()->isZero()) {
+        IsUnsafe = true;
+        return false;
+      }
+    }
+    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+      const SCEV *Step = AR->getStepRecurrence(SE);
+      if (!AR->isAffine() && !SE.dominates(Step, AR->getLoop()->getHeader())) {
+        IsUnsafe = true;
+        return false;
+      }
+    }
+    return true;
   }
   bool isDone() const { return IsUnsafe; }
 };
 }
 
 namespace llvm {
-bool isSafeToExpand(const SCEV *S) {
-  SCEVFindUnsafe Search;
+bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE) {
+  SCEVFindUnsafe Search(SE);
   visitAll(S, Search);
   return !Search.IsUnsafe;
 }
diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp
index dd2ed4f..f110616 100644
--- a/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -119,11 +119,19 @@ TransformImpl(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
     const SCEV *Result = SE.getAddRecExpr(Operands, L, SCEV::FlagAnyWrap);
     switch (Kind) {
     case NormalizeAutodetect:
-      if (IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) {
-        const SCEV *TransformedStep =
-          TransformSubExpr(AR->getStepRecurrence(SE),
-                           User, OperandValToReplace);
-        Result = SE.getMinusSCEV(Result, TransformedStep);
+      // Normalize this SCEV by subtracting the expression for the final step.
+      // We only allow affine AddRecs to be normalized, otherwise we would not
+      // be able to correctly denormalize.
+      // e.g. {1,+,3,+,2} == {-2,+,1,+,2} + {3,+,2}
+      // Normalized form:   {-2,+,1,+,2}
+      // Denormalized form: {1,+,3,+,2}
+      //
+      // However, denormalization would use the a different step expression than
+      // normalization (see getPostIncExpr), generating the wrong final
+      // expression: {-2,+,1,+,2} + {1,+,2} => {-1,+,3,+,2}
+      if (AR->isAffine() &&
+          IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) {
+        Result = SE.getMinusSCEV(Result, AR->getStepRecurrence(SE));
         Loops.insert(L);
       }
 #if 0
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 4ad7162..0353295 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -96,6 +96,11 @@ bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
   return PrevTTI->isLoweredToCall(F);
 }
 
+void TargetTransformInfo::getUnrollingPreferences(Loop *L,
+                            UnrollingPreferences &UP) const {
+  PrevTTI->getUnrollingPreferences(L, UP);
+}
+
 bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
   return PrevTTI->isLegalAddImmediate(Imm);
 }
@@ -145,6 +150,10 @@ TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
   return PrevTTI->getPopcntSupport(IntTyWidthInBit);
 }
 
+bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
+  return PrevTTI->haveFastSqrt(Ty);
+}
+
 unsigned TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const {
   return PrevTTI->getIntImmCost(Imm, Ty);
 }
@@ -215,6 +224,11 @@ unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp,
   return PrevTTI->getAddressComputationCost(Tp, IsComplex);
 }
 
+unsigned TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty,
+                                               bool IsPairwise) const {
+  return PrevTTI->getReductionCost(Opcode, Ty, IsPairwise);
+}
+
 namespace {
 
 struct NoTTI : ImmutablePass, TargetTransformInfo {
@@ -265,26 +279,34 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
       // Otherwise, the default basic cost is used.
       return TCC_Basic;
 
-    case Instruction::IntToPtr:
+    case Instruction::IntToPtr: {
+      if (!DL)
+        return TCC_Basic;
+
       // An inttoptr cast is free so long as the input is a legal integer type
       // which doesn't contain values outside the range of a pointer.
-      if (DL && DL->isLegalInteger(OpTy->getScalarSizeInBits()) &&
-          OpTy->getScalarSizeInBits() <= DL->getPointerSizeInBits())
+      unsigned OpSize = OpTy->getScalarSizeInBits();
+      if (DL->isLegalInteger(OpSize) &&
+          OpSize <= DL->getPointerTypeSizeInBits(Ty))
         return TCC_Free;
 
       // Otherwise it's not a no-op.
       return TCC_Basic;
+    }
+    case Instruction::PtrToInt: {
+      if (!DL)
+        return TCC_Basic;
 
-    case Instruction::PtrToInt:
       // A ptrtoint cast is free so long as the result is large enough to store
       // the pointer, and a legal integer type.
-      if (DL && DL->isLegalInteger(Ty->getScalarSizeInBits()) &&
-          Ty->getScalarSizeInBits() >= DL->getPointerSizeInBits())
+      unsigned DestSize = Ty->getScalarSizeInBits();
+      if (DL->isLegalInteger(DestSize) &&
+          DestSize >= DL->getPointerTypeSizeInBits(OpTy))
         return TCC_Free;
 
       // Otherwise it's not a no-op.
       return TCC_Basic;
-
+    }
     case Instruction::Trunc:
       // trunc to a native type is free (assuming the target has compare and
       // shift-right of the same width).
@@ -457,6 +479,8 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
     return true;
   }
 
+  void getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+
   bool isLegalAddImmediate(int64_t Imm) const {
     return false;
   }
@@ -505,6 +529,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
     return PSK_Software;
   }
 
+  bool haveFastSqrt(Type *Ty) const {
+    return false;
+  }
+
   unsigned getIntImmCost(const APInt &Imm, Type *Ty) const {
     return 1;
   }
@@ -569,6 +597,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
   unsigned getAddressComputationCost(Type *Tp, bool) const {
     return 0;
   }
+
+  unsigned getReductionCost(unsigned, Type *, bool) const {
+    return 1;
+  }
 };
 
 } // end anonymous namespace
diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index bbf3c3a..6791d4b 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -16,7 +16,12 @@
 // typical C/C++ TBAA, but it can also be used to implement custom alias
 // analysis behavior for other languages.
 //
-// The current metadata format is very simple. TBAA MDNodes have up to
+// We now support two types of metadata format: scalar TBAA and struct-path
+// aware TBAA. After all testing cases are upgraded to use struct-path aware
+// TBAA and we can auto-upgrade existing bc files, the support for scalar TBAA
+// can be dropped.
+//
+// The scalar TBAA metadata format is very simple. TBAA MDNodes have up to
 // three fields, e.g.:
 //   !0 = metadata !{ metadata !"an example type tree" }
 //   !1 = metadata !{ metadata !"int", metadata !0 }
@@ -40,6 +45,65 @@
 // should return true; see
 // http://llvm.org/docs/AliasAnalysis.html#OtherItfs).
 //
+// With struct-path aware TBAA, the MDNodes attached to an instruction using
+// "!tbaa" are called path tag nodes.
+//
+// The path tag node has 4 fields with the last field being optional.
+//
+// The first field is the base type node, it can be a struct type node
+// or a scalar type node. The second field is the access type node, it
+// must be a scalar type node. The third field is the offset into the base type.
+// The last field has the same meaning as the last field of our scalar TBAA:
+// it's an integer which if equal to 1 indicates that the access is "constant".
+//
+// The struct type node has a name and a list of pairs, one pair for each member
+// of the struct. The first element of each pair is a type node (a struct type
+// node or a sclar type node), specifying the type of the member, the second
+// element of each pair is the offset of the member.
+//
+// Given an example
+// typedef struct {
+//   short s;
+// } A;
+// typedef struct {
+//   uint16_t s;
+//   A a;
+// } B;
+//
+// For an acess to B.a.s, we attach !5 (a path tag node) to the load/store
+// instruction. The base type is !4 (struct B), the access type is !2 (scalar
+// type short) and the offset is 4.
+//
+// !0 = metadata !{metadata !"Simple C/C++ TBAA"}
+// !1 = metadata !{metadata !"omnipotent char", metadata !0} // Scalar type node
+// !2 = metadata !{metadata !"short", metadata !1}           // Scalar type node
+// !3 = metadata !{metadata !"A", metadata !2, i64 0}        // Struct type node
+// !4 = metadata !{metadata !"B", metadata !2, i64 0, metadata !3, i64 4}
+//                                                           // Struct type node
+// !5 = metadata !{metadata !4, metadata !2, i64 4}          // Path tag node
+//
+// The struct type nodes and the scalar type nodes form a type DAG.
+//         Root (!0)
+//         char (!1)  -- edge to Root
+//         short (!2) -- edge to char
+//         A (!3) -- edge with offset 0 to short
+//         B (!4) -- edge with offset 0 to short and edge with offset 4 to A
+//
+// To check if two tags (tagX and tagY) can alias, we start from the base type
+// of tagX, follow the edge with the correct offset in the type DAG and adjust
+// the offset until we reach the base type of tagY or until we reach the Root
+// node.
+// If we reach the base type of tagY, compare the adjusted offset with
+// offset of tagY, return Alias if the offsets are the same, return NoAlias
+// otherwise.
+// If we reach the Root node, perform the above starting from base type of tagY
+// to see if we reach base type of tagX.
+//
+// If they have different roots, they're part of different potentially
+// unrelated type systems, so we return Alias to be conservative.
+// If neither node is an ancestor of the other and they have the same root,
+// then we say NoAlias.
+//
 // TODO: The current metadata format doesn't support struct
 // fields. For example:
 //   struct X {
@@ -71,7 +135,6 @@ using namespace llvm;
 // achieved by stripping the !tbaa tags from IR, but this option is sometimes
 // more convenient.
 static cl::opt<bool> EnableTBAA("enable-tbaa", cl::init(true));
-static cl::opt<bool> EnableStructPathTBAA("struct-path-tbaa", cl::init(false));
 
 namespace {
   /// TBAANode - This is a simple wrapper around an MDNode which provides a
@@ -168,8 +231,12 @@ namespace {
       if (Node->getNumOperands() < 2)
         return TBAAStructTypeNode();
 
-      // Special handling for a scalar type node. 
+      // Fast path for a scalar type node and a struct type node with a single
+      // field.
       if (Node->getNumOperands() <= 3) {
+        uint64_t Cur = Node->getNumOperands() == 2 ? 0 :
+                       cast<ConstantInt>(Node->getOperand(2))->getZExtValue();
+        Offset -= Cur;
         MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1));
         if (!P)
           return TBAAStructTypeNode();
@@ -259,12 +326,21 @@ TypeBasedAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AliasAnalysis::getAnalysisUsage(AU);
 }
 
+/// Check the first operand of the tbaa tag node, if it is a MDNode, we treat
+/// it as struct-path aware TBAA format, otherwise, we treat it as scalar TBAA
+/// format.
+static bool isStructPathTBAA(const MDNode *MD) {
+  // Anonymous TBAA root starts with a MDNode and dragonegg uses it as
+  // a TBAA tag.
+  return isa<MDNode>(MD->getOperand(0)) && MD->getNumOperands() >= 3;
+}
+
 /// Aliases - Test whether the type represented by A may alias the
 /// type represented by B.
 bool
 TypeBasedAliasAnalysis::Aliases(const MDNode *A,
                                 const MDNode *B) const {
-  if (EnableStructPathTBAA)
+  if (isStructPathTBAA(A))
     return PathAliases(A, B);
 
   // Keep track of the root node for A and B.
@@ -397,8 +473,8 @@ bool TypeBasedAliasAnalysis::pointsToConstantMemory(const Location &Loc,
 
   // If this is an "immutable" type, we can assume the pointer is pointing
   // to constant memory.
-  if ((!EnableStructPathTBAA && TBAANode(M).TypeIsImmutable()) ||
-      (EnableStructPathTBAA && TBAAStructTagNode(M).TypeIsImmutable()))
+  if ((!isStructPathTBAA(M) && TBAANode(M).TypeIsImmutable()) ||
+      (isStructPathTBAA(M) && TBAAStructTagNode(M).TypeIsImmutable()))
     return true;
 
   return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
@@ -414,8 +490,8 @@ TypeBasedAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
   // If this is an "immutable" type, we can assume the call doesn't write
   // to memory.
   if (const MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
-    if ((!EnableStructPathTBAA && TBAANode(M).TypeIsImmutable()) ||
-        (EnableStructPathTBAA && TBAAStructTagNode(M).TypeIsImmutable()))
+    if ((!isStructPathTBAA(M) && TBAANode(M).TypeIsImmutable()) ||
+        (isStructPathTBAA(M) && TBAAStructTagNode(M).TypeIsImmutable()))
       Min = OnlyReadsMemory;
 
   return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min);
@@ -458,6 +534,25 @@ TypeBasedAliasAnalysis::getModRefInfo(ImmutableCallSite CS1,
   return AliasAnalysis::getModRefInfo(CS1, CS2);
 }
 
+bool MDNode::isTBAAVtableAccess() const {
+  if (!isStructPathTBAA(this)) {
+    if (getNumOperands() < 1) return false;
+    if (MDString *Tag1 = dyn_cast<MDString>(getOperand(0))) {
+      if (Tag1->getString() == "vtable pointer") return true;
+    }
+    return false;
+  }
+
+  // For struct-path aware TBAA, we use the access type of the tag.
+  if (getNumOperands() < 2) return false;
+  MDNode *Tag = cast_or_null<MDNode>(getOperand(1));
+  if (!Tag) return false;
+  if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) {
+    if (Tag1->getString() == "vtable pointer") return true;
+  }
+  return false;  
+}
+
 MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
   if (!A || !B)
     return NULL;
@@ -466,7 +561,8 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
     return A;
 
   // For struct-path aware TBAA, we use the access type of the tag.
-  if (EnableStructPathTBAA) {
+  bool StructPath = isStructPathTBAA(A);
+  if (StructPath) {
     A = cast_or_null<MDNode>(A->getOperand(1));
     if (!A) return 0;
     B = cast_or_null<MDNode>(B->getOperand(1));
@@ -499,7 +595,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
     --IA;
     --IB;
   }
-  if (!EnableStructPathTBAA)
+  if (!StructPath)
     return Ret;
 
   if (!Ret)
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 4591af8..e39ee62 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GlobalAlias.h"
@@ -39,8 +40,8 @@ const unsigned MaxDepth = 6;
 static unsigned getBitWidth(Type *Ty, const DataLayout *TD) {
   if (unsigned BitWidth = Ty->getScalarSizeInBits())
     return BitWidth;
-  assert(isa<PointerType>(Ty) && "Expected a pointer type!");
-  return TD ? TD->getPointerSizeInBits() : 0;
+
+  return TD ? TD->getPointerTypeSizeInBits(Ty) : 0;
 }
 
 static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
@@ -629,9 +630,19 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
       Value *Index = I->getOperand(i);
       if (StructType *STy = dyn_cast<StructType>(*GTI)) {
         // Handle struct member offset arithmetic.
-        if (!TD) return;
-        const StructLayout *SL = TD->getStructLayout(STy);
+        if (!TD)
+          return;
+
+        // Handle case when index is vector zeroinitializer
+        Constant *CIndex = cast<Constant>(Index);
+        if (CIndex->isZeroValue())
+          continue;
+
+        if (CIndex->getType()->isVectorTy())
+          Index = CIndex->getSplatValue();
+
         unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+        const StructLayout *SL = TD->getStructLayout(STy);
         uint64_t Offset = SL->getElementOffset(Idx);
         TrailZ = std::min<unsigned>(TrailZ,
                                     countTrailingZeros(Offset));
@@ -749,7 +760,6 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
         KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
         break;
       }
-      case Intrinsic::x86_sse42_crc32_64_8:
       case Intrinsic::x86_sse42_crc32_64_64:
         KnownZero = APInt::getHighBitsSet(64, 32);
         break;
@@ -1704,20 +1714,24 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
 /// it can be expressed as a base pointer plus a constant offset.  Return the
 /// base and offset to the caller.
 Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
-                                              const DataLayout *TD) {
+                                              const DataLayout *DL) {
   // Without DataLayout, conservatively assume 64-bit offsets, which is
   // the widest we support.
-  unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64;
+  unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(Ptr->getType()) : 64;
   APInt ByteOffset(BitWidth, 0);
   while (1) {
     if (Ptr->getType()->isVectorTy())
       break;
 
     if (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
-      APInt GEPOffset(BitWidth, 0);
-      if (TD && !GEP->accumulateConstantOffset(*TD, GEPOffset))
-        break;
-      ByteOffset += GEPOffset;
+      if (DL) {
+        APInt GEPOffset(BitWidth, 0);
+        if (!GEP->accumulateConstantOffset(*DL, GEPOffset))
+          break;
+
+        ByteOffset += GEPOffset;
+      }
+
       Ptr = GEP->getPointerOperand();
     } else if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
       Ptr = cast<Operator>(Ptr)->getOperand(0);
@@ -2050,7 +2064,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
 
 /// isKnownNonNull - Return true if we know that the specified value is never
 /// null.
-bool llvm::isKnownNonNull(const Value *V) {
+bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) {
   // Alloca never returns null, malloc might.
   if (isa<AllocaInst>(V)) return true;
 
@@ -2061,5 +2075,10 @@ bool llvm::isKnownNonNull(const Value *V) {
   // Global values are not null unless extern weak.
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
     return !GV->hasExternalWeakLinkage();
+
+  // operator new never returns null.
+  if (isOperatorNewLikeFn(V, TLI, /*LookThroughBitCast=*/true))
+    return true;
+
   return false;
 }
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 48675ac..1e6085b 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -478,12 +478,10 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(private);
   KEYWORD(linker_private);
   KEYWORD(linker_private_weak);
-  KEYWORD(linker_private_weak_def_auto); // FIXME: For backwards compatibility.
   KEYWORD(internal);
   KEYWORD(available_externally);
   KEYWORD(linkonce);
   KEYWORD(linkonce_odr);
-  KEYWORD(linkonce_odr_auto_hide);
   KEYWORD(weak);
   KEYWORD(weak_odr);
   KEYWORD(appending);
@@ -540,6 +538,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(alignstack);
   KEYWORD(inteldialect);
   KEYWORD(gc);
+  KEYWORD(prefix);
 
   KEYWORD(ccc);
   KEYWORD(fastcc);
@@ -558,6 +557,8 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(intel_ocl_bicc);
   KEYWORD(x86_64_sysvcc);
   KEYWORD(x86_64_win64cc);
+  KEYWORD(webkit_jscc);
+  KEYWORD(anyregcc);
 
   KEYWORD(cc);
   KEYWORD(c);
@@ -583,6 +584,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(noredzone);
   KEYWORD(noreturn);
   KEYWORD(nounwind);
+  KEYWORD(optnone);
   KEYWORD(optsize);
   KEYWORD(readnone);
   KEYWORD(readonly);
@@ -663,6 +665,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   INSTKEYWORD(inttoptr,    IntToPtr);
   INSTKEYWORD(ptrtoint,    PtrToInt);
   INSTKEYWORD(bitcast,     BitCast);
+  INSTKEYWORD(addrspacecast, AddrSpaceCast);
   INSTKEYWORD(select,      Select);
   INSTKEYWORD(va_arg,      VAArg);
   INSTKEYWORD(ret,         Ret);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 62a07f5..3b903cd 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -19,6 +19,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/ValueSymbolTable.h"
@@ -65,6 +66,9 @@ bool LLParser::ValidateEndOfModule() {
     ForwardRefInstMetadata.clear();
   }
 
+  for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
+    UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
+
   // Handle any function attribute group forward references.
   for (std::map<Value*, std::vector<unsigned> >::iterator
          I = ForwardRefAttrGroups.begin(), E = ForwardRefAttrGroups.end();
@@ -178,6 +182,8 @@ bool LLParser::ValidateEndOfModule() {
   for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; )
     UpgradeCallsToIntrinsic(FI++); // must be post-increment, as we remove
 
+  UpgradeDebugInfo(*M);
+
   return false;
 }
 
@@ -242,13 +248,11 @@ bool LLParser::ParseTopLevelEntities() {
     case lltok::kw_private:             // OptionalLinkage
     case lltok::kw_linker_private:      // OptionalLinkage
     case lltok::kw_linker_private_weak: // OptionalLinkage
-    case lltok::kw_linker_private_weak_def_auto: // FIXME: backwards compat.
     case lltok::kw_internal:            // OptionalLinkage
     case lltok::kw_weak:                // OptionalLinkage
     case lltok::kw_weak_odr:            // OptionalLinkage
     case lltok::kw_linkonce:            // OptionalLinkage
     case lltok::kw_linkonce_odr:        // OptionalLinkage
-    case lltok::kw_linkonce_odr_auto_hide: // OptionalLinkage
     case lltok::kw_appending:           // OptionalLinkage
     case lltok::kw_dllexport:           // OptionalLinkage
     case lltok::kw_common:              // OptionalLinkage
@@ -623,18 +627,14 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
                           unsigned Visibility) {
   assert(Lex.getKind() == lltok::kw_alias);
   Lex.Lex();
-  unsigned Linkage;
   LocTy LinkageLoc = Lex.getLoc();
-  if (ParseOptionalLinkage(Linkage))
+  unsigned L;
+  if (ParseOptionalLinkage(L))
     return true;
 
-  if (Linkage != GlobalValue::ExternalLinkage &&
-      Linkage != GlobalValue::WeakAnyLinkage &&
-      Linkage != GlobalValue::WeakODRLinkage &&
-      Linkage != GlobalValue::InternalLinkage &&
-      Linkage != GlobalValue::PrivateLinkage &&
-      Linkage != GlobalValue::LinkerPrivateLinkage &&
-      Linkage != GlobalValue::LinkerPrivateWeakLinkage)
+  GlobalValue::LinkageTypes Linkage = (GlobalValue::LinkageTypes) L;
+
+  if(!GlobalAlias::isValidLinkage(Linkage))
     return Error(LinkageLoc, "invalid linkage type for alias");
 
   Constant *Aliasee;
@@ -922,6 +922,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
     case lltok::kw_noredzone:         B.addAttribute(Attribute::NoRedZone); break;
     case lltok::kw_noreturn:          B.addAttribute(Attribute::NoReturn); break;
     case lltok::kw_nounwind:          B.addAttribute(Attribute::NoUnwind); break;
+    case lltok::kw_optnone:           B.addAttribute(Attribute::OptimizeNone); break;
     case lltok::kw_optsize:           B.addAttribute(Attribute::OptimizeForSize); break;
     case lltok::kw_readnone:          B.addAttribute(Attribute::ReadNone); break;
     case lltok::kw_readonly:          B.addAttribute(Attribute::ReadOnly); break;
@@ -1180,6 +1181,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
     case lltok::kw_noredzone:
     case lltok::kw_noreturn:
     case lltok::kw_nounwind:
+    case lltok::kw_optnone:
     case lltok::kw_optsize:
     case lltok::kw_returns_twice:
     case lltok::kw_sanitize_address:
@@ -1238,6 +1240,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
     case lltok::kw_noredzone:
     case lltok::kw_noreturn:
     case lltok::kw_nounwind:
+    case lltok::kw_optnone:
     case lltok::kw_optsize:
     case lltok::kw_returns_twice:
     case lltok::kw_sanitize_address:
@@ -1269,7 +1272,6 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
 ///   ::= 'weak_odr'
 ///   ::= 'linkonce'
 ///   ::= 'linkonce_odr'
-///   ::= 'linkonce_odr_auto_hide'
 ///   ::= 'available_externally'
 ///   ::= 'appending'
 ///   ::= 'dllexport'
@@ -1291,10 +1293,6 @@ bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) {
   case lltok::kw_weak_odr:       Res = GlobalValue::WeakODRLinkage;       break;
   case lltok::kw_linkonce:       Res = GlobalValue::LinkOnceAnyLinkage;   break;
   case lltok::kw_linkonce_odr:   Res = GlobalValue::LinkOnceODRLinkage;   break;
-  case lltok::kw_linkonce_odr_auto_hide:
-  case lltok::kw_linker_private_weak_def_auto: // FIXME: For backwards compat.
-    Res = GlobalValue::LinkOnceODRAutoHideLinkage;
-    break;
   case lltok::kw_available_externally:
     Res = GlobalValue::AvailableExternallyLinkage;
     break;
@@ -1346,6 +1344,8 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) {
 ///   ::= 'spir_kernel'
 ///   ::= 'x86_64_sysvcc'
 ///   ::= 'x86_64_win64cc'
+///   ::= 'webkit_jscc'
+///   ::= 'anyregcc'
 ///   ::= 'cc' UINT
 ///
 bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
@@ -1368,6 +1368,8 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
   case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
   case lltok::kw_x86_64_sysvcc:  CC = CallingConv::X86_64_SysV; break;
   case lltok::kw_x86_64_win64cc: CC = CallingConv::X86_64_Win64; break;
+  case lltok::kw_webkit_jscc:    CC = CallingConv::WebKit_JS; break;
+  case lltok::kw_anyregcc:       CC = CallingConv::AnyReg; break;
   case lltok::kw_cc: {
       unsigned ArbitraryCC;
       Lex.Lex();
@@ -1424,6 +1426,9 @@ bool LLParser::ParseInstructionMetadata(Instruction *Inst,
       }
     }
 
+    if (MDK == LLVMContext::MD_tbaa)
+      InstsWithTBAATag.push_back(Inst);
+
     // If this is the end of the list, we're done.
   } while (EatIfPresent(lltok::comma));
   return false;
@@ -2383,7 +2388,6 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     Lex.Lex();
 
     ValID Fn, Label;
-    LocTy FnLoc, LabelLoc;
 
     if (ParseToken(lltok::lparen, "expected '(' in block address expression") ||
         ParseValID(Fn) ||
@@ -2413,6 +2417,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
   case lltok::kw_fptrunc:
   case lltok::kw_fpext:
   case lltok::kw_bitcast:
+  case lltok::kw_addrspacecast:
   case lltok::kw_uitofp:
   case lltok::kw_sitofp:
   case lltok::kw_fptoui:
@@ -2919,7 +2924,7 @@ bool LLParser::ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc,
 /// FunctionHeader
 ///   ::= OptionalLinkage OptionalVisibility OptionalCallingConv OptRetAttrs
 ///       OptUnnamedAddr Type GlobalName '(' ArgList ')' OptFuncAttrs OptSection
-///       OptionalAlign OptGC
+///       OptionalAlign OptGC OptionalPrefix
 bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   // Parse the linkage.
   LocTy LinkageLoc = Lex.getLoc();
@@ -2953,7 +2958,6 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   case GlobalValue::AvailableExternallyLinkage:
   case GlobalValue::LinkOnceAnyLinkage:
   case GlobalValue::LinkOnceODRLinkage:
-  case GlobalValue::LinkOnceODRAutoHideLinkage:
   case GlobalValue::WeakAnyLinkage:
   case GlobalValue::WeakODRLinkage:
   case GlobalValue::DLLExportLinkage:
@@ -2998,6 +3002,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   std::string GC;
   bool UnnamedAddr;
   LocTy UnnamedAddrLoc;
+  Constant *Prefix = 0;
 
   if (ParseArgumentList(ArgList, isVarArg) ||
       ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr,
@@ -3008,7 +3013,9 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
        ParseStringConstant(Section)) ||
       ParseOptionalAlignment(Alignment) ||
       (EatIfPresent(lltok::kw_gc) &&
-       ParseStringConstant(GC)))
+       ParseStringConstant(GC)) ||
+      (EatIfPresent(lltok::kw_prefix) &&
+       ParseGlobalTypeAndValue(Prefix)))
     return true;
 
   if (FuncAttrs.contains(Attribute::Builtin))
@@ -3106,6 +3113,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   Fn->setAlignment(Alignment);
   Fn->setSection(Section);
   if (!GC.empty()) Fn->setGC(GC.c_str());
+  Fn->setPrefixData(Prefix);
   ForwardRefAttrGroups[Fn] = FwdRefAttrGrps;
 
   // Add all of the arguments we parsed to the function.
@@ -3171,7 +3179,6 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) {
 
   // Parse the instructions in this block until we get a terminator.
   Instruction *Inst;
-  SmallVector<std::pair<unsigned, MDNode *>, 4> MetadataOnInst;
   do {
     // This instruction may have three possibilities for a name: a) none
     // specified, b) name specified "%foo =", c) number specified: "%4 =".
@@ -3299,6 +3306,7 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_fptrunc:
   case lltok::kw_fpext:
   case lltok::kw_bitcast:
+  case lltok::kw_addrspacecast:
   case lltok::kw_uitofp:
   case lltok::kw_sitofp:
   case lltok::kw_fptoui:
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 594281e..ded776c 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -107,6 +107,8 @@ namespace llvm {
     };
     DenseMap<Instruction*, std::vector<MDRef> > ForwardRefInstMetadata;
 
+    SmallVector<Instruction*, 64> InstsWithTBAATag;
+
     // Type resolution handling data structures.  The location is set when we
     // have processed a use of the type but not a definition yet.
     StringMap<std::pair<Type*, LocTy> > NamedTypes;
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 9cf4c2c..786d84d 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -38,9 +38,8 @@ namespace lltok {
     kw_global,  kw_constant,
 
     kw_private, kw_linker_private, kw_linker_private_weak,
-    kw_linker_private_weak_def_auto, // FIXME: For backwards compatibility.
     kw_internal,
-    kw_linkonce, kw_linkonce_odr, kw_linkonce_odr_auto_hide,
+    kw_linkonce, kw_linkonce_odr,
     kw_weak, kw_weak_odr, kw_appending,
     kw_dllimport, kw_dllexport, kw_common, kw_available_externally,
     kw_default, kw_hidden, kw_protected,
@@ -81,6 +80,7 @@ namespace lltok {
     kw_alignstack,
     kw_inteldialect,
     kw_gc,
+    kw_prefix,
     kw_c,
 
     kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
@@ -91,6 +91,7 @@ namespace lltok {
     kw_ptx_kernel, kw_ptx_device,
     kw_spir_kernel, kw_spir_func,
     kw_x86_64_sysvcc, kw_x86_64_win64cc,
+    kw_webkit_jscc, kw_anyregcc,
 
     // Attributes:
     kw_attributes,
@@ -114,6 +115,7 @@ namespace lltok {
     kw_noredzone,
     kw_noreturn,
     kw_nounwind,
+    kw_optnone,
     kw_optsize,
     kw_readnone,
     kw_readonly,
@@ -148,6 +150,7 @@ namespace lltok {
     kw_phi, kw_call,
     kw_trunc, kw_zext, kw_sext, kw_fptrunc, kw_fpext, kw_uitofp, kw_sitofp,
     kw_fptoui, kw_fptosi, kw_inttoptr, kw_ptrtoint, kw_bitcast,
+    kw_addrspacecast,
     kw_select, kw_va_arg,
 
     kw_landingpad, kw_personality, kw_cleanup, kw_catch, kw_filter,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index e6d7b50..ce3b7d1 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -17,6 +17,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/OperandTraits.h"
 #include "llvm/IR/Operator.h"
@@ -89,7 +90,6 @@ static GlobalValue::LinkageTypes GetDecodedLinkage(unsigned Val) {
   case 12: return GlobalValue::AvailableExternallyLinkage;
   case 13: return GlobalValue::LinkerPrivateLinkage;
   case 14: return GlobalValue::LinkerPrivateWeakLinkage;
-  case 15: return GlobalValue::LinkOnceODRAutoHideLinkage;
   }
 }
 
@@ -128,6 +128,7 @@ static int GetDecodedCastOpcode(unsigned Val) {
   case bitc::CAST_PTRTOINT: return Instruction::PtrToInt;
   case bitc::CAST_INTTOPTR: return Instruction::IntToPtr;
   case bitc::CAST_BITCAST : return Instruction::BitCast;
+  case bitc::CAST_ADDRSPACECAST: return Instruction::AddrSpaceCast;
   }
 }
 static int GetDecodedBinaryOpcode(unsigned Val, Type *Ty) {
@@ -450,12 +451,12 @@ static void decodeLLVMAttributesForBitcode(AttrBuilder &B,
                 (EncodedAttrs & 0xffff));
 }
 
-bool BitcodeReader::ParseAttributeBlock() {
+error_code BitcodeReader::ParseAttributeBlock() {
   if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   if (!MAttributes.empty())
-    return Error("Multiple PARAMATTR blocks found!");
+    return Error(InvalidMultipleBlocks);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -468,9 +469,9 @@ bool BitcodeReader::ParseAttributeBlock() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("Error at end of PARAMATTR block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -484,7 +485,7 @@ bool BitcodeReader::ParseAttributeBlock() {
     case bitc::PARAMATTR_CODE_ENTRY_OLD: { // ENTRY: [paramidx0, attr0, ...]
       // FIXME: Remove in 4.0.
       if (Record.size() & 1)
-        return Error("Invalid ENTRY record");
+        return Error(InvalidRecord);
 
       for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
         AttrBuilder B;
@@ -508,131 +509,102 @@ bool BitcodeReader::ParseAttributeBlock() {
   }
 }
 
-bool BitcodeReader::ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind) {
+// Returns Attribute::None on unrecognized codes.
+static Attribute::AttrKind GetAttrFromCode(uint64_t Code) {
   switch (Code) {
+  default:
+    return Attribute::None;
   case bitc::ATTR_KIND_ALIGNMENT:
-    *Kind = Attribute::Alignment;
-    return false;
+    return Attribute::Alignment;
   case bitc::ATTR_KIND_ALWAYS_INLINE:
-    *Kind = Attribute::AlwaysInline;
-    return false;
+    return Attribute::AlwaysInline;
   case bitc::ATTR_KIND_BUILTIN:
-    *Kind = Attribute::Builtin;
-    return false;
+    return Attribute::Builtin;
   case bitc::ATTR_KIND_BY_VAL:
-    *Kind = Attribute::ByVal;
-    return false;
+    return Attribute::ByVal;
   case bitc::ATTR_KIND_COLD:
-    *Kind = Attribute::Cold;
-    return false;
+    return Attribute::Cold;
   case bitc::ATTR_KIND_INLINE_HINT:
-    *Kind = Attribute::InlineHint;
-    return false;
+    return Attribute::InlineHint;
   case bitc::ATTR_KIND_IN_REG:
-    *Kind = Attribute::InReg;
-    return false;
+    return Attribute::InReg;
   case bitc::ATTR_KIND_MIN_SIZE:
-    *Kind = Attribute::MinSize;
-    return false;
+    return Attribute::MinSize;
   case bitc::ATTR_KIND_NAKED:
-    *Kind = Attribute::Naked;
-    return false;
+    return Attribute::Naked;
   case bitc::ATTR_KIND_NEST:
-    *Kind = Attribute::Nest;
-    return false;
+    return Attribute::Nest;
   case bitc::ATTR_KIND_NO_ALIAS:
-    *Kind = Attribute::NoAlias;
-    return false;
+    return Attribute::NoAlias;
   case bitc::ATTR_KIND_NO_BUILTIN:
-    *Kind = Attribute::NoBuiltin;
-    return false;
+    return Attribute::NoBuiltin;
   case bitc::ATTR_KIND_NO_CAPTURE:
-    *Kind = Attribute::NoCapture;
-    return false;
+    return Attribute::NoCapture;
   case bitc::ATTR_KIND_NO_DUPLICATE:
-    *Kind = Attribute::NoDuplicate;
-    return false;
+    return Attribute::NoDuplicate;
   case bitc::ATTR_KIND_NO_IMPLICIT_FLOAT:
-    *Kind = Attribute::NoImplicitFloat;
-    return false;
+    return Attribute::NoImplicitFloat;
   case bitc::ATTR_KIND_NO_INLINE:
-    *Kind = Attribute::NoInline;
-    return false;
+    return Attribute::NoInline;
   case bitc::ATTR_KIND_NON_LAZY_BIND:
-    *Kind = Attribute::NonLazyBind;
-    return false;
+    return Attribute::NonLazyBind;
   case bitc::ATTR_KIND_NO_RED_ZONE:
-    *Kind = Attribute::NoRedZone;
-    return false;
+    return Attribute::NoRedZone;
   case bitc::ATTR_KIND_NO_RETURN:
-    *Kind = Attribute::NoReturn;
-    return false;
+    return Attribute::NoReturn;
   case bitc::ATTR_KIND_NO_UNWIND:
-    *Kind = Attribute::NoUnwind;
-    return false;
+    return Attribute::NoUnwind;
   case bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE:
-    *Kind = Attribute::OptimizeForSize;
-    return false;
+    return Attribute::OptimizeForSize;
+  case bitc::ATTR_KIND_OPTIMIZE_NONE:
+    return Attribute::OptimizeNone;
   case bitc::ATTR_KIND_READ_NONE:
-    *Kind = Attribute::ReadNone;
-    return false;
+    return Attribute::ReadNone;
   case bitc::ATTR_KIND_READ_ONLY:
-    *Kind = Attribute::ReadOnly;
-    return false;
+    return Attribute::ReadOnly;
   case bitc::ATTR_KIND_RETURNED:
-    *Kind = Attribute::Returned;
-    return false;
+    return Attribute::Returned;
   case bitc::ATTR_KIND_RETURNS_TWICE:
-    *Kind = Attribute::ReturnsTwice;
-    return false;
+    return Attribute::ReturnsTwice;
   case bitc::ATTR_KIND_S_EXT:
-    *Kind = Attribute::SExt;
-    return false;
+    return Attribute::SExt;
   case bitc::ATTR_KIND_STACK_ALIGNMENT:
-    *Kind = Attribute::StackAlignment;
-    return false;
+    return Attribute::StackAlignment;
   case bitc::ATTR_KIND_STACK_PROTECT:
-    *Kind = Attribute::StackProtect;
-    return false;
+    return Attribute::StackProtect;
   case bitc::ATTR_KIND_STACK_PROTECT_REQ:
-    *Kind = Attribute::StackProtectReq;
-    return false;
+    return Attribute::StackProtectReq;
   case bitc::ATTR_KIND_STACK_PROTECT_STRONG:
-    *Kind = Attribute::StackProtectStrong;
-    return false;
+    return Attribute::StackProtectStrong;
   case bitc::ATTR_KIND_STRUCT_RET:
-    *Kind = Attribute::StructRet;
-    return false;
+    return Attribute::StructRet;
   case bitc::ATTR_KIND_SANITIZE_ADDRESS:
-    *Kind = Attribute::SanitizeAddress;
-    return false;
+    return Attribute::SanitizeAddress;
   case bitc::ATTR_KIND_SANITIZE_THREAD:
-    *Kind = Attribute::SanitizeThread;
-    return false;
+    return Attribute::SanitizeThread;
   case bitc::ATTR_KIND_SANITIZE_MEMORY:
-    *Kind = Attribute::SanitizeMemory;
-    return false;
+    return Attribute::SanitizeMemory;
   case bitc::ATTR_KIND_UW_TABLE:
-    *Kind = Attribute::UWTable;
-    return false;
+    return Attribute::UWTable;
   case bitc::ATTR_KIND_Z_EXT:
-    *Kind = Attribute::ZExt;
-    return false;
-  default:
-    std::string Buf;
-    raw_string_ostream fmt(Buf);
-    fmt << "Unknown attribute kind (" << Code << ")";
-    fmt.flush();
-    return Error(Buf.c_str());
+    return Attribute::ZExt;
   }
 }
 
-bool BitcodeReader::ParseAttributeGroupBlock() {
+error_code BitcodeReader::ParseAttrKind(uint64_t Code,
+                                        Attribute::AttrKind *Kind) {
+  *Kind = GetAttrFromCode(Code);
+  if (*Kind == Attribute::None)
+    return Error(InvalidValue);
+  return error_code::success();
+}
+
+error_code BitcodeReader::ParseAttributeGroupBlock() {
   if (Stream.EnterSubBlock(bitc::PARAMATTR_GROUP_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   if (!MAttributeGroups.empty())
-    return Error("Multiple PARAMATTR_GROUP blocks found!");
+    return Error(InvalidMultipleBlocks);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -643,9 +615,9 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("Error at end of PARAMATTR_GROUP block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -658,7 +630,7 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
       break;
     case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...]
       if (Record.size() < 3)
-        return Error("Invalid ENTRY record");
+        return Error(InvalidRecord);
 
       uint64_t GrpID = Record[0];
       uint64_t Idx = Record[1]; // Index of the object this attribute refers to.
@@ -667,14 +639,14 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
       for (unsigned i = 2, e = Record.size(); i != e; ++i) {
         if (Record[i] == 0) {        // Enum attribute
           Attribute::AttrKind Kind;
-          if (ParseAttrKind(Record[++i], &Kind))
-            return true;
+          if (error_code EC = ParseAttrKind(Record[++i], &Kind))
+            return EC;
 
           B.addAttribute(Kind);
         } else if (Record[i] == 1) { // Align attribute
           Attribute::AttrKind Kind;
-          if (ParseAttrKind(Record[++i], &Kind))
-            return true;
+          if (error_code EC = ParseAttrKind(Record[++i], &Kind))
+            return EC;
           if (Kind == Attribute::Alignment)
             B.addAlignmentAttr(Record[++i]);
           else
@@ -709,16 +681,16 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
   }
 }
 
-bool BitcodeReader::ParseTypeTable() {
+error_code BitcodeReader::ParseTypeTable() {
   if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID_NEW))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   return ParseTypeTableBody();
 }
 
-bool BitcodeReader::ParseTypeTableBody() {
+error_code BitcodeReader::ParseTypeTableBody() {
   if (!TypeList.empty())
-    return Error("Multiple TYPE_BLOCKs found!");
+    return Error(InvalidMultipleBlocks);
 
   SmallVector<uint64_t, 64> Record;
   unsigned NumRecords = 0;
@@ -732,12 +704,11 @@ bool BitcodeReader::ParseTypeTableBody() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      Error("Error in the type table block");
-      return true;
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
       if (NumRecords != TypeList.size())
-        return Error("Invalid type forward reference in TYPE_BLOCK");
-      return false;
+        return Error(MalformedBlock);
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -747,12 +718,13 @@ bool BitcodeReader::ParseTypeTableBody() {
     Record.clear();
     Type *ResultTy = 0;
     switch (Stream.readRecord(Entry.ID, Record)) {
-    default: return Error("unknown type in type table");
+    default:
+      return Error(InvalidValue);
     case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries]
       // TYPE_CODE_NUMENTRY contains a count of the number of types in the
       // type list.  This allows us to reserve space.
       if (Record.size() < 1)
-        return Error("Invalid TYPE_CODE_NUMENTRY record");
+        return Error(InvalidRecord);
       TypeList.resize(Record[0]);
       continue;
     case bitc::TYPE_CODE_VOID:      // VOID
@@ -787,19 +759,20 @@ bool BitcodeReader::ParseTypeTableBody() {
       break;
     case bitc::TYPE_CODE_INTEGER:   // INTEGER: [width]
       if (Record.size() < 1)
-        return Error("Invalid Integer type record");
+        return Error(InvalidRecord);
 
       ResultTy = IntegerType::get(Context, Record[0]);
       break;
     case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or
                                     //          [pointee type, address space]
       if (Record.size() < 1)
-        return Error("Invalid POINTER type record");
+        return Error(InvalidRecord);
       unsigned AddressSpace = 0;
       if (Record.size() == 2)
         AddressSpace = Record[1];
       ResultTy = getTypeByID(Record[0]);
-      if (ResultTy == 0) return Error("invalid element type in pointer type");
+      if (ResultTy == 0)
+        return Error(InvalidType);
       ResultTy = PointerType::get(ResultTy, AddressSpace);
       break;
     }
@@ -807,7 +780,7 @@ bool BitcodeReader::ParseTypeTableBody() {
       // FIXME: attrid is dead, remove it in LLVM 4.0
       // FUNCTION: [vararg, attrid, retty, paramty x N]
       if (Record.size() < 3)
-        return Error("Invalid FUNCTION type record");
+        return Error(InvalidRecord);
       SmallVector<Type*, 8> ArgTys;
       for (unsigned i = 3, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -818,7 +791,7 @@ bool BitcodeReader::ParseTypeTableBody() {
 
       ResultTy = getTypeByID(Record[2]);
       if (ResultTy == 0 || ArgTys.size() < Record.size()-3)
-        return Error("invalid type in function type");
+        return Error(InvalidType);
 
       ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
       break;
@@ -826,7 +799,7 @@ bool BitcodeReader::ParseTypeTableBody() {
     case bitc::TYPE_CODE_FUNCTION: {
       // FUNCTION: [vararg, retty, paramty x N]
       if (Record.size() < 2)
-        return Error("Invalid FUNCTION type record");
+        return Error(InvalidRecord);
       SmallVector<Type*, 8> ArgTys;
       for (unsigned i = 2, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -837,14 +810,14 @@ bool BitcodeReader::ParseTypeTableBody() {
 
       ResultTy = getTypeByID(Record[1]);
       if (ResultTy == 0 || ArgTys.size() < Record.size()-2)
-        return Error("invalid type in function type");
+        return Error(InvalidType);
 
       ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
       break;
     }
     case bitc::TYPE_CODE_STRUCT_ANON: {  // STRUCT: [ispacked, eltty x N]
       if (Record.size() < 1)
-        return Error("Invalid STRUCT type record");
+        return Error(InvalidRecord);
       SmallVector<Type*, 8> EltTys;
       for (unsigned i = 1, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -853,21 +826,21 @@ bool BitcodeReader::ParseTypeTableBody() {
           break;
       }
       if (EltTys.size() != Record.size()-1)
-        return Error("invalid type in struct type");
+        return Error(InvalidType);
       ResultTy = StructType::get(Context, EltTys, Record[0]);
       break;
     }
     case bitc::TYPE_CODE_STRUCT_NAME:   // STRUCT_NAME: [strchr x N]
       if (ConvertToString(Record, 0, TypeName))
-        return Error("Invalid STRUCT_NAME record");
+        return Error(InvalidRecord);
       continue;
 
     case bitc::TYPE_CODE_STRUCT_NAMED: { // STRUCT: [ispacked, eltty x N]
       if (Record.size() < 1)
-        return Error("Invalid STRUCT type record");
+        return Error(InvalidRecord);
 
       if (NumRecords >= TypeList.size())
-        return Error("invalid TYPE table");
+        return Error(InvalidTYPETable);
 
       // Check to see if this was forward referenced, if so fill in the temp.
       StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
@@ -886,17 +859,17 @@ bool BitcodeReader::ParseTypeTableBody() {
           break;
       }
       if (EltTys.size() != Record.size()-1)
-        return Error("invalid STRUCT type record");
+        return Error(InvalidRecord);
       Res->setBody(EltTys, Record[0]);
       ResultTy = Res;
       break;
     }
     case bitc::TYPE_CODE_OPAQUE: {       // OPAQUE: []
       if (Record.size() != 1)
-        return Error("Invalid OPAQUE type record");
+        return Error(InvalidRecord);
 
       if (NumRecords >= TypeList.size())
-        return Error("invalid TYPE table");
+        return Error(InvalidTYPETable);
 
       // Check to see if this was forward referenced, if so fill in the temp.
       StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
@@ -911,33 +884,33 @@ bool BitcodeReader::ParseTypeTableBody() {
     }
     case bitc::TYPE_CODE_ARRAY:     // ARRAY: [numelts, eltty]
       if (Record.size() < 2)
-        return Error("Invalid ARRAY type record");
+        return Error(InvalidRecord);
       if ((ResultTy = getTypeByID(Record[1])))
         ResultTy = ArrayType::get(ResultTy, Record[0]);
       else
-        return Error("Invalid ARRAY type element");
+        return Error(InvalidType);
       break;
     case bitc::TYPE_CODE_VECTOR:    // VECTOR: [numelts, eltty]
       if (Record.size() < 2)
-        return Error("Invalid VECTOR type record");
+        return Error(InvalidRecord);
       if ((ResultTy = getTypeByID(Record[1])))
         ResultTy = VectorType::get(ResultTy, Record[0]);
       else
-        return Error("Invalid ARRAY type element");
+        return Error(InvalidType);
       break;
     }
 
     if (NumRecords >= TypeList.size())
-      return Error("invalid TYPE table");
+      return Error(InvalidTYPETable);
     assert(ResultTy && "Didn't read a type?");
     assert(TypeList[NumRecords] == 0 && "Already read type?");
     TypeList[NumRecords++] = ResultTy;
   }
 }
 
-bool BitcodeReader::ParseValueSymbolTable() {
+error_code BitcodeReader::ParseValueSymbolTable() {
   if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -949,9 +922,9 @@ bool BitcodeReader::ParseValueSymbolTable() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("malformed value symbol table block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -964,10 +937,10 @@ bool BitcodeReader::ParseValueSymbolTable() {
       break;
     case bitc::VST_CODE_ENTRY: {  // VST_ENTRY: [valueid, namechar x N]
       if (ConvertToString(Record, 1, ValueName))
-        return Error("Invalid VST_ENTRY record");
+        return Error(InvalidRecord);
       unsigned ValueID = Record[0];
       if (ValueID >= ValueList.size())
-        return Error("Invalid Value ID in VST_ENTRY record");
+        return Error(InvalidRecord);
       Value *V = ValueList[ValueID];
 
       V->setName(StringRef(ValueName.data(), ValueName.size()));
@@ -976,10 +949,10 @@ bool BitcodeReader::ParseValueSymbolTable() {
     }
     case bitc::VST_CODE_BBENTRY: {
       if (ConvertToString(Record, 1, ValueName))
-        return Error("Invalid VST_BBENTRY record");
+        return Error(InvalidRecord);
       BasicBlock *BB = getBasicBlock(Record[0]);
       if (BB == 0)
-        return Error("Invalid BB ID in VST_BBENTRY record");
+        return Error(InvalidRecord);
 
       BB->setName(StringRef(ValueName.data(), ValueName.size()));
       ValueName.clear();
@@ -989,11 +962,11 @@ bool BitcodeReader::ParseValueSymbolTable() {
   }
 }
 
-bool BitcodeReader::ParseMetadata() {
+error_code BitcodeReader::ParseMetadata() {
   unsigned NextMDValueNo = MDValueList.size();
 
   if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -1004,10 +977,9 @@ bool BitcodeReader::ParseMetadata() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      Error("malformed metadata block");
-      return true;
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -1036,7 +1008,7 @@ bool BitcodeReader::ParseMetadata() {
       for (unsigned i = 0; i != Size; ++i) {
         MDNode *MD = dyn_cast<MDNode>(MDValueList.getValueFwdRef(Record[i]));
         if (MD == 0)
-          return Error("Malformed metadata record");
+          return Error(InvalidRecord);
         NMD->addOperand(MD);
       }
       break;
@@ -1046,13 +1018,14 @@ bool BitcodeReader::ParseMetadata() {
       // fall-through
     case bitc::METADATA_NODE: {
       if (Record.size() % 2 == 1)
-        return Error("Invalid METADATA_NODE record");
+        return Error(InvalidRecord);
 
       unsigned Size = Record.size();
       SmallVector<Value*, 8> Elts;
       for (unsigned i = 0; i != Size; i += 2) {
         Type *Ty = getTypeByID(Record[i]);
-        if (!Ty) return Error("Invalid METADATA_NODE record");
+        if (!Ty)
+          return Error(InvalidRecord);
         if (Ty->isMetadataTy())
           Elts.push_back(MDValueList.getValueFwdRef(Record[i+1]));
         else if (!Ty->isVoidTy())
@@ -1073,14 +1046,14 @@ bool BitcodeReader::ParseMetadata() {
     }
     case bitc::METADATA_KIND: {
       if (Record.size() < 2)
-        return Error("Invalid METADATA_KIND record");
+        return Error(InvalidRecord);
 
       unsigned Kind = Record[0];
       SmallString<8> Name(Record.begin()+1, Record.end());
 
       unsigned NewKind = TheModule->getMDKindID(Name.str());
       if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second)
-        return Error("Conflicting METADATA_KIND records");
+        return Error(ConflictingMETADATA_KINDRecords);
       break;
     }
     }
@@ -1100,12 +1073,14 @@ uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) {
 
 /// ResolveGlobalAndAliasInits - Resolve all of the initializers for global
 /// values and aliases that we can.
-bool BitcodeReader::ResolveGlobalAndAliasInits() {
+error_code BitcodeReader::ResolveGlobalAndAliasInits() {
   std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInitWorklist;
   std::vector<std::pair<GlobalAlias*, unsigned> > AliasInitWorklist;
+  std::vector<std::pair<Function*, unsigned> > FunctionPrefixWorklist;
 
   GlobalInitWorklist.swap(GlobalInits);
   AliasInitWorklist.swap(AliasInits);
+  FunctionPrefixWorklist.swap(FunctionPrefixes);
 
   while (!GlobalInitWorklist.empty()) {
     unsigned ValID = GlobalInitWorklist.back().second;
@@ -1116,7 +1091,7 @@ bool BitcodeReader::ResolveGlobalAndAliasInits() {
       if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
         GlobalInitWorklist.back().first->setInitializer(C);
       else
-        return Error("Global variable initializer is not a constant!");
+        return Error(ExpectedConstant);
     }
     GlobalInitWorklist.pop_back();
   }
@@ -1129,11 +1104,25 @@ bool BitcodeReader::ResolveGlobalAndAliasInits() {
       if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
         AliasInitWorklist.back().first->setAliasee(C);
       else
-        return Error("Alias initializer is not a constant!");
+        return Error(ExpectedConstant);
     }
     AliasInitWorklist.pop_back();
   }
-  return false;
+
+  while (!FunctionPrefixWorklist.empty()) {
+    unsigned ValID = FunctionPrefixWorklist.back().second;
+    if (ValID >= ValueList.size()) {
+      FunctionPrefixes.push_back(FunctionPrefixWorklist.back());
+    } else {
+      if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
+        FunctionPrefixWorklist.back().first->setPrefixData(C);
+      else
+        return Error(ExpectedConstant);
+    }
+    FunctionPrefixWorklist.pop_back();
+  }
+
+  return error_code::success();
 }
 
 static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
@@ -1144,9 +1133,9 @@ static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
   return APInt(TypeBits, Words);
 }
 
-bool BitcodeReader::ParseConstants() {
+error_code BitcodeReader::ParseConstants() {
   if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -1159,15 +1148,15 @@ bool BitcodeReader::ParseConstants() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("malformed block record in AST file");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
       if (NextCstNo != ValueList.size())
-        return Error("Invalid constant reference!");
+        return Error(InvalidConstantReference);
 
       // Once all the constants have been read, go through and resolve forward
       // references.
       ValueList.ResolveConstantForwardRefs();
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -1184,9 +1173,9 @@ bool BitcodeReader::ParseConstants() {
       break;
     case bitc::CST_CODE_SETTYPE:   // SETTYPE: [typeid]
       if (Record.empty())
-        return Error("Malformed CST_SETTYPE record");
+        return Error(InvalidRecord);
       if (Record[0] >= TypeList.size())
-        return Error("Invalid Type ID in CST_SETTYPE record");
+        return Error(InvalidRecord);
       CurTy = TypeList[Record[0]];
       continue;  // Skip the ValueList manipulation.
     case bitc::CST_CODE_NULL:      // NULL
@@ -1194,12 +1183,12 @@ bool BitcodeReader::ParseConstants() {
       break;
     case bitc::CST_CODE_INTEGER:   // INTEGER: [intval]
       if (!CurTy->isIntegerTy() || Record.empty())
-        return Error("Invalid CST_INTEGER record");
+        return Error(InvalidRecord);
       V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0]));
       break;
     case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
       if (!CurTy->isIntegerTy() || Record.empty())
-        return Error("Invalid WIDE_INTEGER record");
+        return Error(InvalidRecord);
 
       APInt VInt = ReadWideAPInt(Record,
                                  cast<IntegerType>(CurTy)->getBitWidth());
@@ -1209,7 +1198,7 @@ bool BitcodeReader::ParseConstants() {
     }
     case bitc::CST_CODE_FLOAT: {    // FLOAT: [fpval]
       if (Record.empty())
-        return Error("Invalid FLOAT record");
+        return Error(InvalidRecord);
       if (CurTy->isHalfTy())
         V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf,
                                              APInt(16, (uint16_t)Record[0])));
@@ -1239,7 +1228,7 @@ bool BitcodeReader::ParseConstants() {
 
     case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number]
       if (Record.empty())
-        return Error("Invalid CST_AGGREGATE record");
+        return Error(InvalidRecord);
 
       unsigned Size = Record.size();
       SmallVector<Constant*, 16> Elts;
@@ -1267,7 +1256,7 @@ bool BitcodeReader::ParseConstants() {
     case bitc::CST_CODE_STRING:    // STRING: [values]
     case bitc::CST_CODE_CSTRING: { // CSTRING: [values]
       if (Record.empty())
-        return Error("Invalid CST_STRING record");
+        return Error(InvalidRecord);
 
       SmallString<16> Elts(Record.begin(), Record.end());
       V = ConstantDataArray::getString(Context, Elts,
@@ -1276,7 +1265,7 @@ bool BitcodeReader::ParseConstants() {
     }
     case bitc::CST_CODE_DATA: {// DATA: [n x value]
       if (Record.empty())
-        return Error("Invalid CST_DATA record");
+        return Error(InvalidRecord);
 
       Type *EltTy = cast<SequentialType>(CurTy)->getElementType();
       unsigned Size = Record.size();
@@ -1321,13 +1310,14 @@ bool BitcodeReader::ParseConstants() {
         else
           V = ConstantDataArray::get(Context, Elts);
       } else {
-        return Error("Unknown element type in CE_DATA");
+        return Error(InvalidTypeForValue);
       }
       break;
     }
 
     case bitc::CST_CODE_CE_BINOP: {  // CE_BINOP: [opcode, opval, opval]
-      if (Record.size() < 3) return Error("Invalid CE_BINOP record");
+      if (Record.size() < 3)
+        return Error(InvalidRecord);
       int Opc = GetDecodedBinaryOpcode(Record[0], CurTy);
       if (Opc < 0) {
         V = UndefValue::get(CurTy);  // Unknown binop.
@@ -1357,25 +1347,30 @@ bool BitcodeReader::ParseConstants() {
       break;
     }
     case bitc::CST_CODE_CE_CAST: {  // CE_CAST: [opcode, opty, opval]
-      if (Record.size() < 3) return Error("Invalid CE_CAST record");
+      if (Record.size() < 3)
+        return Error(InvalidRecord);
       int Opc = GetDecodedCastOpcode(Record[0]);
       if (Opc < 0) {
         V = UndefValue::get(CurTy);  // Unknown cast.
       } else {
         Type *OpTy = getTypeByID(Record[1]);
-        if (!OpTy) return Error("Invalid CE_CAST record");
+        if (!OpTy)
+          return Error(InvalidRecord);
         Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy);
-        V = ConstantExpr::getCast(Opc, Op, CurTy);
+        V = UpgradeBitCastExpr(Opc, Op, CurTy);
+        if (!V) V = ConstantExpr::getCast(Opc, Op, CurTy);
       }
       break;
     }
     case bitc::CST_CODE_CE_INBOUNDS_GEP:
     case bitc::CST_CODE_CE_GEP: {  // CE_GEP:        [n x operands]
-      if (Record.size() & 1) return Error("Invalid CE_GEP record");
+      if (Record.size() & 1)
+        return Error(InvalidRecord);
       SmallVector<Constant*, 16> Elts;
       for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
         Type *ElTy = getTypeByID(Record[i]);
-        if (!ElTy) return Error("Invalid CE_GEP record");
+        if (!ElTy)
+          return Error(InvalidRecord);
         Elts.push_back(ValueList.getConstantFwdRef(Record[i+1], ElTy));
       }
       ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
@@ -1384,19 +1379,31 @@ bool BitcodeReader::ParseConstants() {
                                            bitc::CST_CODE_CE_INBOUNDS_GEP);
       break;
     }
-    case bitc::CST_CODE_CE_SELECT:  // CE_SELECT: [opval#, opval#, opval#]
-      if (Record.size() < 3) return Error("Invalid CE_SELECT record");
-      V = ConstantExpr::getSelect(
-                          ValueList.getConstantFwdRef(Record[0],
-                                                      Type::getInt1Ty(Context)),
-                          ValueList.getConstantFwdRef(Record[1],CurTy),
-                          ValueList.getConstantFwdRef(Record[2],CurTy));
+    case bitc::CST_CODE_CE_SELECT: {  // CE_SELECT: [opval#, opval#, opval#]
+      if (Record.size() < 3)
+        return Error(InvalidRecord);
+
+      Type *SelectorTy = Type::getInt1Ty(Context);
+
+      // If CurTy is a vector of length n, then Record[0] must be a <n x i1>
+      // vector. Otherwise, it must be a single bit.
+      if (VectorType *VTy = dyn_cast<VectorType>(CurTy))
+        SelectorTy = VectorType::get(Type::getInt1Ty(Context),
+                                     VTy->getNumElements());
+
+      V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0],
+                                                              SelectorTy),
+                                  ValueList.getConstantFwdRef(Record[1],CurTy),
+                                  ValueList.getConstantFwdRef(Record[2],CurTy));
       break;
+    }
     case bitc::CST_CODE_CE_EXTRACTELT: { // CE_EXTRACTELT: [opty, opval, opval]
-      if (Record.size() < 3) return Error("Invalid CE_EXTRACTELT record");
+      if (Record.size() < 3)
+        return Error(InvalidRecord);
       VectorType *OpTy =
         dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
-      if (OpTy == 0) return Error("Invalid CE_EXTRACTELT record");
+      if (OpTy == 0)
+        return Error(InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2],
                                                   Type::getInt32Ty(Context));
@@ -1406,7 +1413,7 @@ bool BitcodeReader::ParseConstants() {
     case bitc::CST_CODE_CE_INSERTELT: { // CE_INSERTELT: [opval, opval, opval]
       VectorType *OpTy = dyn_cast<VectorType>(CurTy);
       if (Record.size() < 3 || OpTy == 0)
-        return Error("Invalid CE_INSERTELT record");
+        return Error(InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[1],
                                                   OpTy->getElementType());
@@ -1418,7 +1425,7 @@ bool BitcodeReader::ParseConstants() {
     case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval]
       VectorType *OpTy = dyn_cast<VectorType>(CurTy);
       if (Record.size() < 3 || OpTy == 0)
-        return Error("Invalid CE_SHUFFLEVEC record");
+        return Error(InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
@@ -1432,7 +1439,7 @@ bool BitcodeReader::ParseConstants() {
       VectorType *OpTy =
         dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
       if (Record.size() < 4 || RTy == 0 || OpTy == 0)
-        return Error("Invalid CE_SHUFVEC_EX record");
+        return Error(InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
       Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
@@ -1442,9 +1449,11 @@ bool BitcodeReader::ParseConstants() {
       break;
     }
     case bitc::CST_CODE_CE_CMP: {     // CE_CMP: [opty, opval, opval, pred]
-      if (Record.size() < 4) return Error("Invalid CE_CMP record");
+      if (Record.size() < 4)
+        return Error(InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
-      if (OpTy == 0) return Error("Invalid CE_CMP record");
+      if (OpTy == 0)
+        return Error(InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
 
@@ -1457,16 +1466,17 @@ bool BitcodeReader::ParseConstants() {
     // This maintains backward compatibility, pre-asm dialect keywords.
     // FIXME: Remove with the 4.0 release.
     case bitc::CST_CODE_INLINEASM_OLD: {
-      if (Record.size() < 2) return Error("Invalid INLINEASM record");
+      if (Record.size() < 2)
+        return Error(InvalidRecord);
       std::string AsmStr, ConstrStr;
       bool HasSideEffects = Record[0] & 1;
       bool IsAlignStack = Record[0] >> 1;
       unsigned AsmStrSize = Record[1];
       if (2+AsmStrSize >= Record.size())
-        return Error("Invalid INLINEASM record");
+        return Error(InvalidRecord);
       unsigned ConstStrSize = Record[2+AsmStrSize];
       if (3+AsmStrSize+ConstStrSize > Record.size())
-        return Error("Invalid INLINEASM record");
+        return Error(InvalidRecord);
 
       for (unsigned i = 0; i != AsmStrSize; ++i)
         AsmStr += (char)Record[2+i];
@@ -1480,17 +1490,18 @@ bool BitcodeReader::ParseConstants() {
     // This version adds support for the asm dialect keywords (e.g.,
     // inteldialect).
     case bitc::CST_CODE_INLINEASM: {
-      if (Record.size() < 2) return Error("Invalid INLINEASM record");
+      if (Record.size() < 2)
+        return Error(InvalidRecord);
       std::string AsmStr, ConstrStr;
       bool HasSideEffects = Record[0] & 1;
       bool IsAlignStack = (Record[0] >> 1) & 1;
       unsigned AsmDialect = Record[0] >> 2;
       unsigned AsmStrSize = Record[1];
       if (2+AsmStrSize >= Record.size())
-        return Error("Invalid INLINEASM record");
+        return Error(InvalidRecord);
       unsigned ConstStrSize = Record[2+AsmStrSize];
       if (3+AsmStrSize+ConstStrSize > Record.size())
-        return Error("Invalid INLINEASM record");
+        return Error(InvalidRecord);
 
       for (unsigned i = 0; i != AsmStrSize; ++i)
         AsmStr += (char)Record[2+i];
@@ -1503,12 +1514,15 @@ bool BitcodeReader::ParseConstants() {
       break;
     }
     case bitc::CST_CODE_BLOCKADDRESS:{
-      if (Record.size() < 3) return Error("Invalid CE_BLOCKADDRESS record");
+      if (Record.size() < 3)
+        return Error(InvalidRecord);
       Type *FnTy = getTypeByID(Record[0]);
-      if (FnTy == 0) return Error("Invalid CE_BLOCKADDRESS record");
+      if (FnTy == 0)
+        return Error(InvalidRecord);
       Function *Fn =
         dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy));
-      if (Fn == 0) return Error("Invalid CE_BLOCKADDRESS record");
+      if (Fn == 0)
+        return Error(InvalidRecord);
 
       // If the function is already parsed we can insert the block address right
       // away.
@@ -1516,7 +1530,7 @@ bool BitcodeReader::ParseConstants() {
         Function::iterator BBI = Fn->begin(), BBE = Fn->end();
         for (size_t I = 0, E = Record[2]; I != E; ++I) {
           if (BBI == BBE)
-            return Error("Invalid blockaddress block #");
+            return Error(InvalidID);
           ++BBI;
         }
         V = BlockAddress::get(Fn, BBI);
@@ -1539,9 +1553,9 @@ bool BitcodeReader::ParseConstants() {
   }
 }
 
-bool BitcodeReader::ParseUseLists() {
+error_code BitcodeReader::ParseUseLists() {
   if (Stream.EnterSubBlock(bitc::USELIST_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -1552,9 +1566,9 @@ bool BitcodeReader::ParseUseLists() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("malformed use list block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -1568,7 +1582,7 @@ bool BitcodeReader::ParseUseLists() {
     case bitc::USELIST_CODE_ENTRY: { // USELIST_CODE_ENTRY: TBD.
       unsigned RecordLength = Record.size();
       if (RecordLength < 1)
-        return Error ("Invalid UseList reader!");
+        return Error(InvalidRecord);
       UseListRecords.push_back(Record);
       break;
     }
@@ -1579,10 +1593,10 @@ bool BitcodeReader::ParseUseLists() {
 /// RememberAndSkipFunctionBody - When we see the block for a function body,
 /// remember where it is and then skip it.  This lets us lazily deserialize the
 /// functions.
-bool BitcodeReader::RememberAndSkipFunctionBody() {
+error_code BitcodeReader::RememberAndSkipFunctionBody() {
   // Get the function we are talking about.
   if (FunctionsWithBodies.empty())
-    return Error("Insufficient function protos");
+    return Error(InsufficientFunctionProtos);
 
   Function *Fn = FunctionsWithBodies.back();
   FunctionsWithBodies.pop_back();
@@ -1593,15 +1607,15 @@ bool BitcodeReader::RememberAndSkipFunctionBody() {
 
   // Skip over the function block for now.
   if (Stream.SkipBlock())
-    return Error("Malformed block record");
-  return false;
+    return Error(InvalidRecord);
+  return error_code::success();
 }
 
-bool BitcodeReader::GlobalCleanup() {
+error_code BitcodeReader::GlobalCleanup() {
   // Patch the initializers for globals and aliases up.
   ResolveGlobalAndAliasInits();
   if (!GlobalInits.empty() || !AliasInits.empty())
-    return Error("Malformed global initializer set");
+    return Error(MalformedGlobalInitializerSet);
 
   // Look for intrinsic functions which need to be upgraded at some point
   for (Module::iterator FI = TheModule->begin(), FE = TheModule->end();
@@ -1620,14 +1634,14 @@ bool BitcodeReader::GlobalCleanup() {
   // want lazy deserialization.
   std::vector<std::pair<GlobalVariable*, unsigned> >().swap(GlobalInits);
   std::vector<std::pair<GlobalAlias*, unsigned> >().swap(AliasInits);
-  return false;
+  return error_code::success();
 }
 
-bool BitcodeReader::ParseModule(bool Resume) {
+error_code BitcodeReader::ParseModule(bool Resume) {
   if (Resume)
     Stream.JumpToBit(NextUnreadBit);
   else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
   std::vector<std::string> SectionTable;
@@ -1639,8 +1653,7 @@ bool BitcodeReader::ParseModule(bool Resume) {
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      Error("malformed module block");
-      return true;
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
       return GlobalCleanup();
 
@@ -1648,49 +1661,51 @@ bool BitcodeReader::ParseModule(bool Resume) {
       switch (Entry.ID) {
       default:  // Skip unknown content.
         if (Stream.SkipBlock())
-          return Error("Malformed block record");
+          return Error(InvalidRecord);
         break;
       case bitc::BLOCKINFO_BLOCK_ID:
         if (Stream.ReadBlockInfoBlock())
-          return Error("Malformed BlockInfoBlock");
+          return Error(MalformedBlock);
         break;
       case bitc::PARAMATTR_BLOCK_ID:
-        if (ParseAttributeBlock())
-          return true;
+        if (error_code EC = ParseAttributeBlock())
+          return EC;
         break;
       case bitc::PARAMATTR_GROUP_BLOCK_ID:
-        if (ParseAttributeGroupBlock())
-          return true;
+        if (error_code EC = ParseAttributeGroupBlock())
+          return EC;
         break;
       case bitc::TYPE_BLOCK_ID_NEW:
-        if (ParseTypeTable())
-          return true;
+        if (error_code EC = ParseTypeTable())
+          return EC;
         break;
       case bitc::VALUE_SYMTAB_BLOCK_ID:
-        if (ParseValueSymbolTable())
-          return true;
+        if (error_code EC = ParseValueSymbolTable())
+          return EC;
         SeenValueSymbolTable = true;
         break;
       case bitc::CONSTANTS_BLOCK_ID:
-        if (ParseConstants() || ResolveGlobalAndAliasInits())
-          return true;
+        if (error_code EC = ParseConstants())
+          return EC;
+        if (error_code EC = ResolveGlobalAndAliasInits())
+          return EC;
         break;
       case bitc::METADATA_BLOCK_ID:
-        if (ParseMetadata())
-          return true;
+        if (error_code EC = ParseMetadata())
+          return EC;
         break;
       case bitc::FUNCTION_BLOCK_ID:
         // If this is the first function body we've seen, reverse the
         // FunctionsWithBodies list.
         if (!SeenFirstFunctionBody) {
           std::reverse(FunctionsWithBodies.begin(), FunctionsWithBodies.end());
-          if (GlobalCleanup())
-            return true;
+          if (error_code EC = GlobalCleanup())
+            return EC;
           SeenFirstFunctionBody = true;
         }
 
-        if (RememberAndSkipFunctionBody())
-          return true;
+        if (error_code EC = RememberAndSkipFunctionBody())
+          return EC;
         // For streaming bitcode, suspend parsing when we reach the function
         // bodies. Subsequent materialization calls will resume it when
         // necessary. For streaming, the function bodies must be at the end of
@@ -1699,12 +1714,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
         // just finish the parse now.
         if (LazyStreamer && SeenValueSymbolTable) {
           NextUnreadBit = Stream.GetCurrentBitNo();
-          return false;
+          return error_code::success();
         }
         break;
       case bitc::USELIST_BLOCK_ID:
-        if (ParseUseLists())
-          return true;
+        if (error_code EC = ParseUseLists())
+          return EC;
         break;
       }
       continue;
@@ -1720,11 +1735,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
     default: break;  // Default behavior, ignore unknown content.
     case bitc::MODULE_CODE_VERSION: {  // VERSION: [version#]
       if (Record.size() < 1)
-        return Error("Malformed MODULE_CODE_VERSION");
+        return Error(InvalidRecord);
       // Only version #0 and #1 are supported so far.
       unsigned module_version = Record[0];
       switch (module_version) {
-        default: return Error("Unknown bitstream version!");
+        default:
+          return Error(InvalidValue);
         case 0:
           UseRelativeIDs = false;
           break;
@@ -1737,21 +1753,21 @@ bool BitcodeReader::ParseModule(bool Resume) {
     case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_TRIPLE record");
+        return Error(InvalidRecord);
       TheModule->setTargetTriple(S);
       break;
     }
     case bitc::MODULE_CODE_DATALAYOUT: {  // DATALAYOUT: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_DATALAYOUT record");
+        return Error(InvalidRecord);
       TheModule->setDataLayout(S);
       break;
     }
     case bitc::MODULE_CODE_ASM: {  // ASM: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_ASM record");
+        return Error(InvalidRecord);
       TheModule->setModuleInlineAsm(S);
       break;
     }
@@ -1759,21 +1775,21 @@ bool BitcodeReader::ParseModule(bool Resume) {
       // FIXME: Remove in 4.0.
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_DEPLIB record");
+        return Error(InvalidRecord);
       // Ignore value.
       break;
     }
     case bitc::MODULE_CODE_SECTIONNAME: {  // SECTIONNAME: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_SECTIONNAME record");
+        return Error(InvalidRecord);
       SectionTable.push_back(S);
       break;
     }
     case bitc::MODULE_CODE_GCNAME: {  // SECTIONNAME: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_GCNAME record");
+        return Error(InvalidRecord);
       GCTable.push_back(S);
       break;
     }
@@ -1782,11 +1798,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
     //             unnamed_addr]
     case bitc::MODULE_CODE_GLOBALVAR: {
       if (Record.size() < 6)
-        return Error("Invalid MODULE_CODE_GLOBALVAR record");
+        return Error(InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
-      if (!Ty) return Error("Invalid MODULE_CODE_GLOBALVAR record");
+      if (!Ty)
+        return Error(InvalidRecord);
       if (!Ty->isPointerTy())
-        return Error("Global not a pointer type!");
+        return Error(InvalidTypeForValue);
       unsigned AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
       Ty = cast<PointerType>(Ty)->getElementType();
 
@@ -1796,7 +1813,7 @@ bool BitcodeReader::ParseModule(bool Resume) {
       std::string Section;
       if (Record[5]) {
         if (Record[5]-1 >= SectionTable.size())
-          return Error("Invalid section ID");
+          return Error(InvalidID);
         Section = SectionTable[Record[5]-1];
       }
       GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
@@ -1835,15 +1852,16 @@ bool BitcodeReader::ParseModule(bool Resume) {
     //             alignment, section, visibility, gc, unnamed_addr]
     case bitc::MODULE_CODE_FUNCTION: {
       if (Record.size() < 8)
-        return Error("Invalid MODULE_CODE_FUNCTION record");
+        return Error(InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
-      if (!Ty) return Error("Invalid MODULE_CODE_FUNCTION record");
+      if (!Ty)
+        return Error(InvalidRecord);
       if (!Ty->isPointerTy())
-        return Error("Function not a pointer type!");
+        return Error(InvalidTypeForValue);
       FunctionType *FTy =
         dyn_cast<FunctionType>(cast<PointerType>(Ty)->getElementType());
       if (!FTy)
-        return Error("Function not a pointer to function type!");
+        return Error(InvalidTypeForValue);
 
       Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage,
                                         "", TheModule);
@@ -1856,19 +1874,21 @@ bool BitcodeReader::ParseModule(bool Resume) {
       Func->setAlignment((1 << Record[5]) >> 1);
       if (Record[6]) {
         if (Record[6]-1 >= SectionTable.size())
-          return Error("Invalid section ID");
+          return Error(InvalidID);
         Func->setSection(SectionTable[Record[6]-1]);
       }
       Func->setVisibility(GetDecodedVisibility(Record[7]));
       if (Record.size() > 8 && Record[8]) {
         if (Record[8]-1 > GCTable.size())
-          return Error("Invalid GC ID");
+          return Error(InvalidID);
         Func->setGC(GCTable[Record[8]-1].c_str());
       }
       bool UnnamedAddr = false;
       if (Record.size() > 9)
         UnnamedAddr = Record[9];
       Func->setUnnamedAddr(UnnamedAddr);
+      if (Record.size() > 10 && Record[10] != 0)
+        FunctionPrefixes.push_back(std::make_pair(Func, Record[10]-1));
       ValueList.push_back(Func);
 
       // If this is a function with a body, remember the prototype we are
@@ -1883,11 +1903,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
     // ALIAS: [alias type, aliasee val#, linkage, visibility]
     case bitc::MODULE_CODE_ALIAS: {
       if (Record.size() < 3)
-        return Error("Invalid MODULE_ALIAS record");
+        return Error(InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
-      if (!Ty) return Error("Invalid MODULE_ALIAS record");
+      if (!Ty)
+        return Error(InvalidRecord);
       if (!Ty->isPointerTy())
-        return Error("Function not a pointer type!");
+        return Error(InvalidTypeForValue);
 
       GlobalAlias *NewGA = new GlobalAlias(Ty, GetDecodedLinkage(Record[2]),
                                            "", 0, TheModule);
@@ -1902,7 +1923,7 @@ bool BitcodeReader::ParseModule(bool Resume) {
     case bitc::MODULE_CODE_PURGEVALS:
       // Trim down the value list to the specified size.
       if (Record.size() < 1 || Record[0] > ValueList.size())
-        return Error("Invalid MODULE_PURGEVALS record");
+        return Error(InvalidRecord);
       ValueList.shrinkTo(Record[0]);
       break;
     }
@@ -1910,10 +1931,11 @@ bool BitcodeReader::ParseModule(bool Resume) {
   }
 }
 
-bool BitcodeReader::ParseBitcodeInto(Module *M) {
+error_code BitcodeReader::ParseBitcodeInto(Module *M) {
   TheModule = 0;
 
-  if (InitStream()) return true;
+  if (error_code EC = InitStream())
+    return EC;
 
   // Sniff for the signature.
   if (Stream.Read(8) != 'B' ||
@@ -1922,42 +1944,42 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
       Stream.Read(4) != 0xC ||
       Stream.Read(4) != 0xE ||
       Stream.Read(4) != 0xD)
-    return Error("Invalid bitcode signature");
+    return Error(InvalidBitcodeSignature);
 
   // We expect a number of well-defined blocks, though we don't necessarily
   // need to understand them all.
   while (1) {
     if (Stream.AtEndOfStream())
-      return false;
+      return error_code::success();
 
     BitstreamEntry Entry =
       Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs);
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      Error("malformed module file");
-      return true;
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
 
     case BitstreamEntry::SubBlock:
       switch (Entry.ID) {
       case bitc::BLOCKINFO_BLOCK_ID:
         if (Stream.ReadBlockInfoBlock())
-          return Error("Malformed BlockInfoBlock");
+          return Error(MalformedBlock);
         break;
       case bitc::MODULE_BLOCK_ID:
         // Reject multiple MODULE_BLOCK's in a single bitstream.
         if (TheModule)
-          return Error("Multiple MODULE_BLOCKs in same stream");
+          return Error(InvalidMultipleBlocks);
         TheModule = M;
-        if (ParseModule(false))
-          return true;
-        if (LazyStreamer) return false;
+        if (error_code EC = ParseModule(false))
+          return EC;
+        if (LazyStreamer)
+          return error_code::success();
         break;
       default:
         if (Stream.SkipBlock())
-          return Error("Malformed block record");
+          return Error(InvalidRecord);
         break;
       }
       continue;
@@ -1970,16 +1992,16 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
       if (Stream.getAbbrevIDWidth() == 2 && Entry.ID == 2 &&
           Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a &&
           Stream.AtEndOfStream())
-        return false;
+        return error_code::success();
 
-      return Error("Invalid record at top-level");
+      return Error(InvalidRecord);
     }
   }
 }
 
-bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
+error_code BitcodeReader::ParseModuleTriple(std::string &Triple) {
   if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -1990,9 +2012,9 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("malformed module block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -2004,7 +2026,7 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
     case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_TRIPLE record");
+        return Error(InvalidRecord);
       Triple = S;
       break;
     }
@@ -2013,8 +2035,9 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
   }
 }
 
-bool BitcodeReader::ParseTriple(std::string &Triple) {
-  if (InitStream()) return true;
+error_code BitcodeReader::ParseTriple(std::string &Triple) {
+  if (error_code EC = InitStream())
+    return EC;
 
   // Sniff for the signature.
   if (Stream.Read(8) != 'B' ||
@@ -2023,7 +2046,7 @@ bool BitcodeReader::ParseTriple(std::string &Triple) {
       Stream.Read(4) != 0xC ||
       Stream.Read(4) != 0xE ||
       Stream.Read(4) != 0xD)
-    return Error("Invalid bitcode signature");
+    return Error(InvalidBitcodeSignature);
 
   // We expect a number of well-defined blocks, though we don't necessarily
   // need to understand them all.
@@ -2032,20 +2055,17 @@ bool BitcodeReader::ParseTriple(std::string &Triple) {
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      Error("malformed module file");
-      return true;
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
 
     case BitstreamEntry::SubBlock:
       if (Entry.ID == bitc::MODULE_BLOCK_ID)
         return ParseModuleTriple(Triple);
 
       // Ignore other sub-blocks.
-      if (Stream.SkipBlock()) {
-        Error("malformed block record in AST file");
-        return true;
-      }
+      if (Stream.SkipBlock())
+        return Error(MalformedBlock);
       continue;
 
     case BitstreamEntry::Record:
@@ -2056,9 +2076,9 @@ bool BitcodeReader::ParseTriple(std::string &Triple) {
 }
 
 /// ParseMetadataAttachment - Parse metadata attachments.
-bool BitcodeReader::ParseMetadataAttachment() {
+error_code BitcodeReader::ParseMetadataAttachment() {
   if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
   while (1) {
@@ -2067,9 +2087,9 @@ bool BitcodeReader::ParseMetadataAttachment() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("malformed metadata block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -2083,16 +2103,18 @@ bool BitcodeReader::ParseMetadataAttachment() {
     case bitc::METADATA_ATTACHMENT: {
       unsigned RecordLength = Record.size();
       if (Record.empty() || (RecordLength - 1) % 2 == 1)
-        return Error ("Invalid METADATA_ATTACHMENT reader!");
+        return Error(InvalidRecord);
       Instruction *Inst = InstructionList[Record[0]];
       for (unsigned i = 1; i != RecordLength; i = i+2) {
         unsigned Kind = Record[i];
         DenseMap<unsigned, unsigned>::iterator I =
           MDKindMap.find(Kind);
         if (I == MDKindMap.end())
-          return Error("Invalid metadata kind ID");
+          return Error(InvalidID);
         Value *Node = MDValueList.getValueFwdRef(Record[i+1]);
         Inst->setMetadata(I->second, cast<MDNode>(Node));
+        if (I->second == LLVMContext::MD_tbaa)
+          InstsWithTBAATag.push_back(Inst);
       }
       break;
     }
@@ -2101,9 +2123,9 @@ bool BitcodeReader::ParseMetadataAttachment() {
 }
 
 /// ParseFunctionBody - Lazily parse the specified function body block.
-bool BitcodeReader::ParseFunctionBody(Function *F) {
+error_code BitcodeReader::ParseFunctionBody(Function *F) {
   if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   InstructionList.clear();
   unsigned ModuleValueListSize = ValueList.size();
@@ -2126,7 +2148,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      return Error("Bitcode error in function block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
       goto OutOfRecordLoop;
 
@@ -2134,20 +2156,24 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       switch (Entry.ID) {
       default:  // Skip unknown content.
         if (Stream.SkipBlock())
-          return Error("Malformed block record");
+          return Error(InvalidRecord);
         break;
       case bitc::CONSTANTS_BLOCK_ID:
-        if (ParseConstants()) return true;
+        if (error_code EC = ParseConstants())
+          return EC;
         NextValueNo = ValueList.size();
         break;
       case bitc::VALUE_SYMTAB_BLOCK_ID:
-        if (ParseValueSymbolTable()) return true;
+        if (error_code EC = ParseValueSymbolTable())
+          return EC;
         break;
       case bitc::METADATA_ATTACHMENT_ID:
-        if (ParseMetadataAttachment()) return true;
+        if (error_code EC = ParseMetadataAttachment())
+          return EC;
         break;
       case bitc::METADATA_BLOCK_ID:
-        if (ParseMetadata()) return true;
+        if (error_code EC = ParseMetadata())
+          return EC;
         break;
       }
       continue;
@@ -2163,10 +2189,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     unsigned BitCode = Stream.readRecord(Entry.ID, Record);
     switch (BitCode) {
     default: // Default behavior: reject
-      return Error("Unknown instruction");
+      return Error(InvalidValue);
     case bitc::FUNC_CODE_DECLAREBLOCKS:     // DECLAREBLOCKS: [nblocks]
       if (Record.size() < 1 || Record[0] == 0)
-        return Error("Invalid DECLAREBLOCKS record");
+        return Error(InvalidRecord);
       // Create all the basic blocks for the function.
       FunctionBBs.resize(Record[0]);
       for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i)
@@ -2186,7 +2212,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
                !FunctionBBs[CurBBNo-1]->empty())
         I = &FunctionBBs[CurBBNo-1]->back();
 
-      if (I == 0) return Error("Invalid DEBUG_LOC_AGAIN record");
+      if (I == 0)
+        return Error(InvalidRecord);
       I->setDebugLoc(LastLoc);
       I = 0;
       continue;
@@ -2199,7 +2226,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
                !FunctionBBs[CurBBNo-1]->empty())
         I = &FunctionBBs[CurBBNo-1]->back();
       if (I == 0 || Record.size() < 4)
-        return Error("Invalid FUNC_CODE_DEBUG_LOC record");
+        return Error(InvalidRecord);
 
       unsigned Line = Record[0], Col = Record[1];
       unsigned ScopeID = Record[2], IAID = Record[3];
@@ -2219,10 +2246,11 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
           popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
           OpNum+1 > Record.size())
-        return Error("Invalid BINOP record");
+        return Error(InvalidRecord);
 
       int Opc = GetDecodedBinaryOpcode(Record[OpNum++], LHS->getType());
-      if (Opc == -1) return Error("Invalid BINOP record");
+      if (Opc == -1)
+        return Error(InvalidRecord);
       I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
       InstructionList.push_back(I);
       if (OpNum < Record.size()) {
@@ -2264,13 +2292,21 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
           OpNum+2 != Record.size())
-        return Error("Invalid CAST record");
+        return Error(InvalidRecord);
 
       Type *ResTy = getTypeByID(Record[OpNum]);
       int Opc = GetDecodedCastOpcode(Record[OpNum+1]);
       if (Opc == -1 || ResTy == 0)
-        return Error("Invalid CAST record");
-      I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy);
+        return Error(InvalidRecord);
+      Instruction *Temp = 0;
+      if ((I = UpgradeBitCastInst(Opc, Op, ResTy, Temp))) {
+        if (Temp) {
+          InstructionList.push_back(Temp);
+          CurBB->getInstList().push_back(Temp);
+        }
+      } else {
+        I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy);
+      }
       InstructionList.push_back(I);
       break;
     }
@@ -2279,13 +2315,13 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 0;
       Value *BasePtr;
       if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr))
-        return Error("Invalid GEP record");
+        return Error(InvalidRecord);
 
       SmallVector<Value*, 16> GEPIdx;
       while (OpNum != Record.size()) {
         Value *Op;
         if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-          return Error("Invalid GEP record");
+          return Error(InvalidRecord);
         GEPIdx.push_back(Op);
       }
 
@@ -2301,14 +2337,14 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 0;
       Value *Agg;
       if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
-        return Error("Invalid EXTRACTVAL record");
+        return Error(InvalidRecord);
 
       SmallVector<unsigned, 4> EXTRACTVALIdx;
       for (unsigned RecSize = Record.size();
            OpNum != RecSize; ++OpNum) {
         uint64_t Index = Record[OpNum];
         if ((unsigned)Index != Index)
-          return Error("Invalid EXTRACTVAL index");
+          return Error(InvalidValue);
         EXTRACTVALIdx.push_back((unsigned)Index);
       }
 
@@ -2322,17 +2358,17 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 0;
       Value *Agg;
       if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
-        return Error("Invalid INSERTVAL record");
+        return Error(InvalidRecord);
       Value *Val;
       if (getValueTypePair(Record, OpNum, NextValueNo, Val))
-        return Error("Invalid INSERTVAL record");
+        return Error(InvalidRecord);
 
       SmallVector<unsigned, 4> INSERTVALIdx;
       for (unsigned RecSize = Record.size();
            OpNum != RecSize; ++OpNum) {
         uint64_t Index = Record[OpNum];
         if ((unsigned)Index != Index)
-          return Error("Invalid INSERTVAL index");
+          return Error(InvalidValue);
         INSERTVALIdx.push_back((unsigned)Index);
       }
 
@@ -2349,7 +2385,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
           popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
           popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), Cond))
-        return Error("Invalid SELECT record");
+        return Error(InvalidRecord);
 
       I = SelectInst::Create(Cond, TrueVal, FalseVal);
       InstructionList.push_back(I);
@@ -2364,18 +2400,18 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
           popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
           getValueTypePair(Record, OpNum, NextValueNo, Cond))
-        return Error("Invalid SELECT record");
+        return Error(InvalidRecord);
 
       // select condition can be either i1 or [N x i1]
       if (VectorType* vector_type =
           dyn_cast<VectorType>(Cond->getType())) {
         // expect <n x i1>
         if (vector_type->getElementType() != Type::getInt1Ty(Context))
-          return Error("Invalid SELECT condition type");
+          return Error(InvalidTypeForValue);
       } else {
         // expect i1
         if (Cond->getType() != Type::getInt1Ty(Context))
-          return Error("Invalid SELECT condition type");
+          return Error(InvalidTypeForValue);
       }
 
       I = SelectInst::Create(Cond, TrueVal, FalseVal);
@@ -2388,7 +2424,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Vec, *Idx;
       if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
           popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx))
-        return Error("Invalid EXTRACTELT record");
+        return Error(InvalidRecord);
       I = ExtractElementInst::Create(Vec, Idx);
       InstructionList.push_back(I);
       break;
@@ -2401,7 +2437,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                    cast<VectorType>(Vec->getType())->getElementType(), Elt) ||
           popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx))
-        return Error("Invalid INSERTELT record");
+        return Error(InvalidRecord);
       I = InsertElementInst::Create(Vec, Elt, Idx);
       InstructionList.push_back(I);
       break;
@@ -2412,10 +2448,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Vec1, *Vec2, *Mask;
       if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) ||
           popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec2))
-        return Error("Invalid SHUFFLEVEC record");
+        return Error(InvalidRecord);
 
       if (getValueTypePair(Record, OpNum, NextValueNo, Mask))
-        return Error("Invalid SHUFFLEVEC record");
+        return Error(InvalidRecord);
       I = new ShuffleVectorInst(Vec1, Vec2, Mask);
       InstructionList.push_back(I);
       break;
@@ -2433,7 +2469,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
           popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
           OpNum+1 != Record.size())
-        return Error("Invalid CMP record");
+        return Error(InvalidRecord);
 
       if (LHS->getType()->isFPOrFPVectorTy())
         I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
@@ -2455,9 +2491,9 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         unsigned OpNum = 0;
         Value *Op = NULL;
         if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-          return Error("Invalid RET record");
+          return Error(InvalidRecord);
         if (OpNum != Record.size())
-          return Error("Invalid RET record");
+          return Error(InvalidRecord);
 
         I = ReturnInst::Create(Context, Op);
         InstructionList.push_back(I);
@@ -2465,10 +2501,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       }
     case bitc::FUNC_CODE_INST_BR: { // BR: [bb#, bb#, opval] or [bb#]
       if (Record.size() != 1 && Record.size() != 3)
-        return Error("Invalid BR record");
+        return Error(InvalidRecord);
       BasicBlock *TrueDest = getBasicBlock(Record[0]);
       if (TrueDest == 0)
-        return Error("Invalid BR record");
+        return Error(InvalidRecord);
 
       if (Record.size() == 1) {
         I = BranchInst::Create(TrueDest);
@@ -2479,7 +2515,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         Value *Cond = getValue(Record, 2, NextValueNo,
                                Type::getInt1Ty(Context));
         if (FalseDest == 0 || Cond == 0)
-          return Error("Invalid BR record");
+          return Error(InvalidRecord);
         I = BranchInst::Create(TrueDest, FalseDest, Cond);
         InstructionList.push_back(I);
       }
@@ -2488,7 +2524,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...]
       // Check magic
       if ((Record[0] >> 16) == SWITCH_INST_MAGIC) {
-        // New SwitchInst format with case ranges.
+        // "New" SwitchInst format with case ranges. The changes to write this
+        // format were reverted but we still recognize bitcode that uses it.
+        // Hopefully someday we will have support for case ranges and can use
+        // this format again.
 
         Type *OpTy = getTypeByID(Record[1]);
         unsigned ValueBitWidth = cast<IntegerType>(OpTy)->getBitWidth();
@@ -2496,7 +2535,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         Value *Cond = getValue(Record, 2, NextValueNo, OpTy);
         BasicBlock *Default = getBasicBlock(Record[3]);
         if (OpTy == 0 || Cond == 0 || Default == 0)
-          return Error("Invalid SWITCH record");
+          return Error(InvalidRecord);
 
         unsigned NumCases = Record[4];
 
@@ -2505,7 +2544,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
 
         unsigned CurIdx = 5;
         for (unsigned i = 0; i != NumCases; ++i) {
-          IntegersSubsetToBB CaseBuilder;
+          SmallVector<ConstantInt*, 1> CaseVals;
           unsigned NumItems = Record[CurIdx++];
           for (unsigned ci = 0; ci != NumItems; ++ci) {
             bool isSingleNumber = Record[CurIdx++];
@@ -2525,20 +2564,22 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
               APInt High =
                   ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords),
                                 ValueBitWidth);
-
-              CaseBuilder.add(IntItem::fromType(OpTy, Low),
-                              IntItem::fromType(OpTy, High));
               CurIdx += ActiveWords;
+
+              // FIXME: It is not clear whether values in the range should be
+              // compared as signed or unsigned values. The partially
+              // implemented changes that used this format in the past used
+              // unsigned comparisons.
+              for ( ; Low.ule(High); ++Low)
+                CaseVals.push_back(ConstantInt::get(Context, Low));
             } else
-              CaseBuilder.add(IntItem::fromType(OpTy, Low));
+              CaseVals.push_back(ConstantInt::get(Context, Low));
           }
           BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]);
-          IntegersSubset Case = CaseBuilder.getCase();
-          SI->addCase(Case, DestBB);
+          for (SmallVector<ConstantInt*, 1>::iterator cvi = CaseVals.begin(),
+                 cve = CaseVals.end(); cvi != cve; ++cvi)
+            SI->addCase(*cvi, DestBB);
         }
-        uint16_t Hash = SI->hash();
-        if (Hash != (Record[0] & 0xFFFF))
-          return Error("Invalid SWITCH record");
         I = SI;
         break;
       }
@@ -2546,12 +2587,12 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       // Old SwitchInst format without case ranges.
 
       if (Record.size() < 3 || (Record.size() & 1) == 0)
-        return Error("Invalid SWITCH record");
+        return Error(InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
       Value *Cond = getValue(Record, 1, NextValueNo, OpTy);
       BasicBlock *Default = getBasicBlock(Record[2]);
       if (OpTy == 0 || Cond == 0 || Default == 0)
-        return Error("Invalid SWITCH record");
+        return Error(InvalidRecord);
       unsigned NumCases = (Record.size()-3)/2;
       SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases);
       InstructionList.push_back(SI);
@@ -2561,7 +2602,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]);
         if (CaseVal == 0 || DestBB == 0) {
           delete SI;
-          return Error("Invalid SWITCH record!");
+          return Error(InvalidRecord);
         }
         SI->addCase(CaseVal, DestBB);
       }
@@ -2570,11 +2611,11 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     }
     case bitc::FUNC_CODE_INST_INDIRECTBR: { // INDIRECTBR: [opty, op0, op1, ...]
       if (Record.size() < 2)
-        return Error("Invalid INDIRECTBR record");
+        return Error(InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
       Value *Address = getValue(Record, 1, NextValueNo, OpTy);
       if (OpTy == 0 || Address == 0)
-        return Error("Invalid INDIRECTBR record");
+        return Error(InvalidRecord);
       unsigned NumDests = Record.size()-2;
       IndirectBrInst *IBI = IndirectBrInst::Create(Address, NumDests);
       InstructionList.push_back(IBI);
@@ -2583,7 +2624,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           IBI->addDestination(DestBB);
         } else {
           delete IBI;
-          return Error("Invalid INDIRECTBR record!");
+          return Error(InvalidRecord);
         }
       }
       I = IBI;
@@ -2592,7 +2633,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
 
     case bitc::FUNC_CODE_INST_INVOKE: {
       // INVOKE: [attrs, cc, normBB, unwindBB, fnty, op0,op1,op2, ...]
-      if (Record.size() < 4) return Error("Invalid INVOKE record");
+      if (Record.size() < 4)
+        return Error(InvalidRecord);
       AttributeSet PAL = getAttributes(Record[0]);
       unsigned CCInfo = Record[1];
       BasicBlock *NormalBB = getBasicBlock(Record[2]);
@@ -2601,7 +2643,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 4;
       Value *Callee;
       if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
-        return Error("Invalid INVOKE record");
+        return Error(InvalidRecord);
 
       PointerType *CalleeTy = dyn_cast<PointerType>(Callee->getType());
       FunctionType *FTy = !CalleeTy ? 0 :
@@ -2610,24 +2652,25 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       // Check that the right number of fixed parameters are here.
       if (FTy == 0 || NormalBB == 0 || UnwindBB == 0 ||
           Record.size() < OpNum+FTy->getNumParams())
-        return Error("Invalid INVOKE record");
+        return Error(InvalidRecord);
 
       SmallVector<Value*, 16> Ops;
       for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
         Ops.push_back(getValue(Record, OpNum, NextValueNo,
                                FTy->getParamType(i)));
-        if (Ops.back() == 0) return Error("Invalid INVOKE record");
+        if (Ops.back() == 0)
+          return Error(InvalidRecord);
       }
 
       if (!FTy->isVarArg()) {
         if (Record.size() != OpNum)
-          return Error("Invalid INVOKE record");
+          return Error(InvalidRecord);
       } else {
         // Read type/value pairs for varargs params.
         while (OpNum != Record.size()) {
           Value *Op;
           if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-            return Error("Invalid INVOKE record");
+            return Error(InvalidRecord);
           Ops.push_back(Op);
         }
       }
@@ -2643,7 +2686,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned Idx = 0;
       Value *Val = 0;
       if (getValueTypePair(Record, Idx, NextValueNo, Val))
-        return Error("Invalid RESUME record");
+        return Error(InvalidRecord);
       I = ResumeInst::Create(Val);
       InstructionList.push_back(I);
       break;
@@ -2654,9 +2697,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       break;
     case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...]
       if (Record.size() < 1 || ((Record.size()-1)&1))
-        return Error("Invalid PHI record");
+        return Error(InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
-      if (!Ty) return Error("Invalid PHI record");
+      if (!Ty)
+        return Error(InvalidRecord);
 
       PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2);
       InstructionList.push_back(PN);
@@ -2671,7 +2715,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         else
           V = getValue(Record, 1+i, NextValueNo, Ty);
         BasicBlock *BB = getBasicBlock(Record[2+i]);
-        if (!V || !BB) return Error("Invalid PHI record");
+        if (!V || !BB)
+          return Error(InvalidRecord);
         PN->addIncoming(V, BB);
       }
       I = PN;
@@ -2682,12 +2727,13 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       // LANDINGPAD: [ty, val, val, num, (id0,val0 ...)?]
       unsigned Idx = 0;
       if (Record.size() < 4)
-        return Error("Invalid LANDINGPAD record");
+        return Error(InvalidRecord);
       Type *Ty = getTypeByID(Record[Idx++]);
-      if (!Ty) return Error("Invalid LANDINGPAD record");
+      if (!Ty)
+        return Error(InvalidRecord);
       Value *PersFn = 0;
       if (getValueTypePair(Record, Idx, NextValueNo, PersFn))
-        return Error("Invalid LANDINGPAD record");
+        return Error(InvalidRecord);
 
       bool IsCleanup = !!Record[Idx++];
       unsigned NumClauses = Record[Idx++];
@@ -2700,7 +2746,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
 
         if (getValueTypePair(Record, Idx, NextValueNo, Val)) {
           delete LP;
-          return Error("Invalid LANDINGPAD record");
+          return Error(InvalidRecord);
         }
 
         assert((CT != LandingPadInst::Catch ||
@@ -2719,13 +2765,14 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
 
     case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align]
       if (Record.size() != 4)
-        return Error("Invalid ALLOCA record");
+        return Error(InvalidRecord);
       PointerType *Ty =
         dyn_cast_or_null<PointerType>(getTypeByID(Record[0]));
       Type *OpTy = getTypeByID(Record[1]);
       Value *Size = getFnValueByID(Record[2], OpTy);
       unsigned Align = Record[3];
-      if (!Ty || !Size) return Error("Invalid ALLOCA record");
+      if (!Ty || !Size)
+        return Error(InvalidRecord);
       I = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1);
       InstructionList.push_back(I);
       break;
@@ -2735,7 +2782,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
           OpNum+2 != Record.size())
-        return Error("Invalid LOAD record");
+        return Error(InvalidRecord);
 
       I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1);
       InstructionList.push_back(I);
@@ -2747,15 +2794,15 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
           OpNum+4 != Record.size())
-        return Error("Invalid LOADATOMIC record");
+        return Error(InvalidRecord);
 
 
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Release ||
           Ordering == AcquireRelease)
-        return Error("Invalid LOADATOMIC record");
+        return Error(InvalidRecord);
       if (Ordering != NotAtomic && Record[OpNum] == 0)
-        return Error("Invalid LOADATOMIC record");
+        return Error(InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
 
       I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1,
@@ -2770,7 +2817,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+2 != Record.size())
-        return Error("Invalid STORE record");
+        return Error(InvalidRecord);
 
       I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1);
       InstructionList.push_back(I);
@@ -2784,15 +2831,15 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+4 != Record.size())
-        return Error("Invalid STOREATOMIC record");
+        return Error(InvalidRecord);
 
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Acquire ||
           Ordering == AcquireRelease)
-        return Error("Invalid STOREATOMIC record");
+        return Error(InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
       if (Ordering != NotAtomic && Record[OpNum] == 0)
-        return Error("Invalid STOREATOMIC record");
+        return Error(InvalidRecord);
 
       I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1,
                         Ordering, SynchScope);
@@ -2809,10 +2856,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), New) ||
           OpNum+3 != Record.size())
-        return Error("Invalid CMPXCHG record");
+        return Error(InvalidRecord);
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+1]);
       if (Ordering == NotAtomic || Ordering == Unordered)
-        return Error("Invalid CMPXCHG record");
+        return Error(InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+2]);
       I = new AtomicCmpXchgInst(Ptr, Cmp, New, Ordering, SynchScope);
       cast<AtomicCmpXchgInst>(I)->setVolatile(Record[OpNum]);
@@ -2827,14 +2874,14 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+4 != Record.size())
-        return Error("Invalid ATOMICRMW record");
+        return Error(InvalidRecord);
       AtomicRMWInst::BinOp Operation = GetDecodedRMWOperation(Record[OpNum]);
       if (Operation < AtomicRMWInst::FIRST_BINOP ||
           Operation > AtomicRMWInst::LAST_BINOP)
-        return Error("Invalid ATOMICRMW record");
+        return Error(InvalidRecord);
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Unordered)
-        return Error("Invalid ATOMICRMW record");
+        return Error(InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
       I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SynchScope);
       cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
@@ -2843,11 +2890,11 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     }
     case bitc::FUNC_CODE_INST_FENCE: { // FENCE:[ordering, synchscope]
       if (2 != Record.size())
-        return Error("Invalid FENCE record");
+        return Error(InvalidRecord);
       AtomicOrdering Ordering = GetDecodedOrdering(Record[0]);
       if (Ordering == NotAtomic || Ordering == Unordered ||
           Ordering == Monotonic)
-        return Error("Invalid FENCE record");
+        return Error(InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[1]);
       I = new FenceInst(Context, Ordering, SynchScope);
       InstructionList.push_back(I);
@@ -2856,7 +2903,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     case bitc::FUNC_CODE_INST_CALL: {
       // CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...]
       if (Record.size() < 3)
-        return Error("Invalid CALL record");
+        return Error(InvalidRecord);
 
       AttributeSet PAL = getAttributes(Record[0]);
       unsigned CCInfo = Record[1];
@@ -2864,13 +2911,13 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 2;
       Value *Callee;
       if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
-        return Error("Invalid CALL record");
+        return Error(InvalidRecord);
 
       PointerType *OpTy = dyn_cast<PointerType>(Callee->getType());
       FunctionType *FTy = 0;
       if (OpTy) FTy = dyn_cast<FunctionType>(OpTy->getElementType());
       if (!FTy || Record.size() < FTy->getNumParams()+OpNum)
-        return Error("Invalid CALL record");
+        return Error(InvalidRecord);
 
       SmallVector<Value*, 16> Args;
       // Read the fixed params.
@@ -2880,18 +2927,19 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         else
           Args.push_back(getValue(Record, OpNum, NextValueNo,
                                   FTy->getParamType(i)));
-        if (Args.back() == 0) return Error("Invalid CALL record");
+        if (Args.back() == 0)
+          return Error(InvalidRecord);
       }
 
       // Read type/value pairs for varargs params.
       if (!FTy->isVarArg()) {
         if (OpNum != Record.size())
-          return Error("Invalid CALL record");
+          return Error(InvalidRecord);
       } else {
         while (OpNum != Record.size()) {
           Value *Op;
           if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-            return Error("Invalid CALL record");
+            return Error(InvalidRecord);
           Args.push_back(Op);
         }
       }
@@ -2906,12 +2954,12 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     }
     case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty]
       if (Record.size() < 3)
-        return Error("Invalid VAARG record");
+        return Error(InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
       Value *Op = getValue(Record, 1, NextValueNo, OpTy);
       Type *ResTy = getTypeByID(Record[2]);
       if (!OpTy || !Op || !ResTy)
-        return Error("Invalid VAARG record");
+        return Error(InvalidRecord);
       I = new VAArgInst(Op, ResTy);
       InstructionList.push_back(I);
       break;
@@ -2922,7 +2970,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     // this file.
     if (CurBB == 0) {
       delete I;
-      return Error("Invalid instruction with no BB");
+      return Error(InvalidInstructionWithNoBB);
     }
     CurBB->getInstList().push_back(I);
 
@@ -2949,7 +2997,7 @@ OutOfRecordLoop:
           delete A;
         }
       }
-      return Error("Never resolved value found in function!");
+      return Error(NeverResolvedValueFoundInFunction);
     }
   }
 
@@ -2965,7 +3013,7 @@ OutOfRecordLoop:
     for (unsigned i = 0, e = RefList.size(); i != e; ++i) {
       unsigned BlockIdx = RefList[i].first;
       if (BlockIdx >= FunctionBBs.size())
-        return Error("Invalid blockaddress block #");
+        return Error(InvalidID);
 
       GlobalVariable *FwdRef = RefList[i].second;
       FwdRef->replaceAllUsesWith(BlockAddress::get(F, FunctionBBs[BlockIdx]));
@@ -2979,20 +3027,21 @@ OutOfRecordLoop:
   ValueList.shrinkTo(ModuleValueListSize);
   MDValueList.shrinkTo(ModuleMDValueListSize);
   std::vector<BasicBlock*>().swap(FunctionBBs);
-  return false;
+  return error_code::success();
 }
 
-/// FindFunctionInStream - Find the function body in the bitcode stream
-bool BitcodeReader::FindFunctionInStream(Function *F,
+/// Find the function body in the bitcode stream
+error_code BitcodeReader::FindFunctionInStream(Function *F,
        DenseMap<Function*, uint64_t>::iterator DeferredFunctionInfoIterator) {
   while (DeferredFunctionInfoIterator->second == 0) {
     if (Stream.AtEndOfStream())
-      return Error("Could not find Function in stream");
+      return Error(CouldNotFindFunctionInStream);
     // ParseModule will parse the next body in the stream and set its
     // position in the DeferredFunctionInfo map.
-    if (ParseModule(true)) return true;
+    if (error_code EC = ParseModule(true))
+      return EC;
   }
-  return false;
+  return error_code::success();
 }
 
 //===----------------------------------------------------------------------===//
@@ -3008,25 +3057,25 @@ bool BitcodeReader::isMaterializable(const GlobalValue *GV) const {
   return false;
 }
 
-bool BitcodeReader::Materialize(GlobalValue *GV, std::string *ErrInfo) {
+error_code BitcodeReader::Materialize(GlobalValue *GV) {
   Function *F = dyn_cast<Function>(GV);
   // If it's not a function or is already material, ignore the request.
-  if (!F || !F->isMaterializable()) return false;
+  if (!F || !F->isMaterializable())
+    return error_code::success();
 
   DenseMap<Function*, uint64_t>::iterator DFII = DeferredFunctionInfo.find(F);
   assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!");
   // If its position is recorded as 0, its body is somewhere in the stream
   // but we haven't seen it yet.
-  if (DFII->second == 0)
-    if (LazyStreamer && FindFunctionInStream(F, DFII)) return true;
+  if (DFII->second == 0 && LazyStreamer)
+    if (error_code EC = FindFunctionInStream(F, DFII))
+      return EC;
 
   // Move the bit stream to the saved position of the deferred function body.
   Stream.JumpToBit(DFII->second);
 
-  if (ParseFunctionBody(F)) {
-    if (ErrInfo) *ErrInfo = ErrorString;
-    return true;
-  }
+  if (error_code EC = ParseFunctionBody(F))
+    return EC;
 
   // Upgrade any old intrinsic calls in the function.
   for (UpgradedIntrinsicMap::iterator I = UpgradedIntrinsics.begin(),
@@ -3040,7 +3089,7 @@ bool BitcodeReader::Materialize(GlobalValue *GV, std::string *ErrInfo) {
     }
   }
 
-  return false;
+  return error_code::success();
 }
 
 bool BitcodeReader::isDematerializable(const GlobalValue *GV) const {
@@ -3063,17 +3112,18 @@ void BitcodeReader::Dematerialize(GlobalValue *GV) {
 }
 
 
-bool BitcodeReader::MaterializeModule(Module *M, std::string *ErrInfo) {
+error_code BitcodeReader::MaterializeModule(Module *M) {
   assert(M == TheModule &&
          "Can only Materialize the Module this BitcodeReader is attached to.");
   // Iterate over the module, deserializing any functions that are still on
   // disk.
   for (Module::iterator F = TheModule->begin(), E = TheModule->end();
-       F != E; ++F)
-    if (F->isMaterializable() &&
-        Materialize(F, ErrInfo))
-      return true;
-
+       F != E; ++F) {
+    if (F->isMaterializable()) {
+      if (error_code EC = Materialize(F))
+        return EC;
+    }
+  }
   // At this point, if there are any function bodies, the current bit is
   // pointing to the END_BLOCK record after them. Now make sure the rest
   // of the bits in the module have been read.
@@ -3099,38 +3149,43 @@ bool BitcodeReader::MaterializeModule(Module *M, std::string *ErrInfo) {
   }
   std::vector<std::pair<Function*, Function*> >().swap(UpgradedIntrinsics);
 
-  return false;
+  for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
+    UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
+
+  UpgradeDebugInfo(*M);
+  return error_code::success();
 }
 
-bool BitcodeReader::InitStream() {
-  if (LazyStreamer) return InitLazyStream();
+error_code BitcodeReader::InitStream() {
+  if (LazyStreamer)
+    return InitLazyStream();
   return InitStreamFromBuffer();
 }
 
-bool BitcodeReader::InitStreamFromBuffer() {
+error_code BitcodeReader::InitStreamFromBuffer() {
   const unsigned char *BufPtr = (const unsigned char*)Buffer->getBufferStart();
   const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize();
 
   if (Buffer->getBufferSize() & 3) {
     if (!isRawBitcode(BufPtr, BufEnd) && !isBitcodeWrapper(BufPtr, BufEnd))
-      return Error("Invalid bitcode signature");
+      return Error(InvalidBitcodeSignature);
     else
-      return Error("Bitcode stream should be a multiple of 4 bytes in length");
+      return Error(BitcodeStreamInvalidSize);
   }
 
   // If we have a wrapper header, parse it and ignore the non-bc file contents.
   // The magic number is 0x0B17C0DE stored in little endian.
   if (isBitcodeWrapper(BufPtr, BufEnd))
     if (SkipBitcodeWrapperHeader(BufPtr, BufEnd, true))
-      return Error("Invalid bitcode wrapper header");
+      return Error(InvalidBitcodeWrapperHeader);
 
   StreamFile.reset(new BitstreamReader(BufPtr, BufEnd));
   Stream.init(*StreamFile);
 
-  return false;
+  return error_code::success();
 }
 
-bool BitcodeReader::InitLazyStream() {
+error_code BitcodeReader::InitLazyStream() {
   // Check and strip off the bitcode wrapper; BitstreamReader expects never to
   // see it.
   StreamingMemoryObject *Bytes = new StreamingMemoryObject(LazyStreamer);
@@ -3139,10 +3194,10 @@ bool BitcodeReader::InitLazyStream() {
 
   unsigned char buf[16];
   if (Bytes->readBytes(0, 16, buf) == -1)
-    return Error("Bitcode stream must be at least 16 bytes in length");
+    return Error(BitcodeStreamInvalidSize);
 
   if (!isBitcode(buf, buf + 16))
-    return Error("Invalid bitcode signature");
+    return Error(InvalidBitcodeSignature);
 
   if (isBitcodeWrapper(buf, buf + 4)) {
     const unsigned char *bitcodeStart = buf;
@@ -3151,7 +3206,64 @@ bool BitcodeReader::InitLazyStream() {
     Bytes->dropLeadingBytes(bitcodeStart - buf);
     Bytes->setKnownObjectSize(bitcodeEnd - bitcodeStart);
   }
-  return false;
+  return error_code::success();
+}
+
+namespace {
+class BitcodeErrorCategoryType : public _do_message {
+  const char *name() const LLVM_OVERRIDE {
+    return "llvm.bitcode";
+  }
+  std::string message(int IE) const LLVM_OVERRIDE {
+    BitcodeReader::ErrorType E = static_cast<BitcodeReader::ErrorType>(IE);
+    switch (E) {
+    case BitcodeReader::BitcodeStreamInvalidSize:
+      return "Bitcode stream length should be >= 16 bytes and a multiple of 4";
+    case BitcodeReader::ConflictingMETADATA_KINDRecords:
+      return "Conflicting METADATA_KIND records";
+    case BitcodeReader::CouldNotFindFunctionInStream:
+      return "Could not find function in stream";
+    case BitcodeReader::ExpectedConstant:
+      return "Expected a constant";
+    case BitcodeReader::InsufficientFunctionProtos:
+      return "Insufficient function protos";
+    case BitcodeReader::InvalidBitcodeSignature:
+      return "Invalid bitcode signature";
+    case BitcodeReader::InvalidBitcodeWrapperHeader:
+      return "Invalid bitcode wrapper header";
+    case BitcodeReader::InvalidConstantReference:
+      return "Invalid ronstant reference";
+    case BitcodeReader::InvalidID:
+      return "Invalid ID";
+    case BitcodeReader::InvalidInstructionWithNoBB:
+      return "Invalid instruction with no BB";
+    case BitcodeReader::InvalidRecord:
+      return "Invalid record";
+    case BitcodeReader::InvalidTypeForValue:
+      return "Invalid type for value";
+    case BitcodeReader::InvalidTYPETable:
+      return "Invalid TYPE table";
+    case BitcodeReader::InvalidType:
+      return "Invalid type";
+    case BitcodeReader::MalformedBlock:
+      return "Malformed block";
+    case BitcodeReader::MalformedGlobalInitializerSet:
+      return "Malformed global initializer set";
+    case BitcodeReader::InvalidMultipleBlocks:
+      return "Invalid multiple blocks";
+    case BitcodeReader::NeverResolvedValueFoundInFunction:
+      return "Never resolved value found in function";
+    case BitcodeReader::InvalidValue:
+      return "Invalid value";
+    }
+    llvm_unreachable("Unknown error type!");
+  }
+};
+}
+
+const error_category &BitcodeReader::BitcodeErrorCategory() {
+  static BitcodeErrorCategoryType O;
+  return O;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3166,9 +3278,9 @@ Module *llvm::getLazyBitcodeModule(MemoryBuffer *Buffer,
   Module *M = new Module(Buffer->getBufferIdentifier(), Context);
   BitcodeReader *R = new BitcodeReader(Buffer, Context);
   M->setMaterializer(R);
-  if (R->ParseBitcodeInto(M)) {
+  if (error_code EC = R->ParseBitcodeInto(M)) {
     if (ErrMsg)
-      *ErrMsg = R->getErrorString();
+      *ErrMsg = EC.message();
 
     delete M;  // Also deletes R.
     return 0;
@@ -3189,9 +3301,9 @@ Module *llvm::getStreamedBitcodeModule(const std::string &name,
   Module *M = new Module(name, Context);
   BitcodeReader *R = new BitcodeReader(streamer, Context);
   M->setMaterializer(R);
-  if (R->ParseBitcodeInto(M)) {
+  if (error_code EC = R->ParseBitcodeInto(M)) {
     if (ErrMsg)
-      *ErrMsg = R->getErrorString();
+      *ErrMsg = EC.message();
     delete M;  // Also deletes R.
     return 0;
   }
@@ -3230,9 +3342,9 @@ std::string llvm::getBitcodeTargetTriple(MemoryBuffer *Buffer,
   R->setBufferOwned(false);
 
   std::string Triple("");
-  if (R->ParseTriple(Triple))
+  if (error_code EC = R->ParseTriple(Triple))
     if (ErrMsg)
-      *ErrMsg = R->getErrorString();
+      *ErrMsg = EC.message();
 
   delete R;
   return Triple;
diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h
index b095447..c5d345b 100644
--- a/lib/Bitcode/Reader/BitcodeReader.h
+++ b/lib/Bitcode/Reader/BitcodeReader.h
@@ -21,6 +21,7 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/OperandTraits.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/system_error.h"
 #include "llvm/Support/ValueHandle.h"
 #include <vector>
 
@@ -132,8 +133,6 @@ class BitcodeReader : public GVMaterializer {
   uint64_t NextUnreadBit;
   bool SeenValueSymbolTable;
 
-  const char *ErrorString;
-
   std::vector<Type*> TypeList;
   BitcodeReaderValueList ValueList;
   BitcodeReaderMDValueList MDValueList;
@@ -142,6 +141,9 @@ class BitcodeReader : public GVMaterializer {
 
   std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInits;
   std::vector<std::pair<GlobalAlias*, unsigned> > AliasInits;
+  std::vector<std::pair<Function*, unsigned> > FunctionPrefixes;
+
+  SmallVector<Instruction*, 64> InstsWithTBAATag;
 
   /// MAttributes - The set of attributes by index.  Index zero in the
   /// file is for null, and is thus not represented here.  As such all indices
@@ -191,17 +193,46 @@ class BitcodeReader : public GVMaterializer {
   /// not need this flag.
   bool UseRelativeIDs;
 
+  static const error_category &BitcodeErrorCategory();
+
 public:
+  enum ErrorType {
+    BitcodeStreamInvalidSize,
+    ConflictingMETADATA_KINDRecords,
+    CouldNotFindFunctionInStream,
+    ExpectedConstant,
+    InsufficientFunctionProtos,
+    InvalidBitcodeSignature,
+    InvalidBitcodeWrapperHeader,
+    InvalidConstantReference,
+    InvalidID, // A read identifier is not found in the table it should be in.
+    InvalidInstructionWithNoBB,
+    InvalidRecord, // A read record doesn't have the expected size or structure
+    InvalidTypeForValue, // Type read OK, but is invalid for its use
+    InvalidTYPETable,
+    InvalidType, // We were unable to read a type
+    MalformedBlock, // We are unable to advance in the stream.
+    MalformedGlobalInitializerSet,
+    InvalidMultipleBlocks, // We found multiple blocks of a kind that should
+                           // have only one
+    NeverResolvedValueFoundInFunction,
+    InvalidValue // Invalid version, inst number, attr number, etc
+  };
+
+  error_code Error(ErrorType E) {
+    return error_code(E, BitcodeErrorCategory());
+  }
+
   explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C)
     : Context(C), TheModule(0), Buffer(buffer), BufferOwned(false),
       LazyStreamer(0), NextUnreadBit(0), SeenValueSymbolTable(false),
-      ErrorString(0), ValueList(C), MDValueList(C),
+      ValueList(C), MDValueList(C),
       SeenFirstFunctionBody(false), UseRelativeIDs(false) {
   }
   explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C)
     : Context(C), TheModule(0), Buffer(0), BufferOwned(false),
       LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false),
-      ErrorString(0), ValueList(C), MDValueList(C),
+      ValueList(C), MDValueList(C),
       SeenFirstFunctionBody(false), UseRelativeIDs(false) {
   }
   ~BitcodeReader() {
@@ -218,23 +249,17 @@ public:
 
   virtual bool isMaterializable(const GlobalValue *GV) const;
   virtual bool isDematerializable(const GlobalValue *GV) const;
-  virtual bool Materialize(GlobalValue *GV, std::string *ErrInfo = 0);
-  virtual bool MaterializeModule(Module *M, std::string *ErrInfo = 0);
+  virtual error_code Materialize(GlobalValue *GV);
+  virtual error_code MaterializeModule(Module *M);
   virtual void Dematerialize(GlobalValue *GV);
 
-  bool Error(const char *Str) {
-    ErrorString = Str;
-    return true;
-  }
-  const char *getErrorString() const { return ErrorString; }
-
   /// @brief Main interface to parsing a bitcode buffer.
   /// @returns true if an error occurred.
-  bool ParseBitcodeInto(Module *M);
+  error_code ParseBitcodeInto(Module *M);
 
   /// @brief Cheap mechanism to just extract module triple
   /// @returns true if an error occurred.
-  bool ParseTriple(std::string &Triple);
+  error_code ParseTriple(std::string &Triple);
 
   static uint64_t decodeSignRotatedValue(uint64_t V);
 
@@ -321,27 +346,27 @@ private:
     return getFnValueByID(ValNo, Ty);
   }
 
-  bool ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
-  bool ParseModule(bool Resume);
-  bool ParseAttributeBlock();
-  bool ParseAttributeGroupBlock();
-  bool ParseTypeTable();
-  bool ParseTypeTableBody();
-
-  bool ParseValueSymbolTable();
-  bool ParseConstants();
-  bool RememberAndSkipFunctionBody();
-  bool ParseFunctionBody(Function *F);
-  bool GlobalCleanup();
-  bool ResolveGlobalAndAliasInits();
-  bool ParseMetadata();
-  bool ParseMetadataAttachment();
-  bool ParseModuleTriple(std::string &Triple);
-  bool ParseUseLists();
-  bool InitStream();
-  bool InitStreamFromBuffer();
-  bool InitLazyStream();
-  bool FindFunctionInStream(Function *F,
+  error_code ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
+  error_code ParseModule(bool Resume);
+  error_code ParseAttributeBlock();
+  error_code ParseAttributeGroupBlock();
+  error_code ParseTypeTable();
+  error_code ParseTypeTableBody();
+
+  error_code ParseValueSymbolTable();
+  error_code ParseConstants();
+  error_code RememberAndSkipFunctionBody();
+  error_code ParseFunctionBody(Function *F);
+  error_code GlobalCleanup();
+  error_code ResolveGlobalAndAliasInits();
+  error_code ParseMetadata();
+  error_code ParseMetadataAttachment();
+  error_code ParseModuleTriple(std::string &Triple);
+  error_code ParseUseLists();
+  error_code InitStream();
+  error_code InitStreamFromBuffer();
+  error_code InitLazyStream();
+  error_code FindFunctionInStream(Function *F,
          DenseMap<Function*, uint64_t>::iterator DeferredFunctionInfoIterator);
 };
 
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 311c233..4cfc6bd 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -60,10 +60,7 @@ enum {
   FUNCTION_INST_CAST_ABBREV,
   FUNCTION_INST_RET_VOID_ABBREV,
   FUNCTION_INST_RET_VAL_ABBREV,
-  FUNCTION_INST_UNREACHABLE_ABBREV,
-
-  // SwitchInst Magic
-  SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex
+  FUNCTION_INST_UNREACHABLE_ABBREV
 };
 
 static unsigned GetEncodedCastOpcode(unsigned Opcode) {
@@ -81,6 +78,7 @@ static unsigned GetEncodedCastOpcode(unsigned Opcode) {
   case Instruction::PtrToInt: return bitc::CAST_PTRTOINT;
   case Instruction::IntToPtr: return bitc::CAST_INTTOPTR;
   case Instruction::BitCast : return bitc::CAST_BITCAST;
+  case Instruction::AddrSpaceCast: return bitc::CAST_ADDRSPACECAST;
   }
 }
 
@@ -205,6 +203,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
     return bitc::ATTR_KIND_NO_UNWIND;
   case Attribute::OptimizeForSize:
     return bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE;
+  case Attribute::OptimizeNone:
+    return bitc::ATTR_KIND_OPTIMIZE_NONE;
   case Attribute::ReadNone:
     return bitc::ATTR_KIND_READ_NONE;
   case Attribute::ReadOnly:
@@ -490,7 +490,6 @@ static unsigned getEncodedLinkage(const GlobalValue *GV) {
   case GlobalValue::AvailableExternallyLinkage:      return 12;
   case GlobalValue::LinkerPrivateLinkage:            return 13;
   case GlobalValue::LinkerPrivateWeakLinkage:        return 14;
-  case GlobalValue::LinkOnceODRAutoHideLinkage:      return 15;
   }
   llvm_unreachable("Invalid linkage");
 }
@@ -607,7 +606,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
 
     // GLOBALVAR: [type, isconst, initid,
     //             linkage, alignment, section, visibility, threadlocal,
-    //             unnamed_addr]
+    //             unnamed_addr, externally_initialized]
     Vals.push_back(VE.getTypeID(GV->getType()));
     Vals.push_back(GV->isConstant());
     Vals.push_back(GV->isDeclaration() ? 0 :
@@ -633,7 +632,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
   // Emit the function proto information.
   for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
     // FUNCTION:  [type, callingconv, isproto, linkage, paramattrs, alignment,
-    //             section, visibility, gc, unnamed_addr]
+    //             section, visibility, gc, unnamed_addr, prefix]
     Vals.push_back(VE.getTypeID(F->getType()));
     Vals.push_back(F->getCallingConv());
     Vals.push_back(F->isDeclaration());
@@ -644,6 +643,8 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
     Vals.push_back(getEncodedVisibility(F));
     Vals.push_back(F->hasGC() ? GCMap[F->getGC()] : 0);
     Vals.push_back(F->hasUnnamedAddr());
+    Vals.push_back(F->hasPrefixData() ? (VE.getValueID(F->getPrefixData()) + 1)
+                                      : 0);
 
     unsigned AbbrevToUse = 0;
     Stream.EmitRecord(bitc::MODULE_CODE_FUNCTION, Vals, AbbrevToUse);
@@ -863,34 +864,6 @@ static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
     Vals.push_back((-V << 1) | 1);
 }
 
-static void EmitAPInt(SmallVectorImpl<uint64_t> &Vals,
-                      unsigned &Code, unsigned &AbbrevToUse, const APInt &Val,
-                      bool EmitSizeForWideNumbers = false
-                      ) {
-  if (Val.getBitWidth() <= 64) {
-    uint64_t V = Val.getSExtValue();
-    emitSignedInt64(Vals, V);
-    Code = bitc::CST_CODE_INTEGER;
-    AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
-  } else {
-    // Wide integers, > 64 bits in size.
-    // We have an arbitrary precision integer value to write whose
-    // bit width is > 64. However, in canonical unsigned integer
-    // format it is likely that the high bits are going to be zero.
-    // So, we only write the number of active words.
-    unsigned NWords = Val.getActiveWords();
-
-    if (EmitSizeForWideNumbers)
-      Vals.push_back(NWords);
-
-    const uint64_t *RawWords = Val.getRawData();
-    for (unsigned i = 0; i != NWords; ++i) {
-      emitSignedInt64(Vals, RawWords[i]);
-    }
-    Code = bitc::CST_CODE_WIDE_INTEGER;
-  }
-}
-
 static void WriteConstants(unsigned FirstVal, unsigned LastVal,
                            const ValueEnumerator &VE,
                            BitstreamWriter &Stream, bool isGlobal) {
@@ -974,7 +947,23 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
     } else if (isa<UndefValue>(C)) {
       Code = bitc::CST_CODE_UNDEF;
     } else if (const ConstantInt *IV = dyn_cast<ConstantInt>(C)) {
-      EmitAPInt(Record, Code, AbbrevToUse, IV->getValue());
+      if (IV->getBitWidth() <= 64) {
+        uint64_t V = IV->getSExtValue();
+        emitSignedInt64(Record, V);
+        Code = bitc::CST_CODE_INTEGER;
+        AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
+      } else {                             // Wide integers, > 64 bits in size.
+        // We have an arbitrary precision integer value to write whose
+        // bit width is > 64. However, in canonical unsigned integer
+        // format it is likely that the high bits are going to be zero.
+        // So, we only write the number of active words.
+        unsigned NWords = IV->getValue().getActiveWords();
+        const uint64_t *RawWords = IV->getValue().getRawData();
+        for (unsigned i = 0; i != NWords; ++i) {
+          emitSignedInt64(Record, RawWords[i]);
+        }
+        Code = bitc::CST_CODE_WIDE_INTEGER;
+      }
     } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
       Code = bitc::CST_CODE_FLOAT;
       Type *Ty = CFP->getType();
@@ -1182,13 +1171,6 @@ static void pushValue(const Value *V, unsigned InstID,
   Vals.push_back(InstID - ValID);
 }
 
-static void pushValue64(const Value *V, unsigned InstID,
-                        SmallVectorImpl<uint64_t> &Vals,
-                        ValueEnumerator &VE) {
-  uint64_t ValID = VE.getValueID(V);
-  Vals.push_back(InstID - ValID);
-}
-
 static void pushValueSigned(const Value *V, unsigned InstID,
                             SmallVectorImpl<uint64_t> &Vals,
                             ValueEnumerator &VE) {
@@ -1312,63 +1294,16 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
     break;
   case Instruction::Switch:
     {
-      // Redefine Vals, since here we need to use 64 bit values
-      // explicitly to store large APInt numbers.
-      SmallVector<uint64_t, 128> Vals64;
-
       Code = bitc::FUNC_CODE_INST_SWITCH;
       const SwitchInst &SI = cast<SwitchInst>(I);
-
-      uint32_t SwitchRecordHeader = SI.hash() | (SWITCH_INST_MAGIC << 16);
-      Vals64.push_back(SwitchRecordHeader);
-
-      Vals64.push_back(VE.getTypeID(SI.getCondition()->getType()));
-      pushValue64(SI.getCondition(), InstID, Vals64, VE);
-      Vals64.push_back(VE.getValueID(SI.getDefaultDest()));
-      Vals64.push_back(SI.getNumCases());
+      Vals.push_back(VE.getTypeID(SI.getCondition()->getType()));
+      pushValue(SI.getCondition(), InstID, Vals, VE);
+      Vals.push_back(VE.getValueID(SI.getDefaultDest()));
       for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
            i != e; ++i) {
-        const IntegersSubset& CaseRanges = i.getCaseValueEx();
-        unsigned Code, Abbrev; // will unused.
-
-        if (CaseRanges.isSingleNumber()) {
-          Vals64.push_back(1/*NumItems = 1*/);
-          Vals64.push_back(true/*IsSingleNumber = true*/);
-          EmitAPInt(Vals64, Code, Abbrev, CaseRanges.getSingleNumber(0), true);
-        } else {
-
-          Vals64.push_back(CaseRanges.getNumItems());
-
-          if (CaseRanges.isSingleNumbersOnly()) {
-            for (unsigned ri = 0, rn = CaseRanges.getNumItems();
-                 ri != rn; ++ri) {
-
-              Vals64.push_back(true/*IsSingleNumber = true*/);
-
-              EmitAPInt(Vals64, Code, Abbrev,
-                        CaseRanges.getSingleNumber(ri), true);
-            }
-          } else
-            for (unsigned ri = 0, rn = CaseRanges.getNumItems();
-                 ri != rn; ++ri) {
-              IntegersSubset::Range r = CaseRanges.getItem(ri);
-              bool IsSingleNumber = CaseRanges.isSingleNumber(ri);
-
-              Vals64.push_back(IsSingleNumber);
-
-              EmitAPInt(Vals64, Code, Abbrev, r.getLow(), true);
-              if (!IsSingleNumber)
-                EmitAPInt(Vals64, Code, Abbrev, r.getHigh(), true);
-            }
-        }
-        Vals64.push_back(VE.getValueID(i.getCaseSuccessor()));
+        Vals.push_back(VE.getValueID(i.getCaseValue()));
+        Vals.push_back(VE.getValueID(i.getCaseSuccessor()));
       }
-
-      Stream.EmitRecord(Code, Vals64, AbbrevToUse);
-
-      // Also do expected action - clear external Vals collection:
-      Vals.clear();
-      return;
     }
     break;
   case Instruction::IndirectBr:
@@ -1930,6 +1865,8 @@ static void WriteModuleUseLists(const Module *M, ValueEnumerator &VE,
     WriteUseList(FI, VE, Stream);
     if (!FI->isDeclaration())
       WriteFunctionUseList(FI, VE, Stream);
+    if (FI->hasPrefixData())
+      WriteUseList(FI->getPrefixData(), VE, Stream);
   }
 
   // Write the aliases.
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 8bac6da..a164104 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -60,6 +60,11 @@ ValueEnumerator::ValueEnumerator(const Module *M) {
        I != E; ++I)
     EnumerateValue(I->getAliasee());
 
+  // Enumerate the prefix data constants.
+  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
+    if (I->hasPrefixData())
+      EnumerateValue(I->getPrefixData());
+
   // Insert constants and metadata that are named at module level into the slot
   // pool so that the module symbol table can refer to them...
   EnumerateValueSymbolTable(M->getValueSymbolTable());
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 4d9aebc..7fbf123 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -7,6 +7,7 @@ add_subdirectory(Bitcode)
 add_subdirectory(Transforms)
 add_subdirectory(Linker)
 add_subdirectory(Analysis)
+add_subdirectory(LTO)
 add_subdirectory(MC)
 add_subdirectory(Object)
 add_subdirectory(Option)
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index e079707..2ee7767 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -782,7 +782,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
     if (MI == CriticalPathMI) {
       CriticalPathSU = CriticalPathStep(CriticalPathSU);
       CriticalPathMI = (CriticalPathSU) ? CriticalPathSU->getInstr() : 0;
-    } else {
+    } else if (CriticalPathSet.any()) {
       ExcludeRegs = &CriticalPathSet;
     }
 
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index ca08b5b..1600c67 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -320,6 +320,7 @@ static const Value *getNoopInput(const Value *V,
 static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal,
                                  SmallVectorImpl<unsigned> &RetIndices,
                                  SmallVectorImpl<unsigned> &CallIndices,
+                                 bool AllowDifferingSizes,
                                  const TargetLoweringBase &TLI) {
 
   // Trace the sub-value needed by the return value as far back up the graph as
@@ -350,7 +351,8 @@ static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal,
   // all the bits that are needed by the "ret" have been provided by the "tail
   // call". FIXME: with sufficiently cunning bit-tracking, we could look through
   // extensions too.
-  if (BitsProvided < BitsRequired)
+  if (BitsProvided < BitsRequired ||
+      (!AllowDifferingSizes && BitsProvided != BitsRequired))
     return false;
 
   return true;
@@ -382,9 +384,8 @@ static bool indexReallyValid(CompositeType *T, unsigned Idx) {
 /// function again on a finished iterator will repeatedly return
 /// false. SubTypes.back()->getTypeAtIndex(Path.back()) is either an empty
 /// aggregate or a non-aggregate
-static bool
-advanceToNextLeafType(SmallVectorImpl<CompositeType *> &SubTypes,
-                     SmallVectorImpl<unsigned> &Path) {
+static bool advanceToNextLeafType(SmallVectorImpl<CompositeType *> &SubTypes,
+                                  SmallVectorImpl<unsigned> &Path) {
   // First march back up the tree until we can successfully increment one of the
   // coordinates in Path.
   while (!Path.empty() && !indexReallyValid(SubTypes.back(), Path.back() + 1)) {
@@ -454,8 +455,8 @@ static bool firstRealType(Type *Next,
 
 /// Set the iterator data-structures to the next non-empty, non-aggregate
 /// subtype.
-bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
-                  SmallVectorImpl<unsigned> &Path) {
+static bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
+                         SmallVectorImpl<unsigned> &Path) {
   do {
     if (!advanceToNextLeafType(SubTypes, Path))
       return false;
@@ -509,6 +510,13 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS,
         return false;
     }
 
+  return returnTypeIsEligibleForTailCall(ExitBB->getParent(), I, Ret, TLI);
+}
+
+bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
+                                           const Instruction *I,
+                                           const ReturnInst *Ret,
+                                           const TargetLoweringBase &TLI) {
   // If the block ends with a void return or unreachable, it doesn't matter
   // what the call's return type is.
   if (!Ret || Ret->getNumOperands() == 0) return true;
@@ -517,19 +525,38 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS,
   // return type is.
   if (isa<UndefValue>(Ret->getOperand(0))) return true;
 
-  // Conservatively require the attributes of the call to match those of
-  // the return. Ignore noalias because it doesn't affect the call sequence.
-  const Function *F = ExitBB->getParent();
-  AttributeSet CallerAttrs = F->getAttributes();
-  if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex).
-        removeAttribute(Attribute::NoAlias) !=
-      AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex).
-        removeAttribute(Attribute::NoAlias))
-    return false;
+  // Make sure the attributes attached to each return are compatible.
+  AttrBuilder CallerAttrs(F->getAttributes(),
+                          AttributeSet::ReturnIndex);
+  AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(),
+                          AttributeSet::ReturnIndex);
+
+  // Noalias is completely benign as far as calling convention goes, it
+  // shouldn't affect whether the call is a tail call.
+  CallerAttrs = CallerAttrs.removeAttribute(Attribute::NoAlias);
+  CalleeAttrs = CalleeAttrs.removeAttribute(Attribute::NoAlias);
+
+  bool AllowDifferingSizes = true;
+  if (CallerAttrs.contains(Attribute::ZExt)) {
+    if (!CalleeAttrs.contains(Attribute::ZExt))
+      return false;
+
+    AllowDifferingSizes = false;
+    CallerAttrs.removeAttribute(Attribute::ZExt);
+    CalleeAttrs.removeAttribute(Attribute::ZExt);
+  } else if (CallerAttrs.contains(Attribute::SExt)) {
+    if (!CalleeAttrs.contains(Attribute::SExt))
+      return false;
+
+    AllowDifferingSizes = false;
+    CallerAttrs.removeAttribute(Attribute::SExt);
+    CalleeAttrs.removeAttribute(Attribute::SExt);
+  }
 
-  // It's not safe to eliminate the sign / zero extension of the return value.
-  if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
-      CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+  // If they're still different, there's some facet we don't understand
+  // (currently only "inreg", but in future who knows). It may be OK but the
+  // only safe option is to reject the tail call.
+  if (CallerAttrs != CalleeAttrs)
     return false;
 
   const Value *RetVal = Ret->getOperand(0), *CallVal = I;
@@ -571,7 +598,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS,
 
     // Finally, we can check whether the value produced by the tail call at this
     // index is compatible with the value we return.
-    if (!slotOnlyDiscardsData(RetVal, CallVal, TmpRetPath, TmpCallPath, TLI))
+    if (!slotOnlyDiscardsData(RetVal, CallVal, TmpRetPath, TmpCallPath,
+                              AllowDifferingSizes, TLI))
       return false;
 
     CallEmpty  = !nextRealType(CallSubTypes, CallPath);
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 188047d..5d82dd9 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -47,13 +47,18 @@ ARMException::ARMException(AsmPrinter *A)
 
 ARMException::~ARMException() {}
 
+ARMTargetStreamer &ARMException::getTargetStreamer() {
+  MCTargetStreamer &TS = Asm->OutStreamer.getTargetStreamer();
+  return static_cast<ARMTargetStreamer &>(TS);
+}
+
 void ARMException::EndModule() {
 }
 
 /// BeginFunction - Gather pre-function exception information. Assumes it's
 /// being emitted immediately after the function entry point.
 void ARMException::BeginFunction(const MachineFunction *MF) {
-  Asm->OutStreamer.EmitFnStart();
+  getTargetStreamer().emitFnStart();
   if (Asm->MF->getFunction()->needsUnwindTableEntry())
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
                                                   Asm->getFunctionNumber()));
@@ -62,8 +67,9 @@ void ARMException::BeginFunction(const MachineFunction *MF) {
 /// EndFunction - Gather and emit post-function exception information.
 ///
 void ARMException::EndFunction() {
+  ARMTargetStreamer &ATS = getTargetStreamer();
   if (!Asm->MF->getFunction()->needsUnwindTableEntry())
-    Asm->OutStreamer.EmitCantUnwind();
+    ATS.emitCantUnwind();
   else {
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
                                                   Asm->getFunctionNumber()));
@@ -76,13 +82,13 @@ void ARMException::EndFunction() {
         // Emit references to personality.
         if (const Function * Personality =
             MMI->getPersonalities()[MMI->getPersonalityIndex()]) {
-          MCSymbol *PerSym = Asm->Mang->getSymbol(Personality);
+          MCSymbol *PerSym = Asm->getSymbol(Personality);
           Asm->OutStreamer.EmitSymbolAttribute(PerSym, MCSA_Global);
-          Asm->OutStreamer.EmitPersonality(PerSym);
+          ATS.emitPersonality(PerSym);
         }
 
         // Emit .handlerdata directive.
-        Asm->OutStreamer.EmitHandlerData();
+        ATS.emitHandlerData();
 
         // Emit actual exception table
         EmitExceptionTable();
@@ -90,7 +96,7 @@ void ARMException::EndFunction() {
     }
   }
 
-  Asm->OutStreamer.EmitFnEnd();
+  ATS.emitFnEnd();
 }
 
 void ARMException::EmitTypeInfos(unsigned TTypeEncoding) {
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 12c3574..308b0e0 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
 using namespace llvm;
 
 static const char *const DWARFGroupName = "DWARF Emission";
@@ -94,7 +95,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &TD,
 
 AsmPrinter::AsmPrinter(TargetMachine &tm, MCStreamer &Streamer)
   : MachineFunctionPass(ID),
-    TM(tm), MAI(tm.getMCAsmInfo()),
+    TM(tm), MAI(tm.getMCAsmInfo()), MII(tm.getInstrInfo()),
     OutContext(Streamer.getContext()),
     OutStreamer(Streamer),
     LastMI(0), LastFn(0), Counter(~0U), SetCounter(0) {
@@ -164,7 +165,7 @@ bool AsmPrinter::doInitialization(Module &M) {
 
   OutStreamer.InitStreamer();
 
-  Mang = new Mangler(OutContext, &TM);
+  Mang = new Mangler(&TM);
 
   // Allow the target to emit any magic that it wants at the start of the file.
   EmitStartOfAsmFile(M);
@@ -212,12 +213,12 @@ bool AsmPrinter::doInitialization(Module &M) {
   llvm_unreachable("Unknown exception type.");
 }
 
-void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
-  switch ((GlobalValue::LinkageTypes)Linkage) {
+void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
+  GlobalValue::LinkageTypes Linkage = GV->getLinkage();
+  switch (Linkage) {
   case GlobalValue::CommonLinkage:
   case GlobalValue::LinkOnceAnyLinkage:
   case GlobalValue::LinkOnceODRLinkage:
-  case GlobalValue::LinkOnceODRAutoHideLinkage:
   case GlobalValue::WeakAnyLinkage:
   case GlobalValue::WeakODRLinkage:
   case GlobalValue::LinkerPrivateWeakLinkage:
@@ -225,8 +226,19 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
       // .globl _foo
       OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
 
-      if ((GlobalValue::LinkageTypes)Linkage !=
-          GlobalValue::LinkOnceODRAutoHideLinkage)
+      bool CanBeHidden = false;
+
+      if (Linkage == GlobalValue::LinkOnceODRLinkage) {
+        if (GV->hasUnnamedAddr()) {
+          CanBeHidden = true;
+        } else {
+          GlobalStatus GS;
+          if (!GlobalStatus::analyzeGlobal(GV, GS) && !GS.IsCompared)
+            CanBeHidden = true;
+        }
+      }
+
+      if (!CanBeHidden)
         // .weak_definition _foo
         OutStreamer.EmitSymbolAttribute(GVSym, MCSA_WeakDefinition);
       else
@@ -239,7 +251,7 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
       // .weak _foo
       OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Weak);
     }
-    break;
+    return;
   case GlobalValue::DLLExportLinkage:
   case GlobalValue::AppendingLinkage:
     // FIXME: appending linkage variables should go into a section of
@@ -248,16 +260,23 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
     // If external or appending, declare as a global symbol.
     // .globl _foo
     OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
-    break;
+    return;
   case GlobalValue::PrivateLinkage:
   case GlobalValue::InternalLinkage:
   case GlobalValue::LinkerPrivateLinkage:
-    break;
-  default:
-    llvm_unreachable("Unknown linkage type!");
+    return;
+  case GlobalValue::AvailableExternallyLinkage:
+    llvm_unreachable("Should never emit this");
+  case GlobalValue::DLLImportLinkage:
+  case GlobalValue::ExternalWeakLinkage:
+    llvm_unreachable("Don't know how to emit these");
   }
+  llvm_unreachable("Unknown linkage type!");
 }
 
+MCSymbol *AsmPrinter::getSymbol(const GlobalValue *GV) const {
+  return getObjFileLowering().getSymbol(*Mang, GV);
+}
 
 /// EmitGlobalVariable - Emit the specified global variable to the .s file.
 void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
@@ -273,7 +292,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
     }
   }
 
-  MCSymbol *GVSym = Mang->getSymbol(GV);
+  MCSymbol *GVSym = getSymbol(GV);
   EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
 
   if (!GV->hasInitializer())   // External globals require no extra code.
@@ -284,13 +303,16 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 
   SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
 
-  const DataLayout *TD = TM.getDataLayout();
-  uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+  const DataLayout *DL = TM.getDataLayout();
+  uint64_t Size = DL->getTypeAllocSize(GV->getType()->getElementType());
 
   // If the alignment is specified, we *must* obey it.  Overaligning a global
   // with a specified alignment is a prompt way to break globals emitted to
   // sections and expected to be contiguous (e.g. ObjC metadata).
-  unsigned AlignLog = getGVAlignmentLog2(GV, *TD);
+  unsigned AlignLog = getGVAlignmentLog2(GV, *DL);
+
+  if (DD)
+    DD->setSymbolSize(GVSym, Size);
 
   // Handle common and BSS local symbols (.lcomm).
   if (GVKind.isCommon() || GVKind.isBSSLocal()) {
@@ -388,14 +410,14 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 
     OutStreamer.SwitchSection(TLVSect);
     // Emit the linkage here.
-    EmitLinkage(GV->getLinkage(), GVSym);
+    EmitLinkage(GV, GVSym);
     OutStreamer.EmitLabel(GVSym);
 
     // Three pointers in size:
     //   - __tlv_bootstrap - used to make sure support exists
     //   - spare pointer, used when mapped by the runtime
     //   - pointer to mangled symbol above with initializer
-    unsigned PtrSize = TD->getPointerSizeInBits()/8;
+    unsigned PtrSize = DL->getPointerTypeSize(GV->getType());
     OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"),
                                 PtrSize);
     OutStreamer.EmitIntValue(0, PtrSize);
@@ -407,7 +429,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 
   OutStreamer.SwitchSection(TheSection);
 
-  EmitLinkage(GV->getLinkage(), GVSym);
+  EmitLinkage(GV, GVSym);
   EmitAlignment(AlignLog, GV);
 
   OutStreamer.EmitLabel(GVSym);
@@ -433,7 +455,7 @@ void AsmPrinter::EmitFunctionHeader() {
   OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
   EmitVisibility(CurrentFnSym, F->getVisibility());
 
-  EmitLinkage(F->getLinkage(), CurrentFnSym);
+  EmitLinkage(F, CurrentFnSym);
   EmitAlignment(MF->getAlignment(), F);
 
   if (MAI->hasDotTypeDotSizeDirective())
@@ -459,16 +481,6 @@ void AsmPrinter::EmitFunctionHeader() {
     OutStreamer.EmitLabel(DeadBlockSyms[i]);
   }
 
-  // Add some workaround for linkonce linkage on Cygwin\MinGW.
-  if (MAI->getLinkOnceDirective() != 0 &&
-      (F->hasLinkOnceLinkage() || F->hasWeakLinkage())) {
-    // FIXME: What is this?
-    MCSymbol *FakeStub =
-      OutContext.GetOrCreateSymbol(Twine("Lllvm$workaround$fake$stub$")+
-                                   CurrentFnSym->getName());
-    OutStreamer.EmitLabel(FakeStub);
-  }
-
   // Emit pre-function debug and/or EH information.
   if (DE) {
     NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled);
@@ -478,6 +490,10 @@ void AsmPrinter::EmitFunctionHeader() {
     NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
     DD->beginFunction(MF);
   }
+
+  // Emit the prefix data.
+  if (F->hasPrefixData())
+    EmitGlobalConstant(F->getPrefixData());
 }
 
 /// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
@@ -530,11 +546,11 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
 
 /// emitImplicitDef - This method emits the specified machine instruction
 /// that is an implicit def.
-static void emitImplicitDef(const MachineInstr *MI, AsmPrinter &AP) {
+void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
   unsigned RegNo = MI->getOperand(0).getReg();
-  AP.OutStreamer.AddComment(Twine("implicit-def: ") +
-                            AP.TM.getRegisterInfo()->getName(RegNo));
-  AP.OutStreamer.AddBlankLine();
+  OutStreamer.AddComment(Twine("implicit-def: ") +
+                         TM.getRegisterInfo()->getName(RegNo));
+  OutStreamer.AddBlankLine();
 }
 
 static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
@@ -646,7 +662,7 @@ bool AsmPrinter::needsRelocationsForDwarfStringPool() const {
 }
 
 void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
-  MCSymbol *Label = MI.getOperand(0).getMCSymbol();
+  const MCSymbol *Label = MI.getOperand(0).getMCSymbol();
 
   if (MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI)
     return;
@@ -657,12 +673,12 @@ void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
   if (MMI->getCompactUnwindEncoding() != 0)
     OutStreamer.EmitCompactUnwindEncoding(MMI->getCompactUnwindEncoding());
 
-  MachineModuleInfo &MMI = MF->getMMI();
-  std::vector<MCCFIInstruction> Instructions = MMI.getFrameInstructions();
+  const MachineModuleInfo &MMI = MF->getMMI();
+  const std::vector<MCCFIInstruction> &Instrs = MMI.getFrameInstructions();
   bool FoundOne = false;
   (void)FoundOne;
-  for (std::vector<MCCFIInstruction>::iterator I = Instructions.begin(),
-         E = Instructions.end(); I != E; ++I) {
+  for (std::vector<MCCFIInstruction>::const_iterator I = Instrs.begin(),
+         E = Instrs.end(); I != E; ++I) {
     if (I->getLabel() == Label) {
       emitCFIInstruction(*I);
       FoundOne = true;
@@ -724,7 +740,7 @@ void AsmPrinter::EmitFunctionBody() {
         }
         break;
       case TargetOpcode::IMPLICIT_DEF:
-        if (isVerbose()) emitImplicitDef(II, *this);
+        if (isVerbose()) emitImplicitDef(II);
         break;
       case TargetOpcode::KILL:
         if (isVerbose()) emitKill(II, *this);
@@ -877,7 +893,7 @@ bool AsmPrinter::doFinalization(Module &M) {
     if (V == GlobalValue::DefaultVisibility)
       continue;
 
-    MCSymbol *Name = Mang->getSymbol(&F);
+    MCSymbol *Name = getSymbol(&F);
     EmitVisibility(Name, V, false);
   }
 
@@ -887,6 +903,9 @@ bool AsmPrinter::doFinalization(Module &M) {
   if (!ModuleFlags.empty())
     getObjFileLowering().emitModuleFlags(OutStreamer, ModuleFlags, Mang, TM);
 
+  // Make sure we wrote out everything we need.
+  OutStreamer.Flush();
+
   // Finalize debug and EH information.
   if (DE) {
     {
@@ -914,12 +933,12 @@ bool AsmPrinter::doFinalization(Module &M) {
     for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
          I != E; ++I) {
       if (!I->hasExternalWeakLinkage()) continue;
-      OutStreamer.EmitSymbolAttribute(Mang->getSymbol(I), MCSA_WeakReference);
+      OutStreamer.EmitSymbolAttribute(getSymbol(I), MCSA_WeakReference);
     }
 
     for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) {
       if (!I->hasExternalWeakLinkage()) continue;
-      OutStreamer.EmitSymbolAttribute(Mang->getSymbol(I), MCSA_WeakReference);
+      OutStreamer.EmitSymbolAttribute(getSymbol(I), MCSA_WeakReference);
     }
   }
 
@@ -927,14 +946,19 @@ bool AsmPrinter::doFinalization(Module &M) {
     OutStreamer.AddBlankLine();
     for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
          I != E; ++I) {
-      MCSymbol *Name = Mang->getSymbol(I);
+      MCSymbol *Name = getSymbol(I);
 
       const GlobalValue *GV = I->getAliasedGlobal();
-      MCSymbol *Target = Mang->getSymbol(GV);
+      if (GV->isDeclaration()) {
+        report_fatal_error(Name->getName() +
+                           ": Target doesn't support aliases to declarations");
+      }
+
+      MCSymbol *Target = getSymbol(GV);
 
       if (I->hasExternalLinkage() || !MAI->getWeakRefDirective())
         OutStreamer.EmitSymbolAttribute(Name, MCSA_Global);
-      else if (I->hasWeakLinkage())
+      else if (I->hasWeakLinkage() || I->hasLinkOnceLinkage())
         OutStreamer.EmitSymbolAttribute(Name, MCSA_WeakReference);
       else
         assert(I->hasLocalLinkage() && "Invalid alias linkage");
@@ -953,6 +977,9 @@ bool AsmPrinter::doFinalization(Module &M) {
     if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*--I))
       MP->finishAssembly(*this);
 
+  // Emit llvm.ident metadata in an '.ident' directive.
+  EmitModuleIdents(M);
+
   // If we don't have any trampolines, then we don't require stack memory
   // to be executable. Some targets have a directive to declare this.
   Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
@@ -976,7 +1003,7 @@ bool AsmPrinter::doFinalization(Module &M) {
 void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
   this->MF = &MF;
   // Get the function symbol.
-  CurrentFnSym = Mang->getSymbol(MF.getFunction());
+  CurrentFnSym = getSymbol(MF.getFunction());
   CurrentFnSymForSize = CurrentFnSym;
 
   if (isVerbose())
@@ -1283,16 +1310,10 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) {
     const GlobalValue *GV =
       dyn_cast<GlobalValue>(InitList->getOperand(i)->stripPointerCasts());
     if (GV && getObjFileLowering().shouldEmitUsedDirectiveFor(GV, Mang))
-      OutStreamer.EmitSymbolAttribute(Mang->getSymbol(GV), MCSA_NoDeadStrip);
+      OutStreamer.EmitSymbolAttribute(getSymbol(GV), MCSA_NoDeadStrip);
   }
 }
 
-typedef std::pair<unsigned, Constant*> Structor;
-
-static bool priority_order(const Structor& lhs, const Structor& rhs) {
-  return lhs.first < rhs.first;
-}
-
 /// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
 /// priority.
 void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
@@ -1309,6 +1330,7 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
       !isa<PointerType>(ETy->getTypeAtIndex(1U))) return; // Not (int, ptr).
 
   // Gather the structors in a form that's convenient for sorting by priority.
+  typedef std::pair<unsigned, Constant *> Structor;
   SmallVector<Structor, 8> Structors;
   for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
     ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i));
@@ -1322,9 +1344,9 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
   }
 
   // Emit the function pointers in the target-specific order
-  const DataLayout *TD = TM.getDataLayout();
-  unsigned Align = Log2_32(TD->getPointerPrefAlignment());
-  std::stable_sort(Structors.begin(), Structors.end(), priority_order);
+  const DataLayout *DL = TM.getDataLayout();
+  unsigned Align = Log2_32(DL->getPointerPrefAlignment());
+  std::stable_sort(Structors.begin(), Structors.end(), less_first());
   for (unsigned i = 0, e = Structors.size(); i != e; ++i) {
     const MCSection *OutputSection =
       (isCtor ?
@@ -1337,6 +1359,21 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
   }
 }
 
+void AsmPrinter::EmitModuleIdents(Module &M) {
+  if (!MAI->hasIdentDirective())
+    return;
+
+  if (const NamedMDNode *NMD = M.getNamedMetadata("llvm.ident")) {
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+      const MDNode *N = NMD->getOperand(i);
+      assert(N->getNumOperands() == 1 && 
+             "llvm.ident metadata entry can have only one operand");
+      const MDString *S = cast<MDString>(N->getOperand(0));
+      OutStreamer.EmitIdent(S->getString());
+    }
+  }
+}
+
 //===--------------------------------------------------------------------===//
 // Emission and print routines
 //
@@ -1402,12 +1439,12 @@ void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset,
                             OutContext);
 
   if (!MAI->hasSetDirective())
-    OutStreamer.EmitValue(Diff, 4);
+    OutStreamer.EmitValue(Diff, Size);
   else {
     // Otherwise, emit with .set (aka assignment).
     MCSymbol *SetLabel = GetTempSymbol("set", SetCounter++);
     OutStreamer.EmitAssignment(SetLabel, Diff);
-    OutStreamer.EmitSymbolValue(SetLabel, 4);
+    OutStreamer.EmitSymbolValue(SetLabel, Size);
   }
 }
 
@@ -1415,9 +1452,9 @@ void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset,
 /// where the size in bytes of the directive is specified by Size and Label
 /// specifies the label.  This implicitly uses .set if it is available.
 void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
-                                      unsigned Size)
+                                      unsigned Size, bool IsSectionRelative)
   const {
-  if (MAI->needsDwarfSectionOffsetDirective() && Size == 4) { // secrel32 ONLY works for 32bits.
+  if (MAI->needsDwarfSectionOffsetDirective() && IsSectionRelative) {
     OutStreamer.EmitCOFFSecRel32(Label);
     return;
   }
@@ -1468,7 +1505,7 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
     return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
 
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
-    return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx);
+    return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
     return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
@@ -1498,10 +1535,10 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
       report_fatal_error(OS.str());
     }
   case Instruction::GetElementPtr: {
-    const DataLayout &TD = *AP.TM.getDataLayout();
+    const DataLayout &DL = *AP.TM.getDataLayout();
     // Generate a symbolic expression for the byte address
-    APInt OffsetAI(TD.getPointerSizeInBits(), 0);
-    cast<GEPOperator>(CE)->accumulateConstantOffset(TD, OffsetAI);
+    APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0);
+    cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI);
 
     const MCExpr *Base = lowerConstant(CE->getOperand(0), AP);
     if (!OffsetAI)
@@ -1522,17 +1559,17 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
     return lowerConstant(CE->getOperand(0), AP);
 
   case Instruction::IntToPtr: {
-    const DataLayout &TD = *AP.TM.getDataLayout();
+    const DataLayout &DL = *AP.TM.getDataLayout();
     // Handle casts to pointers by changing them into casts to the appropriate
     // integer type.  This promotes constant folding and simplifies this code.
     Constant *Op = CE->getOperand(0);
-    Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
+    Op = ConstantExpr::getIntegerCast(Op, DL.getIntPtrType(CV->getType()),
                                       false/*ZExt*/);
     return lowerConstant(Op, AP);
   }
 
   case Instruction::PtrToInt: {
-    const DataLayout &TD = *AP.TM.getDataLayout();
+    const DataLayout &DL = *AP.TM.getDataLayout();
     // Support only foldable casts to/from pointers that can be eliminated by
     // changing the pointer to the appropriately sized integer type.
     Constant *Op = CE->getOperand(0);
@@ -1542,13 +1579,13 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
 
     // We can emit the pointer value into this slot if the slot is an
     // integer slot equal to the size of the pointer.
-    if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType()))
+    if (DL.getTypeAllocSize(Ty) == DL.getTypeAllocSize(Op->getType()))
       return OpExpr;
 
     // Otherwise the pointer is smaller than the resultant integer, mask off
     // the high bits so we are sure to get a proper truncation if the input is
     // a constant expr.
-    unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType());
+    unsigned InBits = DL.getTypeAllocSizeInBits(Op->getType());
     const MCExpr *MaskExpr = MCConstantExpr::Create(~0ULL >> (64-InBits), Ctx);
     return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
   }
@@ -1699,9 +1736,9 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
     }
   }
 
-  const DataLayout &TD = *AP.TM.getDataLayout();
-  unsigned Size = TD.getTypeAllocSize(CDS->getType());
-  unsigned EmittedSize = TD.getTypeAllocSize(CDS->getType()->getElementType()) *
+  const DataLayout &DL = *AP.TM.getDataLayout();
+  unsigned Size = DL.getTypeAllocSize(CDS->getType());
+  unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) *
                         CDS->getNumElements();
   if (unsigned Padding = Size - EmittedSize)
     AP.OutStreamer.EmitZeros(Padding);
@@ -1727,9 +1764,9 @@ static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) {
   for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
     emitGlobalConstantImpl(CV->getOperand(i), AP);
 
-  const DataLayout &TD = *AP.TM.getDataLayout();
-  unsigned Size = TD.getTypeAllocSize(CV->getType());
-  unsigned EmittedSize = TD.getTypeAllocSize(CV->getType()->getElementType()) *
+  const DataLayout &DL = *AP.TM.getDataLayout();
+  unsigned Size = DL.getTypeAllocSize(CV->getType());
+  unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) *
                          CV->getType()->getNumElements();
   if (unsigned Padding = Size - EmittedSize)
     AP.OutStreamer.EmitZeros(Padding);
@@ -1737,15 +1774,15 @@ static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) {
 
 static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP) {
   // Print the fields in successive locations. Pad to align if needed!
-  const DataLayout *TD = AP.TM.getDataLayout();
-  unsigned Size = TD->getTypeAllocSize(CS->getType());
-  const StructLayout *Layout = TD->getStructLayout(CS->getType());
+  const DataLayout *DL = AP.TM.getDataLayout();
+  unsigned Size = DL->getTypeAllocSize(CS->getType());
+  const StructLayout *Layout = DL->getStructLayout(CS->getType());
   uint64_t SizeSoFar = 0;
   for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) {
     const Constant *Field = CS->getOperand(i);
 
     // Check if padding is needed and insert one or more 0s.
-    uint64_t FieldSize = TD->getTypeAllocSize(Field->getType());
+    uint64_t FieldSize = DL->getTypeAllocSize(Field->getType());
     uint64_t PadSize = ((i == e-1 ? Size : Layout->getElementOffset(i+1))
                         - Layout->getElementOffset(i)) - FieldSize;
     SizeSoFar += FieldSize + PadSize;
@@ -1802,13 +1839,13 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
   }
 
   // Emit the tail padding for the long double.
-  const DataLayout &TD = *AP.TM.getDataLayout();
-  AP.OutStreamer.EmitZeros(TD.getTypeAllocSize(CFP->getType()) -
-                           TD.getTypeStoreSize(CFP->getType()));
+  const DataLayout &DL = *AP.TM.getDataLayout();
+  AP.OutStreamer.EmitZeros(DL.getTypeAllocSize(CFP->getType()) -
+                           DL.getTypeStoreSize(CFP->getType()));
 }
 
 static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
-  const DataLayout *TD = AP.TM.getDataLayout();
+  const DataLayout *DL = AP.TM.getDataLayout();
   unsigned BitWidth = CI->getBitWidth();
 
   // Copy the value as we may massage the layout for constants whose bit width
@@ -1825,7 +1862,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
     // Big endian:
     // * Record the extra bits to emit.
     // * Realign the raw data to emit the chunks of 64-bits.
-    if (TD->isBigEndian()) {
+    if (DL->isBigEndian()) {
       // Basically the structure of the raw data is a chunk of 64-bits cells:
       //    0        1         BitWidth / 64
       // [chunk1][chunk2] ... [chunkN].
@@ -1846,7 +1883,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
   // quantities at a time.
   const uint64_t *RawData = Realigned.getRawData();
   for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) {
-    uint64_t Val = TD->isBigEndian() ? RawData[e - i - 1] : RawData[i];
+    uint64_t Val = DL->isBigEndian() ? RawData[e - i - 1] : RawData[i];
     AP.OutStreamer.EmitIntValue(Val, 8);
   }
 
@@ -1864,8 +1901,8 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
 }
 
 static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
-  const DataLayout *TD = AP.TM.getDataLayout();
-  uint64_t Size = TD->getTypeAllocSize(CV->getType());
+  const DataLayout *DL = AP.TM.getDataLayout();
+  uint64_t Size = DL->getTypeAllocSize(CV->getType());
   if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
     return AP.OutStreamer.EmitZeros(Size);
 
@@ -1913,7 +1950,7 @@ static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
       // If the constant expression's size is greater than 64-bits, then we have
       // to emit the value in chunks. Try to constant fold the value and emit it
       // that way.
-      Constant *New = ConstantFoldConstantExpression(CE, TD);
+      Constant *New = ConstantFoldConstantExpression(CE, DL);
       if (New && New != CE)
         return emitGlobalConstantImpl(New, AP);
     }
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index c141d60..b92f49c 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -185,5 +185,11 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const {
   case MCCFIInstruction::OpOffset:
     OutStreamer.EmitCFIOffset(Inst.getRegister(), Inst.getOffset());
     break;
+  case MCCFIInstruction::OpRegister:
+    OutStreamer.EmitCFIRegister(Inst.getRegister(), Inst.getRegister2());
+    break;
+  case MCCFIInstruction::OpWindowSave:
+    OutStreamer.EmitCFIWindowSave();
+    break;
   }
 }
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index d8e9c95..4f927f6 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -123,7 +123,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode,
                                              TM.getTargetCPU(),
                                              TM.getTargetFeatureString()));
   OwningPtr<MCTargetAsmParser>
-    TAP(TM.getTarget().createMCAsmParser(*STI, *Parser));
+    TAP(TM.getTarget().createMCAsmParser(*STI, *Parser, *MII));
   if (!TAP)
     report_fatal_error("Inline asm not supported by this streamer because"
                        " we don't have an asm parser for this target\n");
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 65e7bee..be484a6 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_library(LLVMAsmPrinter
   AsmPrinterDwarf.cpp
   AsmPrinterInlineAsm.cpp
   DIE.cpp
+  DIEHash.cpp
   DwarfAccelTable.cpp
   DwarfCFIException.cpp
   DwarfCompileUnit.cpp
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index ab03861..6944428 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -34,8 +34,10 @@ using namespace llvm;
 /// Profile - Used to gather unique data for the abbreviation folding set.
 ///
 void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
-  ID.AddInteger(Attribute);
-  ID.AddInteger(Form);
+  // Explicitly cast to an integer type for which FoldingSetNodeID has
+  // overloads.  Otherwise MSVC 2010 thinks this call is ambiguous.
+  ID.AddInteger(unsigned(Attribute));
+  ID.AddInteger(unsigned(Form));
 }
 
 //===----------------------------------------------------------------------===//
@@ -45,7 +47,7 @@ void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
 /// Profile - Used to gather unique data for the abbreviation folding set.
 ///
 void DIEAbbrev::Profile(FoldingSetNodeID &ID) const {
-  ID.AddInteger(Tag);
+  ID.AddInteger(unsigned(Tag));
   ID.AddInteger(ChildrenFlag);
 
   // For each attribute description.
@@ -112,17 +114,25 @@ DIE::~DIE() {
 
 /// Climb up the parent chain to get the compile unit DIE to which this DIE
 /// belongs.
-DIE *DIE::getCompileUnit() {
-  DIE *p = this;
+const DIE *DIE::getCompileUnit() const {
+  const DIE *Cu = getCompileUnitOrNull();
+  assert(Cu && "We should not have orphaned DIEs.");
+  return Cu;
+}
+
+/// Climb up the parent chain to get the compile unit DIE this DIE belongs
+/// to. Return NULL if DIE is not added to an owner yet.
+const DIE *DIE::getCompileUnitOrNull() const {
+  const DIE *p = this;
   while (p) {
     if (p->getTag() == dwarf::DW_TAG_compile_unit)
       return p;
     p = p->getParent();
   }
-  llvm_unreachable("We should not have orphaned DIEs.");
+  return NULL;
 }
 
-DIEValue *DIE::findAttribute(unsigned Attribute) {
+DIEValue *DIE::findAttribute(uint16_t Attribute) {
   const SmallVectorImpl<DIEValue *> &Values = getValues();
   const DIEAbbrev &Abbrevs = getAbbrev();
 
@@ -199,14 +209,14 @@ void DIEValue::dump() const {
 
 /// EmitValue - Emit integer of appropriate size.
 ///
-void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
+void DIEInteger::EmitValue(AsmPrinter *Asm, dwarf::Form Form) const {
   unsigned Size = ~0U;
   switch (Form) {
   case dwarf::DW_FORM_flag_present:
     // Emit something to keep the lines and comments in sync.
     // FIXME: Is there a better way to do this?
     if (Asm->OutStreamer.hasRawTextSupport())
-      Asm->OutStreamer.EmitRawText(StringRef(""));
+      Asm->OutStreamer.EmitRawText("");
     return;
   case dwarf::DW_FORM_flag:  // Fall thru
   case dwarf::DW_FORM_ref1:  // Fall thru
@@ -231,7 +241,7 @@ void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
 
 /// SizeOf - Determine size of integer value in bytes.
 ///
-unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEInteger::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
   switch (Form) {
   case dwarf::DW_FORM_flag_present: return 0;
   case dwarf::DW_FORM_flag:  // Fall thru
@@ -266,13 +276,13 @@ void DIEInteger::print(raw_ostream &O) const {
 
 /// EmitValue - Emit expression value.
 ///
-void DIEExpr::EmitValue(AsmPrinter *AP, unsigned Form) const {
+void DIEExpr::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
   AP->OutStreamer.EmitValue(Expr, SizeOf(AP, Form));
 }
 
 /// SizeOf - Determine size of expression value in bytes.
 ///
-unsigned DIEExpr::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEExpr::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
   if (Form == dwarf::DW_FORM_data4) return 4;
   if (Form == dwarf::DW_FORM_sec_offset) return 4;
   if (Form == dwarf::DW_FORM_strp) return 4;
@@ -292,13 +302,16 @@ void DIEExpr::print(raw_ostream &O) const {
 
 /// EmitValue - Emit label value.
 ///
-void DIELabel::EmitValue(AsmPrinter *AP, unsigned Form) const {
-  AP->EmitLabelReference(Label, SizeOf(AP, Form));
+void DIELabel::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
+  AP->EmitLabelReference(Label, SizeOf(AP, Form),
+                         Form == dwarf::DW_FORM_strp ||
+                             Form == dwarf::DW_FORM_sec_offset ||
+                             Form == dwarf::DW_FORM_ref_addr);
 }
 
 /// SizeOf - Determine size of label value in bytes.
 ///
-unsigned DIELabel::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIELabel::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
   if (Form == dwarf::DW_FORM_data4) return 4;
   if (Form == dwarf::DW_FORM_sec_offset) return 4;
   if (Form == dwarf::DW_FORM_strp) return 4;
@@ -317,13 +330,13 @@ void DIELabel::print(raw_ostream &O) const {
 
 /// EmitValue - Emit delta value.
 ///
-void DIEDelta::EmitValue(AsmPrinter *AP, unsigned Form) const {
+void DIEDelta::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
   AP->EmitLabelDifference(LabelHi, LabelLo, SizeOf(AP, Form));
 }
 
 /// SizeOf - Determine size of delta value in bytes.
 ///
-unsigned DIEDelta::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEDelta::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
   if (Form == dwarf::DW_FORM_data4) return 4;
   if (Form == dwarf::DW_FORM_strp) return 4;
   return AP->getDataLayout().getPointerSize();
@@ -341,13 +354,13 @@ void DIEDelta::print(raw_ostream &O) const {
 
 /// EmitValue - Emit string value.
 ///
-void DIEString::EmitValue(AsmPrinter *AP, unsigned Form) const {
+void DIEString::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
   Access->EmitValue(AP, Form);
 }
 
 /// SizeOf - Determine size of delta value in bytes.
 ///
-unsigned DIEString::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEString::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
   return Access->SizeOf(AP, Form);
 }
 
@@ -364,7 +377,7 @@ void DIEString::print(raw_ostream &O) const {
 
 /// EmitValue - Emit debug information entry offset.
 ///
-void DIEEntry::EmitValue(AsmPrinter *AP, unsigned Form) const {
+void DIEEntry::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
   AP->EmitInt32(Entry->getOffset());
 }
 
@@ -402,7 +415,7 @@ unsigned DIEBlock::ComputeSize(AsmPrinter *AP) {
 
 /// EmitValue - Emit block data.
 ///
-void DIEBlock::EmitValue(AsmPrinter *Asm, unsigned Form) const {
+void DIEBlock::EmitValue(AsmPrinter *Asm, dwarf::Form Form) const {
   switch (Form) {
   default: llvm_unreachable("Improper form for block");
   case dwarf::DW_FORM_block1: Asm->EmitInt8(Size);    break;
@@ -418,7 +431,7 @@ void DIEBlock::EmitValue(AsmPrinter *Asm, unsigned Form) const {
 
 /// SizeOf - Determine size of block data in bytes.
 ///
-unsigned DIEBlock::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEBlock::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
   switch (Form) {
   case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
   case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h
index bfd7d1d..f4fa326 100644
--- a/lib/CodeGen/AsmPrinter/DIE.h
+++ b/lib/CodeGen/AsmPrinter/DIE.h
@@ -33,17 +33,17 @@ namespace llvm {
   class DIEAbbrevData {
     /// Attribute - Dwarf attribute code.
     ///
-    uint16_t Attribute;
+    dwarf::Attribute Attribute;
 
     /// Form - Dwarf form code.
     ///
-    uint16_t Form;
+    dwarf::Form Form;
   public:
-    DIEAbbrevData(uint16_t A, uint16_t F) : Attribute(A), Form(F) {}
+    DIEAbbrevData(dwarf::Attribute A, dwarf::Form F) : Attribute(A), Form(F) {}
 
     // Accessors.
-    uint16_t getAttribute() const { return Attribute; }
-    uint16_t getForm()      const { return Form; }
+    dwarf::Attribute getAttribute() const { return Attribute; }
+    dwarf::Form getForm() const { return Form; }
 
     /// Profile - Used to gather unique data for the abbreviation folding set.
     ///
@@ -56,7 +56,7 @@ namespace llvm {
   class DIEAbbrev : public FoldingSetNode {
     /// Tag - Dwarf tag code.
     ///
-    uint16_t Tag;
+    dwarf::Tag Tag;
 
     /// ChildrenFlag - Dwarf children flag.
     ///
@@ -71,20 +71,19 @@ namespace llvm {
     SmallVector<DIEAbbrevData, 12> Data;
 
   public:
-    DIEAbbrev(uint16_t T, uint16_t C) : Tag(T), ChildrenFlag(C), Data() {}
+    DIEAbbrev(dwarf::Tag T, uint16_t C) : Tag(T), ChildrenFlag(C), Data() {}
 
     // Accessors.
-    uint16_t getTag() const { return Tag; }
+    dwarf::Tag getTag() const { return Tag; }
     unsigned getNumber() const { return Number; }
     uint16_t getChildrenFlag() const { return ChildrenFlag; }
     const SmallVectorImpl<DIEAbbrevData> &getData() const { return Data; }
-    void setTag(uint16_t T) { Tag = T; }
     void setChildrenFlag(uint16_t CF) { ChildrenFlag = CF; }
     void setNumber(unsigned N) { Number = N; }
 
     /// AddAttribute - Adds another set of attribute information to the
     /// abbreviation.
-    void AddAttribute(uint16_t Attribute, uint16_t Form) {
+    void AddAttribute(dwarf::Attribute Attribute, dwarf::Form Form) {
       Data.push_back(DIEAbbrevData(Attribute, Form));
     }
 
@@ -131,19 +130,17 @@ namespace llvm {
     ///
     SmallVector<DIEValue*, 12> Values;
 
-#ifndef NDEBUG
-    // Private data for print()
-    mutable unsigned IndentCount;
-#endif
   public:
     explicit DIE(unsigned Tag)
-      : Offset(0), Size(0), Abbrev(Tag, dwarf::DW_CHILDREN_no), Parent(0) {}
+        : Offset(0), Size(0), Abbrev((dwarf::Tag)Tag, dwarf::DW_CHILDREN_no),
+          Parent(0) {}
     virtual ~DIE();
 
     // Accessors.
     DIEAbbrev &getAbbrev() { return Abbrev; }
+    const DIEAbbrev &getAbbrev() const { return Abbrev; }
     unsigned getAbbrevNumber() const { return Abbrev.getNumber(); }
-    unsigned getTag() const { return Abbrev.getTag(); }
+    dwarf::Tag getTag() const { return Abbrev.getTag(); }
     unsigned getOffset() const { return Offset; }
     unsigned getSize() const { return Size; }
     const std::vector<DIE *> &getChildren() const { return Children; }
@@ -151,14 +148,17 @@ namespace llvm {
     DIE *getParent() const { return Parent; }
     /// Climb up the parent chain to get the compile unit DIE this DIE belongs
     /// to.
-    DIE *getCompileUnit();
-    void setTag(unsigned Tag) { Abbrev.setTag(Tag); }
+    const DIE *getCompileUnit() const;
+    /// Similar to getCompileUnit, returns null when DIE is not added to an
+    /// owner yet.
+    const DIE *getCompileUnitOrNull() const;
     void setOffset(unsigned O) { Offset = O; }
     void setSize(unsigned S) { Size = S; }
 
     /// addValue - Add a value and attributes to a DIE.
     ///
-    void addValue(unsigned Attribute, unsigned Form, DIEValue *Value) {
+    void addValue(dwarf::Attribute Attribute, dwarf::Form Form,
+                  DIEValue *Value) {
       Abbrev.AddAttribute(Attribute, Form);
       Values.push_back(Value);
     }
@@ -166,10 +166,7 @@ namespace llvm {
     /// addChild - Add a child to the DIE.
     ///
     void addChild(DIE *Child) {
-      if (Child->getParent()) {
-        assert (Child->getParent() == this && "Unexpected DIE Parent!");
-        return;
-      }
+      assert(!Child->getParent());
       Abbrev.setChildrenFlag(dwarf::DW_CHILDREN_yes);
       Children.push_back(Child);
       Child->Parent = this;
@@ -177,7 +174,7 @@ namespace llvm {
 
     /// findAttribute - Find a value in the DIE with the attribute given, returns NULL
     /// if no such attribute exists.
-    DIEValue *findAttribute(unsigned Attribute);
+    DIEValue *findAttribute(uint16_t Attribute);
 
 #ifndef NDEBUG
     void print(raw_ostream &O, unsigned IndentCount = 0) const;
@@ -213,11 +210,11 @@ namespace llvm {
 
     /// EmitValue - Emit value via the Dwarf writer.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const = 0;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const = 0;
 
     /// SizeOf - Return the size of a value in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const = 0;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const = 0;
 
 #ifndef NDEBUG
     virtual void print(raw_ostream &O) const = 0;
@@ -235,7 +232,7 @@ namespace llvm {
 
     /// BestForm - Choose the best form for integer.
     ///
-    static unsigned BestForm(bool IsSigned, uint64_t Int) {
+    static dwarf::Form BestForm(bool IsSigned, uint64_t Int) {
       if (IsSigned) {
         const int64_t SignedInt = Int;
         if ((char)Int == SignedInt)     return dwarf::DW_FORM_data1;
@@ -251,13 +248,13 @@ namespace llvm {
 
     /// EmitValue - Emit integer of appropriate size.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     uint64_t getValue() const { return Integer; }
 
     /// SizeOf - Determine size of integer value in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
 
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *I) { return I->getType() == isInteger; }
@@ -277,7 +274,7 @@ namespace llvm {
 
     /// EmitValue - Emit expression value.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     /// getValue - Get MCExpr.
     ///
@@ -285,7 +282,7 @@ namespace llvm {
 
     /// SizeOf - Determine size of expression value in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
 
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *E) { return E->getType() == isExpr; }
@@ -305,7 +302,7 @@ namespace llvm {
 
     /// EmitValue - Emit label value.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     /// getValue - Get MCSymbol.
     ///
@@ -313,7 +310,7 @@ namespace llvm {
 
     /// SizeOf - Determine size of label value in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
 
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *L) { return L->getType() == isLabel; }
@@ -335,11 +332,11 @@ namespace llvm {
 
     /// EmitValue - Emit delta value.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     /// SizeOf - Determine size of delta value in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
 
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *D) { return D->getType() == isDelta; }
@@ -365,11 +362,11 @@ namespace llvm {
 
     /// EmitValue - Emit delta value.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     /// SizeOf - Determine size of delta value in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
 
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *D) { return D->getType() == isString; }
@@ -394,13 +391,13 @@ namespace llvm {
 
     /// EmitValue - Emit debug information entry offset.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     /// SizeOf - Determine size of debug information entry in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const {
-      return Form == dwarf::DW_FORM_ref_addr ? getRefAddrSize(AP) :
-                                               sizeof(int32_t);
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
+      return Form == dwarf::DW_FORM_ref_addr ? getRefAddrSize(AP)
+                                             : sizeof(int32_t);
     }
 
     /// Returns size of a ref_addr entry.
@@ -420,9 +417,7 @@ namespace llvm {
   class DIEBlock : public DIEValue, public DIE {
     unsigned Size;                // Size in bytes excluding size header.
   public:
-    DIEBlock()
-      : DIEValue(isBlock), DIE(0), Size(0) {}
-    virtual ~DIEBlock() {}
+    DIEBlock() : DIEValue(isBlock), DIE(0), Size(0) {}
 
     /// ComputeSize - calculate the size of the block.
     ///
@@ -430,7 +425,7 @@ namespace llvm {
 
     /// BestForm - Choose the best form for data.
     ///
-    unsigned BestForm() const {
+    dwarf::Form BestForm() const {
       if ((unsigned char)Size == Size)  return dwarf::DW_FORM_block1;
       if ((unsigned short)Size == Size) return dwarf::DW_FORM_block2;
       if ((unsigned int)Size == Size)   return dwarf::DW_FORM_block4;
@@ -439,11 +434,11 @@ namespace llvm {
 
     /// EmitValue - Emit block data.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     /// SizeOf - Determine size of block data in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
 
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *E) { return E->getType() == isBlock; }
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp
new file mode 100644
index 0000000..95eca90
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -0,0 +1,507 @@
+//===-- llvm/CodeGen/DIEHash.cpp - Dwarf Hashing Framework ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for DWARF4 hashing of DIEs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dwarfdebug"
+
+#include "DIEHash.h"
+
+#include "DIE.h"
+#include "DwarfCompileUnit.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+/// \brief Grabs the string in whichever attribute is passed in and returns
+/// a reference to it.
+static StringRef getDIEStringAttr(const DIE &Die, uint16_t Attr) {
+  const SmallVectorImpl<DIEValue *> &Values = Die.getValues();
+  const DIEAbbrev &Abbrevs = Die.getAbbrev();
+
+  // Iterate through all the attributes until we find the one we're
+  // looking for, if we can't find it return an empty string.
+  for (size_t i = 0; i < Values.size(); ++i) {
+    if (Abbrevs.getData()[i].getAttribute() == Attr) {
+      DIEValue *V = Values[i];
+      assert(isa<DIEString>(V) && "String requested. Not a string.");
+      DIEString *S = cast<DIEString>(V);
+      return S->getString();
+    }
+  }
+  return StringRef("");
+}
+
+/// \brief Adds the string in \p Str to the hash. This also hashes
+/// a trailing NULL with the string.
+void DIEHash::addString(StringRef Str) {
+  DEBUG(dbgs() << "Adding string " << Str << " to hash.\n");
+  Hash.update(Str);
+  Hash.update(makeArrayRef((uint8_t)'\0'));
+}
+
+// FIXME: The LEB128 routines are copied and only slightly modified out of
+// LEB128.h.
+
+/// \brief Adds the unsigned in \p Value to the hash encoded as a ULEB128.
+void DIEHash::addULEB128(uint64_t Value) {
+  DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
+  do {
+    uint8_t Byte = Value & 0x7f;
+    Value >>= 7;
+    if (Value != 0)
+      Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+    Hash.update(Byte);
+  } while (Value != 0);
+}
+
+void DIEHash::addSLEB128(int64_t Value) {
+  DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
+  bool More;
+  do {
+    uint8_t Byte = Value & 0x7f;
+    Value >>= 7;
+    More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) ||
+              ((Value == -1) && ((Byte & 0x40) != 0))));
+    if (More)
+      Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+    Hash.update(Byte);
+  } while (More);
+}
+
+/// \brief Including \p Parent adds the context of Parent to the hash..
+void DIEHash::addParentContext(const DIE &Parent) {
+
+  DEBUG(dbgs() << "Adding parent context to hash...\n");
+
+  // [7.27.2] For each surrounding type or namespace beginning with the
+  // outermost such construct...
+  SmallVector<const DIE *, 1> Parents;
+  const DIE *Cur = &Parent;
+  while (Cur->getTag() != dwarf::DW_TAG_compile_unit) {
+    Parents.push_back(Cur);
+    Cur = Cur->getParent();
+  }
+
+  // Reverse iterate over our list to go from the outermost construct to the
+  // innermost.
+  for (SmallVectorImpl<const DIE *>::reverse_iterator I = Parents.rbegin(),
+                                                      E = Parents.rend();
+       I != E; ++I) {
+    const DIE &Die = **I;
+
+    // ... Append the letter "C" to the sequence...
+    addULEB128('C');
+
+    // ... Followed by the DWARF tag of the construct...
+    addULEB128(Die.getTag());
+
+    // ... Then the name, taken from the DW_AT_name attribute.
+    StringRef Name = getDIEStringAttr(Die, dwarf::DW_AT_name);
+    DEBUG(dbgs() << "... adding context: " << Name << "\n");
+    if (!Name.empty())
+      addString(Name);
+  }
+}
+
+// Collect all of the attributes for a particular DIE in single structure.
+void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) {
+  const SmallVectorImpl<DIEValue *> &Values = Die.getValues();
+  const DIEAbbrev &Abbrevs = Die.getAbbrev();
+
+#define COLLECT_ATTR(NAME)                                                     \
+  case dwarf::NAME:                                                            \
+    Attrs.NAME.Val = Values[i];                                                \
+    Attrs.NAME.Desc = &Abbrevs.getData()[i];                                   \
+    break
+
+  for (size_t i = 0, e = Values.size(); i != e; ++i) {
+    DEBUG(dbgs() << "Attribute: "
+                 << dwarf::AttributeString(Abbrevs.getData()[i].getAttribute())
+                 << " added.\n");
+    switch (Abbrevs.getData()[i].getAttribute()) {
+    COLLECT_ATTR(DW_AT_name);
+    COLLECT_ATTR(DW_AT_accessibility);
+    COLLECT_ATTR(DW_AT_address_class);
+    COLLECT_ATTR(DW_AT_allocated);
+    COLLECT_ATTR(DW_AT_artificial);
+    COLLECT_ATTR(DW_AT_associated);
+    COLLECT_ATTR(DW_AT_binary_scale);
+    COLLECT_ATTR(DW_AT_bit_offset);
+    COLLECT_ATTR(DW_AT_bit_size);
+    COLLECT_ATTR(DW_AT_bit_stride);
+    COLLECT_ATTR(DW_AT_byte_size);
+    COLLECT_ATTR(DW_AT_byte_stride);
+    COLLECT_ATTR(DW_AT_const_expr);
+    COLLECT_ATTR(DW_AT_const_value);
+    COLLECT_ATTR(DW_AT_containing_type);
+    COLLECT_ATTR(DW_AT_count);
+    COLLECT_ATTR(DW_AT_data_bit_offset);
+    COLLECT_ATTR(DW_AT_data_location);
+    COLLECT_ATTR(DW_AT_data_member_location);
+    COLLECT_ATTR(DW_AT_decimal_scale);
+    COLLECT_ATTR(DW_AT_decimal_sign);
+    COLLECT_ATTR(DW_AT_default_value);
+    COLLECT_ATTR(DW_AT_digit_count);
+    COLLECT_ATTR(DW_AT_discr);
+    COLLECT_ATTR(DW_AT_discr_list);
+    COLLECT_ATTR(DW_AT_discr_value);
+    COLLECT_ATTR(DW_AT_encoding);
+    COLLECT_ATTR(DW_AT_enum_class);
+    COLLECT_ATTR(DW_AT_endianity);
+    COLLECT_ATTR(DW_AT_explicit);
+    COLLECT_ATTR(DW_AT_is_optional);
+    COLLECT_ATTR(DW_AT_location);
+    COLLECT_ATTR(DW_AT_lower_bound);
+    COLLECT_ATTR(DW_AT_mutable);
+    COLLECT_ATTR(DW_AT_ordering);
+    COLLECT_ATTR(DW_AT_picture_string);
+    COLLECT_ATTR(DW_AT_prototyped);
+    COLLECT_ATTR(DW_AT_small);
+    COLLECT_ATTR(DW_AT_segment);
+    COLLECT_ATTR(DW_AT_string_length);
+    COLLECT_ATTR(DW_AT_threads_scaled);
+    COLLECT_ATTR(DW_AT_upper_bound);
+    COLLECT_ATTR(DW_AT_use_location);
+    COLLECT_ATTR(DW_AT_use_UTF8);
+    COLLECT_ATTR(DW_AT_variable_parameter);
+    COLLECT_ATTR(DW_AT_virtuality);
+    COLLECT_ATTR(DW_AT_visibility);
+    COLLECT_ATTR(DW_AT_vtable_elem_location);
+    COLLECT_ATTR(DW_AT_type);
+    default:
+      break;
+    }
+  }
+}
+
+void DIEHash::hashShallowTypeReference(dwarf::Attribute Attribute,
+                                       const DIE &Entry, StringRef Name) {
+  // append the letter 'N'
+  addULEB128('N');
+
+  // the DWARF attribute code (DW_AT_type or DW_AT_friend),
+  addULEB128(Attribute);
+
+  // the context of the tag,
+  if (const DIE *Parent = Entry.getParent())
+    addParentContext(*Parent);
+
+  // the letter 'E',
+  addULEB128('E');
+
+  // and the name of the type.
+  addString(Name);
+
+  // Currently DW_TAG_friends are not used by Clang, but if they do become so,
+  // here's the relevant spec text to implement:
+  //
+  // For DW_TAG_friend, if the referenced entry is the DW_TAG_subprogram,
+  // the context is omitted and the name to be used is the ABI-specific name
+  // of the subprogram (e.g., the mangled linker name).
+}
+
+void DIEHash::hashRepeatedTypeReference(dwarf::Attribute Attribute,
+                                        unsigned DieNumber) {
+  // a) If T is in the list of [previously hashed types], use the letter
+  // 'R' as the marker
+  addULEB128('R');
+
+  addULEB128(Attribute);
+
+  // and use the unsigned LEB128 encoding of [the index of T in the
+  // list] as the attribute value;
+  addULEB128(DieNumber);
+}
+
+void DIEHash::hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
+                           const DIE &Entry) {
+  assert(Tag != dwarf::DW_TAG_friend && "No current LLVM clients emit friend "
+                                        "tags. Add support here when there's "
+                                        "a use case");
+  // Step 5
+  // If the tag in Step 3 is one of [the below tags]
+  if ((Tag == dwarf::DW_TAG_pointer_type ||
+       Tag == dwarf::DW_TAG_reference_type ||
+       Tag == dwarf::DW_TAG_rvalue_reference_type ||
+       Tag == dwarf::DW_TAG_ptr_to_member_type) &&
+      // and the referenced type (via the [below attributes])
+      // FIXME: This seems overly restrictive, and causes hash mismatches
+      // there's a decl/def difference in the containing type of a
+      // ptr_to_member_type, but it's what DWARF says, for some reason.
+      Attribute == dwarf::DW_AT_type) {
+    // ... has a DW_AT_name attribute,
+    StringRef Name = getDIEStringAttr(Entry, dwarf::DW_AT_name);
+    if (!Name.empty()) {
+      hashShallowTypeReference(Attribute, Entry, Name);
+      return;
+    }
+  }
+
+  unsigned &DieNumber = Numbering[&Entry];
+  if (DieNumber) {
+    hashRepeatedTypeReference(Attribute, DieNumber);
+    return;
+  }
+
+  // otherwise, b) use the letter 'T' as a the marker, ...
+  addULEB128('T');
+
+  addULEB128(Attribute);
+
+  // ... process the type T recursively by performing Steps 2 through 7, and
+  // use the result as the attribute value.
+  DieNumber = Numbering.size();
+  computeHash(Entry);
+}
+
+// Hash an individual attribute \param Attr based on the type of attribute and
+// the form.
+void DIEHash::hashAttribute(AttrEntry Attr, dwarf::Tag Tag) {
+  const DIEValue *Value = Attr.Val;
+  const DIEAbbrevData *Desc = Attr.Desc;
+  dwarf::Attribute Attribute = Desc->getAttribute();
+
+  // 7.27 Step 3
+  // ... An attribute that refers to another type entry T is processed as
+  // follows:
+  if (const DIEEntry *EntryAttr = dyn_cast<DIEEntry>(Value)) {
+    hashDIEEntry(Attribute, Tag, *EntryAttr->getEntry());
+    return;
+  }
+
+  // Other attribute values use the letter 'A' as the marker, ...
+  addULEB128('A');
+
+  addULEB128(Attribute);
+
+  // ... and the value consists of the form code (encoded as an unsigned LEB128
+  // value) followed by the encoding of the value according to the form code. To
+  // ensure reproducibility of the signature, the set of forms used in the
+  // signature computation is limited to the following: DW_FORM_sdata,
+  // DW_FORM_flag, DW_FORM_string, and DW_FORM_block.
+  switch (Desc->getForm()) {
+  case dwarf::DW_FORM_string:
+    llvm_unreachable(
+        "Add support for DW_FORM_string if we ever start emitting them again");
+  case dwarf::DW_FORM_GNU_str_index:
+  case dwarf::DW_FORM_strp:
+    addULEB128(dwarf::DW_FORM_string);
+    addString(cast<DIEString>(Value)->getString());
+    break;
+  case dwarf::DW_FORM_data1:
+  case dwarf::DW_FORM_data2:
+  case dwarf::DW_FORM_data4:
+  case dwarf::DW_FORM_data8:
+  case dwarf::DW_FORM_udata:
+    addULEB128(dwarf::DW_FORM_sdata);
+    addSLEB128((int64_t)cast<DIEInteger>(Value)->getValue());
+    break;
+  default:
+    llvm_unreachable("Add support for additional forms");
+  }
+}
+
+// Go through the attributes from \param Attrs in the order specified in 7.27.4
+// and hash them.
+void DIEHash::hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag) {
+#define ADD_ATTR(ATTR)                                                         \
+  {                                                                            \
+    if (ATTR.Val != 0)                                                         \
+      hashAttribute(ATTR, Tag);                                                \
+  }
+
+  ADD_ATTR(Attrs.DW_AT_name);
+  ADD_ATTR(Attrs.DW_AT_accessibility);
+  ADD_ATTR(Attrs.DW_AT_address_class);
+  ADD_ATTR(Attrs.DW_AT_allocated);
+  ADD_ATTR(Attrs.DW_AT_artificial);
+  ADD_ATTR(Attrs.DW_AT_associated);
+  ADD_ATTR(Attrs.DW_AT_binary_scale);
+  ADD_ATTR(Attrs.DW_AT_bit_offset);
+  ADD_ATTR(Attrs.DW_AT_bit_size);
+  ADD_ATTR(Attrs.DW_AT_bit_stride);
+  ADD_ATTR(Attrs.DW_AT_byte_size);
+  ADD_ATTR(Attrs.DW_AT_byte_stride);
+  ADD_ATTR(Attrs.DW_AT_const_expr);
+  ADD_ATTR(Attrs.DW_AT_const_value);
+  ADD_ATTR(Attrs.DW_AT_containing_type);
+  ADD_ATTR(Attrs.DW_AT_count);
+  ADD_ATTR(Attrs.DW_AT_data_bit_offset);
+  ADD_ATTR(Attrs.DW_AT_data_location);
+  ADD_ATTR(Attrs.DW_AT_data_member_location);
+  ADD_ATTR(Attrs.DW_AT_decimal_scale);
+  ADD_ATTR(Attrs.DW_AT_decimal_sign);
+  ADD_ATTR(Attrs.DW_AT_default_value);
+  ADD_ATTR(Attrs.DW_AT_digit_count);
+  ADD_ATTR(Attrs.DW_AT_discr);
+  ADD_ATTR(Attrs.DW_AT_discr_list);
+  ADD_ATTR(Attrs.DW_AT_discr_value);
+  ADD_ATTR(Attrs.DW_AT_encoding);
+  ADD_ATTR(Attrs.DW_AT_enum_class);
+  ADD_ATTR(Attrs.DW_AT_endianity);
+  ADD_ATTR(Attrs.DW_AT_explicit);
+  ADD_ATTR(Attrs.DW_AT_is_optional);
+  ADD_ATTR(Attrs.DW_AT_location);
+  ADD_ATTR(Attrs.DW_AT_lower_bound);
+  ADD_ATTR(Attrs.DW_AT_mutable);
+  ADD_ATTR(Attrs.DW_AT_ordering);
+  ADD_ATTR(Attrs.DW_AT_picture_string);
+  ADD_ATTR(Attrs.DW_AT_prototyped);
+  ADD_ATTR(Attrs.DW_AT_small);
+  ADD_ATTR(Attrs.DW_AT_segment);
+  ADD_ATTR(Attrs.DW_AT_string_length);
+  ADD_ATTR(Attrs.DW_AT_threads_scaled);
+  ADD_ATTR(Attrs.DW_AT_upper_bound);
+  ADD_ATTR(Attrs.DW_AT_use_location);
+  ADD_ATTR(Attrs.DW_AT_use_UTF8);
+  ADD_ATTR(Attrs.DW_AT_variable_parameter);
+  ADD_ATTR(Attrs.DW_AT_virtuality);
+  ADD_ATTR(Attrs.DW_AT_visibility);
+  ADD_ATTR(Attrs.DW_AT_vtable_elem_location);
+  ADD_ATTR(Attrs.DW_AT_type);
+
+  // FIXME: Add the extended attributes.
+}
+
+// Add all of the attributes for \param Die to the hash.
+void DIEHash::addAttributes(const DIE &Die) {
+  DIEAttrs Attrs = {};
+  collectAttributes(Die, Attrs);
+  hashAttributes(Attrs, Die.getTag());
+}
+
+void DIEHash::hashNestedType(const DIE &Die, StringRef Name) {
+  // 7.27 Step 7
+  // ... append the letter 'S',
+  addULEB128('S');
+
+  // the tag of C,
+  addULEB128(Die.getTag());
+
+  // and the name.
+  addString(Name);
+}
+
+// Compute the hash of a DIE. This is based on the type signature computation
+// given in section 7.27 of the DWARF4 standard. It is the md5 hash of a
+// flattened description of the DIE.
+void DIEHash::computeHash(const DIE &Die) {
+  // Append the letter 'D', followed by the DWARF tag of the DIE.
+  addULEB128('D');
+  addULEB128(Die.getTag());
+
+  // Add each of the attributes of the DIE.
+  addAttributes(Die);
+
+  // Then hash each of the children of the DIE.
+  for (std::vector<DIE *>::const_iterator I = Die.getChildren().begin(),
+                                          E = Die.getChildren().end();
+       I != E; ++I) {
+    // 7.27 Step 7
+    // If C is a nested type entry or a member function entry, ...
+    if (isType((*I)->getTag()) || (*I)->getTag() == dwarf::DW_TAG_subprogram) {
+      StringRef Name = getDIEStringAttr(**I, dwarf::DW_AT_name);
+      // ... and has a DW_AT_name attribute
+      if (!Name.empty()) {
+        hashNestedType(**I, Name);
+        continue;
+      }
+    }
+    computeHash(**I);
+  }
+
+  // Following the last (or if there are no children), append a zero byte.
+  Hash.update(makeArrayRef((uint8_t)'\0'));
+}
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is the md5 hash of a flattened description of the DIE
+/// with the exception that we are hashing only the context and the name of the
+/// type.
+uint64_t DIEHash::computeDIEODRSignature(const DIE &Die) {
+
+  // Add the contexts to the hash. We won't be computing the ODR hash for
+  // function local types so it's safe to use the generic context hashing
+  // algorithm here.
+  // FIXME: If we figure out how to account for linkage in some way we could
+  // actually do this with a slight modification to the parent hash algorithm.
+  if (const DIE *Parent = Die.getParent())
+    addParentContext(*Parent);
+
+  // Add the current DIE information.
+
+  // Add the DWARF tag of the DIE.
+  addULEB128(Die.getTag());
+
+  // Add the name of the type to the hash.
+  addString(getDIEStringAttr(Die, dwarf::DW_AT_name));
+
+  // Now get the result.
+  MD5::MD5Result Result;
+  Hash.final(Result);
+
+  // ... take the least significant 8 bytes and return those. Our MD5
+  // implementation always returns its results in little endian, swap bytes
+  // appropriately.
+  return *reinterpret_cast<support::ulittle64_t *>(Result + 8);
+}
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
+/// with the inclusion of the full CU and all top level CU entities.
+// TODO: Initialize the type chain at 0 instead of 1 for CU signatures.
+uint64_t DIEHash::computeCUSignature(const DIE &Die) {
+  Numbering.clear();
+  Numbering[&Die] = 1;
+
+  // Hash the DIE.
+  computeHash(Die);
+
+  // Now return the result.
+  MD5::MD5Result Result;
+  Hash.final(Result);
+
+  // ... take the least significant 8 bytes and return those. Our MD5
+  // implementation always returns its results in little endian, swap bytes
+  // appropriately.
+  return *reinterpret_cast<support::ulittle64_t *>(Result + 8);
+}
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
+/// with the inclusion of additional forms not specifically called out in the
+/// standard.
+uint64_t DIEHash::computeTypeSignature(const DIE &Die) {
+  Numbering.clear();
+  Numbering[&Die] = 1;
+
+  if (const DIE *Parent = Die.getParent())
+    addParentContext(*Parent);
+
+  // Hash the DIE.
+  computeHash(Die);
+
+  // Now return the result.
+  MD5::MD5Result Result;
+  Hash.final(Result);
+
+  // ... take the least significant 8 bytes and return those. Our MD5
+  // implementation always returns its results in little endian, swap bytes
+  // appropriately.
+  return *reinterpret_cast<support::ulittle64_t *>(Result + 8);
+}
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h
new file mode 100644
index 0000000..f0c4ef9
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -0,0 +1,147 @@
+//===-- llvm/CodeGen/DIEHash.h - Dwarf Hashing Framework -------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for DWARF4 hashing of DIEs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DIE.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/MD5.h"
+
+namespace llvm {
+
+class CompileUnit;
+
+/// \brief An object containing the capability of hashing and adding hash
+/// attributes onto a DIE.
+class DIEHash {
+  // The entry for a particular attribute.
+  struct AttrEntry {
+    const DIEValue *Val;
+    const DIEAbbrevData *Desc;
+  };
+
+  // Collection of all attributes used in hashing a particular DIE.
+  struct DIEAttrs {
+    AttrEntry DW_AT_name;
+    AttrEntry DW_AT_accessibility;
+    AttrEntry DW_AT_address_class;
+    AttrEntry DW_AT_allocated;
+    AttrEntry DW_AT_artificial;
+    AttrEntry DW_AT_associated;
+    AttrEntry DW_AT_binary_scale;
+    AttrEntry DW_AT_bit_offset;
+    AttrEntry DW_AT_bit_size;
+    AttrEntry DW_AT_bit_stride;
+    AttrEntry DW_AT_byte_size;
+    AttrEntry DW_AT_byte_stride;
+    AttrEntry DW_AT_const_expr;
+    AttrEntry DW_AT_const_value;
+    AttrEntry DW_AT_containing_type;
+    AttrEntry DW_AT_count;
+    AttrEntry DW_AT_data_bit_offset;
+    AttrEntry DW_AT_data_location;
+    AttrEntry DW_AT_data_member_location;
+    AttrEntry DW_AT_decimal_scale;
+    AttrEntry DW_AT_decimal_sign;
+    AttrEntry DW_AT_default_value;
+    AttrEntry DW_AT_digit_count;
+    AttrEntry DW_AT_discr;
+    AttrEntry DW_AT_discr_list;
+    AttrEntry DW_AT_discr_value;
+    AttrEntry DW_AT_encoding;
+    AttrEntry DW_AT_enum_class;
+    AttrEntry DW_AT_endianity;
+    AttrEntry DW_AT_explicit;
+    AttrEntry DW_AT_is_optional;
+    AttrEntry DW_AT_location;
+    AttrEntry DW_AT_lower_bound;
+    AttrEntry DW_AT_mutable;
+    AttrEntry DW_AT_ordering;
+    AttrEntry DW_AT_picture_string;
+    AttrEntry DW_AT_prototyped;
+    AttrEntry DW_AT_small;
+    AttrEntry DW_AT_segment;
+    AttrEntry DW_AT_string_length;
+    AttrEntry DW_AT_threads_scaled;
+    AttrEntry DW_AT_upper_bound;
+    AttrEntry DW_AT_use_location;
+    AttrEntry DW_AT_use_UTF8;
+    AttrEntry DW_AT_variable_parameter;
+    AttrEntry DW_AT_virtuality;
+    AttrEntry DW_AT_visibility;
+    AttrEntry DW_AT_vtable_elem_location;
+    AttrEntry DW_AT_type;
+
+    // Insert any additional ones here...
+  };
+
+public:
+  /// \brief Computes the ODR signature.
+  uint64_t computeDIEODRSignature(const DIE &Die);
+
+  /// \brief Computes the CU signature.
+  uint64_t computeCUSignature(const DIE &Die);
+
+  /// \brief Computes the type signature.
+  uint64_t computeTypeSignature(const DIE &Die);
+
+  // Helper routines to process parts of a DIE.
+private:
+  /// \brief Adds the parent context of \param Die to the hash.
+  void addParentContext(const DIE &Die);
+
+  /// \brief Adds the attributes of \param Die to the hash.
+  void addAttributes(const DIE &Die);
+
+  /// \brief Computes the full DWARF4 7.27 hash of the DIE.
+  void computeHash(const DIE &Die);
+
+  // Routines that add DIEValues to the hash.
+private:
+  /// \brief Encodes and adds \param Value to the hash as a ULEB128.
+  void addULEB128(uint64_t Value);
+
+  /// \brief Encodes and adds \param Value to the hash as a SLEB128.
+  void addSLEB128(int64_t Value);
+
+  /// \brief Adds \param Str to the hash and includes a NULL byte.
+  void addString(StringRef Str);
+
+  /// \brief Collects the attributes of DIE \param Die into the \param Attrs
+  /// structure.
+  void collectAttributes(const DIE &Die, DIEAttrs &Attrs);
+
+  /// \brief Hashes the attributes in \param Attrs in order.
+  void hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag);
+
+  /// \brief Hashes an individual attribute.
+  void hashAttribute(AttrEntry Attr, dwarf::Tag Tag);
+
+  /// \brief Hashes an attribute that refers to another DIE.
+  void hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
+                    const DIE &Entry);
+
+  /// \brief Hashes a reference to a named type in such a way that is
+  /// independent of whether that type is described by a declaration or a
+  /// definition.
+  void hashShallowTypeReference(dwarf::Attribute Attribute, const DIE &Entry,
+                                StringRef Name);
+
+  /// \brief Hashes a reference to a previously referenced type DIE.
+  void hashRepeatedTypeReference(dwarf::Attribute Attribute, unsigned DieNumber);
+
+  void hashNestedType(const DIE &Die, StringRef Name);
+
+private:
+  MD5 Hash;
+  DenseMap<const DIE *, unsigned> Numbering;
+};
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
index a82a149..689aeda 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
@@ -24,27 +24,14 @@
 
 using namespace llvm;
 
-const char *DwarfAccelTable::Atom::AtomTypeString(enum AtomType AT) {
-  switch (AT) {
-  case eAtomTypeNULL: return "eAtomTypeNULL";
-  case eAtomTypeDIEOffset: return "eAtomTypeDIEOffset";
-  case eAtomTypeCUOffset: return "eAtomTypeCUOffset";
-  case eAtomTypeTag: return "eAtomTypeTag";
-  case eAtomTypeNameFlags: return "eAtomTypeNameFlags";
-  case eAtomTypeTypeFlags: return "eAtomTypeTypeFlags";
-  }
-  llvm_unreachable("invalid AtomType!");
-}
-
 // The length of the header data is always going to be 4 + 4 + 4*NumAtoms.
-DwarfAccelTable::DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom> atomList) :
-  Header(8 + (atomList.size() * 4)),
-  HeaderData(atomList),
-  Entries(Allocator) { }
+DwarfAccelTable::DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom> atomList)
+    : Header(8 + (atomList.size() * 4)), HeaderData(atomList),
+      Entries(Allocator) {}
 
-DwarfAccelTable::~DwarfAccelTable() { }
+DwarfAccelTable::~DwarfAccelTable() {}
 
-void DwarfAccelTable::AddName(StringRef Name, DIE* die, char Flags) {
+void DwarfAccelTable::AddName(StringRef Name, DIE *die, char Flags) {
   assert(Data.empty() && "Already finalized!");
   // If the string is in the list already then add this die to the list
   // otherwise add a new one.
@@ -59,13 +46,16 @@ void DwarfAccelTable::ComputeBucketCount(void) {
     uniques[i] = Data[i]->HashValue;
   array_pod_sort(uniques.begin(), uniques.end());
   std::vector<uint32_t>::iterator p =
-    std::unique(uniques.begin(), uniques.end());
+      std::unique(uniques.begin(), uniques.end());
   uint32_t num = std::distance(uniques.begin(), p);
 
   // Then compute the bucket size, minimum of 1 bucket.
-  if (num > 1024) Header.bucket_count = num/4;
-  if (num > 16) Header.bucket_count = num/2;
-  else Header.bucket_count = num > 0 ? num : 1;
+  if (num > 1024)
+    Header.bucket_count = num / 4;
+  if (num > 16)
+    Header.bucket_count = num / 2;
+  else
+    Header.bucket_count = num > 0 ? num : 1;
 
   Header.hashes_count = num;
 }
@@ -78,13 +68,13 @@ static bool compareDIEs(const DwarfAccelTable::HashDataContents *A,
 
 void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, StringRef Prefix) {
   // Create the individual hash data outputs.
-  for (StringMap<DataArray>::iterator
-         EI = Entries.begin(), EE = Entries.end(); EI != EE; ++EI) {
+  for (StringMap<DataArray>::iterator EI = Entries.begin(), EE = Entries.end();
+       EI != EE; ++EI) {
 
     // Unique the entries.
     std::stable_sort(EI->second.begin(), EI->second.end(), compareDIEs);
     EI->second.erase(std::unique(EI->second.begin(), EI->second.end()),
-                       EI->second.end());
+                     EI->second.end());
 
     HashData *Entry = new (Allocator) HashData(EI->getKey(), EI->second);
     Data.push_back(Entry);
@@ -126,7 +116,7 @@ void DwarfAccelTable::EmitHeader(AsmPrinter *Asm) {
   Asm->EmitInt32(HeaderData.Atoms.size());
   for (size_t i = 0; i < HeaderData.Atoms.size(); i++) {
     Atom A = HeaderData.Atoms[i];
-    Asm->OutStreamer.AddComment(Atom::AtomTypeString(A.type));
+    Asm->OutStreamer.AddComment(dwarf::AtomTypeString(A.type));
     Asm->EmitInt16(A.type);
     Asm->OutStreamer.AddComment(dwarf::FormEncodingString(A.form));
     Asm->EmitInt16(A.form);
@@ -152,7 +142,8 @@ void DwarfAccelTable::EmitBuckets(AsmPrinter *Asm) {
 void DwarfAccelTable::EmitHashes(AsmPrinter *Asm) {
   for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
     for (HashList::const_iterator HI = Buckets[i].begin(),
-           HE = Buckets[i].end(); HI != HE; ++HI) {
+                                  HE = Buckets[i].end();
+         HI != HE; ++HI) {
       Asm->OutStreamer.AddComment("Hash in Bucket " + Twine(i));
       Asm->EmitInt32((*HI)->HashValue);
     }
@@ -166,13 +157,13 @@ void DwarfAccelTable::EmitHashes(AsmPrinter *Asm) {
 void DwarfAccelTable::EmitOffsets(AsmPrinter *Asm, MCSymbol *SecBegin) {
   for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
     for (HashList::const_iterator HI = Buckets[i].begin(),
-           HE = Buckets[i].end(); HI != HE; ++HI) {
+                                  HE = Buckets[i].end();
+         HI != HE; ++HI) {
       Asm->OutStreamer.AddComment("Offset in Bucket " + Twine(i));
       MCContext &Context = Asm->OutStreamer.getContext();
-      const MCExpr *Sub =
-        MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create((*HI)->Sym, Context),
-                                MCSymbolRefExpr::Create(SecBegin, Context),
-                                Context);
+      const MCExpr *Sub = MCBinaryExpr::CreateSub(
+          MCSymbolRefExpr::Create((*HI)->Sym, Context),
+          MCSymbolRefExpr::Create(SecBegin, Context), Context);
       Asm->OutStreamer.EmitValue(Sub, sizeof(uint32_t));
     }
   }
@@ -185,7 +176,8 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfUnits *D) {
   uint64_t PrevHash = UINT64_MAX;
   for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
     for (HashList::const_iterator HI = Buckets[i].begin(),
-           HE = Buckets[i].end(); HI != HE; ++HI) {
+                                  HE = Buckets[i].end();
+         HI != HE; ++HI) {
       // Remember to emit the label for our offset.
       Asm->OutStreamer.EmitLabel((*HI)->Sym);
       Asm->OutStreamer.AddComment((*HI)->Str);
@@ -193,8 +185,9 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfUnits *D) {
                              D->getStringPoolSym());
       Asm->OutStreamer.AddComment("Num DIEs");
       Asm->EmitInt32((*HI)->Data.size());
-      for (ArrayRef<HashDataContents*>::const_iterator
-             DI = (*HI)->Data.begin(), DE = (*HI)->Data.end();
+      for (ArrayRef<HashDataContents *>::const_iterator
+               DI = (*HI)->Data.begin(),
+               DE = (*HI)->Data.end();
            DI != DE; ++DI) {
         // Emit the DIE offset
         Asm->EmitInt32((*DI)->Die->getOffset());
@@ -214,8 +207,7 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfUnits *D) {
 }
 
 // Emit the entire data structure to the output file.
-void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin,
-                           DwarfUnits *D) {
+void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin, DwarfUnits *D) {
   // Emit the header.
   EmitHeader(Asm);
 
@@ -239,11 +231,12 @@ void DwarfAccelTable::print(raw_ostream &O) {
   HeaderData.print(O);
 
   O << "Entries: \n";
-  for (StringMap<DataArray>::const_iterator
-         EI = Entries.begin(), EE = Entries.end(); EI != EE; ++EI) {
+  for (StringMap<DataArray>::const_iterator EI = Entries.begin(),
+                                            EE = Entries.end();
+       EI != EE; ++EI) {
     O << "Name: " << EI->getKeyData() << "\n";
     for (DataArray::const_iterator DI = EI->second.begin(),
-           DE = EI->second.end();
+                                   DE = EI->second.end();
          DI != DE; ++DI)
       (*DI)->print(O);
   }
@@ -251,14 +244,14 @@ void DwarfAccelTable::print(raw_ostream &O) {
   O << "Buckets and Hashes: \n";
   for (size_t i = 0, e = Buckets.size(); i < e; ++i)
     for (HashList::const_iterator HI = Buckets[i].begin(),
-           HE = Buckets[i].end(); HI != HE; ++HI)
+                                  HE = Buckets[i].end();
+         HI != HE; ++HI)
       (*HI)->print(O);
 
   O << "Data: \n";
-    for (std::vector<HashData*>::const_iterator
-           DI = Data.begin(), DE = Data.end(); DI != DE; ++DI)
-      (*DI)->print(O);
-
-
+  for (std::vector<HashData *>::const_iterator DI = Data.begin(),
+                                               DE = Data.end();
+       DI != DE; ++DI)
+    (*DI)->print(O);
 }
 #endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
index 3ef1dc5..7627313 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
@@ -67,11 +67,7 @@ class DwarfUnits;
 
 class DwarfAccelTable {
 
-  enum HashFunctionType {
-    eHashFunctionDJB = 0u
-  };
-
-  static uint32_t HashDJB (StringRef Str) {
+  static uint32_t HashDJB(StringRef Str) {
     uint32_t h = 5381;
     for (unsigned i = 0, e = Str.size(); i != e; ++i)
       h = ((h << 5) + h) + Str[i];
@@ -80,25 +76,25 @@ class DwarfAccelTable {
 
   // Helper function to compute the number of buckets needed based on
   // the number of unique hashes.
-  void ComputeBucketCount (void);
+  void ComputeBucketCount(void);
 
   struct TableHeader {
-    uint32_t   magic;           // 'HASH' magic value to allow endian detection
-    uint16_t   version;         // Version number.
-    uint16_t   hash_function;   // The hash function enumeration that was used.
-    uint32_t   bucket_count;    // The number of buckets in this hash table.
-    uint32_t   hashes_count;    // The total number of unique hash values
-                                // and hash data offsets in this table.
-    uint32_t   header_data_len; // The bytes to skip to get to the hash
-                                // indexes (buckets) for correct alignment.
+    uint32_t magic;           // 'HASH' magic value to allow endian detection
+    uint16_t version;         // Version number.
+    uint16_t hash_function;   // The hash function enumeration that was used.
+    uint32_t bucket_count;    // The number of buckets in this hash table.
+    uint32_t hashes_count;    // The total number of unique hash values
+                              // and hash data offsets in this table.
+    uint32_t header_data_len; // The bytes to skip to get to the hash
+                              // indexes (buckets) for correct alignment.
     // Also written to disk is the implementation specific header data.
 
     static const uint32_t MagicHash = 0x48415348;
 
-    TableHeader (uint32_t data_len) :
-      magic (MagicHash), version (1), hash_function (eHashFunctionDJB),
-      bucket_count (0), hashes_count (0), header_data_len (data_len)
-    {}
+    TableHeader(uint32_t data_len)
+        : magic(MagicHash), version(1),
+          hash_function(dwarf::DW_hash_function_djb), bucket_count(0),
+          hashes_count(0), header_data_len(data_len) {}
 
 #ifndef NDEBUG
     void print(raw_ostream &O) {
@@ -124,62 +120,38 @@ public:
   // uint32_t die_offset_base
   // uint32_t atom_count
   // atom_count Atoms
-  enum AtomType {
-    eAtomTypeNULL       = 0u,
-    eAtomTypeDIEOffset  = 1u,   // DIE offset, check form for encoding
-    eAtomTypeCUOffset   = 2u,   // DIE offset of the compiler unit header that
-                                // contains the item in question
-    eAtomTypeTag        = 3u,   // DW_TAG_xxx value, should be encoded as
-                                // DW_FORM_data1 (if no tags exceed 255) or
-                                // DW_FORM_data2.
-    eAtomTypeNameFlags  = 4u,   // Flags from enum NameFlags
-    eAtomTypeTypeFlags  = 5u    // Flags from enum TypeFlags
-  };
-
-  enum TypeFlags {
-    eTypeFlagClassMask = 0x0000000fu,
-
-    // Always set for C++, only set for ObjC if this is the
-    // @implementation for a class.
-    eTypeFlagClassIsImplementation  = ( 1u << 1 )
-  };
 
   // Make these public so that they can be used as a general interface to
   // the class.
   struct Atom {
-    AtomType type; // enum AtomType
+    uint16_t type; // enum AtomType
     uint16_t form; // DWARF DW_FORM_ defines
 
-    Atom(AtomType type, uint16_t form) : type(type), form(form) {}
-    static const char * AtomTypeString(enum AtomType);
+    Atom(uint16_t type, uint16_t form) : type(type), form(form) {}
 #ifndef NDEBUG
     void print(raw_ostream &O) {
-      O << "Type: " << AtomTypeString(type) << "\n"
+      O << "Type: " << dwarf::AtomTypeString(type) << "\n"
         << "Form: " << dwarf::FormEncodingString(form) << "\n";
     }
-    void dump() {
-      print(dbgs());
-    }
+    void dump() { print(dbgs()); }
 #endif
   };
 
- private:
+private:
   struct TableHeaderData {
     uint32_t die_offset_base;
     SmallVector<Atom, 1> Atoms;
 
     TableHeaderData(ArrayRef<Atom> AtomList, uint32_t offset = 0)
-      : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) { }
+        : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) {}
 
 #ifndef NDEBUG
-    void print (raw_ostream &O) {
+    void print(raw_ostream &O) {
       O << "die_offset_base: " << die_offset_base << "\n";
       for (size_t i = 0; i < Atoms.size(); i++)
         Atoms[i].print(O);
     }
-    void dump() {
-      print(dbgs());
-    }
+    void dump() { print(dbgs()); }
 #endif
   };
 
@@ -193,37 +165,38 @@ public:
   // HashData[hash_data_count]
 public:
   struct HashDataContents {
-    DIE *Die; // Offsets
+    DIE *Die;   // Offsets
     char Flags; // Specific flags to output
 
-    HashDataContents(DIE *D, char Flags) :
-      Die(D),
-      Flags(Flags) { }
-    #ifndef NDEBUG
+    HashDataContents(DIE *D, char Flags) : Die(D), Flags(Flags) {}
+#ifndef NDEBUG
     void print(raw_ostream &O) const {
       O << "  Offset: " << Die->getOffset() << "\n";
       O << "  Tag: " << dwarf::TagString(Die->getTag()) << "\n";
       O << "  Flags: " << Flags << "\n";
     }
-    #endif
+#endif
   };
+
 private:
   struct HashData {
     StringRef Str;
     uint32_t HashValue;
     MCSymbol *Sym;
-    ArrayRef<HashDataContents*> Data; // offsets
-    HashData(StringRef S, ArrayRef<HashDataContents*> Data)
-      : Str(S), Data(Data) {
+    ArrayRef<HashDataContents *> Data; // offsets
+    HashData(StringRef S, ArrayRef<HashDataContents *> Data)
+        : Str(S), Data(Data) {
       HashValue = DwarfAccelTable::HashDJB(S);
     }
-    #ifndef NDEBUG
+#ifndef NDEBUG
     void print(raw_ostream &O) {
       O << "Name: " << Str << "\n";
       O << "  Hash Value: " << format("0x%x", HashValue) << "\n";
-      O << "  Symbol: " ;
-      if (Sym) Sym->print(O);
-      else O << "<none>";
+      O << "  Symbol: ";
+      if (Sym)
+        Sym->print(O);
+      else
+        O << "<none>";
       O << "\n";
       for (size_t i = 0; i < Data.size(); i++) {
         O << "  Offset: " << Data[i]->Die->getOffset() << "\n";
@@ -231,14 +204,12 @@ private:
         O << "  Flags: " << Data[i]->Flags << "\n";
       }
     }
-    void dump() {
-      print(dbgs());
-    }
-    #endif
+    void dump() { print(dbgs()); }
+#endif
   };
 
-  DwarfAccelTable(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
-  void operator=(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
+  DwarfAccelTable(const DwarfAccelTable &) LLVM_DELETED_FUNCTION;
+  void operator=(const DwarfAccelTable &) LLVM_DELETED_FUNCTION;
 
   // Internal Functions
   void EmitHeader(AsmPrinter *);
@@ -253,24 +224,24 @@ private:
   // Output Variables
   TableHeader Header;
   TableHeaderData HeaderData;
-  std::vector<HashData*> Data;
+  std::vector<HashData *> Data;
 
   // String Data
-  typedef std::vector<HashDataContents*> DataArray;
-  typedef StringMap<DataArray, BumpPtrAllocator&> StringEntries;
+  typedef std::vector<HashDataContents *> DataArray;
+  typedef StringMap<DataArray, BumpPtrAllocator &> StringEntries;
   StringEntries Entries;
 
   // Buckets/Hashes/Offsets
-  typedef std::vector<HashData*> HashList;
+  typedef std::vector<HashData *> HashList;
   typedef std::vector<HashList> BucketList;
   BucketList Buckets;
   HashList Hashes;
 
   // Public Implementation
- public:
+public:
   DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom>);
   ~DwarfAccelTable();
-  void AddName(StringRef, DIE*, char = 0);
+  void AddName(StringRef, DIE *, char = 0);
   void FinalizeTable(AsmPrinter *, StringRef);
   void Emit(AsmPrinter *, MCSymbol *, DwarfUnits *);
 #ifndef NDEBUG
@@ -278,6 +249,5 @@ private:
   void dump() { print(dbgs()); }
 #endif
 };
-
 }
 #endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index fec5ced..8918f3d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -68,7 +68,7 @@ void DwarfCFIException::EndModule() {
   for (size_t i = 0, e = Personalities.size(); i != e; ++i) {
     if (!Personalities[i])
       continue;
-    MCSymbol *Sym = Asm->Mang->getSymbol(Personalities[i]);
+    MCSymbol *Sym = Asm->getSymbol(Personalities[i]);
     TLOF.emitPersonalityValue(Asm->OutStreamer, Asm->TM, Sym);
     AtLeastOne = true;
   }
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index df8ca17..a6ff953 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -22,8 +22,8 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/Target/Mangler.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
@@ -33,12 +33,12 @@
 using namespace llvm;
 
 /// CompileUnit - Compile unit constructor.
-CompileUnit::CompileUnit(unsigned UID, unsigned L, DIE *D, const MDNode *N,
+CompileUnit::CompileUnit(unsigned UID, DIE *D, DICompileUnit Node,
                          AsmPrinter *A, DwarfDebug *DW, DwarfUnits *DWU)
-  : UniqueID(UID), Language(L), CUDie(D), Asm(A), DD(DW), DU(DWU),
-    IndexTyDie(0), DebugInfoOffset(0) {
+    : UniqueID(UID), Node(Node), CUDie(D), Asm(A), DD(DW), DU(DWU),
+      IndexTyDie(0), DebugInfoOffset(0) {
   DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1);
-  insertDIE(N, D);
+  insertDIE(Node, D);
 }
 
 /// ~CompileUnit - Destructor for compile unit.
@@ -57,7 +57,7 @@ DIEEntry *CompileUnit::createDIEEntry(DIE *Entry) {
 /// getDefaultLowerBound - Return the default lower bound for an array. If the
 /// DWARF version doesn't handle the language, return -1.
 int64_t CompileUnit::getDefaultLowerBound() const {
-  switch (Language) {
+  switch (getLanguage()) {
   default:
     break;
 
@@ -98,32 +98,71 @@ int64_t CompileUnit::getDefaultLowerBound() const {
   return -1;
 }
 
+/// Check whether the DIE for this MDNode can be shared across CUs.
+static bool isShareableAcrossCUs(DIDescriptor D) {
+  // When the MDNode can be part of the type system, the DIE can be
+  // shared across CUs.
+  return D.isType() ||
+         (D.isSubprogram() && !DISubprogram(D).isDefinition());
+}
+
+/// getDIE - Returns the debug information entry map slot for the
+/// specified debug variable. We delegate the request to DwarfDebug
+/// when the DIE for this MDNode can be shared across CUs. The mappings
+/// will be kept in DwarfDebug for shareable DIEs.
+DIE *CompileUnit::getDIE(DIDescriptor D) const {
+  if (isShareableAcrossCUs(D))
+    return DD->getDIE(D);
+  return MDNodeToDieMap.lookup(D);
+}
+
+/// insertDIE - Insert DIE into the map. We delegate the request to DwarfDebug
+/// when the DIE for this MDNode can be shared across CUs. The mappings
+/// will be kept in DwarfDebug for shareable DIEs.
+void CompileUnit::insertDIE(DIDescriptor Desc, DIE *D) {
+  if (isShareableAcrossCUs(Desc)) {
+    DD->insertDIE(Desc, D);
+    return;
+  }
+  MDNodeToDieMap.insert(std::make_pair(Desc, D));
+}
+
 /// addFlag - Add a flag that is true.
-void CompileUnit::addFlag(DIE *Die, unsigned Attribute) {
-  if (!DD->useDarwinGDBCompat())
-    Die->addValue(Attribute, dwarf::DW_FORM_flag_present,
-                  DIEIntegerOne);
+void CompileUnit::addFlag(DIE *Die, dwarf::Attribute Attribute) {
+  if (DD->getDwarfVersion() >= 4)
+    Die->addValue(Attribute, dwarf::DW_FORM_flag_present, DIEIntegerOne);
   else
-    addUInt(Die, Attribute, dwarf::DW_FORM_flag, 1);
+    Die->addValue(Attribute, dwarf::DW_FORM_flag, DIEIntegerOne);
 }
 
 /// addUInt - Add an unsigned integer attribute data and value.
 ///
-void CompileUnit::addUInt(DIE *Die, unsigned Attribute,
-                          unsigned Form, uint64_t Integer) {
-  if (!Form) Form = DIEInteger::BestForm(false, Integer);
-  DIEValue *Value = Integer == 1 ?
-    DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer);
-  Die->addValue(Attribute, Form, Value);
+void CompileUnit::addUInt(DIE *Die, dwarf::Attribute Attribute,
+                          Optional<dwarf::Form> Form, uint64_t Integer) {
+  if (!Form)
+    Form = DIEInteger::BestForm(false, Integer);
+  DIEValue *Value = Integer == 1 ? DIEIntegerOne : new (DIEValueAllocator)
+                        DIEInteger(Integer);
+  Die->addValue(Attribute, *Form, Value);
+}
+
+void CompileUnit::addUInt(DIEBlock *Block, dwarf::Form Form, uint64_t Integer) {
+  addUInt(Block, (dwarf::Attribute)0, Form, Integer);
 }
 
 /// addSInt - Add an signed integer attribute data and value.
 ///
-void CompileUnit::addSInt(DIE *Die, unsigned Attribute,
-                          unsigned Form, int64_t Integer) {
-  if (!Form) Form = DIEInteger::BestForm(true, Integer);
+void CompileUnit::addSInt(DIE *Die, dwarf::Attribute Attribute,
+                          Optional<dwarf::Form> Form, int64_t Integer) {
+  if (!Form)
+    Form = DIEInteger::BestForm(true, Integer);
   DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer);
-  Die->addValue(Attribute, Form, Value);
+  Die->addValue(Attribute, *Form, Value);
+}
+
+void CompileUnit::addSInt(DIEBlock *Die, Optional<dwarf::Form> Form,
+                          int64_t Integer) {
+  addSInt(Die, (dwarf::Attribute)0, Form, Integer);
 }
 
 /// addString - Add a string attribute data and value. We always emit a
@@ -131,9 +170,10 @@ void CompileUnit::addSInt(DIE *Die, unsigned Attribute,
 /// more predictable sizes. In the case of split dwarf we emit an index
 /// into another table which gets us the static offset into the string
 /// table.
-void CompileUnit::addString(DIE *Die, unsigned Attribute, StringRef String) {
+void CompileUnit::addString(DIE *Die, dwarf::Attribute Attribute,
+                            StringRef String) {
   DIEValue *Value;
-  unsigned Form;
+  dwarf::Form Form;
   if (!DD->useSplitDwarf()) {
     MCSymbol *Symb = DU->getStringPoolEntry(String);
     if (Asm->needsRelocationsForDwarfStringPool())
@@ -154,7 +194,7 @@ void CompileUnit::addString(DIE *Die, unsigned Attribute, StringRef String) {
 
 /// addLocalString - Add a string attribute data and value. This is guaranteed
 /// to be in the local string pool instead of indirected.
-void CompileUnit::addLocalString(DIE *Die, unsigned Attribute,
+void CompileUnit::addLocalString(DIE *Die, dwarf::Attribute Attribute,
                                  StringRef String) {
   MCSymbol *Symb = DU->getStringPoolEntry(String);
   DIEValue *Value;
@@ -169,25 +209,32 @@ void CompileUnit::addLocalString(DIE *Die, unsigned Attribute,
 
 /// addExpr - Add a Dwarf expression attribute data and value.
 ///
-void CompileUnit::addExpr(DIE *Die, unsigned Attribute, unsigned Form,
-                          const MCExpr *Expr) {
+void CompileUnit::addExpr(DIEBlock *Die, dwarf::Form Form, const MCExpr *Expr) {
   DIEValue *Value = new (DIEValueAllocator) DIEExpr(Expr);
-  Die->addValue(Attribute, Form, Value);
+  Die->addValue((dwarf::Attribute)0, Form, Value);
 }
 
 /// addLabel - Add a Dwarf label attribute data and value.
 ///
-void CompileUnit::addLabel(DIE *Die, unsigned Attribute, unsigned Form,
-                           const MCSymbol *Label) {
+void CompileUnit::addLabel(DIE *Die, dwarf::Attribute Attribute,
+                           dwarf::Form Form, const MCSymbol *Label) {
   DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
   Die->addValue(Attribute, Form, Value);
 }
 
+void CompileUnit::addLabel(DIEBlock *Die, dwarf::Form Form,
+                           const MCSymbol *Label) {
+  addLabel(Die, (dwarf::Attribute)0, Form, Label);
+}
+
 /// addLabelAddress - Add a dwarf label attribute data and value using
 /// DW_FORM_addr or DW_FORM_GNU_addr_index.
 ///
-void CompileUnit::addLabelAddress(DIE *Die, unsigned Attribute,
+void CompileUnit::addLabelAddress(DIE *Die, dwarf::Attribute Attribute,
                                   MCSymbol *Label) {
+  if (Label)
+    DD->addArangeLabel(SymbolCU(this, Label));
+
   if (!DD->useSplitDwarf()) {
     if (Label != NULL) {
       DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
@@ -206,34 +253,60 @@ void CompileUnit::addLabelAddress(DIE *Die, unsigned Attribute,
 /// addOpAddress - Add a dwarf op address data and value using the
 /// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
 ///
-void CompileUnit::addOpAddress(DIE *Die, const MCSymbol *Sym) {
+void CompileUnit::addOpAddress(DIEBlock *Die, const MCSymbol *Sym) {
+  DD->addArangeLabel(SymbolCU(this, Sym));
   if (!DD->useSplitDwarf()) {
-    addUInt(Die, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
-    addLabel(Die, 0, dwarf::DW_FORM_udata, Sym);
+    addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+    addLabel(Die, dwarf::DW_FORM_udata, Sym);
   } else {
-    addUInt(Die, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
-    addUInt(Die, 0, dwarf::DW_FORM_GNU_addr_index, DU->getAddrPoolIndex(Sym));
+    addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
+    addUInt(Die, dwarf::DW_FORM_GNU_addr_index, DU->getAddrPoolIndex(Sym));
   }
 }
 
 /// addDelta - Add a label delta attribute data and value.
 ///
-void CompileUnit::addDelta(DIE *Die, unsigned Attribute, unsigned Form,
-                           const MCSymbol *Hi, const MCSymbol *Lo) {
+void CompileUnit::addDelta(DIE *Die, dwarf::Attribute Attribute,
+                           dwarf::Form Form, const MCSymbol *Hi,
+                           const MCSymbol *Lo) {
   DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
   Die->addValue(Attribute, Form, Value);
 }
 
 /// addDIEEntry - Add a DIE attribute data and value.
 ///
-void CompileUnit::addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form,
+void CompileUnit::addDIEEntry(DIE *Die, dwarf::Attribute Attribute,
                               DIE *Entry) {
-  Die->addValue(Attribute, Form, createDIEEntry(Entry));
+  addDIEEntry(Die, Attribute, createDIEEntry(Entry));
+}
+
+void CompileUnit::addDIEEntry(DIE *Die, dwarf::Attribute Attribute,
+                              DIEEntry *Entry) {
+  const DIE *DieCU = Die->getCompileUnitOrNull();
+  const DIE *EntryCU = Entry->getEntry()->getCompileUnitOrNull();
+  if (!DieCU)
+    // We assume that Die belongs to this CU, if it is not linked to any CU yet.
+    DieCU = getCUDie();
+  if (!EntryCU)
+    EntryCU = getCUDie();
+  Die->addValue(Attribute, EntryCU == DieCU ? dwarf::DW_FORM_ref4
+                                            : dwarf::DW_FORM_ref_addr,
+                Entry);
+}
+
+/// Create a DIE with the given Tag, add the DIE to its parent, and
+/// call insertDIE if MD is not null.
+DIE *CompileUnit::createAndAddDIE(unsigned Tag, DIE &Parent, DIDescriptor N) {
+  DIE *Die = new DIE(Tag);
+  Parent.addChild(Die);
+  if (N)
+    insertDIE(N, Die);
+  return Die;
 }
 
 /// addBlock - Add block data.
 ///
-void CompileUnit::addBlock(DIE *Die, unsigned Attribute, unsigned Form,
+void CompileUnit::addBlock(DIE *Die, dwarf::Attribute Attribute,
                            DIEBlock *Block) {
   Block->ComputeSize(Asm);
   DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
@@ -250,12 +323,12 @@ void CompileUnit::addSourceLine(DIE *Die, DIVariable V) {
   unsigned Line = V.getLineNumber();
   if (Line == 0)
     return;
-  unsigned FileID = DD->getOrCreateSourceID(V.getContext().getFilename(),
-                                            V.getContext().getDirectory(),
-                                            getUniqueID());
+  unsigned FileID =
+      DD->getOrCreateSourceID(V.getContext().getFilename(),
+                              V.getContext().getDirectory(), getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addSourceLine - Add location information to specified debug information
@@ -268,11 +341,11 @@ void CompileUnit::addSourceLine(DIE *Die, DIGlobalVariable G) {
   unsigned Line = G.getLineNumber();
   if (Line == 0)
     return;
-  unsigned FileID = DD->getOrCreateSourceID(G.getFilename(), G.getDirectory(),
-                                            getUniqueID());
+  unsigned FileID =
+      DD->getOrCreateSourceID(G.getFilename(), G.getDirectory(), getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addSourceLine - Add location information to specified debug information
@@ -287,11 +360,11 @@ void CompileUnit::addSourceLine(DIE *Die, DISubprogram SP) {
   if (Line == 0)
     return;
 
-  unsigned FileID = DD->getOrCreateSourceID(SP.getFilename(),
-                                            SP.getDirectory(), getUniqueID());
+  unsigned FileID = DD->getOrCreateSourceID(SP.getFilename(), SP.getDirectory(),
+                                            getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addSourceLine - Add location information to specified debug information
@@ -304,11 +377,11 @@ void CompileUnit::addSourceLine(DIE *Die, DIType Ty) {
   unsigned Line = Ty.getLineNumber();
   if (Line == 0)
     return;
-  unsigned FileID = DD->getOrCreateSourceID(Ty.getFilename(),
-                                            Ty.getDirectory(), getUniqueID());
+  unsigned FileID = DD->getOrCreateSourceID(Ty.getFilename(), Ty.getDirectory(),
+                                            getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addSourceLine - Add location information to specified debug information
@@ -325,8 +398,8 @@ void CompileUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) {
   unsigned FileID = DD->getOrCreateSourceID(File.getFilename(),
                                             File.getDirectory(), getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addSourceLine - Add location information to specified debug information
@@ -341,11 +414,11 @@ void CompileUnit::addSourceLine(DIE *Die, DINameSpace NS) {
     return;
   StringRef FN = NS.getFilename();
 
-  unsigned FileID = DD->getOrCreateSourceID(FN, NS.getDirectory(),
-                                            getUniqueID());
+  unsigned FileID =
+      DD->getOrCreateSourceID(FN, NS.getDirectory(), getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addVariableAddress - Add DW_AT_location attribute for a
@@ -362,38 +435,38 @@ void CompileUnit::addVariableAddress(const DbgVariable &DV, DIE *Die,
 }
 
 /// addRegisterOp - Add register operand.
-void CompileUnit::addRegisterOp(DIE *TheDie, unsigned Reg) {
+void CompileUnit::addRegisterOp(DIEBlock *TheDie, unsigned Reg) {
   const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
   unsigned DWReg = RI->getDwarfRegNum(Reg, false);
   if (DWReg < 32)
-    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
+    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
   else {
-    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
-    addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg);
+    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
+    addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
   }
 }
 
 /// addRegisterOffset - Add register offset.
-void CompileUnit::addRegisterOffset(DIE *TheDie, unsigned Reg,
+void CompileUnit::addRegisterOffset(DIEBlock *TheDie, unsigned Reg,
                                     int64_t Offset) {
   const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
   unsigned DWReg = RI->getDwarfRegNum(Reg, false);
   const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
   if (Reg == TRI->getFrameRegister(*Asm->MF))
     // If variable offset is based in frame register then use fbreg.
-    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
+    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
   else if (DWReg < 32)
-    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg);
+    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg);
   else {
-    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
-    addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg);
+    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
+    addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
   }
-  addSInt(TheDie, 0, dwarf::DW_FORM_sdata, Offset);
+  addSInt(TheDie, dwarf::DW_FORM_sdata, Offset);
 }
 
 /// addAddress - Add an address attribute to a die based on the location
 /// provided.
-void CompileUnit::addAddress(DIE *Die, unsigned Attribute,
+void CompileUnit::addAddress(DIE *Die, dwarf::Attribute Attribute,
                              const MachineLocation &Location, bool Indirect) {
   DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
 
@@ -402,12 +475,12 @@ void CompileUnit::addAddress(DIE *Die, unsigned Attribute,
   else {
     addRegisterOffset(Block, Location.getReg(), Location.getOffset());
     if (Indirect && !Location.isReg()) {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+      addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
     }
   }
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, 0, Block);
+  addBlock(Die, Attribute, Block);
 }
 
 /// addComplexAddress - Start with the address based on the location provided,
@@ -416,7 +489,7 @@ void CompileUnit::addAddress(DIE *Die, unsigned Attribute,
 /// the starting location.  Add the DWARF information to the die.
 ///
 void CompileUnit::addComplexAddress(const DbgVariable &DV, DIE *Die,
-                                    unsigned Attribute,
+                                    dwarf::Attribute Attribute,
                                     const MachineLocation &Location) {
   DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
   unsigned N = DV.getNumAddrElements();
@@ -429,23 +502,23 @@ void CompileUnit::addComplexAddress(const DbgVariable &DV, DIE *Die,
       i = 2;
     } else
       addRegisterOp(Block, Location.getReg());
-  }
-  else
+  } else
     addRegisterOffset(Block, Location.getReg(), Location.getOffset());
 
-  for (;i < N; ++i) {
+  for (; i < N; ++i) {
     uint64_t Element = DV.getAddrElement(i);
     if (Element == DIBuilder::OpPlus) {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-      addUInt(Block, 0, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
+      addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+      addUInt(Block, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
     } else if (Element == DIBuilder::OpDeref) {
       if (!Location.isReg())
-        addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    } else llvm_unreachable("unknown DIBuilder Opcode");
+        addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    } else
+      llvm_unreachable("unknown DIBuilder Opcode");
   }
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, 0, Block);
+  addBlock(Die, Attribute, Block);
 }
 
 /* Byref variables, in Blocks, are declared by the programmer as "SomeType
@@ -509,44 +582,41 @@ void CompileUnit::addComplexAddress(const DbgVariable &DV, DIE *Die,
 /// more information, read large comment just above here.
 ///
 void CompileUnit::addBlockByrefAddress(const DbgVariable &DV, DIE *Die,
-                                       unsigned Attribute,
+                                       dwarf::Attribute Attribute,
                                        const MachineLocation &Location) {
   DIType Ty = DV.getType();
   DIType TmpTy = Ty;
-  unsigned Tag = Ty.getTag();
+  uint16_t Tag = Ty.getTag();
   bool isPointer = false;
 
   StringRef varName = DV.getName();
 
   if (Tag == dwarf::DW_TAG_pointer_type) {
-    DIDerivedType DTy = DIDerivedType(Ty);
-    TmpTy = DTy.getTypeDerivedFrom();
+    DIDerivedType DTy(Ty);
+    TmpTy = resolve(DTy.getTypeDerivedFrom());
     isPointer = true;
   }
 
-  DICompositeType blockStruct = DICompositeType(TmpTy);
+  DICompositeType blockStruct(TmpTy);
 
   // Find the __forwarding field and the variable field in the __Block_byref
   // struct.
   DIArray Fields = blockStruct.getTypeArray();
-  DIDescriptor varField = DIDescriptor();
-  DIDescriptor forwardingField = DIDescriptor();
+  DIDerivedType varField;
+  DIDerivedType forwardingField;
 
   for (unsigned i = 0, N = Fields.getNumElements(); i < N; ++i) {
-    DIDescriptor Element = Fields.getElement(i);
-    DIDerivedType DT = DIDerivedType(Element);
+    DIDerivedType DT(Fields.getElement(i));
     StringRef fieldName = DT.getName();
     if (fieldName == "__forwarding")
-      forwardingField = Element;
+      forwardingField = DT;
     else if (fieldName == varName)
-      varField = Element;
+      varField = DT;
   }
 
   // Get the offsets for the forwarding field and the variable field.
-  unsigned forwardingFieldOffset =
-    DIDerivedType(forwardingField).getOffsetInBits() >> 3;
-  unsigned varFieldOffset =
-    DIDerivedType(varField).getOffsetInBits() >> 3;
+  unsigned forwardingFieldOffset = forwardingField.getOffsetInBits() >> 3;
+  unsigned varFieldOffset = varField.getOffsetInBits() >> 2;
 
   // Decode the original location, and use that as the start of the byref
   // variable's location.
@@ -560,45 +630,91 @@ void CompileUnit::addBlockByrefAddress(const DbgVariable &DV, DIE *Die,
   // If we started with a pointer to the __Block_byref... struct, then
   // the first thing we need to do is dereference the pointer (DW_OP_deref).
   if (isPointer)
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
 
   // Next add the offset for the '__forwarding' field:
   // DW_OP_plus_uconst ForwardingFieldOffset.  Note there's no point in
   // adding the offset if it's 0.
   if (forwardingFieldOffset > 0) {
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, forwardingFieldOffset);
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+    addUInt(Block, dwarf::DW_FORM_udata, forwardingFieldOffset);
   }
 
   // Now dereference the __forwarding field to get to the real __Block_byref
   // struct:  DW_OP_deref.
-  addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+  addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
 
   // Now that we've got the real __Block_byref... struct, add the offset
   // for the variable's field to get to the location of the actual variable:
   // DW_OP_plus_uconst varFieldOffset.  Again, don't add if it's 0.
   if (varFieldOffset > 0) {
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, varFieldOffset);
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+    addUInt(Block, dwarf::DW_FORM_udata, varFieldOffset);
   }
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, 0, Block);
+  addBlock(Die, Attribute, Block);
 }
 
 /// isTypeSigned - Return true if the type is signed.
-static bool isTypeSigned(DIType Ty, int *SizeInBits) {
+static bool isTypeSigned(DwarfDebug *DD, DIType Ty, int *SizeInBits) {
   if (Ty.isDerivedType())
-    return isTypeSigned(DIDerivedType(Ty).getTypeDerivedFrom(), SizeInBits);
+    return isTypeSigned(DD, DD->resolve(DIDerivedType(Ty).getTypeDerivedFrom()),
+                        SizeInBits);
   if (Ty.isBasicType())
-    if (DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed
-        || DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed_char) {
+    if (DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed ||
+        DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed_char) {
       *SizeInBits = Ty.getSizeInBits();
       return true;
     }
   return false;
 }
 
+/// Return true if type encoding is unsigned.
+static bool isUnsignedDIType(DwarfDebug *DD, DIType Ty) {
+  DIDerivedType DTy(Ty);
+  if (DTy.isDerivedType())
+    return isUnsignedDIType(DD, DD->resolve(DTy.getTypeDerivedFrom()));
+
+  DIBasicType BTy(Ty);
+  if (BTy.isBasicType()) {
+    unsigned Encoding = BTy.getEncoding();
+    if (Encoding == dwarf::DW_ATE_unsigned ||
+        Encoding == dwarf::DW_ATE_unsigned_char ||
+        Encoding == dwarf::DW_ATE_boolean)
+      return true;
+  }
+  return false;
+}
+
+/// If this type is derived from a base type then return base type size.
+static uint64_t getBaseTypeSize(DwarfDebug *DD, DIDerivedType Ty) {
+  unsigned Tag = Ty.getTag();
+
+  if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
+      Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
+      Tag != dwarf::DW_TAG_restrict_type)
+    return Ty.getSizeInBits();
+
+  DIType BaseType = DD->resolve(Ty.getTypeDerivedFrom());
+
+  // If this type is not derived from any type then take conservative approach.
+  if (!BaseType.isValid())
+    return Ty.getSizeInBits();
+
+  // If this is a derived type, go ahead and get the base type, unless it's a
+  // reference then it's just the size of the field. Pointer types have no need
+  // of this since they're a different type of qualification on the type.
+  if (BaseType.getTag() == dwarf::DW_TAG_reference_type ||
+      BaseType.getTag() == dwarf::DW_TAG_rvalue_reference_type)
+    return Ty.getSizeInBits();
+
+  if (BaseType.isDerivedType())
+    return getBaseTypeSize(DD, DIDerivedType(BaseType));
+
+  return BaseType.getSizeInBits();
+}
+
 /// addConstantValue - Add constant value entry in variable DIE.
 void CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
                                    DIType Ty) {
@@ -606,32 +722,47 @@ void CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
   // their maximum bit width which is a bit unfortunate (& doesn't prefer
   // udata/sdata over dataN as suggested by the DWARF spec)
   assert(MO.isImm() && "Invalid machine operand!");
-  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
   int SizeInBits = -1;
-  bool SignedConstant = isTypeSigned(Ty, &SizeInBits);
-  unsigned Form = SignedConstant ? dwarf::DW_FORM_sdata : dwarf::DW_FORM_udata;
-  switch (SizeInBits) {
-    case 8:  Form = dwarf::DW_FORM_data1; break;
-    case 16: Form = dwarf::DW_FORM_data2; break;
-    case 32: Form = dwarf::DW_FORM_data4; break;
-    case 64: Form = dwarf::DW_FORM_data8; break;
-    default: break;
+  bool SignedConstant = isTypeSigned(DD, Ty, &SizeInBits);
+  dwarf::Form Form;
+
+  // If we're a signed constant definitely use sdata.
+  if (SignedConstant) {
+    addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, MO.getImm());
+    return;
   }
-  SignedConstant ? addSInt(Block, 0, Form, MO.getImm())
-    : addUInt(Block, 0, Form, MO.getImm());
 
-  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+  // Else use data for now unless it's larger than we can deal with.
+  switch (SizeInBits) {
+  case 8:
+    Form = dwarf::DW_FORM_data1;
+    break;
+  case 16:
+    Form = dwarf::DW_FORM_data2;
+    break;
+  case 32:
+    Form = dwarf::DW_FORM_data4;
+    break;
+  case 64:
+    Form = dwarf::DW_FORM_data8;
+    break;
+  default:
+    Form = dwarf::DW_FORM_udata;
+    addUInt(Die, dwarf::DW_AT_const_value, Form, MO.getImm());
+    return;
+  }
+  addUInt(Die, dwarf::DW_AT_const_value, Form, MO.getImm());
 }
 
 /// addConstantFPValue - Add constant value entry in variable DIE.
 void CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
-  assert (MO.isFPImm() && "Invalid machine operand!");
+  assert(MO.isFPImm() && "Invalid machine operand!");
   DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
   APFloat FPImm = MO.getFPImm()->getValueAPF();
 
   // Get the raw data form of the floating point.
   const APInt FltVal = FPImm.bitcastToAPInt();
-  const char *FltPtr = (const char*)FltVal.getRawData();
+  const char *FltPtr = (const char *)FltVal.getRawData();
 
   int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte.
   bool LittleEndian = Asm->getDataLayout().isLittleEndian();
@@ -641,15 +772,15 @@ void CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
 
   // Output the constant to DWARF one byte at a time.
   for (; Start != Stop; Start += Incr)
-    addUInt(Block, 0, dwarf::DW_FORM_data1,
-            (unsigned char)0xFF & FltPtr[Start]);
+    addUInt(Block, dwarf::DW_FORM_data1, (unsigned char)0xFF & FltPtr[Start]);
 
-  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+  addBlock(Die, dwarf::DW_AT_const_value, Block);
 }
 
 /// addConstantFPValue - Add constant value entry in variable DIE.
 void CompileUnit::addConstantFPValue(DIE *Die, const ConstantFP *CFP) {
-  addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), false);
+  // Pass this down to addConstantValue as an unsigned bag of bits.
+  addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), true);
 }
 
 /// addConstantValue - Add constant value entry in variable DIE.
@@ -662,19 +793,34 @@ void CompileUnit::addConstantValue(DIE *Die, const ConstantInt *CI,
 void CompileUnit::addConstantValue(DIE *Die, const APInt &Val, bool Unsigned) {
   unsigned CIBitWidth = Val.getBitWidth();
   if (CIBitWidth <= 64) {
-    unsigned form = 0;
+    // If we're a signed constant definitely use sdata.
+    if (!Unsigned) {
+      addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata,
+              Val.getSExtValue());
+      return;
+    }
+
+    // Else use data for now unless it's larger than we can deal with.
+    dwarf::Form Form;
     switch (CIBitWidth) {
-    case 8: form = dwarf::DW_FORM_data1; break;
-    case 16: form = dwarf::DW_FORM_data2; break;
-    case 32: form = dwarf::DW_FORM_data4; break;
-    case 64: form = dwarf::DW_FORM_data8; break;
+    case 8:
+      Form = dwarf::DW_FORM_data1;
+      break;
+    case 16:
+      Form = dwarf::DW_FORM_data2;
+      break;
+    case 32:
+      Form = dwarf::DW_FORM_data4;
+      break;
+    case 64:
+      Form = dwarf::DW_FORM_data8;
+      break;
     default:
-      form = Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata;
+      addUInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata,
+              Val.getZExtValue());
+      return;
     }
-    if (Unsigned)
-      addUInt(Die, dwarf::DW_AT_const_value, form, Val.getZExtValue());
-    else
-      addSInt(Die, dwarf::DW_AT_const_value, form, Val.getSExtValue());
+    addUInt(Die, dwarf::DW_AT_const_value, Form, Val.getZExtValue());
     return;
   }
 
@@ -693,10 +839,10 @@ void CompileUnit::addConstantValue(DIE *Die, const APInt &Val, bool Unsigned) {
       c = Ptr64[i / 8] >> (8 * (i & 7));
     else
       c = Ptr64[(NumBytes - 1 - i) / 8] >> (8 * ((NumBytes - 1 - i) & 7));
-    addUInt(Block, 0, dwarf::DW_FORM_data1, c);
+    addUInt(Block, dwarf::DW_FORM_data1, c);
   }
 
-  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+  addBlock(Die, dwarf::DW_AT_const_value, Block);
 }
 
 /// addTemplateParams - Add template parameters into buffer.
@@ -705,47 +851,48 @@ void CompileUnit::addTemplateParams(DIE &Buffer, DIArray TParams) {
   for (unsigned i = 0, e = TParams.getNumElements(); i != e; ++i) {
     DIDescriptor Element = TParams.getElement(i);
     if (Element.isTemplateTypeParameter())
-      Buffer.addChild(getOrCreateTemplateTypeParameterDIE(
-                        DITemplateTypeParameter(Element)));
+      constructTemplateTypeParameterDIE(Buffer,
+                                        DITemplateTypeParameter(Element));
     else if (Element.isTemplateValueParameter())
-      Buffer.addChild(getOrCreateTemplateValueParameterDIE(
-                        DITemplateValueParameter(Element)));
+      constructTemplateValueParameterDIE(Buffer,
+                                         DITemplateValueParameter(Element));
   }
 }
 
 /// getOrCreateContextDIE - Get context owner's DIE.
-DIE *CompileUnit::getOrCreateContextDIE(DIDescriptor Context) {
+DIE *CompileUnit::getOrCreateContextDIE(DIScope Context) {
+  if (!Context || Context.isFile())
+    return getCUDie();
   if (Context.isType())
     return getOrCreateTypeDIE(DIType(Context));
-  else if (Context.isNameSpace())
+  if (Context.isNameSpace())
     return getOrCreateNameSpace(DINameSpace(Context));
-  else if (Context.isSubprogram())
+  if (Context.isSubprogram())
     return getOrCreateSubprogramDIE(DISubprogram(Context));
-  else
-    return getDIE(Context);
-}
-
-/// addToContextOwner - Add Die into the list of its context owner's children.
-void CompileUnit::addToContextOwner(DIE *Die, DIDescriptor Context) {
-  if (DIE *ContextDIE = getOrCreateContextDIE(Context))
-    ContextDIE->addChild(Die);
-  else
-    addDie(Die);
+  return getDIE(Context);
 }
 
 /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
 /// given DIType.
 DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
-  DIType Ty(TyNode);
-  if (!Ty.isType())
+  if (!TyNode)
     return NULL;
+
+  DIType Ty(TyNode);
+  assert(Ty.isType());
+
+  // Construct the context before querying for the existence of the DIE in case
+  // such construction creates the DIE.
+  DIE *ContextDIE = getOrCreateContextDIE(resolve(Ty.getContext()));
+  assert(ContextDIE);
+
   DIE *TyDIE = getDIE(Ty);
   if (TyDIE)
     return TyDIE;
 
   // Create new type.
-  TyDIE = new DIE(dwarf::DW_TAG_base_type);
-  insertDIE(Ty, TyDIE);
+  TyDIE = createAndAddDIE(Ty.getTag(), *ContextDIE, Ty);
+
   if (Ty.isBasicType())
     constructTypeDIE(*TyDIE, DIBasicType(Ty));
   else if (Ty.isCompositeType())
@@ -762,28 +909,24 @@ DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
       DICompositeType CT(Ty);
       // A runtime language of 0 actually means C/C++ and that any
       // non-negative value is some version of Objective-C/C++.
-      IsImplementation = (CT.getRunTimeLang() == 0) ||
-        CT.isObjcClassComplete();
+      IsImplementation = (CT.getRunTimeLang() == 0) || CT.isObjcClassComplete();
     }
-    unsigned Flags = IsImplementation ?
-                     DwarfAccelTable::eTypeFlagClassIsImplementation : 0;
+    unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
     addAccelType(Ty.getName(), std::make_pair(TyDIE, Flags));
   }
 
-  addToContextOwner(TyDIE, Ty.getContext());
   return TyDIE;
 }
 
 /// addType - Add a new type attribute to the specified entity.
-void CompileUnit::addType(DIE *Entity, DIType Ty, unsigned Attribute) {
-  if (!Ty.isType())
-    return;
+void CompileUnit::addType(DIE *Entity, DIType Ty, dwarf::Attribute Attribute) {
+  assert(Ty && "Trying to add a type that doesn't exist?");
 
   // Check for pre-existence.
   DIEEntry *Entry = getDIEEntry(Ty);
   // If it exists then use the existing value.
   if (Entry) {
-    Entity->addValue(Attribute, dwarf::DW_FORM_ref4, Entry);
+    addDIEEntry(Entity, Attribute, Entry);
     return;
   }
 
@@ -793,28 +936,105 @@ void CompileUnit::addType(DIE *Entity, DIType Ty, unsigned Attribute) {
   // Set up proxy.
   Entry = createDIEEntry(Buffer);
   insertDIEEntry(Ty, Entry);
-  Entity->addValue(Attribute, dwarf::DW_FORM_ref4, Entry);
+  addDIEEntry(Entity, Attribute, Entry);
 
   // If this is a complete composite type then include it in the
   // list of global types.
   addGlobalType(Ty);
 }
 
+// Accelerator table mutators - add each name along with its companion
+// DIE to the proper table while ensuring that the name that we're going
+// to reference is in the string table. We do this since the names we
+// add may not only be identical to the names in the DIE.
+void CompileUnit::addAccelName(StringRef Name, DIE *Die) {
+  DU->getStringPoolEntry(Name);
+  std::vector<DIE *> &DIEs = AccelNames[Name];
+  DIEs.push_back(Die);
+}
+
+void CompileUnit::addAccelObjC(StringRef Name, DIE *Die) {
+  DU->getStringPoolEntry(Name);
+  std::vector<DIE *> &DIEs = AccelObjC[Name];
+  DIEs.push_back(Die);
+}
+
+void CompileUnit::addAccelNamespace(StringRef Name, DIE *Die) {
+  DU->getStringPoolEntry(Name);
+  std::vector<DIE *> &DIEs = AccelNamespace[Name];
+  DIEs.push_back(Die);
+}
+
+void CompileUnit::addAccelType(StringRef Name, std::pair<DIE *, unsigned> Die) {
+  DU->getStringPoolEntry(Name);
+  std::vector<std::pair<DIE *, unsigned> > &DIEs = AccelTypes[Name];
+  DIEs.push_back(Die);
+}
+
+/// addGlobalName - Add a new global name to the compile unit.
+void CompileUnit::addGlobalName(StringRef Name, DIE *Die, DIScope Context) {
+  std::string FullName = getParentContextString(Context) + Name.str();
+  GlobalNames[FullName] = Die;
+}
+
 /// addGlobalType - Add a new global type to the compile unit.
 ///
 void CompileUnit::addGlobalType(DIType Ty) {
-  DIDescriptor Context = Ty.getContext();
-  if (Ty.isCompositeType() && !Ty.getName().empty() && !Ty.isForwardDecl()
-      && (!Context || Context.isCompileUnit() || Context.isFile()
-          || Context.isNameSpace()))
-    if (DIEEntry *Entry = getDIEEntry(Ty))
-      GlobalTypes[Ty.getName()] = Entry->getEntry();
+  DIScope Context = resolve(Ty.getContext());
+  if (!Ty.getName().empty() && !Ty.isForwardDecl() &&
+      (!Context || Context.isCompileUnit() || Context.isFile() ||
+       Context.isNameSpace()))
+    if (DIEEntry *Entry = getDIEEntry(Ty)) {
+      std::string FullName =
+          getParentContextString(Context) + Ty.getName().str();
+      GlobalTypes[FullName] = Entry->getEntry();
+    }
+}
+
+/// getParentContextString - Walks the metadata parent chain in a language
+/// specific manner (using the compile unit language) and returns
+/// it as a string. This is done at the metadata level because DIEs may
+/// not currently have been added to the parent context and walking the
+/// DIEs looking for names is more expensive than walking the metadata.
+std::string CompileUnit::getParentContextString(DIScope Context) const {
+  if (!Context)
+    return "";
+
+  // FIXME: Decide whether to implement this for non-C++ languages.
+  if (getLanguage() != dwarf::DW_LANG_C_plus_plus)
+    return "";
+
+  std::string CS;
+  SmallVector<DIScope, 1> Parents;
+  while (!Context.isCompileUnit()) {
+    Parents.push_back(Context);
+    if (Context.getContext())
+      Context = resolve(Context.getContext());
+    else
+      // Structure, etc types will have a NULL context if they're at the top
+      // level.
+      break;
+  }
+
+  // Reverse iterate over our list to go from the outermost construct to the
+  // innermost.
+  for (SmallVectorImpl<DIScope>::reverse_iterator I = Parents.rbegin(),
+                                                  E = Parents.rend();
+       I != E; ++I) {
+    DIScope Ctx = *I;
+    StringRef Name = Ctx.getName();
+    if (!Name.empty()) {
+      CS += Name;
+      CS += "::";
+    }
+  }
+  return CS;
 }
 
-/// addPubTypes - Add type for pubtypes section.
+/// addPubTypes - Add subprogram argument types for pubtypes section.
 void CompileUnit::addPubTypes(DISubprogram SP) {
   DICompositeType SPTy = SP.getType();
-  unsigned SPTag = SPTy.getTag();
+  uint16_t SPTag = SPTy.getTag();
   if (SPTag != dwarf::DW_TAG_subroutine_type)
     return;
 
@@ -835,18 +1055,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIBasicType BTy) {
   if (!Name.empty())
     addString(&Buffer, dwarf::DW_AT_name, Name);
 
-  if (BTy.getTag() == dwarf::DW_TAG_unspecified_type) {
-    Buffer.setTag(dwarf::DW_TAG_unspecified_type);
-    // An unspecified type only has a name attribute.
+  // An unspecified type only has a name attribute.
+  if (BTy.getTag() == dwarf::DW_TAG_unspecified_type)
     return;
-  }
 
-  Buffer.setTag(dwarf::DW_TAG_base_type);
   addUInt(&Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
           BTy.getEncoding());
 
   uint64_t Size = BTy.getSizeInBits() >> 3;
-  addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+  addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
 }
 
 /// constructTypeDIE - Construct derived type die from DIDerivedType.
@@ -854,16 +1071,12 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
   // Get core information.
   StringRef Name = DTy.getName();
   uint64_t Size = DTy.getSizeInBits() >> 3;
-  unsigned Tag = DTy.getTag();
-
-  // FIXME - Workaround for templates.
-  if (Tag == dwarf::DW_TAG_inheritance) Tag = dwarf::DW_TAG_reference_type;
-
-  Buffer.setTag(Tag);
+  uint16_t Tag = Buffer.getTag();
 
   // Map to main type, void will not have a type.
-  DIType FromTy = DTy.getTypeDerivedFrom();
-  addType(&Buffer, FromTy);
+  DIType FromTy = resolve(DTy.getTypeDerivedFrom());
+  if (FromTy)
+    addType(&Buffer, FromTy);
 
   // Add name if not anonymous or intermediate type.
   if (!Name.empty())
@@ -871,11 +1084,11 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
 
   // Add size if non-zero (derived types might be zero-sized.)
   if (Size && Tag != dwarf::DW_TAG_pointer_type)
-    addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+    addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
 
   if (Tag == dwarf::DW_TAG_ptr_to_member_type)
-      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
-                  getOrCreateTypeDIE(DTy.getClassType()));
+    addDIEEntry(&Buffer, dwarf::DW_AT_containing_type,
+                getOrCreateTypeDIE(resolve(DTy.getClassType())));
   // Add source line info if available and TyDesc is not a forward declaration.
   if (!DTy.isForwardDecl())
     addSourceLine(&Buffer, DTy);
@@ -883,20 +1096,20 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
 
 /// Return true if the type is appropriately scoped to be contained inside
 /// its own type unit.
-static bool isTypeUnitScoped(DIType Ty) {
-  DIScope Parent = Ty.getContext();
+static bool isTypeUnitScoped(DIType Ty, const DwarfDebug *DD) {
+  DIScope Parent = DD->resolve(Ty.getContext());
   while (Parent) {
     // Don't generate a hash for anything scoped inside a function.
     if (Parent.isSubprogram())
       return false;
-    Parent = Parent.getContext();
+    Parent = DD->resolve(Parent.getContext());
   }
   return true;
 }
 
 /// Return true if the type should be split out into a type unit.
-static bool shouldCreateTypeUnit(DICompositeType CTy) {
-  unsigned Tag = CTy.getTag();
+static bool shouldCreateTypeUnit(DICompositeType CTy, const DwarfDebug *DD) {
+  uint16_t Tag = CTy.getTag();
 
   switch (Tag) {
   case dwarf::DW_TAG_structure_type:
@@ -904,13 +1117,11 @@ static bool shouldCreateTypeUnit(DICompositeType CTy) {
   case dwarf::DW_TAG_enumeration_type:
   case dwarf::DW_TAG_class_type:
     // If this is a class, structure, union, or enumeration type
-    // that is not a declaration, is a type definition, and not scoped
+    // that is a definition (not a declaration), and not scoped
     // inside a function then separate this out as a type unit.
-    if (CTy.isForwardDecl() || !isTypeUnitScoped(CTy))
-      return 0;
-    return 1;
+    return !CTy.isForwardDecl() && isTypeUnitScoped(CTy, DD);
   default:
-    return 0;
+    return false;
   }
 }
 
@@ -920,69 +1131,47 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
   StringRef Name = CTy.getName();
 
   uint64_t Size = CTy.getSizeInBits() >> 3;
-  unsigned Tag = CTy.getTag();
-  Buffer.setTag(Tag);
+  uint16_t Tag = Buffer.getTag();
 
   switch (Tag) {
   case dwarf::DW_TAG_array_type:
-    constructArrayTypeDIE(Buffer, &CTy);
+    constructArrayTypeDIE(Buffer, CTy);
     break;
-  case dwarf::DW_TAG_enumeration_type: {
-    DIArray Elements = CTy.getTypeArray();
-
-    // Add enumerators to enumeration type.
-    for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
-      DIE *ElemDie = NULL;
-      DIDescriptor Enum(Elements.getElement(i));
-      if (Enum.isEnumerator()) {
-        ElemDie = constructEnumTypeDIE(DIEnumerator(Enum));
-        Buffer.addChild(ElemDie);
-      }
-    }
-    DIType DTy = CTy.getTypeDerivedFrom();
-    if (DTy.isType()) {
-      addType(&Buffer, DTy);
-      addUInt(&Buffer, dwarf::DW_AT_enum_class, dwarf::DW_FORM_flag, 1);
-    }
-  }
+  case dwarf::DW_TAG_enumeration_type:
+    constructEnumTypeDIE(Buffer, CTy);
     break;
   case dwarf::DW_TAG_subroutine_type: {
-    // Add return type.
+    // Add return type. A void return won't have a type.
     DIArray Elements = CTy.getTypeArray();
-    DIDescriptor RTy = Elements.getElement(0);
-    addType(&Buffer, DIType(RTy));
+    DIType RTy(Elements.getElement(0));
+    if (RTy)
+      addType(&Buffer, RTy);
 
     bool isPrototyped = true;
     // Add arguments.
     for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) {
       DIDescriptor Ty = Elements.getElement(i);
       if (Ty.isUnspecifiedParameter()) {
-        DIE *Arg = new DIE(dwarf::DW_TAG_unspecified_parameters);
-        Buffer.addChild(Arg);
+        createAndAddDIE(dwarf::DW_TAG_unspecified_parameters, Buffer);
         isPrototyped = false;
       } else {
-        DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+        DIE *Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, Buffer);
         addType(Arg, DIType(Ty));
         if (DIType(Ty).isArtificial())
           addFlag(Arg, dwarf::DW_AT_artificial);
-        Buffer.addChild(Arg);
       }
     }
     // Add prototype flag if we're dealing with a C language and the
     // function has been prototyped.
+    uint16_t Language = getLanguage();
     if (isPrototyped &&
-        (Language == dwarf::DW_LANG_C89 ||
-         Language == dwarf::DW_LANG_C99 ||
+        (Language == dwarf::DW_LANG_C89 || Language == dwarf::DW_LANG_C99 ||
          Language == dwarf::DW_LANG_ObjC))
       addFlag(&Buffer, dwarf::DW_AT_prototyped);
-  }
-    break;
+  } break;
   case dwarf::DW_TAG_structure_type:
   case dwarf::DW_TAG_union_type:
   case dwarf::DW_TAG_class_type: {
-    if (CTy.isForwardDecl())
-      break;
-
     // Add elements to structure type.
     DIArray Elements = CTy.getTypeArray();
     for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
@@ -990,7 +1179,7 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
       DIE *ElemDie = NULL;
       if (Element.isSubprogram()) {
         DISubprogram SP(Element);
-        ElemDie = getOrCreateSubprogramDIE(DISubprogram(Element));
+        ElemDie = getOrCreateSubprogramDIE(SP);
         if (SP.isProtected())
           addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
                   dwarf::DW_ACCESS_protected);
@@ -999,21 +1188,23 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
                   dwarf::DW_ACCESS_private);
         else
           addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
-            dwarf::DW_ACCESS_public);
+                  dwarf::DW_ACCESS_public);
         if (SP.isExplicit())
           addFlag(ElemDie, dwarf::DW_AT_explicit);
       } else if (Element.isDerivedType()) {
         DIDerivedType DDTy(Element);
         if (DDTy.getTag() == dwarf::DW_TAG_friend) {
-          ElemDie = new DIE(dwarf::DW_TAG_friend);
-          addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend);
-        } else if (DDTy.isStaticMember())
-          ElemDie = createStaticMemberDIE(DDTy);
-        else
-          ElemDie = createMemberDIE(DDTy);
+          ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
+          addType(ElemDie, resolve(DDTy.getTypeDerivedFrom()),
+                  dwarf::DW_AT_friend);
+        } else if (DDTy.isStaticMember()) {
+          getOrCreateStaticMemberDIE(DDTy);
+        } else {
+          constructMemberDIE(Buffer, DDTy);
+        }
       } else if (Element.isObjCProperty()) {
         DIObjCProperty Property(Element);
-        ElemDie = new DIE(Property.getTag());
+        ElemDie = createAndAddDIE(Property.getTag(), Buffer);
         StringRef PropertyName = Property.getObjCPropertyName();
         addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
         addType(ElemDie, Property.getType());
@@ -1038,8 +1229,8 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
         if (Property.isNonAtomicObjCProperty())
           PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_nonatomic;
         if (PropertyAttributes)
-          addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, 0,
-                 PropertyAttributes);
+          addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, None,
+                  PropertyAttributes);
 
         DIEEntry *Entry = getDIEEntry(Element);
         if (!Entry) {
@@ -1048,18 +1239,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
         }
       } else
         continue;
-      Buffer.addChild(ElemDie);
     }
 
     if (CTy.isAppleBlockExtension())
       addFlag(&Buffer, dwarf::DW_AT_APPLE_block);
 
-    DICompositeType ContainingType = CTy.getContainingType();
-    if (DIDescriptor(ContainingType).isCompositeType())
-      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
-                  getOrCreateTypeDIE(DIType(ContainingType)));
-    else
-      addToContextOwner(&Buffer, CTy.getContext());
+    DICompositeType ContainingType(resolve(CTy.getContainingType()));
+    if (ContainingType)
+      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type,
+                  getOrCreateTypeDIE(ContainingType));
 
     if (CTy.isObjcClassComplete())
       addFlag(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type);
@@ -1067,8 +1255,7 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     // Add template parameters to a class, structure or union types.
     // FIXME: The support isn't in the metadata for this yet.
     if (Tag == dwarf::DW_TAG_class_type ||
-        Tag == dwarf::DW_TAG_structure_type ||
-        Tag == dwarf::DW_TAG_union_type)
+        Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
       addTemplateParams(Buffer, CTy.getTemplateParams());
 
     break;
@@ -1082,16 +1269,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     addString(&Buffer, dwarf::DW_AT_name, Name);
 
   if (Tag == dwarf::DW_TAG_enumeration_type ||
-      Tag == dwarf::DW_TAG_class_type ||
-      Tag == dwarf::DW_TAG_structure_type ||
+      Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type ||
       Tag == dwarf::DW_TAG_union_type) {
     // Add size if non-zero (derived types might be zero-sized.)
     // TODO: Do we care about size for enum forward declarations?
     if (Size)
-      addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+      addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
     else if (!CTy.isForwardDecl())
       // Add zero size if it is not a forward declaration.
-      addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0);
+      addUInt(&Buffer, dwarf::DW_AT_byte_size, None, 0);
 
     // If we're a forward decl, say so.
     if (CTy.isForwardDecl())
@@ -1104,131 +1290,126 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     // No harm in adding the runtime language to the declaration.
     unsigned RLang = CTy.getRunTimeLang();
     if (RLang)
-      addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class,
-              dwarf::DW_FORM_data1, RLang);
+      addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class, dwarf::DW_FORM_data1,
+              RLang);
   }
   // If this is a type applicable to a type unit it then add it to the
   // list of types we'll compute a hash for later.
-  if (shouldCreateTypeUnit(CTy))
+  if (shouldCreateTypeUnit(CTy, DD))
     DD->addTypeUnitType(&Buffer);
 }
 
-/// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
-/// for the given DITemplateTypeParameter.
-DIE *
-CompileUnit::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) {
-  DIE *ParamDIE = getDIE(TP);
-  if (ParamDIE)
-    return ParamDIE;
-
-  ParamDIE = new DIE(dwarf::DW_TAG_template_type_parameter);
-  addType(ParamDIE, TP.getType());
+/// constructTemplateTypeParameterDIE - Construct new DIE for the given
+/// DITemplateTypeParameter.
+void
+CompileUnit::constructTemplateTypeParameterDIE(DIE &Buffer,
+                                               DITemplateTypeParameter TP) {
+  DIE *ParamDIE =
+      createAndAddDIE(dwarf::DW_TAG_template_type_parameter, Buffer);
+  // Add the type if it exists, it could be void and therefore no type.
+  if (TP.getType())
+    addType(ParamDIE, resolve(TP.getType()));
   if (!TP.getName().empty())
     addString(ParamDIE, dwarf::DW_AT_name, TP.getName());
-  return ParamDIE;
-}
-
-/// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE
-/// for the given DITemplateValueParameter.
-DIE *
-CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV){
-  DIE *ParamDIE = getDIE(TPV);
-  if (ParamDIE)
-    return ParamDIE;
-
-  ParamDIE = new DIE(TPV.getTag());
-  addType(ParamDIE, TPV.getType());
-  if (!TPV.getName().empty())
-    addString(ParamDIE, dwarf::DW_AT_name, TPV.getName());
-  if (Value *Val = TPV.getValue()) {
+}
+
+/// constructTemplateValueParameterDIE - Construct new DIE for the given
+/// DITemplateValueParameter.
+void
+CompileUnit::constructTemplateValueParameterDIE(DIE &Buffer,
+                                                DITemplateValueParameter VP) {
+  DIE *ParamDIE = createAndAddDIE(VP.getTag(), Buffer);
+
+  // Add the type if there is one, template template and template parameter
+  // packs will not have a type.
+  if (VP.getTag() == dwarf::DW_TAG_template_value_parameter)
+    addType(ParamDIE, resolve(VP.getType()));
+  if (!VP.getName().empty())
+    addString(ParamDIE, dwarf::DW_AT_name, VP.getName());
+  if (Value *Val = VP.getValue()) {
     if (ConstantInt *CI = dyn_cast<ConstantInt>(Val))
-      addConstantValue(ParamDIE, CI, TPV.getType().isUnsignedDIType());
+      addConstantValue(ParamDIE, CI,
+                       isUnsignedDIType(DD, resolve(VP.getType())));
     else if (GlobalValue *GV = dyn_cast<GlobalValue>(Val)) {
       // For declaration non-type template parameters (such as global values and
       // functions)
       DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-      addOpAddress(Block, Asm->Mang->getSymbol(GV));
+      addOpAddress(Block, Asm->getSymbol(GV));
       // Emit DW_OP_stack_value to use the address as the immediate value of the
       // parameter, rather than a pointer to it.
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
-      addBlock(ParamDIE, dwarf::DW_AT_location, 0, Block);
-    } else if (TPV.getTag() == dwarf::DW_TAG_GNU_template_template_param) {
+      addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
+      addBlock(ParamDIE, dwarf::DW_AT_location, Block);
+    } else if (VP.getTag() == dwarf::DW_TAG_GNU_template_template_param) {
       assert(isa<MDString>(Val));
       addString(ParamDIE, dwarf::DW_AT_GNU_template_name,
                 cast<MDString>(Val)->getString());
-    } else if (TPV.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) {
+    } else if (VP.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) {
       assert(isa<MDNode>(Val));
       DIArray A(cast<MDNode>(Val));
       addTemplateParams(*ParamDIE, A);
     }
   }
-
-  return ParamDIE;
 }
 
 /// getOrCreateNameSpace - Create a DIE for DINameSpace.
 DIE *CompileUnit::getOrCreateNameSpace(DINameSpace NS) {
+  // Construct the context before querying for the existence of the DIE in case
+  // such construction creates the DIE.
+  DIE *ContextDIE = getOrCreateContextDIE(NS.getContext());
+
   DIE *NDie = getDIE(NS);
   if (NDie)
     return NDie;
-  NDie = new DIE(dwarf::DW_TAG_namespace);
-  insertDIE(NS, NDie);
+  NDie = createAndAddDIE(dwarf::DW_TAG_namespace, *ContextDIE, NS);
+
   if (!NS.getName().empty()) {
     addString(NDie, dwarf::DW_AT_name, NS.getName());
     addAccelNamespace(NS.getName(), NDie);
+    addGlobalName(NS.getName(), NDie, NS.getContext());
   } else
     addAccelNamespace("(anonymous namespace)", NDie);
   addSourceLine(NDie, NS);
-  addToContextOwner(NDie, NS.getContext());
   return NDie;
 }
 
 /// getOrCreateSubprogramDIE - Create new DIE using SP.
 DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
+  // Construct the context before querying for the existence of the DIE in case
+  // such construction creates the DIE (as is the case for member function
+  // declarations).
+  DIE *ContextDIE = getOrCreateContextDIE(resolve(SP.getContext()));
+
   DIE *SPDie = getDIE(SP);
   if (SPDie)
     return SPDie;
 
-  SPDie = new DIE(dwarf::DW_TAG_subprogram);
+  DISubprogram SPDecl = SP.getFunctionDeclaration();
+  if (SPDecl.isSubprogram())
+    // Add subprogram definitions to the CU die directly.
+    ContextDIE = CUDie.get();
 
   // DW_TAG_inlined_subroutine may refer to this DIE.
-  insertDIE(SP, SPDie);
+  SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP);
 
-  DISubprogram SPDecl = SP.getFunctionDeclaration();
   DIE *DeclDie = NULL;
-  if (SPDecl.isSubprogram()) {
+  if (SPDecl.isSubprogram())
     DeclDie = getOrCreateSubprogramDIE(SPDecl);
-  }
-
-  // Add to context owner.
-  addToContextOwner(SPDie, SP.getContext());
 
   // Add function template parameters.
   addTemplateParams(*SPDie, SP.getTemplateParams());
 
-  // Unfortunately this code needs to stay here instead of below the
-  // AT_specification code in order to work around a bug in older
-  // gdbs that requires the linkage name to resolve multiple template
-  // functions.
-  // TODO: Remove this set of code when we get rid of the old gdb
-  // compatibility.
-  StringRef LinkageName = SP.getLinkageName();
-  if (!LinkageName.empty() && DD->useDarwinGDBCompat())
-    addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
-              GlobalValue::getRealLinkageName(LinkageName));
-
   // If this DIE is going to refer declaration info using AT_specification
   // then there is no need to add other attributes.
   if (DeclDie) {
     // Refer function declaration directly.
-    addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
-                DeclDie);
+    addDIEEntry(SPDie, dwarf::DW_AT_specification, DeclDie);
 
     return SPDie;
   }
 
   // Add the linkage name if we have one.
-  if (!LinkageName.empty() && !DD->useDarwinGDBCompat())
+  StringRef LinkageName = SP.getLinkageName();
+  if (!LinkageName.empty())
     addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
               GlobalValue::getRealLinkageName(LinkageName));
 
@@ -1240,29 +1421,31 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
 
   // Add the prototype if we have a prototype and we have a C like
   // language.
+  uint16_t Language = getLanguage();
   if (SP.isPrototyped() &&
-      (Language == dwarf::DW_LANG_C89 ||
-       Language == dwarf::DW_LANG_C99 ||
+      (Language == dwarf::DW_LANG_C89 || Language == dwarf::DW_LANG_C99 ||
        Language == dwarf::DW_LANG_ObjC))
     addFlag(SPDie, dwarf::DW_AT_prototyped);
 
-  // Add Return Type.
   DICompositeType SPTy = SP.getType();
   assert(SPTy.getTag() == dwarf::DW_TAG_subroutine_type &&
          "the type of a subprogram should be a subroutine");
 
   DIArray Args = SPTy.getTypeArray();
-  addType(SPDie, DIType(Args.getElement(0)));
+  // Add a return type. If this is a type like a C/C++ void type we don't add a
+  // return type.
+  if (Args.getElement(0))
+    addType(SPDie, DIType(Args.getElement(0)));
 
   unsigned VK = SP.getVirtuality();
   if (VK) {
     addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, VK);
     DIEBlock *Block = getDIEBlock();
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, SP.getVirtualIndex());
-    addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block);
-    ContainingTypeMap.insert(std::make_pair(SPDie,
-                                            SP.getContainingType()));
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(Block, dwarf::DW_FORM_udata, SP.getVirtualIndex());
+    addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, Block);
+    ContainingTypeMap.insert(
+        std::make_pair(SPDie, resolve(SP.getContainingType())));
   }
 
   if (!SP.isDefinition()) {
@@ -1270,13 +1453,12 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
 
     // Add arguments. Do not add arguments for subprogram definition. They will
     // be handled while processing variables.
-    for (unsigned i = 1, N =  Args.getNumElements(); i < N; ++i) {
-      DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
-      DIType ATy = DIType(Args.getElement(i));
+    for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
+      DIE *Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, *SPDie);
+      DIType ATy(Args.getElement(i));
       addType(Arg, ATy);
       if (ATy.isArtificial())
         addFlag(Arg, dwarf::DW_AT_artificial);
-      SPDie->addChild(Arg);
     }
   }
 
@@ -1324,16 +1506,16 @@ static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
 }
 
 /// createGlobalVariableDIE - create global variable DIE.
-void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
+void CompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) {
+
   // Check for pre-existence.
-  if (getDIE(N))
+  if (getDIE(GV))
     return;
 
-  DIGlobalVariable GV(N);
   if (!GV.isGlobalVariable())
     return;
 
-  DIDescriptor GVContext = GV.getContext();
+  DIScope GVContext = GV.getContext();
   DIType GTy = GV.getType();
 
   // If this is a static data member definition, some attributes belong
@@ -1344,35 +1526,30 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
   if (SDMDecl.Verify()) {
     assert(SDMDecl.isStaticMember() && "Expected static member decl");
     // We need the declaration DIE that is in the static member's class.
-    // But that class might not exist in the DWARF yet.
-    // Creating the class will create the static member decl DIE.
-    getOrCreateContextDIE(SDMDecl.getContext());
-    VariableDIE = getDIE(SDMDecl);
-    assert(VariableDIE && "Static member decl has no context?");
+    VariableDIE = getOrCreateStaticMemberDIE(SDMDecl);
     IsStaticMember = true;
   }
 
   // If this is not a static data member definition, create the variable
   // DIE and add the initial set of attributes to it.
   if (!VariableDIE) {
-    VariableDIE = new DIE(GV.getTag());
+    // Construct the context before querying for the existence of the DIE in
+    // case such construction creates the DIE.
+    DIE *ContextDIE = getOrCreateContextDIE(GVContext);
+
     // Add to map.
-    insertDIE(N, VariableDIE);
+    VariableDIE = createAndAddDIE(GV.getTag(), *ContextDIE, GV);
 
     // Add name and type.
     addString(VariableDIE, dwarf::DW_AT_name, GV.getDisplayName());
     addType(VariableDIE, GTy);
 
     // Add scoping info.
-    if (!GV.isLocalToUnit()) {
+    if (!GV.isLocalToUnit())
       addFlag(VariableDIE, dwarf::DW_AT_external);
-      addGlobalName(GV.getName(), VariableDIE);
-    }
 
     // Add line number info.
     addSourceLine(VariableDIE, GV);
-    // Add to context owner.
-    addToContextOwner(VariableDIE, GVContext);
   }
 
   // Add location.
@@ -1382,7 +1559,7 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
   if (isGlobalVariable) {
     addToAccelTable = true;
     DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-    const MCSymbol *Sym = Asm->Mang->getSymbol(GV.getGlobal());
+    const MCSymbol *Sym = Asm->getSymbol(GV.getGlobal());
     if (GV.getGlobal()->isThreadLocal()) {
       // FIXME: Make this work with -gsplit-dwarf.
       unsigned PointerSize = Asm->getDataLayout().getPointerSize();
@@ -1393,68 +1570,62 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
       // Based on GCC's support for TLS:
       if (!DD->useSplitDwarf()) {
         // 1) Start with a constNu of the appropriate pointer size
-        addUInt(Block, 0, dwarf::DW_FORM_data1,
+        addUInt(Block, dwarf::DW_FORM_data1,
                 PointerSize == 4 ? dwarf::DW_OP_const4u : dwarf::DW_OP_const8u);
-        // 2) containing the (relocated) address of the TLS variable
-        addExpr(Block, 0, dwarf::DW_FORM_udata, Expr);
+        // 2) containing the (relocated) offset of the TLS variable
+        //    within the module's TLS block.
+        addExpr(Block, dwarf::DW_FORM_udata, Expr);
       } else {
-        addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
-        addUInt(Block, 0, dwarf::DW_FORM_udata, DU->getAddrPoolIndex(Expr));
+        addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+        addUInt(Block, dwarf::DW_FORM_udata, DU->getAddrPoolIndex(Expr));
       }
-      // 3) followed by a custom OP to tell the debugger about TLS (presumably)
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_lo_user);
+      // 3) followed by a custom OP to make the debugger do a TLS lookup.
+      addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_push_tls_address);
     } else
       addOpAddress(Block, Sym);
     // Do not create specification DIE if context is either compile unit
     // or a subprogram.
     if (GVContext && GV.isDefinition() && !GVContext.isCompileUnit() &&
-        !GVContext.isFile() && !isSubprogramContext(GVContext)) {
+        !GVContext.isFile() && !DD->isSubprogramContext(GVContext)) {
       // Create specification DIE.
-      VariableSpecDIE = new DIE(dwarf::DW_TAG_variable);
-      addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
-                  dwarf::DW_FORM_ref4, VariableDIE);
-      addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
+      VariableSpecDIE = createAndAddDIE(dwarf::DW_TAG_variable, *CUDie);
+      addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification, VariableDIE);
+      addBlock(VariableSpecDIE, dwarf::DW_AT_location, Block);
       // A static member's declaration is already flagged as such.
       if (!SDMDecl.Verify())
         addFlag(VariableDIE, dwarf::DW_AT_declaration);
-      addDie(VariableSpecDIE);
     } else {
-      addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+      addBlock(VariableDIE, dwarf::DW_AT_location, Block);
     }
-    // Add linkage name.
+    // Add the linkage name.
     StringRef LinkageName = GV.getLinkageName();
-    if (!LinkageName.empty()) {
+    if (!LinkageName.empty())
       // From DWARF4: DIEs to which DW_AT_linkage_name may apply include:
       // TAG_common_block, TAG_constant, TAG_entry_point, TAG_subprogram and
       // TAG_variable.
-      addString(IsStaticMember && VariableSpecDIE ?
-                VariableSpecDIE : VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
+      addString(IsStaticMember && VariableSpecDIE ? VariableSpecDIE
+                                                  : VariableDIE,
+                dwarf::DW_AT_MIPS_linkage_name,
                 GlobalValue::getRealLinkageName(LinkageName));
-      // In compatibility mode with older gdbs we put the linkage name on both
-      // the TAG_variable DIE and on the TAG_member DIE.
-      if (IsStaticMember && VariableSpecDIE && DD->useDarwinGDBCompat())
-        addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
-                  GlobalValue::getRealLinkageName(LinkageName));
-    }
   } else if (const ConstantInt *CI =
-             dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
+                 dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
     // AT_const_value was added when the static member was created. To avoid
     // emitting AT_const_value multiple times, we only add AT_const_value when
     // it is not a static member.
     if (!IsStaticMember)
-      addConstantValue(VariableDIE, CI, GTy.isUnsignedDIType());
-  } else if (const ConstantExpr *CE = getMergedGlobalExpr(N->getOperand(11))) {
+      addConstantValue(VariableDIE, CI, isUnsignedDIType(DD, GTy));
+  } else if (const ConstantExpr *CE = getMergedGlobalExpr(GV->getOperand(11))) {
     addToAccelTable = true;
     // GV is a merged global.
     DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
     Value *Ptr = CE->getOperand(0);
-    addOpAddress(Block, Asm->Mang->getSymbol(cast<GlobalValue>(Ptr)));
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    SmallVector<Value*, 3> Idx(CE->op_begin()+1, CE->op_end());
-    addUInt(Block, 0, dwarf::DW_FORM_udata,
-                   Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
-    addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+    addOpAddress(Block, Asm->getSymbol(cast<GlobalValue>(Ptr)));
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
+    addUInt(Block, dwarf::DW_FORM_udata,
+            Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+    addBlock(VariableDIE, dwarf::DW_AT_location, Block);
   }
 
   if (addToAccelTable) {
@@ -1466,13 +1637,17 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
     if (GV.getLinkageName() != "" && GV.getName() != GV.getLinkageName())
       addAccelName(GV.getLinkageName(), AddrDIE);
   }
+
+  if (!GV.isLocalToUnit())
+    addGlobalName(GV.getName(), VariableSpecDIE ? VariableSpecDIE : VariableDIE,
+                  GV.getContext());
 }
 
 /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
 void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR,
                                        DIE *IndexTy) {
-  DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
-  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
+  DIE *DW_Subrange = createAndAddDIE(dwarf::DW_TAG_subrange_type, Buffer);
+  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, IndexTy);
 
   // The LowerBound value defines the lower bounds which is typically zero for
   // C/C++. The Count value is the number of elements.  Values are 64 bit. If
@@ -1485,26 +1660,22 @@ void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR,
   int64_t Count = SR.getCount();
 
   if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
-    addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, LowerBound);
+    addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, None, LowerBound);
 
   if (Count != -1 && Count != 0)
     // FIXME: An unbounded array should reference the expression that defines
     // the array.
-    addUInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, LowerBound + Count - 1);
-
-  Buffer.addChild(DW_Subrange);
+    addUInt(DW_Subrange, dwarf::DW_AT_upper_bound, None,
+            LowerBound + Count - 1);
 }
 
 /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
-                                        DICompositeType *CTy) {
-  Buffer.setTag(dwarf::DW_TAG_array_type);
-  if (CTy->isVector())
+void CompileUnit::constructArrayTypeDIE(DIE &Buffer, DICompositeType CTy) {
+  if (CTy.isVector())
     addFlag(&Buffer, dwarf::DW_AT_GNU_vector);
 
-  // Emit derived type.
-  addType(&Buffer, CTy->getTypeDerivedFrom());
-  DIArray Elements = CTy->getTypeArray();
+  // Emit the element type.
+  addType(&Buffer, resolve(CTy.getTypeDerivedFrom()));
 
   // Get an anonymous type for index type.
   // FIXME: This type should be passed down from the front end
@@ -1512,16 +1683,16 @@ void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
   DIE *IdxTy = getIndexTyDie();
   if (!IdxTy) {
     // Construct an anonymous type for index type.
-    IdxTy = new DIE(dwarf::DW_TAG_base_type);
+    IdxTy = createAndAddDIE(dwarf::DW_TAG_base_type, *CUDie.get());
     addString(IdxTy, dwarf::DW_AT_name, "int");
-    addUInt(IdxTy, dwarf::DW_AT_byte_size, 0, sizeof(int32_t));
+    addUInt(IdxTy, dwarf::DW_AT_byte_size, None, sizeof(int32_t));
     addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
             dwarf::DW_ATE_signed);
-    addDie(IdxTy);
     setIndexTyDie(IdxTy);
   }
 
   // Add subranges to array type.
+  DIArray Elements = CTy.getTypeArray();
   for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
     DIDescriptor Element = Elements.getElement(i);
     if (Element.getTag() == dwarf::DW_TAG_subrange_type)
@@ -1529,168 +1700,180 @@ void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
   }
 }
 
-/// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-DIE *CompileUnit::constructEnumTypeDIE(DIEnumerator ETy) {
-  DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator);
-  StringRef Name = ETy.getName();
-  addString(Enumerator, dwarf::DW_AT_name, Name);
-  int64_t Value = ETy.getEnumValue();
-  addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
-  return Enumerator;
+/// constructEnumTypeDIE - Construct an enum type DIE from DICompositeType.
+void CompileUnit::constructEnumTypeDIE(DIE &Buffer, DICompositeType CTy) {
+  DIArray Elements = CTy.getTypeArray();
+
+  // Add enumerators to enumeration type.
+  for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+    DIEnumerator Enum(Elements.getElement(i));
+    if (Enum.isEnumerator()) {
+      DIE *Enumerator = createAndAddDIE(dwarf::DW_TAG_enumerator, Buffer);
+      StringRef Name = Enum.getName();
+      addString(Enumerator, dwarf::DW_AT_name, Name);
+      int64_t Value = Enum.getEnumValue();
+      addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
+    }
+  }
+  DIType DTy = resolve(CTy.getTypeDerivedFrom());
+  if (DTy) {
+    addType(&Buffer, DTy);
+    addFlag(&Buffer, dwarf::DW_AT_enum_class);
+  }
 }
 
 /// constructContainingTypeDIEs - Construct DIEs for types that contain
 /// vtables.
 void CompileUnit::constructContainingTypeDIEs() {
   for (DenseMap<DIE *, const MDNode *>::iterator CI = ContainingTypeMap.begin(),
-         CE = ContainingTypeMap.end(); CI != CE; ++CI) {
+                                                 CE = ContainingTypeMap.end();
+       CI != CE; ++CI) {
     DIE *SPDie = CI->first;
-    const MDNode *N = CI->second;
-    if (!N) continue;
-    DIE *NDie = getDIE(N);
-    if (!NDie) continue;
-    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+    DIDescriptor D(CI->second);
+    if (!D)
+      continue;
+    DIE *NDie = getDIE(D);
+    if (!NDie)
+      continue;
+    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, NDie);
   }
 }
 
 /// constructVariableDIE - Construct a DIE for the given DbgVariable.
-DIE *CompileUnit::constructVariableDIE(DbgVariable *DV,
-                                       bool isScopeAbstract) {
-  StringRef Name = DV->getName();
-
-  // Translate tag to proper Dwarf tag.
-  unsigned Tag = DV->getTag();
+DIE *CompileUnit::constructVariableDIE(DbgVariable &DV, bool isScopeAbstract) {
+  StringRef Name = DV.getName();
 
   // Define variable debug information entry.
-  DIE *VariableDie = new DIE(Tag);
-  DbgVariable *AbsVar = DV->getAbstractVariable();
+  DIE *VariableDie = new DIE(DV.getTag());
+  DbgVariable *AbsVar = DV.getAbstractVariable();
   DIE *AbsDIE = AbsVar ? AbsVar->getDIE() : NULL;
   if (AbsDIE)
-    addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin,
-                            dwarf::DW_FORM_ref4, AbsDIE);
+    addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin, AbsDIE);
   else {
-    addString(VariableDie, dwarf::DW_AT_name, Name);
-    addSourceLine(VariableDie, DV->getVariable());
-    addType(VariableDie, DV->getType());
+    if (!Name.empty())
+      addString(VariableDie, dwarf::DW_AT_name, Name);
+    addSourceLine(VariableDie, DV.getVariable());
+    addType(VariableDie, DV.getType());
   }
 
-  if (DV->isArtificial())
+  if (DV.isArtificial())
     addFlag(VariableDie, dwarf::DW_AT_artificial);
 
   if (isScopeAbstract) {
-    DV->setDIE(VariableDie);
+    DV.setDIE(VariableDie);
     return VariableDie;
   }
 
   // Add variable address.
 
-  unsigned Offset = DV->getDotDebugLocOffset();
+  unsigned Offset = DV.getDotDebugLocOffset();
   if (Offset != ~0U) {
-    addLabel(VariableDie, dwarf::DW_AT_location, dwarf::DW_FORM_data4,
+    addLabel(VariableDie, dwarf::DW_AT_location,
+             DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+                                        : dwarf::DW_FORM_data4,
              Asm->GetTempSymbol("debug_loc", Offset));
-    DV->setDIE(VariableDie);
+    DV.setDIE(VariableDie);
     return VariableDie;
   }
 
   // Check if variable is described by a DBG_VALUE instruction.
-  if (const MachineInstr *DVInsn = DV->getMInsn()) {
+  if (const MachineInstr *DVInsn = DV.getMInsn()) {
     assert(DVInsn->getNumOperands() == 3);
     if (DVInsn->getOperand(0).isReg()) {
       const MachineOperand RegOp = DVInsn->getOperand(0);
       // If the second operand is an immediate, this is an indirect value.
       if (DVInsn->getOperand(1).isImm()) {
-        MachineLocation Location(RegOp.getReg(), DVInsn->getOperand(1).getImm());
-        addVariableAddress(*DV, VariableDie, Location);
+        MachineLocation Location(RegOp.getReg(),
+                                 DVInsn->getOperand(1).getImm());
+        addVariableAddress(DV, VariableDie, Location);
       } else if (RegOp.getReg())
-        addVariableAddress(*DV, VariableDie, MachineLocation(RegOp.getReg()));
+        addVariableAddress(DV, VariableDie, MachineLocation(RegOp.getReg()));
     } else if (DVInsn->getOperand(0).isImm())
-      addConstantValue(VariableDie, DVInsn->getOperand(0), DV->getType());
+      addConstantValue(VariableDie, DVInsn->getOperand(0), DV.getType());
     else if (DVInsn->getOperand(0).isFPImm())
       addConstantFPValue(VariableDie, DVInsn->getOperand(0));
     else if (DVInsn->getOperand(0).isCImm())
       addConstantValue(VariableDie, DVInsn->getOperand(0).getCImm(),
-                       DV->getType().isUnsignedDIType());
+                       isUnsignedDIType(DD, DV.getType()));
 
-    DV->setDIE(VariableDie);
+    DV.setDIE(VariableDie);
     return VariableDie;
   } else {
     // .. else use frame index.
-    int FI = DV->getFrameIndex();
+    int FI = DV.getFrameIndex();
     if (FI != ~0) {
       unsigned FrameReg = 0;
       const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-      int Offset =
-        TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
+      int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
       MachineLocation Location(FrameReg, Offset);
-      addVariableAddress(*DV, VariableDie, Location);
+      addVariableAddress(DV, VariableDie, Location);
     }
   }
 
-  DV->setDIE(VariableDie);
+  DV.setDIE(VariableDie);
   return VariableDie;
 }
 
-/// createMemberDIE - Create new member DIE.
-DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
-  DIE *MemberDie = new DIE(DT.getTag());
+/// constructMemberDIE - Construct member DIE from DIDerivedType.
+void CompileUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) {
+  DIE *MemberDie = createAndAddDIE(DT.getTag(), Buffer);
   StringRef Name = DT.getName();
   if (!Name.empty())
     addString(MemberDie, dwarf::DW_AT_name, Name);
 
-  addType(MemberDie, DT.getTypeDerivedFrom());
+  addType(MemberDie, resolve(DT.getTypeDerivedFrom()));
 
   addSourceLine(MemberDie, DT);
 
   DIEBlock *MemLocationDie = new (DIEValueAllocator) DIEBlock();
-  addUInt(MemLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-
-  uint64_t Size = DT.getSizeInBits();
-  uint64_t FieldSize = DT.getOriginalTypeSize();
-
-  if (Size != FieldSize) {
-    // Handle bitfield.
-    addUInt(MemberDie, dwarf::DW_AT_byte_size, 0, DT.getOriginalTypeSize()>>3);
-    addUInt(MemberDie, dwarf::DW_AT_bit_size, 0, DT.getSizeInBits());
-
-    uint64_t Offset = DT.getOffsetInBits();
-    uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
-    uint64_t HiMark = (Offset + FieldSize) & AlignMask;
-    uint64_t FieldOffset = (HiMark - FieldSize);
-    Offset -= FieldOffset;
-
-    // Maybe we need to work from the other end.
-    if (Asm->getDataLayout().isLittleEndian())
-      Offset = FieldSize - (Offset + Size);
-    addUInt(MemberDie, dwarf::DW_AT_bit_offset, 0, Offset);
-
-    // Here WD_AT_data_member_location points to the anonymous
-    // field that includes this bit field.
-    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, FieldOffset >> 3);
-
-  } else
-    // This is not a bitfield.
-    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits() >> 3);
+  addUInt(MemLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
 
-  if (DT.getTag() == dwarf::DW_TAG_inheritance
-      && DT.isVirtual()) {
+  if (DT.getTag() == dwarf::DW_TAG_inheritance && DT.isVirtual()) {
 
     // For C++, virtual base classes are not at fixed offset. Use following
     // expression to extract appropriate offset from vtable.
     // BaseAddr = ObAddr + *((*ObAddr) - Offset)
 
     DIEBlock *VBaseLocationDie = new (DIEValueAllocator) DIEBlock();
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits());
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
-
-    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0,
-             VBaseLocationDie);
-  } else
-    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_udata, DT.getOffsetInBits());
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+
+    addBlock(MemberDie, dwarf::DW_AT_data_member_location, VBaseLocationDie);
+  } else {
+    uint64_t Size = DT.getSizeInBits();
+    uint64_t FieldSize = getBaseTypeSize(DD, DT);
+    uint64_t OffsetInBytes;
+
+    if (Size != FieldSize) {
+      // Handle bitfield.
+      addUInt(MemberDie, dwarf::DW_AT_byte_size, None,
+              getBaseTypeSize(DD, DT) >> 3);
+      addUInt(MemberDie, dwarf::DW_AT_bit_size, None, DT.getSizeInBits());
+
+      uint64_t Offset = DT.getOffsetInBits();
+      uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
+      uint64_t HiMark = (Offset + FieldSize) & AlignMask;
+      uint64_t FieldOffset = (HiMark - FieldSize);
+      Offset -= FieldOffset;
+
+      // Maybe we need to work from the other end.
+      if (Asm->getDataLayout().isLittleEndian())
+        Offset = FieldSize - (Offset + Size);
+      addUInt(MemberDie, dwarf::DW_AT_bit_offset, None, Offset);
+
+      // Here WD_AT_data_member_location points to the anonymous
+      // field that includes this bit field.
+      OffsetInBytes = FieldOffset >> 3;
+    } else
+      // This is not a bitfield.
+      OffsetInBytes = DT.getOffsetInBits() >> 3;
+    addUInt(MemberDie, dwarf::DW_AT_data_member_location, None, OffsetInBytes);
+  }
 
   if (DT.isProtected())
     addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
@@ -1714,17 +1897,26 @@ DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
 
   if (DT.isArtificial())
     addFlag(MemberDie, dwarf::DW_AT_artificial);
-
-  return MemberDie;
 }
 
-/// createStaticMemberDIE - Create new DIE for C++ static member.
-DIE *CompileUnit::createStaticMemberDIE(const DIDerivedType DT) {
+/// getOrCreateStaticMemberDIE - Create new DIE for C++ static member.
+DIE *CompileUnit::getOrCreateStaticMemberDIE(DIDerivedType DT) {
   if (!DT.Verify())
     return NULL;
 
-  DIE *StaticMemberDIE = new DIE(DT.getTag());
-  DIType Ty = DT.getTypeDerivedFrom();
+  // Construct the context before querying for the existence of the DIE in case
+  // such construction creates the DIE.
+  DIE *ContextDIE = getOrCreateContextDIE(resolve(DT.getContext()));
+  assert(dwarf::isType(ContextDIE->getTag()) &&
+         "Static member should belong to a type.");
+
+  DIE *StaticMemberDIE = getDIE(DT);
+  if (StaticMemberDIE)
+    return StaticMemberDIE;
+
+  StaticMemberDIE = createAndAddDIE(DT.getTag(), *ContextDIE, DT);
+
+  DIType Ty = resolve(DT.getTypeDerivedFrom());
 
   addString(StaticMemberDIE, dwarf::DW_AT_name, DT.getName());
   addType(StaticMemberDIE, Ty);
@@ -1745,10 +1937,20 @@ DIE *CompileUnit::createStaticMemberDIE(const DIDerivedType DT) {
             dwarf::DW_ACCESS_public);
 
   if (const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DT.getConstant()))
-    addConstantValue(StaticMemberDIE, CI, Ty.isUnsignedDIType());
+    addConstantValue(StaticMemberDIE, CI, isUnsignedDIType(DD, Ty));
   if (const ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(DT.getConstant()))
     addConstantFPValue(StaticMemberDIE, CFP);
 
-  insertDIE(DT, StaticMemberDIE);
   return StaticMemberDIE;
 }
+
+void CompileUnit::emitHeader(const MCSection *ASection,
+                             const MCSymbol *ASectionSym) {
+  Asm->OutStreamer.AddComment("DWARF version number");
+  Asm->EmitInt16(DD->getDwarfVersion());
+  Asm->OutStreamer.AddComment("Offset Into Abbrev. Section");
+  Asm->EmitSectionOffset(Asm->GetTempSymbol(ASection->getLabelBeginName()),
+                         ASectionSym);
+  Asm->OutStreamer.AddComment("Address Size (in bytes)");
+  Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 3908b37..d782c88 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -15,7 +15,9 @@
 #define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
 
 #include "DIE.h"
+#include "DwarfDebug.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/DebugInfo.h"
@@ -23,8 +25,6 @@
 
 namespace llvm {
 
-class DwarfDebug;
-class DwarfUnits;
 class MachineLocation;
 class MachineOperand;
 class ConstantInt;
@@ -39,11 +39,10 @@ class CompileUnit {
   ///
   unsigned UniqueID;
 
-  /// Language - The DW_AT_language of the compile unit
-  ///
-  unsigned Language;
+  /// Node - MDNode for the compile unit.
+  DICompileUnit Node;
 
-  /// Die - Compile unit debug information entry.
+  /// CUDie - Compile unit debug information entry.
   ///
   const OwningPtr<DIE> CUDie;
 
@@ -67,18 +66,18 @@ class CompileUnit {
 
   /// GlobalNames - A map of globally visible named entities for this unit.
   ///
-  StringMap<DIE*> GlobalNames;
+  StringMap<DIE *> GlobalNames;
 
   /// GlobalTypes - A map of globally visible types for this unit.
   ///
-  StringMap<DIE*> GlobalTypes;
+  StringMap<DIE *> GlobalTypes;
 
   /// AccelNames - A map of names for the name accelerator table.
   ///
-  StringMap<std::vector<DIE*> > AccelNames;
-  StringMap<std::vector<DIE*> > AccelObjC;
-  StringMap<std::vector<DIE*> > AccelNamespace;
-  StringMap<std::vector<std::pair<DIE*, unsigned> > > AccelTypes;
+  StringMap<std::vector<DIE *> > AccelNames;
+  StringMap<std::vector<DIE *> > AccelObjC;
+  StringMap<std::vector<DIE *> > AccelNamespace;
+  StringMap<std::vector<std::pair<DIE *, unsigned> > > AccelTypes;
 
   /// DIEBlocks - A list of all the DIEBlocks in use.
   std::vector<DIEBlock *> DIEBlocks;
@@ -88,165 +87,154 @@ class CompileUnit {
   /// corresponds to the MDNode mapped with the subprogram DIE.
   DenseMap<DIE *, const MDNode *> ContainingTypeMap;
 
-  /// Offset of the CUDie from beginning of debug info section.
-  unsigned DebugInfoOffset;
+  // DIEValueAllocator - All DIEValues are allocated through this allocator.
+  BumpPtrAllocator DIEValueAllocator;
 
-  /// getLowerBoundDefault - Return the default lower bound for an array. If the
-  /// DWARF version doesn't handle the language, return -1.
-  int64_t getDefaultLowerBound() const;
+  // DIEIntegerOne - A preallocated DIEValue because 1 is used frequently.
+  DIEInteger *DIEIntegerOne;
 
 public:
-  CompileUnit(unsigned UID, unsigned L, DIE *D, const MDNode *N, AsmPrinter *A,
+  CompileUnit(unsigned UID, DIE *D, DICompileUnit CU, AsmPrinter *A,
               DwarfDebug *DW, DwarfUnits *DWU);
   ~CompileUnit();
 
   // Accessors.
-  unsigned getUniqueID()            const { return UniqueID; }
-  unsigned getLanguage()            const { return Language; }
-  DIE* getCUDie()                   const { return CUDie.get(); }
-  unsigned getDebugInfoOffset()     const { return DebugInfoOffset; }
-  const StringMap<DIE*> &getGlobalNames() const { return GlobalNames; }
-  const StringMap<DIE*> &getGlobalTypes() const { return GlobalTypes; }
-
-  const StringMap<std::vector<DIE*> > &getAccelNames() const {
+  unsigned getUniqueID() const { return UniqueID; }
+  uint16_t getLanguage() const { return Node.getLanguage(); }
+  DICompileUnit getNode() const { return Node; }
+  DIE *getCUDie() const { return CUDie.get(); }
+  const StringMap<DIE *> &getGlobalNames() const { return GlobalNames; }
+  const StringMap<DIE *> &getGlobalTypes() const { return GlobalTypes; }
+
+  const StringMap<std::vector<DIE *> > &getAccelNames() const {
     return AccelNames;
   }
-  const StringMap<std::vector<DIE*> > &getAccelObjC() const {
+  const StringMap<std::vector<DIE *> > &getAccelObjC() const {
     return AccelObjC;
   }
-  const StringMap<std::vector<DIE*> > &getAccelNamespace() const {
+  const StringMap<std::vector<DIE *> > &getAccelNamespace() const {
     return AccelNamespace;
   }
-  const StringMap<std::vector<std::pair<DIE*, unsigned > > >
-  &getAccelTypes() const {
+  const StringMap<std::vector<std::pair<DIE *, unsigned> > > &
+  getAccelTypes() const {
     return AccelTypes;
   }
 
+  unsigned getDebugInfoOffset() const { return DebugInfoOffset; }
   void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; }
+
   /// hasContent - Return true if this compile unit has something to write out.
   ///
   bool hasContent() const { return !CUDie->getChildren().empty(); }
 
+  /// getParentContextString - Get a string containing the language specific
+  /// context for a global name.
+  std::string getParentContextString(DIScope Context) const;
+
   /// addGlobalName - Add a new global entity to the compile unit.
   ///
-  void addGlobalName(StringRef Name, DIE *Die) { GlobalNames[Name] = Die; }
+  void addGlobalName(StringRef Name, DIE *Die, DIScope Context);
 
   /// addGlobalType - Add a new global type to the compile unit.
   ///
   void addGlobalType(DIType Ty);
 
+  /// addPubTypes - Add a set of types from the subprogram to the global types.
+  void addPubTypes(DISubprogram SP);
 
   /// addAccelName - Add a new name to the name accelerator table.
-  void addAccelName(StringRef Name, DIE *Die) {
-    std::vector<DIE*> &DIEs = AccelNames[Name];
-    DIEs.push_back(Die);
-  }
-  void addAccelObjC(StringRef Name, DIE *Die) {
-    std::vector<DIE*> &DIEs = AccelObjC[Name];
-    DIEs.push_back(Die);
-  }
-  void addAccelNamespace(StringRef Name, DIE *Die) {
-    std::vector<DIE*> &DIEs = AccelNamespace[Name];
-    DIEs.push_back(Die);
-  }
-  void addAccelType(StringRef Name, std::pair<DIE *, unsigned> Die) {
-    std::vector<std::pair<DIE *, unsigned> > &DIEs = AccelTypes[Name];
-    DIEs.push_back(Die);
-  }
+  void addAccelName(StringRef Name, DIE *Die);
 
-  /// getDIE - Returns the debug information entry map slot for the
-  /// specified debug variable.
-  DIE *getDIE(const MDNode *N) const { return MDNodeToDieMap.lookup(N); }
+  /// addAccelObjC - Add a new name to the ObjC accelerator table.
+  void addAccelObjC(StringRef Name, DIE *Die);
 
-  DIEBlock *getDIEBlock() {
-    return new (DIEValueAllocator) DIEBlock();
-  }
+  /// addAccelNamespace - Add a new name to the namespace accelerator table.
+  void addAccelNamespace(StringRef Name, DIE *Die);
 
-  /// insertDIE - Insert DIE into the map.
-  void insertDIE(const MDNode *N, DIE *D) {
-    MDNodeToDieMap.insert(std::make_pair(N, D));
-  }
+  /// addAccelType - Add a new type to the type accelerator table.
+  void addAccelType(StringRef Name, std::pair<DIE *, unsigned> Die);
 
-  /// getDIEEntry - Returns the debug information entry for the specified
-  /// debug variable.
-  DIEEntry *getDIEEntry(const MDNode *N) const {
-    return MDNodeToDIEEntryMap.lookup(N);
-  }
+  /// getDIE - Returns the debug information entry map slot for the
+  /// specified debug variable. We delegate the request to DwarfDebug
+  /// when the MDNode can be part of the type system, since DIEs for
+  /// the type system can be shared across CUs and the mappings are
+  /// kept in DwarfDebug.
+  DIE *getDIE(DIDescriptor D) const;
 
-  /// insertDIEEntry - Insert debug information entry into the map.
-  void insertDIEEntry(const MDNode *N, DIEEntry *E) {
-    MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
-  }
+  DIEBlock *getDIEBlock() { return new (DIEValueAllocator) DIEBlock(); }
+
+  /// insertDIE - Insert DIE into the map. We delegate the request to DwarfDebug
+  /// when the MDNode can be part of the type system, since DIEs for
+  /// the type system can be shared across CUs and the mappings are
+  /// kept in DwarfDebug.
+  void insertDIE(DIDescriptor Desc, DIE *D);
 
   /// addDie - Adds or interns the DIE to the compile unit.
   ///
-  void addDie(DIE *Buffer) {
-    this->CUDie->addChild(Buffer);
-  }
-
-  // getIndexTyDie - Get an anonymous type for index type.
-  DIE *getIndexTyDie() {
-    return IndexTyDie;
-  }
-
-  // setIndexTyDie - Set D as anonymous type for index which can be reused
-  // later.
-  void setIndexTyDie(DIE *D) {
-    IndexTyDie = D;
-  }
+  void addDie(DIE *Buffer) { CUDie->addChild(Buffer); }
 
   /// addFlag - Add a flag that is true to the DIE.
-  void addFlag(DIE *Die, unsigned Attribute);
+  void addFlag(DIE *Die, dwarf::Attribute Attribute);
 
   /// addUInt - Add an unsigned integer attribute data and value.
   ///
-  void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
+  void addUInt(DIE *Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form,
+               uint64_t Integer);
+
+  void addUInt(DIEBlock *Block, dwarf::Form Form, uint64_t Integer);
 
   /// addSInt - Add an signed integer attribute data and value.
   ///
-  void addSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer);
+  void addSInt(DIE *Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form,
+               int64_t Integer);
+
+  void addSInt(DIEBlock *Die, Optional<dwarf::Form> Form, int64_t Integer);
 
   /// addString - Add a string attribute data and value.
   ///
-  void addString(DIE *Die, unsigned Attribute, const StringRef Str);
+  void addString(DIE *Die, dwarf::Attribute Attribute, const StringRef Str);
 
   /// addLocalString - Add a string attribute data and value.
   ///
-  void addLocalString(DIE *Die, unsigned Attribute, const StringRef Str);
+  void addLocalString(DIE *Die, dwarf::Attribute Attribute, const StringRef Str);
 
   /// addExpr - Add a Dwarf expression attribute data and value.
   ///
-  void addExpr(DIE *Die, unsigned Attribute, unsigned Form,
-               const MCExpr *Expr);
+  void addExpr(DIEBlock *Die, dwarf::Form Form, const MCExpr *Expr);
 
   /// addLabel - Add a Dwarf label attribute data and value.
   ///
-  void addLabel(DIE *Die, unsigned Attribute, unsigned Form,
+  void addLabel(DIE *Die, dwarf::Attribute Attribute, dwarf::Form Form,
                 const MCSymbol *Label);
 
+  void addLabel(DIEBlock *Die, dwarf::Form Form, const MCSymbol *Label);
+
   /// addLabelAddress - Add a dwarf label attribute data and value using
   /// either DW_FORM_addr or DW_FORM_GNU_addr_index.
   ///
-  void addLabelAddress(DIE *Die, unsigned Attribute, MCSymbol *Label);
+  void addLabelAddress(DIE *Die, dwarf::Attribute Attribute, MCSymbol *Label);
 
   /// addOpAddress - Add a dwarf op address data and value using the
   /// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
   ///
-  void addOpAddress(DIE *Die, const MCSymbol *Label);
-  void addOpAddress(DIE *Die, const MCSymbolRefExpr *Label);
+  void addOpAddress(DIEBlock *Die, const MCSymbol *Label);
 
   /// addDelta - Add a label delta attribute data and value.
   ///
-  void addDelta(DIE *Die, unsigned Attribute, unsigned Form,
-                const MCSymbol *Hi, const MCSymbol *Lo);
+  void addDelta(DIE *Die, dwarf::Attribute Attribute, dwarf::Form Form, const MCSymbol *Hi,
+                const MCSymbol *Lo);
 
   /// addDIEEntry - Add a DIE attribute data and value.
   ///
-  void addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, DIE *Entry);
+  void addDIEEntry(DIE *Die, dwarf::Attribute Attribute, DIE *Entry);
+
+  /// addDIEEntry - Add a DIE attribute data and value.
+  ///
+  void addDIEEntry(DIE *Die, dwarf::Attribute Attribute, DIEEntry *Entry);
 
   /// addBlock - Add block data.
   ///
-  void addBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block);
+  void addBlock(DIE *Die, dwarf::Attribute Attribute, DIEBlock *Block);
 
   /// addSourceLine - Add location information to specified debug information
   /// entry.
@@ -259,8 +247,8 @@ public:
 
   /// addAddress - Add an address attribute to a die based on the location
   /// provided.
-  void addAddress(DIE *Die, unsigned Attribute,
-                  const MachineLocation &Location, bool Indirect = false);
+  void addAddress(DIE *Die, dwarf::Attribute Attribute, const MachineLocation &Location,
+                  bool Indirect = false);
 
   /// addConstantValue - Add constant value entry in variable DIE.
   void addConstantValue(DIE *Die, const MachineOperand &MO, DIType Ty);
@@ -275,17 +263,17 @@ public:
   void addTemplateParams(DIE &Buffer, DIArray TParams);
 
   /// addRegisterOp - Add register operand.
-  void addRegisterOp(DIE *TheDie, unsigned Reg);
+  void addRegisterOp(DIEBlock *TheDie, unsigned Reg);
 
   /// addRegisterOffset - Add register offset.
-  void addRegisterOffset(DIE *TheDie, unsigned Reg, int64_t Offset);
+  void addRegisterOffset(DIEBlock *TheDie, unsigned Reg, int64_t Offset);
 
   /// addComplexAddress - Start with the address based on the location provided,
   /// and generate the DWARF information necessary to find the actual variable
   /// (navigating the extra location information encoded in the type) based on
   /// the starting location.  Add the DWARF information to the die.
   ///
-  void addComplexAddress(const DbgVariable &DV, DIE *Die, unsigned Attribute,
+  void addComplexAddress(const DbgVariable &DV, DIE *Die, dwarf::Attribute Attribute,
                          const MachineLocation &Location);
 
   // FIXME: Should be reformulated in terms of addComplexAddress.
@@ -295,7 +283,7 @@ public:
   /// starting location.  Add the DWARF information to the die.  Obsolete,
   /// please use addComplexAddress instead.
   ///
-  void addBlockByrefAddress(const DbgVariable &DV, DIE *Die, unsigned Attribute,
+  void addBlockByrefAddress(const DbgVariable &DV, DIE *Die, dwarf::Attribute Attribute,
                             const MachineLocation &Location);
 
   /// addVariableAddress - Add DW_AT_location attribute for a
@@ -303,13 +291,10 @@ public:
   void addVariableAddress(const DbgVariable &DV, DIE *Die,
                           MachineLocation Location);
 
-  /// addToContextOwner - Add Die into the list of its context owner's children.
-  void addToContextOwner(DIE *Die, DIDescriptor Context);
-
   /// addType - Add a new type attribute to the specified entity. This takes
   /// and attribute parameter because DW_AT_friend attributes are also
   /// type references.
-  void addType(DIE *Entity, DIType Ty, unsigned Attribute = dwarf::DW_AT_type);
+  void addType(DIE *Entity, DIType Ty, dwarf::Attribute Attribute = dwarf::DW_AT_type);
 
   /// getOrCreateNameSpace - Create a DIE for DINameSpace.
   DIE *getOrCreateNameSpace(DINameSpace NS);
@@ -321,66 +306,103 @@ public:
   /// given DIType.
   DIE *getOrCreateTypeDIE(const MDNode *N);
 
-  /// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
-  /// for the given DITemplateTypeParameter.
-  DIE *getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP);
+  /// getOrCreateContextDIE - Get context owner's DIE.
+  DIE *getOrCreateContextDIE(DIScope Context);
 
-  /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create
-  /// new DIE for the given DITemplateValueParameter.
-  DIE *getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TVP);
+  /// createGlobalVariableDIE - create global variable DIE.
+  void createGlobalVariableDIE(DIGlobalVariable GV);
 
-  /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
-  /// information entry.
-  DIEEntry *createDIEEntry(DIE *Entry);
+  /// constructContainingTypeDIEs - Construct DIEs for types that contain
+  /// vtables.
+  void constructContainingTypeDIEs();
 
-  /// createGlobalVariableDIE - create global variable DIE.
-  void createGlobalVariableDIE(const MDNode *N);
+  /// constructVariableDIE - Construct a DIE for the given DbgVariable.
+  DIE *constructVariableDIE(DbgVariable &DV, bool isScopeAbstract);
+
+  /// Create a DIE with the given Tag, add the DIE to its parent, and
+  /// call insertDIE if MD is not null.
+  DIE *createAndAddDIE(unsigned Tag, DIE &Parent, DIDescriptor N = DIDescriptor());
+
+  /// Compute the size of a header for this unit, not including the initial
+  /// length field.
+  unsigned getHeaderSize() const {
+    return sizeof(int16_t) + // DWARF version number
+           sizeof(int32_t) + // Offset Into Abbrev. Section
+           sizeof(int8_t);   // Pointer Size (in bytes)
+  }
 
-  void addPubTypes(DISubprogram SP);
+  /// Emit the header for this unit, not including the initial length field.
+  void emitHeader(const MCSection *ASection, const MCSymbol *ASectionSym);
 
+private:
   /// constructTypeDIE - Construct basic type die from DIBasicType.
-  void constructTypeDIE(DIE &Buffer,
-                        DIBasicType BTy);
+  void constructTypeDIE(DIE &Buffer, DIBasicType BTy);
 
   /// constructTypeDIE - Construct derived type die from DIDerivedType.
-  void constructTypeDIE(DIE &Buffer,
-                        DIDerivedType DTy);
+  void constructTypeDIE(DIE &Buffer, DIDerivedType DTy);
 
   /// constructTypeDIE - Construct type DIE from DICompositeType.
-  void constructTypeDIE(DIE &Buffer,
-                        DICompositeType CTy);
+  void constructTypeDIE(DIE &Buffer, DICompositeType CTy);
 
   /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
   void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
 
   /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-  void constructArrayTypeDIE(DIE &Buffer,
-                             DICompositeType *CTy);
+  void constructArrayTypeDIE(DIE &Buffer, DICompositeType CTy);
 
   /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-  DIE *constructEnumTypeDIE(DIEnumerator ETy);
+  void constructEnumTypeDIE(DIE &Buffer, DICompositeType CTy);
 
-  /// constructContainingTypeDIEs - Construct DIEs for types that contain
-  /// vtables.
-  void constructContainingTypeDIEs();
+  /// constructMemberDIE - Construct member DIE from DIDerivedType.
+  void constructMemberDIE(DIE &Buffer, DIDerivedType DT);
 
-  /// constructVariableDIE - Construct a DIE for the given DbgVariable.
-  DIE *constructVariableDIE(DbgVariable *DV, bool isScopeAbstract);
+  /// constructTemplateTypeParameterDIE - Construct new DIE for the given
+  /// DITemplateTypeParameter.
+  void constructTemplateTypeParameterDIE(DIE &Buffer,
+                                         DITemplateTypeParameter TP);
 
-  /// createMemberDIE - Create new member DIE.
-  DIE *createMemberDIE(DIDerivedType DT);
+  /// constructTemplateValueParameterDIE - Construct new DIE for the given
+  /// DITemplateValueParameter.
+  void constructTemplateValueParameterDIE(DIE &Buffer,
+                                          DITemplateValueParameter TVP);
 
-  /// createStaticMemberDIE - Create new static data member DIE.
-  DIE *createStaticMemberDIE(DIDerivedType DT);
+  /// getOrCreateStaticMemberDIE - Create new static data member DIE.
+  DIE *getOrCreateStaticMemberDIE(DIDerivedType DT);
 
-  /// getOrCreateContextDIE - Get context owner's DIE.
-  DIE *getOrCreateContextDIE(DIDescriptor Context);
+  /// Offset of the CUDie from beginning of debug info section.
+  unsigned DebugInfoOffset;
 
-private:
+  /// getLowerBoundDefault - Return the default lower bound for an array. If the
+  /// DWARF version doesn't handle the language, return -1.
+  int64_t getDefaultLowerBound() const;
 
-  // DIEValueAllocator - All DIEValues are allocated through this allocator.
-  BumpPtrAllocator DIEValueAllocator;
-  DIEInteger *DIEIntegerOne;
+  /// getDIEEntry - Returns the debug information entry for the specified
+  /// debug variable.
+  DIEEntry *getDIEEntry(const MDNode *N) const {
+    return MDNodeToDIEEntryMap.lookup(N);
+  }
+
+  /// insertDIEEntry - Insert debug information entry into the map.
+  void insertDIEEntry(const MDNode *N, DIEEntry *E) {
+    MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
+  }
+
+  // getIndexTyDie - Get an anonymous type for index type.
+  DIE *getIndexTyDie() { return IndexTyDie; }
+
+  // setIndexTyDie - Set D as anonymous type for index which can be reused
+  // later.
+  void setIndexTyDie(DIE *D) { IndexTyDie = D; }
+
+  /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+  /// information entry.
+  DIEEntry *createDIEEntry(DIE *Entry);
+
+  /// resolve - Look in the DwarfDebug map for the MDNode that
+  /// corresponds to the reference.
+  template <typename T> T resolve(DIRef<T> Ref) const {
+    return DD->resolve(Ref);
+  }
 };
 
 } // end llvm namespace
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 979c0c3..24e2c05 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -14,6 +14,7 @@
 #define DEBUG_TYPE "dwarfdebug"
 #include "DwarfDebug.h"
 #include "DIE.h"
+#include "DIEHash.h"
 #include "DwarfAccelTable.h"
 #include "DwarfCompileUnit.h"
 #include "llvm/ADT/STLExtras.h"
@@ -34,6 +35,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/MD5.h"
@@ -57,15 +59,20 @@ static cl::opt<bool> UnknownLocations(
     cl::init(false));
 
 static cl::opt<bool>
-GenerateDwarfPubNamesSection("generate-dwarf-pubnames", cl::Hidden,
-                             cl::init(false),
-                             cl::desc("Generate DWARF pubnames section"));
-
-static cl::opt<bool>
 GenerateODRHash("generate-odr-hash", cl::Hidden,
                 cl::desc("Add an ODR hash to external type DIEs."),
                 cl::init(false));
 
+static cl::opt<bool>
+GenerateCUHash("generate-cu-hash", cl::Hidden,
+               cl::desc("Add the CU hash as the dwo_id."),
+               cl::init(false));
+
+static cl::opt<bool>
+GenerateGnuPubSections("generate-gnu-dwarf-pub-sections", cl::Hidden,
+                       cl::desc("Generate GNU-style pubnames and pubtypes"),
+                       cl::init(false));
+
 namespace {
 enum DefaultOnOff {
   Default,
@@ -83,14 +90,6 @@ DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
                  cl::init(Default));
 
 static cl::opt<DefaultOnOff>
-DarwinGDBCompat("darwin-gdb-compat", cl::Hidden,
-                cl::desc("Compatibility with Darwin gdb."),
-                cl::values(clEnumVal(Default, "Default for platform"),
-                           clEnumVal(Enable, "Enabled"),
-                           clEnumVal(Disable, "Disabled"), clEnumValEnd),
-                cl::init(Default));
-
-static cl::opt<DefaultOnOff>
 SplitDwarf("split-dwarf", cl::Hidden,
            cl::desc("Output prototype dwarf split debug info."),
            cl::values(clEnumVal(Default, "Default for platform"),
@@ -98,16 +97,16 @@ SplitDwarf("split-dwarf", cl::Hidden,
                       clEnumVal(Disable, "Disabled"), clEnumValEnd),
            cl::init(Default));
 
-namespace {
-  const char *const DWARFGroupName = "DWARF Emission";
-  const char *const DbgTimerName = "DWARF Debug Writer";
+static cl::opt<DefaultOnOff>
+DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden,
+                 cl::desc("Generate DWARF pubnames and pubtypes sections"),
+                 cl::values(clEnumVal(Default, "Default for platform"),
+                            clEnumVal(Enable, "Enabled"),
+                            clEnumVal(Disable, "Disabled"), clEnumValEnd),
+                 cl::init(Default));
 
-  struct CompareFirst {
-    template <typename T> bool operator()(const T &lhs, const T &rhs) const {
-      return lhs.first < rhs.first;
-    }
-  };
-} // end anonymous namespace
+static const char *const DWARFGroupName = "DWARF Emission";
+static const char *const DbgTimerName = "DWARF Debug Writer";
 
 //===----------------------------------------------------------------------===//
 
@@ -117,6 +116,13 @@ static const unsigned InitAbbreviationsSetSize = 9; // log2(512)
 
 namespace llvm {
 
+/// resolve - Look in the DwarfDebug map for the MDNode that
+/// corresponds to the reference.
+template <typename T>
+T DbgVariable::resolve(DIRef<T> Ref) const {
+  return DD->resolve(Ref);
+}
+
 DIType DbgVariable::getType() const {
   DIType Ty = Var.getType();
   // FIXME: isBlockByrefVariable should be reformulated in terms of complex
@@ -147,21 +153,16 @@ DIType DbgVariable::getType() const {
        the pointers and __Block_byref_x_VarName struct to find the actual
        value of the variable.  The function addBlockByrefType does this.  */
     DIType subType = Ty;
-    unsigned tag = Ty.getTag();
+    uint16_t tag = Ty.getTag();
 
-    if (tag == dwarf::DW_TAG_pointer_type) {
-      DIDerivedType DTy = DIDerivedType(Ty);
-      subType = DTy.getTypeDerivedFrom();
-    }
-
-    DICompositeType blockStruct = DICompositeType(subType);
-    DIArray Elements = blockStruct.getTypeArray();
+    if (tag == dwarf::DW_TAG_pointer_type)
+      subType = resolve(DIDerivedType(Ty).getTypeDerivedFrom());
 
+    DIArray Elements = DICompositeType(subType).getTypeArray();
     for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
-      DIDescriptor Element = Elements.getElement(i);
-      DIDerivedType DT = DIDerivedType(Element);
+      DIDerivedType DT(Elements.getElement(i));
       if (getName() == DT.getName())
-        return (DT.getTypeDerivedFrom());
+        return (resolve(DT.getTypeDerivedFrom()));
     }
   }
   return Ty;
@@ -182,10 +183,10 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
     AbbreviationsSet(InitAbbreviationsSetSize),
     SourceIdMap(DIEValueAllocator),
     PrevLabel(NULL), GlobalCUIndexCount(0),
-    InfoHolder(A, &AbbreviationsSet, &Abbreviations, "info_string",
+    InfoHolder(A, &AbbreviationsSet, Abbreviations, "info_string",
                DIEValueAllocator),
     SkeletonAbbrevSet(InitAbbreviationsSetSize),
-    SkeletonHolder(A, &SkeletonAbbrevSet, &SkeletonAbbrevs, "skel_string",
+    SkeletonHolder(A, &SkeletonAbbrevSet, SkeletonAbbrevs, "skel_string",
                    DIEValueAllocator) {
 
   DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0;
@@ -195,29 +196,24 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
   DwarfAbbrevDWOSectionSym = DwarfStrDWOSectionSym = 0;
   FunctionBeginSym = FunctionEndSym = 0;
 
-  // Turn on accelerator tables and older gdb compatibility
-  // for Darwin.
+  // Turn on accelerator tables for Darwin by default, pubnames by
+  // default for non-Darwin, and handle split dwarf.
   bool IsDarwin = Triple(A->getTargetTriple()).isOSDarwin();
-  if (DarwinGDBCompat == Default) {
-    if (IsDarwin)
-      IsDarwinGDBCompat = true;
-    else
-      IsDarwinGDBCompat = false;
-  } else
-    IsDarwinGDBCompat = DarwinGDBCompat == Enable ? true : false;
 
-  if (DwarfAccelTables == Default) {
-    if (IsDarwin)
-      HasDwarfAccelTables = true;
-    else
-      HasDwarfAccelTables = false;
-  } else
-    HasDwarfAccelTables = DwarfAccelTables == Enable ? true : false;
+  if (DwarfAccelTables == Default)
+    HasDwarfAccelTables = IsDarwin;
+  else
+    HasDwarfAccelTables = DwarfAccelTables == Enable;
 
   if (SplitDwarf == Default)
     HasSplitDwarf = false;
   else
-    HasSplitDwarf = SplitDwarf == Enable ? true : false;
+    HasSplitDwarf = SplitDwarf == Enable;
+
+  if (DwarfPubSections == Default)
+    HasDwarfPubSections = !IsDarwin;
+  else
+    HasDwarfPubSections = DwarfPubSections == Enable;
 
   DwarfVersion = getDwarfVersionFromModule(MMI->getModule());
 
@@ -226,8 +222,6 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
     beginModule();
   }
 }
-DwarfDebug::~DwarfDebug() {
-}
 
 // Switch to the specified MCSection and emit an assembler
 // temporary label to it if SymbolStem is specified.
@@ -285,10 +279,10 @@ void DwarfUnits::assignAbbrevNumber(DIEAbbrev &Abbrev) {
   // If it's newly added.
   if (InSet == &Abbrev) {
     // Add to abbreviation list.
-    Abbreviations->push_back(&Abbrev);
+    Abbreviations.push_back(&Abbrev);
 
     // Assign the vector position + 1 as its number.
-    Abbrev.setNumber(Abbreviations->size());
+    Abbrev.setNumber(Abbreviations.size());
   } else {
     // Assign existing abbreviation number.
     Abbrev.setNumber(InSet->getNumber());
@@ -302,12 +296,7 @@ static bool isObjCClass(StringRef Name) {
 static bool hasObjCCategory(StringRef Name) {
   if (!isObjCClass(Name)) return false;
 
-  size_t pos = Name.find(')');
-  if (pos != std::string::npos) {
-    if (Name[pos+1] != ' ') return false;
-    return true;
-  }
-  return false;
+  return Name.find(") ") != StringRef::npos;
 }
 
 static void getObjCClassCategory(StringRef In, StringRef &Class,
@@ -327,11 +316,20 @@ static StringRef getObjCMethodName(StringRef In) {
   return In.slice(In.find(' ') + 1, In.find(']'));
 }
 
+// Helper for sorting sections into a stable output order.
+static bool SectionSort(const MCSection *A, const MCSection *B) {
+    std::string LA = (A ? A->getLabelBeginName() : "");
+    std::string LB = (B ? B->getLabelBeginName() : "");
+    return LA < LB;
+}
+
 // Add the various names to the Dwarf accelerator table names.
+// TODO: Determine whether or not we should add names for programs
+// that do not have a DW_AT_name or DW_AT_linkage_name field - this
+// is only slightly different than the lookup of non-standard ObjC names.
 static void addSubprogramNames(CompileUnit *TheCU, DISubprogram SP,
                                DIE* Die) {
   if (!SP.isDefinition()) return;
-
   TheCU->addAccelName(SP.getName(), Die);
 
   // If the linkage name is different than the name, go ahead and output
@@ -352,30 +350,34 @@ static void addSubprogramNames(CompileUnit *TheCU, DISubprogram SP,
   }
 }
 
+/// isSubprogramContext - Return true if Context is either a subprogram
+/// or another context nested inside a subprogram.
+bool DwarfDebug::isSubprogramContext(const MDNode *Context) {
+  if (!Context)
+    return false;
+  DIDescriptor D(Context);
+  if (D.isSubprogram())
+    return true;
+  if (D.isType())
+    return isSubprogramContext(resolve(DIType(Context).getContext()));
+  return false;
+}
+
 // Find DIE for the given subprogram and attach appropriate DW_AT_low_pc
 // and DW_AT_high_pc attributes. If there are global variables in this
 // scope then create and insert DIEs for these variables.
-DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
-                                          const MDNode *SPNode) {
-  DIE *SPDie = SPCU->getDIE(SPNode);
+DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU, DISubprogram SP) {
+  DIE *SPDie = SPCU->getDIE(SP);
 
   assert(SPDie && "Unable to find subprogram DIE!");
-  DISubprogram SP(SPNode);
 
   // If we're updating an abstract DIE, then we will be adding the children and
   // object pointer later on. But what we don't want to do is process the
   // concrete DIE twice.
-  DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode);
-  if (AbsSPDIE) {
-    bool InSameCU = (AbsSPDIE->getCompileUnit() == SPCU->getCUDie());
+  if (DIE *AbsSPDIE = AbstractSPDies.lookup(SP)) {
     // Pick up abstract subprogram DIE.
-    SPDie = new DIE(dwarf::DW_TAG_subprogram);
-    // If AbsSPDIE belongs to a different CU, use DW_FORM_ref_addr instead of
-    // DW_FORM_ref4.
-    SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin,
-                      InSameCU ? dwarf::DW_FORM_ref4 : dwarf::DW_FORM_ref_addr,
-                      AbsSPDIE);
-    SPCU->addDie(SPDie);
+    SPDie = SPCU->createAndAddDIE(dwarf::DW_TAG_subprogram, *SPCU->getCUDie());
+    SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin, AbsSPDIE);
   } else {
     DISubprogram SPDecl = SP.getFunctionDeclaration();
     if (!SPDecl.isSubprogram()) {
@@ -384,32 +386,31 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
       // function then gdb prefers the definition at top level and but does not
       // expect specification DIE in parent function. So avoid creating
       // specification DIE for a function defined inside a function.
-      if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
-          !SP.getContext().isFile() &&
-          !isSubprogramContext(SP.getContext())) {
+      DIScope SPContext = resolve(SP.getContext());
+      if (SP.isDefinition() && !SPContext.isCompileUnit() &&
+          !SPContext.isFile() &&
+          !isSubprogramContext(SPContext)) {
         SPCU->addFlag(SPDie, dwarf::DW_AT_declaration);
 
         // Add arguments.
         DICompositeType SPTy = SP.getType();
         DIArray Args = SPTy.getTypeArray();
-        unsigned SPTag = SPTy.getTag();
+        uint16_t SPTag = SPTy.getTag();
         if (SPTag == dwarf::DW_TAG_subroutine_type)
           for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
-            DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
-            DIType ATy = DIType(Args.getElement(i));
+            DIE *Arg =
+                SPCU->createAndAddDIE(dwarf::DW_TAG_formal_parameter, *SPDie);
+            DIType ATy(Args.getElement(i));
             SPCU->addType(Arg, ATy);
             if (ATy.isArtificial())
               SPCU->addFlag(Arg, dwarf::DW_AT_artificial);
             if (ATy.isObjectPointer())
-              SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer,
-                                dwarf::DW_FORM_ref4, Arg);
-            SPDie->addChild(Arg);
+              SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer, Arg);
           }
         DIE *SPDeclDie = SPDie;
-        SPDie = new DIE(dwarf::DW_TAG_subprogram);
-        SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification,
-                          dwarf::DW_FORM_ref4, SPDeclDie);
-        SPCU->addDie(SPDie);
+        SPDie =
+            SPCU->createAndAddDIE(dwarf::DW_TAG_subprogram, *SPCU->getCUDie());
+        SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, SPDeclDie);
       }
     }
   }
@@ -431,18 +432,39 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
   return SPDie;
 }
 
+/// Check whether we should create a DIE for the given Scope, return true
+/// if we don't create a DIE (the corresponding DIE is null).
+bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) {
+  if (Scope->isAbstractScope())
+    return false;
+
+  // We don't create a DIE if there is no Range.
+  const SmallVectorImpl<InsnRange> &Ranges = Scope->getRanges();
+  if (Ranges.empty())
+    return true;
+
+  if (Ranges.size() > 1)
+    return false;
+
+  // We don't create a DIE if we have a single Range and the end label
+  // is null.
+  SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin();
+  MCSymbol *End = getLabelAfterInsn(RI->second);
+  return !End;
+}
+
 // Construct new DW_TAG_lexical_block for this scope and attach
 // DW_AT_low_pc/DW_AT_high_pc labels.
 DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU,
                                           LexicalScope *Scope) {
+  if (isLexicalScopeDIENull(Scope))
+    return 0;
+
   DIE *ScopeDIE = new DIE(dwarf::DW_TAG_lexical_block);
   if (Scope->isAbstractScope())
     return ScopeDIE;
 
   const SmallVectorImpl<InsnRange> &Ranges = Scope->getRanges();
-  if (Ranges.empty())
-    return 0;
-
   // If we have multiple ranges, emit them into the range section.
   if (Ranges.size() > 1) {
     // .debug_range section has not been laid out yet. Emit offset in
@@ -467,8 +489,7 @@ DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU,
   SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin();
   MCSymbol *Start = getLabelBeforeInsn(RI->first);
   MCSymbol *End = getLabelAfterInsn(RI->second);
-
-  if (End == 0) return 0;
+  assert(End && "End label should not be null!");
 
   assert(Start->isDefined() && "Invalid starting label for an inlined scope!");
   assert(End->isDefined() && "Invalid end label for an inlined scope!");
@@ -498,8 +519,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
   }
 
   DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine);
-  TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin,
-                     dwarf::DW_FORM_ref4, OriginDIE);
+  TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin, OriginDIE);
 
   if (Ranges.size() > 1) {
     // .debug_range section has not been laid out yet. Emit offset in
@@ -535,26 +555,10 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
 
   // Add the call site information to the DIE.
   DILocation DL(Scope->getInlinedAt());
-  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, 0,
+  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, None,
                  getOrCreateSourceID(DL.getFilename(), DL.getDirectory(),
                                      TheCU->getUniqueID()));
-  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, 0, DL.getLineNumber());
-
-  // Track the start label for this inlined function.
-  //.debug_inlined section specification does not clearly state how
-  // to emit inlined scopes that are split into multiple instruction ranges.
-  // For now, use the first instruction range and emit low_pc/high_pc pair and
-  // corresponding the .debug_inlined section entry for this pair.
-  if (Asm->MAI->doesDwarfUseInlineInfoSection()) {
-    MCSymbol *StartLabel = getLabelBeforeInsn(Ranges.begin()->first);
-    InlineInfoMap::iterator I = InlineInfo.find(InlinedSP);
-
-    if (I == InlineInfo.end()) {
-      InlineInfo[InlinedSP].push_back(std::make_pair(StartLabel, ScopeDIE));
-      InlinedSPNodes.push_back(InlinedSP);
-    } else
-      I->second.push_back(std::make_pair(StartLabel, ScopeDIE));
-  }
+  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, None, DL.getLineNumber());
 
   // Add name to the name table, we do this here because we're guaranteed
   // to have concrete versions of our DW_TAG_inlined_subprogram nodes.
@@ -563,26 +567,16 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
   return ScopeDIE;
 }
 
-// Construct a DIE for this scope.
-DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
-  if (!Scope || !Scope->getScopeNode())
-    return NULL;
-
-  DIScope DS(Scope->getScopeNode());
-  // Early return to avoid creating dangling variable|scope DIEs.
-  if (!Scope->getInlinedAt() && DS.isSubprogram() && Scope->isAbstractScope() &&
-      !TheCU->getDIE(DS))
-    return NULL;
-
-  SmallVector<DIE *, 8> Children;
-  DIE *ObjectPointer = NULL;
+DIE *DwarfDebug::createScopeChildrenDIE(CompileUnit *TheCU, LexicalScope *Scope,
+                                        SmallVectorImpl<DIE*> &Children) {
+    DIE *ObjectPointer = NULL;
 
   // Collect arguments for current function.
   if (LScopes.isCurrentFunctionScope(Scope))
     for (unsigned i = 0, N = CurrentFnArguments.size(); i < N; ++i)
       if (DbgVariable *ArgDV = CurrentFnArguments[i])
         if (DIE *Arg =
-            TheCU->constructVariableDIE(ArgDV, Scope->isAbstractScope())) {
+            TheCU->constructVariableDIE(*ArgDV, Scope->isAbstractScope())) {
           Children.push_back(Arg);
           if (ArgDV->isObjectPointer()) ObjectPointer = Arg;
         }
@@ -591,7 +585,7 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
   const SmallVectorImpl<DbgVariable *> &Variables =ScopeVariables.lookup(Scope);
   for (unsigned i = 0, N = Variables.size(); i < N; ++i)
     if (DIE *Variable =
-        TheCU->constructVariableDIE(Variables[i], Scope->isAbstractScope())) {
+        TheCU->constructVariableDIE(*Variables[i], Scope->isAbstractScope())) {
       Children.push_back(Variable);
       if (Variables[i]->isObjectPointer()) ObjectPointer = Variable;
     }
@@ -599,6 +593,23 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
   for (unsigned j = 0, M = Scopes.size(); j < M; ++j)
     if (DIE *Nested = constructScopeDIE(TheCU, Scopes[j]))
       Children.push_back(Nested);
+  return ObjectPointer;
+}
+
+// Construct a DIE for this scope.
+DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
+  if (!Scope || !Scope->getScopeNode())
+    return NULL;
+
+  DIScope DS(Scope->getScopeNode());
+
+  SmallVector<DIE *, 8> Children;
+  DIE *ObjectPointer = NULL;
+  bool ChildrenCreated = false;
+
+  // We try to create the scope DIE first, then the children DIEs. This will
+  // avoid creating un-used children then removing them later when we find out
+  // the scope DIE is null.
   DIE *ScopeDIE = NULL;
   if (Scope->getInlinedAt())
     ScopeDIE = constructInlinedScopeDIE(TheCU, Scope);
@@ -609,26 +620,41 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
       // Note down abstract DIE.
       if (ScopeDIE)
         AbstractSPDies.insert(std::make_pair(DS, ScopeDIE));
-    }
-    else
-      ScopeDIE = updateSubprogramScopeDIE(TheCU, DS);
-  }
-  else {
+    } else
+      ScopeDIE = updateSubprogramScopeDIE(TheCU, DISubprogram(DS));
+  } else {
+    // Early exit when we know the scope DIE is going to be null.
+    if (isLexicalScopeDIENull(Scope))
+      return NULL;
+
+    // We create children here when we know the scope DIE is not going to be
+    // null and the children will be added to the scope DIE.
+    ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children);
+    ChildrenCreated = true;
+
     // There is no need to emit empty lexical block DIE.
     std::pair<ImportedEntityMap::const_iterator,
               ImportedEntityMap::const_iterator> Range = std::equal_range(
         ScopesWithImportedEntities.begin(), ScopesWithImportedEntities.end(),
         std::pair<const MDNode *, const MDNode *>(DS, (const MDNode*)0),
-        CompareFirst());
+        less_first());
     if (Children.empty() && Range.first == Range.second)
       return NULL;
     ScopeDIE = constructLexicalScopeDIE(TheCU, Scope);
+    assert(ScopeDIE && "Scope DIE should not be null.");
     for (ImportedEntityMap::const_iterator i = Range.first; i != Range.second;
          ++i)
       constructImportedEntityDIE(TheCU, i->second, ScopeDIE);
   }
 
-  if (!ScopeDIE) return NULL;
+  if (!ScopeDIE) {
+    assert(Children.empty() &&
+           "We create children only when the scope DIE is not null.");
+    return NULL;
+  }
+  if (!ChildrenCreated)
+    // We create children when the scope DIE is not null.
+    ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children);
 
   // Add children
   for (SmallVectorImpl<DIE *>::iterator I = Children.begin(),
@@ -636,8 +662,7 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
     ScopeDIE->addChild(*I);
 
   if (DS.isSubprogram() && ObjectPointer != NULL)
-    TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer,
-                       dwarf::DW_FORM_ref4, ObjectPointer);
+    TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer, ObjectPointer);
 
   if (DS.isSubprogram())
     TheCU->addPubTypes(DISubprogram(DS));
@@ -653,8 +678,10 @@ unsigned DwarfDebug::getOrCreateSourceID(StringRef FileName,
                                          StringRef DirName, unsigned CUID) {
   // If we use .loc in assembly, we can't separate .file entries according to
   // compile units. Thus all files will belong to the default compile unit.
-  if (Asm->TM.hasMCUseLoc() &&
-      Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer)
+
+  // FIXME: add a better feature test than hasRawTextSupport. Even better,
+  // extend .file to support this.
+  if (Asm->TM.hasMCUseLoc() && Asm->OutStreamer.hasRawTextSupport())
     CUID = 0;
 
   // If FE did not provide a file name, then assume stdin.
@@ -689,14 +716,12 @@ unsigned DwarfDebug::getOrCreateSourceID(StringRef FileName,
 
 // Create new CompileUnit for the given metadata node with tag
 // DW_TAG_compile_unit.
-CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
-  DICompileUnit DIUnit(N);
+CompileUnit *DwarfDebug::constructCompileUnit(DICompileUnit DIUnit) {
   StringRef FN = DIUnit.getFilename();
   CompilationDir = DIUnit.getDirectory();
 
   DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
-  CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++,
-                                       DIUnit.getLanguage(), Die, N, Asm,
+  CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++, Die, DIUnit, Asm,
                                        this, &InfoHolder);
 
   FileIDCUMap[NewCU->getUniqueID()] = 0;
@@ -723,31 +748,57 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
 
   // Use a single line table if we are using .loc and generating assembly.
   bool UseTheFirstCU =
-    (Asm->TM.hasMCUseLoc() &&
-     Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer) ||
-    (NewCU->getUniqueID() == 0);
+      (Asm->TM.hasMCUseLoc() && Asm->OutStreamer.hasRawTextSupport()) ||
+      (NewCU->getUniqueID() == 0);
 
-  // DW_AT_stmt_list is a offset of line number information for this
-  // compile unit in debug_line section. For split dwarf this is
-  // left in the skeleton CU and so not included.
-  // The line table entries are not always emitted in assembly, so it
-  // is not okay to use line_table_start here.
   if (!useSplitDwarf()) {
+    // DW_AT_stmt_list is a offset of line number information for this
+    // compile unit in debug_line section. For split dwarf this is
+    // left in the skeleton CU and so not included.
+    // The line table entries are not always emitted in assembly, so it
+    // is not okay to use line_table_start here.
     if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
-      NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
-                      UseTheFirstCU ?
-                      Asm->GetTempSymbol("section_line") : LineTableStartSym);
+      NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_sec_offset,
+                      UseTheFirstCU ? Asm->GetTempSymbol("section_line")
+                                    : LineTableStartSym);
     else if (UseTheFirstCU)
       NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
     else
       NewCU->addDelta(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
                       LineTableStartSym, DwarfLineSectionSym);
+
+    // If we're using split dwarf the compilation dir is going to be in the
+    // skeleton CU and so we don't need to duplicate it here.
+    if (!CompilationDir.empty())
+      NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+
+    // Flags to let the linker know we have emitted new style pubnames. Only
+    // emit it here if we don't have a skeleton CU for split dwarf.
+    if (GenerateGnuPubSections) {
+      if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+        NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubnames,
+                        dwarf::DW_FORM_sec_offset,
+                        Asm->GetTempSymbol("gnu_pubnames",
+                                           NewCU->getUniqueID()));
+      else
+        NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubnames, dwarf::DW_FORM_data4,
+                        Asm->GetTempSymbol("gnu_pubnames",
+                                           NewCU->getUniqueID()),
+                        DwarfGnuPubNamesSectionSym);
+
+      if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+        NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubtypes,
+                        dwarf::DW_FORM_sec_offset,
+                        Asm->GetTempSymbol("gnu_pubtypes",
+                                           NewCU->getUniqueID()));
+      else
+        NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubtypes, dwarf::DW_FORM_data4,
+                        Asm->GetTempSymbol("gnu_pubtypes",
+                                           NewCU->getUniqueID()),
+                        DwarfGnuPubTypesSectionSym);
+    }
   }
 
-  // If we're using split dwarf the compilation dir is going to be in the
-  // skeleton CU and so we don't need to duplicate it here.
-  if (!useSplitDwarf() && !CompilationDir.empty())
-    NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
   if (DIUnit.isOptimized())
     NewCU->addFlag(Die, dwarf::DW_AT_APPLE_optimized);
 
@@ -764,13 +815,17 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
 
   InfoHolder.addUnit(NewCU);
 
-  CUMap.insert(std::make_pair(N, NewCU));
+  CUMap.insert(std::make_pair(DIUnit, NewCU));
+  CUDieMap.insert(std::make_pair(Die, NewCU));
   return NewCU;
 }
 
 // Construct subprogram DIE.
-void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU,
-                                        const MDNode *N) {
+void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU, const MDNode *N) {
+  // FIXME: We should only call this routine once, however, during LTO if a
+  // program is defined in multiple CUs we could end up calling it out of
+  // beginModule as we walk the CUs.
+
   CompileUnit *&CURef = SPMap[N];
   if (CURef)
     return;
@@ -784,15 +839,8 @@ void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU,
 
   DIE *SubprogramDie = TheCU->getOrCreateSubprogramDIE(SP);
 
-  // Add to map.
-  TheCU->insertDIE(N, SubprogramDie);
-
-  // Add to context owner.
-  TheCU->addToContextOwner(SubprogramDie, SP.getContext());
-
-  // Expose as global, if requested.
-  if (GenerateDwarfPubNamesSection)
-    TheCU->addGlobalName(SP.getName(), SubprogramDie);
+  // Expose as a global name.
+  TheCU->addGlobalName(SP.getName(), SubprogramDie, resolve(SP.getContext()));
 }
 
 void DwarfDebug::constructImportedEntityDIE(CompileUnit *TheCU,
@@ -833,10 +881,9 @@ void DwarfDebug::constructImportedEntityDIE(CompileUnit *TheCU,
   unsigned FileID = getOrCreateSourceID(Module.getContext().getFilename(),
                                         Module.getContext().getDirectory(),
                                         TheCU->getUniqueID());
-  TheCU->addUInt(IMDie, dwarf::DW_AT_decl_file, 0, FileID);
-  TheCU->addUInt(IMDie, dwarf::DW_AT_decl_line, 0, Module.getLineNumber());
-  TheCU->addDIEEntry(IMDie, dwarf::DW_AT_import, dwarf::DW_FORM_ref4,
-                     EntityDie);
+  TheCU->addUInt(IMDie, dwarf::DW_AT_decl_file, None, FileID);
+  TheCU->addUInt(IMDie, dwarf::DW_AT_decl_line, None, Module.getLineNumber());
+  TheCU->addDIEEntry(IMDie, dwarf::DW_AT_import, EntityDie);
   StringRef Name = Module.getName();
   if (!Name.empty())
     TheCU->addString(IMDie, dwarf::DW_AT_name, Name);
@@ -857,6 +904,7 @@ void DwarfDebug::beginModule() {
   NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
   if (!CU_Nodes)
     return;
+  TypeIdentifierMap = generateDITypeIdentifierMap(CU_Nodes);
 
   // Emit initial sections so we can reference labels later.
   emitSectionLabels();
@@ -870,10 +918,10 @@ void DwarfDebug::beginModule() {
           DIImportedEntity(ImportedEntities.getElement(i)).getContext(),
           ImportedEntities.getElement(i)));
     std::sort(ScopesWithImportedEntities.begin(),
-              ScopesWithImportedEntities.end(), CompareFirst());
+              ScopesWithImportedEntities.end(), less_first());
     DIArray GVs = CUNode.getGlobalVariables();
     for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i)
-      CU->createGlobalVariableDIE(GVs.getElement(i));
+      CU->createGlobalVariableDIE(DIGlobalVariable(GVs.getElement(i)));
     DIArray SPs = CUNode.getSubprograms();
     for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i)
       constructSubprogramDIE(CU, SPs.getElement(i));
@@ -887,22 +935,13 @@ void DwarfDebug::beginModule() {
     // available.
     for (unsigned i = 0, e = ImportedEntities.getNumElements(); i != e; ++i)
       constructImportedEntityDIE(CU, ImportedEntities.getElement(i));
-    // If we're splitting the dwarf out now that we've got the entire
-    // CU then construct a skeleton CU based upon it.
-    if (useSplitDwarf()) {
-      // This should be a unique identifier when we want to build .dwp files.
-      CU->addUInt(CU->getCUDie(), dwarf::DW_AT_GNU_dwo_id,
-                  dwarf::DW_FORM_data8, 0);
-      // Now construct the skeleton CU associated.
-      constructSkeletonCU(CUNode);
-    }
   }
 
   // Tell MMI that we have debug info.
   MMI->setDebugInfoAvailability(true);
 
   // Prime section data.
-  SectionMap.insert(Asm->getObjFileLowering().getTextSection());
+  SectionMap[Asm->getObjFileLowering().getTextSection()];
 }
 
 // Attach DW_AT_inline attribute with inlined subprogram DIEs.
@@ -911,21 +950,20 @@ void DwarfDebug::computeInlinedDIEs() {
   for (SmallPtrSet<DIE *, 4>::iterator AI = InlinedSubprogramDIEs.begin(),
          AE = InlinedSubprogramDIEs.end(); AI != AE; ++AI) {
     DIE *ISP = *AI;
-    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
+    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
   }
   for (DenseMap<const MDNode *, DIE *>::iterator AI = AbstractSPDies.begin(),
          AE = AbstractSPDies.end(); AI != AE; ++AI) {
     DIE *ISP = AI->second;
     if (InlinedSubprogramDIEs.count(ISP))
       continue;
-    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
+    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
   }
 }
 
 // Collect info for variables that were optimized out.
 void DwarfDebug::collectDeadVariables() {
   const Module *M = MMI->getModule();
-  DenseMap<const MDNode *, LexicalScope *> DeadFnScopeMap;
 
   if (NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu")) {
     for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
@@ -933,37 +971,38 @@ void DwarfDebug::collectDeadVariables() {
       DIArray Subprograms = TheCU.getSubprograms();
       for (unsigned i = 0, e = Subprograms.getNumElements(); i != e; ++i) {
         DISubprogram SP(Subprograms.getElement(i));
-        if (ProcessedSPNodes.count(SP) != 0) continue;
-        if (!SP.isSubprogram()) continue;
-        if (!SP.isDefinition()) continue;
+        if (ProcessedSPNodes.count(SP) != 0)
+          continue;
+        if (!SP.isSubprogram())
+          continue;
+        if (!SP.isDefinition())
+          continue;
         DIArray Variables = SP.getVariables();
-        if (Variables.getNumElements() == 0) continue;
-
-        LexicalScope *Scope =
-          new LexicalScope(NULL, DIDescriptor(SP), NULL, false);
-        DeadFnScopeMap[SP] = Scope;
+        if (Variables.getNumElements() == 0)
+          continue;
 
         // Construct subprogram DIE and add variables DIEs.
         CompileUnit *SPCU = CUMap.lookup(TheCU);
         assert(SPCU && "Unable to find Compile Unit!");
+        // FIXME: See the comment in constructSubprogramDIE about duplicate
+        // subprogram DIEs.
         constructSubprogramDIE(SPCU, SP);
-        DIE *ScopeDIE = SPCU->getDIE(SP);
+        DIE *SPDIE = SPCU->getDIE(SP);
         for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
           DIVariable DV(Variables.getElement(vi));
-          if (!DV.isVariable()) continue;
-          DbgVariable NewVar(DV, NULL);
+          if (!DV.isVariable())
+            continue;
+          DbgVariable NewVar(DV, NULL, this);
           if (DIE *VariableDIE =
-              SPCU->constructVariableDIE(&NewVar, Scope->isAbstractScope()))
-            ScopeDIE->addChild(VariableDIE);
+                  SPCU->constructVariableDIE(NewVar, false))
+            SPDIE->addChild(VariableDIE);
         }
       }
     }
   }
-  DeleteContainerSeconds(DeadFnScopeMap);
 }
 
-// Type Signature [7.27] computation code.
-typedef ArrayRef<uint8_t> HashValue;
+// Type Signature [7.27] and ODR Hash code.
 
 /// \brief Grabs the string in whichever attribute is passed in and returns
 /// a reference to it. Returns "" if the attribute doesn't exist.
@@ -976,100 +1015,6 @@ static StringRef getDIEStringAttr(DIE *Die, unsigned Attr) {
   return StringRef("");
 }
 
-/// \brief Adds the string in \p Str to the hash in \p Hash. This also hashes
-/// a trailing NULL with the string.
-static void addStringToHash(MD5 &Hash, StringRef Str) {
-  DEBUG(dbgs() << "Adding string " << Str << " to hash.\n");
-  Hash.update(Str);
-  Hash.update(makeArrayRef((uint8_t)'\0'));
-}
-
-// FIXME: These are copied and only slightly modified out of LEB128.h.
-
-/// \brief Adds the unsigned in \p N to the hash in \p Hash. This also encodes
-/// the unsigned as a ULEB128.
-static void addULEB128ToHash(MD5 &Hash, uint64_t Value) {
-  DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
-  do {
-    uint8_t Byte = Value & 0x7f;
-    Value >>= 7;
-    if (Value != 0)
-      Byte |= 0x80; // Mark this byte to show that more bytes will follow.
-    Hash.update(Byte);
-  } while (Value != 0);
-}
-
-/// \brief Including \p Parent adds the context of Parent to \p Hash.
-static void addParentContextToHash(MD5 &Hash, DIE *Parent) {
-
-  DEBUG(dbgs() << "Adding parent context to hash...\n");
-
-  // [7.27.2] For each surrounding type or namespace beginning with the
-  // outermost such construct...
-  SmallVector<DIE *, 1> Parents;
-  while (Parent->getTag() != dwarf::DW_TAG_compile_unit) {
-    Parents.push_back(Parent);
-    Parent = Parent->getParent();
-  }
-
-  // Reverse iterate over our list to go from the outermost construct to the
-  // innermost.
-  for (SmallVectorImpl<DIE *>::reverse_iterator I = Parents.rbegin(),
-                                                E = Parents.rend();
-       I != E; ++I) {
-    DIE *Die = *I;
-
-    // ... Append the letter "C" to the sequence...
-    addULEB128ToHash(Hash, 'C');
-
-    // ... Followed by the DWARF tag of the construct...
-    addULEB128ToHash(Hash, Die->getTag());
-
-    // ... Then the name, taken from the DW_AT_name attribute.
-    StringRef Name = getDIEStringAttr(Die, dwarf::DW_AT_name);
-    DEBUG(dbgs() << "... adding context: " << Name << "\n");
-    if (!Name.empty())
-      addStringToHash(Hash, Name);
-  }
-}
-
-/// This is based on the type signature computation given in section 7.27 of the
-/// DWARF4 standard. It is the md5 hash of a flattened description of the DIE with
-/// the exception that we are hashing only the context and the name of the type.
-static void addDIEODRSignature(MD5 &Hash, CompileUnit *CU, DIE *Die) {
-
-  // Add the contexts to the hash. We won't be computing the ODR hash for
-  // function local types so it's safe to use the generic context hashing
-  // algorithm here.
-  // FIXME: If we figure out how to account for linkage in some way we could
-  // actually do this with a slight modification to the parent hash algorithm.
-  DIE *Parent = Die->getParent();
-  if (Parent)
-    addParentContextToHash(Hash, Parent);
-
-  // Add the current DIE information.
-
-  // Add the DWARF tag of the DIE.
-  addULEB128ToHash(Hash, Die->getTag());
-
-  // Add the name of the type to the hash.
-  addStringToHash(Hash, getDIEStringAttr(Die, dwarf::DW_AT_name));
-
-  // Now get the result.
-  MD5::MD5Result Result;
-  Hash.final(Result);
-
-  // ... take the least significant 8 bytes and store those as the attribute.
-  // Our MD5 implementation always returns its results in little endian, swap
-  // bytes appropriately.
-  uint64_t Signature = *reinterpret_cast<support::ulittle64_t *>(Result + 8);
-
-  // FIXME: This should be added onto the type unit, not the type, but this
-  // works as an intermediate stage.
-  CU->addUInt(Die, dwarf::DW_AT_GNU_odr_signature, dwarf::DW_FORM_data8,
-              Signature);
-}
-
 /// Return true if the current DIE is contained within an anonymous namespace.
 static bool isContainedInAnonNamespace(DIE *Die) {
   DIE *Parent = Die->getParent();
@@ -1090,7 +1035,7 @@ static bool shouldAddODRHash(CompileUnit *CU, DIE *Die) {
   return CU->getLanguage() == dwarf::DW_LANG_C_plus_plus &&
          getDIEStringAttr(Die, dwarf::DW_AT_name) != "" &&
          !isContainedInAnonNamespace(Die);
- }
+}
 
 void DwarfDebug::finalizeModuleInfo() {
   // Collect info for variables that were optimized out.
@@ -1099,43 +1044,102 @@ void DwarfDebug::finalizeModuleInfo() {
   // Attach DW_AT_inline attribute with inlined subprogram DIEs.
   computeInlinedDIEs();
 
-  // Emit DW_AT_containing_type attribute to connect types with their
-  // vtable holding type.
-  for (DenseMap<const MDNode *, CompileUnit *>::iterator CUI = CUMap.begin(),
-         CUE = CUMap.end(); CUI != CUE; ++CUI) {
-    CompileUnit *TheCU = CUI->second;
-    TheCU->constructContainingTypeDIEs();
-  }
-
   // Split out type units and conditionally add an ODR tag to the split
   // out type.
   // FIXME: Do type splitting.
   for (unsigned i = 0, e = TypeUnits.size(); i != e; ++i) {
-    MD5 Hash;
     DIE *Die = TypeUnits[i];
+    DIEHash Hash;
     // If we've requested ODR hashes and it's applicable for an ODR hash then
     // add the ODR signature now.
+    // FIXME: This should be added onto the type unit, not the type, but this
+    // works as an intermediate stage.
     if (GenerateODRHash && shouldAddODRHash(CUMap.begin()->second, Die))
-      addDIEODRSignature(Hash, CUMap.begin()->second, Die);
+      CUMap.begin()->second->addUInt(Die, dwarf::DW_AT_GNU_odr_signature,
+                                     dwarf::DW_FORM_data8,
+                                     Hash.computeDIEODRSignature(*Die));
   }
 
-   // Compute DIE offsets and sizes.
+  // Handle anything that needs to be done on a per-cu basis.
+  for (DenseMap<const MDNode *, CompileUnit *>::iterator CUI = CUMap.begin(),
+                                                         CUE = CUMap.end();
+       CUI != CUE; ++CUI) {
+    CompileUnit *TheCU = CUI->second;
+    // Emit DW_AT_containing_type attribute to connect types with their
+    // vtable holding type.
+    TheCU->constructContainingTypeDIEs();
+
+    // If we're splitting the dwarf out now that we've got the entire
+    // CU then construct a skeleton CU based upon it.
+    if (useSplitDwarf()) {
+      uint64_t ID = 0;
+      if (GenerateCUHash) {
+        DIEHash CUHash;
+        ID = CUHash.computeCUSignature(*TheCU->getCUDie());
+      }
+      // This should be a unique identifier when we want to build .dwp files.
+      TheCU->addUInt(TheCU->getCUDie(), dwarf::DW_AT_GNU_dwo_id,
+                     dwarf::DW_FORM_data8, ID);
+      // Now construct the skeleton CU associated.
+      CompileUnit *SkCU = constructSkeletonCU(TheCU);
+      // This should be a unique identifier when we want to build .dwp files.
+      SkCU->addUInt(SkCU->getCUDie(), dwarf::DW_AT_GNU_dwo_id,
+                    dwarf::DW_FORM_data8, ID);
+    }
+  }
+
+  // Compute DIE offsets and sizes.
   InfoHolder.computeSizeAndOffsets();
   if (useSplitDwarf())
     SkeletonHolder.computeSizeAndOffsets();
 }
 
 void DwarfDebug::endSections() {
-  // Standard sections final addresses.
-  Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering().getTextSection());
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("text_end"));
-  Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering().getDataSection());
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("data_end"));
+   // Filter labels by section.
+  for (size_t n = 0; n < ArangeLabels.size(); n++) {
+    const SymbolCU &SCU = ArangeLabels[n];
+    if (SCU.Sym->isInSection()) {
+      // Make a note of this symbol and it's section.
+      const MCSection *Section = &SCU.Sym->getSection();
+      if (!Section->getKind().isMetadata())
+        SectionMap[Section].push_back(SCU);
+    } else {
+      // Some symbols (e.g. common/bss on mach-o) can have no section but still
+      // appear in the output. This sucks as we rely on sections to build
+      // arange spans. We can do it without, but it's icky.
+      SectionMap[NULL].push_back(SCU);
+    }
+  }
 
-  // End text sections.
-  for (unsigned I = 0, E = SectionMap.size(); I != E; ++I) {
-    Asm->OutStreamer.SwitchSection(SectionMap[I]);
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_end", I+1));
+  // Build a list of sections used.
+  std::vector<const MCSection *> Sections;
+  for (SectionMapType::iterator it = SectionMap.begin(); it != SectionMap.end();
+       it++) {
+    const MCSection *Section = it->first;
+    Sections.push_back(Section);
+  }
+
+  // Sort the sections into order.
+  // This is only done to ensure consistent output order across different runs.
+  std::sort(Sections.begin(), Sections.end(), SectionSort);
+
+  // Add terminating symbols for each section.
+  for (unsigned ID=0;ID<Sections.size();ID++) {
+    const MCSection *Section = Sections[ID];
+    MCSymbol *Sym = NULL;
+
+    if (Section) {
+      // We can't call MCSection::getLabelEndName, as it's only safe to do so
+      // if we know the section name up-front. For user-created sections, the resulting
+      // label may not be valid to use as a label. (section names can use a greater
+      // set of characters on some systems)
+      Sym = Asm->GetTempSymbol("debug_end", ID);
+      Asm->OutStreamer.SwitchSection(Section);
+      Asm->OutStreamer.EmitLabel(Sym);
+    }
+
+    // Insert a final terminator.
+    SectionMap[Section].push_back(SymbolCU(NULL, Sym));
   }
 }
 
@@ -1152,6 +1156,8 @@ void DwarfDebug::endModule() {
   finalizeModuleInfo();
 
   if (!useSplitDwarf()) {
+    emitDebugStr();
+
     // Emit all the DIEs into a debug info section.
     emitDebugInfo();
 
@@ -1170,15 +1176,12 @@ void DwarfDebug::endModule() {
     // Emit info into a debug macinfo section.
     emitDebugMacInfo();
 
-    // Emit inline info.
-    // TODO: When we don't need the option anymore we
-    // can remove all of the code that this section
-    // depends upon.
-    if (useDarwinGDBCompat())
-      emitDebugInlineInfo();
   } else {
     // TODO: Fill this in for separated debug sections and separate
     // out information into new sections.
+    emitDebugStr();
+    if (useSplitDwarf())
+      emitDebugStrDWO();
 
     // Emit the debug info section and compile units.
     emitDebugInfo();
@@ -1203,12 +1206,6 @@ void DwarfDebug::endModule() {
     // Emit DWO addresses.
     InfoHolder.emitAddresses(Asm->getObjFileLowering().getDwarfAddrSection());
 
-    // Emit inline info.
-    // TODO: When we don't need the option anymore we
-    // can remove all of the code that this section
-    // depends upon.
-    if (useDarwinGDBCompat())
-      emitDebugInlineInfo();
   }
 
   // Emit info into the dwarf accelerator table sections.
@@ -1219,20 +1216,11 @@ void DwarfDebug::endModule() {
     emitAccelTypes();
   }
 
-  // Emit info into a debug pubnames section, if requested.
-  if (GenerateDwarfPubNamesSection)
-    emitDebugPubnames();
-
-  // Emit info into a debug pubtypes section.
-  // TODO: When we don't need the option anymore we can
-  // remove all of the code that adds to the table.
-  if (useDarwinGDBCompat())
-    emitDebugPubTypes();
-
-  // Finally emit string information into a string table.
-  emitDebugStr();
-  if (useSplitDwarf())
-    emitDebugStrDWO();
+  // Emit the pubnames and pubtypes sections if requested.
+  if (HasDwarfPubSections) {
+    emitDebugPubNames(GenerateGnuPubSections);
+    emitDebugPubTypes(GenerateGnuPubSections);
+  }
 
   // clean up.
   SPMap.clear();
@@ -1262,7 +1250,7 @@ DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &DV,
   if (!Scope)
     return NULL;
 
-  AbsDbgVariable = new DbgVariable(Var, NULL);
+  AbsDbgVariable = new DbgVariable(Var, NULL, this);
   addScopeVariable(Scope, AbsDbgVariable);
   AbstractVariables[Var] = AbsDbgVariable;
   return AbsDbgVariable;
@@ -1311,7 +1299,7 @@ DwarfDebug::collectVariableInfoFromMMITable(const MachineFunction *MF,
       continue;
 
     DbgVariable *AbsDbgVariable = findAbstractVariable(DV, VP.second);
-    DbgVariable *RegVar = new DbgVariable(DV, AbsDbgVariable);
+    DbgVariable *RegVar = new DbgVariable(DV, AbsDbgVariable, this);
     RegVar->setFrameIndex(VP.first);
     if (!addCurrentFnArgument(MF, RegVar, Scope))
       addScopeVariable(Scope, RegVar);
@@ -1396,7 +1384,7 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
     Processed.insert(DV);
     assert(MInsn->isDebugValue() && "History must begin with debug value");
     DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc());
-    DbgVariable *RegVar = new DbgVariable(DV, AbsVar);
+    DbgVariable *RegVar = new DbgVariable(DV, AbsVar, this);
     if (!addCurrentFnArgument(MF, RegVar, Scope))
       addScopeVariable(Scope, RegVar);
     if (AbsVar)
@@ -1459,7 +1447,7 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
     if (!DV || !DV.isVariable() || !Processed.insert(DV))
       continue;
     if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext()))
-      addScopeVariable(Scope, new DbgVariable(DV, NULL));
+      addScopeVariable(Scope, new DbgVariable(DV, NULL, this));
   }
 }
 
@@ -1602,36 +1590,45 @@ static DebugLoc getFnDebugLoc(DebugLoc DL, const LLVMContext &Ctx) {
 // Gather pre-function debug information.  Assumes being called immediately
 // after the function entry point has been emitted.
 void DwarfDebug::beginFunction(const MachineFunction *MF) {
-  if (!MMI->hasDebugInfo()) return;
+
+  // If there's no debug info for the function we're not going to do anything.
+  if (!MMI->hasDebugInfo())
+    return;
+
+  // Grab the lexical scopes for the function, if we don't have any of those
+  // then we're not going to be able to do anything.
   LScopes.initialize(*MF);
-  if (LScopes.empty()) return;
+  if (LScopes.empty())
+    return;
+
+  assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned");
+
+  // Make sure that each lexical scope will have a begin/end label.
   identifyScopeMarkers();
 
   // Set DwarfCompileUnitID in MCContext to the Compile Unit this function
-  // belongs to.
+  // belongs to so that we add to the correct per-cu line table in the
+  // non-asm case.
   LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
   CompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
   assert(TheCU && "Unable to find compile unit!");
-  if (Asm->TM.hasMCUseLoc() &&
-      Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer)
+  if (Asm->TM.hasMCUseLoc() && Asm->OutStreamer.hasRawTextSupport())
     // Use a single line table if we are using .loc and generating assembly.
     Asm->OutStreamer.getContext().setDwarfCompileUnitID(0);
   else
     Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID());
 
-  FunctionBeginSym = Asm->GetTempSymbol("func_begin",
-                                        Asm->getFunctionNumber());
+  // Emit a label for the function so that we have a beginning address.
+  FunctionBeginSym = Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber());
   // Assumes in correct section after the entry point.
   Asm->OutStreamer.EmitLabel(FunctionBeginSym);
 
-  assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned");
-
   const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
   // LiveUserVar - Map physreg numbers to the MDNode they contain.
-  std::vector<const MDNode*> LiveUserVar(TRI->getNumRegs());
+  std::vector<const MDNode *> LiveUserVar(TRI->getNumRegs());
 
-  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
-       I != E; ++I) {
+  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E;
+       ++I) {
     bool AtBlockEntry = true;
     for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
          II != IE; ++II) {
@@ -1642,22 +1639,21 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
 
         // Keep track of user variables.
         const MDNode *Var =
-          MI->getOperand(MI->getNumOperands() - 1).getMetadata();
+            MI->getOperand(MI->getNumOperands() - 1).getMetadata();
 
         // Variable is in a register, we need to check for clobbers.
         if (isDbgValueInDefinedReg(MI))
           LiveUserVar[MI->getOperand(0).getReg()] = Var;
 
         // Check the history of this variable.
-        SmallVectorImpl<const MachineInstr*> &History = DbgValues[Var];
+        SmallVectorImpl<const MachineInstr *> &History = DbgValues[Var];
         if (History.empty()) {
           UserVariables.push_back(Var);
           // The first mention of a function argument gets the FunctionBeginSym
           // label, so arguments are visible when breaking at function entry.
           DIVariable DV(Var);
           if (DV.isVariable() && DV.getTag() == dwarf::DW_TAG_arg_variable &&
-              DISubprogram(getDISubprogram(DV.getContext()))
-                .describes(MF->getFunction()))
+              getDISubprogram(DV.getContext()).describes(MF->getFunction()))
             LabelsBeforeInsn[MI] = FunctionBeginSym;
         } else {
           // We have seen this variable before. Try to coalesce DBG_VALUEs.
@@ -1667,8 +1663,8 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
             if (History.size() >= 2 &&
                 Prev->isIdenticalTo(History[History.size() - 2])) {
               DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
-                    << "\t" << *Prev
-                    << "\t" << *History[History.size() - 2] << "\n");
+                           << "\t" << *Prev << "\t"
+                           << *History[History.size() - 2] << "\n");
               History.pop_back();
             }
 
@@ -1679,11 +1675,11 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
               // Previous register assignment needs to terminate at the end of
               // its basic block.
               MachineBasicBlock::const_iterator LastMI =
-                PrevMBB->getLastNonDebugInstr();
+                  PrevMBB->getLastNonDebugInstr();
               if (LastMI == PrevMBB->end()) {
                 // Drop DBG_VALUE for empty range.
                 DEBUG(dbgs() << "Dropping DBG_VALUE for empty range:\n"
-                      << "\t" << *Prev << "\n");
+                             << "\t" << *Prev << "\n");
                 History.pop_back();
               } else if (llvm::next(PrevMBB) != PrevMBB->getParent()->end())
                 // Terminate after LastMI.
@@ -1705,11 +1701,12 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
 
         // Check if the instruction clobbers any registers with debug vars.
         for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(),
-               MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+                                              MOE = MI->operands_end();
+             MOI != MOE; ++MOI) {
           if (!MOI->isReg() || !MOI->isDef() || !MOI->getReg())
             continue;
-          for (MCRegAliasIterator AI(MOI->getReg(), TRI, true);
-               AI.isValid(); ++AI) {
+          for (MCRegAliasIterator AI(MOI->getReg(), TRI, true); AI.isValid();
+               ++AI) {
             unsigned Reg = *AI;
             const MDNode *Var = LiveUserVar[Reg];
             if (!Var)
@@ -1721,7 +1718,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
             DbgValueHistoryMap::iterator HistI = DbgValues.find(Var);
             if (HistI == DbgValues.end())
               continue;
-            SmallVectorImpl<const MachineInstr*> &History = HistI->second;
+            SmallVectorImpl<const MachineInstr *> &History = HistI->second;
             if (History.empty())
               continue;
             const MachineInstr *Prev = History.back();
@@ -1743,7 +1740,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
 
   for (DbgValueHistoryMap::iterator I = DbgValues.begin(), E = DbgValues.end();
        I != E; ++I) {
-    SmallVectorImpl<const MachineInstr*> &History = I->second;
+    SmallVectorImpl<const MachineInstr *> &History = I->second;
     if (History.empty())
       continue;
 
@@ -1752,7 +1749,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
     if (Prev->isDebugValue() && isDbgValueInDefinedReg(Prev)) {
       const MachineBasicBlock *PrevMBB = Prev->getParent();
       MachineBasicBlock::const_iterator LastMI =
-        PrevMBB->getLastNonDebugInstr();
+          PrevMBB->getLastNonDebugInstr();
       if (LastMI == PrevMBB->end())
         // Drop DBG_VALUE for empty range.
         History.pop_back();
@@ -1776,13 +1773,14 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
 
   // Record beginning of function.
   if (!PrologEndLoc.isUnknown()) {
-    DebugLoc FnStartDL = getFnDebugLoc(PrologEndLoc,
-                                       MF->getFunction()->getContext());
-    recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
-                     FnStartDL.getScope(MF->getFunction()->getContext()),
-    // We'd like to list the prologue as "not statements" but GDB behaves
-    // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
-                     DWARF2_FLAG_IS_STMT);
+    DebugLoc FnStartDL =
+        getFnDebugLoc(PrologEndLoc, MF->getFunction()->getContext());
+    recordSourceLine(
+        FnStartDL.getLine(), FnStartDL.getCol(),
+        FnStartDL.getScope(MF->getFunction()->getContext()),
+        // We'd like to list the prologue as "not statements" but GDB behaves
+        // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
+        DWARF2_FLAG_IS_STMT);
   }
 }
 
@@ -1855,7 +1853,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
         if (AbstractVariables.lookup(CleanDV))
           continue;
         if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext()))
-          addScopeVariable(Scope, new DbgVariable(DV, NULL));
+          addScopeVariable(Scope, new DbgVariable(DV, NULL, this));
       }
     }
     if (ProcessedSPNodes.count(AScope->getScopeNode()) == 0)
@@ -1924,7 +1922,8 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
 // Emit Methods
 //===----------------------------------------------------------------------===//
 
-// Compute the size and offset of a DIE.
+// Compute the size and offset of a DIE. The offset is relative to start of the
+// CU. It returns the offset after laying out the DIE.
 unsigned
 DwarfUnits::computeSizeAndOffset(DIE *Die, unsigned Offset) {
   // Get the children.
@@ -1935,7 +1934,7 @@ DwarfUnits::computeSizeAndOffset(DIE *Die, unsigned Offset) {
 
   // Get the abbreviation for this DIE.
   unsigned AbbrevNumber = Die->getAbbrevNumber();
-  const DIEAbbrev *Abbrev = Abbreviations->at(AbbrevNumber - 1);
+  const DIEAbbrev *Abbrev = Abbreviations[AbbrevNumber - 1];
 
   // Set DIE offset
   Die->setOffset(Offset);
@@ -1967,19 +1966,23 @@ DwarfUnits::computeSizeAndOffset(DIE *Die, unsigned Offset) {
   return Offset;
 }
 
-// Compute the size and offset of all the DIEs.
+// Compute the size and offset for each DIE.
 void DwarfUnits::computeSizeAndOffsets() {
-  // Offset from the beginning of debug info section.
+  // Offset from the first CU in the debug info section is 0 initially.
   unsigned SecOffset = 0;
+
+  // Iterate over each compile unit and set the size and offsets for each
+  // DIE within each compile unit. All offsets are CU relative.
   for (SmallVectorImpl<CompileUnit *>::iterator I = CUs.begin(),
          E = CUs.end(); I != E; ++I) {
     (*I)->setDebugInfoOffset(SecOffset);
-    unsigned Offset =
-      sizeof(int32_t) + // Length of Compilation Unit Info
-      sizeof(int16_t) + // DWARF version number
-      sizeof(int32_t) + // Offset Into Abbrev. Section
-      sizeof(int8_t);   // Pointer Size (in bytes)
 
+    // CU-relative offset is reset to 0 here.
+    unsigned Offset = sizeof(int32_t) + // Length of Unit Info
+                      (*I)->getHeaderSize(); // Unit-specific headers
+
+    // EndOffset here is CU-relative, after laying out
+    // all of the CU DIE.
     unsigned EndOffset = computeSizeAndOffset((*I)->getCUDie(), Offset);
     SecOffset += EndOffset;
   }
@@ -2006,9 +2009,16 @@ void DwarfDebug::emitSectionLabels() {
   DwarfLineSectionSym =
     emitSectionSym(Asm, TLOF.getDwarfLineSection(), "section_line");
   emitSectionSym(Asm, TLOF.getDwarfLocSection());
-  if (GenerateDwarfPubNamesSection)
+  if (GenerateGnuPubSections) {
+    DwarfGnuPubNamesSectionSym =
+        emitSectionSym(Asm, TLOF.getDwarfGnuPubNamesSection());
+    DwarfGnuPubTypesSectionSym =
+        emitSectionSym(Asm, TLOF.getDwarfGnuPubTypesSection());
+  } else if (HasDwarfPubSections) {
     emitSectionSym(Asm, TLOF.getDwarfPubNamesSection());
-  emitSectionSym(Asm, TLOF.getDwarfPubTypesSection());
+    emitSectionSym(Asm, TLOF.getDwarfPubTypesSection());
+  }
+
   DwarfStrSectionSym =
     emitSectionSym(Asm, TLOF.getDwarfStrSection(), "info_string");
   if (useSplitDwarf()) {
@@ -2028,10 +2038,10 @@ void DwarfDebug::emitSectionLabels() {
 }
 
 // Recursively emits a debug information entry.
-void DwarfDebug::emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs) {
+void DwarfDebug::emitDIE(DIE *Die, ArrayRef<DIEAbbrev *> Abbrevs) {
   // Get the abbreviation for this DIE.
   unsigned AbbrevNumber = Die->getAbbrevNumber();
-  const DIEAbbrev *Abbrev = Abbrevs->at(AbbrevNumber - 1);
+  const DIEAbbrev *Abbrev = Abbrevs[AbbrevNumber - 1];
 
   // Emit the code (index) for the abbreviation.
   if (Asm->isVerbose())
@@ -2046,27 +2056,44 @@ void DwarfDebug::emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs) {
 
   // Emit the DIE attribute values.
   for (unsigned i = 0, N = Values.size(); i < N; ++i) {
-    unsigned Attr = AbbrevData[i].getAttribute();
-    unsigned Form = AbbrevData[i].getForm();
+    dwarf::Attribute Attr = AbbrevData[i].getAttribute();
+    dwarf::Form Form = AbbrevData[i].getForm();
     assert(Form && "Too many attributes for DIE (check abbreviation)");
 
     if (Asm->isVerbose())
       Asm->OutStreamer.AddComment(dwarf::AttributeString(Attr));
 
     switch (Attr) {
-    case dwarf::DW_AT_abstract_origin: {
+    case dwarf::DW_AT_abstract_origin:
+    case dwarf::DW_AT_type:
+    case dwarf::DW_AT_friend:
+    case dwarf::DW_AT_specification:
+    case dwarf::DW_AT_import:
+    case dwarf::DW_AT_containing_type: {
       DIEEntry *E = cast<DIEEntry>(Values[i]);
       DIE *Origin = E->getEntry();
       unsigned Addr = Origin->getOffset();
       if (Form == dwarf::DW_FORM_ref_addr) {
+        assert(!useSplitDwarf() && "TODO: dwo files can't have relocations.");
         // For DW_FORM_ref_addr, output the offset from beginning of debug info
         // section. Origin->getOffset() returns the offset from start of the
         // compile unit.
-        DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
-        Addr += Holder.getCUOffset(Origin->getCompileUnit());
+        CompileUnit *CU = CUDieMap.lookup(Origin->getCompileUnit());
+        assert(CU && "CUDie should belong to a CU.");
+        Addr += CU->getDebugInfoOffset();
+        if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+          Asm->EmitLabelPlusOffset(DwarfInfoSectionSym, Addr,
+                                   DIEEntry::getRefAddrSize(Asm));
+        else
+          Asm->EmitLabelOffsetDifference(DwarfInfoSectionSym, Addr,
+                                         DwarfInfoSectionSym,
+                                         DIEEntry::getRefAddrSize(Asm));
+      } else {
+        // Make sure Origin belong to the same CU.
+        assert(Die->getCompileUnit() == Origin->getCompileUnit() &&
+               "The referenced DIE should belong to the same CU in ref4");
+        Asm->EmitInt32(Addr);
       }
-      Asm->OutStreamer.EmitIntValue(Addr,
-          Form == dwarf::DW_FORM_ref_addr ? DIEEntry::getRefAddrSize(Asm) : 4);
       break;
     }
     case dwarf::DW_AT_ranges: {
@@ -2088,7 +2115,7 @@ void DwarfDebug::emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs) {
     case dwarf::DW_AT_location: {
       if (DIELabel *L = dyn_cast<DIELabel>(Values[i])) {
         if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
-          Asm->EmitLabelReference(L->getValue(), 4);
+          Asm->EmitSectionOffset(L->getValue(), DwarfDebugLocSectionSym);
         else
           Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4);
       } else {
@@ -2142,20 +2169,10 @@ void DwarfUnits::emitUnits(DwarfDebug *DD,
                                     TheCU->getUniqueID()));
 
     // Emit size of content not including length itself
-    unsigned ContentSize = Die->getSize() +
-      sizeof(int16_t) + // DWARF version number
-      sizeof(int32_t) + // Offset Into Abbrev. Section
-      sizeof(int8_t);   // Pointer Size (in bytes)
+    Asm->OutStreamer.AddComment("Length of Unit");
+    Asm->EmitInt32(TheCU->getHeaderSize() + Die->getSize());
 
-    Asm->OutStreamer.AddComment("Length of Compilation Unit Info");
-    Asm->EmitInt32(ContentSize);
-    Asm->OutStreamer.AddComment("DWARF version number");
-    Asm->EmitInt16(DD->getDwarfVersion());
-    Asm->OutStreamer.AddComment("Offset Into Abbrev. Section");
-    Asm->EmitSectionOffset(Asm->GetTempSymbol(ASection->getLabelBeginName()),
-                           ASectionSym);
-    Asm->OutStreamer.AddComment("Address Size (in bytes)");
-    Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+    TheCU->emitHeader(ASection, ASectionSym);
 
     DD->emitDIE(Die, Abbreviations);
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol(USection->getLabelEndName(),
@@ -2163,19 +2180,6 @@ void DwarfUnits::emitUnits(DwarfDebug *DD,
   }
 }
 
-/// For a given compile unit DIE, returns offset from beginning of debug info.
-unsigned DwarfUnits::getCUOffset(DIE *Die) {
-  assert(Die->getTag() == dwarf::DW_TAG_compile_unit  &&
-         "Input DIE should be compile unit in getCUOffset.");
-  for (SmallVectorImpl<CompileUnit *>::iterator I = CUs.begin(),
-       E = CUs.end(); I != E; ++I) {
-    CompileUnit *TheCU = *I;
-    if (TheCU->getCUDie() == Die)
-      return TheCU->getDebugInfoOffset();
-  }
-  llvm_unreachable("The compile unit DIE should belong to CUs in DwarfUnits.");
-}
-
 // Emit the debug info section.
 void DwarfDebug::emitDebugInfo() {
   DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
@@ -2249,7 +2253,7 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) {
 
 // Emit visible names into a hashed accelerator table section.
 void DwarfDebug::emitAccelNames() {
-  DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+  DwarfAccelTable AT(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
                                            dwarf::DW_FORM_data4));
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
          E = CUMap.end(); I != E; ++I) {
@@ -2278,7 +2282,7 @@ void DwarfDebug::emitAccelNames() {
 // Emit objective C classes and categories into a hashed accelerator table
 // section.
 void DwarfDebug::emitAccelObjC() {
-  DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+  DwarfAccelTable AT(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
                                            dwarf::DW_FORM_data4));
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
          E = CUMap.end(); I != E; ++I) {
@@ -2306,7 +2310,7 @@ void DwarfDebug::emitAccelObjC() {
 
 // Emit namespace dies into a hashed accelerator table.
 void DwarfDebug::emitAccelNamespaces() {
-  DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+  DwarfAccelTable AT(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
                                            dwarf::DW_FORM_data4));
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
          E = CUMap.end(); I != E; ++I) {
@@ -2335,11 +2339,11 @@ void DwarfDebug::emitAccelNamespaces() {
 // Emit type dies into a hashed accelerator table.
 void DwarfDebug::emitAccelTypes() {
   std::vector<DwarfAccelTable::Atom> Atoms;
-  Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+  Atoms.push_back(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
                                         dwarf::DW_FORM_data4));
-  Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeTag,
+  Atoms.push_back(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_tag,
                                         dwarf::DW_FORM_data2));
-  Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeTypeFlags,
+  Atoms.push_back(DwarfAccelTable::Atom(dwarf::DW_ATOM_type_flags,
                                         dwarf::DW_FORM_data1));
   DwarfAccelTable AT(Atoms);
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
@@ -2367,23 +2371,85 @@ void DwarfDebug::emitAccelTypes() {
   AT.Emit(Asm, SectionBegin, &InfoHolder);
 }
 
-/// emitDebugPubnames - Emit visible names into a debug pubnames section.
+// Public name handling.
+// The format for the various pubnames:
+//
+// dwarf pubnames - offset/name pairs where the offset is the offset into the CU
+// for the DIE that is named.
+//
+// gnu pubnames - offset/index value/name tuples where the offset is the offset
+// into the CU and the index value is computed according to the type of value
+// for the DIE that is named.
+//
+// For type units the offset is the offset of the skeleton DIE. For split dwarf
+// it's the offset within the debug_info/debug_types dwo section, however, the
+// reference in the pubname header doesn't change.
+
+/// computeIndexValue - Compute the gdb index value for the DIE and CU.
+static dwarf::PubIndexEntryDescriptor computeIndexValue(CompileUnit *CU,
+                                                        DIE *Die) {
+  dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC;
+
+  // We could have a specification DIE that has our most of our knowledge,
+  // look for that now.
+  DIEValue *SpecVal = Die->findAttribute(dwarf::DW_AT_specification);
+  if (SpecVal) {
+    DIE *SpecDIE = cast<DIEEntry>(SpecVal)->getEntry();
+    if (SpecDIE->findAttribute(dwarf::DW_AT_external))
+      Linkage = dwarf::GIEL_EXTERNAL;
+  } else if (Die->findAttribute(dwarf::DW_AT_external))
+    Linkage = dwarf::GIEL_EXTERNAL;
+
+  switch (Die->getTag()) {
+  case dwarf::DW_TAG_class_type:
+  case dwarf::DW_TAG_structure_type:
+  case dwarf::DW_TAG_union_type:
+  case dwarf::DW_TAG_enumeration_type:
+    return dwarf::PubIndexEntryDescriptor(
+        dwarf::GIEK_TYPE, CU->getLanguage() != dwarf::DW_LANG_C_plus_plus
+                              ? dwarf::GIEL_STATIC
+                              : dwarf::GIEL_EXTERNAL);
+  case dwarf::DW_TAG_typedef:
+  case dwarf::DW_TAG_base_type:
+  case dwarf::DW_TAG_subrange_type:
+    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE, dwarf::GIEL_STATIC);
+  case dwarf::DW_TAG_namespace:
+    return dwarf::GIEK_TYPE;
+  case dwarf::DW_TAG_subprogram:
+    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_FUNCTION, Linkage);
+  case dwarf::DW_TAG_constant:
+  case dwarf::DW_TAG_variable:
+    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE, Linkage);
+  case dwarf::DW_TAG_enumerator:
+    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE,
+                                          dwarf::GIEL_STATIC);
+  default:
+    return dwarf::GIEK_NONE;
+  }
+}
+
+/// emitDebugPubNames - Emit visible names into a debug pubnames section.
 ///
-void DwarfDebug::emitDebugPubnames() {
+void DwarfDebug::emitDebugPubNames(bool GnuStyle) {
   const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+  const MCSection *PSec =
+      GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubNamesSection()
+               : Asm->getObjFileLowering().getDwarfPubNamesSection();
 
   typedef DenseMap<const MDNode*, CompileUnit*> CUMapType;
   for (CUMapType::iterator I = CUMap.begin(), E = CUMap.end(); I != E; ++I) {
     CompileUnit *TheCU = I->second;
     unsigned ID = TheCU->getUniqueID();
 
-    if (TheCU->getGlobalNames().empty())
-      continue;
-
     // Start the dwarf pubnames section.
-    Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getDwarfPubNamesSection());
+    Asm->OutStreamer.SwitchSection(PSec);
+
+    // Emit a label so we can reference the beginning of this pubname section.
+    if (GnuStyle)
+      Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("gnu_pubnames",
+                                                    TheCU->getUniqueID()));
 
+    // Emit the header.
     Asm->OutStreamer.AddComment("Length of Public Names Info");
     Asm->EmitLabelDifference(Asm->GetTempSymbol("pubnames_end", ID),
                              Asm->GetTempSymbol("pubnames_begin", ID), 4);
@@ -2391,7 +2457,7 @@ void DwarfDebug::emitDebugPubnames() {
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubnames_begin", ID));
 
     Asm->OutStreamer.AddComment("DWARF Version");
-    Asm->EmitInt16(DwarfVersion);
+    Asm->EmitInt16(dwarf::DW_PUBNAMES_VERSION);
 
     Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
     Asm->EmitSectionOffset(Asm->GetTempSymbol(ISec->getLabelBeginName(), ID),
@@ -2402,15 +2468,24 @@ void DwarfDebug::emitDebugPubnames() {
                              Asm->GetTempSymbol(ISec->getLabelBeginName(), ID),
                              4);
 
+    // Emit the pubnames for this compilation unit.
     const StringMap<DIE*> &Globals = TheCU->getGlobalNames();
     for (StringMap<DIE*>::const_iterator
            GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
       const char *Name = GI->getKeyData();
-      const DIE *Entity = GI->second;
+      DIE *Entity = GI->second;
 
       Asm->OutStreamer.AddComment("DIE offset");
       Asm->EmitInt32(Entity->getOffset());
 
+      if (GnuStyle) {
+        dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheCU, Entity);
+        Asm->OutStreamer.AddComment(
+            Twine("Kind: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + ", " +
+            dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
+        Asm->EmitInt8(Desc.toBits());
+      }
+
       if (Asm->isVerbose())
         Asm->OutStreamer.AddComment("External Name");
       Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength()+1));
@@ -2422,55 +2497,78 @@ void DwarfDebug::emitDebugPubnames() {
   }
 }
 
-void DwarfDebug::emitDebugPubTypes() {
+void DwarfDebug::emitDebugPubTypes(bool GnuStyle) {
+  const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+  const MCSection *PSec =
+      GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubTypesSection()
+               : Asm->getObjFileLowering().getDwarfPubTypesSection();
+
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
-         E = CUMap.end(); I != E; ++I) {
+                                                         E = CUMap.end();
+       I != E; ++I) {
     CompileUnit *TheCU = I->second;
     // Start the dwarf pubtypes section.
-    Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getDwarfPubTypesSection());
+    Asm->OutStreamer.SwitchSection(PSec);
+
+    // Emit a label so we can reference the beginning of this pubtype section.
+    if (GnuStyle)
+      Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("gnu_pubtypes",
+                                                    TheCU->getUniqueID()));
+
+    // Emit the header.
     Asm->OutStreamer.AddComment("Length of Public Types Info");
     Asm->EmitLabelDifference(
-      Asm->GetTempSymbol("pubtypes_end", TheCU->getUniqueID()),
-      Asm->GetTempSymbol("pubtypes_begin", TheCU->getUniqueID()), 4);
+        Asm->GetTempSymbol("pubtypes_end", TheCU->getUniqueID()),
+        Asm->GetTempSymbol("pubtypes_begin", TheCU->getUniqueID()), 4);
 
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubtypes_begin",
-                                                  TheCU->getUniqueID()));
+    Asm->OutStreamer.EmitLabel(
+        Asm->GetTempSymbol("pubtypes_begin", TheCU->getUniqueID()));
 
-    if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DWARF Version");
-    Asm->EmitInt16(DwarfVersion);
+    if (Asm->isVerbose())
+      Asm->OutStreamer.AddComment("DWARF Version");
+    Asm->EmitInt16(dwarf::DW_PUBTYPES_VERSION);
 
     Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
-    const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
-    Asm->EmitSectionOffset(Asm->GetTempSymbol(ISec->getLabelBeginName(),
-                                              TheCU->getUniqueID()),
-                           DwarfInfoSectionSym);
+    Asm->EmitSectionOffset(
+        Asm->GetTempSymbol(ISec->getLabelBeginName(), TheCU->getUniqueID()),
+        DwarfInfoSectionSym);
 
     Asm->OutStreamer.AddComment("Compilation Unit Length");
-    Asm->EmitLabelDifference(Asm->GetTempSymbol(ISec->getLabelEndName(),
-                                                TheCU->getUniqueID()),
-                             Asm->GetTempSymbol(ISec->getLabelBeginName(),
-                                                TheCU->getUniqueID()),
-                             4);
-
-    const StringMap<DIE*> &Globals = TheCU->getGlobalTypes();
-    for (StringMap<DIE*>::const_iterator
-           GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
+    Asm->EmitLabelDifference(
+        Asm->GetTempSymbol(ISec->getLabelEndName(), TheCU->getUniqueID()),
+        Asm->GetTempSymbol(ISec->getLabelBeginName(), TheCU->getUniqueID()), 4);
+
+    // Emit the pubtypes.
+    const StringMap<DIE *> &Globals = TheCU->getGlobalTypes();
+    for (StringMap<DIE *>::const_iterator GI = Globals.begin(),
+                                          GE = Globals.end();
+         GI != GE; ++GI) {
       const char *Name = GI->getKeyData();
       DIE *Entity = GI->second;
 
-      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DIE offset");
+      if (Asm->isVerbose())
+        Asm->OutStreamer.AddComment("DIE offset");
       Asm->EmitInt32(Entity->getOffset());
 
-      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("External Name");
+      if (GnuStyle) {
+        dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheCU, Entity);
+        Asm->OutStreamer.AddComment(
+            Twine("Kind: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + ", " +
+            dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
+        Asm->EmitInt8(Desc.toBits());
+      }
+
+      if (Asm->isVerbose())
+        Asm->OutStreamer.AddComment("External Name");
+
       // Emit the name with a terminating null byte.
-      Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength()+1));
+      Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength() + 1));
     }
 
     Asm->OutStreamer.AddComment("End Mark");
     Asm->EmitInt32(0);
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubtypes_end",
-                                                  TheCU->getUniqueID()));
+    Asm->OutStreamer.EmitLabel(
+        Asm->GetTempSymbol("pubtypes_end", TheCU->getUniqueID()));
   }
 }
 
@@ -2649,18 +2747,178 @@ void DwarfDebug::emitDebugLoc() {
   }
 }
 
-// Emit visible names into a debug aranges section.
+struct SymbolCUSorter {
+  SymbolCUSorter(const MCStreamer &s) : Streamer(s) {}
+  const MCStreamer &Streamer;
+
+  bool operator() (const SymbolCU &A, const SymbolCU &B) {
+    unsigned IA = A.Sym ? Streamer.GetSymbolOrder(A.Sym) : 0;
+    unsigned IB = B.Sym ? Streamer.GetSymbolOrder(B.Sym) : 0;
+
+    // Symbols with no order assigned should be placed at the end.
+    // (e.g. section end labels)
+    if (IA == 0)
+      IA = (unsigned)(-1);
+    if (IB == 0)
+      IB = (unsigned)(-1);
+    return IA < IB;
+  }
+};
+
+static bool CUSort(const CompileUnit *A, const CompileUnit *B) {
+    return (A->getUniqueID() < B->getUniqueID());
+}
+
+struct ArangeSpan {
+  const MCSymbol *Start, *End;
+};
+
+// Emit a debug aranges section, containing a CU lookup for any
+// address we can tie back to a CU.
 void DwarfDebug::emitDebugARanges() {
   // Start the dwarf aranges section.
-  Asm->OutStreamer.SwitchSection(
-                          Asm->getObjFileLowering().getDwarfARangesSection());
+  Asm->OutStreamer
+      .SwitchSection(Asm->getObjFileLowering().getDwarfARangesSection());
+
+  typedef DenseMap<CompileUnit *, std::vector<ArangeSpan> > SpansType;
+
+  SpansType Spans;
+
+  // Build a list of sections used.
+  std::vector<const MCSection *> Sections;
+  for (SectionMapType::iterator it = SectionMap.begin(); it != SectionMap.end();
+       it++) {
+    const MCSection *Section = it->first;
+    Sections.push_back(Section);
+  }
+
+  // Sort the sections into order.
+  // This is only done to ensure consistent output order across different runs.
+  std::sort(Sections.begin(), Sections.end(), SectionSort);
+
+  // Build a set of address spans, sorted by CU.
+  for (size_t SecIdx=0;SecIdx<Sections.size();SecIdx++) {
+    const MCSection *Section = Sections[SecIdx];
+    SmallVector<SymbolCU, 8> &List = SectionMap[Section];
+    if (List.size() < 2)
+      continue;
+
+    // Sort the symbols by offset within the section.
+    SymbolCUSorter sorter(Asm->OutStreamer);
+    std::sort(List.begin(), List.end(), sorter);
+
+    // If we have no section (e.g. common), just write out
+    // individual spans for each symbol.
+    if (Section == NULL) {
+      for (size_t n = 0; n < List.size(); n++) {
+        const SymbolCU &Cur = List[n];
+
+        ArangeSpan Span;
+        Span.Start = Cur.Sym;
+        Span.End = NULL;
+        if (Cur.CU)
+          Spans[Cur.CU].push_back(Span);
+      }
+    } else {
+      // Build spans between each label.
+      const MCSymbol *StartSym = List[0].Sym;
+      for (size_t n = 1; n < List.size(); n++) {
+        const SymbolCU &Prev = List[n - 1];
+        const SymbolCU &Cur = List[n];
+
+        // Try and build the longest span we can within the same CU.
+        if (Cur.CU != Prev.CU) {
+          ArangeSpan Span;
+          Span.Start = StartSym;
+          Span.End = Cur.Sym;
+          Spans[Prev.CU].push_back(Span);
+          StartSym = Cur.Sym;
+        }
+      }
+    }
+  }
+
+  const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+  unsigned PtrSize = Asm->getDataLayout().getPointerSize();
+
+  // Build a list of CUs used.
+  std::vector<CompileUnit *> CUs;
+  for (SpansType::iterator it = Spans.begin(); it != Spans.end(); it++) {
+    CompileUnit *CU = it->first;
+    CUs.push_back(CU);
+  }
+
+  // Sort the CU list (again, to ensure consistent output order).
+  std::sort(CUs.begin(), CUs.end(), CUSort);
+
+  // Emit an arange table for each CU we used.
+  for (size_t CUIdx=0;CUIdx<CUs.size();CUIdx++) {
+    CompileUnit *CU = CUs[CUIdx];
+    std::vector<ArangeSpan> &List = Spans[CU];
+
+    // Emit size of content not including length itself.
+    unsigned ContentSize
+        = sizeof(int16_t) // DWARF ARange version number
+        + sizeof(int32_t) // Offset of CU in the .debug_info section
+        + sizeof(int8_t)  // Pointer Size (in bytes)
+        + sizeof(int8_t); // Segment Size (in bytes)
+
+    unsigned TupleSize = PtrSize * 2;
+
+    // 7.20 in the Dwarf specs requires the table to be aligned to a tuple.
+    unsigned Padding = 0;
+    while (((sizeof(int32_t) + ContentSize + Padding) % TupleSize) != 0)
+      Padding++;
+
+    ContentSize += Padding;
+    ContentSize += (List.size() + 1) * TupleSize;
+
+    // For each compile unit, write the list of spans it covers.
+    Asm->OutStreamer.AddComment("Length of ARange Set");
+    Asm->EmitInt32(ContentSize);
+    Asm->OutStreamer.AddComment("DWARF Arange version number");
+    Asm->EmitInt16(dwarf::DW_ARANGES_VERSION);
+    Asm->OutStreamer.AddComment("Offset Into Debug Info Section");
+    Asm->EmitSectionOffset(
+        Asm->GetTempSymbol(ISec->getLabelBeginName(), CU->getUniqueID()),
+        DwarfInfoSectionSym);
+    Asm->OutStreamer.AddComment("Address Size (in bytes)");
+    Asm->EmitInt8(PtrSize);
+    Asm->OutStreamer.AddComment("Segment Size (in bytes)");
+    Asm->EmitInt8(0);
+
+    for (unsigned n = 0; n < Padding; n++)
+      Asm->EmitInt8(0xff);
+
+    for (unsigned n = 0; n < List.size(); n++) {
+      const ArangeSpan &Span = List[n];
+      Asm->EmitLabelReference(Span.Start, PtrSize);
+
+      // Calculate the size as being from the span start to it's end.
+      if (Span.End) {
+        Asm->EmitLabelDifference(Span.End, Span.Start, PtrSize);
+      } else {
+        // For symbols without an end marker (e.g. common), we
+        // write a single arange entry containing just that one symbol.
+        uint64_t Size = SymSize[Span.Start];
+        if (Size == 0)
+          Size = 1;
+
+        Asm->OutStreamer.EmitIntValue(Size, PtrSize);
+      }
+    }
+
+    Asm->OutStreamer.AddComment("ARange terminator");
+    Asm->OutStreamer.EmitIntValue(0, PtrSize);
+    Asm->OutStreamer.EmitIntValue(0, PtrSize);
+  }
 }
 
 // Emit visible names into a debug ranges section.
 void DwarfDebug::emitDebugRanges() {
   // Start the dwarf ranges section.
-  Asm->OutStreamer.SwitchSection(
-    Asm->getObjFileLowering().getDwarfRangesSection());
+  Asm->OutStreamer
+      .SwitchSection(Asm->getObjFileLowering().getDwarfRangesSection());
   unsigned char Size = Asm->getDataLayout().getPointerSize();
   for (SmallVectorImpl<const MCSymbol *>::iterator
          I = DebugRangeSymbols.begin(), E = DebugRangeSymbols.end();
@@ -2681,103 +2939,19 @@ void DwarfDebug::emitDebugMacInfo() {
   }
 }
 
-// Emit inline info using following format.
-// Section Header:
-// 1. length of section
-// 2. Dwarf version number
-// 3. address size.
-//
-// Entries (one "entry" for each function that was inlined):
-//
-// 1. offset into __debug_str section for MIPS linkage name, if exists;
-//   otherwise offset into __debug_str for regular function name.
-// 2. offset into __debug_str section for regular function name.
-// 3. an unsigned LEB128 number indicating the number of distinct inlining
-// instances for the function.
-//
-// The rest of the entry consists of a {die_offset, low_pc} pair for each
-// inlined instance; the die_offset points to the inlined_subroutine die in the
-// __debug_info section, and the low_pc is the starting address for the
-// inlining instance.
-void DwarfDebug::emitDebugInlineInfo() {
-  if (!Asm->MAI->doesDwarfUseInlineInfoSection())
-    return;
-
-  if (!FirstCU)
-    return;
-
-  Asm->OutStreamer.SwitchSection(
-                        Asm->getObjFileLowering().getDwarfDebugInlineSection());
-
-  Asm->OutStreamer.AddComment("Length of Debug Inlined Information Entry");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("debug_inlined_end", 1),
-                           Asm->GetTempSymbol("debug_inlined_begin", 1), 4);
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_inlined_begin", 1));
-
-  Asm->OutStreamer.AddComment("Dwarf Version");
-  Asm->EmitInt16(DwarfVersion);
-  Asm->OutStreamer.AddComment("Address Size (in bytes)");
-  Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
-
-  for (SmallVectorImpl<const MDNode *>::iterator I = InlinedSPNodes.begin(),
-         E = InlinedSPNodes.end(); I != E; ++I) {
-
-    const MDNode *Node = *I;
-    InlineInfoMap::iterator II = InlineInfo.find(Node);
-    SmallVectorImpl<InlineInfoLabels> &Labels = II->second;
-    DISubprogram SP(Node);
-    StringRef LName = SP.getLinkageName();
-    StringRef Name = SP.getName();
-
-    Asm->OutStreamer.AddComment("MIPS linkage name");
-    if (LName.empty())
-      Asm->EmitSectionOffset(InfoHolder.getStringPoolEntry(Name),
-                             DwarfStrSectionSym);
-    else
-      Asm->EmitSectionOffset(
-          InfoHolder.getStringPoolEntry(Function::getRealLinkageName(LName)),
-          DwarfStrSectionSym);
-
-    Asm->OutStreamer.AddComment("Function name");
-    Asm->EmitSectionOffset(InfoHolder.getStringPoolEntry(Name),
-                           DwarfStrSectionSym);
-    Asm->EmitULEB128(Labels.size(), "Inline count");
-
-    for (SmallVectorImpl<InlineInfoLabels>::iterator LI = Labels.begin(),
-           LE = Labels.end(); LI != LE; ++LI) {
-      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DIE offset");
-      Asm->EmitInt32(LI->second->getOffset());
-
-      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("low_pc");
-      Asm->OutStreamer.EmitSymbolValue(LI->first,
-                                       Asm->getDataLayout().getPointerSize());
-    }
-  }
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_inlined_end", 1));
-}
-
 // DWARF5 Experimental Separate Dwarf emitters.
 
 // This DIE has the following attributes: DW_AT_comp_dir, DW_AT_stmt_list,
 // DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id,
-// DW_AT_ranges_base, DW_AT_addr_base. If DW_AT_ranges is present,
-// DW_AT_low_pc and DW_AT_high_pc are not used, and vice versa.
-CompileUnit *DwarfDebug::constructSkeletonCU(const MDNode *N) {
-  DICompileUnit DIUnit(N);
-  CompilationDir = DIUnit.getDirectory();
+// DW_AT_ranges_base, DW_AT_addr_base.
+CompileUnit *DwarfDebug::constructSkeletonCU(const CompileUnit *CU) {
 
   DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
-  CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++,
-                                       DIUnit.getLanguage(), Die, N, Asm,
-                                       this, &SkeletonHolder);
+  CompileUnit *NewCU = new CompileUnit(CU->getUniqueID(), Die, CU->getNode(),
+                                       Asm, this, &SkeletonHolder);
 
   NewCU->addLocalString(Die, dwarf::DW_AT_GNU_dwo_name,
-                        DIUnit.getSplitDebugFilename());
-
-  // This should be a unique identifier when we want to build .dwp files.
-  NewCU->addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, 0);
+                        CU->getNode().getSplitDebugFilename());
 
   // Relocate to the beginning of the addr_base section, else 0 for the
   // beginning of the one for this compile unit.
@@ -2804,6 +2978,35 @@ CompileUnit *DwarfDebug::constructSkeletonCU(const MDNode *N) {
   if (!CompilationDir.empty())
     NewCU->addLocalString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
 
+  // Flags to let the linker know we have emitted new style pubnames.
+  if (GenerateGnuPubSections) {
+    if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+      NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubnames, dwarf::DW_FORM_sec_offset,
+                      Asm->GetTempSymbol("gnu_pubnames", NewCU->getUniqueID()));
+    else
+      NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubnames, dwarf::DW_FORM_data4,
+                      Asm->GetTempSymbol("gnu_pubnames", NewCU->getUniqueID()),
+                      DwarfGnuPubNamesSectionSym);
+
+    if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+      NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubtypes, dwarf::DW_FORM_sec_offset,
+                      Asm->GetTempSymbol("gnu_pubtypes", NewCU->getUniqueID()));
+    else
+      NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubtypes, dwarf::DW_FORM_data4,
+                      Asm->GetTempSymbol("gnu_pubtypes", NewCU->getUniqueID()),
+                      DwarfGnuPubTypesSectionSym);
+  }
+
+  // Flag if we've emitted any ranges and their location for the compile unit.
+  if (DebugRangeSymbols.size()) {
+    if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+      NewCU->addLabel(Die, dwarf::DW_AT_GNU_ranges_base,
+                      dwarf::DW_FORM_sec_offset, DwarfDebugRangeSectionSym);
+    else
+      NewCU->addUInt(Die, dwarf::DW_AT_GNU_ranges_base, dwarf::DW_FORM_data4,
+                     0);
+  }
+
   SkeletonHolder.addUnit(NewCU);
   SkeletonCUs.push_back(NewCU);
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index e14f9b1..cebac39 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -150,11 +150,12 @@ class DbgVariable {
   DbgVariable *AbsVar;               // Corresponding Abstract variable, if any.
   const MachineInstr *MInsn;         // DBG_VALUE instruction of the variable.
   int FrameIndex;
+  DwarfDebug *DD;
 public:
   // AbsVar may be NULL.
-  DbgVariable(DIVariable V, DbgVariable *AV)
+  DbgVariable(DIVariable V, DbgVariable *AV, DwarfDebug *DD)
     : Var(V), TheDIE(0), DotDebugLocOffset(~0U), AbsVar(AV), MInsn(0),
-      FrameIndex(~0) {}
+      FrameIndex(~0), DD(DD) {}
 
   // Accessors.
   DIVariable getVariable()           const { return Var; }
@@ -169,7 +170,7 @@ public:
   int getFrameIndex()                const { return FrameIndex; }
   void setFrameIndex(int FI)               { FrameIndex = FI; }
   // Translate tag to proper Dwarf tag.
-  unsigned getTag()                  const {
+  uint16_t getTag()                  const {
     if (Var.getTag() == dwarf::DW_TAG_arg_variable)
       return dwarf::DW_TAG_formal_parameter;
 
@@ -208,6 +209,11 @@ public:
     return Var.getAddrElement(i);
   }
   DIType getType() const;
+
+private:
+  /// resolve - Look in the DwarfDebug map for the MDNode that
+  /// corresponds to the reference.
+  template <typename T> T resolve(DIRef<T> Ref) const;
 };
 
 /// \brief Collects and handles information specific to a particular
@@ -220,7 +226,7 @@ class DwarfUnits {
   FoldingSet<DIEAbbrev> *AbbreviationsSet;
 
   // A list of all the unique abbreviations in use.
-  std::vector<DIEAbbrev *> *Abbreviations;
+  std::vector<DIEAbbrev *> &Abbreviations;
 
   // A pointer to all units in the section.
   SmallVector<CompileUnit *, 1> CUs;
@@ -243,7 +249,7 @@ class DwarfUnits {
 
 public:
   DwarfUnits(AsmPrinter *AP, FoldingSet<DIEAbbrev> *AS,
-             std::vector<DIEAbbrev *> *A, const char *Pref,
+             std::vector<DIEAbbrev *> &A, const char *Pref,
              BumpPtrAllocator &DA)
       : Asm(AP), AbbreviationsSet(AS), Abbreviations(A), StringPool(DA),
         NextStringPoolNumber(0), StringPref(Pref), AddressPool(),
@@ -294,10 +300,13 @@ public:
 
   /// \brief Returns the address pool.
   AddrPool *getAddrPool() { return &AddressPool; }
+};
 
-  /// \brief for a given compile unit DIE, returns offset from beginning of
-  /// debug info.
-  unsigned getCUOffset(DIE *Die);
+/// \brief Helper used to pair up a symbol and its DWARF compile unit.
+struct SymbolCU {
+  SymbolCU(CompileUnit *CU, const MCSymbol *Sym) : Sym(Sym), CU(CU) {}
+  const MCSymbol *Sym;
+  CompileUnit *CU;
 };
 
 /// \brief Collects and handles dwarf debug information.
@@ -320,6 +329,14 @@ class DwarfDebug {
   // Maps subprogram MDNode with its corresponding CompileUnit.
   DenseMap <const MDNode *, CompileUnit *> SPMap;
 
+  // Maps a CU DIE with its corresponding CompileUnit.
+  DenseMap <const DIE *, CompileUnit *> CUDieMap;
+
+  /// Maps MDNodes for type sysstem with the corresponding DIEs. These DIEs can
+  /// be shared across CUs, that is why we keep the map here instead
+  /// of in CompileUnit.
+  DenseMap<const MDNode *, DIE *> MDTypeNodeToDieMap;
+
   // Used to uniquely define abbreviations.
   FoldingSet<DIEAbbrev> AbbreviationsSet;
 
@@ -332,8 +349,15 @@ class DwarfDebug {
   // separated by a zero byte, mapped to a unique id.
   StringMap<unsigned, BumpPtrAllocator&> SourceIdMap;
 
+  // List of all labels used in aranges generation.
+  std::vector<SymbolCU> ArangeLabels;
+
+  // Size of each symbol emitted (for those symbols that have a specific size).
+  DenseMap <const MCSymbol *, uint64_t> SymSize;
+
   // Provides a unique id per text section.
-  SetVector<const MCSection*> SectionMap;
+  typedef DenseMap<const MCSection *, SmallVector<SymbolCU, 8> > SectionMapType;
+  SectionMapType SectionMap;
 
   // List of arguments for current function.
   SmallVector<DbgVariable *, 8> CurrentFnArguments;
@@ -358,14 +382,6 @@ class DwarfDebug {
   // as DW_AT_inline.
   SmallPtrSet<DIE *, 4> InlinedSubprogramDIEs;
 
-  // Keep track of inlined functions and their location.  This
-  // information is used to populate the debug_inlined section.
-  typedef std::pair<const MCSymbol *, DIE *> InlineInfoLabels;
-  typedef DenseMap<const MDNode *,
-                   SmallVector<InlineInfoLabels, 4> > InlineInfoMap;
-  InlineInfoMap InlineInfo;
-  SmallVector<const MDNode *, 4> InlinedSPNodes;
-
   // This is a collection of subprogram MDNodes that are processed to
   // create DIEs.
   SmallPtrSet<const MDNode *, 16> ProcessedSPNodes;
@@ -406,6 +422,7 @@ class DwarfDebug {
   MCSymbol *DwarfDebugLocSectionSym, *DwarfLineSectionSym, *DwarfAddrSectionSym;
   MCSymbol *FunctionBeginSym, *FunctionEndSym;
   MCSymbol *DwarfAbbrevDWOSectionSym, *DwarfStrDWOSectionSym;
+  MCSymbol *DwarfGnuPubNamesSectionSym, *DwarfGnuPubTypesSectionSym;
 
   // As an optimization, there is no need to emit an entry in the directory
   // table for the same directory as DW_AT_comp_dir.
@@ -420,9 +437,6 @@ class DwarfDebug {
   // Holders for the various debug information flags that we might need to
   // have exposed. See accessor functions below for description.
 
-  // Whether or not we're emitting info for older versions of gdb on darwin.
-  bool IsDarwinGDBCompat;
-
   // Holder for imported entities.
   typedef SmallVector<std::pair<const MDNode *, const MDNode *>, 32>
     ImportedEntityMap;
@@ -431,12 +445,16 @@ class DwarfDebug {
   // Holder for types that are going to be extracted out into a type unit.
   std::vector<DIE *> TypeUnits;
 
+  // Whether to emit the pubnames/pubtypes sections.
+  bool HasDwarfPubSections;
+
+  // Version of dwarf we're emitting.
+  unsigned DwarfVersion;
+
   // DWARF5 Experimental Options
   bool HasDwarfAccelTables;
   bool HasSplitDwarf;
 
-  unsigned DwarfVersion;
-
   // Separated Dwarf Variables
   // In general these will all be for bits that are left in the
   // original object file, rather than things that are meant
@@ -454,6 +472,9 @@ class DwarfDebug {
   // Holder for the skeleton information.
   DwarfUnits SkeletonHolder;
 
+  // Maps from a type identifier to the actual MDNode.
+  DITypeIdentifierMap TypeIdentifierMap;
+
 private:
 
   void addScopeVariable(LexicalScope *LS, DbgVariable *Var);
@@ -465,11 +486,14 @@ private:
   /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
   /// variables in this scope then create and insert DIEs for these
   /// variables.
-  DIE *updateSubprogramScopeDIE(CompileUnit *SPCU, const MDNode *SPNode);
+  DIE *updateSubprogramScopeDIE(CompileUnit *SPCU, DISubprogram SP);
 
   /// \brief Construct new DW_TAG_lexical_block for this scope and
   /// attach DW_AT_low_pc/DW_AT_high_pc labels.
   DIE *constructLexicalScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
+  /// A helper function to check whether the DIE for a given Scope is going
+  /// to be null.
+  bool isLexicalScopeDIENull(LexicalScope *Scope);
 
   /// \brief This scope represents inlined body of a function. Construct
   /// DIE to represent this concrete inlined copy of the function.
@@ -477,6 +501,9 @@ private:
 
   /// \brief Construct a DIE for this scope.
   DIE *constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
+  /// A helper function to create children of a Scope DIE.
+  DIE *createScopeChildrenDIE(CompileUnit *TheCU, LexicalScope *Scope,
+                              SmallVectorImpl<DIE*> &Children);
 
   /// \brief Emit initial Dwarf sections with a label at the start of each one.
   void emitSectionLabels();
@@ -528,10 +555,16 @@ private:
   void emitAccelTypes();
 
   /// \brief Emit visible names into a debug pubnames section.
-  void emitDebugPubnames();
+  /// \param GnuStyle determines whether or not we want to emit
+  /// additional information into the table ala newer gcc for gdb
+  /// index.
+  void emitDebugPubNames(bool GnuStyle = false);
 
   /// \brief Emit visible types into a debug pubtypes section.
-  void emitDebugPubTypes();
+  /// \param GnuStyle determines whether or not we want to emit
+  /// additional information into the table ala newer gcc for gdb
+  /// index.
+  void emitDebugPubTypes(bool GnuStyle = false);
 
   /// \brief Emit visible names into a debug str section.
   void emitDebugStr();
@@ -555,7 +588,7 @@ private:
 
   /// \brief Construct the split debug info compile unit for the debug info
   /// section.
-  CompileUnit *constructSkeletonCU(const MDNode *);
+  CompileUnit *constructSkeletonCU(const CompileUnit *CU);
 
   /// \brief Emit the local split abbreviations.
   void emitSkeletonAbbrevs(const MCSection *);
@@ -571,7 +604,7 @@ private:
 
   /// \brief Create new CompileUnit for the given metadata node with tag
   /// DW_TAG_compile_unit.
-  CompileUnit *constructCompileUnit(const MDNode *N);
+  CompileUnit *constructCompileUnit(DICompileUnit DIUnit);
 
   /// \brief Construct subprogram DIE.
   void constructSubprogramDIE(CompileUnit *TheCU, const MDNode *N);
@@ -633,7 +666,13 @@ public:
   // Main entry points.
   //
   DwarfDebug(AsmPrinter *A, Module *M);
-  ~DwarfDebug();
+
+  void insertDIE(const MDNode *TypeMD, DIE *Die) {
+    MDTypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
+  }
+  DIE *getDIE(const MDNode *TypeMD) {
+    return MDTypeNodeToDieMap.lookup(TypeMD);
+  }
 
   /// \brief Emit all Dwarf sections that should come prior to the
   /// content.
@@ -658,6 +697,13 @@ public:
   /// type units.
   void addTypeUnitType(DIE *Die) { TypeUnits.push_back(Die); }
 
+  /// \brief Add a label so that arange data can be generated for it.
+  void addArangeLabel(SymbolCU SCU) { ArangeLabels.push_back(SCU); }
+
+  /// \brief For symbols that have a size designated (e.g. common symbols),
+  /// this tracks that size.
+  void setSymbolSize(const MCSymbol *Sym, uint64_t Size) { SymSize[Sym] = Size;}
+
   /// \brief Look up the source id with the given directory and source file
   /// names. If none currently exists, create a new id and insert it in the
   /// SourceIds map.
@@ -665,11 +711,7 @@ public:
                                unsigned CUID);
 
   /// \brief Recursively Emits a debug information entry.
-  void emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs);
-
-  /// \brief Returns whether or not to limit some of our debug
-  /// output to the limitations of darwin gdb.
-  bool useDarwinGDBCompat() { return IsDarwinGDBCompat; }
+  void emitDIE(DIE *Die, ArrayRef<DIEAbbrev *> Abbrevs);
 
   // Experimental DWARF5 features.
 
@@ -683,6 +725,16 @@ public:
 
   /// Returns the Dwarf Version.
   unsigned getDwarfVersion() const { return DwarfVersion; }
+
+  /// Find the MDNode for the given reference.
+  template <typename T> T resolve(DIRef<T> Ref) const {
+    return Ref.resolve(TypeIdentifierMap);
+  }
+
+  /// isSubprogramContext - Return true if Context is either a subprogram
+  /// or another context nested inside a subprogram.
+  bool isSubprogramContext(const MDNode *Context);
+
 };
 } // End of namespace llvm
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index 49a85d8..1575161 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -29,6 +29,7 @@ class MCAsmInfo;
 class MCExpr;
 class MCSymbol;
 class Function;
+class ARMTargetStreamer;
 class AsmPrinter;
 
 //===----------------------------------------------------------------------===//
@@ -177,6 +178,8 @@ public:
 
 class ARMException : public DwarfException {
   void EmitTypeInfos(unsigned TTypeEncoding);
+  ARMTargetStreamer &getTargetStreamer();
+
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index b48b817..24aa1ab 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -83,6 +83,8 @@ public:
   virtual unsigned getJumpBufAlignment() const;
   virtual unsigned getJumpBufSize() const;
   virtual bool shouldBuildLookupTables() const;
+  virtual bool haveFastSqrt(Type *Ty) const;
+  virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
 
   /// @}
 
@@ -111,6 +113,7 @@ public:
                                          ArrayRef<Type*> Tys) const;
   virtual unsigned getNumberOfParts(Type *Tp) const;
   virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const;
+  virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) const;
 
   /// @}
 };
@@ -182,6 +185,14 @@ bool BasicTTI::shouldBuildLookupTables() const {
        TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
 }
 
+bool BasicTTI::haveFastSqrt(Type *Ty) const {
+  const TargetLoweringBase *TLI = getTLI();
+  EVT VT = TLI->getValueType(Ty);
+  return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
+}
+
+void BasicTTI::getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+
 //===----------------------------------------------------------------------===//
 //
 // Calls used by the vectorizers.
@@ -443,12 +454,14 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   case Intrinsic::log10:   ISD = ISD::FLOG10; break;
   case Intrinsic::log2:    ISD = ISD::FLOG2;  break;
   case Intrinsic::fabs:    ISD = ISD::FABS;   break;
+  case Intrinsic::copysign: ISD = ISD::FCOPYSIGN; break;
   case Intrinsic::floor:   ISD = ISD::FFLOOR; break;
   case Intrinsic::ceil:    ISD = ISD::FCEIL;  break;
   case Intrinsic::trunc:   ISD = ISD::FTRUNC; break;
   case Intrinsic::nearbyint:
                            ISD = ISD::FNEARBYINT; break;
   case Intrinsic::rint:    ISD = ISD::FRINT;  break;
+  case Intrinsic::round:   ISD = ISD::FROUND; break;
   case Intrinsic::pow:     ISD = ISD::FPOW;   break;
   case Intrinsic::fma:     ISD = ISD::FMA;    break;
   case Intrinsic::fmuladd: ISD = ISD::FMA;    break; // FIXME: mul + add?
@@ -498,3 +511,17 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
 unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
   return 0;
 }
+
+unsigned BasicTTI::getReductionCost(unsigned Opcode, Type *Ty,
+                                    bool IsPairwise) const {
+  assert(Ty->isVectorTy() && "Expect a vector type");
+  unsigned NumVecElts = Ty->getVectorNumElements();
+  unsigned NumReduxLevels = Log2_32(NumVecElts);
+  unsigned ArithCost = NumReduxLevels *
+    TopTTI->getArithmeticInstrCost(Opcode, Ty);
+  // Assume the pairwise shuffles add a cost.
+  unsigned ShuffleCost =
+      NumReduxLevels * (IsPairwise + 1) *
+      TopTTI->getShuffleCost(SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
+  return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+}
diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h
index 26bdca9..0d15ed7 100644
--- a/lib/CodeGen/BranchFolding.h
+++ b/lib/CodeGen/BranchFolding.h
@@ -1,4 +1,4 @@
-//===-- BranchFolding.h - Fold machine code branch instructions --*- C++ -*===//
+//===-- BranchFolding.h - Fold machine code branch instructions -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 56aa330..10cc9ff 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_library(LLVMCodeGen
   LiveRangeCalc.cpp
   LiveRangeEdit.cpp
   LiveRegMatrix.cpp
+  LiveRegUnits.cpp
   LiveStackAnalysis.cpp
   LiveVariables.cpp
   LocalStackSlotAllocation.cpp
@@ -88,7 +89,6 @@ add_llvm_library(LLVMCodeGen
   ScheduleDAGPrinter.cpp
   ScoreboardHazardRecognizer.cpp
   ShadowStackGC.cpp
-  ShrinkWrapping.cpp
   SjLjEHPrepare.cpp
   SlotIndexes.cpp
   SpillPlacement.cpp
@@ -97,7 +97,7 @@ add_llvm_library(LLVMCodeGen
   StackColoring.cpp
   StackProtector.cpp
   StackSlotColoring.cpp
-  StrongPHIElimination.cpp
+  StackMaps.cpp
   TailDuplication.cpp
   TargetFrameLoweringImpl.cpp
   TargetInstrInfo.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index b03c325..4925c4d 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -9,14 +9,12 @@
 
 #define DEBUG_TYPE "calcspillweights"
 
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -24,38 +22,22 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
-char CalculateSpillWeights::ID = 0;
-INITIALIZE_PASS_BEGIN(CalculateSpillWeights, "calcspillweights",
-                "Calculate spill weights", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(CalculateSpillWeights, "calcspillweights",
-                "Calculate spill weights", false, false)
-
-void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const {
-  au.addRequired<LiveIntervals>();
-  au.addRequired<MachineBlockFrequencyInfo>();
-  au.addRequired<MachineLoopInfo>();
-  au.setPreservesAll();
-  MachineFunctionPass::getAnalysisUsage(au);
-}
-
-bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &MF) {
-
+void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS,
+                           MachineFunction &MF,
+                           const MachineLoopInfo &MLI,
+                           const MachineBlockFrequencyInfo &MBFI,
+                           VirtRegAuxInfo::NormalizingFn norm) {
   DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
                << "********** Function: " << MF.getName() << '\n');
 
-  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  VirtRegAuxInfo VRAI(MF, LIS, getAnalysis<MachineLoopInfo>(),
-                      getAnalysis<MachineBlockFrequencyInfo>());
+  VirtRegAuxInfo VRAI(MF, LIS, MLI, MBFI, norm);
   for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     if (MRI.reg_nodbg_empty(Reg))
       continue;
-    VRAI.CalculateWeightAndHint(LIS.getInterval(Reg));
+    VRAI.calculateSpillWeightAndHint(LIS.getInterval(Reg));
   }
-  return false;
 }
 
 // Return the preferred allocation register for reg, given a COPY instruction.
@@ -111,7 +93,7 @@ static bool isRematerializable(const LiveInterval &LI,
 }
 
 void
-VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
+VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) {
   MachineRegisterInfo &mri = MF.getRegInfo();
   const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo();
   MachineBasicBlock *mbb = 0;
@@ -201,5 +183,5 @@ VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
   if (isRematerializable(li, LIS, *MF.getTarget().getInstrInfo()))
     totalWeight *= 0.5F;
 
-  li.weight = normalizeSpillWeight(totalWeight, li.getSize());
+  li.weight = normalize(totalWeight, li.getSize());
 }
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index c641991..7430c53 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -22,7 +22,6 @@ using namespace llvm;
 void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeBasicTTIPass(Registry);
   initializeBranchFolderPassPass(Registry);
-  initializeCalculateSpillWeightsPass(Registry);
   initializeDeadMachineInstructionElimPass(Registry);
   initializeEarlyIfConverterPass(Registry);
   initializeExpandPostRAPass(Registry);
@@ -60,7 +59,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeStackProtectorPass(Registry);
   initializeStackColoringPass(Registry);
   initializeStackSlotColoringPass(Registry);
-  initializeStrongPHIEliminationPass(Registry);
   initializeTailDuplicatePassPass(Registry);
   initializeTargetPassConfigPass(Registry);
   initializeTwoAddressInstructionPassPass(Registry);
diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp
index 840a101..6619bcf 100644
--- a/lib/CodeGen/DFAPacketizer.cpp
+++ b/lib/CodeGen/DFAPacketizer.cpp
@@ -160,7 +160,8 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
                                       MachineBasicBlock::iterator EndItr) {
   assert(VLIWScheduler && "VLIW Scheduler is not initialized!");
   VLIWScheduler->startBlock(MBB);
-  VLIWScheduler->enterRegion(MBB, BeginItr, EndItr, MBB->size());
+  VLIWScheduler->enterRegion(MBB, BeginItr, EndItr,
+                             std::distance(BeginItr, EndItr));
   VLIWScheduler->schedule();
 
   // Generate MI -> SU map.
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index e277f5c..031f19c 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -23,6 +23,7 @@
 #define DEBUG_TYPE "execution-fix"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Allocator.h"
@@ -136,6 +137,12 @@ class ExeDepsFix : public MachineFunctionPass {
   typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
   LiveOutMap LiveOuts;
 
+  /// List of undefined register reads in this block in forward order.
+  std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;
+
+  /// Storage for register unit liveness.
+  LiveRegUnits LiveUnits;
+
   /// Current instruction number.
   /// The first instruction in each basic block is 0.
   int CurInstr;
@@ -185,6 +192,8 @@ private:
   void processDefs(MachineInstr*, bool Kill);
   void visitSoftInstr(MachineInstr*, unsigned mask);
   void visitHardInstr(MachineInstr*, unsigned domain);
+  bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
+  void processUndefReads(MachineBasicBlock*);
 };
 }
 
@@ -341,6 +350,10 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
   // Reset instruction counter in each basic block.
   CurInstr = 0;
 
+  // Set up UndefReads to track undefined register reads.
+  UndefReads.clear();
+  LiveUnits.clear();
+
   // Set up LiveRegs to represent registers entering MBB.
   if (!LiveRegs)
     LiveRegs = new LiveReg[NumRegs];
@@ -448,10 +461,46 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) {
   processDefs(MI, !DomP.first);
 }
 
+/// \brief Return true to if it makes sense to break dependence on a partial def
+/// or undef use.
+bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
+                                       unsigned Pref) {
+  int rx = regIndex(MI->getOperand(OpIdx).getReg());
+  if (rx < 0)
+    return false;
+
+  unsigned Clearance = CurInstr - LiveRegs[rx].Def;
+  DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
+
+  if (Pref > Clearance) {
+    DEBUG(dbgs() << ": Break dependency.\n");
+    return true;
+  }
+  // The current clearance seems OK, but we may be ignoring a def from a
+  // back-edge.
+  if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
+    DEBUG(dbgs() << ": OK .\n");
+    return false;
+  }
+  // A def from an unprocessed back-edge may make us break this dependency.
+  DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+  return false;
+}
+
 // Update def-ages for registers defined by MI.
 // If Kill is set, also kill off DomainValues clobbered by the defs.
+//
+// Also break dependencies on partial defs and undef uses.
 void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
   assert(!MI->isDebugValue() && "Won't process debug values");
+
+  // Break dependence on undef uses. Do this before updating LiveRegs below.
+  unsigned OpNum;
+  unsigned Pref = TII->getUndefRegClearance(MI, OpNum, TRI);
+  if (Pref) {
+    if (shouldBreakDependence(MI, OpNum, Pref))
+      UndefReads.push_back(std::make_pair(MI, OpNum));
+  }
   const MCInstrDesc &MCID = MI->getDesc();
   for (unsigned i = 0,
          e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
@@ -471,37 +520,58 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
     DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
                  << '\t' << *MI);
 
+    // Check clearance before partial register updates.
+    // Call breakDependence before setting LiveRegs[rx].Def.
+    unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
+    if (Pref && shouldBreakDependence(MI, i, Pref))
+      TII->breakPartialRegDependency(MI, i, TRI);
+
     // How many instructions since rx was last written?
-    unsigned Clearance = CurInstr - LiveRegs[rx].Def;
     LiveRegs[rx].Def = CurInstr;
 
     // Kill off domains redefined by generic instructions.
     if (Kill)
       kill(rx);
+  }
+  ++CurInstr;
+}
 
-    // Verify clearance before partial register updates.
-    unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
-    if (!Pref)
-      continue;
-    DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
-    if (Pref > Clearance) {
-      DEBUG(dbgs() << ": Break dependency.\n");
-      TII->breakPartialRegDependency(MI, i, TRI);
-      continue;
-    }
-
-    // The current clearance seems OK, but we may be ignoring a def from a
-    // back-edge.
-    if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
-      DEBUG(dbgs() << ": OK.\n");
-      continue;
-    }
+/// \break Break false dependencies on undefined register reads.
+///
+/// Walk the block backward computing precise liveness. This is expensive, so we
+/// only do it on demand. Note that the occurrence of undefined register reads
+/// that should be broken is very rare, but when they occur we may have many in
+/// a single block.
+void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
+  if (UndefReads.empty())
+    return;
 
-    // A def from an unprocessed back-edge may make us break this dependency.
-    DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+  // Collect this block's live out register units.
+  LiveUnits.init(TRI);
+  for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    LiveUnits.addLiveIns(*SI, *TRI);
   }
+  MachineInstr *UndefMI = UndefReads.back().first;
+  unsigned OpIdx = UndefReads.back().second;
 
-  ++CurInstr;
+  for (MachineBasicBlock::reverse_iterator I = MBB->rbegin(), E = MBB->rend();
+       I != E; ++I) {
+    // Update liveness, including the current instrucion's defs.
+    LiveUnits.stepBackward(*I, *TRI);
+
+    if (UndefMI == &*I) {
+      if (!LiveUnits.contains(UndefMI->getOperand(OpIdx).getReg(), *TRI))
+        TII->breakPartialRegDependency(UndefMI, OpIdx, TRI);
+
+      UndefReads.pop_back();
+      if (UndefReads.empty())
+        return;
+
+      UndefMI = UndefReads.back().first;
+      OpIdx = UndefReads.back().second;
+    }
+  }
 }
 
 // A hard instruction only works in one domain. All input registers will be
@@ -549,7 +619,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
         // Is it possible to use this collapsed register for free?
         if (dv->isCollapsed()) {
           // Restrict available domains to the ones in common with the operand.
-          // If there are no common domains, we must pay the cross-domain 
+          // If there are no common domains, we must pay the cross-domain
           // penalty for this operand.
           if (common) available = common;
         } else if (common)
@@ -686,6 +756,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
     for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
         ++I)
       visitInstr(I);
+    processUndefReads(MBB);
     leaveBasicBlock(MBB);
   }
 
@@ -698,6 +769,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
         ++I)
       if (!I->isDebugValue())
         processDefs(I, false);
+    processUndefReads(MBB);
     leaveBasicBlock(MBB);
   }
 
@@ -713,6 +785,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
     delete[] FI->second;
   }
   LiveOuts.clear();
+  UndefReads.clear();
   Avail.clear();
   Allocator.DestroyAll();
 
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index 1611db8..6c73fff 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -104,7 +104,7 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
   }
 
   if (DstSubReg == InsReg) {
-    // No need to insert an identify copy instruction.
+    // No need to insert an identity copy instruction.
     // Watch out for case like this:
     // %RAX<def> = SUBREG_TO_REG 0, %EAX<kill>, 3
     // We must leave %RAX live.
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 1ae7e3b..e2d0eb4 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -22,6 +22,8 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -31,6 +33,8 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
 using namespace llvm;
 
 // Hidden options for help debugging.
@@ -150,14 +154,17 @@ namespace {
     /// BBAnalysis - Results of if-conversion feasibility analysis indexed by
     /// basic block number.
     std::vector<BBInfo> BBAnalysis;
+    TargetSchedModel SchedModel;
 
     const TargetLoweringBase *TLI;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
-    const InstrItineraryData *InstrItins;
     const MachineBranchProbabilityInfo *MBPI;
     MachineRegisterInfo *MRI;
 
+    LiveRegUnits Redefs;
+    LiveRegUnits DontKill;
+
     bool PreRegAlloc;
     bool MadeChange;
     int FnNum;
@@ -198,11 +205,9 @@ namespace {
     void PredicateBlock(BBInfo &BBI,
                         MachineBasicBlock::iterator E,
                         SmallVectorImpl<MachineOperand> &Cond,
-                        SmallSet<unsigned, 4> &Redefs,
                         SmallSet<unsigned, 4> *LaterRedefs = 0);
     void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
                                SmallVectorImpl<MachineOperand> &Cond,
-                               SmallSet<unsigned, 4> &Redefs,
                                bool IgnoreBr = false);
     void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges = true);
 
@@ -267,7 +272,11 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
   TRI = MF.getTarget().getRegisterInfo();
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   MRI = &MF.getRegInfo();
-  InstrItins = MF.getTarget().getInstrItineraryData();
+
+  const TargetSubtargetInfo &ST =
+    MF.getTarget().getSubtarget<TargetSubtargetInfo>();
+  SchedModel.init(*ST.getSchedModel(), &ST, TII);
+
   if (!TII) return false;
 
   PreRegAlloc = MRI->isSSA();
@@ -666,32 +675,28 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
     bool isPredicated = TII->isPredicated(I);
     bool isCondBr = BBI.IsBrAnalyzable && I->isConditionalBranch();
 
-    if (!isCondBr) {
-      if (!isPredicated) {
-        BBI.NonPredSize++;
-        unsigned ExtraPredCost = 0;
-        unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I,
-                                                  &ExtraPredCost);
-        if (NumCycles > 1)
-          BBI.ExtraCost += NumCycles-1;
-        BBI.ExtraCost2 += ExtraPredCost;
-      } else if (!AlreadyPredicated) {
-        // FIXME: This instruction is already predicated before the
-        // if-conversion pass. It's probably something like a conditional move.
-        // Mark this block unpredicable for now.
-        BBI.IsUnpredicable = true;
-        return;
-      }
+    // A conditional branch is not predicable, but it may be eliminated.
+    if (isCondBr)
+      continue;
+
+    if (!isPredicated) {
+      BBI.NonPredSize++;
+      unsigned ExtraPredCost = TII->getPredicationCost(&*I);
+      unsigned NumCycles = SchedModel.computeInstrLatency(&*I, false);
+      if (NumCycles > 1)
+        BBI.ExtraCost += NumCycles-1;
+      BBI.ExtraCost2 += ExtraPredCost;
+    } else if (!AlreadyPredicated) {
+      // FIXME: This instruction is already predicated before the
+      // if-conversion pass. It's probably something like a conditional move.
+      // Mark this block unpredicable for now.
+      BBI.IsUnpredicable = true;
+      return;
     }
 
     if (BBI.ClobbersPred && !isPredicated) {
       // Predicate modification instruction should end the block (except for
       // already predicated instructions and end of block branches).
-      if (isCondBr) {
-        // A conditional branch is not predicable, but it may be eliminated.
-        continue;
-      }
-
       // Predicate may have been modified, the subsequent (currently)
       // unpredicated instructions cannot be correctly predicated.
       BBI.IsUnpredicable = true;
@@ -961,64 +966,58 @@ void IfConverter::RemoveExtraEdges(BBInfo &BBI) {
     BBI.BB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
 }
 
-/// InitPredRedefs / UpdatePredRedefs - Defs by predicated instructions are
-/// modeled as read + write (sort like two-address instructions). These
-/// routines track register liveness and add implicit uses to if-converted
-/// instructions to conform to the model.
-static void InitPredRedefs(MachineBasicBlock *BB, SmallSet<unsigned,4> &Redefs,
-                           const TargetRegisterInfo *TRI) {
-  for (MachineBasicBlock::livein_iterator I = BB->livein_begin(),
-         E = BB->livein_end(); I != E; ++I) {
-    unsigned Reg = *I;
-    for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
-         SubRegs.isValid(); ++SubRegs)
-      Redefs.insert(*SubRegs);
-  }
-}
-
-static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
-                             const TargetRegisterInfo *TRI,
-                             bool AddImpUse = false) {
-  SmallVector<unsigned, 4> Defs;
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg())
+/// Behaves like LiveRegUnits::StepForward() but also adds implicit uses to all
+/// values defined in MI which are not live/used by MI.
+static void UpdatePredRedefs(MachineInstr *MI, LiveRegUnits &Redefs,
+                             const TargetRegisterInfo *TRI) {
+  for (ConstMIBundleOperands Ops(MI); Ops.isValid(); ++Ops) {
+    if (!Ops->isReg() || !Ops->isKill())
       continue;
-    unsigned Reg = MO.getReg();
-    if (!Reg)
+    unsigned Reg = Ops->getReg();
+    if (Reg == 0)
       continue;
-    if (MO.isDef())
-      Defs.push_back(Reg);
-    else if (MO.isKill()) {
-      for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
-           SubRegs.isValid(); ++SubRegs)
-        Redefs.erase(*SubRegs);
-    }
+    Redefs.removeReg(Reg, *TRI);
   }
-  MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
-  for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
-    unsigned Reg = Defs[i];
-    if (!Redefs.insert(Reg)) {
-      if (AddImpUse)
-        // Treat predicated update as read + write.
-        MIB.addReg(Reg, RegState::Implicit | RegState::Undef);
-    } else {
-      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
-        Redefs.insert(*SubRegs);
-    }
+  for (MIBundleOperands Ops(MI); Ops.isValid(); ++Ops) {
+    if (!Ops->isReg() || !Ops->isDef())
+      continue;
+    unsigned Reg = Ops->getReg();
+    if (Reg == 0 || Redefs.contains(Reg, *TRI))
+      continue;
+    Redefs.addReg(Reg, *TRI);
+
+    MachineOperand &Op = *Ops;
+    MachineInstr *MI = Op.getParent();
+    MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+    MIB.addReg(Reg, RegState::Implicit | RegState::Undef);
   }
 }
 
-static void UpdatePredRedefs(MachineBasicBlock::iterator I,
-                             MachineBasicBlock::iterator E,
-                             SmallSet<unsigned,4> &Redefs,
-                             const TargetRegisterInfo *TRI) {
-  while (I != E) {
-    UpdatePredRedefs(I, Redefs, TRI);
-    ++I;
+/**
+ * Remove kill flags from operands with a registers in the @p DontKill set.
+ */
+static void RemoveKills(MachineInstr &MI, const LiveRegUnits &DontKill,
+                        const MCRegisterInfo &MCRI) {
+  for (MIBundleOperands O(&MI); O.isValid(); ++O) {
+    if (!O->isReg() || !O->isKill())
+      continue;
+    if (DontKill.contains(O->getReg(), MCRI))
+      O->setIsKill(false);
   }
 }
 
+/**
+ * Walks a range of machine instructions and removes kill flags for registers
+ * in the @p DontKill set.
+ */
+static void RemoveKills(MachineBasicBlock::iterator I,
+                        MachineBasicBlock::iterator E,
+                        const LiveRegUnits &DontKill,
+                        const MCRegisterInfo &MCRI) {
+  for ( ; I != E; ++I)
+    RemoveKills(*I, DontKill, MCRI);
+}
+
 /// IfConvertSimple - If convert a simple (split, no rejoin) sub-CFG.
 ///
 bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
@@ -1049,21 +1048,27 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
 
   // Initialize liveins to the first BB. These are potentiall redefined by
   // predicated instructions.
-  SmallSet<unsigned, 4> Redefs;
-  InitPredRedefs(CvtBBI->BB, Redefs, TRI);
-  InitPredRedefs(NextBBI->BB, Redefs, TRI);
+  Redefs.init(TRI);
+  Redefs.addLiveIns(CvtBBI->BB, *TRI);
+  Redefs.addLiveIns(NextBBI->BB, *TRI);
+
+  // Compute a set of registers which must not be killed by instructions in
+  // BB1: This is everything live-in to BB2.
+  DontKill.init(TRI);
+  DontKill.addLiveIns(NextBBI->BB, *TRI);
 
   if (CvtBBI->BB->pred_size() > 1) {
     BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
     // Copy instructions in the true block, predicate them, and add them to
     // the entry block.
-    CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs);
+    CopyAndPredicateBlock(BBI, *CvtBBI, Cond);
 
     // RemoveExtraEdges won't work if the block has an unanalyzable branch, so
     // explicitly remove CvtBBI as a successor.
     BBI.BB->removeSuccessor(CvtBBI->BB);
   } else {
-    PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs);
+    RemoveKills(CvtBBI->BB->begin(), CvtBBI->BB->end(), DontKill, *TRI);
+    PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
 
     // Merge converted block into entry block.
     BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
@@ -1148,16 +1153,18 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
 
   // Initialize liveins to the first BB. These are potentially redefined by
   // predicated instructions.
-  SmallSet<unsigned, 4> Redefs;
-  InitPredRedefs(CvtBBI->BB, Redefs, TRI);
-  InitPredRedefs(NextBBI->BB, Redefs, TRI);
+  Redefs.init(TRI);
+  Redefs.addLiveIns(CvtBBI->BB, *TRI);
+  Redefs.addLiveIns(NextBBI->BB, *TRI);
+
+  DontKill.clear();
 
   bool HasEarlyExit = CvtBBI->FalseBB != NULL;
   if (CvtBBI->BB->pred_size() > 1) {
     BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
     // Copy instructions in the true block, predicate them, and add them to
     // the entry block.
-    CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs, true);
+    CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
 
     // RemoveExtraEdges won't work if the block has an unanalyzable branch, so
     // explicitly remove CvtBBI as a successor.
@@ -1165,7 +1172,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
   } else {
     // Predicate the 'true' block after removing its branch.
     CvtBBI->NonPredSize -= TII->RemoveBranch(*CvtBBI->BB);
-    PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs);
+    PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
 
     // Now merge the entry of the triangle with the true block.
     BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
@@ -1276,8 +1283,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
 
   // Initialize liveins to the first BB. These are potentially redefined by
   // predicated instructions.
-  SmallSet<unsigned, 4> Redefs;
-  InitPredRedefs(BBI1->BB, Redefs, TRI);
+  Redefs.init(TRI);
+  Redefs.addLiveIns(BBI1->BB, *TRI);
 
   // Remove the duplicated instructions at the beginnings of both paths.
   MachineBasicBlock::iterator DI1 = BBI1->BB->begin();
@@ -1304,7 +1311,19 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
       --NumDups1;
   }
 
-  UpdatePredRedefs(BBI1->BB->begin(), DI1, Redefs, TRI);
+  // Compute a set of registers which must not be killed by instructions in BB1:
+  // This is everything used+live in BB2 after the duplicated instructions. We
+  // can compute this set by simulating liveness backwards from the end of BB2.
+  DontKill.init(TRI);
+  for (MachineBasicBlock::reverse_iterator I = BBI2->BB->rbegin(),
+       E = MachineBasicBlock::reverse_iterator(DI2); I != E; ++I) {
+    DontKill.stepBackward(*I, *TRI);
+  }
+
+  for (MachineBasicBlock::const_iterator I = BBI1->BB->begin(), E = DI1; I != E;
+       ++I) {
+    Redefs.stepForward(*I, *TRI);
+  }
   BBI.BB->splice(BBI.BB->end(), BBI1->BB, BBI1->BB->begin(), DI1);
   BBI2->BB->erase(BBI2->BB->begin(), DI2);
 
@@ -1322,6 +1341,10 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
   }
   BBI1->BB->erase(DI1, BBI1->BB->end());
 
+  // Kill flags in the true block for registers living into the false block
+  // must be removed.
+  RemoveKills(BBI1->BB->begin(), BBI1->BB->end(), DontKill, *TRI);
+
   // Remove 'false' block branch and find the last instruction to predicate.
   BBI2->NonPredSize -= TII->RemoveBranch(*BBI2->BB);
   DI2 = BBI2->BB->end();
@@ -1380,10 +1403,10 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
   }
 
   // Predicate the 'true' block.
-  PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, Redefs, &RedefsByFalse);
+  PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, &RedefsByFalse);
 
   // Predicate the 'false' block.
-  PredicateBlock(*BBI2, DI2, *Cond2, Redefs);
+  PredicateBlock(*BBI2, DI2, *Cond2);
 
   // Merge the true block into the entry of the diamond.
   MergeBlocks(BBI, *BBI1, TailBB == 0);
@@ -1458,7 +1481,6 @@ static bool MaySpeculate(const MachineInstr *MI,
 void IfConverter::PredicateBlock(BBInfo &BBI,
                                  MachineBasicBlock::iterator E,
                                  SmallVectorImpl<MachineOperand> &Cond,
-                                 SmallSet<unsigned, 4> &Redefs,
                                  SmallSet<unsigned, 4> *LaterRedefs) {
   bool AnyUnpred = false;
   bool MaySpec = LaterRedefs != 0;
@@ -1484,7 +1506,7 @@ void IfConverter::PredicateBlock(BBInfo &BBI,
 
     // If the predicated instruction now redefines a register as the result of
     // if-conversion, add an implicit kill.
-    UpdatePredRedefs(I, Redefs, TRI, true);
+    UpdatePredRedefs(I, Redefs, TRI);
   }
 
   std::copy(Cond.begin(), Cond.end(), std::back_inserter(BBI.Predicate));
@@ -1501,7 +1523,6 @@ void IfConverter::PredicateBlock(BBInfo &BBI,
 /// the destination block. Skip end of block branches if IgnoreBr is true.
 void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
                                         SmallVectorImpl<MachineOperand> &Cond,
-                                        SmallSet<unsigned, 4> &Redefs,
                                         bool IgnoreBr) {
   MachineFunction &MF = *ToBBI.BB->getParent();
 
@@ -1514,8 +1535,8 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
     MachineInstr *MI = MF.CloneMachineInstr(I);
     ToBBI.BB->insert(ToBBI.BB->end(), MI);
     ToBBI.NonPredSize++;
-    unsigned ExtraPredCost = 0;
-    unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I, &ExtraPredCost);
+    unsigned ExtraPredCost = TII->getPredicationCost(&*I);
+    unsigned NumCycles = SchedModel.computeInstrLatency(&*I, false);
     if (NumCycles > 1)
       ToBBI.ExtraCost += NumCycles-1;
     ToBBI.ExtraCost2 += ExtraPredCost;
@@ -1531,7 +1552,11 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
 
     // If the predicated instruction now redefines a register as the result of
     // if-conversion, add an implicit kill.
-    UpdatePredRedefs(MI, Redefs, TRI, true);
+    UpdatePredRedefs(MI, Redefs, TRI);
+
+    // Some kill flags may not be correct anymore.
+    if (!DontKill.empty())
+      RemoveKills(*MI, DontKill, *TRI);
   }
 
   if (!IgnoreBr) {
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 8910652..bb0e642 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -179,10 +179,8 @@ private:
   bool coalesceStackAccess(MachineInstr *MI, unsigned Reg);
   bool foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> >,
                          MachineInstr *LoadMI = 0);
-  void insertReload(LiveInterval &NewLI, SlotIndex,
-                    MachineBasicBlock::iterator MI);
-  void insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
-                   SlotIndex, MachineBasicBlock::iterator MI);
+  void insertReload(unsigned VReg, SlotIndex, MachineBasicBlock::iterator MI);
+  void insertSpill(unsigned VReg, bool isKill, MachineBasicBlock::iterator MI);
 
   void spillAroundUses(unsigned Reg);
   void spillAll();
@@ -580,7 +578,7 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
     if (unsigned SrcReg = isFullCopyOf(MI, Reg)) {
       if (isSibling(SrcReg)) {
         LiveInterval &SrcLI = LIS.getInterval(SrcReg);
-        LiveRangeQuery SrcQ(SrcLI, VNI->def);
+        LiveQueryResult SrcQ = SrcLI.Query(VNI->def);
         assert(SrcQ.valueIn() && "Copy from non-existing value");
         // Check if this COPY kills its source.
         SVI->second.KillsSource = SrcQ.isKill();
@@ -885,12 +883,12 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
   }
 
   // Alocate a new register for the remat.
-  LiveInterval &NewLI = Edit->createFrom(Original);
-  NewLI.markNotSpillable();
+  unsigned NewVReg = Edit->createFrom(Original);
 
   // Finally we can rematerialize OrigMI before MI.
-  SlotIndex DefIdx = Edit->rematerializeAt(*MI->getParent(), MI, NewLI.reg, RM,
+  SlotIndex DefIdx = Edit->rematerializeAt(*MI->getParent(), MI, NewVReg, RM,
                                            TRI);
+  (void)DefIdx;
   DEBUG(dbgs() << "\tremat:  " << DefIdx << '\t'
                << *LIS.getInstructionFromIndex(DefIdx));
 
@@ -898,15 +896,12 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
   for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
     MachineOperand &MO = MI->getOperand(Ops[i].second);
     if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg) {
-      MO.setReg(NewLI.reg);
+      MO.setReg(NewVReg);
       MO.setIsKill();
     }
   }
-  DEBUG(dbgs() << "\t        " << UseIdx << '\t' << *MI);
+  DEBUG(dbgs() << "\t        " << UseIdx << '\t' << *MI << '\n');
 
-  VNInfo *DefVNI = NewLI.getNextValue(DefIdx, LIS.getVNInfoAllocator());
-  NewLI.addRange(LiveRange(DefIdx, UseIdx.getRegSlot(), DefVNI));
-  DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
   ++NumRemats;
   return true;
 }
@@ -1009,6 +1004,40 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
   return true;
 }
 
+#if !defined(NDEBUG)
+// Dump the range of instructions from B to E with their slot indexes.
+static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,
+                                               MachineBasicBlock::iterator E,
+                                               LiveIntervals const &LIS,
+                                               const char *const header,
+                                               unsigned VReg =0) {
+  char NextLine = '\n';
+  char SlotIndent = '\t';
+
+  if (llvm::next(B) == E) {
+    NextLine = ' ';
+    SlotIndent = ' ';
+  }
+
+  dbgs() << '\t' << header << ": " << NextLine;
+
+  for (MachineBasicBlock::iterator I = B; I != E; ++I) {
+    SlotIndex Idx = LIS.getInstructionIndex(I).getRegSlot();
+
+    // If a register was passed in and this instruction has it as a
+    // destination that is marked as an early clobber, print the
+    // early-clobber slot index.
+    if (VReg) {
+      MachineOperand *MO = I->findRegisterDefOperand(VReg);
+      if (MO && MO->isEarlyClobber())
+        Idx = Idx.getRegSlot(true);
+    }
+
+    dbgs() << SlotIndent << Idx << '\t' << *I;
+  }
+}
+#endif
+
 /// foldMemoryOperand - Try folding stack slot references in Ops into their
 /// instructions.
 ///
@@ -1028,6 +1057,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
   bool WasCopy = MI->isCopy();
   unsigned ImpReg = 0;
 
+  bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+                       MI->getOpcode() == TargetOpcode::STACKMAP);
+
   // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
   // operands.
   SmallVector<unsigned, 8> FoldOps;
@@ -1039,7 +1071,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
       continue;
     }
     // FIXME: Teach targets to deal with subregs.
-    if (MO.getSubReg())
+    if (!SpillSubRegs && MO.getSubReg())
       return false;
     // We cannot fold a load instruction into a def.
     if (LoadMI && MO.isDef())
@@ -1049,6 +1081,8 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
       FoldOps.push_back(Idx);
   }
 
+  MachineInstrSpan MIS(MI);
+
   MachineInstr *FoldMI =
                 LoadMI ? TII.foldMemoryOperand(MI, FoldOps, LoadMI)
                        : TII.foldMemoryOperand(MI, FoldOps, StackSlot);
@@ -1075,16 +1109,24 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
     // FoldMI does not define this physreg. Remove the LI segment.
     assert(MO->isDead() && "Cannot fold physreg def");
     for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) {
-      if (LiveInterval *LI = LIS.getCachedRegUnit(*Units)) {
+      if (LiveRange *LR = LIS.getCachedRegUnit(*Units)) {
         SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
-        if (VNInfo *VNI = LI->getVNInfoAt(Idx))
-          LI->removeValNo(VNI);
+        if (VNInfo *VNI = LR->getVNInfoAt(Idx))
+          LR->removeValNo(VNI);
       }
     }
   }
+
   LIS.ReplaceMachineInstrInMaps(MI, FoldMI);
   MI->eraseFromParent();
 
+  // Insert any new instructions other than FoldMI into the LIS maps.
+  assert(!MIS.empty() && "Unexpected empty span of instructions!");
+  for (MachineBasicBlock::iterator MII = MIS.begin(), End = MIS.end();
+       MII != End; ++MII)
+    if (&*MII != FoldMI)
+      LIS.InsertMachineInstrInMaps(&*MII);
+
   // TII.foldMemoryOperand may have left some implicit operands on the
   // instruction.  Strip them.
   if (ImpReg)
@@ -1096,8 +1138,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
         FoldMI->RemoveOperand(i - 1);
     }
 
-  DEBUG(dbgs() << "\tfolded:  " << LIS.getInstructionIndex(FoldMI) << '\t'
-               << *FoldMI);
+  DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MIS.end(), LIS,
+                                           "folded"));
+
   if (!WasCopy)
     ++NumFolded;
   else if (Ops.front().second == 0)
@@ -1107,36 +1150,35 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
   return true;
 }
 
-/// insertReload - Insert a reload of NewLI.reg before MI.
-void InlineSpiller::insertReload(LiveInterval &NewLI,
+void InlineSpiller::insertReload(unsigned NewVReg,
                                  SlotIndex Idx,
                                  MachineBasicBlock::iterator MI) {
   MachineBasicBlock &MBB = *MI->getParent();
-  TII.loadRegFromStackSlot(MBB, MI, NewLI.reg, StackSlot,
-                           MRI.getRegClass(NewLI.reg), &TRI);
-  --MI; // Point to load instruction.
-  SlotIndex LoadIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot();
-  // Some (out-of-tree) targets have EC reload instructions.
-  if (MachineOperand *MO = MI->findRegisterDefOperand(NewLI.reg))
-    if (MO->isEarlyClobber())
-      LoadIdx = LoadIdx.getRegSlot(true);
-  DEBUG(dbgs() << "\treload:  " << LoadIdx << '\t' << *MI);
-  VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, LIS.getVNInfoAllocator());
-  NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI));
+
+  MachineInstrSpan MIS(MI);
+  TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot,
+                           MRI.getRegClass(NewVReg), &TRI);
+
+  LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
+
+  DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MI, LIS, "reload",
+                                           NewVReg));
   ++NumReloads;
 }
 
-/// insertSpill - Insert a spill of NewLI.reg after MI.
-void InlineSpiller::insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
-                                SlotIndex Idx, MachineBasicBlock::iterator MI) {
+/// insertSpill - Insert a spill of NewVReg after MI.
+void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill,
+                                 MachineBasicBlock::iterator MI) {
   MachineBasicBlock &MBB = *MI->getParent();
-  TII.storeRegToStackSlot(MBB, ++MI, NewLI.reg, true, StackSlot,
-                          MRI.getRegClass(NewLI.reg), &TRI);
-  --MI; // Point to store instruction.
-  SlotIndex StoreIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot();
-  DEBUG(dbgs() << "\tspilled: " << StoreIdx << '\t' << *MI);
-  VNInfo *StoreVNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
-  NewLI.addRange(LiveRange(Idx, StoreIdx, StoreVNI));
+
+  MachineInstrSpan MIS(MI);
+  TII.storeRegToStackSlot(MBB, llvm::next(MI), NewVReg, isKill, StackSlot,
+                          MRI.getRegClass(NewVReg), &TRI);
+
+  LIS.InsertMachineInstrRangeInMaps(llvm::next(MI), MIS.end());
+
+  DEBUG(dumpMachineInstrRangeWithSlotIndex(llvm::next(MI), MIS.end(), LIS,
+                                           "spill"));
   ++NumSpills;
 }
 
@@ -1152,7 +1194,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
     // Debug values are not allowed to affect codegen.
     if (MI->isDebugValue()) {
       // Modify DBG_VALUE now that the value is in a spill slot.
-      bool IsIndirect = MI->getOperand(1).isImm();
+      bool IsIndirect = MI->isIndirectDebugValue();
       uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
       const MDNode *MDPtr = MI->getOperand(2).getMetadata();
       DebugLoc DL = MI->getDebugLoc();
@@ -1212,19 +1254,18 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
     if (foldMemoryOperand(Ops))
       continue;
 
-    // Allocate interval around instruction.
+    // Create a new virtual register for spill/fill.
     // FIXME: Infer regclass from instruction alone.
-    LiveInterval &NewLI = Edit->createFrom(Reg);
-    NewLI.markNotSpillable();
+    unsigned NewVReg = Edit->createFrom(Reg);
 
     if (RI.Reads)
-      insertReload(NewLI, Idx, MI);
+      insertReload(NewVReg, Idx, MI);
 
     // Rewrite instruction operands.
     bool hasLiveDef = false;
     for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
       MachineOperand &MO = Ops[i].first->getOperand(Ops[i].second);
-      MO.setReg(NewLI.reg);
+      MO.setReg(NewVReg);
       if (MO.isUse()) {
         if (!Ops[i].first->isRegTiedToDefOperand(Ops[i].second))
           MO.setIsKill();
@@ -1233,21 +1274,12 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
           hasLiveDef = true;
       }
     }
-    DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI);
+    DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI << '\n');
 
     // FIXME: Use a second vreg if instruction has no tied ops.
-    if (RI.Writes) {
+    if (RI.Writes)
       if (hasLiveDef)
-        insertSpill(NewLI, OldLI, Idx, MI);
-      else {
-        // This instruction defines a dead value.  We don't need to spill it,
-        // but do create a live range for the dead value.
-        VNInfo *VNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
-        NewLI.addRange(LiveRange(Idx, Idx.getDeadSlot(), VNI));
-      }
-    }
-
-    DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
+        insertSpill(NewVReg, true, MI);
   }
 }
 
@@ -1266,8 +1298,8 @@ void InlineSpiller::spillAll() {
 
   assert(StackInt->getNumValNums() == 1 && "Bad stack interval values");
   for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i)
-    StackInt->MergeRangesInAsValue(LIS.getInterval(RegsToSpill[i]),
-                                   StackInt->getValNumInfo(0));
+    StackInt->MergeSegmentsInAsValue(LIS.getInterval(RegsToSpill[i]),
+                                     StackInt->getValNumInfo(0));
   DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
 
   // Spill around uses of all RegsToSpill.
@@ -1308,8 +1340,8 @@ void InlineSpiller::spill(LiveRangeEdit &edit) {
 
   DEBUG(dbgs() << "Inline spilling "
                << MRI.getRegClass(edit.getReg())->getName()
-               << ':' << PrintReg(edit.getReg()) << ' ' << edit.getParent()
-               << "\nFrom original " << LIS.getInterval(Original) << '\n');
+               << ':' << edit.getParent()
+               << "\nFrom original " << PrintReg(Original) << '\n');
   assert(edit.getParent().isSpillable() &&
          "Attempting to spill already spilled value.");
   assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");
diff --git a/lib/CodeGen/InterferenceCache.cpp b/lib/CodeGen/InterferenceCache.cpp
index a8e711e..427225d 100644
--- a/lib/CodeGen/InterferenceCache.cpp
+++ b/lib/CodeGen/InterferenceCache.cpp
@@ -204,11 +204,11 @@ void InterferenceCache::Entry::update(unsigned MBBNum) {
   // Fixed interference.
   for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
     LiveInterval::iterator &I = RegUnits[i].FixedI;
-    LiveInterval *LI = RegUnits[i].Fixed;
-    if (I == LI->end() || I->start >= Stop)
+    LiveRange *LR = RegUnits[i].Fixed;
+    if (I == LR->end() || I->start >= Stop)
       continue;
-    I = LI->advanceTo(I, Stop);
-    bool Backup = I == LI->end() || I->start >= Stop;
+    I = LR->advanceTo(I, Stop);
+    bool Backup = I == LR->end() || I->start >= Stop;
     if (Backup)
       --I;
     SlotIndex StopI = I->end;
diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h
index c02fb9a..800f705 100644
--- a/lib/CodeGen/InterferenceCache.h
+++ b/lib/CodeGen/InterferenceCache.h
@@ -72,7 +72,7 @@ class InterferenceCache {
       unsigned VirtTag;
 
       /// Fixed interference in RegUnit.
-      LiveInterval *Fixed;
+      LiveRange *Fixed;
 
       /// Iterator pointing into the fixed RegUnit interference.
       LiveInterval::iterator FixedI;
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index d894f66..c38d4fb 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -485,11 +485,12 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
     break;
   }
   case Intrinsic::memset: {
-    Type *IntPtr = TD.getIntPtrType(Context);
+    Value *Op0 = CI->getArgOperand(0);
+    Type *IntPtr = TD.getIntPtrType(Op0->getType());
     Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
                                         /* isSigned */ false);
     Value *Ops[3];
-    Ops[0] = CI->getArgOperand(0);
+    Ops[0] = Op0;
     // Extend the amount to i32.
     Ops[1] = Builder.CreateIntCast(CI->getArgOperand(1),
                                    Type::getInt32Ty(Context),
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 6c9b2e5..ad2c553 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -175,12 +175,11 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
 
     // Create a code emitter if asked to show the encoding.
     MCCodeEmitter *MCE = 0;
-    MCAsmBackend *MAB = 0;
-    if (ShowMCEncoding) {
+    if (ShowMCEncoding)
       MCE = getTarget().createMCCodeEmitter(MII, MRI, STI, *Context);
-      MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
-    }
 
+    MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(),
+                                                       TargetCPU);
     MCStreamer *S = getTarget().createAsmStreamer(*Context, Out,
                                                   getVerboseAsm(),
                                                   hasMCUseLoc(),
@@ -197,7 +196,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
     // emission fails.
     MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, MRI, STI,
                                                          *Context);
-    MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(),
+    MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(),
                                                        TargetCPU);
     if (MCE == 0 || MAB == 0)
       return true;
@@ -232,7 +231,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
 /// addPassesToEmitMachineCode - Add passes to the specified pass manager to
 /// get machine code emitted.  This uses a JITCodeEmitter object to handle
 /// actually outputting the machine code and resolving things like the address
-/// of functions.  This method should returns true if machine code emission is
+/// of functions.  This method should return true if machine code emission is
 /// not supported.
 ///
 bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM,
@@ -271,7 +270,8 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
   const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
   MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
                                                        STI, *Ctx);
-  MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
+  MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(),
+                                                     TargetCPU);
   if (MCE == 0 || MAB == 0)
     return true;
 
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 85bed46..25645e0 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -131,7 +131,8 @@ class UserValue {
 
   /// splitLocation - Replace OldLocNo ranges with NewRegs ranges where NewRegs
   /// is live. Returns true if any changes were made.
-  bool splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+  bool splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+                     LiveIntervals &LIS);
 
 public:
   /// UserValue - Create a new UserValue.
@@ -219,13 +220,13 @@ public:
   /// End points where VNI is no longer live are added to Kills.
   /// @param Idx   Starting point for the definition.
   /// @param LocNo Location number to propagate.
-  /// @param LI    Restrict liveness to where LI has the value VNI. May be null.
-  /// @param VNI   When LI is not null, this is the value to restrict to.
+  /// @param LR    Restrict liveness to where LR has the value VNI. May be null.
+  /// @param VNI   When LR is not null, this is the value to restrict to.
   /// @param Kills Append end points of VNI's live range to Kills.
   /// @param LIS   Live intervals analysis.
   /// @param MDT   Dominator tree.
   void extendDef(SlotIndex Idx, unsigned LocNo,
-                 LiveInterval *LI, const VNInfo *VNI,
+                 LiveRange *LR, const VNInfo *VNI,
                  SmallVectorImpl<SlotIndex> *Kills,
                  LiveIntervals &LIS, MachineDominatorTree &MDT,
                  UserValueScopes &UVS);
@@ -251,7 +252,8 @@ public:
 
   /// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is
   /// live. Returns true if any changes were made.
-  bool splitRegister(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+  bool splitRegister(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+                     LiveIntervals &LIS);
 
   /// rewriteLocations - Rewrite virtual register locations according to the
   /// provided virtual register map.
@@ -345,7 +347,7 @@ public:
   void mapVirtReg(unsigned VirtReg, UserValue *EC);
 
   /// splitRegister -  Replace all references to OldReg with NewRegs.
-  void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+  void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs);
 
   /// emitDebugValues - Recreate DBG_VALUE instruction from data structures.
   void emitDebugValues(VirtRegMap *VRM);
@@ -455,9 +457,10 @@ bool LDVImpl::handleDebugValue(MachineInstr *MI, SlotIndex Idx) {
   }
 
   // Get or create the UserValue for (variable,offset).
-  bool IsIndirect = MI->getOperand(1).isImm();
+  bool IsIndirect = MI->isIndirectDebugValue();
   unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
   const MDNode *Var = MI->getOperand(2).getMetadata();
+  //here.
   UserValue *UV = getUserValue(Var, Offset, IsIndirect, MI->getDebugLoc());
   UV->addDef(Idx, MI->getOperand(0));
   return true;
@@ -492,7 +495,7 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) {
 }
 
 void UserValue::extendDef(SlotIndex Idx, unsigned LocNo,
-                          LiveInterval *LI, const VNInfo *VNI,
+                          LiveRange *LR, const VNInfo *VNI,
                           SmallVectorImpl<SlotIndex> *Kills,
                           LiveIntervals &LIS, MachineDominatorTree &MDT,
                           UserValueScopes &UVS) {
@@ -506,15 +509,15 @@ void UserValue::extendDef(SlotIndex Idx, unsigned LocNo,
 
     // Limit to VNI's live range.
     bool ToEnd = true;
-    if (LI && VNI) {
-      LiveRange *Range = LI->getLiveRangeContaining(Start);
-      if (!Range || Range->valno != VNI) {
+    if (LR && VNI) {
+      LiveInterval::Segment *Segment = LR->getSegmentContaining(Start);
+      if (!Segment || Segment->valno != VNI) {
         if (Kills)
           Kills->push_back(Start);
         continue;
       }
-      if (Range->end < Stop)
-        Stop = Range->end, ToEnd = false;
+      if (Segment->end < Stop)
+        Stop = Segment->end, ToEnd = false;
     }
 
     // There could already be a short def at Start.
@@ -666,10 +669,10 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI,
 
     // For physregs, use the live range of the first regunit as a guide.
     unsigned Unit = *MCRegUnitIterator(Loc.getReg(), &TRI);
-    LiveInterval *LI = &LIS.getRegUnit(Unit);
-    const VNInfo *VNI = LI->getVNInfoAt(Idx);
+    LiveRange *LR = &LIS.getRegUnit(Unit);
+    const VNInfo *VNI = LR->getVNInfoAt(Idx);
     // Don't track copies from physregs, it is too expensive.
-    extendDef(Idx, LocNo, LI, VNI, 0, LIS, MDT, UVS);
+    extendDef(Idx, LocNo, LR, VNI, 0, LIS, MDT, UVS);
   }
 
   // Finally, erase all the undefs.
@@ -729,7 +732,8 @@ LiveDebugVariables::~LiveDebugVariables() {
 //===----------------------------------------------------------------------===//
 
 bool
-UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
+UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+                         LiveIntervals& LIS) {
   DEBUG({
     dbgs() << "Splitting Loc" << OldLocNo << '\t';
     print(dbgs(), 0);
@@ -738,7 +742,7 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
   LocMap::iterator LocMapI;
   LocMapI.setMap(locInts);
   for (unsigned i = 0; i != NewRegs.size(); ++i) {
-    LiveInterval *LI = NewRegs[i];
+    LiveInterval *LI = &LIS.getInterval(NewRegs[i]);
     if (LI->empty())
       continue;
 
@@ -827,7 +831,8 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
 }
 
 bool
-UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+UserValue::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+                         LiveIntervals &LIS) {
   bool DidChange = false;
   // Split locations referring to OldReg. Iterate backwards so splitLocation can
   // safely erase unused locations.
@@ -836,15 +841,15 @@ UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
     const MachineOperand *Loc = &locations[LocNo];
     if (!Loc->isReg() || Loc->getReg() != OldReg)
       continue;
-    DidChange |= splitLocation(LocNo, NewRegs);
+    DidChange |= splitLocation(LocNo, NewRegs, LIS);
   }
   return DidChange;
 }
 
-void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs) {
   bool DidChange = false;
   for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext())
-    DidChange |= UV->splitRegister(OldReg, NewRegs);
+    DidChange |= UV->splitRegister(OldReg, NewRegs, *LIS);
 
   if (!DidChange)
     return;
@@ -852,11 +857,11 @@ void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
   // Map all of the new virtual registers.
   UserValue *UV = lookupVirtReg(OldReg);
   for (unsigned i = 0; i != NewRegs.size(); ++i)
-    mapVirtReg(NewRegs[i]->reg, UV);
+    mapVirtReg(NewRegs[i], UV);
 }
 
 void LiveDebugVariables::
-splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs, LiveIntervals &LIS) {
   if (pImpl)
     static_cast<LDVImpl*>(pImpl)->splitRegister(OldReg, NewRegs);
 }
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index 3ce3c39..58a3f0f 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -27,6 +27,7 @@
 namespace llvm {
 
 class LiveInterval;
+class LiveIntervals;
 class VirtRegMap;
 
 class LiveDebugVariables : public MachineFunctionPass {
@@ -47,7 +48,8 @@ public:
   /// splitRegister - Move any user variables in OldReg to the live ranges in
   /// NewRegs where they are live. Mark the values as unavailable where no new
   /// register is live.
-  void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+  void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+                     LiveIntervals &LIS);
 
   /// emitDebugValues - Emit new DBG_VALUE instructions reflecting the changes
   /// that happened during register allocation.
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 6be6bf3..2b8feb8 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -9,12 +9,12 @@
 //
 // This file implements the LiveRange and LiveInterval classes.  Given some
 // numbering of each the machine instructions an interval [i, j) is said to be a
-// live interval for register v if there is no instruction with number j' > j
+// live range for register v if there is no instruction with number j' >= j
 // such that v is live at j' and there is no instruction with number i' < i such
-// that v is live at i'. In this implementation intervals can have holes,
-// i.e. an interval might look like [1,20), [50,65), [1000,1001).  Each
-// individual range is represented as an instance of LiveRange, and the whole
-// interval is represented as an instance of LiveInterval.
+// that v is live at i'. In this implementation ranges can have holes,
+// i.e. a range might look like [1,20), [50,65), [1000,1001).  Each
+// individual segment is represented as an instance of LiveRange::Segment,
+// and the whole range is represented as an instance of LiveRange.
 //
 //===----------------------------------------------------------------------===//
 
@@ -31,14 +31,14 @@
 #include <algorithm>
 using namespace llvm;
 
-LiveInterval::iterator LiveInterval::find(SlotIndex Pos) {
+LiveRange::iterator LiveRange::find(SlotIndex Pos) {
   // This algorithm is basically std::upper_bound.
   // Unfortunately, std::upper_bound cannot be used with mixed types until we
   // adopt C++0x. Many libraries can do it, but not all.
   if (empty() || Pos >= endIndex())
     return end();
   iterator I = begin();
-  size_t Len = ranges.size();
+  size_t Len = size();
   do {
     size_t Mid = Len >> 1;
     if (Pos < I[Mid].end)
@@ -49,13 +49,13 @@ LiveInterval::iterator LiveInterval::find(SlotIndex Pos) {
   return I;
 }
 
-VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
-                                    VNInfo::Allocator &VNInfoAllocator) {
+VNInfo *LiveRange::createDeadDef(SlotIndex Def,
+                                  VNInfo::Allocator &VNInfoAllocator) {
   assert(!Def.isDead() && "Cannot define a value at the dead slot");
   iterator I = find(Def);
   if (I == end()) {
     VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
-    ranges.push_back(LiveRange(Def, Def.getDeadSlot(), VNI));
+    segments.push_back(Segment(Def, Def.getDeadSlot(), VNI));
     return VNI;
   }
   if (SlotIndex::isSameInstr(Def, I->start)) {
@@ -73,11 +73,11 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
   }
   assert(SlotIndex::isEarlierInstr(Def, I->start) && "Already live at def");
   VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
-  ranges.insert(I, LiveRange(Def, Def.getDeadSlot(), VNI));
+  segments.insert(I, Segment(Def, Def.getDeadSlot(), VNI));
   return VNI;
 }
 
-// overlaps - Return true if the intersection of the two live intervals is
+// overlaps - Return true if the intersection of the two live ranges is
 // not empty.
 //
 // An example for overlaps():
@@ -86,7 +86,7 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
 // 4: B = ...
 // 8: C = A + B ;; last use of A
 //
-// The live intervals should look like:
+// The live ranges should look like:
 //
 // A = [3, 11)
 // B = [7, x)
@@ -95,9 +95,9 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
 // A->overlaps(C) should return false since we want to be able to join
 // A and C.
 //
-bool LiveInterval::overlapsFrom(const LiveInterval& other,
-                                const_iterator StartPos) const {
-  assert(!empty() && "empty interval");
+bool LiveRange::overlapsFrom(const LiveRange& other,
+                             const_iterator StartPos) const {
+  assert(!empty() && "empty range");
   const_iterator i = begin();
   const_iterator ie = end();
   const_iterator j = StartPos;
@@ -108,13 +108,13 @@ bool LiveInterval::overlapsFrom(const LiveInterval& other,
 
   if (i->start < j->start) {
     i = std::upper_bound(i, ie, j->start);
-    if (i != ranges.begin()) --i;
+    if (i != begin()) --i;
   } else if (j->start < i->start) {
     ++StartPos;
     if (StartPos != other.end() && StartPos->start <= i->start) {
       assert(StartPos < other.end() && i < end());
       j = std::upper_bound(j, je, i->start);
-      if (j != other.ranges.begin()) --j;
+      if (j != other.begin()) --j;
     }
   } else {
     return true;
@@ -136,10 +136,9 @@ bool LiveInterval::overlapsFrom(const LiveInterval& other,
   return false;
 }
 
-bool LiveInterval::overlaps(const LiveInterval &Other,
-                            const CoalescerPair &CP,
-                            const SlotIndexes &Indexes) const {
-  assert(!empty() && "empty interval");
+bool LiveRange::overlaps(const LiveRange &Other, const CoalescerPair &CP,
+                         const SlotIndexes &Indexes) const {
+  assert(!empty() && "empty range");
   if (Other.empty())
     return false;
 
@@ -178,9 +177,9 @@ bool LiveInterval::overlaps(const LiveInterval &Other,
   }
 }
 
-/// overlaps - Return true if the live interval overlaps a range specified
+/// overlaps - Return true if the live range overlaps an interval specified
 /// by [Start, End).
-bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const {
+bool LiveRange::overlaps(SlotIndex Start, SlotIndex End) const {
   assert(Start < End && "Invalid range");
   const_iterator I = std::lower_bound(begin(), end(), End);
   return I != begin() && (--I)->end > Start;
@@ -190,7 +189,7 @@ bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const {
 /// ValNo is dead, remove it.  If it is the largest value number, just nuke it
 /// (and any other deleted values neighboring it), otherwise mark it as ~1U so
 /// it can be nuked later.
-void LiveInterval::markValNoForDeletion(VNInfo *ValNo) {
+void LiveRange::markValNoForDeletion(VNInfo *ValNo) {
   if (ValNo->id == getNumValNums()-1) {
     do {
       valnos.pop_back();
@@ -202,137 +201,135 @@ void LiveInterval::markValNoForDeletion(VNInfo *ValNo) {
 
 /// RenumberValues - Renumber all values in order of appearance and delete the
 /// remaining unused values.
-void LiveInterval::RenumberValues(LiveIntervals &lis) {
+void LiveRange::RenumberValues() {
   SmallPtrSet<VNInfo*, 8> Seen;
   valnos.clear();
   for (const_iterator I = begin(), E = end(); I != E; ++I) {
     VNInfo *VNI = I->valno;
     if (!Seen.insert(VNI))
       continue;
-    assert(!VNI->isUnused() && "Unused valno used by live range");
+    assert(!VNI->isUnused() && "Unused valno used by live segment");
     VNI->id = (unsigned)valnos.size();
     valnos.push_back(VNI);
   }
 }
 
-/// extendIntervalEndTo - This method is used when we want to extend the range
-/// specified by I to end at the specified endpoint.  To do this, we should
-/// merge and eliminate all ranges that this will overlap with.  The iterator is
-/// not invalidated.
-void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) {
-  assert(I != ranges.end() && "Not a valid interval!");
+/// This method is used when we want to extend the segment specified by I to end
+/// at the specified endpoint.  To do this, we should merge and eliminate all
+/// segments that this will overlap with.  The iterator is not invalidated.
+void LiveRange::extendSegmentEndTo(iterator I, SlotIndex NewEnd) {
+  assert(I != end() && "Not a valid segment!");
   VNInfo *ValNo = I->valno;
 
-  // Search for the first interval that we can't merge with.
-  Ranges::iterator MergeTo = llvm::next(I);
-  for (; MergeTo != ranges.end() && NewEnd >= MergeTo->end; ++MergeTo) {
+  // Search for the first segment that we can't merge with.
+  iterator MergeTo = llvm::next(I);
+  for (; MergeTo != end() && NewEnd >= MergeTo->end; ++MergeTo) {
     assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
   }
 
-  // If NewEnd was in the middle of an interval, make sure to get its endpoint.
+  // If NewEnd was in the middle of a segment, make sure to get its endpoint.
   I->end = std::max(NewEnd, prior(MergeTo)->end);
 
-  // If the newly formed range now touches the range after it and if they have
-  // the same value number, merge the two ranges into one range.
-  if (MergeTo != ranges.end() && MergeTo->start <= I->end &&
+  // If the newly formed segment now touches the segment after it and if they
+  // have the same value number, merge the two segments into one segment.
+  if (MergeTo != end() && MergeTo->start <= I->end &&
       MergeTo->valno == ValNo) {
     I->end = MergeTo->end;
     ++MergeTo;
   }
 
-  // Erase any dead ranges.
-  ranges.erase(llvm::next(I), MergeTo);
+  // Erase any dead segments.
+  segments.erase(llvm::next(I), MergeTo);
 }
 
 
-/// extendIntervalStartTo - This method is used when we want to extend the range
-/// specified by I to start at the specified endpoint.  To do this, we should
-/// merge and eliminate all ranges that this will overlap with.
-LiveInterval::Ranges::iterator
-LiveInterval::extendIntervalStartTo(Ranges::iterator I, SlotIndex NewStart) {
-  assert(I != ranges.end() && "Not a valid interval!");
+/// This method is used when we want to extend the segment specified by I to
+/// start at the specified endpoint.  To do this, we should merge and eliminate
+/// all segments that this will overlap with.
+LiveRange::iterator
+LiveRange::extendSegmentStartTo(iterator I, SlotIndex NewStart) {
+  assert(I != end() && "Not a valid segment!");
   VNInfo *ValNo = I->valno;
 
-  // Search for the first interval that we can't merge with.
-  Ranges::iterator MergeTo = I;
+  // Search for the first segment that we can't merge with.
+  iterator MergeTo = I;
   do {
-    if (MergeTo == ranges.begin()) {
+    if (MergeTo == begin()) {
       I->start = NewStart;
-      ranges.erase(MergeTo, I);
+      segments.erase(MergeTo, I);
       return I;
     }
     assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
     --MergeTo;
   } while (NewStart <= MergeTo->start);
 
-  // If we start in the middle of another interval, just delete a range and
-  // extend that interval.
+  // If we start in the middle of another segment, just delete a range and
+  // extend that segment.
   if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
     MergeTo->end = I->end;
   } else {
-    // Otherwise, extend the interval right after.
+    // Otherwise, extend the segment right after.
     ++MergeTo;
     MergeTo->start = NewStart;
     MergeTo->end = I->end;
   }
 
-  ranges.erase(llvm::next(MergeTo), llvm::next(I));
+  segments.erase(llvm::next(MergeTo), llvm::next(I));
   return MergeTo;
 }
 
-LiveInterval::iterator
-LiveInterval::addRangeFrom(LiveRange LR, iterator From) {
-  SlotIndex Start = LR.start, End = LR.end;
-  iterator it = std::upper_bound(From, ranges.end(), Start);
+LiveRange::iterator LiveRange::addSegmentFrom(Segment S, iterator From) {
+  SlotIndex Start = S.start, End = S.end;
+  iterator it = std::upper_bound(From, end(), Start);
 
-  // If the inserted interval starts in the middle or right at the end of
-  // another interval, just extend that interval to contain the range of LR.
-  if (it != ranges.begin()) {
+  // If the inserted segment starts in the middle or right at the end of
+  // another segment, just extend that segment to contain the segment of S.
+  if (it != begin()) {
     iterator B = prior(it);
-    if (LR.valno == B->valno) {
+    if (S.valno == B->valno) {
       if (B->start <= Start && B->end >= Start) {
-        extendIntervalEndTo(B, End);
+        extendSegmentEndTo(B, End);
         return B;
       }
     } else {
-      // Check to make sure that we are not overlapping two live ranges with
+      // Check to make sure that we are not overlapping two live segments with
       // different valno's.
       assert(B->end <= Start &&
-             "Cannot overlap two LiveRanges with differing ValID's"
+             "Cannot overlap two segments with differing ValID's"
              " (did you def the same reg twice in a MachineInstr?)");
     }
   }
 
-  // Otherwise, if this range ends in the middle of, or right next to, another
-  // interval, merge it into that interval.
-  if (it != ranges.end()) {
-    if (LR.valno == it->valno) {
+  // Otherwise, if this segment ends in the middle of, or right next to, another
+  // segment, merge it into that segment.
+  if (it != end()) {
+    if (S.valno == it->valno) {
       if (it->start <= End) {
-        it = extendIntervalStartTo(it, Start);
+        it = extendSegmentStartTo(it, Start);
 
-        // If LR is a complete superset of an interval, we may need to grow its
+        // If S is a complete superset of a segment, we may need to grow its
         // endpoint as well.
         if (End > it->end)
-          extendIntervalEndTo(it, End);
+          extendSegmentEndTo(it, End);
         return it;
       }
     } else {
-      // Check to make sure that we are not overlapping two live ranges with
+      // Check to make sure that we are not overlapping two live segments with
       // different valno's.
       assert(it->start >= End &&
-             "Cannot overlap two LiveRanges with differing ValID's");
+             "Cannot overlap two segments with differing ValID's");
     }
   }
 
-  // Otherwise, this is just a new range that doesn't interact with anything.
+  // Otherwise, this is just a new segment that doesn't interact with anything.
   // Insert it.
-  return ranges.insert(it, LR);
+  return segments.insert(it, S);
 }
 
-/// extendInBlock - If this interval is live before Kill in the basic
+/// extendInBlock - If this range is live before Kill in the basic
 /// block that starts at StartIdx, extend it to be live up to Kill and return
 /// the value. If there is no live range before Kill, return NULL.
-VNInfo *LiveInterval::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
+VNInfo *LiveRange::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
   if (empty())
     return 0;
   iterator I = std::upper_bound(begin(), end(), Kill.getPrevSlot());
@@ -342,20 +339,21 @@ VNInfo *LiveInterval::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
   if (I->end <= StartIdx)
     return 0;
   if (I->end < Kill)
-    extendIntervalEndTo(I, Kill);
+    extendSegmentEndTo(I, Kill);
   return I->valno;
 }
 
-/// removeRange - Remove the specified range from this interval.  Note that
-/// the range must be in a single LiveRange in its entirety.
-void LiveInterval::removeRange(SlotIndex Start, SlotIndex End,
-                               bool RemoveDeadValNo) {
-  // Find the LiveRange containing this span.
-  Ranges::iterator I = find(Start);
-  assert(I != ranges.end() && "Range is not in interval!");
-  assert(I->containsRange(Start, End) && "Range is not entirely in interval!");
+/// Remove the specified segment from this range.  Note that the segment must
+/// be in a single Segment in its entirety.
+void LiveRange::removeSegment(SlotIndex Start, SlotIndex End,
+                              bool RemoveDeadValNo) {
+  // Find the Segment containing this span.
+  iterator I = find(Start);
+  assert(I != end() && "Segment is not in range!");
+  assert(I->containsInterval(Start, End)
+         && "Segment is not entirely in range!");
 
-  // If the span we are removing is at the start of the LiveRange, adjust it.
+  // If the span we are removing is at the start of the Segment, adjust it.
   VNInfo *ValNo = I->valno;
   if (I->start == Start) {
     if (I->end == End) {
@@ -373,54 +371,50 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End,
         }
       }
 
-      ranges.erase(I);  // Removed the whole LiveRange.
+      segments.erase(I);  // Removed the whole Segment.
     } else
       I->start = End;
     return;
   }
 
-  // Otherwise if the span we are removing is at the end of the LiveRange,
+  // Otherwise if the span we are removing is at the end of the Segment,
   // adjust the other way.
   if (I->end == End) {
     I->end = Start;
     return;
   }
 
-  // Otherwise, we are splitting the LiveRange into two pieces.
+  // Otherwise, we are splitting the Segment into two pieces.
   SlotIndex OldEnd = I->end;
-  I->end = Start;   // Trim the old interval.
+  I->end = Start;   // Trim the old segment.
 
   // Insert the new one.
-  ranges.insert(llvm::next(I), LiveRange(End, OldEnd, ValNo));
+  segments.insert(llvm::next(I), Segment(End, OldEnd, ValNo));
 }
 
-/// removeValNo - Remove all the ranges defined by the specified value#.
+/// removeValNo - Remove all the segments defined by the specified value#.
 /// Also remove the value# from value# list.
-void LiveInterval::removeValNo(VNInfo *ValNo) {
+void LiveRange::removeValNo(VNInfo *ValNo) {
   if (empty()) return;
-  Ranges::iterator I = ranges.end();
-  Ranges::iterator E = ranges.begin();
+  iterator I = end();
+  iterator E = begin();
   do {
     --I;
     if (I->valno == ValNo)
-      ranges.erase(I);
+      segments.erase(I);
   } while (I != E);
   // Now that ValNo is dead, remove it.
   markValNoForDeletion(ValNo);
 }
 
-/// join - Join two live intervals (this, and other) together.  This applies
-/// mappings to the value numbers in the LHS/RHS intervals as specified.  If
-/// the intervals are not joinable, this aborts.
-void LiveInterval::join(LiveInterval &Other,
-                        const int *LHSValNoAssignments,
-                        const int *RHSValNoAssignments,
-                        SmallVectorImpl<VNInfo *> &NewVNInfo,
-                        MachineRegisterInfo *MRI) {
+void LiveRange::join(LiveRange &Other,
+                     const int *LHSValNoAssignments,
+                     const int *RHSValNoAssignments,
+                     SmallVectorImpl<VNInfo *> &NewVNInfo) {
   verify();
 
-  // Determine if any of our live range values are mapped.  This is uncommon, so
-  // we want to avoid the interval scan if not.
+  // Determine if any of our values are mapped.  This is uncommon, so we want
+  // to avoid the range scan if not.
   bool MustMapCurValNos = false;
   unsigned NumVals = getNumValNums();
   unsigned NumNewVals = NewVNInfo.size();
@@ -433,8 +427,7 @@ void LiveInterval::join(LiveInterval &Other,
     }
   }
 
-  // If we have to apply a mapping to our base interval assignment, rewrite it
-  // now.
+  // If we have to apply a mapping to our base range assignment, rewrite it now.
   if (MustMapCurValNos && !empty()) {
     // Map the first live range.
 
@@ -445,12 +438,12 @@ void LiveInterval::join(LiveInterval &Other,
       assert(nextValNo != 0 && "Huh?");
 
       // If this live range has the same value # as its immediate predecessor,
-      // and if they are neighbors, remove one LiveRange.  This happens when we
+      // and if they are neighbors, remove one Segment.  This happens when we
       // have [0,4:0)[4,7:1) and map 0/1 onto the same value #.
       if (OutIt->valno == nextValNo && OutIt->end == I->start) {
         OutIt->end = I->end;
       } else {
-        // Didn't merge. Move OutIt to the next interval,
+        // Didn't merge. Move OutIt to the next segment,
         ++OutIt;
         OutIt->valno = nextValNo;
         if (OutIt != I) {
@@ -459,9 +452,9 @@ void LiveInterval::join(LiveInterval &Other,
         }
       }
     }
-    // If we merge some live ranges, chop off the end.
+    // If we merge some segments, chop off the end.
     ++OutIt;
-    ranges.erase(OutIt, end());
+    segments.erase(OutIt, end());
   }
 
   // Rewrite Other values before changing the VNInfo ids.
@@ -472,7 +465,7 @@ void LiveInterval::join(LiveInterval &Other,
     I->valno = NewVNInfo[RHSValNoAssignments[I->valno->id]];
 
   // Update val# info. Renumber them and make sure they all belong to this
-  // LiveInterval now. Also remove dead val#'s.
+  // LiveRange now. Also remove dead val#'s.
   unsigned NumValNos = 0;
   for (unsigned i = 0; i < NumNewVals; ++i) {
     VNInfo *VNI = NewVNInfo[i];
@@ -487,31 +480,31 @@ void LiveInterval::join(LiveInterval &Other,
   if (NumNewVals < NumVals)
     valnos.resize(NumNewVals);  // shrinkify
 
-  // Okay, now insert the RHS live ranges into the LHS.
+  // Okay, now insert the RHS live segments into the LHS.
   LiveRangeUpdater Updater(this);
   for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
     Updater.add(*I);
 }
 
-/// MergeRangesInAsValue - Merge all of the intervals in RHS into this live
-/// interval as the specified value number.  The LiveRanges in RHS are
-/// allowed to overlap with LiveRanges in the current interval, but only if
-/// the overlapping LiveRanges have the specified value number.
-void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS,
-                                        VNInfo *LHSValNo) {
+/// Merge all of the segments in RHS into this live range as the specified
+/// value number.  The segments in RHS are allowed to overlap with segments in
+/// the current range, but only if the overlapping segments have the
+/// specified value number.
+void LiveRange::MergeSegmentsInAsValue(const LiveRange &RHS,
+                                       VNInfo *LHSValNo) {
   LiveRangeUpdater Updater(this);
   for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
     Updater.add(I->start, I->end, LHSValNo);
 }
 
-/// MergeValueInAsValue - Merge all of the live ranges of a specific val#
-/// in RHS into this live interval as the specified value number.
-/// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
-/// current interval, it will replace the value numbers of the overlaped
-/// live ranges with the specified value number.
-void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
-                                       const VNInfo *RHSValNo,
-                                       VNInfo *LHSValNo) {
+/// MergeValueInAsValue - Merge all of the live segments of a specific val#
+/// in RHS into this live range as the specified value number.
+/// The segments in RHS are allowed to overlap with segments in the
+/// current range, it will replace the value numbers of the overlaped
+/// segments with the specified value number.
+void LiveRange::MergeValueInAsValue(const LiveRange &RHS,
+                                    const VNInfo *RHSValNo,
+                                    VNInfo *LHSValNo) {
   LiveRangeUpdater Updater(this);
   for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
     if (I->valno == RHSValNo)
@@ -520,9 +513,9 @@ void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
 
 /// MergeValueNumberInto - This method is called when two value nubmers
 /// are found to be equivalent.  This eliminates V1, replacing all
-/// LiveRanges with the V1 value number with the V2 value number.  This can
+/// segments with the V1 value number with the V2 value number.  This can
 /// cause merging of V1/V2 values numbers and compaction of the value space.
-VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
+VNInfo *LiveRange::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
   assert(V1 != V2 && "Identical value#'s are always equivalent!");
 
   // This code actually merges the (numerically) larger value number into the
@@ -536,37 +529,37 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
     std::swap(V1, V2);
   }
 
-  // Merge V1 live ranges into V2.
+  // Merge V1 segments into V2.
   for (iterator I = begin(); I != end(); ) {
-    iterator LR = I++;
-    if (LR->valno != V1) continue;  // Not a V1 LiveRange.
+    iterator S = I++;
+    if (S->valno != V1) continue;  // Not a V1 Segment.
 
     // Okay, we found a V1 live range.  If it had a previous, touching, V2 live
     // range, extend it.
-    if (LR != begin()) {
-      iterator Prev = LR-1;
-      if (Prev->valno == V2 && Prev->end == LR->start) {
-        Prev->end = LR->end;
+    if (S != begin()) {
+      iterator Prev = S-1;
+      if (Prev->valno == V2 && Prev->end == S->start) {
+        Prev->end = S->end;
 
         // Erase this live-range.
-        ranges.erase(LR);
+        segments.erase(S);
         I = Prev+1;
-        LR = Prev;
+        S = Prev;
       }
     }
 
     // Okay, now we have a V1 or V2 live range that is maximally merged forward.
     // Ensure that it is a V2 live-range.
-    LR->valno = V2;
+    S->valno = V2;
 
-    // If we can merge it into later V2 live ranges, do so now.  We ignore any
-    // following V1 live ranges, as they will be merged in subsequent iterations
+    // If we can merge it into later V2 segments, do so now.  We ignore any
+    // following V1 segments, as they will be merged in subsequent iterations
     // of the loop.
     if (I != end()) {
-      if (I->start == LR->end && I->valno == V2) {
-        LR->end = I->end;
-        ranges.erase(I);
-        I = LR+1;
+      if (I->start == S->end && I->valno == V2) {
+        S->end = I->end;
+        segments.erase(I);
+        I = S+1;
       }
     }
   }
@@ -584,22 +577,21 @@ unsigned LiveInterval::getSize() const {
   return Sum;
 }
 
-raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange &LR) {
-  return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")";
+raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange::Segment &S) {
+  return os << '[' << S.start << ',' << S.end << ':' << S.valno->id << ")";
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LiveRange::dump() const {
+void LiveRange::Segment::dump() const {
   dbgs() << *this << "\n";
 }
 #endif
 
-void LiveInterval::print(raw_ostream &OS) const {
+void LiveRange::print(raw_ostream &OS) const {
   if (empty())
     OS << "EMPTY";
   else {
-    for (LiveInterval::Ranges::const_iterator I = ranges.begin(),
-           E = ranges.end(); I != E; ++I) {
+    for (const_iterator I = begin(), E = end(); I != E; ++I) {
       OS << *I;
       assert(I->valno == getValNumInfo(I->valno->id) && "Bad VNInfo");
     }
@@ -625,19 +617,29 @@ void LiveInterval::print(raw_ostream &OS) const {
   }
 }
 
+void LiveInterval::print(raw_ostream &OS) const {
+  OS << PrintReg(reg) << ' ';
+  super::print(OS);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LiveRange::dump() const {
+  dbgs() << *this << "\n";
+}
+
 void LiveInterval::dump() const {
   dbgs() << *this << "\n";
 }
 #endif
 
 #ifndef NDEBUG
-void LiveInterval::verify() const {
+void LiveRange::verify() const {
   for (const_iterator I = begin(), E = end(); I != E; ++I) {
     assert(I->start.isValid());
     assert(I->end.isValid());
     assert(I->start < I->end);
     assert(I->valno != 0);
+    assert(I->valno->id < valnos.size());
     assert(I->valno == valnos[I->valno->id]);
     if (llvm::next(I) != E) {
       assert(I->end <= llvm::next(I)->start);
@@ -649,10 +651,6 @@ void LiveInterval::verify() const {
 #endif
 
 
-void LiveRange::print(raw_ostream &os) const {
-  os << *this;
-}
-
 //===----------------------------------------------------------------------===//
 //                           LiveRangeUpdater class
 //===----------------------------------------------------------------------===//
@@ -665,11 +663,11 @@ void LiveRange::print(raw_ostream &os) const {
 //
 // Otherwise, segments are kept in three separate areas:
 //
-// 1. [begin; WriteI) at the front of LI.
-// 2. [ReadI; end) at the back of LI.
+// 1. [begin; WriteI) at the front of LR.
+// 2. [ReadI; end) at the back of LR.
 // 3. Spills.
 //
-// - LI.begin() <= WriteI <= ReadI <= LI.end().
+// - LR.begin() <= WriteI <= ReadI <= LR.end().
 // - Segments in all three areas are fully ordered and coalesced.
 // - Segments in area 1 precede and can't coalesce with segments in area 2.
 // - Segments in Spills precede and can't coalesce with segments in area 2.
@@ -684,23 +682,23 @@ void LiveRange::print(raw_ostream &os) const {
 
 void LiveRangeUpdater::print(raw_ostream &OS) const {
   if (!isDirty()) {
-    if (LI)
-      OS << "Clean " << PrintReg(LI->reg) << " updater: " << *LI << '\n';
+    if (LR)
+      OS << "Clean updater: " << *LR << '\n';
     else
       OS << "Null updater.\n";
     return;
   }
-  assert(LI && "Can't have null LI in dirty updater.");
-  OS << PrintReg(LI->reg) << " updater with gap = " << (ReadI - WriteI)
+  assert(LR && "Can't have null LR in dirty updater.");
+  OS << " updater with gap = " << (ReadI - WriteI)
      << ", last start = " << LastStart
      << ":\n  Area 1:";
-  for (LiveInterval::const_iterator I = LI->begin(); I != WriteI; ++I)
+  for (LiveRange::const_iterator I = LR->begin(); I != WriteI; ++I)
     OS << ' ' << *I;
   OS << "\n  Spills:";
   for (unsigned I = 0, E = Spills.size(); I != E; ++I)
     OS << ' ' << Spills[I];
   OS << "\n  Area 2:";
-  for (LiveInterval::const_iterator I = ReadI, E = LI->end(); I != E; ++I)
+  for (LiveRange::const_iterator I = ReadI, E = LR->end(); I != E; ++I)
     OS << ' ' << *I;
   OS << '\n';
 }
@@ -711,8 +709,9 @@ void LiveRangeUpdater::dump() const
 }
 
 // Determine if A and B should be coalesced.
-static inline bool coalescable(const LiveRange &A, const LiveRange &B) {
-  assert(A.start <= B.start && "Unordered live ranges.");
+static inline bool coalescable(const LiveRange::Segment &A,
+                               const LiveRange::Segment &B) {
+  assert(A.start <= B.start && "Unordered live segments.");
   if (A.end == B.start)
     return A.valno == B.valno;
   if (A.end < B.start)
@@ -721,8 +720,8 @@ static inline bool coalescable(const LiveRange &A, const LiveRange &B) {
   return true;
 }
 
-void LiveRangeUpdater::add(LiveRange Seg) {
-  assert(LI && "Cannot add to a null destination");
+void LiveRangeUpdater::add(LiveRange::Segment Seg) {
+  assert(LR && "Cannot add to a null destination");
 
   // Flush the state if Start moves backwards.
   if (!LastStart.isValid() || LastStart > Seg.start) {
@@ -730,21 +729,21 @@ void LiveRangeUpdater::add(LiveRange Seg) {
       flush();
     // This brings us to an uninitialized state. Reinitialize.
     assert(Spills.empty() && "Leftover spilled segments");
-    WriteI = ReadI = LI->begin();
+    WriteI = ReadI = LR->begin();
   }
 
   // Remember start for next time.
   LastStart = Seg.start;
 
   // Advance ReadI until it ends after Seg.start.
-  LiveInterval::iterator E = LI->end();
+  LiveRange::iterator E = LR->end();
   if (ReadI != E && ReadI->end <= Seg.start) {
     // First try to close the gap between WriteI and ReadI with spills.
     if (ReadI != WriteI)
       mergeSpills();
     // Then advance ReadI.
     if (ReadI == WriteI)
-      ReadI = WriteI = LI->find(Seg.start);
+      ReadI = WriteI = LR->find(Seg.start);
     else
       while (ReadI != E && ReadI->end <= Seg.start)
         *WriteI++ = *ReadI++;
@@ -777,7 +776,7 @@ void LiveRangeUpdater::add(LiveRange Seg) {
   }
 
   // Try coalescing Seg into WriteI[-1].
-  if (WriteI != LI->begin() && coalescable(WriteI[-1], Seg)) {
+  if (WriteI != LR->begin() && coalescable(WriteI[-1], Seg)) {
     WriteI[-1].end = std::max(WriteI[-1].end, Seg.end);
     return;
   }
@@ -788,10 +787,10 @@ void LiveRangeUpdater::add(LiveRange Seg) {
     return;
   }
 
-  // Finally, append to LI or Spills.
+  // Finally, append to LR or Spills.
   if (WriteI == E) {
-    LI->ranges.push_back(Seg);
-    WriteI = ReadI = LI->ranges.end();
+    LR->segments.push_back(Seg);
+    WriteI = ReadI = LR->end();
   } else
     Spills.push_back(Seg);
 }
@@ -802,10 +801,10 @@ void LiveRangeUpdater::mergeSpills() {
   // Perform a backwards merge of Spills and [SpillI;WriteI).
   size_t GapSize = ReadI - WriteI;
   size_t NumMoved = std::min(Spills.size(), GapSize);
-  LiveInterval::iterator Src = WriteI;
-  LiveInterval::iterator Dst = Src + NumMoved;
-  LiveInterval::iterator SpillSrc = Spills.end();
-  LiveInterval::iterator B = LI->begin();
+  LiveRange::iterator Src = WriteI;
+  LiveRange::iterator Dst = Src + NumMoved;
+  LiveRange::iterator SpillSrc = Spills.end();
+  LiveRange::iterator B = LR->begin();
 
   // This is the new WriteI position after merging spills.
   WriteI = Dst;
@@ -827,12 +826,12 @@ void LiveRangeUpdater::flush() {
   // Clear the dirty state.
   LastStart = SlotIndex();
 
-  assert(LI && "Cannot add to a null destination");
+  assert(LR && "Cannot add to a null destination");
 
   // Nothing to merge?
   if (Spills.empty()) {
-    LI->ranges.erase(WriteI, ReadI);
-    LI->verify();
+    LR->segments.erase(WriteI, ReadI);
+    LR->verify();
     return;
   }
 
@@ -840,17 +839,17 @@ void LiveRangeUpdater::flush() {
   size_t GapSize = ReadI - WriteI;
   if (GapSize < Spills.size()) {
     // The gap is too small. Make some room.
-    size_t WritePos = WriteI - LI->begin();
-    LI->ranges.insert(ReadI, Spills.size() - GapSize, LiveRange());
+    size_t WritePos = WriteI - LR->begin();
+    LR->segments.insert(ReadI, Spills.size() - GapSize, LiveRange::Segment());
     // This also invalidated ReadI, but it is recomputed below.
-    WriteI = LI->ranges.begin() + WritePos;
+    WriteI = LR->begin() + WritePos;
   } else {
     // Shrink the gap if necessary.
-    LI->ranges.erase(WriteI + Spills.size(), ReadI);
+    LR->segments.erase(WriteI + Spills.size(), ReadI);
   }
   ReadI = WriteI + Spills.size();
   mergeSpills();
-  LI->verify();
+  LR->verify();
 }
 
 unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) {
@@ -918,7 +917,7 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[],
       Idx = LIS.getSlotIndexes()->getIndexBefore(MI);
     else
       Idx = LIS.getInstructionIndex(MI);
-    LiveRangeQuery LRQ(LI, Idx);
+    LiveQueryResult LRQ = LI.Query(Idx);
     const VNInfo *VNI = MO.readsReg() ? LRQ.valueIn() : LRQ.valueDefined();
     // In the case of an <undef> use that isn't tied to any def, VNI will be
     // NULL. If the use is tied to a def, VNI will be the defined value.
@@ -935,11 +934,11 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[],
     if (unsigned eq = EqClass[I->valno->id]) {
       assert((LIV[eq]->empty() || LIV[eq]->expiredAt(I->start)) &&
              "New intervals should be empty");
-      LIV[eq]->ranges.push_back(*I);
+      LIV[eq]->segments.push_back(*I);
     } else
       *J++ = *I;
   }
-  LI.ranges.erase(J, E);
+  LI.segments.erase(J, E);
 
   // Transfer VNInfos to their new owners and renumber them.
   unsigned j = 0, e = LI.getNumValNums();
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 3680943..e1c3217 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -95,15 +95,15 @@ void LiveIntervals::releaseMemory() {
   RegMaskBits.clear();
   RegMaskBlocks.clear();
 
-  for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i)
-    delete RegUnitIntervals[i];
-  RegUnitIntervals.clear();
+  for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
+    delete RegUnitRanges[i];
+  RegUnitRanges.clear();
 
   // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
   VNInfoAllocator.Reset();
 }
 
-/// runOnMachineFunction - Register allocate the whole function
+/// runOnMachineFunction - calculates LiveIntervals
 ///
 bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
   MF = &fn;
@@ -139,15 +139,15 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
   OS << "********** INTERVALS **********\n";
 
   // Dump the regunits.
-  for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i)
-    if (LiveInterval *LI = RegUnitIntervals[i])
-      OS << PrintRegUnit(i, TRI) << " = " << *LI << '\n';
+  for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
+    if (LiveRange *LR = RegUnitRanges[i])
+      OS << PrintRegUnit(i, TRI) << ' ' << *LR << '\n';
 
   // Dump the virtregs.
   for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     if (hasInterval(Reg))
-      OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n';
+      OS << getInterval(Reg) << '\n';
   }
 
   OS << "RegMasks:";
@@ -170,16 +170,17 @@ void LiveIntervals::dumpInstrs() const {
 #endif
 
 LiveInterval* LiveIntervals::createInterval(unsigned reg) {
-  float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? HUGE_VALF : 0.0F;
+  float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ?
+                  llvm::huge_valf : 0.0F;
   return new LiveInterval(reg, Weight);
 }
 
 
 /// computeVirtRegInterval - Compute the live interval of a virtual register,
 /// based on defs and uses.
-void LiveIntervals::computeVirtRegInterval(LiveInterval *LI) {
+void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
   assert(LRCalc && "LRCalc not initialized.");
-  assert(LI->empty() && "Should only compute empty intervals.");
+  assert(LI.empty() && "Should only compute empty intervals.");
   LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
   LRCalc->createDeadDefs(LI);
   LRCalc->extendToUses(LI);
@@ -190,9 +191,7 @@ void LiveIntervals::computeVirtRegs() {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     if (MRI->reg_nodbg_empty(Reg))
       continue;
-    LiveInterval *LI = createInterval(Reg);
-    VirtRegIntervals[Reg] = LI;
-    computeVirtRegInterval(LI);
+    createAndComputeVirtRegInterval(Reg);
   }
 }
 
@@ -229,12 +228,10 @@ void LiveIntervals::computeRegMasks() {
 // interference.
 //
 
-/// computeRegUnitInterval - Compute the live interval of a register unit, based
-/// on the uses and defs of aliasing registers.  The interval should be empty,
+/// computeRegUnitInterval - Compute the live range of a register unit, based
+/// on the uses and defs of aliasing registers.  The range should be empty,
 /// or contain only dead phi-defs from ABI blocks.
-void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
-  unsigned Unit = LI->reg;
-
+void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
   assert(LRCalc && "LRCalc not initialized.");
   LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
 
@@ -247,18 +244,18 @@ void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
     for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
          Supers.isValid(); ++Supers) {
       if (!MRI->reg_empty(*Supers))
-        LRCalc->createDeadDefs(LI, *Supers);
+        LRCalc->createDeadDefs(LR, *Supers);
     }
   }
 
-  // Now extend LI to reach all uses.
+  // Now extend LR to reach all uses.
   // Ignore uses of reserved registers. We only track defs of those.
   for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
     for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
          Supers.isValid(); ++Supers) {
       unsigned Reg = *Supers;
       if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg))
-        LRCalc->extendToUses(LI, Reg);
+        LRCalc->extendToUses(LR, Reg);
     }
   }
 }
@@ -269,11 +266,11 @@ void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
 /// without a corresponding def when entering the entry block or a landing pad.
 ///
 void LiveIntervals::computeLiveInRegUnits() {
-  RegUnitIntervals.resize(TRI->getNumRegUnits());
+  RegUnitRanges.resize(TRI->getNumRegUnits());
   DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
 
-  // Keep track of the intervals allocated.
-  SmallVector<LiveInterval*, 8> NewIntvs;
+  // Keep track of the live range sets allocated.
+  SmallVector<unsigned, 8> NewRanges;
 
   // Check all basic blocks for live-ins.
   for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
@@ -291,23 +288,25 @@ void LiveIntervals::computeLiveInRegUnits() {
          LIE = MBB->livein_end(); LII != LIE; ++LII) {
       for (MCRegUnitIterator Units(*LII, TRI); Units.isValid(); ++Units) {
         unsigned Unit = *Units;
-        LiveInterval *Intv = RegUnitIntervals[Unit];
-        if (!Intv) {
-          Intv = RegUnitIntervals[Unit] = new LiveInterval(Unit, HUGE_VALF);
-          NewIntvs.push_back(Intv);
+        LiveRange *LR = RegUnitRanges[Unit];
+        if (!LR) {
+          LR = RegUnitRanges[Unit] = new LiveRange();
+          NewRanges.push_back(Unit);
         }
-        VNInfo *VNI = Intv->createDeadDef(Begin, getVNInfoAllocator());
+        VNInfo *VNI = LR->createDeadDef(Begin, getVNInfoAllocator());
         (void)VNI;
         DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << '#' << VNI->id);
       }
     }
     DEBUG(dbgs() << '\n');
   }
-  DEBUG(dbgs() << "Created " << NewIntvs.size() << " new intervals.\n");
+  DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");
 
-  // Compute the 'normal' part of the intervals.
-  for (unsigned i = 0, e = NewIntvs.size(); i != e; ++i)
-    computeRegUnitInterval(NewIntvs[i]);
+  // Compute the 'normal' part of the ranges.
+  for (unsigned i = 0, e = NewRanges.size(); i != e; ++i) {
+    unsigned Unit = NewRanges[i];
+    computeRegUnitRange(*RegUnitRanges[Unit], Unit);
+  }
 }
 
 
@@ -331,7 +330,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
       continue;
     SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
-    LiveRangeQuery LRQ(*li, Idx);
+    LiveQueryResult LRQ = li->Query(Idx);
     VNInfo *VNI = LRQ.valueIn();
     if (!VNI) {
       // This shouldn't happen: readsVirtualRegister returns true, but there is
@@ -350,14 +349,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     WorkList.push_back(std::make_pair(Idx, VNI));
   }
 
-  // Create a new live interval with only minimal live segments per def.
-  LiveInterval NewLI(li->reg, 0);
+  // Create new live ranges with only minimal live segments per def.
+  LiveRange NewLR;
   for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end();
        I != E; ++I) {
     VNInfo *VNI = *I;
     if (VNI->isUnused())
       continue;
-    NewLI.addRange(LiveRange(VNI->def, VNI->def.getDeadSlot(), VNI));
+    NewLR.addSegment(LiveRange::Segment(VNI->def, VNI->def.getDeadSlot(), VNI));
   }
 
   // Keep track of the PHIs that are in use.
@@ -372,7 +371,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     SlotIndex BlockStart = getMBBStartIdx(MBB);
 
     // Extend the live range for VNI to be live at Idx.
-    if (VNInfo *ExtVNI = NewLI.extendInBlock(BlockStart, Idx)) {
+    if (VNInfo *ExtVNI = NewLR.extendInBlock(BlockStart, Idx)) {
       (void)ExtVNI;
       assert(ExtVNI == VNI && "Unexpected existing value number");
       // Is this a PHIDef we haven't seen before?
@@ -393,7 +392,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
 
     // VNI is live-in to MBB.
     DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
-    NewLI.addRange(LiveRange(BlockStart, Idx, VNI));
+    NewLR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
 
     // Make sure VNI is live-out from the predecessors.
     for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
@@ -414,14 +413,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     VNInfo *VNI = *I;
     if (VNI->isUnused())
       continue;
-    LiveInterval::iterator LII = NewLI.FindLiveRangeContaining(VNI->def);
-    assert(LII != NewLI.end() && "Missing live range for PHI");
-    if (LII->end != VNI->def.getDeadSlot())
+    LiveRange::iterator LRI = NewLR.FindSegmentContaining(VNI->def);
+    assert(LRI != NewLR.end() && "Missing segment for PHI");
+    if (LRI->end != VNI->def.getDeadSlot())
       continue;
     if (VNI->isPHIDef()) {
       // This is a dead PHI. Remove it.
       VNI->markUnused();
-      NewLI.removeRange(*LII);
+      NewLR.removeSegment(LRI->start, LRI->end);
       DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
       CanSeparate = true;
     } else {
@@ -436,23 +435,23 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     }
   }
 
-  // Move the trimmed ranges back.
-  li->ranges.swap(NewLI.ranges);
+  // Move the trimmed segments back.
+  li->segments.swap(NewLR.segments);
   DEBUG(dbgs() << "Shrunk: " << *li << '\n');
   return CanSeparate;
 }
 
-void LiveIntervals::extendToIndices(LiveInterval *LI,
+void LiveIntervals::extendToIndices(LiveRange &LR,
                                     ArrayRef<SlotIndex> Indices) {
   assert(LRCalc && "LRCalc not initialized.");
   LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
   for (unsigned i = 0, e = Indices.size(); i != e; ++i)
-    LRCalc->extend(LI, Indices[i]);
+    LRCalc->extend(LR, Indices[i]);
 }
 
 void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
                                SmallVectorImpl<SlotIndex> *EndPoints) {
-  LiveRangeQuery LRQ(*LI, Kill);
+  LiveQueryResult LRQ = LI->Query(Kill);
   VNInfo *VNI = LRQ.valueOut();
   if (!VNI)
     return;
@@ -463,13 +462,13 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
 
   // If VNI isn't live out from KillMBB, the value is trivially pruned.
   if (LRQ.endPoint() < MBBEnd) {
-    LI->removeRange(Kill, LRQ.endPoint());
+    LI->removeSegment(Kill, LRQ.endPoint());
     if (EndPoints) EndPoints->push_back(LRQ.endPoint());
     return;
   }
 
   // VNI is live out of KillMBB.
-  LI->removeRange(Kill, MBBEnd);
+  LI->removeSegment(Kill, MBBEnd);
   if (EndPoints) EndPoints->push_back(MBBEnd);
 
   // Find all blocks that are reachable from KillMBB without leaving VNI's live
@@ -487,23 +486,23 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
 
       // Check if VNI is live in to MBB.
       tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB);
-      LiveRangeQuery LRQ(*LI, MBBStart);
+      LiveQueryResult LRQ = LI->Query(MBBStart);
       if (LRQ.valueIn() != VNI) {
-        // This block isn't part of the VNI live range. Prune the search.
+        // This block isn't part of the VNI segment. Prune the search.
         I.skipChildren();
         continue;
       }
 
       // Prune the search if VNI is killed in MBB.
       if (LRQ.endPoint() < MBBEnd) {
-        LI->removeRange(MBBStart, LRQ.endPoint());
+        LI->removeSegment(MBBStart, LRQ.endPoint());
         if (EndPoints) EndPoints->push_back(LRQ.endPoint());
         I.skipChildren();
         continue;
       }
 
       // VNI is live through MBB.
-      LI->removeRange(MBBStart, MBBEnd);
+      LI->removeSegment(MBBStart, MBBEnd);
       if (EndPoints) EndPoints->push_back(MBBEnd);
       ++I;
     }
@@ -516,7 +515,7 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
 
 void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
   // Keep track of regunit ranges.
-  SmallVector<std::pair<LiveInterval*, LiveInterval::iterator>, 8> RU;
+  SmallVector<std::pair<LiveRange*, LiveRange::iterator>, 8> RU;
 
   for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
@@ -531,13 +530,14 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
     RU.clear();
     for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
          ++Units) {
-      LiveInterval *RUInt = &getRegUnit(*Units);
-      if (RUInt->empty())
+      LiveRange &RURanges = getRegUnit(*Units);
+      if (RURanges.empty())
         continue;
-      RU.push_back(std::make_pair(RUInt, RUInt->find(LI->begin()->end)));
+      RU.push_back(std::make_pair(&RURanges, RURanges.find(LI->begin()->end)));
     }
 
-    // Every instruction that kills Reg corresponds to a live range end point.
+    // Every instruction that kills Reg corresponds to a segment range end
+    // point.
     for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
          ++RI) {
       // A block index indicates an MBB edge.
@@ -547,7 +547,7 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
       if (!MI)
         continue;
 
-      // Check if any of the reguints are live beyond the end of RI. That could
+      // Check if any of the regunits are live beyond the end of RI. That could
       // happen when a physreg is defined as a copy of a virtreg:
       //
       //   %EAX = COPY %vreg5
@@ -557,12 +557,12 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
       // There should be no kill flag on FOO when %vreg5 is rewritten as %EAX.
       bool CancelKill = false;
       for (unsigned u = 0, e = RU.size(); u != e; ++u) {
-        LiveInterval *RInt = RU[u].first;
-        LiveInterval::iterator &I = RU[u].second;
-        if (I == RInt->end())
+        LiveRange &RRanges = *RU[u].first;
+        LiveRange::iterator &I = RU[u].second;
+        if (I == RRanges.end())
           continue;
-        I = RInt->advanceTo(I, RI->end);
-        if (I == RInt->end() || I->start >= RI->end)
+        I = RRanges.advanceTo(I, RI->end);
+        if (I == RRanges.end() || I->start >= RI->end)
           continue;
         // I is overlapping RI.
         CancelKill = true;
@@ -625,18 +625,18 @@ LiveIntervals::getSpillWeight(bool isDef, bool isUse, BlockFrequency freq) {
   return (isDef + isUse) * (freq.getFrequency() * Scale);
 }
 
-LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg,
-                                                  MachineInstr* startInst) {
-  LiveInterval& Interval = getOrCreateInterval(reg);
+LiveRange::Segment
+LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr* startInst) {
+  LiveInterval& Interval = createEmptyInterval(reg);
   VNInfo* VN = Interval.getNextValue(
     SlotIndex(getInstructionIndex(startInst).getRegSlot()),
     getVNInfoAllocator());
-  LiveRange LR(
+  LiveRange::Segment S(
      SlotIndex(getInstructionIndex(startInst).getRegSlot()),
      getMBBEndIdx(startInst->getParent()), VN);
-  Interval.addRange(LR);
+  Interval.addSegment(S);
 
-  return LR;
+  return S;
 }
 
 
@@ -711,7 +711,7 @@ private:
   const TargetRegisterInfo& TRI;
   SlotIndex OldIdx;
   SlotIndex NewIdx;
-  SmallPtrSet<LiveInterval*, 8> Updated;
+  SmallPtrSet<LiveRange*, 8> Updated;
   bool UpdateFlags;
 
 public:
@@ -725,7 +725,7 @@ public:
   // physregs, even those that aren't needed for regalloc, in order to update
   // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill
   // flags, and postRA passes will use a live register utility instead.
-  LiveInterval *getRegUnitLI(unsigned Unit) {
+  LiveRange *getRegUnitLI(unsigned Unit) {
     if (UpdateFlags)
       return &LIS.getRegUnit(Unit);
     return LIS.getCachedRegUnit(Unit);
@@ -750,15 +750,16 @@ public:
       if (!Reg)
         continue;
       if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-        updateRange(LIS.getInterval(Reg));
+        LiveInterval &LI = LIS.getInterval(Reg);
+        updateRange(LI, Reg);
         continue;
       }
 
       // For physregs, only update the regunits that actually have a
       // precomputed live range.
       for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
-        if (LiveInterval *LI = getRegUnitLI(*Units))
-          updateRange(*LI);
+        if (LiveRange *LR = getRegUnitLI(*Units))
+          updateRange(*LR, *Units);
     }
     if (hasRegMask)
       updateRegMaskSlots();
@@ -767,26 +768,26 @@ public:
 private:
   /// Update a single live range, assuming an instruction has been moved from
   /// OldIdx to NewIdx.
-  void updateRange(LiveInterval &LI) {
-    if (!Updated.insert(&LI))
+  void updateRange(LiveRange &LR, unsigned Reg) {
+    if (!Updated.insert(&LR))
       return;
     DEBUG({
       dbgs() << "     ";
-      if (TargetRegisterInfo::isVirtualRegister(LI.reg))
-        dbgs() << PrintReg(LI.reg);
+      if (TargetRegisterInfo::isVirtualRegister(Reg))
+        dbgs() << PrintReg(Reg);
       else
-        dbgs() << PrintRegUnit(LI.reg, &TRI);
-      dbgs() << ":\t" << LI << '\n';
+        dbgs() << PrintRegUnit(Reg, &TRI);
+      dbgs() << ":\t" << LR << '\n';
     });
     if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
-      handleMoveDown(LI);
+      handleMoveDown(LR);
     else
-      handleMoveUp(LI);
-    DEBUG(dbgs() << "        -->\t" << LI << '\n');
-    LI.verify();
+      handleMoveUp(LR, Reg);
+    DEBUG(dbgs() << "        -->\t" << LR << '\n');
+    LR.verify();
   }
 
-  /// Update LI to reflect an instruction has been moved downwards from OldIdx
+  /// Update LR to reflect an instruction has been moved downwards from OldIdx
   /// to NewIdx.
   ///
   /// 1. Live def at OldIdx:
@@ -800,17 +801,17 @@ private:
   ///    Move def to NewIdx, possibly across another live value.
   ///
   /// 4. Def at OldIdx AND at NewIdx:
-  ///    Remove live range [OldIdx;NewIdx) and value defined at OldIdx.
+  ///    Remove segment [OldIdx;NewIdx) and value defined at OldIdx.
   ///    (Happens when bundling multiple defs together).
   ///
   /// 5. Value read at OldIdx, killed before NewIdx:
   ///    Extend kill to NewIdx.
   ///
-  void handleMoveDown(LiveInterval &LI) {
+  void handleMoveDown(LiveRange &LR) {
     // First look for a kill at OldIdx.
-    LiveInterval::iterator I = LI.find(OldIdx.getBaseIndex());
-    LiveInterval::iterator E = LI.end();
-    // Is LI even live at OldIdx?
+    LiveRange::iterator I = LR.find(OldIdx.getBaseIndex());
+    LiveRange::iterator E = LR.end();
+    // Is LR even live at OldIdx?
     if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
       return;
 
@@ -827,7 +828,7 @@ private:
         for (MIBundleOperands MO(KillMI); MO.isValid(); ++MO)
           if (MO->isReg() && MO->isUse())
             MO->setIsKill(false);
-      // Adjust I->end to reach NewIdx. This may temporarily make LI invalid by
+      // Adjust I->end to reach NewIdx. This may temporarily make LR invalid by
       // overlapping ranges. Case 5 above.
       I->end = NewIdx.getRegSlot(I->end.isEarlyClobber());
       // If this was a kill, there may also be a def. Otherwise we're done.
@@ -856,24 +857,25 @@ private:
     assert((I->end == OldIdx.getDeadSlot() ||
             SlotIndex::isSameInstr(I->end, NewIdx)) &&
             "Cannot move def below kill");
-    LiveInterval::iterator NewI = LI.advanceTo(I, NewIdx.getRegSlot());
+    LiveRange::iterator NewI = LR.advanceTo(I, NewIdx.getRegSlot());
     if (NewI != E && SlotIndex::isSameInstr(NewI->start, NewIdx)) {
       // There is an existing def at NewIdx, case 4 above. The def at OldIdx is
       // coalesced into that value.
       assert(NewI->valno != DefVNI && "Multiple defs of value?");
-      LI.removeValNo(DefVNI);
+      LR.removeValNo(DefVNI);
       return;
     }
     // There was no existing def at NewIdx. Turn *I into a dead def at NewIdx.
-    // If the def at OldIdx was dead, we allow it to be moved across other LI
+    // If the def at OldIdx was dead, we allow it to be moved across other LR
     // values. The new range should be placed immediately before NewI, move any
     // intermediate ranges up.
     assert(NewI != I && "Inconsistent iterators");
     std::copy(llvm::next(I), NewI, I);
-    *llvm::prior(NewI) = LiveRange(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
+    *llvm::prior(NewI)
+      = LiveRange::Segment(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
   }
 
-  /// Update LI to reflect an instruction has been moved upwards from OldIdx
+  /// Update LR to reflect an instruction has been moved upwards from OldIdx
   /// to NewIdx.
   ///
   /// 1. Live def at OldIdx:
@@ -893,11 +895,11 @@ private:
   ///    Hoist kill to NewIdx, then scan for last kill between NewIdx and
   ///    OldIdx.
   ///
-  void handleMoveUp(LiveInterval &LI) {
+  void handleMoveUp(LiveRange &LR, unsigned Reg) {
     // First look for a kill at OldIdx.
-    LiveInterval::iterator I = LI.find(OldIdx.getBaseIndex());
-    LiveInterval::iterator E = LI.end();
-    // Is LI even live at OldIdx?
+    LiveRange::iterator I = LR.find(OldIdx.getBaseIndex());
+    LiveRange::iterator E = LR.end();
+    // Is LR even live at OldIdx?
     if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
       return;
 
@@ -914,7 +916,7 @@ private:
       if (I == E || !SlotIndex::isSameInstr(I->start, OldIdx)) {
         // No def, search for the new kill.
         // This can never be an early clobber kill since there is no def.
-        llvm::prior(I)->end = findLastUseBefore(LI.reg).getRegSlot();
+        llvm::prior(I)->end = findLastUseBefore(Reg).getRegSlot();
         return;
       }
     }
@@ -926,18 +928,18 @@ private:
     DefVNI->def = NewIdx.getRegSlot(I->start.isEarlyClobber());
 
     // Check for an existing def at NewIdx.
-    LiveInterval::iterator NewI = LI.find(NewIdx.getRegSlot());
+    LiveRange::iterator NewI = LR.find(NewIdx.getRegSlot());
     if (SlotIndex::isSameInstr(NewI->start, NewIdx)) {
       assert(NewI->valno != DefVNI && "Same value defined more than once?");
       // There is an existing def at NewIdx.
       if (I->end.isDead()) {
         // Case 3: Remove the dead def at OldIdx.
-        LI.removeValNo(DefVNI);
+        LR.removeValNo(DefVNI);
         return;
       }
       // Case 4: Replace def at NewIdx with live def at OldIdx.
       I->start = DefVNI->def;
-      LI.removeValNo(NewI->valno);
+      LR.removeValNo(NewI->valno);
       return;
     }
 
@@ -948,10 +950,10 @@ private:
       return;
     }
 
-    // DefVNI is a dead def. It may have been moved across other values in LI,
+    // DefVNI is a dead def. It may have been moved across other values in LR,
     // so move I up to NewI. Slide [NewI;I) down one position.
     std::copy_backward(NewI, I, llvm::next(I));
-    *NewI = LiveRange(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
+    *NewI = LiveRange::Segment(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
   }
 
   void updateRegMaskSlots() {
@@ -1074,8 +1076,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
       if (MOI->isReg() &&
           TargetRegisterInfo::isVirtualRegister(MOI->getReg()) &&
           !hasInterval(MOI->getReg())) {
-        LiveInterval &LI = getOrCreateInterval(MOI->getReg());
-        computeVirtRegInterval(&LI);
+        createAndComputeVirtRegInterval(MOI->getReg());
       }
     }
   }
@@ -1122,9 +1123,9 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
               if (LII != LI.begin())
                 prevStart = llvm::prior(LII)->start;
 
-              // FIXME: This could be more efficient if there was a removeRange
-              // method that returned an iterator.
-              LI.removeRange(*LII, true);
+              // FIXME: This could be more efficient if there was a
+              // removeSegment method that returned an iterator.
+              LI.removeSegment(*LII, true);
               if (prevStart.isValid())
                 LII = LI.find(prevStart);
               else
@@ -1143,13 +1144,14 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
           if (!lastUseIdx.isValid()) {
             VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
                                           VNInfoAllocator);
-            LiveRange LR(instrIdx.getRegSlot(), instrIdx.getDeadSlot(), VNI);
-            LII = LI.addRange(LR);
+            LiveRange::Segment S(instrIdx.getRegSlot(),
+                                 instrIdx.getDeadSlot(), VNI);
+            LII = LI.addSegment(S);
           } else if (LII->start != instrIdx.getRegSlot()) {
             VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
                                           VNInfoAllocator);
-            LiveRange LR(instrIdx.getRegSlot(), lastUseIdx, VNI);
-            LII = LI.addRange(LR);
+            LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI);
+            LII = LI.addSegment(S);
           }
 
           if (MO.getSubReg() && !MO.isUndef())
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index dede490..ae086bc 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -36,11 +36,11 @@ void LiveRangeCalc::reset(const MachineFunction *mf,
 }
 
 
-void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) {
+void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) {
   assert(MRI && Indexes && "call reset() first");
 
   // Visit all def operands. If the same instruction has multiple defs of Reg,
-  // LI->createDeadDef() will deduplicate.
+  // LR.createDeadDef() will deduplicate.
   for (MachineRegisterInfo::def_iterator
        I = MRI->def_begin(Reg), E = MRI->def_end(); I != E; ++I) {
     const MachineInstr *MI = &*I;
@@ -54,13 +54,13 @@ void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) {
       Idx = Indexes->getInstructionIndex(MI)
         .getRegSlot(I.getOperand().isEarlyClobber());
 
-    // Create the def in LI. This may find an existing def.
-    LI->createDeadDef(Idx, *Alloc);
+    // Create the def in LR. This may find an existing def.
+    LR.createDeadDef(Idx, *Alloc);
   }
 }
 
 
-void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) {
+void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg) {
   assert(MRI && Indexes && "call reset() first");
 
   // Visit all operands that read Reg. This may include partial defs.
@@ -99,7 +99,7 @@ void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) {
           Idx = Idx.getRegSlot(true);
       }
     }
-    extend(LI, Idx, Reg);
+    extend(LR, Idx, Reg);
   }
 }
 
@@ -125,17 +125,14 @@ void LiveRangeCalc::updateLiveIns() {
       assert(Seen.test(MBB->getNumber()));
       LiveOut[MBB] = LiveOutPair(I->Value, (MachineDomTreeNode *)0);
     }
-    Updater.setDest(I->LI);
+    Updater.setDest(&I->LR);
     Updater.add(Start, End, I->Value);
   }
   LiveIn.clear();
 }
 
 
-void LiveRangeCalc::extend(LiveInterval *LI,
-                           SlotIndex Kill,
-                           unsigned PhysReg) {
-  assert(LI && "Missing live range");
+void LiveRangeCalc::extend(LiveRange &LR, SlotIndex Kill, unsigned PhysReg) {
   assert(Kill.isValid() && "Invalid SlotIndex");
   assert(Indexes && "Missing SlotIndexes");
   assert(DomTree && "Missing dominator tree");
@@ -144,14 +141,14 @@ void LiveRangeCalc::extend(LiveInterval *LI,
   assert(KillMBB && "No MBB at Kill");
 
   // Is there a def in the same MBB we can extend?
-  if (LI->extendInBlock(Indexes->getMBBStartIdx(KillMBB), Kill))
+  if (LR.extendInBlock(Indexes->getMBBStartIdx(KillMBB), Kill))
     return;
 
   // Find the single reaching def, or determine if Kill is jointly dominated by
   // multiple values, and we may need to create even more phi-defs to preserve
   // VNInfo SSA form.  Perform a search for all predecessor blocks where we
   // know the dominating VNInfo.
-  if (findReachingDefs(LI, KillMBB, Kill, PhysReg))
+  if (findReachingDefs(LR, *KillMBB, Kill, PhysReg))
     return;
 
   // When there were multiple different values, we may need new PHIs.
@@ -170,13 +167,11 @@ void LiveRangeCalc::calculateValues() {
 }
 
 
-bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
-                                     MachineBasicBlock *KillMBB,
-                                     SlotIndex Kill,
-                                     unsigned PhysReg) {
-  unsigned KillMBBNum = KillMBB->getNumber();
+bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
+                                     SlotIndex Kill, unsigned PhysReg) {
+  unsigned KillMBBNum = KillMBB.getNumber();
 
-  // Block numbers where LI should be live-in.
+  // Block numbers where LR should be live-in.
   SmallVector<unsigned, 16> WorkList(1, KillMBBNum);
 
   // Remember if we have seen more than one value.
@@ -203,7 +198,7 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
 #endif
 
     for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
-           PE = MBB->pred_end(); PI != PE; ++PI) {
+         PE = MBB->pred_end(); PI != PE; ++PI) {
        MachineBasicBlock *Pred = *PI;
 
        // Is this a known live-out block?
@@ -221,7 +216,7 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
 
        // First time we see Pred.  Try to determine the live-out value, but set
        // it as null if Pred is live-through with an unknown value.
-       VNInfo *VNI = LI->extendInBlock(Start, End);
+       VNInfo *VNI = LR.extendInBlock(Start, End);
        setLiveOutValue(Pred, VNI);
        if (VNI) {
          if (TheVNI && TheVNI != VNI)
@@ -231,7 +226,7 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
        }
 
        // No, we need a live-in value for Pred as well
-       if (Pred != KillMBB)
+       if (Pred != &KillMBB)
           WorkList.push_back(Pred->getNumber());
        else
           // Loopback to KillMBB, so value is really live through.
@@ -248,9 +243,9 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
 
   // If a unique reaching def was found, blit in the live ranges immediately.
   if (UniqueVNI) {
-    LiveRangeUpdater Updater(LI);
-    for (SmallVectorImpl<unsigned>::const_iterator
-         I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
+    LiveRangeUpdater Updater(&LR);
+    for (SmallVectorImpl<unsigned>::const_iterator I = WorkList.begin(),
+         E = WorkList.end(); I != E; ++I) {
        SlotIndex Start, End;
        tie(Start, End) = Indexes->getMBBRange(*I);
        // Trim the live range in KillMBB.
@@ -270,8 +265,8 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
   for (SmallVectorImpl<unsigned>::const_iterator
        I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
     MachineBasicBlock *MBB = MF->getBlockNumbered(*I);
-    addLiveInBlock(LI, DomTree->getNode(MBB));
-    if (MBB == KillMBB)
+    addLiveInBlock(LR, DomTree->getNode(MBB));
+    if (MBB == &KillMBB)
       LiveIn.back().Kill = Kill;
   }
 
@@ -348,16 +343,17 @@ void LiveRangeCalc::updateSSA() {
         assert(Alloc && "Need VNInfo allocator to create PHI-defs");
         SlotIndex Start, End;
         tie(Start, End) = Indexes->getMBBRange(MBB);
-        VNInfo *VNI = I->LI->getNextValue(Start, *Alloc);
+        LiveRange &LR = I->LR;
+        VNInfo *VNI = LR.getNextValue(Start, *Alloc);
         I->Value = VNI;
         // This block is done, we know the final value.
         I->DomNode = 0;
 
         // Add liveness since updateLiveIns now skips this node.
         if (I->Kill.isValid())
-          I->LI->addRange(LiveRange(Start, I->Kill, VNI));
+          LR.addSegment(LiveInterval::Segment(Start, I->Kill, VNI));
         else {
-          I->LI->addRange(LiveRange(Start, End, VNI));
+          LR.addSegment(LiveInterval::Segment(Start, End, VNI));
           LOP = LiveOutPair(VNI, Node);
         }
       } else if (IDomValue.first) {
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
index 57cab7b..a3a3fbb 100644
--- a/lib/CodeGen/LiveRangeCalc.h
+++ b/lib/CodeGen/LiveRangeCalc.h
@@ -75,9 +75,9 @@ class LiveRangeCalc {
   /// LiveInBlock - Information about a basic block where a live range is known
   /// to be live-in, but the value has not yet been determined.
   struct LiveInBlock {
-    // LI - The live range that is live-in to this block.  The algorithms can
+    // The live range set that is live-in to this block.  The algorithms can
     // handle multiple non-overlapping live ranges simultaneously.
-    LiveInterval *LI;
+    LiveRange &LR;
 
     // DomNode - Dominator tree node for the block.
     // Cleared when the final value has been determined and LI has been updated.
@@ -91,8 +91,8 @@ class LiveRangeCalc {
     // Live-in value filled in by updateSSA once it is known.
     VNInfo *Value;
 
-    LiveInBlock(LiveInterval *li, MachineDomTreeNode *node, SlotIndex kill)
-      : LI(li), DomNode(node), Kill(kill), Value(0) {}
+    LiveInBlock(LiveRange &LR, MachineDomTreeNode *node, SlotIndex kill)
+      : LR(LR), DomNode(node), Kill(kill), Value(0) {}
   };
 
   /// LiveIn - Work list of blocks where the live-in value has yet to be
@@ -111,10 +111,8 @@ class LiveRangeCalc {
   /// are added to the LiveIn array, and the function returns false.
   ///
   /// PhysReg, when set, is used to verify live-in lists on basic blocks.
-  bool findReachingDefs(LiveInterval *LI,
-                        MachineBasicBlock *KillMBB,
-                        SlotIndex Kill,
-                        unsigned PhysReg);
+  bool findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
+                        SlotIndex Kill, unsigned PhysReg);
 
   /// updateSSA - Compute the values that will be live in to all requested
   /// blocks in LiveIn.  Create PHI-def values as required to preserve SSA form.
@@ -146,10 +144,6 @@ public:
              MachineDominatorTree*,
              VNInfo::Allocator*);
 
-  /// calculate - Calculate the live range of a virtual register from its defs
-  /// and uses.  LI must be empty with no values.
-  void calculate(LiveInterval *LI);
-
   //===--------------------------------------------------------------------===//
   // Mid-level interface.
   //===--------------------------------------------------------------------===//
@@ -165,27 +159,27 @@ public:
   /// single existing value, Alloc may be null.
   ///
   /// PhysReg, when set, is used to verify live-in lists on basic blocks.
-  void extend(LiveInterval *LI, SlotIndex Kill, unsigned PhysReg = 0);
+  void extend(LiveRange &LR, SlotIndex Kill, unsigned PhysReg = 0);
 
   /// createDeadDefs - Create a dead def in LI for every def operand of Reg.
   /// Each instruction defining Reg gets a new VNInfo with a corresponding
   /// minimal live range.
-  void createDeadDefs(LiveInterval *LI, unsigned Reg);
+  void createDeadDefs(LiveRange &LR, unsigned Reg);
 
   /// createDeadDefs - Create a dead def in LI for every def of LI->reg.
-  void createDeadDefs(LiveInterval *LI) {
-    createDeadDefs(LI, LI->reg);
+  void createDeadDefs(LiveInterval &LI) {
+    createDeadDefs(LI, LI.reg);
   }
 
   /// extendToUses - Extend the live range of LI to reach all uses of Reg.
   ///
   /// All uses must be jointly dominated by existing liveness.  PHI-defs are
   /// inserted as needed to preserve SSA form.
-  void extendToUses(LiveInterval *LI, unsigned Reg);
+  void extendToUses(LiveRange &LR, unsigned Reg);
 
   /// extendToUses - Extend the live range of LI to reach all uses of LI->reg.
-  void extendToUses(LiveInterval *LI) {
-    extendToUses(LI, LI->reg);
+  void extendToUses(LiveInterval &LI) {
+    extendToUses(LI, LI.reg);
   }
 
   //===--------------------------------------------------------------------===//
@@ -216,15 +210,15 @@ public:
   /// function can only be called once per basic block.  Once the live-in value
   /// has been determined, calculateValues() will add liveness to LI.
   ///
-  /// @param LI      The live range that is live-in to the block.
+  /// @param LR      The live range that is live-in to the block.
   /// @param DomNode The domtree node for the block.
   /// @param Kill    Index in block where LI is killed.  If the value is
   ///                live-through, set Kill = SLotIndex() and also call
   ///                setLiveOutValue(MBB, 0).
-  void addLiveInBlock(LiveInterval *LI,
+  void addLiveInBlock(LiveRange &LR,
                       MachineDomTreeNode *DomNode,
                       SlotIndex Kill = SlotIndex()) {
-    LiveIn.push_back(LiveInBlock(LI, DomNode, Kill));
+    LiveIn.push_back(LiveInBlock(LR, DomNode, Kill));
   }
 
   /// calculateValues - Calculate the value that will be live-in to each block
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 792ef54..cb70c43 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -30,17 +30,23 @@ STATISTIC(NumFracRanges,     "Number of live ranges fractured by DCE");
 
 void LiveRangeEdit::Delegate::anchor() { }
 
-LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg) {
+LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) {
   unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
   if (VRM) {
-    VRM->grow();
     VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
   }
-  LiveInterval &LI = LIS.getOrCreateInterval(VReg);
-  NewRegs.push_back(&LI);
+  LiveInterval &LI = LIS.createEmptyInterval(VReg);
   return LI;
 }
 
+unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
+  unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+  if (VRM) {
+    VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
+  }
+  return VReg;
+}
+
 bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
                                           const MachineInstr *DefMI,
                                           AliasAnalysis *aa) {
@@ -256,9 +262,9 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
       else if (MOI->isDef()) {
         for (MCRegUnitIterator Units(Reg, MRI.getTargetRegisterInfo());
              Units.isValid(); ++Units) {
-          if (LiveInterval *LI = LIS.getCachedRegUnit(*Units)) {
-            if (VNInfo *VNI = LI->getVNInfoAt(Idx))
-              LI->removeValNo(VNI);
+          if (LiveRange *LR = LIS.getCachedRegUnit(*Units)) {
+            if (VNInfo *VNI = LR->getVNInfoAt(Idx))
+              LR->removeValNo(VNI);
           }
         }
       }
@@ -272,7 +278,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
     // Always shrink COPY uses that probably come from live range splitting.
     if (MI->readsVirtualRegister(Reg) &&
         (MI->isCopy() || MOI->isDef() || MRI.hasOneNonDBGUse(Reg) ||
-         LI.killedAt(Idx)))
+         LI.Query(Idx).isKill()))
       ToShrink.insert(&LI);
 
     // Remove defined value.
@@ -360,7 +366,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
     if (BeingSpilled) continue;
 
     // LI may have been separated, create new intervals.
-    LI->RenumberValues(LIS);
+    LI->RenumberValues();
     ConnectedVNInfoEqClasses ConEQ(LIS);
     unsigned NumComp = ConEQ.Classify(LI);
     if (NumComp <= 1)
@@ -370,7 +376,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
     DEBUG(dbgs() << NumComp << " components: " << *LI << '\n');
     SmallVector<LiveInterval*, 8> Dups(1, LI);
     for (unsigned i = 1; i != NumComp; ++i) {
-      Dups.push_back(&createFrom(LI->reg));
+      Dups.push_back(&createEmptyIntervalFrom(LI->reg));
       // If LI is an original interval that hasn't been split yet, make the new
       // intervals their own originals instead of referring to LI. The original
       // interval must contain all the split products, and LI doesn't.
@@ -387,16 +393,27 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
   }
 }
 
+// Keep track of new virtual registers created via
+// MachineRegisterInfo::createVirtualRegister.
+void
+LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg)
+{
+  if (VRM)
+    VRM->grow();
+
+  NewRegs.push_back(VReg);
+}
+
 void
 LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
                                         const MachineLoopInfo &Loops,
                                         const MachineBlockFrequencyInfo &MBFI) {
   VirtRegAuxInfo VRAI(MF, LIS, Loops, MBFI);
-  for (iterator I = begin(), E = end(); I != E; ++I) {
-    LiveInterval &LI = **I;
+  for (unsigned I = 0, Size = size(); I < Size; ++I) {
+    LiveInterval &LI = LIS.getInterval(get(I));
     if (MRI.recomputeRegClass(LI.reg, MF.getTarget()))
       DEBUG(dbgs() << "Inflated " << PrintReg(LI.reg) << " to "
                    << MRI.getRegClass(LI.reg)->getName() << '\n');
-    VRAI.CalculateWeightAndHint(LI);
+    VRAI.calculateSpillWeightAndHint(LI);
   }
 }
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index 0ef069f..1d801ac 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -119,9 +119,11 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
   if (VirtReg.empty())
     return false;
   CoalescerPair CP(VirtReg.reg, PhysReg, *TRI);
-  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
-    if (VirtReg.overlaps(LIS->getRegUnit(*Units), CP, *LIS->getSlotIndexes()))
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    const LiveRange &UnitRange = LIS->getRegUnit(*Units);
+    if (VirtReg.overlaps(UnitRange, CP, *LIS->getSlotIndexes()))
       return true;
+  }
   return false;
 }
 
diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp
new file mode 100644
index 0000000..6221ca2
--- /dev/null
+++ b/lib/CodeGen/LiveRegUnits.cpp
@@ -0,0 +1,111 @@
+//===-- LiveInterval.cpp - Live Interval Representation -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveRegUnits utility for tracking liveness of
+// physical register units across machine instructions in forward or backward
+// order.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+using namespace llvm;
+
+/// Return true if the given MachineOperand clobbers the given register unit.
+/// A register unit is only clobbered if all its super-registers are clobbered.
+static bool operClobbersUnit(const MachineOperand *MO, unsigned Unit,
+                             const MCRegisterInfo *MCRI) {
+  for (MCRegUnitRootIterator RI(Unit, MCRI); RI.isValid(); ++RI) {
+    for (MCSuperRegIterator SI(*RI, MCRI, true); SI.isValid(); ++SI) {
+      if (!MO->clobbersPhysReg(*SI))
+        return false;
+    }
+  }
+  return true;
+}
+
+/// We assume the high bits of a physical super register are not preserved
+/// unless the instruction has an implicit-use operand reading the
+/// super-register or a register unit for the upper bits is available.
+void LiveRegUnits::removeRegsInMask(const MachineOperand &Op,
+                                    const MCRegisterInfo &MCRI) {
+  SparseSet<unsigned>::iterator LUI = LiveUnits.begin();
+  while (LUI != LiveUnits.end()) {
+    if (operClobbersUnit(&Op, *LUI, &MCRI))
+      LUI = LiveUnits.erase(LUI);
+    else
+      ++LUI;
+  }
+}
+
+void LiveRegUnits::stepBackward(const MachineInstr &MI,
+                                const MCRegisterInfo &MCRI) {
+  // Remove defined registers and regmask kills from the set.
+  for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+    if (O->isReg()) {
+      if (!O->isDef())
+        continue;
+      unsigned Reg = O->getReg();
+      if (Reg == 0)
+        continue;
+      removeReg(Reg, MCRI);
+    } else if (O->isRegMask()) {
+      removeRegsInMask(*O, MCRI);
+    }
+  }
+  // Add uses to the set.
+  for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+    if (!O->isReg() || !O->readsReg() || O->isUndef())
+      continue;
+    unsigned Reg = O->getReg();
+    if (Reg == 0)
+      continue;
+    addReg(Reg, MCRI);
+  }
+}
+
+/// Uses with kill flag get removed from the set, defs added. If possible
+/// use StepBackward() instead of this function because some kill flags may
+/// be missing.
+void LiveRegUnits::stepForward(const MachineInstr &MI,
+                               const MCRegisterInfo &MCRI) {
+  SmallVector<unsigned, 4> Defs;
+  // Remove killed registers from the set.
+  for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+    if (O->isReg()) {
+      unsigned Reg = O->getReg();
+      if (Reg == 0)
+        continue;
+      if (O->isDef()) {
+        if (!O->isDead())
+          Defs.push_back(Reg);
+      } else {
+        if (!O->isKill())
+          continue;
+        assert(O->isUse());
+        removeReg(Reg, MCRI);
+      }
+    } else if (O->isRegMask()) {
+      removeRegsInMask(*O, MCRI);
+    }
+  }
+  // Add defs to the set.
+  for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+    addReg(Defs[i], MCRI);
+  }
+}
+
+/// Adds all registers in the live-in list of block @p BB.
+void LiveRegUnits::addLiveIns(const MachineBasicBlock *MBB,
+                              const MCRegisterInfo &MCRI) {
+  for (MachineBasicBlock::livein_iterator L = MBB->livein_begin(),
+         LE = MBB->livein_end(); L != LE; ++L) {
+    addReg(*L, MCRI);
+  }
+}
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 5633271..ca71e3b 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -861,7 +861,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
           LiveInterval &LI = LIS->getInterval(Reg);
           VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
           assert(VNI && "PHI sources should be live out of their predecessors.");
-          LI.addRange(LiveRange(StartIndex, EndIndex, VNI));
+          LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI));
         }
       }
     }
@@ -880,9 +880,9 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
       if (isLiveOut && isLastMBB) {
         VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
         assert(VNI && "LiveInterval should have VNInfo where it is live.");
-        LI.addRange(LiveRange(StartIndex, EndIndex, VNI));
+        LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI));
       } else if (!isLiveOut && !isLastMBB) {
-        LI.removeRange(StartIndex, EndIndex);
+        LI.removeSegment(StartIndex, EndIndex);
       }
     }
 
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 06bb80a..295b450 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -647,12 +647,15 @@ void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) {
     }
   }
 
+#ifndef NDEBUG
+  bool isMetaDataOp = Op.getType() == MachineOperand::MO_Metadata;
   // OpNo now points as the desired insertion point.  Unless this is a variadic
   // instruction, only implicit regs are allowed beyond MCID->getNumOperands().
   // RegMask operands go between the explicit and implicit operands.
   assert((isImpReg || Op.isRegMask() || MCID->isVariadic() ||
-          OpNo < MCID->getNumOperands()) &&
+          OpNo < MCID->getNumOperands() || isMetaDataOp) &&
          "Trying to add an operand to a machine instr that is already done!");
+#endif
 
   MachineRegisterInfo *MRI = getRegInfo();
 
@@ -1702,31 +1705,31 @@ void MachineInstr::clearRegisterKills(unsigned Reg,
   }
 }
 
-bool MachineInstr::addRegisterDead(unsigned IncomingReg,
+bool MachineInstr::addRegisterDead(unsigned Reg,
                                    const TargetRegisterInfo *RegInfo,
                                    bool AddIfNotFound) {
-  bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+  bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(Reg);
   bool hasAliases = isPhysReg &&
-    MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
+    MCRegAliasIterator(Reg, RegInfo, false).isValid();
   bool Found = false;
   SmallVector<unsigned,4> DeadOps;
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
     MachineOperand &MO = getOperand(i);
     if (!MO.isReg() || !MO.isDef())
       continue;
-    unsigned Reg = MO.getReg();
-    if (!Reg)
+    unsigned MOReg = MO.getReg();
+    if (!MOReg)
       continue;
 
-    if (Reg == IncomingReg) {
+    if (MOReg == Reg) {
       MO.setIsDead();
       Found = true;
     } else if (hasAliases && MO.isDead() &&
-               TargetRegisterInfo::isPhysicalRegister(Reg)) {
+               TargetRegisterInfo::isPhysicalRegister(MOReg)) {
       // There exists a super-register that's marked dead.
-      if (RegInfo->isSuperRegister(IncomingReg, Reg))
+      if (RegInfo->isSuperRegister(Reg, MOReg))
         return true;
-      if (RegInfo->isSubRegister(IncomingReg, Reg))
+      if (RegInfo->isSubRegister(Reg, MOReg))
         DeadOps.push_back(i);
     }
   }
@@ -1746,7 +1749,7 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg,
   if (Found || !AddIfNotFound)
     return Found;
 
-  addOperand(MachineOperand::CreateReg(IncomingReg,
+  addOperand(MachineOperand::CreateReg(Reg,
                                        true  /*IsDef*/,
                                        true  /*IsImp*/,
                                        false /*IsKill*/,
@@ -1754,21 +1757,21 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg,
   return true;
 }
 
-void MachineInstr::addRegisterDefined(unsigned IncomingReg,
+void MachineInstr::addRegisterDefined(unsigned Reg,
                                       const TargetRegisterInfo *RegInfo) {
-  if (TargetRegisterInfo::isPhysicalRegister(IncomingReg)) {
-    MachineOperand *MO = findRegisterDefOperand(IncomingReg, false, RegInfo);
+  if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+    MachineOperand *MO = findRegisterDefOperand(Reg, false, RegInfo);
     if (MO)
       return;
   } else {
     for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
       const MachineOperand &MO = getOperand(i);
-      if (MO.isReg() && MO.getReg() == IncomingReg && MO.isDef() &&
+      if (MO.isReg() && MO.getReg() == Reg && MO.isDef() &&
           MO.getSubReg() == 0)
         return;
     }
   }
-  addOperand(MachineOperand::CreateReg(IncomingReg,
+  addOperand(MachineOperand::CreateReg(Reg,
                                        true  /*IsDef*/,
                                        true  /*IsImp*/));
 }
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 6ad4e39..104eacd 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -468,12 +468,12 @@ void MachineLICM::ProcessMI(MachineInstr *MI,
     for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
       if (PhysRegDefs.test(*AS))
         PhysRegClobbers.set(*AS);
-      if (PhysRegClobbers.test(*AS))
-        // MI defined register is seen defined by another instruction in
-        // the loop, it cannot be a LICM candidate.
-        RuledOut = true;
       PhysRegDefs.set(*AS);
     }
+    if (PhysRegClobbers.test(Reg))
+      // MI defined register is seen defined by another instruction in
+      // the loop, it cannot be a LICM candidate.
+      RuledOut = true;
   }
 
   // Only consider reloads for now and remats which do not have register
@@ -502,7 +502,7 @@ void MachineLICM::HoistRegionPostRA() {
 
   // Walk the entire region, count number of defs for each register, and
   // collect potential LICM candidates.
-  const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
+  const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks();
   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
     MachineBasicBlock *BB = Blocks[i];
 
@@ -584,7 +584,7 @@ void MachineLICM::HoistRegionPostRA() {
 /// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the current
 /// loop, and make sure it is not killed by any instructions in the loop.
 void MachineLICM::AddToLiveIns(unsigned Reg) {
-  const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
+  const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks();
   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
     MachineBasicBlock *BB = Blocks[i];
     if (!BB->isLiveIn(Reg))
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 7f2c0ca..f8b8796 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -19,8 +19,11 @@
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void MachineRegisterInfo::Delegate::anchor() {}
+
 MachineRegisterInfo::MachineRegisterInfo(const TargetMachine &TM)
-  : TM(TM), IsSSA(true), TracksLiveness(true) {
+  : TM(TM), TheDelegate(0), IsSSA(true), TracksLiveness(true) {
   VRegInfo.reserve(256);
   RegAllocHints.reserve(256);
   UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits());
@@ -108,6 +111,8 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
   VRegInfo.grow(Reg);
   VRegInfo[Reg].first = RegClass;
   RegAllocHints.grow(Reg);
+  if (TheDelegate)
+    TheDelegate->MRI_NoteNewVirtualRegister(Reg);
   return Reg;
 }
 
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index a6c5a9f..e71c4df 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -53,6 +53,12 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
 static bool ViewMISchedDAGs = false;
 #endif // NDEBUG
 
+static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
+  cl::desc("Enable register pressure scheduling."), cl::init(true));
+
+static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
+  cl::desc("Enable cyclic critical path analysis."), cl::init(true));
+
 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
   cl::desc("Enable load clustering."), cl::init(true));
 
@@ -66,6 +72,10 @@ static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
 // DAG subtrees must have at least this many nodes.
 static const unsigned MinSubtreeSize = 8;
 
+// Pin the vtables to this file.
+void MachineSchedStrategy::anchor() {}
+void ScheduleDAGMutation::anchor() {}
+
 //===----------------------------------------------------------------------===//
 // Machine Instruction Scheduling Pass and Registry
 //===----------------------------------------------------------------------===//
@@ -95,6 +105,9 @@ public:
   virtual void print(raw_ostream &O, const Module* = 0) const;
 
   static char ID; // Class identification, replacement for typeinfo
+
+protected:
+  ScheduleDAGInstrs *createMachineScheduler();
 };
 } // namespace
 
@@ -149,12 +162,13 @@ DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
 
 /// Forward declare the standard machine scheduler. This will be used as the
 /// default scheduler if the target does not set a default.
-static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);
+static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C);
 
 
 /// Decrement this iterator until reaching the top or a non-debug instr.
-static MachineBasicBlock::iterator
-priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
+static MachineBasicBlock::const_iterator
+priorNonDebug(MachineBasicBlock::const_iterator I,
+              MachineBasicBlock::const_iterator Beg) {
   assert(I != Beg && "reached the top of the region, cannot decrement");
   while (--I != Beg) {
     if (!I->isDebugValue())
@@ -163,10 +177,19 @@ priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
   return I;
 }
 
+/// Non-const version.
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I,
+              MachineBasicBlock::const_iterator Beg) {
+  return const_cast<MachineInstr*>(
+    &*priorNonDebug(MachineBasicBlock::const_iterator(I), Beg));
+}
+
 /// If this iterator is a debug value, increment until reaching the End or a
 /// non-debug instruction.
-static MachineBasicBlock::iterator
-nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+            MachineBasicBlock::const_iterator End) {
   for(; I != End; ++I) {
     if (!I->isDebugValue())
       break;
@@ -174,6 +197,34 @@ nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
   return I;
 }
 
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+            MachineBasicBlock::const_iterator End) {
+  // Cast the return value to nonconst MachineInstr, then cast to an
+  // instr_iterator, which does not check for null, finally return a
+  // bundle_iterator.
+  return MachineBasicBlock::instr_iterator(
+    const_cast<MachineInstr*>(
+      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
+}
+
+/// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
+ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
+  // Select the scheduler, or set the default.
+  MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
+  if (Ctor != useDefaultMachineSched)
+    return Ctor(this);
+
+  // Get the default scheduler set by the target for this function.
+  ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
+  if (Scheduler)
+    return Scheduler;
+
+  // Default to GenericScheduler.
+  return createGenericSched(this);
+}
+
 /// Top-level MachineScheduler pass driver.
 ///
 /// Visit blocks in function order. Divide each block into scheduling regions
@@ -209,18 +260,9 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
   }
   RegClassInfo->runOnMachineFunction(*MF);
 
-  // Select the scheduler, or set the default.
-  MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
-  if (Ctor == useDefaultMachineSched) {
-    // Get the default scheduler set by the target.
-    Ctor = MachineSchedRegistry::getDefault();
-    if (!Ctor) {
-      Ctor = createConvergingSched;
-      MachineSchedRegistry::setDefault(Ctor);
-    }
-  }
-  // Instantiate the selected scheduler.
-  OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this));
+  // Instantiate the selected scheduler for this target, function, and
+  // optimization level.
+  OwningPtr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
 
   // Visit all machine basic blocks.
   //
@@ -255,14 +297,15 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
 
       // The next region starts above the previous region. Look backward in the
       // instruction stream until we find the nearest boundary.
+      unsigned NumRegionInstrs = 0;
       MachineBasicBlock::iterator I = RegionEnd;
-      for(;I != MBB->begin(); --I, --RemainingInstrs) {
+      for(;I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
         if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
           break;
       }
       // Notify the scheduler of the region, even if we may skip scheduling
       // it. Perhaps it still needs to be bundled.
-      Scheduler->enterRegion(MBB, I, RegionEnd, RemainingInstrs);
+      Scheduler->enterRegion(MBB, I, RegionEnd, NumRegionInstrs);
 
       // Skip empty scheduling regions (0 or 1 schedulable instructions).
       if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
@@ -277,7 +320,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
             << "\n  From: " << *I << "    To: ";
             if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
             else dbgs() << "End";
-            dbgs() << " Remaining: " << RemainingInstrs << "\n");
+            dbgs() << " RegionInstrs: " << NumRegionInstrs
+            << " Remaining: " << RemainingInstrs << "\n");
 
       // Schedule a region: possibly reorder instructions.
       // This invalidates 'RegionEnd' and 'I'.
@@ -446,13 +490,19 @@ bool ScheduleDAGMI::checkSchedLimit() {
 void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
                                 MachineBasicBlock::iterator begin,
                                 MachineBasicBlock::iterator end,
-                                unsigned endcount)
+                                unsigned regioninstrs)
 {
-  ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+  ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
 
   // For convenience remember the end of the liveness region.
   LiveRegionEnd =
     (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
+
+  SUPressureDiffs.clear();
+
+  SchedImpl->initPolicy(begin, end, regioninstrs);
+
+  ShouldTrackPressure = SchedImpl->shouldTrackPressure();
 }
 
 // Setup the register pressure trackers for the top scheduled top and bottom
@@ -483,9 +533,16 @@ void ScheduleDAGMI::initRegPressure() {
           dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
   };
 
+  // For each live out vreg reduce the pressure change associated with other
+  // uses of the same vreg below the live-out reaching def.
+  updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
+
   // Account for liveness generated by the region boundary.
-  if (LiveRegionEnd != RegionEnd)
-    BotRPTracker.recede();
+  if (LiveRegionEnd != RegionEnd) {
+    SmallVector<unsigned, 8> LiveUses;
+    BotRPTracker.recede(&LiveUses);
+    updatePressureDiffs(LiveUses);
+  }
 
   assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
 
@@ -500,34 +557,83 @@ void ScheduleDAGMI::initRegPressure() {
       DEBUG(dbgs() << TRI->getRegPressureSetName(i)
             << " Limit " << Limit
             << " Actual " << RegionPressure[i] << "\n");
-      RegionCriticalPSets.push_back(PressureElement(i, 0));
+      RegionCriticalPSets.push_back(PressureChange(i));
     }
   }
   DEBUG(dbgs() << "Excess PSets: ";
         for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
           dbgs() << TRI->getRegPressureSetName(
-            RegionCriticalPSets[i].PSetID) << " ";
+            RegionCriticalPSets[i].getPSet()) << " ";
         dbgs() << "\n");
 }
 
-// FIXME: When the pressure tracker deals in pressure differences then we won't
-// iterate over all RegionCriticalPSets[i].
 void ScheduleDAGMI::
-updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure) {
-  for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
-    unsigned ID = RegionCriticalPSets[i].PSetID;
-    int &MaxUnits = RegionCriticalPSets[i].UnitIncrease;
-    if ((int)NewMaxPressure[ID] > MaxUnits)
-      MaxUnits = NewMaxPressure[ID];
+updateScheduledPressure(const SUnit *SU,
+                        const std::vector<unsigned> &NewMaxPressure) {
+  const PressureDiff &PDiff = getPressureDiff(SU);
+  unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
+  for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end();
+       I != E; ++I) {
+    if (!I->isValid())
+      break;
+    unsigned ID = I->getPSet();
+    while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
+      ++CritIdx;
+    if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
+      if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
+          && NewMaxPressure[ID] <= INT16_MAX)
+        RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
+    }
+    unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
+    if (NewMaxPressure[ID] >= Limit - 2) {
+      DEBUG(dbgs() << "  " << TRI->getRegPressureSetName(ID) << ": "
+            << NewMaxPressure[ID] << " > " << Limit << "(+ "
+            << BotRPTracker.getLiveThru()[ID] << " livethru)\n");
+    }
   }
-  DEBUG(
-    for (unsigned i = 0, e = NewMaxPressure.size(); i < e; ++i) {
-      unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
-      if (NewMaxPressure[i] > Limit ) {
-        dbgs() << "  " << TRI->getRegPressureSetName(i) << ": "
-               << NewMaxPressure[i] << " > " << Limit << "\n";
+}
+
+/// Update the PressureDiff array for liveness after scheduling this
+/// instruction.
+void ScheduleDAGMI::updatePressureDiffs(ArrayRef<unsigned> LiveUses) {
+  for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) {
+    /// FIXME: Currently assuming single-use physregs.
+    unsigned Reg = LiveUses[LUIdx];
+    DEBUG(dbgs() << "  LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n");
+    if (!TRI->isVirtualRegister(Reg))
+      continue;
+
+    // This may be called before CurrentBottom has been initialized. However,
+    // BotRPTracker must have a valid position. We want the value live into the
+    // instruction or live out of the block, so ask for the previous
+    // instruction's live-out.
+    const LiveInterval &LI = LIS->getInterval(Reg);
+    VNInfo *VNI;
+    MachineBasicBlock::const_iterator I =
+      nextIfDebug(BotRPTracker.getPos(), BB->end());
+    if (I == BB->end())
+      VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+    else {
+      LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(I));
+      VNI = LRQ.valueIn();
+    }
+    // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
+    assert(VNI && "No live value at use.");
+    for (VReg2UseMap::iterator
+           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+      SUnit *SU = UI->SU;
+      DEBUG(dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") "
+            << *SU->getInstr());
+      // If this use comes before the reaching def, it cannot be a last use, so
+      // descrease its pressure change.
+      if (!SU->isScheduled && SU != &ExitSU) {
+        LiveQueryResult LRQ
+          = LI.Query(LIS->getInstructionIndex(SU->getInstr()));
+        if (LRQ.valueIn() == VNI)
+          getPressureDiff(SU).addPressureChange(Reg, true, &MRI);
       }
-    });
+    }
+  }
 }
 
 /// schedule - Called back from MachineScheduler::runOnMachineFunction
@@ -585,6 +691,13 @@ void ScheduleDAGMI::schedule() {
 
 /// Build the DAG and setup three register pressure trackers.
 void ScheduleDAGMI::buildDAGWithRegPressure() {
+  if (!ShouldTrackPressure) {
+    RPTracker.reset();
+    RegionCriticalPSets.clear();
+    buildSchedGraph(AA);
+    return;
+  }
+
   // Initialize the register pressure tracker used by buildSchedGraph.
   RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
                  /*TrackUntiedDefs=*/true);
@@ -594,7 +707,7 @@ void ScheduleDAGMI::buildDAGWithRegPressure() {
     RPTracker.recede();
 
   // Build the DAG, and compute current register pressure.
-  buildSchedGraph(AA, &RPTracker);
+  buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);
 
   // Initialize top/bottom trackers after computing region pressure.
   initRegPressure();
@@ -637,6 +750,91 @@ void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
   ExitSU.biasCriticalPath();
 }
 
+/// Compute the max cyclic critical path through the DAG. The scheduling DAG
+/// only provides the critical path for single block loops. To handle loops that
+/// span blocks, we could use the vreg path latencies provided by
+/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
+/// available for use in the scheduler.
+///
+/// The cyclic path estimation identifies a def-use pair that crosses the back
+/// edge and considers the depth and height of the nodes. For example, consider
+/// the following instruction sequence where each instruction has unit latency
+/// and defines an epomymous virtual register:
+///
+/// a->b(a,c)->c(b)->d(c)->exit
+///
+/// The cyclic critical path is a two cycles: b->c->b
+/// The acyclic critical path is four cycles: a->b->c->d->exit
+/// LiveOutHeight = height(c) = len(c->d->exit) = 2
+/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
+/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
+/// LiveInDepth = depth(b) = len(a->b) = 1
+///
+/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
+/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
+/// CyclicCriticalPath = min(2, 2) = 2
+unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
+  // This only applies to single block loop.
+  if (!BB->isSuccessor(BB))
+    return 0;
+
+  unsigned MaxCyclicLatency = 0;
+  // Visit each live out vreg def to find def/use pairs that cross iterations.
+  ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
+  for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
+       RI != RE; ++RI) {
+    unsigned Reg = *RI;
+    if (!TRI->isVirtualRegister(Reg))
+        continue;
+    const LiveInterval &LI = LIS->getInterval(Reg);
+    const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+    if (!DefVNI)
+      continue;
+
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
+    const SUnit *DefSU = getSUnit(DefMI);
+    if (!DefSU)
+      continue;
+
+    unsigned LiveOutHeight = DefSU->getHeight();
+    unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
+    // Visit all local users of the vreg def.
+    for (VReg2UseMap::iterator
+           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+      if (UI->SU == &ExitSU)
+        continue;
+
+      // Only consider uses of the phi.
+      LiveQueryResult LRQ =
+        LI.Query(LIS->getInstructionIndex(UI->SU->getInstr()));
+      if (!LRQ.valueIn()->isPHIDef())
+        continue;
+
+      // Assume that a path spanning two iterations is a cycle, which could
+      // overestimate in strange cases. This allows cyclic latency to be
+      // estimated as the minimum slack of the vreg's depth or height.
+      unsigned CyclicLatency = 0;
+      if (LiveOutDepth > UI->SU->getDepth())
+        CyclicLatency = LiveOutDepth - UI->SU->getDepth();
+
+      unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
+      if (LiveInHeight > LiveOutHeight) {
+        if (LiveInHeight - LiveOutHeight < CyclicLatency)
+          CyclicLatency = LiveInHeight - LiveOutHeight;
+      }
+      else
+        CyclicLatency = 0;
+
+      DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
+            << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
+      if (CyclicLatency > MaxCyclicLatency)
+        MaxCyclicLatency = CyclicLatency;
+    }
+  }
+  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
+  return MaxCyclicLatency;
+}
+
 /// Identify DAG roots and setup scheduler queues.
 void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
                                ArrayRef<SUnit*> BotRoots) {
@@ -664,11 +862,13 @@ void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
   SchedImpl->registerRoots();
 
   // Advance past initial DebugValues.
-  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
   CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
-  TopRPTracker.setPos(CurrentTop);
-
   CurrentBottom = RegionEnd;
+
+  if (ShouldTrackPressure) {
+    assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+    TopRPTracker.setPos(CurrentTop);
+  }
 }
 
 /// Move an instruction and update register pressure.
@@ -685,10 +885,12 @@ void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
       TopRPTracker.setPos(MI);
     }
 
-    // Update top scheduled pressure.
-    TopRPTracker.advance();
-    assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
-    updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+    if (ShouldTrackPressure) {
+      // Update top scheduled pressure.
+      TopRPTracker.advance();
+      assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+      updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
+    }
   }
   else {
     assert(SU->isBottomReady() && "node still has unscheduled dependencies");
@@ -704,10 +906,14 @@ void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
       moveInstruction(MI, CurrentBottom);
       CurrentBottom = MI;
     }
-    // Update bottom scheduled pressure.
-    BotRPTracker.recede();
-    assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
-    updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
+    if (ShouldTrackPressure) {
+      // Update bottom scheduled pressure.
+      SmallVector<unsigned, 8> LiveUses;
+      BotRPTracker.recede(&LiveUses);
+      assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+      updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
+      updatePressureDiffs(LiveUses);
+    }
   }
 }
 
@@ -1113,13 +1319,13 @@ void CopyConstrain::apply(ScheduleDAGMI *DAG) {
 }
 
 //===----------------------------------------------------------------------===//
-// ConvergingScheduler - Implementation of the generic MachineSchedStrategy.
+// GenericScheduler - Implementation of the generic MachineSchedStrategy.
 //===----------------------------------------------------------------------===//
 
 namespace {
-/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
+/// GenericScheduler shrinks the unscheduled zone using heuristics to balance
 /// the schedule.
-class ConvergingScheduler : public MachineSchedStrategy {
+class GenericScheduler : public MachineSchedStrategy {
 public:
   /// Represent the type of SchedCandidate found within a single queue.
   /// pickNodeBidirectional depends on these listed by decreasing priority.
@@ -1129,7 +1335,7 @@ public:
     TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
 
 #ifndef NDEBUG
-  static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
+  static const char *getReasonStr(GenericScheduler::CandReason Reason);
 #endif
 
   /// Policy for scheduling the next instruction in the candidate's zone.
@@ -1160,7 +1366,7 @@ public:
     }
   };
 
-  /// Store the state used by ConvergingScheduler heuristics, required for the
+  /// Store the state used by GenericScheduler heuristics, required for the
   /// lifetime of one invocation of pickNode().
   struct SchedCandidate {
     CandPolicy Policy;
@@ -1205,16 +1411,21 @@ public:
   struct SchedRemainder {
     // Critical path through the DAG in expected latency.
     unsigned CriticalPath;
+    unsigned CyclicCritPath;
 
     // Scaled count of micro-ops left to schedule.
     unsigned RemIssueCount;
 
+    bool IsAcyclicLatencyLimited;
+
     // Unscheduled resources
     SmallVector<unsigned, 16> RemainingCounts;
 
     void reset() {
       CriticalPath = 0;
+      CyclicCritPath = 0;
       RemIssueCount = 0;
+      IsAcyclicLatencyLimited = false;
       RemainingCounts.clear();
     }
 
@@ -1288,13 +1499,16 @@ public:
 
     void reset() {
       // A new HazardRec is created for each DAG and owned by SchedBoundary.
-      delete HazardRec;
-
+      // Destroying and reconstructing it is very expensive though. So keep
+      // invalid, placeholder HazardRecs.
+      if (HazardRec && HazardRec->isEnabled()) {
+        delete HazardRec;
+        HazardRec = 0;
+      }
       Available.clear();
       Pending.clear();
       CheckPending = false;
       NextSUs.clear();
-      HazardRec = 0;
       CurrCycle = 0;
       CurrMOps = 0;
       MinReadyCycle = UINT_MAX;
@@ -1316,7 +1530,7 @@ public:
     /// PendingFlag set.
     SchedBoundary(unsigned ID, const Twine &Name):
       DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
-      Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
+      Pending(ID << GenericScheduler::LogMaxQID, Name+".P"),
       HazardRec(0) {
       reset();
     }
@@ -1327,7 +1541,7 @@ public:
               SchedRemainder *rem);
 
     bool isTop() const {
-      return Available.getID() == ConvergingScheduler::TopQID;
+      return Available.getID() == GenericScheduler::TopQID;
     }
 
 #ifndef NDEBUG
@@ -1399,6 +1613,7 @@ public:
   };
 
 private:
+  const MachineSchedContext *Context;
   ScheduleDAGMI *DAG;
   const TargetSchedModel *SchedModel;
   const TargetRegisterInfo *TRI;
@@ -1408,6 +1623,7 @@ private:
   SchedBoundary Top;
   SchedBoundary Bot;
 
+  MachineSchedPolicy RegionPolicy;
 public:
   /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
   enum {
@@ -1416,8 +1632,15 @@ public:
     LogMaxQID = 2
   };
 
-  ConvergingScheduler():
-    DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+  GenericScheduler(const MachineSchedContext *C):
+    Context(C), DAG(0), SchedModel(0), TRI(0),
+    Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+  virtual void initPolicy(MachineBasicBlock::iterator Begin,
+                          MachineBasicBlock::iterator End,
+                          unsigned NumRegionInstrs);
+
+  bool shouldTrackPressure() const { return RegionPolicy.ShouldTrackPressure; }
 
   virtual void initialize(ScheduleDAGMI *dag);
 
@@ -1432,6 +1655,8 @@ public:
   virtual void registerRoots();
 
 protected:
+  void checkAcyclicLatency();
+
   void tryCandidate(SchedCandidate &Cand,
                     SchedCandidate &TryCand,
                     SchedBoundary &Zone,
@@ -1452,7 +1677,7 @@ protected:
 };
 } // namespace
 
-void ConvergingScheduler::SchedRemainder::
+void GenericScheduler::SchedRemainder::
 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
   reset();
   if (!SchedModel->hasInstrSchedModel())
@@ -1473,7 +1698,7 @@ init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
   }
 }
 
-void ConvergingScheduler::SchedBoundary::
+void GenericScheduler::SchedBoundary::
 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
   reset();
   DAG = dag;
@@ -1483,7 +1708,49 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
     ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds());
 }
 
-void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
+/// Initialize the per-region scheduling policy.
+void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
+                                     MachineBasicBlock::iterator End,
+                                     unsigned NumRegionInstrs) {
+  const TargetMachine &TM = Context->MF->getTarget();
+
+  // Avoid setting up the register pressure tracker for small regions to save
+  // compile time. As a rough heuristic, only track pressure when the number of
+  // schedulable instructions exceeds half the integer register file.
+  unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
+    TM.getTargetLowering()->getRegClassFor(MVT::i32));
+
+  RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
+
+  // For generic targets, we default to bottom-up, because it's simpler and more
+  // compile-time optimizations have been implemented in that direction.
+  RegionPolicy.OnlyBottomUp = true;
+
+  // Allow the subtarget to override default policy.
+  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs);
+
+  // After subtarget overrides, apply command line options.
+  if (!EnableRegPressure)
+    RegionPolicy.ShouldTrackPressure = false;
+
+  // Check -misched-topdown/bottomup can force or unforce scheduling direction.
+  // e.g. -misched-bottomup=false allows scheduling in both directions.
+  assert((!ForceTopDown || !ForceBottomUp) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+  if (ForceBottomUp.getNumOccurrences() > 0) {
+    RegionPolicy.OnlyBottomUp = ForceBottomUp;
+    if (RegionPolicy.OnlyBottomUp)
+      RegionPolicy.OnlyTopDown = false;
+  }
+  if (ForceTopDown.getNumOccurrences() > 0) {
+    RegionPolicy.OnlyTopDown = ForceTopDown;
+    if (RegionPolicy.OnlyTopDown)
+      RegionPolicy.OnlyBottomUp = false;
+  }
+}
+
+void GenericScheduler::initialize(ScheduleDAGMI *dag) {
   DAG = dag;
   SchedModel = DAG->getSchedModel();
   TRI = DAG->TRI;
@@ -1498,14 +1765,17 @@ void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
   // are disabled, then these HazardRecs will be disabled.
   const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
   const TargetMachine &TM = DAG->MF.getTarget();
-  Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
-  Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
-
-  assert((!ForceTopDown || !ForceBottomUp) &&
-         "-misched-topdown incompatible with -misched-bottomup");
+  if (!Top.HazardRec) {
+    Top.HazardRec =
+      TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  }
+  if (!Bot.HazardRec) {
+    Bot.HazardRec =
+      TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  }
 }
 
-void ConvergingScheduler::releaseTopNode(SUnit *SU) {
+void GenericScheduler::releaseTopNode(SUnit *SU) {
   if (SU->isScheduled)
     return;
 
@@ -1524,7 +1794,7 @@ void ConvergingScheduler::releaseTopNode(SUnit *SU) {
   Top.releaseNode(SU, SU->TopReadyCycle);
 }
 
-void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
+void GenericScheduler::releaseBottomNode(SUnit *SU) {
   if (SU->isScheduled)
     return;
 
@@ -1545,8 +1815,46 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
   Bot.releaseNode(SU, SU->BotReadyCycle);
 }
 
-void ConvergingScheduler::registerRoots() {
+/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
+/// critical path by more cycles than it takes to drain the instruction buffer.
+/// We estimate an upper bounds on in-flight instructions as:
+///
+/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
+/// InFlightIterations = AcyclicPath / CyclesPerIteration
+/// InFlightResources = InFlightIterations * LoopResources
+///
+/// TODO: Check execution resources in addition to IssueCount.
+void GenericScheduler::checkAcyclicLatency() {
+  if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
+    return;
+
+  // Scaled number of cycles per loop iteration.
+  unsigned IterCount =
+    std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
+             Rem.RemIssueCount);
+  // Scaled acyclic critical path.
+  unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
+  // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
+  unsigned InFlightCount =
+    (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
+  unsigned BufferLimit =
+    SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
+
+  Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
+
+  DEBUG(dbgs() << "IssueCycles="
+        << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
+        << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
+        << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
+        << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
+        << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
+        if (Rem.IsAcyclicLatencyLimited)
+          dbgs() << "  ACYCLIC LATENCY LIMIT\n");
+}
+
+void GenericScheduler::registerRoots() {
   Rem.CriticalPath = DAG->ExitSU.getDepth();
+
   // Some roots may not feed into ExitSU. Check all of them in case.
   for (std::vector<SUnit*>::const_iterator
          I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
@@ -1554,6 +1862,11 @@ void ConvergingScheduler::registerRoots() {
       Rem.CriticalPath = (*I)->getDepth();
   }
   DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
+
+  if (EnableCyclicPath) {
+    Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
+    checkAcyclicLatency();
+  }
 }
 
 /// Does this SU have a hazard within the current instruction group.
@@ -1569,7 +1882,7 @@ void ConvergingScheduler::registerRoots() {
 /// can dispatch per cycle.
 ///
 /// TODO: Also check whether the SU must start a new group.
-bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+bool GenericScheduler::SchedBoundary::checkHazard(SUnit *SU) {
   if (HazardRec->isEnabled())
     return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
 
@@ -1583,7 +1896,7 @@ bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
 }
 
 // Find the unscheduled node in ReadySUs with the highest latency.
-unsigned ConvergingScheduler::SchedBoundary::
+unsigned GenericScheduler::SchedBoundary::
 findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
   SUnit *LateSU = 0;
   unsigned RemLatency = 0;
@@ -1605,7 +1918,7 @@ findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
 // Count resources in this zone and the remaining unscheduled
 // instruction. Return the max count, scaled. Set OtherCritIdx to the critical
 // resource index, or zero if the zone is issue limited.
-unsigned ConvergingScheduler::SchedBoundary::
+unsigned GenericScheduler::SchedBoundary::
 getOtherResourceCount(unsigned &OtherCritIdx) {
   OtherCritIdx = 0;
   if (!SchedModel->hasInstrSchedModel())
@@ -1633,7 +1946,7 @@ getOtherResourceCount(unsigned &OtherCritIdx) {
 
 /// Set the CandPolicy for this zone given the current resources and latencies
 /// inside and outside the zone.
-void ConvergingScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
+void GenericScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
                                                    SchedBoundary &OtherZone) {
   // Now that potential stalls have been considered, apply preemptive heuristics
   // based on the the total latency and resources inside and outside this
@@ -1692,7 +2005,7 @@ void ConvergingScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
     Policy.DemandResIdx = OtherCritIdx;
 }
 
-void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
+void GenericScheduler::SchedBoundary::releaseNode(SUnit *SU,
                                                      unsigned ReadyCycle) {
   if (ReadyCycle < MinReadyCycle)
     MinReadyCycle = ReadyCycle;
@@ -1710,7 +2023,7 @@ void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
 }
 
 /// Move the boundary of scheduled code by one cycle.
-void ConvergingScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) {
+void GenericScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) {
   if (SchedModel->getMicroOpBufferSize() == 0) {
     assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
     if (MinReadyCycle > NextCycle)
@@ -1748,7 +2061,7 @@ void ConvergingScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) {
   DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n');
 }
 
-void ConvergingScheduler::SchedBoundary::incExecutedResources(unsigned PIdx,
+void GenericScheduler::SchedBoundary::incExecutedResources(unsigned PIdx,
                                                               unsigned Count) {
   ExecutedResCounts[PIdx] += Count;
   if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
@@ -1762,7 +2075,7 @@ void ConvergingScheduler::SchedBoundary::incExecutedResources(unsigned PIdx,
 ///
 /// \return the next cycle at which the instruction may execute without
 /// oversubscribing resources.
-unsigned ConvergingScheduler::SchedBoundary::
+unsigned GenericScheduler::SchedBoundary::
 countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) {
   unsigned Factor = SchedModel->getResourceFactor(PIdx);
   unsigned Count = Factor * Cycles;
@@ -1787,7 +2100,7 @@ countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) {
 }
 
 /// Move the boundary of scheduled code by one SUnit.
-void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) {
   // Update the reservation table.
   if (HazardRec->isEnabled()) {
     if (!isTop() && SU->isCall) {
@@ -1891,7 +2204,7 @@ void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
 
 /// Release pending ready nodes in to the available queue. This makes them
 /// visible to heuristics.
-void ConvergingScheduler::SchedBoundary::releasePending() {
+void GenericScheduler::SchedBoundary::releasePending() {
   // If the available queue is empty, it is safe to reset MinReadyCycle.
   if (Available.empty())
     MinReadyCycle = UINT_MAX;
@@ -1921,7 +2234,7 @@ void ConvergingScheduler::SchedBoundary::releasePending() {
 }
 
 /// Remove SU from the ready set for this boundary.
-void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
+void GenericScheduler::SchedBoundary::removeReady(SUnit *SU) {
   if (Available.isInQueue(SU))
     Available.remove(Available.find(SU));
   else {
@@ -1933,7 +2246,7 @@ void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
 /// If this queue only has one ready candidate, return it. As a side effect,
 /// defer any nodes that now hit a hazard, and advance the cycle until at least
 /// one node is ready. If multiple instructions are ready, return NULL.
-SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
+SUnit *GenericScheduler::SchedBoundary::pickOnlyChoice() {
   if (CheckPending)
     releasePending();
 
@@ -1962,7 +2275,7 @@ SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
 #ifndef NDEBUG
 // This is useful information to dump after bumpNode.
 // Note that the Queue contents are more useful before pickNodeFromQueue.
-void ConvergingScheduler::SchedBoundary::dumpScheduledState() {
+void GenericScheduler::SchedBoundary::dumpScheduledState() {
   unsigned ResFactor;
   unsigned ResCount;
   if (ZoneCritResIdx) {
@@ -1985,7 +2298,7 @@ void ConvergingScheduler::SchedBoundary::dumpScheduledState() {
 }
 #endif
 
-void ConvergingScheduler::SchedCandidate::
+void GenericScheduler::SchedCandidate::
 initResourceDelta(const ScheduleDAGMI *DAG,
                   const TargetSchedModel *SchedModel) {
   if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
@@ -2005,9 +2318,9 @@ initResourceDelta(const ScheduleDAGMI *DAG,
 
 /// Return true if this heuristic determines order.
 static bool tryLess(int TryVal, int CandVal,
-                    ConvergingScheduler::SchedCandidate &TryCand,
-                    ConvergingScheduler::SchedCandidate &Cand,
-                    ConvergingScheduler::CandReason Reason) {
+                    GenericScheduler::SchedCandidate &TryCand,
+                    GenericScheduler::SchedCandidate &Cand,
+                    GenericScheduler::CandReason Reason) {
   if (TryVal < CandVal) {
     TryCand.Reason = Reason;
     return true;
@@ -2022,9 +2335,9 @@ static bool tryLess(int TryVal, int CandVal,
 }
 
 static bool tryGreater(int TryVal, int CandVal,
-                       ConvergingScheduler::SchedCandidate &TryCand,
-                       ConvergingScheduler::SchedCandidate &Cand,
-                       ConvergingScheduler::CandReason Reason) {
+                       GenericScheduler::SchedCandidate &TryCand,
+                       GenericScheduler::SchedCandidate &Cand,
+                       GenericScheduler::CandReason Reason) {
   if (TryVal > CandVal) {
     TryCand.Reason = Reason;
     return true;
@@ -2038,26 +2351,26 @@ static bool tryGreater(int TryVal, int CandVal,
   return false;
 }
 
-static bool tryPressure(const PressureElement &TryP,
-                        const PressureElement &CandP,
-                        ConvergingScheduler::SchedCandidate &TryCand,
-                        ConvergingScheduler::SchedCandidate &Cand,
-                        ConvergingScheduler::CandReason Reason) {
+static bool tryPressure(const PressureChange &TryP,
+                        const PressureChange &CandP,
+                        GenericScheduler::SchedCandidate &TryCand,
+                        GenericScheduler::SchedCandidate &Cand,
+                        GenericScheduler::CandReason Reason) {
+  int TryRank = TryP.getPSetOrMax();
+  int CandRank = CandP.getPSetOrMax();
   // If both candidates affect the same set, go with the smallest increase.
-  if (TryP.PSetID == CandP.PSetID) {
-    return tryLess(TryP.UnitIncrease, CandP.UnitIncrease, TryCand, Cand,
+  if (TryRank == CandRank) {
+    return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
                    Reason);
   }
   // If one candidate decreases and the other increases, go with it.
-  if (tryLess(TryP.UnitIncrease < 0, CandP.UnitIncrease < 0, TryCand, Cand,
+  // Invalid candidates have UnitInc==0.
+  if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
               Reason)) {
     return true;
   }
-  // If TryP has lower Rank, it has a higher priority.
-  int TryRank = TryP.PSetRank();
-  int CandRank = CandP.PSetRank();
   // If the candidates are decreasing pressure, reverse priority.
-  if (TryP.UnitIncrease < 0)
+  if (TryP.getUnitInc() < 0)
     std::swap(TryRank, CandRank);
   return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
 }
@@ -2094,6 +2407,32 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
   return 0;
 }
 
+static bool tryLatency(GenericScheduler::SchedCandidate &TryCand,
+                       GenericScheduler::SchedCandidate &Cand,
+                       GenericScheduler::SchedBoundary &Zone) {
+  if (Zone.isTop()) {
+    if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
+      if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                  TryCand, Cand, GenericScheduler::TopDepthReduce))
+        return true;
+    }
+    if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                   TryCand, Cand, GenericScheduler::TopPathReduce))
+      return true;
+  }
+  else {
+    if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
+      if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                  TryCand, Cand, GenericScheduler::BotHeightReduce))
+        return true;
+    }
+    if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                   TryCand, Cand, GenericScheduler::BotPathReduce))
+      return true;
+  }
+  return false;
+}
+
 /// Apply a set of heursitics to a new candidate. Heuristics are currently
 /// hierarchical. This may be more efficient than a graduated cost model because
 /// we don't need to evaluate all aspects of the model for each node in the
@@ -2105,16 +2444,44 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
 /// \param Zone describes the scheduled zone that we are extending.
 /// \param RPTracker describes reg pressure within the scheduled zone.
 /// \param TempTracker is a scratch pressure tracker to reuse in queries.
-void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
+void GenericScheduler::tryCandidate(SchedCandidate &Cand,
                                        SchedCandidate &TryCand,
                                        SchedBoundary &Zone,
                                        const RegPressureTracker &RPTracker,
                                        RegPressureTracker &TempTracker) {
 
-  // Always initialize TryCand's RPDelta.
-  TempTracker.getMaxPressureDelta(TryCand.SU->getInstr(), TryCand.RPDelta,
-                                  DAG->getRegionCriticalPSets(),
-                                  DAG->getRegPressure().MaxSetPressure);
+  if (DAG->isTrackingPressure()) {
+    // Always initialize TryCand's RPDelta.
+    if (Zone.isTop()) {
+      TempTracker.getMaxDownwardPressureDelta(
+        TryCand.SU->getInstr(),
+        TryCand.RPDelta,
+        DAG->getRegionCriticalPSets(),
+        DAG->getRegPressure().MaxSetPressure);
+    }
+    else {
+      if (VerifyScheduling) {
+        TempTracker.getMaxUpwardPressureDelta(
+          TryCand.SU->getInstr(),
+          &DAG->getPressureDiff(TryCand.SU),
+          TryCand.RPDelta,
+          DAG->getRegionCriticalPSets(),
+          DAG->getRegPressure().MaxSetPressure);
+      }
+      else {
+        RPTracker.getUpwardPressureDelta(
+          TryCand.SU->getInstr(),
+          DAG->getPressureDiff(TryCand.SU),
+          TryCand.RPDelta,
+          DAG->getRegionCriticalPSets(),
+          DAG->getRegPressure().MaxSetPressure);
+      }
+    }
+  }
+  DEBUG(if (TryCand.RPDelta.Excess.isValid())
+          dbgs() << "  SU(" << TryCand.SU->NodeNum << ") "
+                 << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet())
+                 << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n");
 
   // Initialize the candidate if needed.
   if (!Cand.isValid()) {
@@ -2129,13 +2496,22 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
 
   // Avoid exceeding the target's limit. If signed PSetID is negative, it is
   // invalid; convert it to INT_MAX to give it lowest priority.
-  if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
-                  RegExcess))
+  if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
+                                               Cand.RPDelta.Excess,
+                                               TryCand, Cand, RegExcess))
     return;
 
   // Avoid increasing the max critical pressure in the scheduled region.
-  if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
-                  TryCand, Cand, RegCritical))
+  if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
+                                               Cand.RPDelta.CriticalMax,
+                                               TryCand, Cand, RegCritical))
+    return;
+
+  // For loops that are acyclic path limited, aggressively schedule for latency.
+  // This can result in very long dependence chains scheduled in sequence, so
+  // once every cycle (when CurrMOps == 0), switch to normal heuristics.
+  if (Rem.IsAcyclicLatencyLimited && !Zone.CurrMOps
+      && tryLatency(TryCand, Cand, Zone))
     return;
 
   // Keep clustered nodes together to encourage downstream peephole
@@ -2157,8 +2533,9 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
     return;
   }
   // Avoid increasing the max pressure of the entire region.
-  if (tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax,
-                  TryCand, Cand, RegMax))
+  if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
+                                               Cand.RPDelta.CurrentMax,
+                                               TryCand, Cand, RegMax))
     return;
 
   // Avoid critical resource consumption and balance the schedule.
@@ -2172,27 +2549,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
     return;
 
   // Avoid serializing long latency dependence chains.
-  if (Cand.Policy.ReduceLatency) {
-    if (Zone.isTop()) {
-      if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
-        if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
-                    TryCand, Cand, TopDepthReduce))
-          return;
-      }
-      if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
-                     TryCand, Cand, TopPathReduce))
-        return;
-    }
-    else {
-      if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
-        if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
-                    TryCand, Cand, BotHeightReduce))
-          return;
-      }
-      if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
-                     TryCand, Cand, BotPathReduce))
-        return;
-    }
+  // For acyclic path limited loops, latency was already checked above.
+  if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
+      && tryLatency(TryCand, Cand, Zone)) {
+    return;
   }
 
   // Prefer immediate defs/users of the last scheduled instruction. This is a
@@ -2210,8 +2570,8 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
 }
 
 #ifndef NDEBUG
-const char *ConvergingScheduler::getReasonStr(
-  ConvergingScheduler::CandReason Reason) {
+const char *GenericScheduler::getReasonStr(
+  GenericScheduler::CandReason Reason) {
   switch (Reason) {
   case NoCand:         return "NOCAND    ";
   case PhysRegCopy:    return "PREG-COPY";
@@ -2232,8 +2592,8 @@ const char *ConvergingScheduler::getReasonStr(
   llvm_unreachable("Unknown reason!");
 }
 
-void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
-  PressureElement P;
+void GenericScheduler::traceCandidate(const SchedCandidate &Cand) {
+  PressureChange P;
   unsigned ResIdx = 0;
   unsigned Latency = 0;
   switch (Cand.Reason) {
@@ -2269,8 +2629,8 @@ void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
   }
   dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
   if (P.isValid())
-    dbgs() << " " << TRI->getRegPressureSetName(P.PSetID)
-           << ":" << P.UnitIncrease << " ";
+    dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
+           << ":" << P.getUnitInc() << " ";
   else
     dbgs() << "      ";
   if (ResIdx)
@@ -2285,12 +2645,12 @@ void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
 }
 #endif
 
-/// Pick the best candidate from the top queue.
+/// Pick the best candidate from the queue.
 ///
 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
 /// DAG building. To adjust for the current scheduling location we need to
 /// maintain the number of vreg uses remaining to be top-scheduled.
-void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
                                             const RegPressureTracker &RPTracker,
                                             SchedCandidate &Cand) {
   ReadyQueue &Q = Zone.Available;
@@ -2315,14 +2675,14 @@ void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
   }
 }
 
-static void tracePick(const ConvergingScheduler::SchedCandidate &Cand,
+static void tracePick(const GenericScheduler::SchedCandidate &Cand,
                       bool IsTop) {
   DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
-        << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n');
+        << GenericScheduler::getReasonStr(Cand.Reason) << '\n');
 }
 
 /// Pick the best candidate node from either the top or bottom queue.
-SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
   // Schedule as far as possible in the direction of no choice. This is most
   // efficient, but also provides the best heuristics for CriticalPSets.
   if (SUnit *SU = Bot.pickOnlyChoice()) {
@@ -2377,7 +2737,7 @@ SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
 }
 
 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
-SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
+SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
   if (DAG->top() == DAG->bottom()) {
     assert(Top.Available.empty() && Top.Pending.empty() &&
            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
@@ -2385,24 +2745,26 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
   }
   SUnit *SU;
   do {
-    if (ForceTopDown) {
+    if (RegionPolicy.OnlyTopDown) {
       SU = Top.pickOnlyChoice();
       if (!SU) {
         CandPolicy NoPolicy;
         SchedCandidate TopCand(NoPolicy);
         pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
-        assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+        assert(TopCand.Reason != NoCand && "failed to find a candidate");
+        tracePick(TopCand, true);
         SU = TopCand.SU;
       }
       IsTopNode = true;
     }
-    else if (ForceBottomUp) {
+    else if (RegionPolicy.OnlyBottomUp) {
       SU = Bot.pickOnlyChoice();
       if (!SU) {
         CandPolicy NoPolicy;
         SchedCandidate BotCand(NoPolicy);
         pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
-        assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+        assert(BotCand.Reason != NoCand && "failed to find a candidate");
+        tracePick(BotCand, false);
         SU = BotCand.SU;
       }
       IsTopNode = false;
@@ -2421,7 +2783,7 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
   return SU;
 }
 
-void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
+void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
 
   MachineBasicBlock::iterator InsertPos = SU->getInstr();
   if (!isTop)
@@ -2452,7 +2814,7 @@ void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
 ///
 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
 /// them here. See comments in biasPhysRegCopy.
-void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
   if (IsTopNode) {
     SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.CurrCycle);
     Top.bumpNode(SU);
@@ -2469,25 +2831,23 @@ void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
 
 /// Create the standard converging machine scheduler. This will be used as the
 /// default scheduler if the target does not set a default.
-static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
-  assert((!ForceTopDown || !ForceBottomUp) &&
-         "-misched-topdown incompatible with -misched-bottomup");
-  ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
+static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C) {
+  ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new GenericScheduler(C));
   // Register DAG post-processors.
   //
   // FIXME: extend the mutation API to allow earlier mutations to instantiate
   // data and pass it to later mutations. Have a single mutation that gathers
   // the interesting nodes in one pass.
   DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI));
-  if (EnableLoadCluster)
+  if (EnableLoadCluster && DAG->TII->enableClusterLoads())
     DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
   if (EnableMacroFusion)
     DAG->addMutation(new MacroFusion(DAG->TII));
   return DAG;
 }
 static MachineSchedRegistry
-ConvergingSchedRegistry("converge", "Standard converging scheduler.",
-                        createConvergingSched);
+GenericSchedRegistry("converge", "Standard converging scheduler.",
+                     createGenericSched);
 
 //===----------------------------------------------------------------------===//
 // ILP Scheduler. Currently for experimental analysis of heuristics.
@@ -2529,15 +2889,6 @@ struct ILPOrder {
 
 /// \brief Schedule based on the ILP metric.
 class ILPScheduler : public MachineSchedStrategy {
-  /// In case all subtrees are eventually connected to a common root through
-  /// data dependence (e.g. reduction), place an upper limit on their size.
-  ///
-  /// FIXME: A subtree limit is generally good, but in the situation commented
-  /// above, where multiple similar subtrees feed a common root, we should
-  /// only split at a point where the resulting subtrees will be balanced.
-  /// (a motivating test case must be found).
-  static const unsigned SubtreeLimit = 16;
-
   ScheduleDAGMI *DAG;
   ILPOrder Cmp;
 
@@ -2721,7 +3072,7 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
   }
 
   static bool isNodeHidden(const SUnit *Node) {
-    return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+    return (Node->Preds.size() > 10 || Node->Succs.size() > 10);
   }
 
   static bool hasNodeAddressLabel(const SUnit *Node,
@@ -2744,7 +3095,11 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
   static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
     std::string Str;
     raw_string_ostream SS(Str);
-    SS << "SU(" << SU->NodeNum << ')';
+    const SchedDFSResult *DFS =
+      static_cast<const ScheduleDAGMI*>(G)->getDFSResult();
+    SS << "SU:" << SU->NodeNum;
+    if (DFS)
+      SS << " I:" << DFS->getNumInstrs(SU);
     return SS.str();
   }
   static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index dacdbdd..105d7c2 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -308,12 +308,29 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr *MI,
   // to be sunk then it's probably worth it.
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg()) continue;
+    if (!MO.isReg() || !MO.isUse())
+      continue;
     unsigned Reg = MO.getReg();
-    if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg))
+    if (Reg == 0)
       continue;
-    if (MRI->hasOneNonDBGUse(Reg))
-      return true;
+
+    // We don't move live definitions of physical registers,
+    // so sinking their uses won't enable any opportunities.
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+
+    // If this instruction is the only user of a virtual register,
+    // check if breaking the edge will enable sinking
+    // both this instruction and the defining instruction.
+    if (MRI->hasOneNonDBGUse(Reg)) {
+      // If the definition resides in same MBB,
+      // claim it's likely we can sink these together.
+      // If definition resides elsewhere, we aren't
+      // blocking it from being sunk so don't break the edge.
+      MachineInstr *DefMI = MRI->getVRegDef(Reg);
+      if (DefMI->getParent() == MI->getParent())
+        return true;
+    }
   }
 
   return false;
@@ -615,9 +632,8 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
 
   DEBUG(dbgs() << "Sink instr " << *MI << "\tinto block " << *SuccToSinkTo);
 
-  // If the block has multiple predecessors, this would introduce computation on
-  // a path that it doesn't already exist.  We could split the critical edge,
-  // but for now we just punt.
+  // If the block has multiple predecessors, this is a critical edge.
+  // Decide if we can sink along it or need to break the edge.
   if (SuccToSinkTo->pred_size() > 1) {
     // We cannot sink a load across a critical edge - there may be stores in
     // other code paths.
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index e74bfc8..d61470c 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -213,6 +213,10 @@ namespace {
                 const LiveInterval &LI);
     void report(const char *msg, const MachineBasicBlock *MBB,
                 const LiveInterval &LI);
+    void report(const char *msg, const MachineFunction *MF,
+                const LiveRange &LR);
+    void report(const char *msg, const MachineBasicBlock *MBB,
+                const LiveRange &LR);
 
     void verifyInlineAsm(const MachineInstr *MI);
 
@@ -225,9 +229,10 @@ namespace {
     void verifyLiveVariables();
     void verifyLiveIntervals();
     void verifyLiveInterval(const LiveInterval&);
-    void verifyLiveIntervalValue(const LiveInterval&, VNInfo*);
-    void verifyLiveIntervalSegment(const LiveInterval&,
-                                   LiveInterval::const_iterator);
+    void verifyLiveRangeValue(const LiveRange&, const VNInfo*, unsigned);
+    void verifyLiveRangeSegment(const LiveRange&,
+                                const LiveRange::const_iterator I, unsigned);
+    void verifyLiveRange(const LiveRange&, unsigned);
 
     void verifyStackFrame();
   };
@@ -414,23 +419,25 @@ void MachineVerifier::report(const char *msg,
 void MachineVerifier::report(const char *msg, const MachineFunction *MF,
                              const LiveInterval &LI) {
   report(msg, MF);
-  *OS << "- interval:    ";
-  if (TargetRegisterInfo::isVirtualRegister(LI.reg))
-    *OS << PrintReg(LI.reg, TRI);
-  else
-    *OS << PrintRegUnit(LI.reg, TRI);
-  *OS << ' ' << LI << '\n';
+  *OS << "- interval:    " << LI << '\n';
 }
 
 void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
                              const LiveInterval &LI) {
   report(msg, MBB);
-  *OS << "- interval:    ";
-  if (TargetRegisterInfo::isVirtualRegister(LI.reg))
-    *OS << PrintReg(LI.reg, TRI);
-  else
-    *OS << PrintRegUnit(LI.reg, TRI);
-  *OS << ' ' << LI << '\n';
+  *OS << "- interval:    " << LI << '\n';
+}
+
+void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
+                             const LiveRange &LR) {
+  report(msg, MBB);
+  *OS << "- liverange:    " << LR << "\n";
+}
+
+void MachineVerifier::report(const char *msg, const MachineFunction *MF,
+                             const LiveRange &LR) {
+  report(msg, MF);
+  *OS << "- liverange:    " << LR << "\n";
 }
 
 void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
@@ -768,7 +775,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
   if (MI->getNumOperands() < MCID.getNumOperands()) {
     report("Too few operands", MI);
     *OS << MCID.getNumOperands() << " operands expected, but "
-        << MI->getNumExplicitOperands() << " given.\n";
+        << MI->getNumOperands() << " given.\n";
   }
 
   // Check the tied operands.
@@ -826,7 +833,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
     if (MO->isReg() &&
         !(MI->isVariadic() && MONum == MCID.getNumOperands()-1)) {
       if (MO->isDef() && !MCOI.isOptionalDef())
-          report("Explicit operand marked as def", MO, MONum);
+        report("Explicit operand marked as def", MO, MONum);
       if (MO->isImplicit())
         report("Explicit operand marked as implicit", MO, MONum);
     }
@@ -1001,16 +1008,16 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
       // Check the cached regunit intervals.
       if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isReserved(Reg)) {
         for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
-          if (const LiveInterval *LI = LiveInts->getCachedRegUnit(*Units)) {
-            LiveRangeQuery LRQ(*LI, UseIdx);
+          if (const LiveRange *LR = LiveInts->getCachedRegUnit(*Units)) {
+            LiveQueryResult LRQ = LR->Query(UseIdx);
             if (!LRQ.valueIn()) {
-              report("No live range at use", MO, MONum);
+              report("No live segment at use", MO, MONum);
               *OS << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI)
-                  << ' ' << *LI << '\n';
+                  << ' ' << *LR << '\n';
             }
             if (MO->isKill() && !LRQ.isKill()) {
               report("Live range continues after kill flag", MO, MONum);
-              *OS << PrintRegUnit(*Units, TRI) << ' ' << *LI << '\n';
+              *OS << PrintRegUnit(*Units, TRI) << ' ' << *LR << '\n';
             }
           }
         }
@@ -1020,9 +1027,9 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
         if (LiveInts->hasInterval(Reg)) {
           // This is a virtual register interval.
           const LiveInterval &LI = LiveInts->getInterval(Reg);
-          LiveRangeQuery LRQ(LI, UseIdx);
+          LiveQueryResult LRQ = LI.Query(UseIdx);
           if (!LRQ.valueIn()) {
-            report("No live range at use", MO, MONum);
+            report("No live segment at use", MO, MONum);
             *OS << UseIdx << " is not live in " << LI << '\n';
           }
           // Check for extra kill flags.
@@ -1071,7 +1078,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
         llvm::next(MRI->def_begin(Reg)) != MRI->def_end())
       report("Multiple virtual register defs in SSA form", MO, MONum);
 
-    // Check LiveInts for a live range, but only for virtual registers.
+    // Check LiveInts for a live segment, but only for virtual registers.
     if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) &&
         !LiveInts->isNotInMIMap(MI)) {
       SlotIndex DefIdx = LiveInts->getInstructionIndex(MI);
@@ -1086,9 +1093,17 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
               << DefIdx << " in " << LI << '\n';
           }
         } else {
-          report("No live range at def", MO, MONum);
+          report("No live segment at def", MO, MONum);
           *OS << DefIdx << " is not live in " << LI << '\n';
         }
+        // Check that, if the dead def flag is present, LiveInts agree.
+        if (MO->isDead()) {
+          LiveQueryResult LRQ = LI.Query(DefIdx);
+          if (!LRQ.isDeadDef()) {
+            report("Live range continues after dead def flag", MO, MONum);
+            *OS << "Live range: " << LI << '\n';
+          }
+        }
       } else {
         report("Virtual register has no Live interval", MO, MONum);
       }
@@ -1335,25 +1350,26 @@ void MachineVerifier::verifyLiveIntervals() {
 
   // Verify all the cached regunit intervals.
   for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
-    if (const LiveInterval *LI = LiveInts->getCachedRegUnit(i))
-      verifyLiveInterval(*LI);
+    if (const LiveRange *LR = LiveInts->getCachedRegUnit(i))
+      verifyLiveRange(*LR, i);
 }
 
-void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
-                                              VNInfo *VNI) {
+void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
+                                           const VNInfo *VNI,
+                                           unsigned Reg) {
   if (VNI->isUnused())
     return;
 
-  const VNInfo *DefVNI = LI.getVNInfoAt(VNI->def);
+  const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def);
 
   if (!DefVNI) {
-    report("Valno not live at def and not marked unused", MF, LI);
+    report("Valno not live at def and not marked unused", MF, LR);
     *OS << "Valno #" << VNI->id << '\n';
     return;
   }
 
   if (DefVNI != VNI) {
-    report("Live range at def has different valno", MF, LI);
+    report("Live segment at def has different valno", MF, LR);
     *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
         << " where valno #" << DefVNI->id << " is live\n";
     return;
@@ -1361,15 +1377,15 @@ void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
 
   const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
   if (!MBB) {
-    report("Invalid definition index", MF, LI);
+    report("Invalid definition index", MF, LR);
     *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
-        << " in " << LI << '\n';
+        << " in " << LR << '\n';
     return;
   }
 
   if (VNI->isPHIDef()) {
     if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
-      report("PHIDef value is not defined at MBB start", MBB, LI);
+      report("PHIDef value is not defined at MBB start", MBB, LR);
       *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
           << ", not at the beginning of BB#" << MBB->getNumber() << '\n';
     }
@@ -1379,161 +1395,154 @@ void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
   // Non-PHI def.
   const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
   if (!MI) {
-    report("No instruction at def index", MBB, LI);
+    report("No instruction at def index", MBB, LR);
     *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
     return;
   }
 
-  bool hasDef = false;
-  bool isEarlyClobber = false;
-  for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
-    if (!MOI->isReg() || !MOI->isDef())
-      continue;
-    if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
-      if (MOI->getReg() != LI.reg)
-        continue;
-    } else {
-      if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
-          !TRI->hasRegUnit(MOI->getReg(), LI.reg))
+  if (Reg != 0) {
+    bool hasDef = false;
+    bool isEarlyClobber = false;
+    for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
+      if (!MOI->isReg() || !MOI->isDef())
         continue;
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        if (MOI->getReg() != Reg)
+          continue;
+      } else {
+        if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
+            !TRI->hasRegUnit(MOI->getReg(), Reg))
+          continue;
+      }
+      hasDef = true;
+      if (MOI->isEarlyClobber())
+        isEarlyClobber = true;
     }
-    hasDef = true;
-    if (MOI->isEarlyClobber())
-      isEarlyClobber = true;
-  }
 
-  if (!hasDef) {
-    report("Defining instruction does not modify register", MI);
-    *OS << "Valno #" << VNI->id << " in " << LI << '\n';
-  }
+    if (!hasDef) {
+      report("Defining instruction does not modify register", MI);
+      *OS << "Valno #" << VNI->id << " in " << LR << '\n';
+    }
 
-  // Early clobber defs begin at USE slots, but other defs must begin at
-  // DEF slots.
-  if (isEarlyClobber) {
-    if (!VNI->def.isEarlyClobber()) {
-      report("Early clobber def must be at an early-clobber slot", MBB, LI);
+    // Early clobber defs begin at USE slots, but other defs must begin at
+    // DEF slots.
+    if (isEarlyClobber) {
+      if (!VNI->def.isEarlyClobber()) {
+        report("Early clobber def must be at an early-clobber slot", MBB, LR);
+        *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+      }
+    } else if (!VNI->def.isRegister()) {
+      report("Non-PHI, non-early clobber def must be at a register slot",
+             MBB, LR);
       *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
     }
-  } else if (!VNI->def.isRegister()) {
-    report("Non-PHI, non-early clobber def must be at a register slot",
-           MBB, LI);
-    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
   }
 }
 
-void
-MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
-                                           LiveInterval::const_iterator I) {
-  const VNInfo *VNI = I->valno;
-  assert(VNI && "Live range has no valno");
-
-  if (VNI->id >= LI.getNumValNums() || VNI != LI.getValNumInfo(VNI->id)) {
-    report("Foreign valno in live range", MF, LI);
-    *OS << *I << " has a bad valno\n";
+void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
+                                             const LiveRange::const_iterator I,
+                                             unsigned Reg) {
+  const LiveRange::Segment &S = *I;
+  const VNInfo *VNI = S.valno;
+  assert(VNI && "Live segment has no valno");
+
+  if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) {
+    report("Foreign valno in live segment", MF, LR);
+    *OS << S << " has a bad valno\n";
   }
 
   if (VNI->isUnused()) {
-    report("Live range valno is marked unused", MF, LI);
-    *OS << *I << '\n';
+    report("Live segment valno is marked unused", MF, LR);
+    *OS << S << '\n';
   }
 
-  const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(I->start);
+  const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start);
   if (!MBB) {
-    report("Bad start of live segment, no basic block", MF, LI);
-    *OS << *I << '\n';
+    report("Bad start of live segment, no basic block", MF, LR);
+    *OS << S << '\n';
     return;
   }
   SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
-  if (I->start != MBBStartIdx && I->start != VNI->def) {
-    report("Live segment must begin at MBB entry or valno def", MBB, LI);
-    *OS << *I << '\n';
+  if (S.start != MBBStartIdx && S.start != VNI->def) {
+    report("Live segment must begin at MBB entry or valno def", MBB, LR);
+    *OS << S << '\n';
   }
 
   const MachineBasicBlock *EndMBB =
-    LiveInts->getMBBFromIndex(I->end.getPrevSlot());
+    LiveInts->getMBBFromIndex(S.end.getPrevSlot());
   if (!EndMBB) {
-    report("Bad end of live segment, no basic block", MF, LI);
-    *OS << *I << '\n';
+    report("Bad end of live segment, no basic block", MF, LR);
+    *OS << S << '\n';
     return;
   }
 
   // No more checks for live-out segments.
-  if (I->end == LiveInts->getMBBEndIdx(EndMBB))
+  if (S.end == LiveInts->getMBBEndIdx(EndMBB))
     return;
 
   // RegUnit intervals are allowed dead phis.
-  if (!TargetRegisterInfo::isVirtualRegister(LI.reg) && VNI->isPHIDef() &&
-      I->start == VNI->def && I->end == VNI->def.getDeadSlot())
+  if (!TargetRegisterInfo::isVirtualRegister(Reg) && VNI->isPHIDef() &&
+      S.start == VNI->def && S.end == VNI->def.getDeadSlot())
     return;
 
   // The live segment is ending inside EndMBB
   const MachineInstr *MI =
-    LiveInts->getInstructionFromIndex(I->end.getPrevSlot());
+    LiveInts->getInstructionFromIndex(S.end.getPrevSlot());
   if (!MI) {
-    report("Live segment doesn't end at a valid instruction", EndMBB, LI);
-    *OS << *I << '\n';
+    report("Live segment doesn't end at a valid instruction", EndMBB, LR);
+    *OS << S << '\n';
     return;
   }
 
   // The block slot must refer to a basic block boundary.
-  if (I->end.isBlock()) {
-    report("Live segment ends at B slot of an instruction", EndMBB, LI);
-    *OS << *I << '\n';
+  if (S.end.isBlock()) {
+    report("Live segment ends at B slot of an instruction", EndMBB, LR);
+    *OS << S << '\n';
   }
 
-  if (I->end.isDead()) {
+  if (S.end.isDead()) {
     // Segment ends on the dead slot.
     // That means there must be a dead def.
-    if (!SlotIndex::isSameInstr(I->start, I->end)) {
-      report("Live segment ending at dead slot spans instructions", EndMBB, LI);
-      *OS << *I << '\n';
+    if (!SlotIndex::isSameInstr(S.start, S.end)) {
+      report("Live segment ending at dead slot spans instructions", EndMBB, LR);
+      *OS << S << '\n';
     }
   }
 
   // A live segment can only end at an early-clobber slot if it is being
   // redefined by an early-clobber def.
-  if (I->end.isEarlyClobber()) {
-    if (I+1 == LI.end() || (I+1)->start != I->end) {
+  if (S.end.isEarlyClobber()) {
+    if (I+1 == LR.end() || (I+1)->start != S.end) {
       report("Live segment ending at early clobber slot must be "
-             "redefined by an EC def in the same instruction", EndMBB, LI);
-      *OS << *I << '\n';
+             "redefined by an EC def in the same instruction", EndMBB, LR);
+      *OS << S << '\n';
     }
   }
 
   // The following checks only apply to virtual registers. Physreg liveness
   // is too weird to check.
-  if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
-    // A live range can end with either a redefinition, a kill flag on a
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    // A live segment can end with either a redefinition, a kill flag on a
     // use, or a dead flag on a def.
     bool hasRead = false;
-    bool hasDeadDef = false;
     for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
-      if (!MOI->isReg() || MOI->getReg() != LI.reg)
+      if (!MOI->isReg() || MOI->getReg() != Reg)
         continue;
       if (MOI->readsReg())
         hasRead = true;
-      if (MOI->isDef() && MOI->isDead())
-        hasDeadDef = true;
     }
-
-    if (I->end.isDead()) {
-      if (!hasDeadDef) {
-        report("Instruction doesn't have a dead def operand", MI);
-        I->print(*OS);
-        *OS << " in " << LI << '\n';
-      }
-    } else {
+    if (!S.end.isDead()) {
       if (!hasRead) {
-        report("Instruction ending live range doesn't read the register", MI);
-        *OS << *I << " in " << LI << '\n';
+        report("Instruction ending live segment doesn't read the register", MI);
+        *OS << S << " in " << LR << '\n';
       }
     }
   }
 
   // Now check all the basic blocks in this live segment.
   MachineFunction::const_iterator MFI = MBB;
-  // Is this live range the beginning of a non-PHIDef VN?
-  if (I->start == VNI->def && !VNI->isPHIDef()) {
+  // Is this live segment the beginning of a non-PHIDef VN?
+  if (S.start == VNI->def && !VNI->isPHIDef()) {
     // Not live-in to any blocks.
     if (MBB == EndMBB)
       return;
@@ -1541,9 +1550,9 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
     ++MFI;
   }
   for (;;) {
-    assert(LiveInts->isLiveInToMBB(LI, MFI));
+    assert(LiveInts->isLiveInToMBB(LR, MFI));
     // We don't know how to track physregs into a landing pad.
-    if (!TargetRegisterInfo::isVirtualRegister(LI.reg) &&
+    if (!TargetRegisterInfo::isVirtualRegister(Reg) &&
         MFI->isLandingPad()) {
       if (&*MFI == EndMBB)
         break;
@@ -1559,11 +1568,11 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
     for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(),
          PE = MFI->pred_end(); PI != PE; ++PI) {
       SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI);
-      const VNInfo *PVNI = LI.getVNInfoBefore(PEnd);
+      const VNInfo *PVNI = LR.getVNInfoBefore(PEnd);
 
       // All predecessors must have a live-out value.
       if (!PVNI) {
-        report("Register not marked live out of predecessor", *PI, LI);
+        report("Register not marked live out of predecessor", *PI, LR);
         *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
             << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before "
             << PEnd << '\n';
@@ -1572,7 +1581,7 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
 
       // Only PHI-defs can take different predecessor values.
       if (!IsPHI && PVNI != VNI) {
-        report("Different value live out of predecessor", *PI, LI);
+        report("Different value live out of predecessor", *PI, LR);
         *OS << "Valno #" << PVNI->id << " live out of BB#"
             << (*PI)->getNumber() << '@' << PEnd
             << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber()
@@ -1585,13 +1594,17 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
   }
 }
 
-void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
-  for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
-       I!=E; ++I)
-    verifyLiveIntervalValue(LI, *I);
+void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg) {
+  for (LiveRange::const_vni_iterator I = LR.vni_begin(), E = LR.vni_end();
+       I != E; ++I)
+    verifyLiveRangeValue(LR, *I, Reg);
+
+  for (LiveRange::const_iterator I = LR.begin(), E = LR.end(); I != E; ++I)
+    verifyLiveRangeSegment(LR, I, Reg);
+}
 
-  for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I!=E; ++I)
-    verifyLiveIntervalSegment(LI, I);
+void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
+  verifyLiveRange(LI, LI.reg);
 
   // Check the LI only has one connected component.
   if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index bf23eca..dcd9072 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -313,14 +313,14 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
     if (IncomingReg) {
       // Add the region from the beginning of MBB to the copy instruction to
       // IncomingReg's live interval.
-      LiveInterval &IncomingLI = LIS->getOrCreateInterval(IncomingReg);
+      LiveInterval &IncomingLI = LIS->createEmptyInterval(IncomingReg);
       VNInfo *IncomingVNI = IncomingLI.getVNInfoAt(MBBStartIndex);
       if (!IncomingVNI)
         IncomingVNI = IncomingLI.getNextValue(MBBStartIndex,
                                               LIS->getVNInfoAllocator());
-      IncomingLI.addRange(LiveRange(MBBStartIndex,
-                                    DestCopyIndex.getRegSlot(),
-                                    IncomingVNI));
+      IncomingLI.addSegment(LiveInterval::Segment(MBBStartIndex,
+                                                  DestCopyIndex.getRegSlot(),
+                                                  IncomingVNI));
     }
 
     LiveInterval &DestLI = LIS->getInterval(DestReg);
@@ -332,14 +332,14 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
       // the copy instruction.
       VNInfo *OrigDestVNI = DestLI.getVNInfoAt(MBBStartIndex);
       assert(OrigDestVNI && "PHI destination should be live at block entry.");
-      DestLI.removeRange(MBBStartIndex, MBBStartIndex.getDeadSlot());
+      DestLI.removeSegment(MBBStartIndex, MBBStartIndex.getDeadSlot());
       DestLI.createDeadDef(DestCopyIndex.getRegSlot(),
                            LIS->getVNInfoAllocator());
       DestLI.removeValNo(OrigDestVNI);
     } else {
       // Otherwise, remove the region from the beginning of MBB to the copy
       // instruction from DestReg's live interval.
-      DestLI.removeRange(MBBStartIndex, DestCopyIndex.getRegSlot());
+      DestLI.removeSegment(MBBStartIndex, DestCopyIndex.getRegSlot());
       VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
       assert(DestVNI && "PHI destination should be live at its definition.");
       DestVNI->def = DestCopyIndex.getRegSlot();
@@ -460,7 +460,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
     if (LIS) {
       if (NewSrcInstr) {
         LIS->InsertMachineInstrInMaps(NewSrcInstr);
-        LIS->addLiveRangeToEndOfBlock(IncomingReg, NewSrcInstr);
+        LIS->addSegmentToEndOfBlock(IncomingReg, NewSrcInstr);
       }
 
       if (!SrcUndef &&
@@ -511,8 +511,8 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
                  "Cannot find kill instruction");
 
           SlotIndex LastUseIndex = LIS->getInstructionIndex(KillInst);
-          SrcLI.removeRange(LastUseIndex.getRegSlot(),
-                            LIS->getMBBEndIdx(&opBlock));
+          SrcLI.removeSegment(LastUseIndex.getRegSlot(),
+                              LIS->getMBBEndIdx(&opBlock));
         }
       }
     }
diff --git a/lib/CodeGen/PHIEliminationUtils.h b/lib/CodeGen/PHIEliminationUtils.h
index 9ac47fb..48234ae 100644
--- a/lib/CodeGen/PHIEliminationUtils.h
+++ b/lib/CodeGen/PHIEliminationUtils.h
@@ -1,4 +1,4 @@
-//=- PHIEliminationUtils.h - Helper functions for PHI elimination *- C++ -*--=//
+//=- PHIEliminationUtils.h - Helper functions for PHI elimination -*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index c0861c5..f4ffd03 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -58,8 +58,6 @@ OptimizeRegAlloc("optimize-regalloc", cl::Hidden,
 static cl::opt<cl::boolOrDefault>
 EnableMachineSched("enable-misched", cl::Hidden,
     cl::desc("Enable the machine instruction scheduling pass."));
-static cl::opt<bool> EnableStrongPHIElim("strong-phi-elim", cl::Hidden,
-    cl::desc("Use strong PHI elimination."));
 static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
     cl::Hidden,
     cl::desc("Disable Machine LICM"));
@@ -236,7 +234,7 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
 
   // Temporarily disable experimental passes.
   const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
-  if (!ST.enableMachineScheduler())
+  if (!ST.useMachineScheduler())
     disablePass(&MachineSchedulerID);
 }
 
@@ -675,24 +673,15 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
   // preferably fix the scavenger to not depend on them).
   addPass(&LiveVariablesID);
 
-  // Add passes that move from transformed SSA into conventional SSA. This is a
-  // "copy coalescing" problem.
-  //
-  if (!EnableStrongPHIElim) {
-    // Edge splitting is smarter with machine loop info.
-    addPass(&MachineLoopInfoID);
-    addPass(&PHIEliminationID);
-  }
+  // Edge splitting is smarter with machine loop info.
+  addPass(&MachineLoopInfoID);
+  addPass(&PHIEliminationID);
 
   // Eventually, we want to run LiveIntervals before PHI elimination.
   if (EarlyLiveIntervals)
     addPass(&LiveIntervalsID);
 
   addPass(&TwoAddressInstructionPassID);
-
-  if (EnableStrongPHIElim)
-    addPass(&StrongPHIEliminationID);
-
   addPass(&RegisterCoalescerID);
 
   // PreRA instruction scheduling.
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index a7439b5..28f2d2f 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -40,20 +40,30 @@
 //     If the branch instruction can use flag from "sub", then we can replace
 //     "sub" with "subs" and eliminate the "cmp" instruction.
 //
-// - Optimize Bitcast pairs:
-//
-//     v1 = bitcast v0
-//     v2 = bitcast v1
-//        = v2
-//   =>
-//     v1 = bitcast v0
-//        = v0
-//
 // - Optimize Loads:
 //
 //     Loads that can be folded into a later instruction. A load is foldable
 //     if it loads to virtual registers and the virtual register defined has 
 //     a single use.
+//
+// - Optimize Copies and Bitcast:
+//
+//     Rewrite copies and bitcasts to avoid cross register bank copies
+//     when possible.
+//     E.g., Consider the following example, where capital and lower
+//     letters denote different register file:
+//     b = copy A <-- cross-bank copy
+//     C = copy b <-- cross-bank copy
+//   =>
+//     b = copy A <-- cross-bank copy
+//     C = copy A <-- same-bank copy
+//
+//     E.g., for bitcast:
+//     b = bitcast A <-- cross-bank copy
+//     C = bitcast b <-- cross-bank copy
+//   =>
+//     b = bitcast A <-- cross-bank copy
+//     C = copy A    <-- same-bank copy
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "peephole-opt"
@@ -81,11 +91,11 @@ DisablePeephole("disable-peephole", cl::Hidden, cl::init(false),
                 cl::desc("Disable the peephole optimizer"));
 
 STATISTIC(NumReuse,      "Number of extension results reused");
-STATISTIC(NumBitcasts,   "Number of bitcasts eliminated");
 STATISTIC(NumCmps,       "Number of compares eliminated");
 STATISTIC(NumImmFold,    "Number of move immediate folded");
 STATISTIC(NumLoadFold,   "Number of loads folded");
 STATISTIC(NumSelects,    "Number of selects optimized");
+STATISTIC(NumCopiesBitcasts, "Number of copies/bitcasts optimized");
 
 namespace {
   class PeepholeOptimizer : public MachineFunctionPass {
@@ -112,11 +122,11 @@ namespace {
     }
 
   private:
-    bool optimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
                           SmallPtrSet<MachineInstr*, 8> &LocalMIs);
     bool optimizeSelect(MachineInstr *MI);
+    bool optimizeCopyOrBitcast(MachineInstr *MI);
     bool isMoveImmediate(MachineInstr *MI,
                          SmallSet<unsigned, 4> &ImmDefRegs,
                          DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
@@ -298,78 +308,6 @@ optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
   return Changed;
 }
 
-/// optimizeBitcastInstr - If the instruction is a bitcast instruction A that
-/// cannot be optimized away during isel (e.g. ARM::VMOVSR, which bitcast
-/// a value cross register classes), and the source is defined by another
-/// bitcast instruction B. And if the register class of source of B matches
-/// the register class of instruction A, then it is legal to replace all uses
-/// of the def of A with source of B. e.g.
-///   %vreg0<def> = VMOVSR %vreg1
-///   %vreg3<def> = VMOVRS %vreg0
-///   Replace all uses of vreg3 with vreg1.
-
-bool PeepholeOptimizer::optimizeBitcastInstr(MachineInstr *MI,
-                                             MachineBasicBlock *MBB) {
-  unsigned NumDefs = MI->getDesc().getNumDefs();
-  unsigned NumSrcs = MI->getDesc().getNumOperands() - NumDefs;
-  if (NumDefs != 1)
-    return false;
-
-  unsigned Def = 0;
-  unsigned Src = 0;
-  for (unsigned i = 0, e = NumDefs + NumSrcs; i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg())
-      continue;
-    unsigned Reg = MO.getReg();
-    if (!Reg)
-      continue;
-    if (MO.isDef())
-      Def = Reg;
-    else if (Src)
-      // Multiple sources?
-      return false;
-    else
-      Src = Reg;
-  }
-
-  assert(Def && Src && "Malformed bitcast instruction!");
-
-  MachineInstr *DefMI = MRI->getVRegDef(Src);
-  if (!DefMI || !DefMI->isBitcast())
-    return false;
-
-  unsigned SrcSrc = 0;
-  NumDefs = DefMI->getDesc().getNumDefs();
-  NumSrcs = DefMI->getDesc().getNumOperands() - NumDefs;
-  if (NumDefs != 1)
-    return false;
-  for (unsigned i = 0, e = NumDefs + NumSrcs; i != e; ++i) {
-    const MachineOperand &MO = DefMI->getOperand(i);
-    if (!MO.isReg() || MO.isDef())
-      continue;
-    unsigned Reg = MO.getReg();
-    if (!Reg)
-      continue;
-    if (!MO.isDef()) {
-      if (SrcSrc)
-        // Multiple sources?
-        return false;
-      else
-        SrcSrc = Reg;
-    }
-  }
-
-  if (MRI->getRegClass(SrcSrc) != MRI->getRegClass(Def))
-    return false;
-
-  MRI->replaceRegWith(Def, SrcSrc);
-  MRI->clearKillFlags(SrcSrc);
-  MI->eraseFromParent();
-  ++NumBitcasts;
-  return true;
-}
-
 /// optimizeCmpInstr - If the instruction is a compare and the previous
 /// instruction it's comparing against all ready sets (or could be modified to
 /// set) the same flag as the compare, then we can remove the comparison and use
@@ -411,6 +349,150 @@ bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI) {
   return true;
 }
 
+/// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
+/// share the same register file.
+static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
+                                  const TargetRegisterClass *DefRC,
+                                  unsigned DefSubReg,
+                                  const TargetRegisterClass *SrcRC,
+                                  unsigned SrcSubReg) {
+  // Same register class.
+  if (DefRC == SrcRC)
+    return true;
+
+  // Both operands are sub registers. Check if they share a register class.
+  unsigned SrcIdx, DefIdx;
+  if (SrcSubReg && DefSubReg)
+    return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
+                                      SrcIdx, DefIdx) != NULL;
+  // At most one of the register is a sub register, make it Src to avoid
+  // duplicating the test.
+  if (!SrcSubReg) {
+    std::swap(DefSubReg, SrcSubReg);
+    std::swap(DefRC, SrcRC);
+  }
+
+  // One of the register is a sub register, check if we can get a superclass.
+  if (SrcSubReg)
+    return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != NULL;
+  // Plain copy.
+  return TRI.getCommonSubClass(DefRC, SrcRC) != NULL;
+}
+
+/// \brief Get the index of the definition and source for \p Copy
+/// instruction.
+/// \pre Copy.isCopy() or Copy.isBitcast().
+/// \return True if the Copy instruction has only one register source
+/// and one register definition. Otherwise, \p DefIdx and \p SrcIdx
+/// are invalid.
+static bool getCopyOrBitcastDefUseIdx(const MachineInstr &Copy,
+                                      unsigned &DefIdx, unsigned &SrcIdx) {
+  assert((Copy.isCopy() || Copy.isBitcast()) && "Wrong operation type.");
+  if (Copy.isCopy()) {
+    // Copy instruction are supposed to be: Def = Src.
+     if (Copy.getDesc().getNumOperands() != 2)
+       return false;
+     DefIdx = 0;
+     SrcIdx = 1;
+     assert(Copy.getOperand(DefIdx).isDef() && "Use comes before def!");
+     return true;
+  }
+  // Bitcast case.
+  // Bitcasts with more than one def are not supported.
+  if (Copy.getDesc().getNumDefs() != 1)
+    return false;
+  // Initialize SrcIdx to an undefined operand.
+  SrcIdx = Copy.getDesc().getNumOperands();
+  for (unsigned OpIdx = 0, EndOpIdx = SrcIdx; OpIdx != EndOpIdx; ++OpIdx) {
+    const MachineOperand &MO = Copy.getOperand(OpIdx);
+    if (!MO.isReg() || !MO.getReg())
+      continue;
+    if (MO.isDef())
+      DefIdx = OpIdx;
+    else if (SrcIdx != EndOpIdx)
+      // Multiple sources?
+      return false;
+    SrcIdx = OpIdx;
+  }
+  return true;
+}
+
+/// \brief Optimize a copy or bitcast instruction to avoid cross
+/// register bank copy. The optimization looks through a chain of
+/// copies and try to find a source that has a compatible register
+/// class.
+/// Two register classes are considered to be compatible if they share
+/// the same register bank.
+/// New copies issued by this optimization are register allocator
+/// friendly. This optimization does not remove any copy as it may
+/// overconstraint the register allocator, but replaces some when
+/// possible.
+/// \pre \p MI is a Copy (MI->isCopy() is true)
+/// \return True, when \p MI has been optimized. In that case, \p MI has
+/// been removed from its parent.
+bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) {
+  unsigned DefIdx, SrcIdx;
+  if (!MI || !getCopyOrBitcastDefUseIdx(*MI, DefIdx, SrcIdx))
+    return false;
+
+  const MachineOperand &MODef = MI->getOperand(DefIdx);
+  assert(MODef.isReg() && "Copies must be between registers.");
+  unsigned Def = MODef.getReg();
+
+  if (TargetRegisterInfo::isPhysicalRegister(Def))
+    return false;
+
+  const TargetRegisterClass *DefRC = MRI->getRegClass(Def);
+  unsigned DefSubReg = MODef.getSubReg();
+
+  unsigned Src;
+  unsigned SrcSubReg;
+  bool ShouldRewrite = false;
+  MachineInstr *Copy = MI;
+  const TargetRegisterInfo &TRI = *TM->getRegisterInfo();
+
+  // Follow the chain of copies until we reach the top or find a
+  // more suitable source.
+  do {
+    unsigned CopyDefIdx, CopySrcIdx;
+    if (!getCopyOrBitcastDefUseIdx(*Copy, CopyDefIdx, CopySrcIdx))
+      break;
+    const MachineOperand &MO = Copy->getOperand(CopySrcIdx);
+    assert(MO.isReg() && "Copies must be between registers.");
+    Src = MO.getReg();
+
+    if (TargetRegisterInfo::isPhysicalRegister(Src))
+      break;
+
+    const TargetRegisterClass *SrcRC = MRI->getRegClass(Src);
+    SrcSubReg = MO.getSubReg();
+
+    // If this source does not incur a cross register bank copy, use it.
+    ShouldRewrite = shareSameRegisterFile(TRI, DefRC, DefSubReg, SrcRC,
+                                          SrcSubReg);
+    // Follow the chain of copies: get the definition of Src.
+    Copy = MRI->getVRegDef(Src);
+  } while (!ShouldRewrite && Copy && (Copy->isCopy() || Copy->isBitcast()));
+
+  // If we did not find a more suitable source, there is nothing to optimize.
+  if (!ShouldRewrite || Src == MI->getOperand(SrcIdx).getReg())
+    return false;
+
+  // Rewrite the copy to avoid a cross register bank penalty. 
+  unsigned NewVR = TargetRegisterInfo::isPhysicalRegister(Def) ? Def :
+    MRI->createVirtualRegister(DefRC);
+  MachineInstr *NewCopy = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                                  TII->get(TargetOpcode::COPY), NewVR)
+    .addReg(Src, 0, SrcSubReg);
+  NewCopy->getOperand(0).setSubReg(DefSubReg);
+
+  MRI->replaceRegWith(Def, NewVR);
+  MRI->clearKillFlags(NewVR);
+  MI->eraseFromParent();
+  ++NumCopiesBitcasts;
+  return true;
+}
+
 /// isLoadFoldable - Check whether MI is a candidate for folding into a later
 /// instruction. We only fold loads to virtual registers and the virtual
 /// register defined has a single use.
@@ -523,7 +605,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
       if (MI->mayStore() || MI->isCall())
         FoldAsLoadDefReg = 0;
 
-      if ((MI->isBitcast() && optimizeBitcastInstr(MI, MBB)) ||
+      if (((MI->isBitcast() || MI->isCopy()) && optimizeCopyOrBitcast(MI)) ||
           (MI->isCompare() && optimizeCmpInstr(MI, MBB)) ||
           (MI->isSelect() && optimizeSelect(MI))) {
         // MI is deleted.
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 27f5676..1afc1ec 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -127,6 +127,12 @@ namespace {
     /// The schedule. Null SUnit*'s represent noop instructions.
     std::vector<SUnit*> Sequence;
 
+    /// The index in BB of RegionEnd.
+    ///
+    /// This is the instruction number from the top of the current block, not
+    /// the SlotIndex. It is only used by the AntiDepBreaker.
+    unsigned EndIndex;
+
   public:
     SchedulePostRATDList(
       MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
@@ -141,11 +147,14 @@ namespace {
     ///
     void startBlock(MachineBasicBlock *BB);
 
+    // Set the index of RegionEnd within the current BB.
+    void setEndIndex(unsigned EndIdx) { EndIndex = EndIdx; }
+
     /// Initialize the scheduler state for the next scheduling region.
     virtual void enterRegion(MachineBasicBlock *bb,
                              MachineBasicBlock::iterator begin,
                              MachineBasicBlock::iterator end,
-                             unsigned endcount);
+                             unsigned regioninstrs);
 
     /// Notify that the scheduler has finished scheduling the current region.
     virtual void exitRegion();
@@ -197,7 +206,7 @@ SchedulePostRATDList::SchedulePostRATDList(
   TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
   SmallVectorImpl<const TargetRegisterClass*> &CriticalPathRCs)
   : ScheduleDAGInstrs(MF, MLI, MDT, /*IsPostRA=*/true), AA(AA),
-    LiveRegs(TRI->getNumRegs())
+    LiveRegs(TRI->getNumRegs()), EndIndex(0)
 {
   const TargetMachine &TM = MF.getTarget();
   const InstrItineraryData *InstrItins = TM.getInstrItineraryData();
@@ -223,8 +232,8 @@ SchedulePostRATDList::~SchedulePostRATDList() {
 void SchedulePostRATDList::enterRegion(MachineBasicBlock *bb,
                  MachineBasicBlock::iterator begin,
                  MachineBasicBlock::iterator end,
-                 unsigned endcount) {
-  ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+                 unsigned regioninstrs) {
+  ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
   Sequence.clear();
 }
 
@@ -312,20 +321,21 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
     unsigned Count = MBB->size(), CurrentCount = Count;
     for (MachineBasicBlock::iterator I = Current; I != MBB->begin(); ) {
       MachineInstr *MI = llvm::prior(I);
+      --Count;
       // Calls are not scheduling boundaries before register allocation, but
       // post-ra we don't gain anything by scheduling across calls since we
       // don't need to worry about register pressure.
       if (MI->isCall() || TII->isSchedulingBoundary(MI, MBB, Fn)) {
-        Scheduler.enterRegion(MBB, I, Current, CurrentCount);
+        Scheduler.enterRegion(MBB, I, Current, CurrentCount - Count);
+        Scheduler.setEndIndex(CurrentCount);
         Scheduler.schedule();
         Scheduler.exitRegion();
         Scheduler.EmitSchedule();
         Current = MI;
-        CurrentCount = Count - 1;
+        CurrentCount = Count;
         Scheduler.Observe(MI, CurrentCount);
       }
       I = MI;
-      --Count;
       if (MI->isBundle())
         Count -= MI->getBundleSize();
     }
@@ -333,6 +343,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
     assert((MBB->begin() == Current || CurrentCount != 0) &&
            "Instruction count mismatch!");
     Scheduler.enterRegion(MBB, MBB->begin(), Current, CurrentCount);
+    Scheduler.setEndIndex(CurrentCount);
     Scheduler.schedule();
     Scheduler.exitRegion();
     Scheduler.EmitSchedule();
@@ -504,11 +515,11 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
 
     // Examine all used registers and set/clear kill flag. When a
     // register is used multiple times we only set the kill flag on
-    // the first use.
+    // the first use. Don't set kill flags on undef operands.
     killedRegs.reset();
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
       MachineOperand &MO = MI->getOperand(i);
-      if (!MO.isReg() || !MO.isUse()) continue;
+      if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
       unsigned Reg = MO.getReg();
       if ((Reg == 0) || MRI.isReserved(Reg)) continue;
 
diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index e4e18c3..0c5173a 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -78,7 +78,7 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
   unsigned Reg = MI->getOperand(0).getReg();
 
   if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-    // For virtual regiusters, mark all uses as <undef>, and convert users to
+    // For virtual registers, mark all uses as <undef>, and convert users to
     // implicit-def when possible.
     for (MachineRegisterInfo::use_nodbg_iterator UI =
          MRI->use_nodbg_begin(Reg),
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 1965188..b0e494f 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -14,9 +14,6 @@
 // This pass must be run after register allocation.  After this pass is
 // executed, it is illegal to construct MO_FrameIndex operands.
 //
-// This pass provides an optional shrink wrapping variant of prolog/epilog
-// insertion, enabled via --shrink-wrap. See ShrinkWrapping.cpp.
-//
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "pei"
@@ -66,6 +63,38 @@ STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
 STATISTIC(NumBytesStackSpace,
           "Number of bytes used for stack in all functions");
 
+void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<TargetPassConfig>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
+  return (MBB && !MBB->empty() && MBB->back().isReturn());
+}
+
+/// Compute the set of return blocks
+void PEI::calculateSets(MachineFunction &Fn) {
+  // Sets used to compute spill, restore placement sets.
+  const std::vector<CalleeSavedInfo> &CSI =
+    Fn.getFrameInfo()->getCalleeSavedInfo();
+
+  // If no CSRs used, we are done.
+  if (CSI.empty())
+    return;
+
+  // Save refs to entry and return blocks.
+  EntryBlock = Fn.begin();
+  for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end();
+       MBB != E; ++MBB)
+    if (isReturnBlock(MBB))
+      ReturnBlocks.push_back(MBB);
+
+  return;
+}
+
 /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
 /// frame indexes with appropriate references.
 ///
@@ -93,16 +122,11 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
   calculateCalleeSavedRegisters(Fn);
 
   // Determine placement of CSR spill/restore code:
-  //  - With shrink wrapping, place spills and restores to tightly
-  //    enclose regions in the Machine CFG of the function where
-  //    they are used.
-  //  - Without shink wrapping (default), place all spills in the
-  //    entry block, all restores in return blocks.
-  placeCSRSpillsAndRestores(Fn);
+  // place all spills in the entry block, all restores in return blocks.
+  calculateSets(Fn);
 
   // Add the code to save and restore the callee saved registers
-  if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::Naked))
+  if (!F->hasFnAttribute(Attribute::Naked))
     insertCSRSpillsAndRestores(Fn);
 
   // Allow the target machine to make final modifications to the function
@@ -117,8 +141,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
   // called functions.  Because of this, calculateCalleeSavedRegisters()
   // must be called before this function in order to set the AdjustsStack
   // and MaxCallFrameSize variables.
-  if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::Naked))
+  if (!F->hasFnAttribute(Attribute::Naked))
     insertPrologEpilogCode(Fn);
 
   // Replace all MO_FrameIndex operands with physical register references
@@ -143,7 +166,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
            << ") in " << Fn.getName()  << ".\n";
 
   delete RS;
-  clearAllSets();
+  ReturnBlocks.clear();
   return true;
 }
 
@@ -221,8 +244,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
     return;
 
   // In Naked functions we aren't going to save any registers.
-  if (F.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                                    Attribute::Naked))
+  if (F.getFunction()->hasFnAttribute(Attribute::Naked))
     return;
 
   std::vector<CalleeSavedInfo> CSI;
@@ -286,7 +308,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
 }
 
 /// insertCSRSpillsAndRestores - Insert spill and restore code for
-/// callee saved registers used in the function, handling shrink wrapping.
+/// callee saved registers used in the function.
 ///
 void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
   // Get callee saved register information.
@@ -304,133 +326,33 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
   const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
   MachineBasicBlock::iterator I;
 
-  if (!ShrinkWrapThisFunction) {
-    // Spill using target interface.
-    I = EntryBlock->begin();
-    if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) {
-      for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-        // Add the callee-saved register as live-in.
-        // It's killed at the spill.
-        EntryBlock->addLiveIn(CSI[i].getReg());
-
-        // Insert the spill to the stack frame.
-        unsigned Reg = CSI[i].getReg();
-        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-        TII.storeRegToStackSlot(*EntryBlock, I, Reg, true,
-                                CSI[i].getFrameIdx(), RC, TRI);
-      }
-    }
-
-    // Restore using target interface.
-    for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) {
-      MachineBasicBlock* MBB = ReturnBlocks[ri];
-      I = MBB->end(); --I;
-
-      // Skip over all terminator instructions, which are part of the return
-      // sequence.
-      MachineBasicBlock::iterator I2 = I;
-      while (I2 != MBB->begin() && (--I2)->isTerminator())
-        I = I2;
-
-      bool AtStart = I == MBB->begin();
-      MachineBasicBlock::iterator BeforeI = I;
-      if (!AtStart)
-        --BeforeI;
-
-      // Restore all registers immediately before the return and any
-      // terminators that precede it.
-      if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
-        for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-          unsigned Reg = CSI[i].getReg();
-          const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-          TII.loadRegFromStackSlot(*MBB, I, Reg,
-                                   CSI[i].getFrameIdx(),
-                                   RC, TRI);
-          assert(I != MBB->begin() &&
-                 "loadRegFromStackSlot didn't insert any code!");
-          // Insert in reverse order.  loadRegFromStackSlot can insert
-          // multiple instructions.
-          if (AtStart)
-            I = MBB->begin();
-          else {
-            I = BeforeI;
-            ++I;
-          }
-        }
-      }
-    }
-    return;
-  }
-
-  // Insert spills.
-  std::vector<CalleeSavedInfo> blockCSI;
-  for (CSRegBlockMap::iterator BI = CSRSave.begin(),
-         BE = CSRSave.end(); BI != BE; ++BI) {
-    MachineBasicBlock* MBB = BI->first;
-    CSRegSet save = BI->second;
-
-    if (save.empty())
-      continue;
-
-    blockCSI.clear();
-    for (CSRegSet::iterator RI = save.begin(),
-           RE = save.end(); RI != RE; ++RI) {
-      blockCSI.push_back(CSI[*RI]);
-    }
-    assert(blockCSI.size() > 0 &&
-           "Could not collect callee saved register info");
-
-    I = MBB->begin();
-
-    // When shrink wrapping, use stack slot stores/loads.
-    for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
+  // Spill using target interface.
+  I = EntryBlock->begin();
+  if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) {
+    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
       // Add the callee-saved register as live-in.
       // It's killed at the spill.
-      MBB->addLiveIn(blockCSI[i].getReg());
+      EntryBlock->addLiveIn(CSI[i].getReg());
 
       // Insert the spill to the stack frame.
-      unsigned Reg = blockCSI[i].getReg();
+      unsigned Reg = CSI[i].getReg();
       const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      TII.storeRegToStackSlot(*MBB, I, Reg,
-                              true,
-                              blockCSI[i].getFrameIdx(),
+      TII.storeRegToStackSlot(*EntryBlock, I, Reg, true, CSI[i].getFrameIdx(),
                               RC, TRI);
     }
   }
 
-  for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
-         BE = CSRRestore.end(); BI != BE; ++BI) {
-    MachineBasicBlock* MBB = BI->first;
-    CSRegSet restore = BI->second;
-
-    if (restore.empty())
-      continue;
+  // Restore using target interface.
+  for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) {
+    MachineBasicBlock *MBB = ReturnBlocks[ri];
+    I = MBB->end();
+    --I;
 
-    blockCSI.clear();
-    for (CSRegSet::iterator RI = restore.begin(),
-           RE = restore.end(); RI != RE; ++RI) {
-      blockCSI.push_back(CSI[*RI]);
-    }
-    assert(blockCSI.size() > 0 &&
-           "Could not find callee saved register info");
-
-    // If MBB is empty and needs restores, insert at the _beginning_.
-    if (MBB->empty()) {
-      I = MBB->begin();
-    } else {
-      I = MBB->end();
-      --I;
-
-      // Skip over all terminator instructions, which are part of the
-      // return sequence.
-      if (! I->isTerminator()) {
-        ++I;
-      } else {
-        MachineBasicBlock::iterator I2 = I;
-        while (I2 != MBB->begin() && (--I2)->isTerminator())
-          I = I2;
-      }
-    }
+    // Skip over all terminator instructions, which are part of the return
+    // sequence.
+    MachineBasicBlock::iterator I2 = I;
+    while (I2 != MBB->begin() && (--I2)->isTerminator())
+      I = I2;
 
     bool AtStart = I == MBB->begin();
     MachineBasicBlock::iterator BeforeI = I;
@@ -439,21 +361,21 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
 
     // Restore all registers immediately before the return and any
     // terminators that precede it.
-    for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
-      unsigned Reg = blockCSI[i].getReg();
-      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      TII.loadRegFromStackSlot(*MBB, I, Reg,
-                               blockCSI[i].getFrameIdx(),
-                               RC, TRI);
-      assert(I != MBB->begin() &&
-             "loadRegFromStackSlot didn't insert any code!");
-      // Insert in reverse order.  loadRegFromStackSlot can insert
-      // multiple instructions.
-      if (AtStart)
-        I = MBB->begin();
-      else {
-        I = BeforeI;
-        ++I;
+    if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
+      for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+        unsigned Reg = CSI[i].getReg();
+        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+        TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+        assert(I != MBB->begin() &&
+               "loadRegFromStackSlot didn't insert any code!");
+        // Insert in reverse order.  loadRegFromStackSlot can insert
+        // multiple instructions.
+        if (AtStart)
+          I = MBB->begin();
+        else {
+          I = BeforeI;
+          ++I;
+        }
       }
     }
   }
diff --git a/lib/CodeGen/PrologEpilogInserter.h b/lib/CodeGen/PrologEpilogInserter.h
index 50f4daf..77cfa2b 100644
--- a/lib/CodeGen/PrologEpilogInserter.h
+++ b/lib/CodeGen/PrologEpilogInserter.h
@@ -14,9 +14,6 @@
 // This pass must be run after register allocation.  After this pass is
 // executed, it is illegal to construct MO_FrameIndex operands.
 //
-// This pass also implements a shrink wrapping variant of prolog/epilog
-// insertion.
-//
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_CODEGEN_PEI_H
@@ -54,74 +51,16 @@ namespace llvm {
     // stack frame indexes.
     unsigned MinCSFrameIndex, MaxCSFrameIndex;
 
-    // Analysis info for spill/restore placement.
-    // "CSR": "callee saved register".
-
-    // CSRegSet contains indices into the Callee Saved Register Info
-    // vector built by calculateCalleeSavedRegisters() and accessed
-    // via MF.getFrameInfo()->getCalleeSavedInfo().
-    typedef SparseBitVector<> CSRegSet;
-
-    // CSRegBlockMap maps MachineBasicBlocks to sets of callee
-    // saved register indices.
-    typedef DenseMap<MachineBasicBlock*, CSRegSet> CSRegBlockMap;
-
-    // Set and maps for computing CSR spill/restore placement:
-    //  used in function (UsedCSRegs)
-    //  used in a basic block (CSRUsed)
-    //  anticipatable in a basic block (Antic{In,Out})
-    //  available in a basic block (Avail{In,Out})
-    //  to be spilled at the entry to a basic block (CSRSave)
-    //  to be restored at the end of a basic block (CSRRestore)
-    CSRegSet UsedCSRegs;
-    CSRegBlockMap CSRUsed;
-    CSRegBlockMap AnticIn, AnticOut;
-    CSRegBlockMap AvailIn, AvailOut;
-    CSRegBlockMap CSRSave;
-    CSRegBlockMap CSRRestore;
-
     // Entry and return blocks of the current function.
     MachineBasicBlock* EntryBlock;
     SmallVector<MachineBasicBlock*, 4> ReturnBlocks;
 
-    // Map of MBBs to top level MachineLoops.
-    DenseMap<MachineBasicBlock*, MachineLoop*> TLLoops;
-
-    // Flag to control shrink wrapping per-function:
-    // may choose to skip shrink wrapping for certain
-    // functions.
-    bool ShrinkWrapThisFunction;
-
     // Flag to control whether to use the register scavenger to resolve
     // frame index materialization registers. Set according to
     // TRI->requiresFrameIndexScavenging() for the curren function.
     bool FrameIndexVirtualScavenging;
 
-#ifndef NDEBUG
-    // Machine function handle.
-    MachineFunction* MF;
-
-    // Flag indicating that the current function
-    // has at least one "short" path in the machine
-    // CFG from the entry block to an exit block.
-    bool HasFastExitPath;
-#endif
-
-    bool calculateSets(MachineFunction &Fn);
-    bool calcAnticInOut(MachineBasicBlock* MBB);
-    bool calcAvailInOut(MachineBasicBlock* MBB);
-    void calculateAnticAvail(MachineFunction &Fn);
-    bool addUsesForMEMERegion(MachineBasicBlock* MBB,
-                              SmallVectorImpl<MachineBasicBlock *> &blks);
-    bool addUsesForTopLevelLoops(SmallVectorImpl<MachineBasicBlock *> &blks);
-    bool calcSpillPlacements(MachineBasicBlock* MBB,
-                             SmallVectorImpl<MachineBasicBlock *> &blks,
-                             CSRegBlockMap &prevSpills);
-    bool calcRestorePlacements(MachineBasicBlock* MBB,
-                               SmallVectorImpl<MachineBasicBlock *> &blks,
-                               CSRegBlockMap &prevRestores);
-    void placeSpillsAndRestores(MachineFunction &Fn);
-    void placeCSRSpillsAndRestores(MachineFunction &Fn);
+    void calculateSets(MachineFunction &Fn);
     void calculateCallsInformation(MachineFunction &Fn);
     void calculateCalleeSavedRegisters(MachineFunction &Fn);
     void insertCSRSpillsAndRestores(MachineFunction &Fn);
@@ -132,44 +71,8 @@ namespace llvm {
     void scavengeFrameVirtualRegs(MachineFunction &Fn);
     void insertPrologEpilogCode(MachineFunction &Fn);
 
-    // Initialize DFA sets, called before iterations.
-    void clearAnticAvailSets();
-    // Clear all sets constructed by shrink wrapping.
-    void clearAllSets();
-
-    // Initialize all shrink wrapping data.
-    void initShrinkWrappingInfo();
-
-    // Convienences for dealing with machine loops.
-    MachineBasicBlock* getTopLevelLoopPreheader(MachineLoop* LP);
-    MachineLoop* getTopLevelLoopParent(MachineLoop *LP);
-
-    // Propgate CSRs used in MBB to all MBBs of loop LP.
-    void propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP);
-
     // Convenience for recognizing return blocks.
     bool isReturnBlock(MachineBasicBlock* MBB);
-
-#ifndef NDEBUG
-    // Debugging methods.
-
-    // Mark this function as having fast exit paths.
-    void findFastExitPath();
-
-    // Verify placement of spills/restores.
-    void verifySpillRestorePlacement();
-
-    std::string getBasicBlockName(const MachineBasicBlock* MBB);
-    std::string stringifyCSRegSet(const CSRegSet& s);
-    void dumpSet(const CSRegSet& s);
-    void dumpUsed(MachineBasicBlock* MBB);
-    void dumpAllUsed();
-    void dumpSets(MachineBasicBlock* MBB);
-    void dumpSets1(MachineBasicBlock* MBB);
-    void dumpAllSets();
-    void dumpSRSets();
-#endif
-
   };
 } // End llvm namespace
 #endif
diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp
index df3e12a..293e306 100644
--- a/lib/CodeGen/RegAllocBase.cpp
+++ b/lib/CodeGen/RegAllocBase.cpp
@@ -50,6 +50,9 @@ bool RegAllocBase::VerifyEnabled = false;
 //                         RegAllocBase Implementation
 //===----------------------------------------------------------------------===//
 
+// Pin the vtable to this file.
+void RegAllocBase::anchor() {}
+
 void RegAllocBase::init(VirtRegMap &vrm,
                         LiveIntervals &lis,
                         LiveRegMatrix &mat) {
@@ -99,14 +102,13 @@ void RegAllocBase::allocatePhysRegs() {
     // result from splitting.
     DEBUG(dbgs() << "\nselectOrSplit "
                  << MRI->getRegClass(VirtReg->reg)->getName()
-                 << ':' << PrintReg(VirtReg->reg) << ' ' << *VirtReg << '\n');
-    typedef SmallVector<LiveInterval*, 4> VirtRegVec;
+                 << ':' << *VirtReg << '\n');
+    typedef SmallVector<unsigned, 4> VirtRegVec;
     VirtRegVec SplitVRegs;
     unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs);
 
     if (AvailablePhysReg == ~0u) {
       // selectOrSplit failed to find a register!
-      const char *Msg = "ran out of registers during register allocation";
       // Probably caused by an inline asm.
       MachineInstr *MI;
       for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(VirtReg->reg);
@@ -114,9 +116,9 @@ void RegAllocBase::allocatePhysRegs() {
         if (MI->isInlineAsm())
           break;
       if (MI)
-        MI->emitError(Msg);
+        MI->emitError("inline assembly requires more registers than available");
       else
-        report_fatal_error(Msg);
+        report_fatal_error("ran out of registers during register allocation");
       // Keep going after reporting the error.
       VRM->assignVirt2Phys(VirtReg->reg,
                  RegClassInfo.getOrder(MRI->getRegClass(VirtReg->reg)).front());
@@ -128,7 +130,7 @@ void RegAllocBase::allocatePhysRegs() {
 
     for (VirtRegVec::iterator I = SplitVRegs.begin(), E = SplitVRegs.end();
          I != E; ++I) {
-      LiveInterval *SplitVirtReg = *I;
+      LiveInterval *SplitVirtReg = &LIS->getInterval(*I);
       assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned");
       if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) {
         DEBUG(dbgs() << "not queueing unused  " << *SplitVirtReg << '\n');
diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h
index ccaabba..c17a8d9 100644
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h
@@ -38,7 +38,7 @@
 #define LLVM_CODEGEN_REGALLOCBASE
 
 #include "llvm/ADT/OwningPtr.h"
-#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 
 namespace llvm {
@@ -57,6 +57,7 @@ class Spiller;
 /// live range splitting. They must also override enqueue/dequeue to provide an
 /// assignment order.
 class RegAllocBase {
+  virtual void anchor();
 protected:
   const TargetRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
@@ -90,7 +91,7 @@ protected:
   // or new set of split live virtual registers. It is up to the splitter to
   // converge quickly toward fully spilled live ranges.
   virtual unsigned selectOrSplit(LiveInterval &VirtReg,
-                                 SmallVectorImpl<LiveInterval*> &splitLVRs) = 0;
+                                 SmallVectorImpl<unsigned> &splitLVRs) = 0;
 
   // Use this group name for NamedRegionTimer.
   static const char TimerGroupName[];
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index d6a7d6f..6768e45 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -102,7 +102,7 @@ public:
   }
 
   virtual unsigned selectOrSplit(LiveInterval &VirtReg,
-                                 SmallVectorImpl<LiveInterval*> &SplitVRegs);
+                                 SmallVectorImpl<unsigned> &SplitVRegs);
 
   /// Perform register allocation.
   virtual bool runOnMachineFunction(MachineFunction &mf);
@@ -111,7 +111,7 @@ public:
   // that interfere with the most recently queried lvr.  Return true if spilling
   // was successful, and append any new spilled/split intervals to splitLVRs.
   bool spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
-                          SmallVectorImpl<LiveInterval*> &SplitVRegs);
+                          SmallVectorImpl<unsigned> &SplitVRegs);
 
   static char ID;
 };
@@ -126,7 +126,6 @@ RABasic::RABasic(): MachineFunctionPass(ID) {
   initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
   initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
   initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
-  initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
   initializeLiveStacksPass(*PassRegistry::getPassRegistry());
   initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
   initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
@@ -143,7 +142,6 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<SlotIndexes>();
   AU.addRequired<LiveDebugVariables>();
   AU.addPreserved<LiveDebugVariables>();
-  AU.addRequired<CalculateSpillWeights>();
   AU.addRequired<LiveStacks>();
   AU.addPreserved<LiveStacks>();
   AU.addRequired<MachineBlockFrequencyInfo>();
@@ -168,7 +166,7 @@ void RABasic::releaseMemory() {
 // that interfere with VirtReg. The newly spilled or split live intervals are
 // returned by appending them to SplitVRegs.
 bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
-                                 SmallVectorImpl<LiveInterval*> &SplitVRegs) {
+                                 SmallVectorImpl<unsigned> &SplitVRegs) {
   // Record each interference and determine if all are spillable before mutating
   // either the union or live intervals.
   SmallVector<LiveInterval*, 8> Intfs;
@@ -222,7 +220,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
 // minimal, there is no value in caching them outside the scope of
 // selectOrSplit().
 unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
-                                SmallVectorImpl<LiveInterval*> &SplitVRegs) {
+                                SmallVectorImpl<unsigned> &SplitVRegs) {
   // Populate a list of physical register spill candidates.
   SmallVector<unsigned, 8> PhysRegSpillCands;
 
@@ -279,6 +277,11 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
   RegAllocBase::init(getAnalysis<VirtRegMap>(),
                      getAnalysis<LiveIntervals>(),
                      getAnalysis<LiveRegMatrix>());
+
+  calculateSpillWeightsAndHints(*LIS, *MF,
+                                getAnalysis<MachineLoopInfo>(),
+                                getAnalysis<MachineBlockFrequencyInfo>());
+
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
 
   allocatePhysRegs();
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 6617e50..e92dbd2 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -144,7 +144,7 @@ namespace {
     // not be erased.
     bool isBulkSpilling;
 
-    enum {
+    enum LLVM_ENUM_INT_TYPE(unsigned) {
       spillClean = 1,
       spillDirty = 100,
       spillImpossible = ~0u
@@ -298,7 +298,7 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI,
     for (unsigned li = 0, le = LRIDbgValues.size(); li != le; ++li) {
       MachineInstr *DBG = LRIDbgValues[li];
       const MDNode *MDPtr = DBG->getOperand(2).getMetadata();
-      bool IsIndirect = DBG->getOperand(1).isImm(); // Register-indirect value?
+      bool IsIndirect = DBG->isIndirectDebugValue();
       uint64_t Offset = IsIndirect ? DBG->getOperand(1).getImm() : 0;
       DebugLoc DL;
       if (MI == MBB->end()) {
@@ -569,7 +569,10 @@ RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI,
   }
 
   // Nothing we can do. Report an error and keep going with a bad allocation.
-  MI->emitError("ran out of registers during register allocation");
+  if (MI->isInlineAsm())
+    MI->emitError("inline assembly requires more registers than available");
+  else
+    MI->emitError("ran out of registers during register allocation");
   definePhysReg(MI, *AO.begin(), regFree);
   return assignVirtToPhysReg(VirtReg, *AO.begin());
 }
@@ -856,7 +859,7 @@ void RAFast::AllocateBasicBlock() {
             }
             else {
               // Modify DBG_VALUE now that the value is in a spill slot.
-              bool IsIndirect = MI->getOperand(1).isImm();
+              bool IsIndirect = MI->isIndirectDebugValue();
               uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
               const MDNode *MDPtr =
                 MI->getOperand(MI->getNumOperands()-1).getMetadata();
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index f9e363b..c08d955 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -120,7 +120,9 @@ class RAGreedy : public MachineFunctionPass,
     RS_Done
   };
 
+#ifndef NDEBUG
   static const char *const StageName[];
+#endif
 
   // RegInfo - Keep additional information about each live range.
   struct RegInfo {
@@ -147,7 +149,7 @@ class RAGreedy : public MachineFunctionPass,
   void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) {
     ExtraRegInfo.resize(MRI->getNumVirtRegs());
     for (;Begin != End; ++Begin) {
-      unsigned Reg = (*Begin)->reg;
+      unsigned Reg = *Begin;
       if (ExtraRegInfo[Reg].Stage == RS_New)
         ExtraRegInfo[Reg].Stage = NewStage;
     }
@@ -220,7 +222,7 @@ class RAGreedy : public MachineFunctionPass,
   /// class.
   SmallVector<GlobalSplitCandidate, 32> GlobalCand;
 
-  enum { NoCand = ~0u };
+  enum LLVM_ENUM_INT_TYPE(unsigned) { NoCand = ~0u };
 
   /// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to
   /// NoCand which indicates the stack interval.
@@ -241,7 +243,7 @@ public:
   virtual void enqueue(LiveInterval *LI);
   virtual LiveInterval *dequeue();
   virtual unsigned selectOrSplit(LiveInterval&,
-                                 SmallVectorImpl<LiveInterval*>&);
+                                 SmallVectorImpl<unsigned>&);
 
   /// Perform register allocation.
   virtual bool runOnMachineFunction(MachineFunction &mf);
@@ -265,22 +267,22 @@ private:
   bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool);
   bool canEvictInterference(LiveInterval&, unsigned, bool, EvictionCost&);
   void evictInterference(LiveInterval&, unsigned,
-                         SmallVectorImpl<LiveInterval*>&);
+                         SmallVectorImpl<unsigned>&);
 
   unsigned tryAssign(LiveInterval&, AllocationOrder&,
-                     SmallVectorImpl<LiveInterval*>&);
+                     SmallVectorImpl<unsigned>&);
   unsigned tryEvict(LiveInterval&, AllocationOrder&,
-                    SmallVectorImpl<LiveInterval*>&, unsigned = ~0u);
+                    SmallVectorImpl<unsigned>&, unsigned = ~0u);
   unsigned tryRegionSplit(LiveInterval&, AllocationOrder&,
-                          SmallVectorImpl<LiveInterval*>&);
+                          SmallVectorImpl<unsigned>&);
   unsigned tryBlockSplit(LiveInterval&, AllocationOrder&,
-                         SmallVectorImpl<LiveInterval*>&);
+                         SmallVectorImpl<unsigned>&);
   unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&,
-                               SmallVectorImpl<LiveInterval*>&);
+                               SmallVectorImpl<unsigned>&);
   unsigned tryLocalSplit(LiveInterval&, AllocationOrder&,
-    SmallVectorImpl<LiveInterval*>&);
+    SmallVectorImpl<unsigned>&);
   unsigned trySplit(LiveInterval&, AllocationOrder&,
-                    SmallVectorImpl<LiveInterval*>&);
+                    SmallVectorImpl<unsigned>&);
 };
 } // end anonymous namespace
 
@@ -313,7 +315,6 @@ RAGreedy::RAGreedy(): MachineFunctionPass(ID) {
   initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
   initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
   initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
-  initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
   initializeLiveStacksPass(*PassRegistry::getPassRegistry());
   initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
   initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
@@ -337,7 +338,6 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<LiveDebugVariables>();
   AU.addRequired<LiveStacks>();
   AU.addPreserved<LiveStacks>();
-  AU.addRequired<CalculateSpillWeights>();
   AU.addRequired<MachineDominatorTree>();
   AU.addPreserved<MachineDominatorTree>();
   AU.addRequired<MachineLoopInfo>();
@@ -455,7 +455,7 @@ LiveInterval *RAGreedy::dequeue() {
 /// tryAssign - Try to assign VirtReg to an available register.
 unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
                              AllocationOrder &Order,
-                             SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                             SmallVectorImpl<unsigned> &NewVRegs) {
   Order.rewind();
   unsigned PhysReg;
   while ((PhysReg = Order.next()))
@@ -638,7 +638,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
 /// from being assigned to Physreg. This assumes that canEvictInterference
 /// returned true.
 void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
-                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                                 SmallVectorImpl<unsigned> &NewVRegs) {
   // Make sure that VirtReg has a cascade number, and assign that cascade
   // number to every evicted register. These live ranges than then only be
   // evicted by a newer cascade, preventing infinite loops.
@@ -670,7 +670,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
            "Cannot decrease cascade number, illegal eviction");
     ExtraRegInfo[Intf->reg].Cascade = Cascade;
     ++NumEvicted;
-    NewVRegs.push_back(Intf);
+    NewVRegs.push_back(Intf->reg);
   }
 }
 
@@ -680,7 +680,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
 /// @return         Physreg to assign VirtReg, or 0.
 unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
                             AllocationOrder &Order,
-                            SmallVectorImpl<LiveInterval*> &NewVRegs,
+                            SmallVectorImpl<unsigned> &NewVRegs,
                             unsigned CostPerUseLimit) {
   NamedRegionTimer T("Evict", TimerGroupName, TimePassesIsEnabled);
 
@@ -1125,7 +1125,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
 
   SmallVector<unsigned, 8> IntvMap;
   SE->finish(&IntvMap);
-  DebugVars->splitRegister(Reg, LREdit.regs());
+  DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
 
   ExtraRegInfo.resize(MRI->getNumVirtRegs());
   unsigned OrigBlocks = SA->getNumLiveBlocks();
@@ -1136,7 +1136,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
   // - Block-local splits are candidates for local splitting.
   // - DCE leftovers should go back on the queue.
   for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
-    LiveInterval &Reg = *LREdit.get(i);
+    LiveInterval &Reg = LIS->getInterval(LREdit.get(i));
 
     // Ignore old intervals from DCE.
     if (getStage(Reg) != RS_New)
@@ -1170,7 +1170,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
 }
 
 unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
-                                  SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                                  SmallVectorImpl<unsigned> &NewVRegs) {
   unsigned NumCands = 0;
   unsigned BestCand = NoCand;
   BlockFrequency BestCost;
@@ -1305,7 +1305,7 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
 /// creates a lot of local live ranges, that will be split by tryLocalSplit if
 /// they don't allocate.
 unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
-                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                                 SmallVectorImpl<unsigned> &NewVRegs) {
   assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
   unsigned Reg = VirtReg.reg;
   bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
@@ -1326,14 +1326,14 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
   SE->finish(&IntvMap);
 
   // Tell LiveDebugVariables about the new ranges.
-  DebugVars->splitRegister(Reg, LREdit.regs());
+  DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
 
   ExtraRegInfo.resize(MRI->getNumVirtRegs());
 
   // Sort out the new intervals created by splitting. The remainder interval
   // goes straight to spilling, the new local ranges get to stay RS_New.
   for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
-    LiveInterval &LI = *LREdit.get(i);
+    LiveInterval &LI = LIS->getInterval(LREdit.get(i));
     if (getStage(LI) == RS_New && IntvMap[i] == 0)
       setStage(LI, RS_Spill);
   }
@@ -1357,7 +1357,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
 /// This is similar to spilling to a larger register class.
 unsigned
 RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
-                              SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                              SmallVectorImpl<unsigned> &NewVRegs) {
   // There is no point to this if there are no larger sub-classes.
   if (!RegClassInfo.isProperSubClass(MRI->getRegClass(VirtReg.reg)))
     return 0;
@@ -1393,7 +1393,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
 
   SmallVector<unsigned, 8> IntvMap;
   SE->finish(&IntvMap);
-  DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+  DebugVars->splitRegister(VirtReg.reg, LREdit.regs(), *LIS);
   ExtraRegInfo.resize(MRI->getNumVirtRegs());
 
   // Assign all new registers to RS_Spill. This was the last chance.
@@ -1464,9 +1464,9 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
 
   // Add fixed interference.
   for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
-    const LiveInterval &LI = LIS->getRegUnit(*Units);
-    LiveInterval::const_iterator I = LI.find(StartIdx);
-    LiveInterval::const_iterator E = LI.end();
+    const LiveRange &LR = LIS->getRegUnit(*Units);
+    LiveRange::const_iterator I = LR.find(StartIdx);
+    LiveRange::const_iterator E = LR.end();
 
     // Same loop as above. Mark any overlapped gaps as HUGE_VALF.
     for (unsigned Gap = 0; I != E && I->start < StopIdx; ++I) {
@@ -1477,7 +1477,7 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
         break;
 
       for (; Gap != NumGaps; ++Gap) {
-        GapWeight[Gap] = HUGE_VALF;
+        GapWeight[Gap] = llvm::huge_valf;
         if (Uses[Gap+1].getBaseIndex() >= I->end)
           break;
       }
@@ -1491,7 +1491,7 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
 /// basic block.
 ///
 unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
-                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                                 SmallVectorImpl<unsigned> &NewVRegs) {
   assert(SA->getUseBlocks().size() == 1 && "Not a local interval");
   const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
 
@@ -1583,7 +1583,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
     // Remove any gaps with regmask clobbers.
     if (Matrix->checkRegMaskInterference(VirtReg, PhysReg))
       for (unsigned i = 0, e = RegMaskGaps.size(); i != e; ++i)
-        GapWeight[RegMaskGaps[i]] = HUGE_VALF;
+        GapWeight[RegMaskGaps[i]] = llvm::huge_valf;
 
     // Try to find the best sequence of gaps to close.
     // The new spill weight must be larger than any gap interference.
@@ -1618,7 +1618,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
       // Legally, without causing looping?
       bool Legal = !ProgressRequired || NewGaps < NumGaps;
 
-      if (Legal && MaxGap < HUGE_VALF) {
+      if (Legal && MaxGap < llvm::huge_valf) {
         // Estimate the new spill weight. Each instruction reads or writes the
         // register. Conservatively assume there are no read-modify-write
         // instructions.
@@ -1685,7 +1685,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
   SE->useIntv(SegStart, SegStop);
   SmallVector<unsigned, 8> IntvMap;
   SE->finish(&IntvMap);
-  DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+  DebugVars->splitRegister(VirtReg.reg, LREdit.regs(), *LIS);
 
   // If the new range has the same number of instructions as before, mark it as
   // RS_Split2 so the next split will be forced to make progress. Otherwise,
@@ -1698,8 +1698,8 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
     assert(!ProgressRequired && "Didn't make progress when it was required.");
     for (unsigned i = 0, e = IntvMap.size(); i != e; ++i)
       if (IntvMap[i] == 1) {
-        setStage(*LREdit.get(i), RS_Split2);
-        DEBUG(dbgs() << PrintReg(LREdit.get(i)->reg));
+        setStage(LIS->getInterval(LREdit.get(i)), RS_Split2);
+        DEBUG(dbgs() << PrintReg(LREdit.get(i)));
       }
     DEBUG(dbgs() << '\n');
   }
@@ -1716,7 +1716,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
 /// assignable.
 /// @return Physreg when VirtReg may be assigned and/or new NewVRegs.
 unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
-                            SmallVectorImpl<LiveInterval*>&NewVRegs) {
+                            SmallVectorImpl<unsigned>&NewVRegs) {
   // Ranges must be Split2 or less.
   if (getStage(VirtReg) >= RS_Spill)
     return 0;
@@ -1765,7 +1765,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
 //===----------------------------------------------------------------------===//
 
 unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
-                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                                 SmallVectorImpl<unsigned> &NewVRegs) {
   // First try assigning a free register.
   AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
   if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
@@ -1790,7 +1790,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
   if (Stage < RS_Split) {
     setStage(VirtReg, RS_Split);
     DEBUG(dbgs() << "wait for second round\n");
-    NewVRegs.push_back(&VirtReg);
+    NewVRegs.push_back(VirtReg.reg);
     return 0;
   }
 
@@ -1838,6 +1838,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   SpillPlacer = &getAnalysis<SpillPlacement>();
   DebugVars = &getAnalysis<LiveDebugVariables>();
 
+  calculateSpillWeightsAndHints(*LIS, mf, *Loops, *MBFI);
+
   DEBUG(LIS->dump());
 
   SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 81ecca1..88c8201 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -95,7 +95,6 @@ public:
       : MachineFunctionPass(ID), builder(b.take()), customPassID(cPassID) {
     initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
     initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
-    initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
     initializeLiveStacksPass(*PassRegistry::getPassRegistry());
     initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
   }
@@ -158,13 +157,13 @@ char RegAllocPBQP::ID = 0;
 
 } // End anonymous namespace.
 
-unsigned PBQPRAProblem::getVRegForNode(PBQP::Graph::ConstNodeItr node) const {
+unsigned PBQPRAProblem::getVRegForNode(PBQP::Graph::NodeId node) const {
   Node2VReg::const_iterator vregItr = node2VReg.find(node);
   assert(vregItr != node2VReg.end() && "No vreg for node.");
   return vregItr->second;
 }
 
-PBQP::Graph::NodeItr PBQPRAProblem::getNodeForVReg(unsigned vreg) const {
+PBQP::Graph::NodeId PBQPRAProblem::getNodeForVReg(unsigned vreg) const {
   VReg2Node::const_iterator nodeItr = vreg2Node.find(vreg);
   assert(nodeItr != vreg2Node.end() && "No node for vreg.");
   return nodeItr->second;
@@ -247,7 +246,7 @@ PBQPRAProblem *PBQPBuilder::build(MachineFunction *mf, const LiveIntervals *lis,
     }
 
     // Construct the node.
-    PBQP::Graph::NodeItr node =
+    PBQP::Graph::NodeId node =
       g.addNode(PBQP::Vector(vrAllowed.size() + 1, 0));
 
     // Record the mapping and allowed set in the problem.
@@ -273,7 +272,7 @@ PBQPRAProblem *PBQPBuilder::build(MachineFunction *mf, const LiveIntervals *lis,
 
       assert(!l2.empty() && "Empty interval in vreg set?");
       if (l1.overlaps(l2)) {
-        PBQP::Graph::EdgeItr edge =
+        PBQP::Graph::EdgeId edge =
           g.addEdge(p->getNodeForVReg(vr1), p->getNodeForVReg(vr2),
                     PBQP::Matrix(vr1Allowed.size()+1, vr2Allowed.size()+1, 0));
 
@@ -364,16 +363,16 @@ PBQPRAProblem *PBQPBuilderWithCoalescing::build(MachineFunction *mf,
         }
         if (pregOpt < allowed.size()) {
           ++pregOpt; // +1 to account for spill option.
-          PBQP::Graph::NodeItr node = p->getNodeForVReg(src);
+          PBQP::Graph::NodeId node = p->getNodeForVReg(src);
           addPhysRegCoalesce(g.getNodeCosts(node), pregOpt, cBenefit);
         }
       } else {
         const PBQPRAProblem::AllowedSet *allowed1 = &p->getAllowedSet(dst);
         const PBQPRAProblem::AllowedSet *allowed2 = &p->getAllowedSet(src);
-        PBQP::Graph::NodeItr node1 = p->getNodeForVReg(dst);
-        PBQP::Graph::NodeItr node2 = p->getNodeForVReg(src);
-        PBQP::Graph::EdgeItr edge = g.findEdge(node1, node2);
-        if (edge == g.edgesEnd()) {
+        PBQP::Graph::NodeId node1 = p->getNodeForVReg(dst);
+        PBQP::Graph::NodeId node2 = p->getNodeForVReg(src);
+        PBQP::Graph::EdgeId edge = g.findEdge(node1, node2);
+        if (edge == g.invalidEdgeId()) {
           edge = g.addEdge(node1, node2, PBQP::Matrix(allowed1->size() + 1,
                                                       allowed2->size() + 1,
                                                       0));
@@ -432,7 +431,6 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
   //au.addRequiredID(SplitCriticalEdgesID);
   if (customPassID)
     au.addRequiredID(*customPassID);
-  au.addRequired<CalculateSpillWeights>();
   au.addRequired<LiveStacks>();
   au.addPreserved<LiveStacks>();
   au.addRequired<MachineBlockFrequencyInfo>();
@@ -477,11 +475,11 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
   const PBQP::Graph &g = problem.getGraph();
   // Iterate over the nodes mapping the PBQP solution to a register
   // assignment.
-  for (PBQP::Graph::ConstNodeItr node = g.nodesBegin(),
-                                 nodeEnd = g.nodesEnd();
-       node != nodeEnd; ++node) {
-    unsigned vreg = problem.getVRegForNode(node);
-    unsigned alloc = solution.getSelection(node);
+  for (PBQP::Graph::NodeItr nodeItr = g.nodesBegin(),
+                            nodeEnd = g.nodesEnd();
+       nodeItr != nodeEnd; ++nodeItr) {
+    unsigned vreg = problem.getVRegForNode(*nodeItr);
+    unsigned alloc = solution.getSelection(*nodeItr);
 
     if (problem.isPRegOption(vreg, alloc)) {
       unsigned preg = problem.getPRegForOption(vreg, alloc);
@@ -491,7 +489,7 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
       vrm->assignVirt2Phys(vreg, preg);
     } else if (problem.isSpillOption(vreg, alloc)) {
       vregsToAlloc.erase(vreg);
-      SmallVector<LiveInterval*, 8> newSpills;
+      SmallVector<unsigned, 8> newSpills;
       LiveRangeEdit LRE(&lis->getInterval(vreg), newSpills, *mf, *lis, vrm);
       spiller->spill(LRE);
 
@@ -502,9 +500,10 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
       // allocate.
       for (LiveRangeEdit::iterator itr = LRE.begin(), end = LRE.end();
            itr != end; ++itr) {
-        assert(!(*itr)->empty() && "Empty spill range.");
-        DEBUG(dbgs() << PrintReg((*itr)->reg, tri) << " ");
-        vregsToAlloc.insert((*itr)->reg);
+        LiveInterval &li = lis->getInterval(*itr);
+        assert(!li.empty() && "Empty spill range.");
+        DEBUG(dbgs() << PrintReg(li.reg, tri) << " ");
+        vregsToAlloc.insert(li.reg);
       }
 
       DEBUG(dbgs() << ")\n");
@@ -550,6 +549,9 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
   lss = &getAnalysis<LiveStacks>();
   mbfi = &getAnalysis<MachineBlockFrequencyInfo>();
 
+  calculateSpillWeightsAndHints(*lis, MF, getAnalysis<MachineLoopInfo>(),
+                                *mbfi);
+
   vrm = &getAnalysis<VirtRegMap>();
   spiller.reset(createInlineSpiller(*this, MF, *vrm));
 
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index f99f1a3..dd86c1f 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -398,7 +398,7 @@ void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 void RegisterCoalescer::eliminateDeadDefs() {
-  SmallVector<LiveInterval*, 8> NewRegs;
+  SmallVector<unsigned, 8> NewRegs;
   LiveRangeEdit(0, NewRegs, *MF, *LIS, 0, this).eliminateDeadDefs(DeadDefs);
 }
 
@@ -434,11 +434,11 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
     LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
   SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
 
-  // BValNo is a value number in B that is defined by a copy from A.  'B3' in
+  // BValNo is a value number in B that is defined by a copy from A.  'B1' in
   // the example above.
-  LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx);
-  if (BLR == IntB.end()) return false;
-  VNInfo *BValNo = BLR->valno;
+  LiveInterval::iterator BS = IntB.FindSegmentContaining(CopyIdx);
+  if (BS == IntB.end()) return false;
+  VNInfo *BValNo = BS->valno;
 
   // Get the location that B is defined at.  Two options: either this value has
   // an unknown definition point or it is defined at CopyIdx.  If unknown, we
@@ -447,10 +447,10 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
 
   // AValNo is the value number in A that defines the copy, A3 in the example.
   SlotIndex CopyUseIdx = CopyIdx.getRegSlot(true);
-  LiveInterval::iterator ALR = IntA.FindLiveRangeContaining(CopyUseIdx);
-  // The live range might not exist after fun with physreg coalescing.
-  if (ALR == IntA.end()) return false;
-  VNInfo *AValNo = ALR->valno;
+  LiveInterval::iterator AS = IntA.FindSegmentContaining(CopyUseIdx);
+  // The live segment might not exist after fun with physreg coalescing.
+  if (AS == IntA.end()) return false;
+  VNInfo *AValNo = AS->valno;
 
   // If AValNo is defined as a copy from IntB, we can potentially process this.
   // Get the instruction that defines this value number.
@@ -459,54 +459,54 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
   if (!CP.isCoalescable(ACopyMI) || !ACopyMI->isFullCopy())
     return false;
 
-  // Get the LiveRange in IntB that this value number starts with.
-  LiveInterval::iterator ValLR =
-    IntB.FindLiveRangeContaining(AValNo->def.getPrevSlot());
-  if (ValLR == IntB.end())
+  // Get the Segment in IntB that this value number starts with.
+  LiveInterval::iterator ValS =
+    IntB.FindSegmentContaining(AValNo->def.getPrevSlot());
+  if (ValS == IntB.end())
     return false;
 
-  // Make sure that the end of the live range is inside the same block as
+  // Make sure that the end of the live segment is inside the same block as
   // CopyMI.
-  MachineInstr *ValLREndInst =
-    LIS->getInstructionFromIndex(ValLR->end.getPrevSlot());
-  if (!ValLREndInst || ValLREndInst->getParent() != CopyMI->getParent())
+  MachineInstr *ValSEndInst =
+    LIS->getInstructionFromIndex(ValS->end.getPrevSlot());
+  if (!ValSEndInst || ValSEndInst->getParent() != CopyMI->getParent())
     return false;
 
-  // Okay, we now know that ValLR ends in the same block that the CopyMI
-  // live-range starts.  If there are no intervening live ranges between them in
-  // IntB, we can merge them.
-  if (ValLR+1 != BLR) return false;
+  // Okay, we now know that ValS ends in the same block that the CopyMI
+  // live-range starts.  If there are no intervening live segments between them
+  // in IntB, we can merge them.
+  if (ValS+1 != BS) return false;
 
   DEBUG(dbgs() << "Extending: " << PrintReg(IntB.reg, TRI));
 
-  SlotIndex FillerStart = ValLR->end, FillerEnd = BLR->start;
+  SlotIndex FillerStart = ValS->end, FillerEnd = BS->start;
   // We are about to delete CopyMI, so need to remove it as the 'instruction
   // that defines this value #'. Update the valnum with the new defining
   // instruction #.
   BValNo->def = FillerStart;
 
   // Okay, we can merge them.  We need to insert a new liverange:
-  // [ValLR.end, BLR.begin) of either value number, then we merge the
+  // [ValS.end, BS.begin) of either value number, then we merge the
   // two value numbers.
-  IntB.addRange(LiveRange(FillerStart, FillerEnd, BValNo));
+  IntB.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, BValNo));
 
   // Okay, merge "B1" into the same value number as "B0".
-  if (BValNo != ValLR->valno)
-    IntB.MergeValueNumberInto(BValNo, ValLR->valno);
+  if (BValNo != ValS->valno)
+    IntB.MergeValueNumberInto(BValNo, ValS->valno);
   DEBUG(dbgs() << "   result = " << IntB << '\n');
 
   // If the source instruction was killing the source register before the
   // merge, unset the isKill marker given the live range has been extended.
-  int UIdx = ValLREndInst->findRegisterUseOperandIdx(IntB.reg, true);
+  int UIdx = ValSEndInst->findRegisterUseOperandIdx(IntB.reg, true);
   if (UIdx != -1) {
-    ValLREndInst->getOperand(UIdx).setIsKill(false);
+    ValSEndInst->getOperand(UIdx).setIsKill(false);
   }
 
   // Rewrite the copy. If the copy instruction was killing the destination
   // register before the merge, find the last use and trim the live range. That
   // will also add the isKill marker.
   CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI);
-  if (ALR->end == CopyIdx)
+  if (AS->end == CopyIdx)
     LIS->shrinkToUses(&IntA);
 
   ++numExtends;
@@ -527,11 +527,11 @@ bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
   for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
        AI != AE; ++AI) {
     if (AI->valno != AValNo) continue;
-    LiveInterval::Ranges::iterator BI =
-      std::upper_bound(IntB.ranges.begin(), IntB.ranges.end(), AI->start);
-    if (BI != IntB.ranges.begin())
+    LiveInterval::iterator BI =
+      std::upper_bound(IntB.begin(), IntB.end(), AI->start);
+    if (BI != IntB.begin())
       --BI;
-    for (; BI != IntB.ranges.end() && AI->end >= BI->start; ++BI) {
+    for (; BI != IntB.end() && AI->end >= BI->start; ++BI) {
       if (BI->valno == BValNo)
         continue;
       if (BI->start <= AI->start && BI->end > AI->start)
@@ -577,14 +577,12 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
   LiveInterval &IntB =
     LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
 
-  // BValNo is a value number in B that is defined by a copy from A. 'B3' in
+  // BValNo is a value number in B that is defined by a copy from A. 'B1' in
   // the example above.
   VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
   if (!BValNo || BValNo->def != CopyIdx)
     return false;
 
-  assert(BValNo->def == CopyIdx && "Copy doesn't define the value?");
-
   // AValNo is the value number in A that defines the copy, A3 in the example.
   VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
   assert(AValNo && "COPY source not live");
@@ -614,7 +612,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
 
   MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
   unsigned NewReg = NewDstMO.getReg();
-  if (NewReg != IntB.reg || !LiveRangeQuery(IntB, AValNo->def).isKill())
+  if (NewReg != IntB.reg || !IntB.Query(AValNo->def).isKill())
     return false;
 
   // Make sure there are no other definitions of IntB that would reach the
@@ -629,8 +627,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
        UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
     MachineInstr *UseMI = &*UI;
     SlotIndex UseIdx = LIS->getInstructionIndex(UseMI);
-    LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
-    if (ULR == IntA.end() || ULR->valno != AValNo)
+    LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
+    if (US == IntA.end() || US->valno != AValNo)
       continue;
     // If this use is tied to a def, we can't rewrite the register.
     if (UseMI->isRegTiedToDefOperand(UI.getOperandNo()))
@@ -681,8 +679,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
       continue;
     }
     SlotIndex UseIdx = LIS->getInstructionIndex(UseMI).getRegSlot(true);
-    LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
-    if (ULR == IntA.end() || ULR->valno != AValNo)
+    LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
+    if (US == IntA.end() || US->valno != AValNo)
       continue;
     // Kill flags are no longer accurate. They are recomputed after RA.
     UseMO.setIsKill(false);
@@ -712,14 +710,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
     UseMI->eraseFromParent();
   }
 
-  // Extend BValNo by merging in IntA live ranges of AValNo. Val# definition
+  // Extend BValNo by merging in IntA live segments of AValNo. Val# definition
   // is updated.
   VNInfo *ValNo = BValNo;
   ValNo->def = AValNo->def;
   for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
        AI != AE; ++AI) {
     if (AI->valno != AValNo) continue;
-    IntB.addRange(LiveRange(AI->start, AI->end, ValNo));
+    IntB.addSegment(LiveInterval::Segment(AI->start, AI->end, ValNo));
   }
   DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
 
@@ -744,7 +742,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
 
   LiveInterval &SrcInt = LIS->getInterval(SrcReg);
   SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI);
-  VNInfo *ValNo = LiveRangeQuery(SrcInt, CopyIdx).valueIn();
+  VNInfo *ValNo = SrcInt.Query(CopyIdx).valueIn();
   assert(ValNo && "CopyMI input register not live");
   if (ValNo->isPHIDef() || ValNo->isUnused())
     return false;
@@ -876,8 +874,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
   for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
     unsigned Reg = NewMIImplDefs[i];
     for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
-      if (LiveInterval *LI = LIS->getCachedRegUnit(*Units))
-        LI->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
+      if (LiveRange *LR = LIS->getCachedRegUnit(*Units))
+        LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
   }
 
   DEBUG(dbgs() << "Remat: " << *NewMI);
@@ -1048,7 +1046,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
   if (CP.getSrcReg() == CP.getDstReg()) {
     LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
     DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
-    LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(CopyMI));
+    LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(CopyMI));
     if (VNInfo *DefVNI = LRQ.valueDefined()) {
       VNInfo *ReadVNI = LRQ.valueIn();
       assert(ReadVNI && "No value before copy and no <undef> flag.");
@@ -1091,8 +1089,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
     });
 
     // When possible, let DstReg be the larger interval.
-    if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).ranges.size() >
-                           LIS->getInterval(CP.getDstReg()).ranges.size())
+    if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() >
+                           LIS->getInterval(CP.getDstReg()).size())
       CP.flip();
   }
 
@@ -1109,7 +1107,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
     if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
       return true;
 
-    // If we can eliminate the copy without merging the live ranges, do so now.
+    // If we can eliminate the copy without merging the live segments, do so
+    // now.
     if (!CP.isPartial() && !CP.isPhys()) {
       if (adjustCopiesBackFrom(CP, CopyMI) ||
           removeCopyByCommutingDef(CP, CopyMI)) {
@@ -1157,10 +1156,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
   TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
 
   DEBUG({
-    dbgs() << "\tJoined. Result = " << PrintReg(CP.getDstReg(), TRI);
-    if (!CP.isPhys())
+    dbgs() << "\tJoined. Result = ";
+    if (CP.isPhys())
+      dbgs() << PrintReg(CP.getDstReg(), TRI);
+    else
       dbgs() << LIS->getInterval(CP.getDstReg());
-     dbgs() << '\n';
+    dbgs() << '\n';
   });
 
   ++numJoins;
@@ -1172,8 +1173,7 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
   assert(CP.isPhys() && "Must be a physreg copy");
   assert(MRI->isReserved(CP.getDstReg()) && "Not a reserved register");
   LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
-  DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
-               << '\n');
+  DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
 
   assert(CP.isFlipped() && RHS.containsOneValue() &&
          "Invalid join with reserved register");
@@ -1442,7 +1442,7 @@ VNInfo *JoinVals::stripCopies(VNInfo *VNI) {
     unsigned Reg = MI->getOperand(1).getReg();
     if (!TargetRegisterInfo::isVirtualRegister(Reg))
       break;
-    LiveRangeQuery LRQ(LIS->getInterval(Reg), VNI->def);
+    LiveQueryResult LRQ = LIS->getInterval(Reg).Query(VNI->def);
     if (!LRQ.valueIn())
       break;
     VNI = LRQ.valueIn();
@@ -1493,7 +1493,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
     // The <read-undef> flag on the def operand means that old lane values are
     // not important.
     if (Redef) {
-      V.RedefVNI = LiveRangeQuery(LI, VNI->def).valueIn();
+      V.RedefVNI = LI.Query(VNI->def).valueIn();
       assert(V.RedefVNI && "Instruction is reading nonexistent value");
       computeAssignment(V.RedefVNI->id, Other);
       V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
@@ -1510,7 +1510,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   }
 
   // Find the value in Other that overlaps VNI->def, if any.
-  LiveRangeQuery OtherLRQ(Other.LI, VNI->def);
+  LiveQueryResult OtherLRQ = Other.LI.Query(VNI->def);
 
   // It is possible that both values are defined by the same instruction, or
   // the values are PHIs defined in the same block. When that happens, the two
@@ -1969,8 +1969,8 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
   JoinVals RHSVals(RHS, CP.getSrcIdx(), NewVNInfo, CP, LIS, TRI);
   JoinVals LHSVals(LHS, CP.getDstIdx(), NewVNInfo, CP, LIS, TRI);
 
-  DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
-               << "\n\t\tLHS = " << PrintReg(CP.getDstReg()) << ' ' << LHS
+  DEBUG(dbgs() << "\t\tRHS = " << RHS
+               << "\n\t\tLHS = " << LHS
                << '\n');
 
   // First compute NewVNInfo and the simple value mappings.
@@ -2001,8 +2001,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
     LIS->shrinkToUses(&LIS->getInterval(ShrinkRegs.pop_back_val()));
 
   // Join RHS into LHS.
-  LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo,
-           MRI);
+  LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo);
 
   // Kill flags are going to be wrong if the live ranges were overlapping.
   // Eventually, we should simply clear all kill flags when computing live
@@ -2017,7 +2016,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
   // CR_Replace conflicts.
   DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
                << " points: " << LHS << '\n');
-  LIS->extendToIndices(&LHS, EndPoints);
+  LIS->extendToIndices(LHS, EndPoints);
   return true;
 }
 
@@ -2043,9 +2042,8 @@ struct MBBPriorityInfo {
 // block (the unsigned), and then on the MBB number.
 //
 // EnableGlobalCopies assumes that the primary sort key is loop depth.
-static int compareMBBPriority(const void *L, const void *R) {
-  const MBBPriorityInfo *LHS = static_cast<const MBBPriorityInfo*>(L);
-  const MBBPriorityInfo *RHS = static_cast<const MBBPriorityInfo*>(R);
+static int compareMBBPriority(const MBBPriorityInfo *LHS,
+                              const MBBPriorityInfo *RHS) {
   // Deeper loops first
   if (LHS->Depth != RHS->Depth)
     return LHS->Depth > RHS->Depth ? -1 : 1;
@@ -2203,7 +2201,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
 
   const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
   if (EnableGlobalCopies == cl::BOU_UNSET)
-    JoinGlobalCopies = ST.enableMachineScheduler();
+    JoinGlobalCopies = ST.useMachineScheduler();
   else
     JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
 
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index b7ab138..092ecdd 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -25,53 +25,19 @@ using namespace llvm;
 
 /// Increase pressure for each pressure set provided by TargetRegisterInfo.
 static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
-                                std::vector<unsigned> &MaxSetPressure,
-                                const int *PSet, unsigned Weight) {
-  for (; *PSet != -1; ++PSet) {
-    CurrSetPressure[*PSet] += Weight;
-    if (&CurrSetPressure != &MaxSetPressure
-        && CurrSetPressure[*PSet] > MaxSetPressure[*PSet]) {
-      MaxSetPressure[*PSet] = CurrSetPressure[*PSet];
-    }
-  }
+                                PSetIterator PSetI) {
+  unsigned Weight = PSetI.getWeight();
+  for (; PSetI.isValid(); ++PSetI)
+    CurrSetPressure[*PSetI] += Weight;
 }
 
 /// Decrease pressure for each pressure set provided by TargetRegisterInfo.
 static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
-                                const int *PSet, unsigned Weight) {
-  for (; *PSet != -1; ++PSet) {
-    assert(CurrSetPressure[*PSet] >= Weight && "register pressure underflow");
-    CurrSetPressure[*PSet] -= Weight;
-  }
-}
-
-/// Directly increase pressure only within this RegisterPressure result.
-void RegisterPressure::increase(unsigned Reg, const TargetRegisterInfo *TRI,
-                                const MachineRegisterInfo *MRI) {
-  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
-    increaseSetPressure(MaxSetPressure, MaxSetPressure,
-                        TRI->getRegClassPressureSets(RC),
-                        TRI->getRegClassWeight(RC).RegWeight);
-  }
-  else {
-    increaseSetPressure(MaxSetPressure, MaxSetPressure,
-                        TRI->getRegUnitPressureSets(Reg),
-                        TRI->getRegUnitWeight(Reg));
-  }
-}
-
-/// Directly decrease pressure only within this RegisterPressure result.
-void RegisterPressure::decrease(unsigned Reg, const TargetRegisterInfo *TRI,
-                                const MachineRegisterInfo *MRI) {
-  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
-    decreaseSetPressure(MaxSetPressure, TRI->getRegClassPressureSets(RC),
-                        TRI->getRegClassWeight(RC).RegWeight);
-  }
-  else {
-    decreaseSetPressure(MaxSetPressure, TRI->getRegUnitPressureSets(Reg),
-                        TRI->getRegUnitWeight(Reg));
+                                PSetIterator PSetI) {
+  unsigned Weight = PSetI.getWeight();
+  for (; PSetI.isValid(); ++PSetI) {
+    assert(CurrSetPressure[*PSetI] >= Weight && "register pressure underflow");
+    CurrSetPressure[*PSetI] -= Weight;
   }
 }
 
@@ -113,36 +79,23 @@ void RegPressureTracker::dump() const {
 
 /// Increase the current pressure as impacted by these registers and bump
 /// the high water mark if needed.
-void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> Regs) {
-  for (unsigned I = 0, E = Regs.size(); I != E; ++I) {
-    if (TargetRegisterInfo::isVirtualRegister(Regs[I])) {
-      const TargetRegisterClass *RC = MRI->getRegClass(Regs[I]);
-      increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
-                          TRI->getRegClassPressureSets(RC),
-                          TRI->getRegClassWeight(RC).RegWeight);
-    }
-    else {
-      increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
-                          TRI->getRegUnitPressureSets(Regs[I]),
-                          TRI->getRegUnitWeight(Regs[I]));
+void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) {
+  for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+    PSetIterator PSetI = MRI->getPressureSets(RegUnits[i]);
+    unsigned Weight = PSetI.getWeight();
+    for (; PSetI.isValid(); ++PSetI) {
+      CurrSetPressure[*PSetI] += Weight;
+      if (CurrSetPressure[*PSetI] > P.MaxSetPressure[*PSetI]) {
+        P.MaxSetPressure[*PSetI] = CurrSetPressure[*PSetI];
+      }
     }
   }
 }
 
 /// Simply decrease the current pressure as impacted by these registers.
-void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> Regs) {
-  for (unsigned I = 0, E = Regs.size(); I != E; ++I) {
-    if (TargetRegisterInfo::isVirtualRegister(Regs[I])) {
-      const TargetRegisterClass *RC = MRI->getRegClass(Regs[I]);
-      decreaseSetPressure(CurrSetPressure,
-                          TRI->getRegClassPressureSets(RC),
-                          TRI->getRegClassWeight(RC).RegWeight);
-    }
-    else {
-      decreaseSetPressure(CurrSetPressure, TRI->getRegUnitPressureSets(Regs[I]),
-                          TRI->getRegUnitWeight(Regs[I]));
-    }
-  }
+void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> RegUnits) {
+  for (unsigned I = 0, E = RegUnits.size(); I != E; ++I)
+    decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnits[I]));
 }
 
 /// Clear the result so it can be used for another round of pressure tracking.
@@ -194,12 +147,30 @@ void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) {
   LiveInRegs.clear();
 }
 
-const LiveInterval *RegPressureTracker::getInterval(unsigned Reg) const {
+const LiveRange *RegPressureTracker::getLiveRange(unsigned Reg) const {
   if (TargetRegisterInfo::isVirtualRegister(Reg))
     return &LIS->getInterval(Reg);
   return LIS->getCachedRegUnit(Reg);
 }
 
+void RegPressureTracker::reset() {
+  MBB = 0;
+  LIS = 0;
+
+  CurrSetPressure.clear();
+  LiveThruPressure.clear();
+  P.MaxSetPressure.clear();
+
+  if (RequireIntervals)
+    static_cast<IntervalPressure&>(P).reset();
+  else
+    static_cast<RegionPressure&>(P).reset();
+
+  LiveRegs.PhysRegs.clear();
+  LiveRegs.VirtRegs.clear();
+  UntiedDefs.clear();
+}
+
 /// Setup the RegPressureTracker.
 ///
 /// TODO: Add support for pressure without LiveIntervals.
@@ -210,6 +181,8 @@ void RegPressureTracker::init(const MachineFunction *mf,
                               MachineBasicBlock::const_iterator pos,
                               bool ShouldTrackUntiedDefs)
 {
+  reset();
+
   MF = mf;
   TRI = MF->getTarget().getRegisterInfo();
   RCI = rci;
@@ -224,19 +197,11 @@ void RegPressureTracker::init(const MachineFunction *mf,
 
   CurrPos = pos;
   CurrSetPressure.assign(TRI->getNumRegPressureSets(), 0);
-  LiveThruPressure.clear();
 
-  if (RequireIntervals)
-    static_cast<IntervalPressure&>(P).reset();
-  else
-    static_cast<RegionPressure&>(P).reset();
   P.MaxSetPressure = CurrSetPressure;
 
-  LiveRegs.PhysRegs.clear();
   LiveRegs.PhysRegs.setUniverse(TRI->getNumRegs());
-  LiveRegs.VirtRegs.clear();
   LiveRegs.VirtRegs.setUniverse(MRI->getNumVirtRegs());
-  UntiedDefs.clear();
   if (TrackUntiedDefs)
     UntiedDefs.setUniverse(MRI->getNumVirtRegs());
 }
@@ -328,24 +293,25 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
     unsigned Reg = P.LiveOutRegs[i];
     if (TargetRegisterInfo::isVirtualRegister(Reg)
         && !RPTracker.hasUntiedDef(Reg)) {
-      const TargetRegisterClass *RC = MRI->getRegClass(Reg);
-      increaseSetPressure(LiveThruPressure, LiveThruPressure,
-                          TRI->getRegClassPressureSets(RC),
-                          TRI->getRegClassWeight(RC).RegWeight);
+      increaseSetPressure(LiveThruPressure, MRI->getPressureSets(Reg));
     }
   }
 }
 
 /// \brief Convenient wrapper for checking membership in RegisterOperands.
-static bool containsReg(ArrayRef<unsigned> Regs, unsigned Reg) {
-  return std::find(Regs.begin(), Regs.end(), Reg) != Regs.end();
+/// (std::count() doesn't have an early exit).
+static bool containsReg(ArrayRef<unsigned> RegUnits, unsigned RegUnit) {
+  return std::find(RegUnits.begin(), RegUnits.end(), RegUnit) != RegUnits.end();
 }
 
 /// Collect this instruction's unique uses and defs into SmallVectors for
 /// processing defs and uses in order.
+///
+/// FIXME: always ignore tied opers
 class RegisterOperands {
   const TargetRegisterInfo *TRI;
   const MachineRegisterInfo *MRI;
+  bool IgnoreDead;
 
 public:
   SmallVector<unsigned, 8> Uses;
@@ -353,7 +319,8 @@ public:
   SmallVector<unsigned, 8> DeadDefs;
 
   RegisterOperands(const TargetRegisterInfo *tri,
-                   const MachineRegisterInfo *mri): TRI(tri), MRI(mri) {}
+                   const MachineRegisterInfo *mri, bool ID = false):
+    TRI(tri), MRI(mri), IgnoreDead(ID) {}
 
   /// Push this operand's register onto the correct vector.
   void collect(const MachineOperand &MO) {
@@ -362,25 +329,27 @@ public:
     if (MO.readsReg())
       pushRegUnits(MO.getReg(), Uses);
     if (MO.isDef()) {
-      if (MO.isDead())
-        pushRegUnits(MO.getReg(), DeadDefs);
+      if (MO.isDead()) {
+        if (!IgnoreDead)
+          pushRegUnits(MO.getReg(), DeadDefs);
+      }
       else
         pushRegUnits(MO.getReg(), Defs);
     }
   }
 
 protected:
-  void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &Regs) {
+  void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &RegUnits) {
     if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-      if (containsReg(Regs, Reg))
+      if (containsReg(RegUnits, Reg))
         return;
-      Regs.push_back(Reg);
+      RegUnits.push_back(Reg);
     }
     else if (MRI->isAllocatable(Reg)) {
       for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
-        if (containsReg(Regs, *Units))
+        if (containsReg(RegUnits, *Units))
           continue;
-        Regs.push_back(*Units);
+        RegUnits.push_back(*Units);
       }
     }
   }
@@ -399,6 +368,56 @@ static void collectOperands(const MachineInstr *MI,
   RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end());
 }
 
+/// Initialize an array of N PressureDiffs.
+void PressureDiffs::init(unsigned N) {
+  Size = N;
+  if (N <= Max) {
+    memset(PDiffArray, 0, N * sizeof(PressureDiff));
+    return;
+  }
+  Max = Size;
+  free(PDiffArray);
+  PDiffArray = reinterpret_cast<PressureDiff*>(calloc(N, sizeof(PressureDiff)));
+}
+
+/// Add a change in pressure to the pressure diff of a given instruction.
+void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec,
+                                     const MachineRegisterInfo *MRI) {
+  PSetIterator PSetI = MRI->getPressureSets(RegUnit);
+  int Weight = IsDec ? -PSetI.getWeight() : PSetI.getWeight();
+  for (; PSetI.isValid(); ++PSetI) {
+    // Find an existing entry in the pressure diff for this PSet.
+    PressureDiff::iterator I = begin(), E = end();
+    for (; I != E && I->isValid(); ++I) {
+      if (I->getPSet() >= *PSetI)
+        break;
+    }
+    // If all pressure sets are more constrained, skip the remaining PSets.
+    if (I == E)
+      break;
+    // Insert this PressureChange.
+    if (!I->isValid() || I->getPSet() != *PSetI) {
+      PressureChange PTmp = PressureChange(*PSetI);
+      for (PressureDiff::iterator J = I; J != E && PTmp.isValid(); ++J)
+        std::swap(*J,PTmp);
+    }
+    // Update the units for this pressure set.
+    I->setUnitInc(I->getUnitInc() + Weight);
+  }
+}
+
+/// Record the pressure difference induced by the given operand list.
+static void collectPDiff(PressureDiff &PDiff, RegisterOperands &RegOpers,
+                         const MachineRegisterInfo *MRI) {
+  assert(!PDiff.begin()->isValid() && "stale PDiff");
+
+  for (unsigned i = 0, e = RegOpers.Defs.size(); i != e; ++i)
+    PDiff.addPressureChange(RegOpers.Defs[i], true, MRI);
+
+  for (unsigned i = 0, e = RegOpers.Uses.size(); i != e; ++i)
+    PDiff.addPressureChange(RegOpers.Uses[i], false, MRI);
+}
+
 /// Force liveness of registers.
 void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) {
   for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
@@ -415,7 +434,7 @@ void RegPressureTracker::discoverLiveIn(unsigned Reg) {
 
   // At live in discovery, unconditionally increase the high water mark.
   P.LiveInRegs.push_back(Reg);
-  P.increase(Reg, TRI, MRI);
+  increaseSetPressure(P.MaxSetPressure, MRI->getPressureSets(Reg));
 }
 
 /// Add Reg to the live out set and increase max pressure.
@@ -426,11 +445,16 @@ void RegPressureTracker::discoverLiveOut(unsigned Reg) {
 
   // At live out discovery, unconditionally increase the high water mark.
   P.LiveOutRegs.push_back(Reg);
-  P.increase(Reg, TRI, MRI);
+  increaseSetPressure(P.MaxSetPressure, MRI->getPressureSets(Reg));
 }
 
-/// Recede across the previous instruction.
-bool RegPressureTracker::recede() {
+/// Recede across the previous instruction. If LiveUses is provided, record any
+/// RegUnits that are made live by the current instruction's uses. This includes
+/// registers that are both defined and used by the instruction.  If a pressure
+/// difference pointer is provided record the changes is pressure caused by this
+/// instruction independent of liveness.
+bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
+                                PressureDiff *PDiff) {
   // Check for the top of the analyzable region.
   if (CurrPos == MBB->begin()) {
     closeRegion();
@@ -463,6 +487,9 @@ bool RegPressureTracker::recede() {
   RegisterOperands RegOpers(TRI, MRI);
   collectOperands(CurrPos, RegOpers);
 
+  if (PDiff)
+    collectPDiff(*PDiff, RegOpers, MRI);
+
   // Boost pressure for all dead defs together.
   increaseRegPressure(RegOpers.DeadDefs);
   decreaseRegPressure(RegOpers.DeadDefs);
@@ -471,10 +498,20 @@ bool RegPressureTracker::recede() {
   // TODO: consider earlyclobbers?
   for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
     unsigned Reg = RegOpers.Defs[i];
-    if (LiveRegs.erase(Reg))
-      decreaseRegPressure(Reg);
-    else
-      discoverLiveOut(Reg);
+    bool DeadDef = false;
+    if (RequireIntervals) {
+      const LiveRange *LR = getLiveRange(Reg);
+      if (LR) {
+        LiveQueryResult LRQ = LR->Query(SlotIdx);
+        DeadDef = LRQ.isDeadDef();
+      }
+    }
+    if (!DeadDef) {
+      if (LiveRegs.erase(Reg))
+        decreaseRegPressure(Reg);
+      else
+        discoverLiveOut(Reg);
+    }
   }
 
   // Generate liveness for uses.
@@ -483,12 +520,17 @@ bool RegPressureTracker::recede() {
     if (!LiveRegs.contains(Reg)) {
       // Adjust liveouts if LiveIntervals are available.
       if (RequireIntervals) {
-        const LiveInterval *LI = getInterval(Reg);
-        if (LI && !LI->killedAt(SlotIdx))
-          discoverLiveOut(Reg);
+        const LiveRange *LR = getLiveRange(Reg);
+        if (LR) {
+          LiveQueryResult LRQ = LR->Query(SlotIdx);
+          if (!LRQ.isKill() && !LRQ.valueDefined())
+            discoverLiveOut(Reg);
+        }
       }
       increaseRegPressure(Reg);
       LiveRegs.insert(Reg);
+      if (LiveUses && !containsReg(*LiveUses, Reg))
+        LiveUses->push_back(Reg);
     }
   }
   if (TrackUntiedDefs) {
@@ -537,8 +579,8 @@ bool RegPressureTracker::advance() {
     // Kill liveness at last uses.
     bool lastUse = false;
     if (RequireIntervals) {
-      const LiveInterval *LI = getInterval(Reg);
-      lastUse = LI && LI->killedAt(SlotIdx);
+      const LiveRange *LR = getLiveRange(Reg);
+      lastUse = LR && LR->Query(SlotIdx).isKill();
     }
     else {
       // Allocatable physregs are always single-use before register rewriting.
@@ -576,8 +618,7 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
                                        RegPressureDelta &Delta,
                                        const RegisterClassInfo *RCI,
                                        ArrayRef<unsigned> LiveThruPressureVec) {
-  int ExcessUnits = 0;
-  unsigned PSetID = ~0U;
+  Delta.Excess = PressureChange();
   for (unsigned i = 0, e = OldPressureVec.size(); i < e; ++i) {
     unsigned POld = OldPressureVec[i];
     unsigned PNew = NewPressureVec[i];
@@ -599,13 +640,11 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
       PDiff = Limit - POld;   // Just obeyed limit.
 
     if (PDiff) {
-      ExcessUnits = PDiff;
-      PSetID = i;
+      Delta.Excess = PressureChange(i);
+      Delta.Excess.setUnitInc(PDiff);
       break;
     }
   }
-  Delta.Excess.PSetID = PSetID;
-  Delta.Excess.UnitIncrease = ExcessUnits;
 }
 
 /// Find the max change in max pressure that either surpasses a critical PSet
@@ -616,11 +655,11 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
 /// RegPressureTracker API change to work with pressure differences.
 static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
                                     ArrayRef<unsigned> NewMaxPressureVec,
-                                    ArrayRef<PressureElement> CriticalPSets,
+                                    ArrayRef<PressureChange> CriticalPSets,
                                     ArrayRef<unsigned> MaxPressureLimit,
                                     RegPressureDelta &Delta) {
-  Delta.CriticalMax = PressureElement();
-  Delta.CurrentMax = PressureElement();
+  Delta.CriticalMax = PressureChange();
+  Delta.CurrentMax = PressureChange();
 
   unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
   for (unsigned i = 0, e = OldMaxPressureVec.size(); i < e; ++i) {
@@ -630,27 +669,24 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
       continue;
 
     if (!Delta.CriticalMax.isValid()) {
-      while (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID < i)
+      while (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() < i)
         ++CritIdx;
 
-      if (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID == i) {
-        int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].UnitIncrease;
+      if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) {
+        int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc();
         if (PDiff > 0) {
-          Delta.CriticalMax.PSetID = i;
-          Delta.CriticalMax.UnitIncrease = PDiff;
+          Delta.CriticalMax = PressureChange(i);
+          Delta.CriticalMax.setUnitInc(PDiff);
         }
       }
     }
     // Find the first increase above MaxPressureLimit.
     // (Ignores negative MDiff).
-    if (!Delta.CurrentMax.isValid()) {
-      int MDiff = (int)PNew - (int)MaxPressureLimit[i];
-      if (MDiff > 0) {
-        Delta.CurrentMax.PSetID = i;
-        Delta.CurrentMax.UnitIncrease = MDiff;
-        if (CritIdx == CritEnd || Delta.CriticalMax.isValid())
-          break;
-      }
+    if (!Delta.CurrentMax.isValid() && PNew > MaxPressureLimit[i]) {
+      Delta.CurrentMax = PressureChange(i);
+      Delta.CurrentMax.setUnitInc(PNew - POld);
+      if (CritIdx == CritEnd || Delta.CriticalMax.isValid())
+        break;
     }
   }
 }
@@ -665,7 +701,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
   assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
 
   // Account for register pressure similar to RegPressureTracker::recede().
-  RegisterOperands RegOpers(TRI, MRI);
+  RegisterOperands RegOpers(TRI, MRI, /*IgnoreDead=*/true);
   collectOperands(MI, RegOpers);
 
   // Boost max pressure for all dead defs together.
@@ -676,8 +712,19 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
   // Kill liveness at live defs.
   for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
     unsigned Reg = RegOpers.Defs[i];
-    if (!containsReg(RegOpers.Uses, Reg))
-      decreaseRegPressure(Reg);
+    bool DeadDef = false;
+    if (RequireIntervals) {
+      const LiveRange *LR = getLiveRange(Reg);
+      if (LR) {
+        SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
+        LiveQueryResult LRQ = LR->Query(SlotIdx);
+        DeadDef = LRQ.isDeadDef();
+      }
+    }
+    if (!DeadDef) {
+      if (!containsReg(RegOpers.Uses, Reg))
+        decreaseRegPressure(Reg);
+    }
   }
   // Generate liveness for uses.
   for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
@@ -699,8 +746,9 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
 /// result per-SUnit with enough information to adjust for the current
 /// scheduling position. But this works as a proof of concept.
 void RegPressureTracker::
-getMaxUpwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
-                          ArrayRef<PressureElement> CriticalPSets,
+getMaxUpwardPressureDelta(const MachineInstr *MI, PressureDiff *PDiff,
+                          RegPressureDelta &Delta,
+                          ArrayRef<PressureChange> CriticalPSets,
                           ArrayRef<unsigned> MaxPressureLimit) {
   // Snapshot Pressure.
   // FIXME: The snapshot heap space should persist. But I'm planning to
@@ -714,12 +762,113 @@ getMaxUpwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
                              LiveThruPressure);
   computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
                           MaxPressureLimit, Delta);
-  assert(Delta.CriticalMax.UnitIncrease >= 0 &&
-         Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+  assert(Delta.CriticalMax.getUnitInc() >= 0 &&
+         Delta.CurrentMax.getUnitInc() >= 0 && "cannot decrease max pressure");
 
   // Restore the tracker's state.
   P.MaxSetPressure.swap(SavedMaxPressure);
   CurrSetPressure.swap(SavedPressure);
+
+#ifndef NDEBUG
+  if (!PDiff)
+    return;
+
+  // Check if the alternate algorithm yields the same result.
+  RegPressureDelta Delta2;
+  getUpwardPressureDelta(MI, *PDiff, Delta2, CriticalPSets, MaxPressureLimit);
+  if (Delta != Delta2) {
+    dbgs() << "DELTA: " << *MI;
+    if (Delta.Excess.isValid())
+      dbgs() << "Excess1 " << TRI->getRegPressureSetName(Delta.Excess.getPSet())
+             << " " << Delta.Excess.getUnitInc() << "\n";
+    if (Delta.CriticalMax.isValid())
+      dbgs() << "Critic1 " << TRI->getRegPressureSetName(Delta.CriticalMax.getPSet())
+             << " " << Delta.CriticalMax.getUnitInc() << "\n";
+    if (Delta.CurrentMax.isValid())
+      dbgs() << "CurrMx1 " << TRI->getRegPressureSetName(Delta.CurrentMax.getPSet())
+             << " " << Delta.CurrentMax.getUnitInc() << "\n";
+    if (Delta2.Excess.isValid())
+      dbgs() << "Excess2 " << TRI->getRegPressureSetName(Delta2.Excess.getPSet())
+             << " " << Delta2.Excess.getUnitInc() << "\n";
+    if (Delta2.CriticalMax.isValid())
+      dbgs() << "Critic2 " << TRI->getRegPressureSetName(Delta2.CriticalMax.getPSet())
+             << " " << Delta2.CriticalMax.getUnitInc() << "\n";
+    if (Delta2.CurrentMax.isValid())
+      dbgs() << "CurrMx2 " << TRI->getRegPressureSetName(Delta2.CurrentMax.getPSet())
+             << " " << Delta2.CurrentMax.getUnitInc() << "\n";
+    llvm_unreachable("RegP Delta Mismatch");
+  }
+#endif
+}
+
+/// This is a prototype of the fast version of querying register pressure that
+/// does not directly depend on current liveness. It's still slow because we
+/// recompute pressure change on-the-fly. This implementation only exists to
+/// prove correctness.
+///
+/// @param Delta captures information needed for heuristics.
+///
+/// @param CriticalPSets Are the pressure sets that are known to exceed some
+/// limit within the region, not necessarily at the current position.
+///
+/// @param MaxPressureLimit Is the max pressure within the region, not
+/// necessarily at the current position.
+void RegPressureTracker::
+getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
+                       RegPressureDelta &Delta,
+                       ArrayRef<PressureChange> CriticalPSets,
+                       ArrayRef<unsigned> MaxPressureLimit) const {
+  unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
+  for (PressureDiff::const_iterator
+         PDiffI = PDiff.begin(), PDiffE = PDiff.end();
+       PDiffI != PDiffE && PDiffI->isValid(); ++PDiffI) {
+
+    unsigned PSetID = PDiffI->getPSet();
+    unsigned Limit = RCI->getRegPressureSetLimit(PSetID);
+    if (!LiveThruPressure.empty())
+      Limit += LiveThruPressure[PSetID];
+
+    unsigned POld = CurrSetPressure[PSetID];
+    unsigned MOld = P.MaxSetPressure[PSetID];
+    unsigned MNew = MOld;
+    // Ignore DeadDefs here because they aren't captured by PressureChange.
+    unsigned PNew = POld + PDiffI->getUnitInc();
+    assert((PDiffI->getUnitInc() >= 0) == (PNew >= POld) && "PSet overflow");
+    if (PNew > MOld)
+      MNew = PNew;
+    // Check if current pressure has exceeded the limit.
+    if (!Delta.Excess.isValid()) {
+      unsigned ExcessInc = 0;
+      if (PNew > Limit)
+        ExcessInc = POld > Limit ? PNew - POld : PNew - Limit;
+      else if (POld > Limit)
+        ExcessInc = Limit - POld;
+      if (ExcessInc) {
+        Delta.Excess = PressureChange(PSetID);
+        Delta.Excess.setUnitInc(ExcessInc);
+      }
+    }
+    // Check if max pressure has exceeded a critical pressure set max.
+    if (MNew == MOld)
+      continue;
+    if (!Delta.CriticalMax.isValid()) {
+      while (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() < PSetID)
+        ++CritIdx;
+
+      if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
+        int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
+        if (CritInc > 0 && CritInc <= INT16_MAX) {
+          Delta.CriticalMax = PressureChange(PSetID);
+          Delta.CriticalMax.setUnitInc(CritInc);
+        }
+      }
+    }
+    // Check if max pressure has exceeded the current max.
+    if (!Delta.CurrentMax.isValid() && MNew > MaxPressureLimit[PSetID]) {
+      Delta.CurrentMax = PressureChange(PSetID);
+      Delta.CurrentMax.setUnitInc(MNew - MOld);
+    }
+  }
 }
 
 /// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
@@ -765,10 +914,12 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
       // FIXME: allow the caller to pass in the list of vreg uses that remain
       // to be bottom-scheduled to avoid searching uses at each query.
       SlotIndex CurrIdx = getCurrSlot();
-      const LiveInterval *LI = getInterval(Reg);
-      if (LI && LI->killedAt(SlotIdx)
-          && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) {
-        decreaseRegPressure(Reg);
+      const LiveRange *LR = getLiveRange(Reg);
+      if (LR) {
+        LiveQueryResult LRQ = LR->Query(SlotIdx);
+        if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) {
+          decreaseRegPressure(Reg);
+        }
       }
     }
     else if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
@@ -793,7 +944,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
 /// This assumes that the current LiveIn set is sufficient.
 void RegPressureTracker::
 getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
-                            ArrayRef<PressureElement> CriticalPSets,
+                            ArrayRef<PressureChange> CriticalPSets,
                             ArrayRef<unsigned> MaxPressureLimit) {
   // Snapshot Pressure.
   std::vector<unsigned> SavedPressure = CurrSetPressure;
@@ -805,8 +956,8 @@ getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
                              LiveThruPressure);
   computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
                           MaxPressureLimit, Delta);
-  assert(Delta.CriticalMax.UnitIncrease >= 0 &&
-         Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+  assert(Delta.CriticalMax.getUnitInc() >= 0 &&
+         Delta.CurrentMax.getUnitInc() >= 0 && "cannot decrease max pressure");
 
   // Restore the tracker's state.
   P.MaxSetPressure.swap(SavedMaxPressure);
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 892903c..7f1f9c4 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -36,6 +36,8 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <queue>
+
 using namespace llvm;
 
 static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
@@ -178,14 +180,11 @@ void ScheduleDAGInstrs::finishBlock() {
 void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
                                     MachineBasicBlock::iterator begin,
                                     MachineBasicBlock::iterator end,
-                                    unsigned endcount) {
+                                    unsigned regioninstrs) {
   assert(bb == BB && "startBlock should set BB");
   RegionBegin = begin;
   RegionEnd = end;
-  EndIndex = endcount;
-  MISUnitMap.clear();
-
-  ScheduleDAG::clearDAG();
+  NumRegionInstrs = regioninstrs;
 }
 
 /// Close the current scheduling region. Don't clear any state in case the
@@ -405,9 +404,19 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
   MachineInstr *MI = SU->getInstr();
   unsigned Reg = MI->getOperand(OperIdx).getReg();
 
+  // Record this local VReg use.
+  VReg2UseMap::iterator UI = VRegUses.find(Reg);
+  for (; UI != VRegUses.end(); ++UI) {
+    if (UI->SU == SU)
+      break;
+  }
+  if (UI == VRegUses.end())
+    VRegUses.insert(VReg2SUnit(Reg, SU));
+
   // Lookup this operand's reaching definition.
   assert(LIS && "vreg dependencies requires LiveIntervals");
-  LiveRangeQuery LRQ(LIS->getInterval(Reg), LIS->getInstructionIndex(MI));
+  LiveQueryResult LRQ
+    = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI));
   VNInfo *VNI = LRQ.valueIn();
 
   // VNI will be valid because MachineOperand::readsReg() is checked by caller.
@@ -635,8 +644,7 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
                          bool isNormalMemory = false) {
   // If this is a false dependency,
   // do not add the edge, but rememeber the rejected node.
-  if (!EnableAASchedMI ||
-      MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+  if (!AA || MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
     SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
     Dep.setLatency(TrueMemOrderLatency);
     SUb->addPred(Dep);
@@ -664,7 +672,7 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
 void ScheduleDAGInstrs::initSUnits() {
   // We'll be allocating one SUnit for each real instruction in the region,
   // which is contained within a basic block.
-  SUnits.reserve(BB->size());
+  SUnits.reserve(NumRegionInstrs);
 
   for (MachineBasicBlock::iterator I = RegionBegin; I != RegionEnd; ++I) {
     MachineInstr *MI = I;
@@ -686,10 +694,22 @@ void ScheduleDAGInstrs::initSUnits() {
 /// DAG builder is an efficient place to do it because it already visits
 /// operands.
 void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
-                                        RegPressureTracker *RPTracker) {
+                                        RegPressureTracker *RPTracker,
+                                        PressureDiffs *PDiffs) {
+  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
+                                                       : ST.useAA();
+  AliasAnalysis *AAForDep = UseAA ? AA : 0;
+
+  MISUnitMap.clear();
+  ScheduleDAG::clearDAG();
+
   // Create an SUnit for each real instruction.
   initSUnits();
 
+  if (PDiffs)
+    PDiffs->init(SUnits.size());
+
   // We build scheduling units by walking a block's instruction list from bottom
   // to top.
 
@@ -715,10 +735,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   Uses.setUniverse(TRI->getNumRegs());
 
   assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs");
-  // FIXME: Allow SparseSet to reserve space for the creation of virtual
-  // registers during scheduling. Don't artificially inflate the Universe
-  // because we want to assert that vregs are not created during DAG building.
+  VRegUses.clear();
   VRegDefs.setUniverse(MRI.getNumVirtRegs());
+  VRegUses.setUniverse(MRI.getNumVirtRegs());
 
   // Model data dependencies between instructions being scheduled and the
   // ExitSU.
@@ -738,17 +757,18 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       DbgMI = MI;
       continue;
     }
+    SUnit *SU = MISUnitMap[MI];
+    assert(SU && "No SUnit mapped to this MI");
+
     if (RPTracker) {
-      RPTracker->recede();
+      PressureDiff *PDiff = PDiffs ? &(*PDiffs)[SU->NodeNum] : 0;
+      RPTracker->recede(/*LiveUses=*/0, PDiff);
       assert(RPTracker->getPos() == prior(MII) && "RPTracker can't find MI");
     }
 
     assert((CanHandleTerminators || (!MI->isTerminator() && !MI->isLabel())) &&
            "Cannot schedule terminators or labels!");
 
-    SUnit *SU = MISUnitMap[MI];
-    assert(SU && "No SUnit mapped to this MI");
-
     // Add register-based dependencies (data, anti, and output).
     bool HasVRegDef = false;
     for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
@@ -826,20 +846,20 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
         unsigned ChainLatency = 0;
         if (AliasChain->getInstr()->mayLoad())
           ChainLatency = TrueMemOrderLatency;
-        addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes,
+        addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes,
                            ChainLatency);
       }
       AliasChain = SU;
       for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-        addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+        addChainDependency(AAForDep, MFI, SU, PendingLoads[k], RejectMemNodes,
                            TrueMemOrderLatency);
       for (MapVector<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(),
            E = AliasMemDefs.end(); I != E; ++I)
-        addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
+        addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes);
       for (MapVector<const Value *, std::vector<SUnit *> >::iterator I =
            AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
         for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-          addChainDependency(AA, MFI, SU, I->second[i], RejectMemNodes,
+          addChainDependency(AAForDep, MFI, SU, I->second[i], RejectMemNodes,
                              TrueMemOrderLatency);
       }
       adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
@@ -872,7 +892,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
         MapVector<const Value *, SUnit *>::iterator IE =
           ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
         if (I != IE) {
-          addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
+          addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes,
+                             0, true);
           I->second = SU;
         } else {
           if (ThisMayAlias)
@@ -887,7 +908,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
           ((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
         if (J != JE) {
           for (unsigned i = 0, e = J->second.size(); i != e; ++i)
-            addChainDependency(AA, MFI, SU, J->second[i], RejectMemNodes,
+            addChainDependency(AAForDep, MFI, SU, J->second[i], RejectMemNodes,
                                TrueMemOrderLatency, true);
           J->second.clear();
         }
@@ -896,11 +917,11 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
         // Add dependencies from all the PendingLoads, i.e. loads
         // with no underlying object.
         for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-          addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+          addChainDependency(AAForDep, MFI, SU, PendingLoads[k], RejectMemNodes,
                              TrueMemOrderLatency);
         // Add dependence on alias chain, if needed.
         if (AliasChain)
-          addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+          addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes);
         // But we also should check dependent instructions for the
         // SU in question.
         adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
@@ -930,7 +951,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
           // potentially aliasing stores.
           for (MapVector<const Value *, SUnit *>::iterator I =
                  AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
-            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
+            addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes);
 
           PendingLoads.push_back(SU);
           MayAlias = true;
@@ -952,7 +973,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
           MapVector<const Value *, SUnit *>::iterator IE =
             ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
           if (I != IE)
-            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
+            addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes,
+                               0, true);
           if (ThisMayAlias)
             AliasMemUses[V].push_back(SU);
           else
@@ -962,7 +984,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
           adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes, /*Latency=*/0);
         // Add dependencies on alias and barrier chains, if needed.
         if (MayAlias && AliasChain)
-          addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+          addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes);
         if (BarrierChain)
           BarrierChain->addPred(SDep(SU, SDep::Barrier));
       }
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cb88941..43f72c5 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -35,6 +35,8 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -43,6 +45,7 @@ STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
 STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
 STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");
 STATISTIC(LdStFP2Int      , "Number of fp load/store pairs transformed to int");
+STATISTIC(SlicedLoads, "Number of load sliced");
 
 namespace {
   static cl::opt<bool>
@@ -53,6 +56,14 @@ namespace {
     CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
                cl::desc("Include global information in alias analysis"));
 
+  /// Hidden option to stress test load slicing, i.e., when this option
+  /// is enabled, load slicing bypasses most of its profitability guards.
+  static cl::opt<bool>
+  StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
+                    cl::desc("Bypass the profitability model of load "
+                             "slicing"),
+                    cl::init(false));
+
 //------------------------------ DAGCombiner ---------------------------------//
 
   class DAGCombiner {
@@ -62,6 +73,7 @@ namespace {
     CodeGenOpt::Level OptLevel;
     bool LegalOperations;
     bool LegalTypes;
+    bool ForCodeSize;
 
     // Worklist of all of the nodes that need to be simplified.
     //
@@ -144,6 +156,7 @@ namespace {
 
     bool CombineToPreIndexedLoadStore(SDNode *N);
     bool CombineToPostIndexedLoadStore(SDNode *N);
+    bool SliceUpLoad(SDNode *N);
 
     void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
     SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
@@ -283,11 +296,11 @@ namespace {
 
     /// isAlias - Return true if there is any possibility that the two addresses
     /// overlap.
-    bool isAlias(SDValue Ptr1, int64_t Size1,
+    bool isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1,
                  const Value *SrcValue1, int SrcValueOffset1,
                  unsigned SrcValueAlign1,
                  const MDNode *TBAAInfo1,
-                 SDValue Ptr2, int64_t Size2,
+                 SDValue Ptr2, int64_t Size2, bool IsVolatile2,
                  const Value *SrcValue2, int SrcValueOffset2,
                  unsigned SrcValueAlign2,
                  const MDNode *TBAAInfo2) const;
@@ -299,7 +312,7 @@ namespace {
     /// FindAliasInfo - Extracts the relevant alias information from the memory
     /// node.  Returns true if the operand was a load.
     bool FindAliasInfo(SDNode *N,
-                       SDValue &Ptr, int64_t &Size,
+                       SDValue &Ptr, int64_t &Size, bool &IsVolatile,
                        const Value *&SrcValue, int &SrcValueOffset,
                        unsigned &SrcValueAlignment,
                        const MDNode *&TBAAInfo) const;
@@ -315,8 +328,15 @@ namespace {
 
   public:
     DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
-      : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
-        OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {}
+        : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
+          OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
+      AttributeSet FnAttrs =
+          DAG.getMachineFunction().getFunction()->getAttributes();
+      ForCodeSize =
+          FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                               Attribute::OptimizeForSize) ||
+          FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+    }
 
     /// Run - runs the dag combiner on all nodes in the work list
     void Run(CombineLevel AtLevel);
@@ -329,7 +349,8 @@ namespace {
       assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
       if (LHSTy.isVector())
         return LHSTy;
-      return LegalTypes ? TLI.getScalarShiftAmountTy(LHSTy) : TLI.getPointerTy();
+      return LegalTypes ? TLI.getScalarShiftAmountTy(LHSTy)
+                        : TLI.getPointerTy();
     }
 
     /// isTypeLegal - This method returns true if we are running before type
@@ -744,9 +765,7 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
     Replace = true;
     return DAG.getExtLoad(ExtType, dl, PVT,
                           LD->getChain(), LD->getBasePtr(),
-                          LD->getPointerInfo(),
-                          MemVT, LD->isVolatile(),
-                          LD->isNonTemporal(), LD->getAlignment());
+                          MemVT, LD->getMemOperand());
   }
 
   unsigned Opc = Op.getOpcode();
@@ -967,9 +986,7 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
       : LD->getExtensionType();
     SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT,
                                    LD->getChain(), LD->getBasePtr(),
-                                   LD->getPointerInfo(),
-                                   MemVT, LD->isVolatile(),
-                                   LD->isNonTemporal(), LD->getAlignment());
+                                   MemVT, LD->getMemOperand());
     SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, VT, NewLD);
 
     DEBUG(dbgs() << "\nPromoting ";
@@ -1017,7 +1034,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
   // try and combine it.
   while (!WorkListContents.empty()) {
     SDNode *N;
-    // The WorkListOrder holds the SDNodes in order, but it may contain duplicates.
+    // The WorkListOrder holds the SDNodes in order, but it may contain
+    // duplicates.
     // In order to avoid a linear scan, we use a set (O(log N)) to hold what the
     // worklist *should* contain, and check the node we want to visit is should
     // actually be visited.
@@ -1617,19 +1635,8 @@ static SDValue tryFoldToZero(SDLoc DL, const TargetLowering &TLI, EVT VT,
                              bool LegalOperations, bool LegalTypes) {
   if (!VT.isVector())
     return DAG.getConstant(0, VT);
-  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
-    // Produce a vector of zeros.
-    EVT ElemTy = VT.getVectorElementType();
-    if (LegalTypes && TLI.getTypeAction(*DAG.getContext(), ElemTy) ==
-                      TargetLowering::TypePromoteInteger)
-      ElemTy = TLI.getTypeToTransformTo(*DAG.getContext(), ElemTy);
-    assert((!LegalTypes || TLI.isTypeLegal(ElemTy)) &&
-           "Type for zero vector elements is not legal");
-    SDValue El = DAG.getConstant(0, ElemTy);
-    std::vector<SDValue> Ops(VT.getVectorNumElements(), El);
-    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT,
-      &Ops[0], Ops.size());
-  }
+  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+    return DAG.getConstant(0, VT);
   return SDValue();
 }
 
@@ -1771,8 +1778,8 @@ SDValue DAGCombiner::visitSUBE(SDNode *N) {
   return SDValue();
 }
 
-/// isConstantSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
-/// all the same constant or undefined.
+/// isConstantSplatVector - Returns true if N is a BUILD_VECTOR node whose
+/// elements are all the same constant or undefined.
 static bool isConstantSplatVector(SDNode *N, APInt& SplatValue) {
   BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(N);
   if (!C)
@@ -1808,9 +1815,11 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
     N1IsConst = isConstantSplatVector(N1.getNode(), ConstValue1);
   } else {
     N0IsConst = dyn_cast<ConstantSDNode>(N0) != 0;
-    ConstValue0 = N0IsConst? (dyn_cast<ConstantSDNode>(N0))->getAPIntValue() : APInt();
+    ConstValue0 = N0IsConst ? (dyn_cast<ConstantSDNode>(N0))->getAPIntValue()
+                            : APInt();
     N1IsConst = dyn_cast<ConstantSDNode>(N1) != 0;
-    ConstValue1 = N1IsConst? (dyn_cast<ConstantSDNode>(N1))->getAPIntValue() : APInt();
+    ConstValue1 = N1IsConst ? (dyn_cast<ConstantSDNode>(N1))->getAPIntValue()
+                            : APInt();
   }
 
   // fold (mul c1, c2) -> c1*c2
@@ -1823,20 +1832,24 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   // fold (mul x, 0) -> 0
   if (N1IsConst && ConstValue1 == 0)
     return N1;
+  // We require a splat of the entire scalar bit width for non-contiguous
+  // bit patterns.
+  bool IsFullSplat =
+    ConstValue1.getBitWidth() == VT.getScalarType().getSizeInBits();
   // fold (mul x, 1) -> x
-  if (N1IsConst && ConstValue1 == 1)
+  if (N1IsConst && ConstValue1 == 1 && IsFullSplat)
     return N0;
   // fold (mul x, -1) -> 0-x
   if (N1IsConst && ConstValue1.isAllOnesValue())
     return DAG.getNode(ISD::SUB, SDLoc(N), VT,
                        DAG.getConstant(0, VT), N0);
   // fold (mul x, (1 << c)) -> x << c
-  if (N1IsConst && ConstValue1.isPowerOf2())
+  if (N1IsConst && ConstValue1.isPowerOf2() && IsFullSplat)
     return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
                        DAG.getConstant(ConstValue1.logBase2(),
                                        getShiftAmountTy(N0.getValueType())));
   // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
-  if (N1IsConst && (-ConstValue1).isPowerOf2()) {
+  if (N1IsConst && (-ConstValue1).isPowerOf2() && IsFullSplat) {
     unsigned Log2Val = (-ConstValue1).logBase2();
     // FIXME: If the input is something that is easily negated (e.g. a
     // single-use add), we should put the negate there.
@@ -2675,6 +2688,19 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1);
       }
     }
+    // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2)
+    if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) &&
+        Op0 == Op1 && LL.getValueType().isInteger() &&
+      Op0 == ISD::SETNE && ((cast<ConstantSDNode>(LR)->isNullValue() &&
+                                 cast<ConstantSDNode>(RR)->isAllOnesValue()) ||
+                                (cast<ConstantSDNode>(LR)->isAllOnesValue() &&
+                                 cast<ConstantSDNode>(RR)->isNullValue()))) {
+      SDValue ADDNode = DAG.getNode(ISD::ADD, SDLoc(N0), LL.getValueType(),
+                                    LL, DAG.getConstant(1, LL.getValueType()));
+      AddToWorkList(ADDNode.getNode());
+      return DAG.getSetCC(SDLoc(N), VT, ADDNode,
+                          DAG.getConstant(2, LL.getValueType()), ISD::SETUGE);
+    }
     // canonicalize equivalent to ll == rl
     if (LL == RR && LR == RL) {
       Op1 = ISD::getSetCCSwappedOperands(Op1);
@@ -2718,9 +2744,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
          TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
                                        LN0->getChain(), LN0->getBasePtr(),
-                                       LN0->getPointerInfo(), MemVT,
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       MemVT, LN0->getMemOperand());
       AddToWorkList(N);
       CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -2739,11 +2763,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         ((!LegalOperations && !LN0->isVolatile()) ||
          TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
-                                       LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       MemVT,
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getChain(), LN0->getBasePtr(),
+                                       MemVT, LN0->getMemOperand());
       AddToWorkList(N);
       CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -2773,10 +2794,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
 
           SDValue NewLoad =
             DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
-                           LN0->getChain(), LN0->getBasePtr(),
-                           LN0->getPointerInfo(),
-                           ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
-                           LN0->getAlignment());
+                           LN0->getChain(), LN0->getBasePtr(), ExtVT,
+                           LN0->getMemOperand());
           AddToWorkList(N);
           CombineTo(LN0, NewLoad, NewLoad.getValue(1));
           return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -2812,7 +2831,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
                            LN0->getChain(), NewPtr,
                            LN0->getPointerInfo(),
                            ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
-                           Alignment);
+                           Alignment, LN0->getTBAAInfo());
           AddToWorkList(N);
           CombineTo(LN0, Load, Load.getValue(1));
           return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -2848,6 +2867,14 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     }
   }
 
+  // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
+  if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
+    SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
+                                       N0.getOperand(1), false);
+    if (BSwap.getNode())
+      return BSwap;
+  }
+
   return SDValue();
 }
 
@@ -2932,13 +2959,23 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
   if (N00 != N10)
     return SDValue();
 
-  // Make sure everything beyond the low halfword is zero since the SRL 16
-  // will clear the top bits.
+  // Make sure everything beyond the low halfword gets set to zero since the SRL
+  // 16 will clear the top bits.
   unsigned OpSizeInBits = VT.getSizeInBits();
-  if (DemandHighBits && OpSizeInBits > 16 &&
-      (!LookPassAnd0 || !LookPassAnd1) &&
-      !DAG.MaskedValueIsZero(N10, APInt::getHighBitsSet(OpSizeInBits, 16)))
-    return SDValue();
+  if (DemandHighBits && OpSizeInBits > 16) {
+    // If the left-shift isn't masked out then the only way this is a bswap is
+    // if all bits beyond the low 8 are 0. In that case the entire pattern
+    // reduces to a left shift anyway: leave it for other parts of the combiner.
+    if (!LookPassAnd0)
+      return SDValue();
+
+    // However, if the right shift isn't masked out then it might be because
+    // it's not needed. See if we can spot that too.
+    if (!LookPassAnd1 &&
+        !DAG.MaskedValueIsZero(
+            N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16)))
+      return SDValue();
+  }
 
   SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
   if (OpSizeInBits > 16)
@@ -3078,7 +3115,7 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
   SDValue BSwap = DAG.getNode(ISD::BSWAP, SDLoc(N), VT,
                               SDValue(Parts[0],0));
 
-  // Result of the bswap should be rotated by 16. If it's not legal, than
+  // Result of the bswap should be rotated by 16. If it's not legal, then
   // do  (x << 16) | (x >> 16).
   SDValue ShAmt = DAG.getConstant(16, getShiftAmountTy(VT));
   if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
@@ -3343,29 +3380,9 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) {
   if (LHSMask.getNode() || RHSMask.getNode())
     return 0;
 
-  // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotl x, y)
-  // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotr x, (sub 32, y))
-  if (RHSShiftAmt.getOpcode() == ISD::SUB &&
-      LHSShiftAmt == RHSShiftAmt.getOperand(1)) {
-    if (ConstantSDNode *SUBC =
-          dyn_cast<ConstantSDNode>(RHSShiftAmt.getOperand(0))) {
-      if (SUBC->getAPIntValue() == OpSizeInBits)
-        return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
-                           HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
-    }
-  }
-
-  // fold (or (shl x, (sub 32, y)), (srl x, r)) -> (rotr x, y)
-  // fold (or (shl x, (sub 32, y)), (srl x, r)) -> (rotl x, (sub 32, y))
-  if (LHSShiftAmt.getOpcode() == ISD::SUB &&
-      RHSShiftAmt == LHSShiftAmt.getOperand(1))
-    if (ConstantSDNode *SUBC =
-          dyn_cast<ConstantSDNode>(LHSShiftAmt.getOperand(0)))
-      if (SUBC->getAPIntValue() == OpSizeInBits)
-        return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT, LHSShiftArg,
-                           HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
-
-  // Look for sign/zext/any-extended or truncate cases:
+  // If the shift amount is sign/zext/any-extended just peel it off.
+  SDValue LExtOp0 = LHSShiftAmt;
+  SDValue RExtOp0 = RHSShiftAmt;
   if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
        LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
        LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
@@ -3374,33 +3391,31 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) {
        RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
        RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
        RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
-    SDValue LExtOp0 = LHSShiftAmt.getOperand(0);
-    SDValue RExtOp0 = RHSShiftAmt.getOperand(0);
-    if (RExtOp0.getOpcode() == ISD::SUB &&
-        RExtOp0.getOperand(1) == LExtOp0) {
-      // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
-      //   (rotl x, y)
-      // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
-      //   (rotr x, (sub 32, y))
-      if (ConstantSDNode *SUBC =
+    LExtOp0 = LHSShiftAmt.getOperand(0);
+    RExtOp0 = RHSShiftAmt.getOperand(0);
+  }
+
+  if (RExtOp0.getOpcode() == ISD::SUB && RExtOp0.getOperand(1) == LExtOp0) {
+    // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+    //   (rotl x, y)
+    // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+    //   (rotr x, (sub 32, y))
+    if (ConstantSDNode *SUBC =
             dyn_cast<ConstantSDNode>(RExtOp0.getOperand(0)))
-        if (SUBC->getAPIntValue() == OpSizeInBits)
-          return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
-                             LHSShiftArg,
-                             HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
-    } else if (LExtOp0.getOpcode() == ISD::SUB &&
-               RExtOp0 == LExtOp0.getOperand(1)) {
-      // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
-      //   (rotr x, y)
-      // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
-      //   (rotl x, (sub 32, y))
-      if (ConstantSDNode *SUBC =
+      if (SUBC->getAPIntValue() == OpSizeInBits)
+        return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
+                           HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
+  } else if (LExtOp0.getOpcode() == ISD::SUB &&
+             RExtOp0 == LExtOp0.getOperand(1)) {
+    // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+    //   (rotr x, y)
+    // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+    //   (rotl x, (sub 32, y))
+    if (ConstantSDNode *SUBC =
             dyn_cast<ConstantSDNode>(LExtOp0.getOperand(0)))
-        if (SUBC->getAPIntValue() == OpSizeInBits)
-          return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT,
-                             LHSShiftArg,
-                             HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
-    }
+      if (SUBC->getAPIntValue() == OpSizeInBits)
+        return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT, LHSShiftArg,
+                           HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
   }
 
   return 0;
@@ -3620,6 +3635,12 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
   // fold (shl c1, c2) -> c1<<c2
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SHL, VT, N0C, N1C);
@@ -3697,6 +3718,27 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     }
   }
 
+  // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
+  // Only fold this if the inner zext has no other uses to avoid increasing
+  // the total number of instructions.
+  if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
+      N0.getOperand(0).getOpcode() == ISD::SRL &&
+      isa<ConstantSDNode>(N0.getOperand(0)->getOperand(1))) {
+    uint64_t c1 =
+      cast<ConstantSDNode>(N0.getOperand(0)->getOperand(1))->getZExtValue();
+    if (c1 < VT.getSizeInBits()) {
+      uint64_t c2 = N1C->getZExtValue();
+      if (c1 == c2) {
+        SDValue NewOp0 = N0.getOperand(0);
+        EVT CountVT = NewOp0.getOperand(1).getValueType();
+        SDValue NewSHL = DAG.getNode(ISD::SHL, SDLoc(N), NewOp0.getValueType(),
+                                     NewOp0, DAG.getConstant(c2, CountVT));
+        AddToWorkList(NewSHL.getNode());
+        return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
+      }
+    }
+  }
+
   // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
   //                               (and (srl x, (sub c1, c2), MASK)
   // Only fold this if the inner shift has no other uses -- if it does, folding
@@ -3750,6 +3792,12 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
   // fold (sra c1, c2) -> (sra c1, c2)
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SRA, VT, N0C, N1C);
@@ -3895,6 +3943,12 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
   // fold (srl c1, c2) -> c1 >>u c2
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SRL, VT, N0C, N1C);
@@ -4217,6 +4271,23 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   return SDValue();
 }
 
+static
+std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT LoVT, HiVT;
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+  // Split the inputs.
+  SDValue Lo, Hi, LL, LH, RL, RH;
+  llvm::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+  llvm::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
+
+  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
+  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
+
+  return std::make_pair(Lo, Hi);
+}
+
 SDValue DAGCombiner::visitVSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4254,6 +4325,34 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
     }
   }
 
+  // If the VSELECT result requires splitting and the mask is provided by a
+  // SETCC, then split both nodes and its operands before legalization. This
+  // prevents the type legalizer from unrolling SETCC into scalar comparisons
+  // and enables future optimizations (e.g. min/max pattern matching on X86).
+  if (N0.getOpcode() == ISD::SETCC) {
+    EVT VT = N->getValueType(0);
+
+    // Check if any splitting is required.
+    if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+        TargetLowering::TypeSplitVector)
+      return SDValue();
+
+    SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH;
+    llvm::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG);
+    llvm::tie(LL, LH) = DAG.SplitVectorOperand(N, 1);
+    llvm::tie(RL, RH) = DAG.SplitVectorOperand(N, 2);
+
+    Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL);
+    Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH);
+
+    // Add the new VSELECT nodes to the work list in case they need to be split
+    // again.
+    AddToWorkList(Lo.getNode());
+    AddToWorkList(Hi.getNode());
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+  }
+
   return SDValue();
 }
 
@@ -4469,10 +4568,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       N0.getValueType(),
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getBasePtr(), N0.getValueType(),
+                                       LN0->getMemOperand());
       CombineTo(N, ExtLoad);
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                   N0.getValueType(), ExtLoad);
@@ -4493,10 +4590,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
         TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       MemVT,
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getBasePtr(), MemVT,
+                                       LN0->getMemOperand());
       CombineTo(N, ExtLoad);
       CombineTo(N0.getNode(),
                 DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
@@ -4524,11 +4619,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       if (DoXform) {
         SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN0), VT,
                                          LN0->getChain(), LN0->getBasePtr(),
-                                         LN0->getPointerInfo(),
                                          LN0->getMemoryVT(),
-                                         LN0->isVolatile(),
-                                         LN0->isNonTemporal(),
-                                         LN0->getAlignment());
+                                         LN0->getMemOperand());
         APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
         Mask = Mask.sext(VT.getSizeInBits());
         SDValue And = DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
@@ -4593,9 +4685,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
          TLI.isOperationLegal(ISD::SETCC, getSetCCResultType(VT)))) {
       return DAG.getSelect(SDLoc(N), VT,
                            DAG.getSetCC(SDLoc(N),
-                                        getSetCCResultType(VT),
-                                        N0.getOperand(0), N0.getOperand(1),
-                                        cast<CondCodeSDNode>(N0.getOperand(2))->get()),
+                           getSetCCResultType(VT),
+                           N0.getOperand(0), N0.getOperand(1),
+                           cast<CondCodeSDNode>(N0.getOperand(2))->get()),
                            NegOne, DAG.getConstant(0, VT));
     }
   }
@@ -4762,10 +4854,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       N0.getValueType(),
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getBasePtr(), N0.getValueType(),
+                                       LN0->getMemOperand());
       CombineTo(N, ExtLoad);
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                   N0.getValueType(), ExtLoad);
@@ -4795,11 +4885,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
       if (DoXform) {
         SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT,
                                          LN0->getChain(), LN0->getBasePtr(),
-                                         LN0->getPointerInfo(),
                                          LN0->getMemoryVT(),
-                                         LN0->isVolatile(),
-                                         LN0->isNonTemporal(),
-                                         LN0->getAlignment());
+                                         LN0->getMemOperand());
         APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
         Mask = Mask.zext(VT.getSizeInBits());
         SDValue And = DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
@@ -4826,10 +4913,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
         TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       MemVT,
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getBasePtr(), MemVT,
+                                       LN0->getMemOperand());
       CombineTo(N, ExtLoad);
       CombineTo(N0.getNode(),
                 DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(),
@@ -4992,10 +5077,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       N0.getValueType(),
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getBasePtr(), N0.getValueType(),
+                                       LN0->getMemOperand());
       CombineTo(N, ExtLoad);
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                   N0.getValueType(), ExtLoad);
@@ -5016,9 +5099,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
     EVT MemVT = LN0->getMemoryVT();
     SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(N),
                                      VT, LN0->getChain(), LN0->getBasePtr(),
-                                     LN0->getPointerInfo(), MemVT,
-                                     LN0->isVolatile(), LN0->isNonTemporal(),
-                                     LN0->getAlignment());
+                                     MemVT, LN0->getMemOperand());
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(),
               DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
@@ -5250,12 +5331,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
     Load =  DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
                         LN0->getPointerInfo().getWithOffset(PtrOff),
                         LN0->isVolatile(), LN0->isNonTemporal(),
-                        LN0->isInvariant(), NewAlign);
+                        LN0->isInvariant(), NewAlign, LN0->getTBAAInfo());
   else
     Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(),NewPtr,
                           LN0->getPointerInfo().getWithOffset(PtrOff),
                           ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
-                          NewAlign);
+                          NewAlign, LN0->getTBAAInfo());
 
   // Replace the old load's chain with the new load's chain.
   WorkListRemover DeadNodes(*this);
@@ -5353,10 +5434,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
-                                     LN0->getBasePtr(), LN0->getPointerInfo(),
-                                     EVT,
-                                     LN0->isVolatile(), LN0->isNonTemporal(),
-                                     LN0->getAlignment());
+                                     LN0->getBasePtr(), EVT,
+                                     LN0->getMemOperand());
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
     AddToWorkList(ExtLoad.getNode());
@@ -5371,10 +5450,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
-                                     LN0->getBasePtr(), LN0->getPointerInfo(),
-                                     EVT,
-                                     LN0->isVolatile(), LN0->isNonTemporal(),
-                                     LN0->getAlignment());
+                                     LN0->getBasePtr(), EVT,
+                                     LN0->getMemOperand());
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
     return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -5657,7 +5734,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
       // Do not change the width of a volatile load.
       !cast<LoadSDNode>(N0)->isVolatile() &&
-      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) {
+      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
+      TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     unsigned Align = TLI.getDataLayout()->
       getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
@@ -5667,7 +5745,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
       SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(),
                                  LN0->getBasePtr(), LN0->getPointerInfo(),
                                  LN0->isVolatile(), LN0->isNonTemporal(),
-                                 LN0->isInvariant(), OrigAlign);
+                                 LN0->isInvariant(), OrigAlign,
+                                 LN0->getTBAAInfo());
       AddToWorkList(N);
       CombineTo(N0.getNode(),
                 DAG.getNode(ISD::BITCAST, SDLoc(N0),
@@ -6652,16 +6731,14 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
   }
 
   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
-  if (ISD::isNON_EXTLoad(N0.getNode()) && N0.hasOneUse() &&
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
        TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
-                                     LN0->getBasePtr(), LN0->getPointerInfo(),
-                                     N0.getValueType(),
-                                     LN0->isVolatile(), LN0->isNonTemporal(),
-                                     LN0->getAlignment());
+                                     LN0->getBasePtr(), N0.getValueType(),
+                                     LN0->getMemOperand());
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(),
               DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
@@ -7451,13 +7528,16 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
                               LD->getValueType(0),
                               Chain, Ptr, LD->getPointerInfo(),
                               LD->getMemoryVT(),
-                              LD->isVolatile(), LD->isNonTemporal(), Align);
+                              LD->isVolatile(), LD->isNonTemporal(), Align,
+                              LD->getTBAAInfo());
         return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true);
       }
     }
   }
 
-  if (CombinerAA) {
+  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA :
+    TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+  if (UseAA) {
     // Walk up chain skipping non-aliasing memory nodes.
     SDValue BetterChain = FindBetterChain(N, Chain);
 
@@ -7468,17 +7548,12 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
       // Replace the chain to void dependency.
       if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
         ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
-                               BetterChain, Ptr, LD->getPointerInfo(),
-                               LD->isVolatile(), LD->isNonTemporal(),
-                               LD->isInvariant(), LD->getAlignment());
+                               BetterChain, Ptr, LD->getMemOperand());
       } else {
         ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
                                   LD->getValueType(0),
-                                  BetterChain, Ptr, LD->getPointerInfo(),
-                                  LD->getMemoryVT(),
-                                  LD->isVolatile(),
-                                  LD->isNonTemporal(),
-                                  LD->getAlignment());
+                                  BetterChain, Ptr, LD->getMemoryVT(),
+                                  LD->getMemOperand());
       }
 
       // Create token factor to keep old chain connected.
@@ -7498,9 +7573,562 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
   if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
     return SDValue(N, 0);
 
+  // Try to slice up N to more direct loads if the slices are mapped to
+  // different register banks or pairing can take place.
+  if (SliceUpLoad(N))
+    return SDValue(N, 0);
+
   return SDValue();
 }
 
+namespace {
+/// \brief Helper structure used to slice a load in smaller loads.
+/// Basically a slice is obtained from the following sequence:
+/// Origin = load Ty1, Base
+/// Shift = srl Ty1 Origin, CstTy Amount
+/// Inst = trunc Shift to Ty2
+///
+/// Then, it will be rewriten into:
+/// Slice = load SliceTy, Base + SliceOffset
+/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
+///
+/// SliceTy is deduced from the number of bits that are actually used to
+/// build Inst.
+struct LoadedSlice {
+  /// \brief Helper structure used to compute the cost of a slice.
+  struct Cost {
+    /// Are we optimizing for code size.
+    bool ForCodeSize;
+    /// Various cost.
+    unsigned Loads;
+    unsigned Truncates;
+    unsigned CrossRegisterBanksCopies;
+    unsigned ZExts;
+    unsigned Shift;
+
+    Cost(bool ForCodeSize = false)
+        : ForCodeSize(ForCodeSize), Loads(0), Truncates(0),
+          CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {}
+
+    /// \brief Get the cost of one isolated slice.
+    Cost(const LoadedSlice &LS, bool ForCodeSize = false)
+        : ForCodeSize(ForCodeSize), Loads(1), Truncates(0),
+          CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {
+      EVT TruncType = LS.Inst->getValueType(0);
+      EVT LoadedType = LS.getLoadedType();
+      if (TruncType != LoadedType &&
+          !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
+        ZExts = 1;
+    }
+
+    /// \brief Account for slicing gain in the current cost.
+    /// Slicing provide a few gains like removing a shift or a
+    /// truncate. This method allows to grow the cost of the original
+    /// load with the gain from this slice.
+    void addSliceGain(const LoadedSlice &LS) {
+      // Each slice saves a truncate.
+      const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
+      if (!TLI.isTruncateFree(LS.Inst->getValueType(0),
+                              LS.Inst->getOperand(0).getValueType()))
+        ++Truncates;
+      // If there is a shift amount, this slice gets rid of it.
+      if (LS.Shift)
+        ++Shift;
+      // If this slice can merge a cross register bank copy, account for it.
+      if (LS.canMergeExpensiveCrossRegisterBankCopy())
+        ++CrossRegisterBanksCopies;
+    }
+
+    Cost &operator+=(const Cost &RHS) {
+      Loads += RHS.Loads;
+      Truncates += RHS.Truncates;
+      CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
+      ZExts += RHS.ZExts;
+      Shift += RHS.Shift;
+      return *this;
+    }
+
+    bool operator==(const Cost &RHS) const {
+      return Loads == RHS.Loads && Truncates == RHS.Truncates &&
+             CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
+             ZExts == RHS.ZExts && Shift == RHS.Shift;
+    }
+
+    bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
+
+    bool operator<(const Cost &RHS) const {
+      // Assume cross register banks copies are as expensive as loads.
+      // FIXME: Do we want some more target hooks?
+      unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
+      unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
+      // Unless we are optimizing for code size, consider the
+      // expensive operation first.
+      if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
+        return ExpensiveOpsLHS < ExpensiveOpsRHS;
+      return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
+             (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
+    }
+
+    bool operator>(const Cost &RHS) const { return RHS < *this; }
+
+    bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
+
+    bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
+  };
+  // The last instruction that represent the slice. This should be a
+  // truncate instruction.
+  SDNode *Inst;
+  // The original load instruction.
+  LoadSDNode *Origin;
+  // The right shift amount in bits from the original load.
+  unsigned Shift;
+  // The DAG from which Origin came from.
+  // This is used to get some contextual information about legal types, etc.
+  SelectionDAG *DAG;
+
+  LoadedSlice(SDNode *Inst = NULL, LoadSDNode *Origin = NULL,
+              unsigned Shift = 0, SelectionDAG *DAG = NULL)
+      : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
+
+  LoadedSlice(const LoadedSlice &LS)
+      : Inst(LS.Inst), Origin(LS.Origin), Shift(LS.Shift), DAG(LS.DAG) {}
+
+  /// \brief Get the bits used in a chunk of bits \p BitWidth large.
+  /// \return Result is \p BitWidth and has used bits set to 1 and
+  ///         not used bits set to 0.
+  APInt getUsedBits() const {
+    // Reproduce the trunc(lshr) sequence:
+    // - Start from the truncated value.
+    // - Zero extend to the desired bit width.
+    // - Shift left.
+    assert(Origin && "No original load to compare against.");
+    unsigned BitWidth = Origin->getValueSizeInBits(0);
+    assert(Inst && "This slice is not bound to an instruction");
+    assert(Inst->getValueSizeInBits(0) <= BitWidth &&
+           "Extracted slice is bigger than the whole type!");
+    APInt UsedBits(Inst->getValueSizeInBits(0), 0);
+    UsedBits.setAllBits();
+    UsedBits = UsedBits.zext(BitWidth);
+    UsedBits <<= Shift;
+    return UsedBits;
+  }
+
+  /// \brief Get the size of the slice to be loaded in bytes.
+  unsigned getLoadedSize() const {
+    unsigned SliceSize = getUsedBits().countPopulation();
+    assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
+    return SliceSize / 8;
+  }
+
+  /// \brief Get the type that will be loaded for this slice.
+  /// Note: This may not be the final type for the slice.
+  EVT getLoadedType() const {
+    assert(DAG && "Missing context");
+    LLVMContext &Ctxt = *DAG->getContext();
+    return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
+  }
+
+  /// \brief Get the alignment of the load used for this slice.
+  unsigned getAlignment() const {
+    unsigned Alignment = Origin->getAlignment();
+    unsigned Offset = getOffsetFromBase();
+    if (Offset != 0)
+      Alignment = MinAlign(Alignment, Alignment + Offset);
+    return Alignment;
+  }
+
+  /// \brief Check if this slice can be rewritten with legal operations.
+  bool isLegal() const {
+    // An invalid slice is not legal.
+    if (!Origin || !Inst || !DAG)
+      return false;
+
+    // Offsets are for indexed load only, we do not handle that.
+    if (Origin->getOffset().getOpcode() != ISD::UNDEF)
+      return false;
+
+    const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+
+    // Check that the type is legal.
+    EVT SliceType = getLoadedType();
+    if (!TLI.isTypeLegal(SliceType))
+      return false;
+
+    // Check that the load is legal for this type.
+    if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
+      return false;
+
+    // Check that the offset can be computed.
+    // 1. Check its type.
+    EVT PtrType = Origin->getBasePtr().getValueType();
+    if (PtrType == MVT::Untyped || PtrType.isExtended())
+      return false;
+
+    // 2. Check that it fits in the immediate.
+    if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
+      return false;
+
+    // 3. Check that the computation is legal.
+    if (!TLI.isOperationLegal(ISD::ADD, PtrType))
+      return false;
+
+    // Check that the zext is legal if it needs one.
+    EVT TruncateType = Inst->getValueType(0);
+    if (TruncateType != SliceType &&
+        !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
+      return false;
+
+    return true;
+  }
+
+  /// \brief Get the offset in bytes of this slice in the original chunk of
+  /// bits.
+  /// \pre DAG != NULL.
+  uint64_t getOffsetFromBase() const {
+    assert(DAG && "Missing context.");
+    bool IsBigEndian =
+        DAG->getTargetLoweringInfo().getDataLayout()->isBigEndian();
+    assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
+    uint64_t Offset = Shift / 8;
+    unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
+    assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
+           "The size of the original loaded type is not a multiple of a"
+           " byte.");
+    // If Offset is bigger than TySizeInBytes, it means we are loading all
+    // zeros. This should have been optimized before in the process.
+    assert(TySizeInBytes > Offset &&
+           "Invalid shift amount for given loaded size");
+    if (IsBigEndian)
+      Offset = TySizeInBytes - Offset - getLoadedSize();
+    return Offset;
+  }
+
+  /// \brief Generate the sequence of instructions to load the slice
+  /// represented by this object and redirect the uses of this slice to
+  /// this new sequence of instructions.
+  /// \pre this->Inst && this->Origin are valid Instructions and this
+  /// object passed the legal check: LoadedSlice::isLegal returned true.
+  /// \return The last instruction of the sequence used to load the slice.
+  SDValue loadSlice() const {
+    assert(Inst && Origin && "Unable to replace a non-existing slice.");
+    const SDValue &OldBaseAddr = Origin->getBasePtr();
+    SDValue BaseAddr = OldBaseAddr;
+    // Get the offset in that chunk of bytes w.r.t. the endianess.
+    int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
+    assert(Offset >= 0 && "Offset too big to fit in int64_t!");
+    if (Offset) {
+      // BaseAddr = BaseAddr + Offset.
+      EVT ArithType = BaseAddr.getValueType();
+      BaseAddr = DAG->getNode(ISD::ADD, SDLoc(Origin), ArithType, BaseAddr,
+                              DAG->getConstant(Offset, ArithType));
+    }
+
+    // Create the type of the loaded slice according to its size.
+    EVT SliceType = getLoadedType();
+
+    // Create the load for the slice.
+    SDValue LastInst = DAG->getLoad(
+        SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
+        Origin->getPointerInfo().getWithOffset(Offset), Origin->isVolatile(),
+        Origin->isNonTemporal(), Origin->isInvariant(), getAlignment());
+    // If the final type is not the same as the loaded type, this means that
+    // we have to pad with zero. Create a zero extend for that.
+    EVT FinalType = Inst->getValueType(0);
+    if (SliceType != FinalType)
+      LastInst =
+          DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
+    return LastInst;
+  }
+
+  /// \brief Check if this slice can be merged with an expensive cross register
+  /// bank copy. E.g.,
+  /// i = load i32
+  /// f = bitcast i32 i to float
+  bool canMergeExpensiveCrossRegisterBankCopy() const {
+    if (!Inst || !Inst->hasOneUse())
+      return false;
+    SDNode *Use = *Inst->use_begin();
+    if (Use->getOpcode() != ISD::BITCAST)
+      return false;
+    assert(DAG && "Missing context");
+    const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+    EVT ResVT = Use->getValueType(0);
+    const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
+    const TargetRegisterClass *ArgRC =
+        TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
+    if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
+      return false;
+
+    // At this point, we know that we perform a cross-register-bank copy.
+    // Check if it is expensive.
+    const TargetRegisterInfo *TRI = TLI.getTargetMachine().getRegisterInfo();
+    // Assume bitcasts are cheap, unless both register classes do not
+    // explicitly share a common sub class.
+    if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
+      return false;
+
+    // Check if it will be merged with the load.
+    // 1. Check the alignment constraint.
+    unsigned RequiredAlignment = TLI.getDataLayout()->getABITypeAlignment(
+        ResVT.getTypeForEVT(*DAG->getContext()));
+
+    if (RequiredAlignment > getAlignment())
+      return false;
+
+    // 2. Check that the load is a legal operation for that type.
+    if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
+      return false;
+
+    // 3. Check that we do not have a zext in the way.
+    if (Inst->getValueType(0) != getLoadedType())
+      return false;
+
+    return true;
+  }
+};
+}
+
+/// \brief Sorts LoadedSlice according to their offset.
+struct LoadedSliceSorter {
+  bool operator()(const LoadedSlice &LHS, const LoadedSlice &RHS) {
+    assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
+    return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
+  }
+};
+
+/// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
+/// \p UsedBits looks like 0..0 1..1 0..0.
+static bool areUsedBitsDense(const APInt &UsedBits) {
+  // If all the bits are one, this is dense!
+  if (UsedBits.isAllOnesValue())
+    return true;
+
+  // Get rid of the unused bits on the right.
+  APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
+  // Get rid of the unused bits on the left.
+  if (NarrowedUsedBits.countLeadingZeros())
+    NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
+  // Check that the chunk of bits is completely used.
+  return NarrowedUsedBits.isAllOnesValue();
+}
+
+/// \brief Check whether or not \p First and \p Second are next to each other
+/// in memory. This means that there is no hole between the bits loaded
+/// by \p First and the bits loaded by \p Second.
+static bool areSlicesNextToEachOther(const LoadedSlice &First,
+                                     const LoadedSlice &Second) {
+  assert(First.Origin == Second.Origin && First.Origin &&
+         "Unable to match different memory origins.");
+  APInt UsedBits = First.getUsedBits();
+  assert((UsedBits & Second.getUsedBits()) == 0 &&
+         "Slices are not supposed to overlap.");
+  UsedBits |= Second.getUsedBits();
+  return areUsedBitsDense(UsedBits);
+}
+
+/// \brief Adjust the \p GlobalLSCost according to the target
+/// paring capabilities and the layout of the slices.
+/// \pre \p GlobalLSCost should account for at least as many loads as
+/// there is in the slices in \p LoadedSlices.
+static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
+                                 LoadedSlice::Cost &GlobalLSCost) {
+  unsigned NumberOfSlices = LoadedSlices.size();
+  // If there is less than 2 elements, no pairing is possible.
+  if (NumberOfSlices < 2)
+    return;
+
+  // Sort the slices so that elements that are likely to be next to each
+  // other in memory are next to each other in the list.
+  std::sort(LoadedSlices.begin(), LoadedSlices.end(), LoadedSliceSorter());
+  const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
+  // First (resp. Second) is the first (resp. Second) potentially candidate
+  // to be placed in a paired load.
+  const LoadedSlice *First = NULL;
+  const LoadedSlice *Second = NULL;
+  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
+                // Set the beginning of the pair.
+                                                           First = Second) {
+
+    Second = &LoadedSlices[CurrSlice];
+
+    // If First is NULL, it means we start a new pair.
+    // Get to the next slice.
+    if (!First)
+      continue;
+
+    EVT LoadedType = First->getLoadedType();
+
+    // If the types of the slices are different, we cannot pair them.
+    if (LoadedType != Second->getLoadedType())
+      continue;
+
+    // Check if the target supplies paired loads for this type.
+    unsigned RequiredAlignment = 0;
+    if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
+      // move to the next pair, this type is hopeless.
+      Second = NULL;
+      continue;
+    }
+    // Check if we meet the alignment requirement.
+    if (RequiredAlignment > First->getAlignment())
+      continue;
+
+    // Check that both loads are next to each other in memory.
+    if (!areSlicesNextToEachOther(*First, *Second))
+      continue;
+
+    assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
+    --GlobalLSCost.Loads;
+    // Move to the next pair.
+    Second = NULL;
+  }
+}
+
+/// \brief Check the profitability of all involved LoadedSlice.
+/// Currently, it is considered profitable if there is exactly two
+/// involved slices (1) which are (2) next to each other in memory, and
+/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
+///
+/// Note: The order of the elements in \p LoadedSlices may be modified, but not
+/// the elements themselves.
+///
+/// FIXME: When the cost model will be mature enough, we can relax
+/// constraints (1) and (2).
+static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
+                                const APInt &UsedBits, bool ForCodeSize) {
+  unsigned NumberOfSlices = LoadedSlices.size();
+  if (StressLoadSlicing)
+    return NumberOfSlices > 1;
+
+  // Check (1).
+  if (NumberOfSlices != 2)
+    return false;
+
+  // Check (2).
+  if (!areUsedBitsDense(UsedBits))
+    return false;
+
+  // Check (3).
+  LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
+  // The original code has one big load.
+  OrigCost.Loads = 1;
+  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
+    const LoadedSlice &LS = LoadedSlices[CurrSlice];
+    // Accumulate the cost of all the slices.
+    LoadedSlice::Cost SliceCost(LS, ForCodeSize);
+    GlobalSlicingCost += SliceCost;
+
+    // Account as cost in the original configuration the gain obtained
+    // with the current slices.
+    OrigCost.addSliceGain(LS);
+  }
+
+  // If the target supports paired load, adjust the cost accordingly.
+  adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
+  return OrigCost > GlobalSlicingCost;
+}
+
+/// \brief If the given load, \p LI, is used only by trunc or trunc(lshr)
+/// operations, split it in the various pieces being extracted.
+///
+/// This sort of thing is introduced by SROA.
+/// This slicing takes care not to insert overlapping loads.
+/// \pre LI is a simple load (i.e., not an atomic or volatile load).
+bool DAGCombiner::SliceUpLoad(SDNode *N) {
+  if (Level < AfterLegalizeDAG)
+    return false;
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
+      !LD->getValueType(0).isInteger())
+    return false;
+
+  // Keep track of already used bits to detect overlapping values.
+  // In that case, we will just abort the transformation.
+  APInt UsedBits(LD->getValueSizeInBits(0), 0);
+
+  SmallVector<LoadedSlice, 4> LoadedSlices;
+
+  // Check if this load is used as several smaller chunks of bits.
+  // Basically, look for uses in trunc or trunc(lshr) and record a new chain
+  // of computation for each trunc.
+  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+       UI != UIEnd; ++UI) {
+    // Skip the uses of the chain.
+    if (UI.getUse().getResNo() != 0)
+      continue;
+
+    SDNode *User = *UI;
+    unsigned Shift = 0;
+
+    // Check if this is a trunc(lshr).
+    if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
+        isa<ConstantSDNode>(User->getOperand(1))) {
+      Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue();
+      User = *User->use_begin();
+    }
+
+    // At this point, User is a Truncate, iff we encountered, trunc or
+    // trunc(lshr).
+    if (User->getOpcode() != ISD::TRUNCATE)
+      return false;
+
+    // The width of the type must be a power of 2 and greater than 8-bits.
+    // Otherwise the load cannot be represented in LLVM IR.
+    // Moreover, if we shifted with a non 8-bits multiple, the slice
+    // will be accross several bytes. We do not support that.
+    unsigned Width = User->getValueSizeInBits(0);
+    if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
+      return 0;
+
+    // Build the slice for this chain of computations.
+    LoadedSlice LS(User, LD, Shift, &DAG);
+    APInt CurrentUsedBits = LS.getUsedBits();
+
+    // Check if this slice overlaps with another.
+    if ((CurrentUsedBits & UsedBits) != 0)
+      return false;
+    // Update the bits used globally.
+    UsedBits |= CurrentUsedBits;
+
+    // Check if the new slice would be legal.
+    if (!LS.isLegal())
+      return false;
+
+    // Record the slice.
+    LoadedSlices.push_back(LS);
+  }
+
+  // Abort slicing if it does not seem to be profitable.
+  if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
+    return false;
+
+  ++SlicedLoads;
+
+  // Rewrite each chain to use an independent load.
+  // By construction, each chain can be represented by a unique load.
+
+  // Prepare the argument for the new token factor for all the slices.
+  SmallVector<SDValue, 8> ArgChains;
+  for (SmallVectorImpl<LoadedSlice>::const_iterator
+           LSIt = LoadedSlices.begin(),
+           LSItEnd = LoadedSlices.end();
+       LSIt != LSItEnd; ++LSIt) {
+    SDValue SliceInst = LSIt->loadSlice();
+    CombineTo(LSIt->Inst, SliceInst, true);
+    if (SliceInst.getNode()->getOpcode() != ISD::LOAD)
+      SliceInst = SliceInst.getOperand(0);
+    assert(SliceInst->getOpcode() == ISD::LOAD &&
+           "It takes more than a zext to get to the loaded slice!!");
+    ArgChains.push_back(SliceInst.getValue(1));
+  }
+
+  SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
+                              &ArgChains[0], ArgChains.size());
+  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+  return true;
+}
+
 /// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the
 /// load is having specific bytes cleared out.  If so, return the byte size
 /// being masked out and the shift amount.
@@ -7735,7 +8363,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
                                   LD->getChain(), NewPtr,
                                   LD->getPointerInfo().getWithOffset(PtrOff),
                                   LD->isVolatile(), LD->isNonTemporal(),
-                                  LD->isInvariant(), NewAlign);
+                                  LD->isInvariant(), NewAlign,
+                                  LD->getTBAAInfo());
       SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
                                    DAG.getConstant(NewImm, NewVT));
       SDValue NewST = DAG.getStore(Chain, SDLoc(N),
@@ -7846,17 +8475,28 @@ struct BaseIndexOffset {
   static BaseIndexOffset match(SDValue Ptr) {
     bool IsIndexSignExt = false;
 
-    // Just Base or possibly anything else.
+    // We only can pattern match BASE + INDEX + OFFSET. If Ptr is not an ADD
+    // instruction, then it could be just the BASE or everything else we don't
+    // know how to handle. Just use Ptr as BASE and give up.
     if (Ptr->getOpcode() != ISD::ADD)
       return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
 
-    // Base + offset.
+    // We know that we have at least an ADD instruction. Try to pattern match
+    // the simple case of BASE + OFFSET.
     if (isa<ConstantSDNode>(Ptr->getOperand(1))) {
       int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
       return  BaseIndexOffset(Ptr->getOperand(0), SDValue(), Offset,
                               IsIndexSignExt);
     }
 
+    // Inside a loop the current BASE pointer is calculated using an ADD and a
+    // MUL instruction. In this case Ptr is the actual BASE pointer.
+    // (i64 add (i64 %array_ptr)
+    //          (i64 mul (i64 %induction_var)
+    //                   (i64 %element_size)))
+    if (Ptr->getOperand(1)->getOpcode() == ISD::MUL)
+      return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+
     // Look at Base + Index + Offset cases.
     SDValue Base = Ptr->getOperand(0);
     SDValue IndexOffset = Ptr->getOperand(1);
@@ -8007,6 +8647,11 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
         Index = STn;
         break;
       } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
+        if (Ldn->isVolatile()) {
+          Index = NULL;
+          break;
+        }
+
         // Save the load node for later. Continue the scan.
         AliasLoadNodes.push_back(Ldn);
         NextInChain = Ldn->getChain().getNode();
@@ -8384,7 +9029,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
          TLI.isOperationLegalOrCustom(ISD::STORE, SVT)))
       return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0),
                           Ptr, ST->getPointerInfo(), ST->isVolatile(),
-                          ST->isNonTemporal(), OrigAlign);
+                          ST->isNonTemporal(), OrigAlign,
+                          ST->getTBAAInfo());
   }
 
   // Turn 'store undef, Ptr' -> nothing.
@@ -8399,7 +9045,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     // transform should not be done in this case.
     if (Value.getOpcode() != ISD::TargetConstantFP) {
       SDValue Tmp;
-      switch (CFP->getValueType(0).getSimpleVT().SimpleTy) {
+      switch (CFP->getSimpleValueType(0).SimpleTy) {
       default: llvm_unreachable("Unknown FP type");
       case MVT::f16:    // We don't do this for these yet.
       case MVT::f80:
@@ -8412,8 +9058,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
           Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
                               bitcastToAPInt().getZExtValue(), MVT::i32);
           return DAG.getStore(Chain, SDLoc(N), Tmp,
-                              Ptr, ST->getPointerInfo(), ST->isVolatile(),
-                              ST->isNonTemporal(), ST->getAlignment());
+                              Ptr, ST->getMemOperand());
         }
         break;
       case MVT::f64:
@@ -8423,8 +9068,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
           Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
                                 getZExtValue(), MVT::i64);
           return DAG.getStore(Chain, SDLoc(N), Tmp,
-                              Ptr, ST->getPointerInfo(), ST->isVolatile(),
-                              ST->isNonTemporal(), ST->getAlignment());
+                              Ptr, ST->getMemOperand());
         }
 
         if (!ST->isVolatile() &&
@@ -8440,18 +9084,19 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
           unsigned Alignment = ST->getAlignment();
           bool isVolatile = ST->isVolatile();
           bool isNonTemporal = ST->isNonTemporal();
+          const MDNode *TBAAInfo = ST->getTBAAInfo();
 
           SDValue St0 = DAG.getStore(Chain, SDLoc(ST), Lo,
                                      Ptr, ST->getPointerInfo(),
                                      isVolatile, isNonTemporal,
-                                     ST->getAlignment());
+                                     ST->getAlignment(), TBAAInfo);
           Ptr = DAG.getNode(ISD::ADD, SDLoc(N), Ptr.getValueType(), Ptr,
                             DAG.getConstant(4, Ptr.getValueType()));
           Alignment = MinAlign(Alignment, 4U);
           SDValue St1 = DAG.getStore(Chain, SDLoc(ST), Hi,
                                      Ptr, ST->getPointerInfo().getWithOffset(4),
                                      isVolatile, isNonTemporal,
-                                     Alignment);
+                                     Alignment, TBAAInfo);
           return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other,
                              St0, St1);
         }
@@ -8467,7 +9112,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
       if (Align > ST->getAlignment())
         return DAG.getTruncStore(Chain, SDLoc(N), Value,
                                  Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
-                                 ST->isVolatile(), ST->isNonTemporal(), Align);
+                                 ST->isVolatile(), ST->isNonTemporal(), Align,
+                                 ST->getTBAAInfo());
     }
   }
 
@@ -8477,7 +9123,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   if (NewST.getNode())
     return NewST;
 
-  if (CombinerAA) {
+  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA :
+    TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+  if (UseAA) {
     // Walk up chain skipping non-aliasing memory nodes.
     SDValue BetterChain = FindBetterChain(N, Chain);
 
@@ -8488,14 +9136,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
       // Replace the chain to avoid dependency.
       if (ST->isTruncatingStore()) {
         ReplStore = DAG.getTruncStore(BetterChain, SDLoc(N), Value, Ptr,
-                                      ST->getPointerInfo(),
-                                      ST->getMemoryVT(), ST->isVolatile(),
-                                      ST->isNonTemporal(), ST->getAlignment());
+                                      ST->getMemoryVT(), ST->getMemOperand());
       } else {
         ReplStore = DAG.getStore(BetterChain, SDLoc(N), Value, Ptr,
-                                 ST->getPointerInfo(),
-                                 ST->isVolatile(), ST->isNonTemporal(),
-                                 ST->getAlignment());
+                                 ST->getMemOperand());
       }
 
       // Create token to keep both nodes around.
@@ -8528,9 +9172,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     AddToWorkList(Value.getNode());
     if (Shorter.getNode())
       return DAG.getTruncStore(Chain, SDLoc(N), Shorter,
-                               Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
-                               ST->isVolatile(), ST->isNonTemporal(),
-                               ST->getAlignment());
+                               Ptr, ST->getMemoryVT(), ST->getMemOperand());
 
     // Otherwise, see if we can simplify the operation with
     // SimplifyDemandedBits, which only works if the value has a single use.
@@ -8561,9 +9203,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
       TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
                             ST->getMemoryVT())) {
     return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
-                             Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
-                             ST->isVolatile(), ST->isNonTemporal(),
-                             ST->getAlignment());
+                             Ptr, ST->getMemoryVT(), ST->getMemOperand());
   }
 
   // Only perform this optimization before the types are legal, because we
@@ -8821,13 +9461,14 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
         ? ISD::ZEXTLOAD : ISD::EXTLOAD;
       Load = DAG.getExtLoad(ExtType, SDLoc(N), NVT, LN0->getChain(),
                             NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff),
-                            LVT, LN0->isVolatile(), LN0->isNonTemporal(),Align);
+                            LVT, LN0->isVolatile(), LN0->isNonTemporal(),
+                            Align, LN0->getTBAAInfo());
       Chain = Load.getValue(1);
     } else {
       Load = DAG.getLoad(LVT, SDLoc(N), LN0->getChain(), NewPtr,
                          LN0->getPointerInfo().getWithOffset(PtrOff),
                          LN0->isVolatile(), LN0->isNonTemporal(),
-                         LN0->isInvariant(), Align);
+                         LN0->isInvariant(), Align, LN0->getTBAAInfo());
       Chain = Load.getValue(1);
       if (NVT.bitsLT(LVT))
         Load = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, Load);
@@ -9165,8 +9806,35 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
     return N->getOperand(0);
 
   // Check if all of the operands are undefs.
+  EVT VT = N->getValueType(0);
   if (ISD::allOperandsUndef(N))
-    return DAG.getUNDEF(N->getValueType(0));
+    return DAG.getUNDEF(VT);
+
+  // Optimize concat_vectors where one of the vectors is undef.
+  if (N->getNumOperands() == 2 &&
+      N->getOperand(1)->getOpcode() == ISD::UNDEF) {
+    SDValue In = N->getOperand(0);
+    assert(In.getValueType().isVector() && "Must concat vectors");
+
+    // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr).
+    if (In->getOpcode() == ISD::BITCAST &&
+        !In->getOperand(0)->getValueType(0).isVector()) {
+      SDValue Scalar = In->getOperand(0);
+      EVT SclTy = Scalar->getValueType(0);
+
+      if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
+        return SDValue();
+
+      EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy,
+                                 VT.getSizeInBits() / SclTy.getSizeInBits());
+      if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
+        return SDValue();
+
+      SDLoc dl = SDLoc(N);
+      SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NVT, Scalar);
+      return DAG.getNode(ISD::BITCAST, dl, VT, Res);
+    }
+  }
 
   // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
   // nodes often generate nop CONCAT_VECTOR nodes.
@@ -9225,7 +9893,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
     //    (extract_subvec (concat V1, V2, ...), i)
     // Into:
     //    Vi if possible
-    // Only operand 0 is checked as 'concat' assumes all inputs of the same type.
+    // Only operand 0 is checked as 'concat' assumes all inputs of the same
+    // type.
     if (V->getOperand(0).getValueType() != NVT)
       return SDValue();
     unsigned Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
@@ -9358,10 +10027,10 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
     for (unsigned i = 0; i != NumElts; ++i) {
       int Idx = SVN->getMaskElt(i);
       if (Idx >= 0) {
-        if (Idx < (int)NumElts)
-          Idx += NumElts;
-        else
+        if (Idx >= (int)NumElts)
           Idx -= NumElts;
+        else
+          Idx = -1; // remove reference to lhs
       }
       NewMask.push_back(Idx);
     }
@@ -9738,7 +10407,7 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
     if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
       Load = DAG.getLoad(TheSelect->getValueType(0),
                          SDLoc(TheSelect),
-                         // FIXME: Discards pointer info.
+                         // FIXME: Discards pointer and TBAA info.
                          LLD->getChain(), Addr, MachinePointerInfo(),
                          LLD->isVolatile(), LLD->isNonTemporal(),
                          LLD->isInvariant(), LLD->getAlignment());
@@ -9747,7 +10416,7 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
                             RLD->getExtensionType() : LLD->getExtensionType(),
                             SDLoc(TheSelect),
                             TheSelect->getValueType(0),
-                            // FIXME: Discards pointer info.
+                            // FIXME: Discards pointer and TBAA info.
                             LLD->getChain(), Addr, MachinePointerInfo(),
                             LLD->getMemoryVT(), LLD->isVolatile(),
                             LLD->isNonTemporal(), LLD->getAlignment());
@@ -9852,7 +10521,7 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
         SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(),
                                           Cond, One, Zero);
         AddToWorkList(CstOffset.getNode());
-        CPIdx = DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(), CPIdx,
+        CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx,
                             CstOffset);
         AddToWorkList(CPIdx.getNode());
         return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
@@ -9974,9 +10643,10 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
         return Temp;
 
       // shl setcc result by log2 n2c
-      return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
-                         DAG.getConstant(N2C->getAPIntValue().logBase2(),
-                                         getShiftAmountTy(Temp.getValueType())));
+      return DAG.getNode(
+          ISD::SHL, DL, N2.getValueType(), Temp,
+          DAG.getConstant(N2C->getAPIntValue().logBase2(),
+                          getShiftAmountTy(Temp.getValueType())));
     }
   }
 
@@ -10132,17 +10802,20 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
 
 /// isAlias - Return true if there is any possibility that the two addresses
 /// overlap.
-bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
+bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1,
                           const Value *SrcValue1, int SrcValueOffset1,
                           unsigned SrcValueAlign1,
                           const MDNode *TBAAInfo1,
-                          SDValue Ptr2, int64_t Size2,
+                          SDValue Ptr2, int64_t Size2, bool IsVolatile2,
                           const Value *SrcValue2, int SrcValueOffset2,
                           unsigned SrcValueAlign2,
                           const MDNode *TBAAInfo2) const {
   // If they are the same then they must be aliases.
   if (Ptr1 == Ptr2) return true;
 
+  // If they are both volatile then they cannot be reordered.
+  if (IsVolatile1 && IsVolatile2) return true;
+
   // Gather base node and offset information.
   SDValue Base1, Base2;
   int64_t Offset1, Offset2;
@@ -10187,7 +10860,9 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
       return false;
   }
 
-  if (CombinerGlobalAA) {
+  bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 ? CombinerGlobalAA :
+    TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+  if (UseAA && SrcValue1 && SrcValue2) {
     // Use alias analysis information.
     int64_t MinOffset = std::min(SrcValueOffset1, SrcValueOffset2);
     int64_t Overlap1 = Size1 + SrcValueOffset1 - MinOffset;
@@ -10206,24 +10881,25 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
 bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) {
   SDValue Ptr0, Ptr1;
   int64_t Size0, Size1;
+  bool IsVolatile0, IsVolatile1;
   const Value *SrcValue0, *SrcValue1;
   int SrcValueOffset0, SrcValueOffset1;
   unsigned SrcValueAlign0, SrcValueAlign1;
   const MDNode *SrcTBAAInfo0, *SrcTBAAInfo1;
-  FindAliasInfo(Op0, Ptr0, Size0, SrcValue0, SrcValueOffset0,
+  FindAliasInfo(Op0, Ptr0, Size0, IsVolatile0, SrcValue0, SrcValueOffset0,
                 SrcValueAlign0, SrcTBAAInfo0);
-  FindAliasInfo(Op1, Ptr1, Size1, SrcValue1, SrcValueOffset1,
+  FindAliasInfo(Op1, Ptr1, Size1, IsVolatile1, SrcValue1, SrcValueOffset1,
                 SrcValueAlign1, SrcTBAAInfo1);
-  return isAlias(Ptr0, Size0, SrcValue0, SrcValueOffset0,
+  return isAlias(Ptr0, Size0, IsVolatile0, SrcValue0, SrcValueOffset0,
                  SrcValueAlign0, SrcTBAAInfo0,
-                 Ptr1, Size1, SrcValue1, SrcValueOffset1,
+                 Ptr1, Size1, IsVolatile1, SrcValue1, SrcValueOffset1,
                  SrcValueAlign1, SrcTBAAInfo1);
 }
 
 /// FindAliasInfo - Extracts the relevant alias information from the memory
-/// node.  Returns true if the operand was a load.
+/// node.  Returns true if the operand was a nonvolatile load.
 bool DAGCombiner::FindAliasInfo(SDNode *N,
-                                SDValue &Ptr, int64_t &Size,
+                                SDValue &Ptr, int64_t &Size, bool &IsVolatile,
                                 const Value *&SrcValue,
                                 int &SrcValueOffset,
                                 unsigned &SrcValueAlign,
@@ -10232,11 +10908,12 @@ bool DAGCombiner::FindAliasInfo(SDNode *N,
 
   Ptr = LS->getBasePtr();
   Size = LS->getMemoryVT().getSizeInBits() >> 3;
+  IsVolatile = LS->isVolatile();
   SrcValue = LS->getSrcValue();
   SrcValueOffset = LS->getSrcValueOffset();
   SrcValueAlign = LS->getOriginalAlignment();
   TBAAInfo = LS->getTBAAInfo();
-  return isa<LoadSDNode>(LS);
+  return isa<LoadSDNode>(LS) && !IsVolatile;
 }
 
 /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
@@ -10249,12 +10926,13 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
   // Get alias information for node.
   SDValue Ptr;
   int64_t Size;
+  bool IsVolatile;
   const Value *SrcValue;
   int SrcValueOffset;
   unsigned SrcValueAlign;
   const MDNode *SrcTBAAInfo;
-  bool IsLoad = FindAliasInfo(N, Ptr, Size, SrcValue, SrcValueOffset,
-                              SrcValueAlign, SrcTBAAInfo);
+  bool IsLoad = FindAliasInfo(N, Ptr, Size, IsVolatile, SrcValue,
+                              SrcValueOffset, SrcValueAlign, SrcTBAAInfo);
 
   // Starting off.
   Chains.push_back(OriginalChain);
@@ -10295,20 +10973,21 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
       // Get alias information for Chain.
       SDValue OpPtr;
       int64_t OpSize;
+      bool OpIsVolatile;
       const Value *OpSrcValue;
       int OpSrcValueOffset;
       unsigned OpSrcValueAlign;
       const MDNode *OpSrcTBAAInfo;
       bool IsOpLoad = FindAliasInfo(Chain.getNode(), OpPtr, OpSize,
-                                    OpSrcValue, OpSrcValueOffset,
+                                    OpIsVolatile, OpSrcValue, OpSrcValueOffset,
                                     OpSrcValueAlign,
                                     OpSrcTBAAInfo);
 
       // If chain is alias then stop here.
       if (!(IsLoad && IsOpLoad) &&
-          isAlias(Ptr, Size, SrcValue, SrcValueOffset, SrcValueAlign,
-                  SrcTBAAInfo,
-                  OpPtr, OpSize, OpSrcValue, OpSrcValueOffset,
+          isAlias(Ptr, Size, IsVolatile, SrcValue, SrcValueOffset,
+                  SrcValueAlign, SrcTBAAInfo,
+                  OpPtr, OpSize, OpIsVolatile, OpSrcValue, OpSrcValueOffset,
                   OpSrcValueAlign, OpSrcTBAAInfo)) {
         Aliases.push_back(Chain);
       } else {
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index b4ac948f..a6f7461 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -638,29 +638,25 @@ bool FastISel::SelectCall(const User *I) {
         (!isa<AllocaInst>(Address) ||
          !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(Address))))
       Op = MachineOperand::CreateReg(FuncInfo.InitializeRegForValue(Address),
-                                      false);
+                                     false);
 
-    if (Op)
+    if (Op) {
       if (Op->isReg()) {
-        // Set the indirect flag if the type and the DIVariable's
-        // indirect field are in disagreement: Indirectly-addressed
-        // variables that are nonpointer types should be marked as
-        // indirect, and VLAs should be marked as indirect eventhough
-        // they are a pointer type.
-        bool IsIndirect = DI->getAddress()->getType()->isPointerTy()
-          ^ DIVar.isIndirect();
         Op->setIsDebug(true);
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                TII.get(TargetOpcode::DBG_VALUE),
-                IsIndirect, Op->getReg(), Offset, DI->getVariable());
+                TII.get(TargetOpcode::DBG_VALUE), false, Op->getReg(), 0,
+                DI->getVariable());
       } else
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                TII.get(TargetOpcode::DBG_VALUE)).addOperand(*Op).addImm(0)
-          .addMetadata(DI->getVariable());
-    else
+                TII.get(TargetOpcode::DBG_VALUE))
+            .addOperand(*Op)
+            .addImm(0)
+            .addMetadata(DI->getVariable());
+    } else {
       // We can't yet handle anything else here because it would require
       // generating code, thus altering codegen because of debug info.
       DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+    }
     return true;
   }
   case Intrinsic::dbg_value: {
@@ -688,6 +684,7 @@ bool FastISel::SelectCall(const User *I) {
         .addFPImm(CF).addImm(DI->getOffset())
         .addMetadata(DI->getVariable());
     } else if (unsigned Reg = lookUpRegForValue(V)) {
+      // FIXME: This does not handle register-indirect values at offset 0.
       bool IsIndirect = DI->getOffset() != 0;
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, IsIndirect,
               Reg, DI->getOffset(), DI->getVariable());
@@ -1574,4 +1571,19 @@ bool FastISel::tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst) {
   return tryToFoldLoadIntoMI(User, RI.getOperandNo(), LI);
 }
 
+bool FastISel::canFoldAddIntoGEP(const User *GEP, const Value *Add) {
+  // Must be an add.
+  if (!isa<AddOperator>(Add))
+    return false;
+  // Type size needs to match.
+  if (TD.getTypeSizeInBits(GEP->getType()) !=
+      TD.getTypeSizeInBits(Add->getType()))
+    return false;
+  // Must be in the same basic block.
+  if (isa<Instruction>(Add) &&
+      FuncInfo.MBBMap[cast<Instruction>(Add)->getParent()] != FuncInfo.MBB)
+    return false;
+  // Must have a constant operand.
+  return isa<ConstantInt>(cast<AddOperator>(Add)->getOperand(1));
+}
 
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index e107276..3a8fb85 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -211,6 +212,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
   assert(Node->getMachineOpcode() != TargetOpcode::IMPLICIT_DEF &&
          "IMPLICIT_DEF should have been handled as a special case elsewhere!");
 
+  unsigned NumResults = CountResults(Node);
   for (unsigned i = 0; i < II.getNumDefs(); ++i) {
     // If the specific node value is only used by a CopyToReg and the dest reg
     // is a vreg in the same register class, use the CopyToReg'd destination
@@ -218,6 +220,10 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
     unsigned VRBase = 0;
     const TargetRegisterClass *RC =
       TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF));
+    // If the register class is unknown for the given definition, then try to
+    // infer one from the value type.
+    if (!RC && i < NumResults)
+      RC = TLI->getRegClassFor(Node->getSimpleValueType(i));
     if (II.OpInfo[i].isOptionalDef()) {
       // Optional def must be a physical register.
       unsigned NumResults = CountResults(Node);
@@ -722,10 +728,20 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
 
   const MCInstrDesc &II = TII->get(Opc);
   unsigned NumResults = CountResults(Node);
+  unsigned NumDefs = II.getNumDefs();
+  const uint16_t *ScratchRegs = NULL;
+
+  // Handle PATCHPOINT specially and then use the generic code.
+  if (Opc == TargetOpcode::PATCHPOINT) {
+    unsigned CC = Node->getConstantOperandVal(PatchPointOpers::CCPos);
+    NumDefs = NumResults;
+    ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC);
+  }
+
   unsigned NumImpUses = 0;
   unsigned NodeOperands =
-    countOperands(Node, II.getNumOperands() - II.getNumDefs(), NumImpUses);
-  bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0;
+    countOperands(Node, II.getNumOperands() - NumDefs, NumImpUses);
+  bool HasPhysRegOuts = NumResults > NumDefs && II.getImplicitDefs()!=0;
 #ifndef NDEBUG
   unsigned NumMIOperands = NodeOperands + NumResults;
   if (II.isVariadic())
@@ -748,14 +764,20 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
 
   // Emit all of the actual operands of this instruction, adding them to the
   // instruction as appropriate.
-  bool HasOptPRefs = II.getNumDefs() > NumResults;
+  bool HasOptPRefs = NumDefs > NumResults;
   assert((!HasOptPRefs || !HasPhysRegOuts) &&
          "Unable to cope with optional defs and phys regs defs!");
-  unsigned NumSkip = HasOptPRefs ? II.getNumDefs() - NumResults : 0;
+  unsigned NumSkip = HasOptPRefs ? NumDefs - NumResults : 0;
   for (unsigned i = NumSkip; i != NodeOperands; ++i)
-    AddOperand(MIB, Node->getOperand(i), i-NumSkip+II.getNumDefs(), &II,
+    AddOperand(MIB, Node->getOperand(i), i-NumSkip+NumDefs, &II,
                VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned);
 
+  // Add scratch registers as implicit def and early clobber
+  if (ScratchRegs)
+    for (unsigned i = 0; ScratchRegs[i]; ++i)
+      MIB.addReg(ScratchRegs[i], RegState::ImplicitDefine |
+                                 RegState::EarlyClobber);
+
   // Transfer all of the memory reference descriptions of this instruction.
   MIB.setMemRefs(cast<MachineSDNode>(Node)->memoperands_begin(),
                  cast<MachineSDNode>(Node)->memoperands_end());
@@ -784,8 +806,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
 
   // Additional results must be physical register defs.
   if (HasPhysRegOuts) {
-    for (unsigned i = II.getNumDefs(); i < NumResults; ++i) {
-      unsigned Reg = II.getImplicitDefs()[i - II.getNumDefs()];
+    for (unsigned i = NumDefs; i < NumResults; ++i) {
+      unsigned Reg = II.getImplicitDefs()[i - NumDefs];
       if (!Node->hasAnyUseOfValue(i))
         continue;
       // This implicitly defined physreg has a use.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bd844e5..9061ae9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -95,8 +95,8 @@ private:
                                      SDValue N1, SDValue N2,
                                      ArrayRef<int> Mask) const;
 
-  void LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
-                             SDLoc dl);
+  bool LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
+                             bool &NeedInvert, SDLoc dl);
 
   SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
   SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops,
@@ -311,6 +311,8 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
   SDValue Val = ST->getValue();
   EVT VT = Val.getValueType();
   int Alignment = ST->getAlignment();
+  unsigned AS = ST->getAddressSpace();
+
   SDLoc dl(ST);
   if (ST->getMemoryVT().isFloatingPoint() ||
       ST->getMemoryVT().isVector()) {
@@ -343,7 +345,7 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
     SDValue Store = DAG.getTruncStore(Chain, dl,
                                       Val, StackPtr, MachinePointerInfo(),
                                       StoredVT, false, false, 0);
-    SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+    SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy(AS));
     SmallVector<SDValue, 8> Stores;
     unsigned Offset = 0;
 
@@ -381,7 +383,8 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
                                          .getWithOffset(Offset),
                                        MemVT, ST->isVolatile(),
                                        ST->isNonTemporal(),
-                                       MinAlign(ST->getAlignment(), Offset)));
+                                       MinAlign(ST->getAlignment(), Offset),
+                                       ST->getTBAAInfo()));
     // The order of the stores doesn't matter - say it with a TokenFactor.
     SDValue Result =
       DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
@@ -408,13 +411,14 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
   Store1 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Lo:Hi, Ptr,
                              ST->getPointerInfo(), NewStoredVT,
                              ST->isVolatile(), ST->isNonTemporal(), Alignment);
+
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                    DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+                    DAG.getConstant(IncrementSize, TLI.getPointerTy(AS)));
   Alignment = MinAlign(Alignment, IncrementSize);
   Store2 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Hi:Lo, Ptr,
                              ST->getPointerInfo().getWithOffset(IncrementSize),
                              NewStoredVT, ST->isVolatile(), ST->isNonTemporal(),
-                             Alignment);
+                             Alignment, ST->getTBAAInfo());
 
   SDValue Result =
     DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
@@ -438,10 +442,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
     if (TLI.isTypeLegal(intVT) && TLI.isTypeLegal(LoadedVT)) {
       // Expand to a (misaligned) integer load of the same size,
       // then bitconvert to floating point or vector.
-      SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr, LD->getPointerInfo(),
-                                    LD->isVolatile(),
-                                    LD->isNonTemporal(),
-                                    LD->isInvariant(), LD->getAlignment());
+      SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr,
+                                    LD->getMemOperand());
       SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
       if (LoadedVT != VT)
         Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
@@ -474,7 +476,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
                                  LD->getPointerInfo().getWithOffset(Offset),
                                  LD->isVolatile(), LD->isNonTemporal(),
                                  LD->isInvariant(),
-                                 MinAlign(LD->getAlignment(), Offset));
+                                 MinAlign(LD->getAlignment(), Offset),
+                                 LD->getTBAAInfo());
       // Follow the load with a store to the stack slot.  Remember the store.
       Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
                                     MachinePointerInfo(), false, false, 0));
@@ -492,7 +495,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
                                   LD->getPointerInfo().getWithOffset(Offset),
                                   MemVT, LD->isVolatile(),
                                   LD->isNonTemporal(),
-                                  MinAlign(LD->getAlignment(), Offset));
+                                  MinAlign(LD->getAlignment(), Offset),
+                                  LD->getTBAAInfo());
     // Follow the load with a store to the stack slot.  Remember the store.
     // On big-endian machines this requires a truncating store to ensure
     // that the bits end up in the right place.
@@ -536,23 +540,25 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
   if (TLI.isLittleEndian()) {
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), Alignment);
+                        LD->isNonTemporal(), Alignment, LD->getTBAAInfo());
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                      DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
                         LD->getPointerInfo().getWithOffset(IncrementSize),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), MinAlign(Alignment,IncrementSize));
+                        LD->isNonTemporal(), MinAlign(Alignment, IncrementSize),
+                        LD->getTBAAInfo());
   } else {
     Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), Alignment);
+                        LD->isNonTemporal(), Alignment, LD->getTBAAInfo());
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                      DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
                         LD->getPointerInfo().getWithOffset(IncrementSize),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), MinAlign(Alignment,IncrementSize));
+                        LD->isNonTemporal(), MinAlign(Alignment, IncrementSize),
+                        LD->getTBAAInfo());
   }
 
   // aggregate the two parts
@@ -655,6 +661,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
   unsigned Alignment = ST->getAlignment();
   bool isVolatile = ST->isVolatile();
   bool isNonTemporal = ST->isNonTemporal();
+  const MDNode *TBAAInfo = ST->getTBAAInfo();
   SDLoc dl(ST);
   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) {
     if (CFP->getValueType(0) == MVT::f32 &&
@@ -663,7 +670,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
                                       bitcastToAPInt().zextOrTrunc(32),
                               MVT::i32);
       return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
-                          isVolatile, isNonTemporal, Alignment);
+                          isVolatile, isNonTemporal, Alignment, TBAAInfo);
     }
 
     if (CFP->getValueType(0) == MVT::f64) {
@@ -672,7 +679,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
         SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
                                   zextOrTrunc(64), MVT::i64);
         return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
-                            isVolatile, isNonTemporal, Alignment);
+                            isVolatile, isNonTemporal, Alignment, TBAAInfo);
       }
 
       if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) {
@@ -685,12 +692,13 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
         if (TLI.isBigEndian()) std::swap(Lo, Hi);
 
         Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile,
-                          isNonTemporal, Alignment);
+                          isNonTemporal, Alignment, TBAAInfo);
         Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                            DAG.getIntPtrConstant(4));
+                          DAG.getConstant(4, Ptr.getValueType()));
         Hi = DAG.getStore(Chain, dl, Hi, Ptr,
                           ST->getPointerInfo().getWithOffset(4),
-                          isVolatile, isNonTemporal, MinAlign(Alignment, 4U));
+                          isVolatile, isNonTemporal, MinAlign(Alignment, 4U),
+                          TBAAInfo);
 
         return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
       }
@@ -708,6 +716,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
     unsigned Alignment = ST->getAlignment();
     bool isVolatile = ST->isVolatile();
     bool isNonTemporal = ST->isNonTemporal();
+    const MDNode *TBAAInfo = ST->getTBAAInfo();
 
     if (!ST->isTruncatingStore()) {
       if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
@@ -745,7 +754,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
           SDValue Result =
             DAG.getStore(Chain, dl, Value, Ptr,
                          ST->getPointerInfo(), isVolatile,
-                         isNonTemporal, Alignment);
+                         isNonTemporal, Alignment, TBAAInfo);
           ReplaceNode(SDValue(Node, 0), Result);
           break;
         }
@@ -767,7 +776,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
         Value = DAG.getZeroExtendInReg(Value, dl, StVT);
         SDValue Result =
           DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
-                            NVT, isVolatile, isNonTemporal, Alignment);
+                            NVT, isVolatile, isNonTemporal, Alignment,
+                            TBAAInfo);
         ReplaceNode(SDValue(Node, 0), Result);
       } else if (StWidth & (StWidth - 1)) {
         // If not storing a power-of-2 number of bits, expand as two stores.
@@ -788,19 +798,20 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
           // Store the bottom RoundWidth bits.
           Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
                                  RoundVT,
-                                 isVolatile, isNonTemporal, Alignment);
+                                 isVolatile, isNonTemporal, Alignment,
+                                 TBAAInfo);
 
           // Store the remaining ExtraWidth bits.
           IncrementSize = RoundWidth / 8;
           Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                             DAG.getIntPtrConstant(IncrementSize));
+                            DAG.getConstant(IncrementSize, Ptr.getValueType()));
           Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value,
                            DAG.getConstant(RoundWidth,
                                     TLI.getShiftAmountTy(Value.getValueType())));
           Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
                              ST->getPointerInfo().getWithOffset(IncrementSize),
                                  ExtraVT, isVolatile, isNonTemporal,
-                                 MinAlign(Alignment, IncrementSize));
+                                 MinAlign(Alignment, IncrementSize), TBAAInfo);
         } else {
           // Big endian - avoid unaligned stores.
           // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
@@ -809,16 +820,17 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
                            DAG.getConstant(ExtraWidth,
                                     TLI.getShiftAmountTy(Value.getValueType())));
           Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
-                                 RoundVT, isVolatile, isNonTemporal, Alignment);
+                                 RoundVT, isVolatile, isNonTemporal, Alignment,
+                                 TBAAInfo);
 
           // Store the remaining ExtraWidth bits.
           IncrementSize = RoundWidth / 8;
           Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                             DAG.getIntPtrConstant(IncrementSize));
+                             DAG.getConstant(IncrementSize, Ptr.getValueType()));
           Lo = DAG.getTruncStore(Chain, dl, Value, Ptr,
                               ST->getPointerInfo().getWithOffset(IncrementSize),
                                  ExtraVT, isVolatile, isNonTemporal,
-                                 MinAlign(Alignment, IncrementSize));
+                                 MinAlign(Alignment, IncrementSize), TBAAInfo);
         }
 
         // The order of the stores doesn't matter.
@@ -854,7 +866,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
           Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
           SDValue Result =
             DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
-                         isVolatile, isNonTemporal, Alignment);
+                         isVolatile, isNonTemporal, Alignment, TBAAInfo);
           ReplaceNode(SDValue(Node, 0), Result);
           break;
         }
@@ -902,9 +914,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
       assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
              "Can only promote loads to same size type");
 
-      SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
-                         LD->isVolatile(), LD->isNonTemporal(),
-                         LD->isInvariant(), LD->getAlignment());
+      SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getMemOperand());
       RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res);
       RChain = Res.getValue(1);
       break;
@@ -924,6 +934,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
   unsigned Alignment = LD->getAlignment();
   bool isVolatile = LD->isVolatile();
   bool isNonTemporal = LD->isNonTemporal();
+  const MDNode *TBAAInfo = LD->getTBAAInfo();
 
   if (SrcWidth != SrcVT.getStoreSizeInBits() &&
       // Some targets pretend to have an i1 loading operation, and actually
@@ -950,7 +961,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
     SDValue Result =
       DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
                      Chain, Ptr, LD->getPointerInfo(),
-                     NVT, isVolatile, isNonTemporal, Alignment);
+                     NVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
     Ch = Result.getValue(1); // The chain.
 
@@ -987,16 +998,16 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
       Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0),
                           Chain, Ptr,
                           LD->getPointerInfo(), RoundVT, isVolatile,
-                          isNonTemporal, Alignment);
+                          isNonTemporal, Alignment, TBAAInfo);
 
       // Load the remaining ExtraWidth bits.
       IncrementSize = RoundWidth / 8;
       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                         DAG.getIntPtrConstant(IncrementSize));
+                         DAG.getConstant(IncrementSize, Ptr.getValueType()));
       Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
                           LD->getPointerInfo().getWithOffset(IncrementSize),
                           ExtraVT, isVolatile, isNonTemporal,
-                          MinAlign(Alignment, IncrementSize));
+                          MinAlign(Alignment, IncrementSize), TBAAInfo);
 
       // Build a factor node to remember that this load is independent of
       // the other one.
@@ -1016,17 +1027,17 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
       // Load the top RoundWidth bits.
       Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
                           LD->getPointerInfo(), RoundVT, isVolatile,
-                          isNonTemporal, Alignment);
+                          isNonTemporal, Alignment, TBAAInfo);
 
       // Load the remaining ExtraWidth bits.
       IncrementSize = RoundWidth / 8;
       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                         DAG.getIntPtrConstant(IncrementSize));
+                         DAG.getConstant(IncrementSize, Ptr.getValueType()));
       Lo = DAG.getExtLoad(ISD::ZEXTLOAD,
                           dl, Node->getValueType(0), Chain, Ptr,
                           LD->getPointerInfo().getWithOffset(IncrementSize),
                           ExtraVT, isVolatile, isNonTemporal,
-                          MinAlign(Alignment, IncrementSize));
+                          MinAlign(Alignment, IncrementSize), TBAAInfo);
 
       // Build a factor node to remember that this load is independent of
       // the other one.
@@ -1079,9 +1090,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
     case TargetLowering::Expand:
              if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) {
                SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr,
-                                          LD->getPointerInfo(),
-                                          LD->isVolatile(), LD->isNonTemporal(),
-                                          LD->isInvariant(), LD->getAlignment());
+                                          LD->getMemOperand());
                unsigned ExtendOp;
                switch (ExtType) {
                case ISD::EXTLOAD:
@@ -1109,9 +1118,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
              // Turn the unsupported load into an EXTLOAD followed by an explicit
              // zero/sign extend inreg.
              SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0),
-                                             Chain, Ptr, LD->getPointerInfo(), SrcVT,
-                                             LD->isVolatile(), LD->isNonTemporal(),
-                                             LD->getAlignment());
+                                             Chain, Ptr, SrcVT,
+                                             LD->getMemOperand());
              SDValue ValRes;
              if (ExtType == ISD::SEXTLOAD)
                ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
@@ -1386,11 +1394,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
   Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
                     DAG.getConstant(EltSize, Idx.getValueType()));
 
-  if (Idx.getValueType().bitsGT(TLI.getPointerTy()))
-    Idx = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Idx);
-  else
-    Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
-
+  Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy());
   StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr);
 
   if (Op.getValueType().isVector())
@@ -1428,11 +1432,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
 
   Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
                     DAG.getConstant(EltSize, Idx.getValueType()));
-
-  if (Idx.getValueType().bitsGT(TLI.getPointerTy()))
-    Idx = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Idx);
-  else
-    Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
+  Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy());
 
   SDValue SubStackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
                                     StackPtr);
@@ -1531,7 +1531,8 @@ SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) {
       unsigned Strides = (FloatVT.getSizeInBits()-1)/LoadTy.getSizeInBits();
       unsigned ByteOffset = (Strides * LoadTy.getSizeInBits()) / 8;
       LoadPtr = DAG.getNode(ISD::ADD, dl, LoadPtr.getValueType(),
-                            LoadPtr, DAG.getIntPtrConstant(ByteOffset));
+                            LoadPtr,
+                            DAG.getConstant(ByteOffset, LoadPtr.getValueType()));
       // Load a legal integer containing the sign bit.
       SignBit = DAG.getLoad(LoadTy, dl, Ch, LoadPtr, MachinePointerInfo(),
                             false, false, false, 0);
@@ -1580,10 +1581,10 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
   Chain = SP.getValue(1);
   unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
   unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
-  if (Align > StackAlign)
-    SP = DAG.getNode(ISD::AND, dl, VT, SP,
-                      DAG.getConstant(-(uint64_t)Align, VT));
   Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size);       // Value
+  if (Align > StackAlign)
+    Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+                       DAG.getConstant(-(uint64_t)Align, VT));
   Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1);     // Output chain
 
   Tmp2 = DAG.getCALLSEQ_END(Chain,  DAG.getIntPtrConstant(0, true),
@@ -1595,22 +1596,44 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
 }
 
 /// LegalizeSetCCCondCode - Legalize a SETCC with given LHS and RHS and
-/// condition code CC on the current target. This routine expands SETCC with
-/// illegal condition code into AND / OR of multiple SETCC values.
-void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
+/// condition code CC on the current target.
+///
+/// If the SETCC has been legalized using AND / OR, then the legalized node
+/// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert
+/// will be set to false.
+///
+/// If the SETCC has been legalized by using getSetCCSwappedOperands(),
+/// then the values of LHS and RHS will be swapped, CC will be set to the
+/// new condition, and NeedInvert will be set to false.
+///
+/// If the SETCC has been legalized using the inverse condcode, then LHS and
+/// RHS will be unchanged, CC will set to the inverted condcode, and NeedInvert
+/// will be set to true. The caller must invert the result of the SETCC with
+/// SelectionDAG::getNOT() or take equivalent action to swap the effect of a
+/// true/false result.
+///
+/// \returns true if the SetCC has been legalized, false if it hasn't.
+bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
                                                  SDValue &LHS, SDValue &RHS,
                                                  SDValue &CC,
+                                                 bool &NeedInvert,
                                                  SDLoc dl) {
   MVT OpVT = LHS.getSimpleValueType();
   ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
+  NeedInvert = false;
   switch (TLI.getCondCodeAction(CCCode, OpVT)) {
   default: llvm_unreachable("Unknown condition code action!");
   case TargetLowering::Legal:
     // Nothing to do.
     break;
   case TargetLowering::Expand: {
+    ISD::CondCode InvCC = ISD::getSetCCSwappedOperands(CCCode);
+    if (TLI.isCondCodeLegal(InvCC, OpVT)) {
+      std::swap(LHS, RHS);
+      CC = DAG.getCondCode(InvCC);
+      return true;
+    }
     ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
-    ISD::CondCode InvCC = ISD::SETCC_INVALID;
     unsigned Opc = 0;
     switch (CCCode) {
     default: llvm_unreachable("Don't know how to expand this condition!");
@@ -1650,18 +1673,21 @@ void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
     case ISD::SETGT:
     case ISD::SETGE:
     case ISD::SETLT:
+      // We only support using the inverted operation, which is computed above
+      // and not a different manner of supporting expanding these cases.
+      llvm_unreachable("Don't know how to expand this condition!");
     case ISD::SETNE:
     case ISD::SETEQ:
-      InvCC = ISD::getSetCCSwappedOperands(CCCode);
-      if (TLI.getCondCodeAction(InvCC, OpVT) == TargetLowering::Expand) {
-        // We only support using the inverted operation and not a
-        // different manner of supporting expanding these cases.
-        llvm_unreachable("Don't know how to expand this condition!");
+      // Try inverting the result of the inverse condition.
+      InvCC = CCCode == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
+      if (TLI.isCondCodeLegal(InvCC, OpVT)) {
+        CC = DAG.getCondCode(InvCC);
+        NeedInvert = true;
+        return true;
       }
-      LHS = DAG.getSetCC(dl, VT, RHS, LHS, InvCC);
-      RHS = SDValue();
-      CC = SDValue();
-      return;
+      // If inverting the condition didn't work then we have no means to expand
+      // the condition.
+      llvm_unreachable("Don't know how to expand this condition!");
     }
 
     SDValue SetCC1, SetCC2;
@@ -1678,9 +1704,10 @@ void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
     LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
     RHS = SDValue();
     CC  = SDValue();
-    break;
+    return true;
   }
   }
+  return false;
 }
 
 /// EmitStackConvert - Emit a store/load combination to the stack.  This stores
@@ -1969,7 +1996,7 @@ SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
                                               RTLIB::Libcall Call_F128,
                                               RTLIB::Libcall Call_PPCF128) {
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::f32: LC = Call_F32; break;
   case MVT::f64: LC = Call_F64; break;
@@ -1987,7 +2014,7 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
                                                RTLIB::Libcall Call_I64,
                                                RTLIB::Libcall Call_I128) {
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::i8:   LC = Call_I8; break;
   case MVT::i16:  LC = Call_I16; break;
@@ -2002,7 +2029,7 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
 static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
                                      const TargetLowering &TLI) {
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
   case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
@@ -2049,7 +2076,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
   bool isSigned = Opcode == ISD::SDIVREM;
 
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
   case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
@@ -2106,7 +2133,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
 /// isSinCosLibcallAvailable - Return true if sincos libcall is available.
 static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
   case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
@@ -2156,7 +2183,7 @@ void
 SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
                                           SmallVectorImpl<SDValue> &Results) {
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
   case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
@@ -2232,11 +2259,11 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
     SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);
 
     // word offset constant for Hi/Lo address computation
-    SDValue WordOff = DAG.getConstant(sizeof(int), TLI.getPointerTy());
+    SDValue WordOff = DAG.getConstant(sizeof(int), StackSlot.getValueType());
     // set up Hi and Lo (into buffer) address based on endian
     SDValue Hi = StackSlot;
-    SDValue Lo = DAG.getNode(ISD::ADD, dl,
-                             TLI.getPointerTy(), StackSlot, WordOff);
+    SDValue Lo = DAG.getNode(ISD::ADD, dl, StackSlot.getValueType(),
+                             StackSlot, WordOff);
     if (TLI.isLittleEndian())
       std::swap(Hi, Lo);
 
@@ -2382,7 +2409,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
   // as a negative number.  To counteract this, the dynamic code adds an
   // offset depending on the data type.
   uint64_t FF;
-  switch (Op0.getValueType().getSimpleVT().SimpleTy) {
+  switch (Op0.getSimpleValueType().SimpleTy) {
   default: llvm_unreachable("Unsupported integer type!");
   case MVT::i8 : FF = 0x43800000ULL; break;  // 2^8  (as a float)
   case MVT::i16: FF = 0x47800000ULL; break;  // 2^16 (as a float)
@@ -2395,7 +2422,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
 
   SDValue CPIdx = DAG.getConstantPool(FudgeFactor, TLI.getPointerTy());
   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
-  CPIdx = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), CPIdx, CstOffset);
+  CPIdx = DAG.getNode(ISD::ADD, dl, CPIdx.getValueType(), CPIdx, CstOffset);
   Alignment = std::min(Alignment, 4u);
   SDValue FudgeInReg;
   if (DestVT == MVT::f32)
@@ -2656,6 +2683,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break;
     case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break;
     case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_LOCK_TEST_AND_SET_16;break;
     }
     break;
   case ISD::ATOMIC_CMP_SWAP:
@@ -2665,6 +2693,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break;
     case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break;
     case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_ADD:
@@ -2674,6 +2703,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_ADD_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_SUB:
@@ -2683,6 +2713,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_SUB_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_AND:
@@ -2692,6 +2723,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_AND_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_OR:
@@ -2701,6 +2733,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_OR_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_XOR:
@@ -2710,6 +2743,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_XOR_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_NAND:
@@ -2719,6 +2753,47 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_NAND_16;break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_MAX:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_MAX_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_MAX_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_MAX_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_MAX_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_MAX_16;break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_UMAX:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_UMAX_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_UMAX_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_UMAX_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_UMAX_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_UMAX_16;break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_MIN:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_MIN_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_MIN_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_MIN_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_MIN_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_MIN_16;break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_UMIN:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_UMIN_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_UMIN_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_UMIN_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_UMIN_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_UMIN_16;break;
     }
     break;
   }
@@ -2730,6 +2805,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   SmallVector<SDValue, 8> Results;
   SDLoc dl(Node);
   SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+  bool NeedInvert;
   switch (Node->getOpcode()) {
   case ISD::CTPOP:
   case ISD::CTLZ:
@@ -2947,20 +3023,20 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     if (Align > TLI.getMinStackArgumentAlignment()) {
       assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
 
-      VAList = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+      VAList = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
                            DAG.getConstant(Align - 1,
-                                           TLI.getPointerTy()));
+                                           VAList.getValueType()));
 
-      VAList = DAG.getNode(ISD::AND, dl, TLI.getPointerTy(), VAList,
+      VAList = DAG.getNode(ISD::AND, dl, VAList.getValueType(), VAList,
                            DAG.getConstant(-(int64_t)Align,
-                                           TLI.getPointerTy()));
+                                           VAList.getValueType()));
     }
 
     // Increment the pointer, VAList, to the next vaarg
-    Tmp3 = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+    Tmp3 = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
                        DAG.getConstant(TLI.getDataLayout()->
                           getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())),
-                                       TLI.getPointerTy()));
+                                       VAList.getValueType()));
     // Store the incremented VAList to the legalized pointer
     Tmp3 = DAG.getStore(VAListLoad.getValue(1), dl, Tmp3, Tmp2,
                         MachinePointerInfo(V), false, false, 0);
@@ -3231,6 +3307,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
                                       RTLIB::NEARBYINT_F128,
                                       RTLIB::NEARBYINT_PPCF128));
     break;
+  case ISD::FROUND:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32,
+                                      RTLIB::ROUND_F64,
+                                      RTLIB::ROUND_F80,
+                                      RTLIB::ROUND_F128,
+                                      RTLIB::ROUND_PPCF128));
+    break;
   case ISD::FPOWI:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64,
                                       RTLIB::POWI_F80, RTLIB::POWI_F128,
@@ -3565,9 +3648,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     unsigned EntrySize =
       DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
 
-    Index = DAG.getNode(ISD::MUL, dl, PTy,
-                        Index, DAG.getConstant(EntrySize, PTy));
-    SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+    Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(),
+                       Index, DAG.getConstant(EntrySize, Index.getValueType()));
+    SDValue Addr = DAG.getNode(ISD::ADD, dl, Index.getValueType(),
+                               Index, Table);
 
     EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
     SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr,
@@ -3611,10 +3695,21 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Tmp1 = Node->getOperand(0);
     Tmp2 = Node->getOperand(1);
     Tmp3 = Node->getOperand(2);
-    LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2, Tmp3, dl);
+    bool Legalized = LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2,
+                                           Tmp3, NeedInvert, dl);
+
+    if (Legalized) {
+      // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+      // condition code, create a new SETCC node.
+      if (Tmp3.getNode())
+        Tmp1 = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
+                           Tmp1, Tmp2, Tmp3);
+
+      // If we expanded the SETCC by inverting the condition code, then wrap
+      // the existing SETCC in a NOT to restore the intended condition.
+      if (NeedInvert)
+        Tmp1 = DAG.getNOT(dl, Tmp1, Tmp1->getValueType(0));
 
-    // If we expanded the SETCC into an AND/OR, return the new node
-    if (Tmp2.getNode() == 0) {
       Results.push_back(Tmp1);
       break;
     }
@@ -3645,14 +3740,52 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Tmp4 = Node->getOperand(3);   // False
     SDValue CC = Node->getOperand(4);
 
-    LegalizeSetCCCondCode(getSetCCResultType(Tmp1.getValueType()),
-                          Tmp1, Tmp2, CC, dl);
+    bool Legalized = false;
+    // Try to legalize by inverting the condition.  This is for targets that
+    // might support an ordered version of a condition, but not the unordered
+    // version (or vice versa).
+    ISD::CondCode InvCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                                               Tmp1.getValueType().isInteger());
+    if (TLI.isCondCodeLegal(InvCC, Tmp1.getSimpleValueType())) {
+      // Use the new condition code and swap true and false
+      Legalized = true;
+      Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC);
+    } else {
+      // If The inverse is not legal, then try to swap the arguments using
+      // the inverse condition code.
+      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InvCC);
+      if (TLI.isCondCodeLegal(SwapInvCC, Tmp1.getSimpleValueType())) {
+        // The swapped inverse condition is legal, so swap true and false,
+        // lhs and rhs.
+        Legalized = true;
+        Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC);
+      }
+    }
+
+    if (!Legalized) {
+      Legalized = LegalizeSetCCCondCode(
+          getSetCCResultType(Tmp1.getValueType()), Tmp1, Tmp2, CC, NeedInvert,
+          dl);
+
+      assert(Legalized && "Can't legalize SELECT_CC with legal condition!");
+
+      // If we expanded the SETCC by inverting the condition code, then swap
+      // the True/False operands to match.
+      if (NeedInvert)
+        std::swap(Tmp3, Tmp4);
 
-    assert(!Tmp2.getNode() && "Can't legalize SELECT_CC with legal condition!");
-    Tmp2 = DAG.getConstant(0, Tmp1.getValueType());
-    CC = DAG.getCondCode(ISD::SETNE);
-    Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2,
-                       Tmp3, Tmp4, CC);
+      // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+      // condition code, create a new SELECT_CC node.
+      if (CC.getNode()) {
+        Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0),
+                           Tmp1, Tmp2, Tmp3, Tmp4, CC);
+      } else {
+        Tmp2 = DAG.getConstant(0, Tmp1.getValueType());
+        CC = DAG.getCondCode(ISD::SETNE);
+        Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2,
+                           Tmp3, Tmp4, CC);
+      }
+    }
     Results.push_back(Tmp1);
     break;
   }
@@ -3662,14 +3795,27 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Tmp3 = Node->getOperand(3);              // RHS
     Tmp4 = Node->getOperand(1);              // CC
 
-    LegalizeSetCCCondCode(getSetCCResultType(Tmp2.getValueType()),
-                          Tmp2, Tmp3, Tmp4, dl);
-
-    assert(!Tmp3.getNode() && "Can't legalize BR_CC with legal condition!");
-    Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
-    Tmp4 = DAG.getCondCode(ISD::SETNE);
-    Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2,
-                       Tmp3, Node->getOperand(4));
+    bool Legalized = LegalizeSetCCCondCode(getSetCCResultType(
+        Tmp2.getValueType()), Tmp2, Tmp3, Tmp4, NeedInvert, dl);
+    (void)Legalized;
+    assert(Legalized && "Can't legalize BR_CC with legal condition!");
+
+    // If we expanded the SETCC by inverting the condition code, then wrap
+    // the existing SETCC in a NOT to restore the intended condition.
+    if (NeedInvert)
+      Tmp4 = DAG.getNOT(dl, Tmp4, Tmp4->getValueType(0));
+
+    // If we expanded the SETCC by swapping LHS and RHS, create a new BR_CC
+    // node.
+    if (Tmp4.getNode()) {
+      Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1,
+                         Tmp4, Tmp2, Tmp3, Node->getOperand(4));
+    } else {
+      Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
+      Tmp4 = DAG.getCondCode(ISD::SETNE);
+      Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2,
+                         Tmp3, Node->getOperand(4));
+    }
     Results.push_back(Tmp1);
     break;
   }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index cea0b02..ecf4c5d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -88,6 +88,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FPOWI:       R = SoftenFloatRes_FPOWI(N); break;
     case ISD::FREM:        R = SoftenFloatRes_FREM(N); break;
     case ISD::FRINT:       R = SoftenFloatRes_FRINT(N); break;
+    case ISD::FROUND:      R = SoftenFloatRes_FROUND(N); break;
     case ISD::FSIN:        R = SoftenFloatRes_FSIN(N); break;
     case ISD::FSQRT:       R = SoftenFloatRes_FSQRT(N); break;
     case ISD::FSUB:        R = SoftenFloatRes_FSUB(N); break;
@@ -160,7 +161,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
                                            RTLIB::ADD_F80,
                                            RTLIB::ADD_F128,
                                            RTLIB::ADD_PPCF128),
-                         NVT, Ops, 2, false, SDLoc(N));
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
@@ -172,7 +173,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
                                            RTLIB::CEIL_F80,
                                            RTLIB::CEIL_F128,
                                            RTLIB::CEIL_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) {
@@ -226,7 +227,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) {
                                            RTLIB::COS_F80,
                                            RTLIB::COS_F128,
                                            RTLIB::COS_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
@@ -239,7 +240,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
                                            RTLIB::DIV_F80,
                                            RTLIB::DIV_F128,
                                            RTLIB::DIV_PPCF128),
-                         NVT, Ops, 2, false, SDLoc(N));
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
@@ -251,7 +252,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
                                            RTLIB::EXP_F80,
                                            RTLIB::EXP_F128,
                                            RTLIB::EXP_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
@@ -263,7 +264,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
                                            RTLIB::EXP2_F80,
                                            RTLIB::EXP2_F128,
                                            RTLIB::EXP2_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
@@ -275,7 +276,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
                                            RTLIB::FLOOR_F80,
                                            RTLIB::FLOOR_F128,
                                            RTLIB::FLOOR_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
@@ -287,7 +288,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
                                            RTLIB::LOG_F80,
                                            RTLIB::LOG_F128,
                                            RTLIB::LOG_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
@@ -299,7 +300,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
                                            RTLIB::LOG2_F80,
                                            RTLIB::LOG2_F128,
                                            RTLIB::LOG2_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
@@ -311,7 +312,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
                                            RTLIB::LOG10_F80,
                                            RTLIB::LOG10_F128,
                                            RTLIB::LOG10_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
@@ -325,7 +326,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
                                            RTLIB::FMA_F80,
                                            RTLIB::FMA_F128,
                                            RTLIB::FMA_PPCF128),
-                         NVT, Ops, 3, false, SDLoc(N));
+                         NVT, Ops, 3, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
@@ -338,7 +339,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
                                            RTLIB::MUL_F80,
                                            RTLIB::MUL_F128,
                                            RTLIB::MUL_PPCF128),
-                         NVT, Ops, 2, false, SDLoc(N));
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
@@ -350,7 +351,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
                                            RTLIB::NEARBYINT_F80,
                                            RTLIB::NEARBYINT_F128,
                                            RTLIB::NEARBYINT_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
@@ -364,7 +365,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
                                            RTLIB::SUB_F80,
                                            RTLIB::SUB_F128,
                                            RTLIB::SUB_PPCF128),
-                         NVT, Ops, 2, false, SDLoc(N));
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
@@ -372,7 +373,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
   SDValue Op = N->getOperand(0);
   RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
-  return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N));
+  return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 // FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
@@ -381,7 +382,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP32(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue Op = N->getOperand(0);
   return TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, NVT, &Op, 1, false,
-                         SDLoc(N));
+                         SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
@@ -389,7 +390,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
   SDValue Op = N->getOperand(0);
   RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
-  return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N));
+  return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
@@ -402,7 +403,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
                                            RTLIB::POW_F80,
                                            RTLIB::POW_F128,
                                            RTLIB::POW_PPCF128),
-                         NVT, Ops, 2, false, SDLoc(N));
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
@@ -416,7 +417,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
                                            RTLIB::POWI_F80,
                                            RTLIB::POWI_F128,
                                            RTLIB::POWI_PPCF128),
-                         NVT, Ops, 2, false, SDLoc(N));
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
@@ -429,7 +430,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
                                            RTLIB::REM_F80,
                                            RTLIB::REM_F128,
                                            RTLIB::REM_PPCF128),
-                         NVT, Ops, 2, false, SDLoc(N));
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
@@ -441,7 +442,19 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
                                            RTLIB::RINT_F80,
                                            RTLIB::RINT_F128,
                                            RTLIB::RINT_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::ROUND_F32,
+                                           RTLIB::ROUND_F64,
+                                           RTLIB::ROUND_F80,
+                                           RTLIB::ROUND_F128,
+                                           RTLIB::ROUND_PPCF128),
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
@@ -453,7 +466,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
                                            RTLIB::SIN_F80,
                                            RTLIB::SIN_F128,
                                            RTLIB::SIN_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
@@ -465,7 +478,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
                                            RTLIB::SQRT_F80,
                                            RTLIB::SQRT_F128,
                                            RTLIB::SQRT_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
@@ -478,7 +491,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
                                            RTLIB::SUB_F80,
                                            RTLIB::SUB_F128,
                                            RTLIB::SUB_PPCF128),
-                         NVT, Ops, 2, false, SDLoc(N));
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
@@ -490,7 +503,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
                                            RTLIB::TRUNC_F80,
                                            RTLIB::TRUNC_F128,
                                            RTLIB::TRUNC_PPCF128),
-                         NVT, &Op, 1, false, SDLoc(N));
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
@@ -504,7 +517,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
     NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(),
                        NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(),
                        L->getPointerInfo(), NVT, L->isVolatile(),
-                       L->isNonTemporal(), false, L->getAlignment());
+                       L->isNonTemporal(), false, L->getAlignment(),
+                       L->getTBAAInfo());
     // Legalized the chain result - switch anything that used the old chain to
     // use the new one.
     ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
@@ -516,7 +530,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
                      L->getMemoryVT(), dl, L->getChain(),
                      L->getBasePtr(), L->getOffset(), L->getPointerInfo(),
                      L->getMemoryVT(), L->isVolatile(),
-                     L->isNonTemporal(), false, L->getAlignment());
+                     L->isNonTemporal(), false, L->getAlignment(),
+                     L->getTBAAInfo());
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
   ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
@@ -585,7 +600,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
                            NVT, N->getOperand(0));
   return TLI.makeLibCall(DAG, LC,
                          TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
-                         &Op, 1, false, dl);
+                         &Op, 1, false, dl).first;
 }
 
 
@@ -645,7 +660,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
 
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N));
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
@@ -676,7 +691,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) {
   RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N));
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) {
@@ -684,14 +699,14 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) {
   RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N));
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatOp_FP32_TO_FP16(SDNode *N) {
   EVT RVT = N->getValueType(0);
   RTLIB::Libcall LC = RTLIB::FPROUND_F32_F16;
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N));
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
@@ -754,9 +769,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
     Val = GetSoftenedFloat(Val);
 
   return DAG.getStore(ST->getChain(), dl, Val, ST->getBasePtr(),
-                      ST->getPointerInfo(),
-                      ST->isVolatile(), ST->isNonTemporal(),
-                      ST->getAlignment());
+                      ST->getMemOperand());
 }
 
 
@@ -817,6 +830,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::FPOW:       ExpandFloatRes_FPOW(N, Lo, Hi); break;
   case ISD::FPOWI:      ExpandFloatRes_FPOWI(N, Lo, Hi); break;
   case ISD::FRINT:      ExpandFloatRes_FRINT(N, Lo, Hi); break;
+  case ISD::FROUND:     ExpandFloatRes_FROUND(N, Lo, Hi); break;
   case ISD::FSIN:       ExpandFloatRes_FSIN(N, Lo, Hi); break;
   case ISD::FSQRT:      ExpandFloatRes_FSQRT(N, Lo, Hi); break;
   case ISD::FSUB:       ExpandFloatRes_FSUB(N, Lo, Hi); break;
@@ -912,7 +926,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo,
                                                    RTLIB::DIV_F128,
                                                    RTLIB::DIV_PPCF128),
                                  N->getValueType(0), Ops, 2, false,
-                                 SDLoc(N));
+                                 SDLoc(N)).first;
   GetPairElements(Call, Lo, Hi);
 }
 
@@ -986,7 +1000,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo,
                                                    RTLIB::FMA_F128,
                                                    RTLIB::FMA_PPCF128),
                                  N->getValueType(0), Ops, 3, false,
-                                 SDLoc(N));
+                                 SDLoc(N)).first;
   GetPairElements(Call, Lo, Hi);
 }
 
@@ -1000,7 +1014,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo,
                                                    RTLIB::MUL_F128,
                                                    RTLIB::MUL_PPCF128),
                                  N->getValueType(0), Ops, 2, false,
-                                 SDLoc(N));
+                                 SDLoc(N)).first;
   GetPairElements(Call, Lo, Hi);
 }
 
@@ -1072,6 +1086,18 @@ void DAGTypeLegalizer::ExpandFloatRes_FRINT(SDNode *N,
   GetPairElements(Call, Lo, Hi);
 }
 
+void DAGTypeLegalizer::ExpandFloatRes_FROUND(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::ROUND_F32,
+                                         RTLIB::ROUND_F64,
+                                         RTLIB::ROUND_F80,
+                                         RTLIB::ROUND_F128,
+                                         RTLIB::ROUND_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N,
                                            SDValue &Lo, SDValue &Hi) {
   SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
@@ -1102,7 +1128,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
                                                    RTLIB::SUB_F128,
                                                    RTLIB::SUB_PPCF128),
                                  N->getValueType(0), Ops, 2, false,
-                                 SDLoc(N));
+                                 SDLoc(N)).first;
   GetPairElements(Call, Lo, Hi);
 }
 
@@ -1134,8 +1160,7 @@ void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo,
   assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?");
 
   Hi = DAG.getExtLoad(LD->getExtensionType(), dl, NVT, Chain, Ptr,
-                      LD->getPointerInfo(), LD->getMemoryVT(), LD->isVolatile(),
-                      LD->isNonTemporal(), LD->getAlignment());
+                      LD->getMemoryVT(), LD->getMemOperand());
 
   // Remember the chain.
   Chain = Hi.getValue(1);
@@ -1181,7 +1206,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
     }
     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
 
-    Hi = TLI.makeLibCall(DAG, LC, VT, &Src, 1, true, dl);
+    Hi = TLI.makeLibCall(DAG, LC, VT, &Src, 1, true, dl).first;
     GetPairElements(Hi, Lo, Hi);
   }
 
@@ -1251,6 +1276,7 @@ bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
   case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
 
   case ISD::BR_CC:      Res = ExpandFloatOp_BR_CC(N); break;
+  case ISD::FCOPYSIGN:  Res = ExpandFloatOp_FCOPYSIGN(N); break;
   case ISD::FP_ROUND:   Res = ExpandFloatOp_FP_ROUND(N); break;
   case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break;
   case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break;
@@ -1325,6 +1351,17 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) {
                                 N->getOperand(4)), 0);
 }
 
+SDValue DAGTypeLegalizer::ExpandFloatOp_FCOPYSIGN(SDNode *N) {
+  assert(N->getOperand(1).getValueType() == MVT::ppcf128 &&
+         "Logic only correct for ppcf128!");
+  SDValue Lo, Hi;
+  GetExpandedFloat(N->getOperand(1), Lo, Hi);
+  // The ppcf128 value is providing only the sign; take it from the
+  // higher-order double (which must have the larger magnitude).
+  return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N),
+                     N->getValueType(0), N->getOperand(0), Hi);
+}
+
 SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) {
   assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
          "Logic only correct for ppcf128!");
@@ -1353,7 +1390,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) {
 
   RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
-  return TLI.makeLibCall(DAG, LC, RVT, &N->getOperand(0), 1, false, dl);
+  return TLI.makeLibCall(DAG, LC, RVT, &N->getOperand(0), 1, false, dl).first;
 }
 
 SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
@@ -1386,7 +1423,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
   RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
   return TLI.makeLibCall(DAG, LC, N->getValueType(0), &N->getOperand(0), 1,
-                         false, dl);
+                         false, dl).first;
 }
 
 SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) {
@@ -1445,7 +1482,5 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) {
   GetExpandedOp(ST->getValue(), Lo, Hi);
 
   return DAG.getTruncStore(Chain, SDLoc(N), Hi, Ptr,
-                           ST->getPointerInfo(),
-                           ST->getMemoryVT(), ST->isVolatile(),
-                           ST->isNonTemporal(), ST->getAlignment());
+                           ST->getMemoryVT(), ST->getMemOperand());
 }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index ff8f1f9..4255948 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -417,9 +417,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
     ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType();
   SDLoc dl(N);
   SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(),
-                               N->getPointerInfo(),
-                               N->getMemoryVT(), N->isVolatile(),
-                               N->isNonTemporal(), N->getAlignment());
+                               N->getMemoryVT(), N->getMemOperand());
 
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
@@ -919,7 +917,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) {
   // type does not have a strange size (eg: it is not i1).
   EVT VecVT = N->getValueType(0);
   unsigned NumElts = VecVT.getVectorNumElements();
-  assert(!(NumElts & 1) && "Legal vector of one illegal element?");
+  assert(!((NumElts & 1) && (!TLI.isTypeLegal(VecVT))) &&
+		 "Legal vector of one illegal element?");
 
   // Promote the inserted value.  The type does not need to match the
   // vector element type.  Check that any extra bits introduced will be
@@ -1037,17 +1036,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
   assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
   SDValue Ch = N->getChain(), Ptr = N->getBasePtr();
-  unsigned Alignment = N->getAlignment();
-  bool isVolatile = N->isVolatile();
-  bool isNonTemporal = N->isNonTemporal();
   SDLoc dl(N);
 
   SDValue Val = GetPromotedInteger(N->getValue());  // Get promoted value.
 
   // Truncate the value and store the result.
-  return DAG.getTruncStore(Ch, dl, Val, Ptr, N->getPointerInfo(),
-                           N->getMemoryVT(),
-                           isVolatile, isNonTemporal, Alignment);
+  return DAG.getTruncStore(Ch, dl, Val, Ptr,
+                           N->getMemoryVT(), N->getMemOperand());
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
@@ -1193,6 +1188,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break;
     case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break;
     case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_LOCK_TEST_AND_SET_16;break;
     }
     break;
   case ISD::ATOMIC_CMP_SWAP:
@@ -1202,6 +1198,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break;
     case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break;
     case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_ADD:
@@ -1211,6 +1208,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_ADD_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_SUB:
@@ -1220,6 +1218,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_SUB_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_AND:
@@ -1229,6 +1228,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_AND_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_OR:
@@ -1238,6 +1238,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_OR_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_XOR:
@@ -1247,6 +1248,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_XOR_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_NAND:
@@ -1256,6 +1258,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_NAND_16;break;
     }
     break;
   }
@@ -1770,7 +1773,8 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
   SDValue Op = N->getOperand(0);
   RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/, dl),
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/,
+                               dl).first,
                Lo, Hi);
 }
 
@@ -1781,7 +1785,8 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
   SDValue Op = N->getOperand(0);
   RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/, dl),
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/,
+                               dl).first,
                Lo, Hi);
 }
 
@@ -1803,6 +1808,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
   bool isVolatile = N->isVolatile();
   bool isNonTemporal = N->isNonTemporal();
   bool isInvariant = N->isInvariant();
+  const MDNode *TBAAInfo = N->getTBAAInfo();
   SDLoc dl(N);
 
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
@@ -1811,7 +1817,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
     EVT MemVT = N->getMemoryVT();
 
     Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
-                        MemVT, isVolatile, isNonTemporal, Alignment);
+                        MemVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
     // Remember the chain.
     Ch = Lo.getValue(1);
@@ -1833,7 +1839,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
   } else if (TLI.isLittleEndian()) {
     // Little-endian - low bits are at low addresses.
     Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(),
-                     isVolatile, isNonTemporal, isInvariant, Alignment);
+                     isVolatile, isNonTemporal, isInvariant, Alignment,
+                     TBAAInfo);
 
     unsigned ExcessBits =
       N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -1842,11 +1849,11 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
     // Increment the pointer to the other half.
     unsigned IncrementSize = NVT.getSizeInBits()/8;
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                      DAG.getIntPtrConstant(IncrementSize));
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
                         N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
                         isVolatile, isNonTemporal,
-                        MinAlign(Alignment, IncrementSize));
+                        MinAlign(Alignment, IncrementSize), TBAAInfo);
 
     // Build a factor node to remember that this load is independent of the
     // other one.
@@ -1864,17 +1871,17 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
     Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
                         EVT::getIntegerVT(*DAG.getContext(),
                                           MemVT.getSizeInBits() - ExcessBits),
-                        isVolatile, isNonTemporal, Alignment);
+                        isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
     // Increment the pointer to the other half.
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                      DAG.getIntPtrConstant(IncrementSize));
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
     // Load the rest of the low bits.
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr,
                         N->getPointerInfo().getWithOffset(IncrementSize),
                         EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
                         isVolatile, isNonTemporal,
-                        MinAlign(Alignment, IncrementSize));
+                        MinAlign(Alignment, IncrementSize), TBAAInfo);
 
     // Build a factor node to remember that this load is independent of the
     // other one.
@@ -1997,7 +2004,8 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!");
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/, dl),
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/,
+                               dl).first,
                Lo, Hi);
 }
 
@@ -2060,7 +2068,7 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl), Lo, Hi);
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
@@ -2155,7 +2163,8 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
 
   if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) {
     SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-    SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl), Lo, Hi);
+    SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl).first, Lo,
+                 Hi);
     return;
   }
 
@@ -2238,7 +2247,7 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl), Lo, Hi);
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
@@ -2378,7 +2387,7 @@ void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N,
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!");
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl), Lo, Hi);
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
@@ -2398,7 +2407,7 @@ void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!");
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl), Lo, Hi);
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
@@ -2685,7 +2694,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
   RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
          "Don't know how to expand this SINT_TO_FP!");
-  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, SDLoc(N));
+  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
@@ -2702,6 +2711,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
   unsigned Alignment = N->getAlignment();
   bool isVolatile = N->isVolatile();
   bool isNonTemporal = N->isNonTemporal();
+  const MDNode *TBAAInfo = N->getTBAAInfo();
   SDLoc dl(N);
   SDValue Lo, Hi;
 
@@ -2711,7 +2721,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
     GetExpandedInteger(N->getValue(), Lo, Hi);
     return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
                              N->getMemoryVT(), isVolatile, isNonTemporal,
-                             Alignment);
+                             Alignment, TBAAInfo);
   }
 
   if (TLI.isLittleEndian()) {
@@ -2719,7 +2729,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
     GetExpandedInteger(N->getValue(), Lo, Hi);
 
     Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
-                      isVolatile, isNonTemporal, Alignment);
+                      isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
     unsigned ExcessBits =
       N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -2728,11 +2738,11 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
     // Increment the pointer to the other half.
     unsigned IncrementSize = NVT.getSizeInBits()/8;
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                      DAG.getIntPtrConstant(IncrementSize));
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr,
                            N->getPointerInfo().getWithOffset(IncrementSize),
                            NEVT, isVolatile, isNonTemporal,
-                           MinAlign(Alignment, IncrementSize));
+                           MinAlign(Alignment, IncrementSize), TBAAInfo);
     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
   }
 
@@ -2760,17 +2770,17 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
 
   // Store both the high bits and maybe some of the low bits.
   Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(),
-                         HiVT, isVolatile, isNonTemporal, Alignment);
+                         HiVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
   // Increment the pointer to the other half.
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                    DAG.getIntPtrConstant(IncrementSize));
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
   // Store the lowest ExcessBits bits in the second half.
   Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr,
                          N->getPointerInfo().getWithOffset(IncrementSize),
                          EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
                          isVolatile, isNonTemporal,
-                         MinAlign(Alignment, IncrementSize));
+                         MinAlign(Alignment, IncrementSize), TBAAInfo);
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
 }
 
@@ -2835,7 +2845,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
     SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet,
                                    Zero, Four);
     unsigned Alignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlignment();
-    FudgePtr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), FudgePtr, Offset);
+    FudgePtr = DAG.getNode(ISD::ADD, dl, FudgePtr.getValueType(),
+                           FudgePtr, Offset);
     Alignment = std::min(Alignment, 4u);
 
     // Load the value out, extending it from f32 to the destination float type.
@@ -2852,7 +2863,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
   RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
          "Don't know how to expand this UINT_TO_FP!");
-  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl);
+  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl).first;
 }
 
 SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index fd770d1..eb13230 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -958,20 +958,6 @@ SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) {
   return SDValue(N->getOperand(ResNo));
 }
 
-/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
-/// which is split into two not necessarily identical pieces.
-void DAGTypeLegalizer::GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT) {
-  // Currently all types are split in half.
-  if (!InVT.isVector()) {
-    LoVT = HiVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
-  } else {
-    unsigned NumElements = InVT.getVectorNumElements();
-    assert(!(NumElements & 1) && "Splitting vector, but not in half!");
-    LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
-                                   InVT.getVectorElementType(), NumElements/2);
-  }
-}
-
 /// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
 /// high parts of the given value.
 void DAGTypeLegalizer::GetPairElements(SDValue Pair,
@@ -988,10 +974,7 @@ SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT,
                                                   SDValue Index) {
   SDLoc dl(Index);
   // Make sure the index type is big enough to compute in.
-  if (Index.getValueType().bitsGT(TLI.getPointerTy()))
-    Index = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Index);
-  else
-    Index = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Index);
+  Index = DAG.getZExtOrTrunc(Index, dl, TLI.getPointerTy());
 
   // Calculate the element offset and add it to the pointer.
   unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
@@ -1024,20 +1007,23 @@ SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N,
   unsigned NumOps = N->getNumOperands();
   SDLoc dl(N);
   if (NumOps == 0) {
-    return TLI.makeLibCall(DAG, LC, N->getValueType(0), 0, 0, isSigned, dl);
+    return TLI.makeLibCall(DAG, LC, N->getValueType(0), 0, 0, isSigned,
+                           dl).first;
   } else if (NumOps == 1) {
     SDValue Op = N->getOperand(0);
-    return TLI.makeLibCall(DAG, LC, N->getValueType(0), &Op, 1, isSigned, dl);
+    return TLI.makeLibCall(DAG, LC, N->getValueType(0), &Op, 1, isSigned,
+                           dl).first;
   } else if (NumOps == 2) {
     SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-    return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, 2, isSigned, dl);
+    return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, 2, isSigned,
+                           dl).first;
   }
   SmallVector<SDValue, 8> Ops(NumOps);
   for (unsigned i = 0; i < NumOps; ++i)
     Ops[i] = N->getOperand(i);
 
   return TLI.makeLibCall(DAG, LC, N->getValueType(0),
-                         &Ops[0], NumOps, isSigned, dl);
+                         &Ops[0], NumOps, isSigned, dl).first;
 }
 
 // ExpandChainLibCall - Expand a node into a call to a libcall. Similar to
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 63e9af3..13bb08f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -410,6 +410,7 @@ private:
   SDValue SoftenFloatRes_FPOWI(SDNode *N);
   SDValue SoftenFloatRes_FREM(SDNode *N);
   SDValue SoftenFloatRes_FRINT(SDNode *N);
+  SDValue SoftenFloatRes_FROUND(SDNode *N);
   SDValue SoftenFloatRes_FSIN(SDNode *N);
   SDValue SoftenFloatRes_FSQRT(SDNode *N);
   SDValue SoftenFloatRes_FSUB(SDNode *N);
@@ -470,6 +471,7 @@ private:
   void ExpandFloatRes_FPOWI     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FREM      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FRINT     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FROUND    (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSIN      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSQRT     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSUB      (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -480,6 +482,7 @@ private:
   // Float Operand Expansion.
   bool ExpandFloatOperand(SDNode *N, unsigned OperandNo);
   SDValue ExpandFloatOp_BR_CC(SDNode *N);
+  SDValue ExpandFloatOp_FCOPYSIGN(SDNode *N);
   SDValue ExpandFloatOp_FP_ROUND(SDNode *N);
   SDValue ExpandFloatOp_FP_TO_SINT(SDNode *N);
   SDValue ExpandFloatOp_FP_TO_UINT(SDNode *N);
@@ -534,7 +537,7 @@ private:
   // Vector Operand Scalarization: <1 x ty> -> ty.
   bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
   SDValue ScalarizeVecOp_BITCAST(SDNode *N);
-  SDValue ScalarizeVecOp_EXTEND(SDNode *N);
+  SDValue ScalarizeVecOp_UnaryOp(SDNode *N);
   SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
   SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo);
@@ -558,6 +561,7 @@ private:
   void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -628,6 +632,7 @@ private:
 
   SDValue WidenVecRes_Ternary(SDNode *N);
   SDValue WidenVecRes_Binary(SDNode *N);
+  SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
   SDValue WidenVecRes_Convert(SDNode *N);
   SDValue WidenVecRes_POWI(SDNode *N);
   SDValue WidenVecRes_Shift(SDNode *N);
@@ -699,10 +704,6 @@ private:
       GetExpandedFloat(Op, Lo, Hi);
   }
 
-  /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
-  /// which is split (or expanded) into two not necessarily identical pieces.
-  void GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT);
-
   /// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
   /// high parts of the given value.
   void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi);
@@ -730,6 +731,12 @@ private:
       GetExpandedFloat(Op, Lo, Hi);
   }
 
+
+  /// This function will split the integer \p Op into \p NumElements
+  /// operations of type \p EltVT and store them in \p Ops.
+  void IntegerToVector(SDValue Op, unsigned NumElements,
+                       SmallVectorImpl<SDValue> &Ops, EVT EltVT);
+
   // Generic Result Expansion.
   void ExpandRes_MERGE_VALUES      (SDNode *N, unsigned ResNo,
                                     SDValue &Lo, SDValue &Hi);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 96f6143..c749fde 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -77,13 +77,9 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
     case TargetLowering::TypeWidenVector: {
       assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
       InOp = GetWidenedVector(InOp);
-      EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
-                                   InVT.getVectorNumElements()/2);
-      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
-                       DAG.getConstant(0, TLI.getVectorIdxTy()));
-      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
-                       DAG.getConstant(InNVT.getVectorNumElements(),
-                                       TLI.getVectorIdxTy()));
+      EVT LoVT, HiVT;
+      llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(InVT);
+      llvm::tie(Lo, Hi) = DAG.SplitVector(InOp, dl, LoVT, HiVT);
       if (TLI.isBigEndian())
         std::swap(Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
@@ -169,7 +165,8 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
   // Increment the pointer to the other half.
   unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
   StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
-                         DAG.getIntPtrConstant(IncrementSize));
+                         DAG.getConstant(IncrementSize,
+                                         StackPtr.getValueType()));
 
   // Load the second half from the stack slot.
   Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
@@ -253,20 +250,22 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
   bool isVolatile = LD->isVolatile();
   bool isNonTemporal = LD->isNonTemporal();
   bool isInvariant = LD->isInvariant();
+  const MDNode *TBAAInfo = LD->getTBAAInfo();
 
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
 
   Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
-                   isVolatile, isNonTemporal, isInvariant, Alignment);
+                   isVolatile, isNonTemporal, isInvariant, Alignment,
+                   TBAAInfo);
 
   // Increment the pointer to the other half.
   unsigned IncrementSize = NVT.getSizeInBits() / 8;
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                    DAG.getIntPtrConstant(IncrementSize));
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
   Hi = DAG.getLoad(NVT, dl, Chain, Ptr,
                    LD->getPointerInfo().getWithOffset(IncrementSize),
                    isVolatile, isNonTemporal, isInvariant,
-                   MinAlign(Alignment, IncrementSize));
+                   MinAlign(Alignment, IncrementSize), TBAAInfo);
 
   // Build a factor node to remember that this load is independent of the
   // other one.
@@ -307,6 +306,25 @@ void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
 // Generic Operand Expansion.
 //===--------------------------------------------------------------------===//
 
+void DAGTypeLegalizer::IntegerToVector(SDValue Op, unsigned NumElements,
+                                       SmallVectorImpl<SDValue> &Ops,
+                                       EVT EltVT) {
+  assert(Op.getValueType().isInteger());
+  SDLoc DL(Op);
+  SDValue Parts[2];
+
+  if (NumElements > 1) {
+    NumElements >>= 1;
+    SplitInteger(Op, Parts[0], Parts[1]);
+      if (TLI.isBigEndian())
+        std::swap(Parts[0], Parts[1]);
+    IntegerToVector(Parts[0], NumElements, Ops, EltVT);
+    IntegerToVector(Parts[1], NumElements, Ops, EltVT);
+  } else {
+    Ops.push_back(DAG.getNode(ISD::BITCAST, DL, EltVT, Op));
+  }
+}
+
 SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) {
   SDLoc dl(N);
   if (N->getValueType(0).isVector()) {
@@ -315,21 +333,27 @@ SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) {
     // instead, but only if the new vector type is legal (otherwise there
     // is no point, and it might create expansion loops).  For example, on
     // x86 this turns v1i64 = BITCAST i64 into v1i64 = BITCAST v2i32.
+    //
+    // FIXME: I'm not sure why we are first trying to split the input into
+    // a 2 element vector, so I'm leaving it here to maintain the current
+    // behavior.
+    unsigned NumElts = 2;
     EVT OVT = N->getOperand(0).getValueType();
     EVT NVT = EVT::getVectorVT(*DAG.getContext(),
                                TLI.getTypeToTransformTo(*DAG.getContext(), OVT),
-                               2);
-
-    if (isTypeLegal(NVT)) {
-      SDValue Parts[2];
-      GetExpandedOp(N->getOperand(0), Parts[0], Parts[1]);
+                               NumElts);
+    if (!isTypeLegal(NVT)) {
+      // If we can't find a legal type by splitting the integer in half,
+      // then we can use the node's value type.
+      NumElts = N->getValueType(0).getVectorNumElements();
+      NVT = N->getValueType(0);
+    }
 
-      if (TLI.isBigEndian())
-        std::swap(Parts[0], Parts[1]);
+    SmallVector<SDValue, 8> Ops;
+    IntegerToVector(N->getOperand(0), NumElts, Ops, NVT.getVectorElementType());
 
-      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Parts, 2);
-      return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec);
-    }
+    SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &Ops[0], NumElts);
+    return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec);
   }
 
   // Otherwise, store to a temporary and load out again as the new type.
@@ -439,6 +463,7 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
   unsigned Alignment = St->getAlignment();
   bool isVolatile = St->isVolatile();
   bool isNonTemporal = St->isNonTemporal();
+  const MDNode *TBAAInfo = St->getTBAAInfo();
 
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
   unsigned IncrementSize = NVT.getSizeInBits() / 8;
@@ -450,15 +475,14 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
     std::swap(Lo, Hi);
 
   Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
-                    isVolatile, isNonTemporal, Alignment);
+                    isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                    DAG.getIntPtrConstant(IncrementSize));
-  assert(isTypeLegal(Ptr.getValueType()) && "Pointers must be legal!");
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
   Hi = DAG.getStore(Chain, dl, Hi, Ptr,
                     St->getPointerInfo().getWithOffset(IncrementSize),
                     isVolatile, isNonTemporal,
-                    MinAlign(Alignment, IncrementSize));
+                    MinAlign(Alignment, IncrementSize), TBAAInfo);
 
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
 }
@@ -489,14 +513,12 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
   SDValue Cond = N->getOperand(0);
   CL = CH = Cond;
   if (Cond.getValueType().isVector()) {
-    assert(Cond.getValueType().getVectorElementType() == MVT::i1 &&
-           "Condition legalized before result?");
-    unsigned NumElements = Cond.getValueType().getVectorNumElements();
-    EVT VCondTy = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElements / 2);
-    CL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VCondTy, Cond,
-                     DAG.getConstant(0, TLI.getVectorIdxTy()));
-    CH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VCondTy, Cond,
-                     DAG.getConstant(NumElements / 2, TLI.getVectorIdxTy()));
+    // Check if there are already splitted versions of the vector available and
+    // use those instead of splitting the mask operand again.
+    if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector)
+      GetSplitVector(Cond, CL, CH);
+    else
+      llvm::tie(CL, CH) = DAG.SplitVector(Cond, dl);
   }
 
   Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
@@ -518,7 +540,7 @@ void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
 
 void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) {
   EVT LoVT, HiVT;
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
   Lo = DAG.getUNDEF(LoVT);
   Hi = DAG.getUNDEF(HiVT);
 }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index bbe11b8..2c3cdcc 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -171,7 +171,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
         return TranslateLegalizeResults(Op, Result);
       case TargetLowering::Custom:
         Changed = true;
-        return LegalizeOp(TLI.LowerOperation(Result, DAG));
+        return TranslateLegalizeResults(Op, TLI.LowerOperation(Result, DAG));
       case TargetLowering::Expand:
         Changed = true;
         return LegalizeOp(ExpandStore(Op));
@@ -227,6 +227,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FP_TO_UINT:
   case ISD::FNEG:
   case ISD::FABS:
+  case ISD::FCOPYSIGN:
   case ISD::FSQRT:
   case ISD::FSIN:
   case ISD::FCOS:
@@ -241,6 +242,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FTRUNC:
   case ISD::FRINT:
   case ISD::FNEARBYINT:
+  case ISD::FROUND:
   case ISD::FFLOOR:
   case ISD::FP_ROUND:
   case ISD::FP_EXTEND:
@@ -416,7 +418,8 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
         ScalarLoad = DAG.getLoad(WideVT, dl, Chain, BasePTR,
                                  LD->getPointerInfo().getWithOffset(Offset),
                                  LD->isVolatile(), LD->isNonTemporal(),
-                                 LD->isInvariant(), LD->getAlignment());
+                                 LD->isInvariant(), LD->getAlignment(),
+                                 LD->getTBAAInfo());
       } else {
         EVT LoadVT = WideVT;
         while (RemainingBytes < LoadBytes) {
@@ -426,13 +429,14 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
         ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR,
                                     LD->getPointerInfo().getWithOffset(Offset),
                                     LoadVT, LD->isVolatile(),
-                                    LD->isNonTemporal(), LD->getAlignment());
+                                    LD->isNonTemporal(), LD->getAlignment(),
+                                    LD->getTBAAInfo());
       }
 
       RemainingBytes -= LoadBytes;
       Offset += LoadBytes;
       BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
-                            DAG.getIntPtrConstant(LoadBytes));
+                            DAG.getConstant(LoadBytes, BasePTR.getValueType()));
 
       LoadVals.push_back(ScalarLoad.getValue(0));
       LoadChains.push_back(ScalarLoad.getValue(1));
@@ -497,10 +501,10 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
                 Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride),
                 SrcVT.getScalarType(),
                 LD->isVolatile(), LD->isNonTemporal(),
-                LD->getAlignment());
+                LD->getAlignment(), LD->getTBAAInfo());
 
       BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
-                         DAG.getIntPtrConstant(Stride));
+                         DAG.getConstant(Stride, BasePTR.getValueType()));
 
       Vals.push_back(ScalarLoad.getValue(0));
       LoadChains.push_back(ScalarLoad.getValue(1));
@@ -529,6 +533,7 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
   unsigned Alignment = ST->getAlignment();
   bool isVolatile = ST->isVolatile();
   bool isNonTemporal = ST->isNonTemporal();
+  const MDNode *TBAAInfo = ST->getTBAAInfo();
 
   unsigned NumElem = StVT.getVectorNumElements();
   // The type of the data we want to save
@@ -556,10 +561,10 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
     // This scalar TruncStore may be illegal, but we legalize it later.
     SDValue Store = DAG.getTruncStore(Chain, dl, Ex, BasePTR,
                ST->getPointerInfo().getWithOffset(Idx*Stride), MemSclVT,
-               isVolatile, isNonTemporal, Alignment);
+               isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
     BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
-                                DAG.getIntPtrConstant(Stride));
+                               DAG.getConstant(Stride, BasePTR.getValueType()));
 
     Stores.push_back(Store);
   }
@@ -597,10 +602,7 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
     return DAG.UnrollVectorOp(Op.getNode());
 
   // Generate a mask operand.
-  EVT MaskTy = TLI.getSetCCResultType(*DAG.getContext(), VT);
-  assert(MaskTy.isVector() && "Invalid CC type");
-  assert(MaskTy.getSizeInBits() == Op1.getValueType().getSizeInBits()
-         && "Invalid mask size");
+  EVT MaskTy = VT.changeVectorElementTypeToInteger();
 
   // What is the size of each element in the vector mask.
   EVT BitTy = MaskTy.getScalarType();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 54380ec..f7a3e3d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -83,6 +83,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
   case ISD::FRINT:
+  case ISD::FROUND:
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
@@ -97,6 +98,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::ADD:
   case ISD::AND:
   case ISD::FADD:
+  case ISD::FCOPYSIGN:
   case ISD::FDIV:
   case ISD::FMUL:
   case ISD::FPOW:
@@ -215,7 +217,8 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
                                N->getPointerInfo(),
                                N->getMemoryVT().getVectorElementType(),
                                N->isVolatile(), N->isNonTemporal(),
-                               N->isInvariant(), N->getOriginalAlignment());
+                               N->isInvariant(), N->getOriginalAlignment(),
+                               N->getTBAAInfo());
 
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
@@ -369,7 +372,8 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
     case ISD::ANY_EXTEND:
     case ISD::ZERO_EXTEND:
     case ISD::SIGN_EXTEND:
-      Res = ScalarizeVecOp_EXTEND(N);
+    case ISD::TRUNCATE:
+      Res = ScalarizeVecOp_UnaryOp(N);
       break;
     case ISD::CONCAT_VECTORS:
       Res = ScalarizeVecOp_CONCAT_VECTORS(N);
@@ -408,7 +412,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
 
 /// ScalarizeVecOp_EXTEND - If the value to extend is a vector that needs
 /// to be scalarized, it must be <1 x ty>.  Extend the element instead.
-SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTEND(SDNode *N) {
+SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
   assert(N->getValueType(0).getVectorNumElements() == 1 &&
          "Unexected vector type!");
   SDValue Elt = GetScalarizedVector(N->getOperand(0));
@@ -455,12 +459,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
                              N->getBasePtr(), N->getPointerInfo(),
                              N->getMemoryVT().getVectorElementType(),
                              N->isVolatile(), N->isNonTemporal(),
-                             N->getAlignment());
+                             N->getAlignment(), N->getTBAAInfo());
 
   return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
                       N->getBasePtr(), N->getPointerInfo(),
                       N->isVolatile(), N->isNonTemporal(),
-                      N->getOriginalAlignment());
+                      N->getOriginalAlignment(), N->getTBAAInfo());
 }
 
 
@@ -517,7 +521,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
     SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
     break;
 
-  case ISD::ANY_EXTEND:
   case ISD::CONVERT_RNDSAT:
   case ISD::CTLZ:
   case ISD::CTTZ:
@@ -540,21 +543,27 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
   case ISD::FRINT:
+  case ISD::FROUND:
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
-  case ISD::SIGN_EXTEND:
   case ISD::SINT_TO_FP:
   case ISD::TRUNCATE:
   case ISD::UINT_TO_FP:
-  case ISD::ZERO_EXTEND:
     SplitVecRes_UnaryOp(N, Lo, Hi);
     break;
 
+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    SplitVecRes_ExtendOp(N, Lo, Hi);
+    break;
+
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
   case ISD::FADD:
+  case ISD::FCOPYSIGN:
   case ISD::FSUB:
   case ISD::FMUL:
   case ISD::SDIV:
@@ -615,7 +624,7 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
   // We know the result is a vector.  The input may be either a vector or a
   // scalar value.
   EVT LoVT, HiVT;
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
   SDLoc dl(N);
 
   SDValue InOp = N->getOperand(0);
@@ -670,7 +679,7 @@ void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
                                                 SDValue &Hi) {
   EVT LoVT, HiVT;
   SDLoc dl(N);
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
   unsigned LoNumElts = LoVT.getVectorNumElements();
   SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
   Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, &LoOps[0], LoOps.size());
@@ -691,7 +700,7 @@ void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
   }
 
   EVT LoVT, HiVT;
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
 
   SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
   Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, &LoOps[0], LoOps.size());
@@ -707,7 +716,7 @@ void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
   SDLoc dl(N);
 
   EVT LoVT, HiVT;
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
 
   Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
   uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
@@ -731,7 +740,8 @@ void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
   SDLoc dl(N);
 
   EVT LoVT, HiVT;
-  GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT(), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) =
+    DAG.GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT());
 
   Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
                    DAG.getValueType(LoVT));
@@ -783,7 +793,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   // Increment the pointer to the other part.
   unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
   StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
-                         DAG.getIntPtrConstant(IncrementSize));
+                       DAG.getConstant(IncrementSize, StackPtr.getValueType()));
 
   // Load the Hi part from the stack slot.
   Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
@@ -794,7 +804,7 @@ void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
                                                     SDValue &Hi) {
   EVT LoVT, HiVT;
   SDLoc dl(N);
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
   Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0));
   Hi = DAG.getUNDEF(HiVT);
 }
@@ -804,7 +814,7 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
   assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
   EVT LoVT, HiVT;
   SDLoc dl(LD);
-  GetSplitDestVTs(LD->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
 
   ISD::LoadExtType ExtType = LD->getExtensionType();
   SDValue Ch = LD->getChain();
@@ -815,20 +825,22 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
   bool isVolatile = LD->isVolatile();
   bool isNonTemporal = LD->isNonTemporal();
   bool isInvariant = LD->isInvariant();
+  const MDNode *TBAAInfo = LD->getTBAAInfo();
 
   EVT LoMemVT, HiMemVT;
-  GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+  llvm::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
 
   Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
                    LD->getPointerInfo(), LoMemVT, isVolatile, isNonTemporal,
-                   isInvariant, Alignment);
+                   isInvariant, Alignment, TBAAInfo);
 
   unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                    DAG.getIntPtrConstant(IncrementSize));
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
   Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset,
                    LD->getPointerInfo().getWithOffset(IncrementSize),
-                   HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment);
+                   HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment,
+                   TBAAInfo);
 
   // Build a factor node to remember that this load is independent of the
   // other one.
@@ -847,24 +859,12 @@ void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
 
   EVT LoVT, HiVT;
   SDLoc DL(N);
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
 
   // Split the input.
-  EVT InVT = N->getOperand(0).getValueType();
   SDValue LL, LH, RL, RH;
-  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
-                               LoVT.getVectorNumElements());
-  LL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(0),
-                   DAG.getConstant(0, TLI.getVectorIdxTy()));
-  LH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(0),
-                   DAG.getConstant(InNVT.getVectorNumElements(),
-                   TLI.getVectorIdxTy()));
-
-  RL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(1),
-                   DAG.getConstant(0, TLI.getVectorIdxTy()));
-  RH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(1),
-                   DAG.getConstant(InNVT.getVectorNumElements(),
-                   TLI.getVectorIdxTy()));
+  llvm::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+  llvm::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
 
   Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
   Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
@@ -875,22 +875,15 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
   // Get the dest types - they may not match the input types, e.g. int_to_fp.
   EVT LoVT, HiVT;
   SDLoc dl(N);
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
 
   // If the input also splits, handle it directly for a compile time speedup.
   // Otherwise split it by hand.
   EVT InVT = N->getOperand(0).getValueType();
-  if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) {
+  if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
     GetSplitVector(N->getOperand(0), Lo, Hi);
-  } else {
-    EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
-                                 LoVT.getVectorNumElements());
-    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
-                     DAG.getConstant(0, TLI.getVectorIdxTy()));
-    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
-                     DAG.getConstant(InNVT.getVectorNumElements(),
-                                     TLI.getVectorIdxTy()));
-  }
+  else
+    llvm::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
 
   if (N->getOpcode() == ISD::FP_ROUND) {
     Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1));
@@ -913,6 +906,58 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
   }
 }
 
+void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
+                                            SDValue &Hi) {
+  SDLoc dl(N);
+  EVT SrcVT = N->getOperand(0).getValueType();
+  EVT DestVT = N->getValueType(0);
+  EVT LoVT, HiVT;
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(DestVT);
+
+  // We can do better than a generic split operation if the extend is doing
+  // more than just doubling the width of the elements and the following are
+  // true:
+  //   - The number of vector elements is even,
+  //   - the source type is legal,
+  //   - the type of a split source is illegal,
+  //   - the type of an extended (by doubling element size) source is legal, and
+  //   - the type of that extended source when split is legal.
+  //
+  // This won't necessarily completely legalize the operation, but it will
+  // more effectively move in the right direction and prevent falling down
+  // to scalarization in many cases due to the input vector being split too
+  // far.
+  unsigned NumElements = SrcVT.getVectorNumElements();
+  if ((NumElements & 1) == 0 &&
+      SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
+    LLVMContext &Ctx = *DAG.getContext();
+    EVT NewSrcVT = EVT::getVectorVT(
+        Ctx, EVT::getIntegerVT(
+                 Ctx, SrcVT.getVectorElementType().getSizeInBits() * 2),
+        NumElements);
+    EVT SplitSrcVT =
+        EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2);
+    EVT SplitLoVT, SplitHiVT;
+    llvm::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT);
+    if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
+        TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
+      DEBUG(dbgs() << "Split vector extend via incremental extend:";
+            N->dump(&DAG); dbgs() << "\n");
+      // Extend the source vector by one step.
+      SDValue NewSrc =
+          DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
+      // Get the low and high halves of the new, extended one step, vector.
+      llvm::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl);
+      // Extend those vector halves the rest of the way.
+      Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
+      Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+      return;
+    }
+  }
+  // Fall back to the generic unary operator splitting otherwise.
+  SplitVecRes_UnaryOp(N, Lo, Hi);
+}
+
 void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
                                                   SDValue &Lo, SDValue &Hi) {
   // The low and high parts of the original input give four input vectors.
@@ -1105,41 +1150,23 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
   SDValue Mask = N->getOperand(0);
   SDValue Src0 = N->getOperand(1);
   SDValue Src1 = N->getOperand(2);
+  EVT Src0VT = Src0.getValueType();
   SDLoc DL(N);
-  EVT MaskVT = Mask.getValueType();
-  assert(MaskVT.isVector() && "VSELECT without a vector mask?");
+  assert(Mask.getValueType().isVector() && "VSELECT without a vector mask?");
 
   SDValue Lo, Hi;
   GetSplitVector(N->getOperand(0), Lo, Hi);
   assert(Lo.getValueType() == Hi.getValueType() &&
          "Lo and Hi have differing types");
 
-  unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
-  unsigned HiNumElts = Hi.getValueType().getVectorNumElements();
-  assert(LoNumElts == HiNumElts && "Asymmetric vector split?");
-
-  LLVMContext &Ctx = *DAG.getContext();
-  SDValue Zero = DAG.getConstant(0, TLI.getVectorIdxTy());
-  SDValue LoElts = DAG.getConstant(LoNumElts, TLI.getVectorIdxTy());
-  EVT Src0VT = Src0.getValueType();
-  EVT Src0EltTy = Src0VT.getVectorElementType();
-  EVT MaskEltTy = MaskVT.getVectorElementType();
-
-  EVT LoOpVT = EVT::getVectorVT(Ctx, Src0EltTy, LoNumElts);
-  EVT LoMaskVT = EVT::getVectorVT(Ctx, MaskEltTy, LoNumElts);
-  EVT HiOpVT = EVT::getVectorVT(Ctx, Src0EltTy, HiNumElts);
-  EVT HiMaskVT = EVT::getVectorVT(Ctx, MaskEltTy, HiNumElts);
+  EVT LoOpVT, HiOpVT;
+  llvm::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(Src0VT);
+  assert(LoOpVT == HiOpVT && "Asymmetric vector split?");
 
-  SDValue LoOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoOpVT, Src0, Zero);
-  SDValue LoOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoOpVT, Src1, Zero);
-
-  SDValue HiOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiOpVT, Src0, LoElts);
-  SDValue HiOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiOpVT, Src1, LoElts);
-
-  SDValue LoMask =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoMaskVT, Mask, Zero);
-  SDValue HiMask =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiMaskVT, Mask, LoElts);
+  SDValue LoOp0, HiOp0, LoOp1, HiOp1, LoMask, HiMask;
+  llvm::tie(LoOp0, HiOp0) = DAG.SplitVector(Src0, DL);
+  llvm::tie(LoOp1, HiOp1) = DAG.SplitVector(Src1, DL);
+  llvm::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL);
 
   SDValue LoSelect =
     DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1);
@@ -1249,33 +1276,34 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
   unsigned Alignment = N->getOriginalAlignment();
   bool isVol = N->isVolatile();
   bool isNT = N->isNonTemporal();
+  const MDNode *TBAAInfo = N->getTBAAInfo();
   SDValue Lo, Hi;
   GetSplitVector(N->getOperand(1), Lo, Hi);
 
   EVT LoMemVT, HiMemVT;
-  GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+  llvm::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
 
   unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
 
   if (isTruncating)
     Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
-                           LoMemVT, isVol, isNT, Alignment);
+                           LoMemVT, isVol, isNT, Alignment, TBAAInfo);
   else
     Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
-                      isVol, isNT, Alignment);
+                      isVol, isNT, Alignment, TBAAInfo);
 
   // Increment the pointer to the other half.
   Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
-                    DAG.getIntPtrConstant(IncrementSize));
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
 
   if (isTruncating)
     Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr,
                            N->getPointerInfo().getWithOffset(IncrementSize),
-                           HiMemVT, isVol, isNT, Alignment);
+                           HiMemVT, isVol, isNT, Alignment, TBAAInfo);
   else
     Hi = DAG.getStore(Ch, DL, Hi, Ptr,
                       N->getPointerInfo().getWithOffset(IncrementSize),
-                      isVol, isNT, Alignment);
+                      isVol, isNT, Alignment, TBAAInfo);
 
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
 }
@@ -1341,13 +1369,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) {
   SDLoc DL(N);
 
   // Extract the halves of the input via extract_subvector.
-  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
-                                 InVT.getVectorElementType(), NumElements/2);
-  SDValue InLoVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
-                                DAG.getConstant(0, TLI.getVectorIdxTy()));
-  SDValue InHiVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
-                                DAG.getConstant(NumElements/2,
-                                TLI.getVectorIdxTy()));
+  SDValue InLoVec, InHiVec;
+  llvm::tie(InLoVec, InHiVec) = DAG.SplitVector(InVec, DL);
   // Truncate them to 1/2 the element size.
   EVT HalfElementVT = EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
   EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
@@ -1446,27 +1469,31 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::VECTOR_SHUFFLE:
     Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
     break;
+
   case ISD::ADD:
   case ISD::AND:
   case ISD::BSWAP:
+  case ISD::MUL:
+  case ISD::MULHS:
+  case ISD::MULHU:
+  case ISD::OR:
+  case ISD::SUB:
+  case ISD::XOR:
+    Res = WidenVecRes_Binary(N);
+    break;
+
   case ISD::FADD:
   case ISD::FCOPYSIGN:
-  case ISD::FDIV:
   case ISD::FMUL:
   case ISD::FPOW:
-  case ISD::FREM:
   case ISD::FSUB:
-  case ISD::MUL:
-  case ISD::MULHS:
-  case ISD::MULHU:
-  case ISD::OR:
+  case ISD::FDIV:
+  case ISD::FREM:
   case ISD::SDIV:
-  case ISD::SREM:
   case ISD::UDIV:
+  case ISD::SREM:
   case ISD::UREM:
-  case ISD::SUB:
-  case ISD::XOR:
-    Res = WidenVecRes_Binary(N);
+    Res = WidenVecRes_BinaryCanTrap(N);
     break;
 
   case ISD::FPOWI:
@@ -1507,6 +1534,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FNEARBYINT:
   case ISD::FNEG:
   case ISD::FRINT:
+  case ISD::FROUND:
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
@@ -1534,6 +1562,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
 
 SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
   // Binary op widening.
+  SDLoc dl(N);
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+  return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
+  // Binary op widening for operations that can trap.
   unsigned Opcode = N->getOpcode();
   SDLoc dl(N);
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
@@ -2532,6 +2569,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
   bool      isVolatile = LD->isVolatile();
   bool      isNonTemporal = LD->isNonTemporal();
   bool      isInvariant = LD->isInvariant();
+  const MDNode *TBAAInfo = LD->getTBAAInfo();
 
   int LdWidth = LdVT.getSizeInBits();
   int WidthDiff = WidenWidth - LdWidth;          // Difference
@@ -2541,7 +2579,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
   EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
   int NewVTWidth = NewVT.getSizeInBits();
   SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
-                             isVolatile, isNonTemporal, isInvariant, Align);
+                             isVolatile, isNonTemporal, isInvariant, Align,
+                             TBAAInfo);
   LdChain.push_back(LdOp.getValue(1));
 
   // Check if we can load the element with one instruction
@@ -2577,7 +2616,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
     unsigned Increment = NewVTWidth / 8;
     Offset += Increment;
     BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                          DAG.getIntPtrConstant(Increment));
+                          DAG.getConstant(Increment, BasePtr.getValueType()));
 
     SDValue L;
     if (LdWidth < NewVTWidth) {
@@ -2586,7 +2625,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
       NewVTWidth = NewVT.getSizeInBits();
       L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
                       LD->getPointerInfo().getWithOffset(Offset), isVolatile,
-                      isNonTemporal, isInvariant, MinAlign(Align, Increment));
+                      isNonTemporal, isInvariant, MinAlign(Align, Increment),
+                      TBAAInfo);
       LdChain.push_back(L.getValue(1));
       if (L->getValueType(0).isVector()) {
         SmallVector<SDValue, 16> Loads;
@@ -2602,7 +2642,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
     } else {
       L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
                       LD->getPointerInfo().getWithOffset(Offset), isVolatile,
-                      isNonTemporal, isInvariant, MinAlign(Align, Increment));
+                      isNonTemporal, isInvariant, MinAlign(Align, Increment),
+                      TBAAInfo);
       LdChain.push_back(L.getValue(1));
     }
 
@@ -2682,6 +2723,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
   unsigned  Align    = LD->getAlignment();
   bool      isVolatile = LD->isVolatile();
   bool      isNonTemporal = LD->isNonTemporal();
+  const MDNode *TBAAInfo = LD->getTBAAInfo();
 
   EVT EltVT = WidenVT.getVectorElementType();
   EVT LdEltVT = LdVT.getVectorElementType();
@@ -2693,15 +2735,17 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
   unsigned Increment = LdEltVT.getSizeInBits() / 8;
   Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr,
                           LD->getPointerInfo(),
-                          LdEltVT, isVolatile, isNonTemporal, Align);
+                          LdEltVT, isVolatile, isNonTemporal, Align, TBAAInfo);
   LdChain.push_back(Ops[0].getValue(1));
   unsigned i = 0, Offset = Increment;
   for (i=1; i < NumElts; ++i, Offset += Increment) {
     SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
-                                     BasePtr, DAG.getIntPtrConstant(Offset));
+                                     BasePtr,
+                                     DAG.getConstant(Offset,
+                                                     BasePtr.getValueType()));
     Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
                             LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
-                            isVolatile, isNonTemporal, Align);
+                            isVolatile, isNonTemporal, Align, TBAAInfo);
     LdChain.push_back(Ops[i].getValue(1));
   }
 
@@ -2724,6 +2768,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
   unsigned Align = ST->getAlignment();
   bool     isVolatile = ST->isVolatile();
   bool     isNonTemporal = ST->isNonTemporal();
+  const MDNode *TBAAInfo = ST->getTBAAInfo();
   SDValue  ValOp = GetWidenedVector(ST->getValue());
   SDLoc dl(ST);
 
@@ -2750,12 +2795,12 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
         StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
                                     ST->getPointerInfo().getWithOffset(Offset),
                                        isVolatile, isNonTemporal,
-                                       MinAlign(Align, Offset)));
+                                       MinAlign(Align, Offset), TBAAInfo));
         StWidth -= NewVTWidth;
         Offset += Increment;
         Idx += NumVTElts;
         BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                              DAG.getIntPtrConstant(Increment));
+                              DAG.getConstant(Increment, BasePtr.getValueType()));
       } while (StWidth != 0 && StWidth >= NewVTWidth);
     } else {
       // Cast the vector to the scalar type we can store
@@ -2770,11 +2815,11 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
         StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
                                     ST->getPointerInfo().getWithOffset(Offset),
                                        isVolatile, isNonTemporal,
-                                       MinAlign(Align, Offset)));
+                                       MinAlign(Align, Offset), TBAAInfo));
         StWidth -= NewVTWidth;
         Offset += Increment;
         BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                              DAG.getIntPtrConstant(Increment));
+                            DAG.getConstant(Increment, BasePtr.getValueType()));
       } while (StWidth != 0 && StWidth >= NewVTWidth);
       // Restore index back to be relative to the original widen element type
       Idx = Idx * NewVTWidth / ValEltWidth;
@@ -2792,6 +2837,7 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
   unsigned Align = ST->getAlignment();
   bool     isVolatile = ST->isVolatile();
   bool     isNonTemporal = ST->isNonTemporal();
+  const MDNode *TBAAInfo = ST->getTBAAInfo();
   SDValue  ValOp = GetWidenedVector(ST->getValue());
   SDLoc dl(ST);
 
@@ -2814,17 +2860,19 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
                             DAG.getConstant(0, TLI.getVectorIdxTy()));
   StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr,
                                       ST->getPointerInfo(), StEltVT,
-                                      isVolatile, isNonTemporal, Align));
+                                      isVolatile, isNonTemporal, Align,
+                                      TBAAInfo));
   unsigned Offset = Increment;
   for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
     SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
-                                     BasePtr, DAG.getIntPtrConstant(Offset));
+                                     BasePtr, DAG.getConstant(Offset,
+                                                       BasePtr.getValueType()));
     SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
                             DAG.getConstant(0, TLI.getVectorIdxTy()));
     StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr,
                                       ST->getPointerInfo().getWithOffset(Offset),
                                         StEltVT, isVolatile, isNonTemporal,
-                                        MinAlign(Align, Offset)));
+                                        MinAlign(Align, Offset), TBAAInfo));
   }
 }
 
diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index d684164..1dd2128 100644
--- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -389,10 +389,9 @@ signed ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {
 // Constants used to denote relative importance of
 // heuristic components for cost computation.
 static const unsigned PriorityOne = 200;
-static const unsigned PriorityTwo = 100;
-static const unsigned PriorityThree = 50;
-static const unsigned PriorityFour = 15;
-static const unsigned PriorityFive = 5;
+static const unsigned PriorityTwo = 50;
+static const unsigned PriorityThree = 15;
+static const unsigned PriorityFour = 5;
 static const unsigned ScaleOne = 20;
 static const unsigned ScaleTwo = 10;
 static const unsigned ScaleThree = 5;
@@ -449,7 +448,7 @@ signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
     if (N->isMachineOpcode()) {
       const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
       if (TID.isCall())
-        ResCount += (PriorityThree + (ScaleThree*N->getNumValues()));
+        ResCount += (PriorityTwo + (ScaleThree*N->getNumValues()));
     }
     else
       switch (N->getOpcode()) {
@@ -457,11 +456,11 @@ signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
       case ISD::TokenFactor:
       case ISD::CopyFromReg:
       case ISD::CopyToReg:
-        ResCount += PriorityFive;
+        ResCount += PriorityFour;
         break;
 
       case ISD::INLINEASM:
-        ResCount += PriorityFour;
+        ResCount += PriorityThree;
         break;
       }
   }
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index f5fe168..1a562d7 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -718,7 +718,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
   // indicate the scheduled cycle.
   SU->setHeightToAtLeast(CurCycle);
 
-  // Reserve resources for the scheduled intruction.
+  // Reserve resources for the scheduled instruction.
   EmitNode(SU);
 
   Sequence.push_back(SU);
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 982dcc9..054e3dd 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -690,15 +690,6 @@ void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) {
 }
 #endif // NDEBUG
 
-namespace {
-  struct OrderSorter {
-    bool operator()(const std::pair<unsigned, MachineInstr*> &A,
-                    const std::pair<unsigned, MachineInstr*> &B) {
-      return A.first < B.first;
-    }
-  };
-}
-
 /// ProcessSDDbgValues - Process SDDbgValues associated with this node.
 static void
 ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
@@ -744,7 +735,10 @@ ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
   }
 
   MachineBasicBlock *BB = Emitter.getBlock();
-  if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI()) {
+  if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI() ||
+      // Fast-isel may have inserted some instructions, in which case the
+      // BB->back().isPHI() test will not fire when we want it to.
+      prior(Emitter.getInsertPos())->isPHI()) {
     // Did not insert any instruction.
     Orders.push_back(std::make_pair(Order, (MachineInstr*)0));
     return;
@@ -857,7 +851,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
 
     // Sort the source order instructions and use the order to insert debug
     // values.
-    std::sort(Orders.begin(), Orders.end(), OrderSorter());
+    std::sort(Orders.begin(), Orders.end(), less_first());
 
     SDDbgInfo::DbgIterator DI = DAG->DbgBegin();
     SDDbgInfo::DbgIterator DE = DAG->DbgEnd();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bc6063c..45d5a4f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -869,16 +869,19 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
 
 // EntryNode could meaningfully have debug info if we can find it...
 SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
-  : TM(tm), TSI(*tm.getSelectionDAGInfo()), TTI(0), OptLevel(OL),
+  : TM(tm), TSI(*tm.getSelectionDAGInfo()), TTI(0), TLI(0), OptLevel(OL),
     EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
-    Root(getEntryNode()), UpdateListeners(0) {
+    Root(getEntryNode()), NewNodesMustHaveLegalTypes(false),
+    UpdateListeners(0) {
   AllNodes.push_back(&EntryNode);
   DbgInfo = new SDDbgInfo();
 }
 
-void SelectionDAG::init(MachineFunction &mf, const TargetTransformInfo *tti) {
+void SelectionDAG::init(MachineFunction &mf, const TargetTransformInfo *tti,
+                        const TargetLowering *tli) {
   MF = &mf;
   TTI = tti;
+  TLI = tli;
   Context = &mf.getFunction()->getContext();
 }
 
@@ -983,6 +986,54 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
    APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits());
    Elt = ConstantInt::get(*getContext(), NewVal);
   }
+  // In other cases the element type is illegal and needs to be expanded, for
+  // example v2i64 on MIPS32. In this case, find the nearest legal type, split
+  // the value into n parts and use a vector type with n-times the elements.
+  // Then bitcast to the type requested.
+  // Legalizing constants too early makes the DAGCombiner's job harder so we
+  // only legalize if the DAG tells us we must produce legal types.
+  else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
+           TLI->getTypeAction(*getContext(), EltVT) ==
+           TargetLowering::TypeExpandInteger) {
+    APInt NewVal = Elt->getValue();
+    EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+    unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
+    unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
+    EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
+
+    // Check the temporary vector is the correct size. If this fails then
+    // getTypeToTransformTo() probably returned a type whose size (in bits)
+    // isn't a power-of-2 factor of the requested type size.
+    assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SmallVector<SDValue, 2> EltParts;
+    for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) {
+      EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits)
+                                           .trunc(ViaEltSizeInBits),
+                                     ViaEltVT, isT));
+    }
+
+    // EltParts is currently in little endian order. If we actually want
+    // big-endian order then reverse it now.
+    if (TLI->isBigEndian())
+      std::reverse(EltParts.begin(), EltParts.end());
+
+    // The elements must be reversed when the element order is different
+    // to the endianness of the elements (because the BITCAST is itself a
+    // vector shuffle in this situation). However, we do not need any code to
+    // perform this reversal because getConstant() is producing a vector
+    // splat.
+    // This situation occurs in MIPS MSA.
+
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
+      Ops.insert(Ops.end(), EltParts.begin(), EltParts.end());
+
+    SDValue Result = getNode(ISD::BITCAST, SDLoc(), VT,
+                             getNode(ISD::BUILD_VECTOR, SDLoc(), ViaVecVT,
+                                     &Ops[0], Ops.size()));
+    return Result;
+  }
 
   assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
          "APInt size does not match type size!");
@@ -1077,9 +1128,10 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL,
                                        unsigned char TargetFlags) {
   assert((TargetFlags == 0 || isTargetGA) &&
          "Cannot set target flags on target-independent globals");
+  const TargetLowering *TLI = TM.getTargetLowering();
 
   // Truncate (with sign-extension) the offset value to the pointer size.
-  unsigned BitWidth = TM.getTargetLowering()->getPointerTy().getSizeInBits();
+  unsigned BitWidth = TLI->getPointerTypeSizeInBits(GV->getType());
   if (BitWidth < 64)
     Offset = SignExtend64(Offset, BitWidth);
 
@@ -1298,11 +1350,8 @@ static void commuteShuffle(SDValue &N1, SDValue &N2, SmallVectorImpl<int> &M) {
 
 SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
                                        SDValue N2, const int *Mask) {
-  assert(N1.getValueType() == N2.getValueType() && "Invalid VECTOR_SHUFFLE");
-  assert(VT.isVector() && N1.getValueType().isVector() &&
-         "Vector Shuffle VTs must be a vectors");
-  assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType()
-         && "Vector Shuffle VTs must have same element type");
+  assert(VT == N1.getValueType() && VT == N2.getValueType() &&
+         "Invalid VECTOR_SHUFFLE");
 
   // Canonicalize shuffle undef, undef -> undef
   if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() == ISD::UNDEF)
@@ -1351,17 +1400,13 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
     commuteShuffle(N1, N2, MaskVec);
   }
 
-  // If Identity shuffle, or all shuffle in to undef, return that node.
-  bool AllUndef = true;
+  // If Identity shuffle return that node.
   bool Identity = true;
   for (unsigned i = 0; i != NElts; ++i) {
     if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false;
-    if (MaskVec[i] >= 0) AllUndef = false;
   }
-  if (Identity && NElts == N1.getValueType().getVectorNumElements())
+  if (Identity && NElts)
     return N1;
-  if (AllUndef)
-    return getUNDEF(VT);
 
   FoldingSetNodeID ID;
   SDValue Ops[2] = { N1, N2 };
@@ -1380,7 +1425,9 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
   memcpy(MaskAlloc, &MaskVec[0], NElts * sizeof(int));
 
   ShuffleVectorSDNode *N =
-    new (NodeAllocator) ShuffleVectorSDNode(VT, dl.getIROrder(), dl.getDebugLoc(), N1, N2, MaskAlloc);
+    new (NodeAllocator) ShuffleVectorSDNode(VT, dl.getIROrder(),
+                                            dl.getDebugLoc(), N1, N2,
+                                            MaskAlloc);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
@@ -1403,8 +1450,9 @@ SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl,
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl.getIROrder(), dl.getDebugLoc(), Ops, 5,
-                                                           Code);
+  CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl.getIROrder(),
+                                                           dl.getDebugLoc(),
+                                                           Ops, 5, Code);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
@@ -1447,7 +1495,8 @@ SDValue SelectionDAG::getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label) {
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(), dl.getDebugLoc(), Root, Label);
+  SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(),
+                                                dl.getDebugLoc(), Root, Label);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
@@ -1510,6 +1559,26 @@ SDValue SelectionDAG::getMDNode(const MDNode *MD) {
   return SDValue(N, 0);
 }
 
+/// getAddrSpaceCast - Return an AddrSpaceCastSDNode.
+SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr,
+                                       unsigned SrcAS, unsigned DestAS) {
+  SDValue Ops[] = {Ptr};
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), &Ops[0], 1);
+  ID.AddInteger(SrcAS);
+  ID.AddInteger(DestAS);
+
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) AddrSpaceCastSDNode(dl.getIROrder(),
+                                                      dl.getDebugLoc(),
+                                                      VT, Ptr, SrcAS, DestAS);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
 
 /// getShiftAmountOperand - Return the specified value casted to
 /// the target's desired shift amount type.
@@ -1561,7 +1630,12 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
   case ISD::SETFALSE:
   case ISD::SETFALSE2: return getConstant(0, VT);
   case ISD::SETTRUE:
-  case ISD::SETTRUE2:  return getConstant(1, VT);
+  case ISD::SETTRUE2: {
+    const TargetLowering *TLI = TM.getTargetLowering();
+    TargetLowering::BooleanContent Cnt = TLI->getBooleanContents(VT.isVector());
+    return getConstant(
+        Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, VT);
+  }
 
   case ISD::SETOEQ:
   case ISD::SETOGT:
@@ -1643,7 +1717,12 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
       }
     } else {
       // Ensure that the constant occurs on the RHS.
-      return getSetCC(dl, VT, N2, N1, ISD::getSetCCSwappedOperands(Cond));
+      ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
+      MVT CompVT = N1.getValueType().getSimpleVT();
+      if (!TM.getTargetLowering()->isCondCodeLegal(SwappedCond, CompVT))
+        return SDValue();
+
+      return getSetCC(dl, VT, N2, N1, SwappedCond);
     }
   }
 
@@ -1942,7 +2021,6 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
   case ISD::SIGN_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
     unsigned InBits = InVT.getScalarType().getSizeInBits();
-    APInt InSignBit = APInt::getSignBit(InBits);
     APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
 
     KnownZero = KnownZero.trunc(InBits);
@@ -2054,7 +2132,6 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
       const APInt &RA = Rem->getAPIntValue().abs();
       if (RA.isPowerOf2()) {
         APInt LowBits = RA - 1;
-        APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
         ComputeMaskedBits(Op.getOperand(0), KnownZero2,KnownOne2,Depth+1);
 
         // The low bits of the first operand are unchanged by the srem.
@@ -2150,7 +2227,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
   }
 
   case ISD::SIGN_EXTEND:
-    Tmp = VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+    Tmp =
+        VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
     return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
 
   case ISD::SIGN_EXTEND_INREG:
@@ -2411,7 +2489,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) {
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  SDNode *N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), getVTList(VT));
+  SDNode *N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(),
+                                         DL.getDebugLoc(), getVTList(VT));
   CSEMap.InsertNode(N, IP);
 
   AllNodes.push_back(N);
@@ -2672,10 +2751,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
       return SDValue(E, 0);
 
-    N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Operand);
+    N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+                                        DL.getDebugLoc(), VTs, Operand);
     CSEMap.InsertNode(N, IP);
   } else {
-    N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Operand);
+    N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+                                        DL.getDebugLoc(), VTs, Operand);
   }
 
   AllNodes.push_back(N);
@@ -3073,9 +3154,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
     if (VT.isSimple() && N1.getValueType().isSimple()) {
       assert(VT.isVector() && N1.getValueType().isVector() &&
              "Extract subvector VTs must be a vectors!");
-      assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType() &&
+      assert(VT.getVectorElementType() ==
+             N1.getValueType().getVectorElementType() &&
              "Extract subvector VTs must have the same element type!");
-      assert(VT.getSimpleVT() <= N1.getValueType().getSimpleVT() &&
+      assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
              "Extract subvector must be from larger vector to smaller vector!");
 
       if (isa<ConstantSDNode>(Index.getNode())) {
@@ -3086,7 +3168,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
       }
 
       // Trivial extraction.
-      if (VT.getSimpleVT() == N1.getValueType().getSimpleVT())
+      if (VT.getSimpleVT() == N1.getSimpleValueType())
         return N1;
     }
     break;
@@ -3244,10 +3326,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
       return SDValue(E, 0);
 
-    N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2);
+    N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+                                         DL.getDebugLoc(), VTs, N1, N2);
     CSEMap.InsertNode(N, IP);
   } else {
-    N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2);
+    N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+                                         DL.getDebugLoc(), VTs, N1, N2);
   }
 
   AllNodes.push_back(N);
@@ -3316,7 +3400,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
              "Insert subvector VTs must be a vectors");
       assert(VT == N1.getValueType() &&
              "Dest and insert subvector source types must match!");
-      assert(N2.getValueType().getSimpleVT() <= N1.getValueType().getSimpleVT() &&
+      assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
              "Insert subvector must be from smaller vector to larger vector!");
       if (isa<ConstantSDNode>(Index.getNode())) {
         assert((N2.getValueType().getVectorNumElements() +
@@ -3326,7 +3410,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
       }
 
       // Trivial insertion.
-      if (VT.getSimpleVT() == N2.getValueType().getSimpleVT())
+      if (VT.getSimpleVT() == N2.getSimpleValueType())
         return N2;
     }
     break;
@@ -3349,10 +3433,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
       return SDValue(E, 0);
 
-    N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2, N3);
+    N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+                                          DL.getDebugLoc(), VTs, N1, N2, N3);
     CSEMap.InsertNode(N, IP);
   } else {
-    N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2, N3);
+    N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+                                          DL.getDebugLoc(), VTs, N1, N2, N3);
   }
 
   AllNodes.push_back(N);
@@ -3771,7 +3857,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
   for (unsigned i = 0; i < NumMemOps; i++) {
     EVT VT = MemOps[i];
     unsigned VTSize = VT.getSizeInBits() / 8;
-    SDValue Value, Store;
+    SDValue Value;
 
     Value = DAG.getLoad(VT, dl, Chain,
                         getMemBasePlusOffset(Src, SrcOff, dl, DAG),
@@ -3787,7 +3873,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
   for (unsigned i = 0; i < NumMemOps; i++) {
     EVT VT = MemOps[i];
     unsigned VTSize = VT.getSizeInBits() / 8;
-    SDValue Value, Store;
+    SDValue Store;
 
     Store = DAG.getStore(Chain, dl, LoadValues[i],
                          getMemBasePlusOffset(Dst, DstOff, dl, DAG),
@@ -3800,6 +3886,24 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
                      &OutChains[0], OutChains.size());
 }
 
+/// \brief Lower the call to 'memset' intrinsic function into a series of store
+/// operations.
+///
+/// \param DAG Selection DAG where lowered code is placed.
+/// \param dl Link to corresponding IR location.
+/// \param Chain Control flow dependency.
+/// \param Dst Pointer to destination memory location.
+/// \param Src Value of byte to write into the memory.
+/// \param Size Number of bytes to write.
+/// \param Align Alignment of the destination in bytes.
+/// \param isVol True if destination is volatile.
+/// \param DstPtrInfo IR information on the memory pointer.
+/// \returns New head in the control flow, if lowering was successful, empty
+/// SDValue otherwise.
+///
+/// The function tries to replace 'llvm.memset' intrinsic with several store
+/// operations and value calculation code. This is usually profitable for small
+/// memory size.
 static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl,
                                SDValue Chain, SDValue Dst,
                                SDValue Src, uint64_t Size,
@@ -4078,6 +4182,37 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
 }
 
 SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
+                                SDVTList VTList, SDValue* Ops, unsigned NumOps,
+                                MachineMemOperand *MMO,
+                                AtomicOrdering Ordering,
+                                SynchronizationScope SynchScope) {
+  FoldingSetNodeID ID;
+  ID.AddInteger(MemVT.getRawBits());
+  AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+
+  // Allocate the operands array for the node out of the BumpPtrAllocator, since
+  // SDNode doesn't have access to it.  This memory will be "leaked" when
+  // the node is deallocated, but recovered when the allocator is released.
+  // If the number of operands is less than 5 we use AtomicSDNode's internal
+  // storage.
+  SDUse *DynOps = NumOps > 4 ? OperandAllocator.Allocate<SDUse>(NumOps) : 0;
+
+  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(),
+                                               dl.getDebugLoc(), VTList, MemVT,
+                                               Ops, DynOps, NumOps, MMO,
+                                               Ordering, SynchScope);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
                                 SDValue Chain, SDValue Ptr, SDValue Cmp,
                                 SDValue Swp, MachinePointerInfo PtrInfo,
                                 unsigned Alignment,
@@ -4117,22 +4252,8 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
   EVT VT = Cmp.getValueType();
 
   SDVTList VTs = getVTList(VT, MVT::Other);
-  FoldingSetNodeID ID;
-  ID.AddInteger(MemVT.getRawBits());
   SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
-  AddNodeIDNode(ID, Opcode, VTs, Ops, 4);
-  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
-  void* IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
-    cast<AtomicSDNode>(E)->refineAlignment(MMO);
-    return SDValue(E, 0);
-  }
-  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, MemVT, Chain,
-                                               Ptr, Cmp, Swp, MMO, Ordering,
-                                               SynchScope);
-  CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
-  return SDValue(N, 0);
+  return getAtomic(Opcode, dl, MemVT, VTs, Ops, 4, MMO, Ordering, SynchScope);
 }
 
 SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
@@ -4190,22 +4311,8 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
 
   SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) :
                                                getVTList(VT, MVT::Other);
-  FoldingSetNodeID ID;
-  ID.AddInteger(MemVT.getRawBits());
   SDValue Ops[] = {Chain, Ptr, Val};
-  AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
-  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
-  void* IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
-    cast<AtomicSDNode>(E)->refineAlignment(MMO);
-    return SDValue(E, 0);
-  }
-  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, MemVT, Chain,
-                                               Ptr, Val, MMO,
-                                               Ordering, SynchScope);
-  CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
-  return SDValue(N, 0);
+  return getAtomic(Opcode, dl, MemVT, VTs, Ops, 3, MMO, Ordering, SynchScope);
 }
 
 SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
@@ -4248,21 +4355,8 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
   assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");
 
   SDVTList VTs = getVTList(VT, MVT::Other);
-  FoldingSetNodeID ID;
-  ID.AddInteger(MemVT.getRawBits());
   SDValue Ops[] = {Chain, Ptr};
-  AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
-  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
-  void* IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
-    cast<AtomicSDNode>(E)->refineAlignment(MMO);
-    return SDValue(E, 0);
-  }
-  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, MemVT, Chain,
-                                               Ptr, MMO, Ordering, SynchScope);
-  CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
-  return SDValue(N, 0);
+  return getAtomic(Opcode, dl, MemVT, VTs, Ops, 2, MMO, Ordering, SynchScope);
 }
 
 /// getMergeValues - Create a MERGE_VALUES node from the given operands.
@@ -4339,12 +4433,14 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList,
       return SDValue(E, 0);
     }
 
-    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, Ops, NumOps,
-                                               MemVT, MMO);
+    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(),
+                                               dl.getDebugLoc(), VTList, Ops,
+                                               NumOps, MemVT, MMO);
     CSEMap.InsertNode(N, IP);
   } else {
-    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, Ops, NumOps,
-                                               MemVT, MMO);
+    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(),
+                                               dl.getDebugLoc(), VTList, Ops,
+                                               NumOps, MemVT, MMO);
   }
   AllNodes.push_back(N);
   return SDValue(N, 0);
@@ -4458,7 +4554,8 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
     cast<LoadSDNode>(E)->refineAlignment(MMO);
     return SDValue(E, 0);
   }
-  SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, AM, ExtType,
+  SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl.getIROrder(),
+                                             dl.getDebugLoc(), VTs, AM, ExtType,
                                              MemVT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
@@ -4478,6 +4575,14 @@ SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl,
                  TBAAInfo, Ranges);
 }
 
+SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl,
+                              SDValue Chain, SDValue Ptr,
+                              MachineMemOperand *MMO) {
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+                 VT, MMO);
+}
+
 SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
                                  SDValue Chain, SDValue Ptr,
                                  MachinePointerInfo PtrInfo, EVT MemVT,
@@ -4490,6 +4595,14 @@ SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
 }
 
 
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
+                                 SDValue Chain, SDValue Ptr, EVT MemVT,
+                                 MachineMemOperand *MMO) {
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
+                 MemVT, MMO);
+}
+
 SDValue
 SelectionDAG::getIndexedLoad(SDValue OrigLoad, SDLoc dl, SDValue Base,
                              SDValue Offset, ISD::MemIndexedMode AM) {
@@ -4548,8 +4661,9 @@ SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val,
     cast<StoreSDNode>(E)->refineAlignment(MMO);
     return SDValue(E, 0);
   }
-  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, ISD::UNINDEXED,
-                                              false, VT, MMO);
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
+                                              dl.getDebugLoc(), VTs,
+                                              ISD::UNINDEXED, false, VT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
@@ -4616,8 +4730,9 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val,
     cast<StoreSDNode>(E)->refineAlignment(MMO);
     return SDValue(E, 0);
   }
-  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, ISD::UNINDEXED,
-                                              true, SVT, MMO);
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
+                                              dl.getDebugLoc(), VTs,
+                                              ISD::UNINDEXED, true, SVT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
@@ -4640,7 +4755,8 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base,
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
+                                              dl.getDebugLoc(), VTs, AM,
                                               ST->isTruncatingStore(),
                                               ST->getMemoryVT(),
                                               ST->getMemOperand());
@@ -4715,10 +4831,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
       return SDValue(E, 0);
 
-    N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Ops, NumOps);
+    N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+                                   VTs, Ops, NumOps);
     CSEMap.InsertNode(N, IP);
   } else {
-    N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Ops, NumOps);
+    N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+                                   VTs, Ops, NumOps);
   }
 
   AllNodes.push_back(N);
@@ -4781,26 +4899,36 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
       return SDValue(E, 0);
 
     if (NumOps == 1) {
-      N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0]);
+      N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+                                          DL.getDebugLoc(), VTList, Ops[0]);
     } else if (NumOps == 2) {
-      N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1]);
+      N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+                                           DL.getDebugLoc(), VTList, Ops[0],
+                                           Ops[1]);
     } else if (NumOps == 3) {
-      N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1],
-                                            Ops[2]);
+      N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+                                            DL.getDebugLoc(), VTList, Ops[0],
+                                            Ops[1], Ops[2]);
     } else {
-      N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops, NumOps);
+      N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+                                     VTList, Ops, NumOps);
     }
     CSEMap.InsertNode(N, IP);
   } else {
     if (NumOps == 1) {
-      N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0]);
+      N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+                                          DL.getDebugLoc(), VTList, Ops[0]);
     } else if (NumOps == 2) {
-      N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1]);
+      N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+                                           DL.getDebugLoc(), VTList, Ops[0],
+                                           Ops[1]);
     } else if (NumOps == 3) {
-      N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1],
-                                            Ops[2]);
+      N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+                                            DL.getDebugLoc(), VTList, Ops[0],
+                                            Ops[1], Ops[2]);
     } else {
-      N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops, NumOps);
+      N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+                                     VTList, Ops, NumOps);
     }
   }
   AllNodes.push_back(N);
@@ -4851,76 +4979,81 @@ SDVTList SelectionDAG::getVTList(EVT VT) {
 }
 
 SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
-  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
-       E = VTList.rend(); I != E; ++I)
-    if (I->NumVTs == 2 && I->VTs[0] == VT1 && I->VTs[1] == VT2)
-      return *I;
-
-  EVT *Array = Allocator.Allocate<EVT>(2);
-  Array[0] = VT1;
-  Array[1] = VT2;
-  SDVTList Result = makeVTList(Array, 2);
-  VTList.push_back(Result);
-  return Result;
+  FoldingSetNodeID ID;
+  ID.AddInteger(2U);
+  ID.AddInteger(VT1.getRawBits());
+  ID.AddInteger(VT2.getRawBits());
+
+  void *IP = 0;
+  SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+  if (Result == NULL) {
+    EVT *Array = Allocator.Allocate<EVT>(2);
+    Array[0] = VT1;
+    Array[1] = VT2;
+    Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 2);
+    VTListMap.InsertNode(Result, IP);
+  }
+  return Result->getSDVTList();
 }
 
 SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
-  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
-       E = VTList.rend(); I != E; ++I)
-    if (I->NumVTs == 3 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
-                          I->VTs[2] == VT3)
-      return *I;
-
-  EVT *Array = Allocator.Allocate<EVT>(3);
-  Array[0] = VT1;
-  Array[1] = VT2;
-  Array[2] = VT3;
-  SDVTList Result = makeVTList(Array, 3);
-  VTList.push_back(Result);
-  return Result;
+  FoldingSetNodeID ID;
+  ID.AddInteger(3U);
+  ID.AddInteger(VT1.getRawBits());
+  ID.AddInteger(VT2.getRawBits());
+  ID.AddInteger(VT3.getRawBits());
+
+  void *IP = 0;
+  SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+  if (Result == NULL) {
+    EVT *Array = Allocator.Allocate<EVT>(3);
+    Array[0] = VT1;
+    Array[1] = VT2;
+    Array[2] = VT3;
+    Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 3);
+    VTListMap.InsertNode(Result, IP);
+  }
+  return Result->getSDVTList();
 }
 
 SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
-  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
-       E = VTList.rend(); I != E; ++I)
-    if (I->NumVTs == 4 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
-                          I->VTs[2] == VT3 && I->VTs[3] == VT4)
-      return *I;
-
-  EVT *Array = Allocator.Allocate<EVT>(4);
-  Array[0] = VT1;
-  Array[1] = VT2;
-  Array[2] = VT3;
-  Array[3] = VT4;
-  SDVTList Result = makeVTList(Array, 4);
-  VTList.push_back(Result);
-  return Result;
+  FoldingSetNodeID ID;
+  ID.AddInteger(4U);
+  ID.AddInteger(VT1.getRawBits());
+  ID.AddInteger(VT2.getRawBits());
+  ID.AddInteger(VT3.getRawBits());
+  ID.AddInteger(VT4.getRawBits());
+
+  void *IP = 0;
+  SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+  if (Result == NULL) {
+    EVT *Array = Allocator.Allocate<EVT>(4);
+    Array[0] = VT1;
+    Array[1] = VT2;
+    Array[2] = VT3;
+    Array[3] = VT4;
+    Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 4);
+    VTListMap.InsertNode(Result, IP);
+  }
+  return Result->getSDVTList();
 }
 
 SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) {
-  switch (NumVTs) {
-    case 0: llvm_unreachable("Cannot have nodes without results!");
-    case 1: return getVTList(VTs[0]);
-    case 2: return getVTList(VTs[0], VTs[1]);
-    case 3: return getVTList(VTs[0], VTs[1], VTs[2]);
-    case 4: return getVTList(VTs[0], VTs[1], VTs[2], VTs[3]);
-    default: break;
+  FoldingSetNodeID ID;
+  ID.AddInteger(NumVTs);
+  for (unsigned index = 0; index < NumVTs; index++) {
+    ID.AddInteger(VTs[index].getRawBits());
   }
 
-  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
-       E = VTList.rend(); I != E; ++I) {
-    if (I->NumVTs != NumVTs || VTs[0] != I->VTs[0] || VTs[1] != I->VTs[1])
-      continue;
-
-    if (std::equal(&VTs[2], &VTs[NumVTs], &I->VTs[2]))
-      return *I;
+  void *IP = 0;
+  SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+  if (Result == NULL) {
+    EVT *Array = Allocator.Allocate<EVT>(NumVTs);
+    std::copy(VTs, VTs + NumVTs, Array);
+    Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs);
+    VTListMap.InsertNode(Result, IP);
   }
-
-  EVT *Array = Allocator.Allocate<EVT>(NumVTs);
-  std::copy(VTs, VTs+NumVTs, Array);
-  SDVTList Result = makeVTList(Array, NumVTs);
-  VTList.push_back(Result);
-  return Result;
+  return Result->getSDVTList();
 }
 
 
@@ -5410,7 +5543,8 @@ SelectionDAG::getMachineNode(unsigned Opcode, SDLoc DL, SDVTList VTs,
   }
 
   // Allocate a new MachineSDNode.
-  N = new (NodeAllocator) MachineSDNode(~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+  N = new (NodeAllocator) MachineSDNode(~Opcode, DL.getIROrder(),
+                                        DL.getDebugLoc(), VTs);
 
   // Initialize the operands list.
   if (NumOps > array_lengthof(N->LocalOperands))
@@ -5916,6 +6050,12 @@ GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
   TheGlobal = GA;
 }
 
+AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, DebugLoc dl, EVT VT,
+                                         SDValue X, unsigned SrcAS,
+                                         unsigned DestAS)
+ : UnarySDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT), X),
+   SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}
+
 MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
                      EVT memvt, MachineMemOperand *mmo)
  : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
@@ -6162,8 +6302,8 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
     case ISD::ROTL:
     case ISD::ROTR:
       Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0],
-                                getShiftAmountOperand(Operands[0].getValueType(),
-                                                      Operands[1])));
+                               getShiftAmountOperand(Operands[0].getValueType(),
+                                                     Operands[1])));
       break;
     case ISD::SIGN_EXTEND_INREG:
     case ISD::FP_ROUND_INREG: {
@@ -6235,7 +6375,7 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
   int64_t GVOffset = 0;
   const TargetLowering *TLI = TM.getTargetLowering();
   if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
-    unsigned PtrWidth = TLI->getPointerTy().getSizeInBits();
+    unsigned PtrWidth = TLI->getPointerTypeSizeInBits(GV->getType());
     APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0);
     llvm::ComputeMaskedBits(const_cast<GlobalValue*>(GV), KnownZero, KnownOne,
                             TLI->getDataLayout());
@@ -6268,6 +6408,38 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
   return 0;
 }
 
+/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+/// which is split (or expanded) into two not necessarily identical pieces.
+std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
+  // Currently all types are split in half.
+  EVT LoVT, HiVT;
+  if (!VT.isVector()) {
+    LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT);
+  } else {
+    unsigned NumElements = VT.getVectorNumElements();
+    assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+    LoVT = HiVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(),
+                                   NumElements/2);
+  }
+  return std::make_pair(LoVT, HiVT);
+}
+
+/// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
+/// low/high part.
+std::pair<SDValue, SDValue>
+SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
+                          const EVT &HiVT) {
+  assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <=
+         N.getValueType().getVectorNumElements() &&
+         "More vector elements requested than available!");
+  SDValue Lo, Hi;
+  Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
+               getConstant(0, TLI->getVectorIdxTy()));
+  Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
+               getConstant(LoVT.getVectorNumElements(), TLI->getVectorIdxTy()));
+  return std::make_pair(Lo, Hi);
+}
+
 // getAddressSpace - Return the address space this GlobalAddress belongs to.
 unsigned GlobalAddressSDNode::getAddressSpace() const {
   return getGlobal()->getType()->getAddressSpace();
@@ -6389,7 +6561,7 @@ static void checkForCyclesHelper(const SDNode *N,
 
 void llvm::checkForCycles(const llvm::SDNode *N) {
 #ifdef XDEBUG
-  assert(N && "Checking nonexistant SDNode");
+  assert(N && "Checking nonexistent SDNode");
   SmallPtrSet<const SDNode*, 32> visited;
   SmallPtrSet<const SDNode*, 32> checked;
   checkForCyclesHelper(N, visited, checked);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b9f4381..2b2713d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -33,6 +33,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
@@ -49,7 +50,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/IntegersSubsetMapping.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -58,6 +58,7 @@
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -1063,8 +1064,10 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
     if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
       return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);
 
-    if (isa<ConstantPointerNull>(C))
-      return DAG.getConstant(0, TLI->getPointerTy());
+    if (isa<ConstantPointerNull>(C)) {
+      unsigned AS = V->getType()->getPointerAddressSpace();
+      return DAG.getConstant(0, TLI->getPointerTy(AS));
+    }
 
     if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
       return DAG.getConstantFP(*CFP, VT);
@@ -1268,7 +1271,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
 
         for (unsigned i = 0; i < NumParts; ++i) {
           Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
-                                        /*isfixed=*/true, 0, 0));
+                                        VT, /*isfixed=*/true, 0, 0));
           OutVals.push_back(Parts[i]);
         }
       }
@@ -1617,8 +1620,7 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
     } else
       Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC);
   } else {
-    assert(CB.CC == ISD::SETCC_INVALID &&
-           "Condition is undefined for to-the-range belonging check.");
+    assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");
 
     const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
     const APInt& High  = cast<ConstantInt>(CB.CmpRHS)->getValue();
@@ -1626,9 +1628,9 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
     SDValue CmpOp = getValue(CB.CmpMHS);
     EVT VT = CmpOp.getValueType();
 
-    if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(false)) {
+    if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
       Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, VT),
-                          ISD::SETULE);
+                          ISD::SETLE);
     } else {
       SDValue SUB = DAG.getNode(ISD::SUB, dl,
                                 VT, CmpOp, DAG.getConstant(Low, VT));
@@ -1741,6 +1743,77 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
   DAG.setRoot(BrCond);
 }
 
+/// Codegen a new tail for a stack protector check ParentMBB which has had its
+/// tail spliced into a stack protector check success bb.
+///
+/// For a high level explanation of how this fits into the stack protector
+/// generation see the comment on the declaration of class
+/// StackProtectorDescriptor.
+void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
+                                                  MachineBasicBlock *ParentBB) {
+
+  // First create the loads to the guard/stack slot for the comparison.
+  const TargetLowering *TLI = TM.getTargetLowering();
+  EVT PtrTy = TLI->getPointerTy();
+
+  MachineFrameInfo *MFI = ParentBB->getParent()->getFrameInfo();
+  int FI = MFI->getStackProtectorIndex();
+
+  const Value *IRGuard = SPD.getGuard();
+  SDValue GuardPtr = getValue(IRGuard);
+  SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
+
+  unsigned Align =
+    TLI->getDataLayout()->getPrefTypeAlignment(IRGuard->getType());
+  SDValue Guard = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(),
+                              GuardPtr, MachinePointerInfo(IRGuard, 0),
+                              true, false, false, Align);
+
+  SDValue StackSlot = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(),
+                                  StackSlotPtr,
+                                  MachinePointerInfo::getFixedStack(FI),
+                                  true, false, false, Align);
+
+  // Perform the comparison via a subtract/getsetcc.
+  EVT VT = Guard.getValueType();
+  SDValue Sub = DAG.getNode(ISD::SUB, getCurSDLoc(), VT, Guard, StackSlot);
+
+  SDValue Cmp = DAG.getSetCC(getCurSDLoc(),
+                             TLI->getSetCCResultType(*DAG.getContext(),
+                                                     Sub.getValueType()),
+                             Sub, DAG.getConstant(0, VT),
+                             ISD::SETNE);
+
+  // If the sub is not 0, then we know the guard/stackslot do not equal, so
+  // branch to failure MBB.
+  SDValue BrCond = DAG.getNode(ISD::BRCOND, getCurSDLoc(),
+                               MVT::Other, StackSlot.getOperand(0),
+                               Cmp, DAG.getBasicBlock(SPD.getFailureMBB()));
+  // Otherwise branch to success MBB.
+  SDValue Br = DAG.getNode(ISD::BR, getCurSDLoc(),
+                           MVT::Other, BrCond,
+                           DAG.getBasicBlock(SPD.getSuccessMBB()));
+
+  DAG.setRoot(Br);
+}
+
+/// Codegen the failure basic block for a stack protector check.
+///
+/// A failure stack protector machine basic block consists simply of a call to
+/// __stack_chk_fail().
+///
+/// For a high level explanation of how this fits into the stack protector
+/// generation see the comment on the declaration of class
+/// StackProtectorDescriptor.
+void
+SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
+  const TargetLowering *TLI = TM.getTargetLowering();
+  SDValue Chain = TLI->makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL,
+                                   MVT::isVoid, 0, 0, false, getCurSDLoc(),
+                                   false, false).second;
+  DAG.setRoot(Chain);
+}
+
 /// visitBitTestHeader - This function emits necessary code to produce value
 /// suitable for "bit tests"
 void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
@@ -2073,7 +2146,7 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
       CC = ISD::SETEQ;
       LHS = SV; RHS = I->High; MHS = NULL;
     } else {
-      CC = ISD::SETCC_INVALID;
+      CC = ISD::SETLE;
       LHS = I->Low; MHS = SV; RHS = I->High;
     }
 
@@ -2107,7 +2180,7 @@ static inline bool areJTsAllowed(const TargetLowering &TLI) {
 
 static APInt ComputeRange(const APInt &First, const APInt &Last) {
   uint32_t BitWidth = std::max(Last.getBitWidth(), First.getBitWidth()) + 1;
-  APInt LastExt = Last.zext(BitWidth), FirstExt = First.zext(BitWidth);
+  APInt LastExt = Last.sext(BitWidth), FirstExt = First.sext(BitWidth);
   return (LastExt - FirstExt + 1ULL);
 }
 
@@ -2174,7 +2247,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
     const APInt &Low = cast<ConstantInt>(I->Low)->getValue();
     const APInt &High = cast<ConstantInt>(I->High)->getValue();
 
-    if (Low.ule(TEI) && TEI.ule(High)) {
+    if (Low.sle(TEI) && TEI.sle(High)) {
       DestBBs.push_back(I->BB);
       if (TEI==High)
         ++I;
@@ -2348,7 +2421,7 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
   // Create a CaseBlock record representing a conditional branch to
   // the LHS node if the value being switched on SV is less than C.
   // Otherwise, branch to LHS.
-  CaseBlock CB(ISD::SETULT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
+  CaseBlock CB(ISD::SETLT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
 
   if (CR.CaseBB == SwitchBB)
     visitSwitchCase(CB, SwitchBB);
@@ -2378,7 +2451,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
   MachineFunction *CurMF = FuncInfo.MF;
 
   // If target does not have legal shift left, do not emit bit tests at all.
-  if (!TLI->isOperationLegal(ISD::SHL, TLI->getPointerTy()))
+  if (!TLI->isOperationLegal(ISD::SHL, PTy))
     return false;
 
   size_t numCmps = 0;
@@ -2421,7 +2494,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
   // Optimize the case where all the case values fit in a
   // word without having to subtract minValue. In this case,
   // we can optimize away the subtraction.
-  if (maxValue.ult(IntPtrBits)) {
+  if (minValue.isNonNegative() && maxValue.slt(IntPtrBits)) {
     cmpRange = maxValue;
   } else {
     lowBound = minValue;
@@ -2496,12 +2569,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
 /// Clusterify - Transform simple list of Cases into list of CaseRange's
 size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
                                        const SwitchInst& SI) {
-
-  /// Use a shorter form of declaration, and also
-  /// show the we want to use CRSBuilder as Clusterifier.
-  typedef IntegersSubsetMapping<MachineBasicBlock> Clusterifier;
-
-  Clusterifier TheClusterifier;
+  size_t numCmps = 0;
 
   BranchProbabilityInfo *BPI = FuncInfo.BPI;
   // Start with "simple" cases
@@ -2510,27 +2578,40 @@ size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
     const BasicBlock *SuccBB = i.getCaseSuccessor();
     MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
 
-    TheClusterifier.add(i.getCaseValueEx(), SMBB,
-        BPI ? BPI->getEdgeWeight(SI.getParent(), i.getSuccessorIndex()) : 0);
-  }
-
-  TheClusterifier.optimize();
-
-  size_t numCmps = 0;
-  for (Clusterifier::RangeIterator i = TheClusterifier.begin(),
-       e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
-    Clusterifier::Cluster &C = *i;
-    // Update edge weight for the cluster.
-    unsigned W = C.first.Weight;
-
-    // FIXME: Currently work with ConstantInt based numbers.
-    // Changing it to APInt based is a pretty heavy for this commit.
-    Cases.push_back(Case(C.first.getLow().toConstantInt(),
-                         C.first.getHigh().toConstantInt(), C.second, W));
+    uint32_t ExtraWeight =
+      BPI ? BPI->getEdgeWeight(SI.getParent(), i.getSuccessorIndex()) : 0;
+
+    Cases.push_back(Case(i.getCaseValue(), i.getCaseValue(),
+                         SMBB, ExtraWeight));
+  }
+  std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge case into clusters
+  if (Cases.size() >= 2)
+    // Must recompute end() each iteration because it may be
+    // invalidated by erase if we hold on to it
+    for (CaseItr I = Cases.begin(), J = llvm::next(Cases.begin());
+         J != Cases.end(); ) {
+      const APInt& nextValue = cast<ConstantInt>(J->Low)->getValue();
+      const APInt& currentValue = cast<ConstantInt>(I->High)->getValue();
+      MachineBasicBlock* nextBB = J->BB;
+      MachineBasicBlock* currentBB = I->BB;
+
+      // If the two neighboring cases go to the same destination, merge them
+      // into a single case.
+      if ((nextValue - currentValue == 1) && (currentBB == nextBB)) {
+        I->High = J->High;
+        I->ExtraWeight += J->ExtraWeight;
+        J = Cases.erase(J);
+      } else {
+        I = J++;
+      }
+    }
 
-    if (C.first.getLow() != C.first.getHigh())
-    // A range counts double, since it requires two compares.
-    ++numCmps;
+  for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+    if (I->Low != I->High)
+      // A range counts double, since it requires two compares.
+      ++numCmps;
   }
 
   return numCmps;
@@ -2859,6 +2940,21 @@ void SelectionDAGBuilder::visitBitCast(const User &I) {
     setValue(&I, N);            // noop cast.
 }
 
+void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const Value *SV = I.getOperand(0);
+  SDValue N = getValue(SV);
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+
+  unsigned SrcAS = SV->getType()->getPointerAddressSpace();
+  unsigned DestAS = I.getType()->getPointerAddressSpace();
+
+  if (!TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
+    N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS);
+
+  setValue(&I, N);
+}
+
 void SelectionDAGBuilder::visitInsertElement(const User &I) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue InVec = getValue(I.getOperand(0));
@@ -3151,10 +3247,12 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
 }
 
 void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
-  SDValue N = getValue(I.getOperand(0));
+  Value *Op0 = I.getOperand(0);
   // Note that the pointer operand may be a vector of pointers. Take the scalar
   // element which holds a pointer.
-  Type *Ty = I.getOperand(0)->getType()->getScalarType();
+  Type *Ty = Op0->getType()->getScalarType();
+  unsigned AS = Ty->getPointerAddressSpace();
+  SDValue N = getValue(Op0);
 
   for (GetElementPtrInst::const_op_iterator OI = I.op_begin()+1, E = I.op_end();
        OI != E; ++OI) {
@@ -3179,14 +3277,13 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
         uint64_t Offs =
             TD->getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
         SDValue OffsVal;
-        EVT PTy = TLI->getPointerTy();
+        EVT PTy = TLI->getPointerTy(AS);
         unsigned PtrBits = PTy.getSizeInBits();
         if (PtrBits < 64)
-          OffsVal = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(),
-                                TLI->getPointerTy(),
+          OffsVal = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), PTy,
                                 DAG.getConstant(Offs, MVT::i64));
         else
-          OffsVal = DAG.getIntPtrConstant(Offs);
+          OffsVal = DAG.getConstant(Offs, PTy);
 
         N = DAG.getNode(ISD::ADD, getCurSDLoc(), N.getValueType(), N,
                         OffsVal);
@@ -3194,7 +3291,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
       }
 
       // N = N + Idx * ElementSize;
-      APInt ElementSize = APInt(TLI->getPointerTy().getSizeInBits(),
+      APInt ElementSize = APInt(TLI->getPointerSizeInBits(AS),
                                 TD->getTypeAllocSize(Ty));
       SDValue IdxN = getValue(Idx);
 
@@ -3451,7 +3548,7 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
 
   SDValue L =
     DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl,
-                  getValue(I.getCompareOperand()).getValueType().getSimpleVT(),
+                  getValue(I.getCompareOperand()).getSimpleValueType(),
                   InChain,
                   getValue(I.getPointerOperand()),
                   getValue(I.getCompareOperand()),
@@ -3499,7 +3596,7 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
 
   SDValue L =
     DAG.getAtomic(NT, dl,
-                  getValue(I.getValOperand()).getValueType().getSimpleVT(),
+                  getValue(I.getValOperand()).getSimpleValueType(),
                   InChain,
                   getValue(I.getPointerOperand()),
                   getValue(I.getValOperand()),
@@ -4193,7 +4290,7 @@ static SDValue expandExp2(SDLoc dl, SDValue Op, SelectionDAG &DAG,
 static SDValue expandPow(SDLoc dl, SDValue LHS, SDValue RHS,
                          SelectionDAG &DAG, const TargetLowering &TLI) {
   bool IsExp10 = false;
-  if (LHS.getValueType() == MVT::f32 && LHS.getValueType() == MVT::f32 &&
+  if (LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
       LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
     if (ConstantFPSDNode *LHSC = dyn_cast<ConstantFPSDNode>(LHS)) {
       APFloat Ten(10.0f);
@@ -4705,14 +4802,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), sdl,
                                         TLI->getPointerTy());
     SDValue Offset = DAG.getNode(ISD::ADD, sdl,
-                                 TLI->getPointerTy(),
+                                 CfaArg.getValueType(),
                                  DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, sdl,
-                                             TLI->getPointerTy()),
+                                             CfaArg.getValueType()),
                                  CfaArg);
     SDValue FA = DAG.getNode(ISD::FRAMEADDR, sdl,
                              TLI->getPointerTy(),
                              DAG.getConstant(0, TLI->getPointerTy()));
-    setValue(&I, DAG.getNode(ISD::ADD, sdl, TLI->getPointerTy(),
+    setValue(&I, DAG.getNode(ISD::ADD, sdl, FA.getValueType(),
                              FA, Offset));
     return 0;
   }
@@ -4902,7 +4999,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::ceil:
   case Intrinsic::trunc:
   case Intrinsic::rint:
-  case Intrinsic::nearbyint: {
+  case Intrinsic::nearbyint:
+  case Intrinsic::round: {
     unsigned Opcode;
     switch (Intrinsic) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -4915,6 +5013,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
     case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
     case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+    case Intrinsic::round:     Opcode = ISD::FROUND;     break;
     }
 
     setValue(&I, DAG.getNode(Opcode, sdl,
@@ -4922,6 +5021,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
                              getValue(I.getArgOperand(0))));
     return 0;
   }
+  case Intrinsic::copysign:
+    setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1))));
+    return 0;
   case Intrinsic::fma:
     setValue(&I, DAG.getNode(ISD::FMA, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
@@ -5207,9 +5312,30 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::invariant_end:
     // Discard region information.
     return 0;
+  case Intrinsic::stackprotectorcheck: {
+    // Do not actually emit anything for this basic block. Instead we initialize
+    // the stack protector descriptor and export the guard variable so we can
+    // access it in FinishBasicBlock.
+    const BasicBlock *BB = I.getParent();
+    SPDescriptor.initialize(BB, FuncInfo.MBBMap[BB], I);
+    ExportFromCurrentBlock(SPDescriptor.getGuard());
+
+    // Flush our exports since we are going to process a terminator.
+    (void)getControlRoot();
+    return 0;
+  }
   case Intrinsic::donothing:
     // ignore
     return 0;
+  case Intrinsic::experimental_stackmap: {
+    visitStackmap(I);
+    return 0;
+  }
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64: {
+    visitPatchpoint(I);
+    return 0;
+  }
   }
 }
 
@@ -5274,15 +5400,8 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
     SDValue ArgNode = getValue(V);
     Entry.Node = ArgNode; Entry.Ty = V->getType();
 
-    unsigned attrInd = i - CS.arg_begin() + 1;
-    Entry.isSExt     = CS.paramHasAttr(attrInd, Attribute::SExt);
-    Entry.isZExt     = CS.paramHasAttr(attrInd, Attribute::ZExt);
-    Entry.isInReg    = CS.paramHasAttr(attrInd, Attribute::InReg);
-    Entry.isSRet     = CS.paramHasAttr(attrInd, Attribute::StructRet);
-    Entry.isNest     = CS.paramHasAttr(attrInd, Attribute::Nest);
-    Entry.isByVal    = CS.paramHasAttr(attrInd, Attribute::ByVal);
-    Entry.isReturned = CS.paramHasAttr(attrInd, Attribute::Returned);
-    Entry.Alignment  = CS.getParamAlignment(attrInd);
+    // Skip the first return-type Attribute to get to params.
+    Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
     Args.push_back(Entry);
   }
 
@@ -5364,8 +5483,8 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
   }
 
   if (!Result.second.getNode()) {
-    // As a special case, a null chain means that a tail call has been emitted and
-    // the DAG root is already updated.
+    // As a special case, a null chain means that a tail call has been emitted
+    // and the DAG root is already updated.
     HasTailCall = true;
 
     // Since there's no actual continuation from this block, nothing can be
@@ -5445,6 +5564,18 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
   return LoadVal;
 }
 
+/// processIntegerCallValue - Record the value for an instruction that
+/// produces an integer result, converting the type where necessary.
+void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
+                                                  SDValue Value,
+                                                  bool IsSigned) {
+  EVT VT = TM.getTargetLowering()->getValueType(I.getType(), true);
+  if (IsSigned)
+    Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT);
+  else
+    Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT);
+  setValue(&I, Value);
+}
 
 /// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form.
 /// If so, return true and lower it, otherwise return false and it will be
@@ -5460,15 +5591,33 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
       !I.getType()->isIntegerTy())
     return false;
 
-  const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2));
+  const Value *Size = I.getArgOperand(2);
+  const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
+  if (CSize && CSize->getZExtValue() == 0) {
+    EVT CallVT = TM.getTargetLowering()->getValueType(I.getType(), true);
+    setValue(&I, DAG.getConstant(0, CallVT));
+    return true;
+  }
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+                                getValue(LHS), getValue(RHS), getValue(Size),
+                                MachinePointerInfo(LHS),
+                                MachinePointerInfo(RHS));
+  if (Res.first.getNode()) {
+    processIntegerCallValue(I, Res.first, true);
+    PendingLoads.push_back(Res.second);
+    return true;
+  }
 
   // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS)  != 0
   // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS)  != 0
-  if (Size && IsOnlyUsedInZeroEqualityComparison(&I)) {
+  if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) {
     bool ActuallyDoIt = true;
     MVT LoadVT;
     Type *LoadTy;
-    switch (Size->getZExtValue()) {
+    switch (CSize->getZExtValue()) {
     default:
       LoadVT = MVT::Other;
       LoadTy = 0;
@@ -5476,20 +5625,20 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
       break;
     case 2:
       LoadVT = MVT::i16;
-      LoadTy = Type::getInt16Ty(Size->getContext());
+      LoadTy = Type::getInt16Ty(CSize->getContext());
       break;
     case 4:
       LoadVT = MVT::i32;
-      LoadTy = Type::getInt32Ty(Size->getContext());
+      LoadTy = Type::getInt32Ty(CSize->getContext());
       break;
     case 8:
       LoadVT = MVT::i64;
-      LoadTy = Type::getInt64Ty(Size->getContext());
+      LoadTy = Type::getInt64Ty(CSize->getContext());
       break;
         /*
     case 16:
       LoadVT = MVT::v4i32;
-      LoadTy = Type::getInt32Ty(Size->getContext());
+      LoadTy = Type::getInt32Ty(CSize->getContext());
       LoadTy = VectorType::get(LoadTy, 4);
       break;
          */
@@ -5503,7 +5652,7 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
     // supports unaligned loads of that type.  Expanding into byte loads would
     // bloat the code.
     const TargetLowering *TLI = TM.getTargetLowering();
-    if (ActuallyDoIt && Size->getZExtValue() > 4) {
+    if (ActuallyDoIt && CSize->getZExtValue() > 4) {
       // TODO: Handle 5 byte compare as 4-byte + 1 byte.
       // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
       if (!TLI->isTypeLegal(LoadVT) ||!TLI->allowsUnalignedMemoryAccesses(LoadVT))
@@ -5516,8 +5665,7 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
 
       SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal,
                                  ISD::SETNE);
-      EVT CallVT = TLI->getValueType(I.getType(), true);
-      setValue(&I, DAG.getZExtOrTrunc(Res, getCurSDLoc(), CallVT));
+      processIntegerCallValue(I, Res, false);
       return true;
     }
   }
@@ -5526,6 +5674,148 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
   return false;
 }
 
+/// visitMemChrCall -- See if we can lower a memchr call into an optimized
+/// form.  If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
+  // Verify that the prototype makes sense.  void *memchr(void *, int, size_t)
+  if (I.getNumArgOperands() != 3)
+    return false;
+
+  const Value *Src = I.getArgOperand(0);
+  const Value *Char = I.getArgOperand(1);
+  const Value *Length = I.getArgOperand(2);
+  if (!Src->getType()->isPointerTy() ||
+      !Char->getType()->isIntegerTy() ||
+      !Length->getType()->isIntegerTy() ||
+      !I.getType()->isPointerTy())
+    return false;
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(),
+                                getValue(Src), getValue(Char), getValue(Length),
+                                MachinePointerInfo(Src));
+  if (Res.first.getNode()) {
+    setValue(&I, Res.first);
+    PendingLoads.push_back(Res.second);
+    return true;
+  }
+
+  return false;
+}
+
+/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an
+/// optimized form.  If so, return true and lower it, otherwise return false
+/// and it will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
+  // Verify that the prototype makes sense.  char *strcpy(char *, char *)
+  if (I.getNumArgOperands() != 2)
+    return false;
+
+  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+  if (!Arg0->getType()->isPointerTy() ||
+      !Arg1->getType()->isPointerTy() ||
+      !I.getType()->isPointerTy())
+    return false;
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(),
+                                getValue(Arg0), getValue(Arg1),
+                                MachinePointerInfo(Arg0),
+                                MachinePointerInfo(Arg1), isStpcpy);
+  if (Res.first.getNode()) {
+    setValue(&I, Res.first);
+    DAG.setRoot(Res.second);
+    return true;
+  }
+
+  return false;
+}
+
+/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form.
+/// If so, return true and lower it, otherwise return false and it will be
+/// lowered like a normal call.
+bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
+  // Verify that the prototype makes sense.  int strcmp(void*,void*)
+  if (I.getNumArgOperands() != 2)
+    return false;
+
+  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+  if (!Arg0->getType()->isPointerTy() ||
+      !Arg1->getType()->isPointerTy() ||
+      !I.getType()->isIntegerTy())
+    return false;
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+                                getValue(Arg0), getValue(Arg1),
+                                MachinePointerInfo(Arg0),
+                                MachinePointerInfo(Arg1));
+  if (Res.first.getNode()) {
+    processIntegerCallValue(I, Res.first, true);
+    PendingLoads.push_back(Res.second);
+    return true;
+  }
+
+  return false;
+}
+
+/// visitStrLenCall -- See if we can lower a strlen call into an optimized
+/// form.  If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
+  // Verify that the prototype makes sense.  size_t strlen(char *)
+  if (I.getNumArgOperands() != 1)
+    return false;
+
+  const Value *Arg0 = I.getArgOperand(0);
+  if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy())
+    return false;
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(),
+                                getValue(Arg0), MachinePointerInfo(Arg0));
+  if (Res.first.getNode()) {
+    processIntegerCallValue(I, Res.first, false);
+    PendingLoads.push_back(Res.second);
+    return true;
+  }
+
+  return false;
+}
+
+/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized
+/// form.  If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
+  // Verify that the prototype makes sense.  size_t strnlen(char *, size_t)
+  if (I.getNumArgOperands() != 2)
+    return false;
+
+  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+  if (!Arg0->getType()->isPointerTy() ||
+      !Arg1->getType()->isIntegerTy() ||
+      !I.getType()->isIntegerTy())
+    return false;
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(),
+                                 getValue(Arg0), getValue(Arg1),
+                                 MachinePointerInfo(Arg0));
+  if (Res.first.getNode()) {
+    processIntegerCallValue(I, Res.first, false);
+    PendingLoads.push_back(Res.second);
+    return true;
+  }
+
+  return false;
+}
+
 /// visitUnaryFloatCall - If a call instruction is a unary floating-point
 /// operation (as expected), translate it to an SDNode with the specified opcode
 /// and return true.
@@ -5644,6 +5934,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
         if (visitUnaryFloatCall(I, ISD::FRINT))
           return;
         break;
+      case LibFunc::round:
+      case LibFunc::roundf:
+      case LibFunc::roundl:
+        if (visitUnaryFloatCall(I, ISD::FROUND))
+          return;
+        break;
       case LibFunc::trunc:
       case LibFunc::truncf:
       case LibFunc::truncl:
@@ -5666,6 +5962,30 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
         if (visitMemCmpCall(I))
           return;
         break;
+      case LibFunc::memchr:
+        if (visitMemChrCall(I))
+          return;
+        break;
+      case LibFunc::strcpy:
+        if (visitStrCpyCall(I, false))
+          return;
+        break;
+      case LibFunc::stpcpy:
+        if (visitStrCpyCall(I, true))
+          return;
+        break;
+      case LibFunc::strcmp:
+        if (visitStrCmpCall(I))
+          return;
+        break;
+      case LibFunc::strlen:
+        if (visitStrLenCall(I))
+          return;
+        break;
+      case LibFunc::strnlen:
+        if (visitStrNLenCall(I))
+          return;
+        break;
       }
     }
   }
@@ -6421,6 +6741,248 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
                           DAG.getSrcValue(I.getArgOperand(1))));
 }
 
+/// \brief Lower an argument list according to the target calling convention.
+///
+/// \return A tuple of <return-value, token-chain>
+///
+/// This is a helper for lowering intrinsics that follow a target calling
+/// convention or require stack pointer adjustment. Only a subset of the
+/// intrinsic's operands need to participate in the calling convention.
+std::pair<SDValue, SDValue>
+SelectionDAGBuilder::LowerCallOperands(const CallInst &CI, unsigned ArgIdx,
+                                       unsigned NumArgs, SDValue Callee,
+                                       bool useVoidTy) {
+  TargetLowering::ArgListTy Args;
+  Args.reserve(NumArgs);
+
+  // Populate the argument list.
+  // Attributes for args start at offset 1, after the return attribute.
+  ImmutableCallSite CS(&CI);
+  for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
+       ArgI != ArgE; ++ArgI) {
+    const Value *V = CI.getOperand(ArgI);
+
+    assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = getValue(V);
+    Entry.Ty = V->getType();
+    Entry.setAttributes(&CS, AttrI);
+    Args.push_back(Entry);
+  }
+
+  Type *retTy = useVoidTy ? Type::getVoidTy(*DAG.getContext()) : CI.getType();
+  TargetLowering::CallLoweringInfo CLI(getRoot(), retTy, /*retSExt*/ false,
+    /*retZExt*/ false, /*isVarArg*/ false, /*isInReg*/ false, NumArgs,
+    CI.getCallingConv(), /*isTailCall*/ false, /*doesNotReturn*/ false,
+    /*isReturnValueUsed*/ CI.use_empty(), Callee, Args, DAG, getCurSDLoc());
+
+  const TargetLowering *TLI = TM.getTargetLowering();
+  return TLI->LowerCallTo(CLI);
+}
+
+/// \brief Lower llvm.experimental.stackmap directly to its target opcode.
+void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
+  // void @llvm.experimental.stackmap(i32 <id>, i32 <numShadowBytes>,
+  //                                  [live variables...])
+
+  assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value.");
+
+  SDValue Callee = getValue(CI.getCalledValue());
+
+  // Lower into a call sequence with no args and no return value.
+  std::pair<SDValue, SDValue> Result = LowerCallOperands(CI, 0, 0, Callee);
+  // Set the root to the target-lowered call chain.
+  SDValue Chain = Result.second;
+  DAG.setRoot(Chain);
+
+  /// Get a call instruction from the call sequence chain.
+  /// Tail calls are not allowed.
+  SDNode *CallEnd = Chain.getNode();
+  assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+         "Expected a callseq node.");
+  SDNode *Call = CallEnd->getOperand(0).getNode();
+  bool hasGlue = Call->getGluedNode();
+
+  // Replace the target specific call node with the stackmap intrinsic.
+  SmallVector<SDValue, 8> Ops;
+
+  // Add the <id> and <numShadowBytes> constants.
+  for (unsigned i = 0; i < 2; ++i) {
+    SDValue tmp = getValue(CI.getOperand(i));
+    Ops.push_back(DAG.getTargetConstant(
+        cast<ConstantSDNode>(tmp)->getZExtValue(), MVT::i32));
+  }
+  // Push live variables for the stack map.
+  for (unsigned i = 2, e = CI.getNumArgOperands(); i != e; ++i)
+    Ops.push_back(getValue(CI.getArgOperand(i)));
+
+  // Push the chain (this is originally the first operand of the call, but
+  // becomes now the last or second to last operand).
+  Ops.push_back(*(Call->op_begin()));
+
+    // Push the glue flag (last operand).
+  if (hasGlue)
+    Ops.push_back(*(Call->op_end()-1));
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  // Replace the target specific call node with a STACKMAP node.
+  MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::STACKMAP, getCurSDLoc(),
+                                         NodeTys, Ops);
+
+  // StackMap generates no value, so nothing goes in the NodeMap.
+
+  // Fixup the consumers of the intrinsic. The chain and glue may be used in the
+  // call sequence.
+  DAG.ReplaceAllUsesWith(Call, MN);
+
+  DAG.DeleteNode(Call);
+}
+
+/// \brief Lower llvm.experimental.patchpoint directly to its target opcode.
+void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) {
+  // void|i64 @llvm.experimental.patchpoint.void|i64(i32 <id>,
+  //                                                 i32 <numBytes>,
+  //                                                 i8* <target>,
+  //                                                 i32 <numArgs>,
+  //                                                 [Args...],
+  //                                                 [live variables...])
+
+  CallingConv::ID CC = CI.getCallingConv();
+  bool isAnyRegCC = CC == CallingConv::AnyReg;
+  bool hasDef = !CI.getType()->isVoidTy();
+  SDValue Callee = getValue(CI.getOperand(2)); // <target>
+
+  // Get the real number of arguments participating in the call <numArgs>
+  unsigned NumArgs =
+    cast<ConstantSDNode>(getValue(CI.getArgOperand(3)))->getZExtValue();
+
+  // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
+  assert(CI.getNumArgOperands() >= NumArgs + 4 &&
+         "Not enough arguments provided to the patchpoint intrinsic");
+
+  // For AnyRegCC the arguments are lowered later on manually.
+  unsigned NumCallArgs = isAnyRegCC ? 0 : NumArgs;
+  std::pair<SDValue, SDValue> Result =
+    LowerCallOperands(CI, 4, NumCallArgs, Callee, isAnyRegCC);
+
+  // Set the root to the target-lowered call chain.
+  SDValue Chain = Result.second;
+  DAG.setRoot(Chain);
+
+  SDNode *CallEnd = Chain.getNode();
+  if (hasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
+    CallEnd = CallEnd->getOperand(0).getNode();
+
+  /// Get a call instruction from the call sequence chain.
+  /// Tail calls are not allowed.
+  assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+         "Expected a callseq node.");
+  SDNode *Call = CallEnd->getOperand(0).getNode();
+  bool hasGlue = Call->getGluedNode();
+
+  // Replace the target specific call node with the patchable intrinsic.
+  SmallVector<SDValue, 8> Ops;
+
+  // Add the <id> and <numNopBytes> constants.
+  for (unsigned i = 0; i < 2; ++i) {
+    SDValue tmp = getValue(CI.getOperand(i));
+    Ops.push_back(DAG.getTargetConstant(
+        cast<ConstantSDNode>(tmp)->getZExtValue(), MVT::i32));
+  }
+  // Assume that the Callee is a constant address.
+  Ops.push_back(
+    DAG.getIntPtrConstant(cast<ConstantSDNode>(Callee)->getZExtValue(),
+                          /*isTarget=*/true));
+
+  // Adjust <numArgs> to account for any arguments that have been passed on the
+  // stack instead.
+  // Call Node: Chain, Target, {Args}, RegMask, [Glue]
+  unsigned NumCallRegArgs = Call->getNumOperands() - (hasGlue ? 4 : 3);
+  NumCallRegArgs = isAnyRegCC ? NumArgs : NumCallRegArgs;
+  Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, MVT::i32));
+
+  // Add the calling convention
+  Ops.push_back(DAG.getTargetConstant((unsigned)CC, MVT::i32));
+
+  // Add the arguments we omitted previously. The register allocator should
+  // place these in any free register.
+  if (isAnyRegCC)
+    for (unsigned i = 4, e = NumArgs + 4; i != e; ++i)
+      Ops.push_back(getValue(CI.getArgOperand(i)));
+
+  // Push the arguments from the call instruction.
+  SDNode::op_iterator e = hasGlue ? Call->op_end()-2 : Call->op_end()-1;
+  for (SDNode::op_iterator i = Call->op_begin()+2; i != e; ++i)
+    Ops.push_back(*i);
+
+  // Push live variables for the stack map.
+  for (unsigned i = NumArgs + 4, e = CI.getNumArgOperands(); i != e; ++i) {
+    SDValue OpVal = getValue(CI.getArgOperand(i));
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpVal)) {
+      Ops.push_back(
+        DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+      Ops.push_back(
+        DAG.getTargetConstant(C->getSExtValue(), MVT::i64));
+    } else
+      Ops.push_back(OpVal);
+  }
+
+  // Push the register mask info.
+  if (hasGlue)
+    Ops.push_back(*(Call->op_end()-2));
+  else
+    Ops.push_back(*(Call->op_end()-1));
+
+  // Push the chain (this is originally the first operand of the call, but
+  // becomes now the last or second to last operand).
+  Ops.push_back(*(Call->op_begin()));
+
+  // Push the glue flag (last operand).
+  if (hasGlue)
+    Ops.push_back(*(Call->op_end()-1));
+
+  SDVTList NodeTys;
+  if (isAnyRegCC && hasDef) {
+    // Create the return types based on the intrinsic definition
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    SmallVector<EVT, 3> ValueVTs;
+    ComputeValueVTs(TLI, CI.getType(), ValueVTs);
+    assert(ValueVTs.size() == 1 && "Expected only one return value type.");
+
+    // There is always a chain and a glue type at the end
+    ValueVTs.push_back(MVT::Other);
+    ValueVTs.push_back(MVT::Glue);
+    NodeTys = DAG.getVTList(ValueVTs.data(), ValueVTs.size());
+  } else
+    NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  // Replace the target specific call node with a PATCHPOINT node.
+  MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHPOINT,
+                                         getCurSDLoc(), NodeTys, Ops);
+
+  // Update the NodeMap.
+  if (hasDef) {
+    if (isAnyRegCC)
+      setValue(&CI, SDValue(MN, 0));
+    else
+      setValue(&CI, Result.first);
+  }
+
+  // Fixup the consumers of the intrinsic. The chain and glue may be used in the
+  // call sequence. Furthermore the location of the chain and glue can change
+  // when the AnyReg calling convention is used and the intrinsic returns a
+  // value.
+  if (isAnyRegCC && hasDef) {
+    SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)};
+    SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)};
+    DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
+  } else
+    DAG.ReplaceAllUsesWith(Call, MN);
+  DAG.DeleteNode(Call);
+}
+
 /// TargetLowering::LowerCallTo - This is the default LowerCallTo
 /// implementation, which just calls LowerCall.
 /// FIXME: When all targets are
@@ -6438,6 +7000,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
     for (unsigned i = 0; i != NumRegs; ++i) {
       ISD::InputArg MyFlags;
       MyFlags.VT = RegisterVT;
+      MyFlags.ArgVT = VT;
       MyFlags.Used = CLI.IsReturnValueUsed;
       if (CLI.RetSExt)
         MyFlags.Flags.setSExt();
@@ -6527,7 +7090,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
 
       for (unsigned j = 0; j != NumParts; ++j) {
         // if it isn't first piece, alignment must be 1
-        ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(),
+        ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), VT,
                                i < CLI.NumFixedArgs,
                                i, j*Parts[j].getValueType().getStoreSize());
         if (NumParts > 1 && j == 0)
@@ -6666,7 +7229,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     ISD::ArgFlagsTy Flags;
     Flags.setSRet();
     MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
-    ISD::InputArg RetArg(Flags, RegisterVT, true, 0, 0);
+    ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true, 0, 0);
     Ins.push_back(RetArg);
   }
 
@@ -6677,6 +7240,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     SmallVector<EVT, 4> ValueVTs;
     ComputeValueVTs(*TLI, I->getType(), ValueVTs);
     bool isArgValueUsed = !I->use_empty();
+    unsigned PartBase = 0;
     for (unsigned Value = 0, NumValues = ValueVTs.size();
          Value != NumValues; ++Value) {
       EVT VT = ValueVTs[Value];
@@ -6714,8 +7278,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
       unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
       for (unsigned i = 0; i != NumRegs; ++i) {
-        ISD::InputArg MyFlags(Flags, RegisterVT, isArgValueUsed,
-                              Idx-1, i*RegisterVT.getStoreSize());
+        ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed,
+                              Idx-1, PartBase+i*RegisterVT.getStoreSize());
         if (NumRegs > 1 && i == 0)
           MyFlags.Flags.setSplit();
         // if it isn't first piece, alignment must be 1
@@ -6723,6 +7287,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
           MyFlags.Flags.setOrigAlign(1);
         Ins.push_back(MyFlags);
       }
+      PartBase += VT.getStoreSize();
     }
   }
 
@@ -6940,3 +7505,22 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
 
   ConstantsOut.clear();
 }
+
+/// Add a successor MBB to ParentMBB< creating a new MachineBB for BB if SuccMBB
+/// is 0.
+MachineBasicBlock *
+SelectionDAGBuilder::StackProtectorDescriptor::
+AddSuccessorMBB(const BasicBlock *BB,
+                MachineBasicBlock *ParentMBB,
+                MachineBasicBlock *SuccMBB) {
+  // If SuccBB has not been created yet, create it.
+  if (!SuccMBB) {
+    MachineFunction *MF = ParentMBB->getParent();
+    MachineFunction::iterator BBI = ParentMBB;
+    SuccMBB = MF->CreateMachineBasicBlock(BB);
+    MF->insert(++BBI, SuccMBB);
+  }
+  // Add it as a successor of ParentMBB.
+  ParentMBB->addSuccessor(SuccMBB);
+  return SuccMBB;
+}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index ef73c00..835f643 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -1,4 +1,4 @@
-//===-- SelectionDAGBuilder.h - Selection-DAG building --------*- c++ -*---===//
+//===-- SelectionDAGBuilder.h - Selection-DAG building --------*- C++ -*---===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -26,6 +26,7 @@
 
 namespace llvm {
 
+class AddrSpaceCastInst;
 class AliasAnalysis;
 class AllocaInst;
 class BasicBlock;
@@ -84,7 +85,7 @@ class SelectionDAGBuilder {
   const Instruction *CurInst;
 
   DenseMap<const Value*, SDValue> NodeMap;
-  
+
   /// UnusedArgNodeMap - Maps argument value for unused arguments. This is used
   /// to preserve debug information for incoming arguments.
   DenseMap<const Value*, SDValue> UnusedArgNodeMap;
@@ -182,6 +183,17 @@ private:
 
   typedef std::vector<CaseRec> CaseRecVector;
 
+  /// The comparison function for sorting the switch case values in the vector.
+  /// WARNING: Case ranges should be disjoint!
+  struct CaseCmp {
+    bool operator()(const Case &C1, const Case &C2) {
+      assert(isa<ConstantInt>(C1.Low) && isa<ConstantInt>(C2.High));
+      const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+      const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+      return CI1->getValue().slt(CI2->getValue());
+    }
+  };
+
   struct CaseBitsCmp {
     bool operator()(const CaseBits &C1, const CaseBits &C2) {
       return C1.Bits > C2.Bits;
@@ -224,7 +236,7 @@ private:
   struct JumpTable {
     JumpTable(unsigned R, unsigned J, MachineBasicBlock *M,
               MachineBasicBlock *D): Reg(R), JTI(J), MBB(M), Default(D) {}
-  
+
     /// Reg - the virtual register containing the index of the jump table entry
     //. to jump to.
     unsigned Reg;
@@ -278,6 +290,201 @@ private:
     BitTestInfo Cases;
   };
 
+  /// A class which encapsulates all of the information needed to generate a
+  /// stack protector check and signals to isel via its state being initialized
+  /// that a stack protector needs to be generated.
+  ///
+  /// *NOTE* The following is a high level documentation of SelectionDAG Stack
+  /// Protector Generation. The reason that it is placed here is for a lack of
+  /// other good places to stick it.
+  ///
+  /// High Level Overview of SelectionDAG Stack Protector Generation:
+  ///
+  /// Previously, generation of stack protectors was done exclusively in the
+  /// pre-SelectionDAG Codegen LLVM IR Pass "Stack Protector". This necessitated
+  /// splitting basic blocks at the IR level to create the success/failure basic
+  /// blocks in the tail of the basic block in question. As a result of this,
+  /// calls that would have qualified for the sibling call optimization were no
+  /// longer eligible for optimization since said calls were no longer right in
+  /// the "tail position" (i.e. the immediate predecessor of a ReturnInst
+  /// instruction).
+  ///
+  /// Then it was noticed that since the sibling call optimization causes the
+  /// callee to reuse the caller's stack, if we could delay the generation of
+  /// the stack protector check until later in CodeGen after the sibling call
+  /// decision was made, we get both the tail call optimization and the stack
+  /// protector check!
+  ///
+  /// A few goals in solving this problem were:
+  ///
+  ///   1. Preserve the architecture independence of stack protector generation.
+  ///
+  ///   2. Preserve the normal IR level stack protector check for platforms like
+  ///      OpenBSD for which we support platform specific stack protector
+  ///      generation.
+  ///
+  /// The main problem that guided the present solution is that one can not
+  /// solve this problem in an architecture independent manner at the IR level
+  /// only. This is because:
+  ///
+  ///   1. The decision on whether or not to perform a sibling call on certain
+  ///      platforms (for instance i386) requires lower level information
+  ///      related to available registers that can not be known at the IR level.
+  ///
+  ///   2. Even if the previous point were not true, the decision on whether to
+  ///      perform a tail call is done in LowerCallTo in SelectionDAG which
+  ///      occurs after the Stack Protector Pass. As a result, one would need to
+  ///      put the relevant callinst into the stack protector check success
+  ///      basic block (where the return inst is placed) and then move it back
+  ///      later at SelectionDAG/MI time before the stack protector check if the
+  ///      tail call optimization failed. The MI level option was nixed
+  ///      immediately since it would require platform specific pattern
+  ///      matching. The SelectionDAG level option was nixed because
+  ///      SelectionDAG only processes one IR level basic block at a time
+  ///      implying one could not create a DAG Combine to move the callinst.
+  ///
+  /// To get around this problem a few things were realized:
+  ///
+  ///   1. While one can not handle multiple IR level basic blocks at the
+  ///      SelectionDAG Level, one can generate multiple machine basic blocks
+  ///      for one IR level basic block. This is how we handle bit tests and
+  ///      switches.
+  ///
+  ///   2. At the MI level, tail calls are represented via a special return
+  ///      MIInst called "tcreturn". Thus if we know the basic block in which we
+  ///      wish to insert the stack protector check, we get the correct behavior
+  ///      by always inserting the stack protector check right before the return
+  ///      statement. This is a "magical transformation" since no matter where
+  ///      the stack protector check intrinsic is, we always insert the stack
+  ///      protector check code at the end of the BB.
+  ///
+  /// Given the aforementioned constraints, the following solution was devised:
+  ///
+  ///   1. On platforms that do not support SelectionDAG stack protector check
+  ///      generation, allow for the normal IR level stack protector check
+  ///      generation to continue.
+  ///
+  ///   2. On platforms that do support SelectionDAG stack protector check
+  ///      generation:
+  ///
+  ///     a. Use the IR level stack protector pass to decide if a stack
+  ///        protector is required/which BB we insert the stack protector check
+  ///        in by reusing the logic already therein. If we wish to generate a
+  ///        stack protector check in a basic block, we place a special IR
+  ///        intrinsic called llvm.stackprotectorcheck right before the BB's
+  ///        returninst or if there is a callinst that could potentially be
+  ///        sibling call optimized, before the call inst.
+  ///
+  ///     b. Then when a BB with said intrinsic is processed, we codegen the BB
+  ///        normally via SelectBasicBlock. In said process, when we visit the
+  ///        stack protector check, we do not actually emit anything into the
+  ///        BB. Instead, we just initialize the stack protector descriptor
+  ///        class (which involves stashing information/creating the success
+  ///        mbbb and the failure mbb if we have not created one for this
+  ///        function yet) and export the guard variable that we are going to
+  ///        compare.
+  ///
+  ///     c. After we finish selecting the basic block, in FinishBasicBlock if
+  ///        the StackProtectorDescriptor attached to the SelectionDAGBuilder is
+  ///        initialized, we first find a splice point in the parent basic block
+  ///        before the terminator and then splice the terminator of said basic
+  ///        block into the success basic block. Then we code-gen a new tail for
+  ///        the parent basic block consisting of the two loads, the comparison,
+  ///        and finally two branches to the success/failure basic blocks. We
+  ///        conclude by code-gening the failure basic block if we have not
+  ///        code-gened it already (all stack protector checks we generate in
+  ///        the same function, use the same failure basic block).
+  class StackProtectorDescriptor {
+  public:
+    StackProtectorDescriptor() : ParentMBB(0), SuccessMBB(0), FailureMBB(0),
+                                 Guard(0) { }
+    ~StackProtectorDescriptor() { }
+
+    /// Returns true if all fields of the stack protector descriptor are
+    /// initialized implying that we should/are ready to emit a stack protector.
+    bool shouldEmitStackProtector() const {
+      return ParentMBB && SuccessMBB && FailureMBB && Guard;
+    }
+
+    /// Initialize the stack protector descriptor structure for a new basic
+    /// block.
+    void initialize(const BasicBlock *BB,
+                    MachineBasicBlock *MBB,
+                    const CallInst &StackProtCheckCall) {
+      // Make sure we are not initialized yet.
+      assert(!shouldEmitStackProtector() && "Stack Protector Descriptor is "
+             "already initialized!");
+      ParentMBB = MBB;
+      SuccessMBB = AddSuccessorMBB(BB, MBB);
+      FailureMBB = AddSuccessorMBB(BB, MBB, FailureMBB);
+      if (!Guard)
+        Guard = StackProtCheckCall.getArgOperand(0);
+    }
+
+    /// Reset state that changes when we handle different basic blocks.
+    ///
+    /// This currently includes:
+    ///
+    /// 1. The specific basic block we are generating a
+    /// stack protector for (ParentMBB).
+    ///
+    /// 2. The successor machine basic block that will contain the tail of
+    /// parent mbb after we create the stack protector check (SuccessMBB). This
+    /// BB is visited only on stack protector check success.
+    void resetPerBBState() {
+      ParentMBB = 0;
+      SuccessMBB = 0;
+    }
+
+    /// Reset state that only changes when we switch functions.
+    ///
+    /// This currently includes:
+    ///
+    /// 1. FailureMBB since we reuse the failure code path for all stack
+    /// protector checks created in an individual function.
+    ///
+    /// 2.The guard variable since the guard variable we are checking against is
+    /// always the same.
+    void resetPerFunctionState() {
+      FailureMBB = 0;
+      Guard = 0;
+    }
+
+    MachineBasicBlock *getParentMBB() { return ParentMBB; }
+    MachineBasicBlock *getSuccessMBB() { return SuccessMBB; }
+    MachineBasicBlock *getFailureMBB() { return FailureMBB; }
+    const Value *getGuard() { return Guard; }
+
+  private:
+    /// The basic block for which we are generating the stack protector.
+    ///
+    /// As a result of stack protector generation, we will splice the
+    /// terminators of this basic block into the successor mbb SuccessMBB and
+    /// replace it with a compare/branch to the successor mbbs
+    /// SuccessMBB/FailureMBB depending on whether or not the stack protector
+    /// was violated.
+    MachineBasicBlock *ParentMBB;
+
+    /// A basic block visited on stack protector check success that contains the
+    /// terminators of ParentMBB.
+    MachineBasicBlock *SuccessMBB;
+
+    /// This basic block visited on stack protector check failure that will
+    /// contain a call to __stack_chk_fail().
+    MachineBasicBlock *FailureMBB;
+
+    /// The guard variable which we will compare against the stored value in the
+    /// stack protector stack slot.
+    const Value *Guard;
+
+    /// Add a successor machine basic block to ParentMBB. If the successor mbb
+    /// has not been created yet (i.e. if SuccMBB = 0), then the machine basic
+    /// block will be created.
+    MachineBasicBlock *AddSuccessorMBB(const BasicBlock *BB,
+                                       MachineBasicBlock *ParentMBB,
+                                       MachineBasicBlock *SuccMBB = 0);
+  };
+
 private:
   const TargetMachine &TM;
 public:
@@ -295,6 +502,9 @@ public:
   /// BitTestCases - Vector of BitTestBlock structures used to communicate
   /// SwitchInst code generation information.
   std::vector<BitTestBlock> BitTestCases;
+  /// A StackProtectorDescriptor structure used to communicate stack protector
+  /// information in between SelectBasicBlock and FinishBasicBlock.
+  StackProtectorDescriptor SPDescriptor;
 
   // Emit PHI-node-operand constants only once even if used by multiple
   // PHI nodes.
@@ -305,9 +515,9 @@ public:
   FunctionLoweringInfo &FuncInfo;
 
   /// OptLevel - What optimization level we're generating code for.
-  /// 
+  ///
   CodeGenOpt::Level OptLevel;
-  
+
   /// GFI - Garbage collection metadata for the function.
   GCFunctionInfo *GFI;
 
@@ -389,7 +599,7 @@ public:
     assert(N.getNode() == 0 && "Already set a value for this node!");
     N = NewN;
   }
-  
+
   void setUnusedArgValue(const Value *V, SDValue NewN) {
     SDValue &N = UnusedArgNodeMap[V];
     assert(N.getNode() == 0 && "Already set a value for this node!");
@@ -410,6 +620,12 @@ public:
   void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall,
                    MachineBasicBlock *LandingPad = NULL);
 
+  std::pair<SDValue, SDValue> LowerCallOperands(const CallInst &CI,
+                                                unsigned ArgIdx,
+                                                unsigned NumArgs,
+                                                SDValue Callee,
+                                                bool useVoidTy = false);
+
   /// UpdateSplitBlock - When an MBB was split during scheduling, update the
   /// references that ned to refer to the last resulting block.
   void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
@@ -451,6 +667,9 @@ private:
 public:
   void visitSwitchCase(CaseBlock &CB,
                        MachineBasicBlock *SwitchBB);
+  void visitSPDescriptorParent(StackProtectorDescriptor &SPD,
+                               MachineBasicBlock *ParentBB);
+  void visitSPDescriptorFailure(StackProtectorDescriptor &SPD);
   void visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB);
   void visitBitTestCase(BitTestBlock &BB,
                         MachineBasicBlock* NextMBB,
@@ -461,7 +680,7 @@ public:
   void visitJumpTable(JumpTable &JT);
   void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH,
                             MachineBasicBlock *SwitchBB);
-  
+
 private:
   // These all get lowered before this pass.
   void visitInvoke(const InvokeInst &I);
@@ -502,6 +721,7 @@ private:
   void visitPtrToInt(const User &I);
   void visitIntToPtr(const User &I);
   void visitBitCast(const User &I);
+  void visitAddrSpaceCast(const User &I);
 
   void visitExtractElement(const User &I);
   void visitInsertElement(const User &I);
@@ -523,6 +743,11 @@ private:
   void visitPHI(const PHINode &I);
   void visitCall(const CallInst &I);
   bool visitMemCmpCall(const CallInst &I);
+  bool visitMemChrCall(const CallInst &I);
+  bool visitStrCpyCall(const CallInst &I, bool isStpcpy);
+  bool visitStrCmpCall(const CallInst &I);
+  bool visitStrLenCall(const CallInst &I);
+  bool visitStrNLenCall(const CallInst &I);
   bool visitUnaryFloatCall(const CallInst &I, unsigned Opcode);
   void visitAtomicLoad(const LoadInst &I);
   void visitAtomicStore(const StoreInst &I);
@@ -535,6 +760,8 @@ private:
   void visitVAArg(const VAArgInst &I);
   void visitVAEnd(const CallInst &I);
   void visitVACopy(const CallInst &I);
+  void visitStackmap(const CallInst &I);
+  void visitPatchpoint(const CallInst &I);
 
   void visitUserOp1(const Instruction &I) {
     llvm_unreachable("UserOp1 should not exist at instruction selection time!");
@@ -543,10 +770,13 @@ private:
     llvm_unreachable("UserOp2 should not exist at instruction selection time!");
   }
 
+  void processIntegerCallValue(const Instruction &I,
+                               SDValue Value, bool IsSigned);
+
   void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
 
   /// EmitFuncArgumentDbgValue - If V is an function argument then create
-  /// corresponding DBG_VALUE machine instruction for it now. At the end of 
+  /// corresponding DBG_VALUE machine instruction for it now. At the end of
   /// instruction selection, they will be inserted to the entry BB.
   bool EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
                                 int64_t Offset, const SDValue &N);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index d8ee221..c04a08d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -142,6 +142,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FCEIL:                      return "fceil";
   case ISD::FRINT:                      return "frint";
   case ISD::FNEARBYINT:                 return "fnearbyint";
+  case ISD::FROUND:                     return "fround";
   case ISD::FEXP:                       return "fexp";
   case ISD::FEXP2:                      return "fexp2";
   case ISD::FLOG:                       return "flog";
@@ -223,6 +224,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FP_TO_SINT:                 return "fp_to_sint";
   case ISD::FP_TO_UINT:                 return "fp_to_uint";
   case ISD::BITCAST:                    return "bitcast";
+  case ISD::ADDRSPACECAST:              return "addrspacecast";
   case ISD::FP16_TO_FP32:               return "fp16_to_fp32";
   case ISD::FP32_TO_FP16:               return "fp32_to_fp16";
 
@@ -484,6 +486,13 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
       OS << " " << offset;
     if (unsigned int TF = BA->getTargetFlags())
       OS << " [TF=" << TF << ']';
+  } else if (const AddrSpaceCastSDNode *ASC =
+               dyn_cast<AddrSpaceCastSDNode>(this)) {
+    OS << '['
+       << ASC->getSrcAddressSpace()
+       << " -> "
+       << ASC->getDestAddressSpace()
+       << ']';
   }
 
   if (unsigned Order = getIROrder())
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 01da51c..3a0cfa1 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -223,6 +223,44 @@ defaultListDAGScheduler("default", "Best scheduler for the target",
 
 namespace llvm {
   //===--------------------------------------------------------------------===//
+  /// \brief This class is used by SelectionDAGISel to temporarily override
+  /// the optimization level on a per-function basis.
+  class OptLevelChanger {
+    SelectionDAGISel &IS;
+    CodeGenOpt::Level SavedOptLevel;
+    bool SavedFastISel;
+
+  public:
+    OptLevelChanger(SelectionDAGISel &ISel,
+                    CodeGenOpt::Level NewOptLevel) : IS(ISel) {
+      SavedOptLevel = IS.OptLevel;
+      if (NewOptLevel == SavedOptLevel)
+        return;
+      IS.OptLevel = NewOptLevel;
+      IS.TM.setOptLevel(NewOptLevel);
+      SavedFastISel = IS.TM.Options.EnableFastISel;
+      if (NewOptLevel == CodeGenOpt::None)
+        IS.TM.setFastISel(true);
+      DEBUG(dbgs() << "\nChanging optimization level for Function "
+            << IS.MF->getFunction()->getName() << "\n");
+      DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel
+            << " ; After: -O" << NewOptLevel << "\n");
+    }
+
+    ~OptLevelChanger() {
+      if (IS.OptLevel == SavedOptLevel)
+        return;
+      DEBUG(dbgs() << "\nRestoring optimization level for Function "
+            << IS.MF->getFunction()->getName() << "\n");
+      DEBUG(dbgs() << "\tBefore: -O" << IS.OptLevel
+            << " ; After: -O" << SavedOptLevel << "\n");
+      IS.OptLevel = SavedOptLevel;
+      IS.TM.setOptLevel(SavedOptLevel);
+      IS.TM.setFastISel(SavedFastISel);
+    }
+  };
+
+  //===--------------------------------------------------------------------===//
   /// createDefaultScheduler - This creates an instruction scheduler appropriate
   /// for the target.
   ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS,
@@ -230,7 +268,7 @@ namespace llvm {
     const TargetLowering *TLI = IS->getTargetLowering();
     const TargetSubtargetInfo &ST = IS->TM.getSubtarget<TargetSubtargetInfo>();
 
-    if (OptLevel == CodeGenOpt::None || ST.enableMachineScheduler() ||
+    if (OptLevel == CodeGenOpt::None || ST.useMachineScheduler() ||
         TLI->getSchedulingPreference() == Sched::Source)
       return createSourceListDAGScheduler(IS, OptLevel);
     if (TLI->getSchedulingPreference() == Sched::RegPressure)
@@ -356,6 +394,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   const Function &Fn = *mf.getFunction();
   const TargetInstrInfo &TII = *TM.getInstrInfo();
   const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+  const TargetLowering *TLI = TM.getTargetLowering();
 
   MF = &mf;
   RegInfo = &MF->getRegInfo();
@@ -369,11 +408,17 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   ST.resetSubtargetFeatures(MF);
   TM.resetTargetOptions(MF);
 
+  // Reset OptLevel to None for optnone functions.
+  CodeGenOpt::Level NewOptLevel = OptLevel;
+  if (Fn.hasFnAttribute(Attribute::OptimizeNone))
+    NewOptLevel = CodeGenOpt::None;
+  OptLevelChanger OLC(*this, NewOptLevel);
+
   DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
 
   SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), this);
 
-  CurDAG->init(*MF, TTI);
+  CurDAG->init(*MF, TTI, TLI);
   FuncInfo->set(Fn, *MF);
 
   if (UseMBPI && OptLevel != CodeGenOpt::None)
@@ -408,9 +453,13 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       EntryMBB->insert(EntryMBB->begin(), MI);
     else {
       MachineInstr *Def = RegInfo->getVRegDef(Reg);
-      MachineBasicBlock::iterator InsertPos = Def;
-      // FIXME: VR def may not be in entry block.
-      Def->getParent()->insert(llvm::next(InsertPos), MI);
+      if (Def) {
+        MachineBasicBlock::iterator InsertPos = Def;
+        // FIXME: VR def may not be in entry block.
+        Def->getParent()->insert(llvm::next(InsertPos), MI);
+      } else
+        DEBUG(dbgs() << "Dropping debug info for dead vreg"
+              << TargetRegisterInfo::virtReg2Index(Reg) << "\n");
     }
 
     // If Reg is live-in then update debug info to track its copy in a vreg.
@@ -422,7 +471,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       MachineBasicBlock::iterator InsertPos = Def;
       const MDNode *Variable =
         MI->getOperand(MI->getNumOperands()-1).getMetadata();
-      bool IsIndirect = MI->getOperand(1).isImm();
+      bool IsIndirect = MI->isIndirectDebugValue();
       unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
       // Def is never a terminator here, so it is ok to increment InsertPos.
       BuildMI(*EntryMBB, ++InsertPos, MI->getDebugLoc(),
@@ -497,6 +546,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       if (J == E) break;
       To = J->second;
     }
+    // Make sure the new register has a sufficiently constrained register class.
+    if (TargetRegisterInfo::isVirtualRegister(From) &&
+        TargetRegisterInfo::isVirtualRegister(To))
+      MRI.constrainRegClass(To, MRI.getRegClass(From));
     // Replace it.
     MRI.replaceRegWith(From, To);
   }
@@ -617,6 +670,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
+  CurDAG->NewNodesMustHaveLegalTypes = true;
+
   if (Changed) {
     if (ViewDAGCombineLT)
       CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
@@ -1140,6 +1195,91 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
 
   delete FastIS;
   SDB->clearDanglingDebugInfo();
+  SDB->SPDescriptor.resetPerFunctionState();
+}
+
+/// Given that the input MI is before a partial terminator sequence TSeq, return
+/// true if M + TSeq also a partial terminator sequence.
+///
+/// A Terminator sequence is a sequence of MachineInstrs which at this point in
+/// lowering copy vregs into physical registers, which are then passed into
+/// terminator instructors so we can satisfy ABI constraints. A partial
+/// terminator sequence is an improper subset of a terminator sequence (i.e. it
+/// may be the whole terminator sequence).
+static bool MIIsInTerminatorSequence(const MachineInstr *MI) {
+  // If we do not have a copy or an implicit def, we return true if and only if
+  // MI is a debug value.
+  if (!MI->isCopy() && !MI->isImplicitDef())
+    // Sometimes DBG_VALUE MI sneak in between the copies from the vregs to the
+    // physical registers if there is debug info associated with the terminator
+    // of our mbb. We want to include said debug info in our terminator
+    // sequence, so we return true in that case.
+    return MI->isDebugValue();
+
+  // We have left the terminator sequence if we are not doing one of the
+  // following:
+  //
+  // 1. Copying a vreg into a physical register.
+  // 2. Copying a vreg into a vreg.
+  // 3. Defining a register via an implicit def.
+
+  // OPI should always be a register definition...
+  MachineInstr::const_mop_iterator OPI = MI->operands_begin();
+  if (!OPI->isReg() || !OPI->isDef())
+    return false;
+
+  // Defining any register via an implicit def is always ok.
+  if (MI->isImplicitDef())
+    return true;
+
+  // Grab the copy source...
+  MachineInstr::const_mop_iterator OPI2 = OPI;
+  ++OPI2;
+  assert(OPI2 != MI->operands_end()
+         && "Should have a copy implying we should have 2 arguments.");
+
+  // Make sure that the copy dest is not a vreg when the copy source is a
+  // physical register.
+  if (!OPI2->isReg() ||
+      (!TargetRegisterInfo::isPhysicalRegister(OPI->getReg()) &&
+       TargetRegisterInfo::isPhysicalRegister(OPI2->getReg())))
+    return false;
+
+  return true;
+}
+
+/// Find the split point at which to splice the end of BB into its success stack
+/// protector check machine basic block.
+///
+/// On many platforms, due to ABI constraints, terminators, even before register
+/// allocation, use physical registers. This creates an issue for us since
+/// physical registers at this point can not travel across basic
+/// blocks. Luckily, selectiondag always moves physical registers into vregs
+/// when they enter functions and moves them through a sequence of copies back
+/// into the physical registers right before the terminator creating a
+/// ``Terminator Sequence''. This function is searching for the beginning of the
+/// terminator sequence so that we can ensure that we splice off not just the
+/// terminator, but additionally the copies that move the vregs into the
+/// physical registers.
+static MachineBasicBlock::iterator
+FindSplitPointForStackProtector(MachineBasicBlock *BB, DebugLoc DL) {
+  MachineBasicBlock::iterator SplitPoint = BB->getFirstTerminator();
+  //
+  if (SplitPoint == BB->begin())
+    return SplitPoint;
+
+  MachineBasicBlock::iterator Start = BB->begin();
+  MachineBasicBlock::iterator Previous = SplitPoint;
+  --Previous;
+
+  while (MIIsInTerminatorSequence(Previous)) {
+    SplitPoint = Previous;
+    if (Previous == Start)
+      break;
+    --Previous;
+  }
+
+  return SplitPoint;
 }
 
 void
@@ -1152,11 +1292,13 @@ SelectionDAGISel::FinishBasicBlock() {
                  << FuncInfo->PHINodesToUpdate[i].first
                  << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n");
 
+  const bool MustUpdatePHINodes = SDB->SwitchCases.empty() &&
+                                  SDB->JTCases.empty() &&
+                                  SDB->BitTestCases.empty();
+
   // Next, now that we know what the last MBB the LLVM BB expanded is, update
   // PHI nodes in successors.
-  if (SDB->SwitchCases.empty() &&
-      SDB->JTCases.empty() &&
-      SDB->BitTestCases.empty()) {
+  if (MustUpdatePHINodes) {
     for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
       MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first);
       assert(PHI->isPHI() &&
@@ -1165,9 +1307,54 @@ SelectionDAGISel::FinishBasicBlock() {
         continue;
       PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB);
     }
-    return;
   }
 
+  // Handle stack protector.
+  if (SDB->SPDescriptor.shouldEmitStackProtector()) {
+    MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB();
+    MachineBasicBlock *SuccessMBB = SDB->SPDescriptor.getSuccessMBB();
+
+    // Find the split point to split the parent mbb. At the same time copy all
+    // physical registers used in the tail of parent mbb into virtual registers
+    // before the split point and back into physical registers after the split
+    // point. This prevents us needing to deal with Live-ins and many other
+    // register allocation issues caused by us splitting the parent mbb. The
+    // register allocator will clean up said virtual copies later on.
+    MachineBasicBlock::iterator SplitPoint =
+      FindSplitPointForStackProtector(ParentMBB, SDB->getCurDebugLoc());
+
+    // Splice the terminator of ParentMBB into SuccessMBB.
+    SuccessMBB->splice(SuccessMBB->end(), ParentMBB,
+                       SplitPoint,
+                       ParentMBB->end());
+
+    // Add compare/jump on neq/jump to the parent BB.
+    FuncInfo->MBB = ParentMBB;
+    FuncInfo->InsertPt = ParentMBB->end();
+    SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
+    CurDAG->setRoot(SDB->getRoot());
+    SDB->clear();
+    CodeGenAndEmitDAG();
+
+    // CodeGen Failure MBB if we have not codegened it yet.
+    MachineBasicBlock *FailureMBB = SDB->SPDescriptor.getFailureMBB();
+    if (!FailureMBB->size()) {
+      FuncInfo->MBB = FailureMBB;
+      FuncInfo->InsertPt = FailureMBB->end();
+      SDB->visitSPDescriptorFailure(SDB->SPDescriptor);
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+    }
+
+    // Clear the Per-BB State.
+    SDB->SPDescriptor.resetPerBBState();
+  }
+
+  // If we updated PHI Nodes, return early.
+  if (MustUpdatePHINodes)
+    return;
+
   for (unsigned i = 0, e = SDB->BitTestCases.size(); i != e; ++i) {
     // Lower header first, if it wasn't already lowered
     if (!SDB->BitTestCases[i].Emitted) {
@@ -1741,15 +1928,15 @@ WalkChainUsers(const SDNode *ChainedNode,
 
     SDNode *User = *UI;
 
+    if (User->getOpcode() == ISD::HANDLENODE)  // Root of the graph.
+      continue;
+
     // If we see an already-selected machine node, then we've gone beyond the
     // pattern that we're selecting down into the already selected chunk of the
     // DAG.
-    if (User->isMachineOpcode() ||
-        User->getOpcode() == ISD::HANDLENODE)  // Root of the graph.
-      continue;
-
     unsigned UserOpcode = User->getOpcode();
-    if (UserOpcode == ISD::CopyToReg ||
+    if (User->isMachineOpcode() ||
+        UserOpcode == ISD::CopyToReg ||
         UserOpcode == ISD::CopyFromReg ||
         UserOpcode == ISD::INLINEASM ||
         UserOpcode == ISD::EH_LABEL ||
@@ -1886,7 +2073,6 @@ HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
     }
   }
 
-  SDValue Res;
   if (InputChains.size() == 1)
     return InputChains[0];
   return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]),
@@ -1962,6 +2148,18 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
   return N == RecordedNodes[RecNo].first;
 }
 
+/// CheckChildSame - Implements OP_CheckChildXSame.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+             SDValue N,
+             const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes,
+             unsigned ChildNo) {
+  if (ChildNo >= N.getNumOperands())
+    return false;  // Match fails if out of range child #.
+  return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo),
+                     RecordedNodes);
+}
+
 /// CheckPatternPredicate - Implements OP_CheckPatternPredicate.
 LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
 CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
@@ -2076,6 +2274,13 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
   case SelectionDAGISel::OPC_CheckSame:
     Result = !::CheckSame(Table, Index, N, RecordedNodes);
     return Index;
+  case SelectionDAGISel::OPC_CheckChild0Same:
+  case SelectionDAGISel::OPC_CheckChild1Same:
+  case SelectionDAGISel::OPC_CheckChild2Same:
+  case SelectionDAGISel::OPC_CheckChild3Same:
+    Result = !::CheckChildSame(Table, Index, N, RecordedNodes,
+                        Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Same);
+    return Index;
   case SelectionDAGISel::OPC_CheckPatternPredicate:
     Result = !::CheckPatternPredicate(Table, Index, SDISel);
     return Index;
@@ -2373,6 +2578,14 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
     case OPC_CheckSame:
       if (!::CheckSame(MatcherTable, MatcherIndex, N, RecordedNodes)) break;
       continue;
+
+    case OPC_CheckChild0Same: case OPC_CheckChild1Same:
+    case OPC_CheckChild2Same: case OPC_CheckChild3Same:
+      if (!::CheckChildSame(MatcherTable, MatcherIndex, N, RecordedNodes,
+                            Opcode-OPC_CheckChild0Same))
+        break;
+      continue;
+
     case OPC_CheckPatternPredicate:
       if (!::CheckPatternPredicate(MatcherTable, MatcherIndex, *this)) break;
       continue;
@@ -2432,7 +2645,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
     }
 
     case OPC_SwitchType: {
-      MVT CurNodeVT = N.getValueType().getSimpleVT();
+      MVT CurNodeVT = N.getSimpleValueType();
       unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
       unsigned CaseSize;
       while (1) {
@@ -2544,7 +2757,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
     case OPC_EmitConvertToTarget:  {
       // Convert from IMM/FPIMM to target version.
       unsigned RecNo = MatcherTable[MatcherIndex++];
-      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitConvertToTarget");
       SDValue Imm = RecordedNodes[RecNo].first;
 
       if (Imm->getOpcode() == ISD::Constant) {
@@ -2569,7 +2782,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
 
       // Read all of the chained nodes.
       unsigned RecNo = Opcode == OPC_EmitMergeInputChains1_1;
-      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
       ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
 
       // FIXME: What if other value results of the node have uses not matched
@@ -2606,7 +2819,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       // Read all of the chained nodes.
       for (unsigned i = 0; i != NumChains; ++i) {
         unsigned RecNo = MatcherTable[MatcherIndex++];
-        assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+        assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
         ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
 
         // FIXME: What if other value results of the node have uses not matched
@@ -2633,7 +2846,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
 
     case OPC_EmitCopyToReg: {
       unsigned RecNo = MatcherTable[MatcherIndex++];
-      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitCopyToReg");
       unsigned DestPhysReg = MatcherTable[MatcherIndex++];
 
       if (InputChain.getNode() == 0)
@@ -2650,7 +2863,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
     case OPC_EmitNodeXForm: {
       unsigned XFormNo = MatcherTable[MatcherIndex++];
       unsigned RecNo = MatcherTable[MatcherIndex++];
-      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitNodeXForm");
       SDValue Res = RunSDNodeXForm(RecordedNodes[RecNo].first, XFormNo);
       RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, (SDNode*) 0));
       continue;
@@ -2827,7 +3040,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
         if (RecNo & 128)
           RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex);
 
-        assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+        assert(RecNo < RecordedNodes.size() && "Invalid MarkGlueResults");
         GlueResultNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
       }
       continue;
@@ -2844,7 +3057,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
         if (ResSlot & 128)
           ResSlot = GetVBR(ResSlot, MatcherTable, MatcherIndex);
 
-        assert(ResSlot < RecordedNodes.size() && "Invalid CheckSame");
+        assert(ResSlot < RecordedNodes.size() && "Invalid CompleteMatch");
         SDValue Res = RecordedNodes[ResSlot].first;
 
         assert(i < NodeToMatch->getNumValues() &&
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e3c6306..82b068d 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -64,13 +64,29 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
   return isUsedByReturnOnly(Node, Chain);
 }
 
+/// \brief Set CallLoweringInfo attribute flags based on a call instruction
+/// and called function attributes.
+void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS,
+                                                 unsigned AttrIdx) {
+  isSExt     = CS->paramHasAttr(AttrIdx, Attribute::SExt);
+  isZExt     = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
+  isInReg    = CS->paramHasAttr(AttrIdx, Attribute::InReg);
+  isSRet     = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
+  isNest     = CS->paramHasAttr(AttrIdx, Attribute::Nest);
+  isByVal    = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
+  isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
+  Alignment  = CS->getParamAlignment(AttrIdx);
+}
 
 /// Generate a libcall taking the given operands as arguments and returning a
 /// result of type RetVT.
-SDValue TargetLowering::makeLibCall(SelectionDAG &DAG,
-                                    RTLIB::Libcall LC, EVT RetVT,
-                                    const SDValue *Ops, unsigned NumOps,
-                                    bool isSigned, SDLoc dl) const {
+std::pair<SDValue, SDValue>
+TargetLowering::makeLibCall(SelectionDAG &DAG,
+                            RTLIB::Libcall LC, EVT RetVT,
+                            const SDValue *Ops, unsigned NumOps,
+                            bool isSigned, SDLoc dl,
+                            bool doesNotReturn,
+                            bool isReturnValueUsed) const {
   TargetLowering::ArgListTy Args;
   Args.reserve(NumOps);
 
@@ -89,11 +105,9 @@ SDValue TargetLowering::makeLibCall(SelectionDAG &DAG,
   CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
                     false, 0, getLibcallCallingConv(LC),
                     /*isTailCall=*/false,
-                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
-                    Callee, Args, DAG, dl);
-  std::pair<SDValue,SDValue> CallInfo = LowerCallTo(CLI);
-
-  return CallInfo.first;
+                    doesNotReturn, isReturnValueUsed, Callee, Args,
+                    DAG, dl);
+  return LowerCallTo(CLI);
 }
 
 
@@ -183,14 +197,16 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
   // Use the target specific return value for comparions lib calls.
   EVT RetVT = getCmpLibcallReturnType();
   SDValue Ops[2] = { NewLHS, NewRHS };
-  NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+  NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, 2, false/*sign irrelevant*/,
+                       dl).first;
   NewRHS = DAG.getConstant(0, RetVT);
   CCCode = getCmpLibcallCC(LC1);
   if (LC2 != RTLIB::UNKNOWN_LIBCALL) {
     SDValue Tmp = DAG.getNode(ISD::SETCC, dl,
                               getSetCCResultType(*DAG.getContext(), RetVT),
                               NewLHS, NewRHS, DAG.getCondCode(CCCode));
-    NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+    NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/,
+                         dl).first;
     NewLHS = DAG.getNode(ISD::SETCC, dl,
                          getSetCCResultType(*DAG.getContext(), RetVT), NewLHS,
                          NewRHS, DAG.getCondCode(getCmpLibcallCC(LC2)));
@@ -632,6 +648,31 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
                           TLO.DAG.getNode(ISD::ANY_EXTEND, dl, Op.getValueType(),
                                           NarrowShl));
         }
+        // Repeat the SHL optimization above in cases where an extension
+        // intervenes: (shl (anyext (shr x, c1)), c2) to
+        // (shl (anyext x), c2-c1).  This requires that the bottom c1 bits
+        // aren't demanded (as above) and that the shifted upper c1 bits of
+        // x aren't demanded.
+        if (InOp.hasOneUse() &&
+            InnerOp.getOpcode() == ISD::SRL &&
+            InnerOp.hasOneUse() &&
+            isa<ConstantSDNode>(InnerOp.getOperand(1))) {
+          uint64_t InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1))
+            ->getZExtValue();
+          if (InnerShAmt < ShAmt &&
+              InnerShAmt < InnerBits &&
+              NewMask.lshr(InnerBits - InnerShAmt + ShAmt) == 0 &&
+              NewMask.trunc(ShAmt) == 0) {
+            SDValue NewSA =
+              TLO.DAG.getConstant(ShAmt - InnerShAmt,
+                                  Op.getOperand(1).getValueType());
+            EVT VT = Op.getValueType();
+            SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
+                                             InnerOp.getOperand(0));
+            return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT,
+                                                     NewExt, NewSA));
+          }
+        }
       }
 
       KnownZero <<= SA->getZExtValue();
@@ -722,13 +763,24 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
 
       // If the input sign bit is known to be zero, or if none of the top bits
       // are demanded, turn this into an unsigned shift right.
-      if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits) {
+      if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits)
         return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT,
                                                  Op.getOperand(0),
                                                  Op.getOperand(1)));
-      } else if (KnownOne.intersects(SignBit)) { // New bits are known one.
-        KnownOne |= HighBits;
+
+      int Log2 = NewMask.exactLogBase2();
+      if (Log2 >= 0) {
+        // The bit must come from the sign.
+        SDValue NewSA =
+          TLO.DAG.getConstant(BitWidth - 1 - Log2,
+                              Op.getOperand(1).getValueType());
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT,
+                                                 Op.getOperand(0), NewSA));
       }
+
+      if (KnownOne.intersects(SignBit))
+        // New bits are known one.
+        KnownOne |= HighBits;
     }
     break;
   case ISD::SIGN_EXTEND_INREG: {
@@ -1077,13 +1129,20 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
   case ISD::SETFALSE:
   case ISD::SETFALSE2: return DAG.getConstant(0, VT);
   case ISD::SETTRUE:
-  case ISD::SETTRUE2:  return DAG.getConstant(1, VT);
+  case ISD::SETTRUE2: {
+    TargetLowering::BooleanContent Cnt = getBooleanContents(VT.isVector());
+    return DAG.getConstant(
+        Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, VT);
+  }
   }
 
   // Ensure that the constant occurs on the RHS, and fold constant
   // comparisons.
-  if (isa<ConstantSDNode>(N0.getNode()))
-    return DAG.getSetCC(dl, VT, N1, N0, ISD::getSetCCSwappedOperands(Cond));
+  ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond);
+  if (isa<ConstantSDNode>(N0.getNode()) &&
+      (DCI.isBeforeLegalizeOps() ||
+       isCondCodeLegal(SwappedCC, N0.getSimpleValueType())))
+    return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
 
   if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
     const APInt &C1 = N1C->getAPIntValue();
@@ -1178,6 +1237,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
     // the test is for equality or unsigned, and all 1 bits of the const are
     // in the same partial word, see if we can shorten the load.
     if (DCI.isBeforeLegalize() &&
+        !ISD::isSignedIntSetCC(Cond) &&
         N0.getOpcode() == ISD::AND && C1 == 0 &&
         N0.getNode()->hasOneUse() &&
         isa<LoadSDNode>(N0.getOperand(0)) &&
@@ -1322,7 +1382,9 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
         ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
         CC = ISD::getSetCCInverse(CC,
                                   N0.getOperand(0).getValueType().isInteger());
-        return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+        if (DCI.isBeforeLegalizeOps() ||
+            isCondCodeLegal(CC, N0.getOperand(0).getSimpleValueType()))
+          return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
       }
 
       if ((N0.getOpcode() == ISD::XOR ||
@@ -1759,16 +1821,22 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
       if (N0.getOperand(0) == N1 || N0.getOperand(1) == N1) {
         if (ValueHasExactlyOneBitSet(N1, DAG)) {
           Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
-          SDValue Zero = DAG.getConstant(0, N1.getValueType());
-          return DAG.getSetCC(dl, VT, N0, Zero, Cond);
+          if (DCI.isBeforeLegalizeOps() ||
+              isCondCodeLegal(Cond, N0.getSimpleValueType())) {
+            SDValue Zero = DAG.getConstant(0, N1.getValueType());
+            return DAG.getSetCC(dl, VT, N0, Zero, Cond);
+          }
         }
       }
     if (N1.getOpcode() == ISD::AND)
       if (N1.getOperand(0) == N0 || N1.getOperand(1) == N0) {
         if (ValueHasExactlyOneBitSet(N0, DAG)) {
           Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
-          SDValue Zero = DAG.getConstant(0, N0.getValueType());
-          return DAG.getSetCC(dl, VT, N1, Zero, Cond);
+          if (DCI.isBeforeLegalizeOps() ||
+              isCondCodeLegal(Cond, N1.getSimpleValueType())) {
+            SDValue Zero = DAG.getConstant(0, N0.getValueType());
+            return DAG.getSetCC(dl, VT, N1, Zero, Cond);
+          }
         }
       }
   }
@@ -1993,7 +2061,7 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 std::pair<unsigned, const TargetRegisterClass*> TargetLowering::
 getRegForInlineAsmConstraint(const std::string &Constraint,
                              MVT VT) const {
-  if (Constraint[0] != '{')
+  if (Constraint.empty() || Constraint[0] != '{')
     return std::make_pair(0u, static_cast<TargetRegisterClass*>(0));
   assert(*(Constraint.end()-1) == '}' && "Not a brace enclosed constraint?");
 
@@ -2142,8 +2210,9 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
           break;
         }
       } else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) {
-        OpInfo.ConstraintVT = MVT::getIntegerVT(
-            8*getDataLayout()->getPointerSize(PT->getAddressSpace()));
+        unsigned PtrSize
+          = getDataLayout()->getPointerSizeInBits(PT->getAddressSpace());
+        OpInfo.ConstraintVT = MVT::getIntegerVT(PtrSize);
       } else {
         OpInfo.ConstraintVT = MVT::getVT(OpTy, true);
       }
diff --git a/lib/CodeGen/ShrinkWrapping.cpp b/lib/CodeGen/ShrinkWrapping.cpp
deleted file mode 100644
index 6c826de..0000000
--- a/lib/CodeGen/ShrinkWrapping.cpp
+++ /dev/null
@@ -1,1152 +0,0 @@
-//===-- ShrinkWrapping.cpp - Reduce spills/restores of callee-saved regs --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a shrink wrapping variant of prolog/epilog insertion:
-// - Spills and restores of callee-saved registers (CSRs) are placed in the
-//   machine CFG to tightly surround their uses so that execution paths that
-//   do not use CSRs do not pay the spill/restore penalty.
-//
-// - Avoiding placment of spills/restores in loops: if a CSR is used inside a
-//   loop the spills are placed in the loop preheader, and restores are
-//   placed in the loop exit nodes (the successors of loop _exiting_ nodes).
-//
-// - Covering paths without CSR uses:
-//   If a region in a CFG uses CSRs and has multiple entry and/or exit points,
-//   the use info for the CSRs inside the region is propagated outward in the
-//   CFG to ensure validity of the spill/restore placements. This decreases
-//   the effectiveness of shrink wrapping but does not require edge splitting
-//   in the machine CFG.
-//
-// This shrink wrapping implementation uses an iterative analysis to determine
-// which basic blocks require spills and restores for CSRs.
-//
-// This pass uses MachineDominators and MachineLoopInfo. Loop information
-// is used to prevent placement of callee-saved register spills/restores
-// in the bodies of loops.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "shrink-wrap"
-
-#include "PrologEpilogInserter.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include <sstream>
-
-using namespace llvm;
-
-STATISTIC(numSRReduced, "Number of CSR spills+restores reduced.");
-
-// Shrink Wrapping:
-static cl::opt<bool>
-ShrinkWrapping("shrink-wrap",
-               cl::desc("Shrink wrap callee-saved register spills/restores"));
-
-// Shrink wrap only the specified function, a debugging aid.
-static cl::opt<std::string>
-ShrinkWrapFunc("shrink-wrap-func", cl::Hidden,
-               cl::desc("Shrink wrap the specified function"),
-               cl::value_desc("funcname"),
-               cl::init(""));
-
-// Debugging level for shrink wrapping.
-enum ShrinkWrapDebugLevel {
-  Disabled, BasicInfo, Iterations, Details
-};
-
-static cl::opt<enum ShrinkWrapDebugLevel>
-ShrinkWrapDebugging("shrink-wrap-dbg", cl::Hidden,
-  cl::desc("Print shrink wrapping debugging information"),
-  cl::values(
-    clEnumVal(Disabled  , "disable debug output"),
-    clEnumVal(BasicInfo , "print basic DF sets"),
-    clEnumVal(Iterations, "print SR sets for each iteration"),
-    clEnumVal(Details   , "print all DF sets"),
-    clEnumValEnd));
-
-
-void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.setPreservesCFG();
-  if (ShrinkWrapping || ShrinkWrapFunc != "") {
-    AU.addRequired<MachineLoopInfo>();
-    AU.addRequired<MachineDominatorTree>();
-  }
-  AU.addPreserved<MachineLoopInfo>();
-  AU.addPreserved<MachineDominatorTree>();
-  AU.addRequired<TargetPassConfig>();
-  MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-//===----------------------------------------------------------------------===//
-//  ShrinkWrapping implementation
-//===----------------------------------------------------------------------===//
-
-// Convienences for dealing with machine loops.
-MachineBasicBlock* PEI::getTopLevelLoopPreheader(MachineLoop* LP) {
-  assert(LP && "Machine loop is NULL.");
-  MachineBasicBlock* PHDR = LP->getLoopPreheader();
-  MachineLoop* PLP = LP->getParentLoop();
-  while (PLP) {
-    PHDR = PLP->getLoopPreheader();
-    PLP = PLP->getParentLoop();
-  }
-  return PHDR;
-}
-
-MachineLoop* PEI::getTopLevelLoopParent(MachineLoop *LP) {
-  if (LP == 0)
-    return 0;
-  MachineLoop* PLP = LP->getParentLoop();
-  while (PLP) {
-    LP = PLP;
-    PLP = PLP->getParentLoop();
-  }
-  return LP;
-}
-
-bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
-  return (MBB && !MBB->empty() && MBB->back().isReturn());
-}
-
-// Initialize shrink wrapping DFA sets, called before iterations.
-void PEI::clearAnticAvailSets() {
-  AnticIn.clear();
-  AnticOut.clear();
-  AvailIn.clear();
-  AvailOut.clear();
-}
-
-// Clear all sets constructed by shrink wrapping.
-void PEI::clearAllSets() {
-  ReturnBlocks.clear();
-  clearAnticAvailSets();
-  UsedCSRegs.clear();
-  CSRUsed.clear();
-  TLLoops.clear();
-  CSRSave.clear();
-  CSRRestore.clear();
-}
-
-// Initialize all shrink wrapping data.
-void PEI::initShrinkWrappingInfo() {
-  clearAllSets();
-  EntryBlock = 0;
-#ifndef NDEBUG
-  HasFastExitPath = false;
-#endif
-  ShrinkWrapThisFunction = ShrinkWrapping;
-  // DEBUG: enable or disable shrink wrapping for the current function
-  // via --shrink-wrap-func=<funcname>.
-#ifndef NDEBUG
-  if (ShrinkWrapFunc != "") {
-    std::string MFName = MF->getName().str();
-    ShrinkWrapThisFunction = (MFName == ShrinkWrapFunc);
-  }
-#endif
-}
-
-
-/// placeCSRSpillsAndRestores - determine which MBBs of the function
-/// need save, restore code for callee-saved registers by doing a DF analysis
-/// similar to the one used in code motion (GVNPRE). This produces maps of MBBs
-/// to sets of registers (CSRs) for saves and restores. MachineLoopInfo
-/// is used to ensure that CSR save/restore code is not placed inside loops.
-/// This function computes the maps of MBBs -> CSRs to spill and restore
-/// in CSRSave, CSRRestore.
-///
-/// If shrink wrapping is not being performed, place all spills in
-/// the entry block, all restores in return blocks. In this case,
-/// CSRSave has a single mapping, CSRRestore has mappings for each
-/// return block.
-///
-void PEI::placeCSRSpillsAndRestores(MachineFunction &Fn) {
-
-  DEBUG(MF = &Fn);
-
-  initShrinkWrappingInfo();
-
-  DEBUG(if (ShrinkWrapThisFunction) {
-      dbgs() << "Place CSR spills/restores for "
-             << MF->getName() << "\n";
-    });
-
-  if (calculateSets(Fn))
-    placeSpillsAndRestores(Fn);
-}
-
-/// calcAnticInOut - calculate the anticipated in/out reg sets
-/// for the given MBB by looking forward in the MCFG at MBB's
-/// successors.
-///
-bool PEI::calcAnticInOut(MachineBasicBlock* MBB) {
-  bool changed = false;
-
-  // AnticOut[MBB] = INTERSECT(AnticIn[S] for S in SUCCESSORS(MBB))
-  SmallVector<MachineBasicBlock*, 4> successors;
-  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-         SE = MBB->succ_end(); SI != SE; ++SI) {
-    MachineBasicBlock* SUCC = *SI;
-    if (SUCC != MBB)
-      successors.push_back(SUCC);
-  }
-
-  unsigned i = 0, e = successors.size();
-  if (i != e) {
-    CSRegSet prevAnticOut = AnticOut[MBB];
-    MachineBasicBlock* SUCC = successors[i];
-
-    AnticOut[MBB] = AnticIn[SUCC];
-    for (++i; i != e; ++i) {
-      SUCC = successors[i];
-      AnticOut[MBB] &= AnticIn[SUCC];
-    }
-    if (prevAnticOut != AnticOut[MBB])
-      changed = true;
-  }
-
-  // AnticIn[MBB] = UNION(CSRUsed[MBB], AnticOut[MBB]);
-  CSRegSet prevAnticIn = AnticIn[MBB];
-  AnticIn[MBB] = CSRUsed[MBB] | AnticOut[MBB];
-  if (prevAnticIn != AnticIn[MBB])
-    changed = true;
-  return changed;
-}
-
-/// calcAvailInOut - calculate the available in/out reg sets
-/// for the given MBB by looking backward in the MCFG at MBB's
-/// predecessors.
-///
-bool PEI::calcAvailInOut(MachineBasicBlock* MBB) {
-  bool changed = false;
-
-  // AvailIn[MBB] = INTERSECT(AvailOut[P] for P in PREDECESSORS(MBB))
-  SmallVector<MachineBasicBlock*, 4> predecessors;
-  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
-         PE = MBB->pred_end(); PI != PE; ++PI) {
-    MachineBasicBlock* PRED = *PI;
-    if (PRED != MBB)
-      predecessors.push_back(PRED);
-  }
-
-  unsigned i = 0, e = predecessors.size();
-  if (i != e) {
-    CSRegSet prevAvailIn = AvailIn[MBB];
-    MachineBasicBlock* PRED = predecessors[i];
-
-    AvailIn[MBB] = AvailOut[PRED];
-    for (++i; i != e; ++i) {
-      PRED = predecessors[i];
-      AvailIn[MBB] &= AvailOut[PRED];
-    }
-    if (prevAvailIn != AvailIn[MBB])
-      changed = true;
-  }
-
-  // AvailOut[MBB] = UNION(CSRUsed[MBB], AvailIn[MBB]);
-  CSRegSet prevAvailOut = AvailOut[MBB];
-  AvailOut[MBB] = CSRUsed[MBB] | AvailIn[MBB];
-  if (prevAvailOut != AvailOut[MBB])
-    changed = true;
-  return changed;
-}
-
-/// calculateAnticAvail - build the sets anticipated and available
-/// registers in the MCFG of the current function iteratively,
-/// doing a combined forward and backward analysis.
-///
-void PEI::calculateAnticAvail(MachineFunction &Fn) {
-  // Initialize data flow sets.
-  clearAnticAvailSets();
-
-  // Calculate Antic{In,Out} and Avail{In,Out} iteratively on the MCFG.
-  bool changed = true;
-  unsigned iterations = 0;
-  while (changed) {
-    changed = false;
-    ++iterations;
-    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
-         MBBI != MBBE; ++MBBI) {
-      MachineBasicBlock* MBB = MBBI;
-
-      // Calculate anticipated in, out regs at MBB from
-      // anticipated at successors of MBB.
-      changed |= calcAnticInOut(MBB);
-
-      // Calculate available in, out regs at MBB from
-      // available at predecessors of MBB.
-      changed |= calcAvailInOut(MBB);
-    }
-  }
-
-  DEBUG({
-      if (ShrinkWrapDebugging >= Details) {
-        dbgs()
-          << "-----------------------------------------------------------\n"
-          << " Antic/Avail Sets:\n"
-          << "-----------------------------------------------------------\n"
-          << "iterations = " << iterations << "\n"
-          << "-----------------------------------------------------------\n"
-          << "MBB | USED | ANTIC_IN | ANTIC_OUT | AVAIL_IN | AVAIL_OUT\n"
-          << "-----------------------------------------------------------\n";
-
-        for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
-             MBBI != MBBE; ++MBBI) {
-          MachineBasicBlock* MBB = MBBI;
-          dumpSets(MBB);
-        }
-
-        dbgs()
-          << "-----------------------------------------------------------\n";
-      }
-    });
-}
-
-/// propagateUsesAroundLoop - copy used register info from MBB to all blocks
-/// of the loop given by LP and its parent loops. This prevents spills/restores
-/// from being placed in the bodies of loops.
-///
-void PEI::propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP) {
-  if (! MBB || !LP)
-    return;
-
-  std::vector<MachineBasicBlock*> loopBlocks = LP->getBlocks();
-  for (unsigned i = 0, e = loopBlocks.size(); i != e; ++i) {
-    MachineBasicBlock* LBB = loopBlocks[i];
-    if (LBB == MBB)
-      continue;
-    if (CSRUsed[LBB].contains(CSRUsed[MBB]))
-      continue;
-    CSRUsed[LBB] |= CSRUsed[MBB];
-  }
-}
-
-/// calculateSets - collect the CSRs used in this function, compute
-/// the DF sets that describe the initial minimal regions in the
-/// Machine CFG around which CSR spills and restores must be placed.
-///
-/// Additionally, this function decides if shrink wrapping should
-/// be disabled for the current function, checking the following:
-///  1. the current function has more than 500 MBBs: heuristic limit
-///     on function size to reduce compile time impact of the current
-///     iterative algorithm.
-///  2. all CSRs are used in the entry block.
-///  3. all CSRs are used in all immediate successors of the entry block.
-///  4. all CSRs are used in a subset of blocks, each of which dominates
-///     all return blocks. These blocks, taken as a subgraph of the MCFG,
-///     are equivalent to the entry block since all execution paths pass
-///     through them.
-///
-bool PEI::calculateSets(MachineFunction &Fn) {
-  // Sets used to compute spill, restore placement sets.
-  const std::vector<CalleeSavedInfo> CSI =
-    Fn.getFrameInfo()->getCalleeSavedInfo();
-
-  // If no CSRs used, we are done.
-  if (CSI.empty()) {
-    DEBUG(if (ShrinkWrapThisFunction)
-            dbgs() << "DISABLED: " << Fn.getName()
-                   << ": uses no callee-saved registers\n");
-    return false;
-  }
-
-  // Save refs to entry and return blocks.
-  EntryBlock = Fn.begin();
-  for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end();
-       MBB != E; ++MBB)
-    if (isReturnBlock(MBB))
-      ReturnBlocks.push_back(MBB);
-
-  // Determine if this function has fast exit paths.
-  DEBUG(if (ShrinkWrapThisFunction)
-          findFastExitPath());
-
-  // Limit shrink wrapping via the current iterative bit vector
-  // implementation to functions with <= 500 MBBs.
-  if (Fn.size() > 500) {
-    DEBUG(if (ShrinkWrapThisFunction)
-            dbgs() << "DISABLED: " << Fn.getName()
-                   << ": too large (" << Fn.size() << " MBBs)\n");
-    ShrinkWrapThisFunction = false;
-  }
-
-  // Return now if not shrink wrapping.
-  if (! ShrinkWrapThisFunction)
-    return false;
-
-  // Collect set of used CSRs.
-  for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
-    UsedCSRegs.set(inx);
-  }
-
-  // Walk instructions in all MBBs, create CSRUsed[] sets, choose
-  // whether or not to shrink wrap this function.
-  MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
-  MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
-  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
-
-  bool allCSRUsesInEntryBlock = true;
-  for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
-       MBBI != MBBE; ++MBBI) {
-    MachineBasicBlock* MBB = MBBI;
-    for (MachineBasicBlock::iterator I = MBB->begin(); I != MBB->end(); ++I) {
-      for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
-        unsigned Reg = CSI[inx].getReg();
-        // If instruction I reads or modifies Reg, add it to UsedCSRegs,
-        // CSRUsed map for the current block.
-        for (unsigned opInx = 0, opEnd = I->getNumOperands();
-             opInx != opEnd; ++opInx) {
-          const MachineOperand &MO = I->getOperand(opInx);
-          if (! (MO.isReg() && (MO.isUse() || MO.isDef())))
-            continue;
-          unsigned MOReg = MO.getReg();
-          if (!MOReg)
-            continue;
-          if (MOReg == Reg ||
-              (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
-               TargetRegisterInfo::isPhysicalRegister(Reg) &&
-               TRI->isSubRegister(Reg, MOReg))) {
-            // CSR Reg is defined/used in block MBB.
-            CSRUsed[MBB].set(inx);
-            // Check for uses in EntryBlock.
-            if (MBB != EntryBlock)
-              allCSRUsesInEntryBlock = false;
-          }
-        }
-      }
-    }
-
-    if (CSRUsed[MBB].empty())
-      continue;
-
-    // Propagate CSRUsed[MBB] in loops
-    if (MachineLoop* LP = LI.getLoopFor(MBB)) {
-      // Add top level loop to work list.
-      MachineBasicBlock* HDR = getTopLevelLoopPreheader(LP);
-      MachineLoop* PLP = getTopLevelLoopParent(LP);
-
-      if (! HDR) {
-        HDR = PLP->getHeader();
-        assert(HDR->pred_size() > 0 && "Loop header has no predecessors?");
-        MachineBasicBlock::pred_iterator PI = HDR->pred_begin();
-        HDR = *PI;
-      }
-      TLLoops[HDR] = PLP;
-
-      // Push uses from inside loop to its parent loops,
-      // or to all other MBBs in its loop.
-      if (LP->getLoopDepth() > 1) {
-        for (MachineLoop* PLP = LP->getParentLoop(); PLP;
-             PLP = PLP->getParentLoop()) {
-          propagateUsesAroundLoop(MBB, PLP);
-        }
-      } else {
-        propagateUsesAroundLoop(MBB, LP);
-      }
-    }
-  }
-
-  if (allCSRUsesInEntryBlock) {
-    DEBUG(dbgs() << "DISABLED: " << Fn.getName()
-                 << ": all CSRs used in EntryBlock\n");
-    ShrinkWrapThisFunction = false;
-  } else {
-    bool allCSRsUsedInEntryFanout = true;
-    for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
-           SE = EntryBlock->succ_end(); SI != SE; ++SI) {
-      MachineBasicBlock* SUCC = *SI;
-      if (CSRUsed[SUCC] != UsedCSRegs)
-        allCSRsUsedInEntryFanout = false;
-    }
-    if (allCSRsUsedInEntryFanout) {
-      DEBUG(dbgs() << "DISABLED: " << Fn.getName()
-                   << ": all CSRs used in imm successors of EntryBlock\n");
-      ShrinkWrapThisFunction = false;
-    }
-  }
-
-  if (ShrinkWrapThisFunction) {
-    // Check if MBB uses CSRs and dominates all exit nodes.
-    // Such nodes are equiv. to the entry node w.r.t.
-    // CSR uses: every path through the function must
-    // pass through this node. If each CSR is used at least
-    // once by these nodes, shrink wrapping is disabled.
-    CSRegSet CSRUsedInChokePoints;
-    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
-         MBBI != MBBE; ++MBBI) {
-      MachineBasicBlock* MBB = MBBI;
-      if (MBB == EntryBlock || CSRUsed[MBB].empty() || MBB->succ_size() < 1)
-        continue;
-      bool dominatesExitNodes = true;
-      for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
-        if (! DT.dominates(MBB, ReturnBlocks[ri])) {
-          dominatesExitNodes = false;
-          break;
-        }
-      if (dominatesExitNodes) {
-        CSRUsedInChokePoints |= CSRUsed[MBB];
-        if (CSRUsedInChokePoints == UsedCSRegs) {
-          DEBUG(dbgs() << "DISABLED: " << Fn.getName()
-                       << ": all CSRs used in choke point(s) at "
-                       << getBasicBlockName(MBB) << "\n");
-          ShrinkWrapThisFunction = false;
-          break;
-        }
-      }
-    }
-  }
-
-  // Return now if we have decided not to apply shrink wrapping
-  // to the current function.
-  if (! ShrinkWrapThisFunction)
-    return false;
-
-  DEBUG({
-      dbgs() << "ENABLED: " << Fn.getName();
-      if (HasFastExitPath)
-        dbgs() << " (fast exit path)";
-      dbgs() << "\n";
-      if (ShrinkWrapDebugging >= BasicInfo) {
-        dbgs() << "------------------------------"
-             << "-----------------------------\n";
-        dbgs() << "UsedCSRegs = " << stringifyCSRegSet(UsedCSRegs) << "\n";
-        if (ShrinkWrapDebugging >= Details) {
-          dbgs() << "------------------------------"
-               << "-----------------------------\n";
-          dumpAllUsed();
-        }
-      }
-    });
-
-  // Build initial DF sets to determine minimal regions in the
-  // Machine CFG around which CSRs must be spilled and restored.
-  calculateAnticAvail(Fn);
-
-  return true;
-}
-
-/// addUsesForMEMERegion - add uses of CSRs spilled or restored in
-/// multi-entry, multi-exit (MEME) regions so spill and restore
-/// placement will not break code that enters or leaves a
-/// shrink-wrapped region by inducing spills with no matching
-/// restores or restores with no matching spills. A MEME region
-/// is a subgraph of the MCFG with multiple entry edges, multiple
-/// exit edges, or both. This code propagates use information
-/// through the MCFG until all paths requiring spills and restores
-/// _outside_ the computed minimal placement regions have been covered.
-///
-bool PEI::addUsesForMEMERegion(MachineBasicBlock* MBB,
-                               SmallVectorImpl<MachineBasicBlock *> &blks) {
-  if (MBB->succ_size() < 2 && MBB->pred_size() < 2) {
-    bool processThisBlock = false;
-    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-           SE = MBB->succ_end(); SI != SE; ++SI) {
-      MachineBasicBlock* SUCC = *SI;
-      if (SUCC->pred_size() > 1) {
-        processThisBlock = true;
-        break;
-      }
-    }
-    if (!CSRRestore[MBB].empty() && MBB->succ_size() > 0) {
-      for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
-             PE = MBB->pred_end(); PI != PE; ++PI) {
-        MachineBasicBlock* PRED = *PI;
-        if (PRED->succ_size() > 1) {
-          processThisBlock = true;
-          break;
-        }
-      }
-    }
-    if (! processThisBlock)
-      return false;
-  }
-
-  CSRegSet prop;
-  if (!CSRSave[MBB].empty())
-    prop = CSRSave[MBB];
-  else if (!CSRRestore[MBB].empty())
-    prop = CSRRestore[MBB];
-  else
-    prop = CSRUsed[MBB];
-  if (prop.empty())
-    return false;
-
-  // Propagate selected bits to successors, predecessors of MBB.
-  bool addedUses = false;
-  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-         SE = MBB->succ_end(); SI != SE; ++SI) {
-    MachineBasicBlock* SUCC = *SI;
-    // Self-loop
-    if (SUCC == MBB)
-      continue;
-    if (! CSRUsed[SUCC].contains(prop)) {
-      CSRUsed[SUCC] |= prop;
-      addedUses = true;
-      blks.push_back(SUCC);
-      DEBUG(if (ShrinkWrapDebugging >= Iterations)
-              dbgs() << getBasicBlockName(MBB)
-                   << "(" << stringifyCSRegSet(prop) << ")->"
-                   << "successor " << getBasicBlockName(SUCC) << "\n");
-    }
-  }
-  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
-         PE = MBB->pred_end(); PI != PE; ++PI) {
-    MachineBasicBlock* PRED = *PI;
-    // Self-loop
-    if (PRED == MBB)
-      continue;
-    if (! CSRUsed[PRED].contains(prop)) {
-      CSRUsed[PRED] |= prop;
-      addedUses = true;
-      blks.push_back(PRED);
-      DEBUG(if (ShrinkWrapDebugging >= Iterations)
-              dbgs() << getBasicBlockName(MBB)
-                   << "(" << stringifyCSRegSet(prop) << ")->"
-                   << "predecessor " << getBasicBlockName(PRED) << "\n");
-    }
-  }
-  return addedUses;
-}
-
-/// addUsesForTopLevelLoops - add uses for CSRs used inside top
-/// level loops to the exit blocks of those loops.
-///
-bool PEI::addUsesForTopLevelLoops(SmallVectorImpl<MachineBasicBlock *> &blks) {
-  bool addedUses = false;
-
-  // Place restores for top level loops where needed.
-  for (DenseMap<MachineBasicBlock*, MachineLoop*>::iterator
-         I = TLLoops.begin(), E = TLLoops.end(); I != E; ++I) {
-    MachineBasicBlock* MBB = I->first;
-    MachineLoop* LP = I->second;
-    MachineBasicBlock* HDR = LP->getHeader();
-    SmallVector<MachineBasicBlock*, 4> exitBlocks;
-    CSRegSet loopSpills;
-
-    loopSpills = CSRSave[MBB];
-    if (CSRSave[MBB].empty()) {
-      loopSpills = CSRUsed[HDR];
-      assert(!loopSpills.empty() && "No CSRs used in loop?");
-    } else if (CSRRestore[MBB].contains(CSRSave[MBB]))
-      continue;
-
-    LP->getExitBlocks(exitBlocks);
-    assert(exitBlocks.size() > 0 && "Loop has no top level exit blocks?");
-    for (unsigned i = 0, e = exitBlocks.size(); i != e; ++i) {
-      MachineBasicBlock* EXB = exitBlocks[i];
-      if (! CSRUsed[EXB].contains(loopSpills)) {
-        CSRUsed[EXB] |= loopSpills;
-        addedUses = true;
-        DEBUG(if (ShrinkWrapDebugging >= Iterations)
-                dbgs() << "LOOP " << getBasicBlockName(MBB)
-                     << "(" << stringifyCSRegSet(loopSpills) << ")->"
-                     << getBasicBlockName(EXB) << "\n");
-        if (EXB->succ_size() > 1 || EXB->pred_size() > 1)
-          blks.push_back(EXB);
-      }
-    }
-  }
-  return addedUses;
-}
-
-/// calcSpillPlacements - determine which CSRs should be spilled
-/// in MBB using AnticIn sets of MBB's predecessors, keeping track
-/// of changes to spilled reg sets. Add MBB to the set of blocks
-/// that need to be processed for propagating use info to cover
-/// multi-entry/exit regions.
-///
-bool PEI::calcSpillPlacements(MachineBasicBlock* MBB,
-                              SmallVectorImpl<MachineBasicBlock *> &blks,
-                              CSRegBlockMap &prevSpills) {
-  bool placedSpills = false;
-  // Intersect (CSRegs - AnticIn[P]) for P in Predecessors(MBB)
-  CSRegSet anticInPreds;
-  SmallVector<MachineBasicBlock*, 4> predecessors;
-  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
-         PE = MBB->pred_end(); PI != PE; ++PI) {
-    MachineBasicBlock* PRED = *PI;
-    if (PRED != MBB)
-      predecessors.push_back(PRED);
-  }
-  unsigned i = 0, e = predecessors.size();
-  if (i != e) {
-    MachineBasicBlock* PRED = predecessors[i];
-    anticInPreds = UsedCSRegs - AnticIn[PRED];
-    for (++i; i != e; ++i) {
-      PRED = predecessors[i];
-      anticInPreds &= (UsedCSRegs - AnticIn[PRED]);
-    }
-  } else {
-    // Handle uses in entry blocks (which have no predecessors).
-    // This is necessary because the DFA formulation assumes the
-    // entry and (multiple) exit nodes cannot have CSR uses, which
-    // is not the case in the real world.
-    anticInPreds = UsedCSRegs;
-  }
-  // Compute spills required at MBB:
-  CSRSave[MBB] |= (AnticIn[MBB] - AvailIn[MBB]) & anticInPreds;
-
-  if (! CSRSave[MBB].empty()) {
-    if (MBB == EntryBlock) {
-      for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
-        CSRRestore[ReturnBlocks[ri]] |= CSRSave[MBB];
-    } else {
-      // Reset all regs spilled in MBB that are also spilled in EntryBlock.
-      if (CSRSave[EntryBlock].intersects(CSRSave[MBB])) {
-        CSRSave[MBB] = CSRSave[MBB] - CSRSave[EntryBlock];
-      }
-    }
-  }
-  placedSpills = (CSRSave[MBB] != prevSpills[MBB]);
-  prevSpills[MBB] = CSRSave[MBB];
-  // Remember this block for adding restores to successor
-  // blocks for multi-entry region.
-  if (placedSpills)
-    blks.push_back(MBB);
-
-  DEBUG(if (! CSRSave[MBB].empty() && ShrinkWrapDebugging >= Iterations)
-          dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
-               << stringifyCSRegSet(CSRSave[MBB]) << "\n");
-
-  return placedSpills;
-}
-
-/// calcRestorePlacements - determine which CSRs should be restored
-/// in MBB using AvailOut sets of MBB's succcessors, keeping track
-/// of changes to restored reg sets. Add MBB to the set of blocks
-/// that need to be processed for propagating use info to cover
-/// multi-entry/exit regions.
-///
-bool PEI::calcRestorePlacements(MachineBasicBlock* MBB,
-                                SmallVectorImpl<MachineBasicBlock *> &blks,
-                                CSRegBlockMap &prevRestores) {
-  bool placedRestores = false;
-  // Intersect (CSRegs - AvailOut[S]) for S in Successors(MBB)
-  CSRegSet availOutSucc;
-  SmallVector<MachineBasicBlock*, 4> successors;
-  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-         SE = MBB->succ_end(); SI != SE; ++SI) {
-    MachineBasicBlock* SUCC = *SI;
-    if (SUCC != MBB)
-      successors.push_back(SUCC);
-  }
-  unsigned i = 0, e = successors.size();
-  if (i != e) {
-    MachineBasicBlock* SUCC = successors[i];
-    availOutSucc = UsedCSRegs - AvailOut[SUCC];
-    for (++i; i != e; ++i) {
-      SUCC = successors[i];
-      availOutSucc &= (UsedCSRegs - AvailOut[SUCC]);
-    }
-  } else {
-    if (! CSRUsed[MBB].empty() || ! AvailOut[MBB].empty()) {
-      // Handle uses in return blocks (which have no successors).
-      // This is necessary because the DFA formulation assumes the
-      // entry and (multiple) exit nodes cannot have CSR uses, which
-      // is not the case in the real world.
-      availOutSucc = UsedCSRegs;
-    }
-  }
-  // Compute restores required at MBB:
-  CSRRestore[MBB] |= (AvailOut[MBB] - AnticOut[MBB]) & availOutSucc;
-
-  // Postprocess restore placements at MBB.
-  // Remove the CSRs that are restored in the return blocks.
-  // Lest this be confusing, note that:
-  // CSRSave[EntryBlock] == CSRRestore[B] for all B in ReturnBlocks.
-  if (MBB->succ_size() && ! CSRRestore[MBB].empty()) {
-    if (! CSRSave[EntryBlock].empty())
-      CSRRestore[MBB] = CSRRestore[MBB] - CSRSave[EntryBlock];
-  }
-  placedRestores = (CSRRestore[MBB] != prevRestores[MBB]);
-  prevRestores[MBB] = CSRRestore[MBB];
-  // Remember this block for adding saves to predecessor
-  // blocks for multi-entry region.
-  if (placedRestores)
-    blks.push_back(MBB);
-
-  DEBUG(if (! CSRRestore[MBB].empty() && ShrinkWrapDebugging >= Iterations)
-          dbgs() << "RESTORE[" << getBasicBlockName(MBB) << "] = "
-               << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
-
-  return placedRestores;
-}
-
-/// placeSpillsAndRestores - place spills and restores of CSRs
-/// used in MBBs in minimal regions that contain the uses.
-///
-void PEI::placeSpillsAndRestores(MachineFunction &Fn) {
-  CSRegBlockMap prevCSRSave;
-  CSRegBlockMap prevCSRRestore;
-  SmallVector<MachineBasicBlock*, 4> cvBlocks, ncvBlocks;
-  bool changed = true;
-  unsigned iterations = 0;
-
-  // Iterate computation of spill and restore placements in the MCFG until:
-  //   1. CSR use info has been fully propagated around the MCFG, and
-  //   2. computation of CSRSave[], CSRRestore[] reach fixed points.
-  while (changed) {
-    changed = false;
-    ++iterations;
-
-    DEBUG(if (ShrinkWrapDebugging >= Iterations)
-            dbgs() << "iter " << iterations
-                 << " --------------------------------------------------\n");
-
-    // Calculate CSR{Save,Restore} sets using Antic, Avail on the MCFG,
-    // which determines the placements of spills and restores.
-    // Keep track of changes to spills, restores in each iteration to
-    // minimize the total iterations.
-    bool SRChanged = false;
-    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
-         MBBI != MBBE; ++MBBI) {
-      MachineBasicBlock* MBB = MBBI;
-
-      // Place spills for CSRs in MBB.
-      SRChanged |= calcSpillPlacements(MBB, cvBlocks, prevCSRSave);
-
-      // Place restores for CSRs in MBB.
-      SRChanged |= calcRestorePlacements(MBB, cvBlocks, prevCSRRestore);
-    }
-
-    // Add uses of CSRs used inside loops where needed.
-    changed |= addUsesForTopLevelLoops(cvBlocks);
-
-    // Add uses for CSRs spilled or restored at branch, join points.
-    if (changed || SRChanged) {
-      while (! cvBlocks.empty()) {
-        MachineBasicBlock* MBB = cvBlocks.pop_back_val();
-        changed |= addUsesForMEMERegion(MBB, ncvBlocks);
-      }
-      if (! ncvBlocks.empty()) {
-        cvBlocks = ncvBlocks;
-        ncvBlocks.clear();
-      }
-    }
-
-    if (changed) {
-      calculateAnticAvail(Fn);
-      CSRSave.clear();
-      CSRRestore.clear();
-    }
-  }
-
-  // Check for effectiveness:
-  //  SR0 = {r | r in CSRSave[EntryBlock], CSRRestore[RB], RB in ReturnBlocks}
-  //  numSRReduced = |(UsedCSRegs - SR0)|, approx. SR0 by CSRSave[EntryBlock]
-  // Gives a measure of how many CSR spills have been moved from EntryBlock
-  // to minimal regions enclosing their uses.
-  CSRegSet notSpilledInEntryBlock = (UsedCSRegs - CSRSave[EntryBlock]);
-  unsigned numSRReducedThisFunc = notSpilledInEntryBlock.count();
-  numSRReduced += numSRReducedThisFunc;
-  DEBUG(if (ShrinkWrapDebugging >= BasicInfo) {
-      dbgs() << "-----------------------------------------------------------\n";
-      dbgs() << "total iterations = " << iterations << " ( "
-           << Fn.getName()
-           << " " << numSRReducedThisFunc
-           << " " << Fn.size()
-           << " )\n";
-      dbgs() << "-----------------------------------------------------------\n";
-      dumpSRSets();
-      dbgs() << "-----------------------------------------------------------\n";
-      if (numSRReducedThisFunc)
-        verifySpillRestorePlacement();
-    });
-}
-
-// Debugging methods.
-#ifndef NDEBUG
-/// findFastExitPath - debugging method used to detect functions
-/// with at least one path from the entry block to a return block
-/// directly or which has a very small number of edges.
-///
-void PEI::findFastExitPath() {
-  if (! EntryBlock)
-    return;
-  // Fina a path from EntryBlock to any return block that does not branch:
-  //        Entry
-  //          |     ...
-  //          v      |
-  //         B1<-----+
-  //          |
-  //          v
-  //       Return
-  for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
-         SE = EntryBlock->succ_end(); SI != SE; ++SI) {
-    MachineBasicBlock* SUCC = *SI;
-
-    // Assume positive, disprove existence of fast path.
-    HasFastExitPath = true;
-
-    // Check the immediate successors.
-    if (isReturnBlock(SUCC)) {
-      if (ShrinkWrapDebugging >= BasicInfo)
-        dbgs() << "Fast exit path: " << getBasicBlockName(EntryBlock)
-             << "->" << getBasicBlockName(SUCC) << "\n";
-      break;
-    }
-    // Traverse df from SUCC, look for a branch block.
-    std::string exitPath = getBasicBlockName(SUCC);
-    for (df_iterator<MachineBasicBlock*> BI = df_begin(SUCC),
-           BE = df_end(SUCC); BI != BE; ++BI) {
-      MachineBasicBlock* SBB = *BI;
-      // Reject paths with branch nodes.
-      if (SBB->succ_size() > 1) {
-        HasFastExitPath = false;
-        break;
-      }
-      exitPath += "->" + getBasicBlockName(SBB);
-    }
-    if (HasFastExitPath) {
-      if (ShrinkWrapDebugging >= BasicInfo)
-        dbgs() << "Fast exit path: " << getBasicBlockName(EntryBlock)
-             << "->" << exitPath << "\n";
-      break;
-    }
-  }
-}
-
-/// verifySpillRestorePlacement - check the current spill/restore
-/// sets for safety. Attempt to find spills without restores or
-/// restores without spills.
-/// Spills: walk df from each MBB in spill set ensuring that
-///         all CSRs spilled at MMBB are restored on all paths
-///         from MBB to all exit blocks.
-/// Restores: walk idf from each MBB in restore set ensuring that
-///           all CSRs restored at MBB are spilled on all paths
-///           reaching MBB.
-///
-void PEI::verifySpillRestorePlacement() {
-  unsigned numReturnBlocks = 0;
-  for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
-       MBBI != MBBE; ++MBBI) {
-    MachineBasicBlock* MBB = MBBI;
-    if (isReturnBlock(MBB) || MBB->succ_size() == 0)
-      ++numReturnBlocks;
-  }
-  for (CSRegBlockMap::iterator BI = CSRSave.begin(),
-         BE = CSRSave.end(); BI != BE; ++BI) {
-    MachineBasicBlock* MBB = BI->first;
-    CSRegSet spilled = BI->second;
-    CSRegSet restored;
-
-    if (spilled.empty())
-      continue;
-
-    DEBUG(dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(spilled)
-                 << "  RESTORE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
-
-    if (CSRRestore[MBB].intersects(spilled)) {
-      restored |= (CSRRestore[MBB] & spilled);
-    }
-
-    // Walk depth first from MBB to find restores of all CSRs spilled at MBB:
-    // we must find restores for all spills w/no intervening spills on all
-    // paths from MBB to all return blocks.
-    for (df_iterator<MachineBasicBlock*> BI = df_begin(MBB),
-           BE = df_end(MBB); BI != BE; ++BI) {
-      MachineBasicBlock* SBB = *BI;
-      if (SBB == MBB)
-        continue;
-      // Stop when we encounter spills of any CSRs spilled at MBB that
-      // have not yet been seen to be restored.
-      if (CSRSave[SBB].intersects(spilled) &&
-          !restored.contains(CSRSave[SBB] & spilled))
-        break;
-      // Collect the CSRs spilled at MBB that are restored
-      // at this DF successor of MBB.
-      if (CSRRestore[SBB].intersects(spilled))
-        restored |= (CSRRestore[SBB] & spilled);
-      // If we are at a retun block, check that the restores
-      // we have seen so far exhaust the spills at MBB, then
-      // reset the restores.
-      if (isReturnBlock(SBB) || SBB->succ_size() == 0) {
-        if (restored != spilled) {
-          CSRegSet notRestored = (spilled - restored);
-          DEBUG(dbgs() << MF->getName() << ": "
-                       << stringifyCSRegSet(notRestored)
-                       << " spilled at " << getBasicBlockName(MBB)
-                       << " are never restored on path to return "
-                       << getBasicBlockName(SBB) << "\n");
-        }
-        restored.clear();
-      }
-    }
-  }
-
-  // Check restore placements.
-  for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
-         BE = CSRRestore.end(); BI != BE; ++BI) {
-    MachineBasicBlock* MBB = BI->first;
-    CSRegSet restored = BI->second;
-    CSRegSet spilled;
-
-    if (restored.empty())
-      continue;
-
-    DEBUG(dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(CSRSave[MBB])
-                 << "  RESTORE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(restored) << "\n");
-
-    if (CSRSave[MBB].intersects(restored)) {
-      spilled |= (CSRSave[MBB] & restored);
-    }
-    // Walk inverse depth first from MBB to find spills of all
-    // CSRs restored at MBB:
-    for (idf_iterator<MachineBasicBlock*> BI = idf_begin(MBB),
-           BE = idf_end(MBB); BI != BE; ++BI) {
-      MachineBasicBlock* PBB = *BI;
-      if (PBB == MBB)
-        continue;
-      // Stop when we encounter restores of any CSRs restored at MBB that
-      // have not yet been seen to be spilled.
-      if (CSRRestore[PBB].intersects(restored) &&
-          !spilled.contains(CSRRestore[PBB] & restored))
-        break;
-      // Collect the CSRs restored at MBB that are spilled
-      // at this DF predecessor of MBB.
-      if (CSRSave[PBB].intersects(restored))
-        spilled |= (CSRSave[PBB] & restored);
-    }
-    if (spilled != restored) {
-      CSRegSet notSpilled = (restored - spilled);
-      DEBUG(dbgs() << MF->getName() << ": "
-                   << stringifyCSRegSet(notSpilled)
-                   << " restored at " << getBasicBlockName(MBB)
-                   << " are never spilled\n");
-    }
-  }
-}
-
-// Debugging print methods.
-std::string PEI::getBasicBlockName(const MachineBasicBlock* MBB) {
-  if (!MBB)
-    return "";
-
-  if (MBB->getBasicBlock())
-    return MBB->getBasicBlock()->getName().str();
-
-  std::ostringstream name;
-  name << "_MBB_" << MBB->getNumber();
-  return name.str();
-}
-
-std::string PEI::stringifyCSRegSet(const CSRegSet& s) {
-  const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
-  const std::vector<CalleeSavedInfo> CSI =
-    MF->getFrameInfo()->getCalleeSavedInfo();
-
-  std::ostringstream srep;
-  if (CSI.size() == 0) {
-    srep << "[]";
-    return srep.str();
-  }
-  srep << "[";
-  CSRegSet::iterator I = s.begin(), E = s.end();
-  if (I != E) {
-    unsigned reg = CSI[*I].getReg();
-    srep << TRI->getName(reg);
-    for (++I; I != E; ++I) {
-      reg = CSI[*I].getReg();
-      srep << ",";
-      srep << TRI->getName(reg);
-    }
-  }
-  srep << "]";
-  return srep.str();
-}
-
-void PEI::dumpSet(const CSRegSet& s) {
-  DEBUG(dbgs() << stringifyCSRegSet(s) << "\n");
-}
-
-void PEI::dumpUsed(MachineBasicBlock* MBB) {
-  DEBUG({
-      if (MBB)
-        dbgs() << "CSRUsed[" << getBasicBlockName(MBB) << "] = "
-               << stringifyCSRegSet(CSRUsed[MBB])  << "\n";
-    });
-}
-
-void PEI::dumpAllUsed() {
-    for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
-         MBBI != MBBE; ++MBBI) {
-      MachineBasicBlock* MBB = MBBI;
-      dumpUsed(MBB);
-    }
-}
-
-void PEI::dumpSets(MachineBasicBlock* MBB) {
-  DEBUG({
-      if (MBB)
-        dbgs() << getBasicBlockName(MBB)           << " | "
-               << stringifyCSRegSet(CSRUsed[MBB])  << " | "
-               << stringifyCSRegSet(AnticIn[MBB])  << " | "
-               << stringifyCSRegSet(AnticOut[MBB]) << " | "
-               << stringifyCSRegSet(AvailIn[MBB])  << " | "
-               << stringifyCSRegSet(AvailOut[MBB]) << "\n";
-    });
-}
-
-void PEI::dumpSets1(MachineBasicBlock* MBB) {
-  DEBUG({
-      if (MBB)
-        dbgs() << getBasicBlockName(MBB)             << " | "
-               << stringifyCSRegSet(CSRUsed[MBB])    << " | "
-               << stringifyCSRegSet(AnticIn[MBB])    << " | "
-               << stringifyCSRegSet(AnticOut[MBB])   << " | "
-               << stringifyCSRegSet(AvailIn[MBB])    << " | "
-               << stringifyCSRegSet(AvailOut[MBB])   << " | "
-               << stringifyCSRegSet(CSRSave[MBB])    << " | "
-               << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
-    });
-}
-
-void PEI::dumpAllSets() {
-    for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
-         MBBI != MBBE; ++MBBI) {
-      MachineBasicBlock* MBB = MBBI;
-      dumpSets1(MBB);
-    }
-}
-
-void PEI::dumpSRSets() {
-  DEBUG({
-      for (MachineFunction::iterator MBB = MF->begin(), E = MF->end();
-           MBB != E; ++MBB) {
-        if (!CSRSave[MBB].empty()) {
-          dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(CSRSave[MBB]);
-          if (CSRRestore[MBB].empty())
-            dbgs() << '\n';
-        }
-
-        if (!CSRRestore[MBB].empty() && !CSRSave[MBB].empty())
-          dbgs() << "    "
-                 << "RESTORE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
-      }
-    });
-}
-#endif
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 2fc8f46..da2e710 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -42,41 +42,40 @@ STATISTIC(NumInvokes, "Number of invokes replaced");
 STATISTIC(NumSpilled, "Number of registers live across unwind edges");
 
 namespace {
-  class SjLjEHPrepare : public FunctionPass {
-    const TargetMachine *TM;
-    Type *FunctionContextTy;
-    Constant *RegisterFn;
-    Constant *UnregisterFn;
-    Constant *BuiltinSetjmpFn;
-    Constant *FrameAddrFn;
-    Constant *StackAddrFn;
-    Constant *StackRestoreFn;
-    Constant *LSDAAddrFn;
-    Value *PersonalityFn;
-    Constant *CallSiteFn;
-    Constant *FuncCtxFn;
-    AllocaInst *FuncCtx;
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    explicit SjLjEHPrepare(const TargetMachine *TM)
-      : FunctionPass(ID), TM(TM) { }
-    bool doInitialization(Module &M);
-    bool runOnFunction(Function &F);
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
-    const char *getPassName() const {
-      return "SJLJ Exception Handling preparation";
-    }
+class SjLjEHPrepare : public FunctionPass {
+  const TargetMachine *TM;
+  Type *FunctionContextTy;
+  Constant *RegisterFn;
+  Constant *UnregisterFn;
+  Constant *BuiltinSetjmpFn;
+  Constant *FrameAddrFn;
+  Constant *StackAddrFn;
+  Constant *StackRestoreFn;
+  Constant *LSDAAddrFn;
+  Value *PersonalityFn;
+  Constant *CallSiteFn;
+  Constant *FuncCtxFn;
+  AllocaInst *FuncCtx;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit SjLjEHPrepare(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {}
+  bool doInitialization(Module &M);
+  bool runOnFunction(Function &F);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
+  const char *getPassName() const {
+    return "SJLJ Exception Handling preparation";
+  }
 
-  private:
-    bool setupEntryBlockAndCallSites(Function &F);
-    void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
-                              Value *SelVal);
-    Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads);
-    void lowerIncomingArguments(Function &F);
-    void lowerAcrossUnwindEdges(Function &F, ArrayRef<InvokeInst*> Invokes);
-    void insertCallSiteStore(Instruction *I, int Number);
-  };
+private:
+  bool setupEntryBlockAndCallSites(Function &F);
+  void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal);
+  Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads);
+  void lowerIncomingArguments(Function &F);
+  void lowerAcrossUnwindEdges(Function &F, ArrayRef<InvokeInst *> Invokes);
+  void insertCallSiteStore(Instruction *I, int Number);
+};
 } // end anonymous namespace
 
 char SjLjEHPrepare::ID = 0;
@@ -92,23 +91,19 @@ bool SjLjEHPrepare::doInitialization(Module &M) {
   // builtin_setjmp uses a five word jbuf
   Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
   Type *Int32Ty = Type::getInt32Ty(M.getContext());
-  FunctionContextTy =
-    StructType::get(VoidPtrTy,                        // __prev
-                    Int32Ty,                          // call_site
-                    ArrayType::get(Int32Ty, 4),       // __data
-                    VoidPtrTy,                        // __personality
-                    VoidPtrTy,                        // __lsda
-                    ArrayType::get(VoidPtrTy, 5),     // __jbuf
-                    NULL);
-  RegisterFn = M.getOrInsertFunction("_Unwind_SjLj_Register",
-                                     Type::getVoidTy(M.getContext()),
-                                     PointerType::getUnqual(FunctionContextTy),
-                                     (Type *)0);
-  UnregisterFn =
-    M.getOrInsertFunction("_Unwind_SjLj_Unregister",
-                          Type::getVoidTy(M.getContext()),
-                          PointerType::getUnqual(FunctionContextTy),
-                          (Type *)0);
+  FunctionContextTy = StructType::get(VoidPtrTy,                  // __prev
+                                      Int32Ty,                    // call_site
+                                      ArrayType::get(Int32Ty, 4), // __data
+                                      VoidPtrTy, // __personality
+                                      VoidPtrTy, // __lsda
+                                      ArrayType::get(VoidPtrTy, 5), // __jbuf
+                                      NULL);
+  RegisterFn = M.getOrInsertFunction(
+      "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
+      PointerType::getUnqual(FunctionContextTy), (Type *)0);
+  UnregisterFn = M.getOrInsertFunction(
+      "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
+      PointerType::getUnqual(FunctionContextTy), (Type *)0);
   FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
   StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
   StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
@@ -134,16 +129,17 @@ void SjLjEHPrepare::insertCallSiteStore(Instruction *I, int Number) {
   Value *CallSite = Builder.CreateGEP(FuncCtx, Idxs, "call_site");
 
   // Insert a store of the call-site number
-  ConstantInt *CallSiteNoC = ConstantInt::get(Type::getInt32Ty(I->getContext()),
-                                              Number);
-  Builder.CreateStore(CallSiteNoC, CallSite, true/*volatile*/);
+  ConstantInt *CallSiteNoC =
+      ConstantInt::get(Type::getInt32Ty(I->getContext()), Number);
+  Builder.CreateStore(CallSiteNoC, CallSite, true /*volatile*/);
 }
 
 /// MarkBlocksLiveIn - Insert BB and all of its predescessors into LiveBBs until
 /// we reach blocks we've already seen.
 static void MarkBlocksLiveIn(BasicBlock *BB,
-                             SmallPtrSet<BasicBlock*, 64> &LiveBBs) {
-  if (!LiveBBs.insert(BB)) return; // already been here.
+                             SmallPtrSet<BasicBlock *, 64> &LiveBBs) {
+  if (!LiveBBs.insert(BB))
+    return; // already been here.
 
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
     MarkBlocksLiveIn(*PI, LiveBBs);
@@ -153,12 +149,14 @@ static void MarkBlocksLiveIn(BasicBlock *BB,
 /// instruction with those returned by the personality function.
 void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
                                          Value *SelVal) {
-  SmallVector<Value*, 8> UseWorkList(LPI->use_begin(), LPI->use_end());
+  SmallVector<Value *, 8> UseWorkList(LPI->use_begin(), LPI->use_end());
   while (!UseWorkList.empty()) {
     Value *Val = UseWorkList.pop_back_val();
     ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Val);
-    if (!EVI) continue;
-    if (EVI->getNumIndices() != 1) continue;
+    if (!EVI)
+      continue;
+    if (EVI->getNumIndices() != 1)
+      continue;
     if (*EVI->idx_begin() == 0)
       EVI->replaceAllUsesWith(ExnVal);
     else if (*EVI->idx_begin() == 1)
@@ -167,14 +165,15 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
       EVI->eraseFromParent();
   }
 
-  if (LPI->getNumUses() == 0)  return;
+  if (LPI->getNumUses() == 0)
+    return;
 
   // There are still some uses of LPI. Construct an aggregate with the exception
   // values and replace the LPI with that aggregate.
   Type *LPadType = LPI->getType();
   Value *LPadVal = UndefValue::get(LPadType);
-  IRBuilder<>
-    Builder(llvm::next(BasicBlock::iterator(cast<Instruction>(SelVal))));
+  IRBuilder<> Builder(
+      llvm::next(BasicBlock::iterator(cast<Instruction>(SelVal))));
   LPadVal = Builder.CreateInsertValue(LPadVal, ExnVal, 0, "lpad.val");
   LPadVal = Builder.CreateInsertValue(LPadVal, SelVal, 1, "lpad.val");
 
@@ -183,8 +182,8 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
 
 /// setupFunctionContext - Allocate the function context on the stack and fill
 /// it with all of the data that we know at this point.
-Value *SjLjEHPrepare::
-setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
+Value *SjLjEHPrepare::setupFunctionContext(Function &F,
+                                           ArrayRef<LandingPadInst *> LPads) {
   BasicBlock *EntryBB = F.begin();
 
   // Create an alloca for the incoming jump buffer ptr and the new jump buffer
@@ -192,9 +191,9 @@ setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
   // because the value needs to be added to the global context list.
   const TargetLowering *TLI = TM->getTargetLowering();
   unsigned Align =
-    TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
-  FuncCtx =
-    new AllocaInst(FunctionContextTy, 0, Align, "fn_context", EntryBB->begin());
+      TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
+  FuncCtx = new AllocaInst(FunctionContextTy, 0, Align, "fn_context",
+                           EntryBB->begin());
 
   // Fill in the function context structure.
   for (unsigned I = 0, E = LPads.size(); I != E; ++I) {
@@ -205,13 +204,13 @@ setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
     Value *FCData = Builder.CreateConstGEP2_32(FuncCtx, 0, 2, "__data");
 
     // The exception values come back in context->__data[0].
-    Value *ExceptionAddr = Builder.CreateConstGEP2_32(FCData, 0, 0,
-                                                      "exception_gep");
+    Value *ExceptionAddr =
+        Builder.CreateConstGEP2_32(FCData, 0, 0, "exception_gep");
     Value *ExnVal = Builder.CreateLoad(ExceptionAddr, true, "exn_val");
     ExnVal = Builder.CreateIntToPtr(ExnVal, Builder.getInt8PtrTy());
 
-    Value *SelectorAddr = Builder.CreateConstGEP2_32(FCData, 0, 1,
-                                                     "exn_selector_gep");
+    Value *SelectorAddr =
+        Builder.CreateConstGEP2_32(FCData, 0, 1, "exn_selector_gep");
     Value *SelVal = Builder.CreateLoad(SelectorAddr, true, "exn_selector_val");
 
     substituteLPadValues(LPI, ExnVal, SelVal);
@@ -221,11 +220,11 @@ setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
   IRBuilder<> Builder(EntryBB->getTerminator());
   if (!PersonalityFn)
     PersonalityFn = LPads[0]->getPersonalityFn();
-  Value *PersonalityFieldPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 3,
-                                                          "pers_fn_gep");
-  Builder.CreateStore(Builder.CreateBitCast(PersonalityFn,
-                                            Builder.getInt8PtrTy()),
-                      PersonalityFieldPtr, /*isVolatile=*/true);
+  Value *PersonalityFieldPtr =
+      Builder.CreateConstGEP2_32(FuncCtx, 0, 3, "pers_fn_gep");
+  Builder.CreateStore(
+      Builder.CreateBitCast(PersonalityFn, Builder.getInt8PtrTy()),
+      PersonalityFieldPtr, /*isVolatile=*/true);
 
   // LSDA address
   Value *LSDA = Builder.CreateCall(LSDAAddrFn, "lsda_addr");
@@ -245,8 +244,8 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
          isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsPt)->getArraySize()))
     ++AfterAllocaInsPt;
 
-  for (Function::arg_iterator
-         AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) {
+  for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE;
+       ++AI) {
     Type *Ty = AI->getType();
 
     // Aggregate types can't be cast, but are legal argument types, so we have
@@ -265,9 +264,8 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
       // This is always a no-op cast because we're casting AI to AI->getType()
       // so src and destination types are identical. BitCast is the only
       // possibility.
-      CastInst *NC =
-        new BitCastInst(AI, AI->getType(), AI->getName() + ".tmp",
-                        AfterAllocaInsPt);
+      CastInst *NC = new BitCastInst(AI, AI->getType(), AI->getName() + ".tmp",
+                                     AfterAllocaInsPt);
       AI->replaceAllUsesWith(NC);
 
       // Set the operand of the cast instruction back to the AllocaInst.
@@ -284,20 +282,21 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
 /// lowerAcrossUnwindEdges - Find all variables which are alive across an unwind
 /// edge and spill them.
 void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
-                                           ArrayRef<InvokeInst*> Invokes) {
+                                           ArrayRef<InvokeInst *> Invokes) {
   // Finally, scan the code looking for instructions with bad live ranges.
-  for (Function::iterator
-         BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
-    for (BasicBlock::iterator
-           II = BB->begin(), IIE = BB->end(); II != IIE; ++II) {
+  for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
+    for (BasicBlock::iterator II = BB->begin(), IIE = BB->end(); II != IIE;
+         ++II) {
       // Ignore obvious cases we don't have to handle. In particular, most
       // instructions either have no uses or only have a single use inside the
       // current block. Ignore them quickly.
       Instruction *Inst = II;
-      if (Inst->use_empty()) continue;
+      if (Inst->use_empty())
+        continue;
       if (Inst->hasOneUse() &&
           cast<Instruction>(Inst->use_back())->getParent() == BB &&
-          !isa<PHINode>(Inst->use_back())) continue;
+          !isa<PHINode>(Inst->use_back()))
+        continue;
 
       // If this is an alloca in the entry block, it's not a real register
       // value.
@@ -306,16 +305,16 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
           continue;
 
       // Avoid iterator invalidation by copying users to a temporary vector.
-      SmallVector<Instruction*, 16> Users;
-      for (Value::use_iterator
-             UI = Inst->use_begin(), E = Inst->use_end(); UI != E; ++UI) {
+      SmallVector<Instruction *, 16> Users;
+      for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end();
+           UI != E; ++UI) {
         Instruction *User = cast<Instruction>(*UI);
         if (User->getParent() != BB || isa<PHINode>(User))
           Users.push_back(User);
       }
 
       // Find all of the blocks that this value is live in.
-      SmallPtrSet<BasicBlock*, 64> LiveBBs;
+      SmallPtrSet<BasicBlock *, 64> LiveBBs;
       LiveBBs.insert(Inst->getParent());
       while (!Users.empty()) {
         Instruction *U = Users.back();
@@ -339,7 +338,7 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
         BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
         if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
           DEBUG(dbgs() << "SJLJ Spill: " << *Inst << " around "
-                << UnwindBlock->getName() << "\n");
+                       << UnwindBlock->getName() << "\n");
           NeedsSpill = true;
           break;
         }
@@ -362,15 +361,16 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
     LandingPadInst *LPI = UnwindBlock->getLandingPadInst();
 
     // Place PHIs into a set to avoid invalidating the iterator.
-    SmallPtrSet<PHINode*, 8> PHIsToDemote;
-    for (BasicBlock::iterator
-           PN = UnwindBlock->begin(); isa<PHINode>(PN); ++PN)
+    SmallPtrSet<PHINode *, 8> PHIsToDemote;
+    for (BasicBlock::iterator PN = UnwindBlock->begin(); isa<PHINode>(PN); ++PN)
       PHIsToDemote.insert(cast<PHINode>(PN));
-    if (PHIsToDemote.empty()) continue;
+    if (PHIsToDemote.empty())
+      continue;
 
     // Demote the PHIs to the stack.
-    for (SmallPtrSet<PHINode*, 8>::iterator
-           I = PHIsToDemote.begin(), E = PHIsToDemote.end(); I != E; ++I)
+    for (SmallPtrSet<PHINode *, 8>::iterator I = PHIsToDemote.begin(),
+                                             E = PHIsToDemote.end();
+         I != E; ++I)
       DemotePHIToStack(*I);
 
     // Move the landingpad instruction back to the top of the landing pad block.
@@ -382,9 +382,9 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
 /// the function context and marking the call sites with the appropriate
 /// values. These values are used by the DWARF EH emitter.
 bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
-  SmallVector<ReturnInst*, 16> Returns;
-  SmallVector<InvokeInst*, 16> Invokes;
-  SmallSetVector<LandingPadInst*, 16> LPads;
+  SmallVector<ReturnInst *, 16> Returns;
+  SmallVector<InvokeInst *, 16> Invokes;
+  SmallSetVector<LandingPadInst *, 16> LPads;
 
   // Look through the terminators of the basic blocks to find invokes.
   for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
@@ -404,7 +404,8 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
       Returns.push_back(RI);
     }
 
-  if (Invokes.empty()) return false;
+  if (Invokes.empty())
+    return false;
 
   NumInvokes += Invokes.size();
 
@@ -412,7 +413,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
   lowerAcrossUnwindEdges(F, Invokes);
 
   Value *FuncCtx =
-    setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end()));
+      setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end()));
   BasicBlock *EntryBB = F.begin();
   IRBuilder<> Builder(EntryBB->getTerminator());
 
@@ -446,7 +447,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
     insertCallSiteStore(Invokes[I], I + 1);
 
     ConstantInt *CallSiteNum =
-      ConstantInt::get(Type::getInt32Ty(F.getContext()), I + 1);
+        ConstantInt::get(Type::getInt32Ty(F.getContext()), I + 1);
 
     // Record the call site value for the back end so it stays associated with
     // the invoke.
@@ -468,8 +469,8 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
       }
 
   // Register the function context and make sure it's known to not throw
-  CallInst *Register = CallInst::Create(RegisterFn, FuncCtx, "",
-                                        EntryBB->getTerminator());
+  CallInst *Register =
+      CallInst::Create(RegisterFn, FuncCtx, "", EntryBB->getTerminator());
   Register->setDoesNotThrow();
 
   // Following any allocas not in the entry block, update the saved SP in the
diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
index 209792f..d5b3a4a 100644
--- a/lib/CodeGen/Spiller.cpp
+++ b/lib/CodeGen/Spiller.cpp
@@ -77,7 +77,7 @@ protected:
 
     DEBUG(dbgs() << "Spilling everywhere " << *li << "\n");
 
-    assert(li->weight != HUGE_VALF &&
+    assert(li->weight != llvm::huge_valf &&
            "Attempting to spill already spilled value.");
 
     assert(!TargetRegisterInfo::isStackSlot(li->reg) &&
@@ -115,15 +115,14 @@ protected:
         indices.push_back(i);
       }
 
-      // Create a new vreg & interval for this instr.
-      LiveInterval *newLI = &LRE.create();
-      newLI->weight = HUGE_VALF;
+      // Create a new virtual register for the load and/or store.
+      unsigned NewVReg = LRE.create();
 
       // Update the reg operands & kill flags.
       for (unsigned i = 0; i < indices.size(); ++i) {
         unsigned mopIdx = indices[i];
         MachineOperand &mop = mi->getOperand(mopIdx);
-        mop.setReg(newLI->reg);
+        mop.setReg(NewVReg);
         if (mop.isUse() && !mi->isRegTiedToDefOperand(mopIdx)) {
           mop.setIsKill(true);
         }
@@ -133,28 +132,20 @@ protected:
       // Insert reload if necessary.
       MachineBasicBlock::iterator miItr(mi);
       if (hasUse) {
-        tii->loadRegFromStackSlot(*mi->getParent(), miItr, newLI->reg, ss, trc,
+        MachineInstrSpan MIS(miItr);
+
+        tii->loadRegFromStackSlot(*mi->getParent(), miItr, NewVReg, ss, trc,
                                   tri);
-        MachineInstr *loadInstr(prior(miItr));
-        SlotIndex loadIndex =
-          lis->InsertMachineInstrInMaps(loadInstr).getRegSlot();
-        SlotIndex endIndex = loadIndex.getNextIndex();
-        VNInfo *loadVNI =
-          newLI->getNextValue(loadIndex, lis->getVNInfoAllocator());
-        newLI->addRange(LiveRange(loadIndex, endIndex, loadVNI));
+        lis->InsertMachineInstrRangeInMaps(MIS.begin(), miItr);
       }
 
       // Insert store if necessary.
       if (hasDef) {
-        tii->storeRegToStackSlot(*mi->getParent(), llvm::next(miItr),newLI->reg,
+        MachineInstrSpan MIS(miItr);
+
+        tii->storeRegToStackSlot(*mi->getParent(), llvm::next(miItr), NewVReg,
                                  true, ss, trc, tri);
-        MachineInstr *storeInstr(llvm::next(miItr));
-        SlotIndex storeIndex =
-          lis->InsertMachineInstrInMaps(storeInstr).getRegSlot();
-        SlotIndex beginIndex = storeIndex.getPrevIndex();
-        VNInfo *storeVNI =
-          newLI->getNextValue(beginIndex, lis->getVNInfoAllocator());
-        newLI->addRange(LiveRange(beginIndex, storeIndex, storeVNI));
+        lis->InsertMachineInstrRangeInMaps(llvm::next(miItr), MIS.end());
       }
     }
   }
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index e717fac..68a15f7 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -214,7 +214,7 @@ bool SplitAnalysis::calcLiveBlockInfo() {
 
       // When not live in, the first use should be a def.
       if (!BI.LiveIn) {
-        assert(LVI->start == LVI->valno->def && "Dangling LiveRange start");
+        assert(LVI->start == LVI->valno->def && "Dangling Segment start");
         assert(LVI->start == BI.FirstInstr && "First instr should be a def");
         BI.FirstDef = BI.FirstInstr;
       }
@@ -245,8 +245,8 @@ bool SplitAnalysis::calcLiveBlockInfo() {
           BI.FirstInstr = BI.FirstDef = LVI->start;
         }
 
-        // A LiveRange that starts in the middle of the block must be a def.
-        assert(LVI->start == LVI->valno->def && "Dangling LiveRange start");
+        // A Segment that starts in the middle of the block must be a def.
+        assert(LVI->start == LVI->valno->def && "Dangling Segment start");
         if (!BI.FirstDef)
           BI.FirstDef = LVI->start;
       }
@@ -377,7 +377,7 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
   assert(ParentVNI && "Mapping  NULL value");
   assert(Idx.isValid() && "Invalid SlotIndex");
   assert(Edit->getParent().getVNInfoAt(Idx) == ParentVNI && "Bad Parent VNI");
-  LiveInterval *LI = Edit->get(RegIdx);
+  LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
 
   // Create a new value.
   VNInfo *VNI = LI->getNextValue(Idx, LIS.getVNInfoAllocator());
@@ -395,14 +395,14 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
   // If the previous value was a simple mapping, add liveness for it now.
   if (VNInfo *OldVNI = InsP.first->second.getPointer()) {
     SlotIndex Def = OldVNI->def;
-    LI->addRange(LiveRange(Def, Def.getDeadSlot(), OldVNI));
+    LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), OldVNI));
     // No longer a simple mapping.  Switch to a complex, non-forced mapping.
     InsP.first->second = ValueForcePair();
   }
 
   // This is a complex mapping, add liveness for VNI
   SlotIndex Def = VNI->def;
-  LI->addRange(LiveRange(Def, Def.getDeadSlot(), VNI));
+  LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), VNI));
 
   return VNI;
 }
@@ -422,7 +422,8 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
   // This was previously a single mapping. Make sure the old def is represented
   // by a trivial live range.
   SlotIndex Def = VNI->def;
-  Edit->get(RegIdx)->addRange(LiveRange(Def, Def.getDeadSlot(), VNI));
+  LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
+  LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), VNI));
   // Mark as complex mapped, forced.
   VFP = ValueForcePair(0, true);
 }
@@ -434,7 +435,7 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
                                    MachineBasicBlock::iterator I) {
   MachineInstr *CopyMI = 0;
   SlotIndex Def;
-  LiveInterval *LI = Edit->get(RegIdx);
+  LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
 
   // We may be trying to avoid interference that ends at a deleted instruction,
   // so always begin RegIdx 0 early and all others late.
@@ -462,11 +463,11 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
 unsigned SplitEditor::openIntv() {
   // Create the complement as index 0.
   if (Edit->empty())
-    Edit->create();
+    Edit->createEmptyInterval();
 
   // Create the open interval.
   OpenIdx = Edit->size();
-  Edit->create();
+  Edit->createEmptyInterval();
   return OpenIdx;
 }
 
@@ -631,7 +632,7 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
 //===----------------------------------------------------------------------===//
 
 void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
-  LiveInterval *LI = Edit->get(0);
+  LiveInterval *LI = &LIS.getInterval(Edit->get(0));
   DEBUG(dbgs() << "Removing " << Copies.size() << " back-copies.\n");
   RegAssignMap::iterator AssignI;
   AssignI.setMap(RegAssign);
@@ -730,7 +731,7 @@ SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
 
 void SplitEditor::hoistCopiesForSize() {
   // Get the complement interval, always RegIdx 0.
-  LiveInterval *LI = Edit->get(0);
+  LiveInterval *LI = &LIS.getInterval(Edit->get(0));
   LiveInterval *Parent = &Edit->getParent();
 
   // Track the nearest common dominator for all back-copies for each ParentVNI,
@@ -861,13 +862,13 @@ bool SplitEditor::transferValues() {
 
       // The interval [Start;End) is continuously mapped to RegIdx, ParentVNI.
       DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx);
-      LiveInterval *LI = Edit->get(RegIdx);
+      LiveRange &LR = LIS.getInterval(Edit->get(RegIdx));
 
       // Check for a simply defined value that can be blitted directly.
       ValueForcePair VFP = Values.lookup(std::make_pair(RegIdx, ParentVNI->id));
       if (VNInfo *VNI = VFP.getPointer()) {
         DEBUG(dbgs() << ':' << VNI->id);
-        LI->addRange(LiveRange(Start, End, VNI));
+        LR.addSegment(LiveInterval::Segment(Start, End, VNI));
         Start = End;
         continue;
       }
@@ -891,7 +892,7 @@ bool SplitEditor::transferValues() {
 
       // The first block may be live-in, or it may have its own def.
       if (Start != BlockStart) {
-        VNInfo *VNI = LI->extendInBlock(BlockStart, std::min(BlockEnd, End));
+        VNInfo *VNI = LR.extendInBlock(BlockStart, std::min(BlockEnd, End));
         assert(VNI && "Missing def for complex mapped value");
         DEBUG(dbgs() << ':' << VNI->id << "*BB#" << MBB->getNumber());
         // MBB has its own def. Is it also live-out?
@@ -911,7 +912,7 @@ bool SplitEditor::transferValues() {
         if (BlockStart == ParentVNI->def) {
           // This block has the def of a parent PHI, so it isn't live-in.
           assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?");
-          VNInfo *VNI = LI->extendInBlock(BlockStart, std::min(BlockEnd, End));
+          VNInfo *VNI = LR.extendInBlock(BlockStart, std::min(BlockEnd, End));
           assert(VNI && "Missing def for complex mapped parent PHI");
           if (End >= BlockEnd)
             LRC.setLiveOutValue(MBB, VNI); // Live-out as well.
@@ -919,10 +920,10 @@ bool SplitEditor::transferValues() {
           // This block needs a live-in value.  The last block covered may not
           // be live-out.
           if (End < BlockEnd)
-            LRC.addLiveInBlock(LI, MDT[MBB], End);
+            LRC.addLiveInBlock(LR, MDT[MBB], End);
           else {
             // Live-through, and we don't know the value.
-            LRC.addLiveInBlock(LI, MDT[MBB]);
+            LRC.addLiveInBlock(LR, MDT[MBB]);
             LRC.setLiveOutValue(MBB, 0);
           }
         }
@@ -949,7 +950,7 @@ void SplitEditor::extendPHIKillRanges() {
     if (PHIVNI->isUnused() || !PHIVNI->isPHIDef())
       continue;
     unsigned RegIdx = RegAssign.lookup(PHIVNI->def);
-    LiveInterval *LI = Edit->get(RegIdx);
+    LiveRange &LR = LIS.getInterval(Edit->get(RegIdx));
     LiveRangeCalc &LRC = getLRCalc(RegIdx);
     MachineBasicBlock *MBB = LIS.getMBBFromIndex(PHIVNI->def);
     for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
@@ -961,7 +962,7 @@ void SplitEditor::extendPHIKillRanges() {
       if (Edit->getParent().liveAt(LastUse)) {
         assert(RegAssign.lookup(LastUse) == RegIdx &&
                "Different register assignment in phi predecessor");
-        LRC.extend(LI, End);
+        LRC.extend(LR, End);
       }
     }
   }
@@ -990,7 +991,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
 
     // Rewrite to the mapped register at Idx.
     unsigned RegIdx = RegAssign.lookup(Idx);
-    LiveInterval *LI = Edit->get(RegIdx);
+    LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
     MO.setReg(LI->reg);
     DEBUG(dbgs() << "  rewr BB#" << MI->getParent()->getNumber() << '\t'
                  << Idx << ':' << RegIdx << '\t' << *MI);
@@ -1011,14 +1012,14 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
     } else
       Idx = Idx.getRegSlot(true);
 
-    getLRCalc(RegIdx).extend(LI, Idx.getNextSlot());
+    getLRCalc(RegIdx).extend(*LI, Idx.getNextSlot());
   }
 }
 
 void SplitEditor::deleteRematVictims() {
   SmallVector<MachineInstr*, 8> Dead;
   for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I){
-    LiveInterval *LI = *I;
+    LiveInterval *LI = &LIS.getInterval(*I);
     for (LiveInterval::const_iterator LII = LI->begin(), LIE = LI->end();
            LII != LIE; ++LII) {
       // Dead defs end at the dead slot.
@@ -1091,8 +1092,10 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
     deleteRematVictims();
 
   // Get rid of unused values and set phi-kill flags.
-  for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I)
-    (*I)->RenumberValues(LIS);
+  for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I) {
+    LiveInterval &LI = LIS.getInterval(*I);
+    LI.RenumberValues();
+  }
 
   // Provide a reverse mapping from original indices to Edit ranges.
   if (LRMap) {
@@ -1105,7 +1108,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
   ConnectedVNInfoEqClasses ConEQ(LIS);
   for (unsigned i = 0, e = Edit->size(); i != e; ++i) {
     // Don't use iterators, they are invalidated by create() below.
-    LiveInterval *li = Edit->get(i);
+    LiveInterval *li = &LIS.getInterval(Edit->get(i));
     unsigned NumComp = ConEQ.Classify(li);
     if (NumComp <= 1)
       continue;
@@ -1113,7 +1116,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
     SmallVector<LiveInterval*, 8> dups;
     dups.push_back(li);
     for (unsigned j = 1; j != NumComp; ++j)
-      dups.push_back(&Edit->create());
+      dups.push_back(&Edit->createEmptyInterval());
     ConEQ.Distribute(&dups[0], MRI);
     // The new intervals all map back to i.
     if (LRMap)
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index faaa6e7..3dbc050 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -170,7 +170,7 @@ private:
   /// slots to use the joint slots.
   void remapInstructions(DenseMap<int, int> &SlotRemap);
 
-  /// The input program may contain intructions which are not inside lifetime
+  /// The input program may contain instructions which are not inside lifetime
   /// markers. This can happen due to a bug in the compiler or due to a bug in
   /// user code (for example, returning a reference to a local variable).
   /// This procedure checks all of the instructions in the function and
@@ -450,14 +450,14 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
       SlotIndex F = Finishes[i];
       if (S < F) {
         // We have a single consecutive region.
-        Intervals[i]->addRange(LiveRange(S, F, ValNum));
+        Intervals[i]->addSegment(LiveInterval::Segment(S, F, ValNum));
       } else {
         // We have two non consecutive regions. This happens when
         // LIFETIME_START appears after the LIFETIME_END marker.
         SlotIndex NewStart = Indexes->getMBBStartIdx(MBB);
         SlotIndex NewFin = Indexes->getMBBEndIdx(MBB);
-        Intervals[i]->addRange(LiveRange(NewStart, F, ValNum));
-        Intervals[i]->addRange(LiveRange(S, NewFin, ValNum));
+        Intervals[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNum));
+        Intervals[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNum));
       }
     }
   }
@@ -763,7 +763,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
         // Merge disjoint slots.
         if (!First->overlaps(*Second)) {
           Changed = true;
-          First->MergeRangesInAsValue(*Second, First->getValNumInfo(0));
+          First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
           SlotRemap[SecondSlot] = FirstSlot;
           SortedSlots[J] = -1;
           DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
new file mode 100644
index 0000000..40893ea
--- /dev/null
+++ b/lib/CodeGen/StackMaps.cpp
@@ -0,0 +1,314 @@
+//===---------------------------- StackMaps.cpp ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stackmaps"
+
+#include "llvm/CodeGen/StackMaps.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <iterator>
+
+using namespace llvm;
+
+PatchPointOpers::PatchPointOpers(const MachineInstr *MI):
+  MI(MI),
+  HasDef(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
+         !MI->getOperand(0).isImplicit()),
+  IsAnyReg(MI->getOperand(getMetaIdx(CCPos)).getImm() == CallingConv::AnyReg) {
+
+#ifndef NDEBUG
+  {
+  unsigned CheckStartIdx = 0, e = MI->getNumOperands();
+  while (CheckStartIdx < e && MI->getOperand(CheckStartIdx).isReg() &&
+         MI->getOperand(CheckStartIdx).isDef() &&
+         !MI->getOperand(CheckStartIdx).isImplicit())
+    ++CheckStartIdx;
+
+  assert(getMetaIdx() == CheckStartIdx &&
+         "Unexpected additonal definition in Patchpoint intrinsic.");
+  }
+#endif
+}
+
+unsigned PatchPointOpers::getNextScratchIdx(unsigned StartIdx) const {
+  if (!StartIdx)
+    StartIdx = getVarIdx();
+
+  // Find the next scratch register (implicit def and early clobber)
+  unsigned ScratchIdx = StartIdx, e = MI->getNumOperands();
+  while (ScratchIdx < e &&
+         !(MI->getOperand(ScratchIdx).isReg() &&
+           MI->getOperand(ScratchIdx).isDef() &&
+           MI->getOperand(ScratchIdx).isImplicit() &&
+           MI->getOperand(ScratchIdx).isEarlyClobber()))
+    ++ScratchIdx;
+
+  assert(ScratchIdx != e && "No scratch register available");
+  return ScratchIdx;
+}
+
+void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint32_t ID,
+                                    MachineInstr::const_mop_iterator MOI,
+                                    MachineInstr::const_mop_iterator MOE,
+                                    bool recordResult) {
+
+  MCContext &OutContext = AP.OutStreamer.getContext();
+  MCSymbol *MILabel = OutContext.CreateTempSymbol();
+  AP.OutStreamer.EmitLabel(MILabel);
+
+  LocationVec CallsiteLocs;
+
+  if (recordResult) {
+    std::pair<Location, MachineInstr::const_mop_iterator> ParseResult =
+      OpParser(MI.operands_begin(), llvm::next(MI.operands_begin()), AP.TM);
+
+    Location &Loc = ParseResult.first;
+    assert(Loc.LocType == Location::Register &&
+           "Stackmap return location must be a register.");
+    CallsiteLocs.push_back(Loc);
+  }
+
+  while (MOI != MOE) {
+    std::pair<Location, MachineInstr::const_mop_iterator> ParseResult =
+      OpParser(MOI, MOE, AP.TM);
+
+    Location &Loc = ParseResult.first;
+
+    // Move large constants into the constant pool.
+    if (Loc.LocType == Location::Constant && (Loc.Offset & ~0xFFFFFFFFULL)) {
+      Loc.LocType = Location::ConstantIndex;
+      Loc.Offset = ConstPool.getConstantIndex(Loc.Offset);
+    }
+
+    CallsiteLocs.push_back(Loc);
+    MOI = ParseResult.second;
+  }
+
+  const MCExpr *CSOffsetExpr = MCBinaryExpr::CreateSub(
+    MCSymbolRefExpr::Create(MILabel, OutContext),
+    MCSymbolRefExpr::Create(AP.CurrentFnSym, OutContext),
+    OutContext);
+
+  CSInfos.push_back(CallsiteInfo(CSOffsetExpr, ID, CallsiteLocs));
+}
+
+static MachineInstr::const_mop_iterator
+getStackMapEndMOP(MachineInstr::const_mop_iterator MOI,
+                  MachineInstr::const_mop_iterator MOE) {
+  for (; MOI != MOE; ++MOI)
+    if (MOI->isRegMask() || (MOI->isReg() && MOI->isImplicit()))
+      break;
+
+  return MOI;
+}
+
+void StackMaps::recordStackMap(const MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::STACKMAP && "exected stackmap");
+
+  int64_t ID = MI.getOperand(0).getImm();
+  assert((int32_t)ID == ID && "Stack maps hold 32-bit IDs");
+  recordStackMapOpers(MI, ID, llvm::next(MI.operands_begin(), 2),
+                      getStackMapEndMOP(MI.operands_begin(),
+                                        MI.operands_end()));
+}
+
+void StackMaps::recordPatchPoint(const MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::PATCHPOINT && "exected stackmap");
+
+  PatchPointOpers opers(&MI);
+  int64_t ID = opers.getMetaOper(PatchPointOpers::IDPos).getImm();
+  assert((int32_t)ID == ID && "Stack maps hold 32-bit IDs");
+  MachineInstr::const_mop_iterator MOI =
+    llvm::next(MI.operands_begin(), opers.getStackMapStartIdx());
+  recordStackMapOpers(MI, ID, MOI, getStackMapEndMOP(MOI, MI.operands_end()),
+                      opers.isAnyReg() && opers.hasDef());
+
+#ifndef NDEBUG
+  // verify anyregcc
+  LocationVec &Locations = CSInfos.back().Locations;
+  if (opers.isAnyReg()) {
+    unsigned NArgs = opers.getMetaOper(PatchPointOpers::NArgPos).getImm();
+    for (unsigned i = 0, e = (opers.hasDef() ? NArgs+1 : NArgs); i != e; ++i)
+      assert(Locations[i].LocType == Location::Register &&
+             "anyreg arg must be in reg.");
+  }
+#endif
+}
+
+/// serializeToStackMapSection conceptually populates the following fields:
+///
+/// uint32 : Reserved (header)
+/// uint32 : NumConstants
+/// int64  : Constants[NumConstants]
+/// uint32 : NumRecords
+/// StkMapRecord[NumRecords] {
+///   uint32 : PatchPoint ID
+///   uint32 : Instruction Offset
+///   uint16 : Reserved (record flags)
+///   uint16 : NumLocations
+///   Location[NumLocations] {
+///     uint8  : Register | Direct | Indirect | Constant | ConstantIndex
+///     uint8  : Size in Bytes
+///     uint16 : Dwarf RegNum
+///     int32  : Offset
+///   }
+/// }
+///
+/// Location Encoding, Type, Value:
+///   0x1, Register, Reg                 (value in register)
+///   0x2, Direct, Reg + Offset          (frame index)
+///   0x3, Indirect, [Reg + Offset]      (spilled value)
+///   0x4, Constant, Offset              (small constant)
+///   0x5, ConstIndex, Constants[Offset] (large constant)
+///
+void StackMaps::serializeToStackMapSection() {
+  // Bail out if there's no stack map data.
+  if (CSInfos.empty())
+    return;
+
+  MCContext &OutContext = AP.OutStreamer.getContext();
+  const TargetRegisterInfo *TRI = AP.TM.getRegisterInfo();
+
+  // Create the section.
+  const MCSection *StackMapSection =
+    OutContext.getObjectFileInfo()->getStackMapSection();
+  AP.OutStreamer.SwitchSection(StackMapSection);
+
+  // Emit a dummy symbol to force section inclusion.
+  AP.OutStreamer.EmitLabel(
+    OutContext.GetOrCreateSymbol(Twine("__LLVM_StackMaps")));
+
+  // Serialize data.
+  const char *WSMP = "Stack Maps: ";
+  (void)WSMP;
+  const MCRegisterInfo &MCRI = *OutContext.getRegisterInfo();
+
+  DEBUG(dbgs() << "********** Stack Map Output **********\n");
+
+  // Header.
+  AP.OutStreamer.EmitIntValue(0, 4);
+
+  // Num constants.
+  AP.OutStreamer.EmitIntValue(ConstPool.getNumConstants(), 4);
+
+  // Constant pool entries.
+  for (unsigned i = 0; i < ConstPool.getNumConstants(); ++i)
+    AP.OutStreamer.EmitIntValue(ConstPool.getConstant(i), 8);
+
+  DEBUG(dbgs() << WSMP << "#callsites = " << CSInfos.size() << "\n");
+  AP.OutStreamer.EmitIntValue(CSInfos.size(), 4);
+
+  for (CallsiteInfoList::const_iterator CSII = CSInfos.begin(),
+                                        CSIE = CSInfos.end();
+       CSII != CSIE; ++CSII) {
+
+    unsigned CallsiteID = CSII->ID;
+    const LocationVec &CSLocs = CSII->Locations;
+
+    DEBUG(dbgs() << WSMP << "callsite " << CallsiteID << "\n");
+
+    // Verify stack map entry. It's better to communicate a problem to the
+    // runtime than crash in case of in-process compilation. Currently, we do
+    // simple overflow checks, but we may eventually communicate other
+    // compilation errors this way.
+    if (CSLocs.size() > UINT16_MAX) {
+      AP.OutStreamer.EmitIntValue(UINT32_MAX, 4); // Invalid ID.
+      AP.OutStreamer.EmitValue(CSII->CSOffsetExpr, 4);
+      AP.OutStreamer.EmitIntValue(0, 2); // Reserved.
+      AP.OutStreamer.EmitIntValue(0, 2); // 0 locations.
+      continue;
+    }
+
+    AP.OutStreamer.EmitIntValue(CallsiteID, 4);
+    AP.OutStreamer.EmitValue(CSII->CSOffsetExpr, 4);
+
+    // Reserved for flags.
+    AP.OutStreamer.EmitIntValue(0, 2);
+
+    DEBUG(dbgs() << WSMP << "  has " << CSLocs.size() << " locations\n");
+
+    AP.OutStreamer.EmitIntValue(CSLocs.size(), 2);
+
+    unsigned operIdx = 0;
+    for (LocationVec::const_iterator LocI = CSLocs.begin(), LocE = CSLocs.end();
+         LocI != LocE; ++LocI, ++operIdx) {
+      const Location &Loc = *LocI;
+      DEBUG(
+        dbgs() << WSMP << "  Loc " << operIdx << ": ";
+        switch (Loc.LocType) {
+        case Location::Unprocessed:
+          dbgs() << "<Unprocessed operand>";
+          break;
+        case Location::Register:
+          dbgs() << "Register " << MCRI.getName(Loc.Reg);
+          break;
+        case Location::Direct:
+          dbgs() << "Direct " << MCRI.getName(Loc.Reg);
+          if (Loc.Offset)
+            dbgs() << " + " << Loc.Offset;
+          break;
+        case Location::Indirect:
+          dbgs() << "Indirect " << MCRI.getName(Loc.Reg)
+                 << " + " << Loc.Offset;
+          break;
+        case Location::Constant:
+          dbgs() << "Constant " << Loc.Offset;
+          break;
+        case Location::ConstantIndex:
+          dbgs() << "Constant Index " << Loc.Offset;
+          break;
+        }
+        dbgs() << "\n";
+      );
+
+      unsigned RegNo = 0;
+      int Offset = Loc.Offset;
+      if(Loc.Reg) {
+        RegNo = MCRI.getDwarfRegNum(Loc.Reg, false);
+        for (MCSuperRegIterator SR(Loc.Reg, TRI);
+             SR.isValid() && (int)RegNo < 0; ++SR) {
+          RegNo = TRI->getDwarfRegNum(*SR, false);
+        }
+        // If this is a register location, put the subregister byte offset in
+        // the location offset.
+        if (Loc.LocType == Location::Register) {
+          assert(!Loc.Offset && "Register location should have zero offset");
+          unsigned LLVMRegNo = MCRI.getLLVMRegNum(RegNo, false);
+          unsigned SubRegIdx = MCRI.getSubRegIndex(LLVMRegNo, Loc.Reg);
+          if (SubRegIdx)
+            Offset = MCRI.getSubRegIdxOffset(SubRegIdx);
+        }
+      }
+      else {
+        assert(Loc.LocType != Location::Register &&
+               "Missing location register");
+      }
+      AP.OutStreamer.EmitIntValue(Loc.LocType, 1);
+      AP.OutStreamer.EmitIntValue(Loc.Size, 1);
+      AP.OutStreamer.EmitIntValue(RegNo, 2);
+      AP.OutStreamer.EmitIntValue(Offset, 4);
+    }
+  }
+
+  AP.OutStreamer.AddBlankLine();
+
+  CSInfos.clear();
+}
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index 4c56380..9020449 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -15,11 +15,13 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "stack-protector"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -27,12 +29,12 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLowering.h"
 #include <cstdlib>
 using namespace llvm;
 
@@ -40,137 +42,93 @@ STATISTIC(NumFunProtected, "Number of functions protected");
 STATISTIC(NumAddrTaken, "Number of local variables that have their address"
                         " taken.");
 
-namespace {
-  class StackProtector : public FunctionPass {
-    const TargetMachine *TM;
-
-    /// TLI - Keep a pointer of a TargetLowering to consult for determining
-    /// target type sizes.
-    const TargetLoweringBase *TLI;
-    const Triple Trip;
-
-    Function *F;
-    Module *M;
-
-    DominatorTree *DT;
-
-    /// \brief The minimum size of buffers that will receive stack smashing
-    /// protection when -fstack-protection is used.
-    unsigned SSPBufferSize;
-
-    /// VisitedPHIs - The set of PHI nodes visited when determining
-    /// if a variable's reference has been taken.  This set 
-    /// is maintained to ensure we don't visit the same PHI node multiple
-    /// times.
-    SmallPtrSet<const PHINode*, 16> VisitedPHIs;
-
-    /// InsertStackProtectors - Insert code into the prologue and epilogue of
-    /// the function.
-    ///
-    ///  - The prologue code loads and stores the stack guard onto the stack.
-    ///  - The epilogue checks the value stored in the prologue against the
-    ///    original value. It calls __stack_chk_fail if they differ.
-    bool InsertStackProtectors();
-
-    /// CreateFailBB - Create a basic block to jump to when the stack protector
-    /// check fails.
-    BasicBlock *CreateFailBB();
-
-    /// ContainsProtectableArray - Check whether the type either is an array or
-    /// contains an array of sufficient size so that we need stack protectors
-    /// for it.
-    bool ContainsProtectableArray(Type *Ty, bool Strong = false,
-                                  bool InStruct = false) const;
-
-    /// \brief Check whether a stack allocation has its address taken.
-    bool HasAddressTaken(const Instruction *AI);
-
-    /// RequiresStackProtector - Check whether or not this function needs a
-    /// stack protector based upon the stack protector level.
-    bool RequiresStackProtector();
-  public:
-    static char ID;             // Pass identification, replacement for typeid.
-    StackProtector() : FunctionPass(ID), TM(0), TLI(0), SSPBufferSize(0) {
-      initializeStackProtectorPass(*PassRegistry::getPassRegistry());
-    }
-    StackProtector(const TargetMachine *TM)
-      : FunctionPass(ID), TM(TM), TLI(0), Trip(TM->getTargetTriple()),
-        SSPBufferSize(8) {
-      initializeStackProtectorPass(*PassRegistry::getPassRegistry());
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addPreserved<DominatorTree>();
-    }
-
-    virtual bool runOnFunction(Function &Fn);
-  };
-} // end anonymous namespace
+static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp",
+                                          cl::init(true), cl::Hidden);
 
 char StackProtector::ID = 0;
-INITIALIZE_PASS(StackProtector, "stack-protector",
-                "Insert stack protectors", false, false)
+INITIALIZE_PASS(StackProtector, "stack-protector", "Insert stack protectors",
+                false, true)
 
 FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) {
   return new StackProtector(TM);
 }
 
+StackProtector::SSPLayoutKind
+StackProtector::getSSPLayout(const AllocaInst *AI) const {
+  return AI ? Layout.lookup(AI) : SSPLK_None;
+}
+
 bool StackProtector::runOnFunction(Function &Fn) {
   F = &Fn;
   M = F->getParent();
   DT = getAnalysisIfAvailable<DominatorTree>();
   TLI = TM->getTargetLowering();
 
-  if (!RequiresStackProtector()) return false;
+  if (!RequiresStackProtector())
+    return false;
 
-  Attribute Attr =
-    Fn.getAttributes().getAttribute(AttributeSet::FunctionIndex,
-                                    "stack-protector-buffer-size");
+  Attribute Attr = Fn.getAttributes().getAttribute(
+      AttributeSet::FunctionIndex, "stack-protector-buffer-size");
   if (Attr.isStringAttribute())
-    SSPBufferSize = atoi(Attr.getValueAsString().data());
+    Attr.getValueAsString().getAsInteger(10, SSPBufferSize);
 
   ++NumFunProtected;
   return InsertStackProtectors();
 }
 
-/// ContainsProtectableArray - Check whether the type either is an array or
-/// contains a char array of sufficient size so that we need stack protectors
-/// for it.
-bool StackProtector::ContainsProtectableArray(Type *Ty, bool Strong,
+/// \param [out] IsLarge is set to true if a protectable array is found and
+/// it is "large" ( >= ssp-buffer-size).  In the case of a structure with
+/// multiple arrays, this gets set if any of them is large.
+bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge,
+                                              bool Strong,
                                               bool InStruct) const {
-  if (!Ty) return false;
+  if (!Ty)
+    return false;
   if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
-    // In strong mode any array, regardless of type and size, triggers a
-    // protector
-    if (Strong)
-      return true;
     if (!AT->getElementType()->isIntegerTy(8)) {
       // If we're on a non-Darwin platform or we're inside of a structure, don't
       // add stack protectors unless the array is a character array.
-      if (InStruct || !Trip.isOSDarwin())
-          return false;
+      // However, in strong mode any array, regardless of type and size,
+      // triggers a protector.
+      if (!Strong && (InStruct || !Trip.isOSDarwin()))
+        return false;
     }
 
     // If an array has more than SSPBufferSize bytes of allocated space, then we
     // emit stack protectors.
-    if (SSPBufferSize <= TLI->getDataLayout()->getTypeAllocSize(AT))
+    if (SSPBufferSize <= TLI->getDataLayout()->getTypeAllocSize(AT)) {
+      IsLarge = true;
+      return true;
+    }
+
+    if (Strong)
+      // Require a protector for all arrays in strong mode
       return true;
   }
 
   const StructType *ST = dyn_cast<StructType>(Ty);
-  if (!ST) return false;
+  if (!ST)
+    return false;
 
+  bool NeedsProtector = false;
   for (StructType::element_iterator I = ST->element_begin(),
-         E = ST->element_end(); I != E; ++I)
-    if (ContainsProtectableArray(*I, Strong, true))
-      return true;
+                                    E = ST->element_end();
+       I != E; ++I)
+    if (ContainsProtectableArray(*I, IsLarge, Strong, true)) {
+      // If the element is a protectable array and is large (>= SSPBufferSize)
+      // then we are done.  If the protectable array is not large, then
+      // keep looking in case a subsequent element is a large array.
+      if (IsLarge)
+        return true;
+      NeedsProtector = true;
+    }
 
-  return false;
+  return NeedsProtector;
 }
 
 bool StackProtector::HasAddressTaken(const Instruction *AI) {
   for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
-        UI != UE; ++UI) {
+       UI != UE; ++UI) {
     const User *U = *UI;
     if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
       if (AI == SI->getValueOperand())
@@ -217,11 +175,13 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
 /// address taken.
 bool StackProtector::RequiresStackProtector() {
   bool Strong = false;
+  bool NeedsProtector = false;
   if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::StackProtectReq))
-    return true;
-  else if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::StackProtectStrong))
+                                      Attribute::StackProtectReq)) {
+    NeedsProtector = true;
+    Strong = true; // Use the same heuristic as strong to determine SSPLayout
+  } else if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                             Attribute::StackProtectStrong))
     Strong = true;
   else if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                             Attribute::StackProtect))
@@ -230,39 +190,116 @@ bool StackProtector::RequiresStackProtector() {
   for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
     BasicBlock *BB = I;
 
-    for (BasicBlock::iterator
-           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
+         ++II) {
       if (AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
         if (AI->isArrayAllocation()) {
           // SSP-Strong: Enable protectors for any call to alloca, regardless
           // of size.
           if (Strong)
             return true;
-  
+
           if (const ConstantInt *CI =
-               dyn_cast<ConstantInt>(AI->getArraySize())) {
-            if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize)
+                  dyn_cast<ConstantInt>(AI->getArraySize())) {
+            if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
               // A call to alloca with size >= SSPBufferSize requires
               // stack protectors.
-              return true;
+              Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+              NeedsProtector = true;
+            } else if (Strong) {
+              // Require protectors for all alloca calls in strong mode.
+              Layout.insert(std::make_pair(AI, SSPLK_SmallArray));
+              NeedsProtector = true;
+            }
           } else {
             // A call to alloca with a variable size requires protectors.
-            return true;
+            Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+            NeedsProtector = true;
           }
+          continue;
         }
 
-        if (ContainsProtectableArray(AI->getAllocatedType(), Strong))
-          return true;
+        bool IsLarge = false;
+        if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) {
+          Layout.insert(std::make_pair(AI, IsLarge ? SSPLK_LargeArray
+                                                   : SSPLK_SmallArray));
+          NeedsProtector = true;
+          continue;
+        }
 
         if (Strong && HasAddressTaken(AI)) {
-          ++NumAddrTaken; 
-          return true;
+          ++NumAddrTaken;
+          Layout.insert(std::make_pair(AI, SSPLK_AddrOf));
+          NeedsProtector = true;
         }
       }
     }
   }
 
-  return false;
+  return NeedsProtector;
+}
+
+static bool InstructionWillNotHaveChain(const Instruction *I) {
+  return !I->mayHaveSideEffects() && !I->mayReadFromMemory() &&
+         isSafeToSpeculativelyExecute(I);
+}
+
+/// Identify if RI has a previous instruction in the "Tail Position" and return
+/// it. Otherwise return 0.
+///
+/// This is based off of the code in llvm::isInTailCallPosition. The difference
+/// is that it inverts the first part of llvm::isInTailCallPosition since
+/// isInTailCallPosition is checking if a call is in a tail call position, and
+/// we are searching for an unknown tail call that might be in the tail call
+/// position. Once we find the call though, the code uses the same refactored
+/// code, returnTypeIsEligibleForTailCall.
+static CallInst *FindPotentialTailCall(BasicBlock *BB, ReturnInst *RI,
+                                       const TargetLoweringBase *TLI) {
+  // Establish a reasonable upper bound on the maximum amount of instructions we
+  // will look through to find a tail call.
+  unsigned SearchCounter = 0;
+  const unsigned MaxSearch = 4;
+  bool NoInterposingChain = true;
+
+  for (BasicBlock::reverse_iterator I = llvm::next(BB->rbegin()),
+                                    E = BB->rend();
+       I != E && SearchCounter < MaxSearch; ++I) {
+    Instruction *Inst = &*I;
+
+    // Skip over debug intrinsics and do not allow them to affect our MaxSearch
+    // counter.
+    if (isa<DbgInfoIntrinsic>(Inst))
+      continue;
+
+    // If we find a call and the following conditions are satisifed, then we
+    // have found a tail call that satisfies at least the target independent
+    // requirements of a tail call:
+    //
+    // 1. The call site has the tail marker.
+    //
+    // 2. The call site either will not cause the creation of a chain or if a
+    // chain is necessary there are no instructions in between the callsite and
+    // the call which would create an interposing chain.
+    //
+    // 3. The return type of the function does not impede tail call
+    // optimization.
+    if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
+      if (CI->isTailCall() &&
+          (InstructionWillNotHaveChain(CI) || NoInterposingChain) &&
+          returnTypeIsEligibleForTailCall(BB->getParent(), CI, RI, *TLI))
+        return CI;
+    }
+
+    // If we did not find a call see if we have an instruction that may create
+    // an interposing chain.
+    NoInterposingChain =
+        NoInterposingChain && InstructionWillNotHaveChain(Inst);
+
+    // Increment max search.
+    SearchCounter++;
+  }
+
+  return 0;
 }
 
 /// Insert code into the entry block that stores the __stack_chk_guard
@@ -273,36 +310,36 @@ bool StackProtector::RequiresStackProtector() {
 ///     StackGuard = load __stack_chk_guard
 ///     call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
 ///
-static void CreatePrologue(Function *F, Module *M, ReturnInst *RI,
+/// Returns true if the platform/triple supports the stackprotectorcreate pseudo
+/// node.
+static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI,
                            const TargetLoweringBase *TLI, const Triple &Trip,
                            AllocaInst *&AI, Value *&StackGuardVar) {
+  bool SupportsSelectionDAGSP = false;
   PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
   unsigned AddressSpace, Offset;
   if (TLI->getStackCookieLocation(AddressSpace, Offset)) {
     Constant *OffsetVal =
-      ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
-    
-    StackGuardVar = ConstantExpr::getIntToPtr(OffsetVal,
-                                              PointerType::get(PtrTy,
-                                                               AddressSpace));
+        ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
+
+    StackGuardVar = ConstantExpr::getIntToPtr(
+        OffsetVal, PointerType::get(PtrTy, AddressSpace));
   } else if (Trip.getOS() == llvm::Triple::OpenBSD) {
     StackGuardVar = M->getOrInsertGlobal("__guard_local", PtrTy);
     cast<GlobalValue>(StackGuardVar)
-      ->setVisibility(GlobalValue::HiddenVisibility);
+        ->setVisibility(GlobalValue::HiddenVisibility);
   } else {
+    SupportsSelectionDAGSP = true;
     StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
   }
-  
-  BasicBlock &Entry = F->getEntryBlock();
-  Instruction *InsPt = &Entry.front();
-  
-  AI = new AllocaInst(PtrTy, "StackGuardSlot", InsPt);
-  LoadInst *LI = new LoadInst(StackGuardVar, "StackGuard", false, InsPt);
-  
-  Value *Args[] = { LI, AI };
-  CallInst::
-    Create(Intrinsic::getDeclaration(M, Intrinsic::stackprotector),
-           Args, "", InsPt);
+
+  IRBuilder<> B(&F->getEntryBlock().front());
+  AI = B.CreateAlloca(PtrTy, 0, "StackGuardSlot");
+  LoadInst *LI = B.CreateLoad(StackGuardVar, "StackGuard");
+  B.CreateCall2(Intrinsic::getDeclaration(M, Intrinsic::stackprotector), LI,
+                AI);
+
+  return SupportsSelectionDAGSP;
 }
 
 /// InsertStackProtectors - Insert code into the prologue and epilogue of the
@@ -312,72 +349,102 @@ static void CreatePrologue(Function *F, Module *M, ReturnInst *RI,
 ///  - The epilogue checks the value stored in the prologue against the original
 ///    value. It calls __stack_chk_fail if they differ.
 bool StackProtector::InsertStackProtectors() {
-  BasicBlock *FailBB = 0;       // The basic block to jump to if check fails.
-  BasicBlock *FailBBDom = 0;    // FailBB's dominator.
-  AllocaInst *AI = 0;           // Place on stack that stores the stack guard.
-  Value *StackGuardVar = 0;  // The stack guard variable.
+  bool HasPrologue = false;
+  bool SupportsSelectionDAGSP =
+      EnableSelectionDAGSP && !TM->Options.EnableFastISel;
+  AllocaInst *AI = 0;       // Place on stack that stores the stack guard.
+  Value *StackGuardVar = 0; // The stack guard variable.
 
-  for (Function::iterator I = F->begin(), E = F->end(); I != E; ) {
+  for (Function::iterator I = F->begin(), E = F->end(); I != E;) {
     BasicBlock *BB = I++;
     ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
-    if (!RI) continue;
+    if (!RI)
+      continue;
 
-    if (!FailBB) {
-      CreatePrologue(F, M, RI, TLI, Trip, AI, StackGuardVar);
-      // Create the basic block to jump to when the guard check fails.
-      FailBB = CreateFailBB();
+    if (!HasPrologue) {
+      HasPrologue = true;
+      SupportsSelectionDAGSP &=
+          CreatePrologue(F, M, RI, TLI, Trip, AI, StackGuardVar);
     }
 
-    // For each block with a return instruction, convert this:
-    //
-    //   return:
-    //     ...
-    //     ret ...
-    //
-    // into this:
-    //
-    //   return:
-    //     ...
-    //     %1 = load __stack_chk_guard
-    //     %2 = load StackGuardSlot
-    //     %3 = cmp i1 %1, %2
-    //     br i1 %3, label %SP_return, label %CallStackCheckFailBlk
-    //
-    //   SP_return:
-    //     ret ...
-    //
-    //   CallStackCheckFailBlk:
-    //     call void @__stack_chk_fail()
-    //     unreachable
-
-    // Split the basic block before the return instruction.
-    BasicBlock *NewBB = BB->splitBasicBlock(RI, "SP_return");
+    if (SupportsSelectionDAGSP) {
+      // Since we have a potential tail call, insert the special stack check
+      // intrinsic.
+      Instruction *InsertionPt = 0;
+      if (CallInst *CI = FindPotentialTailCall(BB, RI, TLI)) {
+        InsertionPt = CI;
+      } else {
+        InsertionPt = RI;
+        // At this point we know that BB has a return statement so it *DOES*
+        // have a terminator.
+        assert(InsertionPt != 0 && "BB must have a terminator instruction at "
+                                   "this point.");
+      }
 
-    if (DT && DT->isReachableFromEntry(BB)) {
-      DT->addNewBlock(NewBB, BB);
-      FailBBDom = FailBBDom ? DT->findNearestCommonDominator(FailBBDom, BB) :BB;
-    }
+      Function *Intrinsic =
+          Intrinsic::getDeclaration(M, Intrinsic::stackprotectorcheck);
+      CallInst::Create(Intrinsic, StackGuardVar, "", InsertionPt);
+
+    } else {
+      // If we do not support SelectionDAG based tail calls, generate IR level
+      // tail calls.
+      //
+      // For each block with a return instruction, convert this:
+      //
+      //   return:
+      //     ...
+      //     ret ...
+      //
+      // into this:
+      //
+      //   return:
+      //     ...
+      //     %1 = load __stack_chk_guard
+      //     %2 = load StackGuardSlot
+      //     %3 = cmp i1 %1, %2
+      //     br i1 %3, label %SP_return, label %CallStackCheckFailBlk
+      //
+      //   SP_return:
+      //     ret ...
+      //
+      //   CallStackCheckFailBlk:
+      //     call void @__stack_chk_fail()
+      //     unreachable
+
+      // Create the FailBB. We duplicate the BB every time since the MI tail
+      // merge pass will merge together all of the various BB into one including
+      // fail BB generated by the stack protector pseudo instruction.
+      BasicBlock *FailBB = CreateFailBB();
+
+      // Split the basic block before the return instruction.
+      BasicBlock *NewBB = BB->splitBasicBlock(RI, "SP_return");
+
+      // Update the dominator tree if we need to.
+      if (DT && DT->isReachableFromEntry(BB)) {
+        DT->addNewBlock(NewBB, BB);
+        DT->addNewBlock(FailBB, BB);
+      }
 
-    // Remove default branch instruction to the new BB.
-    BB->getTerminator()->eraseFromParent();
+      // Remove default branch instruction to the new BB.
+      BB->getTerminator()->eraseFromParent();
 
-    // Move the newly created basic block to the point right after the old basic
-    // block so that it's in the "fall through" position.
-    NewBB->moveAfter(BB);
+      // Move the newly created basic block to the point right after the old
+      // basic block so that it's in the "fall through" position.
+      NewBB->moveAfter(BB);
 
-    // Generate the stack protector instructions in the old basic block.
-    LoadInst *LI1 = new LoadInst(StackGuardVar, "", false, BB);
-    LoadInst *LI2 = new LoadInst(AI, "", true, BB);
-    ICmpInst *Cmp = new ICmpInst(*BB, CmpInst::ICMP_EQ, LI1, LI2, "");
-    BranchInst::Create(NewBB, FailBB, Cmp, BB);
+      // Generate the stack protector instructions in the old basic block.
+      IRBuilder<> B(BB);
+      LoadInst *LI1 = B.CreateLoad(StackGuardVar);
+      LoadInst *LI2 = B.CreateLoad(AI);
+      Value *Cmp = B.CreateICmpEQ(LI1, LI2);
+      B.CreateCondBr(Cmp, NewBB, FailBB);
+    }
   }
 
   // Return if we didn't modify any basic blocks. I.e., there are no return
   // statements in the function.
-  if (!FailBB) return false;
-
-  if (DT && FailBBDom)
-    DT->addNewBlock(FailBB, FailBBDom);
+  if (!HasPrologue)
+    return false;
 
   return true;
 }
@@ -387,29 +454,18 @@ bool StackProtector::InsertStackProtectors() {
 BasicBlock *StackProtector::CreateFailBB() {
   LLVMContext &Context = F->getContext();
   BasicBlock *FailBB = BasicBlock::Create(Context, "CallStackCheckFailBlk", F);
+  IRBuilder<> B(FailBB);
   if (Trip.getOS() == llvm::Triple::OpenBSD) {
     Constant *StackChkFail = M->getOrInsertFunction(
         "__stack_smash_handler", Type::getVoidTy(Context),
         Type::getInt8PtrTy(Context), NULL);
 
-    Constant *NameStr = ConstantDataArray::getString(Context, F->getName());
-    Constant *FuncName =
-        new GlobalVariable(*M, NameStr->getType(), true,
-                           GlobalVariable::PrivateLinkage, NameStr, "SSH");
-
-    SmallVector<Constant *, 2> IdxList;
-    IdxList.push_back(ConstantInt::get(Type::getInt8Ty(Context), 0));
-    IdxList.push_back(ConstantInt::get(Type::getInt8Ty(Context), 0));
-
-    SmallVector<Value *, 1> Args;
-    Args.push_back(ConstantExpr::getGetElementPtr(FuncName, IdxList));
-
-    CallInst::Create(StackChkFail, Args, "", FailBB);
+    B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));
   } else {
     Constant *StackChkFail = M->getOrInsertFunction(
         "__stack_chk_fail", Type::getVoidTy(Context), NULL);
-    CallInst::Create(StackChkFail, "", FailBB);
+    B.CreateCall(StackChkFail);
   }
-  new UnreachableInst(Context, FailBB);
+  B.CreateUnreachable();
   return FailBB;
 }
diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp
deleted file mode 100644
index b337c53..0000000
--- a/lib/CodeGen/StrongPHIElimination.cpp
+++ /dev/null
@@ -1,825 +0,0 @@
-//===- StrongPHIElimination.cpp - Eliminate PHI nodes by inserting copies -===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass eliminates PHI instructions by aggressively coalescing the copies
-// that would be inserted by a naive algorithm and only inserting the copies
-// that are necessary. The coalescing technique initially assumes that all
-// registers appearing in a PHI instruction do not interfere. It then eliminates
-// proven interferences, using dominators to only perform a linear number of
-// interference tests instead of the quadratic number of interference tests
-// that this would naively require. This is a technique derived from:
-// 
-//    Budimlic, et al. Fast copy coalescing and live-range identification.
-//    In Proceedings of the ACM SIGPLAN 2002 Conference on Programming Language
-//    Design and Implementation (Berlin, Germany, June 17 - 19, 2002).
-//    PLDI '02. ACM, New York, NY, 25-32.
-//
-// The original implementation constructs a data structure they call a dominance
-// forest for this purpose. The dominance forest was shown to be unnecessary,
-// as it is possible to emulate the creation and traversal of a dominance forest
-// by directly using the dominator tree, rather than actually constructing the
-// dominance forest.  This technique is explained in:
-//
-//   Boissinot, et al. Revisiting Out-of-SSA Translation for Correctness, Code
-//     Quality and Efficiency,
-//   In Proceedings of the 7th annual IEEE/ACM International Symposium on Code
-//   Generation and Optimization (Seattle, Washington, March 22 - 25, 2009).
-//   CGO '09. IEEE, Washington, DC, 114-125.
-//
-// Careful implementation allows for all of the dominator forest interference
-// checks to be performed at once in a single depth-first traversal of the
-// dominator tree, which is what is implemented here.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "strongphielim"
-#include "llvm/CodeGen/Passes.h"
-#include "PHIEliminationUtils.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetInstrInfo.h"
-using namespace llvm;
-
-namespace {
-  class StrongPHIElimination : public MachineFunctionPass {
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    StrongPHIElimination() : MachineFunctionPass(ID) {
-      initializeStrongPHIEliminationPass(*PassRegistry::getPassRegistry());
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage&) const;
-    bool runOnMachineFunction(MachineFunction&);
-
-  private:
-    /// This struct represents a single node in the union-find data structure
-    /// representing the variable congruence classes. There is one difference
-    /// from a normal union-find data structure. We steal two bits from the parent
-    /// pointer . One of these bits is used to represent whether the register
-    /// itself has been isolated, and the other is used to represent whether the
-    /// PHI with that register as its destination has been isolated.
-    ///
-    /// Note that this leads to the strange situation where the leader of a
-    /// congruence class may no longer logically be a member, due to being
-    /// isolated.
-    struct Node {
-      enum Flags {
-        kRegisterIsolatedFlag = 1,
-        kPHIIsolatedFlag = 2
-      };
-      Node(unsigned v) : value(v), rank(0) { parent.setPointer(this); }
-
-      Node *getLeader();
-
-      PointerIntPair<Node*, 2> parent;
-      unsigned value;
-      unsigned rank;
-    };
-
-    /// Add a register in a new congruence class containing only itself.
-    void addReg(unsigned);
-
-    /// Join the congruence classes of two registers. This function is biased
-    /// towards the left argument, i.e. after
-    ///
-    /// addReg(r2);
-    /// unionRegs(r1, r2);
-    ///
-    /// the leader of the unioned congruence class is the same as the leader of
-    /// r1's congruence class prior to the union. This is actually relied upon
-    /// in the copy insertion code.
-    void unionRegs(unsigned, unsigned);
-
-    /// Get the color of a register. The color is 0 if the register has been
-    /// isolated.
-    unsigned getRegColor(unsigned);
-
-    // Isolate a register.
-    void isolateReg(unsigned);
-
-    /// Get the color of a PHI. The color of a PHI is 0 if the PHI has been
-    /// isolated. Otherwise, it is the original color of its destination and
-    /// all of its operands (before they were isolated, if they were).
-    unsigned getPHIColor(MachineInstr*);
-
-    /// Isolate a PHI.
-    void isolatePHI(MachineInstr*);
-
-    /// Traverses a basic block, splitting any interferences found between
-    /// registers in the same congruence class. It takes two DenseMaps as
-    /// arguments that it also updates: CurrentDominatingParent, which maps
-    /// a color to the register in that congruence class whose definition was
-    /// most recently seen, and ImmediateDominatingParent, which maps a register
-    /// to the register in the same congruence class that most immediately
-    /// dominates it.
-    ///
-    /// This function assumes that it is being called in a depth-first traversal
-    /// of the dominator tree.
-    void SplitInterferencesForBasicBlock(
-      MachineBasicBlock&,
-      DenseMap<unsigned, unsigned> &CurrentDominatingParent,
-      DenseMap<unsigned, unsigned> &ImmediateDominatingParent);
-
-    // Lowers a PHI instruction, inserting copies of the source and destination
-    // registers as necessary.
-    void InsertCopiesForPHI(MachineInstr*, MachineBasicBlock*);
-
-    // Merges the live interval of Reg into NewReg and renames Reg to NewReg
-    // everywhere that Reg appears. Requires Reg and NewReg to have non-
-    // overlapping lifetimes.
-    void MergeLIsAndRename(unsigned Reg, unsigned NewReg);
-
-    MachineRegisterInfo *MRI;
-    const TargetInstrInfo *TII;
-    MachineDominatorTree *DT;
-    LiveIntervals *LI;
-
-    BumpPtrAllocator Allocator;
-
-    DenseMap<unsigned, Node*> RegNodeMap;
-
-    // Maps a basic block to a list of its defs of registers that appear as PHI
-    // sources.
-    DenseMap<MachineBasicBlock*, std::vector<MachineInstr*> > PHISrcDefs;
-
-    // Maps a color to a pair of a MachineInstr* and a virtual register, which
-    // is the operand of that PHI corresponding to the current basic block.
-    DenseMap<unsigned, std::pair<MachineInstr*, unsigned> > CurrentPHIForColor;
-
-    // FIXME: Can these two data structures be combined? Would a std::multimap
-    // be any better?
-
-    // Stores pairs of predecessor basic blocks and the source registers of
-    // inserted copy instructions.
-    typedef DenseSet<std::pair<MachineBasicBlock*, unsigned> > SrcCopySet;
-    SrcCopySet InsertedSrcCopySet;
-
-    // Maps pairs of predecessor basic blocks and colors to their defining copy
-    // instructions.
-    typedef DenseMap<std::pair<MachineBasicBlock*, unsigned>, MachineInstr*>
-      SrcCopyMap;
-    SrcCopyMap InsertedSrcCopyMap;
-
-    // Maps inserted destination copy registers to their defining copy
-    // instructions.
-    typedef DenseMap<unsigned, MachineInstr*> DestCopyMap;
-    DestCopyMap InsertedDestCopies;
-  };
-
-  struct MIIndexCompare {
-    MIIndexCompare(LiveIntervals *LiveIntervals) : LI(LiveIntervals) { }
-
-    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
-      return LI->getInstructionIndex(LHS) < LI->getInstructionIndex(RHS);
-    }
-
-    LiveIntervals *LI;
-  };
-} // namespace
-
-STATISTIC(NumPHIsLowered, "Number of PHIs lowered");
-STATISTIC(NumDestCopiesInserted, "Number of destination copies inserted");
-STATISTIC(NumSrcCopiesInserted, "Number of source copies inserted");
-
-char StrongPHIElimination::ID = 0;
-INITIALIZE_PASS_BEGIN(StrongPHIElimination, "strong-phi-node-elimination",
-  "Eliminate PHI nodes for register allocation, intelligently", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(StrongPHIElimination, "strong-phi-node-elimination",
-  "Eliminate PHI nodes for register allocation, intelligently", false, false)
-
-char &llvm::StrongPHIEliminationID = StrongPHIElimination::ID;
-
-void StrongPHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.setPreservesCFG();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addRequired<SlotIndexes>();
-  AU.addPreserved<SlotIndexes>();
-  AU.addRequired<LiveIntervals>();
-  AU.addPreserved<LiveIntervals>();
-  MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-static MachineOperand *findLastUse(MachineBasicBlock *MBB, unsigned Reg) {
-  // FIXME: This only needs to check from the first terminator, as only the
-  // first terminator can use a virtual register.
-  for (MachineBasicBlock::reverse_iterator RI = MBB->rbegin(); ; ++RI) {
-    assert (RI != MBB->rend());
-    MachineInstr *MI = &*RI;
-
-    for (MachineInstr::mop_iterator OI = MI->operands_begin(),
-         OE = MI->operands_end(); OI != OE; ++OI) {
-      MachineOperand &MO = *OI;
-      if (MO.isReg() && MO.isUse() && MO.getReg() == Reg)
-        return &MO;
-    }
-  }
-}
-
-bool StrongPHIElimination::runOnMachineFunction(MachineFunction &MF) {
-  MRI = &MF.getRegInfo();
-  TII = MF.getTarget().getInstrInfo();
-  DT = &getAnalysis<MachineDominatorTree>();
-  LI = &getAnalysis<LiveIntervals>();
-
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
-       I != E; ++I) {
-    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
-         BBI != BBE && BBI->isPHI(); ++BBI) {
-      unsigned DestReg = BBI->getOperand(0).getReg();
-      addReg(DestReg);
-      PHISrcDefs[I].push_back(BBI);
-
-      for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
-        MachineOperand &SrcMO = BBI->getOperand(i);
-        unsigned SrcReg = SrcMO.getReg();
-        addReg(SrcReg);
-        unionRegs(DestReg, SrcReg);
-
-        MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
-        if (DefMI)
-          PHISrcDefs[DefMI->getParent()].push_back(DefMI);
-      }
-    }
-  }
-
-  // Perform a depth-first traversal of the dominator tree, splitting
-  // interferences amongst PHI-congruence classes.
-  DenseMap<unsigned, unsigned> CurrentDominatingParent;
-  DenseMap<unsigned, unsigned> ImmediateDominatingParent;
-  for (df_iterator<MachineDomTreeNode*> DI = df_begin(DT->getRootNode()),
-       DE = df_end(DT->getRootNode()); DI != DE; ++DI) {
-    SplitInterferencesForBasicBlock(*DI->getBlock(),
-                                    CurrentDominatingParent,
-                                    ImmediateDominatingParent);
-  }
-
-  // Insert copies for all PHI source and destination registers.
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
-       I != E; ++I) {
-    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
-         BBI != BBE && BBI->isPHI(); ++BBI) {
-      InsertCopiesForPHI(BBI, I);
-    }
-  }
-
-  // FIXME: Preserve the equivalence classes during copy insertion and use
-  // the preversed equivalence classes instead of recomputing them.
-  RegNodeMap.clear();
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
-       I != E; ++I) {
-    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
-         BBI != BBE && BBI->isPHI(); ++BBI) {
-      unsigned DestReg = BBI->getOperand(0).getReg();
-      addReg(DestReg);
-
-      for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
-        unsigned SrcReg = BBI->getOperand(i).getReg();
-        addReg(SrcReg);
-        unionRegs(DestReg, SrcReg);
-      }
-    }
-  }
-
-  DenseMap<unsigned, unsigned> RegRenamingMap;
-  bool Changed = false;
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
-       I != E; ++I) {
-    MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
-    while (BBI != BBE && BBI->isPHI()) {
-      MachineInstr *PHI = BBI;
-
-      assert(PHI->getNumOperands() > 0);
-
-      unsigned SrcReg = PHI->getOperand(1).getReg();
-      unsigned SrcColor = getRegColor(SrcReg);
-      unsigned NewReg = RegRenamingMap[SrcColor];
-      if (!NewReg) {
-        NewReg = SrcReg;
-        RegRenamingMap[SrcColor] = SrcReg;
-      }
-      MergeLIsAndRename(SrcReg, NewReg);
-
-      unsigned DestReg = PHI->getOperand(0).getReg();
-      if (!InsertedDestCopies.count(DestReg))
-        MergeLIsAndRename(DestReg, NewReg);
-
-      for (unsigned i = 3; i < PHI->getNumOperands(); i += 2) {
-        unsigned SrcReg = PHI->getOperand(i).getReg();
-        MergeLIsAndRename(SrcReg, NewReg);
-      }
-
-      ++BBI;
-      LI->RemoveMachineInstrFromMaps(PHI);
-      PHI->eraseFromParent();
-      Changed = true;
-    }
-  }
-
-  // Due to the insertion of copies to split live ranges, the live intervals are
-  // guaranteed to not overlap, except in one case: an original PHI source and a
-  // PHI destination copy. In this case, they have the same value and thus don't
-  // truly intersect, so we merge them into the value live at that point.
-  // FIXME: Is there some better way we can handle this?
-  for (DestCopyMap::iterator I = InsertedDestCopies.begin(),
-       E = InsertedDestCopies.end(); I != E; ++I) {
-    unsigned DestReg = I->first;
-    unsigned DestColor = getRegColor(DestReg);
-    unsigned NewReg = RegRenamingMap[DestColor];
-
-    LiveInterval &DestLI = LI->getInterval(DestReg);
-    LiveInterval &NewLI = LI->getInterval(NewReg);
-
-    assert(DestLI.ranges.size() == 1
-           && "PHI destination copy's live interval should be a single live "
-               "range from the beginning of the BB to the copy instruction.");
-    LiveRange *DestLR = DestLI.begin();
-    VNInfo *NewVNI = NewLI.getVNInfoAt(DestLR->start);
-    if (!NewVNI) {
-      NewVNI = NewLI.createValueCopy(DestLR->valno, LI->getVNInfoAllocator());
-      MachineInstr *CopyInstr = I->second;
-      CopyInstr->getOperand(1).setIsKill(true);
-    }
-
-    LiveRange NewLR(DestLR->start, DestLR->end, NewVNI);
-    NewLI.addRange(NewLR);
-
-    LI->removeInterval(DestReg);
-    MRI->replaceRegWith(DestReg, NewReg);
-  }
-
-  // Adjust the live intervals of all PHI source registers to handle the case
-  // where the PHIs in successor blocks were the only later uses of the source
-  // register.
-  for (SrcCopySet::iterator I = InsertedSrcCopySet.begin(),
-       E = InsertedSrcCopySet.end(); I != E; ++I) {
-    MachineBasicBlock *MBB = I->first;
-    unsigned SrcReg = I->second;
-    if (unsigned RenamedRegister = RegRenamingMap[getRegColor(SrcReg)])
-      SrcReg = RenamedRegister;
-
-    LiveInterval &SrcLI = LI->getInterval(SrcReg);
-
-    bool isLiveOut = false;
-    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-         SE = MBB->succ_end(); SI != SE; ++SI) {
-      if (SrcLI.liveAt(LI->getMBBStartIdx(*SI))) {
-        isLiveOut = true;
-        break;
-      }
-    }
-
-    if (isLiveOut)
-      continue;
-
-    MachineOperand *LastUse = findLastUse(MBB, SrcReg);
-    assert(LastUse);
-    SlotIndex LastUseIndex = LI->getInstructionIndex(LastUse->getParent());
-    SrcLI.removeRange(LastUseIndex.getRegSlot(), LI->getMBBEndIdx(MBB));
-    LastUse->setIsKill(true);
-  }
-
-  Allocator.Reset();
-  RegNodeMap.clear();
-  PHISrcDefs.clear();
-  InsertedSrcCopySet.clear();
-  InsertedSrcCopyMap.clear();
-  InsertedDestCopies.clear();
-
-  return Changed;
-}
-
-void StrongPHIElimination::addReg(unsigned Reg) {
-  Node *&N = RegNodeMap[Reg];
-  if (!N)
-    N = new (Allocator) Node(Reg);
-}
-
-StrongPHIElimination::Node*
-StrongPHIElimination::Node::getLeader() {
-  Node *N = this;
-  Node *Parent = parent.getPointer();
-  Node *Grandparent = Parent->parent.getPointer();
-
-  while (Parent != Grandparent) {
-    N->parent.setPointer(Grandparent);
-    N = Grandparent;
-    Parent = Parent->parent.getPointer();
-    Grandparent = Parent->parent.getPointer();
-  }
-
-  return Parent;
-}
-
-unsigned StrongPHIElimination::getRegColor(unsigned Reg) {
-  DenseMap<unsigned, Node*>::iterator RI = RegNodeMap.find(Reg);
-  if (RI == RegNodeMap.end())
-    return 0;
-  Node *Node = RI->second;
-  if (Node->parent.getInt() & Node::kRegisterIsolatedFlag)
-    return 0;
-  return Node->getLeader()->value;
-}
-
-void StrongPHIElimination::unionRegs(unsigned Reg1, unsigned Reg2) {
-  Node *Node1 = RegNodeMap[Reg1]->getLeader();
-  Node *Node2 = RegNodeMap[Reg2]->getLeader();
-
-  if (Node1->rank > Node2->rank) {
-    Node2->parent.setPointer(Node1->getLeader());
-  } else if (Node1->rank < Node2->rank) {
-    Node1->parent.setPointer(Node2->getLeader());
-  } else if (Node1 != Node2) {
-    Node2->parent.setPointer(Node1->getLeader());
-    Node1->rank++;
-  }
-}
-
-void StrongPHIElimination::isolateReg(unsigned Reg) {
-  Node *Node = RegNodeMap[Reg];
-  Node->parent.setInt(Node->parent.getInt() | Node::kRegisterIsolatedFlag);
-}
-
-unsigned StrongPHIElimination::getPHIColor(MachineInstr *PHI) {
-  assert(PHI->isPHI());
-
-  unsigned DestReg = PHI->getOperand(0).getReg();
-  Node *DestNode = RegNodeMap[DestReg];
-  if (DestNode->parent.getInt() & Node::kPHIIsolatedFlag)
-    return 0;
-
-  for (unsigned i = 1; i < PHI->getNumOperands(); i += 2) {
-    unsigned SrcColor = getRegColor(PHI->getOperand(i).getReg());
-    if (SrcColor)
-      return SrcColor;
-  }
-  return 0;
-}
-
-void StrongPHIElimination::isolatePHI(MachineInstr *PHI) {
-  assert(PHI->isPHI());
-  Node *Node = RegNodeMap[PHI->getOperand(0).getReg()];
-  Node->parent.setInt(Node->parent.getInt() | Node::kPHIIsolatedFlag);
-}
-
-/// SplitInterferencesForBasicBlock - traverses a basic block, splitting any
-/// interferences found between registers in the same congruence class. It
-/// takes two DenseMaps as arguments that it also updates:
-///
-/// 1) CurrentDominatingParent, which maps a color to the register in that
-///    congruence class whose definition was most recently seen.
-///
-/// 2) ImmediateDominatingParent, which maps a register to the register in the
-///    same congruence class that most immediately dominates it.
-///
-/// This function assumes that it is being called in a depth-first traversal
-/// of the dominator tree.
-///
-/// The algorithm used here is a generalization of the dominance-based SSA test
-/// for two variables. If there are variables a_1, ..., a_n such that
-///
-///   def(a_1) dom ... dom def(a_n),
-///
-/// then we can test for an interference between any two a_i by only using O(n)
-/// interference tests between pairs of variables. If i < j and a_i and a_j
-/// interfere, then a_i is alive at def(a_j), so it is also alive at def(a_i+1).
-/// Thus, in order to test for an interference involving a_i, we need only check
-/// for a potential interference with a_i+1.
-///
-/// This method can be generalized to arbitrary sets of variables by performing
-/// a depth-first traversal of the dominator tree. As we traverse down a branch
-/// of the dominator tree, we keep track of the current dominating variable and
-/// only perform an interference test with that variable. However, when we go to
-/// another branch of the dominator tree, the definition of the current dominating
-/// variable may no longer dominate the current block. In order to correct this,
-/// we need to use a stack of past choices of the current dominating variable
-/// and pop from this stack until we find a variable whose definition actually
-/// dominates the current block.
-/// 
-/// There will be one push on this stack for each variable that has become the
-/// current dominating variable, so instead of using an explicit stack we can
-/// simply associate the previous choice for a current dominating variable with
-/// the new choice. This works better in our implementation, where we test for
-/// interference in multiple distinct sets at once.
-void
-StrongPHIElimination::SplitInterferencesForBasicBlock(
-    MachineBasicBlock &MBB,
-    DenseMap<unsigned, unsigned> &CurrentDominatingParent,
-    DenseMap<unsigned, unsigned> &ImmediateDominatingParent) {
-  // Sort defs by their order in the original basic block, as the code below
-  // assumes that it is processing definitions in dominance order.
-  std::vector<MachineInstr*> &DefInstrs = PHISrcDefs[&MBB];
-  std::sort(DefInstrs.begin(), DefInstrs.end(), MIIndexCompare(LI));
-
-  for (std::vector<MachineInstr*>::const_iterator BBI = DefInstrs.begin(),
-       BBE = DefInstrs.end(); BBI != BBE; ++BBI) {
-    for (MachineInstr::const_mop_iterator I = (*BBI)->operands_begin(),
-         E = (*BBI)->operands_end(); I != E; ++I) {
-      const MachineOperand &MO = *I;
-
-      // FIXME: This would be faster if it were possible to bail out of checking
-      // an instruction's operands after the explicit defs, but this is incorrect
-      // for variadic instructions, which may appear before register allocation
-      // in the future.
-      if (!MO.isReg() || !MO.isDef())
-        continue;
-
-      unsigned DestReg = MO.getReg();
-      if (!DestReg || !TargetRegisterInfo::isVirtualRegister(DestReg))
-        continue;
-
-      // If the virtual register being defined is not used in any PHI or has
-      // already been isolated, then there are no more interferences to check.
-      unsigned DestColor = getRegColor(DestReg);
-      if (!DestColor)
-        continue;
-
-      // The input to this pass sometimes is not in SSA form in every basic
-      // block, as some virtual registers have redefinitions. We could eliminate
-      // this by fixing the passes that generate the non-SSA code, or we could
-      // handle it here by tracking defining machine instructions rather than
-      // virtual registers. For now, we just handle the situation conservatively
-      // in a way that will possibly lead to false interferences.
-      unsigned &CurrentParent = CurrentDominatingParent[DestColor];
-      unsigned NewParent = CurrentParent;
-      if (NewParent == DestReg)
-        continue;
-
-      // Pop registers from the stack represented by ImmediateDominatingParent
-      // until we find a parent that dominates the current instruction.
-      while (NewParent && (!DT->dominates(MRI->getVRegDef(NewParent), *BBI)
-                           || !getRegColor(NewParent)))
-        NewParent = ImmediateDominatingParent[NewParent];
-
-      // If NewParent is nonzero, then its definition dominates the current
-      // instruction, so it is only necessary to check for the liveness of
-      // NewParent in order to check for an interference.
-      if (NewParent
-          && LI->getInterval(NewParent).liveAt(LI->getInstructionIndex(*BBI))) {
-        // If there is an interference, always isolate the new register. This
-        // could be improved by using a heuristic that decides which of the two
-        // registers to isolate.
-        isolateReg(DestReg);
-        CurrentParent = NewParent;
-      } else {
-        // If there is no interference, update ImmediateDominatingParent and set
-        // the CurrentDominatingParent for this color to the current register.
-        ImmediateDominatingParent[DestReg] = NewParent;
-        CurrentParent = DestReg;
-      }
-    }
-  }
-
-  // We now walk the PHIs in successor blocks and check for interferences. This
-  // is necessary because the use of a PHI's operands are logically contained in
-  // the predecessor block. The def of a PHI's destination register is processed
-  // along with the other defs in a basic block.
-
-  CurrentPHIForColor.clear();
-
-  for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
-       SE = MBB.succ_end(); SI != SE; ++SI) {
-    for (MachineBasicBlock::iterator BBI = (*SI)->begin(), BBE = (*SI)->end();
-         BBI != BBE && BBI->isPHI(); ++BBI) {
-      MachineInstr *PHI = BBI;
-
-      // If a PHI is already isolated, either by being isolated directly or
-      // having all of its operands isolated, ignore it.
-      unsigned Color = getPHIColor(PHI);
-      if (!Color)
-        continue;
-
-      // Find the index of the PHI operand that corresponds to this basic block.
-      unsigned PredIndex;
-      for (PredIndex = 1; PredIndex < PHI->getNumOperands(); PredIndex += 2) {
-        if (PHI->getOperand(PredIndex + 1).getMBB() == &MBB)
-          break;
-      }
-      assert(PredIndex < PHI->getNumOperands());
-      unsigned PredOperandReg = PHI->getOperand(PredIndex).getReg();
-
-      // Pop registers from the stack represented by ImmediateDominatingParent
-      // until we find a parent that dominates the current instruction.
-      unsigned &CurrentParent = CurrentDominatingParent[Color];
-      unsigned NewParent = CurrentParent;
-      while (NewParent
-             && (!DT->dominates(MRI->getVRegDef(NewParent)->getParent(), &MBB)
-                 || !getRegColor(NewParent)))
-        NewParent = ImmediateDominatingParent[NewParent];
-      CurrentParent = NewParent;
-
-      // If there is an interference with a register, always isolate the
-      // register rather than the PHI. It is also possible to isolate the
-      // PHI, but that introduces copies for all of the registers involved
-      // in that PHI.
-      if (NewParent && LI->isLiveOutOfMBB(LI->getInterval(NewParent), &MBB)
-                    && NewParent != PredOperandReg)
-        isolateReg(NewParent);
-
-      std::pair<MachineInstr*, unsigned>
-        &CurrentPHI = CurrentPHIForColor[Color];
-
-      // If two PHIs have the same operand from every shared predecessor, then
-      // they don't actually interfere. Otherwise, isolate the current PHI. This
-      // could possibly be improved, e.g. we could isolate the PHI with the
-      // fewest operands.
-      if (CurrentPHI.first && CurrentPHI.second != PredOperandReg)
-        isolatePHI(PHI);
-      else
-        CurrentPHI = std::make_pair(PHI, PredOperandReg);
-    }
-  }
-}
-
-void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI,
-                                              MachineBasicBlock *MBB) {
-  assert(PHI->isPHI());
-  ++NumPHIsLowered;
-  unsigned PHIColor = getPHIColor(PHI);
-
-  for (unsigned i = 1; i < PHI->getNumOperands(); i += 2) {
-    MachineOperand &SrcMO = PHI->getOperand(i);
-
-    // If a source is defined by an implicit def, there is no need to insert a
-    // copy in the predecessor.
-    if (SrcMO.isUndef())
-      continue;
-
-    unsigned SrcReg = SrcMO.getReg();
-    assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
-           "Machine PHI Operands must all be virtual registers!");
-
-    MachineBasicBlock *PredBB = PHI->getOperand(i + 1).getMBB();
-    unsigned SrcColor = getRegColor(SrcReg);
-
-    // If neither the PHI nor the operand were isolated, then we only need to
-    // set the phi-kill flag on the VNInfo at this PHI.
-    if (PHIColor && SrcColor == PHIColor) {
-      LiveInterval &SrcInterval = LI->getInterval(SrcReg);
-      SlotIndex PredIndex = LI->getMBBEndIdx(PredBB);
-      VNInfo *SrcVNI = SrcInterval.getVNInfoBefore(PredIndex);
-      (void)SrcVNI;
-      assert(SrcVNI);
-      continue;
-    }
-
-    unsigned CopyReg = 0;
-    if (PHIColor) {
-      SrcCopyMap::const_iterator I
-        = InsertedSrcCopyMap.find(std::make_pair(PredBB, PHIColor));
-      CopyReg
-        = I != InsertedSrcCopyMap.end() ? I->second->getOperand(0).getReg() : 0;
-    }
-
-    if (!CopyReg) {
-      const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
-      CopyReg = MRI->createVirtualRegister(RC);
-
-      MachineBasicBlock::iterator
-        CopyInsertPoint = findPHICopyInsertPoint(PredBB, MBB, SrcReg);
-      unsigned SrcSubReg = SrcMO.getSubReg();
-      MachineInstr *CopyInstr = BuildMI(*PredBB,
-                                        CopyInsertPoint,
-                                        PHI->getDebugLoc(),
-                                        TII->get(TargetOpcode::COPY),
-                                        CopyReg).addReg(SrcReg, 0, SrcSubReg);
-      LI->InsertMachineInstrInMaps(CopyInstr);
-      ++NumSrcCopiesInserted;
-
-      // addLiveRangeToEndOfBlock() also adds the phikill flag to the VNInfo for
-      // the newly added range.
-      LI->addLiveRangeToEndOfBlock(CopyReg, CopyInstr);
-      InsertedSrcCopySet.insert(std::make_pair(PredBB, SrcReg));
-
-      addReg(CopyReg);
-      if (PHIColor) {
-        unionRegs(PHIColor, CopyReg);
-        assert(getRegColor(CopyReg) != CopyReg);
-      } else {
-        PHIColor = CopyReg;
-        assert(getRegColor(CopyReg) == CopyReg);
-      }
-
-      // Insert into map if not already there.
-      InsertedSrcCopyMap.insert(std::make_pair(std::make_pair(PredBB, PHIColor),
-                                               CopyInstr));
-    }
-
-    SrcMO.setReg(CopyReg);
-
-    // If SrcReg is not live beyond the PHI, trim its interval so that it is no
-    // longer live-in to MBB. Note that SrcReg may appear in other PHIs that are
-    // processed later, but this is still correct to do at this point because we
-    // never rely on LiveIntervals being correct while inserting copies.
-    // FIXME: Should this just count uses at PHIs like the normal PHIElimination
-    // pass does?
-    LiveInterval &SrcLI = LI->getInterval(SrcReg);
-    SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
-    SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
-    SlotIndex NextInstrIndex = PHIIndex.getNextIndex();
-    if (SrcLI.liveAt(MBBStartIndex) && SrcLI.expiredAt(NextInstrIndex))
-      SrcLI.removeRange(MBBStartIndex, PHIIndex, true);
-  }
-
-  unsigned DestReg = PHI->getOperand(0).getReg();
-  unsigned DestColor = getRegColor(DestReg);
-
-  if (PHIColor && DestColor == PHIColor) {
-    LiveInterval &DestLI = LI->getInterval(DestReg);
-
-    // Set the phi-def flag for the VN at this PHI.
-    SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
-    VNInfo *DestVNI = DestLI.getVNInfoAt(PHIIndex.getRegSlot());
-    assert(DestVNI);
-  
-    // Prior to PHI elimination, the live ranges of PHIs begin at their defining
-    // instruction. After PHI elimination, PHI instructions are replaced by VNs
-    // with the phi-def flag set, and the live ranges of these VNs start at the
-    // beginning of the basic block.
-    SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
-    DestVNI->def = MBBStartIndex;
-    DestLI.addRange(LiveRange(MBBStartIndex,
-                              PHIIndex.getRegSlot(),
-                              DestVNI));
-    return;
-  }
-
-  const TargetRegisterClass *RC = MRI->getRegClass(DestReg);
-  unsigned CopyReg = MRI->createVirtualRegister(RC);
-
-  MachineInstr *CopyInstr = BuildMI(*MBB,
-                                    MBB->SkipPHIsAndLabels(MBB->begin()),
-                                    PHI->getDebugLoc(),
-                                    TII->get(TargetOpcode::COPY),
-                                    DestReg).addReg(CopyReg);
-  LI->InsertMachineInstrInMaps(CopyInstr);
-  PHI->getOperand(0).setReg(CopyReg);
-  ++NumDestCopiesInserted;
-
-  // Add the region from the beginning of MBB to the copy instruction to
-  // CopyReg's live interval, and give the VNInfo the phidef flag.
-  LiveInterval &CopyLI = LI->getOrCreateInterval(CopyReg);
-  SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
-  SlotIndex DestCopyIndex = LI->getInstructionIndex(CopyInstr);
-  VNInfo *CopyVNI = CopyLI.getNextValue(MBBStartIndex,
-                                        LI->getVNInfoAllocator());
-  CopyLI.addRange(LiveRange(MBBStartIndex,
-                            DestCopyIndex.getRegSlot(),
-                            CopyVNI));
-
-  // Adjust DestReg's live interval to adjust for its new definition at
-  // CopyInstr.
-  LiveInterval &DestLI = LI->getOrCreateInterval(DestReg);
-  SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
-  DestLI.removeRange(PHIIndex.getRegSlot(), DestCopyIndex.getRegSlot());
-
-  VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
-  assert(DestVNI);
-  DestVNI->def = DestCopyIndex.getRegSlot();
-
-  InsertedDestCopies[CopyReg] = CopyInstr;
-}
-
-void StrongPHIElimination::MergeLIsAndRename(unsigned Reg, unsigned NewReg) {
-  if (Reg == NewReg)
-    return;
-
-  LiveInterval &OldLI = LI->getInterval(Reg);
-  LiveInterval &NewLI = LI->getInterval(NewReg);
-
-  // Merge the live ranges of the two registers.
-  DenseMap<VNInfo*, VNInfo*> VNMap;
-  for (LiveInterval::iterator LRI = OldLI.begin(), LRE = OldLI.end();
-       LRI != LRE; ++LRI) {
-    LiveRange OldLR = *LRI;
-    VNInfo *OldVN = OldLR.valno;
-
-    VNInfo *&NewVN = VNMap[OldVN];
-    if (!NewVN) {
-      NewVN = NewLI.createValueCopy(OldVN, LI->getVNInfoAllocator());
-      VNMap[OldVN] = NewVN;
-    }
-
-    LiveRange LR(OldLR.start, OldLR.end, NewVN);
-    NewLI.addRange(LR);
-  }
-
-  // Remove the LiveInterval for the register being renamed and replace all
-  // of its defs and uses with the new register.
-  LI->removeInterval(Reg);
-  MRI->replaceRegWith(Reg, NewReg);
-}
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 8a1d567..ff0181e 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -638,8 +638,6 @@ bothUsedInPHI(const MachineBasicBlock &A,
 
 bool
 TailDuplicatePass::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
-  SmallPtrSet<MachineBasicBlock*, 8> Succs(BB.succ_begin(), BB.succ_end());
-
   for (MachineBasicBlock::pred_iterator PI = BB.pred_begin(),
        PE = BB.pred_end(); PI != PE; ++PI) {
     MachineBasicBlock *PredBB = *PI;
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index bb8bd42..bf4fd65 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -17,6 +17,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/CommandLine.h"
@@ -276,6 +277,36 @@ bool TargetInstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
   return false;
 }
 
+bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
+                                        unsigned SubIdx, unsigned &Size,
+                                        unsigned &Offset,
+                                        const TargetMachine *TM) const {
+  if (!SubIdx) {
+    Size = RC->getSize();
+    Offset = 0;
+    return true;
+  }
+  unsigned BitSize = TM->getRegisterInfo()->getSubRegIdxSize(SubIdx);
+  // Convert bit size to byte size to be consistent with
+  // MCRegisterClass::getSize().
+  if (BitSize % 8)
+    return false;
+
+  int BitOffset = TM->getRegisterInfo()->getSubRegIdxOffset(SubIdx);
+  if (BitOffset < 0 || BitOffset % 8)
+    return false;
+
+  Size = BitSize /= 8;
+  Offset = (unsigned)BitOffset / 8;
+
+  assert(RC->getSize() >= (Offset + Size) && "bad subregister range");
+
+  if (!TM->getDataLayout()->isLittleEndian()) {
+    Offset = RC->getSize() - (Offset + Size);
+  }
+  return true;
+}
+
 void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     unsigned DestReg,
@@ -364,6 +395,7 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
 
   // Ask the target to do the actual folding.
   if (MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, FI)) {
+    NewMI->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
     // Add a memory operand, foldMemoryOperandImpl doesn't do that.
     assert((!(Flags & MachineMemOperand::MOStore) ||
             NewMI->mayStore()) &&
@@ -424,9 +456,19 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
   NewMI = MBB.insert(MI, NewMI);
 
   // Copy the memoperands from the load to the folded instruction.
-  NewMI->setMemRefs(LoadMI->memoperands_begin(),
-                    LoadMI->memoperands_end());
-
+  if (MI->memoperands_empty()) {
+    NewMI->setMemRefs(LoadMI->memoperands_begin(),
+                      LoadMI->memoperands_end());
+  }
+  else {
+    // Handle the rare case of folding multiple loads.
+    NewMI->setMemRefs(MI->memoperands_begin(),
+                      MI->memoperands_end());
+    for (MachineInstr::mmo_iterator I = LoadMI->memoperands_begin(),
+           E = LoadMI->memoperands_end(); I != E; ++I) {
+      NewMI->addMemOperand(MF, *I);
+    }
+  }
   return NewMI;
 }
 
@@ -630,6 +672,10 @@ unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel *SchedModel,
   return 1;
 }
 
+unsigned TargetInstrInfo::getPredicationCost(const MachineInstr *) const {
+  return 0;
+}
+
 unsigned TargetInstrInfo::
 getInstrLatency(const InstrItineraryData *ItinData,
                 const MachineInstr *MI,
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 8d8f81b..30305af 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -191,6 +191,11 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
   Names[RTLIB::NEARBYINT_F80] = "nearbyintl";
   Names[RTLIB::NEARBYINT_F128] = "nearbyintl";
   Names[RTLIB::NEARBYINT_PPCF128] = "nearbyintl";
+  Names[RTLIB::ROUND_F32] = "roundf";
+  Names[RTLIB::ROUND_F64] = "round";
+  Names[RTLIB::ROUND_F80] = "roundl";
+  Names[RTLIB::ROUND_F128] = "roundl";
+  Names[RTLIB::ROUND_PPCF128] = "roundl";
   Names[RTLIB::FLOOR_F32] = "floorf";
   Names[RTLIB::FLOOR_F64] = "floor";
   Names[RTLIB::FLOOR_F80] = "floorl";
@@ -313,34 +318,62 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16] = "__sync_val_compare_and_swap_16";
   Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1";
   Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2";
   Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4";
   Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_16] = "__sync_lock_test_and_set_16";
   Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1";
   Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2";
   Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4";
   Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_16] = "__sync_fetch_and_add_16";
   Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1";
   Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2";
   Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4";
   Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_16] = "__sync_fetch_and_sub_16";
   Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1";
   Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2";
   Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4";
   Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8";
+  Names[RTLIB::SYNC_FETCH_AND_AND_16] = "__sync_fetch_and_and_16";
   Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1";
   Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2";
   Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4";
   Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8";
+  Names[RTLIB::SYNC_FETCH_AND_OR_16] = "__sync_fetch_and_or_16";
   Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1";
   Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2";
   Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4";
   Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_16] = "__sync_fetch_and_xor_16";
   Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1";
   Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2";
   Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4";
   Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_16] = "__sync_fetch_and_nand_16";
+  Names[RTLIB::SYNC_FETCH_AND_MAX_1] = "__sync_fetch_and_max_1";
+  Names[RTLIB::SYNC_FETCH_AND_MAX_2] = "__sync_fetch_and_max_2";
+  Names[RTLIB::SYNC_FETCH_AND_MAX_4] = "__sync_fetch_and_max_4";
+  Names[RTLIB::SYNC_FETCH_AND_MAX_8] = "__sync_fetch_and_max_8";
+  Names[RTLIB::SYNC_FETCH_AND_MAX_16] = "__sync_fetch_and_max_16";
+  Names[RTLIB::SYNC_FETCH_AND_UMAX_1] = "__sync_fetch_and_umax_1";
+  Names[RTLIB::SYNC_FETCH_AND_UMAX_2] = "__sync_fetch_and_umax_2";
+  Names[RTLIB::SYNC_FETCH_AND_UMAX_4] = "__sync_fetch_and_umax_4";
+  Names[RTLIB::SYNC_FETCH_AND_UMAX_8] = "__sync_fetch_and_umax_8";
+  Names[RTLIB::SYNC_FETCH_AND_UMAX_16] = "__sync_fetch_and_umax_16";
+  Names[RTLIB::SYNC_FETCH_AND_MIN_1] = "__sync_fetch_and_min_1";
+  Names[RTLIB::SYNC_FETCH_AND_MIN_2] = "__sync_fetch_and_min_2";
+  Names[RTLIB::SYNC_FETCH_AND_MIN_4] = "__sync_fetch_and_min_4";
+  Names[RTLIB::SYNC_FETCH_AND_MIN_8] = "__sync_fetch_and_min_8";
+  Names[RTLIB::SYNC_FETCH_AND_MIN_16] = "__sync_fetch_and_min_16";
+  Names[RTLIB::SYNC_FETCH_AND_UMIN_1] = "__sync_fetch_and_umin_1";
+  Names[RTLIB::SYNC_FETCH_AND_UMIN_2] = "__sync_fetch_and_umin_2";
+  Names[RTLIB::SYNC_FETCH_AND_UMIN_4] = "__sync_fetch_and_umin_4";
+  Names[RTLIB::SYNC_FETCH_AND_UMIN_8] = "__sync_fetch_and_umin_8";
+  Names[RTLIB::SYNC_FETCH_AND_UMIN_16] = "__sync_fetch_and_umin_16";
   
   if (Triple(TM.getTargetTriple()).getEnvironment() == Triple::GNU) {
     Names[RTLIB::SINCOS_F32] = "sincosf";
@@ -356,6 +389,13 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
     Names[RTLIB::SINCOS_F128] = 0;
     Names[RTLIB::SINCOS_PPCF128] = 0;
   }
+
+  if (Triple(TM.getTargetTriple()).getOS() != Triple::OpenBSD) {
+    Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail";
+  } else {
+    // These are generally not available.
+    Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = 0;
+  }
 }
 
 /// InitLibcallCallingConvs - Set default libcall CallingConvs.
@@ -624,7 +664,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm,
 
   // Perform these initializations only once.
   IsLittleEndian = TD->isLittleEndian();
-  PointerTy = MVT::getIntegerVT(8*TD->getPointerSize(0));
   MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
     = MaxStoresPerMemmoveOptSize = 4;
@@ -682,6 +721,14 @@ void TargetLoweringBase::initActions() {
     // These operations default to expand.
     setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand);
+
+    // These library functions default to expand.
+    setOperationAction(ISD::FROUND, (MVT::SimpleValueType)VT, Expand);
+
+    // These operations default to expand for vector types.
+    if (VT >= MVT::FIRST_VECTOR_VALUETYPE &&
+        VT <= MVT::LAST_VECTOR_VALUETYPE)
+      setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
   }
 
   // Most targets ignore the @llvm.prefetch intrinsic.
@@ -747,6 +794,19 @@ void TargetLoweringBase::initActions() {
   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand);
 }
 
+MVT TargetLoweringBase::getPointerTy(uint32_t AS) const {
+  return MVT::getIntegerVT(getPointerSizeInBits(AS));
+}
+
+unsigned TargetLoweringBase::getPointerSizeInBits(uint32_t AS) const {
+  return TD->getPointerSizeInBits(AS);
+}
+
+unsigned TargetLoweringBase::getPointerTypeSizeInBits(Type *Ty) const {
+  assert(Ty->isPointerTy());
+  return getPointerSizeInBits(Ty->getPointerAddressSpace());
+}
+
 MVT TargetLoweringBase::getScalarShiftAmountTy(EVT LHSTy) const {
   return MVT::getIntegerVT(8*TD->getPointerSize(0));
 }
@@ -1162,7 +1222,7 @@ void llvm::GetReturnInfo(Type* ReturnType, AttributeSet attr,
       Flags.setZExt();
 
     for (unsigned i = 0; i < NumParts; ++i)
-      Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true, 0, 0));
+      Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isFixed=*/true, 0, 0));
   }
 }
 
@@ -1228,6 +1288,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
   case PtrToInt:       return ISD::BITCAST;
   case IntToPtr:       return ISD::BITCAST;
   case BitCast:        return ISD::BITCAST;
+  case AddrSpaceCast:  return ISD::ADDRSPACECAST;
   case ICmp:           return ISD::SETCC;
   case FCmp:           return ISD::SETCC;
   case PHI:            return 0;
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 07cf871..59d7b57 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -52,10 +52,10 @@ TargetLoweringObjectFileELF::getCFIPersonalitySymbol(const GlobalValue *GV,
   default:
     report_fatal_error("We do not support this DWARF encoding yet!");
   case dwarf::DW_EH_PE_absptr:
-    return  Mang->getSymbol(GV);
+    return  getSymbol(*Mang, GV);
   case dwarf::DW_EH_PE_pcrel: {
     return getContext().GetOrCreateSymbol(StringRef("DW.ref.") +
-                                          Mang->getSymbol(GV)->getName());
+                                          getSymbol(*Mang, GV)->getName());
   }
   }
 }
@@ -104,7 +104,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
     MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str());
     MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
     if (StubSym.getPointer() == 0) {
-      MCSymbol *Sym = Mang->getSymbol(GV);
+      MCSymbol *Sym = getSymbol(*Mang, GV);
       StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
     }
 
@@ -252,7 +252,7 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
     Prefix = getSectionPrefixForGlobal(Kind);
 
     SmallString<128> Name(Prefix, Prefix+strlen(Prefix));
-    MCSymbol *Sym = Mang->getSymbol(GV);
+    MCSymbol *Sym = getSymbol(*Mang, GV);
     Name.append(Sym->getName().begin(), Sym->getName().end());
     StringRef Group = "";
     unsigned Flags = getELFSectionFlags(Kind);
@@ -523,6 +523,11 @@ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
 const MCSection *TargetLoweringObjectFileMachO::
 SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
                        Mangler *Mang, const TargetMachine &TM) const {
+
+  // Handle thread local data.
+  if (Kind.isThreadBSS()) return TLSBSSSection;
+  if (Kind.isThreadData()) return TLSDataSection;
+
   if (Kind.isText())
     return GV->isWeakForLinker() ? TextCoalSection : TextSection;
 
@@ -575,10 +580,6 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
   if (Kind.isBSSLocal())
     return DataBSSSection;
 
-  // Handle thread local data.
-  if (Kind.isThreadBSS()) return TLSBSSSection;
-  if (Kind.isThreadData()) return TLSDataSection;
-
   // Otherwise, just drop the variable in the normal data section.
   return DataSection;
 }
@@ -613,7 +614,7 @@ shouldEmitUsedDirectiveFor(const GlobalValue *GV, Mangler *Mang) const {
     // FIXME: ObjC metadata is currently emitted as internal symbols that have
     // \1L and \0l prefixes on them.  Fix them to be Private/LinkerPrivate and
     // this horrible hack can go away.
-    MCSymbol *Sym = Mang->getSymbol(GV);
+    MCSymbol *Sym = getSymbol(*Mang, GV);
     if (Sym->getName()[0] == 'L' || Sym->getName()[0] == 'l')
       return false;
   }
@@ -642,7 +643,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
       GV->hasHiddenVisibility() ? MachOMMI.getHiddenGVStubEntry(SSym) :
                                   MachOMMI.getGVStubEntry(SSym);
     if (StubSym.getPointer() == 0) {
-      MCSymbol *Sym = Mang->getSymbol(GV);
+      MCSymbol *Sym = getSymbol(*Mang, GV);
       StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
     }
 
@@ -671,7 +672,7 @@ getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
   MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str());
   MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
   if (StubSym.getPointer() == 0) {
-    MCSymbol *Sym = Mang->getSymbol(GV);
+    MCSymbol *Sym = getSymbol(*Mang, GV);
     StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
   }
 
@@ -732,6 +733,7 @@ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
   return getContext().getCOFFSection(Name,
                                      Characteristics,
                                      Kind,
+                                     "",
                                      Selection);
 }
 
@@ -767,16 +769,22 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
     Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
 
     return getContext().getCOFFSection(Name.str(), Characteristics,
-                                       Kind, COFF::IMAGE_COMDAT_SELECT_ANY);
+                                       Kind, "", COFF::IMAGE_COMDAT_SELECT_ANY);
   }
 
   if (Kind.isText())
-    return getTextSection();
+    return TextSection;
 
   if (Kind.isThreadLocal())
-    return getTLSDataSection();
+    return TLSDataSection;
 
-  return getDataSection();
+  if (Kind.isReadOnly())
+    return ReadOnlySection;
+
+  if (Kind.isBSS())
+    return BSSSection;
+
+  return DataSection;
 }
 
 void TargetLoweringObjectFileCOFF::
diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp
index 7a39a4c..f7bf86b 100644
--- a/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/lib/CodeGen/TargetOptionsImpl.cpp
@@ -22,10 +22,8 @@ using namespace llvm;
 bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
   // Check to see if we should eliminate non-leaf frame pointers and then
   // check to see if we should eliminate all frame pointers.
-  bool NoFramePointerElimNonLeaf =
-    MF.getFunction()->getFnAttribute("no-frame-pointer-elim-non-leaf")
-      .getValueAsString() == "true";
-  if (NoFramePointerElimNonLeaf && !NoFramePointerElim) {
+  if (MF.getFunction()->hasFnAttribute("no-frame-pointer-elim-non-leaf") &&
+      !NoFramePointerElim) {
     const MachineFrameInfo *MFI = MF.getFrameInfo();
     return MFI->hasCalls();
   }
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index ffcee1f..5a15243 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -73,6 +73,14 @@ void PrintRegUnit::print(raw_ostream &OS) const {
     OS << '~' << TRI->getName(*Roots);
 }
 
+void PrintVRegOrUnit::print(raw_ostream &OS) const {
+  if (TRI && TRI->isVirtualRegister(Unit)) {
+    OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit);
+    return;
+  }
+  PrintRegUnit::print(OS);
+}
+
 /// getAllocatableClass - Return the maximal subclass of the given register
 /// class that is alloctable, or NULL.
 const TargetRegisterClass *
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index 64ee9d1..b0f2ca6 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -210,7 +210,8 @@ unsigned TargetSchedModel::computeOperandLatency(
   // unit latency (defaultDefLatency may be too conservative).
 #ifndef NDEBUG
   if (SCDesc->isValid() && !DefMI->getOperand(DefOperIdx).isImplicit()
-      && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()) {
+      && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()
+      && SchedModel.isComplete()) {
     std::string Err;
     raw_string_ostream ss(Err);
     ss << "DefIdx " << DefIdx << " exceeds machine model writes for "
@@ -224,10 +225,13 @@ unsigned TargetSchedModel::computeOperandLatency(
   return DefMI->isTransient() ? 0 : TII->defaultDefLatency(&SchedModel, DefMI);
 }
 
-unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const {
+unsigned
+TargetSchedModel::computeInstrLatency(const MachineInstr *MI,
+                                      bool UseDefaultDefLatency) const {
   // For the itinerary model, fall back to the old subtarget hook.
   // Allow subtargets to compute Bundle latencies outside the machine model.
-  if (hasInstrItineraries() || MI->isBundle())
+  if (hasInstrItineraries() || MI->isBundle() ||
+      (!hasInstrSchedModel() && !UseDefaultDefLatency))
     return TII->getInstrLatency(&InstrItins, MI);
 
   if (hasInstrSchedModel()) {
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index c52e675..b9a6b47 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1400,7 +1400,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
         VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
         SlotIndex endIdx =
           LIS->getInstructionIndex(MI).getRegSlot(IsEarlyClobber);
-        LI.addRange(LiveRange(LastCopyIdx, endIdx, VNI));
+        LI.addSegment(LiveInterval::Segment(LastCopyIdx, endIdx, VNI));
       }
     }
 
@@ -1457,7 +1457,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
 
       SlotIndex UseIdx = MIIdx.getRegSlot(IsEarlyClobber);
       if (I->end == UseIdx)
-        LI.removeRange(LastCopyIdx, UseIdx);
+        LI.removeSegment(LastCopyIdx, UseIdx);
     }
 
   } else if (RemovedKillFlag) {
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index a95ebcd..f735ef2 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -24,7 +24,6 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -50,7 +49,6 @@ namespace {
 
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addPreserved<DominatorTree>();
-      AU.addPreserved<ProfileInfo>();
     }
   };
 }
@@ -87,9 +85,7 @@ bool UnreachableBlockElim::runOnFunction(Function &F) {
     }
 
   // Actually remove the blocks now.
-  ProfileInfo *PI = getAnalysisIfAvailable<ProfileInfo>();
   for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) {
-    if (PI) PI->removeBlock(DeadBlocks[i]);
     DeadBlocks[i]->eraseFromParent();
   }
 
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index cd012d2..e0aa405 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -28,6 +28,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
@@ -264,15 +265,36 @@ void VirtRegRewriter::rewrite() {
   SmallVector<unsigned, 8> SuperDeads;
   SmallVector<unsigned, 8> SuperDefs;
   SmallVector<unsigned, 8> SuperKills;
+  SmallPtrSet<const MachineInstr *, 4> NoReturnInsts;
 
   for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
        MBBI != MBBE; ++MBBI) {
     DEBUG(MBBI->print(dbgs(), Indexes));
+    bool IsExitBB = MBBI->succ_empty();
     for (MachineBasicBlock::instr_iterator
            MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) {
       MachineInstr *MI = MII;
       ++MII;
 
+      // Check if this instruction is a call to a noreturn function.
+      // If so, all the definitions set by this instruction can be ignored.
+      if (IsExitBB && MI->isCall())
+        for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+               MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+          MachineOperand &MO = *MOI;
+          if (!MO.isGlobal())
+            continue;
+          const Function *Func = dyn_cast<Function>(MO.getGlobal());
+          if (!Func || !Func->hasFnAttribute(Attribute::NoReturn) ||
+              // We need to keep correct unwind information
+              // even if the function will not return, since the
+              // runtime may need it.
+              !Func->hasFnAttribute(Attribute::NoUnwind))
+            continue;
+          NoReturnInsts.insert(MI);
+          break;
+        }
+
       for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
            MOE = MI->operands_end(); MOI != MOE; ++MOI) {
         MachineOperand &MO = *MOI;
@@ -353,7 +375,25 @@ void VirtRegRewriter::rewrite() {
   }
 
   // Tell MRI about physical registers in use.
-  for (unsigned Reg = 1, RegE = TRI->getNumRegs(); Reg != RegE; ++Reg)
-    if (!MRI->reg_nodbg_empty(Reg))
-      MRI->setPhysRegUsed(Reg);
+  if (NoReturnInsts.empty()) {
+    for (unsigned Reg = 1, RegE = TRI->getNumRegs(); Reg != RegE; ++Reg)
+      if (!MRI->reg_nodbg_empty(Reg))
+        MRI->setPhysRegUsed(Reg);
+  } else {
+    for (unsigned Reg = 1, RegE = TRI->getNumRegs(); Reg != RegE; ++Reg) {
+      if (MRI->reg_nodbg_empty(Reg))
+        continue;
+      // Check if this register has a use that will impact the rest of the
+      // code. Uses in debug and noreturn instructions do not impact the
+      // generated code.
+      for (MachineRegisterInfo::reg_nodbg_iterator It =
+             MRI->reg_nodbg_begin(Reg),
+             EndIt = MRI->reg_nodbg_end(); It != EndIt; ++It) {
+        if (!NoReturnInsts.count(&(*It))) {
+          MRI->setPhysRegUsed(Reg);
+          break;
+        }
+      }
+    }
+  }
 }
diff --git a/lib/DebugInfo/CMakeLists.txt b/lib/DebugInfo/CMakeLists.txt
index 4a6221d..61a3fb0 100644
--- a/lib/DebugInfo/CMakeLists.txt
+++ b/lib/DebugInfo/CMakeLists.txt
@@ -12,4 +12,6 @@ add_llvm_library(LLVMDebugInfo
   DWARFDebugLoc.cpp
   DWARFDebugRangeList.cpp
   DWARFFormValue.cpp
+  DWARFTypeUnit.cpp
+  DWARFUnit.cpp
   )
diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
index 2de62ab..f46fd58 100644
--- a/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
+++ b/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
@@ -14,37 +14,51 @@
 using namespace llvm;
 using namespace dwarf;
 
-bool
-DWARFAbbreviationDeclaration::extract(DataExtractor data, uint32_t* offset_ptr){
-  return extract(data, offset_ptr, data.getULEB128(offset_ptr));
+void DWARFAbbreviationDeclaration::clear() {
+  Code = 0;
+  Tag = 0;
+  HasChildren = false;
+  Attributes.clear();
 }
 
-bool
-DWARFAbbreviationDeclaration::extract(DataExtractor data, uint32_t* offset_ptr,
-                                      uint32_t code) {
-  Code = code;
-  Attribute.clear();
-  if (Code) {
-    Tag = data.getULEB128(offset_ptr);
-    HasChildren = data.getU8(offset_ptr);
+DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() {
+  clear();
+}
 
-    while (data.isValidOffset(*offset_ptr)) {
-      uint16_t attr = data.getULEB128(offset_ptr);
-      uint16_t form = data.getULEB128(offset_ptr);
+bool
+DWARFAbbreviationDeclaration::extract(DataExtractor Data, uint32_t* OffsetPtr) {
+  clear();
+  Code = Data.getULEB128(OffsetPtr);
+  if (Code == 0) {
+    return false;
+  }
+  Tag = Data.getULEB128(OffsetPtr);
+  uint8_t ChildrenByte = Data.getU8(OffsetPtr);
+  HasChildren = (ChildrenByte == DW_CHILDREN_yes);
 
-      if (attr && form)
-        Attribute.push_back(DWARFAttribute(attr, form));
-      else
-        break;
+  while (true) {
+    uint32_t CurOffset = *OffsetPtr;
+    uint16_t Attr = Data.getULEB128(OffsetPtr);
+    if (CurOffset == *OffsetPtr) {
+      clear();
+      return false;
     }
-
-    return Tag != 0;
-  } else {
-    Tag = 0;
-    HasChildren = false;
+    CurOffset = *OffsetPtr;
+    uint16_t Form = Data.getULEB128(OffsetPtr);
+    if (CurOffset == *OffsetPtr) {
+      clear();
+      return false;
+    }
+    if (Attr == 0 && Form == 0)
+      break;
+    Attributes.push_back(AttributeSpec(Attr, Form));
   }
 
-  return false;
+  if (Tag == 0) {
+    clear();
+    return false;
+  }
+  return true;
 }
 
 void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
@@ -55,19 +69,19 @@ void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
   else
     OS << format("DW_TAG_Unknown_%x", getTag());
   OS << "\tDW_CHILDREN_" << (hasChildren() ? "yes" : "no") << '\n';
-  for (unsigned i = 0, e = Attribute.size(); i != e; ++i) {
+  for (unsigned i = 0, e = Attributes.size(); i != e; ++i) {
     OS << '\t';
-    const char *attrString = AttributeString(Attribute[i].getAttribute());
+    const char *attrString = AttributeString(Attributes[i].Attr);
     if (attrString)
       OS << attrString;
     else
-      OS << format("DW_AT_Unknown_%x", Attribute[i].getAttribute());
+      OS << format("DW_AT_Unknown_%x", Attributes[i].Attr);
     OS << '\t';
-    const char *formString = FormEncodingString(Attribute[i].getForm());
+    const char *formString = FormEncodingString(Attributes[i].Form);
     if (formString)
       OS << formString;
     else
-      OS << format("DW_FORM_Unknown_%x", Attribute[i].getForm());
+      OS << format("DW_FORM_Unknown_%x", Attributes[i].Form);
     OS << '\n';
   }
   OS << '\n';
@@ -75,8 +89,8 @@ void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
 
 uint32_t
 DWARFAbbreviationDeclaration::findAttributeIndex(uint16_t attr) const {
-  for (uint32_t i = 0, e = Attribute.size(); i != e; ++i) {
-    if (Attribute[i].getAttribute() == attr)
+  for (uint32_t i = 0, e = Attributes.size(); i != e; ++i) {
+    if (Attributes[i].Attr == attr)
       return i;
   }
   return -1U;
diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.h b/lib/DebugInfo/DWARFAbbreviationDeclaration.h
index 9a3fcd8..e9b072e 100644
--- a/lib/DebugInfo/DWARFAbbreviationDeclaration.h
+++ b/lib/DebugInfo/DWARFAbbreviationDeclaration.h
@@ -10,7 +10,6 @@
 #ifndef LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
 #define LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
 
-#include "DWARFAttribute.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/DataExtractor.h"
 
@@ -22,31 +21,33 @@ class DWARFAbbreviationDeclaration {
   uint32_t Code;
   uint32_t Tag;
   bool HasChildren;
-  SmallVector<DWARFAttribute, 8> Attribute;
+
+  struct AttributeSpec {
+    AttributeSpec(uint16_t Attr, uint16_t Form) : Attr(Attr), Form(Form) {}
+    uint16_t Attr;
+    uint16_t Form;
+  };
+  SmallVector<AttributeSpec, 8> Attributes;
 public:
-  enum { InvalidCode = 0 };
-  DWARFAbbreviationDeclaration()
-    : Code(InvalidCode), Tag(0), HasChildren(0) {}
+  DWARFAbbreviationDeclaration();
 
   uint32_t getCode() const { return Code; }
   uint32_t getTag() const { return Tag; }
   bool hasChildren() const { return HasChildren; }
-  uint32_t getNumAttributes() const { return Attribute.size(); }
+  uint32_t getNumAttributes() const { return Attributes.size(); }
   uint16_t getAttrByIndex(uint32_t idx) const {
-    return Attribute.size() > idx ? Attribute[idx].getAttribute() : 0;
+    return idx < Attributes.size() ? Attributes[idx].Attr : 0;
   }
   uint16_t getFormByIndex(uint32_t idx) const {
-    return Attribute.size() > idx ? Attribute[idx].getForm() : 0;
+    return idx < Attributes.size() ? Attributes[idx].Form : 0;
   }
 
   uint32_t findAttributeIndex(uint16_t attr) const;
-  bool extract(DataExtractor data, uint32_t* offset_ptr);
-  bool extract(DataExtractor data, uint32_t* offset_ptr, uint32_t code);
-  bool isValid() const { return Code != 0 && Tag != 0; }
+  bool extract(DataExtractor Data, uint32_t* OffsetPtr);
   void dump(raw_ostream &OS) const;
-  const SmallVectorImpl<DWARFAttribute> &getAttributes() const {
-    return Attribute;
-  }
+
+private:
+  void clear();
 };
 
 }
diff --git a/lib/DebugInfo/DWARFAttribute.h b/lib/DebugInfo/DWARFAttribute.h
deleted file mode 100644
index 6f49b63..0000000
--- a/lib/DebugInfo/DWARFAttribute.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===-- DWARFAttribute.h ----------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_DWARFATTRIBUTE_H
-#define LLVM_DEBUGINFO_DWARFATTRIBUTE_H
-
-#include "llvm/Support/DataTypes.h"
-
-namespace llvm {
-
-class DWARFAttribute {
-  uint16_t Attribute;
-  uint16_t Form;
-  public:
-  DWARFAttribute(uint16_t attr, uint16_t form)
-    : Attribute(attr), Form(form) {}
-
-  uint16_t getAttribute() const { return Attribute; }
-  uint16_t getForm() const { return Form; }
-};
-
-}
-
-#endif
diff --git a/lib/DebugInfo/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARFCompileUnit.cpp
index 93b1622..33869d8 100644
--- a/lib/DebugInfo/DWARFCompileUnit.cpp
+++ b/lib/DebugInfo/DWARFCompileUnit.cpp
@@ -8,96 +8,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "DWARFCompileUnit.h"
-#include "DWARFContext.h"
-#include "llvm/DebugInfo/DWARFFormValue.h"
-#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-using namespace dwarf;
-
-DataExtractor DWARFCompileUnit::getDebugInfoExtractor() const {
-  return DataExtractor(InfoSection, isLittleEndian, AddrSize);
-}
-
-bool DWARFCompileUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
-  clear();
-
-  Offset = *offset_ptr;
-
-  if (debug_info.isValidOffset(*offset_ptr)) {
-    uint64_t abbrOffset;
-    Length = debug_info.getU32(offset_ptr);
-    Version = debug_info.getU16(offset_ptr);
-    abbrOffset = debug_info.getU32(offset_ptr);
-    AddrSize = debug_info.getU8(offset_ptr);
-
-    bool lengthOK = debug_info.isValidOffset(getNextCompileUnitOffset()-1);
-    bool versionOK = DWARFContext::isSupportedVersion(Version);
-    bool abbrOffsetOK = AbbrevSection.size() > abbrOffset;
-    bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
-
-    if (lengthOK && versionOK && addrSizeOK && abbrOffsetOK && Abbrev != NULL) {
-      Abbrevs = Abbrev->getAbbreviationDeclarationSet(abbrOffset);
-      return true;
-    }
-
-    // reset the offset to where we tried to parse from if anything went wrong
-    *offset_ptr = Offset;
-  }
-
-  return false;
-}
 
-uint32_t
-DWARFCompileUnit::extract(uint32_t offset, DataExtractor debug_info_data,
-                          const DWARFAbbreviationDeclarationSet *abbrevs) {
-  clear();
-
-  Offset = offset;
-
-  if (debug_info_data.isValidOffset(offset)) {
-    Length = debug_info_data.getU32(&offset);
-    Version = debug_info_data.getU16(&offset);
-    bool abbrevsOK = debug_info_data.getU32(&offset) == abbrevs->getOffset();
-    Abbrevs = abbrevs;
-    AddrSize = debug_info_data.getU8(&offset);
-
-    bool versionOK = DWARFContext::isSupportedVersion(Version);
-    bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
-
-    if (versionOK && addrSizeOK && abbrevsOK &&
-        debug_info_data.isValidOffset(offset))
-      return offset;
-  }
-  return 0;
-}
-
-bool DWARFCompileUnit::extractRangeList(uint32_t RangeListOffset,
-                                        DWARFDebugRangeList &RangeList) const {
-  // Require that compile unit is extracted.
-  assert(DieArray.size() > 0);
-  DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize);
-  return RangeList.extract(RangesData, &RangeListOffset);
-}
-
-void DWARFCompileUnit::clear() {
-  Offset = 0;
-  Length = 0;
-  Version = 0;
-  Abbrevs = 0;
-  AddrSize = 0;
-  BaseAddr = 0;
-  clearDIEs(false);
-}
+using namespace llvm;
 
 void DWARFCompileUnit::dump(raw_ostream &OS) {
-  OS << format("0x%08x", Offset) << ": Compile Unit:"
-     << " length = " << format("0x%08x", Length)
-     << " version = " << format("0x%04x", Version)
-     << " abbr_offset = " << format("0x%04x", Abbrevs->getOffset())
-     << " addr_size = " << format("0x%02x", AddrSize)
-     << " (next CU at " << format("0x%08x", getNextCompileUnitOffset())
+  OS << format("0x%08x", getOffset()) << ": Compile Unit:"
+     << " length = " << format("0x%08x", getLength())
+     << " version = " << format("0x%04x", getVersion())
+     << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+     << " addr_size = " << format("0x%02x", getAddressByteSize())
+     << " (next unit at " << format("0x%08x", getNextUnitOffset())
      << ")\n";
 
   const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
@@ -105,174 +27,6 @@ void DWARFCompileUnit::dump(raw_ostream &OS) {
   CU->dump(OS, this, -1U);
 }
 
-const char *DWARFCompileUnit::getCompilationDir() {
-  extractDIEsIfNeeded(true);
-  if (DieArray.empty())
-    return 0;
-  return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
-}
-
-void DWARFCompileUnit::setDIERelations() {
-  if (DieArray.empty())
-    return;
-  DWARFDebugInfoEntryMinimal *die_array_begin = &DieArray.front();
-  DWARFDebugInfoEntryMinimal *die_array_end = &DieArray.back();
-  DWARFDebugInfoEntryMinimal *curr_die;
-  // We purposely are skipping the last element in the array in the loop below
-  // so that we can always have a valid next item
-  for (curr_die = die_array_begin; curr_die < die_array_end; ++curr_die) {
-    // Since our loop doesn't include the last element, we can always
-    // safely access the next die in the array.
-    DWARFDebugInfoEntryMinimal *next_die = curr_die + 1;
-
-    const DWARFAbbreviationDeclaration *curr_die_abbrev =
-      curr_die->getAbbreviationDeclarationPtr();
-
-    if (curr_die_abbrev) {
-      // Normal DIE
-      if (curr_die_abbrev->hasChildren())
-        next_die->setParent(curr_die);
-      else
-        curr_die->setSibling(next_die);
-    } else {
-      // NULL DIE that terminates a sibling chain
-      DWARFDebugInfoEntryMinimal *parent = curr_die->getParent();
-      if (parent)
-        parent->setSibling(next_die);
-    }
-  }
-
-  // Since we skipped the last element, we need to fix it up!
-  if (die_array_begin < die_array_end)
-    curr_die->setParent(die_array_begin);
-}
-
-void DWARFCompileUnit::extractDIEsToVector(
-    bool AppendCUDie, bool AppendNonCUDies,
-    std::vector<DWARFDebugInfoEntryMinimal> &Dies) const {
-  if (!AppendCUDie && !AppendNonCUDies)
-    return;
-
-  // Set the offset to that of the first DIE and calculate the start of the
-  // next compilation unit header.
-  uint32_t Offset = getFirstDIEOffset();
-  uint32_t NextCUOffset = getNextCompileUnitOffset();
-  DWARFDebugInfoEntryMinimal DIE;
-  uint32_t Depth = 0;
-  const uint8_t *FixedFormSizes =
-    DWARFFormValue::getFixedFormSizes(getAddressByteSize(), getVersion());
-  bool IsCUDie = true;
-
-  while (Offset < NextCUOffset &&
-         DIE.extractFast(this, FixedFormSizes, &Offset)) {
-    if (IsCUDie) {
-      if (AppendCUDie)
-        Dies.push_back(DIE);
-      if (!AppendNonCUDies)
-        break;
-      // The average bytes per DIE entry has been seen to be
-      // around 14-20 so let's pre-reserve the needed memory for
-      // our DIE entries accordingly.
-      Dies.reserve(Dies.size() + getDebugInfoSize() / 14);
-      IsCUDie = false;
-    } else {
-      Dies.push_back(DIE);
-    }
-
-    const DWARFAbbreviationDeclaration *AbbrDecl =
-      DIE.getAbbreviationDeclarationPtr();
-    if (AbbrDecl) {
-      // Normal DIE
-      if (AbbrDecl->hasChildren())
-        ++Depth;
-    } else {
-      // NULL DIE.
-      if (Depth > 0)
-        --Depth;
-      if (Depth == 0)
-        break;  // We are done with this compile unit!
-    }
-  }
-
-  // Give a little bit of info if we encounter corrupt DWARF (our offset
-  // should always terminate at or before the start of the next compilation
-  // unit header).
-  if (Offset > NextCUOffset)
-    fprintf(stderr, "warning: DWARF compile unit extends beyond its "
-                    "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), Offset);
-}
-
-size_t DWARFCompileUnit::extractDIEsIfNeeded(bool CUDieOnly) {
-  if ((CUDieOnly && DieArray.size() > 0) ||
-      DieArray.size() > 1)
-    return 0; // Already parsed.
-
-  extractDIEsToVector(DieArray.empty(), !CUDieOnly, DieArray);
-
-  // Set the base address of current compile unit.
-  if (!DieArray.empty()) {
-    uint64_t BaseAddr =
-      DieArray[0].getAttributeValueAsUnsigned(this, DW_AT_low_pc, -1U);
-    if (BaseAddr == -1U)
-      BaseAddr = DieArray[0].getAttributeValueAsUnsigned(this, DW_AT_entry_pc, 0);
-    setBaseAddress(BaseAddr);
-  }
-
-  setDIERelations();
-  return DieArray.size();
-}
-
-void DWARFCompileUnit::clearDIEs(bool KeepCUDie) {
-  if (DieArray.size() > (unsigned)KeepCUDie) {
-    // std::vectors never get any smaller when resized to a smaller size,
-    // or when clear() or erase() are called, the size will report that it
-    // is smaller, but the memory allocated remains intact (call capacity()
-    // to see this). So we need to create a temporary vector and swap the
-    // contents which will cause just the internal pointers to be swapped
-    // so that when temporary vector goes out of scope, it will destroy the
-    // contents.
-    std::vector<DWARFDebugInfoEntryMinimal> TmpArray;
-    DieArray.swap(TmpArray);
-    // Save at least the compile unit DIE
-    if (KeepCUDie)
-      DieArray.push_back(TmpArray.front());
-  }
-}
-
-void
-DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
-                                         bool clear_dies_if_already_not_parsed){
-  // This function is usually called if there in no .debug_aranges section
-  // in order to produce a compile unit level set of address ranges that
-  // is accurate. If the DIEs weren't parsed, then we don't want all dies for
-  // all compile units to stay loaded when they weren't needed. So we can end
-  // up parsing the DWARF and then throwing them all away to keep memory usage
-  // down.
-  const bool clear_dies = extractDIEsIfNeeded(false) > 1 &&
-                          clear_dies_if_already_not_parsed;
-  DieArray[0].buildAddressRangeTable(this, debug_aranges);
-
-  // Keep memory down by clearing DIEs if this generate function
-  // caused them to be parsed.
-  if (clear_dies)
-    clearDIEs(true);
-}
-
-DWARFDebugInfoEntryInlinedChain
-DWARFCompileUnit::getInlinedChainForAddress(uint64_t Address) {
-  // First, find a subprogram that contains the given address (the root
-  // of inlined chain).
-  extractDIEsIfNeeded(false);
-  const DWARFDebugInfoEntryMinimal *SubprogramDIE = 0;
-  for (size_t i = 0, n = DieArray.size(); i != n; i++) {
-    if (DieArray[i].isSubprogramDIE() &&
-        DieArray[i].addressRangeContainsAddress(this, Address)) {
-      SubprogramDIE = &DieArray[i];
-      break;
-    }
-  }
-  // Get inlined chain rooted at this subprogram DIE.
-  if (!SubprogramDIE)
-    return DWARFDebugInfoEntryInlinedChain();
-  return SubprogramDIE->getInlinedChainForAddress(this, Address);
+// VTable anchor.
+DWARFCompileUnit::~DWARFCompileUnit() {
 }
diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h
index dc2214b..1c9573b 100644
--- a/lib/DebugInfo/DWARFCompileUnit.h
+++ b/lib/DebugInfo/DWARFCompileUnit.h
@@ -10,118 +10,19 @@
 #ifndef LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
 #define LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
 
-#include "DWARFDebugAbbrev.h"
-#include "DWARFDebugInfoEntry.h"
-#include "DWARFDebugRangeList.h"
-#include "DWARFRelocMap.h"
-#include <vector>
+#include "DWARFUnit.h"
 
 namespace llvm {
 
-class DWARFDebugAbbrev;
-class StringRef;
-class raw_ostream;
-
-class DWARFCompileUnit {
-  const DWARFDebugAbbrev *Abbrev;
-  StringRef InfoSection;
-  StringRef AbbrevSection;
-  StringRef RangeSection;
-  StringRef StringSection;
-  StringRef StringOffsetSection;
-  StringRef AddrOffsetSection;
-  const RelocAddrMap *RelocMap;
-  bool isLittleEndian;
-
-  uint32_t Offset;
-  uint32_t Length;
-  uint16_t Version;
-  const DWARFAbbreviationDeclarationSet *Abbrevs;
-  uint8_t AddrSize;
-  uint64_t BaseAddr;
-  // The compile unit debug information entry items.
-  std::vector<DWARFDebugInfoEntryMinimal> DieArray;
+class DWARFCompileUnit : public DWARFUnit {
 public:
-
   DWARFCompileUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
                    StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
-                   const RelocAddrMap *M, bool LE) :
-    Abbrev(DA), InfoSection(IS), AbbrevSection(AS),
-    RangeSection(RS), StringSection(SS), StringOffsetSection(SOS),
-    AddrOffsetSection(AOS), RelocMap(M), isLittleEndian(LE) {
-    clear();
-  }
-
-  StringRef getStringSection() const { return StringSection; }
-  StringRef getStringOffsetSection() const { return StringOffsetSection; }
-  StringRef getAddrOffsetSection() const { return AddrOffsetSection; }
-  const RelocAddrMap *getRelocMap() const { return RelocMap; }
-  DataExtractor getDebugInfoExtractor() const;
-
-  bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
-  uint32_t extract(uint32_t offset, DataExtractor debug_info_data,
-                   const DWARFAbbreviationDeclarationSet *abbrevs);
-
-  /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
-  /// hasn't already been done. Returns the number of DIEs parsed at this call.
-  size_t extractDIEsIfNeeded(bool CUDieOnly);
-  /// extractRangeList - extracts the range list referenced by this compile
-  /// unit from .debug_ranges section. Returns true on success.
-  /// Requires that compile unit is already extracted.
-  bool extractRangeList(uint32_t RangeListOffset,
-                        DWARFDebugRangeList &RangeList) const;
-  void clear();
+                   const RelocAddrMap *M, bool LE)
+      : DWARFUnit(DA, IS, AS, RS, SS, SOS, AOS, M, LE) {}
   void dump(raw_ostream &OS);
-  uint32_t getOffset() const { return Offset; }
-  /// Size in bytes of the compile unit header.
-  uint32_t getSize() const { return 11; }
-  bool containsDIEOffset(uint32_t die_offset) const {
-    return die_offset >= getFirstDIEOffset() &&
-      die_offset < getNextCompileUnitOffset();
-  }
-  uint32_t getFirstDIEOffset() const { return Offset + getSize(); }
-  uint32_t getNextCompileUnitOffset() const { return Offset + Length + 4; }
-  /// Size in bytes of the .debug_info data associated with this compile unit.
-  size_t getDebugInfoSize() const { return Length + 4 - getSize(); }
-  uint32_t getLength() const { return Length; }
-  uint16_t getVersion() const { return Version; }
-  const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
-    return Abbrevs;
-  }
-  uint8_t getAddressByteSize() const { return AddrSize; }
-  uint64_t getBaseAddress() const { return BaseAddr; }
-
-  void setBaseAddress(uint64_t base_addr) {
-    BaseAddr = base_addr;
-  }
-
-  const DWARFDebugInfoEntryMinimal *
-  getCompileUnitDIE(bool extract_cu_die_only = true) {
-    extractDIEsIfNeeded(extract_cu_die_only);
-    return DieArray.empty() ? NULL : &DieArray[0];
-  }
-
-  const char *getCompilationDir();
-
-  /// setDIERelations - We read in all of the DIE entries into our flat list
-  /// of DIE entries and now we need to go back through all of them and set the
-  /// parent, sibling and child pointers for quick DIE navigation.
-  void setDIERelations();
-
-  void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
-                              bool clear_dies_if_already_not_parsed);
-
-  /// getInlinedChainForAddress - fetches inlined chain for a given address.
-  /// Returns empty chain if there is no subprogram containing address. The
-  /// chain is valid as long as parsed compile unit DIEs are not cleared.
-  DWARFDebugInfoEntryInlinedChain getInlinedChainForAddress(uint64_t Address);
-
-private:
-  /// extractDIEsToVector - Appends all parsed DIEs to a vector.
-  void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs,
-                           std::vector<DWARFDebugInfoEntryMinimal> &DIEs) const;
-  /// clearDIEs - Clear parsed DIEs to keep memory usage low.
-  void clearDIEs(bool KeepCUDie);
+  // VTable anchor.
+  ~DWARFCompileUnit() LLVM_OVERRIDE;
 };
 
 }
diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
index 51ad645..e477190 100644
--- a/lib/DebugInfo/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARFContext.cpp
@@ -23,6 +23,41 @@ using namespace object;
 
 typedef DWARFDebugLine::LineTable DWARFLineTable;
 
+DWARFContext::~DWARFContext() {
+  DeleteContainerPointers(CUs);
+  DeleteContainerPointers(TUs);
+  DeleteContainerPointers(DWOCUs);
+}
+
+static void dumpPubSection(raw_ostream &OS, StringRef Name, StringRef Data,
+                           bool LittleEndian, bool GnuStyle) {
+  OS << "\n." << Name << " contents:\n";
+  DataExtractor pubNames(Data, LittleEndian, 0);
+  uint32_t offset = 0;
+  OS << "length = " << format("0x%08x", pubNames.getU32(&offset));
+  OS << " version = " << format("0x%04x", pubNames.getU16(&offset));
+  OS << " unit_offset = " << format("0x%08x", pubNames.getU32(&offset));
+  OS << " unit_size = " << format("0x%08x", pubNames.getU32(&offset)) << '\n';
+  if (GnuStyle)
+    OS << "Offset     Linkage  Kind     Name\n";
+  else
+    OS << "Offset     Name\n";
+
+  while (offset < Data.size()) {
+    uint32_t dieRef = pubNames.getU32(&offset);
+    if (dieRef == 0)
+      break;
+    OS << format("0x%8.8x ", dieRef);
+    if (GnuStyle) {
+      PubIndexEntryDescriptor desc(pubNames.getU8(&offset));
+      OS << format("%-8s", dwarf::GDBIndexEntryLinkageString(desc.Linkage))
+         << ' ' << format("%-8s", dwarf::GDBIndexEntryKindString(desc.Kind))
+         << ' ';
+    }
+    OS << '\"' << pubNames.getCStr(&offset) << "\"\n";
+  }
+}
+
 void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
   if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) {
     OS << ".debug_abbrev contents:\n";
@@ -35,8 +70,14 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
       getCompileUnitAtIndex(i)->dump(OS);
   }
 
+  if (DumpType == DIDT_All || DumpType == DIDT_Types) {
+    OS << "\n.debug_types contents:\n";
+    for (unsigned i = 0, e = getNumTypeUnits(); i != e; ++i)
+      getTypeUnitAtIndex(i)->dump(OS);
+  }
+
   if (DumpType == DIDT_All || DumpType == DIDT_Loc) {
-    OS << ".debug_loc contents:\n";
+    OS << "\n.debug_loc contents:\n";
     getDebugLoc()->dump(OS);
   }
 
@@ -61,13 +102,13 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
       DWARFCompileUnit *cu = getCompileUnitAtIndex(i);
       savedAddressByteSize = cu->getAddressByteSize();
       unsigned stmtOffset =
-        cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
-                                                             -1U);
+          cu->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
+              cu, DW_AT_stmt_list, -1U);
       if (stmtOffset != -1U) {
-        DataExtractor lineData(getLineSection(), isLittleEndian(),
+        DataExtractor lineData(getLineSection().Data, isLittleEndian(),
                                savedAddressByteSize);
         DWARFDebugLine::DumpingState state(OS);
-        DWARFDebugLine::parseStatementTable(lineData, &lineRelocMap(), &stmtOffset, state);
+        DWARFDebugLine::parseStatementTable(lineData, &getLineSection().Relocs, &stmtOffset, state);
       }
     }
   }
@@ -97,23 +138,21 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
       rangeList.dump(OS);
   }
 
-  if (DumpType == DIDT_All || DumpType == DIDT_Pubnames) {
-    OS << "\n.debug_pubnames contents:\n";
-    DataExtractor pubNames(getPubNamesSection(), isLittleEndian(), 0);
-    offset = 0;
-    OS << "Length:                " << pubNames.getU32(&offset) << "\n";
-    OS << "Version:               " << pubNames.getU16(&offset) << "\n";
-    OS << "Offset in .debug_info: " << pubNames.getU32(&offset) << "\n";
-    OS << "Size:                  " << pubNames.getU32(&offset) << "\n";
-    OS << "\n  Offset    Name\n";
-    while (offset < getPubNamesSection().size()) {
-      uint32_t n = pubNames.getU32(&offset);
-      if (n == 0)
-        break;
-      OS << format("%8x    ", n);
-      OS << pubNames.getCStr(&offset) << "\n";
-    }
-  }
+  if (DumpType == DIDT_All || DumpType == DIDT_Pubnames)
+    dumpPubSection(OS, "debug_pubnames", getPubNamesSection(),
+                   isLittleEndian(), false);
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Pubtypes)
+    dumpPubSection(OS, "debug_pubtypes", getPubTypesSection(),
+                   isLittleEndian(), false);
+
+  if (DumpType == DIDT_All || DumpType == DIDT_GnuPubnames)
+    dumpPubSection(OS, "debug_gnu_pubnames", getGnuPubNamesSection(),
+                   isLittleEndian(), true /* GnuStyle */);
+
+  if (DumpType == DIDT_All || DumpType == DIDT_GnuPubtypes)
+    dumpPubSection(OS, "debug_gnu_pubtypes", getGnuPubTypesSection(),
+                   isLittleEndian(), true /* GnuStyle */);
 
   if (DumpType == DIDT_All || DumpType == DIDT_AbbrevDwo) {
     const DWARFDebugAbbrev *D = getDebugAbbrevDWO();
@@ -180,8 +219,8 @@ const DWARFDebugLoc *DWARFContext::getDebugLoc() {
   if (Loc)
     return Loc.get();
 
-  DataExtractor LocData(getLocSection(), isLittleEndian(), 0);
-  Loc.reset(new DWARFDebugLoc(locRelocMap()));
+  DataExtractor LocData(getLocSection().Data, isLittleEndian(), 0);
+  Loc.reset(new DWARFDebugLoc(getLocSection().Relocs));
   // assume all compile units have the same address byte size
   if (getNumCompileUnits())
     Loc->parse(LocData, getCompileUnitAtIndex(0)->getAddressByteSize());
@@ -192,13 +231,7 @@ const DWARFDebugAranges *DWARFContext::getDebugAranges() {
   if (Aranges)
     return Aranges.get();
 
-  DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0);
-
   Aranges.reset(new DWARFDebugAranges());
-  Aranges->extract(arangesData);
-  // Generate aranges from DIEs: even if .debug_aranges section is present,
-  // it may describe only a small subset of compilation units, so we need to
-  // manually build aranges for the rest of them.
   Aranges->generate(this);
   return Aranges.get();
 }
@@ -226,11 +259,11 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() {
 const DWARFLineTable *
 DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
   if (!Line)
-    Line.reset(new DWARFDebugLine(&lineRelocMap()));
+    Line.reset(new DWARFDebugLine(&getLineSection().Relocs));
 
   unsigned stmtOffset =
-    cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
-                                                         -1U);
+      cu->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
+          cu, DW_AT_stmt_list, -1U);
   if (stmtOffset == -1U)
     return 0; // No line table for this compile unit.
 
@@ -239,64 +272,79 @@ DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
     return lt;
 
   // We have to parse it first.
-  DataExtractor lineData(getLineSection(), isLittleEndian(),
+  DataExtractor lineData(getLineSection().Data, isLittleEndian(),
                          cu->getAddressByteSize());
   return Line->getOrParseLineTable(lineData, stmtOffset);
 }
 
 void DWARFContext::parseCompileUnits() {
   uint32_t offset = 0;
-  const DataExtractor &DIData = DataExtractor(getInfoSection(),
+  const DataExtractor &DIData = DataExtractor(getInfoSection().Data,
                                               isLittleEndian(), 0);
   while (DIData.isValidOffset(offset)) {
-    CUs.push_back(DWARFCompileUnit(getDebugAbbrev(), getInfoSection(),
-                                   getAbbrevSection(), getRangeSection(),
-                                   getStringSection(), StringRef(),
-                                   getAddrSection(),
-                                   &infoRelocMap(),
-                                   isLittleEndian()));
-    if (!CUs.back().extract(DIData, &offset)) {
-      CUs.pop_back();
+    OwningPtr<DWARFCompileUnit> CU(new DWARFCompileUnit(
+        getDebugAbbrev(), getInfoSection().Data, getAbbrevSection(),
+        getRangeSection(), getStringSection(), StringRef(), getAddrSection(),
+        &getInfoSection().Relocs, isLittleEndian()));
+    if (!CU->extract(DIData, &offset)) {
       break;
     }
+    CUs.push_back(CU.take());
+    offset = CUs.back()->getNextUnitOffset();
+  }
+}
 
-    offset = CUs.back().getNextCompileUnitOffset();
+void DWARFContext::parseTypeUnits() {
+  const std::map<object::SectionRef, Section> &Sections = getTypesSections();
+  for (std::map<object::SectionRef, Section>::const_iterator
+           I = Sections.begin(),
+           E = Sections.end();
+       I != E; ++I) {
+    uint32_t offset = 0;
+    const DataExtractor &DIData =
+        DataExtractor(I->second.Data, isLittleEndian(), 0);
+    while (DIData.isValidOffset(offset)) {
+      OwningPtr<DWARFTypeUnit> TU(new DWARFTypeUnit(
+          getDebugAbbrev(), I->second.Data, getAbbrevSection(),
+          getRangeSection(), getStringSection(), StringRef(), getAddrSection(),
+          &I->second.Relocs, isLittleEndian()));
+      if (!TU->extract(DIData, &offset))
+        break;
+      TUs.push_back(TU.take());
+      offset = TUs.back()->getNextUnitOffset();
+    }
   }
 }
 
 void DWARFContext::parseDWOCompileUnits() {
   uint32_t offset = 0;
-  const DataExtractor &DIData = DataExtractor(getInfoDWOSection(),
-                                              isLittleEndian(), 0);
+  const DataExtractor &DIData =
+      DataExtractor(getInfoDWOSection().Data, isLittleEndian(), 0);
   while (DIData.isValidOffset(offset)) {
-    DWOCUs.push_back(DWARFCompileUnit(getDebugAbbrevDWO(), getInfoDWOSection(),
-                                      getAbbrevDWOSection(),
-                                      getRangeDWOSection(),
-                                      getStringDWOSection(),
-                                      getStringOffsetDWOSection(),
-                                      getAddrSection(),
-                                      &infoDWORelocMap(),
-                                      isLittleEndian()));
-    if (!DWOCUs.back().extract(DIData, &offset)) {
-      DWOCUs.pop_back();
+    OwningPtr<DWARFCompileUnit> DWOCU(new DWARFCompileUnit(
+        getDebugAbbrevDWO(), getInfoDWOSection().Data, getAbbrevDWOSection(),
+        getRangeDWOSection(), getStringDWOSection(),
+        getStringOffsetDWOSection(), getAddrSection(),
+        &getInfoDWOSection().Relocs, isLittleEndian()));
+    if (!DWOCU->extract(DIData, &offset)) {
       break;
     }
-
-    offset = DWOCUs.back().getNextCompileUnitOffset();
+    DWOCUs.push_back(DWOCU.take());
+    offset = DWOCUs.back()->getNextUnitOffset();
   }
 }
 
 namespace {
   struct OffsetComparator {
-    bool operator()(const DWARFCompileUnit &LHS,
-                    const DWARFCompileUnit &RHS) const {
-      return LHS.getOffset() < RHS.getOffset();
+    bool operator()(const DWARFCompileUnit *LHS,
+                    const DWARFCompileUnit *RHS) const {
+      return LHS->getOffset() < RHS->getOffset();
     }
-    bool operator()(const DWARFCompileUnit &LHS, uint32_t RHS) const {
-      return LHS.getOffset() < RHS;
+    bool operator()(const DWARFCompileUnit *LHS, uint32_t RHS) const {
+      return LHS->getOffset() < RHS;
     }
-    bool operator()(uint32_t LHS, const DWARFCompileUnit &RHS) const {
-      return LHS < RHS.getOffset();
+    bool operator()(uint32_t LHS, const DWARFCompileUnit *RHS) const {
+      return LHS < RHS->getOffset();
     }
   };
 }
@@ -305,10 +353,11 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
   if (CUs.empty())
     parseCompileUnits();
 
-  DWARFCompileUnit *CU = std::lower_bound(CUs.begin(), CUs.end(), Offset,
-                                          OffsetComparator());
-  if (CU != CUs.end())
-    return &*CU;
+  DWARFCompileUnit **CU =
+      std::lower_bound(CUs.begin(), CUs.end(), Offset, OffsetComparator());
+  if (CU != CUs.end()) {
+    return *CU;
+  }
   return 0;
 }
 
@@ -380,7 +429,7 @@ DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
         CU->getInlinedChainForAddress(Address);
     if (InlinedChain.DIEs.size() > 0) {
       const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
-      if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.CU))
+      if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.U))
         FunctionName = Name;
     }
   }
@@ -413,19 +462,16 @@ DILineInfoTable DWARFContext::getLineInfoForAddressRange(uint64_t Address,
         CU->getInlinedChainForAddress(Address);
     if (InlinedChain.DIEs.size() > 0) {
       const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
-      if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.CU))
+      if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.U))
         FunctionName = Name;
     }
   }
 
-  StringRef  FuncNameRef = StringRef(FunctionName);
-
   // If the Specifier says we don't need FileLineInfo, just
   // return the top-most function at the starting address.
   if (!Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
-    Lines.push_back(std::make_pair(Address,
-                                   DILineInfo(StringRef("<invalid>"),
-                                              FuncNameRef, 0, 0)));
+    Lines.push_back(
+        std::make_pair(Address, DILineInfo("<invalid>", FunctionName, 0, 0)));
     return Lines;
   }
 
@@ -446,9 +492,8 @@ DILineInfoTable DWARFContext::getLineInfoForAddressRange(uint64_t Address,
     std::string FileName = "<invalid>";
     getFileNameForCompileUnit(CU, LineTable, Row.File,
                               NeedsAbsoluteFilePath, FileName);
-    Lines.push_back(std::make_pair(Row.Address,
-                                   DILineInfo(StringRef(FileName),
-                                         FuncNameRef, Row.Line, Row.Column)));
+    Lines.push_back(std::make_pair(
+        Row.Address, DILineInfo(FileName, FunctionName, Row.Line, Row.Column)));
   }
 
   return Lines;
@@ -476,7 +521,7 @@ DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address,
     uint32_t Column = 0;
     // Get function name if necessary.
     if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
-      if (const char *Name = FunctionDIE.getSubroutineName(InlinedChain.CU))
+      if (const char *Name = FunctionDIE.getSubroutineName(InlinedChain.U))
         FunctionName = Name;
     }
     if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
@@ -500,7 +545,7 @@ DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address,
       }
       // Get call file/line/column of a current DIE.
       if (i + 1 < n) {
-        FunctionDIE.getCallerFrame(InlinedChain.CU, CallFile, CallLine,
+        FunctionDIE.getCallerFrame(InlinedChain.U, CallFile, CallLine,
                                    CallColumn);
       }
     }
@@ -557,29 +602,37 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) :
       UncompressedSections.push_back(UncompressedSection.take());
     }
 
-    StringRef *Section = StringSwitch<StringRef*>(name)
-        .Case("debug_info", &InfoSection)
-        .Case("debug_abbrev", &AbbrevSection)
-        .Case("debug_loc", &LocSection)
-        .Case("debug_line", &LineSection)
-        .Case("debug_aranges", &ARangeSection)
-        .Case("debug_frame", &DebugFrameSection)
-        .Case("debug_str", &StringSection)
-        .Case("debug_ranges", &RangeSection)
-        .Case("debug_pubnames", &PubNamesSection)
-        .Case("debug_info.dwo", &InfoDWOSection)
-        .Case("debug_abbrev.dwo", &AbbrevDWOSection)
-        .Case("debug_str.dwo", &StringDWOSection)
-        .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
-        .Case("debug_addr", &AddrSection)
-        // Any more debug info sections go here.
-        .Default(0);
+    StringRef *Section =
+        StringSwitch<StringRef *>(name)
+            .Case("debug_info", &InfoSection.Data)
+            .Case("debug_abbrev", &AbbrevSection)
+            .Case("debug_loc", &LocSection.Data)
+            .Case("debug_line", &LineSection.Data)
+            .Case("debug_aranges", &ARangeSection)
+            .Case("debug_frame", &DebugFrameSection)
+            .Case("debug_str", &StringSection)
+            .Case("debug_ranges", &RangeSection)
+            .Case("debug_pubnames", &PubNamesSection)
+            .Case("debug_pubtypes", &PubTypesSection)
+            .Case("debug_gnu_pubnames", &GnuPubNamesSection)
+            .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
+            .Case("debug_info.dwo", &InfoDWOSection.Data)
+            .Case("debug_abbrev.dwo", &AbbrevDWOSection)
+            .Case("debug_str.dwo", &StringDWOSection)
+            .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
+            .Case("debug_addr", &AddrSection)
+            // Any more debug info sections go here.
+            .Default(0);
     if (Section) {
       *Section = data;
       if (name == "debug_ranges") {
         // FIXME: Use the other dwo range section when we emit it.
         RangeDWOSection = data;
       }
+    } else if (name == "debug_types") {
+      // Find debug_types data by section rather than name as there are
+      // multiple, comdat grouped, debug_types sections.
+      TypesSections[*i].Data = data;
     }
 
     section_iterator RelocatedSection = i->getRelocatedSection();
@@ -594,13 +647,18 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) :
     // TODO: Add support for relocations in other sections as needed.
     // Record relocations for the debug_info and debug_line sections.
     RelocAddrMap *Map = StringSwitch<RelocAddrMap*>(RelSecName)
-        .Case("debug_info", &InfoRelocMap)
-        .Case("debug_loc", &LocRelocMap)
-        .Case("debug_info.dwo", &InfoDWORelocMap)
-        .Case("debug_line", &LineRelocMap)
+        .Case("debug_info", &InfoSection.Relocs)
+        .Case("debug_loc", &LocSection.Relocs)
+        .Case("debug_info.dwo", &InfoDWOSection.Relocs)
+        .Case("debug_line", &LineSection.Relocs)
         .Default(0);
-    if (!Map)
-      continue;
+    if (!Map) {
+      if (RelSecName != "debug_types")
+        continue;
+      // Find debug_types relocs by section rather than name as there are
+      // multiple, comdat grouped, debug_types sections.
+      Map = &TypesSections[*RelocatedSection].Relocs;
+    }
 
     if (i->begin_relocations() != i->end_relocations()) {
       uint64_t SectionSize;
diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
index 5d8f714..03863ab 100644
--- a/lib/DebugInfo/DWARFContext.h
+++ b/lib/DebugInfo/DWARFContext.h
@@ -16,6 +16,7 @@
 #include "DWARFDebugLine.h"
 #include "DWARFDebugLoc.h"
 #include "DWARFDebugRangeList.h"
+#include "DWARFTypeUnit.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/DebugInfo/DIContext.h"
@@ -27,14 +28,15 @@ namespace llvm {
 /// information parsing. The actual data is supplied through pure virtual
 /// methods that a concrete implementation provides.
 class DWARFContext : public DIContext {
-  SmallVector<DWARFCompileUnit, 1> CUs;
+  SmallVector<DWARFCompileUnit *, 1> CUs;
+  SmallVector<DWARFTypeUnit *, 1> TUs;
   OwningPtr<DWARFDebugAbbrev> Abbrev;
   OwningPtr<DWARFDebugLoc> Loc;
   OwningPtr<DWARFDebugAranges> Aranges;
   OwningPtr<DWARFDebugLine> Line;
   OwningPtr<DWARFDebugFrame> DebugFrame;
 
-  SmallVector<DWARFCompileUnit, 1> DWOCUs;
+  SmallVector<DWARFCompileUnit *, 1> DWOCUs;
   OwningPtr<DWARFDebugAbbrev> AbbrevDWO;
 
   DWARFContext(DWARFContext &) LLVM_DELETED_FUNCTION;
@@ -43,12 +45,21 @@ class DWARFContext : public DIContext {
   /// Read compile units from the debug_info section and store them in CUs.
   void parseCompileUnits();
 
+  /// Read type units from the debug_types sections and store them in CUs.
+  void parseTypeUnits();
+
   /// Read compile units from the debug_info.dwo section and store them in
   /// DWOCUs.
   void parseDWOCompileUnits();
 
 public:
+  struct Section {
+    StringRef Data;
+    RelocAddrMap Relocs;
+  };
+
   DWARFContext() : DIContext(CK_DWARF) {}
+  virtual ~DWARFContext();
 
   static bool classof(const DIContext *DICtx) {
     return DICtx->getKind() == CK_DWARF;
@@ -63,6 +74,13 @@ public:
     return CUs.size();
   }
 
+  /// Get the number of compile units in this context.
+  unsigned getNumTypeUnits() {
+    if (TUs.empty())
+      parseTypeUnits();
+    return TUs.size();
+  }
+
   /// Get the number of compile units in the DWO context.
   unsigned getNumDWOCompileUnits() {
     if (DWOCUs.empty())
@@ -74,14 +92,21 @@ public:
   DWARFCompileUnit *getCompileUnitAtIndex(unsigned index) {
     if (CUs.empty())
       parseCompileUnits();
-    return &CUs[index];
+    return CUs[index];
+  }
+
+  /// Get the type unit at the specified index for this compile unit.
+  DWARFTypeUnit *getTypeUnitAtIndex(unsigned index) {
+    if (TUs.empty())
+      parseTypeUnits();
+    return TUs[index];
   }
 
   /// Get the compile unit at the specified index for the DWO compile units.
   DWARFCompileUnit *getDWOCompileUnitAtIndex(unsigned index) {
     if (DWOCUs.empty())
       parseDWOCompileUnits();
-    return &DWOCUs[index];
+    return DWOCUs[index];
   }
 
   /// Get a pointer to the parsed DebugAbbrev object.
@@ -112,27 +137,27 @@ public:
 
   virtual bool isLittleEndian() const = 0;
   virtual uint8_t getAddressSize() const = 0;
-  virtual const RelocAddrMap &infoRelocMap() const = 0;
-  virtual const RelocAddrMap &lineRelocMap() const = 0;
-  virtual const RelocAddrMap &locRelocMap() const = 0;
-  virtual StringRef getInfoSection() = 0;
+  virtual const Section &getInfoSection() = 0;
+  virtual const std::map<object::SectionRef, Section> &getTypesSections() = 0;
   virtual StringRef getAbbrevSection() = 0;
-  virtual StringRef getLocSection() = 0;
+  virtual const Section &getLocSection() = 0;
   virtual StringRef getARangeSection() = 0;
   virtual StringRef getDebugFrameSection() = 0;
-  virtual StringRef getLineSection() = 0;
+  virtual const Section &getLineSection() = 0;
   virtual StringRef getStringSection() = 0;
   virtual StringRef getRangeSection() = 0;
   virtual StringRef getPubNamesSection() = 0;
+  virtual StringRef getPubTypesSection() = 0;
+  virtual StringRef getGnuPubNamesSection() = 0;
+  virtual StringRef getGnuPubTypesSection() = 0;
 
   // Sections for DWARF5 split dwarf proposal.
-  virtual StringRef getInfoDWOSection() = 0;
+  virtual const Section &getInfoDWOSection() = 0;
   virtual StringRef getAbbrevDWOSection() = 0;
   virtual StringRef getStringDWOSection() = 0;
   virtual StringRef getStringOffsetDWOSection() = 0;
   virtual StringRef getRangeDWOSection() = 0;
   virtual StringRef getAddrSection() = 0;
-  virtual const RelocAddrMap &infoDWORelocMap() const = 0;
 
   static bool isSupportedVersion(unsigned version) {
     return version == 2 || version == 3 || version == 4;
@@ -153,22 +178,22 @@ class DWARFContextInMemory : public DWARFContext {
   virtual void anchor();
   bool IsLittleEndian;
   uint8_t AddressSize;
-  RelocAddrMap InfoRelocMap;
-  RelocAddrMap LocRelocMap;
-  RelocAddrMap LineRelocMap;
-  StringRef InfoSection;
+  Section InfoSection;
+  std::map<object::SectionRef, Section> TypesSections;
   StringRef AbbrevSection;
-  StringRef LocSection;
+  Section LocSection;
   StringRef ARangeSection;
   StringRef DebugFrameSection;
-  StringRef LineSection;
+  Section LineSection;
   StringRef StringSection;
   StringRef RangeSection;
   StringRef PubNamesSection;
+  StringRef PubTypesSection;
+  StringRef GnuPubNamesSection;
+  StringRef GnuPubTypesSection;
 
   // Sections for DWARF5 split dwarf proposal.
-  RelocAddrMap InfoDWORelocMap;
-  StringRef InfoDWOSection;
+  Section InfoDWOSection;
   StringRef AbbrevDWOSection;
   StringRef StringDWOSection;
   StringRef StringOffsetDWOSection;
@@ -182,21 +207,24 @@ public:
   ~DWARFContextInMemory();
   virtual bool isLittleEndian() const { return IsLittleEndian; }
   virtual uint8_t getAddressSize() const { return AddressSize; }
-  virtual const RelocAddrMap &infoRelocMap() const { return InfoRelocMap; }
-  virtual const RelocAddrMap &locRelocMap() const { return LocRelocMap; }
-  virtual const RelocAddrMap &lineRelocMap() const { return LineRelocMap; }
-  virtual StringRef getInfoSection() { return InfoSection; }
+  virtual const Section &getInfoSection() { return InfoSection; }
+  virtual const std::map<object::SectionRef, Section> &getTypesSections() {
+    return TypesSections;
+  }
   virtual StringRef getAbbrevSection() { return AbbrevSection; }
-  virtual StringRef getLocSection() { return LocSection; }
+  virtual const Section &getLocSection() { return LocSection; }
   virtual StringRef getARangeSection() { return ARangeSection; }
   virtual StringRef getDebugFrameSection() { return DebugFrameSection; }
-  virtual StringRef getLineSection() { return LineSection; }
+  virtual const Section &getLineSection() { return LineSection; }
   virtual StringRef getStringSection() { return StringSection; }
   virtual StringRef getRangeSection() { return RangeSection; }
   virtual StringRef getPubNamesSection() { return PubNamesSection; }
+  virtual StringRef getPubTypesSection() { return PubTypesSection; }
+  virtual StringRef getGnuPubNamesSection() { return GnuPubNamesSection; }
+  virtual StringRef getGnuPubTypesSection() { return GnuPubTypesSection; }
 
   // Sections for DWARF5 split dwarf proposal.
-  virtual StringRef getInfoDWOSection() { return InfoDWOSection; }
+  virtual const Section &getInfoDWOSection() { return InfoDWOSection; }
   virtual StringRef getAbbrevDWOSection() { return AbbrevDWOSection; }
   virtual StringRef getStringDWOSection() { return StringDWOSection; }
   virtual StringRef getStringOffsetDWOSection() {
@@ -206,9 +234,6 @@ public:
   virtual StringRef getAddrSection() {
     return AddrSection;
   }
-  virtual const RelocAddrMap &infoDWORelocMap() const {
-    return InfoDWORelocMap;
-  }
 };
 
 }
diff --git a/lib/DebugInfo/DWARFDebugArangeSet.cpp b/lib/DebugInfo/DWARFDebugArangeSet.cpp
index 7dff9ff..229376e 100644
--- a/lib/DebugInfo/DWARFDebugArangeSet.cpp
+++ b/lib/DebugInfo/DWARFDebugArangeSet.cpp
@@ -20,32 +20,6 @@ void DWARFDebugArangeSet::clear() {
   ArangeDescriptors.clear();
 }
 
-void DWARFDebugArangeSet::compact() {
-  if (ArangeDescriptors.empty())
-    return;
-
-  // Iterate through all arange descriptors and combine any ranges that
-  // overlap or have matching boundaries. The ArangeDescriptors are assumed
-  // to be in ascending order.
-  uint32_t i = 0;
-  while (i + 1 < ArangeDescriptors.size()) {
-    if (ArangeDescriptors[i].getEndAddress() >= ArangeDescriptors[i+1].Address){
-      // The current range ends at or exceeds the start of the next address
-      // range. Compute the max end address between the two and use that to
-      // make the new length.
-      const uint64_t max_end_addr =
-        std::max(ArangeDescriptors[i].getEndAddress(),
-                 ArangeDescriptors[i+1].getEndAddress());
-      ArangeDescriptors[i].Length = max_end_addr - ArangeDescriptors[i].Address;
-      // Now remove the next entry as it was just combined with the previous one
-      ArangeDescriptors.erase(ArangeDescriptors.begin()+i+1);
-    } else {
-      // Discontiguous address range, just proceed to the next one.
-      ++i;
-    }
-  }
-}
-
 bool
 DWARFDebugArangeSet::extract(DataExtractor data, uint32_t *offset_ptr) {
   if (data.isValidOffset(*offset_ptr)) {
@@ -126,26 +100,3 @@ void DWARFDebugArangeSet::dump(raw_ostream &OS) const {
        << format(" 0x%*.*" PRIx64 ")\n",
                  hex_width, hex_width, pos->getEndAddress());
 }
-
-
-namespace {
-  class DescriptorContainsAddress {
-    const uint64_t Address;
-  public:
-    DescriptorContainsAddress(uint64_t address) : Address(address) {}
-    bool operator()(const DWARFDebugArangeSet::Descriptor &desc) const {
-      return Address >= desc.Address && Address < (desc.Address + desc.Length);
-    }
-  };
-}
-
-uint32_t DWARFDebugArangeSet::findAddress(uint64_t address) const {
-  DescriptorConstIter end = ArangeDescriptors.end();
-  DescriptorConstIter pos =
-    std::find_if(ArangeDescriptors.begin(), end, // Range
-                 DescriptorContainsAddress(address)); // Predicate
-  if (pos != end)
-    return HeaderData.CuOffset;
-
-  return -1U;
-}
diff --git a/lib/DebugInfo/DWARFDebugArangeSet.h b/lib/DebugInfo/DWARFDebugArangeSet.h
index d768676..49a7132 100644
--- a/lib/DebugInfo/DWARFDebugArangeSet.h
+++ b/lib/DebugInfo/DWARFDebugArangeSet.h
@@ -44,7 +44,6 @@ public:
 
 private:
   typedef std::vector<Descriptor> DescriptorColl;
-  typedef DescriptorColl::iterator DescriptorIter;
   typedef DescriptorColl::const_iterator DescriptorConstIter;
 
   uint32_t Offset;
@@ -54,15 +53,11 @@ private:
 public:
   DWARFDebugArangeSet() { clear(); }
   void clear();
-  void compact();
   bool extract(DataExtractor data, uint32_t *offset_ptr);
   void dump(raw_ostream &OS) const;
 
   uint32_t getCompileUnitDIEOffset() const { return HeaderData.CuOffset; }
-  uint32_t getOffsetOfNextEntry() const { return Offset + HeaderData.Length + 4; }
-  uint32_t findAddress(uint64_t address) const;
   uint32_t getNumDescriptors() const { return ArangeDescriptors.size(); }
-  const struct Header &getHeader() const { return HeaderData; }
   const Descriptor *getDescriptor(uint32_t i) const {
     if (i < ArangeDescriptors.size())
       return &ArangeDescriptors[i];
diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp
index f79862d..591d4bd 100644
--- a/lib/DebugInfo/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARFDebugAranges.cpp
@@ -16,128 +16,79 @@
 #include <cassert>
 using namespace llvm;
 
-// Compare function DWARFDebugAranges::Range structures
-static bool RangeLessThan(const DWARFDebugAranges::Range &range1,
-                          const DWARFDebugAranges::Range &range2) {
-  return range1.LoPC < range2.LoPC;
-}
-
-namespace {
-  class CountArangeDescriptors {
-  public:
-    CountArangeDescriptors(uint32_t &count_ref) : Count(count_ref) {}
-    void operator()(const DWARFDebugArangeSet &Set) {
-      Count += Set.getNumDescriptors();
-    }
-    uint32_t &Count;
-  };
-
-  class AddArangeDescriptors {
-  public:
-    AddArangeDescriptors(DWARFDebugAranges::RangeColl &Ranges,
-                         DWARFDebugAranges::ParsedCUOffsetColl &CUOffsets)
-      : RangeCollection(Ranges),
-        CUOffsetCollection(CUOffsets) {}
-    void operator()(const DWARFDebugArangeSet &Set) {
-      DWARFDebugAranges::Range Range;
-      Range.Offset = Set.getCompileUnitDIEOffset();
-      CUOffsetCollection.insert(Range.Offset);
-
-      for (uint32_t i = 0, n = Set.getNumDescriptors(); i < n; ++i) {
-        const DWARFDebugArangeSet::Descriptor *ArangeDescPtr =
-            Set.getDescriptor(i);
-        Range.LoPC = ArangeDescPtr->Address;
-        Range.Length = ArangeDescPtr->Length;
-
-        // Insert each item in increasing address order so binary searching
-        // can later be done!
-        DWARFDebugAranges::RangeColl::iterator InsertPos =
-          std::lower_bound(RangeCollection.begin(), RangeCollection.end(),
-                           Range, RangeLessThan);
-        RangeCollection.insert(InsertPos, Range);
-      }
-
-    }
-    DWARFDebugAranges::RangeColl &RangeCollection;
-    DWARFDebugAranges::ParsedCUOffsetColl &CUOffsetCollection;
-  };
-}
-
-bool DWARFDebugAranges::extract(DataExtractor debug_aranges_data) {
-  if (debug_aranges_data.isValidOffset(0)) {
-    uint32_t offset = 0;
-
-    typedef std::vector<DWARFDebugArangeSet> SetCollection;
-    SetCollection sets;
-
-    DWARFDebugArangeSet set;
-    Range range;
-    while (set.extract(debug_aranges_data, &offset))
-      sets.push_back(set);
-
-    uint32_t count = 0;
-
-    std::for_each(sets.begin(), sets.end(), CountArangeDescriptors(count));
-
-    if (count > 0) {
-      Aranges.reserve(count);
-      AddArangeDescriptors range_adder(Aranges, ParsedCUOffsets);
-      std::for_each(sets.begin(), sets.end(), range_adder);
-    }
+void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
+  if (!DebugArangesData.isValidOffset(0))
+    return;
+  uint32_t Offset = 0;
+  typedef std::vector<DWARFDebugArangeSet> RangeSetColl;
+  RangeSetColl Sets;
+  DWARFDebugArangeSet Set;
+  uint32_t TotalRanges = 0;
+
+  while (Set.extract(DebugArangesData, &Offset)) {
+    Sets.push_back(Set);
+    TotalRanges += Set.getNumDescriptors();
   }
-  return false;
-}
+  if (TotalRanges == 0)
+    return;
 
-bool DWARFDebugAranges::generate(DWARFContext *ctx) {
-  if (ctx) {
-    const uint32_t num_compile_units = ctx->getNumCompileUnits();
-    for (uint32_t cu_idx = 0; cu_idx < num_compile_units; ++cu_idx) {
-      if (DWARFCompileUnit *cu = ctx->getCompileUnitAtIndex(cu_idx)) {
-        uint32_t CUOffset = cu->getOffset();
-        if (ParsedCUOffsets.insert(CUOffset).second)
-          cu->buildAddressRangeTable(this, true);
-      }
+  Aranges.reserve(TotalRanges);
+  for (RangeSetColl::const_iterator I = Sets.begin(), E = Sets.end(); I != E;
+       ++I) {
+    uint32_t CUOffset = I->getCompileUnitDIEOffset();
+
+    for (uint32_t i = 0, n = I->getNumDescriptors(); i < n; ++i) {
+      const DWARFDebugArangeSet::Descriptor *ArangeDescPtr =
+          I->getDescriptor(i);
+      uint64_t LowPC = ArangeDescPtr->Address;
+      uint64_t HighPC = LowPC + ArangeDescPtr->Length;
+      appendRange(CUOffset, LowPC, HighPC);
     }
   }
-  sort(true, /* overlap size */ 0);
-  return !isEmpty();
 }
 
-void DWARFDebugAranges::dump(raw_ostream &OS) const {
-  const uint32_t num_ranges = getNumRanges();
-  for (uint32_t i = 0; i < num_ranges; ++i) {
-    const Range &range = Aranges[i];
-    OS << format("0x%8.8x: [0x%8.8" PRIx64 " - 0x%8.8" PRIx64 ")\n",
-                 range.Offset, (uint64_t)range.LoPC, (uint64_t)range.HiPC());
+void DWARFDebugAranges::generate(DWARFContext *CTX) {
+  clear();
+  if (!CTX)
+    return;
+
+  // Extract aranges from .debug_aranges section.
+  DataExtractor ArangesData(CTX->getARangeSection(), CTX->isLittleEndian(), 0);
+  extract(ArangesData);
+
+  // Generate aranges from DIEs: even if .debug_aranges section is present,
+  // it may describe only a small subset of compilation units, so we need to
+  // manually build aranges for the rest of them.
+  for (uint32_t i = 0, n = CTX->getNumCompileUnits(); i < n; ++i) {
+    if (DWARFCompileUnit *CU = CTX->getCompileUnitAtIndex(i)) {
+      uint32_t CUOffset = CU->getOffset();
+      if (ParsedCUOffsets.insert(CUOffset).second)
+        CU->buildAddressRangeTable(this, true, CUOffset);
+    }
   }
-}
 
-void DWARFDebugAranges::Range::dump(raw_ostream &OS) const {
-  OS << format("{0x%8.8x}: [0x%8.8" PRIx64 " - 0x%8.8" PRIx64 ")\n",
-               Offset, LoPC, HiPC());
+  sortAndMinimize();
 }
 
-void DWARFDebugAranges::appendRange(uint32_t offset, uint64_t low_pc,
-                                    uint64_t high_pc) {
+void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC,
+                                    uint64_t HighPC) {
   if (!Aranges.empty()) {
-    if (Aranges.back().Offset == offset && Aranges.back().HiPC() == low_pc) {
-      Aranges.back().setHiPC(high_pc);
+    if (Aranges.back().CUOffset == CUOffset &&
+        Aranges.back().HighPC() == LowPC) {
+      Aranges.back().setHighPC(HighPC);
       return;
     }
   }
-  Aranges.push_back(Range(low_pc, high_pc, offset));
+  Aranges.push_back(Range(LowPC, HighPC, CUOffset));
 }
 
-void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
+void DWARFDebugAranges::sortAndMinimize() {
   const size_t orig_arange_size = Aranges.size();
   // Size of one? If so, no sorting is needed
   if (orig_arange_size <= 1)
     return;
   // Sort our address range entries
-  std::stable_sort(Aranges.begin(), Aranges.end(), RangeLessThan);
-
-  if (!minimize)
-    return;
+  std::stable_sort(Aranges.begin(), Aranges.end());
 
   // Most address ranges are contiguous from function to function
   // so our new ranges will likely be smaller. We calculate the size
@@ -151,7 +102,7 @@ void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
   // copy the new minimal stuff over to the new collection.
   size_t minimal_size = 1;
   for (size_t i = 1; i < orig_arange_size; ++i) {
-    if (!Range::SortedOverlapCheck(Aranges[i-1], Aranges[i], n))
+    if (!Range::SortedOverlapCheck(Aranges[i-1], Aranges[i]))
       ++minimal_size;
   }
 
@@ -166,14 +117,14 @@ void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
   uint32_t j = 0;
   minimal_aranges[j] = Aranges[0];
   for (size_t i = 1; i < orig_arange_size; ++i) {
-    if(Range::SortedOverlapCheck (minimal_aranges[j], Aranges[i], n)) {
-      minimal_aranges[j].setHiPC (Aranges[i].HiPC());
+    if (Range::SortedOverlapCheck(minimal_aranges[j], Aranges[i])) {
+      minimal_aranges[j].setHighPC(Aranges[i].HighPC());
     } else {
       // Only increment j if we aren't merging
       minimal_aranges[++j] = Aranges[i];
     }
   }
-  assert (j+1 == minimal_size);
+  assert(j+1 == minimal_size);
 
   // Now swap our new minimal aranges into place. The local
   // minimal_aranges will then contian the old big collection
@@ -181,50 +132,21 @@ void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
   minimal_aranges.swap(Aranges);
 }
 
-uint32_t DWARFDebugAranges::findAddress(uint64_t address) const {
+uint32_t DWARFDebugAranges::findAddress(uint64_t Address) const {
   if (!Aranges.empty()) {
-    Range range(address);
+    Range range(Address);
     RangeCollIterator begin = Aranges.begin();
     RangeCollIterator end = Aranges.end();
-    RangeCollIterator pos = std::lower_bound(begin, end, range, RangeLessThan);
+    RangeCollIterator pos =
+        std::lower_bound(begin, end, range);
 
-    if (pos != end && pos->LoPC <= address && address < pos->HiPC()) {
-      return pos->Offset;
+    if (pos != end && pos->containsAddress(Address)) {
+      return pos->CUOffset;
     } else if (pos != begin) {
       --pos;
-      if (pos->LoPC <= address && address < pos->HiPC())
-        return (*pos).Offset;
+      if (pos->containsAddress(Address))
+        return pos->CUOffset;
     }
   }
   return -1U;
 }
-
-bool
-DWARFDebugAranges::allRangesAreContiguous(uint64_t &LoPC, uint64_t &HiPC) const{
-  if (Aranges.empty())
-    return false;
-
-  uint64_t next_addr = 0;
-  RangeCollIterator begin = Aranges.begin();
-  for (RangeCollIterator pos = begin, end = Aranges.end(); pos != end;
-       ++pos) {
-    if (pos != begin && pos->LoPC != next_addr)
-      return false;
-    next_addr = pos->HiPC();
-  }
-  // We checked for empty at the start of function so front() will be valid.
-  LoPC = Aranges.front().LoPC;
-  // We checked for empty at the start of function so back() will be valid.
-  HiPC = Aranges.back().HiPC();
-  return true;
-}
-
-bool DWARFDebugAranges::getMaxRange(uint64_t &LoPC, uint64_t &HiPC) const {
-  if (Aranges.empty())
-    return false;
-  // We checked for empty at the start of function so front() will be valid.
-  LoPC = Aranges.front().LoPC;
-  // We checked for empty at the start of function so back() will be valid.
-  HiPC = Aranges.back().HiPC();
-  return true;
-}
diff --git a/lib/DebugInfo/DWARFDebugAranges.h b/lib/DebugInfo/DWARFDebugAranges.h
index 1509ffa..35ad8e5 100644
--- a/lib/DebugInfo/DWARFDebugAranges.h
+++ b/lib/DebugInfo/DWARFDebugAranges.h
@@ -20,81 +20,61 @@ class DWARFContext;
 
 class DWARFDebugAranges {
 public:
+  void clear() {
+    Aranges.clear();
+    ParsedCUOffsets.clear();
+  }
+
+  void generate(DWARFContext *CTX);
+
+  // Use appendRange multiple times and then call sortAndMinimize.
+  void appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC);
+
+  uint32_t findAddress(uint64_t Address) const;
+
+private:
+  void extract(DataExtractor DebugArangesData);
+  void sortAndMinimize();
+
   struct Range {
-    explicit Range(uint64_t lo = -1ULL, uint64_t hi = -1ULL,
-                   uint32_t off = -1U)
-      : LoPC(lo), Length(hi-lo), Offset(off) {}
-
-    void clear() {
-      LoPC = -1ULL;
-      Length = 0;
-      Offset = -1U;
-    }
+    explicit Range(uint64_t LowPC = -1ULL, uint64_t HighPC = -1ULL,
+                   uint32_t CUOffset = -1U)
+      : LowPC(LowPC), Length(HighPC - LowPC), CUOffset(CUOffset) {}
 
-    void setHiPC(uint64_t HiPC) {
-      if (HiPC == -1ULL || HiPC <= LoPC)
+    void setHighPC(uint64_t HighPC) {
+      if (HighPC == -1ULL || HighPC <= LowPC)
         Length = 0;
       else
-        Length = HiPC - LoPC;
+        Length = HighPC - LowPC;
     }
-    uint64_t HiPC() const {
+    uint64_t HighPC() const {
       if (Length)
-        return LoPC + Length;
+        return LowPC + Length;
       return -1ULL;
     }
-    bool isValidRange() const { return Length > 0; }
+    bool containsAddress(uint64_t Address) const {
+      return LowPC <= Address && Address < HighPC();
+    }
 
-    static bool SortedOverlapCheck(const Range &curr_range,
-                                   const Range &next_range, uint32_t n) {
-      if (curr_range.Offset != next_range.Offset)
-        return false;
-      return curr_range.HiPC() + n >= next_range.LoPC;
+    bool operator <(const Range &other) const {
+      return LowPC < other.LowPC;
     }
 
-    bool contains(const Range &range) const {
-      return LoPC <= range.LoPC && range.HiPC() <= HiPC();
+    static bool SortedOverlapCheck(const Range &Left, const Range &Right) {
+      if (Left.CUOffset != Right.CUOffset)
+        return false;
+      return Left.HighPC() >= Right.LowPC;
     }
 
-    void dump(raw_ostream &OS) const;
-    uint64_t LoPC; // Start of address range
-    uint32_t Length; // End of address range (not including this address)
-    uint32_t Offset; // Offset of the compile unit or die
+    uint64_t LowPC; // Start of address range.
+    uint32_t Length; // End of address range (not including this address).
+    uint32_t CUOffset; // Offset of the compile unit or die.
   };
 
-  void clear() {
-    Aranges.clear();
-    ParsedCUOffsets.clear();
-  }
-  bool allRangesAreContiguous(uint64_t& LoPC, uint64_t& HiPC) const;
-  bool getMaxRange(uint64_t& LoPC, uint64_t& HiPC) const;
-  bool extract(DataExtractor debug_aranges_data);
-  bool generate(DWARFContext *ctx);
-
-  // Use append range multiple times and then call sort
-  void appendRange(uint32_t cu_offset, uint64_t low_pc, uint64_t high_pc);
-  void sort(bool minimize, uint32_t n);
-
-  const Range *rangeAtIndex(uint32_t idx) const {
-    if (idx < Aranges.size())
-      return &Aranges[idx];
-    return NULL;
-  }
-  void dump(raw_ostream &OS) const;
-  uint32_t findAddress(uint64_t address) const;
-  bool isEmpty() const { return Aranges.empty(); }
-  uint32_t getNumRanges() const { return Aranges.size(); }
-
-  uint32_t offsetAtIndex(uint32_t idx) const {
-    if (idx < Aranges.size())
-      return Aranges[idx].Offset;
-    return -1U;
-  }
-
   typedef std::vector<Range>              RangeColl;
   typedef RangeColl::const_iterator       RangeCollIterator;
   typedef DenseSet<uint32_t>              ParsedCUOffsetColl;
 
-private:
   RangeColl Aranges;
   ParsedCUOffsetColl ParsedCUOffsets;
 };
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
index 0c7b7e3..babfd2e 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -19,11 +19,10 @@
 using namespace llvm;
 using namespace dwarf;
 
-void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
-                                      const DWARFCompileUnit *cu,
+void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS, const DWARFUnit *u,
                                       unsigned recurseDepth,
                                       unsigned indent) const {
-  DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+  DataExtractor debug_info_data = u->getDebugInfoExtractor();
   uint32_t offset = Offset;
 
   if (debug_info_data.isValidOffset(offset)) {
@@ -45,13 +44,13 @@ void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
         for (uint32_t i = 0; i != numAttributes; ++i) {
           uint16_t attr = AbbrevDecl->getAttrByIndex(i);
           uint16_t form = AbbrevDecl->getFormByIndex(i);
-          dumpAttribute(OS, cu, &offset, attr, form, indent);
+          dumpAttribute(OS, u, &offset, attr, form, indent);
         }
 
         const DWARFDebugInfoEntryMinimal *child = getFirstChild();
         if (recurseDepth > 0 && child) {
           while (child) {
-            child->dump(OS, cu, recurseDepth-1, indent+2);
+            child->dump(OS, u, recurseDepth-1, indent+2);
             child = child->getSibling();
           }
         }
@@ -66,12 +65,11 @@ void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
 }
 
 void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
-                                               const DWARFCompileUnit *cu,
-                                               uint32_t* offset_ptr,
-                                               uint16_t attr,
-                                               uint16_t form,
+                                               const DWARFUnit *u,
+                                               uint32_t *offset_ptr,
+                                               uint16_t attr, uint16_t form,
                                                unsigned indent) const {
-  OS << format("0x%8.8x: ", *offset_ptr);
+  OS << "            ";
   OS.indent(indent+2);
   const char *attrString = AttributeString(attr);
   if (attrString)
@@ -86,57 +84,20 @@ void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
 
   DWARFFormValue formValue(form);
 
-  if (!formValue.extractValue(cu->getDebugInfoExtractor(), offset_ptr, cu))
+  if (!formValue.extractValue(u->getDebugInfoExtractor(), offset_ptr, u))
     return;
 
   OS << "\t(";
-  formValue.dump(OS, cu);
+  formValue.dump(OS, u);
   OS << ")\n";
 }
 
-bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *CU,
-                                             const uint8_t *FixedFormSizes,
+bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFUnit *U,
                                              uint32_t *OffsetPtr) {
   Offset = *OffsetPtr;
-  DataExtractor DebugInfoData = CU->getDebugInfoExtractor();
-  uint64_t AbbrCode = DebugInfoData.getULEB128(OffsetPtr);
-  if (0 == AbbrCode) {
-    // NULL debug tag entry.
-    AbbrevDecl = NULL;
-    return true;
-  }
-  AbbrevDecl = CU->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
-  assert(AbbrevDecl);
-  assert(FixedFormSizes); // For best performance this should be specified!
-
-  // Skip all data in the .debug_info for the attributes
-  for (uint32_t i = 0, n = AbbrevDecl->getNumAttributes(); i < n; ++i) {
-    uint16_t Form = AbbrevDecl->getFormByIndex(i);
-
-    // FIXME: Currently we're checking if this is less than the last
-    // entry in the fixed_form_sizes table, but this should be changed
-    // to use dynamic dispatch.
-    uint8_t FixedFormSize =
-        (Form < DW_FORM_ref_sig8) ? FixedFormSizes[Form] : 0;
-    if (FixedFormSize)
-      *OffsetPtr += FixedFormSize;
-    else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr,
-                                        CU)) {
-      // Restore the original offset.
-      *OffsetPtr = Offset;
-      return false;
-    }
-  }
-  return true;
-}
-
-bool
-DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *CU,
-                                    uint32_t *OffsetPtr) {
-  DataExtractor DebugInfoData = CU->getDebugInfoExtractor();
-  const uint32_t CUEndOffset = CU->getNextCompileUnitOffset();
-  Offset = *OffsetPtr;
-  if ((Offset >= CUEndOffset) || !DebugInfoData.isValidOffset(Offset))
+  DataExtractor DebugInfoData = U->getDebugInfoExtractor();
+  uint32_t UEndOffset = U->getNextUnitOffset();
+  if (Offset >= UEndOffset || !DebugInfoData.isValidOffset(Offset))
     return false;
   uint64_t AbbrCode = DebugInfoData.getULEB128(OffsetPtr);
   if (0 == AbbrCode) {
@@ -144,31 +105,25 @@ DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *CU,
     AbbrevDecl = NULL;
     return true;
   }
-  AbbrevDecl = CU->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
+  AbbrevDecl = U->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
   if (0 == AbbrevDecl) {
     // Restore the original offset.
     *OffsetPtr = Offset;
     return false;
   }
-  bool IsCompileUnitTag = (AbbrevDecl->getTag() == DW_TAG_compile_unit);
-  if (IsCompileUnitTag)
-    const_cast<DWARFCompileUnit*>(CU)->setBaseAddress(0);
+  ArrayRef<uint8_t> FixedFormSizes = DWARFFormValue::getFixedFormSizes(
+      U->getAddressByteSize(), U->getVersion());
+  assert(FixedFormSizes.size() > 0);
 
   // Skip all data in the .debug_info for the attributes
   for (uint32_t i = 0, n = AbbrevDecl->getNumAttributes(); i < n; ++i) {
-    uint16_t Attr = AbbrevDecl->getAttrByIndex(i);
     uint16_t Form = AbbrevDecl->getFormByIndex(i);
 
-    if (IsCompileUnitTag &&
-        ((Attr == DW_AT_entry_pc) || (Attr == DW_AT_low_pc))) {
-      DWARFFormValue FormValue(Form);
-      if (FormValue.extractValue(DebugInfoData, OffsetPtr, CU)) {
-        if (Attr == DW_AT_low_pc || Attr == DW_AT_entry_pc)
-          const_cast<DWARFCompileUnit*>(CU)
-            ->setBaseAddress(FormValue.getUnsigned());
-      }
-    } else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr,
-                                          CU)) {
+    uint8_t FixedFormSize =
+        (Form < FixedFormSizes.size()) ? FixedFormSizes[Form] : 0;
+    if (FixedFormSize)
+      *OffsetPtr += FixedFormSize;
+    else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr, U)) {
       // Restore the original offset.
       *OffsetPtr = Offset;
       return false;
@@ -187,190 +142,179 @@ bool DWARFDebugInfoEntryMinimal::isSubroutineDIE() const {
          Tag == DW_TAG_inlined_subroutine;
 }
 
-uint32_t
-DWARFDebugInfoEntryMinimal::getAttributeValue(const DWARFCompileUnit *cu,
-                                              const uint16_t attr,
-                                              DWARFFormValue &form_value,
-                                              uint32_t *end_attr_offset_ptr)
-                                              const {
-  if (AbbrevDecl) {
-    uint32_t attr_idx = AbbrevDecl->findAttributeIndex(attr);
-
-    if (attr_idx != -1U) {
-      uint32_t offset = getOffset();
+bool DWARFDebugInfoEntryMinimal::getAttributeValue(
+    const DWARFUnit *U, const uint16_t Attr, DWARFFormValue &FormValue) const {
+  if (!AbbrevDecl)
+    return false;
 
-      DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+  uint32_t AttrIdx = AbbrevDecl->findAttributeIndex(Attr);
+  if (AttrIdx == -1U)
+    return false;
 
-      // Skip the abbreviation code so we are at the data for the attributes
-      debug_info_data.getULEB128(&offset);
+  DataExtractor DebugInfoData = U->getDebugInfoExtractor();
+  uint32_t DebugInfoOffset = getOffset();
 
-      uint32_t idx = 0;
-      while (idx < attr_idx)
-        DWARFFormValue::skipValue(AbbrevDecl->getFormByIndex(idx++),
-                                  debug_info_data, &offset, cu);
+  // Skip the abbreviation code so we are at the data for the attributes
+  DebugInfoData.getULEB128(&DebugInfoOffset);
 
-      const uint32_t attr_offset = offset;
-      form_value = DWARFFormValue(AbbrevDecl->getFormByIndex(idx));
-      if (form_value.extractValue(debug_info_data, &offset, cu)) {
-        if (end_attr_offset_ptr)
-          *end_attr_offset_ptr = offset;
-        return attr_offset;
-      }
-    }
+  // Skip preceding attribute values.
+  for (uint32_t i = 0; i < AttrIdx; ++i) {
+    DWARFFormValue::skipValue(AbbrevDecl->getFormByIndex(i),
+                              DebugInfoData, &DebugInfoOffset, U);
   }
 
-  return 0;
+  FormValue = DWARFFormValue(AbbrevDecl->getFormByIndex(AttrIdx));
+  return FormValue.extractValue(DebugInfoData, &DebugInfoOffset, U);
 }
 
-const char*
-DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
-                                                     const DWARFCompileUnit* cu,
-                                                     const uint16_t attr,
-                                                     const char* fail_value)
-                                                     const {
-  DWARFFormValue form_value;
-  if (getAttributeValue(cu, attr, form_value)) {
-    DataExtractor stringExtractor(cu->getStringSection(), false, 0);
-    return form_value.getAsCString(&stringExtractor);
-  }
-  return fail_value;
+const char *DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
+    const DWARFUnit *U, const uint16_t Attr, const char *FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<const char *> Result = FormValue.getAsCString(U);
+  return Result.hasValue() ? Result.getValue() : FailValue;
+}
+
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsAddress(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsAddress(U);
+  return Result.hasValue() ? Result.getValue() : FailValue;
 }
 
-uint64_t
-DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsigned(
-                                                    const DWARFCompileUnit* cu,
-                                                    const uint16_t attr,
-                                                    uint64_t fail_value) const {
-  DWARFFormValue form_value;
-  if (getAttributeValue(cu, attr, form_value))
-      return form_value.getUnsigned();
-  return fail_value;
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsignedConstant(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsUnsignedConstant();
+  return Result.hasValue() ? Result.getValue() : FailValue;
 }
 
-int64_t
-DWARFDebugInfoEntryMinimal::getAttributeValueAsSigned(
-                                                     const DWARFCompileUnit* cu,
-                                                     const uint16_t attr,
-                                                     int64_t fail_value) const {
-  DWARFFormValue form_value;
-  if (getAttributeValue(cu, attr, form_value))
-      return form_value.getSigned();
-  return fail_value;
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsReference(U);
+  return Result.hasValue() ? Result.getValue() : FailValue;
 }
 
-uint64_t
-DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
-                                                     const DWARFCompileUnit* cu,
-                                                     const uint16_t attr,
-                                                     uint64_t fail_value)
-                                                     const {
-  DWARFFormValue form_value;
-  if (getAttributeValue(cu, attr, form_value))
-      return form_value.getReference(cu);
-  return fail_value;
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsSectionOffset(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsSectionOffset();
+  return Result.hasValue() ? Result.getValue() : FailValue;
 }
 
-bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFCompileUnit *CU,
+bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFUnit *U,
                                                  uint64_t &LowPC,
                                                  uint64_t &HighPC) const {
-  HighPC = -1ULL;
-  LowPC = getAttributeValueAsUnsigned(CU, DW_AT_low_pc, -1ULL);
-  if (LowPC != -1ULL)
-    HighPC = getAttributeValueAsUnsigned(CU, DW_AT_high_pc, -1ULL);
+  LowPC = getAttributeValueAsAddress(U, DW_AT_low_pc, -1ULL);
+  if (LowPC == -1ULL)
+    return false;
+  HighPC = getAttributeValueAsAddress(U, DW_AT_high_pc, -1ULL);
+  if (HighPC == -1ULL) {
+    // Since DWARF4, DW_AT_high_pc may also be of class constant, in which case
+    // it represents function size.
+    HighPC = getAttributeValueAsUnsignedConstant(U, DW_AT_high_pc, -1ULL);
+    if (HighPC != -1ULL)
+      HighPC += LowPC;
+  }
   return (HighPC != -1ULL);
 }
 
-void
-DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *CU,
-                                               DWARFDebugAranges *DebugAranges)
-                                                   const {
+void DWARFDebugInfoEntryMinimal::buildAddressRangeTable(
+    const DWARFUnit *U, DWARFDebugAranges *DebugAranges,
+    uint32_t UOffsetInAranges) const {
   if (AbbrevDecl) {
     if (isSubprogramDIE()) {
       uint64_t LowPC, HighPC;
-      if (getLowAndHighPC(CU, LowPC, HighPC)) {
-        DebugAranges->appendRange(CU->getOffset(), LowPC, HighPC);
-      }
+      if (getLowAndHighPC(U, LowPC, HighPC))
+        DebugAranges->appendRange(UOffsetInAranges, LowPC, HighPC);
       // FIXME: try to append ranges from .debug_ranges section.
     }
 
-    const DWARFDebugInfoEntryMinimal *child = getFirstChild();
-    while (child) {
-      child->buildAddressRangeTable(CU, DebugAranges);
-      child = child->getSibling();
+    const DWARFDebugInfoEntryMinimal *Child = getFirstChild();
+    while (Child) {
+      Child->buildAddressRangeTable(U, DebugAranges, UOffsetInAranges);
+      Child = Child->getSibling();
     }
   }
 }
 
-bool
-DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
-                                                     const DWARFCompileUnit *CU,
-                                                     const uint64_t Address)
-                                                     const {
+bool DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
+    const DWARFUnit *U, const uint64_t Address) const {
   if (isNULL())
     return false;
   uint64_t LowPC, HighPC;
-  if (getLowAndHighPC(CU, LowPC, HighPC))
+  if (getLowAndHighPC(U, LowPC, HighPC))
     return (LowPC <= Address && Address <= HighPC);
   // Try to get address ranges from .debug_ranges section.
-  uint32_t RangesOffset = getAttributeValueAsReference(CU, DW_AT_ranges, -1U);
+  uint32_t RangesOffset =
+      getAttributeValueAsSectionOffset(U, DW_AT_ranges, -1U);
   if (RangesOffset != -1U) {
     DWARFDebugRangeList RangeList;
-    if (CU->extractRangeList(RangesOffset, RangeList))
-      return RangeList.containsAddress(CU->getBaseAddress(), Address);
+    if (U->extractRangeList(RangesOffset, RangeList))
+      return RangeList.containsAddress(U->getBaseAddress(), Address);
   }
   return false;
 }
 
-const char*
-DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFCompileUnit *CU)
-                                                                         const {
+const char *
+DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFUnit *U) const {
   if (!isSubroutineDIE())
     return 0;
   // Try to get mangled name if possible.
   if (const char *name =
-      getAttributeValueAsString(CU, DW_AT_MIPS_linkage_name, 0))
+      getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, 0))
     return name;
-  if (const char *name = getAttributeValueAsString(CU, DW_AT_linkage_name, 0))
+  if (const char *name = getAttributeValueAsString(U, DW_AT_linkage_name, 0))
     return name;
-  if (const char *name = getAttributeValueAsString(CU, DW_AT_name, 0))
+  if (const char *name = getAttributeValueAsString(U, DW_AT_name, 0))
     return name;
   // Try to get name from specification DIE.
   uint32_t spec_ref =
-      getAttributeValueAsReference(CU, DW_AT_specification, -1U);
+      getAttributeValueAsReference(U, DW_AT_specification, -1U);
   if (spec_ref != -1U) {
     DWARFDebugInfoEntryMinimal spec_die;
-    if (spec_die.extract(CU, &spec_ref)) {
-      if (const char *name = spec_die.getSubroutineName(CU))
+    if (spec_die.extractFast(U, &spec_ref)) {
+      if (const char *name = spec_die.getSubroutineName(U))
         return name;
     }
   }
   // Try to get name from abstract origin DIE.
   uint32_t abs_origin_ref =
-      getAttributeValueAsReference(CU, DW_AT_abstract_origin, -1U);
+      getAttributeValueAsReference(U, DW_AT_abstract_origin, -1U);
   if (abs_origin_ref != -1U) {
     DWARFDebugInfoEntryMinimal abs_origin_die;
-    if (abs_origin_die.extract(CU, &abs_origin_ref)) {
-      if (const char *name = abs_origin_die.getSubroutineName(CU))
+    if (abs_origin_die.extractFast(U, &abs_origin_ref)) {
+      if (const char *name = abs_origin_die.getSubroutineName(U))
         return name;
     }
   }
   return 0;
 }
 
-void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFCompileUnit *CU,
+void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFUnit *U,
                                                 uint32_t &CallFile,
                                                 uint32_t &CallLine,
                                                 uint32_t &CallColumn) const {
-  CallFile = getAttributeValueAsUnsigned(CU, DW_AT_call_file, 0);
-  CallLine = getAttributeValueAsUnsigned(CU, DW_AT_call_line, 0);
-  CallColumn = getAttributeValueAsUnsigned(CU, DW_AT_call_column, 0);
+  CallFile = getAttributeValueAsUnsignedConstant(U, DW_AT_call_file, 0);
+  CallLine = getAttributeValueAsUnsignedConstant(U, DW_AT_call_line, 0);
+  CallColumn = getAttributeValueAsUnsignedConstant(U, DW_AT_call_column, 0);
 }
 
 DWARFDebugInfoEntryInlinedChain
 DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
-    const DWARFCompileUnit *CU, const uint64_t Address) const {
+    const DWARFUnit *U, const uint64_t Address) const {
   DWARFDebugInfoEntryInlinedChain InlinedChain;
-  InlinedChain.CU = CU;
+  InlinedChain.U = U;
   if (isNULL())
     return InlinedChain;
   for (const DWARFDebugInfoEntryMinimal *DIE = this; DIE; ) {
@@ -382,7 +326,7 @@ DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
     // Try to get child which also contains provided address.
     const DWARFDebugInfoEntryMinimal *Child = DIE->getFirstChild();
     while (Child) {
-      if (Child->addressRangeContainsAddress(CU, Address)) {
+      if (Child->addressRangeContainsAddress(U, Address)) {
         // Assume there is only one such child.
         break;
       }
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h
index a69911f..aa61056 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.h
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.h
@@ -18,6 +18,7 @@ namespace llvm {
 
 class DWARFDebugAranges;
 class DWARFCompileUnit;
+class DWARFUnit;
 class DWARFContext;
 class DWARFFormValue;
 struct DWARFDebugInfoEntryInlinedChain;
@@ -39,23 +40,15 @@ public:
   DWARFDebugInfoEntryMinimal()
     : Offset(0), ParentIdx(0), SiblingIdx(0), AbbrevDecl(0) {}
 
-  void dump(raw_ostream &OS, const DWARFCompileUnit *cu,
-            unsigned recurseDepth, unsigned indent = 0) const;
-  void dumpAttribute(raw_ostream &OS, const DWARFCompileUnit *cu,
-                     uint32_t *offset_ptr, uint16_t attr, uint16_t form,
-                     unsigned indent = 0) const;
+  void dump(raw_ostream &OS, const DWARFUnit *u, unsigned recurseDepth,
+            unsigned indent = 0) const;
+  void dumpAttribute(raw_ostream &OS, const DWARFUnit *u, uint32_t *offset_ptr,
+                     uint16_t attr, uint16_t form, unsigned indent = 0) const;
 
-  /// Extracts a debug info entry, which is a child of a given compile unit,
+  /// Extracts a debug info entry, which is a child of a given unit,
   /// starting at a given offset. If DIE can't be extracted, returns false and
   /// doesn't change OffsetPtr.
-  bool extractFast(const DWARFCompileUnit *CU, const uint8_t *FixedFormSizes,
-                   uint32_t *OffsetPtr);
-
-  /// Extract a debug info entry for a given compile unit from the
-  /// .debug_info and .debug_abbrev data starting at the given offset.
-  /// If compile unit can't be parsed, returns false and doesn't change
-  /// OffsetPtr.
-  bool extract(const DWARFCompileUnit *CU, uint32_t *OffsetPtr);
+  bool extractFast(const DWARFUnit *U, uint32_t *OffsetPtr);
 
   uint32_t getTag() const { return AbbrevDecl ? AbbrevDecl->getTag() : 0; }
   bool isNULL() const { return AbbrevDecl == 0; }
@@ -120,54 +113,54 @@ public:
     return AbbrevDecl;
   }
 
-  uint32_t getAttributeValue(const DWARFCompileUnit *cu,
-                             const uint16_t attr, DWARFFormValue &formValue,
-                             uint32_t *end_attr_offset_ptr = 0) const;
+  bool getAttributeValue(const DWARFUnit *U, const uint16_t Attr,
+                         DWARFFormValue &FormValue) const;
+
+  const char *getAttributeValueAsString(const DWARFUnit *U, const uint16_t Attr,
+                                        const char *FailValue) const;
 
-  const char* getAttributeValueAsString(const DWARFCompileUnit* cu,
-                                        const uint16_t attr,
-                                        const char *fail_value) const;
+  uint64_t getAttributeValueAsAddress(const DWARFUnit *U, const uint16_t Attr,
+                                      uint64_t FailValue) const;
 
-  uint64_t getAttributeValueAsUnsigned(const DWARFCompileUnit *cu,
-                                       const uint16_t attr,
-                                       uint64_t fail_value) const;
+  uint64_t getAttributeValueAsUnsignedConstant(const DWARFUnit *U,
+                                               const uint16_t Attr,
+                                               uint64_t FailValue) const;
 
-  uint64_t getAttributeValueAsReference(const DWARFCompileUnit *cu,
-                                        const uint16_t attr,
-                                        uint64_t fail_value) const;
+  uint64_t getAttributeValueAsReference(const DWARFUnit *U, const uint16_t Attr,
+                                        uint64_t FailValue) const;
 
-  int64_t getAttributeValueAsSigned(const DWARFCompileUnit* cu,
-                                    const uint16_t attr,
-                                    int64_t fail_value) const;
+  uint64_t getAttributeValueAsSectionOffset(const DWARFUnit *U,
+                                            const uint16_t Attr,
+                                            uint64_t FailValue) const;
 
   /// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU.
   /// Returns true if both attributes are present.
-  bool getLowAndHighPC(const DWARFCompileUnit *CU,
-                       uint64_t &LowPC, uint64_t &HighPC) const;
+  bool getLowAndHighPC(const DWARFUnit *U, uint64_t &LowPC,
+                       uint64_t &HighPC) const;
 
-  void buildAddressRangeTable(const DWARFCompileUnit *CU,
-                              DWARFDebugAranges *DebugAranges) const;
+  void buildAddressRangeTable(const DWARFUnit *U,
+                              DWARFDebugAranges *DebugAranges,
+                              uint32_t CUOffsetInAranges) const;
 
-  bool addressRangeContainsAddress(const DWARFCompileUnit *CU,
+  bool addressRangeContainsAddress(const DWARFUnit *U,
                                    const uint64_t Address) const;
 
   /// If a DIE represents a subprogram (or inlined subroutine),
   /// returns its mangled name (or short name, if mangled is missing).
   /// This name may be fetched from specification or abstract origin
   /// for this subprogram. Returns null if no name is found.
-  const char* getSubroutineName(const DWARFCompileUnit *CU) const;
+  const char *getSubroutineName(const DWARFUnit *U) const;
 
   /// Retrieves values of DW_AT_call_file, DW_AT_call_line and
   /// DW_AT_call_column from DIE (or zeroes if they are missing).
-  void getCallerFrame(const DWARFCompileUnit *CU, uint32_t &CallFile,
+  void getCallerFrame(const DWARFUnit *U, uint32_t &CallFile,
                       uint32_t &CallLine, uint32_t &CallColumn) const;
 
   /// Get inlined chain for a given address, rooted at the current DIE.
   /// Returns empty chain if address is not contained in address range
   /// of current DIE.
   DWARFDebugInfoEntryInlinedChain
-  getInlinedChainForAddress(const DWARFCompileUnit *CU,
-                            const uint64_t Address) const;
+  getInlinedChainForAddress(const DWARFUnit *U, const uint64_t Address) const;
 };
 
 /// DWARFDebugInfoEntryInlinedChain - represents a chain of inlined_subroutine
@@ -176,9 +169,9 @@ public:
 /// (except the last DIE) in this chain is contained in address
 /// range for next DIE in the chain.
 struct DWARFDebugInfoEntryInlinedChain {
-  DWARFDebugInfoEntryInlinedChain() : CU(0) {}
+  DWARFDebugInfoEntryInlinedChain() : U(0) {}
   SmallVector<DWARFDebugInfoEntryMinimal, 4> DIEs;
-  const DWARFCompileUnit *CU;
+  const DWARFUnit *U;
 };
 
 }
diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp
index 192381c..13d09dd 100644
--- a/lib/DebugInfo/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARFDebugLine.cpp
@@ -211,7 +211,7 @@ DWARFDebugLine::parsePrologue(DataExtractor debug_line_data,
 
   if (*offset_ptr != end_prologue_offset) {
     fprintf(stderr, "warning: parsing line table prologue at 0x%8.8x should"
-                    " have ended at 0x%8.8x but it ended ad 0x%8.8x\n",
+                    " have ended at 0x%8.8x but it ended at 0x%8.8x\n",
             prologue_offset, end_prologue_offset, *offset_ptr);
     return false;
   }
diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp
index 1a1cf24..da71fb3 100644
--- a/lib/DebugInfo/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARFFormValue.cpp
@@ -10,6 +10,8 @@
 #include "llvm/DebugInfo/DWARFFormValue.h"
 #include "DWARFCompileUnit.h"
 #include "DWARFContext.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
@@ -19,64 +21,114 @@ using namespace llvm;
 using namespace dwarf;
 
 namespace {
-template <uint8_t AddrSize, uint8_t RefAddrSize> struct FixedFormSizes {
-  static const uint8_t sizes[];
-};
+uint8_t getRefAddrSize(uint8_t AddrSize, uint16_t Version) {
+  // FIXME: Support DWARF64.
+  return (Version == 2) ? AddrSize : 4;
 }
 
 template <uint8_t AddrSize, uint8_t RefAddrSize>
-const uint8_t FixedFormSizes<AddrSize, RefAddrSize>::sizes[] = {
-  0,           // 0x00 unused
-  AddrSize,    // 0x01 DW_FORM_addr
-  0,           // 0x02 unused
-  0,           // 0x03 DW_FORM_block2
-  0,           // 0x04 DW_FORM_block4
-  2,           // 0x05 DW_FORM_data2
-  4,           // 0x06 DW_FORM_data4
-  8,           // 0x07 DW_FORM_data8
-  0,           // 0x08 DW_FORM_string
-  0,           // 0x09 DW_FORM_block
-  0,           // 0x0a DW_FORM_block1
-  1,           // 0x0b DW_FORM_data1
-  1,           // 0x0c DW_FORM_flag
-  0,           // 0x0d DW_FORM_sdata
-  4,           // 0x0e DW_FORM_strp
-  0,           // 0x0f DW_FORM_udata
-  RefAddrSize, // 0x10 DW_FORM_ref_addr
-  1,           // 0x11 DW_FORM_ref1
-  2,           // 0x12 DW_FORM_ref2
-  4,           // 0x13 DW_FORM_ref4
-  8,           // 0x14 DW_FORM_ref8
-  0,           // 0x15 DW_FORM_ref_udata
-  0,           // 0x16 DW_FORM_indirect
-  4,           // 0x17 DW_FORM_sec_offset
-  0,           // 0x18 DW_FORM_exprloc
-  0,           // 0x19 DW_FORM_flag_present
-  8,           // 0x20 DW_FORM_ref_sig8
-};
-
-static uint8_t getRefAddrSize(uint8_t AddrSize, uint16_t Version) {
-  // FIXME: Support DWARF64.
-  return (Version == 2) ? AddrSize : 4;
+ArrayRef<uint8_t> makeFixedFormSizesArrayRef() {
+  static const uint8_t sizes[] = {
+    0,           // 0x00 unused
+    AddrSize,    // 0x01 DW_FORM_addr
+    0,           // 0x02 unused
+    0,           // 0x03 DW_FORM_block2
+    0,           // 0x04 DW_FORM_block4
+    2,           // 0x05 DW_FORM_data2
+    4,           // 0x06 DW_FORM_data4
+    8,           // 0x07 DW_FORM_data8
+    0,           // 0x08 DW_FORM_string
+    0,           // 0x09 DW_FORM_block
+    0,           // 0x0a DW_FORM_block1
+    1,           // 0x0b DW_FORM_data1
+    1,           // 0x0c DW_FORM_flag
+    0,           // 0x0d DW_FORM_sdata
+    4,           // 0x0e DW_FORM_strp
+    0,           // 0x0f DW_FORM_udata
+    RefAddrSize, // 0x10 DW_FORM_ref_addr
+    1,           // 0x11 DW_FORM_ref1
+    2,           // 0x12 DW_FORM_ref2
+    4,           // 0x13 DW_FORM_ref4
+    8,           // 0x14 DW_FORM_ref8
+    0,           // 0x15 DW_FORM_ref_udata
+    0,           // 0x16 DW_FORM_indirect
+    4,           // 0x17 DW_FORM_sec_offset
+    0,           // 0x18 DW_FORM_exprloc
+    0,           // 0x19 DW_FORM_flag_present
+  };
+  return makeArrayRef(sizes);
+}
 }
 
-const uint8_t *
-DWARFFormValue::getFixedFormSizes(uint8_t AddrSize, uint16_t Version) {
+ArrayRef<uint8_t> DWARFFormValue::getFixedFormSizes(uint8_t AddrSize,
+                                                    uint16_t Version) {
   uint8_t RefAddrSize = getRefAddrSize(AddrSize, Version);
   if (AddrSize == 4 && RefAddrSize == 4)
-    return FixedFormSizes<4, 4>::sizes;
+    return makeFixedFormSizesArrayRef<4, 4>();
   if (AddrSize == 4 && RefAddrSize == 8)
-    return FixedFormSizes<4, 8>::sizes;
+    return makeFixedFormSizesArrayRef<4, 8>();
   if (AddrSize == 8 && RefAddrSize == 4)
-    return FixedFormSizes<8, 4>::sizes;
+    return makeFixedFormSizesArrayRef<8, 4>();
   if (AddrSize == 8 && RefAddrSize == 8)
-    return FixedFormSizes<8, 8>::sizes;
-  return 0;
+    return makeFixedFormSizesArrayRef<8, 8>();
+  return None;
 }
 
-bool
-DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
-                             const DWARFCompileUnit *cu) {
+static const DWARFFormValue::FormClass DWARF4FormClasses[] = {
+  DWARFFormValue::FC_Unknown,       // 0x0
+  DWARFFormValue::FC_Address,       // 0x01 DW_FORM_addr
+  DWARFFormValue::FC_Unknown,       // 0x02 unused
+  DWARFFormValue::FC_Block,         // 0x03 DW_FORM_block2
+  DWARFFormValue::FC_Block,         // 0x04 DW_FORM_block4
+  DWARFFormValue::FC_Constant,      // 0x05 DW_FORM_data2
+  // --- These can be FC_SectionOffset in DWARF3 and below:
+  DWARFFormValue::FC_Constant,      // 0x06 DW_FORM_data4
+  DWARFFormValue::FC_Constant,      // 0x07 DW_FORM_data8
+  // ---
+  DWARFFormValue::FC_String,        // 0x08 DW_FORM_string
+  DWARFFormValue::FC_Block,         // 0x09 DW_FORM_block
+  DWARFFormValue::FC_Block,         // 0x0a DW_FORM_block1
+  DWARFFormValue::FC_Constant,      // 0x0b DW_FORM_data1
+  DWARFFormValue::FC_Flag,          // 0x0c DW_FORM_flag
+  DWARFFormValue::FC_Constant,      // 0x0d DW_FORM_sdata
+  DWARFFormValue::FC_String,        // 0x0e DW_FORM_strp
+  DWARFFormValue::FC_Constant,      // 0x0f DW_FORM_udata
+  DWARFFormValue::FC_Reference,     // 0x10 DW_FORM_ref_addr
+  DWARFFormValue::FC_Reference,     // 0x11 DW_FORM_ref1
+  DWARFFormValue::FC_Reference,     // 0x12 DW_FORM_ref2
+  DWARFFormValue::FC_Reference,     // 0x13 DW_FORM_ref4
+  DWARFFormValue::FC_Reference,     // 0x14 DW_FORM_ref8
+  DWARFFormValue::FC_Reference,     // 0x15 DW_FORM_ref_udata
+  DWARFFormValue::FC_Indirect,      // 0x16 DW_FORM_indirect
+  DWARFFormValue::FC_SectionOffset, // 0x17 DW_FORM_sec_offset
+  DWARFFormValue::FC_Exprloc,       // 0x18 DW_FORM_exprloc
+  DWARFFormValue::FC_Flag,          // 0x19 DW_FORM_flag_present
+};
+
+bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
+  // First, check DWARF4 form classes.
+  if (Form < ArrayRef<FormClass>(DWARF4FormClasses).size() &&
+      DWARF4FormClasses[Form] == FC)
+    return true;
+  // Check DW_FORM_ref_sig8 from DWARF4.
+  if (Form == DW_FORM_ref_sig8)
+    return (FC == FC_Reference);
+  // Check for some DWARF5 forms.
+  if (Form == DW_FORM_GNU_addr_index)
+    return (FC == FC_Address);
+  if (Form == DW_FORM_GNU_str_index)
+    return (FC == FC_String);
+  // In DWARF3 DW_FORM_data4 and DW_FORM_data8 served also as a section offset.
+  // Don't check for DWARF version here, as some producers may still do this
+  // by mistake.
+  if ((Form == DW_FORM_data4 || Form == DW_FORM_data8) &&
+      FC == FC_SectionOffset)
+    return true;
+  return false;
+}
+
+bool DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
+                                  const DWARFUnit *cu) {
   bool indirect = false;
   bool is_block = false;
   Value.data = NULL;
@@ -156,10 +208,6 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
       break;
     case DW_FORM_string:
       Value.cstr = data.getCStr(offset_ptr);
-      // Set the string value to also be the data for inlined cstr form
-      // values only so we can tell the differnence between DW_FORM_string
-      // and DW_FORM_strp form values
-      Value.data = (const uint8_t*)Value.cstr;
       break;
     case DW_FORM_indirect:
       Form = data.getULEB128(offset_ptr);
@@ -183,8 +231,6 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
       Value.uval = data.getU64(offset_ptr);
       break;
     case DW_FORM_GNU_addr_index:
-      Value.uval = data.getULEB128(offset_ptr);
-      break;
     case DW_FORM_GNU_str_index:
       Value.uval = data.getULEB128(offset_ptr);
       break;
@@ -207,13 +253,13 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
 
 bool
 DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr,
-                          const DWARFCompileUnit *cu) const {
+                          const DWARFUnit *cu) const {
   return DWARFFormValue::skipValue(Form, debug_info_data, offset_ptr, cu);
 }
 
 bool
 DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
-                          uint32_t *offset_ptr, const DWARFCompileUnit *cu) {
+                          uint32_t *offset_ptr, const DWARFUnit *cu) {
   bool indirect = false;
   do {
     switch (form) {
@@ -313,21 +359,20 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
 }
 
 void
-DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
+DWARFFormValue::dump(raw_ostream &OS, const DWARFUnit *cu) const {
   DataExtractor debug_str_data(cu->getStringSection(), true, 0);
   DataExtractor debug_str_offset_data(cu->getStringOffsetSection(), true, 0);
-  uint64_t uvalue = getUnsigned();
+  uint64_t uvalue = Value.uval;
   bool cu_relative_offset = false;
 
   switch (Form) {
   case DW_FORM_addr:      OS << format("0x%016" PRIx64, uvalue); break;
   case DW_FORM_GNU_addr_index: {
-    StringRef AddrOffsetSec = cu->getAddrOffsetSection();
     OS << format(" indexed (%8.8x) address = ", (uint32_t)uvalue);
-    if (AddrOffsetSec.size() != 0) {
-      DataExtractor DA(AddrOffsetSec, true, cu->getAddressByteSize());
-      OS << format("0x%016" PRIx64, getIndirectAddress(&DA, cu));
-    } else
+    uint64_t Address;
+    if (cu->getAddrOffsetSectionItem(uvalue, Address))
+      OS << format("0x%016" PRIx64, Address);
+    else
       OS << "<no .debug_addr section>";
     break;
   }
@@ -340,7 +385,7 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
   case DW_FORM_data8:     OS << format("0x%016" PRIx64, uvalue); break;
   case DW_FORM_string:
     OS << '"';
-    OS.write_escaped(getAsCString(NULL));
+    OS.write_escaped(Value.cstr);
     OS << '"';
     break;
   case DW_FORM_exprloc:
@@ -372,25 +417,24 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
     }
     break;
 
-  case DW_FORM_sdata:     OS << getSigned();   break;
-  case DW_FORM_udata:     OS << getUnsigned(); break;
+  case DW_FORM_sdata:     OS << Value.sval; break;
+  case DW_FORM_udata:     OS << Value.uval; break;
   case DW_FORM_strp: {
     OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)uvalue);
-    const char* dbg_str = getAsCString(&debug_str_data);
-    if (dbg_str) {
+    Optional<const char *> DbgStr = getAsCString(cu);
+    if (DbgStr.hasValue()) {
       OS << '"';
-      OS.write_escaped(dbg_str);
+      OS.write_escaped(DbgStr.getValue());
       OS << '"';
     }
     break;
   }
   case DW_FORM_GNU_str_index: {
     OS << format(" indexed (%8.8x) string = ", (uint32_t)uvalue);
-    const char *dbg_str = getIndirectCString(&debug_str_data,
-                                             &debug_str_offset_data);
-    if (dbg_str) {
+    Optional<const char *> DbgStr = getAsCString(cu);
+    if (DbgStr.hasValue()) {
       OS << '"';
-      OS.write_escaped(dbg_str);
+      OS.write_escaped(DbgStr.getValue());
       OS << '"';
     }
     break;
@@ -439,97 +483,67 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
     OS << format(" => {0x%8.8" PRIx64 "}", uvalue + (cu ? cu->getOffset() : 0));
 }
 
-const char*
-DWARFFormValue::getAsCString(const DataExtractor *debug_str_data_ptr) const {
-  if (isInlinedCStr()) {
+Optional<const char *> DWARFFormValue::getAsCString(const DWARFUnit *U) const {
+  if (!isFormClass(FC_String))
+    return None;
+  if (Form == DW_FORM_string)
     return Value.cstr;
-  } else if (debug_str_data_ptr) {
-    uint32_t offset = Value.uval;
-    return debug_str_data_ptr->getCStr(&offset);
+  if (U == 0)
+    return None;
+  uint32_t Offset = Value.uval;
+  if (Form == DW_FORM_GNU_str_index) {
+    uint32_t StrOffset;
+    if (!U->getStringOffsetSectionItem(Offset, StrOffset))
+      return None;
+    Offset = StrOffset;
   }
-  return NULL;
-}
-
-const char*
-DWARFFormValue::getIndirectCString(const DataExtractor *DS,
-                                   const DataExtractor *DSO) const {
-  if (!DS || !DSO) return NULL;
-
-  uint32_t offset = Value.uval * 4;
-  uint32_t soffset = DSO->getU32(&offset);
-  return DS->getCStr(&soffset);
-}
-
-uint64_t
-DWARFFormValue::getIndirectAddress(const DataExtractor *DA,
-                                   const DWARFCompileUnit *cu) const {
-  if (!DA) return 0;
-
-  uint32_t offset = Value.uval * cu->getAddressByteSize();
-  return DA->getAddress(&offset);
+  if (const char *Str = U->getStringExtractor().getCStr(&Offset)) {
+    return Str;
+  }
+  return None;
 }
 
-uint64_t DWARFFormValue::getReference(const DWARFCompileUnit *cu) const {
-  uint64_t die_offset = Value.uval;
-  switch (Form) {
-  case DW_FORM_ref1:
-  case DW_FORM_ref2:
-  case DW_FORM_ref4:
-  case DW_FORM_ref8:
-  case DW_FORM_ref_udata:
-      die_offset += (cu ? cu->getOffset() : 0);
-      break;
-  default:
-      break;
+Optional<uint64_t> DWARFFormValue::getAsAddress(const DWARFUnit *U) const {
+  if (!isFormClass(FC_Address))
+    return None;
+  if (Form == DW_FORM_GNU_addr_index) {
+    uint32_t Index = Value.uval;
+    uint64_t Result;
+    if (U == 0 || !U->getAddrOffsetSectionItem(Index, Result))
+      return None;
+    return Result;
   }
-
-  return die_offset;
+  return Value.uval;
 }
 
-bool
-DWARFFormValue::resolveCompileUnitReferences(const DWARFCompileUnit *cu) {
+Optional<uint64_t> DWARFFormValue::getAsReference(const DWARFUnit *U) const {
+  if (!isFormClass(FC_Reference))
+    return None;
   switch (Form) {
   case DW_FORM_ref1:
   case DW_FORM_ref2:
   case DW_FORM_ref4:
   case DW_FORM_ref8:
   case DW_FORM_ref_udata:
-    Value.uval += cu->getOffset();
-    Form = DW_FORM_ref_addr;
-    return true;
+    if (U == 0)
+      return None;
+    return Value.uval + U->getOffset();
+  case DW_FORM_ref_addr:
+    return Value.uval;
+  // FIXME: Add proper support for DW_FORM_ref_sig8
   default:
-    break;
+    return Value.uval;
   }
-  return false;
 }
 
-const uint8_t *DWARFFormValue::BlockData() const {
-  if (!isInlinedCStr())
-    return Value.data;
-  return NULL;
+Optional<uint64_t> DWARFFormValue::getAsSectionOffset() const {
+  if (!isFormClass(FC_SectionOffset))
+    return None;
+  return Value.uval;
 }
 
-bool DWARFFormValue::isBlockForm(uint16_t form) {
-  switch (form) {
-  case DW_FORM_exprloc:
-  case DW_FORM_block:
-  case DW_FORM_block1:
-  case DW_FORM_block2:
-  case DW_FORM_block4:
-    return true;
-  }
-  return false;
-}
-
-bool DWARFFormValue::isDataForm(uint16_t form) {
-  switch (form) {
-  case DW_FORM_sdata:
-  case DW_FORM_udata:
-  case DW_FORM_data1:
-  case DW_FORM_data2:
-  case DW_FORM_data4:
-  case DW_FORM_data8:
-    return true;
-  }
-  return false;
+Optional<uint64_t> DWARFFormValue::getAsUnsignedConstant() const {
+  if (!isFormClass(FC_Constant) || Form == DW_FORM_sdata)
+    return None;
+  return Value.uval;
 }
diff --git a/lib/DebugInfo/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARFTypeUnit.cpp
new file mode 100644
index 0000000..303bf70
--- /dev/null
+++ b/lib/DebugInfo/DWARFTypeUnit.cpp
@@ -0,0 +1,39 @@
+//===-- DWARFTypeUnit.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFTypeUnit.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
+                                uint32_t *offset_ptr) {
+  if (!DWARFUnit::extractImpl(debug_info, offset_ptr))
+    return false;
+  TypeHash = debug_info.getU64(offset_ptr);
+  TypeOffset = debug_info.getU32(offset_ptr);
+  return TypeOffset < getLength();
+}
+
+void DWARFTypeUnit::dump(raw_ostream &OS) {
+  OS << format("0x%08x", getOffset()) << ": Type Unit:"
+     << " length = " << format("0x%08x", getLength())
+     << " version = " << format("0x%04x", getVersion())
+     << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+     << " addr_size = " << format("0x%02x", getAddressByteSize())
+     << " type_signature = " << format("0x%16" PRIx64, TypeHash)
+     << " type_offset = " << format("0x%04x", TypeOffset)
+     << " (next unit at " << format("0x%08x", getNextUnitOffset())
+     << ")\n";
+
+  const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
+  assert(CU && "Null Compile Unit?");
+  CU->dump(OS, this, -1U);
+}
diff --git a/lib/DebugInfo/DWARFTypeUnit.h b/lib/DebugInfo/DWARFTypeUnit.h
new file mode 100644
index 0000000..7a0dab2
--- /dev/null
+++ b/lib/DebugInfo/DWARFTypeUnit.h
@@ -0,0 +1,35 @@
+//===-- DWARFTypeUnit.h -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFTYPEUNIT_H
+#define LLVM_DEBUGINFO_DWARFTYPEUNIT_H
+
+#include "DWARFUnit.h"
+
+namespace llvm {
+
+class DWARFTypeUnit : public DWARFUnit {
+private:
+  uint64_t TypeHash;
+  uint32_t TypeOffset;
+public:
+  DWARFTypeUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
+                StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+                const RelocAddrMap *M, bool LE)
+      : DWARFUnit(DA, IS, AS, RS, SS, SOS, AOS, M, LE) {}
+  uint32_t getSize() const LLVM_OVERRIDE { return DWARFUnit::getSize() + 12; }
+  void dump(raw_ostream &OS);
+protected:
+  bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) LLVM_OVERRIDE;
+};
+
+}
+
+#endif
+
diff --git a/lib/DebugInfo/DWARFUnit.cpp b/lib/DebugInfo/DWARFUnit.cpp
new file mode 100644
index 0000000..5167eb9
--- /dev/null
+++ b/lib/DebugInfo/DWARFUnit.cpp
@@ -0,0 +1,365 @@
+//===-- DWARFUnit.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFUnit.h"
+#include "DWARFContext.h"
+#include "llvm/DebugInfo/DWARFFormValue.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Path.h"
+#include <cstdio>
+
+using namespace llvm;
+using namespace dwarf;
+
+DWARFUnit::DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
+                     StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+                     const RelocAddrMap *M, bool LE)
+    : Abbrev(DA), InfoSection(IS), AbbrevSection(AS), RangeSection(RS),
+      StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS),
+      RelocMap(M), isLittleEndian(LE) {
+  clear();
+}
+
+DWARFUnit::~DWARFUnit() {
+}
+
+bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,
+                                                uint64_t &Result) const {
+  uint32_t Offset = AddrOffsetSectionBase + Index * AddrSize;
+  if (AddrOffsetSection.size() < Offset + AddrSize)
+    return false;
+  DataExtractor DA(AddrOffsetSection, isLittleEndian, AddrSize);
+  Result = DA.getAddress(&Offset);
+  return true;
+}
+
+bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
+                                                  uint32_t &Result) const {
+  // FIXME: string offset section entries are 8-byte for DWARF64.
+  const uint32_t ItemSize = 4;
+  uint32_t Offset = Index * ItemSize;
+  if (StringOffsetSection.size() < Offset + ItemSize)
+    return false;
+  DataExtractor DA(StringOffsetSection, isLittleEndian, 0);
+  Result = DA.getU32(&Offset);
+  return true;
+}
+
+bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
+  Length = debug_info.getU32(offset_ptr);
+  Version = debug_info.getU16(offset_ptr);
+  uint64_t abbrOffset = debug_info.getU32(offset_ptr);
+  AddrSize = debug_info.getU8(offset_ptr);
+
+  bool lengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
+  bool versionOK = DWARFContext::isSupportedVersion(Version);
+  bool abbrOffsetOK = AbbrevSection.size() > abbrOffset;
+  bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
+
+  if (!lengthOK || !versionOK || !addrSizeOK || !abbrOffsetOK)
+    return false;
+
+  Abbrevs = Abbrev->getAbbreviationDeclarationSet(abbrOffset);
+  return true;
+}
+
+bool DWARFUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
+  clear();
+
+  Offset = *offset_ptr;
+
+  if (debug_info.isValidOffset(*offset_ptr)) {
+    if (extractImpl(debug_info, offset_ptr))
+      return true;
+
+    // reset the offset to where we tried to parse from if anything went wrong
+    *offset_ptr = Offset;
+  }
+
+  return false;
+}
+
+bool DWARFUnit::extractRangeList(uint32_t RangeListOffset,
+                                        DWARFDebugRangeList &RangeList) const {
+  // Require that compile unit is extracted.
+  assert(DieArray.size() > 0);
+  DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize);
+  uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset;
+  return RangeList.extract(RangesData, &ActualRangeListOffset);
+}
+
+void DWARFUnit::clear() {
+  Offset = 0;
+  Length = 0;
+  Version = 0;
+  Abbrevs = 0;
+  AddrSize = 0;
+  BaseAddr = 0;
+  RangeSectionBase = 0;
+  AddrOffsetSectionBase = 0;
+  clearDIEs(false);
+  DWO.reset();
+}
+
+const char *DWARFUnit::getCompilationDir() {
+  extractDIEsIfNeeded(true);
+  if (DieArray.empty())
+    return 0;
+  return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
+}
+
+uint64_t DWARFUnit::getDWOId() {
+  extractDIEsIfNeeded(true);
+  const uint64_t FailValue = -1ULL;
+  if (DieArray.empty())
+    return FailValue;
+  return DieArray[0]
+      .getAttributeValueAsUnsignedConstant(this, DW_AT_GNU_dwo_id, FailValue);
+}
+
+void DWARFUnit::setDIERelations() {
+  if (DieArray.empty())
+    return;
+  DWARFDebugInfoEntryMinimal *die_array_begin = &DieArray.front();
+  DWARFDebugInfoEntryMinimal *die_array_end = &DieArray.back();
+  DWARFDebugInfoEntryMinimal *curr_die;
+  // We purposely are skipping the last element in the array in the loop below
+  // so that we can always have a valid next item
+  for (curr_die = die_array_begin; curr_die < die_array_end; ++curr_die) {
+    // Since our loop doesn't include the last element, we can always
+    // safely access the next die in the array.
+    DWARFDebugInfoEntryMinimal *next_die = curr_die + 1;
+
+    const DWARFAbbreviationDeclaration *curr_die_abbrev =
+      curr_die->getAbbreviationDeclarationPtr();
+
+    if (curr_die_abbrev) {
+      // Normal DIE
+      if (curr_die_abbrev->hasChildren())
+        next_die->setParent(curr_die);
+      else
+        curr_die->setSibling(next_die);
+    } else {
+      // NULL DIE that terminates a sibling chain
+      DWARFDebugInfoEntryMinimal *parent = curr_die->getParent();
+      if (parent)
+        parent->setSibling(next_die);
+    }
+  }
+
+  // Since we skipped the last element, we need to fix it up!
+  if (die_array_begin < die_array_end)
+    curr_die->setParent(die_array_begin);
+}
+
+void DWARFUnit::extractDIEsToVector(
+    bool AppendCUDie, bool AppendNonCUDies,
+    std::vector<DWARFDebugInfoEntryMinimal> &Dies) const {
+  if (!AppendCUDie && !AppendNonCUDies)
+    return;
+
+  // Set the offset to that of the first DIE and calculate the start of the
+  // next compilation unit header.
+  uint32_t Offset = getFirstDIEOffset();
+  uint32_t NextCUOffset = getNextUnitOffset();
+  DWARFDebugInfoEntryMinimal DIE;
+  uint32_t Depth = 0;
+  bool IsCUDie = true;
+
+  while (Offset < NextCUOffset && DIE.extractFast(this, &Offset)) {
+    if (IsCUDie) {
+      if (AppendCUDie)
+        Dies.push_back(DIE);
+      if (!AppendNonCUDies)
+        break;
+      // The average bytes per DIE entry has been seen to be
+      // around 14-20 so let's pre-reserve the needed memory for
+      // our DIE entries accordingly.
+      Dies.reserve(Dies.size() + getDebugInfoSize() / 14);
+      IsCUDie = false;
+    } else {
+      Dies.push_back(DIE);
+    }
+
+    const DWARFAbbreviationDeclaration *AbbrDecl =
+      DIE.getAbbreviationDeclarationPtr();
+    if (AbbrDecl) {
+      // Normal DIE
+      if (AbbrDecl->hasChildren())
+        ++Depth;
+    } else {
+      // NULL DIE.
+      if (Depth > 0)
+        --Depth;
+      if (Depth == 0)
+        break;  // We are done with this compile unit!
+    }
+  }
+
+  // Give a little bit of info if we encounter corrupt DWARF (our offset
+  // should always terminate at or before the start of the next compilation
+  // unit header).
+  if (Offset > NextCUOffset)
+    fprintf(stderr, "warning: DWARF compile unit extends beyond its "
+                    "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), Offset);
+}
+
+size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
+  if ((CUDieOnly && DieArray.size() > 0) ||
+      DieArray.size() > 1)
+    return 0; // Already parsed.
+
+  bool HasCUDie = DieArray.size() > 0;
+  extractDIEsToVector(!HasCUDie, !CUDieOnly, DieArray);
+
+  if (DieArray.empty())
+    return 0;
+
+  // If CU DIE was just parsed, copy several attribute values from it.
+  if (!HasCUDie) {
+    uint64_t BaseAddr =
+        DieArray[0].getAttributeValueAsAddress(this, DW_AT_low_pc, -1ULL);
+    if (BaseAddr == -1ULL)
+      BaseAddr = DieArray[0].getAttributeValueAsAddress(this, DW_AT_entry_pc, 0);
+    setBaseAddress(BaseAddr);
+    AddrOffsetSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
+        this, DW_AT_GNU_addr_base, 0);
+    RangeSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
+        this, DW_AT_GNU_ranges_base, 0);
+  }
+
+  setDIERelations();
+  return DieArray.size();
+}
+
+DWARFUnit::DWOHolder::DWOHolder(object::ObjectFile *DWOFile)
+    : DWOFile(DWOFile),
+      DWOContext(cast<DWARFContext>(DIContext::getDWARFContext(DWOFile))),
+      DWOU(0) {
+  if (DWOContext->getNumDWOCompileUnits() > 0)
+    DWOU = DWOContext->getDWOCompileUnitAtIndex(0);
+}
+
+bool DWARFUnit::parseDWO() {
+  if (DWO.get() != 0)
+    return false;
+  extractDIEsIfNeeded(true);
+  if (DieArray.empty())
+    return false;
+  const char *DWOFileName =
+      DieArray[0].getAttributeValueAsString(this, DW_AT_GNU_dwo_name, 0);
+  if (DWOFileName == 0)
+    return false;
+  const char *CompilationDir =
+      DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
+  SmallString<16> AbsolutePath;
+  if (sys::path::is_relative(DWOFileName) && CompilationDir != 0) {
+    sys::path::append(AbsolutePath, CompilationDir);
+  }
+  sys::path::append(AbsolutePath, DWOFileName);
+  object::ObjectFile *DWOFile =
+      object::ObjectFile::createObjectFile(AbsolutePath);
+  if (!DWOFile)
+    return false;
+  // Reset DWOHolder.
+  DWO.reset(new DWOHolder(DWOFile));
+  DWARFUnit *DWOCU = DWO->getUnit();
+  // Verify that compile unit in .dwo file is valid.
+  if (DWOCU == 0 || DWOCU->getDWOId() != getDWOId()) {
+    DWO.reset();
+    return false;
+  }
+  // Share .debug_addr and .debug_ranges section with compile unit in .dwo
+  DWOCU->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
+  DWOCU->setRangesSection(RangeSection, RangeSectionBase);
+  return true;
+}
+
+void DWARFUnit::clearDIEs(bool KeepCUDie) {
+  if (DieArray.size() > (unsigned)KeepCUDie) {
+    // std::vectors never get any smaller when resized to a smaller size,
+    // or when clear() or erase() are called, the size will report that it
+    // is smaller, but the memory allocated remains intact (call capacity()
+    // to see this). So we need to create a temporary vector and swap the
+    // contents which will cause just the internal pointers to be swapped
+    // so that when temporary vector goes out of scope, it will destroy the
+    // contents.
+    std::vector<DWARFDebugInfoEntryMinimal> TmpArray;
+    DieArray.swap(TmpArray);
+    // Save at least the compile unit DIE
+    if (KeepCUDie)
+      DieArray.push_back(TmpArray.front());
+  }
+}
+
+void
+DWARFUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
+                                         bool clear_dies_if_already_not_parsed,
+                                         uint32_t CUOffsetInAranges) {
+  // This function is usually called if there in no .debug_aranges section
+  // in order to produce a compile unit level set of address ranges that
+  // is accurate. If the DIEs weren't parsed, then we don't want all dies for
+  // all compile units to stay loaded when they weren't needed. So we can end
+  // up parsing the DWARF and then throwing them all away to keep memory usage
+  // down.
+  const bool clear_dies = extractDIEsIfNeeded(false) > 1 &&
+                          clear_dies_if_already_not_parsed;
+  DieArray[0].buildAddressRangeTable(this, debug_aranges, CUOffsetInAranges);
+  bool DWOCreated = parseDWO();
+  if (DWO.get()) {
+    // If there is a .dwo file for this compile unit, then skeleton CU DIE
+    // doesn't have children, and we should instead build address range table
+    // from DIEs in the .debug_info.dwo section of .dwo file.
+    DWO->getUnit()->buildAddressRangeTable(
+        debug_aranges, clear_dies_if_already_not_parsed, CUOffsetInAranges);
+  }
+  if (DWOCreated && clear_dies_if_already_not_parsed)
+    DWO.reset();
+
+  // Keep memory down by clearing DIEs if this generate function
+  // caused them to be parsed.
+  if (clear_dies)
+    clearDIEs(true);
+}
+
+const DWARFDebugInfoEntryMinimal *
+DWARFUnit::getSubprogramForAddress(uint64_t Address) {
+  extractDIEsIfNeeded(false);
+  for (size_t i = 0, n = DieArray.size(); i != n; i++)
+    if (DieArray[i].isSubprogramDIE() &&
+        DieArray[i].addressRangeContainsAddress(this, Address)) {
+      return &DieArray[i];
+    }
+  return 0;
+}
+
+DWARFDebugInfoEntryInlinedChain
+DWARFUnit::getInlinedChainForAddress(uint64_t Address) {
+  // First, find a subprogram that contains the given address (the root
+  // of inlined chain).
+  const DWARFUnit *ChainCU = 0;
+  const DWARFDebugInfoEntryMinimal *SubprogramDIE =
+      getSubprogramForAddress(Address);
+  if (SubprogramDIE) {
+    ChainCU = this;
+  } else {
+    // Try to look for subprogram DIEs in the DWO file.
+    parseDWO();
+    if (DWO.get()) {
+      SubprogramDIE = DWO->getUnit()->getSubprogramForAddress(Address);
+      if (SubprogramDIE)
+        ChainCU = DWO->getUnit();
+    }
+  }
+
+  // Get inlined chain rooted at this subprogram DIE.
+  if (!SubprogramDIE)
+    return DWARFDebugInfoEntryInlinedChain();
+  return SubprogramDIE->getInlinedChainForAddress(ChainCU, Address);
+}
diff --git a/lib/DebugInfo/DWARFUnit.h b/lib/DebugInfo/DWARFUnit.h
new file mode 100644
index 0000000..bd768a6
--- /dev/null
+++ b/lib/DebugInfo/DWARFUnit.h
@@ -0,0 +1,168 @@
+//===-- DWARFUnit.h ---------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFUNIT_H
+#define LLVM_DEBUGINFO_DWARFUNIT_H
+
+#include "llvm/ADT/OwningPtr.h"
+#include "DWARFDebugAbbrev.h"
+#include "DWARFDebugInfoEntry.h"
+#include "DWARFDebugRangeList.h"
+#include "DWARFRelocMap.h"
+#include <vector>
+
+namespace llvm {
+
+namespace object {
+class ObjectFile;
+}
+
+class DWARFDebugAbbrev;
+class StringRef;
+class raw_ostream;
+
+class DWARFUnit {
+  const DWARFDebugAbbrev *Abbrev;
+  StringRef InfoSection;
+  StringRef AbbrevSection;
+  StringRef RangeSection;
+  uint32_t RangeSectionBase;
+  StringRef StringSection;
+  StringRef StringOffsetSection;
+  StringRef AddrOffsetSection;
+  uint32_t AddrOffsetSectionBase;
+  const RelocAddrMap *RelocMap;
+  bool isLittleEndian;
+
+  uint32_t Offset;
+  uint32_t Length;
+  uint16_t Version;
+  const DWARFAbbreviationDeclarationSet *Abbrevs;
+  uint8_t AddrSize;
+  uint64_t BaseAddr;
+  // The compile unit debug information entry items.
+  std::vector<DWARFDebugInfoEntryMinimal> DieArray;
+
+  class DWOHolder {
+    OwningPtr<object::ObjectFile> DWOFile;
+    OwningPtr<DWARFContext> DWOContext;
+    DWARFUnit *DWOU;
+  public:
+    DWOHolder(object::ObjectFile *DWOFile);
+    DWARFUnit *getUnit() const { return DWOU; }
+  };
+  OwningPtr<DWOHolder> DWO;
+
+protected:
+  virtual bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr);
+
+public:
+
+  DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
+            StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+            const RelocAddrMap *M, bool LE);
+
+  virtual ~DWARFUnit();
+
+  StringRef getStringSection() const { return StringSection; }
+  StringRef getStringOffsetSection() const { return StringOffsetSection; }
+  void setAddrOffsetSection(StringRef AOS, uint32_t Base) {
+    AddrOffsetSection = AOS;
+    AddrOffsetSectionBase = Base;
+  }
+  void setRangesSection(StringRef RS, uint32_t Base) {
+    RangeSection = RS;
+    RangeSectionBase = Base;
+  }
+
+  bool getAddrOffsetSectionItem(uint32_t Index, uint64_t &Result) const;
+  // FIXME: Result should be uint64_t in DWARF64.
+  bool getStringOffsetSectionItem(uint32_t Index, uint32_t &Result) const;
+
+  DataExtractor getDebugInfoExtractor() const {
+    return DataExtractor(InfoSection, isLittleEndian, AddrSize);
+  }
+  DataExtractor getStringExtractor() const {
+    return DataExtractor(StringSection, false, 0);
+  }
+
+  const RelocAddrMap *getRelocMap() const { return RelocMap; }
+
+  bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
+
+  /// extractRangeList - extracts the range list referenced by this compile
+  /// unit from .debug_ranges section. Returns true on success.
+  /// Requires that compile unit is already extracted.
+  bool extractRangeList(uint32_t RangeListOffset,
+                        DWARFDebugRangeList &RangeList) const;
+  void clear();
+  uint32_t getOffset() const { return Offset; }
+  /// Size in bytes of the compile unit header.
+  virtual uint32_t getSize() const { return 11; }
+  uint32_t getFirstDIEOffset() const { return Offset + getSize(); }
+  uint32_t getNextUnitOffset() const { return Offset + Length + 4; }
+  /// Size in bytes of the .debug_info data associated with this compile unit.
+  size_t getDebugInfoSize() const { return Length + 4 - getSize(); }
+  uint32_t getLength() const { return Length; }
+  uint16_t getVersion() const { return Version; }
+  const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
+    return Abbrevs;
+  }
+  uint8_t getAddressByteSize() const { return AddrSize; }
+  uint64_t getBaseAddress() const { return BaseAddr; }
+
+  void setBaseAddress(uint64_t base_addr) {
+    BaseAddr = base_addr;
+  }
+
+  const DWARFDebugInfoEntryMinimal *
+  getCompileUnitDIE(bool extract_cu_die_only = true) {
+    extractDIEsIfNeeded(extract_cu_die_only);
+    return DieArray.empty() ? NULL : &DieArray[0];
+  }
+
+  const char *getCompilationDir();
+  uint64_t getDWOId();
+
+  void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
+                              bool clear_dies_if_already_not_parsed,
+                              uint32_t CUOffsetInAranges);
+
+  /// getInlinedChainForAddress - fetches inlined chain for a given address.
+  /// Returns empty chain if there is no subprogram containing address. The
+  /// chain is valid as long as parsed compile unit DIEs are not cleared.
+  DWARFDebugInfoEntryInlinedChain getInlinedChainForAddress(uint64_t Address);
+
+private:
+  /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
+  /// hasn't already been done. Returns the number of DIEs parsed at this call.
+  size_t extractDIEsIfNeeded(bool CUDieOnly);
+  /// extractDIEsToVector - Appends all parsed DIEs to a vector.
+  void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs,
+                           std::vector<DWARFDebugInfoEntryMinimal> &DIEs) const;
+  /// setDIERelations - We read in all of the DIE entries into our flat list
+  /// of DIE entries and now we need to go back through all of them and set the
+  /// parent, sibling and child pointers for quick DIE navigation.
+  void setDIERelations();
+  /// clearDIEs - Clear parsed DIEs to keep memory usage low.
+  void clearDIEs(bool KeepCUDie);
+
+  /// parseDWO - Parses .dwo file for current compile unit. Returns true if
+  /// it was actually constructed.
+  bool parseDWO();
+
+  /// getSubprogramForAddress - Returns subprogram DIE with address range
+  /// encompassing the provided address. The pointer is alive as long as parsed
+  /// compile unit DIEs are not cleared.
+  const DWARFDebugInfoEntryMinimal *getSubprogramForAddress(uint64_t Address);
+};
+
+}
+
+#endif
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index c463e9f..2a610d5 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -15,6 +15,7 @@
 #define DEBUG_TYPE "jit"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/ExecutionEngine/ObjectCache.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
@@ -39,6 +40,11 @@ using namespace llvm;
 STATISTIC(NumInitBytes, "Number of bytes of global vars initialized");
 STATISTIC(NumGlobals  , "Number of global vars initialized");
 
+// Pin the vtable to this file.
+void ObjectCache::anchor() {}
+void ObjectBuffer::anchor() {}
+void ObjectBufferStream::anchor() {}
+
 ExecutionEngine *(*ExecutionEngine::JITCtor)(
   Module *M,
   std::string *ErrorStr,
@@ -56,9 +62,7 @@ ExecutionEngine *(*ExecutionEngine::InterpCtor)(Module *M,
 
 ExecutionEngine::ExecutionEngine(Module *M)
   : EEState(*this),
-    LazyFunctionCreator(0),
-    ExceptionTableRegister(0),
-    ExceptionTableDeregister(0) {
+    LazyFunctionCreator(0) {
   CompilingLazily         = false;
   GVCompilationDisabled   = false;
   SymbolSearchingDisabled = false;
@@ -72,16 +76,6 @@ ExecutionEngine::~ExecutionEngine() {
     delete Modules[i];
 }
 
-void ExecutionEngine::DeregisterAllTables() {
-  if (ExceptionTableDeregister) {
-    DenseMap<const Function*, void*>::iterator it = AllExceptionTables.begin();
-    DenseMap<const Function*, void*>::iterator ite = AllExceptionTables.end();
-    for (; it != ite; ++it)
-      ExceptionTableDeregister(it->second);
-    AllExceptionTables.clear();
-  }
-}
-
 namespace {
 /// \brief Helper class which uses a value handler to automatically deletes the
 /// memory block when the GlobalVariable is destroyed.
@@ -556,6 +550,24 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
       // with the correct bit width.
       Result.IntVal = APInt(C->getType()->getPrimitiveSizeInBits(), 0);
       break;
+    case Type::StructTyID: {
+      // if the whole struct is 'undef' just reserve memory for the value.
+      if(StructType *STy = dyn_cast<StructType>(C->getType())) {
+        unsigned int elemNum = STy->getNumElements();
+        Result.AggregateVal.resize(elemNum);
+        for (unsigned int i = 0; i < elemNum; ++i) {
+          Type *ElemTy = STy->getElementType(i);
+          if (ElemTy->isIntegerTy())
+            Result.AggregateVal[i].IntVal = 
+              APInt(ElemTy->getPrimitiveSizeInBits(), 0);
+          else if (ElemTy->isAggregateType()) {
+              const Constant *ElemUndef = UndefValue::get(ElemTy);
+              Result.AggregateVal[i] = getConstantValue(ElemUndef);
+            }
+          }
+        }
+      }
+      break;
     case Type::VectorTyID:
       // if the whole vector is 'undef' just reserve memory for the value.
       const VectorType* VTy = dyn_cast<VectorType>(C->getType());
@@ -564,7 +576,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
       Result.AggregateVal.resize(elemNum);
       if (ElemTy->isIntegerTy())
         for (unsigned int i = 0; i < elemNum; ++i)
-          Result.AggregateVal[i].IntVal = 
+          Result.AggregateVal[i].IntVal =
             APInt(ElemTy->getPrimitiveSizeInBits(), 0);
       break;
     }
@@ -1283,6 +1295,10 @@ void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) {
   if (GA == 0) {
     // If it's not already specified, allocate memory for the global.
     GA = getMemoryForGV(GV);
+
+    // If we failed to allocate memory for this global, return.
+    if (GA == 0) return;
+
     addGlobalMapping(GV, GA);
   }
 
diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index 88e73bf..2d34eea 100644
--- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -339,14 +339,10 @@ void *LLVMGetPointerToGlobal(LLVMExecutionEngineRef EE, LLVMValueRef Global) {
 namespace {
 
 struct SimpleBindingMMFunctions {
-  uint8_t *(*AllocateCodeSection)(void *Opaque,
-                                  uintptr_t Size, unsigned Alignment,
-                                  unsigned SectionID);
-  uint8_t *(*AllocateDataSection)(void *Opaque,
-                                  uintptr_t Size, unsigned Alignment,
-                                  unsigned SectionID, LLVMBool IsReadOnly);
-  LLVMBool (*FinalizeMemory)(void *Opaque, char **ErrMsg);
-  void (*Destroy)(void *Opaque);
+  LLVMMemoryManagerAllocateCodeSectionCallback AllocateCodeSection;
+  LLVMMemoryManagerAllocateDataSectionCallback AllocateDataSection;
+  LLVMMemoryManagerFinalizeMemoryCallback FinalizeMemory;
+  LLVMMemoryManagerDestroyCallback Destroy;
 };
 
 class SimpleBindingMemoryManager : public RTDyldMemoryManager {
@@ -355,12 +351,13 @@ public:
                              void *Opaque);
   virtual ~SimpleBindingMemoryManager();
   
-  virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
-                                       unsigned SectionID);
+  virtual uint8_t *allocateCodeSection(
+    uintptr_t Size, unsigned Alignment, unsigned SectionID,
+    StringRef SectionName);
 
-  virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                       unsigned SectionID,
-                                       bool isReadOnly);
+  virtual uint8_t *allocateDataSection(
+    uintptr_t Size, unsigned Alignment, unsigned SectionID,
+    StringRef SectionName, bool isReadOnly);
 
   virtual bool finalizeMemory(std::string *ErrMsg);
   
@@ -388,13 +385,17 @@ SimpleBindingMemoryManager::~SimpleBindingMemoryManager() {
 }
 
 uint8_t *SimpleBindingMemoryManager::allocateCodeSection(
-  uintptr_t Size, unsigned Alignment, unsigned SectionID) {
-  return Functions.AllocateCodeSection(Opaque, Size, Alignment, SectionID);
+  uintptr_t Size, unsigned Alignment, unsigned SectionID,
+  StringRef SectionName) {
+  return Functions.AllocateCodeSection(Opaque, Size, Alignment, SectionID,
+                                       SectionName.str().c_str());
 }
 
 uint8_t *SimpleBindingMemoryManager::allocateDataSection(
-  uintptr_t Size, unsigned Alignment, unsigned SectionID, bool isReadOnly) {
+  uintptr_t Size, unsigned Alignment, unsigned SectionID,
+  StringRef SectionName, bool isReadOnly) {
   return Functions.AllocateDataSection(Opaque, Size, Alignment, SectionID,
+                                       SectionName.str().c_str(),
                                        isReadOnly);
 }
 
@@ -415,14 +416,10 @@ bool SimpleBindingMemoryManager::finalizeMemory(std::string *ErrMsg) {
 
 LLVMMCJITMemoryManagerRef LLVMCreateSimpleMCJITMemoryManager(
   void *Opaque,
-  uint8_t *(*AllocateCodeSection)(void *Opaque,
-                                  uintptr_t Size, unsigned Alignment,
-                                  unsigned SectionID),
-  uint8_t *(*AllocateDataSection)(void *Opaque,
-                                  uintptr_t Size, unsigned Alignment,
-                                  unsigned SectionID, LLVMBool IsReadOnly),
-  LLVMBool (*FinalizeMemory)(void *Opaque, char **ErrMsg),
-  void (*Destroy)(void *Opaque)) {
+  LLVMMemoryManagerAllocateCodeSectionCallback AllocateCodeSection,
+  LLVMMemoryManagerAllocateDataSectionCallback AllocateDataSection,
+  LLVMMemoryManagerFinalizeMemoryCallback FinalizeMemory,
+  LLVMMemoryManagerDestroyCallback Destroy) {
   
   if (!AllocateCodeSection || !AllocateDataSection || !FinalizeMemory ||
       !Destroy)
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
index 3d9ff53..777d0f1 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
@@ -61,7 +61,7 @@ public:
     GetNewMethodIDFunc(GetNewMethodIDImpl) {
   }
 
-  // Sends an event anncouncing that a function has been emitted
+  // Sends an event announcing that a function has been emitted
   //   return values are event-specific.  See Intel documentation for details.
   int  iJIT_NotifyEvent(iJIT_JVM_EVENT EventType, void *EventSpecificData) {
     if (!NotifyEventFunc)
diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp
index fc3d579..5de0659 100644
--- a/lib/ExecutionEngine/Interpreter/Execution.cpp
+++ b/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -786,20 +786,31 @@ void Interpreter::visitBinaryOperator(BinaryOperator &I) {
 }
 
 static GenericValue executeSelectInst(GenericValue Src1, GenericValue Src2,
-                                      GenericValue Src3) {
-  return Src1.IntVal == 0 ? Src3 : Src2;
+                                      GenericValue Src3, const Type *Ty) {
+    GenericValue Dest;
+    if(Ty->isVectorTy()) {
+      assert(Src1.AggregateVal.size() == Src2.AggregateVal.size());
+      assert(Src2.AggregateVal.size() == Src3.AggregateVal.size());
+      Dest.AggregateVal.resize( Src1.AggregateVal.size() );
+      for (size_t i = 0; i < Src1.AggregateVal.size(); ++i)
+        Dest.AggregateVal[i] = (Src1.AggregateVal[i].IntVal == 0) ?
+          Src3.AggregateVal[i] : Src2.AggregateVal[i];
+    } else {
+      Dest = (Src1.IntVal == 0) ? Src3 : Src2;
+    }
+    return Dest;
 }
 
 void Interpreter::visitSelectInst(SelectInst &I) {
   ExecutionContext &SF = ECStack.back();
+  const Type * Ty = I.getOperand(0)->getType();
   GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
   GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
   GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
-  GenericValue R = executeSelectInst(Src1, Src2, Src3);
+  GenericValue R = executeSelectInst(Src1, Src2, Src3, Ty);
   SetValue(&I, R, SF);
 }
 
-
 //===----------------------------------------------------------------------===//
 //                     Terminator Instruction Implementations
 //===----------------------------------------------------------------------===//
@@ -887,40 +898,11 @@ void Interpreter::visitSwitchInst(SwitchInst &I) {
   // Check to see if any of the cases match...
   BasicBlock *Dest = 0;
   for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) {
-    IntegersSubset& Case = i.getCaseValueEx();
-    if (Case.isSingleNumber()) {
-      // FIXME: Currently work with ConstantInt based numbers.
-      const ConstantInt *CI = Case.getSingleNumber(0).toConstantInt();
-      GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF);
-      if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) {
-        Dest = cast<BasicBlock>(i.getCaseSuccessor());
-        break;        
-      }
+    GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF);
+    if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) {
+      Dest = cast<BasicBlock>(i.getCaseSuccessor());
+      break;
     }
-    if (Case.isSingleNumbersOnly()) {
-      for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) {
-        // FIXME: Currently work with ConstantInt based numbers.
-        const ConstantInt *CI = Case.getSingleNumber(n).toConstantInt();
-        GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF);
-        if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) {
-          Dest = cast<BasicBlock>(i.getCaseSuccessor());
-          break;        
-        }
-      }      
-    } else
-      for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) {
-        IntegersSubset::Range r = Case.getItem(n);
-        // FIXME: Currently work with ConstantInt based numbers.
-        const ConstantInt *LowCI = r.getLow().toConstantInt();
-        const ConstantInt *HighCI = r.getHigh().toConstantInt();
-        GenericValue Low = getOperandValue(const_cast<ConstantInt*>(LowCI), SF);
-        GenericValue High = getOperandValue(const_cast<ConstantInt*>(HighCI), SF);
-        if (executeICMP_ULE(Low, CondVal, ElTy).IntVal != 0 &&
-            executeICMP_ULE(CondVal, High, ElTy).IntVal != 0) {
-          Dest = cast<BasicBlock>(i.getCaseSuccessor());
-          break;        
-        }
-      }
   }
   if (!Dest) Dest = I.getDefaultDest();   // No cases matched: use default
   SwitchToNewBasicBlock(Dest, SF);
@@ -1793,10 +1775,204 @@ void Interpreter::visitExtractElementInst(ExtractElementInst &I) {
   SetValue(&I, Dest, SF);
 }
 
+void Interpreter::visitInsertElementInst(InsertElementInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  Type *Ty = I.getType();
+
+  if(!(Ty->isVectorTy()) )
+    llvm_unreachable("Unhandled dest type for insertelement instruction");
+
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
+  GenericValue Dest;
+
+  Type *TyContained = Ty->getContainedType(0);
+
+  const unsigned indx = unsigned(Src3.IntVal.getZExtValue());
+  Dest.AggregateVal = Src1.AggregateVal;
+
+  if(Src1.AggregateVal.size() <= indx)
+      llvm_unreachable("Invalid index in insertelement instruction");
+  switch (TyContained->getTypeID()) {
+    default:
+      llvm_unreachable("Unhandled dest type for insertelement instruction");
+    case Type::IntegerTyID:
+      Dest.AggregateVal[indx].IntVal = Src2.IntVal;
+      break;
+    case Type::FloatTyID:
+      Dest.AggregateVal[indx].FloatVal = Src2.FloatVal;
+      break;
+    case Type::DoubleTyID:
+      Dest.AggregateVal[indx].DoubleVal = Src2.DoubleVal;
+      break;
+  }
+  SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitShuffleVectorInst(ShuffleVectorInst &I){
+  ExecutionContext &SF = ECStack.back();
+
+  Type *Ty = I.getType();
+  if(!(Ty->isVectorTy()))
+    llvm_unreachable("Unhandled dest type for shufflevector instruction");
+
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
+  GenericValue Dest;
+
+  // There is no need to check types of src1 and src2, because the compiled
+  // bytecode can't contain different types for src1 and src2 for a
+  // shufflevector instruction.
+
+  Type *TyContained = Ty->getContainedType(0);
+  unsigned src1Size = (unsigned)Src1.AggregateVal.size();
+  unsigned src2Size = (unsigned)Src2.AggregateVal.size();
+  unsigned src3Size = (unsigned)Src3.AggregateVal.size();
+
+  Dest.AggregateVal.resize(src3Size);
+
+  switch (TyContained->getTypeID()) {
+    default:
+      llvm_unreachable("Unhandled dest type for insertelement instruction");
+      break;
+    case Type::IntegerTyID:
+      for( unsigned i=0; i<src3Size; i++) {
+        unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+        if(j < src1Size)
+          Dest.AggregateVal[i].IntVal = Src1.AggregateVal[j].IntVal;
+        else if(j < src1Size + src2Size)
+          Dest.AggregateVal[i].IntVal = Src2.AggregateVal[j-src1Size].IntVal;
+        else
+          // The selector may not be greater than sum of lengths of first and
+          // second operands and llasm should not allow situation like
+          // %tmp = shufflevector <2 x i32> <i32 3, i32 4>, <2 x i32> undef,
+          //                      <2 x i32> < i32 0, i32 5 >,
+          // where i32 5 is invalid, but let it be additional check here:
+          llvm_unreachable("Invalid mask in shufflevector instruction");
+      }
+      break;
+    case Type::FloatTyID:
+      for( unsigned i=0; i<src3Size; i++) {
+        unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+        if(j < src1Size)
+          Dest.AggregateVal[i].FloatVal = Src1.AggregateVal[j].FloatVal;
+        else if(j < src1Size + src2Size)
+          Dest.AggregateVal[i].FloatVal = Src2.AggregateVal[j-src1Size].FloatVal;
+        else
+          llvm_unreachable("Invalid mask in shufflevector instruction");
+        }
+      break;
+    case Type::DoubleTyID:
+      for( unsigned i=0; i<src3Size; i++) {
+        unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+        if(j < src1Size)
+          Dest.AggregateVal[i].DoubleVal = Src1.AggregateVal[j].DoubleVal;
+        else if(j < src1Size + src2Size)
+          Dest.AggregateVal[i].DoubleVal =
+            Src2.AggregateVal[j-src1Size].DoubleVal;
+        else
+          llvm_unreachable("Invalid mask in shufflevector instruction");
+      }
+      break;
+  }
+  SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitExtractValueInst(ExtractValueInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  Value *Agg = I.getAggregateOperand();
+  GenericValue Dest;
+  GenericValue Src = getOperandValue(Agg, SF);
+
+  ExtractValueInst::idx_iterator IdxBegin = I.idx_begin();
+  unsigned Num = I.getNumIndices();
+  GenericValue *pSrc = &Src;
+
+  for (unsigned i = 0 ; i < Num; ++i) {
+    pSrc = &pSrc->AggregateVal[*IdxBegin];
+    ++IdxBegin;
+  }
+
+  Type *IndexedType = ExtractValueInst::getIndexedType(Agg->getType(), I.getIndices());
+  switch (IndexedType->getTypeID()) {
+    default:
+      llvm_unreachable("Unhandled dest type for extractelement instruction");
+    break;
+    case Type::IntegerTyID:
+      Dest.IntVal = pSrc->IntVal;
+    break;
+    case Type::FloatTyID:
+      Dest.FloatVal = pSrc->FloatVal;
+    break;
+    case Type::DoubleTyID:
+      Dest.DoubleVal = pSrc->DoubleVal;
+    break;
+    case Type::ArrayTyID:
+    case Type::StructTyID:
+    case Type::VectorTyID:
+      Dest.AggregateVal = pSrc->AggregateVal;
+    break;
+    case Type::PointerTyID:
+      Dest.PointerVal = pSrc->PointerVal;
+    break;
+  }
+
+  SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitInsertValueInst(InsertValueInst &I) {
+
+  ExecutionContext &SF = ECStack.back();
+  Value *Agg = I.getAggregateOperand();
+
+  GenericValue Src1 = getOperandValue(Agg, SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Dest = Src1; // Dest is a slightly changed Src1
+
+  ExtractValueInst::idx_iterator IdxBegin = I.idx_begin();
+  unsigned Num = I.getNumIndices();
+
+  GenericValue *pDest = &Dest;
+  for (unsigned i = 0 ; i < Num; ++i) {
+    pDest = &pDest->AggregateVal[*IdxBegin];
+    ++IdxBegin;
+  }
+  // pDest points to the target value in the Dest now
+
+  Type *IndexedType = ExtractValueInst::getIndexedType(Agg->getType(), I.getIndices());
+
+  switch (IndexedType->getTypeID()) {
+    default:
+      llvm_unreachable("Unhandled dest type for insertelement instruction");
+    break;
+    case Type::IntegerTyID:
+      pDest->IntVal = Src2.IntVal;
+    break;
+    case Type::FloatTyID:
+      pDest->FloatVal = Src2.FloatVal;
+    break;
+    case Type::DoubleTyID:
+      pDest->DoubleVal = Src2.DoubleVal;
+    break;
+    case Type::ArrayTyID:
+    case Type::StructTyID:
+    case Type::VectorTyID:
+      pDest->AggregateVal = Src2.AggregateVal;
+    break;
+    case Type::PointerTyID:
+      pDest->PointerVal = Src2.PointerVal;
+    break;
+  }
+
+  SetValue(&I, Dest, SF);
+}
+
 GenericValue Interpreter::getConstantExprValue (ConstantExpr *CE,
                                                 ExecutionContext &SF) {
   switch (CE->getOpcode()) {
-  case Instruction::Trunc:   
+  case Instruction::Trunc:
       return executeTruncInst(CE->getOperand(0), CE->getType(), SF);
   case Instruction::ZExt:
       return executeZExtInst(CE->getOperand(0), CE->getType(), SF);
@@ -1832,7 +2008,8 @@ GenericValue Interpreter::getConstantExprValue (ConstantExpr *CE,
   case Instruction::Select:
     return executeSelectInst(getOperandValue(CE->getOperand(0), SF),
                              getOperandValue(CE->getOperand(1), SF),
-                             getOperandValue(CE->getOperand(2), SF));
+                             getOperandValue(CE->getOperand(2), SF),
+                             CE->getOperand(0)->getType());
   default :
     break;
   }
diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
index bef4bbf..a03c7f5 100644
--- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
+++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -406,6 +406,7 @@ GenericValue lle_X_sprintf(FunctionType *FT,
       break;
     }
   }
+  return GV;
 }
 
 // int printf(const char *, ...) - a very rough implementation to make output
@@ -434,7 +435,7 @@ GenericValue lle_X_sscanf(FunctionType *FT,
 
   GenericValue GV;
   GV.IntVal = APInt(32, sscanf(Args[0], Args[1], Args[2], Args[3], Args[4],
-                        Args[5], Args[6], Args[7], Args[8], Args[9]));
+                    Args[5], Args[6], Args[7], Args[8], Args[9]));
   return GV;
 }
 
@@ -450,7 +451,7 @@ GenericValue lle_X_scanf(FunctionType *FT,
 
   GenericValue GV;
   GV.IntVal = APInt(32, scanf( Args[0], Args[1], Args[2], Args[3], Args[4],
-                        Args[5], Args[6], Args[7], Args[8], Args[9]));
+                    Args[5], Args[6], Args[7], Args[8], Args[9]));
   return GV;
 }
 
@@ -470,6 +471,30 @@ GenericValue lle_X_fprintf(FunctionType *FT,
   return GV;
 }
 
+static GenericValue lle_X_memset(FunctionType *FT,
+                                 const std::vector<GenericValue> &Args) {
+  int val = (int)Args[1].IntVal.getSExtValue();
+  size_t len = (size_t)Args[2].IntVal.getZExtValue();
+  memset((void *)GVTOP(Args[0]), val, len);
+  // llvm.memset.* returns void, lle_X_* returns GenericValue,
+  // so here we return GenericValue with IntVal set to zero
+  GenericValue GV;
+  GV.IntVal = 0;
+  return GV;
+}
+
+static GenericValue lle_X_memcpy(FunctionType *FT,
+                                 const std::vector<GenericValue> &Args) {
+  memcpy(GVTOP(Args[0]), GVTOP(Args[1]),
+         (size_t)(Args[2].IntVal.getLimitedValue()));
+
+  // llvm.memcpy* returns void, lle_X_* returns GenericValue,
+  // so here we return GenericValue with IntVal set to zero
+  GenericValue GV;
+  GV.IntVal = 0;
+  return GV;
+}
+
 void Interpreter::initializeExternalFunctions() {
   sys::ScopedLock Writer(*FunctionsLock);
   FuncNames["lle_X_atexit"]       = lle_X_atexit;
@@ -481,4 +506,6 @@ void Interpreter::initializeExternalFunctions() {
   FuncNames["lle_X_sscanf"]       = lle_X_sscanf;
   FuncNames["lle_X_scanf"]        = lle_X_scanf;
   FuncNames["lle_X_fprintf"]      = lle_X_fprintf;
+  FuncNames["lle_X_memset"]       = lle_X_memset;
+  FuncNames["lle_X_memcpy"]       = lle_X_memcpy;
 }
diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.h b/lib/ExecutionEngine/Interpreter/Interpreter.h
index 2952d7e..98269ef 100644
--- a/lib/ExecutionEngine/Interpreter/Interpreter.h
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.h
@@ -179,6 +179,12 @@ public:
 
   void visitVAArgInst(VAArgInst &I);
   void visitExtractElementInst(ExtractElementInst &I);
+  void visitInsertElementInst(InsertElementInst &I);
+  void visitShuffleVectorInst(ShuffleVectorInst &I);
+
+  void visitExtractValueInst(ExtractValueInst &I);
+  void visitInsertValueInst(InsertValueInst &I);
+
   void visitInstruction(Instruction &I) {
     errs() << I << "\n";
     llvm_unreachable("Instruction not interpretable yet!");
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index 53ea0a2..246a675 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -67,140 +67,6 @@ static struct RegisterJIT {
 extern "C" void LLVMLinkInJIT() {
 }
 
-// Determine whether we can register EH tables.
-#if (defined(__GNUC__) && !defined(__ARM_EABI__) && \
-     !defined(__USING_SJLJ_EXCEPTIONS__))
-#define HAVE_EHTABLE_SUPPORT 1
-#else
-#define HAVE_EHTABLE_SUPPORT 0
-#endif
-
-#if HAVE_EHTABLE_SUPPORT
-
-// libgcc defines the __register_frame function to dynamically register new
-// dwarf frames for exception handling. This functionality is not portable
-// across compilers and is only provided by GCC. We use the __register_frame
-// function here so that code generated by the JIT cooperates with the unwinding
-// runtime of libgcc. When JITting with exception handling enable, LLVM
-// generates dwarf frames and registers it to libgcc with __register_frame.
-//
-// The __register_frame function works with Linux.
-//
-// Unfortunately, this functionality seems to be in libgcc after the unwinding
-// library of libgcc for darwin was written. The code for darwin overwrites the
-// value updated by __register_frame with a value fetched with "keymgr".
-// "keymgr" is an obsolete functionality, which should be rewritten some day.
-// In the meantime, since "keymgr" is on all libgccs shipped with apple-gcc, we
-// need a workaround in LLVM which uses the "keymgr" to dynamically modify the
-// values of an opaque key, used by libgcc to find dwarf tables.
-
-extern "C" void __register_frame(void*);
-extern "C" void __deregister_frame(void*);
-
-#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED <= 1050
-# define USE_KEYMGR 1
-#else
-# define USE_KEYMGR 0
-#endif
-
-#if USE_KEYMGR
-
-namespace {
-
-// LibgccObject - This is the structure defined in libgcc. There is no #include
-// provided for this structure, so we also define it here. libgcc calls it
-// "struct object". The structure is undocumented in libgcc.
-struct LibgccObject {
-  void *unused1;
-  void *unused2;
-  void *unused3;
-
-  /// frame - Pointer to the exception table.
-  void *frame;
-
-  /// encoding -  The encoding of the object?
-  union {
-    struct {
-      unsigned long sorted : 1;
-      unsigned long from_array : 1;
-      unsigned long mixed_encoding : 1;
-      unsigned long encoding : 8;
-      unsigned long count : 21;
-    } b;
-    size_t i;
-  } encoding;
-
-  /// fde_end - libgcc defines this field only if some macro is defined. We
-  /// include this field even if it may not there, to make libgcc happy.
-  char *fde_end;
-
-  /// next - At least we know it's a chained list!
-  struct LibgccObject *next;
-};
-
-// "kemgr" stuff. Apparently, all frame tables are stored there.
-extern "C" void _keymgr_set_and_unlock_processwide_ptr(int, void *);
-extern "C" void *_keymgr_get_and_lock_processwide_ptr(int);
-#define KEYMGR_GCC3_DW2_OBJ_LIST        302     /* Dwarf2 object list  */
-
-/// LibgccObjectInfo - libgcc defines this struct as km_object_info. It
-/// probably contains all dwarf tables that are loaded.
-struct LibgccObjectInfo {
-
-  /// seenObjects - LibgccObjects already parsed by the unwinding runtime.
-  ///
-  struct LibgccObject* seenObjects;
-
-  /// unseenObjects - LibgccObjects not parsed yet by the unwinding runtime.
-  ///
-  struct LibgccObject* unseenObjects;
-
-  unsigned unused[2];
-};
-
-/// darwin_register_frame - Since __register_frame does not work with darwin's
-/// libgcc,we provide our own function, which "tricks" libgcc by modifying the
-/// "Dwarf2 object list" key.
-void DarwinRegisterFrame(void* FrameBegin) {
-  // Get the key.
-  LibgccObjectInfo* LOI = (struct LibgccObjectInfo*)
-    _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST);
-  assert(LOI && "This should be preallocated by the runtime");
-
-  // Allocate a new LibgccObject to represent this frame. Deallocation of this
-  // object may be impossible: since darwin code in libgcc was written after
-  // the ability to dynamically register frames, things may crash if we
-  // deallocate it.
-  struct LibgccObject* ob = (struct LibgccObject*)
-    malloc(sizeof(struct LibgccObject));
-
-  // Do like libgcc for the values of the field.
-  ob->unused1 = (void *)-1;
-  ob->unused2 = 0;
-  ob->unused3 = 0;
-  ob->frame = FrameBegin;
-  ob->encoding.i = 0;
-  ob->encoding.b.encoding = llvm::dwarf::DW_EH_PE_omit;
-
-  // Put the info on both places, as libgcc uses the first or the second
-  // field. Note that we rely on having two pointers here. If fde_end was a
-  // char, things would get complicated.
-  ob->fde_end = (char*)LOI->unseenObjects;
-  ob->next = LOI->unseenObjects;
-
-  // Update the key's unseenObjects list.
-  LOI->unseenObjects = ob;
-
-  // Finally update the "key". Apparently, libgcc requires it.
-  _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST,
-                                         LOI);
-
-}
-
-}
-#endif // __APPLE__
-#endif // HAVE_EHTABLE_SUPPORT
-
 /// createJIT - This is the factory method for creating a JIT for the current
 /// machine, it does not fall back to the interpreter.  This takes ownership
 /// of the module.
@@ -293,33 +159,11 @@ JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
     report_fatal_error("Target does not support machine code emission!");
   }
 
-  // Register routine for informing unwinding runtime about new EH frames
-#if HAVE_EHTABLE_SUPPORT
-#if USE_KEYMGR
-  struct LibgccObjectInfo* LOI = (struct LibgccObjectInfo*)
-    _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST);
-
-  // The key is created on demand, and libgcc creates it the first time an
-  // exception occurs. Since we need the key to register frames, we create
-  // it now.
-  if (!LOI)
-    LOI = (LibgccObjectInfo*)calloc(sizeof(struct LibgccObjectInfo), 1);
-  _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, LOI);
-  InstallExceptionTableRegister(DarwinRegisterFrame);
-  // Not sure about how to deregister on Darwin.
-#else
-  InstallExceptionTableRegister(__register_frame);
-  InstallExceptionTableDeregister(__deregister_frame);
-#endif // __APPLE__
-#endif // HAVE_EHTABLE_SUPPORT
-
   // Initialize passes.
   PM.doInitialization();
 }
 
 JIT::~JIT() {
-  // Unregister all exception tables registered by this JIT.
-  DeregisterAllTables();
   // Cleanup.
   AllJits->Remove(this);
   delete jitstate;
diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
index 94db245..f58d31b 100644
--- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
+++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
@@ -464,7 +464,7 @@ namespace {
 
     /// allocateCodeSection - Allocate memory for a code section.
     uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
-                                 unsigned SectionID) {
+                                 unsigned SectionID, StringRef SectionName) {
       // Grow the required block size to account for the block header
       Size += sizeof(*CurBlock);
 
@@ -510,7 +510,8 @@ namespace {
 
     /// allocateDataSection - Allocate memory for a data section.
     uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                 unsigned SectionID, bool IsReadOnly) {
+                                 unsigned SectionID, StringRef SectionName,
+                                 bool IsReadOnly) {
       return (uint8_t*)DataAllocator.Allocate(Size, Alignment);
     }
 
@@ -793,7 +794,7 @@ static void runAtExitHandlers() {
 // not inlined, and hiding their real definitions in a separate archive file
 // that the dynamic linker can't see. For more info, search for
 // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
-#if defined(__linux__)
+#if defined(__linux__) && defined(__GLIBC__)
 /* stat functions are redirecting to __xstat with a version number.  On x86-64
  * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat'
  * available as an exported symbol, so we have to add it explicitly.
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 09dd924..195c458 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -14,10 +14,12 @@
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/ExecutionEngine/ObjectBuffer.h"
 #include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/PassManager.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -53,37 +55,63 @@ ExecutionEngine *MCJIT::createJIT(Module *M,
 
 MCJIT::MCJIT(Module *m, TargetMachine *tm, RTDyldMemoryManager *MM,
              bool AllocateGVsWithCode)
-  : ExecutionEngine(m), TM(tm), Ctx(0), MemMgr(MM), Dyld(MM),
-    IsLoaded(false), M(m), ObjCache(0) {
+  : ExecutionEngine(m), TM(tm), Ctx(0), MemMgr(this, MM), Dyld(&MemMgr),
+    ObjCache(0) {
 
+  OwnedModules.addModule(m);
   setDataLayout(TM->getDataLayout());
 }
 
 MCJIT::~MCJIT() {
-  if (LoadedObject)
-    NotifyFreeingObject(*LoadedObject.get());
-  delete MemMgr;
+  MutexGuard locked(lock);
+  // FIXME: We are managing our modules, so we do not want the base class
+  // ExecutionEngine to manage them as well. To avoid double destruction
+  // of the first (and only) module added in ExecutionEngine constructor
+  // we remove it from EE and will destruct it ourselves.
+  //
+  // It may make sense to move our module manager (based on SmallStPtr) back
+  // into EE if the JIT and Interpreter can live with it.
+  // If so, additional functions: addModule, removeModule, FindFunctionNamed,
+  // runStaticConstructorsDestructors could be moved back to EE as well.
+  //
+  Modules.clear();
+  Dyld.deregisterEHFrames();
+
+  LoadedObjectMap::iterator it, end = LoadedObjects.end();
+  for (it = LoadedObjects.begin(); it != end; ++it) {
+    ObjectImage *Obj = it->second;
+    if (Obj) {
+      NotifyFreeingObject(*Obj);
+      delete Obj;
+    }
+  }
+  LoadedObjects.clear();
   delete TM;
 }
 
+void MCJIT::addModule(Module *M) {
+  MutexGuard locked(lock);
+  OwnedModules.addModule(M);
+}
+
+bool MCJIT::removeModule(Module *M) {
+  MutexGuard locked(lock);
+  return OwnedModules.removeModule(M);
+}
+
+
+
 void MCJIT::setObjectCache(ObjectCache* NewCache) {
+  MutexGuard locked(lock);
   ObjCache = NewCache;
 }
 
-ObjectBufferStream* MCJIT::emitObject(Module *m) {
-  /// Currently, MCJIT only supports a single module and the module passed to
-  /// this function call is expected to be the contained module.  The module
-  /// is passed as a parameter here to prepare for multiple module support in
-  /// the future.
-  assert(M == m);
-
-  // Get a thread lock to make sure we aren't trying to compile multiple times
+ObjectBufferStream* MCJIT::emitObject(Module *M) {
   MutexGuard locked(lock);
 
-  // FIXME: Track compilation state on a per-module basis when multiple modules
-  //        are supported.
-  // Re-compilation is not supported
-  assert(!IsLoaded);
+  // This must be a module which has already been added but not loaded to this
+  // MCJIT instance, since these conditions are tested by our caller,
+  // generateCodeForModule.
 
   PassManager PM;
 
@@ -99,7 +127,7 @@ ObjectBufferStream* MCJIT::emitObject(Module *m) {
   }
 
   // Initialize passes.
-  PM.run(*m);
+  PM.run(*M);
   // Flush the output buffer to get the generated code into memory
   CompiledObject->flush();
 
@@ -109,21 +137,22 @@ ObjectBufferStream* MCJIT::emitObject(Module *m) {
     // MemoryBuffer is a thin wrapper around the actual memory, so it's OK
     // to create a temporary object here and delete it after the call.
     OwningPtr<MemoryBuffer> MB(CompiledObject->getMemBuffer());
-    ObjCache->notifyObjectCompiled(m, MB.get());
+    ObjCache->notifyObjectCompiled(M, MB.get());
   }
 
   return CompiledObject.take();
 }
 
-void MCJIT::loadObject(Module *M) {
-
+void MCJIT::generateCodeForModule(Module *M) {
   // Get a thread lock to make sure we aren't trying to load multiple times
   MutexGuard locked(lock);
 
-  // FIXME: Track compilation state on a per-module basis when multiple modules
-  //        are supported.
+  // This must be a module which has already been added to this MCJIT instance.
+  assert(OwnedModules.ownsModule(M) &&
+         "MCJIT::generateCodeForModule: Unknown module.");
+
   // Re-compilation is not supported
-  if (IsLoaded)
+  if (OwnedModules.hasModuleBeenLoaded(M))
     return;
 
   OwningPtr<ObjectBuffer> ObjectToLoad;
@@ -141,59 +170,137 @@ void MCJIT::loadObject(Module *M) {
   }
 
   // Load the object into the dynamic linker.
-  // handing off ownership of the buffer
-  LoadedObject.reset(Dyld.loadObject(ObjectToLoad.take()));
+  // MCJIT now owns the ObjectImage pointer (via its LoadedObjects map).
+  ObjectImage *LoadedObject = Dyld.loadObject(ObjectToLoad.take());
+  LoadedObjects[M] = LoadedObject;
   if (!LoadedObject)
     report_fatal_error(Dyld.getErrorString());
 
-  // Resolve any relocations.
-  Dyld.resolveRelocations();
-
   // FIXME: Make this optional, maybe even move it to a JIT event listener
   LoadedObject->registerWithDebugger();
 
   NotifyObjectEmitted(*LoadedObject);
 
-  // FIXME: Add support for per-module compilation state
-  IsLoaded = true;
+  OwnedModules.markModuleAsLoaded(M);
 }
 
-// FIXME: Add a parameter to identify which object is being finalized when
-// MCJIT supports multiple modules.
-// FIXME: Provide a way to separate code emission, relocations and page 
-// protection in the interface.
+void MCJIT::finalizeLoadedModules() {
+  MutexGuard locked(lock);
+
+  // Resolve any outstanding relocations.
+  Dyld.resolveRelocations();
+
+  OwnedModules.markAllLoadedModulesAsFinalized();
+
+  // Register EH frame data for any module we own which has been loaded
+  Dyld.registerEHFrames();
+
+  // Set page permissions.
+  MemMgr.finalizeMemory();
+}
+
+// FIXME: Rename this.
 void MCJIT::finalizeObject() {
-  // If the module hasn't been compiled, just do that.
-  if (!IsLoaded) {
-    // If the call to Dyld.resolveRelocations() is removed from loadObject()
-    // we'll need to do that here.
-    loadObject(M);
-  } else {
-    // Resolve any relocations.
-    Dyld.resolveRelocations();
+  MutexGuard locked(lock);
+
+  for (ModulePtrSet::iterator I = OwnedModules.begin_added(),
+                              E = OwnedModules.end_added();
+       I != E; ++I) {
+    Module *M = *I;
+    generateCodeForModule(M);
   }
 
-  StringRef EHData = Dyld.getEHFrameSection();
-  if (!EHData.empty())
-    MemMgr->registerEHFrames(EHData);
+  finalizeLoadedModules();
+}
 
-  // Set page permissions.
-  MemMgr->finalizeMemory();
+void MCJIT::finalizeModule(Module *M) {
+  MutexGuard locked(lock);
+
+  // This must be a module which has already been added to this MCJIT instance.
+  assert(OwnedModules.ownsModule(M) && "MCJIT::finalizeModule: Unknown module.");
+
+  // If the module hasn't been compiled, just do that.
+  if (!OwnedModules.hasModuleBeenLoaded(M))
+    generateCodeForModule(M);
+
+  finalizeLoadedModules();
 }
 
 void *MCJIT::getPointerToBasicBlock(BasicBlock *BB) {
   report_fatal_error("not yet implemented");
 }
 
-void *MCJIT::getPointerToFunction(Function *F) {
-  // FIXME: This should really return a uint64_t since it's a pointer in the
-  // target address space, not our local address space. That's part of the
-  // ExecutionEngine interface, though. Fix that when the old JIT finally
-  // dies.
+uint64_t MCJIT::getExistingSymbolAddress(const std::string &Name) {
+  // Check with the RuntimeDyld to see if we already have this symbol.
+  if (Name[0] == '\1')
+    return Dyld.getSymbolLoadAddress(Name.substr(1));
+  return Dyld.getSymbolLoadAddress((TM->getMCAsmInfo()->getGlobalPrefix()
+                                       + Name));
+}
+
+Module *MCJIT::findModuleForSymbol(const std::string &Name,
+                                   bool CheckFunctionsOnly) {
+  MutexGuard locked(lock);
+
+  // If it hasn't already been generated, see if it's in one of our modules.
+  for (ModulePtrSet::iterator I = OwnedModules.begin_added(),
+                              E = OwnedModules.end_added();
+       I != E; ++I) {
+    Module *M = *I;
+    Function *F = M->getFunction(Name);
+    if (F && !F->isDeclaration())
+      return M;
+    if (!CheckFunctionsOnly) {
+      GlobalVariable *G = M->getGlobalVariable(Name);
+      if (G && !G->isDeclaration())
+        return M;
+      // FIXME: Do we need to worry about global aliases?
+    }
+  }
+  // We didn't find the symbol in any of our modules.
+  return NULL;
+}
+
+uint64_t MCJIT::getSymbolAddress(const std::string &Name,
+                                 bool CheckFunctionsOnly)
+{
+  MutexGuard locked(lock);
+
+  // First, check to see if we already have this symbol.
+  uint64_t Addr = getExistingSymbolAddress(Name);
+  if (Addr)
+    return Addr;
+
+  // If it hasn't already been generated, see if it's in one of our modules.
+  Module *M = findModuleForSymbol(Name, CheckFunctionsOnly);
+  if (!M)
+    return 0;
+
+  generateCodeForModule(M);
+
+  // Check the RuntimeDyld table again, it should be there now.
+  return getExistingSymbolAddress(Name);
+}
 
-  // FIXME: Add support for per-module compilation state
-  if (!IsLoaded)
-    loadObject(M);
+uint64_t MCJIT::getGlobalValueAddress(const std::string &Name) {
+  MutexGuard locked(lock);
+  uint64_t Result = getSymbolAddress(Name, false);
+  if (Result != 0)
+    finalizeLoadedModules();
+  return Result;
+}
+
+uint64_t MCJIT::getFunctionAddress(const std::string &Name) {
+  MutexGuard locked(lock);
+  uint64_t Result = getSymbolAddress(Name, true);
+  if (Result != 0)
+    finalizeLoadedModules();
+  return Result;
+}
+
+// Deprecated.  Use getFunctionAddress instead.
+void *MCJIT::getPointerToFunction(Function *F) {
+  MutexGuard locked(lock);
 
   if (F->isDeclaration() || F->hasAvailableExternallyLinkage()) {
     bool AbortOnFailure = !F->hasExternalWeakLinkage();
@@ -202,6 +309,16 @@ void *MCJIT::getPointerToFunction(Function *F) {
     return Addr;
   }
 
+  Module *M = F->getParent();
+  bool HasBeenAddedButNotLoaded = OwnedModules.hasModuleBeenAddedButNotLoaded(M);
+
+  // Make sure the relevant module has been compiled and loaded.
+  if (HasBeenAddedButNotLoaded)
+    generateCodeForModule(M);
+  else if (!OwnedModules.hasModuleBeenLoaded(M))
+    // If this function doesn't belong to one of our modules, we're done.
+    return NULL;
+
   // FIXME: Should the Dyld be retaining module information? Probably not.
   // FIXME: Should we be using the mangler for this? Probably.
   //
@@ -222,6 +339,45 @@ void MCJIT::freeMachineCodeForFunction(Function *F) {
   report_fatal_error("not yet implemented");
 }
 
+void MCJIT::runStaticConstructorsDestructorsInModulePtrSet(
+    bool isDtors, ModulePtrSet::iterator I, ModulePtrSet::iterator E) {
+  for (; I != E; ++I) {
+    ExecutionEngine::runStaticConstructorsDestructors(*I, isDtors);
+  }
+}
+
+void MCJIT::runStaticConstructorsDestructors(bool isDtors) {
+  // Execute global ctors/dtors for each module in the program.
+  runStaticConstructorsDestructorsInModulePtrSet(
+      isDtors, OwnedModules.begin_added(), OwnedModules.end_added());
+  runStaticConstructorsDestructorsInModulePtrSet(
+      isDtors, OwnedModules.begin_loaded(), OwnedModules.end_loaded());
+  runStaticConstructorsDestructorsInModulePtrSet(
+      isDtors, OwnedModules.begin_finalized(), OwnedModules.end_finalized());
+}
+
+Function *MCJIT::FindFunctionNamedInModulePtrSet(const char *FnName,
+                                                 ModulePtrSet::iterator I,
+                                                 ModulePtrSet::iterator E) {
+  for (; I != E; ++I) {
+    if (Function *F = (*I)->getFunction(FnName))
+      return F;
+  }
+  return 0;
+}
+
+Function *MCJIT::FindFunctionNamed(const char *FnName) {
+  Function *F = FindFunctionNamedInModulePtrSet(
+      FnName, OwnedModules.begin_added(), OwnedModules.end_added());
+  if (!F)
+    F = FindFunctionNamedInModulePtrSet(FnName, OwnedModules.begin_loaded(),
+                                        OwnedModules.end_loaded());
+  if (!F)
+    F = FindFunctionNamedInModulePtrSet(FnName, OwnedModules.begin_finalized(),
+                                        OwnedModules.end_finalized());
+  return F;
+}
+
 GenericValue MCJIT::runFunction(Function *F,
                                 const std::vector<GenericValue> &ArgValues) {
   assert(F && "Function *F was null at entry to run()");
@@ -324,12 +480,8 @@ GenericValue MCJIT::runFunction(Function *F,
 
 void *MCJIT::getPointerToNamedFunction(const std::string &Name,
                                        bool AbortOnFailure) {
-  // FIXME: Add support for per-module compilation state
-  if (!IsLoaded)
-    loadObject(M);
-
-  if (!isSymbolSearchingDisabled() && MemMgr) {
-    void *ptr = MemMgr->getPointerToNamedFunction(Name, false);
+  if (!isSymbolSearchingDisabled()) {
+    void *ptr = MemMgr.getPointerToNamedFunction(Name, false);
     if (ptr)
       return ptr;
   }
@@ -365,6 +517,7 @@ void MCJIT::UnregisterJITEventListener(JITEventListener *L) {
 }
 void MCJIT::NotifyObjectEmitted(const ObjectImage& Obj) {
   MutexGuard locked(lock);
+  MemMgr.notifyObjectLoaded(this, &Obj);
   for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
     EventListeners[I]->NotifyObjectEmitted(Obj);
   }
@@ -375,3 +528,14 @@ void MCJIT::NotifyFreeingObject(const ObjectImage& Obj) {
     EventListeners[I]->NotifyFreeingObject(Obj);
   }
 }
+
+uint64_t LinkingMemoryManager::getSymbolAddress(const std::string &Name) {
+  uint64_t Result = ParentEngine->getSymbolAddress(Name, false);
+  // If the symbols wasn't found and it begins with an underscore, try again
+  // without the underscore.
+  if (!Result && Name[0] == '_')
+    Result = ParentEngine->getSymbolAddress(Name.substr(1), false);
+  if (Result)
+    return Result;
+  return ClientMM->getSymbolAddress(Name);
+}
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index a899d4f..86b478b 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -10,48 +10,235 @@
 #ifndef LLVM_LIB_EXECUTIONENGINE_MCJIT_H
 #define LLVM_LIB_EXECUTIONENGINE_MCJIT_H
 
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/ObjectCache.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
 #include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/Module.h"
 
 namespace llvm {
+class MCJIT;
 
-class ObjectImage;
+// This is a helper class that the MCJIT execution engine uses for linking
+// functions across modules that it owns.  It aggregates the memory manager
+// that is passed in to the MCJIT constructor and defers most functionality
+// to that object.
+class LinkingMemoryManager : public RTDyldMemoryManager {
+public:
+  LinkingMemoryManager(MCJIT *Parent, RTDyldMemoryManager *MM)
+    : ParentEngine(Parent), ClientMM(MM) {}
+
+  virtual uint64_t getSymbolAddress(const std::string &Name);
+
+  // Functions deferred to client memory manager
+  virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+                                       unsigned SectionID, StringRef SectionName) {
+    return ClientMM->allocateCodeSection(Size, Alignment, SectionID, SectionName);
+  }
+
+  virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                                       unsigned SectionID, StringRef SectionName,
+                                       bool IsReadOnly) {
+    return ClientMM->allocateDataSection(Size, Alignment,
+                                         SectionID, SectionName, IsReadOnly);
+  }
+
+  virtual void notifyObjectLoaded(ExecutionEngine *EE,
+                                  const ObjectImage *Obj) {
+    ClientMM->notifyObjectLoaded(EE, Obj);
+  }
+
+  virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
+    ClientMM->registerEHFrames(Addr, LoadAddr, Size);
+  }
+
+  virtual void deregisterEHFrames(uint8_t *Addr,
+                                  uint64_t LoadAddr,
+                                  size_t Size) {
+    ClientMM->deregisterEHFrames(Addr, LoadAddr, Size);
+  }
+
+  virtual bool finalizeMemory(std::string *ErrMsg = 0) {
+    return ClientMM->finalizeMemory(ErrMsg);
+  }
 
-// FIXME: This makes all kinds of horrible assumptions for the time being,
-// like only having one module, not needing to worry about multi-threading,
-// blah blah. Purely in get-it-up-and-limping mode for now.
+private:
+  MCJIT *ParentEngine;
+  OwningPtr<RTDyldMemoryManager> ClientMM;
+};
+
+// About Module states: added->loaded->finalized.
+//
+// The purpose of the "added" state is having modules in standby. (added=known
+// but not compiled). The idea is that you can add a module to provide function
+// definitions but if nothing in that module is referenced by a module in which
+// a function is executed (note the wording here because it's not exactly the
+// ideal case) then the module never gets compiled. This is sort of lazy
+// compilation.
+//
+// The purpose of the "loaded" state (loaded=compiled and required sections
+// copied into local memory but not yet ready for execution) is to have an
+// intermediate state wherein clients can remap the addresses of sections, using
+// MCJIT::mapSectionAddress, (in preparation for later copying to a new location
+// or an external process) before relocations and page permissions are applied.
+//
+// It might not be obvious at first glance, but the "remote-mcjit" case in the
+// lli tool does this.  In that case, the intermediate action is taken by the
+// RemoteMemoryManager in response to the notifyObjectLoaded function being
+// called.
 
 class MCJIT : public ExecutionEngine {
   MCJIT(Module *M, TargetMachine *tm, RTDyldMemoryManager *MemMgr,
         bool AllocateGVsWithCode);
 
+  typedef llvm::SmallPtrSet<Module *, 4> ModulePtrSet;
+
+  class OwningModuleContainer {
+  public:
+    OwningModuleContainer() {
+    }
+    ~OwningModuleContainer() {
+      freeModulePtrSet(AddedModules);
+      freeModulePtrSet(LoadedModules);
+      freeModulePtrSet(FinalizedModules);
+    }
+
+    ModulePtrSet::iterator begin_added() { return AddedModules.begin(); }
+    ModulePtrSet::iterator end_added() { return AddedModules.end(); }
+
+    ModulePtrSet::iterator begin_loaded() { return LoadedModules.begin(); }
+    ModulePtrSet::iterator end_loaded() { return LoadedModules.end(); }
+
+    ModulePtrSet::iterator begin_finalized() { return FinalizedModules.begin(); }
+    ModulePtrSet::iterator end_finalized() { return FinalizedModules.end(); }
+
+    void addModule(Module *M) {
+      AddedModules.insert(M);
+    }
+
+    bool removeModule(Module *M) {
+      return AddedModules.erase(M) || LoadedModules.erase(M) ||
+             FinalizedModules.erase(M);
+    }
+
+    bool hasModuleBeenAddedButNotLoaded(Module *M) {
+      return AddedModules.count(M) != 0;
+    }
+
+    bool hasModuleBeenLoaded(Module *M) {
+      // If the module is in either the "loaded" or "finalized" sections it
+      // has been loaded.
+      return (LoadedModules.count(M) != 0 ) || (FinalizedModules.count(M) != 0);
+    }
+
+    bool hasModuleBeenFinalized(Module *M) {
+      return FinalizedModules.count(M) != 0;
+    }
+
+    bool ownsModule(Module* M) {
+      return (AddedModules.count(M) != 0) || (LoadedModules.count(M) != 0) ||
+             (FinalizedModules.count(M) != 0);
+    }
+
+    void markModuleAsLoaded(Module *M) {
+      // This checks against logic errors in the MCJIT implementation.
+      // This function should never be called with either a Module that MCJIT
+      // does not own or a Module that has already been loaded and/or finalized.
+      assert(AddedModules.count(M) &&
+             "markModuleAsLoaded: Module not found in AddedModules");
+
+      // Remove the module from the "Added" set.
+      AddedModules.erase(M);
+
+      // Add the Module to the "Loaded" set.
+      LoadedModules.insert(M);
+    }
+
+    void markModuleAsFinalized(Module *M) {
+      // This checks against logic errors in the MCJIT implementation.
+      // This function should never be called with either a Module that MCJIT
+      // does not own, a Module that has not been loaded or a Module that has
+      // already been finalized.
+      assert(LoadedModules.count(M) &&
+             "markModuleAsFinalized: Module not found in LoadedModules");
+
+      // Remove the module from the "Loaded" section of the list.
+      LoadedModules.erase(M);
+
+      // Add the Module to the "Finalized" section of the list by inserting it
+      // before the 'end' iterator.
+      FinalizedModules.insert(M);
+    }
+
+    void markAllLoadedModulesAsFinalized() {
+      for (ModulePtrSet::iterator I = LoadedModules.begin(),
+                                  E = LoadedModules.end();
+           I != E; ++I) {
+        Module *M = *I;
+        FinalizedModules.insert(M);
+      }
+      LoadedModules.clear();
+    }
+
+  private:
+    ModulePtrSet AddedModules;
+    ModulePtrSet LoadedModules;
+    ModulePtrSet FinalizedModules;
+
+    void freeModulePtrSet(ModulePtrSet& MPS) {
+      // Go through the module set and delete everything.
+      for (ModulePtrSet::iterator I = MPS.begin(), E = MPS.end(); I != E; ++I) {
+        Module *M = *I;
+        delete M;
+      }
+      MPS.clear();
+    }
+  };
+
   TargetMachine *TM;
   MCContext *Ctx;
-  RTDyldMemoryManager *MemMgr;
+  LinkingMemoryManager MemMgr;
   RuntimeDyld Dyld;
   SmallVector<JITEventListener*, 2> EventListeners;
 
-  // FIXME: Add support for multiple modules
-  bool IsLoaded;
-  Module *M;
-  OwningPtr<ObjectImage> LoadedObject;
+  OwningModuleContainer OwnedModules;
+
+  typedef DenseMap<Module *, ObjectImage *> LoadedObjectMap;
+  LoadedObjectMap  LoadedObjects;
 
   // An optional ObjectCache to be notified of compiled objects and used to
   // perform lookup of pre-compiled code to avoid re-compilation.
   ObjectCache *ObjCache;
 
+  Function *FindFunctionNamedInModulePtrSet(const char *FnName,
+                                            ModulePtrSet::iterator I,
+                                            ModulePtrSet::iterator E);
+
+  void runStaticConstructorsDestructorsInModulePtrSet(bool isDtors,
+                                                      ModulePtrSet::iterator I,
+                                                      ModulePtrSet::iterator E);
+
 public:
   ~MCJIT();
 
   /// @name ExecutionEngine interface implementation
   /// @{
+  virtual void addModule(Module *M);
+  virtual bool removeModule(Module *M);
+
+  /// FindFunctionNamed - Search all of the active modules to find the one that
+  /// defines FnName.  This is very slow operation and shouldn't be used for
+  /// general code.
+  virtual Function *FindFunctionNamed(const char *FnName);
 
   /// Sets the object manager that MCJIT should use to avoid compilation.
   virtual void setObjectCache(ObjectCache *manager);
 
+  virtual void generateCodeForModule(Module *M);
+
   /// finalizeObject - ensure the module is fully processed and is usable.
   ///
   /// It is the user-level function for completing the process of making the
@@ -59,7 +246,17 @@ public:
   /// object have been relocated using mapSectionAddress.  When this method is
   /// called the MCJIT execution engine will reapply relocations for a loaded
   /// object.
+  /// Is it OK to finalize a set of modules, add modules and finalize again.
+  // FIXME: Do we really need both of these?
   virtual void finalizeObject();
+  virtual void finalizeModule(Module *);
+  void finalizeLoadedModules();
+
+  /// runStaticConstructorsDestructors - This method is used to execute all of
+  /// the static constructors or destructors for a program.
+  ///
+  /// \param isDtors - Run the destructors instead of constructors.
+  void runStaticConstructorsDestructors(bool isDtors);
 
   virtual void *getPointerToBasicBlock(BasicBlock *BB);
 
@@ -91,10 +288,15 @@ public:
                                  uint64_t TargetAddress) {
     Dyld.mapSectionAddress(LocalAddress, TargetAddress);
   }
-
   virtual void RegisterJITEventListener(JITEventListener *L);
   virtual void UnregisterJITEventListener(JITEventListener *L);
 
+  // If successful, these function will implicitly finalize all loaded objects.
+  // To get a function address within MCJIT without causing a finalize, use
+  // getSymbolAddress.
+  virtual uint64_t getGlobalValueAddress(const std::string &Name);
+  virtual uint64_t getFunctionAddress(const std::string &Name);
+
   /// @}
   /// @name (Private) Registration Interfaces
   /// @{
@@ -111,6 +313,11 @@ public:
 
   // @}
 
+  // This is not directly exposed via the ExecutionEngine API, but it is
+  // used by the LinkingMemoryManager.
+  uint64_t getSymbolAddress(const std::string &Name,
+                          bool CheckFunctionsOnly);
+
 protected:
   /// emitObject -- Generate a JITed object in memory from the specified module
   /// Currently, MCJIT only supports a single module and the module passed to
@@ -119,10 +326,12 @@ protected:
   /// the future.
   ObjectBufferStream* emitObject(Module *M);
 
-  void loadObject(Module *M);
-
   void NotifyObjectEmitted(const ObjectImage& Obj);
   void NotifyFreeingObject(const ObjectImage& Obj);
+
+  uint64_t getExistingSymbolAddress(const std::string &Name);
+  Module *findModuleForSymbol(const std::string &Name,
+                              bool CheckFunctionsOnly);
 };
 
 } // End llvm namespace
diff --git a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
index 650832e..cf90e77 100644
--- a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
+++ b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
@@ -19,9 +19,10 @@
 namespace llvm {
 
 uint8_t *SectionMemoryManager::allocateDataSection(uintptr_t Size,
-                                                    unsigned Alignment,
-                                                    unsigned SectionID,
-                                                    bool IsReadOnly) {
+                                                   unsigned Alignment,
+                                                   unsigned SectionID,
+                                                   StringRef SectionName,
+                                                   bool IsReadOnly) {
   if (IsReadOnly)
     return allocateSection(RODataMem, Size, Alignment);
   return allocateSection(RWDataMem, Size, Alignment);
@@ -29,7 +30,8 @@ uint8_t *SectionMemoryManager::allocateDataSection(uintptr_t Size,
 
 uint8_t *SectionMemoryManager::allocateCodeSection(uintptr_t Size,
                                                    unsigned Alignment,
-                                                   unsigned SectionID) {
+                                                   unsigned SectionID,
+                                                   StringRef SectionName) {
   return allocateSection(CodeMem, Size, Alignment);
 }
 
@@ -105,6 +107,9 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg)
   // FIXME: Should in-progress permissions be reverted if an error occurs?
   error_code ec;
 
+  // Don't allow free memory blocks to be used after setting protection flags.
+  CodeMem.FreeMem.clear();
+
   // Make code memory executable.
   ec = applyMemoryGroupPermissions(CodeMem,
                                    sys::Memory::MF_READ | sys::Memory::MF_EXEC);
@@ -115,6 +120,9 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg)
     return true;
   }
 
+  // Don't allow free memory blocks to be used after setting protection flags.
+  RODataMem.FreeMem.clear();
+
   // Make read-only data memory read-only.
   ec = applyMemoryGroupPermissions(RODataMem,
                                    sys::Memory::MF_READ | sys::Memory::MF_EXEC);
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index 38867ec..f11df82 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -20,7 +20,9 @@
 #include "llvm/IR/Function.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
 #include "llvm/ExecutionEngine/OProfileWrapper.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Errno.h"
@@ -52,6 +54,10 @@ public:
                                 const JITEvent_EmittedFunctionDetails &Details);
 
   virtual void NotifyFreeingMachineCode(void *OldPtr);
+
+  virtual void NotifyObjectEmitted(const ObjectImage &Obj);
+
+  virtual void NotifyFreeingObject(const ObjectImage &Obj);
 };
 
 void OProfileJITEventListener::initialize() {
@@ -159,6 +165,66 @@ void OProfileJITEventListener::NotifyFreeingMachineCode(void *FnStart) {
   }
 }
 
+void OProfileJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
+  if (!Wrapper.isAgentAvailable()) {
+    return;
+  }
+
+  // Use symbol info to iterate functions in the object.
+  error_code ec;
+  for (object::symbol_iterator I = Obj.begin_symbols(),
+                               E = Obj.end_symbols();
+                        I != E && !ec;
+                        I.increment(ec)) {
+    object::SymbolRef::Type SymType;
+    if (I->getType(SymType)) continue;
+    if (SymType == object::SymbolRef::ST_Function) {
+      StringRef  Name;
+      uint64_t   Addr;
+      uint64_t   Size;
+      if (I->getName(Name)) continue;
+      if (I->getAddress(Addr)) continue;
+      if (I->getSize(Size)) continue;
+
+      if (Wrapper.op_write_native_code(Name.data(), Addr, (void*)Addr, Size)
+                        == -1) {
+        DEBUG(dbgs() << "Failed to tell OProfile about native function "
+          << Name << " at ["
+          << (void*)Addr << "-" << ((char*)Addr + Size) << "]\n");
+        continue;
+      }
+      // TODO: support line number info (similar to IntelJITEventListener.cpp)
+    }
+  }
+}
+
+void OProfileJITEventListener::NotifyFreeingObject(const ObjectImage &Obj) {
+  if (!Wrapper.isAgentAvailable()) {
+    return;
+  }
+
+  // Use symbol info to iterate functions in the object.
+  error_code ec;
+  for (object::symbol_iterator I = Obj.begin_symbols(),
+                               E = Obj.end_symbols();
+                        I != E && !ec;
+                        I.increment(ec)) {
+    object::SymbolRef::Type SymType;
+    if (I->getType(SymType)) continue;
+    if (SymType == object::SymbolRef::ST_Function) {
+      uint64_t   Addr;
+      if (I->getAddress(Addr)) continue;
+
+      if (Wrapper.op_unload_native_code(Addr) == -1) {
+        DEBUG(dbgs()
+          << "Failed to tell OProfile about unload of native function at "
+          << (void*)Addr << "\n");
+        continue;
+      }
+    }
+  }
+}
+
 }  // anonymous namespace.
 
 namespace llvm {
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
index 7c0d395..61d8dc2 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
@@ -13,22 +13,20 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ExecutionEngine/OProfileWrapper.h"
-
 #define DEBUG_TYPE "oprofile-wrapper"
+#include "llvm/ExecutionEngine/OProfileWrapper.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/MutexGuard.h"
 #include "llvm/ADT/SmallString.h"
-
-#include <sstream>
 #include <cstring>
-#include <stddef.h>
 #include <dirent.h>
-#include <sys/stat.h>
 #include <fcntl.h>
+#include <sstream>
+#include <stddef.h>
+#include <sys/stat.h>
 #include <unistd.h>
 
 namespace {
@@ -143,6 +141,10 @@ bool OProfileWrapper::checkForOProfileProcEntry() {
         close(CmdLineFD);
         ssize_t Idx = 0;
 
+        if (ExeName[0] != '/') {
+          BaseName = ExeName;
+        }
+
         // Find the terminator for the first string
         while (Idx < NumRead-1 && ExeName[Idx] != 0) {
           Idx++;
@@ -161,7 +163,8 @@ bool OProfileWrapper::checkForOProfileProcEntry() {
         }
 
         // Test this to see if it is the oprofile daemon
-        if (BaseName != 0 && !strcmp("oprofiled", BaseName)) {
+        if (BaseName != 0 && (!strcmp("oprofiled", BaseName) ||
+                              !strcmp("operf", BaseName))) {
           // If it is, we're done
           closedir(ProcDir);
           return true;
diff --git a/lib/ExecutionEngine/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RTDyldMemoryManager.cpp
index 4e76457..26e1fdd 100644
--- a/lib/ExecutionEngine/RTDyldMemoryManager.cpp
+++ b/lib/ExecutionEngine/RTDyldMemoryManager.cpp
@@ -33,8 +33,8 @@ namespace llvm {
 RTDyldMemoryManager::~RTDyldMemoryManager() {}
 
 // Determine whether we can register EH tables.
-#if (defined(__GNUC__) && !defined(__ARM_EABI__) && \
-     !defined(__USING_SJLJ_EXCEPTIONS__))
+#if (defined(__GNUC__) && !defined(__ARM_EABI__) && !defined(__ia64__) && \
+     !defined(__SEH__) && !defined(__USING_SJLJ_EXCEPTIONS__))
 #define HAVE_EHTABLE_SUPPORT 1
 #else
 #define HAVE_EHTABLE_SUPPORT 0
@@ -42,35 +42,180 @@ RTDyldMemoryManager::~RTDyldMemoryManager() {}
 
 #if HAVE_EHTABLE_SUPPORT
 extern "C" void __register_frame(void*);
+extern "C" void __deregister_frame(void*);
+#else
+// The building compiler does not have __(de)register_frame but
+// it may be found at runtime in a dynamically-loaded library.
+// For example, this happens when building LLVM with Visual C++
+// but using the MingW runtime.
+void __register_frame(void *p) {
+  static bool Searched = false;
+  static void *rf = 0;
+
+  if (!Searched) {
+    Searched = true;
+    rf = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(
+                                                      "__register_frame");
+  }
+  if (rf)
+    ((void (*)(void *))rf)(p);
+}
+
+void __deregister_frame(void *p) {
+  static bool Searched = false;
+  static void *df = 0;
+
+  if (!Searched) {
+    Searched = true;
+    df = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(
+                                                      "__deregister_frame");
+  }
+  if (df)
+    ((void (*)(void *))df)(p);
+}
+#endif
+
+#ifdef __APPLE__
 
-static const char *processFDE(const char *Entry) {
+static const char *processFDE(const char *Entry, bool isDeregister) {
   const char *P = Entry;
   uint32_t Length = *((const uint32_t *)P);
   P += 4;
   uint32_t Offset = *((const uint32_t *)P);
-  if (Offset != 0)
-    __register_frame(const_cast<char *>(Entry));
+  if (Offset != 0) {
+    if (isDeregister)
+      __deregister_frame(const_cast<char *>(Entry));
+    else
+      __register_frame(const_cast<char *>(Entry));
+  }
   return P + Length;
 }
-#endif
 
-void RTDyldMemoryManager::registerEHFrames(StringRef SectionData) {
-#if HAVE_EHTABLE_SUPPORT
-  const char *P = SectionData.data();
-  const char *End = SectionData.data() + SectionData.size();
+// This implementation handles frame registration for local targets.
+// Memory managers for remote targets should re-implement this function
+// and use the LoadAddr parameter.
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  // On OS X OS X __register_frame takes a single FDE as an argument.
+  // See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html
+  const char *P = (const char *)Addr;
+  const char *End = P + Size;
   do  {
-    P = processFDE(P);
+    P = processFDE(P, false);
   } while(P != End);
-#endif
 }
 
+void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  const char *P = (const char *)Addr;
+  const char *End = P + Size;
+  do  {
+    P = processFDE(P, true);
+  } while(P != End);
+}
+
+#else
+
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  // On Linux __register_frame takes a single argument: 
+  // a pointer to the start of the .eh_frame section.
+
+  // How can it find the end? Because crtendS.o is linked 
+  // in and it has an .eh_frame section with four zero chars.
+  __register_frame(Addr);
+}
+
+void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  __deregister_frame(Addr);
+}
+
+#endif
+
 static int jit_noop() {
   return 0;
 }
 
-void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
-                                                     bool AbortOnFailure) {
-#if defined(__linux__)
+// ARM math functions are statically linked on Android from libgcc.a, but not
+// available at runtime for dynamic linking. On Linux these are usually placed
+// in libgcc_s.so so can be found by normal dynamic lookup.
+#if defined(__BIONIC__) && defined(__arm__)
+// List of functions which are statically linked on Android and can be generated
+// by LLVM. This is done as a nested macro which is used once to declare the
+// imported functions with ARM_MATH_DECL and once to compare them to the
+// user-requested symbol in getSymbolAddress with ARM_MATH_CHECK. The test
+// assumes that all functions start with __aeabi_ and getSymbolAddress must be
+// modified if that changes.
+#define ARM_MATH_IMPORTS(PP) \
+  PP(__aeabi_d2f) \
+  PP(__aeabi_d2iz) \
+  PP(__aeabi_d2lz) \
+  PP(__aeabi_d2uiz) \
+  PP(__aeabi_d2ulz) \
+  PP(__aeabi_dadd) \
+  PP(__aeabi_dcmpeq) \
+  PP(__aeabi_dcmpge) \
+  PP(__aeabi_dcmpgt) \
+  PP(__aeabi_dcmple) \
+  PP(__aeabi_dcmplt) \
+  PP(__aeabi_dcmpun) \
+  PP(__aeabi_ddiv) \
+  PP(__aeabi_dmul) \
+  PP(__aeabi_dsub) \
+  PP(__aeabi_f2d) \
+  PP(__aeabi_f2iz) \
+  PP(__aeabi_f2lz) \
+  PP(__aeabi_f2uiz) \
+  PP(__aeabi_f2ulz) \
+  PP(__aeabi_fadd) \
+  PP(__aeabi_fcmpeq) \
+  PP(__aeabi_fcmpge) \
+  PP(__aeabi_fcmpgt) \
+  PP(__aeabi_fcmple) \
+  PP(__aeabi_fcmplt) \
+  PP(__aeabi_fcmpun) \
+  PP(__aeabi_fdiv) \
+  PP(__aeabi_fmul) \
+  PP(__aeabi_fsub) \
+  PP(__aeabi_i2d) \
+  PP(__aeabi_i2f) \
+  PP(__aeabi_idiv) \
+  PP(__aeabi_idivmod) \
+  PP(__aeabi_l2d) \
+  PP(__aeabi_l2f) \
+  PP(__aeabi_lasr) \
+  PP(__aeabi_ldivmod) \
+  PP(__aeabi_llsl) \
+  PP(__aeabi_llsr) \
+  PP(__aeabi_lmul) \
+  PP(__aeabi_ui2d) \
+  PP(__aeabi_ui2f) \
+  PP(__aeabi_uidiv) \
+  PP(__aeabi_uidivmod) \
+  PP(__aeabi_ul2d) \
+  PP(__aeabi_ul2f) \
+  PP(__aeabi_uldivmod)
+
+// Declare statically linked math functions on ARM. The function declarations
+// here do not have the correct prototypes for each function in
+// ARM_MATH_IMPORTS, but it doesn't matter because only the symbol addresses are
+// needed. In particular the __aeabi_*divmod functions do not have calling
+// conventions which match any C prototype.
+#define ARM_MATH_DECL(name) extern "C" void name();
+ARM_MATH_IMPORTS(ARM_MATH_DECL)
+#undef ARM_MATH_DECL
+#endif
+
+uint64_t RTDyldMemoryManager::getSymbolAddress(const std::string &Name) {
+  // This implementation assumes that the host program is the target.
+  // Clients generating code for a remote target should implement their own
+  // memory manager.
+#if defined(__linux__) && defined(__GLIBC__)
   //===--------------------------------------------------------------------===//
   // Function stubs that are invoked instead of certain library calls
   //
@@ -80,15 +225,26 @@ void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
   // not inlined, and hiding their real definitions in a separate archive file
   // that the dynamic linker can't see. For more info, search for
   // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
-  if (Name == "stat") return (void*)(intptr_t)&stat;
-  if (Name == "fstat") return (void*)(intptr_t)&fstat;
-  if (Name == "lstat") return (void*)(intptr_t)&lstat;
-  if (Name == "stat64") return (void*)(intptr_t)&stat64;
-  if (Name == "fstat64") return (void*)(intptr_t)&fstat64;
-  if (Name == "lstat64") return (void*)(intptr_t)&lstat64;
-  if (Name == "atexit") return (void*)(intptr_t)&atexit;
-  if (Name == "mknod") return (void*)(intptr_t)&mknod;
-#endif // __linux__
+  if (Name == "stat") return (uint64_t)&stat;
+  if (Name == "fstat") return (uint64_t)&fstat;
+  if (Name == "lstat") return (uint64_t)&lstat;
+  if (Name == "stat64") return (uint64_t)&stat64;
+  if (Name == "fstat64") return (uint64_t)&fstat64;
+  if (Name == "lstat64") return (uint64_t)&lstat64;
+  if (Name == "atexit") return (uint64_t)&atexit;
+  if (Name == "mknod") return (uint64_t)&mknod;
+#endif // __linux__ && __GLIBC__
+  
+  // See ARM_MATH_IMPORTS definition for explanation
+#if defined(__BIONIC__) && defined(__arm__)
+  if (Name.compare(0, 8, "__aeabi_") == 0) {
+    // Check if the user has requested any of the functions listed in
+    // ARM_MATH_IMPORTS, and if so redirect to the statically linked symbol.
+#define ARM_MATH_CHECK(fn) if (Name == #fn) return (uint64_t)&fn;
+    ARM_MATH_IMPORTS(ARM_MATH_CHECK)
+#undef ARM_MATH_CHECK
+  }
+#endif
 
   // We should not invoke parent's ctors/dtors from generated main()!
   // On Mingw and Cygwin, the symbol __main is resolved to
@@ -96,23 +252,31 @@ void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
   // (and register wrong callee's dtors with atexit(3)).
   // We expect ExecutionEngine::runStaticConstructorsDestructors()
   // is called before ExecutionEngine::runFunctionAsMain() is called.
-  if (Name == "__main") return (void*)(intptr_t)&jit_noop;
+  if (Name == "__main") return (uint64_t)&jit_noop;
 
   const char *NameStr = Name.c_str();
   void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
-  if (Ptr) return Ptr;
+  if (Ptr)
+    return (uint64_t)Ptr;
 
   // If it wasn't found and if it starts with an underscore ('_') character,
   // try again without the underscore.
   if (NameStr[0] == '_') {
     Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
-    if (Ptr) return Ptr;
+    if (Ptr)
+      return (uint64_t)Ptr;
   }
+  return 0;
+}
 
-  if (AbortOnFailure)
+void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
+                                                     bool AbortOnFailure) {
+  uint64_t Addr = getSymbolAddress(Name);
+
+  if (!Addr && AbortOnFailure)
     report_fatal_error("Program used external function '" + Name +
                        "' which could not be resolved!");
-  return 0;
+  return (void*)Addr;
 }
 
 } // namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
index 69e9dbe..6a514ea 100644
--- a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
+++ b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
@@ -16,6 +16,7 @@ namespace llvm {
 
 /// Global access point for the JIT debugging interface.
 class JITRegistrar {
+  virtual void anchor();
 public:
   /// Instantiates the JIT service.
   JITRegistrar() {}
diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
index 89350cc..9cbde5d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
+++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
@@ -23,6 +23,7 @@ namespace llvm {
 class ObjectImageCommon : public ObjectImage {
   ObjectImageCommon(); // = delete
   ObjectImageCommon(const ObjectImageCommon &other); // = delete
+  virtual void anchor();
 
 protected:
   object::ObjectFile *ObjFile;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 943622f..161135a 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -13,12 +13,14 @@
 
 #define DEBUG_TYPE "dyld"
 #include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "JITRegistrar.h"
 #include "ObjectImageCommon.h"
 #include "RuntimeDyldELF.h"
 #include "RuntimeDyldImpl.h"
 #include "RuntimeDyldMachO.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MutexGuard.h"
 #include "llvm/Object/ELF.h"
 
 using namespace llvm;
@@ -27,30 +29,44 @@ using namespace llvm::object;
 // Empty out-of-line virtual destructor as the key function.
 RuntimeDyldImpl::~RuntimeDyldImpl() {}
 
+// Pin the JITRegistrar's and ObjectImage*'s vtables to this file.
+void JITRegistrar::anchor() {}
+void ObjectImage::anchor() {}
+void ObjectImageCommon::anchor() {}
+
 namespace llvm {
 
-StringRef RuntimeDyldImpl::getEHFrameSection() {
-  return StringRef();
+void RuntimeDyldImpl::registerEHFrames() {
+}
+
+void RuntimeDyldImpl::deregisterEHFrames() {
 }
 
 // Resolve the relocations for all symbols we currently know about.
 void RuntimeDyldImpl::resolveRelocations() {
+  MutexGuard locked(lock);
+
   // First, resolve relocations associated with external symbols.
   resolveExternalSymbols();
 
   // Just iterate over the sections we have and resolve all the relocations
   // in them. Gross overkill, but it gets the job done.
   for (int i = 0, e = Sections.size(); i != e; ++i) {
+    // The Section here (Sections[i]) refers to the section in which the
+    // symbol for the relocation is located.  The SectionID in the relocation
+    // entry provides the section to which the relocation will be applied.
     uint64_t Addr = Sections[i].LoadAddress;
     DEBUG(dbgs() << "Resolving relocations Section #" << i
             << "\t" << format("%p", (uint8_t *)Addr)
             << "\n");
     resolveRelocationList(Relocations[i], Addr);
+    Relocations.erase(i);
   }
 }
 
 void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress,
                                         uint64_t TargetAddress) {
+  MutexGuard locked(lock);
   for (unsigned i = 0, e = Sections.size(); i != e; ++i) {
     if (Sections[i].Address == LocalAddress) {
       reassignSectionAddress(i, TargetAddress);
@@ -67,11 +83,15 @@ ObjectImage *RuntimeDyldImpl::createObjectImage(ObjectBuffer *InputBuffer) {
 }
 
 ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) {
+  MutexGuard locked(lock);
+
   OwningPtr<ObjectImage> obj(createObjectImage(InputBuffer));
   if (!obj)
     report_fatal_error("Unable to create object image from memory buffer!");
 
+  // Save information about our target
   Arch = (Triple::ArchType)obj->getArch();
+  IsTargetLittleEndian = obj->getObjectFile()->isLittleEndian();
 
   // Symbols found in this object
   StringMap<SymbolLoc> LocalSymbols;
@@ -166,6 +186,9 @@ ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) {
     }
   }
 
+  // Give the subclasses a chance to tie-up any loose ends.
+  finalizeLoad(LocalSections);
+
   return obj.take();
 }
 
@@ -175,8 +198,8 @@ void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
                                         SymbolTableMap &SymbolTable) {
   // Allocate memory for the section
   unsigned SectionID = Sections.size();
-  uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, sizeof(void*),
-                                              SectionID, false);
+  uint8_t *Addr = MemMgr->allocateDataSection(
+    TotalSize, sizeof(void*), SectionID, StringRef(), false);
   if (!Addr)
     report_fatal_error("Unable to allocate memory for common symbols!");
   uint64_t Offset = 0;
@@ -247,6 +270,7 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
   bool IsZeroInit;
   bool IsReadOnly;
   uint64_t DataSize;
+  unsigned PaddingSize = 0;
   StringRef Name;
   Check(Section.isRequiredForExecution(IsRequired));
   Check(Section.isVirtual(IsVirtual));
@@ -261,6 +285,12 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
       StubBufSize += StubAlignment - EndAlignment;
   }
 
+  // The .eh_frame section (at least on Linux) needs an extra four bytes padded
+  // with zeroes added at the end.  For MachO objects, this section has a
+  // slightly different name, so this won't have any effect for MachO objects.
+  if (Name == ".eh_frame")
+    PaddingSize = 4;
+
   unsigned Allocate;
   unsigned SectionID = Sections.size();
   uint8_t *Addr;
@@ -269,10 +299,11 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
   // Some sections, such as debug info, don't need to be loaded for execution.
   // Leave those where they are.
   if (IsRequired) {
-    Allocate = DataSize + StubBufSize;
+    Allocate = DataSize + PaddingSize + StubBufSize;
     Addr = IsCode
-      ? MemMgr->allocateCodeSection(Allocate, Alignment, SectionID)
-      : MemMgr->allocateDataSection(Allocate, Alignment, SectionID, IsReadOnly);
+      ? MemMgr->allocateCodeSection(Allocate, Alignment, SectionID, Name)
+      : MemMgr->allocateDataSection(Allocate, Alignment, SectionID, Name,
+                                    IsReadOnly);
     if (!Addr)
       report_fatal_error("Unable to allocate section memory!");
 
@@ -286,6 +317,13 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
     else
       memcpy(Addr, pData, DataSize);
 
+    // Fill in any extra bytes we allocated for padding
+    if (PaddingSize != 0) {
+      memset(Addr + DataSize, 0, PaddingSize);
+      // Update the DataSize variable so that the stub offset is set correctly.
+      DataSize += PaddingSize;
+    }
+
     DEBUG(dbgs() << "emitSection SectionID: " << SectionID
                  << " Name: " << Name
                  << " obj addr: " << format("%p", pData)
@@ -421,6 +459,10 @@ uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) {
     writeInt16BE(Addr+6,  0x07F1);     // brc 15,%r1
     // 8-byte address stored at Addr + 8
     return Addr;
+  } else if (Arch == Triple::x86_64) {
+    *Addr      = 0xFF; // jmp
+    *(Addr+1)  = 0x25; // rip
+    // 32-bit PC-relative address of the GOT entry will be stored at Addr+2
   }
   return Addr;
 }
@@ -454,30 +496,52 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs,
 }
 
 void RuntimeDyldImpl::resolveExternalSymbols() {
-  StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin(),
-                                      e = ExternalSymbolRelocations.end();
-  for (; i != e; i++) {
+  while(!ExternalSymbolRelocations.empty()) {
+    StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin();
+
     StringRef Name = i->first();
-    RelocationList &Relocs = i->second;
-    SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name);
-    if (Loc == GlobalSymbolTable.end()) {
-      if (Name.size() == 0) {
-        // This is an absolute symbol, use an address of zero.
-        DEBUG(dbgs() << "Resolving absolute relocations." << "\n");
-        resolveRelocationList(Relocs, 0);
+    if (Name.size() == 0) {
+      // This is an absolute symbol, use an address of zero.
+      DEBUG(dbgs() << "Resolving absolute relocations." << "\n");
+      RelocationList &Relocs = i->second;
+      resolveRelocationList(Relocs, 0);
+    } else {
+      uint64_t Addr = 0;
+      SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name);
+      if (Loc == GlobalSymbolTable.end()) {
+          // This is an external symbol, try to get its address from
+          // MemoryManager.
+          Addr = MemMgr->getSymbolAddress(Name.data());
+          // The call to getSymbolAddress may have caused additional modules to
+          // be loaded, which may have added new entries to the
+          // ExternalSymbolRelocations map.  Consquently, we need to update our
+          // iterator.  This is also why retrieval of the relocation list
+          // associated with this symbol is deferred until below this point.
+          // New entries may have been added to the relocation list.
+          i = ExternalSymbolRelocations.find(Name);
       } else {
-        // This is an external symbol, try to get its address from
-        // MemoryManager.
-        uint8_t *Addr = (uint8_t*) MemMgr->getPointerToNamedFunction(Name.data(),
-                                                                   true);
-        DEBUG(dbgs() << "Resolving relocations Name: " << Name
-                << "\t" << format("%p", Addr)
-                << "\n");
-        resolveRelocationList(Relocs, (uintptr_t)Addr);
+        // We found the symbol in our global table.  It was probably in a
+        // Module that we loaded previously.
+        SymbolLoc SymLoc = Loc->second;
+        Addr = getSectionLoadAddress(SymLoc.first) + SymLoc.second;
       }
-    } else {
-      report_fatal_error("Expected external symbol");
+
+      // FIXME: Implement error handling that doesn't kill the host program!
+      if (!Addr)
+        report_fatal_error("Program used external function '" + Name +
+                          "' which could not be resolved!");
+
+      updateGOTEntries(Name, Addr);
+      DEBUG(dbgs() << "Resolving relocations Name: " << Name
+              << "\t" << format("0x%lx", Addr)
+              << "\n");
+      // This list may have been updated when we called getSymbolAddress, so
+      // don't change this code to get the list earlier.
+      RelocationList &Relocs = i->second;
+      resolveRelocationList(Relocs, Addr);
     }
+
+    ExternalSymbolRelocations.erase(i);
   }
 }
 
@@ -526,8 +590,10 @@ ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) {
     case sys::fs::file_magic::bitcode:
     case sys::fs::file_magic::archive:
     case sys::fs::file_magic::coff_object:
+    case sys::fs::file_magic::coff_import_library:
     case sys::fs::file_magic::pecoff_executable:
     case sys::fs::file_magic::macho_universal_binary:
+    case sys::fs::file_magic::windows_resource:
       report_fatal_error("Incompatible object format!");
     }
   } else {
@@ -539,10 +605,14 @@ ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) {
 }
 
 void *RuntimeDyld::getSymbolAddress(StringRef Name) {
+  if (!Dyld)
+    return NULL;
   return Dyld->getSymbolAddress(Name);
 }
 
 uint64_t RuntimeDyld::getSymbolLoadAddress(StringRef Name) {
+  if (!Dyld)
+    return 0;
   return Dyld->getSymbolLoadAddress(Name);
 }
 
@@ -564,8 +634,14 @@ StringRef RuntimeDyld::getErrorString() {
   return Dyld->getErrorString();
 }
 
-StringRef RuntimeDyld::getEHFrameSection() {
-  return Dyld->getEHFrameSection();
+void RuntimeDyld::registerEHFrames() {
+  if (Dyld)
+    Dyld->registerEHFrames();
+}
+
+void RuntimeDyld::deregisterEHFrames() {
+  if (Dyld)
+    Dyld->deregisterEHFrames();
 }
 
 } // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index cd99c3c..f2c69fc 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -22,7 +22,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/ExecutionEngine/ObjectBuffer.h"
 #include "llvm/ExecutionEngine/ObjectImage.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/ELF.h"
 using namespace llvm;
@@ -151,12 +151,31 @@ void DyldELFObject<ELFT>::updateSymbolAddress(const SymbolRef &SymRef,
 
 namespace llvm {
 
-StringRef RuntimeDyldELF::getEHFrameSection() {
-  for (int i = 0, e = Sections.size(); i != e; ++i) {
-    if (Sections[i].Name == ".eh_frame")
-      return StringRef((const char*)Sections[i].Address, Sections[i].Size);
+void RuntimeDyldELF::registerEHFrames() {
+  if (!MemMgr)
+    return;
+  for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) {
+    SID EHFrameSID = UnregisteredEHFrameSections[i];
+    uint8_t *EHFrameAddr = Sections[EHFrameSID].Address;
+    uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress;
+    size_t EHFrameSize = Sections[EHFrameSID].Size;
+    MemMgr->registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
+    RegisteredEHFrameSections.push_back(EHFrameSID);
   }
-  return StringRef();
+  UnregisteredEHFrameSections.clear();
+}
+
+void RuntimeDyldELF::deregisterEHFrames() {
+  if (!MemMgr)
+    return;
+  for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) {
+    SID EHFrameSID = RegisteredEHFrameSections[i];
+    uint8_t *EHFrameAddr = Sections[EHFrameSID].Address;
+    uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress;
+    size_t EHFrameSize = Sections[EHFrameSID].Size;
+    MemMgr->deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
+  }
+  RegisteredEHFrameSections.clear();
 }
 
 ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) {
@@ -202,7 +221,8 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
                                              uint64_t Offset,
                                              uint64_t Value,
                                              uint32_t Type,
-                                             int64_t Addend) {
+                                             int64_t  Addend,
+                                             uint64_t SymOffset) {
   switch (Type) {
   default:
     llvm_unreachable("Relocation type not implemented yet!");
@@ -227,6 +247,21 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
                  << " at " << format("%p\n",Target));
     break;
   }
+  case ELF::R_X86_64_GOTPCREL: {
+    // findGOTEntry returns the 'G + GOT' part of the relocation calculation
+    // based on the load/target address of the GOT (not the current/local addr).
+    uint64_t GOTAddr = findGOTEntry(Value, SymOffset);
+    uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
+    uint64_t  FinalAddress = Section.LoadAddress + Offset;
+    // The processRelocationRef method combines the symbol offset and the addend
+    // and in most cases that's what we want.  For this relocation type, we need
+    // the raw addend, so we subtract the symbol offset to get it.
+    int64_t RealOffset = GOTAddr + Addend - SymOffset - FinalAddress;
+    assert(RealOffset <= INT32_MAX && RealOffset >= INT32_MIN);
+    int32_t TruncOffset = (RealOffset & 0xFFFFFFFF);
+    *Target = TruncOffset;
+    break;
+  }
   case ELF::R_X86_64_PC32: {
     // Get the placeholder value from the generated object since
     // a previous relocation attempt may have overwritten the loaded version
@@ -240,6 +275,16 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
     *Target = TruncOffset;
     break;
   }
+  case ELF::R_X86_64_PC64: {
+    // Get the placeholder value from the generated object since
+    // a previous relocation attempt may have overwritten the loaded version
+    uint64_t *Placeholder = reinterpret_cast<uint64_t*>(Section.ObjAddress
+                                                                   + Offset);
+    uint64_t *Target = reinterpret_cast<uint64_t*>(Section.Address + Offset);
+    uint64_t  FinalAddress = Section.LoadAddress + Offset;
+    *Target = *Placeholder + Value + Addend - FinalAddress;
+    break;
+  }
   }
 }
 
@@ -304,7 +349,7 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
   }
   case ELF::R_AARCH64_PREL32: {
     uint64_t Result = Value + Addend - FinalAddress;
-    assert(static_cast<int64_t>(Result) >= INT32_MIN && 
+    assert(static_cast<int64_t>(Result) >= INT32_MIN &&
            static_cast<int64_t>(Result) <= UINT32_MAX);
     *TargetPtr = static_cast<uint32_t>(Result & 0xffffffffU);
     break;
@@ -316,7 +361,7 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
     uint64_t BranchImm = Value + Addend - FinalAddress;
 
     // "Check that -2^27 <= result < 2^27".
-    assert(-(1LL << 27) <= static_cast<int64_t>(BranchImm) && 
+    assert(-(1LL << 27) <= static_cast<int64_t>(BranchImm) &&
            static_cast<int64_t>(BranchImm) < (1LL << 27));
 
     // AArch64 code is emitted with .rela relocations. The data already in any
@@ -341,7 +386,6 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
   case ELF::R_AARCH64_MOVW_UABS_G2_NC: {
     uint64_t Result = Value + Addend;
 
-
     // AArch64 code is emitted with .rela relocations. The data already in any
     // bits affected by the relocation on entry is garbage.
     *TargetPtr &= 0xffe0001fU;
@@ -585,7 +629,7 @@ void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj,
       // Finally compares the Symbol value and the target symbol offset
       // to check if this .opd entry refers to the symbol the relocation
       // points to.
-      if (Rel.Addend != (intptr_t)TargetSymbolOffset)
+      if (Rel.Addend != (int64_t)TargetSymbolOffset)
         continue;
 
       section_iterator tsi(Obj.end_sections());
@@ -735,20 +779,42 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section,
   }
 }
 
+// The target location for the relocation is described by RE.SectionID and
+// RE.Offset.  RE.SectionID can be used to find the SectionEntry.  Each
+// SectionEntry has three members describing its location.
+// SectionEntry::Address is the address at which the section has been loaded
+// into memory in the current (host) process.  SectionEntry::LoadAddress is the
+// address that the section will have in the target process.
+// SectionEntry::ObjAddress is the address of the bits for this section in the
+// original emitted object image (also in the current address space).
+//
+// Relocations will be applied as if the section were loaded at
+// SectionEntry::LoadAddress, but they will be applied at an address based
+// on SectionEntry::Address.  SectionEntry::ObjAddress will be used to refer to
+// Target memory contents if they are required for value calculations.
+//
+// The Value parameter here is the load address of the symbol for the
+// relocation to be applied.  For relocations which refer to symbols in the
+// current object Value will be the LoadAddress of the section in which
+// the symbol resides (RE.Addend provides additional information about the
+// symbol location).  For external symbols, Value will be the address of the
+// symbol in the target address space.
 void RuntimeDyldELF::resolveRelocation(const RelocationEntry &RE,
-				       uint64_t Value) {
+                                       uint64_t Value) {
   const SectionEntry &Section = Sections[RE.SectionID];
-  return resolveRelocation(Section, RE.Offset, Value, RE.RelType, RE.Addend);
+  return resolveRelocation(Section, RE.Offset, Value, RE.RelType, RE.Addend,
+                           RE.SymOffset);
 }
 
 void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section,
                                        uint64_t Offset,
                                        uint64_t Value,
                                        uint32_t Type,
-                                       int64_t Addend) {
+                                       int64_t  Addend,
+                                       uint64_t SymOffset) {
   switch (Arch) {
   case Triple::x86_64:
-    resolveX86_64Relocation(Section, Offset, Value, Type, Addend);
+    resolveX86_64Relocation(Section, Offset, Value, Type, Addend, SymOffset);
     break;
   case Triple::x86:
     resolveX86Relocation(Section, Offset,
@@ -811,6 +877,7 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
   }
   if (lsi != Symbols.end()) {
     Value.SectionID = lsi->second.first;
+    Value.Offset = lsi->second.second;
     Value.Addend = lsi->second.second + Addend;
   } else {
     // Search for the symbol in the global symbol table
@@ -819,6 +886,7 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
       gsi = GlobalSymbolTable.find(TargetName.data());
     if (gsi != GlobalSymbolTable.end()) {
       Value.SectionID = gsi->second.first;
+      Value.Offset = gsi->second.second;
       Value.Addend = gsi->second.second + Addend;
     } else {
       switch (SymType) {
@@ -841,9 +909,17 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
           Value.Addend = Addend;
           break;
         }
+        case SymbolRef::ST_Data:
         case SymbolRef::ST_Unknown: {
           Value.SymbolName = TargetName.data();
           Value.Addend = Addend;
+
+          // Absolute relocations will have a zero symbol ID (STN_UNDEF), which
+          // will manifest here as a NULL symbol name.
+          // We can set this as a valid (but empty) symbol name, and rely
+          // on addRelocationForSymbol to handle this.
+          if (!Value.SymbolName)
+              Value.SymbolName = "";
           break;
         }
         default:
@@ -955,8 +1031,8 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
     //  Look up for existing stub.
     StubMap::const_iterator i = Stubs.find(Value);
     if (i != Stubs.end()) {
-      resolveRelocation(Section, Offset,
-                        (uint64_t)Section.Address + i->second, RelType, 0);
+      RelocationEntry RE(SectionID, Offset, RelType, i->second);
+      addRelocationForSection(RE, SectionID);
       DEBUG(dbgs() << " Stub function found\n");
     } else {
       // Create a new stub function.
@@ -981,9 +1057,8 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
         addRelocationForSection(RELo, Value.SectionID);
       }
 
-      resolveRelocation(Section, Offset,
-                        (uint64_t)Section.Address + Section.StubOffset,
-                        RelType, 0);
+      RelocationEntry RE(SectionID, Offset, RelType, Section.StubOffset);
+      addRelocationForSection(RE, SectionID);
       Section.StubOffset += getMaxStubSize();
     }
   } else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
@@ -1069,7 +1144,10 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
       RelocationEntry RE(SectionID, Offset, RelType, Value.Addend);
       // Extra check to avoid relocation againt empty symbols (usually
       // the R_PPC64_TOC).
-      if (Value.SymbolName && !TargetName.empty())
+      if (SymType != SymbolRef::ST_Unknown && TargetName.empty())
+        Value.SymbolName = NULL;
+
+      if (Value.SymbolName)
         addRelocationForSymbol(RE, Value.SymbolName);
       else
         addRelocationForSection(RE, Value.SectionID);
@@ -1121,8 +1199,67 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
                         ELF::R_390_PC32DBL, Addend);
     else
       resolveRelocation(Section, Offset, StubAddress, RelType, Addend);
+  } else if (Arch == Triple::x86_64 && RelType == ELF::R_X86_64_PLT32) {
+    // The way the PLT relocations normally work is that the linker allocates the
+    // PLT and this relocation makes a PC-relative call into the PLT.  The PLT
+    // entry will then jump to an address provided by the GOT.  On first call, the
+    // GOT address will point back into PLT code that resolves the symbol.  After
+    // the first call, the GOT entry points to the actual function.
+    //
+    // For local functions we're ignoring all of that here and just replacing
+    // the PLT32 relocation type with PC32, which will translate the relocation
+    // into a PC-relative call directly to the function. For external symbols we
+    // can't be sure the function will be within 2^32 bytes of the call site, so
+    // we need to create a stub, which calls into the GOT.  This case is
+    // equivalent to the usual PLT implementation except that we use the stub
+    // mechanism in RuntimeDyld (which puts stubs at the end of the section)
+    // rather than allocating a PLT section.
+    if (Value.SymbolName) {
+      // This is a call to an external function.
+      // Look for an existing stub.
+      SectionEntry &Section = Sections[SectionID];
+      StubMap::const_iterator i = Stubs.find(Value);
+      uintptr_t StubAddress;
+      if (i != Stubs.end()) {
+        StubAddress = uintptr_t(Section.Address) + i->second;
+        DEBUG(dbgs() << " Stub function found\n");
+      } else {
+        // Create a new stub function (equivalent to a PLT entry).
+        DEBUG(dbgs() << " Create a new stub function\n");
+
+        uintptr_t BaseAddress = uintptr_t(Section.Address);
+        uintptr_t StubAlignment = getStubAlignment();
+        StubAddress = (BaseAddress + Section.StubOffset +
+                      StubAlignment - 1) & -StubAlignment;
+        unsigned StubOffset = StubAddress - BaseAddress;
+        Stubs[Value] = StubOffset;
+        createStubFunction((uint8_t *)StubAddress);
+
+        // Create a GOT entry for the external function.
+        GOTEntries.push_back(Value);
+
+        // Make our stub function a relative call to the GOT entry.
+        RelocationEntry RE(SectionID, StubOffset + 2,
+                           ELF::R_X86_64_GOTPCREL, -4);
+        addRelocationForSymbol(RE, Value.SymbolName);
+
+        // Bump our stub offset counter
+        Section.StubOffset = StubOffset + getMaxStubSize();
+      }
+
+      // Make the target call a call into the stub table.
+      resolveRelocation(Section, Offset, StubAddress,
+                      ELF::R_X86_64_PC32, Addend);
+    } else {
+      RelocationEntry RE(SectionID, Offset, ELF::R_X86_64_PC32, Value.Addend,
+                         Value.Offset);
+      addRelocationForSection(RE, Value.SectionID);
+    }
   } else {
-    RelocationEntry RE(SectionID, Offset, RelType, Value.Addend);
+    if (Arch == Triple::x86_64 && RelType == ELF::R_X86_64_GOTPCREL) {
+      GOTEntries.push_back(Value);
+    }
+    RelocationEntry RE(SectionID, Offset, RelType, Value.Addend, Value.Offset);
     if (Value.SymbolName)
       addRelocationForSymbol(RE, Value.SymbolName);
     else
@@ -1130,6 +1267,137 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
   }
 }
 
+void RuntimeDyldELF::updateGOTEntries(StringRef Name, uint64_t Addr) {
+
+  SmallVectorImpl<std::pair<SID, GOTRelocations> >::iterator it;
+  SmallVectorImpl<std::pair<SID, GOTRelocations> >::iterator end = GOTs.end();
+
+  for (it = GOTs.begin(); it != end; ++it) {
+    GOTRelocations &GOTEntries = it->second;
+    for (int i = 0, e = GOTEntries.size(); i != e; ++i) {
+      if (GOTEntries[i].SymbolName != 0 && GOTEntries[i].SymbolName == Name) {
+        GOTEntries[i].Offset = Addr;
+      }
+    }
+  }
+}
+
+size_t RuntimeDyldELF::getGOTEntrySize() {
+  // We don't use the GOT in all of these cases, but it's essentially free
+  // to put them all here.
+  size_t Result = 0;
+  switch (Arch) {
+  case Triple::x86_64:
+  case Triple::aarch64:
+  case Triple::ppc64:
+  case Triple::ppc64le:
+  case Triple::systemz:
+    Result = sizeof(uint64_t);
+    break;
+  case Triple::x86:
+  case Triple::arm:
+  case Triple::thumb:
+  case Triple::mips:
+  case Triple::mipsel:
+    Result = sizeof(uint32_t);
+    break;
+  default: llvm_unreachable("Unsupported CPU type!");
+  }
+  return Result;
+}
+
+uint64_t RuntimeDyldELF::findGOTEntry(uint64_t LoadAddress,
+                                      uint64_t Offset) {
+
+  const size_t GOTEntrySize = getGOTEntrySize();
+
+  SmallVectorImpl<std::pair<SID, GOTRelocations> >::const_iterator it;
+  SmallVectorImpl<std::pair<SID, GOTRelocations> >::const_iterator end = GOTs.end();
+
+  int GOTIndex = -1;
+  for (it = GOTs.begin(); it != end; ++it) {
+    SID GOTSectionID = it->first;
+    const GOTRelocations &GOTEntries = it->second;
+
+    // Find the matching entry in our vector.
+    uint64_t SymbolOffset = 0;
+    for (int i = 0, e = GOTEntries.size(); i != e; ++i) {
+      if (GOTEntries[i].SymbolName == 0) {
+        if (getSectionLoadAddress(GOTEntries[i].SectionID) == LoadAddress &&
+            GOTEntries[i].Offset == Offset) {
+          GOTIndex = i;
+          SymbolOffset = GOTEntries[i].Offset;
+          break;
+        }
+      } else {
+        // GOT entries for external symbols use the addend as the address when
+        // the external symbol has been resolved.
+        if (GOTEntries[i].Offset == LoadAddress) {
+          GOTIndex = i;
+          // Don't use the Addend here.  The relocation handler will use it.
+          break;
+        }
+      }
+    }
+
+    if (GOTIndex != -1) {
+      if (GOTEntrySize == sizeof(uint64_t)) {
+        uint64_t *LocalGOTAddr = (uint64_t*)getSectionAddress(GOTSectionID);
+        // Fill in this entry with the address of the symbol being referenced.
+        LocalGOTAddr[GOTIndex] = LoadAddress + SymbolOffset;
+      } else {
+        uint32_t *LocalGOTAddr = (uint32_t*)getSectionAddress(GOTSectionID);
+        // Fill in this entry with the address of the symbol being referenced.
+        LocalGOTAddr[GOTIndex] = (uint32_t)(LoadAddress + SymbolOffset);
+      }
+
+      // Calculate the load address of this entry
+      return getSectionLoadAddress(GOTSectionID) + (GOTIndex * GOTEntrySize);
+    }
+  }
+
+  assert(GOTIndex != -1 && "Unable to find requested GOT entry.");
+  return 0;
+}
+
+void RuntimeDyldELF::finalizeLoad(ObjSectionToIDMap &SectionMap) {
+  // If necessary, allocate the global offset table
+  if (MemMgr) {
+    // Allocate the GOT if necessary
+    size_t numGOTEntries = GOTEntries.size();
+    if (numGOTEntries != 0) {
+      // Allocate memory for the section
+      unsigned SectionID = Sections.size();
+      size_t TotalSize = numGOTEntries * getGOTEntrySize();
+      uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, getGOTEntrySize(),
+                                                  SectionID, ".got", false);
+      if (!Addr)
+        report_fatal_error("Unable to allocate memory for GOT!");
+
+      GOTs.push_back(std::make_pair(SectionID, GOTEntries));
+      Sections.push_back(SectionEntry(".got", Addr, TotalSize, 0));
+      // For now, initialize all GOT entries to zero.  We'll fill them in as
+      // needed when GOT-based relocations are applied.
+      memset(Addr, 0, TotalSize);
+    }
+  }
+  else {
+    report_fatal_error("Unable to allocate memory for GOT!");
+  }
+
+  // Look for and record the EH frame section.
+  ObjSectionToIDMap::iterator i, e;
+  for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) {
+    const SectionRef &Section = i->first;
+    StringRef Name;
+    Section.getName(Name);
+    if (Name == ".eh_frame") {
+      UnregisteredEHFrameSections.push_back(i->second);
+      break;
+    }
+  }
+}
+
 bool RuntimeDyldELF::isCompatibleFormat(const ObjectBuffer *Buffer) const {
   if (Buffer->getBufferSize() < strlen(ELF::ElfMagic))
     return false;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 794c7ec..3adf827 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -15,6 +15,7 @@
 #define LLVM_RUNTIME_DYLD_ELF_H
 
 #include "RuntimeDyldImpl.h"
+#include "llvm/ADT/DenseMap.h"
 
 using namespace llvm;
 
@@ -35,13 +36,15 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
                          uint64_t Offset,
                          uint64_t Value,
                          uint32_t Type,
-                         int64_t Addend);
+                         int64_t Addend,
+                         uint64_t SymOffset=0);
 
   void resolveX86_64Relocation(const SectionEntry &Section,
                                uint64_t Offset,
                                uint64_t Value,
                                uint32_t Type,
-                               int64_t Addend);
+                               int64_t  Addend,
+                               uint64_t SymOffset);
 
   void resolveX86Relocation(const SectionEntry &Section,
                             uint64_t Offset,
@@ -79,13 +82,55 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
                                 uint32_t Type,
                                 int64_t Addend);
 
+  unsigned getMaxStubSize() {
+    if (Arch == Triple::aarch64)
+      return 20; // movz; movk; movk; movk; br
+    if (Arch == Triple::arm || Arch == Triple::thumb)
+      return 8; // 32-bit instruction and 32-bit address
+    else if (Arch == Triple::mipsel || Arch == Triple::mips)
+      return 16;
+    else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le)
+      return 44;
+    else if (Arch == Triple::x86_64)
+      return 6; // 2-byte jmp instruction + 32-bit relative address
+    else if (Arch == Triple::systemz)
+      return 16;
+    else
+      return 0;
+  }
+
+  unsigned getStubAlignment() {
+    if (Arch == Triple::systemz)
+      return 8;
+    else
+      return 1;
+  }
+
   uint64_t findPPC64TOC() const;
   void findOPDEntrySection(ObjectImage &Obj,
                            ObjSectionToIDMap &LocalSections,
                            RelocationValueRef &Rel);
 
+  uint64_t findGOTEntry(uint64_t LoadAddr, uint64_t Offset);
+  size_t getGOTEntrySize();
+
+  virtual void updateGOTEntries(StringRef Name, uint64_t Addr);
+
+  // Relocation entries for symbols whose position-independant offset is
+  // updated in a global offset table.
+  typedef SmallVector<RelocationValueRef, 2> GOTRelocations;
+  GOTRelocations GOTEntries; // List of entries requiring finalization.
+  SmallVector<std::pair<SID, GOTRelocations>, 8> GOTs; // Allocated tables.
+
+  // When a module is loaded we save the SectionID of the EH frame section
+  // in a table until we receive a request to register all unregistered
+  // EH frame sections with the memory manager.
+  SmallVector<SID, 2> UnregisteredEHFrameSections;
+  SmallVector<SID, 2> RegisteredEHFrameSections;
+
 public:
-  RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
+  RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm)
+                                          {}
 
   virtual void resolveRelocation(const RelocationEntry &RE, uint64_t Value);
   virtual void processRelocationRef(unsigned SectionID,
@@ -96,7 +141,9 @@ public:
                                     StubMap &Stubs);
   virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const;
   virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
-  virtual StringRef getEHFrameSection();
+  virtual void registerEHFrames();
+  virtual void deregisterEHFrames();
+  virtual void finalizeLoad(ObjSectionToIDMap &SectionMap);
   virtual ~RuntimeDyldELF();
 };
 
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 14d945b..3014b30 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Host.h"
+#include "llvm/Support/Mutex.h"
 #include "llvm/Support/SwapByteOrder.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/system_error.h"
@@ -80,14 +81,18 @@ public:
   unsigned SectionID;
 
   /// Offset - offset into the section.
-  uintptr_t Offset;
+  uint64_t Offset;
 
   /// RelType - relocation type.
   uint32_t RelType;
 
   /// Addend - the relocation addend encoded in the instruction itself.  Also
   /// used to make a relocation section relative instead of symbol relative.
-  intptr_t Addend;
+  int64_t Addend;
+
+  /// SymOffset - Section offset of the relocation entry's symbol (used for GOT
+  /// lookup).
+  uint64_t SymOffset;
 
   /// True if this is a PCRel relocation (MachO specific).
   bool IsPCRel;
@@ -97,26 +102,39 @@ public:
 
   RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend)
     : SectionID(id), Offset(offset), RelType(type), Addend(addend),
-      IsPCRel(false), Size(0) {}
+      SymOffset(0), IsPCRel(false), Size(0) {}
+
+  RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend,
+                  uint64_t symoffset)
+    : SectionID(id), Offset(offset), RelType(type), Addend(addend),
+      SymOffset(symoffset), IsPCRel(false), Size(0) {}
 
   RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend,
                   bool IsPCRel, unsigned Size)
     : SectionID(id), Offset(offset), RelType(type), Addend(addend),
-      IsPCRel(IsPCRel), Size(Size) {}
+      SymOffset(0), IsPCRel(IsPCRel), Size(Size) {}
 };
 
 class RelocationValueRef {
 public:
   unsigned  SectionID;
-  intptr_t  Addend;
+  uint64_t  Offset;
+  int64_t   Addend;
   const char *SymbolName;
-  RelocationValueRef(): SectionID(0), Addend(0), SymbolName(0) {}
+  RelocationValueRef(): SectionID(0), Offset(0), Addend(0), SymbolName(0) {}
 
   inline bool operator==(const RelocationValueRef &Other) const {
-    return std::memcmp(this, &Other, sizeof(RelocationValueRef)) == 0;
+    return SectionID == Other.SectionID && Offset == Other.Offset &&
+           Addend == Other.Addend && SymbolName == Other.SymbolName;
   }
   inline bool operator <(const RelocationValueRef &Other) const {
-    return std::memcmp(this, &Other, sizeof(RelocationValueRef)) < 0;
+    if (SectionID != Other.SectionID)
+      return SectionID < Other.SectionID;
+    if (Offset != Other.Offset)
+      return Offset < Other.Offset;
+    if (Addend != Other.Addend)
+      return Addend < Other.Addend;
+    return SymbolName < Other.SymbolName;
   }
 };
 
@@ -130,6 +148,9 @@ protected:
   typedef SmallVector<SectionEntry, 64> SectionList;
   SectionList Sections;
 
+  typedef unsigned SID; // Type for SectionIDs
+  #define RTDYLD_INVALID_SECTION_ID ((SID)(-1)) 
+
   // Keep a map of sections from object file to the SectionID which
   // references it.
   typedef std::map<SectionRef, unsigned> ObjSectionToIDMap;
@@ -164,30 +185,22 @@ protected:
   typedef std::map<RelocationValueRef, uintptr_t> StubMap;
 
   Triple::ArchType Arch;
-
-  inline unsigned getMaxStubSize() {
-    if (Arch == Triple::aarch64)
-      return 20; // movz; movk; movk; movk; br
-    if (Arch == Triple::arm || Arch == Triple::thumb)
-      return 8; // 32-bit instruction and 32-bit address
-    else if (Arch == Triple::mipsel || Arch == Triple::mips)
-      return 16;
-    else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le)
-      return 44;
-    else if (Arch == Triple::x86_64)
-      return 8; // GOT
-    else if (Arch == Triple::systemz)
-      return 16;
-    else
-      return 0;
-  }
-
-  inline unsigned getStubAlignment() {
-    if (Arch == Triple::systemz)
-      return 8;
-    else
-      return 1;
-  }
+  bool IsTargetLittleEndian;
+
+  // This mutex prevents simultaneously loading objects from two different
+  // threads.  This keeps us from having to protect individual data structures
+  // and guarantees that section allocation requests to the memory manager
+  // won't be interleaved between modules.  It is also used in mapSectionAddress
+  // and resolveRelocations to protect write access to internal data structures.
+  //
+  // loadObject may be called on the same thread during the handling of of
+  // processRelocations, and that's OK.  The handling of the relocation lists
+  // is written in such a way as to work correctly if new elements are added to
+  // the end of the list while the list is being processed.
+  sys::Mutex lock;
+
+  virtual unsigned getMaxStubSize() = 0;
+  virtual unsigned getStubAlignment() = 0;
 
   bool HasError;
   std::string ErrorStr;
@@ -208,14 +221,14 @@ protected:
   }
 
   void writeInt16BE(uint8_t *Addr, uint16_t Value) {
-    if (sys::IsLittleEndianHost)
+    if (IsTargetLittleEndian)
       Value = sys::SwapByteOrder(Value);
     *Addr     = (Value >> 8) & 0xFF;
     *(Addr+1) = Value & 0xFF;
   }
 
   void writeInt32BE(uint8_t *Addr, uint32_t Value) {
-    if (sys::IsLittleEndianHost)
+    if (IsTargetLittleEndian)
       Value = sys::SwapByteOrder(Value);
     *Addr     = (Value >> 24) & 0xFF;
     *(Addr+1) = (Value >> 16) & 0xFF;
@@ -224,7 +237,7 @@ protected:
   }
 
   void writeInt64BE(uint8_t *Addr, uint64_t Value) {
-    if (sys::IsLittleEndianHost)
+    if (IsTargetLittleEndian)
       Value = sys::SwapByteOrder(Value);
     *Addr     = (Value >> 56) & 0xFF;
     *(Addr+1) = (Value >> 48) & 0xFF;
@@ -292,6 +305,11 @@ protected:
 
   /// \brief Resolve relocations to external symbols.
   void resolveExternalSymbols();
+
+  /// \brief Update GOT entries for external symbols.
+  // The base class does nothing.  ELF overrides this.
+  virtual void updateGOTEntries(StringRef Name, uint64_t Addr) {}
+
   virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
 public:
   RuntimeDyldImpl(RTDyldMemoryManager *mm) : MemMgr(mm), HasError(false) {}
@@ -303,18 +321,20 @@ public:
   void *getSymbolAddress(StringRef Name) {
     // FIXME: Just look up as a function for now. Overly simple of course.
     // Work in progress.
-    if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
+    SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);
+    if (pos == GlobalSymbolTable.end())
       return 0;
-    SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
+    SymbolLoc Loc = pos->second;
     return getSectionAddress(Loc.first) + Loc.second;
   }
 
   uint64_t getSymbolLoadAddress(StringRef Name) {
     // FIXME: Just look up as a function for now. Overly simple of course.
     // Work in progress.
-    if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
+    SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);
+    if (pos == GlobalSymbolTable.end())
       return 0;
-    SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
+    SymbolLoc Loc = pos->second;
     return getSectionLoadAddress(Loc.first) + Loc.second;
   }
 
@@ -335,7 +355,11 @@ public:
 
   virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const = 0;
 
-  virtual StringRef getEHFrameSection();
+  virtual void registerEHFrames();
+
+  virtual void deregisterEHFrames();
+
+  virtual void finalizeLoad(ObjSectionToIDMap &SectionMap) {}
 };
 
 } // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 0384b32..5b92867 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -55,35 +55,80 @@ static intptr_t computeDelta(SectionEntry *A, SectionEntry *B) {
   return ObjDistance - MemDistance;
 }
 
-StringRef RuntimeDyldMachO::getEHFrameSection() {
-  SectionEntry *Text = NULL;
-  SectionEntry *EHFrame = NULL;
-  SectionEntry *ExceptTab = NULL;
-  for (int i = 0, e = Sections.size(); i != e; ++i) {
-    if (Sections[i].Name == "__eh_frame")
-      EHFrame = &Sections[i];
-    else if (Sections[i].Name == "__text")
-      Text = &Sections[i];
-    else if (Sections[i].Name == "__gcc_except_tab")
-      ExceptTab = &Sections[i];
+void RuntimeDyldMachO::registerEHFrames() {
+
+  if (!MemMgr)
+    return;
+  for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) {
+    EHFrameRelatedSections &SectionInfo = UnregisteredEHFrameSections[i];
+    if (SectionInfo.EHFrameSID == RTDYLD_INVALID_SECTION_ID ||
+        SectionInfo.TextSID == RTDYLD_INVALID_SECTION_ID)
+      continue;
+    SectionEntry *Text = &Sections[SectionInfo.TextSID];
+    SectionEntry *EHFrame = &Sections[SectionInfo.EHFrameSID];
+    SectionEntry *ExceptTab = NULL;
+    if (SectionInfo.ExceptTabSID != RTDYLD_INVALID_SECTION_ID)
+      ExceptTab = &Sections[SectionInfo.ExceptTabSID];
+
+    intptr_t DeltaForText = computeDelta(Text, EHFrame);
+    intptr_t DeltaForEH = 0;
+    if (ExceptTab)
+      DeltaForEH = computeDelta(ExceptTab, EHFrame);
+
+    unsigned char *P = EHFrame->Address;
+    unsigned char *End = P + EHFrame->Size;
+    do  {
+      P = processFDE(P, DeltaForText, DeltaForEH);
+    } while(P != End);
+
+    MemMgr->registerEHFrames(EHFrame->Address,
+                             EHFrame->LoadAddress,
+                             EHFrame->Size);
   }
-  if (Text == NULL || EHFrame == NULL)
-    return StringRef();
-
-  intptr_t DeltaForText = computeDelta(Text, EHFrame);
-  intptr_t DeltaForEH = 0;
-  if (ExceptTab)
-    DeltaForEH = computeDelta(ExceptTab, EHFrame);
-
-  unsigned char *P = EHFrame->Address;
-  unsigned char *End = P + EHFrame->Size;
-  do  {
-    P = processFDE(P, DeltaForText, DeltaForEH);
-  } while(P != End);
+  UnregisteredEHFrameSections.clear();
+}
 
-  return StringRef((char*)EHFrame->Address, EHFrame->Size);
+void RuntimeDyldMachO::finalizeLoad(ObjSectionToIDMap &SectionMap) {
+  unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID;
+  unsigned TextSID = RTDYLD_INVALID_SECTION_ID;
+  unsigned ExceptTabSID = RTDYLD_INVALID_SECTION_ID;
+  ObjSectionToIDMap::iterator i, e;
+  for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) {
+    const SectionRef &Section = i->first;
+    StringRef Name;
+    Section.getName(Name);
+    if (Name == "__eh_frame")
+      EHFrameSID = i->second;
+    else if (Name == "__text")
+      TextSID = i->second;
+    else if (Name == "__gcc_except_tab")
+      ExceptTabSID = i->second;
+  }
+  UnregisteredEHFrameSections.push_back(EHFrameRelatedSections(EHFrameSID,
+                                                               TextSID,
+                                                               ExceptTabSID));
 }
 
+// The target location for the relocation is described by RE.SectionID and
+// RE.Offset.  RE.SectionID can be used to find the SectionEntry.  Each
+// SectionEntry has three members describing its location.
+// SectionEntry::Address is the address at which the section has been loaded
+// into memory in the current (host) process.  SectionEntry::LoadAddress is the
+// address that the section will have in the target process.
+// SectionEntry::ObjAddress is the address of the bits for this section in the
+// original emitted object image (also in the current address space).
+//
+// Relocations will be applied as if the section were loaded at
+// SectionEntry::LoadAddress, but they will be applied at an address based
+// on SectionEntry::Address.  SectionEntry::ObjAddress will be used to refer to
+// Target memory contents if they are required for value calculations.
+//
+// The Value parameter here is the load address of the symbol for the
+// relocation to be applied.  For relocations which refer to symbols in the
+// current object Value will be the LoadAddress of the section in which
+// the symbol resides (RE.Addend provides additional information about the
+// symbol location).  For external symbols, Value will be the address of the
+// symbol in the target address space.
 void RuntimeDyldMachO::resolveRelocation(const RelocationEntry &RE,
                                          uint64_t Value) {
   const SectionEntry &Section = Sections[RE.SectionID];
@@ -160,7 +205,7 @@ bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress,
   switch (Type) {
   default:
     llvm_unreachable("Invalid relocation type!");
-  case macho::RIT_Vanilla: {
+  case MachO::GENERIC_RELOC_VANILLA: {
     uint8_t *p = LocalAddress;
     uint64_t ValueToWrite = Value + Addend;
     for (unsigned i = 0; i < Size; ++i) {
@@ -169,9 +214,9 @@ bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress,
     }
     return false;
   }
-  case macho::RIT_Difference:
-  case macho::RIT_Generic_LocalDifference:
-  case macho::RIT_Generic_PreboundLazyPointer:
+  case MachO::GENERIC_RELOC_SECTDIFF:
+  case MachO::GENERIC_RELOC_LOCAL_SECTDIFF:
+  case MachO::GENERIC_RELOC_PB_LA_PTR:
     return Error("Relocation type not implemented yet!");
   }
 }
@@ -193,12 +238,12 @@ bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress,
   switch(Type) {
   default:
     llvm_unreachable("Invalid relocation type!");
-  case macho::RIT_X86_64_Signed1:
-  case macho::RIT_X86_64_Signed2:
-  case macho::RIT_X86_64_Signed4:
-  case macho::RIT_X86_64_Signed:
-  case macho::RIT_X86_64_Unsigned:
-  case macho::RIT_X86_64_Branch: {
+  case MachO::X86_64_RELOC_SIGNED_1:
+  case MachO::X86_64_RELOC_SIGNED_2:
+  case MachO::X86_64_RELOC_SIGNED_4:
+  case MachO::X86_64_RELOC_SIGNED:
+  case MachO::X86_64_RELOC_UNSIGNED:
+  case MachO::X86_64_RELOC_BRANCH: {
     Value += Addend;
     // Mask in the target value a byte at a time (we don't have an alignment
     // guarantee for the target address, so this is safest).
@@ -209,10 +254,10 @@ bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress,
     }
     return false;
   }
-  case macho::RIT_X86_64_GOTLoad:
-  case macho::RIT_X86_64_GOT:
-  case macho::RIT_X86_64_Subtractor:
-  case macho::RIT_X86_64_TLV:
+  case MachO::X86_64_RELOC_GOT_LOAD:
+  case MachO::X86_64_RELOC_GOT:
+  case MachO::X86_64_RELOC_SUBTRACTOR:
+  case MachO::X86_64_RELOC_TLV:
     return Error("Relocation type not implemented yet!");
   }
 }
@@ -237,7 +282,7 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
   switch(Type) {
   default:
     llvm_unreachable("Invalid relocation type!");
-  case macho::RIT_Vanilla: {
+  case MachO::ARM_RELOC_VANILLA: {
     // Mask in the target value a byte at a time (we don't have an alignment
     // guarantee for the target address, so this is safest).
     uint8_t *p = (uint8_t*)LocalAddress;
@@ -247,7 +292,7 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
     }
     break;
   }
-  case macho::RIT_ARM_Branch24Bit: {
+  case MachO::ARM_RELOC_BR24: {
     // Mask the value into the target address. We know instructions are
     // 32-bit aligned, so we can do it all at once.
     uint32_t *p = (uint32_t*)LocalAddress;
@@ -263,14 +308,14 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
     *p = (*p & ~0xffffff) | Value;
     break;
   }
-  case macho::RIT_ARM_ThumbBranch22Bit:
-  case macho::RIT_ARM_ThumbBranch32Bit:
-  case macho::RIT_ARM_Half:
-  case macho::RIT_ARM_HalfDifference:
-  case macho::RIT_Pair:
-  case macho::RIT_Difference:
-  case macho::RIT_ARM_LocalDifference:
-  case macho::RIT_ARM_PreboundLazyPointer:
+  case MachO::ARM_THUMB_RELOC_BR22:
+  case MachO::ARM_THUMB_32BIT_BRANCH:
+  case MachO::ARM_RELOC_HALF:
+  case MachO::ARM_RELOC_HALF_SECTDIFF:
+  case MachO::ARM_RELOC_PAIR:
+  case MachO::ARM_RELOC_SECTDIFF:
+  case MachO::ARM_RELOC_LOCAL_SECTDIFF:
+  case MachO::ARM_RELOC_PB_LA_PTR:
     return Error("Relocation type not implemented yet!");
   }
   return false;
@@ -284,9 +329,19 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
                                             StubMap &Stubs) {
   const ObjectFile *OF = Obj.getObjectFile();
   const MachOObjectFile *MachO = static_cast<const MachOObjectFile*>(OF);
-  macho::RelocationEntry RE = MachO->getRelocation(RelI.getRawDataRefImpl());
+  MachO::any_relocation_info RE= MachO->getRelocation(RelI.getRawDataRefImpl());
 
   uint32_t RelType = MachO->getAnyRelocationType(RE);
+
+  // FIXME: Properly handle scattered relocations.
+  //        For now, optimistically skip these: they can often be ignored, as
+  //        the static linker will already have applied the relocation, and it
+  //        only needs to be reapplied if symbols move relative to one another.
+  //        Note: This will fail horribly where the relocations *do* need to be
+  //        applied, but that was already the case.
+  if (MachO->isRelocationScattered(RE))
+    return;
+
   RelocationValueRef Value;
   SectionEntry &Section = Sections[SectionID];
 
@@ -329,7 +384,8 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
     Value.Addend = Addend - Addr;
   }
 
-  if (Arch == Triple::x86_64 && RelType == macho::RIT_X86_64_GOT) {
+  if (Arch == Triple::x86_64 && (RelType == MachO::X86_64_RELOC_GOT ||
+                                 RelType == MachO::X86_64_RELOC_GOT_LOAD)) {
     assert(IsPCRel);
     assert(Size == 2);
     StubMap::const_iterator i = Stubs.find(Value);
@@ -340,8 +396,7 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
       Stubs[Value] = Section.StubOffset;
       uint8_t *GOTEntry = Section.Address + Section.StubOffset;
       RelocationEntry RE(SectionID, Section.StubOffset,
-                         macho::RIT_X86_64_Unsigned, Value.Addend - 4, false,
-                         3);
+                         MachO::X86_64_RELOC_UNSIGNED, 0, false, 3);
       if (Value.SymbolName)
         addRelocationForSymbol(RE, Value.SymbolName);
       else
@@ -350,9 +405,9 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
       Addr = GOTEntry;
     }
     resolveRelocation(Section, Offset, (uint64_t)Addr,
-                      macho::RIT_X86_64_Unsigned, 4, true, 2);
+                      MachO::X86_64_RELOC_UNSIGNED, Value.Addend, true, 2);
   } else if (Arch == Triple::arm &&
-             (RelType & 0xf) == macho::RIT_ARM_Branch24Bit) {
+             (RelType & 0xf) == MachO::ARM_RELOC_BR24) {
     // This is an ARM branch relocation, need to use a stub function.
 
     //  Look up for existing stub.
@@ -367,7 +422,7 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
       uint8_t *StubTargetAddr = createStubFunction(Section.Address +
                                                    Section.StubOffset);
       RelocationEntry RE(SectionID, StubTargetAddr - Section.Address,
-                         macho::RIT_Vanilla, Value.Addend);
+                         MachO::GENERIC_RELOC_VANILLA, Value.Addend);
       if (Value.SymbolName)
         addRelocationForSymbol(RE, Value.SymbolName);
       else
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
index df8d3bb..bbf6aa9 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
@@ -54,6 +54,35 @@ class RuntimeDyldMachO : public RuntimeDyldImpl {
                          int64_t Addend,
                          bool isPCRel,
                          unsigned Size);
+
+  unsigned getMaxStubSize() {
+    if (Arch == Triple::arm || Arch == Triple::thumb)
+      return 8; // 32-bit instruction and 32-bit address
+    else if (Arch == Triple::x86_64)
+      return 8; // GOT entry
+    else
+      return 0;
+  }
+
+  unsigned getStubAlignment() {
+    return 1;
+  }
+
+  struct EHFrameRelatedSections {
+    EHFrameRelatedSections() : EHFrameSID(RTDYLD_INVALID_SECTION_ID),
+                               TextSID(RTDYLD_INVALID_SECTION_ID),
+                               ExceptTabSID(RTDYLD_INVALID_SECTION_ID) {}
+    EHFrameRelatedSections(SID EH, SID T, SID Ex) 
+      : EHFrameSID(EH), TextSID(T), ExceptTabSID(Ex) {}
+    SID EHFrameSID;
+    SID TextSID;
+    SID ExceptTabSID;
+  };
+
+  // When a module is loaded we save the SectionID of the EH frame section
+  // in a table until we receive a request to register all unregistered
+  // EH frame sections with the memory manager.
+  SmallVector<EHFrameRelatedSections, 2> UnregisteredEHFrameSections;
 public:
   RuntimeDyldMachO(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
 
@@ -65,7 +94,8 @@ public:
                                     const SymbolTableMap &Symbols,
                                     StubMap &Stubs);
   virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const;
-  virtual StringRef getEHFrameSection();
+  virtual void registerEHFrames();
+  virtual void finalizeLoad(ObjSectionToIDMap &SectionMap);
 };
 
 } // end namespace llvm
diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp
index 558d8b3..9b7d348 100644
--- a/lib/ExecutionEngine/TargetSelect.cpp
+++ b/lib/ExecutionEngine/TargetSelect.cpp
@@ -91,7 +91,7 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple,
   // FIXME: non-iOS ARM FastISel is broken with MCJIT.
   if (UseMCJIT &&
       TheTriple.getArch() == Triple::arm &&
-      TheTriple.getOS() != Triple::IOS &&
+      !TheTriple.isiOS() &&
       OptLevel == CodeGenOpt::None) {
     OptLevel = CodeGenOpt::Less;
   }
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index f275305..7decffd 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -74,6 +74,8 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   default:                         Out << "cc" << cc; break;
   case CallingConv::Fast:          Out << "fastcc"; break;
   case CallingConv::Cold:          Out << "coldcc"; break;
+  case CallingConv::WebKit_JS:     Out << "webkit_jscc"; break;
+  case CallingConv::AnyReg:        Out << "anyregcc"; break;
   case CallingConv::X86_StdCall:   Out << "x86_stdcallcc"; break;
   case CallingConv::X86_FastCall:  Out << "x86_fastcallcc"; break;
   case CallingConv::X86_ThisCall:  Out << "x86_thiscallcc"; break;
@@ -1394,9 +1396,6 @@ static void PrintLinkage(GlobalValue::LinkageTypes LT,
   case GlobalValue::InternalLinkage:      Out << "internal ";       break;
   case GlobalValue::LinkOnceAnyLinkage:   Out << "linkonce ";       break;
   case GlobalValue::LinkOnceODRLinkage:   Out << "linkonce_odr ";   break;
-  case GlobalValue::LinkOnceODRAutoHideLinkage:
-    Out << "linkonce_odr_auto_hide ";
-    break;
   case GlobalValue::WeakAnyLinkage:       Out << "weak ";           break;
   case GlobalValue::WeakODRLinkage:       Out << "weak_odr ";       break;
   case GlobalValue::CommonLinkage:        Out << "common ";         break;
@@ -1647,6 +1646,10 @@ void AssemblyWriter::printFunction(const Function *F) {
     Out << " align " << F->getAlignment();
   if (F->hasGC())
     Out << " gc \"" << F->getGC() << '"';
+  if (F->hasPrefixData()) {
+    Out << " prefix ";
+    writeOperand(F->getPrefixData(), true);
+  }
   if (F->isDeclaration()) {
     Out << '\n';
   } else {
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index 9da3f96..ea954ac 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -94,6 +94,7 @@ public:
 /// attribute enties, which are for target-dependent attributes.
 
 class EnumAttributeImpl : public AttributeImpl {
+  virtual void anchor();
   Attribute::AttrKind Kind;
 
 protected:
@@ -108,6 +109,7 @@ public:
 };
 
 class AlignAttributeImpl : public EnumAttributeImpl {
+  virtual void anchor();
   unsigned Align;
 
 public:
@@ -122,6 +124,7 @@ public:
 };
 
 class StringAttributeImpl : public AttributeImpl {
+  virtual void anchor();
   std::string Kind;
   std::string Val;
 
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index f466d16..0f2b7a0 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -196,6 +196,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     return "noreturn";
   if (hasAttribute(Attribute::NoUnwind))
     return "nounwind";
+  if (hasAttribute(Attribute::OptimizeNone))
+    return "optnone";
   if (hasAttribute(Attribute::OptimizeForSize))
     return "optsize";
   if (hasAttribute(Attribute::ReadNone))
@@ -284,7 +286,11 @@ bool Attribute::operator<(Attribute A) const {
 // AttributeImpl Definition
 //===----------------------------------------------------------------------===//
 
+// Pin the vtabels to this file.
 AttributeImpl::~AttributeImpl() {}
+void EnumAttributeImpl::anchor() {}
+void AlignAttributeImpl::anchor() {}
+void StringAttributeImpl::anchor() {}
 
 bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const {
   if (isStringAttribute()) return false;
@@ -381,6 +387,7 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
   case Attribute::Returned:        return 1ULL << 39;
   case Attribute::Cold:            return 1ULL << 40;
   case Attribute::Builtin:         return 1ULL << 41;
+  case Attribute::OptimizeNone:    return 1ULL << 42;
   }
   llvm_unreachable("Unsupported attribute type");
 }
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index a4f5289..d12bf7b 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/AutoUpgrade.h"
+#include "llvm/DebugInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -88,6 +89,20 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
     }
     break;
   }
+  case 'o':
+    // We only need to change the name to match the mangling including the
+    // address space.
+    if (F->arg_size() == 2 && Name.startswith("objectsize.")) {
+      Type *Tys[2] = { F->getReturnType(), F->arg_begin()->getType() };
+      if (F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys)) {
+        F->setName(Name + ".old");
+        NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                          Intrinsic::objectsize, Tys);
+        return true;
+      }
+    }
+    break;
+
   case 'x': {
     if (Name.startswith("x86.sse2.pcmpeq.") ||
         Name.startswith("x86.sse2.pcmpgt.") ||
@@ -97,6 +112,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name == "x86.avx.movnt.dq.256" ||
         Name == "x86.avx.movnt.pd.256" ||
         Name == "x86.avx.movnt.ps.256" ||
+        Name == "x86.sse42.crc32.64.8" ||
         (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) {
       NewFn = 0;
       return true;
@@ -257,6 +273,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Function *VPCOM = Intrinsic::getDeclaration(F->getParent(), intID);
       Rep = Builder.CreateCall3(VPCOM, CI->getArgOperand(0),
                                 CI->getArgOperand(1), Builder.getInt8(Imm));
+    } else if (Name == "llvm.x86.sse42.crc32.64.8") {
+      Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
+                                               Intrinsic::x86_sse42_crc32_32_8);
+      Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));
+      Rep = Builder.CreateCall2(CRC32, Trunc0, CI->getArgOperand(1));
+      Rep = Builder.CreateZExt(Rep, CI->getType(), "");
     } else {
       bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
       if (Name == "llvm.x86.avx.vpermil.pd.256")
@@ -317,6 +339,14 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     CI->eraseFromParent();
     return;
 
+  case Intrinsic::objectsize:
+    CI->replaceAllUsesWith(Builder.CreateCall2(NewFn,
+                                               CI->getArgOperand(0),
+                                               CI->getArgOperand(1),
+                                               Name));
+    CI->eraseFromParent();
+    return;
+
   case Intrinsic::arm_neon_vclz: {
     // Change name from llvm.arm.neon.vclz.* to llvm.ctlz.*
     CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0),
@@ -391,3 +421,81 @@ void llvm::UpgradeCallsToIntrinsic(Function* F) {
   }
 }
 
+void llvm::UpgradeInstWithTBAATag(Instruction *I) {
+  MDNode *MD = I->getMetadata(LLVMContext::MD_tbaa);
+  assert(MD && "UpgradeInstWithTBAATag should have a TBAA tag");
+  // Check if the tag uses struct-path aware TBAA format.
+  if (isa<MDNode>(MD->getOperand(0)) && MD->getNumOperands() >= 3)
+    return;
+
+  if (MD->getNumOperands() == 3) {
+    Value *Elts[] = {
+      MD->getOperand(0),
+      MD->getOperand(1)
+    };
+    MDNode *ScalarType = MDNode::get(I->getContext(), Elts);
+    // Create a MDNode <ScalarType, ScalarType, offset 0, const>
+    Value *Elts2[] = {
+      ScalarType, ScalarType,
+      Constant::getNullValue(Type::getInt64Ty(I->getContext())),
+      MD->getOperand(2)
+    };
+    I->setMetadata(LLVMContext::MD_tbaa, MDNode::get(I->getContext(), Elts2));
+  } else {
+    // Create a MDNode <MD, MD, offset 0>
+    Value *Elts[] = {MD, MD,
+      Constant::getNullValue(Type::getInt64Ty(I->getContext()))};
+    I->setMetadata(LLVMContext::MD_tbaa, MDNode::get(I->getContext(), Elts));
+  }
+}
+
+Instruction *llvm::UpgradeBitCastInst(unsigned Opc, Value *V, Type *DestTy,
+                                      Instruction *&Temp) {
+  if (Opc != Instruction::BitCast)
+    return 0;
+
+  Temp = 0;
+  Type *SrcTy = V->getType();
+  if (SrcTy->isPtrOrPtrVectorTy() && DestTy->isPtrOrPtrVectorTy() &&
+      SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace()) {
+    LLVMContext &Context = V->getContext();
+
+    // We have no information about target data layout, so we assume that
+    // the maximum pointer size is 64bit.
+    Type *MidTy = Type::getInt64Ty(Context);
+    Temp = CastInst::Create(Instruction::PtrToInt, V, MidTy);
+
+    return CastInst::Create(Instruction::IntToPtr, Temp, DestTy);
+  }
+
+  return 0;
+}
+
+Value *llvm::UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy) {
+  if (Opc != Instruction::BitCast)
+    return 0;
+
+  Type *SrcTy = C->getType();
+  if (SrcTy->isPtrOrPtrVectorTy() && DestTy->isPtrOrPtrVectorTy() &&
+      SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace()) {
+    LLVMContext &Context = C->getContext();
+
+    // We have no information about target data layout, so we assume that
+    // the maximum pointer size is 64bit.
+    Type *MidTy = Type::getInt64Ty(Context);
+
+    return ConstantExpr::getIntToPtr(ConstantExpr::getPtrToInt(C, MidTy),
+                                     DestTy);
+  }
+
+  return 0;
+}
+
+/// Check the debug info version number, if it is out-dated, drop the debug
+/// info. Return true if module is modified.
+bool llvm::UpgradeDebugInfo(Module &M) {
+  if (getDebugMetadataVersionFromModule(M) == DEBUG_METADATA_VERSION)
+    return false;
+
+  return StripDebugInfo(M);
+}
diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt
index c2a4ee3..581946c 100644
--- a/lib/IR/CMakeLists.txt
+++ b/lib/IR/CMakeLists.txt
@@ -6,10 +6,10 @@ add_llvm_library(LLVMCore
   ConstantFold.cpp
   Constants.cpp
   Core.cpp
+  DIBuilder.cpp
   DataLayout.cpp
   DebugInfo.cpp
   DebugLoc.cpp
-  DIBuilder.cpp
   Dominators.cpp
   Function.cpp
   GCOV.cpp
@@ -23,6 +23,7 @@ add_llvm_library(LLVMCore
   LLVMContext.cpp
   LLVMContextImpl.cpp
   LeakDetector.cpp
+  LegacyPassManager.cpp
   Metadata.cpp
   Module.cpp
   Pass.cpp
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 8c5a983..f5e225c 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -689,6 +689,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
   }
   case Instruction::BitCast:
     return FoldBitCast(V, DestTy);
+  case Instruction::AddrSpaceCast:
+    return 0;
   }
 }
 
@@ -1897,6 +1899,37 @@ static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
   return true;
 }
 
+/// \brief Test whether a given ConstantInt is in-range for a SequentialType.
+static bool isIndexInRangeOfSequentialType(const SequentialType *STy,
+                                           const ConstantInt *CI) {
+  if (const PointerType *PTy = dyn_cast<PointerType>(STy))
+    // Only handle pointers to sized types, not pointers to functions.
+    return PTy->getElementType()->isSized();
+
+  uint64_t NumElements = 0;
+  // Determine the number of elements in our sequential type.
+  if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+    NumElements = ATy->getNumElements();
+  else if (const VectorType *VTy = dyn_cast<VectorType>(STy))
+    NumElements = VTy->getNumElements();
+
+  assert((isa<ArrayType>(STy) || NumElements > 0) &&
+         "didn't expect non-array type to have zero elements!");
+
+  // We cannot bounds check the index if it doesn't fit in an int64_t.
+  if (CI->getValue().getActiveBits() > 64)
+    return false;
+
+  // A negative index or an index past the end of our sequential type is
+  // considered out-of-range.
+  int64_t IndexVal = CI->getSExtValue();
+  if (IndexVal < 0 || (NumElements > 0 && (uint64_t)IndexVal >= NumElements))
+    return false;
+
+  // Otherwise, it is in-range.
+  return true;
+}
+
 template<typename IndexTy>
 static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
                                                bool inBounds,
@@ -1940,7 +1973,32 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
            I != E; ++I)
         LastTy = *I;
 
-      if ((LastTy && isa<SequentialType>(LastTy)) || Idx0->isNullValue()) {
+      // We cannot combine indices if doing so would take us outside of an
+      // array or vector.  Doing otherwise could trick us if we evaluated such a
+      // GEP as part of a load.
+      //
+      // e.g. Consider if the original GEP was:
+      // i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c,
+      //                    i32 0, i32 0, i64 0)
+      //
+      // If we then tried to offset it by '8' to get to the third element,
+      // an i8, we should *not* get:
+      // i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c,
+      //                    i32 0, i32 0, i64 8)
+      //
+      // This GEP tries to index array element '8  which runs out-of-bounds.
+      // Subsequent evaluation would get confused and produce erroneous results.
+      //
+      // The following prohibits such a GEP from being formed by checking to see
+      // if the index is in-range with respect to an array or vector.
+      bool PerformFold = false;
+      if (Idx0->isNullValue())
+        PerformFold = true;
+      else if (SequentialType *STy = dyn_cast_or_null<SequentialType>(LastTy))
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx0))
+          PerformFold = isIndexInRangeOfSequentialType(STy, CI);
+
+      if (PerformFold) {
         SmallVector<Value*, 16> NewIndices;
         NewIndices.reserve(Idxs.size() + CE->getNumOperands());
         for (unsigned i = 1, e = CE->getNumOperands()-1; i != e; ++i)
@@ -2000,8 +2058,8 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
   }
 
   // Check to see if any array indices are not within the corresponding
-  // notional array bounds. If so, try to determine if they can be factored
-  // out into preceding dimensions.
+  // notional array or vector bounds. If so, try to determine if they can be
+  // factored out into preceding dimensions.
   bool Unknown = false;
   SmallVector<Constant *, 8> NewIdxs;
   Type *Ty = C->getType();
@@ -2009,16 +2067,20 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
   for (unsigned i = 0, e = Idxs.size(); i != e;
        Prev = Ty, Ty = cast<CompositeType>(Ty)->getTypeAtIndex(Idxs[i]), ++i) {
     if (ConstantInt *CI = dyn_cast<ConstantInt>(Idxs[i])) {
-      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty))
-        if (ATy->getNumElements() <= INT64_MAX &&
-            ATy->getNumElements() != 0 &&
-            CI->getSExtValue() >= (int64_t)ATy->getNumElements()) {
+      if (isa<ArrayType>(Ty) || isa<VectorType>(Ty))
+        if (CI->getSExtValue() > 0 &&
+            !isIndexInRangeOfSequentialType(cast<SequentialType>(Ty), CI)) {
           if (isa<SequentialType>(Prev)) {
             // It's out of range, but we can factor it into the prior
             // dimension.
             NewIdxs.resize(Idxs.size());
-            ConstantInt *Factor = ConstantInt::get(CI->getType(),
-                                                   ATy->getNumElements());
+            uint64_t NumElements = 0;
+            if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty))
+              NumElements = ATy->getNumElements();
+            else
+              NumElements = cast<VectorType>(Ty)->getNumElements();
+
+            ConstantInt *Factor = ConstantInt::get(CI->getType(), NumElements);
             NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);
 
             Constant *PrevIdx = cast<Constant>(Idxs[i-1]);
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index 9067b34..690ac59 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -1126,6 +1126,7 @@ getWithOperands(ArrayRef<Constant*> Ops, Type *Ty) const {
   case Instruction::PtrToInt:
   case Instruction::IntToPtr:
   case Instruction::BitCast:
+  case Instruction::AddrSpaceCast:
     return ConstantExpr::getCast(getOpcode(), Ops[0], Ty);
   case Instruction::Select:
     return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
@@ -1461,6 +1462,7 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) {
   case Instruction::PtrToInt: return getPtrToInt(C, Ty);
   case Instruction::IntToPtr: return getIntToPtr(C, Ty);
   case Instruction::BitCast:  return getBitCast(C, Ty);
+  case Instruction::AddrSpaceCast:  return getAddrSpaceCast(C, Ty);
   }
 }
 
@@ -1489,10 +1491,26 @@ Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) {
 
   if (Ty->isIntOrIntVectorTy())
     return getPtrToInt(S, Ty);
+
+  unsigned SrcAS = S->getType()->getPointerAddressSpace();
+  if (Ty->isPtrOrPtrVectorTy() && SrcAS != Ty->getPointerAddressSpace())
+    return getAddrSpaceCast(S, Ty);
+
   return getBitCast(S, Ty);
 }
 
-Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty, 
+Constant *ConstantExpr::getPointerBitCastOrAddrSpaceCast(Constant *S,
+                                                         Type *Ty) {
+  assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast");
+  assert(Ty->isPtrOrPtrVectorTy() && "Invalid cast");
+
+  if (S->getType()->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+    return getAddrSpaceCast(S, Ty);
+
+  return getBitCast(S, Ty);
+}
+
+Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty,
                                        bool isSigned) {
   assert(C->getType()->isIntOrIntVectorTy() &&
          Ty->isIntOrIntVectorTy() && "Invalid cast");
@@ -1662,6 +1680,13 @@ Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy) {
   return getFoldedCast(Instruction::BitCast, C, DstTy);
 }
 
+Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy) {
+  assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) &&
+         "Invalid constantexpr addrspacecast!");
+
+  return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy);
+}
+
 Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
                             unsigned Flags) {
   // Check the operands for consistency first.
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 66610bd..c70f459 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -97,7 +97,7 @@ LLVMModuleRef LLVMModuleCreateWithName(const char *ModuleID) {
   return wrap(new Module(ModuleID, getGlobalContext()));
 }
 
-LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID, 
+LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID,
                                                 LLVMContextRef C) {
   return wrap(new Module(ModuleID, *unwrap(C)));
 }
@@ -147,6 +147,16 @@ LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
   return false;
 }
 
+char *LLVMPrintModuleToString(LLVMModuleRef M) {
+  std::string buf;
+  raw_string_ostream os(buf);
+
+  unwrap(M)->print(os, NULL);
+  os.flush();
+
+  return strdup(buf.c_str());
+}
+
 /*--.. Operations on inline assembler ......................................--*/
 void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm) {
   unwrap(M)->setModuleInlineAsm(StringRef(Asm));
@@ -210,6 +220,20 @@ LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty) {
   return wrap(&unwrap(Ty)->getContext());
 }
 
+void LLVMDumpType(LLVMTypeRef Ty) {
+  return unwrap(Ty)->dump();
+}
+
+char *LLVMPrintTypeToString(LLVMTypeRef Ty) {
+  std::string buf;
+  raw_string_ostream os(buf);
+
+  unwrap(Ty)->print(os);
+  os.flush();
+
+  return strdup(buf.c_str());
+}
+
 /*--.. Operations on integer types .........................................--*/
 
 LLVMTypeRef LLVMInt1TypeInContext(LLVMContextRef C)  {
@@ -450,6 +474,16 @@ void LLVMDumpValue(LLVMValueRef Val) {
   unwrap(Val)->dump();
 }
 
+char* LLVMPrintValueToString(LLVMValueRef Val) {
+  std::string buf;
+  raw_string_ostream os(buf);
+
+  unwrap(Val)->print(os);
+  os.flush();
+
+  return strdup(buf.c_str());
+}
+
 void LLVMReplaceAllUsesWith(LLVMValueRef OldVal, LLVMValueRef NewVal) {
   unwrap(OldVal)->replaceAllUsesWith(unwrap(NewVal));
 }
@@ -681,7 +715,7 @@ LLVMValueRef LLVMConstStringInContext(LLVMContextRef C, const char *Str,
   return wrap(ConstantDataArray::getString(*unwrap(C), StringRef(Str, Length),
                                            DontNullTerminate == 0));
 }
-LLVMValueRef LLVMConstStructInContext(LLVMContextRef C, 
+LLVMValueRef LLVMConstStructInContext(LLVMContextRef C,
                                       LLVMValueRef *ConstantVals,
                                       unsigned Count, LLVMBool Packed) {
   Constant **Elements = unwrap<Constant>(ConstantVals, Count);
@@ -999,6 +1033,12 @@ LLVMValueRef LLVMConstBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
                                        unwrap(ToType)));
 }
 
+LLVMValueRef LLVMConstAddrSpaceCast(LLVMValueRef ConstantVal,
+                                    LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getAddrSpaceCast(unwrap<Constant>(ConstantVal),
+                                             unwrap(ToType)));
+}
+
 LLVMValueRef LLVMConstZExtOrBitCast(LLVMValueRef ConstantVal,
                                     LLVMTypeRef ToType) {
   return wrap(ConstantExpr::getZExtOrBitCast(unwrap<Constant>(ConstantVal),
@@ -1110,8 +1150,6 @@ LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) {
     return LLVMLinkOnceAnyLinkage;
   case GlobalValue::LinkOnceODRLinkage:
     return LLVMLinkOnceODRLinkage;
-  case GlobalValue::LinkOnceODRAutoHideLinkage:
-    return LLVMLinkOnceODRAutoHideLinkage;
   case GlobalValue::WeakAnyLinkage:
     return LLVMWeakAnyLinkage;
   case GlobalValue::WeakODRLinkage:
@@ -1156,7 +1194,8 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
     GV->setLinkage(GlobalValue::LinkOnceODRLinkage);
     break;
   case LLVMLinkOnceODRAutoHideLinkage:
-    GV->setLinkage(GlobalValue::LinkOnceODRAutoHideLinkage);
+    DEBUG(errs() << "LLVMSetLinkage(): LLVMLinkOnceODRAutoHideLinkage is no "
+                    "longer supported.");
     break;
   case LLVMWeakAnyLinkage:
     GV->setLinkage(GlobalValue::WeakAnyLinkage);
@@ -1216,12 +1255,30 @@ void LLVMSetVisibility(LLVMValueRef Global, LLVMVisibility Viz) {
     ->setVisibility(static_cast<GlobalValue::VisibilityTypes>(Viz));
 }
 
-unsigned LLVMGetAlignment(LLVMValueRef Global) {
-  return unwrap<GlobalValue>(Global)->getAlignment();
+/*--.. Operations on global variables, load and store instructions .........--*/
+
+unsigned LLVMGetAlignment(LLVMValueRef V) {
+  Value *P = unwrap<Value>(V);
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(P))
+    return GV->getAlignment();
+  if (LoadInst *LI = dyn_cast<LoadInst>(P))
+    return LI->getAlignment();
+  if (StoreInst *SI = dyn_cast<StoreInst>(P))
+    return SI->getAlignment();
+
+  llvm_unreachable("only GlobalValue, LoadInst and StoreInst have alignment");
 }
 
-void LLVMSetAlignment(LLVMValueRef Global, unsigned Bytes) {
-  unwrap<GlobalValue>(Global)->setAlignment(Bytes);
+void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) {
+  Value *P = unwrap<Value>(V);
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(P))
+    GV->setAlignment(Bytes);
+  else if (LoadInst *LI = dyn_cast<LoadInst>(P))
+    LI->setAlignment(Bytes);
+  else if (StoreInst *SI = dyn_cast<StoreInst>(P))
+    SI->setAlignment(Bytes);
+  else
+    llvm_unreachable("only GlobalValue, LoadInst and StoreInst have alignment");
 }
 
 /*--.. Operations on global variables ......................................--*/
@@ -1553,7 +1610,7 @@ LLVMAttribute LLVMGetAttribute(LLVMValueRef Arg) {
   return (LLVMAttribute)A->getParent()->getAttributes().
     Raw(A->getArgNo()+1);
 }
-  
+
 
 void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
   Argument *A = unwrap<Argument>(Arg);
@@ -1745,7 +1802,7 @@ void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) {
   llvm_unreachable("LLVMSetInstructionCallConv applies only to call and invoke!");
 }
 
-void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index, 
+void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index,
                            LLVMAttribute PA) {
   CallSite Call = CallSite(unwrap<Instruction>(Instr));
   AttrBuilder B(PA);
@@ -1755,7 +1812,7 @@ void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index,
                                                          index, B)));
 }
 
-void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index, 
+void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index,
                               LLVMAttribute PA) {
   CallSite Call = CallSite(unwrap<Instruction>(Instr));
   AttrBuilder B(PA);
@@ -1765,7 +1822,7 @@ void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index,
                                                            index, B)));
 }
 
-void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index, 
+void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
                                 unsigned align) {
   CallSite Call = CallSite(unwrap<Instruction>(Instr));
   AttrBuilder B;
@@ -2119,8 +2176,8 @@ LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
   Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
   Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
   AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
-  Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), 
-                                               ITy, unwrap(Ty), AllocSize, 
+  Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
+                                               ITy, unwrap(Ty), AllocSize,
                                                0, 0, "");
   return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
 }
@@ -2130,8 +2187,8 @@ LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
   Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
   Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
   AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
-  Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), 
-                                               ITy, unwrap(Ty), AllocSize, 
+  Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
+                                               ITy, unwrap(Ty), AllocSize,
                                                unwrap(Val), 0, "");
   return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
 }
@@ -2157,7 +2214,7 @@ LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal,
   return wrap(unwrap(B)->CreateLoad(unwrap(PointerVal), Name));
 }
 
-LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val, 
+LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val,
                             LLVMValueRef PointerVal) {
   return wrap(unwrap(B)->CreateStore(unwrap(Val), unwrap(PointerVal)));
 }
@@ -2267,6 +2324,11 @@ LLVMValueRef LLVMBuildBitCast(LLVMBuilderRef B, LLVMValueRef Val,
   return wrap(unwrap(B)->CreateBitCast(unwrap(Val), unwrap(DestTy), Name));
 }
 
+LLVMValueRef LLVMBuildAddrSpaceCast(LLVMBuilderRef B, LLVMValueRef Val,
+                                    LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateAddrSpaceCast(unwrap(Val), unwrap(DestTy), Name));
+}
+
 LLVMValueRef LLVMBuildZExtOrBitCast(LLVMBuilderRef B, LLVMValueRef Val,
                                     LLVMTypeRef DestTy, const char *Name) {
   return wrap(unwrap(B)->CreateZExtOrBitCast(unwrap(Val), unwrap(DestTy),
@@ -2396,9 +2458,9 @@ LLVMValueRef LLVMBuildPtrDiff(LLVMBuilderRef B, LLVMValueRef LHS,
   return wrap(unwrap(B)->CreatePtrDiff(unwrap(LHS), unwrap(RHS), Name));
 }
 
-LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op, 
-                               LLVMValueRef PTR, LLVMValueRef Val, 
-                               LLVMAtomicOrdering ordering, 
+LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
+                               LLVMValueRef PTR, LLVMValueRef Val,
+                               LLVMAtomicOrdering ordering,
                                LLVMBool singleThread) {
   AtomicRMWInst::BinOp intop;
   switch (op) {
@@ -2421,14 +2483,14 @@ LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
     case LLVMAtomicOrderingMonotonic: intordering = Monotonic; break;
     case LLVMAtomicOrderingAcquire: intordering = Acquire; break;
     case LLVMAtomicOrderingRelease: intordering = Release; break;
-    case LLVMAtomicOrderingAcquireRelease: 
-      intordering = AcquireRelease; 
+    case LLVMAtomicOrderingAcquireRelease:
+      intordering = AcquireRelease;
       break;
-    case LLVMAtomicOrderingSequentiallyConsistent: 
-      intordering = SequentiallyConsistent; 
+    case LLVMAtomicOrderingSequentiallyConsistent:
+      intordering = SequentiallyConsistent;
       break;
   }
-  return wrap(unwrap(B)->CreateAtomicRMW(intop, unwrap(PTR), unwrap(Val), 
+  return wrap(unwrap(B)->CreateAtomicRMW(intop, unwrap(PTR), unwrap(Val),
     intordering, singleThread ? SingleThread : CrossThread));
 }
 
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index 3005f77..c4a9f41 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -30,17 +30,24 @@ static Constant *GetTagConstant(LLVMContext &VMContext, unsigned Tag) {
 }
 
 DIBuilder::DIBuilder(Module &m)
-  : M(m), VMContext(M.getContext()), TempEnumTypes(0),
-    TempRetainTypes(0), TempSubprograms(0), TempGVs(0), DeclareFn(0),
-    ValueFn(0)
-{}
+    : M(m), VMContext(M.getContext()), TempEnumTypes(0), TempRetainTypes(0),
+      TempSubprograms(0), TempGVs(0), DeclareFn(0), ValueFn(0) {}
 
 /// finalize - Construct any deferred debug info descriptors.
 void DIBuilder::finalize() {
   DIArray Enums = getOrCreateArray(AllEnumTypes);
   DIType(TempEnumTypes).replaceAllUsesWith(Enums);
 
-  DIArray RetainTypes = getOrCreateArray(AllRetainTypes);
+  SmallVector<Value *, 16> RetainValues;
+  // Declarations and definitions of the same type may be retained. Some
+  // clients RAUW these pairs, leaving duplicates in the retained types
+  // list. Use a set to remove the duplicates while we transform the
+  // TrackingVHs back into Values.
+  SmallPtrSet<Value *, 16> RetainSet;
+  for (unsigned I = 0, E = AllRetainTypes.size(); I < E; I++)
+    if (RetainSet.insert(AllRetainTypes[I]))
+      RetainValues.push_back(AllRetainTypes[I]);
+  DIArray RetainTypes = getOrCreateArray(RetainValues);
   DIType(TempRetainTypes).replaceAllUsesWith(RetainTypes);
 
   DIArray SPs = getOrCreateArray(AllSubprograms);
@@ -79,7 +86,7 @@ static MDNode *createFilePathPair(LLVMContext &VMContext, StringRef Filename,
   assert(!Filename.empty() && "Unable to create file without name");
   Value *Pair[] = {
     MDString::get(VMContext, Filename),
-    MDString::get(VMContext, Directory),
+    MDString::get(VMContext, Directory)
   };
   return MDNode::get(VMContext, Pair);
 }
@@ -274,7 +281,7 @@ DIDerivedType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) {
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    FromTy
+    FromTy.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
@@ -294,7 +301,7 @@ DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits,
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    PointeeTy
+    PointeeTy.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
@@ -308,12 +315,12 @@ DIDerivedType DIBuilder::createMemberPointerType(DIType PointeeTy,
     NULL, // Unused
     NULL,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    PointeeTy,
-    Base
+    PointeeTy.getRef(),
+    Base.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
@@ -333,7 +340,7 @@ DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) {
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    RTy
+    RTy.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
@@ -346,14 +353,14 @@ DIDerivedType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File,
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_typedef),
     File.getFileNode(),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    Ty
+    Ty.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
@@ -366,56 +373,59 @@ DIDerivedType DIBuilder::createFriend(DIType Ty, DIType FriendTy) {
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_friend),
     NULL,
-    Ty,
+    Ty.getRef(),
     NULL, // Name
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    FriendTy
+    FriendTy.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 /// createInheritance - Create debugging information entry to establish
 /// inheritance relationship between two types.
-DIDerivedType DIBuilder::createInheritance(
-    DIType Ty, DIType BaseTy, uint64_t BaseOffset, unsigned Flags) {
+DIDerivedType DIBuilder::createInheritance(DIType Ty, DIType BaseTy,
+                                           uint64_t BaseOffset,
+                                           unsigned Flags) {
   assert(Ty.isType() && "Unable to create inheritance");
   // TAG_inheritance is encoded in DIDerivedType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_inheritance),
     NULL,
-    Ty,
+    Ty.getRef(),
     NULL, // Name
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), BaseOffset),
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    BaseTy
+    BaseTy.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 /// createMemberType - Create debugging information entry for a member.
-DIDerivedType DIBuilder::createMemberType(
-    DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber,
-    uint64_t SizeInBits, uint64_t AlignInBits, uint64_t OffsetInBits,
-    unsigned Flags, DIType Ty) {
+DIDerivedType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name,
+                                          DIFile File, unsigned LineNumber,
+                                          uint64_t SizeInBits,
+                                          uint64_t AlignInBits,
+                                          uint64_t OffsetInBits, unsigned Flags,
+                                          DIType Ty) {
   // TAG_member is encoded in DIDerivedType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_member),
     File.getFileNode(),
-    getNonCompileUnitScope(Scope),
+    DIScope(getNonCompileUnitScope(Scope)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits),
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    Ty
+    Ty.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
@@ -432,14 +442,14 @@ DIBuilder::createStaticMemberType(DIDescriptor Scope, StringRef Name,
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_member),
     File.getFileNode(),
-    getNonCompileUnitScope(Scope),
+    DIScope(getNonCompileUnitScope(Scope)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0/*SizeInBits*/),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0/*AlignInBits*/),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0/*OffsetInBits*/),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    Ty,
+    Ty.getRef(),
     Val
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
@@ -448,13 +458,11 @@ DIBuilder::createStaticMemberType(DIDescriptor Scope, StringRef Name,
 /// createObjCIVar - Create debugging information entry for Objective-C
 /// instance variable.
 DIDerivedType
-DIBuilder::createObjCIVar(StringRef Name,
-                          DIFile File, unsigned LineNumber,
+DIBuilder::createObjCIVar(StringRef Name, DIFile File, unsigned LineNumber,
                           uint64_t SizeInBits, uint64_t AlignInBits,
-                          uint64_t OffsetInBits, unsigned Flags,
-                          DIType Ty, StringRef PropertyName,
-                          StringRef GetterName, StringRef SetterName,
-                          unsigned PropertyAttributes) {
+                          uint64_t OffsetInBits, unsigned Flags, DIType Ty,
+                          StringRef PropertyName, StringRef GetterName,
+                          StringRef SetterName, unsigned PropertyAttributes) {
   // TAG_member is encoded in DIDerivedType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_member),
@@ -477,12 +485,12 @@ DIBuilder::createObjCIVar(StringRef Name,
 
 /// createObjCIVar - Create debugging information entry for Objective-C
 /// instance variable.
-DIDerivedType
-DIBuilder::createObjCIVar(StringRef Name,
-                          DIFile File, unsigned LineNumber,
-                          uint64_t SizeInBits, uint64_t AlignInBits,
-                          uint64_t OffsetInBits, unsigned Flags,
-                          DIType Ty, MDNode *PropertyNode) {
+DIDerivedType DIBuilder::createObjCIVar(StringRef Name, DIFile File,
+                                        unsigned LineNumber,
+                                        uint64_t SizeInBits,
+                                        uint64_t AlignInBits,
+                                        uint64_t OffsetInBits, unsigned Flags,
+                                        DIType Ty, MDNode *PropertyNode) {
   // TAG_member is encoded in DIDerivedType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_member),
@@ -502,12 +510,10 @@ DIBuilder::createObjCIVar(StringRef Name,
 
 /// createObjCProperty - Create debugging information entry for Objective-C
 /// property.
-DIObjCProperty DIBuilder::createObjCProperty(StringRef Name,
-                                             DIFile File, unsigned LineNumber,
-                                             StringRef GetterName,
-                                             StringRef SetterName,
-                                             unsigned PropertyAttributes,
-                                             DIType Ty) {
+DIObjCProperty
+DIBuilder::createObjCProperty(StringRef Name, DIFile File, unsigned LineNumber,
+                              StringRef GetterName, StringRef SetterName,
+                              unsigned PropertyAttributes, DIType Ty) {
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_APPLE_property),
     MDString::get(VMContext, Name),
@@ -529,9 +535,9 @@ DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name,
                                        unsigned ColumnNo) {
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_template_type_parameter),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
-    Ty,
+    Ty.getRef(),
     File,
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
     ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo)
@@ -547,9 +553,9 @@ DIBuilder::createTemplateValueParameter(unsigned Tag, DIDescriptor Context,
                                         unsigned ColumnNo) {
   Value *Elts[] = {
     GetTagConstant(VMContext, Tag),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
-    Ty,
+    Ty.getRef(),
     Val,
     File,
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
@@ -598,30 +604,34 @@ DICompositeType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
                                            uint64_t OffsetInBits,
                                            unsigned Flags, DIType DerivedFrom,
                                            DIArray Elements,
-                                           MDNode *VTableHolder,
-                                           MDNode *TemplateParams) {
+                                           DIType VTableHolder,
+                                           MDNode *TemplateParams,
+                                           StringRef UniqueIdentifier) {
   assert((!Context || Context.isScope() || Context.isType()) &&
          "createClassType should be called with a valid Context");
   // TAG_class_type is encoded in DICompositeType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_class_type),
     File.getFileNode(),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
     ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits),
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    DerivedFrom,
+    DerivedFrom.getRef(),
     Elements,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    VTableHolder,
-    TemplateParams
+    VTableHolder.getRef(),
+    TemplateParams,
+    UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
   };
   DICompositeType R(MDNode::get(VMContext, Elts));
   assert(R.isCompositeType() &&
          "createClassType should return a DICompositeType");
+  if (!UniqueIdentifier.empty())
+    retainType(R);
   return R;
 }
 
@@ -634,27 +644,31 @@ DICompositeType DIBuilder::createStructType(DIDescriptor Context,
                                             unsigned Flags, DIType DerivedFrom,
                                             DIArray Elements,
                                             unsigned RunTimeLang,
-                                            MDNode *VTableHolder) {
+                                            DIType VTableHolder,
+                                            StringRef UniqueIdentifier) {
  // TAG_structure_type is encoded in DICompositeType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_structure_type),
     File.getFileNode(),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    DerivedFrom,
+    DerivedFrom.getRef(),
     Elements,
     ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
-    VTableHolder,
+    VTableHolder.getRef(),
     NULL,
+    UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
   };
   DICompositeType R(MDNode::get(VMContext, Elts));
   assert(R.isCompositeType() &&
          "createStructType should return a DICompositeType");
+  if (!UniqueIdentifier.empty())
+    retainType(R);
   return R;
 }
 
@@ -664,45 +678,52 @@ DICompositeType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
                                            uint64_t SizeInBits,
                                            uint64_t AlignInBits, unsigned Flags,
                                            DIArray Elements,
-                                           unsigned RunTimeLang) {
+                                           unsigned RunTimeLang,
+                                           StringRef UniqueIdentifier) {
   // TAG_union_type is encoded in DICompositeType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_union_type),
     File.getFileNode(),
-    getNonCompileUnitScope(Scope),
+    DIScope(getNonCompileUnitScope(Scope)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
     NULL,
     Elements,
     ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
-    Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    NULL
+    NULL,
+    NULL,
+    UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
   };
-  return DICompositeType(MDNode::get(VMContext, Elts));
+  DICompositeType R(MDNode::get(VMContext, Elts));
+  if (!UniqueIdentifier.empty())
+    retainType(R);
+  return R;
 }
 
 /// createSubroutineType - Create subroutine type.
-DICompositeType
-DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
+DICompositeType DIBuilder::createSubroutineType(DIFile File,
+                                                DIArray ParameterTypes) {
   // TAG_subroutine_type is encoded in DICompositeType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type),
     Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    Constant::getNullValue(Type::getInt32Ty(VMContext)),
+    NULL,
     MDString::get(VMContext, ""),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
     NULL,
     ParameterTypes,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    Constant::getNullValue(Type::getInt32Ty(VMContext))
+    NULL,
+    NULL,
+    NULL  // Type Identifer
   };
   return DICompositeType(MDNode::get(VMContext, Elts));
 }
@@ -712,26 +733,30 @@ DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
 DICompositeType DIBuilder::createEnumerationType(
     DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber,
     uint64_t SizeInBits, uint64_t AlignInBits, DIArray Elements,
-    DIType UnderlyingType) {
+    DIType UnderlyingType, StringRef UniqueIdentifier) {
   // TAG_enumeration_type is encoded in DICompositeType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type),
     File.getFileNode(),
-    getNonCompileUnitScope(Scope),
+    DIScope(getNonCompileUnitScope(Scope)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    UnderlyingType,
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+    UnderlyingType.getRef(),
     Elements,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    Constant::getNullValue(Type::getInt32Ty(VMContext))
+    NULL,
+    NULL,
+    UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
   };
-  MDNode *Node = MDNode::get(VMContext, Elts);
-  AllEnumTypes.push_back(Node);
-  return DICompositeType(Node);
+  DICompositeType CTy(MDNode::get(VMContext, Elts));
+  AllEnumTypes.push_back(CTy);
+  if (!UniqueIdentifier.empty())
+    retainType(CTy);
+  return CTy;
 }
 
 /// createArrayType - Create debugging information entry for an array.
@@ -743,15 +768,17 @@ DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
     NULL, // Filename/Directory,
     NULL, // Unused
     MDString::get(VMContext, ""),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), Size),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    Ty,
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+    Ty.getRef(),
     Subscripts,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    Constant::getNullValue(Type::getInt32Ty(VMContext))
+    NULL,
+    NULL,
+    NULL  // Type Identifer
   };
   return DICompositeType(MDNode::get(VMContext, Elts));
 }
@@ -759,22 +786,23 @@ DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
 /// createVectorType - Create debugging information entry for a vector.
 DICompositeType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
                                             DIType Ty, DIArray Subscripts) {
-
   // A vector is an array type with the FlagVector flag applied.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_array_type),
     NULL, // Filename/Directory,
     NULL, // Unused
     MDString::get(VMContext, ""),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), Size),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), DIType::FlagVector),
-    Ty,
+    Ty.getRef(),
     Subscripts,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    Constant::getNullValue(Type::getInt32Ty(VMContext))
+    NULL,
+    NULL,
+    NULL  // Type Identifer
   };
   return DICompositeType(MDNode::get(VMContext, Elts));
 }
@@ -787,17 +815,14 @@ DIType DIBuilder::createArtificialType(DIType Ty) {
   SmallVector<Value *, 9> Elts;
   MDNode *N = Ty;
   assert (N && "Unexpected input DIType!");
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    if (Value *V = N->getOperand(i))
-      Elts.push_back(V);
-    else
-      Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
-  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    Elts.push_back(N->getOperand(i));
 
   unsigned CurFlags = Ty.getFlags();
   CurFlags = CurFlags | DIType::FlagArtificial;
 
   // Flags are stored at this slot.
+  // FIXME: Add an enum for this magic value.
   Elts[8] =  ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
 
   return DIType(MDNode::get(VMContext, Elts));
@@ -812,17 +837,14 @@ DIType DIBuilder::createObjectPointerType(DIType Ty) {
   SmallVector<Value *, 9> Elts;
   MDNode *N = Ty;
   assert (N && "Unexpected input DIType!");
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    if (Value *V = N->getOperand(i))
-      Elts.push_back(V);
-    else
-      Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
-  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    Elts.push_back(N->getOperand(i));
 
   unsigned CurFlags = Ty.getFlags();
   CurFlags = CurFlags | (DIType::FlagObjectPointer | DIType::FlagArtificial);
 
   // Flags are stored at this slot.
+  // FIXME: Add an enum for this magic value.
   Elts[8] = ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
 
   return DIType(MDNode::get(VMContext, Elts));
@@ -831,7 +853,7 @@ DIType DIBuilder::createObjectPointerType(DIType Ty) {
 /// retainType - Retain DIType in a module even if it is not referenced
 /// through debug info anchors.
 void DIBuilder::retainType(DIType T) {
-  AllRetainTypes.push_back(T);
+  AllRetainTypes.push_back(TrackingVH<MDNode>(T));
 }
 
 /// createUnspecifiedParameter - Create unspeicified type descriptor
@@ -845,31 +867,35 @@ DIDescriptor DIBuilder::createUnspecifiedParameter() {
 
 /// createForwardDecl - Create a temporary forward-declared type that
 /// can be RAUW'd if the full type is seen.
-DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name,
-                                    DIDescriptor Scope, DIFile F,
-                                    unsigned Line, unsigned RuntimeLang,
-                                    uint64_t SizeInBits,
-                                    uint64_t AlignInBits) {
+DICompositeType
+DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope,
+                             DIFile F, unsigned Line, unsigned RuntimeLang,
+                             uint64_t SizeInBits, uint64_t AlignInBits,
+                             StringRef UniqueIdentifier) {
   // Create a temporary MDNode.
   Value *Elts[] = {
     GetTagConstant(VMContext, Tag),
     F.getFileNode(),
-    getNonCompileUnitScope(Scope),
+    DIScope(getNonCompileUnitScope(Scope)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), Line),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt32Ty(VMContext),
-                     DIDescriptor::FlagFwdDecl),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), DIDescriptor::FlagFwdDecl),
     NULL,
     DIArray(),
-    ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang)
+    ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang),
+    NULL,
+    NULL, //TemplateParams
+    UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
   };
   MDNode *Node = MDNode::getTemporary(VMContext, Elts);
-  DIType RetTy(Node);
-  assert(RetTy.isType() &&
+  DICompositeType RetTy(Node);
+  assert(RetTy.isCompositeType() &&
          "createForwardDecl result should be a DIType");
+  if (!UniqueIdentifier.empty())
+    retainType(RetTy);
   return RetTy;
 }
 
@@ -895,10 +921,11 @@ DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Count) {
 }
 
 /// \brief Create a new descriptor for the specified global.
-DIGlobalVariable DIBuilder::
-createGlobalVariable(StringRef Name, StringRef LinkageName, DIFile F,
-                     unsigned LineNumber, DIType Ty, bool isLocalToUnit,
-                     Value *Val) {
+DIGlobalVariable DIBuilder::createGlobalVariable(StringRef Name,
+                                                 StringRef LinkageName,
+                                                 DIFile F, unsigned LineNumber,
+                                                 DIType Ty, bool isLocalToUnit,
+                                                 Value *Val) {
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_variable),
     Constant::getNullValue(Type::getInt32Ty(VMContext)),
@@ -920,19 +947,22 @@ createGlobalVariable(StringRef Name, StringRef LinkageName, DIFile F,
 }
 
 /// \brief Create a new descriptor for the specified global.
-DIGlobalVariable DIBuilder::
-createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber,
-                     DIType Ty, bool isLocalToUnit, Value *Val) {
+DIGlobalVariable DIBuilder::createGlobalVariable(StringRef Name, DIFile F,
+                                                 unsigned LineNumber, DIType Ty,
+                                                 bool isLocalToUnit,
+                                                 Value *Val) {
   return createGlobalVariable(Name, Name, F, LineNumber, Ty, isLocalToUnit,
                               Val);
 }
 
 /// createStaticVariable - Create a new descriptor for the specified static
 /// variable.
-DIGlobalVariable DIBuilder::
-createStaticVariable(DIDescriptor Context, StringRef Name,
-                     StringRef LinkageName, DIFile F, unsigned LineNumber,
-                     DIType Ty, bool isLocalToUnit, Value *Val, MDNode *Decl) {
+DIGlobalVariable DIBuilder::createStaticVariable(DIDescriptor Context,
+                                                 StringRef Name,
+                                                 StringRef LinkageName,
+                                                 DIFile F, unsigned LineNumber,
+                                                 DIType Ty, bool isLocalToUnit,
+                                                 Value *Val, MDNode *Decl) {
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_variable),
     Constant::getNullValue(Type::getInt32Ty(VMContext)),
@@ -1012,24 +1042,38 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope,
 }
 
 /// createFunction - Create a new descriptor for the specified function.
-DISubprogram DIBuilder::createFunction(DIDescriptor Context,
-                                       StringRef Name,
-                                       StringRef LinkageName,
-                                       DIFile File, unsigned LineNo,
-                                       DICompositeType Ty,
+/// FIXME: this is added for dragonegg. Once we update dragonegg
+/// to call resolve function, this will be removed.
+DISubprogram DIBuilder::createFunction(DIScopeRef Context, StringRef Name,
+                                       StringRef LinkageName, DIFile File,
+                                       unsigned LineNo, DICompositeType Ty,
+                                       bool isLocalToUnit, bool isDefinition,
+                                       unsigned ScopeLine, unsigned Flags,
+                                       bool isOptimized, Function *Fn,
+                                       MDNode *TParams, MDNode *Decl) {
+  // dragonegg does not generate identifier for types, so using an empty map
+  // to resolve the context should be fine.
+  DITypeIdentifierMap EmptyMap;
+  return createFunction(Context.resolve(EmptyMap), Name, LinkageName, File,
+                        LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine,
+                        Flags, isOptimized, Fn, TParams, Decl);
+}
+
+/// createFunction - Create a new descriptor for the specified function.
+DISubprogram DIBuilder::createFunction(DIDescriptor Context, StringRef Name,
+                                       StringRef LinkageName, DIFile File,
+                                       unsigned LineNo, DICompositeType Ty,
                                        bool isLocalToUnit, bool isDefinition,
-                                       unsigned ScopeLine,
-                                       unsigned Flags, bool isOptimized,
-                                       Function *Fn,
-                                       MDNode *TParams,
-                                       MDNode *Decl) {
+                                       unsigned ScopeLine, unsigned Flags,
+                                       bool isOptimized, Function *Fn,
+                                       MDNode *TParams, MDNode *Decl) {
   assert(Ty.getTag() == dwarf::DW_TAG_subroutine_type &&
          "function types should be subroutines");
   Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
     File.getFileNode(),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
     MDString::get(VMContext, Name),
     MDString::get(VMContext, LinkageName),
@@ -1059,26 +1103,24 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context,
 }
 
 /// createMethod - Create a new descriptor for the specified C++ method.
-DISubprogram DIBuilder::createMethod(DIDescriptor Context,
-                                     StringRef Name,
-                                     StringRef LinkageName,
-                                     DIFile F,
+DISubprogram DIBuilder::createMethod(DIDescriptor Context, StringRef Name,
+                                     StringRef LinkageName, DIFile F,
                                      unsigned LineNo, DICompositeType Ty,
-                                     bool isLocalToUnit,
-                                     bool isDefinition,
+                                     bool isLocalToUnit, bool isDefinition,
                                      unsigned VK, unsigned VIndex,
-                                     MDNode *VTableHolder,
-                                     unsigned Flags,
-                                     bool isOptimized,
-                                     Function *Fn,
+                                     DIType VTableHolder, unsigned Flags,
+                                     bool isOptimized, Function *Fn,
                                      MDNode *TParam) {
   assert(Ty.getTag() == dwarf::DW_TAG_subroutine_type &&
          "function types should be subroutines");
+  assert(getNonCompileUnitScope(Context) &&
+         "Methods should have both a Context and a context that isn't "
+         "the compile unit.");
   Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
     F.getFileNode(),
-    getNonCompileUnitScope(Context),
+    DIScope(Context).getRef(),
     MDString::get(VMContext, Name),
     MDString::get(VMContext, Name),
     MDString::get(VMContext, LinkageName),
@@ -1086,9 +1128,9 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context,
     Ty,
     ConstantInt::get(Type::getInt1Ty(VMContext), isLocalToUnit),
     ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition),
-    ConstantInt::get(Type::getInt32Ty(VMContext), (unsigned)VK),
+    ConstantInt::get(Type::getInt32Ty(VMContext), VK),
     ConstantInt::get(Type::getInt32Ty(VMContext), VIndex),
-    VTableHolder,
+    VTableHolder.getRef(),
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
     ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
     Fn,
diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp
index d786d33..6bdc09e 100644
--- a/lib/IR/DataLayout.cpp
+++ b/lib/IR/DataLayout.cpp
@@ -629,6 +629,13 @@ Type *DataLayout::getSmallestLegalIntType(LLVMContext &C, unsigned Width) const
   return 0;
 }
 
+unsigned DataLayout::getLargestLegalIntTypeSize() const {
+  unsigned MaxWidth = 0;
+  for (unsigned i = 0, e = (unsigned)LegalIntWidths.size(); i != e; ++i)
+    MaxWidth = std::max<unsigned>(MaxWidth, LegalIntWidths[i]);
+  return MaxWidth;
+}
+
 uint64_t DataLayout::getIndexedOffset(Type *ptrTy,
                                       ArrayRef<Value *> Indices) const {
   Type *Ty = ptrTy;
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index ff37542..70a756f 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -75,8 +75,8 @@ uint64_t DIDescriptor::getUInt64Field(unsigned Elt) const {
     return 0;
 
   if (Elt < DbgNode->getNumOperands())
-    if (ConstantInt *CI
-        = dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
+    if (ConstantInt *CI =
+            dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
       return CI->getZExtValue();
 
   return 0;
@@ -87,8 +87,8 @@ int64_t DIDescriptor::getInt64Field(unsigned Elt) const {
     return 0;
 
   if (Elt < DbgNode->getNumOperands())
-    if (ConstantInt *CI
-        = dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
+    if (ConstantInt *CI =
+            dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
       return CI->getSExtValue();
 
   return 0;
@@ -104,7 +104,7 @@ GlobalVariable *DIDescriptor::getGlobalVariableField(unsigned Elt) const {
     return 0;
 
   if (Elt < DbgNode->getNumOperands())
-      return dyn_cast_or_null<GlobalVariable>(DbgNode->getOperand(Elt));
+    return dyn_cast_or_null<GlobalVariable>(DbgNode->getOperand(Elt));
   return 0;
 }
 
@@ -113,7 +113,7 @@ Constant *DIDescriptor::getConstantField(unsigned Elt) const {
     return 0;
 
   if (Elt < DbgNode->getNumOperands())
-      return dyn_cast_or_null<Constant>(DbgNode->getOperand(Elt));
+    return dyn_cast_or_null<Constant>(DbgNode->getOperand(Elt));
   return 0;
 }
 
@@ -122,7 +122,7 @@ Function *DIDescriptor::getFunctionField(unsigned Elt) const {
     return 0;
 
   if (Elt < DbgNode->getNumOperands())
-      return dyn_cast_or_null<Function>(DbgNode->getOperand(Elt));
+    return dyn_cast_or_null<Function>(DbgNode->getOperand(Elt));
   return 0;
 }
 
@@ -131,19 +131,17 @@ void DIDescriptor::replaceFunctionField(unsigned Elt, Function *F) {
     return;
 
   if (Elt < DbgNode->getNumOperands()) {
-    MDNode *Node = const_cast<MDNode*>(DbgNode);
+    MDNode *Node = const_cast<MDNode *>(DbgNode);
     Node->replaceOperandWith(Elt, F);
   }
 }
 
 unsigned DIVariable::getNumAddrElements() const {
-  return DbgNode->getNumOperands()-8;
+  return DbgNode->getNumOperands() - 8;
 }
 
 /// getInlinedAt - If this variable is inlined then return inline location.
-MDNode *DIVariable::getInlinedAt() const {
-  return getNodeField(DbgNode, 7);
-}
+MDNode *DIVariable::getInlinedAt() const { return getNodeField(DbgNode, 7); }
 
 //===----------------------------------------------------------------------===//
 // Predicates
@@ -152,7 +150,8 @@ MDNode *DIVariable::getInlinedAt() const {
 /// isBasicType - Return true if the specified tag is legal for
 /// DIBasicType.
 bool DIDescriptor::isBasicType() const {
-  if (!DbgNode) return false;
+  if (!DbgNode)
+    return false;
   switch (getTag()) {
   case dwarf::DW_TAG_base_type:
   case dwarf::DW_TAG_unspecified_type:
@@ -164,7 +163,8 @@ bool DIDescriptor::isBasicType() const {
 
 /// isDerivedType - Return true if the specified tag is legal for DIDerivedType.
 bool DIDescriptor::isDerivedType() const {
-  if (!DbgNode) return false;
+  if (!DbgNode)
+    return false;
   switch (getTag()) {
   case dwarf::DW_TAG_typedef:
   case dwarf::DW_TAG_pointer_type:
@@ -187,7 +187,8 @@ bool DIDescriptor::isDerivedType() const {
 /// isCompositeType - Return true if the specified tag is legal for
 /// DICompositeType.
 bool DIDescriptor::isCompositeType() const {
-  if (!DbgNode) return false;
+  if (!DbgNode)
+    return false;
   switch (getTag()) {
   case dwarf::DW_TAG_array_type:
   case dwarf::DW_TAG_structure_type:
@@ -203,7 +204,8 @@ bool DIDescriptor::isCompositeType() const {
 
 /// isVariable - Return true if the specified tag is legal for DIVariable.
 bool DIDescriptor::isVariable() const {
-  if (!DbgNode) return false;
+  if (!DbgNode)
+    return false;
   switch (getTag()) {
   case dwarf::DW_TAG_auto_variable:
   case dwarf::DW_TAG_arg_variable:
@@ -240,17 +242,19 @@ bool DIDescriptor::isUnspecifiedParameter() const {
 /// isScope - Return true if the specified tag is one of the scope
 /// related tag.
 bool DIDescriptor::isScope() const {
-  if (!DbgNode) return false;
+  if (!DbgNode)
+    return false;
   switch (getTag()) {
   case dwarf::DW_TAG_compile_unit:
   case dwarf::DW_TAG_lexical_block:
   case dwarf::DW_TAG_subprogram:
   case dwarf::DW_TAG_namespace:
+  case dwarf::DW_TAG_file_type:
     return true;
   default:
     break;
   }
-  return false;
+  return isType();
 }
 
 /// isTemplateTypeParameter - Return true if the specified tag is
@@ -286,13 +290,13 @@ bool DIDescriptor::isNameSpace() const {
 /// lexical block with an extra file.
 bool DIDescriptor::isLexicalBlockFile() const {
   return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
-    (DbgNode->getNumOperands() == 3);
+         (DbgNode->getNumOperands() == 3);
 }
 
 /// isLexicalBlock - Return true if the specified tag is DW_TAG_lexical_block.
 bool DIDescriptor::isLexicalBlock() const {
   return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
-    (DbgNode->getNumOperands() > 3);
+         (DbgNode->getNumOperands() > 3);
 }
 
 /// isSubrange - Return true if the specified tag is DW_TAG_subrange_type.
@@ -339,10 +343,10 @@ void DIType::replaceAllUsesWith(DIDescriptor &D) {
   // this detail by allowing a value to be replaced with replaceAllUsesWith()
   // itself.
   if (DbgNode != D) {
-    MDNode *Node = const_cast<MDNode*>(DbgNode);
+    MDNode *Node = const_cast<MDNode *>(DbgNode);
     const MDNode *DN = D;
     const Value *V = cast_or_null<Value>(DN);
-    Node->replaceAllUsesWith(const_cast<Value*>(V));
+    Node->replaceAllUsesWith(const_cast<Value *>(V));
     MDNode::deleteTemporary(Node);
   }
 }
@@ -359,31 +363,14 @@ void DIType::replaceAllUsesWith(MDNode *D) {
   // this detail by allowing a value to be replaced with replaceAllUsesWith()
   // itself.
   if (DbgNode != D) {
-    MDNode *Node = const_cast<MDNode*>(DbgNode);
+    MDNode *Node = const_cast<MDNode *>(DbgNode);
     const MDNode *DN = D;
     const Value *V = cast_or_null<Value>(DN);
-    Node->replaceAllUsesWith(const_cast<Value*>(V));
+    Node->replaceAllUsesWith(const_cast<Value *>(V));
     MDNode::deleteTemporary(Node);
   }
 }
 
-/// isUnsignedDIType - Return true if type encoding is unsigned.
-bool DIType::isUnsignedDIType() {
-  DIDerivedType DTy(DbgNode);
-  if (DTy.Verify())
-    return DTy.getTypeDerivedFrom().isUnsignedDIType();
-
-  DIBasicType BTy(DbgNode);
-  if (BTy.Verify()) {
-    unsigned Encoding = BTy.getEncoding();
-    if (Encoding == dwarf::DW_ATE_unsigned ||
-        Encoding == dwarf::DW_ATE_unsigned_char ||
-        Encoding == dwarf::DW_ATE_boolean)
-      return true;
-  }
-  return false;
-}
-
 /// Verify - Verify that a compile unit is well formed.
 bool DICompileUnit::Verify() const {
   if (!isCompileUnit())
@@ -413,22 +400,53 @@ static bool fieldIsMDNode(const MDNode *DbgNode, unsigned Elt) {
   // FIXME: This function should return true, if the field is null or the field
   // is indeed a MDNode: return !Fld || isa<MDNode>(Fld).
   Value *Fld = getField(DbgNode, Elt);
-  if (Fld && isa<MDString>(Fld) &&
-      !cast<MDString>(Fld)->getString().empty())
+  if (Fld && isa<MDString>(Fld) && !cast<MDString>(Fld)->getString().empty())
     return false;
   return true;
 }
 
+/// Check if a field at position Elt of a MDNode is a MDString.
+static bool fieldIsMDString(const MDNode *DbgNode, unsigned Elt) {
+  Value *Fld = getField(DbgNode, Elt);
+  return !Fld || isa<MDString>(Fld);
+}
+
+/// Check if a value can be a reference to a type.
+static bool isTypeRef(const Value *Val) {
+  return !Val ||
+         (isa<MDString>(Val) && !cast<MDString>(Val)->getString().empty()) ||
+         (isa<MDNode>(Val) && DIType(cast<MDNode>(Val)).isType());
+}
+
+/// Check if a field at position Elt of a MDNode can be a reference to a type.
+static bool fieldIsTypeRef(const MDNode *DbgNode, unsigned Elt) {
+  Value *Fld = getField(DbgNode, Elt);
+  return isTypeRef(Fld);
+}
+
+/// Check if a value can be a ScopeRef.
+static bool isScopeRef(const Value *Val) {
+  return !Val ||
+         (isa<MDString>(Val) && !cast<MDString>(Val)->getString().empty()) ||
+         (isa<MDNode>(Val) && DIScope(cast<MDNode>(Val)).isScope());
+}
+
+/// Check if a field at position Elt of a MDNode can be a ScopeRef.
+static bool fieldIsScopeRef(const MDNode *DbgNode, unsigned Elt) {
+  Value *Fld = getField(DbgNode, Elt);
+  return isScopeRef(Fld);
+}
+
 /// Verify - Verify that a type descriptor is well formed.
 bool DIType::Verify() const {
   if (!isType())
     return false;
   // Make sure Context @ field 2 is MDNode.
-  if (!fieldIsMDNode(DbgNode, 2))
+  if (!fieldIsScopeRef(DbgNode, 2))
     return false;
 
   // FIXME: Sink this into the various subclass verifies.
-  unsigned Tag = getTag();
+  uint16_t Tag = getTag();
   if (!isBasicType() && Tag != dwarf::DW_TAG_const_type &&
       Tag != dwarf::DW_TAG_volatile_type && Tag != dwarf::DW_TAG_pointer_type &&
       Tag != dwarf::DW_TAG_ptr_to_member_type &&
@@ -460,12 +478,12 @@ bool DIBasicType::Verify() const {
 
 /// Verify - Verify that a derived type descriptor is well formed.
 bool DIDerivedType::Verify() const {
-  // Make sure DerivedFrom @ field 9 is MDNode.
-  if (!fieldIsMDNode(DbgNode, 9))
+  // Make sure DerivedFrom @ field 9 is TypeRef.
+  if (!fieldIsTypeRef(DbgNode, 9))
     return false;
   if (getTag() == dwarf::DW_TAG_ptr_to_member_type)
-    // Make sure ClassType @ field 10 is MDNode.
-    if (!fieldIsMDNode(DbgNode, 10))
+    // Make sure ClassType @ field 10 is a TypeRef.
+    if (!fieldIsTypeRef(DbgNode, 10))
       return false;
 
   return isDerivedType() && DbgNode->getNumOperands() >= 10 &&
@@ -477,13 +495,17 @@ bool DICompositeType::Verify() const {
   if (!isCompositeType())
     return false;
 
-  // Make sure DerivedFrom @ field 9 and ContainingType @ field 12 are MDNodes.
-  if (!fieldIsMDNode(DbgNode, 9))
+  // Make sure DerivedFrom @ field 9 and ContainingType @ field 12 are TypeRef.
+  if (!fieldIsTypeRef(DbgNode, 9))
     return false;
-  if (!fieldIsMDNode(DbgNode, 12))
+  if (!fieldIsTypeRef(DbgNode, 12))
     return false;
 
-  return DbgNode->getNumOperands() >= 10 && DbgNode->getNumOperands() <= 14;
+  // Make sure the type identifier at field 14 is MDString, it can be null.
+  if (!fieldIsMDString(DbgNode, 14))
+    return false;
+
+  return DbgNode->getNumOperands() == 15;
 }
 
 /// Verify - Verify that a subprogram descriptor is well formed.
@@ -491,13 +513,13 @@ bool DISubprogram::Verify() const {
   if (!isSubprogram())
     return false;
 
-  // Make sure context @ field 2 and type @ field 7 are MDNodes.
-  if (!fieldIsMDNode(DbgNode, 2))
+  // Make sure context @ field 2 is a ScopeRef and type @ field 7 is a MDNode.
+  if (!fieldIsScopeRef(DbgNode, 2))
     return false;
   if (!fieldIsMDNode(DbgNode, 7))
     return false;
   // Containing type @ field 12.
-  if (!fieldIsMDNode(DbgNode, 12))
+  if (!fieldIsTypeRef(DbgNode, 12))
     return false;
   return DbgNode->getNumOperands() == 20;
 }
@@ -550,9 +572,7 @@ bool DINameSpace::Verify() const {
 }
 
 /// \brief Retrieve the MDNode for the directory/file pair.
-MDNode *DIFile::getFileNode() const {
-  return getNodeField(DbgNode, 1);
-}
+MDNode *DIFile::getFileNode() const { return getNodeField(DbgNode, 1); }
 
 /// \brief Verify that the file descriptor is well formed.
 bool DIFile::Verify() const {
@@ -595,56 +615,77 @@ bool DIImportedEntity::Verify() const {
          (DbgNode->getNumOperands() == 4 || DbgNode->getNumOperands() == 5);
 }
 
-/// getOriginalTypeSize - If this type is derived from a base type then
-/// return base type size.
-uint64_t DIDerivedType::getOriginalTypeSize() const {
-  unsigned Tag = getTag();
-
-  if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
-      Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
-      Tag != dwarf::DW_TAG_restrict_type)
-    return getSizeInBits();
-
-  DIType BaseType = getTypeDerivedFrom();
-
-  // If this type is not derived from any type then take conservative approach.
-  if (!BaseType.isValid())
-    return getSizeInBits();
-
-  // If this is a derived type, go ahead and get the base type, unless it's a
-  // reference then it's just the size of the field. Pointer types have no need
-  // of this since they're a different type of qualification on the type.
-  if (BaseType.getTag() == dwarf::DW_TAG_reference_type ||
-      BaseType.getTag() == dwarf::DW_TAG_rvalue_reference_type)
-    return getSizeInBits();
-
-  if (BaseType.isDerivedType())
-    return DIDerivedType(BaseType).getOriginalTypeSize();
-
-  return BaseType.getSizeInBits();
-}
-
 /// getObjCProperty - Return property node, if this ivar is associated with one.
 MDNode *DIDerivedType::getObjCProperty() const {
   return getNodeField(DbgNode, 10);
 }
 
+MDString *DICompositeType::getIdentifier() const {
+  return cast_or_null<MDString>(getField(DbgNode, 14));
+}
+
+#ifndef NDEBUG
+static void VerifySubsetOf(const MDNode *LHS, const MDNode *RHS) {
+  for (unsigned i = 0; i != LHS->getNumOperands(); ++i) {
+    // Skip the 'empty' list (that's a single i32 0, rather than truly empty).
+    if (i == 0 && isa<ConstantInt>(LHS->getOperand(i)))
+      continue;
+    const MDNode *E = cast<MDNode>(LHS->getOperand(i));
+    bool found = false;
+    for (unsigned j = 0; !found && j != RHS->getNumOperands(); ++j)
+      found = E == RHS->getOperand(j);
+    assert(found && "Losing a member during member list replacement");
+  }
+}
+#endif
+
 /// \brief Set the array of member DITypes.
 void DICompositeType::setTypeArray(DIArray Elements, DIArray TParams) {
-  assert((!TParams || DbgNode->getNumOperands() == 14) &&
+  assert((!TParams || DbgNode->getNumOperands() == 15) &&
          "If you're setting the template parameters this should include a slot "
          "for that!");
   TrackingVH<MDNode> N(*this);
-  N->replaceOperandWith(10, Elements);
+  if (Elements) {
+#ifndef NDEBUG
+    // Check that the new list of members contains all the old members as well.
+    if (const MDNode *El = cast_or_null<MDNode>(N->getOperand(10)))
+      VerifySubsetOf(El, Elements);
+#endif
+    N->replaceOperandWith(10, Elements);
+  }
   if (TParams)
     N->replaceOperandWith(13, TParams);
   DbgNode = N;
 }
 
+void DICompositeType::addMember(DIDescriptor D) {
+  SmallVector<llvm::Value *, 16> M;
+  DIArray OrigM = getTypeArray();
+  unsigned Elements = OrigM.getNumElements();
+  if (Elements == 1 && !OrigM.getElement(0))
+    Elements = 0;
+  M.reserve(Elements + 1);
+  for (unsigned i = 0; i != Elements; ++i)
+    M.push_back(OrigM.getElement(i));
+  M.push_back(D);
+  setTypeArray(DIArray(MDNode::get(DbgNode->getContext(), M)));
+}
+
+/// Generate a reference to this DIType. Uses the type identifier instead
+/// of the actual MDNode if possible, to help type uniquing.
+DIScopeRef DIScope::getRef() const {
+  if (!isCompositeType())
+    return DIScopeRef(*this);
+  DICompositeType DTy(DbgNode);
+  if (!DTy.getIdentifier())
+    return DIScopeRef(*this);
+  return DIScopeRef(DTy.getIdentifier());
+}
+
 /// \brief Set the containing type.
 void DICompositeType::setContainingType(DICompositeType ContainingType) {
   TrackingVH<MDNode> N(*this);
-  N->replaceOperandWith(12, ContainingType);
+  N->replaceOperandWith(12, ContainingType.getRef());
   DbgNode = N;
 }
 
@@ -674,7 +715,7 @@ bool DISubprogram::describes(const Function *F) {
 }
 
 unsigned DISubprogram::isOptimized() const {
-  assert (DbgNode && "Invalid subprogram descriptor!");
+  assert(DbgNode && "Invalid subprogram descriptor!");
   if (DbgNode->getNumOperands() == 15)
     return getUnsignedField(14);
   return 0;
@@ -694,25 +735,39 @@ Value *DITemplateValueParameter::getValue() const {
 
 // If the current node has a parent scope then return that,
 // else return an empty scope.
-DIScope DIScope::getContext() const {
+DIScopeRef DIScope::getContext() const {
 
   if (isType())
     return DIType(DbgNode).getContext();
 
   if (isSubprogram())
-    return DISubprogram(DbgNode).getContext();
+    return DIScopeRef(DISubprogram(DbgNode).getContext());
 
   if (isLexicalBlock())
-    return DILexicalBlock(DbgNode).getContext();
+    return DIScopeRef(DILexicalBlock(DbgNode).getContext());
 
   if (isLexicalBlockFile())
-    return DILexicalBlockFile(DbgNode).getContext();
+    return DIScopeRef(DILexicalBlockFile(DbgNode).getContext());
 
   if (isNameSpace())
-    return DINameSpace(DbgNode).getContext();
+    return DIScopeRef(DINameSpace(DbgNode).getContext());
 
   assert((isFile() || isCompileUnit()) && "Unhandled type of scope.");
-  return DIScope();
+  return DIScopeRef(NULL);
+}
+
+// If the scope node has a name, return that, else return an empty string.
+StringRef DIScope::getName() const {
+  if (isType())
+    return DIType(DbgNode).getName();
+  if (isSubprogram())
+    return DISubprogram(DbgNode).getName();
+  if (isNameSpace())
+    return DINameSpace(DbgNode).getName();
+  assert((isLexicalBlock() || isLexicalBlockFile() || isFile() ||
+          isCompileUnit()) &&
+         "Unhandled type of scope.");
+  return StringRef();
 }
 
 StringRef DIScope::getFilename() const {
@@ -748,7 +803,6 @@ DIArray DICompileUnit::getSubprograms() const {
   return DIArray(getNodeField(DbgNode, 9));
 }
 
-
 DIArray DICompileUnit::getGlobalVariables() const {
   if (!DbgNode || DbgNode->getNumOperands() < 13)
     return DIArray();
@@ -813,8 +867,7 @@ DIVariable llvm::createInlinedVariable(MDNode *DV, MDNode *InlinedScope,
   SmallVector<Value *, 16> Elts;
   // Insert inlined scope as 7th element.
   for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
-    i == 7 ? Elts.push_back(InlinedScope) :
-             Elts.push_back(DV->getOperand(i));
+    i == 7 ? Elts.push_back(InlinedScope) : Elts.push_back(DV->getOperand(i));
   return DIVariable(MDNode::get(VMContext, Elts));
 }
 
@@ -823,9 +876,8 @@ DIVariable llvm::cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext) {
   SmallVector<Value *, 16> Elts;
   // Insert inlined scope as 7th element.
   for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
-    i == 7 ?
-      Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))):
-      Elts.push_back(DV->getOperand(i));
+    i == 7 ? Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)))
+           : Elts.push_back(DV->getOperand(i));
   return DIVariable(MDNode::get(VMContext, Elts));
 }
 
@@ -849,23 +901,42 @@ DICompositeType llvm::getDICompositeType(DIType T) {
   if (T.isCompositeType())
     return DICompositeType(T);
 
-  if (T.isDerivedType())
-    return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom());
+  if (T.isDerivedType()) {
+    // This function is currently used by dragonegg and dragonegg does
+    // not generate identifier for types, so using an empty map to resolve
+    // DerivedFrom should be fine.
+    DITypeIdentifierMap EmptyMap;
+    return getDICompositeType(
+        DIDerivedType(T).getTypeDerivedFrom().resolve(EmptyMap));
+  }
 
   return DICompositeType();
 }
 
-/// isSubprogramContext - Return true if Context is either a subprogram
-/// or another context nested inside a subprogram.
-bool llvm::isSubprogramContext(const MDNode *Context) {
-  if (!Context)
-    return false;
-  DIDescriptor D(Context);
-  if (D.isSubprogram())
-    return true;
-  if (D.isType())
-    return isSubprogramContext(DIType(Context).getContext());
-  return false;
+/// Update DITypeIdentifierMap by going through retained types of each CU.
+DITypeIdentifierMap
+llvm::generateDITypeIdentifierMap(const NamedMDNode *CU_Nodes) {
+  DITypeIdentifierMap Map;
+  for (unsigned CUi = 0, CUe = CU_Nodes->getNumOperands(); CUi != CUe; ++CUi) {
+    DICompileUnit CU(CU_Nodes->getOperand(CUi));
+    DIArray Retain = CU.getRetainedTypes();
+    for (unsigned Ti = 0, Te = Retain.getNumElements(); Ti != Te; ++Ti) {
+      if (!Retain.getElement(Ti).isCompositeType())
+        continue;
+      DICompositeType Ty(Retain.getElement(Ti));
+      if (MDString *TypeId = Ty.getIdentifier()) {
+        // Definition has priority over declaration.
+        // Try to insert (TypeId, Ty) to Map.
+        std::pair<DITypeIdentifierMap::iterator, bool> P =
+            Map.insert(std::make_pair(TypeId, Ty));
+        // If TypeId already exists in Map and this is a definition, replace
+        // whatever we had (declaration or definition) with the definition.
+        if (!P.second && !Ty.isForwardDecl())
+          P.first->second = Ty;
+      }
+    }
+  }
+  return Map;
 }
 
 //===----------------------------------------------------------------------===//
@@ -879,10 +950,21 @@ void DebugInfoFinder::reset() {
   TYs.clear();
   Scopes.clear();
   NodesSeen.clear();
+  TypeIdentifierMap.clear();
+  TypeMapInitialized = false;
+}
+
+void DebugInfoFinder::InitializeTypeMap(const Module &M) {
+  if (!TypeMapInitialized)
+    if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu")) {
+      TypeIdentifierMap = generateDITypeIdentifierMap(CU_Nodes);
+      TypeMapInitialized = true;
+    }
 }
 
 /// processModule - Process entire module and collect debug info.
 void DebugInfoFinder::processModule(const Module &M) {
+  InitializeTypeMap(M);
   if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu")) {
     for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
       DICompileUnit CU(CU_Nodes->getOperand(i));
@@ -904,27 +986,38 @@ void DebugInfoFinder::processModule(const Module &M) {
       DIArray RetainedTypes = CU.getRetainedTypes();
       for (unsigned i = 0, e = RetainedTypes.getNumElements(); i != e; ++i)
         processType(DIType(RetainedTypes.getElement(i)));
-      // FIXME: We really shouldn't be bailing out after visiting just one CU
-      return;
+      DIArray Imports = CU.getImportedEntities();
+      for (unsigned i = 0, e = Imports.getNumElements(); i != e; ++i) {
+        DIImportedEntity Import = DIImportedEntity(Imports.getElement(i));
+        DIDescriptor Entity = Import.getEntity();
+        if (Entity.isType())
+          processType(DIType(Entity));
+        else if (Entity.isSubprogram())
+          processSubprogram(DISubprogram(Entity));
+        else if (Entity.isNameSpace())
+          processScope(DINameSpace(Entity).getContext());
+      }
     }
   }
 }
 
 /// processLocation - Process DILocation.
-void DebugInfoFinder::processLocation(DILocation Loc) {
-  if (!Loc) return;
+void DebugInfoFinder::processLocation(const Module &M, DILocation Loc) {
+  if (!Loc)
+    return;
+  InitializeTypeMap(M);
   processScope(Loc.getScope());
-  processLocation(Loc.getOrigLocation());
+  processLocation(M, Loc.getOrigLocation());
 }
 
 /// processType - Process DIType.
 void DebugInfoFinder::processType(DIType DT) {
   if (!addType(DT))
     return;
-  processScope(DT.getContext());
+  processScope(DT.getContext().resolve(TypeIdentifierMap));
   if (DT.isCompositeType()) {
     DICompositeType DCT(DT);
-    processType(DCT.getTypeDerivedFrom());
+    processType(DCT.getTypeDerivedFrom().resolve(TypeIdentifierMap));
     DIArray DA = DCT.getTypeArray();
     for (unsigned i = 0, e = DA.getNumElements(); i != e; ++i) {
       DIDescriptor D = DA.getElement(i);
@@ -935,7 +1028,7 @@ void DebugInfoFinder::processType(DIType DT) {
     }
   } else if (DT.isDerivedType()) {
     DIDerivedType DDT(DT);
-    processType(DDT.getTypeDerivedFrom());
+    processType(DDT.getTypeDerivedFrom().resolve(TypeIdentifierMap));
   }
 }
 
@@ -975,8 +1068,7 @@ void DebugInfoFinder::processLexicalBlock(DILexicalBlock LB) {
   else if (Context.isLexicalBlockFile()) {
     DILexicalBlockFile DBF = DILexicalBlockFile(Context);
     return processLexicalBlock(DILexicalBlock(DBF.getScope()));
-  }
-  else
+  } else
     return processSubprogram(DISubprogram(Context));
 }
 
@@ -984,14 +1076,30 @@ void DebugInfoFinder::processLexicalBlock(DILexicalBlock LB) {
 void DebugInfoFinder::processSubprogram(DISubprogram SP) {
   if (!addSubprogram(SP))
     return;
-  processScope(SP.getContext());
+  processScope(SP.getContext().resolve(TypeIdentifierMap));
   processType(SP.getType());
+  DIArray TParams = SP.getTemplateParams();
+  for (unsigned I = 0, E = TParams.getNumElements(); I != E; ++I) {
+    DIDescriptor Element = TParams.getElement(I);
+    if (Element.isTemplateTypeParameter()) {
+      DITemplateTypeParameter TType(Element);
+      processScope(TType.getContext().resolve(TypeIdentifierMap));
+      processType(TType.getType().resolve(TypeIdentifierMap));
+    } else if (Element.isTemplateValueParameter()) {
+      DITemplateValueParameter TVal(Element);
+      processScope(TVal.getContext().resolve(TypeIdentifierMap));
+      processType(TVal.getType().resolve(TypeIdentifierMap));
+    }
+  }
 }
 
 /// processDeclare - Process DbgDeclareInst.
-void DebugInfoFinder::processDeclare(const DbgDeclareInst *DDI) {
+void DebugInfoFinder::processDeclare(const Module &M,
+                                     const DbgDeclareInst *DDI) {
   MDNode *N = dyn_cast<MDNode>(DDI->getVariable());
-  if (!N) return;
+  if (!N)
+    return;
+  InitializeTypeMap(M);
 
   DIDescriptor DV(N);
   if (!DV.isVariable())
@@ -1003,9 +1111,11 @@ void DebugInfoFinder::processDeclare(const DbgDeclareInst *DDI) {
   processType(DIVariable(N).getType());
 }
 
-void DebugInfoFinder::processValue(const DbgValueInst *DVI) {
+void DebugInfoFinder::processValue(const Module &M, const DbgValueInst *DVI) {
   MDNode *N = dyn_cast<MDNode>(DVI->getVariable());
-  if (!N) return;
+  if (!N)
+    return;
+  InitializeTypeMap(M);
 
   DIDescriptor DV(N);
   if (!DV.isVariable())
@@ -1083,12 +1193,14 @@ bool DebugInfoFinder::addScope(DIScope Scope) {
 
 /// dump - Print descriptor to dbgs() with a newline.
 void DIDescriptor::dump() const {
-  print(dbgs()); dbgs() << '\n';
+  print(dbgs());
+  dbgs() << '\n';
 }
 
 /// print - Print descriptor.
 void DIDescriptor::print(raw_ostream &OS) const {
-  if (!DbgNode) return;
+  if (!DbgNode)
+    return;
 
   if (const char *Tag = dwarf::TagString(getTag()))
     OS << "[ " << Tag << " ]";
@@ -1150,7 +1262,8 @@ void DIEnumerator::printInternal(raw_ostream &OS) const {
 }
 
 void DIType::printInternal(raw_ostream &OS) const {
-  if (!DbgNode) return;
+  if (!DbgNode)
+    return;
 
   StringRef Res = getName();
   if (!Res.empty())
@@ -1158,13 +1271,11 @@ void DIType::printInternal(raw_ostream &OS) const {
 
   // TODO: Print context?
 
-  OS << " [line " << getLineNumber()
-     << ", size " << getSizeInBits()
-     << ", align " << getAlignInBits()
-     << ", offset " << getOffsetInBits();
+  OS << " [line " << getLineNumber() << ", size " << getSizeInBits()
+     << ", align " << getAlignInBits() << ", offset " << getOffsetInBits();
   if (isBasicType())
     if (const char *Enc =
-        dwarf::AttributeEncodingString(DIBasicType(DbgNode).getEncoding()))
+            dwarf::AttributeEncodingString(DIBasicType(DbgNode).getEncoding()))
       OS << ", enc " << Enc;
   OS << "]";
 
@@ -1260,16 +1371,15 @@ void DIObjCProperty::printInternal(raw_ostream &OS) const {
   if (!Name.empty())
     OS << " [" << Name << ']';
 
-  OS << " [line " << getLineNumber()
-     << ", properties " << getUnsignedField(6) << ']';
+  OS << " [line " << getLineNumber() << ", properties " << getUnsignedField(6)
+     << ']';
 }
 
 static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS,
                           const LLVMContext &Ctx) {
-  if (!DL.isUnknown()) {          // Print source line info.
+  if (!DL.isUnknown()) { // Print source line info.
     DIScope Scope(DL.getScope(Ctx));
-    assert(Scope.isScope() &&
-      "Scope of a DebugLoc should be a DIScope.");
+    assert(Scope.isScope() && "Scope of a DebugLoc should be a DIScope.");
     // Omit the directory, because it's likely to be long and uninteresting.
     CommentOS << Scope.getFilename();
     CommentOS << ':' << DL.getLine();
@@ -1298,3 +1408,81 @@ void DIVariable::printExtendedName(raw_ostream &OS) const {
     }
   }
 }
+
+/// Specialize constructor to make sure it has the correct type.
+template <> DIRef<DIScope>::DIRef(const Value *V) : Val(V) {
+  assert(isScopeRef(V) && "DIScopeRef should be a MDString or MDNode");
+}
+template <> DIRef<DIType>::DIRef(const Value *V) : Val(V) {
+  assert(isTypeRef(V) && "DITypeRef should be a MDString or MDNode");
+}
+
+/// Specialize getFieldAs to handle fields that are references to DIScopes.
+template <>
+DIScopeRef DIDescriptor::getFieldAs<DIScopeRef>(unsigned Elt) const {
+  return DIScopeRef(getField(DbgNode, Elt));
+}
+/// Specialize getFieldAs to handle fields that are references to DITypes.
+template <> DITypeRef DIDescriptor::getFieldAs<DITypeRef>(unsigned Elt) const {
+  return DITypeRef(getField(DbgNode, Elt));
+}
+
+/// Strip debug info in the module if it exists.
+/// To do this, we remove all calls to the debugger intrinsics and any named
+/// metadata for debugging. We also remove debug locations for instructions.
+/// Return true if module is modified.
+bool llvm::StripDebugInfo(Module &M) {
+
+  bool Changed = false;
+
+  // Remove all of the calls to the debugger intrinsics, and remove them from
+  // the module.
+  if (Function *Declare = M.getFunction("llvm.dbg.declare")) {
+    while (!Declare->use_empty()) {
+      CallInst *CI = cast<CallInst>(Declare->use_back());
+      CI->eraseFromParent();
+    }
+    Declare->eraseFromParent();
+    Changed = true;
+  }
+
+  if (Function *DbgVal = M.getFunction("llvm.dbg.value")) {
+    while (!DbgVal->use_empty()) {
+      CallInst *CI = cast<CallInst>(DbgVal->use_back());
+      CI->eraseFromParent();
+    }
+    DbgVal->eraseFromParent();
+    Changed = true;
+  }
+
+  for (Module::named_metadata_iterator NMI = M.named_metadata_begin(),
+         NME = M.named_metadata_end(); NMI != NME;) {
+    NamedMDNode *NMD = NMI;
+    ++NMI;
+    if (NMD->getName().startswith("llvm.dbg.")) {
+      NMD->eraseFromParent();
+      Changed = true;
+    }
+  }
+
+  for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI)
+    for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE;
+         ++FI)
+      for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE;
+           ++BI) {
+        if (!BI->getDebugLoc().isUnknown()) {
+          Changed = true;
+          BI->setDebugLoc(DebugLoc());
+        }
+      }
+
+  return Changed;
+}
+
+/// Return Debug Info Metadata Version by checking module flags.
+unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
+  Value *Val = M.getModuleFlag("Debug Info Version");
+  if (!Val)
+    return 0;
+  return cast<ConstantInt>(Val)->getZExtValue();
+}
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index bf9d949..e8a2402 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -276,6 +276,9 @@ void Function::dropAllReferences() {
   // blockaddresses, but BasicBlock's destructor takes care of those.
   while (!BasicBlocks.empty())
     BasicBlocks.begin()->eraseFromParent();
+
+  // Prefix data is stored in a side table.
+  setPrefixData(0);
 }
 
 void Function::addAttribute(unsigned i, Attribute::AttrKind attr) {
@@ -351,6 +354,10 @@ void Function::copyAttributesFrom(const GlobalValue *Src) {
     setGC(SrcF->getGC());
   else
     clearGC();
+  if (SrcF->hasPrefixData())
+    setPrefixData(SrcF->getPrefixData());
+  else
+    setPrefixData(0);
 }
 
 /// getIntrinsicID - This method returns the ID number of the specified
@@ -446,7 +453,9 @@ enum IIT_Info {
   IIT_STRUCT5 = 22,
   IIT_EXTEND_VEC_ARG = 23,
   IIT_TRUNC_VEC_ARG = 24,
-  IIT_ANYPTR = 25
+  IIT_ANYPTR = 25,
+  IIT_V1   = 26,
+  IIT_VARARG = 27
 };
 
 
@@ -460,6 +469,9 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
   case IIT_Done:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Void, 0));
     return;
+  case IIT_VARARG:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::VarArg, 0));
+    return;
   case IIT_MMX:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::MMX, 0));
     return;
@@ -490,6 +502,10 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
   case IIT_I64:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 64));
     return;
+  case IIT_V1:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 1));
+    DecodeIITType(NextElt, Infos, OutputTable);
+    return;
   case IIT_V2:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 2));
     DecodeIITType(NextElt, Infos, OutputTable);
@@ -601,6 +617,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
 
   switch (D.Kind) {
   case IITDescriptor::Void: return Type::getVoidTy(Context);
+  case IITDescriptor::VarArg: return Type::getVoidTy(Context);
   case IITDescriptor::MMX: return Type::getX86_MMXTy(Context);
   case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
   case IITDescriptor::Half: return Type::getHalfTy(Context);
@@ -720,3 +737,32 @@ bool Function::callsFunctionThatReturnsTwice() const {
 
   return false;
 }
+
+Constant *Function::getPrefixData() const {
+  assert(hasPrefixData());
+  const LLVMContextImpl::PrefixDataMapTy &PDMap =
+      getContext().pImpl->PrefixDataMap;
+  assert(PDMap.find(this) != PDMap.end());
+  return cast<Constant>(PDMap.find(this)->second->getReturnValue());
+}
+
+void Function::setPrefixData(Constant *PrefixData) {
+  if (!PrefixData && !hasPrefixData())
+    return;
+
+  unsigned SCData = getSubclassDataFromValue();
+  LLVMContextImpl::PrefixDataMapTy &PDMap = getContext().pImpl->PrefixDataMap;
+  ReturnInst *&PDHolder = PDMap[this];
+  if (PrefixData) {
+    if (PDHolder)
+      PDHolder->setOperand(0, PrefixData);
+    else
+      PDHolder = ReturnInst::Create(getContext(), PrefixData);
+    SCData |= 2;
+  } else {
+    delete PDHolder;
+    PDMap.erase(this);
+    SCData &= ~2;
+  }
+  setValueSubclassData(SCData);
+}
diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp
index e9baa5c..f0f8c7d 100644
--- a/lib/IR/GCOV.cpp
+++ b/lib/IR/GCOV.cpp
@@ -7,14 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// GCOV implements the interface to read and write coverage files that use 
+// GCOV implements the interface to read and write coverage files that use
 // 'gcov' format.
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/GCOV.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/system_error.h"
 using namespace llvm;
@@ -43,25 +45,45 @@ bool GCOVFile::read(GCOVBuffer &Buffer) {
   if (Format == GCOV::InvalidGCOV)
     return false;
 
-  unsigned i = 0;
-  while (1) {
-    GCOVFunction *GFun = NULL;
-    if (isGCDAFile(Format)) {
-      // Use existing function while reading .gcda file.
-      assert(i < Functions.size() && ".gcda data does not match .gcno data");
-      GFun = Functions[i];
-    } else if (isGCNOFile(Format)){
-      GFun = new GCOVFunction();
+  if (isGCNOFile(Format)) {
+    while (true) {
+      if (!Buffer.readFunctionTag()) break;
+      GCOVFunction *GFun = new GCOVFunction();
+      if (!GFun->read(Buffer, Format))
+        return false;
       Functions.push_back(GFun);
     }
-    if (!GFun || !GFun->read(Buffer, Format))
-      break;
-    ++i;
   }
+  else if (isGCDAFile(Format)) {
+    for (size_t i = 0, e = Functions.size(); i < e; ++i) {
+      if (!Buffer.readFunctionTag()) {
+        errs() << "Unexpected number of functions.\n";
+        return false;
+      }
+      if (!Functions[i]->read(Buffer, Format))
+        return false;
+    }
+    if (Buffer.readObjectTag()) {
+      uint32_t Length;
+      uint32_t Dummy;
+      if (!Buffer.readInt(Length)) return false;
+      if (!Buffer.readInt(Dummy)) return false; // checksum
+      if (!Buffer.readInt(Dummy)) return false; // num
+      if (!Buffer.readInt(RunCount)) return false;;
+      Buffer.advanceCursor(Length-3);
+    }
+    while (Buffer.readProgramTag()) {
+      uint32_t Length;
+      if (!Buffer.readInt(Length)) return false;
+      Buffer.advanceCursor(Length);
+      ++ProgramCount;
+    }
+  }
+
   return true;
 }
 
-/// dump - Dump GCOVFile content on standard out for debugging purposes.
+/// dump - Dump GCOVFile content to dbgs() for debugging purposes.
 void GCOVFile::dump() {
   for (SmallVectorImpl<GCOVFunction *>::iterator I = Functions.begin(),
          E = Functions.end(); I != E; ++I)
@@ -72,9 +94,10 @@ void GCOVFile::dump() {
 /// reading .gcno and .gcda files.
 void GCOVFile::collectLineCounts(FileInfo &FI) {
   for (SmallVectorImpl<GCOVFunction *>::iterator I = Functions.begin(),
-         E = Functions.end(); I != E; ++I) 
+         E = Functions.end(); I != E; ++I)
     (*I)->collectLineCounts(FI);
-  FI.print();
+  FI.setRunCount(RunCount);
+  FI.setProgramCount(ProgramCount);
 }
 
 //===----------------------------------------------------------------------===//
@@ -85,76 +108,121 @@ GCOVFunction::~GCOVFunction() {
   DeleteContainerPointers(Blocks);
 }
 
-/// read - Read a aunction from the buffer. Return false if buffer cursor
+/// read - Read a function from the buffer. Return false if buffer cursor
 /// does not point to a function tag.
 bool GCOVFunction::read(GCOVBuffer &Buff, GCOV::GCOVFormat Format) {
-  if (!Buff.readFunctionTag())
-    return false;
-
-  Buff.readInt(); // Function header length
-  Ident = Buff.readInt(); 
-  Buff.readInt(); // Checksum #1
+  uint32_t Dummy;
+  if (!Buff.readInt(Dummy)) return false; // Function header length
+  if (!Buff.readInt(Ident)) return false;
+  if (!Buff.readInt(Dummy)) return false; // Checksum #1
   if (Format != GCOV::GCNO_402 && Format != GCOV::GCDA_402)
-    Buff.readInt(); // Checksum #2
+    if (!Buff.readInt(Dummy)) return false; // Checksum #2
+
+  if (!Buff.readString(Name)) return false;
 
-  Name = Buff.readString();
   if (Format == GCOV::GCNO_402 || Format == GCOV::GCNO_404)
-    Filename = Buff.readString();
+    if (!Buff.readString(Filename)) return false;
 
   if (Format == GCOV::GCDA_402 || Format == GCOV::GCDA_404) {
-    Buff.readArcTag();
-    uint32_t Count = Buff.readInt() / 2;
-    for (unsigned i = 0, e = Count; i != e; ++i) {
-      Blocks[i]->addCount(Buff.readInt64());
+    if (!Buff.readArcTag()) {
+      errs() << "Arc tag not found.\n";
+      return false;
+    }
+    uint32_t Count;
+    if (!Buff.readInt(Count)) return false;
+    Count /= 2;
+
+    // This for loop adds the counts for each block. A second nested loop is
+    // required to combine the edge counts that are contained in the GCDA file.
+    for (uint32_t Line = 0; Count > 0; ++Line) {
+      if (Line >= Blocks.size()) {
+        errs() << "Unexpected number of edges.\n";
+        return false;
+      }
+      GCOVBlock &Block = *Blocks[Line];
+      for (size_t Edge = 0, End = Block.getNumEdges(); Edge < End; ++Edge) {
+        if (Count == 0) {
+          errs() << "Unexpected number of edges.\n";
+          return false;
+        }
+        uint64_t ArcCount;
+        if (!Buff.readInt64(ArcCount)) return false;
+        Block.addCount(ArcCount);
+        --Count;
+      }
     }
     return true;
   }
 
-  LineNumber = Buff.readInt();
+  if (!Buff.readInt(LineNumber)) return false;
 
   // read blocks.
-  bool BlockTagFound = Buff.readBlockTag();
-  (void)BlockTagFound;
-  assert(BlockTagFound && "Block Tag not found!");
-  uint32_t BlockCount = Buff.readInt();
-  for (int i = 0, e = BlockCount; i != e; ++i) {
-    Buff.readInt(); // Block flags;
-    Blocks.push_back(new GCOVBlock(i));
+  if (!Buff.readBlockTag()) {
+    errs() << "Block tag not found.\n";
+    return false;
+  }
+  uint32_t BlockCount;
+  if (!Buff.readInt(BlockCount)) return false;
+  for (uint32_t i = 0, e = BlockCount; i != e; ++i) {
+    if (!Buff.readInt(Dummy)) return false; // Block flags;
+    Blocks.push_back(new GCOVBlock(*this, i));
   }
 
   // read edges.
   while (Buff.readEdgeTag()) {
-    uint32_t EdgeCount = (Buff.readInt() - 1) / 2;
-    uint32_t BlockNo = Buff.readInt();
-    assert(BlockNo < BlockCount && "Unexpected Block number!");
-    for (int i = 0, e = EdgeCount; i != e; ++i) {
-      Blocks[BlockNo]->addEdge(Buff.readInt());
-      Buff.readInt(); // Edge flag
+    uint32_t EdgeCount;
+    if (!Buff.readInt(EdgeCount)) return false;
+    EdgeCount = (EdgeCount - 1) / 2;
+    uint32_t BlockNo;
+    if (!Buff.readInt(BlockNo)) return false;
+    if (BlockNo >= BlockCount) {
+      errs() << "Unexpected block number.\n";
+      return false;
+    }
+    for (uint32_t i = 0, e = EdgeCount; i != e; ++i) {
+      uint32_t Dst;
+      if (!Buff.readInt(Dst)) return false;
+      Blocks[BlockNo]->addEdge(Dst);
+      if (!Buff.readInt(Dummy)) return false; // Edge flag
     }
   }
 
   // read line table.
   while (Buff.readLineTag()) {
-    uint32_t LineTableLength = Buff.readInt();
-    uint32_t Size = Buff.getCursor() + LineTableLength*4;
-    uint32_t BlockNo = Buff.readInt();
-    assert(BlockNo < BlockCount && "Unexpected Block number!");
+    uint32_t LineTableLength;
+    if (!Buff.readInt(LineTableLength)) return false;
+    uint32_t EndPos = Buff.getCursor() + LineTableLength*4;
+    uint32_t BlockNo;
+    if (!Buff.readInt(BlockNo)) return false;
+    if (BlockNo >= BlockCount) {
+      errs() << "Unexpected block number.\n";
+      return false;
+    }
     GCOVBlock *Block = Blocks[BlockNo];
-    Buff.readInt(); // flag
-    while (Buff.getCursor() != (Size - 4)) {
-      StringRef Filename = Buff.readString();
-      if (Buff.getCursor() == (Size - 4)) break;
-      while (uint32_t L = Buff.readInt())
-        Block->addLine(Filename, L);
+    if (!Buff.readInt(Dummy)) return false; // flag
+    while (Buff.getCursor() != (EndPos - 4)) {
+      StringRef F;
+      if (!Buff.readString(F)) return false;
+      if (F != Filename) {
+        errs() << "Multiple sources for a single basic block.\n";
+        return false;
+      }
+      if (Buff.getCursor() == (EndPos - 4)) break;
+      while (true) {
+        uint32_t Line;
+        if (!Buff.readInt(Line)) return false;
+        if (!Line) break;
+        Block->addLine(Line);
+      }
     }
-    Buff.readInt(); // flag
+    if (!Buff.readInt(Dummy)) return false; // flag
   }
   return true;
 }
 
-/// dump - Dump GCOVFunction content on standard out for debugging purposes.
+/// dump - Dump GCOVFunction content to dbgs() for debugging purposes.
 void GCOVFunction::dump() {
-  outs() <<  "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n";
+  dbgs() <<  "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n";
   for (SmallVectorImpl<GCOVBlock *>::iterator I = Blocks.begin(),
          E = Blocks.end(); I != E; ++I)
     (*I)->dump();
@@ -174,110 +242,73 @@ void GCOVFunction::collectLineCounts(FileInfo &FI) {
 /// ~GCOVBlock - Delete GCOVBlock and its content.
 GCOVBlock::~GCOVBlock() {
   Edges.clear();
-  DeleteContainerSeconds(Lines);
-}
-
-void GCOVBlock::addLine(StringRef Filename, uint32_t LineNo) {
-  GCOVLines *&LinesForFile = Lines[Filename];
-  if (!LinesForFile)
-    LinesForFile = new GCOVLines();
-  LinesForFile->add(LineNo);
+  Lines.clear();
 }
 
 /// collectLineCounts - Collect line counts. This must be used after
 /// reading .gcno and .gcda files.
 void GCOVBlock::collectLineCounts(FileInfo &FI) {
-  for (StringMap<GCOVLines *>::iterator I = Lines.begin(),
+  for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
          E = Lines.end(); I != E; ++I)
-    I->second->collectLineCounts(FI, I->first(), Counter);
+    FI.addLineCount(Parent.getFilename(), *I, Counter);
 }
 
-/// dump - Dump GCOVBlock content on standard out for debugging purposes.
+/// dump - Dump GCOVBlock content to dbgs() for debugging purposes.
 void GCOVBlock::dump() {
-  outs() << "Block : " << Number << " Counter : " << Counter << "\n";
+  dbgs() << "Block : " << Number << " Counter : " << Counter << "\n";
   if (!Edges.empty()) {
-    outs() << "\tEdges : ";
+    dbgs() << "\tEdges : ";
     for (SmallVectorImpl<uint32_t>::iterator I = Edges.begin(), E = Edges.end();
          I != E; ++I)
-      outs() << (*I) << ",";
-    outs() << "\n";
+      dbgs() << (*I) << ",";
+    dbgs() << "\n";
   }
   if (!Lines.empty()) {
-    outs() << "\tLines : ";
-    for (StringMap<GCOVLines *>::iterator LI = Lines.begin(),
-           LE = Lines.end(); LI != LE; ++LI) {
-      outs() << LI->first() << " -> ";
-      LI->second->dump();
-      outs() << "\n";
-    }
+    dbgs() << "\tLines : ";
+    for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
+           E = Lines.end(); I != E; ++I)
+      dbgs() << (*I) << ",";
+    dbgs() << "\n";
   }
 }
 
 //===----------------------------------------------------------------------===//
-// GCOVLines implementation.
-
-/// collectLineCounts - Collect line counts. This must be used after
-/// reading .gcno and .gcda files.
-void GCOVLines::collectLineCounts(FileInfo &FI, StringRef Filename, 
-                                  uint32_t Count) {
-  for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
-         E = Lines.end(); I != E; ++I)
-    FI.addLineCount(Filename, *I, Count);
-}
-
-/// dump - Dump GCOVLines content on standard out for debugging purposes.
-void GCOVLines::dump() {
-  for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
-         E = Lines.end(); I != E; ++I)
-    outs() << (*I) << ",";
-}
-
-//===----------------------------------------------------------------------===//
 // FileInfo implementation.
 
-/// addLineCount - Add line count for the given line number in a file.
-void FileInfo::addLineCount(StringRef Filename, uint32_t Line, uint32_t Count) {
-  if (LineInfo.find(Filename) == LineInfo.end()) {
-    OwningPtr<MemoryBuffer> Buff;
-    if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
-      errs() << Filename << ": " << ec.message() << "\n";
-      return;
-    }
-    StringRef AllLines = Buff.take()->getBuffer();
-    LineCounts L(AllLines.count('\n')+2);
-    L[Line-1] = Count;
-    LineInfo[Filename] = L;
-    return;
-  }
-  LineCounts &L = LineInfo[Filename];
-  L[Line-1] = Count;
-}
-
 /// print -  Print source files with collected line count information.
-void FileInfo::print() {
+void FileInfo::print(raw_fd_ostream &OS, StringRef gcnoFile,
+                     StringRef gcdaFile) {
   for (StringMap<LineCounts>::iterator I = LineInfo.begin(), E = LineInfo.end();
        I != E; ++I) {
     StringRef Filename = I->first();
-    outs() << Filename << "\n";
+    OS << "        -:    0:Source:" << Filename << "\n";
+    OS << "        -:    0:Graph:" << gcnoFile << "\n";
+    OS << "        -:    0:Data:" << gcdaFile << "\n";
+    OS << "        -:    0:Runs:" << RunCount << "\n";
+    OS << "        -:    0:Programs:" << ProgramCount << "\n";
     LineCounts &L = LineInfo[Filename];
     OwningPtr<MemoryBuffer> Buff;
     if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
       errs() << Filename << ": " << ec.message() << "\n";
       return;
     }
-    StringRef AllLines = Buff.take()->getBuffer();
-    for (unsigned i = 0, e = L.size(); i != e; ++i) {
-      if (L[i])
-        outs() << L[i] << ":\t";
-      else
-        outs() << " :\t";
+    StringRef AllLines = Buff->getBuffer();
+    uint32_t i = 0;
+    while (!AllLines.empty()) {
+      if (L.find(i) != L.end()) {
+        if (L[i] == 0)
+          OS << "    #####:";
+        else
+          OS << format("%9" PRIu64 ":", L[i]);
+      } else {
+        OS << "        -:";
+      }
       std::pair<StringRef, StringRef> P = AllLines.split('\n');
       if (AllLines != P.first)
-        outs() << P.first;
-      outs() << "\n";
+        OS << format("%5u:", i+1) << P.first;
+      OS << "\n";
       AllLines = P.second;
+      ++i;
     }
   }
 }
-
-
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 6d547f3..da3b02a 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -229,14 +229,14 @@ void GlobalAlias::setAliasee(Constant *Aliasee) {
   setOperand(0, Aliasee);
 }
 
-const GlobalValue *GlobalAlias::getAliasedGlobal() const {
-  const Constant *C = getAliasee();
+GlobalValue *GlobalAlias::getAliasedGlobal() {
+  Constant *C = getAliasee();
   if (C == 0) return 0;
   
-  if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
     return GV;
 
-  const ConstantExpr *CE = cast<ConstantExpr>(C);
+  ConstantExpr *CE = cast<ConstantExpr>(C);
   assert((CE->getOpcode() == Instruction::BitCast || 
           CE->getOpcode() == Instruction::GetElementPtr) &&
          "Unsupported aliasee");
@@ -244,18 +244,18 @@ const GlobalValue *GlobalAlias::getAliasedGlobal() const {
   return cast<GlobalValue>(CE->getOperand(0));
 }
 
-const GlobalValue *GlobalAlias::resolveAliasedGlobal(bool stopOnWeak) const {
-  SmallPtrSet<const GlobalValue*, 3> Visited;
+GlobalValue *GlobalAlias::resolveAliasedGlobal(bool stopOnWeak) {
+  SmallPtrSet<GlobalValue*, 3> Visited;
 
   // Check if we need to stop early.
   if (stopOnWeak && mayBeOverridden())
     return this;
 
-  const GlobalValue *GV = getAliasedGlobal();
+  GlobalValue *GV = getAliasedGlobal();
   Visited.insert(GV);
 
   // Iterate over aliasing chain, stopping on weak alias if necessary.
-  while (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) {
+  while (GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) {
     if (stopOnWeak && GA->mayBeOverridden())
       break;
 
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 2b5a0b3..a7773c4 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -223,18 +223,19 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
   case GetElementPtr: return "getelementptr";
 
   // Convert instructions...
-  case Trunc:     return "trunc";
-  case ZExt:      return "zext";
-  case SExt:      return "sext";
-  case FPTrunc:   return "fptrunc";
-  case FPExt:     return "fpext";
-  case FPToUI:    return "fptoui";
-  case FPToSI:    return "fptosi";
-  case UIToFP:    return "uitofp";
-  case SIToFP:    return "sitofp";
-  case IntToPtr:  return "inttoptr";
-  case PtrToInt:  return "ptrtoint";
-  case BitCast:   return "bitcast";
+  case Trunc:         return "trunc";
+  case ZExt:          return "zext";
+  case SExt:          return "sext";
+  case FPTrunc:       return "fptrunc";
+  case FPExt:         return "fpext";
+  case FPToUI:        return "fptoui";
+  case FPToSI:        return "fptosi";
+  case UIToFP:        return "uitofp";
+  case SIToFP:        return "sitofp";
+  case IntToPtr:      return "inttoptr";
+  case PtrToInt:      return "ptrtoint";
+  case BitCast:       return "bitcast";
+  case AddrSpaceCast: return "addrspacecast";
 
   // Other instructions...
   case ICmp:           return "icmp";
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 205cb43..8a6b77b 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -2095,7 +2095,9 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode,
     case Instruction::SIToFP:
     case Instruction::FPToUI:
     case Instruction::FPToSI:
-      return false; // These always modify bits
+    case Instruction::AddrSpaceCast:
+      // TODO: Target informations may give a more accurate answer here.
+      return false;
     case Instruction::BitCast:
       return true;  // BitCast never modifies bits.
     case Instruction::PtrToInt:
@@ -2137,44 +2139,46 @@ unsigned CastInst::isEliminableCastPair(
   // ZEXT          <       Integral   Unsigned     Integer      Any
   // SEXT          <       Integral    Signed      Integer      Any
   // FPTOUI       n/a      FloatPt      n/a        Integral   Unsigned
-  // FPTOSI       n/a      FloatPt      n/a        Integral    Signed 
-  // UITOFP       n/a      Integral   Unsigned     FloatPt      n/a   
-  // SITOFP       n/a      Integral    Signed      FloatPt      n/a   
-  // FPTRUNC       >       FloatPt      n/a        FloatPt      n/a   
-  // FPEXT         <       FloatPt      n/a        FloatPt      n/a   
+  // FPTOSI       n/a      FloatPt      n/a        Integral    Signed
+  // UITOFP       n/a      Integral   Unsigned     FloatPt      n/a
+  // SITOFP       n/a      Integral    Signed      FloatPt      n/a
+  // FPTRUNC       >       FloatPt      n/a        FloatPt      n/a
+  // FPEXT         <       FloatPt      n/a        FloatPt      n/a
   // PTRTOINT     n/a      Pointer      n/a        Integral   Unsigned
   // INTTOPTR     n/a      Integral   Unsigned     Pointer      n/a
-  // BITCAST       =       FirstClass   n/a       FirstClass    n/a   
+  // BITCAST       =       FirstClass   n/a       FirstClass    n/a
+  // ADDRSPCST    n/a      Pointer      n/a        Pointer      n/a
   //
   // NOTE: some transforms are safe, but we consider them to be non-profitable.
   // For example, we could merge "fptoui double to i32" + "zext i32 to i64",
   // into "fptoui double to i64", but this loses information about the range
-  // of the produced value (we no longer know the top-part is all zeros). 
+  // of the produced value (we no longer know the top-part is all zeros).
   // Further this conversion is often much more expensive for typical hardware,
-  // and causes issues when building libgcc.  We disallow fptosi+sext for the 
+  // and causes issues when building libgcc.  We disallow fptosi+sext for the
   // same reason.
-  const unsigned numCastOps = 
+  const unsigned numCastOps =
     Instruction::CastOpsEnd - Instruction::CastOpsBegin;
   static const uint8_t CastResults[numCastOps][numCastOps] = {
-    // T        F  F  U  S  F  F  P  I  B   -+
-    // R  Z  S  P  P  I  I  T  P  2  N  T    |
-    // U  E  E  2  2  2  2  R  E  I  T  C    +- secondOp
-    // N  X  X  U  S  F  F  N  X  N  2  V    |
-    // C  T  T  I  I  P  P  C  T  T  P  T   -+
-    {  1, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // Trunc      -+
-    {  8, 1, 9,99,99, 2, 0,99,99,99, 2, 3 }, // ZExt        |
-    {  8, 0, 1,99,99, 0, 2,99,99,99, 0, 3 }, // SExt        |
-    {  0, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // FPToUI      |
-    {  0, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // FPToSI      |
-    { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4 }, // UIToFP      +- firstOp
-    { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4 }, // SIToFP      |
-    { 99,99,99, 0, 0,99,99, 1, 0,99,99, 4 }, // FPTrunc     |
-    { 99,99,99, 2, 2,99,99,10, 2,99,99, 4 }, // FPExt       |
-    {  1, 0, 0,99,99, 0, 0,99,99,99, 7, 3 }, // PtrToInt    |
-    { 99,99,99,99,99,99,99,99,99,13,99,12 }, // IntToPtr    |
-    {  5, 5, 5, 6, 6, 5, 5, 6, 6,11, 5, 1 }, // BitCast    -+
+    // T        F  F  U  S  F  F  P  I  B  A  -+
+    // R  Z  S  P  P  I  I  T  P  2  N  T  S   |
+    // U  E  E  2  2  2  2  R  E  I  T  C  C   +- secondOp
+    // N  X  X  U  S  F  F  N  X  N  2  V  V   |
+    // C  T  T  I  I  P  P  C  T  T  P  T  T  -+
+    {  1, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // Trunc         -+
+    {  8, 1, 9,99,99, 2, 0,99,99,99, 2, 3, 0}, // ZExt           |
+    {  8, 0, 1,99,99, 0, 2,99,99,99, 0, 3, 0}, // SExt           |
+    {  0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToUI         |
+    {  0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToSI         |
+    { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP         +- firstOp
+    { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP         |
+    { 99,99,99, 0, 0,99,99, 1, 0,99,99, 4, 0}, // FPTrunc        |
+    { 99,99,99, 2, 2,99,99,10, 2,99,99, 4, 0}, // FPExt          |
+    {  1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt       |
+    { 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr       |
+    {  5, 5, 5, 6, 6, 5, 5, 6, 6,16, 5, 1,14}, // BitCast        |
+    {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
   };
-  
+
   // If either of the casts are a bitcast from scalar to vector, disallow the
   // merging. However, bitcast of A->B->A are allowed.
   bool isFirstBitcast  = (firstOp == Instruction::BitCast);
@@ -2191,47 +2195,50 @@ unsigned CastInst::isEliminableCastPair(
                             [secondOp-Instruction::CastOpsBegin];
   switch (ElimCase) {
     case 0: 
-      // categorically disallowed
+      // Categorically disallowed.
       return 0;
     case 1: 
-      // allowed, use first cast's opcode
+      // Allowed, use first cast's opcode.
       return firstOp;
     case 2: 
-      // allowed, use second cast's opcode
+      // Allowed, use second cast's opcode.
       return secondOp;
     case 3: 
-      // no-op cast in second op implies firstOp as long as the DestTy 
+      // No-op cast in second op implies firstOp as long as the DestTy
       // is integer and we are not converting between a vector and a
       // non vector type.
       if (!SrcTy->isVectorTy() && DstTy->isIntegerTy())
         return firstOp;
       return 0;
     case 4:
-      // no-op cast in second op implies firstOp as long as the DestTy
+      // No-op cast in second op implies firstOp as long as the DestTy
       // is floating point.
       if (DstTy->isFloatingPointTy())
         return firstOp;
       return 0;
     case 5: 
-      // no-op cast in first op implies secondOp as long as the SrcTy
+      // No-op cast in first op implies secondOp as long as the SrcTy
       // is an integer.
       if (SrcTy->isIntegerTy())
         return secondOp;
       return 0;
     case 6:
-      // no-op cast in first op implies secondOp as long as the SrcTy
+      // No-op cast in first op implies secondOp as long as the SrcTy
       // is a floating point.
       if (SrcTy->isFloatingPointTy())
         return secondOp;
       return 0;
     case 7: {
+      // Cannot simplify if address spaces are different!
+      if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace())
+        return 0;
+
       unsigned MidSize = MidTy->getScalarSizeInBits();
-      // Check the address spaces first. If we know they are in the same address
-      // space, the pointer sizes must be the same so we can still fold this
-      // without knowing the actual sizes as long we know that the intermediate
-      // pointer is the largest possible pointer size.
-      if (MidSize == 64 &&
-          SrcTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace())
+      // We can still fold this without knowing the actual sizes as long we
+      // know that the intermediate pointer is the largest possible
+      // pointer size.
+      // FIXME: Is this always true?
+      if (MidSize == 64)
         return Instruction::BitCast;
 
       // ptrtoint, inttoptr -> bitcast (ptr -> ptr) if int size is >= ptr size.
@@ -2254,7 +2261,8 @@ unsigned CastInst::isEliminableCastPair(
         return firstOp;
       return secondOp;
     }
-    case 9: // zext, sext -> zext, because sext can't sign extend after zext
+    case 9:
+      // zext, sext -> zext, because sext can't sign extend after zext
       return Instruction::ZExt;
     case 10:
       // fpext followed by ftrunc is allowed if the bit size returned to is
@@ -2263,46 +2271,6 @@ unsigned CastInst::isEliminableCastPair(
         return Instruction::BitCast;
       return 0; // If the types are not the same we can't eliminate it.
     case 11: {
-      // bitcast followed by ptrtoint is allowed as long as the bitcast is a
-      // pointer to pointer cast, and the pointers are the same size.
-      PointerType *SrcPtrTy = dyn_cast<PointerType>(SrcTy);
-      PointerType *MidPtrTy = dyn_cast<PointerType>(MidTy);
-      if (!SrcPtrTy || !MidPtrTy)
-        return 0;
-
-      // If the address spaces are the same, we know they are the same size
-      // without size information
-      if (SrcPtrTy->getAddressSpace() == MidPtrTy->getAddressSpace())
-        return secondOp;
-
-      if (!SrcIntPtrTy || !MidIntPtrTy)
-        return 0;
-
-      if (SrcIntPtrTy->getScalarSizeInBits() ==
-          MidIntPtrTy->getScalarSizeInBits())
-        return secondOp;
-
-      return 0;
-    }
-    case 12: {
-      // inttoptr, bitcast -> inttoptr if bitcast is a ptr to ptr cast
-      // and the ptrs are to address spaces of the same size
-      PointerType *MidPtrTy = dyn_cast<PointerType>(MidTy);
-      PointerType *DstPtrTy = dyn_cast<PointerType>(DstTy);
-      if (!MidPtrTy || !DstPtrTy)
-        return 0;
-
-      if (MidPtrTy->getAddressSpace() == DstPtrTy->getAddressSpace())
-        return firstOp;
-
-      if (MidIntPtrTy &&
-          DstIntPtrTy &&
-          MidIntPtrTy->getScalarSizeInBits() ==
-          DstIntPtrTy->getScalarSizeInBits())
-        return firstOp;
-      return 0;
-    }
-    case 13: {
       // inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize
       if (!MidIntPtrTy)
         return 0;
@@ -2313,8 +2281,65 @@ unsigned CastInst::isEliminableCastPair(
         return Instruction::BitCast;
       return 0;
     }
+    case 12: {
+      // addrspacecast, addrspacecast -> bitcast,       if SrcAS == DstAS
+      // addrspacecast, addrspacecast -> addrspacecast, if SrcAS != DstAS
+      if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace())
+        return Instruction::AddrSpaceCast;
+      return Instruction::BitCast;
+    }
+    case 13:
+      // FIXME: this state can be merged with (1), but the following assert
+      // is useful to check the correcteness of the sequence due to semantic
+      // change of bitcast.
+      assert(
+        SrcTy->isPtrOrPtrVectorTy() &&
+        MidTy->isPtrOrPtrVectorTy() &&
+        DstTy->isPtrOrPtrVectorTy() &&
+        SrcTy->getPointerAddressSpace() != MidTy->getPointerAddressSpace() &&
+        MidTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace() &&
+        "Illegal addrspacecast, bitcast sequence!");
+      // Allowed, use first cast's opcode
+      return firstOp;
+    case 14:
+      // FIXME: this state can be merged with (2), but the following assert
+      // is useful to check the correcteness of the sequence due to semantic
+      // change of bitcast.
+      assert(
+        SrcTy->isPtrOrPtrVectorTy() &&
+        MidTy->isPtrOrPtrVectorTy() &&
+        DstTy->isPtrOrPtrVectorTy() &&
+        SrcTy->getPointerAddressSpace() == MidTy->getPointerAddressSpace() &&
+        MidTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace() &&
+        "Illegal bitcast, addrspacecast sequence!");
+      // Allowed, use second cast's opcode
+      return secondOp;
+    case 15:
+      // FIXME: this state can be merged with (1), but the following assert
+      // is useful to check the correcteness of the sequence due to semantic
+      // change of bitcast.
+      assert(
+        SrcTy->isIntOrIntVectorTy() &&
+        MidTy->isPtrOrPtrVectorTy() &&
+        DstTy->isPtrOrPtrVectorTy() &&
+        MidTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace() &&
+        "Illegal inttoptr, bitcast sequence!");
+      // Allowed, use first cast's opcode
+      return firstOp;
+    case 16:
+      // FIXME: this state can be merged with (2), but the following assert
+      // is useful to check the correcteness of the sequence due to semantic
+      // change of bitcast.
+      assert(
+        SrcTy->isPtrOrPtrVectorTy() &&
+        MidTy->isPtrOrPtrVectorTy() &&
+        DstTy->isIntOrIntVectorTy() &&
+        SrcTy->getPointerAddressSpace() == MidTy->getPointerAddressSpace() &&
+        "Illegal bitcast, ptrtoint sequence!");
+      // Allowed, use second cast's opcode
+      return secondOp;
     case 99: 
-      // cast combination can't happen (error in input). This is for all cases
+      // Cast combination can't happen (error in input). This is for all cases
       // where the MidTy is not the same for the two cast instructions.
       llvm_unreachable("Invalid Cast Combination");
     default:
@@ -2327,19 +2352,20 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
   assert(castIsValid(op, S, Ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
   switch (op) {
-    case Trunc:    return new TruncInst    (S, Ty, Name, InsertBefore);
-    case ZExt:     return new ZExtInst     (S, Ty, Name, InsertBefore);
-    case SExt:     return new SExtInst     (S, Ty, Name, InsertBefore);
-    case FPTrunc:  return new FPTruncInst  (S, Ty, Name, InsertBefore);
-    case FPExt:    return new FPExtInst    (S, Ty, Name, InsertBefore);
-    case UIToFP:   return new UIToFPInst   (S, Ty, Name, InsertBefore);
-    case SIToFP:   return new SIToFPInst   (S, Ty, Name, InsertBefore);
-    case FPToUI:   return new FPToUIInst   (S, Ty, Name, InsertBefore);
-    case FPToSI:   return new FPToSIInst   (S, Ty, Name, InsertBefore);
-    case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore);
-    case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore);
-    case BitCast:  return new BitCastInst  (S, Ty, Name, InsertBefore);
-    default: llvm_unreachable("Invalid opcode provided");
+  case Trunc:         return new TruncInst         (S, Ty, Name, InsertBefore);
+  case ZExt:          return new ZExtInst          (S, Ty, Name, InsertBefore);
+  case SExt:          return new SExtInst          (S, Ty, Name, InsertBefore);
+  case FPTrunc:       return new FPTruncInst       (S, Ty, Name, InsertBefore);
+  case FPExt:         return new FPExtInst         (S, Ty, Name, InsertBefore);
+  case UIToFP:        return new UIToFPInst        (S, Ty, Name, InsertBefore);
+  case SIToFP:        return new SIToFPInst        (S, Ty, Name, InsertBefore);
+  case FPToUI:        return new FPToUIInst        (S, Ty, Name, InsertBefore);
+  case FPToSI:        return new FPToSIInst        (S, Ty, Name, InsertBefore);
+  case PtrToInt:      return new PtrToIntInst      (S, Ty, Name, InsertBefore);
+  case IntToPtr:      return new IntToPtrInst      (S, Ty, Name, InsertBefore);
+  case BitCast:       return new BitCastInst       (S, Ty, Name, InsertBefore);
+  case AddrSpaceCast: return new AddrSpaceCastInst (S, Ty, Name, InsertBefore);
+  default: llvm_unreachable("Invalid opcode provided");
   }
 }
 
@@ -2348,19 +2374,20 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
   assert(castIsValid(op, S, Ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
   switch (op) {
-    case Trunc:    return new TruncInst    (S, Ty, Name, InsertAtEnd);
-    case ZExt:     return new ZExtInst     (S, Ty, Name, InsertAtEnd);
-    case SExt:     return new SExtInst     (S, Ty, Name, InsertAtEnd);
-    case FPTrunc:  return new FPTruncInst  (S, Ty, Name, InsertAtEnd);
-    case FPExt:    return new FPExtInst    (S, Ty, Name, InsertAtEnd);
-    case UIToFP:   return new UIToFPInst   (S, Ty, Name, InsertAtEnd);
-    case SIToFP:   return new SIToFPInst   (S, Ty, Name, InsertAtEnd);
-    case FPToUI:   return new FPToUIInst   (S, Ty, Name, InsertAtEnd);
-    case FPToSI:   return new FPToSIInst   (S, Ty, Name, InsertAtEnd);
-    case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertAtEnd);
-    case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertAtEnd);
-    case BitCast:  return new BitCastInst  (S, Ty, Name, InsertAtEnd);
-    default: llvm_unreachable("Invalid opcode provided");
+  case Trunc:         return new TruncInst         (S, Ty, Name, InsertAtEnd);
+  case ZExt:          return new ZExtInst          (S, Ty, Name, InsertAtEnd);
+  case SExt:          return new SExtInst          (S, Ty, Name, InsertAtEnd);
+  case FPTrunc:       return new FPTruncInst       (S, Ty, Name, InsertAtEnd);
+  case FPExt:         return new FPExtInst         (S, Ty, Name, InsertAtEnd);
+  case UIToFP:        return new UIToFPInst        (S, Ty, Name, InsertAtEnd);
+  case SIToFP:        return new SIToFPInst        (S, Ty, Name, InsertAtEnd);
+  case FPToUI:        return new FPToUIInst        (S, Ty, Name, InsertAtEnd);
+  case FPToSI:        return new FPToSIInst        (S, Ty, Name, InsertAtEnd);
+  case PtrToInt:      return new PtrToIntInst      (S, Ty, Name, InsertAtEnd);
+  case IntToPtr:      return new IntToPtrInst      (S, Ty, Name, InsertAtEnd);
+  case BitCast:       return new BitCastInst       (S, Ty, Name, InsertAtEnd);
+  case AddrSpaceCast: return new AddrSpaceCastInst (S, Ty, Name, InsertAtEnd);
+  default: llvm_unreachable("Invalid opcode provided");
   }
 }
 
@@ -2425,6 +2452,11 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
 
   if (Ty->isIntOrIntVectorTy())
     return Create(Instruction::PtrToInt, S, Ty, Name, InsertAtEnd);
+
+  Type *STy = S->getType();
+  if (STy->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+    return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertAtEnd);
+
   return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
 }
 
@@ -2442,6 +2474,11 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
 
   if (Ty->isIntOrIntVectorTy())
     return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore);
+
+  Type *STy = S->getType();
+  if (STy->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+    return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertBefore);
+
   return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
 }
 
@@ -2687,7 +2724,8 @@ CastInst::getCastOpcode(
     return BitCast;
   } else if (DestTy->isPointerTy()) {
     if (SrcTy->isPointerTy()) {
-      // TODO: Address space pointer sizes may not match
+      if (DestTy->getPointerAddressSpace() != SrcTy->getPointerAddressSpace())
+        return AddrSpaceCast;
       return BitCast;                               // ptr -> ptr
     } else if (SrcTy->isIntegerTy()) {
       return IntToPtr;                              // int -> ptr
@@ -2782,13 +2820,27 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
   case Instruction::BitCast:
     // BitCast implies a no-op cast of type only. No bits change.
     // However, you can't cast pointers to anything but pointers.
-    if (SrcTy->isPointerTy() != DstTy->isPointerTy())
+    if (SrcTy->isPtrOrPtrVectorTy() != DstTy->isPtrOrPtrVectorTy())
       return false;
 
-    // Now we know we're not dealing with a pointer/non-pointer mismatch. In all
-    // these cases, the cast is okay if the source and destination bit widths
-    // are identical.
-    return SrcTy->getPrimitiveSizeInBits() == DstTy->getPrimitiveSizeInBits();
+    // For non pointer cases, the cast is okay if the source and destination bit
+    // widths are identical.
+    if (!SrcTy->isPtrOrPtrVectorTy())
+      return SrcTy->getPrimitiveSizeInBits() == DstTy->getPrimitiveSizeInBits();
+
+    // If both are pointers then the address spaces must match and vector of
+    // pointers must have the same number of elements.
+    return SrcTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace() &&
+           SrcTy->isVectorTy() == DstTy->isVectorTy() &&
+           (!SrcTy->isVectorTy() ||
+            SrcTy->getVectorNumElements() == SrcTy->getVectorNumElements());
+
+  case Instruction::AddrSpaceCast:
+    return SrcTy->isPtrOrPtrVectorTy() && DstTy->isPtrOrPtrVectorTy() &&
+           SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace() &&
+           SrcTy->isVectorTy() == DstTy->isVectorTy() &&
+           (!SrcTy->isVectorTy() ||
+            SrcTy->getVectorNumElements() == SrcTy->getVectorNumElements());
   }
 }
 
@@ -2935,6 +2987,18 @@ BitCastInst::BitCastInst(
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
 }
 
+AddrSpaceCastInst::AddrSpaceCastInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, AddrSpaceCast, S, Name, InsertBefore) {
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal AddrSpaceCast");
+}
+
+AddrSpaceCastInst::AddrSpaceCastInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, AddrSpaceCast, S, Name, InsertAtEnd) {
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal AddrSpaceCast");
+}
+
 //===----------------------------------------------------------------------===//
 //                               CmpInst Classes
 //===----------------------------------------------------------------------===//
@@ -3267,7 +3331,6 @@ SwitchInst::SwitchInst(const SwitchInst &SI)
     OL[i] = InOL[i];
     OL[i+1] = InOL[i+1];
   }
-  TheSubsets = SI.TheSubsets;
   SubclassOptionalData = SI.SubclassOptionalData;
 }
 
@@ -3279,16 +3342,6 @@ SwitchInst::~SwitchInst() {
 /// addCase - Add an entry to the switch instruction...
 ///
 void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
-  IntegersSubsetToBB Mapping;
-  
-  // FIXME: Currently we work with ConstantInt based cases.
-  // So inititalize IntItem container directly from ConstantInt.
-  Mapping.add(IntItem::fromConstantInt(OnVal));
-  IntegersSubset CaseRanges = Mapping.getCase();
-  addCase(CaseRanges, Dest);
-}
-
-void SwitchInst::addCase(IntegersSubset& OnVal, BasicBlock *Dest) {
   unsigned NewCaseIdx = getNumCases(); 
   unsigned OpNo = NumOperands;
   if (OpNo+2 > ReservedSpace)
@@ -3296,17 +3349,14 @@ void SwitchInst::addCase(IntegersSubset& OnVal, BasicBlock *Dest) {
   // Initialize some new operands.
   assert(OpNo+1 < ReservedSpace && "Growing didn't work!");
   NumOperands = OpNo+2;
-
-  SubsetsIt TheSubsetsIt = TheSubsets.insert(TheSubsets.end(), OnVal);
-  
-  CaseIt Case(this, NewCaseIdx, TheSubsetsIt);
-  Case.updateCaseValueOperand(OnVal);
+  CaseIt Case(this, NewCaseIdx);
+  Case.setValue(OnVal);
   Case.setSuccessor(Dest);
 }
 
 /// removeCase - This method removes the specified case and its successor
 /// from the switch instruction.
-void SwitchInst::removeCase(CaseIt& i) {
+void SwitchInst::removeCase(CaseIt i) {
   unsigned idx = i.getCaseIndex();
   
   assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
@@ -3323,16 +3373,6 @@ void SwitchInst::removeCase(CaseIt& i) {
   // Nuke the last value.
   OL[NumOps-2].set(0);
   OL[NumOps-2+1].set(0);
-
-  // Do the same with TheCases collection:
-  if (i.SubsetIt != --TheSubsets.end()) {
-    *i.SubsetIt = TheSubsets.back();
-    TheSubsets.pop_back();
-  } else {
-    TheSubsets.pop_back();
-    i.SubsetIt = TheSubsets.end();
-  }
-  
   NumOperands = NumOps-2;
 }
 
@@ -3577,6 +3617,10 @@ BitCastInst *BitCastInst::clone_impl() const {
   return new BitCastInst(getOperand(0), getType());
 }
 
+AddrSpaceCastInst *AddrSpaceCastInst::clone_impl() const {
+  return new AddrSpaceCastInst(getOperand(0), getType());
+}
+
 CallInst *CallInst::clone_impl() const {
   return  new(getNumOperands()) CallInst(*this);
 }
diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index 0c659b8..407b985 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h
@@ -355,6 +355,11 @@ public:
   typedef DenseMap<const Function*, unsigned> IntrinsicIDCacheTy;
   IntrinsicIDCacheTy IntrinsicIDCache;
 
+  /// \brief Mapping from a function to its prefix data, which is stored as the
+  /// operand of an unparented ReturnInst so that the prefix data has a Use.
+  typedef DenseMap<const Function *, ReturnInst *> PrefixDataMapTy;
+  PrefixDataMapTy PrefixDataMap;
+
   int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx);
   int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx);
   
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
new file mode 100644
index 0000000..a431d82
--- /dev/null
+++ b/lib/IR/LegacyPassManager.cpp
@@ -0,0 +1,1920 @@
+//===- LegacyPassManager.cpp - LLVM Pass Infrastructure Implementation ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the legacy LLVM Pass Manager infrastructure.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManagers.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/PassNameParser.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+using namespace llvm::legacy;
+
+// See PassManagers.h for Pass Manager infrastructure overview.
+
+//===----------------------------------------------------------------------===//
+// Pass debugging information.  Often it is useful to find out what pass is
+// running when a crash occurs in a utility.  When this library is compiled with
+// debugging on, a command line option (--debug-pass) is enabled that causes the
+// pass name to be printed before it executes.
+//
+
+namespace {
+// Different debug levels that can be enabled...
+enum PassDebugLevel {
+  Disabled, Arguments, Structure, Executions, Details
+};
+}
+
+static cl::opt<enum PassDebugLevel>
+PassDebugging("debug-pass", cl::Hidden,
+                  cl::desc("Print PassManager debugging information"),
+                  cl::values(
+  clEnumVal(Disabled  , "disable debug output"),
+  clEnumVal(Arguments , "print pass arguments to pass to 'opt'"),
+  clEnumVal(Structure , "print pass structure before run()"),
+  clEnumVal(Executions, "print pass name before it is executed"),
+  clEnumVal(Details   , "print pass details when it is executed"),
+                             clEnumValEnd));
+
+namespace {
+typedef llvm::cl::list<const llvm::PassInfo *, bool, PassNameParser>
+PassOptionList;
+}
+
+// Print IR out before/after specified passes.
+static PassOptionList
+PrintBefore("print-before",
+            llvm::cl::desc("Print IR before specified passes"),
+            cl::Hidden);
+
+static PassOptionList
+PrintAfter("print-after",
+           llvm::cl::desc("Print IR after specified passes"),
+           cl::Hidden);
+
+static cl::opt<bool>
+PrintBeforeAll("print-before-all",
+               llvm::cl::desc("Print IR before each pass"),
+               cl::init(false));
+static cl::opt<bool>
+PrintAfterAll("print-after-all",
+              llvm::cl::desc("Print IR after each pass"),
+              cl::init(false));
+
+/// This is a helper to determine whether to print IR before or
+/// after a pass.
+
+static bool ShouldPrintBeforeOrAfterPass(const PassInfo *PI,
+                                         PassOptionList &PassesToPrint) {
+  for (unsigned i = 0, ie = PassesToPrint.size(); i < ie; ++i) {
+    const llvm::PassInfo *PassInf = PassesToPrint[i];
+    if (PassInf)
+      if (PassInf->getPassArgument() == PI->getPassArgument()) {
+        return true;
+      }
+  }
+  return false;
+}
+
+/// This is a utility to check whether a pass should have IR dumped
+/// before it.
+static bool ShouldPrintBeforePass(const PassInfo *PI) {
+  return PrintBeforeAll || ShouldPrintBeforeOrAfterPass(PI, PrintBefore);
+}
+
+/// This is a utility to check whether a pass should have IR dumped
+/// after it.
+static bool ShouldPrintAfterPass(const PassInfo *PI) {
+  return PrintAfterAll || ShouldPrintBeforeOrAfterPass(PI, PrintAfter);
+}
+
+/// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions
+/// or higher is specified.
+bool PMDataManager::isPassDebuggingExecutionsOrMore() const {
+  return PassDebugging >= Executions;
+}
+
+
+
+
+void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
+  if (V == 0 && M == 0)
+    OS << "Releasing pass '";
+  else
+    OS << "Running pass '";
+
+  OS << P->getPassName() << "'";
+
+  if (M) {
+    OS << " on module '" << M->getModuleIdentifier() << "'.\n";
+    return;
+  }
+  if (V == 0) {
+    OS << '\n';
+    return;
+  }
+
+  OS << " on ";
+  if (isa<Function>(V))
+    OS << "function";
+  else if (isa<BasicBlock>(V))
+    OS << "basic block";
+  else
+    OS << "value";
+
+  OS << " '";
+  WriteAsOperand(OS, V, /*PrintTy=*/false, M);
+  OS << "'\n";
+}
+
+
+namespace {
+//===----------------------------------------------------------------------===//
+// BBPassManager
+//
+/// BBPassManager manages BasicBlockPass. It batches all the
+/// pass together and sequence them to process one basic block before
+/// processing next basic block.
+class BBPassManager : public PMDataManager, public FunctionPass {
+
+public:
+  static char ID;
+  explicit BBPassManager()
+    : PMDataManager(), FunctionPass(ID) {}
+
+  /// Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the function, and if so, return true.
+  bool runOnFunction(Function &F);
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  bool doInitialization(Module &M);
+  bool doInitialization(Function &F);
+  bool doFinalization(Module &M);
+  bool doFinalization(Function &F);
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+
+  virtual const char *getPassName() const {
+    return "BasicBlock Pass Manager";
+  }
+
+  // Print passes managed by this manager
+  void dumpPassStructure(unsigned Offset) {
+    llvm::dbgs().indent(Offset*2) << "BasicBlockPass Manager\n";
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      BasicBlockPass *BP = getContainedPass(Index);
+      BP->dumpPassStructure(Offset + 1);
+      dumpLastUses(BP, Offset+1);
+    }
+  }
+
+  BasicBlockPass *getContainedPass(unsigned N) {
+    assert(N < PassVector.size() && "Pass number out of range!");
+    BasicBlockPass *BP = static_cast<BasicBlockPass *>(PassVector[N]);
+    return BP;
+  }
+
+  virtual PassManagerType getPassManagerType() const {
+    return PMT_BasicBlockPassManager;
+  }
+};
+
+char BBPassManager::ID = 0;
+} // End anonymous namespace
+
+namespace llvm {
+namespace legacy {
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl
+//
+/// FunctionPassManagerImpl manages FPPassManagers
+class FunctionPassManagerImpl : public Pass,
+                                public PMDataManager,
+                                public PMTopLevelManager {
+  virtual void anchor();
+private:
+  bool wasRun;
+public:
+  static char ID;
+  explicit FunctionPassManagerImpl() :
+    Pass(PT_PassManager, ID), PMDataManager(),
+    PMTopLevelManager(new FPPassManager()), wasRun(false) {}
+
+  /// add - Add a pass to the queue of passes to run.  This passes ownership of
+  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+  /// will be destroyed as well, so there is no need to delete the pass.  This
+  /// implies that all passes MUST be allocated with 'new'.
+  void add(Pass *P) {
+    schedulePass(P);
+  }
+
+  /// createPrinterPass - Get a function printer pass.
+  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+    return createPrintFunctionPass(Banner, &O);
+  }
+
+  // Prepare for running an on the fly pass, freeing memory if needed
+  // from a previous run.
+  void releaseMemoryOnTheFly();
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool run(Function &F);
+
+  /// doInitialization - Run all of the initializers for the function passes.
+  ///
+  bool doInitialization(Module &M);
+
+  /// doFinalization - Run all of the finalizers for the function passes.
+  ///
+  bool doFinalization(Module &M);
+
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+  virtual PassManagerType getTopLevelPassManagerType() {
+    return PMT_FunctionPassManager;
+  }
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  FPPassManager *getContainedManager(unsigned N) {
+    assert(N < PassManagers.size() && "Pass number out of range!");
+    FPPassManager *FP = static_cast<FPPassManager *>(PassManagers[N]);
+    return FP;
+  }
+};
+
+void FunctionPassManagerImpl::anchor() {}
+
+char FunctionPassManagerImpl::ID = 0;
+} // End of legacy namespace
+} // End of llvm namespace
+
+namespace {
+//===----------------------------------------------------------------------===//
+// MPPassManager
+//
+/// MPPassManager manages ModulePasses and function pass managers.
+/// It batches all Module passes and function pass managers together and
+/// sequences them to process one module.
+class MPPassManager : public Pass, public PMDataManager {
+public:
+  static char ID;
+  explicit MPPassManager() :
+    Pass(PT_PassManager, ID), PMDataManager() { }
+
+  // Delete on the fly managers.
+  virtual ~MPPassManager() {
+    for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+           I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+         I != E; ++I) {
+      FunctionPassManagerImpl *FPP = I->second;
+      delete FPP;
+    }
+  }
+
+  /// createPrinterPass - Get a module printer pass.
+  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+    return createPrintModulePass(&O, false, Banner);
+  }
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool runOnModule(Module &M);
+
+  using llvm::Pass::doInitialization;
+  using llvm::Pass::doFinalization;
+
+  /// doInitialization - Run all of the initializers for the module passes.
+  ///
+  bool doInitialization();
+
+  /// doFinalization - Run all of the finalizers for the module passes.
+  ///
+  bool doFinalization();
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  /// Add RequiredPass into list of lower level passes required by pass P.
+  /// RequiredPass is run on the fly by Pass Manager when P requests it
+  /// through getAnalysis interface.
+  virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
+
+  /// Return function pass corresponding to PassInfo PI, that is
+  /// required by module pass MP. Instantiate analysis pass, by using
+  /// its runOnFunction() for function F.
+  virtual Pass* getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F);
+
+  virtual const char *getPassName() const {
+    return "Module Pass Manager";
+  }
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+
+  // Print passes managed by this manager
+  void dumpPassStructure(unsigned Offset) {
+    llvm::dbgs().indent(Offset*2) << "ModulePass Manager\n";
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      ModulePass *MP = getContainedPass(Index);
+      MP->dumpPassStructure(Offset + 1);
+      std::map<Pass *, FunctionPassManagerImpl *>::const_iterator I =
+        OnTheFlyManagers.find(MP);
+      if (I != OnTheFlyManagers.end())
+        I->second->dumpPassStructure(Offset + 2);
+      dumpLastUses(MP, Offset+1);
+    }
+  }
+
+  ModulePass *getContainedPass(unsigned N) {
+    assert(N < PassVector.size() && "Pass number out of range!");
+    return static_cast<ModulePass *>(PassVector[N]);
+  }
+
+  virtual PassManagerType getPassManagerType() const {
+    return PMT_ModulePassManager;
+  }
+
+ private:
+  /// Collection of on the fly FPPassManagers. These managers manage
+  /// function passes that are required by module passes.
+  std::map<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
+};
+
+char MPPassManager::ID = 0;
+} // End anonymous namespace
+
+namespace llvm {
+namespace legacy {
+//===----------------------------------------------------------------------===//
+// PassManagerImpl
+//
+
+/// PassManagerImpl manages MPPassManagers
+class PassManagerImpl : public Pass,
+                        public PMDataManager,
+                        public PMTopLevelManager {
+  virtual void anchor();
+
+public:
+  static char ID;
+  explicit PassManagerImpl() :
+    Pass(PT_PassManager, ID), PMDataManager(),
+                              PMTopLevelManager(new MPPassManager()) {}
+
+  /// add - Add a pass to the queue of passes to run.  This passes ownership of
+  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+  /// will be destroyed as well, so there is no need to delete the pass.  This
+  /// implies that all passes MUST be allocated with 'new'.
+  void add(Pass *P) {
+    schedulePass(P);
+  }
+
+  /// createPrinterPass - Get a module printer pass.
+  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+    return createPrintModulePass(&O, false, Banner);
+  }
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool run(Module &M);
+
+  using llvm::Pass::doInitialization;
+  using llvm::Pass::doFinalization;
+
+  /// doInitialization - Run all of the initializers for the module passes.
+  ///
+  bool doInitialization();
+
+  /// doFinalization - Run all of the finalizers for the module passes.
+  ///
+  bool doFinalization();
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+  virtual PassManagerType getTopLevelPassManagerType() {
+    return PMT_ModulePassManager;
+  }
+
+  MPPassManager *getContainedManager(unsigned N) {
+    assert(N < PassManagers.size() && "Pass number out of range!");
+    MPPassManager *MP = static_cast<MPPassManager *>(PassManagers[N]);
+    return MP;
+  }
+};
+
+void PassManagerImpl::anchor() {}
+
+char PassManagerImpl::ID = 0;
+} // End of legacy namespace
+} // End of llvm namespace
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+/// TimingInfo Class - This class is used to calculate information about the
+/// amount of time each pass takes to execute.  This only happens when
+/// -time-passes is enabled on the command line.
+///
+
+static ManagedStatic<sys::SmartMutex<true> > TimingInfoMutex;
+
+class TimingInfo {
+  DenseMap<Pass*, Timer*> TimingData;
+  TimerGroup TG;
+public:
+  // Use 'create' member to get this.
+  TimingInfo() : TG("... Pass execution timing report ...") {}
+
+  // TimingDtor - Print out information about timing information
+  ~TimingInfo() {
+    // Delete all of the timers, which accumulate their info into the
+    // TimerGroup.
+    for (DenseMap<Pass*, Timer*>::iterator I = TimingData.begin(),
+         E = TimingData.end(); I != E; ++I)
+      delete I->second;
+    // TimerGroup is deleted next, printing the report.
+  }
+
+  // createTheTimeInfo - This method either initializes the TheTimeInfo pointer
+  // to a non null value (if the -time-passes option is enabled) or it leaves it
+  // null.  It may be called multiple times.
+  static void createTheTimeInfo();
+
+  /// getPassTimer - Return the timer for the specified pass if it exists.
+  Timer *getPassTimer(Pass *P) {
+    if (P->getAsPMDataManager())
+      return 0;
+
+    sys::SmartScopedLock<true> Lock(*TimingInfoMutex);
+    Timer *&T = TimingData[P];
+    if (T == 0)
+      T = new Timer(P->getPassName(), TG);
+    return T;
+  }
+};
+
+} // End of anon namespace
+
+static TimingInfo *TheTimeInfo;
+
+//===----------------------------------------------------------------------===//
+// PMTopLevelManager implementation
+
+/// Initialize top level manager. Create first pass manager.
+PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) {
+  PMDM->setTopLevelManager(this);
+  addPassManager(PMDM);
+  activeStack.push(PMDM);
+}
+
+/// Set pass P as the last user of the given analysis passes.
+void
+PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) {
+  unsigned PDepth = 0;
+  if (P->getResolver())
+    PDepth = P->getResolver()->getPMDataManager().getDepth();
+
+  for (SmallVectorImpl<Pass *>::const_iterator I = AnalysisPasses.begin(),
+         E = AnalysisPasses.end(); I != E; ++I) {
+    Pass *AP = *I;
+    LastUser[AP] = P;
+
+    if (P == AP)
+      continue;
+
+    // Update the last users of passes that are required transitive by AP.
+    AnalysisUsage *AnUsage = findAnalysisUsage(AP);
+    const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
+    SmallVector<Pass *, 12> LastUses;
+    SmallVector<Pass *, 12> LastPMUses;
+    for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
+         E = IDs.end(); I != E; ++I) {
+      Pass *AnalysisPass = findAnalysisPass(*I);
+      assert(AnalysisPass && "Expected analysis pass to exist.");
+      AnalysisResolver *AR = AnalysisPass->getResolver();
+      assert(AR && "Expected analysis resolver to exist.");
+      unsigned APDepth = AR->getPMDataManager().getDepth();
+
+      if (PDepth == APDepth)
+        LastUses.push_back(AnalysisPass);
+      else if (PDepth > APDepth)
+        LastPMUses.push_back(AnalysisPass);
+    }
+
+    setLastUser(LastUses, P);
+
+    // If this pass has a corresponding pass manager, push higher level
+    // analysis to this pass manager.
+    if (P->getResolver())
+      setLastUser(LastPMUses, P->getResolver()->getPMDataManager().getAsPass());
+
+
+    // If AP is the last user of other passes then make P last user of
+    // such passes.
+    for (DenseMap<Pass *, Pass *>::iterator LUI = LastUser.begin(),
+           LUE = LastUser.end(); LUI != LUE; ++LUI) {
+      if (LUI->second == AP)
+        // DenseMap iterator is not invalidated here because
+        // this is just updating existing entries.
+        LastUser[LUI->first] = P;
+    }
+  }
+}
+
+/// Collect passes whose last user is P
+void PMTopLevelManager::collectLastUses(SmallVectorImpl<Pass *> &LastUses,
+                                        Pass *P) {
+  DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator DMI =
+    InversedLastUser.find(P);
+  if (DMI == InversedLastUser.end())
+    return;
+
+  SmallPtrSet<Pass *, 8> &LU = DMI->second;
+  for (SmallPtrSet<Pass *, 8>::iterator I = LU.begin(),
+         E = LU.end(); I != E; ++I) {
+    LastUses.push_back(*I);
+  }
+
+}
+
+AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
+  AnalysisUsage *AnUsage = NULL;
+  DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P);
+  if (DMI != AnUsageMap.end())
+    AnUsage = DMI->second;
+  else {
+    AnUsage = new AnalysisUsage();
+    P->getAnalysisUsage(*AnUsage);
+    AnUsageMap[P] = AnUsage;
+  }
+  return AnUsage;
+}
+
+/// Schedule pass P for execution. Make sure that passes required by
+/// P are run before P is run. Update analysis info maintained by
+/// the manager. Remove dead passes. This is a recursive function.
+void PMTopLevelManager::schedulePass(Pass *P) {
+
+  // TODO : Allocate function manager for this pass, other wise required set
+  // may be inserted into previous function manager
+
+  // Give pass a chance to prepare the stage.
+  P->preparePassManager(activeStack);
+
+  // If P is an analysis pass and it is available then do not
+  // generate the analysis again. Stale analysis info should not be
+  // available at this point.
+  const PassInfo *PI =
+    PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
+  if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) {
+    delete P;
+    return;
+  }
+
+  AnalysisUsage *AnUsage = findAnalysisUsage(P);
+
+  bool checkAnalysis = true;
+  while (checkAnalysis) {
+    checkAnalysis = false;
+
+    const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
+    for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(),
+           E = RequiredSet.end(); I != E; ++I) {
+
+      Pass *AnalysisPass = findAnalysisPass(*I);
+      if (!AnalysisPass) {
+        const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+
+        if (PI == NULL) {
+          // Pass P is not in the global PassRegistry
+          dbgs() << "Pass '"  << P->getPassName() << "' is not initialized." << "\n";
+          dbgs() << "Verify if there is a pass dependency cycle." << "\n";
+          dbgs() << "Required Passes:" << "\n";
+          for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(),
+                 E = RequiredSet.end(); I2 != E && I2 != I; ++I2) {
+            Pass *AnalysisPass2 = findAnalysisPass(*I2);
+            if (AnalysisPass2) {
+              dbgs() << "\t" << AnalysisPass2->getPassName() << "\n";
+            } else {
+              dbgs() << "\t"   << "Error: Required pass not found! Possible causes:"  << "\n";
+              dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)"    << "\n";
+              dbgs() << "\t\t" << "- Corruption of the global PassRegistry"           << "\n";
+            }
+          }
+        }
+
+        assert(PI && "Expected required passes to be initialized");
+        AnalysisPass = PI->createPass();
+        if (P->getPotentialPassManagerType () ==
+            AnalysisPass->getPotentialPassManagerType())
+          // Schedule analysis pass that is managed by the same pass manager.
+          schedulePass(AnalysisPass);
+        else if (P->getPotentialPassManagerType () >
+                 AnalysisPass->getPotentialPassManagerType()) {
+          // Schedule analysis pass that is managed by a new manager.
+          schedulePass(AnalysisPass);
+          // Recheck analysis passes to ensure that required analyses that
+          // are already checked are still available.
+          checkAnalysis = true;
+        } else
+          // Do not schedule this analysis. Lower level analsyis
+          // passes are run on the fly.
+          delete AnalysisPass;
+      }
+    }
+  }
+
+  // Now all required passes are available.
+  if (ImmutablePass *IP = P->getAsImmutablePass()) {
+    // P is a immutable pass and it will be managed by this
+    // top level manager. Set up analysis resolver to connect them.
+    PMDataManager *DM = getAsPMDataManager();
+    AnalysisResolver *AR = new AnalysisResolver(*DM);
+    P->setResolver(AR);
+    DM->initializeAnalysisImpl(P);
+    addImmutablePass(IP);
+    DM->recordAvailableAnalysis(IP);
+    return;
+  }
+
+  if (PI && !PI->isAnalysis() && ShouldPrintBeforePass(PI)) {
+    Pass *PP = P->createPrinterPass(
+      dbgs(), std::string("*** IR Dump Before ") + P->getPassName() + " ***");
+    PP->assignPassManager(activeStack, getTopLevelPassManagerType());
+  }
+
+  // Add the requested pass to the best available pass manager.
+  P->assignPassManager(activeStack, getTopLevelPassManagerType());
+
+  if (PI && !PI->isAnalysis() && ShouldPrintAfterPass(PI)) {
+    Pass *PP = P->createPrinterPass(
+      dbgs(), std::string("*** IR Dump After ") + P->getPassName() + " ***");
+    PP->assignPassManager(activeStack, getTopLevelPassManagerType());
+  }
+}
+
+/// Find the pass that implements Analysis AID. Search immutable
+/// passes and all pass managers. If desired pass is not found
+/// then return NULL.
+Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
+
+  // Check pass managers
+  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    if (Pass *P = (*I)->findAnalysisPass(AID, false))
+      return P;
+
+  // Check other pass managers
+  for (SmallVectorImpl<PMDataManager *>::iterator
+         I = IndirectPassManagers.begin(),
+         E = IndirectPassManagers.end(); I != E; ++I)
+    if (Pass *P = (*I)->findAnalysisPass(AID, false))
+      return P;
+
+  // Check the immutable passes. Iterate in reverse order so that we find
+  // the most recently registered passes first.
+  for (SmallVectorImpl<ImmutablePass *>::reverse_iterator I =
+       ImmutablePasses.rbegin(), E = ImmutablePasses.rend(); I != E; ++I) {
+    AnalysisID PI = (*I)->getPassID();
+    if (PI == AID)
+      return *I;
+
+    // If Pass not found then check the interfaces implemented by Immutable Pass
+    const PassInfo *PassInf =
+      PassRegistry::getPassRegistry()->getPassInfo(PI);
+    assert(PassInf && "Expected all immutable passes to be initialized");
+    const std::vector<const PassInfo*> &ImmPI =
+      PassInf->getInterfacesImplemented();
+    for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(),
+         EE = ImmPI.end(); II != EE; ++II) {
+      if ((*II)->getTypeInfo() == AID)
+        return *I;
+    }
+  }
+
+  return 0;
+}
+
+// Print passes managed by this top level manager.
+void PMTopLevelManager::dumpPasses() const {
+
+  if (PassDebugging < Structure)
+    return;
+
+  // Print out the immutable passes
+  for (unsigned i = 0, e = ImmutablePasses.size(); i != e; ++i) {
+    ImmutablePasses[i]->dumpPassStructure(0);
+  }
+
+  // Every class that derives from PMDataManager also derives from Pass
+  // (sometimes indirectly), but there's no inheritance relationship
+  // between PMDataManager and Pass, so we have to getAsPass to get
+  // from a PMDataManager* to a Pass*.
+  for (SmallVectorImpl<PMDataManager *>::const_iterator I =
+       PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
+    (*I)->getAsPass()->dumpPassStructure(1);
+}
+
+void PMTopLevelManager::dumpArguments() const {
+
+  if (PassDebugging < Arguments)
+    return;
+
+  dbgs() << "Pass Arguments: ";
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I =
+       ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
+    if (const PassInfo *PI =
+        PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
+      assert(PI && "Expected all immutable passes to be initialized");
+      if (!PI->isAnalysisGroup())
+        dbgs() << " -" << PI->getPassArgument();
+    }
+  for (SmallVectorImpl<PMDataManager *>::const_iterator I =
+       PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
+    (*I)->dumpPassArguments();
+  dbgs() << "\n";
+}
+
+void PMTopLevelManager::initializeAllAnalysisInfo() {
+  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    (*I)->initializeAnalysisInfo();
+
+  // Initailize other pass managers
+  for (SmallVectorImpl<PMDataManager *>::iterator
+       I = IndirectPassManagers.begin(), E = IndirectPassManagers.end();
+       I != E; ++I)
+    (*I)->initializeAnalysisInfo();
+
+  for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(),
+        DME = LastUser.end(); DMI != DME; ++DMI) {
+    DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI =
+      InversedLastUser.find(DMI->second);
+    if (InvDMI != InversedLastUser.end()) {
+      SmallPtrSet<Pass *, 8> &L = InvDMI->second;
+      L.insert(DMI->first);
+    } else {
+      SmallPtrSet<Pass *, 8> L; L.insert(DMI->first);
+      InversedLastUser[DMI->second] = L;
+    }
+  }
+}
+
+/// Destructor
+PMTopLevelManager::~PMTopLevelManager() {
+  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    delete *I;
+
+  for (SmallVectorImpl<ImmutablePass *>::iterator
+         I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
+    delete *I;
+
+  for (DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.begin(),
+         DME = AnUsageMap.end(); DMI != DME; ++DMI)
+    delete DMI->second;
+}
+
+//===----------------------------------------------------------------------===//
+// PMDataManager implementation
+
+/// Augement AvailableAnalysis by adding analysis made available by pass P.
+void PMDataManager::recordAvailableAnalysis(Pass *P) {
+  AnalysisID PI = P->getPassID();
+
+  AvailableAnalysis[PI] = P;
+
+  assert(!AvailableAnalysis.empty());
+
+  // This pass is the current implementation of all of the interfaces it
+  // implements as well.
+  const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI);
+  if (PInf == 0) return;
+  const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
+  for (unsigned i = 0, e = II.size(); i != e; ++i)
+    AvailableAnalysis[II[i]->getTypeInfo()] = P;
+}
+
+// Return true if P preserves high level analysis used by other
+// passes managed by this manager
+bool PMDataManager::preserveHigherLevelAnalysis(Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  if (AnUsage->getPreservesAll())
+    return true;
+
+  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+  for (SmallVectorImpl<Pass *>::iterator I = HigherLevelAnalysis.begin(),
+         E = HigherLevelAnalysis.end(); I  != E; ++I) {
+    Pass *P1 = *I;
+    if (P1->getAsImmutablePass() == 0 &&
+        std::find(PreservedSet.begin(), PreservedSet.end(),
+                  P1->getPassID()) ==
+           PreservedSet.end())
+      return false;
+  }
+
+  return true;
+}
+
+/// verifyPreservedAnalysis -- Verify analysis preserved by pass P.
+void PMDataManager::verifyPreservedAnalysis(Pass *P) {
+  // Don't do this unless assertions are enabled.
+#ifdef NDEBUG
+  return;
+#endif
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+
+  // Verify preserved analysis
+  for (AnalysisUsage::VectorType::const_iterator I = PreservedSet.begin(),
+         E = PreservedSet.end(); I != E; ++I) {
+    AnalysisID AID = *I;
+    if (Pass *AP = findAnalysisPass(AID, true)) {
+      TimeRegion PassTimer(getPassTimer(AP));
+      AP->verifyAnalysis();
+    }
+  }
+}
+
+/// Remove Analysis not preserved by Pass P
+void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  if (AnUsage->getPreservesAll())
+    return;
+
+  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+  for (DenseMap<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(),
+         E = AvailableAnalysis.end(); I != E; ) {
+    DenseMap<AnalysisID, Pass*>::iterator Info = I++;
+    if (Info->second->getAsImmutablePass() == 0 &&
+        std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
+        PreservedSet.end()) {
+      // Remove this analysis
+      if (PassDebugging >= Details) {
+        Pass *S = Info->second;
+        dbgs() << " -- '" <<  P->getPassName() << "' is not preserving '";
+        dbgs() << S->getPassName() << "'\n";
+      }
+      AvailableAnalysis.erase(Info);
+    }
+  }
+
+  // Check inherited analysis also. If P is not preserving analysis
+  // provided by parent manager then remove it here.
+  for (unsigned Index = 0; Index < PMT_Last; ++Index) {
+
+    if (!InheritedAnalysis[Index])
+      continue;
+
+    for (DenseMap<AnalysisID, Pass*>::iterator
+           I = InheritedAnalysis[Index]->begin(),
+           E = InheritedAnalysis[Index]->end(); I != E; ) {
+      DenseMap<AnalysisID, Pass *>::iterator Info = I++;
+      if (Info->second->getAsImmutablePass() == 0 &&
+          std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
+             PreservedSet.end()) {
+        // Remove this analysis
+        if (PassDebugging >= Details) {
+          Pass *S = Info->second;
+          dbgs() << " -- '" <<  P->getPassName() << "' is not preserving '";
+          dbgs() << S->getPassName() << "'\n";
+        }
+        InheritedAnalysis[Index]->erase(Info);
+      }
+    }
+  }
+}
+
+/// Remove analysis passes that are not used any longer
+void PMDataManager::removeDeadPasses(Pass *P, StringRef Msg,
+                                     enum PassDebuggingString DBG_STR) {
+
+  SmallVector<Pass *, 12> DeadPasses;
+
+  // If this is a on the fly manager then it does not have TPM.
+  if (!TPM)
+    return;
+
+  TPM->collectLastUses(DeadPasses, P);
+
+  if (PassDebugging >= Details && !DeadPasses.empty()) {
+    dbgs() << " -*- '" <<  P->getPassName();
+    dbgs() << "' is the last user of following pass instances.";
+    dbgs() << " Free these instances\n";
+  }
+
+  for (SmallVectorImpl<Pass *>::iterator I = DeadPasses.begin(),
+         E = DeadPasses.end(); I != E; ++I)
+    freePass(*I, Msg, DBG_STR);
+}
+
+void PMDataManager::freePass(Pass *P, StringRef Msg,
+                             enum PassDebuggingString DBG_STR) {
+  dumpPassInfo(P, FREEING_MSG, DBG_STR, Msg);
+
+  {
+    // If the pass crashes releasing memory, remember this.
+    PassManagerPrettyStackEntry X(P);
+    TimeRegion PassTimer(getPassTimer(P));
+
+    P->releaseMemory();
+  }
+
+  AnalysisID PI = P->getPassID();
+  if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) {
+    // Remove the pass itself (if it is not already removed).
+    AvailableAnalysis.erase(PI);
+
+    // Remove all interfaces this pass implements, for which it is also
+    // listed as the available implementation.
+    const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
+    for (unsigned i = 0, e = II.size(); i != e; ++i) {
+      DenseMap<AnalysisID, Pass*>::iterator Pos =
+        AvailableAnalysis.find(II[i]->getTypeInfo());
+      if (Pos != AvailableAnalysis.end() && Pos->second == P)
+        AvailableAnalysis.erase(Pos);
+    }
+  }
+}
+
+/// Add pass P into the PassVector. Update
+/// AvailableAnalysis appropriately if ProcessAnalysis is true.
+void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
+  // This manager is going to manage pass P. Set up analysis resolver
+  // to connect them.
+  AnalysisResolver *AR = new AnalysisResolver(*this);
+  P->setResolver(AR);
+
+  // If a FunctionPass F is the last user of ModulePass info M
+  // then the F's manager, not F, records itself as a last user of M.
+  SmallVector<Pass *, 12> TransferLastUses;
+
+  if (!ProcessAnalysis) {
+    // Add pass
+    PassVector.push_back(P);
+    return;
+  }
+
+  // At the moment, this pass is the last user of all required passes.
+  SmallVector<Pass *, 12> LastUses;
+  SmallVector<Pass *, 8> RequiredPasses;
+  SmallVector<AnalysisID, 8> ReqAnalysisNotAvailable;
+
+  unsigned PDepth = this->getDepth();
+
+  collectRequiredAnalysis(RequiredPasses,
+                          ReqAnalysisNotAvailable, P);
+  for (SmallVectorImpl<Pass *>::iterator I = RequiredPasses.begin(),
+         E = RequiredPasses.end(); I != E; ++I) {
+    Pass *PRequired = *I;
+    unsigned RDepth = 0;
+
+    assert(PRequired->getResolver() && "Analysis Resolver is not set");
+    PMDataManager &DM = PRequired->getResolver()->getPMDataManager();
+    RDepth = DM.getDepth();
+
+    if (PDepth == RDepth)
+      LastUses.push_back(PRequired);
+    else if (PDepth > RDepth) {
+      // Let the parent claim responsibility of last use
+      TransferLastUses.push_back(PRequired);
+      // Keep track of higher level analysis used by this manager.
+      HigherLevelAnalysis.push_back(PRequired);
+    } else
+      llvm_unreachable("Unable to accommodate Required Pass");
+  }
+
+  // Set P as P's last user until someone starts using P.
+  // However, if P is a Pass Manager then it does not need
+  // to record its last user.
+  if (P->getAsPMDataManager() == 0)
+    LastUses.push_back(P);
+  TPM->setLastUser(LastUses, P);
+
+  if (!TransferLastUses.empty()) {
+    Pass *My_PM = getAsPass();
+    TPM->setLastUser(TransferLastUses, My_PM);
+    TransferLastUses.clear();
+  }
+
+  // Now, take care of required analyses that are not available.
+  for (SmallVectorImpl<AnalysisID>::iterator
+         I = ReqAnalysisNotAvailable.begin(),
+         E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
+    const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+    Pass *AnalysisPass = PI->createPass();
+    this->addLowerLevelRequiredPass(P, AnalysisPass);
+  }
+
+  // Take a note of analysis required and made available by this pass.
+  // Remove the analysis not preserved by this pass
+  removeNotPreservedAnalysis(P);
+  recordAvailableAnalysis(P);
+
+  // Add pass
+  PassVector.push_back(P);
+}
+
+
+/// Populate RP with analysis pass that are required by
+/// pass P and are available. Populate RP_NotAvail with analysis
+/// pass that are required by pass P but are not available.
+void PMDataManager::collectRequiredAnalysis(SmallVectorImpl<Pass *> &RP,
+                                       SmallVectorImpl<AnalysisID> &RP_NotAvail,
+                                            Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
+  for (AnalysisUsage::VectorType::const_iterator
+         I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) {
+    if (Pass *AnalysisPass = findAnalysisPass(*I, true))
+      RP.push_back(AnalysisPass);
+    else
+      RP_NotAvail.push_back(*I);
+  }
+
+  const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
+  for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
+         E = IDs.end(); I != E; ++I) {
+    if (Pass *AnalysisPass = findAnalysisPass(*I, true))
+      RP.push_back(AnalysisPass);
+    else
+      RP_NotAvail.push_back(*I);
+  }
+}
+
+// All Required analyses should be available to the pass as it runs!  Here
+// we fill in the AnalysisImpls member of the pass so that it can
+// successfully use the getAnalysis() method to retrieve the
+// implementations it needs.
+//
+void PMDataManager::initializeAnalysisImpl(Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+
+  for (AnalysisUsage::VectorType::const_iterator
+         I = AnUsage->getRequiredSet().begin(),
+         E = AnUsage->getRequiredSet().end(); I != E; ++I) {
+    Pass *Impl = findAnalysisPass(*I, true);
+    if (Impl == 0)
+      // This may be analysis pass that is initialized on the fly.
+      // If that is not the case then it will raise an assert when it is used.
+      continue;
+    AnalysisResolver *AR = P->getResolver();
+    assert(AR && "Analysis Resolver is not set");
+    AR->addAnalysisImplsPair(*I, Impl);
+  }
+}
+
+/// Find the pass that implements Analysis AID. If desired pass is not found
+/// then return NULL.
+Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) {
+
+  // Check if AvailableAnalysis map has one entry.
+  DenseMap<AnalysisID, Pass*>::const_iterator I =  AvailableAnalysis.find(AID);
+
+  if (I != AvailableAnalysis.end())
+    return I->second;
+
+  // Search Parents through TopLevelManager
+  if (SearchParent)
+    return TPM->findAnalysisPass(AID);
+
+  return NULL;
+}
+
+// Print list of passes that are last used by P.
+void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{
+
+  SmallVector<Pass *, 12> LUses;
+
+  // If this is a on the fly manager then it does not have TPM.
+  if (!TPM)
+    return;
+
+  TPM->collectLastUses(LUses, P);
+
+  for (SmallVectorImpl<Pass *>::iterator I = LUses.begin(),
+         E = LUses.end(); I != E; ++I) {
+    llvm::dbgs() << "--" << std::string(Offset*2, ' ');
+    (*I)->dumpPassStructure(0);
+  }
+}
+
+void PMDataManager::dumpPassArguments() const {
+  for (SmallVectorImpl<Pass *>::const_iterator I = PassVector.begin(),
+        E = PassVector.end(); I != E; ++I) {
+    if (PMDataManager *PMD = (*I)->getAsPMDataManager())
+      PMD->dumpPassArguments();
+    else
+      if (const PassInfo *PI =
+            PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
+        if (!PI->isAnalysisGroup())
+          dbgs() << " -" << PI->getPassArgument();
+  }
+}
+
+void PMDataManager::dumpPassInfo(Pass *P, enum PassDebuggingString S1,
+                                 enum PassDebuggingString S2,
+                                 StringRef Msg) {
+  if (PassDebugging < Executions)
+    return;
+  dbgs() << (void*)this << std::string(getDepth()*2+1, ' ');
+  switch (S1) {
+  case EXECUTION_MSG:
+    dbgs() << "Executing Pass '" << P->getPassName();
+    break;
+  case MODIFICATION_MSG:
+    dbgs() << "Made Modification '" << P->getPassName();
+    break;
+  case FREEING_MSG:
+    dbgs() << " Freeing Pass '" << P->getPassName();
+    break;
+  default:
+    break;
+  }
+  switch (S2) {
+  case ON_BASICBLOCK_MSG:
+    dbgs() << "' on BasicBlock '" << Msg << "'...\n";
+    break;
+  case ON_FUNCTION_MSG:
+    dbgs() << "' on Function '" << Msg << "'...\n";
+    break;
+  case ON_MODULE_MSG:
+    dbgs() << "' on Module '"  << Msg << "'...\n";
+    break;
+  case ON_REGION_MSG:
+    dbgs() << "' on Region '"  << Msg << "'...\n";
+    break;
+  case ON_LOOP_MSG:
+    dbgs() << "' on Loop '" << Msg << "'...\n";
+    break;
+  case ON_CG_MSG:
+    dbgs() << "' on Call Graph Nodes '" << Msg << "'...\n";
+    break;
+  default:
+    break;
+  }
+}
+
+void PMDataManager::dumpRequiredSet(const Pass *P) const {
+  if (PassDebugging < Details)
+    return;
+
+  AnalysisUsage analysisUsage;
+  P->getAnalysisUsage(analysisUsage);
+  dumpAnalysisUsage("Required", P, analysisUsage.getRequiredSet());
+}
+
+void PMDataManager::dumpPreservedSet(const Pass *P) const {
+  if (PassDebugging < Details)
+    return;
+
+  AnalysisUsage analysisUsage;
+  P->getAnalysisUsage(analysisUsage);
+  dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet());
+}
+
+void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
+                                   const AnalysisUsage::VectorType &Set) const {
+  assert(PassDebugging >= Details);
+  if (Set.empty())
+    return;
+  dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
+  for (unsigned i = 0; i != Set.size(); ++i) {
+    if (i) dbgs() << ',';
+    const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
+    if (!PInf) {
+      // Some preserved passes, such as AliasAnalysis, may not be initialized by
+      // all drivers.
+      dbgs() << " Uninitialized Pass";
+      continue;
+    }
+    dbgs() << ' ' << PInf->getPassName();
+  }
+  dbgs() << '\n';
+}
+
+/// Add RequiredPass into list of lower level passes required by pass P.
+/// RequiredPass is run on the fly by Pass Manager when P requests it
+/// through getAnalysis interface.
+/// This should be handled by specific pass manager.
+void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
+  if (TPM) {
+    TPM->dumpArguments();
+    TPM->dumpPasses();
+  }
+
+  // Module Level pass may required Function Level analysis info
+  // (e.g. dominator info). Pass manager uses on the fly function pass manager
+  // to provide this on demand. In that case, in Pass manager terminology,
+  // module level pass is requiring lower level analysis info managed by
+  // lower level pass manager.
+
+  // When Pass manager is not able to order required analysis info, Pass manager
+  // checks whether any lower level manager will be able to provide this
+  // analysis info on demand or not.
+#ifndef NDEBUG
+  dbgs() << "Unable to schedule '" << RequiredPass->getPassName();
+  dbgs() << "' required by '" << P->getPassName() << "'\n";
+#endif
+  llvm_unreachable("Unable to schedule pass");
+}
+
+Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) {
+  llvm_unreachable("Unable to find on the fly pass");
+}
+
+// Destructor
+PMDataManager::~PMDataManager() {
+  for (SmallVectorImpl<Pass *>::iterator I = PassVector.begin(),
+         E = PassVector.end(); I != E; ++I)
+    delete *I;
+}
+
+//===----------------------------------------------------------------------===//
+// NOTE: Is this the right place to define this method ?
+// getAnalysisIfAvailable - Return analysis result or null if it doesn't exist.
+Pass *AnalysisResolver::getAnalysisIfAvailable(AnalysisID ID, bool dir) const {
+  return PM.findAnalysisPass(ID, dir);
+}
+
+Pass *AnalysisResolver::findImplPass(Pass *P, AnalysisID AnalysisPI,
+                                     Function &F) {
+  return PM.getOnTheFlyPass(P, AnalysisPI, F);
+}
+
+//===----------------------------------------------------------------------===//
+// BBPassManager implementation
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnBasicBlock method.  Keep track of whether any of the passes modifies
+/// the function, and if so, return true.
+bool BBPassManager::runOnFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+
+  bool Changed = doInitialization(F);
+
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      BasicBlockPass *BP = getContainedPass(Index);
+      bool LocalChanged = false;
+
+      dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName());
+      dumpRequiredSet(BP);
+
+      initializeAnalysisImpl(BP);
+
+      {
+        // If the pass crashes, remember this.
+        PassManagerPrettyStackEntry X(BP, *I);
+        TimeRegion PassTimer(getPassTimer(BP));
+
+        LocalChanged |= BP->runOnBasicBlock(*I);
+      }
+
+      Changed |= LocalChanged;
+      if (LocalChanged)
+        dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG,
+                     I->getName());
+      dumpPreservedSet(BP);
+
+      verifyPreservedAnalysis(BP);
+      removeNotPreservedAnalysis(BP);
+      recordAvailableAnalysis(BP);
+      removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG);
+    }
+
+  return doFinalization(F) || Changed;
+}
+
+// Implement doInitialization and doFinalization
+bool BBPassManager::doInitialization(Module &M) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+    Changed |= getContainedPass(Index)->doInitialization(M);
+
+  return Changed;
+}
+
+bool BBPassManager::doFinalization(Module &M) {
+  bool Changed = false;
+
+  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+    Changed |= getContainedPass(Index)->doFinalization(M);
+
+  return Changed;
+}
+
+bool BBPassManager::doInitialization(Function &F) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    BasicBlockPass *BP = getContainedPass(Index);
+    Changed |= BP->doInitialization(F);
+  }
+
+  return Changed;
+}
+
+bool BBPassManager::doFinalization(Function &F) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    BasicBlockPass *BP = getContainedPass(Index);
+    Changed |= BP->doFinalization(F);
+  }
+
+  return Changed;
+}
+
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManager implementation
+
+/// Create new Function pass manager
+FunctionPassManager::FunctionPassManager(Module *m) : M(m) {
+  FPM = new FunctionPassManagerImpl();
+  // FPM is the top level manager.
+  FPM->setTopLevelManager(FPM);
+
+  AnalysisResolver *AR = new AnalysisResolver(*FPM);
+  FPM->setResolver(AR);
+}
+
+FunctionPassManager::~FunctionPassManager() {
+  delete FPM;
+}
+
+/// add - Add a pass to the queue of passes to run.  This passes
+/// ownership of the Pass to the PassManager.  When the
+/// PassManager_X is destroyed, the pass will be destroyed as well, so
+/// there is no need to delete the pass. (TODO delete passes.)
+/// This implies that all passes MUST be allocated with 'new'.
+void FunctionPassManager::add(Pass *P) {
+  FPM->add(P);
+}
+
+/// run - Execute all of the passes scheduled for execution.  Keep
+/// track of whether any of the passes modifies the function, and if
+/// so, return true.
+///
+bool FunctionPassManager::run(Function &F) {
+  if (F.isMaterializable()) {
+    std::string errstr;
+    if (F.Materialize(&errstr))
+      report_fatal_error("Error reading bitcode file: " + Twine(errstr));
+  }
+  return FPM->run(F);
+}
+
+
+/// doInitialization - Run all of the initializers for the function passes.
+///
+bool FunctionPassManager::doInitialization() {
+  return FPM->doInitialization(*M);
+}
+
+/// doFinalization - Run all of the finalizers for the function passes.
+///
+bool FunctionPassManager::doFinalization() {
+  return FPM->doFinalization(*M);
+}
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl implementation
+//
+bool FunctionPassManagerImpl::doInitialization(Module &M) {
+  bool Changed = false;
+
+  dumpArguments();
+  dumpPasses();
+
+  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doInitialization(M);
+  }
+
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    Changed |= getContainedManager(Index)->doInitialization(M);
+
+  return Changed;
+}
+
+bool FunctionPassManagerImpl::doFinalization(Module &M) {
+  bool Changed = false;
+
+  for (int Index = getNumContainedManagers() - 1; Index >= 0; --Index)
+    Changed |= getContainedManager(Index)->doFinalization(M);
+
+  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doFinalization(M);
+  }
+
+  return Changed;
+}
+
+/// cleanup - After running all passes, clean up pass manager cache.
+void FPPassManager::cleanup() {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    FunctionPass *FP = getContainedPass(Index);
+    AnalysisResolver *AR = FP->getResolver();
+    assert(AR && "Analysis Resolver is not set");
+    AR->clearAnalysisImpls();
+ }
+}
+
+void FunctionPassManagerImpl::releaseMemoryOnTheFly() {
+  if (!wasRun)
+    return;
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
+    FPPassManager *FPPM = getContainedManager(Index);
+    for (unsigned Index = 0; Index < FPPM->getNumContainedPasses(); ++Index) {
+      FPPM->getContainedPass(Index)->releaseMemory();
+    }
+  }
+  wasRun = false;
+}
+
+// Execute all the passes managed by this top level manager.
+// Return true if any function is modified by a pass.
+bool FunctionPassManagerImpl::run(Function &F) {
+  bool Changed = false;
+  TimingInfo::createTheTimeInfo();
+
+  initializeAllAnalysisInfo();
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    Changed |= getContainedManager(Index)->runOnFunction(F);
+
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    getContainedManager(Index)->cleanup();
+
+  wasRun = true;
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// FPPassManager implementation
+
+char FPPassManager::ID = 0;
+/// Print passes managed by this manager
+void FPPassManager::dumpPassStructure(unsigned Offset) {
+  dbgs().indent(Offset*2) << "FunctionPass Manager\n";
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    FunctionPass *FP = getContainedPass(Index);
+    FP->dumpPassStructure(Offset + 1);
+    dumpLastUses(FP, Offset+1);
+  }
+}
+
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnFunction method.  Keep track of whether any of the passes modifies
+/// the function, and if so, return true.
+bool FPPassManager::runOnFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+
+  bool Changed = false;
+
+  // Collect inherited analysis from Module level pass manager.
+  populateInheritedAnalysis(TPM->activeStack);
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    FunctionPass *FP = getContainedPass(Index);
+    bool LocalChanged = false;
+
+    dumpPassInfo(FP, EXECUTION_MSG, ON_FUNCTION_MSG, F.getName());
+    dumpRequiredSet(FP);
+
+    initializeAnalysisImpl(FP);
+
+    {
+      PassManagerPrettyStackEntry X(FP, F);
+      TimeRegion PassTimer(getPassTimer(FP));
+
+      LocalChanged |= FP->runOnFunction(F);
+    }
+
+    Changed |= LocalChanged;
+    if (LocalChanged)
+      dumpPassInfo(FP, MODIFICATION_MSG, ON_FUNCTION_MSG, F.getName());
+    dumpPreservedSet(FP);
+
+    verifyPreservedAnalysis(FP);
+    removeNotPreservedAnalysis(FP);
+    recordAvailableAnalysis(FP);
+    removeDeadPasses(FP, F.getName(), ON_FUNCTION_MSG);
+  }
+  return Changed;
+}
+
+bool FPPassManager::runOnModule(Module &M) {
+  bool Changed = false;
+
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    Changed |= runOnFunction(*I);
+
+  return Changed;
+}
+
+bool FPPassManager::doInitialization(Module &M) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+    Changed |= getContainedPass(Index)->doInitialization(M);
+
+  return Changed;
+}
+
+bool FPPassManager::doFinalization(Module &M) {
+  bool Changed = false;
+
+  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+    Changed |= getContainedPass(Index)->doFinalization(M);
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// MPPassManager implementation
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnModule method.  Keep track of whether any of the passes modifies
+/// the module, and if so, return true.
+bool
+MPPassManager::runOnModule(Module &M) {
+  bool Changed = false;
+
+  // Initialize on-the-fly passes
+  for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+       I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+       I != E; ++I) {
+    FunctionPassManagerImpl *FPP = I->second;
+    Changed |= FPP->doInitialization(M);
+  }
+
+  // Initialize module passes
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+    Changed |= getContainedPass(Index)->doInitialization(M);
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    ModulePass *MP = getContainedPass(Index);
+    bool LocalChanged = false;
+
+    dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG, M.getModuleIdentifier());
+    dumpRequiredSet(MP);
+
+    initializeAnalysisImpl(MP);
+
+    {
+      PassManagerPrettyStackEntry X(MP, M);
+      TimeRegion PassTimer(getPassTimer(MP));
+
+      LocalChanged |= MP->runOnModule(M);
+    }
+
+    Changed |= LocalChanged;
+    if (LocalChanged)
+      dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG,
+                   M.getModuleIdentifier());
+    dumpPreservedSet(MP);
+
+    verifyPreservedAnalysis(MP);
+    removeNotPreservedAnalysis(MP);
+    recordAvailableAnalysis(MP);
+    removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG);
+  }
+
+  // Finalize module passes
+  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+    Changed |= getContainedPass(Index)->doFinalization(M);
+
+  // Finalize on-the-fly passes
+  for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+       I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+       I != E; ++I) {
+    FunctionPassManagerImpl *FPP = I->second;
+    // We don't know when is the last time an on-the-fly pass is run,
+    // so we need to releaseMemory / finalize here
+    FPP->releaseMemoryOnTheFly();
+    Changed |= FPP->doFinalization(M);
+  }
+
+  return Changed;
+}
+
+/// Add RequiredPass into list of lower level passes required by pass P.
+/// RequiredPass is run on the fly by Pass Manager when P requests it
+/// through getAnalysis interface.
+void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
+  assert(P->getPotentialPassManagerType() == PMT_ModulePassManager &&
+         "Unable to handle Pass that requires lower level Analysis pass");
+  assert((P->getPotentialPassManagerType() <
+          RequiredPass->getPotentialPassManagerType()) &&
+         "Unable to handle Pass that requires lower level Analysis pass");
+
+  FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
+  if (!FPP) {
+    FPP = new FunctionPassManagerImpl();
+    // FPP is the top level manager.
+    FPP->setTopLevelManager(FPP);
+
+    OnTheFlyManagers[P] = FPP;
+  }
+  FPP->add(RequiredPass);
+
+  // Register P as the last user of RequiredPass.
+  if (RequiredPass) {
+    SmallVector<Pass *, 1> LU;
+    LU.push_back(RequiredPass);
+    FPP->setLastUser(LU,  P);
+  }
+}
+
+/// Return function pass corresponding to PassInfo PI, that is
+/// required by module pass MP. Instantiate analysis pass, by using
+/// its runOnFunction() for function F.
+Pass* MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F){
+  FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP];
+  assert(FPP && "Unable to find on the fly pass");
+
+  FPP->releaseMemoryOnTheFly();
+  FPP->run(F);
+  return ((PMTopLevelManager*)FPP)->findAnalysisPass(PI);
+}
+
+
+//===----------------------------------------------------------------------===//
+// PassManagerImpl implementation
+
+//
+/// run - Execute all of the passes scheduled for execution.  Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManagerImpl::run(Module &M) {
+  bool Changed = false;
+  TimingInfo::createTheTimeInfo();
+
+  dumpArguments();
+  dumpPasses();
+
+  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doInitialization(M);
+  }
+
+  initializeAllAnalysisInfo();
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    Changed |= getContainedManager(Index)->runOnModule(M);
+
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doFinalization(M);
+  }
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// PassManager implementation
+
+/// Create new pass manager
+PassManager::PassManager() {
+  PM = new PassManagerImpl();
+  // PM is the top level manager
+  PM->setTopLevelManager(PM);
+}
+
+PassManager::~PassManager() {
+  delete PM;
+}
+
+/// add - Add a pass to the queue of passes to run.  This passes ownership of
+/// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+/// will be destroyed as well, so there is no need to delete the pass.  This
+/// implies that all passes MUST be allocated with 'new'.
+void PassManager::add(Pass *P) {
+  PM->add(P);
+}
+
+/// run - Execute all of the passes scheduled for execution.  Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManager::run(Module &M) {
+  return PM->run(M);
+}
+
+//===----------------------------------------------------------------------===//
+// TimingInfo implementation
+
+bool llvm::TimePassesIsEnabled = false;
+static cl::opt<bool,true>
+EnableTiming("time-passes", cl::location(TimePassesIsEnabled),
+            cl::desc("Time each pass, printing elapsed time for each on exit"));
+
+// createTheTimeInfo - This method either initializes the TheTimeInfo pointer to
+// a non null value (if the -time-passes option is enabled) or it leaves it
+// null.  It may be called multiple times.
+void TimingInfo::createTheTimeInfo() {
+  if (!TimePassesIsEnabled || TheTimeInfo) return;
+
+  // Constructed the first time this is called, iff -time-passes is enabled.
+  // This guarantees that the object will be constructed before static globals,
+  // thus it will be destroyed before them.
+  static ManagedStatic<TimingInfo> TTI;
+  TheTimeInfo = &*TTI;
+}
+
+/// If TimingInfo is enabled then start pass timer.
+Timer *llvm::getPassTimer(Pass *P) {
+  if (TheTimeInfo)
+    return TheTimeInfo->getPassTimer(P);
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// PMStack implementation
+//
+
+// Pop Pass Manager from the stack and clear its analysis info.
+void PMStack::pop() {
+
+  PMDataManager *Top = this->top();
+  Top->initializeAnalysisInfo();
+
+  S.pop_back();
+}
+
+// Push PM on the stack and set its top level manager.
+void PMStack::push(PMDataManager *PM) {
+  assert(PM && "Unable to push. Pass Manager expected");
+  assert(PM->getDepth()==0 && "Pass Manager depth set too early");
+
+  if (!this->empty()) {
+    assert(PM->getPassManagerType() > this->top()->getPassManagerType()
+           && "pushing bad pass manager to PMStack");
+    PMTopLevelManager *TPM = this->top()->getTopLevelManager();
+
+    assert(TPM && "Unable to find top level manager");
+    TPM->addIndirectPassManager(PM);
+    PM->setTopLevelManager(TPM);
+    PM->setDepth(this->top()->getDepth()+1);
+  } else {
+    assert((PM->getPassManagerType() == PMT_ModulePassManager
+           || PM->getPassManagerType() == PMT_FunctionPassManager)
+           && "pushing bad pass manager to PMStack");
+    PM->setDepth(1);
+  }
+
+  S.push_back(PM);
+}
+
+// Dump content of the pass manager stack.
+void PMStack::dump() const {
+  for (std::vector<PMDataManager *>::const_iterator I = S.begin(),
+         E = S.end(); I != E; ++I)
+    dbgs() << (*I)->getAsPass()->getPassName() << ' ';
+
+  if (!S.empty())
+    dbgs() << '\n';
+}
+
+/// Find appropriate Module Pass Manager in the PM Stack and
+/// add self into that manager.
+void ModulePass::assignPassManager(PMStack &PMS,
+                                   PassManagerType PreferredType) {
+  // Find Module Pass Manager
+  while (!PMS.empty()) {
+    PassManagerType TopPMType = PMS.top()->getPassManagerType();
+    if (TopPMType == PreferredType)
+      break; // We found desired pass manager
+    else if (TopPMType > PMT_ModulePassManager)
+      PMS.pop();    // Pop children pass managers
+    else
+      break;
+  }
+  assert(!PMS.empty() && "Unable to find appropriate Pass Manager");
+  PMS.top()->add(this);
+}
+
+/// Find appropriate Function Pass Manager or Call Graph Pass Manager
+/// in the PM Stack and add self into that manager.
+void FunctionPass::assignPassManager(PMStack &PMS,
+                                     PassManagerType PreferredType) {
+
+  // Find Function Pass Manager
+  while (!PMS.empty()) {
+    if (PMS.top()->getPassManagerType() > PMT_FunctionPassManager)
+      PMS.pop();
+    else
+      break;
+  }
+
+  // Create new Function Pass Manager if needed.
+  FPPassManager *FPP;
+  if (PMS.top()->getPassManagerType() == PMT_FunctionPassManager) {
+    FPP = (FPPassManager *)PMS.top();
+  } else {
+    assert(!PMS.empty() && "Unable to create Function Pass Manager");
+    PMDataManager *PMD = PMS.top();
+
+    // [1] Create new Function Pass Manager
+    FPP = new FPPassManager();
+    FPP->populateInheritedAnalysis(PMS);
+
+    // [2] Set up new manager's top level manager
+    PMTopLevelManager *TPM = PMD->getTopLevelManager();
+    TPM->addIndirectPassManager(FPP);
+
+    // [3] Assign manager to manage this new manager. This may create
+    // and push new managers into PMS
+    FPP->assignPassManager(PMS, PMD->getPassManagerType());
+
+    // [4] Push new manager into PMS
+    PMS.push(FPP);
+  }
+
+  // Assign FPP as the manager of this pass.
+  FPP->add(this);
+}
+
+/// Find appropriate Basic Pass Manager or Call Graph Pass Manager
+/// in the PM Stack and add self into that manager.
+void BasicBlockPass::assignPassManager(PMStack &PMS,
+                                       PassManagerType PreferredType) {
+  BBPassManager *BBP;
+
+  // Basic Pass Manager is a leaf pass manager. It does not handle
+  // any other pass manager.
+  if (!PMS.empty() &&
+      PMS.top()->getPassManagerType() == PMT_BasicBlockPassManager) {
+    BBP = (BBPassManager *)PMS.top();
+  } else {
+    // If leaf manager is not Basic Block Pass manager then create new
+    // basic Block Pass manager.
+    assert(!PMS.empty() && "Unable to create BasicBlock Pass Manager");
+    PMDataManager *PMD = PMS.top();
+
+    // [1] Create new Basic Block Manager
+    BBP = new BBPassManager();
+
+    // [2] Set up new manager's top level manager
+    // Basic Block Pass Manager does not live by itself
+    PMTopLevelManager *TPM = PMD->getTopLevelManager();
+    TPM->addIndirectPassManager(BBP);
+
+    // [3] Assign manager to manage this new manager. This may create
+    // and push new managers into PMS
+    BBP->assignPassManager(PMS, PreferredType);
+
+    // [4] Push new manager into PMS
+    PMS.push(BBP);
+  }
+
+  // Assign BBP as the manager of this pass.
+  BBP->add(this);
+}
+
+PassManagerBase::~PassManagerBase() {}
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index bd4d9c0..a32d25c 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -65,7 +65,7 @@ class MDNodeOperand : public CallbackVH {
 
 public:
   MDNodeOperand(Value *V) : CallbackVH(V) {}
-  ~MDNodeOperand() {}
+  virtual ~MDNodeOperand();
 
   void set(Value *V) {
     unsigned IsFirst = this->getValPtrInt();
@@ -82,6 +82,8 @@ public:
 };
 } // end namespace llvm.
 
+// Provide out-of-line definition to prevent weak vtable.
+MDNodeOperand::~MDNodeOperand() {}
 
 void MDNodeOperand::deleted() {
   getParent()->replaceOperand(this, 0);
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index 968b8f4..4f240c7 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -245,7 +245,7 @@ GlobalVariable *Module::getGlobalVariable(StringRef Name, bool AllowLocal) {
 ///   1. If it does not exist, add a declaration of the global and return it.
 ///   2. Else, the global exists but has the wrong type: return the function
 ///      with a constantexpr cast to the right type.
-///   3. Finally, if the existing global is the correct delclaration, return the
+///   3. Finally, if the existing global is the correct declaration, return the
 ///      existing global.
 Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) {
   // See if we have a definition for the specified global already.
@@ -260,8 +260,10 @@ Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) {
 
   // If the variable exists but has the wrong type, return a bitcast to the
   // right type.
-  if (GV->getType() != PointerType::getUnqual(Ty))
-    return ConstantExpr::getBitCast(GV, PointerType::getUnqual(Ty));
+  Type *GVTy = GV->getType();
+  PointerType *PTy = PointerType::get(Ty, GVTy->getPointerAddressSpace());
+  if (GVTy != PTy)
+    return ConstantExpr::getBitCast(GV, PTy);
 
   // Otherwise, we just found the existing function or a prototype.
   return GV;
@@ -316,11 +318,16 @@ getModuleFlagsMetadata(SmallVectorImpl<ModuleFlagEntry> &Flags) const {
 
   for (unsigned i = 0, e = ModFlags->getNumOperands(); i != e; ++i) {
     MDNode *Flag = ModFlags->getOperand(i);
-    ConstantInt *Behavior = cast<ConstantInt>(Flag->getOperand(0));
-    MDString *Key = cast<MDString>(Flag->getOperand(1));
-    Value *Val = Flag->getOperand(2);
-    Flags.push_back(ModuleFlagEntry(ModFlagBehavior(Behavior->getZExtValue()),
-                                    Key, Val));
+    if (Flag->getNumOperands() >= 3 && isa<ConstantInt>(Flag->getOperand(0)) &&
+        isa<MDString>(Flag->getOperand(1))) {
+      // Check the operands of the MDNode before accessing the operands.
+      // The verifier will actually catch these failures.
+      ConstantInt *Behavior = cast<ConstantInt>(Flag->getOperand(0));
+      MDString *Key = cast<MDString>(Flag->getOperand(1));
+      Value *Val = Flag->getOperand(2);
+      Flags.push_back(ModuleFlagEntry(ModFlagBehavior(Behavior->getZExtValue()),
+                                      Key, Val));
+    }
   }
 }
 
@@ -399,9 +406,15 @@ bool Module::isDematerializable(const GlobalValue *GV) const {
 }
 
 bool Module::Materialize(GlobalValue *GV, std::string *ErrInfo) {
-  if (Materializer)
-    return Materializer->Materialize(GV, ErrInfo);
-  return false;
+  if (!Materializer)
+    return false;
+
+  error_code EC = Materializer->Materialize(GV);
+  if (!EC)
+    return false;
+  if (ErrInfo)
+    *ErrInfo = EC.message();
+  return true;
 }
 
 void Module::Dematerialize(GlobalValue *GV) {
@@ -412,7 +425,12 @@ void Module::Dematerialize(GlobalValue *GV) {
 bool Module::MaterializeAll(std::string *ErrInfo) {
   if (!Materializer)
     return false;
-  return Materializer->MaterializeModule(this, ErrInfo);
+  error_code EC = Materializer->MaterializeModule(this);
+  if (!EC)
+    return false;
+  if (ErrInfo)
+    *ErrInfo = EC.message();
+  return true;
 }
 
 bool Module::MaterializeAllPermanently(std::string *ErrInfo) {
diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp
index ee53c85..966af7d 100644
--- a/lib/IR/PassManager.cpp
+++ b/lib/IR/PassManager.cpp
@@ -1,4 +1,4 @@
-//===- PassManager.cpp - LLVM Pass Infrastructure Implementation ----------===//
+//===- PassManager.h - Infrastructure for managing & running IR passes ----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,1907 +6,152 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file implements the LLVM Pass Manager infrastructure.
-//
-//===----------------------------------------------------------------------===//
 
+#include "llvm/IR/PassManager.h"
+#include "llvm/ADT/STLExtras.h"
 
-#include "llvm/PassManagers.h"
-#include "llvm/Assembly/PrintModulePass.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/PassNameParser.h"
-#include "llvm/Support/Timer.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <map>
 using namespace llvm;
 
-// See PassManagers.h for Pass Manager infrastructure overview.
-
-namespace llvm {
-
-//===----------------------------------------------------------------------===//
-// Pass debugging information.  Often it is useful to find out what pass is
-// running when a crash occurs in a utility.  When this library is compiled with
-// debugging on, a command line option (--debug-pass) is enabled that causes the
-// pass name to be printed before it executes.
-//
-
-// Different debug levels that can be enabled...
-enum PassDebugLevel {
-  Disabled, Arguments, Structure, Executions, Details
-};
-
-static cl::opt<enum PassDebugLevel>
-PassDebugging("debug-pass", cl::Hidden,
-                  cl::desc("Print PassManager debugging information"),
-                  cl::values(
-  clEnumVal(Disabled  , "disable debug output"),
-  clEnumVal(Arguments , "print pass arguments to pass to 'opt'"),
-  clEnumVal(Structure , "print pass structure before run()"),
-  clEnumVal(Executions, "print pass name before it is executed"),
-  clEnumVal(Details   , "print pass details when it is executed"),
-                             clEnumValEnd));
-
-typedef llvm::cl::list<const llvm::PassInfo *, bool, PassNameParser>
-PassOptionList;
-
-// Print IR out before/after specified passes.
-static PassOptionList
-PrintBefore("print-before",
-            llvm::cl::desc("Print IR before specified passes"),
-            cl::Hidden);
-
-static PassOptionList
-PrintAfter("print-after",
-           llvm::cl::desc("Print IR after specified passes"),
-           cl::Hidden);
-
-static cl::opt<bool>
-PrintBeforeAll("print-before-all",
-               llvm::cl::desc("Print IR before each pass"),
-               cl::init(false));
-static cl::opt<bool>
-PrintAfterAll("print-after-all",
-              llvm::cl::desc("Print IR after each pass"),
-              cl::init(false));
-
-/// This is a helper to determine whether to print IR before or
-/// after a pass.
-
-static bool ShouldPrintBeforeOrAfterPass(const PassInfo *PI,
-                                         PassOptionList &PassesToPrint) {
-  for (unsigned i = 0, ie = PassesToPrint.size(); i < ie; ++i) {
-    const llvm::PassInfo *PassInf = PassesToPrint[i];
-    if (PassInf)
-      if (PassInf->getPassArgument() == PI->getPassArgument()) {
-        return true;
-      }
-  }
-  return false;
-}
-
-/// This is a utility to check whether a pass should have IR dumped
-/// before it.
-static bool ShouldPrintBeforePass(const PassInfo *PI) {
-  return PrintBeforeAll || ShouldPrintBeforeOrAfterPass(PI, PrintBefore);
-}
-
-/// This is a utility to check whether a pass should have IR dumped
-/// after it.
-static bool ShouldPrintAfterPass(const PassInfo *PI) {
-  return PrintAfterAll || ShouldPrintBeforeOrAfterPass(PI, PrintAfter);
-}
-
-} // End of llvm namespace
-
-/// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions
-/// or higher is specified.
-bool PMDataManager::isPassDebuggingExecutionsOrMore() const {
-  return PassDebugging >= Executions;
-}
-
-
-
-
-void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
-  if (V == 0 && M == 0)
-    OS << "Releasing pass '";
-  else
-    OS << "Running pass '";
-
-  OS << P->getPassName() << "'";
-
-  if (M) {
-    OS << " on module '" << M->getModuleIdentifier() << "'.\n";
-    return;
-  }
-  if (V == 0) {
-    OS << '\n';
-    return;
-  }
-
-  OS << " on ";
-  if (isa<Function>(V))
-    OS << "function";
-  else if (isa<BasicBlock>(V))
-    OS << "basic block";
-  else
-    OS << "value";
-
-  OS << " '";
-  WriteAsOperand(OS, V, /*PrintTy=*/false, M);
-  OS << "'\n";
+void ModulePassManager::run() {
+  for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx)
+    if (Passes[Idx]->run(M))
+      if (AM) AM->invalidateAll(M);
 }
 
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-// BBPassManager
-//
-/// BBPassManager manages BasicBlockPass. It batches all the
-/// pass together and sequence them to process one basic block before
-/// processing next basic block.
-class BBPassManager : public PMDataManager, public FunctionPass {
-
-public:
-  static char ID;
-  explicit BBPassManager()
-    : PMDataManager(), FunctionPass(ID) {}
-
-  /// Execute all of the passes scheduled for execution.  Keep track of
-  /// whether any of the passes modifies the function, and if so, return true.
-  bool runOnFunction(Function &F);
-
-  /// Pass Manager itself does not invalidate any analysis info.
-  void getAnalysisUsage(AnalysisUsage &Info) const {
-    Info.setPreservesAll();
-  }
-
-  bool doInitialization(Module &M);
-  bool doInitialization(Function &F);
-  bool doFinalization(Module &M);
-  bool doFinalization(Function &F);
-
-  virtual PMDataManager *getAsPMDataManager() { return this; }
-  virtual Pass *getAsPass() { return this; }
-
-  virtual const char *getPassName() const {
-    return "BasicBlock Pass Manager";
-  }
-
-  // Print passes managed by this manager
-  void dumpPassStructure(unsigned Offset) {
-    llvm::dbgs().indent(Offset*2) << "BasicBlockPass Manager\n";
-    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-      BasicBlockPass *BP = getContainedPass(Index);
-      BP->dumpPassStructure(Offset + 1);
-      dumpLastUses(BP, Offset+1);
-    }
-  }
-
-  BasicBlockPass *getContainedPass(unsigned N) {
-    assert(N < PassVector.size() && "Pass number out of range!");
-    BasicBlockPass *BP = static_cast<BasicBlockPass *>(PassVector[N]);
-    return BP;
-  }
-
-  virtual PassManagerType getPassManagerType() const {
-    return PMT_BasicBlockPassManager;
-  }
-};
-
-char BBPassManager::ID = 0;
-}
-
-namespace llvm {
-
-//===----------------------------------------------------------------------===//
-// FunctionPassManagerImpl
-//
-/// FunctionPassManagerImpl manages FPPassManagers
-class FunctionPassManagerImpl : public Pass,
-                                public PMDataManager,
-                                public PMTopLevelManager {
-  virtual void anchor();
-private:
-  bool wasRun;
-public:
-  static char ID;
-  explicit FunctionPassManagerImpl() :
-    Pass(PT_PassManager, ID), PMDataManager(),
-    PMTopLevelManager(new FPPassManager()), wasRun(false) {}
-
-  /// add - Add a pass to the queue of passes to run.  This passes ownership of
-  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-  /// will be destroyed as well, so there is no need to delete the pass.  This
-  /// implies that all passes MUST be allocated with 'new'.
-  void add(Pass *P) {
-    schedulePass(P);
-  }
-
-  /// createPrinterPass - Get a function printer pass.
-  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
-    return createPrintFunctionPass(Banner, &O);
-  }
-
-  // Prepare for running an on the fly pass, freeing memory if needed
-  // from a previous run.
-  void releaseMemoryOnTheFly();
-
-  /// run - Execute all of the passes scheduled for execution.  Keep track of
-  /// whether any of the passes modifies the module, and if so, return true.
-  bool run(Function &F);
-
-  /// doInitialization - Run all of the initializers for the function passes.
-  ///
-  bool doInitialization(Module &M);
-
-  /// doFinalization - Run all of the finalizers for the function passes.
-  ///
-  bool doFinalization(Module &M);
-
-
-  virtual PMDataManager *getAsPMDataManager() { return this; }
-  virtual Pass *getAsPass() { return this; }
-  virtual PassManagerType getTopLevelPassManagerType() {
-    return PMT_FunctionPassManager;
-  }
-
-  /// Pass Manager itself does not invalidate any analysis info.
-  void getAnalysisUsage(AnalysisUsage &Info) const {
-    Info.setPreservesAll();
-  }
-
-  FPPassManager *getContainedManager(unsigned N) {
-    assert(N < PassManagers.size() && "Pass number out of range!");
-    FPPassManager *FP = static_cast<FPPassManager *>(PassManagers[N]);
-    return FP;
-  }
-};
-
-void FunctionPassManagerImpl::anchor() {}
-
-char FunctionPassManagerImpl::ID = 0;
-
-//===----------------------------------------------------------------------===//
-// MPPassManager
-//
-/// MPPassManager manages ModulePasses and function pass managers.
-/// It batches all Module passes and function pass managers together and
-/// sequences them to process one module.
-class MPPassManager : public Pass, public PMDataManager {
-public:
-  static char ID;
-  explicit MPPassManager() :
-    Pass(PT_PassManager, ID), PMDataManager() { }
-
-  // Delete on the fly managers.
-  virtual ~MPPassManager() {
-    for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
-           I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
-         I != E; ++I) {
-      FunctionPassManagerImpl *FPP = I->second;
-      delete FPP;
-    }
-  }
-
-  /// createPrinterPass - Get a module printer pass.
-  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
-    return createPrintModulePass(&O, false, Banner);
-  }
-
-  /// run - Execute all of the passes scheduled for execution.  Keep track of
-  /// whether any of the passes modifies the module, and if so, return true.
-  bool runOnModule(Module &M);
-
-  using llvm::Pass::doInitialization;
-  using llvm::Pass::doFinalization;
-
-  /// doInitialization - Run all of the initializers for the module passes.
-  ///
-  bool doInitialization();
-
-  /// doFinalization - Run all of the finalizers for the module passes.
-  ///
-  bool doFinalization();
-
-  /// Pass Manager itself does not invalidate any analysis info.
-  void getAnalysisUsage(AnalysisUsage &Info) const {
-    Info.setPreservesAll();
-  }
-
-  /// Add RequiredPass into list of lower level passes required by pass P.
-  /// RequiredPass is run on the fly by Pass Manager when P requests it
-  /// through getAnalysis interface.
-  virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
-
-  /// Return function pass corresponding to PassInfo PI, that is
-  /// required by module pass MP. Instantiate analysis pass, by using
-  /// its runOnFunction() for function F.
-  virtual Pass* getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F);
-
-  virtual const char *getPassName() const {
-    return "Module Pass Manager";
-  }
-
-  virtual PMDataManager *getAsPMDataManager() { return this; }
-  virtual Pass *getAsPass() { return this; }
-
-  // Print passes managed by this manager
-  void dumpPassStructure(unsigned Offset) {
-    llvm::dbgs().indent(Offset*2) << "ModulePass Manager\n";
-    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-      ModulePass *MP = getContainedPass(Index);
-      MP->dumpPassStructure(Offset + 1);
-      std::map<Pass *, FunctionPassManagerImpl *>::const_iterator I =
-        OnTheFlyManagers.find(MP);
-      if (I != OnTheFlyManagers.end())
-        I->second->dumpPassStructure(Offset + 2);
-      dumpLastUses(MP, Offset+1);
-    }
-  }
-
-  ModulePass *getContainedPass(unsigned N) {
-    assert(N < PassVector.size() && "Pass number out of range!");
-    return static_cast<ModulePass *>(PassVector[N]);
-  }
-
-  virtual PassManagerType getPassManagerType() const {
-    return PMT_ModulePassManager;
-  }
-
- private:
-  /// Collection of on the fly FPPassManagers. These managers manage
-  /// function passes that are required by module passes.
-  std::map<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
-};
-
-char MPPassManager::ID = 0;
-//===----------------------------------------------------------------------===//
-// PassManagerImpl
-//
-
-/// PassManagerImpl manages MPPassManagers
-class PassManagerImpl : public Pass,
-                        public PMDataManager,
-                        public PMTopLevelManager {
-  virtual void anchor();
-
-public:
-  static char ID;
-  explicit PassManagerImpl() :
-    Pass(PT_PassManager, ID), PMDataManager(),
-                              PMTopLevelManager(new MPPassManager()) {}
-
-  /// add - Add a pass to the queue of passes to run.  This passes ownership of
-  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-  /// will be destroyed as well, so there is no need to delete the pass.  This
-  /// implies that all passes MUST be allocated with 'new'.
-  void add(Pass *P) {
-    schedulePass(P);
-  }
-
-  /// createPrinterPass - Get a module printer pass.
-  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
-    return createPrintModulePass(&O, false, Banner);
-  }
-
-  /// run - Execute all of the passes scheduled for execution.  Keep track of
-  /// whether any of the passes modifies the module, and if so, return true.
-  bool run(Module &M);
-
-  using llvm::Pass::doInitialization;
-  using llvm::Pass::doFinalization;
-
-  /// doInitialization - Run all of the initializers for the module passes.
-  ///
-  bool doInitialization();
-
-  /// doFinalization - Run all of the finalizers for the module passes.
-  ///
-  bool doFinalization();
-
-  /// Pass Manager itself does not invalidate any analysis info.
-  void getAnalysisUsage(AnalysisUsage &Info) const {
-    Info.setPreservesAll();
-  }
-
-  virtual PMDataManager *getAsPMDataManager() { return this; }
-  virtual Pass *getAsPass() { return this; }
-  virtual PassManagerType getTopLevelPassManagerType() {
-    return PMT_ModulePassManager;
-  }
-
-  MPPassManager *getContainedManager(unsigned N) {
-    assert(N < PassManagers.size() && "Pass number out of range!");
-    MPPassManager *MP = static_cast<MPPassManager *>(PassManagers[N]);
-    return MP;
-  }
-};
-
-void PassManagerImpl::anchor() {}
-
-char PassManagerImpl::ID = 0;
-} // End of llvm namespace
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-/// TimingInfo Class - This class is used to calculate information about the
-/// amount of time each pass takes to execute.  This only happens when
-/// -time-passes is enabled on the command line.
-///
-
-static ManagedStatic<sys::SmartMutex<true> > TimingInfoMutex;
-
-class TimingInfo {
-  DenseMap<Pass*, Timer*> TimingData;
-  TimerGroup TG;
-public:
-  // Use 'create' member to get this.
-  TimingInfo() : TG("... Pass execution timing report ...") {}
-
-  // TimingDtor - Print out information about timing information
-  ~TimingInfo() {
-    // Delete all of the timers, which accumulate their info into the
-    // TimerGroup.
-    for (DenseMap<Pass*, Timer*>::iterator I = TimingData.begin(),
-         E = TimingData.end(); I != E; ++I)
-      delete I->second;
-    // TimerGroup is deleted next, printing the report.
-  }
-
-  // createTheTimeInfo - This method either initializes the TheTimeInfo pointer
-  // to a non null value (if the -time-passes option is enabled) or it leaves it
-  // null.  It may be called multiple times.
-  static void createTheTimeInfo();
-
-  /// getPassTimer - Return the timer for the specified pass if it exists.
-  Timer *getPassTimer(Pass *P) {
-    if (P->getAsPMDataManager())
-      return 0;
-
-    sys::SmartScopedLock<true> Lock(*TimingInfoMutex);
-    Timer *&T = TimingData[P];
-    if (T == 0)
-      T = new Timer(P->getPassName(), TG);
-    return T;
-  }
-};
-
-} // End of anon namespace
-
-static TimingInfo *TheTimeInfo;
-
-//===----------------------------------------------------------------------===//
-// PMTopLevelManager implementation
-
-/// Initialize top level manager. Create first pass manager.
-PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) {
-  PMDM->setTopLevelManager(this);
-  addPassManager(PMDM);
-  activeStack.push(PMDM);
-}
-
-/// Set pass P as the last user of the given analysis passes.
-void
-PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) {
-  unsigned PDepth = 0;
-  if (P->getResolver())
-    PDepth = P->getResolver()->getPMDataManager().getDepth();
-
-  for (SmallVectorImpl<Pass *>::const_iterator I = AnalysisPasses.begin(),
-         E = AnalysisPasses.end(); I != E; ++I) {
-    Pass *AP = *I;
-    LastUser[AP] = P;
-
-    if (P == AP)
-      continue;
-
-    // Update the last users of passes that are required transitive by AP.
-    AnalysisUsage *AnUsage = findAnalysisUsage(AP);
-    const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
-    SmallVector<Pass *, 12> LastUses;
-    SmallVector<Pass *, 12> LastPMUses;
-    for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
-         E = IDs.end(); I != E; ++I) {
-      Pass *AnalysisPass = findAnalysisPass(*I);
-      assert(AnalysisPass && "Expected analysis pass to exist.");
-      AnalysisResolver *AR = AnalysisPass->getResolver();
-      assert(AR && "Expected analysis resolver to exist.");
-      unsigned APDepth = AR->getPMDataManager().getDepth();
-
-      if (PDepth == APDepth)
-        LastUses.push_back(AnalysisPass);
-      else if (PDepth > APDepth)
-        LastPMUses.push_back(AnalysisPass);
-    }
-
-    setLastUser(LastUses, P);
-
-    // If this pass has a corresponding pass manager, push higher level
-    // analysis to this pass manager.
-    if (P->getResolver())
-      setLastUser(LastPMUses, P->getResolver()->getPMDataManager().getAsPass());
-
-
-    // If AP is the last user of other passes then make P last user of
-    // such passes.
-    for (DenseMap<Pass *, Pass *>::iterator LUI = LastUser.begin(),
-           LUE = LastUser.end(); LUI != LUE; ++LUI) {
-      if (LUI->second == AP)
-        // DenseMap iterator is not invalidated here because
-        // this is just updating existing entries.
-        LastUser[LUI->first] = P;
-    }
-  }
-}
-
-/// Collect passes whose last user is P
-void PMTopLevelManager::collectLastUses(SmallVectorImpl<Pass *> &LastUses,
-                                        Pass *P) {
-  DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator DMI =
-    InversedLastUser.find(P);
-  if (DMI == InversedLastUser.end())
-    return;
-
-  SmallPtrSet<Pass *, 8> &LU = DMI->second;
-  for (SmallPtrSet<Pass *, 8>::iterator I = LU.begin(),
-         E = LU.end(); I != E; ++I) {
-    LastUses.push_back(*I);
-  }
-
-}
-
-AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
-  AnalysisUsage *AnUsage = NULL;
-  DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P);
-  if (DMI != AnUsageMap.end())
-    AnUsage = DMI->second;
-  else {
-    AnUsage = new AnalysisUsage();
-    P->getAnalysisUsage(*AnUsage);
-    AnUsageMap[P] = AnUsage;
-  }
-  return AnUsage;
-}
-
-/// Schedule pass P for execution. Make sure that passes required by
-/// P are run before P is run. Update analysis info maintained by
-/// the manager. Remove dead passes. This is a recursive function.
-void PMTopLevelManager::schedulePass(Pass *P) {
-
-  // TODO : Allocate function manager for this pass, other wise required set
-  // may be inserted into previous function manager
-
-  // Give pass a chance to prepare the stage.
-  P->preparePassManager(activeStack);
-
-  // If P is an analysis pass and it is available then do not
-  // generate the analysis again. Stale analysis info should not be
-  // available at this point.
-  const PassInfo *PI =
-    PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
-  if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) {
-    delete P;
-    return;
-  }
-
-  AnalysisUsage *AnUsage = findAnalysisUsage(P);
-
-  bool checkAnalysis = true;
-  while (checkAnalysis) {
-    checkAnalysis = false;
-
-    const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
-    for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(),
-           E = RequiredSet.end(); I != E; ++I) {
-
-      Pass *AnalysisPass = findAnalysisPass(*I);
-      if (!AnalysisPass) {
-        const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
-
-        if (PI == NULL) {
-          // Pass P is not in the global PassRegistry
-          dbgs() << "Pass '"  << P->getPassName() << "' is not initialized." << "\n";
-          dbgs() << "Verify if there is a pass dependency cycle." << "\n";
-          dbgs() << "Required Passes:" << "\n";
-          for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(),
-                 E = RequiredSet.end(); I2 != E && I2 != I; ++I2) {
-            Pass *AnalysisPass2 = findAnalysisPass(*I2);
-            if (AnalysisPass2) {
-              dbgs() << "\t" << AnalysisPass2->getPassName() << "\n";
-            } else {
-              dbgs() << "\t"   << "Error: Required pass not found! Possible causes:"  << "\n";
-              dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)"    << "\n";
-              dbgs() << "\t\t" << "- Corruption of the global PassRegistry"           << "\n";
-            }
-          }
-        }
-
-        assert(PI && "Expected required passes to be initialized");
-        AnalysisPass = PI->createPass();
-        if (P->getPotentialPassManagerType () ==
-            AnalysisPass->getPotentialPassManagerType())
-          // Schedule analysis pass that is managed by the same pass manager.
-          schedulePass(AnalysisPass);
-        else if (P->getPotentialPassManagerType () >
-                 AnalysisPass->getPotentialPassManagerType()) {
-          // Schedule analysis pass that is managed by a new manager.
-          schedulePass(AnalysisPass);
-          // Recheck analysis passes to ensure that required analyses that
-          // are already checked are still available.
-          checkAnalysis = true;
-        } else
-          // Do not schedule this analysis. Lower level analsyis
-          // passes are run on the fly.
-          delete AnalysisPass;
+bool FunctionPassManager::run(Module *M) {
+  bool Changed = false;
+  for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I)
+    for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx)
+      if (Passes[Idx]->run(I)) {
+        Changed = true;
+        if (AM) AM->invalidateAll(I);
       }
-    }
-  }
-
-  // Now all required passes are available.
-  if (ImmutablePass *IP = P->getAsImmutablePass()) {
-    // P is a immutable pass and it will be managed by this
-    // top level manager. Set up analysis resolver to connect them.
-    PMDataManager *DM = getAsPMDataManager();
-    AnalysisResolver *AR = new AnalysisResolver(*DM);
-    P->setResolver(AR);
-    DM->initializeAnalysisImpl(P);
-    addImmutablePass(IP);
-    DM->recordAvailableAnalysis(IP);
-    return;
-  }
-
-  if (PI && !PI->isAnalysis() && ShouldPrintBeforePass(PI)) {
-    Pass *PP = P->createPrinterPass(
-      dbgs(), std::string("*** IR Dump Before ") + P->getPassName() + " ***");
-    PP->assignPassManager(activeStack, getTopLevelPassManagerType());
-  }
-
-  // Add the requested pass to the best available pass manager.
-  P->assignPassManager(activeStack, getTopLevelPassManagerType());
-
-  if (PI && !PI->isAnalysis() && ShouldPrintAfterPass(PI)) {
-    Pass *PP = P->createPrinterPass(
-      dbgs(), std::string("*** IR Dump After ") + P->getPassName() + " ***");
-    PP->assignPassManager(activeStack, getTopLevelPassManagerType());
-  }
-}
-
-/// Find the pass that implements Analysis AID. Search immutable
-/// passes and all pass managers. If desired pass is not found
-/// then return NULL.
-Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
-
-  // Check pass managers
-  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
-         E = PassManagers.end(); I != E; ++I)
-    if (Pass *P = (*I)->findAnalysisPass(AID, false))
-      return P;
-
-  // Check other pass managers
-  for (SmallVectorImpl<PMDataManager *>::iterator
-         I = IndirectPassManagers.begin(),
-         E = IndirectPassManagers.end(); I != E; ++I)
-    if (Pass *P = (*I)->findAnalysisPass(AID, false))
-      return P;
-
-  // Check the immutable passes. Iterate in reverse order so that we find
-  // the most recently registered passes first.
-  for (SmallVectorImpl<ImmutablePass *>::reverse_iterator I =
-       ImmutablePasses.rbegin(), E = ImmutablePasses.rend(); I != E; ++I) {
-    AnalysisID PI = (*I)->getPassID();
-    if (PI == AID)
-      return *I;
-
-    // If Pass not found then check the interfaces implemented by Immutable Pass
-    const PassInfo *PassInf =
-      PassRegistry::getPassRegistry()->getPassInfo(PI);
-    assert(PassInf && "Expected all immutable passes to be initialized");
-    const std::vector<const PassInfo*> &ImmPI =
-      PassInf->getInterfacesImplemented();
-    for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(),
-         EE = ImmPI.end(); II != EE; ++II) {
-      if ((*II)->getTypeInfo() == AID)
-        return *I;
-    }
-  }
-
-  return 0;
-}
-
-// Print passes managed by this top level manager.
-void PMTopLevelManager::dumpPasses() const {
-
-  if (PassDebugging < Structure)
-    return;
-
-  // Print out the immutable passes
-  for (unsigned i = 0, e = ImmutablePasses.size(); i != e; ++i) {
-    ImmutablePasses[i]->dumpPassStructure(0);
-  }
-
-  // Every class that derives from PMDataManager also derives from Pass
-  // (sometimes indirectly), but there's no inheritance relationship
-  // between PMDataManager and Pass, so we have to getAsPass to get
-  // from a PMDataManager* to a Pass*.
-  for (SmallVectorImpl<PMDataManager *>::const_iterator I =
-       PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
-    (*I)->getAsPass()->dumpPassStructure(1);
-}
-
-void PMTopLevelManager::dumpArguments() const {
-
-  if (PassDebugging < Arguments)
-    return;
-
-  dbgs() << "Pass Arguments: ";
-  for (SmallVectorImpl<ImmutablePass *>::const_iterator I =
-       ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
-    if (const PassInfo *PI =
-        PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
-      assert(PI && "Expected all immutable passes to be initialized");
-      if (!PI->isAnalysisGroup())
-        dbgs() << " -" << PI->getPassArgument();
-    }
-  for (SmallVectorImpl<PMDataManager *>::const_iterator I =
-       PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
-    (*I)->dumpPassArguments();
-  dbgs() << "\n";
+  return Changed;
 }
 
-void PMTopLevelManager::initializeAllAnalysisInfo() {
-  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
-         E = PassManagers.end(); I != E; ++I)
-    (*I)->initializeAnalysisInfo();
+void AnalysisManager::invalidateAll(Function *F) {
+  assert(F->getParent() == M && "Invalidating a function from another module!");
 
-  // Initailize other pass managers
-  for (SmallVectorImpl<PMDataManager *>::iterator
-       I = IndirectPassManagers.begin(), E = IndirectPassManagers.end();
+  // First invalidate any module results we still have laying about.
+  // FIXME: This is a total hack based on the fact that erasure doesn't
+  // invalidate iteration for DenseMap.
+  for (ModuleAnalysisResultMapT::iterator I = ModuleAnalysisResults.begin(),
+                                          E = ModuleAnalysisResults.end();
        I != E; ++I)
-    (*I)->initializeAnalysisInfo();
-
-  for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(),
-        DME = LastUser.end(); DMI != DME; ++DMI) {
-    DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI =
-      InversedLastUser.find(DMI->second);
-    if (InvDMI != InversedLastUser.end()) {
-      SmallPtrSet<Pass *, 8> &L = InvDMI->second;
-      L.insert(DMI->first);
+    if (I->second->invalidate(M))
+      ModuleAnalysisResults.erase(I);
+
+  // Now clear all the invalidated results associated specifically with this
+  // function.
+  SmallVector<void *, 8> InvalidatedPassIDs;
+  FunctionAnalysisResultListT &ResultsList = FunctionAnalysisResultLists[F];
+  for (FunctionAnalysisResultListT::iterator I = ResultsList.begin(),
+                                             E = ResultsList.end();
+       I != E;)
+    if (I->second->invalidate(F)) {
+      InvalidatedPassIDs.push_back(I->first);
+      I = ResultsList.erase(I);
     } else {
-      SmallPtrSet<Pass *, 8> L; L.insert(DMI->first);
-      InversedLastUser[DMI->second] = L;
+      ++I;
     }
-  }
-}
-
-/// Destructor
-PMTopLevelManager::~PMTopLevelManager() {
-  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
-         E = PassManagers.end(); I != E; ++I)
-    delete *I;
-
-  for (SmallVectorImpl<ImmutablePass *>::iterator
-         I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
-    delete *I;
-
-  for (DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.begin(),
-         DME = AnUsageMap.end(); DMI != DME; ++DMI)
-    delete DMI->second;
-}
-
-//===----------------------------------------------------------------------===//
-// PMDataManager implementation
-
-/// Augement AvailableAnalysis by adding analysis made available by pass P.
-void PMDataManager::recordAvailableAnalysis(Pass *P) {
-  AnalysisID PI = P->getPassID();
-
-  AvailableAnalysis[PI] = P;
-
-  assert(!AvailableAnalysis.empty());
-
-  // This pass is the current implementation of all of the interfaces it
-  // implements as well.
-  const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI);
-  if (PInf == 0) return;
-  const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
-  for (unsigned i = 0, e = II.size(); i != e; ++i)
-    AvailableAnalysis[II[i]->getTypeInfo()] = P;
+  while (!InvalidatedPassIDs.empty())
+    FunctionAnalysisResults.erase(
+        std::make_pair(InvalidatedPassIDs.pop_back_val(), F));
 }
 
-// Return true if P preserves high level analysis used by other
-// passes managed by this manager
-bool PMDataManager::preserveHigherLevelAnalysis(Pass *P) {
-  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-  if (AnUsage->getPreservesAll())
-    return true;
-
-  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
-  for (SmallVectorImpl<Pass *>::iterator I = HigherLevelAnalysis.begin(),
-         E = HigherLevelAnalysis.end(); I  != E; ++I) {
-    Pass *P1 = *I;
-    if (P1->getAsImmutablePass() == 0 &&
-        std::find(PreservedSet.begin(), PreservedSet.end(),
-                  P1->getPassID()) ==
-           PreservedSet.end())
-      return false;
-  }
-
-  return true;
-}
-
-/// verifyPreservedAnalysis -- Verify analysis preserved by pass P.
-void PMDataManager::verifyPreservedAnalysis(Pass *P) {
-  // Don't do this unless assertions are enabled.
-#ifdef NDEBUG
-  return;
-#endif
-  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
-
-  // Verify preserved analysis
-  for (AnalysisUsage::VectorType::const_iterator I = PreservedSet.begin(),
-         E = PreservedSet.end(); I != E; ++I) {
-    AnalysisID AID = *I;
-    if (Pass *AP = findAnalysisPass(AID, true)) {
-      TimeRegion PassTimer(getPassTimer(AP));
-      AP->verifyAnalysis();
-    }
-  }
-}
-
-/// Remove Analysis not preserved by Pass P
-void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
-  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-  if (AnUsage->getPreservesAll())
-    return;
-
-  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
-  for (DenseMap<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(),
-         E = AvailableAnalysis.end(); I != E; ) {
-    DenseMap<AnalysisID, Pass*>::iterator Info = I++;
-    if (Info->second->getAsImmutablePass() == 0 &&
-        std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
-        PreservedSet.end()) {
-      // Remove this analysis
-      if (PassDebugging >= Details) {
-        Pass *S = Info->second;
-        dbgs() << " -- '" <<  P->getPassName() << "' is not preserving '";
-        dbgs() << S->getPassName() << "'\n";
-      }
-      AvailableAnalysis.erase(Info);
-    }
-  }
-
-  // Check inherited analysis also. If P is not preserving analysis
-  // provided by parent manager then remove it here.
-  for (unsigned Index = 0; Index < PMT_Last; ++Index) {
-
-    if (!InheritedAnalysis[Index])
-      continue;
-
-    for (DenseMap<AnalysisID, Pass*>::iterator
-           I = InheritedAnalysis[Index]->begin(),
-           E = InheritedAnalysis[Index]->end(); I != E; ) {
-      DenseMap<AnalysisID, Pass *>::iterator Info = I++;
-      if (Info->second->getAsImmutablePass() == 0 &&
-          std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
-             PreservedSet.end()) {
-        // Remove this analysis
-        if (PassDebugging >= Details) {
-          Pass *S = Info->second;
-          dbgs() << " -- '" <<  P->getPassName() << "' is not preserving '";
-          dbgs() << S->getPassName() << "'\n";
-        }
-        InheritedAnalysis[Index]->erase(Info);
-      }
-    }
-  }
-}
-
-/// Remove analysis passes that are not used any longer
-void PMDataManager::removeDeadPasses(Pass *P, StringRef Msg,
-                                     enum PassDebuggingString DBG_STR) {
-
-  SmallVector<Pass *, 12> DeadPasses;
-
-  // If this is a on the fly manager then it does not have TPM.
-  if (!TPM)
-    return;
-
-  TPM->collectLastUses(DeadPasses, P);
-
-  if (PassDebugging >= Details && !DeadPasses.empty()) {
-    dbgs() << " -*- '" <<  P->getPassName();
-    dbgs() << "' is the last user of following pass instances.";
-    dbgs() << " Free these instances\n";
-  }
-
-  for (SmallVectorImpl<Pass *>::iterator I = DeadPasses.begin(),
-         E = DeadPasses.end(); I != E; ++I)
-    freePass(*I, Msg, DBG_STR);
-}
-
-void PMDataManager::freePass(Pass *P, StringRef Msg,
-                             enum PassDebuggingString DBG_STR) {
-  dumpPassInfo(P, FREEING_MSG, DBG_STR, Msg);
-
-  {
-    // If the pass crashes releasing memory, remember this.
-    PassManagerPrettyStackEntry X(P);
-    TimeRegion PassTimer(getPassTimer(P));
-
-    P->releaseMemory();
-  }
-
-  AnalysisID PI = P->getPassID();
-  if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) {
-    // Remove the pass itself (if it is not already removed).
-    AvailableAnalysis.erase(PI);
-
-    // Remove all interfaces this pass implements, for which it is also
-    // listed as the available implementation.
-    const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
-    for (unsigned i = 0, e = II.size(); i != e; ++i) {
-      DenseMap<AnalysisID, Pass*>::iterator Pos =
-        AvailableAnalysis.find(II[i]->getTypeInfo());
-      if (Pos != AvailableAnalysis.end() && Pos->second == P)
-        AvailableAnalysis.erase(Pos);
-    }
-  }
-}
-
-/// Add pass P into the PassVector. Update
-/// AvailableAnalysis appropriately if ProcessAnalysis is true.
-void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
-  // This manager is going to manage pass P. Set up analysis resolver
-  // to connect them.
-  AnalysisResolver *AR = new AnalysisResolver(*this);
-  P->setResolver(AR);
-
-  // If a FunctionPass F is the last user of ModulePass info M
-  // then the F's manager, not F, records itself as a last user of M.
-  SmallVector<Pass *, 12> TransferLastUses;
-
-  if (!ProcessAnalysis) {
-    // Add pass
-    PassVector.push_back(P);
-    return;
-  }
-
-  // At the moment, this pass is the last user of all required passes.
-  SmallVector<Pass *, 12> LastUses;
-  SmallVector<Pass *, 8> RequiredPasses;
-  SmallVector<AnalysisID, 8> ReqAnalysisNotAvailable;
-
-  unsigned PDepth = this->getDepth();
-
-  collectRequiredAnalysis(RequiredPasses,
-                          ReqAnalysisNotAvailable, P);
-  for (SmallVectorImpl<Pass *>::iterator I = RequiredPasses.begin(),
-         E = RequiredPasses.end(); I != E; ++I) {
-    Pass *PRequired = *I;
-    unsigned RDepth = 0;
-
-    assert(PRequired->getResolver() && "Analysis Resolver is not set");
-    PMDataManager &DM = PRequired->getResolver()->getPMDataManager();
-    RDepth = DM.getDepth();
-
-    if (PDepth == RDepth)
-      LastUses.push_back(PRequired);
-    else if (PDepth > RDepth) {
-      // Let the parent claim responsibility of last use
-      TransferLastUses.push_back(PRequired);
-      // Keep track of higher level analysis used by this manager.
-      HigherLevelAnalysis.push_back(PRequired);
-    } else
-      llvm_unreachable("Unable to accommodate Required Pass");
-  }
-
-  // Set P as P's last user until someone starts using P.
-  // However, if P is a Pass Manager then it does not need
-  // to record its last user.
-  if (P->getAsPMDataManager() == 0)
-    LastUses.push_back(P);
-  TPM->setLastUser(LastUses, P);
-
-  if (!TransferLastUses.empty()) {
-    Pass *My_PM = getAsPass();
-    TPM->setLastUser(TransferLastUses, My_PM);
-    TransferLastUses.clear();
-  }
-
-  // Now, take care of required analyses that are not available.
-  for (SmallVectorImpl<AnalysisID>::iterator
-         I = ReqAnalysisNotAvailable.begin(),
-         E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
-    const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
-    Pass *AnalysisPass = PI->createPass();
-    this->addLowerLevelRequiredPass(P, AnalysisPass);
-  }
-
-  // Take a note of analysis required and made available by this pass.
-  // Remove the analysis not preserved by this pass
-  removeNotPreservedAnalysis(P);
-  recordAvailableAnalysis(P);
-
-  // Add pass
-  PassVector.push_back(P);
-}
-
-
-/// Populate RP with analysis pass that are required by
-/// pass P and are available. Populate RP_NotAvail with analysis
-/// pass that are required by pass P but are not available.
-void PMDataManager::collectRequiredAnalysis(SmallVectorImpl<Pass *> &RP,
-                                       SmallVectorImpl<AnalysisID> &RP_NotAvail,
-                                            Pass *P) {
-  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-  const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
-  for (AnalysisUsage::VectorType::const_iterator
-         I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) {
-    if (Pass *AnalysisPass = findAnalysisPass(*I, true))
-      RP.push_back(AnalysisPass);
-    else
-      RP_NotAvail.push_back(*I);
-  }
-
-  const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
-  for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
-         E = IDs.end(); I != E; ++I) {
-    if (Pass *AnalysisPass = findAnalysisPass(*I, true))
-      RP.push_back(AnalysisPass);
-    else
-      RP_NotAvail.push_back(*I);
-  }
-}
-
-// All Required analyses should be available to the pass as it runs!  Here
-// we fill in the AnalysisImpls member of the pass so that it can
-// successfully use the getAnalysis() method to retrieve the
-// implementations it needs.
-//
-void PMDataManager::initializeAnalysisImpl(Pass *P) {
-  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-
-  for (AnalysisUsage::VectorType::const_iterator
-         I = AnUsage->getRequiredSet().begin(),
-         E = AnUsage->getRequiredSet().end(); I != E; ++I) {
-    Pass *Impl = findAnalysisPass(*I, true);
-    if (Impl == 0)
-      // This may be analysis pass that is initialized on the fly.
-      // If that is not the case then it will raise an assert when it is used.
-      continue;
-    AnalysisResolver *AR = P->getResolver();
-    assert(AR && "Analysis Resolver is not set");
-    AR->addAnalysisImplsPair(*I, Impl);
-  }
-}
-
-/// Find the pass that implements Analysis AID. If desired pass is not found
-/// then return NULL.
-Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) {
-
-  // Check if AvailableAnalysis map has one entry.
-  DenseMap<AnalysisID, Pass*>::const_iterator I =  AvailableAnalysis.find(AID);
-
-  if (I != AvailableAnalysis.end())
-    return I->second;
-
-  // Search Parents through TopLevelManager
-  if (SearchParent)
-    return TPM->findAnalysisPass(AID);
-
-  return NULL;
-}
-
-// Print list of passes that are last used by P.
-void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{
-
-  SmallVector<Pass *, 12> LUses;
-
-  // If this is a on the fly manager then it does not have TPM.
-  if (!TPM)
-    return;
-
-  TPM->collectLastUses(LUses, P);
-
-  for (SmallVectorImpl<Pass *>::iterator I = LUses.begin(),
-         E = LUses.end(); I != E; ++I) {
-    llvm::dbgs() << "--" << std::string(Offset*2, ' ');
-    (*I)->dumpPassStructure(0);
-  }
-}
-
-void PMDataManager::dumpPassArguments() const {
-  for (SmallVectorImpl<Pass *>::const_iterator I = PassVector.begin(),
-        E = PassVector.end(); I != E; ++I) {
-    if (PMDataManager *PMD = (*I)->getAsPMDataManager())
-      PMD->dumpPassArguments();
-    else
-      if (const PassInfo *PI =
-            PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
-        if (!PI->isAnalysisGroup())
-          dbgs() << " -" << PI->getPassArgument();
-  }
-}
-
-void PMDataManager::dumpPassInfo(Pass *P, enum PassDebuggingString S1,
-                                 enum PassDebuggingString S2,
-                                 StringRef Msg) {
-  if (PassDebugging < Executions)
-    return;
-  dbgs() << (void*)this << std::string(getDepth()*2+1, ' ');
-  switch (S1) {
-  case EXECUTION_MSG:
-    dbgs() << "Executing Pass '" << P->getPassName();
-    break;
-  case MODIFICATION_MSG:
-    dbgs() << "Made Modification '" << P->getPassName();
-    break;
-  case FREEING_MSG:
-    dbgs() << " Freeing Pass '" << P->getPassName();
-    break;
-  default:
-    break;
-  }
-  switch (S2) {
-  case ON_BASICBLOCK_MSG:
-    dbgs() << "' on BasicBlock '" << Msg << "'...\n";
-    break;
-  case ON_FUNCTION_MSG:
-    dbgs() << "' on Function '" << Msg << "'...\n";
-    break;
-  case ON_MODULE_MSG:
-    dbgs() << "' on Module '"  << Msg << "'...\n";
-    break;
-  case ON_REGION_MSG:
-    dbgs() << "' on Region '"  << Msg << "'...\n";
-    break;
-  case ON_LOOP_MSG:
-    dbgs() << "' on Loop '" << Msg << "'...\n";
-    break;
-  case ON_CG_MSG:
-    dbgs() << "' on Call Graph Nodes '" << Msg << "'...\n";
-    break;
-  default:
-    break;
-  }
-}
-
-void PMDataManager::dumpRequiredSet(const Pass *P) const {
-  if (PassDebugging < Details)
-    return;
-
-  AnalysisUsage analysisUsage;
-  P->getAnalysisUsage(analysisUsage);
-  dumpAnalysisUsage("Required", P, analysisUsage.getRequiredSet());
-}
-
-void PMDataManager::dumpPreservedSet(const Pass *P) const {
-  if (PassDebugging < Details)
-    return;
-
-  AnalysisUsage analysisUsage;
-  P->getAnalysisUsage(analysisUsage);
-  dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet());
-}
-
-void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
-                                   const AnalysisUsage::VectorType &Set) const {
-  assert(PassDebugging >= Details);
-  if (Set.empty())
-    return;
-  dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
-  for (unsigned i = 0; i != Set.size(); ++i) {
-    if (i) dbgs() << ',';
-    const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
-    if (!PInf) {
-      // Some preserved passes, such as AliasAnalysis, may not be initialized by
-      // all drivers.
-      dbgs() << " Uninitialized Pass";
-      continue;
-    }
-    dbgs() << ' ' << PInf->getPassName();
-  }
-  dbgs() << '\n';
-}
-
-/// Add RequiredPass into list of lower level passes required by pass P.
-/// RequiredPass is run on the fly by Pass Manager when P requests it
-/// through getAnalysis interface.
-/// This should be handled by specific pass manager.
-void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
-  if (TPM) {
-    TPM->dumpArguments();
-    TPM->dumpPasses();
-  }
-
-  // Module Level pass may required Function Level analysis info
-  // (e.g. dominator info). Pass manager uses on the fly function pass manager
-  // to provide this on demand. In that case, in Pass manager terminology,
-  // module level pass is requiring lower level analysis info managed by
-  // lower level pass manager.
-
-  // When Pass manager is not able to order required analysis info, Pass manager
-  // checks whether any lower level manager will be able to provide this
-  // analysis info on demand or not.
-#ifndef NDEBUG
-  dbgs() << "Unable to schedule '" << RequiredPass->getPassName();
-  dbgs() << "' required by '" << P->getPassName() << "'\n";
-#endif
-  llvm_unreachable("Unable to schedule pass");
-}
-
-Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) {
-  llvm_unreachable("Unable to find on the fly pass");
-}
-
-// Destructor
-PMDataManager::~PMDataManager() {
-  for (SmallVectorImpl<Pass *>::iterator I = PassVector.begin(),
-         E = PassVector.end(); I != E; ++I)
-    delete *I;
-}
-
-//===----------------------------------------------------------------------===//
-// NOTE: Is this the right place to define this method ?
-// getAnalysisIfAvailable - Return analysis result or null if it doesn't exist.
-Pass *AnalysisResolver::getAnalysisIfAvailable(AnalysisID ID, bool dir) const {
-  return PM.findAnalysisPass(ID, dir);
-}
-
-Pass *AnalysisResolver::findImplPass(Pass *P, AnalysisID AnalysisPI,
-                                     Function &F) {
-  return PM.getOnTheFlyPass(P, AnalysisPI, F);
-}
-
-//===----------------------------------------------------------------------===//
-// BBPassManager implementation
-
-/// Execute all of the passes scheduled for execution by invoking
-/// runOnBasicBlock method.  Keep track of whether any of the passes modifies
-/// the function, and if so, return true.
-bool BBPassManager::runOnFunction(Function &F) {
-  if (F.isDeclaration())
-    return false;
-
-  bool Changed = doInitialization(F);
-
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
-    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-      BasicBlockPass *BP = getContainedPass(Index);
-      bool LocalChanged = false;
-
-      dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName());
-      dumpRequiredSet(BP);
-
-      initializeAnalysisImpl(BP);
-
-      {
-        // If the pass crashes, remember this.
-        PassManagerPrettyStackEntry X(BP, *I);
-        TimeRegion PassTimer(getPassTimer(BP));
-
-        LocalChanged |= BP->runOnBasicBlock(*I);
+void AnalysisManager::invalidateAll(Module *M) {
+  // First invalidate any module results we still have laying about.
+  // FIXME: This is a total hack based on the fact that erasure doesn't
+  // invalidate iteration for DenseMap.
+  for (ModuleAnalysisResultMapT::iterator I = ModuleAnalysisResults.begin(),
+                                          E = ModuleAnalysisResults.end();
+       I != E; ++I)
+    if (I->second->invalidate(M))
+      ModuleAnalysisResults.erase(I);
+
+  // Now walk all of the functions for which there are cached results, and
+  // attempt to invalidate each of those as the entire module may have changed.
+  // FIXME: How do we handle functions which have been deleted or RAUWed?
+  SmallVector<void *, 8> InvalidatedPassIDs;
+  for (FunctionAnalysisResultListMapT::iterator
+           FI = FunctionAnalysisResultLists.begin(),
+           FE = FunctionAnalysisResultLists.end();
+       FI != FE; ++FI) {
+    Function *F = FI->first;
+    FunctionAnalysisResultListT &ResultsList = FI->second;
+    for (FunctionAnalysisResultListT::iterator I = ResultsList.begin(),
+                                               E = ResultsList.end();
+         I != E;)
+      if (I->second->invalidate(F)) {
+        InvalidatedPassIDs.push_back(I->first);
+        I = ResultsList.erase(I);
+      } else {
+        ++I;
       }
-
-      Changed |= LocalChanged;
-      if (LocalChanged)
-        dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG,
-                     I->getName());
-      dumpPreservedSet(BP);
-
-      verifyPreservedAnalysis(BP);
-      removeNotPreservedAnalysis(BP);
-      recordAvailableAnalysis(BP);
-      removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG);
-    }
-
-  return doFinalization(F) || Changed;
-}
-
-// Implement doInitialization and doFinalization
-bool BBPassManager::doInitialization(Module &M) {
-  bool Changed = false;
-
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
-    Changed |= getContainedPass(Index)->doInitialization(M);
-
-  return Changed;
-}
-
-bool BBPassManager::doFinalization(Module &M) {
-  bool Changed = false;
-
-  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
-    Changed |= getContainedPass(Index)->doFinalization(M);
-
-  return Changed;
-}
-
-bool BBPassManager::doInitialization(Function &F) {
-  bool Changed = false;
-
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    BasicBlockPass *BP = getContainedPass(Index);
-    Changed |= BP->doInitialization(F);
+    while (!InvalidatedPassIDs.empty())
+      FunctionAnalysisResults.erase(
+          std::make_pair(InvalidatedPassIDs.pop_back_val(), F));
   }
-
-  return Changed;
 }
 
-bool BBPassManager::doFinalization(Function &F) {
-  bool Changed = false;
+const AnalysisManager::AnalysisResultConcept<Module> &
+AnalysisManager::getResultImpl(void *PassID, Module *M) {
+  assert(M == this->M && "Wrong module used when querying the AnalysisManager");
+  ModuleAnalysisResultMapT::iterator RI;
+  bool Inserted;
+  llvm::tie(RI, Inserted) = ModuleAnalysisResults.insert(std::make_pair(
+      PassID, polymorphic_ptr<AnalysisResultConcept<Module> >()));
 
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    BasicBlockPass *BP = getContainedPass(Index);
-    Changed |= BP->doFinalization(F);
+  if (Inserted) {
+    // We don't have a cached result for this result. Look up the pass and run
+    // it to produce a result, which we then add to the cache.
+    ModuleAnalysisPassMapT::const_iterator PI =
+        ModuleAnalysisPasses.find(PassID);
+    assert(PI != ModuleAnalysisPasses.end() &&
+           "Analysis passes must be registered prior to being queried!");
+    RI->second = PI->second->run(M);
   }
 
-  return Changed;
+  return *RI->second;
 }
 
+const AnalysisManager::AnalysisResultConcept<Function> &
+AnalysisManager::getResultImpl(void *PassID, Function *F) {
+  assert(F->getParent() == M && "Analyzing a function from another module!");
 
-//===----------------------------------------------------------------------===//
-// FunctionPassManager implementation
-
-/// Create new Function pass manager
-FunctionPassManager::FunctionPassManager(Module *m) : M(m) {
-  FPM = new FunctionPassManagerImpl();
-  // FPM is the top level manager.
-  FPM->setTopLevelManager(FPM);
-
-  AnalysisResolver *AR = new AnalysisResolver(*FPM);
-  FPM->setResolver(AR);
-}
-
-FunctionPassManager::~FunctionPassManager() {
-  delete FPM;
-}
+  FunctionAnalysisResultMapT::iterator RI;
+  bool Inserted;
+  llvm::tie(RI, Inserted) = FunctionAnalysisResults.insert(std::make_pair(
+      std::make_pair(PassID, F), FunctionAnalysisResultListT::iterator()));
 
-/// add - Add a pass to the queue of passes to run.  This passes
-/// ownership of the Pass to the PassManager.  When the
-/// PassManager_X is destroyed, the pass will be destroyed as well, so
-/// there is no need to delete the pass. (TODO delete passes.)
-/// This implies that all passes MUST be allocated with 'new'.
-void FunctionPassManager::add(Pass *P) {
-  FPM->add(P);
-}
-
-/// run - Execute all of the passes scheduled for execution.  Keep
-/// track of whether any of the passes modifies the function, and if
-/// so, return true.
-///
-bool FunctionPassManager::run(Function &F) {
-  if (F.isMaterializable()) {
-    std::string errstr;
-    if (F.Materialize(&errstr))
-      report_fatal_error("Error reading bitcode file: " + Twine(errstr));
+  if (Inserted) {
+    // We don't have a cached result for this result. Look up the pass and run
+    // it to produce a result, which we then add to the cache.
+    FunctionAnalysisPassMapT::const_iterator PI =
+        FunctionAnalysisPasses.find(PassID);
+    assert(PI != FunctionAnalysisPasses.end() &&
+           "Analysis passes must be registered prior to being queried!");
+    FunctionAnalysisResultListT &ResultList = FunctionAnalysisResultLists[F];
+    ResultList.push_back(std::make_pair(PassID, PI->second->run(F)));
+    RI->second = llvm::prior(ResultList.end());
   }
-  return FPM->run(F);
-}
 
-
-/// doInitialization - Run all of the initializers for the function passes.
-///
-bool FunctionPassManager::doInitialization() {
-  return FPM->doInitialization(*M);
-}
-
-/// doFinalization - Run all of the finalizers for the function passes.
-///
-bool FunctionPassManager::doFinalization() {
-  return FPM->doFinalization(*M);
-}
-
-//===----------------------------------------------------------------------===//
-// FunctionPassManagerImpl implementation
-//
-bool FunctionPassManagerImpl::doInitialization(Module &M) {
-  bool Changed = false;
-
-  dumpArguments();
-  dumpPasses();
-
-  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
-  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
-       E = IPV.end(); I != E; ++I) {
-    Changed |= (*I)->doInitialization(M);
-  }
-
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
-    Changed |= getContainedManager(Index)->doInitialization(M);
-
-  return Changed;
+  return *RI->second->second;
 }
 
-bool FunctionPassManagerImpl::doFinalization(Module &M) {
-  bool Changed = false;
-
-  for (int Index = getNumContainedManagers() - 1; Index >= 0; --Index)
-    Changed |= getContainedManager(Index)->doFinalization(M);
-
-  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
-  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
-       E = IPV.end(); I != E; ++I) {
-    Changed |= (*I)->doFinalization(M);
-  }
-
-  return Changed;
+void AnalysisManager::invalidateImpl(void *PassID, Module *M) {
+  assert(M == this->M && "Invalidating a pass over a different module!");
+  ModuleAnalysisResults.erase(PassID);
 }
 
-/// cleanup - After running all passes, clean up pass manager cache.
-void FPPassManager::cleanup() {
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    FunctionPass *FP = getContainedPass(Index);
-    AnalysisResolver *AR = FP->getResolver();
-    assert(AR && "Analysis Resolver is not set");
-    AR->clearAnalysisImpls();
- }
-}
+void AnalysisManager::invalidateImpl(void *PassID, Function *F) {
+  assert(F->getParent() == M &&
+         "Invalidating a pass over a function from another module!");
 
-void FunctionPassManagerImpl::releaseMemoryOnTheFly() {
-  if (!wasRun)
+  FunctionAnalysisResultMapT::iterator RI =
+      FunctionAnalysisResults.find(std::make_pair(PassID, F));
+  if (RI == FunctionAnalysisResults.end())
     return;
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
-    FPPassManager *FPPM = getContainedManager(Index);
-    for (unsigned Index = 0; Index < FPPM->getNumContainedPasses(); ++Index) {
-      FPPM->getContainedPass(Index)->releaseMemory();
-    }
-  }
-  wasRun = false;
-}
-
-// Execute all the passes managed by this top level manager.
-// Return true if any function is modified by a pass.
-bool FunctionPassManagerImpl::run(Function &F) {
-  bool Changed = false;
-  TimingInfo::createTheTimeInfo();
-
-  initializeAllAnalysisInfo();
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
-    Changed |= getContainedManager(Index)->runOnFunction(F);
-
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
-    getContainedManager(Index)->cleanup();
-
-  wasRun = true;
-  return Changed;
-}
 
-//===----------------------------------------------------------------------===//
-// FPPassManager implementation
-
-char FPPassManager::ID = 0;
-/// Print passes managed by this manager
-void FPPassManager::dumpPassStructure(unsigned Offset) {
-  dbgs().indent(Offset*2) << "FunctionPass Manager\n";
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    FunctionPass *FP = getContainedPass(Index);
-    FP->dumpPassStructure(Offset + 1);
-    dumpLastUses(FP, Offset+1);
-  }
-}
-
-
-/// Execute all of the passes scheduled for execution by invoking
-/// runOnFunction method.  Keep track of whether any of the passes modifies
-/// the function, and if so, return true.
-bool FPPassManager::runOnFunction(Function &F) {
-  if (F.isDeclaration())
-    return false;
-
-  bool Changed = false;
-
-  // Collect inherited analysis from Module level pass manager.
-  populateInheritedAnalysis(TPM->activeStack);
-
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    FunctionPass *FP = getContainedPass(Index);
-    bool LocalChanged = false;
-
-    dumpPassInfo(FP, EXECUTION_MSG, ON_FUNCTION_MSG, F.getName());
-    dumpRequiredSet(FP);
-
-    initializeAnalysisImpl(FP);
-
-    {
-      PassManagerPrettyStackEntry X(FP, F);
-      TimeRegion PassTimer(getPassTimer(FP));
-
-      LocalChanged |= FP->runOnFunction(F);
-    }
-
-    Changed |= LocalChanged;
-    if (LocalChanged)
-      dumpPassInfo(FP, MODIFICATION_MSG, ON_FUNCTION_MSG, F.getName());
-    dumpPreservedSet(FP);
-
-    verifyPreservedAnalysis(FP);
-    removeNotPreservedAnalysis(FP);
-    recordAvailableAnalysis(FP);
-    removeDeadPasses(FP, F.getName(), ON_FUNCTION_MSG);
-  }
-  return Changed;
-}
-
-bool FPPassManager::runOnModule(Module &M) {
-  bool Changed = false;
-
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
-    Changed |= runOnFunction(*I);
-
-  return Changed;
-}
-
-bool FPPassManager::doInitialization(Module &M) {
-  bool Changed = false;
-
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
-    Changed |= getContainedPass(Index)->doInitialization(M);
-  
-  return Changed;
-}
-
-bool FPPassManager::doFinalization(Module &M) {
-  bool Changed = false;
-
-  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
-    Changed |= getContainedPass(Index)->doFinalization(M);
-  
-  return Changed;
+  FunctionAnalysisResultLists[F].erase(RI->second);
 }
-
-//===----------------------------------------------------------------------===//
-// MPPassManager implementation
-
-/// Execute all of the passes scheduled for execution by invoking
-/// runOnModule method.  Keep track of whether any of the passes modifies
-/// the module, and if so, return true.
-bool
-MPPassManager::runOnModule(Module &M) {
-  bool Changed = false;
-
-  // Initialize on-the-fly passes
-  for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
-       I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
-       I != E; ++I) {
-    FunctionPassManagerImpl *FPP = I->second;
-    Changed |= FPP->doInitialization(M);
-  }
-
-  // Initialize module passes
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
-    Changed |= getContainedPass(Index)->doInitialization(M);
-
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    ModulePass *MP = getContainedPass(Index);
-    bool LocalChanged = false;
-
-    dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG, M.getModuleIdentifier());
-    dumpRequiredSet(MP);
-
-    initializeAnalysisImpl(MP);
-
-    {
-      PassManagerPrettyStackEntry X(MP, M);
-      TimeRegion PassTimer(getPassTimer(MP));
-
-      LocalChanged |= MP->runOnModule(M);
-    }
-
-    Changed |= LocalChanged;
-    if (LocalChanged)
-      dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG,
-                   M.getModuleIdentifier());
-    dumpPreservedSet(MP);
-
-    verifyPreservedAnalysis(MP);
-    removeNotPreservedAnalysis(MP);
-    recordAvailableAnalysis(MP);
-    removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG);
-  }
-
-  // Finalize module passes
-  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
-    Changed |= getContainedPass(Index)->doFinalization(M);
-
-  // Finalize on-the-fly passes
-  for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
-       I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
-       I != E; ++I) {
-    FunctionPassManagerImpl *FPP = I->second;
-    // We don't know when is the last time an on-the-fly pass is run,
-    // so we need to releaseMemory / finalize here
-    FPP->releaseMemoryOnTheFly();
-    Changed |= FPP->doFinalization(M);
-  }
-  
-  return Changed;
-}
-
-/// Add RequiredPass into list of lower level passes required by pass P.
-/// RequiredPass is run on the fly by Pass Manager when P requests it
-/// through getAnalysis interface.
-void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
-  assert(P->getPotentialPassManagerType() == PMT_ModulePassManager &&
-         "Unable to handle Pass that requires lower level Analysis pass");
-  assert((P->getPotentialPassManagerType() <
-          RequiredPass->getPotentialPassManagerType()) &&
-         "Unable to handle Pass that requires lower level Analysis pass");
-
-  FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
-  if (!FPP) {
-    FPP = new FunctionPassManagerImpl();
-    // FPP is the top level manager.
-    FPP->setTopLevelManager(FPP);
-
-    OnTheFlyManagers[P] = FPP;
-  }
-  FPP->add(RequiredPass);
-
-  // Register P as the last user of RequiredPass.
-  if (RequiredPass) {
-    SmallVector<Pass *, 1> LU;
-    LU.push_back(RequiredPass);
-    FPP->setLastUser(LU,  P);
-  }
-}
-
-/// Return function pass corresponding to PassInfo PI, that is
-/// required by module pass MP. Instantiate analysis pass, by using
-/// its runOnFunction() for function F.
-Pass* MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F){
-  FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP];
-  assert(FPP && "Unable to find on the fly pass");
-
-  FPP->releaseMemoryOnTheFly();
-  FPP->run(F);
-  return ((PMTopLevelManager*)FPP)->findAnalysisPass(PI);
-}
-
-
-//===----------------------------------------------------------------------===//
-// PassManagerImpl implementation
-
-//
-/// run - Execute all of the passes scheduled for execution.  Keep track of
-/// whether any of the passes modifies the module, and if so, return true.
-bool PassManagerImpl::run(Module &M) {
-  bool Changed = false;
-  TimingInfo::createTheTimeInfo();
-
-  dumpArguments();
-  dumpPasses();
-
-  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
-  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
-       E = IPV.end(); I != E; ++I) {
-    Changed |= (*I)->doInitialization(M);
-  }
-
-  initializeAllAnalysisInfo();
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
-    Changed |= getContainedManager(Index)->runOnModule(M);
-
-  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
-       E = IPV.end(); I != E; ++I) {
-    Changed |= (*I)->doFinalization(M);
-  }
-
-  return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// PassManager implementation
-
-/// Create new pass manager
-PassManager::PassManager() {
-  PM = new PassManagerImpl();
-  // PM is the top level manager
-  PM->setTopLevelManager(PM);
-}
-
-PassManager::~PassManager() {
-  delete PM;
-}
-
-/// add - Add a pass to the queue of passes to run.  This passes ownership of
-/// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-/// will be destroyed as well, so there is no need to delete the pass.  This
-/// implies that all passes MUST be allocated with 'new'.
-void PassManager::add(Pass *P) {
-  PM->add(P);
-}
-
-/// run - Execute all of the passes scheduled for execution.  Keep track of
-/// whether any of the passes modifies the module, and if so, return true.
-bool PassManager::run(Module &M) {
-  return PM->run(M);
-}
-
-//===----------------------------------------------------------------------===//
-// TimingInfo implementation
-
-bool llvm::TimePassesIsEnabled = false;
-static cl::opt<bool,true>
-EnableTiming("time-passes", cl::location(TimePassesIsEnabled),
-            cl::desc("Time each pass, printing elapsed time for each on exit"));
-
-// createTheTimeInfo - This method either initializes the TheTimeInfo pointer to
-// a non null value (if the -time-passes option is enabled) or it leaves it
-// null.  It may be called multiple times.
-void TimingInfo::createTheTimeInfo() {
-  if (!TimePassesIsEnabled || TheTimeInfo) return;
-
-  // Constructed the first time this is called, iff -time-passes is enabled.
-  // This guarantees that the object will be constructed before static globals,
-  // thus it will be destroyed before them.
-  static ManagedStatic<TimingInfo> TTI;
-  TheTimeInfo = &*TTI;
-}
-
-/// If TimingInfo is enabled then start pass timer.
-Timer *llvm::getPassTimer(Pass *P) {
-  if (TheTimeInfo)
-    return TheTimeInfo->getPassTimer(P);
-  return 0;
-}
-
-//===----------------------------------------------------------------------===//
-// PMStack implementation
-//
-
-// Pop Pass Manager from the stack and clear its analysis info.
-void PMStack::pop() {
-
-  PMDataManager *Top = this->top();
-  Top->initializeAnalysisInfo();
-
-  S.pop_back();
-}
-
-// Push PM on the stack and set its top level manager.
-void PMStack::push(PMDataManager *PM) {
-  assert(PM && "Unable to push. Pass Manager expected");
-  assert(PM->getDepth()==0 && "Pass Manager depth set too early");
-
-  if (!this->empty()) {
-    assert(PM->getPassManagerType() > this->top()->getPassManagerType()
-           && "pushing bad pass manager to PMStack");
-    PMTopLevelManager *TPM = this->top()->getTopLevelManager();
-
-    assert(TPM && "Unable to find top level manager");
-    TPM->addIndirectPassManager(PM);
-    PM->setTopLevelManager(TPM);
-    PM->setDepth(this->top()->getDepth()+1);
-  } else {
-    assert((PM->getPassManagerType() == PMT_ModulePassManager
-           || PM->getPassManagerType() == PMT_FunctionPassManager)
-           && "pushing bad pass manager to PMStack");
-    PM->setDepth(1);
-  }
-
-  S.push_back(PM);
-}
-
-// Dump content of the pass manager stack.
-void PMStack::dump() const {
-  for (std::vector<PMDataManager *>::const_iterator I = S.begin(),
-         E = S.end(); I != E; ++I)
-    dbgs() << (*I)->getAsPass()->getPassName() << ' ';
-
-  if (!S.empty())
-    dbgs() << '\n';
-}
-
-/// Find appropriate Module Pass Manager in the PM Stack and
-/// add self into that manager.
-void ModulePass::assignPassManager(PMStack &PMS,
-                                   PassManagerType PreferredType) {
-  // Find Module Pass Manager
-  while (!PMS.empty()) {
-    PassManagerType TopPMType = PMS.top()->getPassManagerType();
-    if (TopPMType == PreferredType)
-      break; // We found desired pass manager
-    else if (TopPMType > PMT_ModulePassManager)
-      PMS.pop();    // Pop children pass managers
-    else
-      break;
-  }
-  assert(!PMS.empty() && "Unable to find appropriate Pass Manager");
-  PMS.top()->add(this);
-}
-
-/// Find appropriate Function Pass Manager or Call Graph Pass Manager
-/// in the PM Stack and add self into that manager.
-void FunctionPass::assignPassManager(PMStack &PMS,
-                                     PassManagerType PreferredType) {
-
-  // Find Function Pass Manager
-  while (!PMS.empty()) {
-    if (PMS.top()->getPassManagerType() > PMT_FunctionPassManager)
-      PMS.pop();
-    else
-      break;
-  }
-
-  // Create new Function Pass Manager if needed.
-  FPPassManager *FPP;
-  if (PMS.top()->getPassManagerType() == PMT_FunctionPassManager) {
-    FPP = (FPPassManager *)PMS.top();
-  } else {
-    assert(!PMS.empty() && "Unable to create Function Pass Manager");
-    PMDataManager *PMD = PMS.top();
-
-    // [1] Create new Function Pass Manager
-    FPP = new FPPassManager();
-    FPP->populateInheritedAnalysis(PMS);
-
-    // [2] Set up new manager's top level manager
-    PMTopLevelManager *TPM = PMD->getTopLevelManager();
-    TPM->addIndirectPassManager(FPP);
-
-    // [3] Assign manager to manage this new manager. This may create
-    // and push new managers into PMS
-    FPP->assignPassManager(PMS, PMD->getPassManagerType());
-
-    // [4] Push new manager into PMS
-    PMS.push(FPP);
-  }
-
-  // Assign FPP as the manager of this pass.
-  FPP->add(this);
-}
-
-/// Find appropriate Basic Pass Manager or Call Graph Pass Manager
-/// in the PM Stack and add self into that manager.
-void BasicBlockPass::assignPassManager(PMStack &PMS,
-                                       PassManagerType PreferredType) {
-  BBPassManager *BBP;
-
-  // Basic Pass Manager is a leaf pass manager. It does not handle
-  // any other pass manager.
-  if (!PMS.empty() &&
-      PMS.top()->getPassManagerType() == PMT_BasicBlockPassManager) {
-    BBP = (BBPassManager *)PMS.top();
-  } else {
-    // If leaf manager is not Basic Block Pass manager then create new
-    // basic Block Pass manager.
-    assert(!PMS.empty() && "Unable to create BasicBlock Pass Manager");
-    PMDataManager *PMD = PMS.top();
-
-    // [1] Create new Basic Block Manager
-    BBP = new BBPassManager();
-
-    // [2] Set up new manager's top level manager
-    // Basic Block Pass Manager does not live by itself
-    PMTopLevelManager *TPM = PMD->getTopLevelManager();
-    TPM->addIndirectPassManager(BBP);
-
-    // [3] Assign manager to manage this new manager. This may create
-    // and push new managers into PMS
-    BBP->assignPassManager(PMS, PreferredType);
-
-    // [4] Push new manager into PMS
-    PMS.push(BBP);
-  }
-
-  // Assign BBP as the manager of this pass.
-  BBP->add(this);
-}
-
-PassManagerBase::~PassManagerBase() {}
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index 46c61fc..432cbc9 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -616,11 +616,7 @@ bool StructType::isLayoutIdentical(StructType *Other) const {
 /// getTypeByName - Return the type with the specified name, or null if there
 /// is none by that name.
 StructType *Module::getTypeByName(StringRef Name) const {
-  StringMap<StructType*>::iterator I =
-    getContext().pImpl->NamedStructTypes.find(Name);
-  if (I != getContext().pImpl->NamedStructTypes.end())
-    return I->second;
-  return 0;
+  return getContext().pImpl->NamedStructTypes.lookup(Name);
 }
 
 
diff --git a/lib/IR/TypeFinder.cpp b/lib/IR/TypeFinder.cpp
index d5e6203..689b903 100644
--- a/lib/IR/TypeFinder.cpp
+++ b/lib/IR/TypeFinder.cpp
@@ -44,6 +44,9 @@ void TypeFinder::run(const Module &M, bool onlyNamed) {
   for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) {
     incorporateType(FI->getType());
 
+    if (FI->hasPrefixData())
+      incorporateValue(FI->getPrefixData());
+
     // First incorporate the arguments.
     for (Function::const_arg_iterator AI = FI->arg_begin(),
            AE = FI->arg_end(); AI != AE; ++AI)
@@ -91,19 +94,27 @@ void TypeFinder::clear() {
 /// incorporateType - This method adds the type to the list of used structures
 /// if it's not in there already.
 void TypeFinder::incorporateType(Type *Ty) {
-  // Check to see if we're already visited this type.
+  // Check to see if we've already visited this type.
   if (!VisitedTypes.insert(Ty).second)
     return;
 
-  // If this is a structure or opaque type, add a name for the type.
-  if (StructType *STy = dyn_cast<StructType>(Ty))
-    if (!OnlyNamed || STy->hasName())
-      StructTypes.push_back(STy);
-
-  // Recursively walk all contained types.
-  for (Type::subtype_iterator I = Ty->subtype_begin(),
-         E = Ty->subtype_end(); I != E; ++I)
-    incorporateType(*I);
+  SmallVector<Type *, 4> TypeWorklist;
+  TypeWorklist.push_back(Ty);
+  do {
+    Ty = TypeWorklist.pop_back_val();
+
+    // If this is a structure or opaque type, add a name for the type.
+    if (StructType *STy = dyn_cast<StructType>(Ty))
+      if (!OnlyNamed || STy->hasName())
+        StructTypes.push_back(STy);
+
+    // Add all unvisited subtypes to worklist for processing
+    for (Type::subtype_reverse_iterator I = Ty->subtype_rbegin(),
+                                        E = Ty->subtype_rend();
+         I != E; ++I)
+      if (VisitedTypes.insert(*I).second)
+        TypeWorklist.push_back(*I);
+  } while (!TypeWorklist.empty());
 }
 
 /// incorporateValue - This method is used to walk operand lists finding types
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 81d7efa..62a3b31 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -365,7 +365,8 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
         break;
       }
       V = GEP->getPointerOperand();
-    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+    } else if (Operator::getOpcode(V) == Instruction::BitCast ||
+               Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
       V = cast<Operator>(V)->getOperand(0);
     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
       if (StripKind == PSK_ZeroIndices || GA->mayBeOverridden())
@@ -393,6 +394,42 @@ Value *Value::stripInBoundsConstantOffsets() {
   return stripPointerCastsAndOffsets<PSK_InBoundsConstantIndices>(this);
 }
 
+Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
+                                                        APInt &Offset) {
+  if (!getType()->isPointerTy())
+    return this;
+
+  assert(Offset.getBitWidth() == DL.getPointerSizeInBits(cast<PointerType>(
+                                     getType())->getAddressSpace()) &&
+         "The offset must have exactly as many bits as our pointer.");
+
+  // Even though we don't look through PHI nodes, we could be called on an
+  // instruction in an unreachable block, which may be on a cycle.
+  SmallPtrSet<Value *, 4> Visited;
+  Visited.insert(this);
+  Value *V = this;
+  do {
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      if (!GEP->isInBounds())
+        return V;
+      APInt GEPOffset(Offset);
+      if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+        return V;
+      Offset = GEPOffset;
+      V = GEP->getPointerOperand();
+    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+      V = cast<Operator>(V)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+      V = GA->getAliasee();
+    } else {
+      return V;
+    }
+    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
+  } while (Visited.insert(V));
+
+  return V;
+}
+
 Value *Value::stripInBoundsOffsets() {
   return stripPointerCastsAndOffsets<PSK_InBounds>(this);
 }
@@ -698,9 +735,5 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
 #endif
 }
 
-// Default implementation for CallbackVH.
-void CallbackVH::allUsesReplacedWith(Value *) {}
-
-void CallbackVH::deleted() {
-  setValPtr(NULL);
-}
+// Pin the vtable to this file.
+void CallbackVH::anchor() {}
diff --git a/lib/IR/ValueTypes.cpp b/lib/IR/ValueTypes.cpp
index ba04d60..2d4da95 100644
--- a/lib/IR/ValueTypes.cpp
+++ b/lib/IR/ValueTypes.cpp
@@ -134,6 +134,7 @@ std::string EVT::getEVTString() const {
   case MVT::v16i1:   return "v16i1";
   case MVT::v32i1:   return "v32i1";
   case MVT::v64i1:   return "v64i1";
+  case MVT::v1i8:    return "v1i8";
   case MVT::v2i8:    return "v2i8";
   case MVT::v4i8:    return "v4i8";
   case MVT::v8i8:    return "v8i8";
@@ -156,11 +157,15 @@ std::string EVT::getEVTString() const {
   case MVT::v4i64:   return "v4i64";
   case MVT::v8i64:   return "v8i64";
   case MVT::v16i64:  return "v16i64";
+  case MVT::v1f32:   return "v1f32";
   case MVT::v2f32:   return "v2f32";
   case MVT::v2f16:   return "v2f16";
+  case MVT::v4f16:   return "v4f16";
+  case MVT::v8f16:   return "v8f16";
   case MVT::v4f32:   return "v4f32";
   case MVT::v8f32:   return "v8f32";
   case MVT::v16f32:  return "v16f32";
+  case MVT::v1f64:   return "v1f64";
   case MVT::v2f64:   return "v2f64";
   case MVT::v4f64:   return "v4f64";
   case MVT::v8f64:   return "v8f64";
@@ -197,6 +202,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
   case MVT::v16i1:   return VectorType::get(Type::getInt1Ty(Context), 16);
   case MVT::v32i1:   return VectorType::get(Type::getInt1Ty(Context), 32);
   case MVT::v64i1:   return VectorType::get(Type::getInt1Ty(Context), 64);
+  case MVT::v1i8:    return VectorType::get(Type::getInt8Ty(Context), 1);
   case MVT::v2i8:    return VectorType::get(Type::getInt8Ty(Context), 2);
   case MVT::v4i8:    return VectorType::get(Type::getInt8Ty(Context), 4);
   case MVT::v8i8:    return VectorType::get(Type::getInt8Ty(Context), 8);
@@ -220,10 +226,14 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
   case MVT::v8i64:   return VectorType::get(Type::getInt64Ty(Context), 8);
   case MVT::v16i64:  return VectorType::get(Type::getInt64Ty(Context), 16);
   case MVT::v2f16:   return VectorType::get(Type::getHalfTy(Context), 2);
+  case MVT::v4f16:   return VectorType::get(Type::getHalfTy(Context), 4);
+  case MVT::v8f16:   return VectorType::get(Type::getHalfTy(Context), 8);
+  case MVT::v1f32:   return VectorType::get(Type::getFloatTy(Context), 1);
   case MVT::v2f32:   return VectorType::get(Type::getFloatTy(Context), 2);
   case MVT::v4f32:   return VectorType::get(Type::getFloatTy(Context), 4);
   case MVT::v8f32:   return VectorType::get(Type::getFloatTy(Context), 8);
   case MVT::v16f32:   return VectorType::get(Type::getFloatTy(Context), 16);
+  case MVT::v1f64:   return VectorType::get(Type::getDoubleTy(Context), 1);
   case MVT::v2f64:   return VectorType::get(Type::getDoubleTy(Context), 2);
   case MVT::v4f64:   return VectorType::get(Type::getDoubleTy(Context), 4); 
   case MVT::v8f64:   return VectorType::get(Type::getDoubleTy(Context), 8); 
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 0eda97f..da6b573 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -78,7 +78,7 @@
 using namespace llvm;
 
 static cl::opt<bool> DisableDebugInfoVerifier("disable-debug-info-verifier",
-                                              cl::init(false));
+                                              cl::init(true));
 
 namespace {  // Anonymous namespace for class
   struct PreVerifier : public FunctionPass {
@@ -167,7 +167,6 @@ namespace {
     bool doInitialization(Module &M) {
       Mod = &M;
       Context = &M.getContext();
-      Finder.reset();
 
       DL = getAnalysisIfAvailable<DataLayout>();
 
@@ -183,10 +182,15 @@ namespace {
       Mod = F.getParent();
       if (!Context) Context = &F.getContext();
 
+      Finder.reset();
       visit(F);
       InstsInThisBlock.clear();
       PersonalityFn = 0;
 
+      if (!DisableDebugInfoVerifier)
+        // Verify Debug Info.
+        verifyDebugInfo();
+
       // We must abort before returning back to the pass manager, or else the
       // pass manager may try to run other passes on the broken module.
       return abortIfBroken();
@@ -214,9 +218,14 @@ namespace {
         visitNamedMDNode(*I);
 
       visitModuleFlags(M);
+      visitModuleIdents(M);
 
-      // Verify Debug Info.
-      verifyDebugInfo(M);
+      if (!DisableDebugInfoVerifier) {
+        Finder.reset();
+        Finder.processModule(M);
+        // Verify Debug Info.
+        verifyDebugInfo();
+      }
 
       // If the module is broken, abort at this time.
       return abortIfBroken();
@@ -258,6 +267,7 @@ namespace {
     void visitGlobalAlias(GlobalAlias &GA);
     void visitNamedMDNode(NamedMDNode &NMD);
     void visitMDNode(MDNode &MD, Function *F);
+    void visitModuleIdents(Module &M);
     void visitModuleFlags(Module &M);
     void visitModuleFlag(MDNode *Op, DenseMap<MDString*, MDNode*> &SeenIDs,
                          SmallVectorImpl<MDNode*> &Requirements);
@@ -279,6 +289,7 @@ namespace {
     void visitIntToPtrInst(IntToPtrInst &I);
     void visitPtrToIntInst(PtrToIntInst &I);
     void visitBitCastInst(BitCastInst &I);
+    void visitAddrSpaceCastInst(AddrSpaceCastInst &I);
     void visitPHINode(PHINode &PN);
     void visitBinaryOperator(BinaryOperator &B);
     void visitICmpInst(ICmpInst &IC);
@@ -317,6 +328,8 @@ namespace {
     bool VerifyIntrinsicType(Type *Ty,
                              ArrayRef<Intrinsic::IITDescriptor> &Infos,
                              SmallVectorImpl<Type*> &ArgTys);
+    bool VerifyIntrinsicIsVarArg(bool isVarArg,
+                                 ArrayRef<Intrinsic::IITDescriptor> &Infos);
     bool VerifyAttributeCount(AttributeSet Attrs, unsigned Params);
     void VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
                               bool isFunction, const Value *V);
@@ -328,7 +341,7 @@ namespace {
     void VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy);
     void VerifyConstantExprBitcastType(const ConstantExpr *CE);
 
-    void verifyDebugInfo(Module &M);
+    void verifyDebugInfo();
 
     void WriteValue(const Value *V) {
       if (!V) return;
@@ -427,10 +440,6 @@ void Verifier::visitGlobalValue(GlobalValue &GV) {
     Assert1(GVar && GVar->getType()->getElementType()->isArrayTy(),
             "Only global arrays can have appending linkage!", GVar);
   }
-
-  Assert1(!GV.hasLinkOnceODRAutoHideLinkage() || GV.hasDefaultVisibility(),
-          "linkonce_odr_auto_hide can only have default visibility!",
-          &GV);
 }
 
 void Verifier::visitGlobalVariable(GlobalVariable &GV) {
@@ -527,8 +536,7 @@ void Verifier::visitGlobalVariable(GlobalVariable &GV) {
 void Verifier::visitGlobalAlias(GlobalAlias &GA) {
   Assert1(!GA.getName().empty(),
           "Alias name cannot be empty!", &GA);
-  Assert1(GA.hasExternalLinkage() || GA.hasLocalLinkage() ||
-          GA.hasWeakLinkage(),
+  Assert1(GlobalAlias::isValidLinkage(GA.getLinkage()),
           "Alias should have external or external weak linkage!", &GA);
   Assert1(GA.getAliasee(),
           "Aliasee cannot be NULL!", &GA);
@@ -612,6 +620,24 @@ void Verifier::visitMDNode(MDNode &MD, Function *F) {
   }
 }
 
+void Verifier::visitModuleIdents(Module &M) {
+  const NamedMDNode *Idents = M.getNamedMetadata("llvm.ident");
+  if (!Idents) 
+    return;
+  
+  // llvm.ident takes a list of metadata entry. Each entry has only one string.
+  // Scan each llvm.ident entry and make sure that this requirement is met.
+  for (unsigned i = 0, e = Idents->getNumOperands(); i != e; ++i) {
+    const MDNode *N = Idents->getOperand(i);
+    Assert1(N->getNumOperands() == 1,
+            "incorrect number of operands in llvm.ident metadata", N);
+    Assert1(isa<MDString>(N->getOperand(0)),
+            ("invalid value for llvm.ident metadata entry operand"
+             "(the operand should be a string)"),
+            N->getOperand(0));
+  } 
+}
+
 void Verifier::visitModuleFlags(Module &M) {
   const NamedMDNode *Flags = M.getModuleFlagsMetadata();
   if (!Flags) return;
@@ -751,7 +777,8 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
         I->getKindAsEnum() == Attribute::NoDuplicate ||
         I->getKindAsEnum() == Attribute::Builtin ||
         I->getKindAsEnum() == Attribute::NoBuiltin ||
-        I->getKindAsEnum() == Attribute::Cold) {
+        I->getKindAsEnum() == Attribute::Cold ||
+        I->getKindAsEnum() == Attribute::OptimizeNone) {
       if (!isFunction) {
         CheckFailed("Attribute '" + I->getAsString() +
                     "' only applies to functions!", V);
@@ -897,6 +924,21 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
             Attrs.hasAttribute(AttributeSet::FunctionIndex,
                                Attribute::AlwaysInline)),
           "Attributes 'noinline and alwaysinline' are incompatible!", V);
+
+  if (Attrs.hasAttribute(AttributeSet::FunctionIndex, 
+                         Attribute::OptimizeNone)) {
+    Assert1(Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                               Attribute::NoInline),
+            "Attribute 'optnone' requires 'noinline'!", V);
+
+    Assert1(!Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::OptimizeForSize),
+            "Attributes 'optsize and optnone' are incompatible!", V);
+
+    Assert1(!Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::MinSize),
+            "Attributes 'minsize and optnone' are incompatible!", V);
+  }
 }
 
 void Verifier::VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy) {
@@ -930,11 +972,9 @@ void Verifier::VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy) {
   unsigned SrcAS = SrcTy->getPointerAddressSpace();
   unsigned DstAS = DestTy->getPointerAddressSpace();
 
-  unsigned SrcASSize = DL->getPointerSizeInBits(SrcAS);
-  unsigned DstASSize = DL->getPointerSizeInBits(DstAS);
-  Assert1(SrcASSize == DstASSize,
-          "Bitcasts between pointers of different address spaces must have "
-          "the same size pointers, otherwise use PtrToInt/IntToPtr.", V);
+  Assert1(SrcAS == DstAS,
+          "Bitcasts between pointers of different address spaces is not legal."
+          "Use AddrSpaceCast instead.", V);
 }
 
 void Verifier::VerifyConstantExprBitcastType(const ConstantExpr *CE) {
@@ -1152,27 +1192,12 @@ void Verifier::visitSwitchInst(SwitchInst &SI) {
   // Check to make sure that all of the constants in the switch instruction
   // have the same type as the switched-on value.
   Type *SwitchTy = SI.getCondition()->getType();
-  IntegerType *IntTy = cast<IntegerType>(SwitchTy);
-  IntegersSubsetToBB Mapping;
-  std::map<IntegersSubset::Range, unsigned> RangeSetMap;
+  SmallPtrSet<ConstantInt*, 32> Constants;
   for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
-    IntegersSubset CaseRanges = i.getCaseValueEx();
-    for (unsigned ri = 0, rie = CaseRanges.getNumItems(); ri < rie; ++ri) {
-      IntegersSubset::Range r = CaseRanges.getItem(ri);
-      Assert1(((const APInt&)r.getLow()).getBitWidth() == IntTy->getBitWidth(),
-              "Switch constants must all be same type as switch value!", &SI);
-      Assert1(((const APInt&)r.getHigh()).getBitWidth() == IntTy->getBitWidth(),
-              "Switch constants must all be same type as switch value!", &SI);
-      Mapping.add(r);
-      RangeSetMap[r] = i.getCaseIndex();
-    }
-  }
-
-  IntegersSubsetToBB::RangeIterator errItem;
-  if (!Mapping.verify(errItem)) {
-    unsigned CaseIndex = RangeSetMap[errItem->first];
-    SwitchInst::CaseIt i(&SI, CaseIndex);
-    Assert2(false, "Duplicate integer as switch case", &SI, i.getCaseValueEx());
+    Assert1(i.getCaseValue()->getType() == SwitchTy,
+            "Switch constants must all be same type as switch value!", &SI);
+    Assert2(Constants.insert(i.getCaseValue()),
+            "Duplicate integer as switch case", &SI, i.getCaseValue());
   }
 
   visitTerminatorInst(SI);
@@ -1435,6 +1460,22 @@ void Verifier::visitBitCastInst(BitCastInst &I) {
   visitInstruction(I);
 }
 
+void Verifier::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  Assert1(SrcTy->isPtrOrPtrVectorTy(),
+          "AddrSpaceCast source must be a pointer", &I);
+  Assert1(DestTy->isPtrOrPtrVectorTy(),
+          "AddrSpaceCast result must be a pointer", &I);
+  Assert1(SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace(),
+          "AddrSpaceCast must be between different address spaces", &I);
+  if (SrcTy->isVectorTy())
+    Assert1(SrcTy->getVectorNumElements() == DestTy->getVectorNumElements(),
+            "AddrSpaceCast vector pointer number of elements mismatch", &I);
+  visitInstruction(I);
+}
+
 /// visitPHINode - Ensure that a PHI node is well formed.
 ///
 void Verifier::visitPHINode(PHINode &PN) {
@@ -1537,14 +1578,6 @@ void Verifier::VerifyCallSite(CallSite CS) {
               "Function has metadata parameter but isn't an intrinsic", I);
   }
 
-  // If the call site has the 'builtin' attribute, verify that it's applied to a
-  // direct call to a function with the 'nobuiltin' attribute.
-  if (CS.hasFnAttr(Attribute::Builtin))
-    Assert1(CS.getCalledFunction() &&
-            CS.getCalledFunction()->hasFnAttribute(Attribute::NoBuiltin),
-            "Attribute 'builtin' can only be used in a call to a function with "
-            "the 'nobuiltin' attribute.", I);
-
   visitInstruction(*I);
 }
 
@@ -2098,7 +2131,7 @@ void Verifier::visitInstruction(Instruction &I) {
 
   if (!DisableDebugInfoVerifier) {
     MD = I.getMetadata(LLVMContext::MD_dbg);
-    Finder.processLocation(DILocation(MD));
+    Finder.processLocation(*Mod, DILocation(MD));
   }
 
   InstsInThisBlock.insert(&I);
@@ -2121,6 +2154,7 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
 
   switch (D.Kind) {
   case IITDescriptor::Void: return !Ty->isVoidTy();
+  case IITDescriptor::VarArg: return true;
   case IITDescriptor::MMX:  return !Ty->isX86_MMXTy();
   case IITDescriptor::Metadata: return !Ty->isMetadataTy();
   case IITDescriptor::Half: return !Ty->isHalfTy();
@@ -2185,6 +2219,33 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
   llvm_unreachable("unhandled");
 }
 
+/// \brief Verify if the intrinsic has variable arguments.
+/// This method is intended to be called after all the fixed arguments have been
+/// verified first.
+///
+/// This method returns true on error and does not print an error message.
+bool
+Verifier::VerifyIntrinsicIsVarArg(bool isVarArg,
+                                  ArrayRef<Intrinsic::IITDescriptor> &Infos) {
+  using namespace Intrinsic;
+
+  // If there are no descriptors left, then it can't be a vararg.
+  if (Infos.empty())
+    return isVarArg ? true : false;
+
+  // There should be only one descriptor remaining at this point.
+  if (Infos.size() != 1)
+    return true;
+
+  // Check and verify the descriptor.
+  IITDescriptor D = Infos.front();
+  Infos = Infos.slice(1);
+  if (D.Kind == IITDescriptor::VarArg)
+    return isVarArg ? false : true;
+
+  return true;
+}
+
 /// visitIntrinsicFunction - Allow intrinsics to be verified in different ways.
 ///
 void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
@@ -2195,7 +2256,7 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
   // Verify that the intrinsic prototype lines up with what the .td files
   // describe.
   FunctionType *IFTy = IF->getFunctionType();
-  Assert1(!IFTy->isVarArg(), "Intrinsic prototypes are not varargs", IF);
+  bool IsVarArg = IFTy->isVarArg();
 
   SmallVector<Intrinsic::IITDescriptor, 8> Table;
   getIntrinsicInfoTableEntries(ID, Table);
@@ -2207,6 +2268,16 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
   for (unsigned i = 0, e = IFTy->getNumParams(); i != e; ++i)
     Assert1(!VerifyIntrinsicType(IFTy->getParamType(i), TableRef, ArgTys),
             "Intrinsic has incorrect argument type!", IF);
+
+  // Verify if the intrinsic call matches the vararg property.
+  if (IsVarArg)
+    Assert1(!VerifyIntrinsicIsVarArg(IsVarArg, TableRef),
+            "Intrinsic was not defined with variable arguments!", IF);
+  else
+    Assert1(!VerifyIntrinsicIsVarArg(IsVarArg, TableRef),
+            "Callsite was not defined with variable arguments!", IF);
+
+  // All descriptors should be absorbed by now.
   Assert1(TableRef.empty(), "Intrinsic has too few arguments!", IF);
 
   // Now that we have the intrinsic ID and the actual argument types (and we
@@ -2238,13 +2309,13 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
     Assert1(MD->getNumOperands() == 1,
                 "invalid llvm.dbg.declare intrinsic call 2", &CI);
     if (!DisableDebugInfoVerifier)
-      Finder.processDeclare(cast<DbgDeclareInst>(&CI));
+      Finder.processDeclare(*Mod, cast<DbgDeclareInst>(&CI));
   } break;
   case Intrinsic::dbg_value: { //llvm.dbg.value
     if (!DisableDebugInfoVerifier) {
       Assert1(CI.getArgOperand(0) && isa<MDNode>(CI.getArgOperand(0)),
               "invalid llvm.dbg.value intrinsic call 1", &CI);
-      Finder.processValue(cast<DbgValueInst>(&CI));
+      Finder.processValue(*Mod, cast<DbgValueInst>(&CI));
     }
     break;
   }
@@ -2309,11 +2380,9 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
   }
 }
 
-void Verifier::verifyDebugInfo(Module &M) {
+void Verifier::verifyDebugInfo() {
   // Verify Debug Info.
   if (!DisableDebugInfoVerifier) {
-    Finder.processModule(M);
-
     for (DebugInfoFinder::iterator I = Finder.compile_unit_begin(),
          E = Finder.compile_unit_end(); I != E; ++I)
       Assert1(DICompileUnit(*I).Verify(), "DICompileUnit does not Verify!", *I);
@@ -2352,6 +2421,7 @@ bool llvm::verifyFunction(const Function &f, VerifierFailureAction action) {
   FunctionPassManager FPM(F.getParent());
   Verifier *V = new Verifier(action);
   FPM.add(V);
+  FPM.doInitialization();
   FPM.run(F);
   return V->Broken;
 }
diff --git a/lib/IRReader/IRReader.cpp b/lib/IRReader/IRReader.cpp
index 656fe18..935e81d 100644
--- a/lib/IRReader/IRReader.cpp
+++ b/lib/IRReader/IRReader.cpp
@@ -11,10 +11,15 @@
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/Assembly/Parser.h"
 #include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/system_error.h"
 #include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/IRReader.h"
 
 using namespace llvm;
 
@@ -87,3 +92,30 @@ Module *llvm::ParseIRFile(const std::string &Filename, SMDiagnostic &Err,
 
   return ParseIR(File.take(), Err, Context);
 }
+
+//===----------------------------------------------------------------------===//
+// C API.
+//===----------------------------------------------------------------------===//
+
+LLVMBool LLVMParseIRInContext(LLVMContextRef ContextRef,
+                              LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutM,
+                              char **OutMessage) {
+  SMDiagnostic Diag;
+
+  *OutM = wrap(ParseIR(unwrap(MemBuf), Diag, *unwrap(ContextRef)));
+
+  if(!*OutM) {
+    if (OutMessage) {
+      std::string buf;
+      raw_string_ostream os(buf);
+
+      Diag.print(NULL, os, false);
+      os.flush();
+
+      *OutMessage = strdup(buf.c_str());
+    }
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index ff288bc..00280c8 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = Analysis AsmParser Bitcode CodeGen DebugInfo ExecutionEngine Linker IR IRReader MC Object Option Support TableGen Target Transforms
+subdirectories = Analysis AsmParser Bitcode CodeGen DebugInfo ExecutionEngine Linker IR IRReader LTO MC Object Option Support TableGen Target Transforms
 
 [component_0]
 type = Group
diff --git a/lib/LTO/CMakeLists.txt b/lib/LTO/CMakeLists.txt
new file mode 100644
index 0000000..8e00bcb
--- /dev/null
+++ b/lib/LTO/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_llvm_library(LLVMLTO
+  LTOModule.cpp
+  LTOCodeGenerator.cpp
+  )
diff --git a/lib/LTO/LLVMBuild.txt b/lib/LTO/LLVMBuild.txt
new file mode 100644
index 0000000..38c1170
--- /dev/null
+++ b/lib/LTO/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/LTO/LLVMBuild.txt ----------------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = LTO
+parent = Libraries
+required_libraries = Analysis BitReader BitWriter Core IPO Linker MC MCParser Scalar Support Target Vectorize
+\ No newline at end of file
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
new file mode 100644
index 0000000..2b3648e
--- /dev/null
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -0,0 +1,521 @@
+//===-LTOCodeGenerator.cpp - LLVM Link Time Optimizer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Link Time Optimization library. This library is
+// intended to be used by linker to optimize code at link time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/LTO/LTOCodeGenerator.h"
+#include "llvm/LTO/LTOModule.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/Config/config.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Linker.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/ObjCARC.h"
+using namespace llvm;
+
+const char* LTOCodeGenerator::getVersionString() {
+#ifdef LLVM_VERSION_INFO
+  return PACKAGE_NAME " version " PACKAGE_VERSION ", " LLVM_VERSION_INFO;
+#else
+  return PACKAGE_NAME " version " PACKAGE_VERSION;
+#endif
+}
+
+LTOCodeGenerator::LTOCodeGenerator()
+    : Context(getGlobalContext()), Linker(new Module("ld-temp.o", Context)),
+      TargetMach(NULL), EmitDwarfDebugInfo(false), ScopeRestrictionsDone(false),
+      CodeModel(LTO_CODEGEN_PIC_MODEL_DYNAMIC), NativeObjectFile(NULL) {
+  initializeLTOPasses();
+}
+
+LTOCodeGenerator::~LTOCodeGenerator() {
+  delete TargetMach;
+  delete NativeObjectFile;
+  TargetMach = NULL;
+  NativeObjectFile = NULL;
+
+  Linker.deleteModule();
+
+  for (std::vector<char *>::iterator I = CodegenOptions.begin(),
+                                     E = CodegenOptions.end();
+       I != E; ++I)
+    free(*I);
+}
+
+// Initialize LTO passes. Please keep this funciton in sync with
+// PassManagerBuilder::populateLTOPassManager(), and make sure all LTO
+// passes are initialized. 
+//
+void LTOCodeGenerator::initializeLTOPasses() {
+  PassRegistry &R = *PassRegistry::getPassRegistry();
+
+  initializeInternalizePassPass(R);
+  initializeIPSCCPPass(R);
+  initializeGlobalOptPass(R);
+  initializeConstantMergePass(R);
+  initializeDAHPass(R);
+  initializeInstCombinerPass(R);
+  initializeSimpleInlinerPass(R);
+  initializePruneEHPass(R);
+  initializeGlobalDCEPass(R);
+  initializeArgPromotionPass(R);
+  initializeJumpThreadingPass(R);
+  initializeSROAPass(R);
+  initializeSROA_DTPass(R);
+  initializeSROA_SSAUpPass(R);
+  initializeFunctionAttrsPass(R);
+  initializeGlobalsModRefPass(R);
+  initializeLICMPass(R);
+  initializeGVNPass(R);
+  initializeMemCpyOptPass(R);
+  initializeDCEPass(R);
+  initializeCFGSimplifyPassPass(R);
+}
+
+bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) {
+  bool ret = Linker.linkInModule(mod->getLLVVMModule(), &errMsg);
+
+  const std::vector<const char*> &undefs = mod->getAsmUndefinedRefs();
+  for (int i = 0, e = undefs.size(); i != e; ++i)
+    AsmUndefinedRefs[undefs[i]] = 1;
+
+  return !ret;
+}
+
+void LTOCodeGenerator::setTargetOptions(TargetOptions options) {
+  Options.LessPreciseFPMADOption = options.LessPreciseFPMADOption;
+  Options.NoFramePointerElim = options.NoFramePointerElim;
+  Options.AllowFPOpFusion = options.AllowFPOpFusion;
+  Options.UnsafeFPMath = options.UnsafeFPMath;
+  Options.NoInfsFPMath = options.NoInfsFPMath;
+  Options.NoNaNsFPMath = options.NoNaNsFPMath;
+  Options.HonorSignDependentRoundingFPMathOption =
+    options.HonorSignDependentRoundingFPMathOption;
+  Options.UseSoftFloat = options.UseSoftFloat;
+  Options.FloatABIType = options.FloatABIType;
+  Options.NoZerosInBSS = options.NoZerosInBSS;
+  Options.GuaranteedTailCallOpt = options.GuaranteedTailCallOpt;
+  Options.DisableTailCalls = options.DisableTailCalls;
+  Options.StackAlignmentOverride = options.StackAlignmentOverride;
+  Options.TrapFuncName = options.TrapFuncName;
+  Options.PositionIndependentExecutable = options.PositionIndependentExecutable;
+  Options.EnableSegmentedStacks = options.EnableSegmentedStacks;
+  Options.UseInitArray = options.UseInitArray;
+}
+
+void LTOCodeGenerator::setDebugInfo(lto_debug_model debug) {
+  switch (debug) {
+  case LTO_DEBUG_MODEL_NONE:
+    EmitDwarfDebugInfo = false;
+    return;
+
+  case LTO_DEBUG_MODEL_DWARF:
+    EmitDwarfDebugInfo = true;
+    return;
+  }
+  llvm_unreachable("Unknown debug format!");
+}
+
+void LTOCodeGenerator::setCodePICModel(lto_codegen_model model) {
+  switch (model) {
+  case LTO_CODEGEN_PIC_MODEL_STATIC:
+  case LTO_CODEGEN_PIC_MODEL_DYNAMIC:
+  case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC:
+    CodeModel = model;
+    return;
+  }
+  llvm_unreachable("Unknown PIC model!");
+}
+
+bool LTOCodeGenerator::writeMergedModules(const char *path,
+                                          std::string &errMsg) {
+  if (!determineTarget(errMsg))
+    return false;
+
+  // mark which symbols can not be internalized
+  applyScopeRestrictions();
+
+  // create output file
+  std::string ErrInfo;
+  tool_output_file Out(path, ErrInfo, sys::fs::F_Binary);
+  if (!ErrInfo.empty()) {
+    errMsg = "could not open bitcode file for writing: ";
+    errMsg += path;
+    return false;
+  }
+
+  // write bitcode to it
+  WriteBitcodeToFile(Linker.getModule(), Out.os());
+  Out.os().close();
+
+  if (Out.os().has_error()) {
+    errMsg = "could not write bitcode file: ";
+    errMsg += path;
+    Out.os().clear_error();
+    return false;
+  }
+
+  Out.keep();
+  return true;
+}
+
+bool LTOCodeGenerator::compile_to_file(const char** name, 
+                                       bool disableOpt,
+                                       bool disableInline,
+                                       bool disableGVNLoadPRE,
+                                       std::string& errMsg) {
+  // make unique temp .o file to put generated object file
+  SmallString<128> Filename;
+  int FD;
+  error_code EC = sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename);
+  if (EC) {
+    errMsg = EC.message();
+    return false;
+  }
+
+  // generate object file
+  tool_output_file objFile(Filename.c_str(), FD);
+
+  bool genResult = generateObjectFile(objFile.os(), disableOpt, disableInline,
+                                      disableGVNLoadPRE, errMsg);
+  objFile.os().close();
+  if (objFile.os().has_error()) {
+    objFile.os().clear_error();
+    sys::fs::remove(Twine(Filename));
+    return false;
+  }
+
+  objFile.keep();
+  if (!genResult) {
+    sys::fs::remove(Twine(Filename));
+    return false;
+  }
+
+  NativeObjectPath = Filename.c_str();
+  *name = NativeObjectPath.c_str();
+  return true;
+}
+
+const void* LTOCodeGenerator::compile(size_t* length,
+                                      bool disableOpt,
+                                      bool disableInline,
+                                      bool disableGVNLoadPRE,
+                                      std::string& errMsg) {
+  const char *name;
+  if (!compile_to_file(&name, disableOpt, disableInline, disableGVNLoadPRE,
+                       errMsg))
+    return NULL;
+
+  // remove old buffer if compile() called twice
+  delete NativeObjectFile;
+
+  // read .o file into memory buffer
+  OwningPtr<MemoryBuffer> BuffPtr;
+  if (error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) {
+    errMsg = ec.message();
+    sys::fs::remove(NativeObjectPath);
+    return NULL;
+  }
+  NativeObjectFile = BuffPtr.take();
+
+  // remove temp files
+  sys::fs::remove(NativeObjectPath);
+
+  // return buffer, unless error
+  if (NativeObjectFile == NULL)
+    return NULL;
+  *length = NativeObjectFile->getBufferSize();
+  return NativeObjectFile->getBufferStart();
+}
+
+bool LTOCodeGenerator::determineTarget(std::string &errMsg) {
+  if (TargetMach != NULL)
+    return true;
+
+  std::string TripleStr = Linker.getModule()->getTargetTriple();
+  if (TripleStr.empty())
+    TripleStr = sys::getDefaultTargetTriple();
+  llvm::Triple Triple(TripleStr);
+
+  // create target machine from info for merged modules
+  const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
+  if (march == NULL)
+    return false;
+
+  // The relocation model is actually a static member of TargetMachine and
+  // needs to be set before the TargetMachine is instantiated.
+  Reloc::Model RelocModel = Reloc::Default;
+  switch (CodeModel) {
+  case LTO_CODEGEN_PIC_MODEL_STATIC:
+    RelocModel = Reloc::Static;
+    break;
+  case LTO_CODEGEN_PIC_MODEL_DYNAMIC:
+    RelocModel = Reloc::PIC_;
+    break;
+  case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC:
+    RelocModel = Reloc::DynamicNoPIC;
+    break;
+  }
+
+  // construct LTOModule, hand over ownership of module and target
+  SubtargetFeatures Features;
+  Features.getDefaultSubtargetFeatures(Triple);
+  std::string FeatureStr = Features.getString();
+  // Set a default CPU for Darwin triples.
+  if (MCpu.empty() && Triple.isOSDarwin()) {
+    if (Triple.getArch() == llvm::Triple::x86_64)
+      MCpu = "core2";
+    else if (Triple.getArch() == llvm::Triple::x86)
+      MCpu = "yonah";
+  }
+
+  TargetMach = march->createTargetMachine(TripleStr, MCpu, FeatureStr, Options,
+                                          RelocModel, CodeModel::Default,
+                                          CodeGenOpt::Aggressive);
+  return true;
+}
+
+void LTOCodeGenerator::
+applyRestriction(GlobalValue &GV,
+                 const ArrayRef<StringRef> &Libcalls,
+                 std::vector<const char*> &MustPreserveList,
+                 SmallPtrSet<GlobalValue*, 8> &AsmUsed,
+                 Mangler &Mangler) {
+  SmallString<64> Buffer;
+  Mangler.getNameWithPrefix(Buffer, &GV, false);
+
+  if (GV.isDeclaration())
+    return;
+  if (MustPreserveSymbols.count(Buffer))
+    MustPreserveList.push_back(GV.getName().data());
+  if (AsmUndefinedRefs.count(Buffer))
+    AsmUsed.insert(&GV);
+
+  // Conservatively append user-supplied runtime library functions to
+  // llvm.compiler.used.  These could be internalized and deleted by
+  // optimizations like -globalopt, causing problems when later optimizations
+  // add new library calls (e.g., llvm.memset => memset and printf => puts).
+  // Leave it to the linker to remove any dead code (e.g. with -dead_strip).
+  if (isa<Function>(GV) &&
+      std::binary_search(Libcalls.begin(), Libcalls.end(), GV.getName()))
+    AsmUsed.insert(&GV);
+}
+
+static void findUsedValues(GlobalVariable *LLVMUsed,
+                           SmallPtrSet<GlobalValue*, 8> &UsedValues) {
+  if (LLVMUsed == 0) return;
+
+  ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+  for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
+    if (GlobalValue *GV =
+        dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
+      UsedValues.insert(GV);
+}
+
+static void accumulateAndSortLibcalls(std::vector<StringRef> &Libcalls,
+                                      const TargetLibraryInfo& TLI,
+                                      const TargetLowering *Lowering)
+{
+  // TargetLibraryInfo has info on C runtime library calls on the current
+  // target.
+  for (unsigned I = 0, E = static_cast<unsigned>(LibFunc::NumLibFuncs);
+       I != E; ++I) {
+    LibFunc::Func F = static_cast<LibFunc::Func>(I);
+    if (TLI.has(F))
+      Libcalls.push_back(TLI.getName(F));
+  }
+
+  // TargetLowering has info on library calls that CodeGen expects to be
+  // available, both from the C runtime and compiler-rt.
+  if (Lowering)
+    for (unsigned I = 0, E = static_cast<unsigned>(RTLIB::UNKNOWN_LIBCALL);
+         I != E; ++I)
+      if (const char *Name
+          = Lowering->getLibcallName(static_cast<RTLIB::Libcall>(I)))
+        Libcalls.push_back(Name);
+
+  array_pod_sort(Libcalls.begin(), Libcalls.end());
+  Libcalls.erase(std::unique(Libcalls.begin(), Libcalls.end()),
+                 Libcalls.end());
+}
+
+void LTOCodeGenerator::applyScopeRestrictions() {
+  if (ScopeRestrictionsDone)
+    return;
+  Module *mergedModule = Linker.getModule();
+
+  // Start off with a verification pass.
+  PassManager passes;
+  passes.add(createVerifierPass());
+
+  // mark which symbols can not be internalized
+  Mangler Mangler(TargetMach);
+  std::vector<const char*> MustPreserveList;
+  SmallPtrSet<GlobalValue*, 8> AsmUsed;
+  std::vector<StringRef> Libcalls;
+  TargetLibraryInfo TLI(Triple(TargetMach->getTargetTriple()));
+  accumulateAndSortLibcalls(Libcalls, TLI, TargetMach->getTargetLowering());
+
+  for (Module::iterator f = mergedModule->begin(),
+         e = mergedModule->end(); f != e; ++f)
+    applyRestriction(*f, Libcalls, MustPreserveList, AsmUsed, Mangler);
+  for (Module::global_iterator v = mergedModule->global_begin(),
+         e = mergedModule->global_end(); v !=  e; ++v)
+    applyRestriction(*v, Libcalls, MustPreserveList, AsmUsed, Mangler);
+  for (Module::alias_iterator a = mergedModule->alias_begin(),
+         e = mergedModule->alias_end(); a != e; ++a)
+    applyRestriction(*a, Libcalls, MustPreserveList, AsmUsed, Mangler);
+
+  GlobalVariable *LLVMCompilerUsed =
+    mergedModule->getGlobalVariable("llvm.compiler.used");
+  findUsedValues(LLVMCompilerUsed, AsmUsed);
+  if (LLVMCompilerUsed)
+    LLVMCompilerUsed->eraseFromParent();
+
+  if (!AsmUsed.empty()) {
+    llvm::Type *i8PTy = llvm::Type::getInt8PtrTy(Context);
+    std::vector<Constant*> asmUsed2;
+    for (SmallPtrSet<GlobalValue*, 16>::const_iterator i = AsmUsed.begin(),
+           e = AsmUsed.end(); i !=e; ++i) {
+      GlobalValue *GV = *i;
+      Constant *c = ConstantExpr::getBitCast(GV, i8PTy);
+      asmUsed2.push_back(c);
+    }
+
+    llvm::ArrayType *ATy = llvm::ArrayType::get(i8PTy, asmUsed2.size());
+    LLVMCompilerUsed =
+      new llvm::GlobalVariable(*mergedModule, ATy, false,
+                               llvm::GlobalValue::AppendingLinkage,
+                               llvm::ConstantArray::get(ATy, asmUsed2),
+                               "llvm.compiler.used");
+
+    LLVMCompilerUsed->setSection("llvm.metadata");
+  }
+
+  passes.add(createInternalizePass(MustPreserveList));
+
+  // apply scope restrictions
+  passes.run(*mergedModule);
+
+  ScopeRestrictionsDone = true;
+}
+
+/// Optimize merged modules using various IPO passes
+bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
+                                          bool DisableOpt,
+                                          bool DisableInline,
+                                          bool DisableGVNLoadPRE,
+                                          std::string &errMsg) {
+  if (!this->determineTarget(errMsg))
+    return false;
+
+  Module *mergedModule = Linker.getModule();
+
+  // Mark which symbols can not be internalized
+  this->applyScopeRestrictions();
+
+  // Instantiate the pass manager to organize the passes.
+  PassManager passes;
+
+  // Start off with a verification pass.
+  passes.add(createVerifierPass());
+
+  // Add an appropriate DataLayout instance for this module...
+  passes.add(new DataLayout(*TargetMach->getDataLayout()));
+  TargetMach->addAnalysisPasses(passes);
+
+  // Enabling internalize here would use its AllButMain variant. It
+  // keeps only main if it exists and does nothing for libraries. Instead
+  // we create the pass ourselves with the symbol list provided by the linker.
+  if (!DisableOpt)
+    PassManagerBuilder().populateLTOPassManager(passes,
+                                              /*Internalize=*/false,
+                                              !DisableInline,
+                                              DisableGVNLoadPRE);
+
+  // Make sure everything is still good.
+  passes.add(createVerifierPass());
+
+  PassManager codeGenPasses;
+
+  codeGenPasses.add(new DataLayout(*TargetMach->getDataLayout()));
+  TargetMach->addAnalysisPasses(codeGenPasses);
+
+  formatted_raw_ostream Out(out);
+
+  // If the bitcode files contain ARC code and were compiled with optimization,
+  // the ObjCARCContractPass must be run, so do it unconditionally here.
+  codeGenPasses.add(createObjCARCContractPass());
+
+  if (TargetMach->addPassesToEmitFile(codeGenPasses, Out,
+                                      TargetMachine::CGFT_ObjectFile)) {
+    errMsg = "target file type not supported";
+    return false;
+  }
+
+  // Run our queue of passes all at once now, efficiently.
+  passes.run(*mergedModule);
+
+  // Run the code generator, and write assembly file
+  codeGenPasses.run(*mergedModule);
+
+  return true;
+}
+
+/// setCodeGenDebugOptions - Set codegen debugging options to aid in debugging
+/// LTO problems.
+void LTOCodeGenerator::setCodeGenDebugOptions(const char *options) {
+  for (std::pair<StringRef, StringRef> o = getToken(options);
+       !o.first.empty(); o = getToken(o.second)) {
+    // ParseCommandLineOptions() expects argv[0] to be program name. Lazily add
+    // that.
+    if (CodegenOptions.empty())
+      CodegenOptions.push_back(strdup("libLLVMLTO"));
+    CodegenOptions.push_back(strdup(o.first.str().c_str()));
+  }
+}
+
+void LTOCodeGenerator::parseCodeGenDebugOptions() {
+  // if options were requested, set them
+  if (!CodegenOptions.empty())
+    cl::ParseCommandLineOptions(CodegenOptions.size(),
+                                const_cast<char **>(&CodegenOptions[0]));
+}
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
new file mode 100644
index 0000000..65416be
--- /dev/null
+++ b/lib/LTO/LTOModule.cpp
@@ -0,0 +1,794 @@
+//===-- LTOModule.cpp - LLVM Link Time Optimizer --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Link Time Optimization library. This library is
+// intended to be used by linker to optimize code at link time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/LTO/LTOModule.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+using namespace llvm;
+
+LTOModule::LTOModule(llvm::Module *m, llvm::TargetMachine *t)
+  : _module(m), _target(t),
+    _context(_target->getMCAsmInfo(), _target->getRegisterInfo(), NULL),
+    _mangler(t) {}
+
+/// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM
+/// bitcode.
+bool LTOModule::isBitcodeFile(const void *mem, size_t length) {
+  return sys::fs::identify_magic(StringRef((const char *)mem, length)) ==
+         sys::fs::file_magic::bitcode;
+}
+
+bool LTOModule::isBitcodeFile(const char *path) {
+  sys::fs::file_magic type;
+  if (sys::fs::identify_magic(path, type))
+    return false;
+  return type == sys::fs::file_magic::bitcode;
+}
+
+/// isBitcodeFileForTarget - Returns 'true' if the file (or memory contents) is
+/// LLVM bitcode for the specified triple.
+bool LTOModule::isBitcodeFileForTarget(const void *mem, size_t length,
+                                       const char *triplePrefix) {
+  MemoryBuffer *buffer = makeBuffer(mem, length);
+  if (!buffer)
+    return false;
+  return isTargetMatch(buffer, triplePrefix);
+}
+
+bool LTOModule::isBitcodeFileForTarget(const char *path,
+                                       const char *triplePrefix) {
+  OwningPtr<MemoryBuffer> buffer;
+  if (MemoryBuffer::getFile(path, buffer))
+    return false;
+  return isTargetMatch(buffer.take(), triplePrefix);
+}
+
+/// isTargetMatch - Returns 'true' if the memory buffer is for the specified
+/// target triple.
+bool LTOModule::isTargetMatch(MemoryBuffer *buffer, const char *triplePrefix) {
+  std::string Triple = getBitcodeTargetTriple(buffer, getGlobalContext());
+  delete buffer;
+  return strncmp(Triple.c_str(), triplePrefix, strlen(triplePrefix)) == 0;
+}
+
+/// makeLTOModule - Create an LTOModule. N.B. These methods take ownership of
+/// the buffer.
+LTOModule *LTOModule::makeLTOModule(const char *path, TargetOptions options,
+                                    std::string &errMsg) {
+  OwningPtr<MemoryBuffer> buffer;
+  if (error_code ec = MemoryBuffer::getFile(path, buffer)) {
+    errMsg = ec.message();
+    return NULL;
+  }
+  return makeLTOModule(buffer.take(), options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(int fd, const char *path,
+                                    size_t size, TargetOptions options,
+                                    std::string &errMsg) {
+  return makeLTOModule(fd, path, size, 0, options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(int fd, const char *path,
+                                    size_t map_size,
+                                    off_t offset,
+                                    TargetOptions options,
+                                    std::string &errMsg) {
+  OwningPtr<MemoryBuffer> buffer;
+  if (error_code ec =
+          MemoryBuffer::getOpenFileSlice(fd, path, buffer, map_size, offset)) {
+    errMsg = ec.message();
+    return NULL;
+  }
+  return makeLTOModule(buffer.take(), options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(const void *mem, size_t length,
+                                    TargetOptions options,
+                                    std::string &errMsg) {
+  OwningPtr<MemoryBuffer> buffer(makeBuffer(mem, length));
+  if (!buffer)
+    return NULL;
+  return makeLTOModule(buffer.take(), options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
+                                    TargetOptions options,
+                                    std::string &errMsg) {
+  // parse bitcode buffer
+  OwningPtr<Module> m(getLazyBitcodeModule(buffer, getGlobalContext(),
+                                           &errMsg));
+  if (!m) {
+    delete buffer;
+    return NULL;
+  }
+
+  std::string TripleStr = m->getTargetTriple();
+  if (TripleStr.empty())
+    TripleStr = sys::getDefaultTargetTriple();
+  llvm::Triple Triple(TripleStr);
+
+  // find machine architecture for this module
+  const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
+  if (!march)
+    return NULL;
+
+  // construct LTOModule, hand over ownership of module and target
+  SubtargetFeatures Features;
+  Features.getDefaultSubtargetFeatures(Triple);
+  std::string FeatureStr = Features.getString();
+  // Set a default CPU for Darwin triples.
+  std::string CPU;
+  if (Triple.isOSDarwin()) {
+    if (Triple.getArch() == llvm::Triple::x86_64)
+      CPU = "core2";
+    else if (Triple.getArch() == llvm::Triple::x86)
+      CPU = "yonah";
+  }
+
+  TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr,
+                                                     options);
+  m->MaterializeAllPermanently();
+
+  LTOModule *Ret = new LTOModule(m.take(), target);
+  if (Ret->parseSymbols(errMsg)) {
+    delete Ret;
+    return NULL;
+  }
+
+  return Ret;
+}
+
+/// makeBuffer - Create a MemoryBuffer from a memory range.
+MemoryBuffer *LTOModule::makeBuffer(const void *mem, size_t length) {
+  const char *startPtr = (const char*)mem;
+  return MemoryBuffer::getMemBuffer(StringRef(startPtr, length), "", false);
+}
+
+/// objcClassNameFromExpression - Get string that the data pointer points to.
+bool
+LTOModule::objcClassNameFromExpression(const Constant *c, std::string &name) {
+  if (const ConstantExpr *ce = dyn_cast<ConstantExpr>(c)) {
+    Constant *op = ce->getOperand(0);
+    if (GlobalVariable *gvn = dyn_cast<GlobalVariable>(op)) {
+      Constant *cn = gvn->getInitializer();
+      if (ConstantDataArray *ca = dyn_cast<ConstantDataArray>(cn)) {
+        if (ca->isCString()) {
+          name = ".objc_class_name_" + ca->getAsCString().str();
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+/// addObjCClass - Parse i386/ppc ObjC class data structure.
+void LTOModule::addObjCClass(const GlobalVariable *clgv) {
+  const ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer());
+  if (!c) return;
+
+  // second slot in __OBJC,__class is pointer to superclass name
+  std::string superclassName;
+  if (objcClassNameFromExpression(c->getOperand(1), superclassName)) {
+    NameAndAttributes info;
+    StringMap<NameAndAttributes>::value_type &entry =
+      _undefines.GetOrCreateValue(superclassName);
+    if (!entry.getValue().name) {
+      const char *symbolName = entry.getKey().data();
+      info.name = symbolName;
+      info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+      info.isFunction = false;
+      info.symbol = clgv;
+      entry.setValue(info);
+    }
+  }
+
+  // third slot in __OBJC,__class is pointer to class name
+  std::string className;
+  if (objcClassNameFromExpression(c->getOperand(2), className)) {
+    StringSet::value_type &entry = _defines.GetOrCreateValue(className);
+    entry.setValue(1);
+
+    NameAndAttributes info;
+    info.name = entry.getKey().data();
+    info.attributes = LTO_SYMBOL_PERMISSIONS_DATA |
+      LTO_SYMBOL_DEFINITION_REGULAR | LTO_SYMBOL_SCOPE_DEFAULT;
+    info.isFunction = false;
+    info.symbol = clgv;
+    _symbols.push_back(info);
+  }
+}
+
+/// addObjCCategory - Parse i386/ppc ObjC category data structure.
+void LTOModule::addObjCCategory(const GlobalVariable *clgv) {
+  const ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer());
+  if (!c) return;
+
+  // second slot in __OBJC,__category is pointer to target class name
+  std::string targetclassName;
+  if (!objcClassNameFromExpression(c->getOperand(1), targetclassName))
+    return;
+
+  NameAndAttributes info;
+  StringMap<NameAndAttributes>::value_type &entry =
+    _undefines.GetOrCreateValue(targetclassName);
+
+  if (entry.getValue().name)
+    return;
+
+  const char *symbolName = entry.getKey().data();
+  info.name = symbolName;
+  info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+  info.isFunction = false;
+  info.symbol = clgv;
+  entry.setValue(info);
+}
+
+/// addObjCClassRef - Parse i386/ppc ObjC class list data structure.
+void LTOModule::addObjCClassRef(const GlobalVariable *clgv) {
+  std::string targetclassName;
+  if (!objcClassNameFromExpression(clgv->getInitializer(), targetclassName))
+    return;
+
+  NameAndAttributes info;
+  StringMap<NameAndAttributes>::value_type &entry =
+    _undefines.GetOrCreateValue(targetclassName);
+  if (entry.getValue().name)
+    return;
+
+  const char *symbolName = entry.getKey().data();
+  info.name = symbolName;
+  info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+  info.isFunction = false;
+  info.symbol = clgv;
+  entry.setValue(info);
+}
+
+/// addDefinedDataSymbol - Add a data symbol as defined to the list.
+void LTOModule::addDefinedDataSymbol(const GlobalValue *v) {
+  // Add to list of defined symbols.
+  addDefinedSymbol(v, false);
+
+  if (!v->hasSection() /* || !isTargetDarwin */)
+    return;
+
+  // Special case i386/ppc ObjC data structures in magic sections:
+  // The issue is that the old ObjC object format did some strange
+  // contortions to avoid real linker symbols.  For instance, the
+  // ObjC class data structure is allocated statically in the executable
+  // that defines that class.  That data structures contains a pointer to
+  // its superclass.  But instead of just initializing that part of the
+  // struct to the address of its superclass, and letting the static and
+  // dynamic linkers do the rest, the runtime works by having that field
+  // instead point to a C-string that is the name of the superclass.
+  // At runtime the objc initialization updates that pointer and sets
+  // it to point to the actual super class.  As far as the linker
+  // knows it is just a pointer to a string.  But then someone wanted the
+  // linker to issue errors at build time if the superclass was not found.
+  // So they figured out a way in mach-o object format to use an absolute
+  // symbols (.objc_class_name_Foo = 0) and a floating reference
+  // (.reference .objc_class_name_Bar) to cause the linker into erroring when
+  // a class was missing.
+  // The following synthesizes the implicit .objc_* symbols for the linker
+  // from the ObjC data structures generated by the front end.
+
+  // special case if this data blob is an ObjC class definition
+  if (v->getSection().compare(0, 15, "__OBJC,__class,") == 0) {
+    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+      addObjCClass(gv);
+    }
+  }
+
+  // special case if this data blob is an ObjC category definition
+  else if (v->getSection().compare(0, 18, "__OBJC,__category,") == 0) {
+    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+      addObjCCategory(gv);
+    }
+  }
+
+  // special case if this data blob is the list of referenced classes
+  else if (v->getSection().compare(0, 18, "__OBJC,__cls_refs,") == 0) {
+    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+      addObjCClassRef(gv);
+    }
+  }
+}
+
+/// addDefinedFunctionSymbol - Add a function symbol as defined to the list.
+void LTOModule::addDefinedFunctionSymbol(const Function *f) {
+  // add to list of defined symbols
+  addDefinedSymbol(f, true);
+}
+
+static bool canBeHidden(const GlobalValue *GV) {
+  GlobalValue::LinkageTypes L = GV->getLinkage();
+
+  if (L != GlobalValue::LinkOnceODRLinkage)
+    return false;
+
+  if (GV->hasUnnamedAddr())
+    return true;
+
+  GlobalStatus GS;
+  if (GlobalStatus::analyzeGlobal(GV, GS))
+    return false;
+
+  return !GS.IsCompared;
+}
+
+/// addDefinedSymbol - Add a defined symbol to the list.
+void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) {
+  // ignore all llvm.* symbols
+  if (def->getName().startswith("llvm."))
+    return;
+
+  // string is owned by _defines
+  SmallString<64> Buffer;
+  _mangler.getNameWithPrefix(Buffer, def, false);
+
+  // set alignment part log2() can have rounding errors
+  uint32_t align = def->getAlignment();
+  uint32_t attr = align ? countTrailingZeros(def->getAlignment()) : 0;
+
+  // set permissions part
+  if (isFunction) {
+    attr |= LTO_SYMBOL_PERMISSIONS_CODE;
+  } else {
+    const GlobalVariable *gv = dyn_cast<GlobalVariable>(def);
+    if (gv && gv->isConstant())
+      attr |= LTO_SYMBOL_PERMISSIONS_RODATA;
+    else
+      attr |= LTO_SYMBOL_PERMISSIONS_DATA;
+  }
+
+  // set definition part
+  if (def->hasWeakLinkage() || def->hasLinkOnceLinkage() ||
+      def->hasLinkerPrivateWeakLinkage())
+    attr |= LTO_SYMBOL_DEFINITION_WEAK;
+  else if (def->hasCommonLinkage())
+    attr |= LTO_SYMBOL_DEFINITION_TENTATIVE;
+  else
+    attr |= LTO_SYMBOL_DEFINITION_REGULAR;
+
+  // set scope part
+  if (def->hasHiddenVisibility())
+    attr |= LTO_SYMBOL_SCOPE_HIDDEN;
+  else if (def->hasProtectedVisibility())
+    attr |= LTO_SYMBOL_SCOPE_PROTECTED;
+  else if (canBeHidden(def))
+    attr |= LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN;
+  else if (def->hasExternalLinkage() || def->hasWeakLinkage() ||
+           def->hasLinkOnceLinkage() || def->hasCommonLinkage() ||
+           def->hasLinkerPrivateWeakLinkage())
+    attr |= LTO_SYMBOL_SCOPE_DEFAULT;
+  else
+    attr |= LTO_SYMBOL_SCOPE_INTERNAL;
+
+  StringSet::value_type &entry = _defines.GetOrCreateValue(Buffer);
+  entry.setValue(1);
+
+  // fill information structure
+  NameAndAttributes info;
+  StringRef Name = entry.getKey();
+  info.name = Name.data();
+  assert(info.name[Name.size()] == '\0');
+  info.attributes = attr;
+  info.isFunction = isFunction;
+  info.symbol = def;
+
+  // add to table of symbols
+  _symbols.push_back(info);
+}
+
+/// addAsmGlobalSymbol - Add a global symbol from module-level ASM to the
+/// defined list.
+void LTOModule::addAsmGlobalSymbol(const char *name,
+                                   lto_symbol_attributes scope) {
+  StringSet::value_type &entry = _defines.GetOrCreateValue(name);
+
+  // only add new define if not already defined
+  if (entry.getValue())
+    return;
+
+  entry.setValue(1);
+
+  NameAndAttributes &info = _undefines[entry.getKey().data()];
+
+  if (info.symbol == 0) {
+    // FIXME: This is trying to take care of module ASM like this:
+    //
+    //   module asm ".zerofill __FOO, __foo, _bar_baz_qux, 0"
+    //
+    // but is gross and its mother dresses it funny. Have the ASM parser give us
+    // more details for this type of situation so that we're not guessing so
+    // much.
+
+    // fill information structure
+    info.name = entry.getKey().data();
+    info.attributes =
+      LTO_SYMBOL_PERMISSIONS_DATA | LTO_SYMBOL_DEFINITION_REGULAR | scope;
+    info.isFunction = false;
+    info.symbol = 0;
+
+    // add to table of symbols
+    _symbols.push_back(info);
+    return;
+  }
+
+  if (info.isFunction)
+    addDefinedFunctionSymbol(cast<Function>(info.symbol));
+  else
+    addDefinedDataSymbol(info.symbol);
+
+  _symbols.back().attributes &= ~LTO_SYMBOL_SCOPE_MASK;
+  _symbols.back().attributes |= scope;
+}
+
+/// addAsmGlobalSymbolUndef - Add a global symbol from module-level ASM to the
+/// undefined list.
+void LTOModule::addAsmGlobalSymbolUndef(const char *name) {
+  StringMap<NameAndAttributes>::value_type &entry =
+    _undefines.GetOrCreateValue(name);
+
+  _asm_undefines.push_back(entry.getKey().data());
+
+  // we already have the symbol
+  if (entry.getValue().name)
+    return;
+
+  uint32_t attr = LTO_SYMBOL_DEFINITION_UNDEFINED;;
+  attr |= LTO_SYMBOL_SCOPE_DEFAULT;
+  NameAndAttributes info;
+  info.name = entry.getKey().data();
+  info.attributes = attr;
+  info.isFunction = false;
+  info.symbol = 0;
+
+  entry.setValue(info);
+}
+
+/// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet to a
+/// list to be resolved later.
+void
+LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) {
+  // ignore all llvm.* symbols
+  if (decl->getName().startswith("llvm."))
+    return;
+
+  // ignore all aliases
+  if (isa<GlobalAlias>(decl))
+    return;
+
+  SmallString<64> name;
+  _mangler.getNameWithPrefix(name, decl, false);
+
+  StringMap<NameAndAttributes>::value_type &entry =
+    _undefines.GetOrCreateValue(name);
+
+  // we already have the symbol
+  if (entry.getValue().name)
+    return;
+
+  NameAndAttributes info;
+
+  info.name = entry.getKey().data();
+
+  if (decl->hasExternalWeakLinkage())
+    info.attributes = LTO_SYMBOL_DEFINITION_WEAKUNDEF;
+  else
+    info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+
+  info.isFunction = isFunc;
+  info.symbol = decl;
+
+  entry.setValue(info);
+}
+
+namespace {
+  class RecordStreamer : public MCStreamer {
+  public:
+    enum State { NeverSeen, Global, Defined, DefinedGlobal, Used };
+
+  private:
+    StringMap<State> Symbols;
+
+    void markDefined(const MCSymbol &Symbol) {
+      State &S = Symbols[Symbol.getName()];
+      switch (S) {
+      case DefinedGlobal:
+      case Global:
+        S = DefinedGlobal;
+        break;
+      case NeverSeen:
+      case Defined:
+      case Used:
+        S = Defined;
+        break;
+      }
+    }
+    void markGlobal(const MCSymbol &Symbol) {
+      State &S = Symbols[Symbol.getName()];
+      switch (S) {
+      case DefinedGlobal:
+      case Defined:
+        S = DefinedGlobal;
+        break;
+
+      case NeverSeen:
+      case Global:
+      case Used:
+        S = Global;
+        break;
+      }
+    }
+    void markUsed(const MCSymbol &Symbol) {
+      State &S = Symbols[Symbol.getName()];
+      switch (S) {
+      case DefinedGlobal:
+      case Defined:
+      case Global:
+        break;
+
+      case NeverSeen:
+      case Used:
+        S = Used;
+        break;
+      }
+    }
+
+    // FIXME: mostly copied for the obj streamer.
+    void AddValueSymbols(const MCExpr *Value) {
+      switch (Value->getKind()) {
+      case MCExpr::Target:
+        // FIXME: What should we do in here?
+        break;
+
+      case MCExpr::Constant:
+        break;
+
+      case MCExpr::Binary: {
+        const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+        AddValueSymbols(BE->getLHS());
+        AddValueSymbols(BE->getRHS());
+        break;
+      }
+
+      case MCExpr::SymbolRef:
+        markUsed(cast<MCSymbolRefExpr>(Value)->getSymbol());
+        break;
+
+      case MCExpr::Unary:
+        AddValueSymbols(cast<MCUnaryExpr>(Value)->getSubExpr());
+        break;
+      }
+    }
+
+  public:
+    typedef StringMap<State>::const_iterator const_iterator;
+
+    const_iterator begin() {
+      return Symbols.begin();
+    }
+
+    const_iterator end() {
+      return Symbols.end();
+    }
+
+    RecordStreamer(MCContext &Context) : MCStreamer(Context, 0) {}
+
+    virtual void EmitInstruction(const MCInst &Inst) {
+      // Scan for values.
+      for (unsigned i = Inst.getNumOperands(); i--; )
+        if (Inst.getOperand(i).isExpr())
+          AddValueSymbols(Inst.getOperand(i).getExpr());
+    }
+    virtual void EmitLabel(MCSymbol *Symbol) {
+      Symbol->setSection(*getCurrentSection().first);
+      markDefined(*Symbol);
+    }
+    virtual void EmitDebugLabel(MCSymbol *Symbol) {
+      EmitLabel(Symbol);
+    }
+    virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+      // FIXME: should we handle aliases?
+      markDefined(*Symbol);
+    }
+    virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
+      if (Attribute == MCSA_Global)
+        markGlobal(*Symbol);
+      return true;
+    }
+    virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+                              uint64_t Size , unsigned ByteAlignment) {
+      markDefined(*Symbol);
+    }
+    virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                  unsigned ByteAlignment) {
+      markDefined(*Symbol);
+    }
+
+    virtual void EmitBundleAlignMode(unsigned AlignPow2) {}
+    virtual void EmitBundleLock(bool AlignToEnd) {}
+    virtual void EmitBundleUnlock() {}
+
+    // Noop calls.
+    virtual void ChangeSection(const MCSection *Section,
+                               const MCExpr *Subsection) {}
+    virtual void InitToTextSection() {}
+    virtual void InitSections() {}
+    virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {}
+    virtual void EmitThumbFunc(MCSymbol *Func) {}
+    virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
+    virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {}
+    virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
+    virtual void EmitCOFFSymbolStorageClass(int StorageClass) {}
+    virtual void EmitCOFFSymbolType(int Type) {}
+    virtual void EndCOFFSymbolDef() {}
+    virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
+    virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                       unsigned ByteAlignment) {}
+    virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                                uint64_t Size, unsigned ByteAlignment) {}
+    virtual void EmitBytes(StringRef Data) {}
+    virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {}
+    virtual void EmitULEB128Value(const MCExpr *Value) {}
+    virtual void EmitSLEB128Value(const MCExpr *Value) {}
+    virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+                                      unsigned ValueSize,
+                                      unsigned MaxBytesToEmit) {}
+    virtual void EmitCodeAlignment(unsigned ByteAlignment,
+                                   unsigned MaxBytesToEmit) {}
+    virtual bool EmitValueToOffset(const MCExpr *Offset,
+                                   unsigned char Value ) { return false; }
+    virtual void EmitFileDirective(StringRef Filename) {}
+    virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+                                          const MCSymbol *LastLabel,
+                                          const MCSymbol *Label,
+                                          unsigned PointerSize) {}
+    virtual void FinishImpl() {}
+    virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+      RecordProcEnd(Frame);
+    }
+  };
+} // end anonymous namespace
+
+/// addAsmGlobalSymbols - Add global symbols from module-level ASM to the
+/// defined or undefined lists.
+bool LTOModule::addAsmGlobalSymbols(std::string &errMsg) {
+  const std::string &inlineAsm = _module->getModuleInlineAsm();
+  if (inlineAsm.empty())
+    return false;
+
+  OwningPtr<RecordStreamer> Streamer(new RecordStreamer(_context));
+  MemoryBuffer *Buffer = MemoryBuffer::getMemBuffer(inlineAsm);
+  SourceMgr SrcMgr;
+  SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
+  OwningPtr<MCAsmParser> Parser(createMCAsmParser(SrcMgr,
+                                                  _context, *Streamer,
+                                                  *_target->getMCAsmInfo()));
+  const Target &T = _target->getTarget();
+  OwningPtr<MCInstrInfo> MCII(T.createMCInstrInfo());
+  OwningPtr<MCSubtargetInfo>
+    STI(T.createMCSubtargetInfo(_target->getTargetTriple(),
+                                _target->getTargetCPU(),
+                                _target->getTargetFeatureString()));
+  OwningPtr<MCTargetAsmParser> TAP(T.createMCAsmParser(*STI, *Parser.get(), *MCII));
+  if (!TAP) {
+    errMsg = "target " + std::string(T.getName()) +
+      " does not define AsmParser.";
+    return true;
+  }
+
+  Parser->setTargetParser(*TAP);
+  if (Parser->Run(false))
+    return true;
+
+  for (RecordStreamer::const_iterator i = Streamer->begin(),
+         e = Streamer->end(); i != e; ++i) {
+    StringRef Key = i->first();
+    RecordStreamer::State Value = i->second;
+    if (Value == RecordStreamer::DefinedGlobal)
+      addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_DEFAULT);
+    else if (Value == RecordStreamer::Defined)
+      addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_INTERNAL);
+    else if (Value == RecordStreamer::Global ||
+             Value == RecordStreamer::Used)
+      addAsmGlobalSymbolUndef(Key.data());
+  }
+
+  return false;
+}
+
+/// isDeclaration - Return 'true' if the global value is a declaration.
+static bool isDeclaration(const GlobalValue &V) {
+  if (V.hasAvailableExternallyLinkage())
+    return true;
+
+  if (V.isMaterializable())
+    return false;
+
+  return V.isDeclaration();
+}
+
+/// parseSymbols - Parse the symbols from the module and model-level ASM and add
+/// them to either the defined or undefined lists.
+bool LTOModule::parseSymbols(std::string &errMsg) {
+  // add functions
+  for (Module::iterator f = _module->begin(), e = _module->end(); f != e; ++f) {
+    if (isDeclaration(*f))
+      addPotentialUndefinedSymbol(f, true);
+    else
+      addDefinedFunctionSymbol(f);
+  }
+
+  // add data
+  for (Module::global_iterator v = _module->global_begin(),
+         e = _module->global_end(); v !=  e; ++v) {
+    if (isDeclaration(*v))
+      addPotentialUndefinedSymbol(v, false);
+    else
+      addDefinedDataSymbol(v);
+  }
+
+  // add asm globals
+  if (addAsmGlobalSymbols(errMsg))
+    return true;
+
+  // add aliases
+  for (Module::alias_iterator a = _module->alias_begin(),
+         e = _module->alias_end(); a != e; ++a) {
+    if (isDeclaration(*a->getAliasedGlobal()))
+      // Is an alias to a declaration.
+      addPotentialUndefinedSymbol(a, false);
+    else
+      addDefinedDataSymbol(a);
+  }
+
+  // make symbols for all undefines
+  for (StringMap<NameAndAttributes>::iterator u =_undefines.begin(),
+         e = _undefines.end(); u != e; ++u) {
+    // If this symbol also has a definition, then don't make an undefine because
+    // it is a tentative definition.
+    if (_defines.count(u->getKey())) continue;
+    NameAndAttributes info = u->getValue();
+    _symbols.push_back(info);
+  }
+
+  return false;
+}
diff --git a/lib/LTO/Makefile b/lib/LTO/Makefile
new file mode 100644
index 0000000..55e2a5e
--- /dev/null
+++ b/lib/LTO/Makefile
@@ -0,0 +1,15 @@
+##===- lib/LTO/Makefile ------------------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMLTO
+BUILD_ARCHIVE := 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index ce02c37..8f2200e 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -22,8 +22,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-
-#include <ctype.h>
+#include <cctype>
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -706,7 +705,11 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
   if (DstGV->getVisibility() != SrcGV->getVisibility())
     return emitError(
             "Appending variables with different visibility need to be linked!");
-  
+
+  if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr())
+    return emitError(
+        "Appending variables with different unnamed_addr need to be linked!");
+
   if (DstGV->getSection() != SrcGV->getSection())
     return emitError(
           "Appending variables with different section name need to be linked!");
@@ -748,6 +751,7 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
 bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
   GlobalValue *DGV = getLinkedToGlobal(SGV);
   llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+  bool HasUnnamedAddr = SGV->hasUnnamedAddr();
 
   if (DGV) {
     // Concatenation of appending linkage variables is magic and handled later.
@@ -762,6 +766,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
     if (getLinkageResult(DGV, SGV, NewLinkage, NV, LinkFromSrc))
       return true;
     NewVisibility = NV;
+    HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
 
     // If we're not linking from the source, then keep the definition that we
     // have.
@@ -770,10 +775,11 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
       if (GlobalVariable *DGVar = dyn_cast<GlobalVariable>(DGV))
         if (DGVar->isDeclaration() && SGV->isConstant() && !DGVar->isConstant())
           DGVar->setConstant(true);
-      
-      // Set calculated linkage and visibility.
+
+      // Set calculated linkage, visibility and unnamed_addr.
       DGV->setLinkage(NewLinkage);
       DGV->setVisibility(*NewVisibility);
+      DGV->setUnnamedAddr(HasUnnamedAddr);
 
       // Make sure to remember this mapping.
       ValueMap[SGV] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGV->getType()));
@@ -799,6 +805,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
   copyGVAttributes(NewDGV, SGV);
   if (NewVisibility)
     NewDGV->setVisibility(*NewVisibility);
+  NewDGV->setUnnamedAddr(HasUnnamedAddr);
 
   if (DGV) {
     DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDGV, DGV->getType()));
@@ -815,6 +822,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
 bool ModuleLinker::linkFunctionProto(Function *SF) {
   GlobalValue *DGV = getLinkedToGlobal(SF);
   llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+  bool HasUnnamedAddr = SF->hasUnnamedAddr();
 
   if (DGV) {
     GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
@@ -823,11 +831,13 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
     if (getLinkageResult(DGV, SF, NewLinkage, NV, LinkFromSrc))
       return true;
     NewVisibility = NV;
+    HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
 
     if (!LinkFromSrc) {
       // Set calculated linkage
       DGV->setLinkage(NewLinkage);
       DGV->setVisibility(*NewVisibility);
+      DGV->setUnnamedAddr(HasUnnamedAddr);
 
       // Make sure to remember this mapping.
       ValueMap[SF] = ConstantExpr::getBitCast(DGV, TypeMap.get(SF->getType()));
@@ -855,6 +865,7 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
   copyGVAttributes(NewDF, SF);
   if (NewVisibility)
     NewDF->setVisibility(*NewVisibility);
+  NewDF->setUnnamedAddr(HasUnnamedAddr);
 
   if (DGV) {
     // Any uses of DF need to change to NewDF, with cast.
@@ -1250,6 +1261,13 @@ bool ModuleLinker::run() {
     // Skip if not linking from source.
     if (DoNotLinkFromSource.count(SF)) continue;
     
+    Function *DF = cast<Function>(ValueMap[SF]);
+    if (SF->hasPrefixData()) {
+      // Link in the prefix data.
+      DF->setPrefixData(MapValue(
+          SF->getPrefixData(), ValueMap, RF_None, &TypeMap, &ValMaterializer));
+    }
+
     // Skip if no body (function is external) or materialize.
     if (SF->isDeclaration()) {
       if (!SF->isMaterializable())
@@ -1258,7 +1276,7 @@ bool ModuleLinker::run() {
         return true;
     }
     
-    linkFunctionBody(cast<Function>(ValueMap[SF]), SF);
+    linkFunctionBody(DF, SF);
     SF->Dematerialize();
   }
 
@@ -1286,6 +1304,14 @@ bool ModuleLinker::run() {
         continue;
 
       Function *DF = cast<Function>(ValueMap[SF]);
+      if (SF->hasPrefixData()) {
+        // Link in the prefix data.
+        DF->setPrefixData(MapValue(SF->getPrefixData(),
+                                   ValueMap,
+                                   RF_None,
+                                   &TypeMap,
+                                   &ValMaterializer));
+      }
 
       // Materialize if necessary.
       if (SF->isDeclaration()) {
@@ -1326,6 +1352,11 @@ Linker::Linker(Module *M) : Composite(M) {
 Linker::~Linker() {
 }
 
+void Linker::deleteModule() {
+  delete Composite;
+  Composite = NULL;
+}
+
 bool Linker::linkInModule(Module *Src, unsigned Mode, std::string *ErrorMsg) {
   ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src, Mode);
   if (TheLinker.run()) {
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 89e2aaf..fa844ef 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_library(LLVMMC
   MCAsmInfo.cpp
   MCAsmInfoCOFF.cpp
   MCAsmInfoDarwin.cpp
+  MCAsmInfoELF.cpp
   MCAsmStreamer.cpp
   MCAssembler.cpp
   MCAtom.cpp
@@ -25,6 +26,7 @@ add_llvm_library(LLVMMC
   MCMachOStreamer.cpp
   MCMachObjectTargetWriter.cpp
   MCModule.cpp
+  MCModuleYAML.cpp
   MCNullStreamer.cpp
   MCObjectFileInfo.cpp
   MCObjectDisassembler.cpp
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 2db59ac..9899bb2 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -73,10 +73,6 @@ class ELFObjectWriter : public MCObjectWriter {
 
       // Support lexicographic sorting.
       bool operator<(const ELFSymbolData &RHS) const {
-        if (MCELF::GetType(*SymbolData) == ELF::STT_FILE)
-          return true;
-        if (MCELF::GetType(*RHS.SymbolData) == ELF::STT_FILE)
-          return false;
         return SymbolData->getSymbol().getName() <
                RHS.SymbolData->getSymbol().getName();
       }
@@ -98,6 +94,7 @@ class ELFObjectWriter : public MCObjectWriter {
     /// @{
 
     SmallString<256> StringTable;
+    std::vector<uint64_t> FileSymbolData;
     std::vector<ELFSymbolData> LocalSymbolData;
     std::vector<ELFSymbolData> ExternalSymbolData;
     std::vector<ELFSymbolData> UndefinedSymbolData;
@@ -551,7 +548,7 @@ void ELFObjectWriter::WriteSymbol(MCDataFragment *SymtabF,
   uint8_t Type = MCELF::GetType(Data);
   uint8_t Info = (Binding << ELF_STB_Shift) | (Type << ELF_STT_Shift);
 
-  // Other and Visibility share the same byte with Visability using the lower
+  // Other and Visibility share the same byte with Visibility using the lower
   // 2 bits
   uint8_t Visibility = MCELF::GetVisibility(OrigData);
   uint8_t Other = MCELF::getOther(OrigData) <<
@@ -590,8 +587,15 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF,
   // The first entry is the undefined symbol entry.
   WriteSymbolEntry(SymtabF, ShndxF, 0, 0, 0, 0, 0, 0, false);
 
+  for (unsigned i = 0, e = FileSymbolData.size(); i != e; ++i) {
+    WriteSymbolEntry(SymtabF, ShndxF, FileSymbolData[i],
+                     ELF::STT_FILE | ELF::STB_LOCAL, 0, 0,
+                     ELF::STV_DEFAULT, ELF::SHN_ABS, true);
+  }
+
   // Write the symbol table entries.
-  LastLocalSymbolIndex = LocalSymbolData.size() + 1;
+  LastLocalSymbolIndex = FileSymbolData.size() + LocalSymbolData.size() + 1;
+
   for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i) {
     ELFSymbolData &MSD = LocalSymbolData[i];
     WriteSymbol(SymtabF, ShndxF, MSD, Layout);
@@ -880,6 +884,20 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm,
   // FIXME: We could optimize suffixes in strtab in the same way we
   // optimize them in shstrtab.
 
+  for (MCAssembler::const_file_name_iterator it = Asm.file_names_begin(),
+                                            ie = Asm.file_names_end();
+                                            it != ie;
+                                            ++it) {
+    StringRef Name = *it;
+    uint64_t &Entry = StringIndexMap[Name];
+    if (!Entry) {
+      Entry = StringTable.size();
+      StringTable += Name;
+      StringTable += '\x00';
+    }
+    FileSymbolData.push_back(Entry);
+  }
+
   // Add the data for the symbols.
   for (MCAssembler::symbol_iterator it = Asm.symbol_begin(),
          ie = Asm.symbol_end(); it != ie; ++it) {
@@ -964,7 +982,7 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm,
 
   // Set the symbol indices. Local symbols must come before all other
   // symbols with non-local bindings.
-  unsigned Index = 1;
+  unsigned Index = FileSymbolData.size() + 1;
   for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i)
     LocalSymbolData[i].SymbolData->setIndex(Index++);
 
@@ -1005,8 +1023,8 @@ void ELFObjectWriter::CreateRelocationSections(MCAssembler &Asm,
     unsigned Flags = 0;
     StringRef Group = "";
     if (Section.getFlags() & ELF::SHF_GROUP) {
-        Flags = ELF::SHF_GROUP;
-        Group = Section.getGroup()->getName();
+      Flags = ELF::SHF_GROUP;
+      Group = Section.getGroup()->getName();
     }
 
     const MCSectionELF *RelaSection =
@@ -1073,7 +1091,7 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm,
     else if (entry.Index < 0)
       entry.Index = getSymbolIndexInSymbolTable(Asm, entry.Symbol);
     else
-      entry.Index += LocalSymbolData.size();
+      entry.Index += FileSymbolData.size() + LocalSymbolData.size();
     if (is64Bit()) {
       String64(*F, entry.r_offset);
       if (TargetObjectWriter->isN64()) {
@@ -1104,11 +1122,10 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm,
   }
 }
 
-static int compareBySuffix(const void *a, const void *b) {
-  const MCSectionELF *secA = *static_cast<const MCSectionELF* const *>(a);
-  const MCSectionELF *secB = *static_cast<const MCSectionELF* const *>(b);
-  const StringRef &NameA = secA->getSectionName();
-  const StringRef &NameB = secB->getSectionName();
+static int compareBySuffix(const MCSectionELF *const *a,
+                           const MCSectionELF *const *b) {
+  const StringRef &NameA = (*a)->getSectionName();
+  const StringRef &NameB = (*b)->getSectionName();
   const unsigned sizeA = NameA.size();
   const unsigned sizeB = NameB.size();
   const unsigned len = std::min(sizeA, sizeB);
@@ -1299,10 +1316,12 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
     // Remove ".rel" and ".rela" prefixes.
     unsigned SecNameLen = (Section.getType() == ELF::SHT_REL) ? 4 : 5;
     StringRef SectionName = Section.getSectionName().substr(SecNameLen);
+    StringRef GroupName =
+        Section.getGroup() ? Section.getGroup()->getName() : "";
 
-    InfoSection = Asm.getContext().getELFSection(SectionName,
-                                                 ELF::SHT_PROGBITS, 0,
-                                                 SectionKind::getReadOnly());
+    InfoSection = Asm.getContext().getELFSection(SectionName, ELF::SHT_PROGBITS,
+                                                 0, SectionKind::getReadOnly(),
+                                                 0, GroupName);
     sh_info = SectionIndexMap.lookup(InfoSection);
     break;
   }
@@ -1352,11 +1371,12 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
                                        ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
                                        SectionKind::getText()));
     } else if (SecName.startswith(".ARM.exidx")) {
-      sh_link = SectionIndexMap.lookup(
-        Asm.getContext().getELFSection(SecName.substr(sizeof(".ARM.exidx") - 1),
-                                       ELF::SHT_PROGBITS,
-                                       ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
-                                       SectionKind::getText()));
+      StringRef GroupName =
+          Section.getGroup() ? Section.getGroup()->getName() : "";
+      sh_link = SectionIndexMap.lookup(Asm.getContext().getELFSection(
+          SecName.substr(sizeof(".ARM.exidx") - 1), ELF::SHT_PROGBITS,
+          ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, SectionKind::getText(), 0,
+          GroupName));
     }
   }
 
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 84e4075..28f1c95 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -35,7 +35,7 @@ MCAsmInfo::MCAsmInfo() {
   LinkerRequiresNonEmptyDwarfLines = false;
   MaxInstLength = 4;
   MinInstAlignment = 1;
-  PCSymbol = "$";
+  DollarIsPC = false;
   SeparatorString = ";";
   CommentColumn = 40;
   CommentString = "#";
@@ -50,10 +50,7 @@ MCAsmInfo::MCAsmInfo() {
   Code32Directive = ".code32";
   Code64Directive = ".code64";
   AssemblerDialect = 0;
-  AllowQuotesInName = false;
-  AllowNameToStartWithDigit = false;
-  AllowPeriodsInName = true;
-  AllowUTF8 = true;
+  AllowAtInName = false;
   UseDataRegionDirectives = false;
   ZeroDirective = "\t.zero\t";
   AsciiDirective = "\t.ascii\t";
@@ -76,8 +73,8 @@ MCAsmInfo::MCAsmInfo() {
   LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
   HasDotTypeDotSizeDirective = true;
   HasSingleParameterDotFile = true;
+  HasIdentDirective = false;
   HasNoDeadStrip = false;
-  HasSymbolResolver = false;
   WeakRefDirective = 0;
   WeakDefDirective = 0;
   LinkOnceDirective = 0;
@@ -87,7 +84,6 @@ MCAsmInfo::MCAsmInfo() {
   HasLEB128 = false;
   SupportsDebugInformation = false;
   ExceptionsType = ExceptionHandling::None;
-  DwarfUsesInlineInfoSection = false;
   DwarfUsesRelocationsAcrossSections = true;
   DwarfRegNumForCFI = false;
   HasMicrosoftFastStdCallMangling = false;
diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp
index 33350d9..9d9f98e 100644
--- a/lib/MC/MCAsmInfoCOFF.cpp
+++ b/lib/MC/MCAsmInfoCOFF.cpp
@@ -43,7 +43,6 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
 void MCAsmInfoMicrosoft::anchor() { }
 
 MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() {
-  AllowQuotesInName = true;
 }
 
 void MCAsmInfoGNUCOFF::anchor() { }
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index a0e3eba..704c816 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -26,7 +26,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
   GlobalPrefix = "_";
   PrivateGlobalPrefix = "L";
   LinkerPrivateGlobalPrefix = "l";
-  AllowQuotesInName = true;
   HasSingleParameterDotFile = false;
   HasSubsectionsViaSymbols = true;
 
@@ -58,7 +57,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
 
   HasDotTypeDotSizeDirective = false;
   HasNoDeadStrip = true;
-  HasSymbolResolver = true;
 
   DwarfUsesRelocationsAcrossSections = false;
 }
diff --git a/lib/MC/MCAsmInfoELF.cpp b/lib/MC/MCAsmInfoELF.cpp
new file mode 100644
index 0000000..8cf4e4f
--- /dev/null
+++ b/lib/MC/MCAsmInfoELF.cpp
@@ -0,0 +1,23 @@
+//===-- MCAsmInfoELF.cpp - ELF asm properties -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related what form asm statements
+// should take in general on ELF-based targets
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfoELF.h"
+using namespace llvm;
+
+void MCAsmInfoELF::anchor() { }
+
+MCAsmInfoELF::MCAsmInfoELF() {
+  HasIdentDirective = true;
+  WeakRefDirective = "\t.weak\t";
+}
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 781e400..ca49f8f 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormattedStream.h"
@@ -65,17 +66,15 @@ private:
   virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame);
 
 public:
-  MCAsmStreamer(MCContext &Context, formatted_raw_ostream &os,
-                bool isVerboseAsm, bool useLoc, bool useCFI,
-                bool useDwarfDirectory,
-                MCInstPrinter *printer, MCCodeEmitter *emitter,
-                MCAsmBackend *asmbackend,
-                bool showInst)
-    : MCStreamer(SK_AsmStreamer, Context), OS(os), MAI(Context.getAsmInfo()),
-      InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend),
-      CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm),
-      ShowInst(showInst), UseLoc(useLoc), UseCFI(useCFI),
-      UseDwarfDirectory(useDwarfDirectory) {
+  MCAsmStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
+                formatted_raw_ostream &os, bool isVerboseAsm, bool useLoc,
+                bool useCFI, bool useDwarfDirectory, MCInstPrinter *printer,
+                MCCodeEmitter *emitter, MCAsmBackend *asmbackend, bool showInst)
+      : MCStreamer(Context, TargetStreamer), OS(os), MAI(Context.getAsmInfo()),
+        InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend),
+        CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm),
+        ShowInst(showInst), UseLoc(useLoc), UseCFI(useCFI),
+        UseDwarfDirectory(useDwarfDirectory) {
     if (InstPrinter && IsVerboseAsm)
       InstPrinter->setCommentStream(CommentStream);
   }
@@ -154,7 +153,7 @@ public:
   virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
                                          const MCSymbol *Label);
 
-  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+  virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
 
   virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
   virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol);
@@ -214,6 +213,7 @@ public:
                                      unsigned Isa, unsigned Discriminator,
                                      StringRef FileName);
 
+  virtual void EmitIdent(StringRef IdentString);
   virtual void EmitCFISections(bool EH, bool Debug);
   virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
   virtual void EmitCFIDefCfaOffset(int64_t Offset);
@@ -229,6 +229,7 @@ public:
   virtual void EmitCFISignalFrame();
   virtual void EmitCFIUndefined(int64_t Register);
   virtual void EmitCFIRegister(int64_t Register1, int64_t Register2);
+  virtual void EmitCFIWindowSave();
 
   virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
   virtual void EmitWin64EHEndProc();
@@ -245,17 +246,6 @@ public:
   virtual void EmitWin64EHPushFrame(bool Code);
   virtual void EmitWin64EHEndProlog();
 
-  virtual void EmitFnStart();
-  virtual void EmitFnEnd();
-  virtual void EmitCantUnwind();
-  virtual void EmitPersonality(const MCSymbol *Personality);
-  virtual void EmitHandlerData();
-  virtual void EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
-  virtual void EmitPad(int64_t Offset);
-  virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool);
-
-  virtual void EmitTCEntry(const MCSymbol &S);
-
   virtual void EmitInstruction(const MCInst &Inst);
 
   virtual void EmitBundleAlignMode(unsigned AlignPow2);
@@ -265,15 +255,9 @@ public:
   /// EmitRawText - If this file is backed by an assembly streamer, this dumps
   /// the specified string in the output .s file.  This capability is
   /// indicated by the hasRawTextSupport() predicate.
-  virtual void EmitRawText(StringRef String);
+  virtual void EmitRawTextImpl(StringRef String);
 
   virtual void FinishImpl();
-
-  /// @}
-
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_AsmStreamer;
-  }
 };
 
 } // end anonymous namespace.
@@ -436,7 +420,7 @@ void MCAsmStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
 }
 
 
-void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
                                         MCSymbolAttr Attribute) {
   switch (Attribute) {
   case MCSA_Invalid: llvm_unreachable("Invalid symbol attribute");
@@ -447,11 +431,12 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
   case MCSA_ELF_TypeCommon:      /// .type _foo, STT_COMMON  # aka @common
   case MCSA_ELF_TypeNoType:      /// .type _foo, STT_NOTYPE  # aka @notype
   case MCSA_ELF_TypeGnuUniqueObject:  /// .type _foo, @gnu_unique_object
-    assert(MAI->hasDotTypeDotSizeDirective() && "Symbol Attr not supported");
+    if (!MAI->hasDotTypeDotSizeDirective())
+      return false; // Symbol attribute not supported
     OS << "\t.type\t" << *Symbol << ','
        << ((MAI->getCommentString()[0] != '@') ? '@' : '%');
     switch (Attribute) {
-    default: llvm_unreachable("Unknown ELF .type");
+    default: return false;
     case MCSA_ELF_TypeFunction:    OS << "function"; break;
     case MCSA_ELF_TypeIndFunction: OS << "gnu_indirect_function"; break;
     case MCSA_ELF_TypeObject:      OS << "object"; break;
@@ -461,7 +446,7 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     case MCSA_ELF_TypeGnuUniqueObject: OS << "gnu_unique_object"; break;
     }
     EmitEOL();
-    return;
+    return true;
   case MCSA_Global: // .globl/.global
     OS << MAI->getGlobalDirective();
     FlagMap[Symbol] |= EHGlobal;
@@ -491,6 +476,8 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
 
   OS << *Symbol;
   EmitEOL();
+
+  return true;
 }
 
 void MCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
@@ -530,6 +517,9 @@ void MCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
 
 void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                      unsigned ByteAlignment) {
+  // Common symbols do not belong to any actual section.
+  AssignSection(Symbol, NULL);
+
   OS << "\t.comm\t" << *Symbol << ',' << Size;
   if (ByteAlignment != 0) {
     if (MAI->getCOMMDirectiveAlignmentIsInBytes())
@@ -546,6 +536,9 @@ void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
 /// @param Size - The size of the common symbol.
 void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                           unsigned ByteAlign) {
+  // Common symbols do not belong to any actual section.
+  AssignSection(Symbol, NULL);
+
   OS << "\t.lcomm\t" << *Symbol << ',' << Size;
   if (ByteAlign > 1) {
     switch (MAI->getLCOMMDirectiveAlignmentType()) {
@@ -565,6 +558,9 @@ void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
 
 void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
                                  uint64_t Size, unsigned ByteAlignment) {
+  if (Symbol)
+    AssignSection(Symbol, Section);
+
   // Note: a .zerofill directive does not switch sections.
   OS << ".zerofill ";
 
@@ -585,6 +581,8 @@ void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
 // e.g. _a.
 void MCAsmStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
                                    uint64_t Size, unsigned ByteAlignment) {
+  AssignSection(Symbol, Section);
+
   assert(Symbol != NULL && "Symbol shouldn't be NULL!");
   // Instead of using the Section we'll just use the shortcut.
   // This is a mach-o specific directive and section.
@@ -654,7 +652,6 @@ void MCAsmStreamer::EmitBytes(StringRef Data) {
     OS << MAI->getAsciiDirective();
   }
 
-  OS << ' ';
   PrintQuotedString(Data, OS);
   EmitEOL();
 }
@@ -884,6 +881,13 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
   EmitEOL();
 }
 
+void MCAsmStreamer::EmitIdent(StringRef IdentString) {
+  assert(MAI->hasIdentDirective() && ".ident directive not supported");
+  OS << "\t.ident\t";
+  PrintQuotedString(IdentString, OS);
+  EmitEOL();
+}
+
 void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) {
   MCStreamer::EmitCFISections(EH, Debug);
 
@@ -1085,6 +1089,16 @@ void MCAsmStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
   EmitEOL();
 }
 
+void MCAsmStreamer::EmitCFIWindowSave() {
+  MCStreamer::EmitCFIWindowSave();
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_window_save";
+  EmitEOL();
+}
+
 void MCAsmStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
   MCStreamer::EmitWin64EHStartProc(Symbol);
 
@@ -1290,73 +1304,6 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst) {
   }
 }
 
-void MCAsmStreamer::EmitFnStart() {
-  OS << "\t.fnstart";
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitFnEnd() {
-  OS << "\t.fnend";
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitCantUnwind() {
-  OS << "\t.cantunwind";
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitHandlerData() {
-  OS << "\t.handlerdata";
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitPersonality(const MCSymbol *Personality) {
-  OS << "\t.personality " << Personality->getName();
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) {
-  OS << "\t.setfp\t";
-  InstPrinter->printRegName(OS, FpReg);
-  OS << ", ";
-  InstPrinter->printRegName(OS, SpReg);
-  if (Offset)
-    OS << ", #" << Offset;
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitPad(int64_t Offset) {
-  OS << "\t.pad\t#" << Offset;
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
-                                bool isVector) {
-  assert(RegList.size() && "RegList should not be empty");
-  if (isVector)
-    OS << "\t.vsave\t{";
-  else
-    OS << "\t.save\t{";
-
-  InstPrinter->printRegName(OS, RegList[0]);
-
-  for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
-    OS << ", ";
-    InstPrinter->printRegName(OS, RegList[i]);
-  }
-
-  OS << "}";
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitTCEntry(const MCSymbol &S) {
-  OS << "\t.tc ";
-  OS << S.getName();
-  OS << "[TC],";
-  OS << S.getName();
-  EmitEOL();
-}
-
 void MCAsmStreamer::EmitInstruction(const MCInst &Inst) {
   assert(getCurrentSection().first &&
          "Cannot emit contents before setting section!");
@@ -1399,7 +1346,7 @@ void MCAsmStreamer::EmitBundleUnlock() {
 /// EmitRawText - If this file is backed by an assembly streamer, this dumps
 /// the specified string in the output .s file.  This capability is
 /// indicated by the hasRawTextSupport() predicate.
-void MCAsmStreamer::EmitRawText(StringRef String) {
+void MCAsmStreamer::EmitRawTextImpl(StringRef String) {
   if (!String.empty() && String.back() == '\n')
     String = String.substr(0, String.size()-1);
   OS << String;
@@ -1418,14 +1365,16 @@ void MCAsmStreamer::FinishImpl() {
     MCGenDwarfInfo::Emit(this, LineSectionSymbol);
 
   if (!UseCFI)
-    EmitFrames(false);
+    EmitFrames(AsmBackend.get(), false);
 }
+
 MCStreamer *llvm::createAsmStreamer(MCContext &Context,
+                                    MCTargetStreamer *TargetStreamer,
                                     formatted_raw_ostream &OS,
-                                    bool isVerboseAsm, bool useLoc,
-                                    bool useCFI, bool useDwarfDirectory,
-                                    MCInstPrinter *IP, MCCodeEmitter *CE,
-                                    MCAsmBackend *MAB, bool ShowInst) {
-  return new MCAsmStreamer(Context, OS, isVerboseAsm, useLoc, useCFI,
-                           useDwarfDirectory, IP, CE, MAB, ShowInst);
+                                    bool isVerboseAsm, bool useLoc, bool useCFI,
+                                    bool useDwarfDirectory, MCInstPrinter *IP,
+                                    MCCodeEmitter *CE, MCAsmBackend *MAB,
+                                    bool ShowInst) {
+  return new MCAsmStreamer(Context, TargetStreamer, OS, isVerboseAsm, useLoc,
+                           useCFI, useDwarfDirectory, IP, CE, MAB, ShowInst);
 }
diff --git a/lib/MC/MCAtom.cpp b/lib/MC/MCAtom.cpp
index 2626b39..bc353cd 100644
--- a/lib/MC/MCAtom.cpp
+++ b/lib/MC/MCAtom.cpp
@@ -14,6 +14,9 @@
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void MCAtom::anchor() {}
+
 void MCAtom::remap(uint64_t NewBegin, uint64_t NewEnd) {
   Parent->remap(this, NewBegin, NewEnd);
 }
@@ -44,7 +47,7 @@ void MCAtom::remapForSplit(uint64_t SplitPt,
 
 void MCDataAtom::addData(const MCData &D) {
   Data.push_back(D);
-  if (Data.size() > Begin - End)
+  if (Data.size() > End + 1 - Begin)
     remap(Begin, End + 1);
 }
 
@@ -72,8 +75,8 @@ MCDataAtom *MCDataAtom::split(uint64_t SplitPt) {
 // MCTextAtom
 
 void MCTextAtom::addInst(const MCInst &I, uint64_t Size) {
-  if (NextInstAddress > End)
-    remap(Begin, NextInstAddress);
+  if (NextInstAddress + Size - 1 > End)
+    remap(Begin, NextInstAddress + Size - 1);
   Insts.push_back(MCDecodedInst(I, NextInstAddress, Size));
   NextInstAddress += Size;
 }
@@ -106,5 +109,6 @@ MCTextAtom *MCTextAtom::split(uint64_t SplitPt) {
 
   std::copy(I, Insts.end(), std::back_inserter(RightAtom->Insts));
   Insts.erase(I, Insts.end());
+  Parent->splitBasicBlocksForAtom(this, RightAtom);
   return RightAtom;
 }
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 6e4d82b..3b45d16 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -25,12 +25,16 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/SourceMgr.h"
+
+#include <map>
+
 using namespace llvm;
 
-typedef StringMap<const MCSectionMachO*> MachOUniqueMapTy;
-typedef StringMap<const MCSectionELF*> ELFUniqueMapTy;
-typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy;
+typedef std::pair<std::string, std::string> SectionGroupPair;
 
+typedef StringMap<const MCSectionMachO*> MachOUniqueMapTy;
+typedef std::map<SectionGroupPair, const MCSectionELF *> ELFUniqueMapTy;
+typedef std::map<SectionGroupPair, const MCSectionCOFF *> COFFUniqueMapTy;
 
 MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
                      const MCObjectFileInfo *mofi, const SourceMgr *mgr,
@@ -249,8 +253,9 @@ getELFSection(StringRef Section, unsigned Type, unsigned Flags,
   ELFUniqueMapTy &Map = *(ELFUniqueMapTy*)ELFUniquingMap;
 
   // Do the lookup, if we have a hit, return it.
-  StringMapEntry<const MCSectionELF*> &Entry = Map.GetOrCreateValue(Section);
-  if (Entry.getValue()) return Entry.getValue();
+  std::pair<ELFUniqueMapTy::iterator, bool> Entry = Map.insert(
+      std::make_pair(SectionGroupPair(Section, Group), (MCSectionELF *)0));
+  if (!Entry.second) return Entry.first->second;
 
   // Possibly refine the entry size first.
   if (!EntrySize) {
@@ -261,9 +266,9 @@ getELFSection(StringRef Section, unsigned Type, unsigned Flags,
   if (!Group.empty())
     GroupSym = GetOrCreateSymbol(Group);
 
-  MCSectionELF *Result = new (*this) MCSectionELF(Entry.getKey(), Type, Flags,
-                                                  Kind, EntrySize, GroupSym);
-  Entry.setValue(Result);
+  MCSectionELF *Result = new (*this) MCSectionELF(
+      Entry.first->first.first, Type, Flags, Kind, EntrySize, GroupSym);
+  Entry.first->second = Result;
   return Result;
 }
 
@@ -274,32 +279,51 @@ const MCSectionELF *MCContext::CreateELFGroupSection() {
   return Result;
 }
 
-const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
-                                               unsigned Characteristics,
-                                               SectionKind Kind, int Selection,
-                                               const MCSectionCOFF *Assoc) {
+const MCSectionCOFF *
+MCContext::getCOFFSection(StringRef Section, unsigned Characteristics,
+                          SectionKind Kind, StringRef COMDATSymName,
+                          int Selection, const MCSectionCOFF *Assoc) {
   if (COFFUniquingMap == 0)
     COFFUniquingMap = new COFFUniqueMapTy();
   COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)COFFUniquingMap;
 
   // Do the lookup, if we have a hit, return it.
-  StringMapEntry<const MCSectionCOFF*> &Entry = Map.GetOrCreateValue(Section);
-  if (Entry.getValue()) return Entry.getValue();
 
-  MCSectionCOFF *Result = new (*this) MCSectionCOFF(Entry.getKey(),
-                                                    Characteristics,
-                                                    Selection, Assoc, Kind);
+  SectionGroupPair P(Section, COMDATSymName);
+  std::pair<COFFUniqueMapTy::iterator, bool> Entry =
+      Map.insert(std::make_pair(P, (MCSectionCOFF *)0));
+  COFFUniqueMapTy::iterator Iter = Entry.first;
+  if (!Entry.second)
+    return Iter->second;
 
-  Entry.setValue(Result);
+  const MCSymbol *COMDATSymbol = NULL;
+  if (!COMDATSymName.empty())
+    COMDATSymbol = GetOrCreateSymbol(COMDATSymName);
+
+  MCSectionCOFF *Result =
+      new (*this) MCSectionCOFF(Iter->first.first, Characteristics,
+                                COMDATSymbol, Selection, Assoc, Kind);
+
+  Iter->second = Result;
   return Result;
 }
 
+const MCSectionCOFF *
+MCContext::getCOFFSection(StringRef Section, unsigned Characteristics,
+                          SectionKind Kind) {
+  return getCOFFSection(Section, Characteristics, Kind, "", 0);
+}
+
 const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section) {
   if (COFFUniquingMap == 0)
     COFFUniquingMap = new COFFUniqueMapTy();
   COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)COFFUniquingMap;
 
-  return Map.lookup(Section);
+  SectionGroupPair P(Section, "");
+  COFFUniqueMapTy::iterator Iter = Map.find(P);
+  if (Iter == Map.end())
+    return 0;
+  return Iter->second;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp
index ecc7aff..a0066c8 100644
--- a/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/lib/MC/MCDisassembler/Disassembler.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbolizer.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
 
@@ -101,6 +102,7 @@ LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
   if (!DC)
     return 0;
 
+  DC->setCPU(CPU);
   return DC;
 }
 
@@ -143,6 +145,112 @@ public:
 };
 } // end anonymous namespace
 
+/// \brief Emits the comments that are stored in \p DC comment stream.
+/// Each comment in the comment stream must end with a newline.
+static void emitComments(LLVMDisasmContext *DC,
+                         formatted_raw_ostream &FormattedOS) {
+  // Flush the stream before taking its content.
+  DC->CommentStream.flush();
+  StringRef Comments = DC->CommentsToEmit.str();
+  // Get the default information for printing a comment.
+  const MCAsmInfo *MAI = DC->getAsmInfo();
+  const char *CommentBegin = MAI->getCommentString();
+  unsigned CommentColumn = MAI->getCommentColumn();
+  bool IsFirst = true;
+  while (!Comments.empty()) {
+    if (!IsFirst)
+      FormattedOS << '\n';
+    // Emit a line of comments.
+    FormattedOS.PadToColumn(CommentColumn);
+    size_t Position = Comments.find('\n');
+    FormattedOS << CommentBegin << ' ' << Comments.substr(0, Position);
+    // Move after the newline character.
+    Comments = Comments.substr(Position+1);
+    IsFirst = false;
+  }
+  FormattedOS.flush();
+
+  // Tell the comment stream that the vector changed underneath it.
+  DC->CommentsToEmit.clear();
+  DC->CommentStream.resync();
+}
+
+/// \brief Gets latency information for \p Inst form the itinerary
+/// scheduling model, based on \p DC information.
+/// \return The maximum expected latency over all the operands or -1
+/// if no information are available.
+static int getItineraryLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
+  const int NoInformationAvailable = -1;
+
+  // Check if we have a CPU to get the itinerary information.
+  if (DC->getCPU().empty())
+    return NoInformationAvailable;
+
+  // Get itinerary information.
+  const MCSubtargetInfo *STI = DC->getSubtargetInfo();
+  InstrItineraryData IID = STI->getInstrItineraryForCPU(DC->getCPU());
+  // Get the scheduling class of the requested instruction.
+  const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
+  unsigned SCClass = Desc.getSchedClass();
+
+  int Latency = 0;
+  for (unsigned OpIdx = 0, OpIdxEnd = Inst.getNumOperands(); OpIdx != OpIdxEnd;
+       ++OpIdx)
+    Latency = std::max(Latency, IID.getOperandCycle(SCClass, OpIdx));
+
+  return Latency;
+}
+
+/// \brief Gets latency information for \p Inst, based on \p DC information.
+/// \return The maximum expected latency over all the definitions or -1
+/// if no information are available.
+static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
+  // Try to compute scheduling information.
+  const MCSubtargetInfo *STI = DC->getSubtargetInfo();
+  const MCSchedModel *SCModel = STI->getSchedModel();
+  const int NoInformationAvailable = -1;
+
+  // Check if we have a scheduling model for instructions.
+  if (!SCModel || !SCModel->hasInstrSchedModel())
+    // Try to fall back to the itinerary model if we do not have a
+    // scheduling model.
+    return getItineraryLatency(DC, Inst);
+
+  // Get the scheduling class of the requested instruction.
+  const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
+  unsigned SCClass = Desc.getSchedClass();
+  const MCSchedClassDesc *SCDesc = SCModel->getSchedClassDesc(SCClass);
+  // Resolving the variant SchedClass requires an MI to pass to
+  // SubTargetInfo::resolveSchedClass.
+  if (!SCDesc || !SCDesc->isValid() || SCDesc->isVariant())
+    return NoInformationAvailable;
+
+  // Compute output latency.
+  int Latency = 0;
+  for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
+       DefIdx != DefEnd; ++DefIdx) {
+    // Lookup the definition's write latency in SubtargetInfo.
+    const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc,
+                                                                   DefIdx);
+    Latency = std::max(Latency, WLEntry->Cycles);
+  }
+
+  return Latency;
+}
+
+
+/// \brief Emits latency information in DC->CommentStream for \p Inst, based
+/// on the information available in \p DC.
+static void emitLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
+  int Latency = getLatency(DC, Inst);
+
+  // Report only interesting latency.
+  if (Latency < 2)
+    return;
+
+  DC->CommentStream << "Latency: " << Latency << '\n';
+}
+
 //
 // LLVMDisasmInstruction() disassembles a single instruction using the
 // disassembler context specified in the parameter DC.  The bytes of the
@@ -167,8 +275,10 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
   const MCDisassembler *DisAsm = DC->getDisAsm();
   MCInstPrinter *IP = DC->getIP();
   MCDisassembler::DecodeStatus S;
+  SmallVector<char, 64> InsnStr;
+  raw_svector_ostream Annotations(InsnStr);
   S = DisAsm->getInstruction(Inst, Size, MemoryObject, PC,
-                             /*REMOVE*/ nulls(), DC->CommentStream);
+                             /*REMOVE*/ nulls(), Annotations);
   switch (S) {
   case MCDisassembler::Fail:
   case MCDisassembler::SoftFail:
@@ -176,17 +286,18 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
     return 0;
 
   case MCDisassembler::Success: {
-    DC->CommentStream.flush();
-    StringRef Comments = DC->CommentsToEmit.str();
+    Annotations.flush();
+    StringRef AnnotationsStr = Annotations.str();
 
     SmallVector<char, 64> InsnStr;
     raw_svector_ostream OS(InsnStr);
-    IP->printInst(&Inst, OS, Comments);
-    OS.flush();
+    formatted_raw_ostream FormattedOS(OS);
+    IP->printInst(&Inst, FormattedOS, AnnotationsStr);
 
-    // Tell the comment stream that the vector changed underneath it.
-    DC->CommentsToEmit.clear();
-    DC->CommentStream.resync();
+    if (DC->getOptions() & LLVMDisassembler_Option_PrintLatency)
+      emitLatency(DC, Inst);
+
+    emitComments(DC, FormattedOS);
 
     assert(OutStringSize != 0 && "Output buffer cannot be zero size");
     size_t OutputSize = std::min(OutStringSize-1, InsnStr.size());
@@ -208,12 +319,14 @@ int LLVMSetDisasmOptions(LLVMDisasmContextRef DCR, uint64_t Options){
       LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
       MCInstPrinter *IP = DC->getIP();
       IP->setUseMarkup(1);
+      DC->addOptions(LLVMDisassembler_Option_UseMarkup);
       Options &= ~LLVMDisassembler_Option_UseMarkup;
   }
   if (Options & LLVMDisassembler_Option_PrintImmHex){
       LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
       MCInstPrinter *IP = DC->getIP();
       IP->setPrintImmHex(1);
+      DC->addOptions(LLVMDisassembler_Option_PrintImmHex);
       Options &= ~LLVMDisassembler_Option_PrintImmHex;
   }
   if (Options & LLVMDisassembler_Option_AsmPrinterVariant){
@@ -229,8 +342,21 @@ int LLVMSetDisasmOptions(LLVMDisasmContextRef DCR, uint64_t Options){
           AsmPrinterVariant, *MAI, *MII, *MRI, *STI);
       if (IP) {
         DC->setIP(IP);
+        DC->addOptions(LLVMDisassembler_Option_AsmPrinterVariant);
         Options &= ~LLVMDisassembler_Option_AsmPrinterVariant;
       }
   }
+  if (Options & LLVMDisassembler_Option_SetInstrComments) {
+    LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+    MCInstPrinter *IP = DC->getIP();
+    IP->setCommentStream(DC->CommentStream);
+    DC->addOptions(LLVMDisassembler_Option_SetInstrComments);
+    Options &= ~LLVMDisassembler_Option_SetInstrComments;
+  }
+  if (Options & LLVMDisassembler_Option_PrintLatency) {
+    LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+    DC->addOptions(LLVMDisassembler_Option_PrintLatency);
+    Options &= ~LLVMDisassembler_Option_PrintLatency;
+  }
   return (Options == 0);
 }
diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h
index 6eb59d0..4855af2 100644
--- a/lib/MC/MCDisassembler/Disassembler.h
+++ b/lib/MC/MCDisassembler/Disassembler.h
@@ -73,6 +73,10 @@ private:
   llvm::OwningPtr<const llvm::MCDisassembler> DisAsm;
   // The instruction printer for the target architecture.
   llvm::OwningPtr<llvm::MCInstPrinter> IP;
+  // The options used to set up the disassembler.
+  uint64_t Options;
+  // The CPU string.
+  std::string CPU;
 
 public:
   // Comment stream and backing vector.
@@ -90,6 +94,7 @@ public:
                     MCInstPrinter *iP) : TripleName(tripleName),
                     DisInfo(disInfo), TagType(tagType), GetOpInfo(getOpInfo),
                     SymbolLookUp(symbolLookUp), TheTarget(theTarget),
+                    Options(0),
                     CommentStream(CommentsToEmit) {
     MAI.reset(mAI);
     MRI.reset(mRI);
@@ -114,6 +119,10 @@ public:
   const MCSubtargetInfo *getSubtargetInfo() const { return MSI.get(); }
   MCInstPrinter *getIP() { return IP.get(); }
   void setIP(MCInstPrinter *NewIP) { IP.reset(NewIP); }
+  uint64_t getOptions() const { return Options; }
+  void addOptions(uint64_t Options) { this->Options |= Options; }
+  StringRef getCPU() const { return CPU; }
+  void setCPU(const char *CPU) { this->CPU = CPU; }
 };
 
 } // namespace llvm
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 195bbfe..1e5c2e3 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -873,9 +873,7 @@ namespace {
 
     void setSectionStart(const MCSymbol *Label) { SectionStart = Label; }
 
-    /// EmitCompactUnwind - Emit the unwind information in a compact way. If
-    /// we're successful, return 'true'. Otherwise, return 'false' and it will
-    /// emit the normal CIE and FDE.
+    /// EmitCompactUnwind - Emit the unwind information in a compact way.
     void EmitCompactUnwind(MCStreamer &streamer,
                            const MCDwarfFrameInfo &frame);
 
@@ -889,7 +887,7 @@ namespace {
                       const MCSymbol &cieStart,
                       const MCDwarfFrameInfo &frame);
     void EmitCFIInstructions(MCStreamer &streamer,
-                             const std::vector<MCCFIInstruction> &Instrs,
+                             ArrayRef<MCCFIInstruction> Instrs,
                              MCSymbol *BaseLabel);
     void EmitCFIInstruction(MCStreamer &Streamer,
                             const MCCFIInstruction &Instr);
@@ -961,6 +959,10 @@ void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer,
     Streamer.EmitULEB128IntValue(Reg2);
     return;
   }
+  case MCCFIInstruction::OpWindowSave: {
+    Streamer.EmitIntValue(dwarf::DW_CFA_GNU_window_save, 1);
+    return;
+  }
   case MCCFIInstruction::OpUndefined: {
     unsigned Reg = Instr.getRegister();
     if (VerboseAsm) {
@@ -1091,7 +1093,7 @@ void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer,
 /// EmitFrameMoves - Emit frame instructions to describe the layout of the
 /// frame.
 void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer,
-                                    const std::vector<MCCFIInstruction> &Instrs,
+                                           ArrayRef<MCCFIInstruction> Instrs,
                                            MCSymbol *BaseLabel) {
   for (unsigned i = 0, N = Instrs.size(); i < N; ++i) {
     const MCCFIInstruction &Instr = Instrs[i];
@@ -1113,9 +1115,7 @@ void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer,
   }
 }
 
-/// EmitCompactUnwind - Emit the unwind information in a compact way. If we're
-/// successful, return 'true'. Otherwise, return 'false' and it will emit the
-/// normal CIE and FDE.
+/// EmitCompactUnwind - Emit the unwind information in a compact way.
 void FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer,
                                          const MCDwarfFrameInfo &Frame) {
   MCContext &Context = Streamer.getContext();
@@ -1419,9 +1419,10 @@ namespace llvm {
   };
 }
 
-void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer,
-                               bool UsingCFI,
-                               bool IsEH) {
+void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer, MCAsmBackend *MAB,
+                               bool UsingCFI, bool IsEH) {
+  Streamer.generateCompactUnwindEncodings(MAB);
+
   MCContext &Context = Streamer.getContext();
   const MCObjectFileInfo *MOFI = Context.getObjectFileInfo();
   FrameEmitterImpl Emitter(UsingCFI, IsEH);
diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp
index 560cdbc..ebb189e 100644
--- a/lib/MC/MCELF.cpp
+++ b/lib/MC/MCELF.cpp
@@ -36,8 +36,8 @@ unsigned MCELF::GetBinding(const MCSymbolData &SD) {
 void MCELF::SetType(MCSymbolData &SD, unsigned Type) {
   assert(Type == ELF::STT_NOTYPE || Type == ELF::STT_OBJECT ||
          Type == ELF::STT_FUNC || Type == ELF::STT_SECTION ||
-         Type == ELF::STT_FILE || Type == ELF::STT_COMMON ||
-         Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
+         Type == ELF::STT_COMMON || Type == ELF::STT_TLS ||
+         Type == ELF::STT_GNU_IFUNC);
 
   uint32_t OtherFlags = SD.getFlags() & ~(0xf << ELF_STT_Shift);
   SD.setFlags(OtherFlags | (Type << ELF_STT_Shift));
@@ -47,8 +47,7 @@ unsigned MCELF::GetType(const MCSymbolData &SD) {
   uint32_t Type = (SD.getFlags() & (0xf << ELF_STT_Shift)) >> ELF_STT_Shift;
   assert(Type == ELF::STT_NOTYPE || Type == ELF::STT_OBJECT ||
          Type == ELF::STT_FUNC || Type == ELF::STT_SECTION ||
-         Type == ELF::STT_FILE || Type == ELF::STT_COMMON ||
-         Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
+         Type == ELF::STT_COMMON || Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
   return Type;
 }
 
diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp
index ec7397d..0c39e4a 100644
--- a/lib/MC/MCELFObjectTargetWriter.cpp
+++ b/lib/MC/MCELFObjectTargetWriter.cpp
@@ -42,9 +42,9 @@ const MCSymbol *MCELFObjectTargetWriter::undefinedExplicitRelSym(const MCValue &
 // ELF doesn't require relocations to be in any order. We sort by the r_offset,
 // just to match gnu as for easier comparison. The use type and index is an
 // arbitrary way of making the sort deterministic.
-static int cmpRel(const void *AP, const void *BP) {
-  const ELFRelocationEntry &A = *(const ELFRelocationEntry *)AP;
-  const ELFRelocationEntry &B = *(const ELFRelocationEntry *)BP;
+static int cmpRel(const ELFRelocationEntry *AP, const ELFRelocationEntry *BP) {
+  const ELFRelocationEntry &A = *AP;
+  const ELFRelocationEntry &B = *BP;
   if (A.r_offset != B.r_offset)
     return B.r_offset - A.r_offset;
   if (B.Type != A.Type)
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 6e5ff50..e806cb9 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELF.h"
@@ -96,6 +97,9 @@ void MCELFStreamer::EmitDebugLabel(MCSymbol *Symbol) {
 }
 
 void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  // Let the target do whatever target specific stuff it needs to do.
+  getAssembler().getBackend().handleAssemblerFlag(Flag);
+  // Do any generic stuff we need to do.
   switch (Flag) {
   case MCAF_SyntaxUnified: return; // no-op here.
   case MCAF_Code16: return; // Change parsing mode; no-op here.
@@ -148,8 +152,8 @@ static unsigned CombineSymbolTypes(unsigned T1, unsigned T2) {
   return T2;
 }
 
-void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
-                                          MCSymbolAttr Attribute) {
+bool MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+                                        MCSymbolAttr Attribute) {
   // Indirect symbols are handled differently, to match how 'as' handles
   // them. This makes writing matching .o files easier.
   if (Attribute == MCSA_IndirectSymbol) {
@@ -159,7 +163,7 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     ISD.Symbol = Symbol;
     ISD.SectionData = getCurrentSectionData();
     getAssembler().getIndirectSymbols().push_back(ISD);
-    return;
+    return true;
   }
 
   // Adding a symbol attribute always introduces the symbol, note that an
@@ -182,7 +186,7 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
   case MCSA_WeakDefAutoPrivate:
   case MCSA_Invalid:
   case MCSA_IndirectSymbol:
-    llvm_unreachable("Invalid symbol attribute for ELF!");
+    return false;
 
   case MCSA_NoDeadStrip:
   case MCSA_ELF_TypeGnuUniqueObject:
@@ -251,6 +255,8 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     MCELF::SetVisibility(SD, ELF::STV_INTERNAL);
     break;
   }
+
+  return true;
 }
 
 void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
@@ -270,7 +276,8 @@ void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                                          ELF::SHF_WRITE |
                                                          ELF::SHF_ALLOC,
                                                          SectionKind::getBSS());
-    Symbol->setSection(*Section);
+
+    AssignSection(Symbol, Section);
 
     struct LocalCommon L = {&SD, Size, ByteAlignment};
     LocalCommons.push_back(L);
@@ -313,20 +320,29 @@ void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
                                          ValueSize, MaxBytesToEmit);
 }
 
-
-// Add a symbol for the file name of this module. This is the second
-// entry in the module's symbol table (the first being the null symbol).
+// Add a symbol for the file name of this module. They start after the
+// null symbol and don't count as normal symbol, i.e. a non-STT_FILE symbol
+// with the same name may appear.
 void MCELFStreamer::EmitFileDirective(StringRef Filename) {
-  MCSymbol *Symbol = getAssembler().getContext().GetOrCreateSymbol(Filename);
-  Symbol->setSection(*getCurrentSection().first);
-  Symbol->setAbsolute();
-
-  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
-
-  SD.setFlags(ELF_STT_File | ELF_STB_Local | ELF_STV_Default);
+  getAssembler().addFileName(Filename);
+}
+
+void MCELFStreamer::EmitIdent(StringRef IdentString) {
+  const MCSection *Comment = getAssembler().getContext().getELFSection(
+      ".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS,
+      SectionKind::getReadOnly(), 1, "");
+  PushSection();
+  SwitchSection(Comment);
+  if (!SeenIdent) {
+    EmitIntValue(0, 1);
+    SeenIdent = true;
+  }
+  EmitBytes(IdentString);
+  EmitIntValue(0, 1);
+  PopSection();
 }
 
-void  MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
+void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
   switch (expr->getKind()) {
   case MCExpr::Target:
     cast<MCTargetExpr>(expr)->fixELFSymbolsInTLSFixups(getAssembler());
@@ -525,9 +541,7 @@ void MCELFStreamer::EmitBundleUnlock() {
   SD->setBundleLockState(MCSectionData::NotBundleLocked);
 }
 
-void MCELFStreamer::FinishImpl() {
-  EmitFrames(true);
-
+void MCELFStreamer::Flush() {
   for (std::vector<LocalCommon>::const_iterator i = LocalCommons.begin(),
                                                 e = LocalCommons.end();
        i != e; ++i) {
@@ -548,17 +562,23 @@ void MCELFStreamer::FinishImpl() {
       SectData.setAlignment(ByteAlignment);
   }
 
-  this->MCObjectStreamer::FinishImpl();
+  LocalCommons.clear();
 }
-void MCELFStreamer::EmitTCEntry(const MCSymbol &S) {
-  // Creates a R_PPC64_TOC relocation
-  MCObjectStreamer::EmitSymbolValue(&S, 8);
+
+void MCELFStreamer::FinishImpl() {
+  EmitFrames(NULL, true);
+
+  Flush();
+
+  this->MCObjectStreamer::FinishImpl();
 }
 
-MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
-                                    raw_ostream &OS, MCCodeEmitter *CE,
-                                    bool RelaxAll, bool NoExecStack) {
-  MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
+MCStreamer *llvm::createELFStreamer(MCContext &Context,
+                                    MCTargetStreamer *Streamer,
+                                    MCAsmBackend &MAB, raw_ostream &OS,
+                                    MCCodeEmitter *CE, bool RelaxAll,
+                                    bool NoExecStack) {
+  MCELFStreamer *S = new MCELFStreamer(Context, Streamer, MAB, OS, CE);
   if (RelaxAll)
     S->getAssembler().setRelaxAll(true);
   if (NoExecStack)
diff --git a/lib/MC/MCExternalSymbolizer.cpp b/lib/MC/MCExternalSymbolizer.cpp
index 47ef6c4..ca368b2 100644
--- a/lib/MC/MCExternalSymbolizer.cpp
+++ b/lib/MC/MCExternalSymbolizer.cpp
@@ -63,6 +63,8 @@ bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI,
     }
     if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
       cStream << "symbol stub for: " << ReferenceName;
+    else if(ReferenceType == LLVMDisassembler_ReferenceType_Out_Objc_Message)
+      cStream << "Objc message: " << ReferenceName;
     if (!Name && !IsBranch)
       return false;
   }
@@ -132,6 +134,8 @@ bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI,
 // literal pool's entry if the referenced address is that of a symbol. Or it
 // will return a pointer to a literal 'C' string if the referenced address of
 // the literal pool's entry is an address into a section with C string literals.
+// Or if the reference is to an Objective-C data structure it will return a
+// specific reference type for it and a string.
 void MCExternalSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
                                                            int64_t Value,
                                                            uint64_t Address) {
@@ -139,9 +143,26 @@ void MCExternalSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
     uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
     const char *ReferenceName;
     (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
-    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr ||
-       ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
-      cStream << "literal pool for: " << ReferenceName;
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
+      cStream << "literal pool symbol address: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+      cStream << "literal pool for: \"" << ReferenceName << "\"";
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
+      cStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Message)
+      cStream << "Objc message: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
+      cStream << "Objc message ref: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
+      cStream << "Objc selector ref: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
+      cStream << "Objc class ref: " << ReferenceName;
   }
 }
 
diff --git a/lib/MC/MCFunction.cpp b/lib/MC/MCFunction.cpp
index 2665d3e..767e1e0 100644
--- a/lib/MC/MCFunction.cpp
+++ b/lib/MC/MCFunction.cpp
@@ -9,15 +9,15 @@
 
 #include "llvm/MC/MCFunction.h"
 #include "llvm/MC/MCAtom.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCModule.h"
 #include <algorithm>
 
 using namespace llvm;
 
 // MCFunction
 
-MCFunction::MCFunction(StringRef Name)
-  : Name(Name)
+MCFunction::MCFunction(StringRef Name, MCModule *Parent)
+  : Name(Name), ParentModule(Parent)
 {}
 
 MCFunction::~MCFunction() {
@@ -26,18 +26,32 @@ MCFunction::~MCFunction() {
 }
 
 MCBasicBlock &MCFunction::createBlock(const MCTextAtom &TA) {
-  Blocks.push_back(new MCBasicBlock(TA, this));
-  return *Blocks.back();
+  MCBasicBlock *MCBB = new MCBasicBlock(TA, this);
+  Blocks.push_back(MCBB);
+  return *MCBB;
+}
+
+MCBasicBlock *MCFunction::find(uint64_t StartAddr) {
+  for (const_iterator I = begin(), E = end(); I != E; ++I)
+    if ((*I)->getInsts()->getBeginAddr() == StartAddr)
+      return *I;
+  return 0;
+}
+
+const MCBasicBlock *MCFunction::find(uint64_t StartAddr) const {
+  return const_cast<MCFunction *>(this)->find(StartAddr);
 }
 
 // MCBasicBlock
 
 MCBasicBlock::MCBasicBlock(const MCTextAtom &Insts, MCFunction *Parent)
-  : Insts(&Insts), Parent(Parent)
-{}
+  : Insts(&Insts), Parent(Parent) {
+  getParent()->getParent()->trackBBForAtom(&Insts, this);
+}
 
 void MCBasicBlock::addSuccessor(const MCBasicBlock *MCBB) {
-  Successors.push_back(MCBB);
+  if (!isSuccessor(MCBB))
+    Successors.push_back(MCBB);
 }
 
 bool MCBasicBlock::isSuccessor(const MCBasicBlock *MCBB) const {
@@ -46,10 +60,22 @@ bool MCBasicBlock::isSuccessor(const MCBasicBlock *MCBB) const {
 }
 
 void MCBasicBlock::addPredecessor(const MCBasicBlock *MCBB) {
-  Predecessors.push_back(MCBB);
+  if (!isPredecessor(MCBB))
+    Predecessors.push_back(MCBB);
 }
 
 bool MCBasicBlock::isPredecessor(const MCBasicBlock *MCBB) const {
   return std::find(Predecessors.begin(), Predecessors.end(),
                    MCBB) != Predecessors.end();
 }
+
+void MCBasicBlock::splitBasicBlock(MCBasicBlock *SplitBB) {
+  assert(Insts->getEndAddr() + 1 == SplitBB->Insts->getBeginAddr() &&
+         "Splitting unrelated basic blocks!");
+  SplitBB->addPredecessor(this);
+  assert(SplitBB->Successors.empty() &&
+         "Split basic block shouldn't already have successors!");
+  SplitBB->Successors = Successors;
+  Successors.clear();
+  addSuccessor(SplitBB);
+}
diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp
index 6a452c8..ba71245 100644
--- a/lib/MC/MCInstPrinter.cpp
+++ b/lib/MC/MCInstPrinter.cpp
@@ -31,9 +31,13 @@ void MCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
 
 void MCInstPrinter::printAnnotation(raw_ostream &OS, StringRef Annot) {
   if (!Annot.empty()) {
-    if (CommentStream)
+    if (CommentStream) {
       (*CommentStream) << Annot;
-    else
+      // By definition (see MCInstPrinter.h), CommentStream must end with
+      // a newline after each comment.
+      if (Annot.back() != '\n')
+        (*CommentStream) << '\n';
+    } else
       OS << " " << MAI.getCommentString() << " " << Annot;
   }
 }
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index 0729b7a..2924dcd 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -1,3 +1,4 @@
+//===-- MCMachOStreamer.cpp - MachO Streamer ------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -36,7 +37,7 @@ private:
 public:
   MCMachOStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS,
                   MCCodeEmitter *Emitter)
-      : MCObjectStreamer(SK_MachOStreamer, Context, MAB, OS, Emitter) {}
+      : MCObjectStreamer(Context, 0, MAB, OS, Emitter) {}
 
   /// @name MCStreamer Interface
   /// @{
@@ -51,7 +52,7 @@ public:
   virtual void EmitLinkerOptions(ArrayRef<std::string> Options);
   virtual void EmitDataRegion(MCDataRegionType Kind);
   virtual void EmitThumbFunc(MCSymbol *Func);
-  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+  virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
   virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
   virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                 unsigned ByteAlignment);
@@ -81,16 +82,14 @@ public:
     // FIXME: Just ignore the .file; it isn't important enough to fail the
     // entire assembly.
 
-    //report_fatal_error("unsupported directive: '.file'");
+    // report_fatal_error("unsupported directive: '.file'");
   }
 
-  virtual void FinishImpl();
-
-  /// @}
-
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_MachOStreamer;
+  virtual void EmitIdent(StringRef IdentString) {
+    llvm_unreachable("macho doesn't support this directive");
   }
+
+  virtual void FinishImpl();
 };
 
 } // end anonymous namespace.
@@ -122,7 +121,7 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
 
   // isSymbolLinkerVisible uses the section.
-  Symbol->setSection(*getCurrentSection().first);
+  AssignSection(Symbol, getCurrentSection().first);
   // We have to create a new fragment if this is an atom defining symbol,
   // fragments cannot span atoms.
   if (getAssembler().isSymbolLinkerVisible(*Symbol))
@@ -217,7 +216,7 @@ void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) {
   SD.setFlags(SD.getFlags() | SF_ThumbFunc);
 }
 
-void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
                                           MCSymbolAttr Attribute) {
   // Indirect symbols are handled differently, to match how 'as' handles
   // them. This makes writing matching .o files easier.
@@ -228,7 +227,7 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     ISD.Symbol = Symbol;
     ISD.SectionData = getCurrentSectionData();
     getAssembler().getIndirectSymbols().push_back(ISD);
-    return;
+    return true;
   }
 
   // Adding a symbol attribute always introduces the symbol, note that an
@@ -257,7 +256,7 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
   case MCSA_Protected:
   case MCSA_Weak:
   case MCSA_Local:
-    llvm_unreachable("Invalid symbol attribute for Mach-O!");
+    return false;
 
   case MCSA_Global:
     SD.setExternal(true);
@@ -309,6 +308,8 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     SD.setFlags(SD.getFlags() | SF_WeakDefinition | SF_WeakReference);
     break;
   }
+
+  return true;
 }
 
 void MCMachOStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
@@ -324,6 +325,8 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
   // FIXME: Darwin 'as' does appear to allow redef of a .comm by itself.
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
 
+  AssignSection(Symbol, NULL);
+
   MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
   SD.setExternal(true);
   SD.setCommon(Size, ByteAlignment);
@@ -360,7 +363,7 @@ void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
   MCFragment *F = new MCFillFragment(0, 0, Size, &SectData);
   SD.setFragment(F);
 
-  Symbol->setSection(*Section);
+  AssignSection(Symbol, Section);
 
   // Update the maximum alignment on the zero fill section if necessary.
   if (ByteAlignment > SectData.getAlignment())
@@ -393,7 +396,7 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst) {
 }
 
 void MCMachOStreamer::FinishImpl() {
-  EmitFrames(true);
+  EmitFrames(&getAssembler().getBackend(), true);
 
   // We have to set the fragment atom associations so we can relax properly for
   // Mach-O.
diff --git a/lib/MC/MCModule.cpp b/lib/MC/MCModule.cpp
index 5890b4b..7e9e18a 100644
--- a/lib/MC/MCModule.cpp
+++ b/lib/MC/MCModule.cpp
@@ -18,6 +18,10 @@ static bool AtomComp(const MCAtom *L, uint64_t Addr) {
   return L->getEndAddr() < Addr;
 }
 
+static bool AtomCompInv(uint64_t Addr, const MCAtom *R) {
+  return Addr < R->getEndAddr();
+}
+
 void MCModule::map(MCAtom *NewAtom) {
   uint64_t Begin = NewAtom->Begin;
 
@@ -54,9 +58,13 @@ void MCModule::remap(MCAtom *Atom, uint64_t NewBegin, uint64_t NewEnd) {
   assert(*I == Atom && "Previous atom mapping was invalid!");
   Atoms.erase(I);
 
+  // FIXME: special case NewBegin == Atom->Begin
+
   // Insert the new mapping.
   AtomListTy::iterator NewI = std::lower_bound(atom_begin(), atom_end(),
                                                NewBegin, AtomComp);
+  assert((NewI == atom_end() || (*NewI)->getBeginAddr() > Atom->End)
+         && "Offset range already occupied!");
   Atoms.insert(NewI, Atom);
 
   // Update the atom internal bounds.
@@ -73,18 +81,55 @@ const MCAtom *MCModule::findAtomContaining(uint64_t Addr) const {
 }
 
 MCAtom *MCModule::findAtomContaining(uint64_t Addr) {
-  AtomListTy::iterator I = std::lower_bound(atom_begin(), atom_end(),
-                                            Addr, AtomComp);
-  if (I != atom_end() && (*I)->getBeginAddr() <= Addr)
+  return const_cast<MCAtom*>(
+    const_cast<const MCModule *>(this)->findAtomContaining(Addr));
+}
+
+const MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) const {
+  AtomListTy::const_iterator I = std::upper_bound(atom_begin(), atom_end(),
+                                                  Addr, AtomCompInv);
+  if (I != atom_end())
     return *I;
   return 0;
 }
 
-MCFunction *MCModule::createFunction(const StringRef &Name) {
-  Functions.push_back(new MCFunction(Name));
+MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) {
+  return const_cast<MCAtom*>(
+    const_cast<const MCModule *>(this)->findFirstAtomAfter(Addr));
+}
+
+MCFunction *MCModule::createFunction(StringRef Name) {
+  Functions.push_back(new MCFunction(Name, this));
   return Functions.back();
 }
 
+static bool CompBBToAtom(MCBasicBlock *BB, const MCTextAtom *Atom) {
+  return BB->getInsts() < Atom;
+}
+
+void MCModule::splitBasicBlocksForAtom(const MCTextAtom *TA,
+                                       const MCTextAtom *NewTA) {
+  BBsByAtomTy::iterator
+    I = std::lower_bound(BBsByAtom.begin(), BBsByAtom.end(),
+                         TA, CompBBToAtom);
+  for (; I != BBsByAtom.end() && (*I)->getInsts() == TA; ++I) {
+    MCBasicBlock *BB = *I;
+    MCBasicBlock *NewBB = &BB->getParent()->createBlock(*NewTA);
+    BB->splitBasicBlock(NewBB);
+  }
+}
+
+void MCModule::trackBBForAtom(const MCTextAtom *Atom, MCBasicBlock *BB) {
+  assert(Atom == BB->getInsts() && "Text atom doesn't back the basic block!");
+  BBsByAtomTy::iterator I = std::lower_bound(BBsByAtom.begin(),
+                                             BBsByAtom.end(),
+                                             Atom, CompBBToAtom);
+  for (; I != BBsByAtom.end() && (*I)->getInsts() == Atom; ++I)
+    if (*I == BB)
+      return;
+  BBsByAtom.insert(I, BB);
+}
+
 MCModule::~MCModule() {
   for (AtomListTy::iterator AI = atom_begin(),
                             AE = atom_end();
diff --git a/lib/MC/MCModuleYAML.cpp b/lib/MC/MCModuleYAML.cpp
new file mode 100644
index 0000000..e2de578
--- /dev/null
+++ b/lib/MC/MCModuleYAML.cpp
@@ -0,0 +1,461 @@
+//===- MCModuleYAML.cpp - MCModule YAMLIO implementation ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of MCModule.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCModuleYAML.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCFunction.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Object/YAML.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <vector>
+
+namespace llvm {
+
+namespace {
+
+// This class is used to map opcode and register names to enum values.
+//
+// There are at least 3 obvious ways to do this:
+// 1- Generate an MII/MRI method using a tablegen StringMatcher
+// 2- Write an MII/MRI method using std::lower_bound and the assumption that
+//    the enums are sorted (starting at a fixed value).
+// 3- Do the matching manually as is done here.
+//
+// Why 3?
+// 1- A StringMatcher function for thousands of entries would incur
+//    a non-negligible binary size overhead.
+// 2- The lower_bound comparators would be somewhat involved and aren't
+//    obviously reusable (see LessRecordRegister in llvm/TableGen/Record.h)
+// 3- This isn't actually something useful outside tests (but the same argument
+//    can be made against having {MII,MRI}::getName).
+//
+// If this becomes useful outside this specific situation, feel free to do
+// the Right Thing (tm) and move the functionality to MII/MRI.
+//
+class InstrRegInfoHolder {
+  typedef StringMap<unsigned, BumpPtrAllocator> EnumValByNameTy;
+  EnumValByNameTy InstEnumValueByName;
+  EnumValByNameTy RegEnumValueByName;
+
+public:
+  const MCInstrInfo &MII;
+  const MCRegisterInfo &MRI;
+  InstrRegInfoHolder(const MCInstrInfo &MII, const MCRegisterInfo &MRI)
+      : InstEnumValueByName(NextPowerOf2(MII.getNumOpcodes())),
+        RegEnumValueByName(NextPowerOf2(MRI.getNumRegs())), MII(MII), MRI(MRI) {
+    for (int i = 0, e = MII.getNumOpcodes(); i != e; ++i)
+      InstEnumValueByName[MII.getName(i)] = i;
+    for (int i = 0, e = MRI.getNumRegs(); i != e; ++i)
+      RegEnumValueByName[MRI.getName(i)] = i;
+  }
+
+  bool matchRegister(StringRef Name, unsigned &Reg) {
+    EnumValByNameTy::const_iterator It = RegEnumValueByName.find(Name);
+    if (It == RegEnumValueByName.end())
+      return false;
+    Reg = It->getValue();
+    return true;
+  }
+  bool matchOpcode(StringRef Name, unsigned &Opc) {
+    EnumValByNameTy::const_iterator It = InstEnumValueByName.find(Name);
+    if (It == InstEnumValueByName.end())
+      return false;
+    Opc = It->getValue();
+    return true;
+  }
+};
+
+} // end unnamed namespace
+
+namespace MCModuleYAML {
+
+LLVM_YAML_STRONG_TYPEDEF(unsigned, OpcodeEnum)
+
+struct Operand {
+  MCOperand MCOp;
+};
+
+struct Inst {
+  OpcodeEnum Opcode;
+  std::vector<Operand> Operands;
+  uint64_t Size;
+};
+
+struct Atom {
+  MCAtom::AtomKind Type;
+  yaml::Hex64 StartAddress;
+  uint64_t Size;
+
+  std::vector<Inst> Insts;
+  object::yaml::BinaryRef Data;
+};
+
+struct BasicBlock {
+  yaml::Hex64 Address;
+  std::vector<yaml::Hex64> Preds;
+  std::vector<yaml::Hex64> Succs;
+};
+
+struct Function {
+  StringRef Name;
+  std::vector<BasicBlock> BasicBlocks;
+};
+
+struct Module {
+  std::vector<Atom> Atoms;
+  std::vector<Function> Functions;
+};
+
+} // end namespace MCModuleYAML
+} // end namespace llvm
+
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::Hex64)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::MCModuleYAML::Operand)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Inst)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Atom)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::BasicBlock)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Function)
+
+namespace llvm {
+
+namespace yaml {
+
+template <> struct ScalarEnumerationTraits<MCAtom::AtomKind> {
+  static void enumeration(IO &IO, MCAtom::AtomKind &Kind);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Atom> {
+  static void mapping(IO &IO, MCModuleYAML::Atom &A);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Inst> {
+  static void mapping(IO &IO, MCModuleYAML::Inst &I);
+};
+
+template <> struct MappingTraits<MCModuleYAML::BasicBlock> {
+  static void mapping(IO &IO, MCModuleYAML::BasicBlock &BB);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Function> {
+  static void mapping(IO &IO, MCModuleYAML::Function &Fn);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Module> {
+  static void mapping(IO &IO, MCModuleYAML::Module &M);
+};
+
+template <> struct ScalarTraits<MCModuleYAML::Operand> {
+  static void output(const MCModuleYAML::Operand &, void *,
+                     llvm::raw_ostream &);
+  static StringRef input(StringRef, void *, MCModuleYAML::Operand &);
+};
+
+template <> struct ScalarTraits<MCModuleYAML::OpcodeEnum> {
+  static void output(const MCModuleYAML::OpcodeEnum &, void *,
+                     llvm::raw_ostream &);
+  static StringRef input(StringRef, void *, MCModuleYAML::OpcodeEnum &);
+};
+
+void ScalarEnumerationTraits<MCAtom::AtomKind>::enumeration(
+    IO &IO, MCAtom::AtomKind &Value) {
+  IO.enumCase(Value, "Text", MCAtom::TextAtom);
+  IO.enumCase(Value, "Data", MCAtom::DataAtom);
+}
+
+void MappingTraits<MCModuleYAML::Atom>::mapping(IO &IO, MCModuleYAML::Atom &A) {
+  IO.mapRequired("StartAddress", A.StartAddress);
+  IO.mapRequired("Size", A.Size);
+  IO.mapRequired("Type", A.Type);
+  if (A.Type == MCAtom::TextAtom)
+    IO.mapRequired("Content", A.Insts);
+  else if (A.Type == MCAtom::DataAtom)
+    IO.mapRequired("Content", A.Data);
+}
+
+void MappingTraits<MCModuleYAML::Inst>::mapping(IO &IO, MCModuleYAML::Inst &I) {
+  IO.mapRequired("Inst", I.Opcode);
+  IO.mapRequired("Size", I.Size);
+  IO.mapRequired("Ops", I.Operands);
+}
+
+void
+MappingTraits<MCModuleYAML::BasicBlock>::mapping(IO &IO,
+                                                 MCModuleYAML::BasicBlock &BB) {
+  IO.mapRequired("Address", BB.Address);
+  IO.mapRequired("Preds", BB.Preds);
+  IO.mapRequired("Succs", BB.Succs);
+}
+
+void MappingTraits<MCModuleYAML::Function>::mapping(IO &IO,
+                                                    MCModuleYAML::Function &F) {
+  IO.mapRequired("Name", F.Name);
+  IO.mapRequired("BasicBlocks", F.BasicBlocks);
+}
+
+void MappingTraits<MCModuleYAML::Module>::mapping(IO &IO,
+                                                  MCModuleYAML::Module &M) {
+  IO.mapRequired("Atoms", M.Atoms);
+  IO.mapOptional("Functions", M.Functions);
+}
+
+void
+ScalarTraits<MCModuleYAML::Operand>::output(const MCModuleYAML::Operand &Val,
+                                            void *Ctx, raw_ostream &Out) {
+  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+
+  // FIXME: Doesn't support FPImm and expr/inst, but do these make sense?
+  if (Val.MCOp.isImm())
+    Out << "I" << Val.MCOp.getImm();
+  else if (Val.MCOp.isReg())
+    Out << "R" << IRI->MRI.getName(Val.MCOp.getReg());
+  else
+    llvm_unreachable("Trying to output invalid MCOperand!");
+}
+
+StringRef
+ScalarTraits<MCModuleYAML::Operand>::input(StringRef Scalar, void *Ctx,
+                                           MCModuleYAML::Operand &Val) {
+  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+  char Type = 0;
+  if (Scalar.size() >= 1)
+    Type = Scalar.front();
+  if (Type != 'R' && Type != 'I')
+    return "Operand must start with 'R' (register) or 'I' (immediate).";
+  if (Type == 'R') {
+    unsigned Reg;
+    if (!IRI->matchRegister(Scalar.substr(1), Reg))
+      return "Invalid register name.";
+    Val.MCOp = MCOperand::CreateReg(Reg);
+  } else if (Type == 'I') {
+    int64_t RIVal;
+    if (Scalar.substr(1).getAsInteger(10, RIVal))
+      return "Invalid immediate value.";
+    Val.MCOp = MCOperand::CreateImm(RIVal);
+  } else {
+    Val.MCOp = MCOperand();
+  }
+  return StringRef();
+}
+
+void ScalarTraits<MCModuleYAML::OpcodeEnum>::output(
+    const MCModuleYAML::OpcodeEnum &Val, void *Ctx, raw_ostream &Out) {
+  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+  Out << IRI->MII.getName(Val);
+}
+
+StringRef
+ScalarTraits<MCModuleYAML::OpcodeEnum>::input(StringRef Scalar, void *Ctx,
+                                              MCModuleYAML::OpcodeEnum &Val) {
+  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+  unsigned Opc;
+  if (!IRI->matchOpcode(Scalar, Opc))
+    return "Invalid instruction opcode.";
+  Val = Opc;
+  return "";
+}
+
+} // end namespace yaml
+
+namespace {
+
+class MCModule2YAML {
+  const MCModule &MCM;
+  MCModuleYAML::Module YAMLModule;
+  void dumpAtom(const MCAtom *MCA);
+  void dumpFunction(const MCFunction *MCF);
+  void dumpBasicBlock(const MCBasicBlock *MCBB);
+
+public:
+  MCModule2YAML(const MCModule &MCM);
+  MCModuleYAML::Module &getYAMLModule();
+};
+
+class YAML2MCModule {
+  MCModule &MCM;
+
+public:
+  YAML2MCModule(MCModule &MCM);
+  StringRef parse(const MCModuleYAML::Module &YAMLModule);
+};
+
+} // end unnamed namespace
+
+MCModule2YAML::MCModule2YAML(const MCModule &MCM) : MCM(MCM), YAMLModule() {
+  for (MCModule::const_atom_iterator AI = MCM.atom_begin(), AE = MCM.atom_end();
+       AI != AE; ++AI)
+    dumpAtom(*AI);
+  for (MCModule::const_func_iterator FI = MCM.func_begin(), FE = MCM.func_end();
+       FI != FE; ++FI)
+    dumpFunction(*FI);
+}
+
+void MCModule2YAML::dumpAtom(const MCAtom *MCA) {
+  YAMLModule.Atoms.resize(YAMLModule.Atoms.size() + 1);
+  MCModuleYAML::Atom &A = YAMLModule.Atoms.back();
+  A.Type = MCA->getKind();
+  A.StartAddress = MCA->getBeginAddr();
+  A.Size = MCA->getEndAddr() - MCA->getBeginAddr() + 1;
+  if (const MCTextAtom *TA = dyn_cast<MCTextAtom>(MCA)) {
+    const size_t InstCount = TA->size();
+    A.Insts.resize(InstCount);
+    for (size_t i = 0; i != InstCount; ++i) {
+      const MCDecodedInst &MCDI = TA->at(i);
+      A.Insts[i].Opcode = MCDI.Inst.getOpcode();
+      A.Insts[i].Size = MCDI.Size;
+      const unsigned OpCount = MCDI.Inst.getNumOperands();
+      A.Insts[i].Operands.resize(OpCount);
+      for (unsigned oi = 0; oi != OpCount; ++oi)
+        A.Insts[i].Operands[oi].MCOp = MCDI.Inst.getOperand(oi);
+    }
+  } else if (const MCDataAtom *DA = dyn_cast<MCDataAtom>(MCA)) {
+    A.Data = DA->getData();
+  } else {
+    llvm_unreachable("Unknown atom type.");
+  }
+}
+
+void MCModule2YAML::dumpFunction(const MCFunction *MCF) {
+  YAMLModule.Functions.resize(YAMLModule.Functions.size() + 1);
+  MCModuleYAML::Function &F = YAMLModule.Functions.back();
+  F.Name = MCF->getName();
+  for (MCFunction::const_iterator BBI = MCF->begin(), BBE = MCF->end();
+       BBI != BBE; ++BBI) {
+    const MCBasicBlock *MCBB = *BBI;
+    F.BasicBlocks.resize(F.BasicBlocks.size() + 1);
+    MCModuleYAML::BasicBlock &BB = F.BasicBlocks.back();
+    BB.Address = MCBB->getInsts()->getBeginAddr();
+    for (MCBasicBlock::pred_const_iterator PI = MCBB->pred_begin(),
+                                           PE = MCBB->pred_end();
+         PI != PE; ++PI)
+      BB.Preds.push_back((*PI)->getInsts()->getBeginAddr());
+    for (MCBasicBlock::succ_const_iterator SI = MCBB->succ_begin(),
+                                           SE = MCBB->succ_end();
+         SI != SE; ++SI)
+      BB.Succs.push_back((*SI)->getInsts()->getBeginAddr());
+  }
+}
+
+MCModuleYAML::Module &MCModule2YAML::getYAMLModule() { return YAMLModule; }
+
+YAML2MCModule::YAML2MCModule(MCModule &MCM) : MCM(MCM) {}
+
+StringRef YAML2MCModule::parse(const MCModuleYAML::Module &YAMLModule) {
+  typedef std::vector<MCModuleYAML::Atom>::const_iterator AtomIt;
+  typedef std::vector<MCModuleYAML::Inst>::const_iterator InstIt;
+  typedef std::vector<MCModuleYAML::Operand>::const_iterator OpIt;
+
+  typedef DenseMap<uint64_t, MCTextAtom *> AddrToTextAtomTy;
+  AddrToTextAtomTy TAByAddr;
+
+  for (AtomIt AI = YAMLModule.Atoms.begin(), AE = YAMLModule.Atoms.end();
+       AI != AE; ++AI) {
+    uint64_t StartAddress = AI->StartAddress;
+    if (AI->Size == 0)
+      return "Atoms can't be empty!";
+    uint64_t EndAddress = StartAddress + AI->Size - 1;
+    switch (AI->Type) {
+    case MCAtom::TextAtom: {
+      MCTextAtom *TA = MCM.createTextAtom(StartAddress, EndAddress);
+      TAByAddr[StartAddress] = TA;
+      for (InstIt II = AI->Insts.begin(), IE = AI->Insts.end(); II != IE;
+           ++II) {
+        MCInst MI;
+        MI.setOpcode(II->Opcode);
+        for (OpIt OI = II->Operands.begin(), OE = II->Operands.end(); OI != OE;
+             ++OI)
+          MI.addOperand(OI->MCOp);
+        TA->addInst(MI, II->Size);
+      }
+      break;
+    }
+    case MCAtom::DataAtom: {
+      MCDataAtom *DA = MCM.createDataAtom(StartAddress, EndAddress);
+      SmallVector<char, 64> Data;
+      raw_svector_ostream OS(Data);
+      AI->Data.writeAsBinary(OS);
+      OS.flush();
+      for (size_t i = 0, e = Data.size(); i != e; ++i)
+        DA->addData((uint8_t)Data[i]);
+      break;
+    }
+    }
+  }
+
+  typedef std::vector<MCModuleYAML::Function>::const_iterator FuncIt;
+  typedef std::vector<MCModuleYAML::BasicBlock>::const_iterator BBIt;
+  typedef std::vector<yaml::Hex64>::const_iterator AddrIt;
+  for (FuncIt FI = YAMLModule.Functions.begin(),
+              FE = YAMLModule.Functions.end();
+       FI != FE; ++FI) {
+    MCFunction *MCFN = MCM.createFunction(FI->Name);
+    for (BBIt BBI = FI->BasicBlocks.begin(), BBE = FI->BasicBlocks.end();
+         BBI != BBE; ++BBI) {
+      AddrToTextAtomTy::const_iterator It = TAByAddr.find(BBI->Address);
+      if (It == TAByAddr.end())
+        return "Basic block start address doesn't match any text atom!";
+      MCFN->createBlock(*It->second);
+    }
+    for (BBIt BBI = FI->BasicBlocks.begin(), BBE = FI->BasicBlocks.end();
+         BBI != BBE; ++BBI) {
+      MCBasicBlock *MCBB = MCFN->find(BBI->Address);
+      if (!MCBB)
+        return "Couldn't find matching basic block in function.";
+      for (AddrIt PI = BBI->Preds.begin(), PE = BBI->Preds.end(); PI != PE;
+           ++PI) {
+        MCBasicBlock *Pred = MCFN->find(*PI);
+        if (!Pred)
+          return "Couldn't find predecessor basic block.";
+        MCBB->addPredecessor(Pred);
+      }
+      for (AddrIt SI = BBI->Succs.begin(), SE = BBI->Succs.end(); SI != SE;
+           ++SI) {
+        MCBasicBlock *Succ = MCFN->find(*SI);
+        if (!Succ)
+          return "Couldn't find predecessor basic block.";
+        MCBB->addSuccessor(Succ);
+      }
+    }
+  }
+  return "";
+}
+
+StringRef mcmodule2yaml(raw_ostream &OS, const MCModule &MCM,
+                        const MCInstrInfo &MII, const MCRegisterInfo &MRI) {
+  MCModule2YAML Dumper(MCM);
+  InstrRegInfoHolder IRI(MII, MRI);
+  yaml::Output YOut(OS, (void *)&IRI);
+  YOut << Dumper.getYAMLModule();
+  return "";
+}
+
+StringRef yaml2mcmodule(OwningPtr<MCModule> &MCM, StringRef YamlContent,
+                        const MCInstrInfo &MII, const MCRegisterInfo &MRI) {
+  MCM.reset(new MCModule);
+  YAML2MCModule Parser(*MCM);
+  MCModuleYAML::Module YAMLModule;
+  InstrRegInfoHolder IRI(MII, MRI);
+  yaml::Input YIn(YamlContent, (void *)&IRI);
+  YIn >> YAMLModule;
+  if (error_code ec = YIn.error())
+    return ec.message();
+  StringRef err = Parser.parse(YAMLModule);
+  if (!err.empty())
+    return err;
+  return "";
+}
+
+} // end namespace llvm
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index 530c646..9b9c4aa 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -19,7 +19,7 @@ namespace {
 
   class MCNullStreamer : public MCStreamer {
   public:
-    MCNullStreamer(MCContext &Context) : MCStreamer(SK_NullStreamer, Context) {}
+    MCNullStreamer(MCContext &Context) : MCStreamer(Context, 0) {}
 
     /// @name MCStreamer Interface
     /// @{
@@ -37,7 +37,7 @@ namespace {
     virtual void EmitLabel(MCSymbol *Symbol) {
       assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
       assert(getCurrentSection().first &&"Cannot emit before setting section!");
-      Symbol->setSection(*getCurrentSection().first);
+      AssignSection(Symbol, getCurrentSection().first);
     }
     virtual void EmitDebugLabel(MCSymbol *Symbol) {
       EmitLabel(Symbol);
@@ -52,7 +52,9 @@ namespace {
                                           const MCSymbol *Label,
                                           unsigned PointerSize) {}
 
-    virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute){}
+    virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute){
+      return true;
+    }
 
     virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
 
@@ -107,13 +109,6 @@ namespace {
     virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
       RecordProcEnd(Frame);
     }
-
-    /// @}
-
-    static bool classof(const MCStreamer *S) {
-      return S->getKind() == SK_NullStreamer;
-    }
-
   };
 
 }
diff --git a/lib/MC/MCObjectDisassembler.cpp b/lib/MC/MCObjectDisassembler.cpp
index 1ea6eed..16a110f 100644
--- a/lib/MC/MCObjectDisassembler.cpp
+++ b/lib/MC/MCObjectDisassembler.cpp
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCObjectDisassembler.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
@@ -18,12 +18,15 @@
 #include "llvm/MC/MCFunction.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCModule.h"
+#include "llvm/MC/MCObjectSymbolizer.h"
+#include "llvm/Object/MachO.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MachO.h"
 #include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/StringRefMemoryObject.h"
 #include "llvm/Support/raw_ostream.h"
 #include <map>
-#include <set>
 
 using namespace llvm;
 using namespace object;
@@ -31,10 +34,55 @@ using namespace object;
 MCObjectDisassembler::MCObjectDisassembler(const ObjectFile &Obj,
                                            const MCDisassembler &Dis,
                                            const MCInstrAnalysis &MIA)
-  : Obj(Obj), Dis(Dis), MIA(MIA) {}
+    : Obj(Obj), Dis(Dis), MIA(MIA), MOS(0) {}
 
-MCModule *MCObjectDisassembler::buildModule(bool withCFG) {
+uint64_t MCObjectDisassembler::getEntrypoint() {
+  error_code ec;
+  for (symbol_iterator SI = Obj.begin_symbols(), SE = Obj.end_symbols();
+       SI != SE; SI.increment(ec)) {
+    if (ec)
+      break;
+    StringRef Name;
+    SI->getName(Name);
+    if (Name == "main" || Name == "_main") {
+      uint64_t Entrypoint;
+      SI->getAddress(Entrypoint);
+      return getEffectiveLoadAddr(Entrypoint);
+    }
+  }
+  return 0;
+}
+
+ArrayRef<uint64_t> MCObjectDisassembler::getStaticInitFunctions() {
+  return ArrayRef<uint64_t>();
+}
+
+ArrayRef<uint64_t> MCObjectDisassembler::getStaticExitFunctions() {
+  return ArrayRef<uint64_t>();
+}
+
+MemoryObject *MCObjectDisassembler::getRegionFor(uint64_t Addr) {
+  // FIXME: Keep track of object sections.
+  return FallbackRegion.get();
+}
+
+uint64_t MCObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) {
+  return Addr;
+}
+
+uint64_t MCObjectDisassembler::getOriginalLoadAddr(uint64_t Addr) {
+  return Addr;
+}
+
+MCModule *MCObjectDisassembler::buildEmptyModule() {
   MCModule *Module = new MCModule;
+  Module->Entrypoint = getEntrypoint();
+  return Module;
+}
+
+MCModule *MCObjectDisassembler::buildModule(bool withCFG) {
+  MCModule *Module = buildEmptyModule();
+
   buildSectionAtoms(Module);
   if (withCFG)
     buildCFG(Module);
@@ -58,9 +106,10 @@ void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
     uint64_t SecSize; SI->getSize(SecSize);
     if (StartAddr == UnknownAddressOrSize || SecSize == UnknownAddressOrSize)
       continue;
+    StartAddr = getEffectiveLoadAddr(StartAddr);
 
     StringRef Contents; SI->getContents(Contents);
-    StringRefMemoryObject memoryObject(Contents);
+    StringRefMemoryObject memoryObject(Contents, StartAddr);
 
     // We don't care about things like non-file-backed sections yet.
     if (Contents.size() != SecSize || !SecSize)
@@ -70,19 +119,31 @@ void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
     StringRef SecName; SI->getName(SecName);
 
     if (isText) {
-      MCTextAtom *Text = Module->createTextAtom(StartAddr, EndAddr);
-      Text->setName(SecName);
+      MCTextAtom *Text = 0;
+      MCDataAtom *InvalidData = 0;
+
       uint64_t InstSize;
       for (uint64_t Index = 0; Index < SecSize; Index += InstSize) {
+        const uint64_t CurAddr = StartAddr + Index;
         MCInst Inst;
-        if (Dis.getInstruction(Inst, InstSize, memoryObject, Index,
-                               nulls(), nulls()))
+        if (Dis.getInstruction(Inst, InstSize, memoryObject, CurAddr, nulls(),
+                               nulls())) {
+          if (!Text) {
+            Text = Module->createTextAtom(CurAddr, CurAddr);
+            Text->setName(SecName);
+          }
           Text->addInst(Inst, InstSize);
-        else
-          // We don't care about splitting mixed atoms either.
-          llvm_unreachable("Couldn't disassemble instruction in atom.");
+          InvalidData = 0;
+        } else {
+          assert(InstSize && "getInstruction() consumed no bytes");
+          if (!InvalidData) {
+            Text = 0;
+            InvalidData = Module->createDataAtom(CurAddr, CurAddr+InstSize - 1);
+          }
+          for (uint64_t I = 0; I < InstSize; ++I)
+            InvalidData->addData(Contents[Index+I]);
+        }
       }
-
     } else {
       MCDataAtom *Data = Module->createDataAtom(StartAddr, EndAddr);
       Data->setName(SecName);
@@ -94,13 +155,16 @@ void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
 
 namespace {
   struct BBInfo;
-  typedef std::set<BBInfo*> BBInfoSetTy;
+  typedef SmallPtrSet<BBInfo*, 2> BBInfoSetTy;
 
   struct BBInfo {
     MCTextAtom *Atom;
     MCBasicBlock *BB;
     BBInfoSetTy Succs;
     BBInfoSetTy Preds;
+    MCObjectDisassembler::AddressSetTy SuccAddrs;
+
+    BBInfo() : Atom(0), BB(0) {}
 
     void addSucc(BBInfo &Succ) {
       Succs.insert(&Succ);
@@ -109,13 +173,33 @@ namespace {
   };
 }
 
+static void RemoveDupsFromAddressVector(MCObjectDisassembler::AddressSetTy &V) {
+  std::sort(V.begin(), V.end());
+  V.erase(std::unique(V.begin(), V.end()), V.end());
+}
+
 void MCObjectDisassembler::buildCFG(MCModule *Module) {
   typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
   BBInfoByAddrTy BBInfos;
-  typedef std::set<uint64_t> AddressSetTy;
   AddressSetTy Splits;
   AddressSetTy Calls;
 
+  error_code ec;
+  for (symbol_iterator SI = Obj.begin_symbols(), SE = Obj.end_symbols();
+       SI != SE; SI.increment(ec)) {
+    if (ec)
+      break;
+    SymbolRef::Type SymType;
+    SI->getType(SymType);
+    if (SymType == SymbolRef::ST_Function) {
+      uint64_t SymAddr;
+      SI->getAddress(SymAddr);
+      SymAddr = getEffectiveLoadAddr(SymAddr);
+      Calls.push_back(SymAddr);
+      Splits.push_back(SymAddr);
+    }
+  }
+
   assert(Module->func_begin() == Module->func_end()
          && "Module already has a CFG!");
 
@@ -125,21 +209,24 @@ void MCObjectDisassembler::buildCFG(MCModule *Module) {
        AI != AE; ++AI) {
     MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
     if (!TA) continue;
-    Calls.insert(TA->getBeginAddr());
+    Calls.push_back(TA->getBeginAddr());
     BBInfos[TA->getBeginAddr()].Atom = TA;
     for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
          II != IE; ++II) {
       if (MIA.isTerminator(II->Inst))
-        Splits.insert(II->Address + II->Size);
+        Splits.push_back(II->Address + II->Size);
       uint64_t Target;
       if (MIA.evaluateBranch(II->Inst, II->Address, II->Size, Target)) {
         if (MIA.isCall(II->Inst))
-          Calls.insert(Target);
-        Splits.insert(Target);
+          Calls.push_back(Target);
+        Splits.push_back(Target);
       }
     }
   }
 
+  RemoveDupsFromAddressVector(Splits);
+  RemoveDupsFromAddressVector(Calls);
+
   // Split text atoms into basic block atoms.
   for (AddressSetTy::const_iterator SI = Splits.begin(), SE = Splits.end();
        SI != SE; ++SI) {
@@ -185,8 +272,8 @@ void MCObjectDisassembler::buildCFG(MCModule *Module) {
     // Create MCBBs.
     SmallSetVector<BBInfo*, 16> Worklist;
     Worklist.insert(&BBI);
-    for (size_t WI = 0; WI < Worklist.size(); ++WI) {
-      BBInfo *BBI = Worklist[WI];
+    for (size_t wi = 0; wi < Worklist.size(); ++wi) {
+      BBInfo *BBI = Worklist[wi];
       if (!BBI->Atom)
         continue;
       BBI->BB = &MCFN.createBlock(*BBI->Atom);
@@ -200,17 +287,298 @@ void MCObjectDisassembler::buildCFG(MCModule *Module) {
     }
 
     // Set preds/succs.
-    for (size_t WI = 0; WI < Worklist.size(); ++WI) {
-      BBInfo *BBI = Worklist[WI];
+    for (size_t wi = 0; wi < Worklist.size(); ++wi) {
+      BBInfo *BBI = Worklist[wi];
       MCBasicBlock *MCBB = BBI->BB;
       if (!MCBB)
         continue;
       for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();
-                                 SI != SE; ++SI)
-        MCBB->addSuccessor((*SI)->BB);
+           SI != SE; ++SI)
+        if ((*SI)->BB)
+          MCBB->addSuccessor((*SI)->BB);
       for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();
-                                 PI != PE; ++PI)
-        MCBB->addPredecessor((*PI)->BB);
+           PI != PE; ++PI)
+        if ((*PI)->BB)
+          MCBB->addPredecessor((*PI)->BB);
+    }
+  }
+}
+
+// Basic idea of the disassembly + discovery:
+//
+// start with the wanted address, insert it in the worklist
+// while worklist not empty, take next address in the worklist:
+// - check if atom exists there
+//   - if middle of atom:
+//     - split basic blocks referencing the atom
+//     - look for an already encountered BBInfo (using a map<atom, bbinfo>)
+//       - if there is, split it (new one, fallthrough, move succs, etc..)
+//   - if start of atom: nothing else to do
+//   - if no atom: create new atom and new bbinfo
+// - look at the last instruction in the atom, add succs to worklist
+// for all elements in the worklist:
+// - create basic block, update preds/succs, etc..
+//
+MCBasicBlock *MCObjectDisassembler::getBBAt(MCModule *Module, MCFunction *MCFN,
+                                            uint64_t BBBeginAddr,
+                                            AddressSetTy &CallTargets,
+                                            AddressSetTy &TailCallTargets) {
+  typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
+  typedef SmallSetVector<uint64_t, 16> AddrWorklistTy;
+  BBInfoByAddrTy BBInfos;
+  AddrWorklistTy Worklist;
+
+  Worklist.insert(BBBeginAddr);
+  for (size_t wi = 0; wi < Worklist.size(); ++wi) {
+    const uint64_t BeginAddr = Worklist[wi];
+    BBInfo *BBI = &BBInfos[BeginAddr];
+
+    MCTextAtom *&TA = BBI->Atom;
+    assert(!TA && "Discovered basic block already has an associated atom!");
+
+    // Look for an atom at BeginAddr.
+    if (MCAtom *A = Module->findAtomContaining(BeginAddr)) {
+      // FIXME: We don't care about mixed atoms, see above.
+      TA = cast<MCTextAtom>(A);
+
+      // The found atom doesn't begin at BeginAddr, we have to split it.
+      if (TA->getBeginAddr() != BeginAddr) {
+        // FIXME: Handle overlapping atoms: middle-starting instructions, etc..
+        MCTextAtom *NewTA = TA->split(BeginAddr);
+
+        // Look for an already encountered basic block that needs splitting
+        BBInfoByAddrTy::iterator It = BBInfos.find(TA->getBeginAddr());
+        if (It != BBInfos.end() && It->second.Atom) {
+          BBI->SuccAddrs = It->second.SuccAddrs;
+          It->second.SuccAddrs.clear();
+          It->second.SuccAddrs.push_back(BeginAddr);
+        }
+        TA = NewTA;
+      }
+      BBI->Atom = TA;
+    } else {
+      // If we didn't find an atom, then we have to disassemble to create one!
+
+      MemoryObject *Region = getRegionFor(BeginAddr);
+      if (!Region)
+        llvm_unreachable(("Couldn't find suitable region for disassembly at " +
+                          utostr(BeginAddr)).c_str());
+
+      uint64_t InstSize;
+      uint64_t EndAddr = Region->getBase() + Region->getExtent();
+
+      // We want to stop before the next atom and have a fallthrough to it.
+      if (MCTextAtom *NextAtom =
+              cast_or_null<MCTextAtom>(Module->findFirstAtomAfter(BeginAddr)))
+        EndAddr = std::min(EndAddr, NextAtom->getBeginAddr());
+
+      for (uint64_t Addr = BeginAddr; Addr < EndAddr; Addr += InstSize) {
+        MCInst Inst;
+        if (Dis.getInstruction(Inst, InstSize, *Region, Addr, nulls(),
+                               nulls())) {
+          if (!TA)
+            TA = Module->createTextAtom(Addr, Addr);
+          TA->addInst(Inst, InstSize);
+        } else {
+          // We don't care about splitting mixed atoms either.
+          llvm_unreachable("Couldn't disassemble instruction in atom.");
+        }
+
+        uint64_t BranchTarget;
+        if (MIA.evaluateBranch(Inst, Addr, InstSize, BranchTarget)) {
+          if (MIA.isCall(Inst))
+            CallTargets.push_back(BranchTarget);
+        }
+
+        if (MIA.isTerminator(Inst))
+          break;
+      }
+      BBI->Atom = TA;
+    }
+
+    assert(TA && "Couldn't disassemble atom, none was created!");
+    assert(TA->begin() != TA->end() && "Empty atom!");
+
+    MemoryObject *Region = getRegionFor(TA->getBeginAddr());
+    assert(Region && "Couldn't find region for already disassembled code!");
+    uint64_t EndRegion = Region->getBase() + Region->getExtent();
+
+    // Now we have a basic block atom, add successors.
+    // Add the fallthrough block.
+    if ((MIA.isConditionalBranch(TA->back().Inst) ||
+         !MIA.isTerminator(TA->back().Inst)) &&
+        (TA->getEndAddr() + 1 < EndRegion)) {
+      BBI->SuccAddrs.push_back(TA->getEndAddr() + 1);
+      Worklist.insert(TA->getEndAddr() + 1);
+    }
+
+    // If the terminator is a branch, add the target block.
+    if (MIA.isBranch(TA->back().Inst)) {
+      uint64_t BranchTarget;
+      if (MIA.evaluateBranch(TA->back().Inst, TA->back().Address,
+                             TA->back().Size, BranchTarget)) {
+        StringRef ExtFnName;
+        if (MOS)
+          ExtFnName =
+              MOS->findExternalFunctionAt(getOriginalLoadAddr(BranchTarget));
+        if (!ExtFnName.empty()) {
+          TailCallTargets.push_back(BranchTarget);
+          CallTargets.push_back(BranchTarget);
+        } else {
+          BBI->SuccAddrs.push_back(BranchTarget);
+          Worklist.insert(BranchTarget);
+        }
+      }
+    }
+  }
+
+  for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) {
+    const uint64_t BeginAddr = Worklist[wi];
+    BBInfo *BBI = &BBInfos[BeginAddr];
+
+    assert(BBI->Atom && "Found a basic block without an associated atom!");
+
+    // Look for a basic block at BeginAddr.
+    BBI->BB = MCFN->find(BeginAddr);
+    if (BBI->BB) {
+      // FIXME: check that the succs/preds are the same
+      continue;
+    }
+    // If there was none, we have to create one from the atom.
+    BBI->BB = &MCFN->createBlock(*BBI->Atom);
+  }
+
+  for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) {
+    const uint64_t BeginAddr = Worklist[wi];
+    BBInfo *BBI = &BBInfos[BeginAddr];
+    MCBasicBlock *BB = BBI->BB;
+
+    RemoveDupsFromAddressVector(BBI->SuccAddrs);
+    for (AddressSetTy::const_iterator SI = BBI->SuccAddrs.begin(),
+         SE = BBI->SuccAddrs.end();
+         SE != SE; ++SI) {
+      MCBasicBlock *Succ = BBInfos[*SI].BB;
+      BB->addSuccessor(Succ);
+      Succ->addPredecessor(BB);
+    }
+  }
+
+  assert(BBInfos[Worklist[0]].BB &&
+         "No basic block created at requested address?");
+
+  return BBInfos[Worklist[0]].BB;
+}
+
+MCFunction *
+MCObjectDisassembler::createFunction(MCModule *Module, uint64_t BeginAddr,
+                                     AddressSetTy &CallTargets,
+                                     AddressSetTy &TailCallTargets) {
+  // First, check if this is an external function.
+  StringRef ExtFnName;
+  if (MOS)
+    ExtFnName = MOS->findExternalFunctionAt(getOriginalLoadAddr(BeginAddr));
+  if (!ExtFnName.empty())
+    return Module->createFunction(ExtFnName);
+
+  // If it's not, look for an existing function.
+  for (MCModule::func_iterator FI = Module->func_begin(),
+                               FE = Module->func_end();
+       FI != FE; ++FI) {
+    if ((*FI)->empty())
+      continue;
+    // FIXME: MCModule should provide a findFunctionByAddr()
+    if ((*FI)->getEntryBlock()->getInsts()->getBeginAddr() == BeginAddr)
+      return *FI;
+  }
+
+  // Finally, just create a new one.
+  MCFunction *MCFN = Module->createFunction("");
+  getBBAt(Module, MCFN, BeginAddr, CallTargets, TailCallTargets);
+  return MCFN;
+}
+
+// MachO MCObjectDisassembler implementation.
+
+MCMachOObjectDisassembler::MCMachOObjectDisassembler(
+    const MachOObjectFile &MOOF, const MCDisassembler &Dis,
+    const MCInstrAnalysis &MIA, uint64_t VMAddrSlide,
+    uint64_t HeaderLoadAddress)
+    : MCObjectDisassembler(MOOF, Dis, MIA), MOOF(MOOF),
+      VMAddrSlide(VMAddrSlide), HeaderLoadAddress(HeaderLoadAddress) {
+
+  error_code ec;
+  for (section_iterator SI = MOOF.begin_sections(), SE = MOOF.end_sections();
+       SI != SE; SI.increment(ec)) {
+    if (ec)
+      break;
+    StringRef Name;
+    SI->getName(Name);
+    // FIXME: We should use the S_ section type instead of the name.
+    if (Name == "__mod_init_func") {
+      DEBUG(dbgs() << "Found __mod_init_func section!\n");
+      SI->getContents(ModInitContents);
+    } else if (Name == "__mod_exit_func") {
+      DEBUG(dbgs() << "Found __mod_exit_func section!\n");
+      SI->getContents(ModExitContents);
+    }
+  }
+}
+
+// FIXME: Only do the translations for addresses actually inside the object.
+uint64_t MCMachOObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) {
+  return Addr + VMAddrSlide;
+}
+
+uint64_t
+MCMachOObjectDisassembler::getOriginalLoadAddr(uint64_t EffectiveAddr) {
+  return EffectiveAddr - VMAddrSlide;
+}
+
+uint64_t MCMachOObjectDisassembler::getEntrypoint() {
+  uint64_t EntryFileOffset = 0;
+
+  // Look for LC_MAIN.
+  {
+    uint32_t LoadCommandCount = MOOF.getHeader().ncmds;
+    MachOObjectFile::LoadCommandInfo Load = MOOF.getFirstLoadCommandInfo();
+    for (unsigned I = 0;; ++I) {
+      if (Load.C.cmd == MachO::LC_MAIN) {
+        EntryFileOffset =
+            ((const MachO::entry_point_command *)Load.Ptr)->entryoff;
+        break;
+      }
+
+      if (I == LoadCommandCount - 1)
+        break;
+      else
+        Load = MOOF.getNextLoadCommandInfo(Load);
     }
   }
+
+  // If we didn't find anything, default to the common implementation.
+  // FIXME: Maybe we could also look at LC_UNIXTHREAD and friends?
+  if (EntryFileOffset)
+    return MCObjectDisassembler::getEntrypoint();
+
+  return EntryFileOffset + HeaderLoadAddress;
+}
+
+ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticInitFunctions() {
+  // FIXME: We only handle 64bit mach-o
+  assert(MOOF.is64Bit());
+
+  size_t EntrySize = 8;
+  size_t EntryCount = ModInitContents.size() / EntrySize;
+  return ArrayRef<uint64_t>(
+      reinterpret_cast<const uint64_t *>(ModInitContents.data()), EntryCount);
+}
+
+ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticExitFunctions() {
+  // FIXME: We only handle 64bit mach-o
+  assert(MOOF.is64Bit());
+
+  size_t EntrySize = 8;
+  size_t EntryCount = ModExitContents.size() / EntrySize;
+  return ArrayRef<uint64_t>(
+      reinterpret_cast<const uint64_t *>(ModExitContents.data()), EntryCount);
 }
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index bcf52d2..8ef4a0a 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -39,6 +39,9 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
     = Ctx->getMachOSection("__DATA", "__data", 0,
                            SectionKind::getDataRel());
 
+  // BSSSection might not be expected initialized on msvc.
+  BSSSection = 0;
+
   TLSDataSection // .tdata
     = Ctx->getMachOSection("__DATA", "__thread_data",
                            MCSectionMachO::S_THREAD_LOCAL_REGULAR,
@@ -199,6 +202,14 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
     Ctx->getMachOSection("__DWARF", "__debug_pubtypes",
                          MCSectionMachO::S_ATTR_DEBUG,
                          SectionKind::getMetadata());
+  DwarfGnuPubNamesSection =
+    Ctx->getMachOSection("__DWARF", "__debug_gnu_pubn",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfGnuPubTypesSection =
+    Ctx->getMachOSection("__DWARF", "__debug_gnu_pubt",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
   DwarfStrSection =
     Ctx->getMachOSection("__DWARF", "__debug_str",
                          MCSectionMachO::S_ATTR_DEBUG,
@@ -223,6 +234,9 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
     Ctx->getMachOSection("__DWARF", "__debug_inlined",
                          MCSectionMachO::S_ATTR_DEBUG,
                          SectionKind::getMetadata());
+  StackMapSection =
+    Ctx->getMachOSection("__LLVM_STACKMAPS", "__llvm_stackmaps", 0,
+                         SectionKind::getMetadata());
 
   TLSExtraDataSection = TLSTLVSection;
 }
@@ -435,6 +449,12 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
   DwarfPubTypesSection =
     Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0,
                        SectionKind::getMetadata());
+  DwarfGnuPubNamesSection =
+    Ctx->getELFSection(".debug_gnu_pubnames", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfGnuPubTypesSection =
+    Ctx->getELFSection(".debug_gnu_pubtypes", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
   DwarfStrSection =
     Ctx->getELFSection(".debug_str", ELF::SHT_PROGBITS,
                        ELF::SHF_MERGE | ELF::SHF_STRINGS,
@@ -496,6 +516,12 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
 
 void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
   // COFF
+  BSSSection =
+    Ctx->getCOFFSection(".bss",
+                        COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ |
+                        COFF::IMAGE_SCN_MEM_WRITE,
+                        SectionKind::getBSS());
   TextSection =
     Ctx->getCOFFSection(".text",
                         COFF::IMAGE_SCN_CNT_CODE |
@@ -585,6 +611,16 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
+  DwarfGnuPubNamesSection =
+    Ctx->getCOFFSection(".debug_gnu_pubnames",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfGnuPubTypesSection =
+    Ctx->getCOFFSection(".debug_gnu_pubtypes",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfStrSection =
     Ctx->getCOFFSection(".debug_str",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index 36a923a..bc14c2a 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -22,19 +22,22 @@
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
-MCObjectStreamer::MCObjectStreamer(StreamerKind Kind, MCContext &Context,
+MCObjectStreamer::MCObjectStreamer(MCContext &Context,
+                                   MCTargetStreamer *TargetStreamer,
                                    MCAsmBackend &TAB, raw_ostream &OS,
                                    MCCodeEmitter *Emitter_)
-    : MCStreamer(Kind, Context),
+    : MCStreamer(Context, TargetStreamer),
       Assembler(new MCAssembler(Context, TAB, *Emitter_,
                                 *TAB.createObjectWriter(OS), OS)),
       CurSectionData(0) {}
 
-MCObjectStreamer::MCObjectStreamer(StreamerKind Kind, MCContext &Context,
+MCObjectStreamer::MCObjectStreamer(MCContext &Context,
+                                   MCTargetStreamer *TargetStreamer,
                                    MCAsmBackend &TAB, raw_ostream &OS,
                                    MCCodeEmitter *Emitter_,
                                    MCAssembler *_Assembler)
-    : MCStreamer(Kind, Context), Assembler(_Assembler), CurSectionData(0) {}
+    : MCStreamer(Context, TargetStreamer), Assembler(_Assembler),
+      CurSectionData(0) {}
 
 MCObjectStreamer::~MCObjectStreamer() {
   delete &Assembler->getBackend();
@@ -302,6 +305,7 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
 }
 
 void MCObjectStreamer::EmitBytes(StringRef Data) {
+  MCLineEntry::Make(this, getCurrentSection().first);
   getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
 }
 
diff --git a/lib/MC/MCObjectSymbolizer.cpp b/lib/MC/MCObjectSymbolizer.cpp
index aa7648e..b9131d1 100644
--- a/lib/MC/MCObjectSymbolizer.cpp
+++ b/lib/MC/MCObjectSymbolizer.cpp
@@ -15,7 +15,7 @@
 #include "llvm/MC/MCRelocationInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Object/MachO.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 
@@ -26,100 +26,127 @@ using namespace object;
 
 namespace {
 class MCMachObjectSymbolizer : public MCObjectSymbolizer {
+  const MachOObjectFile *MOOF;
+  // __TEXT;__stubs support.
+  uint64_t StubsStart;
+  uint64_t StubsCount;
+  uint64_t StubSize;
+  uint64_t StubsIndSymIndex;
+
 public:
   MCMachObjectSymbolizer(MCContext &Ctx, OwningPtr<MCRelocationInfo> &RelInfo,
-                         const object::MachOObjectFile *MachOOF)
-    : MCObjectSymbolizer(Ctx, RelInfo, MachOOF)
-  {}
+                         const MachOObjectFile *MOOF);
+
+  StringRef findExternalFunctionAt(uint64_t Addr) LLVM_OVERRIDE;
 
   void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
-                                       int64_t Value, uint64_t Address) {
-    AddrToRelocMap::iterator RI = AddrToReloc.find(Address);
-    if (RI != AddrToReloc.end()) {
-      const MCExpr *RelExpr = RelInfo->createExprForRelocation(RI->second);
-      if (!RelExpr || RelExpr->EvaluateAsAbsolute(Value) == false)
-        return;
-    }
-    uint64_t Addr = Value;
-    SortedSectionList::const_iterator SI = findSectionContaining(Addr);
-    if (SI != SortedSections.end()) {
-      const SectionRef &S = *SI;
-      StringRef Name; S.getName(Name);
-      uint64_t SAddr; S.getAddress(SAddr);
-      if (Name == "__cstring") {
-        StringRef Contents;
-        S.getContents(Contents);
-        Contents = Contents.substr(Addr - SAddr);
-        cStream << " ## literal pool for: "
-                << Contents.substr(0, Contents.find_first_of(0));
-      }
-    }
-  }
+                                       int64_t Value,
+                                       uint64_t Address) LLVM_OVERRIDE;
 };
 } // End unnamed namespace
 
-//===- MCObjectSymbolizer -------------------------------------------------===//
 
-MCObjectSymbolizer::MCObjectSymbolizer(MCContext &Ctx,
-                                       OwningPtr<MCRelocationInfo> &RelInfo,
-                                       const ObjectFile *Obj)
-    : MCSymbolizer(Ctx, RelInfo), Obj(Obj), SortedSections(), AddrToReloc() {
+MCMachObjectSymbolizer::
+MCMachObjectSymbolizer(MCContext &Ctx, OwningPtr<MCRelocationInfo> &RelInfo,
+                       const MachOObjectFile *MOOF)
+    : MCObjectSymbolizer(Ctx, RelInfo, MOOF), MOOF(MOOF),
+      StubsStart(0), StubsCount(0), StubSize(0), StubsIndSymIndex(0) {
+
   error_code ec;
-  for (section_iterator SI = Obj->begin_sections(),
-                        SE = Obj->end_sections();
-                        SI != SE;
-                        SI.increment(ec)) {
+  for (section_iterator SI = MOOF->begin_sections(), SE = MOOF->end_sections();
+       SI != SE; SI.increment(ec)) {
     if (ec) break;
-
-    section_iterator RelSecI = SI->getRelocatedSection();
-    if (RelSecI == Obj->end_sections())
-      continue;
-
-    uint64_t StartAddr; RelSecI->getAddress(StartAddr);
-    uint64_t Size; RelSecI->getSize(Size);
-    bool RequiredForExec; RelSecI->isRequiredForExecution(RequiredForExec);
-    if (RequiredForExec == false || Size == 0)
-      continue;
-    insertSection(*SI);
-    for (relocation_iterator RI = SI->begin_relocations(),
-                             RE = SI->end_relocations();
-                             RI != RE;
-                             RI.increment(ec)) {
-      if (ec) break;
-      // FIXME: libObject is inconsistent regarding error handling. The
-      // overwhelming majority of methods always return object_error::success,
-      // and assert for simple errors.. Here, ELFObjectFile::getRelocationOffset
-      // asserts when the file type isn't ET_REL.
-      // This workaround handles x86-64 elf, the only one that has a relocinfo.
-      uint64_t Offset;
-      if (Obj->isELF()) {
-        const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj);
-        if (ELFObj == 0)
-          break;
-        if (ELFObj->getElfHeader()->e_type == ELF::ET_REL) {
-          RI->getOffset(Offset);
-          Offset += StartAddr;
-        } else {
-          RI->getAddress(Offset);
-        }
+    StringRef Name; SI->getName(Name);
+    if (Name == "__stubs") {
+      SectionRef StubsSec = *SI;
+      if (MOOF->is64Bit()) {
+        MachO::section_64 S = MOOF->getSection64(StubsSec.getRawDataRefImpl());
+        StubsIndSymIndex = S.reserved1;
+        StubSize = S.reserved2;
       } else {
-        RI->getOffset(Offset);
-        Offset += StartAddr;
+        MachO::section S = MOOF->getSection(StubsSec.getRawDataRefImpl());
+        StubsIndSymIndex = S.reserved1;
+        StubSize = S.reserved2;
       }
-      // At a specific address, only keep the first relocation.
-      if (AddrToReloc.find(Offset) == AddrToReloc.end())
-        AddrToReloc[Offset] = *RI;
+      assert(StubSize && "Mach-O stub entry size can't be zero!");
+      StubsSec.getAddress(StubsStart);
+      StubsSec.getSize(StubsCount);
+      StubsCount /= StubSize;
+    }
+  }
+}
+
+StringRef MCMachObjectSymbolizer::findExternalFunctionAt(uint64_t Addr) {
+  // FIXME: also, this can all be done at the very beginning, by iterating over
+  // all stubs and creating the calls to outside functions. Is it worth it
+  // though?
+  if (!StubSize)
+    return StringRef();
+  uint64_t StubIdx = (Addr - StubsStart) / StubSize;
+  if (StubIdx >= StubsCount)
+    return StringRef();
+
+  uint32_t SymtabIdx =
+    MOOF->getIndirectSymbolTableEntry(MOOF->getDysymtabLoadCommand(), StubIdx);
+
+  StringRef SymName;
+  symbol_iterator SI = MOOF->begin_symbols();
+  error_code ec;
+  for (uint32_t i = 0; i != SymtabIdx; ++i) {
+    SI.increment(ec);
+  }
+  SI->getName(SymName);
+  assert(SI != MOOF->end_symbols() && "Stub wasn't found in the symbol table!");
+  assert(SymName.front() == '_' && "Mach-O symbol doesn't start with '_'!");
+  return SymName.substr(1);
+}
+
+void MCMachObjectSymbolizer::
+tryAddingPcLoadReferenceComment(raw_ostream &cStream, int64_t Value,
+                                uint64_t Address) {
+  if (const RelocationRef *R = findRelocationAt(Address)) {
+    const MCExpr *RelExpr = RelInfo->createExprForRelocation(*R);
+    if (!RelExpr || RelExpr->EvaluateAsAbsolute(Value) == false)
+      return;
+  }
+  uint64_t Addr = Value;
+  if (const SectionRef *S = findSectionContaining(Addr)) {
+    StringRef Name; S->getName(Name);
+    uint64_t SAddr; S->getAddress(SAddr);
+    if (Name == "__cstring") {
+      StringRef Contents;
+      S->getContents(Contents);
+      Contents = Contents.substr(Addr - SAddr);
+      cStream << " ## literal pool for: "
+              << Contents.substr(0, Contents.find_first_of(0));
     }
   }
 }
 
+//===- MCObjectSymbolizer -------------------------------------------------===//
+
+MCObjectSymbolizer::MCObjectSymbolizer(MCContext &Ctx,
+                                       OwningPtr<MCRelocationInfo> &RelInfo,
+                                       const ObjectFile *Obj)
+    : MCSymbolizer(Ctx, RelInfo), Obj(Obj), SortedSections(), AddrToReloc() {
+}
+
 bool MCObjectSymbolizer::
 tryAddingSymbolicOperand(MCInst &MI, raw_ostream &cStream,
                          int64_t Value, uint64_t Address, bool IsBranch,
                          uint64_t Offset, uint64_t InstSize) {
-  AddrToRelocMap::iterator RI = AddrToReloc.find(Address + Offset);
-  if (RI != AddrToReloc.end()) {
-    if (const MCExpr *RelExpr = RelInfo->createExprForRelocation(RI->second)) {
+  if (IsBranch) {
+    StringRef ExtFnName = findExternalFunctionAt((uint64_t)Value);
+    if (!ExtFnName.empty()) {
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(ExtFnName);
+      const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      MI.addOperand(MCOperand::CreateExpr(Expr));
+      return true;
+    }
+  }
+
+  if (const RelocationRef *R = findRelocationAt(Address + Offset)) {
+    if (const MCExpr *RelExpr = RelInfo->createExprForRelocation(*R)) {
       MI.addOperand(MCOperand::CreateExpr(RelExpr));
       return true;
     }
@@ -133,10 +160,8 @@ tryAddingSymbolicOperand(MCInst &MI, raw_ostream &cStream,
   uint64_t UValue = Value;
   // FIXME: map instead of looping each time?
   error_code ec;
-  for (symbol_iterator SI = Obj->begin_symbols(),
-       SE = Obj->end_symbols();
-       SI != SE;
-       SI.increment(ec)) {
+  for (symbol_iterator SI = Obj->begin_symbols(), SE = Obj->end_symbols();
+       SI != SE; SI.increment(ec)) {
     if (ec) break;
     uint64_t SymAddr; SI->getAddress(SymAddr);
     uint64_t SymSize; SI->getSize(SymSize);
@@ -166,13 +191,16 @@ tryAddingPcLoadReferenceComment(raw_ostream &cStream,
                                 int64_t Value, uint64_t Address) {
 }
 
+StringRef MCObjectSymbolizer::findExternalFunctionAt(uint64_t Addr) {
+  return StringRef();
+}
+
 MCObjectSymbolizer *
 MCObjectSymbolizer::createObjectSymbolizer(MCContext &Ctx,
                                            OwningPtr<MCRelocationInfo> &RelInfo,
                                            const ObjectFile *Obj) {
-  if (const MachOObjectFile *MachOOF = dyn_cast<MachOObjectFile>(Obj)) {
-    return new MCMachObjectSymbolizer(Ctx, RelInfo, MachOOF);
-  }
+  if (const MachOObjectFile *MOOF = dyn_cast<MachOObjectFile>(Obj))
+    return new MCMachObjectSymbolizer(Ctx, RelInfo, MOOF);
   return new MCObjectSymbolizer(Ctx, RelInfo, Obj);
 }
 
@@ -183,32 +211,100 @@ static bool SectionStartsBefore(const SectionRef &S, uint64_t Addr) {
   return SAddr < Addr;
 }
 
-MCObjectSymbolizer::SortedSectionList::const_iterator
-MCObjectSymbolizer::findSectionContaining(uint64_t Addr) const {
-  SortedSectionList::const_iterator
+const SectionRef *MCObjectSymbolizer::findSectionContaining(uint64_t Addr) {
+  if (SortedSections.empty())
+    buildSectionList();
+
+  SortedSectionList::iterator
     EndIt = SortedSections.end(),
     It = std::lower_bound(SortedSections.begin(), EndIt,
                           Addr, SectionStartsBefore);
   if (It == EndIt)
-    return It;
+    return 0;
   uint64_t SAddr; It->getAddress(SAddr);
   uint64_t SSize; It->getSize(SSize);
   if (Addr >= SAddr + SSize)
-    return EndIt;
-  return It;
+    return 0;
+  return &*It;
+}
+
+const RelocationRef *MCObjectSymbolizer::findRelocationAt(uint64_t Addr) {
+  if (AddrToReloc.empty())
+    buildRelocationByAddrMap();
+
+  AddrToRelocMap::const_iterator RI = AddrToReloc.find(Addr);
+  if (RI == AddrToReloc.end())
+    return 0;
+  return &RI->second;
+}
+
+void MCObjectSymbolizer::buildSectionList() {
+  error_code ec;
+  for (section_iterator SI = Obj->begin_sections(), SE = Obj->end_sections();
+                        SI != SE; SI.increment(ec)) {
+    if (ec) break;
+
+    bool RequiredForExec; SI->isRequiredForExecution(RequiredForExec);
+    if (RequiredForExec == false)
+      continue;
+    uint64_t SAddr; SI->getAddress(SAddr);
+    uint64_t SSize; SI->getSize(SSize);
+    SortedSectionList::iterator It = std::lower_bound(SortedSections.begin(),
+                                                      SortedSections.end(),
+                                                      SAddr,
+                                                      SectionStartsBefore);
+    if (It != SortedSections.end()) {
+      uint64_t FoundSAddr; It->getAddress(FoundSAddr);
+      if (FoundSAddr < SAddr + SSize)
+        llvm_unreachable("Inserting overlapping sections");
+    }
+    SortedSections.insert(It, *SI);
+  }
 }
 
-void MCObjectSymbolizer::insertSection(SectionRef Sec) {
-  uint64_t SAddr; Sec.getAddress(SAddr);
-  uint64_t SSize; Sec.getSize(SSize);
-  SortedSectionList::iterator It = std::lower_bound(SortedSections.begin(),
-                                                    SortedSections.end(),
-                                                    SAddr,
-                                                    SectionStartsBefore);
-  if (It != SortedSections.end()) {
-    uint64_t FoundSAddr; It->getAddress(FoundSAddr);
-    if (FoundSAddr < SAddr + SSize)
-      llvm_unreachable("Inserting overlapping sections");
+void MCObjectSymbolizer::buildRelocationByAddrMap() {
+  error_code ec;
+  for (section_iterator SI = Obj->begin_sections(), SE = Obj->end_sections();
+                        SI != SE; SI.increment(ec)) {
+    if (ec) break;
+
+    section_iterator RelSecI = SI->getRelocatedSection();
+    if (RelSecI == Obj->end_sections())
+      continue;
+
+    uint64_t StartAddr; RelSecI->getAddress(StartAddr);
+    uint64_t Size; RelSecI->getSize(Size);
+    bool RequiredForExec; RelSecI->isRequiredForExecution(RequiredForExec);
+    if (RequiredForExec == false || Size == 0)
+      continue;
+    for (relocation_iterator RI = SI->begin_relocations(),
+                             RE = SI->end_relocations();
+                             RI != RE;
+                             RI.increment(ec)) {
+      if (ec) break;
+      // FIXME: libObject is inconsistent regarding error handling. The
+      // overwhelming majority of methods always return object_error::success,
+      // and assert for simple errors.. Here, ELFObjectFile::getRelocationOffset
+      // asserts when the file type isn't ET_REL.
+      // This workaround handles x86-64 elf, the only one that has a relocinfo.
+      uint64_t Offset;
+      if (Obj->isELF()) {
+        const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj);
+        if (ELFObj == 0)
+          break;
+        if (ELFObj->getELFFile()->getHeader()->e_type == ELF::ET_REL) {
+          RI->getOffset(Offset);
+          Offset += StartAddr;
+        } else {
+          RI->getAddress(Offset);
+        }
+      } else {
+        RI->getOffset(Offset);
+        Offset += StartAddr;
+      }
+      // At a specific address, only keep the first relocation.
+      if (AddrToReloc.find(Offset) == AddrToReloc.end())
+        AddrToReloc[Offset] = *RI;
+    }
   }
-  SortedSections.insert(It, Sec);
 }
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index c1c594a..b49dd01 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -91,9 +91,56 @@ AsmToken AsmLexer::LexFloatLiteral() {
                   StringRef(TokStart, CurPtr - TokStart));
 }
 
-/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
+/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
+/// while making sure there are enough actual digits around for the constant to
+/// be valid.
+///
+/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
+/// before we get here.
+AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
+  assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
+         "unexpected parse state in floating hex");
+  bool NoFracDigits = true;
+
+  // Skip the fractional part if there is one
+  if (*CurPtr == '.') {
+    ++CurPtr;
+
+    const char *FracStart = CurPtr;
+    while (isxdigit(*CurPtr))
+      ++CurPtr;
+
+    NoFracDigits = CurPtr == FracStart;
+  }
+
+  if (NoIntDigits && NoFracDigits)
+    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
+                                 "expected at least one significand digit");
+
+  // Make sure we do have some kind of proper exponent part
+  if (*CurPtr != 'p' && *CurPtr != 'P')
+    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
+                                 "expected exponent part 'p'");
+  ++CurPtr;
+
+  if (*CurPtr == '+' || *CurPtr == '-')
+    ++CurPtr;
+
+  // N.b. exponent digits are *not* hex
+  const char *ExpStart = CurPtr;
+  while (isdigit(*CurPtr))
+    ++CurPtr;
+
+  if (CurPtr == ExpStart)
+    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
+                                 "expected at least one exponent digit");
+
+  return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
+}
+
+/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
 static bool IsIdentifierChar(char c) {
-  return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@';
+  return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@' || c == '?';
 }
 AsmToken AsmLexer::LexIdentifier() {
   // Check for floating point literals.
@@ -265,7 +312,12 @@ AsmToken AsmLexer::LexDigit() {
     while (isxdigit(CurPtr[0]))
       ++CurPtr;
 
-    // Requires at least one hex digit.
+    // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
+    // diagnosed by LexHexFloatLiteral).
+    if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
+      return LexHexFloatLiteral(NumStart == CurPtr);
+
+    // Otherwise requires at least one hex digit.
     if (CurPtr == NumStart)
       return ReturnError(CurPtr-2, "invalid hexadecimal number");
 
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index dd0d181..a91bd93 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -94,13 +94,13 @@ public:
 };
 
 struct ParseStatementInfo {
-  /// ParsedOperands - The parsed operands from the last parsed statement.
+  /// \brief The parsed operands from the last parsed statement.
   SmallVector<MCParsedAsmOperand*, 8> ParsedOperands;
 
-  /// Opcode - The opcode from the last parsed instruction.
+  /// \brief The opcode from the last parsed instruction.
   unsigned Opcode;
 
-  /// Error - Was there an error parsing the inline assembly?
+  /// \brief Was there an error parsing the inline assembly?
   bool ParseError;
 
   SmallVectorImpl<AsmRewrite> *AsmRewrites;
@@ -138,18 +138,18 @@ private:
   AsmCond TheCondState;
   std::vector<AsmCond> TheCondStack;
 
-  /// ExtensionDirectiveMap - maps directive names to handler methods in parser
+  /// \brief maps directive names to handler methods in parser
   /// extensions. Extensions register themselves in this map by calling
   /// addDirectiveHandler.
   StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;
 
-  /// MacroMap - Map of currently defined macros.
+  /// \brief Map of currently defined macros.
   StringMap<MCAsmMacro*> MacroMap;
 
-  /// ActiveMacros - Stack of active macro instantiations.
+  /// \brief Stack of active macro instantiations.
   std::vector<MacroInstantiation*> ActiveMacros;
 
-  /// MacroLikeBodies - List of bodies of anonymous macros.
+  /// \brief List of bodies of anonymous macros.
   std::deque<MCAsmMacro> MacroLikeBodies;
 
   /// Boolean tracking whether macro substitution is enabled.
@@ -165,7 +165,7 @@ private:
   int CppHashBuf;
   /// When generating dwarf for assembly source files we need to calculate the
   /// logical line number based on the last parsed cpp hash file line comment
-  /// and current line. Since this is slow and messes up the SourceMgr's 
+  /// and current line. Since this is slow and messes up the SourceMgr's
   /// cache we save the last info we queried with SrcMgr.FindLineNumber().
   SMLoc LastQueryIDLoc;
   int LastQueryBuffer;
@@ -174,10 +174,10 @@ private:
   /// AssemblerDialect. ~OU means unset value and use value provided by MAI.
   unsigned AssemblerDialect;
 
-  /// IsDarwin - is Darwin compatibility enabled?
+  /// \brief is Darwin compatibility enabled?
   bool IsDarwin;
 
-  /// ParsingInlineAsm - Are we parsing ms-style inline assembly?
+  /// \brief Are we parsing ms-style inline assembly?
   bool ParsingInlineAsm;
 
 public:
@@ -235,7 +235,7 @@ public:
   virtual bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc);
   virtual bool parseAbsoluteExpression(int64_t &Res);
 
-  /// parseIdentifier - Parse an identifier or string (as a quoted identifier)
+  /// \brief Parse an identifier or string (as a quoted identifier)
   /// and set \p Res to the identifier contents.
   virtual bool parseIdentifier(StringRef &Res);
   virtual void eatToEndOfStatement();
@@ -245,11 +245,11 @@ public:
 
 private:
 
-  bool ParseStatement(ParseStatementInfo &Info);
-  void EatToEndOfLine();
-  bool ParseCppHashLineFilenameComment(const SMLoc &L);
+  bool parseStatement(ParseStatementInfo &Info);
+  void eatToEndOfLine();
+  bool parseCppHashLineFilenameComment(const SMLoc &L);
 
-  void CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
+  void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
                         MCAsmMacroParameters Parameters);
   bool expandMacro(raw_svector_ostream &OS, StringRef Body,
                    const MCAsmMacroParameters &Parameters,
@@ -257,55 +257,56 @@ private:
                    const SMLoc &L);
 
   /// \brief Are macros enabled in the parser?
-  bool MacrosEnabled() {return MacrosEnabledFlag;}
+  bool areMacrosEnabled() {return MacrosEnabledFlag;}
 
   /// \brief Control a flag in the parser that enables or disables macros.
-  void SetMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
+  void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
 
   /// \brief Lookup a previously defined macro.
   /// \param Name Macro name.
   /// \returns Pointer to macro. NULL if no such macro was defined.
-  const MCAsmMacro* LookupMacro(StringRef Name);
+  const MCAsmMacro* lookupMacro(StringRef Name);
 
   /// \brief Define a new macro with the given name and information.
-  void DefineMacro(StringRef Name, const MCAsmMacro& Macro);
+  void defineMacro(StringRef Name, const MCAsmMacro& Macro);
 
   /// \brief Undefine a macro. If no such macro was defined, it's a no-op.
-  void UndefineMacro(StringRef Name);
+  void undefineMacro(StringRef Name);
 
   /// \brief Are we inside a macro instantiation?
-  bool InsideMacroInstantiation() {return !ActiveMacros.empty();}
+  bool isInsideMacroInstantiation() {return !ActiveMacros.empty();}
 
-  /// \brief Handle entry to macro instantiation. 
+  /// \brief Handle entry to macro instantiation.
   ///
   /// \param M The macro.
   /// \param NameLoc Instantiation location.
-  bool HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);
+  bool handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);
 
   /// \brief Handle exit from macro instantiation.
-  void HandleMacroExit();
+  void handleMacroExit();
 
   /// \brief Extract AsmTokens for a macro argument. If the argument delimiter
   /// is initially unknown, set it to AsmToken::Eof. It will be set to the
   /// correct delimiter by the method.
-  bool ParseMacroArgument(MCAsmMacroArgument &MA,
+  bool parseMacroArgument(MCAsmMacroArgument &MA,
                           AsmToken::TokenKind &ArgumentDelimiter);
 
   /// \brief Parse all macro arguments for a given macro.
-  bool ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);
+  bool parseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);
 
-  void PrintMacroInstantiations();
-  void PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
+  void printMacroInstantiations();
+  void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
                     ArrayRef<SMRange> Ranges = None) const {
     SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
   }
   static void DiagHandler(const SMDiagnostic &Diag, void *Context);
 
-  /// EnterIncludeFile - Enter the specified file. This returns true on failure.
-  bool EnterIncludeFile(const std::string &Filename);
-  /// ProcessIncbinFile - Process the specified file for the .incbin directive.
+  /// \brief Enter the specified file. This returns true on failure.
+  bool enterIncludeFile(const std::string &Filename);
+
+  /// \brief Process the specified file for the .incbin directive.
   /// This returns true on failure.
-  bool ProcessIncbinFile(const std::string &Filename);
+  bool processIncbinFile(const std::string &Filename);
 
   /// \brief Reset the current lexer position to that given by \p Loc. The
   /// current token is not set; clients should ensure Lex() is called
@@ -313,7 +314,7 @@ private:
   ///
   /// \param InBuffer If not -1, should be the known buffer id that contains the
   /// location.
-  void JumpToLoc(SMLoc Loc, int InBuffer=-1);
+  void jumpToLoc(SMLoc Loc, int InBuffer=-1);
 
   /// \brief Parse up to the end of statement and a return the contents from the
   /// current token until the end of the statement; the current token on exit
@@ -322,17 +323,16 @@ private:
 
   /// \brief Parse until the end of a statement or a comma is encountered,
   /// return the contents from the current token up to the end or comma.
-  StringRef ParseStringToComma();
+  StringRef parseStringToComma();
 
-  bool ParseAssignment(StringRef Name, bool allow_redef,
+  bool parseAssignment(StringRef Name, bool allow_redef,
                        bool NoDeadStrip = false);
 
-  bool ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc);
-  bool ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
-  bool ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
-  bool ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
+  bool parseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
+  bool parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
+  bool parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
 
-  bool ParseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);
+  bool parseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);
 
   // Generic (target and platform independent) directive parsing.
   enum DirectiveKind {
@@ -342,7 +342,7 @@ private:
     DK_FLOAT, DK_DOUBLE, DK_ALIGN, DK_ALIGN32, DK_BALIGN, DK_BALIGNW,
     DK_BALIGNL, DK_P2ALIGN, DK_P2ALIGNW, DK_P2ALIGNL, DK_ORG, DK_FILL, DK_ENDR,
     DK_BUNDLE_ALIGN_MODE, DK_BUNDLE_LOCK, DK_BUNDLE_UNLOCK,
-    DK_ZERO, DK_EXTERN, DK_GLOBL, DK_GLOBAL, DK_INDIRECT_SYMBOL,
+    DK_ZERO, DK_EXTERN, DK_GLOBL, DK_GLOBAL,
     DK_LAZY_REFERENCE, DK_NO_DEAD_STRIP, DK_SYMBOL_RESOLVER, DK_PRIVATE_EXTERN,
     DK_REFERENCE, DK_WEAK_DEFINITION, DK_WEAK_REFERENCE,
     DK_WEAK_DEF_CAN_BE_HIDDEN, DK_COMM, DK_COMMON, DK_LCOMM, DK_ABORT,
@@ -355,112 +355,113 @@ private:
     DK_CFI_OFFSET, DK_CFI_REL_OFFSET, DK_CFI_PERSONALITY, DK_CFI_LSDA,
     DK_CFI_REMEMBER_STATE, DK_CFI_RESTORE_STATE, DK_CFI_SAME_VALUE,
     DK_CFI_RESTORE, DK_CFI_ESCAPE, DK_CFI_SIGNAL_FRAME, DK_CFI_UNDEFINED,
-    DK_CFI_REGISTER,
+    DK_CFI_REGISTER, DK_CFI_WINDOW_SAVE,
     DK_MACROS_ON, DK_MACROS_OFF, DK_MACRO, DK_ENDM, DK_ENDMACRO, DK_PURGEM,
     DK_SLEB128, DK_ULEB128
   };
 
-  /// DirectiveKindMap - Maps directive name --> DirectiveKind enum, for
+  /// \brief Maps directive name --> DirectiveKind enum, for
   /// directives parsed by this class.
   StringMap<DirectiveKind> DirectiveKindMap;
 
   // ".ascii", ".asciz", ".string"
-  bool ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
-  bool ParseDirectiveValue(unsigned Size); // ".byte", ".long", ...
-  bool ParseDirectiveRealValue(const fltSemantics &); // ".single", ...
-  bool ParseDirectiveFill(); // ".fill"
-  bool ParseDirectiveZero(); // ".zero"
+  bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
+  bool parseDirectiveValue(unsigned Size); // ".byte", ".long", ...
+  bool parseDirectiveRealValue(const fltSemantics &); // ".single", ...
+  bool parseDirectiveFill(); // ".fill"
+  bool parseDirectiveZero(); // ".zero"
   // ".set", ".equ", ".equiv"
-  bool ParseDirectiveSet(StringRef IDVal, bool allow_redef);
-  bool ParseDirectiveOrg(); // ".org"
+  bool parseDirectiveSet(StringRef IDVal, bool allow_redef);
+  bool parseDirectiveOrg(); // ".org"
   // ".align{,32}", ".p2align{,w,l}"
-  bool ParseDirectiveAlign(bool IsPow2, unsigned ValueSize);
+  bool parseDirectiveAlign(bool IsPow2, unsigned ValueSize);
 
   // ".file", ".line", ".loc", ".stabs"
-  bool ParseDirectiveFile(SMLoc DirectiveLoc);
-  bool ParseDirectiveLine();
-  bool ParseDirectiveLoc();
-  bool ParseDirectiveStabs();
+  bool parseDirectiveFile(SMLoc DirectiveLoc);
+  bool parseDirectiveLine();
+  bool parseDirectiveLoc();
+  bool parseDirectiveStabs();
 
   // .cfi directives
-  bool ParseDirectiveCFIRegister(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFISections();
-  bool ParseDirectiveCFIStartProc();
-  bool ParseDirectiveCFIEndProc();
-  bool ParseDirectiveCFIDefCfaOffset();
-  bool ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIAdjustCfaOffset();
-  bool ParseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIOffset(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
-  bool ParseDirectiveCFIRememberState();
-  bool ParseDirectiveCFIRestoreState();
-  bool ParseDirectiveCFISameValue(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIRestore(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIEscape();
-  bool ParseDirectiveCFISignalFrame();
-  bool ParseDirectiveCFIUndefined(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIRegister(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIWindowSave();
+  bool parseDirectiveCFISections();
+  bool parseDirectiveCFIStartProc();
+  bool parseDirectiveCFIEndProc();
+  bool parseDirectiveCFIDefCfaOffset();
+  bool parseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIAdjustCfaOffset();
+  bool parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIOffset(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
+  bool parseDirectiveCFIRememberState();
+  bool parseDirectiveCFIRestoreState();
+  bool parseDirectiveCFISameValue(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIRestore(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIEscape();
+  bool parseDirectiveCFISignalFrame();
+  bool parseDirectiveCFIUndefined(SMLoc DirectiveLoc);
 
   // macro directives
-  bool ParseDirectivePurgeMacro(SMLoc DirectiveLoc);
-  bool ParseDirectiveEndMacro(StringRef Directive);
-  bool ParseDirectiveMacro(SMLoc DirectiveLoc);
-  bool ParseDirectiveMacrosOnOff(StringRef Directive);
+  bool parseDirectivePurgeMacro(SMLoc DirectiveLoc);
+  bool parseDirectiveEndMacro(StringRef Directive);
+  bool parseDirectiveMacro(SMLoc DirectiveLoc);
+  bool parseDirectiveMacrosOnOff(StringRef Directive);
 
   // ".bundle_align_mode"
-  bool ParseDirectiveBundleAlignMode();
+  bool parseDirectiveBundleAlignMode();
   // ".bundle_lock"
-  bool ParseDirectiveBundleLock();
+  bool parseDirectiveBundleLock();
   // ".bundle_unlock"
-  bool ParseDirectiveBundleUnlock();
+  bool parseDirectiveBundleUnlock();
 
   // ".space", ".skip"
-  bool ParseDirectiveSpace(StringRef IDVal);
+  bool parseDirectiveSpace(StringRef IDVal);
 
   // .sleb128 (Signed=true) and .uleb128 (Signed=false)
-  bool ParseDirectiveLEB128(bool Signed);
+  bool parseDirectiveLEB128(bool Signed);
 
-  /// ParseDirectiveSymbolAttribute - Parse a directive like ".globl" which
+  /// \brief Parse a directive like ".globl" which
   /// accepts a single symbol (which should be a label or an external).
-  bool ParseDirectiveSymbolAttribute(MCSymbolAttr Attr);
+  bool parseDirectiveSymbolAttribute(MCSymbolAttr Attr);
 
-  bool ParseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
+  bool parseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
 
-  bool ParseDirectiveAbort(); // ".abort"
-  bool ParseDirectiveInclude(); // ".include"
-  bool ParseDirectiveIncbin(); // ".incbin"
+  bool parseDirectiveAbort(); // ".abort"
+  bool parseDirectiveInclude(); // ".include"
+  bool parseDirectiveIncbin(); // ".incbin"
 
-  bool ParseDirectiveIf(SMLoc DirectiveLoc); // ".if"
+  bool parseDirectiveIf(SMLoc DirectiveLoc); // ".if"
   // ".ifb" or ".ifnb", depending on ExpectBlank.
-  bool ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
+  bool parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
   // ".ifc" or ".ifnc", depending on ExpectEqual.
-  bool ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
+  bool parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
   // ".ifdef" or ".ifndef", depending on expect_defined
-  bool ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
-  bool ParseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
-  bool ParseDirectiveElse(SMLoc DirectiveLoc); // ".else"
-  bool ParseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
+  bool parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
+  bool parseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
+  bool parseDirectiveElse(SMLoc DirectiveLoc); // ".else"
+  bool parseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
   virtual bool parseEscapedString(std::string &Data);
 
-  const MCExpr *ApplyModifierToExpr(const MCExpr *E,
+  const MCExpr *applyModifierToExpr(const MCExpr *E,
                                     MCSymbolRefExpr::VariantKind Variant);
 
   // Macro-like directives
-  MCAsmMacro *ParseMacroLikeBody(SMLoc DirectiveLoc);
-  void InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
+  MCAsmMacro *parseMacroLikeBody(SMLoc DirectiveLoc);
+  void instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
                                 raw_svector_ostream &OS);
-  bool ParseDirectiveRept(SMLoc DirectiveLoc); // ".rept"
-  bool ParseDirectiveIrp(SMLoc DirectiveLoc);  // ".irp"
-  bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
-  bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
+  bool parseDirectiveRept(SMLoc DirectiveLoc); // ".rept"
+  bool parseDirectiveIrp(SMLoc DirectiveLoc);  // ".irp"
+  bool parseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
+  bool parseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
 
   // "_emit" or "__emit"
-  bool ParseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
+  bool parseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
                             size_t Len);
 
   // "align"
-  bool ParseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);
+  bool parseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);
 
   void initializeDirectiveKindMap();
 };
@@ -476,12 +477,12 @@ extern MCAsmParserExtension *createCOFFAsmParser();
 
 enum { DEFAULT_ADDRSPACE = 0 };
 
-AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx,
-                     MCStreamer &_Out, const MCAsmInfo &_MAI)
-  : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
-    PlatformParser(0),
-    CurBuffer(0), MacrosEnabledFlag(true), CppHashLineNumber(0),
-    AssemblerDialect(~0U), IsDarwin(false), ParsingInlineAsm(false) {
+AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out,
+                     const MCAsmInfo &_MAI)
+    : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
+      PlatformParser(0), CurBuffer(0), MacrosEnabledFlag(true),
+      CppHashLineNumber(0), AssemblerDialect(~0U), IsDarwin(false),
+      ParsingInlineAsm(false) {
   // Save the old handler.
   SavedDiagHandler = SrcMgr.getDiagHandler();
   SavedDiagContext = SrcMgr.getDiagContext();
@@ -512,37 +513,40 @@ AsmParser::~AsmParser() {
   assert(ActiveMacros.empty() && "Unexpected active macro instantiation!");
 
   // Destroy any macros.
-  for (StringMap<MCAsmMacro*>::iterator it = MacroMap.begin(),
-         ie = MacroMap.end(); it != ie; ++it)
+  for (StringMap<MCAsmMacro *>::iterator it = MacroMap.begin(),
+                                         ie = MacroMap.end();
+       it != ie; ++it)
     delete it->getValue();
 
   delete PlatformParser;
 }
 
-void AsmParser::PrintMacroInstantiations() {
+void AsmParser::printMacroInstantiations() {
   // Print the active macro instantiation stack.
-  for (std::vector<MacroInstantiation*>::const_reverse_iterator
-         it = ActiveMacros.rbegin(), ie = ActiveMacros.rend(); it != ie; ++it)
-    PrintMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
+  for (std::vector<MacroInstantiation *>::const_reverse_iterator
+           it = ActiveMacros.rbegin(),
+           ie = ActiveMacros.rend();
+       it != ie; ++it)
+    printMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
                  "while in macro instantiation");
 }
 
 bool AsmParser::Warning(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) {
   if (FatalAssemblerWarnings)
     return Error(L, Msg, Ranges);
-  PrintMessage(L, SourceMgr::DK_Warning, Msg, Ranges);
-  PrintMacroInstantiations();
+  printMessage(L, SourceMgr::DK_Warning, Msg, Ranges);
+  printMacroInstantiations();
   return false;
 }
 
 bool AsmParser::Error(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) {
   HadError = true;
-  PrintMessage(L, SourceMgr::DK_Error, Msg, Ranges);
-  PrintMacroInstantiations();
+  printMessage(L, SourceMgr::DK_Error, Msg, Ranges);
+  printMacroInstantiations();
   return true;
 }
 
-bool AsmParser::EnterIncludeFile(const std::string &Filename) {
+bool AsmParser::enterIncludeFile(const std::string &Filename) {
   std::string IncludedFile;
   int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
   if (NewBuf == -1)
@@ -558,7 +562,7 @@ bool AsmParser::EnterIncludeFile(const std::string &Filename) {
 /// Process the specified .incbin file by searching for it in the include paths
 /// then just emitting the byte contents of the file to the streamer. This
 /// returns true on failure.
-bool AsmParser::ProcessIncbinFile(const std::string &Filename) {
+bool AsmParser::processIncbinFile(const std::string &Filename) {
   std::string IncludedFile;
   int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
   if (NewBuf == -1)
@@ -569,7 +573,7 @@ bool AsmParser::ProcessIncbinFile(const std::string &Filename) {
   return false;
 }
 
-void AsmParser::JumpToLoc(SMLoc Loc, int InBuffer) {
+void AsmParser::jumpToLoc(SMLoc Loc, int InBuffer) {
   if (InBuffer != -1) {
     CurBuffer = InBuffer;
   } else {
@@ -586,7 +590,7 @@ const AsmToken &AsmParser::Lex() {
     // include stack.
     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
     if (ParentIncludeLoc != SMLoc()) {
-      JumpToLoc(ParentIncludeLoc);
+      jumpToLoc(ParentIncludeLoc);
       tok = &Lexer.Lex();
     }
   }
@@ -623,7 +627,8 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
   // While we have input, parse each statement.
   while (Lexer.isNot(AsmToken::Eof)) {
     ParseStatementInfo Info;
-    if (!ParseStatement(Info)) continue;
+    if (!parseStatement(Info))
+      continue;
 
     // We had an error, validate that one was emitted and recover by skipping to
     // the next line.
@@ -637,7 +642,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
 
   // Check to see there are no empty DwarfFile slots.
   const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles =
-    getContext().getMCDwarfFiles();
+      getContext().getMCDwarfFiles();
   for (unsigned i = 1; i < MCDwarfFiles.size(); i++) {
     if (!MCDwarfFiles[i])
       TokError("unassigned file number: " + Twine(i) + " for .file directives");
@@ -650,7 +655,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
   if (!NoFinalize && MAI.hasSubsectionsViaSymbols()) {
     const MCContext::SymbolTable &Symbols = getContext().getSymbols();
     for (MCContext::SymbolTable::const_iterator i = Symbols.begin(),
-         e = Symbols.end();
+                                                e = Symbols.end();
          i != e; ++i) {
       MCSymbol *Sym = i->getValue();
       // Variable symbols may not be marked as defined, so check those
@@ -660,13 +665,12 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
         // FIXME: We would really like to refer back to where the symbol was
         // first referenced for a source location. We need to add something
         // to track that. Currently, we just point to the end of the file.
-        PrintMessage(getLexer().getLoc(), SourceMgr::DK_Error,
-                     "assembler local symbol '" + Sym->getName() +
-                     "' not defined");
+        printMessage(
+            getLexer().getLoc(), SourceMgr::DK_Error,
+            "assembler local symbol '" + Sym->getName() + "' not defined");
     }
   }
 
-
   // Finalize the output stream if there are no errors and if the client wants
   // us to.
   if (!HadError && !NoFinalize)
@@ -682,10 +686,9 @@ void AsmParser::checkForValidSection() {
   }
 }
 
-/// eatToEndOfStatement - Throw away the rest of the line for testing purposes.
+/// \brief Throw away the rest of the line for testing purposes.
 void AsmParser::eatToEndOfStatement() {
-  while (Lexer.isNot(AsmToken::EndOfStatement) &&
-         Lexer.isNot(AsmToken::Eof))
+  while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
     Lex();
 
   // Eat EOL.
@@ -696,33 +699,32 @@ void AsmParser::eatToEndOfStatement() {
 StringRef AsmParser::parseStringToEndOfStatement() {
   const char *Start = getTok().getLoc().getPointer();
 
-  while (Lexer.isNot(AsmToken::EndOfStatement) &&
-         Lexer.isNot(AsmToken::Eof))
+  while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
     Lex();
 
   const char *End = getTok().getLoc().getPointer();
   return StringRef(Start, End - Start);
 }
 
-StringRef AsmParser::ParseStringToComma() {
+StringRef AsmParser::parseStringToComma() {
   const char *Start = getTok().getLoc().getPointer();
 
   while (Lexer.isNot(AsmToken::EndOfStatement) &&
-         Lexer.isNot(AsmToken::Comma) &&
-         Lexer.isNot(AsmToken::Eof))
+         Lexer.isNot(AsmToken::Comma) && Lexer.isNot(AsmToken::Eof))
     Lex();
 
   const char *End = getTok().getLoc().getPointer();
   return StringRef(Start, End - Start);
 }
 
-/// ParseParenExpr - Parse a paren expression and return it.
+/// \brief Parse a paren expression and return it.
 /// NOTE: This assumes the leading '(' has already been consumed.
 ///
 /// parenexpr ::= expr)
 ///
-bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
-  if (parseExpression(Res)) return true;
+bool AsmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+  if (parseExpression(Res))
+    return true;
   if (Lexer.isNot(AsmToken::RParen))
     return TokError("expected ')' in parentheses expression");
   EndLoc = Lexer.getTok().getEndLoc();
@@ -730,13 +732,14 @@ bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
   return false;
 }
 
-/// ParseBracketExpr - Parse a bracket expression and return it.
+/// \brief Parse a bracket expression and return it.
 /// NOTE: This assumes the leading '[' has already been consumed.
 ///
 /// bracketexpr ::= expr]
 ///
-bool AsmParser::ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
-  if (parseExpression(Res)) return true;
+bool AsmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+  if (parseExpression(Res))
+    return true;
   if (Lexer.isNot(AsmToken::RBrac))
     return TokError("expected ']' in brackets expression");
   EndLoc = Lexer.getTok().getEndLoc();
@@ -744,13 +747,13 @@ bool AsmParser::ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
   return false;
 }
 
-/// ParsePrimaryExpr - Parse a primary expression and return it.
+/// \brief Parse a primary expression and return it.
 ///  primaryexpr ::= (parenexpr
 ///  primaryexpr ::= symbol
 ///  primaryexpr ::= number
 ///  primaryexpr ::= '.'
 ///  primaryexpr ::= ~,+,- primaryexpr
-bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
   SMLoc FirstTokenLoc = getLexer().getLoc();
   AsmToken::TokenKind FirstTokenKind = Lexer.getKind();
   switch (FirstTokenKind) {
@@ -761,36 +764,54 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
     return true;
   case AsmToken::Exclaim:
     Lex(); // Eat the operator.
-    if (ParsePrimaryExpr(Res, EndLoc))
+    if (parsePrimaryExpr(Res, EndLoc))
       return true;
     Res = MCUnaryExpr::CreateLNot(Res, getContext());
     return false;
   case AsmToken::Dollar:
+  case AsmToken::At:
   case AsmToken::String:
   case AsmToken::Identifier: {
     StringRef Identifier;
     if (parseIdentifier(Identifier)) {
-      if (FirstTokenKind == AsmToken::Dollar)
-        return Error(FirstTokenLoc, "invalid token in expression");
-      return true;
+      if (FirstTokenKind == AsmToken::Dollar) {
+        if (Lexer.getMAI().getDollarIsPC()) {
+          // This is a '$' reference, which references the current PC.  Emit a
+          // temporary label to the streamer and refer to it.
+          MCSymbol *Sym = Ctx.CreateTempSymbol();
+          Out.EmitLabel(Sym);
+          Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+                                        getContext());
+          EndLoc = FirstTokenLoc;
+          return false;
+        } else
+          return Error(FirstTokenLoc, "invalid token in expression");
+        return true;
+      }
     }
 
     EndLoc = SMLoc::getFromPointer(Identifier.end());
 
     // This is a symbol reference.
+    StringRef SymbolName = Identifier;
+    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
     std::pair<StringRef, StringRef> Split = Identifier.split('@');
-    MCSymbol *Sym = getContext().GetOrCreateSymbol(Split.first);
 
     // Lookup the symbol variant if used.
-    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
     if (Split.first.size() != Identifier.size()) {
       Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
-      if (Variant == MCSymbolRefExpr::VK_Invalid) {
+      if (Variant != MCSymbolRefExpr::VK_Invalid) {
+        SymbolName = Split.first;
+      } else if (MAI.doesAllowAtInName()) {
+        Variant = MCSymbolRefExpr::VK_None;
+      } else {
         Variant = MCSymbolRefExpr::VK_None;
         return TokError("invalid variant '" + Split.second + "'");
       }
     }
 
+    MCSymbol *Sym = getContext().GetOrCreateSymbol(SymbolName);
+
     // If this is an absolute variable reference, substitute it now to preserve
     // semantics in the face of reassignment.
     if (Sym->isVariable() && isa<MCConstantExpr>(Sym->getVariableValue())) {
@@ -823,11 +844,11 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
           Variant = MCSymbolRefExpr::VK_None;
           return TokError("invalid variant '" + Split.second + "'");
         }
-	IDVal = Split.first;
+        IDVal = Split.first;
       }
-      if (IDVal == "f" || IDVal == "b"){
-        MCSymbol *Sym = Ctx.GetDirectionalLocalSymbol(IntVal,
-                                                      IDVal == "f" ? 1 : 0);
+      if (IDVal == "f" || IDVal == "b") {
+        MCSymbol *Sym =
+            Ctx.GetDirectionalLocalSymbol(IntVal, IDVal == "f" ? 1 : 0);
         Res = MCSymbolRefExpr::Create(Sym, Variant, getContext());
         if (IDVal == "b" && Sym->isUndefined())
           return Error(Loc, "invalid reference to undefined symbol");
@@ -857,27 +878,27 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
   }
   case AsmToken::LParen:
     Lex(); // Eat the '('.
-    return ParseParenExpr(Res, EndLoc);
+    return parseParenExpr(Res, EndLoc);
   case AsmToken::LBrac:
     if (!PlatformParser->HasBracketExpressions())
       return TokError("brackets expression not supported on this target");
     Lex(); // Eat the '['.
-    return ParseBracketExpr(Res, EndLoc);
+    return parseBracketExpr(Res, EndLoc);
   case AsmToken::Minus:
     Lex(); // Eat the operator.
-    if (ParsePrimaryExpr(Res, EndLoc))
+    if (parsePrimaryExpr(Res, EndLoc))
       return true;
     Res = MCUnaryExpr::CreateMinus(Res, getContext());
     return false;
   case AsmToken::Plus:
     Lex(); // Eat the operator.
-    if (ParsePrimaryExpr(Res, EndLoc))
+    if (parsePrimaryExpr(Res, EndLoc))
       return true;
     Res = MCUnaryExpr::CreatePlus(Res, getContext());
     return false;
   case AsmToken::Tilde:
     Lex(); // Eat the operator.
-    if (ParsePrimaryExpr(Res, EndLoc))
+    if (parsePrimaryExpr(Res, EndLoc))
       return true;
     Res = MCUnaryExpr::CreateNot(Res, getContext());
     return false;
@@ -889,13 +910,13 @@ bool AsmParser::parseExpression(const MCExpr *&Res) {
   return parseExpression(Res, EndLoc);
 }
 
-bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
-  return ParsePrimaryExpr(Res, EndLoc);
-}
-
 const MCExpr *
-AsmParser::ApplyModifierToExpr(const MCExpr *E,
+AsmParser::applyModifierToExpr(const MCExpr *E,
                                MCSymbolRefExpr::VariantKind Variant) {
+  // Ask the target implementation about this expression first.
+  const MCExpr *NewE = getTargetParser().applyModifierToExpr(E, Variant, Ctx);
+  if (NewE)
+    return NewE;
   // Recurse over the given expression, rebuilding it to apply the given variant
   // if there is exactly one symbol.
   switch (E->getKind()) {
@@ -907,8 +928,8 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
     const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
 
     if (SRE->getKind() != MCSymbolRefExpr::VK_None) {
-      TokError("invalid variant on expression '" +
-               getTok().getIdentifier() + "' (already modified)");
+      TokError("invalid variant on expression '" + getTok().getIdentifier() +
+               "' (already modified)");
       return E;
     }
 
@@ -917,7 +938,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
 
   case MCExpr::Unary: {
     const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
-    const MCExpr *Sub = ApplyModifierToExpr(UE->getSubExpr(), Variant);
+    const MCExpr *Sub = applyModifierToExpr(UE->getSubExpr(), Variant);
     if (!Sub)
       return 0;
     return MCUnaryExpr::Create(UE->getOpcode(), Sub, getContext());
@@ -925,14 +946,16 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
 
   case MCExpr::Binary: {
     const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
-    const MCExpr *LHS = ApplyModifierToExpr(BE->getLHS(), Variant);
-    const MCExpr *RHS = ApplyModifierToExpr(BE->getRHS(), Variant);
+    const MCExpr *LHS = applyModifierToExpr(BE->getLHS(), Variant);
+    const MCExpr *RHS = applyModifierToExpr(BE->getRHS(), Variant);
 
     if (!LHS && !RHS)
       return 0;
 
-    if (!LHS) LHS = BE->getLHS();
-    if (!RHS) RHS = BE->getRHS();
+    if (!LHS)
+      LHS = BE->getLHS();
+    if (!RHS)
+      RHS = BE->getRHS();
 
     return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, getContext());
   }
@@ -941,7 +964,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
   llvm_unreachable("Invalid expression kind!");
 }
 
-/// parseExpression - Parse an expression and return it.
+/// \brief Parse an expression and return it.
 ///
 ///  expr ::= expr &&,|| expr               -> lowest.
 ///  expr ::= expr |,^,&,! expr
@@ -954,7 +977,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
 bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
   // Parse the expression.
   Res = 0;
-  if (ParsePrimaryExpr(Res, EndLoc) || ParseBinOpRHS(1, Res, EndLoc))
+  if (parsePrimaryExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc))
     return true;
 
   // As a special case, we support 'a op b @ modifier' by rewriting the
@@ -967,11 +990,11 @@ bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
       return TokError("unexpected symbol modifier following '@'");
 
     MCSymbolRefExpr::VariantKind Variant =
-      MCSymbolRefExpr::getVariantKindForName(getTok().getIdentifier());
+        MCSymbolRefExpr::getVariantKindForName(getTok().getIdentifier());
     if (Variant == MCSymbolRefExpr::VK_Invalid)
       return TokError("invalid variant '" + getTok().getIdentifier() + "'");
 
-    const MCExpr *ModifiedRes = ApplyModifierToExpr(Res, Variant);
+    const MCExpr *ModifiedRes = applyModifierToExpr(Res, Variant);
     if (!ModifiedRes) {
       return TokError("invalid modifier '" + getTok().getIdentifier() +
                       "' (no symbols present)");
@@ -991,8 +1014,7 @@ bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
 
 bool AsmParser::parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) {
   Res = 0;
-  return ParseParenExpr(Res, EndLoc) ||
-         ParseBinOpRHS(1, Res, EndLoc);
+  return parseParenExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc);
 }
 
 bool AsmParser::parseAbsoluteExpression(int64_t &Res) {
@@ -1012,9 +1034,9 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
                                    MCBinaryExpr::Opcode &Kind) {
   switch (K) {
   default:
-    return 0;    // not a binop.
+    return 0; // not a binop.
 
-    // Lowest Precedence: &&, ||
+  // Lowest Precedence: &&, ||
   case AsmToken::AmpAmp:
     Kind = MCBinaryExpr::LAnd;
     return 1;
@@ -1022,10 +1044,9 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
     Kind = MCBinaryExpr::LOr;
     return 1;
 
-
-    // Low Precedence: |, &, ^
-    //
-    // FIXME: gas seems to support '!' as an infix operator?
+  // Low Precedence: |, &, ^
+  //
+  // FIXME: gas seems to support '!' as an infix operator?
   case AsmToken::Pipe:
     Kind = MCBinaryExpr::Or;
     return 2;
@@ -1036,7 +1057,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
     Kind = MCBinaryExpr::And;
     return 2;
 
-    // Low Intermediate Precedence: ==, !=, <>, <, <=, >, >=
+  // Low Intermediate Precedence: ==, !=, <>, <, <=, >, >=
   case AsmToken::EqualEqual:
     Kind = MCBinaryExpr::EQ;
     return 3;
@@ -1057,7 +1078,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
     Kind = MCBinaryExpr::GTE;
     return 3;
 
-    // Intermediate Precedence: <<, >>
+  // Intermediate Precedence: <<, >>
   case AsmToken::LessLess:
     Kind = MCBinaryExpr::Shl;
     return 4;
@@ -1065,7 +1086,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
     Kind = MCBinaryExpr::Shr;
     return 4;
 
-    // High Intermediate Precedence: +, -
+  // High Intermediate Precedence: +, -
   case AsmToken::Plus:
     Kind = MCBinaryExpr::Add;
     return 5;
@@ -1073,7 +1094,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
     Kind = MCBinaryExpr::Sub;
     return 5;
 
-    // Highest Precedence: *, /, %
+  // Highest Precedence: *, /, %
   case AsmToken::Star:
     Kind = MCBinaryExpr::Mul;
     return 6;
@@ -1086,10 +1107,9 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
   }
 }
 
-
-/// ParseBinOpRHS - Parse all binary operators with precedence >= 'Precedence'.
+/// \brief Parse all binary operators with precedence >= 'Precedence'.
 /// Res contains the LHS of the expression on input.
-bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
+bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
                               SMLoc &EndLoc) {
   while (1) {
     MCBinaryExpr::Opcode Kind = MCBinaryExpr::Add;
@@ -1104,15 +1124,15 @@ bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
 
     // Eat the next primary expression.
     const MCExpr *RHS;
-    if (ParsePrimaryExpr(RHS, EndLoc)) return true;
+    if (parsePrimaryExpr(RHS, EndLoc))
+      return true;
 
     // If BinOp binds less tightly with RHS than the operator after RHS, let
     // the pending operator take RHS as its LHS.
     MCBinaryExpr::Opcode Dummy;
     unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy);
-    if (TokPrec < NextTokPrec) {
-      if (ParseBinOpRHS(TokPrec+1, RHS, EndLoc)) return true;
-    }
+    if (TokPrec < NextTokPrec && parseBinOpRHS(TokPrec + 1, RHS, EndLoc))
+      return true;
 
     // Merge LHS and RHS according to operator.
     Res = MCBinaryExpr::Create(Kind, Res, RHS, getContext());
@@ -1123,7 +1143,7 @@ bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
 ///   ::= EndOfStatement
 ///   ::= Label* Directive ...Operands... EndOfStatement
 ///   ::= Label* Identifier OperandList* EndOfStatement
-bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
+bool AsmParser::parseStatement(ParseStatementInfo &Info) {
   if (Lexer.is(AsmToken::EndOfStatement)) {
     Out.AddBlankLine();
     Lex();
@@ -1137,7 +1157,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
   int64_t LocalLabelVal = -1;
   // A full line comment is a '#' as the first token.
   if (Lexer.is(AsmToken::Hash))
-    return ParseCppHashLineFilenameComment(IDLoc);
+    return parseCppHashLineFilenameComment(IDLoc);
 
   // Allow an integer followed by a ':' as a directional local label.
   if (Lexer.is(AsmToken::Integer)) {
@@ -1168,34 +1188,34 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
   // have to do this so that .endif isn't skipped in a ".if 0" block for
   // example.
   StringMap<DirectiveKind>::const_iterator DirKindIt =
-    DirectiveKindMap.find(IDVal);
-  DirectiveKind DirKind =
-    (DirKindIt == DirectiveKindMap.end()) ? DK_NO_DIRECTIVE :
-                                            DirKindIt->getValue();
+      DirectiveKindMap.find(IDVal);
+  DirectiveKind DirKind = (DirKindIt == DirectiveKindMap.end())
+                              ? DK_NO_DIRECTIVE
+                              : DirKindIt->getValue();
   switch (DirKind) {
-    default:
-      break;
-    case DK_IF:
-      return ParseDirectiveIf(IDLoc);
-    case DK_IFB:
-      return ParseDirectiveIfb(IDLoc, true);
-    case DK_IFNB:
-      return ParseDirectiveIfb(IDLoc, false);
-    case DK_IFC:
-      return ParseDirectiveIfc(IDLoc, true);
-    case DK_IFNC:
-      return ParseDirectiveIfc(IDLoc, false);
-    case DK_IFDEF:
-      return ParseDirectiveIfdef(IDLoc, true);
-    case DK_IFNDEF:
-    case DK_IFNOTDEF:
-      return ParseDirectiveIfdef(IDLoc, false);
-    case DK_ELSEIF:
-      return ParseDirectiveElseIf(IDLoc);
-    case DK_ELSE:
-      return ParseDirectiveElse(IDLoc);
-    case DK_ENDIF:
-      return ParseDirectiveEndIf(IDLoc);
+  default:
+    break;
+  case DK_IF:
+    return parseDirectiveIf(IDLoc);
+  case DK_IFB:
+    return parseDirectiveIfb(IDLoc, true);
+  case DK_IFNB:
+    return parseDirectiveIfb(IDLoc, false);
+  case DK_IFC:
+    return parseDirectiveIfc(IDLoc, true);
+  case DK_IFNC:
+    return parseDirectiveIfc(IDLoc, false);
+  case DK_IFDEF:
+    return parseDirectiveIfdef(IDLoc, true);
+  case DK_IFNDEF:
+  case DK_IFNOTDEF:
+    return parseDirectiveIfdef(IDLoc, false);
+  case DK_ELSEIF:
+    return parseDirectiveElseIf(IDLoc);
+  case DK_ELSE:
+    return parseDirectiveElse(IDLoc);
+  case DK_ENDIF:
+    return parseDirectiveEndIf(IDLoc);
   }
 
   // Ignore the statement if in the middle of inactive conditional
@@ -1242,6 +1262,8 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
       MCGenDwarfLabelEntry::Make(Sym, &getStreamer(), getSourceManager(),
                                  IDLoc);
 
+    getTargetParser().onLabelParsed(Sym);
+
     // Consume any end of statement token, if present, to avoid spurious
     // AddBlankLine calls().
     if (Lexer.is(AsmToken::EndOfStatement)) {
@@ -1257,24 +1279,24 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
     // identifier '=' ... -> assignment statement
     Lex();
 
-    return ParseAssignment(IDVal, true);
+    return parseAssignment(IDVal, true);
 
   default: // Normal instruction or directive.
     break;
   }
 
   // If macros are enabled, check to see if this is a macro instantiation.
-  if (MacrosEnabled())
-    if (const MCAsmMacro *M = LookupMacro(IDVal)) {
-      return HandleMacroEntry(M, IDLoc);
+  if (areMacrosEnabled())
+    if (const MCAsmMacro *M = lookupMacro(IDVal)) {
+      return handleMacroEntry(M, IDLoc);
     }
 
   // Otherwise, we have a normal instruction or directive.
-  
+
   // Directives start with "."
   if (IDVal[0] == '.' && IDVal != ".") {
     // There are several entities interested in parsing directives:
-    // 
+    //
     // 1. The target-specific assembly parser. Some directives are target
     //    specific or may potentially behave differently on certain targets.
     // 2. Asm parser extensions. For example, platform-specific parsers
@@ -1291,185 +1313,185 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
 
     // Next, check the extention directive map to see if any extension has
     // registered itself to parse this directive.
-    std::pair<MCAsmParserExtension*, DirectiveHandler> Handler =
-      ExtensionDirectiveMap.lookup(IDVal);
+    std::pair<MCAsmParserExtension *, DirectiveHandler> Handler =
+        ExtensionDirectiveMap.lookup(IDVal);
     if (Handler.first)
       return (*Handler.second)(Handler.first, IDVal, IDLoc);
 
     // Finally, if no one else is interested in this directive, it must be
     // generic and familiar to this class.
     switch (DirKind) {
-      default:
-        break;
-      case DK_SET:
-      case DK_EQU:
-        return ParseDirectiveSet(IDVal, true);
-      case DK_EQUIV:
-        return ParseDirectiveSet(IDVal, false);
-      case DK_ASCII:
-        return ParseDirectiveAscii(IDVal, false);
-      case DK_ASCIZ:
-      case DK_STRING:
-        return ParseDirectiveAscii(IDVal, true);
-      case DK_BYTE:
-        return ParseDirectiveValue(1);
-      case DK_SHORT:
-      case DK_VALUE:
-      case DK_2BYTE:
-        return ParseDirectiveValue(2);
-      case DK_LONG:
-      case DK_INT:
-      case DK_4BYTE:
-        return ParseDirectiveValue(4);
-      case DK_QUAD:
-      case DK_8BYTE:
-        return ParseDirectiveValue(8);
-      case DK_SINGLE:
-      case DK_FLOAT:
-        return ParseDirectiveRealValue(APFloat::IEEEsingle);
-      case DK_DOUBLE:
-        return ParseDirectiveRealValue(APFloat::IEEEdouble);
-      case DK_ALIGN: {
-        bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
-        return ParseDirectiveAlign(IsPow2, /*ExprSize=*/1);
-      }
-      case DK_ALIGN32: {
-        bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
-        return ParseDirectiveAlign(IsPow2, /*ExprSize=*/4);
-      }
-      case DK_BALIGN:
-        return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1);
-      case DK_BALIGNW:
-        return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2);
-      case DK_BALIGNL:
-        return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4);
-      case DK_P2ALIGN:
-        return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1);
-      case DK_P2ALIGNW:
-        return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2);
-      case DK_P2ALIGNL:
-        return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4);
-      case DK_ORG:
-        return ParseDirectiveOrg();
-      case DK_FILL:
-        return ParseDirectiveFill();
-      case DK_ZERO:
-        return ParseDirectiveZero();
-      case DK_EXTERN:
-        eatToEndOfStatement(); // .extern is the default, ignore it.
-        return false;
-      case DK_GLOBL:
-      case DK_GLOBAL:
-        return ParseDirectiveSymbolAttribute(MCSA_Global);
-      case DK_INDIRECT_SYMBOL:
-        return ParseDirectiveSymbolAttribute(MCSA_IndirectSymbol);
-      case DK_LAZY_REFERENCE:
-        return ParseDirectiveSymbolAttribute(MCSA_LazyReference);
-      case DK_NO_DEAD_STRIP:
-        return ParseDirectiveSymbolAttribute(MCSA_NoDeadStrip);
-      case DK_SYMBOL_RESOLVER:
-        return ParseDirectiveSymbolAttribute(MCSA_SymbolResolver);
-      case DK_PRIVATE_EXTERN:
-        return ParseDirectiveSymbolAttribute(MCSA_PrivateExtern);
-      case DK_REFERENCE:
-        return ParseDirectiveSymbolAttribute(MCSA_Reference);
-      case DK_WEAK_DEFINITION:
-        return ParseDirectiveSymbolAttribute(MCSA_WeakDefinition);
-      case DK_WEAK_REFERENCE:
-        return ParseDirectiveSymbolAttribute(MCSA_WeakReference);
-      case DK_WEAK_DEF_CAN_BE_HIDDEN:
-        return ParseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate);
-      case DK_COMM:
-      case DK_COMMON:
-        return ParseDirectiveComm(/*IsLocal=*/false);
-      case DK_LCOMM:
-        return ParseDirectiveComm(/*IsLocal=*/true);
-      case DK_ABORT:
-        return ParseDirectiveAbort();
-      case DK_INCLUDE:
-        return ParseDirectiveInclude();
-      case DK_INCBIN:
-        return ParseDirectiveIncbin();
-      case DK_CODE16:
-      case DK_CODE16GCC:
-        return TokError(Twine(IDVal) + " not supported yet");
-      case DK_REPT:
-        return ParseDirectiveRept(IDLoc);
-      case DK_IRP:
-        return ParseDirectiveIrp(IDLoc);
-      case DK_IRPC:
-        return ParseDirectiveIrpc(IDLoc);
-      case DK_ENDR:
-        return ParseDirectiveEndr(IDLoc);
-      case DK_BUNDLE_ALIGN_MODE:
-        return ParseDirectiveBundleAlignMode();
-      case DK_BUNDLE_LOCK:
-        return ParseDirectiveBundleLock();
-      case DK_BUNDLE_UNLOCK:
-        return ParseDirectiveBundleUnlock();
-      case DK_SLEB128:
-        return ParseDirectiveLEB128(true);
-      case DK_ULEB128:
-        return ParseDirectiveLEB128(false);
-      case DK_SPACE:
-      case DK_SKIP:
-        return ParseDirectiveSpace(IDVal);
-      case DK_FILE:
-        return ParseDirectiveFile(IDLoc);
-      case DK_LINE:
-        return ParseDirectiveLine();
-      case DK_LOC:
-        return ParseDirectiveLoc();
-      case DK_STABS:
-        return ParseDirectiveStabs();
-      case DK_CFI_SECTIONS:
-        return ParseDirectiveCFISections();
-      case DK_CFI_STARTPROC:
-        return ParseDirectiveCFIStartProc();
-      case DK_CFI_ENDPROC:
-        return ParseDirectiveCFIEndProc();
-      case DK_CFI_DEF_CFA:
-        return ParseDirectiveCFIDefCfa(IDLoc);
-      case DK_CFI_DEF_CFA_OFFSET:
-        return ParseDirectiveCFIDefCfaOffset();
-      case DK_CFI_ADJUST_CFA_OFFSET:
-        return ParseDirectiveCFIAdjustCfaOffset();
-      case DK_CFI_DEF_CFA_REGISTER:
-        return ParseDirectiveCFIDefCfaRegister(IDLoc);
-      case DK_CFI_OFFSET:
-        return ParseDirectiveCFIOffset(IDLoc);
-      case DK_CFI_REL_OFFSET:
-        return ParseDirectiveCFIRelOffset(IDLoc);
-      case DK_CFI_PERSONALITY:
-        return ParseDirectiveCFIPersonalityOrLsda(true);
-      case DK_CFI_LSDA:
-        return ParseDirectiveCFIPersonalityOrLsda(false);
-      case DK_CFI_REMEMBER_STATE:
-        return ParseDirectiveCFIRememberState();
-      case DK_CFI_RESTORE_STATE:
-        return ParseDirectiveCFIRestoreState();
-      case DK_CFI_SAME_VALUE:
-        return ParseDirectiveCFISameValue(IDLoc);
-      case DK_CFI_RESTORE:
-        return ParseDirectiveCFIRestore(IDLoc);
-      case DK_CFI_ESCAPE:
-        return ParseDirectiveCFIEscape();
-      case DK_CFI_SIGNAL_FRAME:
-        return ParseDirectiveCFISignalFrame();
-      case DK_CFI_UNDEFINED:
-        return ParseDirectiveCFIUndefined(IDLoc);
-      case DK_CFI_REGISTER:
-        return ParseDirectiveCFIRegister(IDLoc);
-      case DK_MACROS_ON:
-      case DK_MACROS_OFF:
-        return ParseDirectiveMacrosOnOff(IDVal);
-      case DK_MACRO:
-        return ParseDirectiveMacro(IDLoc);
-      case DK_ENDM:
-      case DK_ENDMACRO:
-        return ParseDirectiveEndMacro(IDVal);
-      case DK_PURGEM:
-        return ParseDirectivePurgeMacro(IDLoc);
+    default:
+      break;
+    case DK_SET:
+    case DK_EQU:
+      return parseDirectiveSet(IDVal, true);
+    case DK_EQUIV:
+      return parseDirectiveSet(IDVal, false);
+    case DK_ASCII:
+      return parseDirectiveAscii(IDVal, false);
+    case DK_ASCIZ:
+    case DK_STRING:
+      return parseDirectiveAscii(IDVal, true);
+    case DK_BYTE:
+      return parseDirectiveValue(1);
+    case DK_SHORT:
+    case DK_VALUE:
+    case DK_2BYTE:
+      return parseDirectiveValue(2);
+    case DK_LONG:
+    case DK_INT:
+    case DK_4BYTE:
+      return parseDirectiveValue(4);
+    case DK_QUAD:
+    case DK_8BYTE:
+      return parseDirectiveValue(8);
+    case DK_SINGLE:
+    case DK_FLOAT:
+      return parseDirectiveRealValue(APFloat::IEEEsingle);
+    case DK_DOUBLE:
+      return parseDirectiveRealValue(APFloat::IEEEdouble);
+    case DK_ALIGN: {
+      bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
+      return parseDirectiveAlign(IsPow2, /*ExprSize=*/1);
+    }
+    case DK_ALIGN32: {
+      bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
+      return parseDirectiveAlign(IsPow2, /*ExprSize=*/4);
+    }
+    case DK_BALIGN:
+      return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1);
+    case DK_BALIGNW:
+      return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2);
+    case DK_BALIGNL:
+      return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4);
+    case DK_P2ALIGN:
+      return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1);
+    case DK_P2ALIGNW:
+      return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2);
+    case DK_P2ALIGNL:
+      return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4);
+    case DK_ORG:
+      return parseDirectiveOrg();
+    case DK_FILL:
+      return parseDirectiveFill();
+    case DK_ZERO:
+      return parseDirectiveZero();
+    case DK_EXTERN:
+      eatToEndOfStatement(); // .extern is the default, ignore it.
+      return false;
+    case DK_GLOBL:
+    case DK_GLOBAL:
+      return parseDirectiveSymbolAttribute(MCSA_Global);
+    case DK_LAZY_REFERENCE:
+      return parseDirectiveSymbolAttribute(MCSA_LazyReference);
+    case DK_NO_DEAD_STRIP:
+      return parseDirectiveSymbolAttribute(MCSA_NoDeadStrip);
+    case DK_SYMBOL_RESOLVER:
+      return parseDirectiveSymbolAttribute(MCSA_SymbolResolver);
+    case DK_PRIVATE_EXTERN:
+      return parseDirectiveSymbolAttribute(MCSA_PrivateExtern);
+    case DK_REFERENCE:
+      return parseDirectiveSymbolAttribute(MCSA_Reference);
+    case DK_WEAK_DEFINITION:
+      return parseDirectiveSymbolAttribute(MCSA_WeakDefinition);
+    case DK_WEAK_REFERENCE:
+      return parseDirectiveSymbolAttribute(MCSA_WeakReference);
+    case DK_WEAK_DEF_CAN_BE_HIDDEN:
+      return parseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate);
+    case DK_COMM:
+    case DK_COMMON:
+      return parseDirectiveComm(/*IsLocal=*/false);
+    case DK_LCOMM:
+      return parseDirectiveComm(/*IsLocal=*/true);
+    case DK_ABORT:
+      return parseDirectiveAbort();
+    case DK_INCLUDE:
+      return parseDirectiveInclude();
+    case DK_INCBIN:
+      return parseDirectiveIncbin();
+    case DK_CODE16:
+    case DK_CODE16GCC:
+      return TokError(Twine(IDVal) + " not supported yet");
+    case DK_REPT:
+      return parseDirectiveRept(IDLoc);
+    case DK_IRP:
+      return parseDirectiveIrp(IDLoc);
+    case DK_IRPC:
+      return parseDirectiveIrpc(IDLoc);
+    case DK_ENDR:
+      return parseDirectiveEndr(IDLoc);
+    case DK_BUNDLE_ALIGN_MODE:
+      return parseDirectiveBundleAlignMode();
+    case DK_BUNDLE_LOCK:
+      return parseDirectiveBundleLock();
+    case DK_BUNDLE_UNLOCK:
+      return parseDirectiveBundleUnlock();
+    case DK_SLEB128:
+      return parseDirectiveLEB128(true);
+    case DK_ULEB128:
+      return parseDirectiveLEB128(false);
+    case DK_SPACE:
+    case DK_SKIP:
+      return parseDirectiveSpace(IDVal);
+    case DK_FILE:
+      return parseDirectiveFile(IDLoc);
+    case DK_LINE:
+      return parseDirectiveLine();
+    case DK_LOC:
+      return parseDirectiveLoc();
+    case DK_STABS:
+      return parseDirectiveStabs();
+    case DK_CFI_SECTIONS:
+      return parseDirectiveCFISections();
+    case DK_CFI_STARTPROC:
+      return parseDirectiveCFIStartProc();
+    case DK_CFI_ENDPROC:
+      return parseDirectiveCFIEndProc();
+    case DK_CFI_DEF_CFA:
+      return parseDirectiveCFIDefCfa(IDLoc);
+    case DK_CFI_DEF_CFA_OFFSET:
+      return parseDirectiveCFIDefCfaOffset();
+    case DK_CFI_ADJUST_CFA_OFFSET:
+      return parseDirectiveCFIAdjustCfaOffset();
+    case DK_CFI_DEF_CFA_REGISTER:
+      return parseDirectiveCFIDefCfaRegister(IDLoc);
+    case DK_CFI_OFFSET:
+      return parseDirectiveCFIOffset(IDLoc);
+    case DK_CFI_REL_OFFSET:
+      return parseDirectiveCFIRelOffset(IDLoc);
+    case DK_CFI_PERSONALITY:
+      return parseDirectiveCFIPersonalityOrLsda(true);
+    case DK_CFI_LSDA:
+      return parseDirectiveCFIPersonalityOrLsda(false);
+    case DK_CFI_REMEMBER_STATE:
+      return parseDirectiveCFIRememberState();
+    case DK_CFI_RESTORE_STATE:
+      return parseDirectiveCFIRestoreState();
+    case DK_CFI_SAME_VALUE:
+      return parseDirectiveCFISameValue(IDLoc);
+    case DK_CFI_RESTORE:
+      return parseDirectiveCFIRestore(IDLoc);
+    case DK_CFI_ESCAPE:
+      return parseDirectiveCFIEscape();
+    case DK_CFI_SIGNAL_FRAME:
+      return parseDirectiveCFISignalFrame();
+    case DK_CFI_UNDEFINED:
+      return parseDirectiveCFIUndefined(IDLoc);
+    case DK_CFI_REGISTER:
+      return parseDirectiveCFIRegister(IDLoc);
+    case DK_CFI_WINDOW_SAVE:
+      return parseDirectiveCFIWindowSave();
+    case DK_MACROS_ON:
+    case DK_MACROS_OFF:
+      return parseDirectiveMacrosOnOff(IDVal);
+    case DK_MACRO:
+      return parseDirectiveMacro(IDLoc);
+    case DK_ENDM:
+    case DK_ENDMACRO:
+      return parseDirectiveEndMacro(IDVal);
+    case DK_PURGEM:
+      return parseDirectivePurgeMacro(IDLoc);
     }
 
     return Error(IDLoc, "unknown directive");
@@ -1478,19 +1500,19 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
   // __asm _emit or __asm __emit
   if (ParsingInlineAsm && (IDVal == "_emit" || IDVal == "__emit" ||
                            IDVal == "_EMIT" || IDVal == "__EMIT"))
-    return ParseDirectiveMSEmit(IDLoc, Info, IDVal.size());
+    return parseDirectiveMSEmit(IDLoc, Info, IDVal.size());
 
   // __asm align
   if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
-    return ParseDirectiveMSAlign(IDLoc, Info);
+    return parseDirectiveMSAlign(IDLoc, Info);
 
   checkForValidSection();
 
   // Canonicalize the opcode to lower case.
   std::string OpcodeStr = IDVal.lower();
   ParseInstructionInfo IInfo(Info.AsmRewrites);
-  bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr,
-                                                     IDLoc, Info.ParsedOperands);
+  bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, IDLoc,
+                                                     Info.ParsedOperands);
   Info.ParseError = HadError;
 
   // Dump the parsed representation, if requested.
@@ -1505,7 +1527,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
     }
     OS << "]";
 
-    PrintMessage(IDLoc, SourceMgr::DK_Note, OS.str());
+    printMessage(IDLoc, SourceMgr::DK_Note, OS.str());
   }
 
   // If we are generating dwarf for assembly source files and the current
@@ -1513,49 +1535,49 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
   // the instruction.
   if (!HadError && getContext().getGenDwarfForAssembly() &&
       getContext().getGenDwarfSection() ==
-      getStreamer().getCurrentSection().first) {
+          getStreamer().getCurrentSection().first) {
 
     unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
 
     // If we previously parsed a cpp hash file line comment then make sure the
     // current Dwarf File is for the CppHashFilename if not then emit the
     // Dwarf File table for it and adjust the line number for the .loc.
-    const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles = 
-      getContext().getMCDwarfFiles();
+    const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles =
+        getContext().getMCDwarfFiles();
     if (CppHashFilename.size() != 0) {
       if (MCDwarfFiles[getContext().getGenDwarfFileNumber()]->getName() !=
           CppHashFilename)
         getStreamer().EmitDwarfFileDirective(
-          getContext().nextGenDwarfFileNumber(), StringRef(), CppHashFilename);
-
-       // Since SrcMgr.FindLineNumber() is slow and messes up the SourceMgr's 
-       // cache with the different Loc from the call above we save the last
-       // info we queried here with SrcMgr.FindLineNumber().
-       unsigned CppHashLocLineNo;
-       if (LastQueryIDLoc == CppHashLoc && LastQueryBuffer == CppHashBuf)
-         CppHashLocLineNo = LastQueryLine;
-       else {
-         CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc, CppHashBuf);
-         LastQueryLine = CppHashLocLineNo;
-         LastQueryIDLoc = CppHashLoc;
-         LastQueryBuffer = CppHashBuf;
-       }
-       Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo);
+            getContext().nextGenDwarfFileNumber(), StringRef(),
+            CppHashFilename);
+
+      // Since SrcMgr.FindLineNumber() is slow and messes up the SourceMgr's
+      // cache with the different Loc from the call above we save the last
+      // info we queried here with SrcMgr.FindLineNumber().
+      unsigned CppHashLocLineNo;
+      if (LastQueryIDLoc == CppHashLoc && LastQueryBuffer == CppHashBuf)
+        CppHashLocLineNo = LastQueryLine;
+      else {
+        CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc, CppHashBuf);
+        LastQueryLine = CppHashLocLineNo;
+        LastQueryIDLoc = CppHashLoc;
+        LastQueryBuffer = CppHashBuf;
+      }
+      Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo);
     }
 
-    getStreamer().EmitDwarfLocDirective(getContext().getGenDwarfFileNumber(),
-                                        Line, 0, DWARF2_LINE_DEFAULT_IS_STMT ?
-                                        DWARF2_FLAG_IS_STMT : 0, 0, 0,
-                                        StringRef());
+    getStreamer().EmitDwarfLocDirective(
+        getContext().getGenDwarfFileNumber(), Line, 0,
+        DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0, 0, 0,
+        StringRef());
   }
 
   // If parsing succeeded, match the instruction.
   if (!HadError) {
     unsigned ErrorInfo;
-    HadError = getTargetParser().MatchAndEmitInstruction(IDLoc, Info.Opcode,
-                                                         Info.ParsedOperands,
-                                                         Out, ErrorInfo,
-                                                         ParsingInlineAsm);
+    HadError = getTargetParser().MatchAndEmitInstruction(
+        IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
+        ParsingInlineAsm);
   }
 
   // Don't skip the rest of the line, the instruction parser is responsible for
@@ -1563,25 +1585,25 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
   return false;
 }
 
-/// EatToEndOfLine uses the Lexer to eat the characters to the end of the line
+/// eatToEndOfLine uses the Lexer to eat the characters to the end of the line
 /// since they may not be able to be tokenized to get to the end of line token.
-void AsmParser::EatToEndOfLine() {
+void AsmParser::eatToEndOfLine() {
   if (!Lexer.is(AsmToken::EndOfStatement))
     Lexer.LexUntilEndOfLine();
- // Eat EOL.
- Lex();
+  // Eat EOL.
+  Lex();
 }
 
-/// ParseCppHashLineFilenameComment as this:
+/// parseCppHashLineFilenameComment as this:
 ///   ::= # number "filename"
 /// or just as a full line comment if it doesn't have a number and a string.
-bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
+bool AsmParser::parseCppHashLineFilenameComment(const SMLoc &L) {
   Lex(); // Eat the hash token.
 
   if (getLexer().isNot(AsmToken::Integer)) {
     // Consume the line since in cases it is not a well-formed line directive,
     // as if were simply a full line comment.
-    EatToEndOfLine();
+    eatToEndOfLine();
     return false;
   }
 
@@ -1589,13 +1611,13 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
   Lex();
 
   if (getLexer().isNot(AsmToken::String)) {
-    EatToEndOfLine();
+    eatToEndOfLine();
     return false;
   }
 
   StringRef Filename = getTok().getString();
   // Get rid of the enclosing quotes.
-  Filename = Filename.substr(1, Filename.size()-2);
+  Filename = Filename.substr(1, Filename.size() - 2);
 
   // Save the SMLoc, Filename and LineNumber for later use by diagnostics.
   CppHashLoc = L;
@@ -1604,14 +1626,14 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
   CppHashBuf = CurBuffer;
 
   // Ignore any trailing characters, they're just comment.
-  EatToEndOfLine();
+  eatToEndOfLine();
   return false;
 }
 
-/// DiagHandler - will use the last parsed cpp hash line filename comment
+/// \brief will use the last parsed cpp hash line filename comment
 /// for the Filename and LineNo if any in the diagnostic.
 void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
-  const AsmParser *Parser = static_cast<const AsmParser*>(Context);
+  const AsmParser *Parser = static_cast<const AsmParser *>(Context);
   raw_ostream &OS = errs();
 
   const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr();
@@ -1619,19 +1641,18 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
   int DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
   int CppHashBuf = Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc);
 
-  // Like SourceMgr::PrintMessage() we need to print the include stack if any
+  // Like SourceMgr::printMessage() we need to print the include stack if any
   // before printing the message.
   int DiagCurBuffer = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
   if (!Parser->SavedDiagHandler && DiagCurBuffer > 0) {
-     SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer);
-     DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
+    SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer);
+    DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
   }
 
   // If we have not parsed a cpp hash line filename comment or the source
   // manager changed or buffer changed (like in a nested include) then just
   // print the normal diagnostic using its Filename and LineNo.
-  if (!Parser->CppHashLineNumber ||
-      &DiagSrcMgr != &Parser->SrcMgr ||
+  if (!Parser->CppHashLineNumber || &DiagSrcMgr != &Parser->SrcMgr ||
       DiagBuf != CppHashBuf) {
     if (Parser->SavedDiagHandler)
       Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext);
@@ -1643,17 +1664,16 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
   // Use the CppHashFilename and calculate a line number based on the
   // CppHashLoc and CppHashLineNumber relative to this Diag's SMLoc for
   // the diagnostic.
-  const std::string Filename = Parser->CppHashFilename;
+  const std::string &Filename = Parser->CppHashFilename;
 
   int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
   int CppHashLocLineNo =
       Parser->SrcMgr.FindLineNumber(Parser->CppHashLoc, CppHashBuf);
-  int LineNo = Parser->CppHashLineNumber - 1 +
-               (DiagLocLineNo - CppHashLocLineNo);
+  int LineNo =
+      Parser->CppHashLineNumber - 1 + (DiagLocLineNo - CppHashLocLineNo);
 
-  SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(),
-                       Filename, LineNo, Diag.getColumnNo(),
-                       Diag.getKind(), Diag.getMessage(),
+  SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(), Filename, LineNo,
+                       Diag.getColumnNo(), Diag.getKind(), Diag.getMessage(),
                        Diag.getLineContents(), Diag.getRanges());
 
   if (Parser->SavedDiagHandler)
@@ -1673,8 +1693,7 @@ static bool isIdentifierChar(char c) {
 
 bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
                             const MCAsmMacroParameters &Parameters,
-                            const MCAsmMacroArguments &A,
-                            const SMLoc &L) {
+                            const MCAsmMacroArguments &A, const SMLoc &L) {
   unsigned NParameters = Parameters.size();
   if (NParameters != 0 && NParameters != A.size())
     return Error(L, "Wrong number of arguments");
@@ -1710,27 +1729,28 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
       break;
 
     if (!NParameters) {
-      switch (Body[Pos+1]) {
-        // $$ => $
+      switch (Body[Pos + 1]) {
+      // $$ => $
       case '$':
         OS << '$';
         break;
 
-        // $n => number of arguments
+      // $n => number of arguments
       case 'n':
         OS << A.size();
         break;
 
-        // $[0-9] => argument
+      // $[0-9] => argument
       default: {
         // Missing arguments are ignored.
-        unsigned Index = Body[Pos+1] - '0';
+        unsigned Index = Body[Pos + 1] - '0';
         if (Index >= A.size())
           break;
 
         // Otherwise substitute with the token values, with spaces eliminated.
         for (MCAsmMacroArgument::const_iterator it = A[Index].begin(),
-               ie = A[Index].end(); it != ie; ++it)
+                                                ie = A[Index].end();
+             it != ie; ++it)
           OS << it->getString();
         break;
       }
@@ -1741,23 +1761,24 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
       while (isIdentifierChar(Body[I]) && I + 1 != End)
         ++I;
 
-      const char *Begin = Body.data() + Pos +1;
-      StringRef Argument(Begin, I - (Pos +1));
+      const char *Begin = Body.data() + Pos + 1;
+      StringRef Argument(Begin, I - (Pos + 1));
       unsigned Index = 0;
       for (; Index < NParameters; ++Index)
         if (Parameters[Index].first == Argument)
           break;
 
       if (Index == NParameters) {
-          if (Body[Pos+1] == '(' && Body[Pos+2] == ')')
-            Pos += 3;
-          else {
-            OS << '\\' << Argument;
-            Pos = I;
-          }
+        if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
+          Pos += 3;
+        else {
+          OS << '\\' << Argument;
+          Pos = I;
+        }
       } else {
         for (MCAsmMacroArgument::const_iterator it = A[Index].begin(),
-               ie = A[Index].end(); it != ie; ++it)
+                                                ie = A[Index].end();
+             it != ie; ++it)
           if (it->getKind() == AsmToken::String)
             OS << it->getStringContents();
           else
@@ -1773,48 +1794,43 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
   return false;
 }
 
-MacroInstantiation::MacroInstantiation(const MCAsmMacro *M, SMLoc IL,
-                                       int EB, SMLoc EL,
-                                       MemoryBuffer *I)
-  : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitBuffer(EB),
-    ExitLoc(EL)
-{
-}
+MacroInstantiation::MacroInstantiation(const MCAsmMacro *M, SMLoc IL, int EB,
+                                       SMLoc EL, MemoryBuffer *I)
+    : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitBuffer(EB),
+      ExitLoc(EL) {}
 
-static bool IsOperator(AsmToken::TokenKind kind)
-{
-  switch (kind)
-  {
-    default:
-      return false;
-    case AsmToken::Plus:
-    case AsmToken::Minus:
-    case AsmToken::Tilde:
-    case AsmToken::Slash:
-    case AsmToken::Star:
-    case AsmToken::Dot:
-    case AsmToken::Equal:
-    case AsmToken::EqualEqual:
-    case AsmToken::Pipe:
-    case AsmToken::PipePipe:
-    case AsmToken::Caret:
-    case AsmToken::Amp:
-    case AsmToken::AmpAmp:
-    case AsmToken::Exclaim:
-    case AsmToken::ExclaimEqual:
-    case AsmToken::Percent:
-    case AsmToken::Less:
-    case AsmToken::LessEqual:
-    case AsmToken::LessLess:
-    case AsmToken::LessGreater:
-    case AsmToken::Greater:
-    case AsmToken::GreaterEqual:
-    case AsmToken::GreaterGreater:
-      return true;
+static bool isOperator(AsmToken::TokenKind kind) {
+  switch (kind) {
+  default:
+    return false;
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Tilde:
+  case AsmToken::Slash:
+  case AsmToken::Star:
+  case AsmToken::Dot:
+  case AsmToken::Equal:
+  case AsmToken::EqualEqual:
+  case AsmToken::Pipe:
+  case AsmToken::PipePipe:
+  case AsmToken::Caret:
+  case AsmToken::Amp:
+  case AsmToken::AmpAmp:
+  case AsmToken::Exclaim:
+  case AsmToken::ExclaimEqual:
+  case AsmToken::Percent:
+  case AsmToken::Less:
+  case AsmToken::LessEqual:
+  case AsmToken::LessLess:
+  case AsmToken::LessGreater:
+  case AsmToken::Greater:
+  case AsmToken::GreaterEqual:
+  case AsmToken::GreaterGreater:
+    return true;
   }
 }
 
-bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
+bool AsmParser::parseMacroArgument(MCAsmMacroArgument &MA,
                                    AsmToken::TokenKind &ArgumentDelimiter) {
   unsigned ParenLevel = 0;
   unsigned AddTokens = 0;
@@ -1848,7 +1864,7 @@ bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
       // one into this argument
       if (ArgumentDelimiter == AsmToken::Space ||
           ArgumentDelimiter == AsmToken::Eof) {
-        if (IsOperator(Lexer.getKind())) {
+        if (isOperator(Lexer.getKind())) {
           // Check to see whether the token is used as an operator,
           // or part of an identifier
           const char *NextChar = getTok().getEndLoc().getPointer();
@@ -1858,14 +1874,14 @@ bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
 
         if (!AddTokens && ParenLevel == 0) {
           if (ArgumentDelimiter == AsmToken::Eof &&
-              !IsOperator(Lexer.getKind()))
+              !isOperator(Lexer.getKind()))
             ArgumentDelimiter = AsmToken::Space;
           break;
         }
       }
     }
 
-    // HandleMacroEntry relies on not advancing the lexer here
+    // handleMacroEntry relies on not advancing the lexer here
     // to be able to fill in the remaining default parameter values
     if (Lexer.is(AsmToken::EndOfStatement))
       break;
@@ -1890,10 +1906,11 @@ bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
 }
 
 // Parse the macro instantiation arguments.
-bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A) {
+bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
+                                    MCAsmMacroArguments &A) {
   const unsigned NParameters = M ? M->Parameters.size() : 0;
   // Argument delimiter is initially unknown. It will be set by
-  // ParseMacroArgument()
+  // parseMacroArgument()
   AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
 
   // Parse two kinds of macro invocations:
@@ -1903,7 +1920,7 @@ bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A)
        ++Parameter) {
     MCAsmMacroArgument MA;
 
-    if (ParseMacroArgument(MA, ArgumentDelimiter))
+    if (parseMacroArgument(MA, ArgumentDelimiter))
       return true;
 
     if (!MA.empty() || !NParameters)
@@ -1934,31 +1951,31 @@ bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A)
   return TokError("Too many arguments");
 }
 
-const MCAsmMacro* AsmParser::LookupMacro(StringRef Name) {
-  StringMap<MCAsmMacro*>::iterator I = MacroMap.find(Name);
+const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) {
+  StringMap<MCAsmMacro *>::iterator I = MacroMap.find(Name);
   return (I == MacroMap.end()) ? NULL : I->getValue();
 }
 
-void AsmParser::DefineMacro(StringRef Name, const MCAsmMacro& Macro) {
+void AsmParser::defineMacro(StringRef Name, const MCAsmMacro &Macro) {
   MacroMap[Name] = new MCAsmMacro(Macro);
 }
 
-void AsmParser::UndefineMacro(StringRef Name) {
-  StringMap<MCAsmMacro*>::iterator I = MacroMap.find(Name);
+void AsmParser::undefineMacro(StringRef Name) {
+  StringMap<MCAsmMacro *>::iterator I = MacroMap.find(Name);
   if (I != MacroMap.end()) {
     delete I->getValue();
     MacroMap.erase(I);
   }
 }
 
-bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
+bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
   // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate
   // this, although we should protect against infinite loops.
   if (ActiveMacros.size() == 20)
     return TokError("macros cannot be nested more than 20 levels deep");
 
   MCAsmMacroArguments A;
-  if (ParseMacroArguments(M, A))
+  if (parseMacroArguments(M, A))
     return true;
 
   // Remove any trailing empty arguments. Do this after-the-fact as we have
@@ -1981,14 +1998,12 @@ bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
   OS << ".endmacro\n";
 
   MemoryBuffer *Instantiation =
-    MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+      MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
 
   // Create the macro instantiation object and add to the current macro
   // instantiation stack.
-  MacroInstantiation *MI = new MacroInstantiation(M, NameLoc,
-                                                  CurBuffer,
-                                                  getTok().getLoc(),
-                                                  Instantiation);
+  MacroInstantiation *MI = new MacroInstantiation(
+      M, NameLoc, CurBuffer, getTok().getLoc(), Instantiation);
   ActiveMacros.push_back(MI);
 
   // Jump to the macro instantiation and prime the lexer.
@@ -1999,9 +2014,9 @@ bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
   return false;
 }
 
-void AsmParser::HandleMacroExit() {
+void AsmParser::handleMacroExit() {
   // Jump to the EndOfStatement we should return to, and consume it.
-  JumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
+  jumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
   Lex();
 
   // Pop the instantiation entry.
@@ -2009,29 +2024,30 @@ void AsmParser::HandleMacroExit() {
   ActiveMacros.pop_back();
 }
 
-static bool IsUsedIn(const MCSymbol *Sym, const MCExpr *Value) {
+static bool isUsedIn(const MCSymbol *Sym, const MCExpr *Value) {
   switch (Value->getKind()) {
   case MCExpr::Binary: {
-    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr*>(Value);
-    return IsUsedIn(Sym, BE->getLHS()) || IsUsedIn(Sym, BE->getRHS());
+    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Value);
+    return isUsedIn(Sym, BE->getLHS()) || isUsedIn(Sym, BE->getRHS());
   }
   case MCExpr::Target:
   case MCExpr::Constant:
     return false;
   case MCExpr::SymbolRef: {
-    const MCSymbol &S = static_cast<const MCSymbolRefExpr*>(Value)->getSymbol();
+    const MCSymbol &S =
+        static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
     if (S.isVariable())
-      return IsUsedIn(Sym, S.getVariableValue());
+      return isUsedIn(Sym, S.getVariableValue());
     return &S == Sym;
   }
   case MCExpr::Unary:
-    return IsUsedIn(Sym, static_cast<const MCUnaryExpr*>(Value)->getSubExpr());
+    return isUsedIn(Sym, static_cast<const MCUnaryExpr *>(Value)->getSubExpr());
   }
 
   llvm_unreachable("Unknown expr kind!");
 }
 
-bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
+bool AsmParser::parseAssignment(StringRef Name, bool allow_redef,
                                 bool NoDeadStrip) {
   // FIXME: Use better location, we should use proper tokens.
   SMLoc EqualLoc = Lexer.getLoc();
@@ -2064,7 +2080,7 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
     //
     // FIXME: Diagnostics. Note the location of the definition as a label.
     // FIXME: Diagnose assignment to protected identifier (e.g., register name).
-    if (IsUsedIn(Sym, Value))
+    if (isUsedIn(Sym, Value))
       return Error(EqualLoc, "Recursive use of '" + Name + "'");
     else if (Sym->isUndefined() && !Sym->isUsed() && !Sym->isVariable())
       ; // Allow redefinitions of undefined symbols only used in directives.
@@ -2076,7 +2092,7 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
       return Error(EqualLoc, "invalid assignment to '" + Name + "'");
     else if (!isa<MCConstantExpr>(Sym->getVariableValue()))
       return Error(EqualLoc, "invalid reassignment of non-absolute variable '" +
-                   Name + "'");
+                                 Name + "'");
 
     // Don't count these checks as uses.
     Sym->setUsed(false);
@@ -2090,7 +2106,6 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
   if (NoDeadStrip)
     Out.EmitSymbolAttribute(Sym, MCSA_NoDeadStrip);
 
-
   return false;
 }
 
@@ -2099,31 +2114,30 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
 ///   ::= string
 bool AsmParser::parseIdentifier(StringRef &Res) {
   // The assembler has relaxed rules for accepting identifiers, in particular we
-  // allow things like '.globl $foo', which would normally be separate
-  // tokens. At this level, we have already lexed so we cannot (currently)
+  // allow things like '.globl $foo' and '.def @feat.00', which would normally be
+  // separate tokens. At this level, we have already lexed so we cannot (currently)
   // handle this as a context dependent token, instead we detect adjacent tokens
   // and return the combined identifier.
-  if (Lexer.is(AsmToken::Dollar)) {
-    SMLoc DollarLoc = getLexer().getLoc();
+  if (Lexer.is(AsmToken::Dollar) || Lexer.is(AsmToken::At)) {
+    SMLoc PrefixLoc = getLexer().getLoc();
 
-    // Consume the dollar sign, and check for a following identifier.
+    // Consume the prefix character, and check for a following identifier.
     Lex();
     if (Lexer.isNot(AsmToken::Identifier))
       return true;
 
-    // We have a '$' followed by an identifier, make sure they are adjacent.
-    if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
+    // We have a '$' or '@' followed by an identifier, make sure they are adjacent.
+    if (PrefixLoc.getPointer() + 1 != getTok().getLoc().getPointer())
       return true;
 
     // Construct the joined identifier and consume the token.
-    Res = StringRef(DollarLoc.getPointer(),
-                    getTok().getIdentifier().size() + 1);
+    Res =
+        StringRef(PrefixLoc.getPointer(), getTok().getIdentifier().size() + 1);
     Lex();
     return false;
   }
 
-  if (Lexer.isNot(AsmToken::Identifier) &&
-      Lexer.isNot(AsmToken::String))
+  if (Lexer.isNot(AsmToken::Identifier) && Lexer.isNot(AsmToken::String))
     return true;
 
   Res = getTok().getIdentifier();
@@ -2133,11 +2147,11 @@ bool AsmParser::parseIdentifier(StringRef &Res) {
   return false;
 }
 
-/// ParseDirectiveSet:
+/// parseDirectiveSet:
 ///   ::= .equ identifier ',' expression
 ///   ::= .equiv identifier ',' expression
 ///   ::= .set identifier ',' expression
-bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) {
+bool AsmParser::parseDirectiveSet(StringRef IDVal, bool allow_redef) {
   StringRef Name;
 
   if (parseIdentifier(Name))
@@ -2147,7 +2161,7 @@ bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) {
     return TokError("unexpected token in '" + Twine(IDVal) + "'");
   Lex();
 
-  return ParseAssignment(Name, allow_redef, true);
+  return parseAssignment(Name, allow_redef, true);
 }
 
 bool AsmParser::parseEscapedString(std::string &Data) {
@@ -2168,15 +2182,15 @@ bool AsmParser::parseEscapedString(std::string &Data) {
       return TokError("unexpected backslash at end of string");
 
     // Recognize octal sequences.
-    if ((unsigned) (Str[i] - '0') <= 7) {
+    if ((unsigned)(Str[i] - '0') <= 7) {
       // Consume up to three octal characters.
       unsigned Value = Str[i] - '0';
 
-      if (i + 1 != e && ((unsigned) (Str[i + 1] - '0')) <= 7) {
+      if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
         ++i;
         Value = Value * 8 + (Str[i] - '0');
 
-        if (i + 1 != e && ((unsigned) (Str[i + 1] - '0')) <= 7) {
+        if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
           ++i;
           Value = Value * 8 + (Str[i] - '0');
         }
@@ -2185,7 +2199,7 @@ bool AsmParser::parseEscapedString(std::string &Data) {
       if (Value > 255)
         return TokError("invalid octal escape sequence (out of range)");
 
-      Data += (unsigned char) Value;
+      Data += (unsigned char)Value;
       continue;
     }
 
@@ -2208,9 +2222,9 @@ bool AsmParser::parseEscapedString(std::string &Data) {
   return false;
 }
 
-/// ParseDirectiveAscii:
+/// parseDirectiveAscii:
 ///   ::= ( .ascii | .asciz | .string ) [ "string" ( , "string" )* ]
-bool AsmParser::ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
+bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     checkForValidSection();
 
@@ -2241,9 +2255,9 @@ bool AsmParser::ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
   return false;
 }
 
-/// ParseDirectiveValue
+/// parseDirectiveValue
 ///  ::= (.byte | .short | ... ) [ expression (, expression)* ]
-bool AsmParser::ParseDirectiveValue(unsigned Size) {
+bool AsmParser::parseDirectiveValue(unsigned Size) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     checkForValidSection();
 
@@ -2277,9 +2291,9 @@ bool AsmParser::ParseDirectiveValue(unsigned Size) {
   return false;
 }
 
-/// ParseDirectiveRealValue
+/// parseDirectiveRealValue
 ///  ::= (.single | .double) [ expression (, expression)* ]
-bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
+bool AsmParser::parseDirectiveRealValue(const fltSemantics &Semantics) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     checkForValidSection();
 
@@ -2309,7 +2323,7 @@ bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
         else
           return TokError("invalid floating point literal");
       } else if (Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven) ==
-          APFloat::opInvalidOp)
+                 APFloat::opInvalidOp)
         return TokError("invalid floating point literal");
       if (IsNeg)
         Value.changeSign();
@@ -2335,9 +2349,9 @@ bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
   return false;
 }
 
-/// ParseDirectiveZero
+/// parseDirectiveZero
 ///  ::= .zero expression
-bool AsmParser::ParseDirectiveZero() {
+bool AsmParser::parseDirectiveZero() {
   checkForValidSection();
 
   int64_t NumBytes;
@@ -2361,35 +2375,40 @@ bool AsmParser::ParseDirectiveZero() {
   return false;
 }
 
-/// ParseDirectiveFill
-///  ::= .fill expression , expression , expression
-bool AsmParser::ParseDirectiveFill() {
+/// parseDirectiveFill
+///  ::= .fill expression [ , expression [ , expression ] ]
+bool AsmParser::parseDirectiveFill() {
   checkForValidSection();
 
   int64_t NumValues;
   if (parseAbsoluteExpression(NumValues))
     return true;
 
-  if (getLexer().isNot(AsmToken::Comma))
-    return TokError("unexpected token in '.fill' directive");
-  Lex();
+  int64_t FillSize = 1;
+  int64_t FillExpr = 0;
 
-  int64_t FillSize;
-  if (parseAbsoluteExpression(FillSize))
-    return true;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in '.fill' directive");
+    Lex();
 
-  if (getLexer().isNot(AsmToken::Comma))
-    return TokError("unexpected token in '.fill' directive");
-  Lex();
+    if (parseAbsoluteExpression(FillSize))
+      return true;
 
-  int64_t FillExpr;
-  if (parseAbsoluteExpression(FillExpr))
-    return true;
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      if (getLexer().isNot(AsmToken::Comma))
+        return TokError("unexpected token in '.fill' directive");
+      Lex();
 
-  if (getLexer().isNot(AsmToken::EndOfStatement))
-    return TokError("unexpected token in '.fill' directive");
+      if (parseAbsoluteExpression(FillExpr))
+        return true;
 
-  Lex();
+      if (getLexer().isNot(AsmToken::EndOfStatement))
+        return TokError("unexpected token in '.fill' directive");
+
+      Lex();
+    }
+  }
 
   if (FillSize != 1 && FillSize != 2 && FillSize != 4 && FillSize != 8)
     return TokError("invalid '.fill' size, expected 1, 2, 4, or 8");
@@ -2400,9 +2419,9 @@ bool AsmParser::ParseDirectiveFill() {
   return false;
 }
 
-/// ParseDirectiveOrg
+/// parseDirectiveOrg
 ///  ::= .org expression [ , expression ]
-bool AsmParser::ParseDirectiveOrg() {
+bool AsmParser::parseDirectiveOrg() {
   checkForValidSection();
 
   const MCExpr *Offset;
@@ -2435,9 +2454,9 @@ bool AsmParser::ParseDirectiveOrg() {
   return false;
 }
 
-/// ParseDirectiveAlign
+/// parseDirectiveAlign
 ///  ::= {.align, ...} expression [ , expression [ , expression ]]
-bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
+bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
   checkForValidSection();
 
   SMLoc AlignmentLoc = getLexer().getLoc();
@@ -2501,13 +2520,13 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
   if (MaxBytesLoc.isValid()) {
     if (MaxBytesToFill < 1) {
       Error(MaxBytesLoc, "alignment directive can never be satisfied in this "
-            "many bytes, ignoring maximum bytes expression");
+                         "many bytes, ignoring maximum bytes expression");
       MaxBytesToFill = 0;
     }
 
     if (MaxBytesToFill >= Alignment) {
       Warning(MaxBytesLoc, "maximum bytes expression exceeds alignment and "
-              "has no effect");
+                           "has no effect");
       MaxBytesToFill = 0;
     }
   }
@@ -2527,10 +2546,10 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
   return false;
 }
 
-/// ParseDirectiveFile
+/// parseDirectiveFile
 /// ::= .file [number] filename
 /// ::= .file number directory filename
-bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
   // FIXME: I'm not sure what this is.
   int64_t FileNumber = -1;
   SMLoc FileNumberLoc = getLexer().getLoc();
@@ -2546,17 +2565,21 @@ bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
     return TokError("unexpected token in '.file' directive");
 
   // Usually the directory and filename together, otherwise just the directory.
-  StringRef Path = getTok().getString();
-  Path = Path.substr(1, Path.size()-2);
+  // Allow the strings to have escaped octal character sequence.
+  std::string Path = getTok().getString();
+  if (parseEscapedString(Path))
+    return true;
   Lex();
 
   StringRef Directory;
   StringRef Filename;
+  std::string FilenameData;
   if (getLexer().is(AsmToken::String)) {
     if (FileNumber == -1)
       return TokError("explicit path specified, but no file number");
-    Filename = getTok().getString();
-    Filename = Filename.substr(1, Filename.size()-2);
+    if (parseEscapedString(FilenameData))
+      return true;
+    Filename = FilenameData;
     Directory = Path;
     Lex();
   } else {
@@ -2570,8 +2593,9 @@ bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
     getStreamer().EmitFileDirective(Filename);
   else {
     if (getContext().getGenDwarfForAssembly() == true)
-      Error(DirectiveLoc, "input can't have .file dwarf directives when -g is "
-                        "used to generate dwarf debug info for assembly code");
+      Error(DirectiveLoc,
+            "input can't have .file dwarf directives when -g is "
+            "used to generate dwarf debug info for assembly code");
 
     if (getStreamer().EmitDwarfFileDirective(FileNumber, Directory, Filename))
       Error(FileNumberLoc, "file number already allocated");
@@ -2580,15 +2604,15 @@ bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveLine
+/// parseDirectiveLine
 /// ::= .line [number]
-bool AsmParser::ParseDirectiveLine() {
+bool AsmParser::parseDirectiveLine() {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     if (getLexer().isNot(AsmToken::Integer))
       return TokError("unexpected token in '.line' directive");
 
     int64_t LineNumber = getTok().getIntVal();
-    (void) LineNumber;
+    (void)LineNumber;
     Lex();
 
     // FIXME: Do something with the .line.
@@ -2600,14 +2624,14 @@ bool AsmParser::ParseDirectiveLine() {
   return false;
 }
 
-/// ParseDirectiveLoc
+/// parseDirectiveLoc
 /// ::= .loc FileNumber [LineNumber] [ColumnPos] [basic_block] [prologue_end]
 ///                                [epilogue_begin] [is_stmt VALUE] [isa VALUE]
 /// The first number is a file number, must have been previously assigned with
 /// a .file directive, the second number is the line number and optionally the
 /// third number is a column position (zero if not specified).  The remaining
 /// optional items are .loc sub-directives.
-bool AsmParser::ParseDirectiveLoc() {
+bool AsmParser::parseDirectiveLoc() {
   if (getLexer().isNot(AsmToken::Integer))
     return TokError("unexpected token in '.loc' directive");
   int64_t FileNumber = getTok().getIntVal();
@@ -2620,8 +2644,8 @@ bool AsmParser::ParseDirectiveLoc() {
   int64_t LineNumber = 0;
   if (getLexer().is(AsmToken::Integer)) {
     LineNumber = getTok().getIntVal();
-    if (LineNumber < 1)
-      return TokError("line number less than one in '.loc' directive");
+    if (LineNumber < 0)
+      return TokError("line number less than zero in '.loc' directive");
     Lex();
   }
 
@@ -2701,15 +2725,15 @@ bool AsmParser::ParseDirectiveLoc() {
   return false;
 }
 
-/// ParseDirectiveStabs
+/// parseDirectiveStabs
 /// ::= .stabs string, number, number, number
-bool AsmParser::ParseDirectiveStabs() {
+bool AsmParser::parseDirectiveStabs() {
   return TokError("unsupported directive '.stabs'");
 }
 
-/// ParseDirectiveCFISections
+/// parseDirectiveCFISections
 /// ::= .cfi_sections section [, section]
-bool AsmParser::ParseDirectiveCFISections() {
+bool AsmParser::parseDirectiveCFISections() {
   StringRef Name;
   bool EH = false;
   bool Debug = false;
@@ -2738,22 +2762,22 @@ bool AsmParser::ParseDirectiveCFISections() {
   return false;
 }
 
-/// ParseDirectiveCFIStartProc
+/// parseDirectiveCFIStartProc
 /// ::= .cfi_startproc
-bool AsmParser::ParseDirectiveCFIStartProc() {
+bool AsmParser::parseDirectiveCFIStartProc() {
   getStreamer().EmitCFIStartProc();
   return false;
 }
 
-/// ParseDirectiveCFIEndProc
+/// parseDirectiveCFIEndProc
 /// ::= .cfi_endproc
-bool AsmParser::ParseDirectiveCFIEndProc() {
+bool AsmParser::parseDirectiveCFIEndProc() {
   getStreamer().EmitCFIEndProc();
   return false;
 }
 
-/// ParseRegisterOrRegisterNumber - parse register name or number.
-bool AsmParser::ParseRegisterOrRegisterNumber(int64_t &Register,
+/// \brief parse register name or number.
+bool AsmParser::parseRegisterOrRegisterNumber(int64_t &Register,
                                               SMLoc DirectiveLoc) {
   unsigned RegNo;
 
@@ -2767,11 +2791,11 @@ bool AsmParser::ParseRegisterOrRegisterNumber(int64_t &Register,
   return false;
 }
 
-/// ParseDirectiveCFIDefCfa
+/// parseDirectiveCFIDefCfa
 /// ::= .cfi_def_cfa register,  offset
-bool AsmParser::ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
   int64_t Register = 0;
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   if (getLexer().isNot(AsmToken::Comma))
@@ -2786,9 +2810,9 @@ bool AsmParser::ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveCFIDefCfaOffset
+/// parseDirectiveCFIDefCfaOffset
 /// ::= .cfi_def_cfa_offset offset
-bool AsmParser::ParseDirectiveCFIDefCfaOffset() {
+bool AsmParser::parseDirectiveCFIDefCfaOffset() {
   int64_t Offset = 0;
   if (parseAbsoluteExpression(Offset))
     return true;
@@ -2797,11 +2821,11 @@ bool AsmParser::ParseDirectiveCFIDefCfaOffset() {
   return false;
 }
 
-/// ParseDirectiveCFIRegister
+/// parseDirectiveCFIRegister
 /// ::= .cfi_register register, register
-bool AsmParser::ParseDirectiveCFIRegister(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIRegister(SMLoc DirectiveLoc) {
   int64_t Register1 = 0;
-  if (ParseRegisterOrRegisterNumber(Register1, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register1, DirectiveLoc))
     return true;
 
   if (getLexer().isNot(AsmToken::Comma))
@@ -2809,16 +2833,23 @@ bool AsmParser::ParseDirectiveCFIRegister(SMLoc DirectiveLoc) {
   Lex();
 
   int64_t Register2 = 0;
-  if (ParseRegisterOrRegisterNumber(Register2, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register2, DirectiveLoc))
     return true;
 
   getStreamer().EmitCFIRegister(Register1, Register2);
   return false;
 }
 
-/// ParseDirectiveCFIAdjustCfaOffset
+/// parseDirectiveCFIWindowSave
+/// ::= .cfi_window_save
+bool AsmParser::parseDirectiveCFIWindowSave() {
+  getStreamer().EmitCFIWindowSave();
+  return false;
+}
+
+/// parseDirectiveCFIAdjustCfaOffset
 /// ::= .cfi_adjust_cfa_offset adjustment
-bool AsmParser::ParseDirectiveCFIAdjustCfaOffset() {
+bool AsmParser::parseDirectiveCFIAdjustCfaOffset() {
   int64_t Adjustment = 0;
   if (parseAbsoluteExpression(Adjustment))
     return true;
@@ -2827,24 +2858,24 @@ bool AsmParser::ParseDirectiveCFIAdjustCfaOffset() {
   return false;
 }
 
-/// ParseDirectiveCFIDefCfaRegister
+/// parseDirectiveCFIDefCfaRegister
 /// ::= .cfi_def_cfa_register register
-bool AsmParser::ParseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
   int64_t Register = 0;
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   getStreamer().EmitCFIDefCfaRegister(Register);
   return false;
 }
 
-/// ParseDirectiveCFIOffset
+/// parseDirectiveCFIOffset
 /// ::= .cfi_offset register, offset
-bool AsmParser::ParseDirectiveCFIOffset(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIOffset(SMLoc DirectiveLoc) {
   int64_t Register = 0;
   int64_t Offset = 0;
 
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   if (getLexer().isNot(AsmToken::Comma))
@@ -2858,12 +2889,12 @@ bool AsmParser::ParseDirectiveCFIOffset(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveCFIRelOffset
+/// parseDirectiveCFIRelOffset
 /// ::= .cfi_rel_offset register, offset
-bool AsmParser::ParseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
   int64_t Register = 0;
 
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   if (getLexer().isNot(AsmToken::Comma))
@@ -2900,11 +2931,11 @@ static bool isValidEncoding(int64_t Encoding) {
   return true;
 }
 
-/// ParseDirectiveCFIPersonalityOrLsda
+/// parseDirectiveCFIPersonalityOrLsda
 /// IsPersonality true for cfi_personality, false for cfi_lsda
 /// ::= .cfi_personality encoding, [symbol_name]
 /// ::= .cfi_lsda encoding, [symbol_name]
-bool AsmParser::ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
+bool AsmParser::parseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
   int64_t Encoding = 0;
   if (parseAbsoluteExpression(Encoding))
     return true;
@@ -2931,46 +2962,46 @@ bool AsmParser::ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
   return false;
 }
 
-/// ParseDirectiveCFIRememberState
+/// parseDirectiveCFIRememberState
 /// ::= .cfi_remember_state
-bool AsmParser::ParseDirectiveCFIRememberState() {
+bool AsmParser::parseDirectiveCFIRememberState() {
   getStreamer().EmitCFIRememberState();
   return false;
 }
 
-/// ParseDirectiveCFIRestoreState
+/// parseDirectiveCFIRestoreState
 /// ::= .cfi_remember_state
-bool AsmParser::ParseDirectiveCFIRestoreState() {
+bool AsmParser::parseDirectiveCFIRestoreState() {
   getStreamer().EmitCFIRestoreState();
   return false;
 }
 
-/// ParseDirectiveCFISameValue
+/// parseDirectiveCFISameValue
 /// ::= .cfi_same_value register
-bool AsmParser::ParseDirectiveCFISameValue(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFISameValue(SMLoc DirectiveLoc) {
   int64_t Register = 0;
 
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   getStreamer().EmitCFISameValue(Register);
   return false;
 }
 
-/// ParseDirectiveCFIRestore
+/// parseDirectiveCFIRestore
 /// ::= .cfi_restore register
-bool AsmParser::ParseDirectiveCFIRestore(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIRestore(SMLoc DirectiveLoc) {
   int64_t Register = 0;
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   getStreamer().EmitCFIRestore(Register);
   return false;
 }
 
-/// ParseDirectiveCFIEscape
+/// parseDirectiveCFIEscape
 /// ::= .cfi_escape expression[,...]
-bool AsmParser::ParseDirectiveCFIEscape() {
+bool AsmParser::parseDirectiveCFIEscape() {
   std::string Values;
   int64_t CurrValue;
   if (parseAbsoluteExpression(CurrValue))
@@ -2991,9 +3022,9 @@ bool AsmParser::ParseDirectiveCFIEscape() {
   return false;
 }
 
-/// ParseDirectiveCFISignalFrame
+/// parseDirectiveCFISignalFrame
 /// ::= .cfi_signal_frame
-bool AsmParser::ParseDirectiveCFISignalFrame() {
+bool AsmParser::parseDirectiveCFISignalFrame() {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return Error(getLexer().getLoc(),
                  "unexpected token in '.cfi_signal_frame'");
@@ -3002,40 +3033,40 @@ bool AsmParser::ParseDirectiveCFISignalFrame() {
   return false;
 }
 
-/// ParseDirectiveCFIUndefined
+/// parseDirectiveCFIUndefined
 /// ::= .cfi_undefined register
-bool AsmParser::ParseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
   int64_t Register = 0;
 
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   getStreamer().EmitCFIUndefined(Register);
   return false;
 }
 
-/// ParseDirectiveMacrosOnOff
+/// parseDirectiveMacrosOnOff
 /// ::= .macros_on
 /// ::= .macros_off
-bool AsmParser::ParseDirectiveMacrosOnOff(StringRef Directive) {
+bool AsmParser::parseDirectiveMacrosOnOff(StringRef Directive) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return Error(getLexer().getLoc(),
                  "unexpected token in '" + Directive + "' directive");
 
-  SetMacrosEnabled(Directive == ".macros_on");
+  setMacrosEnabled(Directive == ".macros_on");
   return false;
 }
 
-/// ParseDirectiveMacro
+/// parseDirectiveMacro
 /// ::= .macro name [parameters]
-bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
   StringRef Name;
   if (parseIdentifier(Name))
     return TokError("expected identifier in '.macro' directive");
 
   MCAsmMacroParameters Parameters;
   // Argument delimiter is initially unknown. It will be set by
-  // ParseMacroArgument()
+  // parseMacroArgument()
   AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
@@ -3045,7 +3076,7 @@ bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
 
       if (getLexer().is(AsmToken::Equal)) {
         Lex();
-        if (ParseMacroArgument(Parameter.second, ArgumentDelimiter))
+        if (parseMacroArgument(Parameter.second, ArgumentDelimiter))
           return true;
       }
 
@@ -3085,19 +3116,19 @@ bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
     eatToEndOfStatement();
   }
 
-  if (LookupMacro(Name)) {
+  if (lookupMacro(Name)) {
     return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
   }
 
   const char *BodyStart = StartToken.getLoc().getPointer();
   const char *BodyEnd = EndToken.getLoc().getPointer();
   StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
-  CheckForBadMacro(DirectiveLoc, Name, Body, Parameters);
-  DefineMacro(Name, MCAsmMacro(Name, Body, Parameters));
+  checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
+  defineMacro(Name, MCAsmMacro(Name, Body, Parameters));
   return false;
 }
 
-/// CheckForBadMacro
+/// checkForBadMacro
 ///
 /// With the support added for named parameters there may be code out there that
 /// is transitioning from positional parameters.  In versions of gas that did
@@ -3111,7 +3142,7 @@ bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
 /// intended or change the macro to use the named parameters.  It is possible
 /// this warning will trigger when the none of the named parameters are used
 /// and the strings like $1 are infact to simply to be passed trough unchanged.
-void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
+void AsmParser::checkForBadMacro(SMLoc DirectiveLoc, StringRef Name,
                                  StringRef Body,
                                  MCAsmMacroParameters Parameters) {
   // If this macro is not defined with named parameters the warning we are
@@ -3149,21 +3180,21 @@ void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
       break;
 
     if (Body[Pos] == '$') {
-      switch (Body[Pos+1]) {
-        // $$ => $
+      switch (Body[Pos + 1]) {
+      // $$ => $
       case '$':
         break;
 
-        // $n => number of arguments
+      // $n => number of arguments
       case 'n':
         PositionalParametersFound = true;
         break;
 
-        // $[0-9] => argument
+      // $[0-9] => argument
       default: {
         PositionalParametersFound = true;
         break;
-        }
+      }
       }
       Pos += 2;
     } else {
@@ -3171,19 +3202,19 @@ void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
       while (isIdentifierChar(Body[I]) && I + 1 != End)
         ++I;
 
-      const char *Begin = Body.data() + Pos +1;
-      StringRef Argument(Begin, I - (Pos +1));
+      const char *Begin = Body.data() + Pos + 1;
+      StringRef Argument(Begin, I - (Pos + 1));
       unsigned Index = 0;
       for (; Index < NParameters; ++Index)
         if (Parameters[Index].first == Argument)
           break;
 
       if (Index == NParameters) {
-          if (Body[Pos+1] == '(' && Body[Pos+2] == ')')
-            Pos += 3;
-          else {
-            Pos = I;
-          }
+        if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
+          Pos += 3;
+        else {
+          Pos = I;
+        }
       } else {
         NamedParametersFound = true;
         Pos += 1 + Argument.size();
@@ -3199,29 +3230,29 @@ void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
                           "found in body which will have no effect");
 }
 
-/// ParseDirectiveEndMacro
+/// parseDirectiveEndMacro
 /// ::= .endm
 /// ::= .endmacro
-bool AsmParser::ParseDirectiveEndMacro(StringRef Directive) {
+bool AsmParser::parseDirectiveEndMacro(StringRef Directive) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '" + Directive + "' directive");
 
   // If we are inside a macro instantiation, terminate the current
   // instantiation.
-  if (InsideMacroInstantiation()) {
-    HandleMacroExit();
+  if (isInsideMacroInstantiation()) {
+    handleMacroExit();
     return false;
   }
 
   // Otherwise, this .endmacro is a stray entry in the file; well formed
   // .endmacro directives are handled during the macro definition parsing.
   return TokError("unexpected '" + Directive + "' in file, "
-                  "no current macro definition");
+                                               "no current macro definition");
 }
 
-/// ParseDirectivePurgeMacro
+/// parseDirectivePurgeMacro
 /// ::= .purgem
-bool AsmParser::ParseDirectivePurgeMacro(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
   StringRef Name;
   if (parseIdentifier(Name))
     return TokError("expected identifier in '.purgem' directive");
@@ -3229,16 +3260,16 @@ bool AsmParser::ParseDirectivePurgeMacro(SMLoc DirectiveLoc) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.purgem' directive");
 
-  if (!LookupMacro(Name))
+  if (!lookupMacro(Name))
     return Error(DirectiveLoc, "macro '" + Name + "' is not defined");
 
-  UndefineMacro(Name);
+  undefineMacro(Name);
   return false;
 }
 
-/// ParseDirectiveBundleAlignMode
+/// parseDirectiveBundleAlignMode
 /// ::= {.bundle_align_mode} expression
-bool AsmParser::ParseDirectiveBundleAlignMode() {
+bool AsmParser::parseDirectiveBundleAlignMode() {
   checkForValidSection();
 
   // Expect a single argument: an expression that evaluates to a constant
@@ -3262,9 +3293,9 @@ bool AsmParser::ParseDirectiveBundleAlignMode() {
   return false;
 }
 
-/// ParseDirectiveBundleLock
+/// parseDirectiveBundleLock
 /// ::= {.bundle_lock} [align_to_end]
-bool AsmParser::ParseDirectiveBundleLock() {
+bool AsmParser::parseDirectiveBundleLock() {
   checkForValidSection();
   bool AlignToEnd = false;
 
@@ -3272,7 +3303,7 @@ bool AsmParser::ParseDirectiveBundleLock() {
     StringRef Option;
     SMLoc Loc = getTok().getLoc();
     const char *kInvalidOptionError =
-      "invalid option for '.bundle_lock' directive";
+        "invalid option for '.bundle_lock' directive";
 
     if (parseIdentifier(Option))
       return Error(Loc, kInvalidOptionError);
@@ -3291,9 +3322,9 @@ bool AsmParser::ParseDirectiveBundleLock() {
   return false;
 }
 
-/// ParseDirectiveBundleLock
+/// parseDirectiveBundleLock
 /// ::= {.bundle_lock}
-bool AsmParser::ParseDirectiveBundleUnlock() {
+bool AsmParser::parseDirectiveBundleUnlock() {
   checkForValidSection();
 
   if (getLexer().isNot(AsmToken::EndOfStatement))
@@ -3304,9 +3335,9 @@ bool AsmParser::ParseDirectiveBundleUnlock() {
   return false;
 }
 
-/// ParseDirectiveSpace
+/// parseDirectiveSpace
 /// ::= (.skip | .space) expression [ , expression ]
-bool AsmParser::ParseDirectiveSpace(StringRef IDVal) {
+bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
   checkForValidSection();
 
   int64_t NumBytes;
@@ -3329,8 +3360,8 @@ bool AsmParser::ParseDirectiveSpace(StringRef IDVal) {
   Lex();
 
   if (NumBytes <= 0)
-    return TokError("invalid number of bytes in '" +
-                    Twine(IDVal) + "' directive");
+    return TokError("invalid number of bytes in '" + Twine(IDVal) +
+                    "' directive");
 
   // FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0.
   getStreamer().EmitFill(NumBytes, FillExpr);
@@ -3338,9 +3369,9 @@ bool AsmParser::ParseDirectiveSpace(StringRef IDVal) {
   return false;
 }
 
-/// ParseDirectiveLEB128
+/// parseDirectiveLEB128
 /// ::= (.sleb128 | .uleb128) expression
-bool AsmParser::ParseDirectiveLEB128(bool Signed) {
+bool AsmParser::parseDirectiveLEB128(bool Signed) {
   checkForValidSection();
   const MCExpr *Value;
 
@@ -3358,9 +3389,9 @@ bool AsmParser::ParseDirectiveLEB128(bool Signed) {
   return false;
 }
 
-/// ParseDirectiveSymbolAttribute
+/// parseDirectiveSymbolAttribute
 ///  ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ]
-bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
+bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
       StringRef Name;
@@ -3375,7 +3406,8 @@ bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
       if (Sym->isTemporary())
         return Error(Loc, "non-local symbol required in directive");
 
-      getStreamer().EmitSymbolAttribute(Sym, Attr);
+      if (!getStreamer().EmitSymbolAttribute(Sym, Attr))
+        return Error(Loc, "unable to emit symbol attribute");
 
       if (getLexer().is(AsmToken::EndOfStatement))
         break;
@@ -3390,9 +3422,9 @@ bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
   return false;
 }
 
-/// ParseDirectiveComm
+/// parseDirectiveComm
 ///  ::= ( .comm | .lcomm ) identifier , size_expression [ , align_expression ]
-bool AsmParser::ParseDirectiveComm(bool IsLocal) {
+bool AsmParser::parseDirectiveComm(bool IsLocal) {
   checkForValidSection();
 
   SMLoc IDLoc = getLexer().getLoc();
@@ -3442,14 +3474,14 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) {
   // but a size of .lcomm creates a bss symbol of size zero.
   if (Size < 0)
     return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
-                 "be less than zero");
+                          "be less than zero");
 
   // NOTE: The alignment in the directive is a power of 2 value, the assembler
   // may internally end up wanting an alignment in bytes.
   // FIXME: Diagnose overflow.
   if (Pow2Alignment < 0)
     return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
-                 "alignment, can't be less than zero");
+                                   "alignment, can't be less than zero");
 
   if (!Sym->isUndefined())
     return Error(IDLoc, "invalid symbol redefinition");
@@ -3464,9 +3496,9 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) {
   return false;
 }
 
-/// ParseDirectiveAbort
+/// parseDirectiveAbort
 ///  ::= .abort [... message ...]
-bool AsmParser::ParseDirectiveAbort() {
+bool AsmParser::parseDirectiveAbort() {
   // FIXME: Use loc from directive.
   SMLoc Loc = getLexer().getLoc();
 
@@ -3485,25 +3517,25 @@ bool AsmParser::ParseDirectiveAbort() {
   return false;
 }
 
-/// ParseDirectiveInclude
+/// parseDirectiveInclude
 ///  ::= .include "filename"
-bool AsmParser::ParseDirectiveInclude() {
+bool AsmParser::parseDirectiveInclude() {
   if (getLexer().isNot(AsmToken::String))
     return TokError("expected string in '.include' directive");
 
-  std::string Filename = getTok().getString();
+  // Allow the strings to have escaped octal character sequence.
+  std::string Filename;
+  if (parseEscapedString(Filename))
+    return true;
   SMLoc IncludeLoc = getLexer().getLoc();
   Lex();
 
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.include' directive");
 
-  // Strip the quotes.
-  Filename = Filename.substr(1, Filename.size()-2);
-
   // Attempt to switch the lexer to the included file before consuming the end
   // of statement to avoid losing it when we switch.
-  if (EnterIncludeFile(Filename)) {
+  if (enterIncludeFile(Filename)) {
     Error(IncludeLoc, "Could not find include file '" + Filename + "'");
     return true;
   }
@@ -3511,24 +3543,24 @@ bool AsmParser::ParseDirectiveInclude() {
   return false;
 }
 
-/// ParseDirectiveIncbin
+/// parseDirectiveIncbin
 ///  ::= .incbin "filename"
-bool AsmParser::ParseDirectiveIncbin() {
+bool AsmParser::parseDirectiveIncbin() {
   if (getLexer().isNot(AsmToken::String))
     return TokError("expected string in '.incbin' directive");
 
-  std::string Filename = getTok().getString();
+  // Allow the strings to have escaped octal character sequence.
+  std::string Filename;
+  if (parseEscapedString(Filename))
+    return true;
   SMLoc IncbinLoc = getLexer().getLoc();
   Lex();
 
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.incbin' directive");
 
-  // Strip the quotes.
-  Filename = Filename.substr(1, Filename.size()-2);
-
   // Attempt to process the included file.
-  if (ProcessIncbinFile(Filename)) {
+  if (processIncbinFile(Filename)) {
     Error(IncbinLoc, "Could not find incbin file '" + Filename + "'");
     return true;
   }
@@ -3536,9 +3568,9 @@ bool AsmParser::ParseDirectiveIncbin() {
   return false;
 }
 
-/// ParseDirectiveIf
+/// parseDirectiveIf
 /// ::= .if expression
-bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc) {
   TheCondStack.push_back(TheCondState);
   TheCondState.TheCond = AsmCond::IfCond;
   if (TheCondState.Ignore) {
@@ -3560,9 +3592,9 @@ bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveIfb
+/// parseDirectiveIfb
 /// ::= .ifb string
-bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
+bool AsmParser::parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
   TheCondStack.push_back(TheCondState);
   TheCondState.TheCond = AsmCond::IfCond;
 
@@ -3583,16 +3615,16 @@ bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
   return false;
 }
 
-/// ParseDirectiveIfc
+/// parseDirectiveIfc
 /// ::= .ifc string1, string2
-bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
+bool AsmParser::parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
   TheCondStack.push_back(TheCondState);
   TheCondState.TheCond = AsmCond::IfCond;
 
   if (TheCondState.Ignore) {
     eatToEndOfStatement();
   } else {
-    StringRef Str1 = ParseStringToComma();
+    StringRef Str1 = parseStringToComma();
 
     if (getLexer().isNot(AsmToken::Comma))
       return TokError("unexpected token in '.ifc' directive");
@@ -3613,9 +3645,9 @@ bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
   return false;
 }
 
-/// ParseDirectiveIfdef
+/// parseDirectiveIfdef
 /// ::= .ifdef symbol
-bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
+bool AsmParser::parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
   StringRef Name;
   TheCondStack.push_back(TheCondState);
   TheCondState.TheCond = AsmCond::IfCond;
@@ -3640,9 +3672,9 @@ bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
   return false;
 }
 
-/// ParseDirectiveElseIf
+/// parseDirectiveElseIf
 /// ::= .elseif expression
-bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveElseIf(SMLoc DirectiveLoc) {
   if (TheCondState.TheCond != AsmCond::IfCond &&
       TheCondState.TheCond != AsmCond::ElseIfCond)
     Error(DirectiveLoc, "Encountered a .elseif that doesn't follow a .if or "
@@ -3671,9 +3703,9 @@ bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveElse
+/// parseDirectiveElse
 /// ::= .else
-bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.else' directive");
 
@@ -3695,16 +3727,15 @@ bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveEndIf
+/// parseDirectiveEndIf
 /// ::= .endif
-bool AsmParser::ParseDirectiveEndIf(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.endif' directive");
 
   Lex();
 
-  if ((TheCondState.TheCond == AsmCond::NoCond) ||
-      TheCondStack.empty())
+  if ((TheCondState.TheCond == AsmCond::NoCond) || TheCondStack.empty())
     Error(DirectiveLoc, "Encountered a .endif that doesn't follow a .if or "
                         ".else");
   if (!TheCondStack.empty()) {
@@ -3748,7 +3779,6 @@ void AsmParser::initializeDirectiveKindMap() {
   DirectiveKindMap[".extern"] = DK_EXTERN;
   DirectiveKindMap[".globl"] = DK_GLOBL;
   DirectiveKindMap[".global"] = DK_GLOBAL;
-  DirectiveKindMap[".indirect_symbol"] = DK_INDIRECT_SYMBOL;
   DirectiveKindMap[".lazy_reference"] = DK_LAZY_REFERENCE;
   DirectiveKindMap[".no_dead_strip"] = DK_NO_DEAD_STRIP;
   DirectiveKindMap[".symbol_resolver"] = DK_SYMBOL_RESOLVER;
@@ -3810,6 +3840,7 @@ void AsmParser::initializeDirectiveKindMap() {
   DirectiveKindMap[".cfi_signal_frame"] = DK_CFI_SIGNAL_FRAME;
   DirectiveKindMap[".cfi_undefined"] = DK_CFI_UNDEFINED;
   DirectiveKindMap[".cfi_register"] = DK_CFI_REGISTER;
+  DirectiveKindMap[".cfi_window_save"] = DK_CFI_WINDOW_SAVE;
   DirectiveKindMap[".macros_on"] = DK_MACROS_ON;
   DirectiveKindMap[".macros_off"] = DK_MACROS_OFF;
   DirectiveKindMap[".macro"] = DK_MACRO;
@@ -3818,8 +3849,7 @@ void AsmParser::initializeDirectiveKindMap() {
   DirectiveKindMap[".purgem"] = DK_PURGEM;
 }
 
-
-MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
+MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
   AsmToken EndToken, StartToken = getTok();
 
   unsigned NestLevel = 0;
@@ -3836,8 +3866,7 @@ MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
     }
 
     // Otherwise, check whether we have reached the .endr.
-    if (Lexer.is(AsmToken::Identifier) &&
-        getTok().getIdentifier() == ".endr") {
+    if (Lexer.is(AsmToken::Identifier) && getTok().getIdentifier() == ".endr") {
       if (NestLevel == 0) {
         EndToken = getTok();
         Lex();
@@ -3865,19 +3894,17 @@ MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
   return &MacroLikeBodies.back();
 }
 
-void AsmParser::InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
+void AsmParser::instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
                                          raw_svector_ostream &OS) {
   OS << ".endr\n";
 
   MemoryBuffer *Instantiation =
-    MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+      MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
 
   // Create the macro instantiation object and add to the current macro
   // instantiation stack.
-  MacroInstantiation *MI = new MacroInstantiation(M, DirectiveLoc,
-                                                  CurBuffer,
-                                                  getTok().getLoc(),
-                                                  Instantiation);
+  MacroInstantiation *MI = new MacroInstantiation(
+      M, DirectiveLoc, CurBuffer, getTok().getLoc(), Instantiation);
   ActiveMacros.push_back(MI);
 
   // Jump to the macro instantiation and prime the lexer.
@@ -3886,7 +3913,7 @@ void AsmParser::InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
   Lex();
 }
 
-bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveRept(SMLoc DirectiveLoc) {
   int64_t Count;
   if (parseAbsoluteExpression(Count))
     return TokError("unexpected token in '.rept' directive");
@@ -3901,7 +3928,7 @@ bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
   Lex();
 
   // Lex the rept definition.
-  MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
   if (!M)
     return true;
 
@@ -3915,14 +3942,14 @@ bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
     if (expandMacro(OS, M->Body, Parameters, A, getTok().getLoc()))
       return true;
   }
-  InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+  instantiateMacroLikeBody(M, DirectiveLoc, OS);
 
   return false;
 }
 
-/// ParseDirectiveIrp
+/// parseDirectiveIrp
 /// ::= .irp symbol,values
-bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) {
   MCAsmMacroParameters Parameters;
   MCAsmMacroParameter Parameter;
 
@@ -3937,14 +3964,14 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
   Lex();
 
   MCAsmMacroArguments A;
-  if (ParseMacroArguments(0, A))
+  if (parseMacroArguments(0, A))
     return true;
 
   // Eat the end of statement.
   Lex();
 
   // Lex the irp definition.
-  MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
   if (!M)
     return true;
 
@@ -3961,14 +3988,14 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
       return true;
   }
 
-  InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+  instantiateMacroLikeBody(M, DirectiveLoc, OS);
 
   return false;
 }
 
-/// ParseDirectiveIrpc
+/// parseDirectiveIrpc
 /// ::= .irpc symbol,values
-bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveIrpc(SMLoc DirectiveLoc) {
   MCAsmMacroParameters Parameters;
   MCAsmMacroParameter Parameter;
 
@@ -3983,7 +4010,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
   Lex();
 
   MCAsmMacroArguments A;
-  if (ParseMacroArguments(0, A))
+  if (parseMacroArguments(0, A))
     return true;
 
   if (A.size() != 1 || A.front().size() != 1)
@@ -3993,7 +4020,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
   Lex();
 
   // Lex the irpc definition.
-  MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
   if (!M)
     return true;
 
@@ -4006,7 +4033,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
   std::size_t I, End = Values.size();
   for (I = 0; I < End; ++I) {
     MCAsmMacroArgument Arg;
-    Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I+1)));
+    Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I + 1)));
 
     MCAsmMacroArguments Args;
     Args.push_back(Arg);
@@ -4015,24 +4042,24 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
       return true;
   }
 
-  InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+  instantiateMacroLikeBody(M, DirectiveLoc, OS);
 
   return false;
 }
 
-bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveEndr(SMLoc DirectiveLoc) {
   if (ActiveMacros.empty())
     return TokError("unmatched '.endr' directive");
 
   // The only .repl that should get here are the ones created by
-  // InstantiateMacroLikeBody.
+  // instantiateMacroLikeBody.
   assert(getLexer().is(AsmToken::EndOfStatement));
 
-  HandleMacroExit();
+  handleMacroExit();
   return false;
 }
 
-bool AsmParser::ParseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
+bool AsmParser::parseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
                                      size_t Len) {
   const MCExpr *Value;
   SMLoc ExprLoc = getLexer().getLoc();
@@ -4049,7 +4076,7 @@ bool AsmParser::ParseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
   return false;
 }
 
-bool AsmParser::ParseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
+bool AsmParser::parseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
   const MCExpr *Value;
   SMLoc ExprLoc = getLexer().getLoc();
   if (parseExpression(Value))
@@ -4061,16 +4088,15 @@ bool AsmParser::ParseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
   if (!isPowerOf2_64(IntValue))
     return Error(ExprLoc, "literal value not a power of two greater then zero");
 
-  Info.AsmRewrites->push_back(AsmRewrite(AOK_Align, IDLoc, 5,
-                                         Log2_64(IntValue)));
+  Info.AsmRewrites->push_back(
+      AsmRewrite(AOK_Align, IDLoc, 5, Log2_64(IntValue)));
   return false;
 }
 
 // We are comparing pointers, but the pointers are relative to a single string.
 // Thus, this should always be deterministic.
-static int RewritesSort(const void *A, const void *B) {
-  const AsmRewrite *AsmRewriteA = static_cast<const AsmRewrite *>(A);
-  const AsmRewrite *AsmRewriteB = static_cast<const AsmRewrite *>(B);
+static int rewritesSort(const AsmRewrite *AsmRewriteA,
+                        const AsmRewrite *AsmRewriteB) {
   if (AsmRewriteA->Loc.getPointer() < AsmRewriteB->Loc.getPointer())
     return -1;
   if (AsmRewriteB->Loc.getPointer() < AsmRewriteA->Loc.getPointer())
@@ -4080,25 +4106,22 @@ static int RewritesSort(const void *A, const void *B) {
   // rewrite to the same location.  Make sure the SizeDirective rewrite is
   // performed first, then the Imm/ImmPrefix and finally the Input/Output.  This
   // ensures the sort algorithm is stable.
-  if (AsmRewritePrecedence [AsmRewriteA->Kind] >
-      AsmRewritePrecedence [AsmRewriteB->Kind])
+  if (AsmRewritePrecedence[AsmRewriteA->Kind] >
+      AsmRewritePrecedence[AsmRewriteB->Kind])
     return -1;
 
-  if (AsmRewritePrecedence [AsmRewriteA->Kind] <
-      AsmRewritePrecedence [AsmRewriteB->Kind])
+  if (AsmRewritePrecedence[AsmRewriteA->Kind] <
+      AsmRewritePrecedence[AsmRewriteB->Kind])
     return 1;
-  llvm_unreachable ("Unstable rewrite sort.");
+  llvm_unreachable("Unstable rewrite sort.");
 }
 
-bool
-AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
-                            unsigned &NumOutputs, unsigned &NumInputs,
-                            SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
-                            SmallVectorImpl<std::string> &Constraints,
-                            SmallVectorImpl<std::string> &Clobbers,
-                            const MCInstrInfo *MII,
-                            const MCInstPrinter *IP,
-                            MCAsmParserSemaCallback &SI) {
+bool AsmParser::parseMSInlineAsm(
+    void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
+    unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
+    SmallVectorImpl<std::string> &Constraints,
+    SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
+    const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) {
   SmallVector<void *, 4> InputDecls;
   SmallVector<void *, 4> OutputDecls;
   SmallVector<bool, 4> InputDeclsAddressOf;
@@ -4117,7 +4140,7 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
   unsigned OutputIdx = 0;
   while (getLexer().isNot(AsmToken::Eof)) {
     ParseStatementInfo Info(&AsmStrRewrites);
-    if (ParseStatement(Info))
+    if (parseStatement(Info))
       return true;
 
     if (Info.ParseError)
@@ -4205,7 +4228,7 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
   raw_string_ostream OS(AsmStringIR);
   const char *AsmStart = SrcMgr.getMemoryBuffer(0)->getBufferStart();
   const char *AsmEnd = SrcMgr.getMemoryBuffer(0)->getBufferEnd();
-  array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), RewritesSort);
+  array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), rewritesSort);
   for (SmallVectorImpl<AsmRewrite>::iterator I = AsmStrRewrites.begin(),
                                              E = AsmStrRewrites.end();
        I != E; ++I) {
@@ -4230,7 +4253,8 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
     unsigned AdditionalSkip = 0;
     // Rewrite expressions in $N notation.
     switch (Kind) {
-    default: break;
+    default:
+      break;
     case AOK_Imm:
       OS << "$$" << (*I).Val;
       break;
@@ -4285,8 +4309,7 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
 }
 
 /// \brief Create an MCAsmParser instance.
-MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM,
-                                     MCContext &C, MCStreamer &Out,
-                                     const MCAsmInfo &MAI) {
+MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C,
+                                     MCStreamer &Out, const MCAsmInfo &MAI) {
   return new AsmParser(SM, C, Out, MAI);
 }
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index df1794c..d8343a3 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -35,6 +35,10 @@ class COFFAsmParser : public MCAsmParserExtension {
                           unsigned Characteristics,
                           SectionKind Kind);
 
+  bool ParseSectionSwitch(StringRef Section, unsigned Characteristics,
+                          SectionKind Kind, StringRef COMDATSymName,
+                          COFF::COMDATType Type, const MCSectionCOFF *Assoc);
+
   bool ParseSectionName(StringRef &SectionName);
   bool ParseSectionFlags(StringRef FlagsString, unsigned* Flags);
 
@@ -111,6 +115,8 @@ class COFFAsmParser : public MCAsmParserExtension {
   bool ParseDirectiveType(StringRef, SMLoc);
   bool ParseDirectiveEndef(StringRef, SMLoc);
   bool ParseDirectiveSecRel32(StringRef, SMLoc);
+  bool parseCOMDATTypeAndAssoc(COFF::COMDATType &Type,
+                               const MCSectionCOFF *&Assoc);
   bool ParseDirectiveLinkOnce(StringRef, SMLoc);
 
   // Win64 EH directives.
@@ -284,12 +290,22 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
 bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
                                        unsigned Characteristics,
                                        SectionKind Kind) {
+  return ParseSectionSwitch(Section, Characteristics, Kind, "",
+                            COFF::IMAGE_COMDAT_SELECT_ANY, 0);
+}
+
+bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
+                                       unsigned Characteristics,
+                                       SectionKind Kind,
+                                       StringRef COMDATSymName,
+                                       COFF::COMDATType Type,
+                                       const MCSectionCOFF *Assoc) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in section switching directive");
   Lex();
 
   getStreamer().SwitchSection(getContext().getCOFFSection(
-                                Section, Characteristics, Kind));
+      Section, Characteristics, Kind, COMDATSymName, Type, Assoc));
 
   return false;
 }
@@ -303,7 +319,7 @@ bool COFFAsmParser::ParseSectionName(StringRef &SectionName) {
   return false;
 }
 
-// .section name [, "flags"]
+// .section name [, "flags"] [, identifier [ identifier ], identifier]
 //
 // Supported flags:
 //   a: Ignored.
@@ -340,11 +356,30 @@ bool COFFAsmParser::ParseDirectiveSection(StringRef, SMLoc) {
       return true;
   }
 
+  COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY;
+  const MCSectionCOFF *Assoc = 0;
+  StringRef COMDATSymName;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+
+    Flags |= COFF::IMAGE_SCN_LNK_COMDAT;
+
+    if (parseCOMDATTypeAndAssoc(Type, Assoc))
+      return true;
+
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("expected comma in directive");
+    Lex();
+
+    if (getParser().parseIdentifier(COMDATSymName))
+      return TokError("expected identifier in directive");
+  }
+
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in directive");
 
   SectionKind Kind = computeSectionKind(Flags);
-  ParseSectionSwitch(SectionName, Flags, Kind);
+  ParseSectionSwitch(SectionName, Flags, Kind, COMDATSymName, Type, Assoc);
   return false;
 }
 
@@ -409,37 +444,29 @@ bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) {
   return false;
 }
 
-/// ParseDirectiveLinkOnce
-///  ::= .linkonce [ identifier [ identifier ] ]
-bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) {
-  COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY;
-
-  if (getLexer().is(AsmToken::Identifier)) {
-    StringRef TypeId = getTok().getIdentifier();
+/// ::= [ identifier [ identifier ] ]
+bool COFFAsmParser::parseCOMDATTypeAndAssoc(COFF::COMDATType &Type,
+                                            const MCSectionCOFF *&Assoc) {
+  StringRef TypeId = getTok().getIdentifier();
 
-    Type = StringSwitch<COFF::COMDATType>(TypeId)
-      .Case("one_only", COFF::IMAGE_COMDAT_SELECT_NODUPLICATES)
-      .Case("discard", COFF::IMAGE_COMDAT_SELECT_ANY)
-      .Case("same_size", COFF::IMAGE_COMDAT_SELECT_SAME_SIZE)
-      .Case("same_contents", COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH)
-      .Case("associative", COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
-      .Case("largest", COFF::IMAGE_COMDAT_SELECT_LARGEST)
-      .Case("newest", COFF::IMAGE_COMDAT_SELECT_NEWEST)
-      .Default((COFF::COMDATType)0);
+  Type = StringSwitch<COFF::COMDATType>(TypeId)
+    .Case("one_only", COFF::IMAGE_COMDAT_SELECT_NODUPLICATES)
+    .Case("discard", COFF::IMAGE_COMDAT_SELECT_ANY)
+    .Case("same_size", COFF::IMAGE_COMDAT_SELECT_SAME_SIZE)
+    .Case("same_contents", COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH)
+    .Case("associative", COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
+    .Case("largest", COFF::IMAGE_COMDAT_SELECT_LARGEST)
+    .Case("newest", COFF::IMAGE_COMDAT_SELECT_NEWEST)
+    .Default((COFF::COMDATType)0);
 
-    if (Type == 0)
-      return TokError(Twine("unrecognized COMDAT type '" + TypeId + "'"));
+  if (Type == 0)
+    return TokError(Twine("unrecognized COMDAT type '" + TypeId + "'"));
 
-    Lex();
-  }
-  
-  const MCSectionCOFF *Current = static_cast<const MCSectionCOFF*>(
-                                       getStreamer().getCurrentSection().first);
+  Lex();
 
-  const MCSectionCOFF *Assoc = 0;
   if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
-    StringRef AssocName;
     SMLoc Loc = getTok().getLoc();
+    StringRef AssocName;
     if (ParseSectionName(AssocName))
       return TokError("expected associated section name");
 
@@ -447,14 +474,33 @@ bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) {
                                         getContext().getCOFFSection(AssocName));
     if (!Assoc)
       return Error(Loc, "cannot associate unknown section '" + AssocName + "'");
-    if (Assoc == Current)
-      return Error(Loc, "cannot associate a section with itself");
     if (!(Assoc->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT))
       return Error(Loc, "associated section must be a COMDAT section");
     if (Assoc->getSelection() == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
       return Error(Loc, "associated section cannot be itself associative");
   }
 
+  return false;
+}
+
+/// ParseDirectiveLinkOnce
+///  ::= .linkonce [ identifier [ identifier ] ]
+bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) {
+  COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY;
+  const MCSectionCOFF *Assoc = 0;
+  if (getLexer().is(AsmToken::Identifier))
+    if (parseCOMDATTypeAndAssoc(Type, Assoc))
+      return true;
+
+  const MCSectionCOFF *Current = static_cast<const MCSectionCOFF*>(
+                                       getStreamer().getCurrentSection().first);
+
+
+  if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+    if (Assoc == Current)
+      return Error(Loc, "cannot associate a section with itself");
+  }
+
   if (Current->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT)
     return Error(Loc, Twine("section '") + Current->getSectionName() +
                                                        "' is already linkonce");
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index 0aeeaf6..4c9bafa 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -45,6 +45,8 @@ public:
     this->MCAsmParserExtension::Initialize(Parser);
 
     addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDesc>(".desc");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveIndirectSymbol>(
+      ".indirect_symbol");
     addDirectiveHandler<&DarwinAsmParser::ParseDirectiveLsym>(".lsym");
     addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols>(
       ".subsections_via_symbols");
@@ -69,6 +71,7 @@ public:
       ".end_data_region");
 
     // Special section directives.
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveBss>(".bss");
     addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConst>(".const");
     addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstData>(
       ".const_data");
@@ -163,6 +166,7 @@ public:
   }
 
   bool ParseDirectiveDesc(StringRef, SMLoc);
+  bool ParseDirectiveIndirectSymbol(StringRef, SMLoc);
   bool ParseDirectiveDumpOrLoad(StringRef, SMLoc);
   bool ParseDirectiveLsym(StringRef, SMLoc);
   bool ParseDirectiveLinkerOption(StringRef, SMLoc);
@@ -179,6 +183,10 @@ public:
   bool ParseDirectiveDataRegionEnd(StringRef, SMLoc);
 
   // Named Section Directive
+  bool ParseSectionDirectiveBss(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__bss");
+  }
+
   bool ParseSectionDirectiveConst(StringRef, SMLoc) {
     return ParseSectionSwitch("__TEXT", "__const");
   }
@@ -415,6 +423,39 @@ bool DarwinAsmParser::ParseDirectiveDesc(StringRef, SMLoc) {
   return false;
 }
 
+/// ParseDirectiveIndirectSymbol
+///  ::= .indirect_symbol identifier
+bool DarwinAsmParser::ParseDirectiveIndirectSymbol(StringRef, SMLoc Loc) {
+  const MCSectionMachO *Current = static_cast<const MCSectionMachO*>(
+                                       getStreamer().getCurrentSection().first);
+  unsigned SectionType = Current->getType();
+  if (SectionType != MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS &&
+      SectionType != MCSectionMachO::S_LAZY_SYMBOL_POINTERS &&
+      SectionType != MCSectionMachO::S_SYMBOL_STUBS)
+    return Error(Loc, "indirect symbol not in a symbol pointer or stub "
+                      "section");
+
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return TokError("expected identifier in .indirect_symbol directive");
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  // Assembler local symbols don't make any sense here. Complain loudly.
+  if (Sym->isTemporary())
+    return TokError("non-local symbol required in directive");
+
+  if (!getStreamer().EmitSymbolAttribute(Sym, MCSA_IndirectSymbol))
+    return TokError("unable to emit indirect symbol attribute for: " + Name);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.indirect_symbol' directive");
+
+  Lex();
+
+  return false;
+}
+
 /// ParseDirectiveDumpOrLoad
 ///  ::= ( .dump | .load ) "filename"
 bool DarwinAsmParser::ParseDirectiveDumpOrLoad(StringRef Directive,
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 3134fc3..8807975 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -16,6 +16,7 @@
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ELF.h"
 using namespace llvm;
 
@@ -30,14 +31,11 @@ class ELFAsmParser : public MCAsmParserExtension {
     getParser().addDirectiveHandler(Directive, Handler);
   }
 
-  bool ParseSectionSwitch(StringRef Section, unsigned Type,
-                          unsigned Flags, SectionKind Kind);
-  bool SeenIdent;
+  bool ParseSectionSwitch(StringRef Section, unsigned Type, unsigned Flags,
+                          SectionKind Kind);
 
 public:
-  ELFAsmParser() : SeenIdent(false) {
-    BracketExpressionsSupported = true;
-  }
+  ELFAsmParser() { BracketExpressionsSupported = true; }
 
   virtual void Initialize(MCAsmParser &Parser) {
     // Call the base implementation.
@@ -241,7 +239,6 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
   }
 
   for (;;) {
-    StringRef Tmp;
     unsigned CurSize;
 
     SMLoc PrevLoc = getLexer().getLoc();
@@ -279,14 +276,17 @@ static SectionKind computeSectionKind(unsigned Flags) {
   return SectionKind::getDataRel();
 }
 
-static int parseSectionFlags(StringRef flagsStr) {
-  int flags = 0;
+static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
+  unsigned flags = 0;
 
   for (unsigned i = 0; i < flagsStr.size(); i++) {
     switch (flagsStr[i]) {
     case 'a':
       flags |= ELF::SHF_ALLOC;
       break;
+    case 'e':
+      flags |= ELF::SHF_EXCLUDE;
+      break;
     case 'x':
       flags |= ELF::SHF_EXECINSTR;
       break;
@@ -311,8 +311,11 @@ static int parseSectionFlags(StringRef flagsStr) {
     case 'G':
       flags |= ELF::SHF_GROUP;
       break;
+    case '?':
+      *UseLastGroup = true;
+      break;
     default:
-      return -1;
+      return -1U;
     }
   }
 
@@ -352,6 +355,7 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) {
   StringRef GroupName;
   unsigned Flags = 0;
   const MCExpr *Subsection = 0;
+  bool UseLastGroup = false;
 
   // Set the defaults first.
   if (SectionName == ".fini" || SectionName == ".init" ||
@@ -377,13 +381,16 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) {
     StringRef FlagsStr = getTok().getStringContents();
     Lex();
 
-    int extraFlags = parseSectionFlags(FlagsStr);
-    if (extraFlags < 0)
+    unsigned extraFlags = parseSectionFlags(FlagsStr, &UseLastGroup);
+    if (extraFlags == -1U)
       return TokError("unknown flag");
     Flags |= extraFlags;
 
     bool Mergeable = Flags & ELF::SHF_MERGE;
     bool Group = Flags & ELF::SHF_GROUP;
+    if (Group && UseLastGroup)
+      return TokError("Section cannot specifiy a group name while also acting "
+                      "as a member of the last group");
 
     if (getLexer().isNot(AsmToken::Comma)) {
       if (Mergeable)
@@ -392,10 +399,13 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) {
         return TokError("Group section must specify the type");
     } else {
       Lex();
-      if (getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::At))
-        return TokError("expected '@' or '%' before type");
+      if (getLexer().is(AsmToken::At) || getLexer().is(AsmToken::Percent) ||
+          getLexer().is(AsmToken::String)) {
+        if (!getLexer().is(AsmToken::String))
+          Lex();
+      } else
+        return TokError("expected '@<type>', '%<type>' or \"<type>\"");
 
-      Lex();
       if (getParser().parseIdentifier(TypeName))
         return TokError("expected identifier in directive");
 
@@ -461,6 +471,16 @@ EndStmt:
       return TokError("unknown section type");
   }
 
+  if (UseLastGroup) {
+    MCSectionSubPair CurrentSection = getStreamer().getCurrentSection();
+    if (const MCSectionELF *Section =
+            cast_or_null<MCSectionELF>(CurrentSection.first))
+      if (const MCSymbol *Group = Section->getGroup()) {
+        GroupName = Group->getName();
+        Flags |= ELF::SHF_GROUP;
+      }
+  }
+
   SectionKind Kind = computeSectionKind(Flags);
   getStreamer().SwitchSection(getContext().getELFSection(SectionName, Type,
                                                          Flags, Kind, Size,
@@ -479,7 +499,11 @@ bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) {
 }
 
 /// ParseDirectiveELFType
+///  ::= .type identifier , STT_<TYPE_IN_UPPER_CASE>
+///  ::= .type identifier , #attribute
 ///  ::= .type identifier , @attribute
+///  ::= .type identifier , %attribute
+///  ::= .type identifier , "attribute"
 bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
   StringRef Name;
   if (getParser().parseIdentifier(Name))
@@ -492,26 +516,42 @@ bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
     return TokError("unexpected token in '.type' directive");
   Lex();
 
-  if (getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::At))
-    return TokError("expected '@' or '%' before type");
-  Lex();
-
   StringRef Type;
   SMLoc TypeLoc;
+  MCSymbolAttr Attr;
+  if (getLexer().is(AsmToken::Identifier)) {
+    TypeLoc = getLexer().getLoc();
+    if (getParser().parseIdentifier(Type))
+      return TokError("expected symbol type in directive");
+    Attr = StringSwitch<MCSymbolAttr>(Type)
+               .Case("STT_FUNC", MCSA_ELF_TypeFunction)
+               .Case("STT_OBJECT", MCSA_ELF_TypeObject)
+               .Case("STT_TLS", MCSA_ELF_TypeTLS)
+               .Case("STT_COMMON", MCSA_ELF_TypeCommon)
+               .Case("STT_NOTYPE", MCSA_ELF_TypeNoType)
+               .Case("STT_GNU_IFUNC", MCSA_ELF_TypeIndFunction)
+               .Default(MCSA_Invalid);
+  } else if (getLexer().is(AsmToken::Hash) || getLexer().is(AsmToken::At) ||
+             getLexer().is(AsmToken::Percent) ||
+             getLexer().is(AsmToken::String)) {
+    if (!getLexer().is(AsmToken::String))
+      Lex();
 
-  TypeLoc = getLexer().getLoc();
-  if (getParser().parseIdentifier(Type))
-    return TokError("expected symbol type in directive");
-
-  MCSymbolAttr Attr = StringSwitch<MCSymbolAttr>(Type)
-    .Case("function", MCSA_ELF_TypeFunction)
-    .Case("object", MCSA_ELF_TypeObject)
-    .Case("tls_object", MCSA_ELF_TypeTLS)
-    .Case("common", MCSA_ELF_TypeCommon)
-    .Case("notype", MCSA_ELF_TypeNoType)
-    .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
-    .Case("gnu_indirect_function", MCSA_ELF_TypeIndFunction)
-    .Default(MCSA_Invalid);
+    TypeLoc = getLexer().getLoc();
+    if (getParser().parseIdentifier(Type))
+      return TokError("expected symbol type in directive");
+    Attr = StringSwitch<MCSymbolAttr>(Type)
+               .Case("function", MCSA_ELF_TypeFunction)
+               .Case("object", MCSA_ELF_TypeObject)
+               .Case("tls_object", MCSA_ELF_TypeTLS)
+               .Case("common", MCSA_ELF_TypeCommon)
+               .Case("notype", MCSA_ELF_TypeNoType)
+               .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
+               .Case("gnu_indirect_function", MCSA_ELF_TypeIndFunction)
+               .Default(MCSA_Invalid);
+  } else
+    return TokError("expected STT_<TYPE_IN_UPPER_CASE>, '#<type>', '@<type>', "
+                    "'%<type>' or \"<type>\"");
 
   if (Attr == MCSA_Invalid)
     return Error(TypeLoc, "unsupported attribute in '.type' directive");
@@ -536,22 +576,7 @@ bool ELFAsmParser::ParseDirectiveIdent(StringRef, SMLoc) {
 
   Lex();
 
-  const MCSection *Comment =
-    getContext().getELFSection(".comment", ELF::SHT_PROGBITS,
-                               ELF::SHF_MERGE |
-                               ELF::SHF_STRINGS,
-                               SectionKind::getReadOnly(),
-                               1, "");
-
-  getStreamer().PushSection();
-  getStreamer().SwitchSection(Comment);
-  if (!SeenIdent) {
-    getStreamer().EmitIntValue(0, 1);
-    SeenIdent = true;
-  }
-  getStreamer().EmitBytes(Data);
-  getStreamer().EmitIntValue(0, 1);
-  getStreamer().PopSection();
+  getStreamer().EmitIdent(Data);
   return false;
 }
 
diff --git a/lib/MC/MCPureStreamer.cpp b/lib/MC/MCPureStreamer.cpp
index c5c3bb7..f7bf002 100644
--- a/lib/MC/MCPureStreamer.cpp
+++ b/lib/MC/MCPureStreamer.cpp
@@ -29,7 +29,7 @@ private:
 public:
   MCPureStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
                  MCCodeEmitter *Emitter)
-      : MCObjectStreamer(SK_PureStreamer, Context, TAB, OS, Emitter) {}
+      : MCObjectStreamer(Context, 0, TAB, OS, Emitter) {}
 
   /// @name MCStreamer Interface
   /// @{
@@ -51,8 +51,9 @@ public:
   virtual void FinishImpl();
 
 
-  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
+  virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
     report_fatal_error("unsupported directive in pure streamer");
+    return false;
   }
   virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {
     report_fatal_error("unsupported directive in pure streamer");
@@ -93,16 +94,13 @@ public:
   virtual void EmitFileDirective(StringRef Filename) {
     report_fatal_error("unsupported directive in pure streamer");
   }
+  virtual void EmitIdent(StringRef IdentString) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
   virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
                                       StringRef Filename, unsigned CUID = 0) {
     report_fatal_error("unsupported directive in pure streamer");
   }
-
-  /// @}
-
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_PureStreamer;
-  }
 };
 
 } // end anonymous namespace.
@@ -120,7 +118,7 @@ void MCPureStreamer::EmitLabel(MCSymbol *Symbol) {
   assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
   assert(getCurrentSection().first && "Cannot emit before setting section!");
 
-  Symbol->setSection(*getCurrentSection().first);
+  AssignSection(Symbol, getCurrentSection().first);
 
   MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
 
diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp
index bf1a984..09eb3e7 100644
--- a/lib/MC/MCSectionELF.cpp
+++ b/lib/MC/MCSectionELF.cpp
@@ -32,6 +32,29 @@ bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
   return false;
 }
 
+static void printName(raw_ostream &OS, StringRef Name) {
+  if (Name.find_first_not_of("0123456789_."
+                             "abcdefghijklmnopqrstuvwxyz"
+                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
+    OS << Name;
+    return;
+  }
+  OS << '"';
+  for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
+    if (*B == '"') // Unquoted "
+      OS << "\\\"";
+    else if (*B != '\\') // Neither " or backslash
+      OS << *B;
+    else if (B + 1 == E) // Trailing backslash
+      OS << "\\\\";
+    else {
+      OS << B[0] << B[1]; // Quoted character
+      ++B;
+    }
+  }
+  OS << '"';
+}
+
 void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
                                         raw_ostream &OS,
                                         const MCExpr *Subsection) const {
@@ -44,27 +67,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
     return;
   }
 
-  StringRef name = getSectionName();
-  if (name.find_first_not_of("0123456789_."
-                             "abcdefghijklmnopqrstuvwxyz"
-                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == name.npos) {
-    OS << "\t.section\t" << name;
-  } else {
-    OS << "\t.section\t\"";
-    for (const char *b = name.begin(), *e = name.end(); b < e; ++b) {
-      if (*b == '"') // Unquoted "
-        OS << "\\\"";
-      else if (*b != '\\') // Neither " or backslash
-        OS << *b;
-      else if (b + 1 == e) // Trailing backslash
-        OS << "\\\\";
-      else {
-        OS << b[0] << b[1]; // Quoted character
-        ++b;
-      }
-    }
-    OS << '"';
-  }
+  OS << "\t.section\t";
+  printName(OS, getSectionName());
 
   // Handle the weird solaris syntax if desired.
   if (MAI.usesSunStyleELFSectionSwitchSyntax() &&
@@ -75,6 +79,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
       OS << ",#execinstr";
     if (Flags & ELF::SHF_WRITE)
       OS << ",#write";
+    if (Flags & ELF::SHF_EXCLUDE)
+      OS << ",#exclude";
     if (Flags & ELF::SHF_TLS)
       OS << ",#tls";
     OS << '\n';
@@ -84,6 +90,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
   OS << ",\"";
   if (Flags & ELF::SHF_ALLOC)
     OS << 'a';
+  if (Flags & ELF::SHF_EXCLUDE)
+    OS << 'e';
   if (Flags & ELF::SHF_EXECINSTR)
     OS << 'x';
   if (Flags & ELF::SHF_GROUP)
@@ -131,8 +139,11 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
     OS << "," << EntrySize;
   }
 
-  if (Flags & ELF::SHF_GROUP)
-    OS << "," << Group->getName() << ",comdat";
+  if (Flags & ELF::SHF_GROUP) {
+    OS << ",";
+    printName(OS, Group->getName());
+    OS << ",comdat";
+  }
   OS << '\n';
 
   if (Subsection)
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 6542f42..2e1d69b 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -10,6 +10,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
@@ -21,10 +22,17 @@
 #include <cstdlib>
 using namespace llvm;
 
-MCStreamer::MCStreamer(StreamerKind Kind, MCContext &Ctx)
-    : Kind(Kind), Context(Ctx), EmitEHFrame(true), EmitDebugFrame(false),
-      CurrentW64UnwindInfo(0), LastSymbol(0), AutoInitSections(false) {
+// Pin the vtables to this file.
+MCTargetStreamer::~MCTargetStreamer() {}
+void ARMTargetStreamer::anchor() {}
+
+MCStreamer::MCStreamer(MCContext &Ctx, MCTargetStreamer *TargetStreamer)
+    : Context(Ctx), TargetStreamer(TargetStreamer), EmitEHFrame(true),
+      EmitDebugFrame(false), CurrentW64UnwindInfo(0), LastSymbol(0),
+      AutoInitSections(false) {
   SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>());
+  if (TargetStreamer)
+    TargetStreamer->setStreamer(this);
 }
 
 MCStreamer::~MCStreamer() {
@@ -72,6 +80,13 @@ raw_ostream &MCStreamer::GetCommentOS() {
   return nulls();
 }
 
+void MCStreamer::generateCompactUnwindEncodings(MCAsmBackend *MAB) {
+  for (std::vector<MCDwarfFrameInfo>::iterator I = FrameInfos.begin(),
+         E = FrameInfos.end(); I != E; ++I)
+    I->CompactUnwindEncoding =
+      (MAB ? MAB->generateCompactUnwindEncoding(I->Instructions) : 0);
+}
+
 void MCStreamer::EmitDwarfSetLineAddr(int64_t LineDelta,
                                       const MCSymbol *Label, int PointerSize) {
   // emit the sequence to set the address
@@ -183,17 +198,28 @@ void MCStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
                                      MCSymbol *EHSymbol) {
 }
 
+void MCStreamer::AssignSection(MCSymbol *Symbol, const MCSection *Section) {
+  if (Section)
+    Symbol->setSection(*Section);
+  else
+    Symbol->setUndefined();
+
+  // As we emit symbols into a section, track the order so that they can
+  // be sorted upon later. Zero is reserved to mean 'unemitted'.
+  SymbolOrdering[Symbol] = 1 + SymbolOrdering.size();
+}
+
 void MCStreamer::EmitLabel(MCSymbol *Symbol) {
   assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
   assert(getCurrentSection().first && "Cannot emit before setting section!");
-  Symbol->setSection(*getCurrentSection().first);
+  AssignSection(Symbol, getCurrentSection().first);
   LastSymbol = Symbol;
 }
 
 void MCStreamer::EmitDebugLabel(MCSymbol *Symbol) {
   assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
   assert(getCurrentSection().first && "Cannot emit before setting section!");
-  Symbol->setSection(*getCurrentSection().first);
+  AssignSection(Symbol, getCurrentSection().first);
   LastSymbol = Symbol;
 }
 
@@ -380,6 +406,14 @@ void MCStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
   CurFrame->Instructions.push_back(Instruction);
 }
 
+void MCStreamer::EmitCFIWindowSave() {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createWindowSave(Label);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
 void MCStreamer::setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame) {
   W64UnwindInfos.push_back(Frame);
   CurrentW64UnwindInfo = W64UnwindInfos.back();
@@ -470,7 +504,9 @@ void MCStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
     report_fatal_error("Frame register and offset already specified!");
   if (Offset & 0x0F)
     report_fatal_error("Misaligned frame pointer offset!");
-  MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, 0, Register, Offset);
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, Label, Register, Offset);
+  EmitLabel(Label);
   CurFrame->LastFrameInst = CurFrame->Instructions.size();
   CurFrame->Instructions.push_back(Inst);
 }
@@ -534,54 +570,10 @@ void MCStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) {
   llvm_unreachable("This file format doesn't support this directive");
 }
 
-void MCStreamer::EmitFnStart() {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitFnEnd() {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitCantUnwind() {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitHandlerData() {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitPersonality(const MCSymbol *Personality) {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitPad(int64_t Offset) {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool) {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitTCEntry(const MCSymbol &S) {
-  llvm_unreachable("Unsupported method");
-}
-
 /// EmitRawText - If this file is backed by an assembly streamer, this dumps
 /// the specified string in the output .s file.  This capability is
 /// indicated by the hasRawTextSupport() predicate.
-void MCStreamer::EmitRawText(StringRef String) {
+void MCStreamer::EmitRawTextImpl(StringRef String) {
   errs() << "EmitRawText called on an MCStreamer that doesn't support it, "
   " something must not be fully mc'ized\n";
   abort();
@@ -589,19 +581,18 @@ void MCStreamer::EmitRawText(StringRef String) {
 
 void MCStreamer::EmitRawText(const Twine &T) {
   SmallString<128> Str;
-  T.toVector(Str);
-  EmitRawText(Str.str());
+  EmitRawTextImpl(T.toStringRef(Str));
 }
 
-void MCStreamer::EmitFrames(bool usingCFI) {
+void MCStreamer::EmitFrames(MCAsmBackend *MAB, bool usingCFI) {
   if (!getNumFrameInfos())
     return;
 
   if (EmitEHFrame)
-    MCDwarfFrameEmitter::Emit(*this, usingCFI, true);
+    MCDwarfFrameEmitter::Emit(*this, MAB, usingCFI, true);
 
   if (EmitDebugFrame)
-    MCDwarfFrameEmitter::Emit(*this, usingCFI, false);
+    MCDwarfFrameEmitter::Emit(*this, MAB, usingCFI, false);
 }
 
 void MCStreamer::EmitW64Tables() {
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index f18828d..8d8e290 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -27,6 +27,11 @@ MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) {
   FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
                                         ProcFeatures, NumFeatures);
 
+  InitCPUSchedModel(CPU);
+}
+
+void
+MCSubtargetInfo::InitCPUSchedModel(StringRef CPU) {
   if (!CPU.empty())
     CPUSchedModel = getSchedModelForCPU(CPU);
   else
@@ -91,10 +96,8 @@ MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
 #endif
 
   // Find entry
-  SubtargetInfoKV KV;
-  KV.Key = CPU.data();
   const SubtargetInfoKV *Found =
-    std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, KV);
+    std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, CPU);
   if (Found == ProcSchedModels+NumProcs || StringRef(Found->Key) != CPU) {
     errs() << "'" << CPU
            << "' is not a recognized processor for this target"
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index b973c57..2416525 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -68,12 +68,23 @@ void MCSymbol::print(raw_ostream &OS) const {
   // The name for this MCSymbol is required to be a valid target name.  However,
   // some targets support quoting names with funny characters.  If the name
   // contains a funny character, then print it quoted.
-  if (!NameNeedsQuoting(getName())) {
-    OS << getName();
+  StringRef Name = getName();
+  if (!NameNeedsQuoting(Name)) {
+    OS << Name;
     return;
   }
 
-  OS << '"' << getName() << '"';
+  OS << '"';
+  for (unsigned I = 0, E = Name.size(); I != E; ++I) {
+    char C = Name[I];
+    if (C == '\n')
+      OS << "\\n";
+    else if (C == '"')
+      OS << "\\\"";
+    else
+      OS << C;
+  }
+  OS << '"';
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index c5b637c..b8b07d3 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -64,7 +64,7 @@ static void EmitAbsDifference(MCStreamer &streamer, MCSymbol *lhs,
 
 static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
                            MCWin64EHInstruction &inst) {
-  uint8_t b1, b2;
+  uint8_t b2;
   uint16_t w;
   b2 = (inst.getOperation() & 0x0F);
   switch (inst.getOperation()) {
@@ -93,8 +93,7 @@ static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
     streamer.EmitIntValue(b2, 1);
     break;
   case Win64EH::UOP_SetFPReg:
-    b1 = inst.getOffset() & 0xF0;
-    streamer.EmitIntValue(b1, 1);
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
     streamer.EmitIntValue(b2, 1);
     break;
   case Win64EH::UOP_SaveNonVol:
@@ -129,14 +128,29 @@ static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
   }
 }
 
+static void EmitSymbolRefWithOfs(MCStreamer &streamer,
+                                 const MCSymbol *Base,
+                                 const MCSymbol *Other) {
+  MCContext &Context = streamer.getContext();
+  const MCSymbolRefExpr *BaseRef = MCSymbolRefExpr::Create(Base, Context);
+  const MCSymbolRefExpr *OtherRef = MCSymbolRefExpr::Create(Other, Context);
+  const MCExpr *Ofs = MCBinaryExpr::CreateSub(OtherRef, BaseRef, Context);
+  const MCSymbolRefExpr *BaseRefRel = MCSymbolRefExpr::Create(Base,
+                                              MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                              Context);
+  streamer.EmitValue(MCBinaryExpr::CreateAdd(BaseRefRel, Ofs, Context), 4);
+}
+
 static void EmitRuntimeFunction(MCStreamer &streamer,
                                 const MCWin64EHUnwindInfo *info) {
   MCContext &context = streamer.getContext();
 
   streamer.EmitValueToAlignment(4);
-  streamer.EmitValue(MCSymbolRefExpr::Create(info->Begin, context), 4);
-  streamer.EmitValue(MCSymbolRefExpr::Create(info->End, context), 4);
-  streamer.EmitValue(MCSymbolRefExpr::Create(info->Symbol, context), 4);
+  EmitSymbolRefWithOfs(streamer, info->Function, info->Begin);
+  EmitSymbolRefWithOfs(streamer, info->Function, info->End);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->Symbol,
+                                             MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                             context), 4);
 }
 
 static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
@@ -145,11 +159,11 @@ static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
 
   MCContext &context = streamer.getContext();
   streamer.EmitValueToAlignment(4);
-  // Upper 3 bits are the version number (currently 1).
-  uint8_t flags = 0x01;
   info->Symbol = context.CreateTempSymbol();
   streamer.EmitLabel(info->Symbol);
 
+  // Upper 3 bits are the version number (currently 1).
+  uint8_t flags = 0x01;
   if (info->ChainedParent)
     flags |= Win64EH::UNW_ChainInfo << 3;
   else {
@@ -185,20 +199,26 @@ static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
     EmitUnwindCode(streamer, info->Begin, inst);
   }
 
+  // For alignment purposes, the instruction array will always have an even
+  // number of entries, with the final entry potentially unused (in which case
+  // the array will be one longer than indicated by the count of unwind codes
+  // field).
+  if (numCodes & 1) {
+    streamer.EmitIntValue(0, 2);
+  }
+
   if (flags & (Win64EH::UNW_ChainInfo << 3))
     EmitRuntimeFunction(streamer, info->ChainedParent);
   else if (flags &
            ((Win64EH::UNW_TerminateHandler|Win64EH::UNW_ExceptionHandler) << 3))
-    streamer.EmitValue(MCSymbolRefExpr::Create(info->ExceptionHandler, context),
-                       4);
-  else if (numCodes < 2) {
+    streamer.EmitValue(MCSymbolRefExpr::Create(info->ExceptionHandler,
+                                              MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                              context), 4);
+  else if (numCodes == 0) {
     // The minimum size of an UNWIND_INFO struct is 8 bytes. If we're not
     // a chained unwind info, if there is no handler, and if there are fewer
     // than 2 slots used in the unwind code array, we have to pad to 8 bytes.
-    if (numCodes == 1)
-      streamer.EmitIntValue(0, 2);
-    else
-      streamer.EmitIntValue(0, 4);
+    streamer.EmitIntValue(0, 4);
   }
 }
 
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index a5ba3c3..8234aff 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -20,12 +20,11 @@
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 #include <vector>
 using namespace llvm;
-using namespace llvm::object;
 
 void MachObjectWriter::reset() {
   Relocations.clear();
@@ -128,7 +127,7 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
   uint32_t Flags = 0;
 
   if (SubsectionsViaSymbols)
-    Flags |= macho::HF_SubsectionsViaSymbols;
+    Flags |= MachO::MH_SUBSECTIONS_VIA_SYMBOLS;
 
   // struct mach_header (28 bytes) or
   // struct mach_header_64 (32 bytes)
@@ -136,12 +135,12 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
   uint64_t Start = OS.tell();
   (void) Start;
 
-  Write32(is64Bit() ? macho::HM_Object64 : macho::HM_Object32);
+  Write32(is64Bit() ? MachO::MH_MAGIC_64 : MachO::MH_MAGIC);
 
   Write32(TargetObjectWriter->getCPUType());
   Write32(TargetObjectWriter->getCPUSubtype());
 
-  Write32(macho::HFT_Object);
+  Write32(MachO::MH_OBJECT);
   Write32(NumLoadCommands);
   Write32(LoadCommandsSize);
   Write32(Flags);
@@ -149,7 +148,7 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
     Write32(0); // reserved
 
   assert(OS.tell() - Start ==
-         (is64Bit() ? macho::Header64Size : macho::Header32Size));
+         (is64Bit()?sizeof(MachO::mach_header_64): sizeof(MachO::mach_header)));
 }
 
 /// WriteSegmentLoadCommand - Write a segment load command.
@@ -167,12 +166,12 @@ void MachObjectWriter::WriteSegmentLoadCommand(unsigned NumSections,
   (void) Start;
 
   unsigned SegmentLoadCommandSize =
-    is64Bit() ? macho::SegmentLoadCommand64Size:
-    macho::SegmentLoadCommand32Size;
-  Write32(is64Bit() ? macho::LCT_Segment64 : macho::LCT_Segment);
+    is64Bit() ? sizeof(MachO::segment_command_64):
+    sizeof(MachO::segment_command);
+  Write32(is64Bit() ? MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT);
   Write32(SegmentLoadCommandSize +
-          NumSections * (is64Bit() ? macho::Section64Size :
-                         macho::Section32Size));
+          NumSections * (is64Bit() ? sizeof(MachO::section_64) :
+                         sizeof(MachO::section)));
 
   WriteBytes("", 16);
   if (is64Bit()) {
@@ -186,8 +185,10 @@ void MachObjectWriter::WriteSegmentLoadCommand(unsigned NumSections,
     Write32(SectionDataStartOffset); // file offset
     Write32(SectionDataSize); // file size
   }
-  Write32(0x7); // maxprot
-  Write32(0x7); // initprot
+  // maxprot
+  Write32(MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE); 
+  // initprot
+  Write32(MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE); 
   Write32(NumSections);
   Write32(0); // flags
 
@@ -240,8 +241,8 @@ void MachObjectWriter::WriteSection(const MCAssembler &Asm,
   if (is64Bit())
     Write32(0); // reserved3
 
-  assert(OS.tell() - Start == (is64Bit() ? macho::Section64Size :
-                               macho::Section32Size));
+  assert(OS.tell() - Start == (is64Bit() ? sizeof(MachO::section_64) :
+                               sizeof(MachO::section)));
 }
 
 void MachObjectWriter::WriteSymtabLoadCommand(uint32_t SymbolOffset,
@@ -253,14 +254,14 @@ void MachObjectWriter::WriteSymtabLoadCommand(uint32_t SymbolOffset,
   uint64_t Start = OS.tell();
   (void) Start;
 
-  Write32(macho::LCT_Symtab);
-  Write32(macho::SymtabLoadCommandSize);
+  Write32(MachO::LC_SYMTAB);
+  Write32(sizeof(MachO::symtab_command));
   Write32(SymbolOffset);
   Write32(NumSymbols);
   Write32(StringTableOffset);
   Write32(StringTableSize);
 
-  assert(OS.tell() - Start == macho::SymtabLoadCommandSize);
+  assert(OS.tell() - Start == sizeof(MachO::symtab_command));
 }
 
 void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
@@ -276,8 +277,8 @@ void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
   uint64_t Start = OS.tell();
   (void) Start;
 
-  Write32(macho::LCT_Dysymtab);
-  Write32(macho::DysymtabLoadCommandSize);
+  Write32(MachO::LC_DYSYMTAB);
+  Write32(sizeof(MachO::dysymtab_command));
   Write32(FirstLocalSymbol);
   Write32(NumLocalSymbols);
   Write32(FirstExternalSymbol);
@@ -297,7 +298,7 @@ void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
   Write32(0); // locreloff
   Write32(0); // nlocrel
 
-  assert(OS.tell() - Start == macho::DysymtabLoadCommandSize);
+  assert(OS.tell() - Start == sizeof(MachO::dysymtab_command));
 }
 
 void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
@@ -312,20 +313,20 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
   //
   // FIXME: Are the prebound or indirect fields possible here?
   if (Symbol.isUndefined())
-    Type = macho::STT_Undefined;
+    Type = MachO::N_UNDF;
   else if (Symbol.isAbsolute())
-    Type = macho::STT_Absolute;
+    Type = MachO::N_ABS;
   else
-    Type = macho::STT_Section;
+    Type = MachO::N_SECT;
 
   // FIXME: Set STAB bits.
 
   if (Data.isPrivateExtern())
-    Type |= macho::STF_PrivateExtern;
+    Type |= MachO::N_PEXT;
 
   // Set external bit.
   if (Data.isExternal() || Symbol.isUndefined())
-    Type |= macho::STF_External;
+    Type |= MachO::N_EXT;
 
   // Compute the symbol address.
   if (Symbol.isDefined()) {
@@ -341,7 +342,8 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
       assert((1U << Log2Size) == Align && "Invalid 'common' alignment!");
       if (Log2Size > 15)
         report_fatal_error("invalid 'common' alignment '" +
-                           Twine(Align) + "'");
+                           Twine(Align) + "' for '" + Symbol.getName() + "'",
+                           false);
       // FIXME: Keep this mask with the SymbolFlags enumeration.
       Flags = (Flags & 0xF0FF) | (Log2Size << 8);
     }
@@ -369,17 +371,17 @@ void MachObjectWriter::WriteLinkeditLoadCommand(uint32_t Type,
   (void) Start;
 
   Write32(Type);
-  Write32(macho::LinkeditLoadCommandSize);
+  Write32(sizeof(MachO::linkedit_data_command));
   Write32(DataOffset);
   Write32(DataSize);
 
-  assert(OS.tell() - Start == macho::LinkeditLoadCommandSize);
+  assert(OS.tell() - Start == sizeof(MachO::linkedit_data_command));
 }
 
 static unsigned ComputeLinkerOptionsLoadCommandSize(
   const std::vector<std::string> &Options, bool is64Bit)
 {
-  unsigned Size = sizeof(macho::LinkerOptionsLoadCommand);
+  unsigned Size = sizeof(MachO::linker_options_command);
   for (unsigned i = 0, e = Options.size(); i != e; ++i)
     Size += Options[i].size() + 1;
   return RoundUpToAlignment(Size, is64Bit ? 8 : 4);
@@ -392,10 +394,10 @@ void MachObjectWriter::WriteLinkerOptionsLoadCommand(
   uint64_t Start = OS.tell();
   (void) Start;
 
-  Write32(macho::LCT_LinkerOptions);
+  Write32(MachO::LC_LINKER_OPTIONS);
   Write32(Size);
   Write32(Options.size());
-  uint64_t BytesWritten = sizeof(macho::LinkerOptionsLoadCommand);
+  uint64_t BytesWritten = sizeof(MachO::linker_options_command);
   for (unsigned i = 0, e = Options.size(); i != e; ++i) {
     // Write each string, including the null byte.
     const std::string &Option = Options[i];
@@ -428,6 +430,22 @@ void MachObjectWriter::BindIndirectSymbols(MCAssembler &Asm) {
   //
   // FIXME: Revisit this when the dust settles.
 
+  // Report errors for use of .indirect_symbol not in a symbol pointer section
+  // or stub section.
+  for (MCAssembler::indirect_symbol_iterator it = Asm.indirect_symbol_begin(),
+         ie = Asm.indirect_symbol_end(); it != ie; ++it) {
+    const MCSectionMachO &Section =
+      cast<MCSectionMachO>(it->SectionData->getSection());
+
+    if (Section.getType() != MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS &&
+        Section.getType() != MCSectionMachO::S_LAZY_SYMBOL_POINTERS &&
+        Section.getType() != MCSectionMachO::S_SYMBOL_STUBS) {
+	MCSymbol &Symbol = *it->Symbol;
+	report_fatal_error("indirect symbol '" + Symbol.getName() +
+                           "' not in a symbol pointer or stub section");
+    }
+  }
+
   // Bind non lazy symbol pointers first.
   unsigned IndirectIndex = 0;
   for (MCAssembler::indirect_symbol_iterator it = Asm.indirect_symbol_begin(),
@@ -723,14 +741,14 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
   // section headers) and the symbol table.
   unsigned NumLoadCommands = 1;
   uint64_t LoadCommandsSize = is64Bit() ?
-    macho::SegmentLoadCommand64Size + NumSections * macho::Section64Size :
-    macho::SegmentLoadCommand32Size + NumSections * macho::Section32Size;
+    sizeof(MachO::segment_command_64) + NumSections * sizeof(MachO::section_64):
+    sizeof(MachO::segment_command) + NumSections * sizeof(MachO::section);
 
   // Add the data-in-code load command size, if used.
   unsigned NumDataRegions = Asm.getDataRegions().size();
   if (NumDataRegions) {
     ++NumLoadCommands;
-    LoadCommandsSize += macho::LinkeditLoadCommandSize;
+    LoadCommandsSize += sizeof(MachO::linkedit_data_command);
   }
 
   // Add the symbol table load command sizes, if used.
@@ -738,8 +756,8 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
     UndefinedSymbolData.size();
   if (NumSymbols) {
     NumLoadCommands += 2;
-    LoadCommandsSize += (macho::SymtabLoadCommandSize +
-                         macho::DysymtabLoadCommandSize);
+    LoadCommandsSize += (sizeof(MachO::symtab_command) +
+                         sizeof(MachO::dysymtab_command));
   }
 
   // Add the linker option load commands sizes.
@@ -753,8 +771,8 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
   
   // Compute the total size of the section data, as well as its file size and vm
   // size.
-  uint64_t SectionDataStart = (is64Bit() ? macho::Header64Size :
-                               macho::Header32Size) + LoadCommandsSize;
+  uint64_t SectionDataStart = (is64Bit() ? sizeof(MachO::mach_header_64) :
+                               sizeof(MachO::mach_header)) + LoadCommandsSize;
   uint64_t SectionDataSize = 0;
   uint64_t SectionDataFileSize = 0;
   uint64_t VMSize = 0;
@@ -791,11 +809,11 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
   uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize;
   for (MCAssembler::const_iterator it = Asm.begin(),
          ie = Asm.end(); it != ie; ++it) {
-    std::vector<macho::RelocationEntry> &Relocs = Relocations[it];
+    std::vector<MachO::any_relocation_info> &Relocs = Relocations[it];
     unsigned NumRelocs = Relocs.size();
     uint64_t SectionStart = SectionDataStart + getSectionAddress(it);
     WriteSection(Asm, Layout, *it, SectionStart, RelocTableEnd, NumRelocs);
-    RelocTableEnd += NumRelocs * macho::RelocationInfoSize;
+    RelocTableEnd += NumRelocs * sizeof(MachO::any_relocation_info);
   }
 
   // Write the data-in-code load command, if used.
@@ -803,7 +821,7 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
   if (NumDataRegions) {
     uint64_t DataRegionsOffset = RelocTableEnd;
     uint64_t DataRegionsSize = NumDataRegions * 8;
-    WriteLinkeditLoadCommand(macho::LCT_DataInCode, DataRegionsOffset,
+    WriteLinkeditLoadCommand(MachO::LC_DATA_IN_CODE, DataRegionsOffset,
                              DataRegionsSize);
   }
 
@@ -830,8 +848,9 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
 
     // The string table is written after symbol table.
     uint64_t StringTableOffset =
-      SymbolTableOffset + NumSymTabSymbols * (is64Bit() ? macho::Nlist64Size :
-                                              macho::Nlist32Size);
+      SymbolTableOffset + NumSymTabSymbols * (is64Bit() ?
+                                              sizeof(MachO::nlist_64) :
+                                              sizeof(MachO::nlist));
     WriteSymtabLoadCommand(SymbolTableOffset, NumSymTabSymbols,
                            StringTableOffset, StringTable.size());
 
@@ -864,10 +883,10 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
          ie = Asm.end(); it != ie; ++it) {
     // Write the section relocation entries, in reverse order to match 'as'
     // (approximately, the exact algorithm is more complicated than this).
-    std::vector<macho::RelocationEntry> &Relocs = Relocations[it];
+    std::vector<MachO::any_relocation_info> &Relocs = Relocations[it];
     for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
-      Write32(Relocs[e - i - 1].Word0);
-      Write32(Relocs[e - i - 1].Word1);
+      Write32(Relocs[e - i - 1].r_word0);
+      Write32(Relocs[e - i - 1].r_word1);
     }
   }
 
@@ -906,9 +925,9 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
         // If this symbol is defined and internal, mark it as such.
         if (it->Symbol->isDefined() &&
             !Asm.getSymbolData(*it->Symbol).isExternal()) {
-          uint32_t Flags = macho::ISF_Local;
+          uint32_t Flags = MachO::INDIRECT_SYMBOL_LOCAL;
           if (it->Symbol->isAbsolute())
-            Flags |= macho::ISF_Absolute;
+            Flags |= MachO::INDIRECT_SYMBOL_ABS;
           Write32(Flags);
           continue;
         }
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index 7625abd..2fb91f2 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -121,13 +121,10 @@ void SubtargetFeatures::AddFeature(const StringRef String,
 /// Find KV in array using binary search.
 static const SubtargetFeatureKV *Find(StringRef S, const SubtargetFeatureKV *A,
                                       size_t L) {
-  // Make the lower bound element we're looking for
-  SubtargetFeatureKV KV;
-  KV.Key = S.data();
   // Determine the end of the array
   const SubtargetFeatureKV *Hi = A + L;
   // Binary search the array
-  const SubtargetFeatureKV *F = std::lower_bound(A, Hi, KV);
+  const SubtargetFeatureKV *F = std::lower_bound(A, Hi, S);
   // If not found then return NULL
   if (F == Hi || StringRef(F->Key) != S) return NULL;
   // Return the found array item
@@ -353,8 +350,7 @@ void SubtargetFeatures::dump() const {
 }
 #endif
 
-/// getDefaultSubtargetFeatures - Return a string listing the features
-/// associated with the target triple.
+/// Adds the default features for the specified target triple.
 ///
 /// FIXME: This is an inelegant way of specifying the features of a
 /// subtarget. It would be better if we could encode this information
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index 4e26934..d9ca86d 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -138,7 +138,7 @@ public:
   symbol_map  SymbolMap;
 
   WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW, raw_ostream &OS);
-  ~WinCOFFObjectWriter();
+  virtual ~WinCOFFObjectWriter();
 
   COFFSymbol *createSymbol(StringRef Name);
   COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol * Symbol);
@@ -148,13 +148,12 @@ public:
   object_t *createCOFFEntity(StringRef Name, list_t &List);
 
   void DefineSection(MCSectionData const &SectionData);
-  void DefineSymbol(MCSymbolData const &SymbolData,
-                    MCAssembler &Assembler);
+  void DefineSymbol(MCSymbolData const &SymbolData, MCAssembler &Assembler,
+                    const MCAsmLayout &Layout);
 
   void MakeSymbolReal(COFFSymbol &S, size_t Index);
   void MakeSectionReal(COFFSection &S, size_t Number);
 
-  bool ExportSection(COFFSection const *S);
   bool ExportSymbol(MCSymbolData const &SymbolData, MCAssembler &Asm);
 
   bool IsPhysicalSection(COFFSection *S);
@@ -190,17 +189,6 @@ static inline void write_uint32_le(void *Data, uint32_t const &Value) {
   Ptr[3] = (Value & 0xFF000000) >> 24;
 }
 
-static inline void write_uint16_le(void *Data, uint16_t const &Value) {
-  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data);
-  Ptr[0] = (Value & 0x00FF) >> 0;
-  Ptr[1] = (Value & 0xFF00) >> 8;
-}
-
-static inline void write_uint8_le(void *Data, uint8_t const &Value) {
-  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data);
-  Ptr[0] = (Value & 0xFF) >> 0;
-}
-
 //------------------------------------------------------------------------------
 // Symbol class implementation
 
@@ -411,7 +399,8 @@ void WinCOFFObjectWriter::DefineSection(MCSectionData const &SectionData) {
 /// This function takes a section data object from the assembler
 /// and creates the associated COFF symbol staging object.
 void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
-                                       MCAssembler &Assembler) {
+                                       MCAssembler &Assembler,
+                                       const MCAsmLayout &Layout) {
   MCSymbol const &Symbol = SymbolData.getSymbol();
   COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol);
   SymbolMap[&Symbol] = coff_symbol;
@@ -452,6 +441,12 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
     const MCSymbolData &ResSymData =
       Assembler.getSymbolData(Symbol.AliasedSymbol());
 
+    if (Symbol.isVariable()) {
+      int64_t Addr;
+      if (Symbol.getVariableValue()->EvaluateAsAbsolute(Addr, Layout))
+        coff_symbol->Data.Value = Addr;
+    }
+
     coff_symbol->Data.Type         = (ResSymData.getFlags() & 0x0000FFFF) >>  0;
     coff_symbol->Data.StorageClass = (ResSymData.getFlags() & 0x00FF0000) >> 16;
 
@@ -463,7 +458,9 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
        external ? COFF::IMAGE_SYM_CLASS_EXTERNAL : COFF::IMAGE_SYM_CLASS_STATIC;
     }
 
-    if (ResSymData.Fragment != NULL)
+    if (Symbol.isAbsolute() || Symbol.AliasedSymbol().isVariable())
+      coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
+    else if (ResSymData.Fragment != NULL)
       coff_symbol->Section =
         SectionMap[&ResSymData.Fragment->getParent()->getSection()];
 
@@ -508,10 +505,6 @@ void WinCOFFObjectWriter::MakeSymbolReal(COFFSymbol &S, size_t Index) {
   S.Index = Index;
 }
 
-bool WinCOFFObjectWriter::ExportSection(COFFSection const *S) {
-  return !S->MCData->getFragmentList().empty();
-}
-
 bool WinCOFFObjectWriter::ExportSymbol(MCSymbolData const &SymbolData,
                                        MCAssembler &Asm) {
   // This doesn't seem to be right. Strings referred to from the .data section
@@ -625,9 +618,10 @@ void WinCOFFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
     DefineSection(*i);
 
   for (MCAssembler::const_symbol_iterator i = Asm.symbol_begin(),
-                                          e = Asm.symbol_end(); i != e; i++) {
+                                          e = Asm.symbol_end();
+       i != e; i++) {
     if (ExportSymbol(*i, Asm)) {
-      DefineSymbol(*i, Asm);
+      DefineSymbol(*i, Asm, Layout);
     }
   }
 }
@@ -921,6 +915,9 @@ MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_) :
   Machine(Machine_) {
 }
 
+// Pin the vtable to this file.
+void MCWinCOFFObjectTargetWriter::anchor() {}
+
 //------------------------------------------------------------------------------
 // WinCOFFObjectWriter factory function
 
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index 04bfeb4..5b5aad7 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -55,7 +55,7 @@ public:
   virtual void EmitDebugLabel(MCSymbol *Symbol);
   virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
   virtual void EmitThumbFunc(MCSymbol *Func);
-  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+  virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
   virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
   virtual void BeginCOFFSymbolDef(MCSymbol const *Symbol);
   virtual void EmitCOFFSymbolStorageClass(int StorageClass);
@@ -72,13 +72,10 @@ public:
   virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
                               uint64_t Size, unsigned ByteAlignment);
   virtual void EmitFileDirective(StringRef Filename);
+  virtual void EmitIdent(StringRef IdentString);
   virtual void EmitWin64EHHandlerData();
   virtual void FinishImpl();
 
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_WinCOFFStreamer;
-  }
-
 private:
   virtual void EmitInstToData(const MCInst &Inst) {
     MCDataFragment *DF = getOrCreateDataFragment();
@@ -134,8 +131,7 @@ private:
 
 WinCOFFStreamer::WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
                                  MCCodeEmitter &CE, raw_ostream &OS)
-    : MCObjectStreamer(SK_WinCOFFStreamer, Context, MAB, OS, &CE),
-      CurSymbol(NULL) {}
+    : MCObjectStreamer(Context, 0, MAB, OS, &CE), CurSymbol(NULL) {}
 
 void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                       unsigned ByteAlignment, bool External) {
@@ -155,7 +151,8 @@ void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
   int Selection = COFF::IMAGE_COMDAT_SELECT_LARGEST;
 
   const MCSection *Section = MCStreamer::getContext().getCOFFSection(
-    SectionName, Characteristics, SectionKind::getBSS(), Selection);
+      SectionName, Characteristics, SectionKind::getBSS(), Symbol->getName(),
+      Selection);
 
   MCSectionData &SectionData = getAssembler().getOrCreateSectionData(*Section);
 
@@ -164,7 +161,7 @@ void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
 
   SymbolData.setExternal(External);
 
-  Symbol->setSection(*Section);
+  AssignSection(Symbol, Section);
 
   if (ByteAlignment != 1)
       new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, &SectionData);
@@ -201,7 +198,7 @@ void WinCOFFStreamer::EmitThumbFunc(MCSymbol *Func) {
   llvm_unreachable("not implemented");
 }
 
-void WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
                                           MCSymbolAttr Attribute) {
   assert(Symbol && "Symbol must be non-null!");
   assert((Symbol->isInSection()
@@ -221,8 +218,10 @@ void WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     break;
 
   default:
-    llvm_unreachable("unsupported attribute");
+    return false;
   }
+
+  return true;
 }
 
 void WinCOFFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
@@ -309,6 +308,11 @@ void WinCOFFStreamer::EmitFileDirective(StringRef Filename) {
   // info will be a much large effort.
 }
 
+// TODO: Implement this if you want to emit .comment section in COFF obj files.
+void WinCOFFStreamer::EmitIdent(StringRef IdentString) {
+  llvm_unreachable("unsupported directive");
+}
+
 void WinCOFFStreamer::EmitWin64EHHandlerData() {
   MCStreamer::EmitWin64EHHandlerData();
 
@@ -318,6 +322,7 @@ void WinCOFFStreamer::EmitWin64EHHandlerData() {
 }
 
 void WinCOFFStreamer::FinishImpl() {
+  EmitFrames(NULL, true);
   EmitW64Tables();
   MCObjectStreamer::FinishImpl();
 }
diff --git a/lib/Makefile b/lib/Makefile
index 0a4435e..2ed0636 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -10,9 +10,8 @@ LEVEL = ..
 
 include $(LEVEL)/Makefile.config
 
-PARALLEL_DIRS := IR AsmParser Bitcode Analysis Transforms CodeGen \
-                 Target ExecutionEngine Linker MC Object Option DebugInfo \
-								 IRReader
+PARALLEL_DIRS := IR AsmParser Bitcode Analysis Transforms CodeGen Target \
+                 ExecutionEngine Linker LTO MC Object Option DebugInfo   \
+                 IRReader
 
 include $(LEVEL)/Makefile.common
-
diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp
index fd9d3b4..de57b4c 100644
--- a/lib/Object/Binary.cpp
+++ b/lib/Object/Binary.cpp
@@ -91,6 +91,7 @@ error_code object::createBinary(MemoryBuffer *Source,
       return object_error::success;
     }
     case sys::fs::file_magic::coff_object:
+    case sys::fs::file_magic::coff_import_library:
     case sys::fs::file_magic::pecoff_executable: {
       OwningPtr<Binary> ret(
           ObjectFile::createCOFFObjectFile(scopedSource.take()));
@@ -100,7 +101,8 @@ error_code object::createBinary(MemoryBuffer *Source,
       return object_error::success;
     }
     case sys::fs::file_magic::unknown:
-    case sys::fs::file_magic::bitcode: {
+    case sys::fs::file_magic::bitcode:
+    case sys::fs::file_magic::windows_resource: {
       // Unrecognized object file format.
       return object_error::invalid_file_type;
     }
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index 2c2cc8e..1f07cbb 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(LLVMObject
   Binary.cpp
   COFFObjectFile.cpp
   COFFYAML.cpp
+  ELF.cpp
   ELFObjectFile.cpp
   ELFYAML.cpp
   Error.cpp
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index cb029f9..3434e70 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -16,6 +16,9 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
 
 #include <ctype.h>
 
@@ -111,10 +114,8 @@ error_code COFFObjectFile::getSymbolFileOffset(DataRefImpl Symb,
   const coff_section *Section = NULL;
   if (error_code ec = getSection(symb->SectionNumber, Section))
     return ec;
-  char Type;
-  if (error_code ec = getSymbolNMTypeChar(Symb, Type))
-    return ec;
-  if (Type == 'U' || Type == 'w')
+
+  if (symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
     Result = UnknownAddressOrSize;
   else if (Section)
     Result = Section->PointerToRawData + symb->Value;
@@ -129,10 +130,8 @@ error_code COFFObjectFile::getSymbolAddress(DataRefImpl Symb,
   const coff_section *Section = NULL;
   if (error_code ec = getSection(symb->SectionNumber, Section))
     return ec;
-  char Type;
-  if (error_code ec = getSymbolNMTypeChar(Symb, Type))
-    return ec;
-  if (Type == 'U' || Type == 'w')
+
+  if (symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
     Result = UnknownAddressOrSize;
   else if (Section)
     Result = Section->VirtualAddress + symb->Value;
@@ -152,12 +151,16 @@ error_code COFFObjectFile::getSymbolType(DataRefImpl Symb,
     if (symb->getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION) {
       Result = SymbolRef::ST_Function;
     } else {
-      char Type;
-      if (error_code ec = getSymbolNMTypeChar(Symb, Type))
-        return ec;
-      if (Type == 'r' || Type == 'R') {
-        Result = SymbolRef::ST_Data;
+      uint32_t Characteristics = 0;
+      if (symb->SectionNumber > 0) {
+        const coff_section *Section = NULL;
+        if (error_code ec = getSection(symb->SectionNumber, Section))
+          return ec;
+        Characteristics = Section->Characteristics;
       }
+      if (Characteristics & COFF::IMAGE_SCN_MEM_READ &&
+          ~Characteristics & COFF::IMAGE_SCN_MEM_WRITE) // Read only.
+        Result = SymbolRef::ST_Data;
     }
   }
   return object_error::success;
@@ -196,10 +199,8 @@ error_code COFFObjectFile::getSymbolSize(DataRefImpl Symb,
   const coff_section *Section = NULL;
   if (error_code ec = getSection(symb->SectionNumber, Section))
     return ec;
-  char Type;
-  if (error_code ec = getSymbolNMTypeChar(Symb, Type))
-    return ec;
-  if (Type == 'U' || Type == 'w')
+
+  if (symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
     Result = UnknownAddressOrSize;
   else if (Section)
     Result = Section->SizeOfRawData - symb->Value;
@@ -208,74 +209,6 @@ error_code COFFObjectFile::getSymbolSize(DataRefImpl Symb,
   return object_error::success;
 }
 
-error_code COFFObjectFile::getSymbolNMTypeChar(DataRefImpl Symb,
-                                               char &Result) const {
-  const coff_symbol *symb = toSymb(Symb);
-  StringRef name;
-  if (error_code ec = getSymbolName(Symb, name))
-    return ec;
-  char ret = StringSwitch<char>(name)
-    .StartsWith(".debug", 'N')
-    .StartsWith(".sxdata", 'N')
-    .Default('?');
-
-  if (ret != '?') {
-    Result = ret;
-    return object_error::success;
-  }
-
-  uint32_t Characteristics = 0;
-  if (symb->SectionNumber > 0) {
-    const coff_section *Section = NULL;
-    if (error_code ec = getSection(symb->SectionNumber, Section))
-      return ec;
-    Characteristics = Section->Characteristics;
-  }
-
-  switch (symb->SectionNumber) {
-  case COFF::IMAGE_SYM_UNDEFINED:
-    // Check storage classes.
-    if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL) {
-      Result = 'w';
-      return object_error::success; // Don't do ::toupper.
-    } else if (symb->Value != 0) // Check for common symbols.
-      ret = 'c';
-    else
-      ret = 'u';
-    break;
-  case COFF::IMAGE_SYM_ABSOLUTE:
-    ret = 'a';
-    break;
-  case COFF::IMAGE_SYM_DEBUG:
-    ret = 'n';
-    break;
-  default:
-    // Check section type.
-    if (Characteristics & COFF::IMAGE_SCN_CNT_CODE)
-      ret = 't';
-    else if (  Characteristics & COFF::IMAGE_SCN_MEM_READ
-            && ~Characteristics & COFF::IMAGE_SCN_MEM_WRITE) // Read only.
-      ret = 'r';
-    else if (Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
-      ret = 'd';
-    else if (Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
-      ret = 'b';
-    else if (Characteristics & COFF::IMAGE_SCN_LNK_INFO)
-      ret = 'i';
-
-    // Check for section symbol.
-    else if (  symb->StorageClass == COFF::IMAGE_SYM_CLASS_STATIC
-            && symb->Value == 0)
-       ret = 's';
-  }
-
-  if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL)
-    ret = ::toupper(static_cast<unsigned char>(ret));
-
-  Result = ret;
-  return object_error::success;
-}
-
 error_code COFFObjectFile::getSymbolSection(DataRefImpl Symb,
                                             section_iterator &Result) const {
   const coff_symbol *symb = toSymb(Symb);
@@ -406,7 +339,7 @@ error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl Sec,
   return object_error::success;
 }
 
-relocation_iterator COFFObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
+relocation_iterator COFFObjectFile::section_rel_begin(DataRefImpl Sec) const {
   const coff_section *sec = toSec(Sec);
   DataRefImpl ret;
   if (sec->NumberOfRelocations == 0)
@@ -417,7 +350,7 @@ relocation_iterator COFFObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
   return relocation_iterator(RelocationRef(ret, this));
 }
 
-relocation_iterator COFFObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
+relocation_iterator COFFObjectFile::section_rel_end(DataRefImpl Sec) const {
   const coff_section *sec = toSec(Sec);
   DataRefImpl ret;
   if (sec->NumberOfRelocations == 0)
@@ -431,6 +364,94 @@ relocation_iterator COFFObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
   return relocation_iterator(RelocationRef(ret, this));
 }
 
+// Initialize the pointer to the symbol table.
+error_code COFFObjectFile::initSymbolTablePtr() {
+  if (error_code ec = getObject(
+          SymbolTable, Data, base() + COFFHeader->PointerToSymbolTable,
+          COFFHeader->NumberOfSymbols * sizeof(coff_symbol)))
+    return ec;
+
+  // Find string table. The first four byte of the string table contains the
+  // total size of the string table, including the size field itself. If the
+  // string table is empty, the value of the first four byte would be 4.
+  const uint8_t *StringTableAddr =
+      base() + COFFHeader->PointerToSymbolTable +
+      COFFHeader->NumberOfSymbols * sizeof(coff_symbol);
+  const ulittle32_t *StringTableSizePtr;
+  if (error_code ec = getObject(StringTableSizePtr, Data, StringTableAddr))
+    return ec;
+  StringTableSize = *StringTableSizePtr;
+  if (error_code ec =
+      getObject(StringTable, Data, StringTableAddr, StringTableSize))
+    return ec;
+
+  // Check that the string table is null terminated if has any in it.
+  if (StringTableSize < 4 ||
+      (StringTableSize > 4 && StringTable[StringTableSize - 1] != 0))
+    return  object_error::parse_failed;
+  return object_error::success;
+}
+
+// Returns the file offset for the given RVA.
+error_code COFFObjectFile::getRvaPtr(uint32_t Rva, uintptr_t &Res) const {
+  error_code ec;
+  for (section_iterator i = begin_sections(), e = end_sections(); i != e;
+       i.increment(ec)) {
+    if (ec)
+      return ec;
+    const coff_section *Section = getCOFFSection(i);
+    uint32_t SectionStart = Section->VirtualAddress;
+    uint32_t SectionEnd = Section->VirtualAddress + Section->VirtualSize;
+    if (SectionStart <= Rva && Rva < SectionEnd) {
+      uint32_t Offset = Rva - SectionStart;
+      Res = uintptr_t(base()) + Section->PointerToRawData + Offset;
+      return object_error::success;
+    }
+  }
+  return object_error::parse_failed;
+}
+
+// Returns hint and name fields, assuming \p Rva is pointing to a Hint/Name
+// table entry.
+error_code COFFObjectFile::
+getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const {
+  uintptr_t IntPtr = 0;
+  if (error_code ec = getRvaPtr(Rva, IntPtr))
+    return ec;
+  const uint8_t *Ptr = reinterpret_cast<const uint8_t *>(IntPtr);
+  Hint = *reinterpret_cast<const ulittle16_t *>(Ptr);
+  Name = StringRef(reinterpret_cast<const char *>(Ptr + 2));
+  return object_error::success;
+}
+
+// Find the import table.
+error_code COFFObjectFile::initImportTablePtr() {
+  // First, we get the RVA of the import table. If the file lacks a pointer to
+  // the import table, do nothing.
+  const data_directory *DataEntry;
+  if (getDataDirectory(COFF::IMPORT_TABLE, DataEntry))
+    return object_error::success;
+
+  // Do nothing if the pointer to import table is NULL.
+  if (DataEntry->RelativeVirtualAddress == 0)
+    return object_error::success;
+
+  uint32_t ImportTableRva = DataEntry->RelativeVirtualAddress;
+  NumberOfImportDirectory = DataEntry->Size /
+      sizeof(import_directory_table_entry);
+
+  // Find the section that contains the RVA. This is needed because the RVA is
+  // the import table's memory address which is different from its file offset.
+  uintptr_t IntPtr = 0;
+  if (error_code ec = getRvaPtr(ImportTableRva, IntPtr))
+    return ec;
+  ImportDirectory = reinterpret_cast<
+      const import_directory_table_entry *>(IntPtr);
+
+  // It's an error if there's no section containing the Import Table RVA.
+  return object_error::parse_failed;
+}
+
 COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
   : ObjectFile(Binary::ID_COFF, Object)
   , COFFHeader(0)
@@ -439,7 +460,9 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
   , SectionTable(0)
   , SymbolTable(0)
   , StringTable(0)
-  , StringTableSize(0) {
+  , StringTableSize(0)
+  , ImportDirectory(0)
+  , NumberOfImportDirectory(0) {
   // Check that we at least have enough room for a header.
   if (!checkSize(Data, ec, sizeof(coff_file_header))) return;
 
@@ -486,49 +509,33 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
     CurPtr += COFFHeader->SizeOfOptionalHeader;
   }
 
-  if ((ec = getObject(SectionTable, Data, base() + CurPtr,
-                      COFFHeader->NumberOfSections * sizeof(coff_section))))
-    return;
-
-  if (COFFHeader->PointerToSymbolTable != 0) {
-    if ((ec = getObject(SymbolTable, Data,
-                        base() + COFFHeader->PointerToSymbolTable,
-                        COFFHeader->NumberOfSymbols * sizeof(coff_symbol))))
+  if (!COFFHeader->isImportLibrary())
+    if ((ec = getObject(SectionTable, Data, base() + CurPtr,
+                        COFFHeader->NumberOfSections * sizeof(coff_section))))
       return;
 
-    // Find string table. The first four byte of the string table contains the
-    // total size of the string table, including the size field itself. If the
-    // string table is empty, the value of the first four byte would be 4.
-    const uint8_t *StringTableAddr = base() + COFFHeader->PointerToSymbolTable
-        + COFFHeader->NumberOfSymbols * sizeof(coff_symbol);
-    const ulittle32_t *StringTableSizePtr;
-    if ((ec = getObject(StringTableSizePtr, Data, StringTableAddr)))
-      return;
-    StringTableSize = *StringTableSizePtr;
-    if ((ec = getObject(StringTable, Data, StringTableAddr, StringTableSize)))
+  // Initialize the pointer to the symbol table.
+  if (COFFHeader->PointerToSymbolTable != 0)
+    if ((ec = initSymbolTablePtr()))
       return;
 
-    // Check that the string table is null terminated if has any in it.
-    if (StringTableSize < 4
-        || (StringTableSize > 4 && StringTable[StringTableSize - 1] != 0)) {
-      ec = object_error::parse_failed;
-      return;
-    }
-  }
+  // Initialize the pointer to the beginning of the import table.
+  if ((ec = initImportTablePtr()))
+    return;
 
   ec = object_error::success;
 }
 
 symbol_iterator COFFObjectFile::begin_symbols() const {
   DataRefImpl ret;
-  ret.p = reinterpret_cast<intptr_t>(SymbolTable);
+  ret.p = reinterpret_cast<uintptr_t>(SymbolTable);
   return symbol_iterator(SymbolRef(ret, this));
 }
 
 symbol_iterator COFFObjectFile::end_symbols() const {
   // The symbol table ends where the string table begins.
   DataRefImpl ret;
-  ret.p = reinterpret_cast<intptr_t>(StringTable);
+  ret.p = reinterpret_cast<uintptr_t>(StringTable);
   return symbol_iterator(SymbolRef(ret, this));
 }
 
@@ -557,16 +564,34 @@ StringRef COFFObjectFile::getLoadName() const {
   return "";
 }
 
+import_directory_iterator COFFObjectFile::import_directory_begin() const {
+  DataRefImpl Imp;
+  Imp.p = reinterpret_cast<uintptr_t>(ImportDirectory);
+  return import_directory_iterator(ImportDirectoryEntryRef(Imp, this));
+}
+
+import_directory_iterator COFFObjectFile::import_directory_end() const {
+  DataRefImpl Imp;
+  if (ImportDirectory) {
+    Imp.p = reinterpret_cast<uintptr_t>(
+        ImportDirectory + (NumberOfImportDirectory - 1));
+  } else {
+    Imp.p = 0;
+  }
+  return import_directory_iterator(ImportDirectoryEntryRef(Imp, this));
+}
 
 section_iterator COFFObjectFile::begin_sections() const {
   DataRefImpl ret;
-  ret.p = reinterpret_cast<intptr_t>(SectionTable);
+  ret.p = reinterpret_cast<uintptr_t>(SectionTable);
   return section_iterator(SectionRef(ret, this));
 }
 
 section_iterator COFFObjectFile::end_sections() const {
   DataRefImpl ret;
-  ret.p = reinterpret_cast<intptr_t>(SectionTable + COFFHeader->NumberOfSections);
+  int numSections = COFFHeader->isImportLibrary()
+      ? 0 : COFFHeader->NumberOfSections;
+  ret.p = reinterpret_cast<uintptr_t>(SectionTable + numSections);
   return section_iterator(SectionRef(ret, this));
 }
 
@@ -678,7 +703,7 @@ error_code COFFObjectFile::getSymbolName(const coff_symbol *symbol,
 ArrayRef<uint8_t> COFFObjectFile::getSymbolAuxData(
                                   const coff_symbol *symbol) const {
   const uint8_t *aux = NULL;
-  
+
   if ( symbol->NumberOfAuxSymbols > 0 ) {
   // AUX data comes immediately after the symbol in COFF
     aux = reinterpret_cast<const uint8_t *>(symbol + 1);
@@ -779,7 +804,6 @@ const coff_relocation *COFFObjectFile::getCOFFRelocation(
   return toRel(It->getRawDataRefImpl());
 }
 
-
 #define LLVM_COFF_SWITCH_RELOC_TYPE_NAME(enum) \
   case COFF::enum: res = #enum; break;
 
@@ -860,6 +884,52 @@ error_code COFFObjectFile::getLibraryPath(DataRefImpl LibData,
   report_fatal_error("getLibraryPath not implemented in COFFObjectFile");
 }
 
+bool ImportDirectoryEntryRef::
+operator==(const ImportDirectoryEntryRef &Other) const {
+  return ImportDirectoryPimpl == Other.ImportDirectoryPimpl;
+}
+
+static const import_directory_table_entry *toImportEntry(DataRefImpl Imp) {
+  return reinterpret_cast<const import_directory_table_entry *>(Imp.p);
+}
+
+error_code
+ImportDirectoryEntryRef::getNext(ImportDirectoryEntryRef &Result) const {
+  const import_directory_table_entry *Dir = toImportEntry(ImportDirectoryPimpl);
+  Dir += 1;
+  DataRefImpl Next;
+  Next.p = reinterpret_cast<uintptr_t>(Dir);
+  Result = ImportDirectoryEntryRef(Next, OwningObject);
+  return object_error::success;
+}
+
+error_code ImportDirectoryEntryRef::
+getImportTableEntry(const import_directory_table_entry *&Result) const {
+  Result = toImportEntry(ImportDirectoryPimpl);
+  return object_error::success;
+}
+
+error_code ImportDirectoryEntryRef::getName(StringRef &Result) const {
+  const import_directory_table_entry *Dir = toImportEntry(ImportDirectoryPimpl);
+  uintptr_t IntPtr = 0;
+  if (error_code ec = OwningObject->getRvaPtr(Dir->NameRVA, IntPtr))
+    return ec;
+  const char *Ptr = reinterpret_cast<const char *>(IntPtr);
+  Result = StringRef(Ptr);
+  return object_error::success;
+}
+
+error_code ImportDirectoryEntryRef::getImportLookupEntry(
+    const import_lookup_table_entry32 *&Result) const {
+  const import_directory_table_entry *Dir = toImportEntry(ImportDirectoryPimpl);
+  uintptr_t IntPtr = 0;
+  if (error_code ec = OwningObject->getRvaPtr(
+          Dir->ImportLookupTableRVA, IntPtr))
+    return ec;
+  Result = reinterpret_cast<const import_lookup_table_entry32 *>(IntPtr);
+  return object_error::success;
+}
+
 namespace llvm {
 
   ObjectFile *ObjectFile::createCOFFObjectFile(MemoryBuffer *Object) {
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
new file mode 100644
index 0000000..7c80d41
--- /dev/null
+++ b/lib/Object/ELF.cpp
@@ -0,0 +1,714 @@
+//===- ELF.cpp - ELF object file implementation -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ELF.h"
+
+namespace llvm {
+namespace object {
+
+#define LLVM_ELF_SWITCH_RELOC_TYPE_NAME(enum)                                  \
+  case ELF::enum:                                                              \
+    return #enum;                                                              \
+
+StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
+  switch (Machine) {
+  case ELF::EM_X86_64:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_JUMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32S);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPMOD64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSGD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSLD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTTPOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPLT64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLTOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32_TLSDESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_IRELATIVE);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_386:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PLT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_JUMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTPC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32PLT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTIE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_PUSH);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_POP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_PUSH);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_POP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDO_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPMOD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTDESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_IRELATIVE);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_MIPS:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_REL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_26);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_LITERAL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GPREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SHIFT5);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SHIFT6);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_DISP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_PAGE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_OFST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SUB);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_INSERT_A);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_INSERT_B);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_DELETE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HIGHER);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HIGHEST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SCN_DISP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_REL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_ADD_IMMEDIATE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PJUMP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_RELGOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JALR);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPMOD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPMOD64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_GD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_LDM);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_GOTTPREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JUMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_26_S1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_PC16_S1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_CALL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_DISP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_PAGE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_OFST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_DTPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_DTPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_TPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_TPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NUM);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_AARCH64:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G3);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD_PREL_LO19);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_LO21);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_PG_HI21);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADD_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST8_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TSTBR14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CONDBR19);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_JUMP26);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CALL26);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST16_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST32_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST64_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST128_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_GOT_PAGE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD64_GOT_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_HI12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_HI12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADR_PAGE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_LD64_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADD_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_CALL);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_ARM:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PC24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ABS5);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BREL_ADJ);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_SWI8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_XPC25);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_XPC22);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPMOD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_TPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_PREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_ABS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_7_0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_15_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_23_15);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SBREL_11_0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_19_12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_27_20_CK);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL31);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_V4BX);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PREL31);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_ABS_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_ABS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_PREL_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_PREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_ABS_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_ABS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_PREL_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_PREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP19);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP6);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ALU_PREL_11_0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32_NOI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32_NOI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_BREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_BREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GOTDESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESCSEQ);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32_ABS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_ABS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_PREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTRELAX);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTENTRY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTINHERIT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP11);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDM32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE12GP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_3);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_4);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_5);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_6);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_7);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_9);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_10);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_11);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_13);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_15);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ME_TOO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ32);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_HEXAGON:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_3);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B32_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_12_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_10_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_9_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_7_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_JMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_PLT_B22_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPMOD_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_PLT_B22_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_11_X);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_PPC:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14_BRTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14_BRNTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14_BRTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14_BRNTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPMOD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLSGD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLSLD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_HA);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_PPC64:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14_BRTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14_BRNTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14_BRTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14_BRNTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHER);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHERA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHEST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHESTA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPMOD64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHER);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHERA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHEST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHESTA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHER);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHERA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHEST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHESTA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLSGD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLSLD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_HA);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_S390:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_JMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC16DBL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT16DBL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC32DBL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT32DBL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPCDBL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTENT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLTENT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LOAD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GDCALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDCALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GD64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDM32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDM64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IE64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IEENT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LE64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDO32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDO64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_DTPMOD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_DTPOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_TPOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_20);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT20);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT20);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE20);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_IRELATIVE);
+    default:
+      break;
+    }
+    break;
+  default:
+    break;
+  }
+  return "Unknown";
+}
+
+#undef LLVM_ELF_SWITCH_RELOC_TYPE_NAME
+
+} // end namespace object
+} // end namespace llvm
diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp
index a6c128e..20b7307 100644
--- a/lib/Object/ELFObjectFile.cpp
+++ b/lib/Object/ELFObjectFile.cpp
@@ -11,13 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/MathExtras.h"
 
 #include <ctype.h>
 
 namespace llvm {
-
 using namespace object;
 
 // Creates an in-memory object-file by default: createELFObjectFile(Buffer)
diff --git a/lib/Object/ELFYAML.cpp b/lib/Object/ELFYAML.cpp
index e530d3d..2f35cf9 100644
--- a/lib/Object/ELFYAML.cpp
+++ b/lib/Object/ELFYAML.cpp
@@ -266,6 +266,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
 #define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
   BCase(SHF_WRITE)
   BCase(SHF_ALLOC)
+  BCase(SHF_EXCLUDE)
   BCase(SHF_EXECINSTR)
   BCase(SHF_MERGE)
   BCase(SHF_STRINGS)
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index 5d0399e..d2cb8bd 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -14,11 +14,11 @@
 
 #include "llvm/Object/MachO.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
 #include <cctype>
 #include <cstring>
 #include <limits>
@@ -29,16 +29,16 @@ using namespace object;
 namespace llvm {
 namespace object {
 
-struct SymbolTableEntryBase {
-  uint32_t StringIndex;
-  uint8_t Type;
-  uint8_t SectionIndex;
-  uint16_t Flags;
+struct nlist_base {
+  uint32_t n_strx;
+  uint8_t n_type;
+  uint8_t n_sect;
+  uint16_t n_desc;
 };
 
-struct SectionBase {
-  char Name[16];
-  char SegmentName[16];
+struct section_base {
+  char sectname[16];
+  char segname[16];
 };
 
 template<typename T>
@@ -50,167 +50,174 @@ template<typename T>
 static void SwapStruct(T &Value);
 
 template<>
-void SwapStruct(macho::RelocationEntry &H) {
-  SwapValue(H.Word0);
-  SwapValue(H.Word1);
+void SwapStruct(MachO::any_relocation_info &H) {
+  SwapValue(H.r_word0);
+  SwapValue(H.r_word1);
 }
 
 template<>
-void SwapStruct(macho::LoadCommand &L) {
-  SwapValue(L.Type);
-  SwapValue(L.Size);
+void SwapStruct(MachO::load_command &L) {
+  SwapValue(L.cmd);
+  SwapValue(L.cmdsize);
 }
 
 template<>
-void SwapStruct(SymbolTableEntryBase &S) {
-  SwapValue(S.StringIndex);
-  SwapValue(S.Flags);
+void SwapStruct(nlist_base &S) {
+  SwapValue(S.n_strx);
+  SwapValue(S.n_desc);
 }
 
 template<>
-void SwapStruct(macho::Section &S) {
-  SwapValue(S.Address);
-  SwapValue(S.Size);
-  SwapValue(S.Offset);
-  SwapValue(S.Align);
-  SwapValue(S.RelocationTableOffset);
-  SwapValue(S.NumRelocationTableEntries);
-  SwapValue(S.Flags);
-  SwapValue(S.Reserved1);
-  SwapValue(S.Reserved2);
+void SwapStruct(MachO::section &S) {
+  SwapValue(S.addr);
+  SwapValue(S.size);
+  SwapValue(S.offset);
+  SwapValue(S.align);
+  SwapValue(S.reloff);
+  SwapValue(S.nreloc);
+  SwapValue(S.flags);
+  SwapValue(S.reserved1);
+  SwapValue(S.reserved2);
 }
 
 template<>
-void SwapStruct(macho::Section64 &S) {
-  SwapValue(S.Address);
-  SwapValue(S.Size);
-  SwapValue(S.Offset);
-  SwapValue(S.Align);
-  SwapValue(S.RelocationTableOffset);
-  SwapValue(S.NumRelocationTableEntries);
-  SwapValue(S.Flags);
-  SwapValue(S.Reserved1);
-  SwapValue(S.Reserved2);
-  SwapValue(S.Reserved3);
+void SwapStruct(MachO::section_64 &S) {
+  SwapValue(S.addr);
+  SwapValue(S.size);
+  SwapValue(S.offset);
+  SwapValue(S.align);
+  SwapValue(S.reloff);
+  SwapValue(S.nreloc);
+  SwapValue(S.flags);
+  SwapValue(S.reserved1);
+  SwapValue(S.reserved2);
+  SwapValue(S.reserved3);
 }
 
 template<>
-void SwapStruct(macho::SymbolTableEntry &S) {
-  SwapValue(S.StringIndex);
-  SwapValue(S.Flags);
-  SwapValue(S.Value);
+void SwapStruct(MachO::nlist &S) {
+  SwapValue(S.n_strx);
+  SwapValue(S.n_desc);
+  SwapValue(S.n_value);
 }
 
 template<>
-void SwapStruct(macho::Symbol64TableEntry &S) {
-  SwapValue(S.StringIndex);
-  SwapValue(S.Flags);
-  SwapValue(S.Value);
+void SwapStruct(MachO::nlist_64 &S) {
+  SwapValue(S.n_strx);
+  SwapValue(S.n_desc);
+  SwapValue(S.n_value);
 }
 
 template<>
-void SwapStruct(macho::Header &H) {
-  SwapValue(H.Magic);
-  SwapValue(H.CPUType);
-  SwapValue(H.CPUSubtype);
-  SwapValue(H.FileType);
-  SwapValue(H.NumLoadCommands);
-  SwapValue(H.SizeOfLoadCommands);
-  SwapValue(H.Flags);
+void SwapStruct(MachO::mach_header &H) {
+  SwapValue(H.magic);
+  SwapValue(H.cputype);
+  SwapValue(H.cpusubtype);
+  SwapValue(H.filetype);
+  SwapValue(H.ncmds);
+  SwapValue(H.sizeofcmds);
+  SwapValue(H.flags);
 }
 
 template<>
-void SwapStruct(macho::Header64Ext &E) {
-  SwapValue(E.Reserved);
+void SwapStruct(MachO::mach_header_64 &H) {
+  SwapValue(H.magic);
+  SwapValue(H.cputype);
+  SwapValue(H.cpusubtype);
+  SwapValue(H.filetype);
+  SwapValue(H.ncmds);
+  SwapValue(H.sizeofcmds);
+  SwapValue(H.flags);
+  SwapValue(H.reserved);
 }
 
 template<>
-void SwapStruct(macho::SymtabLoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.SymbolTableOffset);
-  SwapValue(C.NumSymbolTableEntries);
-  SwapValue(C.StringTableOffset);
-  SwapValue(C.StringTableSize);
+void SwapStruct(MachO::symtab_command &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.symoff);
+  SwapValue(C.nsyms);
+  SwapValue(C.stroff);
+  SwapValue(C.strsize);
 }
 
 template<>
-void SwapStruct(macho::DysymtabLoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.LocalSymbolsIndex);
-  SwapValue(C.NumLocalSymbols);
-  SwapValue(C.ExternalSymbolsIndex);
-  SwapValue(C.NumExternalSymbols);
-  SwapValue(C.UndefinedSymbolsIndex);
-  SwapValue(C.NumUndefinedSymbols);
-  SwapValue(C.TOCOffset);
-  SwapValue(C.NumTOCEntries);
-  SwapValue(C.ModuleTableOffset);
-  SwapValue(C.NumModuleTableEntries);
-  SwapValue(C.ReferenceSymbolTableOffset);
-  SwapValue(C.NumReferencedSymbolTableEntries);
-  SwapValue(C.IndirectSymbolTableOffset);
-  SwapValue(C.NumIndirectSymbolTableEntries);
-  SwapValue(C.ExternalRelocationTableOffset);
-  SwapValue(C.NumExternalRelocationTableEntries);
-  SwapValue(C.LocalRelocationTableOffset);
-  SwapValue(C.NumLocalRelocationTableEntries);
+void SwapStruct(MachO::dysymtab_command &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.ilocalsym);
+  SwapValue(C.nlocalsym);
+  SwapValue(C.iextdefsym);
+  SwapValue(C.nextdefsym);
+  SwapValue(C.iundefsym);
+  SwapValue(C.nundefsym);
+  SwapValue(C.tocoff);
+  SwapValue(C.ntoc);
+  SwapValue(C.modtaboff);
+  SwapValue(C.nmodtab);
+  SwapValue(C.extrefsymoff);
+  SwapValue(C.nextrefsyms);
+  SwapValue(C.indirectsymoff);
+  SwapValue(C.nindirectsyms);
+  SwapValue(C.extreloff);
+  SwapValue(C.nextrel);
+  SwapValue(C.locreloff);
+  SwapValue(C.nlocrel);
 }
 
 template<>
-void SwapStruct(macho::LinkeditDataLoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.DataOffset);
-  SwapValue(C.DataSize);
+void SwapStruct(MachO::linkedit_data_command &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.dataoff);
+  SwapValue(C.datasize);
 }
 
 template<>
-void SwapStruct(macho::SegmentLoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.VMAddress);
-  SwapValue(C.VMSize);
-  SwapValue(C.FileOffset);
-  SwapValue(C.FileSize);
-  SwapValue(C.MaxVMProtection);
-  SwapValue(C.InitialVMProtection);
-  SwapValue(C.NumSections);
-  SwapValue(C.Flags);
+void SwapStruct(MachO::segment_command &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.vmaddr);
+  SwapValue(C.vmsize);
+  SwapValue(C.fileoff);
+  SwapValue(C.filesize);
+  SwapValue(C.maxprot);
+  SwapValue(C.initprot);
+  SwapValue(C.nsects);
+  SwapValue(C.flags);
 }
 
 template<>
-void SwapStruct(macho::Segment64LoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.VMAddress);
-  SwapValue(C.VMSize);
-  SwapValue(C.FileOffset);
-  SwapValue(C.FileSize);
-  SwapValue(C.MaxVMProtection);
-  SwapValue(C.InitialVMProtection);
-  SwapValue(C.NumSections);
-  SwapValue(C.Flags);
+void SwapStruct(MachO::segment_command_64 &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.vmaddr);
+  SwapValue(C.vmsize);
+  SwapValue(C.fileoff);
+  SwapValue(C.filesize);
+  SwapValue(C.maxprot);
+  SwapValue(C.initprot);
+  SwapValue(C.nsects);
+  SwapValue(C.flags);
 }
 
 template<>
-void SwapStruct(macho::IndirectSymbolTableEntry &C) {
-  SwapValue(C.Index);
+void SwapStruct(uint32_t &C) {
+  SwapValue(C);
 }
 
 template<>
-void SwapStruct(macho::LinkerOptionsLoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.Count);
+void SwapStruct(MachO::linker_options_command &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.count);
 }
 
 template<>
-void SwapStruct(macho::DataInCodeTableEntry &C) {
-  SwapValue(C.Offset);
-  SwapValue(C.Length);
-  SwapValue(C.Kind);
+void SwapStruct(MachO::data_in_code_entry &C) {
+  SwapValue(C.offset);
+  SwapValue(C.length);
+  SwapValue(C.kind);
 }
 
 template<typename T>
@@ -226,11 +233,11 @@ static uint32_t
 getSegmentLoadCommandNumSections(const MachOObjectFile *O,
                                  const MachOObjectFile::LoadCommandInfo &L) {
   if (O->is64Bit()) {
-    macho::Segment64LoadCommand S = O->getSegment64LoadCommand(L);
-    return S.NumSections;
+    MachO::segment_command_64 S = O->getSegment64LoadCommand(L);
+    return S.nsects;
   }
-  macho::SegmentLoadCommand S = O->getSegmentLoadCommand(L);
-  return S.NumSections;
+  MachO::segment_command S = O->getSegmentLoadCommand(L);
+  return S.nsects;
 }
 
 static const char *
@@ -239,10 +246,10 @@ getSectionPtr(const MachOObjectFile *O, MachOObjectFile::LoadCommandInfo L,
   uintptr_t CommandAddr = reinterpret_cast<uintptr_t>(L.Ptr);
 
   bool Is64 = O->is64Bit();
-  unsigned SegmentLoadSize = Is64 ? sizeof(macho::Segment64LoadCommand) :
-                                    sizeof(macho::SegmentLoadCommand);
-  unsigned SectionSize = Is64 ? sizeof(macho::Section64) :
-                                sizeof(macho::Section);
+  unsigned SegmentLoadSize = Is64 ? sizeof(MachO::segment_command_64) :
+                                    sizeof(MachO::segment_command);
+  unsigned SectionSize = Is64 ? sizeof(MachO::section_64) :
+                                sizeof(MachO::section);
 
   uintptr_t SectionAddr = CommandAddr + SegmentLoadSize + Sec * SectionSize;
   return reinterpret_cast<const char*>(SectionAddr);
@@ -252,10 +259,10 @@ static const char *getPtr(const MachOObjectFile *O, size_t Offset) {
   return O->getData().substr(Offset, 1).data();
 }
 
-static SymbolTableEntryBase
+static nlist_base
 getSymbolTableEntryBase(const MachOObjectFile *O, DataRefImpl DRI) {
   const char *P = reinterpret_cast<const char *>(DRI.p);
-  return getStruct<SymbolTableEntryBase>(O, P);
+  return getStruct<nlist_base>(O, P);
 }
 
 static StringRef parseSegmentOrSectionName(const char *P) {
@@ -283,11 +290,11 @@ static void advanceTo(T &it, size_t Val) {
 }
 
 static unsigned getCPUType(const MachOObjectFile *O) {
-  return O->getHeader().CPUType;
+  return O->getHeader().cputype;
 }
 
 static void printRelocationTargetName(const MachOObjectFile *O,
-                                      const macho::RelocationEntry &RE,
+                                      const MachO::any_relocation_info &RE,
                                       raw_string_ostream &fmt) {
   bool IsScattered = O->isRelocationScattered(RE);
 
@@ -355,59 +362,61 @@ static void printRelocationTargetName(const MachOObjectFile *O,
   fmt << S;
 }
 
-static uint32_t getPlainRelocationAddress(const macho::RelocationEntry &RE) {
-  return RE.Word0;
+static uint32_t
+getPlainRelocationAddress(const MachO::any_relocation_info &RE) {
+  return RE.r_word0;
 }
 
 static unsigned
-getScatteredRelocationAddress(const macho::RelocationEntry &RE) {
-  return RE.Word0 & 0xffffff;
+getScatteredRelocationAddress(const MachO::any_relocation_info &RE) {
+  return RE.r_word0 & 0xffffff;
 }
 
 static bool getPlainRelocationPCRel(const MachOObjectFile *O,
-                                    const macho::RelocationEntry &RE) {
+                                    const MachO::any_relocation_info &RE) {
   if (O->isLittleEndian())
-    return (RE.Word1 >> 24) & 1;
-  return (RE.Word1 >> 7) & 1;
+    return (RE.r_word1 >> 24) & 1;
+  return (RE.r_word1 >> 7) & 1;
 }
 
 static bool
 getScatteredRelocationPCRel(const MachOObjectFile *O,
-                            const macho::RelocationEntry &RE) {
-  return (RE.Word0 >> 30) & 1;
+                            const MachO::any_relocation_info &RE) {
+  return (RE.r_word0 >> 30) & 1;
 }
 
 static unsigned getPlainRelocationLength(const MachOObjectFile *O,
-                                         const macho::RelocationEntry &RE) {
+                                         const MachO::any_relocation_info &RE) {
   if (O->isLittleEndian())
-    return (RE.Word1 >> 25) & 3;
-  return (RE.Word1 >> 5) & 3;
+    return (RE.r_word1 >> 25) & 3;
+  return (RE.r_word1 >> 5) & 3;
 }
 
 static unsigned
-getScatteredRelocationLength(const macho::RelocationEntry &RE) {
-  return (RE.Word0 >> 28) & 3;
+getScatteredRelocationLength(const MachO::any_relocation_info &RE) {
+  return (RE.r_word0 >> 28) & 3;
 }
 
 static unsigned getPlainRelocationType(const MachOObjectFile *O,
-                                       const macho::RelocationEntry &RE) {
+                                       const MachO::any_relocation_info &RE) {
   if (O->isLittleEndian())
-    return RE.Word1 >> 28;
-  return RE.Word1 & 0xf;
+    return RE.r_word1 >> 28;
+  return RE.r_word1 & 0xf;
 }
 
-static unsigned getScatteredRelocationType(const macho::RelocationEntry &RE) {
-  return (RE.Word0 >> 24) & 0xf;
+static unsigned
+getScatteredRelocationType(const MachO::any_relocation_info &RE) {
+  return (RE.r_word0 >> 24) & 0xf;
 }
 
 static uint32_t getSectionFlags(const MachOObjectFile *O,
                                 DataRefImpl Sec) {
   if (O->is64Bit()) {
-    macho::Section64 Sect = O->getSection64(Sec);
-    return Sect.Flags;
+    MachO::section_64 Sect = O->getSection64(Sec);
+    return Sect.flags;
   }
-  macho::Section Sect = O->getSection(Sec);
-  return Sect.Flags;
+  MachO::section Sect = O->getSection(Sec);
+  return Sect.flags;
 }
 
 MachOObjectFile::MachOObjectFile(MemoryBuffer *Object,
@@ -415,22 +424,22 @@ MachOObjectFile::MachOObjectFile(MemoryBuffer *Object,
                                  error_code &ec)
     : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object),
       SymtabLoadCmd(NULL), DysymtabLoadCmd(NULL), DataInCodeLoadCmd(NULL) {
-  uint32_t LoadCommandCount = this->getHeader().NumLoadCommands;
-  macho::LoadCommandType SegmentLoadType = is64Bit() ?
-    macho::LCT_Segment64 : macho::LCT_Segment;
+  uint32_t LoadCommandCount = this->getHeader().ncmds;
+  MachO::LoadCommandType SegmentLoadType = is64Bit() ?
+    MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT;
 
   MachOObjectFile::LoadCommandInfo Load = getFirstLoadCommandInfo();
   for (unsigned I = 0; ; ++I) {
-    if (Load.C.Type == macho::LCT_Symtab) {
+    if (Load.C.cmd == MachO::LC_SYMTAB) {
       assert(!SymtabLoadCmd && "Multiple symbol tables");
       SymtabLoadCmd = Load.Ptr;
-    } else if (Load.C.Type == macho::LCT_Dysymtab) {
+    } else if (Load.C.cmd == MachO::LC_DYSYMTAB) {
       assert(!DysymtabLoadCmd && "Multiple dynamic symbol tables");
       DysymtabLoadCmd = Load.Ptr;
-    } else if (Load.C.Type == macho::LCT_DataInCode) {
+    } else if (Load.C.cmd == MachO::LC_DATA_IN_CODE) {
       assert(!DataInCodeLoadCmd && "Multiple data in code tables");
       DataInCodeLoadCmd = Load.Ptr;
-    } else if (Load.C.Type == SegmentLoadType) {
+    } else if (Load.C.cmd == SegmentLoadType) {
       uint32_t NumSections = getSegmentLoadCommandNumSections(this, Load);
       for (unsigned J = 0; J < NumSections; ++J) {
         const char *Sec = getSectionPtr(this, Load, J);
@@ -448,8 +457,8 @@ MachOObjectFile::MachOObjectFile(MemoryBuffer *Object,
 error_code MachOObjectFile::getSymbolNext(DataRefImpl Symb,
                                           SymbolRef &Res) const {
   unsigned SymbolTableEntrySize = is64Bit() ?
-    sizeof(macho::Symbol64TableEntry) :
-    sizeof(macho::SymbolTableEntry);
+    sizeof(MachO::nlist_64) :
+    sizeof(MachO::nlist);
   Symb.p += SymbolTableEntrySize;
   Res = SymbolRef(Symb, this);
   return object_error::success;
@@ -458,8 +467,8 @@ error_code MachOObjectFile::getSymbolNext(DataRefImpl Symb,
 error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
                                           StringRef &Res) const {
   StringRef StringTable = getStringTableData();
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
-  const char *Start = &StringTable.data()[Entry.StringIndex];
+  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+  const char *Start = &StringTable.data()[Entry.n_strx];
   Res = StringRef(Start);
   return object_error::success;
 }
@@ -467,11 +476,11 @@ error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
 error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb,
                                              uint64_t &Res) const {
   if (is64Bit()) {
-    macho::Symbol64TableEntry Entry = getSymbol64TableEntry(Symb);
-    Res = Entry.Value;
+    MachO::nlist_64 Entry = getSymbol64TableEntry(Symb);
+    Res = Entry.n_value;
   } else {
-    macho::SymbolTableEntry Entry = getSymbolTableEntry(Symb);
-    Res = Entry.Value;
+    MachO::nlist Entry = getSymbolTableEntry(Symb);
+    Res = Entry.n_value;
   }
   return object_error::success;
 }
@@ -479,18 +488,18 @@ error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb,
 error_code
 MachOObjectFile::getSymbolFileOffset(DataRefImpl Symb,
                                      uint64_t &Res) const {
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
+  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
   getSymbolAddress(Symb, Res);
-  if (Entry.SectionIndex) {
+  if (Entry.n_sect) {
     uint64_t Delta;
     DataRefImpl SecRel;
-    SecRel.d.a = Entry.SectionIndex-1;
+    SecRel.d.a = Entry.n_sect-1;
     if (is64Bit()) {
-      macho::Section64 Sec = getSection64(SecRel);
-      Delta = Sec.Offset - Sec.Address;
+      MachO::section_64 Sec = getSection64(SecRel);
+      Delta = Sec.offset - Sec.addr;
     } else {
-      macho::Section Sec = getSection(SecRel);
-      Delta = Sec.Offset - Sec.Address;
+      MachO::section Sec = getSection(SecRel);
+      Delta = Sec.offset - Sec.addr;
     }
 
     Res += Delta;
@@ -504,8 +513,8 @@ error_code MachOObjectFile::getSymbolAlignment(DataRefImpl DRI,
   uint32_t flags;
   this->getSymbolFlags(DRI, flags);
   if (flags & SymbolRef::SF_Common) {
-    SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, DRI);
-    Result = 1 << MachO::GET_COMM_ALIGN(Entry.Flags);
+    nlist_base Entry = getSymbolTableEntryBase(this, DRI);
+    Result = 1 << MachO::GET_COMM_ALIGN(Entry.n_desc);
   } else {
     Result = 0;
   }
@@ -518,13 +527,13 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
   uint64_t EndOffset = 0;
   uint8_t SectionIndex;
 
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, DRI);
+  nlist_base Entry = getSymbolTableEntryBase(this, DRI);
   uint64_t Value;
   getSymbolAddress(DRI, Value);
 
   BeginOffset = Value;
 
-  SectionIndex = Entry.SectionIndex;
+  SectionIndex = Entry.n_sect;
   if (!SectionIndex) {
     uint32_t flags = SymbolRef::SF_None;
     this->getSymbolFlags(DRI, flags);
@@ -542,7 +551,7 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
     DataRefImpl DRI = I->getRawDataRefImpl();
     Entry = getSymbolTableEntryBase(this, DRI);
     getSymbolAddress(DRI, Value);
-    if (Entry.SectionIndex == SectionIndex && Value > BeginOffset)
+    if (Entry.n_sect == SectionIndex && Value > BeginOffset)
       if (!EndOffset || Value < EndOffset)
         EndOffset = Value;
   }
@@ -560,73 +569,47 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
 
 error_code MachOObjectFile::getSymbolType(DataRefImpl Symb,
                                           SymbolRef::Type &Res) const {
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
-  uint8_t n_type = Entry.Type;
+  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+  uint8_t n_type = Entry.n_type;
 
   Res = SymbolRef::ST_Other;
 
   // If this is a STAB debugging symbol, we can do nothing more.
-  if (n_type & MachO::NlistMaskStab) {
+  if (n_type & MachO::N_STAB) {
     Res = SymbolRef::ST_Debug;
     return object_error::success;
   }
 
-  switch (n_type & MachO::NlistMaskType) {
-    case MachO::NListTypeUndefined :
+  switch (n_type & MachO::N_TYPE) {
+    case MachO::N_UNDF :
       Res = SymbolRef::ST_Unknown;
       break;
-    case MachO::NListTypeSection :
+    case MachO::N_SECT :
       Res = SymbolRef::ST_Function;
       break;
   }
   return object_error::success;
 }
 
-error_code MachOObjectFile::getSymbolNMTypeChar(DataRefImpl Symb,
-                                                char &Res) const {
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
-  uint8_t Type = Entry.Type;
-  uint16_t Flags = Entry.Flags;
-
-  char Char;
-  switch (Type & macho::STF_TypeMask) {
-    case macho::STT_Undefined:
-      Char = 'u';
-      break;
-    case macho::STT_Absolute:
-    case macho::STT_Section:
-      Char = 's';
-      break;
-    default:
-      Char = '?';
-      break;
-  }
-
-  if (Flags & (macho::STF_External | macho::STF_PrivateExtern))
-    Char = toupper(static_cast<unsigned char>(Char));
-  Res = Char;
-  return object_error::success;
-}
-
 error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI,
                                            uint32_t &Result) const {
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, DRI);
+  nlist_base Entry = getSymbolTableEntryBase(this, DRI);
 
-  uint8_t MachOType = Entry.Type;
-  uint16_t MachOFlags = Entry.Flags;
+  uint8_t MachOType = Entry.n_type;
+  uint16_t MachOFlags = Entry.n_desc;
 
   // TODO: Correctly set SF_ThreadLocal
   Result = SymbolRef::SF_None;
 
-  if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeUndefined)
+  if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF)
     Result |= SymbolRef::SF_Undefined;
 
-  if (MachOFlags & macho::STF_StabsEntryMask)
+  if (MachOType & MachO::N_STAB)
     Result |= SymbolRef::SF_FormatSpecific;
 
-  if (MachOType & MachO::NlistMaskExternal) {
+  if (MachOType & MachO::N_EXT) {
     Result |= SymbolRef::SF_Global;
-    if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeUndefined) {
+    if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF) {
       uint64_t Value;
       getSymbolAddress(DRI, Value);
       if (Value)
@@ -634,10 +617,10 @@ error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI,
     }
   }
 
-  if (MachOFlags & (MachO::NListDescWeakRef | MachO::NListDescWeakDef))
+  if (MachOFlags & (MachO::N_WEAK_REF | MachO::N_WEAK_DEF))
     Result |= SymbolRef::SF_Weak;
 
-  if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeAbsolute)
+  if ((MachOType & MachO::N_TYPE) == MachO::N_ABS)
     Result |= SymbolRef::SF_Absolute;
 
   return object_error::success;
@@ -646,8 +629,8 @@ error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI,
 error_code
 MachOObjectFile::getSymbolSection(DataRefImpl Symb,
                                   section_iterator &Res) const {
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
-  uint8_t index = Entry.SectionIndex;
+  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+  uint8_t index = Entry.n_sect;
 
   if (index == 0) {
     Res = end_sections();
@@ -682,11 +665,11 @@ MachOObjectFile::getSectionName(DataRefImpl Sec, StringRef &Result) const {
 error_code
 MachOObjectFile::getSectionAddress(DataRefImpl Sec, uint64_t &Res) const {
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Res = Sect.Address;
+    MachO::section_64 Sect = getSection64(Sec);
+    Res = Sect.addr;
   } else {
-    macho::Section Sect = getSection(Sec);
-    Res = Sect.Address;
+    MachO::section Sect = getSection(Sec);
+    Res = Sect.addr;
   }
   return object_error::success;
 }
@@ -694,11 +677,11 @@ MachOObjectFile::getSectionAddress(DataRefImpl Sec, uint64_t &Res) const {
 error_code
 MachOObjectFile::getSectionSize(DataRefImpl Sec, uint64_t &Res) const {
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Res = Sect.Size;
+    MachO::section_64 Sect = getSection64(Sec);
+    Res = Sect.size;
   } else {
-    macho::Section Sect = getSection(Sec);
-    Res = Sect.Size;
+    MachO::section Sect = getSection(Sec);
+    Res = Sect.size;
   }
 
   return object_error::success;
@@ -710,13 +693,13 @@ MachOObjectFile::getSectionContents(DataRefImpl Sec, StringRef &Res) const {
   uint64_t Size;
 
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Offset = Sect.Offset;
-    Size = Sect.Size;
+    MachO::section_64 Sect = getSection64(Sec);
+    Offset = Sect.offset;
+    Size = Sect.size;
   } else {
-    macho::Section Sect =getSection(Sec);
-    Offset = Sect.Offset;
-    Size = Sect.Size;
+    MachO::section Sect = getSection(Sec);
+    Offset = Sect.offset;
+    Size = Sect.size;
   }
 
   Res = this->getData().substr(Offset, Size);
@@ -727,11 +710,11 @@ error_code
 MachOObjectFile::getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const {
   uint32_t Align;
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Align = Sect.Align;
+    MachO::section_64 Sect = getSection64(Sec);
+    Align = Sect.align;
   } else {
-    macho::Section Sect = getSection(Sec);
-    Align = Sect.Align;
+    MachO::section Sect = getSection(Sec);
+    Align = Sect.align;
   }
 
   Res = uint64_t(1) << Align;
@@ -741,7 +724,7 @@ MachOObjectFile::getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const {
 error_code
 MachOObjectFile::isSectionText(DataRefImpl Sec, bool &Res) const {
   uint32_t Flags = getSectionFlags(this, Sec);
-  Res = Flags & macho::SF_PureInstructions;
+  Res = Flags & MachO::S_ATTR_PURE_INSTRUCTIONS;
   return object_error::success;
 }
 
@@ -775,9 +758,9 @@ error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec,
 error_code
 MachOObjectFile::isSectionZeroInit(DataRefImpl Sec, bool &Res) const {
   uint32_t Flags = getSectionFlags(this, Sec);
-  unsigned SectionType = Flags & MachO::SectionFlagMaskSectionType;
-  Res = SectionType == MachO::SectionTypeZeroFill ||
-    SectionType == MachO::SectionTypeZeroFillLarge;
+  unsigned SectionType = Flags & MachO::SECTION_TYPE;
+  Res = SectionType == MachO::S_ZEROFILL ||
+    SectionType == MachO::S_GB_ZEROFILL;
   return object_error::success;
 }
 
@@ -814,14 +797,14 @@ MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
   return object_error::success;
 }
 
-relocation_iterator MachOObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
+relocation_iterator MachOObjectFile::section_rel_begin(DataRefImpl Sec) const {
   uint32_t Offset;
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Offset = Sect.RelocationTableOffset;
+    MachO::section_64 Sect = getSection64(Sec);
+    Offset = Sect.reloff;
   } else {
-    macho::Section Sect = getSection(Sec);
-    Offset = Sect.RelocationTableOffset;
+    MachO::section Sect = getSection(Sec);
+    Offset = Sect.reloff;
   }
 
   DataRefImpl Ret;
@@ -830,21 +813,21 @@ relocation_iterator MachOObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
 }
 
 relocation_iterator
-MachOObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
+MachOObjectFile::section_rel_end(DataRefImpl Sec) const {
   uint32_t Offset;
   uint32_t Num;
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Offset = Sect.RelocationTableOffset;
-    Num = Sect.NumRelocationTableEntries;
+    MachO::section_64 Sect = getSection64(Sec);
+    Offset = Sect.reloff;
+    Num = Sect.nreloc;
   } else {
-    macho::Section Sect = getSection(Sec);
-    Offset = Sect.RelocationTableOffset;
-    Num = Sect.NumRelocationTableEntries;
+    MachO::section Sect = getSection(Sec);
+    Offset = Sect.reloff;
+    Num = Sect.nreloc;
   }
 
-  const macho::RelocationEntry *P =
-    reinterpret_cast<const macho::RelocationEntry*>(getPtr(this, Offset));
+  const MachO::any_relocation_info *P =
+    reinterpret_cast<const MachO::any_relocation_info *>(getPtr(this, Offset));
 
   DataRefImpl Ret;
   Ret.p = reinterpret_cast<uintptr_t>(P + Num);
@@ -853,8 +836,8 @@ MachOObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
 
 error_code MachOObjectFile::getRelocationNext(DataRefImpl Rel,
                                               RelocationRef &Res) const {
-  const macho::RelocationEntry *P =
-    reinterpret_cast<const macho::RelocationEntry *>(Rel.p);
+  const MachO::any_relocation_info *P =
+    reinterpret_cast<const MachO::any_relocation_info *>(Rel.p);
   Rel.p = reinterpret_cast<uintptr_t>(P + 1);
   Res = RelocationRef(Rel, this);
   return object_error::success;
@@ -867,24 +850,24 @@ MachOObjectFile::getRelocationAddress(DataRefImpl Rel, uint64_t &Res) const {
 
 error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel,
                                                 uint64_t &Res) const {
-  macho::RelocationEntry RE = getRelocation(Rel);
+  MachO::any_relocation_info RE = getRelocation(Rel);
   Res = getAnyRelocationAddress(RE);
   return object_error::success;
 }
 
 symbol_iterator
 MachOObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
-  macho::RelocationEntry RE = getRelocation(Rel);
+  MachO::any_relocation_info RE = getRelocation(Rel);
   uint32_t SymbolIdx = getPlainRelocationSymbolNum(RE);
   bool isExtern = getPlainRelocationExternal(RE);
   if (!isExtern)
     return end_symbols();
 
-  macho::SymtabLoadCommand S = getSymtabLoadCommand();
+  MachO::symtab_command S = getSymtabLoadCommand();
   unsigned SymbolTableEntrySize = is64Bit() ?
-    sizeof(macho::Symbol64TableEntry) :
-    sizeof(macho::SymbolTableEntry);
-  uint64_t Offset = S.SymbolTableOffset + SymbolIdx * SymbolTableEntrySize;
+    sizeof(MachO::nlist_64) :
+    sizeof(MachO::nlist);
+  uint64_t Offset = S.symoff + SymbolIdx * SymbolTableEntrySize;
   DataRefImpl Sym;
   Sym.p = reinterpret_cast<uintptr_t>(getPtr(this, Offset));
   return symbol_iterator(SymbolRef(Sym, this));
@@ -892,7 +875,7 @@ MachOObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
 
 error_code MachOObjectFile::getRelocationType(DataRefImpl Rel,
                                               uint64_t &Res) const {
-  macho::RelocationEntry RE = getRelocation(Rel);
+  MachO::any_relocation_info RE = getRelocation(Rel);
   Res = getAnyRelocationType(RE);
   return object_error::success;
 }
@@ -993,7 +976,7 @@ MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
 error_code
 MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
                                           SmallVectorImpl<char> &Result) const {
-  macho::RelocationEntry RE = getRelocation(Rel);
+  MachO::any_relocation_info RE = getRelocation(Rel);
 
   unsigned Arch = this->getArch();
 
@@ -1010,47 +993,47 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
     bool isPCRel = getAnyRelocationPCRel(RE);
 
     switch (Type) {
-      case macho::RIT_X86_64_GOTLoad:   // X86_64_RELOC_GOT_LOAD
-      case macho::RIT_X86_64_GOT: {     // X86_64_RELOC_GOT
+      case MachO::X86_64_RELOC_GOT_LOAD:
+      case MachO::X86_64_RELOC_GOT: {
         printRelocationTargetName(this, RE, fmt);
         fmt << "@GOT";
         if (isPCRel) fmt << "PCREL";
         break;
       }
-      case macho::RIT_X86_64_Subtractor: { // X86_64_RELOC_SUBTRACTOR
+      case MachO::X86_64_RELOC_SUBTRACTOR: {
         DataRefImpl RelNext = Rel;
         RelNext.d.a++;
-        macho::RelocationEntry RENext = getRelocation(RelNext);
+        MachO::any_relocation_info RENext = getRelocation(RelNext);
 
-        // X86_64_SUBTRACTOR must be followed by a relocation of type
+        // X86_64_RELOC_SUBTRACTOR must be followed by a relocation of type
         // X86_64_RELOC_UNSIGNED.
         // NOTE: Scattered relocations don't exist on x86_64.
         unsigned RType = getAnyRelocationType(RENext);
-        if (RType != 0)
+        if (RType != MachO::X86_64_RELOC_UNSIGNED)
           report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
                              "X86_64_RELOC_SUBTRACTOR.");
 
-        // The X86_64_RELOC_UNSIGNED contains the minuend symbol,
-        // X86_64_SUBTRACTOR contains to the subtrahend.
+        // The X86_64_RELOC_UNSIGNED contains the minuend symbol;
+        // X86_64_RELOC_SUBTRACTOR contains the subtrahend.
         printRelocationTargetName(this, RENext, fmt);
         fmt << "-";
         printRelocationTargetName(this, RE, fmt);
         break;
       }
-      case macho::RIT_X86_64_TLV:
+      case MachO::X86_64_RELOC_TLV:
         printRelocationTargetName(this, RE, fmt);
         fmt << "@TLV";
         if (isPCRel) fmt << "P";
         break;
-      case macho::RIT_X86_64_Signed1: // X86_64_RELOC_SIGNED1
+      case MachO::X86_64_RELOC_SIGNED_1:
         printRelocationTargetName(this, RE, fmt);
         fmt << "-1";
         break;
-      case macho::RIT_X86_64_Signed2: // X86_64_RELOC_SIGNED2
+      case MachO::X86_64_RELOC_SIGNED_2:
         printRelocationTargetName(this, RE, fmt);
         fmt << "-2";
         break;
-      case macho::RIT_X86_64_Signed4: // X86_64_RELOC_SIGNED4
+      case MachO::X86_64_RELOC_SIGNED_4:
         printRelocationTargetName(this, RE, fmt);
         fmt << "-4";
         break;
@@ -1059,21 +1042,22 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
         break;
     }
   // X86 and ARM share some relocation types in common.
-  } else if (Arch == Triple::x86 || Arch == Triple::arm) {
+  } else if (Arch == Triple::x86 || Arch == Triple::arm ||
+             Arch == Triple::ppc) {
     // Generic relocation types...
     switch (Type) {
-      case macho::RIT_Pair: // GENERIC_RELOC_PAIR - prints no info
+      case MachO::GENERIC_RELOC_PAIR: // prints no info
         return object_error::success;
-      case macho::RIT_Difference: { // GENERIC_RELOC_SECTDIFF
+      case MachO::GENERIC_RELOC_SECTDIFF: {
         DataRefImpl RelNext = Rel;
         RelNext.d.a++;
-        macho::RelocationEntry RENext = getRelocation(RelNext);
+        MachO::any_relocation_info RENext = getRelocation(RelNext);
 
         // X86 sect diff's must be followed by a relocation of type
         // GENERIC_RELOC_PAIR.
         unsigned RType = getAnyRelocationType(RENext);
 
-        if (RType != 1)
+        if (RType != MachO::GENERIC_RELOC_PAIR)
           report_fatal_error("Expected GENERIC_RELOC_PAIR after "
                              "GENERIC_RELOC_SECTDIFF.");
 
@@ -1084,19 +1068,17 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
       }
     }
 
-    if (Arch == Triple::x86) {
-      // All X86 relocations that need special printing were already
-      // handled in the generic code.
+    if (Arch == Triple::x86 || Arch == Triple::ppc) {
       switch (Type) {
-        case macho::RIT_Generic_LocalDifference:{// GENERIC_RELOC_LOCAL_SECTDIFF
+        case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: {
           DataRefImpl RelNext = Rel;
           RelNext.d.a++;
-          macho::RelocationEntry RENext = getRelocation(RelNext);
+          MachO::any_relocation_info RENext = getRelocation(RelNext);
 
           // X86 sect diff's must be followed by a relocation of type
           // GENERIC_RELOC_PAIR.
           unsigned RType = getAnyRelocationType(RENext);
-          if (RType != 1)
+          if (RType != MachO::GENERIC_RELOC_PAIR)
             report_fatal_error("Expected GENERIC_RELOC_PAIR after "
                                "GENERIC_RELOC_LOCAL_SECTDIFF.");
 
@@ -1105,7 +1087,7 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
           printRelocationTargetName(this, RENext, fmt);
           break;
         }
-        case macho::RIT_Generic_TLV: {
+        case MachO::GENERIC_RELOC_TLV: {
           printRelocationTargetName(this, RE, fmt);
           fmt << "@TLV";
           if (IsPCRel) fmt << "P";
@@ -1116,8 +1098,8 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
       }
     } else { // ARM-specific relocations
       switch (Type) {
-        case macho::RIT_ARM_Half:             // ARM_RELOC_HALF
-        case macho::RIT_ARM_HalfDifference: { // ARM_RELOC_HALF_SECTDIFF
+        case MachO::ARM_RELOC_HALF:
+        case MachO::ARM_RELOC_HALF_SECTDIFF: {
           // Half relocations steal a bit from the length field to encode
           // whether this is an upper16 or a lower16 relocation.
           bool isUpper = getAnyRelocationLength(RE) >> 1;
@@ -1130,14 +1112,14 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
 
           DataRefImpl RelNext = Rel;
           RelNext.d.a++;
-          macho::RelocationEntry RENext = getRelocation(RelNext);
+          MachO::any_relocation_info RENext = getRelocation(RelNext);
 
           // ARM half relocs must be followed by a relocation of type
           // ARM_RELOC_PAIR.
           unsigned RType = getAnyRelocationType(RENext);
-          if (RType != 1)
+          if (RType != MachO::ARM_RELOC_PAIR)
             report_fatal_error("Expected ARM_RELOC_PAIR after "
-                               "GENERIC_RELOC_HALF");
+                               "ARM_RELOC_HALF");
 
           // NOTE: The half of the target virtual address is stashed in the
           // address field of the secondary relocation, but we can't reverse
@@ -1146,7 +1128,7 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
 
           // ARM_RELOC_HALF_SECTDIFF encodes the second section in the
           // symbol/section pointer of the follow-on relocation.
-          if (Type == macho::RIT_ARM_HalfDifference) {
+          if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) {
             fmt << "-";
             printRelocationTargetName(this, RENext, fmt);
           }
@@ -1177,17 +1159,17 @@ MachOObjectFile::getRelocationHidden(DataRefImpl Rel, bool &Result) const {
 
   // On arches that use the generic relocations, GENERIC_RELOC_PAIR
   // is always hidden.
-  if (Arch == Triple::x86 || Arch == Triple::arm) {
-    if (Type == macho::RIT_Pair) Result = true;
+  if (Arch == Triple::x86 || Arch == Triple::arm || Arch == Triple::ppc) {
+    if (Type == MachO::GENERIC_RELOC_PAIR) Result = true;
   } else if (Arch == Triple::x86_64) {
     // On x86_64, X86_64_RELOC_UNSIGNED is hidden only when it follows
     // an X86_64_RELOC_SUBTRACTOR.
-    if (Type == macho::RIT_X86_64_Unsigned && Rel.d.a > 0) {
+    if (Type == MachO::X86_64_RELOC_UNSIGNED && Rel.d.a > 0) {
       DataRefImpl RelPrev = Rel;
       RelPrev.d.a--;
       uint64_t PrevType;
       getRelocationType(RelPrev, PrevType);
-      if (PrevType == macho::RIT_X86_64_Subtractor)
+      if (PrevType == MachO::X86_64_RELOC_SUBTRACTOR)
         Result = true;
     }
   }
@@ -1210,8 +1192,8 @@ symbol_iterator MachOObjectFile::begin_symbols() const {
   if (!SymtabLoadCmd)
     return symbol_iterator(SymbolRef(DRI, this));
 
-  macho::SymtabLoadCommand Symtab = getSymtabLoadCommand();
-  DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Symtab.SymbolTableOffset));
+  MachO::symtab_command Symtab = getSymtabLoadCommand();
+  DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Symtab.symoff));
   return symbol_iterator(SymbolRef(DRI, this));
 }
 
@@ -1220,12 +1202,12 @@ symbol_iterator MachOObjectFile::end_symbols() const {
   if (!SymtabLoadCmd)
     return symbol_iterator(SymbolRef(DRI, this));
 
-  macho::SymtabLoadCommand Symtab = getSymtabLoadCommand();
+  MachO::symtab_command Symtab = getSymtabLoadCommand();
   unsigned SymbolTableEntrySize = is64Bit() ?
-    sizeof(macho::Symbol64TableEntry) :
-    sizeof(macho::SymbolTableEntry);
-  unsigned Offset = Symtab.SymbolTableOffset +
-    Symtab.NumSymbolTableEntries * SymbolTableEntrySize;
+    sizeof(MachO::nlist_64) :
+    sizeof(MachO::nlist);
+  unsigned Offset = Symtab.symoff +
+    Symtab.nsyms * SymbolTableEntrySize;
   DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Offset));
   return symbol_iterator(SymbolRef(DRI, this));
 }
@@ -1269,28 +1251,28 @@ StringRef MachOObjectFile::getFileFormatName() const {
   unsigned CPUType = getCPUType(this);
   if (!is64Bit()) {
     switch (CPUType) {
-    case llvm::MachO::CPUTypeI386:
+    case llvm::MachO::CPU_TYPE_I386:
       return "Mach-O 32-bit i386";
-    case llvm::MachO::CPUTypeARM:
+    case llvm::MachO::CPU_TYPE_ARM:
       return "Mach-O arm";
-    case llvm::MachO::CPUTypePowerPC:
+    case llvm::MachO::CPU_TYPE_POWERPC:
       return "Mach-O 32-bit ppc";
     default:
-      assert((CPUType & llvm::MachO::CPUArchABI64) == 0 &&
+      assert((CPUType & llvm::MachO::CPU_ARCH_ABI64) == 0 &&
              "64-bit object file when we're not 64-bit?");
       return "Mach-O 32-bit unknown";
     }
   }
 
   // Make sure the cpu type has the correct mask.
-  assert((CPUType & llvm::MachO::CPUArchABI64)
-         == llvm::MachO::CPUArchABI64 &&
+  assert((CPUType & llvm::MachO::CPU_ARCH_ABI64)
+         == llvm::MachO::CPU_ARCH_ABI64 &&
          "32-bit object file when we're 64-bit?");
 
   switch (CPUType) {
-  case llvm::MachO::CPUTypeX86_64:
+  case llvm::MachO::CPU_TYPE_X86_64:
     return "Mach-O 64-bit x86-64";
-  case llvm::MachO::CPUTypePowerPC64:
+  case llvm::MachO::CPU_TYPE_POWERPC64:
     return "Mach-O 64-bit ppc64";
   default:
     return "Mach-O 64-bit unknown";
@@ -1299,15 +1281,15 @@ StringRef MachOObjectFile::getFileFormatName() const {
 
 Triple::ArchType MachOObjectFile::getArch(uint32_t CPUType) {
   switch (CPUType) {
-  case llvm::MachO::CPUTypeI386:
+  case llvm::MachO::CPU_TYPE_I386:
     return Triple::x86;
-  case llvm::MachO::CPUTypeX86_64:
+  case llvm::MachO::CPU_TYPE_X86_64:
     return Triple::x86_64;
-  case llvm::MachO::CPUTypeARM:
+  case llvm::MachO::CPU_TYPE_ARM:
     return Triple::arm;
-  case llvm::MachO::CPUTypePowerPC:
+  case llvm::MachO::CPU_TYPE_POWERPC:
     return Triple::ppc;
-  case llvm::MachO::CPUTypePowerPC64:
+  case llvm::MachO::CPU_TYPE_POWERPC64:
     return Triple::ppc64;
   default:
     return Triple::UnknownArch;
@@ -1323,16 +1305,16 @@ StringRef MachOObjectFile::getLoadName() const {
   report_fatal_error("get_load_name() unimplemented in MachOObjectFile");
 }
 
-relocation_iterator MachOObjectFile::getSectionRelBegin(unsigned Index) const {
+relocation_iterator MachOObjectFile::section_rel_begin(unsigned Index) const {
   DataRefImpl DRI;
   DRI.d.a = Index;
-  return getSectionRelBegin(DRI);
+  return section_rel_begin(DRI);
 }
 
-relocation_iterator MachOObjectFile::getSectionRelEnd(unsigned Index) const {
+relocation_iterator MachOObjectFile::section_rel_end(unsigned Index) const {
   DataRefImpl DRI;
   DRI.d.a = Index;
-  return getSectionRelEnd(DRI);
+  return section_rel_end(DRI);
 }
 
 dice_iterator MachOObjectFile::begin_dices() const {
@@ -1340,8 +1322,8 @@ dice_iterator MachOObjectFile::begin_dices() const {
   if (!DataInCodeLoadCmd)
     return dice_iterator(DiceRef(DRI, this));
 
-  macho::LinkeditDataLoadCommand DicLC = getDataInCodeLoadCommand();
-  DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, DicLC.DataOffset));
+  MachO::linkedit_data_command DicLC = getDataInCodeLoadCommand();
+  DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, DicLC.dataoff));
   return dice_iterator(DiceRef(DRI, this));
 }
 
@@ -1350,8 +1332,8 @@ dice_iterator MachOObjectFile::end_dices() const {
   if (!DataInCodeLoadCmd)
     return dice_iterator(DiceRef(DRI, this));
 
-  macho::LinkeditDataLoadCommand DicLC = getDataInCodeLoadCommand();
-  unsigned Offset = DicLC.DataOffset + DicLC.DataSize;
+  MachO::linkedit_data_command DicLC = getDataInCodeLoadCommand();
+  unsigned Offset = DicLC.dataoff + DicLC.datasize;
   DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Offset));
   return dice_iterator(DiceRef(DRI, this));
 }
@@ -1364,80 +1346,82 @@ MachOObjectFile::getSectionFinalSegmentName(DataRefImpl Sec) const {
 
 ArrayRef<char>
 MachOObjectFile::getSectionRawName(DataRefImpl Sec) const {
-  const SectionBase *Base =
-    reinterpret_cast<const SectionBase*>(Sections[Sec.d.a]);
-  return ArrayRef<char>(Base->Name);
+  const section_base *Base =
+    reinterpret_cast<const section_base *>(Sections[Sec.d.a]);
+  return ArrayRef<char>(Base->sectname);
 }
 
 ArrayRef<char>
 MachOObjectFile::getSectionRawFinalSegmentName(DataRefImpl Sec) const {
-  const SectionBase *Base =
-    reinterpret_cast<const SectionBase*>(Sections[Sec.d.a]);
-  return ArrayRef<char>(Base->SegmentName);
+  const section_base *Base =
+    reinterpret_cast<const section_base *>(Sections[Sec.d.a]);
+  return ArrayRef<char>(Base->segname);
 }
 
 bool
-MachOObjectFile::isRelocationScattered(const macho::RelocationEntry &RE)
+MachOObjectFile::isRelocationScattered(const MachO::any_relocation_info &RE)
   const {
-  if (getCPUType(this) == llvm::MachO::CPUTypeX86_64)
+  if (getCPUType(this) == MachO::CPU_TYPE_X86_64)
     return false;
-  return getPlainRelocationAddress(RE) & macho::RF_Scattered;
+  return getPlainRelocationAddress(RE) & MachO::R_SCATTERED;
 }
 
 unsigned MachOObjectFile::getPlainRelocationSymbolNum(
-    const macho::RelocationEntry &RE) const {
+    const MachO::any_relocation_info &RE) const {
   if (isLittleEndian())
-    return RE.Word1 & 0xffffff;
-  return RE.Word1 >> 8;
+    return RE.r_word1 & 0xffffff;
+  return RE.r_word1 >> 8;
 }
 
 bool MachOObjectFile::getPlainRelocationExternal(
-    const macho::RelocationEntry &RE) const {
+    const MachO::any_relocation_info &RE) const {
   if (isLittleEndian())
-    return (RE.Word1 >> 27) & 1;
-  return (RE.Word1 >> 4) & 1;
+    return (RE.r_word1 >> 27) & 1;
+  return (RE.r_word1 >> 4) & 1;
 }
 
 bool MachOObjectFile::getScatteredRelocationScattered(
-    const macho::RelocationEntry &RE) const {
-  return RE.Word0 >> 31;
+    const MachO::any_relocation_info &RE) const {
+  return RE.r_word0 >> 31;
 }
 
 uint32_t MachOObjectFile::getScatteredRelocationValue(
-    const macho::RelocationEntry &RE) const {
-  return RE.Word1;
+    const MachO::any_relocation_info &RE) const {
+  return RE.r_word1;
 }
 
 unsigned MachOObjectFile::getAnyRelocationAddress(
-    const macho::RelocationEntry &RE) const {
+    const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE))
     return getScatteredRelocationAddress(RE);
   return getPlainRelocationAddress(RE);
 }
 
-unsigned
-MachOObjectFile::getAnyRelocationPCRel(const macho::RelocationEntry &RE) const {
+unsigned MachOObjectFile::getAnyRelocationPCRel(
+    const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE))
     return getScatteredRelocationPCRel(this, RE);
   return getPlainRelocationPCRel(this, RE);
 }
 
 unsigned MachOObjectFile::getAnyRelocationLength(
-    const macho::RelocationEntry &RE) const {
+    const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE))
     return getScatteredRelocationLength(RE);
   return getPlainRelocationLength(this, RE);
 }
 
 unsigned
-MachOObjectFile::getAnyRelocationType(const macho::RelocationEntry &RE) const {
+MachOObjectFile::getAnyRelocationType(
+                                   const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE))
     return getScatteredRelocationType(RE);
   return getPlainRelocationType(this, RE);
 }
 
 SectionRef
-MachOObjectFile::getRelocationSection(const macho::RelocationEntry &RE) const {
+MachOObjectFile::getRelocationSection(
+                                   const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE) || getPlainRelocationExternal(RE))
     return *end_sections();
   unsigned SecNum = getPlainRelocationSymbolNum(RE) - 1;
@@ -1450,133 +1434,132 @@ MachOObjectFile::LoadCommandInfo
 MachOObjectFile::getFirstLoadCommandInfo() const {
   MachOObjectFile::LoadCommandInfo Load;
 
-  unsigned HeaderSize = is64Bit() ? macho::Header64Size : macho::Header32Size;
+  unsigned HeaderSize = is64Bit() ? sizeof(MachO::mach_header_64) :
+                                    sizeof(MachO::mach_header);
   Load.Ptr = getPtr(this, HeaderSize);
-  Load.C = getStruct<macho::LoadCommand>(this, Load.Ptr);
+  Load.C = getStruct<MachO::load_command>(this, Load.Ptr);
   return Load;
 }
 
 MachOObjectFile::LoadCommandInfo
 MachOObjectFile::getNextLoadCommandInfo(const LoadCommandInfo &L) const {
   MachOObjectFile::LoadCommandInfo Next;
-  Next.Ptr = L.Ptr + L.C.Size;
-  Next.C = getStruct<macho::LoadCommand>(this, Next.Ptr);
+  Next.Ptr = L.Ptr + L.C.cmdsize;
+  Next.C = getStruct<MachO::load_command>(this, Next.Ptr);
   return Next;
 }
 
-macho::Section MachOObjectFile::getSection(DataRefImpl DRI) const {
-  return getStruct<macho::Section>(this, Sections[DRI.d.a]);
+MachO::section MachOObjectFile::getSection(DataRefImpl DRI) const {
+  return getStruct<MachO::section>(this, Sections[DRI.d.a]);
 }
 
-macho::Section64 MachOObjectFile::getSection64(DataRefImpl DRI) const {
-  return getStruct<macho::Section64>(this, Sections[DRI.d.a]);
+MachO::section_64 MachOObjectFile::getSection64(DataRefImpl DRI) const {
+  return getStruct<MachO::section_64>(this, Sections[DRI.d.a]);
 }
 
-macho::Section MachOObjectFile::getSection(const LoadCommandInfo &L,
+MachO::section MachOObjectFile::getSection(const LoadCommandInfo &L,
                                            unsigned Index) const {
   const char *Sec = getSectionPtr(this, L, Index);
-  return getStruct<macho::Section>(this, Sec);
+  return getStruct<MachO::section>(this, Sec);
 }
 
-macho::Section64 MachOObjectFile::getSection64(const LoadCommandInfo &L,
-                                               unsigned Index) const {
+MachO::section_64 MachOObjectFile::getSection64(const LoadCommandInfo &L,
+                                                unsigned Index) const {
   const char *Sec = getSectionPtr(this, L, Index);
-  return getStruct<macho::Section64>(this, Sec);
+  return getStruct<MachO::section_64>(this, Sec);
 }
 
-macho::SymbolTableEntry
+MachO::nlist
 MachOObjectFile::getSymbolTableEntry(DataRefImpl DRI) const {
   const char *P = reinterpret_cast<const char *>(DRI.p);
-  return getStruct<macho::SymbolTableEntry>(this, P);
+  return getStruct<MachO::nlist>(this, P);
 }
 
-macho::Symbol64TableEntry
+MachO::nlist_64
 MachOObjectFile::getSymbol64TableEntry(DataRefImpl DRI) const {
   const char *P = reinterpret_cast<const char *>(DRI.p);
-  return getStruct<macho::Symbol64TableEntry>(this, P);
+  return getStruct<MachO::nlist_64>(this, P);
 }
 
-macho::LinkeditDataLoadCommand MachOObjectFile::getLinkeditDataLoadCommand(
-    const MachOObjectFile::LoadCommandInfo &L) const {
-  return getStruct<macho::LinkeditDataLoadCommand>(this, L.Ptr);
+MachO::linkedit_data_command
+MachOObjectFile::getLinkeditDataLoadCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::linkedit_data_command>(this, L.Ptr);
 }
 
-macho::SegmentLoadCommand
+MachO::segment_command
 MachOObjectFile::getSegmentLoadCommand(const LoadCommandInfo &L) const {
-  return getStruct<macho::SegmentLoadCommand>(this, L.Ptr);
+  return getStruct<MachO::segment_command>(this, L.Ptr);
 }
 
-macho::Segment64LoadCommand
+MachO::segment_command_64
 MachOObjectFile::getSegment64LoadCommand(const LoadCommandInfo &L) const {
-  return getStruct<macho::Segment64LoadCommand>(this, L.Ptr);
+  return getStruct<MachO::segment_command_64>(this, L.Ptr);
 }
 
-macho::LinkerOptionsLoadCommand
+MachO::linker_options_command
 MachOObjectFile::getLinkerOptionsLoadCommand(const LoadCommandInfo &L) const {
-  return getStruct<macho::LinkerOptionsLoadCommand>(this, L.Ptr);
+  return getStruct<MachO::linker_options_command>(this, L.Ptr);
 }
 
-macho::RelocationEntry
+MachO::any_relocation_info
 MachOObjectFile::getRelocation(DataRefImpl Rel) const {
   const char *P = reinterpret_cast<const char *>(Rel.p);
-  return getStruct<macho::RelocationEntry>(this, P);
+  return getStruct<MachO::any_relocation_info>(this, P);
 }
 
-macho::DataInCodeTableEntry
+MachO::data_in_code_entry
 MachOObjectFile::getDice(DataRefImpl Rel) const {
   const char *P = reinterpret_cast<const char *>(Rel.p);
-  return getStruct<macho::DataInCodeTableEntry>(this, P);
+  return getStruct<MachO::data_in_code_entry>(this, P);
 }
 
-macho::Header MachOObjectFile::getHeader() const {
-  return getStruct<macho::Header>(this, getPtr(this, 0));
+MachO::mach_header MachOObjectFile::getHeader() const {
+  return getStruct<MachO::mach_header>(this, getPtr(this, 0));
 }
 
-macho::Header64Ext MachOObjectFile::getHeader64Ext() const {
-  return
-    getStruct<macho::Header64Ext>(this, getPtr(this, sizeof(macho::Header)));
+MachO::mach_header_64 MachOObjectFile::getHeader64() const {
+  return getStruct<MachO::mach_header_64>(this, getPtr(this, 0));
 }
 
-macho::IndirectSymbolTableEntry MachOObjectFile::getIndirectSymbolTableEntry(
-                                          const macho::DysymtabLoadCommand &DLC,
-                                          unsigned Index) const {
-  uint64_t Offset = DLC.IndirectSymbolTableOffset +
-    Index * sizeof(macho::IndirectSymbolTableEntry);
-  return getStruct<macho::IndirectSymbolTableEntry>(this, getPtr(this, Offset));
+uint32_t MachOObjectFile::getIndirectSymbolTableEntry(
+                                             const MachO::dysymtab_command &DLC,
+                                             unsigned Index) const {
+  uint64_t Offset = DLC.indirectsymoff + Index * sizeof(uint32_t);
+  return getStruct<uint32_t>(this, getPtr(this, Offset));
 }
 
-macho::DataInCodeTableEntry
+MachO::data_in_code_entry
 MachOObjectFile::getDataInCodeTableEntry(uint32_t DataOffset,
                                          unsigned Index) const {
-  uint64_t Offset = DataOffset + Index * sizeof(macho::DataInCodeTableEntry);
-  return getStruct<macho::DataInCodeTableEntry>(this, getPtr(this, Offset));
+  uint64_t Offset = DataOffset + Index * sizeof(MachO::data_in_code_entry);
+  return getStruct<MachO::data_in_code_entry>(this, getPtr(this, Offset));
 }
 
-macho::SymtabLoadCommand MachOObjectFile::getSymtabLoadCommand() const {
-  return getStruct<macho::SymtabLoadCommand>(this, SymtabLoadCmd);
+MachO::symtab_command MachOObjectFile::getSymtabLoadCommand() const {
+  return getStruct<MachO::symtab_command>(this, SymtabLoadCmd);
 }
 
-macho::DysymtabLoadCommand MachOObjectFile::getDysymtabLoadCommand() const {
-  return getStruct<macho::DysymtabLoadCommand>(this, DysymtabLoadCmd);
+MachO::dysymtab_command MachOObjectFile::getDysymtabLoadCommand() const {
+  return getStruct<MachO::dysymtab_command>(this, DysymtabLoadCmd);
 }
 
-macho::LinkeditDataLoadCommand
+MachO::linkedit_data_command
 MachOObjectFile::getDataInCodeLoadCommand() const {
   if (DataInCodeLoadCmd)
-    return getStruct<macho::LinkeditDataLoadCommand>(this, DataInCodeLoadCmd);
+    return getStruct<MachO::linkedit_data_command>(this, DataInCodeLoadCmd);
 
   // If there is no DataInCodeLoadCmd return a load command with zero'ed fields.
-  macho::LinkeditDataLoadCommand Cmd;
-  Cmd.Type = macho::LCT_DataInCode;
-  Cmd.Size = macho::LinkeditLoadCommandSize;
-  Cmd.DataOffset = 0;
-  Cmd.DataSize = 0;
+  MachO::linkedit_data_command Cmd;
+  Cmd.cmd = MachO::LC_DATA_IN_CODE;
+  Cmd.cmdsize = sizeof(MachO::linkedit_data_command);
+  Cmd.dataoff = 0;
+  Cmd.datasize = 0;
   return Cmd;
 }
 
 StringRef MachOObjectFile::getStringTableData() const {
-  macho::SymtabLoadCommand S = getSymtabLoadCommand();
-  return getData().substr(S.StringTableOffset, S.StringTableSize);
+  MachO::symtab_command S = getSymtabLoadCommand();
+  return getData().substr(S.stroff, S.strsize);
 }
 
 bool MachOObjectFile::is64Bit() const {
diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp
index b76f10e..75160af 100644
--- a/lib/Object/MachOUniversal.cpp
+++ b/lib/Object/MachOUniversal.cpp
@@ -31,18 +31,18 @@ template<typename T>
 static void SwapStruct(T &Value);
 
 template<>
-void SwapStruct(macho::FatHeader &H) {
-  SwapValue(H.Magic);
-  SwapValue(H.NumFatArch);
+void SwapStruct(MachO::fat_header &H) {
+  SwapValue(H.magic);
+  SwapValue(H.nfat_arch);
 }
 
 template<>
-void SwapStruct(macho::FatArchHeader &H) {
-  SwapValue(H.CPUType);
-  SwapValue(H.CPUSubtype);
-  SwapValue(H.Offset);
-  SwapValue(H.Size);
-  SwapValue(H.Align);
+void SwapStruct(MachO::fat_arch &H) {
+  SwapValue(H.cputype);
+  SwapValue(H.cpusubtype);
+  SwapValue(H.offset);
+  SwapValue(H.size);
+  SwapValue(H.align);
 }
 
 template<typename T>
@@ -63,10 +63,10 @@ MachOUniversalBinary::ObjectForArch::ObjectForArch(
   } else {
     // Parse object header.
     StringRef ParentData = Parent->getData();
-    const char *HeaderPos = ParentData.begin() + macho::FatHeaderSize +
-                            Index * macho::FatArchHeaderSize;
-    Header = getUniversalBinaryStruct<macho::FatArchHeader>(HeaderPos);
-    if (ParentData.size() < Header.Offset + Header.Size) {
+    const char *HeaderPos = ParentData.begin() + sizeof(MachO::fat_header) +
+                            Index * sizeof(MachO::fat_arch);
+    Header = getUniversalBinaryStruct<MachO::fat_arch>(HeaderPos);
+    if (ParentData.size() < Header.offset + Header.size) {
       clear();
     }
   }
@@ -76,10 +76,10 @@ error_code MachOUniversalBinary::ObjectForArch::getAsObjectFile(
     OwningPtr<ObjectFile> &Result) const {
   if (Parent) {
     StringRef ParentData = Parent->getData();
-    StringRef ObjectData = ParentData.substr(Header.Offset, Header.Size);
+    StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
     std::string ObjectName =
         Parent->getFileName().str() + ":" +
-        Triple::getArchTypeName(MachOObjectFile::getArch(Header.CPUType));
+        Triple::getArchTypeName(MachOObjectFile::getArch(Header.cputype));
     MemoryBuffer *ObjBuffer = MemoryBuffer::getMemBuffer(
         ObjectData, ObjectName, false);
     if (ObjectFile *Obj = ObjectFile::createMachOObjectFile(ObjBuffer)) {
@@ -96,31 +96,31 @@ MachOUniversalBinary::MachOUniversalBinary(MemoryBuffer *Source,
                                            error_code &ec)
   : Binary(Binary::ID_MachOUniversalBinary, Source),
     NumberOfObjects(0) {
-  if (Source->getBufferSize() < macho::FatHeaderSize) {
+  if (Source->getBufferSize() < sizeof(MachO::fat_header)) {
     ec = object_error::invalid_file_type;
     return;
   }
   // Check for magic value and sufficient header size.
   StringRef Buf = getData();
-  macho::FatHeader H = getUniversalBinaryStruct<macho::FatHeader>(Buf.begin());
-  NumberOfObjects = H.NumFatArch;
-  uint32_t MinSize = macho::FatHeaderSize +
-                     macho::FatArchHeaderSize * NumberOfObjects;
-  if (H.Magic != macho::HM_Universal || Buf.size() < MinSize) {
+  MachO::fat_header H= getUniversalBinaryStruct<MachO::fat_header>(Buf.begin());
+  NumberOfObjects = H.nfat_arch;
+  uint32_t MinSize = sizeof(MachO::fat_header) +
+                     sizeof(MachO::fat_arch) * NumberOfObjects;
+  if (H.magic != MachO::FAT_MAGIC || Buf.size() < MinSize) {
     ec = object_error::parse_failed;
     return;
   }
   ec = object_error::success;
 }
 
-static bool getCTMForArch(Triple::ArchType Arch, mach::CPUTypeMachine &CTM) {
+static bool getCTMForArch(Triple::ArchType Arch, MachO::CPUType &CTM) {
   switch (Arch) {
-    case Triple::x86:    CTM = mach::CTM_i386; return true;
-    case Triple::x86_64: CTM = mach::CTM_x86_64; return true;
-    case Triple::arm:    CTM = mach::CTM_ARM; return true;
-    case Triple::sparc:  CTM = mach::CTM_SPARC; return true;
-    case Triple::ppc:    CTM = mach::CTM_PowerPC; return true;
-    case Triple::ppc64:  CTM = mach::CTM_PowerPC64; return true;
+    case Triple::x86:    CTM = MachO::CPU_TYPE_I386; return true;
+    case Triple::x86_64: CTM = MachO::CPU_TYPE_X86_64; return true;
+    case Triple::arm:    CTM = MachO::CPU_TYPE_ARM; return true;
+    case Triple::sparc:  CTM = MachO::CPU_TYPE_SPARC; return true;
+    case Triple::ppc:    CTM = MachO::CPU_TYPE_POWERPC; return true;
+    case Triple::ppc64:  CTM = MachO::CPU_TYPE_POWERPC64; return true;
     default: return false;
   }
 }
@@ -128,7 +128,7 @@ static bool getCTMForArch(Triple::ArchType Arch, mach::CPUTypeMachine &CTM) {
 error_code
 MachOUniversalBinary::getObjectForArch(Triple::ArchType Arch,
                                        OwningPtr<ObjectFile> &Result) const {
-  mach::CPUTypeMachine CTM;
+  MachO::CPUType CTM;
   if (!getCTMForArch(Arch, CTM))
     return object_error::arch_not_found;
   for (object_iterator I = begin_objects(), E = end_objects(); I != E; ++I) {
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index 1d1dafd..0e626d6 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -49,6 +49,7 @@ ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) {
   case sys::fs::file_magic::bitcode:
   case sys::fs::file_magic::archive:
   case sys::fs::file_magic::macho_universal_binary:
+  case sys::fs::file_magic::windows_resource:
     delete Object;
     return 0;
   case sys::fs::file_magic::elf_relocatable:
@@ -68,6 +69,7 @@ ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) {
   case sys::fs::file_magic::macho_dsym_companion:
     return createMachOObjectFile(Object);
   case sys::fs::file_magic::coff_object:
+  case sys::fs::file_magic::coff_import_library:
   case sys::fs::file_magic::pecoff_executable:
     return createCOFFObjectFile(Object);
   }
diff --git a/lib/Object/YAML.cpp b/lib/Object/YAML.cpp
index 21bacb8..c527bde 100644
--- a/lib/Object/YAML.cpp
+++ b/lib/Object/YAML.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Object/YAML.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cctype>
 
 using namespace llvm;
 using namespace object::yaml;
diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp
index 98e63bc..6fa459a 100644
--- a/lib/Option/OptTable.cpp
+++ b/lib/Option/OptTable.cpp
@@ -14,26 +14,27 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <cctype>
 #include <map>
 
 using namespace llvm;
 using namespace llvm::opt;
 
-// Ordering on Info. The ordering is *almost* lexicographic, with two
-// exceptions. First, '\0' comes at the end of the alphabet instead of
-// the beginning (thus options precede any other options which prefix
-// them). Second, for options with the same name, the less permissive
-// version should come first; a Flag option should precede a Joined
-// option, for example.
+namespace llvm {
+namespace opt {
 
-static int StrCmpOptionName(const char *A, const char *B) {
-  char a = *A, b = *B;
+// Ordering on Info. The ordering is *almost* case-insensitive lexicographic,
+// with an exceptions. '\0' comes at the end of the alphabet instead of the
+// beginning (thus options precede any other options which prefix them).
+static int StrCmpOptionNameIgnoreCase(const char *A, const char *B) {
+  const char *X = A, *Y = B;
+  char a = tolower(*A), b = tolower(*B);
   while (a == b) {
     if (a == '\0')
       return 0;
 
-    a = *++A;
-    b = *++B;
+    a = tolower(*++X);
+    b = tolower(*++Y);
   }
 
   if (a == '\0') // A is a prefix of B.
@@ -45,21 +46,25 @@ static int StrCmpOptionName(const char *A, const char *B) {
   return (a < b) ? -1 : 1;
 }
 
-namespace llvm {
-namespace opt {
+#ifndef NDEBUG
+static int StrCmpOptionName(const char *A, const char *B) {
+  if (int N = StrCmpOptionNameIgnoreCase(A, B))
+    return N;
+  return strcmp(A, B);
+}
 
 static inline bool operator<(const OptTable::Info &A, const OptTable::Info &B) {
   if (&A == &B)
     return false;
 
   if (int N = StrCmpOptionName(A.Name, B.Name))
-    return N == -1;
+    return N < 0;
 
   for (const char * const *APre = A.Prefixes,
                   * const *BPre = B.Prefixes;
                           *APre != 0 && *BPre != 0; ++APre, ++BPre) {
     if (int N = StrCmpOptionName(*APre, *BPre))
-      return N == -1;
+      return N < 0;
   }
 
   // Names are the same, check that classes are in order; exactly one
@@ -68,22 +73,22 @@ static inline bool operator<(const OptTable::Info &A, const OptTable::Info &B) {
          "Unexpected classes for options with same name.");
   return B.Kind == Option::JoinedClass;
 }
+#endif
 
 // Support lower_bound between info and an option name.
 static inline bool operator<(const OptTable::Info &I, const char *Name) {
-  return StrCmpOptionName(I.Name, Name) == -1;
-}
-static inline bool operator<(const char *Name, const OptTable::Info &I) {
-  return StrCmpOptionName(Name, I.Name) == -1;
+  return StrCmpOptionNameIgnoreCase(I.Name, Name) < 0;
 }
 }
 }
 
 OptSpecifier::OptSpecifier(const Option *Opt) : ID(Opt->getID()) {}
 
-OptTable::OptTable(const Info *_OptionInfos, unsigned _NumOptionInfos)
+OptTable::OptTable(const Info *_OptionInfos, unsigned _NumOptionInfos,
+                   bool _IgnoreCase)
   : OptionInfos(_OptionInfos),
     NumOptionInfos(_NumOptionInfos),
+    IgnoreCase(_IgnoreCase),
     TheInputOptionID(0),
     TheUnknownOptionID(0),
     FirstSearchableIndex(0)
@@ -171,11 +176,18 @@ static bool isInput(const llvm::StringSet<> &Prefixes, StringRef Arg) {
 }
 
 /// \returns Matched size. 0 means no match.
-static unsigned matchOption(const OptTable::Info *I, StringRef Str) {
+static unsigned matchOption(const OptTable::Info *I, StringRef Str,
+                            bool IgnoreCase) {
   for (const char * const *Pre = I->Prefixes; *Pre != 0; ++Pre) {
     StringRef Prefix(*Pre);
-    if (Str.startswith(Prefix) && Str.substr(Prefix.size()).startswith(I->Name))
-      return Prefix.size() + StringRef(I->Name).size();
+    if (Str.startswith(Prefix)) {
+      StringRef Rest = Str.substr(Prefix.size());
+      bool Matched = IgnoreCase
+          ? Rest.startswith_lower(I->Name)
+          : Rest.startswith(I->Name);
+      if (Matched)
+        return Prefix.size() + StringRef(I->Name).size();
+    }
   }
   return 0;
 }
@@ -210,7 +222,7 @@ Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index,
     unsigned ArgSize = 0;
     // Scan for first option which is a proper prefix.
     for (; Start != End; ++Start)
-      if ((ArgSize = matchOption(Start, Str)))
+      if ((ArgSize = matchOption(Start, Str, IgnoreCase)))
         break;
     if (Start == End)
       break;
@@ -259,20 +271,6 @@ InputArgList *OptTable::ParseArgs(const char *const *ArgBegin,
       continue;
     }
 
-    if (Str == "--") {
-      // Everything after -- is a filename.
-      ++Index;
-
-      assert(TheInputOptionID != 0 && "Invalid input option ID.");
-      while (Index < End) {
-        Args->append(new Arg(getOption(TheInputOptionID),
-                             Args->getArgString(Index), Index,
-                             Args->getArgString(Index)));
-        ++Index;
-      }
-      break;
-    }
-
     unsigned Prev = Index;
     Arg *A = ParseOneArg(*Args, Index, FlagsToInclude, FlagsToExclude);
     assert(Index > Prev && "Parser failed to consume argument.");
@@ -308,6 +306,7 @@ static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) {
     break;
 
   case Option::SeparateClass: case Option::JoinedOrSeparateClass:
+  case Option::RemainingArgsClass:
     Name += ' ';
     // FALLTHROUGH
   case Option::JoinedClass: case Option::CommaJoinedClass:
diff --git a/lib/Option/Option.cpp b/lib/Option/Option.cpp
index 1d6a3d3..7b5ff2b 100644
--- a/lib/Option/Option.cpp
+++ b/lib/Option/Option.cpp
@@ -52,6 +52,7 @@ void Option::dump() const {
     P(MultiArgClass);
     P(JoinedOrSeparateClass);
     P(JoinedAndSeparateClass);
+    P(RemainingArgsClass);
 #undef P
   }
 
@@ -214,6 +215,16 @@ Arg *Option::accept(const ArgList &Args,
     return new Arg(UnaliasedOption, Spelling, Index - 2,
                    Args.getArgString(Index - 2) + ArgSize,
                    Args.getArgString(Index - 1));
+  case RemainingArgsClass: {
+    // Matches iff this is an exact match.
+    // FIXME: Avoid strlen.
+    if (ArgSize != strlen(Args.getArgString(Index)))
+      return 0;
+    Arg *A = new Arg(UnaliasedOption, Spelling, Index++);
+    while (Index < Args.getNumInputArgStrings())
+      A->getValues().push_back(Args.getArgString(Index++));
+    return A;
+  }
   default:
     llvm_unreachable("Invalid option kind!");
   }
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 34bc6b6..676e2d4 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -3546,11 +3546,14 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
   // Set FormatPrecision if zero.  We want to do this before we
   // truncate trailing zeros, as those are part of the precision.
   if (!FormatPrecision) {
-    // It's an interesting question whether to use the nominal
-    // precision or the active precision here for denormals.
-
-    // FormatPrecision = ceil(significandBits / lg_2(10))
-    FormatPrecision = (semantics->precision * 59 + 195) / 196;
+    // We use enough digits so the number can be round-tripped back to an
+    // APFloat. The formula comes from "How to Print Floating-Point Numbers
+    // Accurately" by Steele and White.
+    // FIXME: Using a formula based purely on the precision is conservative;
+    // we can print fewer digits depending on the actual value being printed.
+
+    // FormatPrecision = 2 + floor(significandBits / lg_2(10))
+    FormatPrecision = 2 + semantics->precision * 59 / 196;
   }
 
   // Ignore trailing binary zeros.
diff --git a/lib/Support/Allocator.cpp b/lib/Support/Allocator.cpp
index 3c4191b..6e7a541 100644
--- a/lib/Support/Allocator.cpp
+++ b/lib/Support/Allocator.cpp
@@ -26,6 +26,10 @@ BumpPtrAllocator::BumpPtrAllocator(size_t size, size_t threshold,
     : SlabSize(size), SizeThreshold(std::min(size, threshold)),
       Allocator(allocator), CurSlab(0), BytesAllocated(0) { }
 
+BumpPtrAllocator::BumpPtrAllocator(size_t size, size_t threshold)
+    : SlabSize(size), SizeThreshold(std::min(size, threshold)),
+      Allocator(DefaultSlabAllocator), CurSlab(0), BytesAllocated(0) { }
+
 BumpPtrAllocator::~BumpPtrAllocator() {
   DeallocateSlabs(CurSlab);
 }
@@ -167,9 +171,6 @@ void BumpPtrAllocator::PrintStats() const {
          << " (includes alignment, etc)\n";
 }
 
-MallocSlabAllocator BumpPtrAllocator::DefaultSlabAllocator =
-  MallocSlabAllocator();
-
 SlabAllocator::~SlabAllocator() { }
 
 MallocSlabAllocator::~MallocSlabAllocator() { }
diff --git a/lib/Support/BlockFrequency.cpp b/lib/Support/BlockFrequency.cpp
index 5e45e46..00efe90 100644
--- a/lib/Support/BlockFrequency.cpp
+++ b/lib/Support/BlockFrequency.cpp
@@ -19,52 +19,69 @@
 using namespace llvm;
 
 /// Multiply FREQ by N and store result in W array.
-static void mult96bit(uint64_t freq, uint32_t N, uint64_t W[2]) {
+static void mult96bit(uint64_t freq, uint32_t N, uint32_t W[3]) {
   uint64_t u0 = freq & UINT32_MAX;
   uint64_t u1 = freq >> 32;
 
-  // Represent 96-bit value as w[2]:w[1]:w[0];
-  uint32_t w[3] = { 0, 0, 0 };
-
+  // Represent 96-bit value as W[2]:W[1]:W[0];
   uint64_t t = u0 * N;
   uint64_t k = t >> 32;
-  w[0] = t;
+  W[0] = t;
   t = u1 * N + k;
-  w[1] = t;
-  w[2] = t >> 32;
-
-  // W[1] - higher bits.
-  // W[0] - lower bits.
-  W[0] = w[0] + ((uint64_t) w[1] << 32);
-  W[1] = w[2];
+  W[1] = t;
+  W[2] = t >> 32;
 }
 
-
-/// Divide 96-bit value stored in W array by D.
-/// Return 64-bit quotient, saturated to UINT64_MAX on overflow.
-static uint64_t div96bit(uint64_t W[2], uint32_t D) {
-  uint64_t y = W[0];
-  uint64_t x = W[1];
-  unsigned i;
-
-  assert(x != 0 && "This is really a 64-bit division");
-
-  // This long division algorithm automatically saturates on overflow.
-  for (i = 0; i < 64 && x; ++i) {
-    uint32_t t = -((x >> 31) & 1); // Splat bit 31 to bits 0-31.
-    x = (x << 1) | (y >> 63);
-    y = y << 1;
-    if ((x | t) >= D) {
-      x -= D;
-      ++y;
+/// Divide 96-bit value stored in W[2]:W[1]:W[0] by D. Since our word size is a
+/// 32 bit unsigned integer, we can use a short division algorithm.
+static uint64_t divrem96bit(uint32_t W[3], uint32_t D, uint32_t *Rout) {
+  // We assume that W[2] is non-zero since if W[2] is not then the user should
+  // just use hardware division.
+  assert(W[2] && "This routine assumes that W[2] is non-zero since if W[2] is "
+         "zero, the caller should just use 64/32 hardware.");
+  uint32_t Q[3] = { 0, 0, 0 };
+
+  // The generalized short division algorithm sets i to m + n - 1, where n is
+  // the number of words in the divisior and m is the number of words by which
+  // the divident exceeds the divisor (i.e. m + n == the length of the dividend
+  // in words). Due to our assumption that W[2] is non-zero, we know that the
+  // dividend is of length 3 implying since n is 1 that m = 2. Thus we set i to
+  // m + n - 1 = 2 + 1 - 1 = 2.
+  uint32_t R = 0;
+  for (int i = 2; i >= 0; --i) {
+    uint64_t PartialD = uint64_t(R) << 32 | W[i];
+    if (PartialD == 0) {
+      Q[i] = 0;
+      R = 0;
+    } else if (PartialD < D) {
+      Q[i] = 0;
+      R = uint32_t(PartialD);
+    } else if (PartialD == D) {
+      Q[i] = 1;
+      R = 0;
+    } else {
+      Q[i] = uint32_t(PartialD / D);
+      R = uint32_t(PartialD - (Q[i] * D));
     }
   }
 
-  return y << (64 - i);
-}
+  // If Q[2] is non-zero, then we overflowed.
+  uint64_t Result;
+  if (Q[2]) {
+    Result = UINT64_MAX;
+    R = D;
+  } else {
+    // Form the final uint64_t result, avoiding endianness issues.
+    Result = uint64_t(Q[0]) | (uint64_t(Q[1]) << 32);
+  }
+
+  if (Rout)
+    *Rout = R;
 
+  return Result;
+}
 
-void BlockFrequency::scale(uint32_t N, uint32_t D) {
+uint32_t BlockFrequency::scale(uint32_t N, uint32_t D) {
   assert(D != 0 && "Division by zero");
 
   // Calculate Frequency * N.
@@ -75,15 +92,16 @@ void BlockFrequency::scale(uint32_t N, uint32_t D) {
   // If the product fits in 64 bits, just use built-in division.
   if (MulHi <= UINT32_MAX && MulRes >= MulLo) {
     Frequency = MulRes / D;
-    return;
+    return MulRes % D;
   }
 
   // Product overflowed, use 96-bit operations.
-  // 96-bit value represented as W[1]:W[0].
-  uint64_t W[2];
+  // 96-bit value represented as W[2]:W[1]:W[0].
+  uint32_t W[3];
+  uint32_t R;
   mult96bit(Frequency, N, W);
-  Frequency = div96bit(W, D);
-  return;
+  Frequency = divrem96bit(W, D, &R);
+  return R;
 }
 
 BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) {
@@ -127,6 +145,10 @@ BlockFrequency::operator+(const BlockFrequency &Prob) const {
   return Freq;
 }
 
+uint32_t BlockFrequency::scale(const BranchProbability &Prob) {
+  return scale(Prob.getNumerator(), Prob.getDenominator());
+}
+
 void BlockFrequency::print(raw_ostream &OS) const {
   // Convert fixed-point number to decimal.
   OS << Frequency / getEntryFrequency() << ".";
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 5823836..3aecf3f 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_library(LLVMSupport
   ToolOutputFile.cpp
   Triple.cpp
   Twine.cpp
+  Unicode.cpp
   YAMLParser.cpp
   YAMLTraits.cpp
   raw_os_ostream.cpp
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index a47af27..44a88d8 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -60,6 +60,8 @@ TEMPLATE_INSTANTIATION(class opt<char>);
 TEMPLATE_INSTANTIATION(class opt<bool>);
 } } // end namespace llvm::cl
 
+// Pin the vtables to this file.
+void GenericOptionValue::anchor() {}
 void OptionValue<boolOrDefault>::anchor() {}
 void OptionValue<std::string>::anchor() {}
 void Option::anchor() {}
@@ -73,6 +75,7 @@ void parser<double>::anchor() {}
 void parser<float>::anchor() {}
 void parser<std::string>::anchor() {}
 void parser<char>::anchor() {}
+void StringSaver::anchor() {}
 
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Support/Compression.cpp b/lib/Support/Compression.cpp
index fd8a874..b5ddb70 100644
--- a/lib/Support/Compression.cpp
+++ b/lib/Support/Compression.cpp
@@ -81,6 +81,10 @@ zlib::Status zlib::uncompress(StringRef InputBuffer,
   return Res;
 }
 
+uint32_t zlib::crc32(StringRef Buffer) {
+  return ::crc32(0, (const Bytef *)Buffer.data(), Buffer.size());
+}
+
 #else
 bool zlib::isAvailable() { return false; }
 zlib::Status zlib::compress(StringRef InputBuffer,
@@ -93,5 +97,8 @@ zlib::Status zlib::uncompress(StringRef InputBuffer,
                               size_t UncompressedSize) {
   return zlib::StatusUnsupported;
 }
+uint32_t zlib::crc32(StringRef Buffer) {
+  llvm_unreachable("zlib::crc32 is unavailable");
+}
 #endif
 
diff --git a/lib/Support/ConstantRange.cpp b/lib/Support/ConstantRange.cpp
index bb38cd1..265b6e9 100644
--- a/lib/Support/ConstantRange.cpp
+++ b/lib/Support/ConstantRange.cpp
@@ -144,9 +144,6 @@ bool ConstantRange::isSignWrappedSet() const {
 /// getSetSize - Return the number of elements in this set.
 ///
 APInt ConstantRange::getSetSize() const {
-  if (isEmptySet())
-    return APInt(getBitWidth()+1, 0);
-
   if (isFullSet()) {
     APInt Size(getBitWidth()+1, 0);
     Size.setBit(getBitWidth());
@@ -448,6 +445,11 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const {
 
   unsigned SrcTySize = getBitWidth();
   assert(SrcTySize < DstTySize && "Not a value extension");
+
+  // special case: [X, INT_MIN) -- not really wrapping around
+  if (Upper.isMinSignedValue())
+    return ConstantRange(Lower.sext(DstTySize), Upper.zext(DstTySize));
+
   if (isFullSet() || isSignWrappedSet()) {
     return ConstantRange(APInt::getHighBitsSet(DstTySize,DstTySize-SrcTySize+1),
                          APInt::getLowBitsSet(DstTySize, SrcTySize-1) + 1);
diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp
index d2a3895..92c370d 100644
--- a/lib/Support/CrashRecoveryContext.cpp
+++ b/lib/Support/CrashRecoveryContext.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Config/config.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/ThreadLocal.h"
 #include <cstdio>
@@ -21,7 +22,7 @@ namespace {
 
 struct CrashRecoveryContextImpl;
 
-static sys::ThreadLocal<const CrashRecoveryContextImpl> CurrentContext;
+static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContextImpl> > CurrentContext;
 
 struct CrashRecoveryContextImpl {
   CrashRecoveryContext *CRC;
@@ -34,11 +35,11 @@ public:
   CrashRecoveryContextImpl(CrashRecoveryContext *CRC) : CRC(CRC),
                                                         Failed(false),
                                                         SwitchedThread(false) {
-    CurrentContext.set(this);
+    CurrentContext->set(this);
   }
   ~CrashRecoveryContextImpl() {
     if (!SwitchedThread)
-      CurrentContext.erase();
+      CurrentContext->erase();
   }
 
   /// \brief Called when the separate crash-recovery thread was finished, to
@@ -48,7 +49,7 @@ public:
   void HandleCrash() {
     // Eliminate the current context entry, to avoid re-entering in case the
     // cleanup code crashes.
-    CurrentContext.erase();
+    CurrentContext->erase();
 
     assert(!Failed && "Crash recovery context already failed!");
     Failed = true;
@@ -62,10 +63,10 @@ public:
 
 }
 
-static sys::Mutex gCrashRecoveryContexMutex;
+static ManagedStatic<sys::Mutex> gCrashRecoveryContextMutex;
 static bool gCrashRecoveryEnabled = false;
 
-static sys::ThreadLocal<const CrashRecoveryContextCleanup> 
+static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContextCleanup> >
        tlIsRecoveringFromCrash;
 
 CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {}
@@ -73,7 +74,7 @@ CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {}
 CrashRecoveryContext::~CrashRecoveryContext() {
   // Reclaim registered resources.
   CrashRecoveryContextCleanup *i = head;
-  tlIsRecoveringFromCrash.set(head);
+  tlIsRecoveringFromCrash->set(head);
   while (i) {
     CrashRecoveryContextCleanup *tmp = i;
     i = tmp->next;
@@ -81,21 +82,21 @@ CrashRecoveryContext::~CrashRecoveryContext() {
     tmp->recoverResources();
     delete tmp;
   }
-  tlIsRecoveringFromCrash.erase();
+  tlIsRecoveringFromCrash->erase();
   
   CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl;
   delete CRCI;
 }
 
 bool CrashRecoveryContext::isRecoveringFromCrash() {
-  return tlIsRecoveringFromCrash.get() != 0;
+  return tlIsRecoveringFromCrash->get() != 0;
 }
 
 CrashRecoveryContext *CrashRecoveryContext::GetCurrent() {
   if (!gCrashRecoveryEnabled)
     return 0;
 
-  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+  const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
   if (!CRCI)
     return 0;
 
@@ -154,7 +155,7 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) {
 static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
 {
   // Lookup the current thread local recovery object.
-  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+  const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
 
   if (!CRCI) {
     // Something has gone horribly wrong, so let's just tell everyone
@@ -182,7 +183,7 @@ static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
 static sys::ThreadLocal<const void> sCurrentExceptionHandle;
 
 void CrashRecoveryContext::Enable() {
-  sys::ScopedLock L(gCrashRecoveryContexMutex);
+  sys::ScopedLock L(*gCrashRecoveryContextMutex);
 
   if (gCrashRecoveryEnabled)
     return;
@@ -198,7 +199,7 @@ void CrashRecoveryContext::Enable() {
 }
 
 void CrashRecoveryContext::Disable() {
-  sys::ScopedLock L(gCrashRecoveryContexMutex);
+  sys::ScopedLock L(*gCrashRecoveryContextMutex);
 
   if (!gCrashRecoveryEnabled)
     return;
@@ -236,7 +237,7 @@ static struct sigaction PrevActions[NumSignals];
 
 static void CrashRecoverySignalHandler(int Signal) {
   // Lookup the current thread local recovery object.
-  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+  const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
 
   if (!CRCI) {
     // We didn't find a crash recovery context -- this means either we got a
@@ -267,7 +268,7 @@ static void CrashRecoverySignalHandler(int Signal) {
 }
 
 void CrashRecoveryContext::Enable() {
-  sys::ScopedLock L(gCrashRecoveryContexMutex);
+  sys::ScopedLock L(*gCrashRecoveryContextMutex);
 
   if (gCrashRecoveryEnabled)
     return;
@@ -286,7 +287,7 @@ void CrashRecoveryContext::Enable() {
 }
 
 void CrashRecoveryContext::Disable() {
-  sys::ScopedLock L(gCrashRecoveryContexMutex);
+  sys::ScopedLock L(*gCrashRecoveryContextMutex);
 
   if (!gCrashRecoveryEnabled)
     return;
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index 8a80139..c000b63 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -12,6 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+
 using namespace llvm;
 using namespace dwarf;
 
@@ -59,8 +61,8 @@ const char *llvm::dwarf::TagString(unsigned Tag) {
   case DW_TAG_namelist_item:             return "DW_TAG_namelist_item";
   case DW_TAG_packed_type:               return "DW_TAG_packed_type";
   case DW_TAG_subprogram:                return "DW_TAG_subprogram";
-  case DW_TAG_template_type_parameter:  return "DW_TAG_template_type_parameter";
-  case DW_TAG_template_value_parameter:return "DW_TAG_template_value_parameter";
+  case DW_TAG_template_type_parameter:   return "DW_TAG_template_type_parameter";
+  case DW_TAG_template_value_parameter:  return "DW_TAG_template_value_parameter";
   case DW_TAG_thrown_type:               return "DW_TAG_thrown_type";
   case DW_TAG_try_block:                 return "DW_TAG_try_block";
   case DW_TAG_variant_part:              return "DW_TAG_variant_part";
@@ -454,10 +456,11 @@ const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) {
   case DW_OP_bit_piece:                  return "DW_OP_bit_piece";
   case DW_OP_implicit_value:             return "DW_OP_implicit_value";
   case DW_OP_stack_value:                return "DW_OP_stack_value";
-  case DW_OP_lo_user:                    return "DW_OP_lo_user";
-  case DW_OP_hi_user:                    return "DW_OP_hi_user";
 
-    // DWARF5 Fission Proposal Op Extensions
+  // GNU thread-local storage
+  case DW_OP_GNU_push_tls_address:       return "DW_OP_GNU_push_tls_address";
+
+  // DWARF5 Fission Proposal Op Extensions
   case DW_OP_GNU_addr_index:             return "DW_OP_GNU_addr_index";
   case DW_OP_GNU_const_index:            return "DW_OP_GNU_const_index";
   }
@@ -723,3 +726,51 @@ const char *llvm::dwarf::CallFrameString(unsigned Encoding) {
   }
   return 0;
 }
+
+const char *llvm::dwarf::AtomTypeString(unsigned AT) {
+  switch (AT) {
+  case dwarf::DW_ATOM_null:
+    return "DW_ATOM_null";
+  case dwarf::DW_ATOM_die_offset:
+    return "DW_ATOM_die_offset";
+  case DW_ATOM_cu_offset:
+    return "DW_ATOM_cu_offset";
+  case DW_ATOM_die_tag:
+    return "DW_ATOM_die_tag";
+  case DW_ATOM_type_flags:
+    return "DW_ATOM_type_flags";
+  }
+  return 0;
+}
+
+const char *llvm::dwarf::GDBIndexEntryKindString(GDBIndexEntryKind Kind) {
+  switch (Kind) {
+  case GIEK_NONE:
+    return "NONE";
+  case GIEK_TYPE:
+    return "TYPE";
+  case GIEK_VARIABLE:
+    return "VARIABLE";
+  case GIEK_FUNCTION:
+    return "FUNCTION";
+  case GIEK_OTHER:
+    return "OTHER";
+  case GIEK_UNUSED5:
+    return "UNUSED5";
+  case GIEK_UNUSED6:
+    return "UNUSED6";
+  case GIEK_UNUSED7:
+    return "UNUSED7";
+  }
+  llvm_unreachable("Unknown GDBIndexEntryKind value");
+}
+
+const char *llvm::dwarf::GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage) {
+  switch (Linkage) {
+  case GIEL_EXTERNAL:
+    return "EXTERNAL";
+  case GIEL_STATIC:
+    return "STATIC";
+  }
+  llvm_unreachable("Unknown GDBIndexEntryLinkage value");
+}
diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp
index f14cb45..a825c68 100644
--- a/lib/Support/DynamicLibrary.cpp
+++ b/lib/Support/DynamicLibrary.cpp
@@ -14,39 +14,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Config/config.h"
 #include "llvm/Support/Mutex.h"
+#include "llvm-c/Support.h"
 #include <cstdio>
 #include <cstring>
 
 // Collection of symbol name/value pairs to be searched prior to any libraries.
-static llvm::StringMap<void *> *ExplicitSymbols = 0;
-
-namespace {
-
-struct ExplicitSymbolsDeleter {
-  ~ExplicitSymbolsDeleter() {
-    delete ExplicitSymbols;
-  }
-};
-
-}
-
-static ExplicitSymbolsDeleter Dummy;
-
-
-static llvm::sys::SmartMutex<true>& getMutex() {
-  static llvm::sys::SmartMutex<true> HandlesMutex;
-  return HandlesMutex;
-}
+static llvm::ManagedStatic<llvm::StringMap<void *> > ExplicitSymbols;
+static llvm::ManagedStatic<llvm::sys::SmartMutex<true> > SymbolsMutex;
 
 void llvm::sys::DynamicLibrary::AddSymbol(StringRef symbolName,
                                           void *symbolValue) {
-  SmartScopedLock<true> lock(getMutex());
-  if (ExplicitSymbols == 0)
-    ExplicitSymbols = new StringMap<void*>();
+  SmartScopedLock<true> lock(*SymbolsMutex);
   (*ExplicitSymbols)[symbolName] = symbolValue;
 }
 
@@ -72,7 +55,7 @@ static DenseSet<void *> *OpenedHandles = 0;
 
 DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
                                                    std::string *errMsg) {
-  SmartScopedLock<true> lock(getMutex());
+  SmartScopedLock<true> lock(*SymbolsMutex);
 
   void *handle = dlopen(filename, RTLD_LAZY|RTLD_GLOBAL);
   if (handle == 0) {
@@ -126,10 +109,10 @@ void *SearchForAddressOfSpecialSymbol(const char* symbolName);
 }
 
 void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
-  SmartScopedLock<true> Lock(getMutex());
+  SmartScopedLock<true> Lock(*SymbolsMutex);
 
   // First check symbols added via AddSymbol().
-  if (ExplicitSymbols) {
+  if (ExplicitSymbols.isConstructed()) {
     StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
 
     if (i != ExplicitSymbols->end())
@@ -187,3 +170,11 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
 }
 
 #endif // LLVM_ON_WIN32
+
+//===----------------------------------------------------------------------===//
+// C API.
+//===----------------------------------------------------------------------===//
+
+LLVMBool LLVMLoadLibraryPermanently(const char* Filename) {
+  return llvm::sys::DynamicLibrary::LoadLibraryPermanently(Filename);
+}
diff --git a/lib/Support/Errno.cpp b/lib/Support/Errno.cpp
index ed17f60..1eefa3e 100644
--- a/lib/Support/Errno.cpp
+++ b/lib/Support/Errno.cpp
@@ -39,28 +39,27 @@ std::string StrError(int errnum) {
   char buffer[MaxErrStrLen];
   buffer[0] = '\0';
   std::string str;
+  if (errnum == 0)
+    return str;
+
 #ifdef HAVE_STRERROR_R
   // strerror_r is thread-safe.
-  if (errnum)
-# if defined(__GLIBC__) && defined(_GNU_SOURCE)
-    // glibc defines its own incompatible version of strerror_r
-    // which may not use the buffer supplied.
-    str = strerror_r(errnum,buffer,MaxErrStrLen-1);
-# else
-    strerror_r(errnum,buffer,MaxErrStrLen-1);
-    str = buffer;
-# endif
+#if defined(__GLIBC__) && defined(_GNU_SOURCE)
+  // glibc defines its own incompatible version of strerror_r
+  // which may not use the buffer supplied.
+  str = strerror_r(errnum, buffer, MaxErrStrLen - 1);
+#else
+  strerror_r(errnum, buffer, MaxErrStrLen - 1);
+  str = buffer;
+#endif
 #elif HAVE_DECL_STRERROR_S // "Windows Secure API"
-    if (errnum) {
-      strerror_s(buffer, MaxErrStrLen - 1, errnum);
-      str = buffer;
-    }
+  strerror_s(buffer, MaxErrStrLen - 1, errnum);
+  str = buffer;
 #elif defined(HAVE_STRERROR)
   // Copy the thread un-safe result of strerror into
   // the buffer as fast as possible to minimize impact
   // of collision of strerror in multiple threads.
-  if (errnum)
-    str = strerror(errnum);
+  str = strerror(errnum);
 #else
   // Strange that this system doesn't even have strerror
   // but, oh well, just use a generic message
diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp
index 9425445..1eafb96 100644
--- a/lib/Support/ErrorHandling.cpp
+++ b/lib/Support/ErrorHandling.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/Threading.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm-c/Core.h"
 #include <cassert>
 #include <cstdlib>
 
@@ -102,3 +103,19 @@ void llvm::llvm_unreachable_internal(const char *msg, const char *file,
   LLVM_BUILTIN_UNREACHABLE;
 #endif
 }
+
+static void bindingsErrorHandler(void *user_data, const std::string& reason,
+                                 bool gen_crash_diag) {
+  LLVMFatalErrorHandler handler =
+      LLVM_EXTENSION reinterpret_cast<LLVMFatalErrorHandler>(user_data);
+  handler(reason.c_str());
+}
+
+void LLVMInstallFatalErrorHandler(LLVMFatalErrorHandler Handler) {
+  install_fatal_error_handler(bindingsErrorHandler,
+                              LLVM_EXTENSION reinterpret_cast<void *>(Handler));
+}
+
+void LLVMResetFatalErrorHandler() {
+  remove_fatal_error_handler();
+}
diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp
index 7a9400d..85be415 100644
--- a/lib/Support/GraphWriter.cpp
+++ b/lib/Support/GraphWriter.cpp
@@ -219,5 +219,8 @@ void llvm::DisplayGraph(StringRef FilenameRef, bool wait,
   errs() << "Running 'dotty' program... ";
   if (!ExecGraphViewer(dotty, args, Filename, wait, ErrMsg))
     return;
+#else
+  (void)Filename;
+  (void)ErrMsg;
 #endif
 }
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 90e4389..6e9a5c9 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -52,8 +52,54 @@ using namespace llvm;
 
 /// GetX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
 /// specified arguments.  If we can't run cpuid on the host, return true.
-static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
-                            unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+                               unsigned *rECX, unsigned *rEDX) {
+#if defined(__GNUC__) || defined(__clang__)
+  #if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+    // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
+    asm ("movq\t%%rbx, %%rsi\n\t"
+         "cpuid\n\t"
+         "xchgq\t%%rbx, %%rsi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+  #elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+    asm ("movl\t%%ebx, %%esi\n\t"
+         "cpuid\n\t"
+         "xchgl\t%%ebx, %%esi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+// pedantic #else returns to appease -Wunreachable-code (so we don't generate
+// postprocessed code that looks like "return true; return false;")
+  #else
+    return true;
+  #endif
+#elif defined(_MSC_VER)
+  // The MSVC intrinsic is portable across x86 and x64.
+  int registers[4];
+  __cpuid(registers, value);
+  *rEAX = registers[0];
+  *rEBX = registers[1];
+  *rECX = registers[2];
+  *rEDX = registers[3];
+  return false;
+#else
+  return true;
+#endif
+}
+
+/// GetX86CpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the
+/// 4 values in the specified arguments.  If we can't run cpuid on the host,
+/// return true.
+bool GetX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
+                          unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
 #if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
   #if defined(__GNUC__)
     // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
@@ -64,16 +110,22 @@ static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
            "=S" (*rEBX),
            "=c" (*rECX),
            "=d" (*rEDX)
-         :  "a" (value));
+         :  "a" (value),
+            "c" (subleaf));
     return false;
   #elif defined(_MSC_VER)
-    int registers[4];
-    __cpuid(registers, value);
-    *rEAX = registers[0];
-    *rEBX = registers[1];
-    *rECX = registers[2];
-    *rEDX = registers[3];
-    return false;
+    // __cpuidex was added in MSVC++ 9.0 SP1
+    #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
+      int registers[4];
+      __cpuidex(registers, value, subleaf);
+      *rEAX = registers[0];
+      *rEBX = registers[1];
+      *rECX = registers[2];
+      *rEDX = registers[3];
+      return false;
+    #else
+      return true;
+    #endif
   #else
     return true;
   #endif
@@ -86,11 +138,13 @@ static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
            "=S" (*rEBX),
            "=c" (*rECX),
            "=d" (*rEDX)
-         :  "a" (value));
+         :  "a" (value),
+            "c" (subleaf));
     return false;
   #elif defined(_MSC_VER)
     __asm {
       mov   eax,value
+      mov   ecx,subleaf
       cpuid
       mov   esi,rEAX
       mov   dword ptr [esi],eax
@@ -102,8 +156,6 @@ static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
       mov   dword ptr [esi],edx
     }
     return false;
-// pedantic #else returns to appease -Wunreachable-code (so we don't generate
-// postprocessed code that looks like "return true; return false;")
   #else
     return true;
   #endif
@@ -148,6 +200,14 @@ std::string sys::getHostCPUName() {
   unsigned Model  = 0;
   DetectX86FamilyModel(EAX, Family, Model);
 
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+
+  GetX86CpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
+
+  unsigned MaxLeaf = EAX;
   bool HasSSE3 = (ECX & 0x1);
   bool HasSSE41 = (ECX & 0x80000);
   // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV 
@@ -155,15 +215,12 @@ std::string sys::getHostCPUName() {
   // switch, then we have full AVX support.
   const unsigned AVXBits = (1 << 27) | (1 << 28);
   bool HasAVX = ((ECX & AVXBits) == AVXBits) && OSHasAVXSupport();
+  bool HasAVX2 = HasAVX && MaxLeaf >= 0x7 &&
+                 !GetX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX) &&
+                 (EBX & 0x20);
   GetX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
   bool Em64T = (EDX >> 29) & 0x1;
 
-  union {
-    unsigned u[3];
-    char     c[12];
-  } text;
-
-  GetX86CpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
   if (memcmp(text.c, "GenuineIntel", 12) == 0) {
     switch (Family) {
     case 3:
@@ -271,10 +328,20 @@ std::string sys::getHostCPUName() {
 
       // Ivy Bridge:
       case 58:
+      case 62: // Ivy Bridge EP
         // Not all Ivy Bridge processors support AVX (such as the Pentium
         // versions instead of the i7 versions).
         return HasAVX ? "core-avx-i" : "corei7";
 
+      // Haswell:
+      case 60:
+      case 63:
+      case 69:
+      case 70:
+        // Not all Haswell processors support AVX too (such as the Pentium
+        // versions instead of the i7 versions).
+        return HasAVX2 ? "core-avx2" : "corei7";
+
       case 28: // Most 45 nm Intel Atom processors
       case 38: // 45 nm Atom Lincroft
       case 39: // 32 nm Atom Medfield
@@ -282,6 +349,12 @@ std::string sys::getHostCPUName() {
       case 54: // 32 nm Atom Midview
         return "atom";
 
+      // Atom Silvermont codes from the Intel software optimization guide.
+      case 55:
+      case 74:
+      case 77:
+        return "slm";
+
       default: return (Em64T) ? "x86-64" : "i686";
       }
     case 15: {
@@ -359,9 +432,11 @@ std::string sys::getHostCPUName() {
       case 21:
         if (!HasAVX) // If the OS doesn't support AVX provide a sane fallback.
           return "btver1";
-        if (Model > 15 && Model <= 31)
-          return "bdver2";
-        return "bdver1";
+        if (Model >= 0x30)
+          return "bdver3"; // 30h-3Fh: Steamroller
+        if (Model >= 0x10)
+          return "bdver2"; // 10h-1Fh: Piledriver
+        return "bdver1";   // 00h-0Fh: Bulldozer
       case 22:
         if (!HasAVX) // If the OS doesn't support AVX provide a sane fallback.
           return "btver1";
@@ -546,6 +621,48 @@ std::string sys::getHostCPUName() {
 
   return "generic";
 }
+#elif defined(__linux__) && defined(__s390x__)
+std::string sys::getHostCPUName() {
+  // STIDP is a privileged operation, so use /proc/cpuinfo instead.
+  // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
+  // memory buffer because the 'file' has 0 size (it can be read from only
+  // as a stream).
+
+  std::string Err;
+  DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
+  if (!DS) {
+    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
+    return "generic";
+  }
+
+  // The "processor 0:" line comes after a fair amount of other information,
+  // including a cache breakdown, but this should be plenty.
+  char buffer[2048];
+  size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
+  delete DS;
+
+  StringRef Str(buffer, CPUInfoSize);
+  SmallVector<StringRef, 32> Lines;
+  Str.split(Lines, "\n");
+  for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+    if (Lines[I].startswith("processor ")) {
+      size_t Pos = Lines[I].find("machine = ");
+      if (Pos != StringRef::npos) {
+        Pos += sizeof("machine = ") - 1;
+        unsigned int Id;
+        if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) {
+          if (Id >= 2827)
+            return "zEC12";
+          if (Id >= 2817)
+            return "z196";
+        }
+      }
+      break;
+    }
+  }
+  
+  return "generic";
+}
 #else
 std::string sys::getHostCPUName() {
   return "generic";
diff --git a/lib/Support/Locale.cpp b/lib/Support/Locale.cpp
index 17b9b6c..35ddf7f 100644
--- a/lib/Support/Locale.cpp
+++ b/lib/Support/Locale.cpp
@@ -1,10 +1,31 @@
 #include "llvm/Support/Locale.h"
-#include "llvm/Config/config.h"
+#include "llvm/Support/Unicode.h"
 
-#ifdef __APPLE__
-#include "LocaleXlocale.inc"
-#elif LLVM_ON_WIN32
-#include "LocaleWindows.inc"
+namespace llvm {
+namespace sys {
+namespace locale {
+
+int columnWidth(StringRef Text) {
+#if LLVM_ON_WIN32
+  return Text.size();
 #else
-#include "LocaleGeneric.inc"
+  return llvm::sys::unicode::columnWidthUTF8(Text);
 #endif
+}
+
+bool isPrint(int UCS) {
+#if LLVM_ON_WIN32
+  // Restrict characters that we'll try to print to the the lower part of ASCII
+  // except for the control characters (0x20 - 0x7E). In general one can not
+  // reliably output code points U+0080 and higher using narrow character C/C++
+  // output functions in Windows, because the meaning of the upper 128 codes is
+  // determined by the active code page in the console.
+  return ' ' <= UCS && UCS <= '~';
+#else
+  return llvm::sys::unicode::isPrintable(UCS);
+#endif
+}
+
+} // namespace locale
+} // namespace sys
+} // namespace llvm
diff --git a/lib/Support/LocaleGeneric.inc b/lib/Support/LocaleGeneric.inc
deleted file mode 100644
index 3a939b8..0000000
--- a/lib/Support/LocaleGeneric.inc
+++ /dev/null
@@ -1,382 +0,0 @@
-//===- llvm/Support/LocaleGeneric.inc - Locale-dependent stuff  -*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements llvm::sys::locale::columnWidth and
-// llvm::sys::locale::isPrint functions for UTF-8 locales.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Support/ConvertUTF.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/UnicodeCharRanges.h"
-
-namespace llvm {
-namespace sys {
-namespace locale {
-
-enum ColumnWidthErrors {
-  ErrorInvalidUTF8 = -2,
-  ErrorNonPrintableCharacter = -1
-};
-
-/// Determines if a character is likely to be displayed correctly on the
-/// terminal. Exact implementation would have to depend on the specific
-/// terminal, so we define the semantic that should be suitable for generic case
-/// of a terminal capable to output Unicode characters.
-/// All characters from the Unicode codepoint range are considered printable
-/// except for:
-///   * C0 and C1 control character ranges;
-///   * default ignorable code points as per 5.21 of
-///     http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
-///   * format characters (category = Cf);
-///   * surrogates (category = Cs);
-///   * unassigned characters (category = Cn).
-/// \return true if the character is considered printable.
-bool isPrint(int UCS) {
-  // Sorted list of non-overlapping intervals of code points that are not
-  // supposed to be printable.
-  static const UnicodeCharRange NonPrintableRanges[] = {
-    { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x00AD, 0x00AD },
-    { 0x034F, 0x034F }, { 0x0378, 0x0379 }, { 0x037F, 0x0383 },
-    { 0x038B, 0x038B }, { 0x038D, 0x038D }, { 0x03A2, 0x03A2 },
-    { 0x0528, 0x0530 }, { 0x0557, 0x0558 }, { 0x0560, 0x0560 },
-    { 0x0588, 0x0588 }, { 0x058B, 0x058E }, { 0x0590, 0x0590 },
-    { 0x05C8, 0x05CF }, { 0x05EB, 0x05EF }, { 0x05F5, 0x0605 },
-    { 0x061C, 0x061D }, { 0x06DD, 0x06DD }, { 0x070E, 0x070F },
-    { 0x074B, 0x074C }, { 0x07B2, 0x07BF }, { 0x07FB, 0x07FF },
-    { 0x082E, 0x082F }, { 0x083F, 0x083F }, { 0x085C, 0x085D },
-    { 0x085F, 0x089F }, { 0x08A1, 0x08A1 }, { 0x08AD, 0x08E3 },
-    { 0x08FF, 0x08FF }, { 0x0978, 0x0978 }, { 0x0980, 0x0980 },
-    { 0x0984, 0x0984 }, { 0x098D, 0x098E }, { 0x0991, 0x0992 },
-    { 0x09A9, 0x09A9 }, { 0x09B1, 0x09B1 }, { 0x09B3, 0x09B5 },
-    { 0x09BA, 0x09BB }, { 0x09C5, 0x09C6 }, { 0x09C9, 0x09CA },
-    { 0x09CF, 0x09D6 }, { 0x09D8, 0x09DB }, { 0x09DE, 0x09DE },
-    { 0x09E4, 0x09E5 }, { 0x09FC, 0x0A00 }, { 0x0A04, 0x0A04 },
-    { 0x0A0B, 0x0A0E }, { 0x0A11, 0x0A12 }, { 0x0A29, 0x0A29 },
-    { 0x0A31, 0x0A31 }, { 0x0A34, 0x0A34 }, { 0x0A37, 0x0A37 },
-    { 0x0A3A, 0x0A3B }, { 0x0A3D, 0x0A3D }, { 0x0A43, 0x0A46 },
-    { 0x0A49, 0x0A4A }, { 0x0A4E, 0x0A50 }, { 0x0A52, 0x0A58 },
-    { 0x0A5D, 0x0A5D }, { 0x0A5F, 0x0A65 }, { 0x0A76, 0x0A80 },
-    { 0x0A84, 0x0A84 }, { 0x0A8E, 0x0A8E }, { 0x0A92, 0x0A92 },
-    { 0x0AA9, 0x0AA9 }, { 0x0AB1, 0x0AB1 }, { 0x0AB4, 0x0AB4 },
-    { 0x0ABA, 0x0ABB }, { 0x0AC6, 0x0AC6 }, { 0x0ACA, 0x0ACA },
-    { 0x0ACE, 0x0ACF }, { 0x0AD1, 0x0ADF }, { 0x0AE4, 0x0AE5 },
-    { 0x0AF2, 0x0B00 }, { 0x0B04, 0x0B04 }, { 0x0B0D, 0x0B0E },
-    { 0x0B11, 0x0B12 }, { 0x0B29, 0x0B29 }, { 0x0B31, 0x0B31 },
-    { 0x0B34, 0x0B34 }, { 0x0B3A, 0x0B3B }, { 0x0B45, 0x0B46 },
-    { 0x0B49, 0x0B4A }, { 0x0B4E, 0x0B55 }, { 0x0B58, 0x0B5B },
-    { 0x0B5E, 0x0B5E }, { 0x0B64, 0x0B65 }, { 0x0B78, 0x0B81 },
-    { 0x0B84, 0x0B84 }, { 0x0B8B, 0x0B8D }, { 0x0B91, 0x0B91 },
-    { 0x0B96, 0x0B98 }, { 0x0B9B, 0x0B9B }, { 0x0B9D, 0x0B9D },
-    { 0x0BA0, 0x0BA2 }, { 0x0BA5, 0x0BA7 }, { 0x0BAB, 0x0BAD },
-    { 0x0BBA, 0x0BBD }, { 0x0BC3, 0x0BC5 }, { 0x0BC9, 0x0BC9 },
-    { 0x0BCE, 0x0BCF }, { 0x0BD1, 0x0BD6 }, { 0x0BD8, 0x0BE5 },
-    { 0x0BFB, 0x0C00 }, { 0x0C04, 0x0C04 }, { 0x0C0D, 0x0C0D },
-    { 0x0C11, 0x0C11 }, { 0x0C29, 0x0C29 }, { 0x0C34, 0x0C34 },
-    { 0x0C3A, 0x0C3C }, { 0x0C45, 0x0C45 }, { 0x0C49, 0x0C49 },
-    { 0x0C4E, 0x0C54 }, { 0x0C57, 0x0C57 }, { 0x0C5A, 0x0C5F },
-    { 0x0C64, 0x0C65 }, { 0x0C70, 0x0C77 }, { 0x0C80, 0x0C81 },
-    { 0x0C84, 0x0C84 }, { 0x0C8D, 0x0C8D }, { 0x0C91, 0x0C91 },
-    { 0x0CA9, 0x0CA9 }, { 0x0CB4, 0x0CB4 }, { 0x0CBA, 0x0CBB },
-    { 0x0CC5, 0x0CC5 }, { 0x0CC9, 0x0CC9 }, { 0x0CCE, 0x0CD4 },
-    { 0x0CD7, 0x0CDD }, { 0x0CDF, 0x0CDF }, { 0x0CE4, 0x0CE5 },
-    { 0x0CF0, 0x0CF0 }, { 0x0CF3, 0x0D01 }, { 0x0D04, 0x0D04 },
-    { 0x0D0D, 0x0D0D }, { 0x0D11, 0x0D11 }, { 0x0D3B, 0x0D3C },
-    { 0x0D45, 0x0D45 }, { 0x0D49, 0x0D49 }, { 0x0D4F, 0x0D56 },
-    { 0x0D58, 0x0D5F }, { 0x0D64, 0x0D65 }, { 0x0D76, 0x0D78 },
-    { 0x0D80, 0x0D81 }, { 0x0D84, 0x0D84 }, { 0x0D97, 0x0D99 },
-    { 0x0DB2, 0x0DB2 }, { 0x0DBC, 0x0DBC }, { 0x0DBE, 0x0DBF },
-    { 0x0DC7, 0x0DC9 }, { 0x0DCB, 0x0DCE }, { 0x0DD5, 0x0DD5 },
-    { 0x0DD7, 0x0DD7 }, { 0x0DE0, 0x0DF1 }, { 0x0DF5, 0x0E00 },
-    { 0x0E3B, 0x0E3E }, { 0x0E5C, 0x0E80 }, { 0x0E83, 0x0E83 },
-    { 0x0E85, 0x0E86 }, { 0x0E89, 0x0E89 }, { 0x0E8B, 0x0E8C },
-    { 0x0E8E, 0x0E93 }, { 0x0E98, 0x0E98 }, { 0x0EA0, 0x0EA0 },
-    { 0x0EA4, 0x0EA4 }, { 0x0EA6, 0x0EA6 }, { 0x0EA8, 0x0EA9 },
-    { 0x0EAC, 0x0EAC }, { 0x0EBA, 0x0EBA }, { 0x0EBE, 0x0EBF },
-    { 0x0EC5, 0x0EC5 }, { 0x0EC7, 0x0EC7 }, { 0x0ECE, 0x0ECF },
-    { 0x0EDA, 0x0EDB }, { 0x0EE0, 0x0EFF }, { 0x0F48, 0x0F48 },
-    { 0x0F6D, 0x0F70 }, { 0x0F98, 0x0F98 }, { 0x0FBD, 0x0FBD },
-    { 0x0FCD, 0x0FCD }, { 0x0FDB, 0x0FFF }, { 0x10C6, 0x10C6 },
-    { 0x10C8, 0x10CC }, { 0x10CE, 0x10CF }, { 0x115F, 0x1160 },
-    { 0x1249, 0x1249 }, { 0x124E, 0x124F }, { 0x1257, 0x1257 },
-    { 0x1259, 0x1259 }, { 0x125E, 0x125F }, { 0x1289, 0x1289 },
-    { 0x128E, 0x128F }, { 0x12B1, 0x12B1 }, { 0x12B6, 0x12B7 },
-    { 0x12BF, 0x12BF }, { 0x12C1, 0x12C1 }, { 0x12C6, 0x12C7 },
-    { 0x12D7, 0x12D7 }, { 0x1311, 0x1311 }, { 0x1316, 0x1317 },
-    { 0x135B, 0x135C }, { 0x137D, 0x137F }, { 0x139A, 0x139F },
-    { 0x13F5, 0x13FF }, { 0x169D, 0x169F }, { 0x16F1, 0x16FF },
-    { 0x170D, 0x170D }, { 0x1715, 0x171F }, { 0x1737, 0x173F },
-    { 0x1754, 0x175F }, { 0x176D, 0x176D }, { 0x1771, 0x1771 },
-    { 0x1774, 0x177F }, { 0x17B4, 0x17B5 }, { 0x17DE, 0x17DF },
-    { 0x17EA, 0x17EF }, { 0x17FA, 0x17FF }, { 0x180B, 0x180D },
-    { 0x180F, 0x180F }, { 0x181A, 0x181F }, { 0x1878, 0x187F },
-    { 0x18AB, 0x18AF }, { 0x18F6, 0x18FF }, { 0x191D, 0x191F },
-    { 0x192C, 0x192F }, { 0x193C, 0x193F }, { 0x1941, 0x1943 },
-    { 0x196E, 0x196F }, { 0x1975, 0x197F }, { 0x19AC, 0x19AF },
-    { 0x19CA, 0x19CF }, { 0x19DB, 0x19DD }, { 0x1A1C, 0x1A1D },
-    { 0x1A5F, 0x1A5F }, { 0x1A7D, 0x1A7E }, { 0x1A8A, 0x1A8F },
-    { 0x1A9A, 0x1A9F }, { 0x1AAE, 0x1AFF }, { 0x1B4C, 0x1B4F },
-    { 0x1B7D, 0x1B7F }, { 0x1BF4, 0x1BFB }, { 0x1C38, 0x1C3A },
-    { 0x1C4A, 0x1C4C }, { 0x1C80, 0x1CBF }, { 0x1CC8, 0x1CCF },
-    { 0x1CF7, 0x1CFF }, { 0x1DE7, 0x1DFB }, { 0x1F16, 0x1F17 },
-    { 0x1F1E, 0x1F1F }, { 0x1F46, 0x1F47 }, { 0x1F4E, 0x1F4F },
-    { 0x1F58, 0x1F58 }, { 0x1F5A, 0x1F5A }, { 0x1F5C, 0x1F5C },
-    { 0x1F5E, 0x1F5E }, { 0x1F7E, 0x1F7F }, { 0x1FB5, 0x1FB5 },
-    { 0x1FC5, 0x1FC5 }, { 0x1FD4, 0x1FD5 }, { 0x1FDC, 0x1FDC },
-    { 0x1FF0, 0x1FF1 }, { 0x1FF5, 0x1FF5 }, { 0x1FFF, 0x1FFF },
-    { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x206F },
-    { 0x2072, 0x2073 }, { 0x208F, 0x208F }, { 0x209D, 0x209F },
-    { 0x20BB, 0x20CF }, { 0x20F1, 0x20FF }, { 0x218A, 0x218F },
-    { 0x23F4, 0x23FF }, { 0x2427, 0x243F }, { 0x244B, 0x245F },
-    { 0x2700, 0x2700 }, { 0x2B4D, 0x2B4F }, { 0x2B5A, 0x2BFF },
-    { 0x2C2F, 0x2C2F }, { 0x2C5F, 0x2C5F }, { 0x2CF4, 0x2CF8 },
-    { 0x2D26, 0x2D26 }, { 0x2D28, 0x2D2C }, { 0x2D2E, 0x2D2F },
-    { 0x2D68, 0x2D6E }, { 0x2D71, 0x2D7E }, { 0x2D97, 0x2D9F },
-    { 0x2DA7, 0x2DA7 }, { 0x2DAF, 0x2DAF }, { 0x2DB7, 0x2DB7 },
-    { 0x2DBF, 0x2DBF }, { 0x2DC7, 0x2DC7 }, { 0x2DCF, 0x2DCF },
-    { 0x2DD7, 0x2DD7 }, { 0x2DDF, 0x2DDF }, { 0x2E3C, 0x2E7F },
-    { 0x2E9A, 0x2E9A }, { 0x2EF4, 0x2EFF }, { 0x2FD6, 0x2FEF },
-    { 0x2FFC, 0x2FFF }, { 0x3040, 0x3040 }, { 0x3097, 0x3098 },
-    { 0x3100, 0x3104 }, { 0x312E, 0x3130 }, { 0x3164, 0x3164 },
-    { 0x318F, 0x318F }, { 0x31BB, 0x31BF }, { 0x31E4, 0x31EF },
-    { 0x321F, 0x321F }, { 0x32FF, 0x32FF }, { 0x4DB6, 0x4DBF },
-    { 0x9FCD, 0x9FFF }, { 0xA48D, 0xA48F }, { 0xA4C7, 0xA4CF },
-    { 0xA62C, 0xA63F }, { 0xA698, 0xA69E }, { 0xA6F8, 0xA6FF },
-    { 0xA78F, 0xA78F }, { 0xA794, 0xA79F }, { 0xA7AB, 0xA7F7 },
-    { 0xA82C, 0xA82F }, { 0xA83A, 0xA83F }, { 0xA878, 0xA87F },
-    { 0xA8C5, 0xA8CD }, { 0xA8DA, 0xA8DF }, { 0xA8FC, 0xA8FF },
-    { 0xA954, 0xA95E }, { 0xA97D, 0xA97F }, { 0xA9CE, 0xA9CE },
-    { 0xA9DA, 0xA9DD }, { 0xA9E0, 0xA9FF }, { 0xAA37, 0xAA3F },
-    { 0xAA4E, 0xAA4F }, { 0xAA5A, 0xAA5B }, { 0xAA7C, 0xAA7F },
-    { 0xAAC3, 0xAADA }, { 0xAAF7, 0xAB00 }, { 0xAB07, 0xAB08 },
-    { 0xAB0F, 0xAB10 }, { 0xAB17, 0xAB1F }, { 0xAB27, 0xAB27 },
-    { 0xAB2F, 0xABBF }, { 0xABEE, 0xABEF }, { 0xABFA, 0xABFF },
-    { 0xD7A4, 0xD7AF }, { 0xD7C7, 0xD7CA }, { 0xD7FC, 0xDFFF },
-    { 0xFA6E, 0xFA6F }, { 0xFADA, 0xFAFF }, { 0xFB07, 0xFB12 },
-    { 0xFB18, 0xFB1C }, { 0xFB37, 0xFB37 }, { 0xFB3D, 0xFB3D },
-    { 0xFB3F, 0xFB3F }, { 0xFB42, 0xFB42 }, { 0xFB45, 0xFB45 },
-    { 0xFBC2, 0xFBD2 }, { 0xFD40, 0xFD4F }, { 0xFD90, 0xFD91 },
-    { 0xFDC8, 0xFDEF }, { 0xFDFE, 0xFE0F }, { 0xFE1A, 0xFE1F },
-    { 0xFE27, 0xFE2F }, { 0xFE53, 0xFE53 }, { 0xFE67, 0xFE67 },
-    { 0xFE6C, 0xFE6F }, { 0xFE75, 0xFE75 }, { 0xFEFD, 0xFEFF },
-    { 0xFF00, 0xFF00 }, { 0xFFA0, 0xFFA0 }, { 0xFFBF, 0xFFC1 },
-    { 0xFFC8, 0xFFC9 }, { 0xFFD0, 0xFFD1 }, { 0xFFD8, 0xFFD9 },
-    { 0xFFDD, 0xFFDF }, { 0xFFE7, 0xFFE7 }, { 0xFFEF, 0xFFFB },
-    { 0xFFFE, 0xFFFF }, { 0x1000C, 0x1000C }, { 0x10027, 0x10027 },
-    { 0x1003B, 0x1003B }, { 0x1003E, 0x1003E }, { 0x1004E, 0x1004F },
-    { 0x1005E, 0x1007F }, { 0x100FB, 0x100FF }, { 0x10103, 0x10106 },
-    { 0x10134, 0x10136 }, { 0x1018B, 0x1018F }, { 0x1019C, 0x101CF },
-    { 0x101FE, 0x1027F }, { 0x1029D, 0x1029F }, { 0x102D1, 0x102FF },
-    { 0x1031F, 0x1031F }, { 0x10324, 0x1032F }, { 0x1034B, 0x1037F },
-    { 0x1039E, 0x1039E }, { 0x103C4, 0x103C7 }, { 0x103D6, 0x103FF },
-    { 0x1049E, 0x1049F }, { 0x104AA, 0x107FF }, { 0x10806, 0x10807 },
-    { 0x10809, 0x10809 }, { 0x10836, 0x10836 }, { 0x10839, 0x1083B },
-    { 0x1083D, 0x1083E }, { 0x10856, 0x10856 }, { 0x10860, 0x108FF },
-    { 0x1091C, 0x1091E }, { 0x1093A, 0x1093E }, { 0x10940, 0x1097F },
-    { 0x109B8, 0x109BD }, { 0x109C0, 0x109FF }, { 0x10A04, 0x10A04 },
-    { 0x10A07, 0x10A0B }, { 0x10A14, 0x10A14 }, { 0x10A18, 0x10A18 },
-    { 0x10A34, 0x10A37 }, { 0x10A3B, 0x10A3E }, { 0x10A48, 0x10A4F },
-    { 0x10A59, 0x10A5F }, { 0x10A80, 0x10AFF }, { 0x10B36, 0x10B38 },
-    { 0x10B56, 0x10B57 }, { 0x10B73, 0x10B77 }, { 0x10B80, 0x10BFF },
-    { 0x10C49, 0x10E5F }, { 0x10E7F, 0x10FFF }, { 0x1104E, 0x11051 },
-    { 0x11070, 0x1107F }, { 0x110BD, 0x110BD }, { 0x110C2, 0x110CF },
-    { 0x110E9, 0x110EF }, { 0x110FA, 0x110FF }, { 0x11135, 0x11135 },
-    { 0x11144, 0x1117F }, { 0x111C9, 0x111CF }, { 0x111DA, 0x1167F },
-    { 0x116B8, 0x116BF }, { 0x116CA, 0x11FFF }, { 0x1236F, 0x123FF },
-    { 0x12463, 0x1246F }, { 0x12474, 0x12FFF }, { 0x1342F, 0x167FF },
-    { 0x16A39, 0x16EFF }, { 0x16F45, 0x16F4F }, { 0x16F7F, 0x16F8E },
-    { 0x16FA0, 0x1AFFF }, { 0x1B002, 0x1CFFF }, { 0x1D0F6, 0x1D0FF },
-    { 0x1D127, 0x1D128 }, { 0x1D173, 0x1D17A }, { 0x1D1DE, 0x1D1FF },
-    { 0x1D246, 0x1D2FF }, { 0x1D357, 0x1D35F }, { 0x1D372, 0x1D3FF },
-    { 0x1D455, 0x1D455 }, { 0x1D49D, 0x1D49D }, { 0x1D4A0, 0x1D4A1 },
-    { 0x1D4A3, 0x1D4A4 }, { 0x1D4A7, 0x1D4A8 }, { 0x1D4AD, 0x1D4AD },
-    { 0x1D4BA, 0x1D4BA }, { 0x1D4BC, 0x1D4BC }, { 0x1D4C4, 0x1D4C4 },
-    { 0x1D506, 0x1D506 }, { 0x1D50B, 0x1D50C }, { 0x1D515, 0x1D515 },
-    { 0x1D51D, 0x1D51D }, { 0x1D53A, 0x1D53A }, { 0x1D53F, 0x1D53F },
-    { 0x1D545, 0x1D545 }, { 0x1D547, 0x1D549 }, { 0x1D551, 0x1D551 },
-    { 0x1D6A6, 0x1D6A7 }, { 0x1D7CC, 0x1D7CD }, { 0x1D800, 0x1EDFF },
-    { 0x1EE04, 0x1EE04 }, { 0x1EE20, 0x1EE20 }, { 0x1EE23, 0x1EE23 },
-    { 0x1EE25, 0x1EE26 }, { 0x1EE28, 0x1EE28 }, { 0x1EE33, 0x1EE33 },
-    { 0x1EE38, 0x1EE38 }, { 0x1EE3A, 0x1EE3A }, { 0x1EE3C, 0x1EE41 },
-    { 0x1EE43, 0x1EE46 }, { 0x1EE48, 0x1EE48 }, { 0x1EE4A, 0x1EE4A },
-    { 0x1EE4C, 0x1EE4C }, { 0x1EE50, 0x1EE50 }, { 0x1EE53, 0x1EE53 },
-    { 0x1EE55, 0x1EE56 }, { 0x1EE58, 0x1EE58 }, { 0x1EE5A, 0x1EE5A },
-    { 0x1EE5C, 0x1EE5C }, { 0x1EE5E, 0x1EE5E }, { 0x1EE60, 0x1EE60 },
-    { 0x1EE63, 0x1EE63 }, { 0x1EE65, 0x1EE66 }, { 0x1EE6B, 0x1EE6B },
-    { 0x1EE73, 0x1EE73 }, { 0x1EE78, 0x1EE78 }, { 0x1EE7D, 0x1EE7D },
-    { 0x1EE7F, 0x1EE7F }, { 0x1EE8A, 0x1EE8A }, { 0x1EE9C, 0x1EEA0 },
-    { 0x1EEA4, 0x1EEA4 }, { 0x1EEAA, 0x1EEAA }, { 0x1EEBC, 0x1EEEF },
-    { 0x1EEF2, 0x1EFFF }, { 0x1F02C, 0x1F02F }, { 0x1F094, 0x1F09F },
-    { 0x1F0AF, 0x1F0B0 }, { 0x1F0BF, 0x1F0C0 }, { 0x1F0D0, 0x1F0D0 },
-    { 0x1F0E0, 0x1F0FF }, { 0x1F10B, 0x1F10F }, { 0x1F12F, 0x1F12F },
-    { 0x1F16C, 0x1F16F }, { 0x1F19B, 0x1F1E5 }, { 0x1F203, 0x1F20F },
-    { 0x1F23B, 0x1F23F }, { 0x1F249, 0x1F24F }, { 0x1F252, 0x1F2FF },
-    { 0x1F321, 0x1F32F }, { 0x1F336, 0x1F336 }, { 0x1F37D, 0x1F37F },
-    { 0x1F394, 0x1F39F }, { 0x1F3C5, 0x1F3C5 }, { 0x1F3CB, 0x1F3DF },
-    { 0x1F3F1, 0x1F3FF }, { 0x1F43F, 0x1F43F }, { 0x1F441, 0x1F441 },
-    { 0x1F4F8, 0x1F4F8 }, { 0x1F4FD, 0x1F4FF }, { 0x1F53E, 0x1F53F },
-    { 0x1F544, 0x1F54F }, { 0x1F568, 0x1F5FA }, { 0x1F641, 0x1F644 },
-    { 0x1F650, 0x1F67F }, { 0x1F6C6, 0x1F6FF }, { 0x1F774, 0x1FFFF },
-    { 0x2A6D7, 0x2A6FF }, { 0x2B735, 0x2B73F }, { 0x2B81E, 0x2F7FF },
-    { 0x2FA1E, 0xF0000 }, { 0xFFFFE, 0xFFFFF }, { 0x10FFFE, 0x10FFFF }
-  };
-
-  return UCS >= 0 && UCS <= 0x10FFFF && !isCharInSet(UCS, NonPrintableRanges);
-}
-
-/// Gets the number of positions a character is likely to occupy when output
-/// on a terminal ("character width"). This depends on the implementation of the
-/// terminal, and there's no standard definition of character width.
-/// The implementation defines it in a way that is expected to be compatible
-/// with a generic Unicode-capable terminal.
-/// \return Character width:
-///   * ErrorNonPrintableCharacter (-1) for non-printable characters (as
-///     identified by isPrint);
-///   * 0 for non-spacing and enclosing combining marks;
-///   * 2 for CJK characters excluding halfwidth forms;
-///   * 1 for all remaining characters.
-static inline int charWidth(int UCS)
-{
-  if (!isPrint(UCS))
-    return ErrorNonPrintableCharacter;
-
-  // Sorted list of non-spacing and enclosing combining mark intervals as
-  // defined in "3.6 Combination" of
-  // http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
-  static const UnicodeCharRange CombiningCharacters[] = {
-    { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD },
-    { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 },
-    { 0x05C7, 0x05C7 }, { 0x0610, 0x061A }, { 0x064B, 0x065F },
-    { 0x0670, 0x0670 }, { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 },
-    { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x0711, 0x0711 },
-    { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 },
-    { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 },
-    { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x08E4, 0x08FE },
-    { 0x0900, 0x0902 }, { 0x093A, 0x093A }, { 0x093C, 0x093C },
-    { 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0957 },
-    { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC },
-    { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 },
-    { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 },
-    { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 },
-    { 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 },
-    { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
-    { 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 },
-    { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 },
-    { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B62, 0x0B63 },
-    { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD },
-    { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D },
-    { 0x0C55, 0x0C56 }, { 0x0C62, 0x0C63 }, { 0x0CBC, 0x0CBC },
-    { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
-    { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D44 }, { 0x0D4D, 0x0D4D },
-    { 0x0D62, 0x0D63 }, { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 },
-    { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A },
-    { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 },
-    { 0x0EBB, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 },
-    { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 },
-    { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
-    { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 },
-    { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A },
-    { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 },
-    { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 },
-    { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x135D, 0x135F },
-    { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
-    { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
-    { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
-    { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
-    { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
-    { 0x1A17, 0x1A18 }, { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E },
-    { 0x1A60, 0x1A60 }, { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C },
-    { 0x1A73, 0x1A7C }, { 0x1A7F, 0x1A7F }, { 0x1B00, 0x1B03 },
-    { 0x1B34, 0x1B34 }, { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C },
-    { 0x1B42, 0x1B42 }, { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B81 },
-    { 0x1BA2, 0x1BA5 }, { 0x1BA8, 0x1BA9 }, { 0x1BAB, 0x1BAB },
-    { 0x1BE6, 0x1BE6 }, { 0x1BE8, 0x1BE9 }, { 0x1BED, 0x1BED },
-    { 0x1BEF, 0x1BF1 }, { 0x1C2C, 0x1C33 }, { 0x1C36, 0x1C37 },
-    { 0x1CD0, 0x1CD2 }, { 0x1CD4, 0x1CE0 }, { 0x1CE2, 0x1CE8 },
-    { 0x1CED, 0x1CED }, { 0x1CF4, 0x1CF4 }, { 0x1DC0, 0x1DE6 },
-    { 0x1DFC, 0x1DFF }, { 0x20D0, 0x20F0 }, { 0x2CEF, 0x2CF1 },
-    { 0x2D7F, 0x2D7F }, { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D },
-    { 0x3099, 0x309A }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D },
-    { 0xA69F, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 },
-    { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 },
-    { 0xA8C4, 0xA8C4 }, { 0xA8E0, 0xA8F1 }, { 0xA926, 0xA92D },
-    { 0xA947, 0xA951 }, { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 },
-    { 0xA9B6, 0xA9B9 }, { 0xA9BC, 0xA9BC }, { 0xAA29, 0xAA2E },
-    { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 },
-    { 0xAA4C, 0xAA4C }, { 0xAAB0, 0xAAB0 }, { 0xAAB2, 0xAAB4 },
-    { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF }, { 0xAAC1, 0xAAC1 },
-    { 0xAAEC, 0xAAED }, { 0xAAF6, 0xAAF6 }, { 0xABE5, 0xABE5 },
-    { 0xABE8, 0xABE8 }, { 0xABED, 0xABED }, { 0xFB1E, 0xFB1E },
-    { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE26 }, { 0x101FD, 0x101FD },
-    { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
-    { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x11001, 0x11001 },
-    { 0x11038, 0x11046 }, { 0x11080, 0x11081 }, { 0x110B3, 0x110B6 },
-    { 0x110B9, 0x110BA }, { 0x11100, 0x11102 }, { 0x11127, 0x1112B },
-    { 0x1112D, 0x11134 }, { 0x11180, 0x11181 }, { 0x111B6, 0x111BE },
-    { 0x116AB, 0x116AB }, { 0x116AD, 0x116AD }, { 0x116B0, 0x116B5 },
-    { 0x116B7, 0x116B7 }, { 0x16F8F, 0x16F92 }, { 0x1D167, 0x1D169 },
-    { 0x1D17B, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
-    { 0x1D242, 0x1D244 }, { 0xE0100, 0xE01EF },
-  };
-
-  if (isCharInSet(UCS, CombiningCharacters))
-    return 0;
-
-  static const UnicodeCharRange DoubleWidthCharacters[] = {
-    // Hangul Jamo
-    { 0x1100, 0x11FF },
-    // Deprecated fullwidth angle brackets
-    { 0x2329, 0x232A },
-    // CJK Misc, CJK Unified Ideographs, Yijing Hexagrams, Yi
-    // excluding U+303F (IDEOGRAPHIC HALF FILL SPACE)
-    { 0x2E80, 0x303E }, { 0x3040, 0xA4CF },
-    // Hangul
-    { 0xAC00, 0xD7A3 }, { 0xD7B0, 0xD7C6 }, { 0xD7CB, 0xD7FB },
-    // CJK Unified Ideographs
-    { 0xF900, 0xFAFF },
-    // Vertical forms
-    { 0xFE10, 0xFE19 },
-    // CJK Compatibility Forms + Small Form Variants
-    { 0xFE30, 0xFE6F },
-    // Fullwidth forms
-    { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 },
-    // CJK Unified Ideographs
-    { 0x20000, 0x2A6DF }, { 0x2A700, 0x2B81F }, { 0x2F800, 0x2FA1F }
-  };
-
-  if (isCharInSet(UCS, DoubleWidthCharacters))
-    return 2;
-  return 1;
-}
-
-int columnWidth(StringRef Text) {
-  unsigned ColumnWidth = 0;
-  unsigned Length;
-  for (size_t i = 0, e = Text.size(); i < e; i += Length) {
-    Length = getNumBytesForUTF8(Text[i]);
-    if (Length <= 0 || i + Length > Text.size())
-      return ErrorInvalidUTF8;
-    UTF32 buf[1];
-    const UTF8 *Start = reinterpret_cast<const UTF8 *>(Text.data() + i);
-    UTF32 *Target = &buf[0];
-    if (conversionOK != ConvertUTF8toUTF32(&Start, Start + Length, &Target,
-                                           Target + 1, strictConversion))
-      return ErrorInvalidUTF8;
-    int Width = charWidth(buf[0]);
-    if (Width < 0)
-      return ErrorNonPrintableCharacter;
-    ColumnWidth += Width;
-  }
-  return ColumnWidth;
-}
-
-}
-}
-}
diff --git a/lib/Support/LocaleWindows.inc b/lib/Support/LocaleWindows.inc
deleted file mode 100644
index 28e429c..0000000
--- a/lib/Support/LocaleWindows.inc
+++ /dev/null
@@ -1,15 +0,0 @@
-namespace llvm {
-namespace sys {
-namespace locale {
-
-int columnWidth(StringRef s) {
-    return s.size();
-}
-
-bool isPrint(int c) {
-    return ' ' <= c && c <= '~';
-}
-
-}
-}
-}
diff --git a/lib/Support/LocaleXlocale.inc b/lib/Support/LocaleXlocale.inc
deleted file mode 100644
index 389fe3d..0000000
--- a/lib/Support/LocaleXlocale.inc
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/ManagedStatic.h"
-#include <cassert>
-#include <xlocale.h>
-
-
-namespace {
-  struct locale_holder {
-    locale_holder()
-    : l(newlocale(LC_CTYPE_MASK,"en_US.UTF-8",LC_GLOBAL_LOCALE))
-    {
-      assert(NULL!=l);
-    }
-    ~locale_holder() {
-      freelocale(l);
-    }
-
-    int mbswidth(llvm::SmallString<16> s) const {
-       // this implementation assumes no '\0' in s
-      assert(s.size()==strlen(s.c_str()));
-
-      size_t size = mbstowcs_l(NULL,s.c_str(),0,l);
-      assert(size!=(size_t)-1);
-      if (size==0)
-        return 0;
-      llvm::SmallVector<wchar_t,200> ws(size);
-      size = mbstowcs_l(&ws[0],s.c_str(),ws.size(),l);
-      assert(ws.size()==size);
-      return wcswidth_l(&ws[0],ws.size(),l);
-    }
-
-    int isprint(int c) const {
-      return iswprint_l(c,l);
-    }
-
-  private:
-
-    locale_t l;
-  };
-
-  llvm::ManagedStatic<locale_holder> l;
-}
-
-namespace llvm {
-namespace sys {
-namespace locale {
-
-int columnWidth(StringRef s) {
-  int width = l->mbswidth(s);
-  assert(width>=0);
-  return width;
-}
-
-bool isPrint(int c) {
-  return l->isprint(c);
-}
-
-}
-}
-}
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index cab45c7..dcd5529 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -177,7 +177,7 @@ error_code MemoryBuffer::getFileOrSTDIN(StringRef Filename,
 //===----------------------------------------------------------------------===//
 
 namespace {
-/// \brief Memorry maps a file descriptor using sys::fs::mapped_file_region.
+/// \brief Memory maps a file descriptor using sys::fs::mapped_file_region.
 ///
 /// This handles converting the offset into a legal offset on the platform.
 class MemoryBufferMMapFile : public MemoryBuffer {
@@ -217,7 +217,7 @@ public:
 };
 }
 
-static error_code getMemoryBufferForStream(int FD, 
+static error_code getMemoryBufferForStream(int FD,
                                            StringRef BufferName,
                                            OwningPtr<MemoryBuffer> &result) {
   const ssize_t ChunkSize = 4096*4;
@@ -238,14 +238,19 @@ static error_code getMemoryBufferForStream(int FD,
   return error_code::success();
 }
 
-error_code MemoryBuffer::getFile(StringRef Filename,
+static error_code getFileAux(const char *Filename,
+                             OwningPtr<MemoryBuffer> &result, int64_t FileSize,
+                             bool RequiresNullTerminator);
+
+error_code MemoryBuffer::getFile(Twine Filename,
                                  OwningPtr<MemoryBuffer> &result,
                                  int64_t FileSize,
                                  bool RequiresNullTerminator) {
   // Ensure the path is null terminated.
-  SmallString<256> PathBuf(Filename.begin(), Filename.end());
-  return MemoryBuffer::getFile(PathBuf.c_str(), result, FileSize,
-                               RequiresNullTerminator);
+  SmallString<256> PathBuf;
+  StringRef NullTerminatedName = Filename.toNullTerminatedStringRef(PathBuf);
+  return getFileAux(NullTerminatedName.data(), result, FileSize,
+                    RequiresNullTerminator);
 }
 
 static error_code getOpenFileImpl(int FD, const char *Filename,
@@ -253,10 +258,9 @@ static error_code getOpenFileImpl(int FD, const char *Filename,
                                   uint64_t FileSize, uint64_t MapSize,
                                   int64_t Offset, bool RequiresNullTerminator);
 
-error_code MemoryBuffer::getFile(const char *Filename,
-                                 OwningPtr<MemoryBuffer> &result,
-                                 int64_t FileSize,
-                                 bool RequiresNullTerminator) {
+static error_code getFileAux(const char *Filename,
+                             OwningPtr<MemoryBuffer> &result, int64_t FileSize,
+                             bool RequiresNullTerminator) {
   int FD;
   error_code EC = sys::fs::openFileForRead(Filename, FD);
   if (EC)
@@ -276,7 +280,7 @@ static bool shouldUseMmap(int FD,
                           int PageSize) {
   // We don't use mmap for small files because this can severely fragment our
   // address space.
-  if (MapSize < 4096*4)
+  if (MapSize < 4 * 4096 || MapSize < (unsigned)PageSize)
     return false;
 
   if (!RequiresNullTerminator)
@@ -302,6 +306,15 @@ static bool shouldUseMmap(int FD,
   if (End != FileSize)
     return false;
 
+#if defined(_WIN32) || defined(__CYGWIN__)
+  // Don't peek the next page if file is multiple of *physical* pagesize(4k)
+  // but is not multiple of AllocationGranularity(64k),
+  // when a null terminator is required.
+  // FIXME: It's not good to hardcode 4096 here. dwPageSize shows 4096.
+  if ((FileSize & (4096 - 1)) == 0)
+    return false;
+#endif
+
   // Don't try to map files that are exactly a multiple of the system page size
   // if we need a null terminator.
   if ((FileSize & (PageSize -1)) == 0)
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index cfd9ed6..c869b30 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -77,7 +77,7 @@ namespace {
       return path.substr(0, 1);
 
     // * {file,directory}name
-    size_t end = path.find_first_of(separators, 2);
+    size_t end = path.find_first_of(separators);
     return path.substr(0, end);
   }
 
@@ -449,23 +449,18 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
 }
 
 void native(const Twine &path, SmallVectorImpl<char> &result) {
+  assert((!path.isSingleStringRef() ||
+          path.getSingleStringRef().data() != result.data()) &&
+         "path and result are not allowed to overlap!");
   // Clear result.
   result.clear();
-#ifdef LLVM_ON_WIN32
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-  result.reserve(p.size());
-  for (StringRef::const_iterator i = p.begin(),
-                                 e = p.end();
-                                 i != e;
-                                 ++i) {
-    if (*i == '/')
-      result.push_back('\\');
-    else
-      result.push_back(*i);
-  }
-#else
   path.toVector(result);
+  native(result);
+}
+
+void native(SmallVectorImpl<char> &path) {
+#ifdef LLVM_ON_WIN32
+  std::replace(path.begin(), path.end(), '/', '\\');
 #endif
 }
 
@@ -852,6 +847,21 @@ error_code has_magic(const Twine &path, const Twine &magic, bool &result) {
   if (Magic.size() < 4)
     return file_magic::unknown;
   switch ((unsigned char)Magic[0]) {
+    case 0x00: {
+      // COFF short import library file
+      if (Magic[1] == (char)0x00 && Magic[2] == (char)0xff &&
+          Magic[3] == (char)0xff)
+        return file_magic::coff_import_library;
+      // Windows resource file
+      const char Expected[] = { 0, 0, 0, 0, '\x20', 0, 0, 0, '\xff' };
+      if (Magic.size() >= sizeof(Expected) &&
+          memcmp(Magic.data(), Expected, sizeof(Expected)) == 0)
+        return file_magic::windows_resource;
+      // 0x0000 = COFF unknown machine type
+      if (Magic[1] == 0)
+        return file_magic::coff_object;
+      break;
+    }
     case 0xDE:  // 0x0B17C0DE = BC wraper
       if (Magic[1] == (char)0xC0 && Magic[2] == (char)0x17 &&
           Magic[3] == (char)0x0B)
diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp
index 23ee5ab..722f4ca 100644
--- a/lib/Support/PrettyStackTrace.cpp
+++ b/lib/Support/PrettyStackTrace.cpp
@@ -15,10 +15,12 @@
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Config/config.h"     // Get autoconf configuration settings
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/ThreadLocal.h"
 #include "llvm/Support/Watchdog.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm-c/Core.h"
 
 #ifdef HAVE_CRASHREPORTERCLIENT_H
 #include <CrashReporterClient.h>
@@ -26,12 +28,7 @@
 
 using namespace llvm;
 
-namespace llvm {
-  bool DisablePrettyStackTrace = false;
-}
-
-// FIXME: This should be thread local when llvm supports threads.
-static sys::ThreadLocal<const PrettyStackTraceEntry> PrettyStackTraceHead;
+static ManagedStatic<sys::ThreadLocal<const PrettyStackTraceEntry> > PrettyStackTraceHead;
 
 static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
   unsigned NextID = 0;
@@ -49,12 +46,12 @@ static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
 /// PrintCurStackTrace - Print the current stack trace to the specified stream.
 static void PrintCurStackTrace(raw_ostream &OS) {
   // Don't print an empty trace.
-  if (PrettyStackTraceHead.get() == 0) return;
+  if (PrettyStackTraceHead->get() == 0) return;
   
   // If there are pretty stack frames registered, walk and emit them.
   OS << "Stack dump:\n";
   
-  PrintStack(PrettyStackTraceHead.get(), OS);
+  PrintStack(PrettyStackTraceHead->get(), OS);
   OS.flush();
 }
 
@@ -102,26 +99,28 @@ static void CrashHandler(void *) {
 #endif
 }
 
-static bool RegisterCrashPrinter() {
-  if (!DisablePrettyStackTrace)
-    sys::AddSignalHandler(CrashHandler, 0);
-  return false;
-}
-
 PrettyStackTraceEntry::PrettyStackTraceEntry() {
-  // The first time this is called, we register the crash printer.
-  static bool HandlerRegistered = RegisterCrashPrinter();
-  (void)HandlerRegistered;
-    
   // Link ourselves.
-  NextEntry = PrettyStackTraceHead.get();
-  PrettyStackTraceHead.set(this);
+  NextEntry = PrettyStackTraceHead->get();
+  PrettyStackTraceHead->set(this);
 }
 
 PrettyStackTraceEntry::~PrettyStackTraceEntry() {
-  assert(PrettyStackTraceHead.get() == this &&
+  // Do nothing if PrettyStackTraceHead is uninitialized. This can only happen
+  // if a shutdown occurred after we created the PrettyStackTraceEntry. That
+  // does occur in the following idiom:
+  //
+  // PrettyStackTraceProgram X(...);
+  // llvm_shutdown_obj Y;
+  //
+  // Without this check, we may end up removing ourselves from the stack trace
+  // after PrettyStackTraceHead has already been destroyed.
+  if (!PrettyStackTraceHead.isConstructed())
+    return;
+  
+  assert(PrettyStackTraceHead->get() == this &&
          "Pretty stack trace entry destruction is out of order");
-  PrettyStackTraceHead.set(getNextEntry());
+  PrettyStackTraceHead->set(getNextEntry());
 }
 
 void PrettyStackTraceString::print(raw_ostream &OS) const {
@@ -135,3 +134,18 @@ void PrettyStackTraceProgram::print(raw_ostream &OS) const {
     OS << ArgV[i] << ' ';
   OS << '\n';
 }
+
+static bool RegisterCrashPrinter() {
+  sys::AddSignalHandler(CrashHandler, 0);
+  return false;
+}
+
+void llvm::EnablePrettyStackTrace() {
+  // The first time this is called, we register the crash printer.
+  static bool HandlerRegistered = RegisterCrashPrinter();
+  (void)HandlerRegistered;
+}
+
+void LLVMEnablePrettyStackTrace() {
+  EnablePrettyStackTrace();
+}
diff --git a/lib/Support/Process.cpp b/lib/Support/Process.cpp
index 2c0d37b..d5168f0 100644
--- a/lib/Support/Process.cpp
+++ b/lib/Support/Process.cpp
@@ -80,6 +80,24 @@ TimeValue self_process::get_wall_time() const {
 #endif
 
 
+#define COLOR(FGBG, CODE, BOLD) "\033[0;" BOLD FGBG CODE "m"
+
+#define ALLCOLORS(FGBG,BOLD) {\
+    COLOR(FGBG, "0", BOLD),\
+    COLOR(FGBG, "1", BOLD),\
+    COLOR(FGBG, "2", BOLD),\
+    COLOR(FGBG, "3", BOLD),\
+    COLOR(FGBG, "4", BOLD),\
+    COLOR(FGBG, "5", BOLD),\
+    COLOR(FGBG, "6", BOLD),\
+    COLOR(FGBG, "7", BOLD)\
+  }
+
+static const char colorcodes[2][2][8][10] = {
+ { ALLCOLORS("3",""), ALLCOLORS("3","1;") },
+ { ALLCOLORS("4",""), ALLCOLORS("4","1;") }
+};
+
 // Include the platform-specific parts of this class.
 #ifdef LLVM_ON_UNIX
 #include "Unix/Process.inc"
diff --git a/lib/Support/Program.cpp b/lib/Support/Program.cpp
index 79f7e5f..83f2ec4 100644
--- a/lib/Support/Program.cpp
+++ b/lib/Support/Program.cpp
@@ -22,30 +22,40 @@ using namespace sys;
 //===          independent code.
 //===----------------------------------------------------------------------===//
 
-static bool Execute(void **Data, StringRef Program, const char **args,
+static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
                     const char **env, const StringRef **Redirects,
                     unsigned memoryLimit, std::string *ErrMsg);
 
-static int Wait(void *&Data, StringRef Program, unsigned secondsToWait,
-                std::string *ErrMsg);
-
 int sys::ExecuteAndWait(StringRef Program, const char **args, const char **envp,
                         const StringRef **redirects, unsigned secondsToWait,
                         unsigned memoryLimit, std::string *ErrMsg,
                         bool *ExecutionFailed) {
-  void *Data = 0;
-  if (Execute(&Data, Program, args, envp, redirects, memoryLimit, ErrMsg)) {
-    if (ExecutionFailed) *ExecutionFailed = false;
-    return Wait(Data, Program, secondsToWait, ErrMsg);
+  ProcessInfo PI;
+  if (Execute(PI, Program, args, envp, redirects, memoryLimit, ErrMsg)) {
+    if (ExecutionFailed)
+      *ExecutionFailed = false;
+    ProcessInfo Result = Wait(PI, secondsToWait, true, ErrMsg);
+    return Result.ReturnCode;
   }
-  if (ExecutionFailed) *ExecutionFailed = true;
+
+  if (ExecutionFailed)
+    *ExecutionFailed = true;
+
   return -1;
 }
 
-void sys::ExecuteNoWait(StringRef Program, const char **args, const char **envp,
-                        const StringRef **redirects, unsigned memoryLimit,
-                        std::string *ErrMsg) {
-  Execute(/*Data*/ 0, Program, args, envp, redirects, memoryLimit, ErrMsg);
+ProcessInfo sys::ExecuteNoWait(StringRef Program, const char **args,
+                               const char **envp, const StringRef **redirects,
+                               unsigned memoryLimit, std::string *ErrMsg,
+                               bool *ExecutionFailed) {
+  ProcessInfo PI;
+  if (ExecutionFailed)
+    *ExecutionFailed = false;
+  if (!Execute(PI, Program, args, envp, redirects, memoryLimit, ErrMsg))
+    if (ExecutionFailed)
+      *ExecutionFailed = true;
+
+  return PI;
 }
 
 // Include the platform-specific parts of this class.
diff --git a/lib/Support/Regex.cpp b/lib/Support/Regex.cpp
index dec967e..5413641 100644
--- a/lib/Support/Regex.cpp
+++ b/lib/Support/Regex.cpp
@@ -43,7 +43,7 @@ bool Regex::isValid(std::string &Error) {
   
   size_t len = llvm_regerror(error, preg, NULL, 0);
   
-  Error.resize(len);
+  Error.resize(len - 1);
   llvm_regerror(error, preg, &Error[0], len);
   return false;
 }
diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp
index f0fed77..dd417b4 100644
--- a/lib/Support/SmallPtrSet.cpp
+++ b/lib/Support/SmallPtrSet.cpp
@@ -202,8 +202,13 @@ void SmallPtrSetImpl::CopyFrom(const SmallPtrSetImpl &RHS) {
   } else if (CurArraySize != RHS.CurArraySize) {
     if (isSmall())
       CurArray = (const void**)malloc(sizeof(void*) * RHS.CurArraySize);
-    else
-      CurArray = (const void**)realloc(CurArray, sizeof(void*)*RHS.CurArraySize);
+    else {
+      const void **T = (const void**)realloc(CurArray,
+                                             sizeof(void*) * RHS.CurArraySize);
+      if (!T)
+        free(CurArray);
+      CurArray = T;
+    }
     assert(CurArray && "Failed to allocate memory?");
   }
   
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index 51162dd..d4b94f8 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -211,7 +211,8 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
                       LineStr, ColRanges, FixIts);
 }
 
-void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc,
+                             SourceMgr::DiagKind Kind,
                              const Twine &Msg, ArrayRef<SMRange> Ranges,
                              ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
   SMDiagnostic Diagnostic = GetMessage(Loc, Kind, Msg, Ranges, FixIts);
@@ -222,8 +223,6 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
     return;
   }
 
-  raw_ostream &OS = errs();
-
   if (Loc != SMLoc()) {
     int CurBuf = FindBufferContainingLoc(Loc);
     assert(CurBuf != -1 && "Invalid or unspecified location!");
@@ -233,6 +232,12 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
   Diagnostic.print(0, OS, ShowColors);
 }
 
+void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+                             const Twine &Msg, ArrayRef<SMRange> Ranges,
+                             ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
+  PrintMessage(llvm::errs(), Loc, Kind, Msg, Ranges, FixIts, ShowColors);
+}
+
 //===----------------------------------------------------------------------===//
 // SMDiagnostic Implementation
 //===----------------------------------------------------------------------===//
@@ -465,7 +470,7 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S,
   if (FixItInsertionLine.empty())
     return;
   
-  for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i != e; ++i) {
+  for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i < e; ++i) {
     if (i >= LineContents.size() || LineContents[i] != '\t') {
       S << FixItInsertionLine[i];
       ++OutCol;
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index d7a0bfa..bfae754 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -37,20 +37,39 @@ static bool ascii_isdigit(char x) {
   return x >= '0' && x <= '9';
 }
 
-/// compare_lower - Compare strings, ignoring case.
-int StringRef::compare_lower(StringRef RHS) const {
-  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
-    unsigned char LHC = ascii_tolower(Data[I]);
-    unsigned char RHC = ascii_tolower(RHS.Data[I]);
+// strncasecmp() is not available on non-POSIX systems, so define an
+// alternative function here.
+static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
+  for (size_t I = 0; I < Length; ++I) {
+    unsigned char LHC = ascii_tolower(LHS[I]);
+    unsigned char RHC = ascii_tolower(RHS[I]);
     if (LHC != RHC)
       return LHC < RHC ? -1 : 1;
   }
+  return 0;
+}
 
+/// compare_lower - Compare strings, ignoring case.
+int StringRef::compare_lower(StringRef RHS) const {
+  if (int Res = ascii_strncasecmp(Data, RHS.Data, min(Length, RHS.Length)))
+    return Res;
   if (Length == RHS.Length)
     return 0;
   return Length < RHS.Length ? -1 : 1;
 }
 
+/// Check if this string starts with the given \p Prefix, ignoring case.
+bool StringRef::startswith_lower(StringRef Prefix) const {
+  return Length >= Prefix.Length &&
+      ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
+}
+
+/// Check if this string ends with the given \p Suffix, ignoring case.
+bool StringRef::endswith_lower(StringRef Suffix) const {
+  return Length >= Suffix.Length &&
+      ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
+}
+
 /// compare_numeric - Compare strings, handle embedded numbers.
 int StringRef::compare_numeric(StringRef RHS) const {
   for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
@@ -85,7 +104,7 @@ int StringRef::compare_numeric(StringRef RHS) const {
 // Compute the edit distance between the two given strings.
 unsigned StringRef::edit_distance(llvm::StringRef Other,
                                   bool AllowReplacements,
-                                  unsigned MaxEditDistance) {
+                                  unsigned MaxEditDistance) const {
   return llvm::ComputeEditDistance(
       llvm::ArrayRef<char>(data(), size()),
       llvm::ArrayRef<char>(Other.data(), Other.size()),
diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp
index 9c81327..0c90c17 100644
--- a/lib/Support/TargetRegistry.cpp
+++ b/lib/Support/TargetRegistry.cpp
@@ -135,9 +135,9 @@ const Target *TargetRegistry::getClosestTargetForJIT(std::string &Error) {
   return TheTarget;
 }
 
-static int TargetArraySortFn(const void *LHS, const void *RHS) {
-  typedef std::pair<StringRef, const Target*> pair_ty;
-  return ((const pair_ty*)LHS)->first.compare(((const pair_ty*)RHS)->first);
+static int TargetArraySortFn(const std::pair<StringRef, const Target *> *LHS,
+                             const std::pair<StringRef, const Target *> *RHS) {
+  return LHS->first.compare(RHS->first);
 }
 
 void TargetRegistry::printRegisteredTargetsForVersion() {
diff --git a/lib/Support/ThreadLocal.cpp b/lib/Support/ThreadLocal.cpp
index 0587aae..868b6ea 100644
--- a/lib/Support/ThreadLocal.cpp
+++ b/lib/Support/ThreadLocal.cpp
@@ -23,7 +23,7 @@
 // Define all methods as no-ops if threading is explicitly disabled
 namespace llvm {
 using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() { }
+ThreadLocalImpl::ThreadLocalImpl() : data() { }
 ThreadLocalImpl::~ThreadLocalImpl() { }
 void ThreadLocalImpl::setInstance(const void* d) {
   typedef int SIZE_TOO_BIG[sizeof(d) <= sizeof(data) ? 1 : -1];
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index d0d0e14..6c978a0 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -221,7 +221,7 @@ static Triple::ArchType parseArch(StringRef ArchName) {
     .Cases("i386", "i486", "i586", "i686", Triple::x86)
     // FIXME: Do we need to support these?
     .Cases("i786", "i886", "i986", Triple::x86)
-    .Cases("amd64", "x86_64", Triple::x86_64)
+    .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64)
     .Case("powerpc", Triple::ppc)
     .Cases("powerpc64", "ppu", Triple::ppc64)
     .Case("powerpc64le", Triple::ppc64le)
diff --git a/lib/Support/Unicode.cpp b/lib/Support/Unicode.cpp
new file mode 100644
index 0000000..b719bd8
--- /dev/null
+++ b/lib/Support/Unicode.cpp
@@ -0,0 +1,367 @@
+//===- llvm/Support/Unicode.cpp - Unicode character properties  -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements functions that allow querying certain properties of
+// Unicode characters.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Unicode.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/UnicodeCharRanges.h"
+
+namespace llvm {
+namespace sys {
+namespace unicode {
+
+bool isPrintable(int UCS) {
+  // Sorted list of non-overlapping intervals of code points that are not
+  // supposed to be printable.
+  static const UnicodeCharRange NonPrintableRanges[] = {
+    { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F },
+    { 0x0378, 0x0379 }, { 0x037F, 0x0383 }, { 0x038B, 0x038B },
+    { 0x038D, 0x038D }, { 0x03A2, 0x03A2 }, { 0x0528, 0x0530 },
+    { 0x0557, 0x0558 }, { 0x0560, 0x0560 }, { 0x0588, 0x0588 },
+    { 0x058B, 0x058E }, { 0x0590, 0x0590 }, { 0x05C8, 0x05CF },
+    { 0x05EB, 0x05EF }, { 0x05F5, 0x0605 }, { 0x061C, 0x061D },
+    { 0x06DD, 0x06DD }, { 0x070E, 0x070F }, { 0x074B, 0x074C },
+    { 0x07B2, 0x07BF }, { 0x07FB, 0x07FF }, { 0x082E, 0x082F },
+    { 0x083F, 0x083F }, { 0x085C, 0x085D }, { 0x085F, 0x089F },
+    { 0x08A1, 0x08A1 }, { 0x08AD, 0x08E3 }, { 0x08FF, 0x08FF },
+    { 0x0978, 0x0978 }, { 0x0980, 0x0980 }, { 0x0984, 0x0984 },
+    { 0x098D, 0x098E }, { 0x0991, 0x0992 }, { 0x09A9, 0x09A9 },
+    { 0x09B1, 0x09B1 }, { 0x09B3, 0x09B5 }, { 0x09BA, 0x09BB },
+    { 0x09C5, 0x09C6 }, { 0x09C9, 0x09CA }, { 0x09CF, 0x09D6 },
+    { 0x09D8, 0x09DB }, { 0x09DE, 0x09DE }, { 0x09E4, 0x09E5 },
+    { 0x09FC, 0x0A00 }, { 0x0A04, 0x0A04 }, { 0x0A0B, 0x0A0E },
+    { 0x0A11, 0x0A12 }, { 0x0A29, 0x0A29 }, { 0x0A31, 0x0A31 },
+    { 0x0A34, 0x0A34 }, { 0x0A37, 0x0A37 }, { 0x0A3A, 0x0A3B },
+    { 0x0A3D, 0x0A3D }, { 0x0A43, 0x0A46 }, { 0x0A49, 0x0A4A },
+    { 0x0A4E, 0x0A50 }, { 0x0A52, 0x0A58 }, { 0x0A5D, 0x0A5D },
+    { 0x0A5F, 0x0A65 }, { 0x0A76, 0x0A80 }, { 0x0A84, 0x0A84 },
+    { 0x0A8E, 0x0A8E }, { 0x0A92, 0x0A92 }, { 0x0AA9, 0x0AA9 },
+    { 0x0AB1, 0x0AB1 }, { 0x0AB4, 0x0AB4 }, { 0x0ABA, 0x0ABB },
+    { 0x0AC6, 0x0AC6 }, { 0x0ACA, 0x0ACA }, { 0x0ACE, 0x0ACF },
+    { 0x0AD1, 0x0ADF }, { 0x0AE4, 0x0AE5 }, { 0x0AF2, 0x0B00 },
+    { 0x0B04, 0x0B04 }, { 0x0B0D, 0x0B0E }, { 0x0B11, 0x0B12 },
+    { 0x0B29, 0x0B29 }, { 0x0B31, 0x0B31 }, { 0x0B34, 0x0B34 },
+    { 0x0B3A, 0x0B3B }, { 0x0B45, 0x0B46 }, { 0x0B49, 0x0B4A },
+    { 0x0B4E, 0x0B55 }, { 0x0B58, 0x0B5B }, { 0x0B5E, 0x0B5E },
+    { 0x0B64, 0x0B65 }, { 0x0B78, 0x0B81 }, { 0x0B84, 0x0B84 },
+    { 0x0B8B, 0x0B8D }, { 0x0B91, 0x0B91 }, { 0x0B96, 0x0B98 },
+    { 0x0B9B, 0x0B9B }, { 0x0B9D, 0x0B9D }, { 0x0BA0, 0x0BA2 },
+    { 0x0BA5, 0x0BA7 }, { 0x0BAB, 0x0BAD }, { 0x0BBA, 0x0BBD },
+    { 0x0BC3, 0x0BC5 }, { 0x0BC9, 0x0BC9 }, { 0x0BCE, 0x0BCF },
+    { 0x0BD1, 0x0BD6 }, { 0x0BD8, 0x0BE5 }, { 0x0BFB, 0x0C00 },
+    { 0x0C04, 0x0C04 }, { 0x0C0D, 0x0C0D }, { 0x0C11, 0x0C11 },
+    { 0x0C29, 0x0C29 }, { 0x0C34, 0x0C34 }, { 0x0C3A, 0x0C3C },
+    { 0x0C45, 0x0C45 }, { 0x0C49, 0x0C49 }, { 0x0C4E, 0x0C54 },
+    { 0x0C57, 0x0C57 }, { 0x0C5A, 0x0C5F }, { 0x0C64, 0x0C65 },
+    { 0x0C70, 0x0C77 }, { 0x0C80, 0x0C81 }, { 0x0C84, 0x0C84 },
+    { 0x0C8D, 0x0C8D }, { 0x0C91, 0x0C91 }, { 0x0CA9, 0x0CA9 },
+    { 0x0CB4, 0x0CB4 }, { 0x0CBA, 0x0CBB }, { 0x0CC5, 0x0CC5 },
+    { 0x0CC9, 0x0CC9 }, { 0x0CCE, 0x0CD4 }, { 0x0CD7, 0x0CDD },
+    { 0x0CDF, 0x0CDF }, { 0x0CE4, 0x0CE5 }, { 0x0CF0, 0x0CF0 },
+    { 0x0CF3, 0x0D01 }, { 0x0D04, 0x0D04 }, { 0x0D0D, 0x0D0D },
+    { 0x0D11, 0x0D11 }, { 0x0D3B, 0x0D3C }, { 0x0D45, 0x0D45 },
+    { 0x0D49, 0x0D49 }, { 0x0D4F, 0x0D56 }, { 0x0D58, 0x0D5F },
+    { 0x0D64, 0x0D65 }, { 0x0D76, 0x0D78 }, { 0x0D80, 0x0D81 },
+    { 0x0D84, 0x0D84 }, { 0x0D97, 0x0D99 }, { 0x0DB2, 0x0DB2 },
+    { 0x0DBC, 0x0DBC }, { 0x0DBE, 0x0DBF }, { 0x0DC7, 0x0DC9 },
+    { 0x0DCB, 0x0DCE }, { 0x0DD5, 0x0DD5 }, { 0x0DD7, 0x0DD7 },
+    { 0x0DE0, 0x0DF1 }, { 0x0DF5, 0x0E00 }, { 0x0E3B, 0x0E3E },
+    { 0x0E5C, 0x0E80 }, { 0x0E83, 0x0E83 }, { 0x0E85, 0x0E86 },
+    { 0x0E89, 0x0E89 }, { 0x0E8B, 0x0E8C }, { 0x0E8E, 0x0E93 },
+    { 0x0E98, 0x0E98 }, { 0x0EA0, 0x0EA0 }, { 0x0EA4, 0x0EA4 },
+    { 0x0EA6, 0x0EA6 }, { 0x0EA8, 0x0EA9 }, { 0x0EAC, 0x0EAC },
+    { 0x0EBA, 0x0EBA }, { 0x0EBE, 0x0EBF }, { 0x0EC5, 0x0EC5 },
+    { 0x0EC7, 0x0EC7 }, { 0x0ECE, 0x0ECF }, { 0x0EDA, 0x0EDB },
+    { 0x0EE0, 0x0EFF }, { 0x0F48, 0x0F48 }, { 0x0F6D, 0x0F70 },
+    { 0x0F98, 0x0F98 }, { 0x0FBD, 0x0FBD }, { 0x0FCD, 0x0FCD },
+    { 0x0FDB, 0x0FFF }, { 0x10C6, 0x10C6 }, { 0x10C8, 0x10CC },
+    { 0x10CE, 0x10CF }, { 0x115F, 0x1160 }, { 0x1249, 0x1249 },
+    { 0x124E, 0x124F }, { 0x1257, 0x1257 }, { 0x1259, 0x1259 },
+    { 0x125E, 0x125F }, { 0x1289, 0x1289 }, { 0x128E, 0x128F },
+    { 0x12B1, 0x12B1 }, { 0x12B6, 0x12B7 }, { 0x12BF, 0x12BF },
+    { 0x12C1, 0x12C1 }, { 0x12C6, 0x12C7 }, { 0x12D7, 0x12D7 },
+    { 0x1311, 0x1311 }, { 0x1316, 0x1317 }, { 0x135B, 0x135C },
+    { 0x137D, 0x137F }, { 0x139A, 0x139F }, { 0x13F5, 0x13FF },
+    { 0x169D, 0x169F }, { 0x16F1, 0x16FF }, { 0x170D, 0x170D },
+    { 0x1715, 0x171F }, { 0x1737, 0x173F }, { 0x1754, 0x175F },
+    { 0x176D, 0x176D }, { 0x1771, 0x1771 }, { 0x1774, 0x177F },
+    { 0x17B4, 0x17B5 }, { 0x17DE, 0x17DF }, { 0x17EA, 0x17EF },
+    { 0x17FA, 0x17FF }, { 0x180B, 0x180D }, { 0x180F, 0x180F },
+    { 0x181A, 0x181F }, { 0x1878, 0x187F }, { 0x18AB, 0x18AF },
+    { 0x18F6, 0x18FF }, { 0x191D, 0x191F }, { 0x192C, 0x192F },
+    { 0x193C, 0x193F }, { 0x1941, 0x1943 }, { 0x196E, 0x196F },
+    { 0x1975, 0x197F }, { 0x19AC, 0x19AF }, { 0x19CA, 0x19CF },
+    { 0x19DB, 0x19DD }, { 0x1A1C, 0x1A1D }, { 0x1A5F, 0x1A5F },
+    { 0x1A7D, 0x1A7E }, { 0x1A8A, 0x1A8F }, { 0x1A9A, 0x1A9F },
+    { 0x1AAE, 0x1AFF }, { 0x1B4C, 0x1B4F }, { 0x1B7D, 0x1B7F },
+    { 0x1BF4, 0x1BFB }, { 0x1C38, 0x1C3A }, { 0x1C4A, 0x1C4C },
+    { 0x1C80, 0x1CBF }, { 0x1CC8, 0x1CCF }, { 0x1CF7, 0x1CFF },
+    { 0x1DE7, 0x1DFB }, { 0x1F16, 0x1F17 }, { 0x1F1E, 0x1F1F },
+    { 0x1F46, 0x1F47 }, { 0x1F4E, 0x1F4F }, { 0x1F58, 0x1F58 },
+    { 0x1F5A, 0x1F5A }, { 0x1F5C, 0x1F5C }, { 0x1F5E, 0x1F5E },
+    { 0x1F7E, 0x1F7F }, { 0x1FB5, 0x1FB5 }, { 0x1FC5, 0x1FC5 },
+    { 0x1FD4, 0x1FD5 }, { 0x1FDC, 0x1FDC }, { 0x1FF0, 0x1FF1 },
+    { 0x1FF5, 0x1FF5 }, { 0x1FFF, 0x1FFF }, { 0x200B, 0x200F },
+    { 0x202A, 0x202E }, { 0x2060, 0x206F }, { 0x2072, 0x2073 },
+    { 0x208F, 0x208F }, { 0x209D, 0x209F }, { 0x20BB, 0x20CF },
+    { 0x20F1, 0x20FF }, { 0x218A, 0x218F }, { 0x23F4, 0x23FF },
+    { 0x2427, 0x243F }, { 0x244B, 0x245F }, { 0x2700, 0x2700 },
+    { 0x2B4D, 0x2B4F }, { 0x2B5A, 0x2BFF }, { 0x2C2F, 0x2C2F },
+    { 0x2C5F, 0x2C5F }, { 0x2CF4, 0x2CF8 }, { 0x2D26, 0x2D26 },
+    { 0x2D28, 0x2D2C }, { 0x2D2E, 0x2D2F }, { 0x2D68, 0x2D6E },
+    { 0x2D71, 0x2D7E }, { 0x2D97, 0x2D9F }, { 0x2DA7, 0x2DA7 },
+    { 0x2DAF, 0x2DAF }, { 0x2DB7, 0x2DB7 }, { 0x2DBF, 0x2DBF },
+    { 0x2DC7, 0x2DC7 }, { 0x2DCF, 0x2DCF }, { 0x2DD7, 0x2DD7 },
+    { 0x2DDF, 0x2DDF }, { 0x2E3C, 0x2E7F }, { 0x2E9A, 0x2E9A },
+    { 0x2EF4, 0x2EFF }, { 0x2FD6, 0x2FEF }, { 0x2FFC, 0x2FFF },
+    { 0x3040, 0x3040 }, { 0x3097, 0x3098 }, { 0x3100, 0x3104 },
+    { 0x312E, 0x3130 }, { 0x3164, 0x3164 }, { 0x318F, 0x318F },
+    { 0x31BB, 0x31BF }, { 0x31E4, 0x31EF }, { 0x321F, 0x321F },
+    { 0x32FF, 0x32FF }, { 0x4DB6, 0x4DBF }, { 0x9FCD, 0x9FFF },
+    { 0xA48D, 0xA48F }, { 0xA4C7, 0xA4CF }, { 0xA62C, 0xA63F },
+    { 0xA698, 0xA69E }, { 0xA6F8, 0xA6FF }, { 0xA78F, 0xA78F },
+    { 0xA794, 0xA79F }, { 0xA7AB, 0xA7F7 }, { 0xA82C, 0xA82F },
+    { 0xA83A, 0xA83F }, { 0xA878, 0xA87F }, { 0xA8C5, 0xA8CD },
+    { 0xA8DA, 0xA8DF }, { 0xA8FC, 0xA8FF }, { 0xA954, 0xA95E },
+    { 0xA97D, 0xA97F }, { 0xA9CE, 0xA9CE }, { 0xA9DA, 0xA9DD },
+    { 0xA9E0, 0xA9FF }, { 0xAA37, 0xAA3F }, { 0xAA4E, 0xAA4F },
+    { 0xAA5A, 0xAA5B }, { 0xAA7C, 0xAA7F }, { 0xAAC3, 0xAADA },
+    { 0xAAF7, 0xAB00 }, { 0xAB07, 0xAB08 }, { 0xAB0F, 0xAB10 },
+    { 0xAB17, 0xAB1F }, { 0xAB27, 0xAB27 }, { 0xAB2F, 0xABBF },
+    { 0xABEE, 0xABEF }, { 0xABFA, 0xABFF }, { 0xD7A4, 0xD7AF },
+    { 0xD7C7, 0xD7CA }, { 0xD7FC, 0xDFFF }, { 0xFA6E, 0xFA6F },
+    { 0xFADA, 0xFAFF }, { 0xFB07, 0xFB12 }, { 0xFB18, 0xFB1C },
+    { 0xFB37, 0xFB37 }, { 0xFB3D, 0xFB3D }, { 0xFB3F, 0xFB3F },
+    { 0xFB42, 0xFB42 }, { 0xFB45, 0xFB45 }, { 0xFBC2, 0xFBD2 },
+    { 0xFD40, 0xFD4F }, { 0xFD90, 0xFD91 }, { 0xFDC8, 0xFDEF },
+    { 0xFDFE, 0xFE0F }, { 0xFE1A, 0xFE1F }, { 0xFE27, 0xFE2F },
+    { 0xFE53, 0xFE53 }, { 0xFE67, 0xFE67 }, { 0xFE6C, 0xFE6F },
+    { 0xFE75, 0xFE75 }, { 0xFEFD, 0xFEFF }, { 0xFF00, 0xFF00 },
+    { 0xFFA0, 0xFFA0 }, { 0xFFBF, 0xFFC1 }, { 0xFFC8, 0xFFC9 },
+    { 0xFFD0, 0xFFD1 }, { 0xFFD8, 0xFFD9 }, { 0xFFDD, 0xFFDF },
+    { 0xFFE7, 0xFFE7 }, { 0xFFEF, 0xFFFB }, { 0xFFFE, 0xFFFF },
+    { 0x1000C, 0x1000C }, { 0x10027, 0x10027 }, { 0x1003B, 0x1003B },
+    { 0x1003E, 0x1003E }, { 0x1004E, 0x1004F }, { 0x1005E, 0x1007F },
+    { 0x100FB, 0x100FF }, { 0x10103, 0x10106 }, { 0x10134, 0x10136 },
+    { 0x1018B, 0x1018F }, { 0x1019C, 0x101CF }, { 0x101FE, 0x1027F },
+    { 0x1029D, 0x1029F }, { 0x102D1, 0x102FF }, { 0x1031F, 0x1031F },
+    { 0x10324, 0x1032F }, { 0x1034B, 0x1037F }, { 0x1039E, 0x1039E },
+    { 0x103C4, 0x103C7 }, { 0x103D6, 0x103FF }, { 0x1049E, 0x1049F },
+    { 0x104AA, 0x107FF }, { 0x10806, 0x10807 }, { 0x10809, 0x10809 },
+    { 0x10836, 0x10836 }, { 0x10839, 0x1083B }, { 0x1083D, 0x1083E },
+    { 0x10856, 0x10856 }, { 0x10860, 0x108FF }, { 0x1091C, 0x1091E },
+    { 0x1093A, 0x1093E }, { 0x10940, 0x1097F }, { 0x109B8, 0x109BD },
+    { 0x109C0, 0x109FF }, { 0x10A04, 0x10A04 }, { 0x10A07, 0x10A0B },
+    { 0x10A14, 0x10A14 }, { 0x10A18, 0x10A18 }, { 0x10A34, 0x10A37 },
+    { 0x10A3B, 0x10A3E }, { 0x10A48, 0x10A4F }, { 0x10A59, 0x10A5F },
+    { 0x10A80, 0x10AFF }, { 0x10B36, 0x10B38 }, { 0x10B56, 0x10B57 },
+    { 0x10B73, 0x10B77 }, { 0x10B80, 0x10BFF }, { 0x10C49, 0x10E5F },
+    { 0x10E7F, 0x10FFF }, { 0x1104E, 0x11051 }, { 0x11070, 0x1107F },
+    { 0x110BD, 0x110BD }, { 0x110C2, 0x110CF }, { 0x110E9, 0x110EF },
+    { 0x110FA, 0x110FF }, { 0x11135, 0x11135 }, { 0x11144, 0x1117F },
+    { 0x111C9, 0x111CF }, { 0x111DA, 0x1167F }, { 0x116B8, 0x116BF },
+    { 0x116CA, 0x11FFF }, { 0x1236F, 0x123FF }, { 0x12463, 0x1246F },
+    { 0x12474, 0x12FFF }, { 0x1342F, 0x167FF }, { 0x16A39, 0x16EFF },
+    { 0x16F45, 0x16F4F }, { 0x16F7F, 0x16F8E }, { 0x16FA0, 0x1AFFF },
+    { 0x1B002, 0x1CFFF }, { 0x1D0F6, 0x1D0FF }, { 0x1D127, 0x1D128 },
+    { 0x1D173, 0x1D17A }, { 0x1D1DE, 0x1D1FF }, { 0x1D246, 0x1D2FF },
+    { 0x1D357, 0x1D35F }, { 0x1D372, 0x1D3FF }, { 0x1D455, 0x1D455 },
+    { 0x1D49D, 0x1D49D }, { 0x1D4A0, 0x1D4A1 }, { 0x1D4A3, 0x1D4A4 },
+    { 0x1D4A7, 0x1D4A8 }, { 0x1D4AD, 0x1D4AD }, { 0x1D4BA, 0x1D4BA },
+    { 0x1D4BC, 0x1D4BC }, { 0x1D4C4, 0x1D4C4 }, { 0x1D506, 0x1D506 },
+    { 0x1D50B, 0x1D50C }, { 0x1D515, 0x1D515 }, { 0x1D51D, 0x1D51D },
+    { 0x1D53A, 0x1D53A }, { 0x1D53F, 0x1D53F }, { 0x1D545, 0x1D545 },
+    { 0x1D547, 0x1D549 }, { 0x1D551, 0x1D551 }, { 0x1D6A6, 0x1D6A7 },
+    { 0x1D7CC, 0x1D7CD }, { 0x1D800, 0x1EDFF }, { 0x1EE04, 0x1EE04 },
+    { 0x1EE20, 0x1EE20 }, { 0x1EE23, 0x1EE23 }, { 0x1EE25, 0x1EE26 },
+    { 0x1EE28, 0x1EE28 }, { 0x1EE33, 0x1EE33 }, { 0x1EE38, 0x1EE38 },
+    { 0x1EE3A, 0x1EE3A }, { 0x1EE3C, 0x1EE41 }, { 0x1EE43, 0x1EE46 },
+    { 0x1EE48, 0x1EE48 }, { 0x1EE4A, 0x1EE4A }, { 0x1EE4C, 0x1EE4C },
+    { 0x1EE50, 0x1EE50 }, { 0x1EE53, 0x1EE53 }, { 0x1EE55, 0x1EE56 },
+    { 0x1EE58, 0x1EE58 }, { 0x1EE5A, 0x1EE5A }, { 0x1EE5C, 0x1EE5C },
+    { 0x1EE5E, 0x1EE5E }, { 0x1EE60, 0x1EE60 }, { 0x1EE63, 0x1EE63 },
+    { 0x1EE65, 0x1EE66 }, { 0x1EE6B, 0x1EE6B }, { 0x1EE73, 0x1EE73 },
+    { 0x1EE78, 0x1EE78 }, { 0x1EE7D, 0x1EE7D }, { 0x1EE7F, 0x1EE7F },
+    { 0x1EE8A, 0x1EE8A }, { 0x1EE9C, 0x1EEA0 }, { 0x1EEA4, 0x1EEA4 },
+    { 0x1EEAA, 0x1EEAA }, { 0x1EEBC, 0x1EEEF }, { 0x1EEF2, 0x1EFFF },
+    { 0x1F02C, 0x1F02F }, { 0x1F094, 0x1F09F }, { 0x1F0AF, 0x1F0B0 },
+    { 0x1F0BF, 0x1F0C0 }, { 0x1F0D0, 0x1F0D0 }, { 0x1F0E0, 0x1F0FF },
+    { 0x1F10B, 0x1F10F }, { 0x1F12F, 0x1F12F }, { 0x1F16C, 0x1F16F },
+    { 0x1F19B, 0x1F1E5 }, { 0x1F203, 0x1F20F }, { 0x1F23B, 0x1F23F },
+    { 0x1F249, 0x1F24F }, { 0x1F252, 0x1F2FF }, { 0x1F321, 0x1F32F },
+    { 0x1F336, 0x1F336 }, { 0x1F37D, 0x1F37F }, { 0x1F394, 0x1F39F },
+    { 0x1F3C5, 0x1F3C5 }, { 0x1F3CB, 0x1F3DF }, { 0x1F3F1, 0x1F3FF },
+    { 0x1F43F, 0x1F43F }, { 0x1F441, 0x1F441 }, { 0x1F4F8, 0x1F4F8 },
+    { 0x1F4FD, 0x1F4FF }, { 0x1F53E, 0x1F53F }, { 0x1F544, 0x1F54F },
+    { 0x1F568, 0x1F5FA }, { 0x1F641, 0x1F644 }, { 0x1F650, 0x1F67F },
+    { 0x1F6C6, 0x1F6FF }, { 0x1F774, 0x1FFFF }, { 0x2A6D7, 0x2A6FF },
+    { 0x2B735, 0x2B73F }, { 0x2B81E, 0x2F7FF }, { 0x2FA1E, 0xF0000 },
+    { 0xFFFFE, 0xFFFFF }, { 0x10FFFE, 0x10FFFF }
+  };
+  static const UnicodeCharSet NonPrintables(NonPrintableRanges);
+
+  return UCS >= 0 && UCS <= 0x10FFFF && !NonPrintables.contains(UCS);
+}
+
+/// Gets the number of positions a character is likely to occupy when output
+/// on a terminal ("character width"). This depends on the implementation of the
+/// terminal, and there's no standard definition of character width.
+/// The implementation defines it in a way that is expected to be compatible
+/// with a generic Unicode-capable terminal.
+/// \return Character width:
+///   * ErrorNonPrintableCharacter (-1) for non-printable characters (as
+///     identified by isPrintable);
+///   * 0 for non-spacing and enclosing combining marks;
+///   * 2 for CJK characters excluding halfwidth forms;
+///   * 1 for all remaining characters.
+static inline int charWidth(int UCS)
+{
+  if (!isPrintable(UCS))
+    return ErrorNonPrintableCharacter;
+
+  // Sorted list of non-spacing and enclosing combining mark intervals as
+  // defined in "3.6 Combination" of
+  // http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
+  static const UnicodeCharRange CombiningCharacterRanges[] = {
+    { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD },
+    { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 },
+    { 0x05C7, 0x05C7 }, { 0x0610, 0x061A }, { 0x064B, 0x065F },
+    { 0x0670, 0x0670 }, { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 },
+    { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x0711, 0x0711 },
+    { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 },
+    { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 },
+    { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x08E4, 0x08FE },
+    { 0x0900, 0x0902 }, { 0x093A, 0x093A }, { 0x093C, 0x093C },
+    { 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0957 },
+    { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC },
+    { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 },
+    { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 },
+    { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 },
+    { 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 },
+    { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
+    { 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 },
+    { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 },
+    { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B62, 0x0B63 },
+    { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD },
+    { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D },
+    { 0x0C55, 0x0C56 }, { 0x0C62, 0x0C63 }, { 0x0CBC, 0x0CBC },
+    { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
+    { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D44 }, { 0x0D4D, 0x0D4D },
+    { 0x0D62, 0x0D63 }, { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 },
+    { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A },
+    { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 },
+    { 0x0EBB, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 },
+    { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 },
+    { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
+    { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 },
+    { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A },
+    { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 },
+    { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 },
+    { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x135D, 0x135F },
+    { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
+    { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
+    { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
+    { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
+    { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
+    { 0x1A17, 0x1A18 }, { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E },
+    { 0x1A60, 0x1A60 }, { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C },
+    { 0x1A73, 0x1A7C }, { 0x1A7F, 0x1A7F }, { 0x1B00, 0x1B03 },
+    { 0x1B34, 0x1B34 }, { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C },
+    { 0x1B42, 0x1B42 }, { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B81 },
+    { 0x1BA2, 0x1BA5 }, { 0x1BA8, 0x1BA9 }, { 0x1BAB, 0x1BAB },
+    { 0x1BE6, 0x1BE6 }, { 0x1BE8, 0x1BE9 }, { 0x1BED, 0x1BED },
+    { 0x1BEF, 0x1BF1 }, { 0x1C2C, 0x1C33 }, { 0x1C36, 0x1C37 },
+    { 0x1CD0, 0x1CD2 }, { 0x1CD4, 0x1CE0 }, { 0x1CE2, 0x1CE8 },
+    { 0x1CED, 0x1CED }, { 0x1CF4, 0x1CF4 }, { 0x1DC0, 0x1DE6 },
+    { 0x1DFC, 0x1DFF }, { 0x20D0, 0x20F0 }, { 0x2CEF, 0x2CF1 },
+    { 0x2D7F, 0x2D7F }, { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D },
+    { 0x3099, 0x309A }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D },
+    { 0xA69F, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 },
+    { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 },
+    { 0xA8C4, 0xA8C4 }, { 0xA8E0, 0xA8F1 }, { 0xA926, 0xA92D },
+    { 0xA947, 0xA951 }, { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 },
+    { 0xA9B6, 0xA9B9 }, { 0xA9BC, 0xA9BC }, { 0xAA29, 0xAA2E },
+    { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 },
+    { 0xAA4C, 0xAA4C }, { 0xAAB0, 0xAAB0 }, { 0xAAB2, 0xAAB4 },
+    { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF }, { 0xAAC1, 0xAAC1 },
+    { 0xAAEC, 0xAAED }, { 0xAAF6, 0xAAF6 }, { 0xABE5, 0xABE5 },
+    { 0xABE8, 0xABE8 }, { 0xABED, 0xABED }, { 0xFB1E, 0xFB1E },
+    { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE26 }, { 0x101FD, 0x101FD },
+    { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
+    { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x11001, 0x11001 },
+    { 0x11038, 0x11046 }, { 0x11080, 0x11081 }, { 0x110B3, 0x110B6 },
+    { 0x110B9, 0x110BA }, { 0x11100, 0x11102 }, { 0x11127, 0x1112B },
+    { 0x1112D, 0x11134 }, { 0x11180, 0x11181 }, { 0x111B6, 0x111BE },
+    { 0x116AB, 0x116AB }, { 0x116AD, 0x116AD }, { 0x116B0, 0x116B5 },
+    { 0x116B7, 0x116B7 }, { 0x16F8F, 0x16F92 }, { 0x1D167, 0x1D169 },
+    { 0x1D17B, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
+    { 0x1D242, 0x1D244 }, { 0xE0100, 0xE01EF },
+  };
+  static const UnicodeCharSet CombiningCharacters(CombiningCharacterRanges);
+
+  if (CombiningCharacters.contains(UCS))
+    return 0;
+
+  static const UnicodeCharRange DoubleWidthCharacterRanges[] = {
+    // Hangul Jamo
+    { 0x1100, 0x11FF },
+    // Deprecated fullwidth angle brackets
+    { 0x2329, 0x232A },
+    // CJK Misc, CJK Unified Ideographs, Yijing Hexagrams, Yi
+    // excluding U+303F (IDEOGRAPHIC HALF FILL SPACE)
+    { 0x2E80, 0x303E }, { 0x3040, 0xA4CF },
+    // Hangul
+    { 0xAC00, 0xD7A3 }, { 0xD7B0, 0xD7C6 }, { 0xD7CB, 0xD7FB },
+    // CJK Unified Ideographs
+    { 0xF900, 0xFAFF },
+    // Vertical forms
+    { 0xFE10, 0xFE19 },
+    // CJK Compatibility Forms + Small Form Variants
+    { 0xFE30, 0xFE6F },
+    // Fullwidth forms
+    { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 },
+    // CJK Unified Ideographs
+    { 0x20000, 0x2A6DF }, { 0x2A700, 0x2B81F }, { 0x2F800, 0x2FA1F }
+  };
+  static const UnicodeCharSet DoubleWidthCharacters(DoubleWidthCharacterRanges);
+
+  if (DoubleWidthCharacters.contains(UCS))
+    return 2;
+  return 1;
+}
+
+int columnWidthUTF8(StringRef Text) {
+  unsigned ColumnWidth = 0;
+  unsigned Length;
+  for (size_t i = 0, e = Text.size(); i < e; i += Length) {
+    Length = getNumBytesForUTF8(Text[i]);
+    if (Length <= 0 || i + Length > Text.size())
+      return ErrorInvalidUTF8;
+    UTF32 buf[1];
+    const UTF8 *Start = reinterpret_cast<const UTF8 *>(Text.data() + i);
+    UTF32 *Target = &buf[0];
+    if (conversionOK != ConvertUTF8toUTF32(&Start, Start + Length, &Target,
+                                           Target + 1, strictConversion))
+      return ErrorInvalidUTF8;
+    int Width = charWidth(buf[0]);
+    if (Width < 0)
+      return ErrorNonPrintableCharacter;
+    ColumnWidth += Width;
+  }
+  return ColumnWidth;
+}
+
+} // namespace unicode
+} // namespace sys
+} // namespace llvm
+
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 4dcfa09..c9dc871 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -182,7 +182,7 @@ namespace sys  {
 namespace fs {
 #if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
     defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \
-    defined(__linux__) || defined(__CYGWIN__)
+    defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__)
 static int
 test_dir(char buf[PATH_MAX], char ret[PATH_MAX],
     const char *dir, const char *bin)
@@ -251,7 +251,8 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
       return link_path;
   }
 #elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
-      defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__)
+      defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \
+      defined(__FreeBSD_kernel__)
   char exe_path[PATH_MAX];
 
   if (getprogpath(exe_path, argv0) != NULL)
@@ -298,6 +299,18 @@ UniqueID file_status::getUniqueID() const {
 }
 
 error_code current_path(SmallVectorImpl<char> &result) {
+  result.clear();
+
+  const char *pwd = ::getenv("PWD");
+  llvm::sys::fs::file_status PWDStatus, DotStatus;
+  if (pwd && llvm::sys::path::is_absolute(pwd) &&
+      !llvm::sys::fs::status(pwd, PWDStatus) &&
+      !llvm::sys::fs::status(".", DotStatus) &&
+      PWDStatus.getUniqueID() == DotStatus.getUniqueID()) {
+    result.append(pwd, pwd + strlen(pwd));
+    return error_code::success();
+  }
+
 #ifdef MAXPATHLEN
   result.reserve(MAXPATHLEN);
 #else
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index 0a797f6..c5778e7 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -13,6 +13,7 @@
 
 #include "Unix.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/MutexGuard.h"
 #include "llvm/Support/TimeValue.h"
@@ -38,25 +39,6 @@
 #  include <termios.h>
 #endif
 
-// See if we can use curses to detect information about a terminal when
-// connected to one.
-#ifdef HAVE_CURSES
-# if defined(HAVE_CURSES_H)
-#  include <curses.h>
-# elif defined(HAVE_NCURSES_H)
-#  include <ncurses.h>
-# elif defined(HAVE_NCURSESW_H)
-#  include <ncursesw.h>
-# elif defined(HAVE_NCURSES_CURSES_H)
-#  include <ncurses/curses.h>
-# elif defined(HAVE_NCURSESW_CURSES_H)
-#  include <ncursesw/curses.h>
-# else
-#  error Have a curses library but unable to find a curses header!
-# endif
-# include <term.h>
-#endif
-
 //===----------------------------------------------------------------------===//
 //=== WARNING: Implementation here must contain only generic UNIX code that
 //===          is guaranteed to work on *all* UNIX variants.
@@ -107,13 +89,10 @@ TimeValue self_process::get_system_time() const {
   return getRUsageTimes().second;
 }
 
+// On Cygwin, getpagesize() returns 64k(AllocationGranularity) and
+// offset in mmap(3) should be aligned to the AllocationGranularity.
 static unsigned getPageSize() {
-#if defined(__CYGWIN__)
-  // On Cygwin, getpagesize() returns 64k but the page size for the purposes of
-  // memory protection and mmap() is 4k.
-  // See http://www.cygwin.com/ml/cygwin/2009-01/threads.html#00492
-  const int page_size = 0x1000;
-#elif defined(HAVE_GETPAGESIZE)
+#if defined(HAVE_GETPAGESIZE)
   const int page_size = ::getpagesize();
 #elif defined(HAVE_SYSCONF)
   long page_size = ::sysconf(_SC_PAGE_SIZE);
@@ -159,14 +138,6 @@ void Process::GetTimeUsage(TimeValue &elapsed, TimeValue &user_time,
   llvm::tie(user_time, sys_time) = getRUsageTimes();
 }
 
-int Process::GetCurrentUserId() {
-  return getuid();
-}
-
-int Process::GetCurrentGroupId() {
-  return getgid();
-}
-
 #if defined(HAVE_MACH_MACH_H) && !defined(__GNU__)
 #include <mach/mach.h>
 #endif
@@ -211,6 +182,22 @@ void Process::PreventCoreFiles() {
 #endif
 }
 
+Optional<std::string> Process::GetEnv(StringRef Name) {
+  std::string NameStr = Name.str();
+  const char *Val = ::getenv(NameStr.c_str());
+  if (!Val)
+    return None;
+  return std::string(Val);
+}
+
+error_code Process::GetArgumentVector(SmallVectorImpl<const char *> &ArgsOut,
+                                      ArrayRef<const char *> ArgsIn,
+                                      SpecificBumpPtrAllocator<char> &) {
+  ArgsOut.append(ArgsIn.begin(), ArgsIn.end());
+
+  return error_code::success();
+}
+
 bool Process::StandardInIsUserInput() {
   return FileDescriptorIsDisplayed(STDIN_FILENO);
 }
@@ -266,21 +253,50 @@ unsigned Process::StandardErrColumns() {
   return getColumns(2);
 }
 
+#ifdef HAVE_TERMINFO
+// We manually declare these extern functions because finding the correct
+// headers from various terminfo, curses, or other sources is harder than
+// writing their specs down.
+extern "C" int setupterm(char *term, int filedes, int *errret);
+extern "C" struct term *set_curterm(struct term *termp);
+extern "C" int del_curterm(struct term *termp);
+extern "C" int tigetnum(char *capname);
+#endif
+
 static bool terminalHasColors(int fd) {
-#ifdef HAVE_CURSES
-  // First, acquire a global lock because the curses C routines are thread
-  // hostile.
+#ifdef HAVE_TERMINFO
+  // First, acquire a global lock because these C routines are thread hostile.
   static sys::Mutex M;
   MutexGuard G(M);
 
   int errret = 0;
-  if (setupterm((char *)0, fd, &errret) != OK)
+  if (setupterm((char *)0, fd, &errret) != 0)
     // Regardless of why, if we can't get terminfo, we shouldn't try to print
     // colors.
     return false;
 
-  // Test whether the terminal as set up supports color output.
-  if (has_colors() == TRUE)
+  // Test whether the terminal as set up supports color output. How to do this
+  // isn't entirely obvious. We can use the curses routine 'has_colors' but it
+  // would be nice to avoid a dependency on curses proper when we can make do
+  // with a minimal terminfo parsing library. Also, we don't really care whether
+  // the terminal supports the curses-specific color changing routines, merely
+  // if it will interpret ANSI color escape codes in a reasonable way. Thus, the
+  // strategy here is just to query the baseline colors capability and if it
+  // supports colors at all to assume it will translate the escape codes into
+  // whatever range of colors it does support. We can add more detailed tests
+  // here if users report them as necessary.
+  //
+  // The 'tigetnum' routine returns -2 or -1 on errors, and might return 0 if
+  // the terminfo says that no colors are supported.
+  bool HasColors = tigetnum(const_cast<char *>("colors")) > 0;
+
+  // Now extract the structure allocated by setupterm and free its memory
+  // through a really silly dance.
+  struct term *termp = set_curterm((struct term *)0);
+  (void)del_curterm(termp); // Drop any errors here.
+
+  // Return true if we found a color capabilities for the current terminal.
+  if (HasColors)
     return true;
 #endif
 
@@ -302,29 +318,15 @@ bool Process::StandardErrHasColors() {
   return FileDescriptorHasColors(STDERR_FILENO);
 }
 
+void Process::UseANSIEscapeCodes(bool /*enable*/) {
+  // No effect.
+}
+
 bool Process::ColorNeedsFlush() {
   // No, we use ANSI escape sequences.
   return false;
 }
 
-#define COLOR(FGBG, CODE, BOLD) "\033[0;" BOLD FGBG CODE "m"
-
-#define ALLCOLORS(FGBG,BOLD) {\
-    COLOR(FGBG, "0", BOLD),\
-    COLOR(FGBG, "1", BOLD),\
-    COLOR(FGBG, "2", BOLD),\
-    COLOR(FGBG, "3", BOLD),\
-    COLOR(FGBG, "4", BOLD),\
-    COLOR(FGBG, "5", BOLD),\
-    COLOR(FGBG, "6", BOLD),\
-    COLOR(FGBG, "7", BOLD)\
-  }
-
-static const char colorcodes[2][2][8][10] = {
- { ALLCOLORS("3",""), ALLCOLORS("3","1;") },
- { ALLCOLORS("4",""), ALLCOLORS("4","1;") }
-};
-
 const char *Process::OutputColor(char code, bool bold, bool bg) {
   return colorcodes[bg?1:0][bold?1:0][code&7];
 }
diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc
index a93a912..78b2971 100644
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc
@@ -36,6 +36,9 @@
 #include <unistd.h>
 #endif
 #ifdef HAVE_POSIX_SPAWN
+#ifdef __sun__
+#define  _RESTRICT_KYWD
+#endif
 #include <spawn.h>
 #if !defined(__APPLE__)
   extern char **environ;
@@ -47,6 +50,8 @@
 namespace llvm {
 using namespace sys;
 
+ProcessInfo::ProcessInfo() : Pid(0), ReturnCode(0) {}
+
 // This function just uses the PATH environment variable to find the program.
 std::string
 sys::FindProgramByName(const std::string& progName) {
@@ -175,9 +180,16 @@ static void SetMemoryLimits (unsigned size)
 
 }
 
-static bool Execute(void **Data, StringRef Program, const char **args,
+static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
                     const char **envp, const StringRef **redirects,
                     unsigned memoryLimit, std::string *ErrMsg) {
+  if (!llvm::sys::fs::exists(Program)) {
+    if (ErrMsg)
+      *ErrMsg = std::string("Executable \"") + Program.str() +
+                std::string("\" doesn't exist!");
+    return false;
+  }
+
   // If this OS has posix_spawn and there is no memory limit being implied, use
   // posix_spawn.  It is more efficient than fork/exec.
 #ifdef HAVE_POSIX_SPAWN
@@ -239,8 +251,8 @@ static bool Execute(void **Data, StringRef Program, const char **args,
     if (Err)
      return !MakeErrMsg(ErrMsg, "posix_spawn failed", Err);
 
-    if (Data)
-      *Data = reinterpret_cast<void*>(PID);
+    PI.Pid = PID;
+
     return true;
   }
 #endif
@@ -303,56 +315,71 @@ static bool Execute(void **Data, StringRef Program, const char **args,
       break;
   }
 
-  if (Data)
-    *Data = reinterpret_cast<void*>(child);
+  PI.Pid = child;
 
   return true;
 }
 
-static int Wait(void *&Data, StringRef Program, unsigned secondsToWait,
-                std::string *ErrMsg) {
+namespace llvm {
+
+ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
+                      bool WaitUntilTerminates, std::string *ErrMsg) {
 #ifdef HAVE_SYS_WAIT_H
   struct sigaction Act, Old;
-  assert(Data && "invalid pid to wait on, process not started?");
-
-  // Install a timeout handler.  The handler itself does nothing, but the simple
-  // fact of having a handler at all causes the wait below to return with EINTR,
-  // unlike if we used SIG_IGN.
-  if (secondsToWait) {
+  assert(PI.Pid && "invalid pid to wait on, process not started?");
+
+  int WaitPidOptions = 0;
+  pid_t ChildPid = PI.Pid;
+  if (WaitUntilTerminates) {
+    SecondsToWait = 0;
+    ChildPid = -1; // mimic a wait() using waitpid()
+  } else if (SecondsToWait) {
+    // Install a timeout handler.  The handler itself does nothing, but the
+    // simple fact of having a handler at all causes the wait below to return
+    // with EINTR, unlike if we used SIG_IGN.
     memset(&Act, 0, sizeof(Act));
     Act.sa_handler = TimeOutHandler;
     sigemptyset(&Act.sa_mask);
     sigaction(SIGALRM, &Act, &Old);
-    alarm(secondsToWait);
-  }
+    alarm(SecondsToWait);
+  } else if (SecondsToWait == 0)
+    WaitPidOptions = WNOHANG;
 
   // Parent process: Wait for the child process to terminate.
   int status;
-  uint64_t pid = reinterpret_cast<uint64_t>(Data);
-  pid_t child = static_cast<pid_t>(pid);
-  while (waitpid(pid, &status, 0) != child)
-    if (secondsToWait && errno == EINTR) {
-      // Kill the child.
-      kill(child, SIGKILL);
-
-      // Turn off the alarm and restore the signal handler
-      alarm(0);
-      sigaction(SIGALRM, &Old, 0);
-
-      // Wait for child to die
-      if (wait(&status) != child)
-        MakeErrMsg(ErrMsg, "Child timed out but wouldn't die");
-      else
-        MakeErrMsg(ErrMsg, "Child timed out", 0);
-
-      return -2;   // Timeout detected
-    } else if (errno != EINTR) {
-      MakeErrMsg(ErrMsg, "Error waiting for child process");
-      return -1;
+  ProcessInfo WaitResult;
+  WaitResult.Pid = waitpid(ChildPid, &status, WaitPidOptions);
+  if (WaitResult.Pid != PI.Pid) {
+    if (WaitResult.Pid == 0) {
+      // Non-blocking wait.
+      return WaitResult;
+    } else {
+      if (SecondsToWait && errno == EINTR) {
+        // Kill the child.
+        kill(PI.Pid, SIGKILL);
+
+        // Turn off the alarm and restore the signal handler
+        alarm(0);
+        sigaction(SIGALRM, &Old, 0);
+
+        // Wait for child to die
+        if (wait(&status) != ChildPid)
+          MakeErrMsg(ErrMsg, "Child timed out but wouldn't die");
+        else
+          MakeErrMsg(ErrMsg, "Child timed out", 0);
+
+        WaitResult.ReturnCode = -2; // Timeout detected
+        return WaitResult;
+      } else if (errno != EINTR) {
+        MakeErrMsg(ErrMsg, "Error waiting for child process");
+        WaitResult.ReturnCode = -1;
+        return WaitResult;
+      }
     }
+  }
 
   // We exited normally without timeout, so turn off the timer.
-  if (secondsToWait) {
+  if (SecondsToWait && !WaitUntilTerminates) {
     alarm(0);
     sigaction(SIGALRM, &Old, 0);
   }
@@ -362,24 +389,19 @@ static int Wait(void *&Data, StringRef Program, unsigned secondsToWait,
   int result = 0;
   if (WIFEXITED(status)) {
     result = WEXITSTATUS(status);
-#ifdef HAVE_POSIX_SPAWN
-    // The posix_spawn child process returns 127 on any kind of error.
-    // Following the POSIX convention for command-line tools (which posix_spawn
-    // itself apparently does not), check to see if the failure was due to some
-    // reason other than the file not existing, and return 126 in this case.
-    bool Exists;
-    if (result == 127 && !llvm::sys::fs::exists(Program, Exists) && Exists)
-      result = 126;
-#endif
+    WaitResult.ReturnCode = result;
+
     if (result == 127) {
       if (ErrMsg)
         *ErrMsg = llvm::sys::StrError(ENOENT);
-      return -1;
+      WaitResult.ReturnCode = -1;
+      return WaitResult;
     }
     if (result == 126) {
       if (ErrMsg)
         *ErrMsg = "Program could not be executed";
-      return -1;
+      WaitResult.ReturnCode = -1;
+      return WaitResult;
     }
   } else if (WIFSIGNALED(status)) {
     if (ErrMsg) {
@@ -391,18 +413,16 @@ static int Wait(void *&Data, StringRef Program, unsigned secondsToWait,
     }
     // Return a special value to indicate that the process received an unhandled
     // signal during execution as opposed to failing to execute.
-    return -2;
+    WaitResult.ReturnCode = -2;
   }
-  return result;
 #else
   if (ErrMsg)
     *ErrMsg = "Program::Wait is not implemented on this platform yet!";
-  return -1;
+  WaitResult.ReturnCode = -2;
 #endif
+  return WaitResult;
 }
 
-namespace llvm {
-
 error_code sys::ChangeStdinToBinary(){
   // Do nothing, as Unix doesn't differentiate between text and binary.
   return make_error_code(errc::success);
@@ -438,5 +458,4 @@ bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
   }
   return true;
 }
-
 }
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 800a6a7..13ae862 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -333,7 +333,7 @@ static void PrintStackTraceSignalHandler(void *) {
 void llvm::sys::PrintStackTraceOnErrorSignal() {
   AddSignalHandler(PrintStackTraceSignalHandler, 0);
 
-#if defined(__APPLE__) && !defined(ANDROID)
+#if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES)
   // Environment variable to disable any kind of crash dialog.
   if (getenv("LLVM_DISABLE_CRASH_REPORT")) {
     mach_port_t self = mach_task_self();
@@ -359,7 +359,7 @@ void llvm::sys::PrintStackTraceOnErrorSignal() {
 // the same linkage unit by just defining our own versions of the assert handler
 // and abort.
 
-#if defined(__APPLE__) && !defined(ANDROID)
+#if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES)
 
 #include <signal.h>
 #include <pthread.h>
diff --git a/lib/Support/Unix/ThreadLocal.inc b/lib/Support/Unix/ThreadLocal.inc
index 2b4c901..f14d0fa 100644
--- a/lib/Support/Unix/ThreadLocal.inc
+++ b/lib/Support/Unix/ThreadLocal.inc
@@ -18,7 +18,7 @@
 
 namespace llvm {
 using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() { }
+ThreadLocalImpl::ThreadLocalImpl() : data() { }
 ThreadLocalImpl::~ThreadLocalImpl() { }
 void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void*>(d);}
 const void* ThreadLocalImpl::getInstance() { return data; }
diff --git a/lib/Support/Unix/Unix.h b/lib/Support/Unix/Unix.h
index dd11c04..ba688e3 100644
--- a/lib/Support/Unix/Unix.h
+++ b/lib/Support/Unix/Unix.h
@@ -47,6 +47,10 @@
 # include <sys/wait.h>
 #endif
 
+#ifdef HAVE_DLFCN_H
+# include <dlfcn.h>
+#endif
+
 #ifndef WEXITSTATUS
 # define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8)
 #endif
diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 83da82a..5a7b219 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -71,7 +71,7 @@ extern "C" {
 
 DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
                                                    std::string *errMsg) {
-  SmartScopedLock<true> lock(getMutex());
+  SmartScopedLock<true> lock(*SymbolsMutex);
 
   if (!filename) {
     // When no file is specified, enumerate all DLLs and EXEs in the process.
@@ -83,8 +83,15 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
     // This is mostly to ensure that the return value still shows up as "valid".
     return DynamicLibrary(&OpenedHandles);
   }
+
+  SmallVector<wchar_t, MAX_PATH> filenameUnicode;
+  if (error_code ec = windows::UTF8ToUTF16(filename, filenameUnicode)) {
+    SetLastError(ec.value());
+    MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16: ");
+    return DynamicLibrary();
+  }
   
-  HMODULE a_handle = LoadLibrary(filename);
+  HMODULE a_handle = LoadLibraryW(filenameUnicode.data());
 
   if (a_handle == 0) {
     MakeErrMsg(errMsg, std::string(filename) + ": Can't open : ");
@@ -114,10 +121,10 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
 #undef EXPLICIT_SYMBOL2
 
 void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
-  SmartScopedLock<true> Lock(getMutex());
+  SmartScopedLock<true> Lock(*SymbolsMutex);
 
   // First check symbols added via AddSymbol().
-  if (ExplicitSymbols) {
+  if (ExplicitSymbols.isConstructed()) {
     StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
 
     if (i != ExplicitSymbols->end())
diff --git a/lib/Support/Windows/Memory.inc b/lib/Support/Windows/Memory.inc
index 4c5aebd..1260452 100644
--- a/lib/Support/Windows/Memory.inc
+++ b/lib/Support/Windows/Memory.inc
@@ -82,7 +82,7 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
 
   uintptr_t Start = NearBlock ? reinterpret_cast<uintptr_t>(NearBlock->base()) +
                                 NearBlock->size()
-                           : NULL;
+                           : 0;
 
   // If the requested address is not aligned to the allocation granularity,
   // round up to get beyond NearBlock. VirtualAlloc would have rounded down.
@@ -106,7 +106,7 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
   MemoryBlock Result;
   Result.Address = PA;
   Result.Size = NumBlocks*Granularity;
-                                 ;
+
   if (Flags & MF_EXEC)
     Memory::InvalidateInstructionCache(Result.Address, Result.Size);
 
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index 1694cb2..0b39198 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -37,70 +37,18 @@ typedef int errno_t;
 
 using namespace llvm;
 
+using llvm::sys::windows::UTF8ToUTF16;
+using llvm::sys::windows::UTF16ToUTF8;
+
 namespace {
   typedef BOOLEAN (WINAPI *PtrCreateSymbolicLinkW)(
     /*__in*/ LPCWSTR lpSymlinkFileName,
     /*__in*/ LPCWSTR lpTargetFileName,
     /*__in*/ DWORD dwFlags);
 
-  PtrCreateSymbolicLinkW create_symbolic_link_api = PtrCreateSymbolicLinkW(
-    ::GetProcAddress(::GetModuleHandleA("kernel32.dll"),
-                     "CreateSymbolicLinkW"));
-
-  error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16) {
-    int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
-                                    utf8.begin(), utf8.size(),
-                                    utf16.begin(), 0);
-
-    if (len == 0)
-      return windows_error(::GetLastError());
-
-    utf16.reserve(len + 1);
-    utf16.set_size(len);
-
-    len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
-                                    utf8.begin(), utf8.size(),
-                                    utf16.begin(), utf16.size());
-
-    if (len == 0)
-      return windows_error(::GetLastError());
-
-    // Make utf16 null terminated.
-    utf16.push_back(0);
-    utf16.pop_back();
-
-    return error_code::success();
-  }
-
-  error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
-                               SmallVectorImpl<char> &utf8) {
-    // Get length.
-    int len = ::WideCharToMultiByte(CP_UTF8, 0,
-                                    utf16, utf16_len,
-                                    utf8.begin(), 0,
-                                    NULL, NULL);
-
-    if (len == 0)
-      return windows_error(::GetLastError());
-
-    utf8.reserve(len);
-    utf8.set_size(len);
-
-    // Now do the actual conversion.
-    len = ::WideCharToMultiByte(CP_UTF8, 0,
-                                utf16, utf16_len,
-                                utf8.data(), utf8.size(),
-                                NULL, NULL);
-
-    if (len == 0)
-      return windows_error(::GetLastError());
-
-    // Make utf8 null terminated.
-    utf8.push_back(0);
-    utf8.pop_back();
-
-    return error_code::success();
-  }
+  PtrCreateSymbolicLinkW create_symbolic_link_api =
+      PtrCreateSymbolicLinkW(::GetProcAddress(
+          ::GetModuleHandleW(L"Kernel32.dll"), "CreateSymbolicLinkW"));
 
   error_code TempDir(SmallVectorImpl<wchar_t> &result) {
   retry_temp_dir:
@@ -180,7 +128,7 @@ retry_random_path:
       BYTE val = 0;
       if (!::CryptGenRandom(CryptoProvider, 1, &val))
           return windows_error(::GetLastError());
-      random_path_utf16.push_back("0123456789abcdef"[val & 15]);
+      random_path_utf16.push_back(L"0123456789abcdef"[val & 15]);
     }
     else
       random_path_utf16.push_back(*i);
@@ -268,9 +216,28 @@ namespace sys  {
 namespace fs {
 
 std::string getMainExecutable(const char *argv0, void *MainExecAddr) {
-  char pathname[MAX_PATH];
-  DWORD ret = ::GetModuleFileNameA(NULL, pathname, MAX_PATH);
-  return ret != MAX_PATH ? pathname : "";
+  SmallVector<wchar_t, MAX_PATH> PathName;
+  DWORD Size = ::GetModuleFileNameW(NULL, PathName.data(), PathName.capacity());
+
+  // A zero return value indicates a failure other than insufficient space.
+  if (Size == 0)
+    return "";
+
+  // Insufficient space is determined by a return value equal to the size of
+  // the buffer passed in.
+  if (Size == PathName.capacity())
+    return "";
+
+  // On success, GetModuleFileNameW returns the number of characters written to
+  // the buffer not including the NULL terminator.
+  PathName.set_size(Size);
+
+  // Convert the result from UTF-16 to UTF-8.
+  SmallVector<char, MAX_PATH> PathNameUTF8;
+  if (UTF16ToUTF8(PathName.data(), PathName.size(), PathNameUTF8))
+    return "";
+
+  return std::string(PathNameUTF8.data());
 }
 
 UniqueID file_status::getUniqueID() const {
@@ -293,47 +260,25 @@ TimeValue file_status::getLastModificationTime() const {
 }
 
 error_code current_path(SmallVectorImpl<char> &result) {
-  SmallVector<wchar_t, 128> cur_path;
-  cur_path.reserve(128);
-retry_cur_dir:
-  DWORD len = ::GetCurrentDirectoryW(cur_path.capacity(), cur_path.data());
-
-  // A zero return value indicates a failure other than insufficient space.
-  if (len == 0)
-    return windows_error(::GetLastError());
+  SmallVector<wchar_t, MAX_PATH> cur_path;
+  DWORD len = MAX_PATH;
 
-  // If there's insufficient space, the len returned is larger than the len
-  // given.
-  if (len > cur_path.capacity()) {
+  do {
     cur_path.reserve(len);
-    goto retry_cur_dir;
-  }
-
-  cur_path.set_size(len);
-  // cur_path now holds the current directory in utf-16. Convert to utf-8.
+    len = ::GetCurrentDirectoryW(cur_path.capacity(), cur_path.data());
 
-  // Find out how much space we need. Sadly, this function doesn't return the
-  // size needed unless you tell it the result size is 0, which means you
-  // _always_ have to call it twice.
-  len = ::WideCharToMultiByte(CP_UTF8, 0,
-                              cur_path.data(), cur_path.size(),
-                              result.data(), 0,
-                              NULL, NULL);
-
-  if (len == 0)
-    return make_error_code(windows_error(::GetLastError()));
+    // A zero return value indicates a failure other than insufficient space.
+    if (len == 0)
+      return windows_error(::GetLastError());
 
-  result.reserve(len);
-  result.set_size(len);
-  // Now do the actual conversion.
-  len = ::WideCharToMultiByte(CP_UTF8, 0,
-                              cur_path.data(), cur_path.size(),
-                              result.data(), result.size(),
-                              NULL, NULL);
-  if (len == 0)
-    return windows_error(::GetLastError());
+    // If there's insufficient space, the len returned is larger than the len
+    // given.
+  } while (len > cur_path.capacity());
 
-  return error_code::success();
+  // On success, GetCurrentDirectoryW returns the number of characters not
+  // including the null-terminator.
+  cur_path.set_size(len);
+  return UTF16ToUTF8(cur_path.begin(), cur_path.size(), result);
 }
 
 error_code create_directory(const Twine &path, bool &existed) {
@@ -746,12 +691,11 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
   case priv:      flprotect = PAGE_WRITECOPY; break;
   }
 
-  FileMappingHandle = ::CreateFileMapping(FileHandle,
-                                          0,
-                                          flprotect,
-                                          Size >> 32,
-                                          Size & 0xffffffff,
-                                          0);
+  FileMappingHandle =
+      ::CreateFileMappingW(FileHandle, 0, flprotect,
+                           (Offset + Size) >> 32,
+                           (Offset + Size) & 0xffffffff,
+                           0);
   if (FileMappingHandle == NULL) {
     error_code ec = windows_error(GetLastError());
     if (FileDescriptor) {
@@ -816,7 +760,7 @@ mapped_file_region::mapped_file_region(const Twine &path,
                                        mapmode mode,
                                        uint64_t length,
                                        uint64_t offset,
-                                       error_code &ec) 
+                                       error_code &ec)
   : Mode(mode)
   , Size(length)
   , Mapping()
@@ -1018,7 +962,7 @@ error_code detail::directory_iterator_increment(detail::DirIterState &it) {
   return error_code::success();
 }
 
-error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,  
+error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
                                             bool map_writable, void *&result) {
   assert(0 && "NOT IMPLEMENTED");
   return windows_error::invalid_function;
@@ -1078,7 +1022,7 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD,
   DWORD CreationDisposition;
   if (Flags & F_Excl)
     CreationDisposition = CREATE_NEW;
-  else if (Flags & F_Append) 
+  else if (Flags & F_Append)
     CreationDisposition = OPEN_ALWAYS;
   else
     CreationDisposition = CREATE_ALWAYS;
@@ -1115,7 +1059,64 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD,
   ResultFD = FD;
   return error_code::success();
 }
-
 } // end namespace fs
+
+namespace windows {
+llvm::error_code UTF8ToUTF16(llvm::StringRef utf8,
+                             llvm::SmallVectorImpl<wchar_t> &utf16) {
+  int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
+                                  utf8.begin(), utf8.size(),
+                                  utf16.begin(), 0);
+
+  if (len == 0)
+    return llvm::windows_error(::GetLastError());
+
+  utf16.reserve(len + 1);
+  utf16.set_size(len);
+
+  len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
+                              utf8.begin(), utf8.size(),
+                              utf16.begin(), utf16.size());
+
+  if (len == 0)
+    return llvm::windows_error(::GetLastError());
+
+  // Make utf16 null terminated.
+  utf16.push_back(0);
+  utf16.pop_back();
+
+  return llvm::error_code::success();
+}
+
+llvm::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+                             llvm::SmallVectorImpl<char> &utf8) {
+  // Get length.
+  int len = ::WideCharToMultiByte(CP_UTF8, 0,
+                                  utf16, utf16_len,
+                                  utf8.begin(), 0,
+                                  NULL, NULL);
+
+  if (len == 0)
+    return llvm::windows_error(::GetLastError());
+
+  utf8.reserve(len);
+  utf8.set_size(len);
+
+  // Now do the actual conversion.
+  len = ::WideCharToMultiByte(CP_UTF8, 0,
+                              utf16, utf16_len,
+                              utf8.data(), utf8.size(),
+                              NULL, NULL);
+
+  if (len == 0)
+    return llvm::windows_error(::GetLastError());
+
+  // Make utf8 null terminated.
+  utf8.push_back(0);
+  utf8.pop_back();
+
+  return llvm::error_code::success();
+}
+} // end namespace windows
 } // end namespace sys
 } // end namespace llvm
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index 359b99f..f9a3db9 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -11,18 +11,25 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/Allocator.h"
+
 #include "Windows.h"
 #include <direct.h>
 #include <io.h>
 #include <malloc.h>
 #include <psapi.h>
+#include <shellapi.h>
 
 #ifdef __MINGW32__
  #if (HAVE_LIBPSAPI != 1)
   #error "libpsapi.a should be present"
  #endif
+ #if (HAVE_LIBSHELL32 != 1)
+  #error "libshell32.a should be present"
+ #endif
 #else
  #pragma comment(lib, "psapi.lib")
+ #pragma comment(lib, "shell32.lib")
 #endif
 
 //===----------------------------------------------------------------------===//
@@ -83,6 +90,8 @@ static unsigned getPageSize() {
   // that LLVM ought to run as 64-bits on a 64-bit system, anyway.
   SYSTEM_INFO info;
   GetSystemInfo(&info);
+  // FIXME: FileOffset in MapViewOfFile() should be aligned to not dwPageSize,
+  // but dwAllocationGranularity.
   return static_cast<unsigned>(info.dwPageSize);
 }
 
@@ -119,28 +128,89 @@ void Process::GetTimeUsage(TimeValue &elapsed, TimeValue &user_time,
   sys_time = getTimeValueFromFILETIME(KernelTime);
 }
 
-int Process::GetCurrentUserId()
-{
-  return 65536;
-}
-
-int Process::GetCurrentGroupId()
-{
-  return 65536;
-}
-
 // Some LLVM programs such as bugpoint produce core files as a normal part of
-// their operation. To prevent the disk from filling up, this configuration item
-// does what's necessary to prevent their generation.
+// their operation. To prevent the disk from filling up, this configuration
+// item does what's necessary to prevent their generation.
 void Process::PreventCoreFiles() {
-  // Windows doesn't do core files, but it does do modal pop-up message
-  // boxes.  As this method is used by bugpoint, preventing these pop-ups
-  // is the moral equivalent of suppressing core files.
+  // Windows does have the concept of core files, called minidumps.  However,
+  // disabling minidumps for a particular application extends past the lifetime
+  // of that application, which is the incorrect behavior for this API.
+  // Additionally, the APIs require elevated privileges to disable and re-
+  // enable minidumps, which makes this untenable. For more information, see
+  // WerAddExcludedApplication and WerRemoveExcludedApplication (Vista and
+  // later).
+  //
+  // Windows also has modal pop-up message boxes.  As this method is used by
+  // bugpoint, preventing these pop-ups is additionally important.
   SetErrorMode(SEM_FAILCRITICALERRORS |
                SEM_NOGPFAULTERRORBOX |
                SEM_NOOPENFILEERRORBOX);
 }
 
+/// Returns the environment variable \arg Name's value as a string encoded in
+/// UTF-8. \arg Name is assumed to be in UTF-8 encoding.
+Optional<std::string> Process::GetEnv(StringRef Name) {
+  // Convert the argument to UTF-16 to pass it to _wgetenv().
+  SmallVector<wchar_t, 128> NameUTF16;
+  if (error_code ec = windows::UTF8ToUTF16(Name, NameUTF16))
+    return None;
+
+  // Environment variable can be encoded in non-UTF8 encoding, and there's no
+  // way to know what the encoding is. The only reliable way to look up
+  // multibyte environment variable is to use GetEnvironmentVariableW().
+  SmallVector<wchar_t, MAX_PATH> Buf;
+  size_t Size = MAX_PATH;
+  do {
+    Buf.reserve(Size);
+    Size =
+        GetEnvironmentVariableW(NameUTF16.data(), Buf.data(), Buf.capacity());
+    if (Size == 0)
+      return None;
+
+    // Try again with larger buffer.
+  } while (Size > Buf.capacity());
+  Buf.set_size(Size);
+
+  // Convert the result from UTF-16 to UTF-8.
+  SmallVector<char, MAX_PATH> Res;
+  if (error_code ec = windows::UTF16ToUTF8(Buf.data(), Size, Res))
+    return None;
+  return std::string(Res.data());
+}
+
+error_code
+Process::GetArgumentVector(SmallVectorImpl<const char *> &Args,
+                           ArrayRef<const char *>,
+                           SpecificBumpPtrAllocator<char> &ArgAllocator) {
+  int NewArgCount;
+  error_code ec;
+
+  wchar_t **UnicodeCommandLine = CommandLineToArgvW(GetCommandLineW(),
+                                                    &NewArgCount);
+  if (!UnicodeCommandLine)
+    return windows_error(::GetLastError());
+
+  Args.reserve(NewArgCount);
+
+  for (int i = 0; i < NewArgCount; ++i) {
+    SmallVector<char, MAX_PATH> NewArgString;
+    ec = windows::UTF16ToUTF8(UnicodeCommandLine[i],
+                              wcslen(UnicodeCommandLine[i]),
+                              NewArgString);
+    if (ec)
+      break;
+
+    char *Buffer = ArgAllocator.Allocate(NewArgString.size() + 1);
+    ::memcpy(Buffer, NewArgString.data(), NewArgString.size() + 1);
+    Args.push_back(Buffer);
+  }
+  LocalFree(UnicodeCommandLine);
+  if (ec)
+    return ec;
+
+  return error_code::success();
+}
+
 bool Process::StandardInIsUserInput() {
   return FileDescriptorIsDisplayed(0);
 }
@@ -187,6 +257,11 @@ bool Process::StandardErrHasColors() {
   return FileDescriptorHasColors(2);
 }
 
+static bool UseANSI = false;
+void Process::UseANSIEscapeCodes(bool enable) {
+  UseANSI = enable;
+}
+
 namespace {
 class DefaultColors
 {
@@ -208,10 +283,12 @@ DefaultColors defaultColors;
 }
 
 bool Process::ColorNeedsFlush() {
-  return true;
+  return !UseANSI;
 }
 
 const char *Process::OutputBold(bool bg) {
+  if (UseANSI) return "\033[1m";
+
   WORD colors = DefaultColors::GetCurrentColor();
   if (bg)
     colors |= BACKGROUND_INTENSITY;
@@ -222,6 +299,8 @@ const char *Process::OutputBold(bool bg) {
 }
 
 const char *Process::OutputColor(char code, bool bold, bool bg) {
+  if (UseANSI) return colorcodes[bg?1:0][bold?1:0][code&7];
+
   WORD colors;
   if (bg) {
     colors = ((code&1) ? BACKGROUND_RED : 0) |
@@ -247,6 +326,8 @@ static WORD GetConsoleTextAttribute(HANDLE hConsoleOutput) {
 }
 
 const char *Process::OutputReverse() {
+  if (UseANSI) return "\033[7m";
+
   const WORD attributes
    = GetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE));
 
@@ -273,6 +354,7 @@ const char *Process::OutputReverse() {
 }
 
 const char *Process::ResetColor() {
+  if (UseANSI) return "\033[0m";
   SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), defaultColors());
   return 0;
 }
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 8165ef4..dc09738 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -24,16 +24,11 @@
 //===          and must not be UNIX code
 //===----------------------------------------------------------------------===//
 
-namespace {
-  struct Win32ProcessInfo {
-    HANDLE hProcess;
-    DWORD  dwProcessId;
-  };
-}
-
 namespace llvm {
 using namespace sys;
 
+ProcessInfo::ProcessInfo() : ProcessHandle(0), Pid(0), ReturnCode(0) {}
+
 // This function just uses the PATH environment variable to find the program.
 std::string sys::FindProgramByName(const std::string &progName) {
   // Check some degenerate cases
@@ -47,42 +42,39 @@ std::string sys::FindProgramByName(const std::string &progName) {
 
   // At this point, the file name is valid and does not contain slashes.
   // Let Windows search for it.
-  std::string buffer;
-  buffer.resize(MAX_PATH);
-  char *dummy = NULL;
-  DWORD len = SearchPath(NULL, progName.c_str(), ".exe", MAX_PATH,
-                         &buffer[0], &dummy);
-
-  // See if it wasn't found.
-  if (len == 0)
+  SmallVector<wchar_t, MAX_PATH> progNameUnicode;
+  if (windows::UTF8ToUTF16(progName, progNameUnicode))
     return "";
 
-  // See if we got the entire path.
-  if (len < MAX_PATH)
-    return buffer;
+  SmallVector<wchar_t, MAX_PATH> buffer;
+  DWORD len = MAX_PATH;
+  do {
+    buffer.reserve(len);
+    len = ::SearchPathW(NULL, progNameUnicode.data(), L".exe",
+                        buffer.capacity(), buffer.data(), NULL);
 
-  // Buffer was too small; grow and retry.
-  while (true) {
-    buffer.resize(len+1);
-    DWORD len2 = SearchPath(NULL, progName.c_str(), ".exe", len+1, &buffer[0], &dummy);
-
-    // It is unlikely the search failed, but it's always possible some file
-    // was added or removed since the last search, so be paranoid...
-    if (len2 == 0)
+    // See if it wasn't found.
+    if (len == 0)
       return "";
-    else if (len2 <= len)
-      return buffer;
 
-    len = len2;
-  }
+    // Buffer was too small; grow and retry.
+  } while (len > buffer.capacity());
+
+  buffer.set_size(len);
+  SmallVector<char, MAX_PATH> result;
+  if (windows::UTF16ToUTF8(buffer.begin(), buffer.size(), result))
+    return "";
+
+  return std::string(result.data(), result.size());
 }
 
 static HANDLE RedirectIO(const StringRef *path, int fd, std::string* ErrMsg) {
   HANDLE h;
   if (path == 0) {
-    DuplicateHandle(GetCurrentProcess(), (HANDLE)_get_osfhandle(fd),
-                    GetCurrentProcess(), &h,
-                    0, TRUE, DUPLICATE_SAME_ACCESS);
+    if (!DuplicateHandle(GetCurrentProcess(), (HANDLE)_get_osfhandle(fd),
+                         GetCurrentProcess(), &h,
+                         0, TRUE, DUPLICATE_SAME_ACCESS))
+      return INVALID_HANDLE_VALUE;
     return h;
   }
 
@@ -97,9 +89,13 @@ static HANDLE RedirectIO(const StringRef *path, int fd, std::string* ErrMsg) {
   sa.lpSecurityDescriptor = 0;
   sa.bInheritHandle = TRUE;
 
-  h = CreateFile(fname.c_str(), fd ? GENERIC_WRITE : GENERIC_READ,
-                 FILE_SHARE_READ, &sa, fd == 0 ? OPEN_EXISTING : CREATE_ALWAYS,
-                 FILE_ATTRIBUTE_NORMAL, NULL);
+  SmallVector<wchar_t, 128> fnameUnicode;
+  if (windows::UTF8ToUTF16(fname, fnameUnicode))
+    return INVALID_HANDLE_VALUE;
+
+  h = CreateFileW(fnameUnicode.data(), fd ? GENERIC_WRITE : GENERIC_READ,
+                  FILE_SHARE_READ, &sa, fd == 0 ? OPEN_EXISTING : CREATE_ALWAYS,
+                  FILE_ATTRIBUTE_NORMAL, NULL);
   if (h == INVALID_HANDLE_VALUE) {
     MakeErrMsg(ErrMsg, std::string(fname) + ": Can't open file for " +
         (fd ? "input: " : "output: "));
@@ -171,13 +167,9 @@ static unsigned int ArgLenWithQuotes(const char *Str) {
 
 }
 
-static bool Execute(void **Data,
-                    StringRef Program,
-                    const char** args,
-                    const char** envp,
-                    const StringRef** redirects,
-                    unsigned memoryLimit,
-                    std::string* ErrMsg) {
+static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
+                    const char **envp, const StringRef **redirects,
+                    unsigned memoryLimit, std::string *ErrMsg) {
   if (!sys::fs::can_execute(Program)) {
     if (ErrMsg)
       *ErrMsg = "program not executable";
@@ -227,34 +219,28 @@ static bool Execute(void **Data,
   *p = 0;
 
   // The pointer to the environment block for the new process.
-  OwningArrayPtr<char> envblock;
+  std::vector<wchar_t> EnvBlock;
 
   if (envp) {
     // An environment block consists of a null-terminated block of
     // null-terminated strings. Convert the array of environment variables to
     // an environment block by concatenating them.
+    for (unsigned i = 0; envp[i]; ++i) {
+      SmallVector<wchar_t, MAX_PATH> EnvString;
+      if (error_code ec = windows::UTF8ToUTF16(envp[i], EnvString)) {
+        SetLastError(ec.value());
+        MakeErrMsg(ErrMsg, "Unable to convert environment variable to UTF-16");
+        return false;
+      }
 
-    // First, determine the length of the environment block.
-    len = 0;
-    for (unsigned i = 0; envp[i]; i++)
-      len += strlen(envp[i]) + 1;
-
-    // Now build the environment block.
-    envblock.reset(new char[len+1]);
-    p = envblock.get();
-
-    for (unsigned i = 0; envp[i]; i++) {
-      const char *ev = envp[i];
-      size_t len = strlen(ev) + 1;
-      memcpy(p, ev, len);
-      p += len;
+      EnvBlock.insert(EnvBlock.end(), EnvString.begin(), EnvString.end());
+      EnvBlock.push_back(0);
     }
-
-    *p = 0;
+    EnvBlock.push_back(0);
   }
 
   // Create a child process.
-  STARTUPINFO si;
+  STARTUPINFOW si;
   memset(&si, 0, sizeof(si));
   si.cb = sizeof(si);
   si.hStdInput = INVALID_HANDLE_VALUE;
@@ -278,9 +264,14 @@ static bool Execute(void **Data,
     if (redirects[1] && redirects[2] && *(redirects[1]) == *(redirects[2])) {
       // If stdout and stderr should go to the same place, redirect stderr
       // to the handle already open for stdout.
-      DuplicateHandle(GetCurrentProcess(), si.hStdOutput,
-                      GetCurrentProcess(), &si.hStdError,
-                      0, TRUE, DUPLICATE_SAME_ACCESS);
+      if (!DuplicateHandle(GetCurrentProcess(), si.hStdOutput,
+                           GetCurrentProcess(), &si.hStdError,
+                           0, TRUE, DUPLICATE_SAME_ACCESS)) {
+        CloseHandle(si.hStdInput);
+        CloseHandle(si.hStdOutput);
+        MakeErrMsg(ErrMsg, "can't dup stderr to stdout");
+        return false;
+      }
     } else {
       // Just redirect stderr
       si.hStdError = RedirectIO(redirects[2], 2, ErrMsg);
@@ -298,9 +289,27 @@ static bool Execute(void **Data,
 
   fflush(stdout);
   fflush(stderr);
-  std::string ProgramStr = Program;
-  BOOL rc = CreateProcess(ProgramStr.c_str(), command.get(), NULL, NULL, TRUE,
-                          0, envblock.get(), NULL, &si, &pi);
+
+  SmallVector<wchar_t, MAX_PATH> ProgramUtf16;
+  if (error_code ec = windows::UTF8ToUTF16(Program, ProgramUtf16)) {
+    SetLastError(ec.value());
+    MakeErrMsg(ErrMsg,
+               std::string("Unable to convert application name to UTF-16"));
+    return false;
+  }
+
+  SmallVector<wchar_t, MAX_PATH> CommandUtf16;
+  if (error_code ec = windows::UTF8ToUTF16(command.get(), CommandUtf16)) {
+    SetLastError(ec.value());
+    MakeErrMsg(ErrMsg,
+               std::string("Unable to convert command-line to UTF-16"));
+    return false;
+  }
+
+  BOOL rc = CreateProcessW(ProgramUtf16.data(), CommandUtf16.data(), 0, 0,
+                           TRUE, CREATE_UNICODE_ENVIRONMENT,
+                           EnvBlock.empty() ? 0 : EnvBlock.data(), 0, &si,
+                           &pi);
   DWORD err = GetLastError();
 
   // Regardless of whether the process got created or not, we are done with
@@ -313,15 +322,12 @@ static bool Execute(void **Data,
   if (!rc) {
     SetLastError(err);
     MakeErrMsg(ErrMsg, std::string("Couldn't execute program '") +
-               ProgramStr + "'");
+               Program.str() + "'");
     return false;
   }
-  if (Data) {
-    Win32ProcessInfo* wpi = new Win32ProcessInfo;
-    wpi->hProcess = pi.hProcess;
-    wpi->dwProcessId = pi.dwProcessId;
-    *Data = wpi;
-  }
+
+  PI.Pid = pi.dwProcessId;
+  PI.ProcessHandle = pi.hProcess;
 
   // Make sure these get closed no matter what.
   ScopedCommonHandle hThread(pi.hThread);
@@ -329,7 +335,7 @@ static bool Execute(void **Data,
   // Assign the process to a job if a memory limit is defined.
   ScopedJobHandle hJob;
   if (memoryLimit != 0) {
-    hJob = CreateJobObject(0, 0);
+    hJob = CreateJobObjectW(0, 0);
     bool success = false;
     if (hJob) {
       JOBOBJECT_EXTENDED_LIMIT_INFORMATION jeli;
@@ -351,68 +357,72 @@ static bool Execute(void **Data,
     }
   }
 
-  // Don't leak the handle if the caller doesn't want it.
-  if (!Data)
-    CloseHandle(pi.hProcess);
-
   return true;
 }
 
-static int WaitAux(Win32ProcessInfo *wpi, unsigned secondsToWait,
-                   std::string *ErrMsg) {
-  // Wait for the process to terminate.
-  HANDLE hProcess = wpi->hProcess;
-  DWORD millisecondsToWait = INFINITE;
-  if (secondsToWait > 0)
-    millisecondsToWait = secondsToWait * 1000;
-
-  if (WaitForSingleObject(hProcess, millisecondsToWait) == WAIT_TIMEOUT) {
-    if (!TerminateProcess(hProcess, 1)) {
-      MakeErrMsg(ErrMsg, "Failed to terminate timed-out program.");
-      // -2 indicates a crash or timeout as opposed to failure to execute.
-      return -2;
+namespace llvm {
+ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
+                      bool WaitUntilChildTerminates, std::string *ErrMsg) {
+  assert(PI.Pid && "invalid pid to wait on, process not started?");
+  assert(PI.ProcessHandle &&
+         "invalid process handle to wait on, process not started?");
+  DWORD milliSecondsToWait = 0;
+  if (WaitUntilChildTerminates)
+    milliSecondsToWait = INFINITE;
+  else if (SecondsToWait > 0)
+    milliSecondsToWait = SecondsToWait * 1000;
+
+  ProcessInfo WaitResult = PI;
+  DWORD WaitStatus = WaitForSingleObject(PI.ProcessHandle, milliSecondsToWait);
+  if (WaitStatus == WAIT_TIMEOUT) {
+    if (SecondsToWait) {
+      if (!TerminateProcess(PI.ProcessHandle, 1)) {
+        if (ErrMsg)
+          MakeErrMsg(ErrMsg, "Failed to terminate timed-out program.");
+
+        // -2 indicates a crash or timeout as opposed to failure to execute.
+        WaitResult.ReturnCode = -2;
+        CloseHandle(PI.ProcessHandle);
+        return WaitResult;
+      }
+      WaitForSingleObject(PI.ProcessHandle, INFINITE);
+      CloseHandle(PI.ProcessHandle);
+    } else {
+      // Non-blocking wait.
+      return ProcessInfo();
     }
-    WaitForSingleObject(hProcess, INFINITE);
   }
 
   // Get its exit status.
   DWORD status;
-  BOOL rc = GetExitCodeProcess(hProcess, &status);
+  BOOL rc = GetExitCodeProcess(PI.ProcessHandle, &status);
   DWORD err = GetLastError();
+  CloseHandle(PI.ProcessHandle);
 
   if (!rc) {
     SetLastError(err);
-    MakeErrMsg(ErrMsg, "Failed getting status for program.");
+    if (ErrMsg)
+      MakeErrMsg(ErrMsg, "Failed getting status for program.");
+
     // -2 indicates a crash or timeout as opposed to failure to execute.
-    return -2;
+    WaitResult.ReturnCode = -2;
+    return WaitResult;
   }
 
   if (!status)
-    return 0;
+    return WaitResult;
 
   // Pass 10(Warning) and 11(Error) to the callee as negative value.
   if ((status & 0xBFFF0000U) == 0x80000000U)
-    return (int)status;
-
-  if (status & 0xFF)
-    return status & 0x7FFFFFFF;
-
-  return 1;
-}
-
-static int Wait(void *&Data, StringRef Program, unsigned secondsToWait,
-                std::string *ErrMsg) {
-  Win32ProcessInfo *wpi = reinterpret_cast<Win32ProcessInfo *>(Data);
-  int Ret = WaitAux(wpi, secondsToWait, ErrMsg);
-
-  CloseHandle(wpi->hProcess);
-  delete wpi;
-  Data = 0;
+    WaitResult.ReturnCode = static_cast<int>(status);
+  else if (status & 0xFF)
+    WaitResult.ReturnCode = status & 0x7FFFFFFF;
+  else
+    WaitResult.ReturnCode = 1;
 
-  return Ret;
+  return WaitResult;
 }
 
-namespace llvm {
 error_code sys::ChangeStdinToBinary(){
   int result = _setmode( _fileno(stdin), _O_BINARY );
   if (result == -1)
@@ -449,5 +459,4 @@ bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
   }
   return true;
 }
-
 }
diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc
index 9593923..c431844 100644
--- a/lib/Support/Windows/RWMutex.inc
+++ b/lib/Support/Windows/RWMutex.inc
@@ -48,8 +48,7 @@ static bool loadSRW() {
   if (!sChecked) {
     sChecked = true;
 
-    HMODULE hLib = ::LoadLibrary(TEXT("Kernel32"));
-    if (hLib) {
+    if (HMODULE hLib = ::GetModuleHandleW(L"Kernel32.dll")) {
       fpInitializeSRWLock =
         (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
                                                "InitializeSRWLock");
@@ -65,7 +64,6 @@ static bool loadSRW() {
       fpReleaseSRWLockShared =
         (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
                                                "ReleaseSRWLockShared");
-      ::FreeLibrary(hLib);
 
       if (fpInitializeSRWLock != NULL) {
         sHasSRW = true;
diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc
index bce83b9..4b40d51 100644
--- a/lib/Support/Windows/Signals.inc
+++ b/lib/Support/Windows/Signals.inc
@@ -135,7 +135,7 @@ typedef PVOID (WINAPI *fpSymFunctionTableAccess64)(HANDLE, DWORD64);
 static fpSymFunctionTableAccess64 SymFunctionTableAccess64;
 
 static bool load64BitDebugHelp(void) {
-  HMODULE hLib = ::LoadLibrary("Dbghelp.dll");
+  HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll");
   if (hLib) {
     StackWalk64 = (fpStackWalk64)
                       ::GetProcAddress(hLib, "StackWalk64");
diff --git a/lib/Support/Windows/TimeValue.inc b/lib/Support/Windows/TimeValue.inc
index 96f5579..98b07d6 100644
--- a/lib/Support/Windows/TimeValue.inc
+++ b/lib/Support/Windows/TimeValue.inc
@@ -12,10 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "Windows.h"
+#include <cctype>
 #include <time.h>
 
-namespace llvm {
-using namespace sys;
+using namespace llvm;
+using namespace llvm::sys;
 
 //===----------------------------------------------------------------------===//
 //=== WARNING: Implementation here must contain only Win32 specific code.
@@ -49,13 +50,10 @@ std::string TimeValue::str() const {
   char Buffer[25];
   // FIXME: the windows version of strftime doesn't support %e
   strftime(Buffer, 25, "%b %d %H:%M %Y", LT);
-  assert((Buffer[3] == ' ' && isdigit(Buffer[5]) && Buffer[6] == ' ') ||
+  assert((Buffer[3] == ' ' && isdigit(Buffer[5]) && Buffer[6] == ' ') &&
          "Unexpected format in strftime()!");
   // Emulate %e on %d to mute '0'.
   if (Buffer[4] == '0')
     Buffer[4] = ' ';
   return std::string(Buffer);
 }
-
-
-}
diff --git a/lib/Support/Windows/Windows.h b/lib/Support/Windows/Windows.h
index 4cdac78..1f3417d 100644
--- a/lib/Support/Windows/Windows.h
+++ b/lib/Support/Windows/Windows.h
@@ -24,23 +24,31 @@
 #define _WIN32_IE    0x0600 // MinGW at it again.
 #define WIN32_LEAN_AND_MEAN
 
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Config/config.h" // Get build system configuration settings
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/system_error.h"
 #include <windows.h>
 #include <wincrypt.h>
-#include <shlobj.h>
 #include <cassert>
 #include <string>
+#include <vector>
 
 inline bool MakeErrMsg(std::string* ErrMsg, const std::string& prefix) {
   if (!ErrMsg)
     return true;
   char *buffer = NULL;
-  FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER|FORMAT_MESSAGE_FROM_SYSTEM,
-      NULL, GetLastError(), 0, (LPSTR)&buffer, 1, NULL);
-  *ErrMsg = prefix + buffer;
+  DWORD R = FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+                          FORMAT_MESSAGE_FROM_SYSTEM,
+                          NULL, GetLastError(), 0, (LPSTR)&buffer, 1, NULL);
+  if (R)
+    *ErrMsg = prefix + buffer;
+  else
+    *ErrMsg = prefix + "Unknown error";
+
   LocalFree(buffer);
-  return true;
+  return R != 0;
 }
 
 template <typename HandleTraits>
@@ -148,4 +156,13 @@ c_str(SmallVectorImpl<T> &str) {
   str.pop_back();
   return str.data();
 }
+
+namespace sys {
+namespace windows {
+error_code UTF8ToUTF16(StringRef utf8,
+                       SmallVectorImpl<wchar_t> &utf16);
+error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+                       SmallVectorImpl<char> &utf8);
+} // end namespace windows
+} // end namespace sys
 } // end namespace llvm.
diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp
index 213f5e1..9495cd4 100644
--- a/lib/Support/YAMLParser.cpp
+++ b/lib/Support/YAMLParser.cpp
@@ -96,6 +96,15 @@ static EncodingInfo getUnicodeEncoding(StringRef Input) {
 
 namespace llvm {
 namespace yaml {
+/// Pin the vtables to this file.
+void Node::anchor() {}
+void NullNode::anchor() {}
+void ScalarNode::anchor() {}
+void KeyValueNode::anchor() {}
+void MappingNode::anchor() {}
+void SequenceNode::anchor() {}
+void AliasNode::anchor() {}
+
 /// Token - A single YAML token.
 struct Token : ilist_node<Token> {
   enum TokenKind {
@@ -1070,14 +1079,22 @@ bool Scanner::scanDirective() {
   Current = skip_while(&Scanner::skip_ns_char, Current);
   StringRef Name(NameStart, Current - NameStart);
   Current = skip_while(&Scanner::skip_s_white, Current);
-
+  
+  Token T;
   if (Name == "YAML") {
     Current = skip_while(&Scanner::skip_ns_char, Current);
-    Token T;
     T.Kind = Token::TK_VersionDirective;
     T.Range = StringRef(Start, Current - Start);
     TokenQueue.push_back(T);
     return true;
+  } else if(Name == "TAG") {
+    Current = skip_while(&Scanner::skip_ns_char, Current);
+    Current = skip_while(&Scanner::skip_s_white, Current);
+    Current = skip_while(&Scanner::skip_ns_char, Current);
+    T.Kind = Token::TK_TagDirective;
+    T.Range = StringRef(Start, Current - Start);
+    TokenQueue.push_back(T);
+    return true;
   }
   return false;
 }
@@ -1564,10 +1581,6 @@ void Stream::printError(Node *N, const Twine &Msg) {
                      , Ranges);
 }
 
-void Stream::handleYAMLDirective(const Token &t) {
-  // TODO: Ensure version is 1.x.
-}
-
 document_iterator Stream::begin() {
   if (CurrentDoc)
     report_fatal_error("Can only iterate over the stream once");
@@ -1588,14 +1601,59 @@ void Stream::skip() {
     i->skip();
 }
 
-Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A)
+Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A, StringRef T)
   : Doc(D)
   , TypeID(Type)
-  , Anchor(A) {
+  , Anchor(A)
+  , Tag(T) {
   SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
   SourceRange = SMRange(Start, Start);
 }
 
+std::string Node::getVerbatimTag() const {
+  StringRef Raw = getRawTag();
+  if (!Raw.empty() && Raw != "!") {
+    std::string Ret;
+    if (Raw.find_last_of('!') == 0) {
+      Ret = Doc->getTagMap().find("!")->second;
+      Ret += Raw.substr(1);
+      return llvm_move(Ret);
+    } else if (Raw.startswith("!!")) {
+      Ret = Doc->getTagMap().find("!!")->second;
+      Ret += Raw.substr(2);
+      return llvm_move(Ret);
+    } else {
+      StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
+      std::map<StringRef, StringRef>::const_iterator It =
+          Doc->getTagMap().find(TagHandle);
+      if (It != Doc->getTagMap().end())
+        Ret = It->second;
+      else {
+        Token T;
+        T.Kind = Token::TK_Tag;
+        T.Range = TagHandle;
+        setError(Twine("Unknown tag handle ") + TagHandle, T);
+      }
+      Ret += Raw.substr(Raw.find_last_of('!') + 1);
+      return llvm_move(Ret);
+    }
+  }
+
+  switch (getType()) {
+  case NK_Null:
+    return "tag:yaml.org,2002:null";
+  case NK_Scalar:
+    // TODO: Tag resolution.
+    return "tag:yaml.org,2002:str";
+  case NK_Mapping:
+    return "tag:yaml.org,2002:map";
+  case NK_Sequence:
+    return "tag:yaml.org,2002:seq";
+  }
+
+  return "";
+}
+
 Token &Node::peekNext() {
   return Doc->peekNext();
 }
@@ -1999,6 +2057,10 @@ void SequenceNode::increment() {
 }
 
 Document::Document(Stream &S) : stream(S), Root(0) {
+  // Tag maps starts with two default mappings.
+  TagMap["!"] = "!";
+  TagMap["!!"] = "tag:yaml.org,2002:";
+
   if (parseDirectives())
     expectToken(Token::TK_DocumentStart);
   Token &T = peekNext();
@@ -2042,6 +2104,7 @@ Node *Document::parseBlockNode() {
   Token T = peekNext();
   // Handle properties.
   Token AnchorInfo;
+  Token TagInfo;
 parse_property:
   switch (T.Kind) {
   case Token::TK_Alias:
@@ -2056,7 +2119,11 @@ parse_property:
     T = peekNext();
     goto parse_property;
   case Token::TK_Tag:
-    getNext(); // Skip TK_Tag.
+    if (TagInfo.Kind == Token::TK_Tag) {
+      setError("Already encountered a tag for this node!", T);
+      return 0;
+    }
+    TagInfo = getNext(); // Consume TK_Tag.
     T = peekNext();
     goto parse_property;
   default:
@@ -2070,42 +2137,49 @@ parse_property:
     // Don't eat the TK_BlockEntry, SequenceNode needs it.
     return new (NodeAllocator) SequenceNode( stream.CurrentDoc
                                            , AnchorInfo.Range.substr(1)
+                                           , TagInfo.Range
                                            , SequenceNode::ST_Indentless);
   case Token::TK_BlockSequenceStart:
     getNext();
     return new (NodeAllocator)
       SequenceNode( stream.CurrentDoc
                   , AnchorInfo.Range.substr(1)
+                  , TagInfo.Range
                   , SequenceNode::ST_Block);
   case Token::TK_BlockMappingStart:
     getNext();
     return new (NodeAllocator)
       MappingNode( stream.CurrentDoc
                  , AnchorInfo.Range.substr(1)
+                 , TagInfo.Range
                  , MappingNode::MT_Block);
   case Token::TK_FlowSequenceStart:
     getNext();
     return new (NodeAllocator)
       SequenceNode( stream.CurrentDoc
                   , AnchorInfo.Range.substr(1)
+                  , TagInfo.Range
                   , SequenceNode::ST_Flow);
   case Token::TK_FlowMappingStart:
     getNext();
     return new (NodeAllocator)
       MappingNode( stream.CurrentDoc
                  , AnchorInfo.Range.substr(1)
+                 , TagInfo.Range
                  , MappingNode::MT_Flow);
   case Token::TK_Scalar:
     getNext();
     return new (NodeAllocator)
       ScalarNode( stream.CurrentDoc
                 , AnchorInfo.Range.substr(1)
+                , TagInfo.Range
                 , T.Range);
   case Token::TK_Key:
     // Don't eat the TK_Key, KeyValueNode expects it.
     return new (NodeAllocator)
       MappingNode( stream.CurrentDoc
                  , AnchorInfo.Range.substr(1)
+                 , TagInfo.Range
                  , MappingNode::MT_Inline);
   case Token::TK_DocumentStart:
   case Token::TK_DocumentEnd:
@@ -2126,10 +2200,10 @@ bool Document::parseDirectives() {
   while (true) {
     Token T = peekNext();
     if (T.Kind == Token::TK_TagDirective) {
-      handleTagDirective(getNext());
+      parseTAGDirective();
       isDirective = true;
     } else if (T.Kind == Token::TK_VersionDirective) {
-      stream.handleYAMLDirective(getNext());
+      parseYAMLDirective();
       isDirective = true;
     } else
       break;
@@ -2137,6 +2211,21 @@ bool Document::parseDirectives() {
   return isDirective;
 }
 
+void Document::parseYAMLDirective() {
+  getNext(); // Eat %YAML <version>
+}
+
+void Document::parseTAGDirective() {
+  Token Tag = getNext(); // %TAG <handle> <prefix>
+  StringRef T = Tag.Range;
+  // Strip %TAG
+  T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
+  std::size_t HandleEnd = T.find_first_of(" \t");
+  StringRef TagHandle = T.substr(0, HandleEnd);
+  StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
+  TagMap[TagHandle] = TagPrefix;
+}
+
 bool Document::expectToken(int TK) {
   Token T = getNext();
   if (T.Kind != TK) {
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index b0cd415..42bff96 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Support/YAMLParser.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstring>
+#include <cctype>
 using namespace llvm;
 using namespace yaml;
 
@@ -40,32 +41,43 @@ void IO::setContext(void *Context) {
 //  Input
 //===----------------------------------------------------------------------===//
 
-Input::Input(StringRef InputContent, void *Ctxt) 
-  : IO(Ctxt), 
+Input::Input(StringRef InputContent,
+             void *Ctxt,
+             SourceMgr::DiagHandlerTy DiagHandler,
+             void *DiagHandlerCtxt)
+  : IO(Ctxt),
     Strm(new Stream(InputContent, SrcMgr)),
     CurrentNode(NULL) {
+  if (DiagHandler)
+    SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt);
   DocIterator = Strm->begin();
 }
 
 Input::~Input() {
-  
 }
 
 error_code Input::error() {
   return EC;
 }
 
-void Input::setDiagHandler(SourceMgr::DiagHandlerTy Handler, void *Ctxt) {
-  SrcMgr.setDiagHandler(Handler, Ctxt);
-}
+// Pin the vtables to this file.
+void Input::HNode::anchor() {}
+void Input::EmptyHNode::anchor() {}
+void Input::ScalarHNode::anchor() {}
 
-bool Input::outputting() {
+bool Input::outputting() const {
   return false;
 }
 
 bool Input::setCurrentDocument() {
   if (DocIterator != Strm->end()) {
     Node *N = DocIterator->getRoot();
+    if (!N) {
+      assert(Strm->failed() && "Root is NULL iff parsing failed");
+      EC = make_error_code(errc::invalid_argument);
+      return false;
+    }
+
     if (isa<NullNode>(N)) {
       // Empty files are allowed and ignored
       ++DocIterator;
@@ -82,10 +94,21 @@ void Input::nextDocument() {
   ++DocIterator;
 }
 
+bool Input::mapTag(StringRef Tag, bool Default) {
+  std::string foundTag = CurrentNode->_node->getVerbatimTag();
+  if (foundTag.empty()) {
+    // If no tag found and 'Tag' is the default, say it was found.
+    return Default;
+  }
+  // Return true iff found tag matches supplied tag.
+  return Tag.equals(foundTag);
+}
+
 void Input::beginMapping() {
   if (EC)
     return;
-  MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+  // CurrentNode can be null if the document is empty.
+  MapHNode *MN = dyn_cast_or_null<MapHNode>(CurrentNode);
   if (MN) {
     MN->ValidKeys.clear();
   }
@@ -96,6 +119,15 @@ bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault,
   UseDefault = false;
   if (EC)
     return false;
+
+  // CurrentNode is null for empty documents, which is an error in case required
+  // nodes are present.
+  if (!CurrentNode) {
+    if (Required)
+      EC = make_error_code(errc::invalid_argument);
+    return false;
+  }
+
   MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
   if (!MN) {
     setError(CurrentNode, "not a mapping");
@@ -122,7 +154,8 @@ void Input::postflightKey(void *saveInfo) {
 void Input::endMapping() {
   if (EC)
     return;
-  MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+  // CurrentNode can be null if the document is empty.
+  MapHNode *MN = dyn_cast_or_null<MapHNode>(CurrentNode);
   if (!MN)
     return;
   for (MapHNode::NameToNode::iterator i = MN->Mapping.begin(),
@@ -263,6 +296,7 @@ void Input::scalarString(StringRef &S) {
 }
 
 void Input::setError(HNode *hnode, const Twine &message) {
+  assert(hnode && "HNode must not be NULL");
   this->setError(hnode->_node, message);
 }
 
@@ -334,6 +368,10 @@ void Input::setError(const Twine &Message) {
   this->setError(CurrentNode, Message);
 }
 
+bool Input::canElideEmptySequence() {
+  return false;
+}
+
 Input::MapHNode::~MapHNode() {
   for (MapHNode::NameToNode::iterator i = Mapping.begin(), End = Mapping.end();
                                                                 i != End; ++i) {
@@ -368,7 +406,7 @@ Output::Output(raw_ostream &yout, void *context)
 Output::~Output() {
 }
 
-bool Output::outputting() {
+bool Output::outputting() const {
   return true;
 }
 
@@ -377,6 +415,14 @@ void Output::beginMapping() {
   NeedsNewLine = true;
 }
 
+bool Output::mapTag(StringRef Tag, bool Use) {
+  if (Use) {
+    this->output(" ");
+    this->output(Tag);
+  }
+  return Use;
+}
+
 void Output::endMapping() {
   StateStack.pop_back();
 }
@@ -505,9 +551,20 @@ void Output::endBitSetScalar() {
 }
 
 void Output::scalarString(StringRef &S) {
+  const char ScalarSafeChars[] = "abcdefghijklmnopqrstuvwxyz"
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-/^., \t";
+
   this->newLineCheck();
-  if (S.find('\n') == StringRef::npos) {
-    // No embedded new-line chars, just print string.
+  if (S.empty()) {
+    // Print '' for the empty string because leaving the field empty is not
+    // allowed.
+    this->outputUpToEndOfLine("''");
+    return;
+  }
+  if (S.find_first_not_of(ScalarSafeChars) == StringRef::npos &&
+      !isspace(S.front()) && !isspace(S.back())) {
+    // If the string consists only of safe characters, print it out without
+    // quotes.
     this->outputUpToEndOfLine(S);
     return;
   }
@@ -532,6 +589,19 @@ void Output::scalarString(StringRef &S) {
 void Output::setError(const Twine &message) {
 }
 
+bool Output::canElideEmptySequence() {
+  // Normally, with an optional key/value where the value is an empty sequence,
+  // the whole key/value can be not written.  But, that produces wrong yaml
+  // if the key/value is the only thing in the map and the map is used in
+  // a sequence.  This detects if the this sequence is the first key/value
+  // in map that itself is embedded in a sequnce.
+  if (StateStack.size() < 2)
+    return true;
+  if (StateStack.back() != inMapFirstKey)
+    return true;
+  return (StateStack[StateStack.size()-2] != inSeq);
+}
+
 void Output::output(StringRef s) {
   Column += s.size();
   Out << s;
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index 92fa8b5..cb96489 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -447,7 +447,8 @@ raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo,
   error_code EC = sys::fs::openFileForWrite(Filename, FD, Flags);
 
   if (EC) {
-    ErrorInfo = "Error opening output file '" + std::string(Filename) + "'";
+    ErrorInfo = "Error opening output file '" + std::string(Filename) + "': " +
+                EC.message();
     ShouldClose = false;
     return;
   }
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 9ad2053..431f4aa 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -557,9 +557,23 @@ Init *BitsInit::resolveReferences(Record &R, const RecordVal *RV) const {
   return const_cast<BitsInit *>(this);
 }
 
+namespace {
+  template<typename T>
+  class Pool : public T {
+  public:
+    ~Pool();
+  };
+  template<typename T>
+  Pool<T>::~Pool() {
+    for (typename T::iterator I = this->begin(), E = this->end(); I != E; ++I) {
+      typename T::value_type &Item = *I;
+      delete Item.second;
+    }
+  }
+}
+
 IntInit *IntInit::get(int64_t V) {
-  typedef DenseMap<int64_t, IntInit *> Pool;
-  static Pool ThePool;
+  static Pool<DenseMap<int64_t, IntInit *> > ThePool;
 
   IntInit *&I = ThePool[V];
   if (!I) I = new IntInit(V);
@@ -586,8 +600,7 @@ IntInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
 void StringInit::anchor() { }
 
 StringInit *StringInit::get(StringRef V) {
-  typedef StringMap<StringInit *> Pool;
-  static Pool ThePool;
+  static Pool<StringMap<StringInit *> > ThePool;
 
   StringInit *&I = ThePool[V];
   if (!I) I = new StringInit(V);
@@ -726,9 +739,7 @@ Init *OpInit::getBit(unsigned Bit) const {
 
 UnOpInit *UnOpInit::get(UnaryOp opc, Init *lhs, RecTy *Type) {
   typedef std::pair<std::pair<unsigned, Init *>, RecTy *> Key;
-
-  typedef DenseMap<Key, UnOpInit *> Pool;
-  static Pool ThePool;  
+  static Pool<DenseMap<Key, UnOpInit *> > ThePool;
 
   Key TheKey(std::make_pair(std::make_pair(opc, lhs), Type));
 
@@ -873,8 +884,7 @@ BinOpInit *BinOpInit::get(BinaryOp opc, Init *lhs,
     RecTy *
     > Key;
 
-  typedef DenseMap<Key, BinOpInit *> Pool;
-  static Pool ThePool;  
+  static Pool<DenseMap<Key, BinOpInit *> > ThePool;
 
   Key TheKey(std::make_pair(std::make_pair(std::make_pair(opc, lhs), rhs),
                             Type));
@@ -1298,8 +1308,7 @@ VarInit *VarInit::get(const std::string &VN, RecTy *T) {
 
 VarInit *VarInit::get(Init *VN, RecTy *T) {
   typedef std::pair<RecTy *, Init *> Key;
-  typedef DenseMap<Key, VarInit *> Pool;
-  static Pool ThePool;
+  static Pool<DenseMap<Key, VarInit *> > ThePool;
 
   Key TheKey(std::make_pair(T, VN));
 
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 965cd00..daac574 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -2496,6 +2496,9 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
     if (Lex.getCode() != tgtok::comma) break;
     Lex.Lex(); // eat ','.
 
+    if (Lex.getCode() != tgtok::Id)
+      return TokError("expected identifier");
+
     SubClassLoc = Lex.getLoc();
 
     // A defm can inherit from regular classes (non-multiclass) as
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e17052b..9c2c69a 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -21,8 +21,11 @@ include "llvm/Target/Target.td"
 // AArch64 Subtarget features.
 //
 
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
+  "Enable ARMv8 FP">;
+
 def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
-  "Enable Advanced SIMD instructions">;
+  "Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
 
 def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
   "Enable cryptographic instructions">;
@@ -33,7 +36,7 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
 
 include "AArch64Schedule.td"
 
-def : Processor<"generic", GenericItineraries, [FeatureNEON, FeatureCrypto]>;
+def : Processor<"generic", GenericItineraries, [FeatureFPARMv8]>;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 9498722..d59ca56 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -32,17 +32,18 @@ using namespace llvm;
 /// argument to be printed as "bN".
 static bool printModifiedFPRAsmOperand(const MachineOperand &MO,
                                        const TargetRegisterInfo *TRI,
-                                       const TargetRegisterClass &RegClass,
-                                       raw_ostream &O) {
+                                       char RegType, raw_ostream &O) {
   if (!MO.isReg())
     return true;
 
   for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
-    if (RegClass.contains(*AR)) {
-      O << AArch64InstPrinter::getRegisterName(*AR);
+    if (AArch64::FPR8RegClass.contains(*AR)) {
+      O << RegType << TRI->getEncodingValue(MO.getReg());
       return false;
     }
   }
+
+  // The register doesn't correspond to anything floating-point like.
   return true;
 }
 
@@ -81,9 +82,9 @@ bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO,
   StringRef Modifier;
   switch (MO.getType()) {
   default:
-    llvm_unreachable("Unexpected operand for symbolic address constraint");
+    return true;
   case MachineOperand::MO_GlobalAddress:
-    Name = Mang->getSymbol(MO.getGlobal())->getName();
+    Name = getSymbol(MO.getGlobal())->getName();
 
     // Global variables may be accessed either via a GOT or in various fun and
     // interesting TLS-model specific ways. Set the prefix modifier as
@@ -145,57 +146,29 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
                                         unsigned AsmVariant,
                                         const char *ExtraCode, raw_ostream &O) {
   const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
-  if (!ExtraCode || !ExtraCode[0]) {
-    // There's actually no operand modifier, which leads to a slightly eclectic
-    // set of behaviour which we have to handle here.
-    const MachineOperand &MO = MI->getOperand(OpNum);
-    switch (MO.getType()) {
-    default:
-      llvm_unreachable("Unexpected operand for inline assembly");
-    case MachineOperand::MO_Register:
-      // GCC prints the unmodified operand of a 'w' constraint as the vector
-      // register. Technically, we could allocate the argument as a VPR128, but
-      // that leads to extremely dodgy copies being generated to get the data
-      // there.
-      if (printModifiedFPRAsmOperand(MO, TRI, AArch64::VPR128RegClass, O))
-        O << AArch64InstPrinter::getRegisterName(MO.getReg());
-      break;
-    case MachineOperand::MO_Immediate:
-      O << '#' << MO.getImm();
-      break;
-    case MachineOperand::MO_FPImmediate:
-      assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected");
-      O << "#0.0";
-      break;
-    case MachineOperand::MO_BlockAddress:
-    case MachineOperand::MO_ConstantPoolIndex:
-    case MachineOperand::MO_GlobalAddress:
-    case MachineOperand::MO_ExternalSymbol:
-      return printSymbolicAddress(MO, false, "", O);
-    }
-    return false;
-  }
 
-  // We have a real modifier to handle.
+  if (!ExtraCode)
+    ExtraCode = "";
+
   switch(ExtraCode[0]) {
   default:
-    // See if this is a generic operand
-    return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
-  case 'c': // Don't print "#" before an immediate operand.
-    if (!MI->getOperand(OpNum).isImm())
-      return true;
-    O << MI->getOperand(OpNum).getImm();
-    return false;
+    if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+        return false;
+    break;
   case 'w':
     // Output 32-bit general register operand, constant zero as wzr, or stack
     // pointer as wsp. Ignored when used with other operand types.
-    return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::GPR32RegClass, O);
+    if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                    AArch64::GPR32RegClass, O))
+      return false;
+    break;
   case 'x':
     // Output 64-bit general register operand, constant zero as xzr, or stack
     // pointer as sp. Ignored when used with other operand types.
-    return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::GPR64RegClass, O);
+    if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                    AArch64::GPR64RegClass, O))
+      return false;
+    break;
   case 'H':
     // Output higher numbered of a 64-bit general register pair
   case 'Q':
@@ -211,40 +184,65 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
     // copies ...).
     llvm_unreachable("FIXME: Unimplemented register pairs");
   case 'b':
-    // Output 8-bit FP/SIMD scalar register operand, prefixed with b.
-    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::FPR8RegClass, O);
   case 'h':
-    // Output 16-bit FP/SIMD scalar register operand, prefixed with h.
-    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::FPR16RegClass, O);
   case 's':
-    // Output 32-bit FP/SIMD scalar register operand, prefixed with s.
-    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::FPR32RegClass, O);
   case 'd':
-    // Output 64-bit FP/SIMD scalar register operand, prefixed with d.
-    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::FPR64RegClass, O);
   case 'q':
-    // Output 128-bit FP/SIMD scalar register operand, prefixed with q.
-    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::FPR128RegClass, O);
+    if (!printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                    ExtraCode[0], O))
+      return false;
+    break;
   case 'A':
     // Output symbolic address with appropriate relocation modifier (also
     // suitable for ADRP).
-    return printSymbolicAddress(MI->getOperand(OpNum), false, "", O);
+    if (!printSymbolicAddress(MI->getOperand(OpNum), false, "", O))
+      return false;
+    break;
   case 'L':
     // Output bits 11:0 of symbolic address with appropriate :lo12: relocation
     // modifier.
-    return printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O);
+    if (!printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O))
+      return false;
+    break;
   case 'G':
     // Output bits 23:12 of symbolic address with appropriate :hi12: relocation
     // modifier (currently only for TLS local exec).
-    return printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O);
+    if (!printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O))
+      return false;
+    break;
+  case 'a':
+    return PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
   }
 
+  // There's actually no operand modifier, which leads to a slightly eclectic
+  // set of behaviour which we have to handle here.
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("Unexpected operand for inline assembly");
+  case MachineOperand::MO_Register:
+    // GCC prints the unmodified operand of a 'w' constraint as the vector
+    // register. Technically, we could allocate the argument as a VPR128, but
+    // that leads to extremely dodgy copies being generated to get the data
+    // there.
+    if (printModifiedFPRAsmOperand(MO, TRI, 'v', O))
+      O << AArch64InstPrinter::getRegisterName(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    O << '#' << MO.getImm();
+    break;
+  case MachineOperand::MO_FPImmediate:
+    assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected");
+    O << "#0.0";
+    break;
+  case MachineOperand::MO_BlockAddress:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+    return printSymbolicAddress(MO, false, "", O);
+  }
 
+  return false;
 }
 
 bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td
index bff7eeb..a2a9f3f 100644
--- a/lib/Target/AArch64/AArch64CallingConv.td
+++ b/lib/Target/AArch64/AArch64CallingConv.td
@@ -59,9 +59,9 @@ def CC_A64_APCS : CallingConv<[
   // Canonicalise the various types that live in different floating-point
   // registers. This makes sense because the PCS does not distinguish Short
   // Vectors and Floating-point types.
-  CCIfType<[v2i8], CCBitConvertToType<f16>>,
-  CCIfType<[v4i8, v2i16], CCBitConvertToType<f32>>,
-  CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64], CCBitConvertToType<f64>>,
+  CCIfType<[v1i16, v2i8], CCBitConvertToType<f16>>,
+  CCIfType<[v1i32, v4i8, v2i16, v1f32], CCBitConvertToType<f32>>,
+  CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64, v1f64], CCBitConvertToType<f64>>,
   CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
            CCBitConvertToType<f128>>,
 
@@ -70,7 +70,8 @@ def CC_A64_APCS : CallingConv<[
   // argument is allocated to the least significant bits of register
   // v[NSRN]. The NSRN is incremented by one. The argument has now been
   // allocated."
-  CCIfType<[f16],  CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>,
+  CCIfType<[v1i8], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>,
+  CCIfType<[f16],  CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
   CCIfType<[f32],  CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>,
   CCIfType<[f64],  CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
   CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ee819e0..ef99541 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -109,6 +109,45 @@ public:
 
   SDNode* Select(SDNode*);
 private:
+  /// Get the opcode for table lookup instruction
+  unsigned getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec);
+
+  /// Select NEON table lookup intrinsics.  NumVecs should be 1, 2, 3 or 4.
+  /// IsExt is to indicate if the result will be extended with an argument.
+  SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt);
+
+  /// Select NEON load intrinsics.  NumVecs should be 1, 2, 3 or 4.
+  SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    const uint16_t *Opcode);
+
+  /// Select NEON store intrinsics.  NumVecs should be 1, 2, 3 or 4.
+  SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    const uint16_t *Opcodes);
+
+  /// Form sequences of consecutive 64/128-bit registers for use in NEON
+  /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
+  /// between 1 and 4 elements. If it contains a single element that is returned
+  /// unchanged; otherwise a REG_SEQUENCE value is returned.
+  SDValue createDTuple(ArrayRef<SDValue> Vecs);
+  SDValue createQTuple(ArrayRef<SDValue> Vecs);
+
+  /// Generic helper for the createDTuple/createQTuple
+  /// functions. Those should almost always be called instead.
+  SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
+                      unsigned SubRegs[]);
+
+  /// Select NEON load-duplicate intrinsics.  NumVecs should be 2, 3 or 4.
+  /// The opcode array specifies the instructions used for load.
+  SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+                       const uint16_t *Opcodes);
+
+  /// Select NEON load/store lane intrinsics.  NumVecs should be 2, 3 or 4.
+  /// The opcode arrays specify the instructions used for load/store.
+  SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+                          unsigned NumVecs, const uint16_t *Opcodes);
+
+  SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
+                               SDValue Operand);
 };
 }
 
@@ -390,12 +429,607 @@ SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
                               &Ops[0], Ops.size());
 }
 
+SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
+  static unsigned RegClassIDs[] = { AArch64::DPairRegClassID,
+                                    AArch64::DTripleRegClassID,
+                                    AArch64::DQuadRegClassID };
+  static unsigned SubRegs[] = { AArch64::dsub_0, AArch64::dsub_1,
+                                AArch64::dsub_2, AArch64::dsub_3 };
+
+  return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
+  static unsigned RegClassIDs[] = { AArch64::QPairRegClassID,
+                                    AArch64::QTripleRegClassID,
+                                    AArch64::QQuadRegClassID };
+  static unsigned SubRegs[] = { AArch64::qsub_0, AArch64::qsub_1,
+                                AArch64::qsub_2, AArch64::qsub_3 };
+
+  return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
+                                         unsigned RegClassIDs[],
+                                         unsigned SubRegs[]) {
+  // There's no special register-class for a vector-list of 1 element: it's just
+  // a vector.
+  if (Regs.size() == 1)
+    return Regs[0];
+
+  assert(Regs.size() >= 2 && Regs.size() <= 4);
+
+  SDLoc DL(Regs[0].getNode());
+
+  SmallVector<SDValue, 4> Ops;
+
+  // First operand of REG_SEQUENCE is the desired RegClass.
+  Ops.push_back(
+      CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32));
+
+  // Then we get pairs of source & subregister-position for the components.
+  for (unsigned i = 0; i < Regs.size(); ++i) {
+    Ops.push_back(Regs[i]);
+    Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32));
+  }
+
+  SDNode *N =
+      CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
+  return SDValue(N, 0);
+}
+
+
+// Get the register stride update opcode of a VLD/VST instruction that
+// is otherwise equivalent to the given fixed stride updating instruction.
+static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
+  switch (Opc) {
+  default: break;
+  case AArch64::LD1WB_8B_fixed: return AArch64::LD1WB_8B_register;
+  case AArch64::LD1WB_4H_fixed: return AArch64::LD1WB_4H_register;
+  case AArch64::LD1WB_2S_fixed: return AArch64::LD1WB_2S_register;
+  case AArch64::LD1WB_1D_fixed: return AArch64::LD1WB_1D_register;
+  case AArch64::LD1WB_16B_fixed: return AArch64::LD1WB_16B_register;
+  case AArch64::LD1WB_8H_fixed: return AArch64::LD1WB_8H_register;
+  case AArch64::LD1WB_4S_fixed: return AArch64::LD1WB_4S_register;
+  case AArch64::LD1WB_2D_fixed: return AArch64::LD1WB_2D_register;
+
+  case AArch64::LD2WB_8B_fixed: return AArch64::LD2WB_8B_register;
+  case AArch64::LD2WB_4H_fixed: return AArch64::LD2WB_4H_register;
+  case AArch64::LD2WB_2S_fixed: return AArch64::LD2WB_2S_register;
+  case AArch64::LD2WB_16B_fixed: return AArch64::LD2WB_16B_register;
+  case AArch64::LD2WB_8H_fixed: return AArch64::LD2WB_8H_register;
+  case AArch64::LD2WB_4S_fixed: return AArch64::LD2WB_4S_register;
+  case AArch64::LD2WB_2D_fixed: return AArch64::LD2WB_2D_register;
+
+  case AArch64::LD3WB_8B_fixed: return AArch64::LD3WB_8B_register;
+  case AArch64::LD3WB_4H_fixed: return AArch64::LD3WB_4H_register;
+  case AArch64::LD3WB_2S_fixed: return AArch64::LD3WB_2S_register;
+  case AArch64::LD3WB_16B_fixed: return AArch64::LD3WB_16B_register;
+  case AArch64::LD3WB_8H_fixed: return AArch64::LD3WB_8H_register;
+  case AArch64::LD3WB_4S_fixed: return AArch64::LD3WB_4S_register;
+  case AArch64::LD3WB_2D_fixed: return AArch64::LD3WB_2D_register;
+
+  case AArch64::LD4WB_8B_fixed: return AArch64::LD4WB_8B_register;
+  case AArch64::LD4WB_4H_fixed: return AArch64::LD4WB_4H_register;
+  case AArch64::LD4WB_2S_fixed: return AArch64::LD4WB_2S_register;
+  case AArch64::LD4WB_16B_fixed: return AArch64::LD4WB_16B_register;
+  case AArch64::LD4WB_8H_fixed: return AArch64::LD4WB_8H_register;
+  case AArch64::LD4WB_4S_fixed: return AArch64::LD4WB_4S_register;
+  case AArch64::LD4WB_2D_fixed: return AArch64::LD4WB_2D_register;
+
+  case AArch64::LD1x2WB_8B_fixed: return AArch64::LD1x2WB_8B_register;
+  case AArch64::LD1x2WB_4H_fixed: return AArch64::LD1x2WB_4H_register;
+  case AArch64::LD1x2WB_2S_fixed: return AArch64::LD1x2WB_2S_register;
+  case AArch64::LD1x2WB_1D_fixed: return AArch64::LD1x2WB_1D_register;
+  case AArch64::LD1x2WB_16B_fixed: return AArch64::LD1x2WB_16B_register;
+  case AArch64::LD1x2WB_8H_fixed: return AArch64::LD1x2WB_8H_register;
+  case AArch64::LD1x2WB_4S_fixed: return AArch64::LD1x2WB_4S_register;
+  case AArch64::LD1x2WB_2D_fixed: return AArch64::LD1x2WB_2D_register;
+
+  case AArch64::LD1x3WB_8B_fixed: return AArch64::LD1x3WB_8B_register;
+  case AArch64::LD1x3WB_4H_fixed: return AArch64::LD1x3WB_4H_register;
+  case AArch64::LD1x3WB_2S_fixed: return AArch64::LD1x3WB_2S_register;
+  case AArch64::LD1x3WB_1D_fixed: return AArch64::LD1x3WB_1D_register;
+  case AArch64::LD1x3WB_16B_fixed: return AArch64::LD1x3WB_16B_register;
+  case AArch64::LD1x3WB_8H_fixed: return AArch64::LD1x3WB_8H_register;
+  case AArch64::LD1x3WB_4S_fixed: return AArch64::LD1x3WB_4S_register;
+  case AArch64::LD1x3WB_2D_fixed: return AArch64::LD1x3WB_2D_register;
+
+  case AArch64::LD1x4WB_8B_fixed: return AArch64::LD1x4WB_8B_register;
+  case AArch64::LD1x4WB_4H_fixed: return AArch64::LD1x4WB_4H_register;
+  case AArch64::LD1x4WB_2S_fixed: return AArch64::LD1x4WB_2S_register;
+  case AArch64::LD1x4WB_1D_fixed: return AArch64::LD1x4WB_1D_register;
+  case AArch64::LD1x4WB_16B_fixed: return AArch64::LD1x4WB_16B_register;
+  case AArch64::LD1x4WB_8H_fixed: return AArch64::LD1x4WB_8H_register;
+  case AArch64::LD1x4WB_4S_fixed: return AArch64::LD1x4WB_4S_register;
+  case AArch64::LD1x4WB_2D_fixed: return AArch64::LD1x4WB_2D_register;
+
+  case AArch64::ST1WB_8B_fixed: return AArch64::ST1WB_8B_register;
+  case AArch64::ST1WB_4H_fixed: return AArch64::ST1WB_4H_register;
+  case AArch64::ST1WB_2S_fixed: return AArch64::ST1WB_2S_register;
+  case AArch64::ST1WB_1D_fixed: return AArch64::ST1WB_1D_register;
+  case AArch64::ST1WB_16B_fixed: return AArch64::ST1WB_16B_register;
+  case AArch64::ST1WB_8H_fixed: return AArch64::ST1WB_8H_register;
+  case AArch64::ST1WB_4S_fixed: return AArch64::ST1WB_4S_register;
+  case AArch64::ST1WB_2D_fixed: return AArch64::ST1WB_2D_register;
+
+  case AArch64::ST2WB_8B_fixed: return AArch64::ST2WB_8B_register;
+  case AArch64::ST2WB_4H_fixed: return AArch64::ST2WB_4H_register;
+  case AArch64::ST2WB_2S_fixed: return AArch64::ST2WB_2S_register;
+  case AArch64::ST2WB_16B_fixed: return AArch64::ST2WB_16B_register;
+  case AArch64::ST2WB_8H_fixed: return AArch64::ST2WB_8H_register;
+  case AArch64::ST2WB_4S_fixed: return AArch64::ST2WB_4S_register;
+  case AArch64::ST2WB_2D_fixed: return AArch64::ST2WB_2D_register;
+
+  case AArch64::ST3WB_8B_fixed: return AArch64::ST3WB_8B_register;
+  case AArch64::ST3WB_4H_fixed: return AArch64::ST3WB_4H_register;
+  case AArch64::ST3WB_2S_fixed: return AArch64::ST3WB_2S_register;
+  case AArch64::ST3WB_16B_fixed: return AArch64::ST3WB_16B_register;
+  case AArch64::ST3WB_8H_fixed: return AArch64::ST3WB_8H_register;
+  case AArch64::ST3WB_4S_fixed: return AArch64::ST3WB_4S_register;
+  case AArch64::ST3WB_2D_fixed: return AArch64::ST3WB_2D_register;
+
+  case AArch64::ST4WB_8B_fixed: return AArch64::ST4WB_8B_register;
+  case AArch64::ST4WB_4H_fixed: return AArch64::ST4WB_4H_register;
+  case AArch64::ST4WB_2S_fixed: return AArch64::ST4WB_2S_register;
+  case AArch64::ST4WB_16B_fixed: return AArch64::ST4WB_16B_register;
+  case AArch64::ST4WB_8H_fixed: return AArch64::ST4WB_8H_register;
+  case AArch64::ST4WB_4S_fixed: return AArch64::ST4WB_4S_register;
+  case AArch64::ST4WB_2D_fixed: return AArch64::ST4WB_2D_register;
+
+  case AArch64::ST1x2WB_8B_fixed: return AArch64::ST1x2WB_8B_register;
+  case AArch64::ST1x2WB_4H_fixed: return AArch64::ST1x2WB_4H_register;
+  case AArch64::ST1x2WB_2S_fixed: return AArch64::ST1x2WB_2S_register;
+  case AArch64::ST1x2WB_1D_fixed: return AArch64::ST1x2WB_1D_register;
+  case AArch64::ST1x2WB_16B_fixed: return AArch64::ST1x2WB_16B_register;
+  case AArch64::ST1x2WB_8H_fixed: return AArch64::ST1x2WB_8H_register;
+  case AArch64::ST1x2WB_4S_fixed: return AArch64::ST1x2WB_4S_register;
+  case AArch64::ST1x2WB_2D_fixed: return AArch64::ST1x2WB_2D_register;
+
+  case AArch64::ST1x3WB_8B_fixed: return AArch64::ST1x3WB_8B_register;
+  case AArch64::ST1x3WB_4H_fixed: return AArch64::ST1x3WB_4H_register;
+  case AArch64::ST1x3WB_2S_fixed: return AArch64::ST1x3WB_2S_register;
+  case AArch64::ST1x3WB_1D_fixed: return AArch64::ST1x3WB_1D_register;
+  case AArch64::ST1x3WB_16B_fixed: return AArch64::ST1x3WB_16B_register;
+  case AArch64::ST1x3WB_8H_fixed: return AArch64::ST1x3WB_8H_register;
+  case AArch64::ST1x3WB_4S_fixed: return AArch64::ST1x3WB_4S_register;
+  case AArch64::ST1x3WB_2D_fixed: return AArch64::ST1x3WB_2D_register;
+
+  case AArch64::ST1x4WB_8B_fixed: return AArch64::ST1x4WB_8B_register;
+  case AArch64::ST1x4WB_4H_fixed: return AArch64::ST1x4WB_4H_register;
+  case AArch64::ST1x4WB_2S_fixed: return AArch64::ST1x4WB_2S_register;
+  case AArch64::ST1x4WB_1D_fixed: return AArch64::ST1x4WB_1D_register;
+  case AArch64::ST1x4WB_16B_fixed: return AArch64::ST1x4WB_16B_register;
+  case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register;
+  case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register;
+  case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register;
+
+  // Post-index of duplicate loads
+  case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register;
+  case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register;
+  case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register;
+  case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register;
+  case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register;
+  case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register;
+  case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register;
+  case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register;
+
+  case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register;
+  case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register;
+  case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register;
+  case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register;
+  case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register;
+  case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register;
+  case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register;
+  case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register;
+
+  case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register;
+  case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register;
+  case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register;
+  case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register;
+  case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register;
+  case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register;
+  case AArch64::LD4R_WB_4S_fixed: return AArch64::LD4R_WB_4S_register;
+  case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register;
+
+  // Post-index of lane loads
+  case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register;
+  case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register;
+  case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register;
+  case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register;
+
+  case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register;
+  case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register;
+  case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register;
+  case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register;
+
+  case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register;
+  case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register;
+  case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register;
+  case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register;
+
+  // Post-index of lane stores
+  case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register;
+  case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register;
+  case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register;
+  case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register;
+
+  case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register;
+  case AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register;
+  case AArch64::ST3LN_WB_S_fixed: return AArch64::ST3LN_WB_S_register;
+  case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register;
+
+  case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register;
+  case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register;
+  case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register;
+  case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register;
+  }
+  return Opc; // If not one we handle, return it unchanged.
+}
+
+SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating,
+                                       unsigned NumVecs,
+                                       const uint16_t *Opcodes) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+
+  EVT VT = N->getValueType(0);
+  unsigned OpcodeIndex;
+  bool is64BitVector = VT.is64BitVector();
+  switch (VT.getScalarType().getSizeInBits()) {
+  case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+  case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+  case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+  case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
+  default: llvm_unreachable("unhandled vector load type");
+  }
+  unsigned Opc = Opcodes[OpcodeIndex];
+
+  SmallVector<SDValue, 2> Ops;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
+    Ops.push_back(Inc);
+  }
+
+  Ops.push_back(N->getOperand(0)); // Push back the Chain
+
+  SmallVector<EVT, 3> ResTys;
+  // Push back the type of return super register
+  if (NumVecs == 1)
+    ResTys.push_back(VT);
+  else if (NumVecs == 3)
+    ResTys.push_back(MVT::Untyped);
+  else {
+    EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+                                 is64BitVector ? NumVecs : NumVecs * 2);
+    ResTys.push_back(ResTy);
+  }
+
+  if (isUpdating)
+    ResTys.push_back(MVT::i64); // Type of the updated register
+  ResTys.push_back(MVT::Other); // Type of the Chain
+  SDLoc dl(N);
+  SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+
+  if (NumVecs == 1)
+    return VLd;
+
+  // If NumVecs > 1, the return result is a super register containing 2-4
+  // consecutive vector registers.
+  SDValue SuperReg = SDValue(VLd, 0);
+
+  unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  // Update users of the Chain
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
+
+  return NULL;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating,
+                                       unsigned NumVecs,
+                                       const uint16_t *Opcodes) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+  SDLoc dl(N);
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3;
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
+  unsigned OpcodeIndex;
+  bool is64BitVector = VT.is64BitVector();
+  switch (VT.getScalarType().getSizeInBits()) {
+  case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+  case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+  case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+  case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
+  default: llvm_unreachable("unhandled vector store type");
+  }
+  unsigned Opc = Opcodes[OpcodeIndex];
+
+  SmallVector<EVT, 2> ResTys;
+  if (isUpdating)
+    ResTys.push_back(MVT::i64);
+  ResTys.push_back(MVT::Other); // Type for the Chain
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
+    Ops.push_back(Inc);
+  }
+
+  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+                               N->op_begin() + Vec0Idx + NumVecs);
+  SDValue SrcReg = is64BitVector ? createDTuple(Regs) : createQTuple(Regs);
+  Ops.push_back(SrcReg);
+
+  // Push back the Chain
+  Ops.push_back(N->getOperand(0));
+
+  // Transfer memoperands.
+  SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+
+  return VSt;
+}
+
+SDValue
+AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
+                                          SDValue Operand) {
+  SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL,
+                        VT, VTD, MVT::Other,
+                        CurDAG->getTargetConstant(0, MVT::i64),
+                        Operand,
+                        CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32));
+  return SDValue(Reg, 0);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
+                                          unsigned NumVecs,
+                                          const uint16_t *Opcodes) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range");
+  SDLoc dl(N);
+
+  EVT VT = N->getValueType(0);
+  unsigned OpcodeIndex;
+  bool is64BitVector = VT.is64BitVector();
+  switch (VT.getScalarType().getSizeInBits()) {
+  case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+  case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+  case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+  case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
+  default: llvm_unreachable("unhandled vector duplicate lane load type");
+  }
+  unsigned Opc = Opcodes[OpcodeIndex];
+
+  SDValue SuperReg;
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(N->getOperand(1)); // Push back the Memory Address
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(2);
+    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
+    Ops.push_back(Inc);
+  }
+  Ops.push_back(N->getOperand(0)); // Push back the Chain
+
+  SmallVector<EVT, 3> ResTys;
+  // Push back the type of return super register
+  if (NumVecs == 3)
+    ResTys.push_back(MVT::Untyped);
+  else {
+    EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+                                 is64BitVector ? NumVecs : NumVecs * 2);
+    ResTys.push_back(ResTy);
+  }
+  if (isUpdating)
+    ResTys.push_back(MVT::i64); // Type of the updated register
+  ResTys.push_back(MVT::Other); // Type of the Chain
+  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+
+  SuperReg = SDValue(VLdDup, 0);
+  unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
+  // Update uses of each registers in super register
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  // Update uses of the Chain
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
+  return NULL;
+}
+
+// We only have 128-bit vector type of load/store lane instructions.
+// If it is 64-bit vector, we also select it to the 128-bit instructions.
+// Just use SUBREG_TO_REG to adapt the input to 128-bit vector and
+// EXTRACT_SUBREG to get the 64-bit vector from the 128-bit vector output.
+SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
+                                             bool isUpdating, unsigned NumVecs,
+                                             const uint16_t *Opcodes) {
+  assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+  SDLoc dl(N);
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3;
+
+  SDValue Chain = N->getOperand(0);
+  unsigned Lane =
+      cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
+  bool is64BitVector = VT.is64BitVector();
+  EVT VT64; // 64-bit Vector Type
+
+  if (is64BitVector) {
+    VT64 = VT;
+    VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(),
+                          VT.getVectorNumElements() * 2);
+  }
+
+  unsigned OpcodeIndex;
+  switch (VT.getScalarType().getSizeInBits()) {
+  case 8: OpcodeIndex = 0; break;
+  case 16: OpcodeIndex = 1; break;
+  case 32: OpcodeIndex = 2; break;
+  case 64: OpcodeIndex = 3; break;
+  default: llvm_unreachable("unhandled vector lane load/store type");
+  }
+  unsigned Opc = Opcodes[OpcodeIndex];
+
+  SmallVector<EVT, 3> ResTys;
+  if (IsLoad) {
+    // Push back the type of return super register
+    if (NumVecs == 3)
+      ResTys.push_back(MVT::Untyped);
+    else {
+      EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+                                   is64BitVector ? NumVecs : NumVecs * 2);
+      ResTys.push_back(ResTy);
+    }
+  }
+  if (isUpdating)
+    ResTys.push_back(MVT::i64); // Type of the updated register
+  ResTys.push_back(MVT::Other); // Type of Chain
+  SmallVector<SDValue, 5> Ops;
+  Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
+    Ops.push_back(Inc);
+  }
+
+  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+                               N->op_begin() + Vec0Idx + NumVecs);
+  if (is64BitVector)
+    for (unsigned i = 0; i < Regs.size(); i++)
+      Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]);
+  SDValue SuperReg = createQTuple(Regs);
+
+  Ops.push_back(SuperReg); // Source Reg
+  SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32);
+  Ops.push_back(LaneValue);
+  Ops.push_back(Chain); // Push back the Chain
+
+  SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
+  if (!IsLoad)
+    return VLdLn;
+
+  // Extract the subregisters.
+  SuperReg = SDValue(VLdLn, 0);
+  unsigned Sub0 = AArch64::qsub_0;
+  // Update uses of each registers in super register
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+    SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg);
+    if (is64BitVector) {
+      SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0);
+    }
+    ReplaceUses(SDValue(N, Vec), SUB0);
+  }
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
+  return NULL;
+}
+
+unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit,
+                                        unsigned NumOfVec) {
+  assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range");
+
+  unsigned Opc = 0;
+  switch (NumOfVec) {
+  default:
+    break;
+  case 1:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX1_8b : AArch64::TBX1_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL1_8b : AArch64::TBL1_16b;
+    break;
+  case 2:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX2_8b : AArch64::TBX2_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL2_8b : AArch64::TBL2_16b;
+    break;
+  case 3:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX3_8b : AArch64::TBX3_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL3_8b : AArch64::TBL3_16b;
+    break;
+  case 4:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX4_8b : AArch64::TBX4_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL4_8b : AArch64::TBL4_16b;
+    break;
+  }
+
+  return Opc;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectVTBL(SDNode *N, unsigned NumVecs,
+                                        bool IsExt) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+  SDLoc dl(N);
+
+  // Check the element of look up table is 64-bit or not
+  unsigned Vec0Idx = IsExt ? 2 : 1;
+  assert(!N->getOperand(Vec0Idx + 0).getValueType().is64BitVector() &&
+         "The element of lookup table for vtbl and vtbx must be 128-bit");
+
+  // Check the return value type is 64-bit or not
+  EVT ResVT = N->getValueType(0);
+  bool is64BitRes = ResVT.is64BitVector();
+
+  // Create new SDValue for vector list
+  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+                               N->op_begin() + Vec0Idx + NumVecs);
+  SDValue TblReg = createQTuple(Regs);
+  unsigned Opc = getTBLOpc(IsExt, is64BitRes, NumVecs);
+
+  SmallVector<SDValue, 3> Ops;
+  if (IsExt)
+    Ops.push_back(N->getOperand(1));
+  Ops.push_back(TblReg);
+  Ops.push_back(N->getOperand(Vec0Idx + NumVecs));
+  return CurDAG->getMachineNode(Opc, dl, ResVT, Ops);
+}
+
 SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
   // Dump information about the Node being selected
   DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n");
 
   if (Node->isMachineOpcode()) {
     DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
+    Node->setNodeId(-1);
     return NULL;
   }
 
@@ -535,6 +1169,399 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
     Node = ResNode;
     break;
   }
+  case AArch64ISD::NEON_LD1_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD1WB_8B_fixed,  AArch64::LD1WB_4H_fixed,
+      AArch64::LD1WB_2S_fixed,  AArch64::LD1WB_1D_fixed,
+      AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed,
+      AArch64::LD1WB_4S_fixed,  AArch64::LD1WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 1, Opcodes);
+  }
+  case AArch64ISD::NEON_LD2_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD2WB_8B_fixed,  AArch64::LD2WB_4H_fixed,
+      AArch64::LD2WB_2S_fixed,  AArch64::LD1x2WB_1D_fixed,
+      AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed,
+      AArch64::LD2WB_4S_fixed,  AArch64::LD2WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_LD3_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD3WB_8B_fixed,  AArch64::LD3WB_4H_fixed,
+      AArch64::LD3WB_2S_fixed,  AArch64::LD1x3WB_1D_fixed,
+      AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed,
+      AArch64::LD3WB_4S_fixed,  AArch64::LD3WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_LD4_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD4WB_8B_fixed,  AArch64::LD4WB_4H_fixed,
+      AArch64::LD4WB_2S_fixed,  AArch64::LD1x4WB_1D_fixed,
+      AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed,
+      AArch64::LD4WB_4S_fixed,  AArch64::LD4WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_LD1x2_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD1x2WB_8B_fixed,  AArch64::LD1x2WB_4H_fixed,
+      AArch64::LD1x2WB_2S_fixed,  AArch64::LD1x2WB_1D_fixed,
+      AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed,
+      AArch64::LD1x2WB_4S_fixed,  AArch64::LD1x2WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_LD1x3_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD1x3WB_8B_fixed,  AArch64::LD1x3WB_4H_fixed,
+      AArch64::LD1x3WB_2S_fixed,  AArch64::LD1x3WB_1D_fixed,
+      AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed,
+      AArch64::LD1x3WB_4S_fixed,  AArch64::LD1x3WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_LD1x4_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD1x4WB_8B_fixed,  AArch64::LD1x4WB_4H_fixed,
+      AArch64::LD1x4WB_2S_fixed,  AArch64::LD1x4WB_1D_fixed,
+      AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed,
+      AArch64::LD1x4WB_4S_fixed,  AArch64::LD1x4WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_ST1_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST1WB_8B_fixed,  AArch64::ST1WB_4H_fixed,
+      AArch64::ST1WB_2S_fixed,  AArch64::ST1WB_1D_fixed,
+      AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed,
+      AArch64::ST1WB_4S_fixed,  AArch64::ST1WB_2D_fixed
+    };
+    return SelectVST(Node, true, 1, Opcodes);
+  }
+  case AArch64ISD::NEON_ST2_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST2WB_8B_fixed,  AArch64::ST2WB_4H_fixed,
+      AArch64::ST2WB_2S_fixed,  AArch64::ST1x2WB_1D_fixed,
+      AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed,
+      AArch64::ST2WB_4S_fixed,  AArch64::ST2WB_2D_fixed
+    };
+    return SelectVST(Node, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_ST3_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST3WB_8B_fixed,  AArch64::ST3WB_4H_fixed,
+      AArch64::ST3WB_2S_fixed,  AArch64::ST1x3WB_1D_fixed,
+      AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed,
+      AArch64::ST3WB_4S_fixed,  AArch64::ST3WB_2D_fixed
+    };
+    return SelectVST(Node, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_ST4_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST4WB_8B_fixed,  AArch64::ST4WB_4H_fixed,
+      AArch64::ST4WB_2S_fixed,  AArch64::ST1x4WB_1D_fixed,
+      AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed,
+      AArch64::ST4WB_4S_fixed,  AArch64::ST4WB_2D_fixed
+    };
+    return SelectVST(Node, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_LD2DUP: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S,
+        AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H,
+        AArch64::LD2R_4S, AArch64::LD2R_2D
+    };
+    return SelectVLDDup(Node, false, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_LD3DUP: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S,
+        AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H,
+        AArch64::LD3R_4S, AArch64::LD3R_2D
+    };
+    return SelectVLDDup(Node, false, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_LD4DUP: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S,
+        AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H,
+        AArch64::LD4R_4S, AArch64::LD4R_2D
+    };
+    return SelectVLDDup(Node, false, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_LD2DUP_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD2R_WB_8B_fixed,  AArch64::LD2R_WB_4H_fixed,
+      AArch64::LD2R_WB_2S_fixed,  AArch64::LD2R_WB_1D_fixed,
+      AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed,
+      AArch64::LD2R_WB_4S_fixed,  AArch64::LD2R_WB_2D_fixed
+    };
+    return SelectVLDDup(Node, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_LD3DUP_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD3R_WB_8B_fixed,  AArch64::LD3R_WB_4H_fixed,
+      AArch64::LD3R_WB_2S_fixed,  AArch64::LD3R_WB_1D_fixed,
+      AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed,
+      AArch64::LD3R_WB_4S_fixed,  AArch64::LD3R_WB_2D_fixed
+    };
+    return SelectVLDDup(Node, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_LD4DUP_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD4R_WB_8B_fixed,  AArch64::LD4R_WB_4H_fixed,
+      AArch64::LD4R_WB_2S_fixed,  AArch64::LD4R_WB_1D_fixed,
+      AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed,
+      AArch64::LD4R_WB_4S_fixed,  AArch64::LD4R_WB_2D_fixed
+    };
+    return SelectVLDDup(Node, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_LD2LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed,
+        AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, true, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_LD3LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed,
+        AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, true, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_LD4LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed,
+        AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, true, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_ST2LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed,
+        AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, false, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_ST3LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed,
+        AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, false, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_ST4LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed,
+        AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, false, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_ST1x2_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST1x2WB_8B_fixed,  AArch64::ST1x2WB_4H_fixed,
+      AArch64::ST1x2WB_2S_fixed,  AArch64::ST1x2WB_1D_fixed,
+      AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed,
+      AArch64::ST1x2WB_4S_fixed,  AArch64::ST1x2WB_2D_fixed
+    };
+    return SelectVST(Node, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_ST1x3_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST1x3WB_8B_fixed,  AArch64::ST1x3WB_4H_fixed,
+      AArch64::ST1x3WB_2S_fixed,  AArch64::ST1x3WB_1D_fixed,
+      AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed,
+      AArch64::ST1x3WB_4S_fixed,  AArch64::ST1x3WB_2D_fixed
+    };
+    return SelectVST(Node, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_ST1x4_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST1x4WB_8B_fixed,  AArch64::ST1x4WB_4H_fixed,
+      AArch64::ST1x4WB_2S_fixed,  AArch64::ST1x4WB_1D_fixed,
+      AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed,
+      AArch64::ST1x4WB_4S_fixed,  AArch64::ST1x4WB_2D_fixed
+    };
+    return SelectVST(Node, true, 4, Opcodes);
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+    bool IsExt = false;
+    switch (IntNo) {
+      default:
+        break;
+      case Intrinsic::aarch64_neon_vtbx1:
+        IsExt = true;
+      case Intrinsic::aarch64_neon_vtbl1:
+        return SelectVTBL(Node, 1, IsExt);
+      case Intrinsic::aarch64_neon_vtbx2:
+        IsExt = true;
+      case Intrinsic::aarch64_neon_vtbl2:
+        return SelectVTBL(Node, 2, IsExt);
+      case Intrinsic::aarch64_neon_vtbx3:
+        IsExt = true;
+      case Intrinsic::aarch64_neon_vtbl3:
+        return SelectVTBL(Node, 3, IsExt);
+      case Intrinsic::aarch64_neon_vtbx4:
+        IsExt = true;
+      case Intrinsic::aarch64_neon_vtbl4:
+        return SelectVTBL(Node, 4, IsExt);
+    }
+    break;
+  }
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::arm_neon_vld1: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD1_8B,  AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D,
+          AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D
+      };
+      return SelectVLD(Node, false, 1, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld2: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD2_8B,  AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D,
+          AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D
+      };
+      return SelectVLD(Node, false, 2, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld3: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD3_8B,  AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D,
+          AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D
+      };
+      return SelectVLD(Node, false, 3, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld4: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD4_8B,  AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D,
+          AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D
+      };
+      return SelectVLD(Node, false, 4, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vld1x2: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD1x2_8B, AArch64::LD1x2_4H,  AArch64::LD1x2_2S,
+          AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H,
+          AArch64::LD1x2_4S, AArch64::LD1x2_2D
+      };
+      return SelectVLD(Node, false, 2, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vld1x3: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD1x3_8B, AArch64::LD1x3_4H,  AArch64::LD1x3_2S,
+          AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H,
+          AArch64::LD1x3_4S, AArch64::LD1x3_2D
+      };
+      return SelectVLD(Node, false, 3, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vld1x4: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD1x4_8B, AArch64::LD1x4_4H,  AArch64::LD1x4_2S,
+          AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H,
+          AArch64::LD1x4_4S, AArch64::LD1x4_2D
+      };
+      return SelectVLD(Node, false, 4, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst1: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST1_8B,  AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D,
+          AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D
+      };
+      return SelectVST(Node, false, 1, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst2: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST2_8B,  AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D,
+          AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D
+      };
+      return SelectVST(Node, false, 2, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst3: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST3_8B,  AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D,
+          AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D
+      };
+      return SelectVST(Node, false, 3, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst4: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST4_8B,  AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D,
+          AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D
+      };
+      return SelectVST(Node, false, 4, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vst1x2: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST1x2_8B, AArch64::ST1x2_4H,  AArch64::ST1x2_2S,
+          AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H,
+          AArch64::ST1x2_4S, AArch64::ST1x2_2D
+      };
+      return SelectVST(Node, false, 2, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vst1x3: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST1x3_8B, AArch64::ST1x3_4H,  AArch64::ST1x3_2S,
+          AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H,
+          AArch64::ST1x3_4S, AArch64::ST1x3_2D
+      };
+      return SelectVST(Node, false, 3, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vst1x4: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST1x4_8B, AArch64::ST1x4_4H,  AArch64::ST1x4_2S,
+          AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H,
+          AArch64::ST1x4_4S, AArch64::ST1x4_2D
+      };
+      return SelectVST(Node, false, 4, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld2lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D
+      };
+      return SelectVLDSTLane(Node, true, false, 2, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld3lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D
+      };
+      return SelectVLDSTLane(Node, true, false, 3, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld4lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D
+      };
+      return SelectVLDSTLane(Node, true, false, 4, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst2lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D
+      };
+      return SelectVLDSTLane(Node, false, false, 2, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst3lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST3LN_B, AArch64::ST3LN_H, AArch64::ST3LN_S, AArch64::ST3LN_D
+      };
+      return SelectVLDSTLane(Node, false, false, 3, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst4lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D
+      };
+      return SelectVLDSTLane(Node, false, false, 4, Opcodes);
+    }
+    } // End of switch IntNo
+    break;
+  } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN
   default:
     break; // Let generic code handle it
   }
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 44b691b..4fdb667 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -50,24 +50,33 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   // Scalar register <-> type mapping
   addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
   addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
-  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
-  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
-  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
-  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+
+  if (Subtarget->hasFPARMv8()) {
+    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
+    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+  }
 
   if (Subtarget->hasNEON()) {
     // And the vectors
-    addRegisterClass(MVT::v8i8, &AArch64::VPR64RegClass);
-    addRegisterClass(MVT::v4i16, &AArch64::VPR64RegClass);
-    addRegisterClass(MVT::v2i32, &AArch64::VPR64RegClass);
-    addRegisterClass(MVT::v1i64, &AArch64::VPR64RegClass);
-    addRegisterClass(MVT::v2f32, &AArch64::VPR64RegClass);
-    addRegisterClass(MVT::v16i8, &AArch64::VPR128RegClass);
-    addRegisterClass(MVT::v8i16, &AArch64::VPR128RegClass);
-    addRegisterClass(MVT::v4i32, &AArch64::VPR128RegClass);
-    addRegisterClass(MVT::v2i64, &AArch64::VPR128RegClass);
-    addRegisterClass(MVT::v4f32, &AArch64::VPR128RegClass);
-    addRegisterClass(MVT::v2f64, &AArch64::VPR128RegClass);
+    addRegisterClass(MVT::v1i8,  &AArch64::FPR8RegClass);
+    addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
+    addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
+    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v1f32, &AArch64::FPR32RegClass);
+    addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v8i8,  &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
   }
 
   computeRegisterProperties();
@@ -77,6 +86,12 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
 
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::SRA);
+  setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::SHL);
+
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
 
   // AArch64 does not have i1 loads, or much of anything for i1 really.
   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -270,28 +285,89 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   setExceptionSelectorRegister(AArch64::X1);
 
   if (Subtarget->hasNEON()) {
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1f32, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
 
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
+
     setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
     setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
     setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
     setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
     setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
     setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v1i64, Custom);
     setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+    setOperationAction(ISD::SETCC, MVT::v1f32, Custom);
     setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
     setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v1f64, Custom);
     setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
+
+    setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v1f64, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
+    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
+    setOperationAction(ISD::FRINT, MVT::v1f64, Legal);
+    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::v1f64, Legal);
+    setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
   }
 }
 
@@ -333,6 +409,29 @@ static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
   StrOpc = StoreOps[Log2_32(Size)];
 }
 
+// FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass don't really
+// have value type mapped, and they are both being defined as MVT::untyped.
+// Without knowing the MVT type, MachineLICM::getRegisterClassIDAndCost
+// would fail to figure out the register pressure correctly.
+std::pair<const TargetRegisterClass*, uint8_t>
+AArch64TargetLowering::findRepresentativeClass(MVT VT) const{
+  const TargetRegisterClass *RRC = 0;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(VT);
+  case MVT::v4i64:
+    RRC = &AArch64::QPairRegClass;
+    Cost = 2;
+    break;
+  case MVT::v8i64:
+    RRC = &AArch64::QQuadRegClass;
+    Cost = 4;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                         unsigned Size,
@@ -658,6 +757,12 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
   MBB->addSuccessor(TrueBB);
   MBB->addSuccessor(EndBB);
 
+  if (!NZCVKilled) {
+    // NZCV is live-through TrueBB.
+    TrueBB->addLiveIn(AArch64::NZCV);
+    EndBB->addLiveIn(AArch64::NZCV);
+  }
+
   // IfTrue:
   //     str qIFTRUE, [sp]
   BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
@@ -672,8 +777,6 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
   // Done:
   //     ldr qDEST, [sp]
   //     [... rest of incoming MBB ...]
-  if (!NZCVKilled)
-    EndBB->addLiveIn(AArch64::NZCV);
   MachineInstr *StartOfEnd = EndBB->begin();
   BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
     .addFrameIndex(ScratchFI)
@@ -833,6 +936,86 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "AArch64ISD::NEON_CMPZ";
   case AArch64ISD::NEON_TST:
     return "AArch64ISD::NEON_TST";
+  case AArch64ISD::NEON_QSHLs:
+    return "AArch64ISD::NEON_QSHLs";
+  case AArch64ISD::NEON_QSHLu:
+    return "AArch64ISD::NEON_QSHLu";
+  case AArch64ISD::NEON_VDUP:
+    return "AArch64ISD::NEON_VDUP";
+  case AArch64ISD::NEON_VDUPLANE:
+    return "AArch64ISD::NEON_VDUPLANE";
+  case AArch64ISD::NEON_REV16:
+    return "AArch64ISD::NEON_REV16";
+  case AArch64ISD::NEON_REV32:
+    return "AArch64ISD::NEON_REV32";
+  case AArch64ISD::NEON_REV64:
+    return "AArch64ISD::NEON_REV64";
+  case AArch64ISD::NEON_UZP1:
+    return "AArch64ISD::NEON_UZP1";
+  case AArch64ISD::NEON_UZP2:
+    return "AArch64ISD::NEON_UZP2";
+  case AArch64ISD::NEON_ZIP1:
+    return "AArch64ISD::NEON_ZIP1";
+  case AArch64ISD::NEON_ZIP2:
+    return "AArch64ISD::NEON_ZIP2";
+  case AArch64ISD::NEON_TRN1:
+    return "AArch64ISD::NEON_TRN1";
+  case AArch64ISD::NEON_TRN2:
+    return "AArch64ISD::NEON_TRN2";
+  case AArch64ISD::NEON_LD1_UPD:
+    return "AArch64ISD::NEON_LD1_UPD";
+  case AArch64ISD::NEON_LD2_UPD:
+    return "AArch64ISD::NEON_LD2_UPD";
+  case AArch64ISD::NEON_LD3_UPD:
+    return "AArch64ISD::NEON_LD3_UPD";
+  case AArch64ISD::NEON_LD4_UPD:
+    return "AArch64ISD::NEON_LD4_UPD";
+  case AArch64ISD::NEON_ST1_UPD:
+    return "AArch64ISD::NEON_ST1_UPD";
+  case AArch64ISD::NEON_ST2_UPD:
+    return "AArch64ISD::NEON_ST2_UPD";
+  case AArch64ISD::NEON_ST3_UPD:
+    return "AArch64ISD::NEON_ST3_UPD";
+  case AArch64ISD::NEON_ST4_UPD:
+    return "AArch64ISD::NEON_ST4_UPD";
+  case AArch64ISD::NEON_LD1x2_UPD:
+    return "AArch64ISD::NEON_LD1x2_UPD";
+  case AArch64ISD::NEON_LD1x3_UPD:
+    return "AArch64ISD::NEON_LD1x3_UPD";
+  case AArch64ISD::NEON_LD1x4_UPD:
+    return "AArch64ISD::NEON_LD1x4_UPD";
+  case AArch64ISD::NEON_ST1x2_UPD:
+    return "AArch64ISD::NEON_ST1x2_UPD";
+  case AArch64ISD::NEON_ST1x3_UPD:
+    return "AArch64ISD::NEON_ST1x3_UPD";
+  case AArch64ISD::NEON_ST1x4_UPD:
+    return "AArch64ISD::NEON_ST1x4_UPD";
+  case AArch64ISD::NEON_LD2DUP:
+    return "AArch64ISD::NEON_LD2DUP";
+  case AArch64ISD::NEON_LD3DUP:
+    return "AArch64ISD::NEON_LD3DUP";
+  case AArch64ISD::NEON_LD4DUP:
+    return "AArch64ISD::NEON_LD4DUP";
+  case AArch64ISD::NEON_LD2DUP_UPD:
+    return "AArch64ISD::NEON_LD2DUP_UPD";
+  case AArch64ISD::NEON_LD3DUP_UPD:
+    return "AArch64ISD::NEON_LD3DUP_UPD";
+  case AArch64ISD::NEON_LD4DUP_UPD:
+    return "AArch64ISD::NEON_LD4DUP_UPD";
+  case AArch64ISD::NEON_LD2LN_UPD:
+    return "AArch64ISD::NEON_LD2LN_UPD";
+  case AArch64ISD::NEON_LD3LN_UPD:
+    return "AArch64ISD::NEON_LD3LN_UPD";
+  case AArch64ISD::NEON_LD4LN_UPD:
+    return "AArch64ISD::NEON_LD4LN_UPD";
+  case AArch64ISD::NEON_ST2LN_UPD:
+    return "AArch64ISD::NEON_ST2LN_UPD";
+  case AArch64ISD::NEON_ST3LN_UPD:
+    return "AArch64ISD::NEON_ST3LN_UPD";
+  case AArch64ISD::NEON_ST4LN_UPD:
+    return "AArch64ISD::NEON_ST4LN_UPD";
+  case AArch64ISD::NEON_VEXTRACT:
+    return "AArch64ISD::NEON_VEXTRACT";
   default:
     return NULL;
   }
@@ -908,24 +1091,31 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
     }
   }
 
+  if (getSubtarget()->hasFPARMv8()) {
   unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
   int FPRIdx = 0;
-  if (FPRSaveSize != 0) {
-    FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
-
-    SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
-
-    for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
-      unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
-                                   &AArch64::FPR128RegClass);
-      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
-      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
-                                   MachinePointerInfo::getStack(i * 16),
-                                   false, false, 0);
-      MemOps.push_back(Store);
-      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
-                        DAG.getConstant(16, getPointerTy()));
+    // According to the AArch64 Procedure Call Standard, section B.1/B.3, we
+    // can omit a register save area if we know we'll never use registers of
+    // that class.
+    if (FPRSaveSize != 0) {
+      FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
+
+      SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
+
+      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
+        unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
+            &AArch64::FPR128RegClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
+        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
+            MachinePointerInfo::getStack(i * 16),
+            false, false, 0);
+        MemOps.push_back(Store);
+        FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+            DAG.getConstant(16, getPointerTy()));
+      }
     }
+    FuncInfo->setVariadicFPRIdx(FPRIdx);
+    FuncInfo->setVariadicFPRSize(FPRSaveSize);
   }
 
   int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true);
@@ -933,8 +1123,6 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
   FuncInfo->setVariadicStackIdx(StackIdx);
   FuncInfo->setVariadicGPRIdx(GPRIdx);
   FuncInfo->setVariadicGPRSize(GPRSaveSize);
-  FuncInfo->setVariadicFPRIdx(FPRIdx);
-  FuncInfo->setVariadicFPRSize(FPRSaveSize);
 
   if (!MemOps.empty()) {
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
@@ -1875,7 +2063,7 @@ AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue SrcVal = Op.getOperand(0);
   return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
-                     /*isSigned*/ false, SDLoc(Op));
+                     /*isSigned*/ false, SDLoc(Op)).first;
 }
 
 SDValue
@@ -1905,6 +2093,45 @@ AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
   return LowerF128ToCall(Op, DAG, LC);
 }
 
+SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(8, MVT::i64);
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return X30, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64);
+}
+
+
+SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG)
+                                              const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned FrameReg = AArch64::X29;
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
 SDValue
 AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
                                                   SelectionDAG &DAG) const {
@@ -2650,6 +2877,8 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
 
   case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
@@ -2664,6 +2893,7 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::VASTART: return LowerVASTART(Op, DAG);
   case ISD::BUILD_VECTOR:
     return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
+  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
   }
 
   return SDValue();
@@ -3235,6 +3465,336 @@ static SDValue PerformSRACombine(SDNode *N,
                      DAG.getConstant(LSB + Width - 1, MVT::i64));
 }
 
+/// Check if this is a valid build_vector for the immediate operand of
+/// a vector shift operation, where all the elements of the build_vector
+/// must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs, ElementBits) ||
+      SplatBitSize > ElementBits)
+    return false;
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// Check if this is a valid build_vector for the immediate operand of
+/// a vector shift left operation.  That value must be in the range:
+/// 0 <= Value < ElementBits
+static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 0 && Cnt < ElementBits);
+}
+
+/// Check if this is a valid build_vector for the immediate operand of a
+/// vector shift right operation. The value must be in the range:
+///   1 <= Value <= ElementBits
+static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 1 && Cnt <= ElementBits);
+}
+
+/// Checks for immediate versions of vector shifts and lowers them.
+static SDValue PerformShiftCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const AArch64Subtarget *ST) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64))
+    return PerformSRACombine(N, DCI);
+
+  // Nothing to be done for scalar shifts.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!VT.isVector() || !TLI.isTypeLegal(VT))
+    return SDValue();
+
+  assert(ST->hasNEON() && "unexpected vector shift");
+  int64_t Cnt;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("unexpected shift opcode");
+
+  case ISD::SHL:
+    if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
+      SDValue RHS =
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
+                      DAG.getConstant(Cnt, MVT::i32));
+      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
+    }
+    break;
+
+  case ISD::SRA:
+  case ISD::SRL:
+    if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
+      SDValue RHS =
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
+                      DAG.getConstant(Cnt, MVT::i32));
+      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
+    }
+    break;
+  }
+
+  return SDValue();
+}
+
+/// ARM-specific DAG combining for intrinsics.
+static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+
+  switch (IntNo) {
+  default:
+    // Don't do anything for most intrinsics.
+    break;
+
+  case Intrinsic::arm_neon_vqshifts:
+  case Intrinsic::arm_neon_vqshiftu:
+    EVT VT = N->getOperand(1).getValueType();
+    int64_t Cnt;
+    if (!isVShiftLImm(N->getOperand(2), VT, Cnt))
+      break;
+    unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts)
+                             ? AArch64ISD::NEON_QSHLs
+                             : AArch64ISD::NEON_QSHLu;
+    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
+  }
+
+  return SDValue();
+}
+
+/// Target-specific DAG combine function for NEON load/store intrinsics
+/// to merge base address updates.
+static SDValue CombineBaseUpdate(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool isLoad = true;
+    bool isLaneOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    if (isIntrinsic) {
+      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+      switch (IntNo) {
+      default: llvm_unreachable("unexpected intrinsic for Neon base update");
+      case Intrinsic::arm_neon_vld1:       NewOpc = AArch64ISD::NEON_LD1_UPD;
+        NumVecs = 1; break;
+      case Intrinsic::arm_neon_vld2:       NewOpc = AArch64ISD::NEON_LD2_UPD;
+        NumVecs = 2; break;
+      case Intrinsic::arm_neon_vld3:       NewOpc = AArch64ISD::NEON_LD3_UPD;
+        NumVecs = 3; break;
+      case Intrinsic::arm_neon_vld4:       NewOpc = AArch64ISD::NEON_LD4_UPD;
+        NumVecs = 4; break;
+      case Intrinsic::arm_neon_vst1:       NewOpc = AArch64ISD::NEON_ST1_UPD;
+        NumVecs = 1; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2:       NewOpc = AArch64ISD::NEON_ST2_UPD;
+        NumVecs = 2; isLoad = false; break;
+      case Intrinsic::arm_neon_vst3:       NewOpc = AArch64ISD::NEON_ST3_UPD;
+        NumVecs = 3; isLoad = false; break;
+      case Intrinsic::arm_neon_vst4:       NewOpc = AArch64ISD::NEON_ST4_UPD;
+        NumVecs = 4; isLoad = false; break;
+      case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
+        NumVecs = 2; break;
+      case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
+        NumVecs = 3; break;
+      case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
+        NumVecs = 4; break;
+      case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
+        NumVecs = 2; isLoad = false; break;
+      case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
+        NumVecs = 3; isLoad = false; break;
+      case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
+        NumVecs = 4; isLoad = false; break;
+      case Intrinsic::arm_neon_vld2lane:   NewOpc = AArch64ISD::NEON_LD2LN_UPD;
+        NumVecs = 2; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld3lane:   NewOpc = AArch64ISD::NEON_LD3LN_UPD;
+        NumVecs = 3; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld4lane:   NewOpc = AArch64ISD::NEON_LD4LN_UPD;
+        NumVecs = 4; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst2lane:   NewOpc = AArch64ISD::NEON_ST2LN_UPD;
+        NumVecs = 2; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst3lane:   NewOpc = AArch64ISD::NEON_ST3LN_UPD;
+        NumVecs = 3; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst4lane:   NewOpc = AArch64ISD::NEON_ST4LN_UPD;
+        NumVecs = 4; isLoad = false; isLaneOp = true; break;
+      }
+    } else {
+      isLaneOp = true;
+      switch (N->getOpcode()) {
+      default: llvm_unreachable("unexpected opcode for Neon base update");
+      case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD;
+        NumVecs = 2; break;
+      case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD;
+        NumVecs = 3; break;
+      case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD;
+        NumVecs = 4; break;
+      }
+    }
+
+    // Find the size of memory referenced by the load/store.
+    EVT VecTy;
+    if (isLoad)
+      VecTy = N->getValueType(0);
+    else
+      VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
+    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+    if (isLaneOp)
+      NumBytes /= VecTy.getVectorNumElements();
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint32_t IncVal = CInc->getZExtValue();
+      if (IncVal != NumBytes)
+        continue;
+      Inc = DAG.getTargetConstant(IncVal, MVT::i32);
+    }
+
+    // Create the new updating load/store node.
+    EVT Tys[6];
+    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i64;
+    Tys[n] = MVT::Other;
+    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2);
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // incoming chain
+    Ops.push_back(N->getOperand(AddrOpIdx));
+    Ops.push_back(Inc);
+    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
+      Ops.push_back(N->getOperand(i));
+    }
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
+                                           Ops.data(), Ops.size(),
+                                           MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
+/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1)
+/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs.
+/// If so, combine them to a vldN-dup operation and return true.
+static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
+  SDNode *VLD = N->getOperand(0).getNode();
+  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return SDValue();
+  unsigned NumVecs = 0;
+  unsigned NewOpc = 0;
+  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::arm_neon_vld2lane) {
+    NumVecs = 2;
+    NewOpc = AArch64ISD::NEON_LD2DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+    NumVecs = 3;
+    NewOpc = AArch64ISD::NEON_LD3DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+    NumVecs = 4;
+    NewOpc = AArch64ISD::NEON_LD4DUP;
+  } else {
+    return SDValue();
+  }
+
+  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+  // numbers match the load.
+  unsigned VLDLaneNo =
+      cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    // Ignore uses of the chain result.
+    if (UI.getUse().getResNo() == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
+        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+      return SDValue();
+  }
+
+  // Create the vldN-dup node.
+  EVT Tys[5];
+  unsigned n;
+  for (n = 0; n < NumVecs; ++n)
+    Tys[n] = VT;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
+  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
+                                           VLDMemInt->getMemoryVT(),
+                                           VLDMemInt->getMemOperand());
+
+  // Update the uses.
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    unsigned ResNo = UI.getUse().getResNo();
+    // Ignore uses of the chain result.
+    if (ResNo == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+  }
+
+  // Now the vldN-lane intrinsic is dead except for its chain result.
+  // Update uses of the chain.
+  std::vector<SDValue> VLDDupResults;
+  for (unsigned n = 0; n < NumVecs; ++n)
+    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
+  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
+  DCI.CombineTo(VLD, VLDDupResults);
+
+  return SDValue(N, 0);
+}
 
 SDValue
 AArch64TargetLowering::PerformDAGCombine(SDNode *N,
@@ -3243,7 +3803,45 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   default: break;
   case ISD::AND: return PerformANDCombine(N, DCI);
   case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
-  case ISD::SRA: return PerformSRACombine(N, DCI);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    return PerformShiftCombine(N, DCI, getSubtarget());
+  case ISD::INTRINSIC_WO_CHAIN:
+    return PerformIntrinsicCombine(N, DCI.DAG);
+  case AArch64ISD::NEON_VDUPLANE:
+    return CombineVLDDUP(N, DCI);
+  case AArch64ISD::NEON_LD2DUP:
+  case AArch64ISD::NEON_LD3DUP:
+  case AArch64ISD::NEON_LD4DUP:
+    return CombineBaseUpdate(N, DCI);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::arm_neon_vld1:
+    case Intrinsic::arm_neon_vld2:
+    case Intrinsic::arm_neon_vld3:
+    case Intrinsic::arm_neon_vld4:
+    case Intrinsic::arm_neon_vst1:
+    case Intrinsic::arm_neon_vst2:
+    case Intrinsic::arm_neon_vst3:
+    case Intrinsic::arm_neon_vst4:
+    case Intrinsic::arm_neon_vld2lane:
+    case Intrinsic::arm_neon_vld3lane:
+    case Intrinsic::arm_neon_vld4lane:
+    case Intrinsic::aarch64_neon_vld1x2:
+    case Intrinsic::aarch64_neon_vld1x3:
+    case Intrinsic::aarch64_neon_vld1x4:
+    case Intrinsic::aarch64_neon_vst1x2:
+    case Intrinsic::aarch64_neon_vst1x3:
+    case Intrinsic::aarch64_neon_vst1x4:
+    case Intrinsic::arm_neon_vst2lane:
+    case Intrinsic::arm_neon_vst3lane:
+    case Intrinsic::arm_neon_vst4lane:
+      return CombineBaseUpdate(N, DCI);
+    default:
+      break;
+    }
   }
   return SDValue();
 }
@@ -3269,6 +3867,59 @@ AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   return false;
 }
 
+// Check whether a Build Vector could be presented as Shuffle Vector. If yes,
+// try to call LowerVECTOR_SHUFFLE to lower it.
+bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
+                                                 SDValue &Res) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned V0NumElts = 0;
+  int Mask[16];
+  SDValue V0, V1;
+
+  // Check if all elements are extracted from less than 3 vectors.
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Elt = Op.getOperand(i);
+    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return false;
+
+    if (V0.getNode() == 0) {
+      V0 = Elt.getOperand(0);
+      V0NumElts = V0.getValueType().getVectorNumElements();
+    }
+    if (Elt.getOperand(0) == V0) {
+      Mask[i] = (cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue());
+      continue;
+    } else if (V1.getNode() == 0) {
+      V1 = Elt.getOperand(0);
+    }
+    if (Elt.getOperand(0) == V1) {
+      unsigned Lane = cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue();
+      Mask[i] = (Lane + V0NumElts);
+      continue;
+    } else {
+      return false;
+    }
+  }
+
+  if (!V1.getNode() && V0NumElts == NumElts * 2) {
+    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
+                     DAG.getConstant(NumElts, MVT::i64));
+    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
+                     DAG.getConstant(0, MVT::i64));
+    V0NumElts = V0.getValueType().getVectorNumElements();
+  }
+
+  if (V1.getNode() && NumElts == V0NumElts &&
+      V0NumElts == V1.getValueType().getVectorNumElements()) {
+    SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
+    Res = LowerVECTOR_SHUFFLE(Shuffle, DAG);
+    return true;
+  } else
+    return false;
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.
 SDValue
@@ -3283,12 +3934,15 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   unsigned SplatBitSize;
   bool HasAnyUndefs;
 
+  unsigned UseNeonMov = VT.getSizeInBits() >= 64;
+
   // Note we favor lowering MOVI over MVNI.
   // This has implications on the definition of patterns in TableGen to select
   // BIC immediate instructions but not ORR immediate instructions.
   // If this lowering order is changed, TableGen patterns for BIC immediate and
   // ORR immediate instructions have to be updated.
-  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+  if (UseNeonMov &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
     if (SplatBitSize <= 64) {
       // First attempt to use vector immediate-form MOVI
       EVT NeonMovVT;
@@ -3336,9 +3990,390 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
       }
     }
   }
+
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
+  bool isConstant = true;
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
+  SDValue Value;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
+      Value = V;
+    }
+  }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;
+
+  if (ValueCounts.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  // Loads are better lowered with insert_vector_elt.
+  // Keep going if we are hitting this case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (hasDominantValue && EltSize <= 64) {
+    // Use VDUP for non-constant splats.
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are DUPing a value that comes directly from a vector, we could
+      // just use DUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the DUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
+          N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
+                        Value->getOperand(0), Value->getOperand(1));
+      } else
+        N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, MVT::i64));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
+        }
+      }
+      return N;
+    }
+    if (usesOnlyOneValue && isConstant) {
+      return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+    }
+  }
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // Try to lower this in lowering ShuffleVector way.
+  SDValue Shuf;
+  if (isKnownShuffleVector(Op, DAG, Shuf))
+    return Shuf;
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
   return SDValue();
 }
 
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+         "Only possible block sizes for REV are: 16, 32, 64");
+
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned BlockElts = M[0] + 1;
+  // If the first shuffle index is UNDEF, be optimistic.
+  if (M[0] < 0)
+    BlockElts = BlockSize / EltSz;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
+// isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and
+// TRN instruction.
+static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts < 4)
+    return 0;
+
+  bool ismatch = true;
+
+  // Check UZP1
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != i * 2) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_UZP1;
+
+  // Check UZP2
+  ismatch = true;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != i * 2 + 1) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_UZP2;
+
+  // Check ZIP1
+  ismatch = true;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != i / 2 + NumElts * (i % 2)) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_ZIP1;
+
+  // Check ZIP2
+  ismatch = true;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != (NumElts + i) / 2 + NumElts * (i % 2)) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_ZIP2;
+
+  // Check TRN1
+  ismatch = true;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != i + (NumElts - 1) * (i % 2)) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_TRN1;
+
+  // Check TRN2
+  ismatch = true;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != 1 + i + (NumElts - 1) * (i % 2)) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_TRN2;
+
+  return 0;
+}
+
+SDValue
+AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+  // Convert shuffles that are directly supported on NEON to target-specific
+  // DAG nodes, instead of keeping them as shuffles and matching them again
+  // during code selection.  This is more efficient and avoids the possibility
+  // of inconsistencies between legalization and selection.
+  ArrayRef<int> ShuffleMask = SVN->getMask();
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize > 64)
+    return SDValue();
+
+  if (isREVMask(ShuffleMask, VT, 64))
+    return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1);
+  if (isREVMask(ShuffleMask, VT, 32))
+    return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1);
+  if (isREVMask(ShuffleMask, VT, 16))
+    return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
+
+  unsigned ISDNo = isPermuteMask(ShuffleMask, VT);
+  if (ISDNo)
+    return DAG.getNode(ISDNo, dl, VT, V1, V2);
+
+  // If the element of shuffle mask are all the same constant, we can
+  // transform it into either NEON_VDUP or NEON_VDUPLANE
+  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+    int Lane = SVN->getSplatIndex();
+    // If this is undef splat, generate it via "just" vdup, if possible.
+    if (Lane == -1) Lane = 0;
+
+    // Test if V1 is a SCALAR_TO_VECTOR.
+    if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+      return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
+    }
+    // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
+    if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+      bool IsScalarToVector = true;
+      for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
+        if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
+            i != (unsigned)Lane) {
+          IsScalarToVector = false;
+          break;
+        }
+      if (IsScalarToVector)
+        return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
+                           V1.getOperand(Lane));
+    }
+
+    // Test if V1 is a EXTRACT_SUBVECTOR.
+    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      int ExtLane = cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+      return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0),
+                         DAG.getConstant(Lane + ExtLane, MVT::i64));
+    }
+    // Test if V1 is a CONCAT_VECTORS.
+    if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+        V1.getOperand(1).getOpcode() == ISD::UNDEF) {
+      SDValue Op0 = V1.getOperand(0);
+      assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() &&
+             "Invalid vector lane access");
+      return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0,
+                         DAG.getConstant(Lane, MVT::i64));
+    }
+
+    return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
+                       DAG.getConstant(Lane, MVT::i64));
+  }
+
+  int Length = ShuffleMask.size();
+  int V1EltNum = V1.getValueType().getVectorNumElements();
+
+  // If the number of v1 elements is the same as the number of shuffle mask
+  // element and the shuffle masks are sequential values, we can transform
+  // it into NEON_VEXTRACT.
+  if (V1EltNum == Length) {
+    // Check if the shuffle mask is sequential.
+    bool IsSequential = true;
+    int CurMask = ShuffleMask[0];
+    for (int I = 0; I < Length; ++I) {
+      if (ShuffleMask[I] != CurMask) {
+        IsSequential = false;
+        break;
+      }
+      CurMask++;
+    }
+    if (IsSequential) {
+      assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
+      unsigned VecSize = EltSize * V1EltNum;
+      unsigned Index = (EltSize/8) * ShuffleMask[0];
+      if (VecSize == 64 || VecSize == 128)
+        return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
+                           DAG.getConstant(Index, MVT::i64));
+    }
+  }
+
+  // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert
+  // by element from V2 to V1 .
+  // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a
+  // better choice to be inserted than V1 as less insert needed, so we count
+  // element to be inserted for both V1 and V2, and select less one as insert
+  // target.
+
+  // Collect elements need to be inserted and their index.
+  SmallVector<int, 8> NV1Elt;
+  SmallVector<int, 8> N1Index;
+  SmallVector<int, 8> NV2Elt;
+  SmallVector<int, 8> N2Index;
+  for (int I = 0; I != Length; ++I) {
+    if (ShuffleMask[I] != I) {
+      NV1Elt.push_back(ShuffleMask[I]);
+      N1Index.push_back(I);
+    }
+  }
+  for (int I = 0; I != Length; ++I) {
+    if (ShuffleMask[I] != (I + V1EltNum)) {
+      NV2Elt.push_back(ShuffleMask[I]);
+      N2Index.push_back(I);
+    }
+  }
+
+  // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2
+  // will be inserted.
+  SDValue InsV = V1;
+  SmallVector<int, 8> InsMasks = NV1Elt;
+  SmallVector<int, 8> InsIndex = N1Index;
+  if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
+    if (NV1Elt.size() > NV2Elt.size()) {
+      InsV = V2;
+      InsMasks = NV2Elt;
+      InsIndex = N2Index;
+    }
+  } else {
+    InsV = DAG.getNode(ISD::UNDEF, dl, VT);
+  }
+
+  for (int I = 0, E = InsMasks.size(); I != E; ++I) {
+    SDValue ExtV = V1;
+    int Mask = InsMasks[I];
+    if (Mask >= V1EltNum) {
+      ExtV = V2;
+      Mask -= V1EltNum;
+    }
+    // Any value type smaller than i32 is illegal in AArch64, and this lower
+    // function is called after legalize pass, so we need to legalize
+    // the result here.
+    EVT EltVT;
+    if (VT.getVectorElementType().isFloatingPoint())
+      EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32;
+    else
+      EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
+
+    if (Mask >= 0) {
+      ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
+                         DAG.getConstant(Mask, MVT::i64));
+      InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
+                         DAG.getConstant(InsIndex[I], MVT::i64));
+    }
+  }
+  return InsV;
+}
+
 AArch64TargetLowering::ConstraintType
 AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
   if (Constraint.size() == 1) {
@@ -3484,14 +4519,10 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
         return std::make_pair(0U, &AArch64::FPR16RegClass);
       else if (VT == MVT::f32)
         return std::make_pair(0U, &AArch64::FPR32RegClass);
-      else if (VT == MVT::f64)
-        return std::make_pair(0U, &AArch64::FPR64RegClass);
       else if (VT.getSizeInBits() == 64)
-        return std::make_pair(0U, &AArch64::VPR64RegClass);
-      else if (VT == MVT::f128)
-        return std::make_pair(0U, &AArch64::FPR128RegClass);
+        return std::make_pair(0U, &AArch64::FPR64RegClass);
       else if (VT.getSizeInBits() == 128)
-        return std::make_pair(0U, &AArch64::VPR128RegClass);
+        return std::make_pair(0U, &AArch64::FPR128RegClass);
       break;
     }
   }
@@ -3500,3 +4531,69 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
   // constraint into a member of a register class.
   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
 }
+
+/// Represent NEON load and store intrinsics as MemIntrinsicNodes.
+/// The associated MachineMemOperands record the alignment specified
+/// in the intrinsic calls.
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                               const CallInst &I,
+                                               unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  case Intrinsic::arm_neon_vld1:
+  case Intrinsic::arm_neon_vld2:
+  case Intrinsic::arm_neon_vld3:
+  case Intrinsic::arm_neon_vld4:
+  case Intrinsic::aarch64_neon_vld1x2:
+  case Intrinsic::aarch64_neon_vld1x3:
+  case Intrinsic::aarch64_neon_vld1x4:
+  case Intrinsic::arm_neon_vld2lane:
+  case Intrinsic::arm_neon_vld3lane:
+  case Intrinsic::arm_neon_vld4lane: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // Conservatively set memVT to the entire set of vectors loaded.
+    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile loads with NEON intrinsics not supported
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::arm_neon_vst1:
+  case Intrinsic::arm_neon_vst2:
+  case Intrinsic::arm_neon_vst3:
+  case Intrinsic::arm_neon_vst4:
+  case Intrinsic::aarch64_neon_vst1x2:
+  case Intrinsic::aarch64_neon_vst1x3:
+  case Intrinsic::aarch64_neon_vst1x4:
+  case Intrinsic::arm_neon_vst2lane:
+  case Intrinsic::arm_neon_vst3lane:
+  case Intrinsic::arm_neon_vst4lane: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    // Conservatively set memVT to the entire set of vectors stored.
+    unsigned NumElts = 0;
+    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+      Type *ArgTy = I.getArgOperand(ArgI)->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile stores with NEON intrinsics not supported
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 67a908e..8ad5a79 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -19,7 +19,7 @@
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/Target/TargetLowering.h"
-
+#include "llvm/IR/Intrinsics.h"
 
 namespace llvm {
 namespace AArch64ISD {
@@ -125,6 +125,19 @@ namespace AArch64ISD {
     // Vector FP move immediate
     NEON_FMOVIMM,
 
+    // Vector permute
+    NEON_UZP1,
+    NEON_UZP2,
+    NEON_ZIP1,
+    NEON_ZIP2,
+    NEON_TRN1,
+    NEON_TRN2,
+
+    // Vector Element reverse
+    NEON_REV64,
+    NEON_REV32,
+    NEON_REV16,
+
     // Vector compare
     NEON_CMP,
 
@@ -132,7 +145,58 @@ namespace AArch64ISD {
     NEON_CMPZ,
 
     // Vector compare bitwise test
-    NEON_TST
+    NEON_TST,
+
+    // Vector saturating shift
+    NEON_QSHLs,
+    NEON_QSHLu,
+
+    // Vector dup
+    NEON_VDUP,
+
+    // Vector dup by lane
+    NEON_VDUPLANE,
+
+    // Vector extract
+    NEON_VEXTRACT,
+
+    // NEON duplicate lane loads
+    NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+    NEON_LD3DUP,
+    NEON_LD4DUP,
+
+    // NEON loads with post-increment base updates:
+    NEON_LD1_UPD,
+    NEON_LD2_UPD,
+    NEON_LD3_UPD,
+    NEON_LD4_UPD,
+    NEON_LD1x2_UPD,
+    NEON_LD1x3_UPD,
+    NEON_LD1x4_UPD,
+
+    // NEON stores with post-increment base updates:
+    NEON_ST1_UPD,
+    NEON_ST2_UPD,
+    NEON_ST3_UPD,
+    NEON_ST4_UPD,
+    NEON_ST1x2_UPD,
+    NEON_ST1x3_UPD,
+    NEON_ST1x4_UPD,
+
+    // NEON duplicate lane loads with post-increment base updates:
+    NEON_LD2DUP_UPD,
+    NEON_LD3DUP_UPD,
+    NEON_LD4DUP_UPD,
+
+    // NEON lane loads with post-increment base updates:
+    NEON_LD2LN_UPD,
+    NEON_LD3LN_UPD,
+    NEON_LD4LN_UPD,
+
+    // NEON lane store with post-increment base updates:
+    NEON_ST2LN_UPD,
+    NEON_ST3LN_UPD,
+    NEON_ST4LN_UPD
   };
 }
 
@@ -169,9 +233,13 @@ public:
                           SDLoc dl, SelectionDAG &DAG,
                           SmallVectorImpl<SDValue> &InVals) const;
 
+  bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &Res) const;
+
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                             const AArch64Subtarget *ST) const;
 
+  SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+
   void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
                            SDValue &Chain) const;
 
@@ -234,6 +302,8 @@ public:
   SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
+  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerGlobalAddressELFSmall(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const;
@@ -269,6 +339,14 @@ public:
 
   std::pair<unsigned, const TargetRegisterClass*>
   getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+
+  virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                                  unsigned Intrinsic) const LLVM_OVERRIDE;
+
+protected:
+  std::pair<const TargetRegisterClass*, uint8_t>
+  findRepresentativeClass(MVT VT) const;
+
 private:
   const InstrItineraryData *Itins;
 
@@ -280,6 +358,10 @@ enum NeonModImmType {
   Neon_Mov_Imm,
   Neon_Mvn_Imm
 };
+
+extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement,
+                                bool &usesOnlyOneValue, bool &hasDominantValue,
+                                bool &isConstant, bool &isUNDEF);
 } // namespace llvm
 
 #endif // LLVM_TARGET_AARCH64_ISELLOWERING_H
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 09451fd..34f917c 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -120,6 +120,14 @@ class A64InstRdnm<dag outs, dag ins, string asmstr,
   let Inst{20-16} = Rm;
 }
 
+class A64InstRtnm<dag outs, dag ins, string asmstr,
+                  list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rm;
+
+  let Inst{20-16} = Rm;
+}
+
 //===----------------------------------------------------------------------===//
 //
 // Actual A64 Instruction Formats
@@ -383,6 +391,8 @@ class A64I_extract<bit sf, bits<3> op, bit n,
   // Inherits Rd in 4-0
 }
 
+let Predicates = [HasFPARMv8] in {
+
 // Format for floating-point compare instructions.
 class A64I_fpcmp<bit m, bit s, bits<2> type, bits<2> op, bits<5> opcode2,
                 dag outs, dag ins, string asmstr,
@@ -562,6 +572,8 @@ class A64I_fpimm<bit m, bit s, bits<2> type, bits<5> imm5,
   // Inherit Rd in 4-0
 }
 
+}
+
 // Format for load-register (literal) instructions.
 class A64I_LDRlit<bits<2> opc, bit v,
                   dag outs, dag ins, string asmstr,
@@ -971,31 +983,123 @@ class NeonInstAlias<string Asm, dag Result, bit Emit = 0b1>
   : InstAlias<Asm, Result, Emit> {
 }
 
+// Format AdvSIMD bitwise extract
+class NeonI_BitExtract<bit q, bits<2> op2,
+                       dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-24} = 0b101110;
+  let Inst{23-22} = op2;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15} = 0b0;
+  // imm4 in 14-11
+  let Inst{10} = 0b0;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD perm
+class NeonI_Perm<bit q, bits<2> size, bits<3> opcode,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-24} = 0b001110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15} = 0b0;
+  let Inst{14-12} = opcode;
+  let Inst{11-10} = 0b10;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD table lookup
+class NeonI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
+                dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-24} = 0b001110;
+  let Inst{23-22} = op2;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15} = 0b0;
+  let Inst{14-13} = len;
+  let Inst{12} = op;
+  let Inst{11-10} = 0b00;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
 // Format AdvSIMD 3 vector registers with same vector type
 class NeonI_3VSame<bit q, bit u, bits<2> size, bits<5> opcode,
                    dag outs, dag ins, string asmstr,
                    list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin>
-{
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
   let Inst{31} = 0b0;
   let Inst{30} = q;
   let Inst{29} = u;
   let Inst{28-24} = 0b01110;
   let Inst{23-22} = size;
   let Inst{21} = 0b1;
-   // Inherit Rm in 20-16
+  // Inherit Rm in 20-16
   let Inst{15-11} = opcode;
   let Inst{10} = 0b1;
   // Inherit Rn in 9-5
   // Inherit Rd in 4-0
 }
 
+// Format AdvSIMD 3 vector registers with different vector type
+class NeonI_3VDiff<bit q, bit u, bits<2> size, bits<4> opcode,
+                   dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  // Inherit Rm in 20-16
+  let Inst{15-12} = opcode;
+  let Inst{11} = 0b0;
+  let Inst{10} = 0b0;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD two registers and an element
+class NeonI_2VElem<bit q, bit u, bits<2> size, bits<4> opcode,
+                   dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b01111;
+  let Inst{23-22} = size;
+  // l in Inst{21}
+  // m in Inst{20}
+  // Inherit Rm in 19-16
+  let Inst{15-12} = opcode;
+  // h in Inst{11}
+  let Inst{10} = 0b0;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
 // Format AdvSIMD 1 vector register with modified immediate
 class NeonI_1VModImm<bit q, bit op,
                      dag outs, dag ins, string asmstr,
                      list<dag> patterns, InstrItinClass itin>
-  : A64InstRd<outs,ins, asmstr, patterns, itin>
-{
+  : A64InstRd<outs,ins, asmstr, patterns, itin> {
   bits<8> Imm;
   bits<4> cmode;
   let Inst{31} = 0b0;
@@ -1015,15 +1119,14 @@ class NeonI_1VModImm<bit q, bit op,
 class NeonI_Scalar3Same<bit u, bits<2> size, bits<5> opcode,
                           dag outs, dag ins, string asmstr,
                           list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin>
-{
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
   let Inst{31} = 0b0;
   let Inst{30} = 0b1;
   let Inst{29} = u;
   let Inst{28-24} = 0b11110;
   let Inst{23-22} = size;
   let Inst{21} = 0b1;
-   // Inherit Rm in 20-16
+  // Inherit Rm in 20-16
   let Inst{15-11} = opcode;
   let Inst{10} = 0b1;
   // Inherit Rn in 9-5
@@ -1035,6 +1138,98 @@ class NeonI_Scalar3Same<bit u, bits<2> size, bits<5> opcode,
 class NeonI_2VMisc<bit q, bit u, bits<2> size, bits<5> opcode,
                    dag outs, dag ins, string asmstr,
                    list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 2 vector 1 immediate shift
+class NeonI_2VShiftImm<bit q, bit u, bits<5> opcode,
+                       dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<7> Imm;
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-23} = 0b011110;
+  let Inst{22-16} = Imm;
+  let Inst{15-11} = opcode;
+  let Inst{10} = 0b1;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD duplicate and insert
+class NeonI_copy<bit q, bit op, bits<4> imm4,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Imm5;
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = op;
+  let Inst{28-21} = 0b01110000;
+  let Inst{20-16} = Imm5;
+  let Inst{15} = 0b0;
+  let Inst{14-11} = imm4;
+  let Inst{10} = 0b1;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+// Format AdvSIMD insert from element to vector
+class NeonI_insert<bit q, bit op,
+                  dag outs, dag ins, string asmstr,
+                  list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Imm5;
+  bits<4> Imm4;
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = op;
+  let Inst{28-21} = 0b01110000;
+  let Inst{20-16} = Imm5;
+  let Inst{15} = 0b0;
+  let Inst{14-11} = Imm4;
+  let Inst{10} = 0b1;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar pairwise
+class NeonI_ScalarPair<bit u, bits<2> size, bits<5> opcode,
+                          dag outs, dag ins, string asmstr,
+                          list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = 0b1;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b11000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 2 vector across lanes
+class NeonI_2VAcross<bit q, bit u, bits<2> size, bits<5> opcode,
+                     dag outs, dag ins, string asmstr,
+                     list<dag> patterns, InstrItinClass itin>
   : A64InstRdn<outs, ins, asmstr, patterns, itin>
 {
   let Inst{31} = 0b0;
@@ -1042,13 +1237,253 @@ class NeonI_2VMisc<bit q, bit u, bits<2> size, bits<5> opcode,
   let Inst{29} = u;
   let Inst{28-24} = 0b01110;
   let Inst{23-22} = size;
+  let Inst{21-17} = 0b11000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar two registers miscellaneous
+class NeonI_Scalar2SameMisc<bit u, bits<2> size, bits<5> opcode, dag outs, dag ins,
+                            string asmstr, list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = 0b1;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
   let Inst{21-17} = 0b10000;
   let Inst{16-12} = opcode;
   let Inst{11-10} = 0b10;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD vector load/store multiple N-element structure
+class NeonI_LdStMult<bit q, bit l, bits<4> opcode, bits<2> size,
+                    dag outs, dag ins, string asmstr,
+                    list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-23} = 0b0011000;
+  let Inst{22} = l;
+  let Inst{21-16} = 0b000000;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = size;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD vector load/store multiple N-element structure (post-index)
+class NeonI_LdStMult_Post<bit q, bit l, bits<4> opcode, bits<2> size,
+                         dag outs, dag ins, string asmstr,
+                         list<dag> patterns, InstrItinClass itin>
+  : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-23} = 0b0011001;
+  let Inst{22} = l;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = size;
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD vector load Single N-element structure to all lanes
+class NeonI_LdOne_Dup<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
+                      dag ins, string asmstr, list<dag> patterns,
+                      InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-23} = 0b0011010;
+  let Inst{22} = 0b1;
+  let Inst{21} = r;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-13} = opcode;
+  let Inst{12} = 0b0;
+  let Inst{11-10} = size;
+
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD vector load/store Single N-element structure to/from one lane
+class NeonI_LdStOne_Lane<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
+                         dag ins, string asmstr,
+                         list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+  bits<4> lane;
+  let Inst{31} = 0b0;
+  let Inst{29-23} = 0b0011010;
+  let Inst{22} = l;
+  let Inst{21} = r;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-14} = op2_1;
+  let Inst{13} = op0;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD post-index vector load Single N-element structure to all lanes
+class NeonI_LdOne_Dup_Post<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
+                           dag ins, string asmstr, list<dag> patterns,
+                           InstrItinClass itin>
+  : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-23} = 0b0011011;
+  let Inst{22} = 0b1;
+  let Inst{21} = r;
+  // Inherit Rm in 20-16
+  let Inst{15-13} = opcode;
+  let Inst{12} = 0b0;
+  let Inst{11-10} = size;
+
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD post-index vector load/store Single N-element structure
+// to/from one lane
+class NeonI_LdStOne_Lane_Post<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
+                         dag ins, string asmstr,
+                         list<dag> patterns, InstrItinClass itin>
+  : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+  bits<4> lane;
+  let Inst{31} = 0b0;
+  let Inst{29-23} = 0b0011011;
+  let Inst{22} = l;
+  let Inst{21} = r;
+  // Inherit Rm in 20-16
+  let Inst{15-14} = op2_1;
+  let Inst{13} = op0;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD 3 scalar registers with different type
+
+class NeonI_Scalar3Diff<bit u, bits<2> size, bits<4> opcode,
+                          dag outs, dag ins, string asmstr,
+                          list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31-30} = 0b01;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  // Inherit Rm in 20-16
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b00;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar shift by immediate
+
+class NeonI_ScalarShiftImm<bit u, bits<5> opcode,
+                           dag outs, dag ins, string asmstr,
+                           list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<4> Imm4;
+  bits<3> Imm3;
+  let Inst{31-30} = 0b01;
+  let Inst{29} = u;
+  let Inst{28-23} = 0b111110;
+  let Inst{22-19} = Imm4;
+  let Inst{18-16} = Imm3;
+  let Inst{15-11} = opcode;
+  let Inst{10} = 0b1;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD crypto AES
+class NeonI_Crypto_AES<bits<2> size, bits<5> opcode,
+                       dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31-24} = 0b01001110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10100;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
 
+// Format AdvSIMD crypto SHA
+class NeonI_Crypto_SHA<bits<2> size, bits<5> opcode,
+                       dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31-24} = 0b01011110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10100;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
   // Inherit Rn in 9-5
   // Inherit Rd in 4-0
 }
 
+// Format AdvSIMD crypto 3V SHA
+class NeonI_Crypto_3VSHA<bits<2> size, bits<3> opcode,
+                         dag outs, dag ins, string asmstr,
+                         list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31-24} = 0b01011110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15} = 0b0;
+  let Inst{14-12} = opcode;
+  let Inst{11-10} = 0b00;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar x indexed element
+class NeonI_ScalarXIndexedElem<bit u, bit szhi, bit szlo,
+                               bits<4> opcode, dag outs, dag ins,
+                               string asmstr, list<dag> patterns,
+                               InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = 0b1;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11111;
+  let Inst{23} = szhi;
+  let Inst{22} = szlo;
+  // l in Inst{21}
+  // m in Instr{20}
+  // Inherit Rm in 19-16
+  let Inst{15-12} = opcode;
+  // h in Inst{11}
+  let Inst{10} = 0b0;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+// Format AdvSIMD scalar copy - insert from element to scalar
+class NeonI_ScalarCopy<dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : NeonI_copy<0b1, 0b0, 0b0000, outs, ins, asmstr, patterns, itin> {
+  let Inst{28} = 0b1;
+}
 }
 
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index d8f45eb..180110a 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -29,7 +29,7 @@
 
 #include <algorithm>
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "AArch64GenInstrInfo.inc"
 
 using namespace llvm;
@@ -68,43 +68,71 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg)
       .addImm(A64SysReg::NZCV);
   } else if (AArch64::GPR64RegClass.contains(DestReg)) {
-    assert(AArch64::GPR64RegClass.contains(SrcReg));
-    Opc = AArch64::ORRxxx_lsl;
-    ZeroReg = AArch64::XZR;
+    if(AArch64::GPR64RegClass.contains(SrcReg)){
+      Opc = AArch64::ORRxxx_lsl;
+      ZeroReg = AArch64::XZR;
+    } else{
+      assert(AArch64::FPR64RegClass.contains(SrcReg));
+      BuildMI(MBB, I, DL, get(AArch64::FMOVxd), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
   } else if (AArch64::GPR32RegClass.contains(DestReg)) {
-    assert(AArch64::GPR32RegClass.contains(SrcReg));
-    Opc = AArch64::ORRwww_lsl;
-    ZeroReg = AArch64::WZR;
+    if(AArch64::GPR32RegClass.contains(SrcReg)){
+      Opc = AArch64::ORRwww_lsl;
+      ZeroReg = AArch64::WZR;
+    } else{
+      assert(AArch64::FPR32RegClass.contains(SrcReg));
+      BuildMI(MBB, I, DL, get(AArch64::FMOVws), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
   } else if (AArch64::FPR32RegClass.contains(DestReg)) {
-    assert(AArch64::FPR32RegClass.contains(SrcReg));
-    BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg)
-      .addReg(SrcReg);
-    return;
+    if(AArch64::FPR32RegClass.contains(SrcReg)){
+      BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
+    else {
+      assert(AArch64::GPR32RegClass.contains(SrcReg));
+      BuildMI(MBB, I, DL, get(AArch64::FMOVsw), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
   } else if (AArch64::FPR64RegClass.contains(DestReg)) {
-    assert(AArch64::FPR64RegClass.contains(SrcReg));
-    BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg)
-      .addReg(SrcReg);
-    return;
+    if(AArch64::FPR64RegClass.contains(SrcReg)){
+      BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
+    else {
+      assert(AArch64::GPR64RegClass.contains(SrcReg));
+      BuildMI(MBB, I, DL, get(AArch64::FMOVdx), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
   } else if (AArch64::FPR128RegClass.contains(DestReg)) {
     assert(AArch64::FPR128RegClass.contains(SrcReg));
 
-    // FIXME: there's no good way to do this, at least without NEON:
-    //   + There's no single move instruction for q-registers
-    //   + We can't create a spill slot and use normal STR/LDR because stack
-    //     allocation has already happened
-    //   + We can't go via X-registers with FMOV because register allocation has
-    //     already happened.
-    // This may not be efficient, but at least it works.
-    BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP)
-      .addReg(SrcReg)
-      .addReg(AArch64::XSP)
-      .addImm(0x1ff & -16);
-
-    BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg)
-      .addReg(AArch64::XSP, RegState::Define)
-      .addReg(AArch64::XSP)
-      .addImm(16);
-    return;
+    // If NEON is enable, we use ORR to implement this copy.
+    // If NEON isn't available, emit STR and LDR to handle this.
+    if(getSubTarget().hasNEON()) {
+      BuildMI(MBB, I, DL, get(AArch64::ORRvvv_16B), DestReg)
+        .addReg(SrcReg)
+        .addReg(SrcReg);
+      return;
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP)
+        .addReg(SrcReg)
+        .addReg(AArch64::XSP)
+        .addImm(0x1ff & -16);
+
+      BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg)
+        .addReg(AArch64::XSP, RegState::Define)
+        .addReg(AArch64::XSP)
+        .addImm(16);
+      return;
+    }
   } else {
     llvm_unreachable("Unknown register class in copyPhysReg");
   }
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 07289b0..23d81fc 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -14,6 +14,8 @@
 //===----------------------------------------------------------------------===//
 // ARM Instruction Predicate Definitions.
 //
+def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
+                               AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
 def HasNEON          : Predicate<"Subtarget->hasNEON()">,
                                  AssemblerPredicate<"FeatureNEON", "neon">;
 def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
@@ -125,6 +127,8 @@ def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>;
 
 def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>;
 
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+
 //===----------------------------------------------------------------------===//
 // Call sequence pseudo-instructions
 //===----------------------------------------------------------------------===//
@@ -1274,7 +1278,7 @@ def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)),
 
 // UBFX makes sense as an implementation of a 64-bit zero-extension too. Could
 // use either 64-bit or 32-bit variant, but 32-bit might be more efficient.
-def : Pat<(zext i32:$Rn), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
+def : Pat<(i64 (zext i32:$Rn)), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
                                          sub_32)>;
 
 //===-------------------------------
@@ -1978,6 +1982,13 @@ def fpz64 : Operand<f64>,
   let DecoderMethod = "DecodeFPZeroOperand";
 }
 
+def fpz64movi : Operand<i64>,
+            ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
+  let ParserMatchClass = fpzero_asmoperand;
+  let PrintMethod = "printFPZeroOperand";
+  let DecoderMethod = "DecodeFPZeroOperand";
+}
+
 multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, dag pattern> {
   def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0},
                           (outs), ins, "fcmp\t$Rn, $Rm", [pattern],
@@ -2186,23 +2197,23 @@ def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
 
 // Extra patterns for when we're allowed to optimise separate multiplication and
 // addition.
-let Predicates = [UseFusedMAC] in {
-def : Pat<(fadd FPR32:$Ra, (fmul FPR32:$Rn, FPR32:$Rm)),
+let Predicates = [HasFPARMv8, UseFusedMAC] in {
+def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
           (FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(fsub FPR32:$Ra, (fmul FPR32:$Rn, FPR32:$Rm)),
+def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
           (FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(fsub (fmul FPR32:$Rn, FPR32:$Rm), FPR32:$Ra),
+def : Pat<(f32 (fsub (f32 (fmul FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)),
           (FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(fsub (fneg FPR32:$Ra), (fmul FPR32:$Rn, FPR32:$Rm)),
+def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
           (FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
 
-def : Pat<(fadd FPR64:$Ra, (fmul FPR64:$Rn, FPR64:$Rm)),
+def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
           (FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(fsub FPR64:$Ra, (fmul FPR64:$Rn, FPR64:$Rm)),
+def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
           (FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(fsub (fmul FPR64:$Rn, FPR64:$Rm), FPR64:$Ra),
+def : Pat<(f64 (fsub (f64 (fmul FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)),
           (FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(fsub (fneg FPR64:$Ra), (fmul FPR64:$Rn, FPR64:$Rm)),
+def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
           (FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
 }
 
@@ -2342,6 +2353,7 @@ defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">;
 defm FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">;
 defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">;
 
+let Predicates = [HasFPARMv8] in {
 def : Pat<(i32 (fp_to_sint f32:$Rn)), (FCVTZSws $Rn)>;
 def : Pat<(i64 (fp_to_sint f32:$Rn)), (FCVTZSxs $Rn)>;
 def : Pat<(i32 (fp_to_uint f32:$Rn)), (FCVTZUws $Rn)>;
@@ -2350,6 +2362,7 @@ def : Pat<(i32 (fp_to_sint f64:$Rn)), (FCVTZSwd $Rn)>;
 def : Pat<(i64 (fp_to_sint f64:$Rn)), (FCVTZSxd $Rn)>;
 def : Pat<(i32 (fp_to_uint f64:$Rn)), (FCVTZUwd $Rn)>;
 def : Pat<(i64 (fp_to_uint f64:$Rn)), (FCVTZUxd $Rn)>;
+}
 
 multiclass A64I_inttofp<bit o0, string asmop> {
   def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>;
@@ -2361,6 +2374,7 @@ multiclass A64I_inttofp<bit o0, string asmop> {
 defm S : A64I_inttofp<0b0, "scvtf">;
 defm U : A64I_inttofp<0b1, "ucvtf">;
 
+let Predicates = [HasFPARMv8] in {
 def : Pat<(f32 (sint_to_fp i32:$Rn)), (SCVTFsw $Rn)>;
 def : Pat<(f32 (sint_to_fp i64:$Rn)), (SCVTFsx $Rn)>;
 def : Pat<(f64 (sint_to_fp i32:$Rn)), (SCVTFdw $Rn)>;
@@ -2369,16 +2383,19 @@ def : Pat<(f32 (uint_to_fp i32:$Rn)), (UCVTFsw $Rn)>;
 def : Pat<(f32 (uint_to_fp i64:$Rn)), (UCVTFsx $Rn)>;
 def : Pat<(f64 (uint_to_fp i32:$Rn)), (UCVTFdw $Rn)>;
 def : Pat<(f64 (uint_to_fp i64:$Rn)), (UCVTFdx $Rn)>;
+}
 
 def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">;
 def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">;
 def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">;
 def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">;
 
+let Predicates = [HasFPARMv8] in {
 def : Pat<(i32 (bitconvert f32:$Rn)), (FMOVws $Rn)>;
 def : Pat<(f32 (bitconvert i32:$Rn)), (FMOVsw $Rn)>;
 def : Pat<(i64 (bitconvert f64:$Rn)), (FMOVxd $Rn)>;
 def : Pat<(f64 (bitconvert i64:$Rn)), (FMOVdx $Rn)>;
+}
 
 def lane1_asmoperand : AsmOperandClass {
   let Name = "Lane1";
@@ -2401,11 +2418,13 @@ let DecoderMethod =  "DecodeFMOVLaneInstruction" in {
                           "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>;
 }
 
+let Predicates = [HasFPARMv8] in {
 def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]",
                 (FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>;
 
 def : InstAlias<"fmov $Rd.2d[$Lane], $Rn",
                 (FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>;
+}
 
 //===----------------------------------------------------------------------===//
 // Floating-point immediate instructions
@@ -2499,11 +2518,15 @@ let mayLoad = 1 in {
   def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>;
 }
 
+let Predicates = [HasFPARMv8] in {
 def LDRs_lit  : A64I_LDRlitSimple<0b00, 0b1, FPR32>;
 def LDRd_lit  : A64I_LDRlitSimple<0b01, 0b1, FPR64>;
+}
 
 let mayLoad = 1 in {
+  let Predicates = [HasFPARMv8] in {
   def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
+  }
 
 
   def LDRSWx_lit : A64I_LDRlit<0b10, 0b0,
@@ -3097,6 +3120,7 @@ defm LS32
 defm LS64
   : A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>;
 
+let Predicates = [HasFPARMv8] in {
 // STR/LDR to/from a B register
 defm LSFP8
   : A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>;
@@ -3115,6 +3139,7 @@ defm LSFP64
 defm LSFP128
   : A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128,
                          qword_addrparams>;
+}
 
 //===------------------------------
 // 2.3 Signed loads
@@ -3570,10 +3595,13 @@ multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
 
 defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">;
 defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">;
+
+let Predicates = [HasFPARMv8] in {
 defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">;
 defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64,  dword_simm7, "LSFPPair64">;
 defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7,
                                   "LSFPPair128">;
+}
 
 
 def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1,
@@ -5162,4 +5190,4 @@ defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)),
 // Advanced SIMD (NEON) Support
 //
 
-include "AArch64InstrNEON.td"
-\ No newline at end of file
+include "AArch64InstrNEON.td"
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index 98b9e3e..d71749d 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -41,6 +41,37 @@ def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3,
 def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
                  [SDTCisVec<0>,  SDTCisSameAs<1, 2>]>>;
 
+def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                     SDTCisVT<2, i32>]>;
+def Neon_sqrshlImm   : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>;
+def Neon_uqrshlImm   : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>;
+
+def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                               SDTCisSameAs<0, 2>]>;
+def Neon_uzp1    : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>;
+def Neon_uzp2    : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>;
+def Neon_zip1    : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>;
+def Neon_zip2    : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>;
+def Neon_trn1    : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>;
+def Neon_trn2    : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>;
+
+def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def Neon_rev64    : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>;
+def Neon_rev32    : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>;
+def Neon_rev16    : SDNode<"AArch64ISD::NEON_REV16", SDTVSHUF>;
+def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1,
+                       [SDTCisVec<0>]>>;
+def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2,
+                           [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>;
+def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3,
+                           [SDTCisVec<0>,  SDTCisSameAs<0, 1>,
+                           SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>;
+
+def SDT_assertext : SDTypeProfile<1, 1,
+  [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>;
+def assertsext : SDNode<"ISD::AssertSext", SDT_assertext>;
+def assertzext : SDNode<"ISD::AssertZext", SDT_assertext>;
+
 //===----------------------------------------------------------------------===//
 // Multiclasses
 //===----------------------------------------------------------------------===//
@@ -48,8 +79,7 @@ def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
 multiclass NeonI_3VSame_B_sizes<bit u, bits<2> size,  bits<5> opcode,
                                 string asmop, SDPatternOperator opnode8B,
                                 SDPatternOperator opnode16B,
-                                bit Commutable = 0>
-{
+                                bit Commutable = 0> {
   let isCommutable = Commutable in {
     def _8B :  NeonI_3VSame<0b0, u, size, opcode,
                (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
@@ -70,8 +100,7 @@ multiclass NeonI_3VSame_B_sizes<bit u, bits<2> size,  bits<5> opcode,
 
 multiclass NeonI_3VSame_HS_sizes<bit u, bits<5> opcode,
                                   string asmop, SDPatternOperator opnode,
-                                  bit Commutable = 0>
-{
+                                  bit Commutable = 0> {
   let isCommutable = Commutable in {
     def _4H : NeonI_3VSame<0b0, u, 0b01, opcode,
               (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
@@ -105,8 +134,7 @@ multiclass NeonI_3VSame_HS_sizes<bit u, bits<5> opcode,
 multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode,
                                   string asmop, SDPatternOperator opnode,
                                   bit Commutable = 0>
-   : NeonI_3VSame_HS_sizes<u, opcode,  asmop, opnode, Commutable>
-{
+   : NeonI_3VSame_HS_sizes<u, opcode,  asmop, opnode, Commutable> {
   let isCommutable = Commutable in {
     def _8B :  NeonI_3VSame<0b0, u, 0b00, opcode,
                (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
@@ -127,8 +155,7 @@ multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode,
 multiclass NeonI_3VSame_BHSD_sizes<bit u, bits<5> opcode,
                                    string asmop, SDPatternOperator opnode,
                                    bit Commutable = 0>
-   : NeonI_3VSame_BHS_sizes<u, opcode,  asmop, opnode, Commutable>
-{
+   : NeonI_3VSame_BHS_sizes<u, opcode,  asmop, opnode, Commutable> {
   let isCommutable = Commutable in {
     def _2D : NeonI_3VSame<0b1, u, 0b11, opcode,
               (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
@@ -146,8 +173,7 @@ multiclass NeonI_3VSame_SD_sizes<bit u, bit size, bits<5> opcode,
                                  SDPatternOperator opnode4S,
                                  SDPatternOperator opnode2D,
                                  ValueType ResTy2S, ValueType ResTy4S,
-                                 ValueType ResTy2D, bit Commutable = 0>
-{
+                                 ValueType ResTy2D, bit Commutable = 0> {
   let isCommutable = Commutable in {
     def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode,
               (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
@@ -206,8 +232,8 @@ defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul",
 // class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and
 // two operands constraints.
 class NeonI_3VSame_Constraint_impl<string asmop, string asmlane,
-  RegisterClass VPRC, ValueType OpTy, bit q, bit u, bits<2> size, bits<5> opcode,
-  SDPatternOperator opnode>
+  RegisterOperand VPRC, ValueType OpTy, bit q, bit u, bits<2> size,
+  bits<5> opcode, SDPatternOperator opnode>
   : NeonI_3VSame<q, u, size, opcode,
     (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, VPRC:$Rm),
     asmop # "\t$Rd" # asmlane # ", $Rn" # asmlane # ", $Rm" # asmlane,
@@ -312,26 +338,24 @@ defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>;
 // ORR disassembled as MOV if Vn==Vm
 
 // Vector Move - register
-// Alias for ORR if Vn=Vm and it is the preferred syntax
+// Alias for ORR if Vn=Vm.
+// FIXME: This is actually the preferred syntax but TableGen can't deal with
+// custom printing of aliases.
 def : NeonInstAlias<"mov $Rd.8b, $Rn.8b",
-                    (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn)>;
+                    (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn), 0>;
 def : NeonInstAlias<"mov $Rd.16b, $Rn.16b",
-                    (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn)>;
-
-def Neon_immAllOnes: PatLeaf<(Neon_movi (i32 timm), (i32 imm)), [{
-  ConstantSDNode *ImmConstVal = cast<ConstantSDNode>(N->getOperand(0));
-  ConstantSDNode *OpCmodeConstVal = cast<ConstantSDNode>(N->getOperand(1));
-  unsigned EltBits;
-  uint64_t EltVal = A64Imms::decodeNeonModImm(ImmConstVal->getZExtValue(),
-    OpCmodeConstVal->getZExtValue(), EltBits);
-  return (EltBits == 8 && EltVal == 0xff);
-}]>;
+                    (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn), 0>;
 
+// The MOVI instruction takes two immediate operands.  The first is the
+// immediate encoding, while the second is the cmode.  A cmode of 14, or
+// 0b1110, produces a MOVI operation, rather than a MVNI, ORR, or BIC.
+def Neon_AllZero : PatFrag<(ops), (Neon_movi (i32 0), (i32 14))>;
+def Neon_AllOne : PatFrag<(ops), (Neon_movi (i32 255), (i32 14))>;
 
 def Neon_not8B  : PatFrag<(ops node:$in),
-                          (xor node:$in, (bitconvert (v8i8 Neon_immAllOnes)))>;
+                          (xor node:$in, (bitconvert (v8i8 Neon_AllOne)))>;
 def Neon_not16B : PatFrag<(ops node:$in),
-                          (xor node:$in, (bitconvert (v16i8 Neon_immAllOnes)))>;
+                          (xor node:$in, (bitconvert (v16i8 Neon_AllOne)))>;
 
 def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm),
                          (or node:$Rn, (Neon_not8B node:$Rm))>;
@@ -447,6 +471,9 @@ multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode,
   def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src),
                     (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
             (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v1f64 (int_arm_neon_vbsl (v1f64 VPR64:$src),
+                    (v1f64 VPR64:$Rn), (v1f64 VPR64:$Rm))),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
   def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src),
                     (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
             (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
@@ -562,7 +589,7 @@ def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs),
 // NeonI_compare_aliases class: swaps register operands to implement
 // comparison aliases, e.g., CMLE is alias for CMGE with operands reversed.
 class NeonI_compare_aliases<string asmop, string asmlane,
-                            Instruction inst, RegisterClass VPRC>
+                            Instruction inst, RegisterOperand VPRC>
   : NeonInstAlias<asmop # "\t$Rd" # asmlane #", $Rn" # asmlane #
                     ", $Rm" # asmlane,
                   (inst VPRC:$Rd, VPRC:$Rm, VPRC:$Rn), 0b0>;
@@ -1023,6 +1050,20 @@ defm neon_mov_imm_LSLH  : neon_mov_imm_shift_operands<"LSL", "H", "true", [{
   return (HasShift && !ShiftOnesIn);
 }]>;
 
+def neon_uimm1_asmoperand : AsmOperandClass
+{
+  let Name = "UImm1";
+  let PredicateMethod = "isUImm<1>";
+  let RenderMethod = "addImmOperands";
+}
+
+def neon_uimm2_asmoperand : AsmOperandClass
+{
+  let Name = "UImm2";
+  let PredicateMethod = "isUImm<2>";
+  let RenderMethod = "addImmOperands";
+}
+
 def neon_uimm8_asmoperand : AsmOperandClass
 {
   let Name = "UImm8";
@@ -1032,7 +1073,7 @@ def neon_uimm8_asmoperand : AsmOperandClass
 
 def neon_uimm8 : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> {
   let ParserMatchClass = neon_uimm8_asmoperand;
-  let PrintMethod = "printNeonUImm8Operand";
+  let PrintMethod = "printUImmHexOperand";
 }
 
 def neon_uimm64_mask_asmoperand : AsmOperandClass
@@ -1057,7 +1098,7 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
                               (outs VPR64:$Rd),
                               (ins neon_uimm8:$Imm,
                                 neon_mov_imm_LSL_operand:$Simm),
-                              !strconcat(asmop, " $Rd.2s, $Imm$Simm"),
+                              !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
                               [(set (v2i32 VPR64:$Rd),
                                  (v2i32 (opnode (timm:$Imm),
                                    (neon_mov_imm_LSL_operand:$Simm))))],
@@ -1070,7 +1111,7 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
                               (outs VPR128:$Rd),
                               (ins neon_uimm8:$Imm,
                                 neon_mov_imm_LSL_operand:$Simm),
-                              !strconcat(asmop, " $Rd.4s, $Imm$Simm"),
+                              !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
                               [(set (v4i32 VPR128:$Rd),
                                  (v4i32 (opnode (timm:$Imm),
                                    (neon_mov_imm_LSL_operand:$Simm))))],
@@ -1084,7 +1125,7 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
                               (outs VPR64:$Rd),
                               (ins neon_uimm8:$Imm,
                                 neon_mov_imm_LSLH_operand:$Simm),
-                              !strconcat(asmop, " $Rd.4h, $Imm$Simm"),
+                              !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
                               [(set (v4i16 VPR64:$Rd),
                                  (v4i16 (opnode (timm:$Imm),
                                    (neon_mov_imm_LSLH_operand:$Simm))))],
@@ -1097,7 +1138,7 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
                               (outs VPR128:$Rd),
                               (ins neon_uimm8:$Imm,
                                 neon_mov_imm_LSLH_operand:$Simm),
-                              !strconcat(asmop, " $Rd.8h, $Imm$Simm"),
+                              !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
                               [(set (v8i16 VPR128:$Rd),
                                  (v8i16 (opnode (timm:$Imm),
                                    (neon_mov_imm_LSLH_operand:$Simm))))],
@@ -1117,7 +1158,7 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
                  (outs VPR64:$Rd),
                  (ins VPR64:$src, neon_uimm8:$Imm,
                    neon_mov_imm_LSL_operand:$Simm),
-                 !strconcat(asmop, " $Rd.2s, $Imm$Simm"),
+                 !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
                  [(set (v2i32 VPR64:$Rd),
                     (v2i32 (opnode (v2i32 VPR64:$src),
                       (v2i32 (bitconvert (v2i32 (neonopnode timm:$Imm,
@@ -1131,7 +1172,7 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
                  (outs VPR128:$Rd),
                  (ins VPR128:$src, neon_uimm8:$Imm,
                    neon_mov_imm_LSL_operand:$Simm),
-                 !strconcat(asmop, " $Rd.4s, $Imm$Simm"),
+                 !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
                  [(set (v4i32 VPR128:$Rd),
                     (v4i32 (opnode (v4i32 VPR128:$src),
                       (v4i32 (bitconvert (v4i32 (neonopnode timm:$Imm,
@@ -1146,7 +1187,7 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
                  (outs VPR64:$Rd),
                  (ins VPR64:$src, neon_uimm8:$Imm,
                    neon_mov_imm_LSLH_operand:$Simm),
-                 !strconcat(asmop, " $Rd.4h, $Imm$Simm"),
+                 !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
                  [(set (v4i16 VPR64:$Rd),
                     (v4i16 (opnode (v4i16 VPR64:$src),
                        (v4i16 (bitconvert (v4i16 (neonopnode timm:$Imm,
@@ -1160,7 +1201,7 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
                  (outs VPR128:$Rd),
                  (ins VPR128:$src, neon_uimm8:$Imm,
                    neon_mov_imm_LSLH_operand:$Simm),
-                 !strconcat(asmop, " $Rd.8h, $Imm$Simm"),
+                 !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
                  [(set (v8i16 VPR128:$Rd),
                     (v8i16 (opnode (v8i16 VPR128:$src),
                       (v8i16 (bitconvert (v8i16 (neonopnode timm:$Imm,
@@ -1180,7 +1221,7 @@ multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op,
                              (outs VPR64:$Rd),
                              (ins neon_uimm8:$Imm,
                                neon_mov_imm_MSL_operand:$Simm),
-                             !strconcat(asmop, " $Rd.2s, $Imm$Simm"),
+                             !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
                               [(set (v2i32 VPR64:$Rd),
                                  (v2i32 (opnode (timm:$Imm),
                                    (neon_mov_imm_MSL_operand:$Simm))))],
@@ -1193,7 +1234,7 @@ multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op,
                               (outs VPR128:$Rd),
                               (ins neon_uimm8:$Imm,
                                 neon_mov_imm_MSL_operand:$Simm),
-                              !strconcat(asmop, " $Rd.4s, $Imm$Simm"),
+                              !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
                               [(set (v4i32 VPR128:$Rd),
                                  (v4i32 (opnode (timm:$Imm),
                                    (neon_mov_imm_MSL_operand:$Simm))))],
@@ -1315,8 +1356,8 @@ defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>;
 }
 
 class NeonI_mov_imm_lsl_aliases<string asmop, string asmlane,
-                                Instruction inst, RegisterClass VPRC>
-  : NeonInstAlias<!strconcat(asmop, " $Rd," # asmlane # ", $Imm"),
+                                Instruction inst, RegisterOperand VPRC>
+  : NeonInstAlias<!strconcat(asmop, "\t$Rd," # asmlane # ", $Imm"),
                         (inst VPRC:$Rd, neon_uimm8:$Imm,  0), 0b0>;
 
 // Aliases for Vector Move Immediate Shifted
@@ -1382,9 +1423,8 @@ let isReMaterializable = 1 in {
 def MOVIdi : NeonI_1VModImm<0b0, 0b1,
                            (outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm),
                            "movi\t $Rd, $Imm",
-                           [(set (f64 FPR64:$Rd),
-                              (f64 (bitconvert
-                                (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))))],
+                           [(set (v1i64 FPR64:$Rd),
+                             (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))],
                            NoItinerary> {
   let cmode = 0b1110;
 }
@@ -1392,7 +1432,7 @@ def MOVIdi : NeonI_1VModImm<0b0, 0b1,
 
 // Vector Floating Point Move Immediate
 
-class NeonI_FMOV_impl<string asmlane, RegisterClass VPRC, ValueType OpTy,
+class NeonI_FMOV_impl<string asmlane, RegisterOperand VPRC, ValueType OpTy,
                       Operand immOpType, bit q, bit op>
   : NeonI_1VModImm<q, op,
                    (outs VPRC:$Rd), (ins immOpType:$Imm),
@@ -1409,49 +1449,3339 @@ def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>;
 def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>;
 }
 
-// Scalar Arithmetic
+// Vector Shift (Immediate)
+// Immediate in [0, 63]
+def imm0_63 : Operand<i32> {
+  let ParserMatchClass = uimm6_asmoperand;
+}
 
-class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop>
-  : NeonI_Scalar3Same<u, 0b11, opcode,
-                (outs FPR64:$Rd), (ins FPR64:$Rn, FPR64:$Rm),
-                !strconcat(asmop, " $Rd, $Rn, $Rm"),
-                [],
+// Shift Right/Left Immediate - The immh:immb field of these shifts are encoded
+// as follows:
+//
+//    Offset    Encoding
+//     8        immh:immb<6:3> = '0001xxx', <imm> is encoded in immh:immb<2:0>
+//     16       immh:immb<6:4> = '001xxxx', <imm> is encoded in immh:immb<3:0>
+//     32       immh:immb<6:5> = '01xxxxx', <imm> is encoded in immh:immb<4:0>
+//     64       immh:immb<6>   = '1xxxxxx', <imm> is encoded in immh:immb<5:0>
+//
+// The shift right immediate amount, in the range 1 to element bits, is computed
+// as Offset - UInt(immh:immb).  The shift left immediate amount, in the range 0
+// to element bits - 1, is computed as UInt(immh:immb) - Offset.
+
+class shr_imm_asmoperands<string OFFSET> : AsmOperandClass {
+  let Name = "ShrImm" # OFFSET;
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "ShrImm" # OFFSET;
+}
+
+class shr_imm<string OFFSET> : Operand<i32> {
+  let EncoderMethod = "getShiftRightImm" # OFFSET;
+  let DecoderMethod = "DecodeShiftRightImm" # OFFSET;
+  let ParserMatchClass =
+    !cast<AsmOperandClass>("shr_imm" # OFFSET # "_asmoperand");
+}
+
+def shr_imm8_asmoperand : shr_imm_asmoperands<"8">;
+def shr_imm16_asmoperand : shr_imm_asmoperands<"16">;
+def shr_imm32_asmoperand : shr_imm_asmoperands<"32">;
+def shr_imm64_asmoperand : shr_imm_asmoperands<"64">;
+
+def shr_imm8 : shr_imm<"8">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 8;}]>;
+def shr_imm16 : shr_imm<"16">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 16;}]>;
+def shr_imm32 : shr_imm<"32">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 32;}]>;
+def shr_imm64 : shr_imm<"64">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 64;}]>;
+
+class shl_imm_asmoperands<string OFFSET> : AsmOperandClass {
+  let Name = "ShlImm" # OFFSET;
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "ShlImm" # OFFSET;
+}
+
+class shl_imm<string OFFSET> : Operand<i32> {
+  let EncoderMethod = "getShiftLeftImm" # OFFSET;
+  let DecoderMethod = "DecodeShiftLeftImm" # OFFSET;
+  let ParserMatchClass =
+    !cast<AsmOperandClass>("shl_imm" # OFFSET # "_asmoperand");
+}
+
+def shl_imm8_asmoperand : shl_imm_asmoperands<"8">;
+def shl_imm16_asmoperand : shl_imm_asmoperands<"16">;
+def shl_imm32_asmoperand : shl_imm_asmoperands<"32">;
+def shl_imm64_asmoperand : shl_imm_asmoperands<"64">;
+
+def shl_imm8 : shl_imm<"8">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 8;}]>;
+def shl_imm16 : shl_imm<"16">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 16;}]>;
+def shl_imm32 : shl_imm<"32">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 32;}]>;
+def shl_imm64 : shl_imm<"64">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 64;}]>;
+
+class N2VShift<bit q, bit u, bits<5> opcode, string asmop, string T,
+               RegisterOperand VPRC, ValueType Ty, Operand ImmTy, SDNode OpNode>
+  : NeonI_2VShiftImm<q, u, opcode,
+                     (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+                     [(set (Ty VPRC:$Rd),
+                        (Ty (OpNode (Ty VPRC:$Rn),
+                          (Ty (Neon_vdup (i32 ImmTy:$Imm))))))],
+                     NoItinerary>;
+
+multiclass NeonI_N2VShL<bit u, bits<5> opcode, string asmop> {
+  // 64-bit vector types.
+  def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, shl> {
+    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
+  }
+
+  def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, shl> {
+    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
+  }
+
+  def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, shl> {
+    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
+  }
+
+  // 128-bit vector types.
+  def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, shl> {
+    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
+  }
+
+  def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, shl> {
+    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
+  }
+
+  def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, shl> {
+    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
+  }
+
+  def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, shl> {
+    let Inst{22} = 0b1;        // immh:immb = 1xxxxxx
+  }
+}
+
+multiclass NeonI_N2VShR<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
+  def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+                     OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+                     OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+                     OpNode> {
+     let Inst{22-21} = 0b01;
+  }
+
+  def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+                      OpNode> {
+                      let Inst{22-19} = 0b0001;
+                    }
+
+  def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+                     OpNode> {
+                     let Inst{22-20} = 0b001;
+                    }
+
+  def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+                     OpNode> {
+                      let Inst{22-21} = 0b01;
+                    }
+
+  def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+                     OpNode> {
+                      let Inst{22} = 0b1;
+                    }
+}
+
+// Shift left
+defm SHLvvi : NeonI_N2VShL<0b0, 0b01010, "shl">;
+
+// Shift right
+defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>;
+defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>;
+
+def Neon_High16B : PatFrag<(ops node:$in),
+                           (extract_subvector (v16i8 node:$in), (iPTR 8))>;
+def Neon_High8H  : PatFrag<(ops node:$in),
+                           (extract_subvector (v8i16 node:$in), (iPTR 4))>;
+def Neon_High4S  : PatFrag<(ops node:$in),
+                           (extract_subvector (v4i32 node:$in), (iPTR 2))>;
+def Neon_High2D  : PatFrag<(ops node:$in),
+                           (extract_subvector (v2i64 node:$in), (iPTR 1))>;
+def Neon_High4float : PatFrag<(ops node:$in),
+                               (extract_subvector (v4f32 node:$in), (iPTR 2))>;
+def Neon_High2double : PatFrag<(ops node:$in),
+                               (extract_subvector (v2f64 node:$in), (iPTR 1))>;
+
+def Neon_Low16B : PatFrag<(ops node:$in),
+                          (v8i8 (extract_subvector (v16i8 node:$in),
+                                                   (iPTR 0)))>;
+def Neon_Low8H : PatFrag<(ops node:$in),
+                         (v4i16 (extract_subvector (v8i16 node:$in),
+                                                   (iPTR 0)))>;
+def Neon_Low4S : PatFrag<(ops node:$in),
+                         (v2i32 (extract_subvector (v4i32 node:$in),
+                                                   (iPTR 0)))>;
+def Neon_Low2D : PatFrag<(ops node:$in),
+                         (v1i64 (extract_subvector (v2i64 node:$in),
+                                                   (iPTR 0)))>;
+def Neon_Low4float : PatFrag<(ops node:$in),
+                             (v2f32 (extract_subvector (v4f32 node:$in),
+                                                       (iPTR 0)))>;
+def Neon_Low2double : PatFrag<(ops node:$in),
+                              (v1f64 (extract_subvector (v2f64 node:$in),
+                                                        (iPTR 0)))>;
+
+class N2VShiftLong<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+                   string SrcT, ValueType DestTy, ValueType SrcTy,
+                   Operand ImmTy, SDPatternOperator ExtOp>
+  : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
+                     (ins VPR64:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+                     [(set (DestTy VPR128:$Rd),
+                        (DestTy (shl
+                          (DestTy (ExtOp (SrcTy VPR64:$Rn))),
+                            (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
+                     NoItinerary>;
+
+class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+                       string SrcT, ValueType DestTy, ValueType SrcTy,
+                       int StartIndex, Operand ImmTy,
+                       SDPatternOperator ExtOp, PatFrag getTop>
+  : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
+                     (ins VPR128:$Rn, ImmTy:$Imm),
+                     asmop # "2\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+                     [(set (DestTy VPR128:$Rd),
+                        (DestTy (shl
+                          (DestTy (ExtOp
+                            (SrcTy (getTop VPR128:$Rn)))),
+                              (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
+                     NoItinerary>;
+
+multiclass NeonI_N2VShLL<string prefix, bit u, bits<5> opcode, string asmop,
+                         SDNode ExtOp> {
+  // 64-bit vector types.
+  def _8B : N2VShiftLong<0b0, u, opcode, asmop, "8h", "8b", v8i16, v8i8,
+                         shl_imm8, ExtOp> {
+    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
+  }
+
+  def _4H : N2VShiftLong<0b0, u, opcode, asmop, "4s", "4h", v4i32, v4i16,
+                         shl_imm16, ExtOp> {
+    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
+  }
+
+  def _2S : N2VShiftLong<0b0, u, opcode, asmop, "2d", "2s", v2i64, v2i32,
+                         shl_imm32, ExtOp> {
+    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
+  }
+
+  // 128-bit vector types
+  def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", v8i16, v8i8,
+                              8, shl_imm8, ExtOp, Neon_High16B> {
+    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
+  }
+
+  def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", v4i32, v4i16,
+                             4, shl_imm16, ExtOp, Neon_High8H> {
+    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
+  }
+
+  def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", v2i64, v2i32,
+                             2, shl_imm32, ExtOp, Neon_High4S> {
+    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
+  }
+
+  // Use other patterns to match when the immediate is 0.
+  def : Pat<(v8i16 (ExtOp (v8i8 VPR64:$Rn))),
+            (!cast<Instruction>(prefix # "_8B") VPR64:$Rn, 0)>;
+
+  def : Pat<(v4i32 (ExtOp (v4i16 VPR64:$Rn))),
+            (!cast<Instruction>(prefix # "_4H") VPR64:$Rn, 0)>;
+
+  def : Pat<(v2i64 (ExtOp (v2i32 VPR64:$Rn))),
+            (!cast<Instruction>(prefix # "_2S") VPR64:$Rn, 0)>;
+
+  def : Pat<(v8i16 (ExtOp (v8i8 (Neon_High16B VPR128:$Rn)))),
+            (!cast<Instruction>(prefix # "_16B") VPR128:$Rn, 0)>;
+
+  def : Pat<(v4i32 (ExtOp (v4i16 (Neon_High8H VPR128:$Rn)))),
+            (!cast<Instruction>(prefix # "_8H") VPR128:$Rn, 0)>;
+
+  def : Pat<(v2i64 (ExtOp (v2i32 (Neon_High4S VPR128:$Rn)))),
+            (!cast<Instruction>(prefix # "_4S") VPR128:$Rn, 0)>;
+}
+
+// Shift left long
+defm SSHLLvvi : NeonI_N2VShLL<"SSHLLvvi", 0b0, 0b10100, "sshll", sext>;
+defm USHLLvvi : NeonI_N2VShLL<"USHLLvvi", 0b1, 0b10100, "ushll", zext>;
+
+// Rounding/Saturating shift
+class N2VShift_RQ<bit q, bit u, bits<5> opcode, string asmop, string T,
+                  RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+                  SDPatternOperator OpNode>
+  : NeonI_2VShiftImm<q, u, opcode,
+                     (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+                     [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$Rn),
+                        (i32 ImmTy:$Imm))))],
+                     NoItinerary>;
+
+// shift right (vector by immediate)
+multiclass NeonI_N2VShR_RQ<bit u, bits<5> opcode, string asmop,
+                           SDPatternOperator OpNode> {
+  def _8B  : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+                         OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H  : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+                         OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S  : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+                         OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+                         OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+                        OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+                        OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+                        OpNode> {
+    let Inst{22} = 0b1;
+  }
+}
+
+multiclass NeonI_N2VShL_Q<bit u, bits<5> opcode, string asmop,
+                          SDPatternOperator OpNode> {
+  // 64-bit vector types.
+  def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
+                        OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
+                        OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
+                        OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  // 128-bit vector types.
+  def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
+                         OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
+                        OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
+                        OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
+                        OpNode> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// Rounding shift right
+defm SRSHRvvi : NeonI_N2VShR_RQ<0b0, 0b00100, "srshr",
+                                int_aarch64_neon_vsrshr>;
+defm URSHRvvi : NeonI_N2VShR_RQ<0b1, 0b00100, "urshr",
+                                int_aarch64_neon_vurshr>;
+
+// Saturating shift left unsigned
+defm SQSHLUvvi : NeonI_N2VShL_Q<0b1, 0b01100, "sqshlu", int_aarch64_neon_vsqshlu>;
+
+// Saturating shift left
+defm SQSHLvvi : NeonI_N2VShL_Q<0b0, 0b01110, "sqshl", Neon_sqrshlImm>;
+defm UQSHLvvi : NeonI_N2VShL_Q<0b1, 0b01110, "uqshl", Neon_uqrshlImm>;
+
+class N2VShiftAdd<bit q, bit u, bits<5> opcode, string asmop, string T,
+                  RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+                  SDNode OpNode>
+  : NeonI_2VShiftImm<q, u, opcode,
+           (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
+           asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+           [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
+              (Ty (OpNode (Ty VPRC:$Rn),
+                (Ty (Neon_vdup (i32 ImmTy:$Imm))))))))],
+           NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+// Shift Right accumulate
+multiclass NeonI_N2VShRAdd<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
+  def _8B : N2VShiftAdd<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+                        OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShiftAdd<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+                        OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShiftAdd<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+                        OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _16B : N2VShiftAdd<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+                         OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShiftAdd<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+                        OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShiftAdd<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+                        OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShiftAdd<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+                        OpNode> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// Shift right and accumulate
+defm SSRAvvi    : NeonI_N2VShRAdd<0, 0b00010, "ssra", sra>;
+defm USRAvvi    : NeonI_N2VShRAdd<1, 0b00010, "usra", srl>;
+
+// Rounding shift accumulate
+class N2VShiftAdd_R<bit q, bit u, bits<5> opcode, string asmop, string T,
+                    RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+                    SDPatternOperator OpNode>
+  : NeonI_2VShiftImm<q, u, opcode,
+                     (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+                     [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
+                        (Ty (OpNode (Ty VPRC:$Rn), (i32 ImmTy:$Imm))))))],
+                     NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_N2VShRAdd_R<bit u, bits<5> opcode, string asmop,
+                             SDPatternOperator OpNode> {
+  def _8B : N2VShiftAdd_R<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+                          OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShiftAdd_R<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+                          OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShiftAdd_R<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+                          OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _16B : N2VShiftAdd_R<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+                           OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShiftAdd_R<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+                          OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShiftAdd_R<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+                          OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShiftAdd_R<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+                          OpNode> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// Rounding shift right and accumulate
+defm SRSRAvvi : NeonI_N2VShRAdd_R<0, 0b00110, "srsra", int_aarch64_neon_vsrshr>;
+defm URSRAvvi : NeonI_N2VShRAdd_R<1, 0b00110, "ursra", int_aarch64_neon_vurshr>;
+
+// Shift insert by immediate
+class N2VShiftIns<bit q, bit u, bits<5> opcode, string asmop, string T,
+                  RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+                  SDPatternOperator OpNode>
+    : NeonI_2VShiftImm<q, u, opcode,
+           (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
+           asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+           [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$src), (Ty VPRC:$Rn),
+             (i32 ImmTy:$Imm))))],
+           NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+// shift left insert (vector by immediate)
+multiclass NeonI_N2VShLIns<bit u, bits<5> opcode, string asmop> {
+  def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
+                        int_aarch64_neon_vsli> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
+                        int_aarch64_neon_vsli> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
+                        int_aarch64_neon_vsli> {
+    let Inst{22-21} = 0b01;
+  }
+
+    // 128-bit vector types
+  def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
+                         int_aarch64_neon_vsli> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
+                        int_aarch64_neon_vsli> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
+                        int_aarch64_neon_vsli> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
+                        int_aarch64_neon_vsli> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// shift right insert (vector by immediate)
+multiclass NeonI_N2VShRIns<bit u, bits<5> opcode, string asmop> {
+    // 64-bit vector types.
+  def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+                        int_aarch64_neon_vsri> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+                        int_aarch64_neon_vsri> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+                        int_aarch64_neon_vsri> {
+    let Inst{22-21} = 0b01;
+  }
+
+    // 128-bit vector types
+  def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+                         int_aarch64_neon_vsri> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+                        int_aarch64_neon_vsri> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+                        int_aarch64_neon_vsri> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+                        int_aarch64_neon_vsri> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// Shift left and insert
+defm SLIvvi   : NeonI_N2VShLIns<0b1, 0b01010, "sli">;
+
+// Shift right and insert
+defm SRIvvi   : NeonI_N2VShRIns<0b1, 0b01000, "sri">;
+
+class N2VShR_Narrow<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+                    string SrcT, Operand ImmTy>
+  : NeonI_2VShiftImm<q, u, opcode,
+                     (outs VPR64:$Rd), (ins VPR128:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+                     [], NoItinerary>;
+
+class N2VShR_Narrow_Hi<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+                       string SrcT, Operand ImmTy>
+  : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
+                     (ins VPR128:$src, VPR128:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+                     [], NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+// left long shift by immediate
+multiclass NeonI_N2VShR_Narrow<bit u, bits<5> opcode, string asmop> {
+  def _8B : N2VShR_Narrow<0b0, u, opcode, asmop, "8b", "8h", shr_imm8> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShR_Narrow<0b0, u, opcode, asmop, "4h", "4s", shr_imm16> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShR_Narrow<0b0, u, opcode, asmop, "2s", "2d", shr_imm32> {
+    let Inst{22-21} = 0b01;
+  }
+
+  // Shift Narrow High
+  def _16B : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "16b", "8h",
+                              shr_imm8> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "8h", "4s",
+                             shr_imm16> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "4s", "2d",
+                             shr_imm32> {
+    let Inst{22-21} = 0b01;
+  }
+}
+
+// Shift right narrow
+defm SHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10000, "shrn">;
+
+// Shift right narrow (prefix Q is saturating, prefix R is rounding)
+defm QSHRUNvvi :NeonI_N2VShR_Narrow<0b1, 0b10000, "sqshrun">;
+defm RSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10001, "rshrn">;
+defm QRSHRUNvvi : NeonI_N2VShR_Narrow<0b1, 0b10001, "sqrshrun">;
+defm SQSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10010, "sqshrn">;
+defm UQSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10010, "uqshrn">;
+defm SQRSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10011, "sqrshrn">;
+defm UQRSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10011, "uqrshrn">;
+
+def Neon_combine_2D : PatFrag<(ops node:$Rm, node:$Rn),
+                              (v2i64 (concat_vectors (v1i64 node:$Rm),
+                                                     (v1i64 node:$Rn)))>;
+def Neon_combine_8H : PatFrag<(ops node:$Rm, node:$Rn),
+                              (v8i16 (concat_vectors (v4i16 node:$Rm),
+                                                     (v4i16 node:$Rn)))>;
+def Neon_combine_4S : PatFrag<(ops node:$Rm, node:$Rn),
+                              (v4i32 (concat_vectors (v2i32 node:$Rm),
+                                                     (v2i32 node:$Rn)))>;
+def Neon_combine_4f : PatFrag<(ops node:$Rm, node:$Rn),
+                              (v4f32 (concat_vectors (v2f32 node:$Rm),
+                                                     (v2f32 node:$Rn)))>;
+def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn),
+                              (v2f64 (concat_vectors (v1f64 node:$Rm),
+                                                     (v1f64 node:$Rn)))>;
+
+def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v8i16 (srl (v8i16 node:$lhs),
+                               (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v4i32 (srl (v4i32 node:$lhs),
+                               (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v2i64 (srl (v2i64 node:$lhs),
+                               (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v8i16 (sra (v8i16 node:$lhs),
+                               (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v4i32 (sra (v4i32 node:$lhs),
+                               (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v2i64 (sra (v2i64 node:$lhs),
+                               (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
+
+// Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors)
+multiclass Neon_shiftNarrow_patterns<string shr> {
+  def : Pat<(v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H") VPR128:$Rn,
+              (i32 shr_imm8:$Imm)))),
+            (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S") VPR128:$Rn,
+              (i32 shr_imm16:$Imm)))),
+            (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D") VPR128:$Rn,
+              (i32 shr_imm32:$Imm)))),
+            (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>;
+
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
+              (v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H")
+                VPR128:$Rn, (i32 shr_imm8:$Imm))))))),
+            (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
+                         VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
+              (v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S")
+                VPR128:$Rn, (i32 shr_imm16:$Imm))))))),
+            (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                        VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
+              (v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D")
+                VPR128:$Rn, (i32 shr_imm32:$Imm))))))),
+            (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                        VPR128:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_shiftNarrow_QR_patterns<SDPatternOperator op, string prefix> {
+  def : Pat<(v8i8 (op (v8i16 VPR128:$Rn), shr_imm8:$Imm)),
+            (!cast<Instruction>(prefix # "_8B") VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), shr_imm16:$Imm)),
+            (!cast<Instruction>(prefix # "_4H") VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), shr_imm32:$Imm)),
+            (!cast<Instruction>(prefix # "_2S") VPR128:$Rn, imm:$Imm)>;
+
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+                (v1i64 (bitconvert (v8i8
+                    (op (v8i16 VPR128:$Rn), shr_imm8:$Imm))))),
+            (!cast<Instruction>(prefix # "_16B")
+                (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+                (v1i64 (bitconvert (v4i16
+                    (op (v4i32 VPR128:$Rn), shr_imm16:$Imm))))),
+            (!cast<Instruction>(prefix # "_8H")
+                (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+                (v1i64 (bitconvert (v2i32
+                    (op (v2i64 VPR128:$Rn), shr_imm32:$Imm))))),
+            (!cast<Instruction>(prefix # "_4S")
+                  (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                  VPR128:$Rn, imm:$Imm)>;
+}
+
+defm : Neon_shiftNarrow_patterns<"lshr">;
+defm : Neon_shiftNarrow_patterns<"ashr">;
+
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrun, "QSHRUNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vrshrn, "RSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrun, "QRSHRUNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrn, "SQSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqshrn, "UQSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrn, "SQRSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqrshrn, "UQRSHRNvvi">;
+
+// Convert fix-point and float-pointing
+class N2VCvt_Fx<bit q, bit u, bits<5> opcode, string asmop, string T,
+                RegisterOperand VPRC, ValueType DestTy, ValueType SrcTy,
+                Operand ImmTy, SDPatternOperator IntOp>
+  : NeonI_2VShiftImm<q, u, opcode,
+                     (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+                     [(set (DestTy VPRC:$Rd), (DestTy (IntOp (SrcTy VPRC:$Rn),
+                       (i32 ImmTy:$Imm))))],
+                     NoItinerary>;
+
+multiclass NeonI_N2VCvt_Fx2fp<bit u, bits<5> opcode, string asmop,
+                              SDPatternOperator IntOp> {
+  def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2f32, v2i32,
+                      shr_imm32, IntOp> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4f32, v4i32,
+                      shr_imm32, IntOp> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2f64, v2i64,
+                      shr_imm64, IntOp> {
+    let Inst{22} = 0b1;
+  }
+}
+
+multiclass NeonI_N2VCvt_Fp2fx<bit u, bits<5> opcode, string asmop,
+                              SDPatternOperator IntOp> {
+  def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2i32, v2f32,
+                      shr_imm32, IntOp> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4i32, v4f32,
+                      shr_imm32, IntOp> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2i64, v2f64,
+                      shr_imm64, IntOp> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// Convert fixed-point to floating-point
+defm VCVTxs2f : NeonI_N2VCvt_Fx2fp<0, 0b11100, "scvtf",
+                                   int_arm_neon_vcvtfxs2fp>;
+defm VCVTxu2f : NeonI_N2VCvt_Fx2fp<1, 0b11100, "ucvtf",
+                                   int_arm_neon_vcvtfxu2fp>;
+
+// Convert floating-point to fixed-point
+defm VCVTf2xs : NeonI_N2VCvt_Fp2fx<0, 0b11111, "fcvtzs",
+                                   int_arm_neon_vcvtfp2fxs>;
+defm VCVTf2xu : NeonI_N2VCvt_Fp2fx<1, 0b11111, "fcvtzu",
+                                   int_arm_neon_vcvtfp2fxu>;
+
+multiclass Neon_sshll2_0<SDNode ext>
+{
+  def _v8i8  : PatFrag<(ops node:$Rn),
+                       (v8i16 (ext (v8i8 (Neon_High16B node:$Rn))))>;
+  def _v4i16 : PatFrag<(ops node:$Rn),
+                       (v4i32 (ext (v4i16 (Neon_High8H node:$Rn))))>;
+  def _v2i32 : PatFrag<(ops node:$Rn),
+                       (v2i64 (ext (v2i32 (Neon_High4S node:$Rn))))>;
+}
+
+defm NI_sext_high : Neon_sshll2_0<sext>;
+defm NI_zext_high : Neon_sshll2_0<zext>;
+
+
+//===----------------------------------------------------------------------===//
+// Multiclasses for NeonI_Across
+//===----------------------------------------------------------------------===//
+
+// Variant 1
+
+multiclass NeonI_2VAcross_1<bit u, bits<5> opcode,
+                            string asmop, SDPatternOperator opnode>
+{
+    def _1h8b:  NeonI_2VAcross<0b0, u, 0b00, opcode,
+                (outs FPR16:$Rd), (ins VPR64:$Rn),
+                asmop # "\t$Rd, $Rn.8b",
+                [(set (v1i16 FPR16:$Rd),
+                    (v1i16 (opnode (v8i8 VPR64:$Rn))))],
                 NoItinerary>;
 
-multiclass NeonI_Scalar3Same_BHSD_sizes<bit u, bits<5> opcode,
-                                        string asmop, bit Commutable = 0>
+    def _1h16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
+                (outs FPR16:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.16b",
+                [(set (v1i16 FPR16:$Rd),
+                    (v1i16 (opnode (v16i8 VPR128:$Rn))))],
+                NoItinerary>;
+
+    def _1s4h:  NeonI_2VAcross<0b0, u, 0b01, opcode,
+                (outs FPR32:$Rd), (ins VPR64:$Rn),
+                asmop # "\t$Rd, $Rn.4h",
+                [(set (v1i32 FPR32:$Rd),
+                    (v1i32 (opnode (v4i16 VPR64:$Rn))))],
+                NoItinerary>;
+
+    def _1s8h:  NeonI_2VAcross<0b1, u, 0b01, opcode,
+                (outs FPR32:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.8h",
+                [(set (v1i32 FPR32:$Rd),
+                    (v1i32 (opnode (v8i16 VPR128:$Rn))))],
+                NoItinerary>;
+
+    // _1d2s doesn't exist!
+
+    def _1d4s:  NeonI_2VAcross<0b1, u, 0b10, opcode,
+                (outs FPR64:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.4s",
+                [(set (v1i64 FPR64:$Rd),
+                    (v1i64 (opnode (v4i32 VPR128:$Rn))))],
+                NoItinerary>;
+}
+
+defm SADDLV : NeonI_2VAcross_1<0b0, 0b00011, "saddlv", int_aarch64_neon_saddlv>;
+defm UADDLV : NeonI_2VAcross_1<0b1, 0b00011, "uaddlv", int_aarch64_neon_uaddlv>;
+
+// Variant 2
+
+multiclass NeonI_2VAcross_2<bit u, bits<5> opcode,
+                            string asmop, SDPatternOperator opnode>
 {
+    def _1b8b:  NeonI_2VAcross<0b0, u, 0b00, opcode,
+                (outs FPR8:$Rd), (ins VPR64:$Rn),
+                asmop # "\t$Rd, $Rn.8b",
+                [(set (v1i8 FPR8:$Rd),
+                    (v1i8 (opnode (v8i8 VPR64:$Rn))))],
+                NoItinerary>;
+
+    def _1b16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
+                (outs FPR8:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.16b",
+                [(set (v1i8 FPR8:$Rd),
+                    (v1i8 (opnode (v16i8 VPR128:$Rn))))],
+                NoItinerary>;
+
+    def _1h4h:  NeonI_2VAcross<0b0, u, 0b01, opcode,
+                (outs FPR16:$Rd), (ins VPR64:$Rn),
+                asmop # "\t$Rd, $Rn.4h",
+                [(set (v1i16 FPR16:$Rd),
+                    (v1i16 (opnode (v4i16 VPR64:$Rn))))],
+                NoItinerary>;
+
+    def _1h8h:  NeonI_2VAcross<0b1, u, 0b01, opcode,
+                (outs FPR16:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.8h",
+                [(set (v1i16 FPR16:$Rd),
+                    (v1i16 (opnode (v8i16 VPR128:$Rn))))],
+                NoItinerary>;
+
+    // _1s2s doesn't exist!
+
+    def _1s4s:  NeonI_2VAcross<0b1, u, 0b10, opcode,
+                (outs FPR32:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.4s",
+                [(set (v1i32 FPR32:$Rd),
+                    (v1i32 (opnode (v4i32 VPR128:$Rn))))],
+                NoItinerary>;
+}
+
+defm SMAXV : NeonI_2VAcross_2<0b0, 0b01010, "smaxv", int_aarch64_neon_smaxv>;
+defm UMAXV : NeonI_2VAcross_2<0b1, 0b01010, "umaxv", int_aarch64_neon_umaxv>;
+
+defm SMINV : NeonI_2VAcross_2<0b0, 0b11010, "sminv", int_aarch64_neon_sminv>;
+defm UMINV : NeonI_2VAcross_2<0b1, 0b11010, "uminv", int_aarch64_neon_uminv>;
+
+defm ADDV : NeonI_2VAcross_2<0b0, 0b11011, "addv", int_aarch64_neon_vaddv>;
+
+// Variant 3
+
+multiclass NeonI_2VAcross_3<bit u, bits<5> opcode, bits<2> size,
+                            string asmop, SDPatternOperator opnode> {
+    def _1s4s:  NeonI_2VAcross<0b1, u, size, opcode,
+                (outs FPR32:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.4s",
+                [(set (v1f32 FPR32:$Rd),
+                    (v1f32 (opnode (v4f32 VPR128:$Rn))))],
+                NoItinerary>;
+}
+
+defm FMAXNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b00, "fmaxnmv",
+                                int_aarch64_neon_vmaxnmv>;
+defm FMINNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b10, "fminnmv",
+                                int_aarch64_neon_vminnmv>;
+
+defm FMAXV : NeonI_2VAcross_3<0b1, 0b01111, 0b00, "fmaxv",
+                              int_aarch64_neon_vmaxv>;
+defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv",
+                              int_aarch64_neon_vminv>;
+
+// The followings are for instruction class (Perm)
+
+class NeonI_Permute<bit q, bits<2> size, bits<3> opcode,
+                    string asmop, RegisterOperand OpVPR, string OpS,
+                    SDPatternOperator opnode, ValueType Ty>
+  : NeonI_Perm<q, size, opcode,
+               (outs OpVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+               asmop # "\t$Rd." # OpS # ", $Rn." # OpS # ", $Rm." # OpS,
+               [(set (Ty OpVPR:$Rd),
+                  (Ty (opnode (Ty OpVPR:$Rn), (Ty OpVPR:$Rm))))],
+               NoItinerary>;
+
+multiclass NeonI_Perm_pat<bits<3> opcode, string asmop,
+                          SDPatternOperator opnode> {
+  def _8b  : NeonI_Permute<0b0, 0b00, opcode, asmop,
+                           VPR64, "8b", opnode, v8i8>;
+  def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop,
+                           VPR128, "16b",opnode, v16i8>;
+  def _4h  : NeonI_Permute<0b0, 0b01, opcode, asmop,
+                           VPR64, "4h", opnode, v4i16>;
+  def _8h  : NeonI_Permute<0b1, 0b01, opcode, asmop,
+                           VPR128, "8h", opnode, v8i16>;
+  def _2s  : NeonI_Permute<0b0, 0b10, opcode, asmop,
+                           VPR64, "2s", opnode, v2i32>;
+  def _4s  : NeonI_Permute<0b1, 0b10, opcode, asmop,
+                           VPR128, "4s", opnode, v4i32>;
+  def _2d  : NeonI_Permute<0b1, 0b11, opcode, asmop,
+                           VPR128, "2d", opnode, v2i64>;
+}
+
+defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>;
+defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>;
+defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>;
+defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>;
+defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>;
+defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>;
+
+multiclass NeonI_Perm_float_pat<string INS, SDPatternOperator opnode> {
+  def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
+            (!cast<Instruction>(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>;
+
+  def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))),
+            (!cast<Instruction>(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>;
+
+  def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))),
+            (!cast<Instruction>(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>;
+}
+
+defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>;
+defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>;
+defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>;
+defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>;
+defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>;
+defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>;
+
+// The followings are for instruction class (3V Diff)
+
+// normal long/long2 pattern
+class NeonI_3VDL<bit q, bit u, bits<2> size, bits<4> opcode,
+                 string asmop, string ResS, string OpS,
+                 SDPatternOperator opnode, SDPatternOperator ext,
+                 RegisterOperand OpVPR,
+                 ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (opnode (ResTy (ext (OpTy OpVPR:$Rn))),
+                                   (ResTy (ext (OpTy OpVPR:$Rm))))))],
+                 NoItinerary>;
+
+multiclass NeonI_3VDL_s<bit u, bits<4> opcode,
+                        string asmop, SDPatternOperator opnode,
+                        bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                           opnode, sext, VPR64, v8i16, v8i8>;
+    def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                           opnode, sext, VPR64, v4i32, v4i16>;
+    def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                           opnode, sext, VPR64, v2i64, v2i32>;
+  }
+}
+
+multiclass NeonI_3VDL2_s<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                            opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>;
+    def _4s8h  : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                            opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>;
+    def _2d4s  : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                            opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>;
+  }
+}
+
+multiclass NeonI_3VDL_u<bit u, bits<4> opcode, string asmop,
+                        SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                           opnode, zext, VPR64, v8i16, v8i8>;
+    def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                           opnode, zext, VPR64, v4i32, v4i16>;
+    def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                           opnode, zext, VPR64, v2i64, v2i32>;
+  }
+}
+
+multiclass NeonI_3VDL2_u<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                            opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>;
+    def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                           opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>;
+    def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                           opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>;
+  }
+}
+
+defm SADDLvvv :  NeonI_3VDL_s<0b0, 0b0000, "saddl", add, 1>;
+defm UADDLvvv :  NeonI_3VDL_u<0b1, 0b0000, "uaddl", add, 1>;
+
+defm SADDL2vvv :  NeonI_3VDL2_s<0b0, 0b0000, "saddl2", add, 1>;
+defm UADDL2vvv :  NeonI_3VDL2_u<0b1, 0b0000, "uaddl2", add, 1>;
+
+defm SSUBLvvv :  NeonI_3VDL_s<0b0, 0b0010, "ssubl", sub, 0>;
+defm USUBLvvv :  NeonI_3VDL_u<0b1, 0b0010, "usubl", sub, 0>;
+
+defm SSUBL2vvv :  NeonI_3VDL2_s<0b0, 0b0010, "ssubl2", sub, 0>;
+defm USUBL2vvv :  NeonI_3VDL2_u<0b1, 0b0010, "usubl2", sub, 0>;
+
+// normal wide/wide2 pattern
+class NeonI_3VDW<bit q, bit u, bits<2> size, bits<4> opcode,
+                 string asmop, string ResS, string OpS,
+                 SDPatternOperator opnode, SDPatternOperator ext,
+                 RegisterOperand OpVPR,
+                 ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins VPR128:$Rn, OpVPR:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # ResS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (opnode (ResTy VPR128:$Rn),
+                                   (ResTy (ext (OpTy OpVPR:$Rm))))))],
+                 NoItinerary>;
+
+multiclass NeonI_3VDW_s<bit u, bits<4> opcode, string asmop,
+                        SDPatternOperator opnode> {
+  def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                         opnode, sext, VPR64, v8i16, v8i8>;
+  def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                         opnode, sext, VPR64, v4i32, v4i16>;
+  def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                         opnode, sext, VPR64, v2i64, v2i32>;
+}
+
+defm SADDWvvv :  NeonI_3VDW_s<0b0, 0b0001, "saddw", add>;
+defm SSUBWvvv :  NeonI_3VDW_s<0b0, 0b0011, "ssubw", sub>;
+
+multiclass NeonI_3VDW2_s<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode> {
+  def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                          opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>;
+  def _4s8h  : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                          opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>;
+  def _2d4s  : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                          opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>;
+}
+
+defm SADDW2vvv :  NeonI_3VDW2_s<0b0, 0b0001, "saddw2", add>;
+defm SSUBW2vvv :  NeonI_3VDW2_s<0b0, 0b0011, "ssubw2", sub>;
+
+multiclass NeonI_3VDW_u<bit u, bits<4> opcode, string asmop,
+                        SDPatternOperator opnode> {
+  def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                         opnode, zext, VPR64, v8i16, v8i8>;
+  def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                         opnode, zext, VPR64, v4i32, v4i16>;
+  def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                         opnode, zext, VPR64, v2i64, v2i32>;
+}
+
+defm UADDWvvv :  NeonI_3VDW_u<0b1, 0b0001, "uaddw", add>;
+defm USUBWvvv :  NeonI_3VDW_u<0b1, 0b0011, "usubw", sub>;
+
+multiclass NeonI_3VDW2_u<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode> {
+  def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                          opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>;
+  def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                         opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>;
+  def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                         opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>;
+}
+
+defm UADDW2vvv :  NeonI_3VDW2_u<0b1, 0b0001, "uaddw2", add>;
+defm USUBW2vvv :  NeonI_3VDW2_u<0b1, 0b0011, "usubw2", sub>;
+
+// Get the high half part of the vector element.
+multiclass NeonI_get_high {
+  def _8h : PatFrag<(ops node:$Rn),
+                    (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn),
+                                             (v8i16 (Neon_vdup (i32 8)))))))>;
+  def _4s : PatFrag<(ops node:$Rn),
+                    (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn),
+                                              (v4i32 (Neon_vdup (i32 16)))))))>;
+  def _2d : PatFrag<(ops node:$Rn),
+                    (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn),
+                                              (v2i64 (Neon_vdup (i32 32)))))))>;
+}
+
+defm NI_get_hi : NeonI_get_high;
+
+// pattern for addhn/subhn with 2 operands
+class NeonI_3VDN_addhn_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
+                           string asmop, string ResS, string OpS,
+                           SDPatternOperator opnode, SDPatternOperator get_hi,
+                           ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR64:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR64:$Rd),
+                    (ResTy (get_hi
+                      (OpTy (opnode (OpTy VPR128:$Rn),
+                                    (OpTy VPR128:$Rm))))))],
+                 NoItinerary>;
+
+multiclass NeonI_3VDN_addhn_2Op<bit u, bits<4> opcode, string asmop,
+                                SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8b8h : NeonI_3VDN_addhn_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h",
+                                     opnode, NI_get_hi_8h, v8i8, v8i16>;
+    def _4h4s : NeonI_3VDN_addhn_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s",
+                                     opnode, NI_get_hi_4s, v4i16, v4i32>;
+    def _2s2d : NeonI_3VDN_addhn_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d",
+                                     opnode, NI_get_hi_2d, v2i32, v2i64>;
+  }
+}
+
+defm ADDHNvvv  : NeonI_3VDN_addhn_2Op<0b0, 0b0100, "addhn", add, 1>;
+defm SUBHNvvv  : NeonI_3VDN_addhn_2Op<0b0, 0b0110, "subhn", sub, 0>;
+
+// pattern for operation with 2 operands
+class NeonI_3VD_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
+                    string asmop, string ResS, string OpS,
+                    SDPatternOperator opnode,
+                    RegisterOperand ResVPR, RegisterOperand OpVPR,
+                    ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs ResVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy ResVPR:$Rd),
+                    (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))],
+                 NoItinerary>;
+
+// normal narrow pattern
+multiclass NeonI_3VDN_2Op<bit u, bits<4> opcode, string asmop,
+                          SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8b8h : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h",
+                              opnode, VPR64, VPR128, v8i8, v8i16>;
+    def _4h4s : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s",
+                              opnode, VPR64, VPR128, v4i16, v4i32>;
+    def _2s2d : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d",
+                              opnode, VPR64, VPR128, v2i32, v2i64>;
+  }
+}
+
+defm RADDHNvvv : NeonI_3VDN_2Op<0b1, 0b0100, "raddhn", int_arm_neon_vraddhn, 1>;
+defm RSUBHNvvv : NeonI_3VDN_2Op<0b1, 0b0110, "rsubhn", int_arm_neon_vrsubhn, 0>;
+
+// pattern for acle intrinsic with 3 operands
+class NeonI_3VDN_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
+                     string asmop, string ResS, string OpS>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [], NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let neverHasSideEffects = 1;
+}
+
+multiclass NeonI_3VDN_3Op_v1<bit u, bits<4> opcode, string asmop> {
+  def _16b8h : NeonI_3VDN_3Op<0b1, u, 0b00, opcode, asmop, "16b", "8h">;
+  def _8h4s : NeonI_3VDN_3Op<0b1, u, 0b01, opcode, asmop, "8h", "4s">;
+  def _4s2d : NeonI_3VDN_3Op<0b1, u, 0b10, opcode, asmop, "4s", "2d">;
+}
+
+defm ADDHN2vvv  : NeonI_3VDN_3Op_v1<0b0, 0b0100, "addhn2">;
+defm SUBHN2vvv  : NeonI_3VDN_3Op_v1<0b0, 0b0110, "subhn2">;
+
+defm RADDHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0100, "raddhn2">;
+defm RSUBHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0110, "rsubhn2">;
+
+// Patterns have to be separate because there's a SUBREG_TO_REG in the output
+// part.
+class NarrowHighHalfPat<Instruction INST, ValueType DstTy, ValueType SrcTy,
+                        SDPatternOperator coreop>
+  : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+                      (v1i64 (bitconvert (DstTy (coreop (SrcTy VPR128:$Rn),
+                                                        (SrcTy VPR128:$Rm)))))),
+        (INST (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+              VPR128:$Rn, VPR128:$Rm)>;
+
+// addhn2 patterns
+def : NarrowHighHalfPat<ADDHN2vvv_16b8h, v8i8,  v8i16,
+          BinOpFrag<(NI_get_hi_8h (add node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<ADDHN2vvv_8h4s,  v4i16, v4i32,
+          BinOpFrag<(NI_get_hi_4s (add node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<ADDHN2vvv_4s2d,  v2i32, v2i64,
+          BinOpFrag<(NI_get_hi_2d (add node:$LHS, node:$RHS))>>;
+
+// subhn2 patterns
+def : NarrowHighHalfPat<SUBHN2vvv_16b8h, v8i8,  v8i16,
+          BinOpFrag<(NI_get_hi_8h (sub node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<SUBHN2vvv_8h4s,  v4i16, v4i32,
+          BinOpFrag<(NI_get_hi_4s (sub node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<SUBHN2vvv_4s2d,  v2i32, v2i64,
+          BinOpFrag<(NI_get_hi_2d (sub node:$LHS, node:$RHS))>>;
+
+// raddhn2 patterns
+def : NarrowHighHalfPat<RADDHN2vvv_16b8h, v8i8,  v8i16, int_arm_neon_vraddhn>;
+def : NarrowHighHalfPat<RADDHN2vvv_8h4s,  v4i16, v4i32, int_arm_neon_vraddhn>;
+def : NarrowHighHalfPat<RADDHN2vvv_4s2d,  v2i32, v2i64, int_arm_neon_vraddhn>;
+
+// rsubhn2 patterns
+def : NarrowHighHalfPat<RSUBHN2vvv_16b8h, v8i8,  v8i16, int_arm_neon_vrsubhn>;
+def : NarrowHighHalfPat<RSUBHN2vvv_8h4s,  v4i16, v4i32, int_arm_neon_vrsubhn>;
+def : NarrowHighHalfPat<RSUBHN2vvv_4s2d,  v2i32, v2i64, int_arm_neon_vrsubhn>;
+
+// pattern that need to extend result
+class NeonI_3VDL_Ext<bit q, bit u, bits<2> size, bits<4> opcode,
+                     string asmop, string ResS, string OpS,
+                     SDPatternOperator opnode,
+                     RegisterOperand OpVPR,
+                     ValueType ResTy, ValueType OpTy, ValueType OpSTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (zext (OpSTy (opnode (OpTy OpVPR:$Rn),
+                                                (OpTy OpVPR:$Rm))))))],
+                 NoItinerary>;
+
+multiclass NeonI_3VDL_zext<bit u, bits<4> opcode, string asmop,
+                           SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VDL_Ext<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                               opnode, VPR64, v8i16, v8i8, v8i8>;
+    def _4s4h : NeonI_3VDL_Ext<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                               opnode, VPR64, v4i32, v4i16, v4i16>;
+    def _2d2s : NeonI_3VDL_Ext<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                               opnode, VPR64, v2i64, v2i32, v2i32>;
+  }
+}
+
+defm SABDLvvv : NeonI_3VDL_zext<0b0, 0b0111, "sabdl", int_arm_neon_vabds, 1>;
+defm UABDLvvv : NeonI_3VDL_zext<0b1, 0b0111, "uabdl", int_arm_neon_vabdu, 1>;
+
+multiclass NeonI_Op_High<SDPatternOperator op> {
+  def _16B : PatFrag<(ops node:$Rn, node:$Rm),
+                     (op (v8i8 (Neon_High16B node:$Rn)),
+                         (v8i8 (Neon_High16B node:$Rm)))>;
+  def _8H  : PatFrag<(ops node:$Rn, node:$Rm),
+                     (op (v4i16 (Neon_High8H node:$Rn)),
+                         (v4i16 (Neon_High8H node:$Rm)))>;
+  def _4S  : PatFrag<(ops node:$Rn, node:$Rm),
+                     (op (v2i32 (Neon_High4S node:$Rn)),
+                         (v2i32 (Neon_High4S node:$Rm)))>;
+}
+
+defm NI_sabdl_hi : NeonI_Op_High<int_arm_neon_vabds>;
+defm NI_uabdl_hi : NeonI_Op_High<int_arm_neon_vabdu>;
+defm NI_smull_hi : NeonI_Op_High<int_arm_neon_vmulls>;
+defm NI_umull_hi : NeonI_Op_High<int_arm_neon_vmullu>;
+defm NI_qdmull_hi : NeonI_Op_High<int_arm_neon_vqdmull>;
+defm NI_pmull_hi : NeonI_Op_High<int_arm_neon_vmullp>;
+
+multiclass NeonI_3VDL_Abd_u<bit u, bits<4> opcode, string asmop, string opnode,
+                            bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b  : NeonI_3VDL_Ext<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                                !cast<PatFrag>(opnode # "_16B"),
+                                VPR128, v8i16, v16i8, v8i8>;
+    def _4s4h  : NeonI_3VDL_Ext<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                !cast<PatFrag>(opnode # "_8H"),
+                                VPR128, v4i32, v8i16, v4i16>;
+    def _2d2s  : NeonI_3VDL_Ext<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                !cast<PatFrag>(opnode # "_4S"),
+                                VPR128, v2i64, v4i32, v2i32>;
+  }
+}
+
+defm SABDL2vvv : NeonI_3VDL_Abd_u<0b0, 0b0111, "sabdl2", "NI_sabdl_hi", 1>;
+defm UABDL2vvv : NeonI_3VDL_Abd_u<0b1, 0b0111, "uabdl2", "NI_uabdl_hi", 1>;
+
+// For pattern that need two operators being chained.
+class NeonI_3VDL_Aba<bit q, bit u, bits<2> size, bits<4> opcode,
+                     string asmop, string ResS, string OpS,
+                     SDPatternOperator opnode, SDPatternOperator subop,
+                     RegisterOperand OpVPR,
+                     ValueType ResTy, ValueType OpTy, ValueType OpSTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (opnode
+                      (ResTy VPR128:$src),
+                      (ResTy (zext (OpSTy (subop (OpTy OpVPR:$Rn),
+                                                 (OpTy OpVPR:$Rm))))))))],
+                 NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_3VDL_Aba_v1<bit u, bits<4> opcode, string asmop,
+                             SDPatternOperator opnode, SDPatternOperator subop>{
+  def _8h8b : NeonI_3VDL_Aba<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                             opnode, subop, VPR64, v8i16, v8i8, v8i8>;
+  def _4s4h : NeonI_3VDL_Aba<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                             opnode, subop, VPR64, v4i32, v4i16, v4i16>;
+  def _2d2s : NeonI_3VDL_Aba<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                             opnode, subop, VPR64, v2i64, v2i32, v2i32>;
+}
+
+defm SABALvvv :  NeonI_3VDL_Aba_v1<0b0, 0b0101, "sabal",
+                                   add, int_arm_neon_vabds>;
+defm UABALvvv :  NeonI_3VDL_Aba_v1<0b1, 0b0101, "uabal",
+                                   add, int_arm_neon_vabdu>;
+
+multiclass NeonI_3VDL2_Aba_v1<bit u, bits<4> opcode, string asmop,
+                              SDPatternOperator opnode, string subop> {
+  def _8h8b : NeonI_3VDL_Aba<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                             opnode, !cast<PatFrag>(subop # "_16B"),
+                             VPR128, v8i16, v16i8, v8i8>;
+  def _4s4h : NeonI_3VDL_Aba<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                             opnode, !cast<PatFrag>(subop # "_8H"),
+                             VPR128, v4i32, v8i16, v4i16>;
+  def _2d2s : NeonI_3VDL_Aba<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                             opnode, !cast<PatFrag>(subop # "_4S"),
+                             VPR128, v2i64, v4i32, v2i32>;
+}
+
+defm SABAL2vvv :  NeonI_3VDL2_Aba_v1<0b0, 0b0101, "sabal2", add,
+                                     "NI_sabdl_hi">;
+defm UABAL2vvv :  NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add,
+                                     "NI_uabdl_hi">;
+
+// Long pattern with 2 operands
+multiclass NeonI_3VDL_2Op<bit u, bits<4> opcode, string asmop,
+                          SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                              opnode, VPR128, VPR64, v8i16, v8i8>;
+    def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                              opnode, VPR128, VPR64, v4i32, v4i16>;
+    def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                              opnode, VPR128, VPR64, v2i64, v2i32>;
+  }
+}
+
+defm SMULLvvv :  NeonI_3VDL_2Op<0b0, 0b1100, "smull", int_arm_neon_vmulls, 1>;
+defm UMULLvvv :  NeonI_3VDL_2Op<0b1, 0b1100, "umull", int_arm_neon_vmullu, 1>;
+
+class NeonI_3VDL2_2Op_mull<bit q, bit u, bits<2> size, bits<4> opcode,
+                           string asmop, string ResS, string OpS,
+                           SDPatternOperator opnode,
+                           ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (opnode (OpTy VPR128:$Rn), (OpTy VPR128:$Rm))))],
+                 NoItinerary>;
+
+multiclass NeonI_3VDL2_2Op_mull_v1<bit u, bits<4> opcode, string asmop,
+                                   string opnode, bit Commutable = 0> {
   let isCommutable = Commutable in {
-    def bbb : NeonI_Scalar3Same<u, 0b00, opcode,
-                                (outs FPR8:$Rd), (ins FPR8:$Rn, FPR8:$Rm),
-                                !strconcat(asmop, " $Rd, $Rn, $Rm"),
+    def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                                      !cast<PatFrag>(opnode # "_16B"),
+                                      v8i16, v16i8>;
+    def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                     !cast<PatFrag>(opnode # "_8H"),
+                                     v4i32, v8i16>;
+    def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                     !cast<PatFrag>(opnode # "_4S"),
+                                     v2i64, v4i32>;
+  }
+}
+
+defm SMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b0, 0b1100, "smull2",
+                                         "NI_smull_hi", 1>;
+defm UMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b1, 0b1100, "umull2",
+                                         "NI_umull_hi", 1>;
+
+// Long pattern with 3 operands
+class NeonI_3VDL_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
+                     string asmop, string ResS, string OpS,
+                     SDPatternOperator opnode,
+                     ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins VPR128:$src, VPR64:$Rn, VPR64:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (opnode
+                      (ResTy VPR128:$src),
+                      (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))))],
+               NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_3VDL_3Op_v1<bit u, bits<4> opcode, string asmop,
+                             SDPatternOperator opnode> {
+  def _8h8b : NeonI_3VDL_3Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                             opnode, v8i16, v8i8>;
+  def _4s4h : NeonI_3VDL_3Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                             opnode, v4i32, v4i16>;
+  def _2d2s : NeonI_3VDL_3Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                             opnode, v2i64, v2i32>;
+}
+
+def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+                         (add node:$Rd,
+                            (int_arm_neon_vmulls node:$Rn, node:$Rm))>;
+
+def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+                         (add node:$Rd,
+                            (int_arm_neon_vmullu node:$Rn, node:$Rm))>;
+
+def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+                         (sub node:$Rd,
+                            (int_arm_neon_vmulls node:$Rn, node:$Rm))>;
+
+def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+                         (sub node:$Rd,
+                            (int_arm_neon_vmullu node:$Rn, node:$Rm))>;
+
+defm SMLALvvv :  NeonI_3VDL_3Op_v1<0b0, 0b1000, "smlal", Neon_smlal>;
+defm UMLALvvv :  NeonI_3VDL_3Op_v1<0b1, 0b1000, "umlal", Neon_umlal>;
+
+defm SMLSLvvv :  NeonI_3VDL_3Op_v1<0b0, 0b1010, "smlsl", Neon_smlsl>;
+defm UMLSLvvv :  NeonI_3VDL_3Op_v1<0b1, 0b1010, "umlsl", Neon_umlsl>;
+
+class NeonI_3VDL2_3Op_mlas<bit q, bit u, bits<2> size, bits<4> opcode,
+                           string asmop, string ResS, string OpS,
+                           SDPatternOperator subop, SDPatternOperator opnode,
+                           RegisterOperand OpVPR,
+                           ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+               (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm),
+               asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+               [(set (ResTy VPR128:$Rd),
+                  (ResTy (subop
+                    (ResTy VPR128:$src),
+                    (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))))],
+               NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_3VDL2_3Op_mlas_v1<bit u, bits<4> opcode, string asmop,
+                                   SDPatternOperator subop, string opnode> {
+  def _8h16b : NeonI_3VDL2_3Op_mlas<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                                    subop, !cast<PatFrag>(opnode # "_16B"),
+                                    VPR128, v8i16, v16i8>;
+  def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                   subop, !cast<PatFrag>(opnode # "_8H"),
+                                   VPR128, v4i32, v8i16>;
+  def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                   subop, !cast<PatFrag>(opnode # "_4S"),
+                                   VPR128, v2i64, v4i32>;
+}
+
+defm SMLAL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1000, "smlal2",
+                                          add, "NI_smull_hi">;
+defm UMLAL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1000, "umlal2",
+                                          add, "NI_umull_hi">;
+
+defm SMLSL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1010, "smlsl2",
+                                          sub, "NI_smull_hi">;
+defm UMLSL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1010, "umlsl2",
+                                          sub, "NI_umull_hi">;
+
+multiclass NeonI_3VDL_qdmlal_3Op_v2<bit u, bits<4> opcode, string asmop,
+                                    SDPatternOperator opnode> {
+  def _4s4h : NeonI_3VDL2_3Op_mlas<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                                   opnode, int_arm_neon_vqdmull,
+                                   VPR64, v4i32, v4i16>;
+  def _2d2s : NeonI_3VDL2_3Op_mlas<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                                   opnode, int_arm_neon_vqdmull,
+                                   VPR64, v2i64, v2i32>;
+}
+
+defm SQDMLALvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1001, "sqdmlal",
+                                           int_arm_neon_vqadds>;
+defm SQDMLSLvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1011, "sqdmlsl",
+                                           int_arm_neon_vqsubs>;
+
+multiclass NeonI_3VDL_v2<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                              opnode, VPR128, VPR64, v4i32, v4i16>;
+    def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                              opnode, VPR128, VPR64, v2i64, v2i32>;
+  }
+}
+
+defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull",
+                                int_arm_neon_vqdmull, 1>;
+
+multiclass NeonI_3VDL2_2Op_mull_v2<bit u, bits<4> opcode, string asmop,
+                                   string opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                     !cast<PatFrag>(opnode # "_8H"),
+                                     v4i32, v8i16>;
+    def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                     !cast<PatFrag>(opnode # "_4S"),
+                                     v2i64, v4i32>;
+  }
+}
+
+defm SQDMULL2vvv : NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2",
+                                           "NI_qdmull_hi", 1>;
+
+multiclass NeonI_3VDL2_3Op_qdmlal_v2<bit u, bits<4> opcode, string asmop,
+                                     SDPatternOperator opnode> {
+  def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                   opnode, NI_qdmull_hi_8H,
+                                   VPR128, v4i32, v8i16>;
+  def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                   opnode, NI_qdmull_hi_4S,
+                                   VPR128, v2i64, v4i32>;
+}
+
+defm SQDMLAL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1001, "sqdmlal2",
+                                             int_arm_neon_vqadds>;
+defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2",
+                                             int_arm_neon_vqsubs>;
+
+multiclass NeonI_3VDL_v3<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                              opnode, VPR128, VPR64, v8i16, v8i8>;
+
+    def _1q1d : NeonI_3VDiff<0b0, u, 0b11, opcode,
+                             (outs VPR128:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+                             asmop # "\t$Rd.1q, $Rn.1d, $Rm.1d",
+                             [], NoItinerary>;
+  }
+}
+
+defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, 1>;
+
+multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop,
+                                   string opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                                      !cast<PatFrag>(opnode # "_16B"),
+                                      v8i16, v16i8>;
+
+    def _1q2d : NeonI_3VDiff<0b1, u, 0b11, opcode,
+                             (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+                             asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
+                             [], NoItinerary>;
+  }
+}
+
+defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi",
+                                         1>;
+
+// End of implementation for instruction class (3V Diff)
+
+// The followings are vector load/store multiple N-element structure
+// (class SIMD lselem).
+
+// ld1:         load multiple 1-element structure to 1/2/3/4 registers.
+// ld2/ld3/ld4: load multiple N-element structure to N registers (N = 2, 3, 4).
+//              The structure consists of a sequence of sets of N values.
+//              The first element of the structure is placed in the first lane
+//              of the first first vector, the second element in the first lane
+//              of the second vector, and so on.
+// E.g. LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into
+// the three 64-bit vectors list {BA, DC, FE}.
+// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three
+// 64-bit vectors list {DA, EB, FC}.
+// Store instructions store multiple structure to N registers like load.
+
+
+class NeonI_LDVList<bit q, bits<4> opcode, bits<2> size,
+                    RegisterOperand VecList, string asmop>
+  : NeonI_LdStMult<q, 1, opcode, size,
+                 (outs VecList:$Rt), (ins GPR64xsp:$Rn),
+                 asmop # "\t$Rt, [$Rn]",
+                 [],
+                 NoItinerary> {
+  let mayLoad = 1;
+  let neverHasSideEffects = 1;
+}
+
+multiclass LDVList_BHSD<bits<4> opcode, string List, string asmop> {
+  def _8B : NeonI_LDVList<0, opcode, 0b00,
+                          !cast<RegisterOperand>(List # "8B_operand"), asmop>;
+
+  def _4H : NeonI_LDVList<0, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "4H_operand"), asmop>;
+
+  def _2S : NeonI_LDVList<0, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "2S_operand"), asmop>;
+
+  def _16B : NeonI_LDVList<1, opcode, 0b00,
+                           !cast<RegisterOperand>(List # "16B_operand"), asmop>;
+
+  def _8H : NeonI_LDVList<1, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "8H_operand"), asmop>;
+
+  def _4S : NeonI_LDVList<1, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "4S_operand"), asmop>;
+
+  def _2D : NeonI_LDVList<1, opcode, 0b11,
+                          !cast<RegisterOperand>(List # "2D_operand"), asmop>;
+}
+
+// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4)
+defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
+def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">;
+
+defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
+
+defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
+
+defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
+
+// Load multiple 1-element structure to N consecutive registers (N = 2,3,4)
+defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">;
+def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">;
+
+defm LD1x3 : LDVList_BHSD<0b0110, "VTriple", "ld1">;
+def LD1x3_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">;
+
+defm LD1x4 : LDVList_BHSD<0b0010, "VQuad", "ld1">;
+def LD1x4_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">;
+
+class NeonI_STVList<bit q, bits<4> opcode, bits<2> size,
+                    RegisterOperand VecList, string asmop>
+  : NeonI_LdStMult<q, 0, opcode, size,
+                 (outs), (ins GPR64xsp:$Rn, VecList:$Rt),
+                 asmop # "\t$Rt, [$Rn]",
+                 [],
+                 NoItinerary> {
+  let mayStore = 1;
+  let neverHasSideEffects = 1;
+}
+
+multiclass STVList_BHSD<bits<4> opcode, string List, string asmop> {
+  def _8B : NeonI_STVList<0, opcode, 0b00,
+                          !cast<RegisterOperand>(List # "8B_operand"), asmop>;
+
+  def _4H : NeonI_STVList<0, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "4H_operand"), asmop>;
+
+  def _2S : NeonI_STVList<0, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "2S_operand"), asmop>;
+
+  def _16B : NeonI_STVList<1, opcode, 0b00,
+                           !cast<RegisterOperand>(List # "16B_operand"), asmop>;
+
+  def _8H : NeonI_STVList<1, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "8H_operand"), asmop>;
+
+  def _4S : NeonI_STVList<1, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "4S_operand"), asmop>;
+
+  def _2D : NeonI_STVList<1, opcode, 0b11,
+                          !cast<RegisterOperand>(List # "2D_operand"), asmop>;
+}
+
+// Store multiple N-element structures from N registers (N = 1,2,3,4)
+defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
+def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">;
+
+defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
+
+defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
+
+defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
+
+// Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
+defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
+def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
+
+defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
+def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
+
+defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
+def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
+
+def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+
+def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
+def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
+
+def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>;
+def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>;
+
+def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
+def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
+
+def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
+def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
+
+def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>;
+def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>;
+
+def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr),
+          (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
+def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr),
+          (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
+
+def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr),
+          (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
+def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr),
+          (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
+
+def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr),
+          (ST1_8H GPR64xsp:$addr, VPR128:$value)>;
+def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr),
+          (ST1_16B GPR64xsp:$addr, VPR128:$value)>;
+
+def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr),
+          (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
+def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr),
+          (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
+
+def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr),
+          (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
+def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr),
+          (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
+
+def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr),
+          (ST1_4H GPR64xsp:$addr, VPR64:$value)>;
+def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr),
+          (ST1_8B GPR64xsp:$addr, VPR64:$value)>;
+
+// End of vector load/store multiple N-element structure(class SIMD lselem)
+
+// The followings are post-index vector load/store multiple N-element
+// structure(class SIMD lselem-post)
+def exact1_asmoperand : AsmOperandClass {
+  let Name = "Exact1";
+  let PredicateMethod = "isExactImm<1>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact1 : Operand<i32>, ImmLeaf<i32, [{return Imm == 1;}]> {
+  let ParserMatchClass = exact1_asmoperand;
+}
+
+def exact2_asmoperand : AsmOperandClass {
+  let Name = "Exact2";
+  let PredicateMethod = "isExactImm<2>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact2 : Operand<i32>, ImmLeaf<i32, [{return Imm == 2;}]> {
+  let ParserMatchClass = exact2_asmoperand;
+}
+
+def exact3_asmoperand : AsmOperandClass {
+  let Name = "Exact3";
+  let PredicateMethod = "isExactImm<3>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact3 : Operand<i32>, ImmLeaf<i32, [{return Imm == 3;}]> {
+  let ParserMatchClass = exact3_asmoperand;
+}
+
+def exact4_asmoperand : AsmOperandClass {
+  let Name = "Exact4";
+  let PredicateMethod = "isExactImm<4>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact4 : Operand<i32>, ImmLeaf<i32, [{return Imm == 4;}]> {
+  let ParserMatchClass = exact4_asmoperand;
+}
+
+def exact6_asmoperand : AsmOperandClass {
+  let Name = "Exact6";
+  let PredicateMethod = "isExactImm<6>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact6 : Operand<i32>, ImmLeaf<i32, [{return Imm == 6;}]> {
+  let ParserMatchClass = exact6_asmoperand;
+}
+
+def exact8_asmoperand : AsmOperandClass {
+  let Name = "Exact8";
+  let PredicateMethod = "isExactImm<8>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact8 : Operand<i32>, ImmLeaf<i32, [{return Imm == 8;}]> {
+  let ParserMatchClass = exact8_asmoperand;
+}
+
+def exact12_asmoperand : AsmOperandClass {
+  let Name = "Exact12";
+  let PredicateMethod = "isExactImm<12>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact12 : Operand<i32>, ImmLeaf<i32, [{return Imm == 12;}]> {
+  let ParserMatchClass = exact12_asmoperand;
+}
+
+def exact16_asmoperand : AsmOperandClass {
+  let Name = "Exact16";
+  let PredicateMethod = "isExactImm<16>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact16 : Operand<i32>, ImmLeaf<i32, [{return Imm == 16;}]> {
+  let ParserMatchClass = exact16_asmoperand;
+}
+
+def exact24_asmoperand : AsmOperandClass {
+  let Name = "Exact24";
+  let PredicateMethod = "isExactImm<24>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact24 : Operand<i32>, ImmLeaf<i32, [{return Imm == 24;}]> {
+  let ParserMatchClass = exact24_asmoperand;
+}
+
+def exact32_asmoperand : AsmOperandClass {
+  let Name = "Exact32";
+  let PredicateMethod = "isExactImm<32>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact32 : Operand<i32>, ImmLeaf<i32, [{return Imm == 32;}]> {
+  let ParserMatchClass = exact32_asmoperand;
+}
+
+def exact48_asmoperand : AsmOperandClass {
+  let Name = "Exact48";
+  let PredicateMethod = "isExactImm<48>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact48 : Operand<i32>, ImmLeaf<i32, [{return Imm == 48;}]> {
+  let ParserMatchClass = exact48_asmoperand;
+}
+
+def exact64_asmoperand : AsmOperandClass {
+  let Name = "Exact64";
+  let PredicateMethod = "isExactImm<64>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact64 : Operand<i32>, ImmLeaf<i32, [{return Imm == 64;}]> {
+  let ParserMatchClass = exact64_asmoperand;
+}
+
+multiclass NeonI_LDWB_VList<bit q, bits<4> opcode, bits<2> size,
+                           RegisterOperand VecList, Operand ImmTy,
+                           string asmop> {
+  let Constraints = "$Rn = $wb", mayLoad = 1, neverHasSideEffects = 1,
+      DecoderMethod = "DecodeVLDSTPostInstruction" in {
+    def _fixed : NeonI_LdStMult_Post<q, 1, opcode, size,
+                     (outs VecList:$Rt, GPR64xsp:$wb),
+                     (ins GPR64xsp:$Rn, ImmTy:$amt),
+                     asmop # "\t$Rt, [$Rn], $amt",
+                     [],
+                     NoItinerary> {
+      let Rm = 0b11111;
+    }
+
+    def _register : NeonI_LdStMult_Post<q, 1, opcode, size,
+                        (outs VecList:$Rt, GPR64xsp:$wb),
+                        (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
+                        asmop # "\t$Rt, [$Rn], $Rm",
+                        [],
+                        NoItinerary>;
+  }
+}
+
+multiclass LDWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy,
+    Operand ImmTy2, string asmop> {
+  defm _8B : NeonI_LDWB_VList<0, opcode, 0b00,
+                              !cast<RegisterOperand>(List # "8B_operand"),
+                              ImmTy, asmop>;
+
+  defm _4H : NeonI_LDWB_VList<0, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "4H_operand"),
+                              ImmTy, asmop>;
+
+  defm _2S : NeonI_LDWB_VList<0, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "2S_operand"),
+                              ImmTy, asmop>;
+
+  defm _16B : NeonI_LDWB_VList<1, opcode, 0b00,
+                               !cast<RegisterOperand>(List # "16B_operand"),
+                               ImmTy2, asmop>;
+
+  defm _8H : NeonI_LDWB_VList<1, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "8H_operand"),
+                              ImmTy2, asmop>;
+
+  defm _4S : NeonI_LDWB_VList<1, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "4S_operand"),
+                              ImmTy2, asmop>;
+
+  defm _2D : NeonI_LDWB_VList<1, opcode, 0b11,
+                              !cast<RegisterOperand>(List # "2D_operand"),
+                              ImmTy2, asmop>;
+}
+
+// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
+defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
+defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
+                                 "ld1">;
+
+defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
+
+defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+                             "ld3">;
+
+defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
+
+// Post-index load multiple 1-element structures from N consecutive registers
+// (N = 2,3,4)
+defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+                               "ld1">;
+defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+                                   uimm_exact16, "ld1">;
+
+defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+                               "ld1">;
+defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                                   uimm_exact24, "ld1">;
+
+defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                                "ld1">;
+defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                                   uimm_exact32, "ld1">;
+
+multiclass NeonI_STWB_VList<bit q, bits<4> opcode, bits<2> size,
+                            RegisterOperand VecList, Operand ImmTy,
+                            string asmop> {
+  let Constraints = "$Rn = $wb", mayStore = 1, neverHasSideEffects = 1,
+      DecoderMethod = "DecodeVLDSTPostInstruction" in {
+    def _fixed : NeonI_LdStMult_Post<q, 0, opcode, size,
+                     (outs GPR64xsp:$wb),
+                     (ins GPR64xsp:$Rn, ImmTy:$amt, VecList:$Rt),
+                     asmop # "\t$Rt, [$Rn], $amt",
+                     [],
+                     NoItinerary> {
+      let Rm = 0b11111;
+    }
+
+    def _register : NeonI_LdStMult_Post<q, 0, opcode, size,
+                      (outs GPR64xsp:$wb),
+                      (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VecList:$Rt),
+                      asmop # "\t$Rt, [$Rn], $Rm",
+                      [],
+                      NoItinerary>;
+  }
+}
+
+multiclass STWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy,
+                           Operand ImmTy2, string asmop> {
+  defm _8B : NeonI_STWB_VList<0, opcode, 0b00,
+                 !cast<RegisterOperand>(List # "8B_operand"), ImmTy, asmop>;
+
+  defm _4H : NeonI_STWB_VList<0, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "4H_operand"),
+                              ImmTy, asmop>;
+
+  defm _2S : NeonI_STWB_VList<0, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "2S_operand"),
+                              ImmTy, asmop>;
+
+  defm _16B : NeonI_STWB_VList<1, opcode, 0b00,
+                               !cast<RegisterOperand>(List # "16B_operand"),
+                               ImmTy2, asmop>;
+
+  defm _8H : NeonI_STWB_VList<1, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "8H_operand"),
+                              ImmTy2, asmop>;
+
+  defm _4S : NeonI_STWB_VList<1, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "4S_operand"),
+                              ImmTy2, asmop>;
+
+  defm _2D : NeonI_STWB_VList<1, opcode, 0b11,
+                              !cast<RegisterOperand>(List # "2D_operand"),
+                              ImmTy2, asmop>;
+}
+
+// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
+defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
+defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
+                                 "st1">;
+
+defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
+
+defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+                             "st3">;
+
+defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
+
+// Post-index load multiple 1-element structures from N consecutive registers
+// (N = 2,3,4)
+defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+                               "st1">;
+defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+                                   uimm_exact16, "st1">;
+
+defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+                               "st1">;
+defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                                   uimm_exact24, "st1">;
+
+defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                               "st1">;
+defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                                   uimm_exact32, "st1">;
+
+// End of post-index vector load/store multiple N-element structure
+// (class SIMD lselem-post)
+
+// The followings are vector load/store single N-element structure
+// (class SIMD lsone).
+def neon_uimm0_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm == 0;}]> {
+  let ParserMatchClass = neon_uimm0_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm1_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm < 2;}]> {
+  let ParserMatchClass = neon_uimm1_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm2_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm < 4;}]> {
+  let ParserMatchClass = neon_uimm2_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm3_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm < 8;}]> {
+  let ParserMatchClass = uimm3_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm4_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm < 16;}]> {
+  let ParserMatchClass = uimm4_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+class NeonI_LDN_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
+                    RegisterOperand VecList, string asmop>
+    : NeonI_LdOne_Dup<q, r, opcode, size,
+                      (outs VecList:$Rt), (ins GPR64xsp:$Rn),
+                      asmop # "\t$Rt, [$Rn]",
+                      [],
+                      NoItinerary> {
+  let mayLoad = 1;
+  let neverHasSideEffects = 1;
+}
+
+multiclass LDN_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop> {
+  def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00,
+                          !cast<RegisterOperand>(List # "8B_operand"), asmop>;
+
+  def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "4H_operand"), asmop>;
+
+  def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "2S_operand"), asmop>;
+
+  def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11,
+                          !cast<RegisterOperand>(List # "1D_operand"), asmop>;
+
+  def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00,
+                           !cast<RegisterOperand>(List # "16B_operand"), asmop>;
+
+  def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "8H_operand"), asmop>;
+
+  def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "4S_operand"), asmop>;
+
+  def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11,
+                          !cast<RegisterOperand>(List # "2D_operand"), asmop>;
+}
+
+// Load single 1-element structure to all lanes of 1 register
+defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">;
+
+// Load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
+defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
+defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
+
+
+class LD1R_pattern <ValueType VTy, ValueType DTy, PatFrag LoadOp,
+                    Instruction INST>
+    : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))),
+          (VTy (INST GPR64xsp:$Rn))>;
+
+// Match all LD1R instructions
+def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>;
+
+def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>;
+
+def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
+
+def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
+
+def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
+def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
+
+def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
+def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
+
+def : LD1R_pattern<v1i64, i64, load, LD1R_1D>;
+def : LD1R_pattern<v1f64, f64, load, LD1R_1D>;
+
+def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
+def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
+
+
+multiclass VectorList_Bare_BHSD<string PREFIX, int Count,
+                                RegisterClass RegList> {
+  defm B : VectorList_operands<PREFIX, "B", Count, RegList>;
+  defm H : VectorList_operands<PREFIX, "H", Count, RegList>;
+  defm S : VectorList_operands<PREFIX, "S", Count, RegList>;
+  defm D : VectorList_operands<PREFIX, "D", Count, RegList>;
+}
+
+// Special vector list operand of 128-bit vectors with bare layout.
+// i.e. only show ".b", ".h", ".s", ".d"
+defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>;
+defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>;
+defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>;
+defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>;
+
+class NeonI_LDN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                     Operand ImmOp, string asmop>
+    : NeonI_LdStOne_Lane<1, r, op2_1, op0,
+                         (outs VList:$Rt),
+                         (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane),
+                         asmop # "\t$Rt[$lane], [$Rn]",
+                         [],
+                         NoItinerary> {
+  let mayLoad = 1;
+  let neverHasSideEffects = 1;
+  let hasExtraDefRegAllocReq = 1;
+  let Constraints = "$src = $Rt";
+}
+
+multiclass LDN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
+  def _B : NeonI_LDN_Lane<r, 0b00, op0,
+                          !cast<RegisterOperand>(List # "B_operand"),
+                          neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _H : NeonI_LDN_Lane<r, 0b01, op0,
+                          !cast<RegisterOperand>(List # "H_operand"),
+                          neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _S : NeonI_LDN_Lane<r, 0b10, op0,
+                          !cast<RegisterOperand>(List # "S_operand"),
+                          neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _D : NeonI_LDN_Lane<r, 0b10, op0,
+                          !cast<RegisterOperand>(List # "D_operand"),
+                          neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+}
+
+// Load single 1-element structure to one lane of 1 register.
+defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">;
+
+// Load single N-element structure to one lane of N consecutive registers
+// (N = 2,3,4)
+defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">;
+defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">;
+defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">;
+
+multiclass LD1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
+                          Operand ImmOp, Operand ImmOp2, PatFrag LoadOp,
+                          Instruction INST> {
+  def : Pat<(VTy (vector_insert (VTy VPR64:$src),
+                     (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))),
+            (VTy (EXTRACT_SUBREG
+                     (INST GPR64xsp:$Rn,
+                           (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                           ImmOp:$lane),
+                     sub_64))>;
+
+  def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src),
+                      (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))),
+            (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>;
+}
+
+// Match all LD1LN instructions
+defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+                      extloadi8, LD1LN_B>;
+
+defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+                      extloadi16, LD1LN_H>;
+
+defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+                      load, LD1LN_S>;
+defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+                      load, LD1LN_S>;
+
+defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+                      load, LD1LN_D>;
+defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+                      load, LD1LN_D>;
+
+class NeonI_STN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                     Operand ImmOp, string asmop>
+    : NeonI_LdStOne_Lane<0, r, op2_1, op0,
+                         (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane),
+                         asmop # "\t$Rt[$lane], [$Rn]",
+                         [],
+                         NoItinerary> {
+  let mayStore = 1;
+  let neverHasSideEffects = 1;
+  let hasExtraDefRegAllocReq = 1;
+}
+
+multiclass STN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
+  def _B : NeonI_STN_Lane<r, 0b00, op0,
+                          !cast<RegisterOperand>(List # "B_operand"),
+                          neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _H : NeonI_STN_Lane<r, 0b01, op0,
+                          !cast<RegisterOperand>(List # "H_operand"),
+                          neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _S : NeonI_STN_Lane<r, 0b10, op0,
+                          !cast<RegisterOperand>(List # "S_operand"),
+                           neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _D : NeonI_STN_Lane<r, 0b10, op0,
+                          !cast<RegisterOperand>(List # "D_operand"),
+                          neon_uimm1_bare, asmop>{
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+}
+
+// Store single 1-element structure from one lane of 1 register.
+defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">;
+
+// Store single N-element structure from one lane of N consecutive registers
+// (N = 2,3,4)
+defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">;
+defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">;
+defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">;
+
+multiclass ST1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
+                          Operand ImmOp, Operand ImmOp2, PatFrag StoreOp,
+                          Instruction INST> {
+  def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)),
+                     GPR64xsp:$Rn),
+            (INST GPR64xsp:$Rn,
+                  (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64),
+                  ImmOp:$lane)>;
+
+  def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)),
+                     GPR64xsp:$Rn),
+            (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>;
+}
+
+// Match all ST1LN instructions
+defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+                      truncstorei8, ST1LN_B>;
+
+defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+                      truncstorei16, ST1LN_H>;
+
+defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+                      store, ST1LN_S>;
+defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+                      store, ST1LN_S>;
+
+defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+                      store, ST1LN_D>;
+defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+                      store, ST1LN_D>;
+
+// End of vector load/store single N-element structure (class SIMD lsone).
+
+
+// The following are post-index load/store single N-element instructions
+// (class SIMD lsone-post)
+
+multiclass NeonI_LDN_WB_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
+                            RegisterOperand VecList, Operand ImmTy,
+                            string asmop> {
+  let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn",
+  DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+    def _fixed : NeonI_LdOne_Dup_Post<q, r, opcode, size,
+                      (outs VecList:$Rt, GPR64xsp:$wb),
+                      (ins GPR64xsp:$Rn, ImmTy:$amt),
+                      asmop # "\t$Rt, [$Rn], $amt",
+                      [],
+                      NoItinerary> {
+                        let Rm = 0b11111;
+                      }
+
+    def _register : NeonI_LdOne_Dup_Post<q, r, opcode, size,
+                      (outs VecList:$Rt, GPR64xsp:$wb),
+                      (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
+                      asmop # "\t$Rt, [$Rn], $Rm",
+                      [],
+                      NoItinerary>;
+  }
+}
+
+multiclass LDWB_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop,
+                         Operand uimm_b, Operand uimm_h,
+                         Operand uimm_s, Operand uimm_d> {
+  defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00,
+                              !cast<RegisterOperand>(List # "8B_operand"),
+                              uimm_b, asmop>;
+
+  defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "4H_operand"),
+                              uimm_h, asmop>;
+
+  defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "2S_operand"),
+                              uimm_s, asmop>;
+
+  defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11,
+                              !cast<RegisterOperand>(List # "1D_operand"),
+                              uimm_d, asmop>;
+
+  defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00,
+                               !cast<RegisterOperand>(List # "16B_operand"),
+                               uimm_b, asmop>;
+
+  defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "8H_operand"),
+                              uimm_h, asmop>;
+
+  defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "4S_operand"),
+                              uimm_s, asmop>;
+
+  defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11,
+                              !cast<RegisterOperand>(List # "2D_operand"),
+                              uimm_d, asmop>;
+}
+
+// Post-index load single 1-element structure to all lanes of 1 register
+defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1,
+                             uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2,
+                             uimm_exact4, uimm_exact8, uimm_exact16>;
+defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3,
+                             uimm_exact6, uimm_exact12, uimm_exact24>;
+defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4,
+                             uimm_exact8, uimm_exact16, uimm_exact32>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
+    Constraints = "$Rn = $wb, $Rt = $src",
+    DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+  class LDN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                                Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
+                                (outs VList:$Rt, GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, ImmTy:$amt,
+                                    VList:$src, ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $amt",
                                 [],
-                                NoItinerary>;
-    def hhh : NeonI_Scalar3Same<u, 0b01, opcode,
-                                (outs FPR16:$Rd), (ins FPR16:$Rn, FPR16:$Rm),
-                                !strconcat(asmop, " $Rd, $Rn, $Rm"),
+                                NoItinerary> {
+    let Rm = 0b11111;
+  }
+
+  class LDN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                                 Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
+                                (outs VList:$Rt, GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, GPR64noxzr:$Rm,
+                                    VList:$src, ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $Rm",
                                 [],
                                 NoItinerary>;
-    def sss : NeonI_Scalar3Same<u, 0b10, opcode,
-                                (outs FPR32:$Rd), (ins FPR32:$Rn, FPR32:$Rm),
-                                !strconcat(asmop, " $Rd, $Rn, $Rm"),
+}
+
+multiclass LD_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
+                           Operand uimm_b, Operand uimm_h,
+                           Operand uimm_s, Operand uimm_d> {
+  def _B_fixed : LDN_WBFx_Lane<r, 0b00, op0,
+                               !cast<RegisterOperand>(List # "B_operand"),
+                               uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _B_register : LDN_WBReg_Lane<r, 0b00, op0,
+                                   !cast<RegisterOperand>(List # "B_operand"),
+                                   uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _H_fixed : LDN_WBFx_Lane<r, 0b01, op0,
+                               !cast<RegisterOperand>(List # "H_operand"),
+                               uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _H_register : LDN_WBReg_Lane<r, 0b01, op0,
+                                   !cast<RegisterOperand>(List # "H_operand"),
+                                   uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _S_fixed : LDN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "S_operand"),
+                               uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _S_register : LDN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "S_operand"),
+                                   uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _D_fixed : LDN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "D_operand"),
+                               uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+
+  def _D_register : LDN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "D_operand"),
+                                   uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+}
+
+// Post-index load single 1-element structure to one lane of 1 register.
+defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1,
+                                uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index load single N-element structure to one lane of N consecutive
+// registers
+// (N = 2,3,4)
+defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
+                                uimm_exact4, uimm_exact8, uimm_exact16>;
+defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
+                                uimm_exact6, uimm_exact12, uimm_exact24>;
+defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
+                                uimm_exact8, uimm_exact16, uimm_exact32>;
+
+let mayStore = 1, neverHasSideEffects = 1,
+    hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb",
+    DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+  class STN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                      Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
+                                (outs GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, ImmTy:$amt,
+                                    VList:$Rt, ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $amt",
+                                [],
+                                NoItinerary> {
+    let Rm = 0b11111;
+  }
+
+  class STN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                       Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
+                                (outs GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt,
+                                    ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $Rm",
                                 [],
                                 NoItinerary>;
-    def ddd : NeonI_Scalar3Same<u, 0b11, opcode,
-                               (outs FPR64:$Rd), (ins FPR64:$Rn, FPR64:$Rm),
-                               !strconcat(asmop, " $Rd, $Rn, $Rm"),
-                               [],
-                               NoItinerary>;
+}
+
+multiclass ST_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
+                           Operand uimm_b, Operand uimm_h,
+                           Operand uimm_s, Operand uimm_d> {
+  def _B_fixed : STN_WBFx_Lane<r, 0b00, op0,
+                               !cast<RegisterOperand>(List # "B_operand"),
+                               uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _B_register : STN_WBReg_Lane<r, 0b00, op0,
+                                   !cast<RegisterOperand>(List # "B_operand"),
+                                   uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _H_fixed : STN_WBFx_Lane<r, 0b01, op0,
+                               !cast<RegisterOperand>(List # "H_operand"),
+                               uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _H_register : STN_WBReg_Lane<r, 0b01, op0,
+                                   !cast<RegisterOperand>(List # "H_operand"),
+                                   uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _S_fixed : STN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "S_operand"),
+                               uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _S_register : STN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "S_operand"),
+                                   uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _D_fixed : STN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "D_operand"),
+                               uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+
+  def _D_register : STN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "D_operand"),
+                                   uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
   }
 }
 
-class Neon_Scalar_D_size_patterns<SDPatternOperator opnode, Instruction INSTD>
-  : Pat<(v1i64 (opnode (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))),
-        (SUBREG_TO_REG (i64 0),
-              (INSTD (EXTRACT_SUBREG VPR64:$Rn, sub_64),
-             (EXTRACT_SUBREG VPR64:$Rm, sub_64)),
-          sub_64)>;
+// Post-index store single 1-element structure from one lane of 1 register.
+defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1,
+                                uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index store single N-element structure from one lane of N consecutive
+// registers (N = 2,3,4)
+defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
+                                uimm_exact4, uimm_exact8, uimm_exact16>;
+defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
+                                uimm_exact6, uimm_exact12, uimm_exact24>;
+defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
+                                uimm_exact8, uimm_exact16, uimm_exact32>;
+
+// End of post-index load/store single N-element instructions
+// (class SIMD lsone-post)
+
+// Neon Scalar instructions implementation
+// Scalar Three Same
+
+class NeonI_Scalar3Same_size<bit u, bits<2> size, bits<5> opcode, string asmop,
+                             RegisterClass FPRC>
+  : NeonI_Scalar3Same<u, size, opcode,
+                      (outs FPRC:$Rd), (ins FPRC:$Rn, FPRC:$Rm),
+                      !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                      [],
+                      NoItinerary>;
+
+class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
+
+multiclass NeonI_Scalar3Same_HS_sizes<bit u, bits<5> opcode, string asmop,
+                                      bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>;
+    def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>;
+  }
+}
+
+multiclass NeonI_Scalar3Same_SD_sizes<bit u, bit size_high, bits<5> opcode,
+                                      string asmop, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def sss : NeonI_Scalar3Same_size<u, {size_high, 0b0}, opcode, asmop, FPR32>;
+    def ddd : NeonI_Scalar3Same_size<u, {size_high, 0b1}, opcode, asmop, FPR64>;
+  }
+}
+
+multiclass NeonI_Scalar3Same_BHSD_sizes<bit u, bits<5> opcode,
+                                        string asmop, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def bbb : NeonI_Scalar3Same_size<u, 0b00, opcode, asmop, FPR8>;
+    def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>;
+    def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>;
+    def ddd : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
+  }
+}
+
+multiclass Neon_Scalar3Same_D_size_patterns<SDPatternOperator opnode,
+                                            Instruction INSTD> {
+  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
+            (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass Neon_Scalar3Same_BHSD_size_patterns<SDPatternOperator opnode,
+                                               Instruction INSTB,
+                                               Instruction INSTH,
+                                               Instruction INSTS,
+                                               Instruction INSTD>
+  : Neon_Scalar3Same_D_size_patterns<opnode, INSTD> {
+  def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
+           (INSTB FPR8:$Rn, FPR8:$Rm)>;
+
+  def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+           (INSTH FPR16:$Rn, FPR16:$Rm)>;
 
+  def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+           (INSTS FPR32:$Rn, FPR32:$Rm)>;
+}
+
+class Neon_Scalar3Same_cmp_D_size_patterns<SDPatternOperator opnode,
+                                           Instruction INSTD>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
+        (INSTD FPR64:$Rn, FPR64:$Rm)>;
+
+multiclass Neon_Scalar3Same_HS_size_patterns<SDPatternOperator opnode,
+                                             Instruction INSTH,
+                                             Instruction INSTS> {
+  def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+            (INSTH FPR16:$Rn, FPR16:$Rm)>;
+  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass Neon_Scalar3Same_SD_size_patterns<SDPatternOperator opnode,
+                                             Instruction INSTS,
+                                             Instruction INSTD> {
+  def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+  def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+            (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass Neon_Scalar3Same_cmp_SD_size_patterns<SDPatternOperator opnode,
+                                                 Instruction INSTS,
+                                                 Instruction INSTD> {
+  def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+  def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+            (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+class Neon_Scalar3Same_cmp_V1_D_size_patterns<CondCode CC,
+                                              Instruction INSTD>
+  : Pat<(v1i64 (Neon_cmp (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), CC)),
+        (INSTD FPR64:$Rn, FPR64:$Rm)>;
+
+// Scalar Three Different
+
+class NeonI_Scalar3Diff_size<bit u, bits<2> size, bits<4> opcode, string asmop,
+                             RegisterClass FPRCD, RegisterClass FPRCS>
+  : NeonI_Scalar3Diff<u, size, opcode,
+                      (outs FPRCD:$Rd), (ins FPRCS:$Rn, FPRCS:$Rm),
+                      !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                      [],
+                      NoItinerary>;
+
+multiclass NeonI_Scalar3Diff_HS_size<bit u, bits<4> opcode, string asmop> {
+  def shh : NeonI_Scalar3Diff_size<u, 0b01, opcode, asmop, FPR32, FPR16>;
+  def dss : NeonI_Scalar3Diff_size<u, 0b10, opcode, asmop, FPR64, FPR32>;
+}
+
+multiclass NeonI_Scalar3Diff_ml_HS_size<bit u, bits<4> opcode, string asmop> {
+  let Constraints = "$Src = $Rd" in {
+    def shh : NeonI_Scalar3Diff<u, 0b01, opcode,
+                       (outs FPR32:$Rd), (ins FPR32:$Src, FPR16:$Rn, FPR16:$Rm),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                       [],
+                       NoItinerary>;
+    def dss : NeonI_Scalar3Diff<u, 0b10, opcode,
+                       (outs FPR64:$Rd), (ins FPR64:$Src, FPR32:$Rn, FPR32:$Rm),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                       [],
+                       NoItinerary>;
+  }
+}
+
+multiclass Neon_Scalar3Diff_HS_size_patterns<SDPatternOperator opnode,
+                                             Instruction INSTH,
+                                             Instruction INSTS> {
+  def : Pat<(v1i32 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+            (INSTH FPR16:$Rn, FPR16:$Rm)>;
+  def : Pat<(v1i64 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass Neon_Scalar3Diff_ml_HS_size_patterns<SDPatternOperator opnode,
+                                             Instruction INSTH,
+                                             Instruction INSTS> {
+  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+            (INSTH FPR32:$Src, FPR16:$Rn, FPR16:$Rm)>;
+  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+            (INSTS FPR64:$Src, FPR32:$Rn, FPR32:$Rm)>;
+}
+
+// Scalar Two Registers Miscellaneous
+
+class NeonI_Scalar2SameMisc_size<bit u, bits<2> size, bits<5> opcode, string asmop,
+                             RegisterClass FPRCD, RegisterClass FPRCS>
+  : NeonI_Scalar2SameMisc<u, size, opcode,
+                          (outs FPRCD:$Rd), (ins FPRCS:$Rn),
+                          !strconcat(asmop, "\t$Rd, $Rn"),
+                          [],
+                          NoItinerary>;
+
+multiclass NeonI_Scalar2SameMisc_SD_size<bit u, bit size_high, bits<5> opcode,
+                                         string asmop> {
+  def ss : NeonI_Scalar2SameMisc_size<u, {size_high, 0b0}, opcode, asmop, FPR32,
+                                      FPR32>;
+  def dd : NeonI_Scalar2SameMisc_size<u, {size_high, 0b1}, opcode, asmop, FPR64,
+                                      FPR64>;
+}
+
+multiclass NeonI_Scalar2SameMisc_D_size<bit u, bits<5> opcode, string asmop> {
+  def dd : NeonI_Scalar2SameMisc_size<u, 0b11, opcode, asmop, FPR64, FPR64>;
+}
+
+multiclass NeonI_Scalar2SameMisc_BHSD_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_Scalar2SameMisc_D_size<u, opcode, asmop> {
+  def bb : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR8>;
+  def hh : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR16>;
+  def ss : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR32>;
+}
+
+class NeonI_Scalar2SameMisc_fcvtxn_D_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR32, FPR64>;
+
+multiclass NeonI_Scalar2SameMisc_narrow_HSD_size<bit u, bits<5> opcode,
+                                                 string asmop> {
+  def bh : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR16>;
+  def hs : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR32>;
+  def sd : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR64>;
+}
+
+class NeonI_Scalar2SameMisc_accum_size<bit u, bits<2> size, bits<5> opcode,
+                                       string asmop, RegisterClass FPRC>
+  : NeonI_Scalar2SameMisc<u, size, opcode,
+                          (outs FPRC:$Rd), (ins FPRC:$Src, FPRC:$Rn),
+                          !strconcat(asmop, "\t$Rd, $Rn"),
+                          [],
+                          NoItinerary>;
+
+multiclass NeonI_Scalar2SameMisc_accum_BHSD_size<bit u, bits<5> opcode,
+                                                 string asmop> {
+
+  let Constraints = "$Src = $Rd" in {
+    def bb : NeonI_Scalar2SameMisc_accum_size<u, 0b00, opcode, asmop, FPR8>;
+    def hh : NeonI_Scalar2SameMisc_accum_size<u, 0b01, opcode, asmop, FPR16>;
+    def ss : NeonI_Scalar2SameMisc_accum_size<u, 0b10, opcode, asmop, FPR32>;
+    def dd : NeonI_Scalar2SameMisc_accum_size<u, 0b11, opcode, asmop, FPR64>;
+  }
+}
+
+class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<SDPatternOperator opnode,
+                                                  Instruction INSTD>
+  : Pat<(v1f32 (opnode (v1f64 FPR64:$Rn))),
+        (INSTD FPR64:$Rn)>;
+
+multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns<SDPatternOperator opnode,
+                                                      Instruction INSTS,
+                                                      Instruction INSTD> {
+  def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn))),
+            (INSTS FPR32:$Rn)>;
+  def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))),
+            (INSTD FPR64:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns<SDPatternOperator Sopnode,
+                                                     SDPatternOperator Dopnode,
+                                                     Instruction INSTS,
+                                                     Instruction INSTD> {
+  def : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn))),
+            (INSTS FPR32:$Rn)>;
+  def : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn))),
+            (INSTD FPR64:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_SD_size_patterns<SDPatternOperator opnode,
+                                                 Instruction INSTS,
+                                                 Instruction INSTD> {
+  def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn))),
+            (INSTS FPR32:$Rn)>;
+  def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))),
+            (INSTD FPR64:$Rn)>;
+}
+
+class NeonI_Scalar2SameMisc_cmpz_D_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_Scalar2SameMisc<u, 0b11, opcode,
+                          (outs FPR64:$Rd), (ins FPR64:$Rn, neon_uimm0:$Imm),
+                          !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                          [],
+                          NoItinerary>;
+
+multiclass NeonI_Scalar2SameMisc_cmpz_SD_size<bit u, bits<5> opcode,
+                                              string asmop> {
+  def ssi : NeonI_Scalar2SameMisc<u, 0b10, opcode,
+                           (outs FPR32:$Rd), (ins FPR32:$Rn, fpz32:$FPImm),
+                           !strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
+                           [],
+                           NoItinerary>;
+  def ddi : NeonI_Scalar2SameMisc<u, 0b11, opcode,
+                           (outs FPR64:$Rd), (ins FPR64:$Rn, fpz32:$FPImm),
+                           !strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
+                           [],
+                           NoItinerary>;
+}
+
+class Neon_Scalar2SameMisc_cmpz_D_size_patterns<SDPatternOperator opnode,
+                                                Instruction INSTD>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
+                       (v1i64 (bitconvert (v8i8 Neon_AllZero))))),
+        (INSTD FPR64:$Rn, 0)>;
+
+class Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<CondCode CC,
+                                                   Instruction INSTD>
+  : Pat<(v1i64 (Neon_cmpz (v1i64 FPR64:$Rn),
+                          (i32 neon_uimm0:$Imm), CC)),
+        (INSTD FPR64:$Rn, neon_uimm0:$Imm)>;
+
+multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns<SDPatternOperator opnode,
+                                                      Instruction INSTS,
+                                                      Instruction INSTD> {
+  def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn),
+                           (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))),
+            (INSTS FPR32:$Rn, fpz32:$FPImm)>;
+  def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn),
+                           (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))),
+            (INSTD FPR64:$Rn, fpz32:$FPImm)>;
+}
+
+multiclass Neon_Scalar2SameMisc_D_size_patterns<SDPatternOperator opnode,
+                                                Instruction INSTD> {
+  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn))),
+            (INSTD FPR64:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_BHSD_size_patterns<SDPatternOperator opnode,
+                                                   Instruction INSTB,
+                                                   Instruction INSTH,
+                                                   Instruction INSTS,
+                                                   Instruction INSTD>
+  : Neon_Scalar2SameMisc_D_size_patterns<opnode, INSTD> {
+  def : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn))),
+            (INSTB FPR8:$Rn)>;
+  def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn))),
+            (INSTH FPR16:$Rn)>;
+  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn))),
+            (INSTS FPR32:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_narrow_HSD_size_patterns<
+                                                       SDPatternOperator opnode,
+                                                       Instruction INSTH,
+                                                       Instruction INSTS,
+                                                       Instruction INSTD> {
+  def : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn))),
+            (INSTH FPR16:$Rn)>;
+  def : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn))),
+            (INSTS FPR32:$Rn)>;
+  def : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn))),
+            (INSTD FPR64:$Rn)>;
+
+}
+
+multiclass Neon_Scalar2SameMisc_accum_BHSD_size_patterns<
+                                                       SDPatternOperator opnode,
+                                                       Instruction INSTB,
+                                                       Instruction INSTH,
+                                                       Instruction INSTS,
+                                                       Instruction INSTD> {
+  def : Pat<(v1i8 (opnode (v1i8 FPR8:$Src), (v1i8 FPR8:$Rn))),
+            (INSTB FPR8:$Src, FPR8:$Rn)>;
+  def : Pat<(v1i16 (opnode (v1i16 FPR16:$Src), (v1i16 FPR16:$Rn))),
+            (INSTH FPR16:$Src, FPR16:$Rn)>;
+  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i32 FPR32:$Rn))),
+            (INSTS FPR32:$Src, FPR32:$Rn)>;
+  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn))),
+            (INSTD FPR64:$Src, FPR64:$Rn)>;
+}
+
+// Scalar Shift By Immediate
+
+class NeonI_ScalarShiftImm_size<bit u, bits<5> opcode, string asmop,
+                                RegisterClass FPRC, Operand ImmTy>
+  : NeonI_ScalarShiftImm<u, opcode,
+                         (outs FPRC:$Rd), (ins FPRC:$Rn, ImmTy:$Imm),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                         [], NoItinerary>;
+
+multiclass NeonI_ScalarShiftRightImm_D_size<bit u, bits<5> opcode,
+                                            string asmop> {
+  def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> {
+    bits<6> Imm;
+    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+    let Inst{21-16} = Imm;
+  }
+}
+
+multiclass NeonI_ScalarShiftRightImm_BHSD_size<bit u, bits<5> opcode,
+                                               string asmop>
+  : NeonI_ScalarShiftRightImm_D_size<u, opcode, asmop> {
+  def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shr_imm8> {
+    bits<3> Imm;
+    let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+    let Inst{18-16} = Imm;
+  }
+  def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shr_imm16> {
+    bits<4> Imm;
+    let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+    let Inst{19-16} = Imm;
+  }
+  def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> {
+    bits<5> Imm;
+    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+    let Inst{20-16} = Imm;
+  }
+}
+
+multiclass NeonI_ScalarShiftLeftImm_D_size<bit u, bits<5> opcode,
+                                            string asmop> {
+  def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shl_imm64> {
+    bits<6> Imm;
+    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+    let Inst{21-16} = Imm;
+  }
+}
+
+multiclass NeonI_ScalarShiftLeftImm_BHSD_size<bit u, bits<5> opcode,
+                                              string asmop>
+  : NeonI_ScalarShiftLeftImm_D_size<u, opcode, asmop> {
+  def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shl_imm8> {
+    bits<3> Imm;
+    let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+    let Inst{18-16} = Imm;
+  }
+  def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shl_imm16> {
+    bits<4> Imm;
+    let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+    let Inst{19-16} = Imm;
+  }
+  def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shl_imm32> {
+    bits<5> Imm;
+    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+    let Inst{20-16} = Imm;
+  }
+}
+
+class NeonI_ScalarShiftRightImm_accum_D_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_ScalarShiftImm<u, opcode,
+                         (outs FPR64:$Rd),
+                         (ins FPR64:$Src, FPR64:$Rn, shr_imm64:$Imm),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                         [], NoItinerary> {
+    bits<6> Imm;
+    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+    let Inst{21-16} = Imm;
+    let Constraints = "$Src = $Rd";
+}
+
+class NeonI_ScalarShiftLeftImm_accum_D_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_ScalarShiftImm<u, opcode,
+                         (outs FPR64:$Rd),
+                         (ins FPR64:$Src, FPR64:$Rn, shl_imm64:$Imm),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                         [], NoItinerary> {
+    bits<6> Imm;
+    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+    let Inst{21-16} = Imm;
+    let Constraints = "$Src = $Rd";
+}
+
+class NeonI_ScalarShiftImm_narrow_size<bit u, bits<5> opcode, string asmop,
+                                       RegisterClass FPRCD, RegisterClass FPRCS,
+                                       Operand ImmTy>
+  : NeonI_ScalarShiftImm<u, opcode,
+                         (outs FPRCD:$Rd), (ins FPRCS:$Rn, ImmTy:$Imm),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                         [], NoItinerary>;
+
+multiclass NeonI_ScalarShiftImm_narrow_HSD_size<bit u, bits<5> opcode,
+                                                string asmop> {
+  def bhi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR8, FPR16,
+                                             shr_imm8> {
+    bits<3> Imm;
+    let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+    let Inst{18-16} = Imm;
+  }
+  def hsi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR16, FPR32,
+                                             shr_imm16> {
+    bits<4> Imm;
+    let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+    let Inst{19-16} = Imm;
+  }
+  def sdi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR32, FPR64,
+                                             shr_imm32> {
+    bits<5> Imm;
+    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+    let Inst{20-16} = Imm;
+  }
+}
+
+multiclass NeonI_ScalarShiftImm_cvt_SD_size<bit u, bits<5> opcode, string asmop> {
+  def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> {
+    bits<5> Imm;
+    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+    let Inst{20-16} = Imm;
+  }
+  def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> {
+    bits<6> Imm;
+    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+    let Inst{21-16} = Imm;
+  }
+}
+
+multiclass Neon_ScalarShiftRImm_D_size_patterns<SDPatternOperator opnode,
+                                               Instruction INSTD> {
+  def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_ScalarShiftLImm_D_size_patterns<SDPatternOperator opnode,
+                                               Instruction INSTD> {
+  def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shl_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+class Neon_ScalarShiftImm_arm_D_size_patterns<SDPatternOperator opnode,
+                                              Instruction INSTD>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
+            (v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))),
+        (INSTD FPR64:$Rn, imm:$Imm)>;
+
+multiclass Neon_ScalarShiftLImm_BHSD_size_patterns<SDPatternOperator opnode,
+                                                   Instruction INSTB,
+                                                   Instruction INSTH,
+                                                   Instruction INSTS,
+                                                   Instruction INSTD>
+  : Neon_ScalarShiftLImm_D_size_patterns<opnode, INSTD> {
+  def bbi : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (i32 shl_imm8:$Imm))),
+                (INSTB FPR8:$Rn, imm:$Imm)>;
+  def hhi : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (i32 shl_imm16:$Imm))),
+                (INSTH FPR16:$Rn, imm:$Imm)>;
+  def ssi : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (i32 shl_imm32:$Imm))),
+                (INSTS FPR32:$Rn, imm:$Imm)>;
+}
+
+class Neon_ScalarShiftLImm_accum_D_size_patterns<SDPatternOperator opnode,
+                                                Instruction INSTD>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn),
+            (i32 shl_imm64:$Imm))),
+        (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>;
+
+class Neon_ScalarShiftRImm_accum_D_size_patterns<SDPatternOperator opnode,
+                                                Instruction INSTD>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn),
+            (i32 shr_imm64:$Imm))),
+        (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>;
+
+multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns<
+                                                       SDPatternOperator opnode,
+                                                       Instruction INSTH,
+                                                       Instruction INSTS,
+                                                       Instruction INSTD> {
+  def bhi : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn), (i32 shr_imm16:$Imm))),
+                (INSTH FPR16:$Rn, imm:$Imm)>;
+  def hsi : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+                (INSTS FPR32:$Rn, imm:$Imm)>;
+  def sdi : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_ScalarShiftImm_scvtf_SD_size_patterns<SDPatternOperator Sopnode,
+                                                      SDPatternOperator Dopnode,
+                                                      Instruction INSTS,
+                                                      Instruction INSTD> {
+  def ssi : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+                (INSTS FPR32:$Rn, imm:$Imm)>;
+  def ddi : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns<SDPatternOperator Sopnode,
+                                                      SDPatternOperator Dopnode,
+                                                      Instruction INSTS,
+                                                      Instruction INSTD> {
+  def ssi : Pat<(v1i32 (Sopnode (v1f32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+                (INSTS FPR32:$Rn, imm:$Imm)>;
+  def ddi : Pat<(v1i64 (Dopnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+// Scalar Signed Shift Right (Immediate)
+defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrds_n, SSHRddi>;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns<sra, SSHRddi>;
+
+// Scalar Unsigned Shift Right (Immediate)
+defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrdu_n, USHRddi>;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns<srl, USHRddi>;
+
+// Scalar Signed Rounding Shift Right (Immediate)
+defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vsrshr, SRSHRddi>;
+
+// Scalar Unigned Rounding Shift Right (Immediate)
+defm URSHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00100, "urshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vurshr, URSHRddi>;
+
+// Scalar Signed Shift Right and Accumulate (Immediate)
+def SSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00010, "ssra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+          <int_aarch64_neon_vsrads_n, SSRA>;
+
+// Scalar Unsigned Shift Right and Accumulate (Immediate)
+def USRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00010, "usra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+          <int_aarch64_neon_vsradu_n, USRA>;
+
+// Scalar Signed Rounding Shift Right and Accumulate (Immediate)
+def SRSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00110, "srsra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+          <int_aarch64_neon_vrsrads_n, SRSRA>;
+
+// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
+def URSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00110, "ursra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+          <int_aarch64_neon_vrsradu_n, URSRA>;
+
+// Scalar Shift Left (Immediate)
+defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">;
+defm : Neon_ScalarShiftLImm_D_size_patterns<int_aarch64_neon_vshld_n, SHLddi>;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns<shl, SHLddi>;
+
+// Signed Saturating Shift Left (Immediate)
+defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshls_n,
+                                               SQSHLbbi, SQSHLhhi,
+                                               SQSHLssi, SQSHLddi>;
+// Pattern to match llvm.arm.* intrinsic.
+defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_sqrshlImm, SQSHLddi>;
+
+// Unsigned Saturating Shift Left (Immediate)
+defm UQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01110, "uqshl">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshlu_n,
+                                               UQSHLbbi, UQSHLhhi,
+                                               UQSHLssi, UQSHLddi>;
+// Pattern to match llvm.arm.* intrinsic.
+defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_uqrshlImm, UQSHLddi>;
+
+// Signed Saturating Shift Left Unsigned (Immediate)
+defm SQSHLU : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01100, "sqshlu">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vsqshlu,
+                                               SQSHLUbbi, SQSHLUhhi,
+                                               SQSHLUssi, SQSHLUddi>;
+
+// Shift Right And Insert (Immediate)
+def SRI : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b01000, "sri">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+          <int_aarch64_neon_vsri, SRI>;
+
+// Shift Left And Insert (Immediate)
+def SLI : NeonI_ScalarShiftLeftImm_accum_D_size<0b1, 0b01010, "sli">;
+def : Neon_ScalarShiftLImm_accum_D_size_patterns
+          <int_aarch64_neon_vsli, SLI>;
+
+// Signed Saturating Shift Right Narrow (Immediate)
+defm SQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10010, "sqshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrn,
+                                                    SQSHRNbhi, SQSHRNhsi,
+                                                    SQSHRNsdi>;
+
+// Unsigned Saturating Shift Right Narrow (Immediate)
+defm UQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10010, "uqshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqshrn,
+                                                    UQSHRNbhi, UQSHRNhsi,
+                                                    UQSHRNsdi>;
+
+// Signed Saturating Rounded Shift Right Narrow (Immediate)
+defm SQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10011, "sqrshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrn,
+                                                    SQRSHRNbhi, SQRSHRNhsi,
+                                                    SQRSHRNsdi>;
+
+// Unsigned Saturating Rounded Shift Right Narrow (Immediate)
+defm UQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10011, "uqrshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqrshrn,
+                                                    UQRSHRNbhi, UQRSHRNhsi,
+                                                    UQRSHRNsdi>;
+
+// Signed Saturating Shift Right Unsigned Narrow (Immediate)
+defm SQSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10000, "sqshrun">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrun,
+                                                    SQSHRUNbhi, SQSHRUNhsi,
+                                                    SQSHRUNsdi>;
+
+// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate)
+defm SQRSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10001, "sqrshrun">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrun,
+                                                    SQRSHRUNbhi, SQRSHRUNhsi,
+                                                    SQRSHRUNsdi>;
+
+// Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
+defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">;
+defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtf32_n_s32,
+                                                  int_aarch64_neon_vcvtf64_n_s64,
+                                                  SCVTF_Nssi, SCVTF_Nddi>;
+
+// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
+defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">;
+defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtf32_n_u32,
+                                                  int_aarch64_neon_vcvtf64_n_u64,
+                                                  UCVTF_Nssi, UCVTF_Nddi>;
+
+// Scalar Floating-point Convert To Signed Fixed-point (Immediate)
+defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">;
+defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvts_n_s32_f32,
+                                                  int_aarch64_neon_vcvtd_n_s64_f64,
+                                                  FCVTZS_Nssi, FCVTZS_Nddi>;
+
+// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
+defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">;
+defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvts_n_u32_f32,
+                                                  int_aarch64_neon_vcvtd_n_u64_f64,
+                                                  FCVTZU_Nssi, FCVTZU_Nddi>;
+
+// Patterns For Convert Instructions Between v1f64 and v1i64
+class Neon_ScalarShiftImm_cvtf_v1f64_pattern<SDPatternOperator opnode,
+                                             Instruction INST>
+    : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+          (INST FPR64:$Rn, imm:$Imm)>;
+
+class Neon_ScalarShiftImm_fcvt_v1f64_pattern<SDPatternOperator opnode,
+                                             Instruction INST>
+    : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+          (INST FPR64:$Rn, imm:$Imm)>;
+
+def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxs2fp,
+                                             SCVTF_Nddi>;
+
+def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxu2fp,
+                                             UCVTF_Nddi>;
+
+def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxs,
+                                             FCVTZS_Nddi>;
+
+def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxu,
+                                             FCVTZU_Nddi>;
 
 // Scalar Integer Add
 let isCommutable = 1 in {
@@ -1461,9 +4791,15 @@ def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">;
 // Scalar Integer Sub
 def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">;
 
-// Pattern for Scalar Integer Add and Sub with D register
-def : Neon_Scalar_D_size_patterns<add, ADDddd>;
-def : Neon_Scalar_D_size_patterns<sub, SUBddd>;
+// Pattern for Scalar Integer Add and Sub with D register only
+defm : Neon_Scalar3Same_D_size_patterns<add, ADDddd>;
+defm : Neon_Scalar3Same_D_size_patterns<sub, SUBddd>;
+
+// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vaddds, ADDddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vadddu, ADDddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubds, SUBddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubdu, SUBddd>;
 
 // Scalar Integer Saturating Add (Signed, Unsigned)
 defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>;
@@ -1473,41 +4809,1212 @@ defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>;
 defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>;
 defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>;
 
-// Patterns for Scalar Integer Saturating Add, Sub with D register only
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqadds, SQADDddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqaddu, UQADDddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqsubs, SQSUBddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqsubu, UQSUBddd>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Saturating Add, Sub  (Signed, Unsigned)
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqadds, SQADDbbb,
+                                           SQADDhhh, SQADDsss, SQADDddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqaddu, UQADDbbb,
+                                           UQADDhhh, UQADDsss, UQADDddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubs, SQSUBbbb,
+                                           SQSUBhhh, SQSUBsss, SQSUBddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubu, UQSUBbbb,
+                                           UQSUBhhh, UQSUBsss, UQSUBddd>;
+
+// Scalar Integer Saturating Doubling Multiply Half High
+defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>;
+
+// Scalar Integer Saturating Rounding Doubling Multiply Half High
+defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Saturating Doubling Multiply Half High and
+// Scalar Integer Saturating Rounding Doubling Multiply Half High
+defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqdmulh, SQDMULHhhh,
+                                                               SQDMULHsss>;
+defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqrdmulh, SQRDMULHhhh,
+                                                                SQRDMULHsss>;
+
+// Scalar Floating-point Multiply Extended
+defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>;
+
+// Scalar Floating-point Reciprocal Step
+defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>;
+
+// Scalar Floating-point Reciprocal Square Root Step
+defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Floating-point Reciprocal Step and
+// Scalar Floating-point Reciprocal Square Root Step
+defm : Neon_Scalar3Same_SD_size_patterns<int_arm_neon_vrecps, FRECPSsss,
+                                                              FRECPSddd>;
+defm : Neon_Scalar3Same_SD_size_patterns<int_arm_neon_vrsqrts, FRSQRTSsss,
+                                                               FRSQRTSddd>;
+
+def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Floating-point Multiply Extended,
+multiclass Neon_Scalar3Same_MULX_SD_size_patterns<SDPatternOperator opnode,
+                                                  Instruction INSTS,
+                                                  Instruction INSTD> {
+  def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+  def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+            (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+defm : Neon_Scalar3Same_MULX_SD_size_patterns<int_aarch64_neon_vmulx,
+                                              FMULXsss,FMULXddd>;
 
 // Scalar Integer Shift Left (Signed, Unsigned)
 def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">;
 def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">;
 
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshifts, SSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshiftu, USHLddd>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshlds, SSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshldu, USHLddd>;
+
 // Scalar Integer Saturating Shift Left (Signed, Unsigned)
 defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>;
 defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>;
 
-// Scalar Integer Rouding Shift Left (Signed, Unsigned)
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar  Integer Saturating Shift Letf (Signed, Unsigned)
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshls, SQSHLbbb,
+                                           SQSHLhhh, SQSHLsss, SQSHLddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshlu, UQSHLbbb,
+                                           UQSHLhhh, UQSHLsss, UQSHLddd>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar  Integer Saturating Shift Letf (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshifts, SQSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshiftu, UQSHLddd>;
+
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
 def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">;
 def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">;
 
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshlds, SRSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshldu, URSHLddd>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>;
+
 // Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
 defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>;
 defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>;
 
-// Patterns for Scalar Integer Shift Lef, Saturating Shift Left,
-// Rounding Shift Left, Rounding Saturating Shift Left with D register only
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vshifts, SSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vshiftu, USHLddd>;
-def : Neon_Scalar_D_size_patterns<shl, SSHLddd>;
-def : Neon_Scalar_D_size_patterns<shl, USHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqshifts, SQSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqshiftu, UQSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>;
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshls, SQRSHLbbb,
+                                           SQRSHLhhh, SQRSHLsss, SQRSHLddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshlu, UQRSHLbbb,
+                                           UQRSHLhhh, UQRSHLsss, UQRSHLddd>;
 
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>;
+
+// Signed Saturating Doubling Multiply-Add Long
+defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">;
+defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlal,
+                                            SQDMLALshh, SQDMLALdss>;
+
+// Signed Saturating Doubling Multiply-Subtract Long
+defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">;
+defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlsl,
+                                            SQDMLSLshh, SQDMLSLdss>;
+
+// Signed Saturating Doubling Multiply Long
+defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">;
+defm : Neon_Scalar3Diff_HS_size_patterns<int_arm_neon_vqdmull,
+                                         SQDMULLshh, SQDMULLdss>;
+
+// Scalar Signed Integer Convert To Floating-point
+defm SCVTF  : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">;
+defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtf32_s32,
+                                                 int_aarch64_neon_vcvtf64_s64,
+                                                 SCVTFss, SCVTFdd>;
+
+// Scalar Unsigned Integer Convert To Floating-point
+defm UCVTF  : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">;
+defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtf32_u32,
+                                                 int_aarch64_neon_vcvtf64_u64,
+                                                 UCVTFss, UCVTFdd>;
+
+// Scalar Floating-point Converts
+def FCVTXN : NeonI_Scalar2SameMisc_fcvtxn_D_size<0b1, 0b10110, "fcvtxn">;
+def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<int_aarch64_neon_fcvtxn,
+                                                  FCVTXN>;
+
+defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtns,
+                                                  FCVTNSss, FCVTNSdd>;
+
+defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtnu,
+                                                  FCVTNUss, FCVTNUdd>;
+
+defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtms,
+                                                  FCVTMSss, FCVTMSdd>;
+
+defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtmu,
+                                                  FCVTMUss, FCVTMUdd>;
+
+defm FCVTAS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtas,
+                                                  FCVTASss, FCVTASdd>;
+
+defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtau,
+                                                  FCVTAUss, FCVTAUdd>;
+
+defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtps,
+                                                  FCVTPSss, FCVTPSdd>;
+
+defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtpu,
+                                                  FCVTPUss, FCVTPUdd>;
+
+defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzs,
+                                                  FCVTZSss, FCVTZSdd>;
+
+defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzu,
+                                                  FCVTZUss, FCVTZUdd>;
+
+// Patterns For Convert Instructions Between v1f64 and v1i64
+class Neon_Scalar2SameMisc_cvtf_v1f64_pattern<SDPatternOperator opnode,
+                                              Instruction INST>
+    : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+class Neon_Scalar2SameMisc_fcvt_v1f64_pattern<SDPatternOperator opnode,
+                                              Instruction INST>
+    : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<sint_to_fp, SCVTFdd>;
+def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<uint_to_fp, UCVTFdd>;
+
+def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_sint, FCVTZSdd>;
+def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_uint, FCVTZUdd>;
+
+// Scalar Floating-point Reciprocal Estimate
+defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_arm_neon_vrecpe,
+                                             FRECPEss, FRECPEdd>;
+
+// Scalar Floating-point Reciprocal Exponent
+defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrecpx,
+                                             FRECPXss, FRECPXdd>;
+
+// Scalar Floating-point Reciprocal Square Root Estimate
+defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_arm_neon_vrsqrte,
+                                             FRSQRTEss, FRSQRTEdd>;
+
+// Scalar Floating-point Round
+class Neon_ScalarFloatRound_pattern<SDPatternOperator opnode, Instruction INST>
+    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+def : Neon_ScalarFloatRound_pattern<fceil, FRINTPdd>;
+def : Neon_ScalarFloatRound_pattern<ffloor, FRINTMdd>;
+def : Neon_ScalarFloatRound_pattern<ftrunc, FRINTZdd>;
+def : Neon_ScalarFloatRound_pattern<frint, FRINTXdd>;
+def : Neon_ScalarFloatRound_pattern<fnearbyint, FRINTIdd>;
+def : Neon_ScalarFloatRound_pattern<frnd, FRINTAdd>;
+def : Neon_ScalarFloatRound_pattern<int_aarch64_neon_frintn, FRINTNdd>;
+
+// Scalar Integer Compare
+
+// Scalar Compare Bitwise Equal
+def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vceq, CMEQddd>;
+
+class Neon_Scalar3Same_cmp_D_size_v1_patterns<SDPatternOperator opnode,
+                                              Instruction INSTD,
+                                              CondCode CC>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm), CC)),
+        (INSTD FPR64:$Rn, FPR64:$Rm)>;
+
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMEQddd, SETEQ>;
+
+// Scalar Compare Signed Greather Than Or Equal
+def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vcge, CMGEddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGEddd, SETGE>;
+
+// Scalar Compare Unsigned Higher Or Same
+def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vchs, CMHSddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHSddd, SETUGE>;
+
+// Scalar Compare Unsigned Higher
+def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vchi, CMHIddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHIddd, SETUGT>;
+
+// Scalar Compare Signed Greater Than
+def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vcgt, CMGTddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGTddd, SETGT>;
+
+// Scalar Compare Bitwise Test Bits
+def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vtstd, CMTSTddd>;
+def : Neon_Scalar3Same_cmp_D_size_patterns<Neon_tst, CMTSTddd>;
+
+// Scalar Compare Bitwise Equal To Zero
+def CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vceq,
+                                                CMEQddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETEQ, CMEQddi>;
+
+// Scalar Compare Signed Greather Than Or Equal To Zero
+def CMGEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01000, "cmge">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcge,
+                                                CMGEddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGE, CMGEddi>;
+
+// Scalar Compare Signed Greater Than Zero
+def CMGTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01000, "cmgt">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcgt,
+                                                CMGTddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGT, CMGTddi>;
+
+// Scalar Compare Signed Less Than Or Equal To Zero
+def CMLEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01001, "cmle">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vclez,
+                                                CMLEddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLE, CMLEddi>;
+
+// Scalar Compare Less Than Zero
+def CMLTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01010, "cmlt">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcltz,
+                                                CMLTddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLT, CMLTddi>;
+
+// Scalar Floating-point Compare
+
+// Scalar Floating-point Compare Mask Equal
+defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vceq,
+                                             FCMEQsss, FCMEQddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETEQ, FCMEQddd>;
+
+// Scalar Floating-point Compare Mask Equal To Zero
+defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vceq,
+                                                  FCMEQZssi, FCMEQZddi>;
+def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpz32:$FPImm), SETEQ)),
+          (FCMEQZddi FPR64:$Rn, fpz32:$FPImm)>;
+
+// Scalar Floating-point Compare Mask Greater Than Or Equal
+defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcge,
+                                             FCMGEsss, FCMGEddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGE, FCMGEddd>;
+
+// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
+defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcge,
+                                                  FCMGEZssi, FCMGEZddi>;
+
+// Scalar Floating-point Compare Mask Greather Than
+defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcgt,
+                                             FCMGTsss, FCMGTddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGT, FCMGTddd>;
+
+// Scalar Floating-point Compare Mask Greather Than Zero
+defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcgt,
+                                                  FCMGTZssi, FCMGTZddi>;
+
+// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
+defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vclez,
+                                                  FCMLEZssi, FCMLEZddi>;
+
+// Scalar Floating-point Compare Mask Less Than Zero
+defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcltz,
+                                                  FCMLTZssi, FCMLTZddi>;
+
+// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
+defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcage,
+                                             FACGEsss, FACGEddd>;
+
+// Scalar Floating-point Absolute Compare Mask Greater Than
+defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcagt,
+                                             FACGTsss, FACGTddd>;
+
+// Scakar Floating-point Absolute Difference
+defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">;
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vabd,
+                                         FABDsss, FABDddd>;
+
+// Scalar Absolute Value
+defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">;
+defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vabs, ABSdd>;
+
+// Scalar Signed Saturating Absolute Value
+defm SQABS : NeonI_Scalar2SameMisc_BHSD_size<0b0, 0b00111, "sqabs">;
+defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqabs,
+                                               SQABSbb, SQABShh, SQABSss, SQABSdd>;
+
+// Scalar Negate
+defm NEG : NeonI_Scalar2SameMisc_D_size<0b1, 0b01011, "neg">;
+defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vneg, NEGdd>;
+
+// Scalar Signed Saturating Negate
+defm SQNEG : NeonI_Scalar2SameMisc_BHSD_size<0b1, 0b00111, "sqneg">;
+defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqneg,
+                                               SQNEGbb, SQNEGhh, SQNEGss, SQNEGdd>;
+
+// Scalar Signed Saturating Accumulated of Unsigned Value
+defm SUQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b0, 0b00011, "suqadd">;
+defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vuqadd,
+                                                     SUQADDbb, SUQADDhh,
+                                                     SUQADDss, SUQADDdd>;
+
+// Scalar Unsigned Saturating Accumulated of Signed Value
+defm USQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b1, 0b00011, "usqadd">;
+defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vsqadd,
+                                                     USQADDbb, USQADDhh,
+                                                     USQADDss, USQADDdd>;
+
+def : Pat<(v1i64 (int_aarch64_neon_suqadd (v1i64 FPR64:$Src),
+                                          (v1i64 FPR64:$Rn))),
+          (SUQADDdd FPR64:$Src, FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_aarch64_neon_usqadd (v1i64 FPR64:$Src),
+                                          (v1i64 FPR64:$Rn))),
+          (USQADDdd FPR64:$Src, FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vabs (v1i64 FPR64:$Rn))),
+          (ABSdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vqabs (v1i64 FPR64:$Rn))),
+          (SQABSdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vqneg (v1i64 FPR64:$Rn))),
+          (SQNEGdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (sub (v1i64 (bitconvert (v8i8 Neon_AllZero))),
+                      (v1i64 FPR64:$Rn))),
+          (NEGdd FPR64:$Rn)>;
+
+// Scalar Signed Saturating Extract Unsigned Narrow
+defm SQXTUN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10010, "sqxtun">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnsu,
+                                                     SQXTUNbh, SQXTUNhs,
+                                                     SQXTUNsd>;
+
+// Scalar Signed Saturating Extract Narrow
+defm SQXTN  : NeonI_Scalar2SameMisc_narrow_HSD_size<0b0, 0b10100, "sqxtn">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovns,
+                                                     SQXTNbh, SQXTNhs,
+                                                     SQXTNsd>;
+
+// Scalar Unsigned Saturating Extract Narrow
+defm UQXTN  : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10100, "uqxtn">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnu,
+                                                     UQXTNbh, UQXTNhs,
+                                                     UQXTNsd>;
+
+// Scalar Reduce Pairwise
+
+multiclass NeonI_ScalarPair_D_sizes<bit u, bit size, bits<5> opcode,
+                                     string asmop, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _D_2D : NeonI_ScalarPair<u, {size, 0b1}, opcode,
+                                (outs FPR64:$Rd), (ins VPR128:$Rn),
+                                !strconcat(asmop, "\t$Rd, $Rn.2d"),
+                                [],
+                                NoItinerary>;
+  }
+}
+
+multiclass NeonI_ScalarPair_SD_sizes<bit u, bit size, bits<5> opcode,
+                                     string asmop, bit Commutable = 0>
+  : NeonI_ScalarPair_D_sizes<u, size, opcode, asmop, Commutable> {
+  let isCommutable = Commutable in {
+    def _S_2S : NeonI_ScalarPair<u, {size, 0b0}, opcode,
+                                (outs FPR32:$Rd), (ins VPR64:$Rn),
+                                !strconcat(asmop, "\t$Rd, $Rn.2s"),
+                                [],
+                                NoItinerary>;
+  }
+}
+
+// Scalar Reduce Addition Pairwise (Integer) with
+// Pattern to match llvm.arm.* intrinsic
+defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>;
+
+// Pattern to match llvm.aarch64.* intrinsic for
+// Scalar Reduce Addition Pairwise (Integer)
+def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))),
+          (ADDPvv_D_2D VPR128:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_vaddv (v2i64 VPR128:$Rn))),
+          (ADDPvv_D_2D VPR128:$Rn)>;
+
+// Scalar Reduce Addition Pairwise (Floating Point)
+defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>;
+
+// Scalar Reduce Maximum Pairwise (Floating Point)
+defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>;
+
+// Scalar Reduce Minimum Pairwise (Floating Point)
+defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>;
+
+// Scalar Reduce maxNum Pairwise (Floating Point)
+defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>;
+
+// Scalar Reduce minNum Pairwise (Floating Point)
+defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>;
+
+multiclass Neon_ScalarPair_SD_size_patterns<SDPatternOperator opnodeS,
+                                            SDPatternOperator opnodeD,
+                                            Instruction INSTS,
+                                            Instruction INSTD> {
+  def : Pat<(v1f32 (opnodeS (v2f32 VPR64:$Rn))),
+            (INSTS VPR64:$Rn)>;
+  def : Pat<(v1f64 (opnodeD (v2f64 VPR128:$Rn))),
+            (INSTD VPR128:$Rn)>;
+}
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Reduce Add, Max, Min, MaxiNum, MinNum Pairwise (Floating Point)
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfadd,
+  int_aarch64_neon_vpfaddq, FADDPvv_S_2S, FADDPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmax,
+  int_aarch64_neon_vpmaxq, FMAXPvv_S_2S, FMAXPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmin,
+  int_aarch64_neon_vpminq, FMINPvv_S_2S, FMINPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfmaxnm,
+  int_aarch64_neon_vpfmaxnmq, FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfminnm,
+  int_aarch64_neon_vpfminnmq, FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vaddv,
+    int_aarch64_neon_vaddv, FADDPvv_S_2S, FADDPvv_D_2D>;
+
+def : Pat<(v1f32 (int_aarch64_neon_vaddv (v4f32 VPR128:$Rn))),
+          (FADDPvv_S_2S (v2f32
+               (EXTRACT_SUBREG
+                   (v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))),
+                   sub_64)))>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vmaxv,
+    int_aarch64_neon_vmaxv, FMAXPvv_S_2S, FMAXPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vminv,
+    int_aarch64_neon_vminv, FMINPvv_S_2S, FMINPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vmaxnmv,
+    int_aarch64_neon_vmaxnmv, FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vminnmv,
+    int_aarch64_neon_vminnmv, FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
+
+// Scalar by element Arithmetic
+
+class NeonI_ScalarXIndexedElemArith<string asmop, bits<4> opcode,
+                                    string rmlane, bit u, bit szhi, bit szlo,
+                                    RegisterClass ResFPR, RegisterClass OpFPR,
+                                    RegisterOperand OpVPR, Operand OpImm>
+  : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
+                             (outs ResFPR:$Rd),
+                             (ins OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
+                             asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
+                             [],
+                             NoItinerary> {
+  bits<3> Imm;
+  bits<5> MRm;
+}
+
+class NeonI_ScalarXIndexedElemArith_Constraint_Impl<string asmop, bits<4> opcode,
+                                                    string rmlane,
+                                                    bit u, bit szhi, bit szlo,
+                                                    RegisterClass ResFPR,
+                                                    RegisterClass OpFPR,
+                                                    RegisterOperand OpVPR,
+                                                    Operand OpImm>
+  : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
+                             (outs ResFPR:$Rd),
+                             (ins ResFPR:$src, OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
+                             asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
+                             [],
+                             NoItinerary> {
+  let Constraints = "$src = $Rd";
+  bits<3> Imm;
+  bits<5> MRm;
+}
+
+// Scalar Floating Point  multiply (scalar, by element)
+def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul",
+  0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul",
+  0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Floating Point  multiply extended (scalar, by element)
+def FMULXssv_4S : NeonI_ScalarXIndexedElemArith<"fmulx",
+  0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx",
+  0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns<
+  SDPatternOperator opnode,
+  Instruction INST,
+  ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
+  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+
+  def  : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
+               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))),
+             (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
+               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))),
+             (ResTy (INST (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+
+  // swapped operands
+  def  : Pat<(ResTy (opnode
+               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
+               (ResTy FPRC:$Rn))),
+             (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (opnode
+               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+               (ResTy FPRC:$Rn))),
+             (ResTy (INST (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+}
+
+// Patterns for Scalar Floating Point  multiply (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULssv_4S,
+  f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULddv_2D,
+  f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+
+// Patterns for Scalar Floating Point  multiply extended (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
+  FMULXssv_4S, f32, FPR32, v4f32, neon_uimm2_bare,
+  v2f32, v4f32, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
+  FMULXddv_2D, f64, FPR64, v2f64, neon_uimm1_bare,
+  v1f64, v2f64, neon_uimm0_bare>;
+
+
+// Scalar Floating Point fused multiply-add (scalar, by element)
+def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
+  0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
+  0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Floating Point fused multiply-subtract (scalar, by element)
+def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
+  0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
+  0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+// We are allowed to match the fma instruction regardless of compile options.
+multiclass Neon_ScalarXIndexedElem_FMA_Patterns<
+  Instruction FMLAI, Instruction FMLSI,
+  ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
+  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+  // fmla
+  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLAI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLAI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+
+  // swapped fmla operands
+  def  : Pat<(ResTy (fma
+               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
+               (ResTy FPRC:$Rn),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLAI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (fma
+               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+               (ResTy FPRC:$Rn),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLAI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+
+  // fmls
+  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+               (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLSI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+               (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLSI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+
+  // swapped fmls operands
+  def  : Pat<(ResTy (fma
+               (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))),
+               (ResTy FPRC:$Rn),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLSI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (fma
+               (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))),
+               (ResTy FPRC:$Rn),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLSI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+}
+
+// Scalar Floating Point fused multiply-add and
+// multiply-subtract (scalar, by element)
+defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAssv_4S, FMLSssv_4S,
+  f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, FMLSddv_2D,
+  f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, FMLSddv_2D,
+  f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+
+// Scalar Signed saturating doubling multiply long (scalar, by element)
+def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+multiclass Neon_ScalarXIndexedElem_MUL_Patterns<
+  SDPatternOperator opnode,
+  Instruction INST,
+  ValueType ResTy, RegisterClass FPRC,
+  ValueType OpVTy, ValueType OpTy,
+  ValueType VecOpTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> {
+
+  def  : Pat<(ResTy (opnode (OpVTy FPRC:$Rn),
+               (OpVTy (scalar_to_vector
+                 (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))),
+             (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
+
+  //swapped operands
+  def  : Pat<(ResTy (opnode
+               (OpVTy (scalar_to_vector
+                 (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))),
+                 (OpVTy FPRC:$Rn))),
+             (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
+}
+
+
+// Patterns for Scalar Signed saturating doubling
+// multiply long (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+  SQDMULLshv_4H, v1i32, FPR16, v1i16, i16, v4i16,
+  i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+  SQDMULLshv_8H, v1i32, FPR16, v1i16, i16, v8i16,
+  i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+  SQDMULLdsv_2S, v1i64, FPR32, v1i32, i32, v2i32,
+  i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+  SQDMULLdsv_4S, v1i64, FPR32, v1i32, i32, v4i32,
+  i32, VPR128Lo, neon_uimm2_bare>;
+
+// Scalar Signed saturating doubling multiply-add long (scalar, by element)
+def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMLALdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Signed saturating doubling
+// multiply-subtract long (scalar, by element)
+def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+multiclass Neon_ScalarXIndexedElem_MLAL_Patterns<
+  SDPatternOperator opnode,
+  SDPatternOperator coreopnode,
+  Instruction INST,
+  ValueType ResTy, RegisterClass ResFPRC, RegisterClass FPRC,
+  ValueType OpTy,
+  ValueType OpVTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> {
+
+  def  : Pat<(ResTy (opnode
+               (ResTy ResFPRC:$Ra),
+               (ResTy (coreopnode (OpTy FPRC:$Rn),
+                 (OpTy (scalar_to_vector
+                   (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))))))),
+             (ResTy (INST (ResTy ResFPRC:$Ra),
+               (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
+
+  // swapped operands
+  def  : Pat<(ResTy (opnode
+               (ResTy ResFPRC:$Ra),
+               (ResTy (coreopnode
+                 (OpTy (scalar_to_vector
+                   (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))),
+                 (OpTy FPRC:$Rn))))),
+             (ResTy (INST (ResTy ResFPRC:$Ra),
+               (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
+}
+
+// Patterns for Scalar Signed saturating
+// doubling multiply-add long (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+  int_arm_neon_vqdmull, SQDMLALshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16,
+  i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+  int_arm_neon_vqdmull, SQDMLALshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16,
+  i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+  int_arm_neon_vqdmull, SQDMLALdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32,
+  i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+  int_arm_neon_vqdmull, SQDMLALdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
+  i32, VPR128Lo, neon_uimm2_bare>;
+
+// Patterns for Scalar Signed saturating
+// doubling multiply-sub long (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+  int_arm_neon_vqdmull, SQDMLSLshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16,
+  i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+  int_arm_neon_vqdmull, SQDMLSLshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16,
+  i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+  int_arm_neon_vqdmull, SQDMLSLdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32,
+  i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+  int_arm_neon_vqdmull, SQDMLSLdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
+  i32, VPR128Lo, neon_uimm2_bare>;
+
+// Scalar general arithmetic operation
+class Neon_Scalar_GeneralMath2D_pattern<SDPatternOperator opnode,
+                                        Instruction INST> 
+    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+class Neon_Scalar_GeneralMath3D_pattern<SDPatternOperator opnode,
+                                        Instruction INST> 
+    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (INST FPR64:$Rn, FPR64:$Rm)>;
+
+class Neon_Scalar_GeneralMath4D_pattern<SDPatternOperator opnode,
+                                        Instruction INST> 
+    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm),
+              (v1f64 FPR64:$Ra))),
+          (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+def : Neon_Scalar_GeneralMath3D_pattern<fadd, FADDddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fmul, FMULddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fsub, FSUBddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fdiv, FDIVddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vabds, FABDddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmaxs, FMAXddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmins, FMINddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vmaxnm, FMAXNMddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vminnm, FMINNMddd>;
+
+def : Neon_Scalar_GeneralMath2D_pattern<fabs, FABSdd>;
+def : Neon_Scalar_GeneralMath2D_pattern<fneg, FNEGdd>;
+
+def : Neon_Scalar_GeneralMath4D_pattern<fma, FMADDdddd>;
+def : Neon_Scalar_GeneralMath4D_pattern<fmsub, FMSUBdddd>;
+
+// Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+// Patterns for Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+  SQDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16,
+  i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+  SQDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16,
+  i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+  SQDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32,
+  i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+  SQDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32,
+  i32, VPR128Lo, neon_uimm2_bare>;
+
+// Scalar Signed saturating rounding doubling multiply
+// returning high half (scalar, by element)
+def SQRDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+  SQRDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16, i32,
+  VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+  SQRDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16, i32,
+  VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+  SQRDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32, i32,
+  VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+  SQRDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32, i32,
+  VPR128Lo, neon_uimm2_bare>;
+
+// Scalar Copy - DUP element to scalar
+class NeonI_Scalar_DUP<string asmop, string asmlane,
+                       RegisterClass ResRC, RegisterOperand VPRC,
+                       Operand OpImm>
+  : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm),
+                     asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]",
+                     [],
+                     NoItinerary> {
+  bits<4> Imm;
+}
+
+def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+multiclass NeonI_Scalar_DUP_Elt_pattern<Instruction DUPI, ValueType ResTy,
+  ValueType OpTy, Operand OpImm,
+  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+  def : Pat<(ResTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)),
+            (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+
+  def : Pat<(ResTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)),
+            (ResTy (DUPI
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+                OpNImm:$Imm))>;
+}
+
+// Patterns for vector extract of FP data using scalar DUP instructions
+defm : NeonI_Scalar_DUP_Elt_pattern<DUPsv_S, f32,
+  v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Elt_pattern<DUPdv_D, f64,
+  v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+
+multiclass NeonI_Scalar_DUP_Ext_Vec_pattern<Instruction DUPI,
+  ValueType ResTy, ValueType OpTy,Operand OpLImm,
+  ValueType NOpTy, ValueType ExTy, Operand OpNImm> {
+
+  def : Pat<(ResTy (extract_subvector (OpTy VPR128:$Rn), OpLImm:$Imm)),
+            (ResTy (DUPI VPR128:$Rn, OpLImm:$Imm))>;
+
+  def : Pat<(ResTy (extract_subvector (NOpTy VPR64:$Rn), OpNImm:$Imm)),
+            (ResTy (DUPI
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+                OpNImm:$Imm))>;
+}
+
+// Patterns for extract subvectors of v1ix data using scalar DUP instructions.
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPbv_B, v1i8, v16i8, neon_uimm4_bare,
+                                        v8i8, v16i8, neon_uimm3_bare>;
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPhv_H, v1i16, v8i16, neon_uimm3_bare,
+                                        v4i16, v8i16, neon_uimm2_bare>;
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPsv_S, v1i32, v4i32, neon_uimm2_bare,
+                                        v2i32, v4i32, neon_uimm1_bare>;
+
+multiclass NeonI_Scalar_DUP_Copy_pattern1<Instruction DUPI, ValueType ResTy,
+                                          ValueType OpTy, ValueType ElemTy,
+                                          Operand OpImm, ValueType OpNTy,
+                                          ValueType ExTy, Operand OpNImm> {
+
+  def : Pat<(ResTy (vector_insert (ResTy undef),
+              (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)),
+              (neon_uimm0_bare:$Imm))),
+            (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+
+  def : Pat<(ResTy (vector_insert (ResTy undef),
+              (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)),
+              (OpNImm:$Imm))),
+            (ResTy (DUPI
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              OpNImm:$Imm))>;
+}
+
+multiclass NeonI_Scalar_DUP_Copy_pattern2<Instruction DUPI, ValueType ResTy,
+                                          ValueType OpTy, ValueType ElemTy,
+                                          Operand OpImm, ValueType OpNTy,
+                                          ValueType ExTy, Operand OpNImm> {
+
+  def : Pat<(ResTy (scalar_to_vector
+              (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)))),
+            (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+
+  def : Pat<(ResTy (scalar_to_vector
+              (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)))),
+            (ResTy (DUPI
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              OpNImm:$Imm))>;
+}
+
+// Patterns for vector copy to v1ix and v1fx vectors using scalar DUP
+// instructions.
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D,
+  v1i64, v2i64, i64, neon_uimm1_bare,
+  v1i64, v2i64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S,
+  v1i32, v4i32, i32, neon_uimm2_bare,
+  v2i32, v4i32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPhv_H,
+  v1i16, v8i16, i32, neon_uimm3_bare,
+  v4i16, v8i16, neon_uimm2_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPbv_B,
+  v1i8, v16i8, i32, neon_uimm4_bare,
+  v8i8, v16i8, neon_uimm3_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D,
+  v1f64, v2f64, f64, neon_uimm1_bare,
+  v1f64, v2f64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S,
+  v1f32, v4f32, f32, neon_uimm2_bare,
+  v2f32, v4f32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D,
+  v1i64, v2i64, i64, neon_uimm1_bare,
+  v1i64, v2i64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S,
+  v1i32, v4i32, i32, neon_uimm2_bare,
+  v2i32, v4i32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPhv_H,
+  v1i16, v8i16, i32, neon_uimm3_bare,
+  v4i16, v8i16, neon_uimm2_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPbv_B,
+  v1i8, v16i8, i32, neon_uimm4_bare,
+  v8i8, v16i8, neon_uimm3_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D,
+  v1f64, v2f64, f64, neon_uimm1_bare,
+  v1f64, v2f64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S,
+  v1f32, v4f32, f32, neon_uimm2_bare,
+  v2f32, v4f32, neon_uimm1_bare>;
+
+multiclass NeonI_Scalar_DUP_alias<string asmop, string asmlane,
+                                  Instruction DUPI, Operand OpImm,
+                                  RegisterClass ResRC> {
+  def : NeonInstAlias<!strconcat(asmop, "$Rd, $Rn" # asmlane # "[$Imm]"),
+          (DUPI ResRC:$Rd, VPR128:$Rn, OpImm:$Imm), 0b0>;
+}
+
+// Aliases for Scalar copy - DUP element (scalar)
+// FIXME: This is actually the preferred syntax but TableGen can't deal with
+// custom printing of aliases.
+defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>;
+
+multiclass NeonI_SDUP<PatFrag GetLow, PatFrag GetHigh, ValueType ResTy,
+                      ValueType OpTy> {
+  def : Pat<(ResTy (GetLow VPR128:$Rn)),
+            (ResTy (EXTRACT_SUBREG (OpTy VPR128:$Rn), sub_64))>;
+  def : Pat<(ResTy (GetHigh VPR128:$Rn)),
+            (ResTy (DUPdv_D (OpTy VPR128:$Rn), 1))>;
+}
+
+defm : NeonI_SDUP<Neon_Low16B, Neon_High16B, v8i8, v16i8>;
+defm : NeonI_SDUP<Neon_Low8H, Neon_High8H, v4i16, v8i16>;
+defm : NeonI_SDUP<Neon_Low4S, Neon_High4S, v2i32, v4i32>;
+defm : NeonI_SDUP<Neon_Low2D, Neon_High2D, v1i64, v2i64>;
+defm : NeonI_SDUP<Neon_Low4float, Neon_High4float, v2f32, v4f32>;
+defm : NeonI_SDUP<Neon_Low2double, Neon_High2double, v1f64, v2f64>;
 
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
@@ -1578,57 +6085,2587 @@ def : Pat<(v4i32 (bitconvert (v2f64  VPR128:$src))), (v4i32 VPR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v2f64  VPR128:$src))), (v8i16 VPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v2f64  VPR128:$src))), (v16i8 VPR128:$src)>;
 
-
 // ...and scalar bitcasts...
+def : Pat<(f16 (bitconvert (v1i16  FPR16:$src))), (f16 FPR16:$src)>;
+def : Pat<(f32 (bitconvert (v1i32  FPR32:$src))), (f32 FPR32:$src)>;
+def : Pat<(f64 (bitconvert (v1i64  FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f32 (bitconvert (v1f32  FPR32:$src))), (f32 FPR32:$src)>;
+def : Pat<(f64 (bitconvert (v1f64  FPR64:$src))), (f64 FPR64:$src)>;
+
+def : Pat<(i64 (bitconvert (v1i64  FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v1f64  FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v2i32  FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v2f32  FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v4i16  FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v8i8  FPR64:$src))), (FMOVxd $src)>;
+
+def : Pat<(i32 (bitconvert (v1i32  FPR32:$src))), (FMOVws $src)>;
+
+def : Pat<(v8i8  (bitconvert (v1i64  VPR64:$src))), (v8i8 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64  VPR64:$src))), (v4i16 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64  VPR64:$src))), (v2i32 VPR64:$src)>;
+
+def : Pat<(f64   (bitconvert (v8i8  VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64   (bitconvert (v4i16  VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64   (bitconvert (v2i32  VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64   (bitconvert (v2f32  VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64   (bitconvert (v1i64  VPR64:$src))), (f64 VPR64:$src)>;
+
+def : Pat<(f128  (bitconvert (v16i8  VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128  (bitconvert (v8i16  VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128  (bitconvert (v4i32  VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128  (bitconvert (v2i64  VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128  (bitconvert (v4f32  VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128  (bitconvert (v2f64  VPR128:$src))), (f128 VPR128:$src)>;
+
+def : Pat<(v1i16 (bitconvert (f16  FPR16:$src))), (v1i16 FPR16:$src)>;
+def : Pat<(v1i32 (bitconvert (f32  FPR32:$src))), (v1i32 FPR32:$src)>;
+def : Pat<(v1i64 (bitconvert (f64  FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1f32 (bitconvert (f32  FPR32:$src))), (v1f32 FPR32:$src)>;
+def : Pat<(v1f64 (bitconvert (f64  FPR64:$src))), (v1f64 FPR64:$src)>;
+
+def : Pat<(v1i64 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v1f64 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v2i32 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v2f32 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v4i16 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v8i8 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+
+def : Pat<(v1i32 (bitconvert (i32  GPR32:$src))), (FMOVsw $src)>;
+
+def : Pat<(v8i8   (bitconvert (f64   FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16  (bitconvert (f64   FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32  (bitconvert (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32  (bitconvert (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64  (bitconvert (f64   FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v16i8  (bitconvert (f128   FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16  (bitconvert (f128   FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32  (bitconvert (f128   FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64  (bitconvert (f128   FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32  (bitconvert (f128   FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64  (bitconvert (f128   FPR128:$src))), (v2f64 FPR128:$src)>;
+
+// Scalar Three Same
+
+def neon_uimm3 : Operand<i64>,
+                   ImmLeaf<i64, [{return Imm < 8;}]> {
+  let ParserMatchClass = uimm3_asmoperand;
+  let PrintMethod = "printUImmHexOperand";
+}
+
+def neon_uimm4 : Operand<i64>,
+                   ImmLeaf<i64, [{return Imm < 16;}]> {
+  let ParserMatchClass = uimm4_asmoperand;
+  let PrintMethod = "printUImmHexOperand";
+}
+
+// Bitwise Extract
+class NeonI_Extract<bit q, bits<2> op2, string asmop,
+                    string OpS, RegisterOperand OpVPR, Operand OpImm>
+  : NeonI_BitExtract<q, op2, (outs OpVPR:$Rd),
+                     (ins OpVPR:$Rn, OpVPR:$Rm, OpImm:$Index),
+                     asmop # "\t$Rd." # OpS # ", $Rn." # OpS #
+                     ", $Rm." # OpS # ", $Index",
+                     [],
+                     NoItinerary>{
+  bits<4> Index;
+}
+
+def EXTvvvi_8b : NeonI_Extract<0b0, 0b00, "ext", "8b",
+                               VPR64, neon_uimm3> {
+  let Inst{14-11} = {0b0, Index{2}, Index{1}, Index{0}};
+}
+
+def EXTvvvi_16b: NeonI_Extract<0b1, 0b00, "ext", "16b",
+                               VPR128, neon_uimm4> {
+  let Inst{14-11} = Index;
+}
+
+class NI_Extract<ValueType OpTy, RegisterOperand OpVPR, Instruction INST,
+                 Operand OpImm>
+  : Pat<(OpTy (Neon_vextract (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm),
+                                 (i64 OpImm:$Imm))),
+              (INST OpVPR:$Rn, OpVPR:$Rm, OpImm:$Imm)>;
+
+def : NI_Extract<v8i8,  VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v4i16, VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v2i32, VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v1i64, VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v2f32, VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v1f64, VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v16i8, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v8i16, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v4i32, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v2i64, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v4f32, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v2f64, VPR128, EXTvvvi_16b, neon_uimm4>;
+
+// Table lookup
+class NI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
+             string asmop, string OpS, RegisterOperand OpVPR,
+             RegisterOperand VecList>
+  : NeonI_TBL<q, op2, len, op,
+              (outs OpVPR:$Rd), (ins VecList:$Rn, OpVPR:$Rm),
+              asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
+              [],
+              NoItinerary>;
+
+// The vectors in look up table are always 16b
+multiclass NI_TBL_pat<bits<2> len, bit op, string asmop, string List> {
+  def _8b  : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+
+  def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+}
+
+defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">;
+defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">;
+defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">;
+defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">;
+
+// Table lookup extention
+class NI_TBX<bit q, bits<2> op2, bits<2> len, bit op,
+             string asmop, string OpS, RegisterOperand OpVPR,
+             RegisterOperand VecList>
+  : NeonI_TBL<q, op2, len, op,
+              (outs OpVPR:$Rd), (ins OpVPR:$src, VecList:$Rn, OpVPR:$Rm),
+              asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
+              [],
+              NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+// The vectors in look up table are always 16b
+multiclass NI_TBX_pat<bits<2> len, bit op, string asmop, string List> {
+  def _8b  : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+
+  def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+}
+
+defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">;
+defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">;
+defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">;
+defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">;
+
+class NeonI_INS_main<string asmop, string Res, ValueType ResTy,
+                     RegisterClass OpGPR, ValueType OpTy, Operand OpImm>
+  : NeonI_copy<0b1, 0b0, 0b0011,
+               (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd." # Res # "[$Imm], $Rn",
+               [(set (ResTy VPR128:$Rd),
+                 (ResTy (vector_insert
+                   (ResTy VPR128:$src),
+                   (OpTy OpGPR:$Rn),
+                   (OpImm:$Imm))))],
+               NoItinerary> {
+  bits<4> Imm;
+  let Constraints = "$src = $Rd";
+}
+
+//Insert element (vector, from main)
+def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32,
+                           neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32,
+                           neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32,
+                           neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64,
+                           neon_uimm1_bare> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn",
+                    (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn",
+                    (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn",
+                    (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn",
+                    (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>;
+
+class Neon_INS_main_pattern <ValueType ResTy,ValueType ExtResTy,
+                             RegisterClass OpGPR, ValueType OpTy,
+                             Operand OpImm, Instruction INS>
+  : Pat<(ResTy (vector_insert
+              (ResTy VPR64:$src),
+              (OpTy OpGPR:$Rn),
+              (OpImm:$Imm))),
+        (ResTy (EXTRACT_SUBREG
+          (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
+            OpGPR:$Rn, OpImm:$Imm)), sub_64))>;
+
+def INSbw_pattern : Neon_INS_main_pattern<v8i8, v16i8, GPR32, i32,
+                                          neon_uimm3_bare, INSbw>;
+def INShw_pattern : Neon_INS_main_pattern<v4i16, v8i16, GPR32, i32,
+                                          neon_uimm2_bare, INShw>;
+def INSsw_pattern : Neon_INS_main_pattern<v2i32, v4i32, GPR32, i32,
+                                          neon_uimm1_bare, INSsw>;
+def INSdx_pattern : Neon_INS_main_pattern<v1i64, v2i64, GPR64, i64,
+                                          neon_uimm0_bare, INSdx>;
+
+class NeonI_INS_element<string asmop, string Res, Operand ResImm>
+  : NeonI_insert<0b1, 0b1,
+                 (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn,
+                 ResImm:$Immd, ResImm:$Immn),
+                 asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]",
+                 [],
+                 NoItinerary> {
+  let Constraints = "$src = $Rd";
+  bits<4> Immd;
+  bits<4> Immn;
+}
+
+//Insert element (vector, from element)
+def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> {
+  let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1};
+  let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}};
+}
+def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> {
+  let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0};
+  let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0};
+  // bit 11 is unspecified, but should be set to zero.
+}
+def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> {
+  let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0};
+  let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0};
+  // bits 11-12 are unspecified, but should be set to zero.
+}
+def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> {
+  let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0};
+  let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0};
+  // bits 11-13 are unspecified, but should be set to zero.
+}
+
+def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]",
+                    (INSELb VPR128:$Rd, VPR128:$Rn,
+                      neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>;
+def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]",
+                    (INSELh VPR128:$Rd, VPR128:$Rn,
+                      neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>;
+def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]",
+                    (INSELs VPR128:$Rd, VPR128:$Rn,
+                      neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>;
+def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]",
+                    (INSELd VPR128:$Rd, VPR128:$Rn,
+                      neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>;
+
+multiclass Neon_INS_elt_pattern<ValueType ResTy, ValueType NaTy,
+                                ValueType MidTy, Operand StImm, Operand NaImm,
+                                Instruction INS> {
+def : Pat<(ResTy (vector_insert
+            (ResTy VPR128:$src),
+            (MidTy (vector_extract
+              (ResTy VPR128:$Rn),
+              (StImm:$Immn))),
+            (StImm:$Immd))),
+          (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn),
+              StImm:$Immd, StImm:$Immn)>;
+
+def : Pat <(ResTy (vector_insert
+             (ResTy VPR128:$src),
+             (MidTy (vector_extract
+               (NaTy VPR64:$Rn),
+               (NaImm:$Immn))),
+             (StImm:$Immd))),
+           (INS (ResTy VPR128:$src),
+             (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)),
+             StImm:$Immd, NaImm:$Immn)>;
+
+def : Pat <(NaTy (vector_insert
+             (NaTy VPR64:$src),
+             (MidTy (vector_extract
+               (ResTy VPR128:$Rn),
+               (StImm:$Immn))),
+             (NaImm:$Immd))),
+           (NaTy (EXTRACT_SUBREG
+             (ResTy (INS
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+               (ResTy VPR128:$Rn),
+               NaImm:$Immd, StImm:$Immn)),
+             sub_64))>;
+
+def : Pat <(NaTy (vector_insert
+             (NaTy VPR64:$src),
+             (MidTy (vector_extract
+               (NaTy VPR64:$Rn),
+               (NaImm:$Immn))),
+             (NaImm:$Immd))),
+           (NaTy (EXTRACT_SUBREG
+             (ResTy (INS
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)),
+               NaImm:$Immd, NaImm:$Immn)),
+             sub_64))>;
+}
+
+defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, neon_uimm2_bare,
+                            neon_uimm1_bare, INSELs>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, neon_uimm1_bare,
+                            neon_uimm0_bare, INSELd>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
+                            neon_uimm3_bare, INSELb>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
+                            neon_uimm2_bare, INSELh>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+                            neon_uimm1_bare, INSELs>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, neon_uimm1_bare,
+                            neon_uimm0_bare, INSELd>;
+
+multiclass Neon_INS_elt_float_pattern<ValueType ResTy, ValueType NaTy,
+                                      ValueType MidTy,
+                                      RegisterClass OpFPR, Operand ResImm,
+                                      SubRegIndex SubIndex, Instruction INS> {
+def : Pat <(ResTy (vector_insert
+             (ResTy VPR128:$src),
+             (MidTy OpFPR:$Rn),
+             (ResImm:$Imm))),
+           (INS (ResTy VPR128:$src),
+             (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)),
+             ResImm:$Imm,
+             (i64 0))>;
+
+def : Pat <(NaTy (vector_insert
+             (NaTy VPR64:$src),
+             (MidTy OpFPR:$Rn),
+             (ResImm:$Imm))),
+           (NaTy (EXTRACT_SUBREG
+             (ResTy (INS
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+               (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)),
+               ResImm:$Imm,
+               (i64 0))),
+             sub_64))>;
+}
+
+defm : Neon_INS_elt_float_pattern<v4f32, v2f32, f32, FPR32, neon_uimm2_bare,
+                                  sub_32, INSELs>;
+defm : Neon_INS_elt_float_pattern<v2f64, v1f64, f64, FPR64, neon_uimm1_bare,
+                                  sub_64, INSELd>;
+
+class NeonI_SMOV<string asmop, string Res, bit Q,
+                 ValueType OpTy, ValueType eleTy,
+                 Operand OpImm, RegisterClass ResGPR, ValueType ResTy>
+  : NeonI_copy<Q, 0b0, 0b0101,
+               (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
+               [(set (ResTy ResGPR:$Rd),
+                 (ResTy (sext_inreg
+                   (ResTy (vector_extract
+                     (OpTy VPR128:$Rn), (OpImm:$Imm))),
+                   eleTy)))],
+               NoItinerary> {
+  bits<4> Imm;
+}
+
+//Signed integer move (main, from element)
+def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare,
+                        GPR32, i32> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare,
+                        GPR32, i32> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare,
+                        GPR64, i64> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare,
+                        GPR64, i64> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare,
+                        GPR64, i64> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+multiclass Neon_SMOVx_pattern <ValueType StTy, ValueType NaTy,
+                               ValueType eleTy, Operand StImm,  Operand NaImm,
+                               Instruction SMOVI> {
+  def : Pat<(i64 (sext_inreg
+              (i64 (anyext
+                (i32 (vector_extract
+                  (StTy VPR128:$Rn), (StImm:$Imm))))),
+              eleTy)),
+            (SMOVI VPR128:$Rn, StImm:$Imm)>;
+
+  def : Pat<(i64 (sext
+              (i32 (vector_extract
+                (StTy VPR128:$Rn), (StImm:$Imm))))),
+            (SMOVI VPR128:$Rn, StImm:$Imm)>;
+
+  def : Pat<(i64 (sext_inreg
+              (i64 (vector_extract
+                (NaTy VPR64:$Rn), (NaImm:$Imm))),
+              eleTy)),
+            (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              NaImm:$Imm)>;
+
+  def : Pat<(i64 (sext_inreg
+              (i64 (anyext
+                (i32 (vector_extract
+                  (NaTy VPR64:$Rn), (NaImm:$Imm))))),
+              eleTy)),
+            (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              NaImm:$Imm)>;
+
+  def : Pat<(i64 (sext
+              (i32 (vector_extract
+                (NaTy VPR64:$Rn), (NaImm:$Imm))))),
+            (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              NaImm:$Imm)>;
+}
+
+defm : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
+                          neon_uimm3_bare, SMOVxb>;
+defm : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
+                          neon_uimm2_bare, SMOVxh>;
+defm : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+                          neon_uimm1_bare, SMOVxs>;
+
+class Neon_SMOVw_pattern <ValueType StTy, ValueType NaTy,
+                          ValueType eleTy, Operand StImm,  Operand NaImm,
+                          Instruction SMOVI>
+  : Pat<(i32 (sext_inreg
+          (i32 (vector_extract
+            (NaTy VPR64:$Rn), (NaImm:$Imm))),
+          eleTy)),
+        (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+          NaImm:$Imm)>;
+
+def : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
+                         neon_uimm3_bare, SMOVwb>;
+def : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
+                         neon_uimm2_bare, SMOVwh>;
+
+class NeonI_UMOV<string asmop, string Res, bit Q,
+                 ValueType OpTy, Operand OpImm,
+                 RegisterClass ResGPR, ValueType ResTy>
+  : NeonI_copy<Q, 0b0, 0b0111,
+               (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
+               [(set (ResTy ResGPR:$Rd),
+                  (ResTy (vector_extract
+                    (OpTy VPR128:$Rn), (OpImm:$Imm))))],
+               NoItinerary> {
+  bits<4> Imm;
+}
+
+//Unsigned integer move (main, from element)
+def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare,
+                         GPR32, i32> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare,
+                         GPR32, i32> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare,
+                         GPR32, i32> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare,
+                         GPR64, i64> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]",
+                    (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]",
+                    (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>;
+
+class Neon_UMOV_pattern <ValueType StTy, ValueType NaTy, ValueType ResTy,
+                         Operand StImm,  Operand NaImm,
+                         Instruction SMOVI>
+  : Pat<(ResTy (vector_extract
+          (NaTy VPR64:$Rn), NaImm:$Imm)),
+        (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+          NaImm:$Imm)>;
+
+def : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
+                        neon_uimm3_bare, UMOVwb>;
+def : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
+                        neon_uimm2_bare, UMOVwh>;
+def : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+                        neon_uimm1_bare, UMOVws>;
+
+def : Pat<(i32 (and
+            (i32 (vector_extract
+              (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))),
+            255)),
+          (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>;
+
+def : Pat<(i32 (and
+            (i32 (vector_extract
+              (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))),
+            65535)),
+          (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>;
+
+def : Pat<(i64 (zext
+            (i32 (vector_extract
+              (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))),
+          (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>;
+
+def : Pat<(i32 (and
+            (i32 (vector_extract
+              (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))),
+            255)),
+          (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
+            neon_uimm3_bare:$Imm)>;
+
+def : Pat<(i32 (and
+            (i32 (vector_extract
+              (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))),
+            65535)),
+          (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
+            neon_uimm2_bare:$Imm)>;
+
+def : Pat<(i64 (zext
+            (i32 (vector_extract
+              (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))),
+          (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
+            neon_uimm0_bare:$Imm)>;
+
+// Additional copy patterns for scalar types
+def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))),
+          (UMOVwb (v16i8
+            (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>;
+
+def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))),
+          (UMOVwh (v8i16
+            (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>;
+
+def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))),
+          (FMOVws FPR32:$Rn)>;
+
+def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))),
+          (FMOVxd FPR64:$Rn)>;
+
+def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))),
+          (f64 FPR64:$Rn)>;
+
+def : Pat<(f32 (vector_extract (v1f32 FPR32:$Rn), (i64 0))),
+          (f32 FPR32:$Rn)>;
+
+def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)),
+          (v1i8 (EXTRACT_SUBREG (v16i8
+            (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))),
+            sub_8))>;
+
+def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)),
+          (v1i16 (EXTRACT_SUBREG (v8i16
+            (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))),
+            sub_16))>;
+
+def : Pat<(v1i32 (scalar_to_vector GPR32:$src)),
+          (FMOVsw $src)>;
+
+def : Pat<(v1i64 (scalar_to_vector GPR64:$src)),
+          (FMOVdx $src)>;
+
+def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))),
+          (v1f32 FPR32:$Rn)>;
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))),
+          (v1f64 FPR64:$Rn)>;
+
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
+          (FMOVdd $src)>;
+
+def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)),
+                         (f64 FPR64:$src), sub_64)>;
+
+class NeonI_DUP_Elt<bit Q, string asmop, string rdlane,  string rnlane,
+                    RegisterOperand ResVPR, Operand OpImm>
+  : NeonI_copy<Q, 0b0, 0b0000, (outs ResVPR:$Rd),
+               (ins VPR128:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]",
+               [],
+               NoItinerary> {
+  bits<4> Imm;
+}
+
+def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128,
+                              neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128,
+                              neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128,
+                              neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128,
+                              neon_uimm1_bare> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64,
+                              neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64,
+                              neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64,
+                              neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+multiclass NeonI_DUP_Elt_pattern<Instruction DUPELT, ValueType ResTy,
+                                       ValueType OpTy,ValueType NaTy,
+                                       ValueType ExTy, Operand OpLImm,
+                                       Operand OpNImm> {
+def  : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)),
+        (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>;
+
+def : Pat<(ResTy (Neon_vduplane
+            (NaTy VPR64:$Rn), OpNImm:$Imm)),
+          (ResTy (DUPELT
+            (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>;
+}
+defm : NeonI_DUP_Elt_pattern<DUPELT16b, v16i8, v16i8, v8i8, v16i8,
+                             neon_uimm4_bare, neon_uimm3_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT8b, v8i8, v16i8, v8i8, v16i8,
+                             neon_uimm4_bare, neon_uimm3_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT8h, v8i16, v8i16, v4i16, v8i16,
+                             neon_uimm3_bare, neon_uimm2_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4h, v4i16, v8i16, v4i16, v8i16,
+                             neon_uimm3_bare, neon_uimm2_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4i32, v4i32, v2i32, v4i32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2i32, v4i32, v2i32, v4i32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2i64, v2i64, v1i64, v2i64,
+                             neon_uimm1_bare, neon_uimm0_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4f32, v4f32, v2f32, v4f32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2f32, v4f32, v2f32, v4f32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2f64, v2f64, v1f64, v2f64,
+                             neon_uimm1_bare, neon_uimm0_bare>;
+
+def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))),
+          (v2f32 (DUPELT2s
+            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+            (i64 0)))>;
+def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))),
+          (v4f32 (DUPELT4s
+            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+            (i64 0)))>;
+def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))),
+          (v2f64 (DUPELT2d
+            (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64),
+            (i64 0)))>;
+
+class NeonI_DUP<bit Q, string asmop, string rdlane,
+                RegisterOperand ResVPR, ValueType ResTy,
+                RegisterClass OpGPR, ValueType OpTy>
+  : NeonI_copy<Q, 0b0, 0b0001, (outs ResVPR:$Rd), (ins OpGPR:$Rn),
+               asmop # "\t$Rd" # rdlane # ", $Rn",
+               [(set (ResTy ResVPR:$Rd),
+                 (ResTy (Neon_vdup (OpTy OpGPR:$Rn))))],
+               NoItinerary>;
+
+def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> {
+  let Inst{20-16} = 0b00001;
+  // bits 17-20 are unspecified, but should be set to zero.
+}
+
+def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> {
+  let Inst{20-16} = 0b00010;
+  // bits 18-20 are unspecified, but should be set to zero.
+}
+
+def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> {
+  let Inst{20-16} = 0b00100;
+  // bits 19-20 are unspecified, but should be set to zero.
+}
+
+def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> {
+  let Inst{20-16} = 0b01000;
+  // bit 20 is unspecified, but should be set to zero.
+}
+
+def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> {
+  let Inst{20-16} = 0b00001;
+  // bits 17-20 are unspecified, but should be set to zero.
+}
+
+def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> {
+  let Inst{20-16} = 0b00010;
+  // bits 18-20 are unspecified, but should be set to zero.
+}
+
+def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> {
+  let Inst{20-16} = 0b00100;
+  // bits 19-20 are unspecified, but should be set to zero.
+}
+
+// patterns for CONCAT_VECTORS
+multiclass Concat_Vector_Pattern<ValueType ResTy, ValueType OpTy> {
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)),
+          (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))),
+          (INSELd
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)),
+            (i64 1),
+            (i64 0))>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))),
+          (DUPELT2d
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            (i64 0))> ;
+}
+
+defm : Concat_Vector_Pattern<v16i8, v8i8>;
+defm : Concat_Vector_Pattern<v8i16, v4i16>;
+defm : Concat_Vector_Pattern<v4i32, v2i32>;
+defm : Concat_Vector_Pattern<v2i64, v1i64>;
+defm : Concat_Vector_Pattern<v4f32, v2f32>;
+defm : Concat_Vector_Pattern<v2f64, v1f64>;
+
+//patterns for EXTRACT_SUBVECTOR
+def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))),
+          (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))),
+          (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))),
+          (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))),
+          (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))),
+          (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))),
+          (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+
+// The followings are for instruction class (3V Elem)
+
+// Variant 1
+
+class NI_2VE<bit q, bit u, bits<2> size, bits<4> opcode,
+             string asmop, string ResS, string OpS, string EleOpS,
+             Operand OpImm, RegisterOperand ResVPR,
+             RegisterOperand OpVPR, RegisterOperand EleOpVPR>
+  : NeonI_2VElem<q, u, size, opcode,
+                 (outs ResVPR:$Rd), (ins ResVPR:$src, OpVPR:$Rn,
+                                         EleOpVPR:$Re, OpImm:$Index),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
+                 ", $Re." # EleOpS # "[$Index]",
+                 [],
+                 NoItinerary> {
+  bits<3> Index;
+  bits<5> Re;
+
+  let Constraints = "$src = $Rd";
+}
+
+multiclass NI_2VE_v1<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+                     neon_uimm2_bare, VPR64, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+                     neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+  def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h",
+                     neon_uimm3_bare, VPR64, VPR64, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+
+  def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h",
+                     neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+}
+
+defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">;
+defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">;
+
+// Pattern for lane in 128-bit vector
+class NI_2VE_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+                   RegisterOperand ResVPR, RegisterOperand OpVPR,
+                   RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy,
+                   ValueType EleOpTy>
+  : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn),
+          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VE_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+                  RegisterOperand ResVPR, RegisterOperand OpVPR,
+                  RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy,
+                  ValueType EleOpTy>
+  : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn),
+          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST ResVPR:$src, OpVPR:$Rn,
+          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+multiclass NI_2VE_v1_pat<string subop, SDPatternOperator op>
+{
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
+                     op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>;
+
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
+                     op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>;
+
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare,
+                     op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>;
+
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare,
+                     op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
+                    op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>;
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare,
+                    op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>;
+}
+
+defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>;
+defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>;
+
+class NI_2VE_2op<bit q, bit u, bits<2> size, bits<4> opcode,
+                 string asmop, string ResS, string OpS, string EleOpS,
+                 Operand OpImm, RegisterOperand ResVPR,
+                 RegisterOperand OpVPR, RegisterOperand EleOpVPR>
+  : NeonI_2VElem<q, u, size, opcode,
+                 (outs ResVPR:$Rd), (ins OpVPR:$Rn,
+                                         EleOpVPR:$Re, OpImm:$Index),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
+                 ", $Re." # EleOpS # "[$Index]",
+                 [],
+                 NoItinerary> {
+  bits<3> Index;
+  bits<5> Re;
+}
+
+multiclass NI_2VE_v1_2op<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+                         neon_uimm2_bare, VPR64, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+                         neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+  def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h",
+                         neon_uimm3_bare, VPR64, VPR64, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+
+  def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h",
+                         neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+}
+
+defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">;
+defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">;
+defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">;
+
+// Pattern for lane in 128-bit vector
+class NI_2VE_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+                       RegisterOperand OpVPR, RegisterOperand EleOpVPR,
+                       ValueType ResTy, ValueType OpTy, ValueType EleOpTy>
+  : Pat<(ResTy (op (OpTy OpVPR:$Rn),
+          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VE_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+                      RegisterOperand OpVPR, RegisterOperand EleOpVPR,
+                      ValueType ResTy, ValueType OpTy, ValueType EleOpTy>
+  : Pat<(ResTy (op (OpTy OpVPR:$Rn),
+          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST OpVPR:$Rn,
+          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+multiclass NI_2VE_mul_v1_pat<string subop, SDPatternOperator op> {
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
+                         op, VPR64, VPR128, v2i32, v2i32, v4i32>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
+                         op, VPR128, VPR128, v4i32, v4i32, v4i32>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare,
+                         op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare,
+                         op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
+                        op, VPR64, VPR64, v2i32, v2i32, v2i32>;
+
+  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare,
+                        op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>;
+}
+
+defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>;
+defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>;
+defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>;
+
+// Variant 2
+
+multiclass NI_2VE_v2_2op<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+                         neon_uimm2_bare, VPR64, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+                         neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // _1d2d doesn't exist!
+
+  def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d",
+                         neon_uimm1_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{0}};
+    let Inst{21} = 0b0;
+    let Inst{20-16} = Re;
+  }
+}
+
+defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">;
+defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">;
+
+class NI_2VE_mul_lane_2d<Instruction INST, Operand OpImm, SDPatternOperator op,
+                         RegisterOperand OpVPR, RegisterOperand EleOpVPR,
+                         ValueType ResTy, ValueType OpTy, ValueType EleOpTy,
+                         SDPatternOperator coreop>
+  : Pat<(ResTy (op (OpTy OpVPR:$Rn),
+          (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))),
+        (INST OpVPR:$Rn,
+          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>;
+
+multiclass NI_2VE_mul_v2_pat<string subop, SDPatternOperator op> {
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
+                         op, VPR64, VPR128, v2f32, v2f32, v4f32>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
+                         op, VPR128, VPR128, v4f32, v4f32, v4f32>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare,
+                         op, VPR128, VPR128, v2f64, v2f64, v2f64>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
+                        op, VPR64, VPR64, v2f32, v2f32, v2f32>;
+
+  def : NI_2VE_mul_lane_2d<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare,
+                           op, VPR128, VPR64, v2f64, v2f64, v1f64,
+                           BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>;
+}
+
+defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>;
+defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>;
+
+def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))),
+                       (v2f32 VPR64:$Rn))),
+          (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))),
+                       (v4f32 VPR128:$Rn))),
+          (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))),
+                       (v2f64 VPR128:$Rn))),
+          (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>;
+
+// The followings are patterns using fma
+// -ffp-contract=fast generates fma
+
+multiclass NI_2VE_v2<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+                     neon_uimm2_bare, VPR64, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+                     neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // _1d2d doesn't exist!
+
+  def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d",
+                     neon_uimm1_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{0}};
+    let Inst{21} = 0b0;
+    let Inst{20-16} = Re;
+  }
+}
+
+defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">;
+defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">;
+
+// Pattern for lane in 128-bit vector
+class NI_2VEswap_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+                       RegisterOperand ResVPR, RegisterOperand OpVPR,
+                       ValueType ResTy, ValueType OpTy,
+                       SDPatternOperator coreop>
+  : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
+                   (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))),
+        (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane 0
+class NI_2VEfma_lane0<Instruction INST, SDPatternOperator op,
+                      RegisterOperand ResVPR, ValueType ResTy>
+  : Pat<(ResTy (op (ResTy ResVPR:$Rn),
+                   (ResTy (Neon_vdup (f32 FPR32:$Re))),
+                   (ResTy ResVPR:$src))),
+        (INST ResVPR:$src, ResVPR:$Rn,
+              (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEswap_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+                      RegisterOperand ResVPR, RegisterOperand OpVPR,
+                      ValueType ResTy, ValueType OpTy,
+                      SDPatternOperator coreop>
+  : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
+                   (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
+        (INST ResVPR:$src, ResVPR:$Rn,
+          (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEswap_lane_2d2d<Instruction INST, Operand OpImm,
+                           SDPatternOperator op,
+                           RegisterOperand ResVPR, RegisterOperand OpVPR,
+                           ValueType ResTy, ValueType OpTy,
+                           SDPatternOperator coreop>
+  : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))),
+                   (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
+        (INST ResVPR:$src, ResVPR:$Rn,
+          (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>;
+
+
+multiclass NI_2VE_fma_v2_pat<string subop, SDPatternOperator op> {
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
+                         neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
+                         BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+  def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_2s4s"),
+                        op, VPR64, v2f32>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
+                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
+                         BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+  def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_4s4s"),
+                        op, VPR128, v4f32>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
+                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
+                         BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
+                        neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
+                        BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+  def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
+                             neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
+                             BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>;
+}
+
+defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>;
+
+// Pattern for lane 0
+class NI_2VEfms_lane0<Instruction INST, SDPatternOperator op,
+                      RegisterOperand ResVPR, ValueType ResTy>
+  : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)),
+                   (ResTy (Neon_vdup (f32 FPR32:$Re))),
+                   (ResTy ResVPR:$src))),
+        (INST ResVPR:$src, ResVPR:$Rn,
+              (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+multiclass NI_2VE_fms_v2_pat<string subop, SDPatternOperator op>
+{
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
+                         neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
+                         BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
+                         neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
+                         BinOpFrag<(Neon_vduplane
+                                     (fneg node:$LHS), node:$RHS)>>;
+
+  def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_2s4s"),
+                        op, VPR64, v2f32>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
+                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
+                         BinOpFrag<(fneg (Neon_vduplane
+                                     node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
+                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
+                         BinOpFrag<(Neon_vduplane
+                                     (fneg node:$LHS), node:$RHS)>>;
+
+  def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_4s4s"),
+                        op, VPR128, v4f32>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
+                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
+                         BinOpFrag<(fneg (Neon_vduplane
+                                     node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
+                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
+                         BinOpFrag<(Neon_vduplane
+                                     (fneg node:$LHS), node:$RHS)>>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
+                        neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
+                        BinOpFrag<(fneg (Neon_vduplane
+                                    node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
+                        neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
+                        BinOpFrag<(Neon_vduplane
+                                    (fneg node:$LHS), node:$RHS)>>;
+
+  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"),
+                        neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32,
+                        BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"),
+                        neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32,
+                        BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>;
+
+  def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
+                             neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
+                             BinOpFrag<(fneg (Neon_combine_2d
+                                         node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
+                             neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
+                             BinOpFrag<(Neon_combine_2d
+                                         (fneg node:$LHS), (fneg node:$RHS))>>;
+}
+
+defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>;
+
+// Variant 3: Long type
+// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S
+//      SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S
+
+multiclass NI_2VE_v3<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s",
+                     neon_uimm2_bare, VPR128, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s",
+                     neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+  def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h",
+                     neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+
+  def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h",
+                     neon_uimm3_bare, VPR128, VPR64, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+}
+
+defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">;
+defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">;
+defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">;
+defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">;
+defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">;
+defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">;
+
+multiclass NI_2VE_v3_2op<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s",
+                         neon_uimm2_bare, VPR128, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s",
+                         neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+  def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h",
+                         neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+
+  def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h",
+                         neon_uimm3_bare, VPR128, VPR64, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+}
+
+defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">;
+defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">;
+defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">;
+
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
+          (FMOVdd $src)>;
+def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))),
+          (FMOVss $src)>;
+
+// Pattern for lane in 128-bit vector
+class NI_2VEL2_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+                     RegisterOperand EleOpVPR, ValueType ResTy,
+                     ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+                     SDPatternOperator hiop>
+  : Pat<(ResTy (op (ResTy VPR128:$src),
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vduplane
+                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEL2_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+                    RegisterOperand EleOpVPR, ValueType ResTy,
+                    ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+                    SDPatternOperator hiop>
+  : Pat<(ResTy (op (ResTy VPR128:$src),
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vduplane
+                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST VPR128:$src, VPR128:$Rn,
+          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+class NI_2VEL2_lane0<Instruction INST, SDPatternOperator op,
+                     ValueType ResTy, ValueType OpTy, ValueType HalfOpTy,
+                     SDPatternOperator hiop, Instruction DupInst>
+  : Pat<(ResTy (op (ResTy VPR128:$src),
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))),
+        (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>;
+
+multiclass NI_2VEL_v3_pat<string subop, SDPatternOperator op> {
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
+                     op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>;
+
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
+                     op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>;
+
+  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
+                       op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
+                       op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
+
+  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"),
+                       op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
+
+  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"),
+                       op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
+                    op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>;
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
+                    op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>;
+
+  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
+                      op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
+                      op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
+}
+
+defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>;
+defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>;
+defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>;
+defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>;
+
+// Pattern for lane in 128-bit vector
+class NI_2VEL2_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+                         RegisterOperand EleOpVPR, ValueType ResTy,
+                         ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+                         SDPatternOperator hiop>
+  : Pat<(ResTy (op
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vduplane
+                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEL2_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+                        RegisterOperand EleOpVPR, ValueType ResTy,
+                        ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+                        SDPatternOperator hiop>
+  : Pat<(ResTy (op
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vduplane
+                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST VPR128:$Rn,
+          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+// Pattern for fixed lane 0
+class NI_2VEL2_mul_lane0<Instruction INST, SDPatternOperator op,
+                         ValueType ResTy, ValueType OpTy, ValueType HalfOpTy,
+                         SDPatternOperator hiop, Instruction DupInst>
+  : Pat<(ResTy (op
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))),
+        (INST VPR128:$Rn, (DupInst $Re), 0)>;
+
+multiclass NI_2VEL_mul_v3_pat<string subop, SDPatternOperator op> {
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
+                         op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
+                         op, VPR64, VPR128, v2i64, v2i32, v4i32>;
+
+  def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
+                         op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
+                           op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
+
+  def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_4s8h"),
+                           op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
+
+  def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_2d4s"),
+                           op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
+                        op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>;
+
+  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
+                        op, VPR64, VPR64, v2i64, v2i32, v2i32>;
+
+  def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
+                          op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
+                          op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
+}
+
+defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>;
+defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>;
+defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>;
+
+multiclass NI_qdma<SDPatternOperator op> {
+  def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+                    (op node:$Ra,
+                      (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>;
+
+  def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+                    (op node:$Ra,
+                      (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>;
+}
+
+defm Neon_qdmlal : NI_qdma<int_arm_neon_vqadds>;
+defm Neon_qdmlsl : NI_qdma<int_arm_neon_vqsubs>;
+
+multiclass NI_2VEL_v3_qdma_pat<string subop, string op> {
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
+                     !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR128Lo,
+                     v4i32, v4i16, v8i16>;
+
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
+                     !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR128,
+                     v2i64, v2i32, v4i32>;
+
+  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
+                       !cast<PatFrag>(op # "_4s"), VPR128Lo,
+                       v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
+                       !cast<PatFrag>(op # "_2d"), VPR128,
+                       v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
+
+  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"),
+                       !cast<PatFrag>(op # "_4s"),
+                       v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
+
+  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"),
+                       !cast<PatFrag>(op # "_2d"),
+                       v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
+                    !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR64Lo,
+                    v4i32, v4i16, v4i16>;
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
+                    !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR64,
+                    v2i64, v2i32, v2i32>;
+
+  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
+                      !cast<PatFrag>(op # "_4s"), VPR64Lo,
+                      v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
+                      !cast<PatFrag>(op # "_2d"), VPR64,
+                      v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
+}
+
+defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">;
+defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">;
+
+// End of implementation for instruction class (3V Elem)
+
+class NeonI_REV<string asmop, string Res, bits<2> size, bit Q, bit U,
+                bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy,
+                SDPatternOperator Neon_Rev>
+  : NeonI_2VMisc<Q, U, size, opcode,
+               (outs ResVPR:$Rd), (ins ResVPR:$Rn),
+               asmop # "\t$Rd." # Res # ", $Rn." # Res,
+               [(set (ResTy ResVPR:$Rd),
+                  (ResTy (Neon_Rev (ResTy ResVPR:$Rn))))],
+               NoItinerary> ;
+
+def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128,
+                          v16i8, Neon_rev64>;
+def REV64_8h : NeonI_REV<"rev64", "8h", 0b01, 0b1, 0b0, 0b00000, VPR128,
+                         v8i16, Neon_rev64>;
+def REV64_4s : NeonI_REV<"rev64", "4s", 0b10, 0b1, 0b0, 0b00000, VPR128,
+                         v4i32, Neon_rev64>;
+def REV64_8b : NeonI_REV<"rev64", "8b", 0b00, 0b0, 0b0, 0b00000, VPR64,
+                         v8i8, Neon_rev64>;
+def REV64_4h : NeonI_REV<"rev64", "4h", 0b01, 0b0, 0b0, 0b00000, VPR64,
+                         v4i16, Neon_rev64>;
+def REV64_2s : NeonI_REV<"rev64", "2s", 0b10, 0b0, 0b0, 0b00000, VPR64,
+                         v2i32, Neon_rev64>;
+
+def : Pat<(v4f32 (Neon_rev64 (v4f32 VPR128:$Rn))), (REV64_4s VPR128:$Rn)>;
+def : Pat<(v2f32 (Neon_rev64 (v2f32 VPR64:$Rn))), (REV64_2s VPR64:$Rn)>;
+
+def REV32_16b : NeonI_REV<"rev32", "16b", 0b00, 0b1, 0b1, 0b00000, VPR128,
+                          v16i8, Neon_rev32>;
+def REV32_8h : NeonI_REV<"rev32", "8h", 0b01, 0b1, 0b1, 0b00000, VPR128,
+                          v8i16, Neon_rev32>;
+def REV32_8b : NeonI_REV<"rev32", "8b", 0b00, 0b0, 0b1, 0b00000, VPR64,
+                         v8i8, Neon_rev32>;
+def REV32_4h : NeonI_REV<"rev32", "4h", 0b01, 0b0, 0b1, 0b00000, VPR64,
+                         v4i16, Neon_rev32>;
+
+def REV16_16b : NeonI_REV<"rev16", "16b", 0b00, 0b1, 0b0, 0b00001, VPR128,
+                          v16i8, Neon_rev16>;
+def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64,
+                         v8i8, Neon_rev16>;
+
+multiclass NeonI_PairwiseAdd<string asmop, bit U, bits<5> opcode,
+                             SDPatternOperator Neon_Padd> {
+  def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                           (outs VPR128:$Rd), (ins VPR128:$Rn),
+                           asmop # "\t$Rd.8h, $Rn.16b",
+                           [(set (v8i16 VPR128:$Rd),
+                              (v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))],
+                           NoItinerary>;
+
+  def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$Rn),
+                          asmop # "\t$Rd.4h, $Rn.8b",
+                          [(set (v4i16 VPR64:$Rd),
+                             (v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))],
+                          NoItinerary>;
+
+  def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                           (outs VPR128:$Rd), (ins VPR128:$Rn),
+                           asmop # "\t$Rd.4s, $Rn.8h",
+                           [(set (v4i32 VPR128:$Rd),
+                              (v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))],
+                           NoItinerary>;
+
+  def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$Rn),
+                          asmop # "\t$Rd.2s, $Rn.4h",
+                          [(set (v2i32 VPR64:$Rd),
+                             (v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))],
+                          NoItinerary>;
+
+  def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                           (outs VPR128:$Rd), (ins VPR128:$Rn),
+                           asmop # "\t$Rd.2d, $Rn.4s",
+                           [(set (v2i64 VPR128:$Rd),
+                              (v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))],
+                           NoItinerary>;
+
+  def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$Rn),
+                          asmop # "\t$Rd.1d, $Rn.2s",
+                          [(set (v1i64 VPR64:$Rd),
+                             (v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))],
+                          NoItinerary>;
+}
+
+defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010,
+                                int_arm_neon_vpaddls>;
+defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010,
+                                int_arm_neon_vpaddlu>;
+
+multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
+                             SDPatternOperator Neon_Padd> {
+  let Constraints = "$src = $Rd" in {
+    def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                             (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                             asmop # "\t$Rd.8h, $Rn.16b",
+                             [(set (v8i16 VPR128:$Rd),
+                                (v8i16 (Neon_Padd
+                                  (v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))],
+                             NoItinerary>;
+
+    def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                            (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                            asmop # "\t$Rd.4h, $Rn.8b",
+                            [(set (v4i16 VPR64:$Rd),
+                               (v4i16 (Neon_Padd
+                                 (v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))],
+                            NoItinerary>;
+
+    def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "\t$Rd.4s, $Rn.8h",
+                            [(set (v4i32 VPR128:$Rd),
+                               (v4i32 (Neon_Padd
+                                 (v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))],
+                            NoItinerary>;
+
+    def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                            (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                            asmop # "\t$Rd.2s, $Rn.4h",
+                            [(set (v2i32 VPR64:$Rd),
+                               (v2i32 (Neon_Padd
+                                 (v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))],
+                            NoItinerary>;
+
+    def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "\t$Rd.2d, $Rn.4s",
+                            [(set (v2i64 VPR128:$Rd),
+                               (v2i64 (Neon_Padd
+                                 (v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))],
+                            NoItinerary>;
+
+    def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                            (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                            asmop # "\t$Rd.1d, $Rn.2s",
+                            [(set (v1i64 VPR64:$Rd),
+                               (v1i64 (Neon_Padd
+                                 (v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))],
+                            NoItinerary>;
+  }
+}
+
+defm SADALP : NeonI_PairwiseAddAcc<"sadalp", 0b0, 0b00110,
+                                   int_arm_neon_vpadals>;
+defm UADALP : NeonI_PairwiseAddAcc<"uadalp", 0b1, 0b00110,
+                                   int_arm_neon_vpadalu>;
+
+multiclass NeonI_2VMisc_BHSDsize_1Arg<string asmop, bit U, bits<5> opcode> {
+  def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                         (outs VPR128:$Rd), (ins VPR128:$Rn),
+                         asmop # "\t$Rd.16b, $Rn.16b",
+                         [], NoItinerary>;
+
+  def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.8h, $Rn.8h",
+                        [], NoItinerary>;
+
+  def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [], NoItinerary>;
+
+  def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.2d, $Rn.2d",
+                        [], NoItinerary>;
+
+  def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                         (outs VPR64:$Rd), (ins VPR64:$Rn),
+                         asmop # "\t$Rd.8b, $Rn.8b",
+                         [], NoItinerary>;
+
+  def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.4h, $Rn.4h",
+                        [], NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [], NoItinerary>;
+}
+
+defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>;
+defm SQNEG : NeonI_2VMisc_BHSDsize_1Arg<"sqneg", 0b1, 0b00111>;
+defm ABS : NeonI_2VMisc_BHSDsize_1Arg<"abs", 0b0, 0b01011>;
+defm NEG : NeonI_2VMisc_BHSDsize_1Arg<"neg", 0b1, 0b01011>;
+
+multiclass NeonI_2VMisc_BHSD_1Arg_Pattern<string Prefix,
+                                          SDPatternOperator Neon_Op> {
+  def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$Rn))),
+            (v16i8 (!cast<Instruction>(Prefix # 16b) (v16i8 VPR128:$Rn)))>;
+
+  def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$Rn))),
+            (v8i16 (!cast<Instruction>(Prefix # 8h) (v8i16 VPR128:$Rn)))>;
+
+  def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$Rn))),
+            (v4i32 (!cast<Instruction>(Prefix # 4s) (v4i32 VPR128:$Rn)))>;
+
+  def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$Rn))),
+            (v2i64 (!cast<Instruction>(Prefix # 2d) (v2i64 VPR128:$Rn)))>;
+
+  def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$Rn))),
+            (v8i8 (!cast<Instruction>(Prefix # 8b) (v8i8 VPR64:$Rn)))>;
+
+  def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$Rn))),
+            (v4i16 (!cast<Instruction>(Prefix # 4h) (v4i16 VPR64:$Rn)))>;
+
+  def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$Rn))),
+            (v2i32 (!cast<Instruction>(Prefix # 2s) (v2i32 VPR64:$Rn)))>;
+}
+
+defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQABS", int_arm_neon_vqabs>;
+defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQNEG", int_arm_neon_vqneg>;
+defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"ABS", int_arm_neon_vabs>;
+
+def : Pat<(v16i8 (sub
+            (v16i8 Neon_AllZero),
+            (v16i8 VPR128:$Rn))),
+          (v16i8 (NEG16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (sub
+            (v8i8 Neon_AllZero),
+            (v8i8 VPR64:$Rn))),
+          (v8i8 (NEG8b (v8i8 VPR64:$Rn)))>;
+def : Pat<(v8i16 (sub
+            (v8i16 (bitconvert (v16i8 Neon_AllZero))),
+            (v8i16 VPR128:$Rn))),
+          (v8i16 (NEG8h (v8i16 VPR128:$Rn)))>;
+def : Pat<(v4i16 (sub
+            (v4i16 (bitconvert (v8i8 Neon_AllZero))),
+            (v4i16 VPR64:$Rn))),
+          (v4i16 (NEG4h (v4i16 VPR64:$Rn)))>;
+def : Pat<(v4i32 (sub
+            (v4i32 (bitconvert (v16i8 Neon_AllZero))),
+            (v4i32 VPR128:$Rn))),
+          (v4i32 (NEG4s (v4i32 VPR128:$Rn)))>;
+def : Pat<(v2i32 (sub
+            (v2i32 (bitconvert (v8i8 Neon_AllZero))),
+            (v2i32 VPR64:$Rn))),
+          (v2i32 (NEG2s (v2i32 VPR64:$Rn)))>;
+def : Pat<(v2i64 (sub
+            (v2i64 (bitconvert (v16i8 Neon_AllZero))),
+            (v2i64 VPR128:$Rn))),
+          (v2i64 (NEG2d (v2i64 VPR128:$Rn)))>;
+
+multiclass NeonI_2VMisc_BHSDsize_2Args<string asmop, bit U, bits<5> opcode> {
+  let Constraints = "$src = $Rd" in {
+    def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                           (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                           asmop # "\t$Rd.16b, $Rn.16b",
+                           [], NoItinerary>;
+
+    def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                          asmop # "\t$Rd.8h, $Rn.8h",
+                          [], NoItinerary>;
+
+    def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                          asmop # "\t$Rd.4s, $Rn.4s",
+                          [], NoItinerary>;
+
+    def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                          asmop # "\t$Rd.2d, $Rn.2d",
+                          [], NoItinerary>;
+
+    def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                          asmop # "\t$Rd.8b, $Rn.8b",
+                          [], NoItinerary>;
+
+    def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                          asmop # "\t$Rd.4h, $Rn.4h",
+                          [], NoItinerary>;
+
+    def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                          asmop # "\t$Rd.2s, $Rn.2s",
+                          [], NoItinerary>;
+  }
+}
+
+defm SUQADD : NeonI_2VMisc_BHSDsize_2Args<"suqadd", 0b0, 0b00011>;
+defm USQADD : NeonI_2VMisc_BHSDsize_2Args<"usqadd", 0b1, 0b00011>;
+
+multiclass NeonI_2VMisc_BHSD_2Args_Pattern<string Prefix,
+                                           SDPatternOperator Neon_Op> {
+  def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$src), (v16i8 VPR128:$Rn))),
+            (v16i8 (!cast<Instruction>(Prefix # 16b)
+              (v16i8 VPR128:$src), (v16i8 VPR128:$Rn)))>;
+
+  def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$src), (v8i16 VPR128:$Rn))),
+            (v8i16 (!cast<Instruction>(Prefix # 8h)
+              (v8i16 VPR128:$src), (v8i16 VPR128:$Rn)))>;
+
+  def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$src), (v4i32 VPR128:$Rn))),
+            (v4i32 (!cast<Instruction>(Prefix # 4s)
+              (v4i32 VPR128:$src), (v4i32 VPR128:$Rn)))>;
+
+  def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$src), (v2i64 VPR128:$Rn))),
+            (v2i64 (!cast<Instruction>(Prefix # 2d)
+              (v2i64 VPR128:$src), (v2i64 VPR128:$Rn)))>;
+
+  def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$src), (v8i8 VPR64:$Rn))),
+            (v8i8 (!cast<Instruction>(Prefix # 8b)
+              (v8i8 VPR64:$src), (v8i8 VPR64:$Rn)))>;
+
+  def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$src), (v4i16 VPR64:$Rn))),
+            (v4i16 (!cast<Instruction>(Prefix # 4h)
+              (v4i16 VPR64:$src), (v4i16 VPR64:$Rn)))>;
+
+  def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$src), (v2i32 VPR64:$Rn))),
+            (v2i32 (!cast<Instruction>(Prefix # 2s)
+              (v2i32 VPR64:$src), (v2i32 VPR64:$Rn)))>;
+}
+
+defm : NeonI_2VMisc_BHSD_2Args_Pattern<"SUQADD", int_aarch64_neon_suqadd>;
+defm : NeonI_2VMisc_BHSD_2Args_Pattern<"USQADD", int_aarch64_neon_usqadd>;
+
+multiclass NeonI_2VMisc_BHSsizes<string asmop, bit U,
+                          SDPatternOperator Neon_Op> {
+  def 16b : NeonI_2VMisc<0b1, U, 0b00, 0b00100,
+                         (outs VPR128:$Rd), (ins VPR128:$Rn),
+                         asmop # "\t$Rd.16b, $Rn.16b",
+                         [(set (v16i8 VPR128:$Rd),
+                            (v16i8 (Neon_Op (v16i8 VPR128:$Rn))))],
+                         NoItinerary>;
+
+  def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.8h, $Rn.8h",
+                        [(set (v8i16 VPR128:$Rd),
+                           (v8i16 (Neon_Op (v8i16 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [(set (v4i32 VPR128:$Rd),
+                           (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.8b, $Rn.8b",
+                        [(set (v8i8 VPR64:$Rd),
+                           (v8i8 (Neon_Op (v8i8 VPR64:$Rn))))],
+                        NoItinerary>;
+
+  def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.4h, $Rn.4h",
+                        [(set (v4i16 VPR64:$Rd),
+                           (v4i16 (Neon_Op (v4i16 VPR64:$Rn))))],
+                        NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [(set (v2i32 VPR64:$Rd),
+                           (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
+                        NoItinerary>;
+}
+
+defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>;
+defm CLZ : NeonI_2VMisc_BHSsizes<"clz", 0b1, ctlz>;
+
+multiclass NeonI_2VMisc_Bsize<string asmop, bit U, bits<2> size,
+                              bits<5> Opcode> {
+  def 16b : NeonI_2VMisc<0b1, U, size, Opcode,
+                         (outs VPR128:$Rd), (ins VPR128:$Rn),
+                         asmop # "\t$Rd.16b, $Rn.16b",
+                         [], NoItinerary>;
+
+  def 8b : NeonI_2VMisc<0b0, U, size, Opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.8b, $Rn.8b",
+                        [], NoItinerary>;
+}
+
+defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>;
+defm NOT : NeonI_2VMisc_Bsize<"not", 0b1, 0b00, 0b00101>;
+defm RBIT : NeonI_2VMisc_Bsize<"rbit", 0b1, 0b01, 0b00101>;
+
+def : NeonInstAlias<"mvn $Rd.16b, $Rn.16b",
+                    (NOT16b VPR128:$Rd, VPR128:$Rn), 0>;
+def : NeonInstAlias<"mvn $Rd.8b, $Rn.8b",
+                    (NOT8b VPR64:$Rd, VPR64:$Rn), 0>;
+
+def : Pat<(v16i8 (ctpop (v16i8 VPR128:$Rn))),
+          (v16i8 (CNT16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (ctpop (v8i8 VPR64:$Rn))),
+          (v8i8 (CNT8b (v8i8 VPR64:$Rn)))>;
+
+def : Pat<(v16i8 (xor
+            (v16i8 VPR128:$Rn),
+            (v16i8 Neon_AllOne))),
+          (v16i8 (NOT16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (xor
+            (v8i8 VPR64:$Rn),
+            (v8i8 Neon_AllOne))),
+          (v8i8 (NOT8b (v8i8 VPR64:$Rn)))>;
+def : Pat<(v8i16 (xor
+            (v8i16 VPR128:$Rn),
+            (v8i16 (bitconvert (v16i8 Neon_AllOne))))),
+          (NOT16b VPR128:$Rn)>;
+def : Pat<(v4i16 (xor
+            (v4i16 VPR64:$Rn),
+            (v4i16 (bitconvert (v8i8 Neon_AllOne))))),
+          (NOT8b VPR64:$Rn)>;
+def : Pat<(v4i32 (xor
+            (v4i32 VPR128:$Rn),
+            (v4i32 (bitconvert (v16i8 Neon_AllOne))))),
+          (NOT16b VPR128:$Rn)>;
+def : Pat<(v2i32 (xor
+            (v2i32 VPR64:$Rn),
+            (v2i32 (bitconvert (v8i8 Neon_AllOne))))),
+          (NOT8b VPR64:$Rn)>;
+def : Pat<(v2i64 (xor
+            (v2i64 VPR128:$Rn),
+            (v2i64 (bitconvert (v16i8 Neon_AllOne))))),
+          (NOT16b VPR128:$Rn)>;
+
+def : Pat<(v16i8 (int_aarch64_neon_rbit (v16i8 VPR128:$Rn))),
+          (v16i8 (RBIT16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (int_aarch64_neon_rbit (v8i8 VPR64:$Rn))),
+          (v8i8 (RBIT8b (v8i8 VPR64:$Rn)))>;
+
+multiclass NeonI_2VMisc_SDsizes<string asmop, bit U, bits<5> opcode,
+                                SDPatternOperator Neon_Op> {
+  def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [(set (v4f32 VPR128:$Rd),
+                           (v4f32 (Neon_Op (v4f32 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.2d, $Rn.2d",
+                        [(set (v2f64 VPR128:$Rd),
+                           (v2f64 (Neon_Op (v2f64 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [(set (v2f32 VPR64:$Rd),
+                           (v2f32 (Neon_Op (v2f32 VPR64:$Rn))))],
+                        NoItinerary>;
+}
+
+defm FABS : NeonI_2VMisc_SDsizes<"fabs", 0b0, 0b01111, fabs>;
+defm FNEG : NeonI_2VMisc_SDsizes<"fneg", 0b1, 0b01111, fneg>;
+
+multiclass NeonI_2VMisc_HSD_Narrow<string asmop, bit U, bits<5> opcode> {
+  def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.8b, $Rn.8h",
+                          [], NoItinerary>;
+
+  def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.4h, $Rn.4s",
+                          [], NoItinerary>;
+
+  def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.2s, $Rn.2d",
+                          [], NoItinerary>;
+
+  let Constraints = "$Rd = $src" in {
+    def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                             (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                             asmop # "2\t$Rd.16b, $Rn.8h",
+                             [], NoItinerary>;
+
+    def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "2\t$Rd.8h, $Rn.4s",
+                            [], NoItinerary>;
+
+    def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "2\t$Rd.4s, $Rn.2d",
+                            [], NoItinerary>;
+  }
+}
+
+defm XTN : NeonI_2VMisc_HSD_Narrow<"xtn", 0b0, 0b10010>;
+defm SQXTUN : NeonI_2VMisc_HSD_Narrow<"sqxtun", 0b1, 0b10010>;
+defm SQXTN : NeonI_2VMisc_HSD_Narrow<"sqxtn", 0b0, 0b10100>;
+defm UQXTN : NeonI_2VMisc_HSD_Narrow<"uqxtn", 0b1, 0b10100>;
+
+multiclass NeonI_2VMisc_Narrow_Patterns<string Prefix,
+                                        SDPatternOperator Neon_Op> {
+  def : Pat<(v8i8 (Neon_Op (v8i16 VPR128:$Rn))),
+            (v8i8 (!cast<Instruction>(Prefix # 8h8b) (v8i16 VPR128:$Rn)))>;
+
+  def : Pat<(v4i16 (Neon_Op (v4i32 VPR128:$Rn))),
+            (v4i16 (!cast<Instruction>(Prefix # 4s4h) (v4i32 VPR128:$Rn)))>;
+
+  def : Pat<(v2i32 (Neon_Op (v2i64 VPR128:$Rn))),
+            (v2i32 (!cast<Instruction>(Prefix # 2d2s) (v2i64 VPR128:$Rn)))>;
+
+  def : Pat<(v16i8 (concat_vectors
+              (v8i8 VPR64:$src),
+              (v8i8 (Neon_Op (v8i16 VPR128:$Rn))))),
+            (!cast<Instruction>(Prefix # 8h16b)
+              (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
+              VPR128:$Rn)>;
+
+  def : Pat<(v8i16 (concat_vectors
+              (v4i16 VPR64:$src),
+              (v4i16 (Neon_Op (v4i32 VPR128:$Rn))))),
+            (!cast<Instruction>(Prefix # 4s8h)
+              (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
+              VPR128:$Rn)>;
+
+  def : Pat<(v4i32 (concat_vectors
+              (v2i32 VPR64:$src),
+              (v2i32 (Neon_Op (v2i64 VPR128:$Rn))))),
+            (!cast<Instruction>(Prefix # 2d4s)
+              (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
+              VPR128:$Rn)>;
+}
+
+defm : NeonI_2VMisc_Narrow_Patterns<"XTN", trunc>;
+defm : NeonI_2VMisc_Narrow_Patterns<"SQXTUN", int_arm_neon_vqmovnsu>;
+defm : NeonI_2VMisc_Narrow_Patterns<"SQXTN", int_arm_neon_vqmovns>;
+defm : NeonI_2VMisc_Narrow_Patterns<"UQXTN", int_arm_neon_vqmovnu>;
+
+multiclass NeonI_2VMisc_SHIFT<string asmop, bit U, bits<5> opcode> {
+  let DecoderMethod = "DecodeSHLLInstruction" in {
+    def 8b8h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR64:$Rn, uimm_exact8:$Imm),
+                            asmop # "\t$Rd.8h, $Rn.8b, $Imm",
+                            [], NoItinerary>;
+
+    def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR64:$Rn, uimm_exact16:$Imm),
+                            asmop # "\t$Rd.4s, $Rn.4h, $Imm",
+                            [], NoItinerary>;
+
+    def 2s2d : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR64:$Rn, uimm_exact32:$Imm),
+                            asmop # "\t$Rd.2d, $Rn.2s, $Imm",
+                            [], NoItinerary>;
+
+    def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR128:$Rn, uimm_exact8:$Imm),
+                            asmop # "2\t$Rd.8h, $Rn.16b, $Imm",
+                            [], NoItinerary>;
+
+    def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR128:$Rn, uimm_exact16:$Imm),
+                            asmop # "2\t$Rd.4s, $Rn.8h, $Imm",
+                            [], NoItinerary>;
+
+    def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR128:$Rn, uimm_exact32:$Imm),
+                            asmop # "2\t$Rd.2d, $Rn.4s, $Imm",
+                            [], NoItinerary>;
+  }
+}
+
+defm SHLL : NeonI_2VMisc_SHIFT<"shll", 0b1, 0b10011>;
+
+class NeonI_SHLL_Patterns<ValueType OpTy, ValueType DesTy,
+                          SDPatternOperator ExtOp, Operand Neon_Imm,
+                          string suffix>
+  : Pat<(DesTy (shl
+          (DesTy (ExtOp (OpTy VPR64:$Rn))),
+            (DesTy (Neon_vdup
+              (i32 Neon_Imm:$Imm))))),
+        (!cast<Instruction>("SHLL" # suffix) VPR64:$Rn, Neon_Imm:$Imm)>;
+
+class NeonI_SHLL_High_Patterns<ValueType OpTy, ValueType DesTy,
+                               SDPatternOperator ExtOp, Operand Neon_Imm,
+                               string suffix, PatFrag GetHigh>
+  : Pat<(DesTy (shl
+          (DesTy (ExtOp
+            (OpTy (GetHigh VPR128:$Rn)))),
+              (DesTy (Neon_vdup
+                (i32 Neon_Imm:$Imm))))),
+        (!cast<Instruction>("SHLL" # suffix) VPR128:$Rn, Neon_Imm:$Imm)>;
+
+def : NeonI_SHLL_Patterns<v8i8, v8i16, zext, uimm_exact8, "8b8h">;
+def : NeonI_SHLL_Patterns<v8i8, v8i16, sext, uimm_exact8, "8b8h">;
+def : NeonI_SHLL_Patterns<v4i16, v4i32, zext, uimm_exact16, "4h4s">;
+def : NeonI_SHLL_Patterns<v4i16, v4i32, sext, uimm_exact16, "4h4s">;
+def : NeonI_SHLL_Patterns<v2i32, v2i64, zext, uimm_exact32, "2s2d">;
+def : NeonI_SHLL_Patterns<v2i32, v2i64, sext, uimm_exact32, "2s2d">;
+def : NeonI_SHLL_High_Patterns<v8i8, v8i16, zext, uimm_exact8, "16b8h",
+                               Neon_High16B>;
+def : NeonI_SHLL_High_Patterns<v8i8, v8i16, sext, uimm_exact8, "16b8h",
+                               Neon_High16B>;
+def : NeonI_SHLL_High_Patterns<v4i16, v4i32, zext, uimm_exact16, "8h4s",
+                               Neon_High8H>;
+def : NeonI_SHLL_High_Patterns<v4i16, v4i32, sext, uimm_exact16, "8h4s",
+                               Neon_High8H>;
+def : NeonI_SHLL_High_Patterns<v2i32, v2i64, zext, uimm_exact32, "4s2d",
+                               Neon_High4S>;
+def : NeonI_SHLL_High_Patterns<v2i32, v2i64, sext, uimm_exact32, "4s2d",
+                               Neon_High4S>;
+
+multiclass NeonI_2VMisc_SD_Narrow<string asmop, bit U, bits<5> opcode> {
+  def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.4h, $Rn.4s",
+                          [], NoItinerary>;
+
+  def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.2s, $Rn.2d",
+                          [], NoItinerary>;
+
+  let Constraints = "$src = $Rd" in {
+    def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "2\t$Rd.8h, $Rn.4s",
+                            [], NoItinerary>;
+
+    def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "2\t$Rd.4s, $Rn.2d",
+                            [], NoItinerary>;
+  }
+}
+
+defm FCVTN : NeonI_2VMisc_SD_Narrow<"fcvtn", 0b0, 0b10110>;
+
+multiclass NeonI_2VMisc_Narrow_Pattern<string prefix,
+                                       SDPatternOperator f32_to_f16_Op,
+                                       SDPatternOperator f64_to_f32_Op> {
+
+  def : Pat<(v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))),
+              (!cast<Instruction>(prefix # "4s4h") (v4f32 VPR128:$Rn))>;
+
+  def : Pat<(v8i16 (concat_vectors
+                (v4i16 VPR64:$src),
+                (v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))))),
+                  (!cast<Instruction>(prefix # "4s8h")
+                    (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
+                    (v4f32 VPR128:$Rn))>;
+
+  def : Pat<(v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))),
+            (!cast<Instruction>(prefix # "2d2s") (v2f64 VPR128:$Rn))>;
+
+  def : Pat<(v4f32 (concat_vectors
+              (v2f32 VPR64:$src),
+              (v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))))),
+                (!cast<Instruction>(prefix # "2d4s")
+                  (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
+                  (v2f64 VPR128:$Rn))>;
+}
+
+defm : NeonI_2VMisc_Narrow_Pattern<"FCVTN", int_arm_neon_vcvtfp2hf, fround>;
+
+multiclass NeonI_2VMisc_D_Narrow<string asmop, string prefix, bit U,
+                                 bits<5> opcode> {
+  def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.2s, $Rn.2d",
+                          [], NoItinerary>;
+
+  def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                          asmop # "2\t$Rd.4s, $Rn.2d",
+                          [], NoItinerary> {
+    let Constraints = "$src = $Rd";
+  }
+
+  def : Pat<(v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))),
+            (!cast<Instruction>(prefix # "2d2s") VPR128:$Rn)>;
+
+  def : Pat<(v4f32 (concat_vectors
+              (v2f32 VPR64:$src),
+              (v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))))),
+            (!cast<Instruction>(prefix # "2d4s")
+               (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
+               VPR128:$Rn)>;
+}
+
+defm FCVTXN : NeonI_2VMisc_D_Narrow<"fcvtxn","FCVTXN", 0b1, 0b10110>;
+
+def Neon_High4Float : PatFrag<(ops node:$in),
+                              (extract_subvector (v4f32 node:$in), (iPTR 2))>;
+
+multiclass NeonI_2VMisc_HS_Extend<string asmop, bit U, bits<5> opcode> {
+  def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                          (outs VPR128:$Rd), (ins VPR64:$Rn),
+                          asmop # "\t$Rd.4s, $Rn.4h",
+                          [], NoItinerary>;
+
+  def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR128:$Rd), (ins VPR64:$Rn),
+                          asmop # "\t$Rd.2d, $Rn.2s",
+                          [], NoItinerary>;
+
+  def 8h4s : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$Rn),
+                          asmop # "2\t$Rd.4s, $Rn.8h",
+                          [], NoItinerary>;
+
+  def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$Rn),
+                          asmop # "2\t$Rd.2d, $Rn.4s",
+                          [], NoItinerary>;
+}
+
+defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>;
+
+multiclass NeonI_2VMisc_Extend_Pattern<string prefix> {
+  def : Pat<(v4f32 (int_arm_neon_vcvthf2fp (v4i16 VPR64:$Rn))),
+            (!cast<Instruction>(prefix # "4h4s") VPR64:$Rn)>;
+
+  def : Pat<(v4f32 (int_arm_neon_vcvthf2fp
+              (v4i16 (Neon_High8H
+                (v8i16 VPR128:$Rn))))),
+            (!cast<Instruction>(prefix # "8h4s") VPR128:$Rn)>;
+
+  def : Pat<(v2f64 (fextend (v2f32 VPR64:$Rn))),
+            (!cast<Instruction>(prefix # "2s2d") VPR64:$Rn)>;
+
+  def : Pat<(v2f64 (fextend
+              (v2f32 (Neon_High4Float
+                (v4f32 VPR128:$Rn))))),
+            (!cast<Instruction>(prefix # "4s2d") VPR128:$Rn)>;
+}
+
+defm : NeonI_2VMisc_Extend_Pattern<"FCVTL">;
+
+multiclass NeonI_2VMisc_SD_Conv<string asmop, bit Size, bit U, bits<5> opcode,
+                                ValueType ResTy4s, ValueType OpTy4s,
+                                ValueType ResTy2d, ValueType OpTy2d,
+                                ValueType ResTy2s, ValueType OpTy2s,
+                                SDPatternOperator Neon_Op> {
+
+  def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [(set (ResTy4s VPR128:$Rd),
+                           (ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.2d, $Rn.2d",
+                        [(set (ResTy2d VPR128:$Rd),
+                           (ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [(set (ResTy2s VPR64:$Rd),
+                           (ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))],
+                        NoItinerary>;
+}
+
+multiclass NeonI_2VMisc_fp_to_int<string asmop, bit Size, bit U,
+                                  bits<5> opcode, SDPatternOperator Neon_Op> {
+  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4i32, v4f32, v2i64,
+                                v2f64, v2i32, v2f32, Neon_Op>;
+}
+
+defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010,
+                                     int_aarch64_neon_fcvtns>;
+defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010,
+                                     int_aarch64_neon_fcvtnu>;
+defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010,
+                                     int_aarch64_neon_fcvtps>;
+defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010,
+                                     int_aarch64_neon_fcvtpu>;
+defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011,
+                                     int_aarch64_neon_fcvtms>;
+defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011,
+                                     int_aarch64_neon_fcvtmu>;
+defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>;
+defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>;
+defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100,
+                                     int_aarch64_neon_fcvtas>;
+defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100,
+                                     int_aarch64_neon_fcvtau>;
+
+multiclass NeonI_2VMisc_int_to_fp<string asmop, bit Size, bit U,
+                                  bits<5> opcode, SDPatternOperator Neon_Op> {
+  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4i32, v2f64,
+                                v2i64, v2f32, v2i32, Neon_Op>;
+}
+
+defm SCVTF : NeonI_2VMisc_int_to_fp<"scvtf", 0b0, 0b0, 0b11101, sint_to_fp>;
+defm UCVTF : NeonI_2VMisc_int_to_fp<"ucvtf", 0b0, 0b1, 0b11101, uint_to_fp>;
+
+multiclass NeonI_2VMisc_fp_to_fp<string asmop, bit Size, bit U,
+                                 bits<5> opcode, SDPatternOperator Neon_Op> {
+  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4f32, v2f64,
+                                v2f64, v2f32, v2f32, Neon_Op>;
+}
+
+defm FRINTN : NeonI_2VMisc_fp_to_fp<"frintn", 0b0, 0b0, 0b11000,
+                                     int_aarch64_neon_frintn>;
+defm FRINTA : NeonI_2VMisc_fp_to_fp<"frinta", 0b0, 0b1, 0b11000, frnd>;
+defm FRINTP : NeonI_2VMisc_fp_to_fp<"frintp", 0b1, 0b0, 0b11000, fceil>;
+defm FRINTM : NeonI_2VMisc_fp_to_fp<"frintm", 0b0, 0b0, 0b11001, ffloor>;
+defm FRINTX : NeonI_2VMisc_fp_to_fp<"frintx", 0b0, 0b1, 0b11001, frint>;
+defm FRINTZ : NeonI_2VMisc_fp_to_fp<"frintz", 0b1, 0b0, 0b11001, ftrunc>;
+defm FRINTI : NeonI_2VMisc_fp_to_fp<"frinti", 0b1, 0b1, 0b11001, fnearbyint>;
+defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101,
+                                    int_arm_neon_vrecpe>;
+defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101,
+                                     int_arm_neon_vrsqrte>;
+defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>;
+
+multiclass NeonI_2VMisc_S_Conv<string asmop, bit Size, bit U,
+                               bits<5> opcode, SDPatternOperator Neon_Op> {
+  def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [(set (v4i32 VPR128:$Rd),
+                           (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [(set (v2i32 VPR64:$Rd),
+                           (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
+                        NoItinerary>;
+}
+
+defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100,
+                                  int_arm_neon_vrecpe>;
+defm URSQRTE : NeonI_2VMisc_S_Conv<"ursqrte", 0b1, 0b1, 0b11100,
+                                   int_arm_neon_vrsqrte>;
+
+// Crypto Class
+class NeonI_Cryptoaes_2v<bits<2> size, bits<5> opcode,
+                         string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_AES<size, opcode,
+                     (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                     asmop # "\t$Rd.16b, $Rn.16b",
+                     [(set (v16i8 VPR128:$Rd),
+                        (v16i8 (opnode (v16i8 VPR128:$src),
+                                       (v16i8 VPR128:$Rn))))],
+                     NoItinerary>{
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def AESE : NeonI_Cryptoaes_2v<0b00, 0b00100, "aese", int_arm_neon_aese>;
+def AESD : NeonI_Cryptoaes_2v<0b00, 0b00101, "aesd", int_arm_neon_aesd>;
+
+class NeonI_Cryptoaes<bits<2> size, bits<5> opcode,
+                      string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_AES<size, opcode,
+                     (outs VPR128:$Rd), (ins VPR128:$Rn),
+                     asmop # "\t$Rd.16b, $Rn.16b",
+                     [(set (v16i8 VPR128:$Rd),
+                        (v16i8 (opnode (v16i8 VPR128:$Rn))))],
+                     NoItinerary>;
+
+def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>;
+def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>;
+
+class NeonI_Cryptosha_vv<bits<2> size, bits<5> opcode,
+                         string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_SHA<size, opcode,
+                     (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                     asmop # "\t$Rd.4s, $Rn.4s",
+                     [(set (v4i32 VPR128:$Rd),
+                        (v4i32 (opnode (v4i32 VPR128:$src),
+                                       (v4i32 VPR128:$Rn))))],
+                     NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1SU1 : NeonI_Cryptosha_vv<0b00, 0b00001, "sha1su1",
+                                 int_arm_neon_sha1su1>;
+def SHA256SU0 : NeonI_Cryptosha_vv<0b00, 0b00010, "sha256su0",
+                                   int_arm_neon_sha256su0>;
+
+class NeonI_Cryptosha_ss<bits<2> size, bits<5> opcode,
+                         string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_SHA<size, opcode,
+                     (outs FPR32:$Rd), (ins FPR32:$Rn),
+                     asmop # "\t$Rd, $Rn",
+                     [(set (v1i32 FPR32:$Rd),
+                        (v1i32 (opnode (v1i32 FPR32:$Rn))))],
+                     NoItinerary> {
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>;
+
+class NeonI_Cryptosha3_vvv<bits<2> size, bits<3> opcode, string asmop,
+                           SDPatternOperator opnode>
+  : NeonI_Crypto_3VSHA<size, opcode,
+                       (outs VPR128:$Rd),
+                       (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
+                       asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
+                       [(set (v4i32 VPR128:$Rd),
+                          (v4i32 (opnode (v4i32 VPR128:$src),
+                                         (v4i32 VPR128:$Rn),
+                                         (v4i32 VPR128:$Rm))))],
+                       NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1SU0 : NeonI_Cryptosha3_vvv<0b00, 0b011, "sha1su0",
+                                   int_arm_neon_sha1su0>;
+def SHA256SU1 : NeonI_Cryptosha3_vvv<0b00, 0b110, "sha256su1",
+                                     int_arm_neon_sha256su1>;
+
+class NeonI_Cryptosha3_qqv<bits<2> size, bits<3> opcode, string asmop,
+                           SDPatternOperator opnode>
+  : NeonI_Crypto_3VSHA<size, opcode,
+                       (outs FPR128:$Rd),
+                       (ins FPR128:$src, FPR128:$Rn, VPR128:$Rm),
+                       asmop # "\t$Rd, $Rn, $Rm.4s",
+                       [(set (v4i32 FPR128:$Rd),
+                          (v4i32 (opnode (v4i32 FPR128:$src),
+                                         (v4i32 FPR128:$Rn),
+                                         (v4i32 VPR128:$Rm))))],
+                       NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h",
+                                   int_arm_neon_sha256h>;
+def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2",
+                                    int_arm_neon_sha256h2>;
+
+class NeonI_Cryptosha3_qsv<bits<2> size, bits<3> opcode, string asmop,
+                           SDPatternOperator opnode>
+  : NeonI_Crypto_3VSHA<size, opcode,
+                       (outs FPR128:$Rd),
+                       (ins FPR128:$src, FPR32:$Rn, VPR128:$Rm),
+                       asmop # "\t$Rd, $Rn, $Rm.4s",
+                       [(set (v4i32 FPR128:$Rd),
+                          (v4i32 (opnode (v4i32 FPR128:$src),
+                                         (v1i32 FPR32:$Rn),
+                                         (v4i32 VPR128:$Rm))))],
+                       NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c", int_aarch64_neon_sha1c>;
+def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p", int_aarch64_neon_sha1p>;
+def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m", int_aarch64_neon_sha1m>;
+
+//
+// Patterns for handling half-precision values
+//
+
+// Convert f16 value coming in as i16 value to f32
+def : Pat<(f32 (f16_to_f32 (i32 (and (i32 GPR32:$Rn), 65535)))),
+          (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
+def : Pat<(f32 (f16_to_f32 (i32 (assertzext GPR32:$Rn)))),
+          (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
+
+def : Pat<(f32 (f16_to_f32 (i32 (assertzext (i32 (
+            f32_to_f16 (f32 FPR32:$Rn))))))),
+          (f32 FPR32:$Rn)>;
+
+// Patterns for vector extract of half-precision FP value in i16 storage type
+def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract
+            (v4i16 VPR64:$Rn), neon_uimm2_bare:$Imm)), 65535)))),
+          (FCVTsh (f16 (DUPhv_H
+            (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            neon_uimm2_bare:$Imm)))>;
+
+def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract
+            (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)), 65535)))),
+          (FCVTsh (f16 (DUPhv_H (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)))>;
+
+// Patterns for vector insert of half-precision FP value 0 in i16 storage type
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))),
+            (neon_uimm3_bare:$Imm))),
+          (v8i16 (INSELh (v8i16 VPR128:$Rn),
+            (v8i16 (SUBREG_TO_REG (i64 0),
+              (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)),
+              sub_16)),
+            neon_uimm3_bare:$Imm, 0))>;
+
+def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))),
+            (neon_uimm2_bare:$Imm))),
+          (v4i16 (EXTRACT_SUBREG
+            (v8i16 (INSELh
+              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              (v8i16 (SUBREG_TO_REG (i64 0),
+                (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)),
+                sub_16)),
+              neon_uimm2_bare:$Imm, 0)),
+            sub_64))>;
+
+// Patterns for vector insert of half-precision FP value in i16 storage type
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint
+              (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))),
+            (neon_uimm3_bare:$Imm))),
+          (v8i16 (INSELh (v8i16 VPR128:$Rn),
+            (v8i16 (SUBREG_TO_REG (i64 0),
+              (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)),
+              sub_16)),
+            neon_uimm3_bare:$Imm, 0))>;
+
+def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint
+              (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))),
+            (neon_uimm2_bare:$Imm))),
+          (v4i16 (EXTRACT_SUBREG
+            (v8i16 (INSELh
+              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              (v8i16 (SUBREG_TO_REG (i64 0),
+                (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)),
+                sub_16)),
+              neon_uimm2_bare:$Imm, 0)),
+            sub_64))>;
+
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+            (i32 (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)),
+              (neon_uimm3_bare:$Imm1))),
+          (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src),
+            neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>;
+
+// Patterns for vector copy of half-precision FP value in i16 storage type
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32
+              (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)),
+              65535)))))))),
+            (neon_uimm3_bare:$Imm1))),
+          (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src),
+            neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>;
+
+def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32
+              (vector_extract (v4i16 VPR64:$src), neon_uimm3_bare:$Imm2)),
+              65535)))))))),
+            (neon_uimm3_bare:$Imm1))),
+          (v4i16 (EXTRACT_SUBREG
+            (v8i16 (INSELh
+              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
+              neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2)),
+            sub_64))>;
+
 
-def : Pat<(f64   (bitconvert (v8i8  VPR64:$src))),
-                 (f64 (EXTRACT_SUBREG (v8i8  VPR64:$src), sub_64))>;
-def : Pat<(f64   (bitconvert (v4i16  VPR64:$src))),
-                 (f64 (EXTRACT_SUBREG (v4i16  VPR64:$src), sub_64))>;
-def : Pat<(f64   (bitconvert (v2i32  VPR64:$src))),
-                 (f64 (EXTRACT_SUBREG (v2i32  VPR64:$src), sub_64))>;
-def : Pat<(f64   (bitconvert (v2f32  VPR64:$src))),
-                 (f64 (EXTRACT_SUBREG (v2f32  VPR64:$src), sub_64))>;
-def : Pat<(f64   (bitconvert (v1i64  VPR64:$src))),
-                 (f64 (EXTRACT_SUBREG (v1i64  VPR64:$src), sub_64))>;
-def : Pat<(f128  (bitconvert (v16i8  VPR128:$src))),
-                 (f128 (EXTRACT_SUBREG (v16i8  VPR128:$src), sub_alias))>;
-def : Pat<(f128  (bitconvert (v8i16  VPR128:$src))),
-                 (f128 (EXTRACT_SUBREG (v8i16  VPR128:$src), sub_alias))>;
-def : Pat<(f128  (bitconvert (v4i32  VPR128:$src))),
-                 (f128 (EXTRACT_SUBREG (v4i32  VPR128:$src), sub_alias))>;
-def : Pat<(f128  (bitconvert (v2i64  VPR128:$src))),
-                 (f128 (EXTRACT_SUBREG (v2i64  VPR128:$src), sub_alias))>;
-def : Pat<(f128  (bitconvert (v4f32  VPR128:$src))),
-                 (f128 (EXTRACT_SUBREG (v4f32  VPR128:$src), sub_alias))>;
-def : Pat<(f128  (bitconvert (v2f64  VPR128:$src))),
-                 (f128 (EXTRACT_SUBREG (v2f64  VPR128:$src), sub_alias))>;
-
-def : Pat<(v8i8   (bitconvert (f64   FPR64:$src))),
-                  (v8i8 (SUBREG_TO_REG (i64 0), (f64  FPR64:$src), sub_64))>;
-def : Pat<(v4i16  (bitconvert (f64   FPR64:$src))),
-                  (v4i16 (SUBREG_TO_REG (i64 0), (f64  FPR64:$src), sub_64))>;
-def : Pat<(v2i32  (bitconvert (f64   FPR64:$src))),
-                  (v2i32 (SUBREG_TO_REG (i64 0), (f64  FPR64:$src), sub_64))>;
-def : Pat<(v2f32  (bitconvert (f64   FPR64:$src))),
-                  (v2f32 (SUBREG_TO_REG (i64 0), (f64  FPR64:$src), sub_64))>;
-def : Pat<(v1i64  (bitconvert (f64   FPR64:$src))),
-                  (v1i64 (SUBREG_TO_REG (i64 0), (f64  FPR64:$src), sub_64))>;
-def : Pat<(v16i8  (bitconvert (f128   FPR128:$src))),
-                  (v16i8 (SUBREG_TO_REG (i128 0), (f128  FPR128:$src),
-                  sub_alias))>;
-def : Pat<(v8i16  (bitconvert (f128   FPR128:$src))),
-                  (v8i16 (SUBREG_TO_REG (i128 0), (f128  FPR128:$src),
-                  sub_alias))>;
-def : Pat<(v4i32  (bitconvert (f128   FPR128:$src))),
-                  (v4i32 (SUBREG_TO_REG (i128 0), (f128  FPR128:$src),
-                  sub_alias))>;
-def : Pat<(v2i64  (bitconvert (f128   FPR128:$src))),
-                  (v2i64 (SUBREG_TO_REG (i128 0), (f128  FPR128:$src),
-                  sub_alias))>;
-def : Pat<(v4f32  (bitconvert (f128   FPR128:$src))),
-                  (v4f32 (SUBREG_TO_REG (i128 0), (f128  FPR128:$src),
-                  sub_alias))>;
-def : Pat<(v2f64  (bitconvert (f128   FPR128:$src))),
-                  (v2f64 (SUBREG_TO_REG (i128 0), (f128  FPR128:$src),
-                  sub_alias))>;
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 7ce5ce3..8cfb968 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -121,7 +121,7 @@ bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO,
     MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
     break;
   case MachineOperand::MO_GlobalAddress:
-    MCOp = lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal()));
+    MCOp = lowerSymbolOperand(MO, getSymbol(MO.getGlobal()));
     break;
   case MachineOperand::MO_MachineBasicBlock:
     MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index b3a81b1..4e2022c 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -18,9 +18,19 @@ def sub_32 : SubRegIndex<32>;
 def sub_16 : SubRegIndex<16>;
 def sub_8  : SubRegIndex<8>;
 
-// The VPR registers are handled as sub-registers of FPR equivalents, but
-// they're really the same thing. We give this concept a special index.
-def sub_alias : SubRegIndex<128>;
+// Note: Code depends on these having consecutive numbers.
+def qqsub : SubRegIndex<256, 256>;
+
+def qsub_0 : SubRegIndex<128>;
+def qsub_1 : SubRegIndex<128, 128>;
+def qsub_2 : ComposedSubRegIndex<qqsub, qsub_0>;
+def qsub_3 : ComposedSubRegIndex<qqsub, qsub_1>;
+
+def dsub_0 : SubRegIndex<64>;
+def dsub_1 : SubRegIndex<64, 64>;
+def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
+def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
+def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
 }
 
 // Registers are identified with 5-bit ID numbers.
@@ -137,60 +147,51 @@ foreach Index = 0-31 in {
 }
 
 
-def FPR8 : RegisterClass<"AArch64", [i8], 8,
+def FPR8 : RegisterClass<"AArch64", [i8, v1i8], 8,
                           (sequence "B%u", 0, 31)> {
 }
 
-def FPR16 : RegisterClass<"AArch64", [f16], 16,
+def FPR16 : RegisterClass<"AArch64", [f16, v1i16], 16,
                           (sequence "H%u", 0, 31)> {
 }
 
-def FPR32 : RegisterClass<"AArch64", [f32], 32,
+def FPR32 : RegisterClass<"AArch64", [f32, v1i32, v1f32], 32,
                           (sequence "S%u", 0, 31)> {
 }
 
-def FPR64 : RegisterClass<"AArch64", [f64], 64,
-                          (sequence "D%u", 0, 31)> {
-}
+def FPR64 : RegisterClass<"AArch64",
+                          [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64],
+                          64, (sequence "D%u", 0, 31)>;
 
-def FPR128 : RegisterClass<"AArch64", [f128], 128,
-                          (sequence "Q%u", 0, 31)> {
-}
+def FPR128 : RegisterClass<"AArch64",
+                           [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
+                           128, (sequence "Q%u", 0, 31)>;
+
+def FPR64Lo : RegisterClass<"AArch64",
+                            [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64],
+                            64, (sequence "D%u", 0, 15)>;
 
+def FPR128Lo : RegisterClass<"AArch64",
+                             [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
+                             128, (sequence "Q%u", 0, 15)>;
 
 //===----------------------------------------------------------------------===//
 //  Vector registers:
 //===----------------------------------------------------------------------===//
 
-// NEON registers simply specify the overall vector, and it's expected that
-// Instructions will individually specify the acceptable data layout. In
-// principle this leaves two approaches open:
-//   + An operand, giving a single ADDvvv instruction (for example). This turns
-//     out to be unworkable in the assembly parser (without every Instruction
-//     having a "cvt" function, at least) because the constraints can't be
-//     properly enforced. It also complicates specifying patterns since each
-//     instruction will accept many types.
-//  + A bare token (e.g. ".2d"). This means the AsmParser has to know specific
-//    details about NEON registers, but simplifies most other details.
-//
-// The second approach was taken.
-
-foreach Index = 0-31 in {
-  def V # Index  : AArch64RegWithSubs<Index, "v" # Index,
-                                      [!cast<Register>("Q" # Index)],
-                                      [sub_alias]>,
-            DwarfRegNum<[!add(Index, 64)]>;
+def VPR64AsmOperand : AsmOperandClass {
+  let Name = "VPR";
+  let PredicateMethod = "isReg";
+  let RenderMethod = "addRegOperands";
 }
 
-// These two classes contain the same registers, which should be reasonably
-// sensible for MC and allocation purposes, but allows them to be treated
-// separately for things like stack spilling.
-def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8, v1i64], 64,
-                          (sequence "V%u", 0, 31)>;
+def VPR64 : RegisterOperand<FPR64, "printVPRRegister">;
 
-def VPR128 : RegisterClass<"AArch64",
-                           [v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], 128,
-                           (sequence "V%u", 0, 31)>;
+def VPR128 : RegisterOperand<FPR128, "printVPRRegister">;
+
+def VPR64Lo : RegisterOperand<FPR64Lo, "printVPRRegister">;
+
+def VPR128Lo : RegisterOperand<FPR128Lo, "printVPRRegister">;
 
 // Flags register
 def NZCV : Register<"nzcv"> {
@@ -201,3 +202,90 @@ def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
   let CopyCost = -1;
   let isAllocatable = 0;
 }
+
+//===----------------------------------------------------------------------===//
+//  Consecutive vector registers
+//===----------------------------------------------------------------------===//
+// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D30_D31
+def Tuples2D : RegisterTuples<[dsub_0, dsub_1],
+                              [(rotl FPR64, 0), (rotl FPR64, 1)]>;
+                              
+// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1
+def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
+                              [(rotl FPR64, 0), (rotl FPR64, 1),
+                               (rotl FPR64, 2)]>;
+                               
+// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2
+def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
+                              [(rotl FPR64, 0), (rotl FPR64, 1),
+                               (rotl FPR64, 2), (rotl FPR64, 3)]>;
+
+// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31
+def Tuples2Q : RegisterTuples<[qsub_0, qsub_1],
+                              [(rotl FPR128, 0), (rotl FPR128, 1)]>;
+
+// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1
+def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2],
+                              [(rotl FPR128, 0), (rotl FPR128, 1),
+                               (rotl FPR128, 2)]>;
+                               
+// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2
+def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3],
+                              [(rotl FPR128, 0), (rotl FPR128, 1),
+                               (rotl FPR128, 2), (rotl FPR128, 3)]>;
+
+// The followings are super register classes to model 2/3/4 consecutive
+// 64-bit/128-bit registers.
+
+def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>;
+
+def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> {
+  let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>;
+
+def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>;
+
+def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> {
+  let Size = 384; // 3 x 128 bits, we have no predefined type of that size.
+}
+
+def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>;
+
+
+// The followings are vector list operands
+multiclass VectorList_operands<string PREFIX, string LAYOUT, int Count,
+                               RegisterClass RegList> {
+  def _asmoperand : AsmOperandClass {
+    let Name = PREFIX # LAYOUT # Count;
+    let RenderMethod = "addVectorListOperands";
+    let PredicateMethod = 
+        "isVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">";
+    let ParserMethod = "ParseVectorList";
+  }
+
+  def _operand : RegisterOperand<RegList,
+        "printVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">"> {
+    let ParserMatchClass =
+      !cast<AsmOperandClass>(PREFIX # LAYOUT # "_asmoperand");
+  }
+}
+
+multiclass VectorList_BHSD<string PREFIX, int Count, RegisterClass DRegList,
+                           RegisterClass QRegList> {
+  defm 8B : VectorList_operands<PREFIX, "8B", Count, DRegList>;
+  defm 4H : VectorList_operands<PREFIX, "4H", Count, DRegList>;
+  defm 2S : VectorList_operands<PREFIX, "2S", Count, DRegList>;
+  defm 1D : VectorList_operands<PREFIX, "1D", Count, DRegList>;
+  defm 16B : VectorList_operands<PREFIX, "16B", Count, QRegList>;
+  defm 8H : VectorList_operands<PREFIX, "8H", Count, QRegList>;
+  defm 4S : VectorList_operands<PREFIX, "4S", Count, QRegList>;
+  defm 2D : VectorList_operands<PREFIX, "2D", Count, QRegList>;
+}
+
+// Vector list operand with 1/2/3/4 registers: VOne8B_operand,..., VQuad2D_operand
+defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>;
+defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>;
+defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>;
+defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>;
+\ No newline at end of file
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index d71bb4e..5c693c1 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -25,11 +25,31 @@
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void AArch64Subtarget::anchor() {}
+
 AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS)
-    : AArch64GenSubtargetInfo(TT, CPU, FS), HasNEON(false), HasCrypto(false),
-      TargetTriple(TT) {
+    : AArch64GenSubtargetInfo(TT, CPU, FS), HasFPARMv8(false), HasNEON(false),
+      HasCrypto(false), TargetTriple(TT), CPUString(CPU) {
+
+  initializeSubtargetFeatures(CPU, FS);
+}
+
+void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU,
+                                                   StringRef FS) {
+  if (CPU.empty())
+    CPUString = "generic";
+
+  std::string FullFS = FS;
+  if (CPUString == "generic") {
+    // Enable FP by default.
+    if (FullFS.empty())
+      FullFS = "+fp-armv8";
+    else
+      FullFS = "+fp-armv8," + FullFS;
+  }
 
-  ParseSubtargetFeatures(CPU, FS);
+  ParseSubtargetFeatures(CPU, FullFS);
 }
 
 bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV,
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 35a7c8d..bbfd3bc 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -27,18 +27,31 @@ class StringRef;
 class GlobalValue;
 
 class AArch64Subtarget : public AArch64GenSubtargetInfo {
+  virtual void anchor();
 protected:
+  bool HasFPARMv8;
   bool HasNEON;
   bool HasCrypto;
 
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
+
+  /// CPUString - String name of used CPU.
+  std::string CPUString;
+
+private:
+  void initializeSubtargetFeatures(StringRef CPU, StringRef FS);
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   ///
   AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS);
 
+  virtual bool enableMachineScheduler() const {
+    return true;
+  }
+
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -46,11 +59,13 @@ public:
   bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
 
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
-  bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
 
+  bool hasFPARMv8() const { return HasFPARMv8; }
   bool hasNEON() const { return HasNEON; }
-
   bool hasCrypto() const { return HasCrypto; }
+
+  const std::string & getCPUString() const { return CPUString; }
 };
 } // End llvm namespace
 
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 43e91ac..fbbce11 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -54,8 +54,9 @@ public:
 #include "AArch64GenAsmMatcher.inc"
   };
 
-  AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
-    : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+  AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+                   const MCInstrInfo &MII)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
     MCAsmParserExtension::Initialize(_Parser);
 
     // Initialize the set of available features.
@@ -126,6 +127,11 @@ public:
   OperandMatchResultTy
   ParseSysRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 
+  bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout,
+                      SMLoc &LayoutLoc);
+
+  OperandMatchResultTy ParseVectorList(SmallVectorImpl<MCParsedAsmOperand *> &);
+
   bool validateInstruction(MCInst &Inst,
                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 
@@ -153,6 +159,7 @@ private:
     k_Immediate,      // Including expressions referencing symbols
     k_Register,
     k_ShiftExtend,
+    k_VectorList,     // A sequential list of 1 to 4 registers.
     k_SysReg,         // The register operand of MRS and MSR instructions
     k_Token,          // The mnemonic; other raw tokens the auto-generated
     k_WrappedRegister // Load/store exclusive permit a wrapped register.
@@ -188,6 +195,13 @@ private:
     bool ImplicitAmount;
   };
 
+  // A vector register list is a sequential list of 1 to 4 registers.
+  struct VectorListOp {
+    unsigned RegNum;
+    unsigned Count;
+    A64Layout::VectorLayout Layout;
+  };
+
   struct SysRegOp {
     const char *Data;
     unsigned Length;
@@ -205,6 +219,7 @@ private:
     struct ImmOp Imm;
     struct RegOp Reg;
     struct ShiftExtendOp ShiftExtend;
+    struct VectorListOp VectorList;
     struct SysRegOp SysReg;
     struct TokOp Tok;
   };
@@ -664,6 +679,44 @@ public:
     return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4;
   }
 
+  // if 0 < value <= w, return true
+  bool isShrFixedWidth(int w) const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= w;
+  }
+
+  bool isShrImm8() const { return isShrFixedWidth(8); }
+
+  bool isShrImm16() const { return isShrFixedWidth(16); }
+
+  bool isShrImm32() const { return isShrFixedWidth(32); }
+
+  bool isShrImm64() const { return isShrFixedWidth(64); }
+
+  // if 0 <= value < w, return true
+  bool isShlFixedWidth(int w) const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < w;
+  }
+
+  bool isShlImm8() const { return isShlFixedWidth(8); }
+
+  bool isShlImm16() const { return isShlFixedWidth(16); }
+
+  bool isShlImm32() const { return isShlFixedWidth(32); }
+
+  bool isShlImm64() const { return isShlFixedWidth(64); }
+
   bool isNeonMovImmShiftLSL() const {
     if (!isShiftOrExtend())
       return false;
@@ -697,6 +750,12 @@ public:
     return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16;
   }
 
+  template <A64Layout::VectorLayout Layout, unsigned Count>
+  bool isVectorList() const {
+    return Kind == k_VectorList && VectorList.Layout == Layout &&
+           VectorList.Count == Count;
+  }
+
   template <int MemSize> bool isSImm7Scaled() const {
     if (!isImm())
       return false;
@@ -756,6 +815,17 @@ public:
     return true;
   }
 
+  // if value == N, return true
+  template<int N>
+  bool isExactImm() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+
+    return CE->getValue() == N;
+  }
+
   static AArch64Operand *CreateImmWithLSL(const MCExpr *Val,
                                           unsigned ShiftAmount,
                                           bool ImplicitAmount,
@@ -817,6 +887,18 @@ public:
     return Op;
   }
 
+  static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
+                                          A64Layout::VectorLayout Layout,
+                                          SMLoc S, SMLoc E) {
+    AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.Layout = Layout;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static AArch64Operand *CreateToken(StringRef Str, SMLoc S) {
     AArch64Operand *Op = new AArch64Operand(k_Token, S, S);
     Op->Tok.Data = Str.data();
@@ -1164,6 +1246,11 @@ public:
     }
     Inst.addOperand(MCOperand::CreateImm(Imm));
   }
+
+  void addVectorListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum));
+  }
 };
 
 } // end anonymous namespace.
@@ -1203,7 +1290,6 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
       else
         return MatchOperand_Success;
     }
-
     // ... or it might be a symbolish thing
   }
     // Fall through
@@ -1247,7 +1333,7 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
     return ParseOperand(Operands, Mnemonic);
   }
   // The following will likely be useful later, but not in very early cases
-  case AsmToken::LCurly:  // Weird SIMD lists
+  case AsmToken::LCurly: // SIMD vector list is not parsed here
     llvm_unreachable("Don't know how to deal with '{' in operand");
     return MatchOperand_ParseFail;
   }
@@ -1405,7 +1491,7 @@ AArch64AsmParser::ParseImmWithLSLOperand(
 
   // The optional operand must be "lsl #N" where N is non-negative.
   if (Parser.getTok().is(AsmToken::Identifier)
-      && Parser.getTok().getIdentifier().lower() == "lsl") {
+      && Parser.getTok().getIdentifier().equals_lower("lsl")) {
     Parser.Lex();
 
     if (Parser.getTok().is(AsmToken::Hash)) {
@@ -1462,9 +1548,8 @@ AArch64AsmParser::ParseCRxOperand(
     return MatchOperand_ParseFail;
   }
 
-  std::string LowerTok = Parser.getTok().getIdentifier().lower();
-  StringRef Tok(LowerTok);
-  if (Tok[0] != 'c') {
+  StringRef Tok = Parser.getTok().getIdentifier();
+  if (Tok[0] != 'c' && Tok[0] != 'C') {
     Error(S, "Expected cN operand where 0 <= N <= 15");
     return MatchOperand_ParseFail;
   }
@@ -1536,22 +1621,11 @@ AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc,
   std::string LowerReg = Tok.getString().lower();
   size_t DotPos = LowerReg.find('.');
 
-  RegNum = MatchRegisterName(LowerReg.substr(0, DotPos));
-  if (RegNum == AArch64::NoRegister) {
-    RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos))
-      .Case("ip0", AArch64::X16)
-      .Case("ip1", AArch64::X17)
-      .Case("fp", AArch64::X29)
-      .Case("lr", AArch64::X30)
-      .Default(AArch64::NoRegister);
-  }
-  if (RegNum == AArch64::NoRegister)
-    return false;
-
+  bool IsVec128 = false;
   SMLoc S = Tok.getLoc();
   RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos);
 
-  if (DotPos == StringRef::npos) {
+  if (DotPos == std::string::npos) {
     Layout = StringRef();
   } else {
     // Everything afterwards needs to be a literal token, expected to be
@@ -1561,20 +1635,78 @@ AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc,
     // gives us a permanent string to use in the token (a pointer into LowerReg
     // would go out of scope when we return).
     LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1);
-    std::string LayoutText = LowerReg.substr(DotPos, StringRef::npos);
+    StringRef LayoutText = StringRef(LowerReg).substr(DotPos);
+
+    // See if it's a 128-bit layout first.
     Layout = StringSwitch<const char *>(LayoutText)
-      .Case(".d", ".d").Case(".1d", ".1d").Case(".2d", ".2d")
-      .Case(".s", ".s").Case(".2s", ".2s").Case(".4s", ".4s")
-      .Case(".h", ".h").Case(".4h", ".4h").Case(".8h", ".8h")
-      .Case(".b", ".b").Case(".8b", ".8b").Case(".16b", ".16b")
+      .Case(".q", ".q").Case(".1q", ".1q")
+      .Case(".d", ".d").Case(".2d", ".2d")
+      .Case(".s", ".s").Case(".4s", ".4s")
+      .Case(".h", ".h").Case(".8h", ".8h")
+      .Case(".b", ".b").Case(".16b", ".16b")
       .Default("");
 
+    if (Layout.size() != 0)
+      IsVec128 = true;
+    else {
+      Layout = StringSwitch<const char *>(LayoutText)
+                   .Case(".1d", ".1d")
+                   .Case(".2s", ".2s")
+                   .Case(".4h", ".4h")
+                   .Case(".8b", ".8b")
+                   .Default("");
+    }
+
     if (Layout.size() == 0) {
-      // Malformed register
+      // If we've still not pinned it down the register is malformed.
       return false;
     }
   }
 
+  RegNum = MatchRegisterName(LowerReg.substr(0, DotPos));
+  if (RegNum == AArch64::NoRegister) {
+    RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos))
+      .Case("ip0", AArch64::X16)
+      .Case("ip1", AArch64::X17)
+      .Case("fp", AArch64::X29)
+      .Case("lr", AArch64::X30)
+      .Case("v0", IsVec128 ? AArch64::Q0 : AArch64::D0)
+      .Case("v1", IsVec128 ? AArch64::Q1 : AArch64::D1)
+      .Case("v2", IsVec128 ? AArch64::Q2 : AArch64::D2)
+      .Case("v3", IsVec128 ? AArch64::Q3 : AArch64::D3)
+      .Case("v4", IsVec128 ? AArch64::Q4 : AArch64::D4)
+      .Case("v5", IsVec128 ? AArch64::Q5 : AArch64::D5)
+      .Case("v6", IsVec128 ? AArch64::Q6 : AArch64::D6)
+      .Case("v7", IsVec128 ? AArch64::Q7 : AArch64::D7)
+      .Case("v8", IsVec128 ? AArch64::Q8 : AArch64::D8)
+      .Case("v9", IsVec128 ? AArch64::Q9 : AArch64::D9)
+      .Case("v10", IsVec128 ? AArch64::Q10 : AArch64::D10)
+      .Case("v11", IsVec128 ? AArch64::Q11 : AArch64::D11)
+      .Case("v12", IsVec128 ? AArch64::Q12 : AArch64::D12)
+      .Case("v13", IsVec128 ? AArch64::Q13 : AArch64::D13)
+      .Case("v14", IsVec128 ? AArch64::Q14 : AArch64::D14)
+      .Case("v15", IsVec128 ? AArch64::Q15 : AArch64::D15)
+      .Case("v16", IsVec128 ? AArch64::Q16 : AArch64::D16)
+      .Case("v17", IsVec128 ? AArch64::Q17 : AArch64::D17)
+      .Case("v18", IsVec128 ? AArch64::Q18 : AArch64::D18)
+      .Case("v19", IsVec128 ? AArch64::Q19 : AArch64::D19)
+      .Case("v20", IsVec128 ? AArch64::Q20 : AArch64::D20)
+      .Case("v21", IsVec128 ? AArch64::Q21 : AArch64::D21)
+      .Case("v22", IsVec128 ? AArch64::Q22 : AArch64::D22)
+      .Case("v23", IsVec128 ? AArch64::Q23 : AArch64::D23)
+      .Case("v24", IsVec128 ? AArch64::Q24 : AArch64::D24)
+      .Case("v25", IsVec128 ? AArch64::Q25 : AArch64::D25)
+      .Case("v26", IsVec128 ? AArch64::Q26 : AArch64::D26)
+      .Case("v27", IsVec128 ? AArch64::Q27 : AArch64::D27)
+      .Case("v28", IsVec128 ? AArch64::Q28 : AArch64::D28)
+      .Case("v29", IsVec128 ? AArch64::Q29 : AArch64::D29)
+      .Case("v30", IsVec128 ? AArch64::Q30 : AArch64::D30)
+      .Case("v31", IsVec128 ? AArch64::Q31 : AArch64::D31)
+      .Default(AArch64::NoRegister);
+  }
+  if (RegNum == AArch64::NoRegister)
+    return false;
+
   return true;
 }
 
@@ -1606,6 +1738,7 @@ AArch64AsmParser::ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
       case 'h': NumLanes = 8; break;
       case 's': NumLanes = 4; break;
       case 'd': NumLanes = 2; break;
+      case 'q': NumLanes = 1; break;
       }
     }
 
@@ -1824,6 +1957,148 @@ AArch64AsmParser::ParseShiftExtend(
   return MatchOperand_Success;
 }
 
+/// Try to parse a vector register token, If it is a vector register,
+/// the token is eaten and return true. Otherwise return false.
+bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc,
+                                      StringRef &Layout, SMLoc &LayoutLoc) {
+  bool IsVector = true;
+
+  if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
+    IsVector = false;
+  else if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID]
+                .contains(RegNum) &&
+           !AArch64MCRegisterClasses[AArch64::FPR128RegClassID]
+                .contains(RegNum))
+    IsVector = false;
+  else if (Layout.size() == 0)
+    IsVector = false;
+
+  if (!IsVector)
+    Error(Parser.getTok().getLoc(), "expected vector type register");
+
+  Parser.Lex(); // Eat this token.
+  return IsVector;
+}
+
+
+// A vector list contains 1-4 consecutive registers.
+// Now there are two kinds of vector list when number of vector > 1:
+//   (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
+//   (2) {Vn.layout - Vm.layout}
+// If the layout is like .b/.h/.s/.d, also parse the lane.
+AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  if (Parser.getTok().isNot(AsmToken::LCurly)) {
+    Error(Parser.getTok().getLoc(), "'{' expected");
+    return MatchOperand_ParseFail;
+  }
+  SMLoc SLoc = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '{' token.
+
+  unsigned Reg, Count = 1;
+  StringRef LayoutStr;
+  SMLoc RegEndLoc, LayoutLoc;
+  if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc))
+    return MatchOperand_ParseFail;
+
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the minus.
+
+    unsigned Reg2;
+    StringRef LayoutStr2;
+    SMLoc RegEndLoc2, LayoutLoc2;
+    SMLoc RegLoc2 = Parser.getTok().getLoc();
+
+    if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
+      return MatchOperand_ParseFail;
+    unsigned Space = (Reg < Reg2) ? (Reg2 - Reg) : (Reg2 + 32 - Reg);
+
+    if (LayoutStr != LayoutStr2) {
+      Error(LayoutLoc2, "expected the same vector layout");
+      return MatchOperand_ParseFail;
+    }
+    if (Space == 0 || Space > 3) {
+      Error(RegLoc2, "invalid number of vectors");
+      return MatchOperand_ParseFail;
+    }
+
+    Count += Space;
+  } else {
+    unsigned LastReg = Reg;
+    while (Parser.getTok().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma.
+      unsigned Reg2;
+      StringRef LayoutStr2;
+      SMLoc RegEndLoc2, LayoutLoc2;
+      SMLoc RegLoc2 = Parser.getTok().getLoc();
+
+      if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
+        return MatchOperand_ParseFail;
+      unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg)
+                                        : (Reg2 + 32 - LastReg);
+      Count++;
+
+      // The space between two vectors should be 1. And they should have the same layout.
+      // Total count shouldn't be great than 4
+      if (Space != 1) {
+        Error(RegLoc2, "invalid space between two vectors");
+        return MatchOperand_ParseFail;
+      }
+      if (LayoutStr != LayoutStr2) {
+        Error(LayoutLoc2, "expected the same vector layout");
+        return MatchOperand_ParseFail;
+      }
+      if (Count > 4) {
+        Error(RegLoc2, "invalid number of vectors");
+        return MatchOperand_ParseFail;
+      }
+
+      LastReg = Reg2;
+    }
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly)) {
+    Error(Parser.getTok().getLoc(), "'}' expected");
+    return MatchOperand_ParseFail;
+  }
+  SMLoc ELoc = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '}' token.
+
+  A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr);
+  if (Count > 1) { // If count > 1, create vector list using super register.
+    bool IsVec64 = (Layout < A64Layout::VL_16B);
+    static unsigned SupRegIDs[3][2] = {
+      { AArch64::QPairRegClassID, AArch64::DPairRegClassID },
+      { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID },
+      { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID }
+    };
+    unsigned SupRegID = SupRegIDs[Count - 2][static_cast<int>(IsVec64)];
+    unsigned Sub0 = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
+    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+    Reg = MRI->getMatchingSuperReg(Reg, Sub0,
+                                   &AArch64MCRegisterClasses[SupRegID]);
+  }
+  Operands.push_back(
+      AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc));
+
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    uint32_t NumLanes = 0;
+    switch(Layout) {
+    case A64Layout::VL_B : NumLanes = 16; break;
+    case A64Layout::VL_H : NumLanes = 8; break;
+    case A64Layout::VL_S : NumLanes = 4; break;
+    case A64Layout::VL_D : NumLanes = 2; break;
+    default:
+      SMLoc Loc = getLexer().getLoc();
+      Error(Loc, "expected comma before next operand");
+      return MatchOperand_ParseFail;
+    }
+    return ParseNEONLane(Operands, NumLanes);
+  } else {
+    return MatchOperand_Success;
+  }
+}
+
 // FIXME: We would really like to be able to tablegen'erate this.
 bool AArch64AsmParser::
 validateInstruction(MCInst &Inst,
@@ -2240,6 +2515,30 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   case Match_Width64:
     return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
                  "expected integer in range [<lsb>, 63]");
+  case Match_ShrImm8:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [1, 8]");
+  case Match_ShrImm16:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [1, 16]");
+  case Match_ShrImm32:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [1, 32]");
+  case Match_ShrImm64:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [1, 64]");
+  case Match_ShlImm8:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 7]");
+  case Match_ShlImm16:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 15]");
+  case Match_ShlImm32:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 31]");
+  case Match_ShlImm64:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 63]");
   }
 
   llvm_unreachable("Implement any new match types added!");
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index a88a8e8..be4d7f2 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -82,15 +82,38 @@ static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                          uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                          uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                         uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
                                               unsigned RegNo, uint64_t Address,
                                               const void *Decoder);
-static DecodeStatus DecodeVPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+static DecodeStatus DecodeFPR128LoRegisterClass(llvm::MCInst &Inst,
+                                                unsigned RegNo, uint64_t Address,
+                                                const void *Decoder);
+
+static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
+                                                  unsigned RegNo,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+
+static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                              uint64_t Address,
                                              const void *Decoder);
-static DecodeStatus DecodeVPR128RegisterClass(llvm::MCInst &Inst,
-                                              unsigned RegNo, uint64_t Address,
-                                              const void *Decoder);
 
 static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
                                                unsigned OptionHiS,
@@ -113,6 +136,30 @@ static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
                                         uint64_t Address,
                                         const void *Decoder);
 
+static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder);
+static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder);
+static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder);
+static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder);
+static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
 template<int RegWidth>
 static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
                                              unsigned FullImm,
@@ -183,6 +230,17 @@ static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
                                                    uint64_t Address,
                                                    const void *Decoder);
 
+static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+
+static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
 
 static bool Check(DecodeStatus &Out, DecodeStatus In);
 
@@ -331,6 +389,14 @@ DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus
+DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  return DecodeFPR64RegisterClass(Inst, RegNo, Address, Decoder);
+}
 
 static DecodeStatus
 DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
@@ -343,28 +409,80 @@ DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeVPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder) {
-  if (RegNo > 31)
+static DecodeStatus
+DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
     return MCDisassembler::Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::VPR64RegClassID, RegNo);
+  return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
+                                                  unsigned RegNo,
+                                                  uint64_t Address,
+                                                  const void *Decoder) {
+  if (RegNo > 30)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Register));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus
-DecodeVPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-						  uint64_t Address, const void *Decoder) {
+static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo,
+                                            unsigned RegID,
+                                            const void *Decoder) {
   if (RegNo > 31)
     return MCDisassembler::Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::VPR128RegClassID, RegNo);
+  uint16_t Register = getReg(Decoder, RegID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Register));
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID,
+                                 Decoder);
+}
+
+static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID,
+                                 Decoder);
+}
+
+static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID,
+                                 Decoder);
+}
+
+static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID,
+                                 Decoder);
+}
+
+static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DQuadRegClassID,
+                                 Decoder);
+}
+
+static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID,
+                                 Decoder);
+}
+
 static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
                                                unsigned OptionHiS,
                                                uint64_t Address,
@@ -413,7 +531,73 @@ static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(8 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(16 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(32 - Val));
+  return MCDisassembler::Success;
+}
 
+static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(64 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
+                                        uint64_t Address,
+                                        const void *Decoder) {
+  if (Val > 7)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  if (Val > 15)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  if (Val > 31)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  if (Val > 63)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
 
 template<int RegWidth>
 static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
@@ -570,11 +754,11 @@ static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
   unsigned IsToVec = fieldFromInstruction(Insn, 16, 1);
 
   if (IsToVec) {
-    DecodeVPR128RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
     DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
   } else {
     DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
-    DecodeVPR128RegisterClass(Inst, Rn, Address, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
   }
 
   // Add the lane
@@ -838,3 +1022,551 @@ DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
   Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
   return MCDisassembler::Success;
 }
+
+// Decode post-index vector load/store instructions.
+// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
+// operand is an immediate equal the the length of vector list in bytes,
+// or Rm is decoded to a GPR64noxzr register.
+static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(Insn, 16, 5);
+  unsigned Opcode = fieldFromInstruction(Insn, 12, 4);
+  unsigned IsLoad = fieldFromInstruction(Insn, 22, 1);
+  // 0 for 64bit vector list, 1 for 128bit vector list
+  unsigned Is128BitVec = fieldFromInstruction(Insn, 30, 1);
+
+  unsigned NumVecs;
+  switch (Opcode) {
+  case 0: // ld4/st4
+  case 2: // ld1/st1 with 4 vectors
+    NumVecs = 4; break;
+  case 4: // ld3/st3
+  case 6: // ld1/st1 with 3 vectors
+    NumVecs = 3; break;
+  case 7: // ld1/st1 with 1 vector
+    NumVecs = 1; break;
+  case 8:  // ld2/st2
+  case 10: // ld1/st1 with 2 vectors
+    NumVecs = 2; break;
+  default:
+    llvm_unreachable("Invalid opcode for post-index load/store instructions");
+  }
+
+  // Decode vector list of 1/2/3/4 vectors for load instructions.
+  if (IsLoad) {
+    switch (NumVecs) {
+    case 1:
+      Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 4:
+      Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    }
+  }
+
+  // Decode write back register, which is equal to Rn.
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+  if (Rm == 31) // If Rm is 0x11111, add the vector list length in byte
+    Inst.addOperand(MCOperand::CreateImm(NumVecs * (Is128BitVec ? 16 : 8)));
+  else // Decode Rm
+    DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+  // Decode vector list of 1/2/3/4 vectors for load instructions.
+  if (!IsLoad) {
+    switch (NumVecs) {
+    case 1:
+      Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 4:
+      Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    }
+  }
+
+  return MCDisassembler::Success;
+}
+
+// Decode post-index vector load/store lane instructions.
+// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
+// operand is an immediate equal the the length of the changed bytes,
+// or Rm is decoded to a GPR64noxzr register.
+static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  bool Is64bitVec = false;
+  bool IsLoadDup = false;
+  bool IsLoad = false;
+  // The total number of bytes transferred.
+  // TransferBytes = NumVecs * OneLaneBytes
+  unsigned TransferBytes = 0;
+  unsigned NumVecs = 0;
+  unsigned Opc = Inst.getOpcode();
+  switch (Opc) {
+  case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
+  case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
+  case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
+  case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: {
+    switch (Opc) {
+    case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
+      TransferBytes = 1; break;
+    case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
+      TransferBytes = 2; break;
+    case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
+      TransferBytes = 4; break;
+    case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register:
+      TransferBytes = 8; break;
+    }
+    Is64bitVec = true;
+    IsLoadDup = true;
+    NumVecs = 1;
+    break;
+  }
+
+  case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
+  case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
+  case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
+  case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: {
+    switch (Opc) {
+    case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
+      TransferBytes = 1; break;
+    case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
+      TransferBytes = 2; break;
+    case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
+      TransferBytes = 4; break;
+    case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register:
+      TransferBytes = 8; break;
+    }
+    IsLoadDup = true;
+    NumVecs = 1;
+    break;
+  }
+
+  case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
+  case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
+  case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
+  case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: {
+    switch (Opc) {
+    case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
+      TransferBytes = 2; break;
+    case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
+      TransferBytes = 4; break;
+    case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
+      TransferBytes = 8; break;
+    case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register:
+      TransferBytes = 16; break;
+    }
+    Is64bitVec = true;
+    IsLoadDup = true;
+    NumVecs = 2;
+    break;
+  }
+
+  case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
+  case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
+  case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
+  case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: {
+    switch (Opc) {
+    case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
+      TransferBytes = 2; break;
+    case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
+      TransferBytes = 4; break;
+    case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
+      TransferBytes = 8; break;
+    case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register:
+      TransferBytes = 16; break;
+    }
+    IsLoadDup = true;
+    NumVecs = 2;
+    break;
+  }
+
+  case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
+  case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
+  case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
+  case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: {
+    switch (Opc) {
+    case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
+      TransferBytes = 3; break;
+    case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
+      TransferBytes = 6; break;
+    case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
+      TransferBytes = 12; break;
+    case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register:
+      TransferBytes = 24; break;
+    }
+    Is64bitVec = true;
+    IsLoadDup = true;
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
+  case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_8H_register:
+  case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_4S_register:
+  case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: {
+    switch (Opc) {
+    case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
+      TransferBytes = 3; break;
+    case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register:
+      TransferBytes = 6; break;
+    case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register:
+      TransferBytes = 12; break;
+    case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register:
+      TransferBytes = 24; break;
+    }
+    IsLoadDup = true;
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
+  case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
+  case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
+  case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: {
+    switch (Opc) {
+    case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
+      TransferBytes = 4; break;
+    case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
+      TransferBytes = 8; break;
+    case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
+      TransferBytes = 16; break;
+    case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register:
+      TransferBytes = 32; break;
+    }
+    Is64bitVec = true;
+    IsLoadDup = true;
+    NumVecs = 4;
+    break;
+  }
+
+  case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
+  case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_8H_register:
+  case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_4S_register:
+  case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: {
+    switch (Opc) {
+    case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
+      TransferBytes = 4; break;
+    case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register:
+      TransferBytes = 8; break;
+    case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register:
+      TransferBytes = 16; break;
+    case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register:
+      TransferBytes = 32; break;
+    }
+    IsLoadDup = true;
+    NumVecs = 4;
+    break;
+  }
+
+  case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
+  case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
+  case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
+  case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
+      TransferBytes = 1; break;
+    case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
+      TransferBytes = 2; break;
+    case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
+      TransferBytes = 4; break;
+    case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register:
+      TransferBytes = 8; break;
+    }
+    IsLoad = true;
+    NumVecs = 1;
+    break;
+  }
+
+  case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
+  case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
+  case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
+  case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
+      TransferBytes = 2; break;
+    case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
+      TransferBytes = 4; break;
+    case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
+      TransferBytes = 8; break;
+    case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register:
+      TransferBytes = 16; break;
+    }
+    IsLoad = true;
+    NumVecs = 2;
+    break;
+  }
+
+  case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
+  case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
+  case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
+  case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
+      TransferBytes = 3; break;
+    case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
+      TransferBytes = 6; break;
+    case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
+      TransferBytes = 12; break;
+    case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register:
+      TransferBytes = 24; break;
+    }
+    IsLoad = true;
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
+  case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
+  case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
+  case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
+      TransferBytes = 4; break;
+    case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
+      TransferBytes = 8; break;
+    case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
+      TransferBytes = 16; break;
+    case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register:
+      TransferBytes = 32; break;
+    }
+    IsLoad = true;
+    NumVecs = 4;
+    break;
+  }
+
+  case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
+  case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
+  case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
+  case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
+      TransferBytes = 1; break;
+    case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
+      TransferBytes = 2; break;
+    case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
+      TransferBytes = 4; break;
+    case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register:
+      TransferBytes = 8; break;
+    }
+    NumVecs = 1;
+    break;
+  }
+
+  case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
+  case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
+  case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
+  case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
+      TransferBytes = 2; break;
+    case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
+      TransferBytes = 4; break;
+    case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
+      TransferBytes = 8; break;
+    case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register:
+      TransferBytes = 16; break;
+    }
+    NumVecs = 2;
+    break;
+  }
+
+  case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
+  case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
+  case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
+  case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
+      TransferBytes = 3; break;
+    case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
+      TransferBytes = 6; break;
+    case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
+      TransferBytes = 12; break;
+    case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register:
+      TransferBytes = 24; break;
+    }
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
+  case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
+  case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
+  case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
+      TransferBytes = 4; break;
+    case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
+      TransferBytes = 8; break;
+    case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
+      TransferBytes = 16; break;
+    case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register:
+      TransferBytes = 32; break;
+    }
+    NumVecs = 4;
+    break;
+  }
+
+  default:
+    return MCDisassembler::Fail;
+  } // End of switch (Opc)
+
+  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(Insn, 16, 5);
+
+  // Decode post-index of load duplicate lane
+  if (IsLoadDup) {
+    switch (NumVecs) {
+    case 1:
+      Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 4:
+      Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+    }
+
+    // Decode write back register, which is equal to Rn.
+    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+    if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes
+      Inst.addOperand(MCOperand::CreateImm(TransferBytes));
+    else // Decode Rm
+      DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+    return MCDisassembler::Success;
+  }
+
+  // Decode post-index of load/store lane
+  // Loads have a vector list as output.
+  if (IsLoad) {
+    switch (NumVecs) {
+    case 1:
+      DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 4:
+      DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+    }
+  }
+
+  // Decode write back register, which is equal to Rn.
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+  if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes
+    Inst.addOperand(MCOperand::CreateImm(TransferBytes));
+  else // Decode Rm
+    DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+  // Decode the source vector list.
+  switch (NumVecs) {
+  case 1:
+    DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+    break;
+  case 2:
+    DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+    break;
+  case 3:
+    DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+    break;
+  case 4:
+    DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+  }
+
+  // Decode lane
+  unsigned Q = fieldFromInstruction(Insn, 30, 1);
+  unsigned S = fieldFromInstruction(Insn, 10, 3);
+  unsigned lane = 0;
+  // Calculate the number of lanes by number of vectors and transfered bytes.
+  // NumLanes = 16 bytes / bytes of each lane
+  unsigned NumLanes = 16 / (TransferBytes / NumVecs);
+  switch (NumLanes) {
+  case 16: // A vector has 16 lanes, each lane is 1 bytes.
+    lane = (Q << 3) | S;
+    break;
+  case 8:
+    lane = (Q << 2) | (S >> 1);
+    break;
+  case 4:
+    lane = (Q << 1) | (S >> 2);
+    break;
+  case 2:
+    lane = Q;
+    break;
+  }
+  Inst.addOperand(MCOperand::CreateImm(lane));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned size = fieldFromInstruction(Insn, 22, 2);
+  unsigned Q = fieldFromInstruction(Insn, 30, 1);
+
+  DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
+
+  if(Q)
+    DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
+  else
+    DecodeFPR64RegisterClass(Inst, Rn, Address, Decoder);
+
+  switch (size) {
+  case 0:
+    Inst.addOperand(MCOperand::CreateImm(8));
+    break;
+  case 1:
+    Inst.addOperand(MCOperand::CreateImm(16));
+    break;
+  case 2:
+    Inst.addOperand(MCOperand::CreateImm(32));
+    break;
+  default :
+    return MCDisassembler::Fail;
+  }
+  return MCDisassembler::Success;
+}
+
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index b624331..0438de3 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -368,6 +368,14 @@ AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum,
   O << "#" << (Imm * MemScale);
 }
 
+void AArch64InstPrinter::printVPRRegister(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNo).getReg();
+  std::string Name = getRegisterName(Reg);
+  Name[0] = 'v';
+  O << Name;
+}
+
 void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                       raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
@@ -454,8 +462,8 @@ void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum,
   o << "#0x0";
 }
 
-void AArch64InstPrinter::printNeonUImm8Operand(const MCInst *MI, unsigned OpNum,
-                                               raw_ostream &O) {
+void AArch64InstPrinter::printUImmHexOperand(const MCInst *MI, unsigned OpNum,
+                                             raw_ostream &O) {
   const MCOperand &MOUImm = MI->getOperand(OpNum);
 
   assert(MOUImm.isImm() &&
@@ -467,6 +475,18 @@ void AArch64InstPrinter::printNeonUImm8Operand(const MCInst *MI, unsigned OpNum,
   O.write_hex(Imm);
 }
 
+void AArch64InstPrinter::printUImmBareOperand(const MCInst *MI,
+                                              unsigned OpNum,
+                                              raw_ostream &O) {
+  const MCOperand &MOUImm = MI->getOperand(OpNum);
+
+  assert(MOUImm.isImm()
+         && "Immediate operand required for Neon vector immediate inst.");
+
+  unsigned Imm = MOUImm.getImm();
+  O << Imm;
+}
+
 void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI,
                                                     unsigned OpNum,
                                                     raw_ostream &O) {
@@ -487,3 +507,33 @@ void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI,
   O << "#0x";
   O.write_hex(Mask);
 }
+
+// If Count > 1, there are two valid kinds of vector list:
+//   (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
+//   (2) {Vn.layout - Vm.layout}
+// We choose the first kind as output.
+template <A64Layout::VectorLayout Layout, unsigned Count>
+void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors");
+
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  std::string LayoutStr = A64VectorLayoutToString(Layout);
+  O << "{";
+  if (Count > 1) { // Print sub registers separately
+    bool IsVec64 = (Layout < A64Layout::VL_16B);
+    unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
+    for (unsigned I = 0; I < Count; I++) {
+      std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++));
+      Name[0] = 'v';
+      O << Name << LayoutStr;
+      if (I != Count - 1)
+        O << ", ";
+    }
+  } else { // Print the register directly when NumVecs is 1.
+    std::string Name = getRegisterName(Reg);
+    Name[0] = 'v';
+    O << Name << LayoutStr;
+  }
+  O << "}";
+}
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index f7439be..37b7273 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -157,6 +157,7 @@ public:
   void printRegExtendOperand(const MCInst *MI, unsigned OpNum,
                              raw_ostream &O, A64SE::ShiftExtSpecifiers Ext);
 
+  void printVPRRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
 
@@ -168,9 +169,13 @@ public:
   void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum,
                                    raw_ostream &O);
   void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printNeonUImm8Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printUImmHexOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printUImmBareOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum,
                                   raw_ostream &O);
+
+  template <A64Layout::VectorLayout Layout, unsigned Count>
+  void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 };
 }
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index a3373b1..8a9077c 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -578,8 +578,8 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
 }
 
 MCAsmBackend *
-llvm::createAArch64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+llvm::createAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                              StringRef TT, StringRef CPU) {
   Triple TheTriple(TT);
-
   return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS());
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 104e4d2..a64c463 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -55,11 +55,10 @@ namespace {
 /// by MachO. Beware!
 class AArch64ELFStreamer : public MCELFStreamer {
 public:
-  AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                 raw_ostream &OS, MCCodeEmitter *Emitter)
-    : MCELFStreamer(Context, TAB, OS, Emitter),
-      MappingSymbolCounter(0), LastEMS(EMS_None) {
-  }
+  AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+                     MCCodeEmitter *Emitter)
+      : MCELFStreamer(Context, 0, TAB, OS, Emitter), MappingSymbolCounter(0),
+        LastEMS(EMS_None) {}
 
   ~AArch64ELFStreamer() {}
 
@@ -129,7 +128,7 @@ private:
     MCELF::SetType(SD, ELF::STT_NOTYPE);
     MCELF::SetBinding(SD, ELF::STB_LOCAL);
     SD.setExternal(false);
-    Symbol->setSection(*getCurrentSection().first);
+    AssignSection(Symbol, getCurrentSection().first);
 
     const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
     Symbol->setVariableValue(Value);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 8ec8cbf..add874c 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -31,11 +31,12 @@ AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo() {
 
   UseDataRegionDirectives = true;
 
-  WeakRefDirective = "\t.weak\t";
-
   HasLEB128 = true;
   SupportsDebugInformation = true;
 
   // Exceptions handling
   ExceptionsType = ExceptionHandling::DwarfCFI;
 }
+
+// Pin the vtable to this file.
+void AArch64ELFMCAsmInfo::anchor() {}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index a20bc47..d1dd285 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -14,13 +14,15 @@
 #ifndef LLVM_AARCH64TARGETASMINFO_H
 #define LLVM_AARCH64TARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
 
-  struct AArch64ELFMCAsmInfo : public MCAsmInfo {
-    explicit AArch64ELFMCAsmInfo();
-  };
+struct AArch64ELFMCAsmInfo : public MCAsmInfoELF {
+  explicit AArch64ELFMCAsmInfo();
+private:
+  virtual void anchor();
+};
 
 } // namespace llvm
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index b9770b3..b41c566 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -59,6 +59,23 @@ public:
   unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
                                    SmallVectorImpl<MCFixup> &Fixups) const;
 
+  unsigned getShiftRightImm8(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRightImm16(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRightImm32(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRightImm64(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getShiftLeftImm8(const MCInst &MI, unsigned Op,
+                            SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftLeftImm16(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftLeftImm32(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftLeftImm64(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
 
   // Labels are handled mostly the same way: a symbol is needed, and
   // just gets some fixup attached.
@@ -310,6 +327,45 @@ AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
   return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6;
 }
 
+unsigned AArch64MCCodeEmitter::getShiftRightImm8(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return 8 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftRightImm16(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return 16 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftRightImm32(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return 32 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftRightImm64(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return 64 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm8(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return MI.getOperand(Op).getImm() - 8;
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm16(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return MI.getOperand(Op).getImm() - 16;
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm32(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return MI.getOperand(Op).getImm() - 32;
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm64(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return MI.getOperand(Op).getImm() - 64;
+}
 
 template<AArch64::Fixups fixupDesired> unsigned
 AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI,
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 3849fe3..670e657 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -43,8 +43,9 @@ MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
 MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS,
                                              uint8_t OSABI);
 
-MCAsmBackend *createAArch64AsmBackend(const Target &T, StringRef TT,
-                                      StringRef CPU);
+MCAsmBackend *createAArch64AsmBackend(const Target &T,
+                                      const MCRegisterInfo &MRI,
+                                      StringRef TT, StringRef CPU);
 
 } // End llvm namespace
 
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e675efc..ce970b0 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -306,6 +306,65 @@ namespace A64SE {
     };
 }
 
+namespace A64Layout {
+    enum VectorLayout {
+        Invalid = -1,
+        VL_8B,
+        VL_4H,
+        VL_2S,
+        VL_1D,
+
+        VL_16B,
+        VL_8H,
+        VL_4S,
+        VL_2D,
+
+        // Bare layout for the 128-bit vector
+        // (only show ".b", ".h", ".s", ".d" without vector number)
+        VL_B,
+        VL_H,
+        VL_S,
+        VL_D
+    };
+}
+
+inline static const char *
+A64VectorLayoutToString(A64Layout::VectorLayout Layout) {
+  switch (Layout) {
+  case A64Layout::VL_8B:  return ".8b";
+  case A64Layout::VL_4H:  return ".4h";
+  case A64Layout::VL_2S:  return ".2s";
+  case A64Layout::VL_1D:  return ".1d";
+  case A64Layout::VL_16B:  return ".16b";
+  case A64Layout::VL_8H:  return ".8h";
+  case A64Layout::VL_4S:  return ".4s";
+  case A64Layout::VL_2D:  return ".2d";
+  case A64Layout::VL_B:  return ".b";
+  case A64Layout::VL_H:  return ".h";
+  case A64Layout::VL_S:  return ".s";
+  case A64Layout::VL_D:  return ".d";
+  default: llvm_unreachable("Unknown Vector Layout");
+  }
+}
+
+inline static A64Layout::VectorLayout
+A64StringToVectorLayout(StringRef LayoutStr) {
+  return StringSwitch<A64Layout::VectorLayout>(LayoutStr)
+             .Case(".8b", A64Layout::VL_8B)
+             .Case(".4h", A64Layout::VL_4H)
+             .Case(".2s", A64Layout::VL_2S)
+             .Case(".1d", A64Layout::VL_1D)
+             .Case(".16b", A64Layout::VL_16B)
+             .Case(".8h", A64Layout::VL_8H)
+             .Case(".4s", A64Layout::VL_4S)
+             .Case(".2d", A64Layout::VL_2D)
+             .Case(".b", A64Layout::VL_B)
+             .Case(".h", A64Layout::VL_H)
+             .Case(".s", A64Layout::VL_S)
+             .Case(".d", A64Layout::VL_D)
+             .Default(A64Layout::Invalid);
+}
+
 namespace A64SysReg {
   enum SysRegROValues {
     MDCCSR_EL0        = 0x9808, // 10  011  0000  0001  000
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index e8c2f7c..ff585b4 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -657,6 +657,13 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
         Modified = true;
         for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(),
                E = Uses.end(); I != E; ++I) {
+          // Make sure to constrain the register class of the new register to
+          // match what we're replacing. Otherwise we can optimize a DPR_VFP2
+          // reference into a plain DPR, and that will end poorly. NewReg is
+          // always virtual here, so there will always be a matching subclass
+          // to find.
+          MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));
+
           DEBUG(dbgs() << "Replacing operand "
                        << **I << " with "
                        << PrintReg(NewReg) << "\n");
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index e5da3a5..36e5680 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -45,7 +45,7 @@ def FeatureFP16   : SubtargetFeature<"fp16", "HasFP16", "true",
 def FeatureVFP4   : SubtargetFeature<"vfp4", "HasVFPv4", "true",
                                      "Enable VFP4 instructions",
                                      [FeatureVFP3, FeatureFP16]>;
-def FeatureV8FP : SubtargetFeature<"v8fp", "HasV8FP",
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
                                    "true", "Enable ARMv8 FP",
                                    [FeatureVFP4]>;
 def FeatureD16    : SubtargetFeature<"d16", "HasD16", "true",
@@ -67,6 +67,11 @@ def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
                            "Enable support for Performance Monitor extensions">;
 def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
                           "Enable support for TrustZone security extensions">;
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+                          "Enable support for Cryptography extensions",
+                          [FeatureNEON]>;
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+                          "Enable support for CRC instructions">;
 
 // Some processors have FP multiply-accumulate instructions that don't
 // play nicely with other VFP / NEON instructions, and it's generally better
@@ -114,10 +119,24 @@ def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true",
 def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
                                  "Supports Multiprocessing extension">;
 
-// M-series ISA?
-def FeatureMClass : SubtargetFeature<"mclass", "IsMClass", "true",
+// Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8).
+def FeatureVirtualization : SubtargetFeature<"virtualization",
+                                 "HasVirtualization", "true",
+                                 "Supports Virtualization extension",
+                                 [FeatureHWDiv, FeatureHWDivARM]>;
+
+// M-series ISA
+def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
                                      "Is microcontroller profile ('M' series)">;
 
+// R-series ISA
+def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
+                                     "Is realtime profile ('R' series)">;
+
+// A-series ISA
+def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
+                                     "Is application profile ('A' series)">;
+
 // Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
 // See ARMInstrInfo.td for details.
 def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
@@ -135,15 +154,19 @@ def HasV5TEOps  : SubtargetFeature<"v5te", "HasV5TEOps", "true",
 def HasV6Ops    : SubtargetFeature<"v6", "HasV6Ops", "true",
                                    "Support ARM v6 instructions",
                                    [HasV5TEOps]>;
+def HasV6MOps   : SubtargetFeature<"v6m", "HasV6MOps", "true",
+                                   "Support ARM v6M instructions",
+                                   [HasV6Ops]>;
 def HasV6T2Ops  : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
                                    "Support ARM v6t2 instructions",
-                                   [HasV6Ops, FeatureThumb2]>;
+                                   [HasV6MOps, FeatureThumb2]>;
 def HasV7Ops    : SubtargetFeature<"v7", "HasV7Ops", "true",
                                    "Support ARM v7 instructions",
                                    [HasV6T2Ops, FeaturePerfMon]>;
 def HasV8Ops    : SubtargetFeature<"v8", "HasV8Ops", "true",
                                    "Support ARM v8 instructions",
-                                   [HasV7Ops]>;
+                                   [HasV7Ops, FeatureVirtualization,
+                                    FeatureMP]>;
 
 //===----------------------------------------------------------------------===//
 // ARM Processors supported.
@@ -179,9 +202,23 @@ def ProcSwift   : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
 // FIXME: It has not been determined if A15 has these features.
 def ProcA15      : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
                                    "Cortex-A15 ARM processors",
-                                   [FeatureT2XtPk, FeatureFP16, FeatureVFP4,
+                                   [FeatureT2XtPk, FeatureVFP4,
+                                    FeatureMP, FeatureHWDiv, FeatureHWDivARM,
                                     FeatureAvoidPartialCPSR,
-                                    FeatureTrustZone]>;
+                                    FeatureTrustZone, FeatureVirtualization]>;
+
+def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+                                   "Cortex-A53 ARM processors",
+                                   [FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureTrustZone, FeatureT2XtPk,
+                                    FeatureCrypto, FeatureCRC]>;
+
+def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+                                   "Cortex-A57 ARM processors",
+                                   [FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureTrustZone, FeatureT2XtPk,
+                                    FeatureCrypto, FeatureCRC]>;
+
 def ProcR5      : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
                                    "Cortex-R5 ARM processors",
                                    [FeatureSlowFPBrcc,
@@ -243,7 +280,7 @@ def : Processor<"mpcore",           ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
                                                        FeatureHasSlowFPVMLx]>;
 
 // V6M Processors.
-def : Processor<"cortex-m0",        ARMV6Itineraries, [HasV6Ops, FeatureNoARM,
+def : Processor<"cortex-m0",        ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
                                                        FeatureDB, FeatureMClass]>;
 
 // V6T2 Processors.
@@ -258,26 +295,30 @@ def : Processor<"arm1156t2f-s",     ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2,
 def : ProcessorModel<"cortex-a5",   CortexA8Model,
                                     [ProcA5, HasV7Ops, FeatureNEON, FeatureDB,
                                      FeatureVFP4, FeatureDSPThumb2,
-                                     FeatureHasRAS]>;
+                                     FeatureHasRAS, FeatureAClass]>;
 def : ProcessorModel<"cortex-a8",   CortexA8Model,
                                     [ProcA8, HasV7Ops, FeatureNEON, FeatureDB,
-                                     FeatureDSPThumb2, FeatureHasRAS]>;
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
 def : ProcessorModel<"cortex-a9",   CortexA9Model,
                                     [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
-                                     FeatureDSPThumb2, FeatureHasRAS]>;
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
 def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
                                     [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
                                      FeatureDSPThumb2, FeatureMP,
-                                     FeatureHasRAS]>;
+                                     FeatureHasRAS, FeatureAClass]>;
 // FIXME: A15 has currently the same ProcessorModel as A9.
 def : ProcessorModel<"cortex-a15",   CortexA9Model,
                                     [ProcA15, HasV7Ops, FeatureNEON, FeatureDB,
-                                     FeatureDSPThumb2, FeatureHasRAS]>;
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
 // FIXME: R5 has currently the same ProcessorModel as A8.
 def : ProcessorModel<"cortex-r5",   CortexA8Model,
                                     [ProcR5, HasV7Ops, FeatureDB,
                                      FeatureVFP3, FeatureDSPThumb2,
-                                     FeatureHasRAS]>;
+                                     FeatureHasRAS, FeatureVFPOnlySP,
+                                     FeatureD16, FeatureRClass]>;
 
 // V7M Processors.
 def : ProcNoItin<"cortex-m3",       [HasV7Ops,
@@ -289,16 +330,22 @@ def : ProcNoItin<"cortex-m4",       [HasV7Ops,
                                      FeatureThumb2, FeatureNoARM, FeatureDB,
                                      FeatureHWDiv, FeatureDSPThumb2,
                                      FeatureT2XtPk, FeatureVFP4,
-                                     FeatureVFPOnlySP, FeatureMClass]>;
+                                     FeatureVFPOnlySP, FeatureD16,
+                                     FeatureMClass]>;
 
 // Swift uArch Processors.
 def : ProcessorModel<"swift",       SwiftModel,
                                     [ProcSwift, HasV7Ops, FeatureNEON,
                                      FeatureDB, FeatureDSPThumb2,
-                                     FeatureHasRAS]>;
+                                     FeatureHasRAS, FeatureAClass]>;
 
 // V8 Processors
-def : ProcNoItin<"cortex-a53",      [HasV8Ops]>;
+def : ProcNoItin<"cortex-a53",      [ProcA53, HasV8Ops, FeatureAClass,
+                                    FeatureDB, FeatureFPARMv8,
+                                    FeatureNEON, FeatureDSPThumb2]>;
+def : ProcNoItin<"cortex-a57",      [ProcA57, HasV8Ops, FeatureAClass,
+                                    FeatureDB, FeatureFPARMv8,
+                                    FeatureNEON, FeatureDSPThumb2]>;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 13a22b1..e79f88d 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -17,6 +17,7 @@
 #include "ARM.h"
 #include "ARMBuildAttrs.h"
 #include "ARMConstantPoolValue.h"
+#include "ARMFPUName.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMTargetMachine.h"
 #include "ARMTargetObjectFile.h"
@@ -55,164 +56,6 @@
 #include <cctype>
 using namespace llvm;
 
-namespace {
-
-  // Per section and per symbol attributes are not supported.
-  // To implement them we would need the ability to delay this emission
-  // until the assembly file is fully parsed/generated as only then do we
-  // know the symbol and section numbers.
-  class AttributeEmitter {
-  public:
-    virtual void MaybeSwitchVendor(StringRef Vendor) = 0;
-    virtual void EmitAttribute(unsigned Attribute, unsigned Value) = 0;
-    virtual void EmitTextAttribute(unsigned Attribute, StringRef String) = 0;
-    virtual void Finish() = 0;
-    virtual ~AttributeEmitter() {}
-  };
-
-  class AsmAttributeEmitter : public AttributeEmitter {
-    MCStreamer &Streamer;
-
-  public:
-    AsmAttributeEmitter(MCStreamer &Streamer_) : Streamer(Streamer_) {}
-    void MaybeSwitchVendor(StringRef Vendor) { }
-
-    void EmitAttribute(unsigned Attribute, unsigned Value) {
-      Streamer.EmitRawText("\t.eabi_attribute " +
-                           Twine(Attribute) + ", " + Twine(Value));
-    }
-
-    void EmitTextAttribute(unsigned Attribute, StringRef String) {
-      switch (Attribute) {
-      default: llvm_unreachable("Unsupported Text attribute in ASM Mode");
-      case ARMBuildAttrs::CPU_name:
-        Streamer.EmitRawText(StringRef("\t.cpu ") + String.lower());
-        break;
-      /* GAS requires .fpu to be emitted regardless of EABI attribute */
-      case ARMBuildAttrs::Advanced_SIMD_arch:
-      case ARMBuildAttrs::VFP_arch:
-        Streamer.EmitRawText(StringRef("\t.fpu ") + String.lower());
-        break;
-      }
-    }
-    void Finish() { }
-  };
-
-  class ObjectAttributeEmitter : public AttributeEmitter {
-    // This structure holds all attributes, accounting for
-    // their string/numeric value, so we can later emmit them
-    // in declaration order, keeping all in the same vector
-    struct AttributeItemType {
-      enum {
-        HiddenAttribute = 0,
-        NumericAttribute,
-        TextAttribute
-      } Type;
-      unsigned Tag;
-      unsigned IntValue;
-      StringRef StringValue;
-    } AttributeItem;
-
-    MCObjectStreamer &Streamer;
-    StringRef CurrentVendor;
-    SmallVector<AttributeItemType, 64> Contents;
-
-    // Account for the ULEB/String size of each item,
-    // not just the number of items
-    size_t ContentsSize;
-    // FIXME: this should be in a more generic place, but
-    // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf
-    size_t getULEBSize(int Value) {
-      size_t Size = 0;
-      do {
-        Value >>= 7;
-        Size += sizeof(int8_t); // Is this really necessary?
-      } while (Value);
-      return Size;
-    }
-
-  public:
-    ObjectAttributeEmitter(MCObjectStreamer &Streamer_) :
-      Streamer(Streamer_), CurrentVendor(""), ContentsSize(0) { }
-
-    void MaybeSwitchVendor(StringRef Vendor) {
-      assert(!Vendor.empty() && "Vendor cannot be empty.");
-
-      if (CurrentVendor.empty())
-        CurrentVendor = Vendor;
-      else if (CurrentVendor == Vendor)
-        return;
-      else
-        Finish();
-
-      CurrentVendor = Vendor;
-
-      assert(Contents.size() == 0);
-    }
-
-    void EmitAttribute(unsigned Attribute, unsigned Value) {
-      AttributeItemType attr = {
-        AttributeItemType::NumericAttribute,
-        Attribute,
-        Value,
-        StringRef("")
-      };
-      ContentsSize += getULEBSize(Attribute);
-      ContentsSize += getULEBSize(Value);
-      Contents.push_back(attr);
-    }
-
-    void EmitTextAttribute(unsigned Attribute, StringRef String) {
-      AttributeItemType attr = {
-        AttributeItemType::TextAttribute,
-        Attribute,
-        0,
-        String
-      };
-      ContentsSize += getULEBSize(Attribute);
-      // String + \0
-      ContentsSize += String.size()+1;
-
-      Contents.push_back(attr);
-    }
-
-    void Finish() {
-      // Vendor size + Vendor name + '\0'
-      const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
-
-      // Tag + Tag Size
-      const size_t TagHeaderSize = 1 + 4;
-
-      Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
-      Streamer.EmitBytes(CurrentVendor);
-      Streamer.EmitIntValue(0, 1); // '\0'
-
-      Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
-      Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
-
-      // Size should have been accounted for already, now
-      // emit each field as its type (ULEB or String)
-      for (unsigned int i=0; i<Contents.size(); ++i) {
-        AttributeItemType item = Contents[i];
-        Streamer.EmitULEB128IntValue(item.Tag);
-        switch (item.Type) {
-        default: llvm_unreachable("Invalid attribute type");
-        case AttributeItemType::NumericAttribute:
-          Streamer.EmitULEB128IntValue(item.IntValue);
-          break;
-        case AttributeItemType::TextAttribute:
-          Streamer.EmitBytes(item.StringValue.upper());
-          Streamer.EmitIntValue(0, 1); // '\0'
-          break;
-        }
-      }
-
-      Contents.clear();
-    }
-  };
-
-} // end of anonymous namespace
-
 /// EmitDwarfRegOp - Emit dwarf register operation.
 void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc,
                                    bool Indirect) const {
@@ -302,7 +145,7 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
   const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
   assert(GV && "C++ constructor pointer was not a GlobalValue!");
 
-  const MCExpr *E = MCSymbolRefExpr::Create(Mang->getSymbol(GV),
+  const MCExpr *E = MCSymbolRefExpr::Create(getSymbol(GV),
                                             (Subtarget->isTargetDarwin()
                                              ? MCSymbolRefExpr::VK_None
                                              : MCSymbolRefExpr::VK_ARM_TARGET1),
@@ -363,7 +206,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
              (TF & ARMII::MO_HI16))
       O << ":upper16:";
-    O << *Mang->getSymbol(GV);
+    O << *getSymbol(GV);
 
     printOffset(MO.getOffset(), O);
     if (TF == ARMII::MO_PLT)
@@ -496,6 +339,23 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
       if (!FlagsOP.isImm())
         return true;
       unsigned Flags = FlagsOP.getImm();
+
+      // This operand may not be the one that actually provides the register. If
+      // it's tied to a previous one then we should refer instead to that one
+      // for registers and their classes.
+      unsigned TiedIdx;
+      if (InlineAsm::isUseOperandTiedToDef(Flags, TiedIdx)) {
+        for (OpNum = InlineAsm::MIOp_FirstOperand; TiedIdx; --TiedIdx) {
+          unsigned OpFlags = MI->getOperand(OpNum).getImm();
+          OpNum += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+        }
+        Flags = MI->getOperand(OpNum).getImm();
+
+        // Later code expects OpNum to be pointing at the register rather than
+        // the flags.
+        OpNum += 1;
+      }
+
       unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
       unsigned RC;
       InlineAsm::hasRegClassConstraint(Flags, RC);
@@ -714,11 +574,6 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
     // generates code that does this, it is always safe to set.
     OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
   }
-  // FIXME: This should eventually end up somewhere else where more
-  // intelligent flag decisions can be made. For now we are just maintaining
-  // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
-  if (MCELFStreamer *MES = dyn_cast<MCELFStreamer>(&OutStreamer))
-    MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
 }
 
 //===----------------------------------------------------------------------===//
@@ -728,150 +583,150 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
 // to appear in the .ARM.attributes section in ELF.
 // Instead of subclassing the MCELFStreamer, we do the work here.
 
-void ARMAsmPrinter::emitAttributes() {
-
-  emitARMAttributeSection();
-
-  /* GAS expect .fpu to be emitted, regardless of VFP build attribute */
-  bool emitFPU = false;
-  AttributeEmitter *AttrEmitter;
-  if (OutStreamer.hasRawTextSupport()) {
-    AttrEmitter = new AsmAttributeEmitter(OutStreamer);
-    emitFPU = true;
-  } else {
-    MCObjectStreamer &O = static_cast<MCObjectStreamer&>(OutStreamer);
-    AttrEmitter = new ObjectAttributeEmitter(O);
-  }
-
-  AttrEmitter->MaybeSwitchVendor("aeabi");
-
-  std::string CPUString = Subtarget->getCPUString();
+static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
+                                            const ARMSubtarget *Subtarget) {
+  if (CPU == "xscale")
+    return ARMBuildAttrs::v5TEJ;
 
-  if (CPUString == "cortex-a8" ||
-      Subtarget->isCortexA8()) {
-    AttrEmitter->EmitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a8");
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
-                               ARMBuildAttrs::ApplicationProfile);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
-                               ARMBuildAttrs::Allowed);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
-                               ARMBuildAttrs::AllowThumb32);
-    // Fixme: figure out when this is emitted.
-    //AttrEmitter->EmitAttribute(ARMBuildAttrs::WMMX_arch,
-    //                           ARMBuildAttrs::AllowWMMXv1);
-    //
-
-    /// ADD additional Else-cases here!
-  } else if (CPUString == "xscale") {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TEJ);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
-                               ARMBuildAttrs::Allowed);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
-                               ARMBuildAttrs::Allowed);
-  } else if (Subtarget->hasV8Ops())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v8);
+  if (Subtarget->hasV8Ops())
+    return ARMBuildAttrs::v8;
   else if (Subtarget->hasV7Ops()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
-                               ARMBuildAttrs::AllowThumb32);
+    if (Subtarget->isMClass() && Subtarget->hasThumb2DSP())
+      return ARMBuildAttrs::v7E_M;
+    return ARMBuildAttrs::v7;
   } else if (Subtarget->hasV6T2Ops())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6T2);
+    return ARMBuildAttrs::v6T2;
+  else if (Subtarget->hasV6MOps())
+    return ARMBuildAttrs::v6S_M;
   else if (Subtarget->hasV6Ops())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6);
+    return ARMBuildAttrs::v6;
   else if (Subtarget->hasV5TEOps())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TE);
+    return ARMBuildAttrs::v5TE;
   else if (Subtarget->hasV5TOps())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5T);
+    return ARMBuildAttrs::v5T;
   else if (Subtarget->hasV4TOps())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
+    return ARMBuildAttrs::v4T;
   else
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4);
+    return ARMBuildAttrs::v4;
+}
 
-  if (Subtarget->hasNEON() && emitFPU) {
-    /* NEON is not exactly a VFP architecture, but GAS emit one of
-     * neon/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
-    if (Subtarget->hasVFP4())
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
-                                     "neon-vfpv4");
-    else
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon");
-    /* If emitted for NEON, omit from VFP below, since you can have both
-     * NEON and VFP in build attributes but only one .fpu */
-    emitFPU = false;
+void ARMAsmPrinter::emitAttributes() {
+  MCTargetStreamer &TS = OutStreamer.getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+
+  ATS.switchVendor("aeabi");
+
+  std::string CPUString = Subtarget->getCPUString();
+
+  if (CPUString != "generic")
+    ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
+
+  ATS.emitAttribute(ARMBuildAttrs::CPU_arch,
+                    getArchForCPU(CPUString, Subtarget));
+
+  if (Subtarget->isAClass()) {
+    ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                      ARMBuildAttrs::ApplicationProfile);
+  } else if (Subtarget->isRClass()) {
+    ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                      ARMBuildAttrs::RealTimeProfile);
+  } else if (Subtarget->isMClass()){
+    ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                      ARMBuildAttrs::MicroControllerProfile);
   }
 
-  /* V8FP + .fpu */
-  if (Subtarget->hasV8FP()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
-                               ARMBuildAttrs::AllowV8FPA);
-    if (emitFPU)
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "v8fp");
-    /* VFPv4 + .fpu */
-  } else if (Subtarget->hasVFP4()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
-                               ARMBuildAttrs::AllowFPv4A);
-    if (emitFPU)
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv4");
-
-  /* VFPv3 + .fpu */
-  } else if (Subtarget->hasVFP3()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
-                               ARMBuildAttrs::AllowFPv3A);
-    if (emitFPU)
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv3");
-
-  /* VFPv2 + .fpu */
-  } else if (Subtarget->hasVFP2()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
-                               ARMBuildAttrs::AllowFPv2);
-    if (emitFPU)
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv2");
+  ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use, Subtarget->hasARMOps() ?
+                      ARMBuildAttrs::Allowed : ARMBuildAttrs::Not_Allowed);
+  if (Subtarget->isThumb1Only()) {
+    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                      ARMBuildAttrs::Allowed);
+  } else if (Subtarget->hasThumb2()) {
+    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                      ARMBuildAttrs::AllowThumb32);
   }
 
-  /* TODO: ARMBuildAttrs::Allowed is not completely accurate,
-   * since NEON can have 1 (allowed) or 2 (MAC operations) */
   if (Subtarget->hasNEON()) {
-    if (Subtarget->hasV8Ops())
-      AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
-                                 ARMBuildAttrs::AllowedNeonV8);
+    /* NEON is not exactly a VFP architecture, but GAS emit one of
+     * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
+    if (Subtarget->hasFPARMv8()) {
+      if (Subtarget->hasCrypto())
+        ATS.emitFPU(ARM::CRYPTO_NEON_FP_ARMV8);
+      else
+        ATS.emitFPU(ARM::NEON_FP_ARMV8);
+    }
+    else if (Subtarget->hasVFP4())
+      ATS.emitFPU(ARM::NEON_VFPV4);
     else
-      AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
-                                 ARMBuildAttrs::Allowed);
+      ATS.emitFPU(ARM::NEON);
+    // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture
+    if (Subtarget->hasV8Ops())
+      ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+                        ARMBuildAttrs::AllowNeonARMv8);
+  } else {
+    if (Subtarget->hasFPARMv8())
+      ATS.emitFPU(ARM::FP_ARMV8);
+    else if (Subtarget->hasVFP4())
+      ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV4_D16 : ARM::VFPV4);
+    else if (Subtarget->hasVFP3())
+      ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV3_D16 : ARM::VFPV3);
+    else if (Subtarget->hasVFP2())
+      ATS.emitFPU(ARM::VFPV2);
   }
 
   // Signal various FP modes.
   if (!TM.Options.UnsafeFPMath) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_denormal,
-                               ARMBuildAttrs::Allowed);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
-                               ARMBuildAttrs::Allowed);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::Allowed);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
+                      ARMBuildAttrs::Allowed);
   }
 
   if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath)
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
-                               ARMBuildAttrs::Allowed);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                      ARMBuildAttrs::Allowed);
   else
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
-                               ARMBuildAttrs::AllowIEE754);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                      ARMBuildAttrs::AllowIEE754);
 
   // FIXME: add more flags to ARMBuildAttrs.h
   // 8-bytes alignment stuff.
-  AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_needed, 1);
-  AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1);
+  ATS.emitAttribute(ARMBuildAttrs::ABI_align8_needed, 1);
+  ATS.emitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1);
+
+  // ABI_HardFP_use attribute to indicate single precision FP.
+  if (Subtarget->isFPOnlySP())
+    ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
+                      ARMBuildAttrs::HardFPSinglePrecision);
 
   // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
-  if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_HardFP_use, 3);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_VFP_args, 1);
-  }
+  if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
+    ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS);
+
   // FIXME: Should we signal R9 usage?
 
-  if (Subtarget->hasDivide())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::DIV_use, 1);
+  if (Subtarget->hasFP16())
+      ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP);
+
+  if (Subtarget->hasMPExtension())
+      ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
+
+  if (Subtarget->hasDivide()) {
+    // Check if hardware divide is only available in thumb2 or ARM as well.
+    ATS.emitAttribute(ARMBuildAttrs::DIV_use,
+      Subtarget->hasDivideInARMMode() ? ARMBuildAttrs::AllowDIVExt :
+                                        ARMBuildAttrs::AllowDIVIfExists);
+  }
 
-  AttrEmitter->Finish();
-  delete AttrEmitter;
+  if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization())
+      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                        ARMBuildAttrs::AllowTZVirtualization);
+  else if (Subtarget->hasTrustZone())
+      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                        ARMBuildAttrs::AllowTZ);
+  else if (Subtarget->hasVirtualization())
+      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                        ARMBuildAttrs::AllowVirtualization);
+
+  ATS.finishAttributeSection();
 }
 
 void ARMAsmPrinter::emitARMAttributeSection() {
@@ -923,7 +778,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
   bool isIndirect = Subtarget->isTargetDarwin() &&
     Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
   if (!isIndirect)
-    return Mang->getSymbol(GV);
+    return getSymbol(GV);
 
   // FIXME: Remove this when Darwin transition to @GOT like syntax.
   MCSymbol *MCSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
@@ -934,7 +789,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
     MMIMachO.getGVStubEntry(MCSym);
   if (StubSym.getPointer() == 0)
     StubSym = MachineModuleInfoImpl::
-      StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+      StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
   return MCSym;
 }
 
@@ -1111,6 +966,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
   assert(MI->getFlag(MachineInstr::FrameSetup) &&
       "Only instruction which are involved into frame setup code are allowed");
 
+  MCTargetStreamer &TS = OutStreamer.getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
   const MachineFunction &MF = *MI->getParent()->getParent();
   const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
   const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
@@ -1173,7 +1030,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
       RegList.push_back(SrcReg);
       break;
     }
-    OutStreamer.EmitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+    ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
   } else {
     // Changes of stack / frame pointer.
     if (SrcReg == ARM::SP) {
@@ -1221,11 +1078,11 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
       if (DstReg == FramePtr && FramePtr != ARM::SP)
         // Set-up of the frame pointer. Positive values correspond to "add"
         // instruction.
-        OutStreamer.EmitSetFP(FramePtr, ARM::SP, -Offset);
+        ATS.emitSetFP(FramePtr, ARM::SP, -Offset);
       else if (DstReg == ARM::SP) {
         // Change of SP by an offset. Positive values correspond to "sub"
         // instruction.
-        OutStreamer.EmitPad(Offset);
+        ATS.emitPad(Offset);
       } else {
         MI->dump();
         llvm_unreachable("Unsupported opcode for unwinding information");
@@ -1366,7 +1223,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addReg(0));
 
     const GlobalValue *GV = MI->getOperand(0).getGlobal();
-    MCSymbol *GVSym = Mang->getSymbol(GV);
+    MCSymbol *GVSym = getSymbol(GV);
     const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(ARM::Bcc)
       .addExpr(GVSymExpr)
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 977d936..f835a4e 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ARMBaseInstrInfo.h"
 #include "ARM.h"
+#include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
 #include "ARMConstantPoolValue.h"
+#include "ARMFeatures.h"
 #include "ARMHazardRecognizer.h"
 #include "ARMMachineFunctionInfo.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
@@ -36,7 +37,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "ARMGenInstrInfo.inc"
 
 using namespace llvm;
@@ -520,11 +521,17 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
   if (!MI->isPredicable())
     return false;
 
-  if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) {
-    ARMFunctionInfo *AFI =
-      MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
-    return AFI->isThumb2Function();
+  ARMFunctionInfo *AFI =
+    MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
+
+  if (AFI->isThumb2Function()) {
+    if (getSubtarget().restrictIT())
+      return isV8EligibleForIT(MI);
+  } else { // non-Thumb
+    if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+      return false;
   }
+
   return true;
 }
 
@@ -645,16 +652,16 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                    unsigned DestReg, unsigned SrcReg,
                                    bool KillSrc) const {
   bool GPRDest = ARM::GPRRegClass.contains(DestReg);
-  bool GPRSrc  = ARM::GPRRegClass.contains(SrcReg);
+  bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
 
   if (GPRDest && GPRSrc) {
     AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
-                                  .addReg(SrcReg, getKillRegState(KillSrc))));
+                                    .addReg(SrcReg, getKillRegState(KillSrc))));
     return;
   }
 
   bool SPRDest = ARM::SPRRegClass.contains(DestReg);
-  bool SPRSrc  = ARM::SPRRegClass.contains(SrcReg);
+  bool SPRSrc = ARM::SPRRegClass.contains(SrcReg);
 
   unsigned Opc = 0;
   if (SPRDest && SPRSrc)
@@ -683,26 +690,47 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   int Spacing = 1;
 
   // Use VORRq when possible.
-  if (ARM::QQPRRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 2;
-  else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 4;
+  if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VORRq;
+    BeginIdx = ARM::qsub_0;
+    SubRegs = 2;
+  } else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VORRq;
+    BeginIdx = ARM::qsub_0;
+    SubRegs = 4;
   // Fall back to VMOVD.
-  else if (ARM::DPairRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2;
-  else if (ARM::DTripleRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3;
-  else if (ARM::DQuadRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4;
-  else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::MOVr, BeginIdx = ARM::gsub_0, SubRegs = 2;
-
-  else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2, Spacing = 2;
-  else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3, Spacing = 2;
-  else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4, Spacing = 2;
+  } else if (ARM::DPairRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 2;
+  } else if (ARM::DTripleRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 3;
+  } else if (ARM::DQuadRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 4;
+  } else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg)) {
+    Opc = Subtarget.isThumb2() ? ARM::tMOVr : ARM::MOVr;
+    BeginIdx = ARM::gsub_0;
+    SubRegs = 2;
+  } else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 2;
+    Spacing = 2;
+  } else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 3;
+    Spacing = 2;
+  } else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 4;
+    Spacing = 2;
+  }
 
   assert(Opc && "Impossible reg-to-reg copy");
 
@@ -711,22 +739,21 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   // Copy register tuples backward when the first Dest reg overlaps with SrcReg.
   if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
-    BeginIdx = BeginIdx + ((SubRegs-1)*Spacing);
+    BeginIdx = BeginIdx + ((SubRegs - 1) * Spacing);
     Spacing = -Spacing;
   }
 #ifndef NDEBUG
   SmallSet<unsigned, 4> DstRegs;
 #endif
   for (unsigned i = 0; i != SubRegs; ++i) {
-    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing);
-    unsigned Src = TRI->getSubReg(SrcReg,  BeginIdx + i*Spacing);
+    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+    unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
     assert(Dst && Src && "Bad sub-register");
 #ifndef NDEBUG
     assert(!DstRegs.count(Src) && "destructive vector copy");
     DstRegs.insert(Dst);
 #endif
-    Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
-      .addReg(Src);
+    Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst).addReg(Src);
     // VORR takes two source operands.
     if (Opc == ARM::VORRq)
       Mov.addReg(Src);
@@ -1404,9 +1431,11 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
   case ARM::VLDRD:
   case ARM::VLDRS:
   case ARM::t2LDRi8:
+  case ARM::t2LDRBi8:
   case ARM::t2LDRDi8:
   case ARM::t2LDRSHi8:
   case ARM::t2LDRi12:
+  case ARM::t2LDRBi12:
   case ARM::t2LDRSHi12:
     break;
   }
@@ -1423,8 +1452,10 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
   case ARM::VLDRD:
   case ARM::VLDRS:
   case ARM::t2LDRi8:
+  case ARM::t2LDRBi8:
   case ARM::t2LDRSHi8:
   case ARM::t2LDRi12:
+  case ARM::t2LDRBi12:
   case ARM::t2LDRSHi12:
     break;
   }
@@ -1471,7 +1502,16 @@ bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
   if ((Offset2 - Offset1) / 8 > 64)
     return false;
 
-  if (Load1->getMachineOpcode() != Load2->getMachineOpcode())
+  // Check if the machine opcodes are different. If they are different
+  // then we consider them to not be of the same base address,
+  // EXCEPT in the case of Thumb2 byte loads where one is LDRBi8 and the other LDRBi12.
+  // In this case, they are considered to be the same because they are different
+  // encoding forms of the same basic instruction.
+  if ((Load1->getMachineOpcode() != Load2->getMachineOpcode()) &&
+      !((Load1->getMachineOpcode() == ARM::t2LDRBi8 &&
+         Load2->getMachineOpcode() == ARM::t2LDRBi12) ||
+        (Load1->getMachineOpcode() == ARM::t2LDRBi12 &&
+         Load2->getMachineOpcode() == ARM::t2LDRBi8)))
     return false;  // FIXME: overly conservative?
 
   // Four loads in a row should be sufficient.
@@ -1686,7 +1726,7 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
                                                bool PreferFalse) const {
   assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
          "Unknown select instruction");
-  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this);
   bool Invert = !DefMI;
   if (!DefMI)
@@ -1694,11 +1734,17 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
   if (!DefMI)
     return 0;
 
+  // Find new register class to use.
+  MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
+  unsigned       DestReg  = MI->getOperand(0).getReg();
+  const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
+  if (!MRI.constrainRegClass(DestReg, PreviousClass))
+    return 0;
+
   // Create a new predicated version of DefMI.
   // Rfalse is the first use.
   MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                                      DefMI->getDesc(),
-                                      MI->getOperand(0).getReg());
+                                      DefMI->getDesc(), DestReg);
 
   // Copy all the DefMI operands, excluding its (null) predicate.
   const MCInstrDesc &DefDesc = DefMI->getDesc();
@@ -1721,7 +1767,6 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
   // register operand tied to the first def.
   // The tie makes the register allocator ensure the FalseReg is allocated the
   // same register as operand 0.
-  MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
   FalseReg.setImplicit();
   NewMI.addOperand(FalseReg);
   NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
@@ -1781,6 +1826,14 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
                                unsigned DestReg, unsigned BaseReg, int NumBytes,
                                ARMCC::CondCodes Pred, unsigned PredReg,
                                const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+  if (NumBytes == 0 && DestReg != BaseReg) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
+      .addReg(BaseReg, RegState::Kill)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+      .setMIFlags(MIFlags);
+    return;
+  }
+
   bool isSub = NumBytes < 0;
   if (isSub) NumBytes = -NumBytes;
 
@@ -1804,6 +1857,115 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
   }
 }
 
+bool llvm::tryFoldSPUpdateIntoPushPop(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      unsigned NumBytes) {
+  // This optimisation potentially adds lots of load and store
+  // micro-operations, it's only really a great benefit to code-size.
+  if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize))
+    return false;
+
+  // If only one register is pushed/popped, LLVM can use an LDR/STR
+  // instead. We can't modify those so make sure we're dealing with an
+  // instruction we understand.
+  bool IsPop = isPopOpcode(MI->getOpcode());
+  bool IsPush = isPushOpcode(MI->getOpcode());
+  if (!IsPush && !IsPop)
+    return false;
+
+  bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD ||
+                      MI->getOpcode() == ARM::VLDMDIA_UPD;
+  bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH ||
+                     MI->getOpcode() == ARM::tPOP ||
+                     MI->getOpcode() == ARM::tPOP_RET;
+
+  assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP &&
+                          MI->getOperand(1).getReg() == ARM::SP)) &&
+         "trying to fold sp update into non-sp-updating push/pop");
+
+  // The VFP push & pop act on D-registers, so we can only fold an adjustment
+  // by a multiple of 8 bytes in correctly. Similarly rN is 4-bytes. Don't try
+  // if this is violated.
+  if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0)
+    return false;
+
+  // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+  // pred) so the list starts at 4. Thumb1 starts after the predicate.
+  int RegListIdx = IsT1PushPop ? 2 : 4;
+
+  // Calculate the space we'll need in terms of registers.
+  unsigned FirstReg = MI->getOperand(RegListIdx).getReg();
+  unsigned RD0Reg, RegsNeeded;
+  if (IsVFPPushPop) {
+    RD0Reg = ARM::D0;
+    RegsNeeded = NumBytes / 8;
+  } else {
+    RD0Reg = ARM::R0;
+    RegsNeeded = NumBytes / 4;
+  }
+
+  // We're going to have to strip all list operands off before
+  // re-adding them since the order matters, so save the existing ones
+  // for later.
+  SmallVector<MachineOperand, 4> RegList;
+  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+    RegList.push_back(MI->getOperand(i));
+
+  MachineBasicBlock *MBB = MI->getParent();
+  const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo();
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
+  // Now try to find enough space in the reglist to allocate NumBytes.
+  for (unsigned CurReg = FirstReg - 1; CurReg >= RD0Reg && RegsNeeded;
+       --CurReg) {
+    if (!IsPop) {
+      // Pushing any register is completely harmless, mark the
+      // register involved as undef since we don't care about it in
+      // the slightest.
+      RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
+                                                  false, false, true));
+      --RegsNeeded;
+      continue;
+    }
+
+    // However, we can only pop an extra register if it's not live. For
+    // registers live within the function we might clobber a return value
+    // register; the other way a register can be live here is if it's
+    // callee-saved.
+    if (isCalleeSavedRegister(CurReg, CSRegs) ||
+        MBB->computeRegisterLiveness(TRI, CurReg, MI) !=
+            MachineBasicBlock::LQR_Dead) {
+      // VFP pops don't allow holes in the register list, so any skip is fatal
+      // for our transformation. GPR pops do, so we should just keep looking.
+      if (IsVFPPushPop)
+        return false;
+      else
+        continue;
+    }
+
+    // Mark the unimportant registers as <def,dead> in the POP.
+    RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, false,
+                                                true));
+    --RegsNeeded;
+  }
+
+  if (RegsNeeded > 0)
+    return false;
+
+  // Finally we know we can profitably perform the optimisation so go
+  // ahead: strip all existing registers off and add them back again
+  // in the right order.
+  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+    MI->RemoveOperand(i);
+
+  // Add the complete list back in.
+  MachineInstrBuilder MIB(MF, &*MI);
+  for (int i = RegList.size() - 1; i >= 0; --i)
+    MIB.addOperand(RegList[i]);
+
+  return true;
+}
+
 bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
                                 unsigned FrameReg, int &Offset,
                                 const ARMBaseInstrInfo &TII) {
@@ -2210,8 +2372,32 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
           isSafe = true;
           break;
         }
-        // Condition code is after the operand before CPSR.
-        ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm();
+        // Condition code is after the operand before CPSR except for VSELs.
+        ARMCC::CondCodes CC;
+        bool IsInstrVSel = true;
+        switch (Instr.getOpcode()) {
+        default:
+          IsInstrVSel = false;
+          CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm();
+          break;
+        case ARM::VSELEQD:
+        case ARM::VSELEQS:
+          CC = ARMCC::EQ;
+          break;
+        case ARM::VSELGTD:
+        case ARM::VSELGTS:
+          CC = ARMCC::GT;
+          break;
+        case ARM::VSELGED:
+        case ARM::VSELGES:
+          CC = ARMCC::GE;
+          break;
+        case ARM::VSELVSS:
+        case ARM::VSELVSD:
+          CC = ARMCC::VS;
+          break;
+        }
+
         if (Sub) {
           ARMCC::CondCodes NewCC = getSwappedCondition(CC);
           if (NewCC == ARMCC::AL)
@@ -2222,11 +2408,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
           // If it is safe to remove CmpInstr, the condition code of these
           // operands will be modified.
           if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
-              Sub->getOperand(2).getReg() == SrcReg)
-            OperandsToUpdate.push_back(std::make_pair(&((*I).getOperand(IO-1)),
-                                                      NewCC));
-        }
-        else
+              Sub->getOperand(2).getReg() == SrcReg) {
+            // VSel doesn't support condition code update.
+            if (IsInstrVSel)
+              return false;
+            OperandsToUpdate.push_back(
+                std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
+          }
+        } else
           switch (CC) {
           default:
             // CPSR can be used multiple times, we should continue.
@@ -3582,6 +3771,24 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   return Latency;
 }
 
+unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const {
+   if (MI->isCopyLike() || MI->isInsertSubreg() ||
+      MI->isRegSequence() || MI->isImplicitDef())
+    return 0;
+
+  if (MI->isBundle())
+    return 0;
+
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions, this apparently increases their latencies.
+    return 1;
+  }
+  return 0;
+}
+
 unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
                                            const MachineInstr *MI,
                                            unsigned *PredCost) const {
@@ -4114,7 +4321,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI,
   // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
   // the full D-register by loading the same value to both lanes.  The
   // instruction is micro-coded with 2 uops, so don't do this until we can
-  // properly schedule micro-coded instuctions.  The dispatcher stalls cause
+  // properly schedule micro-coded instructions.  The dispatcher stalls cause
   // too big regressions.
 
   // Insert the dependency-breaking FCONSTD before MI.
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 96f8637..93e5964 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -264,6 +264,8 @@ private:
                         const MCInstrDesc &UseMCID,
                         unsigned UseIdx, unsigned UseAlign) const;
 
+  unsigned getPredicationCost(const MachineInstr *MI) const;
+
   unsigned getInstrLatency(const InstrItineraryData *ItinData,
                            const MachineInstr *MI,
                            unsigned *PredCost = 0) const;
@@ -360,6 +362,17 @@ bool isIndirectBranchOpcode(int Opc) {
   return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
 }
 
+static inline bool isPopOpcode(int Opc) {
+  return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET ||
+         Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD ||
+         Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD;
+}
+
+static inline bool isPushOpcode(int Opc) {
+  return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD ||
+         Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
+}
+
 /// getInstrPredicate - If instruction is predicated, returns its predicate
 /// condition, otherwise returns AL. It also returns the condition code
 /// register by reference.
@@ -399,6 +412,13 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
                                const ARMBaseRegisterInfo& MRI,
                                unsigned MIFlags = 0);
 
+/// Tries to add registers to the reglist of a given base-updating
+/// push/pop instruction to adjust the stack by an additional
+/// NumBytes. This can save a few bytes per function in code-size, but
+/// obviously generates more memory traffic. As such, it only takes
+/// effect in functions being optimised for size.
+bool tryFoldSPUpdateIntoPushPop(MachineFunction &MF, MachineInstr *MI,
+                                unsigned NumBytes);
 
 /// rewriteARMFrameIndex / rewriteT2FrameIndex -
 /// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 58c06e3..8717dc0 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -51,20 +51,34 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti)
 
 const uint16_t*
 ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  bool ghcCall = false;
- 
-  if (MF) {
-    const Function *F = MF->getFunction();
-    ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
-  }
- 
-  if (ghcCall)
+  const uint16_t *RegList = (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+                                ? CSR_iOS_SaveList
+                                : CSR_AAPCS_SaveList;
+
+  if (!MF) return RegList;
+
+  const Function *F = MF->getFunction();
+  if (F->getCallingConv() == CallingConv::GHC) {
     // GHC set of callee saved regs is empty as all those regs are
     // used for passing STG regs around
     return CSR_NoRegs_SaveList;
-  else
-    return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
-      ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
+  } else if (F->hasFnAttribute("interrupt")) {
+    if (STI.isMClass()) {
+      // M-class CPUs have hardware which saves the registers needed to allow a
+      // function conforming to the AAPCS to function as a handler.
+      return CSR_AAPCS_SaveList;
+    } else if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ") {
+      // Fast interrupt mode gives the handler a private copy of R8-R14, so less
+      // need to be saved to restore user-mode state.
+      return CSR_FIQ_SaveList;
+    } else {
+      // Generally only R13-R14 (i.e. SP, LR) are automatically preserved by
+      // exception handling.
+      return CSR_GenericInt_SaveList;
+    }
+  }
+
+  return RegList;
 }
 
 const uint32_t*
@@ -371,14 +385,6 @@ ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return ARM::SP;
 }
 
-unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
-
 /// emitLoadConstPool - Emits a load from constpool to materialize the
 /// specified immediate.
 void ARMBaseRegisterInfo::
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index cdaad05..e28fff6 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -72,6 +72,14 @@ static inline bool isARMArea3Register(unsigned Reg, bool isIOS) {
   }
 }
 
+static inline bool isCalleeSavedRegister(unsigned Reg,
+                                         const MCPhysReg *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
 class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
 protected:
   const ARMSubtarget &STI;
@@ -149,10 +157,6 @@ public:
   unsigned getFrameRegister(const MachineFunction &MF) const;
   unsigned getBaseRegister() const { return BasePtr; }
 
-  // Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
-
   bool isLowRegister(unsigned Reg) const;
 
 
diff --git a/lib/Target/ARM/ARMBuildAttrs.h b/lib/Target/ARM/ARMBuildAttrs.h
index f614dca..b16d4ef 100644
--- a/lib/Target/ARM/ARMBuildAttrs.h
+++ b/lib/Target/ARM/ARMBuildAttrs.h
@@ -15,11 +15,13 @@
 #ifndef __TARGET_ARMBUILDATTRS_H__
 #define __TARGET_ARMBUILDATTRS_H__
 
+namespace llvm {
 namespace ARMBuildAttrs {
+
   enum SpecialAttr {
     // This is for the .cpu asm attr. It translates into one or more
     // AttrType (below) entries in the .ARM.attributes section in the ELF.
-    SEL_CPU 
+    SEL_CPU
   };
 
   enum AttrType {
@@ -57,7 +59,7 @@ namespace ARMBuildAttrs {
     ABI_FP_optimization_goals = 31,
     compatibility             = 32,
     CPU_unaligned_access      = 34,
-    VFP_HP_extension          = 36,
+    FP_HP_extension           = 36,
     ABI_FP_16bit_format       = 38,
     MPextension_use           = 42, // was 70, 2.08 ABI
     DIV_use                   = 44,
@@ -93,7 +95,7 @@ namespace ARMBuildAttrs {
     v8       = 14   // v8, AArch32
   };
 
-  enum CPUArchProfile { // (=7), uleb128 
+  enum CPUArchProfile { // (=7), uleb128
     Not_Applicable = 0, // pre v7, or cross-profile code
     ApplicationProfile = (0x41), // 'A' (e.g. for Cortex A8)
     RealTimeProfile = (0x52), // 'R' (e.g. for Cortex R4)
@@ -102,34 +104,67 @@ namespace ARMBuildAttrs {
   };
 
   // The following have a lot of common use cases
-  enum { 
-    //ARMISAUse (=8), uleb128  and THUMBISAUse (=9), uleb128
+  enum {
     Not_Allowed = 0,
     Allowed = 1,
-    AllowedNeonV8 = 3,
 
-    // FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10)
+    // Tag_ARM_ISA_use (=8), uleb128
+
+    // Tag_THUMB_ISA_use, (=9), uleb128
+    AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions)
+
+    // Tag_FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10)
     AllowFPv2  = 2, // v2 FP ISA permitted (implies use of the v1 FP ISA)
     AllowFPv3A = 3, // v3 FP ISA permitted (implies use of the v2 FP ISA)
-    AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31 
-    AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA) 
+    AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31
+    AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA)
     AllowFPv4B = 6, // v4 FP ISA was permitted, but only D0-D15, S0-S31
-    AllowV8FPA = 7, // Use of the ARM v8-A FP ISA was permitted
-    AllowV8FPB = 8, // Use of the ARM v8-A FP ISA was permitted, but only D0-D15, S0-S31
+    AllowFPARMv8A = 7, // Use of the ARM v8-A FP ISA was permitted
+    AllowFPARMv8B = 8, // Use of the ARM v8-A FP ISA was permitted, but only D0-D15, S0-S31
 
     // Tag_WMMX_arch, (=11), uleb128
-    AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions)
-    
-    // Tag_WMMX_arch, (=11), uleb128
-    AllowWMMXv1 = 2,  // The user permitted this entity to use WMMX v2
+    AllowWMMXv1 = 1,  // The user permitted this entity to use WMMX v1
+    AllowWMMXv2 = 2,  // The user permitted this entity to use WMMX v2
+
+    // Tag_Advanced_SIMD_arch, (=12), uleb128
+    AllowNeon = 1, // SIMDv1 was permitted
+    AllowNeon2 = 2, // SIMDv2 was permitted (Half-precision FP, MAC operations)
+    AllowNeonARMv8 = 3, // ARM v8-A SIMD was permitted
 
-    // Tag_ABI_FP_denormal, (=20), uleb128 
+    // Tag_ABI_FP_denormal, (=20), uleb128
     PreserveFPSign = 2, // sign when flushed-to-zero is preserved
 
     // Tag_ABI_FP_number_model, (=23), uleb128
     AllowRTABI = 2,  // numbers, infinities, and one quiet NaN (see [RTABI])
-    AllowIEE754 = 3 // this code to use all the IEEE 754-defined FP encodings
+    AllowIEE754 = 3, // this code to use all the IEEE 754-defined FP encodings
+
+    // Tag_ABI_HardFP_use, (=27), uleb128
+    HardFPImplied = 0, // FP use should be implied by Tag_FP_arch
+    HardFPSinglePrecision = 1, // Single-precision only
+
+    // Tag_ABI_VFP_args, (=28), uleb128
+    BaseAAPCS = 0,
+    HardFPAAPCS = 1,
+
+    // Tag_FP_HP_extension, (=36), uleb128
+    AllowHPFP = 1, // Allow use of Half Precision FP
+
+    // Tag_MPextension_use, (=42), uleb128
+    AllowMP = 1, // Allow use of MP extensions
+
+    // Tag_DIV_use, (=44), uleb128
+    AllowDIVIfExists = 0, // Allow hardware divide if available in arch, or no info exists.
+    DisallowDIV = 1, // Hardware divide explicitly disallowed
+    AllowDIVExt = 2, // Allow hardware divide as optional architecture extension above
+                     // the base arch specified by Tag_CPU_arch and Tag_CPU_arch_profile.
+
+    // Tag_Virtualization_use, (=68), uleb128
+    AllowTZ = 1,
+    AllowVirtualization = 2,
+    AllowTZVirtualization = 3
   };
-}
+
+} // namespace ARMBuildAttrs
+} // namespace llvm
 
 #endif // __TARGET_ARMBUILDATTRS_H__
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 89c5223..9bea4b2 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -207,4 +207,24 @@ def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
 def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
 
 def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
-                                          (sub CSR_AAPCS_ThisReturn, R9))>;
+                                         (sub CSR_AAPCS_ThisReturn, R9))>;
+
+// The "interrupt" attribute is used to generate code that is acceptable in
+// exception-handlers of various kinds. It makes us use a different return
+// instruction (handled elsewhere) and affects which registers we must return to
+// our "caller" in the same state as we receive them.
+
+// For most interrupts, all registers except SP and LR are shared with
+// user-space. We mark LR to be saved anyway, since this is what the ARM backend
+// generally does rather than tracking its liveness as a normal register.
+def CSR_GenericInt : CalleeSavedRegs<(add LR, (sequence "R%u", 12, 0))>;
+
+// The fast interrupt handlers have more private state and get their own copies
+// of R8-R12, in addition to SP and LR. As before, mark LR for saving too.
+
+// FIXME: we mark R11 as callee-saved since it's often the frame-pointer, and
+// current frame lowering expects to encounter it while processing callee-saved
+// registers.
+def CSR_FIQ : CalleeSavedRegs<(add LR, R11, (sequence "R%u", 7, 0))>;
+
+
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 4e703ec..7d41c69 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -163,21 +163,7 @@ const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
 
 int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
                                                        unsigned Alignment) {
-  unsigned AlignMask = Alignment - 1;
-  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
-  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
-    if (Constants[i].isMachineConstantPoolEntry() &&
-        (Constants[i].getAlignment() & AlignMask) == 0) {
-      ARMConstantPoolValue *CPV =
-        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
-      ARMConstantPoolConstant *APC = dyn_cast<ARMConstantPoolConstant>(CPV);
-      if (!APC) continue;
-      if (APC->CVal == CVal && equals(APC))
-        return i;
-    }
-  }
-
-  return -1;
+  return getExistingMachineCPValueImpl<ARMConstantPoolConstant>(CP, Alignment);
 }
 
 bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) {
@@ -216,22 +202,7 @@ ARMConstantPoolSymbol::Create(LLVMContext &C, const char *s,
 
 int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
                                                      unsigned Alignment) {
-  unsigned AlignMask = Alignment - 1;
-  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
-  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
-    if (Constants[i].isMachineConstantPoolEntry() &&
-        (Constants[i].getAlignment() & AlignMask) == 0) {
-      ARMConstantPoolValue *CPV =
-        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
-      ARMConstantPoolSymbol *APS = dyn_cast<ARMConstantPoolSymbol>(CPV);
-      if (!APS) continue;
-
-      if (APS->S == S && equals(APS))
-        return i;
-    }
-  }
-
-  return -1;
+  return getExistingMachineCPValueImpl<ARMConstantPoolSymbol>(CP, Alignment);
 }
 
 bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) {
@@ -271,22 +242,7 @@ ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
 
 int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
                                                   unsigned Alignment) {
-  unsigned AlignMask = Alignment - 1;
-  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
-  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
-    if (Constants[i].isMachineConstantPoolEntry() &&
-        (Constants[i].getAlignment() & AlignMask) == 0) {
-      ARMConstantPoolValue *CPV =
-        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
-      ARMConstantPoolMBB *APMBB = dyn_cast<ARMConstantPoolMBB>(CPV);
-      if (!APMBB) continue;
-
-      if (APMBB->MBB == MBB && equals(APMBB))
-        return i;
-    }
-  }
-
-  return -1;
+  return getExistingMachineCPValueImpl<ARMConstantPoolMBB>(CP, Alignment);
 }
 
 bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) {
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 93812fe..7ae7bf4 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -15,6 +15,7 @@
 #define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
 
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <cstddef>
 
@@ -64,6 +65,26 @@ protected:
   ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind,
                        unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
                        bool AddCurrentAddress);
+
+  template <typename Derived>
+  int getExistingMachineCPValueImpl(MachineConstantPool *CP,
+                                    unsigned Alignment) {
+    unsigned AlignMask = Alignment - 1;
+    const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+    for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+      if (Constants[i].isMachineConstantPoolEntry() &&
+          (Constants[i].getAlignment() & AlignMask) == 0) {
+        ARMConstantPoolValue *CPV =
+            (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+        if (Derived *APC = dyn_cast<Derived>(CPV))
+          if (cast<Derived>(this)->equals(APC))
+            return i;
+      }
+    }
+
+    return -1;
+  }
+
 public:
   virtual ~ARMConstantPoolValue();
 
@@ -156,6 +177,10 @@ public:
   static bool classof(const ARMConstantPoolValue *APV) {
     return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA();
   }
+
+  bool equals(const ARMConstantPoolConstant *A) const {
+    return CVal == A->CVal && ARMConstantPoolValue::equals(A);
+  }
 };
 
 /// ARMConstantPoolSymbol - ARM-specific constantpool values for external
@@ -187,6 +212,10 @@ public:
   static bool classof(const ARMConstantPoolValue *ACPV) {
     return ACPV->isExtSymbol();
   }
+
+  bool equals(const ARMConstantPoolSymbol *A) const {
+    return S == A->S && ARMConstantPoolValue::equals(A);
+  }
 };
 
 /// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic
@@ -219,6 +248,10 @@ public:
   static bool classof(const ARMConstantPoolValue *ACPV) {
     return ACPV->isMachineBasicBlock();
   }
+
+  bool equals(const ARMConstantPoolMBB *A) const {
+    return MBB == A->MBB && ARMConstantPoolValue::equals(A);
+  }
 };
 
 } // End llvm namespace
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index beb843c..e6f7f86 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -692,10 +692,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
               MI.getOperand(1).getReg())
-        .addReg(MI.getOperand(2).getReg(),
-                getKillRegState(MI.getOperand(2).isKill()))
+        .addOperand(MI.getOperand(2))
         .addImm(MI.getOperand(3).getImm()) // 'pred'
-        .addReg(MI.getOperand(4).getReg());
+        .addOperand(MI.getOperand(4));
 
       MI.eraseFromParent();
       return true;
@@ -705,10 +704,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr;
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
               MI.getOperand(1).getReg())
-        .addReg(MI.getOperand(2).getReg(),
-                getKillRegState(MI.getOperand(2).isKill()))
+        .addOperand(MI.getOperand(2))
         .addImm(MI.getOperand(3).getImm()) // 'pred'
-        .addReg(MI.getOperand(4).getReg())
+        .addOperand(MI.getOperand(4))
         .addReg(0); // 's' bit
 
       MI.eraseFromParent();
@@ -717,39 +715,36 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     case ARM::MOVCCsi: {
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
               (MI.getOperand(1).getReg()))
-        .addReg(MI.getOperand(2).getReg(),
-                getKillRegState(MI.getOperand(2).isKill()))
+        .addOperand(MI.getOperand(2))
         .addImm(MI.getOperand(3).getImm())
         .addImm(MI.getOperand(4).getImm()) // 'pred'
-        .addReg(MI.getOperand(5).getReg())
+        .addOperand(MI.getOperand(5))
         .addReg(0); // 's' bit
 
       MI.eraseFromParent();
       return true;
     }
-
     case ARM::MOVCCsr: {
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
               (MI.getOperand(1).getReg()))
-        .addReg(MI.getOperand(2).getReg(),
-                getKillRegState(MI.getOperand(2).isKill()))
-        .addReg(MI.getOperand(3).getReg(),
-                getKillRegState(MI.getOperand(3).isKill()))
+        .addOperand(MI.getOperand(2))
+        .addOperand(MI.getOperand(3))
         .addImm(MI.getOperand(4).getImm())
         .addImm(MI.getOperand(5).getImm()) // 'pred'
-        .addReg(MI.getOperand(6).getReg())
+        .addOperand(MI.getOperand(6))
         .addReg(0); // 's' bit
 
       MI.eraseFromParent();
       return true;
     }
+    case ARM::t2MOVCCi16:
     case ARM::MOVCCi16: {
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi16),
+      unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
               MI.getOperand(1).getReg())
         .addImm(MI.getOperand(2).getImm())
         .addImm(MI.getOperand(3).getImm()) // 'pred'
-        .addReg(MI.getOperand(4).getReg());
-
+        .addOperand(MI.getOperand(4));
       MI.eraseFromParent();
       return true;
     }
@@ -760,23 +755,47 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
               MI.getOperand(1).getReg())
         .addImm(MI.getOperand(2).getImm())
         .addImm(MI.getOperand(3).getImm()) // 'pred'
-        .addReg(MI.getOperand(4).getReg())
+        .addOperand(MI.getOperand(4))
         .addReg(0); // 's' bit
 
       MI.eraseFromParent();
       return true;
     }
+    case ARM::t2MVNCCi:
     case ARM::MVNCCi: {
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi),
+      unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
               MI.getOperand(1).getReg())
         .addImm(MI.getOperand(2).getImm())
         .addImm(MI.getOperand(3).getImm()) // 'pred'
-        .addReg(MI.getOperand(4).getReg())
+        .addOperand(MI.getOperand(4))
         .addReg(0); // 's' bit
 
       MI.eraseFromParent();
       return true;
     }
+    case ARM::t2MOVCClsl:
+    case ARM::t2MOVCClsr:
+    case ARM::t2MOVCCasr:
+    case ARM::t2MOVCCror: {
+      unsigned NewOpc;
+      switch (Opcode) {
+      case ARM::t2MOVCClsl: NewOpc = ARM::t2LSLri; break;
+      case ARM::t2MOVCClsr: NewOpc = ARM::t2LSRri; break;
+      case ARM::t2MOVCCasr: NewOpc = ARM::t2ASRri; break;
+      case ARM::t2MOVCCror: NewOpc = ARM::t2RORri; break;
+      default: llvm_unreachable("unexpeced conditional move");
+      }
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+              MI.getOperand(1).getReg())
+        .addOperand(MI.getOperand(2))
+        .addImm(MI.getOperand(3).getImm())
+        .addImm(MI.getOperand(4).getImm()) // 'pred'
+        .addOperand(MI.getOperand(5))
+        .addReg(0); // 's' bit
+      MI.eraseFromParent();
+      return true;
+    }
     case ARM::Int_eh_sjlj_dispatchsetup: {
       MachineFunction &MF = *MI.getParent()->getParent();
       const ARMBaseInstrInfo *AII =
@@ -823,7 +842,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
 
     case ARM::MOVsrl_flag:
     case ARM::MOVsra_flag: {
-      // These are just fancy MOVs insructions.
+      // These are just fancy MOVs instructions.
       AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
                              MI.getOperand(0).getReg())
                      .addOperand(MI.getOperand(1))
@@ -938,6 +957,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       ExpandMOV32BitImm(MBB, MBBI);
       return true;
 
+    case ARM::SUBS_PC_LR: {
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC)
+              .addReg(ARM::LR)
+              .addOperand(MI.getOperand(0))
+              .addOperand(MI.getOperand(1))
+              .addOperand(MI.getOperand(2))
+              .addReg(ARM::CPSR, RegState::Undef);
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
     case ARM::VLDMQIA: {
       unsigned NewOpc = ARM::VLDMDIA;
       MachineInstrBuilder MIB =
diff --git a/lib/Target/ARM/ARMFPUName.def b/lib/Target/ARM/ARMFPUName.def
new file mode 100644
index 0000000..9a1bbe7
--- /dev/null
+++ b/lib/Target/ARM/ARMFPUName.def
@@ -0,0 +1,32 @@
+//===-- ARMFPUName.def - List of the ARM FPU names --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the list of the supported ARM FPU names.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifndef ARM_FPU_NAME
+#error "You must define ARM_FPU_NAME(NAME, ID) before including ARMFPUName.h"
+#endif
+
+ARM_FPU_NAME("vfp", VFP)
+ARM_FPU_NAME("vfpv2", VFPV2)
+ARM_FPU_NAME("vfpv3", VFPV3)
+ARM_FPU_NAME("vfpv3-d16", VFPV3_D16)
+ARM_FPU_NAME("vfpv4", VFPV4)
+ARM_FPU_NAME("vfpv4-d16", VFPV4_D16)
+ARM_FPU_NAME("fp-armv8", FP_ARMV8)
+ARM_FPU_NAME("neon", NEON)
+ARM_FPU_NAME("neon-vfpv4", NEON_VFPV4)
+ARM_FPU_NAME("neon-fp-armv8", NEON_FP_ARMV8)
+ARM_FPU_NAME("crypto-neon-fp-armv8", CRYPTO_NEON_FP_ARMV8)
+
+#undef ARM_FPU_NAME
diff --git a/lib/Target/ARM/ARMFPUName.h b/lib/Target/ARM/ARMFPUName.h
new file mode 100644
index 0000000..2a64cce
--- /dev/null
+++ b/lib/Target/ARM/ARMFPUName.h
@@ -0,0 +1,26 @@
+//===-- ARMFPUName.h - List of the ARM FPU names ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMFPUNAME_H
+#define ARMFPUNAME_H
+
+namespace llvm {
+namespace ARM {
+
+enum FPUKind {
+  INVALID_FPU = 0
+
+#define ARM_FPU_NAME(NAME, ID) , ID
+#include "ARMFPUName.def"
+};
+
+} // namespace ARM
+} // namespace llvm
+
+#endif // ARMFPUNAME_H
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index ed054aa..a4004f3 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -176,6 +176,8 @@ class ARMFastISel : public FastISel {
 
     // Utility routines.
   private:
+    unsigned constrainOperandRegClass(const MCInstrDesc &II, unsigned OpNum,
+                                      unsigned Op);
     bool isTypeLegal(Type *Ty, MVT &VT);
     bool isLoadTypeLegal(Type *Ty, MVT &VT);
     bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
@@ -252,10 +254,10 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
 bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) {
   const MCInstrDesc &MCID = MI->getDesc();
 
-  // If we're a thumb2 or not NEON function we were handled via isPredicable.
+  // If we're a thumb2 or not NEON function we'll be handled via isPredicable.
   if ((MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON ||
        AFI->isThumb2Function())
-    return false;
+    return MI->isPredicable();
 
   for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
     if (MCID.OpInfo[i].isPredicate())
@@ -276,7 +278,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
   // Do we use a predicate? or...
   // Are we NEON in ARM mode and have a predicate operand? If so, I know
   // we're not predicable but add it anyways.
-  if (TII.isPredicable(MI) || isARMNEONPred(MI))
+  if (isARMNEONPred(MI))
     AddDefaultPred(MIB);
 
   // Do we optionally set a predicate?  Preds is size > 0 iff the predicate
@@ -291,6 +293,23 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
   return MIB;
 }
 
+unsigned ARMFastISel::constrainOperandRegClass(const MCInstrDesc &II,
+                                               unsigned Op, unsigned OpNum) {
+  if (TargetRegisterInfo::isVirtualRegister(Op)) {
+    const TargetRegisterClass *RegClass =
+        TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF);
+    if (!MRI.constrainRegClass(Op, RegClass)) {
+      // If it's not legal to COPY between the register classes, something
+      // has gone very wrong before we got here.
+      unsigned NewOp = createResultReg(RegClass);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(TargetOpcode::COPY), NewOp).addReg(Op));
+      return NewOp;
+    }
+  }
+  return Op;
+}
+
 unsigned ARMFastISel::FastEmitInst_(unsigned MachineInstOpcode,
                                     const TargetRegisterClass* RC) {
   unsigned ResultReg = createResultReg(RC);
@@ -306,6 +325,9 @@ unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill));
@@ -326,6 +348,11 @@ unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
+
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill)
@@ -349,6 +376,12 @@ unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
+  Op2 = constrainOperandRegClass(II, Op1, 3);
+
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill)
@@ -373,6 +406,9 @@ unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill)
@@ -395,6 +431,9 @@ unsigned ARMFastISel::FastEmitInst_rf(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill)
@@ -418,6 +457,10 @@ unsigned ARMFastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill)
@@ -610,6 +653,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
                     .addConstantPoolIndex(Idx));
   else
     // The extra immediate is for addrmode2.
+    DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                             TII.get(ARM::LDRcp), DestReg)
                     .addConstantPoolIndex(Idx)
@@ -685,6 +729,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
       AddOptionalDefs(MIB);
     } else {
       // The extra immediate is for addrmode2.
+      DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
       MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp),
                     DestReg)
         .addConstantPoolIndex(Idx)
@@ -855,13 +900,8 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
               TmpOffset += CI->getSExtValue() * S;
               break;
             }
-            if (isa<AddOperator>(Op) &&
-                (!isa<Instruction>(Op) ||
-                 FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
-                 == FuncInfo.MBB) &&
-                isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
-              // An add (in the same block) with a constant operand. Fold the
-              // constant.
+            if (canFoldAddIntoGEP(U, Op)) {
+              // A compatible add with a constant operand. Fold the constant.
               ConstantInt *CI =
               cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
               TmpOffset += CI->getSExtValue() * S;
@@ -1139,6 +1179,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
         (const TargetRegisterClass*)&ARM::tGPRRegClass :
         (const TargetRegisterClass*)&ARM::GPRRegClass);
       unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
+      SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1);
       AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                               TII.get(Opc), Res)
                       .addReg(SrcReg).addImm(1));
@@ -1210,6 +1251,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
   ARMSimplifyAddress(Addr, VT, useAM3);
 
   // Create the base instruction, then add the operands.
+  SrcReg = constrainOperandRegClass(TII.get(StrOpc), SrcReg, 0);
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                     TII.get(StrOpc))
                             .addReg(SrcReg);
@@ -1333,6 +1375,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
         (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
       unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
       unsigned OpReg = getRegForValue(TI->getOperand(0));
+      OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0);
       AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                               TII.get(TstOpc))
                       .addReg(OpReg).addImm(1));
@@ -1370,6 +1413,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
   // and it left a value for us in a virtual register.  Ergo, we test
   // the one-bit value left in the virtual register.
   unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+  CmpReg = constrainOperandRegClass(TII.get(TstOpc), CmpReg, 0);
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc))
                   .addReg(CmpReg).addImm(1));
 
@@ -1494,13 +1538,15 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
     }
   }
 
+  const MCInstrDesc &II = TII.get(CmpOpc);
+  SrcReg1 = constrainOperandRegClass(II, SrcReg1, 0);
   if (!UseImm) {
-    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                            TII.get(CmpOpc))
+    SrcReg2 = constrainOperandRegClass(II, SrcReg2, 1);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
                     .addReg(SrcReg1).addReg(SrcReg2));
   } else {
     MachineInstrBuilder MIB;
-    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
       .addReg(SrcReg1);
 
     // Only add immediate for icmp as the immediate for fcmp is an implicit 0.0.
@@ -1699,6 +1745,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
   }
 
   unsigned CmpOpc = isThumb2 ? ARM::t2CMPri : ARM::CMPri;
+  CondReg = constrainOperandRegClass(TII.get(CmpOpc), CondReg, 0);
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
                   .addReg(CondReg).addImm(0));
 
@@ -1715,12 +1762,16 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
       MovCCOpc = isThumb2 ? ARM::t2MVNCCi : ARM::MVNCCi;
   }
   unsigned ResultReg = createResultReg(RC);
-  if (!UseImm)
+  if (!UseImm) {
+    Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1);
+    Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
     .addReg(Op2Reg).addReg(Op1Reg).addImm(ARMCC::NE).addReg(ARM::CPSR);
-  else
+  } else {
+    Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 1);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
     .addReg(Op1Reg).addImm(Imm).addImm(ARMCC::EQ).addReg(ARM::CPSR);
+  }
   UpdateValueMap(I, ResultReg);
   return true;
 }
@@ -1806,6 +1857,8 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
   if (SrcReg2 == 0) return false;
 
   unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+  SrcReg1 = constrainOperandRegClass(TII.get(Opc), SrcReg1, 1);
+  SrcReg2 = constrainOperandRegClass(TII.get(Opc), SrcReg2, 2);
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                           TII.get(Opc), ResultReg)
                   .addReg(SrcReg1).addReg(SrcReg2));
@@ -1933,7 +1986,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
           !VA.isRegLoc() || !ArgLocs[++i].isRegLoc())
         return false;
     } else {
-      switch (static_cast<EVT>(ArgVT).getSimpleVT().SimpleTy) {
+      switch (ArgVT.SimpleTy) {
       default:
         return false;
       case MVT::i1:
@@ -2410,15 +2463,22 @@ bool ARMFastISel::SelectCall(const Instruction *I,
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
                                     DL, TII.get(CallOpc));
 
+  unsigned char OpFlags = 0;
+
+  // Add MO_PLT for global address or external symbol in the PIC relocation
+  // model.
+  if (Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_)
+    OpFlags = ARMII::MO_PLT;
+
   // ARM calls don't take a predicate, but tBL / tBLX do.
   if(isThumb2)
     AddDefaultPred(MIB);
   if (UseReg)
     MIB.addReg(CalleeReg);
   else if (!IntrMemName)
-    MIB.addGlobalAddress(GV, 0, 0);
+    MIB.addGlobalAddress(GV, 0, OpFlags);
   else
-    MIB.addExternalSymbol(IntrMemName, 0);
+    MIB.addExternalSymbol(IntrMemName, OpFlags);
 
   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
@@ -2731,6 +2791,7 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
         *FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opcode), ResultReg);
     if (setsCPSR)
       MIB.addReg(ARM::CPSR, RegState::Define);
+    SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR);
     AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc));
     if (hasS)
       AddDefaultCC(MIB);
@@ -2965,12 +3026,14 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
   unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT));
   // Load value.
   if (isThumb2) {
+    DestReg1 = constrainOperandRegClass(TII.get(ARM::t2LDRpci), DestReg1, 0);
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                             TII.get(ARM::t2LDRpci), DestReg1)
                     .addConstantPoolIndex(Idx));
     Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs;
   } else {
     // The extra immediate is for addrmode2.
+    DestReg1 = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg1, 0);
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
                             DL, TII.get(ARM::LDRcp), DestReg1)
                     .addConstantPoolIndex(Idx).addImm(0));
@@ -2984,6 +3047,9 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
   }
 
   unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT));
+  DestReg2 = constrainOperandRegClass(TII.get(Opc), DestReg2, 0);
+  DestReg1 = constrainOperandRegClass(TII.get(Opc), DestReg1, 1);
+  GlobalBaseReg = constrainOperandRegClass(TII.get(Opc), GlobalBaseReg, 2);
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
                                     DL, TII.get(Opc), DestReg2)
                             .addReg(DestReg1)
@@ -3049,7 +3115,7 @@ bool ARMFastISel::FastLowerArguments() {
     ARM::R0, ARM::R1, ARM::R2, ARM::R3
   };
 
-  const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::i32);
+  const TargetRegisterClass *RC = &ARM::rGPRRegClass;
   Idx = 0;
   for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
        I != E; ++I, ++Idx) {
diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h
new file mode 100644
index 0000000..dafc4b3
--- /dev/null
+++ b/lib/Target/ARM/ARMFeatures.h
@@ -0,0 +1,93 @@
+//===-- ARMFeatures.h - Checks for ARM instruction features ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the code shared between ARM CodeGen and ARM MC
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARM_FEATURES_H
+#define TARGET_ARM_FEATURES_H
+
+#include "ARM.h"
+
+namespace llvm {
+
+template<typename InstrType> // could be MachineInstr or MCInst
+inline bool isV8EligibleForIT(InstrType *Instr, int BLXOperandIndex = 0) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case ARM::tADC:
+  case ARM::tADDi3:
+  case ARM::tADDi8:
+  case ARM::tADDrSPi:
+  case ARM::tADDrr:
+  case ARM::tAND:
+  case ARM::tASRri:
+  case ARM::tASRrr:
+  case ARM::tBIC:
+  case ARM::tCMNz:
+  case ARM::tCMPi8:
+  case ARM::tCMPr:
+  case ARM::tEOR:
+  case ARM::tLDRBi:
+  case ARM::tLDRBr:
+  case ARM::tLDRHi:
+  case ARM::tLDRHr:
+  case ARM::tLDRSB:
+  case ARM::tLDRSH:
+  case ARM::tLDRi:
+  case ARM::tLDRr:
+  case ARM::tLDRspi:
+  case ARM::tLSLri:
+  case ARM::tLSLrr:
+  case ARM::tLSRri:
+  case ARM::tLSRrr:
+  case ARM::tMOVi8:
+  case ARM::tMUL:
+  case ARM::tMVN:
+  case ARM::tORR:
+  case ARM::tROR:
+  case ARM::tRSB:
+  case ARM::tSBC:
+  case ARM::tSTRBi:
+  case ARM::tSTRBr:
+  case ARM::tSTRHi:
+  case ARM::tSTRHr:
+  case ARM::tSTRi:
+  case ARM::tSTRr:
+  case ARM::tSTRspi:
+  case ARM::tSUBi3:
+  case ARM::tSUBi8:
+  case ARM::tSUBrr:
+  case ARM::tTST:
+    return true;
+// there are some "conditionally deprecated" opcodes
+  case ARM::tADDspr:
+    return Instr->getOperand(2).getReg() != ARM::PC;
+  // ADD PC, SP and BLX PC were always unpredictable,
+  // now on top of it they're deprecated
+  case ARM::tADDrSP:
+  case ARM::tBX:
+    return Instr->getOperand(0).getReg() != ARM::PC;
+  case ARM::tBLXr:
+    return Instr->getOperand(BLXOperandIndex).getReg() != ARM::PC;
+  case ARM::tADDhirr:
+    return Instr->getOperand(0).getReg() != ARM::PC &&
+           Instr->getOperand(2).getReg() != ARM::PC;
+  case ARM::tCMPhir:
+  case ARM::tMOVr:
+    return Instr->getOperand(0).getReg() != ARM::PC &&
+           Instr->getOperand(1).getReg() != ARM::PC;
+  }
+}
+
+}
+
+#endif
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index c8637be..d32bdbc 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -82,22 +82,11 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
   return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects();
 }
 
-static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
-  for (unsigned i = 0; CSRegs[i]; ++i)
-    if (Reg == CSRegs[i])
-      return true;
-  return false;
-}
-
 static bool isCSRestore(MachineInstr *MI,
                         const ARMBaseInstrInfo &TII,
                         const uint16_t *CSRegs) {
   // Integer spill area is handled with "pop".
-  if (MI->getOpcode() == ARM::LDMIA_RET ||
-      MI->getOpcode() == ARM::t2LDMIA_RET ||
-      MI->getOpcode() == ARM::LDMIA_UPD ||
-      MI->getOpcode() == ARM::t2LDMIA_UPD ||
-      MI->getOpcode() == ARM::VLDMDIA_UPD) {
+  if (isPopOpcode(MI->getOpcode())) {
     // The first two operands are predicates. The last two are
     // imp-def and imp-use of SP. Check everything in between.
     for (int i = 5, e = MI->getNumOperands(); i != e; ++i)
@@ -115,20 +104,31 @@ static bool isCSRestore(MachineInstr *MI,
   return false;
 }
 
-static void
-emitSPUpdate(bool isARM,
-             MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
-             DebugLoc dl, const ARMBaseInstrInfo &TII,
-             int NumBytes, unsigned MIFlags = MachineInstr::NoFlags,
-             ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                                 const ARMBaseInstrInfo &TII, unsigned DestReg,
+                                 unsigned SrcReg, int NumBytes,
+                                 unsigned MIFlags = MachineInstr::NoFlags,
+                                 ARMCC::CondCodes Pred = ARMCC::AL,
+                                 unsigned PredReg = 0) {
   if (isARM)
-    emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+    emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
                             Pred, PredReg, TII, MIFlags);
   else
-    emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+    emitT2RegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
                            Pred, PredReg, TII, MIFlags);
 }
 
+static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                         const ARMBaseInstrInfo &TII, int NumBytes,
+                         unsigned MIFlags = MachineInstr::NoFlags,
+                         ARMCC::CondCodes Pred = ARMCC::AL,
+                         unsigned PredReg = 0) {
+  emitRegPlusImmediate(isARM, MBB, MBBI, dl, TII, ARM::SP, ARM::SP, NumBytes,
+                       MIFlags, Pred, PredReg);
+}
+
 void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -175,6 +175,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
     unsigned Reg = CSI[i].getReg();
     int FI = CSI[i].getFrameIdx();
     switch (Reg) {
+    case ARM::R0:
+    case ARM::R1:
+    case ARM::R2:
+    case ARM::R3:
     case ARM::R4:
     case ARM::R5:
     case ARM::R6:
@@ -182,73 +186,61 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
     case ARM::LR:
       if (Reg == FramePtr)
         FramePtrSpillFI = FI;
-      AFI->addGPRCalleeSavedArea1Frame(FI);
       GPRCS1Size += 4;
       break;
     case ARM::R8:
     case ARM::R9:
     case ARM::R10:
     case ARM::R11:
+    case ARM::R12:
       if (Reg == FramePtr)
         FramePtrSpillFI = FI;
-      if (STI.isTargetIOS()) {
-        AFI->addGPRCalleeSavedArea2Frame(FI);
+      if (STI.isTargetIOS())
         GPRCS2Size += 4;
-      } else {
-        AFI->addGPRCalleeSavedArea1Frame(FI);
+      else
         GPRCS1Size += 4;
-      }
       break;
     default:
       // This is a DPR. Exclude the aligned DPRCS2 spills.
       if (Reg == ARM::D8)
         D8SpillFI = FI;
-      if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) {
-        AFI->addDPRCalleeSavedAreaFrame(FI);
+      if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())
         DPRCSSize += 8;
-      }
     }
   }
 
   // Move past area 1.
-  if (GPRCS1Size > 0) MBBI++;
-
-  // Set FP to point to the stack slot that contains the previous FP.
-  // For iOS, FP is R7, which has now been stored in spill area 1.
-  // Otherwise, if this is not iOS, all the callee-saved registers go
-  // into spill area 1, including the FP in R11.  In either case, it is
-  // now safe to emit this assignment.
-  bool HasFP = hasFP(MF);
-  if (HasFP) {
-    unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
-    MachineInstrBuilder MIB =
-      BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr)
-      .addFrameIndex(FramePtrSpillFI).addImm(0)
-      .setMIFlag(MachineInstr::FrameSetup);
-    AddDefaultCC(AddDefaultPred(MIB));
-  }
-
-  // Move past area 2.
-  if (GPRCS2Size > 0) MBBI++;
+  MachineBasicBlock::iterator LastPush = MBB.end(), FramePtrPush;
+  if (GPRCS1Size > 0)
+    FramePtrPush = LastPush = MBBI++;
 
   // Determine starting offsets of spill areas.
+  bool HasFP = hasFP(MF);
   unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
   unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
   unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
-  if (HasFP)
+  int FramePtrOffsetInPush = 0;
+  if (HasFP) {
+    FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
     AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
                                 NumBytes);
+  }
   AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
   AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
   AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
 
+  // Move past area 2.
+  if (GPRCS2Size > 0) {
+    LastPush = MBBI++;
+  }
+
   // Move past area 3.
   if (DPRCSSize > 0) {
-    MBBI++;
+    LastPush = MBBI++;
     // Since vpush register list cannot have gaps, there may be multiple vpush
     // instructions in the prologue.
     while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
-      MBBI++;
+      LastPush = MBBI++;
   }
 
   // Move past the aligned DPRCS2 area.
@@ -264,8 +256,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
 
   if (NumBytes) {
     // Adjust SP after all the callee-save spills.
-    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
-                 MachineInstr::FrameSetup);
+    if (tryFoldSPUpdateIntoPushPop(MF, LastPush, NumBytes)) {
+      if (LastPush == FramePtrPush)
+        FramePtrOffsetInPush += NumBytes;
+    } else
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
+                   MachineInstr::FrameSetup);
+
     if (HasFP && isARM)
       // Restore from fp only in ARM mode: e.g. sub sp, r7, #24
       // Note it's not safe to do this in Thumb2 mode because it would have
@@ -278,6 +275,18 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
       AFI->setShouldRestoreSPFromFP(true);
   }
 
+  // Set FP to point to the stack slot that contains the previous FP.
+  // For iOS, FP is R7, which has now been stored in spill area 1.
+  // Otherwise, if this is not iOS, all the callee-saved registers go
+  // into spill area 1, including the FP in R11.  In either case, it
+  // is in area one and the adjustment needs to take place just after
+  // that push.
+  if (HasFP)
+    emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, ++FramePtrPush, dl, TII,
+                         FramePtr, ARM::SP, FramePtrOffsetInPush,
+                         MachineInstr::FrameSetup);
+
+
   if (STI.isTargetELF() && hasFP(MF))
     MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
                              AFI->getFramePtrSpillOffset());
@@ -373,11 +382,11 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
       emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
   } else {
     // Unwind MBBI to point to first LDR / VLDRD.
-    const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+    const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
     if (MBBI != MBB.begin()) {
-      do
+      do {
         --MBBI;
-      while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
+      } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
       if (!isCSRestore(MBBI, TII, CSRegs))
         ++MBBI;
     }
@@ -421,8 +430,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
                                  ARM::SP)
             .addReg(FramePtr));
       }
-    } else if (NumBytes)
-      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+    } else if (NumBytes && !tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes))
+        emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
 
     // Increment past our save areas.
     if (AFI->getDPRCalleeSavedAreaSize()) {
@@ -501,12 +510,6 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
 
   FrameReg = ARM::SP;
   Offset += SPAdj;
-  if (AFI->isGPRCalleeSavedArea1Frame(FI))
-    return Offset - AFI->getGPRCalleeSavedArea1Offset();
-  else if (AFI->isGPRCalleeSavedArea2Frame(FI))
-    return Offset - AFI->getGPRCalleeSavedArea2Offset();
-  else if (AFI->isDPRCalleeSavedAreaFrame(FI))
-    return Offset - AFI->getDPRCalleeSavedAreaOffset();
 
   // SP can move around if there are allocas.  We may also lose track of SP
   // when emergency spilling inside a non-reserved call frame setup.
@@ -658,6 +661,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
   unsigned RetOpcode = MI->getOpcode();
   bool isTailCall = (RetOpcode == ARM::TCRETURNdi ||
                      RetOpcode == ARM::TCRETURNri);
+  bool isInterrupt =
+      RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
 
   SmallVector<unsigned, 4> Regs;
   unsigned i = CSI.size();
@@ -672,7 +677,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
       if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
         continue;
 
-      if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps()) {
+      if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
+          STI.hasV5TOps()) {
         Reg = ARM::PC;
         LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
         // Fold the return instruction into the LDM.
@@ -1199,7 +1205,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
 
   // Don't spill FP if the frame can be eliminated. This is determined
   // by scanning the callee-save registers to see if any is used.
-  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
   for (unsigned i = 0; CSRegs[i]; ++i) {
     unsigned Reg = CSRegs[i];
     bool Spilled = false;
@@ -1226,6 +1232,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
       case ARM::LR:
         LRSpilled = true;
         // Fallthrough
+      case ARM::R0: case ARM::R1:
+      case ARM::R2: case ARM::R3:
       case ARM::R4: case ARM::R5:
       case ARM::R6: case ARM::R7:
         CS1Spilled = true;
@@ -1240,6 +1248,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
       }
 
       switch (Reg) {
+      case ARM::R0: case ARM::R1:
+      case ARM::R2: case ARM::R3:
       case ARM::R4: case ARM::R5:
       case ARM::R6: case ARM::R7:
       case ARM::LR:
@@ -1295,8 +1305,12 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     if (!LRSpilled && CS1Spilled) {
       MRI.setPhysRegUsed(ARM::LR);
       NumGPRSpills++;
-      UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
-                                    UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
+      SmallVectorImpl<unsigned>::iterator LRPos;
+      LRPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(),
+                        (unsigned)ARM::LR);
+      if (LRPos != UnspilledCS1GPRs.end())
+        UnspilledCS1GPRs.erase(LRPos);
+
       ForceLRSpill = false;
       ExtraCSSpill = true;
     }
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 4ca3af6..87d1522 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -130,6 +130,13 @@ public:
     return true;
   }
 
+  bool SelectCMOVPred(SDValue N, SDValue &Pred, SDValue &Reg) {
+    const ConstantSDNode *CN = cast<ConstantSDNode>(N);
+    Pred = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+    Reg = CurDAG->getRegister(ARM::CPSR, MVT::i32);
+    return true;
+  }
+
   bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
                              SDValue &Offset, SDValue &Opc);
   bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
@@ -239,21 +246,6 @@ private:
   /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
   SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
 
-  /// SelectCMOVOp - Select CMOV instructions for ARM.
-  SDNode *SelectCMOVOp(SDNode *N);
-  SDNode *SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                              ARMCC::CondCodes CCVal, SDValue CCR,
-                              SDValue InFlag);
-  SDNode *SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                               ARMCC::CondCodes CCVal, SDValue CCR,
-                               SDValue InFlag);
-  SDNode *SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                              ARMCC::CondCodes CCVal, SDValue CCR,
-                              SDValue InFlag);
-  SDNode *SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                               ARMCC::CondCodes CCVal, SDValue CCR,
-                               SDValue InFlag);
-
   // Select special operations if node forms integer ABS pattern
   SDNode *SelectABSOp(SDNode *N);
 
@@ -261,7 +253,7 @@ private:
 
   SDNode *SelectConcatVector(SDNode *N);
 
-  SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+  SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32, unsigned Op64);
 
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
@@ -2321,204 +2313,6 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
   return NULL;
 }
 
-SDNode *ARMDAGToDAGISel::
-SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                    ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
-  SDValue CPTmp0;
-  SDValue CPTmp1;
-  if (SelectT2ShifterOperandReg(TrueVal, CPTmp0, CPTmp1)) {
-    unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue();
-    unsigned SOShOp = ARM_AM::getSORegShOp(SOVal);
-    unsigned Opc = 0;
-    switch (SOShOp) {
-    case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break;
-    case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break;
-    case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break;
-    case ARM_AM::ror: Opc = ARM::t2MOVCCror; break;
-    default:
-      llvm_unreachable("Unknown so_reg opcode!");
-    }
-    SDValue SOShImm =
-      CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32);
-    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
-    SDValue Ops[] = { FalseVal, CPTmp0, SOShImm, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, Opc, MVT::i32,Ops, 6);
-  }
-  return 0;
-}
-
-SDNode *ARMDAGToDAGISel::
-SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                     ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
-  SDValue CPTmp0;
-  SDValue CPTmp1;
-  SDValue CPTmp2;
-  if (SelectImmShifterOperand(TrueVal, CPTmp0, CPTmp2)) {
-    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
-    SDValue Ops[] = { FalseVal, CPTmp0, CPTmp2, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, ARM::MOVCCsi, MVT::i32, Ops, 6);
-  }
-
-  if (SelectRegShifterOperand(TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
-    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
-    SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, ARM::MOVCCsr, MVT::i32, Ops, 7);
-  }
-  return 0;
-}
-
-SDNode *ARMDAGToDAGISel::
-SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                  ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
-  ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
-  if (!T)
-    return 0;
-
-  unsigned Opc = 0;
-  unsigned TrueImm = T->getZExtValue();
-  if (is_t2_so_imm(TrueImm)) {
-    Opc = ARM::t2MOVCCi;
-  } else if (TrueImm <= 0xffff) {
-    Opc = ARM::t2MOVCCi16;
-  } else if (is_t2_so_imm_not(TrueImm)) {
-    TrueImm = ~TrueImm;
-    Opc = ARM::t2MVNCCi;
-  } else if (TrueVal.getNode()->hasOneUse() && Subtarget->hasV6T2Ops()) {
-    // Large immediate.
-    Opc = ARM::t2MOVCCi32imm;
-  }
-
-  if (Opc) {
-    SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
-    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
-    SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
-  }
-
-  return 0;
-}
-
-SDNode *ARMDAGToDAGISel::
-SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                   ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
-  ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
-  if (!T)
-    return 0;
-
-  unsigned Opc = 0;
-  unsigned TrueImm = T->getZExtValue();
-  bool isSoImm = is_so_imm(TrueImm);
-  if (isSoImm) {
-    Opc = ARM::MOVCCi;
-  } else if (Subtarget->hasV6T2Ops() && TrueImm <= 0xffff) {
-    Opc = ARM::MOVCCi16;
-  } else if (is_so_imm_not(TrueImm)) {
-    TrueImm = ~TrueImm;
-    Opc = ARM::MVNCCi;
-  } else if (TrueVal.getNode()->hasOneUse() &&
-             (Subtarget->hasV6T2Ops() || ARM_AM::isSOImmTwoPartVal(TrueImm))) {
-    // Large immediate.
-    Opc = ARM::MOVCCi32imm;
-  }
-
-  if (Opc) {
-    SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
-    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
-    SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
-  }
-
-  return 0;
-}
-
-SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) {
-  EVT VT = N->getValueType(0);
-  SDValue FalseVal = N->getOperand(0);
-  SDValue TrueVal  = N->getOperand(1);
-  SDValue CC = N->getOperand(2);
-  SDValue CCR = N->getOperand(3);
-  SDValue InFlag = N->getOperand(4);
-  assert(CC.getOpcode() == ISD::Constant);
-  assert(CCR.getOpcode() == ISD::Register);
-  ARMCC::CondCodes CCVal =
-    (ARMCC::CondCodes)cast<ConstantSDNode>(CC)->getZExtValue();
-
-  if (!Subtarget->isThumb1Only() && VT == MVT::i32) {
-    // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
-    // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
-    // Pattern complexity = 18  cost = 1  size = 0
-    if (Subtarget->isThumb()) {
-      SDNode *Res = SelectT2CMOVShiftOp(N, FalseVal, TrueVal,
-                                        CCVal, CCR, InFlag);
-      if (!Res)
-        Res = SelectT2CMOVShiftOp(N, TrueVal, FalseVal,
-                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
-      if (Res)
-        return Res;
-    } else {
-      SDNode *Res = SelectARMCMOVShiftOp(N, FalseVal, TrueVal,
-                                         CCVal, CCR, InFlag);
-      if (!Res)
-        Res = SelectARMCMOVShiftOp(N, TrueVal, FalseVal,
-                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
-      if (Res)
-        return Res;
-    }
-
-    // Pattern: (ARMcmov:i32 GPR:i32:$false,
-    //             (imm:i32)<<P:Pred_so_imm>>:$true,
-    //             (imm:i32):$cc)
-    // Emits: (MOVCCi:i32 GPR:i32:$false,
-    //           (so_imm:i32 (imm:i32):$true), (imm:i32):$cc)
-    // Pattern complexity = 10  cost = 1  size = 0
-    if (Subtarget->isThumb()) {
-      SDNode *Res = SelectT2CMOVImmOp(N, FalseVal, TrueVal,
-                                        CCVal, CCR, InFlag);
-      if (!Res)
-        Res = SelectT2CMOVImmOp(N, TrueVal, FalseVal,
-                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
-      if (Res)
-        return Res;
-    } else {
-      SDNode *Res = SelectARMCMOVImmOp(N, FalseVal, TrueVal,
-                                         CCVal, CCR, InFlag);
-      if (!Res)
-        Res = SelectARMCMOVImmOp(N, TrueVal, FalseVal,
-                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
-      if (Res)
-        return Res;
-    }
-  }
-
-  // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
-  // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
-  // Pattern complexity = 6  cost = 1  size = 0
-  //
-  // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
-  // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
-  // Pattern complexity = 6  cost = 11  size = 0
-  //
-  // Also VMOVScc and VMOVDcc.
-  SDValue Tmp2 = CurDAG->getTargetConstant(CCVal, MVT::i32);
-  SDValue Ops[] = { FalseVal, TrueVal, Tmp2, CCR, InFlag };
-  unsigned Opc = 0;
-  switch (VT.getSimpleVT().SimpleTy) {
-  default: llvm_unreachable("Illegal conditional move type!");
-  case MVT::i32:
-    Opc = Subtarget->isThumb()
-      ? (Subtarget->hasThumb2() ? ARM::t2MOVCCr : ARM::tMOVCCr_pseudo)
-      : ARM::MOVCCr;
-    break;
-  case MVT::f32:
-    Opc = ARM::VMOVScc;
-    break;
-  case MVT::f64:
-    Opc = ARM::VMOVDcc;
-    break;
-  }
-  return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5);
-}
-
 /// Target-specific DAG combining for ISD::XOR.
 /// Target-independent combining lowers SELECT_CC nodes of the form
 /// select_cc setg[ge] X,  0,  X, -X
@@ -2567,30 +2361,45 @@ SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
   return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1));
 }
 
-SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+SDNode *ARMDAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
+                                      unsigned Op16,unsigned Op32,
+                                      unsigned Op64) {
+  // Mostly direct translation to the given operations, except that we preserve
+  // the AtomicOrdering for use later on.
+  AtomicSDNode *AN = cast<AtomicSDNode>(Node);
+  EVT VT = AN->getMemoryVT();
+
+  unsigned Op;
+  SDVTList VTs = CurDAG->getVTList(AN->getValueType(0), MVT::Other);
+  if (VT == MVT::i8)
+    Op = Op8;
+  else if (VT == MVT::i16)
+    Op = Op16;
+  else if (VT == MVT::i32)
+    Op = Op32;
+  else if (VT == MVT::i64) {
+    Op = Op64;
+    VTs = CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other);
+  } else
+    llvm_unreachable("Unexpected atomic operation");
+
   SmallVector<SDValue, 6> Ops;
-  Ops.push_back(Node->getOperand(1)); // Ptr
-  Ops.push_back(Node->getOperand(2)); // Low part of Val1
-  Ops.push_back(Node->getOperand(3)); // High part of Val1
-  if (Opc == ARM::ATOMCMPXCHG6432) {
-    Ops.push_back(Node->getOperand(4)); // Low part of Val2
-    Ops.push_back(Node->getOperand(5)); // High part of Val2
-  }
-  Ops.push_back(Node->getOperand(0)); // Chain
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
-  SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node),
-                                           MVT::i32, MVT::i32, MVT::Other,
-                                           Ops);
-  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
-  return ResNode;
+  for (unsigned i = 1; i < AN->getNumOperands(); ++i)
+      Ops.push_back(AN->getOperand(i));
+
+  Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32));
+  Ops.push_back(AN->getOperand(0)); // Chain moves to the end
+
+  return CurDAG->SelectNodeTo(Node, Op, VTs, &Ops[0], Ops.size());
 }
 
 SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   SDLoc dl(N);
 
-  if (N->isMachineOpcode())
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL;   // Already selected.
+  }
 
   switch (N->getOpcode()) {
   default: break;
@@ -2882,8 +2691,6 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                 SDValue(Chain.getNode(), Chain.getResNo()));
     return NULL;
   }
-  case ARMISD::CMOV:
-    return SelectCMOVOp(N);
   case ARMISD::VZIP: {
     unsigned Opc = 0;
     EVT VT = N->getValueType(0);
@@ -3457,31 +3264,90 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   case ISD::CONCAT_VECTORS:
     return SelectConcatVector(N);
 
-  case ARMISD::ATOMOR64_DAG:
-    return SelectAtomic64(N, ARM::ATOMOR6432);
-  case ARMISD::ATOMXOR64_DAG:
-    return SelectAtomic64(N, ARM::ATOMXOR6432);
-  case ARMISD::ATOMADD64_DAG:
-    return SelectAtomic64(N, ARM::ATOMADD6432);
-  case ARMISD::ATOMSUB64_DAG:
-    return SelectAtomic64(N, ARM::ATOMSUB6432);
-  case ARMISD::ATOMNAND64_DAG:
-    return SelectAtomic64(N, ARM::ATOMNAND6432);
-  case ARMISD::ATOMAND64_DAG:
-    return SelectAtomic64(N, ARM::ATOMAND6432);
-  case ARMISD::ATOMSWAP64_DAG:
-    return SelectAtomic64(N, ARM::ATOMSWAP6432);
-  case ARMISD::ATOMCMPXCHG64_DAG:
-    return SelectAtomic64(N, ARM::ATOMCMPXCHG6432);
-
-  case ARMISD::ATOMMIN64_DAG:
-    return SelectAtomic64(N, ARM::ATOMMIN6432);
-  case ARMISD::ATOMUMIN64_DAG:
-    return SelectAtomic64(N, ARM::ATOMUMIN6432);
-  case ARMISD::ATOMMAX64_DAG:
-    return SelectAtomic64(N, ARM::ATOMMAX6432);
-  case ARMISD::ATOMUMAX64_DAG:
-    return SelectAtomic64(N, ARM::ATOMUMAX6432);
+  case ISD::ATOMIC_LOAD:
+    if (cast<AtomicSDNode>(N)->getMemoryVT() == MVT::i64)
+      return SelectAtomic(N, 0, 0, 0, ARM::ATOMIC_LOAD_I64);
+    else
+      break;
+
+  case ISD::ATOMIC_STORE:
+    if (cast<AtomicSDNode>(N)->getMemoryVT() == MVT::i64)
+      return SelectAtomic(N, 0, 0, 0, ARM::ATOMIC_STORE_I64);
+    else
+      break;
+
+  case ISD::ATOMIC_LOAD_ADD:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_ADD_I8,
+                        ARM::ATOMIC_LOAD_ADD_I16,
+                        ARM::ATOMIC_LOAD_ADD_I32,
+                        ARM::ATOMIC_LOAD_ADD_I64);
+  case ISD::ATOMIC_LOAD_SUB:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_SUB_I8,
+                        ARM::ATOMIC_LOAD_SUB_I16,
+                        ARM::ATOMIC_LOAD_SUB_I32,
+                        ARM::ATOMIC_LOAD_SUB_I64);
+  case ISD::ATOMIC_LOAD_AND:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_AND_I8,
+                        ARM::ATOMIC_LOAD_AND_I16,
+                        ARM::ATOMIC_LOAD_AND_I32,
+                        ARM::ATOMIC_LOAD_AND_I64);
+  case ISD::ATOMIC_LOAD_OR:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_OR_I8,
+                        ARM::ATOMIC_LOAD_OR_I16,
+                        ARM::ATOMIC_LOAD_OR_I32,
+                        ARM::ATOMIC_LOAD_OR_I64);
+  case ISD::ATOMIC_LOAD_XOR:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_XOR_I8,
+                        ARM::ATOMIC_LOAD_XOR_I16,
+                        ARM::ATOMIC_LOAD_XOR_I32,
+                        ARM::ATOMIC_LOAD_XOR_I64);
+  case ISD::ATOMIC_LOAD_NAND:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_NAND_I8,
+                        ARM::ATOMIC_LOAD_NAND_I16,
+                        ARM::ATOMIC_LOAD_NAND_I32,
+                        ARM::ATOMIC_LOAD_NAND_I64);
+  case ISD::ATOMIC_LOAD_MIN:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_MIN_I8,
+                        ARM::ATOMIC_LOAD_MIN_I16,
+                        ARM::ATOMIC_LOAD_MIN_I32,
+                        ARM::ATOMIC_LOAD_MIN_I64);
+  case ISD::ATOMIC_LOAD_MAX:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_MAX_I8,
+                        ARM::ATOMIC_LOAD_MAX_I16,
+                        ARM::ATOMIC_LOAD_MAX_I32,
+                        ARM::ATOMIC_LOAD_MAX_I64);
+  case ISD::ATOMIC_LOAD_UMIN:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_UMIN_I8,
+                        ARM::ATOMIC_LOAD_UMIN_I16,
+                        ARM::ATOMIC_LOAD_UMIN_I32,
+                        ARM::ATOMIC_LOAD_UMIN_I64);
+  case ISD::ATOMIC_LOAD_UMAX:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_UMAX_I8,
+                        ARM::ATOMIC_LOAD_UMAX_I16,
+                        ARM::ATOMIC_LOAD_UMAX_I32,
+                        ARM::ATOMIC_LOAD_UMAX_I64);
+  case ISD::ATOMIC_SWAP:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_SWAP_I8,
+                        ARM::ATOMIC_SWAP_I16,
+                        ARM::ATOMIC_SWAP_I32,
+                        ARM::ATOMIC_SWAP_I64);
+  case ISD::ATOMIC_CMP_SWAP:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_CMP_SWAP_I8,
+                        ARM::ATOMIC_CMP_SWAP_I16,
+                        ARM::ATOMIC_CMP_SWAP_I32,
+                        ARM::ATOMIC_CMP_SWAP_I64);
   }
 
   return SelectCode(N);
@@ -3614,7 +3480,10 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
     if(PairedReg.getNode()) {
       OpChanged[OpChanged.size() -1 ] = true;
       Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
-      Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
+      if (IsTiedToChangedOp)
+        Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx);
+      else
+        Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
       // Replace the current flag.
       AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
           Flag, MVT::i32);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index caec11e..76a0a83 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
+#include <utility>
 using namespace llvm;
 
 STATISTIC(NumTailCalls, "Number of tail calls");
@@ -174,9 +175,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
 
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
-  if (Subtarget->isTargetDarwin()) {
+  if (Subtarget->isTargetIOS()) {
     // Uses VFP for Thumb libfuncs if available.
-    if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
+    if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
+        Subtarget->hasARMOps()) {
       // Single-precision floating-point arithmetic.
       setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
       setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
@@ -421,7 +423,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   }
 
   // Use divmod compiler-rt calls for iOS 5.0 and later.
-  if (Subtarget->getTargetTriple().getOS() == Triple::IOS &&
+  if (Subtarget->getTargetTriple().isiOS() &&
       !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
@@ -452,6 +454,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   }
 
   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 
   if (Subtarget->hasNEON()) {
     addDRTypeForNEON(MVT::v2f32);
@@ -564,16 +567,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
 
-    // Custom expand long extensions to vectors.
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32,  Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32,  Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64,  Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64,  Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
-
     // NEON does not have single instruction CTPOP for vectors with element
     // types wider than 8-bits.  However, custom lowering can leverage the
     // v8i8/v16i8 vcnt instruction.
@@ -750,12 +743,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
   // the default expansion.
-  // FIXME: This should be checking for v6k, not just v6.
-  if (Subtarget->hasDataBarrier() ||
-      (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
-    // membarrier needs custom lowering; the rest are legal and handled
-    // normally.
-    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+  if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
+    // ATOMIC_FENCE needs custom lowering; the other 32-bit ones are legal and
+    // handled normally.
+    setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
     // Custom lowering for 64-bit ops
     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Custom);
@@ -768,11 +759,20 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
-    // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
-    setInsertFencesForAtomic(true);
+    // On v8, we have particularly efficient implementations of atomic fences
+    // if they can be combined with nearby atomic loads and stores.
+    if (!Subtarget->hasV8Ops()) {
+      // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
+      setInsertFencesForAtomic(true);
+    }
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
   } else {
+    // If there's anything we can use as a barrier, go through custom lowering
+    // for ATOMIC_FENCE.
+    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
+                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);
+
     // Set them all for expansion, which will force libcalls.
-    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other, Expand);
     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
@@ -869,6 +869,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
       setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
     }
   }
+      
+  // Combine sin / cos into one node or libcall if possible.
+  if (Subtarget->hasSinCos()) {
+    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+    setLibcallName(RTLIB::SINCOS_F64, "sincos");
+    if (Subtarget->getTargetTriple().getOS() == Triple::IOS) {
+      // For iOS, we don't want to the normal expansion of a libcall to
+      // sincos. We want to issue a libcall to __sincos_stret.
+      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+    }
+  }
 
   // We have target-specific dag combine patterns for the following nodes:
   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
@@ -908,6 +920,44 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
 }
 
+static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
+                                  bool isThumb2, unsigned &LdrOpc,
+                                  unsigned &StrOpc) {
+  static const unsigned LoadBares[4][2] =  {{ARM::LDREXB, ARM::t2LDREXB},
+                                            {ARM::LDREXH, ARM::t2LDREXH},
+                                            {ARM::LDREX,  ARM::t2LDREX},
+                                            {ARM::LDREXD, ARM::t2LDREXD}};
+  static const unsigned LoadAcqs[4][2] =   {{ARM::LDAEXB, ARM::t2LDAEXB},
+                                            {ARM::LDAEXH, ARM::t2LDAEXH},
+                                            {ARM::LDAEX,  ARM::t2LDAEX},
+                                            {ARM::LDAEXD, ARM::t2LDAEXD}};
+  static const unsigned StoreBares[4][2] = {{ARM::STREXB, ARM::t2STREXB},
+                                            {ARM::STREXH, ARM::t2STREXH},
+                                            {ARM::STREX,  ARM::t2STREX},
+                                            {ARM::STREXD, ARM::t2STREXD}};
+  static const unsigned StoreRels[4][2] =  {{ARM::STLEXB, ARM::t2STLEXB},
+                                            {ARM::STLEXH, ARM::t2STLEXH},
+                                            {ARM::STLEX,  ARM::t2STLEX},
+                                            {ARM::STLEXD, ARM::t2STLEXD}};
+
+  const unsigned (*LoadOps)[2], (*StoreOps)[2];
+  if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+    LoadOps = LoadAcqs;
+  else
+    LoadOps = LoadBares;
+
+  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+    StoreOps = StoreRels;
+  else
+    StoreOps = StoreBares;
+
+  assert(isPowerOf2_32(Size) && Size <= 8 &&
+         "unsupported size for atomic binary op!");
+
+  LdrOpc = LoadOps[Log2_32(Size)][isThumb2];
+  StrOpc = StoreOps[Log2_32(Size)][isThumb2];
+}
+
 // FIXME: It might make sense to define the representative register class as the
 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
@@ -970,6 +1020,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
+  case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
   case ARMISD::CMP:           return "ARMISD::CMP";
   case ARMISD::CMN:           return "ARMISD::CMN";
@@ -1009,7 +1060,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
 
   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
 
-  case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
 
   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
@@ -1068,6 +1118,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
   case ARMISD::FMAX:          return "ARMISD::FMAX";
   case ARMISD::FMIN:          return "ARMISD::FMIN";
+  case ARMISD::VMAXNM:        return "ARMISD::VMAX";
+  case ARMISD::VMINNM:        return "ARMISD::VMIN";
   case ARMISD::BFI:           return "ARMISD::BFI";
   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
@@ -1092,19 +1144,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
-
-  case ARMISD::ATOMADD64_DAG:     return "ATOMADD64_DAG";
-  case ARMISD::ATOMSUB64_DAG:     return "ATOMSUB64_DAG";
-  case ARMISD::ATOMOR64_DAG:      return "ATOMOR64_DAG";
-  case ARMISD::ATOMXOR64_DAG:     return "ATOMXOR64_DAG";
-  case ARMISD::ATOMAND64_DAG:     return "ATOMAND64_DAG";
-  case ARMISD::ATOMNAND64_DAG:    return "ATOMNAND64_DAG";
-  case ARMISD::ATOMSWAP64_DAG:    return "ATOMSWAP64_DAG";
-  case ARMISD::ATOMCMPXCHG64_DAG: return "ATOMCMPXCHG64_DAG";
-  case ARMISD::ATOMMIN64_DAG:     return "ATOMMIN64_DAG";
-  case ARMISD::ATOMUMIN64_DAG:    return "ATOMUMIN64_DAG";
-  case ARMISD::ATOMMAX64_DAG:     return "ATOMMAX64_DAG";
-  case ARMISD::ATOMUMAX64_DAG:    return "ATOMUMAX64_DAG";
   }
 }
 
@@ -1536,7 +1575,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                      MachinePointerInfo(),
-                                     false, false, false, 0);
+                                     false, false, false,
+                                     DAG.InferPtrAlignment(AddArg));
           MemOpChains.push_back(Load.getValue(1));
           RegsToPass.push_back(std::make_pair(j, Load));
         }
@@ -1745,24 +1785,26 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                   RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const uint32_t *Mask;
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
-  if (isThisReturn) {
-    // For 'this' returns, use the R0-preserving mask if applicable
-    Mask = ARI->getThisReturnPreservedMask(CallConv);
-    if (!Mask) {
-      // Set isThisReturn to false if the calling convention is not one that
-      // allows 'returned' to be modeled in this way, so LowerCallResult does
-      // not try to pass 'this' straight through 
-      isThisReturn = false;
+  if (!isTailCall) {
+    const uint32_t *Mask;
+    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
+    if (isThisReturn) {
+      // For 'this' returns, use the R0-preserving mask if applicable
+      Mask = ARI->getThisReturnPreservedMask(CallConv);
+      if (!Mask) {
+        // Set isThisReturn to false if the calling convention is not one that
+        // allows 'returned' to be modeled in this way, so LowerCallResult does
+        // not try to pass 'this' straight through
+        isThisReturn = false;
+        Mask = ARI->getCallPreservedMask(CallConv);
+      }
+    } else
       Mask = ARI->getCallPreservedMask(CallConv);
-    }
-  } else
-    Mask = ARI->getCallPreservedMask(CallConv);
 
-  assert(Mask && "Missing call preserved mask for calling convention");
-  Ops.push_back(DAG.getRegisterMask(Mask));
+    assert(Mask && "Missing call preserved mask for calling convention");
+    Ops.push_back(DAG.getRegisterMask(Mask));
+  }
 
   if (InFlag.getNode())
     Ops.push_back(InFlag);
@@ -1933,6 +1975,12 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   if (isVarArg && !Outs.empty())
     return false;
 
+  // Exception-handling functions need a special set of instructions to indicate
+  // a return to the hardware. Tail-calling another function would probably
+  // break this.
+  if (CallerF->hasFnAttribute("interrupt"))
+    return false;
+
   // Also avoid sibcall optimization if either caller or callee uses struct
   // return semantics.
   if (isCalleeStructRet || isCallerStructRet)
@@ -2061,6 +2109,39 @@ ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                                     isVarArg));
 }
 
+static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+                                    SDLoc DL, SelectionDAG &DAG) {
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
+
+  StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString();
+
+  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
+  // version of the "preferred return address". These offsets affect the return
+  // instruction if this is a return from PL1 without hypervisor extensions.
+  //    IRQ/FIQ: +4     "subs pc, lr, #4"
+  //    SWI:     0      "subs pc, lr, #0"
+  //    ABORT:   +4     "subs pc, lr, #4"
+  //    UNDEF:   +4/+2  "subs pc, lr, #0"
+  // UNDEF varies depending on where the exception came from ARM or Thumb
+  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
+
+  int64_t LROffset;
+  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
+      IntKind == "ABORT")
+    LROffset = 4;
+  else if (IntKind == "SWI" || IntKind == "UNDEF")
+    LROffset = 0;
+  else
+    report_fatal_error("Unsupported interrupt attribute. If present, value "
+                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
+
+  RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, MVT::i32, false));
+
+  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other,
+                     RetOps.data(), RetOps.size());
+}
+
 SDValue
 ARMTargetLowering::LowerReturn(SDValue Chain,
                                CallingConv::ID CallConv, bool isVarArg,
@@ -2146,6 +2227,19 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
+  // CPUs which aren't M-class use a special sequence to return from
+  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
+  // though we use "subs pc, lr, #N").
+  //
+  // M-class CPUs actually use a normal return sequence with a special
+  // (hardware-provided) value in LR, so the normal code path works.
+  if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
+      !Subtarget->isMClass()) {
+    if (Subtarget->isThumb1Only())
+      report_fatal_error("interrupt attribute is not supported in Thumb1");
+    return LowerInterruptReturn(RetOps, dl, DAG);
+  }
+
   return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other,
                      RetOps.data(), RetOps.size());
 }
@@ -2202,7 +2296,8 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   bool HasRet = false;
   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
        UI != UE; ++UI) {
-    if (UI->getOpcode() != ARMISD::RET_FLAG)
+    if (UI->getOpcode() != ARMISD::RET_FLAG &&
+        UI->getOpcode() != ARMISD::INTRET_FLAG)
       return false;
     HasRet = true;
   }
@@ -2589,7 +2684,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
     // here.
     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
-           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
+           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                        DAG.getConstant(0, MVT::i32));
   }
@@ -2597,14 +2692,18 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
   ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
   AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
   unsigned Domain = ARM_MB::ISH;
-  if (Subtarget->isSwift() && Ord == Release) {
+  if (Subtarget->isMClass()) {
+    // Only a full system barrier exists in the M-class architectures.
+    Domain = ARM_MB::SY;
+  } else if (Subtarget->isSwift() && Ord == Release) {
     // Swift happens to implement ISHST barriers in a way that's compatible with
     // Release semantics but weaker than ISH so we'd be fools not to use
     // it. Beware: other processors probably don't!
     Domain = ARM_MB::ISHST;
   }
 
-  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
+  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(Intrinsic::arm_dmb, MVT::i32),
                      DAG.getConstant(Domain, MVT::i32));
 }
 
@@ -3177,6 +3276,61 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
                          SelectTrue, SelectFalse, ISD::SETNE);
 }
 
+static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) {
+  if (CC == ISD::SETNE)
+    return ISD::SETEQ;
+  return ISD::getSetCCSwappedOperands(CC);
+}
+
+static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                                 bool &swpCmpOps, bool &swpVselOps) {
+  // Start by selecting the GE condition code for opcodes that return true for
+  // 'equality'
+  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
+      CC == ISD::SETULE)
+    CondCode = ARMCC::GE;
+
+  // and GT for opcodes that return false for 'equality'.
+  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
+           CC == ISD::SETULT)
+    CondCode = ARMCC::GT;
+
+  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
+  // to swap the compare operands.
+  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
+      CC == ISD::SETULT)
+    swpCmpOps = true;
+
+  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
+  // If we have an unordered opcode, we need to swap the operands to the VSEL
+  // instruction (effectively negating the condition).
+  //
+  // This also has the effect of swapping which one of 'less' or 'greater'
+  // returns true, so we also swap the compare operands. It also switches
+  // whether we return true for 'equality', so we compensate by picking the
+  // opposite condition code to our original choice.
+  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
+      CC == ISD::SETUGT) {
+    swpCmpOps = !swpCmpOps;
+    swpVselOps = !swpVselOps;
+    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
+  }
+
+  // 'ordered' is 'anything but unordered', so use the VS condition code and
+  // swap the VSEL operands.
+  if (CC == ISD::SETO) {
+    CondCode = ARMCC::VS;
+    swpVselOps = true;
+  }
+
+  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
+  // code and swap the VSEL operands.
+  if (CC == ISD::SETUNE) {
+    CondCode = ARMCC::EQ;
+    swpVselOps = true;
+  }
+}
+
 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
@@ -3187,15 +3341,66 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
   if (LHS.getValueType() == MVT::i32) {
+    // Try to generate VSEL on ARMv8.
+    // The VSEL instruction can't use all the usual ARM condition
+    // codes: it only has two bits to select the condition code, so it's
+    // constrained to use only GE, GT, VS and EQ.
+    //
+    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
+    // swap the operands of the previous compare instruction (effectively
+    // inverting the compare condition, swapping 'less' and 'greater') and
+    // sometimes need to swap the operands to the VSEL (which inverts the
+    // condition in the sense of firing whenever the previous condition didn't)
+    if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                      TrueVal.getValueType() == MVT::f64)) {
+      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
+          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
+        CC = getInverseCCForVSEL(CC);
+        std::swap(TrueVal, FalseVal);
+      }
+    }
+
     SDValue ARMcc;
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
-    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
+                       Cmp);
   }
 
   ARMCC::CondCodes CondCode, CondCode2;
   FPCCToARMCC(CC, CondCode, CondCode2);
 
+  // Try to generate VSEL on ARMv8.
+  if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                    TrueVal.getValueType() == MVT::f64)) {
+    // We can select VMAXNM/VMINNM from a compare followed by a select with the
+    // same operands, as follows:
+    //   c = fcmp [ogt, olt, ugt, ult] a, b
+    //   select c, a, b
+    // We only do this in unsafe-fp-math, because signed zeros and NaNs are
+    // handled differently than the original code sequence.
+    if (getTargetMachine().Options.UnsafeFPMath && LHS == TrueVal &&
+        RHS == FalseVal) {
+      if (CC == ISD::SETOGT || CC == ISD::SETUGT)
+        return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal);
+      if (CC == ISD::SETOLT || CC == ISD::SETULT)
+        return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal);
+    }
+
+    bool swpCmpOps = false;
+    bool swpVselOps = false;
+    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
+
+    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
+        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
+      if (swpCmpOps)
+        std::swap(LHS, RHS);
+      if (swpVselOps)
+        std::swap(TrueVal, FalseVal);
+    }
+  }
+
   SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
@@ -3627,47 +3832,6 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   return FrameAddr;
 }
 
-/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec),
-/// and size(DestVec) > 128-bits.
-/// This is achieved by doing the one extension from the SrcVec, splitting the
-/// result, extending these parts, and then concatenating these into the
-/// destination.
-static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
-  SDValue Op = N->getOperand(0);
-  EVT SrcVT = Op.getValueType();
-  EVT DestVT = N->getValueType(0);
-
-  assert(DestVT.getSizeInBits() > 128 &&
-         "Custom sext/zext expansion needs >128-bit vector.");
-  // If this is a normal length extension, use the default expansion.
-  if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
-      SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
-    return SDValue();
-
-  SDLoc dl(N);
-  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
-  unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
-  unsigned NumElts = SrcVT.getVectorNumElements();
-  LLVMContext &Ctx = *DAG.getContext();
-  SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
-
-  EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
-                               NumElts);
-  EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
-                                 NumElts/2);
-  EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
-                               NumElts/2);
-
-  Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
-  SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
-                        DAG.getIntPtrConstant(0));
-  SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
-                        DAG.getIntPtrConstant(NumElts/2));
-  ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
-  ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
-  return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
-}
-
 /// ExpandBITCAST - If the target supports VFP, this function is called to
 /// expand a bit convert where either the source or destination type is i64 to
 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
@@ -4271,17 +4435,25 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
 
 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                            const ARMSubtarget *ST) const {
-  if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16())
+  if (!ST->hasVFP3())
     return SDValue();
 
+  bool IsDouble = Op.getValueType() == MVT::f64;
   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
-  assert(Op.getValueType() == MVT::f32 &&
-         "ConstantFP custom lowering should only occur for f32.");
 
   // Try splatting with a VMOV.f32...
   APFloat FPVal = CFP->getValueAPF();
-  int ImmVal = ARM_AM::getFP32Imm(FPVal);
+  int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
+
   if (ImmVal != -1) {
+    if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
+      // We have code in place to select a valid ConstantFP already, no need to
+      // do any mangling.
+      return Op;
+    }
+
+    // It's a float and we are trying to use NEON operations where
+    // possible. Lower it to a splat followed by an extract.
     SDLoc DL(Op);
     SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
     SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
@@ -4290,15 +4462,31 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                        DAG.getConstant(0, MVT::i32));
   }
 
-  // If that fails, try a VMOV.i32
+  // The rest of our options are NEON only, make sure that's allowed before
+  // proceeding..
+  if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
+    return SDValue();
+
   EVT VMovVT;
-  unsigned iVal = FPVal.bitcastToAPInt().getZExtValue();
-  SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false,
-                                     VMOVModImm);
+  uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
+
+  // It wouldn't really be worth bothering for doubles except for one very
+  // important value, which does happen to match: 0.0. So make sure we don't do
+  // anything stupid.
+  if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
+    return SDValue();
+
+  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
+  SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
+                                     false, VMOVModImm);
   if (NewVal != SDValue()) {
     SDLoc DL(Op);
     SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
                                       NewVal);
+    if (IsDouble)
+      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
+
+    // It's a float: cast and extract a vector element.
     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                        VecConstant);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
@@ -4306,11 +4494,16 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
   }
 
   // Finally, try a VMVN.i32
-  NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false,
-                             VMVNModImm);
+  NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
+                             false, VMVNModImm);
   if (NewVal != SDValue()) {
     SDLoc DL(Op);
     SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
+
+    if (IsDouble)
+      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
+
+    // It's a float: cast and extract a vector element.
     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                        VecConstant);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
@@ -5769,6 +5962,70 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
                      Op.getOperand(1), Op.getOperand(2));
 }
 
+SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin());
+
+  // For iOS, we want to call an alternative entry point: __sincos_stret,
+  // return values are passed via sret.
+  SDLoc dl(Op);
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Pair of floats / doubles used to pass the result.
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+
+  // Create stack object for sret.
+  const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy);
+  const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy);
+  int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
+  SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy());
+
+  ArgListTy Args;
+  ArgListEntry Entry;
+
+  Entry.Node = SRet;
+  Entry.Ty = RetTy->getPointerTo();
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Entry.isSRet = true;
+  Args.push_back(Entry);
+
+  Entry.Node = Arg;
+  Entry.Ty = ArgTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+
+  const char *LibcallName  = (ArgVT == MVT::f64)
+  ? "__sincos_stret" : "__sincosf_stret";
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+
+  TargetLowering::
+  CallLoweringInfo CLI(DAG.getEntryNode(), Type::getVoidTy(*DAG.getContext()),
+                       false, false, false, false, 0,
+                       CallingConv::C, /*isTaillCall=*/false,
+                       /*doesNotRet=*/false, /*isReturnValueUsed*/false,
+                       Callee, Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet,
+                                MachinePointerInfo(), false, false, false, 0);
+
+  // Address of cos field.
+  SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet,
+                            DAG.getIntPtrConstant(ArgVT.getStoreSize()));
+  SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
+                                MachinePointerInfo(), false, false, false, 0);
+
+  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
+                     LoadSin.getValue(0), LoadCos.getValue(0));
+}
+
 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
   // Monotonic load/store is legal for all targets
   if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
@@ -5781,32 +6038,28 @@ static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
 
 static void
 ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
-                    SelectionDAG &DAG, unsigned NewOp) {
+                    SelectionDAG &DAG) {
   SDLoc dl(Node);
   assert (Node->getValueType(0) == MVT::i64 &&
           "Only know how to expand i64 atomics");
+  AtomicSDNode *AN = cast<AtomicSDNode>(Node);
 
   SmallVector<SDValue, 6> Ops;
   Ops.push_back(Node->getOperand(0)); // Chain
   Ops.push_back(Node->getOperand(1)); // Ptr
-  // Low part of Val1
-  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                            Node->getOperand(2), DAG.getIntPtrConstant(0)));
-  // High part of Val1
-  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                            Node->getOperand(2), DAG.getIntPtrConstant(1)));
-  if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) {
-    // High part of Val1
+  for(unsigned i=2; i<Node->getNumOperands(); i++) {
+    // Low part
     Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                              Node->getOperand(3), DAG.getIntPtrConstant(0)));
-    // High part of Val2
+                              Node->getOperand(i), DAG.getIntPtrConstant(0)));
+    // High part
     Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                              Node->getOperand(3), DAG.getIntPtrConstant(1)));
+                              Node->getOperand(i), DAG.getIntPtrConstant(1)));
   }
   SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
   SDValue Result =
-    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64,
-                            cast<MemSDNode>(Node)->getMemOperand());
+    DAG.getAtomic(Node->getOpcode(), dl, MVT::i64, Tys, Ops.data(), Ops.size(),
+                  cast<MemSDNode>(Node)->getMemOperand(), AN->getOrdering(),
+                  AN->getSynchScope());
   SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
   Results.push_back(Result.getValue(2));
@@ -5904,6 +6157,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
+  case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
   case ISD::SDIVREM:
   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
   }
@@ -5921,10 +6175,6 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::BITCAST:
     Res = ExpandBITCAST(N, DAG);
     break;
-  case ISD::SIGN_EXTEND:
-  case ISD::ZERO_EXTEND:
-    Res = ExpandVectorExtension(N, DAG);
-    break;
   case ISD::SRL:
   case ISD::SRA:
     Res = Expand64BitShift(N, DAG, Subtarget);
@@ -5932,41 +6182,21 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::READCYCLECOUNTER:
     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
     return;
+  case ISD::ATOMIC_STORE:
+  case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_LOAD_ADD:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_AND:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_NAND:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_OR:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_SUB:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_XOR:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG);
-    return;
   case ISD::ATOMIC_SWAP:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG);
-    return;
   case ISD::ATOMIC_CMP_SWAP:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_MIN:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_UMIN:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_MAX:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_UMAX:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG);
+    ReplaceATOMIC_OP_64(N, Results, DAG);
     return;
   }
   if (Res.getNode())
@@ -5986,6 +6216,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   unsigned oldval  = MI->getOperand(2).getReg();
   unsigned newval  = MI->getOperand(3).getReg();
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
 
@@ -6001,21 +6232,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   }
 
   unsigned ldrOpc, strOpc;
-  switch (Size) {
-  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
-  case 1:
-    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
-    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
-    break;
-  case 2:
-    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
-    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
-    break;
-  case 4:
-    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
-    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
-    break;
-  }
+  getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
 
   MachineFunction *MF = BB->getParent();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -6095,6 +6312,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   unsigned dest = MI->getOperand(0).getReg();
   unsigned ptr = MI->getOperand(1).getReg();
   unsigned incr = MI->getOperand(2).getReg();
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
 
@@ -6102,24 +6320,11 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   if (isThumb2) {
     MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
     MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(incr, &ARM::rGPRRegClass);
   }
 
   unsigned ldrOpc, strOpc;
-  switch (Size) {
-  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
-  case 1:
-    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
-    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
-    break;
-  case 2:
-    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
-    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
-    break;
-  case 4:
-    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
-    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
-    break;
-  }
+  getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
 
   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -6203,6 +6408,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   unsigned ptr = MI->getOperand(1).getReg();
   unsigned incr = MI->getOperand(2).getReg();
   unsigned oldval = dest;
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
 
@@ -6210,24 +6416,20 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   if (isThumb2) {
     MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
     MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(incr, &ARM::rGPRRegClass);
   }
 
   unsigned ldrOpc, strOpc, extendOpc;
+  getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
   switch (Size) {
-  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
+  default: llvm_unreachable("unsupported size for AtomicBinaryMinMax!");
   case 1:
-    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
-    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
     extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
     break;
   case 2:
-    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
-    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
     extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
     break;
   case 4:
-    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
-    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
     extendOpc = 0;
     break;
   }
@@ -6271,7 +6473,10 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
 
   // Sign extend the value, if necessary.
   if (signExtend && extendOpc) {
-    oldval = MRI.createVirtualRegister(&ARM::GPRRegClass);
+    oldval = MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass
+                                                : &ARM::GPRnopcRegClass);
+    if (!isThumb2)
+      MRI.constrainRegClass(dest, &ARM::GPRnopcRegClass);
     AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
                      .addReg(dest)
                      .addImm(0));
@@ -6309,7 +6514,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
                                       unsigned Op1, unsigned Op2,
                                       bool NeedsCarry, bool IsCmpxchg,
                                       bool IsMinMax, ARMCC::CondCodes CC) const {
-  // This also handles ATOMIC_SWAP, indicated by Op1==0.
+  // This also handles ATOMIC_SWAP and ATOMIC_STORE, indicated by Op1==0.
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -6317,11 +6522,15 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
   MachineFunction::iterator It = BB;
   ++It;
 
+  bool isStore = (MI->getOpcode() == ARM::ATOMIC_STORE_I64);
+  unsigned offset = (isStore ? -2 : 0);
   unsigned destlo = MI->getOperand(0).getReg();
   unsigned desthi = MI->getOperand(1).getReg();
-  unsigned ptr = MI->getOperand(2).getReg();
-  unsigned vallo = MI->getOperand(3).getReg();
-  unsigned valhi = MI->getOperand(4).getReg();
+  unsigned ptr = MI->getOperand(offset+2).getReg();
+  unsigned vallo = MI->getOperand(offset+3).getReg();
+  unsigned valhi = MI->getOperand(offset+4).getReg();
+  unsigned OrdIdx = offset + (IsCmpxchg ? 7 : 5);
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(OrdIdx).getImm());
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
 
@@ -6330,8 +6539,13 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
     MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
     MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
     MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(vallo, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(valhi, &ARM::rGPRRegClass);
   }
 
+  unsigned ldrOpc, strOpc;
+  getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc);
+
   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *contBB = 0, *cont2BB = 0;
   if (IsCmpxchg || IsMinMax)
@@ -6371,21 +6585,23 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
   //   fallthrough --> exitMBB
   BB = loopMBB;
 
-  // Load
-  if (isThumb2) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2LDREXD))
-                   .addReg(destlo, RegState::Define)
-                   .addReg(desthi, RegState::Define)
-                   .addReg(ptr));
-  } else {
-    unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDREXD))
-                   .addReg(GPRPair0, RegState::Define).addReg(ptr));
-    // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
-    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
-      .addReg(GPRPair0, 0, ARM::gsub_0);
-    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
-      .addReg(GPRPair0, 0, ARM::gsub_1);
+  if (!isStore) {
+    // Load
+    if (isThumb2) {
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
+                     .addReg(destlo, RegState::Define)
+                     .addReg(desthi, RegState::Define)
+                     .addReg(ptr));
+    } else {
+      unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
+                     .addReg(GPRPair0, RegState::Define).addReg(ptr));
+      // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
+      BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
+        .addReg(GPRPair0, 0, ARM::gsub_0);
+      BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
+        .addReg(GPRPair0, 0, ARM::gsub_1);
+    }
   }
 
   unsigned StoreLo, StoreHi;
@@ -6437,7 +6653,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
 
   // Store
   if (isThumb2) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2STREXD), storesuccess)
+    MRI.constrainRegClass(StoreLo, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(StoreHi, &ARM::rGPRRegClass);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
                    .addReg(StoreLo).addReg(StoreHi).addReg(ptr));
   } else {
     // Marshal a pair...
@@ -6455,7 +6673,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
       .addImm(ARM::gsub_1);
 
     // ...and store it
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::STREXD), storesuccess)
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
                    .addReg(StorePair).addReg(ptr));
   }
   // Cmp+jump
@@ -6476,6 +6694,51 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
   return BB;
 }
 
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicLoad64(MachineInstr *MI, MachineBasicBlock *BB) const {
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  unsigned destlo = MI->getOperand(0).getReg();
+  unsigned desthi = MI->getOperand(1).getReg();
+  unsigned ptr = MI->getOperand(2).getReg();
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+  }
+  unsigned ldrOpc, strOpc;
+  getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc);
+
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(ldrOpc));
+
+  if (isThumb2) {
+    MIB.addReg(destlo, RegState::Define)
+       .addReg(desthi, RegState::Define)
+       .addReg(ptr);
+
+  } else {
+    unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+    MIB.addReg(GPRPair0, RegState::Define).addReg(ptr);
+
+    // Copy GPRPair0 into dest.  (This copy will normally be coalesced.)
+    BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), destlo)
+      .addReg(GPRPair0, 0, ARM::gsub_0);
+    BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), desthi)
+      .addReg(GPRPair0, 0, ARM::gsub_1);
+  }
+  AddDefaultPred(MIB);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
 /// registers the function context.
 void ARMTargetLowering::
@@ -7007,8 +7270,109 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   llvm_unreachable("Expecting a BB with two successors!");
 }
 
-MachineBasicBlock *ARMTargetLowering::
-EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
+/// Return the load opcode for a given load size. If load size >= 8,
+/// neon opcode will be returned.
+static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
+  if (LdSize >= 8)
+    return LdSize == 16 ? ARM::VLD1q32wb_fixed
+                        : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
+  if (IsThumb1)
+    return LdSize == 4 ? ARM::tLDRi
+                       : LdSize == 2 ? ARM::tLDRHi
+                                     : LdSize == 1 ? ARM::tLDRBi : 0;
+  if (IsThumb2)
+    return LdSize == 4 ? ARM::t2LDR_POST
+                       : LdSize == 2 ? ARM::t2LDRH_POST
+                                     : LdSize == 1 ? ARM::t2LDRB_POST : 0;
+  return LdSize == 4 ? ARM::LDR_POST_IMM
+                     : LdSize == 2 ? ARM::LDRH_POST
+                                   : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
+}
+
+/// Return the store opcode for a given store size. If store size >= 8,
+/// neon opcode will be returned.
+static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
+  if (StSize >= 8)
+    return StSize == 16 ? ARM::VST1q32wb_fixed
+                        : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
+  if (IsThumb1)
+    return StSize == 4 ? ARM::tSTRi
+                       : StSize == 2 ? ARM::tSTRHi
+                                     : StSize == 1 ? ARM::tSTRBi : 0;
+  if (IsThumb2)
+    return StSize == 4 ? ARM::t2STR_POST
+                       : StSize == 2 ? ARM::t2STRH_POST
+                                     : StSize == 1 ? ARM::t2STRB_POST : 0;
+  return StSize == 4 ? ARM::STR_POST_IMM
+                     : StSize == 2 ? ARM::STRH_POST
+                                   : StSize == 1 ? ARM::STRB_POST_IMM : 0;
+}
+
+/// Emit a post-increment load operation with given size. The instructions
+/// will be added to BB at Pos.
+static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
+                       const TargetInstrInfo *TII, DebugLoc dl,
+                       unsigned LdSize, unsigned Data, unsigned AddrIn,
+                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
+  assert(LdOpc != 0 && "Should have a load opcode");
+  if (LdSize >= 8) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addImm(0));
+  } else if (IsThumb1) {
+    // load + update AddrIn
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrIn).addImm(0));
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(AddrIn).addImm(LdSize);
+    AddDefaultPred(MIB);
+  } else if (IsThumb2) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addImm(LdSize));
+  } else { // arm
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addReg(0).addImm(LdSize));
+  }
+}
+
+/// Emit a post-increment store operation with given size. The instructions
+/// will be added to BB at Pos.
+static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
+                       const TargetInstrInfo *TII, DebugLoc dl,
+                       unsigned StSize, unsigned Data, unsigned AddrIn,
+                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
+  assert(StOpc != 0 && "Should have a store opcode");
+  if (StSize >= 8) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(AddrIn).addImm(0).addReg(Data));
+  } else if (IsThumb1) {
+    // store + update AddrIn
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
+                       .addReg(AddrIn).addImm(0));
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(AddrIn).addImm(StSize);
+    AddDefaultPred(MIB);
+  } else if (IsThumb2) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(Data).addReg(AddrIn).addImm(StSize));
+  } else { // arm
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(Data).addReg(AddrIn).addReg(0)
+                       .addImm(StSize));
+  }
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitStructByval(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const {
   // This pseudo instruction has 3 operands: dst, src, size
   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
   // Otherwise, we will generate unrolled scalar copies.
@@ -7023,23 +7387,18 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
   unsigned Align = MI->getOperand(3).getImm();
   DebugLoc dl = MI->getDebugLoc();
 
-  bool isThumb2 = Subtarget->isThumb2();
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
-  unsigned ldrOpc, strOpc, UnitSize = 0;
+  unsigned UnitSize = 0;
+  const TargetRegisterClass *TRC = 0;
+  const TargetRegisterClass *VecTRC = 0;
 
-  const TargetRegisterClass *TRC = isThumb2 ?
-    (const TargetRegisterClass*)&ARM::tGPRRegClass :
-    (const TargetRegisterClass*)&ARM::GPRRegClass;
-  const TargetRegisterClass *TRC_Vec = 0;
+  bool IsThumb1 = Subtarget->isThumb1Only();
+  bool IsThumb2 = Subtarget->isThumb2();
 
   if (Align & 1) {
-    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
     UnitSize = 1;
   } else if (Align & 2) {
-    ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
-    strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
     UnitSize = 2;
   } else {
     // Check whether we can use NEON instructions.
@@ -7047,27 +7406,27 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
           hasAttribute(AttributeSet::FunctionIndex,
                        Attribute::NoImplicitFloat) &&
         Subtarget->hasNEON()) {
-      if ((Align % 16 == 0) && SizeVal >= 16) {
-        ldrOpc = ARM::VLD1q32wb_fixed;
-        strOpc = ARM::VST1q32wb_fixed;
+      if ((Align % 16 == 0) && SizeVal >= 16)
         UnitSize = 16;
-        TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
-      }
-      else if ((Align % 8 == 0) && SizeVal >= 8) {
-        ldrOpc = ARM::VLD1d32wb_fixed;
-        strOpc = ARM::VST1d32wb_fixed;
+      else if ((Align % 8 == 0) && SizeVal >= 8)
         UnitSize = 8;
-        TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
-      }
     }
     // Can't use NEON instructions.
-    if (UnitSize == 0) {
-      ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
-      strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
+    if (UnitSize == 0)
       UnitSize = 4;
-    }
   }
 
+  // Select the correct opcode and register class for unit size load/store
+  bool IsNeon = UnitSize >= 8;
+  TRC = (IsThumb1 || IsThumb2) ? (const TargetRegisterClass *)&ARM::tGPRRegClass
+                               : (const TargetRegisterClass *)&ARM::GPRRegClass;
+  if (IsNeon)
+    VecTRC = UnitSize == 16
+                 ? (const TargetRegisterClass *)&ARM::DPairRegClass
+                 : UnitSize == 8
+                       ? (const TargetRegisterClass *)&ARM::DPRRegClass
+                       : 0;
+
   unsigned BytesLeft = SizeVal % UnitSize;
   unsigned LoopSize = SizeVal - BytesLeft;
 
@@ -7078,34 +7437,13 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
     unsigned srcIn = src;
     unsigned destIn = dest;
     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
-      unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
       unsigned srcOut = MRI.createVirtualRegister(TRC);
       unsigned destOut = MRI.createVirtualRegister(TRC);
-      if (UnitSize >= 8) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(destIn).addImm(0).addReg(scratch));
-      } else if (isThumb2) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addImm(UnitSize));
-      } else {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
-          .addImm(UnitSize));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addReg(0).addImm(UnitSize));
-      }
+      unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
+                 IsThumb1, IsThumb2);
+      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
+                 IsThumb1, IsThumb2);
       srcIn = srcOut;
       destIn = destOut;
     }
@@ -7113,30 +7451,14 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
     // Handle the leftover bytes with LDRB and STRB.
     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
     // [destOut] = STRB_POST(scratch, destIn, 1)
-    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
     for (unsigned i = 0; i < BytesLeft; i++) {
-      unsigned scratch = MRI.createVirtualRegister(TRC);
       unsigned srcOut = MRI.createVirtualRegister(TRC);
       unsigned destOut = MRI.createVirtualRegister(TRC);
-      if (isThumb2) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc),scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addReg(0).addImm(1));
-      } else {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc),scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn)
-          .addReg(0).addImm(1));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addReg(0).addImm(1));
-      }
+      unsigned scratch = MRI.createVirtualRegister(TRC);
+      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
+                 IsThumb1, IsThumb2);
+      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
+                 IsThumb1, IsThumb2);
       srcIn = srcOut;
       destIn = destOut;
     }
@@ -7177,17 +7499,16 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
 
   // Load an immediate to varEnd.
   unsigned varEnd = MRI.createVirtualRegister(TRC);
-  if (isThumb2) {
-    unsigned VReg1 = varEnd;
+  if (IsThumb2) {
+    unsigned Vtmp = varEnd;
     if ((LoopSize & 0xFFFF0000) != 0)
-      VReg1 = MRI.createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
-                   .addImm(LoopSize & 0xFFFF));
+      Vtmp = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp)
+                       .addImm(LoopSize & 0xFFFF));
 
     if ((LoopSize & 0xFFFF0000) != 0)
       AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
-                     .addReg(VReg1)
-                     .addImm(LoopSize >> 16));
+                         .addReg(Vtmp).addImm(LoopSize >> 16));
   } else {
     MachineConstantPool *ConstantPool = MF->getConstantPool();
     Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
@@ -7199,10 +7520,12 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
       Align = getDataLayout()->getTypeAllocSize(C->getType());
     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
 
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
-                   .addReg(varEnd, RegState::Define)
-                   .addConstantPoolIndex(Idx)
-                   .addImm(0));
+    if (IsThumb1)
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
+          varEnd, RegState::Define).addConstantPoolIndex(Idx));
+    else
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
+          varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
   }
   BB->addSuccessor(loopMBB);
 
@@ -7231,39 +7554,30 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
 
   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
   //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
-  unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
-  if (UnitSize >= 8) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(destPhi).addImm(0).addReg(scratch));
-  } else if (isThumb2) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(scratch).addReg(destPhi)
-      .addImm(UnitSize));
-  } else {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
-      .addImm(UnitSize));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(scratch).addReg(destPhi)
-      .addReg(0).addImm(UnitSize));
-  }
+  unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
+             IsThumb1, IsThumb2);
+  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
+             IsThumb1, IsThumb2);
 
   // Decrement loop variable by UnitSize.
-  MachineInstrBuilder MIB = BuildMI(BB, dl,
-    TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
-  AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
-  MIB->getOperand(5).setReg(ARM::CPSR);
-  MIB->getOperand(5).setIsDef(true);
-
-  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
-    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  if (IsThumb1) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(varPhi).addImm(UnitSize);
+    AddDefaultPred(MIB);
+  } else {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, BB->end(), dl,
+                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+    AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+    MIB->getOperand(5).setReg(ARM::CPSR);
+    MIB->getOperand(5).setIsDef(true);
+  }
+  BuildMI(*BB, BB->end(), dl,
+          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
 
   // loopMBB can loop back to loopMBB or fall through to exitMBB.
   BB->addSuccessor(loopMBB);
@@ -7272,34 +7586,19 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
   // Add epilogue to handle BytesLeft.
   BB = exitMBB;
   MachineInstr *StartOfExit = exitMBB->begin();
-  ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-  strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
 
   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
   //   [destOut] = STRB_POST(scratch, destLoop, 1)
   unsigned srcIn = srcLoop;
   unsigned destIn = destLoop;
   for (unsigned i = 0; i < BytesLeft; i++) {
-    unsigned scratch = MRI.createVirtualRegister(TRC);
     unsigned srcOut = MRI.createVirtualRegister(TRC);
     unsigned destOut = MRI.createVirtualRegister(TRC);
-    if (isThumb2) {
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
-        TII->get(ldrOpc),scratch)
-        .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
-
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
-        .addReg(scratch).addReg(destIn)
-        .addImm(1));
-    } else {
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
-        TII->get(ldrOpc),scratch)
-        .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
-
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
-        .addReg(scratch).addReg(destIn)
-        .addReg(0).addImm(1));
-    }
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
+               IsThumb1, IsThumb2);
+    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
+               IsThumb1, IsThumb2);
     srcIn = srcOut;
     destIn = destOut;
   }
@@ -7449,46 +7748,49 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
   case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
 
+  case ARM::ATOMIC_LOAD_I64:
+    return EmitAtomicLoad64(MI, BB);
 
-  case ARM::ATOMADD6432:
+  case ARM::ATOMIC_LOAD_ADD_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
                               isThumb2 ? ARM::t2ADCrr : ARM::ADCrr,
                               /*NeedsCarry*/ true);
-  case ARM::ATOMSUB6432:
+  case ARM::ATOMIC_LOAD_SUB_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true);
-  case ARM::ATOMOR6432:
+  case ARM::ATOMIC_LOAD_OR_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
                               isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
-  case ARM::ATOMXOR6432:
+  case ARM::ATOMIC_LOAD_XOR_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
                               isThumb2 ? ARM::t2EORrr : ARM::EORrr);
-  case ARM::ATOMAND6432:
+  case ARM::ATOMIC_LOAD_AND_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
                               isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
-  case ARM::ATOMSWAP6432:
+  case ARM::ATOMIC_STORE_I64:
+  case ARM::ATOMIC_SWAP_I64:
     return EmitAtomicBinary64(MI, BB, 0, 0, false);
-  case ARM::ATOMCMPXCHG6432:
+  case ARM::ATOMIC_CMP_SWAP_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ false, /*IsCmpxchg*/true);
-  case ARM::ATOMMIN6432:
+  case ARM::ATOMIC_LOAD_MIN_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
                               /*IsMinMax*/ true, ARMCC::LT);
-  case ARM::ATOMMAX6432:
+  case ARM::ATOMIC_LOAD_MAX_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
                               /*IsMinMax*/ true, ARMCC::GE);
-  case ARM::ATOMUMIN6432:
+  case ARM::ATOMIC_LOAD_UMIN_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
                               /*IsMinMax*/ true, ARMCC::LO);
-  case ARM::ATOMUMAX6432:
+  case ARM::ATOMIC_LOAD_UMAX_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
@@ -8197,6 +8499,13 @@ static SDValue PerformSUBCombine(SDNode *N,
 /// is faster than
 ///   vadd d3, d0, d1
 ///   vmul d3, d3, d2
+//  However, for (A + B) * (A + B),
+//    vadd d2, d0, d1
+//    vmul d3, d0, d2
+//    vmla d3, d1, d2
+//  is slower than
+//    vadd d2, d0, d1
+//    vmul d3, d2, d2
 static SDValue PerformVMULCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
@@ -8216,6 +8525,9 @@ static SDValue PerformVMULCombine(SDNode *N,
     std::swap(N0, N1);
   }
 
+  if (N0 == N1)
+    return SDValue();
+
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   SDValue N00 = N0->getOperand(0);
@@ -10548,6 +10860,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
     case 'r':
       return RCPair(0U, &ARM::GPRRegClass);
     case 'w':
+      if (VT == MVT::Other)
+        break;
       if (VT == MVT::f32)
         return RCPair(0U, &ARM::SPRRegClass);
       if (VT.getSizeInBits() == 64)
@@ -10556,6 +10870,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
         return RCPair(0U, &ARM::QPRRegClass);
       break;
     case 'x':
+      if (VT == MVT::Other)
+        break;
       if (VT == MVT::f32)
         return RCPair(0U, &ARM::SPR_8RegClass);
       if (VT.getSizeInBits() == 64)
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 44c769f..90facdd 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -52,6 +52,7 @@ namespace llvm {
       BR_JT,        // Jumptable branch.
       BR2_JT,       // Jumptable branch (2 level - jumptable entry is a jump).
       RET_FLAG,     // Return with a flag operand.
+      INTRET_FLAG,  // Interrupt return with an LR-offset and a flag operand.
 
       PIC_ADD,      // Add with a PC operand and a PIC label.
 
@@ -94,7 +95,6 @@ namespace llvm {
 
       DYN_ALLOC,    // Dynamic allocation on the stack.
 
-      MEMBARRIER,   // Memory barrier (DMB)
       MEMBARRIER_MCR, // Memory barrier (MCR)
 
       PRELOAD,      // Preload
@@ -186,6 +186,8 @@ namespace llvm {
       // Floating-point max and min:
       FMAX,
       FMIN,
+      VMAXNM,
+      VMINNM,
 
       // Bit-field insert
       BFI,
@@ -222,21 +224,7 @@ namespace llvm {
       VST4_UPD,
       VST2LN_UPD,
       VST3LN_UPD,
-      VST4LN_UPD,
-
-      // 64-bit atomic ops (value split into two registers)
-      ATOMADD64_DAG,
-      ATOMSUB64_DAG,
-      ATOMOR64_DAG,
-      ATOMXOR64_DAG,
-      ATOMAND64_DAG,
-      ATOMNAND64_DAG,
-      ATOMSWAP64_DAG,
-      ATOMCMPXCHG64_DAG,
-      ATOMMIN64_DAG,
-      ATOMUMIN64_DAG,
-      ATOMMAX64_DAG,
-      ATOMUMAX64_DAG
+      VST4LN_UPD
     };
   }
 
@@ -375,6 +363,12 @@ namespace llvm {
     /// be used for loads / stores from the global.
     virtual unsigned getMaximalGlobalOffset() const;
 
+    /// Returns true if a cast between SrcAS and DestAS is a noop.
+    virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
+      // Addrspacecasts are always noops.
+      return true;
+    }
+
     /// createFastISel - This method returns a target specific FastISel object,
     /// or null if the target does not support "fast" ISel.
     virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
@@ -460,6 +454,7 @@ namespace llvm {
                             const ARMSubtarget *ST) const;
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *ST) const;
+    SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
 
     /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
@@ -573,6 +568,8 @@ namespace llvm {
                                                unsigned Size,
                                                bool signExtend,
                                                ARMCC::CondCodes Cond) const;
+    MachineBasicBlock *EmitAtomicLoad64(MachineInstr *MI,
+                                        MachineBasicBlock *BB) const;
 
     void SetupEntryBlockForSjLj(MachineInstr *MI,
                                 MachineBasicBlock *MBB,
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 1349476..f93504f 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -155,6 +155,16 @@ def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
   let DecoderMethod = "DecodePredicateOperand";
 }
 
+// Selectable predicate operand for CMOV instructions. We can't use a normal
+// predicate because the default values interfere with instruction selection. In
+// all other respects it is identical though: pseudo-instruction expansion
+// relies on the MachineOperands being compatible.
+def cmovpred : Operand<i32>, PredicateOp,
+               ComplexPattern<i32, 2, "SelectCMOVPred"> {
+  let MIOperandInfo = (ops i32imm, i32imm);
+  let PrintMethod = "printPredicateOperand";
+}
+
 // Conditional code result for instructions whose 's' bit is set, e.g. subs.
 def CCOutOperand : AsmOperandClass { let Name = "CCOut"; }
 def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
@@ -237,6 +247,8 @@ class t2InstAlias<string Asm, dag Result, bit Emit = 0b1>
       : InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>;
 class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1>
       : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>;
+class VFP2DPInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2,HasDPVFP]>;
 class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1>
       : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>;
 class NEONInstAlias<string Asm, dag Result, bit Emit = 0b1>
@@ -490,8 +502,7 @@ class JTI<dag oops, dag iops, InstrItinClass itin,
   : XI<oops, iops, AddrModeNone, 0, IndexModeNone, BrMiscFrm, itin,
        asm, "", pattern>;
 
-// Atomic load/store instructions
-class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+class AIldr_ex_or_acq<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin,
               string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
       opc, asm, "", pattern> {
@@ -502,23 +513,52 @@ class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
   let Inst{20}    = 1;
   let Inst{19-16} = addr;
   let Inst{15-12} = Rt;
-  let Inst{11-0}  = 0b111110011111;
+  let Inst{11-10} = 0b11;
+  let Inst{9-8}   = opcod2;
+  let Inst{7-0}   = 0b10011111;
 }
-class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+class AIstr_ex_or_rel<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin,
               string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
       opc, asm, "", pattern> {
-  bits<4> Rd;
   bits<4> Rt;
   bits<4> addr;
   let Inst{27-23} = 0b00011;
   let Inst{22-21} = opcod;
   let Inst{20}    = 0;
   let Inst{19-16} = addr;
-  let Inst{15-12} = Rd;
-  let Inst{11-4}  = 0b11111001;
+  let Inst{11-10} = 0b11;
+  let Inst{9-8}   = opcod2;
+  let Inst{7-4}   = 0b1001;
   let Inst{3-0}   = Rt;
 }
+// Atomic load/store instructions
+class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b11, oops, iops, itin, opc, asm, pattern>;
+
+class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b11, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  let Inst{15-12} = Rd;
+}
+
+// Exclusive load/store instructions
+
+class AIldaex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]>;
+
+class AIstlex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]> {
+  bits<4> Rd;
+  let Inst{15-12} = Rd;
+}
+
 class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
   : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> {
   bits<4> Rt;
@@ -535,6 +575,18 @@ class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
   let Unpredictable{11-8} = 0b1111;
   let DecoderMethod = "DecodeSwap";
 }
+// Acquire/Release load/store instructions
+class AIldracq<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]>;
+
+class AIstrrel<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]> {
+  let Inst{15-12}   = 0b1111;
+}
 
 // addrmode1 instructions
 class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
@@ -1520,6 +1572,8 @@ class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
   let Inst{8}     = 1;          // Double precision
   let Inst{7-6}   = opcod4;
   let Inst{4}     = opcod5;
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 // Double precision, unary, not-predicated
@@ -1572,6 +1626,8 @@ class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
   let Inst{8}     = 1;          // Double precision
   let Inst{6}     = op6;
   let Inst{4}     = op4;
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 // FP, binary, not predicated
@@ -1601,6 +1657,8 @@ class ADbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
   let Inst{8}     = 1; // double precision
   let Inst{6}     = opcod3;
   let Inst{4}     = 0;
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 // Single precision, unary, predicated
@@ -1965,7 +2023,7 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
 }
 
 // Same as N2V but not predicated.
-class N2Vnp<bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+class N2Vnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
             dag oops, dag iops, InstrItinClass itin, string OpcodeStr,
             string Dt, ValueType ResTy, ValueType OpTy, list<dag> pattern>
    : NeonInp<oops, iops, AddrModeNone, IndexModeNone, N2RegFrm, itin,
@@ -1982,7 +2040,7 @@ class N2Vnp<bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
   // Encode constant bits
   let Inst{27-23} = 0b00111;
   let Inst{21-20} = 0b11;
-  let Inst{19-18} = 0b10;
+  let Inst{19-18} = op19_18;
   let Inst{17-16} = op17_16;
   let Inst{11} = 0;
   let Inst{10-8} = op10_8;
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 8cdb853..df867b4 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -119,17 +120,29 @@ namespace {
       MachineBasicBlock &FirstMBB = MF.front();
       MachineBasicBlock::iterator MBBI = FirstMBB.begin();
       DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
-      unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+      unsigned TempReg =
+          MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
       unsigned Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ?
                      ARM::t2LDRpci : ARM::LDRcp;
       const TargetInstrInfo &TII = *TM->getInstrInfo();
       MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL,
-                                        TII.get(Opc), GlobalBaseReg)
+                                        TII.get(Opc), TempReg)
                                 .addConstantPoolIndex(Idx);
       if (Opc == ARM::LDRcp)
         MIB.addImm(0);
       AddDefaultPred(MIB);
 
+      // Fix the GOT address by adding pc.
+      unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+      Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ? ARM::tPICADD
+                                                        : ARM::PICADD;
+      MIB = BuildMI(FirstMBB, MBBI, DL, TII.get(Opc), GlobalBaseReg)
+                .addReg(TempReg)
+                .addImm(ARMPCLabelIndex);
+      if (Opc == ARM::PICADD)
+        AddDefaultPred(MIB);
+
+
       return true;
     }
 
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index c243402..2042c04 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -71,6 +71,9 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
 
+def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
+def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
+
 def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
                                             [SDTCisSameAs<0, 2>,
                                              SDTCisSameAs<0, 3>,
@@ -118,7 +121,8 @@ def ARMcall_nolink   : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
 
 def ARMretflag       : SDNode<"ARMISD::RET_FLAG", SDTNone,
                               [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-
+def ARMintretflag    : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
 def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
                               [SDNPInGlue]>;
 
@@ -162,8 +166,6 @@ def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
                                SDT_ARMEH_SJLJ_Longjmp,
                                [SDNPHasChain, SDNPSideEffect]>;
 
-def ARMMemBarrier     : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER,
-                               [SDNPHasChain, SDNPSideEffect]>;
 def ARMMemBarrierMCR  : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
                                [SDNPHasChain, SDNPSideEffect]>;
 def ARMPreload        : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
@@ -174,9 +176,11 @@ def ARMrbit          : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>;
 def ARMtcret         : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
                         [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
 
-
 def ARMbfi           : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
 
+def ARMvmaxnm        : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>;
+def ARMvminnm        : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>;
+
 //===----------------------------------------------------------------------===//
 // ARM Instruction Predicate Definitions.
 //
@@ -189,6 +193,9 @@ def HasV5TE          : Predicate<"Subtarget->hasV5TEOps()">,
 def HasV6            : Predicate<"Subtarget->hasV6Ops()">,
                                  AssemblerPredicate<"HasV6Ops", "armv6">;
 def NoV6             : Predicate<"!Subtarget->hasV6Ops()">;
+def HasV6M           : Predicate<"Subtarget->hasV6MOps()">,
+                                 AssemblerPredicate<"HasV6MOps",
+                                                    "armv6m or armv6t2">;
 def HasV6T2          : Predicate<"Subtarget->hasV6T2Ops()">,
                                  AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
 def NoV6T2           : Predicate<"!Subtarget->hasV6T2Ops()">;
@@ -196,6 +203,8 @@ def HasV7            : Predicate<"Subtarget->hasV7Ops()">,
                                  AssemblerPredicate<"HasV7Ops", "armv7">;
 def HasV8            : Predicate<"Subtarget->hasV8Ops()">,
                                  AssemblerPredicate<"HasV8Ops", "armv8">;
+def PreV8            : Predicate<"!Subtarget->hasV8Ops()">,
+                                 AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
 def NoVFP            : Predicate<"!Subtarget->hasVFP2()">;
 def HasVFP2          : Predicate<"Subtarget->hasVFP2()">,
                                  AssemblerPredicate<"FeatureVFP2", "VFP2">;
@@ -203,16 +212,23 @@ def HasVFP3          : Predicate<"Subtarget->hasVFP3()">,
                                  AssemblerPredicate<"FeatureVFP3", "VFP3">;
 def HasVFP4          : Predicate<"Subtarget->hasVFP4()">,
                                  AssemblerPredicate<"FeatureVFP4", "VFP4">;
-def HasV8FP          : Predicate<"Subtarget->hasV8FP()">,
-                                 AssemblerPredicate<"FeatureV8FP", "V8FP">;
+def HasDPVFP         : Predicate<"!Subtarget->isFPOnlySP()">,
+                                 AssemblerPredicate<"!FeatureVFPOnlySP",
+                                                    "double precision VFP">;
+def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
+                                 AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">;
 def HasNEON          : Predicate<"Subtarget->hasNEON()">,
                                  AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
+                                 AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC           : Predicate<"Subtarget->hasCRC()">,
+                                 AssemblerPredicate<"FeatureCRC", "crc">;
 def HasFP16          : Predicate<"Subtarget->hasFP16()">,
                                  AssemblerPredicate<"FeatureFP16","half-float">;
 def HasDivide        : Predicate<"Subtarget->hasDivide()">,
-                                 AssemblerPredicate<"FeatureHWDiv", "divide">;
+                                 AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
 def HasDivideInARM   : Predicate<"Subtarget->hasDivideInARMMode()">,
-                                 AssemblerPredicate<"FeatureHWDivARM">;
+                                 AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
 def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
                                  AssemblerPredicate<"FeatureT2XtPk",
                                                      "pack/extract">;
@@ -237,10 +253,10 @@ def IsThumb2         : Predicate<"Subtarget->isThumb2()">,
                                  AssemblerPredicate<"ModeThumb,FeatureThumb2",
                                                     "thumb2">;
 def IsMClass         : Predicate<"Subtarget->isMClass()">,
-                                 AssemblerPredicate<"FeatureMClass", "armv7m">;
-def IsARClass        : Predicate<"!Subtarget->isMClass()">,
+                                 AssemblerPredicate<"FeatureMClass", "armv*m">;
+def IsNotMClass      : Predicate<"!Subtarget->isMClass()">,
                                  AssemblerPredicate<"!FeatureMClass",
-                                                    "armv7a/r">;
+                                                    "!armv*m">;
 def IsARM            : Predicate<"!Subtarget->isThumb()">,
                                  AssemblerPredicate<"!ModeThumb", "arm-mode">;
 def IsIOS            : Predicate<"Subtarget->isTargetIOS()">;
@@ -587,17 +603,6 @@ def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
 def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
 def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
 
-/// imm0_4 predicate - Immediate in the range [0,4].
-def Imm0_4AsmOperand : ImmAsmOperand
-{ 
-  let Name = "Imm0_4"; 
-  let DiagnosticType = "ImmRange0_4";  
-}
-def imm0_4 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 5; }]> {
-  let ParserMatchClass = Imm0_4AsmOperand;
-  let DecoderMethod = "DecodeImm0_4";
-}
-
 /// imm0_7 predicate - Immediate in the range [0,7].
 def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
 def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
@@ -677,6 +682,15 @@ def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
   let ParserMatchClass = Imm0_63AsmOperand;
 }
 
+/// imm0_239 predicate - Immediate in the range [0,239].
+def Imm0_239AsmOperand : ImmAsmOperand {
+  let Name = "Imm0_239";
+  let DiagnosticType = "ImmRange0_239";
+}
+def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
+  let ParserMatchClass = Imm0_239AsmOperand;
+}
+
 /// imm0_255 predicate - Immediate in the range [0,255].
 def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
 def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
@@ -708,6 +722,11 @@ def imm0_65535_expr : Operand<i32> {
   let ParserMatchClass = Imm0_65535ExprAsmOperand;
 }
 
+def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; }
+def imm256_65535_expr : Operand<i32> {
+  let ParserMatchClass = Imm256_65535ExprAsmOperand;
+}
+
 /// imm24b - True if the 32-bit immediate is encodable in 24 bits.
 def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
 def imm24b : Operand<i32>, ImmLeaf<i32, [{
@@ -1665,53 +1684,11 @@ PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
            [(ARMcallseq_start timm:$amt)]>;
 }
 
-// Atomic pseudo-insts which will be lowered to ldrexd/strexd loops.
-// (These pseudos use a hand-written selection code).
-let usesCustomInserter = 1, Defs = [CPSR], mayLoad = 1, mayStore = 1 in {
-def ATOMOR6432   : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMXOR6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMADD6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMSUB6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMNAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMAND6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMSWAP6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                                 (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2,
-                                      GPR:$set1, GPR:$set2),
-                                 NoItinerary, []>;
-def ATOMMIN6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMUMIN6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMMAX6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMUMAX6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-}
-
-def HINT : AI<(outs), (ins imm0_4:$imm), MiscFrm, NoItinerary,
+def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
               "hint", "\t$imm", []>, Requires<[IsARM, HasV6]> {
-  bits<3> imm;
-  let Inst{27-3} = 0b0011001000001111000000000;
-  let Inst{2-0} = imm;
+  bits<8> imm;
+  let Inst{27-8} = 0b00110010000011110000;
+  let Inst{7-0} = imm;
 }
 
 def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>;
@@ -1719,6 +1696,9 @@ def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6T2]>;
 def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6T2]>;
 def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>;
 def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
+
+def : Pat<(int_arm_sevl), (HINT 5)>;
 
 def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
              "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
@@ -1746,6 +1726,16 @@ def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
   let Inst{7-4} = 0b0111;
 }
 
+def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+                 "hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
+  bits<16> val;
+  let Inst{3-0} = val{3-0};
+  let Inst{19-8} = val{15-4};
+  let Inst{27-20} = 0b00010000;
+  let Inst{31-28} = 0xe; // AL
+  let Inst{7-4} = 0b0111;
+}
+
 // Change Processor State
 // FIXME: We should use InstAlias to handle the optional operands.
 class CPS<dag iops, string asm_ops>
@@ -1820,7 +1810,7 @@ defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>;
 defm PLI  : APreLoad<1, 0, "pli">,  Requires<[IsARM,HasV7]>;
 
 def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary,
-                 "setend\t$end", []>, Requires<[IsARM]> {
+                 "setend\t$end", []>, Requires<[IsARM]>, Deprecated<HasV8Ops> {
   bits<1> end;
   let Inst{31-10} = 0b1111000100000001000000;
   let Inst{9} = end;
@@ -1953,6 +1943,12 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
                Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> {
     let Inst{27-0} = 0b0001101000001111000000001110;
   }
+
+  // Exception return: N.b. doesn't set CPSR as far as we're concerned (it sets
+  // the user-space one).
+  def SUBS_PC_LR : ARMPseudoInst<(outs), (ins i32imm:$offset, pred:$p),
+                                 4, IIC_Br,
+                                 [(ARMintretflag imm:$offset)]>;
 }
 
 // Indirect branches
@@ -2283,6 +2279,13 @@ def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2),
                  []>, Requires<[IsARM, HasV5TE]>;
 }
 
+def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "lda", "\t$Rt, $addr", []>;
+def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "ldab", "\t$Rt, $addr", []>;
+def LDAH : AIldracq<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "ldah", "\t$Rt, $addr", []>;
+
 // Indexed loads
 multiclass AI2_ldridx<bit isByte, string opc,
                       InstrItinClass iii, InstrItinClass iir> {
@@ -2825,6 +2828,12 @@ multiclass AI3strT<bits<4> op, string opc> {
 
 defm STRHT : AI3strT<0b1011, "strht">;
 
+def STL : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                   NoItinerary, "stl", "\t$Rt, $addr", []>;
+def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlb", "\t$Rt, $addr", []>;
+def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlh", "\t$Rt, $addr", []>;
 
 //===----------------------------------------------------------------------===//
 //  Load / store multiple Instructions.
@@ -4014,6 +4023,45 @@ def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
                (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>;
 
 //===----------------------------------------------------------------------===//
+// CRC Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W}       0x04C11DB7
+// + CRC32C{B,H,W}      0x1EDC6F41
+//
+
+class AI_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+  : AInoP<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), MiscFrm, NoItinerary,
+               !strconcat("crc32", suffix), "\t$Rd, $Rn, $Rm",
+               [(set GPRnopc:$Rd, (builtin GPRnopc:$Rn, GPRnopc:$Rm))]>,
+               Requires<[IsARM, HasV8, HasCRC]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-28} = 0b1110;
+  let Inst{27-23} = 0b00010;
+  let Inst{22-21} = sz;
+  let Inst{20}    = 0;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = 0b00;
+  let Inst{9}     = C;
+  let Inst{8}     = 0;
+  let Inst{7-4}   = 0b0100;
+  let Inst{3-0}   = Rm;
+
+  let Unpredictable{11-8} = 0b1101;
+}
+
+def CRC32B  : AI_crc32<0, 0b00, "b", int_arm_crc32b>;
+def CRC32CB : AI_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def CRC32H  : AI_crc32<0, 0b01, "h", int_arm_crc32h>;
+def CRC32CH : AI_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def CRC32W  : AI_crc32<0, 0b10, "w", int_arm_crc32w>;
+def CRC32CW : AI_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
 //  Comparison Instructions...
 //
 
@@ -4139,56 +4187,65 @@ def BCCZi64 : PseudoInst<(outs),
 
 
 // Conditional moves
-// FIXME: should be able to write a pattern for ARMcmov, but can't use
-// a two-value operand where a dag node expects two operands. :(
 let neverHasSideEffects = 1 in {
 
 let isCommutable = 1, isSelect = 1 in
-def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p),
+def MOVCCr : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, GPR:$Rm, cmovpred:$p),
                            4, IIC_iCMOVr,
-  [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
-      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm,
+                                                   cmovpred:$p))]>,
+             RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_reg_imm:$shift, pred:$p),
-                           4, IIC_iCMOVsr,
-  [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_imm:$shift,
-                            imm:$cc, CCR:$ccr))*/]>,
+                            (ins GPR:$false, so_reg_imm:$shift, cmovpred:$p),
+                            4, IIC_iCMOVsr,
+                            [(set GPR:$Rd,
+                                  (ARMcmov GPR:$false, so_reg_imm:$shift,
+                                           cmovpred:$p))]>,
       RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_reg_reg:$shift, pred:$p),
+                            (ins GPR:$false, so_reg_reg:$shift, cmovpred:$p),
                            4, IIC_iCMOVsr,
-  [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
-                            imm:$cc, CCR:$ccr))*/]>,
+  [(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
+                            cmovpred:$p))]>,
       RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 
 let isMoveImm = 1 in
-def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd),
-                             (ins GPR:$false, imm0_65535_expr:$imm, pred:$p),
-                             4, IIC_iMOVi,
-                             []>,
+def MOVCCi16
+    : ARMPseudoInst<(outs GPR:$Rd),
+                    (ins GPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+                    4, IIC_iMOVi,
+                    [(set GPR:$Rd, (ARMcmov GPR:$false, imm0_65535:$imm,
+                                            cmovpred:$p))]>,
       RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>,
       Sched<[WriteALU]>;
 
 let isMoveImm = 1 in
 def MOVCCi : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_imm:$imm, pred:$p),
+                           (ins GPR:$false, so_imm:$imm, cmovpred:$p),
                            4, IIC_iCMOVi,
-   [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm,
+                                                   cmovpred:$p))]>,
       RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 // Two instruction predicate mov immediate.
 let isMoveImm = 1 in
-def MOVCCi32imm : ARMPseudoInst<(outs GPR:$Rd),
-                                (ins GPR:$false, i32imm:$src, pred:$p),
-                  8, IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">;
+def MOVCCi32imm
+    : ARMPseudoInst<(outs GPR:$Rd),
+                    (ins GPR:$false, i32imm:$src, cmovpred:$p),
+                    8, IIC_iCMOVix2,
+                    [(set GPR:$Rd, (ARMcmov GPR:$false, imm:$src,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
 
 let isMoveImm = 1 in
 def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_imm:$imm, pred:$p),
+                           (ins GPR:$false, so_imm:$imm, cmovpred:$p),
                            4, IIC_iCMOVi,
- [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>,
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm,
+                                                   cmovpred:$p))]>,
                 RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 } // neverHasSideEffects
@@ -4221,7 +4278,7 @@ def instsyncb_opt : Operand<i32> {
 // memory barriers protect the atomic sequences
 let hasSideEffects = 1 in {
 def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
-                "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
+                "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
                 Requires<[IsARM, HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf57ff05;
@@ -4230,7 +4287,7 @@ def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
 }
 
 def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
-                "dsb", "\t$opt", []>,
+                "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
                 Requires<[IsARM, HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf57ff04;
@@ -4246,124 +4303,219 @@ def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
   let Inst{3-0} = opt;
 }
 
+let usesCustomInserter = 1, Defs = [CPSR] in {
+
 // Pseudo instruction that combines movs + predicated rsbmi
 // to implement integer ABS
-let usesCustomInserter = 1, Defs = [CPSR] in
-def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
+  def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
 
-let usesCustomInserter = 1 in {
-  let Defs = [CPSR] in {
+// Atomic pseudo-insts which will be lowered to ldrex/strex loops.
+// (64-bit pseudos use a hand-written selection code).
+  let mayLoad = 1, mayStore = 1 in {
     def ATOMIC_LOAD_ADD_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_SUB_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_AND_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_OR_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_XOR_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_NAND_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MIN_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MAX_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMIN_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umin_8 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMAX_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umax_8 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_SWAP_I8 : PseudoInst<
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_CMP_SWAP_I8 : PseudoInst<
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_ADD_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_SUB_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_AND_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_OR_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_XOR_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_NAND_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MIN_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MAX_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMIN_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umin_16 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMAX_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umax_16 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_SWAP_I16 : PseudoInst<
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_CMP_SWAP_I16 : PseudoInst<
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_ADD_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_SUB_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_AND_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_OR_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_XOR_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_NAND_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MIN_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MAX_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMIN_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umin_32 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMAX_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umax_32 GPR:$ptr, GPR:$val))]>;
-
-    def ATOMIC_SWAP_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>;
-    def ATOMIC_SWAP_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_SWAP_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>;
-
-    def ATOMIC_CMP_SWAP_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>;
-    def ATOMIC_CMP_SWAP_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_CMP_SWAP_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>;
-}
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_ADD_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_SUB_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_AND_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_OR_I64 :  PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_XOR_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_NAND_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_MIN_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_MAX_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_UMIN_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_UMAX_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_SWAP_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_CMP_SWAP_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2,
+           GPR:$set1, GPR:$set2, i32imm:$ordering),
+      NoItinerary, []>;
+  }
+  let mayLoad = 1 in
+    def ATOMIC_LOAD_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, i32imm:$ordering),
+      NoItinerary, []>;
+  let mayStore = 1 in
+    def ATOMIC_STORE_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
 }
 
 let usesCustomInserter = 1 in {
@@ -4402,8 +4554,7 @@ def strex_4 : PatFrag<(ops node:$val, node:$ptr),
 
 let mayLoad = 1 in {
 def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
-                     NoItinerary,
-                     "ldrexb", "\t$Rt, $addr",
+                     NoItinerary, "ldrexb", "\t$Rt, $addr",
                      [(set GPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
 def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
                      NoItinerary, "ldrexh", "\t$Rt, $addr",
@@ -4412,10 +4563,22 @@ def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
                      NoItinerary, "ldrex", "\t$Rt, $addr",
                      [(set GPR:$Rt, (ldrex_4 addr_offset_none:$addr))]>;
 let hasExtraDefRegAllocReq = 1 in
-def LDREXD: AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+def LDREXD : AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
                       NoItinerary, "ldrexd", "\t$Rt, $addr", []> {
   let DecoderMethod = "DecodeDoubleRegLoad";
 }
+
+def LDAEXB : AIldaex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaexb", "\t$Rt, $addr", []>;
+def LDAEXH : AIldaex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaexh", "\t$Rt, $addr", []>;
+def LDAEX  : AIldaex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaex", "\t$Rt, $addr", []>;
+let hasExtraDefRegAllocReq = 1 in
+def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+                      NoItinerary, "ldaexd", "\t$Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegLoad";
+}
 }
 
 let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
@@ -4434,8 +4597,22 @@ def STREXD : AIstrex<0b01, (outs GPR:$Rd),
                     NoItinerary, "strexd", "\t$Rd, $Rt, $addr", []> {
   let DecoderMethod = "DecodeDoubleRegStore";
 }
+def STLEXB: AIstlex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexb", "\t$Rd, $Rt, $addr",
+                    []>;
+def STLEXH: AIstlex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexh", "\t$Rd, $Rt, $addr",
+                    []>;
+def STLEX : AIstlex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlex", "\t$Rd, $Rt, $addr",
+                    []>;
+let hasExtraSrcRegAllocReq = 1 in
+def STLEXD : AIstlex<0b01, (outs GPR:$Rd),
+                    (ins GPRPairOp:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexd", "\t$Rd, $Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegStore";
+}
 }
-
 
 def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
                 [(int_arm_clrex)]>,
@@ -4452,12 +4629,43 @@ def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
 def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
              (STREXH GPR:$Rt, addr_offset_none:$addr)>;
 
+class acquiring_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return Ordering == Acquire || Ordering == SequentiallyConsistent;
+}]>;
+
+def atomic_load_acquire_8  : acquiring_load<atomic_load_8>;
+def atomic_load_acquire_16 : acquiring_load<atomic_load_16>;
+def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
+
+class releasing_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return Ordering == Release || Ordering == SequentiallyConsistent;
+}]>;
+
+def atomic_store_release_8  : releasing_store<atomic_store_8>;
+def atomic_store_release_16 : releasing_store<atomic_store_16>;
+def atomic_store_release_32 : releasing_store<atomic_store_32>;
+
+let AddedComplexity = 8 in {
+  def : ARMPat<(atomic_load_acquire_8 addr_offset_none:$addr),  (LDAB addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_load_acquire_16 addr_offset_none:$addr), (LDAH addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_load_acquire_32 addr_offset_none:$addr), (LDA  addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val),  (STLB GPR:$val, addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (STLH GPR:$val, addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL  GPR:$val, addr_offset_none:$addr)>;
+}
+
 // SWP/SWPB are deprecated in V6/V7.
 let mayLoad = 1, mayStore = 1 in {
 def SWP : AIswp<0, (outs GPRnopc:$Rt),
-                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>;
+                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>,
+                Requires<[PreV8]>;
 def SWPB: AIswp<1, (outs GPRnopc:$Rt),
-                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>;
+                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>,
+                Requires<[PreV8]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4468,7 +4676,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
             c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
             NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
             [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
-                          imm:$CRm, imm:$opc2)]> {
+                          imm:$CRm, imm:$opc2)]>,
+            Requires<[PreV8]> {
   bits<4> opc1;
   bits<4> CRn;
   bits<4> CRd;
@@ -4489,7 +4698,8 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
                NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
-                              imm:$CRm, imm:$opc2)]> {
+                              imm:$CRm, imm:$opc2)]>,
+               Requires<[PreV8]> {
   let Inst{31-28} = 0b1111;
   bits<4> opc1;
   bits<4> CRn;
@@ -4667,10 +4877,10 @@ defm LDC   : LdStCop <1, 0, "ldc">;
 defm LDCL  : LdStCop <1, 1, "ldcl">;
 defm STC   : LdStCop <0, 0, "stc">;
 defm STCL  : LdStCop <0, 1, "stcl">;
-defm LDC2  : LdSt2Cop<1, 0, "ldc2">;
-defm LDC2L : LdSt2Cop<1, 1, "ldc2l">;
-defm STC2  : LdSt2Cop<0, 0, "stc2">;
-defm STC2L : LdSt2Cop<0, 1, "stc2l">;
+defm LDC2  : LdSt2Cop<1, 0, "ldc2">, Requires<[PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l">, Requires<[PreV8]>;
+defm STC2  : LdSt2Cop<0, 0, "stc2">, Requires<[PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l">, Requires<[PreV8]>;
 
 //===----------------------------------------------------------------------===//
 // Move between coprocessor and ARM core register.
@@ -4703,7 +4913,8 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
                     (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                          c_imm:$CRm, imm0_7:$opc2),
                     [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
-                                  imm:$CRm, imm:$opc2)]>;
+                                  imm:$CRm, imm:$opc2)]>,
+                    ComplexDeprecationPredicate<"MCR">;
 def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
                    (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                         c_imm:$CRm, 0, pred:$p)>;
@@ -4746,14 +4957,16 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
                       (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                            c_imm:$CRm, imm0_7:$opc2),
                       [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
-                                     imm:$CRm, imm:$opc2)]>;
+                                     imm:$CRm, imm:$opc2)]>,
+                      Requires<[PreV8]>;
 def : ARMInstAlias<"mcr2$ $cop, $opc1, $Rt, $CRn, $CRm",
                    (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                          c_imm:$CRm, 0)>;
 def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
                       (outs GPRwithAPSR:$Rt),
                       (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
-                           imm0_7:$opc2), []>;
+                           imm0_7:$opc2), []>,
+                      Requires<[PreV8]>;
 def : ARMInstAlias<"mrc2$ $cop, $opc1, $Rt, $CRn, $CRm",
                    (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
                          c_imm:$CRm, 0)>;
@@ -4790,7 +5003,8 @@ def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
 class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
   : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
          GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary,
-         !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
+    Requires<[PreV8]> {
   let Inst{31-28} = 0b1111;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -5341,4 +5555,5 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
 
 // 'it' blocks in ARM mode just validate the predicates. The IT itself
 // is discarded.
-def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>;
+def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>,
+         ComplexDeprecationPredicate<"IT">;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index af4f4d1..43bd4c2 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -2355,17 +2355,36 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
 class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
               InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
-  : N2Vnp<op17_16, op10_8, op7, 0,  (outs DPR:$Vd), (ins DPR:$Vm),
+  : N2Vnp<0b10, op17_16, op10_8, op7, 0,  (outs DPR:$Vd), (ins DPR:$Vm),
           itin, OpcodeStr, Dt, ResTy, OpTy,
           [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
 
 class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
               InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
-  : N2Vnp<op17_16, op10_8, op7, 1,  (outs QPR:$Vd), (ins QPR:$Vm),
+  : N2Vnp<0b10, op17_16, op10_8, op7, 1,  (outs QPR:$Vd), (ins QPR:$Vm),
           itin, OpcodeStr, Dt, ResTy, OpTy,
           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
 
+// Similar to NV2VQIntnp with some more encoding bits exposed (crypto).
+class N2VQIntXnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+              bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, op6,  (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt, ResTy, OpTy,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Same as N2VQIntXnp but with Vd as a src register.
+class N2VQIntX2np<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+              bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, op6,
+          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vm),
+          itin, OpcodeStr, Dt, ResTy, OpTy,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vm))))]> {
+  let Constraints = "$src = $Vd";
+}
+
 // Narrow 2-register operations.
 class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
            bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
@@ -2534,7 +2553,7 @@ class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
                 string Dt, ValueType ResTy, ValueType OpTy,
                 SDPatternOperator IntOp, bit Commutable>
   : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
-          (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, OpcodeStr, Dt,
+          (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
           ResTy, OpTy, IntOp, Commutable,
           [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
 
@@ -2592,6 +2611,19 @@ class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
           ResTy, OpTy, IntOp, Commutable,
           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
 
+// Same as N3VQIntnp but with Vd as a src register.
+class N3VQInt3np<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr,
+          Dt, ResTy, OpTy, IntOp, Commutable,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vn),
+                                       (OpTy QPR:$Vm))))]> {
+  let Constraints = "$src = $Vd";
+}
+
 class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                 string OpcodeStr, string Dt,
                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
@@ -2842,6 +2874,7 @@ class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
         [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
+
 class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8,
              InstrItinClass itin, string OpcodeStr, string Dt,
              ValueType TyQ, ValueType TyD, SDNode OpNode>
@@ -2897,6 +2930,17 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
         [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
+
+// Same as above, but not predicated.
+class N3VLIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
+          ResTy, OpTy, IntOp, Commutable,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
 class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                 string OpcodeStr, string Dt,
                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
@@ -3973,12 +4017,18 @@ defm VQADDu   : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm,
                             IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
                             "vqadd", "u", int_arm_neon_vqaddu, 1>;
 //   VADDHN   : Vector Add and Narrow Returning High Half (D = Q + Q)
-defm VADDHN   : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i",
-                            int_arm_neon_vaddhn, 1>;
+defm VADDHN   : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>;
 //   VRADDHN  : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
 defm VRADDHN  : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i",
                             int_arm_neon_vraddhn, 1>;
 
+def : Pat<(v8i8  (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+          (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+          (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+          (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
 // Vector Multiply Operations.
 
 //   VMUL     : Vector Multiply (integer, polynomial and floating-point)
@@ -4016,6 +4066,17 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
                                    (DSubReg_i32_reg imm:$lane))),
                            (SubReg_i32_lane imm:$lane)))>;
 
+
+def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+          (VMULslfd DPR:$Rn,
+            (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+            (i32 0))>;
+def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+          (VMULslfq QPR:$Rn,
+            (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+            (i32 0))>;
+
+
 //   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
 defm VQDMULH  : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
                           IIC_VMULi16Q, IIC_VMULi32Q,
@@ -4061,12 +4122,18 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
                                   (SubReg_i32_lane imm:$lane)))>;
 
 //   VMULL    : Vector Multiply Long (integer and polynomial) (Q = D * D)
-defm VMULLs   : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
-                         "vmull", "s", NEONvmulls, 1>;
-defm VMULLu   : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
-                         "vmull", "u", NEONvmullu, 1>;
-def  VMULLp   : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
-                        v8i16, v8i8, int_arm_neon_vmullp, 1>;
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+    DecoderNamespace = "NEONData" in {
+  defm VMULLs   : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vmull", "s", NEONvmulls, 1>;
+  defm VMULLu   : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vmull", "u", NEONvmullu, 1>;
+  def  VMULLp8   :  N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
+                            v8i16, v8i8, int_arm_neon_vmullp, 1>;
+  def  VMULLp64  : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
+                          "vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
+                    Requires<[HasV8, HasCrypto]>;
+}
 defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
 defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
 
@@ -4133,8 +4200,27 @@ defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
 
 //   VQDMLAL  : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
 defm VQDMLAL  : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
-                            "vqdmlal", "s", int_arm_neon_vqdmlal>;
-defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>;
+                            "vqdmlal", "s", null_frag>;
+defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>;
+
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                                  (v4i16 DPR:$Vm))))),
+          (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                                  (v2i32 DPR:$Vm))))),
+          (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
 
 //   VMLS     : Vector Multiply Subtract (integer and floating-point)
 defm VMLS     : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
@@ -4190,25 +4276,44 @@ defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
 
 //   VQDMLSL  : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
 defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
-                            "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
-defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
+                            "vqdmlsl", "s", null_frag>;
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", null_frag>;
+
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                                  (v4i16 DPR:$Vm))))),
+          (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                                  (v2i32 DPR:$Vm))))),
+          (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
 
 // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
 def  VFMAfd   : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
                           v2f32, fmul_su, fadd_mlx>,
-                Requires<[HasVFP4,UseFusedMAC]>;
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
 
 def  VFMAfq   : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32",
                           v4f32, fmul_su, fadd_mlx>,
-                Requires<[HasVFP4,UseFusedMAC]>;
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
 
 //   Fused Vector Multiply Subtract (floating-point)
 def  VFMSfd   : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32",
                           v2f32, fmul_su, fsub_mlx>,
-                Requires<[HasVFP4,UseFusedMAC]>;
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
 def  VFMSfq   : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
                           v4f32, fmul_su, fsub_mlx>,
-                Requires<[HasVFP4,UseFusedMAC]>;
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
 
 // Match @llvm.fma.* intrinsics
 def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
@@ -4256,12 +4361,18 @@ defm VQSUBu   : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
                             IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
                             "vqsub", "u", int_arm_neon_vqsubu, 0>;
 //   VSUBHN   : Vector Subtract and Narrow Returning High Half (D = Q - Q)
-defm VSUBHN   : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i",
-                            int_arm_neon_vsubhn, 0>;
+defm VSUBHN   : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>;
 //   VRSUBHN  : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
 defm VRSUBHN  : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i",
                             int_arm_neon_vrsubhn, 0>;
 
+def : Pat<(v8i8  (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+          (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+          (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+          (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
 // Vector Comparisons.
 
 //   VCEQ     : Vector Compare Equal
@@ -5047,10 +5158,10 @@ def  VSWPq    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0,
 // Vector Move Operations.
 
 //   VMOV     : Vector Move (Register)
-def : InstAlias<"vmov${p} $Vd, $Vm",
-                (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
-def : InstAlias<"vmov${p} $Vd, $Vm",
-                (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+                    (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+                    (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
 
 //   VMOV     : Vector Move (Immediate)
 
@@ -5461,6 +5572,25 @@ def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
                         v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
 }
 
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0", 
+                    (VCVTf2sd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0", 
+                    (VCVTf2ud DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0", 
+                    (VCVTs2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0", 
+                    (VCVTu2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0", 
+                    (VCVTf2sq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0", 
+                    (VCVTf2uq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0", 
+                    (VCVTs2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0", 
+                    (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+
+
 //   VCVT     : Vector Convert Between Half-Precision and Single-Precision.
 def  VCVTf2h  : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0,
                         IIC_VUNAQ, "vcvt", "f16.f32",
@@ -5725,9 +5855,9 @@ multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
     }
   }
 
-  def : InstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
+  def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
                   (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>;
-  def : InstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
+  def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
                   (!cast<Instruction>(NAME#"Q") QPR:$Qd, QPR:$Qm)>;
 }
 
@@ -5738,6 +5868,49 @@ defm VRINTZN : VRINT_FPI<"z", 0b011, int_arm_neon_vrintz>;
 defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>;
 defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>;
 
+// Cryptography instructions
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+    DecoderNamespace = "v8Crypto" in {
+  class AES<string op, bit op7, bit op6, SDPatternOperator Int>
+    : N2VQIntXnp<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+                 !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class AES2Op<string op, bit op7, bit op6, SDPatternOperator Int>
+    : N2VQIntX2np<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+                 !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N2SHA<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+              SDPatternOperator Int>
+    : N2VQIntXnp<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+                 !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N2SHA2Op<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+              SDPatternOperator Int>
+    : N2VQIntX2np<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+                 !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N3SHA3Op<string op, bits<5> op27_23, bits<2> op21_20, SDPatternOperator Int>
+    : N3VQInt3np<op27_23, op21_20, 0b1100, 1, 0, N3RegFrm, NoItinerary,
+                !strconcat("sha", op), "32", v4i32, v4i32, Int, 0>,
+      Requires<[HasV8, HasCrypto]>;
+}
+
+def AESD : AES2Op<"d", 0, 1, int_arm_neon_aesd>;
+def AESE : AES2Op<"e", 0, 0, int_arm_neon_aese>;
+def AESIMC : AES<"imc", 1, 1, int_arm_neon_aesimc>;
+def AESMC : AES<"mc", 1, 0, int_arm_neon_aesmc>;
+
+def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, int_arm_neon_sha1h>;
+def SHA1SU1 : N2SHA2Op<"1su1", 0b10, 0b011, 1, 0, int_arm_neon_sha1su1>;
+def SHA256SU0 : N2SHA2Op<"256su0", 0b10, 0b011, 1, 1, int_arm_neon_sha256su0>;
+def SHA1C : N3SHA3Op<"1c", 0b00100, 0b00, int_arm_neon_sha1c>;
+def SHA1M : N3SHA3Op<"1m", 0b00100, 0b10, int_arm_neon_sha1m>;
+def SHA1P : N3SHA3Op<"1p", 0b00100, 0b01, int_arm_neon_sha1p>;
+def SHA1SU0 : N3SHA3Op<"1su0", 0b00100, 0b11, int_arm_neon_sha1su0>;
+def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>;
+def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>;
+def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>;
+
 //===----------------------------------------------------------------------===//
 // NEON instructions for single-precision FP math
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index e7218c6..af5ef53 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -269,25 +269,26 @@ class T1SystemEncoding<bits<8> opc>
   let Inst{7-0} = opc;
 }
 
-def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", []>,
-           T1SystemEncoding<0x00>, // A8.6.110
-        Requires<[IsThumb2]>;
-
-def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", []>,
-           T1SystemEncoding<0x10>, // A8.6.410
-           Requires<[IsThumb2]>;
-
-def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", []>,
-           T1SystemEncoding<0x20>, // A8.6.408
-           Requires<[IsThumb2]>;
+def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm", []>,
+            T1SystemEncoding<0x00>,
+            Requires<[IsThumb, HasV6M]> {
+  bits<4> imm;
+  let Inst{7-4} = imm;
+}
 
-def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", []>,
-           T1SystemEncoding<0x30>, // A8.6.409
-           Requires<[IsThumb2]>;
+class tHintAlias<string Asm, dag Result> : tInstAlias<Asm, Result> {
+  let Predicates = [IsThumb, HasV6M];
+}
 
-def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", []>,
-           T1SystemEncoding<0x40>, // A8.6.157
-           Requires<[IsThumb2]>;
+def : tHintAlias<"nop$p", (tHINT 0, pred:$p)>; // A8.6.110
+def : tHintAlias<"yield$p", (tHINT 1, pred:$p)>; // A8.6.410
+def : tHintAlias<"wfe$p", (tHINT 2, pred:$p)>; // A8.6.408
+def : tHintAlias<"wfi$p", (tHINT 3, pred:$p)>; // A8.6.409
+def : tHintAlias<"sev$p", (tHINT 4, pred:$p)>; // A8.6.157
+def : tInstAlias<"sevl$p", (tHINT 5, pred:$p)> {
+  let Predicates = [IsThumb2, HasV8];
+}
+def : T2Pat<(int_arm_sevl), (tHINT 5)>;
 
 // The imm operand $val can be used by a debugger to store more information
 // about the breakpoint.
@@ -300,8 +301,15 @@ def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
   let Inst{7-0} = val;
 }
 
+def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
+                []>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
+  let Inst{9-6} = 0b1010;
+  bits<6> val;
+  let Inst{5-0} = val;
+}
+
 def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
-                  []>, T1Encoding<0b101101> {
+                  []>, T1Encoding<0b101101>, Deprecated<HasV8Ops> {
   bits<1> end;
   // A8.6.156
   let Inst{9-5} = 0b10010;
@@ -491,7 +499,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
              T1Encoding<{1,1,1,0,0,?}>, Sched<[WriteBr]> {
     bits<11> target;
     let Inst{10-0} = target;
-  }
+    let AsmMatchConverter = "cvtThumbBranches";
+ }
 
   // Far jump
   // Just a pseudo for a tBL instruction. Needed to let regalloc know about
@@ -521,6 +530,7 @@ let isBranch = 1, isTerminator = 1 in
   bits<8> target;
   let Inst{11-8} = p;
   let Inst{7-0} = target;
+  let AsmMatchConverter = "cvtThumbBranches";
 }
 
 
@@ -660,9 +670,6 @@ def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
   let Inst{7-0}  = addr;
 }
 
-def : tInstAlias<"ldr${p}.n $Rt, $addr", 
-                 (tLDRpci tGPR:$Rt, t_addrmode_pc:$addr, pred:$p), 0>;
-
 // A8.6.194 & A8.6.192
 defm tSTR  : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4,
                                 t_addrmode_is4, AddrModeT1_4,
@@ -1205,9 +1212,9 @@ def tUXTH :                     // A8.6.264
 // Expanded after instruction selection into a branch sequence.
 let usesCustomInserter = 1 in  // Expanded after instruction selection.
   def tMOVCCr_pseudo :
-  PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc),
-              NoItinerary,
-             [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
+  PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, cmovpred:$p),
+             NoItinerary,
+             [(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, cmovpred:$p))]>;
 
 // tLEApcrel - Load a pc-relative address into a register without offending the
 // assembler.
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 84086a5..48acffd 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -465,6 +465,18 @@ class T2ThreeReg<dag oops, dag iops, InstrItinClass itin,
   let Inst{3-0}   = Rm;
 }
 
+class T2ThreeRegNoP<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : T2XI<oops, iops, itin, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
 class T2sThreeReg<dag oops, dag iops, InstrItinClass itin,
            string opc, string asm, list<dag> pattern>
   : T2sI<oops, iops, itin, opc, asm, pattern> {
@@ -1396,6 +1408,32 @@ def t2LDRHT  : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>;
 def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
 def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
 
+class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
+               string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary,
+            opc, asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt;
+  bits<4> addr;
+
+  let Inst{31-27} = 0b11101;
+  let Inst{26-24} = 0b000;
+  let Inst{23-20} = bits23_20;
+  let Inst{11-6} = 0b111110;
+  let Inst{5-4} = bit54;
+  let Inst{3-0} = 0b1111;
+
+  // Encode instruction operands
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+def t2LDA : T2Ildacq<0b1101, 0b10, (outs rGPR:$Rt),
+                     (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>;
+def t2LDAB : T2Ildacq<0b1101, 0b00, (outs rGPR:$Rt),
+                      (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>;
+def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
+                      (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
+
 // Store
 defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR,
                    BinOpFrag<(store node:$LHS, node:$RHS)>>;
@@ -1539,6 +1577,31 @@ def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
                  IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
                  "$addr.base = $wb", []>;
 
+class T2Istrrel<bits<2> bit54, dag oops, dag iops,
+                string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
+            asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt;
+  bits<4> addr;
+
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{11-6} = 0b111110;
+  let Inst{5-4} = bit54;
+  let Inst{3-0} = 0b1111;
+
+  // Encode instruction operands
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+def t2STL  : T2Istrrel<0b10, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stl", "\t$Rt, $addr", []>;
+def t2STLB : T2Istrrel<0b00, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stlb", "\t$Rt, $addr", []>;
+def t2STLH : T2Istrrel<0b01, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stlh", "\t$Rt, $addr", []>;
+
 // T2Ipl (Preload Data/Instruction) signals the memory system of possible future
 // data/instruction access.
 // instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0),
@@ -1855,6 +1918,9 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
   let DecoderMethod = "DecodeT2MOVTWInstruction";
 }
 
+def : t2InstAlias<"mov${p} $Rd, $imm", 
+                  (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p)>;
+
 def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
                                 (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
 
@@ -2950,6 +3016,34 @@ def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
             Requires<[HasT2ExtractPack, IsThumb2]>;
 
 //===----------------------------------------------------------------------===//
+// CRC32 Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W}       0x04C11DB7
+// + CRC32C{B,H,W}      0x1EDC6F41
+//
+
+class T2I_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+  : T2ThreeRegNoP<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), NoItinerary,
+               !strconcat("crc32", suffix, "\t$Rd, $Rn, $Rm"),
+               [(set rGPR:$Rd, (builtin rGPR:$Rn, rGPR:$Rm))]>,
+               Requires<[IsThumb2, HasV8, HasCRC]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-21} = 0b010110;
+  let Inst{20}    = C;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-6}   = 0b10;
+  let Inst{5-4}   = sz;
+}
+
+def t2CRC32B  : T2I_crc32<0, 0b00, "b", int_arm_crc32b>;
+def t2CRC32CB : T2I_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def t2CRC32H  : T2I_crc32<0, 0b01, "h", int_arm_crc32h>;
+def t2CRC32CH : T2I_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def t2CRC32W  : T2I_crc32<0, 0b10, "w", int_arm_crc32w>;
+def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
 //  Comparison Instructions...
 //
 defm t2CMP  : T2I_cmp_irs<0b1101, "cmp",
@@ -3029,93 +3123,67 @@ defm t2TEQ  : T2I_cmp_irs<0b0100, "teq",
                          BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
 
 // Conditional moves
-// FIXME: should be able to write a pattern for ARMcmov, but can't use
-// a two-value operand where a dag node expects two operands. :(
 let neverHasSideEffects = 1 in {
 
 let isCommutable = 1, isSelect = 1 in
 def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd),
-                            (ins rGPR:$false, rGPR:$Rm, pred:$p),
+                            (ins rGPR:$false, rGPR:$Rm, cmovpred:$p),
                             4, IIC_iCMOVr,
-   [/*(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
-                RegConstraint<"$false = $Rd">,
-                Sched<[WriteALU]>;
+                            [(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm,
+                                                     cmovpred:$p))]>,
+               RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 let isMoveImm = 1 in
-def t2MOVCCi : t2PseudoInst<(outs rGPR:$Rd),
-                            (ins rGPR:$false, t2_so_imm:$imm, pred:$p),
+def t2MOVCCi
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
                    4, IIC_iCMOVi,
-[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
-                   RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false,t2_so_imm:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
-// FIXME: Pseudo-ize these. For now, just mark codegen only.
 let isCodeGenOnly = 1 in {
 let isMoveImm = 1 in
-def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, imm0_65535_expr:$imm),
-                      IIC_iCMOVi,
-                      "movw", "\t$Rd, $imm", []>,
-                      RegConstraint<"$false = $Rd">, Sched<[WriteALU]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{25} = 1;
-  let Inst{24-21} = 0b0010;
-  let Inst{20} = 0; // The S bit.
-  let Inst{15} = 0;
-
-  bits<4> Rd;
-  bits<16> imm;
-
-  let Inst{11-8}  = Rd;
-  let Inst{19-16} = imm{15-12};
-  let Inst{26}    = imm{11};
-  let Inst{14-12} = imm{10-8};
-  let Inst{7-0}   = imm{7-0};
-}
+def t2MOVCCi16
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins  rGPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+                   4, IIC_iCMOVi,
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false, imm0_65535:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 let isMoveImm = 1 in
-def t2MOVCCi32imm : PseudoInst<(outs rGPR:$dst),
-                               (ins rGPR:$false, i32imm:$src, pred:$p),
-                    IIC_iCMOVix2, []>, RegConstraint<"$false = $dst">;
+def t2MVNCCi
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
+                   4, IIC_iCMOVi,
+                   [(set rGPR:$Rd,
+                         (ARMcmov rGPR:$false, t2_so_imm_not:$imm,
+                                  cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+class MOVCCShPseudo<SDPatternOperator opnode, Operand ty>
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, rGPR:$Rm, i32imm:$imm, cmovpred:$p),
+                   4, IIC_iCMOVsi,
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false,
+                                            (opnode rGPR:$Rm, (i32 ty:$imm)),
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+def t2MOVCClsl : MOVCCShPseudo<shl,  imm0_31>;
+def t2MOVCClsr : MOVCCShPseudo<srl,  imm_sr>;
+def t2MOVCCasr : MOVCCShPseudo<sra,  imm_sr>;
+def t2MOVCCror : MOVCCShPseudo<rotr, imm0_31>;
 
 let isMoveImm = 1 in
-def t2MVNCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm),
-                   IIC_iCMOVi, "mvn", "\t$Rd, $imm",
-[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm_not:$imm,
-                   imm:$cc, CCR:$ccr))*/]>,
-                   RegConstraint<"$false = $Rd">, Sched<[WriteALU]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{25} = 0;
-  let Inst{24-21} = 0b0011;
-  let Inst{20} = 0; // The S bit.
-  let Inst{19-16} = 0b1111; // Rn
-  let Inst{15} = 0;
-}
-
-class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
-                   string opc, string asm, list<dag> pattern>
-  : T2TwoRegShiftImm<oops, iops, itin, opc, asm, pattern>, Sched<[WriteALU]> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-25} = 0b01;
-  let Inst{24-21} = 0b0010;
-  let Inst{20} = 0; // The S bit.
-  let Inst{19-16} = 0b1111; // Rn
-  let Inst{5-4} = opcod; // Shift type.
-}
-def t2MOVCClsl : T2I_movcc_sh<0b00, (outs rGPR:$Rd),
-                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
-                             IIC_iCMOVsi, "lsl", ".w\t$Rd, $Rm, $imm", []>,
-                 RegConstraint<"$false = $Rd">;
-def t2MOVCClsr : T2I_movcc_sh<0b01, (outs rGPR:$Rd),
-                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
-                             IIC_iCMOVsi, "lsr", ".w\t$Rd, $Rm, $imm", []>,
-                 RegConstraint<"$false = $Rd">;
-def t2MOVCCasr : T2I_movcc_sh<0b10, (outs rGPR:$Rd),
-                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
-                             IIC_iCMOVsi, "asr", ".w\t$Rd, $Rm, $imm", []>,
-                 RegConstraint<"$false = $Rd">;
-def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
-                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
-                             IIC_iCMOVsi, "ror", ".w\t$Rd, $Rm, $imm", []>,
-                 RegConstraint<"$false = $Rd">;
+def t2MOVCCi32imm
+    : t2PseudoInst<(outs rGPR:$dst),
+                   (ins rGPR:$false, i32imm:$src, cmovpred:$p),
+                   8, IIC_iCMOVix2,
+                   [(set rGPR:$dst, (ARMcmov rGPR:$false, imm:$src,
+                                             cmovpred:$p))]>,
+      RegConstraint<"$false = $dst">;
 } // isCodeGenOnly = 1
 
 } // neverHasSideEffects
@@ -3127,7 +3195,7 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
 // memory barriers protect the atomic sequences
 let hasSideEffects = 1 in {
 def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
-                "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
+                "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
                 Requires<[HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf3bf8f5;
@@ -3136,7 +3204,8 @@ def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
 }
 
 def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
-                "dsb", "\t$opt", []>, Requires<[HasDB]> {
+                "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
+                Requires<[HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf3bf8f4;
   let Inst{3-0} = opt;
@@ -3149,15 +3218,14 @@ def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary,
   let Inst{3-0} = opt;
 }
 
-class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
+class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
                 InstrItinClass itin, string opc, string asm, string cstr,
                 list<dag> pattern, bits<4> rt2 = 0b1111>
   : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
   let Inst{31-27} = 0b11101;
   let Inst{26-20} = 0b0001101;
   let Inst{11-8} = rt2;
-  let Inst{7-6} = 0b01;
-  let Inst{5-4} = opcod;
+  let Inst{7-4} = opcod;
   let Inst{3-0} = 0b1111;
 
   bits<4> addr;
@@ -3165,15 +3233,14 @@ class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
   let Inst{19-16} = addr;
   let Inst{15-12} = Rt;
 }
-class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
+class T2I_strex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
                 InstrItinClass itin, string opc, string asm, string cstr,
                 list<dag> pattern, bits<4> rt2 = 0b1111>
   : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
   let Inst{31-27} = 0b11101;
   let Inst{26-20} = 0b0001100;
   let Inst{11-8} = rt2;
-  let Inst{7-6} = 0b01;
-  let Inst{5-4} = opcod;
+  let Inst{7-4} = opcod;
 
   bits<4> Rd;
   bits<4> addr;
@@ -3184,11 +3251,11 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
 }
 
 let mayLoad = 1 in {
-def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "ldrexb", "\t$Rt, $addr", "",
                          [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
-def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "ldrexh", "\t$Rt, $addr", "",
                          [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
@@ -3206,7 +3273,7 @@ def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
   let Inst{7-0} = addr{7-0};
 }
 let hasExtraDefRegAllocReq = 1 in
-def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
+def t2LDREXD : T2I_ldrex<0b0111, (outs rGPR:$Rt, rGPR:$Rt2),
                          (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "ldrexd", "\t$Rt, $Rt2, $addr", "",
@@ -3214,16 +3281,48 @@ def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
   bits<4> Rt2;
   let Inst{11-8} = Rt2;
 }
+def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexb", "\t$Rt, $addr", "",
+                         []>, Requires<[IsThumb, HasV8]>;
+def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexh", "\t$Rt, $addr", "",
+                         []>, Requires<[IsThumb, HasV8]>;
+def t2LDAEX  : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                       AddrModeNone, 4, NoItinerary,
+                       "ldaex", "\t$Rt, $addr", "",
+                     []>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001101;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-0} = 0b11101111;
+}
+let hasExtraDefRegAllocReq = 1 in
+def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2),
+                         (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexd", "\t$Rt, $Rt2, $addr", "",
+                         [], {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+
+  let Inst{7} = 1;
+}
 }
 
 let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
-def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd),
+def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd),
                          (ins rGPR:$Rt, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "strexb", "\t$Rd, $Rt, $addr", "",
                          [(set rGPR:$Rd, (strex_1 rGPR:$Rt,
                                                   addr_offset_none:$addr))]>;
-def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd),
+def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
                          (ins rGPR:$Rt, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "strexh", "\t$Rd, $Rt, $addr", "",
@@ -3247,7 +3346,7 @@ def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
   let Inst{7-0} = addr{7-0};
 }
 let hasExtraSrcRegAllocReq = 1 in
-def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
+def t2STREXD : T2I_strex<0b0111, (outs rGPR:$Rd),
                          (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
@@ -3255,6 +3354,42 @@ def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
   bits<4> Rt2;
   let Inst{11-8} = Rt2;
 }
+def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexb", "\t$Rd, $Rt, $addr", "",
+                         []>, Requires<[IsThumb, HasV8]>;
+
+def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexh", "\t$Rd, $Rt, $addr", "",
+                         []>, Requires<[IsThumb, HasV8]>;
+
+def t2STLEX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+                             addr_offset_none:$addr),
+                  AddrModeNone, 4, NoItinerary,
+                  "stlex", "\t$Rd, $Rt, $addr", "",
+                  []>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rd;
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-4}  = 0b11111110;
+  let Inst{3-0}   = Rd;
+}
+let hasExtraSrcRegAllocReq = 1 in
+def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
+                         {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
 }
 
 def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
@@ -3336,13 +3471,14 @@ def t2B   : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
   let Inst{12} = 1;
 
   bits<24> target;
-  let Inst{26} = target{19};
-  let Inst{11} = target{18};
-  let Inst{13} = target{17};
+  let Inst{26} = target{23};
+  let Inst{13} = target{22};
+  let Inst{11} = target{21};
   let Inst{25-16} = target{20-11};
   let Inst{10-0} = target{10-0};
   let DecoderMethod = "DecodeT2BInstruction";
-}
+  let AsmMatchConverter = "cvtThumbBranches"; 
+} 
 
 let isNotDuplicable = 1, isIndirectBranch = 1 in {
 def t2BR_JT : t2PseudoInst<(outs),
@@ -3410,6 +3546,7 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
   let Inst{10-0} = target{11-1};
 
   let DecoderMethod = "DecodeThumb2BCCInstruction";
+  let AsmMatchConverter = "cvtThumbBranches";
 }
 
 // Tail calls. The IOS version of thumb tail calls uses a t2 branch, so
@@ -3428,7 +3565,8 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
 let Defs = [ITSTATE] in
 def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
                     AddrModeNone, 2,  IIC_iALUx,
-                    "it$mask\t$cc", "", []> {
+                    "it$mask\t$cc", "", []>,
+           ComplexDeprecationPredicate<"IT"> {
   // 16-bit instruction.
   let Inst{31-16} = 0x0000;
   let Inst{15-8} = 0b10111111;
@@ -3502,27 +3640,34 @@ class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
 
 let M = 1 in
   def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
-                      "$imod.w\t$iflags, $mode">;
+                      "$imod\t$iflags, $mode">;
 let mode = 0, M = 0 in
   def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags),
                       "$imod.w\t$iflags">;
 let imod = 0, iflags = 0, M = 1 in
   def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">;
 
+def : t2InstAlias<"cps$imod.w $iflags, $mode",
+                   (t2CPS3p imod_op:$imod, iflags_op:$iflags, i32imm:$mode), 0>;
+def : t2InstAlias<"cps.w $mode", (t2CPS1p imm0_31:$mode), 0>;
+
 // A6.3.4 Branches and miscellaneous control
 // Table A6-14 Change Processor State, and hint instructions
-def t2HINT : T2I<(outs), (ins imm0_4:$imm), NoItinerary, "hint", "\t$imm",[]> {
-  bits<3> imm;
+def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",[]> {
+  bits<8> imm;
   let Inst{31-3} = 0b11110011101011111000000000000;
-  let Inst{2-0} = imm;
+  let Inst{7-0} = imm;
 }
 
-def : t2InstAlias<"hint$p.w $imm", (t2HINT imm0_4:$imm, pred:$p)>;
+def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p)>;
 def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>;
 def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>;
 def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>;
 def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>;
 def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>;
+def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p)> {
+  let Predicates = [IsThumb2, HasV8];
+}
 
 def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> {
   bits<4> opt;
@@ -3545,6 +3690,20 @@ def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
   let Inst{19-16} = opt;
 }
 
+class T2DCPS<bits<2> opt, string opc>
+  : T2I<(outs), (ins), NoItinerary, opc, "", []>, Requires<[IsThumb2, HasV8]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{26-20} = 0b1111000;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-2} = 0b0000000000;
+  let Inst{1-0} = opt;
+}
+
+def t2DCPS1 : T2DCPS<0b01, "dcps1">;
+def t2DCPS2 : T2DCPS<0b10, "dcps2">;
+def t2DCPS3 : T2DCPS<0b11, "dcps3">;
+
 class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
   : T2I<oops, iops, itin, opc, asm, pattern> {
@@ -3600,9 +3759,12 @@ def t2RFEIA  : T2RFE<0b111010011001,
                    [/* For disassembly only; pattern left blank */]>;
 
 // B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction.
-let Defs = [PC], Uses = [LR] in
+// Exception return instruction is "subs pc, lr, #imm".
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
 def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary,
-                   "subs", "\tpc, lr, $imm", []>, Requires<[IsThumb2]> {
+                        "subs", "\tpc, lr, $imm",
+                        [(ARMintretflag imm0_255:$imm)]>,
+                   Requires<[IsThumb2]> {
   let Inst{31-8} = 0b111100111101111010001111;
 
   bits<8> imm;
@@ -3752,10 +3914,10 @@ defm t2LDC   : t2LdStCop<0b1110, 1, 0, "ldc">;
 defm t2LDCL  : t2LdStCop<0b1110, 1, 1, "ldcl">;
 defm t2STC   : t2LdStCop<0b1110, 0, 0, "stc">;
 defm t2STCL  : t2LdStCop<0b1110, 0, 1, "stcl">;
-defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2">;
-defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">;
-defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2">;
-defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">;
+defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2">, Requires<[PreV8]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">, Requires<[PreV8]>;
+defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2">, Requires<[PreV8]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">, Requires<[PreV8]>;
 
 
 //===----------------------------------------------------------------------===//
@@ -3767,7 +3929,7 @@ defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">;
 //
 // A/R class can only move from CPSR or SPSR.
 def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
-                  []>, Requires<[IsThumb2,IsARClass]> {
+                  []>, Requires<[IsThumb2,IsNotMClass]> {
   bits<4> Rd;
   let Inst{31-12} = 0b11110011111011111000;
   let Inst{11-8} = Rd;
@@ -3777,7 +3939,7 @@ def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
 def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
 
 def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
-                   []>, Requires<[IsThumb2,IsARClass]> {
+                   []>, Requires<[IsThumb2,IsNotMClass]> {
   bits<4> Rd;
   let Inst{31-12} = 0b11110011111111111000;
   let Inst{11-8} = Rd;
@@ -3810,7 +3972,7 @@ def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary,
 // the mask with the fields to be accessed in the special register.
 def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
                    NoItinerary, "msr", "\t$mask, $Rn", []>,
-               Requires<[IsThumb2,IsARClass]> {
+               Requires<[IsThumb2,IsNotMClass]> {
   bits<5> mask;
   bits<4> Rn;
   let Inst{31-21} = 0b11110011100;
@@ -3892,7 +4054,8 @@ def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
            (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                 c_imm:$CRm, imm0_7:$opc2),
            [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
-                         imm:$CRm, imm:$opc2)]>;
+                         imm:$CRm, imm:$opc2)]>,
+           ComplexDeprecationPredicate<"MCR">;
 def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
                   (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                          c_imm:$CRm, 0, pred:$p)>;
@@ -3900,7 +4063,9 @@ def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
              (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                           c_imm:$CRm, imm0_7:$opc2),
              [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
-                            imm:$CRm, imm:$opc2)]>;
+                            imm:$CRm, imm:$opc2)]> {
+  let Predicates = [IsThumb2, PreV8];
+}
 def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm",
                   (t2MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                           c_imm:$CRm, 0, pred:$p)>;
@@ -3915,7 +4080,9 @@ def : t2InstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
 
 def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1,
              (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
-                                  c_imm:$CRm, imm0_7:$opc2), []>;
+                                  c_imm:$CRm, imm0_7:$opc2), []> {
+  let Predicates = [IsThumb2, PreV8];
+}
 def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm",
                   (t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
                           c_imm:$CRm, 0, pred:$p)>;
@@ -3933,17 +4100,22 @@ def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0,
                                        imm:$CRm)]>;
 def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0,
                            [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
-                                           GPR:$Rt2, imm:$CRm)]>;
+                                           GPR:$Rt2, imm:$CRm)]> {
+  let Predicates = [IsThumb2, PreV8];
+}
+
 /* from coprocessor to ARM core register */
 def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1>;
 
-def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1>;
+def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1> {
+  let Predicates = [IsThumb2, PreV8];
+}
 
 //===----------------------------------------------------------------------===//
 // Other Coprocessor Instructions.
 //
 
-def tCDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                  c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
                  "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                  [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
@@ -3964,6 +4136,8 @@ def tCDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
   let Inst{15-12} = CRd;
   let Inst{19-16} = CRn;
   let Inst{23-20} = opc1;
+
+  let Predicates = [IsThumb2, PreV8];
 }
 
 def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
@@ -3987,6 +4161,8 @@ def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
   let Inst{15-12} = CRd;
   let Inst{19-16} = CRn;
   let Inst{23-20} = opc1;
+
+  let Predicates = [IsThumb2, PreV8];
 }
 
 
@@ -4060,6 +4236,15 @@ def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
 def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
             (t2STRs     GPR:$val, t2addrmode_so_reg:$addr)>;
 
+let AddedComplexity = 8 in {
+  def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr),  (t2LDAB addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA  addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val),  (t2STLB GPR:$val, addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL  GPR:$val, addr_offset_none:$addr)>;
+}
+
 
 //===----------------------------------------------------------------------===//
 // Assembler aliases
@@ -4081,7 +4266,8 @@ def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
 
 // Aliases for ADD without the ".w" optional width specifier.
 def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
-        (t2ADDri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+        (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, 
+         cc_out:$s)>;
 def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
            (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
 def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
@@ -4156,9 +4342,9 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
                   (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
 
 // Memory barriers
-def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p)>, Requires<[IsThumb2, HasDB]>;
-def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p)>, Requires<[IsThumb2, HasDB]>;
-def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p)>, Requires<[IsThumb2, HasDB]>;
+def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p)>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p)>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p)>, Requires<[HasDB]>;
 
 // Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
 // width specifier.
@@ -4185,7 +4371,7 @@ def : t2InstAlias<"ldrsh${p} $Rt, $addr",
                   (t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
 
 def : t2InstAlias<"ldr${p} $Rt, $addr",
-                  (t2LDRpci GPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+                  (t2LDRpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
 def : t2InstAlias<"ldrb${p} $Rt, $addr",
                   (t2LDRBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
 def : t2InstAlias<"ldrh${p} $Rt, $addr",
@@ -4347,16 +4533,16 @@ def : t2InstAlias<"mvn${p} $Rd, $imm",
                   (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
 // Same for AND <--> BIC
 def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm",
-                  (t2ANDri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                  (t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
 def : t2InstAlias<"bic${s}${p} $Rdn, $imm",
-                  (t2ANDri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                  (t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
 def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm",
-                  (t2BICri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                  (t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
 def : t2InstAlias<"and${s}${p} $Rdn, $imm",
-                  (t2BICri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                  (t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
 // Likewise, "add Rd, t2_so_imm_neg" -> sub
 def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
@@ -4398,7 +4584,7 @@ def : t2InstAlias<"adr${p} $Rd, $addr",
 
 // LDR(literal) w/ alternate [pc, #imm] syntax.
 def t2LDRpcrel   : t2AsmPseudo<"ldr${p} $Rt, $addr",
-                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+                         (ins GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
 def t2LDRBpcrel  : t2AsmPseudo<"ldrb${p} $Rt, $addr",
                          (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
 def t2LDRHpcrel  : t2AsmPseudo<"ldrh${p} $Rt, $addr",
@@ -4409,7 +4595,7 @@ def t2LDRSHpcrel  : t2AsmPseudo<"ldrsh${p} $Rt, $addr",
                          (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
     // Version w/ the .w suffix.
 def : t2InstAlias<"ldr${p}.w $Rt, $addr",
-                  (t2LDRpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p), 0>;
+                  (t2LDRpcrel GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p), 0>;
 def : t2InstAlias<"ldrb${p}.w $Rt, $addr",
                   (t2LDRBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
 def : t2InstAlias<"ldrh${p}.w $Rt, $addr",
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index f9cfa15..a8cdc5c 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -333,45 +333,52 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
   let D = VFPNeonA8Domain;
 }
 
-multiclass vsel_inst<string op, bits<2> opc> {
-  let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+multiclass vsel_inst<string op, bits<2> opc, int CC> {
+  let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
+      Uses = [CPSR], AddedComplexity = 4 in {
     def S : ASbInp<0b11100, opc, 0,
                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                    NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"),
-                   []>, Requires<[HasV8FP]>;
+                   [(set SPR:$Sd, (ARMcmov SPR:$Sm, SPR:$Sn, CC))]>,
+                   Requires<[HasFPARMv8]>;
 
     def D : ADbInp<0b11100, opc, 0,
                    (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
                    NoItinerary, !strconcat("vsel", op, ".f64\t$Dd, $Dn, $Dm"),
-                   []>, Requires<[HasV8FP]>;
+                   [(set DPR:$Dd, (ARMcmov (f64 DPR:$Dm), (f64 DPR:$Dn), CC))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]>;
   }
 }
 
-defm VSELGT : vsel_inst<"gt", 0b11>;
-defm VSELGE : vsel_inst<"ge", 0b10>;
-defm VSELEQ : vsel_inst<"eq", 0b00>;
-defm VSELVS : vsel_inst<"vs", 0b01>;
+// The CC constants here match ARMCC::CondCodes.
+defm VSELGT : vsel_inst<"gt", 0b11, 12>;
+defm VSELGE : vsel_inst<"ge", 0b10, 10>;
+defm VSELEQ : vsel_inst<"eq", 0b00, 0>;
+defm VSELVS : vsel_inst<"vs", 0b01, 6>;
 
-multiclass vmaxmin_inst<string op, bit opc> {
+multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
   let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
     def S : ASbInp<0b11101, 0b00, opc,
                    (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
                    NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"),
-                   []>, Requires<[HasV8FP]>;
+                   [(set SPR:$Sd, (SD SPR:$Sn, SPR:$Sm))]>,
+                   Requires<[HasFPARMv8]>;
 
     def D : ADbInp<0b11101, 0b00, opc,
                    (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
                    NoItinerary, !strconcat(op, ".f64\t$Dd, $Dn, $Dm"),
-                   []>, Requires<[HasV8FP]>;
+                   [(set DPR:$Dd, (f64 (SD (f64 DPR:$Dn), (f64 DPR:$Dm))))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]>;
   }
 }
 
-defm VMAXNM : vmaxmin_inst<"vmaxnm", 0>;
-defm VMINNM : vmaxmin_inst<"vminnm", 1>;
+defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, ARMvmaxnm>;
+defm VMINNM : vmaxmin_inst<"vminnm", 1, ARMvminnm>;
 
 // Match reassociated forms only if not sign dependent rounding.
 def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
-          (VNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+          (VNMULD DPR:$a, DPR:$b)>,
+          Requires<[NoHonorSignDependentRounding,HasDPVFP]>;
 def : Pat<(fmul (fneg SPR:$a), SPR:$b),
           (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
 
@@ -502,6 +509,8 @@ def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
   let Inst{11-8}  = 0b1011;
   let Inst{7-6}   = 0b11;
   let Inst{4}     = 0;
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 // Between half, single and double-precision.  For disassembly only.
@@ -532,7 +541,7 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
 def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
                    (outs DPR:$Dd), (ins SPR:$Sm),
                    NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
-                   []>, Requires<[HasV8FP]> {
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
   // Instruction operands.
   bits<5> Sm;
 
@@ -544,7 +553,7 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
 def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
                    (outs SPR:$Sd), (ins DPR:$Dm),
                    NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
-                   []>, Requires<[HasV8FP]> {
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
   // Instruction operands.
   bits<5> Sd;
   bits<5> Dm;
@@ -559,7 +568,7 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
 def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
                    (outs DPR:$Dd), (ins SPR:$Sm),
                    NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm",
-                   []>, Requires<[HasV8FP]> {
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
   // Instruction operands.
   bits<5> Sm;
 
@@ -571,7 +580,7 @@ def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
 def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
                    (outs SPR:$Sd), (ins DPR:$Dm),
                    NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm",
-                   []>, Requires<[HasV8FP]> {
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
   // Instruction operands.
   bits<5> Sd;
   bits<5> Dm;
@@ -588,21 +597,21 @@ multiclass vcvt_inst<string opc, bits<2> rm> {
     def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
                     (outs SPR:$Sd), (ins SPR:$Sm),
                     NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"),
-                    []>, Requires<[HasV8FP]> {
+                    []>, Requires<[HasFPARMv8]> {
       let Inst{17-16} = rm;
     }
 
     def US : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
                     (outs SPR:$Sd), (ins SPR:$Sm),
                     NoItinerary, !strconcat("vcvt", opc, ".u32.f32\t$Sd, $Sm"),
-                    []>, Requires<[HasV8FP]> {
+                    []>, Requires<[HasFPARMv8]> {
       let Inst{17-16} = rm;
     }
 
     def SD : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
                     (outs SPR:$Sd), (ins DPR:$Dm),
                     NoItinerary, !strconcat("vcvt", opc, ".s32.f64\t$Sd, $Dm"),
-                    []>, Requires<[HasV8FP]> {
+                    []>, Requires<[HasFPARMv8, HasDPVFP]> {
       bits<5> Dm;
 
       let Inst{17-16} = rm;
@@ -616,7 +625,7 @@ multiclass vcvt_inst<string opc, bits<2> rm> {
     def UD : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
                     (outs SPR:$Sd), (ins DPR:$Dm),
                     NoItinerary, !strconcat("vcvt", opc, ".u32.f64\t$Sd, $Dm"),
-                    []>, Requires<[HasV8FP]> {
+                    []>, Requires<[HasFPARMv8, HasDPVFP]> {
       bits<5> Dm;
 
       let Inst{17-16} = rm;
@@ -652,17 +661,24 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2> {
   def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0,
                (outs SPR:$Sd), (ins SPR:$Sm),
                NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm",
-               []>, Requires<[HasV8FP]> {
+               []>, Requires<[HasFPARMv8]> {
     let Inst{7} = op2;
     let Inst{16} = op;
   }
   def D : ADuI<0b11101, 0b11, 0b0110, 0b11, 0,
                 (outs DPR:$Dd), (ins DPR:$Dm),
                 NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm",
-                []>, Requires<[HasV8FP]> {
+                []>, Requires<[HasFPARMv8, HasDPVFP]> {
     let Inst{7} = op2;
     let Inst{16} = op;
   }
+
+  def : InstAlias<!strconcat("vrint", opc, "$p.f32.f32\t$Sd, $Sm"),
+                  (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p)>,
+        Requires<[HasFPARMv8]>;
+  def : InstAlias<!strconcat("vrint", opc, "$p.f64.f64\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p)>,
+        Requires<[HasFPARMv8,HasDPVFP]>;
 }
 
 defm VRINTZ : vrint_inst_zrx<"z", 0, 1>;
@@ -674,21 +690,23 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm> {
     def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0,
                    (outs SPR:$Sd), (ins SPR:$Sm),
                    NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"),
-                   []>, Requires<[HasV8FP]> {
+                   []>, Requires<[HasFPARMv8]> {
       let Inst{17-16} = rm;
     }
     def D : ADuInp<0b11101, 0b11, 0b1000, 0b01, 0,
                    (outs DPR:$Dd), (ins DPR:$Dm),
                    NoItinerary, !strconcat("vrint", opc, ".f64\t$Dd, $Dm"),
-                   []>, Requires<[HasV8FP]> {
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
       let Inst{17-16} = rm;
     }
   }
 
   def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"),
-                  (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm)>;
+                  (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm)>,
+        Requires<[HasFPARMv8]>;
   def : InstAlias<!strconcat("vrint", opc, ".f64.f64\t$Dd, $Dm"),
-                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>;
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>,
+        Requires<[HasFPARMv8,HasDPVFP]>;
 }
 
 defm VRINTA : vrint_inst_anpm<"a", 0b00>;
@@ -885,6 +903,8 @@ class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
   let Inst{5}     = Sm{0};
   let Inst{15-12} = Dd{3-0};
   let Inst{22}    = Dd{4};
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -956,6 +976,8 @@ class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
   let Inst{5}     = Dm{4};
   let Inst{15-12} = Sd{4-1};
   let Inst{22}    = Sd{0};
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1077,6 +1099,8 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
   // if dp_operation then UInt(D:Vd) else UInt(Vd:D);
   let Inst{22} = dst{4};
   let Inst{15-12} = dst{3-0};
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
@@ -1189,7 +1213,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0,
                  [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 
 def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1205,7 +1229,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
 
 def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
@@ -1216,7 +1240,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0,
                  [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 
 def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1232,7 +1256,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
 
 def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1243,7 +1267,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
                   [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
                 RegConstraint<"$Ddin = $Dd">,
-                Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+                Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 
 def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1259,7 +1283,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
 
 def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
           (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
           (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1270,7 +1294,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
                   [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                            (f64 DPR:$Ddin)))]>,
                RegConstraint<"$Ddin = $Dd">,
-               Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+               Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 
 def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1285,7 +1309,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
 
 def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
           (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
           (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1299,7 +1323,7 @@ def VFMAD : ADbI<0b11101, 0b10, 0, 0,
                  [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP4,UseFusedMAC]>;
+              Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 
 def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1314,7 +1338,7 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
 
 def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP4,UseFusedMAC]>;
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1323,7 +1347,7 @@ def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
 // (fma x, y, z) -> (vfms z, x, y)
 def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
           (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
           (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
@@ -1334,7 +1358,7 @@ def VFMSD : ADbI<0b11101, 0b10, 1, 0,
                  [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP4,UseFusedMAC]>;
+              Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 
 def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1349,7 +1373,7 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
 
 def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP4,UseFusedMAC]>;
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1358,14 +1382,14 @@ def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
 // (fma (fneg x), y, z) -> (vfms z, x, y)
 def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
           (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
           (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
 // (fma x, (fneg y), z) -> (vfms z, x, y)
 def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
           (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
           (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
@@ -1376,7 +1400,7 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
                   [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
                 RegConstraint<"$Ddin = $Dd">,
-                Requires<[HasVFP4,UseFusedMAC]>;
+                Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 
 def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1391,7 +1415,7 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
 
 def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
           (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP4,UseFusedMAC]>;
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
           (VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1400,14 +1424,14 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
 // (fneg (fma x, y, z)) -> (vfnma z, x, y)
 def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
           (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
           (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
 // (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
 def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
           (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
           (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
@@ -1418,7 +1442,7 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
                   [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                            (f64 DPR:$Ddin)))]>,
                RegConstraint<"$Ddin = $Dd">,
-               Requires<[HasVFP4,UseFusedMAC]>;
+               Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 
 def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1432,7 +1456,7 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
 
 def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
           (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP4,UseFusedMAC]>;
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
           (VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1442,21 +1466,21 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
 // (fma x, y, (fneg z)) -> (vfnms z, x, y))
 def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
           (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
           (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
 // (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
 def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
           (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
           (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
 // (fneg (fma x, (fneg y), z) -> (vfnms z, x, y)
 def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
           (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
           (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
@@ -1466,15 +1490,17 @@ def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
 //
 
 let neverHasSideEffects = 1 in {
-def VMOVDcc  : ARMPseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, pred:$p),
-                    4, IIC_fpUNA64,
-                    [/*(set DPR:$Dd, (ARMcmov DPR:$Dn, DPR:$Dm, imm:$cc))*/]>,
-                 RegConstraint<"$Dn = $Dd">;
-
-def VMOVScc  : ARMPseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, pred:$p),
-                    4, IIC_fpUNA32,
-                    [/*(set SPR:$Sd, (ARMcmov SPR:$Sn, SPR:$Sm, imm:$cc))*/]>,
-                 RegConstraint<"$Sn = $Sd">;
+def VMOVDcc  : PseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, cmovpred:$p),
+                    IIC_fpUNA64,
+                    [(set (f64 DPR:$Dd),
+                          (ARMcmov DPR:$Dn, DPR:$Dm, cmovpred:$p))]>,
+               RegConstraint<"$Dn = $Dd">, Requires<[HasVFP2,HasDPVFP]>;
+
+def VMOVScc  : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
+                    IIC_fpUNA32,
+                    [(set (f32 SPR:$Sd),
+                          (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
+               RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>;
 } // neverHasSideEffects
 
 //===----------------------------------------------------------------------===//
@@ -1520,6 +1546,8 @@ let Uses = [FPSCR] in {
                               "vmrs", "\t$Rt, mvfr0", []>;
   def VMRS_MVFR1 : MovFromVFP<0b0110 /* mvfr1 */, (outs GPR:$Rt), (ins),
                               "vmrs", "\t$Rt, mvfr1", []>;
+  def VMRS_MVFR2 : MovFromVFP<0b0101 /* mvfr2 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr2", []>, Requires<[HasFPARMv8]>;
   def VMRS_FPINST : MovFromVFP<0b1001 /* fpinst */, (outs GPR:$Rt), (ins),
                               "vmrs", "\t$Rt, fpinst", []>;
   def VMRS_FPINST2 : MovFromVFP<0b1010 /* fpinst2 */, (outs GPR:$Rt), (ins),
@@ -1573,7 +1601,8 @@ let isReMaterializable = 1 in {
 def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
                     VFPMiscFrm, IIC_fpUNA64,
                     "vmov", ".f64\t$Dd, $imm",
-                    [(set DPR:$Dd, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> {
+                    [(set DPR:$Dd, vfp_f64imm:$imm)]>,
+              Requires<[HasVFP3,HasDPVFP]> {
   bits<5> Dd;
   bits<8> imm;
 
@@ -1655,23 +1684,23 @@ def : VFP2MnemonicAlias<"fmrx", "vmrs">;
 def : VFP2MnemonicAlias<"fmxr", "vmsr">;
 
 // Be friendly and accept the old form of zero-compare
-def : VFP2InstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
+def : VFP2DPInstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
 def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>;
 
 
 def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
 def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm",
                     (VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
-def : VFP2InstAlias<"faddd${p} $Dd, $Dn, $Dm",
-                    (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2DPInstAlias<"faddd${p} $Dd, $Dn, $Dm",
+                      (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
 def : VFP2InstAlias<"fsubs${p} $Sd, $Sn, $Sm",
                     (VSUBS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
-def : VFP2InstAlias<"fsubd${p} $Dd, $Dn, $Dm",
-                    (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2DPInstAlias<"fsubd${p} $Dd, $Dn, $Dm",
+                      (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
 
 // No need for the size suffix on VSQRT. It's implied by the register classes.
 def : VFP2InstAlias<"vsqrt${p} $Sd, $Sm", (VSQRTS SPR:$Sd, SPR:$Sm, pred:$p)>;
-def : VFP2InstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : VFP2DPInstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
 
 // VLDR/VSTR accept an optional type suffix.
 def : VFP2InstAlias<"vldr${p}.32 $Sd, $addr",
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 1803a8a..61596d5 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -90,6 +90,10 @@ namespace {
     typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
     typedef MemOpQueue::iterator MemOpQueueIter;
 
+    void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
+                          const MemOpQueue &MemOps, unsigned DefReg,
+                          unsigned RangeBegin, unsigned RangeEnd);
+
     bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                   int Offset, unsigned Base, bool BaseKill, int Opcode,
                   ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
@@ -360,6 +364,62 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   return true;
 }
 
+/// \brief Find all instructions using a given imp-def within a range.
+///
+/// We are trying to combine a range of instructions, one of which (located at
+/// position RangeBegin) implicitly defines a register. The final LDM/STM will
+/// be placed at RangeEnd, and so any uses of this definition between RangeStart
+/// and RangeEnd must be modified to use an undefined value.
+///
+/// The live range continues until we find a second definition or one of the
+/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
+/// we must consider all uses and decide which are relevant in a second pass.
+void ARMLoadStoreOpt::findUsesOfImpDef(
+    SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
+    unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
+  std::map<unsigned, MachineOperand *> Uses;
+  unsigned LastLivePos = RangeEnd;
+
+  // First we find all uses of this register with Position between RangeBegin
+  // and RangeEnd, any or all of these could be uses of a definition at
+  // RangeBegin. We also record the latest position a definition at RangeBegin
+  // would be considered live.
+  for (unsigned i = 0; i < MemOps.size(); ++i) {
+    MachineInstr &MI = *MemOps[i].MBBI;
+    unsigned MIPosition = MemOps[i].Position;
+    if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
+      continue;
+
+    // If this instruction defines the register, then any later use will be of
+    // that definition rather than ours.
+    if (MI.definesRegister(DefReg))
+      LastLivePos = std::min(LastLivePos, MIPosition);
+
+    MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
+    if (!UseOp)
+      continue;
+
+    // If this instruction kills the register then (assuming liveness is
+    // correct when we start) we don't need to think about anything after here.
+    if (UseOp->isKill())
+      LastLivePos = std::min(LastLivePos, MIPosition);
+
+    Uses[MIPosition] = UseOp;
+  }
+
+  // Now we traverse the list of all uses, and append the ones that actually use
+  // our definition to the requested list.
+  for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
+                                                      E = Uses.end();
+       I != E; ++I) {
+    // List is sorted by position so once we've found one out of range there
+    // will be no more to consider.
+    if (I->first > LastLivePos)
+      break;
+    UsesOfImpDefs.push_back(I->second);
+  }
+}
+
 // MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
 // success.
 void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
@@ -392,6 +452,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
 
   SmallVector<std::pair<unsigned, bool>, 8> Regs;
   SmallVector<unsigned, 8> ImpDefs;
+  SmallVector<MachineOperand *, 8> UsesOfImpDefs;
   for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
     unsigned Reg = memOps[i].Reg;
     // If we are inserting the merged operation after an operation that
@@ -406,6 +467,12 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
       unsigned DefReg = MO->getReg();
       if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
         ImpDefs.push_back(DefReg);
+
+      // There may be other uses of the definition between this instruction and
+      // the eventual LDM/STM position. These should be marked undef if the
+      // merge takes place.
+      findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
+                       insertPos);
     }
   }
 
@@ -418,6 +485,16 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
 
   // Merge succeeded, update records.
   Merges.push_back(prior(Loc));
+
+  // In gathering loads together, we may have moved the imp-def of a register
+  // past one of its uses. This is OK, since we know better than the rest of
+  // LLVM what's OK with ARM loads and stores; but we still have to adjust the
+  // affected uses.
+  for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
+                                                   E = UsesOfImpDefs.end();
+       I != E; ++I)
+    (*I)->setIsUndef();
+
   for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
     // Remove kill flags from any memops that come before insertPos.
     if (Regs[i-memOpsBegin].second) {
@@ -489,7 +566,10 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
     if (Reg != ARM::SP &&
         NewOffset == Offset + (int)Size &&
         ((isNotVFP && RegNum > PRegNum) ||
-         ((Count < Limit) && RegNum == PRegNum+1))) {
+         ((Count < Limit) && RegNum == PRegNum+1)) &&
+        // On Swift we don't want vldm/vstm to start with a odd register num
+        // because Q register unaligned vldm/vstm need more uops.
+        (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
       Offset += Size;
       PRegNum = RegNum;
       ++Count;
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index b641483..e12c9c6 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -82,7 +82,7 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
         MO.getMBB()->getSymbol(), OutContext));
     break;
   case MachineOperand::MO_GlobalAddress:
-    MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal()));
+    MCOp = GetSymbolRef(MO, getSymbol(MO.getGlobal()));
     break;
   case MachineOperand::MO_ExternalSymbol:
    MCOp = GetSymbolRef(MO,
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index d9ec4fd..010edf3 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -84,12 +84,6 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   unsigned GPRCS2Size;
   unsigned DPRCSSize;
 
-  /// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices
-  /// which belong to these spill areas.
-  BitVector GPRCS1Frames;
-  BitVector GPRCS2Frames;
-  BitVector DPRCSFrames;
-
   /// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
   /// the aligned portion of the stack frame.  This is always a contiguous
   /// sequence of D-registers starting from d8.
@@ -128,7 +122,6 @@ public:
     LRSpilledForFarJump(false),
     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
     GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
-    GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
     NumAlignedDPRCS2Regs(0),
     JumpTableUId(0), PICLabelUId(0),
     VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
@@ -141,7 +134,6 @@ public:
     LRSpilledForFarJump(false),
     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
     GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
-    GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
     JumpTableUId(0), PICLabelUId(0),
     VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
 
@@ -190,59 +182,6 @@ public:
   void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
   void setDPRCalleeSavedAreaSize(unsigned s)  { DPRCSSize = s; }
 
-  bool isGPRCalleeSavedArea1Frame(int fi) const {
-    if (fi < 0 || fi >= (int)GPRCS1Frames.size())
-      return false;
-    return GPRCS1Frames[fi];
-  }
-  bool isGPRCalleeSavedArea2Frame(int fi) const {
-    if (fi < 0 || fi >= (int)GPRCS2Frames.size())
-      return false;
-    return GPRCS2Frames[fi];
-  }
-  bool isDPRCalleeSavedAreaFrame(int fi) const {
-    if (fi < 0 || fi >= (int)DPRCSFrames.size())
-      return false;
-    return DPRCSFrames[fi];
-  }
-
-  void addGPRCalleeSavedArea1Frame(int fi) {
-    if (fi >= 0) {
-      int Size = GPRCS1Frames.size();
-      if (fi >= Size) {
-        Size *= 2;
-        if (fi >= Size)
-          Size = fi+1;
-        GPRCS1Frames.resize(Size);
-      }
-      GPRCS1Frames[fi] = true;
-    }
-  }
-  void addGPRCalleeSavedArea2Frame(int fi) {
-    if (fi >= 0) {
-      int Size = GPRCS2Frames.size();
-      if (fi >= Size) {
-        Size *= 2;
-        if (fi >= Size)
-          Size = fi+1;
-        GPRCS2Frames.resize(Size);
-      }
-      GPRCS2Frames[fi] = true;
-    }
-  }
-  void addDPRCalleeSavedAreaFrame(int fi) {
-    if (fi >= 0) {
-      int Size = DPRCSFrames.size();
-      if (fi >= Size) {
-        Size *= 2;
-        if (fi >= Size)
-          Size = fi+1;
-        DPRCSFrames.resize(Size);
-      }
-      DPRCSFrames[fi] = true;
-    }
-  }
-
   unsigned createJumpTableUId() {
     return JumpTableUId++;
   }
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index bb7d358..d045761 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -172,6 +172,7 @@ def ITSTATE    : ARMReg<4, "itstate">;
 
 // Special Registers - only available in privileged mode.
 def FPSID   : ARMReg<0,  "fpsid">;
+def MVFR2   : ARMReg<5,  "mvfr2">;
 def MVFR1   : ARMReg<6,  "mvfr1">;
 def MVFR0   : ARMReg<7,  "mvfr0">;
 def FPEXC   : ARMReg<8,  "fpexc">;
@@ -251,7 +252,7 @@ def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
 // to the saved value before the tail call, which would clobber a call address.
 // Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of
 // this class and the preceding one(!)  This is what we want.
-def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R9, R12)> {
+def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
   let AltOrders = [(and tcGPR, tGPR)];
   let AltOrderSelect = [{
       return MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 74ee50b..603e775 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1879,6 +1879,10 @@ def CortexA9Itineraries : ProcessorItineraries<
 // The following definitions describe the simpler per-operand machine model.
 // This works with MachineScheduler and will eventually replace itineraries.
 
+class A9WriteLMOpsListType<list<WriteSequence> writes> {
+  list <WriteSequence> Writes = writes;
+  SchedMachineModel SchedModel = ?;
+}
 
 // Cortex-A9 machine model for scheduling and other instruction cost heuristics.
 def CortexA9Model : SchedMachineModel {
@@ -2011,7 +2015,7 @@ def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
 
 // Define a predicate to select the LDM based on number of memory addresses.
 def A9LMAdr#NumAddr#Pred :
-  SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+  SchedPredicate<"(TII->getNumLDMAddresses(MI)+1)/2 == "#NumAddr>;
 
 } // foreach NumAddr
 
@@ -2054,48 +2058,30 @@ def A9WriteL#NumAddr#Hi : WriteSequence<
 //===----------------------------------------------------------------------===//
 // LDM: Load multiple into 32-bit integer registers.
 
+def A9WriteLMOpsList : A9WriteLMOpsListType<
+                 [A9WriteL1, A9WriteL1Hi,
+                  A9WriteL2, A9WriteL2Hi,
+                  A9WriteL3, A9WriteL3Hi,
+                  A9WriteL4, A9WriteL4Hi,
+                  A9WriteL5, A9WriteL5Hi,
+                  A9WriteL6, A9WriteL6Hi,
+                  A9WriteL7, A9WriteL7Hi,
+                  A9WriteL8, A9WriteL8Hi]>;
+
 // A9WriteLM variants expand into a pair of writes for each 64-bit
 // value loaded. When the number of registers is odd, the last
 // A9WriteLnHi is naturally ignored because the instruction has no
 // following def operands.  These variants take no issue resource, so
 // they may need to be part of a WriteSequence that includes A9WriteIssue.
 def A9WriteLM : SchedWriteVariant<[
-  SchedVar<A9LMAdr1Pred, [A9WriteL1, A9WriteL1Hi]>,
-  SchedVar<A9LMAdr2Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi]>,
-  SchedVar<A9LMAdr3Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi]>,
-  SchedVar<A9LMAdr4Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi,
-                          A9WriteL4, A9WriteL4Hi]>,
-  SchedVar<A9LMAdr5Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi,
-                          A9WriteL4, A9WriteL4Hi,
-                          A9WriteL5, A9WriteL5Hi]>,
-  SchedVar<A9LMAdr6Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi,
-                          A9WriteL4, A9WriteL4Hi,
-                          A9WriteL5, A9WriteL5Hi,
-                          A9WriteL6, A9WriteL6Hi]>,
-  SchedVar<A9LMAdr7Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi,
-                          A9WriteL4, A9WriteL4Hi,
-                          A9WriteL5, A9WriteL5Hi,
-                          A9WriteL6, A9WriteL6Hi,
-                          A9WriteL7, A9WriteL7Hi]>,
-  SchedVar<A9LMAdr8Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi,
-                          A9WriteL4, A9WriteL4Hi,
-                          A9WriteL5, A9WriteL5Hi,
-                          A9WriteL6, A9WriteL6Hi,
-                          A9WriteL7, A9WriteL7Hi,
-                          A9WriteL8, A9WriteL8Hi]>,
+  SchedVar<A9LMAdr1Pred, A9WriteLMOpsList.Writes[0-1]>,
+  SchedVar<A9LMAdr2Pred, A9WriteLMOpsList.Writes[0-3]>,
+  SchedVar<A9LMAdr3Pred, A9WriteLMOpsList.Writes[0-5]>,
+  SchedVar<A9LMAdr4Pred, A9WriteLMOpsList.Writes[0-7]>,
+  SchedVar<A9LMAdr5Pred, A9WriteLMOpsList.Writes[0-9]>,
+  SchedVar<A9LMAdr6Pred, A9WriteLMOpsList.Writes[0-11]>,
+  SchedVar<A9LMAdr7Pred, A9WriteLMOpsList.Writes[0-13]>,
+  SchedVar<A9LMAdr8Pred, A9WriteLMOpsList.Writes[0-15]>,
   // For unknown LDMs, define the maximum number of writes, but only
   // make the first two consume resources.
   SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
@@ -2177,49 +2163,39 @@ def A9WriteLMfp#NumAddr#Hi : WriteSequence<
 // pair of writes for each 64-bit data loaded. When the number of
 // registers is odd, the last WriteLMfpnHi is naturally ignored because
 // the instruction has no following def operands.
+
+def A9WriteLMfpPostRAOpsList : A9WriteLMOpsListType<
+                 [A9WriteLMfp1, A9WriteLMfp2,       // 0-1
+                  A9WriteLMfp3, A9WriteLMfp4,       // 2-3
+                  A9WriteLMfp5, A9WriteLMfp6,       // 4-5
+                  A9WriteLMfp7, A9WriteLMfp8,       // 6-7
+                  A9WriteLMfp1Hi,                   // 8-8
+                  A9WriteLMfp2Hi, A9WriteLMfp2Hi,   // 9-10
+                  A9WriteLMfp3Hi, A9WriteLMfp3Hi,   // 11-12
+                  A9WriteLMfp4Hi, A9WriteLMfp4Hi,   // 13-14
+                  A9WriteLMfp5Hi, A9WriteLMfp5Hi,   // 15-16
+                  A9WriteLMfp6Hi, A9WriteLMfp6Hi,   // 17-18
+                  A9WriteLMfp7Hi, A9WriteLMfp7Hi,   // 19-20
+                  A9WriteLMfp8Hi, A9WriteLMfp8Hi]>; // 21-22
+
 def A9WriteLMfpPostRA : SchedWriteVariant<[
-  SchedVar<A9LMAdr1Pred, [A9WriteLMfp1, A9WriteLMfp1Hi]>,
-  SchedVar<A9LMAdr2Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi]>,
-  SchedVar<A9LMAdr3Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi]>,
-  SchedVar<A9LMAdr4Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi,
-                          A9WriteLMfp4, A9WriteLMfp4Hi]>,
-  SchedVar<A9LMAdr5Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi,
-                          A9WriteLMfp4, A9WriteLMfp4Hi,
-                          A9WriteLMfp5, A9WriteLMfp5Hi]>,
-  SchedVar<A9LMAdr6Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi,
-                          A9WriteLMfp4, A9WriteLMfp4Hi,
-                          A9WriteLMfp5, A9WriteLMfp5Hi,
-                          A9WriteLMfp6, A9WriteLMfp6Hi]>,
-  SchedVar<A9LMAdr7Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi,
-                          A9WriteLMfp4, A9WriteLMfp4Hi,
-                          A9WriteLMfp5, A9WriteLMfp5Hi,
-                          A9WriteLMfp6, A9WriteLMfp6Hi,
-                          A9WriteLMfp7, A9WriteLMfp7Hi]>,
-  SchedVar<A9LMAdr8Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi,
-                          A9WriteLMfp4, A9WriteLMfp4Hi,
-                          A9WriteLMfp5, A9WriteLMfp5Hi,
-                          A9WriteLMfp6, A9WriteLMfp6Hi,
-                          A9WriteLMfp7, A9WriteLMfp7Hi,
-                          A9WriteLMfp8, A9WriteLMfp8Hi]>,
+  SchedVar<A9LMAdr1Pred, A9WriteLMfpPostRAOpsList.Writes[0-0, 8-8]>,
+  SchedVar<A9LMAdr2Pred, A9WriteLMfpPostRAOpsList.Writes[0-1, 9-10]>,
+  SchedVar<A9LMAdr3Pred, A9WriteLMfpPostRAOpsList.Writes[0-2, 10-12]>,
+  SchedVar<A9LMAdr4Pred, A9WriteLMfpPostRAOpsList.Writes[0-3, 11-14]>,
+  SchedVar<A9LMAdr5Pred, A9WriteLMfpPostRAOpsList.Writes[0-4, 12-16]>,
+  SchedVar<A9LMAdr6Pred, A9WriteLMfpPostRAOpsList.Writes[0-5, 13-18]>,
+  SchedVar<A9LMAdr7Pred, A9WriteLMfpPostRAOpsList.Writes[0-6, 14-20]>,
+  SchedVar<A9LMAdr8Pred, A9WriteLMfpPostRAOpsList.Writes[0-7, 15-22]>,
   // For unknown LDMs, define the maximum number of writes, but only
-  // make the first two consume resources.
-  SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                             A9WriteLMfp2, A9WriteLMfp2Hi,
-                             A9WriteLMfp3Hi, A9WriteLMfp3Hi,
-                             A9WriteLMfp4Hi, A9WriteLMfp4Hi,
+  // make the first two consume resources. We are optimizing for the case
+  // where the operands are DPRs, and this determines the first eight
+  // types. The remaining eight types are filled to cover the case
+  // where the operands are SPRs.
+  SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp2,
+                             A9WriteLMfp3Hi, A9WriteLMfp4Hi,
+                             A9WriteLMfp5Hi, A9WriteLMfp6Hi,
+                             A9WriteLMfp7Hi, A9WriteLMfp8Hi,
                              A9WriteLMfp5Hi, A9WriteLMfp5Hi,
                              A9WriteLMfp6Hi, A9WriteLMfp6Hi,
                              A9WriteLMfp7Hi, A9WriteLMfp7Hi,
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index 2a41616..8d7dbc2 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -1345,20 +1345,25 @@ let SchedModel = SwiftModel in {
   // 4.2.20 Integer Load Signextended
   def SwiftWriteP2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
     let Latency = 3;
+    let NumMicroOps = 2;
   }
   def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
     let Latency = 4;
+    let NumMicroOps = 2;
   }
   def SwiftWriteP2P01P01FourCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01,
                                                    SwiftUnitP01]> {
     let Latency = 4;
+    let NumMicroOps = 3;
   }
   def SwiftWriteP2P2ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2]> {
     let Latency = 3;
+    let NumMicroOps = 2;
   }
   def SwiftWriteP2P2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2,
-                                                    SwiftUnitP01]> {
+                                                   SwiftUnitP01]> {
     let Latency = 3;
+    let NumMicroOps = 3;
   }
   def SwiftWrBackOne : SchedWriteRes<[]> {
     let Latency = 1;
@@ -1399,7 +1404,10 @@ let SchedModel = SwiftModel in {
     def SwiftWriteLM#Lat#Cy : SchedWriteRes<[SwiftUnitP2]> {
       let Latency = Lat;
     }
-    def SwiftWriteLM#Lat#CyNo : SchedWriteRes<[]> { let Latency = Lat; }
+    def SwiftWriteLM#Lat#CyNo : SchedWriteRes<[]> {
+      let Latency = Lat;
+      let NumMicroOps = 0;
+    }
   }
   // Predicate.
   foreach NumAddr = 1-16 in {
@@ -1520,6 +1528,7 @@ let SchedModel = SwiftModel in {
   // 4.2.25 Integer Store, Multiple
   def SwiftWriteStIncAddr : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
     let Latency = 0;
+    let NumMicroOps = 2;
   }
   foreach NumAddr = 1-16 in {
      def SwiftWriteSTM#NumAddr : WriteSequence<[SwiftWriteStIncAddr], NumAddr>;
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 0a0f30c..8351c63 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -32,7 +32,7 @@ ReserveR9("arm-reserve-r9", cl::Hidden,
           cl::desc("Reserve R9, making it unavailable as GPR"));
 
 static cl::opt<bool>
-DarwinUseMOVT("arm-darwin-use-movt", cl::init(true), cl::Hidden);
+ArmUseMOVT("arm-use-movt", cl::init(true), cl::Hidden);
 
 static cl::opt<bool>
 UseFusedMulOps("arm-use-mulops",
@@ -57,10 +57,28 @@ Align(cl::desc("Load/store alignment support"),
                      "Allow unaligned memory accesses"),
           clEnumValEnd));
 
+enum ITMode {
+  DefaultIT,
+  RestrictedIT,
+  NoRestrictedIT
+};
+
+static cl::opt<ITMode>
+IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
+   cl::ZeroOrMore,
+   cl::values(clEnumValN(DefaultIT, "arm-default-it",
+                         "Generate IT block based on arch"),
+              clEnumValN(RestrictedIT, "arm-restrict-it",
+                         "Disallow deprecated IT based on ARMv8"),
+              clEnumValN(NoRestrictedIT, "arm-no-restrict-it",
+                         "Allow IT blocks based on ARMv7"),
+              clEnumValEnd));
+
 ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
                            const std::string &FS, const TargetOptions &Options)
   : ARMGenSubtargetInfo(TT, CPU, FS)
   , ARMProcFamily(Others)
+  , ARMProcClass(None)
   , stackAlignment(4)
   , CPUString(CPU)
   , TargetTriple(TT)
@@ -75,13 +93,14 @@ void ARMSubtarget::initializeEnvironment() {
   HasV5TOps = false;
   HasV5TEOps = false;
   HasV6Ops = false;
+  HasV6MOps = false;
   HasV6T2Ops = false;
   HasV7Ops = false;
   HasV8Ops = false;
   HasVFPv2 = false;
   HasVFPv3 = false;
   HasVFPv4 = false;
-  HasV8FP = false;
+  HasFPARMv8 = false;
   HasNEON = false;
   UseNEONForSinglePrecisionFP = false;
   UseMulOps = UseFusedMulOps;
@@ -90,7 +109,6 @@ void ARMSubtarget::initializeEnvironment() {
   SlowFPBrcc = false;
   InThumbMode = false;
   HasThumb2 = false;
-  IsMClass = false;
   NoARM = false;
   PostRAScheduler = false;
   IsR9Reserved = ReserveR9;
@@ -107,9 +125,12 @@ void ARMSubtarget::initializeEnvironment() {
   AvoidMOVsShifterOperand = false;
   HasRAS = false;
   HasMPExtension = false;
+  HasVirtualization = false;
   FPOnlySP = false;
   HasPerfMon = false;
   HasTrustZone = false;
+  HasCrypto = false;
+  HasCRC = false;
   AllowsUnalignedMem = false;
   Thumb2DSP = false;
   UseNaClTrap = false;
@@ -133,8 +154,13 @@ void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
 }
 
 void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
-  if (CPUString.empty())
-    CPUString = "generic";
+  if (CPUString.empty()) {
+    if (isTargetIOS() && TargetTriple.getArchName().endswith("v7s"))
+      // Default to the Swift CPU when targeting armv7s/thumbv7s.
+      CPUString = "swift";
+    else
+      CPUString = "generic";
+  }
 
   // Insert the architecture feature derived from the target triple into the
   // feature string. This is important for setting features that are implied
@@ -152,7 +178,7 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
   // Thumb2 implies at least V6T2. FIXME: Fix tests to explicitly specify a
   // ARM version or CPU and then remove this.
   if (!HasV6T2Ops && hasThumb2())
-    HasV4TOps = HasV5TOps = HasV5TEOps = HasV6Ops = HasV6T2Ops = true;
+    HasV4TOps = HasV5TOps = HasV5TEOps = HasV6Ops = HasV6MOps = HasV6T2Ops = true;
 
   // Keep a pointer to static instruction cost data for the specified CPU.
   SchedModel = getSchedModelForCPU(CPUString);
@@ -169,11 +195,12 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
   if (isAAPCS_ABI())
     stackAlignment = 8;
 
-  if (!isTargetIOS())
-    UseMovt = hasV6T2Ops();
-  else {
+  UseMovt = hasV6T2Ops() && ArmUseMOVT;
+
+  if (!isTargetIOS()) {
+    IsR9Reserved = ReserveR9;
+  } else {
     IsR9Reserved = ReserveR9 | !HasV6Ops;
-    UseMovt = DarwinUseMOVT && hasV6T2Ops();
     SupportsTailCall = !getTargetTriple().isOSVersionLT(5, 0);
   }
 
@@ -207,6 +234,18 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
       break;
   }
 
+  switch (IT) {
+  case DefaultIT:
+    RestrictIT = hasV8Ops() ? true : false;
+    break;
+  case RestrictedIT:
+    RestrictIT = true;
+    break;
+  case NoRestrictedIT:
+    RestrictIT = false;
+    break;
+  }
+
   // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
   uint64_t Bits = getFeatureBits();
   if ((Bits & ARM::ProcA5 || Bits & ARM::ProcA8) && // Where this matters
@@ -271,12 +310,15 @@ unsigned ARMSubtarget::getMispredictionPenalty() const {
   return SchedModel->MispredictPenalty;
 }
 
+bool ARMSubtarget::hasSinCos() const {
+  return getTargetTriple().getOS() == Triple::IOS &&
+    !getTargetTriple().isOSVersionLT(7, 0);
+}
+
 bool ARMSubtarget::enablePostRAScheduler(
            CodeGenOpt::Level OptLevel,
            TargetSubtargetInfo::AntiDepBreakMode& Mode,
            RegClassVector& CriticalPathRCs) const {
-  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
-  CriticalPathRCs.clear();
-  CriticalPathRCs.push_back(&ARM::GPRRegClass);
+  Mode = TargetSubtargetInfo::ANTIDEP_NONE;
   return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
 }
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index ad7f1b3..5276901 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -31,29 +31,36 @@ class TargetOptions;
 class ARMSubtarget : public ARMGenSubtargetInfo {
 protected:
   enum ARMProcFamilyEnum {
-    Others, CortexA5, CortexA8, CortexA9, CortexA15, CortexR5, Swift
+    Others, CortexA5, CortexA8, CortexA9, CortexA15, CortexR5, Swift, CortexA53, CortexA57
+  };
+  enum ARMProcClassEnum {
+    None, AClass, RClass, MClass
   };
 
   /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
   ARMProcFamilyEnum ARMProcFamily;
 
+  /// ARMProcClass - ARM processor class: None, AClass, RClass or MClass.
+  ARMProcClassEnum ARMProcClass;
+
   /// HasV4TOps, HasV5TOps, HasV5TEOps,
-  /// HasV6Ops, HasV6T2Ops, HasV7Ops, HasV8Ops -
+  /// HasV6Ops, HasV6MOps, HasV6T2Ops, HasV7Ops, HasV8Ops -
   /// Specify whether target support specific ARM ISA variants.
   bool HasV4TOps;
   bool HasV5TOps;
   bool HasV5TEOps;
   bool HasV6Ops;
+  bool HasV6MOps;
   bool HasV6T2Ops;
   bool HasV7Ops;
   bool HasV8Ops;
 
-  /// HasVFPv2, HasVFPv3, HasVFPv4, HasV8FP, HasNEON - Specify what
+  /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
   /// floating point ISAs are supported.
   bool HasVFPv2;
   bool HasVFPv3;
   bool HasVFPv4;
-  bool HasV8FP;
+  bool HasFPARMv8;
   bool HasNEON;
 
   /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
@@ -82,10 +89,6 @@ protected:
   /// HasThumb2 - True if Thumb2 instructions are supported.
   bool HasThumb2;
 
-  /// IsMClass - True if the subtarget belongs to the 'M' profile of CPUs -
-  /// v6m, v7m for example.
-  bool IsMClass;
-
   /// NoARM - True if subtarget does not support ARM mode execution.
   bool NoARM;
 
@@ -147,6 +150,10 @@ protected:
   /// extension (ARMv7 only).
   bool HasMPExtension;
 
+  /// HasVirtualization - True if the subtarget supports the Virtualization
+  /// extension.
+  bool HasVirtualization;
+
   /// FPOnlySP - If true, the floating point unit only supports single
   /// precision.
   bool FPOnlySP;
@@ -159,11 +166,21 @@ protected:
   /// HasTrustZone - if true, processor supports TrustZone security extensions
   bool HasTrustZone;
 
+  /// HasCrypto - if true, processor supports Cryptography extensions
+  bool HasCrypto;
+
+  /// HasCRC - if true, processor supports CRC instructions
+  bool HasCRC;
+
   /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
   /// accesses for some types.  For details, see
   /// ARMTargetLowering::allowsUnalignedMemoryAccesses().
   bool AllowsUnalignedMem;
 
+  /// RestrictIT - If true, the subtarget disallows generation of deprecated IT
+  ///  blocks to conform to ARMv8 rule.
+  bool RestrictIT;
+
   /// Thumb2DSP - If true, the subtarget supports the v7 DSP (saturating arith
   /// and such) instructions in Thumb2 code.
   bool Thumb2DSP;
@@ -228,6 +245,7 @@ public:
   bool hasV5TOps()  const { return HasV5TOps;  }
   bool hasV5TEOps() const { return HasV5TEOps; }
   bool hasV6Ops()   const { return HasV6Ops;   }
+  bool hasV6MOps()  const { return HasV6MOps;  }
   bool hasV6T2Ops() const { return HasV6T2Ops; }
   bool hasV7Ops()   const { return HasV7Ops;  }
   bool hasV8Ops()   const { return HasV8Ops;  }
@@ -246,8 +264,11 @@ public:
   bool hasVFP2() const { return HasVFPv2; }
   bool hasVFP3() const { return HasVFPv3; }
   bool hasVFP4() const { return HasVFPv4; }
-  bool hasV8FP() const { return HasV8FP; }
+  bool hasFPARMv8() const { return HasFPARMv8; }
   bool hasNEON() const { return HasNEON;  }
+  bool hasCrypto() const { return HasCrypto; }
+  bool hasCRC() const { return HasCRC; }
+  bool hasVirtualization() const { return HasVirtualization; }
   bool useNEONForSinglePrecisionFP() const {
     return hasNEON() && UseNEONForSinglePrecisionFP; }
 
@@ -255,6 +276,9 @@ public:
   bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
   bool hasT2ExtractPack() const { return HasT2ExtractPack; }
   bool hasDataBarrier() const { return HasDataBarrier; }
+  bool hasAnyDataBarrier() const {
+    return HasDataBarrier || (hasV6Ops() && !isThumb());
+  }
   bool useMulOps() const { return UseMulOps; }
   bool useFPVMLx() const { return !SlowFPVMLx; }
   bool hasVMLxForwarding() const { return HasVMLxForwarding; }
@@ -275,10 +299,10 @@ public:
 
   const Triple &getTargetTriple() const { return TargetTriple; }
 
-  bool isTargetIOS() const { return TargetTriple.getOS() == Triple::IOS; }
+  bool isTargetIOS() const { return TargetTriple.isiOS(); }
   bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
-  bool isTargetNaCl() const { return TargetTriple.getOS() == Triple::NaCl; }
-  bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
   bool isTargetELF() const { return !isTargetDarwin(); }
   // ARM EABI is the bare-metal EABI described in ARM ABI documents and
   // can be accessed via -target arm-none-eabi. This is NOT GNUEABI.
@@ -296,8 +320,9 @@ public:
   bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
   bool isThumb2() const { return InThumbMode && HasThumb2; }
   bool hasThumb2() const { return HasThumb2; }
-  bool isMClass() const { return IsMClass; }
-  bool isARClass() const { return !IsMClass; }
+  bool isMClass() const { return ARMProcClass == MClass; }
+  bool isRClass() const { return ARMProcClass == RClass; }
+  bool isAClass() const { return ARMProcClass == AClass; }
 
   bool isR9Reserved() const { return IsR9Reserved; }
 
@@ -306,9 +331,15 @@ public:
 
   bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
 
+  bool restrictIT() const { return RestrictIT; }
+
   const std::string & getCPUString() const { return CPUString; }
 
   unsigned getMispredictionPenalty() const;
+  
+  /// This function returns true if the target has sincos() routine in its
+  /// compiler runtime or math libraries.
+  bool hasSinCos() const;
 
   /// enablePostRAScheduler - True at 'More' optimization.
   bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index e6dbcb6..be84bf6 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -196,8 +196,13 @@ bool ARMPassConfig::addPreSched2() {
   addPass(createARMExpandPseudoPass());
 
   if (getOptLevel() != CodeGenOpt::None) {
-    if (!getARMSubtarget().isThumb1Only())
+    if (!getARMSubtarget().isThumb1Only()) {
+      // in v8, IfConversion depends on Thumb instruction widths
+      if (getARMSubtarget().restrictIT() &&
+          !getARMSubtarget().prefers32BitThumb())
+        addPass(createThumb2SizeReductionPass());
       addPass(&IfConverterID);
+    }
   }
   if (getARMSubtarget().isThumb2())
     addPass(createThumb2ITBlockPass());
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index dfdf6ab..7ec71b2 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -47,7 +47,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
                         MCStreamer &Streamer) const {
   assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
 
-  return MCSymbolRefExpr::Create(Mang->getSymbol(GV),
+  return MCSymbolRefExpr::Create(getSymbol(*Mang, GV),
                                  MCSymbolRefExpr::VK_ARM_TARGET2,
                                  getContext());
 }
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 34576ba..6bbb38f 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -129,6 +129,9 @@ public:
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                   OperandValueKind Op1Info = OK_AnyValue,
                                   OperandValueKind Op2Info = OK_AnyValue) const;
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const;
   /// @}
 };
 
@@ -182,7 +185,7 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   assert(ISD && "Invalid opcode");
 
   // Single to/from double precision conversions.
-  static const CostTblEntry<MVT> NEONFltDblTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> NEONFltDblTbl[] = {
     // Vector fptrunc/fpext conversions.
     { ISD::FP_ROUND,   MVT::v2f64, 2 },
     { ISD::FP_EXTEND,  MVT::v2f32, 2 },
@@ -192,8 +195,7 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
                                           ISD == ISD::FP_EXTEND)) {
     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
-    int Idx = CostTableLookup<MVT>(NEONFltDblTbl, array_lengthof(NEONFltDblTbl),
-                                ISD, LT.second);
+    int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second);
     if (Idx != -1)
       return LT.first * NEONFltDblTbl[Idx].Cost;
   }
@@ -207,7 +209,8 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   // Some arithmetic, load and store operations have specific instructions
   // to cast up/down their types automatically at no extra cost.
   // TODO: Get these tables to know at least what the related operations are.
-  static const TypeConversionCostTblEntry<MVT> NEONVectorConversionTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  NEONVectorConversionTbl[] = {
     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
@@ -283,15 +286,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   };
 
   if (SrcTy.isVector() && ST->hasNEON()) {
-    int Idx = ConvertCostTableLookup<MVT>(NEONVectorConversionTbl,
-                                array_lengthof(NEONVectorConversionTbl),
-                                ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
     if (Idx != -1)
       return NEONVectorConversionTbl[Idx].Cost;
   }
 
   // Scalar float to integer conversions.
-  static const TypeConversionCostTblEntry<MVT> NEONFloatConversionTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  NEONFloatConversionTbl[] = {
     { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
     { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
     { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
@@ -314,16 +317,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
   };
   if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
-    int Idx = ConvertCostTableLookup<MVT>(NEONFloatConversionTbl,
-                                        array_lengthof(NEONFloatConversionTbl),
-                                        ISD, DstTy.getSimpleVT(),
-                                        SrcTy.getSimpleVT());
+    int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
     if (Idx != -1)
         return NEONFloatConversionTbl[Idx].Cost;
   }
 
   // Scalar integer to float conversions.
-  static const TypeConversionCostTblEntry<MVT> NEONIntegerConversionTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  NEONIntegerConversionTbl[] = {
     { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
     { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
     { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
@@ -347,16 +349,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   };
 
   if (SrcTy.isInteger() && ST->hasNEON()) {
-    int Idx = ConvertCostTableLookup<MVT>(NEONIntegerConversionTbl,
-                                       array_lengthof(NEONIntegerConversionTbl),
-                                       ISD, DstTy.getSimpleVT(),
-                                       SrcTy.getSimpleVT());
+    int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
     if (Idx != -1)
       return NEONIntegerConversionTbl[Idx].Cost;
   }
 
   // Scalar integer conversion costs.
-  static const TypeConversionCostTblEntry<MVT> ARMIntegerConversionTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  ARMIntegerConversionTbl[] = {
     // i16 -> i64 requires two dependent operations.
     { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
 
@@ -368,11 +369,8 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   };
 
   if (SrcTy.isInteger()) {
-    int Idx =
-      ConvertCostTableLookup<MVT>(ARMIntegerConversionTbl,
-                                  array_lengthof(ARMIntegerConversionTbl),
-                                  ISD, DstTy.getSimpleVT(),
-                                  SrcTy.getSimpleVT());
+    int Idx = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
     if (Idx != -1)
       return ARMIntegerConversionTbl[Idx].Cost;
   }
@@ -400,7 +398,8 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   // On NEON a a vector select gets lowered to vbsl.
   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
     // Lowering of some vector selects is currently far from perfect.
-    static const TypeConversionCostTblEntry<MVT> NEONVectorSelectTbl[] = {
+    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+    NEONVectorSelectTbl[] = {
       { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 },
       { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 },
       { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 },
@@ -412,10 +411,9 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     EVT SelCondTy = TLI->getValueType(CondTy);
     EVT SelValTy = TLI->getValueType(ValTy);
     if (SelCondTy.isSimple() && SelValTy.isSimple()) {
-      int Idx = ConvertCostTableLookup<MVT>(NEONVectorSelectTbl,
-                                            array_lengthof(NEONVectorSelectTbl),
-                                            ISD, SelCondTy.getSimpleVT(),
-                                            SelValTy.getSimpleVT());
+      int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
+                                       SelCondTy.getSimpleVT(),
+                                       SelValTy.getSimpleVT());
       if (Idx != -1)
         return NEONVectorSelectTbl[Idx].Cost;
     }
@@ -448,7 +446,7 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
   if (Kind != SK_Reverse)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  static const CostTblEntry<MVT> NEONShuffleTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
     // Reverse shuffle cost one instruction if we are shuffling within a double
     // word (vrev) or two if we shuffle a quad word (vrev, vext).
     { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
@@ -464,8 +462,7 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
 
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
 
-  int Idx = CostTableLookup<MVT>(NEONShuffleTbl, array_lengthof(NEONShuffleTbl),
-                                 ISD::VECTOR_SHUFFLE, LT.second);
+  int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
   if (Idx == -1)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
@@ -480,7 +477,7 @@ unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueK
 
   const unsigned FunctionCallDivCost = 20;
   const unsigned ReciprocalDivCost = 10;
-  static const CostTblEntry<MVT> CostTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> CostTbl[] = {
     // Division.
     // These costs are somewhat random. Choose a cost of 20 to indicate that
     // vectorizing devision (added function call) is going to be very expensive.
@@ -524,14 +521,37 @@ unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueK
   int Idx = -1;
 
   if (ST->hasNEON())
-    Idx = CostTableLookup<MVT>(CostTbl, array_lengthof(CostTbl), ISDOpcode,
-                               LT.second);
+    Idx = CostTableLookup(CostTbl, ISDOpcode, LT.second);
 
   if (Idx != -1)
     return LT.first * CostTbl[Idx].Cost;
 
-
-  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
-                                                     Op2Info);
+  unsigned Cost =
+      TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+
+  // This is somewhat of a hack. The problem that we are facing is that SROA
+  // creates a sequence of shift, and, or instructions to construct values.
+  // These sequences are recognized by the ISel and have zero-cost. Not so for
+  // the vectorized code. Because we have support for v2i64 but not i64 those
+  // sequences look particularily beneficial to vectorize.
+  // To work around this we increase the cost of v2i64 operations to make them
+  // seem less beneficial.
+  if (LT.second == MVT::v2i64 &&
+      Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+    Cost += 4;
+
+  return Cost;
 }
 
+unsigned ARMTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) const {
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+
+  if (Src->isVectorTy() && Alignment != 16 &&
+      Src->getVectorElementType()->isDoubleTy()) {
+    // Unaligned loads/stores are extremely inefficient.
+    // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
+    return LT.first * 4;
+  }
+  return LT.first;
+}
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 80e5c6e..e3f9e0d 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -7,6 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ARMBuildAttrs.h"
+#include "ARMFPUName.h"
+#include "ARMFeatures.h"
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
@@ -24,6 +27,7 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
@@ -47,8 +51,14 @@ enum VectorLaneTy { NoLanes, AllLanes, IndexedLane };
 class ARMAsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
+  const MCInstrInfo &MII;
   const MCRegisterInfo *MRI;
 
+  ARMTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = getParser().getStreamer().getTargetStreamer();
+    return static_cast<ARMTargetStreamer &>(TS);
+  }
+
   // Unwind directives state
   SMLoc FnStartLoc;
   SMLoc CantUnwindLoc;
@@ -66,6 +76,8 @@ class ARMAsmParser : public MCTargetAsmParser {
   // Map of register aliases registers via the .req directive.
   StringMap<unsigned> RegisterReqs;
 
+  bool NextSymbolIsThumb;
+
   struct {
     ARMCC::CondCodes Cond;    // Condition for IT block.
     unsigned Mask:4;          // Condition mask for instructions.
@@ -127,6 +139,8 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool parseDirectiveUnreq(SMLoc L);
   bool parseDirectiveArch(SMLoc L);
   bool parseDirectiveEabiAttr(SMLoc L);
+  bool parseDirectiveCPU(SMLoc L);
+  bool parseDirectiveFPU(SMLoc L);
   bool parseDirectiveFnStart(SMLoc L);
   bool parseDirectiveFnEnd(SMLoc L);
   bool parseDirectiveCantUnwind(SMLoc L);
@@ -139,7 +153,8 @@ class ARMAsmParser : public MCTargetAsmParser {
   StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
                           bool &CarrySetting, unsigned &ProcessorIMod,
                           StringRef &ITMask);
-  void getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+  void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+                             bool &CanAcceptCarrySet,
                              bool &CanAcceptPredicationCode);
 
   bool isThumb() const {
@@ -158,6 +173,9 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool hasV6Ops() const {
     return STI.getFeatureBits() & ARM::HasV6Ops;
   }
+  bool hasV6MOps() const {
+    return STI.getFeatureBits() & ARM::HasV6MOps;
+  }
   bool hasV7Ops() const {
     return STI.getFeatureBits() & ARM::HasV7Ops;
   }
@@ -221,6 +239,9 @@ class ARMAsmParser : public MCTargetAsmParser {
   // Asm Match Converter Methods
   void cvtThumbMultiply(MCInst &Inst,
                         const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtThumbBranches(MCInst &Inst,
+                        const SmallVectorImpl<MCParsedAsmOperand*> &);
+                        
   bool validateInstruction(MCInst &Inst,
                            const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
   bool processInstruction(MCInst &Inst,
@@ -229,8 +250,6 @@ class ARMAsmParser : public MCTargetAsmParser {
                               SmallVectorImpl<MCParsedAsmOperand*> &Operands);
   bool shouldOmitPredicateOperand(StringRef Mnemonic,
                               SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-  bool isDeprecated(MCInst &Inst, StringRef &Info);
-
 public:
   enum ARMMatchResultTy {
     Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
@@ -242,8 +261,9 @@ public:
 
   };
 
-  ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
-    : MCTargetAsmParser(), STI(_STI), Parser(_Parser), FPReg(-1) {
+  ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+               const MCInstrInfo &MII)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(MII), FPReg(-1) {
     MCAsmParserExtension::Initialize(_Parser);
 
     // Cache the MCRegisterInfo.
@@ -255,12 +275,7 @@ public:
     // Not in an ITBlock to start with.
     ITState.CurPosition = ~0U;
 
-    // Set ELF header flags.
-    // FIXME: This should eventually end up somewhere else where more
-    // intelligent flag decisions can be made. For now we are just maintaining
-    // the statu/parseDirects quo for ARM and setting EF_ARM_EABI_VER5 as the default.
-    if (MCELFStreamer *MES = dyn_cast<MCELFStreamer>(&Parser.getStreamer()))
-      MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+    NextSymbolIsThumb = false;
   }
 
   // Implementation of the MCTargetAsmParser interface:
@@ -277,6 +292,8 @@ public:
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                                MCStreamer &Out, unsigned &ErrorInfo,
                                bool MatchingInlineAsm);
+  void onLabelParsed(MCSymbol *Symbol);
+
 };
 } // end anonymous namespace
 
@@ -601,7 +618,7 @@ public:
   template<unsigned width, unsigned scale>
   bool isUnsignedOffset() const {
     if (!isImm()) return false;
-    if (dyn_cast<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
     if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
       int64_t Val = CE->getValue();
       int64_t Align = 1LL << scale;
@@ -610,6 +627,22 @@ public:
     }
     return false;
   }
+  // checks whether this operand is an signed offset which fits is a field
+  // of specified width and scaled by a specific number of bits
+  template<unsigned width, unsigned scale>
+  bool isSignedOffset() const {
+    if (!isImm()) return false;
+    if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Align = 1LL << scale;
+      int64_t Max = Align * ((1LL << (width-1)) - 1);
+      int64_t Min = -Align * (1LL << (width-1));
+      return ((Val % Align) == 0) && (Val >= Min) && (Val <= Max);
+    }
+    return false;
+  }
+
   // checks whether this operand is a memory operand computed as an offset
   // applied to PC. the offset may have 8 bits of magnitude and is represented
   // with two bits of shift. textually it may be either [pc, #imm], #imm or 
@@ -628,7 +661,7 @@ public:
       Val = Memory.OffsetImm->getValue();
     }
     else return false;
-    return ((Val % 4) == 0) && (Val >= -1020) && (Val <= 1020);
+    return ((Val % 4) == 0) && (Val >= 0) && (Val <= 1020);
   }
   bool isFPImm() const {
     if (!isImm()) return false;
@@ -658,13 +691,6 @@ public:
     int64_t Value = CE->getValue();
     return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
   }
-  bool isImm0_4() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 5;
-  }
   bool isImm0_1020s4() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -687,6 +713,13 @@ public:
     // explicitly exclude zero. we want that to use the normal 0_508 version.
     return ((Value & 3) == 0) && Value > 0 && Value <= 508;
   }
+  bool isImm0_239() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 240;
+  }
   bool isImm0_255() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -848,6 +881,15 @@ public:
     int64_t Value = CE->getValue();
     return Value >= 0 && Value < 65536;
   }
+  bool isImm256_65535Expr() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // If it's not a constant expression, it'll generate a fixup and be
+    // handled later.
+    if (!CE) return true;
+    int64_t Value = CE->getValue();
+    return Value >= 256 && Value < 65536;
+  }
   bool isImm0_65535Expr() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -927,7 +969,8 @@ public:
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
     int64_t Value = CE->getValue();
-    return ARM_AM::getT2SOImmVal(~Value) != -1;
+    return ARM_AM::getT2SOImmVal(Value) == -1 &&
+      ARM_AM::getT2SOImmVal(~Value) != -1;
   }
   bool isT2SOImmNeg() const {
     if (!isImm()) return false;
@@ -1773,8 +1816,6 @@ public:
   void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     int32_t Imm = Memory.OffsetImm->getValue();
-    // FIXME: Handle #-0
-    if (Imm == INT32_MIN) Imm = 0;
     Inst.addOperand(MCOperand::CreateImm(Imm));
   }
 
@@ -2494,7 +2535,7 @@ void ARMOperand::print(raw_ostream &OS) const {
     getImm()->print(OS);
     break;
   case k_MemBarrierOpt:
-    OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt()) << ">";
+    OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt(), false) << ">";
     break;
   case k_InstSyncBarrierOpt:
     OS << "<ARM_ISB::" << InstSyncBOptToString(getInstSyncBarrierOpt()) << ">";
@@ -2831,8 +2872,9 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
       return -1;
     switch (Name[2]) {
     default:  return -1;
-    case '0': return 10;
-    case '1': return 11;
+    // p10 and p11 are invalid for coproc instructions (reserved for FP/NEON)
+    case '0': return CoprocOp == 'p'? -1: 10;
+    case '1': return CoprocOp == 'p'? -1: 11;
     case '2': return 12;
     case '3': return 13;
     case '4': return 14;
@@ -3439,18 +3481,27 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
     Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()).lower())
       .Case("sy",    ARM_MB::SY)
       .Case("st",    ARM_MB::ST)
+      .Case("ld",    ARM_MB::LD)
       .Case("sh",    ARM_MB::ISH)
       .Case("ish",   ARM_MB::ISH)
       .Case("shst",  ARM_MB::ISHST)
       .Case("ishst", ARM_MB::ISHST)
+      .Case("ishld", ARM_MB::ISHLD)
       .Case("nsh",   ARM_MB::NSH)
       .Case("un",    ARM_MB::NSH)
       .Case("nshst", ARM_MB::NSHST)
+      .Case("nshld", ARM_MB::NSHLD)
       .Case("unst",  ARM_MB::NSHST)
       .Case("osh",   ARM_MB::OSH)
       .Case("oshst", ARM_MB::OSHST)
+      .Case("oshld", ARM_MB::OSHLD)
       .Default(~0U);
 
+    // ishld, oshld, nshld and ld are only available from ARMv8.
+    if (!hasV8Ops() && (Opt == ARM_MB::ISHLD || Opt == ARM_MB::OSHLD ||
+                        Opt == ARM_MB::NSHLD || Opt == ARM_MB::LD))
+      Opt = ~0U;
+
     if (Opt == ~0U)
       return MatchOperand_NoMatch;
 
@@ -3498,7 +3549,7 @@ parseInstSyncBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   if (Tok.is(AsmToken::Identifier)) {
     StringRef OptStr = Tok.getString();
 
-    if (OptStr.lower() == "sy")
+    if (OptStr.equals_lower("sy"))
       Opt = ARM_ISB::SY;
     else
       return MatchOperand_NoMatch;
@@ -4102,6 +4153,65 @@ cvtThumbMultiply(MCInst &Inst,
   ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2);
 }
 
+void ARMAsmParser::
+cvtThumbBranches(MCInst &Inst,
+           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  int CondOp = -1, ImmOp = -1;
+  switch(Inst.getOpcode()) {
+    case ARM::tB:
+    case ARM::tBcc:  CondOp = 1; ImmOp = 2; break;
+
+    case ARM::t2B:
+    case ARM::t2Bcc: CondOp = 1; ImmOp = 3; break;
+
+    default: llvm_unreachable("Unexpected instruction in cvtThumbBranches");
+  }
+  // first decide whether or not the branch should be conditional
+  // by looking at it's location relative to an IT block
+  if(inITBlock()) {
+    // inside an IT block we cannot have any conditional branches. any 
+    // such instructions needs to be converted to unconditional form
+    switch(Inst.getOpcode()) {
+      case ARM::tBcc: Inst.setOpcode(ARM::tB); break;
+      case ARM::t2Bcc: Inst.setOpcode(ARM::t2B); break;
+    }
+  } else {
+    // outside IT blocks we can only have unconditional branches with AL
+    // condition code or conditional branches with non-AL condition code
+    unsigned Cond = static_cast<ARMOperand*>(Operands[CondOp])->getCondCode();
+    switch(Inst.getOpcode()) {
+      case ARM::tB:
+      case ARM::tBcc: 
+        Inst.setOpcode(Cond == ARMCC::AL ? ARM::tB : ARM::tBcc); 
+        break;
+      case ARM::t2B:
+      case ARM::t2Bcc: 
+        Inst.setOpcode(Cond == ARMCC::AL ? ARM::t2B : ARM::t2Bcc);
+        break;
+    }
+  }
+  
+  // now decide on encoding size based on branch target range
+  switch(Inst.getOpcode()) {
+    // classify tB as either t2B or t1B based on range of immediate operand
+    case ARM::tB: {
+      ARMOperand* op = static_cast<ARMOperand*>(Operands[ImmOp]);
+      if(!op->isSignedOffset<11, 1>() && isThumbTwo()) 
+        Inst.setOpcode(ARM::t2B);
+      break;
+    }
+    // classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
+    case ARM::tBcc: {
+      ARMOperand* op = static_cast<ARMOperand*>(Operands[ImmOp]);
+      if(!op->isSignedOffset<8, 1>() && isThumbTwo())
+        Inst.setOpcode(ARM::t2Bcc);
+      break;
+    }
+  }
+  ((ARMOperand*)Operands[ImmOp])->addImmOperands(Inst, 1);
+  ((ARMOperand*)Operands[CondOp])->addCondCodeOperands(Inst, 2);
+}
+
 /// Parse an ARM memory expression, return false if successful else return true
 /// or an error.  The first token must be a '[' when called.
 bool ARMAsmParser::
@@ -4601,7 +4711,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
       Mnemonic == "mls"   || Mnemonic == "smmls"  || Mnemonic == "vcls"  ||
       Mnemonic == "vmls"  || Mnemonic == "vnmls"  || Mnemonic == "vacge" ||
       Mnemonic == "vcge"  || Mnemonic == "vclt"   || Mnemonic == "vacgt" ||
-      Mnemonic == "vaclt" || Mnemonic == "vacle"  ||
+      Mnemonic == "vaclt" || Mnemonic == "vacle"  || Mnemonic == "hlt" ||
       Mnemonic == "vcgt"  || Mnemonic == "vcle"   || Mnemonic == "smlal" ||
       Mnemonic == "umaal" || Mnemonic == "umlal"  || Mnemonic == "vabal" ||
       Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal" ||
@@ -4688,8 +4798,8 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
 //
 // FIXME: It would be nice to autogen this.
 void ARMAsmParser::
-getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
-                      bool &CanAcceptPredicationCode) {
+getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+                     bool &CanAcceptCarrySet, bool &CanAcceptPredicationCode) {
   if (Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
       Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
       Mnemonic == "add" || Mnemonic == "adc" ||
@@ -4707,12 +4817,14 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
 
   if (Mnemonic == "bkpt" || Mnemonic == "cbnz" || Mnemonic == "setend" ||
       Mnemonic == "cps" ||  Mnemonic == "it" ||  Mnemonic == "cbz" ||
-      Mnemonic == "trap" || Mnemonic == "setend" ||
+      Mnemonic == "trap" || Mnemonic == "hlt" || Mnemonic.startswith("crc32") ||
       Mnemonic.startswith("cps") || Mnemonic.startswith("vsel") ||
       Mnemonic == "vmaxnm" || Mnemonic == "vminnm" || Mnemonic == "vcvta" ||
       Mnemonic == "vcvtn" || Mnemonic == "vcvtp" || Mnemonic == "vcvtm" ||
       Mnemonic == "vrinta" || Mnemonic == "vrintn" || Mnemonic == "vrintp" ||
-      Mnemonic == "vrintm") {
+      Mnemonic == "vrintm" || Mnemonic.startswith("aes") ||
+      Mnemonic.startswith("sha1") || Mnemonic.startswith("sha256") ||
+      (FullInst.startswith("vmull") && FullInst.endswith(".p64"))) {
     // These mnemonics are never predicable
     CanAcceptPredicationCode = false;
   } else if (!isThumb()) {
@@ -4726,7 +4838,10 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
         Mnemonic != "stc2" && Mnemonic != "stc2l" &&
         !Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs");
   } else if (isThumbOne()) {
-    CanAcceptPredicationCode = Mnemonic != "nop" && Mnemonic != "movs";
+    if (hasV6MOps())
+      CanAcceptPredicationCode = Mnemonic != "movs";
+    else
+      CanAcceptPredicationCode = Mnemonic != "nop" && Mnemonic != "movs";
   } else
     CanAcceptPredicationCode = true;
 }
@@ -4877,14 +4992,6 @@ bool ARMAsmParser::shouldOmitPredicateOperand(
   return false;
 }
 
-bool ARMAsmParser::isDeprecated(MCInst &Inst, StringRef &Info) {
-  if (hasV8Ops() && Inst.getOpcode() == ARM::SETEND) {
-    Info = "armv8";
-    return true;
-  }
-  return false;
-}
-
 static bool isDataTypeToken(StringRef Tok) {
   return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" ||
     Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" ||
@@ -4980,7 +5087,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   // the matcher deal with finding the right instruction or generating an
   // appropriate error.
   bool CanAcceptCarrySet, CanAcceptPredicationCode;
-  getMnemonicAcceptInfo(Mnemonic, CanAcceptCarrySet, CanAcceptPredicationCode);
+  getMnemonicAcceptInfo(Mnemonic, Name, CanAcceptCarrySet, CanAcceptPredicationCode);
 
   // If we had a carry-set on an instruction that can't do that, issue an
   // error.
@@ -5115,8 +5222,9 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   // expressed as a GPRPair, so we have to manually merge them.
   // FIXME: We would really like to be able to tablegen'erate this.
   if (!isThumb() && Operands.size() > 4 &&
-      (Mnemonic == "ldrexd" || Mnemonic == "strexd")) {
-    bool isLoad = (Mnemonic == "ldrexd");
+      (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
+       Mnemonic == "stlexd")) {
+    bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
     unsigned Idx = isLoad ? 2 : 3;
     ARMOperand* Op1 = static_cast<ARMOperand*>(Operands[Idx]);
     ARMOperand* Op2 = static_cast<ARMOperand*>(Operands[Idx+1]);
@@ -5200,45 +5308,44 @@ static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) {
   return false;
 }
 
-// FIXME: We would really prefer to have MCInstrInfo (the wrapper around
-// the ARMInsts array) instead. Getting that here requires awkward
-// API changes, though. Better way?
-namespace llvm {
-extern const MCInstrDesc ARMInsts[];
-}
-static const MCInstrDesc &getInstDesc(unsigned Opcode) {
-  return ARMInsts[Opcode];
+// Return true if instruction has the interesting property of being
+// allowed in IT blocks, but not being predicable.
+static bool instIsBreakpoint(const MCInst &Inst) {
+    return Inst.getOpcode() == ARM::tBKPT ||
+           Inst.getOpcode() == ARM::BKPT ||
+           Inst.getOpcode() == ARM::tHLT ||
+           Inst.getOpcode() == ARM::HLT;
+
 }
 
 // FIXME: We would really like to be able to tablegen'erate this.
 bool ARMAsmParser::
 validateInstruction(MCInst &Inst,
                     const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+  const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
   SMLoc Loc = Operands[0]->getStartLoc();
+
   // Check the IT block state first.
-  // NOTE: BKPT instruction has the interesting property of being
-  // allowed in IT blocks, but not being predicable.  It just always
-  // executes.
-  if (inITBlock() && Inst.getOpcode() != ARM::tBKPT &&
-      Inst.getOpcode() != ARM::BKPT) {
-    unsigned bit = 1;
+  // NOTE: BKPT and HLT instructions have the interesting property of being
+  // allowed in IT blocks, but not being predicable. They just always execute.
+  if (inITBlock() && !instIsBreakpoint(Inst)) {
+    unsigned Bit = 1;
     if (ITState.FirstCond)
       ITState.FirstCond = false;
     else
-      bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
+      Bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
     // The instruction must be predicable.
     if (!MCID.isPredicable())
       return Error(Loc, "instructions in IT block must be predicable");
     unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm();
-    unsigned ITCond = bit ? ITState.Cond :
+    unsigned ITCond = Bit ? ITState.Cond :
       ARMCC::getOppositeCondition(ITState.Cond);
     if (Cond != ITCond) {
       // Find the condition code Operand to get its SMLoc information.
       SMLoc CondLoc;
-      for (unsigned i = 1; i < Operands.size(); ++i)
-        if (static_cast<ARMOperand*>(Operands[i])->isCondCode())
-          CondLoc = Operands[i]->getStartLoc();
+      for (unsigned I = 1; I < Operands.size(); ++I)
+        if (static_cast<ARMOperand*>(Operands[I])->isCondCode())
+          CondLoc = Operands[I]->getStartLoc();
       return Error(CondLoc, "incorrect condition in IT block; got '" +
                    StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
                    "', but expected '" +
@@ -5247,20 +5354,55 @@ validateInstruction(MCInst &Inst,
   // Check for non-'al' condition codes outside of the IT block.
   } else if (isThumbTwo() && MCID.isPredicable() &&
              Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
-             ARMCC::AL && Inst.getOpcode() != ARM::tB &&
-             Inst.getOpcode() != ARM::t2B)
+             ARMCC::AL && Inst.getOpcode() != ARM::tBcc &&
+             Inst.getOpcode() != ARM::t2Bcc)
     return Error(Loc, "predicated instructions must be in IT block");
 
-  switch (Inst.getOpcode()) {
+  const unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
   case ARM::LDRD:
   case ARM::LDRD_PRE:
   case ARM::LDRD_POST: {
+    const unsigned RtReg = Inst.getOperand(0).getReg();
+
+    // Rt can't be R14.
+    if (RtReg == ARM::LR)
+      return Error(Operands[3]->getStartLoc(),
+                   "Rt can't be R14");
+
+    const unsigned Rt = MRI->getEncodingValue(RtReg);
+    // Rt must be even-numbered.
+    if ((Rt & 1) == 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "Rt must be even-numbered");
+
     // Rt2 must be Rt + 1.
-    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
-    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    const unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
     if (Rt2 != Rt + 1)
       return Error(Operands[3]->getStartLoc(),
                    "destination operands must be sequential");
+
+    if (Opcode == ARM::LDRD_PRE || Opcode == ARM::LDRD_POST) {
+      const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg());
+      // For addressing modes with writeback, the base register needs to be
+      // different from the destination registers.
+      if (Rn == Rt || Rn == Rt2)
+        return Error(Operands[3]->getStartLoc(),
+                     "base register needs to be different from destination "
+                     "registers");
+    }
+
+    return false;
+  }
+  case ARM::t2LDRDi8:
+  case ARM::t2LDRD_PRE:
+  case ARM::t2LDRD_POST: {
+    // Rt2 must be different from Rt.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    if (Rt2 == Rt)
+      return Error(Operands[3]->getStartLoc(),
+                   "destination operands can't be identical");
     return false;
   }
   case ARM::STRD: {
@@ -5284,50 +5426,77 @@ validateInstruction(MCInst &Inst,
   }
   case ARM::SBFX:
   case ARM::UBFX: {
-    // width must be in range [1, 32-lsb]
-    unsigned lsb = Inst.getOperand(2).getImm();
-    unsigned widthm1 = Inst.getOperand(3).getImm();
-    if (widthm1 >= 32 - lsb)
+    // Width must be in range [1, 32-lsb].
+    unsigned LSB = Inst.getOperand(2).getImm();
+    unsigned Widthm1 = Inst.getOperand(3).getImm();
+    if (Widthm1 >= 32 - LSB)
       return Error(Operands[5]->getStartLoc(),
                    "bitfield width must be in range [1,32-lsb]");
     return false;
   }
+  // Notionally handles ARM::tLDMIA_UPD too.
   case ARM::tLDMIA: {
     // If we're parsing Thumb2, the .w variant is available and handles
-    // most cases that are normally illegal for a Thumb1 LDM
-    // instruction. We'll make the transformation in processInstruction()
-    // if necessary.
+    // most cases that are normally illegal for a Thumb1 LDM instruction.
+    // We'll make the transformation in processInstruction() if necessary.
     //
     // Thumb LDM instructions are writeback iff the base register is not
     // in the register list.
     unsigned Rn = Inst.getOperand(0).getReg();
-    bool hasWritebackToken =
+    bool HasWritebackToken =
       (static_cast<ARMOperand*>(Operands[3])->isToken() &&
        static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
-    bool listContainsBase;
-    if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) && !isThumbTwo())
-      return Error(Operands[3 + hasWritebackToken]->getStartLoc(),
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo())
+      return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
                    "registers must be in range r0-r7");
     // If we should have writeback, then there should be a '!' token.
-    if (!listContainsBase && !hasWritebackToken && !isThumbTwo())
+    if (!ListContainsBase && !HasWritebackToken && !isThumbTwo())
       return Error(Operands[2]->getStartLoc(),
                    "writeback operator '!' expected");
     // If we should not have writeback, there must not be a '!'. This is
     // true even for the 32-bit wide encodings.
-    if (listContainsBase && hasWritebackToken)
+    if (ListContainsBase && HasWritebackToken)
       return Error(Operands[3]->getStartLoc(),
                    "writeback operator '!' not allowed when base register "
                    "in register list");
 
     break;
   }
-  case ARM::t2LDMIA_UPD: {
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::LDMDA_UPD:
+    // ARM variants loading and updating the same register are only officially
+    // UNPREDICTABLE on v7 upwards. Goodness knows what they did before.
+    if (!hasV7Ops())
+      break;
+    // Fallthrough
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
     if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
-      return Error(Operands[4]->getStartLoc(),
-                   "writeback operator '!' not allowed when base register "
-                   "in register list");
+      return Error(Operands.back()->getStartLoc(),
+                   "writeback register not allowed in register list");
     break;
   }
+  case ARM::sysLDMIA_UPD:
+  case ARM::sysLDMDA_UPD:
+  case ARM::sysLDMDB_UPD:
+  case ARM::sysLDMIB_UPD:
+    if (!listContainsReg(Inst, 3, ARM::PC))
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback register only allowed on system LDM "
+                   "if PC in register-list");
+    break;
+  case ARM::sysSTMIA_UPD:
+  case ARM::sysSTMDA_UPD:
+  case ARM::sysSTMDB_UPD:
+  case ARM::sysSTMIB_UPD:
+    return Error(Operands[2]->getStartLoc(),
+                 "system STM cannot have writeback register");
+    break;
   case ARM::tMUL: {
     // The second source operand must be the same register as the destination
     // operand.
@@ -5351,26 +5520,35 @@ validateInstruction(MCInst &Inst,
   // so only issue a diagnostic for thumb1. The instructions will be
   // switched to the t2 encodings in processInstruction() if necessary.
   case ARM::tPOP: {
-    bool listContainsBase;
-    if (checkLowRegisterList(Inst, 2, 0, ARM::PC, listContainsBase) &&
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::PC, ListContainsBase) &&
         !isThumbTwo())
       return Error(Operands[2]->getStartLoc(),
                    "registers must be in range r0-r7 or pc");
     break;
   }
   case ARM::tPUSH: {
-    bool listContainsBase;
-    if (checkLowRegisterList(Inst, 2, 0, ARM::LR, listContainsBase) &&
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::LR, ListContainsBase) &&
         !isThumbTwo())
       return Error(Operands[2]->getStartLoc(),
                    "registers must be in range r0-r7 or lr");
     break;
   }
   case ARM::tSTMIA_UPD: {
-    bool listContainsBase;
-    if (checkLowRegisterList(Inst, 4, 0, 0, listContainsBase) && !isThumbTwo())
+    bool ListContainsBase, InvalidLowList;
+    InvalidLowList = checkLowRegisterList(Inst, 4, Inst.getOperand(0).getReg(),
+                                          0, ListContainsBase);
+    if (InvalidLowList && !isThumbTwo())
       return Error(Operands[4]->getStartLoc(),
                    "registers must be in range r0-r7");
+
+    // This would be converted to a 32-bit stm, but that's not valid if the
+    // writeback register is in the list.
+    if (InvalidLowList && ListContainsBase)
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
     break;
   }
   case ARM::tADDrSP: {
@@ -5383,11 +5561,29 @@ validateInstruction(MCInst &Inst,
     }
     break;
   }
+  // Final range checking for Thumb unconditional branch instructions.
+  case ARM::tB:
+    if (!(static_cast<ARMOperand*>(Operands[2]))->isSignedOffset<11, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  case ARM::t2B: {
+    int op = (Operands[2]->isImm()) ? 2 : 3;
+    if (!(static_cast<ARMOperand*>(Operands[op]))->isSignedOffset<24, 1>())
+      return Error(Operands[op]->getStartLoc(), "branch target out of range");
+    break;
+  }
+  // Final range checking for Thumb conditional branch instructions.
+  case ARM::tBcc:
+    if (!(static_cast<ARMOperand*>(Operands[2]))->isSignedOffset<8, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  case ARM::t2Bcc: {
+    int Op = (Operands[2]->isImm()) ? 2 : 3;
+    if (!(static_cast<ARMOperand*>(Operands[Op]))->isSignedOffset<20, 1>())
+      return Error(Operands[Op]->getStartLoc(), "branch target out of range");
+    break;
+  }
   }
-
-  StringRef DepInfo;
-  if (isDeprecated(Inst, DepInfo))
-    Warning(Loc, "deprecated on " + DepInfo);
 
   return false;
 }
@@ -7425,7 +7621,7 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
   // 16-bit thumb arithmetic instructions either require or preclude the 'S'
   // suffix depending on whether they're in an IT block or not.
   unsigned Opc = Inst.getOpcode();
-  const MCInstrDesc &MCID = getInstDesc(Opc);
+  const MCInstrDesc &MCID = MII.get(Opc);
   if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) {
     assert(MCID.hasOptionalDef() &&
            "optionally flag setting instruction missing optional def operand");
@@ -7486,12 +7682,22 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       return true;
     }
 
-    // Some instructions need post-processing to, for example, tweak which
-    // encoding is selected. Loop on it while changes happen so the
-    // individual transformations can chain off each other. E.g.,
-    // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
-    while (processInstruction(Inst, Operands))
-      ;
+    { // processInstruction() updates inITBlock state, we need to save it away
+      bool wasInITBlock = inITBlock();
+
+      // Some instructions need post-processing to, for example, tweak which
+      // encoding is selected. Loop on it while changes happen so the
+      // individual transformations can chain off each other. E.g.,
+      // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
+      while (processInstruction(Inst, Operands))
+        ;
+
+      // Only after the instruction is fully processed, we can validate it
+      if (wasInITBlock && hasV8Ops() && isThumb() &&
+          !isV8EligibleForIT(&Inst, 2)) {
+        Warning(IDLoc, "deprecated instruction in IT block");
+      }
+    }
 
     // Only move forward at the very end so that everything in validate
     // and process gets a consistent answer about whether we're in an IT
@@ -7544,15 +7750,15 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return Error(IDLoc, "instruction variant requires ARMv6 or later");
   case Match_RequiresThumb2:
     return Error(IDLoc, "instruction variant requires Thumb2");
-  case Match_ImmRange0_4: {
+  case Match_ImmRange0_15: {
     SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
     if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
-    return Error(ErrorLoc, "immediate operand must be in the range [0,4]");
+    return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
   }
-  case Match_ImmRange0_15: {
+  case Match_ImmRange0_239: {
     SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
     if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
-    return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
+    return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
   }
   }
 
@@ -7580,6 +7786,10 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
     return parseDirectiveArch(DirectiveID.getLoc());
   else if (IDVal == ".eabi_attribute")
     return parseDirectiveEabiAttr(DirectiveID.getLoc());
+  else if (IDVal == ".cpu")
+    return parseDirectiveCPU(DirectiveID.getLoc());
+  else if (IDVal == ".fpu")
+    return parseDirectiveFPU(DirectiveID.getLoc());
   else if (IDVal == ".fnstart")
     return parseDirectiveFnStart(DirectiveID.getLoc());
   else if (IDVal == ".fnend")
@@ -7658,13 +7868,18 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
   return false;
 }
 
+void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
+  if (NextSymbolIsThumb) {
+    getParser().getStreamer().EmitThumbFunc(Symbol);
+    NextSymbolIsThumb = false;
+  }
+}
+
 /// parseDirectiveThumbFunc
 ///  ::= .thumbfunc symbol_name
 bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
   const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
   bool isMachO = MAI->hasSubsectionsViaSymbols();
-  StringRef Name;
-  bool needFuncName = true;
 
   // Darwin asm has (optionally) function name after .thumb_func direction
   // ELF doesn't
@@ -7673,29 +7888,19 @@ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
     if (Tok.isNot(AsmToken::EndOfStatement)) {
       if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
         return Error(L, "unexpected token in .thumb_func directive");
-      Name = Tok.getIdentifier();
+      MCSymbol *Func =
+          getParser().getContext().GetOrCreateSymbol(Tok.getIdentifier());
+      getParser().getStreamer().EmitThumbFunc(Func);
       Parser.Lex(); // Consume the identifier token.
-      needFuncName = false;
+      return false;
     }
   }
 
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return Error(L, "unexpected token in directive");
 
-  // Eat the end of statement and any blank lines that follow.
-  while (getLexer().is(AsmToken::EndOfStatement))
-    Parser.Lex();
+  NextSymbolIsThumb = true;
 
-  // FIXME: assuming function name will be the line following .thumb_func
-  // We really should be checking the next symbol definition even if there's
-  // stuff in between.
-  if (needFuncName) {
-    Name = Parser.getTok().getIdentifier();
-  }
-
-  // Mark symbol as a thumb symbol.
-  MCSymbol *Func = getParser().getContext().GetOrCreateSymbol(Name);
-  getParser().getStreamer().EmitThumbFunc(Func);
   return false;
 }
 
@@ -7807,7 +8012,48 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
 /// parseDirectiveEabiAttr
 ///  ::= .eabi_attribute int, int
 bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
-  return true;
+  if (Parser.getTok().isNot(AsmToken::Integer))
+    return Error(L, "integer expected");
+  int64_t Tag = Parser.getTok().getIntVal();
+  Parser.Lex(); // eat tag integer
+
+  if (Parser.getTok().isNot(AsmToken::Comma))
+    return Error(L, "comma expected");
+  Parser.Lex(); // skip comma
+
+  L = Parser.getTok().getLoc();
+  if (Parser.getTok().isNot(AsmToken::Integer))
+    return Error(L, "integer expected");
+  int64_t Value = Parser.getTok().getIntVal();
+  Parser.Lex(); // eat value integer
+
+  getTargetStreamer().emitAttribute(Tag, Value);
+  return false;
+}
+
+/// parseDirectiveCPU
+///  ::= .cpu str
+bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
+  StringRef CPU = getParser().parseStringToEndOfStatement().trim();
+  getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU);
+  return false;
+}
+
+/// parseDirectiveFPU
+///  ::= .fpu str
+bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
+  StringRef FPU = getParser().parseStringToEndOfStatement().trim();
+
+  unsigned ID = StringSwitch<unsigned>(FPU)
+#define ARM_FPU_NAME(NAME, ID) .Case(NAME, ARM::ID)
+#include "ARMFPUName.def"
+    .Default(ARM::INVALID_FPU);
+
+  if (ID == ARM::INVALID_FPU)
+    return Error(L, "Unknown FPU name");
+
+  getTargetStreamer().emitFPU(ID);
+  return false;
 }
 
 /// parseDirectiveFnStart
@@ -7820,7 +8066,7 @@ bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) {
   }
 
   FnStartLoc = L;
-  getParser().getStreamer().EmitFnStart();
+  getTargetStreamer().emitFnStart();
   return false;
 }
 
@@ -7833,8 +8079,7 @@ bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) {
 
   // Reset the unwind directives parser state
   resetUnwindDirectiveParserState();
-
-  getParser().getStreamer().EmitFnEnd();
+  getTargetStreamer().emitFnEnd();
   return false;
 }
 
@@ -7856,7 +8101,7 @@ bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) {
     return true;
   }
 
-  getParser().getStreamer().EmitCantUnwind();
+  getTargetStreamer().emitCantUnwind();
   return false;
 }
 
@@ -7887,7 +8132,7 @@ bool ARMAsmParser::parseDirectivePersonality(SMLoc L) {
   Parser.Lex();
 
   MCSymbol *PR = getParser().getContext().GetOrCreateSymbol(Name);
-  getParser().getStreamer().EmitPersonality(PR);
+  getTargetStreamer().emitPersonality(PR);
   return false;
 }
 
@@ -7904,7 +8149,7 @@ bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) {
     return true;
   }
 
-  getParser().getStreamer().EmitHandlerData();
+  getTargetStreamer().emitHandlerData();
   return false;
 }
 
@@ -7964,9 +8209,8 @@ bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
     Offset = CE->getValue();
   }
 
-  getParser().getStreamer().EmitSetFP(static_cast<unsigned>(NewFPReg),
-                                      static_cast<unsigned>(NewSPReg),
-                                      Offset);
+  getTargetStreamer().emitSetFP(static_cast<unsigned>(NewFPReg),
+                                static_cast<unsigned>(NewSPReg), Offset);
   return false;
 }
 
@@ -7995,7 +8239,7 @@ bool ARMAsmParser::parseDirectivePad(SMLoc L) {
   if (!CE)
     return Error(ExLoc, "pad offset must be an immediate");
 
-  getParser().getStreamer().EmitPad(CE->getValue());
+  getTargetStreamer().emitPad(CE->getValue());
   return false;
 }
 
@@ -8027,7 +8271,7 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
   if (IsVector && !Op->isDPRRegList())
     return Error(L, ".vsave expects DPR registers");
 
-  getParser().getStreamer().EmitRegSave(Op->getRegList(), IsVector);
+  getTargetStreamer().emitRegSave(Op->getRegList(), IsVector);
   return false;
 }
 
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 8a06664..9c7988f 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -323,8 +323,6 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
                                 uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
                                 uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeImm0_4(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const void *Decoder);
 
 
 static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
@@ -507,6 +505,14 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   }
 
   MI.clear();
+  result = decodeInstruction(DecoderTablev8Crypto32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
   Size = 0;
   return MCDisassembler::Fail;
 }
@@ -823,16 +829,28 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
       Check(result, AddThumbPredicate(MI));
       return result;
     }
-  }
 
-  MI.clear();
-  uint32_t NEONv8Insn = insn32;
-  NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
-  result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
-                             this, STI);
-  if (result != MCDisassembler::Fail) {
-    Size = 4;
-    return result;
+    MI.clear();
+    uint32_t NEONCryptoInsn = insn32;
+    NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+    NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
+    result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
+                               Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      return result;
+    }
+
+    MI.clear();
+    uint32_t NEONv8Insn = insn32;
+    NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
+    result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
+                               this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      return result;
+    }
   }
 
   MI.clear();
@@ -1185,20 +1203,22 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
                                  uint64_t Address, const void *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
 
-  bool writebackLoad = false;
-  unsigned writebackReg = 0;
+  bool NeedDisjointWriteback = false;
+  unsigned WritebackReg = 0;
   switch (Inst.getOpcode()) {
-    default:
-      break;
-    case ARM::LDMIA_UPD:
-    case ARM::LDMDB_UPD:
-    case ARM::LDMIB_UPD:
-    case ARM::LDMDA_UPD:
-    case ARM::t2LDMIA_UPD:
-    case ARM::t2LDMDB_UPD:
-      writebackLoad = true;
-      writebackReg = Inst.getOperand(0).getReg();
-      break;
+  default:
+    break;
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    NeedDisjointWriteback = true;
+    WritebackReg = Inst.getOperand(0).getReg();
+    break;
   }
 
   // Empty register lists are not allowed.
@@ -1208,7 +1228,7 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
       if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
         return MCDisassembler::Fail;
       // Writeback not allowed if Rn is in the target list.
-      if (writebackLoad && writebackReg == Inst.end()[-1].getReg())
+      if (NeedDisjointWriteback && WritebackReg == Inst.end()[-1].getReg())
         Check(S, MCDisassembler::SoftFail);
     }
   }
@@ -1343,6 +1363,11 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
       break;
   }
 
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  if ((featureBits & ARM::HasV8Ops) && (coproc != 14))
+    return MCDisassembler::Fail;
+
   Inst.addOperand(MCOperand::CreateImm(coproc));
   Inst.addOperand(MCOperand::CreateImm(CRd));
   if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
@@ -3794,6 +3819,11 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val,
   if (Val == 0xA || Val == 0xB)
     return MCDisassembler::Fail;
 
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  if ((featureBits & ARM::HasV8Ops) && !(Val == 14 || Val == 15))
+    return MCDisassembler::Fail;
+
   Inst.addOperand(MCOperand::CreateImm(Val));
   return MCDisassembler::Success;
 }
@@ -4901,15 +4931,6 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
   return S;
 }
 
-static DecodeStatus DecodeImm0_4(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const void *Decoder)
-{
-  unsigned Imm = fieldFromInstruction(Insn, 0, 3);
-  if (Imm > 4) return MCDisassembler::Fail;
-  Inst.addOperand(MCOperand::CreateImm(Imm));
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
                                 uint64_t Address, const void *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 97da232..f897028 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -76,14 +76,23 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
                                StringRef Annot) {
   unsigned Opcode = MI->getOpcode();
 
+  switch(Opcode) {
+
   // Check for HINT instructions w/ canonical names.
-  if (Opcode == ARM::HINT || Opcode == ARM::t2HINT) {
+  case ARM::HINT:
+  case ARM::tHINT:
+  case ARM::t2HINT:
     switch (MI->getOperand(0).getImm()) {
     case 0: O << "\tnop"; break;
     case 1: O << "\tyield"; break;
     case 2: O << "\twfe"; break;
     case 3: O << "\twfi"; break;
     case 4: O << "\tsev"; break;
+    case 5:
+      if ((getAvailableFeatures() & ARM::HasV8Ops)) {
+        O << "\tsevl";
+        break;
+      } // Fallthrough for non-v8
     default:
       // Anything else should just print normally.
       printInstruction(MI, O);
@@ -95,10 +104,9 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
       O << ".w";
     printAnnotation(O, Annot);
     return;
-  }
 
   // Check for MOVs and print canonical forms, instead.
-  if (Opcode == ARM::MOVsr) {
+  case ARM::MOVsr: {
     // FIXME: Thumb variants?
     const MCOperand &Dst = MI->getOperand(0);
     const MCOperand &MO1 = MI->getOperand(1);
@@ -121,7 +129,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
     return;
   }
 
-  if (Opcode == ARM::MOVsi) {
+  case ARM::MOVsi: {
     // FIXME: Thumb variants?
     const MCOperand &Dst = MI->getOperand(0);
     const MCOperand &MO1 = MI->getOperand(1);
@@ -149,81 +157,91 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
     return;
   }
 
-
   // A8.6.123 PUSH
-  if ((Opcode == ARM::STMDB_UPD || Opcode == ARM::t2STMDB_UPD) &&
-      MI->getOperand(0).getReg() == ARM::SP &&
-      MI->getNumOperands() > 5) {
-    // Should only print PUSH if there are at least two registers in the list.
-    O << '\t' << "push";
-    printPredicateOperand(MI, 2, O);
-    if (Opcode == ARM::t2STMDB_UPD)
-      O << ".w";
-    O << '\t';
-    printRegisterList(MI, 4, O);
-    printAnnotation(O, Annot);
-    return;
-  }
-  if (Opcode == ARM::STR_PRE_IMM && MI->getOperand(2).getReg() == ARM::SP &&
-      MI->getOperand(3).getImm() == -4) {
-    O << '\t' << "push";
-    printPredicateOperand(MI, 4, O);
-    O << "\t{";
-    printRegName(O, MI->getOperand(1).getReg());
-    O << "}";
-    printAnnotation(O, Annot);
-    return;
-  }
+  case ARM::STMDB_UPD:
+  case ARM::t2STMDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+      // Should only print PUSH if there are at least two registers in the list.
+      O << '\t' << "push";
+      printPredicateOperand(MI, 2, O);
+      if (Opcode == ARM::t2STMDB_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::STR_PRE_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(3).getImm() == -4) {
+      O << '\t' << "push";
+      printPredicateOperand(MI, 4, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(1).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
 
   // A8.6.122 POP
-  if ((Opcode == ARM::LDMIA_UPD || Opcode == ARM::t2LDMIA_UPD) &&
-      MI->getOperand(0).getReg() == ARM::SP &&
-      MI->getNumOperands() > 5) {
-    // Should only print POP if there are at least two registers in the list.
-    O << '\t' << "pop";
-    printPredicateOperand(MI, 2, O);
-    if (Opcode == ARM::t2LDMIA_UPD)
-      O << ".w";
-    O << '\t';
-    printRegisterList(MI, 4, O);
-    printAnnotation(O, Annot);
-    return;
-  }
-  if (Opcode == ARM::LDR_POST_IMM && MI->getOperand(2).getReg() == ARM::SP &&
-      MI->getOperand(4).getImm() == 4) {
-    O << '\t' << "pop";
-    printPredicateOperand(MI, 5, O);
-    O << "\t{";
-    printRegName(O, MI->getOperand(0).getReg());
-    O << "}";
-    printAnnotation(O, Annot);
-    return;
-  }
-
+  case ARM::LDMIA_UPD:
+  case ARM::t2LDMIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+      // Should only print POP if there are at least two registers in the list.
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 2, O);
+      if (Opcode == ARM::t2LDMIA_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::LDR_POST_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(4).getImm() == 4) {
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 5, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(0).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
 
   // A8.6.355 VPUSH
-  if ((Opcode == ARM::VSTMSDB_UPD || Opcode == ARM::VSTMDDB_UPD) &&
-      MI->getOperand(0).getReg() == ARM::SP) {
-    O << '\t' << "vpush";
-    printPredicateOperand(MI, 2, O);
-    O << '\t';
-    printRegisterList(MI, 4, O);
-    printAnnotation(O, Annot);
-    return;
-  }
+  case ARM::VSTMSDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpush";
+      printPredicateOperand(MI, 2, O);
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
 
   // A8.6.354 VPOP
-  if ((Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMDIA_UPD) &&
-      MI->getOperand(0).getReg() == ARM::SP) {
-    O << '\t' << "vpop";
-    printPredicateOperand(MI, 2, O);
-    O << '\t';
-    printRegisterList(MI, 4, O);
-    printAnnotation(O, Annot);
-    return;
-  }
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMDIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpop";
+      printPredicateOperand(MI, 2, O);
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
 
-  if (Opcode == ARM::tLDMIA) {
+  case ARM::tLDMIA: {
     bool Writeback = true;
     unsigned BaseReg = MI->getOperand(0).getReg();
     for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
@@ -249,9 +267,10 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
   // GPRs. However, when decoding them, the two GRPs cannot be automatically
   // expressed as a GPRPair, so we have to manually merge them.
   // FIXME: We would really like to be able to tablegen'erate this.
-  if (Opcode == ARM::LDREXD || Opcode == ARM::STREXD) {
+  case ARM::LDREXD: case ARM::STREXD:
+  case ARM::LDAEXD: case ARM::STLEXD:
     const MCRegisterClass& MRC = MRI.getRegClass(ARM::GPRRegClassID);
-    bool isStore = Opcode == ARM::STREXD;
+    bool isStore = Opcode == ARM::STREXD || Opcode == ARM::STLEXD;
     unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
     if (MRC.contains(Reg)) {
       MCInst NewMI;
@@ -676,7 +695,7 @@ void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
 void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
                                      raw_ostream &O) {
   unsigned val = MI->getOperand(OpNum).getImm();
-  O << ARM_MB::MemBOptToString(val);
+  O << ARM_MB::MemBOptToString(val, (getAvailableFeatures() & ARM::HasV8Ops));
 }
 
 void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index b1e25d8..5615b80 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -25,9 +25,9 @@
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -640,16 +640,16 @@ public:
 // FIXME: This should be in a separate file.
 class DarwinARMAsmBackend : public ARMAsmBackend {
 public:
-  const object::mach::CPUSubtypeARM Subtype;
+  const MachO::CPUSubTypeARM Subtype;
   DarwinARMAsmBackend(const Target &T, const StringRef TT,
-                      object::mach::CPUSubtypeARM st)
+                      MachO::CPUSubTypeARM st)
     : ARMAsmBackend(T, TT), Subtype(st) {
       HasDataInCodeSupport = true;
     }
 
   MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
     return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
-                                     object::mach::CTM_ARM,
+                                     MachO::CPU_TYPE_ARM,
                                      Subtype);
   }
 
@@ -660,22 +660,24 @@ public:
 
 } // end anonymous namespace
 
-MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        StringRef TT, StringRef CPU) {
   Triple TheTriple(TT);
 
   if (TheTriple.isOSDarwin()) {
-    object::mach::CPUSubtypeARM CS =
-      StringSwitch<object::mach::CPUSubtypeARM>(TheTriple.getArchName())
-      .Cases("armv4t", "thumbv4t", object::mach::CSARM_V4T)
-      .Cases("armv5e", "thumbv5e",object::mach::CSARM_V5TEJ)
-      .Cases("armv6", "thumbv6", object::mach::CSARM_V6)
-      .Cases("armv6m", "thumbv6m", object::mach::CSARM_V6M)
-      .Cases("armv7em", "thumbv7em", object::mach::CSARM_V7EM)
-      .Cases("armv7f", "thumbv7f", object::mach::CSARM_V7F)
-      .Cases("armv7k", "thumbv7k", object::mach::CSARM_V7K)
-      .Cases("armv7m", "thumbv7m", object::mach::CSARM_V7M)
-      .Cases("armv7s", "thumbv7s", object::mach::CSARM_V7S)
-      .Default(object::mach::CSARM_V7);
+    MachO::CPUSubTypeARM CS =
+      StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName())
+      .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T)
+      .Cases("armv5e", "thumbv5e", MachO::CPU_SUBTYPE_ARM_V5TEJ)
+      .Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6)
+      .Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M)
+      .Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM)
+      .Cases("armv7f", "thumbv7f", MachO::CPU_SUBTYPE_ARM_V7F)
+      .Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K)
+      .Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M)
+      .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S)
+      .Default(MachO::CPU_SUBTYPE_ARM_V7);
 
     return new DarwinARMAsmBackend(T, TT, CS);
   }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index ff9917d..af939fc 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -121,41 +121,41 @@ namespace ARM_MB {
   // the option field for memory barrier operations.
   enum MemBOpt {
     RESERVED_0 = 0,
-    RESERVED_1 = 1,
+    OSHLD = 1,
     OSHST = 2,
     OSH   = 3,
     RESERVED_4 = 4,
-    RESERVED_5 = 5,
+    NSHLD = 5,
     NSHST = 6,
     NSH   = 7,
     RESERVED_8 = 8,
-    RESERVED_9 = 9,
+    ISHLD = 9,
     ISHST = 10,
     ISH   = 11,
     RESERVED_12 = 12,
-    RESERVED_13 = 13,
+    LD = 13,
     ST    = 14,
     SY    = 15
   };
 
-  inline static const char *MemBOptToString(unsigned val) {
+  inline static const char *MemBOptToString(unsigned val, bool HasV8) {
     switch (val) {
     default: llvm_unreachable("Unknown memory operation");
     case SY:    return "sy";
     case ST:    return "st";
-    case RESERVED_13: return "#0xd";
+    case LD: return HasV8 ? "ld" : "#0xd";
     case RESERVED_12: return "#0xc";
     case ISH:   return "ish";
     case ISHST: return "ishst";
-    case RESERVED_9: return "#0x9";
+    case ISHLD: return HasV8 ?  "ishld" : "#0x9";
     case RESERVED_8: return "#0x8";
     case NSH:   return "nsh";
     case NSHST: return "nshst";
-    case RESERVED_5: return "#0x5";
+    case NSHLD: return HasV8 ? "nshld" : "#0x5";
     case RESERVED_4: return "#0x4";
     case OSH:   return "osh";
     case OSHST: return "oshst";
-    case RESERVED_1: return "#0x1";
+    case OSHLD: return HasV8 ? "oshld" : "#0x1";
     case RESERVED_0: return "#0x0";
     }
   }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 6b98205..471897d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -13,6 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ARMBuildAttrs.h"
+#include "ARMFPUName.h"
 #include "ARMRegisterInfo.h"
 #include "ARMUnwindOp.h"
 #include "ARMUnwindOpAsm.h"
@@ -27,6 +29,7 @@
 #include "llvm/MC/MCELFSymbolFlags.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSection.h"
@@ -36,7 +39,9 @@
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 
 using namespace llvm;
 
@@ -45,8 +50,218 @@ static std::string GetAEABIUnwindPersonalityName(unsigned Index) {
   return (Twine("__aeabi_unwind_cpp_pr") + Twine(Index)).str();
 }
 
+static const char *GetFPUName(unsigned ID) {
+  switch (ID) {
+  default:
+    llvm_unreachable("Unknown FPU kind");
+    break;
+#define ARM_FPU_NAME(NAME, ID) case ARM::ID: return NAME;
+#include "ARMFPUName.def"
+  }
+  return NULL;
+}
+
 namespace {
 
+class ARMELFStreamer;
+
+class ARMTargetAsmStreamer : public ARMTargetStreamer {
+  formatted_raw_ostream &OS;
+  MCInstPrinter &InstPrinter;
+
+  virtual void emitFnStart();
+  virtual void emitFnEnd();
+  virtual void emitCantUnwind();
+  virtual void emitPersonality(const MCSymbol *Personality);
+  virtual void emitHandlerData();
+  virtual void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
+  virtual void emitPad(int64_t Offset);
+  virtual void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                           bool isVector);
+
+  virtual void switchVendor(StringRef Vendor);
+  virtual void emitAttribute(unsigned Attribute, unsigned Value);
+  virtual void emitTextAttribute(unsigned Attribute, StringRef String);
+  virtual void emitFPU(unsigned FPU);
+  virtual void finishAttributeSection();
+
+public:
+  ARMTargetAsmStreamer(formatted_raw_ostream &OS, MCInstPrinter &InstPrinter);
+};
+
+ARMTargetAsmStreamer::ARMTargetAsmStreamer(formatted_raw_ostream &OS,
+                                           MCInstPrinter &InstPrinter)
+    : OS(OS), InstPrinter(InstPrinter) {}
+void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; }
+void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; }
+void ARMTargetAsmStreamer::emitCantUnwind() { OS << "\t.cantunwind\n"; }
+void ARMTargetAsmStreamer::emitPersonality(const MCSymbol *Personality) {
+  OS << "\t.personality " << Personality->getName() << '\n';
+}
+void ARMTargetAsmStreamer::emitHandlerData() { OS << "\t.handlerdata\n"; }
+void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                     int64_t Offset) {
+  OS << "\t.setfp\t";
+  InstPrinter.printRegName(OS, FpReg);
+  OS << ", ";
+  InstPrinter.printRegName(OS, SpReg);
+  if (Offset)
+    OS << ", #" << Offset;
+  OS << '\n';
+}
+void ARMTargetAsmStreamer::emitPad(int64_t Offset) {
+  OS << "\t.pad\t#" << Offset << '\n';
+}
+void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                       bool isVector) {
+  assert(RegList.size() && "RegList should not be empty");
+  if (isVector)
+    OS << "\t.vsave\t{";
+  else
+    OS << "\t.save\t{";
+
+  InstPrinter.printRegName(OS, RegList[0]);
+
+  for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
+    OS << ", ";
+    InstPrinter.printRegName(OS, RegList[i]);
+  }
+
+  OS << "}\n";
+}
+void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {
+}
+void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+  OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value) << "\n";
+}
+void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
+                                             StringRef String) {
+  switch (Attribute) {
+  default: llvm_unreachable("Unsupported Text attribute in ASM Mode");
+  case ARMBuildAttrs::CPU_name:
+    OS << "\t.cpu\t" << String.lower() << "\n";
+    break;
+  }
+}
+void ARMTargetAsmStreamer::emitFPU(unsigned FPU) {
+  OS << "\t.fpu\t" << GetFPUName(FPU) << "\n";
+}
+void ARMTargetAsmStreamer::finishAttributeSection() {
+}
+
+class ARMTargetELFStreamer : public ARMTargetStreamer {
+private:
+  // This structure holds all attributes, accounting for
+  // their string/numeric value, so we can later emmit them
+  // in declaration order, keeping all in the same vector
+  struct AttributeItem {
+    enum {
+      HiddenAttribute = 0,
+      NumericAttribute,
+      TextAttribute
+    } Type;
+    unsigned Tag;
+    unsigned IntValue;
+    StringRef StringValue;
+
+    static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) {
+      return (LHS.Tag < RHS.Tag);
+    }
+  };
+
+  StringRef CurrentVendor;
+  unsigned FPU;
+  SmallVector<AttributeItem, 64> Contents;
+
+  const MCSection *AttributeSection;
+
+  // FIXME: this should be in a more generic place, but
+  // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf
+  static size_t getULEBSize(int Value) {
+    size_t Size = 0;
+    do {
+      Value >>= 7;
+      Size += sizeof(int8_t); // Is this really necessary?
+    } while (Value);
+    return Size;
+  }
+
+  AttributeItem *getAttributeItem(unsigned Attribute) {
+    for (size_t i = 0; i < Contents.size(); ++i)
+      if (Contents[i].Tag == Attribute)
+        return &Contents[i];
+    return 0;
+  }
+
+  void setAttributeItem(unsigned Attribute, unsigned Value,
+                        bool OverwriteExisting) {
+    // Look for existing attribute item
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->IntValue = Value;
+      return;
+    }
+
+    // Create new attribute item
+    AttributeItem Item = {
+      AttributeItem::NumericAttribute,
+      Attribute,
+      Value,
+      StringRef("")
+    };
+    Contents.push_back(Item);
+  }
+
+  void setAttributeItem(unsigned Attribute, StringRef Value,
+                        bool OverwriteExisting) {
+    // Look for existing attribute item
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->StringValue = Value;
+      return;
+    }
+
+    // Create new attribute item
+    AttributeItem Item = {
+      AttributeItem::TextAttribute,
+      Attribute,
+      0,
+      Value
+    };
+    Contents.push_back(Item);
+  }
+
+  void emitFPUDefaultAttributes();
+
+  ARMELFStreamer &getStreamer();
+
+  virtual void emitFnStart();
+  virtual void emitFnEnd();
+  virtual void emitCantUnwind();
+  virtual void emitPersonality(const MCSymbol *Personality);
+  virtual void emitHandlerData();
+  virtual void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
+  virtual void emitPad(int64_t Offset);
+  virtual void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                           bool isVector);
+
+  virtual void switchVendor(StringRef Vendor);
+  virtual void emitAttribute(unsigned Attribute, unsigned Value);
+  virtual void emitTextAttribute(unsigned Attribute, StringRef String);
+  virtual void emitFPU(unsigned FPU);
+  virtual void finishAttributeSection();
+
+  size_t calculateContentSize() const;
+
+public:
+  ARMTargetELFStreamer()
+    : ARMTargetStreamer(), CurrentVendor("aeabi"), FPU(ARM::INVALID_FPU),
+      AttributeSection(0) {
+  }
+};
+
 /// Extend the generic ELFStreamer class so that it can emit mapping symbols at
 /// the appropriate points in the object files. These symbols are defined in the
 /// ARM ELF ABI: infocenter.arm.com/help/topic/com.arm.../IHI0044D_aaelf.pdf.
@@ -61,27 +276,29 @@ namespace {
 /// by MachO. Beware!
 class ARMELFStreamer : public MCELFStreamer {
 public:
-  ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
-                 MCCodeEmitter *Emitter, bool IsThumb)
-      : MCELFStreamer(SK_ARMELFStreamer, Context, TAB, OS, Emitter),
+  friend class ARMTargetELFStreamer;
+
+  ARMELFStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
+                 MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter,
+                 bool IsThumb)
+      : MCELFStreamer(Context, TargetStreamer, TAB, OS, Emitter),
         IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None) {
     Reset();
   }
 
   ~ARMELFStreamer() {}
 
+  virtual void FinishImpl();
+
   // ARM exception handling directives
-  virtual void EmitFnStart();
-  virtual void EmitFnEnd();
-  virtual void EmitCantUnwind();
-  virtual void EmitPersonality(const MCSymbol *Per);
-  virtual void EmitHandlerData();
-  virtual void EmitSetFP(unsigned NewFpReg,
-                         unsigned NewSpReg,
-                         int64_t Offset = 0);
-  virtual void EmitPad(int64_t Offset);
-  virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
-                           bool isVector);
+  void emitFnStart();
+  void emitFnEnd();
+  void emitCantUnwind();
+  void emitPersonality(const MCSymbol *Per);
+  void emitHandlerData();
+  void emitSetFP(unsigned NewFpReg, unsigned NewSpReg, int64_t Offset = 0);
+  void emitPad(int64_t Offset);
+  void emitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector);
 
   virtual void ChangeSection(const MCSection *Section,
                              const MCExpr *Subsection) {
@@ -141,10 +358,6 @@ public:
     }
   }
 
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_ARMELFStreamer;
-  }
-
 private:
   enum ElfMappingSymbol {
     EMS_None,
@@ -183,7 +396,7 @@ private:
     MCELF::SetType(SD, ELF::STT_NOTYPE);
     MCELF::SetBinding(SD, ELF::STB_LOCAL);
     SD.setExternal(false);
-    Symbol->setSection(*getCurrentSection().first);
+    AssignSection(Symbol, getCurrentSection().first);
 
     const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
     Symbol->setVariableValue(Value);
@@ -232,6 +445,224 @@ private:
 };
 } // end anonymous namespace
 
+ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
+  ARMELFStreamer *S = static_cast<ARMELFStreamer *>(Streamer);
+  return *S;
+}
+
+void ARMTargetELFStreamer::emitFnStart() { getStreamer().emitFnStart(); }
+void ARMTargetELFStreamer::emitFnEnd() { getStreamer().emitFnEnd(); }
+void ARMTargetELFStreamer::emitCantUnwind() { getStreamer().emitCantUnwind(); }
+void ARMTargetELFStreamer::emitPersonality(const MCSymbol *Personality) {
+  getStreamer().emitPersonality(Personality);
+}
+void ARMTargetELFStreamer::emitHandlerData() {
+  getStreamer().emitHandlerData();
+}
+void ARMTargetELFStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                     int64_t Offset) {
+  getStreamer().emitSetFP(FpReg, SpReg, Offset);
+}
+void ARMTargetELFStreamer::emitPad(int64_t Offset) {
+  getStreamer().emitPad(Offset);
+}
+void ARMTargetELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                       bool isVector) {
+  getStreamer().emitRegSave(RegList, isVector);
+}
+void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
+  assert(!Vendor.empty() && "Vendor cannot be empty.");
+
+  if (CurrentVendor == Vendor)
+    return;
+
+  if (!CurrentVendor.empty())
+    finishAttributeSection();
+
+  assert(Contents.empty() &&
+         ".ARM.attributes should be flushed before changing vendor");
+  CurrentVendor = Vendor;
+
+}
+void ARMTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+  setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute,
+                                             StringRef Value) {
+  setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitFPU(unsigned Value) {
+  FPU = Value;
+}
+void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
+  switch (FPU) {
+  case ARM::VFP:
+  case ARM::VFPV2:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv2,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV3:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv3A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV3_D16:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv3B,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV4:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv4A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV4_D16:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv4B,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FP_ARMV8:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPARMv8A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::NEON:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv3A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeon,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::NEON_VFPV4:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv4A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeon2,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::NEON_FP_ARMV8:
+  case ARM::CRYPTO_NEON_FP_ARMV8:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPARMv8A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeonARMv8,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  default:
+    report_fatal_error("Unknown FPU: " + Twine(FPU));
+    break;
+  }
+}
+size_t ARMTargetELFStreamer::calculateContentSize() const {
+  size_t Result = 0;
+  for (size_t i = 0; i < Contents.size(); ++i) {
+    AttributeItem item = Contents[i];
+    switch (item.Type) {
+    case AttributeItem::HiddenAttribute:
+      break;
+    case AttributeItem::NumericAttribute:
+      Result += getULEBSize(item.Tag);
+      Result += getULEBSize(item.IntValue);
+      break;
+    case AttributeItem::TextAttribute:
+      Result += getULEBSize(item.Tag);
+      Result += item.StringValue.size() + 1; // string + '\0'
+      break;
+    }
+  }
+  return Result;
+}
+void ARMTargetELFStreamer::finishAttributeSection() {
+  // <format-version>
+  // [ <section-length> "vendor-name"
+  // [ <file-tag> <size> <attribute>*
+  //   | <section-tag> <size> <section-number>* 0 <attribute>*
+  //   | <symbol-tag> <size> <symbol-number>* 0 <attribute>*
+  //   ]+
+  // ]*
+
+  if (FPU != ARM::INVALID_FPU)
+    emitFPUDefaultAttributes();
+
+  if (Contents.empty())
+    return;
+
+  std::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
+
+  ARMELFStreamer &Streamer = getStreamer();
+
+  // Switch to .ARM.attributes section
+  if (AttributeSection) {
+    Streamer.SwitchSection(AttributeSection);
+  } else {
+    AttributeSection =
+      Streamer.getContext().getELFSection(".ARM.attributes",
+                                          ELF::SHT_ARM_ATTRIBUTES,
+                                          0,
+                                          SectionKind::getMetadata());
+    Streamer.SwitchSection(AttributeSection);
+
+    // Format version
+    Streamer.EmitIntValue(0x41, 1);
+  }
+
+  // Vendor size + Vendor name + '\0'
+  const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
+
+  // Tag + Tag Size
+  const size_t TagHeaderSize = 1 + 4;
+
+  const size_t ContentsSize = calculateContentSize();
+
+  Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
+  Streamer.EmitBytes(CurrentVendor);
+  Streamer.EmitIntValue(0, 1); // '\0'
+
+  Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
+  Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
+
+  // Size should have been accounted for already, now
+  // emit each field as its type (ULEB or String)
+  for (size_t i = 0; i < Contents.size(); ++i) {
+    AttributeItem item = Contents[i];
+    Streamer.EmitULEB128IntValue(item.Tag);
+    switch (item.Type) {
+    default: llvm_unreachable("Invalid attribute type");
+    case AttributeItem::NumericAttribute:
+      Streamer.EmitULEB128IntValue(item.IntValue);
+      break;
+    case AttributeItem::TextAttribute:
+      Streamer.EmitBytes(item.StringValue.upper());
+      Streamer.EmitIntValue(0, 1); // '\0'
+      break;
+    }
+  }
+
+  Contents.clear();
+  FPU = ARM::INVALID_FPU;
+}
+
+void ARMELFStreamer::FinishImpl() {
+  MCTargetStreamer &TS = getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+  ATS.finishAttributeSection();
+
+  MCELFStreamer::FinishImpl();
+}
+
 inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
                                               unsigned Type,
                                               unsigned Flags,
@@ -295,29 +726,13 @@ void ARMELFStreamer::Reset() {
   UnwindOpAsm.Reset();
 }
 
-// Add the R_ARM_NONE fixup at the same position
-void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
-  const MCSymbol *PersonalitySym = getContext().GetOrCreateSymbol(Name);
-
-  const MCSymbolRefExpr *PersonalityRef =
-    MCSymbolRefExpr::Create(PersonalitySym,
-                            MCSymbolRefExpr::VK_ARM_NONE,
-                            getContext());
-
-  AddValueSymbols(PersonalityRef);
-  MCDataFragment *DF = getOrCreateDataFragment();
-  DF->getFixups().push_back(
-    MCFixup::Create(DF->getContents().size(), PersonalityRef,
-                    MCFixup::getKindForSize(4, false)));
-}
-
-void ARMELFStreamer::EmitFnStart() {
+void ARMELFStreamer::emitFnStart() {
   assert(FnStart == 0);
   FnStart = getContext().CreateTempSymbol();
   EmitLabel(FnStart);
 }
 
-void ARMELFStreamer::EmitFnEnd() {
+void ARMELFStreamer::emitFnEnd() {
   assert(FnStart && ".fnstart must preceeds .fnend");
 
   // Emit unwind opcodes if there is no .handlerdata directive
@@ -365,8 +780,20 @@ void ARMELFStreamer::EmitFnEnd() {
   Reset();
 }
 
-void ARMELFStreamer::EmitCantUnwind() {
-  CantUnwind = true;
+void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
+
+// Add the R_ARM_NONE fixup at the same position
+void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
+  const MCSymbol *PersonalitySym = getContext().GetOrCreateSymbol(Name);
+
+  const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::Create(
+      PersonalitySym, MCSymbolRefExpr::VK_ARM_NONE, getContext());
+
+  AddValueSymbols(PersonalityRef);
+  MCDataFragment *DF = getOrCreateDataFragment();
+  DF->getFixups().push_back(MCFixup::Create(DF->getContents().size(),
+                                            PersonalityRef,
+                                            MCFixup::getKindForSize(4, false)));
 }
 
 void ARMELFStreamer::FlushPendingOffset() {
@@ -429,17 +856,14 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
     EmitIntValue(0, 4);
 }
 
-void ARMELFStreamer::EmitHandlerData() {
-  FlushUnwindOpcodes(false);
-}
+void ARMELFStreamer::emitHandlerData() { FlushUnwindOpcodes(false); }
 
-void ARMELFStreamer::EmitPersonality(const MCSymbol *Per) {
+void ARMELFStreamer::emitPersonality(const MCSymbol *Per) {
   Personality = Per;
   UnwindOpAsm.setPersonality(Per);
 }
 
-void ARMELFStreamer::EmitSetFP(unsigned NewFPReg,
-                               unsigned NewSPReg,
+void ARMELFStreamer::emitSetFP(unsigned NewFPReg, unsigned NewSPReg,
                                int64_t Offset) {
   assert((NewSPReg == ARM::SP || NewSPReg == FPReg) &&
          "the operand of .setfp directive should be either $sp or $fp");
@@ -453,7 +877,7 @@ void ARMELFStreamer::EmitSetFP(unsigned NewFPReg,
     FPOffset += Offset;
 }
 
-void ARMELFStreamer::EmitPad(int64_t Offset) {
+void ARMELFStreamer::emitPad(int64_t Offset) {
   // Track the change of the $sp offset
   SPOffset -= Offset;
 
@@ -462,7 +886,7 @@ void ARMELFStreamer::EmitPad(int64_t Offset) {
   PendingOffset -= Offset;
 }
 
-void ARMELFStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
+void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
                                  bool IsVector) {
   // Collect the registers in the register list
   unsigned Count = 0;
@@ -493,11 +917,31 @@ void ARMELFStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
 }
 
 namespace llvm {
+
+MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                                bool isVerboseAsm, bool useLoc, bool useCFI,
+                                bool useDwarfDirectory,
+                                MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                                MCAsmBackend *TAB, bool ShowInst) {
+  ARMTargetAsmStreamer *S = new ARMTargetAsmStreamer(OS, *InstPrint);
+
+  return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
+                                 useDwarfDirectory, InstPrint, CE, TAB,
+                                 ShowInst);
+}
+
   MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
                                       raw_ostream &OS, MCCodeEmitter *Emitter,
                                       bool RelaxAll, bool NoExecStack,
                                       bool IsThumb) {
-    ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb);
+    ARMTargetELFStreamer *TS = new ARMTargetELFStreamer();
+    ARMELFStreamer *S =
+        new ARMELFStreamer(Context, TS, TAB, OS, Emitter, IsThumb);
+    // FIXME: This should eventually end up somewhere else where more
+    // intelligent flag decisions can be made. For now we are just maintaining
+    // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
+    S->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+
     if (RelaxAll)
       S->getAssembler().setRelaxAll(true);
     if (NoExecStack)
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h
deleted file mode 100644
index 77ae5d2..0000000
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h
+++ /dev/null
@@ -1,27 +0,0 @@
-//===-- ARMELFStreamer.h - ELF Streamer for ARM ------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF streamer information for the ARM backend.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM_ELF_STREAMER_H
-#define ARM_ELF_STREAMER_H
-
-#include "llvm/MC/MCELFStreamer.h"
-
-namespace llvm {
-
-  MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                      raw_ostream &OS, MCCodeEmitter *Emitter,
-                                      bool RelaxAll, bool NoExecStack,
-                                      bool IsThumb);
-}
-
-#endif // ARM_ELF_STREAMER_H
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index c1aab9c..ad796e6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -49,8 +49,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
   Code16Directive = ".code\t16";
   Code32Directive = ".code\t32";
 
-  WeakRefDirective = "\t.weak\t";
-
   HasLEB128 = true;
   SupportsDebugInformation = true;
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index f0b289c..e1f716d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -15,6 +15,7 @@
 #define LLVM_ARMTARGETASMINFO_H
 
 #include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
 
@@ -24,7 +25,7 @@ namespace llvm {
     explicit ARMMCAsmInfoDarwin();
   };
 
-  class ARMELFMCAsmInfo : public MCAsmInfo {
+  class ARMELFMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
     explicit ARMELFMCAsmInfo();
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index a18d465..4382d0d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -58,8 +58,7 @@ public:
   }
   bool isTargetDarwin() const {
     Triple TT(STI.getTargetTriple());
-    Triple::OSType OS = TT.getOS();
-    return OS == Triple::Darwin || OS == Triple::MacOSX || OS == Triple::IOS;
+  	return TT.isOSDarwin();
   }
 
   unsigned getMachineSoImmOpValue(unsigned SoImm) const;
@@ -638,8 +637,14 @@ getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
 uint32_t ARMMCCodeEmitter::
 getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
                        SmallVectorImpl<MCFixup> &Fixups) const {
-  unsigned Val =
-    ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
+  unsigned Val = 0;
+  const MCOperand MO = MI.getOperand(OpIdx);
+    
+  if(MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
+  else 
+    Val = MO.getImm() >> 1;
+
   bool I  = (Val & 0x800000);
   bool J1 = (Val & 0x400000);
   bool J2 = (Val & 0x200000);
@@ -665,7 +670,7 @@ getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
   if (MO.isExpr())
     return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
                                     Fixups);
-  int32_t offset = MO.getImm();
+  int64_t offset = MO.getImm();
   uint32_t Val = 0x2000;
 
   int SoImmVal;
@@ -772,8 +777,10 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
     } else {
       Reg = ARM::PC;
       int32_t Offset = MO.getImm();
-      // FIXME: Handle #-0.
-      if (Offset < 0) {
+      if (Offset == INT32_MIN) {
+        Offset = 0;
+        isAdd = false;
+      } else if (Offset < 0) {
         Offset *= -1;
         isAdd = false;
       }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index caa1949..a99de0e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -12,30 +12,73 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMBaseInfo.h"
-#include "ARMELFStreamer.h"
 #include "ARMMCAsmInfo.h"
 #include "ARMMCTargetDesc.h"
 #include "InstPrinter/ARMInstPrinter.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_REGINFO_MC_DESC
 #include "ARMGenRegisterInfo.inc"
 
+static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+                                  std::string &Info) {
+  if (STI.getFeatureBits() & llvm::ARM::HasV7Ops &&
+      (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 15) &&
+      (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) &&
+      // Checks for the deprecated CP15ISB encoding:
+      // mcr p15, #0, rX, c7, c5, #4
+      (MI.getOperand(3).isImm() && MI.getOperand(3).getImm() == 7)) {
+    if ((MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 4)) {
+      if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 5) {
+        Info = "deprecated since v7, use 'isb'";
+        return true;
+      }
+
+      // Checks for the deprecated CP15DSB encoding:
+      // mcr p15, #0, rX, c7, c10, #4
+      if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10) {
+        Info = "deprecated since v7, use 'dsb'";
+        return true;
+      }
+    }
+    // Checks for the deprecated CP15DMB encoding:
+    // mcr p15, #0, rX, c7, c10, #5
+    if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10 &&
+        (MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 5)) {
+      Info = "deprecated since v7, use 'dmb'";
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+                                  std::string &Info) {
+  if (STI.getFeatureBits() & llvm::ARM::HasV8Ops &&
+      MI.getOperand(1).isImm() && MI.getOperand(1).getImm() != 8) {
+    Info = "applying IT instruction to more than one subsequent instruction is deprecated";
+    return true;
+  }
+
+  return false;
+}
+
 #define GET_INSTRINFO_MC_DESC
 #include "ARMGenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_MC_DESC
 #include "ARMGenSubtargetInfo.inc"
 
-using namespace llvm;
 
 std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
   Triple triple(TT);
@@ -60,8 +103,13 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
   if (Idx) {
     unsigned SubVer = TT[Idx];
     if (SubVer == '8') {
-      // FIXME: Parse v8 features
-      ARMArchFeature = "+v8";
+      if (NoCPU)
+        // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, FeatureMP,
+        //      FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, FeatureT2XtPk, FeatureCrypto, FeatureCRC
+        ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,+trustzone,+t2xtpk,+crypto,+crc";
+      else
+        // Use CPU to figure out the exact features
+        ARMArchFeature = "+v8";
     } else if (SubVer == '7') {
       if (Len >= Idx+2 && TT[Idx+1] == 'm') {
         isThumb = true;
@@ -106,7 +154,7 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
         isThumb = true;
         if (NoCPU)
           // v6m: FeatureNoARM, FeatureMClass
-          ARMArchFeature = "+v6,+noarm,+mclass";
+          ARMArchFeature = "+v6m,+noarm,+mclass";
         else
           ARMArchFeature = "+v6";
       } else
@@ -307,6 +355,10 @@ extern "C" void LLVMInitializeARMTargetMC() {
   TargetRegistry::RegisterMCObjectStreamer(TheARMTarget, createMCStreamer);
   TargetRegistry::RegisterMCObjectStreamer(TheThumbTarget, createMCStreamer);
 
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(TheARMTarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheThumbTarget, createMCAsmStreamer);
+
   // Register the MCInstPrinter.
   TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
   TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 4e94c53..959be8b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -18,13 +18,16 @@
 #include <string>
 
 namespace llvm {
+class formatted_raw_ostream;
 class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
 class MCInstrInfo;
+class MCInstPrinter;
 class MCObjectWriter;
 class MCRegisterInfo;
 class MCSubtargetInfo;
+class MCStreamer;
 class MCRelocationInfo;
 class StringRef;
 class Target;
@@ -42,12 +45,19 @@ namespace ARM_MC {
                                             StringRef FS);
 }
 
+MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                                bool isVerboseAsm, bool useLoc, bool useCFI,
+                                bool useDwarfDirectory,
+                                MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                                MCAsmBackend *TAB, bool ShowInst);
+
 MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
                                       const MCRegisterInfo &MRI,
                                       const MCSubtargetInfo &STI,
                                       MCContext &Ctx);
 
-MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  StringRef TT, StringRef CPU);
 
 /// createARMELFObjectWriter - Construct an ELF Mach-O object writer.
 MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index b9efe74..1f681ba 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -20,10 +20,9 @@
 #include "llvm/MC/MCMachOSymbolFlags.h"
 #include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 using namespace llvm;
-using namespace llvm::object;
 
 namespace {
 class ARMMachObjectWriter : public MCMachObjectTargetWriter {
@@ -63,7 +62,7 @@ public:
 
 static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
                               unsigned &Log2Size) {
-  RelocType = unsigned(macho::RIT_Vanilla);
+  RelocType = unsigned(MachO::ARM_RELOC_VANILLA);
   Log2Size = ~0U;
 
   switch (Kind) {
@@ -92,21 +91,21 @@ static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
   case ARM::fixup_arm_uncondbl:
   case ARM::fixup_arm_condbl:
   case ARM::fixup_arm_blx:
-    RelocType = unsigned(macho::RIT_ARM_Branch24Bit);
+    RelocType = unsigned(MachO::ARM_RELOC_BR24);
     // Report as 'long', even though that is not quite accurate.
     Log2Size = llvm::Log2_32(4);
     return true;
 
     // Handle Thumb branches.
   case ARM::fixup_arm_thumb_br:
-    RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
+    RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22);
     Log2Size = llvm::Log2_32(2);
     return true;
 
   case ARM::fixup_t2_uncondbranch:
   case ARM::fixup_arm_thumb_bl:
   case ARM::fixup_arm_thumb_blx:
-    RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
+    RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22);
     Log2Size = llvm::Log2_32(4);
     return true;
 
@@ -121,23 +120,23 @@ static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
   //      1 - thumb instructions
   case ARM::fixup_arm_movt_hi16:
   case ARM::fixup_arm_movt_hi16_pcrel:
-    RelocType = unsigned(macho::RIT_ARM_Half);
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
     Log2Size = 1;
     return true;
   case ARM::fixup_t2_movt_hi16:
   case ARM::fixup_t2_movt_hi16_pcrel:
-    RelocType = unsigned(macho::RIT_ARM_Half);
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
     Log2Size = 3;
     return true;
 
   case ARM::fixup_arm_movw_lo16:
   case ARM::fixup_arm_movw_lo16_pcrel:
-    RelocType = unsigned(macho::RIT_ARM_Half);
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
     Log2Size = 0;
     return true;
   case ARM::fixup_t2_movw_lo16:
   case ARM::fixup_t2_movw_lo16_pcrel:
-    RelocType = unsigned(macho::RIT_ARM_Half);
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
     Log2Size = 2;
     return true;
   }
@@ -153,7 +152,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
                                  uint64_t &FixedValue) {
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Type = macho::RIT_ARM_Half;
+  unsigned Type = MachO::ARM_RELOC_HALF;
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
@@ -179,7 +178,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
                          "' can not be undefined in a subtraction expression");
 
     // Select the appropriate difference relocation type.
-    Type = macho::RIT_ARM_HalfDifference;
+    Type = MachO::ARM_RELOC_HALF_SECTDIFF;
     Value2 = Writer->getSymbolAddress(B_SD, Layout);
     FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
   }
@@ -223,29 +222,29 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
     break;
   }
 
-  if (Type == macho::RIT_ARM_HalfDifference) {
+  if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) {
     uint32_t OtherHalf = MovtBit
       ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16);
 
-    macho::RelocationEntry MRE;
-    MRE.Word0 = ((OtherHalf       <<  0) |
-                 (macho::RIT_Pair << 24) |
-                 (MovtBit         << 28) |
-                 (ThumbBit        << 29) |
-                 (IsPCRel         << 30) |
-                 macho::RF_Scattered);
-    MRE.Word1 = Value2;
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((OtherHalf             <<  0) |
+                   (MachO::ARM_RELOC_PAIR << 24) |
+                   (MovtBit               << 28) |
+                   (ThumbBit              << 29) |
+                   (IsPCRel               << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
     Writer->addRelocation(Fragment->getParent(), MRE);
   }
 
-  macho::RelocationEntry MRE;
-  MRE.Word0 = ((FixupOffset <<  0) |
-               (Type        << 24) |
-               (MovtBit     << 28) |
-               (ThumbBit    << 29) |
-               (IsPCRel     << 30) |
-               macho::RF_Scattered);
-  MRE.Word1 = Value;
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (MovtBit     << 28) |
+                 (ThumbBit    << 29) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
@@ -259,7 +258,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
                                                     uint64_t &FixedValue) {
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Type = macho::RIT_Vanilla;
+  unsigned Type = MachO::ARM_RELOC_VANILLA;
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
@@ -284,31 +283,31 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
                          "' can not be undefined in a subtraction expression");
 
     // Select the appropriate difference relocation type.
-    Type = macho::RIT_Difference;
+    Type = MachO::ARM_RELOC_SECTDIFF;
     Value2 = Writer->getSymbolAddress(B_SD, Layout);
     FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
   }
 
   // Relocations are written out in reverse order, so the PAIR comes first.
-  if (Type == macho::RIT_Difference ||
-      Type == macho::RIT_Generic_LocalDifference) {
-    macho::RelocationEntry MRE;
-    MRE.Word0 = ((0         <<  0) |
-                 (macho::RIT_Pair  << 24) |
-                 (Log2Size  << 28) |
-                 (IsPCRel   << 30) |
-                 macho::RF_Scattered);
-    MRE.Word1 = Value2;
+  if (Type == MachO::ARM_RELOC_SECTDIFF ||
+      Type == MachO::ARM_RELOC_LOCAL_SECTDIFF) {
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((0                     <<  0) |
+                   (MachO::ARM_RELOC_PAIR << 24) |
+                   (Log2Size              << 28) |
+                   (IsPCRel               << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
     Writer->addRelocation(Fragment->getParent(), MRE);
   }
 
-  macho::RelocationEntry MRE;
-  MRE.Word0 = ((FixupOffset <<  0) |
-               (Type        << 24) |
-               (Log2Size    << 28) |
-               (IsPCRel     << 30) |
-               macho::RF_Scattered);
-  MRE.Word1 = Value;
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (Log2Size    << 28) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
@@ -326,13 +325,13 @@ bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
   switch (RelocType) {
   default:
     return false;
-  case macho::RIT_ARM_Branch24Bit:
+  case MachO::ARM_RELOC_BR24:
     // PC pre-adjustment of 8 for these instructions.
     Value -= 8;
     // ARM BL/BLX has a 25-bit offset.
     Range = 0x1ffffff;
     break;
-  case macho::RIT_ARM_ThumbBranch22Bit:
+  case MachO::ARM_THUMB_RELOC_BR22:
     // PC pre-adjustment of 4 for these instructions.
     Value -= 4;
     // Thumb BL/BLX has a 24-bit offset.
@@ -361,7 +360,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
                                            uint64_t &FixedValue) {
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
   unsigned Log2Size;
-  unsigned RelocType = macho::RIT_Vanilla;
+  unsigned RelocType = MachO::ARM_RELOC_VANILLA;
   if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size))
     // If we failed to get fixup kind info, it's because there's no legal
     // relocation type for the fixup kind. This happens when it's a fixup that's
@@ -374,7 +373,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
   // scattered relocation entry.  Differences always require scattered
   // relocations.
   if (Target.getSymB()) {
-    if (RelocType == macho::RIT_ARM_Half)
+    if (RelocType == MachO::ARM_RELOC_HALF)
       return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment,
                                               Fixup, Target, FixedValue);
     return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
@@ -392,7 +391,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
   //
   // Is this right for ARM?
   uint32_t Offset = Target.getConstant();
-  if (IsPCRel && RelocType == macho::RIT_Vanilla)
+  if (IsPCRel && RelocType == MachO::ARM_RELOC_VANILLA)
     Offset += 1 << Log2Size;
   if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
     return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
@@ -445,17 +444,17 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
   }
 
   // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = FixupOffset;
-  MRE.Word1 = ((Index     <<  0) |
-               (IsPCRel   << 24) |
-               (Log2Size  << 25) |
-               (IsExtern  << 27) |
-               (Type      << 28));
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
 
   // Even when it's not a scattered relocation, movw/movt always uses
   // a PAIR relocation.
-  if (Type == macho::RIT_ARM_Half) {
+  if (Type == MachO::ARM_RELOC_HALF) {
     // The other-half value only gets populated for the movt and movw
     // relocation entries.
     uint32_t Value = 0;
@@ -474,11 +473,11 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
       Value = FixedValue & 0xffff;
       break;
     }
-    macho::RelocationEntry MREPair;
-    MREPair.Word0 = Value;
-    MREPair.Word1 = ((0xffffff) |
-                     (Log2Size << 25) |
-                     (macho::RIT_Pair << 28));
+    MachO::any_relocation_info MREPair;
+    MREPair.r_word0 = Value;
+    MREPair.r_word1 = ((0xffffff              <<  0) |
+                       (Log2Size              << 25) |
+                       (MachO::ARM_RELOC_PAIR << 28));
 
     Writer->addRelocation(Fragment->getParent(), MREPair);
   }
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index db49db8..cfb33f5 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -127,7 +127,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
     case ARM::LR:
       if (Reg == FramePtr)
         FramePtrSpillFI = FI;
-      AFI->addGPRCalleeSavedArea1Frame(FI);
       GPRCS1Size += 4;
       break;
     case ARM::R8:
@@ -136,16 +135,12 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
     case ARM::R11:
       if (Reg == FramePtr)
         FramePtrSpillFI = FI;
-      if (STI.isTargetIOS()) {
-        AFI->addGPRCalleeSavedArea2Frame(FI);
+      if (STI.isTargetIOS())
         GPRCS2Size += 4;
-      } else {
-        AFI->addGPRCalleeSavedArea1Frame(FI);
+      else
         GPRCS1Size += 4;
-      }
       break;
     default:
-      AFI->addDPRCalleeSavedAreaFrame(FI);
       DPRCSSize += 8;
     }
   }
@@ -169,10 +164,17 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
   AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
   NumBytes = DPRCSOffset;
 
+  int FramePtrOffsetInBlock = 0;
+  if (tryFoldSPUpdateIntoPushPop(MF, prior(MBBI), NumBytes)) {
+    FramePtrOffsetInBlock = NumBytes;
+    NumBytes = 0;
+  }
+
   // Adjust FP so it point to the stack slot that contains the previous FP.
   if (HasFP) {
+    FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
     AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
-      .addFrameIndex(FramePtrSpillFI).addImm(0)
+      .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
       .setMIFlags(MachineInstr::FrameSetup));
     if (NumBytes > 508)
       // If offset is > 508 then sp cannot be adjusted in a single instruction,
@@ -213,13 +215,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
     AFI->setShouldRestoreSPFromFP(true);
 }
 
-static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
-  for (unsigned i = 0; CSRegs[i]; ++i)
-    if (Reg == CSRegs[i])
-      return true;
-  return false;
-}
-
 static bool isCSRestore(MachineInstr *MI, const uint16_t *CSRegs) {
   if (MI->getOpcode() == ARM::tLDRspi &&
       MI->getOperand(1).isFI() &&
@@ -296,8 +291,9 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
           &MBB.front() != MBBI &&
           prior(MBBI)->getOpcode() == ARM::tPOP) {
         MachineBasicBlock::iterator PMBBI = prior(MBBI);
-        emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
-      } else
+        if (!tryFoldSPUpdateIntoPushPop(MF, PMBBI, NumBytes))
+          emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+      } else if (!tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes))
         emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
     }
   }
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 6722614..65a7221 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -426,7 +426,7 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
                                 *this);
     } else {
       // Translate r0 = add sp, -imm to
-      // r0 = -imm (this is then translated into a series of instructons)
+      // r0 = -imm (this is then translated into a series of instructions)
       // r0 = add r0, sp
       emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
 
@@ -573,11 +573,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
                MF.getFrameInfo()->getStackSize() + SPAdj;
 
-  if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
-    Offset -= AFI->getGPRCalleeSavedArea1Offset();
-  else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
-    Offset -= AFI->getGPRCalleeSavedArea2Offset();
-  else if (MF.getFrameInfo()->hasVarSizedObjects()) {
+  if (MF.getFrameInfo()->hasVarSizedObjects()) {
     assert(SPAdj == 0 && MF.getTarget().getFrameLowering()->hasFP(MF) &&
            "Unexpected");
     // There are alloca()'s in this function, must reference off the frame
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index d8596d7..0b7d3bb 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -28,6 +28,7 @@ namespace {
     static char ID;
     Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
 
+    bool restrictIT;
     const Thumb2InstrInfo *TII;
     const TargetRegisterInfo *TRI;
     ARMFunctionInfo *AFI;
@@ -192,37 +193,42 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
     // Form IT block.
     ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
     unsigned Mask = 0, Pos = 3;
-    // Branches, including tricky ones like LDM_RET, need to end an IT
-    // block so check the instruction we just put in the block.
-    for (; MBBI != E && Pos &&
-           (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
-      if (MBBI->isDebugValue())
-        continue;
-
-      MachineInstr *NMI = &*MBBI;
-      MI = NMI;
-
-      unsigned NPredReg = 0;
-      ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg);
-      if (NCC == CC || NCC == OCC) {
-        Mask |= (NCC & 1) << Pos;
-        // Add implicit use of ITSTATE.
-        NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
-                                               true/*isImp*/, false/*isKill*/));
-        LastITMI = NMI;
-      } else {
-        if (NCC == ARMCC::AL &&
-            MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) {
-          --MBBI;
-          MBB.remove(NMI);
-          MBB.insert(InsertPos, NMI);
-          ++NumMovedInsts;
+
+    // v8 IT blocks are limited to one conditional op unless -arm-no-restrict-it
+    // is set: skip the loop
+    if (!restrictIT) {
+      // Branches, including tricky ones like LDM_RET, need to end an IT
+      // block so check the instruction we just put in the block.
+      for (; MBBI != E && Pos &&
+             (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
+        if (MBBI->isDebugValue())
           continue;
+
+        MachineInstr *NMI = &*MBBI;
+        MI = NMI;
+
+        unsigned NPredReg = 0;
+        ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg);
+        if (NCC == CC || NCC == OCC) {
+          Mask |= (NCC & 1) << Pos;
+          // Add implicit use of ITSTATE.
+          NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+                                                 true/*isImp*/, false/*isKill*/));
+          LastITMI = NMI;
+        } else {
+          if (NCC == ARMCC::AL &&
+              MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) {
+            --MBBI;
+            MBB.remove(NMI);
+            MBB.insert(InsertPos, NMI);
+            ++NumMovedInsts;
+            continue;
+          }
+          break;
         }
-        break;
+        TrackDefUses(NMI, Defs, Uses, TRI);
+        --Pos;
       }
-      TrackDefUses(NMI, Defs, Uses, TRI);
-      --Pos;
     }
 
     // Finalize IT mask.
@@ -250,6 +256,7 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
   AFI = Fn.getInfo<ARMFunctionInfo>();
   TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
   TRI = TM.getRegisterInfo();
+  restrictIT = TM.getSubtarget<ARMSubtarget>().restrictIT();
 
   if (!AFI->isThumbFunction())
     return false;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 286eaa0..91788ac 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -36,7 +36,8 @@ Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
 
 /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
 void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
-  NopInst.setOpcode(ARM::tNOP);
+  NopInst.setOpcode(ARM::tHINT);
+  NopInst.addOperand(MCOperand::CreateImm(0));
   NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
   NopInst.addOperand(MCOperand::CreateReg(0));
 }
@@ -214,6 +215,13 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
                                unsigned DestReg, unsigned BaseReg, int NumBytes,
                                ARMCC::CondCodes Pred, unsigned PredReg,
                                const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+  if (NumBytes == 0 && DestReg != BaseReg) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+      .addReg(BaseReg, RegState::Kill)
+      .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+    return;
+  }
+
   bool isSub = NumBytes < 0;
   if (isSub) NumBytes = -NumBytes;
 
@@ -334,6 +342,7 @@ negativeOffsetOpcode(unsigned opcode)
   case ARM::t2STRi12:   return ARM::t2STRi8;
   case ARM::t2STRBi12:  return ARM::t2STRBi8;
   case ARM::t2STRHi12:  return ARM::t2STRHi8;
+  case ARM::t2PLDi12:   return ARM::t2PLDi8;
 
   case ARM::t2LDRi8:
   case ARM::t2LDRHi8:
@@ -343,6 +352,7 @@ negativeOffsetOpcode(unsigned opcode)
   case ARM::t2STRi8:
   case ARM::t2STRBi8:
   case ARM::t2STRHi8:
+  case ARM::t2PLDi8:
     return opcode;
 
   default:
@@ -364,6 +374,7 @@ positiveOffsetOpcode(unsigned opcode)
   case ARM::t2STRi8:   return ARM::t2STRi12;
   case ARM::t2STRBi8:  return ARM::t2STRBi12;
   case ARM::t2STRHi8:  return ARM::t2STRHi12;
+  case ARM::t2PLDi8:   return ARM::t2PLDi12;
 
   case ARM::t2LDRi12:
   case ARM::t2LDRHi12:
@@ -373,6 +384,7 @@ positiveOffsetOpcode(unsigned opcode)
   case ARM::t2STRi12:
   case ARM::t2STRBi12:
   case ARM::t2STRHi12:
+  case ARM::t2PLDi12:
     return opcode;
 
   default:
@@ -394,6 +406,7 @@ immediateOffsetOpcode(unsigned opcode)
   case ARM::t2STRs:   return ARM::t2STRi12;
   case ARM::t2STRBs:  return ARM::t2STRBi12;
   case ARM::t2STRHs:  return ARM::t2STRHi12;
+  case ARM::t2PLDs:   return ARM::t2PLDi12;
 
   case ARM::t2LDRi12:
   case ARM::t2LDRHi12:
@@ -403,6 +416,7 @@ immediateOffsetOpcode(unsigned opcode)
   case ARM::t2STRi12:
   case ARM::t2STRBi12:
   case ARM::t2STRHi12:
+  case ARM::t2PLDi12:
   case ARM::t2LDRi8:
   case ARM::t2LDRHi8:
   case ARM::t2LDRBi8:
@@ -411,6 +425,7 @@ immediateOffsetOpcode(unsigned opcode)
   case ARM::t2STRi8:
   case ARM::t2STRBi8:
   case ARM::t2STRHi8:
+  case ARM::t2PLDi8:
     return opcode;
 
   default:
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 0ddcad2..ddc7a66 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 #include <algorithm>
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <set>
@@ -291,8 +292,6 @@ void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
     Out << "GlobalValue::LinkOnceAnyLinkage "; break;
   case GlobalValue::LinkOnceODRLinkage:
     Out << "GlobalValue::LinkOnceODRLinkage "; break;
-  case GlobalValue::LinkOnceODRAutoHideLinkage:
-    Out << "GlobalValue::LinkOnceODRAutoHideLinkage"; break;
   case GlobalValue::WeakAnyLinkage:
     Out << "GlobalValue::WeakAnyLinkage"; break;
   case GlobalValue::WeakODRLinkage:
@@ -497,6 +496,7 @@ void CppWriter::printAttributes(const AttributeSet &PAL,
       HANDLE_ATTR(ReadOnly);
       HANDLE_ATTR(NoInline);
       HANDLE_ATTR(AlwaysInline);
+      HANDLE_ATTR(OptimizeNone);
       HANDLE_ATTR(OptimizeForSize);
       HANDLE_ATTR(StackProtect);
       HANDLE_ATTR(StackProtectReq);
@@ -1139,7 +1139,7 @@ void CppWriter::printInstruction(const Instruction *I,
     nl(Out);
     for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end();
          i != e; ++i) {
-      const IntegersSubset CaseVal = i.getCaseValueEx();
+      const ConstantInt* CaseVal = i.getCaseValue();
       const BasicBlock *BB = i.getCaseSuccessor();
       Out << iName << "->addCase("
           << getOpName(CaseVal) << ", "
@@ -1160,8 +1160,7 @@ void CppWriter::printInstruction(const Instruction *I,
     break;
   }
   case Instruction::Resume: {
-    Out << "ResumeInst::Create(mod->getContext(), " << opNames[0]
-        << ", " << bbname << ");";
+    Out << "ResumeInst::Create(" << opNames[0] << ", " << bbname << ");";
     break;
   }
   case Instruction::Invoke: {
@@ -1175,7 +1174,7 @@ void CppWriter::printInstruction(const Instruction *I,
     }
     // FIXME: This shouldn't use magic numbers -3, -2, and -1.
     Out << "InvokeInst *" << iName << " = InvokeInst::Create("
-        << getOpName(inv->getCalledFunction()) << ", "
+        << getOpName(inv->getCalledValue()) << ", "
         << getOpName(inv->getNormalDest()) << ", "
         << getOpName(inv->getUnwindDest()) << ", "
         << iName << "_params, \"";
@@ -1589,6 +1588,20 @@ void CppWriter::printInstruction(const Instruction *I,
     Out << "\");";
     break;
   }
+  case Instruction::LandingPad: {
+    const LandingPadInst *lpi = cast<LandingPadInst>(I);
+    Out << "LandingPadInst* " << iName << " = LandingPadInst::Create(";
+    printCppName(lpi->getType());
+    Out << ", " << opNames[0] << ", " << lpi->getNumClauses() << ", \"";
+    printEscapedString(lpi->getName());
+    Out << "\", " << bbname << ");";
+    nl(Out) << iName << "->setCleanup("
+            << (lpi->isCleanup() ? "true" : "false")
+            << ");";
+    for (unsigned i = 0, e = lpi->getNumClauses(); i != e; ++i)
+      nl(Out) << iName << "->addClause(" << opNames[i+1] << ");";
+    break;
+  }
   }
   DefinedValues.insert(I);
   nl(Out);
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 2b79791..ae3c9eb 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_target(HexagonCodeGen
   HexagonFrameLowering.cpp
   HexagonHardwareLoops.cpp
   HexagonFixupHwLoops.cpp
+  HexagonMachineFunctionInfo.cpp
   HexagonMachineScheduler.cpp
   HexagonMCInstLower.cpp
   HexagonInstrInfo.cpp
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 88cd3fb..a2e04ba 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -99,7 +99,7 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
     return;
   case MachineOperand::MO_GlobalAddress:
     // Computing the address of a global symbol, not calling it.
-    O << *Mang->getSymbol(MO.getGlobal());
+    O << *getSymbol(MO.getGlobal());
     printOffset(MO.getOffset(), O);
     return;
   }
@@ -267,7 +267,7 @@ void HexagonAsmPrinter::printGlobalOperand(const MachineInstr *MI, int OpNo,
   assert( (MO.getType() == MachineOperand::MO_GlobalAddress) &&
          "Expecting global address");
 
-  O << *Mang->getSymbol(MO.getGlobal());
+  O << *getSymbol(MO.getGlobal());
   if (MO.getOffset() != 0) {
     O << " + ";
     O << MO.getOffset();
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 3c4ca0f..52d5ab2 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -871,7 +871,7 @@ bool HexagonHardwareLoops::isInvalidLoopOperation(
 /// \brief - Return true if the loop contains an instruction that inhibits
 /// the use of the hardware loop function.
 bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L) const {
-  const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
+  const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
     MachineBasicBlock *MBB = Blocks[i];
     for (MachineBasicBlock::iterator
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 9e78e51..5ae9328 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1344,8 +1344,10 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
 
 
 SDNode *HexagonDAGToDAGISel::Select(SDNode *N) {
-  if (N->isMachineOpcode())
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL;   // Already selected.
+  }
 
 
   switch (N->getOpcode()) {
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 567faca..1374179 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -39,13 +39,24 @@
 
 using namespace llvm;
 
-const unsigned Hexagon_MAX_RET_SIZE = 64;
-
 static cl::opt<bool>
 EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden,
                cl::desc("Control jump table emission on Hexagon target"));
 
-int NumNamedVarArgParams = -1;
+namespace {
+class HexagonCCState : public CCState {
+  int NumNamedVarArgParams;
+
+public:
+  HexagonCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+                 const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
+                 LLVMContext &C, int NumNamedVarArgParams)
+      : CCState(CC, isVarArg, MF, TM, locs, C),
+        NumNamedVarArgParams(NumNamedVarArgParams) {}
+
+  int getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
+};
+}
 
 // Implement calling convention for Hexagon.
 static bool
@@ -82,12 +93,13 @@ static bool
 CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
             MVT LocVT, CCValAssign::LocInfo LocInfo,
             ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  HexagonCCState &HState = static_cast<HexagonCCState &>(State);
 
   // NumNamedVarArgParams can not be zero for a VarArg function.
-  assert ( (NumNamedVarArgParams > 0) &&
-           "NumNamedVarArgParams is not bigger than zero.");
+  assert((HState.getNumNamedVarArgParams() > 0) &&
+         "NumNamedVarArgParams is not bigger than zero.");
 
-  if ( (int)ValNo < NumNamedVarArgParams ) {
+  if ((int)ValNo < HState.getNumNamedVarArgParams()) {
     // Deal with named arguments.
     return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
   }
@@ -394,13 +406,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
 
-  // Analyze operands of the call, assigning locations to each operand.
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-
   // Check for varargs.
-  NumNamedVarArgParams = -1;
+  int NumNamedVarArgParams = -1;
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee))
   {
     const Function* CalleeFn = NULL;
@@ -417,6 +424,12 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     }
   }
 
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  HexagonCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                        getTargetMachine(), ArgLocs, *DAG.getContext(),
+                        NumNamedVarArgParams);
+
   if (NumNamedVarArgParams > 0)
     CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
   else
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 4fe0107..73da226 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -141,8 +141,11 @@ namespace llvm {
 
     SDValue  LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
     SDValue  LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
-    virtual EVT getSetCCResultType(LLVMContext &, EVT) const {
-      return MVT::i1;
+    virtual EVT getSetCCResultType(LLVMContext &C, EVT VT) const {
+      if (!VT.isVector())
+        return MVT::i1;
+      else
+        return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
     }
 
     virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index e71386a..d25bfa8 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -63,7 +63,7 @@ class MemAccessSize<bits<3> value> {
 def NoMemAccess      : MemAccessSize<0>;// Not a memory acces instruction.
 def ByteAccess       : MemAccessSize<1>;// Byte access instruction (memb).
 def HalfWordAccess   : MemAccessSize<2>;// Half word access instruction (memh).
-def WordAccess       : MemAccessSize<3>;// Word access instrution (memw).
+def WordAccess       : MemAccessSize<3>;// Word access instruction (memw).
 def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd)
 
 
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 5af645c..6b97609 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -26,7 +26,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #define GET_INSTRMAP_INFO
 #include "HexagonGenInstrInfo.inc"
 #include "HexagonGenDFAPacketizer.inc"
@@ -55,6 +55,8 @@ const int Hexagon_MEMH_AUTOINC_MIN = -16;
 const int Hexagon_MEMB_AUTOINC_MAX = 7;
 const int Hexagon_MEMB_AUTOINC_MIN = -8;
 
+// Pin the vtable to this file.
+void HexagonInstrInfo::anchor() {}
 
 HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
   : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 3c28df4..3f45b8b 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -26,6 +26,7 @@
 namespace llvm {
 
 class HexagonInstrInfo : public HexagonGenInstrInfo {
+  virtual void anchor();
   const HexagonRegisterInfo RI;
   const HexagonSubtarget &Subtarget;
   typedef unsigned Opcode_t;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index fee83fb..475c23d 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -3390,4 +3390,3 @@ def : Pat<(i32 (load FoldGlobalAddrGP:$addr)),
 def : Pat<(atomic_load_32 FoldGlobalAddrGP:$addr),
           (i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>,
            Requires<[HasV4T]>;
-
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index f011d51..bbb2fa4 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -73,7 +73,7 @@ void llvm::HexagonLowerToMC(const MachineInstr* MI, HexagonMCInst& MCI,
                AP.OutContext));
       break;
     case MachineOperand::MO_GlobalAddress:
-      MCO = GetSymbolRef(MO, AP.Mang->getSymbol(MO.getGlobal()), AP);
+      MCO = GetSymbolRef(MO, AP.getSymbol(MO.getGlobal()), AP);
       break;
     case MachineOperand::MO_ExternalSymbol:
       MCO = GetSymbolRef(MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()),
diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp b/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
new file mode 100644
index 0000000..9579c8b
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
@@ -0,0 +1,16 @@
+//= HexagonMachineFunctionInfo.cpp - Hexagon machine function info *- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMachineFunctionInfo.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+void HexagonMachineFunctionInfo::anchor() {}
+
diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
index bd7b26a..a59c8c9 100644
--- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//=- HexagonMachineFuctionInfo.h - Hexagon machine function info --*- C++ -*-=//
+//=- HexagonMachineFunctionInfo.h - Hexagon machine function info -*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,6 +10,7 @@
 #ifndef HexagonMACHINEFUNCTIONINFO_H
 #define HexagonMACHINEFUNCTIONINFO_H
 
+#include <map>
 #include "llvm/CodeGen/MachineFunction.h"
 
 namespace llvm {
@@ -30,9 +31,8 @@ class HexagonMachineFunctionInfo : public MachineFunctionInfo {
   int VarArgsFrameIndex;
   bool HasClobberLR;
   bool HasEHReturn;
-
   std::map<const MachineInstr*, unsigned> PacketInfo;
-
+  virtual void anchor();
 
 public:
   HexagonMachineFunctionInfo() : SRetReturnReg(0), HasClobberLR(0),
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 10bb3e9..c94f081 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -407,11 +407,11 @@ SUnit *ConvergingVLIWScheduler::SchedBoundary::pickOnlyChoice() {
 #ifndef NDEBUG
 void ConvergingVLIWScheduler::traceCandidate(const char *Label,
                                              const ReadyQueue &Q,
-                                             SUnit *SU, PressureElement P) {
+                                             SUnit *SU, PressureChange P) {
   dbgs() << Label << " " << Q.getName() << " ";
   if (P.isValid())
-    dbgs() << DAG->TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
-           << " ";
+    dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":"
+           << P.getUnitInc() << " ";
   else
     dbgs() << "     ";
   SU->dump(DAG);
@@ -457,9 +457,7 @@ static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
 // Constants used to denote relative importance of
 // heuristic components for cost computation.
 static const unsigned PriorityOne = 200;
-static const unsigned PriorityTwo = 100;
-static const unsigned PriorityThree = 50;
-static const unsigned PriorityFour = 20;
+static const unsigned PriorityTwo = 50;
 static const unsigned ScaleTwo = 10;
 static const unsigned FactorOne = 2;
 
@@ -517,8 +515,8 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
   ResCount += (NumNodesBlocking * ScaleTwo);
 
   // Factor in reg pressure as a heuristic.
-  ResCount -= (Delta.Excess.UnitIncrease*PriorityThree);
-  ResCount -= (Delta.CriticalMax.UnitIncrease*PriorityThree);
+  ResCount -= (Delta.Excess.getUnitInc()*PriorityTwo);
+  ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityTwo);
 
   DEBUG(if (verbose) dbgs() << " Total(" << ResCount << ")");
 
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 171193e..8ac333f 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -233,7 +233,7 @@ protected:
                                SchedCandidate &Candidate);
 #ifndef NDEBUG
   void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
-                      PressureElement P = PressureElement());
+                      PressureChange P = PressureChange());
 #endif
 };
 
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 89e3406..5490ecd 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -29,7 +29,7 @@
 //
 // Note: The peephole pass makes the instrucstions like
 // %vreg170<def> = SXTW %vreg166 or %vreg16<def> = NOT_p %vreg15<kill>
-// redundant and relies on some form of dead removal instrucions, like
+// redundant and relies on some form of dead removal instructions, like
 // DCE or DIE to actually eliminate them.
 
 
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index d5ca4d7..1786e9d 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -295,13 +295,5 @@ unsigned HexagonRegisterInfo::getStackRegister() const {
   return Hexagon::R29;
 }
 
-unsigned HexagonRegisterInfo::getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned HexagonRegisterInfo::getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
-
 #define GET_REGINFO_TARGET_DESC
 #include "HexagonGenRegisterInfo.inc"
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index fec86df..89af7c3 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -78,10 +78,6 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
   unsigned getFrameRegister(const MachineFunction &MF) const;
   unsigned getFrameRegister() const;
   unsigned getStackRegister() const;
-
-  // Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 3bf2f20..5166f8e 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -16,33 +16,31 @@
 // scheduled after register allocation.
 //
 //===----------------------------------------------------------------------===//
+
 #define DEBUG_TYPE "xfer"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+#include "HexagonTargetMachine.h"
+#include "HexagonSubtarget.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LatencyPriorityQueue.h"
-#include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "HexagonTargetMachine.h"
-#include "HexagonSubtarget.h"
-#include "HexagonMachineFunctionInfo.h"
 #include <map>
-#include <iostream>
-
-#include "llvm/Support/CommandLine.h"
-#define DEBUG_TYPE "xfer"
-
 
 using namespace llvm;
 
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 07d5ce1..fca6707 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -86,3 +86,5 @@ HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
     ModeIEEERndNear = false;
 }
 
+// Pin the vtable to this file.
+void HexagonSubtarget::anchor() {}
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 76a8fba..690bef0 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -27,7 +27,7 @@
 namespace llvm {
 
 class HexagonSubtarget : public HexagonGenSubtargetInfo {
-
+  virtual void anchor();
   bool UseMemOps;
   bool ModeIEEERndNear;
 
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index cd96b58..bb950a0 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -102,17 +102,25 @@ class HexagonPassConfig : public TargetPassConfig {
 public:
   HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {
-    // Enable MI scheduler.
-    if (!DisableHexagonMISched) {
+    // FIXME: Rather than calling enablePass(&MachineSchedulerID) below, define
+    // HexagonSubtarget::enableMachineScheduler() { return true; }.
+    // That will bypass the SelectionDAG VLIW scheduler, which is probably just
+    // hurting compile time and will be removed eventually anyway.
+    if (DisableHexagonMISched)
+      disablePass(&MachineSchedulerID);
+    else
       enablePass(&MachineSchedulerID);
-      MachineSchedRegistry::setDefault(createVLIWMachineSched);
-    }
   }
 
   HexagonTargetMachine &getHexagonTargetMachine() const {
     return getTM<HexagonTargetMachine>();
   }
 
+  virtual ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const {
+    return createVLIWMachineSched(C);
+  }
+
   virtual bool addInstSelector();
   virtual bool addPreRegAlloc();
   virtual bool addPostRegAlloc();
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
index 2ea0d2e..7c41507 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -21,7 +21,6 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/Support/raw_ostream.h"
-#include <cstdio>
 
 using namespace llvm;
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index e0f5a27..8519cf3 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -73,7 +73,7 @@ namespace HexagonII {
     NoMemAccess = 0,            // Not a memory acces instruction.
     ByteAccess = 1,             // Byte access instruction (memb).
     HalfWordAccess = 2,         // Half word access instruction (memh).
-    WordAccess = 3,             // Word access instrution (memw).
+    WordAccess = 3,             // Word access instruction (memw).
     DoubleWordAccess = 4        // Double word access instruction (memd)
   };
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index 495dbb9..3f9415b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -15,6 +15,9 @@
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void HexagonMCAsmInfo::anchor() {}
+
 HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
   Data16bitsDirective = "\t.half\t";
   Data32bitsDirective = "\t.word\t";
@@ -29,7 +32,6 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
   InlineAsmEnd = "# InlineAsm End";
   ZeroDirective = "\t.space\t";
   AscizDirective = "\t.string\t";
-  WeakRefDirective = "\t.weak\t";
 
   SupportsDebugInformation = true;
   UsesELFSectionDirectiveForBSS  = true;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index 0b94d21..bd8cb76 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -15,10 +15,11 @@
 #define HexagonMCASMINFO_H
 
 #include "llvm/ADT/StringRef.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
-  class HexagonMCAsmInfo : public MCAsmInfo {
+  class HexagonMCAsmInfo : public MCAsmInfoELF {
+    virtual void anchor();
   public:
     explicit HexagonMCAsmInfo(StringRef TT);
   };
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
index d213a45..acf2ab8 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -21,11 +21,8 @@ MSP430MCAsmInfo::MSP430MCAsmInfo(StringRef TT) {
   PointerSize = CalleeSaveStackSlotSize = 2;
 
   PrivateGlobalPrefix = ".L";
-  WeakRefDirective ="\t.weak\t";
-  PCSymbol=".";
   CommentString = ";";
 
   AlignmentIsInBytes = false;
-  AllowNameToStartWithDigit = true;
   UsesELFSectionDirectiveForBSS = true;
 }
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
index feb040d..a7e0e58 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -14,12 +14,12 @@
 #ifndef MSP430TARGETASMINFO_H
 #define MSP430TARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
   class StringRef;
 
-  class MSP430MCAsmInfo : public MCAsmInfo {
+  class MSP430MCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
     explicit MSP430MCAsmInfo(StringRef TT);
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 0a04e5d..18311c3 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -92,7 +92,7 @@ void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     if (Offset)
       O << '(' << Offset << '+';
 
-    O << *Mang->getSymbol(MO.getGlobal());
+    O << *getSymbol(MO.getGlobal());
 
     if (Offset)
       O << ')';
diff --git a/lib/Target/MSP430/MSP430CallingConv.td b/lib/Target/MSP430/MSP430CallingConv.td
index b448cc4..8a69d1e 100644
--- a/lib/Target/MSP430/MSP430CallingConv.td
+++ b/lib/Target/MSP430/MSP430CallingConv.td
@@ -23,18 +23,15 @@ def RetCC_MSP430 : CallingConv<[
 //===----------------------------------------------------------------------===//
 // MSP430 Argument Calling Conventions
 //===----------------------------------------------------------------------===//
-def CC_MSP430 : CallingConv<[
+def CC_MSP430_AssignStack : CallingConv<[
   // Pass by value if the byval attribute is given
   CCIfByVal<CCPassByVal<2, 2>>,
 
   // Promote i8 arguments to i16.
   CCIfType<[i8], CCPromoteToType<i16>>,
 
-  // The first 4 integer arguments of non-varargs functions are passed in
-  // integer registers.
-  CCIfNotVarArg<CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>>,
-
   // Integer values get stored in stack slots that are 2 bytes in
   // size and 2-byte aligned.
   CCIfType<[i16], CCAssignToStack<2, 2>>
 ]>;
+
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index c673f59..8370714 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -27,8 +27,8 @@ protected:
 
 public:
   explicit MSP430FrameLowering(const MSP430Subtarget &sti)
-    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2), STI(sti) {
-  }
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2),
+      STI(sti) {}
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 543f54c..4152829 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -395,6 +395,7 @@ SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
     DEBUG(errs() << "== ";
           Node->dump(CurDAG);
           errs() << "\n");
+    Node->setNodeId(-1);
     return NULL;
   }
 
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 803e899..745cdf5 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -45,7 +45,7 @@ typedef enum {
 } HWMultUseMode;
 
 static cl::opt<HWMultUseMode>
-HWMultMode("msp430-hwmult-mode",
+HWMultMode("msp430-hwmult-mode", cl::Hidden,
            cl::desc("Hardware multiplier use mode"),
            cl::init(HWMultNoIntr),
            cl::values(
@@ -250,6 +250,123 @@ getRegForInlineAsmConstraint(const std::string &Constraint,
 
 #include "MSP430GenCallingConv.inc"
 
+/// For each argument in a function store the number of pieces it is composed
+/// of.
+template<typename ArgT>
+static void ParseFunctionArgs(const SmallVectorImpl<ArgT> &Args,
+                              SmallVectorImpl<unsigned> &Out) {
+  unsigned CurrentArgIndex = ~0U;
+  for (unsigned i = 0, e = Args.size(); i != e; i++) {
+    if (CurrentArgIndex == Args[i].OrigArgIndex) {
+      Out.back()++;
+    } else {
+      Out.push_back(1);
+      CurrentArgIndex++;
+    }
+  }
+}
+
+static void AnalyzeVarArgs(CCState &State,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  State.AnalyzeCallOperands(Outs, CC_MSP430_AssignStack);
+}
+
+static void AnalyzeVarArgs(CCState &State,
+                           const SmallVectorImpl<ISD::InputArg> &Ins) {
+  State.AnalyzeFormalArguments(Ins, CC_MSP430_AssignStack);
+}
+
+/// Analyze incoming and outgoing function arguments. We need custom C++ code
+/// to handle special constraints in the ABI like reversing the order of the
+/// pieces of splitted arguments. In addition, all pieces of a certain argument
+/// have to be passed either using registers or the stack but never mixing both.
+template<typename ArgT>
+static void AnalyzeArguments(CCState &State,
+                             SmallVectorImpl<CCValAssign> &ArgLocs,
+                             const SmallVectorImpl<ArgT> &Args) {
+  static const uint16_t RegList[] = {
+    MSP430::R15W, MSP430::R14W, MSP430::R13W, MSP430::R12W
+  };
+  static const unsigned NbRegs = array_lengthof(RegList);
+
+  if (State.isVarArg()) {
+    AnalyzeVarArgs(State, Args);
+    return;
+  }
+
+  SmallVector<unsigned, 4> ArgsParts;
+  ParseFunctionArgs(Args, ArgsParts);
+
+  unsigned RegsLeft = NbRegs;
+  bool UseStack = false;
+  unsigned ValNo = 0;
+
+  for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) {
+    MVT ArgVT = Args[ValNo].VT;
+    ISD::ArgFlagsTy ArgFlags = Args[ValNo].Flags;
+    MVT LocVT = ArgVT;
+    CCValAssign::LocInfo LocInfo = CCValAssign::Full;
+
+    // Promote i8 to i16
+    if (LocVT == MVT::i8) {
+      LocVT = MVT::i16;
+      if (ArgFlags.isSExt())
+          LocInfo = CCValAssign::SExt;
+      else if (ArgFlags.isZExt())
+          LocInfo = CCValAssign::ZExt;
+      else
+          LocInfo = CCValAssign::AExt;
+    }
+
+    // Handle byval arguments
+    if (ArgFlags.isByVal()) {
+      State.HandleByVal(ValNo++, ArgVT, LocVT, LocInfo, 2, 2, ArgFlags);
+      continue;
+    }
+
+    unsigned Parts = ArgsParts[i];
+
+    if (!UseStack && Parts <= RegsLeft) {
+      unsigned FirstVal = ValNo;
+      for (unsigned j = 0; j < Parts; j++) {
+        unsigned Reg = State.AllocateReg(RegList, NbRegs);
+        State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
+        RegsLeft--;
+      }
+
+      // Reverse the order of the pieces to agree with the "big endian" format
+      // required in the calling convention ABI.
+      SmallVectorImpl<CCValAssign>::iterator B = ArgLocs.begin() + FirstVal;
+      std::reverse(B, B + Parts);
+    } else {
+      UseStack = true;
+      for (unsigned j = 0; j < Parts; j++)
+        CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+    }
+  }
+}
+
+static void AnalyzeRetResult(CCState &State,
+                             const SmallVectorImpl<ISD::InputArg> &Ins) {
+  State.AnalyzeCallResult(Ins, RetCC_MSP430);
+}
+
+static void AnalyzeRetResult(CCState &State,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  State.AnalyzeReturn(Outs, RetCC_MSP430);
+}
+
+template<typename ArgT>
+static void AnalyzeReturnValues(CCState &State,
+                                SmallVectorImpl<CCValAssign> &RVLocs,
+                                const SmallVectorImpl<ArgT> &Args) {
+  AnalyzeRetResult(State, Args);
+
+  // Reverse splitted return values to get the "big endian" format required
+  // to agree with the calling convention ABI.
+  std::reverse(RVLocs.begin(), RVLocs.end());
+}
+
 SDValue
 MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
                                            CallingConv::ID CallConv,
@@ -325,7 +442,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
-  CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430);
+  AnalyzeArguments(CCInfo, ArgLocs, Ins);
 
   // Create frame index for the start of the first vararg value
   if (isVarArg) {
@@ -423,7 +540,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
                  getTargetMachine(), RVLocs, *DAG.getContext());
 
   // Analize return values.
-  CCInfo.AnalyzeReturn(Outs, RetCC_MSP430);
+  AnalyzeReturnValues(CCInfo, RVLocs, Outs);
 
   SDValue Flag;
   SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -471,8 +588,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
-
-  CCInfo.AnalyzeCallOperands(Outs, CC_MSP430);
+  AnalyzeArguments(CCInfo, ArgLocs, Outs);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -610,7 +726,7 @@ MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), RVLocs, *DAG.getContext());
 
-  CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430);
+  AnalyzeReturnValues(CCInfo, RVLocs, Ins);
 
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index c850594..7a0b00a 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -22,11 +22,14 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "MSP430GenInstrInfo.inc"
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void MSP430InstrInfo::anchor() {}
+
 MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm)
   : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
     RI(tm) {}
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index d79f992..ad2b8cc 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -42,6 +42,7 @@ namespace MSP430II {
 
 class MSP430InstrInfo : public MSP430GenInstrInfo {
   const MSP430RegisterInfo RI;
+  virtual void anchor();
 public:
   explicit MSP430InstrInfo(MSP430TargetMachine &TM);
 
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index 043e5be..52f9ee5 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -33,7 +33,7 @@ GetGlobalAddressSymbol(const MachineOperand &MO) const {
   case 0: break;
   }
 
-  return Printer.Mang->getSymbol(MO.getGlobal());
+  return Printer.getSymbol(MO.getGlobal());
 }
 
 MCSymbol *MSP430MCInstLower::
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
index f86428c..38be25c 100644
--- a/lib/Target/Mangler.cpp
+++ b/lib/Target/Mangler.cpp
@@ -23,84 +23,6 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-static bool isAcceptableChar(char C, bool AllowPeriod, bool AllowUTF8) {
-  if ((C < 'a' || C > 'z') &&
-      (C < 'A' || C > 'Z') &&
-      (C < '0' || C > '9') &&
-      C != '_' && C != '$' && C != '@' &&
-      !(AllowPeriod && C == '.') &&
-      !(AllowUTF8 && (C & 0x80)))
-    return false;
-  return true;
-}
-
-static char HexDigit(int V) {
-  return V < 10 ? V+'0' : V+'A'-10;
-}
-
-static void MangleLetter(SmallVectorImpl<char> &OutName, unsigned char C) {
-  OutName.push_back('_');
-  OutName.push_back(HexDigit(C >> 4));
-  OutName.push_back(HexDigit(C & 15));
-  OutName.push_back('_');
-}
-
-/// NameNeedsEscaping - Return true if the identifier \p Str needs quotes
-/// for this assembler.
-static bool NameNeedsEscaping(StringRef Str, const MCAsmInfo *MAI) {
-  assert(!Str.empty() && "Cannot create an empty MCSymbol");
-  
-  // If the first character is a number and the target does not allow this, we
-  // need quotes.
-  if (!MAI->doesAllowNameToStartWithDigit() && Str[0] >= '0' && Str[0] <= '9')
-    return true;
-  
-  // If any of the characters in the string is an unacceptable character, force
-  // quotes.
-  bool AllowPeriod = MAI->doesAllowPeriodsInName();
-  bool AllowUTF8 = MAI->doesAllowUTF8();
-  for (unsigned i = 0, e = Str.size(); i != e; ++i)
-    if (!isAcceptableChar(Str[i], AllowPeriod, AllowUTF8))
-      return true;
-  return false;
-}
-
-/// appendMangledName - Add the specified string in mangled form if it uses
-/// any unusual characters.
-static void appendMangledName(SmallVectorImpl<char> &OutName, StringRef Str,
-                              const MCAsmInfo *MAI) {
-  // The first character is not allowed to be a number unless the target
-  // explicitly allows it.
-  if (!MAI->doesAllowNameToStartWithDigit() && Str[0] >= '0' && Str[0] <= '9') {
-    MangleLetter(OutName, Str[0]);
-    Str = Str.substr(1);
-  }
-
-  bool AllowPeriod = MAI->doesAllowPeriodsInName();
-  bool AllowUTF8 = MAI->doesAllowUTF8();
-  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
-    if (!isAcceptableChar(Str[i], AllowPeriod, AllowUTF8))
-      MangleLetter(OutName, Str[i]);
-    else
-      OutName.push_back(Str[i]);
-  }
-}
-
-
-/// appendMangledQuotedName - On systems that support quoted symbols, we still
-/// have to escape some (obscure) characters like " and \n which would break the
-/// assembler's lexing.
-static void appendMangledQuotedName(SmallVectorImpl<char> &OutName,
-                                   StringRef Str) {
-  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
-    if (Str[i] == '"' || Str[i] == '\n')
-      MangleLetter(OutName, Str[i]);
-    else
-      OutName.push_back(Str[i]);
-  }
-}
-
-
 /// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
 /// and the specified name as the global variable name.  GVName must not be
 /// empty.
@@ -111,7 +33,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
   StringRef Name = GVName.toStringRef(TmpData);
   assert(!Name.empty() && "getNameWithPrefix requires non-empty name");
   
-  const MCAsmInfo *MAI = Context.getAsmInfo();
+  const MCAsmInfo *MAI = TM->getMCAsmInfo();
   
   // If the global name is not led with \1, add the appropriate prefixes.
   if (Name[0] == '\1') {
@@ -136,26 +58,9 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
         OutName.append(Prefix, Prefix+strlen(Prefix));
     }
   }
-  
+
   // If this is a simple string that doesn't need escaping, just append it.
-  if (!NameNeedsEscaping(Name, MAI) ||
-      // If quotes are supported, they can be used unless the string contains
-      // a quote or newline.
-      (MAI->doesAllowQuotesInName() &&
-       Name.find_first_of("\n\"") == StringRef::npos)) {
-    OutName.append(Name.begin(), Name.end());
-    return;
-  }
-  
-  // On systems that do not allow quoted names, we need to mangle most
-  // strange characters.
-  if (!MAI->doesAllowQuotesInName())
-    return appendMangledName(OutName, Name, MAI);
-  
-  // Okay, the system allows quoted strings.  We can quote most anything, the
-  // only characters that need escaping are " and \n.
-  assert(Name.find_first_of("\n\"") != StringRef::npos);
-  return appendMangledQuotedName(OutName, Name);
+  OutName.append(Name.begin(), Name.end());
 }
 
 /// AddFastCallStdCallSuffix - Microsoft fastcall and stdcall functions require
@@ -212,7 +117,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
   
   // If we are supposed to add a microsoft-style suffix for stdcall/fastcall,
   // add it.
-  if (Context.getAsmInfo()->hasMicrosoftFastStdCallMangling()) {
+  if (TM->getMCAsmInfo()->hasMicrosoftFastStdCallMangling()) {
     if (const Function *F = dyn_cast<Function>(GV)) {
       CallingConv::ID CC = F->getCallingConv();
     
@@ -236,13 +141,3 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
     }
   }
 }
-
-/// getSymbol - Return the MCSymbol for the specified global value.  This
-/// symbol is the main label that is the address of the global.
-MCSymbol *Mangler::getSymbol(const GlobalValue *GV) {
-  SmallString<60> NameStr;
-  getNameWithPrefix(NameStr, GV, false);
-  return Context.GetOrCreateSymbol(NameStr.str());
-}
-
-
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 3dd6562..cdae6c2 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -9,6 +9,7 @@
 
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "MipsRegisterInfo.h"
+#include "MipsTargetStreamer.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
@@ -24,23 +25,25 @@
 
 using namespace llvm;
 
+namespace llvm {
+class MCInstrInfo;
+}
+
 namespace {
 class MipsAssemblerOptions {
 public:
-  MipsAssemblerOptions():
-    aTReg(1), reorder(true), macro(true) {
-  }
+  MipsAssemblerOptions() : aTReg(1), reorder(true), macro(true) {}
 
-  unsigned getATRegNum() {return aTReg;}
+  unsigned getATRegNum() { return aTReg; }
   bool setATReg(unsigned Reg);
 
-  bool isReorder() {return reorder;}
-  void setReorder() {reorder = true;}
-  void setNoreorder() {reorder = false;}
+  bool isReorder() { return reorder; }
+  void setReorder() { reorder = true; }
+  void setNoreorder() { reorder = false; }
 
-  bool isMacro() {return macro;}
-  void setMacro() {macro = true;}
-  void setNomacro() {macro = false;}
+  bool isMacro() { return macro; }
+  void setMacro() { macro = true; }
+  void setNomacro() { macro = false; }
 
 private:
   unsigned aTReg;
@@ -52,23 +55,21 @@ private:
 namespace {
 class MipsAsmParser : public MCTargetAsmParser {
 
-  enum FpFormatTy {
-    FP_FORMAT_NONE = -1,
-    FP_FORMAT_S,
-    FP_FORMAT_D,
-    FP_FORMAT_L,
-    FP_FORMAT_W
-  } FpFormat;
+  MipsTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = Parser.getStreamer().getTargetStreamer();
+    return static_cast<MipsTargetStreamer &>(TS);
+  }
 
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
   MipsAssemblerOptions Options;
+  bool hasConsumedDollar;
 
 #define GET_ASSEMBLER_HEADER
 #include "MipsGenAsmMatcher.inc"
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               SmallVectorImpl<MCParsedAsmOperand *> &Operands,
                                MCStreamer &Out, unsigned &ErrorInfo,
                                bool MatchingInlineAsm);
 
@@ -76,56 +77,98 @@ class MipsAsmParser : public MCTargetAsmParser {
 
   bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                         SMLoc NameLoc,
-                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+                        SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   bool ParseDirective(AsmToken DirectiveID);
 
   MipsAsmParser::OperandMatchResultTy
-  parseRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                         int RegKind);
+  parseRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands, int RegKind);
 
   MipsAsmParser::OperandMatchResultTy
-  parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseMSARegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands, int RegKind);
 
   MipsAsmParser::OperandMatchResultTy
-  parseGPR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseMSACtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                   int RegKind);
 
   MipsAsmParser::OperandMatchResultTy
-  parseGPR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseMemOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  bool parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                   int RegKind);
 
   MipsAsmParser::OperandMatchResultTy
-  parseHWRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseHW64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseGPR32(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseGPR64(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseAFGR64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseHWRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseAFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseACRegsDSP(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseFGRH32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseACC64DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseLO32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseHI32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseCOP2(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSA128BRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSA128HRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSA128WRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSA128DRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSA128CtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   bool searchSymbolAlias(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                          unsigned RegKind);
 
-  bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &,
+  bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &,
                     StringRef Mnemonic);
 
   int tryParseRegister(bool is64BitReg);
 
-  bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+  bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
                                bool is64BitReg);
 
   bool needsExpansion(MCInst &Inst);
@@ -139,17 +182,19 @@ class MipsAsmParser : public MCTargetAsmParser {
   void expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
                             SmallVectorImpl<MCInst> &Instructions);
   void expandMemInst(MCInst &Inst, SMLoc IDLoc,
-                     SmallVectorImpl<MCInst> &Instructions,
-                     bool isLoad,bool isImmOpnd);
+                     SmallVectorImpl<MCInst> &Instructions, bool isLoad,
+                     bool isImmOpnd);
   bool reportParseError(StringRef ErrorMsg);
 
   bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
   bool parseRelocOperand(const MCExpr *&Res);
 
-  const MCExpr* evaluateRelocExpr(const MCExpr *Expr, StringRef RelocStr);
+  const MCExpr *evaluateRelocExpr(const MCExpr *Expr, StringRef RelocStr);
 
   bool isEvaluated(const MCExpr *Expr);
   bool parseDirectiveSet();
+  bool parseDirectiveMipsHackStocg();
+  bool parseDirectiveMipsHackELFFlags();
 
   bool parseSetAtDirective();
   bool parseSetNoAtDirective();
@@ -161,6 +206,7 @@ class MipsAsmParser : public MCTargetAsmParser {
   bool parseSetAssignment();
 
   bool parseDirectiveWord(unsigned Size, SMLoc L);
+  bool parseDirectiveGpWord();
 
   MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
 
@@ -172,40 +218,49 @@ class MipsAsmParser : public MCTargetAsmParser {
     return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0;
   }
 
+  bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
+
   int matchRegisterName(StringRef Symbol, bool is64BitReg);
 
   int matchCPURegisterName(StringRef Symbol);
 
   int matchRegisterByNumber(unsigned RegNum, unsigned RegClass);
 
-  int matchFPURegisterName(StringRef Name, FpFormatTy Format);
+  int matchFPURegisterName(StringRef Name);
 
-  void setFpFormat(FpFormatTy Format) {
-    FpFormat = Format;
-  }
+  int matchFCCRegisterName(StringRef Name);
 
-  void setDefaultFpFormat();
+  int matchACRegisterName(StringRef Name);
 
-  void setFpFormat(StringRef Format);
+  int matchMSA128RegisterName(StringRef Name);
 
-  FpFormatTy getFpFormat() {return FpFormat;}
+  int matchMSA128CtrlRegisterName(StringRef Name);
+
+  int regKindToRegClass(int RegKind);
 
   unsigned getReg(int RC, int RegNo);
 
   int getATReg();
 
   bool processInstruction(MCInst &Inst, SMLoc IDLoc,
-                        SmallVectorImpl<MCInst> &Instructions);
+                          SmallVectorImpl<MCInst> &Instructions);
+
+  // Helper function that checks if the value of a vector index is within the
+  // boundaries of accepted values for each RegisterKind
+  // Example: INSERT.B $w0[n], $1 => 16 > n >= 0
+  bool validateMSAIndex(int Val, int RegKind);
+
 public:
-  MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
-    : MCTargetAsmParser(), STI(sti), Parser(parser) {
+  MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+                const MCInstrInfo &MII)
+      : MCTargetAsmParser(), STI(sti), Parser(parser),
+        hasConsumedDollar(false) {
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
 
   MCAsmParser &getParser() const { return Parser; }
   MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
 };
 }
 
@@ -221,13 +276,21 @@ public:
     Kind_GPR32,
     Kind_GPR64,
     Kind_HWRegs,
-    Kind_HW64Regs,
     Kind_FGR32Regs,
+    Kind_FGRH32Regs,
     Kind_FGR64Regs,
     Kind_AFGR64Regs,
     Kind_CCRRegs,
     Kind_FCCRegs,
-    Kind_ACRegsDSP
+    Kind_ACC64DSP,
+    Kind_LO32DSP,
+    Kind_HI32DSP,
+    Kind_COP2,
+    Kind_MSA128BRegs,
+    Kind_MSA128HRegs,
+    Kind_MSA128WRegs,
+    Kind_MSA128DRegs,
+    Kind_MSA128CtrlRegs
   };
 
 private:
@@ -238,7 +301,9 @@ private:
     k_Memory,
     k_PostIndexRegister,
     k_Register,
-    k_Token
+    k_PtrReg,
+    k_Token,
+    k_LSAImm
   } Kind;
 
   MipsOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
@@ -277,7 +342,12 @@ public:
     Inst.addOperand(MCOperand::CreateReg(getReg()));
   }
 
-  void addExpr(MCInst &Inst, const MCExpr *Expr) const{
+  void addPtrRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getPtrReg()));
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
     // Add as immediate when possible.  Null MCExpr = 0.
     if (Expr == 0)
       Inst.addOperand(MCOperand::CreateImm(0));
@@ -306,6 +376,9 @@ public:
   bool isImm() const { return Kind == k_Immediate; }
   bool isToken() const { return Kind == k_Token; }
   bool isMem() const { return Kind == k_Memory; }
+  bool isPtrReg() const { return Kind == k_PtrReg; }
+  bool isInvNum() const { return Kind == k_Immediate; }
+  bool isLSAImm() const { return Kind == k_LSAImm; }
 
   StringRef getToken() const {
     assert(Kind == k_Token && "Invalid access!");
@@ -317,13 +390,18 @@ public:
     return Reg.RegNum;
   }
 
+  unsigned getPtrReg() const {
+    assert((Kind == k_PtrReg) && "Invalid access!");
+    return Reg.RegNum;
+  }
+
   void setRegKind(RegisterKind RegKind) {
-    assert((Kind == k_Register) && "Invalid access!");
+    assert((Kind == k_Register || Kind == k_PtrReg) && "Invalid access!");
     Reg.Kind = RegKind;
   }
 
   const MCExpr *getImm() const {
-    assert((Kind == k_Immediate) && "Invalid access!");
+    assert((Kind == k_Immediate || Kind == k_LSAImm) && "Invalid access!");
     return Imm.Val;
   }
 
@@ -354,6 +432,14 @@ public:
     return Op;
   }
 
+  static MipsOperand *CreatePtrReg(unsigned RegNum, SMLoc S, SMLoc E) {
+    MipsOperand *Op = new MipsOperand(k_PtrReg);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
     MipsOperand *Op = new MipsOperand(k_Immediate);
     Op->Imm.Val = Val;
@@ -362,8 +448,16 @@ public:
     return Op;
   }
 
+  static MipsOperand *CreateLSAImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+    MipsOperand *Op = new MipsOperand(k_LSAImm);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static MipsOperand *CreateMem(unsigned Base, const MCExpr *Off,
-                                 SMLoc S, SMLoc E) {
+                                SMLoc S, SMLoc E) {
     MipsOperand *Op = new MipsOperand(k_Memory);
     Op->Mem.Base = Base;
     Op->Mem.Off = Off;
@@ -388,17 +482,12 @@ public:
     return Reg.Kind == Kind_HWRegs;
   }
 
-  bool isHW64RegsAsm() const {
-    assert((Kind == k_Register) && "Invalid access!");
-    return Reg.Kind == Kind_HW64Regs;
-  }
-
   bool isCCRAsm() const {
     assert((Kind == k_Register) && "Invalid access!");
     return Reg.Kind == Kind_CCRRegs;
   }
 
-   bool isAFGR64Asm() const {
+  bool isAFGR64Asm() const {
     return Kind == k_Register && Reg.Kind == Kind_AFGR64Regs;
   }
 
@@ -410,28 +499,58 @@ public:
     return (Kind == k_Register) && Reg.Kind == Kind_FGR32Regs;
   }
 
+  bool isFGRH32Asm() const {
+    return (Kind == k_Register) && Reg.Kind == Kind_FGRH32Regs;
+  }
+
   bool isFCCRegsAsm() const {
     return (Kind == k_Register) && Reg.Kind == Kind_FCCRegs;
   }
 
-  bool isACRegsDSPAsm() const {
-    return Kind == k_Register && Reg.Kind == Kind_ACRegsDSP;
+  bool isACC64DSPAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_ACC64DSP;
   }
 
-  /// getStartLoc - Get the location of the first token of this operand.
-  SMLoc getStartLoc() const {
-    return StartLoc;
+  bool isLO32DSPAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_LO32DSP;
   }
-  /// getEndLoc - Get the location of the last token of this operand.
-  SMLoc getEndLoc() const {
-    return EndLoc;
+
+  bool isHI32DSPAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_HI32DSP;
+  }
+
+  bool isCOP2Asm() const { return Kind == k_Register && Reg.Kind == Kind_COP2; }
+
+  bool isMSA128BAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_MSA128BRegs;
+  }
+
+  bool isMSA128HAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_MSA128HRegs;
+  }
+
+  bool isMSA128WAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_MSA128WRegs;
+  }
+
+  bool isMSA128DAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_MSA128DRegs;
   }
 
+  bool isMSA128CRAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_MSA128CtrlRegs;
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const { return EndLoc; }
+
   virtual void print(raw_ostream &OS) const {
     llvm_unreachable("unimplemented!");
   }
 }; // class MipsOperand
-}  // namespace
+} // namespace
 
 namespace llvm {
 extern const MCInstrDesc MipsInsts[];
@@ -462,8 +581,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
     // reference or immediate we may have to expand instructions.
     for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
       const MCOperandInfo &OpInfo = MCID.OpInfo[i];
-      if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY)
-          || (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
+      if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
+          (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
         MCOperand &Op = Inst.getOperand(i);
         if (Op.isImm()) {
           int MemOffset = Op.getImm();
@@ -476,7 +595,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
           const MCExpr *Expr = Op.getExpr();
           if (Expr->getKind() == MCExpr::SymbolRef) {
             const MCSymbolRefExpr *SR =
-                static_cast<const MCSymbolRefExpr*>(Expr);
+                static_cast<const MCSymbolRefExpr *>(Expr);
             if (SR->getKind() == MCSymbolRefExpr::VK_None) {
               // Expand symbol.
               expandMemInst(Inst, IDLoc, Instructions, MCID.mayLoad(), false);
@@ -489,7 +608,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
         }
       }
     } // for
-  } // if load/store
+  }   // if load/store
 
   if (needsExpansion(Inst))
     expandInstruction(Inst, IDLoc, Instructions);
@@ -512,7 +631,7 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) {
 }
 
 void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
-                                       SmallVectorImpl<MCInst> &Instructions) {
+                                      SmallVectorImpl<MCInst> &Instructions) {
   switch (Inst.getOpcode()) {
   case Mips::LoadImm32Reg:
     return expandLoadImm(Inst, IDLoc, Instructions);
@@ -567,8 +686,9 @@ void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
   }
 }
 
-void MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
-                                       SmallVectorImpl<MCInst> &Instructions) {
+void
+MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+                                    SmallVectorImpl<MCInst> &Instructions) {
   MCInst tmpInst;
   const MCOperand &ImmOp = Inst.getOperand(2);
   assert(ImmOp.isImm() && "expected immediate operand kind");
@@ -609,8 +729,9 @@ void MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
   }
 }
 
-void MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
-                                       SmallVectorImpl<MCInst> &Instructions) {
+void
+MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+                                    SmallVectorImpl<MCInst> &Instructions) {
   MCInst tmpInst;
   const MCOperand &ImmOp = Inst.getOperand(1);
   assert(ImmOp.isImm() && "expected immediate operand kind");
@@ -643,14 +764,15 @@ void MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
 }
 
 void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
-          SmallVectorImpl<MCInst> &Instructions, bool isLoad, bool isImmOpnd) {
+                                  SmallVectorImpl<MCInst> &Instructions,
+                                  bool isLoad, bool isImmOpnd) {
   const MCSymbolRefExpr *SR;
   MCInst TempInst;
   unsigned ImmOffset, HiOffset, LoOffset;
   const MCExpr *ExprOffset;
   unsigned TmpRegNum;
-  unsigned AtRegNum = getReg((isMips64()) ? Mips::GPR64RegClassID
-                             : Mips::GPR32RegClassID, getATReg());
+  unsigned AtRegNum = getReg(
+      (isMips64()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, getATReg());
   // 1st operand is either the source or destination register.
   assert(Inst.getOperand(0).isReg() && "expected register operand kind");
   unsigned RegOpNum = Inst.getOperand(0).getReg();
@@ -680,7 +802,7 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
     TempInst.addOperand(MCOperand::CreateImm(HiOffset));
   else {
     if (ExprOffset->getKind() == MCExpr::SymbolRef) {
-      SR = static_cast<const MCSymbolRefExpr*>(ExprOffset);
+      SR = static_cast<const MCSymbolRefExpr *>(ExprOffset);
       const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
           SR->getSymbol().getName(), MCSymbolRefExpr::VK_Mips_ABS_HI,
           getContext());
@@ -723,15 +845,14 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
   TempInst.clear();
 }
 
-bool MipsAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                        SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                        MCStreamer &Out, unsigned &ErrorInfo,
-                        bool MatchingInlineAsm) {
+bool MipsAsmParser::MatchAndEmitInstruction(
+    SMLoc IDLoc, unsigned &Opcode,
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands, MCStreamer &Out,
+    unsigned &ErrorInfo, bool MatchingInlineAsm) {
   MCInst Inst;
   SmallVector<MCInst, 8> Instructions;
-  unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
-                                              MatchingInlineAsm);
+  unsigned MatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
 
   switch (MatchResult) {
   default:
@@ -752,7 +873,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       if (ErrorInfo >= Operands.size())
         return Error(IDLoc, "too few operands for instruction");
 
-      ErrorLoc = ((MipsOperand*) Operands[ErrorInfo])->getStartLoc();
+      ErrorLoc = ((MipsOperand *)Operands[ErrorInfo])->getStartLoc();
       if (ErrorLoc == SMLoc())
         ErrorLoc = IDLoc;
     }
@@ -766,44 +887,44 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
 }
 
 int MipsAsmParser::matchCPURegisterName(StringRef Name) {
-   int CC;
+  int CC;
 
   if (Name == "at")
     return getATReg();
 
-    CC = StringSwitch<unsigned>(Name)
-    .Case("zero", 0)
-    .Case("a0",   4)
-    .Case("a1",   5)
-    .Case("a2",   6)
-    .Case("a3",   7)
-    .Case("v0",   2)
-    .Case("v1",   3)
-    .Case("s0",  16)
-    .Case("s1",  17)
-    .Case("s2",  18)
-    .Case("s3",  19)
-    .Case("s4",  20)
-    .Case("s5",  21)
-    .Case("s6",  22)
-    .Case("s7",  23)
-    .Case("k0",  26)
-    .Case("k1",  27)
-    .Case("sp",  29)
-    .Case("fp",  30)
-    .Case("gp",  28)
-    .Case("ra",  31)
-    .Case("t0",   8)
-    .Case("t1",   9)
-    .Case("t2",  10)
-    .Case("t3",  11)
-    .Case("t4",  12)
-    .Case("t5",  13)
-    .Case("t6",  14)
-    .Case("t7",  15)
-    .Case("t8",  24)
-    .Case("t9",  25)
-    .Default(-1);
+  CC = StringSwitch<unsigned>(Name)
+           .Case("zero", 0)
+           .Case("a0", 4)
+           .Case("a1", 5)
+           .Case("a2", 6)
+           .Case("a3", 7)
+           .Case("v0", 2)
+           .Case("v1", 3)
+           .Case("s0", 16)
+           .Case("s1", 17)
+           .Case("s2", 18)
+           .Case("s3", 19)
+           .Case("s4", 20)
+           .Case("s5", 21)
+           .Case("s6", 22)
+           .Case("s7", 23)
+           .Case("k0", 26)
+           .Case("k1", 27)
+           .Case("sp", 29)
+           .Case("fp", 30)
+           .Case("gp", 28)
+           .Case("ra", 31)
+           .Case("t0", 8)
+           .Case("t1", 9)
+           .Case("t2", 10)
+           .Case("t3", 11)
+           .Case("t4", 12)
+           .Case("t5", 13)
+           .Case("t6", 14)
+           .Case("t7", 15)
+           .Case("t8", 24)
+           .Case("t9", 25)
+           .Default(-1);
 
   // Although SGI documentation just cuts out t0-t3 for n32/n64,
   // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
@@ -813,72 +934,140 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) {
 
   if (CC == -1 && isMips64())
     CC = StringSwitch<unsigned>(Name)
-      .Case("a4",   8)
-      .Case("a5",   9)
-      .Case("a6",  10)
-      .Case("a7",  11)
-      .Case("kt0", 26)
-      .Case("kt1", 27)
-      .Case("s8",  30)
-      .Default(-1);
+             .Case("a4", 8)
+             .Case("a5", 9)
+             .Case("a6", 10)
+             .Case("a7", 11)
+             .Case("kt0", 26)
+             .Case("kt1", 27)
+             .Case("s8", 30)
+             .Default(-1);
 
   return CC;
 }
 
-int MipsAsmParser::matchFPURegisterName(StringRef Name, FpFormatTy Format) {
+int MipsAsmParser::matchFPURegisterName(StringRef Name) {
 
   if (Name[0] == 'f') {
     StringRef NumString = Name.substr(1);
     unsigned IntVal;
     if (NumString.getAsInteger(10, IntVal))
-      return -1; // This is not an integer.
-    if (IntVal > 31)
+      return -1;     // This is not an integer.
+    if (IntVal > 31) // Maximum index for fpu register.
       return -1;
+    return IntVal;
+  }
+  return -1;
+}
 
-    if (Format == FP_FORMAT_S || Format == FP_FORMAT_W)
-      return getReg(Mips::FGR32RegClassID, IntVal);
-    if (Format == FP_FORMAT_D) {
-      if (isFP64()) {
-        return getReg(Mips::FGR64RegClassID, IntVal);
-      }
-      // Only even numbers available as register pairs.
-      if ((IntVal > 31) || (IntVal % 2 != 0))
-        return -1;
-      return getReg(Mips::AFGR64RegClassID, IntVal / 2);
-    }
+int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
+
+  if (Name.startswith("fcc")) {
+    StringRef NumString = Name.substr(3);
+    unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1;    // This is not an integer.
+    if (IntVal > 7) // There are only 8 fcc registers.
+      return -1;
+    return IntVal;
   }
   return -1;
 }
 
-int MipsAsmParser::matchRegisterName(StringRef Name, bool is64BitReg) {
+int MipsAsmParser::matchACRegisterName(StringRef Name) {
+
+  if (Name.startswith("ac")) {
+    StringRef NumString = Name.substr(2);
+    unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1;    // This is not an integer.
+    if (IntVal > 3) // There are only 3 acc registers.
+      return -1;
+    return IntVal;
+  }
+  return -1;
+}
 
-  if (Name.equals("fcc0"))
-    return Mips::FCC0;
+int MipsAsmParser::matchMSA128RegisterName(StringRef Name) {
+  unsigned IntVal;
+
+  if (Name.front() != 'w' || Name.drop_front(1).getAsInteger(10, IntVal))
+    return -1;
+
+  if (IntVal > 31)
+    return -1;
+
+  return IntVal;
+}
+
+int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
+  int CC;
+
+  CC = StringSwitch<unsigned>(Name)
+           .Case("msair", 0)
+           .Case("msacsr", 1)
+           .Case("msaaccess", 2)
+           .Case("msasave", 3)
+           .Case("msamodify", 4)
+           .Case("msarequest", 5)
+           .Case("msamap", 6)
+           .Case("msaunmap", 7)
+           .Default(-1);
+
+  return CC;
+}
+
+int MipsAsmParser::matchRegisterName(StringRef Name, bool is64BitReg) {
 
   int CC;
   CC = matchCPURegisterName(Name);
   if (CC != -1)
     return matchRegisterByNumber(CC, is64BitReg ? Mips::GPR64RegClassID
                                                 : Mips::GPR32RegClassID);
-  return matchFPURegisterName(Name, getFpFormat());
-}
-
-void MipsAsmParser::setDefaultFpFormat() {
-
-  if (isMips64() || isFP64())
-    FpFormat = FP_FORMAT_D;
-  else
-    FpFormat = FP_FORMAT_S;
+  CC = matchFPURegisterName(Name);
+  // TODO: decide about fpu register class
+  if (CC != -1)
+    return matchRegisterByNumber(CC, isFP64() ? Mips::FGR64RegClassID
+                                              : Mips::FGR32RegClassID);
+  return matchMSA128RegisterName(Name);
 }
 
-void MipsAsmParser::setFpFormat(StringRef Format) {
-
-  FpFormat = StringSwitch<FpFormatTy>(Format.lower())
-    .Case(".s",  FP_FORMAT_S)
-    .Case(".d",  FP_FORMAT_D)
-    .Case(".l",  FP_FORMAT_L)
-    .Case(".w",  FP_FORMAT_W)
-    .Default(FP_FORMAT_NONE);
+int MipsAsmParser::regKindToRegClass(int RegKind) {
+
+  switch (RegKind) {
+  case MipsOperand::Kind_GPR32:
+    return Mips::GPR32RegClassID;
+  case MipsOperand::Kind_GPR64:
+    return Mips::GPR64RegClassID;
+  case MipsOperand::Kind_HWRegs:
+    return Mips::HWRegsRegClassID;
+  case MipsOperand::Kind_FGR32Regs:
+    return Mips::FGR32RegClassID;
+  case MipsOperand::Kind_FGRH32Regs:
+    return Mips::FGRH32RegClassID;
+  case MipsOperand::Kind_FGR64Regs:
+    return Mips::FGR64RegClassID;
+  case MipsOperand::Kind_AFGR64Regs:
+    return Mips::AFGR64RegClassID;
+  case MipsOperand::Kind_CCRRegs:
+    return Mips::CCRRegClassID;
+  case MipsOperand::Kind_ACC64DSP:
+    return Mips::ACC64DSPRegClassID;
+  case MipsOperand::Kind_FCCRegs:
+    return Mips::FCCRegClassID;
+  case MipsOperand::Kind_MSA128BRegs:
+    return Mips::MSA128BRegClassID;
+  case MipsOperand::Kind_MSA128HRegs:
+    return Mips::MSA128HRegClassID;
+  case MipsOperand::Kind_MSA128WRegs:
+    return Mips::MSA128WRegClassID;
+  case MipsOperand::Kind_MSA128DRegs:
+    return Mips::MSA128DRegClassID;
+  case MipsOperand::Kind_MSA128CtrlRegs:
+    return Mips::MSACtrlRegClassID;
+  default:
+    return -1;
+  }
 }
 
 bool MipsAssemblerOptions::setATReg(unsigned Reg) {
@@ -889,17 +1078,15 @@ bool MipsAssemblerOptions::setATReg(unsigned Reg) {
   return true;
 }
 
-int MipsAsmParser::getATReg() {
-  return Options.getATRegNum();
-}
+int MipsAsmParser::getATReg() { return Options.getATRegNum(); }
 
 unsigned MipsAsmParser::getReg(int RC, int RegNo) {
   return *(getContext().getRegisterInfo()->getRegClass(RC).begin() + RegNo);
 }
 
 int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) {
-
-  if (RegNum > 31)
+  if (RegNum >
+      getContext().getRegisterInfo()->getRegClass(RegClass).getNumRegs())
     return -1;
 
   return getReg(RegClass, RegNum);
@@ -914,12 +1101,13 @@ int MipsAsmParser::tryParseRegister(bool is64BitReg) {
     RegNum = matchRegisterName(lowerCase, is64BitReg);
   } else if (Tok.is(AsmToken::Integer))
     RegNum = matchRegisterByNumber(static_cast<unsigned>(Tok.getIntVal()),
-        is64BitReg ? Mips::GPR64RegClassID : Mips::GPR32RegClassID);
+                                   is64BitReg ? Mips::GPR64RegClassID
+                                              : Mips::GPR32RegClassID);
   return RegNum;
 }
 
 bool MipsAsmParser::tryParseRegisterOperand(
-             SmallVectorImpl<MCParsedAsmOperand*> &Operands, bool is64BitReg) {
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands, bool is64BitReg) {
 
   SMLoc S = Parser.getTok().getLoc();
   int RegNo = -1;
@@ -928,14 +1116,15 @@ bool MipsAsmParser::tryParseRegisterOperand(
   if (RegNo == -1)
     return true;
 
-  Operands.push_back(MipsOperand::CreateReg(RegNo, S,
-                                            Parser.getTok().getLoc()));
+  Operands.push_back(
+      MipsOperand::CreateReg(RegNo, S, Parser.getTok().getLoc()));
   Parser.Lex(); // Eat register token.
   return false;
 }
 
-bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
-                                 StringRef Mnemonic) {
+bool
+MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                            StringRef Mnemonic) {
   // Check if the current operand has a custom associated parser, if so, try to
   // custom parse the operand, or fallback to the general approach.
   OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -983,22 +1172,39 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
       return true;
 
     SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-
     MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
-
     // Otherwise create a symbol reference.
-    const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
-                                                getContext());
+    const MCExpr *Res =
+        MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
 
     Operands.push_back(MipsOperand::CreateImm(Res, S, E));
     return false;
   }
   case AsmToken::Identifier:
+    // For instruction aliases like "bc1f $Label" dedicated parser will
+    // eat the '$' sign before failing. So in order to look for appropriate
+    // label we must check first if we have already consumed '$'.
+    if (hasConsumedDollar) {
+      hasConsumedDollar = false;
+      SMLoc S = Parser.getTok().getLoc();
+      StringRef Identifier;
+      if (Parser.parseIdentifier(Identifier))
+        return true;
+      SMLoc E =
+          SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
+      // Create a symbol reference.
+      const MCExpr *Res =
+          MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
+
+      Operands.push_back(MipsOperand::CreateImm(Res, S, E));
+      return false;
+    }
     // Look for the existing symbol, we should check if
     // we need to assigne the propper RegisterKind.
     if (searchSymbolAlias(Operands, MipsOperand::Kind_None))
       return false;
-    // Else drop to expression parsing.
+  // Else drop to expression parsing.
   case AsmToken::LParen:
   case AsmToken::Minus:
   case AsmToken::Plus:
@@ -1029,7 +1235,7 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
   return true;
 }
 
-const MCExpr* MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
+const MCExpr *MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
                                                StringRef RelocStr) {
   const MCExpr *Res;
   // Check the type of the expression.
@@ -1099,7 +1305,7 @@ bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
 }
 
 bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
-  Parser.Lex(); // Eat the % token.
+  Parser.Lex();                          // Eat the % token.
   const AsmToken &Tok = Parser.getTok(); // Get next token, operation.
   if (Tok.isNot(AsmToken::Identifier))
     return true;
@@ -1145,7 +1351,7 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
   StartLoc = Parser.getTok().getLoc();
   RegNo = tryParseRegister(isMips64());
   EndLoc = Parser.getTok().getLoc();
-  return (RegNo == (unsigned) -1);
+  return (RegNo == (unsigned)-1);
 }
 
 bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
@@ -1177,11 +1383,12 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
 }
 
 MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
-                               SmallVectorImpl<MCParsedAsmOperand*>&Operands) {
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
 
   const MCExpr *IdVal = 0;
   SMLoc S;
   bool isParenExpr = false;
+  MipsAsmParser::OperandMatchResultTy Res = MatchOperand_NoMatch;
   // First operand is the offset.
   S = Parser.getTok().getLoc();
 
@@ -1196,21 +1403,20 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
 
     const AsmToken &Tok = Parser.getTok(); // Get the next token.
     if (Tok.isNot(AsmToken::LParen)) {
-      MipsOperand *Mnemonic = static_cast<MipsOperand*>(Operands[0]);
+      MipsOperand *Mnemonic = static_cast<MipsOperand *>(Operands[0]);
       if (Mnemonic->getToken() == "la") {
-        SMLoc E = SMLoc::getFromPointer(
-            Parser.getTok().getLoc().getPointer() - 1);
+        SMLoc E =
+            SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
         Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
         return MatchOperand_Success;
       }
       if (Tok.is(AsmToken::EndOfStatement)) {
-        SMLoc E = SMLoc::getFromPointer(
-            Parser.getTok().getLoc().getPointer() - 1);
+        SMLoc E =
+            SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
 
         // Zero register assumed, add a memory operand with ZERO as its base.
-        Operands.push_back(MipsOperand::CreateMem(isMips64() ? Mips::ZERO_64
-                                                             : Mips::ZERO,
-                           IdVal, S, E));
+        Operands.push_back(MipsOperand::CreateMem(
+            isMips64() ? Mips::ZERO_64 : Mips::ZERO, IdVal, S, E));
         return MatchOperand_Success;
       }
       Error(Parser.getTok().getLoc(), "'(' expected");
@@ -1220,21 +1426,12 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
     Parser.Lex(); // Eat the '(' token.
   }
 
-  const AsmToken &Tok1 = Parser.getTok(); // Get next token
-  if (Tok1.is(AsmToken::Dollar)) {
-    Parser.Lex(); // Eat the '$' token.
-    if (tryParseRegisterOperand(Operands, isMips64())) {
-      Error(Parser.getTok().getLoc(), "unexpected token in operand");
-      return MatchOperand_ParseFail;
-    }
-
-  } else {
-    Error(Parser.getTok().getLoc(), "unexpected token in operand");
-    return MatchOperand_ParseFail;
-  }
+  Res = parseRegs(Operands, isMips64() ? (int)MipsOperand::Kind_GPR64
+                                       : (int)MipsOperand::Kind_GPR32);
+  if (Res != MatchOperand_Success)
+    return Res;
 
-  const AsmToken &Tok2 = Parser.getTok(); // Get next token.
-  if (Tok2.isNot(AsmToken::RParen)) {
+  if (Parser.getTok().isNot(AsmToken::RParen)) {
     Error(Parser.getTok().getLoc(), "')' expected");
     return MatchOperand_ParseFail;
   }
@@ -1247,7 +1444,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
     IdVal = MCConstantExpr::Create(0, getContext());
 
   // Replace the register operand with the memory operand.
-  MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
+  MipsOperand *op = static_cast<MipsOperand *>(Operands.back());
   int RegNo = op->getReg();
   // Remove the register from the operands.
   Operands.pop_back();
@@ -1266,31 +1463,150 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
   return MatchOperand_Success;
 }
 
+bool MipsAsmParser::parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                                int RegKind) {
+  // If the first token is not '$' we have an error.
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return false;
+
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex();
+  AsmToken::TokenKind TkKind = getLexer().getKind();
+  int Reg;
+
+  if (TkKind == AsmToken::Integer) {
+    Reg = matchRegisterByNumber(Parser.getTok().getIntVal(),
+                                regKindToRegClass(RegKind));
+    if (Reg == -1)
+      return false;
+  } else if (TkKind == AsmToken::Identifier) {
+    if ((Reg = matchCPURegisterName(Parser.getTok().getString().lower())) == -1)
+      return false;
+    Reg = getReg(regKindToRegClass(RegKind), Reg);
+  } else {
+    return false;
+  }
+
+  MipsOperand *Op = MipsOperand::CreatePtrReg(Reg, S, Parser.getTok().getLoc());
+  Op->setRegKind((MipsOperand::RegisterKind)RegKind);
+  Operands.push_back(Op);
+  Parser.Lex();
+  return true;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  MipsOperand::RegisterKind RegKind =
+      isN64() ? MipsOperand::Kind_GPR64 : MipsOperand::Kind_GPR32;
+
+  // Parse index register.
+  if (!parsePtrReg(Operands, RegKind))
+    return MatchOperand_NoMatch;
+
+  // Parse '('.
+  if (Parser.getTok().isNot(AsmToken::LParen))
+    return MatchOperand_NoMatch;
+
+  Operands.push_back(MipsOperand::CreateToken("(", getLexer().getLoc()));
+  Parser.Lex();
+
+  // Parse base register.
+  if (!parsePtrReg(Operands, RegKind))
+    return MatchOperand_NoMatch;
+
+  // Parse ')'.
+  if (Parser.getTok().isNot(AsmToken::RParen))
+    return MatchOperand_NoMatch;
+
+  Operands.push_back(MipsOperand::CreateToken(")", getLexer().getLoc()));
+  Parser.Lex();
+
+  return MatchOperand_Success;
+}
+
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+MipsAsmParser::parseRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
                          int RegKind) {
   MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
-  if (getLexer().getKind() == AsmToken::Identifier) {
+  if (getLexer().getKind() == AsmToken::Identifier && !hasConsumedDollar) {
     if (searchSymbolAlias(Operands, Kind))
       return MatchOperand_Success;
     return MatchOperand_NoMatch;
   }
+  SMLoc S = Parser.getTok().getLoc();
   // If the first token is not '$', we have an error.
-  if (Parser.getTok().isNot(AsmToken::Dollar))
+  if (Parser.getTok().isNot(AsmToken::Dollar) && !hasConsumedDollar)
     return MatchOperand_NoMatch;
-
-  Parser.Lex(); // Eat $
-  if (!tryParseRegisterOperand(Operands,
-                               RegKind == MipsOperand::Kind_GPR64)) {
-    // Set the proper register kind.
-    MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
-    op->setRegKind(Kind);
-    if ((Kind == MipsOperand::Kind_GPR32)
-      && (getLexer().is(AsmToken::LParen))) {
+  if (!hasConsumedDollar) {
+    Parser.Lex(); // Eat the '$'
+    hasConsumedDollar = true;
+  }
+  if (getLexer().getKind() == AsmToken::Identifier) {
+    int RegNum = -1;
+    std::string RegName = Parser.getTok().getString().lower();
+    // Match register by name
+    switch (RegKind) {
+    case MipsOperand::Kind_GPR32:
+    case MipsOperand::Kind_GPR64:
+      RegNum = matchCPURegisterName(RegName);
+      break;
+    case MipsOperand::Kind_AFGR64Regs:
+    case MipsOperand::Kind_FGR64Regs:
+    case MipsOperand::Kind_FGR32Regs:
+    case MipsOperand::Kind_FGRH32Regs:
+      RegNum = matchFPURegisterName(RegName);
+      if (RegKind == MipsOperand::Kind_AFGR64Regs)
+        RegNum /= 2;
+      else if (RegKind == MipsOperand::Kind_FGRH32Regs && !isFP64())
+        if (RegNum != -1 && RegNum % 2 != 0)
+          Warning(S, "Float register should be even.");
+      break;
+    case MipsOperand::Kind_FCCRegs:
+      RegNum = matchFCCRegisterName(RegName);
+      break;
+    case MipsOperand::Kind_ACC64DSP:
+      RegNum = matchACRegisterName(RegName);
+      break;
+    default:
+      break; // No match, value is set to -1.
+    }
+    // No match found, return _NoMatch to give a chance to other round.
+    if (RegNum < 0)
+      return MatchOperand_NoMatch;
+
+    int RegVal = getReg(regKindToRegClass(Kind), RegNum);
+    if (RegVal == -1)
+      return MatchOperand_NoMatch;
+
+    MipsOperand *Op =
+        MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
+    Op->setRegKind(Kind);
+    Operands.push_back(Op);
+    hasConsumedDollar = false;
+    Parser.Lex(); // Eat the register name.
+    return MatchOperand_Success;
+  } else if (getLexer().getKind() == AsmToken::Integer) {
+    unsigned RegNum = Parser.getTok().getIntVal();
+    if (Kind == MipsOperand::Kind_HWRegs) {
+      if (RegNum != 29)
+        return MatchOperand_NoMatch;
+      // Only hwreg 29 is supported, found at index 0.
+      RegNum = 0;
+    }
+    int Reg = matchRegisterByNumber(RegNum, regKindToRegClass(Kind));
+    if (Reg == -1)
+      return MatchOperand_NoMatch;
+    MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
+    Op->setRegKind(Kind);
+    Operands.push_back(Op);
+    hasConsumedDollar = false;
+    Parser.Lex(); // Eat the register number.
+    if ((RegKind == MipsOperand::Kind_GPR32) &&
+        (getLexer().is(AsmToken::LParen))) {
       // Check if it is indexed addressing operand.
       Operands.push_back(MipsOperand::CreateToken("(", getLexer().getLoc()));
       Parser.Lex(); // Eat the parenthesis.
-      if (parseRegs(Operands,RegKind) != MatchOperand_Success)
+      if (parseRegs(Operands, RegKind) != MatchOperand_Success)
         return MatchOperand_NoMatch;
       if (getLexer().isNot(AsmToken::RParen))
         return MatchOperand_NoMatch;
@@ -1302,49 +1618,254 @@ MipsAsmParser::parseRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
   return MatchOperand_NoMatch;
 }
 
+bool MipsAsmParser::validateMSAIndex(int Val, int RegKind) {
+  MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+
+  if (Val < 0)
+    return false;
+
+  switch (Kind) {
+  default:
+    return false;
+  case MipsOperand::Kind_MSA128BRegs:
+    return Val < 16;
+  case MipsOperand::Kind_MSA128HRegs:
+    return Val < 8;
+  case MipsOperand::Kind_MSA128WRegs:
+    return Val < 4;
+  case MipsOperand::Kind_MSA128DRegs:
+    return Val < 2;
+  }
+}
+
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseGPR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseMSARegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                            int RegKind) {
+  MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+  SMLoc S = Parser.getTok().getLoc();
+  std::string RegName;
 
-  if (!isMips64())
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+
+  switch (RegKind) {
+  default:
+    return MatchOperand_ParseFail;
+  case MipsOperand::Kind_MSA128BRegs:
+  case MipsOperand::Kind_MSA128HRegs:
+  case MipsOperand::Kind_MSA128WRegs:
+  case MipsOperand::Kind_MSA128DRegs:
+    break;
+  }
+
+  Parser.Lex(); // Eat the '$'.
+  if (getLexer().getKind() == AsmToken::Identifier)
+    RegName = Parser.getTok().getString().lower();
+  else
+    return MatchOperand_ParseFail;
+
+  int RegNum = matchMSA128RegisterName(RegName);
+
+  if (RegNum < 0 || RegNum > 31)
+    return MatchOperand_ParseFail;
+
+  int RegVal = getReg(regKindToRegClass(Kind), RegNum);
+  if (RegVal == -1)
+    return MatchOperand_ParseFail;
+
+  MipsOperand *Op = MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
+  Op->setRegKind(Kind);
+  Operands.push_back(Op);
+
+  Parser.Lex(); // Eat the register identifier.
+
+  // MSA registers may be suffixed with an index in the form of:
+  // 1) Immediate expression.
+  // 2) General Purpose Register.
+  // Examples:
+  //   1) copy_s.b $29,$w0[0]
+  //   2) sld.b $w0,$w1[$1]
+
+  if (Parser.getTok().isNot(AsmToken::LBrac))
+    return MatchOperand_Success;
+
+  MipsOperand *Mnemonic = static_cast<MipsOperand *>(Operands[0]);
+
+  Operands.push_back(MipsOperand::CreateToken("[", Parser.getTok().getLoc()));
+  Parser.Lex(); // Parse the '[' token.
+
+  if (Parser.getTok().is(AsmToken::Dollar)) {
+    // This must be a GPR.
+    MipsOperand *RegOp;
+    SMLoc VIdx = Parser.getTok().getLoc();
+    Parser.Lex(); // Parse the '$' token.
+
+    // GPR have aliases and we must account for that. Example: $30 == $fp
+    if (getLexer().getKind() == AsmToken::Integer) {
+      unsigned RegNum = Parser.getTok().getIntVal();
+      int Reg = matchRegisterByNumber(
+          RegNum, regKindToRegClass(MipsOperand::Kind_GPR32));
+      if (Reg == -1) {
+        Error(VIdx, "invalid general purpose register");
+        return MatchOperand_ParseFail;
+      }
+
+      RegOp = MipsOperand::CreateReg(Reg, VIdx, Parser.getTok().getLoc());
+    } else if (getLexer().getKind() == AsmToken::Identifier) {
+      int RegNum = -1;
+      std::string RegName = Parser.getTok().getString().lower();
+
+      RegNum = matchCPURegisterName(RegName);
+      if (RegNum == -1) {
+        Error(VIdx, "general purpose register expected");
+        return MatchOperand_ParseFail;
+      }
+      RegNum = getReg(regKindToRegClass(MipsOperand::Kind_GPR32), RegNum);
+      RegOp = MipsOperand::CreateReg(RegNum, VIdx, Parser.getTok().getLoc());
+    } else
+      return MatchOperand_ParseFail;
+
+    RegOp->setRegKind(MipsOperand::Kind_GPR32);
+    Operands.push_back(RegOp);
+    Parser.Lex(); // Eat the register identifier.
+
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return MatchOperand_ParseFail;
+
+    Operands.push_back(MipsOperand::CreateToken("]", Parser.getTok().getLoc()));
+    Parser.Lex(); // Parse the ']' token.
+
+    return MatchOperand_Success;
+  }
+
+  // The index must be a constant expression then.
+  SMLoc VIdx = Parser.getTok().getLoc();
+  const MCExpr *ImmVal;
+
+  if (getParser().parseExpression(ImmVal))
+    return MatchOperand_ParseFail;
+
+  const MCConstantExpr *expr = dyn_cast<MCConstantExpr>(ImmVal);
+  if (!expr || !validateMSAIndex((int)expr->getValue(), Kind)) {
+    Error(VIdx, "invalid immediate value");
+    return MatchOperand_ParseFail;
+  }
+
+  SMLoc E = Parser.getTok().getEndLoc();
+
+  if (Parser.getTok().isNot(AsmToken::RBrac))
+    return MatchOperand_ParseFail;
+
+  bool insve =
+      Mnemonic->getToken() == "insve.b" || Mnemonic->getToken() == "insve.h" ||
+      Mnemonic->getToken() == "insve.w" || Mnemonic->getToken() == "insve.d";
+
+  // The second vector index of insve instructions is always 0.
+  if (insve && Operands.size() > 6) {
+    if (expr->getValue() != 0) {
+      Error(VIdx, "immediate value must be 0");
+      return MatchOperand_ParseFail;
+    }
+    Operands.push_back(MipsOperand::CreateToken("0", VIdx));
+  } else
+    Operands.push_back(MipsOperand::CreateImm(expr, VIdx, E));
+
+  Operands.push_back(MipsOperand::CreateToken("]", Parser.getTok().getLoc()));
+
+  Parser.Lex(); // Parse the ']' token.
+
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseMSACtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                                int RegKind) {
+  MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+
+  if (Kind != MipsOperand::Kind_MSA128CtrlRegs)
     return MatchOperand_NoMatch;
-  return parseRegs(Operands, (int) MipsOperand::Kind_GPR64);
+
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_ParseFail;
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  Parser.Lex(); // Eat the '$' symbol.
+
+  int RegNum = -1;
+  if (getLexer().getKind() == AsmToken::Identifier)
+    RegNum = matchMSA128CtrlRegisterName(Parser.getTok().getString().lower());
+  else if (getLexer().getKind() == AsmToken::Integer)
+    RegNum = Parser.getTok().getIntVal();
+  else
+    return MatchOperand_ParseFail;
+
+  if (RegNum < 0 || RegNum > 7)
+    return MatchOperand_ParseFail;
+
+  int RegVal = getReg(regKindToRegClass(Kind), RegNum);
+  if (RegVal == -1)
+    return MatchOperand_ParseFail;
+
+  MipsOperand *RegOp =
+      MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
+  RegOp->setRegKind(MipsOperand::Kind_MSA128CtrlRegs);
+  Operands.push_back(RegOp);
+  Parser.Lex(); // Eat the register identifier.
+
+  return MatchOperand_Success;
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseGPR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- return parseRegs(Operands, (int) MipsOperand::Kind_GPR32);
+MipsAsmParser::parseGPR64(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+
+  if (!isMips64())
+    return MatchOperand_NoMatch;
+  return parseRegs(Operands, (int)MipsOperand::Kind_GPR64);
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseAFGR64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseGPR32(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_GPR32);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseAFGR64Regs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
 
   if (isFP64())
     return MatchOperand_NoMatch;
-  // Double operand is expected, set appropriate format
-  setFpFormat(FP_FORMAT_D);
-
-  return parseRegs(Operands, (int) MipsOperand::Kind_AFGR64Regs);
+  return parseRegs(Operands, (int)MipsOperand::Kind_AFGR64Regs);
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
   if (!isFP64())
     return MatchOperand_NoMatch;
-  // Double operand is expected, set appropriate format
-  setFpFormat(FP_FORMAT_D);
+  return parseRegs(Operands, (int)MipsOperand::Kind_FGR64Regs);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_FGR32Regs);
+}
 
- return parseRegs(Operands, (int) MipsOperand::Kind_FGR64Regs);
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseFGRH32Regs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_FGRH32Regs);
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- // Single operand is expected, set appropriate format
-  setFpFormat(FP_FORMAT_S);
-  return parseRegs(Operands, (int) MipsOperand::Kind_FGR32Regs);
+MipsAsmParser::parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_FCCRegs);
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseACC64DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_ACC64DSP);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseLO32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
   // If the first token is not '$' we have an error.
   if (Parser.getTok().isNot(AsmToken::Dollar))
     return MatchOperand_NoMatch;
@@ -1357,19 +1878,19 @@ MipsAsmParser::parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   if (Tok.isNot(AsmToken::Identifier))
     return MatchOperand_NoMatch;
 
-  if (!Tok.getIdentifier().startswith("fcc"))
+  if (!Tok.getIdentifier().startswith("ac"))
     return MatchOperand_NoMatch;
 
-  StringRef NumString = Tok.getIdentifier().substr(3);
+  StringRef NumString = Tok.getIdentifier().substr(2);
 
   unsigned IntVal;
   if (NumString.getAsInteger(10, IntVal))
     return MatchOperand_NoMatch;
 
-  unsigned Reg = matchRegisterByNumber(IntVal, Mips::FCCRegClassID);
+  unsigned Reg = matchRegisterByNumber(IntVal, Mips::LO32DSPRegClassID);
 
   MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
-  Op->setRegKind(MipsOperand::Kind_FCCRegs);
+  Op->setRegKind(MipsOperand::Kind_LO32DSP);
   Operands.push_back(Op);
 
   Parser.Lex(); // Eat the register number.
@@ -1377,7 +1898,7 @@ MipsAsmParser::parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseACRegsDSP(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseHI32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
   // If the first token is not '$' we have an error.
   if (Parser.getTok().isNot(AsmToken::Dollar))
     return MatchOperand_NoMatch;
@@ -1390,27 +1911,78 @@ MipsAsmParser::parseACRegsDSP(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   if (Tok.isNot(AsmToken::Identifier))
     return MatchOperand_NoMatch;
 
-  if (!Tok.getIdentifier().startswith("acc"))
+  if (!Tok.getIdentifier().startswith("ac"))
     return MatchOperand_NoMatch;
 
-  StringRef NumString = Tok.getIdentifier().substr(3);
+  StringRef NumString = Tok.getIdentifier().substr(2);
 
   unsigned IntVal;
   if (NumString.getAsInteger(10, IntVal))
     return MatchOperand_NoMatch;
 
-  unsigned Reg = matchRegisterByNumber(IntVal, Mips::ACRegsDSPRegClassID);
+  unsigned Reg = matchRegisterByNumber(IntVal, Mips::HI32DSPRegClassID);
 
   MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
-  Op->setRegKind(MipsOperand::Kind_ACRegsDSP);
+  Op->setRegKind(MipsOperand::Kind_HI32DSP);
   Operands.push_back(Op);
 
   Parser.Lex(); // Eat the register number.
   return MatchOperand_Success;
 }
 
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseCOP2(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  // If the first token is not '$' we have an error.
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat the '$'
+
+  const AsmToken &Tok = Parser.getTok(); // Get next token.
+
+  if (Tok.isNot(AsmToken::Integer))
+    return MatchOperand_NoMatch;
+
+  unsigned IntVal = Tok.getIntVal();
+
+  unsigned Reg = matchRegisterByNumber(IntVal, Mips::COP2RegClassID);
+
+  MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
+  Op->setRegKind(MipsOperand::Kind_COP2);
+  Operands.push_back(Op);
+
+  Parser.Lex(); // Eat the register number.
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128BRegs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128BRegs);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128HRegs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128HRegs);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128WRegs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128WRegs);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128DRegs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128DRegs);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128CtrlRegs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseMSACtrlRegs(Operands, (int)MipsOperand::Kind_MSA128CtrlRegs);
+}
+
 bool MipsAsmParser::searchSymbolAlias(
-    SmallVectorImpl<MCParsedAsmOperand*> &Operands, unsigned RegKind) {
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands, unsigned RegKind) {
 
   MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier());
   if (Sym) {
@@ -1421,38 +1993,39 @@ bool MipsAsmParser::searchSymbolAlias(
     else
       return false;
     if (Expr->getKind() == MCExpr::SymbolRef) {
-      MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind) RegKind;
-      const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+      MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+      const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
       const StringRef DefSymbol = Ref->getSymbol().getName();
       if (DefSymbol.startswith("$")) {
         int RegNum = -1;
         APInt IntVal(32, -1);
         if (!DefSymbol.substr(1).getAsInteger(10, IntVal))
           RegNum = matchRegisterByNumber(IntVal.getZExtValue(),
-                                     isMips64()
-                                       ? Mips::GPR64RegClassID
-                                       : Mips::GPR32RegClassID);
+                                         isMips64() ? Mips::GPR64RegClassID
+                                                    : Mips::GPR32RegClassID);
         else {
           // Lookup for the register with the corresponding name.
           switch (Kind) {
           case MipsOperand::Kind_AFGR64Regs:
           case MipsOperand::Kind_FGR64Regs:
-            RegNum = matchFPURegisterName(DefSymbol.substr(1), FP_FORMAT_D);
+            RegNum = matchFPURegisterName(DefSymbol.substr(1));
             break;
           case MipsOperand::Kind_FGR32Regs:
-            RegNum = matchFPURegisterName(DefSymbol.substr(1), FP_FORMAT_S);
+            RegNum = matchFPURegisterName(DefSymbol.substr(1));
             break;
           case MipsOperand::Kind_GPR64:
           case MipsOperand::Kind_GPR32:
           default:
-            RegNum = matchRegisterName(DefSymbol.substr(1), isMips64());
+            RegNum = matchCPURegisterName(DefSymbol.substr(1));
             break;
           }
+          if (RegNum > -1)
+            RegNum = getReg(regKindToRegClass(Kind), RegNum);
         }
         if (RegNum > -1) {
           Parser.Lex();
-          MipsOperand *op = MipsOperand::CreateReg(RegNum, S,
-                                                   Parser.getTok().getLoc());
+          MipsOperand *op =
+              MipsOperand::CreateReg(RegNum, S, Parser.getTok().getLoc());
           op->setRegKind(Kind);
           Operands.push_back(op);
           return true;
@@ -1460,9 +2033,9 @@ bool MipsAsmParser::searchSymbolAlias(
       }
     } else if (Expr->getKind() == MCExpr::Constant) {
       Parser.Lex();
-      const MCConstantExpr *Const = static_cast<const MCConstantExpr*>(Expr);
-      MipsOperand *op = MipsOperand::CreateImm(Const, S,
-          Parser.getTok().getLoc());
+      const MCConstantExpr *Const = static_cast<const MCConstantExpr *>(Expr);
+      MipsOperand *op =
+          MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc());
       Operands.push_back(op);
       return true;
     }
@@ -1471,115 +2044,101 @@ bool MipsAsmParser::searchSymbolAlias(
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseHWRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-
-  // If the first token is not '$' we have error.
-  if (Parser.getTok().isNot(AsmToken::Dollar))
-    return MatchOperand_NoMatch;
-  SMLoc S = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat the '$'.
-
-  const AsmToken &Tok = Parser.getTok(); // Get the next token.
-  if (Tok.isNot(AsmToken::Integer))
-    return MatchOperand_NoMatch;
-
-  unsigned RegNum = Tok.getIntVal();
-  // At the moment only hwreg29 is supported.
-  if (RegNum != 29)
-    return MatchOperand_ParseFail;
-
-  MipsOperand *op = MipsOperand::CreateReg(Mips::HWR29, S,
-      Parser.getTok().getLoc());
-  op->setRegKind(MipsOperand::Kind_HWRegs);
-  Operands.push_back(op);
-
-  Parser.Lex(); // Eat the register number.
-  return MatchOperand_Success;
+MipsAsmParser::parseHWRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_HWRegs);
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseHW64Regs(
-    SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_CCRRegs);
+}
 
-  if (!isMips64())
-    return MatchOperand_NoMatch;
-  // If the first token is not '$' we have an error.
-  if (Parser.getTok().isNot(AsmToken::Dollar))
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  const MCExpr *IdVal;
+  // If the first token is '$' we may have register operand.
+  if (Parser.getTok().is(AsmToken::Dollar))
     return MatchOperand_NoMatch;
   SMLoc S = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat $
-
-  const AsmToken &Tok = Parser.getTok(); // Get the next token.
-  if (Tok.isNot(AsmToken::Integer))
-    return MatchOperand_NoMatch;
-
-  unsigned RegNum = Tok.getIntVal();
-  // At the moment only hwreg29 is supported.
-  if (RegNum != 29)
+  if (getParser().parseExpression(IdVal))
     return MatchOperand_ParseFail;
-
-  MipsOperand *op = MipsOperand::CreateReg(Mips::HWR29_64, S,
-                                           Parser.getTok().getLoc());
-  op->setRegKind(MipsOperand::Kind_HW64Regs);
-  Operands.push_back(op);
-
-  Parser.Lex(); // Eat the register number.
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal);
+  assert(MCE && "Unexpected MCExpr type.");
+  int64_t Val = MCE->getValue();
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(MipsOperand::CreateImm(
+      MCConstantExpr::Create(0 - Val, getContext()), S, E));
   return MatchOperand_Success;
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // If the first token is not '$' we have an error.
-  if (Parser.getTok().isNot(AsmToken::Dollar))
+MipsAsmParser::parseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  switch (getLexer().getKind()) {
+  default:
     return MatchOperand_NoMatch;
+  case AsmToken::LParen:
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+    break;
+  }
 
+  const MCExpr *Expr;
   SMLoc S = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat the '$'
-
-  const AsmToken &Tok = Parser.getTok(); // Get next token.
 
-  if (Tok.isNot(AsmToken::Integer))
-    return MatchOperand_NoMatch;
+  if (getParser().parseExpression(Expr))
+    return MatchOperand_ParseFail;
 
-  unsigned Reg = matchRegisterByNumber(Tok.getIntVal(), Mips::CCRRegClassID);
+  int64_t Val;
+  if (!Expr->EvaluateAsAbsolute(Val)) {
+    Error(S, "expected immediate value");
+    return MatchOperand_ParseFail;
+  }
 
-  MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
-  Op->setRegKind(MipsOperand::Kind_CCRRegs);
-  Operands.push_back(Op);
+  // The LSA instruction allows a 2-bit unsigned immediate. For this reason
+  // and because the CPU always adds one to the immediate field, the allowed
+  // range becomes 1..4. We'll only check the range here and will deal
+  // with the addition/subtraction when actually decoding/encoding
+  // the instruction.
+  if (Val < 1 || Val > 4) {
+    Error(S, "immediate not in range (1..4)");
+    return MatchOperand_ParseFail;
+  }
 
-  Parser.Lex(); // Eat the register number.
+  Operands.push_back(MipsOperand::CreateLSAImm(Expr, S,
+                                               Parser.getTok().getLoc()));
   return MatchOperand_Success;
 }
 
 MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
 
-  MCSymbolRefExpr::VariantKind VK
-                   = StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
-    .Case("hi",          MCSymbolRefExpr::VK_Mips_ABS_HI)
-    .Case("lo",          MCSymbolRefExpr::VK_Mips_ABS_LO)
-    .Case("gp_rel",      MCSymbolRefExpr::VK_Mips_GPREL)
-    .Case("call16",      MCSymbolRefExpr::VK_Mips_GOT_CALL)
-    .Case("got",         MCSymbolRefExpr::VK_Mips_GOT)
-    .Case("tlsgd",       MCSymbolRefExpr::VK_Mips_TLSGD)
-    .Case("tlsldm",      MCSymbolRefExpr::VK_Mips_TLSLDM)
-    .Case("dtprel_hi",   MCSymbolRefExpr::VK_Mips_DTPREL_HI)
-    .Case("dtprel_lo",   MCSymbolRefExpr::VK_Mips_DTPREL_LO)
-    .Case("gottprel",    MCSymbolRefExpr::VK_Mips_GOTTPREL)
-    .Case("tprel_hi",    MCSymbolRefExpr::VK_Mips_TPREL_HI)
-    .Case("tprel_lo",    MCSymbolRefExpr::VK_Mips_TPREL_LO)
-    .Case("got_disp",    MCSymbolRefExpr::VK_Mips_GOT_DISP)
-    .Case("got_page",    MCSymbolRefExpr::VK_Mips_GOT_PAGE)
-    .Case("got_ofst",    MCSymbolRefExpr::VK_Mips_GOT_OFST)
-    .Case("hi(%neg(%gp_rel",    MCSymbolRefExpr::VK_Mips_GPOFF_HI)
-    .Case("lo(%neg(%gp_rel",    MCSymbolRefExpr::VK_Mips_GPOFF_LO)
-    .Default(MCSymbolRefExpr::VK_None);
+  MCSymbolRefExpr::VariantKind VK =
+      StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
+          .Case("hi", MCSymbolRefExpr::VK_Mips_ABS_HI)
+          .Case("lo", MCSymbolRefExpr::VK_Mips_ABS_LO)
+          .Case("gp_rel", MCSymbolRefExpr::VK_Mips_GPREL)
+          .Case("call16", MCSymbolRefExpr::VK_Mips_GOT_CALL)
+          .Case("got", MCSymbolRefExpr::VK_Mips_GOT)
+          .Case("tlsgd", MCSymbolRefExpr::VK_Mips_TLSGD)
+          .Case("tlsldm", MCSymbolRefExpr::VK_Mips_TLSLDM)
+          .Case("dtprel_hi", MCSymbolRefExpr::VK_Mips_DTPREL_HI)
+          .Case("dtprel_lo", MCSymbolRefExpr::VK_Mips_DTPREL_LO)
+          .Case("gottprel", MCSymbolRefExpr::VK_Mips_GOTTPREL)
+          .Case("tprel_hi", MCSymbolRefExpr::VK_Mips_TPREL_HI)
+          .Case("tprel_lo", MCSymbolRefExpr::VK_Mips_TPREL_LO)
+          .Case("got_disp", MCSymbolRefExpr::VK_Mips_GOT_DISP)
+          .Case("got_page", MCSymbolRefExpr::VK_Mips_GOT_PAGE)
+          .Case("got_ofst", MCSymbolRefExpr::VK_Mips_GOT_OFST)
+          .Case("hi(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_HI)
+          .Case("lo(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_LO)
+          .Default(MCSymbolRefExpr::VK_None);
 
   return VK;
 }
 
-bool MipsAsmParser::
-ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
-                 SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool MipsAsmParser::ParseInstruction(
+    ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
   // Check if we have valid mnemonic
   if (!mnemonicIsValid(Name, 0)) {
     Parser.eatToEndOfStatement();
@@ -1757,12 +2316,13 @@ bool MipsAsmParser::parseSetAssignment() {
     // We have a '$' followed by something, make sure they are adjacent.
     if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
       return true;
-    StringRef Res = StringRef(DollarLoc.getPointer(),
-        getTok().getEndLoc().getPointer() - DollarLoc.getPointer());
+    StringRef Res =
+        StringRef(DollarLoc.getPointer(),
+                  getTok().getEndLoc().getPointer() - DollarLoc.getPointer());
     Symbol = getContext().GetOrCreateSymbol(Res);
     Parser.Lex();
-    Value = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
-                                    getContext());
+    Value =
+        MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, getContext());
   } else if (Parser.parseExpression(Value))
     return reportParseError("expected valid expression after comma");
 
@@ -1810,6 +2370,34 @@ bool MipsAsmParser::parseDirectiveSet() {
   return true;
 }
 
+bool MipsAsmParser::parseDirectiveMipsHackStocg() {
+  MCAsmParser &Parser = getParser();
+  StringRef Name;
+  if (Parser.parseIdentifier(Name))
+    reportParseError("expected identifier");
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token");
+  Lex();
+
+  int64_t Flags = 0;
+  if (Parser.parseAbsoluteExpression(Flags))
+    return TokError("unexpected token");
+
+  getTargetStreamer().emitMipsHackSTOCG(Sym, Flags);
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveMipsHackELFFlags() {
+  int64_t Flags = 0;
+  if (Parser.parseAbsoluteExpression(Flags))
+    return TokError("unexpected token");
+
+  getTargetStreamer().emitMipsHackELFFlags(Flags);
+  return false;
+}
+
 /// parseDirectiveWord
 ///  ::= .word [ expression (, expression)* ]
 bool MipsAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
@@ -1835,6 +2423,22 @@ bool MipsAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
   return false;
 }
 
+/// parseDirectiveGpWord
+///  ::= .gpword local_sym
+bool MipsAsmParser::parseDirectiveGpWord() {
+  const MCExpr *Value;
+  // EmitGPRel32Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  if (getParser().parseExpression(Value))
+    return true;
+  getParser().getStreamer().EmitGPRel32Value(Value);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(), "unexpected token in directive");
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
 bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
 
   StringRef IDVal = DirectiveID.getString();
@@ -1875,7 +2479,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
 
   if (IDVal == ".gpword") {
     // Ignore this directive for now.
-    Parser.eatToEndOfStatement();
+    parseDirectiveGpWord();
     return false;
   }
 
@@ -1884,6 +2488,12 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
     return false;
   }
 
+  if (IDVal == ".mips_hack_stocg")
+    return parseDirectiveMipsHackStocg();
+
+  if (IDVal == ".mips_hack_elf_flags")
+    return parseDirectiveMipsHackELFFlags();
+
   return true;
 }
 
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index aedb78b..6acc9a8 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -35,7 +35,6 @@ add_llvm_target(MipsCodeGen
   MipsMachineFunction.cpp
   MipsModuleISelDAGToDAG.cpp
   MipsOs16.cpp
-  MipsOptimizeMathLibCalls.cpp
   MipsRegisterInfo.cpp
   MipsSEFrameLowering.cpp
   MipsSEInstrInfo.cpp
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index d99df4d..60508a8 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -35,26 +35,33 @@ public:
   ///
   MipsDisassemblerBase(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
                        bool bigEndian) :
-    MCDisassembler(STI), RegInfo(Info), isBigEndian(bigEndian) {}
+    MCDisassembler(STI), RegInfo(Info),
+    IsN64(STI.getFeatureBits() & Mips::FeatureN64), isBigEndian(bigEndian) {}
 
   virtual ~MipsDisassemblerBase() {}
 
   const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
 
+  bool isN64() const { return IsN64; }
+
 private:
   OwningPtr<const MCRegisterInfo> RegInfo;
+  bool IsN64;
 protected:
   bool isBigEndian;
 };
 
 /// MipsDisassembler - a disasembler class for Mips32.
 class MipsDisassembler : public MipsDisassemblerBase {
+  bool IsMicroMips;
 public:
   /// Constructor     - Initializes the disassembler.
   ///
   MipsDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
                    bool bigEndian) :
-    MipsDisassemblerBase(STI, Info, bigEndian) {}
+    MipsDisassemblerBase(STI, Info, bigEndian) {
+      IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
+    }
 
   /// getInstruction - See MCDisassembler.
   virtual DecodeStatus getInstruction(MCInst &instr,
@@ -103,10 +110,15 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
                                              uint64_t Address,
                                              const void *Decoder);
 
-static DecodeStatus DecodeDSPRegsRegisterClass(MCInst &Inst,
-                                               unsigned RegNo,
-                                               uint64_t Address,
-                                               const void *Decoder);
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
 
 static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
                                              unsigned RegNo,
@@ -118,6 +130,11 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
                                              uint64_t Address,
                                              const void *Decoder);
 
+static DecodeStatus DecodeFGRH32RegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
 static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
                                            unsigned RegNo,
                                            uint64_t Address,
@@ -138,20 +155,45 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
                                               uint64_t Address,
                                               const void *Decoder);
 
-static DecodeStatus DecodeACRegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder);
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder);
 
-static DecodeStatus DecodeHIRegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder);
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
 
-static DecodeStatus DecodeLORegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder);
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
 
 static DecodeStatus DecodeBranchTarget(MCInst &Inst,
                                        unsigned Offset,
@@ -163,11 +205,38 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst,
                                      uint64_t Address,
                                      const void *Decoder);
 
+// DecodeBranchTargetMM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+// DecodeJumpTargetMM - Decode microMIPS jump target, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
 static DecodeStatus DecodeMem(MCInst &Inst,
                               unsigned Insn,
                               uint64_t Address,
                               const void *Decoder);
 
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
 static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn,
                                uint64_t Address,
                                const void *Decoder);
@@ -177,6 +246,13 @@ static DecodeStatus DecodeSimm16(MCInst &Inst,
                                  uint64_t Address,
                                  const void *Decoder);
 
+// Decode the immediate field of an LSA instruction which
+// is off by one.
+static DecodeStatus DecodeLSAImm(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder);
+
 static DecodeStatus DecodeInsSize(MCInst &Inst,
                                   unsigned Insn,
                                   uint64_t Address,
@@ -237,7 +313,8 @@ static DecodeStatus readInstruction32(const MemoryObject &region,
                                       uint64_t address,
                                       uint64_t &size,
                                       uint32_t &insn,
-                                      bool isBigEndian) {
+                                      bool isBigEndian,
+                                      bool IsMicroMips) {
   uint8_t Bytes[4];
 
   // We want to read exactly 4 Bytes of data.
@@ -255,10 +332,20 @@ static DecodeStatus readInstruction32(const MemoryObject &region,
   }
   else {
     // Encoded as a small-endian 32-bit word in the stream.
-    insn = (Bytes[0] <<  0) |
-           (Bytes[1] <<  8) |
-           (Bytes[2] << 16) |
-           (Bytes[3] << 24);
+    // Little-endian byte ordering:
+    //   mips32r2:   4 | 3 | 2 | 1
+    //   microMIPS:  2 | 1 | 4 | 3
+    if (IsMicroMips) {
+      insn = (Bytes[2] <<  0) |
+             (Bytes[3] <<  8) |
+             (Bytes[0] << 16) |
+             (Bytes[1] << 24);
+    } else {
+      insn = (Bytes[0] <<  0) |
+             (Bytes[1] <<  8) |
+             (Bytes[2] << 16) |
+             (Bytes[3] << 24);
+    }
   }
 
   return MCDisassembler::Success;
@@ -274,10 +361,21 @@ MipsDisassembler::getInstruction(MCInst &instr,
   uint32_t Insn;
 
   DecodeStatus Result = readInstruction32(Region, Address, Size,
-                                          Insn, isBigEndian);
+                                          Insn, isBigEndian, IsMicroMips);
   if (Result == MCDisassembler::Fail)
     return MCDisassembler::Fail;
 
+  if (IsMicroMips) {
+    // Calling the auto-generated decoder function.
+    Result = decodeInstruction(DecoderTableMicroMips32, instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+    return MCDisassembler::Fail;
+  }
+
   // Calling the auto-generated decoder function.
   Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
                              this, STI);
@@ -299,7 +397,7 @@ Mips64Disassembler::getInstruction(MCInst &instr,
   uint32_t Insn;
 
   DecodeStatus Result = readInstruction32(Region, Address, Size,
-                                          Insn, isBigEndian);
+                                          Insn, isBigEndian, false);
   if (Result == MCDisassembler::Fail)
     return MCDisassembler::Fail;
 
@@ -359,10 +457,20 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeDSPRegsRegisterClass(MCInst &Inst,
-                                               unsigned RegNo,
-                                               uint64_t Address,
-                                               const void *Decoder) {
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (static_cast<const MipsDisassembler *>(Decoder)->isN64())
+    return DecodeGPR64RegisterClass(Inst, RegNo, Address, Decoder);
+
+  return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
   return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
 }
 
@@ -390,6 +498,18 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeFGRH32RegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::FGRH32RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
                                            unsigned RegNo,
                                            uint64_t Address,
@@ -434,6 +554,58 @@ static DecodeStatus DecodeMem(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10));
+  unsigned Reg = fieldFromInstruction(Insn, 6, 5);
+  unsigned Base = fieldFromInstruction(Insn, 11, 5);
+
+  Reg = getReg(Decoder, Mips::MSA128BRegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  int Offset = SignExtend32<12>(Insn & 0x0fff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeFMem(MCInst &Inst,
                                unsigned Insn,
                                uint64_t Address,
@@ -477,38 +649,98 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeACRegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder) {
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
   if (RegNo >= 4)
     return MCDisassembler::Fail;
 
-  unsigned Reg = getReg(Decoder, Mips::ACRegsDSPRegClassID, RegNo);
+  unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Reg));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeHIRegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder) {
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
   if (RegNo >= 4)
     return MCDisassembler::Fail;
 
-  unsigned Reg = getReg(Decoder, Mips::HIRegsDSPRegClassID, RegNo);
+  unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Reg));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeLORegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder) {
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
   if (RegNo >= 4)
     return MCDisassembler::Fail;
 
-  unsigned Reg = getReg(Decoder, Mips::LORegsDSPRegClassID, RegNo);
+  unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Reg));
   return MCDisassembler::Success;
 }
@@ -533,6 +765,24 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  unsigned BranchOffset = Offset & 0xffff;
+  BranchOffset = SignExtend32<18>(BranchOffset << 1);
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 1;
+  Inst.addOperand(MCOperand::CreateImm(JumpOffset));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeSimm16(MCInst &Inst,
                                  unsigned Insn,
@@ -542,6 +792,15 @@ static DecodeStatus DecodeSimm16(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeLSAImm(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder) {
+  // We add one to the immediate field as it was encoded as 'imm - 1'.
+  Inst.addOperand(MCOperand::CreateImm(Insn + 1));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeInsSize(MCInst &Inst,
                                   unsigned Insn,
                                   uint64_t Address,
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index 369fece..7884589 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -184,6 +184,15 @@ void MipsInstPrinter::printUnsignedImm(const MCInst *MI, int opNum,
     printOperand(MI, opNum, O);
 }
 
+void MipsInstPrinter::printUnsignedImm8(const MCInst *MI, int opNum,
+                                        raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (unsigned short int)(unsigned char)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
 void MipsInstPrinter::
 printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
   // Load/Store memory operands -- imm($reg)
@@ -211,6 +220,11 @@ printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
   O << MipsFCCToString((Mips::CondCode)MO.getImm());
 }
 
+void MipsInstPrinter::
+printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) {
+  llvm_unreachable("TODO");
+}
+
 bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
                                  unsigned OpNo, raw_ostream &OS) {
   OS << "\t" << Str << "\t";
@@ -230,8 +244,11 @@ bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
 bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
   switch (MI.getOpcode()) {
   case Mips::BEQ:
+    // beq $zero, $zero, $L2 => b $L2
     // beq $r0, $zero, $L2 => beqz $r0, $L2
-    return isReg<Mips::ZERO>(MI, 1) && printAlias("beqz", MI, 0, 2, OS);
+    return (isReg<Mips::ZERO>(MI, 0) && isReg<Mips::ZERO>(MI, 1) &&
+            printAlias("b", MI, 2, OS)) ||
+           (isReg<Mips::ZERO>(MI, 1) && printAlias("beqz", MI, 0, 2, OS));
   case Mips::BEQ64:
     // beq $r0, $zero, $L2 => beqz $r0, $L2
     return isReg<Mips::ZERO_64>(MI, 1) && printAlias("beqz", MI, 0, 2, OS);
@@ -257,6 +274,7 @@ bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
     // jalr $ra, $r1 => jalr $r1
     return isReg<Mips::RA_64>(MI, 0) && printAlias("jalr", MI, 1, OS);
   case Mips::NOR:
+  case Mips::NOR_MM:
     // nor $r0, $r1, $zero => not $r0, $r1
     return isReg<Mips::ZERO>(MI, 2) && printAlias("not", MI, 0, 1, OS);
   case Mips::NOR64:
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 1253ab0..f75ae24 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -93,9 +93,11 @@ public:
 private:
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printUnsignedImm(const MCInst *MI, int opNum, raw_ostream &O);
+  void printUnsignedImm8(const MCInst *MI, int opNum, raw_ostream &O);
   void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
   void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
   void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O);
 
   bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
                   raw_ostream &OS);
diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index 1f08789..9116748 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -5,7 +5,7 @@ add_llvm_library(LLVMMipsDesc
   MipsMCTargetDesc.cpp
   MipsELFObjectWriter.cpp
   MipsReginfo.cpp
-  MipsELFStreamer.cpp
+  MipsTargetStreamer.cpp
   )
 
 add_dependencies(LLVMMipsDesc MipsCommonTableGen)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 0b13607..3e70b23 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -45,6 +45,10 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
   case Mips::fixup_Mips_GOT_DISP:
   case Mips::fixup_Mips_GOT_LO16:
   case Mips::fixup_Mips_CALL_LO16:
+  case Mips::fixup_MICROMIPS_LO16:
+  case Mips::fixup_MICROMIPS_GOT_PAGE:
+  case Mips::fixup_MICROMIPS_GOT_OFST:
+  case Mips::fixup_MICROMIPS_GOT_DISP:
     break;
   case Mips::fixup_Mips_PC16:
     // So far we are only using this type for branches.
@@ -65,6 +69,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
   case Mips::fixup_Mips_GOT_Local:
   case Mips::fixup_Mips_GOT_HI16:
   case Mips::fixup_Mips_CALL_HI16:
+  case Mips::fixup_MICROMIPS_HI16:
     // Get the 2nd 16-bits. Also add 1 if bit 15 is 1.
     Value = ((Value + 0x8000) >> 16) & 0xffff;
     break;
@@ -76,6 +81,13 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
     // Get the 4th 16-bits.
     Value = ((Value + 0x800080008000LL) >> 48) & 0xffff;
     break;
+  case Mips::fixup_MICROMIPS_26_S1:
+    Value >>= 1;
+    break;
+  case Mips::fixup_MICROMIPS_PC16_S1:
+    Value -= 4;
+    Value >>= 1;
+    break;
   }
 
   return Value;
@@ -188,7 +200,20 @@ public:
       { "fixup_Mips_GOT_HI16",     0,     16,   0 },
       { "fixup_Mips_GOT_LO16",     0,     16,   0 },
       { "fixup_Mips_CALL_HI16",    0,     16,   0 },
-      { "fixup_Mips_CALL_LO16",    0,     16,   0 }
+      { "fixup_Mips_CALL_LO16",    0,     16,   0 },
+      { "fixup_MICROMIPS_26_S1",   0,     26,   0 },
+      { "fixup_MICROMIPS_HI16",    0,     16,   0 },
+      { "fixup_MICROMIPS_LO16",    0,     16,   0 },
+      { "fixup_MICROMIPS_GOT16",   0,     16,   0 },
+      { "fixup_MICROMIPS_PC16_S1", 0,     16,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_MICROMIPS_CALL16",  0,     16,   0 },
+      { "fixup_MICROMIPS_GOT_DISP",        0,     16,   0 },
+      { "fixup_MICROMIPS_GOT_PAGE",        0,     16,   0 },
+      { "fixup_MICROMIPS_GOT_OFST",        0,     16,   0 },
+      { "fixup_MICROMIPS_TLS_DTPREL_HI16", 0,     16,   0 },
+      { "fixup_MICROMIPS_TLS_DTPREL_LO16", 0,     16,   0 },
+      { "fixup_MICROMIPS_TLS_TPREL_HI16",  0,     16,   0 },
+      { "fixup_MICROMIPS_TLS_TPREL_LO16",  0,     16,   0 }
     };
 
     if (Kind < FirstTargetFixupKind)
@@ -253,25 +278,33 @@ public:
 } // namespace
 
 // MCAsmBackend
-MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
                                              StringRef CPU) {
   return new MipsAsmBackend(T, Triple(TT).getOS(),
                             /*IsLittle*/true, /*Is64Bit*/false);
 }
 
-MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
                                              StringRef CPU) {
   return new MipsAsmBackend(T, Triple(TT).getOS(),
                             /*IsLittle*/false, /*Is64Bit*/false);
 }
 
-MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
                                              StringRef CPU) {
   return new MipsAsmBackend(T, Triple(TT).getOS(),
                             /*IsLittle*/true, /*Is64Bit*/true);
 }
 
-MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
                                              StringRef CPU) {
   return new MipsAsmBackend(T, Triple(TT).getOS(),
                             /*IsLittle*/false, /*Is64Bit*/true);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 6471b51..83c7d4b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -183,6 +183,45 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
   case Mips::fixup_Mips_CALL_LO16:
     Type = ELF::R_MIPS_CALL_LO16;
     break;
+  case Mips::fixup_MICROMIPS_26_S1:
+    Type = ELF::R_MICROMIPS_26_S1;
+    break;
+  case Mips::fixup_MICROMIPS_HI16:
+    Type = ELF::R_MICROMIPS_HI16;
+    break;
+  case Mips::fixup_MICROMIPS_LO16:
+    Type = ELF::R_MICROMIPS_LO16;
+    break;
+  case Mips::fixup_MICROMIPS_GOT16:
+    Type = ELF::R_MICROMIPS_GOT16;
+    break;
+  case Mips::fixup_MICROMIPS_PC16_S1:
+    Type = ELF::R_MICROMIPS_PC16_S1;
+    break;
+  case Mips::fixup_MICROMIPS_CALL16:
+    Type = ELF::R_MICROMIPS_CALL16;
+    break;
+  case Mips::fixup_MICROMIPS_GOT_DISP:
+    Type = ELF::R_MICROMIPS_GOT_DISP;
+    break;
+  case Mips::fixup_MICROMIPS_GOT_PAGE:
+    Type = ELF::R_MICROMIPS_GOT_PAGE;
+    break;
+  case Mips::fixup_MICROMIPS_GOT_OFST:
+    Type = ELF::R_MICROMIPS_GOT_OFST;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_DTPREL_HI16:
+    Type = ELF::R_MICROMIPS_TLS_DTPREL_HI16;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_DTPREL_LO16:
+    Type = ELF::R_MICROMIPS_TLS_DTPREL_LO16;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
+    Type = ELF::R_MICROMIPS_TLS_TPREL_HI16;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
+    Type = ELF::R_MICROMIPS_TLS_TPREL_LO16;
+    break;
   }
   return Type;
 }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
deleted file mode 100644
index cfcb877..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-//===-- MipsELFStreamer.cpp - MipsELFStreamer ---------------------------===//
-//
-//                       The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===-------------------------------------------------------------------===//
-#include "MCTargetDesc/MipsELFStreamer.h"
-#include "MipsSubtarget.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCELF.h"
-#include "llvm/MC/MCELFSymbolFlags.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-
-namespace llvm {
-
-  MCELFStreamer* createMipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                       raw_ostream &OS, MCCodeEmitter *Emitter,
-                                       bool RelaxAll, bool NoExecStack) {
-    MipsELFStreamer *S = new MipsELFStreamer(Context, TAB, OS, Emitter,
-                                             RelaxAll, NoExecStack);
-    return S;
-  }
-
-  // For llc. Set a group of ELF header flags
-  void
-  MipsELFStreamer::emitELFHeaderFlagsCG(const MipsSubtarget &Subtarget) {
-
-    if (hasRawTextSupport())
-      return;
-
-    // Update e_header flags
-    MCAssembler& MCA = getAssembler();
-    unsigned EFlags = MCA.getELFHeaderEFlags();
-
-    // TODO: Need to add -mabicalls and -mno-abicalls flags.
-    // Currently we assume that -mabicalls is the default.
-    EFlags |= ELF::EF_MIPS_CPIC;
-
-    if (Subtarget.inMips16Mode())
-      EFlags |= ELF::EF_MIPS_ARCH_ASE_M16;
-    else
-      EFlags |= ELF::EF_MIPS_NOREORDER;
-
-    // Architecture
-    if (Subtarget.hasMips64r2())
-      EFlags |= ELF::EF_MIPS_ARCH_64R2;
-    else if (Subtarget.hasMips64())
-      EFlags |= ELF::EF_MIPS_ARCH_64;
-    else if (Subtarget.hasMips32r2())
-      EFlags |= ELF::EF_MIPS_ARCH_32R2;
-    else
-      EFlags |= ELF::EF_MIPS_ARCH_32;
-
-    if (Subtarget.inMicroMipsMode())
-      EFlags |= ELF::EF_MIPS_MICROMIPS;
-
-    // ABI
-    if (Subtarget.isABI_O32())
-      EFlags |= ELF::EF_MIPS_ABI_O32;
-
-    // Relocation Model
-    Reloc::Model RM = Subtarget.getRelocationModel();
-    if (RM == Reloc::PIC_ || RM == Reloc::Default)
-      EFlags |= ELF::EF_MIPS_PIC;
-    else if (RM == Reloc::Static)
-      ; // Do nothing for Reloc::Static
-    else
-      llvm_unreachable("Unsupported relocation model for e_flags");
-
-    MCA.setELFHeaderEFlags(EFlags);
-  }
-
-  // For llc. Set a symbol's STO flags
-  void
-  MipsELFStreamer::emitMipsSTOCG(const MipsSubtarget &Subtarget,
-                                 MCSymbol *Sym,
-                                 unsigned Val) {
-
-    if (hasRawTextSupport())
-      return;
-
-    MCSymbolData &Data = getOrCreateSymbolData(Sym);
-    // The "other" values are stored in the last 6 bits of the second byte
-    // The traditional defines for STO values assume the full byte and thus
-    // the shift to pack it.
-    MCELF::setOther(Data, Val >> 2);
-  }
-
-} // namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
deleted file mode 100644
index b10ccc7..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//=== MipsELFStreamer.h - MipsELFStreamer ------------------------------===//
-//
-//                    The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENCE.TXT for details.
-//
-//===-------------------------------------------------------------------===//
-#ifndef MIPSELFSTREAMER_H_
-#define MIPSELFSTREAMER_H_
-
-#include "llvm/MC/MCELFStreamer.h"
-
-namespace llvm {
-class MipsAsmPrinter;
-class MipsSubtarget;
-class MCSymbol;
-
-class MipsELFStreamer : public MCELFStreamer {
-public:
-  MipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                  raw_ostream &OS, MCCodeEmitter *Emitter,
-                  bool RelaxAll, bool NoExecStack)
-    : MCELFStreamer(SK_MipsELFStreamer, Context, TAB, OS, Emitter) {
-  }
-
-  ~MipsELFStreamer() {}
-  void emitELFHeaderFlagsCG(const MipsSubtarget &Subtarget);
-  void emitMipsSTOCG(const MipsSubtarget &Subtarget,
-                     MCSymbol *Sym,
-                     unsigned Val);
-
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_MipsELFStreamer;
-  }
-};
-
-  MCELFStreamer* createMipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                       raw_ostream &OS, MCCodeEmitter *Emitter,
-                                       bool RelaxAll, bool NoExecStack);
-}
-
-#endif /* MIPSELFSTREAMER_H_ */
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index f963900..6ed44b7 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -128,6 +128,45 @@ namespace Mips {
     // resulting in - R_MIPS_CALL_LO16
     fixup_Mips_CALL_LO16,
 
+    // resulting in - R_MICROMIPS_26_S1
+    fixup_MICROMIPS_26_S1,
+
+    // resulting in - R_MICROMIPS_HI16
+    fixup_MICROMIPS_HI16,
+
+    // resulting in - R_MICROMIPS_LO16
+    fixup_MICROMIPS_LO16,
+
+    // resulting in - R_MICROMIPS_GOT16
+    fixup_MICROMIPS_GOT16,
+
+    // resulting in - R_MICROMIPS_PC16_S1
+    fixup_MICROMIPS_PC16_S1,
+
+    // resulting in - R_MICROMIPS_CALL16
+    fixup_MICROMIPS_CALL16,
+
+    // resulting in - R_MICROMIPS_GOT_DISP
+    fixup_MICROMIPS_GOT_DISP,
+
+    // resulting in - R_MICROMIPS_GOT_PAGE
+    fixup_MICROMIPS_GOT_PAGE,
+
+    // resulting in - R_MICROMIPS_GOT_OFST
+    fixup_MICROMIPS_GOT_OFST,
+
+    // resulting in - R_MICROMIPS_TLS_DTPREL_HI16
+    fixup_MICROMIPS_TLS_DTPREL_HI16,
+
+    // resulting in - R_MICROMIPS_TLS_DTPREL_LO16
+    fixup_MICROMIPS_TLS_DTPREL_LO16,
+
+    // resulting in - R_MICROMIPS_TLS_TPREL_HI16
+    fixup_MICROMIPS_TLS_TPREL_HI16,
+
+    // resulting in - R_MICROMIPS_TLS_TPREL_LO16
+    fixup_MICROMIPS_TLS_TPREL_LO16,
+
     // Marker
     LastTargetFixupKind,
     NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 33f6f96..6aa3c76 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -38,7 +38,6 @@ MipsMCAsmInfo::MipsMCAsmInfo(StringRef TT) {
   ZeroDirective               = "\t.space\t";
   GPRel32Directive            = "\t.gpword\t";
   GPRel64Directive            = "\t.gpdword\t";
-  WeakRefDirective            = "\t.weak\t";
   DebugLabelSuffix            = "=.";
   SupportsDebugInformation = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 772234e..1000113 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -14,12 +14,12 @@
 #ifndef MIPSTARGETASMINFO_H
 #define MIPSTARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
   class StringRef;
 
-  class MipsMCAsmInfo : public MCAsmInfo {
+  class MipsMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
     explicit MipsMCAsmInfo(StringRef TT);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 4dc6917..66428bd 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -39,11 +39,14 @@ class MipsMCCodeEmitter : public MCCodeEmitter {
   MCContext &Ctx;
   const MCSubtargetInfo &STI;
   bool IsLittleEndian;
+  bool IsMicroMips;
 
 public:
   MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_,
                     const MCSubtargetInfo &sti, bool IsLittle) :
-    MCII(mcii), Ctx(Ctx_), STI (sti), IsLittleEndian(IsLittle) {}
+    MCII(mcii), Ctx(Ctx_), STI (sti), IsLittleEndian(IsLittle) {
+      IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
+    }
 
   ~MipsMCCodeEmitter() {}
 
@@ -53,9 +56,17 @@ public:
 
   void EmitInstruction(uint64_t Val, unsigned Size, raw_ostream &OS) const {
     // Output the instruction encoding in little endian byte order.
-    for (unsigned i = 0; i < Size; ++i) {
-      unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
-      EmitByte((Val >> Shift) & 0xff, OS);
+    // Little-endian byte ordering:
+    //   mips32r2:   4 | 3 | 2 | 1
+    //   microMIPS:  2 | 1 | 4 | 3
+    if (IsLittleEndian && Size == 4 && IsMicroMips) {
+      EmitInstruction(Val>>16, 2, OS);
+      EmitInstruction(Val, 2, OS);
+    } else {
+      for (unsigned i = 0; i < Size; ++i) {
+        unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+        EmitByte((Val >> Shift) & 0xff, OS);
+      }
     }
   }
 
@@ -73,12 +84,24 @@ public:
    unsigned getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
                                  SmallVectorImpl<MCFixup> &Fixups) const;
 
+  // getBranchJumpOpValueMM - Return binary encoding of the microMIPS jump
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+
    // getBranchTargetOpValue - Return binary encoding of the branch
    // target operand. If the machine operand requires relocation,
    // record the relocation and return zero.
   unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
                                   SmallVectorImpl<MCFixup> &Fixups) const;
 
+  // getBranchTargetOpValue - Return binary encoding of the microMIPS branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups) const;
+
    // getMachineOpValue - Return binary encoding of operand. If the machin
    // operand requires relocation, record the relocation and return zero.
   unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
@@ -86,11 +109,17 @@ public:
 
   unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
                               SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
                               SmallVectorImpl<MCFixup> &Fixups) const;
 
+  // getLSAImmEncoding - Return binary encoding of LSA immediate.
+  unsigned getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+
   unsigned
   getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const;
 
@@ -142,6 +171,9 @@ static void LowerLargeShift(MCInst& Inst) {
   case Mips::DSRA:
     Inst.setOpcode(Mips::DSRA32);
     return;
+  case Mips::DROTR:
+    Inst.setOpcode(Mips::DROTR32);
+    return;
   }
 }
 
@@ -177,7 +209,7 @@ static void LowerDextDins(MCInst& InstIn) {
 }
 
 /// EncodeInstruction - Emit the instruction.
-/// Size the instruction (currently only 4 bytes
+/// Size the instruction with Desc.getSize().
 void MipsMCCodeEmitter::
 EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                   SmallVectorImpl<MCFixup> &Fixups) const
@@ -193,6 +225,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   case Mips::DSLL:
   case Mips::DSRL:
   case Mips::DSRA:
+  case Mips::DROTR:
     LowerLargeShift(TmpInst);
     break;
     // Double extract instruction is chosen by pos and size operands
@@ -201,6 +234,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     LowerDextDins(TmpInst);
   }
 
+  unsigned long N = Fixups.size();
   uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups);
 
   // Check for unimplemented opcodes.
@@ -213,6 +247,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   if (STI.getFeatureBits() & Mips::FeatureMicroMips) {
     int NewOpcode = Mips::Std2MicroMips (Opcode, Mips::Arch_micromips);
     if (NewOpcode != -1) {
+      if (Fixups.size() > N)
+        Fixups.pop_back();
       Opcode = NewOpcode;
       TmpInst.setOpcode (NewOpcode);
       Binary = getBinaryCodeForInstr(TmpInst, Fixups);
@@ -250,6 +286,28 @@ getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
   return 0;
 }
 
+/// getBranchTargetOpValue - Return binary encoding of the microMIPS branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValueMM expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                   MCFixupKind(Mips::
+                               fixup_MICROMIPS_PC16_S1)));
+  return 0;
+}
+
 /// getJumpTargetOpValue - Return binary encoding of the jump
 /// target operand. If the machine operand requires relocation,
 /// record the relocation and return zero.
@@ -271,6 +329,23 @@ getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
 }
 
 unsigned MipsMCCodeEmitter::
+getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getJumpTargetOpValueMM expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MICROMIPS_26_S1)));
+  return 0;
+}
+
+unsigned MipsMCCodeEmitter::
 getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
   int64_t Res;
 
@@ -300,31 +375,39 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
     FixupKind = Mips::fixup_Mips_GPOFF_LO;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT_PAGE :
-    FixupKind = Mips::fixup_Mips_GOT_PAGE;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_PAGE
+                            : Mips::fixup_Mips_GOT_PAGE;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT_OFST :
-    FixupKind = Mips::fixup_Mips_GOT_OFST;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_OFST
+                            : Mips::fixup_Mips_GOT_OFST;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT_DISP :
-    FixupKind = Mips::fixup_Mips_GOT_DISP;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_DISP
+                            : Mips::fixup_Mips_GOT_DISP;
     break;
   case MCSymbolRefExpr::VK_Mips_GPREL:
     FixupKind = Mips::fixup_Mips_GPREL16;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT_CALL:
-    FixupKind = Mips::fixup_Mips_CALL16;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_CALL16
+                            : Mips::fixup_Mips_CALL16;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT16:
-    FixupKind = Mips::fixup_Mips_GOT_Global;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT16
+                            : Mips::fixup_Mips_GOT_Global;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT:
-    FixupKind = Mips::fixup_Mips_GOT_Local;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT16
+                            : Mips::fixup_Mips_GOT_Local;
     break;
   case MCSymbolRefExpr::VK_Mips_ABS_HI:
-    FixupKind = Mips::fixup_Mips_HI16;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_HI16
+                            : Mips::fixup_Mips_HI16;
     break;
   case MCSymbolRefExpr::VK_Mips_ABS_LO:
-    FixupKind = Mips::fixup_Mips_LO16;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_LO16
+                            : Mips::fixup_Mips_LO16;
     break;
   case MCSymbolRefExpr::VK_Mips_TLSGD:
     FixupKind = Mips::fixup_Mips_TLSGD;
@@ -333,19 +416,23 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
     FixupKind = Mips::fixup_Mips_TLSLDM;
     break;
   case MCSymbolRefExpr::VK_Mips_DTPREL_HI:
-    FixupKind = Mips::fixup_Mips_DTPREL_HI;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
+                            : Mips::fixup_Mips_DTPREL_HI;
     break;
   case MCSymbolRefExpr::VK_Mips_DTPREL_LO:
-    FixupKind = Mips::fixup_Mips_DTPREL_LO;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
+                            : Mips::fixup_Mips_DTPREL_LO;
     break;
   case MCSymbolRefExpr::VK_Mips_GOTTPREL:
     FixupKind = Mips::fixup_Mips_GOTTPREL;
     break;
   case MCSymbolRefExpr::VK_Mips_TPREL_HI:
-    FixupKind = Mips::fixup_Mips_TPREL_HI;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
+                            : Mips::fixup_Mips_TPREL_HI;
     break;
   case MCSymbolRefExpr::VK_Mips_TPREL_LO:
-    FixupKind = Mips::fixup_Mips_TPREL_LO;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
+                            : Mips::fixup_Mips_TPREL_LO;
     break;
   case MCSymbolRefExpr::VK_Mips_HIGHER:
     FixupKind = Mips::fixup_Mips_HIGHER;
@@ -406,6 +493,17 @@ MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
   return (OffBits & 0xFFFF) | RegBits;
 }
 
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+                      SmallVectorImpl<MCFixup> &Fixups) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 11-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups);
+
+  return (OffBits & 0x0FFF) | RegBits;
+}
+
 unsigned
 MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
                                       SmallVectorImpl<MCFixup> &Fixups) const {
@@ -427,5 +525,13 @@ MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
   return Position + Size - 1;
 }
 
+unsigned
+MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups) const {
+  assert(MI.getOperand(OpNo).isImm());
+  // The immediate is encoded as 'immediate - 1'.
+  return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups) - 1;
+}
+
 #include "MipsGenMCCodeEmitter.inc"
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 837fabe..5548aaa 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -11,17 +11,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/MipsELFStreamer.h"
 #include "MipsMCTargetDesc.h"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MipsMCAsmInfo.h"
+#include "MipsTargetStreamer.h"
 #include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
@@ -125,14 +129,23 @@ static MCInstPrinter *createMipsMCInstPrinter(const Target &T,
 }
 
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
-                                    MCContext &Ctx, MCAsmBackend &MAB,
-                                    raw_ostream &_OS,
-                                    MCCodeEmitter *_Emitter,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
-  Triple TheTriple(TT);
-
-  return createMipsELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+                                    MCContext &Context, MCAsmBackend &MAB,
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    bool RelaxAll, bool NoExecStack) {
+  MipsTargetELFStreamer *S = new MipsTargetELFStreamer();
+  return createELFStreamer(Context, S, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCStreamer *
+createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                    bool isVerboseAsm, bool useLoc, bool useCFI,
+                    bool useDwarfDirectory, MCInstPrinter *InstPrint,
+                    MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) {
+  MipsTargetAsmStreamer *S = new MipsTargetAsmStreamer(OS);
+
+  return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
+                                 useDwarfDirectory, InstPrint, CE, TAB,
+                                 ShowInst);
 }
 
 extern "C" void LLVMInitializeMipsTargetMC() {
@@ -183,6 +196,12 @@ extern "C" void LLVMInitializeMipsTargetMC() {
   TargetRegistry::RegisterMCObjectStreamer(TheMips64elTarget,
                                            createMCStreamer);
 
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(TheMipsTarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheMipselTarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheMips64Target, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheMips64elTarget, createMCAsmStreamer);
+
   // Register the asm backend.
   TargetRegistry::RegisterMCAsmBackend(TheMipsTarget,
                                        createMipsAsmBackendEB32);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 71954a4..eabebfe 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -42,14 +42,14 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
                                          const MCSubtargetInfo &STI,
                                          MCContext &Ctx);
 
-MCAsmBackend *createMipsAsmBackendEB32(const Target &T, StringRef TT,
-                                       StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEL32(const Target &T, StringRef TT,
-                                       StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEB64(const Target &T, StringRef TT,
-                                       StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEL64(const Target &T, StringRef TT,
-                                       StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB32(const Target &T, const MCRegisterInfo &MRI,
+                                       StringRef TT, StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL32(const Target &T, const MCRegisterInfo &MRI,
+                                       StringRef TT, StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB64(const Target &T, const MCRegisterInfo &MRI,
+                                       StringRef TT, StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL64(const Target &T, const MCRegisterInfo &MRI,
+                                       StringRef TT, StringRef CPU);
 
 MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS,
                                           uint8_t OSABI,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
new file mode 100644
index 0000000..5e90bbc
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -0,0 +1,67 @@
+//===-- MipsTargetStreamer.cpp - Mips Target Streamer Methods -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> PrintHackDirectives("print-hack-directives",
+                                         cl::init(false), cl::Hidden);
+
+// pin vtable to this file
+void MipsTargetStreamer::anchor() {}
+
+MipsTargetAsmStreamer::MipsTargetAsmStreamer(formatted_raw_ostream &OS)
+    : OS(OS) {}
+
+void MipsTargetAsmStreamer::emitMipsHackELFFlags(unsigned Flags) {
+  if (!PrintHackDirectives)
+    return;
+
+  OS << "\t.mips_hack_elf_flags 0x";
+  OS.write_hex(Flags);
+  OS << '\n';
+}
+void MipsTargetAsmStreamer::emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) {
+  if (!PrintHackDirectives)
+    return;
+
+  OS << "\t.mips_hack_stocg ";
+  OS << Sym->getName();
+  OS << ", ";
+  OS << Val;
+  OS << '\n';
+}
+
+MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
+  return static_cast<MCELFStreamer &>(*Streamer);
+}
+
+void MipsTargetELFStreamer::emitMipsHackELFFlags(unsigned Flags) {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+// Set a symbol's STO flags
+void MipsTargetELFStreamer::emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) {
+  MCSymbolData &Data = getStreamer().getOrCreateSymbolData(Sym);
+  // The "other" values are stored in the last 6 bits of the second byte
+  // The traditional defines for STO values assume the full byte and thus
+  // the shift to pack it.
+  MCELF::setOther(Data, Val >> 2);
+}
diff --git a/lib/Target/Mips/MSA.txt b/lib/Target/Mips/MSA.txt
new file mode 100644
index 0000000..d1c4193
--- /dev/null
+++ b/lib/Target/Mips/MSA.txt
@@ -0,0 +1,78 @@
+Code Generation Notes for MSA
+=============================
+
+Intrinsics are lowered to SelectionDAG nodes where possible in order to enable
+optimisation, reduce the size of the ISel matcher, and reduce repetition in
+the implementation. In a small number of cases, this can cause different
+(semantically equivalent) instructions to be used in place of the requested
+instruction, even when no optimisation has taken place.
+
+Instructions
+============
+
+This section describes any quirks of instruction selection for MSA. For
+example, two instructions might be equally valid for some given IR and one is
+chosen in preference to the other.
+
+bclri.b:
+        It is not possible to emit bclri.b since andi.b covers exactly the
+        same cases. andi.b should use fractionally less power than bclri.b in
+        most hardware implementations so it is used in preference to bclri.b.
+
+vshf.w:
+        It is not possible to emit vshf.w when the shuffle description is
+        constant since shf.w covers exactly the same cases. shf.w is used
+        instead. It is also impossible for the shuffle description to be
+        unknown at compile-time due to the definition of shufflevector in
+        LLVM IR.
+
+vshf.[bhwd]
+        When the shuffle description describes a splat operation, splat.[bhwd]
+        instructions will be selected instead of vshf.[bhwd]. Unlike the ilv*,
+        and pck* instructions, this is matched from MipsISD::VSHF instead of
+        a special-case MipsISD node.
+
+ilvl.d, pckev.d:
+        It is not possible to emit ilvl.d, or pckev.d since ilvev.d covers the
+        same shuffle. ilvev.d will be emitted instead.
+
+ilvr.d, ilvod.d, pckod.d:
+        It is not possible to emit ilvr.d, or pckod.d since ilvod.d covers the
+        same shuffle. ilvod.d will be emitted instead.
+
+splat.[bhwd]
+        The intrinsic will work as expected. However, unlike other intrinsics
+        it lowers directly to MipsISD::VSHF instead of using common IR.
+
+splati.w:
+        It is not possible to emit splati.w since shf.w covers the same cases.
+        shf.w will be emitted instead.
+
+copy_s.w:
+        On MIPS32, the copy_u.d intrinsic will emit this instruction instead of
+        copy_u.w. This is semantically equivalent since the general-purpose
+        register file is 32-bits wide.
+
+binsri.[bhwd],  binsli.[bhwd]:
+        These two operations are equivalent to each other with the operands
+        swapped and condition inverted. The compiler may use either one as
+        appropriate.
+        Furthermore, the compiler may use bsel.[bhwd] for some masks that do
+        not survive the legalization process (this is a bug and will be fixed).
+
+bmnz.v, bmz.v, bsel.v:
+        These three operations differ only in the operand that is tied to the
+        result.
+        It is (currently) not possible to emit bmz.v, or bsel.v since bmnz.v is
+        the same operation and will be emitted instead.
+        In future, the compiler may choose between these three instructions
+        according to register allocation.
+
+bmnzi.b, bmzi.b:
+        Like their non-immediate counterparts, bmnzi.v and bmzi.v are the same
+        operation with the operands swapped. bmnzi.v will (currently) be emitted
+        for both cases.
+
+bseli.v:
+        Unlike the non-immediate versions, bseli.v is distinguishable from
+        bmnzi.b and bmzi.b and can be emitted.
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index 665b4d2..c12a32e 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -39,8 +39,8 @@ class SLTI_FM_MM<bits<6> op> : MMArch {
   bits<32> Inst;
 
   let Inst{31-26} = op;
-  let Inst{25-21} = rs;
-  let Inst{20-16} = rt;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
   let Inst{15-0}  = imm16;
 }
 
@@ -110,3 +110,195 @@ class LW_FM_MM<bits<6> op> : MMArch {
   let Inst{20-16} = addr{20-16};
   let Inst{15-0}  = addr{15-0};
 }
+
+class LWL_FM_MM<bits<4> funct> {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x18;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = addr{11-0};
+}
+
+class CMov_F_I_FM_MM<bits<7> func> : MMArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<3> fcc;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-13} = fcc;
+  let Inst{12-6}  = func;
+  let Inst{5-0}   = 0x3b;
+}
+
+class MTLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class MFLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rd;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class CLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class SEB_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class EXT_FM_MM<bits<6> funct> : MMArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> pos;
+  bits<5> size;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = size;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class J_FM_MM<bits<6> op> : MMArch {
+  bits<26> target;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-0}  = target;
+}
+
+class JR_FM_MM<bits<8> funct> : MMArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-21} = 0x00;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = 0x0;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class JALR_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class BEQ_FM_MM<bits<6> op> : MMArch {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZ_FM_MM<bits<5> funct> : MMArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZAL_FM_MM<bits<5> funct> : MMArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class TEQ_FM_MM<bits<6> funct> : MMArch {
+  bits<5> rs;
+  bits<5> rt;
+  bits<4> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-12} = code_;
+  let Inst{11-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class TEQI_FM_MM<bits<5> funct> : MMArch {
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm16;
+}
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 249d712..d9507fa 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1,4 +1,51 @@
-let isCodeGenOnly = 1 in {
+def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddrMM", [frameindex]>;
+
+def simm12 : Operand<i32> {
+  let DecoderMethod = "DecodeSimm12";
+}
+
+def mem_mm_12 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPR32, simm12);
+  let EncoderMethod = "getMemEncodingMMImm12";
+  let ParserMatchClass = MipsMemAsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def jmptarget_mm : Operand<OtherVT> {
+  let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def calltarget_mm : Operand<iPTR> {
+  let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def brtarget_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValueMM";
+  let OperandType   = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTargetMM";
+}
+
+let canFoldAsLoad = 1 in
+class LoadLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
+                      Operand MemOpnd> :
+  InstSE<(outs RO:$rt), (ins MemOpnd:$addr, RO:$src),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode addrimm12:$addr, RO:$src))],
+         NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  string Constraints = "$src = $rt";
+}
+
+class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
+                       Operand MemOpnd>:
+  InstSE<(outs), (ins RO:$rt, MemOpnd:$addr),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, addrimm12:$addr)], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+}
+
+let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
   /// Arithmetic Instructions (ALU Immediate)
   def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd>,
                  ADDI_FM_MM<0xc>;
@@ -32,17 +79,21 @@ let isCodeGenOnly = 1 in {
   def XOR_MM   : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, IIAlu, xor>,
                  ADD_FM_MM<0, 0x310>;
   def NOR_MM   : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>;
-  def MULT_MM  : MMRel, Mult<"mult", IIImul, GPR32Opnd, [HI, LO]>,
+  def MULT_MM  : MMRel, Mult<"mult", IIImul, GPR32Opnd, [HI0, LO0]>,
                  MULT_FM_MM<0x22c>;
-  def MULTu_MM : MMRel, Mult<"multu", IIImul, GPR32Opnd, [HI, LO]>,
+  def MULTu_MM : MMRel, Mult<"multu", IIImul, GPR32Opnd, [HI0, LO0]>,
                  MULT_FM_MM<0x26c>;
+  def SDIV_MM  : MMRel, Div<"div", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x2ac>;
+  def UDIV_MM  : MMRel, Div<"divu", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x2ec>;
 
   /// Shift Instructions
-  def SLL_MM   : MMRel, shift_rotate_imm<"sll", shamt, GPR32Opnd>,
+  def SLL_MM   : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd>,
                  SRA_FM_MM<0, 0>;
-  def SRL_MM   : MMRel, shift_rotate_imm<"srl", shamt, GPR32Opnd>,
+  def SRL_MM   : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd>,
                  SRA_FM_MM<0x40, 0>;
-  def SRA_MM   : MMRel, shift_rotate_imm<"sra", shamt, GPR32Opnd>,
+  def SRA_MM   : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd>,
                  SRA_FM_MM<0x80, 0>;
   def SLLV_MM  : MMRel, shift_rotate_reg<"sllv", GPR32Opnd>,
                  SRLV_FM_MM<0x10, 0>;
@@ -50,18 +101,119 @@ let isCodeGenOnly = 1 in {
                  SRLV_FM_MM<0x50, 0>;
   def SRAV_MM  : MMRel, shift_rotate_reg<"srav", GPR32Opnd>,
                  SRLV_FM_MM<0x90, 0>;
-  def ROTR_MM  : MMRel, shift_rotate_imm<"rotr", shamt, GPR32Opnd>,
+  def ROTR_MM  : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd>,
                  SRA_FM_MM<0xc0, 0>;
   def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd>,
                  SRLV_FM_MM<0xd0, 0>;
 
   /// Load and Store Instructions - aligned
-  defm LB_MM  : LoadM<"lb", GPR32Opnd, sextloadi8>, MMRel, LW_FM_MM<0x7>;
-  defm LBu_MM : LoadM<"lbu", GPR32Opnd, zextloadi8>, MMRel, LW_FM_MM<0x5>;
-  defm LH_MM  : LoadM<"lh", GPR32Opnd, sextloadi16>, MMRel, LW_FM_MM<0xf>;
-  defm LHu_MM : LoadM<"lhu", GPR32Opnd, zextloadi16>, MMRel, LW_FM_MM<0xd>;
-  defm LW_MM  : LoadM<"lw", GPR32Opnd>, MMRel, LW_FM_MM<0x3f>;
-  defm SB_MM  : StoreM<"sb", GPR32Opnd, truncstorei8>, MMRel, LW_FM_MM<0x6>;
-  defm SH_MM  : StoreM<"sh", GPR32Opnd, truncstorei16>, MMRel, LW_FM_MM<0xe>;
-  defm SW_MM  : StoreM<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>;
+  let DecoderMethod = "DecodeMemMMImm16" in {
+    def LB_MM  : Load<"lb", GPR32Opnd>, MMRel, LW_FM_MM<0x7>;
+    def LBu_MM : Load<"lbu", GPR32Opnd>, MMRel, LW_FM_MM<0x5>;
+    def LH_MM  : Load<"lh", GPR32Opnd>, MMRel, LW_FM_MM<0xf>;
+    def LHu_MM : Load<"lhu", GPR32Opnd>, MMRel, LW_FM_MM<0xd>;
+    def LW_MM  : Load<"lw", GPR32Opnd>, MMRel, LW_FM_MM<0x3f>;
+    def SB_MM  : Store<"sb", GPR32Opnd>, MMRel, LW_FM_MM<0x6>;
+    def SH_MM  : Store<"sh", GPR32Opnd>, MMRel, LW_FM_MM<0xe>;
+    def SW_MM  : Store<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>;
+  }
+
+  /// Load and Store Instructions - unaligned
+  def LWL_MM : LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x0>;
+  def LWR_MM : LoadLeftRightMM<"lwr", MipsLWR, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x1>;
+  def SWL_MM : StoreLeftRightMM<"swl", MipsSWL, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x8>;
+  def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x9>;
+
+  /// Move Conditional
+  def MOVZ_I_MM : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd,
+                  NoItinerary>, ADD_FM_MM<0, 0x58>;
+  def MOVN_I_MM : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
+                  NoItinerary>, ADD_FM_MM<0, 0x18>;
+  def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, IIAlu>,
+                  CMov_F_I_FM_MM<0x25>;
+  def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, IIAlu>,
+                  CMov_F_I_FM_MM<0x5>;
+
+  /// Move to/from HI/LO
+  def MTHI_MM : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>,
+                MTLO_FM_MM<0x0b5>;
+  def MTLO_MM : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>,
+                MTLO_FM_MM<0x0f5>;
+  def MFHI_MM : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>,
+                MFLO_FM_MM<0x035>;
+  def MFLO_MM : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>,
+                MFLO_FM_MM<0x075>;
+
+  /// Multiply Add/Sub Instructions
+  def MADD_MM  : MMRel, MArithR<"madd", 1>, MULT_FM_MM<0x32c>;
+  def MADDU_MM : MMRel, MArithR<"maddu", 1>, MULT_FM_MM<0x36c>;
+  def MSUB_MM  : MMRel, MArithR<"msub">, MULT_FM_MM<0x3ac>;
+  def MSUBU_MM : MMRel, MArithR<"msubu">, MULT_FM_MM<0x3ec>;
+
+  /// Count Leading
+  def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM_MM<0x16c>;
+  def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM_MM<0x12c>;
+
+  /// Sign Ext In Register Instructions.
+  def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd>, SEB_FM_MM<0x0ac>;
+  def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd>, SEB_FM_MM<0x0ec>;
+
+  /// Word Swap Bytes Within Halfwords
+  def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>;
+
+  def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>,
+               EXT_FM_MM<0x2c>;
+  def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>,
+               EXT_FM_MM<0x0c>;
+
+  /// Jump Instructions
+  let DecoderMethod = "DecodeJumpTargetMM" in {
+    def J_MM        : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
+                      J_FM_MM<0x35>;
+    def JAL_MM      : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
+    def TAILCALL_MM : MMRel, JumpFJ<calltarget_mm, "j", MipsTailCall, imm,
+                                    "tcall">, J_FM_MM<0x3d>, IsTailCall;
+  }
+  def JR_MM   : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>;
+  def JALR_MM : MMRel, JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
+  def TAILCALL_R_MM : MMRel, JumpFR<"tcallr", GPR32Opnd, MipsTailCall>,
+                      JR_FM_MM<0x3c>, IsTailCall;
+  def RET_MM : MMRel, RetBase<"ret", GPR32Opnd>, JR_FM_MM<0x3c>;
+
+  /// Branch Instructions
+  def BEQ_MM  : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>,
+                BEQ_FM_MM<0x25>;
+  def BNE_MM  : MMRel, CBranch<"bne", brtarget_mm, setne, GPR32Opnd>,
+                BEQ_FM_MM<0x2d>;
+  def BGEZ_MM : MMRel, CBranchZero<"bgez", brtarget_mm, setge, GPR32Opnd>,
+                BGEZ_FM_MM<0x2>;
+  def BGTZ_MM : MMRel, CBranchZero<"bgtz", brtarget_mm, setgt, GPR32Opnd>,
+                BGEZ_FM_MM<0x6>;
+  def BLEZ_MM : MMRel, CBranchZero<"blez", brtarget_mm, setle, GPR32Opnd>,
+                BGEZ_FM_MM<0x4>;
+  def BLTZ_MM : MMRel, CBranchZero<"bltz", brtarget_mm, setlt, GPR32Opnd>,
+                BGEZ_FM_MM<0x0>;
+  def BGEZAL_MM : MMRel, BGEZAL_FT<"bgezal", brtarget_mm, GPR32Opnd>,
+                  BGEZAL_FM_MM<0x03>;
+  def BLTZAL_MM : MMRel, BGEZAL_FT<"bltzal", brtarget_mm, GPR32Opnd>,
+                  BGEZAL_FM_MM<0x01>;
+
+  /// Trap Instructions
+  def TEQ_MM  : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM_MM<0x0>;
+  def TGE_MM  : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM_MM<0x08>;
+  def TGEU_MM : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM_MM<0x10>;
+  def TLT_MM  : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM_MM<0x20>;
+  def TLTU_MM : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM_MM<0x28>;
+  def TNE_MM  : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM_MM<0x30>;
+
+  def TEQI_MM  : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM_MM<0x0e>;
+  def TGEI_MM  : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM_MM<0x09>;
+  def TGEIU_MM : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM_MM<0x0b>;
+  def TLTI_MM  : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM_MM<0x08>;
+  def TLTIU_MM : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM_MM<0x0a>;
+  def TNEI_MM  : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM_MM<0x0c>;
 }
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index b88c0d2..e796deb 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -28,7 +28,6 @@ namespace llvm {
   FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
                                              JITCodeEmitter &JCE);
   FunctionPass *createMipsConstantIslandPass(MipsTargetMachine &tm);
-  FunctionPass *createMipsOptimizeMathLibCalls(MipsTargetMachine &TM);
 } // end namespace llvm;
 
 #endif
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 2595e41..b8e3f39 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -78,6 +78,8 @@ def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">;
 def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true",
                                     "Mips DSP-R2 ASE", [FeatureDSP]>;
 
+def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
+
 def FeatureMicroMips  : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
                                          "microMips mode">;
 
diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
index 54fdb78..8ce2ced 100644
--- a/lib/Target/Mips/Mips16FrameLowering.h
+++ b/lib/Target/Mips/Mips16FrameLowering.h
@@ -20,7 +20,7 @@ namespace llvm {
 class Mips16FrameLowering : public MipsFrameLowering {
 public:
   explicit Mips16FrameLowering(const MipsSubtarget &STI)
-    : MipsFrameLowering(STI, 8) {}
+    : MipsFrameLowering(STI, STI.stackAlignment()) {}
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index 7e456aa..81bf18c 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -16,6 +16,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <string>
 
 static void inlineAsmOut
@@ -321,6 +322,37 @@ static void assureFPCallStub(Function &F, Module *M,
 }
 
 //
+// Functions that are llvm intrinsics and don't need helpers.
+//
+static const char *IntrinsicInline[] =
+  {"fabs",
+   "fabsf",
+   "llvm.ceil.f32", "llvm.ceil.f64",
+   "llvm.copysign.f32", "llvm.copysign.f64",
+   "llvm.cos.f32", "llvm.cos.f64",
+   "llvm.exp.f32", "llvm.exp.f64",
+   "llvm.exp2.f32", "llvm.exp2.f64",
+   "llvm.fabs.f32", "llvm.fabs.f64",
+   "llvm.floor.f32", "llvm.floor.f64",
+   "llvm.fma.f32", "llvm.fma.f64",
+   "llvm.log.f32", "llvm.log.f64",
+   "llvm.log10.f32", "llvm.log10.f64",
+   "llvm.nearbyint.f32", "llvm.nearbyint.f64",
+   "llvm.pow.f32", "llvm.pow.f64",
+   "llvm.powi.f32", "llvm.powi.f64",
+   "llvm.rint.f32", "llvm.rint.f64",
+   "llvm.round.f32", "llvm.round.f64",
+   "llvm.sin.f32", "llvm.sin.f64",
+   "llvm.sqrt.f32", "llvm.sqrt.f64",
+   "llvm.trunc.f32", "llvm.trunc.f64",
+  };
+
+static bool isIntrinsicInline(Function *F) {
+  return std::binary_search(
+    IntrinsicInline, array_endof(IntrinsicInline),
+    F->getName());
+}
+//
 // Returns of float, double and complex need to be handled with a helper
 // function.
 //
@@ -372,7 +404,7 @@ static bool fixupFPReturnAndCall
           // helper functions
           if (Subtarget.getRelocationModel() != Reloc::PIC_ ) {
             Function *F_ =  CI->getCalledFunction();
-            if (F_ && needsFPHelperFromSig(*F_)) {
+            if (F_ && !isIntrinsicInline(F_) && needsFPHelperFromSig(*F_)) {
               assureFPCallStub(*F_, M, Subtarget);
               Modified=true;
             }
@@ -390,7 +422,7 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
   std::string Name = F->getName();
   std::string SectionName = ".mips16.fn." + Name;
   std::string StubName = "__fn_stub_" + Name;
-  std::string LocalName = "__fn_local_" + Name;
+  std::string LocalName = "$$__fn_local_" + Name;
   Function *FStub = Function::Create
     (F->getFunctionType(),
      Function::InternalLinkage, StubName, M);
@@ -405,19 +437,36 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
   IAH.Out(" .set  macro");
   if (PicMode) {
     IAH.Out(".set noreorder");
-    IAH.Out(".cpload  $$2");
+    IAH.Out(".cpload  $$25");
     IAH.Out(".set reorder");
     IAH.Out(".reloc 0,R_MIPS_NONE," + Name);
     IAH.Out("la $$25," + LocalName);
   }
-  else
-    IAH.Out("la $$25, " + Name);
+  else {
+    IAH.Out(".set reorder");
+    IAH.Out("la $$25," + Name);
+  }
   swapFPIntParams(PV, M, IAH, LE, false);
   IAH.Out("jr $$25");
   IAH.Out(LocalName + " = " + Name);
   new UnreachableInst(FStub->getContext(), BB);
 }
 
+//
+// remove the use-soft-float attribute
+//
+static void removeUseSoftFloat(Function &F) {
+  AttributeSet A;
+  DEBUG(errs() << "removing -use-soft-float\n");
+  A = A.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+                     "use-soft-float", "false");
+  F.removeAttributes(AttributeSet::FunctionIndex, A);
+  if (F.hasFnAttribute("use-soft-float")) {
+    DEBUG(errs() << "still has -use-soft-float\n");
+  }
+  F.addAttributes(AttributeSet::FunctionIndex, A);
+}
+
 namespace llvm {
 
 //
@@ -441,6 +490,11 @@ bool Mips16HardFloat::runOnModule(Module &M) {
   DEBUG(errs() << "Run on Module Mips16HardFloat\n");
   bool Modified = false;
   for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->hasFnAttribute("nomips16") &&
+        F->hasFnAttribute("use-soft-float")) {
+      removeUseSoftFloat(*F);
+      continue;
+    }
     if (F->isDeclaration() || F->hasFnAttribute("mips16_fp_stub") ||
         F->hasFnAttribute("nomips16")) continue;
     Modified |= fixupFPReturnAndCall(*F, &M, Subtarget);
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 0caa277..4948f40 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -80,10 +80,11 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
   V1 = RegInfo.createVirtualRegister(RC);
   V2 = RegInfo.createVirtualRegister(RC);
 
-  BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0)
-    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
-  BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), V1)
-    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+  BuildMI(MBB, I, DL, TII.get(Mips::GotPrologue16), V0).
+    addReg(V1, RegState::Define).
+    addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI).
+    addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+
   BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16);
   BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg)
     .addReg(V1).addReg(V2);
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index 6ed1d9e..61d8bb8 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -90,6 +90,7 @@ static const Mips16Libcall HardFloatLibCalls[] = {
 };
 
 static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = {
+  {"__fixunsdfsi", "__mips16_call_stub_2" },
   {"ceil",  "__mips16_call_stub_df_2"},
   {"ceilf", "__mips16_call_stub_sf_1"},
   {"copysign",  "__mips16_call_stub_df_10"},
@@ -144,6 +145,11 @@ Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::ATOMIC_LOAD_UMIN,   MVT::i32,   Expand);
   setOperationAction(ISD::ATOMIC_LOAD_UMAX,   MVT::i32,   Expand);
 
+  setOperationAction(ISD::ROTR, MVT::i32,  Expand);
+  setOperationAction(ISD::ROTR, MVT::i64,  Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
   computeRegisterProperties();
 }
 
@@ -168,57 +174,57 @@ Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case Mips::SelBneZ:
     return emitSel16(Mips::BnezRxImm16, MI, BB);
   case Mips::SelTBteqZCmpi:
-    return emitSeliT16(Mips::BteqzX16, Mips::CmpiRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Bteqz16, Mips::CmpiRxImmX16, MI, BB);
   case Mips::SelTBteqZSlti:
-    return emitSeliT16(Mips::BteqzX16, Mips::SltiRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Bteqz16, Mips::SltiRxImmX16, MI, BB);
   case Mips::SelTBteqZSltiu:
-    return emitSeliT16(Mips::BteqzX16, Mips::SltiuRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Bteqz16, Mips::SltiuRxImmX16, MI, BB);
   case Mips::SelTBtneZCmpi:
-    return emitSeliT16(Mips::BtnezX16, Mips::CmpiRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Btnez16, Mips::CmpiRxImmX16, MI, BB);
   case Mips::SelTBtneZSlti:
-    return emitSeliT16(Mips::BtnezX16, Mips::SltiRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Btnez16, Mips::SltiRxImmX16, MI, BB);
   case Mips::SelTBtneZSltiu:
-    return emitSeliT16(Mips::BtnezX16, Mips::SltiuRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Btnez16, Mips::SltiuRxImmX16, MI, BB);
   case Mips::SelTBteqZCmp:
-    return emitSelT16(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
+    return emitSelT16(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
   case Mips::SelTBteqZSlt:
-    return emitSelT16(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
+    return emitSelT16(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
   case Mips::SelTBteqZSltu:
-    return emitSelT16(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
+    return emitSelT16(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
   case Mips::SelTBtneZCmp:
-    return emitSelT16(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
+    return emitSelT16(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
   case Mips::SelTBtneZSlt:
-    return emitSelT16(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
+    return emitSelT16(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
   case Mips::SelTBtneZSltu:
-    return emitSelT16(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+    return emitSelT16(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
   case Mips::BteqzT8CmpX16:
-    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
   case Mips::BteqzT8SltX16:
-    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
   case Mips::BteqzT8SltuX16:
     // TBD: figure out a way to get this or remove the instruction
     // altogether.
-    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
   case Mips::BtnezT8CmpX16:
-    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
   case Mips::BtnezT8SltX16:
-    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
   case Mips::BtnezT8SltuX16:
     // TBD: figure out a way to get this or remove the instruction
     // altogether.
-    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
   case Mips::BteqzT8CmpiX16: return emitFEXT_T8I8I16_ins(
-    Mips::BteqzX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
+    Mips::Bteqz16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
   case Mips::BteqzT8SltiX16: return emitFEXT_T8I8I16_ins(
-    Mips::BteqzX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
+    Mips::Bteqz16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
   case Mips::BteqzT8SltiuX16: return emitFEXT_T8I8I16_ins(
-    Mips::BteqzX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
+    Mips::Bteqz16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
   case Mips::BtnezT8CmpiX16: return emitFEXT_T8I8I16_ins(
-    Mips::BtnezX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
+    Mips::Btnez16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
   case Mips::BtnezT8SltiX16: return emitFEXT_T8I8I16_ins(
-    Mips::BtnezX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
+    Mips::Btnez16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
   case Mips::BtnezT8SltiuX16: return emitFEXT_T8I8I16_ins(
-    Mips::BtnezX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
+    Mips::Btnez16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
     break;
   case Mips::SltCCRxRy16:
     return emitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
@@ -418,6 +424,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
             bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
             CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
   SelectionDAG &DAG = CLI.DAG;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
   const char* Mips16HelperFunction = 0;
   bool NeedMips16Helper = false;
 
@@ -473,7 +481,10 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
     if (NeedMips16Helper) {
       RegsToPass.push_front(std::make_pair(V0Reg, Callee));
       JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy());
-      JumpTarget = getAddrGlobal(JumpTarget, DAG, MipsII::MO_GOT);
+      ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget);
+      JumpTarget = getAddrGlobal(S, JumpTarget.getValueType(), DAG,
+                                 MipsII::MO_GOT, Chain,
+                                 FuncInfo->callPtrInfo(S->getSymbol()));
     } else
       RegsToPass.push_front(std::make_pair((unsigned)Mips::T9, Callee));
   }
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 05e70ab..000ea28 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -10,7 +10,6 @@
 // This file contains the Mips16 implementation of the TargetInstrInfo class.
 //
 //===----------------------------------------------------------------------===//
-#include <stdio.h>
 #include "Mips16InstrInfo.h"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MipsMachineFunction.h"
@@ -20,10 +19,12 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
+#include <cctype>
 
 using namespace llvm;
 
@@ -36,7 +37,7 @@ static cl::opt<bool> NeverUseSaveRestore(
 
 
 Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
-  : MipsInstrInfo(tm, Mips::BimmX16),
+  : MipsInstrInfo(tm, Mips::Bimm16),
     RI(*tm.getSubtargetImpl()) {}
 
 const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
@@ -77,11 +78,11 @@ void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   else if (Mips::GPR32RegClass.contains(DestReg) &&
            Mips::CPU16RegsRegClass.contains(SrcReg))
     Opc = Mips::Move32R16;
-  else if ((SrcReg == Mips::HI) &&
+  else if ((SrcReg == Mips::HI0) &&
            (Mips::CPU16RegsRegClass.contains(DestReg)))
     Opc = Mips::Mfhi16, SrcReg = 0;
 
-  else if ((SrcReg == Mips::LO) &&
+  else if ((SrcReg == Mips::LO0) &&
            (Mips::CPU16RegsRegClass.contains(DestReg)))
     Opc = Mips::Mflo16, SrcReg = 0;
 
@@ -151,13 +152,17 @@ unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
   default:  llvm_unreachable("Illegal opcode!");
   case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16;
   case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16;
+  case Mips::BeqzRxImm16: return Mips::BnezRxImm16;
+  case Mips::BnezRxImm16: return Mips::BeqzRxImm16;
   case Mips::BteqzT8CmpX16: return Mips::BtnezT8CmpX16;
   case Mips::BteqzT8SltX16: return Mips::BtnezT8SltX16;
   case Mips::BteqzT8SltiX16: return Mips::BtnezT8SltiX16;
+  case Mips::Btnez16: return Mips::Bteqz16;
   case Mips::BtnezX16: return Mips::BteqzX16;
   case Mips::BtnezT8CmpiX16: return Mips::BteqzT8CmpiX16;
   case Mips::BtnezT8SltuX16: return Mips::BteqzT8SltuX16;
   case Mips::BtnezT8SltiuX16: return Mips::BteqzT8SltiuX16;
+  case Mips::Bteqz16: return Mips::Btnez16;
   case Mips::BteqzX16: return Mips::BtnezX16;
   case Mips::BteqzT8CmpiX16: return Mips::BtnezT8CmpiX16;
   case Mips::BteqzT8SltuX16: return Mips::BtnezT8SltuX16;
@@ -439,6 +444,9 @@ Mips16InstrInfo::basicLoadImmediate(
 
 unsigned Mips16InstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
   return (Opc == Mips::BeqzRxImmX16   || Opc == Mips::BimmX16  ||
+          Opc == Mips::Bimm16  ||
+          Opc == Mips::Bteqz16        || Opc == Mips::Btnez16 ||
+          Opc == Mips::BeqzRxImm16    || Opc == Mips::BnezRxImm16   ||
           Opc == Mips::BnezRxImmX16   || Opc == Mips::BteqzX16 ||
           Opc == Mips::BteqzT8CmpX16  || Opc == Mips::BteqzT8CmpiX16 ||
           Opc == Mips::BteqzT8SltX16  || Opc == Mips::BteqzT8SltuX16  ||
@@ -473,7 +481,6 @@ const MipsInstrInfo *llvm::createMips16InstrInfo(MipsTargetMachine &TM) {
   return new Mips16InstrInfo(TM);
 }
 
-#include <stdio.h>
 bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg,
                                      int64_t Amount) {
   switch (Opcode) {
@@ -493,6 +500,49 @@ bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg,
       return isInt<16>(Amount);
     return isInt<15>(Amount);
   }
-  printf("Unexpected opcode %i \n", Opcode);
   llvm_unreachable("unexpected Opcode in validImmediate");
 }
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// We implement the special case of the .space directive taking only an
+/// integer argument, which is the size in bytes. This is used for creating
+/// inline code spacing for testing purposes using inline assembly.
+///
+unsigned Mips16InstrInfo::getInlineAsmLength(const char *Str,
+                                             const MCAsmInfo &MAI) const {
+
+
+  // Count the number of instructions in the asm.
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+                                strlen(MAI.getSeparatorString())) == 0)
+      atInsnStart = true;
+    if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+      if (strncmp(Str, ".space", 6)==0) {
+        char *EStr; int Sz;
+        Sz = strtol(Str+6, &EStr, 10);
+        while (isspace(*EStr)) ++EStr;
+        if (*EStr=='\0') {
+          DEBUG(dbgs() << "parsed .space " << Sz << '\n');
+          return Sz;
+        }
+      }
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+    if (atInsnStart && strncmp(Str, MAI.getCommentString(),
+                               strlen(MAI.getCommentString())) == 0)
+      atInsnStart = false;
+  }
+
+  return Length;
+}
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index 118d258..d9a594b 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -108,6 +108,8 @@ public:
   void BuildAddiuSpImm
     (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const;
 
+  unsigned getInlineAsmLength(const char *Str,
+                              const MCAsmInfo &MAI) const;
 private:
   virtual unsigned getAnalyzableBrOpc(unsigned Opc) const;
 
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index aef4e92..7441c78 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -32,6 +32,16 @@ def mem16_ea : Operand<i32> {
 }
 
 //
+// I-type instruction format
+//
+// this is only used by bimm. the actual assembly value is a 12 bit signed
+// number
+//
+class FI16_ins<bits<5> op, string asmstr, InstrItinClass itin>:
+  FI16<op, (outs), (ins brtarget:$imm16),
+            !strconcat(asmstr, "\t$imm16 # 16 bit inst"), [], itin>;
+
+//
 //
 // I8 instruction format
 //
@@ -41,7 +51,10 @@ class FI816_ins_base<bits<3> _func, string asmstr,
   FI816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
         [], itin>;
 
-
+class FI816_ins<bits<3> _func, string asmstr,
+                InstrItinClass itin>:
+  FI816_ins_base<_func, asmstr, "\t$imm  # 16 bit inst", itin>;
+ 
 class FI816_SP_ins<bits<3> _func, string asmstr,
                    InstrItinClass itin>:
   FI816_ins_base<_func, asmstr, "\t$$sp, $imm # 16 bit inst", itin>;
@@ -60,6 +73,11 @@ class FRI16_ins<bits<5> op, string asmstr,
                 InstrItinClass itin>:
   FRI16_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
 
+class FRI16_TCP_ins<bits<5> _op, string asmstr,
+                    InstrItinClass itin>:
+  FRI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+            !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin>;
+            
 class FRI16R_ins_base<bits<5> op, string asmstr, string asmstr2,
                      InstrItinClass itin>:
   FRI16<op, (outs), (ins CPU16Regs:$rx, simm16:$imm),
@@ -172,6 +190,11 @@ class FEXT_RI16_B_ins<bits<5> _op, string asmstr,
   FEXT_RI16<_op, (outs), (ins  CPU16Regs:$rx, brtarget:$imm),
             !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
 
+class FEXT_RI16_TCP_ins<bits<5> _op, string asmstr,
+                        InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+            !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+
 class FEXT_2RI16_ins<bits<5> _op, string asmstr,
                      InstrItinClass itin>:
   FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
@@ -220,7 +243,7 @@ class FEXT_RRI_A16_mem_ins<bits<1> op, string asmstr, Operand MemOpnd,
 // EXT-SHIFT instruction format
 //
 class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>:
-  FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, shamt:$sa),
+  FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, uimm5:$sa),
                !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>;
 
 //
@@ -343,6 +366,14 @@ class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra,
   FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx),
               !strconcat(asmstr, "\t $rx"), [], itin> ;
 
+class FRR_SF16_ins
+  <bits<5> _funct, bits<3> _subfunc,
+    string asmstr, InstrItinClass itin>:
+  FRR_SF16<_funct, _subfunc, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_),
+           !strconcat(asmstr, "\t $rx"),
+           [], itin> {
+  let Constraints = "$rx_ = $rx";
+  }
 //
 // RRR-type instruction format
 //
@@ -447,7 +478,7 @@ def Constant32:
   MipsPseudo16<(outs), (ins imm32:$imm), "\t.word $imm", []>;
 
 def LwConstant32:
-  MipsPseudo16<(outs CPU16Regs:$rx), (ins imm32:$imm),
+  MipsPseudo16<(outs CPU16Regs:$rx), (ins imm32:$imm, imm32:$constid),
     "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>;
 
 
@@ -559,6 +590,14 @@ def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
 //
 def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
 
+//
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch (Extended)
+// To do an unconditional PC-relative branch.
+//
+
+def Bimm16: FI16_ins<0b00010, "b", IIAlu>, branch16;
+
 // Format: B offset MIPS16e
 // Purpose: Unconditional Branch
 // To do an unconditional PC-relative branch.
@@ -591,6 +630,10 @@ def Break16: FRRBreakNull16_ins<"break 0", NoItinerary>;
 // Purpose: Branch on T Equal to Zero (Extended)
 // To test special register T then do a PC-relative conditional branch.
 //
+def Bteqz16: FI816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
+  let Uses = [T8];
+}
+
 def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
   let Uses = [T8];
 }
@@ -614,6 +657,11 @@ def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">,
 // Purpose: Branch on T Not Equal to Zero (Extended)
 // To test special register T then do a PC-relative conditional branch.
 //
+
+def Btnez16: FI816_ins<0b001, "btnez", IIAlu>, cbranch16 {
+  let Uses = [T8];
+}
+
 def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16 {
   let Uses = [T8];
 }
@@ -665,7 +713,7 @@ def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> {
 // To divide 32-bit signed integers.
 //
 def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 
 //
@@ -674,7 +722,7 @@ def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
 // To divide 32-bit unsigned integers.
 //
 def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 //
 // Format: JAL target MIPS16e
@@ -684,10 +732,7 @@ def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
 //
 
 def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> {
-  let isBranch = 1;
   let hasDelaySlot = 0;  // not true, but we add the nop for now
-  let isTerminator=1;
-  let isBarrier=1;
   let isCall=1;
 }
 
@@ -771,6 +816,10 @@ def LiRxImm16: FRI16_ins<0b01101, "li", IIAlu>;
 //
 def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>;
 
+def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIAlu> {
+  let isCodeGenOnly = 1;
+}
+
 //
 // Format: LW ry, offset(rx) MIPS16e
 // Purpose: Load Word (Extended)
@@ -784,10 +833,13 @@ def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IILoad>, MayLoad{
 // Purpose: Load Word (SP-Relative, Extended)
 // To load an SP-relative word from memory as a signed value.
 //
-def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10110, "lw", IILoad>, MayLoad{
+def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10010, "lw", IILoad>, MayLoad{
   let Uses = [SP];
 }
 
+def LwRxPcTcp16: FRI16_TCP_ins<0b10110, "lw", IILoad>, MayLoad;
+
+def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", IILoad>, MayLoad;
 //
 // Format: MOVE r32, rz MIPS16e
 // Purpose: Move
@@ -808,7 +860,7 @@ def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>;
 // To copy the special purpose HI register to a GPR.
 //
 def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
-  let Uses = [HI];
+  let Uses = [HI0];
   let neverHasSideEffects = 1;
 }
 
@@ -818,7 +870,7 @@ def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
 // To copy the special purpose LO register to a GPR.
 //
 def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
-  let Uses = [LO];
+  let Uses = [LO0];
   let neverHasSideEffects = 1;
 }
 
@@ -828,13 +880,13 @@ def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
 def MultRxRy16:  FMULT16_ins<"mult",  IIAlu> {
   let isCommutable = 1;
   let neverHasSideEffects = 1;
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 
 def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
   let isCommutable = 1;
   let neverHasSideEffects = 1;
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 
 //
@@ -845,7 +897,7 @@ def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
 def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
   let isCommutable = 1;
   let neverHasSideEffects = 1;
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 
 //
@@ -856,7 +908,7 @@ def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
 def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> {
   let isCommutable = 1;
   let neverHasSideEffects = 1;
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 
 //
@@ -951,6 +1003,22 @@ def SbRxRyOffMemX16:
   FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, IIStore>, MayStore;
 
 //
+// Format: SEB rx MIPS16e
+// Purpose: Sign-Extend Byte
+// Sign-extend least significant byte in register rx.
+//
+def SebRx16
+  : FRR_SF16_ins<0b10001, 0b100, "seb", IIAlu>;
+
+//
+// Format: SEH rx MIPS16e
+// Purpose: Sign-Extend Halfword
+// Sign-extend least significant word in register rx.
+//
+def SehRx16
+  : FRR_SF16_ins<0b10001, 0b101, "seh", IIAlu>;
+
+//
 // The Sel(T) instructions are pseudos
 // T means that they use T8 implicitly.
 //
@@ -1075,7 +1143,7 @@ def ShRxRyOffMemX16:
 //
 // Format: SLL rx, ry, sa MIPS16e
 // Purpose: Shift Word Left Logical (Extended)
-// To execute a left-shift of a word by a fixed number of bits—0 to 31 bits.
+// To execute a left-shift of a word by a fixed number of bits-0 to 31 bits.
 //
 def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>;
 
@@ -1171,7 +1239,7 @@ def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>;
 // Format: SRA rx, ry, sa MIPS16e
 // Purpose: Shift Word Right Arithmetic (Extended)
 // To execute an arithmetic right-shift of a word by a fixed
-// number of bits—1 to 8 bits.
+// number of bits-1 to 8 bits.
 //
 def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>;
 
@@ -1189,7 +1257,7 @@ def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>;
 // Format: SRL rx, ry, sa MIPS16e
 // Purpose: Shift Word Right Logical (Extended)
 // To execute a logical right-shift of a word by a fixed
-// number of bits—1 to 31 bits.
+// number of bits-1 to 31 bits.
 //
 def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIAlu>;
 
@@ -1330,9 +1398,7 @@ def: Mips16Pat<(i32  addr16:$addr),
 
 
 // Large (>16 bit) immediate loads
-def : Mips16Pat<(i32 imm:$imm),
-                (OrRxRxRy16 (SllX16 (LiRxImmX16 (HI16 imm:$imm)), 16),
-                (LiRxImmX16 (LO16 imm:$imm)))>;
+def : Mips16Pat<(i32 imm:$imm), (LwConstant32 imm:$imm, -1)>;
 
 // Carry MipsPatterns
 def : Mips16Pat<(subc CPU16Regs:$lhs, CPU16Regs:$rhs),
@@ -1373,7 +1439,7 @@ def: Mips16Pat
 
 def: Mips16Pat
   <(brcond (i32 (seteq CPU16Regs:$rx, 0)), bb:$targ16),
-   (BeqzRxImmX16 CPU16Regs:$rx, bb:$targ16)
+   (BeqzRxImm16 CPU16Regs:$rx, bb:$targ16)
   >;
 
 //
@@ -1435,7 +1501,7 @@ def: Mips16Pat
 
 def: Mips16Pat
   <(brcond (i32 (setne CPU16Regs:$rx, 0)), bb:$targ16),
-   (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16)
+   (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
   >;
 
 //
@@ -1443,7 +1509,7 @@ def: Mips16Pat
 //
 def: Mips16Pat
   <(brcond CPU16Regs:$rx, bb:$targ16),
-   (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16)
+   (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
   >;
 
 //
@@ -1473,7 +1539,7 @@ def: Mips16Pat
 //   (BtnezT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
 //  >;
 
-def: UncondBranch16_pat<br, BimmX16>;
+def: UncondBranch16_pat<br, Bimm16>;
 
 // Small immediates
 def: Mips16Pat<(i32 immSExt16:$in),
@@ -1787,7 +1853,8 @@ def: Mips16Pat<(add CPU16Regs:$hi, (MipsLo tglobaladdr:$lo)),
                (AddiuRxRxImmX16 CPU16Regs:$hi, tglobaladdr:$lo)>;
 
 // hi/lo relocs
-
+def : Mips16Pat<(MipsHi tblockaddress:$in),
+                (SllX16 (LiRxImmX16 tblockaddress:$in), 16)>;
 def : Mips16Pat<(MipsHi tglobaladdr:$in),
                 (SllX16 (LiRxImmX16 tglobaladdr:$in), 16)>;
 def : Mips16Pat<(MipsHi tjumptable:$in),
@@ -1795,6 +1862,8 @@ def : Mips16Pat<(MipsHi tjumptable:$in),
 def : Mips16Pat<(MipsHi tglobaltlsaddr:$in),
                 (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
 
+def : Mips16Pat<(MipsLo tblockaddress:$in), (LiRxImmX16 tblockaddress:$in)>;
+
 // wrapper_pic
 class Wrapper16Pat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
   Mips16Pat<(MipsWrapper RC:$gp, node:$in),
@@ -1811,3 +1880,30 @@ def : Mips16Pat<(i32 (extloadi16  addr16:$src)),
 
 def: Mips16Pat<(trap), (Break16)>;
 
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i8),
+                (SebRx16 CPU16Regs:$val)>;
+
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i16),
+                (SehRx16 CPU16Regs:$val)>;
+
+def GotPrologue16:   
+  MipsPseudo16<
+    (outs CPU16Regs:$rh, CPU16Regs:$rl),
+    (ins simm16:$immHi, simm16:$immLo),
+    ".align 2\n\tli\t$rh, $immHi\n\taddiu\t$rl, $$pc, $immLo\n ",[]> ;
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+  // let PrintMethod = "printCPInstOperand";
+}
+
+// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+// the function.  The first operand is the ID# for this instruction, the second
+// is the index into the MachineConstantPool that this is, the third is the
+// size in bytes of this constant pool entry.
+//
+let neverHasSideEffects = 1, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+MipsPseudo16<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                      i32imm:$size), "foo", []>;
+
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index a752ab8..15ef654 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -15,9 +15,6 @@
 // Mips Operand, Complex Patterns and Transformations Definitions.
 //===----------------------------------------------------------------------===//
 
-// Instruction operand types
-def shamt_64       : Operand<i64>;
-
 // Unsigned Operand
 def uimm16_64      : Operand<i64> {
   let PrintMethod = "printUnsignedImm";
@@ -34,36 +31,21 @@ def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
 //===----------------------------------------------------------------------===//
 // Instructions specific format
 //===----------------------------------------------------------------------===//
-let DecoderNamespace = "Mips64" in {
-
-multiclass Atomic2Ops64<PatFrag Op> {
-  def NAME : Atomic2Ops<Op, GPR64, GPR32>, Requires<[NotN64, HasStdEnc]>;
-  def _P8  : Atomic2Ops<Op, GPR64, GPR64>, Requires<[IsN64, HasStdEnc]>;
-}
-
-multiclass AtomicCmpSwap64<PatFrag Op>  {
-  def NAME : AtomicCmpSwap<Op, GPR64, GPR32>,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : AtomicCmpSwap<Op, GPR64, GPR64>,
-             Requires<[IsN64, HasStdEnc]>;
-}
-}
-let usesCustomInserter = 1, Predicates = [HasStdEnc],
-  DecoderNamespace = "Mips64" in {
-  defm ATOMIC_LOAD_ADD_I64  : Atomic2Ops64<atomic_load_add_64>;
-  defm ATOMIC_LOAD_SUB_I64  : Atomic2Ops64<atomic_load_sub_64>;
-  defm ATOMIC_LOAD_AND_I64  : Atomic2Ops64<atomic_load_and_64>;
-  defm ATOMIC_LOAD_OR_I64   : Atomic2Ops64<atomic_load_or_64>;
-  defm ATOMIC_LOAD_XOR_I64  : Atomic2Ops64<atomic_load_xor_64>;
-  defm ATOMIC_LOAD_NAND_I64 : Atomic2Ops64<atomic_load_nand_64>;
-  defm ATOMIC_SWAP_I64      : Atomic2Ops64<atomic_swap_64>;
-  defm ATOMIC_CMP_SWAP_I64  : AtomicCmpSwap64<atomic_cmp_swap_64>;
+let usesCustomInserter = 1 in {
+  def ATOMIC_LOAD_ADD_I64  : Atomic2Ops<atomic_load_add_64, GPR64>;
+  def ATOMIC_LOAD_SUB_I64  : Atomic2Ops<atomic_load_sub_64, GPR64>;
+  def ATOMIC_LOAD_AND_I64  : Atomic2Ops<atomic_load_and_64, GPR64>;
+  def ATOMIC_LOAD_OR_I64   : Atomic2Ops<atomic_load_or_64, GPR64>;
+  def ATOMIC_LOAD_XOR_I64  : Atomic2Ops<atomic_load_xor_64, GPR64>;
+  def ATOMIC_LOAD_NAND_I64 : Atomic2Ops<atomic_load_nand_64, GPR64>;
+  def ATOMIC_SWAP_I64      : Atomic2Ops<atomic_swap_64, GPR64>;
+  def ATOMIC_CMP_SWAP_I64  : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>;
 }
 
 /// Pseudo instructions for loading and storing accumulator registers.
 let isPseudo = 1, isCodeGenOnly = 1 in {
-  defm LOAD_AC128  : LoadM<"", ACRegs128>;
-  defm STORE_AC128 : StoreM<"", ACRegs128>;
+  def LOAD_ACC128  : Load<"", ACC128>;
+  def STORE_ACC128 : Store<"", ACC128>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -110,103 +92,101 @@ def NOR64  : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
 }
 
 /// Shift Instructions
-def DSLL   : shift_rotate_imm<"dsll", shamt, GPR64Opnd, shl, immZExt6>,
+def DSLL   : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, shl, immZExt6>,
              SRA_FM<0x38, 0>;
-def DSRL   : shift_rotate_imm<"dsrl", shamt, GPR64Opnd, srl, immZExt6>,
+def DSRL   : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, srl, immZExt6>,
              SRA_FM<0x3a, 0>;
-def DSRA   : shift_rotate_imm<"dsra", shamt, GPR64Opnd, sra, immZExt6>,
+def DSRA   : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, sra, immZExt6>,
              SRA_FM<0x3b, 0>;
 def DSLLV  : shift_rotate_reg<"dsllv", GPR64Opnd, shl>, SRLV_FM<0x14, 0>;
 def DSRLV  : shift_rotate_reg<"dsrlv", GPR64Opnd, srl>, SRLV_FM<0x16, 0>;
 def DSRAV  : shift_rotate_reg<"dsrav", GPR64Opnd, sra>, SRLV_FM<0x17, 0>;
-def DSLL32 : shift_rotate_imm<"dsll32", shamt, GPR64Opnd>, SRA_FM<0x3c, 0>;
-def DSRL32 : shift_rotate_imm<"dsrl32", shamt, GPR64Opnd>, SRA_FM<0x3e, 0>;
-def DSRA32 : shift_rotate_imm<"dsra32", shamt, GPR64Opnd>, SRA_FM<0x3f, 0>;
+def DSLL32 : shift_rotate_imm<"dsll32", uimm5, GPR64Opnd>, SRA_FM<0x3c, 0>;
+def DSRL32 : shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd>, SRA_FM<0x3e, 0>;
+def DSRA32 : shift_rotate_imm<"dsra32", uimm5, GPR64Opnd>, SRA_FM<0x3f, 0>;
 
 // Rotate Instructions
 let Predicates = [HasMips64r2, HasStdEnc] in {
-  def DROTR  : shift_rotate_imm<"drotr", shamt, GPR64Opnd, rotr, immZExt6>,
+  def DROTR  : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, rotr, immZExt6>,
                SRA_FM<0x3a, 1>;
   def DROTRV : shift_rotate_reg<"drotrv", GPR64Opnd, rotr>,
                SRLV_FM<0x16, 1>;
+  def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd>, SRA_FM<0x3e, 1>;
 }
 
 /// Load and Store Instructions
 ///  aligned
 let isCodeGenOnly = 1 in {
-defm LB64  : LoadM<"lb", GPR64Opnd, sextloadi8, IILoad>, LW_FM<0x20>;
-defm LBu64 : LoadM<"lbu", GPR64Opnd, zextloadi8, IILoad>, LW_FM<0x24>;
-defm LH64  : LoadM<"lh", GPR64Opnd, sextloadi16, IILoad>, LW_FM<0x21>;
-defm LHu64 : LoadM<"lhu", GPR64Opnd, zextloadi16, IILoad>, LW_FM<0x25>;
-defm LW64  : LoadM<"lw", GPR64Opnd, sextloadi32, IILoad>, LW_FM<0x23>;
-defm SB64  : StoreM<"sb", GPR64Opnd, truncstorei8, IIStore>, LW_FM<0x28>;
-defm SH64  : StoreM<"sh", GPR64Opnd, truncstorei16, IIStore>, LW_FM<0x29>;
-defm SW64  : StoreM<"sw", GPR64Opnd, truncstorei32, IIStore>, LW_FM<0x2b>;
+def LB64  : Load<"lb", GPR64Opnd, sextloadi8, IILoad>, LW_FM<0x20>;
+def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, IILoad>, LW_FM<0x24>;
+def LH64  : Load<"lh", GPR64Opnd, sextloadi16, IILoad>, LW_FM<0x21>;
+def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, IILoad>, LW_FM<0x25>;
+def LW64  : Load<"lw", GPR64Opnd, sextloadi32, IILoad>, LW_FM<0x23>;
+def SB64  : Store<"sb", GPR64Opnd, truncstorei8, IIStore>, LW_FM<0x28>;
+def SH64  : Store<"sh", GPR64Opnd, truncstorei16, IIStore>, LW_FM<0x29>;
+def SW64  : Store<"sw", GPR64Opnd, truncstorei32, IIStore>, LW_FM<0x2b>;
 }
 
-defm LWu   : LoadM<"lwu", GPR64Opnd, zextloadi32, IILoad>, LW_FM<0x27>;
-defm LD    : LoadM<"ld", GPR64Opnd, load, IILoad>, LW_FM<0x37>;
-defm SD    : StoreM<"sd", GPR64Opnd, store, IIStore>, LW_FM<0x3f>;
+def LWu   : Load<"lwu", GPR64Opnd, zextloadi32, IILoad>, LW_FM<0x27>;
+def LD    : Load<"ld", GPR64Opnd, load, IILoad>, LW_FM<0x37>;
+def SD    : Store<"sd", GPR64Opnd, store, IIStore>, LW_FM<0x3f>;
 
 /// load/store left/right
 let isCodeGenOnly = 1 in {
-defm LWL64 : LoadLeftRightM<"lwl", MipsLWL, GPR64Opnd>, LW_FM<0x22>;
-defm LWR64 : LoadLeftRightM<"lwr", MipsLWR, GPR64Opnd>, LW_FM<0x26>;
-defm SWL64 : StoreLeftRightM<"swl", MipsSWL, GPR64Opnd>, LW_FM<0x2a>;
-defm SWR64 : StoreLeftRightM<"swr", MipsSWR, GPR64Opnd>, LW_FM<0x2e>;
+def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, IILoad>, LW_FM<0x22>;
+def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, IILoad>, LW_FM<0x26>;
+def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, IIStore>, LW_FM<0x2a>;
+def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, IIStore>, LW_FM<0x2e>;
 }
 
-defm LDL   : LoadLeftRightM<"ldl", MipsLDL, GPR64Opnd>, LW_FM<0x1a>;
-defm LDR   : LoadLeftRightM<"ldr", MipsLDR, GPR64Opnd>, LW_FM<0x1b>;
-defm SDL   : StoreLeftRightM<"sdl", MipsSDL, GPR64Opnd>, LW_FM<0x2c>;
-defm SDR   : StoreLeftRightM<"sdr", MipsSDR, GPR64Opnd>, LW_FM<0x2d>;
+def LDL   : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, IILoad>, LW_FM<0x1a>;
+def LDR   : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, IILoad>, LW_FM<0x1b>;
+def SDL   : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, IIStore>, LW_FM<0x2c>;
+def SDR   : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, IIStore>, LW_FM<0x2d>;
 
 /// Load-linked, Store-conditional
-let Predicates = [NotN64, HasStdEnc] in {
-  def LLD : LLBase<"lld", GPR64Opnd, mem>, LW_FM<0x34>;
-  def SCD : SCBase<"scd", GPR64Opnd, mem>, LW_FM<0x3c>;
-}
-
-let Predicates = [IsN64, HasStdEnc], isCodeGenOnly = 1 in {
-  def LLD_P8 : LLBase<"lld", GPR64Opnd, mem64>, LW_FM<0x34>;
-  def SCD_P8 : SCBase<"scd", GPR64Opnd, mem64>, LW_FM<0x3c>;
-}
+def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>;
+def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>;
 
 /// Jump and Branch Instructions
 let isCodeGenOnly = 1 in {
-def JR64   : IndirectBranch<GPR64Opnd>, MTLO_FM<8>;
-def BEQ64  : CBranch<"beq", seteq, GPR64Opnd>, BEQ_FM<4>;
-def BNE64  : CBranch<"bne", setne, GPR64Opnd>, BEQ_FM<5>;
-def BGEZ64 : CBranchZero<"bgez", setge, GPR64Opnd>, BGEZ_FM<1, 1>;
-def BGTZ64 : CBranchZero<"bgtz", setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
-def BLEZ64 : CBranchZero<"blez", setle, GPR64Opnd>, BGEZ_FM<6, 0>;
-def BLTZ64 : CBranchZero<"bltz", setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
+def JR64   : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>;
+def BEQ64  : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
+def BNE64  : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
+def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
+def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
+def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
+def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
 def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
 def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
-def TAILCALL64_R : JumpFR<GPR64Opnd, MipsTailCall>, MTLO_FM<8>, IsTailCall;
+def TAILCALL64_R : JumpFR<"tcallr", GPR64Opnd, MipsTailCall>,
+                   MTLO_FM<8>, IsTailCall;
 }
 
 /// Multiply and Divide Instructions.
-def DMULT  : Mult<"dmult", IIImult, GPR64Opnd, [HI64, LO64]>,
+def DMULT  : Mult<"dmult", IIImult, GPR64Opnd, [HI0_64, LO0_64]>,
              MULT_FM<0, 0x1c>;
-def DMULTu : Mult<"dmultu", IIImult, GPR64Opnd, [HI64, LO64]>,
+def DMULTu : Mult<"dmultu", IIImult, GPR64Opnd, [HI0_64, LO0_64]>,
              MULT_FM<0, 0x1d>;
-def PseudoDMULT  : MultDivPseudo<DMULT, ACRegs128, GPR64Opnd, MipsMult,
+def PseudoDMULT  : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult,
                                  IIImult>;
-def PseudoDMULTu : MultDivPseudo<DMULTu, ACRegs128, GPR64Opnd, MipsMultu,
+def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu,
                                  IIImult>;
-def DSDIV : Div<"ddiv", IIIdiv, GPR64Opnd, [HI64, LO64]>, MULT_FM<0, 0x1e>;
-def DUDIV : Div<"ddivu", IIIdiv, GPR64Opnd, [HI64, LO64]>, MULT_FM<0, 0x1f>;
-def PseudoDSDIV : MultDivPseudo<DSDIV, ACRegs128, GPR64Opnd, MipsDivRem,
+def DSDIV : Div<"ddiv", IIIdiv, GPR64Opnd, [HI0_64, LO0_64]>, MULT_FM<0, 0x1e>;
+def DUDIV : Div<"ddivu", IIIdiv, GPR64Opnd, [HI0_64, LO0_64]>, MULT_FM<0, 0x1f>;
+def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem,
                                 IIIdiv, 0, 1, 1>;
-def PseudoDUDIV : MultDivPseudo<DUDIV, ACRegs128, GPR64Opnd, MipsDivRemU,
+def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU,
                                 IIIdiv, 0, 1, 1>;
 
 let isCodeGenOnly = 1 in {
-def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI64]>, MTLO_FM<0x11>;
-def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO64]>, MTLO_FM<0x13>;
-def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, [HI64]>, MFLO_FM<0x10>;
-def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, [LO64]>, MFLO_FM<0x12>;
+def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>;
+def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>;
+def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>;
+def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>;
+def PseudoMFHI64 : PseudoMFLOHI<GPR64, ACC128, MipsMFHI>;
+def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>;
+def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>;
 
 /// Sign Ext In Register Instructions.
 def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd>, SEB_FM<0x10, 0x20>;
@@ -221,21 +201,18 @@ def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>;
 def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>;
 def DSHD : SubwordSwap<"dshd", GPR64Opnd>, SEB_FM<5, 0x24>;
 
-def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd, mem_ea_64>, LW_FM<0x19>;
+def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
 
 let isCodeGenOnly = 1 in
-def RDHWR64 : ReadHardware<GPR64Opnd, HW64RegsOpnd>, RDHWR_FM;
+def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM;
 
-def DEXT : ExtBase<"dext", GPR64Opnd>, EXT_FM<3>;
-let Pattern = []<dag> in {
-  def DEXTU : ExtBase<"dextu", GPR64Opnd>, EXT_FM<2>;
-  def DEXTM : ExtBase<"dextm", GPR64Opnd>, EXT_FM<1>;
-}
-def DINS : InsBase<"dins", GPR64Opnd>, EXT_FM<7>;
-let Pattern = []<dag> in {
-  def DINSU : InsBase<"dinsu", GPR64Opnd>, EXT_FM<6>;
-  def DINSM : InsBase<"dinsm", GPR64Opnd>, EXT_FM<5>;
-}
+def DEXT : ExtBase<"dext", GPR64Opnd, uimm6, MipsExt>, EXT_FM<3>;
+def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm6>, EXT_FM<2>;
+def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5>, EXT_FM<1>;
+
+def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>;
+def DINSU : InsBase<"dinsu", GPR64Opnd, uimm6>, EXT_FM<6>;
+def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>;
 
 let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
   def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
@@ -251,18 +228,12 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
 //===----------------------------------------------------------------------===//
 
 // extended loads
-let Predicates = [NotN64, HasStdEnc] in {
+let Predicates = [HasStdEnc] in {
   def : MipsPat<(i64 (extloadi1  addr:$src)), (LB64 addr:$src)>;
   def : MipsPat<(i64 (extloadi8  addr:$src)), (LB64 addr:$src)>;
   def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
   def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
 }
-let Predicates = [IsN64, HasStdEnc] in {
-  def : MipsPat<(i64 (extloadi1  addr:$src)), (LB64_P8 addr:$src)>;
-  def : MipsPat<(i64 (extloadi8  addr:$src)), (LB64_P8 addr:$src)>;
-  def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64_P8 addr:$src)>;
-  def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64_P8 addr:$src)>;
-}
 
 // hi/lo relocs
 def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
@@ -316,7 +287,7 @@ defm : SetgeImmPats<GPR64, SLTi64, SLTiu64>;
 // truncate
 def : MipsPat<(i32 (trunc GPR64:$src)),
               (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>,
-      Requires<[IsN64, HasStdEnc]>;
+      Requires<[HasStdEnc]>;
 
 // 32-to-64-bit extension
 def : MipsPat<(i64 (anyext GPR32:$src)), (SLL64_32 GPR32:$src)>;
@@ -330,10 +301,6 @@ def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
 // bswap MipsPattern
 def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
 
-// mflo/hi patterns.
-def : MipsPat<(i64 (ExtractLOHI ACRegs128:$ac, imm:$lohi_idx)),
-              (EXTRACT_SUBREG ACRegs128:$ac, imm:$lohi_idx)>;
-
 //===----------------------------------------------------------------------===//
 // Instruction aliases
 //===----------------------------------------------------------------------===//
@@ -348,28 +315,16 @@ def : InstAlias<"dadd $rs, $rt, $imm",
                 0>;
 
 /// Move between CPU and coprocessor registers
-let DecoderNamespace = "Mips64" in {
-def DMFC0_3OP64 : MFC3OP<(outs GPR64Opnd:$rt),
-                         (ins GPR64Opnd:$rd, uimm16:$sel),
-                         "dmfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 1>;
-def DMTC0_3OP64 : MFC3OP<(outs GPR64Opnd:$rd, uimm16:$sel),
-                         (ins GPR64Opnd:$rt),
-                         "dmtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 5>;
-def DMFC2_3OP64 : MFC3OP<(outs GPR64Opnd:$rt),
-                         (ins GPR64Opnd:$rd, uimm16:$sel),
-                         "dmfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 1>;
-def DMTC2_3OP64 : MFC3OP<(outs GPR64Opnd:$rd, uimm16:$sel),
-                         (ins GPR64Opnd:$rt),
-                         "dmtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 5>;
+let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
+def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>;
+def DMTC0 : MFC3OP<"dmtc0", GPR64Opnd>, MFC3OP_FM<0x10, 5>;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd>, MFC3OP_FM<0x12, 1>;
+def DMTC2 : MFC3OP<"dmtc2", GPR64Opnd>, MFC3OP_FM<0x12, 5>;
 }
 
 // Two operand (implicit 0 selector) versions:
-def : InstAlias<"dmfc0 $rt, $rd",
-                (DMFC0_3OP64 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
-def : InstAlias<"dmtc0 $rt, $rd",
-                (DMTC0_3OP64 GPR64Opnd:$rd, 0, GPR64Opnd:$rt), 0>;
-def : InstAlias<"dmfc2 $rt, $rd",
-                (DMFC2_3OP64 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
-def : InstAlias<"dmtc2 $rt, $rd",
-                (DMTC2_3OP64 GPR64Opnd:$rd, 0, GPR64Opnd:$rt), 0>;
+def : InstAlias<"dmfc0 $rt, $rd", (DMFC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : InstAlias<"dmtc0 $rt, $rd", (DMTC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : InstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : InstAlias<"dmtc2 $rt, $rd", (DMTC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
 
diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.h b/lib/Target/Mips/MipsAnalyzeImmediate.h
index a094dda..cc09034 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.h
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.h
@@ -22,7 +22,7 @@ namespace llvm {
     };
     typedef SmallVector<Inst, 7 > InstSeq;
 
-    /// Analyze - Get an instrucion sequence to load immediate Imm. The last
+    /// Analyze - Get an instruction sequence to load immediate Imm. The last
     /// instruction in the sequence must be an ADDiu if LastInstrIsADDiu is
     /// true;
     const InstSeq &Analyze(uint64_t Imm, unsigned Size, bool LastInstrIsADDiu);
@@ -32,19 +32,19 @@ namespace llvm {
     /// AddInstr - Add I to all instruction sequences in SeqLs.
     void AddInstr(InstSeqLs &SeqLs, const Inst &I);
 
-    /// GetInstSeqLsADDiu - Get instrucion sequences which end with an ADDiu to
+    /// GetInstSeqLsADDiu - Get instruction sequences which end with an ADDiu to
     /// load immediate Imm
     void GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
 
-    /// GetInstSeqLsORi - Get instrucion sequences which end with an ORi to
+    /// GetInstSeqLsORi - Get instrutcion sequences which end with an ORi to
     /// load immediate Imm
     void GetInstSeqLsORi(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
 
-    /// GetInstSeqLsSLL - Get instrucion sequences which end with a SLL to
+    /// GetInstSeqLsSLL - Get instruction sequences which end with a SLL to
     /// load immediate Imm
     void GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
 
-    /// GetInstSeqLs - Get instrucion sequences to load immediate Imm.
+    /// GetInstSeqLs - Get instruction sequences to load immediate Imm.
     void GetInstSeqLs(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
 
     /// ReplaceADDiuSLLWithLUi - Replace an ADDiu & SLL pair with a LUi.
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 1dc3326..45c4398 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -15,11 +15,11 @@
 #define DEBUG_TYPE "mips-asm-printer"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
-#include "MCTargetDesc/MipsELFStreamer.h"
 #include "Mips.h"
 #include "MipsAsmPrinter.h"
 #include "MipsInstrInfo.h"
 #include "MipsMCInstLower.h"
+#include "MipsTargetStreamer.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"
@@ -33,8 +33,8 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -45,12 +45,17 @@
 
 using namespace llvm;
 
+MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() {
+  return static_cast<MipsTargetStreamer &>(OutStreamer.getTargetStreamer());
+}
+
 bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   // Initialize TargetLoweringObjectFile.
   if (Subtarget->allowMixed16_32())
     const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
       .Initialize(OutContext, TM);
   MipsFI = MF.getInfo<MipsFunctionInfo>();
+  MCP = MF.getConstantPool();
   AsmPrinter::runOnMachineFunction(MF);
   return true;
 }
@@ -71,6 +76,39 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
 
+  // If we just ended a constant pool, mark it as such.
+  if (InConstantPool && MI->getOpcode() != Mips::CONSTPOOL_ENTRY) {
+    OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+    InConstantPool = false;
+  }
+  if (MI->getOpcode() == Mips::CONSTPOOL_ENTRY) {
+    // CONSTPOOL_ENTRY - This instruction represents a floating
+    //constant pool in the function.  The first operand is the ID#
+    // for this instruction, the second is the index into the
+    // MachineConstantPool that this is, the third is the size in
+    // bytes of this constant pool entry.
+    // The required alignment is specified on the basic block holding this MI.
+    //
+    unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+
+    // If this is the first entry of the pool, mark it.
+    if (!InConstantPool) {
+      OutStreamer.EmitDataRegion(MCDR_DataRegion);
+      InConstantPool = true;
+    }
+
+    OutStreamer.EmitLabel(GetCPISymbol(LabelId));
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else
+      EmitGlobalConstant(MCPE.Val.ConstVal);
+    return;
+  }
+
+
   MachineBasicBlock::const_instr_iterator I = MI;
   MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
 
@@ -238,16 +276,15 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() {
   }
 
   if (Subtarget->inMicroMipsMode())
-    if (MipsELFStreamer *MES = dyn_cast<MipsELFStreamer>(&OutStreamer))
-      MES->emitMipsSTOCG(*Subtarget, CurrentFnSym,
-      (unsigned)ELF::STO_MIPS_MICROMIPS);
+    getTargetStreamer().emitMipsHackSTOCG(CurrentFnSym,
+                                          (unsigned)ELF::STO_MIPS_MICROMIPS);
   OutStreamer.EmitLabel(CurrentFnSym);
 }
 
 /// EmitFunctionBodyStart - Targets can override this to emit stuff before
 /// the first basic block in the function.
 void MipsAsmPrinter::EmitFunctionBodyStart() {
-  MCInstLowering.Initialize(Mang, &MF->getContext());
+  MCInstLowering.Initialize(&MF->getContext());
 
   bool IsNakedFunction =
     MF->getFunction()->
@@ -284,6 +321,12 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() {
     }
     OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName()));
   }
+  // Make sure to terminate any constant pools that were at the end
+  // of the function.
+  if (!InConstantPool)
+    return;
+  InConstantPool = false;
+  OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
 }
 
 /// isBlockOnlyReachableByFallthough - Return true if the basic block has
@@ -418,6 +461,11 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
         return false;
       }
     }
+    case 'w':
+      // Print MSA registers for the 'f' constraint
+      // In LLVM, the 'w' modifier doesn't need to do anything.
+      // We can just call printOperand as normal.
+      break;
     }
   }
 
@@ -485,7 +533,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
       return;
 
     case MachineOperand::MO_GlobalAddress:
-      O << *Mang->getSymbol(MO.getGlobal());
+      O << *getSymbol(MO.getGlobal());
       break;
 
     case MachineOperand::MO_BlockAddress: {
@@ -526,6 +574,15 @@ void MipsAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum,
     printOperand(MI, opNum, O);
 }
 
+void MipsAsmPrinter::printUnsignedImm8(const MachineInstr *MI, int opNum,
+                                       raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (unsigned short int)(unsigned char)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
 void MipsAsmPrinter::
 printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O) {
   // Load/Store memory operands -- imm($reg)
@@ -587,15 +644,54 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
 
 }
 
-void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
+static void emitELFHeaderFlagsCG(MipsTargetStreamer &TargetStreamer,
+                                 const MipsSubtarget &Subtarget) {
+  // Update e_header flags
+  unsigned EFlags = 0;
 
-  if (OutStreamer.hasRawTextSupport()) return;
+  // TODO: Need to add -mabicalls and -mno-abicalls flags.
+  // Currently we assume that -mabicalls is the default.
+  EFlags |= ELF::EF_MIPS_CPIC;
 
+  if (Subtarget.inMips16Mode())
+    EFlags |= ELF::EF_MIPS_ARCH_ASE_M16;
+  else
+    EFlags |= ELF::EF_MIPS_NOREORDER;
+
+  // Architecture
+  if (Subtarget.hasMips64r2())
+    EFlags |= ELF::EF_MIPS_ARCH_64R2;
+  else if (Subtarget.hasMips64())
+    EFlags |= ELF::EF_MIPS_ARCH_64;
+  else if (Subtarget.hasMips32r2())
+    EFlags |= ELF::EF_MIPS_ARCH_32R2;
+  else
+    EFlags |= ELF::EF_MIPS_ARCH_32;
+
+  if (Subtarget.inMicroMipsMode())
+    EFlags |= ELF::EF_MIPS_MICROMIPS;
+
+  // ABI
+  if (Subtarget.isABI_O32())
+    EFlags |= ELF::EF_MIPS_ABI_O32;
+
+  // Relocation Model
+  Reloc::Model RM = Subtarget.getRelocationModel();
+  if (RM == Reloc::PIC_ || RM == Reloc::Default)
+    EFlags |= ELF::EF_MIPS_PIC;
+  else if (RM == Reloc::Static)
+    ; // Do nothing for Reloc::Static
+  else
+    llvm_unreachable("Unsupported relocation model for e_flags");
+
+  TargetStreamer.emitMipsHackELFFlags(EFlags);
+}
+
+void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
   // Emit Mips ELF register info
   Subtarget->getMReginfo().emitMipsReginfoSectionCG(
              OutStreamer, getObjFileLowering(), *Subtarget);
-  if (MipsELFStreamer *MES = dyn_cast<MipsELFStreamer>(&OutStreamer))
-    MES->emitELFHeaderFlagsCG(*Subtarget);
+  emitELFHeaderFlagsCG(getTargetStreamer(), *Subtarget);
 }
 
 void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index 4d1d624..11c6acd 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -25,10 +25,12 @@ namespace llvm {
 class MCStreamer;
 class MachineInstr;
 class MachineBasicBlock;
+class MipsTargetStreamer;
 class Module;
 class raw_ostream;
 
 class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
+  MipsTargetStreamer &getTargetStreamer();
 
   void EmitInstrWithMacroNoAT(const MachineInstr *MI);
 
@@ -40,6 +42,16 @@ private:
   // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
   bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
 
+  /// MCP - Keep a pointer to constantpool entries of the current
+  /// MachineFunction.
+  const MachineConstantPool *MCP;
+
+  /// InConstantPool - Maintain state when emitting a sequence of constant
+  /// pool entries so we can properly mark them as data regions.
+  bool InConstantPool;
+
+  bool UsingConstantPools;
+
 public:
 
   const MipsSubtarget *Subtarget;
@@ -47,8 +59,11 @@ public:
   MipsMCInstLower MCInstLowering;
 
   explicit MipsAsmPrinter(TargetMachine &TM,  MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer), MCInstLowering(*this) {
+    : AsmPrinter(TM, Streamer), MCP(0), InConstantPool(false),
+      MCInstLowering(*this) {
     Subtarget = &TM.getSubtarget<MipsSubtarget>();
+    UsingConstantPools =
+      (Subtarget->inMips16Mode() && Subtarget->useConstantIslands());
   }
 
   virtual const char *getPassName() const {
@@ -57,6 +72,12 @@ public:
 
   virtual bool runOnMachineFunction(MachineFunction &MF);
 
+  virtual void EmitConstantPool() LLVM_OVERRIDE {
+    if (!UsingConstantPools)
+      AsmPrinter::EmitConstantPool();
+    // we emit constant pools customly!
+  }
+
   void EmitInstruction(const MachineInstr *MI);
   void printSavedRegsBitmask(raw_ostream &O);
   void printHex32(unsigned int Value, raw_ostream &O);
@@ -75,6 +96,7 @@ public:
                              raw_ostream &O);
   void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printUnsignedImm8(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index ac40b11..66391cb 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -26,8 +26,10 @@ def RetCC_MipsO32 : CallingConv<[
   // f32 are returned in registers F0, F2
   CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
 
-  // f64 are returned in register D0, D1
-  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0, D1]>>>
+  // f64 arguments are returned in D0_64 and D1_64 in FP64bit mode or
+  // in D0 and D1 in FP32bit mode.
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCAssignToReg<[D0_64, D1_64]>>>,
+  CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()", CCAssignToReg<[D0, D1]>>>
 ]>;
 
 //===----------------------------------------------------------------------===//
@@ -149,7 +151,16 @@ def RetCC_MipsEABI : CallingConv<[
 //===----------------------------------------------------------------------===//
 def CC_MipsO32_FastCC : CallingConv<[
   // f64 arguments are passed in double-precision floating pointer registers.
-  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7, D8, D9]>>,
+  CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()",
+                                CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7,
+                                               D8, D9]>>>,
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()",
+                                CCAssignToReg<[D0_64, D1_64, D2_64, D3_64,
+                                               D4_64, D5_64, D6_64, D7_64,
+                                               D8_64, D9_64, D10_64, D11_64,
+                                               D12_64, D13_64, D14_64, D15_64,
+                                               D16_64, D17_64, D18_64,
+                                               D19_64]>>>,
 
   // Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned.
   CCIfType<[f64], CCAssignToStack<8, 8>>
@@ -224,6 +235,9 @@ def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP,
 def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
                                    (sequence "S%u", 7, 0))>;
 
+def CSR_O32_FP64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 20), RA, FP,
+                                        (sequence "S%u", 7, 0))>;
+
 def CSR_N32 : CalleeSavedRegs<(add D31_64, D29_64, D27_64, D25_64, D24_64,
                                    D23_64, D22_64, D21_64, RA_64, FP_64, GP_64,
                                    (sequence "S%u_64", 7, 0))>;
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index 813037e..ca4163d 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -105,11 +105,16 @@ private:
                          const MachineOperand &MO) const;
 
   unsigned getJumpTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getJumpTargetOpValueMM(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getBranchTargetOpValueMM(const MachineInstr &MI,
+                                    unsigned OpNo) const;
 
   unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
   unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getMemEncodingMMImm12(const MachineInstr &MI, unsigned OpNo) const;
   unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const;
   unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const;
 
   void emitGlobalAddressUnaligned(const GlobalValue *GV, unsigned Reloc,
                                   int Offset) const;
@@ -186,6 +191,18 @@ unsigned MipsCodeEmitter::getJumpTargetOpValue(const MachineInstr &MI,
   return 0;
 }
 
+unsigned MipsCodeEmitter::getJumpTargetOpValueMM(const MachineInstr &MI,
+                                                 unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getBranchTargetOpValueMM(const MachineInstr &MI,
+                                                   unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
 unsigned MipsCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI,
                                                  unsigned OpNo) const {
   MachineOperand MO = MI.getOperand(OpNo);
@@ -201,6 +218,12 @@ unsigned MipsCodeEmitter::getMemEncoding(const MachineInstr &MI,
   return (getMachineOpValue(MI, MI.getOperand(OpNo+1)) & 0xFFFF) | RegBits;
 }
 
+unsigned MipsCodeEmitter::getMemEncodingMMImm12(const MachineInstr &MI,
+                                                unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
 unsigned MipsCodeEmitter::getSizeExtEncoding(const MachineInstr &MI,
                                              unsigned OpNo) const {
   // size is encoded as size-1.
@@ -214,6 +237,12 @@ unsigned MipsCodeEmitter::getSizeInsEncoding(const MachineInstr &MI,
          getMachineOpValue(MI, MI.getOperand(OpNo)) - 1;
 }
 
+unsigned MipsCodeEmitter::getLSAImmEncoding(const MachineInstr &MI,
+                                            unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
 /// getMachineOpValue - Return binary encoding of operand. If the machine
 /// operand requires relocation, record the relocation and return zero.
 unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
@@ -316,6 +345,14 @@ bool MipsCodeEmitter::expandPseudos(MachineBasicBlock::instr_iterator &MI,
     BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::SLL), Mips::ZERO)
       .addReg(Mips::ZERO).addImm(0);
     break;
+  case Mips::B:
+    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::BEQ)).addReg(Mips::ZERO)
+      .addReg(Mips::ZERO).addOperand(MI->getOperand(0));
+    break;
+  case Mips::TRAP:
+    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::BREAK)).addImm(0)
+      .addImm(0);
+    break;
   case Mips::JALRPseudo:
     BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::JALR), Mips::RA)
       .addReg(MI->getOperand(0).getReg());
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index 39862b3..2de1430 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -19,7 +19,7 @@
 class CMov_I_I_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
                   InstrItinClass Itin> :
   InstSE<(outs DRC:$rd), (ins DRC:$rs, CRC:$rt, DRC:$F),
-         !strconcat(opstr, "\t$rd, $rs, $rt"), [], Itin, FrmFR> {
+         !strconcat(opstr, "\t$rd, $rs, $rt"), [], Itin, FrmFR, opstr> {
   let Constraints = "$F = $rd";
 }
 
@@ -37,7 +37,7 @@ class CMov_F_I_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
   InstSE<(outs RC:$rd), (ins RC:$rs, FCCRegsOpnd:$fcc, RC:$F),
          !strconcat(opstr, "\t$rd, $rs, $fcc"),
          [(set RC:$rd, (OpNode RC:$rs, FCCRegsOpnd:$fcc, RC:$F))],
-         Itin, FrmFR> {
+         Itin, FrmFR, opstr> {
   let Constraints = "$F = $rd";
 }
 
@@ -103,91 +103,94 @@ multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
 }
 
 // Instantiation of instructions.
-def MOVZ_I_I : CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, NoItinerary>,
+def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, IIArith>,
                ADD_FM<0, 0xa>;
 
 let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
-  def MOVZ_I_I64   : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd,
-                                  NoItinerary>, ADD_FM<0, 0xa>;
-  def MOVZ_I64_I   : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd,
-                                  NoItinerary>, ADD_FM<0, 0xa>;
-  def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd,
-                                  NoItinerary>, ADD_FM<0, 0xa>;
+  def MOVZ_I_I64   : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, IIArith>,
+                     ADD_FM<0, 0xa>;
+  def MOVZ_I64_I   : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, IIArith>,
+                     ADD_FM<0, 0xa>;
+  def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, IIArith>,
+                     ADD_FM<0, 0xa>;
 }
 
-def MOVN_I_I       : CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
-                                  NoItinerary>, ADD_FM<0, 0xb>;
+def MOVN_I_I       : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, IIArith>,
+                     ADD_FM<0, 0xb>;
 
 let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
-  def MOVN_I_I64   : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd,
-                                  NoItinerary>, ADD_FM<0, 0xb>;
-  def MOVN_I64_I   : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd,
-                                  NoItinerary>, ADD_FM<0, 0xb>;
-  def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd,
-                                  NoItinerary>, ADD_FM<0, 0xb>;
+  def MOVN_I_I64   : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, IIArith>,
+                     ADD_FM<0, 0xb>;
+  def MOVN_I64_I   : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, IIArith>,
+                     ADD_FM<0, 0xb>;
+  def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, IIArith>,
+                     ADD_FM<0, 0xb>;
 }
 
-def MOVZ_I_S : CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32RegsOpnd, IIFmove>,
+def MOVZ_I_S : CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, IIFmove>,
                CMov_I_F_FM<18, 16>;
 
 let isCodeGenOnly = 1 in
-def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32RegsOpnd, IIFmove>,
+def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, IIFmove>,
                  CMov_I_F_FM<18, 16>, Requires<[HasMips64, HasStdEnc]>;
 
-def MOVN_I_S : CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32RegsOpnd, IIFmove>,
+def MOVN_I_S : CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, IIFmove>,
                CMov_I_F_FM<19, 16>;
 
 let isCodeGenOnly = 1 in
-def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32RegsOpnd, IIFmove>,
+def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, IIFmove>,
                  CMov_I_F_FM<19, 16>, Requires<[HasMips64, HasStdEnc]>;
 
 let Predicates = [NotFP64bit, HasStdEnc] in {
-  def MOVZ_I_D32 : CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64RegsOpnd, IIFmove>,
+  def MOVZ_I_D32 : CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd, IIFmove>,
                    CMov_I_F_FM<18, 17>;
-  def MOVN_I_D32 : CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64RegsOpnd, IIFmove>,
+  def MOVN_I_D32 : CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd, IIFmove>,
                    CMov_I_F_FM<19, 17>;
 }
 
-let Predicates = [IsFP64bit, HasStdEnc], isCodeGenOnly = 1 in {
-  def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64RegsOpnd, IIFmove>,
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+  def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, IIFmove>,
                    CMov_I_F_FM<18, 17>;
-  def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64RegsOpnd,
-                                  IIFmove>, CMov_I_F_FM<18, 17>;
-  def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64RegsOpnd, IIFmove>,
+  def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, IIFmove>,
                    CMov_I_F_FM<19, 17>;
-  def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64RegsOpnd,
-                                  IIFmove>, CMov_I_F_FM<19, 17>;
+  let isCodeGenOnly = 1 in {
+    def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd,
+                                   IIFmove>, CMov_I_F_FM<18, 17>;
+    def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd,
+                                   IIFmove>, CMov_I_F_FM<19, 17>;
+  }
 }
 
-def MOVT_I : CMov_F_I_FT<"movt", GPR32Opnd, IIArith, MipsCMovFP_T>,
+def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, IIArith, MipsCMovFP_T>,
              CMov_F_I_FM<1>;
 
 let isCodeGenOnly = 1 in
 def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, IIArith, MipsCMovFP_T>,
                CMov_F_I_FM<1>, Requires<[HasMips64, HasStdEnc]>;
 
-def MOVF_I : CMov_F_I_FT<"movf", GPR32Opnd, IIArith, MipsCMovFP_F>,
+def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, IIArith, MipsCMovFP_F>,
              CMov_F_I_FM<0>;
 
 let isCodeGenOnly = 1 in
 def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, IIArith, MipsCMovFP_F>,
                CMov_F_I_FM<0>, Requires<[HasMips64, HasStdEnc]>;
 
-def MOVT_S : CMov_F_F_FT<"movt.s", FGR32RegsOpnd, IIFmove, MipsCMovFP_T>,
+def MOVT_S : CMov_F_F_FT<"movt.s", FGR32Opnd, IIFmove, MipsCMovFP_T>,
              CMov_F_F_FM<16, 1>;
-def MOVF_S : CMov_F_F_FT<"movf.s", FGR32RegsOpnd, IIFmove, MipsCMovFP_F>,
+def MOVF_S : CMov_F_F_FT<"movf.s", FGR32Opnd, IIFmove, MipsCMovFP_F>,
              CMov_F_F_FM<16, 0>;
 
 let Predicates = [NotFP64bit, HasStdEnc] in {
-  def MOVT_D32 : CMov_F_F_FT<"movt.d", AFGR64RegsOpnd, IIFmove, MipsCMovFP_T>,
+  def MOVT_D32 : CMov_F_F_FT<"movt.d", AFGR64Opnd, IIFmove, MipsCMovFP_T>,
                  CMov_F_F_FM<17, 1>;
-  def MOVF_D32 : CMov_F_F_FT<"movf.d", AFGR64RegsOpnd, IIFmove, MipsCMovFP_F>,
+  def MOVF_D32 : CMov_F_F_FT<"movf.d", AFGR64Opnd, IIFmove, MipsCMovFP_F>,
                  CMov_F_F_FM<17, 0>;
 }
-let Predicates = [IsFP64bit, HasStdEnc], isCodeGenOnly = 1 in {
-  def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64RegsOpnd, IIFmove, MipsCMovFP_T>,
+
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+  def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, IIFmove, MipsCMovFP_T>,
                  CMov_F_F_FM<17, 1>;
-  def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64RegsOpnd, IIFmove, MipsCMovFP_F>,
+  def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, IIFmove, MipsCMovFP_F>,
                  CMov_F_F_FM<17, 0>;
 }
 
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index bda0167..c46bbac 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -9,9 +9,7 @@
 //
 //
 // This pass is used to make Pc relative loads of constants.
-// For now, only Mips16 will use this. While it has the same name and
-// uses many ideas from the LLVM ARM Constant Island Pass, it's not intended
-// to reuse any of the code from the ARM version.
+// For now, only Mips16 will use this. 
 //
 // Loading constants inline is expensive on Mips16 and it's in general better
 // to place the constant nearby in code space and then it can be loaded with a
@@ -27,31 +25,244 @@
 
 #include "Mips.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16InstrInfo.h"
+#include "MipsMachineFunction.h"
 #include "MipsTargetMachine.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Format.h"
+#include <algorithm>
 
 using namespace llvm;
 
+STATISTIC(NumCPEs,       "Number of constpool entries");
+STATISTIC(NumSplit,      "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
+STATISTIC(NumUBrFixed,   "Number of uncond branches fixed");
+
+// FIXME: This option should be removed once it has received sufficient testing.
+static cl::opt<bool>
+AlignConstantIslands("mips-align-constant-islands", cl::Hidden, cl::init(true),
+          cl::desc("Align constant islands in code"));
+
+
+// Rather than do make check tests with huge amounts of code, we force
+// the test to use this amount.
+//
+static cl::opt<int> ConstantIslandsSmallOffset(
+  "mips-constant-islands-small-offset",
+  cl::init(0),
+  cl::desc("Make small offsets be this amount for testing purposes"),
+  cl::Hidden);
+
+//
+// For testing purposes we tell it to not use relaxed load forms so that it
+// will split blocks.
+//
+static cl::opt<bool> NoLoadRelaxation(
+  "mips-constant-islands-no-load-relaxation",
+  cl::init(false),
+  cl::desc("Don't relax loads to long loads - for testing purposes"),
+  cl::Hidden);
+
+
 namespace {
+
+
   typedef MachineBasicBlock::iterator Iter;
   typedef MachineBasicBlock::reverse_iterator ReverseIter;
 
+  /// MipsConstantIslands - Due to limited PC-relative displacements, Mips
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+
   class MipsConstantIslands : public MachineFunctionPass {
 
+    /// BasicBlockInfo - Information about the offset and size of a single
+    /// basic block.
+    struct BasicBlockInfo {
+      /// Offset - Distance from the beginning of the function to the beginning
+      /// of this basic block.
+      ///
+      /// Offsets are computed assuming worst case padding before an aligned
+      /// block. This means that subtracting basic block offsets always gives a
+      /// conservative estimate of the real distance which may be smaller.
+      ///
+      /// Because worst case padding is used, the computed offset of an aligned
+      /// block may not actually be aligned.
+      unsigned Offset;
+
+      /// Size - Size of the basic block in bytes.  If the block contains
+      /// inline assembly, this is a worst case estimate.
+      ///
+      /// The size does not include any alignment padding whether from the
+      /// beginning of the block, or from an aligned jump table at the end.
+      unsigned Size;
+
+      // FIXME: ignore LogAlign for this patch
+      //
+      unsigned postOffset(unsigned LogAlign = 0) const {
+        unsigned PO = Offset + Size;
+        return PO;
+      }
+
+      BasicBlockInfo() : Offset(0), Size(0) {}
+
+    };
+
+    std::vector<BasicBlockInfo> BBInfo;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+    typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+    /// CPUser - One user of a constant pool, keeping the machine instruction
+    /// pointer, the constant pool being referenced, and the max displacement
+    /// allowed from the instruction to the CP.  The HighWaterMark records the
+    /// highest basic block where a new CPEntry can be placed.  To ensure this
+    /// pass terminates, the CP entries are initially placed at the end of the
+    /// function and then move monotonically to lower addresses.  The
+    /// exception to this rule is when the current CP entry for a particular
+    /// CPUser is out of range, but there is another CP entry for the same
+    /// constant value in range.  We want to use the existing in-range CP
+    /// entry, but if it later moves out of range, the search for new water
+    /// should resume where it left off.  The HighWaterMark is used to record
+    /// that point.
+    struct CPUser {
+      MachineInstr *MI;
+      MachineInstr *CPEMI;
+      MachineBasicBlock *HighWaterMark;
+    private:
+      unsigned MaxDisp;
+      unsigned LongFormMaxDisp; // mips16 has 16/32 bit instructions
+                                // with different displacements
+      unsigned LongFormOpcode;
+    public:
+      bool NegOk;
+      CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+             bool neg,
+             unsigned longformmaxdisp, unsigned longformopcode)
+        : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp),
+          LongFormMaxDisp(longformmaxdisp), LongFormOpcode(longformopcode),
+          NegOk(neg){
+        HighWaterMark = CPEMI->getParent();
+      }
+      /// getMaxDisp - Returns the maximum displacement supported by MI.
+      unsigned getMaxDisp() const {
+        unsigned xMaxDisp = ConstantIslandsSmallOffset?
+                            ConstantIslandsSmallOffset: MaxDisp;
+        return xMaxDisp;
+      }
+      void setMaxDisp(unsigned val) {
+        MaxDisp = val;
+      }
+      unsigned getLongFormMaxDisp() const {
+        return LongFormMaxDisp;
+      }
+      unsigned getLongFormOpcode() const {
+          return LongFormOpcode;
+      }
+    };
+
+    /// CPUsers - Keep track of all of the machine instructions that use various
+    /// constant pools and their max displacement.
+    std::vector<CPUser> CPUsers;
+
+  /// CPEntry - One per constant pool entry, keeping the machine instruction
+  /// pointer, the constpool index, and the number of CPUser's which
+  /// reference this entry.
+  struct CPEntry {
+    MachineInstr *CPEMI;
+    unsigned CPI;
+    unsigned RefCount;
+    CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+      : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+  };
+
+  /// CPEntries - Keep track of all of the constant pool entry machine
+  /// instructions. For each original constpool index (i.e. those that
+  /// existed upon entry to this pass), it keeps a vector of entries.
+  /// Original elements are cloned as we go along; the clones are
+  /// put in the vector of the original element, but have distinct CPIs.
+  std::vector<std::vector<CPEntry> > CPEntries;
+
+  /// ImmBranch - One per immediate branch, keeping the machine instruction
+  /// pointer, conditional or unconditional, the max displacement,
+  /// and (if isCond is true) the corresponding unconditional branch
+  /// opcode.
+  struct ImmBranch {
+    MachineInstr *MI;
+    unsigned MaxDisp : 31;
+    bool isCond : 1;
+    int UncondBr;
+    ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+      : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+  };
+
+  /// ImmBranches - Keep track of all the immediate branch instructions.
+  ///
+  std::vector<ImmBranch> ImmBranches;
+
+  /// HasFarJump - True if any far jump instruction has been emitted during
+  /// the branch fix up pass.
+  bool HasFarJump;
+
+  const TargetMachine &TM;
+  bool IsPIC;
+  unsigned ABI;
+  const MipsSubtarget *STI;
+  const Mips16InstrInfo *TII;
+  MipsFunctionInfo *MFI;
+  MachineFunction *MF;
+  MachineConstantPool *MCP;
+
+  unsigned PICLabelUId;
+  bool PrescannedForConstants;
+
+  void initPICLabelUId(unsigned UId) {
+    PICLabelUId = UId;
+  }
+
+
+  unsigned createPICLabelUId() {
+    return PICLabelUId++;
+  }
+
   public:
     static char ID;
     MipsConstantIslands(TargetMachine &tm)
       : MachineFunctionPass(ID), TM(tm),
         IsPIC(TM.getRelocationModel() == Reloc::PIC_),
-        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()) {}
+        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
+        STI(&TM.getSubtarget<MipsSubtarget>()), MF(0), MCP(0),
+        PrescannedForConstants(false){}
 
     virtual const char *getPassName() const {
       return "Mips Constant Islands";
@@ -59,26 +270,1264 @@ namespace {
 
     bool runOnMachineFunction(MachineFunction &F);
 
+    void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
+    CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+    unsigned getCPELogAlign(const MachineInstr *CPEMI);
+    void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
+    unsigned getOffsetOf(MachineInstr *MI) const;
+    unsigned getUserOffset(CPUser&) const;
+    void dumpBBs();
+    void verify();
+
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         unsigned Disp, bool NegativeOK);
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         const CPUser &U);
+
+    bool isLongFormOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                                const CPUser &U);
+
+    void computeBlockSize(MachineBasicBlock *MBB);
+    MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+    void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+    void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+    bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+    int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    int findLongFormInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    bool findAvailableWater(CPUser&U, unsigned UserOffset,
+                            water_iterator &WaterIter);
+    void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                        MachineBasicBlock *&NewMBB);
+    bool handleConstantPoolUser(unsigned CPUserIndex);
+    void removeDeadCPEMI(MachineInstr *CPEMI);
+    bool removeUnusedCPEntries();
+    bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                          MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+                          bool DoDump = false);
+    bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                        CPUser &U, unsigned &Growth);
+    bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+    bool fixupImmediateBr(ImmBranch &Br);
+    bool fixupConditionalBr(ImmBranch &Br);
+    bool fixupUnconditionalBr(ImmBranch &Br);
+
+    void prescanForConstants();
+
   private:
-    const TargetMachine &TM;
-    bool IsPIC;
-    unsigned ABI;
+
   };
 
   char MipsConstantIslands::ID = 0;
 } // end of anonymous namespace
 
+
+bool MipsConstantIslands::isLongFormOffsetInRange
+  (unsigned UserOffset, unsigned TrialOffset,
+   const CPUser &U) {
+  return isOffsetInRange(UserOffset, TrialOffset,
+                         U.getLongFormMaxDisp(), U.NegOk);
+}
+
+bool MipsConstantIslands::isOffsetInRange
+  (unsigned UserOffset, unsigned TrialOffset,
+   const CPUser &U) {
+  return isOffsetInRange(UserOffset, TrialOffset,
+                         U.getMaxDisp(), U.NegOk);
+}
+/// print block size and offset information - debugging
+void MipsConstantIslands::dumpBBs() {
+  DEBUG({
+    for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
+      const BasicBlockInfo &BBI = BBInfo[J];
+      dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+             << format(" size=%#x\n", BBInfo[J].Size);
+    }
+  });
+}
 /// createMipsLongBranchPass - Returns a pass that converts branches to long
 /// branches.
 FunctionPass *llvm::createMipsConstantIslandPass(MipsTargetMachine &tm) {
   return new MipsConstantIslands(tm);
 }
 
-bool MipsConstantIslands::runOnMachineFunction(MachineFunction &F) {
+bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
   // The intention is for this to be a mips16 only pass for now
   // FIXME:
-  // if (!TM.getSubtarget<MipsSubtarget>().inMips16Mode())
-  //  return false;
+  MF = &mf;
+  MCP = mf.getConstantPool();
+  DEBUG(dbgs() << "constant island machine function " << "\n");
+  if (!TM.getSubtarget<MipsSubtarget>().inMips16Mode() ||
+      !MipsSubtarget::useConstantIslands()) {
+    return false;
+  }
+  TII = (const Mips16InstrInfo*)MF->getTarget().getInstrInfo();
+  MFI = MF->getInfo<MipsFunctionInfo>();
+  DEBUG(dbgs() << "constant island processing " << "\n");
+  //
+  // will need to make predermination if there is any constants we need to
+  // put in constant islands. TBD.
+  //
+  if (!PrescannedForConstants) prescanForConstants();
+
+  HasFarJump = false;
+  // This pass invalidates liveness information when it splits basic blocks.
+  MF->getRegInfo().invalidateLiveness();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  bool MadeChange = false;
+
+  // Perform the initial placement of the constant pool entries.  To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP->isEmpty())
+    doInitialPlacement(CPEMIs);
+
+  /// The next UID to take is the first unused one.
+  initPICLabelUId(CPEMIs.size());
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  initializeFunctionInfo(CPEMIs);
+  CPEMIs.clear();
+  DEBUG(dumpBBs());
+
+  /// Remove dead constant pool entries.
+  MadeChange |= removeUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  unsigned NoCPIters = 0, NoBRIters = 0;
+  (void)NoBRIters;
+  while (true) {
+    DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+    bool CPChange = false;
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      CPChange |= handleConstantPoolUser(i);
+    if (CPChange && ++NoCPIters > 30)
+      report_fatal_error("Constant Island pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    // Clear NewWaterList now.  If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
+
+    DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+    bool BRChange = false;
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      BRChange |= fixupImmediateBr(ImmBranches[i]);
+    if (BRChange && ++NoBRIters > 30)
+      report_fatal_error("Branch Fix Up pass failed to converge!");
+    DEBUG(dumpBBs());
+    if (!CPChange && !BRChange)
+      break;
+    MadeChange = true;
+  }
+
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  BBInfo.clear();
+  WaterList.clear();
+  CPUsers.clear();
+  CPEntries.clear();
+  ImmBranches.clear();
+  return MadeChange;
+}
+
+/// doInitialPlacement - Perform the initial placement of the constant pool
+/// entries.  To start with, we put them all at the end of the function.
+void
+MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+  MF->push_back(BB);
+
+
+  // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
+  unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+
+  // Mark the basic block as required by the const-pool.
+  // If AlignConstantIslands isn't set, use 4-byte alignment for everything.
+  BB->setAlignment(AlignConstantIslands ? MaxAlign : 2);
+
+  // The function needs to be as aligned as the basic blocks. The linker may
+  // move functions around based on their alignment.
+  MF->ensureAlignment(BB->getAlignment());
+
+  // Order the entries in BB by descending alignment.  That ensures correct
+  // alignment of all entries as long as BB is sufficiently aligned.  Keep
+  // track of the insertion point for each alignment.  We are going to bucket
+  // sort the entries as they are created.
+  SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+  const DataLayout &TD = *MF->getTarget().getDataLayout();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+    assert(Size >= 4 && "Too small constant pool entry");
+    unsigned Align = CPs[i].getAlignment();
+    assert(isPowerOf2_32(Align) && "Invalid alignment");
+    // Verify that all constant pool entries are a multiple of their alignment.
+    // If not, we would have to pad them out so that instructions stay aligned.
+    assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+
+    // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+    unsigned LogAlign = Log2_32(Align);
+    MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+
+    MachineInstr *CPEMI =
+      BuildMI(*BB, InsAt, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+        .addImm(i).addConstantPoolIndex(i).addImm(Size);
+
+    CPEMIs.push_back(CPEMI);
+
+    // Ensure that future entries with higher alignment get inserted before
+    // CPEMI. This is bucket sort with iterators.
+    for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+      if (InsPoint[a] == InsAt)
+        InsPoint[a] = CPEMI;
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    std::vector<CPEntry> CPEs;
+    CPEs.push_back(CPEntry(CPEMI, i));
+    CPEntries.push_back(CPEs);
+    ++NumCPEs;
+    DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+                 << Size << ", align = " << Align <<'\n');
+  }
+  DEBUG(BB->dump());
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  // Can't fall off end of function.
+  if (llvm::next(MBBI) == MBB->getParent()->end())
+    return false;
+
+  MachineBasicBlock *NextBB = llvm::next(MBBI);
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I == NextBB)
+      return true;
+
   return false;
 }
 
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+MipsConstantIslands::CPEntry
+*MipsConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  // Number of entries per constpool index should be small, just do a
+  // linear search.
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    if (CPEs[i].CPEMI == CPEMI)
+      return &CPEs[i];
+  }
+  return NULL;
+}
+
+/// getCPELogAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI.  Alignment is measured in log2(bytes) units.
+unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
+  assert(CPEMI && CPEMI->getOpcode() == Mips::CONSTPOOL_ENTRY);
+
+  // Everything is 4-byte aligned unless AlignConstantIslands is set.
+  if (!AlignConstantIslands)
+    return 2;
+
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
+  unsigned Align = MCP->getConstants()[CPI].getAlignment();
+  assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
+  return Log2_32(Align);
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void MipsConstantIslands::
+initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
+  BBInfo.clear();
+  BBInfo.resize(MF->getNumBlockIDs());
+
+  // First thing, compute the size of all basic blocks, and see if the function
+  // has any inline assembly in it. If so, we have to be conservative about
+  // alignment assumptions, as we don't know for sure the size of any
+  // instructions in the inline assembly.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+    computeBlockSize(I);
+
+
+  // Compute block offsets.
+  adjustBBOffsetsAfter(MF->begin());
+
+  // Now go back through the instructions and build up our data structures.
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' that a constant pool island could be placed.
+    if (!BBHasFallthrough(&MBB))
+      WaterList.push_back(&MBB);
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      if (I->isDebugValue())
+        continue;
+
+      int Opc = I->getOpcode();
+      if (I->isBranch()) {
+        bool isCond = false;
+        unsigned Bits = 0;
+        unsigned Scale = 1;
+        int UOpc = Opc;
+        switch (Opc) {
+        default:
+          continue;  // Ignore other branches for now
+        case Mips::Bimm16:
+          Bits = 11;
+          Scale = 2;
+          isCond = false;
+          break;
+        case Mips::BimmX16:
+          Bits = 16;
+          Scale = 2;
+          isCond = false;
+        }
+        // Record this immediate branch.
+        unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+        ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+      }
+
+      if (Opc == Mips::CONSTPOOL_ENTRY)
+        continue;
+
+
+      // Scan the instructions for constant pool operands.
+      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+        if (I->getOperand(op).isCPI()) {
+
+          // We found one.  The addressing mode tells us the max displacement
+          // from the PC that this instruction permits.
+
+          // Basic size info comes from the TSFlags field.
+          unsigned Bits = 0;
+          unsigned Scale = 1;
+          bool NegOk = false;
+          unsigned LongFormBits = 0;
+          unsigned LongFormScale = 0;
+          unsigned LongFormOpcode = 0;
+          switch (Opc) {
+          default:
+            llvm_unreachable("Unknown addressing mode for CP reference!");
+          case Mips::LwRxPcTcp16:
+            Bits = 8;
+            Scale = 4;
+            LongFormOpcode = Mips::LwRxPcTcpX16;
+            LongFormBits = 16;
+            LongFormScale = 1;
+            break;
+          case Mips::LwRxPcTcpX16:
+            Bits = 16;
+            Scale = 1;
+            NegOk = true;
+            break;
+          }
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = I->getOperand(op).getIndex();
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+          unsigned LongFormMaxOffs = ((1 << LongFormBits)-1) * LongFormScale;
+          CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk,
+                                   LongFormMaxOffs, LongFormOpcode));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+
+        }
+
+    }
+  }
+
+}
+
+/// computeBlockSize - Compute the size and some alignment information for MBB.
+/// This function updates BBInfo directly.
+void MipsConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
+  BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+  BBI.Size = 0;
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+       ++I)
+    BBI.Size += TII->GetInstSizeInBytes(I);
+
+}
+
+/// getOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function.  This offset changes as stuff is moved
+/// around inside the function.
+unsigned MipsConstantIslands::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    Offset += TII->GetInstSizeInBytes(I);
+  }
+  return Offset;
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  return LHS->getNumber() < RHS->getNumber();
+}
+
+/// updateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers.  Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void MipsConstantIslands::updateForInsertedWaterBlock
+  (MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consecutive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add NewMBB as having
+  // available water after it.
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                     CompareMBBNumbers);
+  WaterList.insert(IP, NewBB);
+}
+
+unsigned MipsConstantIslands::getUserOffset(CPUser &U) const {
+  return getOffsetOf(U.MI);
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *MipsConstantIslands::splitBlockBeforeInstr
+  (MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  BuildMI(OrigBB, DebugLoc(), TII->get(Mips::Bimm16)).addMBB(NewBB);
+  ++NumSplit;
+
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  NewBB->transferSuccessors(OrigBB);
+
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as updateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add OrigMBB as having
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  MachineBasicBlock* WaterBB = *IP;
+  if (WaterBB == OrigBB)
+    WaterList.insert(llvm::next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
+
+  // Figure out how large the OrigBB is.  As the first half of the original
+  // block, it cannot contain a tablejump.  The size includes
+  // the new jump we added.  (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(OrigBB);
+
+  // Figure out how large the NewMBB is.  As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(NewBB);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+
+
+/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                         unsigned TrialOffset, unsigned MaxDisp,
+                                         bool NegativeOK) {
+  if (UserOffset <= TrialOffset) {
+    // User before the Trial.
+    if (TrialOffset - UserOffset <= MaxDisp)
+      return true;
+  } else if (NegativeOK) {
+    if (UserOffset - TrialOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
+                                        MachineBasicBlock* Water, CPUser &U,
+                                        unsigned &Growth) {
+  unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
+  unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
+  unsigned NextBlockOffset, NextBlockAlignment;
+  MachineFunction::const_iterator NextBlock = Water;
+  if (++NextBlock == MF->end()) {
+    NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+    NextBlockAlignment = 0;
+  } else {
+    NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+    NextBlockAlignment = NextBlock->getAlignment();
+  }
+  unsigned Size = U.CPEMI->getOperand(2).getImm();
+  unsigned CPEEnd = CPEOffset + Size;
+
+  // The CPE may be able to hide in the alignment padding before the next
+  // block. It may also cause more padding to be required if it is more aligned
+  // that the next block.
+  if (CPEEnd > NextBlockOffset) {
+    Growth = CPEEnd - NextBlockOffset;
+    // Compute the padding that would go at the end of the CPE to align the next
+    // block.
+    Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment);
+
+    // If the CPE is to be inserted before the instruction, that will raise
+    // the offset of the instruction. Also account for unknown alignment padding
+    // in blocks between CPE and the user.
+    if (CPEOffset < UserOffset)
+      UserOffset += Growth;
+  } else
+    // CPE fits in existing padding.
+    Growth = 0;
+
+  return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool MipsConstantIslands::isCPEntryInRange
+  (MachineInstr *MI, unsigned UserOffset,
+   MachineInstr *CPEMI, unsigned MaxDisp,
+   bool NegOk, bool DoDump) {
+  unsigned CPEOffset  = getOffsetOf(CPEMI);
+
+  if (DoDump) {
+    DEBUG({
+      unsigned Block = MI->getParent()->getNumber();
+      const BasicBlockInfo &BBI = BBInfo[Block];
+      dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+             << " max delta=" << MaxDisp
+             << format(" insn address=%#x", UserOffset)
+             << " in BB#" << Block << ": "
+             << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+             << format("CPE address=%#x offset=%+d: ", CPEOffset,
+                       int(CPEOffset-UserOffset));
+    });
+  }
+
+  return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true of the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  if (PredMI->getOpcode() == Mips::Bimm16)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+#endif
+
+void MipsConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+  unsigned BBNum = BB->getNumber();
+  for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
+    // Get the offset and known bits at the end of the layout predecessor.
+    // Include the alignment of the current block.
+    unsigned Offset = BBInfo[i - 1].Offset + BBInfo[i - 1].Size;
+    BBInfo[i].Offset = Offset;
+  }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.  Returns true if we removed
+/// the entry, false if we didn't.
+
+bool MipsConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+                                                    MachineInstr *CPEMI) {
+  // Find the old entry. Eliminate it if it is no longer used.
+  CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+  assert(CPE && "Unexpected!");
+  if (--CPE->RefCount == 0) {
+    removeDeadCPEMI(CPEMI);
+    CPE->CPEMI = NULL;
+    --NumCPEs;
+    return true;
+  }
+  return false;
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    return 1;
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == NULL)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
+                     U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// This version checks if the longer form of the instruction can be used to
+/// to satisfy things.
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findLongFormInRangeCPEntry
+  (CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI,
+                       U.getLongFormMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    UserMI->setDesc(TII->get(U.getLongFormOpcode()));
+    U.setMaxDisp(U.getLongFormMaxDisp());
+    return 2;  // instruction is longer length now
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == NULL)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI,
+                         U.getLongFormMaxDisp(), U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  switch (Opc) {
+  case Mips::Bimm16:
+    return ((1<<10)-1)*2;
+  case Mips::BimmX16:
+    return ((1<<16)-1)*2;
+  default:
+    break;
+  }
+  return ((1<<16)-1)*2;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not.  If it returns true, WaterIter
+/// is set to the WaterList entry.  
+/// To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
+bool MipsConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+                                      water_iterator &WaterIter) {
+  if (WaterList.empty())
+    return false;
+
+  unsigned BestGrowth = ~0u;
+  for (water_iterator IP = prior(WaterList.end()), B = WaterList.begin();;
+       --IP) {
+    MachineBasicBlock* WaterBB = *IP;
+    // Check if water is in range and is either at a lower address than the
+    // current "high water mark" or a new water block that was created since
+    // the previous iteration by inserting an unconditional branch.  In the
+    // latter case, we want to allow resetting the high water mark back to
+    // this new water since we haven't seen it before.  Inserting branches
+    // should be relatively uncommon and when it does happen, we want to be
+    // sure to take advantage of it for all the CPEs near that block, so that
+    // we don't insert more branches than necessary.
+    unsigned Growth;
+    if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+        (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+         NewWaterList.count(WaterBB)) && Growth < BestGrowth) {
+      // This is the least amount of required padding seen so far.
+      BestGrowth = Growth;
+      WaterIter = IP;
+      DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
+                   << " Growth=" << Growth << '\n');
+
+      // Keep looking unless it is perfect.
+      if (BestGrowth == 0)
+        return true;
+    }
+    if (IP == B)
+      break;
+  }
+  return BestGrowth != ~0u;
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE.  The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct.  Otherwise the block is split to create a hole with an
+/// unconditional branch around it.  In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
+                                        unsigned UserOffset,
+                                        MachineBasicBlock *&NewMBB) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPELogAlign = getCPELogAlign(CPEMI);
+  MachineBasicBlock *UserMBB = UserMI->getParent();
+  const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+  // If the block does not end in an unconditional branch already, and if the
+  // end of the block is within range, make new water there.  
+  if (BBHasFallthrough(UserMBB)) {
+    // Size of branch to insert.
+    unsigned Delta = 2;
+    // Compute the offset where the CPE will begin.
+    unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+
+    if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+      DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
+            << format(", expected CPE offset %#x\n", CPEOffset));
+      NewMBB = llvm::next(MachineFunction::iterator(UserMBB));
+      // Add an unconditional branch from UserMBB to fallthrough block.  Record
+      // it for branch lengthening; this new branch will not get out of range,
+      // but if the preceding conditional branch is out of range, the targets
+      // will be exchanged, and the altered branch may be out of range, so the
+      // machinery has to know about it.
+      int UncondBr = Mips::Bimm16;
+      BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+      unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+      ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+                                      MaxDisp, false, UncondBr));
+      BBInfo[UserMBB->getNumber()].Size += Delta;
+      adjustBBOffsetsAfter(UserMBB);
+      return;
+    }
+  }
+
+  // What a big block.  Find a place within the block to split it.  
+
+  // Try to split the block so it's fully aligned.  Compute the latest split
+  // point where we can add a 4-byte branch instruction, and then align to
+  // LogAlign which is the largest possible alignment in the function.
+  unsigned LogAlign = MF->getAlignment();
+  assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+  unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
+  DEBUG(dbgs() << format("Split in middle of big block before %#x",
+                         BaseInsertOffset));
+
+  // The 4 in the following is for the unconditional branch we'll be inserting
+  // Alignment of the island is handled
+  // inside isOffsetInRange.
+  BaseInsertOffset -= 4;
+
+  DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+               << " la=" << LogAlign << '\n');
+
+  // This could point off the end of the block if we've already got constant
+  // pool entries following this block; only the last one is in the water list.
+  // Back past any possible branches (allow for a conditional and a maximally
+  // long unconditional).
+  if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+    BaseInsertOffset = UserBBI.postOffset() - 8;
+    DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+  }
+  unsigned EndInsertOffset = BaseInsertOffset + 4 +
+    CPEMI->getOperand(2).getImm();
+  MachineBasicBlock::iterator MI = UserMI;
+  ++MI;
+  unsigned CPUIndex = CPUserIndex+1;
+  unsigned NumCPUsers = CPUsers.size();
+  //MachineInstr *LastIT = 0;
+  for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+       Offset < BaseInsertOffset;
+       Offset += TII->GetInstSizeInBytes(MI),
+       MI = llvm::next(MI)) {
+    assert(MI != UserMBB->end() && "Fell off end of block");
+    if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+      CPUser &U = CPUsers[CPUIndex];
+      if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+        // Shift intertion point by one unit of alignment so it is within reach.
+        BaseInsertOffset -= 1u << LogAlign;
+        EndInsertOffset  -= 1u << LogAlign;
+      }
+      // This is overly conservative, as we don't account for CPEMIs being
+      // reused within the block, but it doesn't matter much.  Also assume CPEs
+      // are added in order with alignment padding.  We may eventually be able
+      // to pack the aligned CPEs better.
+      EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+      CPUIndex++;
+    }
+  }
+
+  --MI;
+  NewMBB = splitBlockBeforeInstr(MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range.  If so, pick up the constant pool value and move it some
+/// place in-range.  Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  // Compute this only once, it's expensive.
+  unsigned UserOffset = getUserOffset(U);
+
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
+  int result = findInRangeCPEntry(U, UserOffset);
+  if (result==1) return false;
+  else if (result==2) return true;
+
+
+  // Look for water where we can place this CPE.
+  MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (findAvailableWater(U, UserOffset, IP)) {
+    DEBUG(dbgs() << "Found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island.  This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.erase(WaterBB))
+      NewWaterList.insert(NewIsland);
+
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
+
+  } else {
+    // No water found.
+    // we first see if a longer form of the instrucion could have reached
+    // the constant. in that case we won't bother to split
+    if (!NoLoadRelaxation) {
+      result = findLongFormInRangeCPEntry(U, UserOffset);
+      if (result != 0) return true;
+    }
+    DEBUG(dbgs() << "No water found\n");
+    createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+    // splitBlockBeforeInstr adds to WaterList, which is important when it is
+    // called while handling branches so that the water will be seen on the
+    // next iteration for constant pools, but in this context, we don't want
+    // it.  Check for this so it will be removed from the WaterList.
+    // Also remove any entry from NewWaterList.
+    MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB));
+    IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
+    if (IP != WaterList.end())
+      NewWaterList.erase(WaterBB);
+
+    // We are adding new water.  Update NewWaterList.
+    NewWaterList.insert(NewIsland);
+  }
+
+  // Remove the original WaterList entry; we want subsequent insertions in
+  // this vicinity to go after the one we're about to insert.  This
+  // considerably reduces the number of times we have to move the same CPE
+  // more than once and is also important to ensure the algorithm terminates.
+  if (IP != WaterList.end())
+    WaterList.erase(IP);
+
+  // Okay, we know we can put an island before NewMBB now, do it!
+  MF->insert(NewMBB, NewIsland);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  updateForInsertedWaterBlock(NewIsland);
+
+  // Decrement the old entry, and remove it if refcount becomes 0.
+  decrementCPEReferenceCount(CPI, CPEMI);
+
+  // Now that we have an island to add the CPE to, clone the original CPE and
+  // add it to the island.
+  U.HighWaterMark = NewIsland;
+  U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+                .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+  CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+  ++NumCPEs;
+
+  // Mark the basic block as aligned as required by the const-pool entry.
+  NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+
+  // Increase the size of the island block to account for the new entry.
+  BBInfo[NewIsland->getNumber()].Size += Size;
+  adjustBBOffsetsAfter(llvm::prior(MachineFunction::iterator(NewIsland)));
+
+  // No existing clone of this CPE is within range.
+  // We will be generating a new clone.  Get a UID for it.
+  unsigned ID = createPICLabelUId();
+
+  // Finally, change the CPI in the instruction operand to be ID.
+  for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+    if (UserMI->getOperand(i).isCPI()) {
+      UserMI->getOperand(i).setIndex(ID);
+      break;
+    }
+
+  DEBUG(dbgs() << "  Moved CPE to #" << ID << " CPI=" << CPI
+        << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+  return true;
+}
+
+/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+  MachineBasicBlock *CPEBB = CPEMI->getParent();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  CPEMI->eraseFromParent();
+  BBInfo[CPEBB->getNumber()].Size -= Size;
+  // All succeeding offsets have the current size value added in, fix this.
+  if (CPEBB->empty()) {
+    BBInfo[CPEBB->getNumber()].Size = 0;
+
+    // This block no longer needs to be aligned.
+    CPEBB->setAlignment(0);
+  } else
+    // Entries are sorted by descending alignment, so realign from the front.
+    CPEBB->setAlignment(getCPELogAlign(CPEBB->begin()));
+
+  adjustBBOffsetsAfter(CPEBB);
+  // An island has only one predecessor BB and one successor BB. Check if
+  // this BB's predecessor jumps directly to this BB's successor. This
+  // shouldn't happen currently.
+  assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+  // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool MipsConstantIslands::removeUnusedCPEntries() {
+  unsigned MadeChange = false;
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+      std::vector<CPEntry> &CPEs = CPEntries[i];
+      for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+        if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+          removeDeadCPEMI(CPEs[j].CPEMI);
+          CPEs[j].CPEMI = NULL;
+          MadeChange = true;
+        }
+      }
+  }
+  return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool MipsConstantIslands::isBBInRange
+  (MachineInstr *MI,MachineBasicBlock *DestBB, unsigned MaxDisp) {
+
+unsigned PCAdj = 4;
+
+  unsigned BrOffset   = getOffsetOf(MI) + PCAdj;
+  unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxDisp
+               << " from " << getOffsetOf(MI) << " to " << DestOffset
+               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+  if (BrOffset <= DestOffset) {
+    // Branch before the Dest.
+    if (DestOffset-BrOffset <= MaxDisp)
+      return true;
+  } else {
+    if (BrOffset-DestOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool MipsConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, Br.MaxDisp))
+    return false;
+
+  if (!Br.isCond)
+    return fixupUnconditionalBr(Br);
+  return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to a branch.
+bool
+MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  // Use BL to implement far jump.
+  Br.MaxDisp = ((1 << 16)-1) * 2;
+  MI->setDesc(TII->get(Mips::BimmX16));
+  BBInfo[MBB->getNumber()].Size += 2;
+  adjustBBOffsetsAfter(MBB);
+  HasFarJump = true;
+  ++NumUBrFixed;
+
+  DEBUG(dbgs() << "  Changed B to long jump " << *MI);
+
+  return true;
+}
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  // blt L1
+  // =>
+  // bge L2
+  // b   L1
+  // L2:
+  unsigned CCReg = 0;  // FIXME
+  unsigned CC=0; //FIXME
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block. Otherwise,
+  // split the MBB before the next instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+  ++NumCBrFixed;
+  if (BMI != MI) {
+    if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
+        BMI->getOpcode() == Br.UncondBr) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      // beq L1
+      // b   L2
+      // =>
+      // bne L2
+      // b   L1
+      MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+      if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+        DEBUG(dbgs() << "  Invert Bcc condition and swap its destination with "
+                     << *BMI);
+        BMI->getOperand(0).setMBB(DestBB);
+        MI->getOperand(0).setMBB(NewDest);
+        return true;
+      }
+    }
+  }
+
+  if (NeedSplit) {
+    splitBlockBeforeInstr(MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->GetInstSizeInBytes(&MBB->back());
+    BBInfo[MBB->getNumber()].Size -= delta;
+    MBB->back().eraseFromParent();
+    // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+  }
+  MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
+
+  DEBUG(dbgs() << "  Insert B to BB#" << DestBB->getNumber()
+               << " also invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new conditional branch and a new unconditional branch.
+  // Also update the ImmBranch as well as adding a new entry for the new branch.
+  BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
+    .addMBB(NextBB).addImm(CC).addReg(CCReg);
+  Br.MI = &MBB->back();
+  BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+  BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+  ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+  // Remove the old conditional branch.  It may or may not still be in MBB.
+  BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+  MI->eraseFromParent();
+  adjustBBOffsetsAfter(MBB);
+  return true;
+}
+
+
+void MipsConstantIslands::prescanForConstants() {
+  unsigned J = 0;
+  (void)J;
+  PrescannedForConstants = true;
+  for (MachineFunction::iterator B =
+         MF->begin(), E = MF->end(); B != E; ++B) {
+    for (MachineBasicBlock::instr_iterator I =
+        B->instr_begin(), EB = B->instr_end(); I != EB; ++I) {
+      switch(I->getDesc().getOpcode()) {
+        case Mips::LwConstant32: {
+          DEBUG(dbgs() << "constant island constant " << *I << "\n");
+          J = I->getNumOperands();
+          DEBUG(dbgs() << "num operands " << J  << "\n");
+          MachineOperand& Literal = I->getOperand(1);
+          if (Literal.isImm()) {
+            int64_t V = Literal.getImm();
+            DEBUG(dbgs() << "literal " << V  << "\n");
+            Type *Int32Ty =
+              Type::getInt32Ty(MF->getFunction()->getContext());
+            const Constant *C = ConstantInt::get(Int32Ty, V);
+            unsigned index = MCP->getConstantPoolIndex(C, 4);
+            I->getOperand(2).ChangeToImmediate(index);
+            DEBUG(dbgs() << "constant island constant " << *I << "\n");
+            I->setDesc(TII->get(Mips::LwRxPcTcp16));
+            I->RemoveOperand(1);
+            I->RemoveOperand(1);
+            I->addOperand(MachineOperand::CreateCPI(index, 0));
+            I->addOperand(MachineOperand::CreateImm(4));
+          }
+          break;
+        }
+        default:
+          break;
+      }
+    }
+  }
+}
+
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index 526821a..d268384 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -256,236 +256,235 @@ class PREPEND_ENC : APPEND_FMT<0b00001>;
 
 // Instruction desc.
 class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                        InstrItinClass itin, RegisterClass RCD,
-                        RegisterClass RCS,  RegisterClass RCT = RCS> {
-  dag OutOperandList = (outs RCD:$rd);
-  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+                        InstrItinClass itin, RegisterOperand ROD,
+                        RegisterOperand ROS,  RegisterOperand ROT = ROS> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
-  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
   InstrItinClass Itinerary = itin;
 }
 
 class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                           InstrItinClass itin, RegisterClass RCD,
-                           RegisterClass RCS = RCD> {
-  dag OutOperandList = (outs RCD:$rd);
-  dag InOperandList = (ins RCS:$rs);
+                           InstrItinClass itin, RegisterOperand ROD,
+                           RegisterOperand ROS = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
-  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs))];
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))];
   InstrItinClass Itinerary = itin;
 }
 
 class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                             InstrItinClass itin, RegisterClass RCS,
-                             RegisterClass RCT = RCS> {
+                             InstrItinClass itin, RegisterOperand ROS,
+                             RegisterOperand ROT = ROS> {
   dag OutOperandList = (outs);
-  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
   string AsmString = !strconcat(instr_asm, "\t$rs, $rt");
-  list<dag> Pattern = [(OpNode RCS:$rs, RCT:$rt)];
+  list<dag> Pattern = [(OpNode ROS:$rs, ROT:$rt)];
   InstrItinClass Itinerary = itin;
 }
 
 class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                             InstrItinClass itin, RegisterClass RCD,
-                             RegisterClass RCS,  RegisterClass RCT = RCS> {
-  dag OutOperandList = (outs RCD:$rd);
-  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+                             InstrItinClass itin, RegisterOperand ROD,
+                             RegisterOperand ROS,  RegisterOperand ROT = ROS> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
-  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
   InstrItinClass Itinerary = itin;
 }
 
 class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                               InstrItinClass itin, RegisterClass RCT,
-                               RegisterClass RCS = RCT> {
-  dag OutOperandList = (outs RCT:$rt);
-  dag InOperandList = (ins RCS:$rs, shamt:$sa, RCS:$src);
+                               InstrItinClass itin, RegisterOperand ROT,
+                               RegisterOperand ROS = ROT> {
+  dag OutOperandList = (outs ROT:$rt);
+  dag InOperandList = (ins ROS:$rs, uimm5:$sa, ROS:$src);
   string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
-  list<dag> Pattern = [(set RCT:$rt, (OpNode RCS:$src, RCS:$rs, immZExt5:$sa))];
+  list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))];
   InstrItinClass Itinerary = itin;
   string Constraints = "$src = $rt";
 }
 
 class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                             InstrItinClass itin, RegisterClass RCD,
-                             RegisterClass RCT = RCD> {
-  dag OutOperandList = (outs RCD:$rd);
-  dag InOperandList = (ins RCT:$rt);
+                             InstrItinClass itin, RegisterOperand ROD,
+                             RegisterOperand ROT = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROT:$rt);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
-  list<dag> Pattern = [(set RCD:$rd, (OpNode RCT:$rt))];
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROT:$rt))];
   InstrItinClass Itinerary = itin;
 }
 
 class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                     ImmLeaf immPat, InstrItinClass itin, RegisterClass RC> {
-  dag OutOperandList = (outs RC:$rd);
+                     ImmLeaf immPat, InstrItinClass itin, RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
   dag InOperandList = (ins uimm16:$imm);
   string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
-  list<dag> Pattern = [(set RC:$rd, (OpNode immPat:$imm))];
+  list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))];
   InstrItinClass Itinerary = itin;
 }
 
 class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                           InstrItinClass itin, RegisterClass RC> {
-  dag OutOperandList = (outs RC:$rd);
-  dag InOperandList =  (ins RC:$rt, GPR32:$rs_sa);
+                           InstrItinClass itin, RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList =  (ins RO:$rt, GPR32Opnd:$rs_sa);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
-  list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, GPR32:$rs_sa))];
+  list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs_sa))];
   InstrItinClass Itinerary = itin;
 }
 
 class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                            SDPatternOperator ImmPat, InstrItinClass itin,
-                           RegisterClass RC> {
-  dag OutOperandList = (outs RC:$rd);
-  dag InOperandList = (ins RC:$rt, uimm16:$rs_sa);
+                           RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList = (ins RO:$rt, uimm16:$rs_sa);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
-  list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, ImmPat:$rs_sa))];
+  list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, ImmPat:$rs_sa))];
   InstrItinClass Itinerary = itin;
   bit hasSideEffects = 1;
 }
 
 class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                    InstrItinClass itin> {
-  dag OutOperandList = (outs GPR32:$rd);
-  dag InOperandList = (ins GPR32:$base, GPR32:$index);
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins PtrRC:$base, PtrRC:$index);
   string AsmString = !strconcat(instr_asm, "\t$rd, ${index}(${base})");
-  list<dag> Pattern = [(set GPR32:$rd,
-                       (OpNode GPR32:$base, GPR32:$index))];
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))];
   InstrItinClass Itinerary = itin;
   bit mayLoad = 1;
 }
 
 class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                         InstrItinClass itin, RegisterClass RCD,
-                         RegisterClass RCS = RCD,  RegisterClass RCT = RCD> {
-  dag OutOperandList = (outs RCD:$rd);
-  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+                         InstrItinClass itin, RegisterOperand ROD,
+                         RegisterOperand ROS = ROD,  RegisterOperand ROT = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
-  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
   InstrItinClass Itinerary = itin;
 }
 
 class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                        SDPatternOperator ImmOp, InstrItinClass itin> {
-  dag OutOperandList = (outs GPR32:$rt);
-  dag InOperandList = (ins GPR32:$rs, shamt:$sa, GPR32:$src);
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$sa, GPR32Opnd:$src);
   string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
-  list<dag> Pattern =  [(set GPR32:$rt,
-                        (OpNode GPR32:$src, GPR32:$rs, ImmOp:$sa))];
+  list<dag> Pattern =  [(set GPR32Opnd:$rt,
+                        (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, ImmOp:$sa))];
   InstrItinClass Itinerary = itin;
   string Constraints = "$src = $rt";
 }
 
 class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                               InstrItinClass itin> {
-  dag OutOperandList = (outs GPR32:$rt);
-  dag InOperandList = (ins ACRegsDSP:$ac, GPR32:$shift_rs);
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs);
   string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
   InstrItinClass Itinerary = itin;
 }
 
 class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                               InstrItinClass itin> {
-  dag OutOperandList = (outs GPR32:$rt);
-  dag InOperandList = (ins ACRegsDSP:$ac, uimm16:$shift_rs);
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm16:$shift_rs);
   string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
   InstrItinClass Itinerary = itin;
 }
 
 class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins simm16:$shift, ACRegsDSP:$acin);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins simm16:$shift, ACC64DSPOpnd:$acin);
   string AsmString = !strconcat(instr_asm, "\t$ac, $shift");
-  list<dag> Pattern = [(set ACRegsDSP:$ac,
-                        (OpNode immSExt6:$shift, ACRegsDSP:$acin))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode immSExt6:$shift, ACC64DSPOpnd:$acin))];
   string Constraints = "$acin = $ac";
 }
 
 class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins GPR32:$rs, ACRegsDSP:$acin);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
   string AsmString = !strconcat(instr_asm, "\t$ac, $rs");
-  list<dag> Pattern = [(set ACRegsDSP:$ac,
-                        (OpNode GPR32:$rs, ACRegsDSP:$acin))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
   string Constraints = "$acin = $ac";
 }
 
 class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins GPR32:$rs, ACRegsDSP:$acin);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
   string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
-  list<dag> Pattern = [(set ACRegsDSP:$ac,
-                        (OpNode GPR32:$rs, ACRegsDSP:$acin))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
   string Constraints = "$acin = $ac";
 }
 
 class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                       InstrItinClass itin> {
-  dag OutOperandList = (outs GPR32:$rd);
+  dag OutOperandList = (outs GPR32Opnd:$rd);
   dag InOperandList = (ins uimm16:$mask);
   string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
-  list<dag> Pattern = [(set GPR32:$rd, (OpNode immZExt10:$mask))];
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
   InstrItinClass Itinerary = itin;
 }
 
 class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                       InstrItinClass itin> {
   dag OutOperandList = (outs);
-  dag InOperandList = (ins GPR32:$rs, uimm16:$mask);
+  dag InOperandList = (ins GPR32Opnd:$rs, uimm16:$mask);
   string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
-  list<dag> Pattern = [(OpNode GPR32:$rs, immZExt10:$mask)];
+  list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)];
   InstrItinClass Itinerary = itin;
 }
 
 class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins GPR32:$rs, GPR32:$rt, ACRegsDSP:$acin);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
   string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
-  list<dag> Pattern = [(set ACRegsDSP:$ac,
-                        (OpNode GPR32:$rs, GPR32:$rt, ACRegsDSP:$acin))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
   string Constraints = "$acin = $ac";
 }
 
 class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                      InstrItinClass itin> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins GPR32:$rs, GPR32:$rt);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt);
   string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
-  list<dag> Pattern = [(set ACRegsDSP:$ac, (OpNode GPR32:$rs, GPR32:$rt))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt))];
   InstrItinClass Itinerary = itin;
-  int AddedComplexity = 20;
   bit isCommutable = 1;
 }
 
 class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                      InstrItinClass itin> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins GPR32:$rs, GPR32:$rt, ACRegsDSP:$acin);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
   string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
-  list<dag> Pattern = [(set ACRegsDSP:$ac,
-                        (OpNode GPR32:$rs, GPR32:$rt, ACRegsDSP:$acin))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
   InstrItinClass Itinerary = itin;
-  int AddedComplexity = 20;
   string Constraints = "$acin = $ac";
 }
 
-class MFHI_DESC_BASE<string instr_asm, RegisterClass RC, InstrItinClass itin> {
-  dag OutOperandList = (outs GPR32:$rd);
-  dag InOperandList = (ins RC:$ac);
+class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins RO:$ac);
   string AsmString = !strconcat(instr_asm, "\t$rd, $ac");
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))];
   InstrItinClass Itinerary = itin;
 }
 
-class MTHI_DESC_BASE<string instr_asm, RegisterClass RC, InstrItinClass itin> {
-  dag OutOperandList = (outs RC:$ac);
-  dag InOperandList = (ins GPR32:$rs);
+class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> {
+  dag OutOperandList = (outs RO:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs);
   string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
   InstrItinClass Itinerary = itin;
 }
 
 class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
-  MipsPseudo<(outs GPR32:$dst), (ins), [(set GPR32:$dst, (OpNode))]> {
+  MipsPseudo<(outs GPR32Opnd:$dst), (ins), [(set GPR32Opnd:$dst, (OpNode))]> {
   bit usesCustomInserter = 1;
 }
 
@@ -501,10 +500,10 @@ class BPOSGE32_DESC_BASE<string instr_asm, InstrItinClass itin> {
 
 class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                      InstrItinClass itin> {
-  dag OutOperandList = (outs GPR32:$rt);
-  dag InOperandList = (ins GPR32:$src, GPR32:$rs);
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$src, GPR32Opnd:$rs);
   string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
-  list<dag> Pattern = [(set GPR32:$rt, (OpNode GPR32:$src, GPR32:$rs))];
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (OpNode GPR32Opnd:$src, GPR32Opnd:$rs))];
   InstrItinClass Itinerary = itin;
   string Constraints = "$src = $rt";
 }
@@ -515,209 +514,209 @@ class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
 
 // Addition/subtraction
 class ADDU_QB_DESC : ADDU_QB_DESC_BASE<"addu.qb", null_frag, NoItinerary,
-                                       DSPRegs, DSPRegs>, IsCommutable,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
                      Defs<[DSPOutFlag20]>;
 
 class ADDU_S_QB_DESC : ADDU_QB_DESC_BASE<"addu_s.qb", int_mips_addu_s_qb,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        IsCommutable, Defs<[DSPOutFlag20]>;
 
 class SUBU_QB_DESC : ADDU_QB_DESC_BASE<"subu.qb", null_frag, NoItinerary,
-                                       DSPRegs, DSPRegs>,
+                                       DSPROpnd, DSPROpnd>,
                      Defs<[DSPOutFlag20]>;
 
 class SUBU_S_QB_DESC : ADDU_QB_DESC_BASE<"subu_s.qb", int_mips_subu_s_qb,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        Defs<[DSPOutFlag20]>;
 
 class ADDQ_PH_DESC : ADDU_QB_DESC_BASE<"addq.ph", null_frag, NoItinerary,
-                                       DSPRegs, DSPRegs>, IsCommutable,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
                      Defs<[DSPOutFlag20]>;
 
 class ADDQ_S_PH_DESC : ADDU_QB_DESC_BASE<"addq_s.ph", int_mips_addq_s_ph,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        IsCommutable, Defs<[DSPOutFlag20]>;
 
 class SUBQ_PH_DESC : ADDU_QB_DESC_BASE<"subq.ph", null_frag, NoItinerary,
-                                       DSPRegs, DSPRegs>,
+                                       DSPROpnd, DSPROpnd>,
                      Defs<[DSPOutFlag20]>;
 
 class SUBQ_S_PH_DESC : ADDU_QB_DESC_BASE<"subq_s.ph", int_mips_subq_s_ph,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        Defs<[DSPOutFlag20]>;
 
 class ADDQ_S_W_DESC : ADDU_QB_DESC_BASE<"addq_s.w", int_mips_addq_s_w,
-                                        NoItinerary, GPR32, GPR32>,
+                                        NoItinerary, GPR32Opnd, GPR32Opnd>,
                       IsCommutable, Defs<[DSPOutFlag20]>;
 
 class SUBQ_S_W_DESC : ADDU_QB_DESC_BASE<"subq_s.w", int_mips_subq_s_w,
-                                        NoItinerary, GPR32, GPR32>,
+                                        NoItinerary, GPR32Opnd, GPR32Opnd>,
                       Defs<[DSPOutFlag20]>;
 
 class ADDSC_DESC : ADDU_QB_DESC_BASE<"addsc", null_frag, NoItinerary,
-                                     GPR32, GPR32>, IsCommutable,
+                                     GPR32Opnd, GPR32Opnd>, IsCommutable,
                    Defs<[DSPCarry]>;
 
 class ADDWC_DESC : ADDU_QB_DESC_BASE<"addwc", null_frag, NoItinerary,
-                                     GPR32, GPR32>,
+                                     GPR32Opnd, GPR32Opnd>,
                    IsCommutable, Uses<[DSPCarry]>, Defs<[DSPOutFlag20]>;
 
 class MODSUB_DESC : ADDU_QB_DESC_BASE<"modsub", int_mips_modsub, NoItinerary,
-                                      GPR32, GPR32>;
+                                      GPR32Opnd, GPR32Opnd>;
 
 class RADDU_W_QB_DESC : RADDU_W_QB_DESC_BASE<"raddu.w.qb", int_mips_raddu_w_qb,
-                                             NoItinerary, GPR32, DSPRegs>;
+                                             NoItinerary, GPR32Opnd, DSPROpnd>;
 
 // Absolute value
 class ABSQ_S_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.ph", int_mips_absq_s_ph,
-                                              NoItinerary, DSPRegs>,
+                                              NoItinerary, DSPROpnd>,
                        Defs<[DSPOutFlag20]>;
 
 class ABSQ_S_W_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.w", int_mips_absq_s_w,
-                                             NoItinerary, GPR32>,
+                                             NoItinerary, GPR32Opnd>,
                       Defs<[DSPOutFlag20]>;
 
 // Precision reduce/expand
 class PRECRQ_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.qb.ph",
                                                  int_mips_precrq_qb_ph,
-                                                 NoItinerary, DSPRegs, DSPRegs>;
+                                                 NoItinerary, DSPROpnd, DSPROpnd>;
 
 class PRECRQ_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.ph.w",
                                                 int_mips_precrq_ph_w,
-                                                NoItinerary, DSPRegs, GPR32>;
+                                                NoItinerary, DSPROpnd, GPR32Opnd>;
 
 class PRECRQ_RS_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq_rs.ph.w",
                                                    int_mips_precrq_rs_ph_w,
-                                                   NoItinerary, DSPRegs,
-                                                   GPR32>,
+                                                   NoItinerary, DSPROpnd,
+                                                   GPR32Opnd>,
                             Defs<[DSPOutFlag22]>;
 
 class PRECRQU_S_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrqu_s.qb.ph",
                                                     int_mips_precrqu_s_qb_ph,
-                                                    NoItinerary, DSPRegs,
-                                                    DSPRegs>,
+                                                    NoItinerary, DSPROpnd,
+                                                    DSPROpnd>,
                              Defs<[DSPOutFlag22]>;
 
 class PRECEQ_W_PHL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phl",
                                                  int_mips_preceq_w_phl,
-                                                 NoItinerary, GPR32, DSPRegs>;
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>;
 
 class PRECEQ_W_PHR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phr",
                                                  int_mips_preceq_w_phr,
-                                                 NoItinerary, GPR32, DSPRegs>;
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>;
 
 class PRECEQU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbl",
                                                    int_mips_precequ_ph_qbl,
-                                                   NoItinerary, DSPRegs>;
+                                                   NoItinerary, DSPROpnd>;
 
 class PRECEQU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbr",
                                                    int_mips_precequ_ph_qbr,
-                                                   NoItinerary, DSPRegs>;
+                                                   NoItinerary, DSPROpnd>;
 
 class PRECEQU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbla",
                                                     int_mips_precequ_ph_qbla,
-                                                    NoItinerary, DSPRegs>;
+                                                    NoItinerary, DSPROpnd>;
 
 class PRECEQU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbra",
                                                     int_mips_precequ_ph_qbra,
-                                                    NoItinerary, DSPRegs>;
+                                                    NoItinerary, DSPROpnd>;
 
 class PRECEU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbl",
                                                   int_mips_preceu_ph_qbl,
-                                                  NoItinerary, DSPRegs>;
+                                                  NoItinerary, DSPROpnd>;
 
 class PRECEU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbr",
                                                   int_mips_preceu_ph_qbr,
-                                                  NoItinerary, DSPRegs>;
+                                                  NoItinerary, DSPROpnd>;
 
 class PRECEU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbla",
                                                    int_mips_preceu_ph_qbla,
-                                                   NoItinerary, DSPRegs>;
+                                                   NoItinerary, DSPROpnd>;
 
 class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra",
                                                    int_mips_preceu_ph_qbra,
-                                                   NoItinerary, DSPRegs>;
+                                                   NoItinerary, DSPROpnd>;
 
 // Shift
 class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3,
-                                          NoItinerary, DSPRegs>,
+                                          NoItinerary, DSPROpnd>,
                      Defs<[DSPOutFlag22]>;
 
 class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
-                                           NoItinerary, DSPRegs>,
+                                           NoItinerary, DSPROpnd>,
                       Defs<[DSPOutFlag22]>;
 
 class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3,
-                                          NoItinerary, DSPRegs>;
+                                          NoItinerary, DSPROpnd>;
 
 class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4,
-                                          NoItinerary, DSPRegs>,
+                                          NoItinerary, DSPROpnd>,
                      Defs<[DSPOutFlag22]>;
 
 class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
-                                           NoItinerary, DSPRegs>,
+                                           NoItinerary, DSPROpnd>,
                       Defs<[DSPOutFlag22]>;
 
 class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph,
-                                            immZExt4, NoItinerary, DSPRegs>,
+                                            immZExt4, NoItinerary, DSPROpnd>,
                        Defs<[DSPOutFlag22]>;
 
 class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
-                                             NoItinerary, DSPRegs>,
+                                             NoItinerary, DSPROpnd>,
                         Defs<[DSPOutFlag22]>;
 
 class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4,
-                                          NoItinerary, DSPRegs>;
+                                          NoItinerary, DSPROpnd>;
 
 class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph,
-                                            immZExt4, NoItinerary, DSPRegs>;
+                                            immZExt4, NoItinerary, DSPROpnd>;
 
 class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph,
-                                             NoItinerary, DSPRegs>;
+                                             NoItinerary, DSPROpnd>;
 
 class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w,
-                                           immZExt5, NoItinerary, GPR32>,
+                                           immZExt5, NoItinerary, GPR32Opnd>,
                       Defs<[DSPOutFlag22]>;
 
 class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
-                                            NoItinerary, GPR32>,
+                                            NoItinerary, GPR32Opnd>,
                        Defs<[DSPOutFlag22]>;
 
 class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w,
-                                           immZExt5, NoItinerary, GPR32>;
+                                           immZExt5, NoItinerary, GPR32Opnd>;
 
 class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
-                                            NoItinerary, GPR32>;
+                                            NoItinerary, GPR32Opnd>;
 
 // Multiplication
 class MULEU_S_PH_QBL_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbl",
                                               int_mips_muleu_s_ph_qbl,
-                                              NoItinerary, DSPRegs, DSPRegs>,
+                                              NoItinerary, DSPROpnd, DSPROpnd>,
                             Defs<[DSPOutFlag21]>;
 
 class MULEU_S_PH_QBR_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbr",
                                               int_mips_muleu_s_ph_qbr,
-                                              NoItinerary, DSPRegs, DSPRegs>,
+                                              NoItinerary, DSPROpnd, DSPROpnd>,
                             Defs<[DSPOutFlag21]>;
 
 class MULEQ_S_W_PHL_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phl",
                                              int_mips_muleq_s_w_phl,
-                                             NoItinerary, GPR32, DSPRegs>,
+                                             NoItinerary, GPR32Opnd, DSPROpnd>,
                            IsCommutable, Defs<[DSPOutFlag21]>;
 
 class MULEQ_S_W_PHR_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phr",
                                              int_mips_muleq_s_w_phr,
-                                             NoItinerary, GPR32, DSPRegs>,
+                                             NoItinerary, GPR32Opnd, DSPROpnd>,
                            IsCommutable, Defs<[DSPOutFlag21]>;
 
 class MULQ_RS_PH_DESC : ADDU_QB_DESC_BASE<"mulq_rs.ph", int_mips_mulq_rs_ph,
-                                          NoItinerary, DSPRegs, DSPRegs>,
+                                          NoItinerary, DSPROpnd, DSPROpnd>,
                         IsCommutable, Defs<[DSPOutFlag21]>;
 
 class MULSAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsaq_s.w.ph",
@@ -737,10 +736,10 @@ class MAQ_SA_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phr", MipsMAQ_SA_W_PHR>,
                           Defs<[DSPOutFlag16_19]>;
 
 // Move from/to hi/lo.
-class MFHI_DESC : MFHI_DESC_BASE<"mfhi", HIRegsDSP, NoItinerary>;
-class MFLO_DESC : MFHI_DESC_BASE<"mflo", LORegsDSP, NoItinerary>;
-class MTHI_DESC : MTHI_DESC_BASE<"mthi", HIRegsDSP, NoItinerary>;
-class MTLO_DESC : MTHI_DESC_BASE<"mtlo", LORegsDSP, NoItinerary>;
+class MFHI_DESC : MFHI_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, NoItinerary>;
+class MFLO_DESC : MFHI_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, NoItinerary>;
+class MTHI_DESC : MTHI_DESC_BASE<"mthi", HI32DSPOpnd, NoItinerary>;
+class MTLO_DESC : MTHI_DESC_BASE<"mtlo", LO32DSPOpnd, NoItinerary>;
 
 // Dot product with accumulate/subtract
 class DPAU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbl", MipsDPAU_H_QBL>;
@@ -773,67 +772,67 @@ class MSUBU_DSP_DESC : MADD_DESC_BASE<"msubu", MipsMSubu, NoItinerary>;
 // Comparison
 class CMPU_EQ_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.eq.qb",
                                                int_mips_cmpu_eq_qb, NoItinerary,
-                                               DSPRegs>,
+                                               DSPROpnd>,
                         IsCommutable, Defs<[DSPCCond]>;
 
 class CMPU_LT_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.lt.qb",
                                                int_mips_cmpu_lt_qb, NoItinerary,
-                                               DSPRegs>, Defs<[DSPCCond]>;
+                                               DSPROpnd>, Defs<[DSPCCond]>;
 
 class CMPU_LE_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.le.qb",
                                                int_mips_cmpu_le_qb, NoItinerary,
-                                               DSPRegs>, Defs<[DSPCCond]>;
+                                               DSPROpnd>, Defs<[DSPCCond]>;
 
 class CMPGU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.eq.qb",
                                                 int_mips_cmpgu_eq_qb,
-                                                NoItinerary, GPR32, DSPRegs>,
+                                                NoItinerary, GPR32Opnd, DSPROpnd>,
                          IsCommutable;
 
 class CMPGU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.lt.qb",
                                                 int_mips_cmpgu_lt_qb,
-                                                NoItinerary, GPR32, DSPRegs>;
+                                                NoItinerary, GPR32Opnd, DSPROpnd>;
 
 class CMPGU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.le.qb",
                                                 int_mips_cmpgu_le_qb,
-                                                NoItinerary, GPR32, DSPRegs>;
+                                                NoItinerary, GPR32Opnd, DSPROpnd>;
 
 class CMP_EQ_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.eq.ph", int_mips_cmp_eq_ph,
-                                              NoItinerary, DSPRegs>,
+                                              NoItinerary, DSPROpnd>,
                        IsCommutable, Defs<[DSPCCond]>;
 
 class CMP_LT_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.lt.ph", int_mips_cmp_lt_ph,
-                                              NoItinerary, DSPRegs>,
+                                              NoItinerary, DSPROpnd>,
                        Defs<[DSPCCond]>;
 
 class CMP_LE_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.le.ph", int_mips_cmp_le_ph,
-                                              NoItinerary, DSPRegs>,
+                                              NoItinerary, DSPROpnd>,
                        Defs<[DSPCCond]>;
 
 // Misc
 class BITREV_DESC : ABSQ_S_PH_R2_DESC_BASE<"bitrev", int_mips_bitrev,
-                                           NoItinerary, GPR32>;
+                                           NoItinerary, GPR32Opnd>;
 
 class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
-                                              NoItinerary, DSPRegs, DSPRegs>;
+                                              NoItinerary, DSPROpnd, DSPROpnd>;
 
 class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, immZExt8,
-                                    NoItinerary, DSPRegs>;
+                                    NoItinerary, DSPROpnd>;
 
 class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, immZExt10,
-                                    NoItinerary, DSPRegs>;
+                                    NoItinerary, DSPROpnd>;
 
 class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
-                                             NoItinerary, DSPRegs, GPR32>;
+                                             NoItinerary, DSPROpnd, GPR32Opnd>;
 
 class REPLV_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
-                                             NoItinerary, DSPRegs, GPR32>;
+                                             NoItinerary, DSPROpnd, GPR32Opnd>;
 
 class PICK_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.qb", int_mips_pick_qb,
-                                            NoItinerary, DSPRegs, DSPRegs>,
+                                            NoItinerary, DSPROpnd, DSPROpnd>,
                      Uses<[DSPCCond]>;
 
 class PICK_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.ph", int_mips_pick_ph,
-                                            NoItinerary, DSPRegs, DSPRegs>,
+                                            NoItinerary, DSPROpnd, DSPROpnd>,
                      Uses<[DSPCCond]>;
 
 class LWX_DESC : LX_DESC_BASE<"lwx", int_mips_lwx, NoItinerary>;
@@ -905,97 +904,97 @@ class INSV_DESC : INSV_DESC_BASE<"insv", int_mips_insv, NoItinerary>,
 // MIPS DSP Rev 2
 // Addition/subtraction
 class ADDU_PH_DESC : ADDU_QB_DESC_BASE<"addu.ph", int_mips_addu_ph, NoItinerary,
-                                       DSPRegs, DSPRegs>, IsCommutable,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
                      Defs<[DSPOutFlag20]>;
 
 class ADDU_S_PH_DESC : ADDU_QB_DESC_BASE<"addu_s.ph", int_mips_addu_s_ph,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        IsCommutable, Defs<[DSPOutFlag20]>;
 
 class SUBU_PH_DESC : ADDU_QB_DESC_BASE<"subu.ph", int_mips_subu_ph, NoItinerary,
-                                       DSPRegs, DSPRegs>,
+                                       DSPROpnd, DSPROpnd>,
                      Defs<[DSPOutFlag20]>;
 
 class SUBU_S_PH_DESC : ADDU_QB_DESC_BASE<"subu_s.ph", int_mips_subu_s_ph,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        Defs<[DSPOutFlag20]>;
 
 class ADDUH_QB_DESC : ADDUH_QB_DESC_BASE<"adduh.qb", int_mips_adduh_qb,
-                                         NoItinerary, DSPRegs>, IsCommutable;
+                                         NoItinerary, DSPROpnd>, IsCommutable;
 
 class ADDUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"adduh_r.qb", int_mips_adduh_r_qb,
-                                           NoItinerary, DSPRegs>, IsCommutable;
+                                           NoItinerary, DSPROpnd>, IsCommutable;
 
 class SUBUH_QB_DESC : ADDUH_QB_DESC_BASE<"subuh.qb", int_mips_subuh_qb,
-                                         NoItinerary, DSPRegs>;
+                                         NoItinerary, DSPROpnd>;
 
 class SUBUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"subuh_r.qb", int_mips_subuh_r_qb,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 class ADDQH_PH_DESC : ADDUH_QB_DESC_BASE<"addqh.ph", int_mips_addqh_ph,
-                                         NoItinerary, DSPRegs>, IsCommutable;
+                                         NoItinerary, DSPROpnd>, IsCommutable;
 
 class ADDQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"addqh_r.ph", int_mips_addqh_r_ph,
-                                           NoItinerary, DSPRegs>, IsCommutable;
+                                           NoItinerary, DSPROpnd>, IsCommutable;
 
 class SUBQH_PH_DESC : ADDUH_QB_DESC_BASE<"subqh.ph", int_mips_subqh_ph,
-                                         NoItinerary, DSPRegs>;
+                                         NoItinerary, DSPROpnd>;
 
 class SUBQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"subqh_r.ph", int_mips_subqh_r_ph,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 class ADDQH_W_DESC : ADDUH_QB_DESC_BASE<"addqh.w", int_mips_addqh_w,
-                                        NoItinerary, GPR32>, IsCommutable;
+                                        NoItinerary, GPR32Opnd>, IsCommutable;
 
 class ADDQH_R_W_DESC : ADDUH_QB_DESC_BASE<"addqh_r.w", int_mips_addqh_r_w,
-                                          NoItinerary, GPR32>, IsCommutable;
+                                          NoItinerary, GPR32Opnd>, IsCommutable;
 
 class SUBQH_W_DESC : ADDUH_QB_DESC_BASE<"subqh.w", int_mips_subqh_w,
-                                        NoItinerary, GPR32>;
+                                        NoItinerary, GPR32Opnd>;
 
 class SUBQH_R_W_DESC : ADDUH_QB_DESC_BASE<"subqh_r.w", int_mips_subqh_r_w,
-                                          NoItinerary, GPR32>;
+                                          NoItinerary, GPR32Opnd>;
 
 // Comparison
 class CMPGDU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.eq.qb",
                                                  int_mips_cmpgdu_eq_qb,
-                                                 NoItinerary, GPR32, DSPRegs>,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
                           IsCommutable, Defs<[DSPCCond]>;
 
 class CMPGDU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.lt.qb",
                                                  int_mips_cmpgdu_lt_qb,
-                                                 NoItinerary, GPR32, DSPRegs>,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
                           Defs<[DSPCCond]>;
 
 class CMPGDU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.le.qb",
                                                  int_mips_cmpgdu_le_qb,
-                                                 NoItinerary, GPR32, DSPRegs>,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
                           Defs<[DSPCCond]>;
 
 // Absolute
 class ABSQ_S_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.qb", int_mips_absq_s_qb,
-                                              NoItinerary, DSPRegs>,
+                                              NoItinerary, DSPROpnd>,
                        Defs<[DSPOutFlag20]>;
 
 // Multiplication
 class MUL_PH_DESC : ADDUH_QB_DESC_BASE<"mul.ph", null_frag, NoItinerary,
-                                       DSPRegs>, IsCommutable,
+                                       DSPROpnd>, IsCommutable,
                     Defs<[DSPOutFlag21]>;
 
 class MUL_S_PH_DESC : ADDUH_QB_DESC_BASE<"mul_s.ph", int_mips_mul_s_ph,
-                                         NoItinerary, DSPRegs>, IsCommutable,
+                                         NoItinerary, DSPROpnd>, IsCommutable,
                       Defs<[DSPOutFlag21]>;
 
 class MULQ_S_W_DESC : ADDUH_QB_DESC_BASE<"mulq_s.w", int_mips_mulq_s_w,
-                                         NoItinerary, GPR32>, IsCommutable,
+                                         NoItinerary, GPR32Opnd>, IsCommutable,
                       Defs<[DSPOutFlag21]>;
 
 class MULQ_RS_W_DESC : ADDUH_QB_DESC_BASE<"mulq_rs.w", int_mips_mulq_rs_w,
-                                          NoItinerary, GPR32>, IsCommutable,
+                                          NoItinerary, GPR32Opnd>, IsCommutable,
                        Defs<[DSPOutFlag21]>;
 
 class MULQ_S_PH_DESC : ADDU_QB_DESC_BASE<"mulq_s.ph", int_mips_mulq_s_ph,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        IsCommutable, Defs<[DSPOutFlag21]>;
 
 // Dot product with accumulate/subtract
@@ -1026,36 +1025,36 @@ class MULSA_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsa.w.ph", MipsMULSA_W_PH>;
 // Precision reduce/expand
 class PRECR_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precr.qb.ph",
                                                 int_mips_precr_qb_ph,
-                                                NoItinerary, DSPRegs, DSPRegs>;
+                                                NoItinerary, DSPROpnd, DSPROpnd>;
 
 class PRECR_SRA_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra.ph.w",
                                                      int_mips_precr_sra_ph_w,
-                                                     NoItinerary, DSPRegs,
-                                                     GPR32>;
+                                                     NoItinerary, DSPROpnd,
+                                                     GPR32Opnd>;
 
 class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w",
                                                       int_mips_precr_sra_r_ph_w,
-                                                       NoItinerary, DSPRegs,
-                                                       GPR32>;
+                                                       NoItinerary, DSPROpnd,
+                                                       GPR32Opnd>;
 
 // Shift
 class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3,
-                                          NoItinerary, DSPRegs>;
+                                          NoItinerary, DSPROpnd>;
 
 class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb,
-                                            immZExt3, NoItinerary, DSPRegs>;
+                                            immZExt3, NoItinerary, DSPROpnd>;
 
 class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb,
-                                             NoItinerary, DSPRegs>;
+                                             NoItinerary, DSPROpnd>;
 
 class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4,
-                                          NoItinerary, DSPRegs>;
+                                          NoItinerary, DSPROpnd>;
 
 class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 // Misc
 class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5,
@@ -1240,24 +1239,24 @@ def PREPEND : PREPEND_ENC, PREPEND_DESC;
 }
 
 // Pseudos.
-let isPseudo = 1 in {
+let isPseudo = 1, isCodeGenOnly = 1 in {
   // Pseudo instructions for loading and storing accumulator registers.
-  defm LOAD_AC_DSP  : LoadM<"load_ac_dsp", ACRegsDSPOpnd>;
-  defm STORE_AC_DSP : StoreM<"store_ac_dsp", ACRegsDSPOpnd>;
+  def LOAD_ACC64DSP  : Load<"", ACC64DSPOpnd>;
+  def STORE_ACC64DSP : Store<"", ACC64DSPOpnd>;
 
   // Pseudos for loading and storing ccond field of DSP control register.
-  defm LOAD_CCOND_DSP  : LoadM<"load_ccond_dsp", DSPCC>;
-  defm STORE_CCOND_DSP : StoreM<"store_ccond_dsp", DSPCC>;
+  def LOAD_CCOND_DSP  : Load<"load_ccond_dsp", DSPCC>;
+  def STORE_CCOND_DSP : Store<"store_ccond_dsp", DSPCC>;
 }
 
 // Pseudo CMP and PICK instructions.
 class PseudoCMP<Instruction RealInst> :
-  PseudoDSP<(outs DSPCC:$cmp), (ins DSPRegs:$rs, DSPRegs:$rt), []>,
-  PseudoInstExpansion<(RealInst DSPRegs:$rs, DSPRegs:$rt)>, NeverHasSideEffects;
+  PseudoDSP<(outs DSPCC:$cmp), (ins DSPROpnd:$rs, DSPROpnd:$rt), []>,
+  PseudoInstExpansion<(RealInst DSPROpnd:$rs, DSPROpnd:$rt)>, NeverHasSideEffects;
 
 class PseudoPICK<Instruction RealInst> :
-  PseudoDSP<(outs DSPRegs:$rd), (ins DSPCC:$cmp, DSPRegs:$rs, DSPRegs:$rt), []>,
-  PseudoInstExpansion<(RealInst DSPRegs:$rd, DSPRegs:$rs, DSPRegs:$rt)>,
+  PseudoDSP<(outs DSPROpnd:$rd), (ins DSPCC:$cmp, DSPROpnd:$rs, DSPROpnd:$rt), []>,
+  PseudoInstExpansion<(RealInst DSPROpnd:$rd, DSPROpnd:$rs, DSPROpnd:$rt)>,
   NeverHasSideEffects;
 
 def PseudoCMP_EQ_PH : PseudoCMP<CMP_EQ_PH>;
@@ -1270,6 +1269,8 @@ def PseudoCMPU_LE_QB : PseudoCMP<CMPU_LE_QB>;
 def PseudoPICK_PH : PseudoPICK<PICK_PH>;
 def PseudoPICK_QB : PseudoPICK<PICK_QB>;
 
+def PseudoMTLOHI_DSP : PseudoMTLOHI<ACC64DSP, GPR32>;
+
 // Patterns.
 class DSPPat<dag pattern, dag result, Predicate pred = HasDSP> :
   Pat<pattern, result>, Requires<[pred]>;
@@ -1279,19 +1280,19 @@ class BitconvertPat<ValueType DstVT, ValueType SrcVT, RegisterClass DstRC,
    DSPPat<(DstVT (bitconvert (SrcVT SrcRC:$src))),
           (COPY_TO_REGCLASS SrcRC:$src, DstRC)>;
 
-def : BitconvertPat<i32, v2i16, GPR32, DSPRegs>;
-def : BitconvertPat<i32, v4i8, GPR32, DSPRegs>;
-def : BitconvertPat<v2i16, i32, DSPRegs, GPR32>;
-def : BitconvertPat<v4i8, i32, DSPRegs, GPR32>;
+def : BitconvertPat<i32, v2i16, GPR32, DSPR>;
+def : BitconvertPat<i32, v4i8, GPR32, DSPR>;
+def : BitconvertPat<v2i16, i32, DSPR, GPR32>;
+def : BitconvertPat<v4i8, i32, DSPR, GPR32>;
 
 def : DSPPat<(v2i16 (load addr:$a)),
-             (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPRegs))>;
+             (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
 def : DSPPat<(v4i8 (load addr:$a)),
-             (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPRegs))>;
-def : DSPPat<(store (v2i16 DSPRegs:$val), addr:$a),
-             (SW (COPY_TO_REGCLASS DSPRegs:$val, GPR32), addr:$a)>;
-def : DSPPat<(store (v4i8 DSPRegs:$val), addr:$a),
-             (SW (COPY_TO_REGCLASS DSPRegs:$val, GPR32), addr:$a)>;
+             (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
+def : DSPPat<(store (v2i16 DSPR:$val), addr:$a),
+             (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
+def : DSPPat<(store (v4i8 DSPR:$val), addr:$a),
+             (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
 
 // Binary operations.
 class DSPBinPat<Instruction Inst, ValueType ValTy, SDPatternOperator Node,
@@ -1336,7 +1337,7 @@ class DSPSetCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
                   CondCode CC> :
   DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
          (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
-                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPRegs)),
+                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR)),
                       (ValTy ZERO)))>;
 
 class DSPSetCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
@@ -1344,7 +1345,7 @@ class DSPSetCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
   DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
          (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
                       (ValTy ZERO),
-                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPRegs))))>;
+                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR))))>;
 
 class DSPSelectCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
                      CondCode CC> :
@@ -1384,12 +1385,12 @@ def : DSPSelectCCPatInv<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETUGT>;
 
 // Extr patterns.
 class EXTR_W_TY1_R2_Pat<SDPatternOperator OpNode, Instruction Instr> :
-  DSPPat<(i32 (OpNode GPR32:$rs, ACRegsDSP:$ac)),
-         (Instr ACRegsDSP:$ac, GPR32:$rs)>;
+  DSPPat<(i32 (OpNode GPR32:$rs, ACC64DSP:$ac)),
+         (Instr ACC64DSP:$ac, GPR32:$rs)>;
 
 class EXTR_W_TY1_R1_Pat<SDPatternOperator OpNode, Instruction Instr> :
-  DSPPat<(i32 (OpNode immZExt5:$shift, ACRegsDSP:$ac)),
-         (Instr ACRegsDSP:$ac, immZExt5:$shift)>;
+  DSPPat<(i32 (OpNode immZExt5:$shift, ACC64DSP:$ac)),
+         (Instr ACC64DSP:$ac, immZExt5:$shift)>;
 
 def : EXTR_W_TY1_R1_Pat<MipsEXTP, EXTP>;
 def : EXTR_W_TY1_R2_Pat<MipsEXTP, EXTPV>;
@@ -1404,11 +1405,6 @@ def : EXTR_W_TY1_R2_Pat<MipsEXTR_RS_W, EXTRV_RS_W>;
 def : EXTR_W_TY1_R1_Pat<MipsEXTR_S_H, EXTR_S_H>;
 def : EXTR_W_TY1_R2_Pat<MipsEXTR_S_H, EXTRV_S_H>;
 
-// mflo/hi patterns.
-let AddedComplexity = 20 in
-def : DSPPat<(i32 (ExtractLOHI ACRegsDSP:$ac, imm:$lohi_idx)),
-             (EXTRACT_SUBREG ACRegsDSP:$ac, imm:$lohi_idx)>;
-
 // Indexed load patterns.
 class IndexedLoadPat<SDPatternOperator LoadNode, Instruction Instr> :
   DSPPat<(i32 (LoadNode (add i32:$base, i32:$index))),
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 545a38d..ffbd83b 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -421,8 +421,7 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
     return false;
 
   if (const PseudoSourceValue *PSV = dyn_cast<const PseudoSourceValue>(V))
-    return !PSV->PseudoSourceValue::isConstant(0) &&
-      (V != PseudoSourceValue::getStack());
+    return !PSV->isConstant(0) && V != PseudoSourceValue::getStack();
 
   return true;
 }
@@ -563,14 +562,13 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const {
 
   RegDU.init(*Slot);
 
-  if (searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler)) {
-    MBB.splice(llvm::next(Slot), &MBB, llvm::next(Filler).base());
-    MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
-    ++UsefulSlots;
-    return true;
-  }
+  if (!searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler))
+    return false;
 
-  return false;
+  MBB.splice(llvm::next(Slot), &MBB, llvm::next(Filler).base());
+  MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+  ++UsefulSlots;
+  return true;
 }
 
 bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
@@ -584,14 +582,13 @@ bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
 
   RegDU.setCallerSaved(*Slot);
 
-  if (searchRange(MBB, llvm::next(Slot), MBB.end(), RegDU, NM, Filler)) {
-    MBB.splice(llvm::next(Slot), &MBB, Filler);
-    MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
-    ++UsefulSlots;
-    return true;
-  }
+  if (!searchRange(MBB, llvm::next(Slot), MBB.end(), RegDU, NM, Filler))
+    return false;
 
-  return false;
+  MBB.splice(llvm::next(Slot), &MBB, Filler);
+  MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+  ++UsefulSlots;
+  return true;
 }
 
 bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 0002a5f..c417bd5 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -69,6 +69,12 @@ bool MipsDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
   return false;
 }
 
+bool MipsDAGToDAGISel::selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                        SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
 bool MipsDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) const {
   llvm_unreachable("Unimplemented function.");
@@ -81,12 +87,83 @@ bool MipsDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
   return false;
 }
 
+bool MipsDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
 bool MipsDAGToDAGISel::selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
                                     SDValue &Offset, SDValue &Alias) {
   llvm_unreachable("Unimplemented function.");
   return false;
 }
 
+bool MipsDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
 /// Select instructions not customized! Used for
 /// expanded, promoted and normal instructions
 SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
@@ -98,6 +175,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
     DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
     return NULL;
   }
 
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index cf0f9c5..a4d9da5 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -57,6 +57,11 @@ private:
   virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
                                 SDValue &Offset) const;
 
+  // Complex Pattern.
+  /// (reg + reg).
+  virtual bool selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
   /// Fall back on this function if all else fails.
   virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
                                  SDValue &Offset) const;
@@ -65,9 +70,42 @@ private:
   virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
                              SDValue &Offset) const;
 
+  virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
+                               SDValue &Offset) const;
+
   virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
                             SDValue &Offset, SDValue &Alias);
 
+  /// \brief Select constant vector splats.
+  virtual bool selectVSplat(SDNode *N, APInt &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm1.
+  virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm2.
+  virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm3.
+  virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm4.
+  virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm5.
+  virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm6.
+  virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm8.
+  virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a simm5.
+  virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a power of 2.
+  virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is the inverse of a
+  /// power of 2.
+  virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// ending at the most significant bit
+  virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// starting at bit zero.
+  virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+
   virtual SDNode *Select(SDNode *N);
 
   virtual std::pair<bool, SDNode*> selectNode(SDNode *Node) = 0;
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index a62e84f..1e8250c 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -20,6 +20,7 @@
 #include "MipsTargetMachine.h"
 #include "MipsTargetObjectFile.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -34,6 +35,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cctype>
 
 using namespace llvm;
 
@@ -67,7 +69,7 @@ static const uint16_t Mips64DPRegs[8] = {
 // For example, if I is 0x003ff800, (Pos, Size) = (11, 11).
 static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
   if (!isShiftedMask_64(I))
-     return false;
+    return false;
 
   Size = CountPopulation_64(I);
   Pos = countTrailingZeros(I);
@@ -79,72 +81,35 @@ SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
   return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
 }
 
-static SDValue getTargetNode(SDValue Op, SelectionDAG &DAG, unsigned Flag) {
-  EVT Ty = Op.getValueType();
+SDValue MipsTargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
+}
 
-  if (GlobalAddressSDNode *N = dyn_cast<GlobalAddressSDNode>(Op))
-    return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(Op), Ty, 0,
-                                      Flag);
-  if (ExternalSymbolSDNode *N = dyn_cast<ExternalSymbolSDNode>(Op))
-    return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
-  if (BlockAddressSDNode *N = dyn_cast<BlockAddressSDNode>(Op))
-    return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
-  if (JumpTableSDNode *N = dyn_cast<JumpTableSDNode>(Op))
-    return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
-  if (ConstantPoolSDNode *N = dyn_cast<ConstantPoolSDNode>(Op))
-    return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
-                                     N->getOffset(), Flag);
-
-  llvm_unreachable("Unexpected node type.");
-  return SDValue();
+SDValue MipsTargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
 }
 
-static SDValue getAddrNonPIC(SDValue Op, SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  EVT Ty = Op.getValueType();
-  SDValue Hi = getTargetNode(Op, DAG, MipsII::MO_ABS_HI);
-  SDValue Lo = getTargetNode(Op, DAG, MipsII::MO_ABS_LO);
-  return DAG.getNode(ISD::ADD, DL, Ty,
-                     DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
-                     DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+SDValue MipsTargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
 }
 
-SDValue MipsTargetLowering::getAddrLocal(SDValue Op, SelectionDAG &DAG,
-                                         bool HasMips64) const {
-  SDLoc DL(Op);
-  EVT Ty = Op.getValueType();
-  unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
-  SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
-                            getTargetNode(Op, DAG, GOTFlag));
-  SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
-                             MachinePointerInfo::getGOT(), false, false, false,
-                             0);
-  unsigned LoFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
-  SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty, getTargetNode(Op, DAG, LoFlag));
-  return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
-}
-
-SDValue MipsTargetLowering::getAddrGlobal(SDValue Op, SelectionDAG &DAG,
+SDValue MipsTargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
                                           unsigned Flag) const {
-  SDLoc DL(Op);
-  EVT Ty = Op.getValueType();
-  SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
-                            getTargetNode(Op, DAG, Flag));
-  return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Tgt,
-                     MachinePointerInfo::getGOT(), false, false, false, 0);
+  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
 }
 
-SDValue MipsTargetLowering::getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
-                                                  unsigned HiFlag,
-                                                  unsigned LoFlag) const {
-  SDLoc DL(Op);
-  EVT Ty = Op.getValueType();
-  SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(Op, DAG, HiFlag));
-  Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
-  SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
-                                getTargetNode(Op, DAG, LoFlag));
-  return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Wrapper,
-                     MachinePointerInfo::getGOT(), false, false, false, 0);
+SDValue MipsTargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+                                   N->getOffset(), Flag);
 }
 
 const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -162,8 +127,9 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MipsISD::CMovFP_T:          return "MipsISD::CMovFP_T";
   case MipsISD::CMovFP_F:          return "MipsISD::CMovFP_F";
   case MipsISD::TruncIntFP:        return "MipsISD::TruncIntFP";
-  case MipsISD::ExtractLOHI:       return "MipsISD::ExtractLOHI";
-  case MipsISD::InsertLOHI:        return "MipsISD::InsertLOHI";
+  case MipsISD::MFHI:              return "MipsISD::MFHI";
+  case MipsISD::MFLO:              return "MipsISD::MFLO";
+  case MipsISD::MTLOHI:            return "MipsISD::MTLOHI";
   case MipsISD::Mult:              return "MipsISD::Mult";
   case MipsISD::Multu:             return "MipsISD::Multu";
   case MipsISD::MAdd:              return "MipsISD::MAdd";
@@ -207,6 +173,30 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MipsISD::SHRL_DSP:          return "MipsISD::SHRL_DSP";
   case MipsISD::SETCC_DSP:         return "MipsISD::SETCC_DSP";
   case MipsISD::SELECT_CC_DSP:     return "MipsISD::SELECT_CC_DSP";
+  case MipsISD::VALL_ZERO:         return "MipsISD::VALL_ZERO";
+  case MipsISD::VANY_ZERO:         return "MipsISD::VANY_ZERO";
+  case MipsISD::VALL_NONZERO:      return "MipsISD::VALL_NONZERO";
+  case MipsISD::VANY_NONZERO:      return "MipsISD::VANY_NONZERO";
+  case MipsISD::VCEQ:              return "MipsISD::VCEQ";
+  case MipsISD::VCLE_S:            return "MipsISD::VCLE_S";
+  case MipsISD::VCLE_U:            return "MipsISD::VCLE_U";
+  case MipsISD::VCLT_S:            return "MipsISD::VCLT_S";
+  case MipsISD::VCLT_U:            return "MipsISD::VCLT_U";
+  case MipsISD::VSMAX:             return "MipsISD::VSMAX";
+  case MipsISD::VSMIN:             return "MipsISD::VSMIN";
+  case MipsISD::VUMAX:             return "MipsISD::VUMAX";
+  case MipsISD::VUMIN:             return "MipsISD::VUMIN";
+  case MipsISD::VEXTRACT_SEXT_ELT: return "MipsISD::VEXTRACT_SEXT_ELT";
+  case MipsISD::VEXTRACT_ZEXT_ELT: return "MipsISD::VEXTRACT_ZEXT_ELT";
+  case MipsISD::VNOR:              return "MipsISD::VNOR";
+  case MipsISD::VSHF:              return "MipsISD::VSHF";
+  case MipsISD::SHF:               return "MipsISD::SHF";
+  case MipsISD::ILVEV:             return "MipsISD::ILVEV";
+  case MipsISD::ILVOD:             return "MipsISD::ILVOD";
+  case MipsISD::ILVL:              return "MipsISD::ILVL";
+  case MipsISD::ILVR:              return "MipsISD::ILVR";
+  case MipsISD::PCKEV:             return "MipsISD::PCKEV";
+  case MipsISD::PCKOD:             return "MipsISD::PCKOD";
   default:                         return NULL;
   }
 }
@@ -424,8 +414,8 @@ static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   EVT Ty = N->getValueType(0);
-  unsigned LO = (Ty == MVT::i32) ? Mips::LO : Mips::LO64;
-  unsigned HI = (Ty == MVT::i32) ? Mips::HI : Mips::HI64;
+  unsigned LO = (Ty == MVT::i32) ? Mips::LO0 : Mips::LO0_64;
+  unsigned HI = (Ty == MVT::i32) ? Mips::HI0 : Mips::HI0_64;
   unsigned Opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem16 :
                                                   MipsISD::DivRemU16;
   SDLoc DL(N);
@@ -566,7 +556,7 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
   // Pattern match EXT.
   //  $dst = and ((sra or srl) $src , pos), (2**size - 1)
   //  => ext $dst, $src, size, pos
-  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasExtractInsert())
     return SDValue();
 
   SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
@@ -607,7 +597,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
   //  $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
   //  where mask1 = (2**size - 1) << pos, mask0 = ~mask1
   //  => ins $dst, $src, size, pos, $src1
-  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasExtractInsert())
     return SDValue();
 
   SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
@@ -779,13 +769,17 @@ static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI,
   // Insert instruction "teq $divisor_reg, $zero, 7".
   MachineBasicBlock::iterator I(MI);
   MachineInstrBuilder MIB;
+  MachineOperand &Divisor = MI->getOperand(2);
   MIB = BuildMI(MBB, llvm::next(I), MI->getDebugLoc(), TII.get(Mips::TEQ))
-    .addOperand(MI->getOperand(2)).addReg(Mips::ZERO).addImm(7);
+    .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill()))
+    .addReg(Mips::ZERO).addImm(7);
 
   // Use the 32-bit sub-register if this is a 64-bit division.
   if (Is64Bit)
     MIB->getOperand(0).setSubReg(Mips::sub_32);
 
+  // Clear Divisor's kill flag.
+  Divisor.setIsKill(false);
   return &MBB;
 }
 
@@ -796,107 +790,75 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   default:
     llvm_unreachable("Unexpected instr type to insert");
   case Mips::ATOMIC_LOAD_ADD_I8:
-  case Mips::ATOMIC_LOAD_ADD_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
   case Mips::ATOMIC_LOAD_ADD_I16:
-  case Mips::ATOMIC_LOAD_ADD_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
   case Mips::ATOMIC_LOAD_ADD_I32:
-  case Mips::ATOMIC_LOAD_ADD_I32_P8:
     return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
   case Mips::ATOMIC_LOAD_ADD_I64:
-  case Mips::ATOMIC_LOAD_ADD_I64_P8:
     return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
 
   case Mips::ATOMIC_LOAD_AND_I8:
-  case Mips::ATOMIC_LOAD_AND_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
   case Mips::ATOMIC_LOAD_AND_I16:
-  case Mips::ATOMIC_LOAD_AND_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
   case Mips::ATOMIC_LOAD_AND_I32:
-  case Mips::ATOMIC_LOAD_AND_I32_P8:
     return emitAtomicBinary(MI, BB, 4, Mips::AND);
   case Mips::ATOMIC_LOAD_AND_I64:
-  case Mips::ATOMIC_LOAD_AND_I64_P8:
     return emitAtomicBinary(MI, BB, 8, Mips::AND64);
 
   case Mips::ATOMIC_LOAD_OR_I8:
-  case Mips::ATOMIC_LOAD_OR_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
   case Mips::ATOMIC_LOAD_OR_I16:
-  case Mips::ATOMIC_LOAD_OR_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
   case Mips::ATOMIC_LOAD_OR_I32:
-  case Mips::ATOMIC_LOAD_OR_I32_P8:
     return emitAtomicBinary(MI, BB, 4, Mips::OR);
   case Mips::ATOMIC_LOAD_OR_I64:
-  case Mips::ATOMIC_LOAD_OR_I64_P8:
     return emitAtomicBinary(MI, BB, 8, Mips::OR64);
 
   case Mips::ATOMIC_LOAD_XOR_I8:
-  case Mips::ATOMIC_LOAD_XOR_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
   case Mips::ATOMIC_LOAD_XOR_I16:
-  case Mips::ATOMIC_LOAD_XOR_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
   case Mips::ATOMIC_LOAD_XOR_I32:
-  case Mips::ATOMIC_LOAD_XOR_I32_P8:
     return emitAtomicBinary(MI, BB, 4, Mips::XOR);
   case Mips::ATOMIC_LOAD_XOR_I64:
-  case Mips::ATOMIC_LOAD_XOR_I64_P8:
     return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
 
   case Mips::ATOMIC_LOAD_NAND_I8:
-  case Mips::ATOMIC_LOAD_NAND_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
   case Mips::ATOMIC_LOAD_NAND_I16:
-  case Mips::ATOMIC_LOAD_NAND_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
   case Mips::ATOMIC_LOAD_NAND_I32:
-  case Mips::ATOMIC_LOAD_NAND_I32_P8:
     return emitAtomicBinary(MI, BB, 4, 0, true);
   case Mips::ATOMIC_LOAD_NAND_I64:
-  case Mips::ATOMIC_LOAD_NAND_I64_P8:
     return emitAtomicBinary(MI, BB, 8, 0, true);
 
   case Mips::ATOMIC_LOAD_SUB_I8:
-  case Mips::ATOMIC_LOAD_SUB_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
   case Mips::ATOMIC_LOAD_SUB_I16:
-  case Mips::ATOMIC_LOAD_SUB_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
   case Mips::ATOMIC_LOAD_SUB_I32:
-  case Mips::ATOMIC_LOAD_SUB_I32_P8:
     return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
   case Mips::ATOMIC_LOAD_SUB_I64:
-  case Mips::ATOMIC_LOAD_SUB_I64_P8:
     return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
 
   case Mips::ATOMIC_SWAP_I8:
-  case Mips::ATOMIC_SWAP_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, 0);
   case Mips::ATOMIC_SWAP_I16:
-  case Mips::ATOMIC_SWAP_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, 0);
   case Mips::ATOMIC_SWAP_I32:
-  case Mips::ATOMIC_SWAP_I32_P8:
     return emitAtomicBinary(MI, BB, 4, 0);
   case Mips::ATOMIC_SWAP_I64:
-  case Mips::ATOMIC_SWAP_I64_P8:
     return emitAtomicBinary(MI, BB, 8, 0);
 
   case Mips::ATOMIC_CMP_SWAP_I8:
-  case Mips::ATOMIC_CMP_SWAP_I8_P8:
     return emitAtomicCmpSwapPartword(MI, BB, 1);
   case Mips::ATOMIC_CMP_SWAP_I16:
-  case Mips::ATOMIC_CMP_SWAP_I16_P8:
     return emitAtomicCmpSwapPartword(MI, BB, 2);
   case Mips::ATOMIC_CMP_SWAP_I32:
-  case Mips::ATOMIC_CMP_SWAP_I32_P8:
     return emitAtomicCmpSwap(MI, BB, 4);
   case Mips::ATOMIC_CMP_SWAP_I64:
-  case Mips::ATOMIC_CMP_SWAP_I64_P8:
     return emitAtomicCmpSwap(MI, BB, 8);
   case Mips::PseudoSDIV:
   case Mips::PseudoUDIV:
@@ -923,16 +885,16 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   unsigned LL, SC, AND, NOR, ZERO, BEQ;
 
   if (Size == 4) {
-    LL = IsN64 ? Mips::LL_P8 : Mips::LL;
-    SC = IsN64 ? Mips::SC_P8 : Mips::SC;
+    LL = Mips::LL;
+    SC = Mips::SC;
     AND = Mips::AND;
     NOR = Mips::NOR;
     ZERO = Mips::ZERO;
     BEQ = Mips::BEQ;
   }
   else {
-    LL = IsN64 ? Mips::LLD_P8 : Mips::LLD;
-    SC = IsN64 ? Mips::SCD_P8 : Mips::SCD;
+    LL = Mips::LLD;
+    SC = Mips::SCD;
     AND = Mips::AND64;
     NOR = Mips::NOR64;
     ZERO = Mips::ZERO_64;
@@ -958,8 +920,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
 
   // Transfer the remainder of BB and its successor edges to exitMBB.
   exitMBB->splice(exitMBB->begin(), BB,
-                  llvm::next(MachineBasicBlock::iterator(MI)),
-                  BB->end());
+                  llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
   //  thisMBB:
@@ -990,7 +951,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
   BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
 
-  MI->eraseFromParent();   // The instruction is gone now.
+  MI->eraseFromParent(); // The instruction is gone now.
 
   return exitMBB;
 }
@@ -1001,15 +962,13 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
                                              unsigned Size, unsigned BinOpcode,
                                              bool Nand) const {
   assert((Size == 1 || Size == 2) &&
-      "Unsupported size for EmitAtomicBinaryPartial.");
+         "Unsupported size for EmitAtomicBinaryPartial.");
 
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
-  unsigned LL = IsN64 ? Mips::LL_P8 : Mips::LL;
-  unsigned SC = IsN64 ? Mips::SC_P8 : Mips::SC;
 
   unsigned Dest = MI->getOperand(0).getReg();
   unsigned Ptr = MI->getOperand(1).getReg();
@@ -1106,7 +1065,7 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
   //   beq     success,$0,loopMBB
 
   BB = loopMBB;
-  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
   if (Nand) {
     //  and andres, oldval, incr2
     //  nor binopres, $0, andres
@@ -1120,7 +1079,7 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
     //  and newval, binopres, mask
     BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
     BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
-  } else {// atomic.swap
+  } else { // atomic.swap
     //  and newval, incr2, mask
     BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
   }
@@ -1129,7 +1088,7 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
     .addReg(OldVal).addReg(Mask2);
   BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
     .addReg(MaskedOldVal0).addReg(NewVal);
-  BuildMI(BB, DL, TII->get(SC), Success)
+  BuildMI(BB, DL, TII->get(Mips::SC), Success)
     .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, DL, TII->get(Mips::BEQ))
     .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
@@ -1151,15 +1110,14 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
   BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
       .addReg(SllRes).addImm(ShiftImm);
 
-  MI->eraseFromParent();   // The instruction is gone now.
+  MI->eraseFromParent(); // The instruction is gone now.
 
   return exitMBB;
 }
 
-MachineBasicBlock *
-MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
-                                      MachineBasicBlock *BB,
-                                      unsigned Size) const {
+MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
+                                                          MachineBasicBlock *BB,
+                                                          unsigned Size) const {
   assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
 
   MachineFunction *MF = BB->getParent();
@@ -1170,15 +1128,14 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
   unsigned LL, SC, ZERO, BNE, BEQ;
 
   if (Size == 4) {
-    LL = IsN64 ? Mips::LL_P8 : Mips::LL;
-    SC = IsN64 ? Mips::SC_P8 : Mips::SC;
+    LL = Mips::LL;
+    SC = Mips::SC;
     ZERO = Mips::ZERO;
     BNE = Mips::BNE;
     BEQ = Mips::BEQ;
-  }
-  else {
-    LL = IsN64 ? Mips::LLD_P8 : Mips::LLD;
-    SC = IsN64 ? Mips::SCD_P8 : Mips::SCD;
+  } else {
+    LL = Mips::LLD;
+    SC = Mips::SCD;
     ZERO = Mips::ZERO_64;
     BNE = Mips::BNE64;
     BEQ = Mips::BEQ64;
@@ -1233,7 +1190,7 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
   BuildMI(BB, DL, TII->get(BEQ))
     .addReg(Success).addReg(ZERO).addMBB(loop1MBB);
 
-  MI->eraseFromParent();   // The instruction is gone now.
+  MI->eraseFromParent(); // The instruction is gone now.
 
   return exitMBB;
 }
@@ -1250,8 +1207,6 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
   const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
-  unsigned LL = IsN64 ? Mips::LL_P8 : Mips::LL;
-  unsigned SC = IsN64 ? Mips::SC_P8 : Mips::SC;
 
   unsigned Dest    = MI->getOperand(0).getReg();
   unsigned Ptr     = MI->getOperand(1).getReg();
@@ -1348,7 +1303,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
   //    and     maskedoldval0,oldval,mask
   //    bne     maskedoldval0,shiftedcmpval,sinkMBB
   BB = loop1MBB;
-  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
     .addReg(OldVal).addReg(Mask);
   BuildMI(BB, DL, TII->get(Mips::BNE))
@@ -1364,7 +1319,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
     .addReg(OldVal).addReg(Mask2);
   BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
     .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
-  BuildMI(BB, DL, TII->get(SC), Success)
+  BuildMI(BB, DL, TII->get(Mips::SC), Success)
       .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, DL, TII->get(Mips::BEQ))
       .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
@@ -1421,9 +1376,7 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BRIND, DL, MVT::Other, Chain, Addr);
 }
 
-SDValue MipsTargetLowering::
-lowerBRCOND(SDValue Op, SelectionDAG &DAG) const
-{
+SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   // The first operand is the chain, the second is the condition, the third is
   // the block to branch to if the condition is true.
   SDValue Chain = Op.getOperand(0);
@@ -1489,7 +1442,9 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
   // FIXME there isn't actually debug info here
   SDLoc DL(Op);
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  EVT Ty = Op.getValueType();
+  GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = N->getGlobal();
 
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
     const MipsTargetObjectFile &TLOF =
@@ -1506,26 +1461,31 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
     }
 
     // %hi/%lo relocation
-    return getAddrNonPIC(Op, DAG);
+    return getAddrNonPIC(N, Ty, DAG);
   }
 
   if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
-    return getAddrLocal(Op, DAG, HasMips64);
+    return getAddrLocal(N, Ty, DAG, HasMips64);
 
   if (LargeGOT)
-    return getAddrGlobalLargeGOT(Op, DAG, MipsII::MO_GOT_HI16,
-                                 MipsII::MO_GOT_LO16);
+    return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16,
+                                 MipsII::MO_GOT_LO16, DAG.getEntryNode(),
+                                 MachinePointerInfo::getGOT());
 
-  return getAddrGlobal(Op, DAG,
-                       HasMips64 ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16);
+  return getAddrGlobal(N, Ty, DAG,
+                       HasMips64 ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16,
+                       DAG.getEntryNode(), MachinePointerInfo::getGOT());
 }
 
 SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
+  BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
+  EVT Ty = Op.getValueType();
+
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
-    return getAddrNonPIC(Op, DAG);
+    return getAddrNonPIC(N, Ty, DAG);
 
-  return getAddrLocal(Op, DAG, HasMips64);
+  return getAddrLocal(N, Ty, DAG, HasMips64);
 }
 
 SDValue MipsTargetLowering::
@@ -1612,10 +1572,13 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
 SDValue MipsTargetLowering::
 lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
 {
+  JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
+  EVT Ty = Op.getValueType();
+
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
-    return getAddrNonPIC(Op, DAG);
+    return getAddrNonPIC(N, Ty, DAG);
 
-  return getAddrLocal(Op, DAG, HasMips64);
+  return getAddrLocal(N, Ty, DAG, HasMips64);
 }
 
 SDValue MipsTargetLowering::
@@ -1630,11 +1593,13 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
   //  SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, MVT::i32, CP);
   //  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
   //  ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode);
+  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+  EVT Ty = Op.getValueType();
 
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
-    return getAddrNonPIC(Op, DAG);
+    return getAddrNonPIC(N, Ty, DAG);
 
-  return getAddrLocal(Op, DAG, HasMips64);
+  return getAddrLocal(N, Ty, DAG, HasMips64);
 }
 
 SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -1652,7 +1617,8 @@ SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
                       MachinePointerInfo(SV), false, false, 0);
 }
 
-static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG,
+                                bool HasExtractInsert) {
   EVT TyX = Op.getOperand(0).getValueType();
   EVT TyY = Op.getOperand(1).getValueType();
   SDValue Const1 = DAG.getConstant(1, MVT::i32);
@@ -1671,7 +1637,7 @@ static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
     DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(1),
                 Const1);
 
-  if (HasR2) {
+  if (HasExtractInsert) {
     // ext  E, Y, 31, 1  ; extract bit31 of Y
     // ins  X, E, 31, 1  ; insert extracted bit at bit31 of X
     SDValue E = DAG.getNode(MipsISD::Ext, DL, MVT::i32, Y, Const31, Const1);
@@ -1697,7 +1663,8 @@ static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
 }
 
-static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG,
+                                bool HasExtractInsert) {
   unsigned WidthX = Op.getOperand(0).getValueSizeInBits();
   unsigned WidthY = Op.getOperand(1).getValueSizeInBits();
   EVT TyX = MVT::getIntegerVT(WidthX), TyY = MVT::getIntegerVT(WidthY);
@@ -1708,7 +1675,7 @@ static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   SDValue X = DAG.getNode(ISD::BITCAST, DL, TyX, Op.getOperand(0));
   SDValue Y = DAG.getNode(ISD::BITCAST, DL, TyY, Op.getOperand(1));
 
-  if (HasR2) {
+  if (HasExtractInsert) {
     // ext  E, Y, width(Y) - 1, 1  ; extract bit width(Y)-1 of Y
     // ins  X, E, width(X) - 1, 1  ; insert extracted bit at bit width(X)-1 of X
     SDValue E = DAG.getNode(MipsISD::Ext, DL, TyY, Y,
@@ -1748,12 +1715,13 @@ static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
 SDValue
 MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   if (Subtarget->hasMips64())
-    return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasMips32r2());
+    return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasExtractInsert());
 
-  return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasMips32r2());
+  return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasExtractInsert());
 }
 
-static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG,
+                           bool HasExtractInsert) {
   SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
   SDLoc DL(Op);
 
@@ -1765,7 +1733,7 @@ static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
                 Const1);
 
   // Clear MSB.
-  if (HasR2)
+  if (HasExtractInsert)
     Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32,
                       DAG.getRegister(Mips::ZERO, MVT::i32),
                       DAG.getConstant(31, MVT::i32), Const1, X);
@@ -1782,7 +1750,8 @@ static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
 }
 
-static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG,
+                           bool HasExtractInsert) {
   SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
   SDLoc DL(Op);
 
@@ -1790,7 +1759,7 @@ static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0));
 
   // Clear MSB.
-  if (HasR2)
+  if (HasExtractInsert)
     Res = DAG.getNode(MipsISD::Ins, DL, MVT::i64,
                       DAG.getRegister(Mips::ZERO_64, MVT::i64),
                       DAG.getConstant(63, MVT::i32), Const1, X);
@@ -1805,9 +1774,9 @@ static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
 SDValue
 MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const {
   if (Subtarget->hasMips64() && (Op.getValueType() == MVT::f64))
-    return lowerFABS64(Op, DAG, Subtarget->hasMips32r2());
+    return lowerFABS64(Op, DAG, Subtarget->hasExtractInsert());
 
-  return lowerFABS32(Op, DAG, Subtarget->hasMips32r2());
+  return lowerFABS32(Op, DAG, Subtarget->hasExtractInsert());
 }
 
 SDValue MipsTargetLowering::
@@ -2152,21 +2121,14 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
 //  For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
 //===----------------------------------------------------------------------===//
 
-static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
-                       MVT LocVT, CCValAssign::LocInfo LocInfo,
-                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                       CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                       CCState &State, const uint16_t *F64Regs) {
 
-  static const unsigned IntRegsSize=4, FloatRegsSize=2;
+  static const unsigned IntRegsSize = 4, FloatRegsSize = 2;
 
-  static const uint16_t IntRegs[] = {
-      Mips::A0, Mips::A1, Mips::A2, Mips::A3
-  };
-  static const uint16_t F32Regs[] = {
-      Mips::F12, Mips::F14
-  };
-  static const uint16_t F64Regs[] = {
-      Mips::D6, Mips::D7
-  };
+  static const uint16_t IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
+  static const uint16_t F32Regs[] = { Mips::F12, Mips::F14 };
 
   // Do not process byval args here.
   if (ArgFlags.isByVal())
@@ -2235,14 +2197,28 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
   return false;
 }
 
+static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const uint16_t F64Regs[] = { Mips::D6, Mips::D7 };
+
+  return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
+static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const uint16_t F64Regs[] = { Mips::D12_64, Mips::D14_64 };
+
+  return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
 #include "MipsGenCallingConv.inc"
 
 //===----------------------------------------------------------------------===//
 //                  Call Calling Convention Implementation
 //===----------------------------------------------------------------------===//
 
-static const unsigned O32IntRegsSize = 4;
-
 // Return next O32 integer argument register.
 static unsigned getNextIntArgReg(unsigned Reg) {
   assert((Reg == Mips::A0) || (Reg == Mips::A2));
@@ -2339,6 +2315,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
   bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
 
   // Analyze operands of the call, assigning locations to each operand.
@@ -2347,10 +2324,11 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                  getTargetMachine(), ArgLocs, *DAG.getContext());
   MipsCC::SpecialCallingConvType SpecialCallingConv =
     getSpecialCallingConv(Callee);
-  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo, SpecialCallingConv);
+  MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo,
+                    SpecialCallingConv);
 
   MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
-                                 getTargetMachine().Options.UseSoftFloat,
+                                 Subtarget->mipsSEUsesSoftFloat(),
                                  Callee.getNode(), CLI.Args);
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -2467,32 +2445,40 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   bool IsPICCall = (IsN64 || IsPIC); // true if calls are translated to jalr $25
   bool GlobalOrExternal = false, InternalLinkage = false;
   SDValue CalleeLo;
+  EVT Ty = Callee.getValueType();
 
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     if (IsPICCall) {
-      InternalLinkage = G->getGlobal()->hasInternalLinkage();
+      const GlobalValue *Val = G->getGlobal();
+      InternalLinkage = Val->hasInternalLinkage();
 
       if (InternalLinkage)
-        Callee = getAddrLocal(Callee, DAG, HasMips64);
+        Callee = getAddrLocal(G, Ty, DAG, HasMips64);
       else if (LargeGOT)
-        Callee = getAddrGlobalLargeGOT(Callee, DAG, MipsII::MO_CALL_HI16,
-                                       MipsII::MO_CALL_LO16);
+        Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16,
+                                       MipsII::MO_CALL_LO16, Chain,
+                                       FuncInfo->callPtrInfo(Val));
       else
-        Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_CALL);
+        Callee = getAddrGlobal(G, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+                               FuncInfo->callPtrInfo(Val));
     } else
       Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
                                           MipsII::MO_NO_FLAG);
     GlobalOrExternal = true;
   }
   else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const char *Sym = S->getSymbol();
+
     if (!IsN64 && !IsPIC) // !N64 && static
-      Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
+      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(),
                                             MipsII::MO_NO_FLAG);
     else if (LargeGOT)
-      Callee = getAddrGlobalLargeGOT(Callee, DAG, MipsII::MO_CALL_HI16,
-                                     MipsII::MO_CALL_LO16);
+      Callee = getAddrGlobalLargeGOT(S, Ty, DAG, MipsII::MO_CALL_HI16,
+                                     MipsII::MO_CALL_LO16, Chain,
+                                     FuncInfo->callPtrInfo(Sym));
     else // N64 || PIC
-      Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_CALL);
+      Callee = getAddrGlobal(S, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+                             FuncInfo->callPtrInfo(Sym));
 
     GlobalOrExternal = true;
   }
@@ -2534,9 +2520,9 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), RVLocs, *DAG.getContext());
-  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+  MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
 
-  MipsCCInfo.analyzeCallResult(Ins, getTargetMachine().Options.UseSoftFloat,
+  MipsCCInfo.analyzeCallResult(Ins, Subtarget->mipsSEUsesSoftFloat(),
                                CallNode, RetTy);
 
   // Copy all of the result registers out of their specified physreg.
@@ -2581,10 +2567,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
-  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+  MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
   Function::const_arg_iterator FuncArg =
     DAG.getMachineFunction().getFunction()->arg_begin();
-  bool UseSoftFloat = getTargetMachine().Options.UseSoftFloat;
+  bool UseSoftFloat = Subtarget->mipsSEUsesSoftFloat();
 
   MipsCCInfo.analyzeFormalArguments(Ins, UseSoftFloat, FuncArg);
   MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
@@ -2613,21 +2599,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
     // Arguments stored on registers
     if (IsRegLoc) {
-      EVT RegVT = VA.getLocVT();
+      MVT RegVT = VA.getLocVT();
       unsigned ArgReg = VA.getLocReg();
-      const TargetRegisterClass *RC;
-
-      if (RegVT == MVT::i32)
-        RC = Subtarget->inMips16Mode()? &Mips::CPU16RegsRegClass :
-                                        &Mips::GPR32RegClass;
-      else if (RegVT == MVT::i64)
-        RC = &Mips::GPR64RegClass;
-      else if (RegVT == MVT::f32)
-        RC = &Mips::FGR32RegClass;
-      else if (RegVT == MVT::f64)
-        RC = HasMips64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
-      else
-        llvm_unreachable("RegVT not supported by FormalArguments Lowering");
+      const TargetRegisterClass *RC = getRegClassFor(RegVT);
 
       // Transform the arguments stored on
       // physical registers into virtual ones
@@ -2677,9 +2651,11 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
       // Create load nodes to retrieve arguments from the stack
       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-      InVals.push_back(DAG.getLoad(ValVT, DL, Chain, FIN,
-                                   MachinePointerInfo::getFixedStack(FI),
-                                   false, false, false, 0));
+      SDValue Load = DAG.getLoad(ValVT, DL, Chain, FIN,
+                                 MachinePointerInfo::getFixedStack(FI),
+                                 false, false, false, 0);
+      InVals.push_back(Load);
+      OutChains.push_back(Load.getValue(1));
     }
   }
 
@@ -2740,10 +2716,10 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
   // CCState - Info about the registers and stack slot.
   CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs,
                  *DAG.getContext());
-  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+  MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
 
   // Analyze return values.
-  MipsCCInfo.analyzeReturn(Outs, getTargetMachine().Options.UseSoftFloat,
+  MipsCCInfo.analyzeReturn(Outs, Subtarget->mipsSEUsesSoftFloat(),
                            MF.getFunction()->getReturnType());
 
   SDValue Flag;
@@ -2802,7 +2778,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
 MipsTargetLowering::ConstraintType MipsTargetLowering::
 getConstraintType(const std::string &Constraint) const
 {
-  // Mips specific constrainy
+  // Mips specific constraints
   // GCC config/mips/constraints.md
   //
   // 'd' : An address register. Equivalent to r
@@ -2853,16 +2829,19 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
     if (type->isIntegerTy())
       weight = CW_Register;
     break;
-  case 'f':
-    if (type->isFloatTy())
+  case 'f': // FPU or MSA register
+    if (Subtarget->hasMSA() && type->isVectorTy() &&
+        cast<VectorType>(type)->getBitWidth() == 128)
+      weight = CW_Register;
+    else if (type->isFloatTy())
       weight = CW_Register;
     break;
   case 'c': // $25 for indirect jumps
   case 'l': // lo register
   case 'x': // hilo register pair
-      if (type->isIntegerTy())
+    if (type->isIntegerTy())
       weight = CW_SpecificReg;
-      break;
+    break;
   case 'I': // signed 16 bit immediate
   case 'J': // integer zero
   case 'K': // unsigned 16 bit immediate
@@ -2880,6 +2859,104 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
   return weight;
 }
 
+/// This is a helper function to parse a physical register string and split it
+/// into non-numeric and numeric parts (Prefix and Reg). The first boolean flag
+/// that is returned indicates whether parsing was successful. The second flag
+/// is true if the numeric part exists.
+static std::pair<bool, bool>
+parsePhysicalReg(const StringRef &C, std::string &Prefix,
+                 unsigned long long &Reg) {
+  if (C.front() != '{' || C.back() != '}')
+    return std::make_pair(false, false);
+
+  // Search for the first numeric character.
+  StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1;
+  I = std::find_if(B, E, std::ptr_fun(isdigit));
+
+  Prefix.assign(B, I - B);
+
+  // The second flag is set to false if no numeric characters were found.
+  if (I == E)
+    return std::make_pair(true, false);
+
+  // Parse the numeric characters.
+  return std::make_pair(!getAsUnsignedInteger(StringRef(I, E - I), 10, Reg),
+                        true);
+}
+
+std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
+parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const {
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterClass *RC;
+  std::string Prefix;
+  unsigned long long Reg;
+
+  std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg);
+
+  if (!R.first)
+    return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+
+  if ((Prefix == "hi" || Prefix == "lo")) { // Parse hi/lo.
+    // No numeric characters follow "hi" or "lo".
+    if (R.second)
+      return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+
+    RC = TRI->getRegClass(Prefix == "hi" ?
+                          Mips::HI32RegClassID : Mips::LO32RegClassID);
+    return std::make_pair(*(RC->begin()), RC);
+  } else if (Prefix.compare(0, 4, "$msa") == 0) {
+    // Parse $msa(ir|csr|access|save|modify|request|map|unmap)
+
+    // No numeric characters follow the name.
+    if (R.second)
+      return std::make_pair((unsigned)0, (const TargetRegisterClass *)0);
+
+    Reg = StringSwitch<unsigned long long>(Prefix)
+              .Case("$msair", Mips::MSAIR)
+              .Case("$msacsr", Mips::MSACSR)
+              .Case("$msaaccess", Mips::MSAAccess)
+              .Case("$msasave", Mips::MSASave)
+              .Case("$msamodify", Mips::MSAModify)
+              .Case("$msarequest", Mips::MSARequest)
+              .Case("$msamap", Mips::MSAMap)
+              .Case("$msaunmap", Mips::MSAUnmap)
+              .Default(0);
+
+    if (!Reg)
+      return std::make_pair((unsigned)0, (const TargetRegisterClass *)0);
+
+    RC = TRI->getRegClass(Mips::MSACtrlRegClassID);
+    return std::make_pair(Reg, RC);
+  }
+
+  if (!R.second)
+    return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+
+  if (Prefix == "$f") { // Parse $f0-$f31.
+    // If the size of FP registers is 64-bit or Reg is an even number, select
+    // the 64-bit register class. Otherwise, select the 32-bit register class.
+    if (VT == MVT::Other)
+      VT = (Subtarget->isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32;
+
+    RC = getRegClassFor(VT);
+
+    if (RC == &Mips::AFGR64RegClass) {
+      assert(Reg % 2 == 0);
+      Reg >>= 1;
+    }
+  } else if (Prefix == "$fcc") // Parse $fcc0-$fcc7.
+    RC = TRI->getRegClass(Mips::FCCRegClassID);
+  else if (Prefix == "$w") { // Parse $w0-$w31.
+    RC = getRegClassFor((VT == MVT::Other) ? MVT::v16i8 : VT);
+  } else { // Parse $0-$31.
+    assert(Prefix == "$");
+    RC = getRegClassFor((VT == MVT::Other) ? MVT::i32 : VT);
+  }
+
+  assert(Reg < RC->getNumRegs());
+  return std::make_pair(*(RC->begin() + Reg), RC);
+}
+
 /// Given a register class constraint, like 'r', if this corresponds directly
 /// to an LLVM register class, return a register of 0 and the register class
 /// pointer.
@@ -2902,10 +2979,18 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
         return std::make_pair(0U, &Mips::GPR64RegClass);
       // This will generate an error message
       return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
-    case 'f':
-      if (VT == MVT::f32)
+    case 'f': // FPU or MSA register
+      if (VT == MVT::v16i8)
+        return std::make_pair(0U, &Mips::MSA128BRegClass);
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
+        return std::make_pair(0U, &Mips::MSA128HRegClass);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return std::make_pair(0U, &Mips::MSA128WRegClass);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return std::make_pair(0U, &Mips::MSA128DRegClass);
+      else if (VT == MVT::f32)
         return std::make_pair(0U, &Mips::FGR32RegClass);
-      if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
+      else if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
         if (Subtarget->isFP64bit())
           return std::make_pair(0U, &Mips::FGR64RegClass);
         return std::make_pair(0U, &Mips::AFGR64RegClass);
@@ -2918,14 +3003,21 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
       return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass);
     case 'l': // register suitable for indirect jump
       if (VT == MVT::i32)
-        return std::make_pair((unsigned)Mips::LO, &Mips::LORegsRegClass);
-      return std::make_pair((unsigned)Mips::LO64, &Mips::LORegs64RegClass);
+        return std::make_pair((unsigned)Mips::LO0, &Mips::LO32RegClass);
+      return std::make_pair((unsigned)Mips::LO0_64, &Mips::LO64RegClass);
     case 'x': // register suitable for indirect jump
       // Fixme: Not triggering the use of both hi and low
       // This will generate an error message
       return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
     }
   }
+
+  std::pair<unsigned, const TargetRegisterClass *> R;
+  R = parseRegForInlineAsmConstraint(Constraint, VT);
+
+  if (R.second)
+    return R;
+
   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
 }
 
@@ -3024,8 +3116,8 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
-bool
-MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM, Type *Ty) const {
+bool MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                               Type *Ty) const {
   // No global is ever allowed as a base.
   if (AM.BaseGV)
     return false;
@@ -3089,13 +3181,13 @@ static bool isF128SoftLibCall(const char *CallSym) {
      "log10l", "log2l", "logl", "nearbyintl", "powl", "rintl", "sinl", "sqrtl",
      "truncl"};
 
-  const char * const *End = LibCalls + array_lengthof(LibCalls);
+  const char *const *End = LibCalls + array_lengthof(LibCalls);
 
   // Check that LibCalls is sorted alphabetically.
   MipsTargetLowering::LTStr Comp;
 
 #ifndef NDEBUG
-  for (const char * const *I = LibCalls; I < End - 1; ++I)
+  for (const char *const *I = LibCalls; I < End - 1; ++I)
     assert(Comp(*I, *(I + 1)));
 #endif
 
@@ -3133,9 +3225,9 @@ MipsTargetLowering::MipsCC::SpecialCallingConvType
 }
 
 MipsTargetLowering::MipsCC::MipsCC(
-  CallingConv::ID CC, bool IsO32_, CCState &Info,
-    MipsCC::SpecialCallingConvType SpecialCallingConv_)
-  : CCInfo(Info), CallConv(CC), IsO32(IsO32_),
+  CallingConv::ID CC, bool IsO32_, bool IsFP64_, CCState &Info,
+  MipsCC::SpecialCallingConvType SpecialCallingConv_)
+  : CCInfo(Info), CallConv(CC), IsO32(IsO32_), IsFP64(IsFP64_),
     SpecialCallingConv(SpecialCallingConv_){
   // Pre-allocate reserved argument area.
   CCInfo.AllocateStack(reservedArgArea(), 1);
@@ -3249,11 +3341,10 @@ analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsSoftFloat,
   analyzeReturn(Outs, IsSoftFloat, 0, RetTy);
 }
 
-void
-MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
-                                           MVT LocVT,
-                                           CCValAssign::LocInfo LocInfo,
-                                           ISD::ArgFlagsTy ArgFlags) {
+void MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
+                                                MVT LocVT,
+                                                CCValAssign::LocInfo LocInfo,
+                                                ISD::ArgFlagsTy ArgFlags) {
   assert(ArgFlags.getByValSize() && "Byval argument's size shouldn't be 0.");
 
   struct ByValArgInfo ByVal;
@@ -3291,11 +3382,11 @@ llvm::CCAssignFn *MipsTargetLowering::MipsCC::fixedArgFn() const {
 
   if (SpecialCallingConv == Mips16RetHelperConv)
     return CC_Mips16RetHelper;
-  return IsO32 ? CC_MipsO32 : CC_MipsN;
+  return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN;
 }
 
 llvm::CCAssignFn *MipsTargetLowering::MipsCC::varArgFn() const {
-  return IsO32 ? CC_MipsO32 : CC_MipsN_VarArg;
+  return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN_VarArg;
 }
 
 const uint16_t *MipsTargetLowering::MipsCC::shadowRegs() const {
@@ -3473,17 +3564,15 @@ passByValArg(SDValue Chain, SDLoc DL,
                             DAG.getConstant(Offset, PtrTy));
   SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
                             DAG.getIntPtrConstant(ByVal.Address));
-  Chain = DAG.getMemcpy(Chain, DL, Dst, Src,
-                        DAG.getConstant(MemCpySize, PtrTy), Alignment,
-                        /*isVolatile=*/false, /*AlwaysInline=*/false,
+  Chain = DAG.getMemcpy(Chain, DL, Dst, Src, DAG.getConstant(MemCpySize, PtrTy),
+                        Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
                         MachinePointerInfo(0), MachinePointerInfo(0));
   MemOpChains.push_back(Chain);
 }
 
-void
-MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
-                                    const MipsCC &CC, SDValue Chain,
-                                    SDLoc DL, SelectionDAG &DAG) const {
+void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
+                                         const MipsCC &CC, SDValue Chain,
+                                         SDLoc DL, SelectionDAG &DAG) const {
   unsigned NumRegs = CC.numIntArgRegs();
   const uint16_t *ArgRegs = CC.intArgRegs();
   const CCState &CCInfo = CC.getCCInfo();
@@ -3501,8 +3590,7 @@ MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
   if (NumRegs == Idx)
     VaArgOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), RegSize);
   else
-    VaArgOffset =
-      (int)CC.reservedArgArea() - (int)(RegSize * (NumRegs - Idx));
+    VaArgOffset = (int)CC.reservedArgArea() - (int)(RegSize * (NumRegs - Idx));
 
   // Record the frame index of the first variable argument
   // which is a value necessary to VASTART.
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 123a2a6..65f68f0 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -17,6 +17,7 @@
 
 #include "Mips.h"
 #include "MipsSubtarget.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/Function.h"
@@ -69,10 +70,11 @@ namespace llvm {
       EH_RETURN,
 
       // Node used to extract integer from accumulator.
-      ExtractLOHI,
+      MFHI,
+      MFLO,
 
       // Node used to insert integers to accumulator.
-      InsertLOHI,
+      MTLOHI,
 
       // Mult nodes.
       Mult,
@@ -152,6 +154,43 @@ namespace llvm {
       SETCC_DSP,
       SELECT_CC_DSP,
 
+      // Vector comparisons.
+      // These take a vector and return a boolean.
+      VALL_ZERO,
+      VANY_ZERO,
+      VALL_NONZERO,
+      VANY_NONZERO,
+
+      // These take a vector and return a vector bitmask.
+      VCEQ,
+      VCLE_S,
+      VCLE_U,
+      VCLT_S,
+      VCLT_U,
+
+      // Element-wise vector max/min.
+      VSMAX,
+      VSMIN,
+      VUMAX,
+      VUMIN,
+
+      // Vector Shuffle with mask as an operand
+      VSHF,  // Generic shuffle
+      SHF,   // 4-element set shuffle.
+      ILVEV, // Interleave even elements
+      ILVOD, // Interleave odd elements
+      ILVL,  // Interleave left elements
+      ILVR,  // Interleave right elements
+      PCKEV, // Pack even elements
+      PCKOD, // Pack odd elements
+
+      // Combined (XOR (OR $a, $b), -1)
+      VNOR,
+
+      // Extended vector element extraction
+      VEXTRACT_SEXT_ELT,
+      VEXTRACT_ZEXT_ELT,
+
       // Load/Store Left/Right nodes.
       LWL = ISD::FIRST_TARGET_MEMORY_OPCODE,
       LWR,
@@ -211,12 +250,72 @@ namespace llvm {
   protected:
     SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
 
-    SDValue getAddrLocal(SDValue Op, SelectionDAG &DAG, bool HasMips64) const;
-
-    SDValue getAddrGlobal(SDValue Op, SelectionDAG &DAG, unsigned Flag) const;
-
-    SDValue getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
-                                  unsigned HiFlag, unsigned LoFlag) const;
+    // This method creates the following nodes, which are necessary for
+    // computing a local symbol's address:
+    //
+    // (add (load (wrapper $gp, %got(sym)), %lo(sym))
+    template<class NodeTy>
+    SDValue getAddrLocal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+                         bool HasMips64) const {
+      SDLoc DL(N);
+      unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
+      SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+                                getTargetNode(N, Ty, DAG, GOTFlag));
+      SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
+                                 MachinePointerInfo::getGOT(), false, false,
+                                 false, 0);
+      unsigned LoFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
+      SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty,
+                               getTargetNode(N, Ty, DAG, LoFlag));
+      return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a global symbol's address:
+    //
+    // (load (wrapper $gp, %got(sym)))
+    template<class NodeTy>
+    SDValue getAddrGlobal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag, SDValue Chain,
+                          const MachinePointerInfo &PtrInfo) const {
+      SDLoc DL(N);
+      SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+                                getTargetNode(N, Ty, DAG, Flag));
+      return DAG.getLoad(Ty, DL, Chain, Tgt, PtrInfo, false, false, false, 0);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a global symbol's address in large-GOT mode:
+    //
+    // (load (wrapper (add %hi(sym), $gp), %lo(sym)))
+    template<class NodeTy>
+    SDValue getAddrGlobalLargeGOT(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+                                  unsigned HiFlag, unsigned LoFlag,
+                                  SDValue Chain,
+                                  const MachinePointerInfo &PtrInfo) const {
+      SDLoc DL(N);
+      SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty,
+                               getTargetNode(N, Ty, DAG, HiFlag));
+      Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
+      SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
+                                    getTargetNode(N, Ty, DAG, LoFlag));
+      return DAG.getLoad(Ty, DL, Chain, Wrapper, PtrInfo, false, false, false,
+                         0);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a symbol's address in non-PIC mode:
+    //
+    // (add %hi(sym), %lo(sym))
+    template<class NodeTy>
+    SDValue getAddrNonPIC(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
+      SDLoc DL(N);
+      SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
+      SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
+      return DAG.getNode(ISD::ADD, DL, Ty,
+                         DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
+                         DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+    }
 
     /// This function fills Ops, which is the list of operands that will later
     /// be used when a function call node is created. It also generates
@@ -244,9 +343,8 @@ namespace llvm {
         Mips16RetHelperConv, NoSpecialCallingConv
       };
 
-      MipsCC(
-        CallingConv::ID CallConv, bool IsO32, CCState &Info,
-        SpecialCallingConvType SpecialCallingConv = NoSpecialCallingConv);
+      MipsCC(CallingConv::ID CallConv, bool IsO32, bool IsFP64, CCState &Info,
+             SpecialCallingConvType SpecialCallingConv = NoSpecialCallingConv);
 
 
       void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -319,17 +417,39 @@ namespace llvm {
 
       CCState &CCInfo;
       CallingConv::ID CallConv;
-      bool IsO32;
+      bool IsO32, IsFP64;
       SpecialCallingConvType SpecialCallingConv;
       SmallVector<ByValArgInfo, 2> ByValArgs;
     };
   protected:
+    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
     // Subtarget Info
     const MipsSubtarget *Subtarget;
 
     bool HasMips64, IsN64, IsO32;
 
   private:
+    // Create a TargetGlobalAddress node.
+    SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetExternalSymbol node.
+    SDValue getTargetNode(ExternalSymbolSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetBlockAddress node.
+    SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetJumpTable node.
+    SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetConstantPool node.
+    SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
 
     MipsCC::SpecialCallingConvType getSpecialCallingConv(SDValue Callee) const;
     // Lower Operand helpers
@@ -361,8 +481,6 @@ namespace llvm {
     SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const;
     SDValue lowerShiftRightParts(SDValue Op, SelectionDAG& DAG,
                                  bool IsSRA) const;
-    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
-    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerADD(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
 
@@ -433,6 +551,11 @@ namespace llvm {
     ConstraintWeight getSingleConstraintMatchWeight(
       AsmOperandInfo &info, const char *constraint) const;
 
+    /// This function parses registers that appear in inline-asm constraints.
+    /// It returns pair (0, 0) on failure.
+    std::pair<unsigned, const TargetRegisterClass *>
+    parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const;
+
     std::pair<unsigned, const TargetRegisterClass*>
               getRegForInlineAsmConstraint(const std::string &Constraint,
                                            MVT VT) const;
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index b992e77..9f7ce9a 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -99,9 +99,9 @@ class ADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin, bit IsComm,
 
 multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm,
                   SDPatternOperator OpNode = null_frag> {
-  def _D32 : ADDS_FT<opstr, AFGR64RegsOpnd, Itin, IsComm, OpNode>,
+  def _D32 : ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>,
              Requires<[NotFP64bit, HasStdEnc]>;
-  def _D64 : ADDS_FT<opstr, FGR64RegsOpnd, Itin, IsComm, OpNode>,
+  def _D64 : ADDS_FT<opstr, FGR64Opnd, Itin, IsComm, OpNode>,
              Requires<[IsFP64bit, HasStdEnc]> {
     string DecoderNamespace = "Mips64";
   }
@@ -115,18 +115,18 @@ class ABSS_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
 
 multiclass ABSS_M<string opstr, InstrItinClass Itin,
                   SDPatternOperator OpNode= null_frag> {
-  def _D32 : ABSS_FT<opstr, AFGR64RegsOpnd, AFGR64RegsOpnd, Itin, OpNode>,
+  def _D32 : ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
              Requires<[NotFP64bit, HasStdEnc]>;
-  def _D64 : ABSS_FT<opstr, FGR64RegsOpnd, FGR64RegsOpnd, Itin, OpNode>,
+  def _D64 : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>,
              Requires<[IsFP64bit, HasStdEnc]> {
     string DecoderNamespace = "Mips64";
   }
 }
 
 multiclass ROUND_M<string opstr, InstrItinClass Itin> {
-  def _D32 : ABSS_FT<opstr, FGR32RegsOpnd, AFGR64RegsOpnd, Itin>,
+  def _D32 : ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>,
              Requires<[NotFP64bit, HasStdEnc]>;
-  def _D64 : ABSS_FT<opstr, FGR32RegsOpnd, FGR64RegsOpnd, Itin>,
+  def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>,
              Requires<[IsFP64bit, HasStdEnc]> {
     let DecoderNamespace = "Mips64";
   }
@@ -143,16 +143,16 @@ class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
          [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR>;
 
 class LW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
-            Operand MemOpnd, SDPatternOperator OpNode= null_frag> :
-  InstSE<(outs RC:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+            SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
          [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI> {
   let DecoderMethod = "DecodeFMem";
   let mayLoad = 1;
 }
 
 class SW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
-            Operand MemOpnd, SDPatternOperator OpNode= null_frag> :
-  InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+            SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
          [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI> {
   let DecoderMethod = "DecodeFMem";
   let mayStore = 1;
@@ -171,19 +171,19 @@ class NMADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
          [(set RC:$fd, (fsub fpimm0, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr)))],
          Itin, FrmFR>;
 
-class LWXC1_FT<string opstr, RegisterOperand DRC, RegisterOperand PRC,
+class LWXC1_FT<string opstr, RegisterOperand DRC,
                InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
-  InstSE<(outs DRC:$fd), (ins PRC:$base, PRC:$index),
+  InstSE<(outs DRC:$fd), (ins PtrRC:$base, PtrRC:$index),
          !strconcat(opstr, "\t$fd, ${index}(${base})"),
-         [(set DRC:$fd, (OpNode (add PRC:$base, PRC:$index)))], Itin, FrmFI> {
+         [(set DRC:$fd, (OpNode (add iPTR:$base, iPTR:$index)))], Itin, FrmFI> {
   let AddedComplexity = 20;
 }
 
-class SWXC1_FT<string opstr, RegisterOperand DRC, RegisterOperand PRC,
+class SWXC1_FT<string opstr, RegisterOperand DRC,
                InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
-  InstSE<(outs), (ins DRC:$fs, PRC:$base, PRC:$index),
+  InstSE<(outs), (ins DRC:$fs, PtrRC:$base, PtrRC:$index),
          !strconcat(opstr, "\t$fs, ${index}(${base})"),
-         [(OpNode DRC:$fs, (add PRC:$base, PRC:$index))], Itin, FrmFI> {
+         [(OpNode DRC:$fs, (add iPTR:$base, iPTR:$index))], Itin, FrmFI> {
   let AddedComplexity = 20;
 }
 
@@ -231,24 +231,24 @@ multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt> {
   def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC>, C_COND_FM<fmt, 15>;
 }
 
-defm S : C_COND_M<"s", FGR32RegsOpnd, 16>;
-defm D32 : C_COND_M<"d", AFGR64RegsOpnd, 17>,
+defm S : C_COND_M<"s", FGR32Opnd, 16>;
+defm D32 : C_COND_M<"d", AFGR64Opnd, 17>,
                     Requires<[NotFP64bit, HasStdEnc]>;
 let DecoderNamespace = "Mips64" in
-defm D64 : C_COND_M<"d", FGR64RegsOpnd, 17>, Requires<[IsFP64bit, HasStdEnc]>;
+defm D64 : C_COND_M<"d", FGR64Opnd, 17>, Requires<[IsFP64bit, HasStdEnc]>;
 
 //===----------------------------------------------------------------------===//
 // Floating Point Instructions
 //===----------------------------------------------------------------------===//
-def ROUND_W_S  : ABSS_FT<"round.w.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def ROUND_W_S  : ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
                  ABSS_FM<0xc, 16>;
-def TRUNC_W_S  : ABSS_FT<"trunc.w.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def TRUNC_W_S  : ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
                  ABSS_FM<0xd, 16>;
-def CEIL_W_S   : ABSS_FT<"ceil.w.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def CEIL_W_S   : ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
                  ABSS_FM<0xe, 16>;
-def FLOOR_W_S  : ABSS_FT<"floor.w.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def FLOOR_W_S  : ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
                  ABSS_FM<0xf, 16>;
-def CVT_W_S    : ABSS_FT<"cvt.w.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def CVT_W_S    : ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
                  ABSS_FM<0x24, 16>;
 
 defm ROUND_W : ROUND_M<"round.w.d", IIFcvt>, ABSS_FM<0xc, 17>;
@@ -258,71 +258,71 @@ defm FLOOR_W : ROUND_M<"floor.w.d", IIFcvt>, ABSS_FM<0xf, 17>;
 defm CVT_W   : ROUND_M<"cvt.w.d", IIFcvt>, ABSS_FM<0x24, 17>;
 
 let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
-  def ROUND_L_S : ABSS_FT<"round.l.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+  def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
                   ABSS_FM<0x8, 16>;
-  def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+  def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
                     ABSS_FM<0x8, 17>;
-  def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+  def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
                   ABSS_FM<0x9, 16>;
-  def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+  def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
                     ABSS_FM<0x9, 17>;
-  def CEIL_L_S  : ABSS_FT<"ceil.l.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+  def CEIL_L_S  : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
                   ABSS_FM<0xa, 16>;
-  def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+  def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
                    ABSS_FM<0xa, 17>;
-  def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+  def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
                   ABSS_FM<0xb, 16>;
-  def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+  def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
                     ABSS_FM<0xb, 17>;
 }
 
-def CVT_S_W : ABSS_FT<"cvt.s.w", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def CVT_S_W : ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, IIFcvt>,
               ABSS_FM<0x20, 20>;
-def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
               ABSS_FM<0x25, 16>;
-def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
                ABSS_FM<0x25, 17>;
 
 let Predicates = [NotFP64bit, HasStdEnc] in {
-  def CVT_S_D32 : ABSS_FT<"cvt.s.d", FGR32RegsOpnd, AFGR64RegsOpnd, IIFcvt>,
+  def CVT_S_D32 : ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, IIFcvt>,
                   ABSS_FM<0x20, 17>;
-  def CVT_D32_W : ABSS_FT<"cvt.d.w", AFGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+  def CVT_D32_W : ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, IIFcvt>,
                   ABSS_FM<0x21, 20>;
-  def CVT_D32_S : ABSS_FT<"cvt.d.s", AFGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+  def CVT_D32_S : ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, IIFcvt>,
                   ABSS_FM<0x21, 16>;
 }
 
 let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
-  def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+  def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, IIFcvt>,
                   ABSS_FM<0x20, 17>;
-  def CVT_S_L   : ABSS_FT<"cvt.s.l", FGR32RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+  def CVT_S_L   : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, IIFcvt>,
                   ABSS_FM<0x20, 21>;
-  def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+  def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, IIFcvt>,
                   ABSS_FM<0x21, 20>;
-  def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+  def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
                   ABSS_FM<0x21, 16>;
-  def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+  def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, IIFcvt>,
                   ABSS_FM<0x21, 21>;
 }
 
 let isPseudo = 1, isCodeGenOnly = 1 in {
-  def PseudoCVT_S_W : ABSS_FT<"", FGR32RegsOpnd, GPR32Opnd, IIFcvt>;
-  def PseudoCVT_D32_W : ABSS_FT<"", AFGR64RegsOpnd, GPR32Opnd, IIFcvt>;
-  def PseudoCVT_S_L : ABSS_FT<"", FGR64RegsOpnd, GPR64Opnd, IIFcvt>;
-  def PseudoCVT_D64_W : ABSS_FT<"", FGR64RegsOpnd, GPR32Opnd, IIFcvt>;
-  def PseudoCVT_D64_L : ABSS_FT<"", FGR64RegsOpnd, GPR64Opnd, IIFcvt>;
+  def PseudoCVT_S_W : ABSS_FT<"", FGR32Opnd, GPR32Opnd, IIFcvt>;
+  def PseudoCVT_D32_W : ABSS_FT<"", AFGR64Opnd, GPR32Opnd, IIFcvt>;
+  def PseudoCVT_S_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, IIFcvt>;
+  def PseudoCVT_D64_W : ABSS_FT<"", FGR64Opnd, GPR32Opnd, IIFcvt>;
+  def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, IIFcvt>;
 }
 
 let Predicates = [NoNaNsFPMath, HasStdEnc] in {
-  def FABS_S : ABSS_FT<"abs.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt, fabs>,
+  def FABS_S : ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, IIFcvt, fabs>,
                ABSS_FM<0x5, 16>;
-  def FNEG_S : ABSS_FT<"neg.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt, fneg>,
+  def FNEG_S : ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, IIFcvt, fneg>,
                ABSS_FM<0x7, 16>;
   defm FABS : ABSS_M<"abs.d", IIFcvt, fabs>, ABSS_FM<0x5, 17>;
   defm FNEG : ABSS_M<"neg.d", IIFcvt, fneg>, ABSS_FM<0x7, 17>;
 }
 
-def  FSQRT_S : ABSS_FT<"sqrt.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFsqrtSingle,
+def  FSQRT_S : ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, IIFsqrtSingle,
                fsqrt>, ABSS_FM<0x4, 16>;
 defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>;
 
@@ -334,164 +334,134 @@ defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>;
 /// Move Control Registers From/To CPU Registers
 def CFC1 : MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, IIFmove>, MFC1_FM<2>;
 def CTC1 : MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, IIFmove>, MFC1_FM<6>;
-def MFC1 : MFC1_FT<"mfc1", GPR32Opnd, FGR32RegsOpnd, IIFmoveC1, bitconvert>,
+def MFC1 : MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, IIFmoveC1, bitconvert>,
            MFC1_FM<0>;
-def MTC1 : MTC1_FT<"mtc1", FGR32RegsOpnd, GPR32Opnd, IIFmoveC1, bitconvert>,
+def MTC1 : MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, IIFmoveC1, bitconvert>,
            MFC1_FM<4>;
-def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64RegsOpnd, IIFmoveC1,
+def MFHC1 : MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, IIFmoveC1>,
+            MFC1_FM<3>;
+def MTHC1 : MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, IIFmoveC1>,
+            MFC1_FM<7>;
+def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, IIFmoveC1,
             bitconvert>, MFC1_FM<1>;
-def DMTC1 : MTC1_FT<"dmtc1", FGR64RegsOpnd, GPR64Opnd, IIFmoveC1,
+def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, IIFmoveC1,
             bitconvert>, MFC1_FM<5>;
 
-def FMOV_S   : ABSS_FT<"mov.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFmove>,
+def FMOV_S   : ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, IIFmove>,
                ABSS_FM<0x6, 16>;
-def FMOV_D32 : ABSS_FT<"mov.d", AFGR64RegsOpnd, AFGR64RegsOpnd, IIFmove>,
+def FMOV_D32 : ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, IIFmove>,
                ABSS_FM<0x6, 17>, Requires<[NotFP64bit, HasStdEnc]>;
-def FMOV_D64 : ABSS_FT<"mov.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFmove>,
+def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, IIFmove>,
                ABSS_FM<0x6, 17>, Requires<[IsFP64bit, HasStdEnc]> {
                  let DecoderNamespace = "Mips64";
 }
 
 /// Floating Point Memory Instructions
-let Predicates = [IsN64, HasStdEnc], DecoderNamespace = "Mips64" in {
-  def LWC1_P8 : LW_FT<"lwc1", FGR32RegsOpnd, IIFLoad, mem64, load>,
-                LW_FM<0x31>;
-  def SWC1_P8 : SW_FT<"swc1", FGR32RegsOpnd, IIFStore, mem64, store>,
-                LW_FM<0x39>;
-  def LDC164_P8 : LW_FT<"ldc1", FGR64RegsOpnd, IIFLoad, mem64, load>,
-                  LW_FM<0x35> {
-    let isCodeGenOnly =1;
-  }
-  def SDC164_P8 : SW_FT<"sdc1", FGR64RegsOpnd, IIFStore, mem64, store>,
-                  LW_FM<0x3d> {
-    let isCodeGenOnly =1;
-  }
+let Predicates = [HasStdEnc] in {
+  def LWC1 : LW_FT<"lwc1", FGR32Opnd, IIFLoad, load>, LW_FM<0x31>;
+  def SWC1 : SW_FT<"swc1", FGR32Opnd, IIFStore, store>, LW_FM<0x39>;
 }
 
-let Predicates = [NotN64, HasStdEnc] in {
-  def LWC1 : LW_FT<"lwc1", FGR32RegsOpnd, IIFLoad, mem, load>, LW_FM<0x31>;
-  def SWC1 : SW_FT<"swc1", FGR32RegsOpnd, IIFStore, mem, store>, LW_FM<0x39>;
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+  def LDC164 : LW_FT<"ldc1", FGR64Opnd, IIFLoad, load>, LW_FM<0x35>;
+  def SDC164 : SW_FT<"sdc1", FGR64Opnd, IIFStore, store>, LW_FM<0x3d>;
 }
 
-let Predicates = [NotN64, HasMips64, HasStdEnc],
-  DecoderNamespace = "Mips64" in {
-  def LDC164 : LW_FT<"ldc1", FGR64RegsOpnd, IIFLoad, mem, load>, LW_FM<0x35>;
-  def SDC164 : SW_FT<"sdc1", FGR64RegsOpnd, IIFStore, mem, store>, LW_FM<0x3d>;
+let Predicates = [NotFP64bit, HasStdEnc] in {
+  def LDC1 : LW_FT<"ldc1", AFGR64Opnd, IIFLoad, load>, LW_FM<0x35>;
+  def SDC1 : SW_FT<"sdc1", AFGR64Opnd, IIFStore, store>, LW_FM<0x3d>;
 }
 
-let Predicates = [NotN64, NotMips64, HasStdEnc] in {
-  let isPseudo = 1, isCodeGenOnly = 1 in {
-    def PseudoLDC1 : LW_FT<"", AFGR64RegsOpnd, IIFLoad, mem, load>;
-    def PseudoSDC1 : SW_FT<"", AFGR64RegsOpnd, IIFStore, mem, store>;
-  }
-  def LDC1 : LW_FT<"ldc1", AFGR64RegsOpnd, IIFLoad, mem>, LW_FM<0x35>;
-  def SDC1 : SW_FT<"sdc1", AFGR64RegsOpnd, IIFStore, mem>, LW_FM<0x3d>;
+/// Cop2 Memory Instructions
+let Predicates = [HasStdEnc] in {
+  def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>;
+  def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>;
+  def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>;
+  def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>;
 }
 
 // Indexed loads and stores.
 let Predicates = [HasFPIdx, HasStdEnc] in {
-  def LWXC1 : LWXC1_FT<"lwxc1", FGR32RegsOpnd, GPR32Opnd, IIFLoad, load>,
-              LWXC1_FM<0>;
-  def SWXC1 : SWXC1_FT<"swxc1", FGR32RegsOpnd, GPR32Opnd, IIFStore, store>,
-              SWXC1_FM<8>;
-}
-
-let Predicates = [HasMips32r2, NotMips64, HasStdEnc] in {
-  def LDXC1 : LWXC1_FT<"ldxc1", AFGR64RegsOpnd, GPR32Opnd, IIFLoad, load>,
-              LWXC1_FM<1>;
-  def SDXC1 : SWXC1_FT<"sdxc1", AFGR64RegsOpnd, GPR32Opnd, IIFStore, store>,
-              SWXC1_FM<9>;
+  def LWXC1 : LWXC1_FT<"lwxc1", FGR32Opnd, IIFLoad, load>, LWXC1_FM<0>;
+  def SWXC1 : SWXC1_FT<"swxc1", FGR32Opnd, IIFStore, store>, SWXC1_FM<8>;
 }
 
-let Predicates = [HasMips64, NotN64, HasStdEnc], DecoderNamespace="Mips64" in {
-  def LDXC164 : LWXC1_FT<"ldxc1", FGR64RegsOpnd, GPR32Opnd, IIFLoad, load>,
-                LWXC1_FM<1>;
-  def SDXC164 : SWXC1_FT<"sdxc1", FGR64RegsOpnd, GPR32Opnd, IIFStore, store>,
-                SWXC1_FM<9>;
+let Predicates = [HasFPIdx, NotFP64bit, HasStdEnc] in {
+  def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, IIFLoad, load>, LWXC1_FM<1>;
+  def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, IIFStore, store>, SWXC1_FM<9>;
 }
 
-// n64
-let Predicates = [IsN64, HasStdEnc], isCodeGenOnly=1 in {
-  def LWXC1_P8 : LWXC1_FT<"lwxc1", FGR32RegsOpnd, GPR64Opnd, IIFLoad, load>,
-                 LWXC1_FM<0>;
-  def LDXC164_P8 : LWXC1_FT<"ldxc1", FGR64RegsOpnd, GPR64Opnd, IIFLoad,
-                             load>, LWXC1_FM<1>;
-  def SWXC1_P8 : SWXC1_FT<"swxc1", FGR32RegsOpnd, GPR64Opnd, IIFStore,
-                          store>, SWXC1_FM<8>;
-  def SDXC164_P8 : SWXC1_FT<"sdxc1", FGR64RegsOpnd, GPR64Opnd, IIFStore,
-                            store>, SWXC1_FM<9>;
+let Predicates = [HasFPIdx, IsFP64bit, HasStdEnc],
+    DecoderNamespace="Mips64" in {
+  def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, IIFLoad, load>, LWXC1_FM<1>;
+  def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, IIFStore, store>, SWXC1_FM<9>;
 }
 
 // Load/store doubleword indexed unaligned.
-let Predicates = [NotMips64, HasStdEnc] in {
-  def LUXC1 : LWXC1_FT<"luxc1", AFGR64RegsOpnd, GPR32Opnd, IIFLoad>,
-              LWXC1_FM<0x5>;
-  def SUXC1 : SWXC1_FT<"suxc1", AFGR64RegsOpnd, GPR32Opnd, IIFStore>,
-              SWXC1_FM<0xd>;
+let Predicates = [NotFP64bit, HasStdEnc] in {
+  def LUXC1 : LWXC1_FT<"luxc1", AFGR64Opnd, IIFLoad>, LWXC1_FM<0x5>;
+  def SUXC1 : SWXC1_FT<"suxc1", AFGR64Opnd, IIFStore>, SWXC1_FM<0xd>;
 }
 
-let Predicates = [HasMips64, HasStdEnc],
-  DecoderNamespace="Mips64" in {
-  def LUXC164 : LWXC1_FT<"luxc1", FGR64RegsOpnd, GPR32Opnd, IIFLoad>,
-                LWXC1_FM<0x5>;
-  def SUXC164 : SWXC1_FT<"suxc1", FGR64RegsOpnd, GPR32Opnd, IIFStore>,
-                SWXC1_FM<0xd>;
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace="Mips64" in {
+  def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, IIFLoad>, LWXC1_FM<0x5>;
+  def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, IIFStore>, SWXC1_FM<0xd>;
 }
 
 /// Floating-point Aritmetic
-def FADD_S : ADDS_FT<"add.s", FGR32RegsOpnd, IIFadd, 1, fadd>,
+def FADD_S : ADDS_FT<"add.s", FGR32Opnd, IIFadd, 1, fadd>,
              ADDS_FM<0x00, 16>;
 defm FADD :  ADDS_M<"add.d", IIFadd, 1, fadd>, ADDS_FM<0x00, 17>;
-def FDIV_S : ADDS_FT<"div.s", FGR32RegsOpnd, IIFdivSingle, 0, fdiv>,
+def FDIV_S : ADDS_FT<"div.s", FGR32Opnd, IIFdivSingle, 0, fdiv>,
              ADDS_FM<0x03, 16>;
 defm FDIV :  ADDS_M<"div.d", IIFdivDouble, 0, fdiv>, ADDS_FM<0x03, 17>;
-def FMUL_S : ADDS_FT<"mul.s", FGR32RegsOpnd, IIFmulSingle, 1, fmul>,
+def FMUL_S : ADDS_FT<"mul.s", FGR32Opnd, IIFmulSingle, 1, fmul>,
              ADDS_FM<0x02, 16>;
 defm FMUL :  ADDS_M<"mul.d", IIFmulDouble, 1, fmul>, ADDS_FM<0x02, 17>;
-def FSUB_S : ADDS_FT<"sub.s", FGR32RegsOpnd, IIFadd, 0, fsub>,
+def FSUB_S : ADDS_FT<"sub.s", FGR32Opnd, IIFadd, 0, fsub>,
              ADDS_FM<0x01, 16>;
 defm FSUB :  ADDS_M<"sub.d", IIFadd, 0, fsub>, ADDS_FM<0x01, 17>;
 
 let Predicates = [HasMips32r2, HasStdEnc] in {
-  def MADD_S : MADDS_FT<"madd.s", FGR32RegsOpnd, IIFmulSingle, fadd>,
+  def MADD_S : MADDS_FT<"madd.s", FGR32Opnd, IIFmulSingle, fadd>,
                MADDS_FM<4, 0>;
-  def MSUB_S : MADDS_FT<"msub.s", FGR32RegsOpnd, IIFmulSingle, fsub>,
+  def MSUB_S : MADDS_FT<"msub.s", FGR32Opnd, IIFmulSingle, fsub>,
                MADDS_FM<5, 0>;
 }
 
 let Predicates = [HasMips32r2, NoNaNsFPMath, HasStdEnc] in {
-  def NMADD_S : NMADDS_FT<"nmadd.s", FGR32RegsOpnd, IIFmulSingle, fadd>,
+  def NMADD_S : NMADDS_FT<"nmadd.s", FGR32Opnd, IIFmulSingle, fadd>,
                 MADDS_FM<6, 0>;
-  def NMSUB_S : NMADDS_FT<"nmsub.s", FGR32RegsOpnd, IIFmulSingle, fsub>,
+  def NMSUB_S : NMADDS_FT<"nmsub.s", FGR32Opnd, IIFmulSingle, fsub>,
                 MADDS_FM<7, 0>;
 }
 
 let Predicates = [HasMips32r2, NotFP64bit, HasStdEnc] in {
-  def MADD_D32 : MADDS_FT<"madd.d", AFGR64RegsOpnd, IIFmulDouble, fadd>,
+  def MADD_D32 : MADDS_FT<"madd.d", AFGR64Opnd, IIFmulDouble, fadd>,
                  MADDS_FM<4, 1>;
-  def MSUB_D32 : MADDS_FT<"msub.d", AFGR64RegsOpnd, IIFmulDouble, fsub>,
+  def MSUB_D32 : MADDS_FT<"msub.d", AFGR64Opnd, IIFmulDouble, fsub>,
                  MADDS_FM<5, 1>;
 }
 
 let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStdEnc] in {
-  def NMADD_D32 : NMADDS_FT<"nmadd.d", AFGR64RegsOpnd, IIFmulDouble, fadd>,
+  def NMADD_D32 : NMADDS_FT<"nmadd.d", AFGR64Opnd, IIFmulDouble, fadd>,
                   MADDS_FM<6, 1>;
-  def NMSUB_D32 : NMADDS_FT<"nmsub.d", AFGR64RegsOpnd, IIFmulDouble, fsub>,
+  def NMSUB_D32 : NMADDS_FT<"nmsub.d", AFGR64Opnd, IIFmulDouble, fsub>,
                   MADDS_FM<7, 1>;
 }
 
 let Predicates = [HasMips32r2, IsFP64bit, HasStdEnc], isCodeGenOnly=1 in {
-  def MADD_D64 : MADDS_FT<"madd.d", FGR64RegsOpnd, IIFmulDouble, fadd>,
+  def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, IIFmulDouble, fadd>,
                  MADDS_FM<4, 1>;
-  def MSUB_D64 : MADDS_FT<"msub.d", FGR64RegsOpnd, IIFmulDouble, fsub>,
+  def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, IIFmulDouble, fsub>,
                  MADDS_FM<5, 1>;
 }
 
 let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc],
     isCodeGenOnly=1 in {
-  def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64RegsOpnd, IIFmulDouble, fadd>,
+  def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, IIFmulDouble, fadd>,
                   MADDS_FM<6, 1>;
-  def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64RegsOpnd, IIFmulDouble, fsub>,
+  def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, IIFmulDouble, fsub>,
                   MADDS_FM<7, 1>;
 }
 
@@ -542,20 +512,27 @@ def FCMP_D64 : CEQS_FT<"d", FGR64, IIFcmp, MipsFPCmp>, CEQS_FM<17>,
 
 // This pseudo instr gets expanded into 2 mtc1 instrs after register
 // allocation.
-def BuildPairF64 :
-  PseudoSE<(outs AFGR64RegsOpnd:$dst),
-           (ins GPR32Opnd:$lo, GPR32Opnd:$hi),
-           [(set AFGR64RegsOpnd:$dst,
-            (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))]>;
+class BuildPairF64Base<RegisterOperand RO> :
+  PseudoSE<(outs RO:$dst), (ins GPR32Opnd:$lo, GPR32Opnd:$hi),
+           [(set RO:$dst, (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))]>;
+
+def BuildPairF64 : BuildPairF64Base<AFGR64Opnd>,
+                   Requires<[NotFP64bit, HasStdEnc]>;
+def BuildPairF64_64 : BuildPairF64Base<FGR64Opnd>,
+                      Requires<[IsFP64bit, HasStdEnc]>;
 
 // This pseudo instr gets expanded into 2 mfc1 instrs after register
 // allocation.
 // if n is 0, lower part of src is extracted.
 // if n is 1, higher part of src is extracted.
-def ExtractElementF64 :
-  PseudoSE<(outs GPR32Opnd:$dst), (ins AFGR64RegsOpnd:$src, i32imm:$n),
-           [(set GPR32Opnd:$dst,
-            (MipsExtractElementF64 AFGR64RegsOpnd:$src, imm:$n))]>;
+class ExtractElementF64Base<RegisterOperand RO> :
+  PseudoSE<(outs GPR32Opnd:$dst), (ins RO:$src, i32imm:$n),
+           [(set GPR32Opnd:$dst, (MipsExtractElementF64 RO:$src, imm:$n))]>;
+
+def ExtractElementF64 : ExtractElementF64Base<AFGR64Opnd>,
+                        Requires<[NotFP64bit, HasStdEnc]>;
+def ExtractElementF64_64 : ExtractElementF64Base<FGR64Opnd>,
+                           Requires<[IsFP64bit, HasStdEnc]>;
 
 //===----------------------------------------------------------------------===//
 // InstAliases.
@@ -571,18 +548,18 @@ def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
 
 def : MipsPat<(f32 (sint_to_fp GPR32Opnd:$src)),
               (PseudoCVT_S_W GPR32Opnd:$src)>;
-def : MipsPat<(MipsTruncIntFP FGR32RegsOpnd:$src),
-              (TRUNC_W_S FGR32RegsOpnd:$src)>;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+              (TRUNC_W_S FGR32Opnd:$src)>;
 
 let Predicates = [NotFP64bit, HasStdEnc] in {
   def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
                 (PseudoCVT_D32_W GPR32Opnd:$src)>;
-  def : MipsPat<(MipsTruncIntFP AFGR64RegsOpnd:$src),
-                (TRUNC_W_D32 AFGR64RegsOpnd:$src)>;
-  def : MipsPat<(f32 (fround AFGR64RegsOpnd:$src)),
-                (CVT_S_D32 AFGR64RegsOpnd:$src)>;
-  def : MipsPat<(f64 (fextend FGR32RegsOpnd:$src)),
-                (CVT_D32_S FGR32RegsOpnd:$src)>;
+  def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+                (TRUNC_W_D32 AFGR64Opnd:$src)>;
+  def : MipsPat<(f32 (fround AFGR64Opnd:$src)),
+                (CVT_S_D32 AFGR64Opnd:$src)>;
+  def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
+                (CVT_D32_S FGR32Opnd:$src)>;
 }
 
 let Predicates = [IsFP64bit, HasStdEnc] in {
@@ -592,44 +569,37 @@ let Predicates = [IsFP64bit, HasStdEnc] in {
   def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
                 (PseudoCVT_D64_W GPR32Opnd:$src)>;
   def : MipsPat<(f32 (sint_to_fp GPR64Opnd:$src)),
-                (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_32)>;
+                (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_lo)>;
   def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)),
                 (PseudoCVT_D64_L GPR64Opnd:$src)>;
 
-  def : MipsPat<(MipsTruncIntFP FGR64RegsOpnd:$src),
-                (TRUNC_W_D64 FGR64RegsOpnd:$src)>;
-  def : MipsPat<(MipsTruncIntFP FGR32RegsOpnd:$src),
-                (TRUNC_L_S FGR32RegsOpnd:$src)>;
-  def : MipsPat<(MipsTruncIntFP FGR64RegsOpnd:$src),
-                (TRUNC_L_D64 FGR64RegsOpnd:$src)>;
+  def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+                (TRUNC_W_D64 FGR64Opnd:$src)>;
+  def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+                (TRUNC_L_S FGR32Opnd:$src)>;
+  def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+                (TRUNC_L_D64 FGR64Opnd:$src)>;
 
-  def : MipsPat<(f32 (fround FGR64RegsOpnd:$src)),
-                (CVT_S_D64 FGR64RegsOpnd:$src)>;
-  def : MipsPat<(f64 (fextend FGR32RegsOpnd:$src)),
-                (CVT_D64_S FGR32RegsOpnd:$src)>;
+  def : MipsPat<(f32 (fround FGR64Opnd:$src)),
+                (CVT_S_D64 FGR64Opnd:$src)>;
+  def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
+                (CVT_D64_S FGR32Opnd:$src)>;
 }
 
 // Patterns for loads/stores with a reg+imm operand.
 let AddedComplexity = 40 in {
-  let Predicates = [IsN64, HasStdEnc] in {
-    def : LoadRegImmPat<LWC1_P8, f32, load>;
-    def : StoreRegImmPat<SWC1_P8, f32>;
-    def : LoadRegImmPat<LDC164_P8, f64, load>;
-    def : StoreRegImmPat<SDC164_P8, f64>;
-  }
-
-  let Predicates = [NotN64, HasStdEnc] in {
+  let Predicates = [HasStdEnc] in {
     def : LoadRegImmPat<LWC1, f32, load>;
     def : StoreRegImmPat<SWC1, f32>;
   }
 
-  let Predicates = [NotN64, HasMips64, HasStdEnc] in {
+  let Predicates = [IsFP64bit, HasStdEnc] in {
     def : LoadRegImmPat<LDC164, f64, load>;
     def : StoreRegImmPat<SDC164, f64>;
   }
 
-  let Predicates = [NotN64, NotMips64, HasStdEnc] in {
-    def : LoadRegImmPat<PseudoLDC1, f64, load>;
-    def : StoreRegImmPat<PseudoSDC1, f64>;
+  let Predicates = [NotFP64bit, HasStdEnc] in {
+    def : LoadRegImmPat<LDC1, f64, load>;
+    def : StoreRegImmPat<SDC1, f64>;
   }
 }
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 1322784..737a018 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -183,7 +183,7 @@ class BranchBase<bits<6> op, dag outs, dag ins, string asmstr,
 // Format J instruction class in Mips : <|opcode|address|>
 //===----------------------------------------------------------------------===//
 
-class FJ<bits<6> op>
+class FJ<bits<6> op> : StdArch
 {
   bits<26> target;
 
@@ -272,7 +272,7 @@ class SRLV_FM<bits<6> funct, bit rotate> : StdArch {
   let Inst{5-0}   = funct;
 }
 
-class BEQ_FM<bits<6> op> {
+class BEQ_FM<bits<6> op> : StdArch {
   bits<5>  rs;
   bits<5>  rt;
   bits<16> offset;
@@ -285,7 +285,7 @@ class BEQ_FM<bits<6> op> {
   let Inst{15-0}  = offset;
 }
 
-class BGEZ_FM<bits<6> op, bits<5> funct> {
+class BGEZ_FM<bits<6> op, bits<5> funct> : StdArch {
   bits<5>  rs;
   bits<16> offset;
 
@@ -297,17 +297,6 @@ class BGEZ_FM<bits<6> op, bits<5> funct> {
   let Inst{15-0}  = offset;
 }
 
-class B_FM {
-  bits<16> offset;
-
-  bits<32> Inst;
-
-  let Inst{31-26} = 4;
-  let Inst{25-21} = 0;
-  let Inst{20-16} = 0;
-  let Inst{15-0}  = offset;
-}
-
 class SLTI_FM<bits<6> op> : StdArch {
   bits<5> rt;
   bits<5> rs;
@@ -321,7 +310,7 @@ class SLTI_FM<bits<6> op> : StdArch {
   let Inst{15-0}  = imm16;
 }
 
-class MFLO_FM<bits<6> funct> {
+class MFLO_FM<bits<6> funct> : StdArch {
   bits<5> rd;
 
   bits<32> Inst;
@@ -333,7 +322,7 @@ class MFLO_FM<bits<6> funct> {
   let Inst{5-0}   = funct;
 }
 
-class MTLO_FM<bits<6> funct> {
+class MTLO_FM<bits<6> funct> : StdArch {
   bits<5> rs;
 
   bits<32> Inst;
@@ -344,7 +333,7 @@ class MTLO_FM<bits<6> funct> {
   let Inst{5-0}   = funct;
 }
 
-class SEB_FM<bits<5> funct, bits<6> funct2> {
+class SEB_FM<bits<5> funct, bits<6> funct2> : StdArch {
   bits<5> rd;
   bits<5> rt;
 
@@ -358,7 +347,7 @@ class SEB_FM<bits<5> funct, bits<6> funct2> {
   let Inst{5-0}   = funct2;
 }
 
-class CLO_FM<bits<6> funct> {
+class CLO_FM<bits<6> funct> : StdArch {
   bits<5> rd;
   bits<5> rs;
   bits<5> rt;
@@ -374,7 +363,7 @@ class CLO_FM<bits<6> funct> {
   let rt = rd;
 }
 
-class LUI_FM {
+class LUI_FM : StdArch {
   bits<5> rt;
   bits<16> imm16;
 
@@ -386,7 +375,7 @@ class LUI_FM {
   let Inst{15-0}  = imm16;
 }
 
-class JALR_FM {
+class JALR_FM : StdArch {
   bits<5> rd;
   bits<5> rs;
 
@@ -400,7 +389,7 @@ class JALR_FM {
   let Inst{5-0}   = 9;
 }
 
-class BGEZAL_FM<bits<5> funct> {
+class BGEZAL_FM<bits<5> funct> : StdArch {
   bits<5>  rs;
   bits<16> offset;
 
@@ -435,7 +424,7 @@ class MULT_FM<bits<6> op, bits<6> funct> : StdArch {
   let Inst{5-0}   = funct;
 }
 
-class EXT_FM<bits<6> funct> {
+class EXT_FM<bits<6> funct> : StdArch {
   bits<5> rt;
   bits<5> rs;
   bits<5> pos;
@@ -465,7 +454,7 @@ class RDHWR_FM {
   let Inst{5-0}   = 0x3b;
 }
 
-class TEQ_FM<bits<6> funct> {
+class TEQ_FM<bits<6> funct> : StdArch {
   bits<5> rs;
   bits<5> rt;
   bits<10> code_;
@@ -479,6 +468,17 @@ class TEQ_FM<bits<6> funct> {
   let Inst{5-0}   = funct;
 }
 
+class TEQI_FM<bits<5> funct> : StdArch {
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 1;
+  let Inst{25-21} = rs;
+  let Inst{20-16}   = funct;
+  let Inst{15-0}  = imm16;
+}
 //===----------------------------------------------------------------------===//
 //  System calls format <op|code_|funct>
 //===----------------------------------------------------------------------===//
@@ -520,6 +520,24 @@ class ER_FM<bits<6> funct>
   let Inst{5-0}   = funct;
 }
 
+
+//===----------------------------------------------------------------------===//
+//  Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0>
+//===----------------------------------------------------------------------===//
+
+class EI_FM<bits<1> sc>
+{
+  bits<32> Inst;
+  bits<5> rt;
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = 0xb;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = 0xc;
+  let Inst{10-6}  = 0;
+  let Inst{5}     = sc;
+  let Inst{4-0}   = 0;
+}
+
 //===----------------------------------------------------------------------===//
 //
 //  FLOATING POINT INSTRUCTION FORMATS
@@ -701,7 +719,7 @@ class CMov_I_F_FM<bits<6> funct, bits<5> fmt> {
   let Inst{5-0} = funct;
 }
 
-class CMov_F_I_FM<bit tf> {
+class CMov_F_I_FM<bit tf> : StdArch {
   bits<5> rd;
   bits<5> rs;
   bits<3> fcc;
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index eae05a3..0ebad05 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -22,11 +22,14 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "MipsGenInstrInfo.inc"
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void MipsInstrInfo::anchor() {}
+
 MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm, unsigned UncondBr)
   : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
     TM(tm), UncondBrOpc(UncondBr) {}
@@ -219,7 +222,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
 
   // If there is only one terminator instruction, process it.
   if (!SecondLastOpc) {
-    // Unconditional branch
+    // Unconditional branch.
     if (LastOpc == UncondBrOpc) {
       TBB = LastInst->getOperand(0).getMBB();
       return BT_Uncond;
@@ -271,6 +274,10 @@ unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
     const char *AsmStr = MI->getOperand(0).getSymbolName();
     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
   }
+  case Mips::CONSTPOOL_ENTRY:
+    // If this machine instr is a constant pool entry, its size is recorded as
+    // operand #2.
+    return MI->getOperand(2).getImm();
   }
 }
 
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index b6480ef..d9ac961 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -27,6 +27,7 @@
 namespace llvm {
 
 class MipsInstrInfo : public MipsGenInstrInfo {
+  virtual void anchor();
 protected:
   MipsTargetMachine &TM;
   unsigned UncondBrOpc;
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index b9e8895..ebdbaa4 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -23,11 +23,9 @@ def SDT_MipsCMov         : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
                                                 SDTCisInt<4>]>;
 def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
 def SDT_MipsCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-def SDT_ExtractLOHI : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVT<1, untyped>,
-                                           SDTCisVT<2, i32>]>;
-def SDT_InsertLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
-                                          SDTCisVT<1, i32>,
-                                          SDTCisSameAs<1, 2>]>;
+def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>;
+def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+                                      SDTCisInt<1>, SDTCisSameAs<1, 2>]>;
 def SDT_MipsMultDiv : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, SDTCisInt<1>,
                                     SDTCisSameAs<1, 2>]>;
 def SDT_MipsMAddMSub : SDTypeProfile<1, 3,
@@ -86,11 +84,12 @@ def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
                            [SDNPHasChain, SDNPSideEffect,
                             SDNPOptInGlue, SDNPOutGlue]>;
 
-// Node used to extract integer from LO/HI register.
-def ExtractLOHI : SDNode<"MipsISD::ExtractLOHI", SDT_ExtractLOHI>;
+// Nodes used to extract LO/HI registers.
+def MipsMFHI : SDNode<"MipsISD::MFHI", SDT_MFLOHI>;
+def MipsMFLO : SDNode<"MipsISD::MFLO", SDT_MFLOHI>;
 
 // Node used to insert 32-bit integers to LOHI register pair.
-def InsertLOHI : SDNode<"MipsISD::InsertLOHI", SDT_InsertLOHI>;
+def MipsMTLOHI : SDNode<"MipsISD::MTLOHI", SDT_MTLOHI>;
 
 // Mult nodes.
 def MipsMult  : SDNode<"MipsISD::Mult", SDT_MipsMultDiv>;
@@ -115,7 +114,7 @@ def MipsDivRemU16 : SDNode<"MipsISD::DivRemU16", SDT_MipsDivRem16,
 // Wrapper node patterns give the instruction selector a chance to replace
 // target constant nodes that would otherwise remain unchanged with ADDiu
 // nodes. Without these wrapper node patterns, the following conditional move
-// instrucion is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
+// instruction is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
 // compiled:
 //  movn  %got(d)($gp), %got(c)($gp), $4
 // This instruction is illegal since movn can take only register operands.
@@ -182,6 +181,12 @@ def NoNaNsFPMath :    Predicate<"TM.Options.NoNaNsFPMath">,
 def HasStdEnc :       Predicate<"Subtarget.hasStandardEncoding()">,
                       AssemblerPredicate<"!FeatureMips16">;
 def NotDSP :          Predicate<"!Subtarget.hasDSP()">;
+def InMicroMips    :  Predicate<"Subtarget.inMicroMipsMode()">,
+                      AssemblerPredicate<"FeatureMicroMips">;
+def NotInMicroMips :  Predicate<"!Subtarget.inMicroMipsMode()">,
+                      AssemblerPredicate<"!FeatureMicroMips">;
+def IsLE           :  Predicate<"Subtarget.isLittle()">;
+def IsBE           :  Predicate<"!Subtarget.isLittle()">;
 
 class MipsPat<dag pattern, dag result> : Pat<pattern, result> {
   let Predicates = [HasStdEnc];
@@ -242,7 +247,7 @@ def brtarget    : Operand<OtherVT> {
 def calltarget  : Operand<iPTR> {
   let EncoderMethod = "getJumpTargetOpValue";
 }
-def calltarget64: Operand<i64>;
+
 def simm16      : Operand<i32> {
   let DecoderMethod= "DecodeSimm16";
 }
@@ -256,48 +261,67 @@ def uimm20      : Operand<i32> {
 def uimm10      : Operand<i32> {
 }
 
-def simm16_64   : Operand<i64>;
-def shamt       : Operand<i32>;
+def simm16_64   : Operand<i64> {
+  let DecoderMethod = "DecodeSimm16";
+}
 
 // Unsigned Operand
+def uimm5       : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+def uimm6 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
 def uimm16      : Operand<i32> {
   let PrintMethod = "printUnsignedImm";
 }
 
+def pcrel16      : Operand<i32> {
+}
+
 def MipsMemAsmOperand : AsmOperandClass {
   let Name = "Mem";
   let ParserMethod = "parseMemOperand";
 }
 
-// Address operand
-def mem : Operand<i32> {
-  let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops GPR32, simm16);
-  let EncoderMethod = "getMemEncoding";
-  let ParserMatchClass = MipsMemAsmOperand;
-  let OperandType = "OPERAND_MEMORY";
+def MipsInvertedImmoperand : AsmOperandClass {
+  let Name = "InvNum";
+  let RenderMethod = "addImmOperands";
+  let ParserMethod = "parseInvNum";
+}
+
+def PtrRegAsmOperand : AsmOperandClass {
+  let Name = "PtrReg";
+  let ParserMethod = "parsePtrReg";
+}
+
+
+def InvertedImOperand : Operand<i32> {
+  let ParserMatchClass = MipsInvertedImmoperand;
 }
 
-def mem64 : Operand<i64> {
+// Address operand
+def mem : Operand<iPTR> {
   let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops GPR64, simm16_64);
+  let MIOperandInfo = (ops ptr_rc, simm16);
   let EncoderMethod = "getMemEncoding";
   let ParserMatchClass = MipsMemAsmOperand;
   let OperandType = "OPERAND_MEMORY";
 }
 
-def mem_ea : Operand<i32> {
+def mem_ea : Operand<iPTR> {
   let PrintMethod = "printMemOperandEA";
-  let MIOperandInfo = (ops GPR32, simm16);
+  let MIOperandInfo = (ops ptr_rc, simm16);
   let EncoderMethod = "getMemEncoding";
   let OperandType = "OPERAND_MEMORY";
 }
 
-def mem_ea_64 : Operand<i64> {
-  let PrintMethod = "printMemOperandEA";
-  let MIOperandInfo = (ops GPR64, simm16_64);
-  let EncoderMethod = "getMemEncoding";
-  let OperandType = "OPERAND_MEMORY";
+def PtrRC : Operand<iPTR> {
+  let MIOperandInfo = (ops ptr_rc);
+  let DecoderMethod = "DecodePtrRegisterClass";
+  let ParserMatchClass = PtrRegAsmOperand;
 }
 
 // size operand of ext instruction
@@ -370,6 +394,9 @@ def addr :
 def addrRegImm :
   ComplexPattern<iPTR, 2, "selectAddrRegImm", [frameindex]>;
 
+def addrRegReg :
+  ComplexPattern<iPTR, 2, "selectAddrRegReg", [frameindex]>;
+
 def addrDefault :
   ComplexPattern<iPTR, 2, "selectAddrDefault", [frameindex]>;
 
@@ -404,9 +431,9 @@ class ArithLogicI<string opstr, Operand Od, RegisterOperand RO,
 // Arithmetic Multiply ADD/SUB
 class MArithR<string opstr, bit isComm = 0> :
   InstSE<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
-         !strconcat(opstr, "\t$rs, $rt"), [], IIImult, FrmR> {
-  let Defs = [HI, LO];
-  let Uses = [HI, LO];
+         !strconcat(opstr, "\t$rs, $rt"), [], IIImult, FrmR, opstr> {
+  let Defs = [HI0, LO0];
+  let Uses = [HI0, LO0];
   let isCommutable = isComm;
 }
 
@@ -435,121 +462,66 @@ class shift_rotate_reg<string opstr, RegisterOperand RO,
 // Load Upper Imediate
 class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>:
   InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
-         [], IIArith, FrmI>, IsAsCheapAsAMove {
+         [], IIArith, FrmI, opstr>, IsAsCheapAsAMove {
   let neverHasSideEffects = 1;
   let isReMaterializable = 1;
 }
 
-class FMem<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
-          InstrItinClass itin>: FFI<op, outs, ins, asmstr, pattern> {
-  bits<21> addr;
-  let Inst{25-21} = addr{20-16};
-  let Inst{15-0}  = addr{15-0};
-  let DecoderMethod = "DecodeMem";
-}
-
 // Memory Load/Store
-class Load<string opstr, SDPatternOperator OpNode, DAGOperand RO,
-           InstrItinClass Itin, Operand MemOpnd, ComplexPattern Addr,
-           string ofsuffix> :
-  InstSE<(outs RO:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
-         [(set RO:$rt, (OpNode Addr:$addr))], NoItinerary, FrmI,
-         !strconcat(opstr, ofsuffix)> {
+class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+           InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+  InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode Addr:$addr))], Itin, FrmI, opstr> {
   let DecoderMethod = "DecodeMem";
   let canFoldAsLoad = 1;
   let mayLoad = 1;
 }
 
-class Store<string opstr, SDPatternOperator OpNode, DAGOperand RO,
-            InstrItinClass Itin, Operand MemOpnd, ComplexPattern Addr,
-            string ofsuffix> :
-  InstSE<(outs), (ins RO:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
-         [(OpNode RO:$rt, Addr:$addr)], NoItinerary, FrmI,
-         !strconcat(opstr, ofsuffix)> {
+class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+            InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+  InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
   let DecoderMethod = "DecodeMem";
   let mayStore = 1;
 }
 
-multiclass LoadM<string opstr, DAGOperand RO,
-                 SDPatternOperator OpNode = null_frag,
-                 InstrItinClass Itin = NoItinerary,
-                 ComplexPattern Addr = addr> {
-  def NAME : Load<opstr, OpNode, RO, Itin, mem, Addr, "">,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : Load<opstr, OpNode, RO, Itin, mem64, Addr, "_p8">,
-             Requires<[IsN64, HasStdEnc]> {
-    let DecoderNamespace = "Mips64";
-    let isCodeGenOnly = 1;
-  }
-}
-
-multiclass StoreM<string opstr, DAGOperand RO,
-                  SDPatternOperator OpNode = null_frag,
-                  InstrItinClass Itin = NoItinerary,
-                  ComplexPattern Addr = addr> {
-  def NAME : Store<opstr, OpNode, RO, Itin, mem, Addr, "">,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : Store<opstr, OpNode, RO, Itin, mem64, Addr, "_p8">,
-             Requires<[IsN64, HasStdEnc]> {
-    let DecoderNamespace = "Mips64";
-    let isCodeGenOnly = 1;
-  }
-}
-
 // Load/Store Left/Right
 let canFoldAsLoad = 1 in
 class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
-                    Operand MemOpnd> :
-  InstSE<(outs RO:$rt), (ins MemOpnd:$addr, RO:$src),
+                    InstrItinClass Itin> :
+  InstSE<(outs RO:$rt), (ins mem:$addr, RO:$src),
          !strconcat(opstr, "\t$rt, $addr"),
-         [(set RO:$rt, (OpNode addr:$addr, RO:$src))], NoItinerary, FrmI> {
+         [(set RO:$rt, (OpNode addr:$addr, RO:$src))], Itin, FrmI> {
   let DecoderMethod = "DecodeMem";
   string Constraints = "$src = $rt";
 }
 
 class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
-                     Operand MemOpnd>:
-  InstSE<(outs), (ins RO:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
-         [(OpNode RO:$rt, addr:$addr)], NoItinerary, FrmI> {
+                     InstrItinClass Itin> :
+  InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, addr:$addr)], Itin, FrmI> {
   let DecoderMethod = "DecodeMem";
 }
 
-multiclass LoadLeftRightM<string opstr, SDNode OpNode, RegisterOperand RO> {
-  def NAME : LoadLeftRight<opstr, OpNode, RO, mem>,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : LoadLeftRight<opstr, OpNode, RO, mem64>,
-             Requires<[IsN64, HasStdEnc]> {
-    let DecoderNamespace = "Mips64";
-    let isCodeGenOnly = 1;
-  }
-}
-
-multiclass StoreLeftRightM<string opstr, SDNode OpNode, RegisterOperand RO> {
-  def NAME : StoreLeftRight<opstr, OpNode, RO, mem>,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : StoreLeftRight<opstr, OpNode, RO, mem64>,
-             Requires<[IsN64, HasStdEnc]> {
-    let DecoderNamespace = "Mips64";
-    let isCodeGenOnly = 1;
-  }
-}
-
 // Conditional Branch
-class CBranch<string opstr, PatFrag cond_op, RegisterOperand RO> :
-  InstSE<(outs), (ins RO:$rs, RO:$rt, brtarget:$offset),
+class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op,
+              RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset),
          !strconcat(opstr, "\t$rs, $rt, $offset"),
          [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], IIBranch,
-         FrmI> {
+         FrmI, opstr> {
   let isBranch = 1;
   let isTerminator = 1;
   let hasDelaySlot = 1;
   let Defs = [AT];
 }
 
-class CBranchZero<string opstr, PatFrag cond_op, RegisterOperand RO> :
-  InstSE<(outs), (ins RO:$rs, brtarget:$offset),
+class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op,
+                  RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, opnd:$offset),
          !strconcat(opstr, "\t$rs, $offset"),
-         [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch, FrmI> {
+         [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch,
+         FrmI, opstr> {
   let isBranch = 1;
   let isTerminator = 1;
   let hasDelaySlot = 1;
@@ -572,9 +544,9 @@ class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type,
 
 // Jump
 class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
-             SDPatternOperator targetoperator> :
+             SDPatternOperator targetoperator, string bopstr> :
   InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
-         [(operator targetoperator:$target)], IIBranch, FrmJ> {
+         [(operator targetoperator:$target)], IIBranch, FrmJ, bopstr> {
   let isTerminator=1;
   let isBarrier=1;
   let hasDelaySlot = 1;
@@ -583,9 +555,9 @@ class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
 }
 
 // Unconditional branch
-class UncondBranch<string opstr> :
-  InstSE<(outs), (ins brtarget:$offset), !strconcat(opstr, "\t$offset"),
-         [(br bb:$offset)], IIBranch, FrmI> {
+class UncondBranch<Instruction BEQInst> :
+  PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], IIBranch>,
+  PseudoInstExpansion<(BEQInst ZERO, ZERO, brtarget:$offset)> {
   let isBranch = 1;
   let isTerminator = 1;
   let isBarrier = 1;
@@ -596,17 +568,20 @@ class UncondBranch<string opstr> :
 
 // Base class for indirect branch and return instruction classes.
 let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
-class JumpFR<RegisterOperand RO, SDPatternOperator operator = null_frag>:
-  InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], IIBranch, FrmR>;
+class JumpFR<string opstr, RegisterOperand RO,
+             SDPatternOperator operator = null_frag>:
+  InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], IIBranch,
+         FrmR, opstr>;
 
 // Indirect branch
-class IndirectBranch<RegisterOperand RO>: JumpFR<RO, brind> {
+class IndirectBranch<string opstr, RegisterOperand RO> :
+      JumpFR<opstr, RO, brind> {
   let isBranch = 1;
   let isIndirectBranch = 1;
 }
 
 // Return instruction
-class RetBase<RegisterOperand RO>: JumpFR<RO> {
+class RetBase<string opstr, RegisterOperand RO>: JumpFR<opstr, RO> {
   let isReturn = 1;
   let isCodeGenOnly = 1;
   let hasCtrlDep = 1;
@@ -615,9 +590,9 @@ class RetBase<RegisterOperand RO>: JumpFR<RO> {
 
 // Jump and Link (Call)
 let isCall=1, hasDelaySlot=1, Defs = [RA] in {
-  class JumpLink<string opstr> :
-    InstSE<(outs), (ins calltarget:$target), !strconcat(opstr, "\t$target"),
-           [(MipsJmpLink imm:$target)], IIBranch, FrmJ> {
+  class JumpLink<string opstr, DAGOperand opnd> :
+    InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+           [(MipsJmpLink imm:$target)], IIBranch, FrmJ, opstr> {
     let DecoderMethod = "DecodeJumpTarget";
   }
 
@@ -628,11 +603,11 @@ let isCall=1, hasDelaySlot=1, Defs = [RA] in {
 
   class JumpLinkReg<string opstr, RegisterOperand RO>:
     InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
-           [], IIBranch, FrmR>;
+           [], IIBranch, FrmR, opstr>;
 
-  class BGEZAL_FT<string opstr, RegisterOperand RO> :
-    InstSE<(outs), (ins RO:$rs, brtarget:$offset),
-           !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI>;
+  class BGEZAL_FT<string opstr, DAGOperand opnd, RegisterOperand RO> :
+    InstSE<(outs), (ins RO:$rs, opnd:$offset),
+           !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr>;
 
 }
 
@@ -660,6 +635,20 @@ class ER_FT<string opstr> :
   InstSE<(outs), (ins),
          opstr, [], NoItinerary, FrmOther>;
 
+// Interrupts
+class DEI_FT<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins),
+         !strconcat(opstr, "\t$rt"), [], NoItinerary, FrmOther>;
+
+// Wait
+class WAIT_FT<string opstr> :
+  InstSE<(outs), (ins), opstr, [], NoItinerary, FrmOther> {
+  let Inst{31-26} = 0x10;
+  let Inst{25}    = 1;
+  let Inst{24-6}  = 0;
+  let Inst{5-0}   = 0x20;
+}
+
 // Sync
 let hasSideEffects = 1 in
 class SYNC_FT :
@@ -669,8 +658,12 @@ class SYNC_FT :
 let hasSideEffects = 1 in
 class TEQ_FT<string opstr, RegisterOperand RO> :
   InstSE<(outs), (ins RO:$rs, RO:$rt, uimm16:$code_),
-         !strconcat(opstr, "\t$rs, $rt, $code_"), [], NoItinerary, FrmI>;
+         !strconcat(opstr, "\t$rs, $rt, $code_"), [], NoItinerary,
+         FrmI, opstr>;
 
+class TEQI_FT<string opstr, RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, uimm16:$imm16),
+         !strconcat(opstr, "\t$rs, $imm16"), [], NoItinerary, FrmOther, opstr>;
 // Mul, Div
 class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
            list<Register> DefRegs> :
@@ -698,10 +691,10 @@ class MultDivPseudo<Instruction RealInst, RegisterClass R0, RegisterOperand R1,
 // Pseudo multiply add/sub instruction with explicit accumulator register
 // operands.
 class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode>
-  : PseudoSE<(outs ACRegs:$ac),
-             (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACRegs:$acin),
-             [(set ACRegs:$ac,
-              (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACRegs:$acin))],
+  : PseudoSE<(outs ACC64:$ac),
+             (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin),
+             [(set ACC64:$ac,
+              (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin))],
              IIImult>,
     PseudoInstExpansion<(RealInst GPR32Opnd:$rs, GPR32Opnd:$rt)> {
   string Constraints = "$acin = $ac";
@@ -710,25 +703,35 @@ class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode>
 class Div<string opstr, InstrItinClass itin, RegisterOperand RO,
           list<Register> DefRegs> :
   InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$$zero, $rs, $rt"),
-         [], itin, FrmR> {
+         [], itin, FrmR, opstr> {
   let Defs = DefRegs;
 }
 
 // Move from Hi/Lo
-class MoveFromLOHI<string opstr, RegisterOperand RO, list<Register> UseRegs>:
-  InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], IIHiLo, FrmR> {
-  let Uses = UseRegs;
+class PseudoMFLOHI<RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode>
+  : PseudoSE<(outs DstRC:$rd), (ins SrcRC:$hilo),
+             [(set DstRC:$rd, (OpNode SrcRC:$hilo))], IIHiLo>;
+
+class MoveFromLOHI<string opstr, RegisterOperand RO, Register UseReg>:
+  InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], IIHiLo, FrmR,
+         opstr> {
+  let Uses = [UseReg];
   let neverHasSideEffects = 1;
 }
 
+class PseudoMTLOHI<RegisterClass DstRC, RegisterClass SrcRC>
+  : PseudoSE<(outs DstRC:$lohi), (ins SrcRC:$lo, SrcRC:$hi),
+             [(set DstRC:$lohi, (MipsMTLOHI SrcRC:$lo, SrcRC:$hi))], IIHiLo>;
+
 class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
-  InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], IIHiLo, FrmR> {
+  InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], IIHiLo,
+  FrmR, opstr> {
   let Defs = DefRegs;
   let neverHasSideEffects = 1;
 }
 
-class EffectiveAddress<string opstr, RegisterOperand RO, Operand Mem> :
-  InstSE<(outs RO:$rt), (ins Mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+class EffectiveAddress<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins mem_ea:$addr), !strconcat(opstr, "\t$rt, $addr"),
          [(set RO:$rt, addr:$addr)], NoItinerary, FrmI> {
   let isCodeGenOnly = 1;
   let DecoderMethod = "DecodeMem";
@@ -737,26 +740,26 @@ class EffectiveAddress<string opstr, RegisterOperand RO, Operand Mem> :
 // Count Leading Ones/Zeros in Word
 class CountLeading0<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
-         [(set RO:$rd, (ctlz RO:$rs))], IIArith, FrmR>,
+         [(set RO:$rd, (ctlz RO:$rs))], IIArith, FrmR, opstr>,
   Requires<[HasBitCount, HasStdEnc]>;
 
 class CountLeading1<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
-         [(set RO:$rd, (ctlz (not RO:$rs)))], IIArith, FrmR>,
+         [(set RO:$rd, (ctlz (not RO:$rs)))], IIArith, FrmR, opstr>,
   Requires<[HasBitCount, HasStdEnc]>;
 
 
 // Sign Extend in Register.
 class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO> :
   InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"),
-         [(set RO:$rd, (sext_inreg RO:$rt, vt))], IIseb, FrmR> {
+         [(set RO:$rd, (sext_inreg RO:$rt, vt))], IIseb, FrmR, opstr> {
   let Predicates = [HasSEInReg, HasStdEnc];
 }
 
 // Subword Swap
 class SubwordSwap<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [],
-         NoItinerary, FrmR> {
+         NoItinerary, FrmR, opstr> {
   let Predicates = [HasSwap, HasStdEnc];
   let neverHasSideEffects = 1;
 }
@@ -767,66 +770,60 @@ class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
          IIArith, FrmR>;
 
 // Ext and Ins
-class ExtBase<string opstr, RegisterOperand RO>:
-  InstSE<(outs RO:$rt), (ins RO:$rs, uimm16:$pos, size_ext:$size),
+class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+              SDPatternOperator Op = null_frag>:
+  InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ext:$size),
          !strconcat(opstr, " $rt, $rs, $pos, $size"),
-         [(set RO:$rt, (MipsExt RO:$rs, imm:$pos, imm:$size))], NoItinerary,
-         FrmR> {
+         [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], NoItinerary,
+         FrmR, opstr> {
   let Predicates = [HasMips32r2, HasStdEnc];
 }
 
-class InsBase<string opstr, RegisterOperand RO>:
-  InstSE<(outs RO:$rt), (ins RO:$rs, uimm16:$pos, size_ins:$size, RO:$src),
+class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+              SDPatternOperator Op = null_frag>:
+  InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ins:$size, RO:$src),
          !strconcat(opstr, " $rt, $rs, $pos, $size"),
-         [(set RO:$rt, (MipsIns RO:$rs, imm:$pos, imm:$size, RO:$src))],
-         NoItinerary, FrmR> {
+         [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size, RO:$src))],
+         NoItinerary, FrmR, opstr> {
   let Predicates = [HasMips32r2, HasStdEnc];
   let Constraints = "$src = $rt";
 }
 
 // Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
-class Atomic2Ops<PatFrag Op, RegisterClass DRC, RegisterClass PRC> :
-  PseudoSE<(outs DRC:$dst), (ins PRC:$ptr, DRC:$incr),
-           [(set DRC:$dst, (Op PRC:$ptr, DRC:$incr))]>;
-
-multiclass Atomic2Ops32<PatFrag Op> {
-  def NAME : Atomic2Ops<Op, GPR32, GPR32>, Requires<[NotN64, HasStdEnc]>;
-  def _P8  : Atomic2Ops<Op, GPR32, GPR64>, Requires<[IsN64, HasStdEnc]>;
-}
+class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
+  PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
+           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
 
 // Atomic Compare & Swap.
-class AtomicCmpSwap<PatFrag Op, RegisterClass DRC, RegisterClass PRC> :
-  PseudoSE<(outs DRC:$dst), (ins PRC:$ptr, DRC:$cmp, DRC:$swap),
-           [(set DRC:$dst, (Op PRC:$ptr, DRC:$cmp, DRC:$swap))]>;
-
-multiclass AtomicCmpSwap32<PatFrag Op>  {
-  def NAME : AtomicCmpSwap<Op, GPR32, GPR32>,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : AtomicCmpSwap<Op, GPR32, GPR64>,
-             Requires<[IsN64, HasStdEnc]>;
-}
+class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
+  PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
+           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
 
-class LLBase<string opstr, RegisterOperand RO, Operand Mem> :
-  InstSE<(outs RO:$rt), (ins Mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+class LLBase<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
          [], NoItinerary, FrmI> {
   let DecoderMethod = "DecodeMem";
   let mayLoad = 1;
 }
 
-class SCBase<string opstr, RegisterOperand RO, Operand Mem> :
-  InstSE<(outs RO:$dst), (ins RO:$rt, Mem:$addr),
+class SCBase<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$dst), (ins RO:$rt, mem:$addr),
          !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
   let DecoderMethod = "DecodeMem";
   let mayStore = 1;
   let Constraints = "$rt = $dst";
 }
 
-class MFC3OP<dag outs, dag ins, string asmstr> :
-  InstSE<outs, ins, asmstr, [], NoItinerary, FrmFR>;
+class MFC3OP<string asmstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt, RO:$rd, uimm16:$sel), (ins),
+         !strconcat(asmstr, "\t$rt, $rd, $sel"), [], NoItinerary, FrmFR>;
 
-let isBarrier = 1, isTerminator = 1, isCodeGenOnly = 1 in
-def TRAP : InstSE<(outs), (ins), "break", [(trap)], NoItinerary, FrmOther> {
-   let Inst = 0x0000000d;
+class TrapBase<Instruction RealInst>
+  : PseudoSE<(outs), (ins), [(trap)], NoItinerary>,
+    PseudoInstExpansion<(RealInst 0, 0)> {
+  let isBarrier = 1;
+  let isTerminator = 1;
+  let isCodeGenOnly = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -845,38 +842,38 @@ def ADJCALLSTACKUP   : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
 }
 
 let usesCustomInserter = 1 in {
-  defm ATOMIC_LOAD_ADD_I8   : Atomic2Ops32<atomic_load_add_8>;
-  defm ATOMIC_LOAD_ADD_I16  : Atomic2Ops32<atomic_load_add_16>;
-  defm ATOMIC_LOAD_ADD_I32  : Atomic2Ops32<atomic_load_add_32>;
-  defm ATOMIC_LOAD_SUB_I8   : Atomic2Ops32<atomic_load_sub_8>;
-  defm ATOMIC_LOAD_SUB_I16  : Atomic2Ops32<atomic_load_sub_16>;
-  defm ATOMIC_LOAD_SUB_I32  : Atomic2Ops32<atomic_load_sub_32>;
-  defm ATOMIC_LOAD_AND_I8   : Atomic2Ops32<atomic_load_and_8>;
-  defm ATOMIC_LOAD_AND_I16  : Atomic2Ops32<atomic_load_and_16>;
-  defm ATOMIC_LOAD_AND_I32  : Atomic2Ops32<atomic_load_and_32>;
-  defm ATOMIC_LOAD_OR_I8    : Atomic2Ops32<atomic_load_or_8>;
-  defm ATOMIC_LOAD_OR_I16   : Atomic2Ops32<atomic_load_or_16>;
-  defm ATOMIC_LOAD_OR_I32   : Atomic2Ops32<atomic_load_or_32>;
-  defm ATOMIC_LOAD_XOR_I8   : Atomic2Ops32<atomic_load_xor_8>;
-  defm ATOMIC_LOAD_XOR_I16  : Atomic2Ops32<atomic_load_xor_16>;
-  defm ATOMIC_LOAD_XOR_I32  : Atomic2Ops32<atomic_load_xor_32>;
-  defm ATOMIC_LOAD_NAND_I8  : Atomic2Ops32<atomic_load_nand_8>;
-  defm ATOMIC_LOAD_NAND_I16 : Atomic2Ops32<atomic_load_nand_16>;
-  defm ATOMIC_LOAD_NAND_I32 : Atomic2Ops32<atomic_load_nand_32>;
-
-  defm ATOMIC_SWAP_I8       : Atomic2Ops32<atomic_swap_8>;
-  defm ATOMIC_SWAP_I16      : Atomic2Ops32<atomic_swap_16>;
-  defm ATOMIC_SWAP_I32      : Atomic2Ops32<atomic_swap_32>;
-
-  defm ATOMIC_CMP_SWAP_I8   : AtomicCmpSwap32<atomic_cmp_swap_8>;
-  defm ATOMIC_CMP_SWAP_I16  : AtomicCmpSwap32<atomic_cmp_swap_16>;
-  defm ATOMIC_CMP_SWAP_I32  : AtomicCmpSwap32<atomic_cmp_swap_32>;
+  def ATOMIC_LOAD_ADD_I8   : Atomic2Ops<atomic_load_add_8, GPR32>;
+  def ATOMIC_LOAD_ADD_I16  : Atomic2Ops<atomic_load_add_16, GPR32>;
+  def ATOMIC_LOAD_ADD_I32  : Atomic2Ops<atomic_load_add_32, GPR32>;
+  def ATOMIC_LOAD_SUB_I8   : Atomic2Ops<atomic_load_sub_8, GPR32>;
+  def ATOMIC_LOAD_SUB_I16  : Atomic2Ops<atomic_load_sub_16, GPR32>;
+  def ATOMIC_LOAD_SUB_I32  : Atomic2Ops<atomic_load_sub_32, GPR32>;
+  def ATOMIC_LOAD_AND_I8   : Atomic2Ops<atomic_load_and_8, GPR32>;
+  def ATOMIC_LOAD_AND_I16  : Atomic2Ops<atomic_load_and_16, GPR32>;
+  def ATOMIC_LOAD_AND_I32  : Atomic2Ops<atomic_load_and_32, GPR32>;
+  def ATOMIC_LOAD_OR_I8    : Atomic2Ops<atomic_load_or_8, GPR32>;
+  def ATOMIC_LOAD_OR_I16   : Atomic2Ops<atomic_load_or_16, GPR32>;
+  def ATOMIC_LOAD_OR_I32   : Atomic2Ops<atomic_load_or_32, GPR32>;
+  def ATOMIC_LOAD_XOR_I8   : Atomic2Ops<atomic_load_xor_8, GPR32>;
+  def ATOMIC_LOAD_XOR_I16  : Atomic2Ops<atomic_load_xor_16, GPR32>;
+  def ATOMIC_LOAD_XOR_I32  : Atomic2Ops<atomic_load_xor_32, GPR32>;
+  def ATOMIC_LOAD_NAND_I8  : Atomic2Ops<atomic_load_nand_8, GPR32>;
+  def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, GPR32>;
+  def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, GPR32>;
+
+  def ATOMIC_SWAP_I8       : Atomic2Ops<atomic_swap_8, GPR32>;
+  def ATOMIC_SWAP_I16      : Atomic2Ops<atomic_swap_16, GPR32>;
+  def ATOMIC_SWAP_I32      : Atomic2Ops<atomic_swap_32, GPR32>;
+
+  def ATOMIC_CMP_SWAP_I8   : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>;
+  def ATOMIC_CMP_SWAP_I16  : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>;
+  def ATOMIC_CMP_SWAP_I32  : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>;
 }
 
 /// Pseudo instructions for loading and storing accumulator registers.
 let isPseudo = 1, isCodeGenOnly = 1 in {
-  defm LOAD_AC64  : LoadM<"", ACRegs>;
-  defm STORE_AC64 : StoreM<"", ACRegs>;
+  def LOAD_ACC64  : Load<"", ACC64>;
+  def STORE_ACC64 : Store<"", ACC64>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -911,6 +908,7 @@ def ADDu  : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, IIArith, add>,
             ADD_FM<0, 0x21>;
 def SUBu  : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, IIArith, sub>,
             ADD_FM<0, 0x23>;
+let Defs = [HI0, LO0] in
 def MUL   : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, IIImul, mul>,
             ADD_FM<0x1c, 2>;
 def ADD   : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>;
@@ -926,11 +924,11 @@ def XOR   : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, IILogic, xor>,
 def NOR   : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
 
 /// Shift Instructions
-def SLL  : MMRel, shift_rotate_imm<"sll", shamt, GPR32Opnd, shl, immZExt5>,
+def SLL  : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, shl, immZExt5>,
            SRA_FM<0, 0>;
-def SRL  : MMRel, shift_rotate_imm<"srl", shamt, GPR32Opnd, srl, immZExt5>,
+def SRL  : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, srl, immZExt5>,
            SRA_FM<2, 0>;
-def SRA  : MMRel, shift_rotate_imm<"sra", shamt, GPR32Opnd, sra, immZExt5>,
+def SRA  : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, sra, immZExt5>,
            SRA_FM<3, 0>;
 def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, shl>, SRLV_FM<4, 0>;
 def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, srl>, SRLV_FM<6, 0>;
@@ -938,7 +936,7 @@ def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, sra>, SRLV_FM<7, 0>;
 
 // Rotate Instructions
 let Predicates = [HasMips32r2, HasStdEnc] in {
-  def ROTR  : MMRel, shift_rotate_imm<"rotr", shamt, GPR32Opnd, rotr,
+  def ROTR  : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, rotr,
                                       immZExt5>,
               SRA_FM<2, 1>;
   def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, rotr>,
@@ -947,65 +945,85 @@ let Predicates = [HasMips32r2, HasStdEnc] in {
 
 /// Load and Store Instructions
 ///  aligned
-defm LB  : LoadM<"lb", GPR32Opnd, sextloadi8, IILoad>, MMRel, LW_FM<0x20>;
-defm LBu : LoadM<"lbu", GPR32Opnd, zextloadi8, IILoad, addrDefault>, MMRel,
-           LW_FM<0x24>;
-defm LH  : LoadM<"lh", GPR32Opnd, sextloadi16, IILoad, addrDefault>, MMRel,
-           LW_FM<0x21>;
-defm LHu : LoadM<"lhu", GPR32Opnd, zextloadi16, IILoad>, MMRel, LW_FM<0x25>;
-defm LW  : LoadM<"lw", GPR32Opnd, load, IILoad, addrDefault>, MMRel, LW_FM<0x23>;
-defm SB  : StoreM<"sb", GPR32Opnd, truncstorei8, IIStore>, MMRel, LW_FM<0x28>;
-defm SH  : StoreM<"sh", GPR32Opnd, truncstorei16, IIStore>, MMRel, LW_FM<0x29>;
-defm SW  : StoreM<"sw", GPR32Opnd, store, IIStore>, MMRel, LW_FM<0x2b>;
+def LB  : Load<"lb", GPR32Opnd, sextloadi8, IILoad>, MMRel, LW_FM<0x20>;
+def LBu : Load<"lbu", GPR32Opnd, zextloadi8, IILoad, addrDefault>, MMRel,
+          LW_FM<0x24>;
+def LH  : Load<"lh", GPR32Opnd, sextloadi16, IILoad, addrDefault>, MMRel,
+          LW_FM<0x21>;
+def LHu : Load<"lhu", GPR32Opnd, zextloadi16, IILoad>, MMRel, LW_FM<0x25>;
+def LW  : Load<"lw", GPR32Opnd, load, IILoad, addrDefault>, MMRel,
+          LW_FM<0x23>;
+def SB  : Store<"sb", GPR32Opnd, truncstorei8, IIStore>, MMRel, LW_FM<0x28>;
+def SH  : Store<"sh", GPR32Opnd, truncstorei16, IIStore>, MMRel, LW_FM<0x29>;
+def SW  : Store<"sw", GPR32Opnd, store, IIStore>, MMRel, LW_FM<0x2b>;
 
 /// load/store left/right
-defm LWL : LoadLeftRightM<"lwl", MipsLWL, GPR32Opnd>, LW_FM<0x22>;
-defm LWR : LoadLeftRightM<"lwr", MipsLWR, GPR32Opnd>, LW_FM<0x26>;
-defm SWL : StoreLeftRightM<"swl", MipsSWL, GPR32Opnd>, LW_FM<0x2a>;
-defm SWR : StoreLeftRightM<"swr", MipsSWR, GPR32Opnd>, LW_FM<0x2e>;
+let Predicates = [NotInMicroMips] in {
+def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, IILoad>, LW_FM<0x22>;
+def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, IILoad>, LW_FM<0x26>;
+def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, IIStore>, LW_FM<0x2a>;
+def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, IIStore>, LW_FM<0x2e>;
+}
 
 def SYNC : SYNC_FT, SYNC_FM;
-def TEQ : TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>;
+def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>;
+def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>;
+def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>;
+def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>;
+def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>;
+def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>;
+
+def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>;
+def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>;
+def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>;
+def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>;
+def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>;
+def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>;
 
 def BREAK : BRK_FT<"break">, BRK_FM<0xd>;
 def SYSCALL : SYS_FT<"syscall">, SYS_FM<0xc>;
+def TRAP : TrapBase<BREAK>;
 
 def ERET : ER_FT<"eret">, ER_FM<0x18>;
 def DERET : ER_FT<"deret">, ER_FM<0x1f>;
 
-/// Load-linked, Store-conditional
-let Predicates = [NotN64, HasStdEnc] in {
-  def LL : LLBase<"ll", GPR32Opnd, mem>, LW_FM<0x30>;
-  def SC : SCBase<"sc", GPR32Opnd, mem>, LW_FM<0x38>;
-}
+def EI : DEI_FT<"ei", GPR32Opnd>, EI_FM<1>;
+def DI : DEI_FT<"di", GPR32Opnd>, EI_FM<0>;
 
-let Predicates = [IsN64, HasStdEnc], DecoderNamespace = "Mips64" in {
-  def LL_P8 : LLBase<"ll", GPR32Opnd, mem64>, LW_FM<0x30>;
-  def SC_P8 : SCBase<"sc", GPR32Opnd, mem64>, LW_FM<0x38>;
-}
+def WAIT : WAIT_FT<"wait">;
+
+/// Load-linked, Store-conditional
+def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>;
+def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>;
 
 /// Jump and Branch Instructions
-def J       : JumpFJ<jmptarget, "j", br, bb>, FJ<2>,
+def J       : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
               Requires<[RelocStatic, HasStdEnc]>, IsBranch;
-def JR      : IndirectBranch<GPR32Opnd>, MTLO_FM<8>;
-def B       : UncondBranch<"b">, B_FM;
-def BEQ     : CBranch<"beq", seteq, GPR32Opnd>, BEQ_FM<4>;
-def BNE     : CBranch<"bne", setne, GPR32Opnd>, BEQ_FM<5>;
-def BGEZ    : CBranchZero<"bgez", setge, GPR32Opnd>, BGEZ_FM<1, 1>;
-def BGTZ    : CBranchZero<"bgtz", setgt, GPR32Opnd>, BGEZ_FM<7, 0>;
-def BLEZ    : CBranchZero<"blez", setle, GPR32Opnd>, BGEZ_FM<6, 0>;
-def BLTZ    : CBranchZero<"bltz", setlt, GPR32Opnd>, BGEZ_FM<1, 0>;
-
-def JAL  : JumpLink<"jal">, FJ<3>;
-def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
+def JR      : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>;
+def BEQ     : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
+def BNE     : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>;
+def BGEZ    : MMRel, CBranchZero<"bgez", brtarget, setge, GPR32Opnd>,
+              BGEZ_FM<1, 1>;
+def BGTZ    : MMRel, CBranchZero<"bgtz", brtarget, setgt, GPR32Opnd>,
+              BGEZ_FM<7, 0>;
+def BLEZ    : MMRel, CBranchZero<"blez", brtarget, setle, GPR32Opnd>,
+              BGEZ_FM<6, 0>;
+def BLTZ    : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
+              BGEZ_FM<1, 0>;
+def B       : UncondBranch<BEQ>;
+
+def JAL  : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
+def JALR : MMRel, JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
 def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
-def BGEZAL : BGEZAL_FT<"bgezal", GPR32Opnd>, BGEZAL_FM<0x11>;
-def BLTZAL : BGEZAL_FT<"bltzal", GPR32Opnd>, BGEZAL_FM<0x10>;
+def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>;
+def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>;
 def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
-def TAILCALL : JumpFJ<calltarget, "j", MipsTailCall, imm>, FJ<2>, IsTailCall;
-def TAILCALL_R : JumpFR<GPR32Opnd, MipsTailCall>, MTLO_FM<8>, IsTailCall;
+def TAILCALL : MMRel, JumpFJ<calltarget, "j", MipsTailCall, imm, "tcall">,
+               FJ<2>, IsTailCall;
+def TAILCALL_R : MMRel, JumpFR<"tcallr", GPR32Opnd, MipsTailCall>, MTLO_FM<8>,
+                 IsTailCall;
 
-def RET : RetBase<GPR32Opnd>, MTLO_FM<8>;
+def RET : MMRel, RetBase<"ret", GPR32Opnd>, MTLO_FM<8>;
 
 // Exception handling related node and instructions.
 // The conversion sequence is:
@@ -1029,34 +1047,30 @@ let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in {
 }
 
 /// Multiply and Divide Instructions.
-def MULT  : MMRel, Mult<"mult", IIImult, GPR32Opnd, [HI, LO]>,
+def MULT  : MMRel, Mult<"mult", IIImult, GPR32Opnd, [HI0, LO0]>,
             MULT_FM<0, 0x18>;
-def MULTu : MMRel, Mult<"multu", IIImult, GPR32Opnd, [HI, LO]>,
+def MULTu : MMRel, Mult<"multu", IIImult, GPR32Opnd, [HI0, LO0]>,
             MULT_FM<0, 0x19>;
-def PseudoMULT  : MultDivPseudo<MULT, ACRegs, GPR32Opnd, MipsMult, IIImult>;
-def PseudoMULTu : MultDivPseudo<MULTu, ACRegs, GPR32Opnd, MipsMultu, IIImult>;
-def SDIV  : Div<"div", IIIdiv, GPR32Opnd, [HI, LO]>, MULT_FM<0, 0x1a>;
-def UDIV  : Div<"divu", IIIdiv, GPR32Opnd, [HI, LO]>, MULT_FM<0, 0x1b>;
-def PseudoSDIV : MultDivPseudo<SDIV, ACRegs, GPR32Opnd, MipsDivRem, IIIdiv,
-                               0, 1, 1>;
-def PseudoUDIV : MultDivPseudo<UDIV, ACRegs, GPR32Opnd, MipsDivRemU, IIIdiv,
-                               0, 1, 1>;
+def SDIV  : MMRel, Div<"div", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+            MULT_FM<0, 0x1a>;
+def UDIV  : MMRel, Div<"divu", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+            MULT_FM<0, 0x1b>;
 
-def MTHI : MoveToLOHI<"mthi", GPR32Opnd, [HI]>, MTLO_FM<0x11>;
-def MTLO : MoveToLOHI<"mtlo", GPR32Opnd, [LO]>, MTLO_FM<0x13>;
-def MFHI : MoveFromLOHI<"mfhi", GPR32Opnd, [HI]>, MFLO_FM<0x10>;
-def MFLO : MoveFromLOHI<"mflo", GPR32Opnd, [LO]>, MFLO_FM<0x12>;
+def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>;
+def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>;
+def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>;
+def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>;
 
 /// Sign Ext In Register Instructions.
-def SEB : SignExtInReg<"seb", i8, GPR32Opnd>, SEB_FM<0x10, 0x20>;
-def SEH : SignExtInReg<"seh", i16, GPR32Opnd>, SEB_FM<0x18, 0x20>;
+def SEB : MMRel, SignExtInReg<"seb", i8, GPR32Opnd>, SEB_FM<0x10, 0x20>;
+def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd>, SEB_FM<0x18, 0x20>;
 
 /// Count Leading
-def CLZ : CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>;
-def CLO : CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>;
+def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>;
+def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>;
 
 /// Word Swap Bytes Within Halfwords
-def WSBH : SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>;
+def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>;
 
 /// No operation.
 def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
@@ -1065,39 +1079,41 @@ def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
 // instructions. The same not happens for stack address copies, so an
 // add op with mem ComplexPattern is used and the stack address copy
 // can be matched. It's similar to Sparc LEA_ADDRi
-def LEA_ADDiu : EffectiveAddress<"addiu", GPR32Opnd, mem_ea>, LW_FM<9>;
+def LEA_ADDiu : EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
 
 // MADD*/MSUB*
-def MADD  : MArithR<"madd", 1>, MULT_FM<0x1c, 0>;
-def MADDU : MArithR<"maddu", 1>, MULT_FM<0x1c, 1>;
-def MSUB  : MArithR<"msub">, MULT_FM<0x1c, 4>;
-def MSUBU : MArithR<"msubu">, MULT_FM<0x1c, 5>;
+def MADD  : MMRel, MArithR<"madd", 1>, MULT_FM<0x1c, 0>;
+def MADDU : MMRel, MArithR<"maddu", 1>, MULT_FM<0x1c, 1>;
+def MSUB  : MMRel, MArithR<"msub">, MULT_FM<0x1c, 4>;
+def MSUBU : MMRel, MArithR<"msubu">, MULT_FM<0x1c, 5>;
+
+let Predicates = [HasStdEnc, NotDSP] in {
+def PseudoMULT  : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, IIImult>;
+def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, IIImult>;
+def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>;
+def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>;
+def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>;
 def PseudoMADD  : MAddSubPseudo<MADD, MipsMAdd>;
 def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu>;
 def PseudoMSUB  : MAddSubPseudo<MSUB, MipsMSub>;
 def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu>;
+}
+
+def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, IIIdiv,
+                               0, 1, 1>;
+def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, IIIdiv,
+                               0, 1, 1>;
 
 def RDHWR : ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
 
-def EXT : ExtBase<"ext", GPR32Opnd>, EXT_FM<0>;
-def INS : InsBase<"ins", GPR32Opnd>, EXT_FM<4>;
+def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>;
+def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>;
 
 /// Move Control Registers From/To CPU Registers
-def MFC0_3OP : MFC3OP<(outs GPR32Opnd:$rt),
-                      (ins GPR32Opnd:$rd, uimm16:$sel),
-                      "mfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 0>;
-
-def MTC0_3OP : MFC3OP<(outs GPR32Opnd:$rd, uimm16:$sel),
-                      (ins GPR32Opnd:$rt),
-                      "mtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 4>;
-
-def MFC2_3OP : MFC3OP<(outs GPR32Opnd:$rt),
-                      (ins GPR32Opnd:$rd, uimm16:$sel),
-                      "mfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 0>;
-
-def MTC2_3OP : MFC3OP<(outs GPR32Opnd:$rd, uimm16:$sel),
-                      (ins GPR32Opnd:$rt),
-                      "mtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 4>;
+def MFC0 : MFC3OP<"mfc0", GPR32Opnd>, MFC3OP_FM<0x10, 0>;
+def MTC0 : MFC3OP<"mtc0", GPR32Opnd>, MFC3OP_FM<0x10, 4>;
+def MFC2 : MFC3OP<"mfc2", GPR32Opnd>, MFC3OP_FM<0x12, 0>;
+def MTC2 : MFC3OP<"mtc2", GPR32Opnd>, MFC3OP_FM<0x12, 4>;
 
 //===----------------------------------------------------------------------===//
 // Instruction aliases
@@ -1129,14 +1145,11 @@ def : InstAlias<"xor $rs, $rt, $imm",
 def : InstAlias<"or $rs, $rt, $imm",
                 (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
 def : InstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
-def : InstAlias<"mfc0 $rt, $rd",
-                (MFC0_3OP GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
-def : InstAlias<"mtc0 $rt, $rd",
-                (MTC0_3OP GPR32Opnd:$rd, 0, GPR32Opnd:$rt), 0>;
-def : InstAlias<"mfc2 $rt, $rd",
-                (MFC2_3OP GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
-def : InstAlias<"mtc2 $rt, $rd",
-                (MTC2_3OP GPR32Opnd:$rd, 0, GPR32Opnd:$rt), 0>;
+def : InstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"mtc2 $rt, $rd", (MTC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
 def : InstAlias<"bnez $rs,$offset",
                 (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
 def : InstAlias<"beqz $rs,$offset",
@@ -1145,6 +1158,20 @@ def : InstAlias<"syscall", (SYSCALL 0), 1>;
 
 def : InstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
 def : InstAlias<"break", (BREAK 0, 0), 1>;
+def : InstAlias<"ei", (EI ZERO), 1>;
+def : InstAlias<"di", (DI ZERO), 1>;
+
+def  : InstAlias<"teq $rs, $rt", (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : InstAlias<"tge $rs, $rt", (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : InstAlias<"tgeu $rs, $rt", (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : InstAlias<"tlt $rs, $rt", (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : InstAlias<"tltu $rs, $rt", (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : InstAlias<"tne $rs, $rt", (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : InstAlias<"sub, $rd, $rs, $imm",
+                (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs, InvertedImOperand:$imm)>;
+def : InstAlias<"subu, $rd, $rs, $imm",
+                (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs, InvertedImOperand:$imm)>;
+
 //===----------------------------------------------------------------------===//
 // Assembler Pseudo Instructions
 //===----------------------------------------------------------------------===//
@@ -1152,7 +1179,7 @@ def : InstAlias<"break", (BREAK 0, 0), 1>;
 class LoadImm32< string instr_asm, Operand Od, RegisterOperand RO> :
   MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
                      !strconcat(instr_asm, "\t$rt, $imm32")> ;
-def LoadImm32Reg : LoadImm32<"li", shamt,GPR32Opnd>;
+def LoadImm32Reg : LoadImm32<"li", uimm5, GPR32Opnd>;
 
 class LoadAddress<string instr_asm, Operand MemOpnd, RegisterOperand RO> :
   MipsAsmPseudoInst<(outs RO:$rt), (ins MemOpnd:$addr),
@@ -1162,9 +1189,7 @@ def LoadAddr32Reg : LoadAddress<"la", mem, GPR32Opnd>;
 class LoadAddressImm<string instr_asm, Operand Od, RegisterOperand RO> :
   MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
                      !strconcat(instr_asm, "\t$rt, $imm32")> ;
-def LoadAddr32Imm : LoadAddressImm<"la", shamt,GPR32Opnd>;
-
-
+def LoadAddr32Imm : LoadAddressImm<"la", uimm5, GPR32Opnd>;
 
 //===----------------------------------------------------------------------===//
 //  Arbitrary patterns that map to one or more instructions
@@ -1261,24 +1286,15 @@ def : MipsPat<(not GPR32:$in),
               (NOR GPR32Opnd:$in, ZERO)>;
 
 // extended loads
-let Predicates = [NotN64, HasStdEnc] in {
+let Predicates = [HasStdEnc] in {
   def : MipsPat<(i32 (extloadi1  addr:$src)), (LBu addr:$src)>;
   def : MipsPat<(i32 (extloadi8  addr:$src)), (LBu addr:$src)>;
   def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
 }
-let Predicates = [IsN64, HasStdEnc] in {
-  def : MipsPat<(i32 (extloadi1  addr:$src)), (LBu_P8 addr:$src)>;
-  def : MipsPat<(i32 (extloadi8  addr:$src)), (LBu_P8 addr:$src)>;
-  def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu_P8 addr:$src)>;
-}
 
 // peepholes
-let Predicates = [NotN64, HasStdEnc] in {
-  def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
-}
-let Predicates = [IsN64, HasStdEnc] in {
-  def : MipsPat<(store (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>;
-}
+let Predicates = [HasStdEnc] in
+def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
 
 // brcond patterns
 multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BNEOp,
@@ -1369,22 +1385,13 @@ defm : SetgeImmPats<GPR32, SLTi, SLTiu>;
 // bswap pattern
 def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
 
-// mflo/hi patterns.
-def : MipsPat<(i32 (ExtractLOHI ACRegs:$ac, imm:$lohi_idx)),
-              (EXTRACT_SUBREG ACRegs:$ac, imm:$lohi_idx)>;
-
 // Load halfword/word patterns.
 let AddedComplexity = 40 in {
-  let Predicates = [NotN64, HasStdEnc] in {
+  let Predicates = [HasStdEnc] in {
     def : LoadRegImmPat<LBu, i32, zextloadi8>;
     def : LoadRegImmPat<LH, i32, sextloadi16>;
     def : LoadRegImmPat<LW, i32, load>;
   }
-  let Predicates = [IsN64, HasStdEnc] in {
-    def : LoadRegImmPat<LBu_P8, i32, zextloadi8>;
-    def : LoadRegImmPat<LH_P8, i32, sextloadi16>;
-    def : LoadRegImmPat<LW_P8, i32, load>;
-  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -1405,6 +1412,10 @@ include "Mips16InstrInfo.td"
 include "MipsDSPInstrFormats.td"
 include "MipsDSPInstrInfo.td"
 
+// MSA
+include "MipsMSAInstrFormats.td"
+include "MipsMSAInstrInfo.td"
+
 // Micromips
 include "MicroMipsInstrFormats.td"
 include "MicroMipsInstrInfo.td"
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index 971176e..2efe578 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -237,6 +237,11 @@ void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
 
   MIB.addMBB(MBBOpnd);
 
+  // Bundle the instruction in the delay slot to the newly created branch
+  // and erase the original branch.
+  assert(Br->isBundledWithSucc());
+  MachineBasicBlock::instr_iterator II(Br);
+  MIBundleBuilder(&*MIB).append((++II)->removeFromBundle());
   Br->eraseFromParent();
 }
 
@@ -432,8 +437,10 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
       if (!I->Br || I->HasLongBranch)
         continue;
 
+      int ShVal = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode() ? 2 : 4;
+
       // Check if offset fits into 16-bit immediate field of branches.
-      if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / 4))
+      if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / ShVal))
         continue;
 
       I->HasLongBranch = true;
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index d836975..b6dfadc 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -28,8 +28,7 @@ using namespace llvm;
 MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter)
   : AsmPrinter(asmprinter) {}
 
-void MipsMCInstLower::Initialize(Mangler *M, MCContext *C) {
-  Mang = M;
+void MipsMCInstLower::Initialize(MCContext *C) {
   Ctx = C;
 }
 
@@ -74,7 +73,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
     break;
 
   case MachineOperand::MO_GlobalAddress:
-    Symbol = Mang->getSymbol(MO.getGlobal());
+    Symbol = AsmPrinter.getSymbol(MO.getGlobal());
     Offset += MO.getOffset();
     break;
 
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index c4a6016..4570bd9 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -19,7 +19,6 @@ namespace llvm {
   class MCOperand;
   class MachineInstr;
   class MachineFunction;
-  class Mangler;
   class MipsAsmPrinter;
 
 /// MipsMCInstLower - This class is used to lower an MachineInstr into an
@@ -27,11 +26,10 @@ namespace llvm {
 class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
   typedef MachineOperand::MachineOperandType MachineOperandType;
   MCContext *Ctx;
-  Mangler *Mang;
   MipsAsmPrinter &AsmPrinter;
 public:
   MipsMCInstLower(MipsAsmPrinter &asmprinter);
-  void Initialize(Mangler *mang, MCContext *C);
+  void Initialize(MCContext *C);
   void Lower(const MachineInstr *MI, MCInst &OutMI) const;
   MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
 
diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td
new file mode 100644
index 0000000..875dc0b
--- /dev/null
+++ b/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -0,0 +1,406 @@
+//===- MipsMSAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def HasMSA : Predicate<"Subtarget.hasMSA()">,
+             AssemblerPredicate<"FeatureMSA">;
+
+class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+  let Predicates = [HasMSA];
+  let Inst{31-26} = 0b011110;
+}
+
+class MSACBranch : MSAInst {
+  let Inst{31-26} = 0b010001;
+}
+
+class MSASpecial : MSAInst {
+  let Inst{31-26} = 0b000000;
+}
+
+class PseudoMSA<dag outs, dag ins, list<dag> pattern,
+                InstrItinClass itin = IIPseudo>:
+  MipsPseudo<outs, ins, pattern, itin> {
+  let Predicates = [HasMSA];
+}
+
+class MSA_BIT_B_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<3> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-19} = 0b1110;
+  let Inst{18-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_H_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<4> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-20} = 0b110;
+  let Inst{19-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_W_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<5> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = 0b10;
+  let Inst{20-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_D_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<6> m;
+
+  let Inst{25-23} = major;
+  let Inst{22} = 0b0;
+  let Inst{21-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-18} = major;
+  let Inst{17-16} = df;
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2R_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-18} = major;
+  let Inst{17-16} = df;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2RF_FMT<bits<9> major, bits<1> df, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-17} = major;
+  let Inst{16} = df;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3R_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3RF_FMT<bits<4> major, bits<1> df, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3R_INDEX_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> rt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CFCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> rd;
+  bits<5> cs;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = cs;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CTCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> rs;
+  bits<5> cd;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = rs;
+  let Inst{10-6} = cd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_D_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-17} = 0b11100;
+  let Inst{16} = n{0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I5_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> imm;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = imm;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I8_FMT<bits<2> major, bits<6> minor>: MSAInst {
+  bits<8> u8;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-24} = major;
+  let Inst{23-16} = u8;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I10_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<10> s10;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-11} = s10;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_MI10_FMT<bits<2> df, bits<4> minor>: MSAInst {
+  bits<21> addr;
+  bits<5> wd;
+
+  let Inst{25-16} = addr{9-0};
+  let Inst{15-11} = addr{20-16};
+  let Inst{10-6} = wd;
+  let Inst{5-2} = minor;
+  let Inst{1-0} = df;
+}
+
+class MSA_VEC_FMT<bits<5> major, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-21} = major;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_CBRANCH_FMT<bits<3> major, bits<2> df>: MSACBranch {
+  bits<16> offset;
+  bits<5> wt;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-0} = offset;
+}
+
+class MSA_CBRANCH_V_FMT<bits<5> major>: MSACBranch {
+  bits<16> offset;
+  bits<5> wt;
+
+  let Inst{25-21} = major;
+  let Inst{20-16} = wt;
+  let Inst{15-0} = offset;
+}
+
+class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> rd;
+  bits<2> sa;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8} = 0b000;
+  let Inst{7-6} = sa;
+  let Inst{5-0} = minor;
+}
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
new file mode 100644
index 0000000..82c51a6
--- /dev/null
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -0,0 +1,3694 @@
+//===- MipsMSAInstrInfo.td - MSA ASE instructions -*- tablegen ------------*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips MSA ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
+def SDT_VSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
+                                      SDTCisInt<1>,
+                                      SDTCisSameAs<1, 2>,
+                                      SDTCisVT<3, OtherVT>]>;
+def SDT_VFSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
+                                       SDTCisFP<1>,
+                                       SDTCisSameAs<1, 2>,
+                                       SDTCisVT<3, OtherVT>]>;
+def SDT_VSHF : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisVec<0>,
+                                    SDTCisInt<1>, SDTCisVec<1>,
+                                    SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>;
+def SDT_SHF : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+                                   SDTCisVT<1, i32>, SDTCisSameAs<0, 2>]>;
+def SDT_ILV : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+                                   SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
+
+def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>;
+def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>;
+def MipsVAllZero : SDNode<"MipsISD::VALL_ZERO", SDT_MipsVecCond>;
+def MipsVAnyZero : SDNode<"MipsISD::VANY_ZERO", SDT_MipsVecCond>;
+def MipsVSMax : SDNode<"MipsISD::VSMAX", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVSMin : SDNode<"MipsISD::VSMIN", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMax : SDNode<"MipsISD::VUMAX", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMin : SDNode<"MipsISD::VUMIN", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVNOR : SDNode<"MipsISD::VNOR", SDTIntBinOp,
+                      [SDNPCommutative, SDNPAssociative]>;
+def MipsVSHF : SDNode<"MipsISD::VSHF", SDT_VSHF>;
+def MipsSHF : SDNode<"MipsISD::SHF", SDT_SHF>;
+def MipsILVEV : SDNode<"MipsISD::ILVEV", SDT_ILV>;
+def MipsILVOD : SDNode<"MipsISD::ILVOD", SDT_ILV>;
+def MipsILVL  : SDNode<"MipsISD::ILVL",  SDT_ILV>;
+def MipsILVR  : SDNode<"MipsISD::ILVR",  SDT_ILV>;
+def MipsPCKEV : SDNode<"MipsISD::PCKEV", SDT_ILV>;
+def MipsPCKOD : SDNode<"MipsISD::PCKOD", SDT_ILV>;
+
+def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>;
+def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>;
+
+def MipsVExtractSExt : SDNode<"MipsISD::VEXTRACT_SEXT_ELT",
+    SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT",
+    SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+
+// Operands
+
+def uimm2 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+// The immediate of an LSA instruction needs special handling
+// as the encoded value should be subtracted by one.
+def uimm2LSAAsmOperand : AsmOperandClass {
+  let Name = "LSAImm";
+  let ParserMethod = "parseLSAImm";
+  let RenderMethod = "addImmOperands";
+}
+
+def LSAImm : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+  let EncoderMethod = "getLSAImmEncoding";
+  let DecoderMethod = "DecodeLSAImm";
+  let ParserMatchClass = uimm2LSAAsmOperand;
+}
+
+def uimm3 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def uimm4 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def uimm8 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def simm5 : Operand<i32>;
+
+def simm10 : Operand<i32>;
+
+def vsplat_uimm1 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm2 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm3 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm4 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm5 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm6 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm8 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_simm5 : Operand<vAny>;
+
+def vsplat_simm10 : Operand<vAny>;
+
+def immZExt2Lsa : ImmLeaf<i32, [{return isUInt<2>(Imm - 1);}]>;
+
+// Pattern fragments
+def vextract_sext_i8  : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i8)>;
+def vextract_sext_i16 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i16)>;
+def vextract_sext_i32 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i32)>;
+
+def vextract_zext_i8  : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i8)>;
+def vextract_zext_i16 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i16)>;
+def vextract_zext_i32 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i32)>;
+
+def vinsert_v16i8 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v16i8 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v8i16 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v8i16 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v4i32 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v4i32 (vector_insert node:$vec, node:$val, node:$idx))>;
+
+class vfsetcc_type<ValueType ResTy, ValueType OpTy, CondCode CC> :
+  PatFrag<(ops node:$lhs, node:$rhs),
+          (ResTy (vfsetcc (OpTy node:$lhs), (OpTy node:$rhs), CC))>;
+
+// ISD::SETFALSE cannot occur
+def vfsetoeq_v4f32 : vfsetcc_type<v4i32, v4f32, SETOEQ>;
+def vfsetoeq_v2f64 : vfsetcc_type<v2i64, v2f64, SETOEQ>;
+def vfsetoge_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGE>;
+def vfsetoge_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGE>;
+def vfsetogt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGT>;
+def vfsetogt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGT>;
+def vfsetole_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLE>;
+def vfsetole_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLE>;
+def vfsetolt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLT>;
+def vfsetolt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLT>;
+def vfsetone_v4f32 : vfsetcc_type<v4i32, v4f32, SETONE>;
+def vfsetone_v2f64 : vfsetcc_type<v2i64, v2f64, SETONE>;
+def vfsetord_v4f32 : vfsetcc_type<v4i32, v4f32, SETO>;
+def vfsetord_v2f64 : vfsetcc_type<v2i64, v2f64, SETO>;
+def vfsetun_v4f32  : vfsetcc_type<v4i32, v4f32, SETUO>;
+def vfsetun_v2f64  : vfsetcc_type<v2i64, v2f64, SETUO>;
+def vfsetueq_v4f32 : vfsetcc_type<v4i32, v4f32, SETUEQ>;
+def vfsetueq_v2f64 : vfsetcc_type<v2i64, v2f64, SETUEQ>;
+def vfsetuge_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGE>;
+def vfsetuge_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGE>;
+def vfsetugt_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGT>;
+def vfsetugt_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGT>;
+def vfsetule_v4f32 : vfsetcc_type<v4i32, v4f32, SETULE>;
+def vfsetule_v2f64 : vfsetcc_type<v2i64, v2f64, SETULE>;
+def vfsetult_v4f32 : vfsetcc_type<v4i32, v4f32, SETULT>;
+def vfsetult_v2f64 : vfsetcc_type<v2i64, v2f64, SETULT>;
+def vfsetune_v4f32 : vfsetcc_type<v4i32, v4f32, SETUNE>;
+def vfsetune_v2f64 : vfsetcc_type<v2i64, v2f64, SETUNE>;
+// ISD::SETTRUE cannot occur
+// ISD::SETFALSE2 cannot occur
+// ISD::SETTRUE2 cannot occur
+
+class vsetcc_type<ValueType ResTy, CondCode CC> :
+  PatFrag<(ops node:$lhs, node:$rhs),
+          (ResTy (vsetcc node:$lhs, node:$rhs, CC))>;
+
+def vseteq_v16i8  : vsetcc_type<v16i8, SETEQ>;
+def vseteq_v8i16  : vsetcc_type<v8i16, SETEQ>;
+def vseteq_v4i32  : vsetcc_type<v4i32, SETEQ>;
+def vseteq_v2i64  : vsetcc_type<v2i64, SETEQ>;
+def vsetle_v16i8  : vsetcc_type<v16i8, SETLE>;
+def vsetle_v8i16  : vsetcc_type<v8i16, SETLE>;
+def vsetle_v4i32  : vsetcc_type<v4i32, SETLE>;
+def vsetle_v2i64  : vsetcc_type<v2i64, SETLE>;
+def vsetlt_v16i8  : vsetcc_type<v16i8, SETLT>;
+def vsetlt_v8i16  : vsetcc_type<v8i16, SETLT>;
+def vsetlt_v4i32  : vsetcc_type<v4i32, SETLT>;
+def vsetlt_v2i64  : vsetcc_type<v2i64, SETLT>;
+def vsetule_v16i8 : vsetcc_type<v16i8, SETULE>;
+def vsetule_v8i16 : vsetcc_type<v8i16, SETULE>;
+def vsetule_v4i32 : vsetcc_type<v4i32, SETULE>;
+def vsetule_v2i64 : vsetcc_type<v2i64, SETULE>;
+def vsetult_v16i8 : vsetcc_type<v16i8, SETULT>;
+def vsetult_v8i16 : vsetcc_type<v8i16, SETULT>;
+def vsetult_v4i32 : vsetcc_type<v4i32, SETULT>;
+def vsetult_v2i64 : vsetcc_type<v2i64, SETULT>;
+
+def vsplati8  : PatFrag<(ops node:$e0),
+                        (v16i8 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati16 : PatFrag<(ops node:$e0),
+                        (v8i16 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati32 : PatFrag<(ops node:$e0),
+                        (v4i32 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati64 : PatFrag<(ops node:$e0),
+                        (v2i64 (build_vector:$v0 node:$e0, node:$e0))>;
+def vsplatf32 : PatFrag<(ops node:$e0),
+                        (v4f32 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplatf64 : PatFrag<(ops node:$e0),
+                        (v2f64 (build_vector node:$e0, node:$e0))>;
+
+def vsplati8_elt  : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati8 node:$i), node:$v, node:$v)>;
+def vsplati16_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati16 node:$i), node:$v, node:$v)>;
+def vsplati32_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati32 node:$i), node:$v, node:$v)>;
+def vsplati64_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati64 node:$i), node:$v, node:$v)>;
+
+class SplatPatLeaf<Operand opclass, dag frag, code pred = [{}],
+                   SDNodeXForm xform = NOOP_SDNodeXForm>
+  : PatLeaf<frag, pred, xform> {
+  Operand OpClass = opclass;
+}
+
+class SplatComplexPattern<Operand opclass, ValueType ty, int numops, string fn,
+                          list<SDNode> roots = [],
+                          list<SDNodeProperty> props = []> :
+  ComplexPattern<ty, numops, fn, roots, props> {
+  Operand OpClass = opclass;
+}
+
+def vsplati8_uimm3 : SplatComplexPattern<vsplat_uimm3, v16i8, 1,
+                                         "selectVSplatUimm3",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm4 : SplatComplexPattern<vsplat_uimm4, v16i8, 1,
+                                         "selectVSplatUimm4",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm5 : SplatComplexPattern<vsplat_uimm5, v16i8, 1,
+                                         "selectVSplatUimm5",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm8 : SplatComplexPattern<vsplat_uimm8, v16i8, 1,
+                                         "selectVSplatUimm8",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_simm5 : SplatComplexPattern<vsplat_simm5, v16i8, 1,
+                                         "selectVSplatSimm5",
+                                         [build_vector, bitconvert]>;
+
+def vsplati16_uimm3 : SplatComplexPattern<vsplat_uimm3, v8i16, 1,
+                                          "selectVSplatUimm3",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_uimm4 : SplatComplexPattern<vsplat_uimm4, v8i16, 1,
+                                          "selectVSplatUimm4",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_uimm5 : SplatComplexPattern<vsplat_uimm5, v8i16, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_simm5 : SplatComplexPattern<vsplat_simm5, v8i16, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_uimm2 : SplatComplexPattern<vsplat_uimm2, v4i32, 1,
+                                          "selectVSplatUimm2",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_uimm5 : SplatComplexPattern<vsplat_uimm5, v4i32, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_simm5 : SplatComplexPattern<vsplat_simm5, v4i32, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm1 : SplatComplexPattern<vsplat_uimm1, v2i64, 1,
+                                          "selectVSplatUimm1",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm5 : SplatComplexPattern<vsplat_uimm5, v2i64, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm6 : SplatComplexPattern<vsplat_uimm6, v2i64, 1,
+                                          "selectVSplatUimm6",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_simm5 : SplatComplexPattern<vsplat_simm5, v2i64, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is an exact
+// power of 2
+def vsplat_uimm_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmPow2",
+                                      [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is the bitwise
+// inverse of an exact power of 2
+def vsplat_uimm_inv_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmInvPow2",
+                                          [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of left-most bits set.
+def vsplat_maskl_bits : SplatComplexPattern<vsplat_uimm8, vAny, 1,
+                                            "selectVSplatMaskL",
+                                            [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of right-most bits set.
+def vsplat_maskr_bits : SplatComplexPattern<vsplat_uimm8, vAny, 1,
+                                            "selectVSplatMaskR",
+                                            [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that equals 1
+// FIXME: These should be a ComplexPattern but we can't use them because the
+//        ISel generator requires the uses to have a name, but providing a name
+//        causes other errors ("used in pattern but not operand list")
+def vsplat_imm_eq_1 : PatLeaf<(build_vector), [{
+  APInt Imm;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  return selectVSplat (N, Imm) &&
+         Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
+def vsplati64_imm_eq_1 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
+  APInt Imm;
+  SDNode *BV = N->getOperand(0).getNode();
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  return selectVSplat (BV, Imm) &&
+         Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
+def vbclr_b : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_h : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_w : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_d : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl (v2i64 vsplati64_imm_eq_1),
+                                               node:$wt),
+                                          (bitconvert (v4i32 immAllOnesV))))>;
+
+def vbneg_b : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_h : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_w : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_d : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+                                          node:$wt))>;
+
+def vbset_b : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_h : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_w : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_d : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+                                         node:$wt))>;
+
+def fms : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                  (fsub node:$wd, (fmul node:$ws, node:$wt))>;
+
+def muladd : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                     (add node:$wd, (mul node:$ws, node:$wt))>;
+
+def mulsub : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                     (sub node:$wd, (mul node:$ws, node:$wt))>;
+
+def mul_fexp2 : PatFrag<(ops node:$ws, node:$wt),
+                        (fmul node:$ws, (fexp2 node:$wt))>;
+
+// Immediates
+def immSExt5 : ImmLeaf<i32, [{return isInt<5>(Imm);}]>;
+def immSExt10: ImmLeaf<i32, [{return isInt<10>(Imm);}]>;
+
+// Instruction encoding.
+class ADD_A_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010000>;
+class ADD_A_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010000>;
+class ADD_A_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010000>;
+class ADD_A_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010000>;
+
+class ADDS_A_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010000>;
+class ADDS_A_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010000>;
+class ADDS_A_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010000>;
+class ADDS_A_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010000>;
+
+class ADDS_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010000>;
+class ADDS_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010000>;
+class ADDS_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010000>;
+class ADDS_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010000>;
+
+class ADDS_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010000>;
+class ADDS_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010000>;
+class ADDS_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010000>;
+class ADDS_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010000>;
+
+class ADDV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001110>;
+class ADDV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001110>;
+class ADDV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001110>;
+class ADDV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001110>;
+
+class ADDVI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000110>;
+class ADDVI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000110>;
+class ADDVI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000110>;
+class ADDVI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000110>;
+
+class AND_V_ENC : MSA_VEC_FMT<0b00000, 0b011110>;
+
+class ANDI_B_ENC : MSA_I8_FMT<0b00, 0b000000>;
+
+class ASUB_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010001>;
+class ASUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010001>;
+class ASUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010001>;
+class ASUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010001>;
+
+class ASUB_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010001>;
+class ASUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010001>;
+class ASUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010001>;
+class ASUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010001>;
+
+class AVE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010000>;
+class AVE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010000>;
+class AVE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010000>;
+class AVE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010000>;
+
+class AVE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010000>;
+class AVE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010000>;
+class AVE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010000>;
+class AVE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010000>;
+
+class AVER_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010000>;
+class AVER_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010000>;
+class AVER_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010000>;
+class AVER_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010000>;
+
+class AVER_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010000>;
+class AVER_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010000>;
+class AVER_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010000>;
+class AVER_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010000>;
+
+class BCLR_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001101>;
+class BCLR_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001101>;
+class BCLR_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001101>;
+class BCLR_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001101>;
+
+class BCLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001001>;
+class BCLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001001>;
+class BCLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001001>;
+class BCLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001001>;
+
+class BINSL_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001101>;
+class BINSL_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001101>;
+class BINSL_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001101>;
+class BINSL_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001101>;
+
+class BINSLI_B_ENC : MSA_BIT_B_FMT<0b110, 0b001001>;
+class BINSLI_H_ENC : MSA_BIT_H_FMT<0b110, 0b001001>;
+class BINSLI_W_ENC : MSA_BIT_W_FMT<0b110, 0b001001>;
+class BINSLI_D_ENC : MSA_BIT_D_FMT<0b110, 0b001001>;
+
+class BINSR_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001101>;
+class BINSR_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001101>;
+class BINSR_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001101>;
+class BINSR_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001101>;
+
+class BINSRI_B_ENC : MSA_BIT_B_FMT<0b111, 0b001001>;
+class BINSRI_H_ENC : MSA_BIT_H_FMT<0b111, 0b001001>;
+class BINSRI_W_ENC : MSA_BIT_W_FMT<0b111, 0b001001>;
+class BINSRI_D_ENC : MSA_BIT_D_FMT<0b111, 0b001001>;
+
+class BMNZ_V_ENC : MSA_VEC_FMT<0b00100, 0b011110>;
+
+class BMNZI_B_ENC : MSA_I8_FMT<0b00, 0b000001>;
+
+class BMZ_V_ENC : MSA_VEC_FMT<0b00101, 0b011110>;
+
+class BMZI_B_ENC : MSA_I8_FMT<0b01, 0b000001>;
+
+class BNEG_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001101>;
+class BNEG_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001101>;
+class BNEG_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001101>;
+class BNEG_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001101>;
+
+class BNEGI_B_ENC : MSA_BIT_B_FMT<0b101, 0b001001>;
+class BNEGI_H_ENC : MSA_BIT_H_FMT<0b101, 0b001001>;
+class BNEGI_W_ENC : MSA_BIT_W_FMT<0b101, 0b001001>;
+class BNEGI_D_ENC : MSA_BIT_D_FMT<0b101, 0b001001>;
+
+class BNZ_B_ENC : MSA_CBRANCH_FMT<0b111, 0b00>;
+class BNZ_H_ENC : MSA_CBRANCH_FMT<0b111, 0b01>;
+class BNZ_W_ENC : MSA_CBRANCH_FMT<0b111, 0b10>;
+class BNZ_D_ENC : MSA_CBRANCH_FMT<0b111, 0b11>;
+
+class BNZ_V_ENC : MSA_CBRANCH_V_FMT<0b01111>;
+
+class BSEL_V_ENC : MSA_VEC_FMT<0b00110, 0b011110>;
+
+class BSELI_B_ENC : MSA_I8_FMT<0b10, 0b000001>;
+
+class BSET_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001101>;
+class BSET_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001101>;
+class BSET_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001101>;
+class BSET_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001101>;
+
+class BSETI_B_ENC : MSA_BIT_B_FMT<0b100, 0b001001>;
+class BSETI_H_ENC : MSA_BIT_H_FMT<0b100, 0b001001>;
+class BSETI_W_ENC : MSA_BIT_W_FMT<0b100, 0b001001>;
+class BSETI_D_ENC : MSA_BIT_D_FMT<0b100, 0b001001>;
+
+class BZ_B_ENC : MSA_CBRANCH_FMT<0b110, 0b00>;
+class BZ_H_ENC : MSA_CBRANCH_FMT<0b110, 0b01>;
+class BZ_W_ENC : MSA_CBRANCH_FMT<0b110, 0b10>;
+class BZ_D_ENC : MSA_CBRANCH_FMT<0b110, 0b11>;
+
+class BZ_V_ENC : MSA_CBRANCH_V_FMT<0b01011>;
+
+class CEQ_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001111>;
+class CEQ_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001111>;
+class CEQ_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001111>;
+class CEQ_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001111>;
+
+class CEQI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000111>;
+class CEQI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000111>;
+class CEQI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000111>;
+class CEQI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000111>;
+
+class CFCMSA_ENC : MSA_ELM_CFCMSA_FMT<0b0001111110, 0b011001>;
+
+class CLE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001111>;
+class CLE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001111>;
+class CLE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001111>;
+class CLE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001111>;
+
+class CLE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001111>;
+class CLE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001111>;
+class CLE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001111>;
+class CLE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001111>;
+
+class CLEI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000111>;
+class CLEI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000111>;
+class CLEI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000111>;
+class CLEI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000111>;
+
+class CLEI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000111>;
+class CLEI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000111>;
+class CLEI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000111>;
+class CLEI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000111>;
+
+class CLT_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001111>;
+class CLT_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001111>;
+class CLT_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001111>;
+class CLT_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001111>;
+
+class CLT_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001111>;
+class CLT_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001111>;
+class CLT_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001111>;
+class CLT_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001111>;
+
+class CLTI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000111>;
+class CLTI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000111>;
+class CLTI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000111>;
+class CLTI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000111>;
+
+class CLTI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000111>;
+class CLTI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000111>;
+class CLTI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000111>;
+class CLTI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000111>;
+
+class COPY_S_B_ENC : MSA_ELM_COPY_B_FMT<0b0010, 0b011001>;
+class COPY_S_H_ENC : MSA_ELM_COPY_H_FMT<0b0010, 0b011001>;
+class COPY_S_W_ENC : MSA_ELM_COPY_W_FMT<0b0010, 0b011001>;
+
+class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>;
+class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>;
+class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>;
+
+class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>;
+
+class DIV_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010010>;
+class DIV_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010010>;
+class DIV_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010010>;
+class DIV_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010010>;
+
+class DIV_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010010>;
+class DIV_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010010>;
+class DIV_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010010>;
+class DIV_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010010>;
+
+class DOTP_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010011>;
+class DOTP_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010011>;
+class DOTP_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010011>;
+
+class DOTP_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010011>;
+class DOTP_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010011>;
+class DOTP_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010011>;
+
+class DPADD_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010011>;
+class DPADD_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010011>;
+class DPADD_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010011>;
+
+class DPADD_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010011>;
+class DPADD_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010011>;
+class DPADD_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010011>;
+
+class DPSUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010011>;
+class DPSUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010011>;
+class DPSUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010011>;
+
+class DPSUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010011>;
+class DPSUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010011>;
+class DPSUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010011>;
+
+class FADD_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011011>;
+class FADD_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011011>;
+
+class FCAF_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011010>;
+class FCAF_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011010>;
+
+class FCEQ_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011010>;
+class FCEQ_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011010>;
+
+class FCLASS_W_ENC : MSA_2RF_FMT<0b110010000, 0b0, 0b011110>;
+class FCLASS_D_ENC : MSA_2RF_FMT<0b110010000, 0b1, 0b011110>;
+
+class FCLE_W_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011010>;
+class FCLE_D_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011010>;
+
+class FCLT_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011010>;
+class FCLT_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011010>;
+
+class FCNE_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011100>;
+class FCNE_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011100>;
+
+class FCOR_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011100>;
+class FCOR_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011100>;
+
+class FCUEQ_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011010>;
+class FCUEQ_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011010>;
+
+class FCULE_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011010>;
+class FCULE_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011010>;
+
+class FCULT_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011010>;
+class FCULT_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011010>;
+
+class FCUN_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011010>;
+class FCUN_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011010>;
+
+class FCUNE_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011100>;
+class FCUNE_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011100>;
+
+class FDIV_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011011>;
+class FDIV_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011011>;
+
+class FEXDO_H_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011011>;
+class FEXDO_W_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011011>;
+
+class FEXP2_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011011>;
+class FEXP2_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011011>;
+
+class FEXUPL_W_ENC : MSA_2RF_FMT<0b110011000, 0b0, 0b011110>;
+class FEXUPL_D_ENC : MSA_2RF_FMT<0b110011000, 0b1, 0b011110>;
+
+class FEXUPR_W_ENC : MSA_2RF_FMT<0b110011001, 0b0, 0b011110>;
+class FEXUPR_D_ENC : MSA_2RF_FMT<0b110011001, 0b1, 0b011110>;
+
+class FFINT_S_W_ENC : MSA_2RF_FMT<0b110011110, 0b0, 0b011110>;
+class FFINT_S_D_ENC : MSA_2RF_FMT<0b110011110, 0b1, 0b011110>;
+
+class FFINT_U_W_ENC : MSA_2RF_FMT<0b110011111, 0b0, 0b011110>;
+class FFINT_U_D_ENC : MSA_2RF_FMT<0b110011111, 0b1, 0b011110>;
+
+class FFQL_W_ENC : MSA_2RF_FMT<0b110011010, 0b0, 0b011110>;
+class FFQL_D_ENC : MSA_2RF_FMT<0b110011010, 0b1, 0b011110>;
+
+class FFQR_W_ENC : MSA_2RF_FMT<0b110011011, 0b0, 0b011110>;
+class FFQR_D_ENC : MSA_2RF_FMT<0b110011011, 0b1, 0b011110>;
+
+class FILL_B_ENC : MSA_2R_FILL_FMT<0b11000000, 0b00, 0b011110>;
+class FILL_H_ENC : MSA_2R_FILL_FMT<0b11000000, 0b01, 0b011110>;
+class FILL_W_ENC : MSA_2R_FILL_FMT<0b11000000, 0b10, 0b011110>;
+
+class FLOG2_W_ENC : MSA_2RF_FMT<0b110010111, 0b0, 0b011110>;
+class FLOG2_D_ENC : MSA_2RF_FMT<0b110010111, 0b1, 0b011110>;
+
+class FMADD_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011011>;
+class FMADD_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011011>;
+
+class FMAX_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011011>;
+class FMAX_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011011>;
+
+class FMAX_A_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011011>;
+class FMAX_A_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011011>;
+
+class FMIN_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011011>;
+class FMIN_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011011>;
+
+class FMIN_A_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011011>;
+class FMIN_A_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011011>;
+
+class FMSUB_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011011>;
+class FMSUB_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011011>;
+
+class FMUL_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011011>;
+class FMUL_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011011>;
+
+class FRINT_W_ENC : MSA_2RF_FMT<0b110010110, 0b0, 0b011110>;
+class FRINT_D_ENC : MSA_2RF_FMT<0b110010110, 0b1, 0b011110>;
+
+class FRCP_W_ENC : MSA_2RF_FMT<0b110010101, 0b0, 0b011110>;
+class FRCP_D_ENC : MSA_2RF_FMT<0b110010101, 0b1, 0b011110>;
+
+class FRSQRT_W_ENC : MSA_2RF_FMT<0b110010100, 0b0, 0b011110>;
+class FRSQRT_D_ENC : MSA_2RF_FMT<0b110010100, 0b1, 0b011110>;
+
+class FSAF_W_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011010>;
+class FSAF_D_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011010>;
+
+class FSEQ_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011010>;
+class FSEQ_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011010>;
+
+class FSLE_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011010>;
+class FSLE_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011010>;
+
+class FSLT_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011010>;
+class FSLT_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011010>;
+
+class FSNE_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011100>;
+class FSNE_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011100>;
+
+class FSOR_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011100>;
+class FSOR_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011100>;
+
+class FSQRT_W_ENC : MSA_2RF_FMT<0b110010011, 0b0, 0b011110>;
+class FSQRT_D_ENC : MSA_2RF_FMT<0b110010011, 0b1, 0b011110>;
+
+class FSUB_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011011>;
+class FSUB_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011011>;
+
+class FSUEQ_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011010>;
+class FSUEQ_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011010>;
+
+class FSULE_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011010>;
+class FSULE_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011010>;
+
+class FSULT_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011010>;
+class FSULT_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011010>;
+
+class FSUN_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011010>;
+class FSUN_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011010>;
+
+class FSUNE_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011100>;
+class FSUNE_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011100>;
+
+class FTINT_S_W_ENC : MSA_2RF_FMT<0b110011100, 0b0, 0b011110>;
+class FTINT_S_D_ENC : MSA_2RF_FMT<0b110011100, 0b1, 0b011110>;
+
+class FTINT_U_W_ENC : MSA_2RF_FMT<0b110011101, 0b0, 0b011110>;
+class FTINT_U_D_ENC : MSA_2RF_FMT<0b110011101, 0b1, 0b011110>;
+
+class FTQ_H_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011011>;
+class FTQ_W_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011011>;
+
+class FTRUNC_S_W_ENC : MSA_2RF_FMT<0b110010001, 0b0, 0b011110>;
+class FTRUNC_S_D_ENC : MSA_2RF_FMT<0b110010001, 0b1, 0b011110>;
+
+class FTRUNC_U_W_ENC : MSA_2RF_FMT<0b110010010, 0b0, 0b011110>;
+class FTRUNC_U_D_ENC : MSA_2RF_FMT<0b110010010, 0b1, 0b011110>;
+
+class HADD_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010101>;
+class HADD_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010101>;
+class HADD_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010101>;
+
+class HADD_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010101>;
+class HADD_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010101>;
+class HADD_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010101>;
+
+class HSUB_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010101>;
+class HSUB_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010101>;
+class HSUB_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010101>;
+
+class HSUB_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010101>;
+class HSUB_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010101>;
+class HSUB_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010101>;
+
+class ILVEV_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010100>;
+class ILVEV_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010100>;
+class ILVEV_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010100>;
+class ILVEV_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010100>;
+
+class ILVL_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010100>;
+class ILVL_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010100>;
+class ILVL_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010100>;
+class ILVL_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010100>;
+
+class ILVOD_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010100>;
+class ILVOD_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010100>;
+class ILVOD_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010100>;
+class ILVOD_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010100>;
+
+class ILVR_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010100>;
+class ILVR_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010100>;
+class ILVR_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010100>;
+class ILVR_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010100>;
+
+class INSERT_B_ENC : MSA_ELM_INSERT_B_FMT<0b0100, 0b011001>;
+class INSERT_H_ENC : MSA_ELM_INSERT_H_FMT<0b0100, 0b011001>;
+class INSERT_W_ENC : MSA_ELM_INSERT_W_FMT<0b0100, 0b011001>;
+
+class INSVE_B_ENC : MSA_ELM_B_FMT<0b0101, 0b011001>;
+class INSVE_H_ENC : MSA_ELM_H_FMT<0b0101, 0b011001>;
+class INSVE_W_ENC : MSA_ELM_W_FMT<0b0101, 0b011001>;
+class INSVE_D_ENC : MSA_ELM_D_FMT<0b0101, 0b011001>;
+
+class LD_B_ENC   : MSA_MI10_FMT<0b00, 0b1000>;
+class LD_H_ENC   : MSA_MI10_FMT<0b01, 0b1000>;
+class LD_W_ENC   : MSA_MI10_FMT<0b10, 0b1000>;
+class LD_D_ENC   : MSA_MI10_FMT<0b11, 0b1000>;
+
+class LDI_B_ENC  : MSA_I10_FMT<0b110, 0b00, 0b000111>;
+class LDI_H_ENC  : MSA_I10_FMT<0b110, 0b01, 0b000111>;
+class LDI_W_ENC  : MSA_I10_FMT<0b110, 0b10, 0b000111>;
+class LDI_D_ENC  : MSA_I10_FMT<0b110, 0b11, 0b000111>;
+
+class LSA_ENC : SPECIAL_LSA_FMT<0b000101>;
+
+class MADD_Q_H_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011100>;
+class MADD_Q_W_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011100>;
+
+class MADDR_Q_H_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011100>;
+class MADDR_Q_W_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011100>;
+
+class MADDV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010010>;
+class MADDV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010010>;
+class MADDV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010010>;
+class MADDV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010010>;
+
+class MAX_A_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001110>;
+class MAX_A_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001110>;
+class MAX_A_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001110>;
+class MAX_A_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001110>;
+
+class MAX_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001110>;
+class MAX_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001110>;
+class MAX_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001110>;
+class MAX_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001110>;
+
+class MAX_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001110>;
+class MAX_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001110>;
+class MAX_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001110>;
+class MAX_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001110>;
+
+class MAXI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000110>;
+class MAXI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000110>;
+class MAXI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000110>;
+class MAXI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000110>;
+
+class MAXI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000110>;
+class MAXI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000110>;
+class MAXI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000110>;
+class MAXI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000110>;
+
+class MIN_A_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001110>;
+class MIN_A_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001110>;
+class MIN_A_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001110>;
+class MIN_A_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001110>;
+
+class MIN_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001110>;
+class MIN_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001110>;
+class MIN_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001110>;
+class MIN_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001110>;
+
+class MIN_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001110>;
+class MIN_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001110>;
+class MIN_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001110>;
+class MIN_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001110>;
+
+class MINI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000110>;
+class MINI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000110>;
+class MINI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000110>;
+class MINI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000110>;
+
+class MINI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000110>;
+class MINI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000110>;
+class MINI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000110>;
+class MINI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000110>;
+
+class MOD_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010010>;
+class MOD_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010010>;
+class MOD_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010010>;
+class MOD_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010010>;
+
+class MOD_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010010>;
+class MOD_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010010>;
+class MOD_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010010>;
+class MOD_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010010>;
+
+class MOVE_V_ENC : MSA_ELM_FMT<0b0010111110, 0b011001>;
+
+class MSUB_Q_H_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011100>;
+class MSUB_Q_W_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011100>;
+
+class MSUBR_Q_H_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011100>;
+class MSUBR_Q_W_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011100>;
+
+class MSUBV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010010>;
+class MSUBV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010010>;
+class MSUBV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010010>;
+class MSUBV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010010>;
+
+class MUL_Q_H_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011100>;
+class MUL_Q_W_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011100>;
+
+class MULR_Q_H_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011100>;
+class MULR_Q_W_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011100>;
+
+class MULV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010010>;
+class MULV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010010>;
+class MULV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010010>;
+class MULV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010010>;
+
+class NLOC_B_ENC : MSA_2R_FMT<0b11000010, 0b00, 0b011110>;
+class NLOC_H_ENC : MSA_2R_FMT<0b11000010, 0b01, 0b011110>;
+class NLOC_W_ENC : MSA_2R_FMT<0b11000010, 0b10, 0b011110>;
+class NLOC_D_ENC : MSA_2R_FMT<0b11000010, 0b11, 0b011110>;
+
+class NLZC_B_ENC : MSA_2R_FMT<0b11000011, 0b00, 0b011110>;
+class NLZC_H_ENC : MSA_2R_FMT<0b11000011, 0b01, 0b011110>;
+class NLZC_W_ENC : MSA_2R_FMT<0b11000011, 0b10, 0b011110>;
+class NLZC_D_ENC : MSA_2R_FMT<0b11000011, 0b11, 0b011110>;
+
+class NOR_V_ENC : MSA_VEC_FMT<0b00010, 0b011110>;
+
+class NORI_B_ENC : MSA_I8_FMT<0b10, 0b000000>;
+
+class OR_V_ENC : MSA_VEC_FMT<0b00001, 0b011110>;
+
+class ORI_B_ENC  : MSA_I8_FMT<0b01, 0b000000>;
+
+class PCKEV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010100>;
+class PCKEV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010100>;
+class PCKEV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010100>;
+class PCKEV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010100>;
+
+class PCKOD_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010100>;
+class PCKOD_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010100>;
+class PCKOD_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010100>;
+class PCKOD_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010100>;
+
+class PCNT_B_ENC : MSA_2R_FMT<0b11000001, 0b00, 0b011110>;
+class PCNT_H_ENC : MSA_2R_FMT<0b11000001, 0b01, 0b011110>;
+class PCNT_W_ENC : MSA_2R_FMT<0b11000001, 0b10, 0b011110>;
+class PCNT_D_ENC : MSA_2R_FMT<0b11000001, 0b11, 0b011110>;
+
+class SAT_S_B_ENC : MSA_BIT_B_FMT<0b000, 0b001010>;
+class SAT_S_H_ENC : MSA_BIT_H_FMT<0b000, 0b001010>;
+class SAT_S_W_ENC : MSA_BIT_W_FMT<0b000, 0b001010>;
+class SAT_S_D_ENC : MSA_BIT_D_FMT<0b000, 0b001010>;
+
+class SAT_U_B_ENC : MSA_BIT_B_FMT<0b001, 0b001010>;
+class SAT_U_H_ENC : MSA_BIT_H_FMT<0b001, 0b001010>;
+class SAT_U_W_ENC : MSA_BIT_W_FMT<0b001, 0b001010>;
+class SAT_U_D_ENC : MSA_BIT_D_FMT<0b001, 0b001010>;
+
+class SHF_B_ENC  : MSA_I8_FMT<0b00, 0b000010>;
+class SHF_H_ENC  : MSA_I8_FMT<0b01, 0b000010>;
+class SHF_W_ENC  : MSA_I8_FMT<0b10, 0b000010>;
+
+class SLD_B_ENC : MSA_3R_INDEX_FMT<0b000, 0b00, 0b010100>;
+class SLD_H_ENC : MSA_3R_INDEX_FMT<0b000, 0b01, 0b010100>;
+class SLD_W_ENC : MSA_3R_INDEX_FMT<0b000, 0b10, 0b010100>;
+class SLD_D_ENC : MSA_3R_INDEX_FMT<0b000, 0b11, 0b010100>;
+
+class SLDI_B_ENC : MSA_ELM_B_FMT<0b0000, 0b011001>;
+class SLDI_H_ENC : MSA_ELM_H_FMT<0b0000, 0b011001>;
+class SLDI_W_ENC : MSA_ELM_W_FMT<0b0000, 0b011001>;
+class SLDI_D_ENC : MSA_ELM_D_FMT<0b0000, 0b011001>;
+
+class SLL_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001101>;
+class SLL_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001101>;
+class SLL_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001101>;
+class SLL_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001101>;
+
+class SLLI_B_ENC : MSA_BIT_B_FMT<0b000, 0b001001>;
+class SLLI_H_ENC : MSA_BIT_H_FMT<0b000, 0b001001>;
+class SLLI_W_ENC : MSA_BIT_W_FMT<0b000, 0b001001>;
+class SLLI_D_ENC : MSA_BIT_D_FMT<0b000, 0b001001>;
+
+class SPLAT_B_ENC : MSA_3R_INDEX_FMT<0b001, 0b00, 0b010100>;
+class SPLAT_H_ENC : MSA_3R_INDEX_FMT<0b001, 0b01, 0b010100>;
+class SPLAT_W_ENC : MSA_3R_INDEX_FMT<0b001, 0b10, 0b010100>;
+class SPLAT_D_ENC : MSA_3R_INDEX_FMT<0b001, 0b11, 0b010100>;
+
+class SPLATI_B_ENC : MSA_ELM_B_FMT<0b0001, 0b011001>;
+class SPLATI_H_ENC : MSA_ELM_H_FMT<0b0001, 0b011001>;
+class SPLATI_W_ENC : MSA_ELM_W_FMT<0b0001, 0b011001>;
+class SPLATI_D_ENC : MSA_ELM_D_FMT<0b0001, 0b011001>;
+
+class SRA_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001101>;
+class SRA_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001101>;
+class SRA_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001101>;
+class SRA_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001101>;
+
+class SRAI_B_ENC : MSA_BIT_B_FMT<0b001, 0b001001>;
+class SRAI_H_ENC : MSA_BIT_H_FMT<0b001, 0b001001>;
+class SRAI_W_ENC : MSA_BIT_W_FMT<0b001, 0b001001>;
+class SRAI_D_ENC : MSA_BIT_D_FMT<0b001, 0b001001>;
+
+class SRAR_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010101>;
+class SRAR_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010101>;
+class SRAR_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010101>;
+class SRAR_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010101>;
+
+class SRARI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001010>;
+class SRARI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001010>;
+class SRARI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001010>;
+class SRARI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001010>;
+
+class SRL_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001101>;
+class SRL_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001101>;
+class SRL_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001101>;
+class SRL_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001101>;
+
+class SRLI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001001>;
+class SRLI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001001>;
+class SRLI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001001>;
+class SRLI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001001>;
+
+class SRLR_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010101>;
+class SRLR_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010101>;
+class SRLR_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010101>;
+class SRLR_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010101>;
+
+class SRLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001010>;
+class SRLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001010>;
+class SRLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001010>;
+class SRLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001010>;
+
+class ST_B_ENC   : MSA_MI10_FMT<0b00, 0b1001>;
+class ST_H_ENC   : MSA_MI10_FMT<0b01, 0b1001>;
+class ST_W_ENC   : MSA_MI10_FMT<0b10, 0b1001>;
+class ST_D_ENC   : MSA_MI10_FMT<0b11, 0b1001>;
+
+class SUBS_S_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010001>;
+class SUBS_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010001>;
+class SUBS_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010001>;
+class SUBS_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010001>;
+
+class SUBS_U_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010001>;
+class SUBS_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010001>;
+class SUBS_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010001>;
+class SUBS_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010001>;
+
+class SUBSUS_U_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010001>;
+class SUBSUS_U_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010001>;
+class SUBSUS_U_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010001>;
+class SUBSUS_U_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010001>;
+
+class SUBSUU_S_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010001>;
+class SUBSUU_S_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010001>;
+class SUBSUU_S_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010001>;
+class SUBSUU_S_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010001>;
+
+class SUBV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001110>;
+class SUBV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001110>;
+class SUBV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001110>;
+class SUBV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001110>;
+
+class SUBVI_B_ENC : MSA_I5_FMT<0b001, 0b00, 0b000110>;
+class SUBVI_H_ENC : MSA_I5_FMT<0b001, 0b01, 0b000110>;
+class SUBVI_W_ENC : MSA_I5_FMT<0b001, 0b10, 0b000110>;
+class SUBVI_D_ENC : MSA_I5_FMT<0b001, 0b11, 0b000110>;
+
+class VSHF_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010101>;
+class VSHF_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010101>;
+class VSHF_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010101>;
+class VSHF_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010101>;
+
+class XOR_V_ENC : MSA_VEC_FMT<0b00011, 0b011110>;
+
+class XORI_B_ENC : MSA_I8_FMT<0b11, 0b000000>;
+
+// Instruction desc.
+class MSA_BIT_B_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm3:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_H_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm4:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm5:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_D_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm6:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_B_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm3:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt3:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_H_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm4:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_W_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm5:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt5:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_D_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm6:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt6:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_BINSXI_DESC_BASE<string instr_asm, ValueType Ty,
+                               ComplexPattern Mask, RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, vsplat_uimm8:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (vselect (Ty Mask:$m), (Ty ROWD:$wd_in),
+                                               ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_BIT_BINSLI_DESC_BASE<string instr_asm, ValueType Ty,
+                               RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> :
+  MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, vsplat_maskl_bits, ROWD, ROWS, itin>;
+
+class MSA_BIT_BINSRI_DESC_BASE<string instr_asm, ValueType Ty,
+                               RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> :
+  MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, vsplat_maskr_bits, ROWD, ROWS, itin>;
+
+class MSA_BIT_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                              SplatComplexPattern SplatImm,
+                              RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                              InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                         ValueType VecTy, RegisterOperand ROD,
+                         RegisterOperand ROWS,
+                         InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROWS:$ws, uimm4:$n);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $ws[$n]");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode (VecTy ROWS:$ws), immZExt4:$n))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_ELM_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm4:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$n))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
+                           RegisterClass RCD, RegisterClass RCWS> :
+      MipsPseudo<(outs RCD:$wd), (ins RCWS:$ws, uimm4:$n),
+                 [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), immZExt4:$n))]> {
+  bit usesCustomInserter = 1;
+}
+
+class MSA_I5_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       SplatComplexPattern SplatImm, RegisterOperand ROWD,
+                       RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $imm");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$imm))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I8_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       SplatComplexPattern SplatImm, RegisterOperand ROWD,
+                       RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$u8))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed in the next few patches
+class MSA_I8_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                         RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                         InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt8:$u8))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I8_SHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                           RegisterOperand ROWS = ROWD,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsSHF immZExt8:$u8, ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I10_LDI_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins vsplat_simm10:$s10);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $s10");
+  // LDI is matched using custom matching code in MipsSEISelDAGToDAG.cpp
+  list<dag> Pattern = [];
+  bit hasSideEffects = 0;
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_FILL_DESC_BASE<string instr_asm, ValueType VT,
+                            SDPatternOperator OpNode, RegisterOperand ROWD,
+                            RegisterOperand ROS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROS:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $rs");
+  list<dag> Pattern = [(set ROWD:$wd, (VT (OpNode ROS:$rs)))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_FILL_PSEUDO_BASE<ValueType VT, SDPatternOperator OpNode,
+                              RegisterClass RCWD, RegisterClass RCWS = RCWD> :
+      MipsPseudo<(outs RCWD:$wd), (ins RCWS:$fs),
+                 [(set RCWD:$wd, (OpNode RCWS:$fs))]> {
+  let usesCustomInserter = 1;
+}
+
+class MSA_2RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                       RegisterOperand ROWT = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_BINSX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                             RegisterOperand ROWT = ROWD,
+                             InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+                                              ROWT:$wt))];
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                             InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, GPR32:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_VSHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                            RegisterOperand ROWS = ROWD,
+                            RegisterOperand ROWT = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF ROWD:$wd_in, ROWS:$ws,
+                                                ROWT:$wt))];
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, GPR32:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_4R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                          RegisterOperand ROWT = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd,
+                       (OpNode ROWD:$wd_in, ROWS:$ws, ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_3RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        RegisterOperand ROWT = ROWD,
+                        InstrItinClass itin = NoItinerary> :
+  MSA_3R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+class MSA_3RF_4RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            RegisterOperand ROWT = ROWD,
+                            InstrItinClass itin = NoItinerary> :
+  MSA_3R_4R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins ROWD:$wt, brtarget:$offset);
+  string AsmString = !strconcat(instr_asm, "\t$wt, $offset");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = IIBranch;
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 1;
+  list<Register> Defs = [AT];
+}
+
+class MSA_INSERT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           RegisterOperand ROWD, RegisterOperand ROS,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROS:$rs, uimm6:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd[$n], $rs");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
+                                              ROS:$rs,
+                                              immZExt6:$n))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSERT_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
+                             RegisterOperand ROWD, RegisterOperand ROFS> :
+      MipsPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, uimm6:$n, ROFS:$fs),
+                 [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
+                                        immZExt6:$n))]> {
+  bit usesCustomInserter = 1;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws);
+  string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[0]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
+                                              immZExt6:$n,
+                                              ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_VEC_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        RegisterOperand ROWT = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_ELM_SPLAT_DESC_BASE<string instr_asm, SplatComplexPattern SplatImm,
+                              RegisterOperand ROWD,
+                              RegisterOperand ROWS = ROWD,
+                              InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF SplatImm:$n, ROWS:$ws,
+                                                ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_VEC_PSEUDO_BASE<SDPatternOperator OpNode, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          RegisterOperand ROWT = ROWD> :
+      MipsPseudo<(outs ROWD:$wd), (ins ROWS:$ws, ROWT:$wt),
+                 [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))]>;
+
+class ADD_A_B_DESC : MSA_3R_DESC_BASE<"add_a.b", int_mips_add_a_b, MSA128BOpnd>,
+                     IsCommutable;
+class ADD_A_H_DESC : MSA_3R_DESC_BASE<"add_a.h", int_mips_add_a_h, MSA128HOpnd>,
+                     IsCommutable;
+class ADD_A_W_DESC : MSA_3R_DESC_BASE<"add_a.w", int_mips_add_a_w, MSA128WOpnd>,
+                     IsCommutable;
+class ADD_A_D_DESC : MSA_3R_DESC_BASE<"add_a.d", int_mips_add_a_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class ADDS_A_B_DESC : MSA_3R_DESC_BASE<"adds_a.b", int_mips_adds_a_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_A_H_DESC : MSA_3R_DESC_BASE<"adds_a.h", int_mips_adds_a_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_A_W_DESC : MSA_3R_DESC_BASE<"adds_a.w", int_mips_adds_a_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_A_D_DESC : MSA_3R_DESC_BASE<"adds_a.d", int_mips_adds_a_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDS_S_B_DESC : MSA_3R_DESC_BASE<"adds_s.b", int_mips_adds_s_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_S_H_DESC : MSA_3R_DESC_BASE<"adds_s.h", int_mips_adds_s_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_S_W_DESC : MSA_3R_DESC_BASE<"adds_s.w", int_mips_adds_s_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_S_D_DESC : MSA_3R_DESC_BASE<"adds_s.d", int_mips_adds_s_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDS_U_B_DESC : MSA_3R_DESC_BASE<"adds_u.b", int_mips_adds_u_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_U_H_DESC : MSA_3R_DESC_BASE<"adds_u.h", int_mips_adds_u_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_U_W_DESC : MSA_3R_DESC_BASE<"adds_u.w", int_mips_adds_u_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_U_D_DESC : MSA_3R_DESC_BASE<"adds_u.d", int_mips_adds_u_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDV_B_DESC : MSA_3R_DESC_BASE<"addv.b", add, MSA128BOpnd>, IsCommutable;
+class ADDV_H_DESC : MSA_3R_DESC_BASE<"addv.h", add, MSA128HOpnd>, IsCommutable;
+class ADDV_W_DESC : MSA_3R_DESC_BASE<"addv.w", add, MSA128WOpnd>, IsCommutable;
+class ADDV_D_DESC : MSA_3R_DESC_BASE<"addv.d", add, MSA128DOpnd>, IsCommutable;
+
+class ADDVI_B_DESC : MSA_I5_DESC_BASE<"addvi.b", add, vsplati8_uimm5,
+                                      MSA128BOpnd>;
+class ADDVI_H_DESC : MSA_I5_DESC_BASE<"addvi.h", add, vsplati16_uimm5,
+                                      MSA128HOpnd>;
+class ADDVI_W_DESC : MSA_I5_DESC_BASE<"addvi.w", add, vsplati32_uimm5,
+                                      MSA128WOpnd>;
+class ADDVI_D_DESC : MSA_I5_DESC_BASE<"addvi.d", add, vsplati64_uimm5,
+                                      MSA128DOpnd>;
+
+class AND_V_DESC : MSA_VEC_DESC_BASE<"and.v", and, MSA128BOpnd>;
+class AND_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128HOpnd>;
+class AND_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128WOpnd>;
+class AND_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128DOpnd>;
+
+class ANDI_B_DESC : MSA_I8_DESC_BASE<"andi.b", and, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+class ASUB_S_B_DESC : MSA_3R_DESC_BASE<"asub_s.b", int_mips_asub_s_b,
+                                       MSA128BOpnd>;
+class ASUB_S_H_DESC : MSA_3R_DESC_BASE<"asub_s.h", int_mips_asub_s_h,
+                                       MSA128HOpnd>;
+class ASUB_S_W_DESC : MSA_3R_DESC_BASE<"asub_s.w", int_mips_asub_s_w,
+                                       MSA128WOpnd>;
+class ASUB_S_D_DESC : MSA_3R_DESC_BASE<"asub_s.d", int_mips_asub_s_d,
+                                       MSA128DOpnd>;
+
+class ASUB_U_B_DESC : MSA_3R_DESC_BASE<"asub_u.b", int_mips_asub_u_b,
+                                       MSA128BOpnd>;
+class ASUB_U_H_DESC : MSA_3R_DESC_BASE<"asub_u.h", int_mips_asub_u_h,
+                                       MSA128HOpnd>;
+class ASUB_U_W_DESC : MSA_3R_DESC_BASE<"asub_u.w", int_mips_asub_u_w,
+                                       MSA128WOpnd>;
+class ASUB_U_D_DESC : MSA_3R_DESC_BASE<"asub_u.d", int_mips_asub_u_d,
+                                       MSA128DOpnd>;
+
+class AVE_S_B_DESC : MSA_3R_DESC_BASE<"ave_s.b", int_mips_ave_s_b, MSA128BOpnd>,
+                     IsCommutable;
+class AVE_S_H_DESC : MSA_3R_DESC_BASE<"ave_s.h", int_mips_ave_s_h, MSA128HOpnd>,
+                     IsCommutable;
+class AVE_S_W_DESC : MSA_3R_DESC_BASE<"ave_s.w", int_mips_ave_s_w, MSA128WOpnd>,
+                     IsCommutable;
+class AVE_S_D_DESC : MSA_3R_DESC_BASE<"ave_s.d", int_mips_ave_s_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class AVE_U_B_DESC : MSA_3R_DESC_BASE<"ave_u.b", int_mips_ave_u_b, MSA128BOpnd>,
+                     IsCommutable;
+class AVE_U_H_DESC : MSA_3R_DESC_BASE<"ave_u.h", int_mips_ave_u_h, MSA128HOpnd>,
+                     IsCommutable;
+class AVE_U_W_DESC : MSA_3R_DESC_BASE<"ave_u.w", int_mips_ave_u_w, MSA128WOpnd>,
+                     IsCommutable;
+class AVE_U_D_DESC : MSA_3R_DESC_BASE<"ave_u.d", int_mips_ave_u_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class AVER_S_B_DESC : MSA_3R_DESC_BASE<"aver_s.b", int_mips_aver_s_b,
+                                       MSA128BOpnd>, IsCommutable;
+class AVER_S_H_DESC : MSA_3R_DESC_BASE<"aver_s.h", int_mips_aver_s_h,
+                                       MSA128HOpnd>, IsCommutable;
+class AVER_S_W_DESC : MSA_3R_DESC_BASE<"aver_s.w", int_mips_aver_s_w,
+                                       MSA128WOpnd>, IsCommutable;
+class AVER_S_D_DESC : MSA_3R_DESC_BASE<"aver_s.d", int_mips_aver_s_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class AVER_U_B_DESC : MSA_3R_DESC_BASE<"aver_u.b", int_mips_aver_u_b,
+                                       MSA128BOpnd>, IsCommutable;
+class AVER_U_H_DESC : MSA_3R_DESC_BASE<"aver_u.h", int_mips_aver_u_h,
+                                       MSA128HOpnd>, IsCommutable;
+class AVER_U_W_DESC : MSA_3R_DESC_BASE<"aver_u.w", int_mips_aver_u_w,
+                                       MSA128WOpnd>, IsCommutable;
+class AVER_U_D_DESC : MSA_3R_DESC_BASE<"aver_u.d", int_mips_aver_u_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class BCLR_B_DESC : MSA_3R_DESC_BASE<"bclr.b", vbclr_b, MSA128BOpnd>;
+class BCLR_H_DESC : MSA_3R_DESC_BASE<"bclr.h", vbclr_h, MSA128HOpnd>;
+class BCLR_W_DESC : MSA_3R_DESC_BASE<"bclr.w", vbclr_w, MSA128WOpnd>;
+class BCLR_D_DESC : MSA_3R_DESC_BASE<"bclr.d", vbclr_d, MSA128DOpnd>;
+
+class BCLRI_B_DESC : MSA_BIT_B_DESC_BASE<"bclri.b", and, vsplat_uimm_inv_pow2,
+                                         MSA128BOpnd>;
+class BCLRI_H_DESC : MSA_BIT_H_DESC_BASE<"bclri.h", and, vsplat_uimm_inv_pow2,
+                                         MSA128HOpnd>;
+class BCLRI_W_DESC : MSA_BIT_W_DESC_BASE<"bclri.w", and, vsplat_uimm_inv_pow2,
+                                         MSA128WOpnd>;
+class BCLRI_D_DESC : MSA_BIT_D_DESC_BASE<"bclri.d", and, vsplat_uimm_inv_pow2,
+                                         MSA128DOpnd>;
+
+class BINSL_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.b", int_mips_binsl_b,
+                                            MSA128BOpnd>;
+class BINSL_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.h", int_mips_binsl_h,
+                                            MSA128HOpnd>;
+class BINSL_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.w", int_mips_binsl_w,
+                                            MSA128WOpnd>;
+class BINSL_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.d", int_mips_binsl_d,
+                                            MSA128DOpnd>;
+
+class BINSLI_B_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.b", v16i8, MSA128BOpnd>;
+class BINSLI_H_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.h", v8i16, MSA128HOpnd>;
+class BINSLI_W_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.w", v4i32, MSA128WOpnd>;
+class BINSLI_D_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.d", v2i64, MSA128DOpnd>;
+
+class BINSR_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.b", int_mips_binsr_b,
+                                            MSA128BOpnd>;
+class BINSR_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.h", int_mips_binsr_h,
+                                            MSA128HOpnd>;
+class BINSR_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.w", int_mips_binsr_w,
+                                            MSA128WOpnd>;
+class BINSR_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.d", int_mips_binsr_d,
+                                            MSA128DOpnd>;
+
+class BINSRI_B_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.b", v16i8, MSA128BOpnd>;
+class BINSRI_H_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.h", v8i16, MSA128HOpnd>;
+class BINSRI_W_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.w", v4i32, MSA128WOpnd>;
+class BINSRI_D_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.d", v2i64, MSA128DOpnd>;
+
+class BMNZ_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bmnz.v\t$wd, $ws, $wt";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+                                                      MSA128BOpnd:$ws,
+                                                      MSA128BOpnd:$wd_in))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMNZI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bmnzi.b\t$wd, $ws, $u8";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+                                                      MSA128BOpnd:$ws,
+                                                      MSA128BOpnd:$wd_in))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMZ_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bmz.v\t$wd, $ws, $wt";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+                                                      MSA128BOpnd:$wd_in,
+                                                      MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMZI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bmzi.b\t$wd, $ws, $u8";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+                                                      MSA128BOpnd:$wd_in,
+                                                      MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BNEG_B_DESC : MSA_3R_DESC_BASE<"bneg.b", vbneg_b, MSA128BOpnd>;
+class BNEG_H_DESC : MSA_3R_DESC_BASE<"bneg.h", vbneg_h, MSA128HOpnd>;
+class BNEG_W_DESC : MSA_3R_DESC_BASE<"bneg.w", vbneg_w, MSA128WOpnd>;
+class BNEG_D_DESC : MSA_3R_DESC_BASE<"bneg.d", vbneg_d, MSA128DOpnd>;
+
+class BNEGI_B_DESC : MSA_BIT_B_DESC_BASE<"bnegi.b", xor, vsplat_uimm_pow2, MSA128BOpnd>;
+class BNEGI_H_DESC : MSA_BIT_H_DESC_BASE<"bnegi.h", xor, vsplat_uimm_pow2, MSA128HOpnd>;
+class BNEGI_W_DESC : MSA_BIT_W_DESC_BASE<"bnegi.w", xor, vsplat_uimm_pow2, MSA128WOpnd>;
+class BNEGI_D_DESC : MSA_BIT_D_DESC_BASE<"bnegi.d", xor, vsplat_uimm_pow2, MSA128DOpnd>;
+
+class BNZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bnz.b", MSA128BOpnd>;
+class BNZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bnz.h", MSA128HOpnd>;
+class BNZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bnz.w", MSA128WOpnd>;
+class BNZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bnz.d", MSA128DOpnd>;
+
+class BNZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bnz.v", MSA128BOpnd>;
+
+class BSEL_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bsel.v\t$wd, $ws, $wt";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd,
+                        (vselect MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                                                  MSA128BOpnd:$wt))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BSELI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bseli.b\t$wd, $ws, $u8";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wd_in,
+                                                      MSA128BOpnd:$ws,
+                                                      vsplati8_uimm8:$u8))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BSET_B_DESC : MSA_3R_DESC_BASE<"bset.b", vbset_b, MSA128BOpnd>;
+class BSET_H_DESC : MSA_3R_DESC_BASE<"bset.h", vbset_h, MSA128HOpnd>;
+class BSET_W_DESC : MSA_3R_DESC_BASE<"bset.w", vbset_w, MSA128WOpnd>;
+class BSET_D_DESC : MSA_3R_DESC_BASE<"bset.d", vbset_d, MSA128DOpnd>;
+
+class BSETI_B_DESC : MSA_BIT_B_DESC_BASE<"bseti.b", or, vsplat_uimm_pow2,
+                                         MSA128BOpnd>;
+class BSETI_H_DESC : MSA_BIT_H_DESC_BASE<"bseti.h", or, vsplat_uimm_pow2,
+                                         MSA128HOpnd>;
+class BSETI_W_DESC : MSA_BIT_W_DESC_BASE<"bseti.w", or, vsplat_uimm_pow2,
+                                         MSA128WOpnd>;
+class BSETI_D_DESC : MSA_BIT_D_DESC_BASE<"bseti.d", or, vsplat_uimm_pow2,
+                                         MSA128DOpnd>;
+
+class BZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bz.b", MSA128BOpnd>;
+class BZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bz.h", MSA128HOpnd>;
+class BZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bz.w", MSA128WOpnd>;
+class BZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bz.d", MSA128DOpnd>;
+
+class BZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bz.v", MSA128BOpnd>;
+
+class CEQ_B_DESC : MSA_3R_DESC_BASE<"ceq.b", vseteq_v16i8, MSA128BOpnd>,
+                   IsCommutable;
+class CEQ_H_DESC : MSA_3R_DESC_BASE<"ceq.h", vseteq_v8i16, MSA128HOpnd>,
+                   IsCommutable;
+class CEQ_W_DESC : MSA_3R_DESC_BASE<"ceq.w", vseteq_v4i32, MSA128WOpnd>,
+                   IsCommutable;
+class CEQ_D_DESC : MSA_3R_DESC_BASE<"ceq.d", vseteq_v2i64, MSA128DOpnd>,
+                   IsCommutable;
+
+class CEQI_B_DESC : MSA_I5_DESC_BASE<"ceqi.b", vseteq_v16i8, vsplati8_simm5,
+                                     MSA128BOpnd>;
+class CEQI_H_DESC : MSA_I5_DESC_BASE<"ceqi.h", vseteq_v8i16, vsplati16_simm5,
+                                     MSA128HOpnd>;
+class CEQI_W_DESC : MSA_I5_DESC_BASE<"ceqi.w", vseteq_v4i32, vsplati32_simm5,
+                                     MSA128WOpnd>;
+class CEQI_D_DESC : MSA_I5_DESC_BASE<"ceqi.d", vseteq_v2i64, vsplati64_simm5,
+                                     MSA128DOpnd>;
+
+class CFCMSA_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins MSA128CROpnd:$cs);
+  string AsmString = "cfcmsa\t$rd, $cs";
+  InstrItinClass Itinerary = NoItinerary;
+  bit hasSideEffects = 1;
+}
+
+class CLE_S_B_DESC : MSA_3R_DESC_BASE<"cle_s.b", vsetle_v16i8, MSA128BOpnd>;
+class CLE_S_H_DESC : MSA_3R_DESC_BASE<"cle_s.h", vsetle_v8i16, MSA128HOpnd>;
+class CLE_S_W_DESC : MSA_3R_DESC_BASE<"cle_s.w", vsetle_v4i32, MSA128WOpnd>;
+class CLE_S_D_DESC : MSA_3R_DESC_BASE<"cle_s.d", vsetle_v2i64, MSA128DOpnd>;
+
+class CLE_U_B_DESC : MSA_3R_DESC_BASE<"cle_u.b", vsetule_v16i8, MSA128BOpnd>;
+class CLE_U_H_DESC : MSA_3R_DESC_BASE<"cle_u.h", vsetule_v8i16, MSA128HOpnd>;
+class CLE_U_W_DESC : MSA_3R_DESC_BASE<"cle_u.w", vsetule_v4i32, MSA128WOpnd>;
+class CLE_U_D_DESC : MSA_3R_DESC_BASE<"cle_u.d", vsetule_v2i64, MSA128DOpnd>;
+
+class CLEI_S_B_DESC : MSA_I5_DESC_BASE<"clei_s.b", vsetle_v16i8,
+                                       vsplati8_simm5,  MSA128BOpnd>;
+class CLEI_S_H_DESC : MSA_I5_DESC_BASE<"clei_s.h", vsetle_v8i16,
+                                       vsplati16_simm5, MSA128HOpnd>;
+class CLEI_S_W_DESC : MSA_I5_DESC_BASE<"clei_s.w", vsetle_v4i32,
+                                       vsplati32_simm5, MSA128WOpnd>;
+class CLEI_S_D_DESC : MSA_I5_DESC_BASE<"clei_s.d", vsetle_v2i64,
+                                       vsplati64_simm5, MSA128DOpnd>;
+
+class CLEI_U_B_DESC : MSA_I5_DESC_BASE<"clei_u.b", vsetule_v16i8,
+                                       vsplati8_uimm5,  MSA128BOpnd>;
+class CLEI_U_H_DESC : MSA_I5_DESC_BASE<"clei_u.h", vsetule_v8i16,
+                                       vsplati16_uimm5, MSA128HOpnd>;
+class CLEI_U_W_DESC : MSA_I5_DESC_BASE<"clei_u.w", vsetule_v4i32,
+                                       vsplati32_uimm5, MSA128WOpnd>;
+class CLEI_U_D_DESC : MSA_I5_DESC_BASE<"clei_u.d", vsetule_v2i64,
+                                       vsplati64_uimm5, MSA128DOpnd>;
+
+class CLT_S_B_DESC : MSA_3R_DESC_BASE<"clt_s.b", vsetlt_v16i8, MSA128BOpnd>;
+class CLT_S_H_DESC : MSA_3R_DESC_BASE<"clt_s.h", vsetlt_v8i16, MSA128HOpnd>;
+class CLT_S_W_DESC : MSA_3R_DESC_BASE<"clt_s.w", vsetlt_v4i32, MSA128WOpnd>;
+class CLT_S_D_DESC : MSA_3R_DESC_BASE<"clt_s.d", vsetlt_v2i64, MSA128DOpnd>;
+
+class CLT_U_B_DESC : MSA_3R_DESC_BASE<"clt_u.b", vsetult_v16i8, MSA128BOpnd>;
+class CLT_U_H_DESC : MSA_3R_DESC_BASE<"clt_u.h", vsetult_v8i16, MSA128HOpnd>;
+class CLT_U_W_DESC : MSA_3R_DESC_BASE<"clt_u.w", vsetult_v4i32, MSA128WOpnd>;
+class CLT_U_D_DESC : MSA_3R_DESC_BASE<"clt_u.d", vsetult_v2i64, MSA128DOpnd>;
+
+class CLTI_S_B_DESC : MSA_I5_DESC_BASE<"clti_s.b", vsetlt_v16i8,
+                                       vsplati8_simm5, MSA128BOpnd>;
+class CLTI_S_H_DESC : MSA_I5_DESC_BASE<"clti_s.h", vsetlt_v8i16,
+                                       vsplati16_simm5, MSA128HOpnd>;
+class CLTI_S_W_DESC : MSA_I5_DESC_BASE<"clti_s.w", vsetlt_v4i32,
+                                       vsplati32_simm5, MSA128WOpnd>;
+class CLTI_S_D_DESC : MSA_I5_DESC_BASE<"clti_s.d", vsetlt_v2i64,
+                                       vsplati64_simm5, MSA128DOpnd>;
+
+class CLTI_U_B_DESC : MSA_I5_DESC_BASE<"clti_u.b", vsetult_v16i8,
+                                       vsplati8_uimm5, MSA128BOpnd>;
+class CLTI_U_H_DESC : MSA_I5_DESC_BASE<"clti_u.h", vsetult_v8i16,
+                                       vsplati16_uimm5, MSA128HOpnd>;
+class CLTI_U_W_DESC : MSA_I5_DESC_BASE<"clti_u.w", vsetult_v4i32,
+                                       vsplati32_uimm5, MSA128WOpnd>;
+class CLTI_U_D_DESC : MSA_I5_DESC_BASE<"clti_u.d", vsetult_v2i64,
+                                       vsplati64_uimm5, MSA128DOpnd>;
+
+class COPY_S_B_DESC : MSA_COPY_DESC_BASE<"copy_s.b", vextract_sext_i8,  v16i8,
+                                         GPR32Opnd, MSA128BOpnd>;
+class COPY_S_H_DESC : MSA_COPY_DESC_BASE<"copy_s.h", vextract_sext_i16, v8i16,
+                                         GPR32Opnd, MSA128HOpnd>;
+class COPY_S_W_DESC : MSA_COPY_DESC_BASE<"copy_s.w", vextract_sext_i32, v4i32,
+                                         GPR32Opnd, MSA128WOpnd>;
+
+class COPY_U_B_DESC : MSA_COPY_DESC_BASE<"copy_u.b", vextract_zext_i8,  v16i8,
+                                         GPR32Opnd, MSA128BOpnd>;
+class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
+                                         GPR32Opnd, MSA128HOpnd>;
+class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
+                                         GPR32Opnd, MSA128WOpnd>;
+
+class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32,
+                                                 MSA128W>;
+class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v2f64, FGR64,
+                                                 MSA128D>;
+
+class CTCMSA_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins MSA128CROpnd:$cd, GPR32Opnd:$rs);
+  string AsmString = "ctcmsa\t$cd, $rs";
+  InstrItinClass Itinerary = NoItinerary;
+  bit hasSideEffects = 1;
+}
+
+class DIV_S_B_DESC : MSA_3R_DESC_BASE<"div_s.b", sdiv, MSA128BOpnd>;
+class DIV_S_H_DESC : MSA_3R_DESC_BASE<"div_s.h", sdiv, MSA128HOpnd>;
+class DIV_S_W_DESC : MSA_3R_DESC_BASE<"div_s.w", sdiv, MSA128WOpnd>;
+class DIV_S_D_DESC : MSA_3R_DESC_BASE<"div_s.d", sdiv, MSA128DOpnd>;
+
+class DIV_U_B_DESC : MSA_3R_DESC_BASE<"div_u.b", udiv, MSA128BOpnd>;
+class DIV_U_H_DESC : MSA_3R_DESC_BASE<"div_u.h", udiv, MSA128HOpnd>;
+class DIV_U_W_DESC : MSA_3R_DESC_BASE<"div_u.w", udiv, MSA128WOpnd>;
+class DIV_U_D_DESC : MSA_3R_DESC_BASE<"div_u.d", udiv, MSA128DOpnd>;
+
+class DOTP_S_H_DESC : MSA_3R_DESC_BASE<"dotp_s.h", int_mips_dotp_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+                      IsCommutable;
+class DOTP_S_W_DESC : MSA_3R_DESC_BASE<"dotp_s.w", int_mips_dotp_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+                      IsCommutable;
+class DOTP_S_D_DESC : MSA_3R_DESC_BASE<"dotp_s.d", int_mips_dotp_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+                      IsCommutable;
+
+class DOTP_U_H_DESC : MSA_3R_DESC_BASE<"dotp_u.h", int_mips_dotp_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+                      IsCommutable;
+class DOTP_U_W_DESC : MSA_3R_DESC_BASE<"dotp_u.w", int_mips_dotp_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+                      IsCommutable;
+class DOTP_U_D_DESC : MSA_3R_DESC_BASE<"dotp_u.d", int_mips_dotp_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+                      IsCommutable;
+
+class DPADD_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.h", int_mips_dpadd_s_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>, IsCommutable;
+class DPADD_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.w", int_mips_dpadd_s_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>, IsCommutable;
+class DPADD_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.d", int_mips_dpadd_s_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>, IsCommutable;
+
+class DPADD_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.h", int_mips_dpadd_u_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>, IsCommutable;
+class DPADD_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.w", int_mips_dpadd_u_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>, IsCommutable;
+class DPADD_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.d", int_mips_dpadd_u_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>, IsCommutable;
+
+class DPSUB_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.h", int_mips_dpsub_s_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>;
+class DPSUB_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.w", int_mips_dpsub_s_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>;
+class DPSUB_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.d", int_mips_dpsub_s_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>;
+
+class DPSUB_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.h", int_mips_dpsub_u_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>;
+class DPSUB_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.w", int_mips_dpsub_u_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>;
+class DPSUB_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.d", int_mips_dpsub_u_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>;
+
+class FADD_W_DESC : MSA_3RF_DESC_BASE<"fadd.w", fadd, MSA128WOpnd>,
+                    IsCommutable;
+class FADD_D_DESC : MSA_3RF_DESC_BASE<"fadd.d", fadd, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCAF_W_DESC : MSA_3RF_DESC_BASE<"fcaf.w", int_mips_fcaf_w, MSA128WOpnd>,
+                    IsCommutable;
+class FCAF_D_DESC : MSA_3RF_DESC_BASE<"fcaf.d", int_mips_fcaf_d, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCEQ_W_DESC : MSA_3RF_DESC_BASE<"fceq.w", vfsetoeq_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCEQ_D_DESC : MSA_3RF_DESC_BASE<"fceq.d", vfsetoeq_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCLASS_W_DESC : MSA_2RF_DESC_BASE<"fclass.w", int_mips_fclass_w,
+                                        MSA128WOpnd>;
+class FCLASS_D_DESC : MSA_2RF_DESC_BASE<"fclass.d", int_mips_fclass_d,
+                                        MSA128DOpnd>;
+
+class FCLE_W_DESC : MSA_3RF_DESC_BASE<"fcle.w", vfsetole_v4f32, MSA128WOpnd>;
+class FCLE_D_DESC : MSA_3RF_DESC_BASE<"fcle.d", vfsetole_v2f64, MSA128DOpnd>;
+
+class FCLT_W_DESC : MSA_3RF_DESC_BASE<"fclt.w", vfsetolt_v4f32, MSA128WOpnd>;
+class FCLT_D_DESC : MSA_3RF_DESC_BASE<"fclt.d", vfsetolt_v2f64, MSA128DOpnd>;
+
+class FCNE_W_DESC : MSA_3RF_DESC_BASE<"fcne.w", vfsetone_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCNE_D_DESC : MSA_3RF_DESC_BASE<"fcne.d", vfsetone_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCOR_W_DESC : MSA_3RF_DESC_BASE<"fcor.w", vfsetord_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCOR_D_DESC : MSA_3RF_DESC_BASE<"fcor.d", vfsetord_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCUEQ_W_DESC : MSA_3RF_DESC_BASE<"fcueq.w", vfsetueq_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCUEQ_D_DESC : MSA_3RF_DESC_BASE<"fcueq.d", vfsetueq_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCULE_W_DESC : MSA_3RF_DESC_BASE<"fcule.w", vfsetule_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCULE_D_DESC : MSA_3RF_DESC_BASE<"fcule.d", vfsetule_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCULT_W_DESC : MSA_3RF_DESC_BASE<"fcult.w", vfsetult_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCULT_D_DESC : MSA_3RF_DESC_BASE<"fcult.d", vfsetult_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCUN_W_DESC : MSA_3RF_DESC_BASE<"fcun.w", vfsetun_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCUN_D_DESC : MSA_3RF_DESC_BASE<"fcun.d", vfsetun_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCUNE_W_DESC : MSA_3RF_DESC_BASE<"fcune.w", vfsetune_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCUNE_D_DESC : MSA_3RF_DESC_BASE<"fcune.d", vfsetune_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FDIV_W_DESC : MSA_3RF_DESC_BASE<"fdiv.w", fdiv, MSA128WOpnd>;
+class FDIV_D_DESC : MSA_3RF_DESC_BASE<"fdiv.d", fdiv, MSA128DOpnd>;
+
+class FEXDO_H_DESC : MSA_3RF_DESC_BASE<"fexdo.h", int_mips_fexdo_h,
+                                       MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FEXDO_W_DESC : MSA_3RF_DESC_BASE<"fexdo.w", int_mips_fexdo_w,
+                                       MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+// The fexp2.df instruction multiplies the first operand by 2 to the power of
+// the second operand. We therefore need a pseudo-insn in order to invent the
+// 1.0 when we only need to match ISD::FEXP2.
+class FEXP2_W_DESC : MSA_3RF_DESC_BASE<"fexp2.w", mul_fexp2, MSA128WOpnd>;
+class FEXP2_D_DESC : MSA_3RF_DESC_BASE<"fexp2.d", mul_fexp2, MSA128DOpnd>;
+let usesCustomInserter = 1 in {
+  class FEXP2_W_1_PSEUDO_DESC :
+      MipsPseudo<(outs MSA128W:$wd), (ins MSA128W:$ws),
+                 [(set MSA128W:$wd, (fexp2 MSA128W:$ws))]>;
+  class FEXP2_D_1_PSEUDO_DESC :
+      MipsPseudo<(outs MSA128D:$wd), (ins MSA128D:$ws),
+                 [(set MSA128D:$wd, (fexp2 MSA128D:$ws))]>;
+}
+
+class FEXUPL_W_DESC : MSA_2RF_DESC_BASE<"fexupl.w", int_mips_fexupl_w,
+                                        MSA128WOpnd, MSA128HOpnd>;
+class FEXUPL_D_DESC : MSA_2RF_DESC_BASE<"fexupl.d", int_mips_fexupl_d,
+                                        MSA128DOpnd, MSA128WOpnd>;
+
+class FEXUPR_W_DESC : MSA_2RF_DESC_BASE<"fexupr.w", int_mips_fexupr_w,
+                                        MSA128WOpnd, MSA128HOpnd>;
+class FEXUPR_D_DESC : MSA_2RF_DESC_BASE<"fexupr.d", int_mips_fexupr_d,
+                                        MSA128DOpnd, MSA128WOpnd>;
+
+class FFINT_S_W_DESC : MSA_2RF_DESC_BASE<"ffint_s.w", sint_to_fp, MSA128WOpnd>;
+class FFINT_S_D_DESC : MSA_2RF_DESC_BASE<"ffint_s.d", sint_to_fp, MSA128DOpnd>;
+
+class FFINT_U_W_DESC : MSA_2RF_DESC_BASE<"ffint_u.w", uint_to_fp, MSA128WOpnd>;
+class FFINT_U_D_DESC : MSA_2RF_DESC_BASE<"ffint_u.d", uint_to_fp, MSA128DOpnd>;
+
+class FFQL_W_DESC : MSA_2RF_DESC_BASE<"ffql.w", int_mips_ffql_w,
+                                      MSA128WOpnd, MSA128HOpnd>;
+class FFQL_D_DESC : MSA_2RF_DESC_BASE<"ffql.d", int_mips_ffql_d,
+                                      MSA128DOpnd, MSA128WOpnd>;
+
+class FFQR_W_DESC : MSA_2RF_DESC_BASE<"ffqr.w", int_mips_ffqr_w,
+                                      MSA128WOpnd, MSA128HOpnd>;
+class FFQR_D_DESC : MSA_2RF_DESC_BASE<"ffqr.d", int_mips_ffqr_d,
+                                      MSA128DOpnd, MSA128WOpnd>;
+
+class FILL_B_DESC : MSA_2R_FILL_DESC_BASE<"fill.b", v16i8, vsplati8,
+                                          MSA128BOpnd, GPR32Opnd>;
+class FILL_H_DESC : MSA_2R_FILL_DESC_BASE<"fill.h", v8i16, vsplati16,
+                                          MSA128HOpnd, GPR32Opnd>;
+class FILL_W_DESC : MSA_2R_FILL_DESC_BASE<"fill.w", v4i32, vsplati32,
+                                          MSA128WOpnd, GPR32Opnd>;
+
+class FILL_FW_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v4f32, vsplatf32, MSA128W,
+                                                    FGR32>;
+class FILL_FD_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v2f64, vsplatf64, MSA128D,
+                                                    FGR64>;
+
+class FLOG2_W_DESC : MSA_2RF_DESC_BASE<"flog2.w", flog2, MSA128WOpnd>;
+class FLOG2_D_DESC : MSA_2RF_DESC_BASE<"flog2.d", flog2, MSA128DOpnd>;
+
+class FMADD_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.w", fma, MSA128WOpnd>;
+class FMADD_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.d", fma, MSA128DOpnd>;
+
+class FMAX_W_DESC : MSA_3RF_DESC_BASE<"fmax.w", int_mips_fmax_w, MSA128WOpnd>;
+class FMAX_D_DESC : MSA_3RF_DESC_BASE<"fmax.d", int_mips_fmax_d, MSA128DOpnd>;
+
+class FMAX_A_W_DESC : MSA_3RF_DESC_BASE<"fmax_a.w", int_mips_fmax_a_w,
+                                        MSA128WOpnd>;
+class FMAX_A_D_DESC : MSA_3RF_DESC_BASE<"fmax_a.d", int_mips_fmax_a_d,
+                                        MSA128DOpnd>;
+
+class FMIN_W_DESC : MSA_3RF_DESC_BASE<"fmin.w", int_mips_fmin_w, MSA128WOpnd>;
+class FMIN_D_DESC : MSA_3RF_DESC_BASE<"fmin.d", int_mips_fmin_d, MSA128DOpnd>;
+
+class FMIN_A_W_DESC : MSA_3RF_DESC_BASE<"fmin_a.w", int_mips_fmin_a_w,
+                                        MSA128WOpnd>;
+class FMIN_A_D_DESC : MSA_3RF_DESC_BASE<"fmin_a.d", int_mips_fmin_a_d,
+                                        MSA128DOpnd>;
+
+class FMSUB_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.w", fms, MSA128WOpnd>;
+class FMSUB_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.d", fms, MSA128DOpnd>;
+
+class FMUL_W_DESC : MSA_3RF_DESC_BASE<"fmul.w", fmul, MSA128WOpnd>;
+class FMUL_D_DESC : MSA_3RF_DESC_BASE<"fmul.d", fmul, MSA128DOpnd>;
+
+class FRINT_W_DESC : MSA_2RF_DESC_BASE<"frint.w", frint, MSA128WOpnd>;
+class FRINT_D_DESC : MSA_2RF_DESC_BASE<"frint.d", frint, MSA128DOpnd>;
+
+class FRCP_W_DESC : MSA_2RF_DESC_BASE<"frcp.w", int_mips_frcp_w, MSA128WOpnd>;
+class FRCP_D_DESC : MSA_2RF_DESC_BASE<"frcp.d", int_mips_frcp_d, MSA128DOpnd>;
+
+class FRSQRT_W_DESC : MSA_2RF_DESC_BASE<"frsqrt.w", int_mips_frsqrt_w,
+                                        MSA128WOpnd>;
+class FRSQRT_D_DESC : MSA_2RF_DESC_BASE<"frsqrt.d", int_mips_frsqrt_d,
+                                        MSA128DOpnd>;
+
+class FSAF_W_DESC : MSA_3RF_DESC_BASE<"fsaf.w", int_mips_fsaf_w, MSA128WOpnd>;
+class FSAF_D_DESC : MSA_3RF_DESC_BASE<"fsaf.d", int_mips_fsaf_d, MSA128DOpnd>;
+
+class FSEQ_W_DESC : MSA_3RF_DESC_BASE<"fseq.w", int_mips_fseq_w, MSA128WOpnd>;
+class FSEQ_D_DESC : MSA_3RF_DESC_BASE<"fseq.d", int_mips_fseq_d, MSA128DOpnd>;
+
+class FSLE_W_DESC : MSA_3RF_DESC_BASE<"fsle.w", int_mips_fsle_w, MSA128WOpnd>;
+class FSLE_D_DESC : MSA_3RF_DESC_BASE<"fsle.d", int_mips_fsle_d, MSA128DOpnd>;
+
+class FSLT_W_DESC : MSA_3RF_DESC_BASE<"fslt.w", int_mips_fslt_w, MSA128WOpnd>;
+class FSLT_D_DESC : MSA_3RF_DESC_BASE<"fslt.d", int_mips_fslt_d, MSA128DOpnd>;
+
+class FSNE_W_DESC : MSA_3RF_DESC_BASE<"fsne.w", int_mips_fsne_w, MSA128WOpnd>;
+class FSNE_D_DESC : MSA_3RF_DESC_BASE<"fsne.d", int_mips_fsne_d, MSA128DOpnd>;
+
+class FSOR_W_DESC : MSA_3RF_DESC_BASE<"fsor.w", int_mips_fsor_w, MSA128WOpnd>;
+class FSOR_D_DESC : MSA_3RF_DESC_BASE<"fsor.d", int_mips_fsor_d, MSA128DOpnd>;
+
+class FSQRT_W_DESC : MSA_2RF_DESC_BASE<"fsqrt.w", fsqrt, MSA128WOpnd>;
+class FSQRT_D_DESC : MSA_2RF_DESC_BASE<"fsqrt.d", fsqrt, MSA128DOpnd>;
+
+class FSUB_W_DESC : MSA_3RF_DESC_BASE<"fsub.w", fsub, MSA128WOpnd>;
+class FSUB_D_DESC : MSA_3RF_DESC_BASE<"fsub.d", fsub, MSA128DOpnd>;
+
+class FSUEQ_W_DESC : MSA_3RF_DESC_BASE<"fsueq.w", int_mips_fsueq_w,
+                                       MSA128WOpnd>;
+class FSUEQ_D_DESC : MSA_3RF_DESC_BASE<"fsueq.d", int_mips_fsueq_d,
+                                       MSA128DOpnd>;
+
+class FSULE_W_DESC : MSA_3RF_DESC_BASE<"fsule.w", int_mips_fsule_w,
+                                       MSA128WOpnd>;
+class FSULE_D_DESC : MSA_3RF_DESC_BASE<"fsule.d", int_mips_fsule_d,
+                                       MSA128DOpnd>;
+
+class FSULT_W_DESC : MSA_3RF_DESC_BASE<"fsult.w", int_mips_fsult_w,
+                                       MSA128WOpnd>;
+class FSULT_D_DESC : MSA_3RF_DESC_BASE<"fsult.d", int_mips_fsult_d,
+                                       MSA128DOpnd>;
+
+class FSUN_W_DESC : MSA_3RF_DESC_BASE<"fsun.w", int_mips_fsun_w,
+                                      MSA128WOpnd>;
+class FSUN_D_DESC : MSA_3RF_DESC_BASE<"fsun.d", int_mips_fsun_d,
+                                      MSA128DOpnd>;
+
+class FSUNE_W_DESC : MSA_3RF_DESC_BASE<"fsune.w", int_mips_fsune_w,
+                                       MSA128WOpnd>;
+class FSUNE_D_DESC : MSA_3RF_DESC_BASE<"fsune.d", int_mips_fsune_d,
+                                       MSA128DOpnd>;
+
+class FTINT_S_W_DESC : MSA_2RF_DESC_BASE<"ftint_s.w", int_mips_ftint_s_w,
+                                         MSA128WOpnd>;
+class FTINT_S_D_DESC : MSA_2RF_DESC_BASE<"ftint_s.d", int_mips_ftint_s_d,
+                                         MSA128DOpnd>;
+
+class FTINT_U_W_DESC : MSA_2RF_DESC_BASE<"ftint_u.w", int_mips_ftint_u_w,
+                                         MSA128WOpnd>;
+class FTINT_U_D_DESC : MSA_2RF_DESC_BASE<"ftint_u.d", int_mips_ftint_u_d,
+                                         MSA128DOpnd>;
+
+class FTQ_H_DESC : MSA_3RF_DESC_BASE<"ftq.h", int_mips_ftq_h,
+                                     MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FTQ_W_DESC : MSA_3RF_DESC_BASE<"ftq.w", int_mips_ftq_w,
+                                     MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+class FTRUNC_S_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.w", fp_to_sint,
+                                          MSA128WOpnd>;
+class FTRUNC_S_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.d", fp_to_sint,
+                                          MSA128DOpnd>;
+
+class FTRUNC_U_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.w", fp_to_uint,
+                                          MSA128WOpnd>;
+class FTRUNC_U_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.d", fp_to_uint,
+                                          MSA128DOpnd>;
+
+class HADD_S_H_DESC : MSA_3R_DESC_BASE<"hadd_s.h", int_mips_hadd_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_S_W_DESC : MSA_3R_DESC_BASE<"hadd_s.w", int_mips_hadd_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_S_D_DESC : MSA_3R_DESC_BASE<"hadd_s.d", int_mips_hadd_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HADD_U_H_DESC : MSA_3R_DESC_BASE<"hadd_u.h", int_mips_hadd_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_U_W_DESC : MSA_3R_DESC_BASE<"hadd_u.w", int_mips_hadd_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_U_D_DESC : MSA_3R_DESC_BASE<"hadd_u.d", int_mips_hadd_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_S_H_DESC : MSA_3R_DESC_BASE<"hsub_s.h", int_mips_hsub_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_S_W_DESC : MSA_3R_DESC_BASE<"hsub_s.w", int_mips_hsub_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_S_D_DESC : MSA_3R_DESC_BASE<"hsub_s.d", int_mips_hsub_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_U_H_DESC : MSA_3R_DESC_BASE<"hsub_u.h", int_mips_hsub_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_U_W_DESC : MSA_3R_DESC_BASE<"hsub_u.w", int_mips_hsub_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_U_D_DESC : MSA_3R_DESC_BASE<"hsub_u.d", int_mips_hsub_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class ILVEV_B_DESC : MSA_3R_DESC_BASE<"ilvev.b", MipsILVEV, MSA128BOpnd>;
+class ILVEV_H_DESC : MSA_3R_DESC_BASE<"ilvev.h", MipsILVEV, MSA128HOpnd>;
+class ILVEV_W_DESC : MSA_3R_DESC_BASE<"ilvev.w", MipsILVEV, MSA128WOpnd>;
+class ILVEV_D_DESC : MSA_3R_DESC_BASE<"ilvev.d", MipsILVEV, MSA128DOpnd>;
+
+class ILVL_B_DESC : MSA_3R_DESC_BASE<"ilvl.b", MipsILVL, MSA128BOpnd>;
+class ILVL_H_DESC : MSA_3R_DESC_BASE<"ilvl.h", MipsILVL, MSA128HOpnd>;
+class ILVL_W_DESC : MSA_3R_DESC_BASE<"ilvl.w", MipsILVL, MSA128WOpnd>;
+class ILVL_D_DESC : MSA_3R_DESC_BASE<"ilvl.d", MipsILVL, MSA128DOpnd>;
+
+class ILVOD_B_DESC : MSA_3R_DESC_BASE<"ilvod.b", MipsILVOD, MSA128BOpnd>;
+class ILVOD_H_DESC : MSA_3R_DESC_BASE<"ilvod.h", MipsILVOD, MSA128HOpnd>;
+class ILVOD_W_DESC : MSA_3R_DESC_BASE<"ilvod.w", MipsILVOD, MSA128WOpnd>;
+class ILVOD_D_DESC : MSA_3R_DESC_BASE<"ilvod.d", MipsILVOD, MSA128DOpnd>;
+
+class ILVR_B_DESC : MSA_3R_DESC_BASE<"ilvr.b", MipsILVR, MSA128BOpnd>;
+class ILVR_H_DESC : MSA_3R_DESC_BASE<"ilvr.h", MipsILVR, MSA128HOpnd>;
+class ILVR_W_DESC : MSA_3R_DESC_BASE<"ilvr.w", MipsILVR, MSA128WOpnd>;
+class ILVR_D_DESC : MSA_3R_DESC_BASE<"ilvr.d", MipsILVR, MSA128DOpnd>;
+
+class INSERT_B_DESC : MSA_INSERT_DESC_BASE<"insert.b", vinsert_v16i8,
+                                           MSA128BOpnd, GPR32Opnd>;
+class INSERT_H_DESC : MSA_INSERT_DESC_BASE<"insert.h", vinsert_v8i16,
+                                           MSA128HOpnd, GPR32Opnd>;
+class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32,
+                                           MSA128WOpnd, GPR32Opnd>;
+
+class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v4f32,
+                                                     MSA128WOpnd, FGR32Opnd>;
+class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v2f64,
+                                                     MSA128DOpnd, FGR64Opnd>;
+
+class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", int_mips_insve_b,
+                                         MSA128BOpnd>;
+class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", int_mips_insve_h,
+                                         MSA128HOpnd>;
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", int_mips_insve_w,
+                                         MSA128WOpnd>;
+class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", int_mips_insve_d,
+                                         MSA128DOpnd>;
+
+class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   ValueType TyNode, RegisterOperand ROWD,
+                   Operand MemOpnd = mem, ComplexPattern Addr = addrRegImm,
+                   InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+  list<dag> Pattern = [(set ROWD:$wd, (TyNode (OpNode Addr:$addr)))];
+  InstrItinClass Itinerary = itin;
+  string DecoderMethod = "DecodeMSA128Mem";
+}
+
+class LD_B_DESC : LD_DESC_BASE<"ld.b", load, v16i8, MSA128BOpnd>;
+class LD_H_DESC : LD_DESC_BASE<"ld.h", load, v8i16, MSA128HOpnd>;
+class LD_W_DESC : LD_DESC_BASE<"ld.w", load, v4i32, MSA128WOpnd>;
+class LD_D_DESC : LD_DESC_BASE<"ld.d", load, v2i64, MSA128DOpnd>;
+
+class LDI_B_DESC : MSA_I10_LDI_DESC_BASE<"ldi.b", MSA128BOpnd>;
+class LDI_H_DESC : MSA_I10_LDI_DESC_BASE<"ldi.h", MSA128HOpnd>;
+class LDI_W_DESC : MSA_I10_LDI_DESC_BASE<"ldi.w", MSA128WOpnd>;
+class LDI_D_DESC : MSA_I10_LDI_DESC_BASE<"ldi.d", MSA128DOpnd>;
+
+class LSA_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, LSAImm:$sa);
+  string AsmString = "lsa\t$rd, $rs, $rt, $sa";
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (add GPR32Opnd:$rs,
+                                                (shl GPR32Opnd:$rt,
+                                                     immZExt2Lsa:$sa)))];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class MADD_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.h", int_mips_madd_q_h,
+                                            MSA128HOpnd>;
+class MADD_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.w", int_mips_madd_q_w,
+                                            MSA128WOpnd>;
+
+class MADDR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.h", int_mips_maddr_q_h,
+                                             MSA128HOpnd>;
+class MADDR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.w", int_mips_maddr_q_w,
+                                             MSA128WOpnd>;
+
+class MADDV_B_DESC : MSA_3R_4R_DESC_BASE<"maddv.b", muladd, MSA128BOpnd>;
+class MADDV_H_DESC : MSA_3R_4R_DESC_BASE<"maddv.h", muladd, MSA128HOpnd>;
+class MADDV_W_DESC : MSA_3R_4R_DESC_BASE<"maddv.w", muladd, MSA128WOpnd>;
+class MADDV_D_DESC : MSA_3R_4R_DESC_BASE<"maddv.d", muladd, MSA128DOpnd>;
+
+class MAX_A_B_DESC : MSA_3R_DESC_BASE<"max_a.b", int_mips_max_a_b, MSA128BOpnd>;
+class MAX_A_H_DESC : MSA_3R_DESC_BASE<"max_a.h", int_mips_max_a_h, MSA128HOpnd>;
+class MAX_A_W_DESC : MSA_3R_DESC_BASE<"max_a.w", int_mips_max_a_w, MSA128WOpnd>;
+class MAX_A_D_DESC : MSA_3R_DESC_BASE<"max_a.d", int_mips_max_a_d, MSA128DOpnd>;
+
+class MAX_S_B_DESC : MSA_3R_DESC_BASE<"max_s.b", MipsVSMax, MSA128BOpnd>;
+class MAX_S_H_DESC : MSA_3R_DESC_BASE<"max_s.h", MipsVSMax, MSA128HOpnd>;
+class MAX_S_W_DESC : MSA_3R_DESC_BASE<"max_s.w", MipsVSMax, MSA128WOpnd>;
+class MAX_S_D_DESC : MSA_3R_DESC_BASE<"max_s.d", MipsVSMax, MSA128DOpnd>;
+
+class MAX_U_B_DESC : MSA_3R_DESC_BASE<"max_u.b", MipsVUMax, MSA128BOpnd>;
+class MAX_U_H_DESC : MSA_3R_DESC_BASE<"max_u.h", MipsVUMax, MSA128HOpnd>;
+class MAX_U_W_DESC : MSA_3R_DESC_BASE<"max_u.w", MipsVUMax, MSA128WOpnd>;
+class MAX_U_D_DESC : MSA_3R_DESC_BASE<"max_u.d", MipsVUMax, MSA128DOpnd>;
+
+class MAXI_S_B_DESC : MSA_I5_DESC_BASE<"maxi_s.b", MipsVSMax, vsplati8_simm5,
+                                       MSA128BOpnd>;
+class MAXI_S_H_DESC : MSA_I5_DESC_BASE<"maxi_s.h", MipsVSMax, vsplati16_simm5,
+                                       MSA128HOpnd>;
+class MAXI_S_W_DESC : MSA_I5_DESC_BASE<"maxi_s.w", MipsVSMax, vsplati32_simm5,
+                                       MSA128WOpnd>;
+class MAXI_S_D_DESC : MSA_I5_DESC_BASE<"maxi_s.d", MipsVSMax, vsplati64_simm5,
+                                       MSA128DOpnd>;
+
+class MAXI_U_B_DESC : MSA_I5_DESC_BASE<"maxi_u.b", MipsVUMax, vsplati8_uimm5,
+                                       MSA128BOpnd>;
+class MAXI_U_H_DESC : MSA_I5_DESC_BASE<"maxi_u.h", MipsVUMax, vsplati16_uimm5,
+                                       MSA128HOpnd>;
+class MAXI_U_W_DESC : MSA_I5_DESC_BASE<"maxi_u.w", MipsVUMax, vsplati32_uimm5,
+                                       MSA128WOpnd>;
+class MAXI_U_D_DESC : MSA_I5_DESC_BASE<"maxi_u.d", MipsVUMax, vsplati64_uimm5,
+                                       MSA128DOpnd>;
+
+class MIN_A_B_DESC : MSA_3R_DESC_BASE<"min_a.b", int_mips_min_a_b, MSA128BOpnd>;
+class MIN_A_H_DESC : MSA_3R_DESC_BASE<"min_a.h", int_mips_min_a_h, MSA128HOpnd>;
+class MIN_A_W_DESC : MSA_3R_DESC_BASE<"min_a.w", int_mips_min_a_w, MSA128WOpnd>;
+class MIN_A_D_DESC : MSA_3R_DESC_BASE<"min_a.d", int_mips_min_a_d, MSA128DOpnd>;
+
+class MIN_S_B_DESC : MSA_3R_DESC_BASE<"min_s.b", MipsVSMin, MSA128BOpnd>;
+class MIN_S_H_DESC : MSA_3R_DESC_BASE<"min_s.h", MipsVSMin, MSA128HOpnd>;
+class MIN_S_W_DESC : MSA_3R_DESC_BASE<"min_s.w", MipsVSMin, MSA128WOpnd>;
+class MIN_S_D_DESC : MSA_3R_DESC_BASE<"min_s.d", MipsVSMin, MSA128DOpnd>;
+
+class MIN_U_B_DESC : MSA_3R_DESC_BASE<"min_u.b", MipsVUMin, MSA128BOpnd>;
+class MIN_U_H_DESC : MSA_3R_DESC_BASE<"min_u.h", MipsVUMin, MSA128HOpnd>;
+class MIN_U_W_DESC : MSA_3R_DESC_BASE<"min_u.w", MipsVUMin, MSA128WOpnd>;
+class MIN_U_D_DESC : MSA_3R_DESC_BASE<"min_u.d", MipsVUMin, MSA128DOpnd>;
+
+class MINI_S_B_DESC : MSA_I5_DESC_BASE<"mini_s.b", MipsVSMin, vsplati8_simm5,
+                                       MSA128BOpnd>;
+class MINI_S_H_DESC : MSA_I5_DESC_BASE<"mini_s.h", MipsVSMin, vsplati16_simm5,
+                                       MSA128HOpnd>;
+class MINI_S_W_DESC : MSA_I5_DESC_BASE<"mini_s.w", MipsVSMin, vsplati32_simm5,
+                                       MSA128WOpnd>;
+class MINI_S_D_DESC : MSA_I5_DESC_BASE<"mini_s.d", MipsVSMin, vsplati64_simm5,
+                                       MSA128DOpnd>;
+
+class MINI_U_B_DESC : MSA_I5_DESC_BASE<"mini_u.b", MipsVUMin, vsplati8_uimm5,
+                                       MSA128BOpnd>;
+class MINI_U_H_DESC : MSA_I5_DESC_BASE<"mini_u.h", MipsVUMin, vsplati16_uimm5,
+                                       MSA128HOpnd>;
+class MINI_U_W_DESC : MSA_I5_DESC_BASE<"mini_u.w", MipsVUMin, vsplati32_uimm5,
+                                       MSA128WOpnd>;
+class MINI_U_D_DESC : MSA_I5_DESC_BASE<"mini_u.d", MipsVUMin, vsplati64_uimm5,
+                                       MSA128DOpnd>;
+
+class MOD_S_B_DESC : MSA_3R_DESC_BASE<"mod_s.b", srem, MSA128BOpnd>;
+class MOD_S_H_DESC : MSA_3R_DESC_BASE<"mod_s.h", srem, MSA128HOpnd>;
+class MOD_S_W_DESC : MSA_3R_DESC_BASE<"mod_s.w", srem, MSA128WOpnd>;
+class MOD_S_D_DESC : MSA_3R_DESC_BASE<"mod_s.d", srem, MSA128DOpnd>;
+
+class MOD_U_B_DESC : MSA_3R_DESC_BASE<"mod_u.b", urem, MSA128BOpnd>;
+class MOD_U_H_DESC : MSA_3R_DESC_BASE<"mod_u.h", urem, MSA128HOpnd>;
+class MOD_U_W_DESC : MSA_3R_DESC_BASE<"mod_u.w", urem, MSA128WOpnd>;
+class MOD_U_D_DESC : MSA_3R_DESC_BASE<"mod_u.d", urem, MSA128DOpnd>;
+
+class MOVE_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$ws);
+  string AsmString = "move.v\t$wd, $ws";
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class MSUB_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.h", int_mips_msub_q_h,
+                                            MSA128HOpnd>;
+class MSUB_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.w", int_mips_msub_q_w,
+                                            MSA128WOpnd>;
+
+class MSUBR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.h", int_mips_msubr_q_h,
+                                             MSA128HOpnd>;
+class MSUBR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.w", int_mips_msubr_q_w,
+                                             MSA128WOpnd>;
+
+class MSUBV_B_DESC : MSA_3R_4R_DESC_BASE<"msubv.b", mulsub, MSA128BOpnd>;
+class MSUBV_H_DESC : MSA_3R_4R_DESC_BASE<"msubv.h", mulsub, MSA128HOpnd>;
+class MSUBV_W_DESC : MSA_3R_4R_DESC_BASE<"msubv.w", mulsub, MSA128WOpnd>;
+class MSUBV_D_DESC : MSA_3R_4R_DESC_BASE<"msubv.d", mulsub, MSA128DOpnd>;
+
+class MUL_Q_H_DESC : MSA_3RF_DESC_BASE<"mul_q.h", int_mips_mul_q_h,
+                                       MSA128HOpnd>;
+class MUL_Q_W_DESC : MSA_3RF_DESC_BASE<"mul_q.w", int_mips_mul_q_w,
+                                       MSA128WOpnd>;
+
+class MULR_Q_H_DESC : MSA_3RF_DESC_BASE<"mulr_q.h", int_mips_mulr_q_h,
+                                        MSA128HOpnd>;
+class MULR_Q_W_DESC : MSA_3RF_DESC_BASE<"mulr_q.w", int_mips_mulr_q_w,
+                                        MSA128WOpnd>;
+
+class MULV_B_DESC : MSA_3R_DESC_BASE<"mulv.b", mul, MSA128BOpnd>;
+class MULV_H_DESC : MSA_3R_DESC_BASE<"mulv.h", mul, MSA128HOpnd>;
+class MULV_W_DESC : MSA_3R_DESC_BASE<"mulv.w", mul, MSA128WOpnd>;
+class MULV_D_DESC : MSA_3R_DESC_BASE<"mulv.d", mul, MSA128DOpnd>;
+
+class NLOC_B_DESC : MSA_2R_DESC_BASE<"nloc.b", int_mips_nloc_b, MSA128BOpnd>;
+class NLOC_H_DESC : MSA_2R_DESC_BASE<"nloc.h", int_mips_nloc_h, MSA128HOpnd>;
+class NLOC_W_DESC : MSA_2R_DESC_BASE<"nloc.w", int_mips_nloc_w, MSA128WOpnd>;
+class NLOC_D_DESC : MSA_2R_DESC_BASE<"nloc.d", int_mips_nloc_d, MSA128DOpnd>;
+
+class NLZC_B_DESC : MSA_2R_DESC_BASE<"nlzc.b", ctlz, MSA128BOpnd>;
+class NLZC_H_DESC : MSA_2R_DESC_BASE<"nlzc.h", ctlz, MSA128HOpnd>;
+class NLZC_W_DESC : MSA_2R_DESC_BASE<"nlzc.w", ctlz, MSA128WOpnd>;
+class NLZC_D_DESC : MSA_2R_DESC_BASE<"nlzc.d", ctlz, MSA128DOpnd>;
+
+class NOR_V_DESC : MSA_VEC_DESC_BASE<"nor.v", MipsVNOR, MSA128BOpnd>;
+class NOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128HOpnd>;
+class NOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128WOpnd>;
+class NOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128DOpnd>;
+
+class NORI_B_DESC : MSA_I8_DESC_BASE<"nori.b", MipsVNOR, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+class OR_V_DESC : MSA_VEC_DESC_BASE<"or.v", or, MSA128BOpnd>;
+class OR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128HOpnd>;
+class OR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128WOpnd>;
+class OR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128DOpnd>;
+
+class ORI_B_DESC : MSA_I8_DESC_BASE<"ori.b", or, vsplati8_uimm8, MSA128BOpnd>;
+
+class PCKEV_B_DESC : MSA_3R_DESC_BASE<"pckev.b", MipsPCKEV, MSA128BOpnd>;
+class PCKEV_H_DESC : MSA_3R_DESC_BASE<"pckev.h", MipsPCKEV, MSA128HOpnd>;
+class PCKEV_W_DESC : MSA_3R_DESC_BASE<"pckev.w", MipsPCKEV, MSA128WOpnd>;
+class PCKEV_D_DESC : MSA_3R_DESC_BASE<"pckev.d", MipsPCKEV, MSA128DOpnd>;
+
+class PCKOD_B_DESC : MSA_3R_DESC_BASE<"pckod.b", MipsPCKOD, MSA128BOpnd>;
+class PCKOD_H_DESC : MSA_3R_DESC_BASE<"pckod.h", MipsPCKOD, MSA128HOpnd>;
+class PCKOD_W_DESC : MSA_3R_DESC_BASE<"pckod.w", MipsPCKOD, MSA128WOpnd>;
+class PCKOD_D_DESC : MSA_3R_DESC_BASE<"pckod.d", MipsPCKOD, MSA128DOpnd>;
+
+class PCNT_B_DESC : MSA_2R_DESC_BASE<"pcnt.b", ctpop, MSA128BOpnd>;
+class PCNT_H_DESC : MSA_2R_DESC_BASE<"pcnt.h", ctpop, MSA128HOpnd>;
+class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>;
+class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>;
+
+class SAT_S_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b,
+                                           MSA128BOpnd>;
+class SAT_S_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h,
+                                           MSA128HOpnd>;
+class SAT_S_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w,
+                                           MSA128WOpnd>;
+class SAT_S_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d,
+                                           MSA128DOpnd>;
+
+class SAT_U_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b,
+                                           MSA128BOpnd>;
+class SAT_U_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h,
+                                           MSA128HOpnd>;
+class SAT_U_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w,
+                                           MSA128WOpnd>;
+class SAT_U_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d,
+                                           MSA128DOpnd>;
+
+class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>;
+class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>;
+class SHF_W_DESC : MSA_I8_SHF_DESC_BASE<"shf.w", MSA128WOpnd>;
+
+class SLD_B_DESC : MSA_3R_SLD_DESC_BASE<"sld.b", int_mips_sld_b, MSA128BOpnd>;
+class SLD_H_DESC : MSA_3R_SLD_DESC_BASE<"sld.h", int_mips_sld_h, MSA128HOpnd>;
+class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>;
+class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>;
+
+class SLDI_B_DESC : MSA_ELM_DESC_BASE<"sldi.b", int_mips_sldi_b, MSA128BOpnd>;
+class SLDI_H_DESC : MSA_ELM_DESC_BASE<"sldi.h", int_mips_sldi_h, MSA128HOpnd>;
+class SLDI_W_DESC : MSA_ELM_DESC_BASE<"sldi.w", int_mips_sldi_w, MSA128WOpnd>;
+class SLDI_D_DESC : MSA_ELM_DESC_BASE<"sldi.d", int_mips_sldi_d, MSA128DOpnd>;
+
+class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>;
+class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>;
+class SLL_W_DESC : MSA_3R_DESC_BASE<"sll.w", shl, MSA128WOpnd>;
+class SLL_D_DESC : MSA_3R_DESC_BASE<"sll.d", shl, MSA128DOpnd>;
+
+class SLLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.b", shl, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SLLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.h", shl, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SLLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.w", shl, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SLLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.d", shl, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SPLAT_B_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.b", vsplati8_elt,
+                                            MSA128BOpnd>;
+class SPLAT_H_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.h", vsplati16_elt,
+                                            MSA128HOpnd>;
+class SPLAT_W_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.w", vsplati32_elt,
+                                            MSA128WOpnd>;
+class SPLAT_D_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.d", vsplati64_elt,
+                                            MSA128DOpnd>;
+
+class SPLATI_B_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.b", vsplati8_uimm4,
+                                              MSA128BOpnd>;
+class SPLATI_H_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.h", vsplati16_uimm3,
+                                              MSA128HOpnd>;
+class SPLATI_W_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.w", vsplati32_uimm2,
+                                              MSA128WOpnd>;
+class SPLATI_D_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.d", vsplati64_uimm1,
+                                              MSA128DOpnd>;
+
+class SRA_B_DESC : MSA_3R_DESC_BASE<"sra.b", sra, MSA128BOpnd>;
+class SRA_H_DESC : MSA_3R_DESC_BASE<"sra.h", sra, MSA128HOpnd>;
+class SRA_W_DESC : MSA_3R_DESC_BASE<"sra.w", sra, MSA128WOpnd>;
+class SRA_D_DESC : MSA_3R_DESC_BASE<"sra.d", sra, MSA128DOpnd>;
+
+class SRAI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.b", sra, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SRAI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.h", sra, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SRAI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.w", sra, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SRAI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.d", sra, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SRAR_B_DESC : MSA_3R_DESC_BASE<"srar.b", int_mips_srar_b, MSA128BOpnd>;
+class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>;
+class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>;
+class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>;
+
+class SRARI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srari.b", int_mips_srari_b,
+                                           MSA128BOpnd>;
+class SRARI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srari.h", int_mips_srari_h,
+                                           MSA128HOpnd>;
+class SRARI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srari.w", int_mips_srari_w,
+                                           MSA128WOpnd>;
+class SRARI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srari.d", int_mips_srari_d,
+                                           MSA128DOpnd>;
+
+class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>;
+class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>;
+class SRL_W_DESC : MSA_3R_DESC_BASE<"srl.w", srl, MSA128WOpnd>;
+class SRL_D_DESC : MSA_3R_DESC_BASE<"srl.d", srl, MSA128DOpnd>;
+
+class SRLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.b", srl, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SRLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.h", srl, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SRLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.w", srl, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SRLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.d", srl, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SRLR_B_DESC : MSA_3R_DESC_BASE<"srlr.b", int_mips_srlr_b, MSA128BOpnd>;
+class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>;
+class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>;
+class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>;
+
+class SRLRI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srlri.b", int_mips_srlri_b,
+                                           MSA128BOpnd>;
+class SRLRI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srlri.h", int_mips_srlri_h,
+                                           MSA128HOpnd>;
+class SRLRI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srlri.w", int_mips_srlri_w,
+                                           MSA128WOpnd>;
+class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d,
+                                           MSA128DOpnd>;
+
+class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   ValueType TyNode, RegisterOperand ROWD,
+                   Operand MemOpnd = mem, ComplexPattern Addr = addrRegImm,
+                   InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins ROWD:$wd, MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+  list<dag> Pattern = [(OpNode (TyNode ROWD:$wd), Addr:$addr)];
+  InstrItinClass Itinerary = itin;
+  string DecoderMethod = "DecodeMSA128Mem";
+}
+
+class ST_B_DESC : ST_DESC_BASE<"st.b", store, v16i8, MSA128BOpnd>;
+class ST_H_DESC : ST_DESC_BASE<"st.h", store, v8i16, MSA128HOpnd>;
+class ST_W_DESC : ST_DESC_BASE<"st.w", store, v4i32, MSA128WOpnd>;
+class ST_D_DESC : ST_DESC_BASE<"st.d", store, v2i64, MSA128DOpnd>;
+
+class SUBS_S_B_DESC : MSA_3R_DESC_BASE<"subs_s.b", int_mips_subs_s_b,
+                                       MSA128BOpnd>;
+class SUBS_S_H_DESC : MSA_3R_DESC_BASE<"subs_s.h", int_mips_subs_s_h,
+                                       MSA128HOpnd>;
+class SUBS_S_W_DESC : MSA_3R_DESC_BASE<"subs_s.w", int_mips_subs_s_w,
+                                       MSA128WOpnd>;
+class SUBS_S_D_DESC : MSA_3R_DESC_BASE<"subs_s.d", int_mips_subs_s_d,
+                                       MSA128DOpnd>;
+
+class SUBS_U_B_DESC : MSA_3R_DESC_BASE<"subs_u.b", int_mips_subs_u_b,
+                                       MSA128BOpnd>;
+class SUBS_U_H_DESC : MSA_3R_DESC_BASE<"subs_u.h", int_mips_subs_u_h,
+                                       MSA128HOpnd>;
+class SUBS_U_W_DESC : MSA_3R_DESC_BASE<"subs_u.w", int_mips_subs_u_w,
+                                       MSA128WOpnd>;
+class SUBS_U_D_DESC : MSA_3R_DESC_BASE<"subs_u.d", int_mips_subs_u_d,
+                                       MSA128DOpnd>;
+
+class SUBSUS_U_B_DESC : MSA_3R_DESC_BASE<"subsus_u.b", int_mips_subsus_u_b,
+                                         MSA128BOpnd>;
+class SUBSUS_U_H_DESC : MSA_3R_DESC_BASE<"subsus_u.h", int_mips_subsus_u_h,
+                                         MSA128HOpnd>;
+class SUBSUS_U_W_DESC : MSA_3R_DESC_BASE<"subsus_u.w", int_mips_subsus_u_w,
+                                         MSA128WOpnd>;
+class SUBSUS_U_D_DESC : MSA_3R_DESC_BASE<"subsus_u.d", int_mips_subsus_u_d,
+                                         MSA128DOpnd>;
+
+class SUBSUU_S_B_DESC : MSA_3R_DESC_BASE<"subsuu_s.b", int_mips_subsuu_s_b,
+                                         MSA128BOpnd>;
+class SUBSUU_S_H_DESC : MSA_3R_DESC_BASE<"subsuu_s.h", int_mips_subsuu_s_h,
+                                         MSA128HOpnd>;
+class SUBSUU_S_W_DESC : MSA_3R_DESC_BASE<"subsuu_s.w", int_mips_subsuu_s_w,
+                                         MSA128WOpnd>;
+class SUBSUU_S_D_DESC : MSA_3R_DESC_BASE<"subsuu_s.d", int_mips_subsuu_s_d,
+                                         MSA128DOpnd>;
+
+class SUBV_B_DESC : MSA_3R_DESC_BASE<"subv.b", sub, MSA128BOpnd>;
+class SUBV_H_DESC : MSA_3R_DESC_BASE<"subv.h", sub, MSA128HOpnd>;
+class SUBV_W_DESC : MSA_3R_DESC_BASE<"subv.w", sub, MSA128WOpnd>;
+class SUBV_D_DESC : MSA_3R_DESC_BASE<"subv.d", sub, MSA128DOpnd>;
+
+class SUBVI_B_DESC : MSA_I5_DESC_BASE<"subvi.b", sub, vsplati8_uimm5,
+                                      MSA128BOpnd>;
+class SUBVI_H_DESC : MSA_I5_DESC_BASE<"subvi.h", sub, vsplati16_uimm5,
+                                      MSA128HOpnd>;
+class SUBVI_W_DESC : MSA_I5_DESC_BASE<"subvi.w", sub, vsplati32_uimm5,
+                                      MSA128WOpnd>;
+class SUBVI_D_DESC : MSA_I5_DESC_BASE<"subvi.d", sub, vsplati64_uimm5,
+                                      MSA128DOpnd>;
+
+class VSHF_B_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.b", MSA128BOpnd>;
+class VSHF_H_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.h", MSA128HOpnd>;
+class VSHF_W_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.w", MSA128WOpnd>;
+class VSHF_D_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.d", MSA128DOpnd>;
+
+class XOR_V_DESC : MSA_VEC_DESC_BASE<"xor.v", xor, MSA128BOpnd>;
+class XOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128HOpnd>;
+class XOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128WOpnd>;
+class XOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128DOpnd>;
+
+class XORI_B_DESC : MSA_I8_DESC_BASE<"xori.b", xor, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+// Instruction defs.
+def ADD_A_B : ADD_A_B_ENC, ADD_A_B_DESC;
+def ADD_A_H : ADD_A_H_ENC, ADD_A_H_DESC;
+def ADD_A_W : ADD_A_W_ENC, ADD_A_W_DESC;
+def ADD_A_D : ADD_A_D_ENC, ADD_A_D_DESC;
+
+def ADDS_A_B : ADDS_A_B_ENC, ADDS_A_B_DESC;
+def ADDS_A_H : ADDS_A_H_ENC, ADDS_A_H_DESC;
+def ADDS_A_W : ADDS_A_W_ENC, ADDS_A_W_DESC;
+def ADDS_A_D : ADDS_A_D_ENC, ADDS_A_D_DESC;
+
+def ADDS_S_B : ADDS_S_B_ENC, ADDS_S_B_DESC;
+def ADDS_S_H : ADDS_S_H_ENC, ADDS_S_H_DESC;
+def ADDS_S_W : ADDS_S_W_ENC, ADDS_S_W_DESC;
+def ADDS_S_D : ADDS_S_D_ENC, ADDS_S_D_DESC;
+
+def ADDS_U_B : ADDS_U_B_ENC, ADDS_U_B_DESC;
+def ADDS_U_H : ADDS_U_H_ENC, ADDS_U_H_DESC;
+def ADDS_U_W : ADDS_U_W_ENC, ADDS_U_W_DESC;
+def ADDS_U_D : ADDS_U_D_ENC, ADDS_U_D_DESC;
+
+def ADDV_B : ADDV_B_ENC, ADDV_B_DESC;
+def ADDV_H : ADDV_H_ENC, ADDV_H_DESC;
+def ADDV_W : ADDV_W_ENC, ADDV_W_DESC;
+def ADDV_D : ADDV_D_ENC, ADDV_D_DESC;
+
+def ADDVI_B : ADDVI_B_ENC, ADDVI_B_DESC;
+def ADDVI_H : ADDVI_H_ENC, ADDVI_H_DESC;
+def ADDVI_W : ADDVI_W_ENC, ADDVI_W_DESC;
+def ADDVI_D : ADDVI_D_ENC, ADDVI_D_DESC;
+
+def AND_V : AND_V_ENC, AND_V_DESC;
+def AND_V_H_PSEUDO : AND_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def AND_V_W_PSEUDO : AND_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def AND_V_D_PSEUDO : AND_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def ANDI_B : ANDI_B_ENC, ANDI_B_DESC;
+
+def ASUB_S_B : ASUB_S_B_ENC, ASUB_S_B_DESC;
+def ASUB_S_H : ASUB_S_H_ENC, ASUB_S_H_DESC;
+def ASUB_S_W : ASUB_S_W_ENC, ASUB_S_W_DESC;
+def ASUB_S_D : ASUB_S_D_ENC, ASUB_S_D_DESC;
+
+def ASUB_U_B : ASUB_U_B_ENC, ASUB_U_B_DESC;
+def ASUB_U_H : ASUB_U_H_ENC, ASUB_U_H_DESC;
+def ASUB_U_W : ASUB_U_W_ENC, ASUB_U_W_DESC;
+def ASUB_U_D : ASUB_U_D_ENC, ASUB_U_D_DESC;
+
+def AVE_S_B : AVE_S_B_ENC, AVE_S_B_DESC;
+def AVE_S_H : AVE_S_H_ENC, AVE_S_H_DESC;
+def AVE_S_W : AVE_S_W_ENC, AVE_S_W_DESC;
+def AVE_S_D : AVE_S_D_ENC, AVE_S_D_DESC;
+
+def AVE_U_B : AVE_U_B_ENC, AVE_U_B_DESC;
+def AVE_U_H : AVE_U_H_ENC, AVE_U_H_DESC;
+def AVE_U_W : AVE_U_W_ENC, AVE_U_W_DESC;
+def AVE_U_D : AVE_U_D_ENC, AVE_U_D_DESC;
+
+def AVER_S_B : AVER_S_B_ENC, AVER_S_B_DESC;
+def AVER_S_H : AVER_S_H_ENC, AVER_S_H_DESC;
+def AVER_S_W : AVER_S_W_ENC, AVER_S_W_DESC;
+def AVER_S_D : AVER_S_D_ENC, AVER_S_D_DESC;
+
+def AVER_U_B : AVER_U_B_ENC, AVER_U_B_DESC;
+def AVER_U_H : AVER_U_H_ENC, AVER_U_H_DESC;
+def AVER_U_W : AVER_U_W_ENC, AVER_U_W_DESC;
+def AVER_U_D : AVER_U_D_ENC, AVER_U_D_DESC;
+
+def BCLR_B : BCLR_B_ENC, BCLR_B_DESC;
+def BCLR_H : BCLR_H_ENC, BCLR_H_DESC;
+def BCLR_W : BCLR_W_ENC, BCLR_W_DESC;
+def BCLR_D : BCLR_D_ENC, BCLR_D_DESC;
+
+def BCLRI_B : BCLRI_B_ENC, BCLRI_B_DESC;
+def BCLRI_H : BCLRI_H_ENC, BCLRI_H_DESC;
+def BCLRI_W : BCLRI_W_ENC, BCLRI_W_DESC;
+def BCLRI_D : BCLRI_D_ENC, BCLRI_D_DESC;
+
+def BINSL_B : BINSL_B_ENC, BINSL_B_DESC;
+def BINSL_H : BINSL_H_ENC, BINSL_H_DESC;
+def BINSL_W : BINSL_W_ENC, BINSL_W_DESC;
+def BINSL_D : BINSL_D_ENC, BINSL_D_DESC;
+
+def BINSLI_B : BINSLI_B_ENC, BINSLI_B_DESC;
+def BINSLI_H : BINSLI_H_ENC, BINSLI_H_DESC;
+def BINSLI_W : BINSLI_W_ENC, BINSLI_W_DESC;
+def BINSLI_D : BINSLI_D_ENC, BINSLI_D_DESC;
+
+def BINSR_B : BINSR_B_ENC, BINSR_B_DESC;
+def BINSR_H : BINSR_H_ENC, BINSR_H_DESC;
+def BINSR_W : BINSR_W_ENC, BINSR_W_DESC;
+def BINSR_D : BINSR_D_ENC, BINSR_D_DESC;
+
+def BINSRI_B : BINSRI_B_ENC, BINSRI_B_DESC;
+def BINSRI_H : BINSRI_H_ENC, BINSRI_H_DESC;
+def BINSRI_W : BINSRI_W_ENC, BINSRI_W_DESC;
+def BINSRI_D : BINSRI_D_ENC, BINSRI_D_DESC;
+
+def BMNZ_V : BMNZ_V_ENC, BMNZ_V_DESC;
+
+def BMNZI_B : BMNZI_B_ENC, BMNZI_B_DESC;
+
+def BMZ_V : BMZ_V_ENC, BMZ_V_DESC;
+
+def BMZI_B : BMZI_B_ENC, BMZI_B_DESC;
+
+def BNEG_B : BNEG_B_ENC, BNEG_B_DESC;
+def BNEG_H : BNEG_H_ENC, BNEG_H_DESC;
+def BNEG_W : BNEG_W_ENC, BNEG_W_DESC;
+def BNEG_D : BNEG_D_ENC, BNEG_D_DESC;
+
+def BNEGI_B : BNEGI_B_ENC, BNEGI_B_DESC;
+def BNEGI_H : BNEGI_H_ENC, BNEGI_H_DESC;
+def BNEGI_W : BNEGI_W_ENC, BNEGI_W_DESC;
+def BNEGI_D : BNEGI_D_ENC, BNEGI_D_DESC;
+
+def BNZ_B : BNZ_B_ENC, BNZ_B_DESC;
+def BNZ_H : BNZ_H_ENC, BNZ_H_DESC;
+def BNZ_W : BNZ_W_ENC, BNZ_W_DESC;
+def BNZ_D : BNZ_D_ENC, BNZ_D_DESC;
+
+def BNZ_V : BNZ_V_ENC, BNZ_V_DESC;
+
+def BSEL_V : BSEL_V_ENC, BSEL_V_DESC;
+
+class MSA_BSEL_PSEUDO_BASE<RegisterOperand RO, ValueType Ty> :
+  MipsPseudo<(outs RO:$wd), (ins RO:$wd_in, RO:$ws, RO:$wt),
+             [(set RO:$wd, (Ty (vselect RO:$wd_in, RO:$ws, RO:$wt)))]>,
+  PseudoInstExpansion<(BSEL_V MSA128BOpnd:$wd, MSA128BOpnd:$wd_in,
+                              MSA128BOpnd:$ws, MSA128BOpnd:$wt)> {
+  let Constraints = "$wd_in = $wd";
+}
+
+def BSEL_H_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128HOpnd, v8i16>;
+def BSEL_W_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4i32>;
+def BSEL_D_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2i64>;
+def BSEL_FW_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4f32>;
+def BSEL_FD_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2f64>;
+
+def BSELI_B : BSELI_B_ENC, BSELI_B_DESC;
+
+def BSET_B : BSET_B_ENC, BSET_B_DESC;
+def BSET_H : BSET_H_ENC, BSET_H_DESC;
+def BSET_W : BSET_W_ENC, BSET_W_DESC;
+def BSET_D : BSET_D_ENC, BSET_D_DESC;
+
+def BSETI_B : BSETI_B_ENC, BSETI_B_DESC;
+def BSETI_H : BSETI_H_ENC, BSETI_H_DESC;
+def BSETI_W : BSETI_W_ENC, BSETI_W_DESC;
+def BSETI_D : BSETI_D_ENC, BSETI_D_DESC;
+
+def BZ_B : BZ_B_ENC, BZ_B_DESC;
+def BZ_H : BZ_H_ENC, BZ_H_DESC;
+def BZ_W : BZ_W_ENC, BZ_W_DESC;
+def BZ_D : BZ_D_ENC, BZ_D_DESC;
+
+def BZ_V : BZ_V_ENC, BZ_V_DESC;
+
+def CEQ_B : CEQ_B_ENC, CEQ_B_DESC;
+def CEQ_H : CEQ_H_ENC, CEQ_H_DESC;
+def CEQ_W : CEQ_W_ENC, CEQ_W_DESC;
+def CEQ_D : CEQ_D_ENC, CEQ_D_DESC;
+
+def CEQI_B : CEQI_B_ENC, CEQI_B_DESC;
+def CEQI_H : CEQI_H_ENC, CEQI_H_DESC;
+def CEQI_W : CEQI_W_ENC, CEQI_W_DESC;
+def CEQI_D : CEQI_D_ENC, CEQI_D_DESC;
+
+def CFCMSA : CFCMSA_ENC, CFCMSA_DESC;
+
+def CLE_S_B : CLE_S_B_ENC, CLE_S_B_DESC;
+def CLE_S_H : CLE_S_H_ENC, CLE_S_H_DESC;
+def CLE_S_W : CLE_S_W_ENC, CLE_S_W_DESC;
+def CLE_S_D : CLE_S_D_ENC, CLE_S_D_DESC;
+
+def CLE_U_B : CLE_U_B_ENC, CLE_U_B_DESC;
+def CLE_U_H : CLE_U_H_ENC, CLE_U_H_DESC;
+def CLE_U_W : CLE_U_W_ENC, CLE_U_W_DESC;
+def CLE_U_D : CLE_U_D_ENC, CLE_U_D_DESC;
+
+def CLEI_S_B : CLEI_S_B_ENC, CLEI_S_B_DESC;
+def CLEI_S_H : CLEI_S_H_ENC, CLEI_S_H_DESC;
+def CLEI_S_W : CLEI_S_W_ENC, CLEI_S_W_DESC;
+def CLEI_S_D : CLEI_S_D_ENC, CLEI_S_D_DESC;
+
+def CLEI_U_B : CLEI_U_B_ENC, CLEI_U_B_DESC;
+def CLEI_U_H : CLEI_U_H_ENC, CLEI_U_H_DESC;
+def CLEI_U_W : CLEI_U_W_ENC, CLEI_U_W_DESC;
+def CLEI_U_D : CLEI_U_D_ENC, CLEI_U_D_DESC;
+
+def CLT_S_B : CLT_S_B_ENC, CLT_S_B_DESC;
+def CLT_S_H : CLT_S_H_ENC, CLT_S_H_DESC;
+def CLT_S_W : CLT_S_W_ENC, CLT_S_W_DESC;
+def CLT_S_D : CLT_S_D_ENC, CLT_S_D_DESC;
+
+def CLT_U_B : CLT_U_B_ENC, CLT_U_B_DESC;
+def CLT_U_H : CLT_U_H_ENC, CLT_U_H_DESC;
+def CLT_U_W : CLT_U_W_ENC, CLT_U_W_DESC;
+def CLT_U_D : CLT_U_D_ENC, CLT_U_D_DESC;
+
+def CLTI_S_B : CLTI_S_B_ENC, CLTI_S_B_DESC;
+def CLTI_S_H : CLTI_S_H_ENC, CLTI_S_H_DESC;
+def CLTI_S_W : CLTI_S_W_ENC, CLTI_S_W_DESC;
+def CLTI_S_D : CLTI_S_D_ENC, CLTI_S_D_DESC;
+
+def CLTI_U_B : CLTI_U_B_ENC, CLTI_U_B_DESC;
+def CLTI_U_H : CLTI_U_H_ENC, CLTI_U_H_DESC;
+def CLTI_U_W : CLTI_U_W_ENC, CLTI_U_W_DESC;
+def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC;
+
+def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC;
+def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC;
+def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC;
+
+def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC;
+def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC;
+def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC;
+
+def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC;
+def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC;
+
+def CTCMSA : CTCMSA_ENC, CTCMSA_DESC;
+
+def DIV_S_B : DIV_S_B_ENC, DIV_S_B_DESC;
+def DIV_S_H : DIV_S_H_ENC, DIV_S_H_DESC;
+def DIV_S_W : DIV_S_W_ENC, DIV_S_W_DESC;
+def DIV_S_D : DIV_S_D_ENC, DIV_S_D_DESC;
+
+def DIV_U_B : DIV_U_B_ENC, DIV_U_B_DESC;
+def DIV_U_H : DIV_U_H_ENC, DIV_U_H_DESC;
+def DIV_U_W : DIV_U_W_ENC, DIV_U_W_DESC;
+def DIV_U_D : DIV_U_D_ENC, DIV_U_D_DESC;
+
+def DOTP_S_H : DOTP_S_H_ENC, DOTP_S_H_DESC;
+def DOTP_S_W : DOTP_S_W_ENC, DOTP_S_W_DESC;
+def DOTP_S_D : DOTP_S_D_ENC, DOTP_S_D_DESC;
+
+def DOTP_U_H : DOTP_U_H_ENC, DOTP_U_H_DESC;
+def DOTP_U_W : DOTP_U_W_ENC, DOTP_U_W_DESC;
+def DOTP_U_D : DOTP_U_D_ENC, DOTP_U_D_DESC;
+
+def DPADD_S_H : DPADD_S_H_ENC, DPADD_S_H_DESC;
+def DPADD_S_W : DPADD_S_W_ENC, DPADD_S_W_DESC;
+def DPADD_S_D : DPADD_S_D_ENC, DPADD_S_D_DESC;
+
+def DPADD_U_H : DPADD_U_H_ENC, DPADD_U_H_DESC;
+def DPADD_U_W : DPADD_U_W_ENC, DPADD_U_W_DESC;
+def DPADD_U_D : DPADD_U_D_ENC, DPADD_U_D_DESC;
+
+def DPSUB_S_H : DPSUB_S_H_ENC, DPSUB_S_H_DESC;
+def DPSUB_S_W : DPSUB_S_W_ENC, DPSUB_S_W_DESC;
+def DPSUB_S_D : DPSUB_S_D_ENC, DPSUB_S_D_DESC;
+
+def DPSUB_U_H : DPSUB_U_H_ENC, DPSUB_U_H_DESC;
+def DPSUB_U_W : DPSUB_U_W_ENC, DPSUB_U_W_DESC;
+def DPSUB_U_D : DPSUB_U_D_ENC, DPSUB_U_D_DESC;
+
+def FADD_W : FADD_W_ENC, FADD_W_DESC;
+def FADD_D : FADD_D_ENC, FADD_D_DESC;
+
+def FCAF_W : FCAF_W_ENC, FCAF_W_DESC;
+def FCAF_D : FCAF_D_ENC, FCAF_D_DESC;
+
+def FCEQ_W : FCEQ_W_ENC, FCEQ_W_DESC;
+def FCEQ_D : FCEQ_D_ENC, FCEQ_D_DESC;
+
+def FCLE_W : FCLE_W_ENC, FCLE_W_DESC;
+def FCLE_D : FCLE_D_ENC, FCLE_D_DESC;
+
+def FCLT_W : FCLT_W_ENC, FCLT_W_DESC;
+def FCLT_D : FCLT_D_ENC, FCLT_D_DESC;
+
+def FCLASS_W : FCLASS_W_ENC, FCLASS_W_DESC;
+def FCLASS_D : FCLASS_D_ENC, FCLASS_D_DESC;
+
+def FCNE_W : FCNE_W_ENC, FCNE_W_DESC;
+def FCNE_D : FCNE_D_ENC, FCNE_D_DESC;
+
+def FCOR_W : FCOR_W_ENC, FCOR_W_DESC;
+def FCOR_D : FCOR_D_ENC, FCOR_D_DESC;
+
+def FCUEQ_W : FCUEQ_W_ENC, FCUEQ_W_DESC;
+def FCUEQ_D : FCUEQ_D_ENC, FCUEQ_D_DESC;
+
+def FCULE_W : FCULE_W_ENC, FCULE_W_DESC;
+def FCULE_D : FCULE_D_ENC, FCULE_D_DESC;
+
+def FCULT_W : FCULT_W_ENC, FCULT_W_DESC;
+def FCULT_D : FCULT_D_ENC, FCULT_D_DESC;
+
+def FCUN_W : FCUN_W_ENC, FCUN_W_DESC;
+def FCUN_D : FCUN_D_ENC, FCUN_D_DESC;
+
+def FCUNE_W : FCUNE_W_ENC, FCUNE_W_DESC;
+def FCUNE_D : FCUNE_D_ENC, FCUNE_D_DESC;
+
+def FDIV_W : FDIV_W_ENC, FDIV_W_DESC;
+def FDIV_D : FDIV_D_ENC, FDIV_D_DESC;
+
+def FEXDO_H : FEXDO_H_ENC, FEXDO_H_DESC;
+def FEXDO_W : FEXDO_W_ENC, FEXDO_W_DESC;
+
+def FEXP2_W : FEXP2_W_ENC, FEXP2_W_DESC;
+def FEXP2_D : FEXP2_D_ENC, FEXP2_D_DESC;
+def FEXP2_W_1_PSEUDO : FEXP2_W_1_PSEUDO_DESC;
+def FEXP2_D_1_PSEUDO : FEXP2_D_1_PSEUDO_DESC;
+
+def FEXUPL_W : FEXUPL_W_ENC, FEXUPL_W_DESC;
+def FEXUPL_D : FEXUPL_D_ENC, FEXUPL_D_DESC;
+
+def FEXUPR_W : FEXUPR_W_ENC, FEXUPR_W_DESC;
+def FEXUPR_D : FEXUPR_D_ENC, FEXUPR_D_DESC;
+
+def FFINT_S_W : FFINT_S_W_ENC, FFINT_S_W_DESC;
+def FFINT_S_D : FFINT_S_D_ENC, FFINT_S_D_DESC;
+
+def FFINT_U_W : FFINT_U_W_ENC, FFINT_U_W_DESC;
+def FFINT_U_D : FFINT_U_D_ENC, FFINT_U_D_DESC;
+
+def FFQL_W : FFQL_W_ENC, FFQL_W_DESC;
+def FFQL_D : FFQL_D_ENC, FFQL_D_DESC;
+
+def FFQR_W : FFQR_W_ENC, FFQR_W_DESC;
+def FFQR_D : FFQR_D_ENC, FFQR_D_DESC;
+
+def FILL_B : FILL_B_ENC, FILL_B_DESC;
+def FILL_H : FILL_H_ENC, FILL_H_DESC;
+def FILL_W : FILL_W_ENC, FILL_W_DESC;
+def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC;
+def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC;
+
+def FLOG2_W : FLOG2_W_ENC, FLOG2_W_DESC;
+def FLOG2_D : FLOG2_D_ENC, FLOG2_D_DESC;
+
+def FMADD_W : FMADD_W_ENC, FMADD_W_DESC;
+def FMADD_D : FMADD_D_ENC, FMADD_D_DESC;
+
+def FMAX_W : FMAX_W_ENC, FMAX_W_DESC;
+def FMAX_D : FMAX_D_ENC, FMAX_D_DESC;
+
+def FMAX_A_W : FMAX_A_W_ENC, FMAX_A_W_DESC;
+def FMAX_A_D : FMAX_A_D_ENC, FMAX_A_D_DESC;
+
+def FMIN_W : FMIN_W_ENC, FMIN_W_DESC;
+def FMIN_D : FMIN_D_ENC, FMIN_D_DESC;
+
+def FMIN_A_W : FMIN_A_W_ENC, FMIN_A_W_DESC;
+def FMIN_A_D : FMIN_A_D_ENC, FMIN_A_D_DESC;
+
+def FMSUB_W : FMSUB_W_ENC, FMSUB_W_DESC;
+def FMSUB_D : FMSUB_D_ENC, FMSUB_D_DESC;
+
+def FMUL_W : FMUL_W_ENC, FMUL_W_DESC;
+def FMUL_D : FMUL_D_ENC, FMUL_D_DESC;
+
+def FRINT_W : FRINT_W_ENC, FRINT_W_DESC;
+def FRINT_D : FRINT_D_ENC, FRINT_D_DESC;
+
+def FRCP_W : FRCP_W_ENC, FRCP_W_DESC;
+def FRCP_D : FRCP_D_ENC, FRCP_D_DESC;
+
+def FRSQRT_W : FRSQRT_W_ENC, FRSQRT_W_DESC;
+def FRSQRT_D : FRSQRT_D_ENC, FRSQRT_D_DESC;
+
+def FSAF_W : FSAF_W_ENC, FSAF_W_DESC;
+def FSAF_D : FSAF_D_ENC, FSAF_D_DESC;
+
+def FSEQ_W : FSEQ_W_ENC, FSEQ_W_DESC;
+def FSEQ_D : FSEQ_D_ENC, FSEQ_D_DESC;
+
+def FSLE_W : FSLE_W_ENC, FSLE_W_DESC;
+def FSLE_D : FSLE_D_ENC, FSLE_D_DESC;
+
+def FSLT_W : FSLT_W_ENC, FSLT_W_DESC;
+def FSLT_D : FSLT_D_ENC, FSLT_D_DESC;
+
+def FSNE_W : FSNE_W_ENC, FSNE_W_DESC;
+def FSNE_D : FSNE_D_ENC, FSNE_D_DESC;
+
+def FSOR_W : FSOR_W_ENC, FSOR_W_DESC;
+def FSOR_D : FSOR_D_ENC, FSOR_D_DESC;
+
+def FSQRT_W : FSQRT_W_ENC, FSQRT_W_DESC;
+def FSQRT_D : FSQRT_D_ENC, FSQRT_D_DESC;
+
+def FSUB_W : FSUB_W_ENC, FSUB_W_DESC;
+def FSUB_D : FSUB_D_ENC, FSUB_D_DESC;
+
+def FSUEQ_W : FSUEQ_W_ENC, FSUEQ_W_DESC;
+def FSUEQ_D : FSUEQ_D_ENC, FSUEQ_D_DESC;
+
+def FSULE_W : FSULE_W_ENC, FSULE_W_DESC;
+def FSULE_D : FSULE_D_ENC, FSULE_D_DESC;
+
+def FSULT_W : FSULT_W_ENC, FSULT_W_DESC;
+def FSULT_D : FSULT_D_ENC, FSULT_D_DESC;
+
+def FSUN_W : FSUN_W_ENC, FSUN_W_DESC;
+def FSUN_D : FSUN_D_ENC, FSUN_D_DESC;
+
+def FSUNE_W : FSUNE_W_ENC, FSUNE_W_DESC;
+def FSUNE_D : FSUNE_D_ENC, FSUNE_D_DESC;
+
+def FTINT_S_W : FTINT_S_W_ENC, FTINT_S_W_DESC;
+def FTINT_S_D : FTINT_S_D_ENC, FTINT_S_D_DESC;
+
+def FTINT_U_W : FTINT_U_W_ENC, FTINT_U_W_DESC;
+def FTINT_U_D : FTINT_U_D_ENC, FTINT_U_D_DESC;
+
+def FTQ_H : FTQ_H_ENC, FTQ_H_DESC;
+def FTQ_W : FTQ_W_ENC, FTQ_W_DESC;
+
+def FTRUNC_S_W : FTRUNC_S_W_ENC, FTRUNC_S_W_DESC;
+def FTRUNC_S_D : FTRUNC_S_D_ENC, FTRUNC_S_D_DESC;
+
+def FTRUNC_U_W : FTRUNC_U_W_ENC, FTRUNC_U_W_DESC;
+def FTRUNC_U_D : FTRUNC_U_D_ENC, FTRUNC_U_D_DESC;
+
+def HADD_S_H : HADD_S_H_ENC, HADD_S_H_DESC;
+def HADD_S_W : HADD_S_W_ENC, HADD_S_W_DESC;
+def HADD_S_D : HADD_S_D_ENC, HADD_S_D_DESC;
+
+def HADD_U_H : HADD_U_H_ENC, HADD_U_H_DESC;
+def HADD_U_W : HADD_U_W_ENC, HADD_U_W_DESC;
+def HADD_U_D : HADD_U_D_ENC, HADD_U_D_DESC;
+
+def HSUB_S_H : HSUB_S_H_ENC, HSUB_S_H_DESC;
+def HSUB_S_W : HSUB_S_W_ENC, HSUB_S_W_DESC;
+def HSUB_S_D : HSUB_S_D_ENC, HSUB_S_D_DESC;
+
+def HSUB_U_H : HSUB_U_H_ENC, HSUB_U_H_DESC;
+def HSUB_U_W : HSUB_U_W_ENC, HSUB_U_W_DESC;
+def HSUB_U_D : HSUB_U_D_ENC, HSUB_U_D_DESC;
+
+def ILVEV_B : ILVEV_B_ENC, ILVEV_B_DESC;
+def ILVEV_H : ILVEV_H_ENC, ILVEV_H_DESC;
+def ILVEV_W : ILVEV_W_ENC, ILVEV_W_DESC;
+def ILVEV_D : ILVEV_D_ENC, ILVEV_D_DESC;
+
+def ILVL_B : ILVL_B_ENC, ILVL_B_DESC;
+def ILVL_H : ILVL_H_ENC, ILVL_H_DESC;
+def ILVL_W : ILVL_W_ENC, ILVL_W_DESC;
+def ILVL_D : ILVL_D_ENC, ILVL_D_DESC;
+
+def ILVOD_B : ILVOD_B_ENC, ILVOD_B_DESC;
+def ILVOD_H : ILVOD_H_ENC, ILVOD_H_DESC;
+def ILVOD_W : ILVOD_W_ENC, ILVOD_W_DESC;
+def ILVOD_D : ILVOD_D_ENC, ILVOD_D_DESC;
+
+def ILVR_B : ILVR_B_ENC, ILVR_B_DESC;
+def ILVR_H : ILVR_H_ENC, ILVR_H_DESC;
+def ILVR_W : ILVR_W_ENC, ILVR_W_DESC;
+def ILVR_D : ILVR_D_ENC, ILVR_D_DESC;
+
+def INSERT_B : INSERT_B_ENC, INSERT_B_DESC;
+def INSERT_H : INSERT_H_ENC, INSERT_H_DESC;
+def INSERT_W : INSERT_W_ENC, INSERT_W_DESC;
+
+// INSERT_FW_PSEUDO defined after INSVE_W
+// INSERT_FD_PSEUDO defined after INSVE_D
+
+def INSVE_B : INSVE_B_ENC, INSVE_B_DESC;
+def INSVE_H : INSVE_H_ENC, INSVE_H_DESC;
+def INSVE_W : INSVE_W_ENC, INSVE_W_DESC;
+def INSVE_D : INSVE_D_ENC, INSVE_D_DESC;
+
+def INSERT_FW_PSEUDO : INSERT_FW_PSEUDO_DESC;
+def INSERT_FD_PSEUDO : INSERT_FD_PSEUDO_DESC;
+
+def LD_B: LD_B_ENC, LD_B_DESC;
+def LD_H: LD_H_ENC, LD_H_DESC;
+def LD_W: LD_W_ENC, LD_W_DESC;
+def LD_D: LD_D_ENC, LD_D_DESC;
+
+def LDI_B : LDI_B_ENC, LDI_B_DESC;
+def LDI_H : LDI_H_ENC, LDI_H_DESC;
+def LDI_W : LDI_W_ENC, LDI_W_DESC;
+def LDI_D : LDI_D_ENC, LDI_D_DESC;
+
+def LSA : LSA_ENC, LSA_DESC;
+
+def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC;
+def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC;
+
+def MADDR_Q_H : MADDR_Q_H_ENC, MADDR_Q_H_DESC;
+def MADDR_Q_W : MADDR_Q_W_ENC, MADDR_Q_W_DESC;
+
+def MADDV_B : MADDV_B_ENC, MADDV_B_DESC;
+def MADDV_H : MADDV_H_ENC, MADDV_H_DESC;
+def MADDV_W : MADDV_W_ENC, MADDV_W_DESC;
+def MADDV_D : MADDV_D_ENC, MADDV_D_DESC;
+
+def MAX_A_B : MAX_A_B_ENC, MAX_A_B_DESC;
+def MAX_A_H : MAX_A_H_ENC, MAX_A_H_DESC;
+def MAX_A_W : MAX_A_W_ENC, MAX_A_W_DESC;
+def MAX_A_D : MAX_A_D_ENC, MAX_A_D_DESC;
+
+def MAX_S_B : MAX_S_B_ENC, MAX_S_B_DESC;
+def MAX_S_H : MAX_S_H_ENC, MAX_S_H_DESC;
+def MAX_S_W : MAX_S_W_ENC, MAX_S_W_DESC;
+def MAX_S_D : MAX_S_D_ENC, MAX_S_D_DESC;
+
+def MAX_U_B : MAX_U_B_ENC, MAX_U_B_DESC;
+def MAX_U_H : MAX_U_H_ENC, MAX_U_H_DESC;
+def MAX_U_W : MAX_U_W_ENC, MAX_U_W_DESC;
+def MAX_U_D : MAX_U_D_ENC, MAX_U_D_DESC;
+
+def MAXI_S_B : MAXI_S_B_ENC, MAXI_S_B_DESC;
+def MAXI_S_H : MAXI_S_H_ENC, MAXI_S_H_DESC;
+def MAXI_S_W : MAXI_S_W_ENC, MAXI_S_W_DESC;
+def MAXI_S_D : MAXI_S_D_ENC, MAXI_S_D_DESC;
+
+def MAXI_U_B : MAXI_U_B_ENC, MAXI_U_B_DESC;
+def MAXI_U_H : MAXI_U_H_ENC, MAXI_U_H_DESC;
+def MAXI_U_W : MAXI_U_W_ENC, MAXI_U_W_DESC;
+def MAXI_U_D : MAXI_U_D_ENC, MAXI_U_D_DESC;
+
+def MIN_A_B : MIN_A_B_ENC, MIN_A_B_DESC;
+def MIN_A_H : MIN_A_H_ENC, MIN_A_H_DESC;
+def MIN_A_W : MIN_A_W_ENC, MIN_A_W_DESC;
+def MIN_A_D : MIN_A_D_ENC, MIN_A_D_DESC;
+
+def MIN_S_B : MIN_S_B_ENC, MIN_S_B_DESC;
+def MIN_S_H : MIN_S_H_ENC, MIN_S_H_DESC;
+def MIN_S_W : MIN_S_W_ENC, MIN_S_W_DESC;
+def MIN_S_D : MIN_S_D_ENC, MIN_S_D_DESC;
+
+def MIN_U_B : MIN_U_B_ENC, MIN_U_B_DESC;
+def MIN_U_H : MIN_U_H_ENC, MIN_U_H_DESC;
+def MIN_U_W : MIN_U_W_ENC, MIN_U_W_DESC;
+def MIN_U_D : MIN_U_D_ENC, MIN_U_D_DESC;
+
+def MINI_S_B : MINI_S_B_ENC, MINI_S_B_DESC;
+def MINI_S_H : MINI_S_H_ENC, MINI_S_H_DESC;
+def MINI_S_W : MINI_S_W_ENC, MINI_S_W_DESC;
+def MINI_S_D : MINI_S_D_ENC, MINI_S_D_DESC;
+
+def MINI_U_B : MINI_U_B_ENC, MINI_U_B_DESC;
+def MINI_U_H : MINI_U_H_ENC, MINI_U_H_DESC;
+def MINI_U_W : MINI_U_W_ENC, MINI_U_W_DESC;
+def MINI_U_D : MINI_U_D_ENC, MINI_U_D_DESC;
+
+def MOD_S_B : MOD_S_B_ENC, MOD_S_B_DESC;
+def MOD_S_H : MOD_S_H_ENC, MOD_S_H_DESC;
+def MOD_S_W : MOD_S_W_ENC, MOD_S_W_DESC;
+def MOD_S_D : MOD_S_D_ENC, MOD_S_D_DESC;
+
+def MOD_U_B : MOD_U_B_ENC, MOD_U_B_DESC;
+def MOD_U_H : MOD_U_H_ENC, MOD_U_H_DESC;
+def MOD_U_W : MOD_U_W_ENC, MOD_U_W_DESC;
+def MOD_U_D : MOD_U_D_ENC, MOD_U_D_DESC;
+
+def MOVE_V : MOVE_V_ENC, MOVE_V_DESC;
+
+def MSUB_Q_H : MSUB_Q_H_ENC, MSUB_Q_H_DESC;
+def MSUB_Q_W : MSUB_Q_W_ENC, MSUB_Q_W_DESC;
+
+def MSUBR_Q_H : MSUBR_Q_H_ENC, MSUBR_Q_H_DESC;
+def MSUBR_Q_W : MSUBR_Q_W_ENC, MSUBR_Q_W_DESC;
+
+def MSUBV_B : MSUBV_B_ENC, MSUBV_B_DESC;
+def MSUBV_H : MSUBV_H_ENC, MSUBV_H_DESC;
+def MSUBV_W : MSUBV_W_ENC, MSUBV_W_DESC;
+def MSUBV_D : MSUBV_D_ENC, MSUBV_D_DESC;
+
+def MUL_Q_H : MUL_Q_H_ENC, MUL_Q_H_DESC;
+def MUL_Q_W : MUL_Q_W_ENC, MUL_Q_W_DESC;
+
+def MULR_Q_H : MULR_Q_H_ENC, MULR_Q_H_DESC;
+def MULR_Q_W : MULR_Q_W_ENC, MULR_Q_W_DESC;
+
+def MULV_B : MULV_B_ENC, MULV_B_DESC;
+def MULV_H : MULV_H_ENC, MULV_H_DESC;
+def MULV_W : MULV_W_ENC, MULV_W_DESC;
+def MULV_D : MULV_D_ENC, MULV_D_DESC;
+
+def NLOC_B : NLOC_B_ENC, NLOC_B_DESC;
+def NLOC_H : NLOC_H_ENC, NLOC_H_DESC;
+def NLOC_W : NLOC_W_ENC, NLOC_W_DESC;
+def NLOC_D : NLOC_D_ENC, NLOC_D_DESC;
+
+def NLZC_B : NLZC_B_ENC, NLZC_B_DESC;
+def NLZC_H : NLZC_H_ENC, NLZC_H_DESC;
+def NLZC_W : NLZC_W_ENC, NLZC_W_DESC;
+def NLZC_D : NLZC_D_ENC, NLZC_D_DESC;
+
+def NOR_V : NOR_V_ENC, NOR_V_DESC;
+def NOR_V_H_PSEUDO : NOR_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def NOR_V_W_PSEUDO : NOR_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def NOR_V_D_PSEUDO : NOR_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def NORI_B : NORI_B_ENC, NORI_B_DESC;
+
+def OR_V : OR_V_ENC, OR_V_DESC;
+def OR_V_H_PSEUDO : OR_V_H_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+def OR_V_W_PSEUDO : OR_V_W_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+def OR_V_D_PSEUDO : OR_V_D_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+
+def ORI_B : ORI_B_ENC, ORI_B_DESC;
+
+def PCKEV_B : PCKEV_B_ENC, PCKEV_B_DESC;
+def PCKEV_H : PCKEV_H_ENC, PCKEV_H_DESC;
+def PCKEV_W : PCKEV_W_ENC, PCKEV_W_DESC;
+def PCKEV_D : PCKEV_D_ENC, PCKEV_D_DESC;
+
+def PCKOD_B : PCKOD_B_ENC, PCKOD_B_DESC;
+def PCKOD_H : PCKOD_H_ENC, PCKOD_H_DESC;
+def PCKOD_W : PCKOD_W_ENC, PCKOD_W_DESC;
+def PCKOD_D : PCKOD_D_ENC, PCKOD_D_DESC;
+
+def PCNT_B : PCNT_B_ENC, PCNT_B_DESC;
+def PCNT_H : PCNT_H_ENC, PCNT_H_DESC;
+def PCNT_W : PCNT_W_ENC, PCNT_W_DESC;
+def PCNT_D : PCNT_D_ENC, PCNT_D_DESC;
+
+def SAT_S_B : SAT_S_B_ENC, SAT_S_B_DESC;
+def SAT_S_H : SAT_S_H_ENC, SAT_S_H_DESC;
+def SAT_S_W : SAT_S_W_ENC, SAT_S_W_DESC;
+def SAT_S_D : SAT_S_D_ENC, SAT_S_D_DESC;
+
+def SAT_U_B : SAT_U_B_ENC, SAT_U_B_DESC;
+def SAT_U_H : SAT_U_H_ENC, SAT_U_H_DESC;
+def SAT_U_W : SAT_U_W_ENC, SAT_U_W_DESC;
+def SAT_U_D : SAT_U_D_ENC, SAT_U_D_DESC;
+
+def SHF_B : SHF_B_ENC, SHF_B_DESC;
+def SHF_H : SHF_H_ENC, SHF_H_DESC;
+def SHF_W : SHF_W_ENC, SHF_W_DESC;
+
+def SLD_B : SLD_B_ENC, SLD_B_DESC;
+def SLD_H : SLD_H_ENC, SLD_H_DESC;
+def SLD_W : SLD_W_ENC, SLD_W_DESC;
+def SLD_D : SLD_D_ENC, SLD_D_DESC;
+
+def SLDI_B : SLDI_B_ENC, SLDI_B_DESC;
+def SLDI_H : SLDI_H_ENC, SLDI_H_DESC;
+def SLDI_W : SLDI_W_ENC, SLDI_W_DESC;
+def SLDI_D : SLDI_D_ENC, SLDI_D_DESC;
+
+def SLL_B : SLL_B_ENC, SLL_B_DESC;
+def SLL_H : SLL_H_ENC, SLL_H_DESC;
+def SLL_W : SLL_W_ENC, SLL_W_DESC;
+def SLL_D : SLL_D_ENC, SLL_D_DESC;
+
+def SLLI_B : SLLI_B_ENC, SLLI_B_DESC;
+def SLLI_H : SLLI_H_ENC, SLLI_H_DESC;
+def SLLI_W : SLLI_W_ENC, SLLI_W_DESC;
+def SLLI_D : SLLI_D_ENC, SLLI_D_DESC;
+
+def SPLAT_B : SPLAT_B_ENC, SPLAT_B_DESC;
+def SPLAT_H : SPLAT_H_ENC, SPLAT_H_DESC;
+def SPLAT_W : SPLAT_W_ENC, SPLAT_W_DESC;
+def SPLAT_D : SPLAT_D_ENC, SPLAT_D_DESC;
+
+def SPLATI_B : SPLATI_B_ENC, SPLATI_B_DESC;
+def SPLATI_H : SPLATI_H_ENC, SPLATI_H_DESC;
+def SPLATI_W : SPLATI_W_ENC, SPLATI_W_DESC;
+def SPLATI_D : SPLATI_D_ENC, SPLATI_D_DESC;
+
+def SRA_B : SRA_B_ENC, SRA_B_DESC;
+def SRA_H : SRA_H_ENC, SRA_H_DESC;
+def SRA_W : SRA_W_ENC, SRA_W_DESC;
+def SRA_D : SRA_D_ENC, SRA_D_DESC;
+
+def SRAI_B : SRAI_B_ENC, SRAI_B_DESC;
+def SRAI_H : SRAI_H_ENC, SRAI_H_DESC;
+def SRAI_W : SRAI_W_ENC, SRAI_W_DESC;
+def SRAI_D : SRAI_D_ENC, SRAI_D_DESC;
+
+def SRAR_B : SRAR_B_ENC, SRAR_B_DESC;
+def SRAR_H : SRAR_H_ENC, SRAR_H_DESC;
+def SRAR_W : SRAR_W_ENC, SRAR_W_DESC;
+def SRAR_D : SRAR_D_ENC, SRAR_D_DESC;
+
+def SRARI_B : SRARI_B_ENC, SRARI_B_DESC;
+def SRARI_H : SRARI_H_ENC, SRARI_H_DESC;
+def SRARI_W : SRARI_W_ENC, SRARI_W_DESC;
+def SRARI_D : SRARI_D_ENC, SRARI_D_DESC;
+
+def SRL_B : SRL_B_ENC, SRL_B_DESC;
+def SRL_H : SRL_H_ENC, SRL_H_DESC;
+def SRL_W : SRL_W_ENC, SRL_W_DESC;
+def SRL_D : SRL_D_ENC, SRL_D_DESC;
+
+def SRLI_B : SRLI_B_ENC, SRLI_B_DESC;
+def SRLI_H : SRLI_H_ENC, SRLI_H_DESC;
+def SRLI_W : SRLI_W_ENC, SRLI_W_DESC;
+def SRLI_D : SRLI_D_ENC, SRLI_D_DESC;
+
+def SRLR_B : SRLR_B_ENC, SRLR_B_DESC;
+def SRLR_H : SRLR_H_ENC, SRLR_H_DESC;
+def SRLR_W : SRLR_W_ENC, SRLR_W_DESC;
+def SRLR_D : SRLR_D_ENC, SRLR_D_DESC;
+
+def SRLRI_B : SRLRI_B_ENC, SRLRI_B_DESC;
+def SRLRI_H : SRLRI_H_ENC, SRLRI_H_DESC;
+def SRLRI_W : SRLRI_W_ENC, SRLRI_W_DESC;
+def SRLRI_D : SRLRI_D_ENC, SRLRI_D_DESC;
+
+def ST_B: ST_B_ENC, ST_B_DESC;
+def ST_H: ST_H_ENC, ST_H_DESC;
+def ST_W: ST_W_ENC, ST_W_DESC;
+def ST_D: ST_D_ENC, ST_D_DESC;
+
+def SUBS_S_B : SUBS_S_B_ENC, SUBS_S_B_DESC;
+def SUBS_S_H : SUBS_S_H_ENC, SUBS_S_H_DESC;
+def SUBS_S_W : SUBS_S_W_ENC, SUBS_S_W_DESC;
+def SUBS_S_D : SUBS_S_D_ENC, SUBS_S_D_DESC;
+
+def SUBS_U_B : SUBS_U_B_ENC, SUBS_U_B_DESC;
+def SUBS_U_H : SUBS_U_H_ENC, SUBS_U_H_DESC;
+def SUBS_U_W : SUBS_U_W_ENC, SUBS_U_W_DESC;
+def SUBS_U_D : SUBS_U_D_ENC, SUBS_U_D_DESC;
+
+def SUBSUS_U_B : SUBSUS_U_B_ENC, SUBSUS_U_B_DESC;
+def SUBSUS_U_H : SUBSUS_U_H_ENC, SUBSUS_U_H_DESC;
+def SUBSUS_U_W : SUBSUS_U_W_ENC, SUBSUS_U_W_DESC;
+def SUBSUS_U_D : SUBSUS_U_D_ENC, SUBSUS_U_D_DESC;
+
+def SUBSUU_S_B : SUBSUU_S_B_ENC, SUBSUU_S_B_DESC;
+def SUBSUU_S_H : SUBSUU_S_H_ENC, SUBSUU_S_H_DESC;
+def SUBSUU_S_W : SUBSUU_S_W_ENC, SUBSUU_S_W_DESC;
+def SUBSUU_S_D : SUBSUU_S_D_ENC, SUBSUU_S_D_DESC;
+
+def SUBV_B : SUBV_B_ENC, SUBV_B_DESC;
+def SUBV_H : SUBV_H_ENC, SUBV_H_DESC;
+def SUBV_W : SUBV_W_ENC, SUBV_W_DESC;
+def SUBV_D : SUBV_D_ENC, SUBV_D_DESC;
+
+def SUBVI_B : SUBVI_B_ENC, SUBVI_B_DESC;
+def SUBVI_H : SUBVI_H_ENC, SUBVI_H_DESC;
+def SUBVI_W : SUBVI_W_ENC, SUBVI_W_DESC;
+def SUBVI_D : SUBVI_D_ENC, SUBVI_D_DESC;
+
+def VSHF_B : VSHF_B_ENC, VSHF_B_DESC;
+def VSHF_H : VSHF_H_ENC, VSHF_H_DESC;
+def VSHF_W : VSHF_W_ENC, VSHF_W_DESC;
+def VSHF_D : VSHF_D_ENC, VSHF_D_DESC;
+
+def XOR_V : XOR_V_ENC, XOR_V_DESC;
+def XOR_V_H_PSEUDO : XOR_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def XOR_V_W_PSEUDO : XOR_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def XOR_V_D_PSEUDO : XOR_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def XORI_B : XORI_B_ENC, XORI_B_DESC;
+
+// Patterns.
+class MSAPat<dag pattern, dag result, list<Predicate> pred = [HasMSA]> :
+  Pat<pattern, result>, Requires<pred>;
+
+def : MSAPat<(extractelt (v4i32 MSA128W:$ws), immZExt4:$idx),
+             (COPY_S_W MSA128W:$ws, immZExt4:$idx)>;
+
+def : MSAPat<(v16i8 (load addr:$addr)), (LD_B addr:$addr)>;
+def : MSAPat<(v8i16 (load addr:$addr)), (LD_H addr:$addr)>;
+def : MSAPat<(v4i32 (load addr:$addr)), (LD_W addr:$addr)>;
+def : MSAPat<(v2i64 (load addr:$addr)), (LD_D addr:$addr)>;
+def : MSAPat<(v8f16 (load addr:$addr)), (LD_H addr:$addr)>;
+def : MSAPat<(v4f32 (load addr:$addr)), (LD_W addr:$addr)>;
+def : MSAPat<(v2f64 (load addr:$addr)), (LD_D addr:$addr)>;
+
+def : MSAPat<(v8f16 (load addrRegImm:$addr)), (LD_H addrRegImm:$addr)>;
+def : MSAPat<(v4f32 (load addrRegImm:$addr)), (LD_W addrRegImm:$addr)>;
+def : MSAPat<(v2f64 (load addrRegImm:$addr)), (LD_D addrRegImm:$addr)>;
+
+def : MSAPat<(store (v16i8 MSA128B:$ws), addr:$addr),
+             (ST_B MSA128B:$ws, addr:$addr)>;
+def : MSAPat<(store (v8i16 MSA128H:$ws), addr:$addr),
+             (ST_H MSA128H:$ws, addr:$addr)>;
+def : MSAPat<(store (v4i32 MSA128W:$ws), addr:$addr),
+             (ST_W MSA128W:$ws, addr:$addr)>;
+def : MSAPat<(store (v2i64 MSA128D:$ws), addr:$addr),
+             (ST_D MSA128D:$ws, addr:$addr)>;
+def : MSAPat<(store (v8f16 MSA128H:$ws), addr:$addr),
+             (ST_H MSA128H:$ws, addr:$addr)>;
+def : MSAPat<(store (v4f32 MSA128W:$ws), addr:$addr),
+             (ST_W MSA128W:$ws, addr:$addr)>;
+def : MSAPat<(store (v2f64 MSA128D:$ws), addr:$addr),
+             (ST_D MSA128D:$ws, addr:$addr)>;
+
+def ST_FH : MSAPat<(store (v8f16 MSA128H:$ws), addrRegImm:$addr),
+                   (ST_H MSA128H:$ws, addrRegImm:$addr)>;
+def ST_FW : MSAPat<(store (v4f32 MSA128W:$ws), addrRegImm:$addr),
+                   (ST_W MSA128W:$ws, addrRegImm:$addr)>;
+def ST_FD : MSAPat<(store (v2f64 MSA128D:$ws), addrRegImm:$addr),
+                   (ST_D MSA128D:$ws, addrRegImm:$addr)>;
+
+class MSA_FABS_PSEUDO_DESC_BASE<RegisterOperand ROWD,
+                                RegisterOperand ROWS = ROWD,
+                                InstrItinClass itin = NoItinerary> :
+  MipsPseudo<(outs ROWD:$wd),
+             (ins ROWS:$ws),
+             [(set ROWD:$wd, (fabs ROWS:$ws))]> {
+  InstrItinClass Itinerary = itin;
+}
+def FABS_W : MSA_FABS_PSEUDO_DESC_BASE<MSA128WOpnd>,
+             PseudoInstExpansion<(FMAX_A_W MSA128WOpnd:$wd, MSA128WOpnd:$ws,
+                                           MSA128WOpnd:$ws)>;
+def FABS_D : MSA_FABS_PSEUDO_DESC_BASE<MSA128DOpnd>,
+             PseudoInstExpansion<(FMAX_A_D MSA128DOpnd:$wd, MSA128DOpnd:$ws,
+                                           MSA128DOpnd:$ws)>;
+
+class MSABitconvertPat<ValueType DstVT, ValueType SrcVT,
+                       RegisterClass DstRC, list<Predicate> preds = [HasMSA]> :
+   MSAPat<(DstVT (bitconvert SrcVT:$src)),
+          (COPY_TO_REGCLASS SrcVT:$src, DstRC), preds>;
+
+// These are endian-independant because the element size doesnt change
+def : MSABitconvertPat<v8i16, v8f16, MSA128H>;
+def : MSABitconvertPat<v4i32, v4f32, MSA128W>;
+def : MSABitconvertPat<v2i64, v2f64, MSA128D>;
+def : MSABitconvertPat<v8f16, v8i16, MSA128H>;
+def : MSABitconvertPat<v4f32, v4i32, MSA128W>;
+def : MSABitconvertPat<v2f64, v2i64, MSA128D>;
+
+// Little endian bitcasts are always no-ops
+def : MSABitconvertPat<v16i8, v8i16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4i32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2i64, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v8f16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4f32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2f64, MSA128B, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v8i16, v16i8, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4i32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2i64, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4f32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2f64, MSA128H, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4i32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2i64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4f32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2f64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+// Big endian bitcasts expand to shuffle instructions.
+// This is because bitcast is defined to be a store/load sequence and the
+// vector store/load instructions are mixed-endian with respect to the vector
+// as a whole (little endian with respect to element order, but big endian
+// elements).
+
+class MSABitconvertReverseQuartersPat<ValueType DstVT, ValueType SrcVT,
+                                      RegisterClass DstRC, MSAInst Insn,
+                                      RegisterClass ViaRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 27),
+                           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseHalvesPat<ValueType DstVT, ValueType SrcVT,
+                                    RegisterClass DstRC, MSAInst Insn,
+                                    RegisterClass ViaRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 177),
+                           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseBInHPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+class MSABitconvertReverseBInWPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+class MSABitconvertReverseBInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS
+           (SHF_W
+             (COPY_TO_REGCLASS
+               (SHF_B (COPY_TO_REGCLASS SrcVT:$src, MSA128B), 27),
+               MSA128W), 177),
+           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseHInWPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+class MSABitconvertReverseHInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+class MSABitconvertReverseWInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_W, MSA128W>;
+
+def : MSABitconvertReverseBInHPat<v8i16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInHPat<v8f16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInWPat<v4i32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInWPat<v4f32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInDPat<v2i64, v16i8, MSA128D>;
+def : MSABitconvertReverseBInDPat<v2f64, v16i8, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8i16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8i16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8i16, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8f16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8f16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8f16, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4i32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4i32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4i32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4i32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4i32, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4f32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4f32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4f32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4f32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4f32, MSA128D>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2i64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2i64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2i64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2i64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2i64, MSA128W>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2f64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2f64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2f64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2f64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2f64, MSA128W>;
+
+// Pseudos used to implement BNZ.df, and BZ.df
+
+class MSA_CBRANCH_PSEUDO_DESC_BASE<SDPatternOperator OpNode, ValueType TyNode,
+                                   RegisterClass RCWS,
+                                   InstrItinClass itin = NoItinerary> :
+  MipsPseudo<(outs GPR32:$dst),
+             (ins RCWS:$ws),
+             [(set GPR32:$dst, (OpNode (TyNode RCWS:$ws)))]> {
+  bit usesCustomInserter = 1;
+}
+
+def SNZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v16i8,
+                                                MSA128B, NoItinerary>;
+def SNZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v8i16,
+                                                MSA128H, NoItinerary>;
+def SNZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v4i32,
+                                                MSA128W, NoItinerary>;
+def SNZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v2i64,
+                                                MSA128D, NoItinerary>;
+def SNZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyNonZero, v16i8,
+                                                MSA128B, NoItinerary>;
+
+def SZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v16i8,
+                                               MSA128B, NoItinerary>;
+def SZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v8i16,
+                                               MSA128H, NoItinerary>;
+def SZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v4i32,
+                                               MSA128W, NoItinerary>;
+def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64,
+                                               MSA128D, NoItinerary>;
+def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
+                                               MSA128B, NoItinerary>;
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index a7299d7..dedf802 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -15,6 +15,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
@@ -22,6 +23,53 @@ static cl::opt<bool>
 FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
                  cl::desc("Always use $gp as the global base register."));
 
+// class MipsCallEntry.
+MipsCallEntry::MipsCallEntry(const StringRef &N) {
+#ifndef NDEBUG
+  Name = N;
+  Val = 0;
+#endif
+}
+
+MipsCallEntry::MipsCallEntry(const GlobalValue *V) {
+#ifndef NDEBUG
+  Val = V;
+#endif
+}
+
+bool MipsCallEntry::isConstant(const MachineFrameInfo *) const {
+  return false;
+}
+
+bool MipsCallEntry::isAliased(const MachineFrameInfo *) const {
+  return false;
+}
+
+bool MipsCallEntry::mayAlias(const MachineFrameInfo *) const {
+  return false;
+}
+
+void MipsCallEntry::printCustom(raw_ostream &O) const {
+  O << "MipsCallEntry: ";
+#ifndef NDEBUG
+  if (Val)
+    O << Val->getName();
+  else
+    O << Name;
+#endif
+}
+
+MipsFunctionInfo::~MipsFunctionInfo() {
+  for (StringMap<const MipsCallEntry *>::iterator
+       I = ExternalCallEntries.begin(), E = ExternalCallEntries.end(); I != E;
+       ++I)
+    delete I->getValue();
+
+  for (ValueMap<const GlobalValue *, const MipsCallEntry *>::iterator
+       I = GlobalCallEntries.begin(), E = GlobalCallEntries.end(); I != E; ++I)
+    delete I->second;
+}
+
 bool MipsFunctionInfo::globalBaseRegSet() const {
   return GlobalBaseReg;
 }
@@ -72,4 +120,22 @@ bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
                         || FI == EhDataRegFI[2] || FI == EhDataRegFI[3]);
 }
 
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const StringRef &Name) {
+  const MipsCallEntry *&E = ExternalCallEntries[Name];
+
+  if (!E)
+    E = new MipsCallEntry(Name);
+
+  return MachinePointerInfo(E);
+}
+
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *Val) {
+  const MipsCallEntry *&E = GlobalCallEntries[Val];
+
+  if (!E)
+    E = new MipsCallEntry(Val);
+
+  return MachinePointerInfo(E);
+}
+
 void MipsFunctionInfo::anchor() { }
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index b05b348..43bf682 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -15,56 +15,48 @@
 #define MIPS_MACHINE_FUNCTION_INFO_H
 
 #include "MipsSubtarget.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/ValueMap.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include <utility>
 
 namespace llvm {
 
+/// \brief A class derived from PseudoSourceValue that represents a GOT entry
+/// resolved by lazy-binding.
+class MipsCallEntry : public PseudoSourceValue {
+public:
+  explicit MipsCallEntry(const StringRef &N);
+  explicit MipsCallEntry(const GlobalValue *V);
+  virtual bool isConstant(const MachineFrameInfo *) const;
+  virtual bool isAliased(const MachineFrameInfo *) const;
+  virtual bool mayAlias(const MachineFrameInfo *) const;
+
+private:
+  virtual void printCustom(raw_ostream &O) const;
+#ifndef NDEBUG
+  std::string Name;
+  const GlobalValue *Val;
+#endif
+};
+
 /// MipsFunctionInfo - This class is derived from MachineFunction private
 /// Mips target-specific information for each MachineFunction.
 class MipsFunctionInfo : public MachineFunctionInfo {
-  virtual void anchor();
-
-  MachineFunction& MF;
-  /// SRetReturnReg - Some subtargets require that sret lowering includes
-  /// returning the value of the returned struct in a register. This field
-  /// holds the virtual register into which the sret argument is passed.
-  unsigned SRetReturnReg;
-
-  /// GlobalBaseReg - keeps track of the virtual register initialized for
-  /// use as the global base register. This is used for PIC in some PIC
-  /// relocation models.
-  unsigned GlobalBaseReg;
-
-  /// Mips16SPAliasReg - keeps track of the virtual register initialized for
-  /// use as an alias for SP for use in load/store of halfword/byte from/to
-  /// the stack
-  unsigned Mips16SPAliasReg;
-
-  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
-  int VarArgsFrameIndex;
-
-  /// True if function has a byval argument.
-  bool HasByvalArg;
-
-  /// Size of incoming argument area.
-  unsigned IncomingArgSize;
-
-  /// CallsEhReturn - Whether the function calls llvm.eh.return.
-  bool CallsEhReturn;
-
-  /// Frame objects for spilling eh data registers.
-  int EhDataRegFI[4];
-
 public:
   MipsFunctionInfo(MachineFunction& MF)
    : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
      VarArgsFrameIndex(0), CallsEhReturn(false)
   {}
 
+  ~MipsFunctionInfo();
+
   unsigned getSRetReturnReg() const { return SRetReturnReg; }
   void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
 
@@ -92,6 +84,51 @@ public:
   int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; }
   bool isEhDataRegFI(int FI) const;
 
+  /// \brief Create a MachinePointerInfo that has a MipsCallEntr object
+  /// representing a GOT entry for an external function.
+  MachinePointerInfo callPtrInfo(const StringRef &Name);
+
+  /// \brief Create a MachinePointerInfo that has a MipsCallEntr object
+  /// representing a GOT entry for a global function.
+  MachinePointerInfo callPtrInfo(const GlobalValue *Val);
+
+private:
+  virtual void anchor();
+
+  MachineFunction& MF;
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  /// Mips16SPAliasReg - keeps track of the virtual register initialized for
+  /// use as an alias for SP for use in load/store of halfword/byte from/to
+  /// the stack
+  unsigned Mips16SPAliasReg;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+  /// True if function has a byval argument.
+  bool HasByvalArg;
+
+  /// Size of incoming argument area.
+  unsigned IncomingArgSize;
+
+  /// CallsEhReturn - Whether the function calls llvm.eh.return.
+  bool CallsEhReturn;
+
+  /// Frame objects for spilling eh data registers.
+  int EhDataRegFI[4];
+
+  /// MipsCallEntry maps.
+  StringMap<const MipsCallEntry *> ExternalCallEntries;
+  ValueMap<const GlobalValue *, const MipsCallEntry *> GlobalCallEntries;
 };
 
 } // end of namespace llvm
diff --git a/lib/Target/Mips/MipsOptimizeMathLibCalls.cpp b/lib/Target/Mips/MipsOptimizeMathLibCalls.cpp
deleted file mode 100644
index de3f09c..0000000
--- a/lib/Target/Mips/MipsOptimizeMathLibCalls.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-//===---- MipsOptimizeMathLibCalls.cpp - Optimize math lib calls.      ----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass does an IR transformation which enables the backend to emit native
-// math instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MipsTargetMachine.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-static cl::opt<bool> DisableOpt("disable-mips-math-optimization",
-                                cl::init(false),
-                                cl::desc("MIPS: Disable math lib call "
-                                         "optimization."), cl::Hidden);
-
-namespace {
-  class MipsOptimizeMathLibCalls : public FunctionPass {
-  public:
-    static char ID;
-
-    MipsOptimizeMathLibCalls(MipsTargetMachine &TM_) :
-      FunctionPass(ID), TM(TM_) {}
-
-    virtual const char *getPassName() const {
-      return "MIPS: Optimize calls to math library functions.";
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-
-    virtual bool runOnFunction(Function &F);
-
-  private:
-    /// Optimize calls to sqrt.
-    bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
-                      BasicBlock &CurrBB,
-                      Function::iterator &BB);
-
-    const TargetMachine &TM;
-  };
-
-  char MipsOptimizeMathLibCalls::ID = 0;
-}
-
-FunctionPass *llvm::createMipsOptimizeMathLibCalls(MipsTargetMachine &TM) {
-  return new MipsOptimizeMathLibCalls(TM);
-}
-
-void MipsOptimizeMathLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<TargetLibraryInfo>();
-  FunctionPass::getAnalysisUsage(AU);
-}
-
-bool MipsOptimizeMathLibCalls::runOnFunction(Function &F) {
-  if (DisableOpt)
-    return false;
-
-  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
-
-  if (Subtarget.inMips16Mode())
-    return false;
-
-  bool Changed = false;
-  Function::iterator CurrBB;
-  const TargetLibraryInfo *LibInfo = &getAnalysis<TargetLibraryInfo>();
-
-  for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
-    CurrBB = BB++;
-
-    for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
-         II != IE; ++II) {
-      CallInst *Call = dyn_cast<CallInst>(&*II);
-      Function *CalledFunc;
-
-      if (!Call || !(CalledFunc = Call->getCalledFunction()))
-        continue;
-
-      LibFunc::Func LibFunc;
-      Attribute A = CalledFunc->getAttributes()
-        .getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
-
-      // Skip if function has "use-soft-float" attribute.
-      if ((A.isStringAttribute() && (A.getValueAsString() == "true")) ||
-          TM.Options.UseSoftFloat)
-        continue;
-
-      // Skip if function either has local linkage or is not a known library
-      // function.
-      if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
-          !LibInfo->getLibFunc(CalledFunc->getName(), LibFunc))
-        continue;
-
-      switch (LibFunc) {
-      case LibFunc::sqrtf:
-      case LibFunc::sqrt:
-        if (optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
-          break;
-        continue;
-      default:
-        continue;
-      }
-
-      Changed = true;
-      break;
-    }
-  }
-
-  return Changed;
-}
-
-bool MipsOptimizeMathLibCalls::optimizeSQRT(CallInst *Call,
-                                            Function *CalledFunc,
-                                            BasicBlock &CurrBB,
-                                            Function::iterator &BB) {
-  // There is no need to change the IR, since backend will emit sqrt
-  // instruction if the call has already been marked read-only.
-  if (Call->onlyReadsMemory())
-    return false;
-
-  // Do the following transformation:
-  //
-  // (before)
-  // dst = sqrt(src)
-  //
-  // (after)
-  // v0 = sqrt_noreadmem(src) # native sqrt instruction.
-  // if (v0 is a NaN)
-  //   v1 = sqrt(src)         # library call.
-  // dst = phi(v0, v1)
-  //
-
-  // Move all instructions following Call to newly created block JoinBB.
-  // Create phi and replace all uses.
-  BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this);
-  IRBuilder<> Builder(JoinBB, JoinBB->begin());
-  PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
-  Call->replaceAllUsesWith(Phi);
-
-  // Create basic block LibCallBB and insert a call to library function sqrt.
-  BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
-                                             CurrBB.getParent(), JoinBB);
-  Builder.SetInsertPoint(LibCallBB);
-  Instruction *LibCall = Call->clone();
-  Builder.Insert(LibCall);
-  Builder.CreateBr(JoinBB);
-
-  // Add attribute "readnone" so that backend can use a native sqrt instruction
-  // for this call. Insert a FP compare instruction and a conditional branch
-  // at the end of CurrBB.
-  Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
-  CurrBB.getTerminator()->eraseFromParent();
-  Builder.SetInsertPoint(&CurrBB);
-  Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
-  Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
-
-  // Add phi operands.
-  Phi->addIncoming(Call, &CurrBB);
-  Phi->addIncoming(LibCall, LibCallBB);
-
-  BB = JoinBB;
-  return true;
-}
diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp
index 1919077..fe60841 100644
--- a/lib/Target/Mips/MipsOs16.cpp
+++ b/lib/Target/Mips/MipsOs16.cpp
@@ -14,9 +14,17 @@
 #define DEBUG_TYPE "mips-os16"
 #include "MipsOs16.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+
+static cl::opt<std::string> Mips32FunctionMask(
+  "mips32-function-mask",
+  cl::init(""),
+  cl::desc("Force function to be mips32"),
+  cl::Hidden);
+
 namespace {
 
   // Figure out if we need float point based on the function signature.
@@ -85,18 +93,43 @@ namespace llvm {
 
 
 bool MipsOs16::runOnModule(Module &M) {
-  DEBUG(errs() << "Run on Module MipsOs16\n");
+  bool usingMask = Mips32FunctionMask.length() > 0;
+  bool doneUsingMask = false; // this will make it stop repeating
+  DEBUG(dbgs() << "Run on Module MipsOs16 \n" << Mips32FunctionMask << "\n");
+  if (usingMask)
+    DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
+  unsigned int functionIndex = 0;
   bool modified = false;
   for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
     if (F->isDeclaration()) continue;
     DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-    if (needsFP(*F)) {
-      DEBUG(dbgs() << " need to compile as nomips16 \n");
-      F->addFnAttr("nomips16");
+    if (usingMask) {
+      if (!doneUsingMask) {
+        if (functionIndex == Mips32FunctionMask.length())
+          functionIndex = 0;
+        switch (Mips32FunctionMask[functionIndex]) {
+        case '1':
+          DEBUG(dbgs() << "mask forced mips32: " << F->getName() << "\n");
+          F->addFnAttr("nomips16");
+          break;
+        case '.':
+          doneUsingMask = true;
+          break;
+        default:
+          break;
+        }
+        functionIndex++;
+      }
     }
     else {
-      F->addFnAttr("mips16");
-      DEBUG(dbgs() << " no need to compile as nomips16 \n");
+      if (needsFP(*F)) {
+        DEBUG(dbgs() << "os16 forced mips32: " << F->getName() << "\n");
+        F->addFnAttr("nomips16");
+      }
+      else {
+        DEBUG(dbgs() << "os16 forced mips16: " << F->getName() << "\n");
+        F->addFnAttr("mips16");
+      }
     }
   }
   return modified;
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 0b5fc33..3105b02 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -47,6 +47,11 @@ MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST)
 
 unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
 
+const TargetRegisterClass *
+MipsRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                     unsigned Kind) const {
+  return Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+}
 
 unsigned
 MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
@@ -56,7 +61,7 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
     return 0;
   case Mips::GPR32RegClassID:
   case Mips::GPR64RegClassID:
-  case Mips::DSPRegsRegClassID: {
+  case Mips::DSPRRegClassID: {
     const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
     return 28 - TFI->hasFP(MF);
   }
@@ -78,26 +83,34 @@ const uint16_t* MipsRegisterInfo::
 getCalleeSavedRegs(const MachineFunction *MF) const {
   if (Subtarget.isSingleFloat())
     return CSR_SingleFloatOnly_SaveList;
-  else if (!Subtarget.hasMips64())
-    return CSR_O32_SaveList;
-  else if (Subtarget.isABI_N32())
+
+  if (Subtarget.isABI_N64())
+    return CSR_N64_SaveList;
+
+  if (Subtarget.isABI_N32())
     return CSR_N32_SaveList;
 
-  assert(Subtarget.isABI_N64());
-  return CSR_N64_SaveList;
+  if (Subtarget.isFP64bit())
+    return CSR_O32_FP64_SaveList;
+
+  return CSR_O32_SaveList;
 }
 
 const uint32_t*
 MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
   if (Subtarget.isSingleFloat())
     return CSR_SingleFloatOnly_RegMask;
-  else if (!Subtarget.hasMips64())
-    return CSR_O32_RegMask;
-  else if (Subtarget.isABI_N32())
+
+  if (Subtarget.isABI_N64())
+    return CSR_N64_RegMask;
+
+  if (Subtarget.isABI_N32())
     return CSR_N32_RegMask;
 
-  assert(Subtarget.isABI_N64());
-  return CSR_N64_RegMask;
+  if (Subtarget.isFP64bit())
+    return CSR_O32_FP64_RegMask;
+
+  return CSR_O32_RegMask;
 }
 
 const uint32_t *MipsRegisterInfo::getMips16RetHelperMask() {
@@ -123,7 +136,7 @@ getReservedRegs(const MachineFunction &MF) const {
   for (unsigned I = 0; I < array_lengthof(ReservedGPR64); ++I)
     Reserved.set(ReservedGPR64[I]);
 
-  if (Subtarget.hasMips64()) {
+  if (Subtarget.isFP64bit()) {
     // Reserve all registers in AFGR64.
     for (RegIter Reg = Mips::AFGR64RegClass.begin(),
          EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
@@ -146,7 +159,6 @@ getReservedRegs(const MachineFunction &MF) const {
 
   // Reserve hardware registers.
   Reserved.set(Mips::HWR29);
-  Reserved.set(Mips::HWR29_64);
 
   // Reserve DSP control register.
   Reserved.set(Mips::DSPPos);
@@ -155,6 +167,16 @@ getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(Mips::DSPEFI);
   Reserved.set(Mips::DSPOutFlag);
 
+  // Reserve MSA control registers.
+  Reserved.set(Mips::MSAIR);
+  Reserved.set(Mips::MSACSR);
+  Reserved.set(Mips::MSAAccess);
+  Reserved.set(Mips::MSASave);
+  Reserved.set(Mips::MSAModify);
+  Reserved.set(Mips::MSARequest);
+  Reserved.set(Mips::MSAMap);
+  Reserved.set(Mips::MSAUnmap);
+
   // Reserve RA if in mips16 mode.
   if (Subtarget.inMips16Mode()) {
     Reserved.set(Mips::RA);
@@ -218,12 +240,3 @@ getFrameRegister(const MachineFunction &MF) const {
 
 }
 
-unsigned MipsRegisterInfo::
-getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned MipsRegisterInfo::
-getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 20ba41d..0450c6f 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -42,6 +42,9 @@ public:
   void adjustMipsStackFrame(MachineFunction &MF) const;
 
   /// Code Generation virtual methods...
+  const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
+                                                unsigned Kind) const;
+
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                                MachineFunction &MF) const;
   const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
@@ -65,10 +68,6 @@ public:
   /// Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const;
 
-  /// Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
-
   /// \brief Return GPR register class.
   virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
 
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index c72c30d..3173d09 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -11,9 +11,8 @@
 //  Declarations that describe the MIPS register file
 //===----------------------------------------------------------------------===//
 let Namespace = "Mips" in {
-def sub_fpeven : SubRegIndex<32>;
-def sub_fpodd  : SubRegIndex<32, 32>;
 def sub_32     : SubRegIndex<32>;
+def sub_64     : SubRegIndex<64>;
 def sub_lo     : SubRegIndex<32>;
 def sub_hi     : SubRegIndex<32, 32>;
 def sub_dsp16_19 : SubRegIndex<4, 16>;
@@ -54,17 +53,24 @@ class FPR<bits<16> Enc, string n> : MipsReg<Enc, n>;
 // Mips 64-bit (aliased) FPU Registers
 class AFPR<bits<16> Enc, string n, list<Register> subregs>
   : MipsRegWithSubRegs<Enc, n, subregs> {
-  let SubRegIndices = [sub_fpeven, sub_fpodd];
+  let SubRegIndices = [sub_lo, sub_hi];
   let CoveredBySubRegs = 1;
 }
 
 class AFPR64<bits<16> Enc, string n, list<Register> subregs>
   : MipsRegWithSubRegs<Enc, n, subregs> {
-  let SubRegIndices = [sub_32];
+  let SubRegIndices = [sub_lo, sub_hi];
+  let CoveredBySubRegs = 1;
+}
+
+// Mips 128-bit (aliased) MSA Registers
+class AFPR128<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_64];
 }
 
 // Accumulator Registers
-class ACC<bits<16> Enc, string n, list<Register> subregs>
+class ACCReg<bits<16> Enc, string n, list<Register> subregs>
   : MipsRegWithSubRegs<Enc, n, subregs> {
   let SubRegIndices = [sub_lo, sub_hi];
   let CoveredBySubRegs = 1;
@@ -150,6 +156,10 @@ let Namespace = "Mips" in {
   foreach I = 0-31 in
   def F#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
 
+  // Higher half of 64-bit FP registers.
+  foreach I = 0-31 in
+  def F_HI#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
+
   /// Mips Double point precision FPU Registers (aliased
   /// with the single precision to hold 64 bit values)
   foreach I = 0-15 in
@@ -159,22 +169,28 @@ let Namespace = "Mips" in {
 
   /// Mips Double point precision FPU Registers in MFP64 mode.
   foreach I = 0-31 in
-  def D#I#_64 : AFPR64<I, "f"#I, [!cast<FPR>("F"#I)]>,
+  def D#I#_64 : AFPR64<I, "f"#I, [!cast<FPR>("F"#I), !cast<FPR>("F_HI"#I)]>,
                 DwarfRegNum<[!add(I, 32)]>;
 
+  /// Mips MSA registers
+  /// MSA and FPU cannot both be present unless the FPU has 64-bit registers
+  foreach I = 0-31 in
+  def W#I : AFPR128<I, "w"#I, [!cast<AFPR64>("D"#I#"_64")]>,
+            DwarfRegNum<[!add(I, 32)]>;
+
   // Hi/Lo registers
-  def HI  : Register<"ac0">, DwarfRegNum<[64]>;
-  def HI1 : Register<"ac1">, DwarfRegNum<[176]>;
-  def HI2 : Register<"ac2">, DwarfRegNum<[178]>;
-  def HI3 : Register<"ac3">, DwarfRegNum<[180]>;
-  def LO  : Register<"ac0">, DwarfRegNum<[65]>;
-  def LO1 : Register<"ac1">, DwarfRegNum<[177]>;
-  def LO2 : Register<"ac2">, DwarfRegNum<[179]>;
-  def LO3 : Register<"ac3">, DwarfRegNum<[181]>;
+  def HI0 : MipsReg<0, "ac0">, DwarfRegNum<[64]>;
+  def HI1 : MipsReg<1, "ac1">, DwarfRegNum<[176]>;
+  def HI2 : MipsReg<2, "ac2">, DwarfRegNum<[178]>;
+  def HI3 : MipsReg<3, "ac3">, DwarfRegNum<[180]>;
+  def LO0 : MipsReg<0, "ac0">, DwarfRegNum<[65]>;
+  def LO1 : MipsReg<1, "ac1">, DwarfRegNum<[177]>;
+  def LO2 : MipsReg<2, "ac2">, DwarfRegNum<[179]>;
+  def LO3 : MipsReg<3, "ac3">, DwarfRegNum<[181]>;
 
   let SubRegIndices = [sub_32] in {
-  def HI64  : RegisterWithSubRegs<"hi", [HI]>;
-  def LO64  : RegisterWithSubRegs<"lo", [LO]>;
+  def HI0_64  : RegisterWithSubRegs<"hi", [HI0]>;
+  def LO0_64  : RegisterWithSubRegs<"lo", [LO0]>;
   }
 
   // FP control registers.
@@ -185,20 +201,22 @@ let Namespace = "Mips" in {
   foreach I = 0-7 in
   def FCC#I : MipsReg<#I, "fcc"#I>;
 
+  // COP2 registers.
+  foreach I = 0-31 in
+  def COP2#I : MipsReg<#I, ""#I>;
+
   // PC register
   def PC : Register<"pc">;
 
   // Hardware register $29
   def HWR29 : MipsReg<29, "29">;
-  def HWR29_64 : MipsReg<29, "29">;
 
   // Accum registers
-  def AC0 : ACC<0, "ac0", [LO, HI]>;
-  def AC1 : ACC<1, "ac1", [LO1, HI1]>;
-  def AC2 : ACC<2, "ac2", [LO2, HI2]>;
-  def AC3 : ACC<3, "ac3", [LO3, HI3]>;
+  foreach I = 0-3 in
+  def AC#I : ACCReg<#I, "ac"#I,
+                    [!cast<Register>("LO"#I), !cast<Register>("HI"#I)]>;
 
-  def AC0_64 : ACC<0, "ac0", [LO64, HI64]>;
+  def AC0_64 : ACCReg<0, "ac0", [LO0_64, HI0_64]>;
 
   // DSP-ASE control register fields.
   def DSPPos : Register<"">;
@@ -217,6 +235,16 @@ let Namespace = "Mips" in {
   def DSPOutFlag : RegisterWithSubRegs<"", [DSPOutFlag16_19, DSPOutFlag20,
                                             DSPOutFlag21, DSPOutFlag22,
                                             DSPOutFlag23]>;
+
+  // MSA-ASE control registers.
+  def MSAIR      : MipsReg<0, "0">;
+  def MSACSR     : MipsReg<1, "1">;
+  def MSAAccess  : MipsReg<2, "2">;
+  def MSASave    : MipsReg<3, "3">;
+  def MSAModify  : MipsReg<4, "4">;
+  def MSARequest : MipsReg<5, "5">;
+  def MSAMap     : MipsReg<6, "6">;
+  def MSAUnmap   : MipsReg<7, "7">;
 }
 
 //===----------------------------------------------------------------------===//
@@ -239,7 +267,7 @@ class GPR32Class<list<ValueType> regTypes> :
   K0, K1, GP, SP, FP, RA)>;
 
 def GPR32 : GPR32Class<[i32]>;
-def DSPRegs : GPR32Class<[v4i8, v2i16]>;
+def DSPR  : GPR32Class<[v4i8, v2i16]>;
 
 def GPR64 : RegisterClass<"Mips", [i64], 64, (add
 // Reserved
@@ -281,6 +309,9 @@ def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
 // * FGR32 - 32 32-bit registers (single float only mode)
 def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>;
 
+def FGRH32 : RegisterClass<"Mips", [f32], 32, (sequence "F_HI%u", 0, 31)>,
+             Unallocatable;
+
 def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
   // Return Values and Arguments
   D0, D1,
@@ -303,33 +334,48 @@ def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>,
 def FCC : RegisterClass<"Mips", [i32], 32, (sequence "FCC%u", 0, 7)>,
           Unallocatable;
 
+def MSA128B: RegisterClass<"Mips", [v16i8], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128H: RegisterClass<"Mips", [v8i16, v8f16], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128W: RegisterClass<"Mips", [v4i32, v4f32], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128D: RegisterClass<"Mips", [v2i64, v2f64], 128,
+                           (sequence "W%u", 0, 31)>;
+
+def MSACtrl: RegisterClass<"Mips", [i32], 32, (add
+  MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>;
+
 // Hi/Lo Registers
-def LORegs : RegisterClass<"Mips", [i32], 32, (add LO)>;
-def HIRegs : RegisterClass<"Mips", [i32], 32, (add HI)>;
-def LORegsDSP : RegisterClass<"Mips", [i32], 32, (add LO, LO1, LO2, LO3)>;
-def HIRegsDSP : RegisterClass<"Mips", [i32], 32, (add HI, HI1, HI2, HI3)>;
-def LORegs64 : RegisterClass<"Mips", [i64], 64, (add LO64)>;
-def HIRegs64 : RegisterClass<"Mips", [i64], 64, (add HI64)>;
+def LO32 : RegisterClass<"Mips", [i32], 32, (add LO0)>;
+def HI32 : RegisterClass<"Mips", [i32], 32, (add HI0)>;
+def LO32DSP : RegisterClass<"Mips", [i32], 32, (sequence "LO%u", 0, 3)>;
+def HI32DSP : RegisterClass<"Mips", [i32], 32, (sequence "HI%u", 0, 3)>;
+def LO64 : RegisterClass<"Mips", [i64], 64, (add LO0_64)>;
+def HI64 : RegisterClass<"Mips", [i64], 64, (add HI0_64)>;
 
 // Hardware registers
 def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>, Unallocatable;
-def HWRegs64 : RegisterClass<"Mips", [i64], 64, (add HWR29_64)>, Unallocatable;
 
 // Accumulator Registers
-def ACRegs : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
+def ACC64 : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
   let Size = 64;
 }
 
-def ACRegs128 : RegisterClass<"Mips", [untyped], 128, (add AC0_64)> {
+def ACC128 : RegisterClass<"Mips", [untyped], 128, (add AC0_64)> {
   let Size = 128;
 }
 
-def ACRegsDSP : RegisterClass<"Mips", [untyped], 64, (sequence "AC%u", 0, 3)> {
+def ACC64DSP : RegisterClass<"Mips", [untyped], 64, (sequence "AC%u", 0, 3)> {
   let Size = 64;
 }
 
 def DSPCC : RegisterClass<"Mips", [v4i8, v2i16], 32, (add DSPCCond)>;
 
+// Coprocessor 2 registers.
+def COP2 : RegisterClass<"Mips", [i32], 32, (sequence "COP2%u", 0, 31)>,
+           Unallocatable;
+
 // Register Operands.
 
 class MipsAsmRegOperand : AsmOperandClass {
@@ -345,9 +391,19 @@ def GPR64AsmOperand : MipsAsmRegOperand {
   let ParserMethod = "parseGPR64";
 }
 
-def ACRegsDSPAsmOperand : MipsAsmRegOperand {
-  let Name = "ACRegsDSPAsm";
-  let ParserMethod = "parseACRegsDSP";
+def ACC64DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "ACC64DSPAsm";
+  let ParserMethod = "parseACC64DSP";
+}
+
+def LO32DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "LO32DSPAsm";
+  let ParserMethod = "parseLO32DSP";
+}
+
+def HI32DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "HI32DSPAsm";
+  let ParserMethod = "parseHI32DSP";
 }
 
 def CCRAsmOperand : MipsAsmRegOperand {
@@ -370,11 +426,41 @@ def FGR32AsmOperand : MipsAsmRegOperand {
   let ParserMethod = "parseFGR32Regs";
 }
 
+def FGRH32AsmOperand : MipsAsmRegOperand {
+  let Name = "FGRH32Asm";
+  let ParserMethod = "parseFGRH32Regs";
+}
+
 def FCCRegsAsmOperand : MipsAsmRegOperand {
   let Name = "FCCRegsAsm";
   let ParserMethod = "parseFCCRegs";
 }
 
+def MSA128BAsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128BAsm";
+  let ParserMethod = "parseMSA128BRegs";
+}
+
+def MSA128HAsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128HAsm";
+  let ParserMethod = "parseMSA128HRegs";
+}
+
+def MSA128WAsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128WAsm";
+  let ParserMethod = "parseMSA128WRegs";
+}
+
+def MSA128DAsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128DAsm";
+  let ParserMethod = "parseMSA128DRegs";
+}
+
+def MSA128CRAsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128CRAsm";
+  let ParserMethod = "parseMSA128CtrlRegs";
+}
+
 def GPR32Opnd : RegisterOperand<GPR32> {
   let ParserMatchClass = GPR32AsmOperand;
 }
@@ -383,6 +469,10 @@ def GPR64Opnd : RegisterOperand<GPR64> {
   let ParserMatchClass = GPR64AsmOperand;
 }
 
+def DSPROpnd : RegisterOperand<DSPR> {
+  let ParserMatchClass = GPR32AsmOperand;
+}
+
 def CCROpnd : RegisterOperand<CCR> {
   let ParserMatchClass = CCRAsmOperand;
 }
@@ -392,35 +482,68 @@ def HWRegsAsmOperand : MipsAsmRegOperand {
   let ParserMethod = "parseHWRegs";
 }
 
-def HW64RegsAsmOperand : MipsAsmRegOperand {
-  let Name = "HW64RegsAsm";
-  let ParserMethod = "parseHW64Regs";
+def COP2AsmOperand : MipsAsmRegOperand {
+  let Name = "COP2Asm";
+  let ParserMethod = "parseCOP2";
 }
 
 def HWRegsOpnd : RegisterOperand<HWRegs> {
   let ParserMatchClass = HWRegsAsmOperand;
 }
 
-def HW64RegsOpnd : RegisterOperand<HWRegs64> {
-  let ParserMatchClass = HW64RegsAsmOperand;
-}
-
-def AFGR64RegsOpnd : RegisterOperand<AFGR64> {
+def AFGR64Opnd : RegisterOperand<AFGR64> {
   let ParserMatchClass = AFGR64AsmOperand;
 }
 
-def FGR64RegsOpnd : RegisterOperand<FGR64> {
+def FGR64Opnd : RegisterOperand<FGR64> {
   let ParserMatchClass = FGR64AsmOperand;
 }
 
-def FGR32RegsOpnd : RegisterOperand<FGR32> {
+def FGR32Opnd : RegisterOperand<FGR32> {
   let ParserMatchClass = FGR32AsmOperand;
 }
 
+def FGRH32Opnd : RegisterOperand<FGRH32> {
+  let ParserMatchClass = FGRH32AsmOperand;
+}
+
 def FCCRegsOpnd : RegisterOperand<FCC> {
   let ParserMatchClass = FCCRegsAsmOperand;
 }
 
-def ACRegsDSPOpnd : RegisterOperand<ACRegsDSP> {
-  let ParserMatchClass = ACRegsDSPAsmOperand;
+def LO32DSPOpnd : RegisterOperand<LO32DSP> {
+  let ParserMatchClass = LO32DSPAsmOperand;
 }
+
+def HI32DSPOpnd : RegisterOperand<HI32DSP> {
+  let ParserMatchClass = HI32DSPAsmOperand;
+}
+
+def ACC64DSPOpnd : RegisterOperand<ACC64DSP> {
+  let ParserMatchClass = ACC64DSPAsmOperand;
+}
+
+def COP2Opnd : RegisterOperand<COP2> {
+  let ParserMatchClass = COP2AsmOperand;
+}
+
+def MSA128BOpnd : RegisterOperand<MSA128B> {
+  let ParserMatchClass = MSA128BAsmOperand;
+}
+
+def MSA128HOpnd : RegisterOperand<MSA128H> {
+  let ParserMatchClass = MSA128HAsmOperand;
+}
+
+def MSA128WOpnd : RegisterOperand<MSA128W> {
+  let ParserMatchClass = MSA128WAsmOperand;
+}
+
+def MSA128DOpnd : RegisterOperand<MSA128D> {
+  let ParserMatchClass = MSA128DAsmOperand;
+}
+
+def MSA128CROpnd : RegisterOperand<MSACtrl> {
+  let ParserMatchClass = MSA128CRAsmOperand;
+}
+
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index d9e0fa4..33ed4b3 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -32,6 +32,21 @@ using namespace llvm;
 namespace {
 typedef MachineBasicBlock::iterator Iter;
 
+static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
+  if (Mips::ACC64RegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::PseudoMFHI,
+                          (unsigned)Mips::PseudoMFLO);
+
+  if (Mips::ACC64DSPRegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::MFHI_DSP, (unsigned)Mips::MFLO_DSP);
+
+  if (Mips::ACC128RegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::PseudoMFHI64,
+                          (unsigned)Mips::PseudoMFLO64);
+
+  return std::make_pair(0, 0);
+}
+
 /// Helper class to expand pseudos.
 class ExpandPseudo {
 public:
@@ -43,10 +58,11 @@ private:
   void expandLoadCCond(MachineBasicBlock &MBB, Iter I);
   void expandStoreCCond(MachineBasicBlock &MBB, Iter I);
   void expandLoadACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
-  void expandStoreACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
+  void expandStoreACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+                      unsigned MFLoOpc, unsigned RegSize);
   bool expandCopy(MachineBasicBlock &MBB, Iter I);
-  bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned Dst,
-                     unsigned Src, unsigned RegSize);
+  bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+                     unsigned MFLoOpc);
 
   MachineFunction &MF;
   MachineRegisterInfo &MRI;
@@ -70,32 +86,26 @@ bool ExpandPseudo::expand() {
 bool ExpandPseudo::expandInstr(MachineBasicBlock &MBB, Iter I) {
   switch(I->getOpcode()) {
   case Mips::LOAD_CCOND_DSP:
-  case Mips::LOAD_CCOND_DSP_P8:
     expandLoadCCond(MBB, I);
     break;
   case Mips::STORE_CCOND_DSP:
-  case Mips::STORE_CCOND_DSP_P8:
     expandStoreCCond(MBB, I);
     break;
-  case Mips::LOAD_AC64:
-  case Mips::LOAD_AC64_P8:
-  case Mips::LOAD_AC_DSP:
-  case Mips::LOAD_AC_DSP_P8:
+  case Mips::LOAD_ACC64:
+  case Mips::LOAD_ACC64DSP:
     expandLoadACC(MBB, I, 4);
     break;
-  case Mips::LOAD_AC128:
-  case Mips::LOAD_AC128_P8:
+  case Mips::LOAD_ACC128:
     expandLoadACC(MBB, I, 8);
     break;
-  case Mips::STORE_AC64:
-  case Mips::STORE_AC64_P8:
-  case Mips::STORE_AC_DSP:
-  case Mips::STORE_AC_DSP_P8:
-    expandStoreACC(MBB, I, 4);
+  case Mips::STORE_ACC64:
+    expandStoreACC(MBB, I, Mips::PseudoMFHI, Mips::PseudoMFLO, 4);
+    break;
+  case Mips::STORE_ACC64DSP:
+    expandStoreACC(MBB, I, Mips::MFHI_DSP, Mips::MFLO_DSP, 4);
     break;
-  case Mips::STORE_AC128:
-  case Mips::STORE_AC128_P8:
-    expandStoreACC(MBB, I, 8);
+  case Mips::STORE_ACC128:
+    expandStoreACC(MBB, I, Mips::PseudoMFHI64, Mips::PseudoMFLO64, 8);
     break;
   case TargetOpcode::COPY:
     if (!expandCopy(MBB, I))
@@ -179,10 +189,11 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
 }
 
 void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
+                                  unsigned MFHiOpc, unsigned MFLoOpc,
                                   unsigned RegSize) {
-  //  copy $vr0, lo
+  //  mflo $vr0, src
   //  store $vr0, FI
-  //  copy $vr1, hi
+  //  mfhi $vr1, src
   //  store $vr1, FI + 4
 
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
@@ -197,33 +208,29 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
   unsigned VR1 = MRI.createVirtualRegister(RC);
   unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
   unsigned SrcKill = getKillRegState(I->getOperand(0).isKill());
-  unsigned Lo = RegInfo.getSubReg(Src, Mips::sub_lo);
-  unsigned Hi = RegInfo.getSubReg(Src, Mips::sub_hi);
   DebugLoc DL = I->getDebugLoc();
 
-  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR0).addReg(Lo, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
   TII.storeRegToStack(MBB, I, VR0, true, FI, RC, &RegInfo, 0);
-  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR1).addReg(Hi, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
   TII.storeRegToStack(MBB, I, VR1, true, FI, RC, &RegInfo, RegSize);
 }
 
 bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) {
-  unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
-
-  if (Mips::ACRegsDSPRegClass.contains(Dst, Src))
-    return expandCopyACC(MBB, I, Dst, Src, 4);
+  unsigned Src = I->getOperand(1).getReg();
+  std::pair<unsigned, unsigned> Opcodes = getMFHiLoOpc(Src);
 
-  if (Mips::ACRegs128RegClass.contains(Dst, Src))
-    return expandCopyACC(MBB, I, Dst, Src, 8);
+  if (!Opcodes.first)
+    return false;
 
-  return false;
+  return expandCopyACC(MBB, I, Opcodes.first, Opcodes.second);
 }
 
-bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned Dst,
-                                 unsigned Src, unsigned RegSize) {
-  //  copy $vr0, src_lo
+bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I,
+                                 unsigned MFHiOpc, unsigned MFLoOpc) {
+  //  mflo $vr0, src
   //  copy dst_lo, $vr0
-  //  copy $vr1, src_hi
+  //  mfhi $vr1, src
   //  copy dst_hi, $vr1
 
   const MipsSEInstrInfo &TII =
@@ -231,20 +238,20 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned Dst,
   const MipsRegisterInfo &RegInfo =
     *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
 
-  const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
+  unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
+  unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2;
+  const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize);
   unsigned VR0 = MRI.createVirtualRegister(RC);
   unsigned VR1 = MRI.createVirtualRegister(RC);
   unsigned SrcKill = getKillRegState(I->getOperand(1).isKill());
   unsigned DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo);
   unsigned DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi);
-  unsigned SrcLo = RegInfo.getSubReg(Src, Mips::sub_lo);
-  unsigned SrcHi = RegInfo.getSubReg(Src, Mips::sub_hi);
   DebugLoc DL = I->getDebugLoc();
 
-  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR0).addReg(SrcLo, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
   BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstLo)
     .addReg(VR0, RegState::Kill);
-  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR1).addReg(SrcHi, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
   BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstHi)
     .addReg(VR1, RegState::Kill);
   return true;
@@ -321,9 +328,9 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
       // one for each of the paired single precision registers.
       if (Mips::AFGR64RegClass.contains(Reg)) {
         unsigned Reg0 =
-            MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_fpeven), true);
+            MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_lo), true);
         unsigned Reg1 =
-            MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_fpodd), true);
+            MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_hi), true);
 
         if (!STI.isLittle())
           std::swap(Reg0, Reg1);
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index 193a66c..8fa9e46 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -21,7 +21,7 @@ namespace llvm {
 class MipsSEFrameLowering : public MipsFrameLowering {
 public:
   explicit MipsSEFrameLowering(const MipsSubtarget &STI)
-    : MipsFrameLowering(STI, STI.hasMips64() ? 16 : 8) {}
+    : MipsFrameLowering(STI, STI.stackAlignment()) {}
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 3b6480a..737660e 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -66,6 +66,21 @@ void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
     MIB.addReg(Mips::DSPEFI, Flag);
 }
 
+unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const {
+  switch (cast<ConstantSDNode>(RegIdx)->getZExtValue()) {
+  default:
+    llvm_unreachable("Could not map int to register");
+  case 0: return Mips::MSAIR;
+  case 1: return Mips::MSACSR;
+  case 2: return Mips::MSAAccess;
+  case 3: return Mips::MSASave;
+  case 4: return Mips::MSAModify;
+  case 5: return Mips::MSARequest;
+  case 6: return Mips::MSAMap;
+  case 7: return Mips::MSAUnmap;
+  }
+}
+
 bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
                                                 const MachineInstr& MI) {
   unsigned DstReg = 0, ZeroReg = 0;
@@ -301,6 +316,20 @@ bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
   return false;
 }
 
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsSEDAGToDAGISel::selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                          SDValue &Offset) const {
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    Base = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+
+  return false;
+}
+
 bool MipsSEDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) const {
   Base = Addr;
@@ -314,6 +343,263 @@ bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
     selectAddrDefault(Addr, Base, Offset);
 }
 
+/// Used on microMIPS Load/Store unaligned instructions (12-bit offset)
+bool MipsSEDAGToDAGISel::selectAddrRegImm12(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) const {
+  EVT ValTy = Addr.getValueType();
+
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<12>(CN->getSExtValue())) {
+
+      // If the first operand is a FI then get the TargetFI Node
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                  (Addr.getOperand(0)))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
+                                         SDValue &Offset) const {
+  return selectAddrRegImm12(Addr, Base, Offset) ||
+    selectAddrDefault(Addr, Base, Offset);
+}
+
+// Select constant vector splats.
+//
+// Returns true and sets Imm if:
+// * MSA is enabled
+// * N is a ISD::BUILD_VECTOR representing a constant splat
+bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
+  if (!Subtarget.hasMSA())
+    return false;
+
+  BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N);
+
+  if (Node == NULL)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                             HasAnyUndefs, 8,
+                             !Subtarget.isLittle()))
+    return false;
+
+  Imm = SplatValue;
+
+  return true;
+}
+
+// Select constant vector splats.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value fits in an integer with the specified signed-ness and
+//   width.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+//
+// It's worth noting that this function is not used as part of the selection
+// of ldi.[bhwd] since it does not permit using the wrong-typed ldi.[bhwd]
+// instruction to achieve the desired bit pattern. ldi.[bhwd] is selected in
+// MipsSEDAGToDAGISel::selectNode.
+bool MipsSEDAGToDAGISel::
+selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+                   unsigned ImmBitSize) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat (N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    if (( Signed && ImmValue.isSignedIntN(ImmBitSize)) ||
+        (!Signed && ImmValue.isIntN(ImmBitSize))) {
+      Imm = CurDAG->getTargetConstant(ImmValue, EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 1);
+}
+
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 2);
+}
+
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 3);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 4);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 5);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 6);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 8);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, true, 5);
+}
+
+// Select constant vector splats whose value is a power of 2.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a power of two.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat (N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    int32_t Log2 = ImmValue.exactLogBase2();
+
+    if (Log2 != -1) {
+      Imm = CurDAG->getTargetConstant(Log2, EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of left-most bits set (e.g. 0b11...1100...00).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of left-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    // Extract the run of set bits starting with bit zero from the bitwise
+    // inverse of ImmValue, and test that the inverse of this is the same
+    // as the original value.
+    if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {
+
+      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of right-most bits set (e.g. 0b00...0011...11).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of right-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    // Extract the run of set bits starting with bit zero, and test that the
+    // result is the same as the original value
+    if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
+      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N,
+                                                 SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    int32_t Log2 = (~ImmValue).exactLogBase2();
+
+    if (Log2 != -1) {
+      Imm = CurDAG->getTargetConstant(Log2, EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
 std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
   unsigned Opcode = Node->getOpcode();
   SDLoc DL(Node);
@@ -348,6 +634,11 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
         SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
                                               Mips::ZERO_64, MVT::i64);
         Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
+      } else if (Subtarget.isFP64bit()) {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO, MVT::i32);
+        Result = CurDAG->getMachineNode(Mips::BuildPairF64_64, DL, MVT::f64,
+                                        Zero, Zero);
       } else {
         SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
                                               Mips::ZERO, MVT::i32);
@@ -401,24 +692,71 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
     return std::make_pair(true, RegOpnd);
   }
 
+  case ISD::INTRINSIC_W_CHAIN: {
+    switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_cfcmsa: {
+      SDValue ChainIn = Node->getOperand(0);
+      SDValue RegIdx = Node->getOperand(2);
+      SDValue Reg = CurDAG->getCopyFromReg(ChainIn, DL,
+                                           getMSACtrlReg(RegIdx), MVT::i32);
+      return std::make_pair(true, Reg.getNode());
+    }
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_move_v:
+      // Like an assignment but will always produce a move.v even if
+      // unnecessary.
+      return std::make_pair(true,
+                            CurDAG->getMachineNode(Mips::MOVE_V, DL,
+                                                   Node->getValueType(0),
+                                                   Node->getOperand(1)));
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_VOID: {
+    switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_ctcmsa: {
+      SDValue ChainIn = Node->getOperand(0);
+      SDValue RegIdx  = Node->getOperand(2);
+      SDValue Value   = Node->getOperand(3);
+      SDValue ChainOut = CurDAG->getCopyToReg(ChainIn, DL,
+                                              getMSACtrlReg(RegIdx), Value);
+      return std::make_pair(true, ChainOut.getNode());
+    }
+    }
+    break;
+  }
+
   case MipsISD::ThreadPointer: {
     EVT PtrVT = getTargetLowering()->getPointerTy();
-    unsigned RdhwrOpc, SrcReg, DestReg;
+    unsigned RdhwrOpc, DestReg;
 
     if (PtrVT == MVT::i32) {
       RdhwrOpc = Mips::RDHWR;
-      SrcReg = Mips::HWR29;
       DestReg = Mips::V1;
     } else {
       RdhwrOpc = Mips::RDHWR64;
-      SrcReg = Mips::HWR29_64;
       DestReg = Mips::V1_64;
     }
 
     SDNode *Rdhwr =
       CurDAG->getMachineNode(RdhwrOpc, SDLoc(Node),
                              Node->getValueType(0),
-                             CurDAG->getRegister(SrcReg, PtrVT));
+                             CurDAG->getRegister(Mips::HWR29, MVT::i32));
     SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
                                          SDValue(Rdhwr, 0));
     SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
@@ -426,18 +764,81 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
     return std::make_pair(true, ResNode.getNode());
   }
 
-  case MipsISD::InsertLOHI: {
-    unsigned RCID = Subtarget.hasDSP() ? Mips::ACRegsDSPRegClassID :
-                                         Mips::ACRegsRegClassID;
-    SDValue RegClass = CurDAG->getTargetConstant(RCID, MVT::i32);
-    SDValue LoIdx = CurDAG->getTargetConstant(Mips::sub_lo, MVT::i32);
-    SDValue HiIdx = CurDAG->getTargetConstant(Mips::sub_hi, MVT::i32);
-    const SDValue Ops[] = { RegClass, Node->getOperand(0), LoIdx,
-                            Node->getOperand(1), HiIdx };
-    SDNode *Res = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
-                                         MVT::Untyped, Ops);
+  case ISD::BUILD_VECTOR: {
+    // Select appropriate ldi.[bhwd] instructions for constant splats of
+    // 128-bit when MSA is enabled. Fixup any register class mismatches that
+    // occur as a result.
+    //
+    // This allows the compiler to use a wider range of immediates than would
+    // otherwise be allowed. If, for example, v4i32 could only use ldi.h then
+    // it would not be possible to load { 0x01010101, 0x01010101, 0x01010101,
+    // 0x01010101 } without using a constant pool. This would be sub-optimal
+    // when // 'ldi.b wd, 1' is capable of producing that bit-pattern in the
+    // same set/ of registers. Similarly, ldi.h isn't capable of producing {
+    // 0x00000000, 0x00000001, 0x00000000, 0x00000001 } but 'ldi.d wd, 1' can.
+
+    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Node);
+    APInt SplatValue, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    unsigned LdiOp;
+    EVT ResVecTy = BVN->getValueType(0);
+    EVT ViaVecTy;
+
+    if (!Subtarget.hasMSA() || !BVN->getValueType(0).is128BitVector())
+      return std::make_pair(false, (SDNode*)NULL);
+
+    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                              HasAnyUndefs, 8,
+                              !Subtarget.isLittle()))
+      return std::make_pair(false, (SDNode*)NULL);
+
+    switch (SplatBitSize) {
+    default:
+      return std::make_pair(false, (SDNode*)NULL);
+    case 8:
+      LdiOp = Mips::LDI_B;
+      ViaVecTy = MVT::v16i8;
+      break;
+    case 16:
+      LdiOp = Mips::LDI_H;
+      ViaVecTy = MVT::v8i16;
+      break;
+    case 32:
+      LdiOp = Mips::LDI_W;
+      ViaVecTy = MVT::v4i32;
+      break;
+    case 64:
+      LdiOp = Mips::LDI_D;
+      ViaVecTy = MVT::v2i64;
+      break;
+    }
+
+    if (!SplatValue.isSignedIntN(10))
+      return std::make_pair(false, (SDNode*)NULL);
+
+    SDValue Imm = CurDAG->getTargetConstant(SplatValue,
+                                            ViaVecTy.getVectorElementType());
+
+    SDNode *Res = CurDAG->getMachineNode(LdiOp, SDLoc(Node), ViaVecTy, Imm);
+
+    if (ResVecTy != ViaVecTy) {
+      // If LdiOp is writing to a different register class to ResVecTy, then
+      // fix it up here. This COPY_TO_REGCLASS should never cause a move.v
+      // since the source and destination register sets contain the same
+      // registers.
+      const TargetLowering *TLI = getTargetLowering();
+      MVT ResVecTySimple = ResVecTy.getSimpleVT();
+      const TargetRegisterClass *RC = TLI->getRegClassFor(ResVecTySimple);
+      Res = CurDAG->getMachineNode(Mips::COPY_TO_REGCLASS, SDLoc(Node),
+                                   ResVecTy, SDValue(Res, 0),
+                                   CurDAG->getTargetConstant(RC->getID(),
+                                                             MVT::i32));
+    }
+
     return std::make_pair(true, Res);
   }
+
   }
 
   return std::make_pair(false, (SDNode*)NULL);
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index 03ed1f9..dc52064 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -30,6 +30,8 @@ private:
   void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
                              MachineFunction &MF);
 
+  unsigned getMSACtrlReg(const SDValue RegIdx) const;
+
   bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
 
   std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, SDLoc dl,
@@ -41,12 +43,54 @@ private:
   virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
                                 SDValue &Offset) const;
 
+  virtual bool selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
   virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
                                  SDValue &Offset) const;
 
   virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
                              SDValue &Offset) const;
 
+  virtual bool selectAddrRegImm12(SDValue Addr, SDValue &Base,
+                                  SDValue &Offset) const;
+
+  virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
+                               SDValue &Offset) const;
+
+  /// \brief Select constant vector splats.
+  virtual bool selectVSplat(SDNode *N, APInt &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a given integer.
+  virtual bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+                                  unsigned ImmBitSize) const;
+  /// \brief Select constant vector splats whose value fits in a uimm1.
+  virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm2.
+  virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm3.
+  virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm4.
+  virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm5.
+  virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm6.
+  virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm8.
+  virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a simm5.
+  virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a power of 2.
+  virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is the inverse of a
+  /// power of 2.
+  virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// ending at the most significant bit
+  virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// starting at bit zero.
+  virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+
   virtual std::pair<bool, SDNode*> selectNode(SDNode *Node);
 
   virtual void processFunctionAfterISel(MachineFunction &MF);
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index a0aacb5..809adc0 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -10,6 +10,7 @@
 // Subclass of MipsTargetLowering specialized for mips32/64.
 //
 //===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "mips-isel"
 #include "MipsSEISelLowering.h"
 #include "MipsRegisterInfo.h"
 #include "MipsTargetMachine.h"
@@ -17,6 +18,8 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
@@ -25,22 +28,40 @@ static cl::opt<bool>
 EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden,
                     cl::desc("MIPS: Enable tail calls."), cl::init(false));
 
+static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
+                                   cl::desc("Expand double precision loads and "
+                                            "stores to their single precision "
+                                            "counterparts"));
+
 MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
   : MipsTargetLowering(TM) {
   // Set up the register classes
-
-  clearRegisterClasses();
-
   addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
 
   if (HasMips64)
     addRegisterClass(MVT::i64, &Mips::GPR64RegClass);
 
+  if (Subtarget->hasDSP() || Subtarget->hasMSA()) {
+    // Expand all truncating stores and extending loads.
+    unsigned FirstVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+    unsigned LastVT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
+
+    for (unsigned VT0 = FirstVT; VT0 <= LastVT; ++VT0) {
+      for (unsigned VT1 = FirstVT; VT1 <= LastVT; ++VT1)
+        setTruncStoreAction((MVT::SimpleValueType)VT0,
+                            (MVT::SimpleValueType)VT1, Expand);
+
+      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+    }
+  }
+
   if (Subtarget->hasDSP()) {
     MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
 
     for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
-      addRegisterClass(VecTys[i], &Mips::DSPRegsRegClass);
+      addRegisterClass(VecTys[i], &Mips::DSPRRegClass);
 
       // Expand all builtin opcodes.
       for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
@@ -53,20 +74,6 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
       setOperationAction(ISD::BITCAST, VecTys[i], Legal);
     }
 
-    // Expand all truncating stores and extending loads.
-    unsigned FirstVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-    unsigned LastVT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
-
-    for (unsigned VT0 = FirstVT; VT0 <= LastVT; ++VT0) {
-      for (unsigned VT1 = FirstVT; VT1 <= LastVT; ++VT1)
-        setTruncStoreAction((MVT::SimpleValueType)VT0,
-                            (MVT::SimpleValueType)VT1, Expand);
-
-      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
-      setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
-      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT0, Expand);
-    }
-
     setTargetDAGCombine(ISD::SHL);
     setTargetDAGCombine(ISD::SRA);
     setTargetDAGCombine(ISD::SRL);
@@ -77,12 +84,28 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
   if (Subtarget->hasDSPR2())
     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
 
-  if (!TM.Options.UseSoftFloat) {
+  if (Subtarget->hasMSA()) {
+    addMSAIntType(MVT::v16i8, &Mips::MSA128BRegClass);
+    addMSAIntType(MVT::v8i16, &Mips::MSA128HRegClass);
+    addMSAIntType(MVT::v4i32, &Mips::MSA128WRegClass);
+    addMSAIntType(MVT::v2i64, &Mips::MSA128DRegClass);
+    addMSAFloatType(MVT::v8f16, &Mips::MSA128HRegClass);
+    addMSAFloatType(MVT::v4f32, &Mips::MSA128WRegClass);
+    addMSAFloatType(MVT::v2f64, &Mips::MSA128DRegClass);
+
+    setTargetDAGCombine(ISD::AND);
+    setTargetDAGCombine(ISD::OR);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::VSELECT);
+    setTargetDAGCombine(ISD::XOR);
+  }
+
+  if (!Subtarget->mipsSEUsesSoftFloat()) {
     addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
 
     // When dealing with single precision only, use libcalls
     if (!Subtarget->isSingleFloat()) {
-      if (HasMips64)
+      if (Subtarget->isFP64bit())
         addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
       else
         addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
@@ -115,6 +138,15 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
   setTargetDAGCombine(ISD::SUBE);
   setTargetDAGCombine(ISD::MUL);
 
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+  if (NoDPLoadStore) {
+    setOperationAction(ISD::LOAD, MVT::f64, Custom);
+    setOperationAction(ISD::STORE, MVT::f64, Custom);
+  }
+
   computeRegisterProperties();
 }
 
@@ -123,6 +155,93 @@ llvm::createMipsSETargetLowering(MipsTargetMachine &TM) {
   return new MipsSETargetLowering(TM);
 }
 
+// Enable MSA support for the given integer type and Register class.
+void MipsSETargetLowering::
+addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+  addRegisterClass(Ty, RC);
+
+  // Expand all builtin opcodes.
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, Ty, Expand);
+
+  setOperationAction(ISD::BITCAST, Ty, Legal);
+  setOperationAction(ISD::LOAD, Ty, Legal);
+  setOperationAction(ISD::STORE, Ty, Legal);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+  setOperationAction(ISD::ADD, Ty, Legal);
+  setOperationAction(ISD::AND, Ty, Legal);
+  setOperationAction(ISD::CTLZ, Ty, Legal);
+  setOperationAction(ISD::CTPOP, Ty, Legal);
+  setOperationAction(ISD::MUL, Ty, Legal);
+  setOperationAction(ISD::OR, Ty, Legal);
+  setOperationAction(ISD::SDIV, Ty, Legal);
+  setOperationAction(ISD::SREM, Ty, Legal);
+  setOperationAction(ISD::SHL, Ty, Legal);
+  setOperationAction(ISD::SRA, Ty, Legal);
+  setOperationAction(ISD::SRL, Ty, Legal);
+  setOperationAction(ISD::SUB, Ty, Legal);
+  setOperationAction(ISD::UDIV, Ty, Legal);
+  setOperationAction(ISD::UREM, Ty, Legal);
+  setOperationAction(ISD::VECTOR_SHUFFLE, Ty, Custom);
+  setOperationAction(ISD::VSELECT, Ty, Legal);
+  setOperationAction(ISD::XOR, Ty, Legal);
+
+  if (Ty == MVT::v4i32 || Ty == MVT::v2i64) {
+    setOperationAction(ISD::FP_TO_SINT, Ty, Legal);
+    setOperationAction(ISD::FP_TO_UINT, Ty, Legal);
+    setOperationAction(ISD::SINT_TO_FP, Ty, Legal);
+    setOperationAction(ISD::UINT_TO_FP, Ty, Legal);
+  }
+
+  setOperationAction(ISD::SETCC, Ty, Legal);
+  setCondCodeAction(ISD::SETNE, Ty, Expand);
+  setCondCodeAction(ISD::SETGE, Ty, Expand);
+  setCondCodeAction(ISD::SETGT, Ty, Expand);
+  setCondCodeAction(ISD::SETUGE, Ty, Expand);
+  setCondCodeAction(ISD::SETUGT, Ty, Expand);
+}
+
+// Enable MSA support for the given floating-point type and Register class.
+void MipsSETargetLowering::
+addMSAFloatType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+  addRegisterClass(Ty, RC);
+
+  // Expand all builtin opcodes.
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, Ty, Expand);
+
+  setOperationAction(ISD::LOAD, Ty, Legal);
+  setOperationAction(ISD::STORE, Ty, Legal);
+  setOperationAction(ISD::BITCAST, Ty, Legal);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+  if (Ty != MVT::v8f16) {
+    setOperationAction(ISD::FABS,  Ty, Legal);
+    setOperationAction(ISD::FADD,  Ty, Legal);
+    setOperationAction(ISD::FDIV,  Ty, Legal);
+    setOperationAction(ISD::FEXP2, Ty, Legal);
+    setOperationAction(ISD::FLOG2, Ty, Legal);
+    setOperationAction(ISD::FMA,   Ty, Legal);
+    setOperationAction(ISD::FMUL,  Ty, Legal);
+    setOperationAction(ISD::FRINT, Ty, Legal);
+    setOperationAction(ISD::FSQRT, Ty, Legal);
+    setOperationAction(ISD::FSUB,  Ty, Legal);
+    setOperationAction(ISD::VSELECT, Ty, Legal);
+
+    setOperationAction(ISD::SETCC, Ty, Legal);
+    setCondCodeAction(ISD::SETOGE, Ty, Expand);
+    setCondCodeAction(ISD::SETOGT, Ty, Expand);
+    setCondCodeAction(ISD::SETUGE, Ty, Expand);
+    setCondCodeAction(ISD::SETUGT, Ty, Expand);
+    setCondCodeAction(ISD::SETGE,  Ty, Expand);
+    setCondCodeAction(ISD::SETGT,  Ty, Expand);
+  }
+}
 
 bool
 MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
@@ -142,6 +261,8 @@ MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
 SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
   switch(Op.getOpcode()) {
+  case ISD::LOAD:  return lowerLOAD(Op, DAG);
+  case ISD::STORE: return lowerSTORE(Op, DAG);
   case ISD::SMUL_LOHI: return lowerMulDiv(Op, MipsISD::Mult, true, true, DAG);
   case ISD::UMUL_LOHI: return lowerMulDiv(Op, MipsISD::Multu, true, true, DAG);
   case ISD::MULHS:     return lowerMulDiv(Op, MipsISD::Mult, false, true, DAG);
@@ -152,6 +273,10 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
                                           DAG);
   case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::INTRINSIC_W_CHAIN:  return lowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:     return lowerINTRINSIC_VOID(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::BUILD_VECTOR:       return lowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, DAG);
   }
 
   return MipsTargetLowering::LowerOperation(Op, DAG);
@@ -204,7 +329,7 @@ static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
   SDLoc DL(ADDENode);
 
   // Initialize accumulator.
-  SDValue ACCIn = CurDAG->getNode(MipsISD::InsertLOHI, DL, MVT::Untyped,
+  SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
                                   ADDCNode->getOperand(1),
                                   ADDENode->getOperand(1));
 
@@ -218,15 +343,11 @@ static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
 
   // replace uses of adde and addc here
   if (!SDValue(ADDCNode, 0).use_empty()) {
-    SDValue LoIdx = CurDAG->getConstant(Mips::sub_lo, MVT::i32);
-    SDValue LoOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MAdd,
-                                    LoIdx);
+    SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut);
   }
   if (!SDValue(ADDENode, 0).use_empty()) {
-    SDValue HiIdx = CurDAG->getConstant(Mips::sub_hi, MVT::i32);
-    SDValue HiOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MAdd,
-                                    HiIdx);
+    SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut);
   }
 
@@ -280,7 +401,7 @@ static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
   SDLoc DL(SUBENode);
 
   // Initialize accumulator.
-  SDValue ACCIn = CurDAG->getNode(MipsISD::InsertLOHI, DL, MVT::Untyped,
+  SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
                                   SUBCNode->getOperand(0),
                                   SUBENode->getOperand(0));
 
@@ -294,15 +415,11 @@ static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
 
   // replace uses of sube and subc here
   if (!SDValue(SUBCNode, 0).use_empty()) {
-    SDValue LoIdx = CurDAG->getConstant(Mips::sub_lo, MVT::i32);
-    SDValue LoOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MSub,
-                                    LoIdx);
+    SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut);
   }
   if (!SDValue(SUBENode, 0).use_empty()) {
-    SDValue HiIdx = CurDAG->getConstant(Mips::sub_hi, MVT::i32);
-    SDValue HiOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MSub,
-                                    HiIdx);
+    SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut);
   }
 
@@ -322,6 +439,248 @@ static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to zero extension if its
+//   sign/zero-extension is completely overwritten by the new one performed by
+//   the ISD::AND.
+// - Removes redundant zero extensions performed by an ISD::AND.
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget *Subtarget) {
+  if (!Subtarget->hasMSA())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  unsigned Op0Opcode = Op0->getOpcode();
+
+  // (and (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d)
+  // where $d + 1 == 2^n and n == 32
+  // or    $d + 1 == 2^n and n <= 32 and ZExt
+  // -> (MipsVExtractZExt $a, $b, $c)
+  if (Op0Opcode == MipsISD::VEXTRACT_SEXT_ELT ||
+      Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT) {
+    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Op1);
+
+    if (!Mask)
+      return SDValue();
+
+    int32_t Log2IfPositive = (Mask->getAPIntValue() + 1).exactLogBase2();
+
+    if (Log2IfPositive <= 0)
+      return SDValue(); // Mask+1 is not a power of 2
+
+    SDValue Op0Op2 = Op0->getOperand(2);
+    EVT ExtendTy = cast<VTSDNode>(Op0Op2)->getVT();
+    unsigned ExtendTySize = ExtendTy.getSizeInBits();
+    unsigned Log2 = Log2IfPositive;
+
+    if ((Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT && Log2 >= ExtendTySize) ||
+        Log2 == ExtendTySize) {
+      SDValue Ops[] = { Op0->getOperand(0), Op0->getOperand(1), Op0Op2 };
+      DAG.MorphNodeTo(Op0.getNode(), MipsISD::VEXTRACT_ZEXT_ELT,
+                      Op0->getVTList(), Ops, Op0->getNumOperands());
+      return Op0;
+    }
+  }
+
+  return SDValue();
+}
+
+// Determine if the specified node is a constant vector splat.
+//
+// Returns true and sets Imm if:
+// * N is a ISD::BUILD_VECTOR representing a constant splat
+//
+// This function is quite similar to MipsSEDAGToDAGISel::selectVSplat. The
+// differences are that it assumes the MSA has already been checked and the
+// arbitrary requirement for a maximum of 32-bit integers isn't applied (and
+// must not be in order for binsri.d to be selectable).
+static bool isVSplat(SDValue N, APInt &Imm, bool IsLittleEndian) {
+  BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N.getNode());
+
+  if (Node == NULL)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             8, !IsLittleEndian))
+    return false;
+
+  Imm = SplatValue;
+
+  return true;
+}
+
+// Test whether the given node is an all-ones build_vector.
+static bool isVectorAllOnes(SDValue N) {
+  // Look through bitcasts. Endianness doesn't matter because we are looking
+  // for an all-ones value.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
+
+  if (!BVN)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  // Endianness doesn't matter in this context because we are looking for
+  // an all-ones value.
+  if (BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs))
+    return SplatValue.isAllOnesValue();
+
+  return false;
+}
+
+// Test whether N is the bitwise inverse of OfNode.
+static bool isBitwiseInverse(SDValue N, SDValue OfNode) {
+  if (N->getOpcode() != ISD::XOR)
+    return false;
+
+  if (isVectorAllOnes(N->getOperand(0)))
+    return N->getOperand(1) == OfNode;
+
+  if (isVectorAllOnes(N->getOperand(1)))
+    return N->getOperand(0) == OfNode;
+
+  return false;
+}
+
+// Perform combines where ISD::OR is the root node.
+//
+// Performs the following transformations:
+// - (or (and $a, $mask), (and $b, $inv_mask)) => (vselect $mask, $a, $b)
+//   where $inv_mask is the bitwise inverse of $mask and the 'or' has a 128-bit
+//   vector type.
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const MipsSubtarget *Subtarget) {
+  if (!Subtarget->hasMSA())
+    return SDValue();
+
+  EVT Ty = N->getValueType(0);
+
+  if (!Ty.is128BitVector())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  if (Op0->getOpcode() == ISD::AND && Op1->getOpcode() == ISD::AND) {
+    SDValue Op0Op0 = Op0->getOperand(0);
+    SDValue Op0Op1 = Op0->getOperand(1);
+    SDValue Op1Op0 = Op1->getOperand(0);
+    SDValue Op1Op1 = Op1->getOperand(1);
+    bool IsLittleEndian = !Subtarget->isLittle();
+
+    SDValue IfSet, IfClr, Cond;
+    bool IsConstantMask = false;
+    APInt Mask, InvMask;
+
+    // If Op0Op0 is an appropriate mask, try to find it's inverse in either
+    // Op1Op0, or Op1Op1. Keep track of the Cond, IfSet, and IfClr nodes, while
+    // looking.
+    // IfClr will be set if we find a valid match.
+    if (isVSplat(Op0Op0, Mask, IsLittleEndian)) {
+      Cond = Op0Op0;
+      IfSet = Op0Op1;
+
+      if (isVSplat(Op1Op0, InvMask, IsLittleEndian) &&
+          Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op1;
+      else if (isVSplat(Op1Op1, InvMask, IsLittleEndian) &&
+               Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op0;
+
+      IsConstantMask = true;
+    }
+
+    // If IfClr is not yet set, and Op0Op1 is an appropriate mask, try the same
+    // thing again using this mask.
+    // IfClr will be set if we find a valid match.
+    if (!IfClr.getNode() && isVSplat(Op0Op1, Mask, IsLittleEndian)) {
+      Cond = Op0Op1;
+      IfSet = Op0Op0;
+
+      if (isVSplat(Op1Op0, InvMask, IsLittleEndian) &&
+          Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op1;
+      else if (isVSplat(Op1Op1, InvMask, IsLittleEndian) &&
+               Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op0;
+
+      IsConstantMask = true;
+    }
+
+    // If IfClr is not yet set, try looking for a non-constant match.
+    // IfClr will be set if we find a valid match amongst the eight
+    // possibilities.
+    if (!IfClr.getNode()) {
+      if (isBitwiseInverse(Op0Op0, Op1Op0)) {
+        Cond = Op1Op0;
+        IfSet = Op1Op1;
+        IfClr = Op0Op1;
+      } else if (isBitwiseInverse(Op0Op1, Op1Op0)) {
+        Cond = Op1Op0;
+        IfSet = Op1Op1;
+        IfClr = Op0Op0;
+      } else if (isBitwiseInverse(Op0Op0, Op1Op1)) {
+        Cond = Op1Op1;
+        IfSet = Op1Op0;
+        IfClr = Op0Op1;
+      } else if (isBitwiseInverse(Op0Op1, Op1Op1)) {
+        Cond = Op1Op1;
+        IfSet = Op1Op0;
+        IfClr = Op0Op0;
+      } else if (isBitwiseInverse(Op1Op0, Op0Op0)) {
+        Cond = Op0Op0;
+        IfSet = Op0Op1;
+        IfClr = Op1Op1;
+      } else if (isBitwiseInverse(Op1Op1, Op0Op0)) {
+        Cond = Op0Op0;
+        IfSet = Op0Op1;
+        IfClr = Op1Op0;
+      } else if (isBitwiseInverse(Op1Op0, Op0Op1)) {
+        Cond = Op0Op1;
+        IfSet = Op0Op0;
+        IfClr = Op1Op1;
+      } else if (isBitwiseInverse(Op1Op1, Op0Op1)) {
+        Cond = Op0Op1;
+        IfSet = Op0Op0;
+        IfClr = Op1Op0;
+      }
+    }
+
+    // At this point, IfClr will be set if we have a valid match.
+    if (!IfClr.getNode())
+      return SDValue();
+
+    assert(Cond.getNode() && IfSet.getNode());
+
+    // Fold degenerate cases.
+    if (IsConstantMask) {
+      if (Mask.isAllOnesValue())
+        return IfSet;
+      else if (Mask == 0)
+        return IfClr;
+    }
+
+    // Transform the DAG into an equivalent VSELECT.
+    return DAG.getNode(ISD::VSELECT, SDLoc(N), Ty, Cond, IfClr, IfSet);
+  }
+
+  return SDValue();
+}
+
 static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const MipsSubtarget *Subtarget) {
@@ -396,6 +755,9 @@ static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
   unsigned EltSize = Ty.getVectorElementType().getSizeInBits();
   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
 
+  if (!Subtarget->hasDSP())
+    return SDValue();
+
   if (!BV ||
       !BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
                            EltSize, !Subtarget->isLittle()) ||
@@ -418,11 +780,57 @@ static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
   return performDSPShiftCombine(MipsISD::SHLL_DSP, N, Ty, DAG, Subtarget);
 }
 
+// Fold sign-extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT for MSA and fold
+// constant splats into MipsISD::SHRA_DSP for DSPr2.
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to sign extension if its
+//   sign/zero-extension is completely overwritten by the new one performed by
+//   the ISD::SRA and ISD::SHL nodes.
+// - Removes redundant sign extensions performed by an ISD::SRA and ISD::SHL
+//   sequence.
+//
+// See performDSPShiftCombine for more information about the transformation
+// used for DSPr2.
 static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const MipsSubtarget *Subtarget) {
   EVT Ty = N->getValueType(0);
 
+  if (Subtarget->hasMSA()) {
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+
+    // (sra (shl (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d), imm:$d)
+    // where $d + sizeof($c) == 32
+    // or    $d + sizeof($c) <= 32 and SExt
+    // -> (MipsVExtractSExt $a, $b, $c)
+    if (Op0->getOpcode() == ISD::SHL && Op1 == Op0->getOperand(1)) {
+      SDValue Op0Op0 = Op0->getOperand(0);
+      ConstantSDNode *ShAmount = dyn_cast<ConstantSDNode>(Op1);
+
+      if (!ShAmount)
+        return SDValue();
+
+      if (Op0Op0->getOpcode() != MipsISD::VEXTRACT_SEXT_ELT &&
+          Op0Op0->getOpcode() != MipsISD::VEXTRACT_ZEXT_ELT)
+        return SDValue();
+
+      EVT ExtendTy = cast<VTSDNode>(Op0Op0->getOperand(2))->getVT();
+      unsigned TotalBits = ShAmount->getZExtValue() + ExtendTy.getSizeInBits();
+
+      if (TotalBits == 32 ||
+          (Op0Op0->getOpcode() == MipsISD::VEXTRACT_SEXT_ELT &&
+           TotalBits <= 32)) {
+        SDValue Ops[] = { Op0Op0->getOperand(0), Op0Op0->getOperand(1),
+                          Op0Op0->getOperand(2) };
+        DAG.MorphNodeTo(Op0Op0.getNode(), MipsISD::VEXTRACT_SEXT_ELT,
+                        Op0Op0->getVTList(), Ops, Op0Op0->getNumOperands());
+        return Op0Op0;
+      }
+    }
+  }
+
   if ((Ty != MVT::v2i16) && ((Ty != MVT::v4i8) || !Subtarget->hasDSPR2()))
     return SDValue();
 
@@ -475,17 +883,84 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
 static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) {
   EVT Ty = N->getValueType(0);
 
-  if ((Ty != MVT::v2i16) && (Ty != MVT::v4i8))
-    return SDValue();
+  if (Ty.is128BitVector() && Ty.isInteger()) {
+    // Try the following combines:
+    //   (vselect (setcc $a, $b, SETLT), $b, $a)) -> (vsmax $a, $b)
+    //   (vselect (setcc $a, $b, SETLE), $b, $a)) -> (vsmax $a, $b)
+    //   (vselect (setcc $a, $b, SETLT), $a, $b)) -> (vsmin $a, $b)
+    //   (vselect (setcc $a, $b, SETLE), $a, $b)) -> (vsmin $a, $b)
+    //   (vselect (setcc $a, $b, SETULT), $b, $a)) -> (vumax $a, $b)
+    //   (vselect (setcc $a, $b, SETULE), $b, $a)) -> (vumax $a, $b)
+    //   (vselect (setcc $a, $b, SETULT), $a, $b)) -> (vumin $a, $b)
+    //   (vselect (setcc $a, $b, SETULE), $a, $b)) -> (vumin $a, $b)
+    // SETGT/SETGE/SETUGT/SETUGE variants of these will show up initially but
+    // will be expanded to equivalent SETLT/SETLE/SETULT/SETULE versions by the
+    // legalizer.
+    SDValue Op0 = N->getOperand(0);
+
+    if (Op0->getOpcode() != ISD::SETCC)
+      return SDValue();
+
+    ISD::CondCode CondCode = cast<CondCodeSDNode>(Op0->getOperand(2))->get();
+    bool Signed;
+
+    if (CondCode == ISD::SETLT  || CondCode == ISD::SETLE)
+      Signed = true;
+    else if (CondCode == ISD::SETULT || CondCode == ISD::SETULE)
+      Signed = false;
+    else
+      return SDValue();
+
+    SDValue Op1 = N->getOperand(1);
+    SDValue Op2 = N->getOperand(2);
+    SDValue Op0Op0 = Op0->getOperand(0);
+    SDValue Op0Op1 = Op0->getOperand(1);
+
+    if (Op1 == Op0Op0 && Op2 == Op0Op1)
+      return DAG.getNode(Signed ? MipsISD::VSMIN : MipsISD::VUMIN, SDLoc(N),
+                         Ty, Op1, Op2);
+    else if (Op1 == Op0Op1 && Op2 == Op0Op0)
+      return DAG.getNode(Signed ? MipsISD::VSMAX : MipsISD::VUMAX, SDLoc(N),
+                         Ty, Op1, Op2);
+  } else if ((Ty == MVT::v2i16) || (Ty == MVT::v4i8)) {
+    SDValue SetCC = N->getOperand(0);
+
+    if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
+      return SDValue();
+
+    return DAG.getNode(MipsISD::SELECT_CC_DSP, SDLoc(N), Ty,
+                       SetCC.getOperand(0), SetCC.getOperand(1),
+                       N->getOperand(1), N->getOperand(2), SetCC.getOperand(2));
+  }
 
-  SDValue SetCC = N->getOperand(0);
+  return SDValue();
+}
 
-  if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
-    return SDValue();
+static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
+                                 const MipsSubtarget *Subtarget) {
+  EVT Ty = N->getValueType(0);
 
-  return DAG.getNode(MipsISD::SELECT_CC_DSP, SDLoc(N), Ty,
-                     SetCC.getOperand(0), SetCC.getOperand(1), N->getOperand(1),
-                     N->getOperand(2), SetCC.getOperand(2));
+  if (Subtarget->hasMSA() && Ty.is128BitVector() && Ty.isInteger()) {
+    // Try the following combines:
+    //   (xor (or $a, $b), (build_vector allones))
+    //   (xor (or $a, $b), (bitcast (build_vector allones)))
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    SDValue NotOp;
+
+    if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+      NotOp = Op1;
+    else if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+      NotOp = Op0;
+    else
+      return SDValue();
+
+    if (NotOp->getOpcode() == ISD::OR)
+      return DAG.getNode(MipsISD::VNOR, SDLoc(N), Ty, NotOp->getOperand(0),
+                         NotOp->getOperand(1));
+  }
+
+  return SDValue();
 }
 
 SDValue
@@ -496,6 +971,12 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
   case ISD::ADDE:
     return performADDECombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:
+    Val = performANDCombine(N, DAG, DCI, Subtarget);
+    break;
+  case ISD::OR:
+    Val = performORCombine(N, DAG, DCI, Subtarget);
+    break;
   case ISD::SUBE:
     return performSUBECombine(N, DAG, DCI, Subtarget);
   case ISD::MUL:
@@ -508,14 +989,22 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
     return performSRLCombine(N, DAG, DCI, Subtarget);
   case ISD::VSELECT:
     return performVSELECTCombine(N, DAG);
-  case ISD::SETCC: {
+  case ISD::XOR:
+    Val = performXORCombine(N, DAG, Subtarget);
+    break;
+  case ISD::SETCC:
     Val = performSETCCCombine(N, DAG);
     break;
   }
-  }
 
-  if (Val.getNode())
+  if (Val.getNode()) {
+    DEBUG(dbgs() << "\nMipsSE DAG Combine:\n";
+          N->printrWithDepth(dbgs(), &DAG);
+          dbgs() << "\n=> \n";
+          Val.getNode()->printrWithDepth(dbgs(), &DAG);
+          dbgs() << "\n");
     return Val;
+  }
 
   return MipsTargetLowering::PerformDAGCombine(N, DCI);
 }
@@ -528,6 +1017,42 @@ MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case Mips::BPOSGE32_PSEUDO:
     return emitBPOSGE32(MI, BB);
+  case Mips::SNZ_B_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_B);
+  case Mips::SNZ_H_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_H);
+  case Mips::SNZ_W_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_W);
+  case Mips::SNZ_D_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_D);
+  case Mips::SNZ_V_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_V);
+  case Mips::SZ_B_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_B);
+  case Mips::SZ_H_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_H);
+  case Mips::SZ_W_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_W);
+  case Mips::SZ_D_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_D);
+  case Mips::SZ_V_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_V);
+  case Mips::COPY_FW_PSEUDO:
+    return emitCOPY_FW(MI, BB);
+  case Mips::COPY_FD_PSEUDO:
+    return emitCOPY_FD(MI, BB);
+  case Mips::INSERT_FW_PSEUDO:
+    return emitINSERT_FW(MI, BB);
+  case Mips::INSERT_FD_PSEUDO:
+    return emitINSERT_FD(MI, BB);
+  case Mips::FILL_FW_PSEUDO:
+    return emitFILL_FW(MI, BB);
+  case Mips::FILL_FD_PSEUDO:
+    return emitFILL_FD(MI, BB);
+  case Mips::FEXP2_W_1_PSEUDO:
+    return emitFEXP2_W_1(MI, BB);
+  case Mips::FEXP2_D_1_PSEUDO:
+    return emitFEXP2_D_1(MI, BB);
   }
 }
 
@@ -564,6 +1089,68 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
                                   InternalLinkage, CLI, Callee, Chain);
 }
 
+SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode &Nd = *cast<LoadSDNode>(Op);
+
+  if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+    return MipsTargetLowering::lowerLOAD(Op, DAG);
+
+  // Replace a double precision load with two i32 loads and a buildpair64.
+  SDLoc DL(Op);
+  SDValue Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+  EVT PtrVT = Ptr.getValueType();
+
+  // i32 load from lower address.
+  SDValue Lo = DAG.getLoad(MVT::i32, DL, Chain, Ptr,
+                           MachinePointerInfo(), Nd.isVolatile(),
+                           Nd.isNonTemporal(), Nd.isInvariant(),
+                           Nd.getAlignment());
+
+  // i32 load from higher address.
+  Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, PtrVT));
+  SDValue Hi = DAG.getLoad(MVT::i32, DL, Lo.getValue(1), Ptr,
+                           MachinePointerInfo(), Nd.isVolatile(),
+                           Nd.isNonTemporal(), Nd.isInvariant(),
+                           std::min(Nd.getAlignment(), 4U));
+
+  if (!Subtarget->isLittle())
+    std::swap(Lo, Hi);
+
+  SDValue BP = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
+  SDValue Ops[2] = {BP, Hi.getValue(1)};
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
+SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode &Nd = *cast<StoreSDNode>(Op);
+
+  if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+    return MipsTargetLowering::lowerSTORE(Op, DAG);
+
+  // Replace a double precision store with two extractelement64s and i32 stores.
+  SDLoc DL(Op);
+  SDValue Val = Nd.getValue(), Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+  EVT PtrVT = Ptr.getValueType();
+  SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                           Val, DAG.getConstant(0, MVT::i32));
+  SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                           Val, DAG.getConstant(1, MVT::i32));
+
+  if (!Subtarget->isLittle())
+    std::swap(Lo, Hi);
+
+  // i32 store to lower address.
+  Chain = DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(),
+                       Nd.isVolatile(), Nd.isNonTemporal(), Nd.getAlignment(),
+                       Nd.getTBAAInfo());
+
+  // i32 store to higher address.
+  Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, PtrVT));
+  return DAG.getStore(Chain, DL, Hi, Ptr, MachinePointerInfo(),
+                      Nd.isVolatile(), Nd.isNonTemporal(),
+                      std::min(Nd.getAlignment(), 4U), Nd.getTBAAInfo());
+}
+
 SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
                                           bool HasLo, bool HasHi,
                                           SelectionDAG &DAG) const {
@@ -574,11 +1161,9 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
   SDValue Lo, Hi;
 
   if (HasLo)
-    Lo = DAG.getNode(MipsISD::ExtractLOHI, DL, Ty, Mult,
-                     DAG.getConstant(Mips::sub_lo, MVT::i32));
+    Lo = DAG.getNode(MipsISD::MFLO, DL, Ty, Mult);
   if (HasHi)
-    Hi = DAG.getNode(MipsISD::ExtractLOHI, DL, Ty, Mult,
-                     DAG.getConstant(Mips::sub_hi, MVT::i32));
+    Hi = DAG.getNode(MipsISD::MFHI, DL, Ty, Mult);
 
   if (!HasLo || !HasHi)
     return HasLo ? Lo : Hi;
@@ -593,14 +1178,12 @@ static SDValue initAccumulator(SDValue In, SDLoc DL, SelectionDAG &DAG) {
                              DAG.getConstant(0, MVT::i32));
   SDValue InHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
                              DAG.getConstant(1, MVT::i32));
-  return DAG.getNode(MipsISD::InsertLOHI, DL, MVT::Untyped, InLo, InHi);
+  return DAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, InLo, InHi);
 }
 
 static SDValue extractLOHI(SDValue Op, SDLoc DL, SelectionDAG &DAG) {
-  SDValue Lo = DAG.getNode(MipsISD::ExtractLOHI, DL, MVT::i32, Op,
-                           DAG.getConstant(Mips::sub_lo, MVT::i32));
-  SDValue Hi = DAG.getNode(MipsISD::ExtractLOHI, DL, MVT::i32, Op,
-                           DAG.getConstant(Mips::sub_hi, MVT::i32));
+  SDValue Lo = DAG.getNode(MipsISD::MFLO, DL, MVT::i32, Op);
+  SDValue Hi = DAG.getNode(MipsISD::MFHI, DL, MVT::i32, Op);
   return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
 }
 
@@ -664,8 +1247,156 @@ static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
   return DAG.getMergeValues(Vals, 2, DL);
 }
 
+// Lower an MSA copy intrinsic into the specified SelectionDAG node
+static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
+  SDLoc DL(Op);
+  SDValue Vec = Op->getOperand(1);
+  SDValue Idx = Op->getOperand(2);
+  EVT ResTy = Op->getValueType(0);
+  EVT EltTy = Vec->getValueType(0).getVectorElementType();
+
+  SDValue Result = DAG.getNode(Opc, DL, ResTy, Vec, Idx,
+                               DAG.getValueType(EltTy));
+
+  return Result;
+}
+
+static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
+  EVT ResVecTy = Op->getValueType(0);
+  EVT ViaVecTy = ResVecTy;
+  SDLoc DL(Op);
+
+  // When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and
+  // LaneB is the lower 32-bits. Otherwise LaneA and LaneB are alternating
+  // lanes.
+  SDValue LaneA;
+  SDValue LaneB = Op->getOperand(2);
+
+  if (ResVecTy == MVT::v2i64) {
+    LaneA = DAG.getConstant(0, MVT::i32);
+    ViaVecTy = MVT::v4i32;
+  } else
+    LaneA = LaneB;
+
+  SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
+                      LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
+
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops,
+                               ViaVecTy.getVectorNumElements());
+
+  if (ViaVecTy != ResVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
+
+  return Result;
+}
+
+static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) {
+  return DAG.getConstant(Op->getConstantOperandVal(ImmOp), Op->getValueType(0));
+}
+
+static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue,
+                                   bool BigEndian, SelectionDAG &DAG) {
+  EVT ViaVecTy = VecTy;
+  SDValue SplatValueA = SplatValue;
+  SDValue SplatValueB = SplatValue;
+  SDLoc DL(SplatValue);
+
+  if (VecTy == MVT::v2i64) {
+    // v2i64 BUILD_VECTOR must be performed via v4i32 so split into i32's.
+    ViaVecTy = MVT::v4i32;
+
+    SplatValueA = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValue);
+    SplatValueB = DAG.getNode(ISD::SRL, DL, MVT::i64, SplatValue,
+                              DAG.getConstant(32, MVT::i32));
+    SplatValueB = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValueB);
+  }
+
+  // We currently hold the parts in little endian order. Swap them if
+  // necessary.
+  if (BigEndian)
+    std::swap(SplatValueA, SplatValueB);
+
+  SDValue Ops[16] = { SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB };
+
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops,
+                               ViaVecTy.getVectorNumElements());
+
+  if (VecTy != ViaVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, VecTy, Result);
+
+  return Result;
+}
+
+static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
+                                        unsigned Opc, SDValue Imm,
+                                        bool BigEndian) {
+  EVT VecTy = Op->getValueType(0);
+  SDValue Exp2Imm;
+  SDLoc DL(Op);
+
+  // The DAG Combiner can't constant fold bitcasted vectors yet so we must do it
+  // here for now.
+  if (VecTy == MVT::v2i64) {
+    if (ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(Imm)) {
+      APInt BitImm = APInt(64, 1) << CImm->getAPIntValue();
+
+      SDValue BitImmHiOp = DAG.getConstant(BitImm.lshr(32).trunc(32), MVT::i32);
+      SDValue BitImmLoOp = DAG.getConstant(BitImm.trunc(32), MVT::i32);
+
+      if (BigEndian)
+        std::swap(BitImmLoOp, BitImmHiOp);
+
+      Exp2Imm =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, BitImmLoOp,
+                                  BitImmHiOp, BitImmLoOp, BitImmHiOp));
+    }
+  }
+
+  if (Exp2Imm.getNode() == NULL) {
+    // We couldnt constant fold, do a vector shift instead
+
+    // Extend i32 to i64 if necessary. Sign or zero extend doesn't matter since
+    // only values 0-63 are valid.
+    if (VecTy == MVT::v2i64)
+      Imm = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Imm);
+
+    Exp2Imm = getBuildVectorSplat(VecTy, Imm, BigEndian, DAG);
+
+    Exp2Imm =
+        DAG.getNode(ISD::SHL, DL, VecTy, DAG.getConstant(1, VecTy), Exp2Imm);
+  }
+
+  return DAG.getNode(Opc, DL, VecTy, Op->getOperand(1), Exp2Imm);
+}
+
+static SDValue lowerMSABitClear(SDValue Op, SelectionDAG &DAG) {
+  EVT ResTy = Op->getValueType(0);
+  SDLoc DL(Op);
+  SDValue One = DAG.getConstant(1, ResTy);
+  SDValue Bit = DAG.getNode(ISD::SHL, DL, ResTy, One, Op->getOperand(2));
+
+  return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1),
+                     DAG.getNOT(DL, Bit, ResTy));
+}
+
+static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT ResTy = Op->getValueType(0);
+  APInt BitImm = APInt(ResTy.getVectorElementType().getSizeInBits(), 1)
+                 << cast<ConstantSDNode>(Op->getOperand(2))->getAPIntValue();
+  SDValue BitMask = DAG.getConstant(~BitImm, ResTy);
+
+  return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1), BitMask);
+}
+
 SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                       SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
   switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) {
   default:
     return SDValue();
@@ -701,12 +1432,610 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
     return lowerDSPIntr(Op, DAG, MipsISD::MSub);
   case Intrinsic::mips_msubu:
     return lowerDSPIntr(Op, DAG, MipsISD::MSubu);
+  case Intrinsic::mips_addv_b:
+  case Intrinsic::mips_addv_h:
+  case Intrinsic::mips_addv_w:
+  case Intrinsic::mips_addv_d:
+    return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_addvi_b:
+  case Intrinsic::mips_addvi_h:
+  case Intrinsic::mips_addvi_w:
+  case Intrinsic::mips_addvi_d:
+    return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_and_v:
+    return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_andi_b:
+    return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+                       lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_bclr_b:
+  case Intrinsic::mips_bclr_h:
+  case Intrinsic::mips_bclr_w:
+  case Intrinsic::mips_bclr_d:
+    return lowerMSABitClear(Op, DAG);
+  case Intrinsic::mips_bclri_b:
+  case Intrinsic::mips_bclri_h:
+  case Intrinsic::mips_bclri_w:
+  case Intrinsic::mips_bclri_d:
+    return lowerMSABitClearImm(Op, DAG);
+  case Intrinsic::mips_binsli_b:
+  case Intrinsic::mips_binsli_h:
+  case Intrinsic::mips_binsli_w:
+  case Intrinsic::mips_binsli_d: {
+    EVT VecTy = Op->getValueType(0);
+    EVT EltTy = VecTy.getVectorElementType();
+    APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
+                                       Op->getConstantOperandVal(3));
+    return DAG.getNode(ISD::VSELECT, DL, VecTy,
+                       DAG.getConstant(Mask, VecTy, true), Op->getOperand(1),
+                       Op->getOperand(2));
+  }
+  case Intrinsic::mips_binsri_b:
+  case Intrinsic::mips_binsri_h:
+  case Intrinsic::mips_binsri_w:
+  case Intrinsic::mips_binsri_d: {
+    EVT VecTy = Op->getValueType(0);
+    EVT EltTy = VecTy.getVectorElementType();
+    APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
+                                      Op->getConstantOperandVal(3));
+    return DAG.getNode(ISD::VSELECT, DL, VecTy,
+                       DAG.getConstant(Mask, VecTy, true), Op->getOperand(1),
+                       Op->getOperand(2));
+  }
+  case Intrinsic::mips_bmnz_v:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+                       Op->getOperand(2), Op->getOperand(1));
+  case Intrinsic::mips_bmnzi_b:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 3, DAG), Op->getOperand(2),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bmz_v:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_bmzi_b:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 3, DAG), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_bneg_b:
+  case Intrinsic::mips_bneg_h:
+  case Intrinsic::mips_bneg_w:
+  case Intrinsic::mips_bneg_d: {
+    EVT VecTy = Op->getValueType(0);
+    SDValue One = DAG.getConstant(1, VecTy);
+
+    return DAG.getNode(ISD::XOR, DL, VecTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, DL, VecTy, One,
+                                   Op->getOperand(2)));
+  }
+  case Intrinsic::mips_bnegi_b:
+  case Intrinsic::mips_bnegi_h:
+  case Intrinsic::mips_bnegi_w:
+  case Intrinsic::mips_bnegi_d:
+    return lowerMSABinaryBitImmIntr(Op, DAG, ISD::XOR, Op->getOperand(2),
+                                    !Subtarget->isLittle());
+  case Intrinsic::mips_bnz_b:
+  case Intrinsic::mips_bnz_h:
+  case Intrinsic::mips_bnz_w:
+  case Intrinsic::mips_bnz_d:
+    return DAG.getNode(MipsISD::VALL_NONZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bnz_v:
+    return DAG.getNode(MipsISD::VANY_NONZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bsel_v:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2),
+                       Op->getOperand(3));
+  case Intrinsic::mips_bseli_b:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2),
+                       lowerMSASplatImm(Op, 3, DAG));
+  case Intrinsic::mips_bset_b:
+  case Intrinsic::mips_bset_h:
+  case Intrinsic::mips_bset_w:
+  case Intrinsic::mips_bset_d: {
+    EVT VecTy = Op->getValueType(0);
+    SDValue One = DAG.getConstant(1, VecTy);
+
+    return DAG.getNode(ISD::OR, DL, VecTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, DL, VecTy, One,
+                                   Op->getOperand(2)));
+  }
+  case Intrinsic::mips_bseti_b:
+  case Intrinsic::mips_bseti_h:
+  case Intrinsic::mips_bseti_w:
+  case Intrinsic::mips_bseti_d:
+    return lowerMSABinaryBitImmIntr(Op, DAG, ISD::OR, Op->getOperand(2),
+                                    !Subtarget->isLittle());
+  case Intrinsic::mips_bz_b:
+  case Intrinsic::mips_bz_h:
+  case Intrinsic::mips_bz_w:
+  case Intrinsic::mips_bz_d:
+    return DAG.getNode(MipsISD::VALL_ZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bz_v:
+    return DAG.getNode(MipsISD::VANY_ZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ceq_b:
+  case Intrinsic::mips_ceq_h:
+  case Intrinsic::mips_ceq_w:
+  case Intrinsic::mips_ceq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETEQ);
+  case Intrinsic::mips_ceqi_b:
+  case Intrinsic::mips_ceqi_h:
+  case Intrinsic::mips_ceqi_w:
+  case Intrinsic::mips_ceqi_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETEQ);
+  case Intrinsic::mips_cle_s_b:
+  case Intrinsic::mips_cle_s_h:
+  case Intrinsic::mips_cle_s_w:
+  case Intrinsic::mips_cle_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETLE);
+  case Intrinsic::mips_clei_s_b:
+  case Intrinsic::mips_clei_s_h:
+  case Intrinsic::mips_clei_s_w:
+  case Intrinsic::mips_clei_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETLE);
+  case Intrinsic::mips_cle_u_b:
+  case Intrinsic::mips_cle_u_h:
+  case Intrinsic::mips_cle_u_w:
+  case Intrinsic::mips_cle_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULE);
+  case Intrinsic::mips_clei_u_b:
+  case Intrinsic::mips_clei_u_h:
+  case Intrinsic::mips_clei_u_w:
+  case Intrinsic::mips_clei_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETULE);
+  case Intrinsic::mips_clt_s_b:
+  case Intrinsic::mips_clt_s_h:
+  case Intrinsic::mips_clt_s_w:
+  case Intrinsic::mips_clt_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETLT);
+  case Intrinsic::mips_clti_s_b:
+  case Intrinsic::mips_clti_s_h:
+  case Intrinsic::mips_clti_s_w:
+  case Intrinsic::mips_clti_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETLT);
+  case Intrinsic::mips_clt_u_b:
+  case Intrinsic::mips_clt_u_h:
+  case Intrinsic::mips_clt_u_w:
+  case Intrinsic::mips_clt_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULT);
+  case Intrinsic::mips_clti_u_b:
+  case Intrinsic::mips_clti_u_h:
+  case Intrinsic::mips_clti_u_w:
+  case Intrinsic::mips_clti_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETULT);
+  case Intrinsic::mips_copy_s_b:
+  case Intrinsic::mips_copy_s_h:
+  case Intrinsic::mips_copy_s_w:
+    return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
+  case Intrinsic::mips_copy_s_d:
+    // Don't lower directly into VEXTRACT_SEXT_ELT since i64 might be illegal.
+    // Instead lower to the generic EXTRACT_VECTOR_ELT node and let the type
+    // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_copy_u_b:
+  case Intrinsic::mips_copy_u_h:
+  case Intrinsic::mips_copy_u_w:
+    return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
+  case Intrinsic::mips_copy_u_d:
+    // Don't lower directly into VEXTRACT_ZEXT_ELT since i64 might be illegal.
+    // Instead lower to the generic EXTRACT_VECTOR_ELT node and let the type
+    // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+    //
+    // Note: When i64 is illegal, this results in copy_s.w instructions instead
+    // of copy_u.w instructions. This makes no difference to the behaviour
+    // since i64 is only illegal when the register file is 32-bit.
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_div_s_b:
+  case Intrinsic::mips_div_s_h:
+  case Intrinsic::mips_div_s_w:
+  case Intrinsic::mips_div_s_d:
+    return DAG.getNode(ISD::SDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_div_u_b:
+  case Intrinsic::mips_div_u_h:
+  case Intrinsic::mips_div_u_w:
+  case Intrinsic::mips_div_u_d:
+    return DAG.getNode(ISD::UDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_fadd_w:
+  case Intrinsic::mips_fadd_d:
+    return DAG.getNode(ISD::FADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  // Don't lower mips_fcaf_[wd] since LLVM folds SETFALSE condcodes away
+  case Intrinsic::mips_fceq_w:
+  case Intrinsic::mips_fceq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOEQ);
+  case Intrinsic::mips_fcle_w:
+  case Intrinsic::mips_fcle_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOLE);
+  case Intrinsic::mips_fclt_w:
+  case Intrinsic::mips_fclt_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOLT);
+  case Intrinsic::mips_fcne_w:
+  case Intrinsic::mips_fcne_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETONE);
+  case Intrinsic::mips_fcor_w:
+  case Intrinsic::mips_fcor_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETO);
+  case Intrinsic::mips_fcueq_w:
+  case Intrinsic::mips_fcueq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUEQ);
+  case Intrinsic::mips_fcule_w:
+  case Intrinsic::mips_fcule_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULE);
+  case Intrinsic::mips_fcult_w:
+  case Intrinsic::mips_fcult_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULT);
+  case Intrinsic::mips_fcun_w:
+  case Intrinsic::mips_fcun_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUO);
+  case Intrinsic::mips_fcune_w:
+  case Intrinsic::mips_fcune_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUNE);
+  case Intrinsic::mips_fdiv_w:
+  case Intrinsic::mips_fdiv_d:
+    return DAG.getNode(ISD::FDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_ffint_u_w:
+  case Intrinsic::mips_ffint_u_d:
+    return DAG.getNode(ISD::UINT_TO_FP, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ffint_s_w:
+  case Intrinsic::mips_ffint_s_d:
+    return DAG.getNode(ISD::SINT_TO_FP, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_fill_b:
+  case Intrinsic::mips_fill_h:
+  case Intrinsic::mips_fill_w:
+  case Intrinsic::mips_fill_d: {
+    SmallVector<SDValue, 16> Ops;
+    EVT ResTy = Op->getValueType(0);
+
+    for (unsigned i = 0; i < ResTy.getVectorNumElements(); ++i)
+      Ops.push_back(Op->getOperand(1));
+
+    // If ResTy is v2i64 then the type legalizer will break this node down into
+    // an equivalent v4i32.
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, ResTy, &Ops[0], Ops.size());
+  }
+  case Intrinsic::mips_fexp2_w:
+  case Intrinsic::mips_fexp2_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(
+        ISD::FMUL, SDLoc(Op), ResTy, Op->getOperand(1),
+        DAG.getNode(ISD::FEXP2, SDLoc(Op), ResTy, Op->getOperand(2)));
+  }
+  case Intrinsic::mips_flog2_w:
+  case Intrinsic::mips_flog2_d:
+    return DAG.getNode(ISD::FLOG2, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fmadd_w:
+  case Intrinsic::mips_fmadd_d:
+    return DAG.getNode(ISD::FMA, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+  case Intrinsic::mips_fmul_w:
+  case Intrinsic::mips_fmul_d:
+    return DAG.getNode(ISD::FMUL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_fmsub_w:
+  case Intrinsic::mips_fmsub_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_frint_w:
+  case Intrinsic::mips_frint_d:
+    return DAG.getNode(ISD::FRINT, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fsqrt_w:
+  case Intrinsic::mips_fsqrt_d:
+    return DAG.getNode(ISD::FSQRT, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fsub_w:
+  case Intrinsic::mips_fsub_d:
+    return DAG.getNode(ISD::FSUB, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_ftrunc_u_w:
+  case Intrinsic::mips_ftrunc_u_d:
+    return DAG.getNode(ISD::FP_TO_UINT, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ftrunc_s_w:
+  case Intrinsic::mips_ftrunc_s_d:
+    return DAG.getNode(ISD::FP_TO_SINT, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ilvev_b:
+  case Intrinsic::mips_ilvev_h:
+  case Intrinsic::mips_ilvev_w:
+  case Intrinsic::mips_ilvev_d:
+    return DAG.getNode(MipsISD::ILVEV, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvl_b:
+  case Intrinsic::mips_ilvl_h:
+  case Intrinsic::mips_ilvl_w:
+  case Intrinsic::mips_ilvl_d:
+    return DAG.getNode(MipsISD::ILVL, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvod_b:
+  case Intrinsic::mips_ilvod_h:
+  case Intrinsic::mips_ilvod_w:
+  case Intrinsic::mips_ilvod_d:
+    return DAG.getNode(MipsISD::ILVOD, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvr_b:
+  case Intrinsic::mips_ilvr_h:
+  case Intrinsic::mips_ilvr_w:
+  case Intrinsic::mips_ilvr_d:
+    return DAG.getNode(MipsISD::ILVR, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_insert_b:
+  case Intrinsic::mips_insert_h:
+  case Intrinsic::mips_insert_w:
+  case Intrinsic::mips_insert_d:
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(3), Op->getOperand(2));
+  case Intrinsic::mips_ldi_b:
+  case Intrinsic::mips_ldi_h:
+  case Intrinsic::mips_ldi_w:
+  case Intrinsic::mips_ldi_d:
+    return lowerMSASplatImm(Op, 1, DAG);
+  case Intrinsic::mips_lsa: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_maddv_b:
+  case Intrinsic::mips_maddv_h:
+  case Intrinsic::mips_maddv_w:
+  case Intrinsic::mips_maddv_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_max_s_b:
+  case Intrinsic::mips_max_s_h:
+  case Intrinsic::mips_max_s_w:
+  case Intrinsic::mips_max_s_d:
+    return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_max_u_b:
+  case Intrinsic::mips_max_u_h:
+  case Intrinsic::mips_max_u_w:
+  case Intrinsic::mips_max_u_d:
+    return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_maxi_s_b:
+  case Intrinsic::mips_maxi_s_h:
+  case Intrinsic::mips_maxi_s_w:
+  case Intrinsic::mips_maxi_s_d:
+    return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_maxi_u_b:
+  case Intrinsic::mips_maxi_u_h:
+  case Intrinsic::mips_maxi_u_w:
+  case Intrinsic::mips_maxi_u_d:
+    return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_min_s_b:
+  case Intrinsic::mips_min_s_h:
+  case Intrinsic::mips_min_s_w:
+  case Intrinsic::mips_min_s_d:
+    return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_min_u_b:
+  case Intrinsic::mips_min_u_h:
+  case Intrinsic::mips_min_u_w:
+  case Intrinsic::mips_min_u_d:
+    return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_mini_s_b:
+  case Intrinsic::mips_mini_s_h:
+  case Intrinsic::mips_mini_s_w:
+  case Intrinsic::mips_mini_s_d:
+    return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_mini_u_b:
+  case Intrinsic::mips_mini_u_h:
+  case Intrinsic::mips_mini_u_w:
+  case Intrinsic::mips_mini_u_d:
+    return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_mod_s_b:
+  case Intrinsic::mips_mod_s_h:
+  case Intrinsic::mips_mod_s_w:
+  case Intrinsic::mips_mod_s_d:
+    return DAG.getNode(ISD::SREM, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_mod_u_b:
+  case Intrinsic::mips_mod_u_h:
+  case Intrinsic::mips_mod_u_w:
+  case Intrinsic::mips_mod_u_d:
+    return DAG.getNode(ISD::UREM, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_mulv_b:
+  case Intrinsic::mips_mulv_h:
+  case Intrinsic::mips_mulv_w:
+  case Intrinsic::mips_mulv_d:
+    return DAG.getNode(ISD::MUL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_msubv_b:
+  case Intrinsic::mips_msubv_h:
+  case Intrinsic::mips_msubv_w:
+  case Intrinsic::mips_msubv_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::SUB, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_nlzc_b:
+  case Intrinsic::mips_nlzc_h:
+  case Intrinsic::mips_nlzc_w:
+  case Intrinsic::mips_nlzc_d:
+    return DAG.getNode(ISD::CTLZ, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_nor_v: {
+    SDValue Res = DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                              Op->getOperand(1), Op->getOperand(2));
+    return DAG.getNOT(DL, Res, Res->getValueType(0));
+  }
+  case Intrinsic::mips_nori_b: {
+    SDValue Res =  DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                               Op->getOperand(1),
+                               lowerMSASplatImm(Op, 2, DAG));
+    return DAG.getNOT(DL, Res, Res->getValueType(0));
+  }
+  case Intrinsic::mips_or_v:
+    return DAG.getNode(ISD::OR, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_ori_b:
+    return DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_pckev_b:
+  case Intrinsic::mips_pckev_h:
+  case Intrinsic::mips_pckev_w:
+  case Intrinsic::mips_pckev_d:
+    return DAG.getNode(MipsISD::PCKEV, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_pckod_b:
+  case Intrinsic::mips_pckod_h:
+  case Intrinsic::mips_pckod_w:
+  case Intrinsic::mips_pckod_d:
+    return DAG.getNode(MipsISD::PCKOD, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_pcnt_b:
+  case Intrinsic::mips_pcnt_h:
+  case Intrinsic::mips_pcnt_w:
+  case Intrinsic::mips_pcnt_d:
+    return DAG.getNode(ISD::CTPOP, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_shf_b:
+  case Intrinsic::mips_shf_h:
+  case Intrinsic::mips_shf_w:
+    return DAG.getNode(MipsISD::SHF, DL, Op->getValueType(0),
+                       Op->getOperand(2), Op->getOperand(1));
+  case Intrinsic::mips_sll_b:
+  case Intrinsic::mips_sll_h:
+  case Intrinsic::mips_sll_w:
+  case Intrinsic::mips_sll_d:
+    return DAG.getNode(ISD::SHL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_slli_b:
+  case Intrinsic::mips_slli_h:
+  case Intrinsic::mips_slli_w:
+  case Intrinsic::mips_slli_d:
+    return DAG.getNode(ISD::SHL, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_splat_b:
+  case Intrinsic::mips_splat_h:
+  case Intrinsic::mips_splat_w:
+  case Intrinsic::mips_splat_d:
+    // We can't lower via VECTOR_SHUFFLE because it requires constant shuffle
+    // masks, nor can we lower via BUILD_VECTOR & EXTRACT_VECTOR_ELT because
+    // EXTRACT_VECTOR_ELT can't extract i64's on MIPS32.
+    // Instead we lower to MipsISD::VSHF and match from there.
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       lowerMSASplatZExt(Op, 2, DAG), Op->getOperand(1),
+                       Op->getOperand(1));
+  case Intrinsic::mips_splati_b:
+  case Intrinsic::mips_splati_h:
+  case Intrinsic::mips_splati_w:
+  case Intrinsic::mips_splati_d:
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 2, DAG), Op->getOperand(1),
+                       Op->getOperand(1));
+  case Intrinsic::mips_sra_b:
+  case Intrinsic::mips_sra_h:
+  case Intrinsic::mips_sra_w:
+  case Intrinsic::mips_sra_d:
+    return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_srai_b:
+  case Intrinsic::mips_srai_h:
+  case Intrinsic::mips_srai_w:
+  case Intrinsic::mips_srai_d:
+    return DAG.getNode(ISD::SRA, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_srl_b:
+  case Intrinsic::mips_srl_h:
+  case Intrinsic::mips_srl_w:
+  case Intrinsic::mips_srl_d:
+    return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_srli_b:
+  case Intrinsic::mips_srli_h:
+  case Intrinsic::mips_srli_w:
+  case Intrinsic::mips_srli_d:
+    return DAG.getNode(ISD::SRL, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_subv_b:
+  case Intrinsic::mips_subv_h:
+  case Intrinsic::mips_subv_w:
+  case Intrinsic::mips_subv_d:
+    return DAG.getNode(ISD::SUB, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_subvi_b:
+  case Intrinsic::mips_subvi_h:
+  case Intrinsic::mips_subvi_w:
+  case Intrinsic::mips_subvi_d:
+    return DAG.getNode(ISD::SUB, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_vshf_b:
+  case Intrinsic::mips_vshf_h:
+  case Intrinsic::mips_vshf_w:
+  case Intrinsic::mips_vshf_d:
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+  case Intrinsic::mips_xor_v:
+    return DAG.getNode(ISD::XOR, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_xori_b:
+    return DAG.getNode(ISD::XOR, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
   }
 }
 
+static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+  SDLoc DL(Op);
+  SDValue ChainIn = Op->getOperand(0);
+  SDValue Address = Op->getOperand(2);
+  SDValue Offset  = Op->getOperand(3);
+  EVT ResTy = Op->getValueType(0);
+  EVT PtrTy = Address->getValueType(0);
+
+  Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+
+  return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(), false,
+                     false, false, 16);
+}
+
 SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  switch (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue()) {
+  unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+  switch (Intr) {
   default:
     return SDValue();
   case Intrinsic::mips_extp:
@@ -749,7 +2078,522 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
     return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_S_W_PH);
   case Intrinsic::mips_dpsqx_sa_w_ph:
     return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_SA_W_PH);
+  case Intrinsic::mips_ld_b:
+  case Intrinsic::mips_ld_h:
+  case Intrinsic::mips_ld_w:
+  case Intrinsic::mips_ld_d:
+   return lowerMSALoadIntr(Op, DAG, Intr);
+  }
+}
+
+static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+  SDLoc DL(Op);
+  SDValue ChainIn = Op->getOperand(0);
+  SDValue Value   = Op->getOperand(2);
+  SDValue Address = Op->getOperand(3);
+  SDValue Offset  = Op->getOperand(4);
+  EVT PtrTy = Address->getValueType(0);
+
+  Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+
+  return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(), false,
+                      false, 16);
+}
+
+SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+  switch (Intr) {
+  default:
+    return SDValue();
+  case Intrinsic::mips_st_b:
+  case Intrinsic::mips_st_h:
+  case Intrinsic::mips_st_w:
+  case Intrinsic::mips_st_d:
+    return lowerMSAStoreIntr(Op, DAG, Intr);
+  }
+}
+
+/// \brief Check if the given BuildVectorSDNode is a splat.
+/// This method currently relies on DAG nodes being reused when equivalent,
+/// so it's possible for this to return false even when isConstantSplat returns
+/// true.
+static bool isSplatVector(const BuildVectorSDNode *N) {
+  unsigned int nOps = N->getNumOperands();
+  assert(nOps > 1 && "isSplatVector has 0 or 1 sized build vector");
+
+  SDValue Operand0 = N->getOperand(0);
+
+  for (unsigned int i = 1; i < nOps; ++i) {
+    if (N->getOperand(i) != Operand0)
+      return false;
+  }
+
+  return true;
+}
+
+// Lower ISD::EXTRACT_VECTOR_ELT into MipsISD::VEXTRACT_SEXT_ELT.
+//
+// The non-value bits resulting from ISD::EXTRACT_VECTOR_ELT are undefined. We
+// choose to sign-extend but we could have equally chosen zero-extend. The
+// DAGCombiner will fold any sign/zero extension of the ISD::EXTRACT_VECTOR_ELT
+// result into this node later (possibly changing it to a zero-extend in the
+// process).
+SDValue MipsSETargetLowering::
+lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT ResTy = Op->getValueType(0);
+  SDValue Op0 = Op->getOperand(0);
+  EVT VecTy = Op0->getValueType(0);
+
+  if (!VecTy.is128BitVector())
+    return SDValue();
+
+  if (ResTy.isInteger()) {
+    SDValue Op1 = Op->getOperand(1);
+    EVT EltTy = VecTy.getVectorElementType();
+    return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1,
+                       DAG.getValueType(EltTy));
+  }
+
+  return Op;
+}
+
+static bool isConstantOrUndef(const SDValue Op) {
+  if (Op->getOpcode() == ISD::UNDEF)
+    return true;
+  if (dyn_cast<ConstantSDNode>(Op))
+    return true;
+  if (dyn_cast<ConstantFPSDNode>(Op))
+    return true;
+  return false;
+}
+
+static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) {
+  for (unsigned i = 0; i < Op->getNumOperands(); ++i)
+    if (isConstantOrUndef(Op->getOperand(i)))
+      return true;
+  return false;
+}
+
+// Lowers ISD::BUILD_VECTOR into appropriate SelectionDAG nodes for the
+// backend.
+//
+// Lowers according to the following rules:
+// - Constant splats are legal as-is as long as the SplatBitSize is a power of
+//   2 less than or equal to 64 and the value fits into a signed 10-bit
+//   immediate
+// - Constant splats are lowered to bitconverted BUILD_VECTORs if SplatBitSize
+//   is a power of 2 less than or equal to 64 and the value does not fit into a
+//   signed 10-bit immediate
+// - Non-constant splats are legal as-is.
+// - Non-constant non-splats are lowered to sequences of INSERT_VECTOR_ELT.
+// - All others are illegal and must be expanded.
+SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
+  EVT ResTy = Op->getValueType(0);
+  SDLoc DL(Op);
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Subtarget->hasMSA() || !ResTy.is128BitVector())
+    return SDValue();
+
+  if (Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                            HasAnyUndefs, 8,
+                            !Subtarget->isLittle()) && SplatBitSize <= 64) {
+    // We can only cope with 8, 16, 32, or 64-bit elements
+    if (SplatBitSize != 8 && SplatBitSize != 16 && SplatBitSize != 32 &&
+        SplatBitSize != 64)
+      return SDValue();
+
+    // If the value fits into a simm10 then we can use ldi.[bhwd]
+    // However, if it isn't an integer type we will have to bitcast from an
+    // integer type first. Also, if there are any undefs, we must lower them
+    // to defined values first.
+    if (ResTy.isInteger() && !HasAnyUndefs && SplatValue.isSignedIntN(10))
+      return Op;
+
+    EVT ViaVecTy;
+
+    switch (SplatBitSize) {
+    default:
+      return SDValue();
+    case 8:
+      ViaVecTy = MVT::v16i8;
+      break;
+    case 16:
+      ViaVecTy = MVT::v8i16;
+      break;
+    case 32:
+      ViaVecTy = MVT::v4i32;
+      break;
+    case 64:
+      // There's no fill.d to fall back on for 64-bit values
+      return SDValue();
+    }
+
+    // SelectionDAG::getConstant will promote SplatValue appropriately.
+    SDValue Result = DAG.getConstant(SplatValue, ViaVecTy);
+
+    // Bitcast to the type we originally wanted
+    if (ViaVecTy != ResTy)
+      Result = DAG.getNode(ISD::BITCAST, SDLoc(Node), ResTy, Result);
+
+    return Result;
+  } else if (isSplatVector(Node))
+    return Op;
+  else if (!isConstantOrUndefBUILD_VECTOR(Node)) {
+    // Use INSERT_VECTOR_ELT operations rather than expand to stores.
+    // The resulting code is the same length as the expansion, but it doesn't
+    // use memory operations
+    EVT ResTy = Node->getValueType(0);
+
+    assert(ResTy.isVector());
+
+    unsigned NumElts = ResTy.getVectorNumElements();
+    SDValue Vector = DAG.getUNDEF(ResTy);
+    for (unsigned i = 0; i < NumElts; ++i) {
+      Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
+                           Node->getOperand(i),
+                           DAG.getConstant(i, MVT::i32));
+    }
+    return Vector;
+  }
+
+  return SDValue();
+}
+
+// Lower VECTOR_SHUFFLE into SHF (if possible).
+//
+// SHF splits the vector into blocks of four elements, then shuffles these
+// elements according to a <4 x i2> constant (encoded as an integer immediate).
+//
+// It is therefore possible to lower into SHF when the mask takes the form:
+//   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
+// When undef's appear they are treated as if they were whatever value is
+// necessary in order to fit the above form.
+//
+// For example:
+//   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
+//                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
+//                                 i32 7, i32 6, i32 5, i32 4>
+// is lowered to:
+//   (SHF_H $w0, $w1, 27)
+// where the 27 comes from:
+//   3 + (2 << 2) + (1 << 4) + (0 << 6)
+static SDValue lowerVECTOR_SHUFFLE_SHF(SDValue Op, EVT ResTy,
+                                       SmallVector<int, 16> Indices,
+                                       SelectionDAG &DAG) {
+  int SHFIndices[4] = { -1, -1, -1, -1 };
+
+  if (Indices.size() < 4)
+    return SDValue();
+
+  for (unsigned i = 0; i < 4; ++i) {
+    for (unsigned j = i; j < Indices.size(); j += 4) {
+      int Idx = Indices[j];
+
+      // Convert from vector index to 4-element subvector index
+      // If an index refers to an element outside of the subvector then give up
+      if (Idx != -1) {
+        Idx -= 4 * (j / 4);
+        if (Idx < 0 || Idx >= 4)
+          return SDValue();
+      }
+
+      // If the mask has an undef, replace it with the current index.
+      // Note that it might still be undef if the current index is also undef
+      if (SHFIndices[i] == -1)
+        SHFIndices[i] = Idx;
+
+      // Check that non-undef values are the same as in the mask. If they
+      // aren't then give up
+      if (!(Idx == -1 || Idx == SHFIndices[i]))
+        return SDValue();
+    }
+  }
+
+  // Calculate the immediate. Replace any remaining undefs with zero
+  APInt Imm(32, 0);
+  for (int i = 3; i >= 0; --i) {
+    int Idx = SHFIndices[i];
+
+    if (Idx == -1)
+      Idx = 0;
+
+    Imm <<= 2;
+    Imm |= Idx & 0x3;
+  }
+
+  return DAG.getNode(MipsISD::SHF, SDLoc(Op), ResTy,
+                     DAG.getConstant(Imm, MVT::i32), Op->getOperand(0));
+}
+
+// Lower VECTOR_SHUFFLE into ILVEV (if possible).
+//
+// ILVEV interleaves the even elements from each vector.
+//
+// It is possible to lower into ILVEV when the mask takes the form:
+//   <0, n, 2, n+2, 4, n+4, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVEV(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int WsIdx = 0;
+  int WtIdx = ResTy.getVectorNumElements();
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx += 2;
+    WtIdx += 2;
+  }
+
+  return DAG.getNode(MipsISD::ILVEV, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVOD (if possible).
+//
+// ILVOD interleaves the odd elements from each vector.
+//
+// It is possible to lower into ILVOD when the mask takes the form:
+//   <1, n+1, 3, n+3, 5, n+5, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVOD(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int WsIdx = 1;
+  int WtIdx = ResTy.getVectorNumElements() + 1;
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx += 2;
+    WtIdx += 2;
+  }
+
+  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVL (if possible).
+//
+// ILVL interleaves consecutive elements from the left half of each vector.
+//
+// It is possible to lower into ILVL when the mask takes the form:
+//   <0, n, 1, n+1, 2, n+2, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVL(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int WsIdx = 0;
+  int WtIdx = ResTy.getVectorNumElements();
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx ++;
+    WtIdx ++;
   }
+
+  return DAG.getNode(MipsISD::ILVL, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVR (if possible).
+//
+// ILVR interleaves consecutive elements from the right half of each vector.
+//
+// It is possible to lower into ILVR when the mask takes the form:
+//   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+// where n is the number of elements in the vector and x is half n.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVR(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  unsigned NumElts = ResTy.getVectorNumElements();
+  int WsIdx = NumElts / 2;
+  int WtIdx = NumElts + NumElts / 2;
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx ++;
+    WtIdx ++;
+  }
+
+  return DAG.getNode(MipsISD::ILVR, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into PCKEV (if possible).
+//
+// PCKEV copies the even elements of each vector into the result vector.
+//
+// It is possible to lower into PCKEV when the mask takes the form:
+//   <0, 2, 4, ..., n, n+2, n+4, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_PCKEV(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int Idx = 0;
+
+  for (unsigned i = 0; i < Indices.size(); ++i) {
+    if (Indices[i] != -1 && Indices[i] != Idx)
+      return SDValue();
+    Idx += 2;
+  }
+
+  return DAG.getNode(MipsISD::PCKEV, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into PCKOD (if possible).
+//
+// PCKOD copies the odd elements of each vector into the result vector.
+//
+// It is possible to lower into PCKOD when the mask takes the form:
+//   <1, 3, 5, ..., n+1, n+3, n+5, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int Idx = 1;
+
+  for (unsigned i = 0; i < Indices.size(); ++i) {
+    if (Indices[i] != -1 && Indices[i] != Idx)
+      return SDValue();
+    Idx += 2;
+  }
+
+  return DAG.getNode(MipsISD::PCKOD, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into VSHF.
+//
+// This mostly consists of converting the shuffle indices in Indices into a
+// BUILD_VECTOR and adding it as an operand to the resulting VSHF. There is
+// also code to eliminate unused operands of the VECTOR_SHUFFLE. For example,
+// if the type is v8i16 and all the indices are less than 8 then the second
+// operand is unused and can be replaced with anything. We choose to replace it
+// with the used operand since this reduces the number of instructions overall.
+static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  SmallVector<SDValue, 16> Ops;
+  SDValue Op0;
+  SDValue Op1;
+  EVT MaskVecTy = ResTy.changeVectorElementTypeToInteger();
+  EVT MaskEltTy = MaskVecTy.getVectorElementType();
+  bool Using1stVec = false;
+  bool Using2ndVec = false;
+  SDLoc DL(Op);
+  int ResTyNumElts = ResTy.getVectorNumElements();
+
+  for (int i = 0; i < ResTyNumElts; ++i) {
+    // Idx == -1 means UNDEF
+    int Idx = Indices[i];
+
+    if (0 <= Idx && Idx < ResTyNumElts)
+      Using1stVec = true;
+    if (ResTyNumElts <= Idx && Idx < ResTyNumElts * 2)
+      Using2ndVec = true;
+  }
+
+  for (SmallVector<int, 16>::iterator I = Indices.begin(); I != Indices.end();
+       ++I)
+    Ops.push_back(DAG.getTargetConstant(*I, MaskEltTy));
+
+  SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecTy, &Ops[0],
+                                Ops.size());
+
+  if (Using1stVec && Using2ndVec) {
+    Op0 = Op->getOperand(0);
+    Op1 = Op->getOperand(1);
+  } else if (Using1stVec)
+    Op0 = Op1 = Op->getOperand(0);
+  else if (Using2ndVec)
+    Op0 = Op1 = Op->getOperand(1);
+  else
+    llvm_unreachable("shuffle vector mask references neither vector operand?");
+
+  return DAG.getNode(MipsISD::VSHF, DL, ResTy, MaskVec, Op0, Op1);
+}
+
+// Lower VECTOR_SHUFFLE into one of a number of instructions depending on the
+// indices in the shuffle.
+SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  ShuffleVectorSDNode *Node = cast<ShuffleVectorSDNode>(Op);
+  EVT ResTy = Op->getValueType(0);
+
+  if (!ResTy.is128BitVector())
+    return SDValue();
+
+  int ResTyNumElts = ResTy.getVectorNumElements();
+  SmallVector<int, 16> Indices;
+
+  for (int i = 0; i < ResTyNumElts; ++i)
+    Indices.push_back(Node->getMaskElt(i));
+
+  SDValue Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVOD(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVL(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVR(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_PCKEV(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_PCKOD(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
 }
 
 MachineBasicBlock * MipsSETargetLowering::
@@ -814,3 +2658,318 @@ emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
   MI->eraseFromParent();   // The pseudo instruction is gone now.
   return Sink;
 }
+
+MachineBasicBlock * MipsSETargetLowering::
+emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
+                     unsigned BranchOp) const{
+  // $bb:
+  //  vany_nonzero $rd, $ws
+  //  =>
+  // $bb:
+  //  bnz.b $ws, $tbb
+  //  b $fbb
+  // $fbb:
+  //  li $rd1, 0
+  //  b $sink
+  // $tbb:
+  //  li $rd2, 1
+  // $sink:
+  //  $rd = phi($rd1, $fbb, $rd2, $tbb)
+
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  DebugLoc DL = MI->getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = llvm::next(MachineFunction::iterator(BB));
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *Sink  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, FBB);
+  F->insert(It, TBB);
+  F->insert(It, Sink);
+
+  // Transfer the remainder of BB and its successor edges to Sink.
+  Sink->splice(Sink->begin(), BB, llvm::next(MachineBasicBlock::iterator(MI)),
+               BB->end());
+  Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add successors.
+  BB->addSuccessor(FBB);
+  BB->addSuccessor(TBB);
+  FBB->addSuccessor(Sink);
+  TBB->addSuccessor(Sink);
+
+  // Insert the real bnz.b instruction to $BB.
+  BuildMI(BB, DL, TII->get(BranchOp))
+    .addReg(MI->getOperand(1).getReg())
+    .addMBB(TBB);
+
+  // Fill $FBB.
+  unsigned RD1 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), RD1)
+    .addReg(Mips::ZERO).addImm(0);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+  // Fill $TBB.
+  unsigned RD2 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), RD2)
+    .addReg(Mips::ZERO).addImm(1);
+
+  // Insert phi function to $Sink.
+  BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(RD1).addMBB(FBB).addReg(RD2).addMBB(TBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return Sink;
+}
+
+// Emit the COPY_FW pseudo instruction.
+//
+// copy_fw_pseudo $fd, $ws, n
+// =>
+// copy_u_w $rt, $ws, $n
+// mtc1     $rt, $fd
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is never valid
+// for lane 1 because it would require FR=0 mode which isn't supported by MSA.
+MachineBasicBlock * MipsSETargetLowering::
+emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Fd = MI->getOperand(0).getReg();
+  unsigned Ws = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+
+  if (Lane == 0)
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_lo);
+  else {
+    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(1);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the COPY_FD pseudo instruction.
+//
+// copy_fd_pseudo $fd, $ws, n
+// =>
+// splati.d $wt, $ws, $n
+// copy $fd, $wt:sub_64
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is always
+// valid because FR=1 mode which is the only supported mode in MSA.
+MachineBasicBlock * MipsSETargetLowering::
+emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
+  assert(Subtarget->isFP64bit());
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  unsigned Fd  = MI->getOperand(0).getReg();
+  unsigned Ws  = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm() * 2;
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Lane == 0)
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64);
+  else {
+    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(1);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the INSERT_FW pseudo instruction.
+//
+// insert_fw_pseudo $wd, $wd_in, $n, $fs
+// =>
+// subreg_to_reg $wt:sub_lo, $fs
+// insve_w $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Wd_in = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+  unsigned Fs = MI->getOperand(3).getReg();
+  unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+      .addImm(0)
+      .addReg(Fs)
+      .addImm(Mips::sub_lo);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_W), Wd)
+      .addReg(Wd_in)
+      .addImm(Lane)
+      .addReg(Wt);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the INSERT_FD pseudo instruction.
+//
+// insert_fd_pseudo $wd, $fs, n
+// =>
+// subreg_to_reg $wt:sub_64, $fs
+// insve_d $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  assert(Subtarget->isFP64bit());
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Wd_in = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+  unsigned Fs = MI->getOperand(3).getReg();
+  unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+      .addImm(0)
+      .addReg(Fs)
+      .addImm(Mips::sub_64);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_D), Wd)
+      .addReg(Wd_in)
+      .addImm(Lane)
+      .addReg(Wt);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FILL_FW pseudo instruction.
+//
+// fill_fw_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_lo, $wt1, $fs
+// splati.w $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FW(MachineInstr *MI,
+                                  MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Fs = MI->getOperand(1).getReg();
+  unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+  unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+      .addReg(Wt1)
+      .addReg(Fs)
+      .addImm(Mips::sub_lo);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wd).addReg(Wt2).addImm(0);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FILL_FD pseudo instruction.
+//
+// fill_fd_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_64, $wt1, $fs
+// splati.d $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FD(MachineInstr *MI,
+                                  MachineBasicBlock *BB) const {
+  assert(Subtarget->isFP64bit());
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Fs = MI->getOperand(1).getReg();
+  unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+  unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+      .addReg(Wt1)
+      .addReg(Fs)
+      .addImm(Mips::sub_64);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wd).addReg(Wt2).addImm(0);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FEXP2_W_1 pseudo instructions.
+//
+// fexp2_w_1_pseudo $wd, $wt
+// =>
+// ldi.w $ws, 1
+// fexp2.w $wd, $ws, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_W_1(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = &Mips::MSA128WRegClass;
+  unsigned Ws1 = RegInfo.createVirtualRegister(RC);
+  unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Splat 1.0 into a vector
+  BuildMI(*BB, MI, DL, TII->get(Mips::LDI_W), Ws1).addImm(1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_W), Ws2).addReg(Ws1);
+
+  // Emit 1.0 * fexp2(Wt)
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_W), MI->getOperand(0).getReg())
+      .addReg(Ws2)
+      .addReg(MI->getOperand(1).getReg());
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FEXP2_D_1 pseudo instructions.
+//
+// fexp2_d_1_pseudo $wd, $wt
+// =>
+// ldi.d $ws, 1
+// fexp2.d $wd, $ws, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_D_1(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = &Mips::MSA128DRegClass;
+  unsigned Ws1 = RegInfo.createVirtualRegister(RC);
+  unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Splat 1.0 into a vector
+  BuildMI(*BB, MI, DL, TII->get(Mips::LDI_D), Ws1).addImm(1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_D), Ws2).addReg(Ws1);
+
+  // Emit 1.0 * fexp2(Wt)
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_D), MI->getOperand(0).getReg())
+      .addReg(Ws2)
+      .addReg(MI->getOperand(1).getReg());
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index ec8a5c7..c5210d9 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -22,6 +22,14 @@ namespace llvm {
   public:
     explicit MipsSETargetLowering(MipsTargetMachine &TM);
 
+    /// \brief Enable MSA support for the given integer type and Register
+    /// class.
+    void addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC);
+    /// \brief Enable MSA support for the given floating-point type and
+    /// Register class.
+    void addMSAFloatType(MVT::SimpleValueType Ty,
+                         const TargetRegisterClass *RC);
+
     virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
 
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
@@ -38,8 +46,8 @@ namespace llvm {
 
     virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const {
       if (VT == MVT::Untyped)
-        return Subtarget->hasDSP() ? &Mips::ACRegsDSPRegClass :
-                                     &Mips::ACRegsRegClass;
+        return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass :
+                                     &Mips::ACC64RegClass;
 
       return TargetLowering::getRepRegClassFor(VT);
     }
@@ -56,14 +64,50 @@ namespace llvm {
                 bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
                 CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
 
+    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
     SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
                         SelectionDAG &DAG) const;
 
     SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    /// \brief Lower VECTOR_SHUFFLE into one of a number of instructions
+    /// depending on the indices in the shuffle.
+    SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
 
     MachineBasicBlock *emitBPOSGE32(MachineInstr *MI,
                                     MachineBasicBlock *BB) const;
+    MachineBasicBlock *emitMSACBranchPseudo(MachineInstr *MI,
+                                            MachineBasicBlock *BB,
+                                            unsigned BranchOp) const;
+    /// \brief Emit the COPY_FW pseudo instruction
+    MachineBasicBlock *emitCOPY_FW(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the COPY_FD pseudo instruction
+    MachineBasicBlock *emitCOPY_FD(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_FW pseudo instruction
+    MachineBasicBlock *emitINSERT_FW(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_FD pseudo instruction
+    MachineBasicBlock *emitINSERT_FD(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the FILL_FW pseudo instruction
+    MachineBasicBlock *emitFILL_FW(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the FILL_FD pseudo instruction
+    MachineBasicBlock *emitFILL_FD(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the FEXP2_W_1 pseudo instructions.
+    MachineBasicBlock *emitFEXP2_W_1(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the FEXP2_D_1 pseudo instructions.
+    MachineBasicBlock *emitFEXP2_D_1(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
   };
 }
 
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index 9521043..02931a3 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -24,11 +24,6 @@
 
 using namespace llvm;
 
-static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
-                                   cl::desc("Expand double precision loads and "
-                                            "stores to their single precision "
-                                            "counterparts."));
-
 MipsSEInstrInfo::MipsSEInstrInfo(MipsTargetMachine &tm)
   : MipsInstrInfo(tm,
                   tm.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J),
@@ -49,10 +44,8 @@ isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
 {
   unsigned Opc = MI->getOpcode();
 
-  if ((Opc == Mips::LW)    || (Opc == Mips::LW_P8)  || (Opc == Mips::LD) ||
-      (Opc == Mips::LD_P8) || (Opc == Mips::LWC1)   || (Opc == Mips::LWC1_P8) ||
-      (Opc == Mips::LDC1)  || (Opc == Mips::LDC164) ||
-      (Opc == Mips::LDC164_P8)) {
+  if ((Opc == Mips::LW)   || (Opc == Mips::LD)   ||
+      (Opc == Mips::LWC1) || (Opc == Mips::LDC1) || (Opc == Mips::LDC164)) {
     if ((MI->getOperand(1).isFI()) && // is a stack slot
         (MI->getOperand(2).isImm()) &&  // the imm is zero
         (isZeroImm(MI->getOperand(2)))) {
@@ -74,10 +67,8 @@ isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
 {
   unsigned Opc = MI->getOpcode();
 
-  if ((Opc == Mips::SW)    || (Opc == Mips::SW_P8)  || (Opc == Mips::SD) ||
-      (Opc == Mips::SD_P8) || (Opc == Mips::SWC1)   || (Opc == Mips::SWC1_P8) ||
-      (Opc == Mips::SDC1)  || (Opc == Mips::SDC164) ||
-      (Opc == Mips::SDC164_P8)) {
+  if ((Opc == Mips::SW)   || (Opc == Mips::SD)   ||
+      (Opc == Mips::SWC1) || (Opc == Mips::SDC1) || (Opc == Mips::SDC164)) {
     if ((MI->getOperand(1).isFI()) && // is a stack slot
         (MI->getOperand(2).isImm()) &&  // the imm is zero
         (isZeroImm(MI->getOperand(2)))) {
@@ -101,32 +92,34 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       Opc = Mips::CFC1;
     else if (Mips::FGR32RegClass.contains(SrcReg))
       Opc = Mips::MFC1;
-    else if (Mips::HIRegsRegClass.contains(SrcReg))
+    else if (Mips::HI32RegClass.contains(SrcReg))
       Opc = Mips::MFHI, SrcReg = 0;
-    else if (Mips::LORegsRegClass.contains(SrcReg))
+    else if (Mips::LO32RegClass.contains(SrcReg))
       Opc = Mips::MFLO, SrcReg = 0;
-    else if (Mips::HIRegsDSPRegClass.contains(SrcReg))
+    else if (Mips::HI32DSPRegClass.contains(SrcReg))
       Opc = Mips::MFHI_DSP;
-    else if (Mips::LORegsDSPRegClass.contains(SrcReg))
+    else if (Mips::LO32DSPRegClass.contains(SrcReg))
       Opc = Mips::MFLO_DSP;
     else if (Mips::DSPCCRegClass.contains(SrcReg)) {
       BuildMI(MBB, I, DL, get(Mips::RDDSP), DestReg).addImm(1 << 4)
         .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
       return;
     }
+    else if (Mips::MSACtrlRegClass.contains(SrcReg))
+      Opc = Mips::CFCMSA;
   }
   else if (Mips::GPR32RegClass.contains(SrcReg)) { // Copy from CPU Reg.
     if (Mips::CCRRegClass.contains(DestReg))
       Opc = Mips::CTC1;
     else if (Mips::FGR32RegClass.contains(DestReg))
       Opc = Mips::MTC1;
-    else if (Mips::HIRegsRegClass.contains(DestReg))
+    else if (Mips::HI32RegClass.contains(DestReg))
       Opc = Mips::MTHI, DestReg = 0;
-    else if (Mips::LORegsRegClass.contains(DestReg))
+    else if (Mips::LO32RegClass.contains(DestReg))
       Opc = Mips::MTLO, DestReg = 0;
-    else if (Mips::HIRegsDSPRegClass.contains(DestReg))
+    else if (Mips::HI32DSPRegClass.contains(DestReg))
       Opc = Mips::MTHI_DSP;
-    else if (Mips::LORegsDSPRegClass.contains(DestReg))
+    else if (Mips::LO32DSPRegClass.contains(DestReg))
       Opc = Mips::MTLO_DSP;
     else if (Mips::DSPCCRegClass.contains(DestReg)) {
       BuildMI(MBB, I, DL, get(Mips::WRDSP))
@@ -134,6 +127,8 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
         .addReg(DestReg, RegState::ImplicitDefine);
       return;
     }
+    else if (Mips::MSACtrlRegClass.contains(DestReg))
+      Opc = Mips::CTCMSA;
   }
   else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
     Opc = Mips::FMOV_S;
@@ -144,21 +139,25 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   else if (Mips::GPR64RegClass.contains(DestReg)) { // Copy to CPU64 Reg.
     if (Mips::GPR64RegClass.contains(SrcReg))
       Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64;
-    else if (Mips::HIRegs64RegClass.contains(SrcReg))
+    else if (Mips::HI64RegClass.contains(SrcReg))
       Opc = Mips::MFHI64, SrcReg = 0;
-    else if (Mips::LORegs64RegClass.contains(SrcReg))
+    else if (Mips::LO64RegClass.contains(SrcReg))
       Opc = Mips::MFLO64, SrcReg = 0;
     else if (Mips::FGR64RegClass.contains(SrcReg))
       Opc = Mips::DMFC1;
   }
   else if (Mips::GPR64RegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
-    if (Mips::HIRegs64RegClass.contains(DestReg))
+    if (Mips::HI64RegClass.contains(DestReg))
       Opc = Mips::MTHI64, DestReg = 0;
-    else if (Mips::LORegs64RegClass.contains(DestReg))
+    else if (Mips::LO64RegClass.contains(DestReg))
       Opc = Mips::MTLO64, DestReg = 0;
     else if (Mips::FGR64RegClass.contains(DestReg))
       Opc = Mips::DMTC1;
   }
+  else if (Mips::MSA128BRegClass.contains(DestReg)) { // Copy to MSA reg
+    if (Mips::MSA128BRegClass.contains(SrcReg))
+      Opc = Mips::MOVE_V;
+  }
 
   assert(Opc && "Cannot copy registers");
 
@@ -186,23 +185,31 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   unsigned Opc = 0;
 
   if (Mips::GPR32RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::SW_P8 : Mips::SW;
+    Opc = Mips::SW;
   else if (Mips::GPR64RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::SD_P8 : Mips::SD;
-  else if (Mips::ACRegsRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::STORE_AC64_P8 : Mips::STORE_AC64;
-  else if (Mips::ACRegsDSPRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::STORE_AC_DSP_P8 : Mips::STORE_AC_DSP;
-  else if (Mips::ACRegs128RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::STORE_AC128_P8 : Mips::STORE_AC128;
+    Opc = Mips::SD;
+  else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC64;
+  else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC64DSP;
+  else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC128;
   else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::STORE_CCOND_DSP_P8 : Mips::STORE_CCOND_DSP;
+    Opc = Mips::STORE_CCOND_DSP;
   else if (Mips::FGR32RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1;
+    Opc = Mips::SWC1;
   else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
     Opc = Mips::SDC1;
   else if (Mips::FGR64RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::SDC164_P8 : Mips::SDC164;
+    Opc = Mips::SDC164;
+  else if (RC->hasType(MVT::v16i8))
+    Opc = Mips::ST_B;
+  else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+    Opc = Mips::ST_H;
+  else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+    Opc = Mips::ST_W;
+  else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+    Opc = Mips::ST_D;
 
   assert(Opc && "Register class not handled!");
   BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
@@ -219,23 +226,31 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   unsigned Opc = 0;
 
   if (Mips::GPR32RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LW_P8 : Mips::LW;
+    Opc = Mips::LW;
   else if (Mips::GPR64RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LD_P8 : Mips::LD;
-  else if (Mips::ACRegsRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LOAD_AC64_P8 : Mips::LOAD_AC64;
-  else if (Mips::ACRegsDSPRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LOAD_AC_DSP_P8 : Mips::LOAD_AC_DSP;
-  else if (Mips::ACRegs128RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LOAD_AC128_P8 : Mips::LOAD_AC128;
+    Opc = Mips::LD;
+  else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC64;
+  else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC64DSP;
+  else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC128;
   else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LOAD_CCOND_DSP_P8 : Mips::LOAD_CCOND_DSP;
+    Opc = Mips::LOAD_CCOND_DSP;
   else if (Mips::FGR32RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1;
+    Opc = Mips::LWC1;
   else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
     Opc = Mips::LDC1;
   else if (Mips::FGR64RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LDC164_P8 : Mips::LDC164;
+    Opc = Mips::LDC164;
+  else if (RC->hasType(MVT::v16i8))
+    Opc = Mips::LD_B;
+  else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+    Opc = Mips::LD_H;
+  else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+    Opc = Mips::LD_W;
+  else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+    Opc = Mips::LD_D;
 
   assert(Opc && "Register class not handled!");
   BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset)
@@ -251,6 +266,27 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   case Mips::RetRA:
     expandRetRA(MBB, MI, Mips::RET);
     break;
+  case Mips::PseudoMFHI:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFHI);
+    break;
+  case Mips::PseudoMFLO:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFLO);
+    break;
+  case Mips::PseudoMFHI64:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFHI64);
+    break;
+  case Mips::PseudoMFLO64:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFLO64);
+    break;
+  case Mips::PseudoMTLOHI:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO, Mips::MTHI, false);
+    break;
+  case Mips::PseudoMTLOHI64:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO64, Mips::MTHI64, false);
+    break;
+  case Mips::PseudoMTLOHI_DSP:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO_DSP, Mips::MTHI_DSP, true);
+    break;
   case Mips::PseudoCVT_S_W:
     expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
     break;
@@ -267,16 +303,16 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
     expandCvtFPInt(MBB, MI, Mips::CVT_D64_L, Mips::DMTC1, true);
     break;
   case Mips::BuildPairF64:
-    expandBuildPairF64(MBB, MI);
+    expandBuildPairF64(MBB, MI, false);
     break;
-  case Mips::ExtractElementF64:
-    expandExtractElementF64(MBB, MI);
+  case Mips::BuildPairF64_64:
+    expandBuildPairF64(MBB, MI, true);
     break;
-  case Mips::PseudoLDC1:
-    expandDPLoadStore(MBB, MI, Mips::LDC1, Mips::LWC1);
+  case Mips::ExtractElementF64:
+    expandExtractElementF64(MBB, MI, false);
     break;
-  case Mips::PseudoSDC1:
-    expandDPLoadStore(MBB, MI, Mips::SDC1, Mips::SWC1);
+  case Mips::ExtractElementF64_64:
+    expandExtractElementF64(MBB, MI, true);
     break;
   case Mips::MIPSeh_return32:
   case Mips::MIPSeh_return64:
@@ -399,6 +435,41 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc,
   return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize);
 }
 
+void MipsSEInstrInfo::expandPseudoMFHiLo(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned NewOpc) const {
+  BuildMI(MBB, I, I->getDebugLoc(), get(NewOpc), I->getOperand(0).getReg());
+}
+
+void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned LoOpc,
+                                         unsigned HiOpc,
+                                         bool HasExplicitDef) const {
+  // Expand
+  //  lo_hi pseudomtlohi $gpr0, $gpr1
+  // to these two instructions:
+  //  mtlo $gpr0
+  //  mthi $gpr1
+
+  DebugLoc DL = I->getDebugLoc();
+  const MachineOperand &SrcLo = I->getOperand(1), &SrcHi = I->getOperand(2);
+  MachineInstrBuilder LoInst = BuildMI(MBB, I, DL, get(LoOpc));
+  MachineInstrBuilder HiInst = BuildMI(MBB, I, DL, get(HiOpc));
+  LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill()));
+  HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill()));
+
+  // Add lo/hi registers if the mtlo/hi instructions created have explicit
+  // def registers.
+  if (HasExplicitDef) {
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+    unsigned DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi);
+    LoInst.addReg(DstLo, RegState::Define);
+    HiInst.addReg(DstHi, RegState::Define);
+  }
+}
+
 void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I,
                                      unsigned CvtOpc, unsigned MovOpc,
@@ -408,100 +479,63 @@ void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
   unsigned DstReg = Dst.getReg(), SrcReg = Src.getReg(), TmpReg = DstReg;
   unsigned KillSrc =  getKillRegState(Src.isKill());
   DebugLoc DL = I->getDebugLoc();
-  unsigned SubIdx = (IsI64 ? Mips::sub_32 : Mips::sub_fpeven);
   bool DstIsLarger, SrcIsLarger;
 
   tie(DstIsLarger, SrcIsLarger) = compareOpndSize(CvtOpc, *MBB.getParent());
 
   if (DstIsLarger)
-    TmpReg = getRegisterInfo().getSubReg(DstReg, SubIdx);
+    TmpReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
 
   if (SrcIsLarger)
-    DstReg = getRegisterInfo().getSubReg(DstReg, SubIdx);
+    DstReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
 
   BuildMI(MBB, I, DL, MovDesc, TmpReg).addReg(SrcReg, KillSrc);
   BuildMI(MBB, I, DL, CvtDesc, DstReg).addReg(TmpReg, RegState::Kill);
 }
 
 void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator I) const {
+                                              MachineBasicBlock::iterator I,
+                                              bool FP64) const {
   unsigned DstReg = I->getOperand(0).getReg();
   unsigned SrcReg = I->getOperand(1).getReg();
   unsigned N = I->getOperand(2).getImm();
-  const MCInstrDesc& Mfc1Tdd = get(Mips::MFC1);
   DebugLoc dl = I->getDebugLoc();
 
   assert(N < 2 && "Invalid immediate");
-  unsigned SubIdx = N ? Mips::sub_fpodd : Mips::sub_fpeven;
+  unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo;
   unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
 
-  BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(SubReg);
+  if (SubIdx == Mips::sub_hi && FP64)
+    BuildMI(MBB, I, dl, get(Mips::MFHC1), DstReg).addReg(SubReg);
+  else
+    BuildMI(MBB, I, dl, get(Mips::MFC1), DstReg).addReg(SubReg);
 }
 
 void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
-                                       MachineBasicBlock::iterator I) const {
+                                         MachineBasicBlock::iterator I,
+                                         bool FP64) const {
   unsigned DstReg = I->getOperand(0).getReg();
   unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
   const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
   DebugLoc dl = I->getDebugLoc();
   const TargetRegisterInfo &TRI = getRegisterInfo();
 
-  // mtc1 Lo, $fp
-  // mtc1 Hi, $fp + 1
-  BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_fpeven))
-    .addReg(LoReg);
-  BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_fpodd))
-    .addReg(HiReg);
-}
-
-/// Add 4 to the displacement of operand MO.
-static void fixDisp(MachineOperand &MO) {
-  switch (MO.getType()) {
-  default:
-    llvm_unreachable("Unhandled operand type.");
-  case MachineOperand::MO_Immediate:
-    MO.setImm(MO.getImm() + 4);
-    break;
-  case MachineOperand::MO_GlobalAddress:
-  case MachineOperand::MO_ConstantPoolIndex:
-  case MachineOperand::MO_BlockAddress:
-  case MachineOperand::MO_TargetIndex:
-  case MachineOperand::MO_ExternalSymbol:
-    MO.setOffset(MO.getOffset() + 4);
-    break;
-  }
-}
-
-void MipsSEInstrInfo::expandDPLoadStore(MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator I,
-                                        unsigned OpcD, unsigned OpcS) const {
-  // If NoDPLoadStore is false, just change the opcode.
-  if (!NoDPLoadStore) {
-    genInstrWithNewOpc(OpcD, I);
-    return;
-  }
+  // For FP32 mode:
+  //   mtc1 Lo, $fp
+  //   mtc1 Hi, $fp + 1
+  // For FP64 mode:
+  //   mtc1 Lo, $fp
+  //   mthc1 Hi, $fp
 
-  // Expand a double precision FP load or store to two single precision
-  // instructions.
+  BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo))
+    .addReg(LoReg);
 
-  const TargetRegisterInfo &TRI = getRegisterInfo();
-  const MachineOperand &ValReg = I->getOperand(0);
-  unsigned LoReg = TRI.getSubReg(ValReg.getReg(), Mips::sub_fpeven);
-  unsigned HiReg = TRI.getSubReg(ValReg.getReg(), Mips::sub_fpodd);
-
-  if (!TM.getSubtarget<MipsSubtarget>().isLittle())
-    std::swap(LoReg, HiReg);
-
-  // Create an instruction which loads from or stores to the lower memory
-  // address.
-  MachineInstrBuilder MIB = genInstrWithNewOpc(OpcS, I);
-  MIB->getOperand(0).setReg(LoReg);
-
-  // Create an instruction which loads from or stores to the higher memory
-  // address.
-  MIB = genInstrWithNewOpc(OpcS, I);
-  MIB->getOperand(0).setReg(HiReg);
-  fixDisp(MIB->getOperand(2));
+  if (FP64)
+    BuildMI(MBB, I, dl, get(Mips::MTHC1), TRI.getSubReg(DstReg, Mips::sub_hi))
+      .addReg(HiReg);
+  else
+    BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi))
+      .addReg(HiReg);
 }
 
 void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index d962ef0..6d2dd90 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -87,6 +87,13 @@ private:
   std::pair<bool, bool> compareOpndSize(unsigned Opc,
                                         const MachineFunction &MF) const;
 
+  void expandPseudoMFHiLo(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                          unsigned NewOpc) const;
+
+  void expandPseudoMTLoHi(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                          unsigned LoOpc, unsigned HiOpc,
+                          bool HasExplicitDef) const;
+
   /// Expand pseudo Int-to-FP conversion instructions.
   ///
   /// For example, the following pseudo instruction
@@ -101,12 +108,9 @@ private:
                       unsigned CvtOpc, unsigned MovOpc, bool IsI64) const;
 
   void expandExtractElementF64(MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator I) const;
+                               MachineBasicBlock::iterator I, bool FP64) const;
   void expandBuildPairF64(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator I) const;
-  void expandDPLoadStore(MachineBasicBlock &MBB,
-                         MachineBasicBlock::iterator I, unsigned OpcD,
-                         unsigned OpcS) const;
+                          MachineBasicBlock::iterator I, bool FP64) const;
   void expandEhReturn(MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator I) const;
 };
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 286a2e2..2d44084 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -62,6 +62,24 @@ MipsSERegisterInfo::intRegClass(unsigned Size) const {
   return &Mips::GPR64RegClass;
 }
 
+/// Determine whether a given opcode is an MSA load/store (supporting 10-bit
+/// offsets) or a non-MSA load/store (supporting 16-bit offsets).
+static inline bool isMSALoadOrStore(const unsigned Opcode) {
+  switch (Opcode) {
+  case Mips::LD_B:
+  case Mips::LD_H:
+  case Mips::LD_W:
+  case Mips::LD_D:
+  case Mips::ST_B:
+  case Mips::ST_H:
+  case Mips::ST_W:
+  case Mips::ST_D:
+    return true;
+  default:
+    return false;
+  }
+}
+
 void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
                                      unsigned OpNo, int FrameIndex,
                                      uint64_t StackSize,
@@ -111,23 +129,49 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
 
   DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
 
-  // If MI is not a debug value, make sure Offset fits in the 16-bit immediate
-  // field.
-  if (!MI.isDebugValue() && !isInt<16>(Offset)) {
-    MachineBasicBlock &MBB = *MI.getParent();
-    DebugLoc DL = II->getDebugLoc();
-    unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
-    unsigned NewImm;
-    const MipsSEInstrInfo &TII =
-      *static_cast<const MipsSEInstrInfo*>(
-        MBB.getParent()->getTarget().getInstrInfo());
-    unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL, &NewImm);
-    BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
-      .addReg(Reg, RegState::Kill);
-
-    FrameReg = Reg;
-    Offset = SignExtend64<16>(NewImm);
-    IsKill = true;
+  if (!MI.isDebugValue()) {
+    // Make sure Offset fits within the field available.
+    // For MSA instructions, this is a 10-bit signed immediate, otherwise it is
+    // a 16-bit signed immediate.
+    unsigned OffsetBitSize = isMSALoadOrStore(MI.getOpcode()) ? 10 : 16;
+
+    if (OffsetBitSize == 10 && !isInt<10>(Offset) && isInt<16>(Offset)) {
+      // If we have an offset that needs to fit into a signed 10-bit immediate
+      // and doesn't, but does fit into 16-bits then use an ADDiu
+      MachineBasicBlock &MBB = *MI.getParent();
+      DebugLoc DL = II->getDebugLoc();
+      unsigned ADDiu = Subtarget.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
+      const TargetRegisterClass *RC =
+          Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+      MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+      unsigned Reg = RegInfo.createVirtualRegister(RC);
+      const MipsSEInstrInfo &TII =
+          *static_cast<const MipsSEInstrInfo *>(
+               MBB.getParent()->getTarget().getInstrInfo());
+      BuildMI(MBB, II, DL, TII.get(ADDiu), Reg).addReg(FrameReg).addImm(Offset);
+
+      FrameReg = Reg;
+      Offset = 0;
+      IsKill = true;
+    } else if (!isInt<16>(Offset)) {
+      // Otherwise split the offset into 16-bit pieces and add it in multiple
+      // instructions.
+      MachineBasicBlock &MBB = *MI.getParent();
+      DebugLoc DL = II->getDebugLoc();
+      unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+      unsigned NewImm = 0;
+      const MipsSEInstrInfo &TII =
+          *static_cast<const MipsSEInstrInfo *>(
+               MBB.getParent()->getTarget().getInstrInfo());
+      unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL,
+                                       OffsetBitSize == 16 ? &NewImm : NULL);
+      BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
+        .addReg(Reg, RegState::Kill);
+
+      FrameReg = Reg;
+      Offset = SignExtend64<16>(NewImm);
+      IsKill = true;
+    }
   }
 
   MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 541e2ca..0a81072 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -53,6 +53,12 @@ Mips16HardFloat("mips16-hard-float", cl::NotHidden,
                 cl::desc("MIPS: mips16 hard float enable."),
                 cl::init(false));
 
+static cl::opt<bool>
+Mips16ConstantIslands(
+  "mips16-constant-islands", cl::Hidden,
+  cl::desc("MIPS: mips16 constant islands enable. experimental feature"),
+  cl::init(false));
+
 void MipsSubtarget::anchor() { }
 
 MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
@@ -65,7 +71,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
   HasBitCount(false), HasFPIdx(false),
   InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
   InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
-  AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
+  AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false),
   RM(_RM), OverrideMode(NoOverride), TM(_TM)
 {
   std::string CPUName = CPU;
@@ -89,12 +95,20 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
           (hasMips64() && (isABI_N32() || isABI_N64()))) &&
          "Invalid  Arch & ABI pair.");
 
+  if (hasMSA() && !isFP64bit())
+    report_fatal_error("MSA requires a 64-bit FPU register file (FR=1 mode). "
+                       "See -mattr=+fp64.",
+                       false);
+
   // Is the target system Linux ?
   if (TT.find("linux") == std::string::npos)
     IsLinux = false;
 
   // Set UseSmallSection.
   UseSmallSection = !IsLinux && (RM == Reloc::Static);
+  // set some subtarget specific features
+  if (inMips16Mode())
+    HasBitCount=false;
 }
 
 bool
@@ -152,3 +166,11 @@ void MipsSubtarget::resetSubtarget(MachineFunction *MF) {
   }
 }
 
+bool MipsSubtarget::mipsSEUsesSoftFloat() const {
+  return TM->Options.UseSoftFloat && !InMips16HardFloat;
+}
+
+bool MipsSubtarget::useConstantIslands() {
+  DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands << "\n");
+  return Mips16ConstantIslands;
+}
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index bfb13bb..6b2ab12 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -113,6 +113,9 @@ protected:
   // compiled as Mips32
   bool Os16;
 
+  // HasMSA -- supports MSA ASE.
+  bool HasMSA;
+
   InstrItineraryData InstrItins;
 
   // The instance to the register info section object
@@ -157,6 +160,7 @@ public:
 
   bool isLittle() const { return IsLittle; }
   bool isFP64bit() const { return IsFP64bit; }
+  bool isNotFP64bit() const { return !IsFP64bit; }
   bool isGP64bit() const { return IsGP64bit; }
   bool isGP32bit() const { return !IsGP64bit; }
   bool isSingleFloat() const { return IsSingleFloat; }
@@ -182,17 +186,25 @@ public:
   bool inMicroMipsMode() const { return InMicroMipsMode; }
   bool hasDSP() const { return HasDSP; }
   bool hasDSPR2() const { return HasDSPR2; }
+  bool hasMSA() const { return HasMSA; }
   bool isLinux() const { return IsLinux; }
   bool useSmallSection() const { return UseSmallSection; }
 
   bool hasStandardEncoding() const { return !inMips16Mode(); }
 
+  bool mipsSEUsesSoftFloat() const;
+
+  bool enableLongBranchPass() const {
+    return hasStandardEncoding() || allowMixed16_32();
+  }
+
   /// Features related to the presence of specific instructions.
   bool hasSEInReg()   const { return HasSEInReg; }
   bool hasCondMov()   const { return HasCondMov; }
   bool hasSwap()      const { return HasSwap; }
   bool hasBitCount()  const { return HasBitCount; }
   bool hasFPIdx()     const { return HasFPIdx; }
+  bool hasExtractInsert() const { return !inMips16Mode() && hasMips32r2(); }
 
   const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
   bool allowMixed16_32() const { return inMips16ModeDefault() |
@@ -200,6 +212,13 @@ public:
 
   bool os16() const { return Os16;};
 
+// for now constant islands are on for the whole compilation unit but we only
+// really use them if in addition we are in mips16 mode
+//
+static bool useConstantIslands();
+
+  unsigned stackAlignment() const { return hasMips64() ? 16 : 8; }
+
   // Grab MipsRegInfo object
   const MipsReginfo &getMReginfo() const { return MRI; }
 
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index ced6a09..5046c1b 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
 using namespace llvm;
 
 
@@ -134,7 +135,13 @@ namespace {
 class MipsPassConfig : public TargetPassConfig {
 public:
   MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {}
+    : TargetPassConfig(TM, PM) {
+    // The current implementation of long branch pass requires a scratch
+    // register ($at) to be available before branch instructions. Tail merging
+    // can break this requirement, so disable it when long branch pass is
+    // enabled.
+    EnableTailMerge = !getMipsSubtarget().enableLongBranchPass();
+  }
 
   MipsTargetMachine &getMipsTargetMachine() const {
     return getTM<MipsTargetMachine>();
@@ -160,7 +167,7 @@ void MipsPassConfig::addIRPasses() {
     addPass(createMipsOs16(getMipsTargetMachine()));
   if (getMipsSubtarget().inMips16HardFloat())
     addPass(createMips16HardFloat(getMipsTargetMachine()));
-  addPass(createMipsOptimizeMathLibCalls(getMipsTargetMachine()));
+  addPass(createPartiallyInlineLibCallsPass());
 }
 // Install an instruction selector pass using
 // the ISelDag to gen Mips code.
@@ -196,8 +203,7 @@ bool MipsPassConfig::addPreEmitPass() {
   const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
   addPass(createMipsDelaySlotFillerPass(TM));
 
-  if (Subtarget.hasStandardEncoding() ||
-      Subtarget.allowMixed16_32())
+  if (Subtarget.enableLongBranchPass())
     addPass(createMipsLongBranchPass(TM));
   if (Subtarget.inMips16Mode() ||
       Subtarget.allowMixed16_32())
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
new file mode 100644
index 0000000..96966fd
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -0,0 +1,44 @@
+//===-- MipsTargetStreamer.h - Mips Target Streamer ------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETSTREAMER_H
+#define MIPSTARGETSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class MipsTargetStreamer : public MCTargetStreamer {
+  virtual void anchor();
+
+public:
+  virtual void emitMipsHackELFFlags(unsigned Flags) = 0;
+  virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) = 0;
+};
+
+// This part is for ascii assembly output
+class MipsTargetAsmStreamer : public MipsTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  MipsTargetAsmStreamer(formatted_raw_ostream &OS);
+  virtual void emitMipsHackELFFlags(unsigned Flags);
+  virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val);
+};
+
+// This part is for ELF object output
+class MipsTargetELFStreamer : public MipsTargetStreamer {
+public:
+  MCELFStreamer &getStreamer();
+  virtual void emitMipsHackELFFlags(unsigned Flags);
+  virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val);
+};
+}
+
+#endif
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index c7b8aa4..d5be0e4 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -18,6 +18,7 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
@@ -277,3 +278,12 @@ void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
     printOperand(MI, OpNum + 1, O);
   }
 }
+
+void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  assert(Op.isExpr() && "Call prototype is not an MCExpr?");
+  const MCExpr *Expr = Op.getExpr();
+  const MCSymbol &Sym = cast<MCSymbolRefExpr>(Expr)->getSymbol();
+  O << Sym.getName();
+}
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
index e0f44da..93029ae 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
@@ -44,7 +44,8 @@ public:
                      raw_ostream &O, const char *Modifier = 0);
   void printMemOperand(const MCInst *MI, int OpNum,
                        raw_ostream &O, const char *Modifier = 0);
-
+  void printProtoIdent(const MCInst *MI, int OpNum,
+                       raw_ostream &O, const char *Modifier = 0);
 };
 
 }
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index dfa1ff5..f2784b8 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -35,8 +35,6 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) {
 
   PrivateGlobalPrefix = "$L__";
 
-  AllowPeriodsInName = false;
-
   HasSetDirective = false;
 
   HasSingleParameterDotFile = false;
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index a2b9bec..7552fe7 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -20,6 +20,7 @@
 #include "NVPTXRegisterInfo.h"
 #include "NVPTXTargetMachine.h"
 #include "NVPTXUtilities.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
 #include "cl_common_defines.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/ConstantFolding.h"
@@ -47,21 +48,17 @@
 #include <sstream>
 using namespace llvm;
 
-bool RegAllocNilUsed = true;
-
 #define DEPOTNAME "__local_depot"
 
 static cl::opt<bool>
-EmitLineNumbers("nvptx-emit-line-numbers",
+EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
                 cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
                 cl::init(true));
 
-namespace llvm { bool InterleaveSrcInPtx = false; }
-
-static cl::opt<bool, true>
-InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore,
+static cl::opt<bool>
+InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden,
               cl::desc("NVPTX Specific: Emit source line in ptx file"),
-              cl::location(llvm::InterleaveSrcInPtx));
+              cl::init(false));
 
 namespace {
 /// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
@@ -129,7 +126,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
     return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
 
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
-    return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx);
+    return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
     return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
@@ -294,7 +291,7 @@ void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
     return;
 
   // Emit the line from the source file.
-  if (llvm::InterleaveSrcInPtx)
+  if (InterleaveSrc)
     this->emitSrcInText(fileName.str(), curLoc.getLine());
 
   std::stringstream temp;
@@ -317,6 +314,14 @@ void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
   OutMI.setOpcode(MI->getOpcode());
 
+  // Special: Do not mangle symbol operand of CALL_PROTOTYPE
+  if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) {
+    const MachineOperand &MO = MI->getOperand(0);
+    OutMI.addOperand(GetSymbolRef(MO,
+      OutContext.GetOrCreateSymbol(Twine(MO.getSymbolName()))));
+    return;
+  }
+
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
 
@@ -344,7 +349,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
     MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
     break;
   case MachineOperand::MO_GlobalAddress:
-    MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal()));
+    MCOp = GetSymbolRef(MO, getSymbol(MO.getGlobal()));
     break;
   case MachineOperand::MO_FPImmediate: {
     const ConstantFP *Cnt = MO.getFPImm();
@@ -550,6 +555,19 @@ void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
   VRegMapping.clear();
 }
 
+void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
+  unsigned RegNo = MI->getOperand(0).getReg();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  if (TRI->isVirtualRegister(RegNo)) {
+    OutStreamer.AddComment(Twine("implicit-def: ") +
+                           getVirtualRegisterName(RegNo));
+  } else {
+    OutStreamer.AddComment(Twine("implicit-def: ") +
+                           TM.getRegisterInfo()->getName(RegNo));
+  }
+  OutStreamer.AddBlankLine();
+}
+
 void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
                                                    raw_ostream &O) const {
   // If the NVVM IR has some of reqntid* specified, then output
@@ -601,23 +619,30 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
     O << ".minnctapersm " << mincta << "\n";
 }
 
-void NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec,
-                                             raw_ostream &O) {
-  const TargetRegisterClass *RC = MRI->getRegClass(vr);
+std::string
+NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
 
-  DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
-  unsigned mapped_vr = regmap[vr];
+  std::string Name;
+  raw_string_ostream NameStr(Name);
 
-  if (!isVec) {
-    O << getNVPTXRegClassStr(RC) << mapped_vr;
-    return;
-  }
-  report_fatal_error("Bad register!");
+  VRegRCMap::const_iterator I = VRegMapping.find(RC);
+  assert(I != VRegMapping.end() && "Bad register class");
+  const DenseMap<unsigned, unsigned> &RegMap = I->second;
+
+  VRegMap::const_iterator VI = RegMap.find(Reg);
+  assert(VI != RegMap.end() && "Bad virtual register");
+  unsigned MappedVR = VI->second;
+
+  NameStr << getNVPTXRegClassStr(RC) << MappedVR;
+
+  NameStr.flush();
+  return Name;
 }
 
-void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr, bool isVec,
+void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr,
                                           raw_ostream &O) {
-  getVirtualRegisterName(vr, isVec, O);
+  O << getVirtualRegisterName(vr);
 }
 
 void NVPTXAsmPrinter::printVecModifiedImmediate(
@@ -660,7 +685,7 @@ void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
   else
     O << ".func ";
   printReturnValStr(F, O);
-  O << *Mang->getSymbol(F) << "\n";
+  O << *getSymbol(F) << "\n";
   emitFunctionParamList(F, O);
   O << ";\n";
 }
@@ -870,7 +895,7 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
   const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
       .Initialize(OutContext, TM);
 
-  Mang = new Mangler(OutContext, &TM);
+  Mang = new Mangler(&TM);
 
   // Emit header before any dwarf directives are emitted below.
   emitHeader(M, OS1);
@@ -1190,7 +1215,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
     else
       O << getPTXFundamentalTypeStr(ETy, false);
     O << " ";
-    O << *Mang->getSymbol(GVar);
+    O << *getSymbol(GVar);
 
     // Ptx allows variable initilization only for constant and global state
     // spaces.
@@ -1226,15 +1251,15 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           bufferAggregateConstant(Initializer, &aggBuffer);
           if (aggBuffer.numSymbols) {
             if (nvptxSubtarget.is64Bit()) {
-              O << " .u64 " << *Mang->getSymbol(GVar) << "[";
+              O << " .u64 " << *getSymbol(GVar) << "[";
               O << ElementSize / 8;
             } else {
-              O << " .u32 " << *Mang->getSymbol(GVar) << "[";
+              O << " .u32 " << *getSymbol(GVar) << "[";
               O << ElementSize / 4;
             }
             O << "]";
           } else {
-            O << " .b8 " << *Mang->getSymbol(GVar) << "[";
+            O << " .b8 " << *getSymbol(GVar) << "[";
             O << ElementSize;
             O << "]";
           }
@@ -1242,7 +1267,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           aggBuffer.print();
           O << "}";
         } else {
-          O << " .b8 " << *Mang->getSymbol(GVar);
+          O << " .b8 " << *getSymbol(GVar);
           if (ElementSize) {
             O << "[";
             O << ElementSize;
@@ -1250,7 +1275,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           }
         }
       } else {
-        O << " .b8 " << *Mang->getSymbol(GVar);
+        O << " .b8 " << *getSymbol(GVar);
         if (ElementSize) {
           O << "[";
           O << ElementSize;
@@ -1357,7 +1382,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
     O << " .";
     O << getPTXFundamentalTypeStr(ETy);
     O << " ";
-    O << *Mang->getSymbol(GVar);
+    O << *getSymbol(GVar);
     return;
   }
 
@@ -1372,7 +1397,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
   case Type::ArrayTyID:
   case Type::VectorTyID:
     ElementSize = TD->getTypeStoreSize(ETy);
-    O << " .b8 " << *Mang->getSymbol(GVar) << "[";
+    O << " .b8 " << *getSymbol(GVar) << "[";
     if (ElementSize) {
       O << itostr(ElementSize);
     }
@@ -1427,7 +1452,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
                                      int paramIndex, raw_ostream &O) {
   if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
       (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA))
-    O << *Mang->getSymbol(I->getParent()) << "_param_" << paramIndex;
+    O << *getSymbol(I->getParent()) << "_param_" << paramIndex;
   else {
     std::string argName = I->getName();
     const char *p = argName.c_str();
@@ -1486,13 +1511,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
       if (llvm::isImage(*I)) {
         std::string sname = I->getName();
         if (llvm::isImageWriteOnly(*I))
-          O << "\t.param .surfref " << *Mang->getSymbol(F) << "_param_"
+          O << "\t.param .surfref " << *getSymbol(F) << "_param_"
             << paramIndex;
         else // Default image is read_only
-          O << "\t.param .texref " << *Mang->getSymbol(F) << "_param_"
+          O << "\t.param .texref " << *getSymbol(F) << "_param_"
             << paramIndex;
       } else // Should be llvm::isSampler(*I)
-        O << "\t.param .samplerref " << *Mang->getSymbol(F) << "_param_"
+        O << "\t.param .samplerref " << *getSymbol(F) << "_param_"
           << paramIndex;
       continue;
     }
@@ -1739,13 +1764,13 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
     return;
   }
   if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
-    O << *Mang->getSymbol(GVar);
+    O << *getSymbol(GVar);
     return;
   }
   if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
     const Value *v = Cexpr->stripPointerCasts();
     if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
-      O << *Mang->getSymbol(GVar);
+      O << *getSymbol(GVar);
       return;
     } else {
       O << *LowerConstant(CPV, *this);
@@ -1863,7 +1888,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
   case Type::VectorTyID:
   case Type::StructTyID: {
     if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) ||
-        isa<ConstantStruct>(CPV)) {
+        isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) {
       int ElementSize = TD->getTypeAllocSize(CPV->getType());
       bufferAggregateConstant(CPV, aggBuffer);
       if (Bytes > ElementSize)
@@ -1993,6 +2018,116 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
   return false;
 }
 
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'r':
+      break;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+
+  return false;
+}
+
+bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
+    const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,
+    const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier
+
+  O << '[';
+  printMemOperand(MI, OpNo, O);
+  O << ']';
+
+  return false;
+}
+
+void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      if (MO.getReg() == NVPTX::VRDepot)
+        O << DEPOTNAME << getFunctionNumber();
+      else
+        O << NVPTXInstPrinter::getRegisterName(MO.getReg());
+    } else {
+      emitVirtualRegister(MO.getReg(), O);
+    }
+    return;
+
+  case MachineOperand::MO_Immediate:
+    if (!Modifier)
+      O << MO.getImm();
+    else if (strstr(Modifier, "vec") == Modifier)
+      printVecModifiedImmediate(MO, Modifier, O);
+    else
+      llvm_unreachable(
+          "Don't know how to handle modifier on immediate operand");
+    return;
+
+  case MachineOperand::MO_FPImmediate:
+    printFPConstant(MO.getFPImm(), O);
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    O << *getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_ExternalSymbol: {
+    const char *symbname = MO.getSymbolName();
+    if (strstr(symbname, ".PARAM") == symbname) {
+      unsigned index;
+      sscanf(symbname + 6, "%u[];", &index);
+      printParamName(index, O);
+    } else if (strstr(symbname, ".HLPPARAM") == symbname) {
+      unsigned index;
+      sscanf(symbname + 9, "%u[];", &index);
+      O << *CurrentFnSym << "_param_" << index << "_offset";
+    } else
+      O << symbname;
+    break;
+  }
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+
+  default:
+    llvm_unreachable("Operand type not supported.");
+  }
+}
+
+void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O, const char *Modifier) {
+  printOperand(MI, opNum, O);
+
+  if (Modifier && !strcmp(Modifier, "add")) {
+    O << ", ";
+    printOperand(MI, opNum + 1, O);
+  } else {
+    if (MI->getOperand(opNum + 1).isImm() &&
+        MI->getOperand(opNum + 1).getImm() == 0)
+      return; // don't print ',0' or '+0'
+    O << "+";
+    printOperand(MI, opNum + 1, O);
+  }
+}
+
+
 // Force static initialization.
 extern "C" void LLVMInitializeNVPTXBackendAsmPrinter() {
   RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 27bfa54..3abe5d1 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -155,7 +155,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
           if (pos == nextSymbolPos) {
             const Value *v = Symbols[nSym];
             if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
-              MCSymbol *Name = AP.Mang->getSymbol(GVar);
+              MCSymbol *Name = AP.getSymbol(GVar);
               O << *Name;
             } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
               O << *nvptx::LowerConstant(Cexpr, AP);
@@ -188,6 +188,7 @@ private:
   void EmitFunctionEntryLabel();
   void EmitFunctionBodyStart();
   void EmitFunctionBodyEnd();
+  void emitImplicitDef(const MachineInstr *MI) const;
 
   void EmitInstruction(const MachineInstr *);
   void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
@@ -213,7 +214,7 @@ private:
   void emitGlobals(const Module &M);
   void emitHeader(Module &M, raw_ostream &O);
   void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
-  void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O);
+  void emitVirtualRegister(unsigned int vr, raw_ostream &);
   void emitFunctionExternParamList(const MachineFunction &MF);
   void emitFunctionParamList(const Function *, raw_ostream &O);
   void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
@@ -222,7 +223,14 @@ private:
   bool isImageType(const Type *Ty);
   void printReturnValStr(const Function *, raw_ostream &O);
   void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
-
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &);
+  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                    const char *Modifier = 0);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &);
 protected:
   bool doInitialization(Module &M);
   bool doFinalization(Module &M);
@@ -287,7 +295,7 @@ public:
 
   bool ignoreLoc(const MachineInstr &);
 
-  virtual void getVirtualRegisterName(unsigned, bool, raw_ostream &);
+  std::string getVirtualRegisterName(unsigned) const;
 
   DebugLoc prevDebugLoc;
   void emitLineNumberAsDotLoc(const MachineInstr &);
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 9f92a5b..9fb0dd8 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -142,7 +142,7 @@ bool GenericToNVVM::runOnModule(Module &M) {
     GlobalVariable *GV = I->first;
     GlobalVariable *NewGV = I->second;
     ++I;
-    Constant *BitCastNewGV = ConstantExpr::getBitCast(NewGV, GV->getType());
+    Constant *BitCastNewGV = ConstantExpr::getPointerCast(NewGV, GV->getType());
     // At this point, the remaining uses of GV should be found only in global
     // variable initializers, as other uses have been already been removed
     // while walking through the instructions in function definitions.
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ba85e35..4b8b306 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -26,24 +26,24 @@
 using namespace llvm;
 
 static cl::opt<int>
-FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore,
+FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                  cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
                           " 1: do it  2: do it aggressively"),
                  cl::init(2));
 
 static cl::opt<int> UsePrecDivF32(
-    "nvptx-prec-divf32", cl::ZeroOrMore,
+    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
     cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
              " IEEE Compliant F32 div.rnd if avaiable."),
     cl::init(2));
 
 static cl::opt<bool>
-UsePrecSqrtF32("nvptx-prec-sqrtf32",
+UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden,
           cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
           cl::init(true));
 
 static cl::opt<bool>
-FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore,
+FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
            cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
            cl::init(false));
 
@@ -118,8 +118,10 @@ bool NVPTXDAGToDAGISel::useF32FTZ() const {
 /// expanded, promoted and normal instructions.
 SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
 
-  if (N->isMachineOpcode())
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL; // Already selected.
+  }
 
   SDNode *ResNode = NULL;
   switch (N->getOpcode()) {
@@ -249,7 +251,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
   SDValue Addr;
   SDValue Offset, Base;
   unsigned Opcode;
-  MVT::SimpleValueType TargetVT = LD->getValueType(0).getSimpleVT().SimpleTy;
+  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
 
   if (SelectDirectAddr(N1, Addr)) {
     switch (TargetVT) {
@@ -1347,8 +1349,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
   SDValue Addr;
   SDValue Offset, Base;
   unsigned Opcode;
-  MVT::SimpleValueType SourceVT =
-      N1.getNode()->getValueType(0).getSimpleVT().SimpleTy;
+  MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
 
   if (SelectDirectAddr(N2, Addr)) {
     switch (SourceVT) {
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 828242d..6a8be75 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -310,6 +310,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "NVPTXISD::CallSeqBegin";
   case NVPTXISD::CallSeqEnd:
     return "NVPTXISD::CallSeqEnd";
+  case NVPTXISD::CallPrototype:
+    return "NVPTXISD::CallPrototype";
   case NVPTXISD::LoadV2:
     return "NVPTXISD::LoadV2";
   case NVPTXISD::LoadV4:
@@ -471,22 +473,47 @@ NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
                                           Type *Ty,
                                           unsigned Idx) const {
   const DataLayout *TD = getDataLayout();
-  unsigned align = 0;
-  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+  unsigned Align = 0;
+  const Value *DirectCallee = CS->getCalledFunction();
+
+  if (!DirectCallee) {
+    // We don't have a direct function symbol, but that may be because of
+    // constant cast instructions in the call.
+    const Instruction *CalleeI = CS->getInstruction();
+    assert(CalleeI && "Call target is not a function or derived value?");
+
+    // With bitcast'd call targets, the instruction will be the call
+    if (isa<CallInst>(CalleeI)) {
+      // Check if we have call alignment metadata
+      if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
+        return Align;
+
+      const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
+      // Ignore any bitcast instructions
+      while(isa<ConstantExpr>(CalleeV)) {
+        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
+        if (!CE->isCast())
+          break;
+        // Look through the bitcast
+        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
+      }
 
-  if (Func) { // direct call
-    assert(CS->getCalledFunction() &&
-           "direct call cannot find callee");
-    if (!llvm::getAlign(*(CS->getCalledFunction()), Idx, align))
-      align = TD->getABITypeAlignment(Ty);
-  }
-  else { // indirect call
-    const CallInst *CallI = dyn_cast<CallInst>(CS->getInstruction());
-    if (!llvm::getAlign(*CallI, Idx, align))
-      align = TD->getABITypeAlignment(Ty);
+      // We have now looked past all of the bitcasts.  Do we finally have a
+      // Function?
+      if (isa<Function>(CalleeV))
+        DirectCallee = CalleeV;
+    }
   }
 
-  return align;
+  // Check for function alignment information if we found that the
+  // ultimate target is a Function
+  if (DirectCallee)
+    if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
+      return Align;
+
+  // Call is indirect or alignment information is not available, fall back to
+  // the ABI type alignment
+  return TD->getABITypeAlignment(Ty);
 }
 
 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
@@ -860,18 +887,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
     // to be emitted, and the label has to used as the last arg of call
     // instruction.
-    // The prototype is embedded in a string and put as the operand for an
-    // INLINEASM SDNode.
-    SDVTList InlineAsmVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-    std::string proto_string =
-        getPrototype(retTy, Args, Outs, retAlignment, CS);
-    const char *asmstr = nvTM->getManagedStrPool()
-        ->getManagedString(proto_string.c_str())->c_str();
-    SDValue InlineAsmOps[] = {
-      Chain, DAG.getTargetExternalSymbol(asmstr, getPointerTy()),
-      DAG.getMDNode(0), DAG.getTargetConstant(0, MVT::i32), InFlag
+    // The prototype is embedded in a string and put as the operand for a
+    // CallPrototype SDNode which will print out to the value of the string.
+    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS);
+    const char *ProtoStr =
+      nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
+    SDValue ProtoOps[] = {
+      Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
     };
-    Chain = DAG.getNode(ISD::INLINEASM, dl, InlineAsmVTs, InlineAsmOps, 5);
+    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, &ProtoOps[0], 3);
     InFlag = Chain.getValue(1);
   }
   // Op to just print "call"
@@ -1595,7 +1620,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
             }
             Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
           }
-          InsIdx += VecSize;
+          InsIdx += NumElts;
         }
 
         if (NumElts > 0)
@@ -2263,3 +2288,29 @@ void NVPTXTargetLowering::ReplaceNodeResults(
     return;
   }
 }
+
+// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
+void NVPTXSection::anchor() {}
+
+NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
+  delete TextSection;
+  delete DataSection;
+  delete BSSSection;
+  delete ReadOnlySection;
+
+  delete StaticCtorSection;
+  delete StaticDtorSection;
+  delete LSDASection;
+  delete EHFrameSection;
+  delete DwarfAbbrevSection;
+  delete DwarfInfoSection;
+  delete DwarfLineSection;
+  delete DwarfFrameSection;
+  delete DwarfPubTypesSection;
+  delete DwarfDebugInlineSection;
+  delete DwarfStrSection;
+  delete DwarfLocSection;
+  delete DwarfARangesSection;
+  delete DwarfRangesSection;
+  delete DwarfMacroInfoSection;
+}
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 3418437..66e708f 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -49,6 +49,7 @@ enum NodeType {
   RETURN,
   CallSeqBegin,
   CallSeqEnd,
+  CallPrototype,
   Dummy,
 
   LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index b406aa9..86ddd38 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -14,17 +14,19 @@
 #include "NVPTX.h"
 #include "NVPTXInstrInfo.h"
 #include "NVPTXTargetMachine.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "NVPTXGenInstrInfo.inc"
 #include "llvm/IR/Function.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include <cstdio>
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void NVPTXInstrInfo::anchor() {}
+
 // FIXME: Add the subtarget support on this constructor.
 NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
     : NVPTXGenInstrInfo(), TM(tm), RegInfo(*TM.getSubtargetImpl()) {}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index b1972e9..600fc5c 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -26,6 +26,7 @@ namespace llvm {
 class NVPTXInstrInfo : public NVPTXGenInstrInfo {
   NVPTXTargetMachine &TM;
   const NVPTXRegisterInfo RegInfo;
+  virtual void anchor();
 public:
   explicit NVPTXInstrInfo(NVPTXTargetMachine &TM);
 
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3e430bf..b23f1e4 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2607,6 +2607,20 @@ def trapinst : NVPTXInst<(outs), (ins),
                          "trap;",
                          [(trap)]>;
 
+// Call prototype wrapper
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def CallPrototype
+  : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def ProtoIdent : Operand<i32> {
+  let PrintMethod = "printProtoIdent";
+}
+def CALL_PROTOTYPE
+  : NVPTXInst<(outs), (ins ProtoIdent:$ident),
+              "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+
+
+
 include "NVPTXIntrinsics.td"
 
 
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
index e57ace9..f8a692e 100644
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ b/lib/Target/NVPTX/NVPTXSection.h
@@ -24,10 +24,10 @@ namespace llvm {
 /// the ASMPrint interface.
 ///
 class NVPTXSection : public MCSection {
-
+  virtual void anchor();
 public:
   NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {}
-  ~NVPTXSection() {}
+  virtual ~NVPTXSection() {}
 
   /// Override this as NVPTX has its own way of printing switching
   /// to a section.
diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
index 83dfe12..b64c308 100644
--- a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
+++ b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
@@ -36,7 +36,7 @@ bool NVPTXSplitBBatBar::runOnFunction(Function &F) {
     BasicBlock::iterator II = IB;
     BasicBlock::iterator IE = BI->end();
 
-    // Skit the first intruction. No splitting is needed at this
+    // Skit the first instruction. No splitting is needed at this
     // point even if this is a bar.
     while (II != IE) {
       if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) {
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index c4d0d6e..9771a17 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -20,6 +20,9 @@
 using namespace llvm;
 
 
+// Pin the vtable to this file.
+void NVPTXSubtarget::anchor() {}
+
 NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
                                const std::string &FS, bool is64Bit)
     : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0),
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 670077d..004be11 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -25,7 +25,7 @@
 namespace llvm {
 
 class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
-
+  virtual void anchor();
   std::string TargetName;
   NVPTX::DrvInterface drvInterface;
   bool Is64Bit;
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 72afe8d..46edd6d 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -57,9 +57,6 @@ extern "C" void LLVMInitializeNVPTXTarget() {
   RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32);
   RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64);
 
-  RegisterMCAsmInfo<NVPTXMCAsmInfo> A(TheNVPTXTarget32);
-  RegisterMCAsmInfo<NVPTXMCAsmInfo> B(TheNVPTXTarget64);
-
   // FIXME: This pass is really intended to be invoked during IR optimization,
   // but it's very NVPTX-specific.
   initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
@@ -129,6 +126,7 @@ void NVPTXPassConfig::addIRPasses() {
   disablePass(&PrologEpilogCodeInserterID);
   disablePass(&MachineCopyPropagationID);
   disablePass(&BranchFolderPassID);
+  disablePass(&TailDuplicateID);
 
   TargetPassConfig::addIRPasses();
   addPass(createGenericToNVVMPass());
@@ -154,10 +152,30 @@ FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
 
 void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
   assert(!RegAllocPass && "NVPTX uses no regalloc!");
-  addPass(&StrongPHIEliminationID);
+  addPass(&PHIEliminationID);
+  addPass(&TwoAddressInstructionPassID);
 }
 
 void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
   assert(!RegAllocPass && "NVPTX uses no regalloc!");
-  addPass(&StrongPHIEliminationID);
+
+  addPass(&ProcessImplicitDefsID);
+  addPass(&LiveVariablesID);
+  addPass(&MachineLoopInfoID);
+  addPass(&PHIEliminationID);
+
+  addPass(&TwoAddressInstructionPassID);
+  addPass(&RegisterCoalescerID);
+
+  // PreRA instruction scheduling.
+  if (addPass(&MachineSchedulerID))
+    printAndVerify("After Machine Scheduling");
+
+
+  addPass(&StackSlotColoringID);
+
+  // FIXME: Needs physical registers
+  //addPass(&PostRAMachineLICMID);
+
+  printAndVerify("After StackSlotColoring");
 }
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index bfd6ab1..2a7394b 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -44,30 +44,10 @@ public:
     DwarfMacroInfoSection = 0;
   }
 
-  ~NVPTXTargetObjectFile() {
-    delete TextSection;
-    delete DataSection;
-    delete BSSSection;
-    delete ReadOnlySection;
-
-    delete StaticCtorSection;
-    delete StaticDtorSection;
-    delete LSDASection;
-    delete EHFrameSection;
-    delete DwarfAbbrevSection;
-    delete DwarfInfoSection;
-    delete DwarfLineSection;
-    delete DwarfFrameSection;
-    delete DwarfPubTypesSection;
-    delete DwarfDebugInlineSection;
-    delete DwarfStrSection;
-    delete DwarfLocSection;
-    delete DwarfARangesSection;
-    delete DwarfRangesSection;
-    delete DwarfMacroInfoSection;
-  }
+  virtual ~NVPTXTargetObjectFile();
 
   virtual void Initialize(MCContext &ctx, const TargetMachine &TM) {
+    TargetLoweringObjectFile::Initialize(ctx, TM);
     TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
     DataSection =
         new NVPTXSection(MCSection::SV_ELF, SectionKind::getDataRel());
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 3cc324b..7406207 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -79,7 +79,7 @@ ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) {
 }
 
 static cl::opt<bool>
-NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true),
+NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
                    cl::desc("NVVM reflection, enabled by default"));
 
 char NVVMReflect::ID = 0;
@@ -88,7 +88,7 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
                 false)
 
 static cl::list<std::string>
-ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"),
+ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
             cl::desc("A list of string=num assignments"),
             cl::ValueRequired);
 
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index a8f7509..fe83fe1 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -13,6 +13,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
@@ -174,6 +175,7 @@ struct PPCOperand;
 class PPCAsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
+  const MCInstrInfo &MII;
   bool IsPPC64;
 
   MCAsmParser &getParser() const { return Parser; }
@@ -218,8 +220,9 @@ class PPCAsmParser : public MCTargetAsmParser {
 
 
 public:
-  PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
-    : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+  PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+               const MCInstrInfo &_MII)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(_MII) {
     // Check for 64-bit vs. 32-bit pointer mode.
     Triple TheTriple(STI.getTargetTriple());
     IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
@@ -235,6 +238,10 @@ public:
   virtual bool ParseDirective(AsmToken DirectiveID);
 
   unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind);
+
+  virtual const MCExpr *applyModifierToExpr(const MCExpr *E,
+                                            MCSymbolRefExpr::VariantKind,
+                                            MCContext &Ctx);
 };
 
 /// PPCOperand - Instances of this class represent a parsed PowerPC machine
@@ -900,19 +907,19 @@ MatchRegisterName(const AsmToken &Tok, unsigned &RegNo, int64_t &IntVal) {
       RegNo = PPC::VRSAVE;
       IntVal = 256;
       return false;
-    } else if (Name.substr(0, 1).equals_lower("r") &&
+    } else if (Name.startswith_lower("r") &&
                !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
       RegNo = isPPC64()? XRegs[IntVal] : RRegs[IntVal];
       return false;
-    } else if (Name.substr(0, 1).equals_lower("f") &&
+    } else if (Name.startswith_lower("f") &&
                !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
       RegNo = FRegs[IntVal];
       return false;
-    } else if (Name.substr(0, 1).equals_lower("v") &&
+    } else if (Name.startswith_lower("v") &&
                !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
       RegNo = VRegs[IntVal];
       return false;
-    } else if (Name.substr(0, 2).equals_lower("cr") &&
+    } else if (Name.startswith_lower("cr") &&
                !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) {
       RegNo = CRRegs[IntVal];
       return false;
@@ -1353,6 +1360,8 @@ unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
   switch (Kind) {
     case MCK_0: ImmVal = 0; break;
     case MCK_1: ImmVal = 1; break;
+    case MCK_2: ImmVal = 2; break;
+    case MCK_3: ImmVal = 3; break;
     default: return Match_InvalidOperand;
   }
 
@@ -1363,3 +1372,26 @@ unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
   return Match_InvalidOperand;
 }
 
+const MCExpr *
+PPCAsmParser::applyModifierToExpr(const MCExpr *E,
+                                  MCSymbolRefExpr::VariantKind Variant,
+                                  MCContext &Ctx) {
+  switch (Variant) {
+  case MCSymbolRefExpr::VK_PPC_LO:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_LO, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HI:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HI, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HA:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HA, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHER:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHERA:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHERA, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHEST:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHEST, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx);
+  default:
+    return 0;
+  }
+}
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 08d7665..8281b5c 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -18,9 +18,17 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
 using namespace llvm;
 
+// FIXME: Once the integrated assembler supports full register names, tie this
+// to the verbose-asm setting.
+static cl::opt<bool>
+FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false),
+             cl::desc("Use full register names when printing assembly"));
+
 #include "PPCGenAsmWriter.inc"
 
 void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
@@ -78,6 +86,17 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
     }
   }
   
+  // For fast-isel, a COPY_TO_REGCLASS may survive this long.  This is
+  // used when converting a 32-bit float to a 64-bit float as part of
+  // conversion to an integer (see PPCFastISel.cpp:SelectFPToI()),
+  // as otherwise we have problems with incorrect register classes
+  // in machine instruction verification.  For now, just avoid trying
+  // to print it as such an instruction has no effect (a 32-bit float
+  // in a register is already in 64-bit form, just with lower
+  // precision).  FIXME: Is there a better solution?
+  if (MI->getOpcode() == TargetOpcode::COPY_TO_REGCLASS)
+    return;
+  
   printInstruction(MI, O);
   printAnnotation(O, Annot);
 }
@@ -285,6 +304,9 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
 /// stripRegisterPrefix - This method strips the character prefix from a
 /// register name so that only the number is left.  Used by for linux asm.
 static const char *stripRegisterPrefix(const char *RegName) {
+  if (FullRegNames)
+    return RegName;
+
   switch (RegName[0]) {
   case 'r':
   case 'f':
diff --git a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
index 45be471..3efa5ec 100644
--- a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_library(LLVMPowerPCDesc
   PPCMCCodeEmitter.cpp
   PPCMCExpr.cpp
   PPCPredicates.cpp
+  PPCMachObjectWriter.cpp
   PPCELFObjectWriter.cpp
   )
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index b2a8701..0d42081 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -16,9 +16,9 @@
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
@@ -69,19 +69,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
 }
 
 namespace {
-class PPCMachObjectWriter : public MCMachObjectTargetWriter {
-public:
-  PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType,
-                      uint32_t CPUSubtype)
-    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
-
-  void RecordRelocation(MachObjectWriter *Writer,
-                        const MCAssembler &Asm, const MCAsmLayout &Layout,
-                        const MCFragment *Fragment, const MCFixup &Fixup,
-                        MCValue Target, uint64_t &FixedValue) {
-    llvm_unreachable("Relocation emission for MachO/PPC unimplemented!");
-  }
-};
 
 class PPCAsmBackend : public MCAsmBackend {
 const Target &TheTarget;
@@ -145,14 +132,17 @@ public:
   }
 
   bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
-    // Can't emit NOP with size not multiple of 32-bits
-    if (Count % 4 != 0)
-      return false;
-
     uint64_t NumNops = Count / 4;
     for (uint64_t i = 0; i != NumNops; ++i)
       OW->Write32(0x60000000);
 
+    switch (Count % 4) {
+    default: break; // No leftover bytes to write
+    case 1: OW->Write8(0); break;
+    case 2: OW->Write16(0); break;
+    case 3: OW->Write16(0); OW->Write8(0); break;
+    }
+
     return true;
   }
 
@@ -174,12 +164,11 @@ namespace {
 
     MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
       bool is64 = getPointerSize() == 8;
-      return createMachObjectWriter(new PPCMachObjectWriter(
-                                      /*Is64Bit=*/is64,
-                                      (is64 ? object::mach::CTM_PowerPC64 :
-                                       object::mach::CTM_PowerPC),
-                                      object::mach::CSPPC_ALL),
-                                    OS, /*IsLittleEndian=*/false);
+      return createPPCMachObjectWriter(
+          OS,
+          /*Is64Bit=*/is64,
+          (is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
+          MachO::CPU_SUBTYPE_POWERPC_ALL);
     }
 
     virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
@@ -206,10 +195,9 @@ namespace {
 
 } // end anonymous namespace
 
-
-
-
-MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        StringRef TT, StringRef CPU) {
   if (Triple(TT).isOSDarwin())
     return new DarwinPPCAsmBackend(T);
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 6822507..f3dddce 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -22,7 +22,6 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
   }
   IsLittleEndian = false;
 
-  PCSymbol = ".";
   CommentString = ";";
   ExceptionsType = ExceptionHandling::DwarfCFI;
 
@@ -47,15 +46,14 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
   CommentString = "#";
   GlobalPrefix = "";
   PrivateGlobalPrefix = ".L";
-  WeakRefDirective = "\t.weak\t";
-  
+
   // Uses '.section' before '.bss' directive
   UsesELFSectionDirectiveForBSS = true;  
 
   // Debug Information
   SupportsDebugInformation = true;
 
-  PCSymbol = ".";
+  DollarIsPC = true;
 
   // Set up DWARF directives
   HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 7b4ed9f..1530e77 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -15,6 +15,7 @@
 #define PPCTARGETASMINFO_H
 
 #include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
 
@@ -24,7 +25,7 @@ namespace llvm {
     explicit PPCMCAsmInfoDarwin(bool is64Bit);
   };
 
-  class PPCLinuxMCAsmInfo : public MCAsmInfo {
+  class PPCLinuxMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
     explicit PPCLinuxMCAsmInfo(bool is64Bit);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 59ba9c4..346a9be 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
 using namespace llvm;
 
 STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
@@ -76,11 +77,17 @@ public:
                                  SmallVectorImpl<MCFixup> &Fixups) const;
   void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups) const {
+    // For fast-isel, a float COPY_TO_REGCLASS can survive this long.
+    // It's just a nop to keep the register classes happy, so don't
+    // generate anything.
+    unsigned Opcode = MI.getOpcode();
+    if (Opcode == TargetOpcode::COPY_TO_REGCLASS)
+      return;
+
     uint64_t Bits = getBinaryCodeForInstr(MI, Fixups);
 
     // BL8_NOP etc. all have a size of 8 because of the following 'nop'.
     unsigned Size = 4; // FIXME: Have Desc.getSize() return the correct value!
-    unsigned Opcode = MI.getOpcode();
     if (Opcode == PPC::BL8_NOP || Opcode == PPC::BLA8_NOP ||
         Opcode == PPC::BL8_NOP_TLS)
       Size = 8;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 9529267..d7e8402 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -54,7 +54,7 @@ PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
                                      const MCAsmLayout *Layout) const {
   MCValue Value;
 
-  if (!getSubExpr()->EvaluateAsRelocatable(Value, *Layout))
+  if (!Layout || !getSubExpr()->EvaluateAsRelocatable(Value, *Layout))
     return false;
 
   if (Value.isAbsolute()) {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 5f7a39a..f18d095 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -14,13 +14,16 @@
 #include "PPCMCTargetDesc.h"
 #include "InstPrinter/PPCInstPrinter.h"
 #include "PPCMCAsmInfo.h"
+#include "PPCTargetStreamer.h"
 #include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
@@ -34,6 +37,9 @@
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+PPCTargetStreamer::~PPCTargetStreamer() {}
+
 static MCInstrInfo *createPPCMCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitPPCMCInstrInfo(X);
@@ -101,6 +107,29 @@ static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM,
   return X;
 }
 
+namespace {
+class PPCTargetAsmStreamer : public PPCTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  PPCTargetAsmStreamer(formatted_raw_ostream &OS) : OS(OS) {}
+  virtual void emitTCEntry(const MCSymbol &S) {
+    OS << "\t.tc ";
+    OS << S.getName();
+    OS << "[TC],";
+    OS << S.getName();
+    OS << '\n';
+  }
+};
+
+class PPCTargetELFStreamer : public PPCTargetStreamer {
+  virtual void emitTCEntry(const MCSymbol &S) {
+    // Creates a R_PPC64_TOC relocation
+    Streamer->EmitSymbolValue(&S, 8);
+  }
+};
+}
+
 // This is duplicated code. Refactor this.
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCContext &Ctx, MCAsmBackend &MAB,
@@ -111,7 +140,20 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
   if (Triple(TT).isOSDarwin())
     return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
 
-  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+  PPCTargetStreamer *S = new PPCTargetELFStreamer();
+  return createELFStreamer(Ctx, S, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCStreamer *
+createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                    bool isVerboseAsm, bool useLoc, bool useCFI,
+                    bool useDwarfDirectory, MCInstPrinter *InstPrint,
+                    MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) {
+  PPCTargetStreamer *S = new PPCTargetAsmStreamer(OS);
+
+  return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
+                                 useDwarfDirectory, InstPrint, CE, TAB,
+                                 ShowInst);
 }
 
 static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
@@ -171,6 +213,11 @@ extern "C" void LLVMInitializePowerPCTargetMC() {
   TargetRegistry::RegisterMCObjectStreamer(ThePPC64Target, createMCStreamer);
   TargetRegistry::RegisterMCObjectStreamer(ThePPC64LETarget, createMCStreamer);
 
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(ThePPC32Target, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(ThePPC64Target, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(ThePPC64LETarget, createMCAsmStreamer);
+
   // Register the MCInstPrinter.
   TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter);
   TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 9f29132..0b0ca24 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -40,12 +40,17 @@ MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
                                       const MCSubtargetInfo &STI,
                                       MCContext &Ctx);
 
-MCAsmBackend *createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  StringRef TT, StringRef CPU);
 
 /// createPPCELFObjectWriter - Construct an PPC ELF object writer.
 MCObjectWriter *createPPCELFObjectWriter(raw_ostream &OS,
                                          bool Is64Bit,
                                          uint8_t OSABI);
+/// createPPCELFObjectWriter - Construct a PPC Mach-O object writer.
+MCObjectWriter *createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
 } // End llvm namespace
 
 // Generated files will use "namespace PPC". To avoid symbol clash,
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
new file mode 100644
index 0000000..bbafe2e
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -0,0 +1,389 @@
+//===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+class PPCMachObjectWriter : public MCMachObjectTargetWriter {
+  bool RecordScatteredRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup, MCValue Target,
+                                 unsigned Log2Size, uint64_t &FixedValue);
+
+  void RecordPPCRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                           const MCAsmLayout &Layout,
+                           const MCFragment *Fragment, const MCFixup &Fixup,
+                           MCValue Target, uint64_t &FixedValue);
+
+public:
+  PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+                                 /*UseAggressiveSymbolFolding=*/Is64Bit) {}
+
+  void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) {
+    if (Writer->is64Bit()) {
+      report_fatal_error("Relocation emission for MachO/PPC64 unimplemented.");
+    } else
+      RecordPPCRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                          FixedValue);
+  }
+};
+}
+
+/// computes the log2 of the size of the relocation,
+/// used for relocation_info::r_length.
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default:
+    report_fatal_error("log2size(FixupKind): Unhandled fixup kind!");
+  case FK_PCRel_1:
+  case FK_Data_1:
+    return 0;
+  case FK_PCRel_2:
+  case FK_Data_2:
+    return 1;
+  case FK_PCRel_4:
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_half16:
+  case PPC::fixup_ppc_br24:
+  case FK_Data_4:
+    return 2;
+  case FK_PCRel_8:
+  case FK_Data_8:
+    return 3;
+  }
+  return 0;
+}
+
+/// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum.
+/// Outline based on PPCELFObjectWriter::getRelocTypeInner().
+static unsigned getRelocType(const MCValue &Target,
+                             const MCFixupKind FixupKind, // from
+                                                          // Fixup.getKind()
+                             const bool IsPCRel) {
+  const MCSymbolRefExpr::VariantKind Modifier =
+      Target.isAbsolute() ? MCSymbolRefExpr::VK_None
+                          : Target.getSymA()->getKind();
+  // determine the type of the relocation
+  unsigned Type = MachO::GENERIC_RELOC_VANILLA;
+  if (IsPCRel) { // relative to PC
+    switch ((unsigned)FixupKind) {
+    default:
+      report_fatal_error("Unimplemented fixup kind (relative)");
+    case PPC::fixup_ppc_br24:
+      Type = MachO::PPC_RELOC_BR24; // R_PPC_REL24
+      break;
+    case PPC::fixup_ppc_brcond14:
+      Type = MachO::PPC_RELOC_BR14;
+      break;
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default:
+        llvm_unreachable("Unsupported modifier for half16 fixup");
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = MachO::PPC_RELOC_HA16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = MachO::PPC_RELOC_LO16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = MachO::PPC_RELOC_HI16;
+        break;
+      }
+      break;
+    }
+  } else {
+    switch ((unsigned)FixupKind) {
+    default:
+      report_fatal_error("Unimplemented fixup kind (absolute)!");
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default:
+        llvm_unreachable("Unsupported modifier for half16 fixup");
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = MachO::PPC_RELOC_HA16_SECTDIFF;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = MachO::PPC_RELOC_LO16_SECTDIFF;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = MachO::PPC_RELOC_HI16_SECTDIFF;
+        break;
+      }
+      break;
+    case FK_Data_4:
+      break;
+    case FK_Data_2:
+      break;
+    }
+  }
+  return Type;
+}
+
+static void makeRelocationInfo(MachO::any_relocation_info &MRE,
+                               const uint32_t FixupOffset, const uint32_t Index,
+                               const unsigned IsPCRel, const unsigned Log2Size,
+                               const unsigned IsExtern, const unsigned Type) {
+  MRE.r_word0 = FixupOffset;
+  // The bitfield offsets that work (as determined by trial-and-error)
+  // are different than what is documented in the mach-o manuals.
+  // This appears to be an endianness issue; reversing the order of the
+  // documented bitfields in <llvm/Support/MachO.h> fixes this (but
+  // breaks x86/ARM assembly).
+  MRE.r_word1 = ((Index << 8) |    // was << 0
+                 (IsPCRel << 7) |  // was << 24
+                 (Log2Size << 5) | // was << 25
+                 (IsExtern << 4) | // was << 27
+                 (Type << 0));     // was << 28
+}
+
+static void
+makeScatteredRelocationInfo(MachO::any_relocation_info &MRE,
+                            const uint32_t Addr, const unsigned Type,
+                            const unsigned Log2Size, const unsigned IsPCRel,
+                            const uint32_t Value2) {
+  // For notes on bitfield positions and endianness, see:
+  // https://developer.apple.com/library/mac/documentation/developertools/conceptual/MachORuntime/Reference/reference.html#//apple_ref/doc/uid/20001298-scattered_relocation_entry
+  MRE.r_word0 = ((Addr << 0) | (Type << 24) | (Log2Size << 28) |
+                 (IsPCRel << 30) | MachO::R_SCATTERED);
+  MRE.r_word1 = Value2;
+}
+
+/// Compute fixup offset (address).
+static uint32_t getFixupOffset(const MCAsmLayout &Layout,
+                               const MCFragment *Fragment,
+                               const MCFixup &Fixup) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+  // On Mach-O, ppc_fixup_half16 relocations must refer to the
+  // start of the instruction, not the second halfword, as ELF does
+  if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16)
+    FixupOffset &= ~uint32_t(3);
+  return FixupOffset;
+}
+
+/// \return false if falling back to using non-scattered relocation,
+/// otherwise true for normal scattered relocation.
+/// based on X86MachObjectWriter::RecordScatteredRelocation
+/// and ARMMachObjectWriter::RecordScatteredRelocation
+bool PPCMachObjectWriter::RecordScatteredRelocation(
+    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    unsigned Log2Size, uint64_t &FixedValue) {
+  // caller already computes these, can we just pass and reuse?
+  const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+  const MCFixupKind FK = Fixup.getKind();
+  const unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+  const unsigned Type = getRelocType(Target, FK, IsPCRel);
+
+  // Is this a local or SECTDIFF relocation entry?
+  // SECTDIFF relocation entries have symbol subtractions,
+  // and require two entries, the first for the add-symbol value,
+  // the second for the subtract-symbol value.
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    report_fatal_error("symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint64_t SecAddr =
+      Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      report_fatal_error("symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // FIXME: is Type correct? see include/llvm/Support/MachO.h
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+  // FIXME: does FixedValue get used??
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == MachO::PPC_RELOC_SECTDIFF ||
+      Type == MachO::PPC_RELOC_HI16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LO16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_HA16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LO14_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LOCAL_SECTDIFF) {
+    // X86 had this piece, but ARM does not
+    // If the offset is too large to fit in a scattered relocation,
+    // we're hosed. It's an unfortunate limitation of the MachO format.
+    if (FixupOffset > 0xffffff) {
+      char Buffer[32];
+      format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  Twine("Section too large, can't encode "
+                                        "r_address (") +
+                                      Buffer + ") into 24 bits of scattered "
+                                               "relocation entry.");
+      llvm_unreachable("fatal error returned?!");
+    }
+
+    // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()?
+    // see PPCMCExpr::EvaluateAsRelocatableImpl()
+    uint32_t other_half = 0;
+    switch (Type) {
+    case MachO::PPC_RELOC_LO16_SECTDIFF:
+      other_half = (FixedValue >> 16) & 0xffff;
+      // applyFixupOffset longer extracts the high part because it now assumes
+      // this was already done.
+      // It looks like this is not true for the FixedValue needed with Mach-O
+      // relocs.
+      // So we need to adjust FixedValue again here.
+      FixedValue &= 0xffff;
+      break;
+    case MachO::PPC_RELOC_HA16_SECTDIFF:
+      other_half = FixedValue & 0xffff;
+      FixedValue =
+          ((FixedValue >> 16) + ((FixedValue & 0x8000) ? 1 : 0)) & 0xffff;
+      break;
+    case MachO::PPC_RELOC_HI16_SECTDIFF:
+      other_half = FixedValue & 0xffff;
+      FixedValue = (FixedValue >> 16) & 0xffff;
+      break;
+    default:
+      llvm_unreachable("Invalid PPC scattered relocation type.");
+      break;
+    }
+
+    MachO::any_relocation_info MRE;
+    makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR,
+                                Log2Size, IsPCRel, Value2);
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  } else {
+    // If the offset is more than 24-bits, it won't fit in a scattered
+    // relocation offset field, so we fall back to using a non-scattered
+    // relocation. This is a bit risky, as if the offset reaches out of
+    // the block and the linker is doing scattered loading on this
+    // symbol, things can go badly.
+    //
+    // Required for 'as' compatibility.
+    if (FixupOffset > 0xffffff)
+      return false;
+  }
+  MachO::any_relocation_info MRE;
+  makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value);
+  Writer->addRelocation(Fragment->getParent(), MRE);
+  return true;
+}
+
+// see PPCELFObjectWriter for a general outline of cases
+void PPCMachObjectWriter::RecordPPCRelocation(
+    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    uint64_t &FixedValue) {
+  const MCFixupKind FK = Fixup.getKind(); // unsigned
+  const unsigned Log2Size = getFixupKindLog2Size(FK);
+  const bool IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+  const unsigned RelocType = getRelocType(Target, FK, IsPCRel);
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry. Differences always require scattered
+  // relocations.
+  if (Target.getSymB() &&
+      // Q: are branch targets ever scattered?
+      RelocType != MachO::PPC_RELOC_BR24 &&
+      RelocType != MachO::PPC_RELOC_BR14) {
+    RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                              Log2Size, FixedValue);
+    return;
+  }
+
+  // this doesn't seem right for RIT_PPC_BR24
+  // Get the symbol data, if any.
+  MCSymbolData *SD = 0;
+  if (Target.getSymA())
+    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+  // See <reloc.h>.
+  const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = RelocType;
+
+  if (Target.isAbsolute()) { // constant
+                             // SymbolNum of 0 indicates the absolute section.
+                             //
+    // FIXME: Currently, these are never generated (see code below). I cannot
+    // find a case where they are actually emitted.
+    report_fatal_error("FIXME: relocations to absolute targets "
+                       "not yet implemented");
+    // the above line stolen from ARM, not sure
+  } else {
+    // Resolve constant variables.
+    if (SD->getSymbol().isVariable()) {
+      int64_t Res;
+      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+              Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (Writer->doesSymbolRequireExternRelocation(SD)) {
+      IsExtern = 1;
+      Index = SD->getIndex();
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!SD->Symbol->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(SD);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD =
+          Asm.getSectionData(SD->getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&SymSD);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+  }
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, IsExtern,
+                     Type);
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
+                                                uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(
+      new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS,
+      /*IsLittleEndian=*/false);
+}
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index 806822c..54e3d40 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -57,6 +57,8 @@ def FeatureMFOCRF    : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
                                         "Enable the MFOCRF instruction">;
 def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
                                         "Enable the fsqrt instruction">;
+def FeatureFCPSGN    : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true",
+                                        "Enable the fcpsgn instruction">;
 def FeatureFRE       : SubtargetFeature<"fre", "HasFRE", "true",
                                         "Enable the fre instruction">;
 def FeatureFRES      : SubtargetFeature<"fres", "HasFRES", "true",
@@ -85,6 +87,13 @@ def FeatureBookE     : SubtargetFeature<"booke", "IsBookE", "true",
                                         "Enable Book E instructions">;
 def FeatureQPX       : SubtargetFeature<"qpx","HasQPX", "true",
                                         "Enable QPX instructions">;
+def FeatureVSX       : SubtargetFeature<"vsx","HasVSX", "true",
+                                        "Enable VSX instructions">;
+
+def DeprecatedMFTB   : SubtargetFeature<"", "DeprecatedMFTB", "true",
+                                        "Treat mftb as deprecated">;
+def DeprecatedDST    : SubtargetFeature<"", "DeprecatedDST", "true",
+  "Treat vector data stream cache control instructions as deprecated">;
 
 // Note: Future features to add when support is extended to more
 // recent ISA levels:
@@ -146,10 +155,10 @@ include "PPCInstrInfo.td"
 def : Processor<"generic", G3Itineraries, [Directive32]>;
 def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL,
                                            FeatureFRES, FeatureFRSQRTE,
-                                           FeatureBookE]>;
+                                           FeatureBookE, DeprecatedMFTB]>;
 def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL,
                                            FeatureFRES, FeatureFRSQRTE,
-                                           FeatureBookE]>;
+                                           FeatureBookE, DeprecatedMFTB]>;
 def : Processor<"601", G3Itineraries, [Directive601]>;
 def : Processor<"602", G3Itineraries, [Directive602]>;
 def : Processor<"603", G3Itineraries, [Directive603,
@@ -185,29 +194,32 @@ def : ProcessorModel<"g5", G5Model,
                   [Directive970, FeatureAltivec,
                    FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
                    FeatureFRES, FeatureFRSQRTE,
-                   Feature64Bit /*, Feature64BitRegs */]>;
+                   Feature64Bit /*, Feature64BitRegs */,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"e500mc", PPCE500mcModel,
                   [DirectiveE500mc, FeatureMFOCRF,
-                   FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+                   FeatureSTFIWX, FeatureBookE, FeatureISEL,
+                   DeprecatedMFTB]>;
 def : ProcessorModel<"e5500", PPCE5500Model,
                   [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
-                   FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+                   FeatureSTFIWX, FeatureBookE, FeatureISEL,
+                   DeprecatedMFTB]>;
 def : ProcessorModel<"a2", PPCA2Model,
                   [DirectiveA2, FeatureBookE, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
                    FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
                    FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
-               /*, Feature64BitRegs */]>;
+               /*, Feature64BitRegs */, DeprecatedMFTB]>;
 def : ProcessorModel<"a2q", PPCA2Model,
                   [DirectiveA2, FeatureBookE, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
                    FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
                    FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
-               /*, Feature64BitRegs */, FeatureQPX]>;
+               /*, Feature64BitRegs */, FeatureQPX, DeprecatedMFTB]>;
 def : ProcessorModel<"pwr3", G5Model,
                   [DirectivePwr3, FeatureAltivec,
                    FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
@@ -220,32 +232,37 @@ def : ProcessorModel<"pwr5", G5Model,
                   [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
                    FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES,
-                   FeatureSTFIWX, Feature64Bit]>;
+                   FeatureSTFIWX, Feature64Bit,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr5x", G5Model,
                   [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
                    FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES,
-                   FeatureSTFIWX, FeatureFPRND, Feature64Bit]>;
+                   FeatureSTFIWX, FeatureFPRND, Feature64Bit,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr6", G5Model,
                   [DirectivePwr6, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+                   FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
                    FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
                    FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
-                   FeatureFPRND, Feature64Bit /*, Feature64BitRegs */]>;
+                   FeatureFPRND, Feature64Bit /*, Feature64BitRegs */,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr6x", G5Model,
                   [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
                    FeatureSTFIWX, FeatureLFIWAX,
-                   FeatureFPRND, Feature64Bit]>;
+                   FeatureFPRND, Feature64Bit,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr7", G5Model,
                   [DirectivePwr7, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+                   FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
                    FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
                    FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
                    FeaturePOPCNTD, FeatureLDBRX,
-                   Feature64Bit /*, Feature64BitRegs */]>;
+                   Feature64Bit /*, Feature64BitRegs */,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : Processor<"ppc", G3Itineraries, [Directive32]>;
 def : ProcessorModel<"ppc64", G5Model,
                   [Directive64, FeatureAltivec,
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index bbfad87..ada34ed 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -23,6 +23,7 @@
 #include "MCTargetDesc/PPCMCExpr.h"
 #include "PPCSubtarget.h"
 #include "PPCTargetMachine.h"
+#include "PPCTargetStreamer.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
@@ -202,7 +203,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
             .getGVStubEntry(SymToPrint);
         if (StubSym.getPointer() == 0)
           StubSym = MachineModuleInfoImpl::
-            StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+            StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
       } else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
                  GV->hasAvailableExternallyLinkage()) {
         SymToPrint = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
@@ -212,12 +213,12 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
                     getHiddenGVStubEntry(SymToPrint);
         if (StubSym.getPointer() == 0)
           StubSym = MachineModuleInfoImpl::
-            StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+            StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
       } else {
-        SymToPrint = Mang->getSymbol(GV);
+        SymToPrint = getSymbol(GV);
       }
     } else {
-      SymToPrint = Mang->getSymbol(GV);
+      SymToPrint = getSymbol(GV);
     }
     
     O << *SymToPrint;
@@ -363,7 +364,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
     MCSymbol *MOSymbol = 0;
     if (MO.isGlobal())
-      MOSymbol = Mang->getSymbol(MO.getGlobal());
+      MOSymbol = getSymbol(MO.getGlobal());
     else if (MO.isCPI())
       MOSymbol = GetCPISymbol(MO.getIndex());
     else if (MO.isJTI())
@@ -402,7 +403,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
       const GlobalValue *RealGValue = GAlias ?
         GAlias->resolveAliasedGlobal(false) : GValue;
-      MOSymbol = Mang->getSymbol(RealGValue);
+      MOSymbol = getSymbol(RealGValue);
       const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
       IsExternal = GVar && !GVar->hasInitializer();
       IsCommon = GVar && RealGValue->hasCommonLinkage();
@@ -413,7 +414,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     else if (MO.isJTI())
       MOSymbol = GetJTISymbol(MO.getIndex());
 
-    if (IsExternal || IsFunction || IsCommon || IsAvailExt || MO.isJTI())
+    if (IsExternal || IsFunction || IsCommon || IsAvailExt || MO.isJTI() ||
+        TM.getCodeModel() == CodeModel::Large)
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
 
     const MCExpr *Exp =
@@ -438,18 +440,22 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     if (MO.isJTI())
       MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
-    else if (MO.isCPI())
+    else if (MO.isCPI()) {
       MOSymbol = GetCPISymbol(MO.getIndex());
+      if (TM.getCodeModel() == CodeModel::Large)
+        MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+    }
     else if (MO.isGlobal()) {
       const GlobalValue *GValue = MO.getGlobal();
       const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
       const GlobalValue *RealGValue = GAlias ?
         GAlias->resolveAliasedGlobal(false) : GValue;
-      MOSymbol = Mang->getSymbol(RealGValue);
+      MOSymbol = getSymbol(RealGValue);
       const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
     
       if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() ||
-          RealGValue->hasAvailableExternallyLinkage())
+          RealGValue->hasAvailableExternallyLinkage() ||
+          TM.getCodeModel() == CodeModel::Large)
         MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
     }
 
@@ -479,14 +485,14 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
       const GlobalValue *RealGValue = GAlias ?
         GAlias->resolveAliasedGlobal(false) : GValue;
-      MOSymbol = Mang->getSymbol(RealGValue);
+      MOSymbol = getSymbol(RealGValue);
       const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
       IsExternal = GVar && !GVar->hasInitializer();
       IsFunction = !GVar;
     } else if (MO.isCPI())
       MOSymbol = GetCPISymbol(MO.getIndex());
 
-    if (IsFunction || IsExternal)
+    if (IsFunction || IsExternal || TM.getCodeModel() == CodeModel::Large)
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
 
     const MCExpr *Exp =
@@ -502,7 +508,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTprel =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
                               OutContext);
@@ -520,7 +526,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     TmpInst.setOpcode(PPC::LD);
     const MachineOperand &MO = MI->getOperand(1);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *Exp =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
                               OutContext);
@@ -534,7 +540,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsGD =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
                               OutContext);
@@ -550,7 +556,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsGD =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO,
                               OutContext);
@@ -571,7 +577,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymVar =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSGD,
                               OutContext);
@@ -586,7 +592,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsLD =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
                               OutContext);
@@ -602,7 +608,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsLD =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO,
                               OutContext);
@@ -623,7 +629,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymVar =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSLD,
                               OutContext);
@@ -638,7 +644,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymDtprel =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
                               OutContext);
@@ -654,7 +660,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymDtprel =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
                               OutContext);
@@ -704,6 +710,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     break;
   case PPC::LD:
   case PPC::STD:
+  case PPC::LWA_32:
   case PPC::LWA: {
     // Verify alignment is legal, so we don't create relocations
     // that can't be supported.
@@ -765,6 +772,9 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
 
   bool isPPC64 = TD->getPointerSizeInBits() == 64;
 
+  PPCTargetStreamer &TS =
+      static_cast<PPCTargetStreamer &>(OutStreamer.getTargetStreamer());
+
   if (isPPC64 && !TOC.empty()) {
     const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".toc",
         ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
@@ -775,7 +785,7 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
          E = TOC.end(); I != E; ++I) {
       OutStreamer.EmitLabel(I->second);
       MCSymbol *S = OutContext.GetOrCreateSymbol(I->first->getName());
-      OutStreamer.EmitTCEntry(*S);
+      TS.emitTCEntry(*S);
     }
   }
 
@@ -1051,7 +1061,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
         MCSymbol *NLPSym = GetSymbolWithGlobalValueBase(*I, "$non_lazy_ptr");
         MachineModuleInfoImpl::StubValueTy &StubSym =
           MMIMacho.getGVStubEntry(NLPSym);
-        StubSym = MachineModuleInfoImpl::StubValueTy(Mang->getSymbol(*I), true);
+        StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true);
       }
     }
   }
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 4e30c537..4224ae2 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -253,12 +253,19 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
           case Intrinsic::sin:
           case Intrinsic::cos:
             return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
+            else
+              continue; // ISD::FCOPYSIGN is never a library call.
           case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
           case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
           case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
           case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
           case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
           case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:     Opcode = ISD::FROUND;     break;
           }
         }
 
@@ -283,8 +290,9 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
           default: return true;
           case LibFunc::copysign:
           case LibFunc::copysignf:
-          case LibFunc::copysignl:
             continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc::copysignl:
+            return true;
           case LibFunc::fabs:
           case LibFunc::fabsf:
           case LibFunc::fabsl:
@@ -309,6 +317,10 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
           case LibFunc::rintf:
           case LibFunc::rintl:
             Opcode = ISD::FRINT; break;
+          case LibFunc::round:
+          case LibFunc::roundf:
+          case LibFunc::roundl:
+            Opcode = ISD::FROUND; break;
           case LibFunc::trunc:
           case LibFunc::truncf:
           case LibFunc::truncl:
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index a584188..e8e7f4c 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -37,6 +37,37 @@ def RetCC_PPC : CallingConv<[
 ]>;
 
 
+// Note that we don't currently have calling conventions for 64-bit
+// PowerPC, but handle all the complexities of the ABI in the lowering
+// logic.  FIXME: See if the logic can be simplified with use of CCs.
+// This may require some extensions to current table generation.
+
+// Simple calling convention for 64-bit ELF PowerPC fast isel.
+// Only handle ints and floats.  All ints are promoted to i64.
+// Vector types and quadword ints are not handled.
+def CC_PPC64_ELF_FIS : CallingConv<[
+  CCIfType<[i8],  CCPromoteToType<i64>>,
+  CCIfType<[i16], CCPromoteToType<i64>>,
+  CCIfType<[i32], CCPromoteToType<i64>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>
+]>;
+
+// Simple return-value convention for 64-bit ELF PowerPC fast isel.
+// All small ints are promoted to i64.  Vector types, quadword ints,
+// and multiple register returns are "supported" to avoid compile
+// errors, but none are handled by the fast selector.
+def RetCC_PPC64_ELF_FIS : CallingConv<[
+  CCIfType<[i8],   CCPromoteToType<i64>>,
+  CCIfType<[i16],  CCPromoteToType<i64>>,
+  CCIfType<[i32],  CCPromoteToType<i64>>,
+  CCIfType<[i64],  CCAssignToReg<[X3, X4]>>,
+  CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+  CCIfType<[f32],  CCAssignToReg<[F1, F2]>>,
+  CCIfType<[f64],  CCAssignToReg<[F1, F2, F3, F4]>>,
+  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>
+]>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC System V Release 4 32-bit ABI
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 8cbf1fb..09117e7 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -37,6 +37,25 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 
+//===----------------------------------------------------------------------===//
+//
+// TBD:
+//   FastLowerArguments: Handle simple cases.
+//   PPCMaterializeGV: Handle TLS.
+//   SelectCall: Handle function pointers.
+//   SelectCall: Handle multi-register return values.
+//   SelectCall: Optimize away nops for local calls.
+//   processCallArgs: Handle bit-converted arguments.
+//   finishCall: Handle multi-register return values.
+//   PPCComputeAddress: Handle parameter references as FrameIndex's.
+//   PPCEmitCmp: Handle immediate as operand 1.
+//   SelectCall: Handle small byval arguments.
+//   SelectIntrinsicCall: Implement.
+//   SelectSelect: Implement.
+//   Consider factoring isTypeLegal into the base class.
+//   Implement switches and jump tables.
+//
+//===----------------------------------------------------------------------===//
 using namespace llvm;
 
 namespace {
@@ -52,7 +71,7 @@ typedef struct Address {
     int FI;
   } Base;
 
-  int Offset;
+  long Offset;
 
   // Innocuous defaults for our address.
   Address()
@@ -89,15 +108,76 @@ class PPCFastISel : public FastISel {
     virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                                      const LoadInst *LI);
     virtual bool FastLowerArguments();
+    virtual unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm);
+    virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Op0, bool Op0IsKill,
+                                     uint64_t Imm);
+    virtual unsigned FastEmitInst_r(unsigned MachineInstOpcode,
+                                    const TargetRegisterClass *RC,
+                                    unsigned Op0, bool Op0IsKill);
+    virtual unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Op0, bool Op0IsKill,
+                                     unsigned Op1, bool Op1IsKill);
+
+  // Instruction selection routines.
+  private:
+    bool SelectLoad(const Instruction *I);
+    bool SelectStore(const Instruction *I);
+    bool SelectBranch(const Instruction *I);
+    bool SelectIndirectBr(const Instruction *I);
+    bool SelectCmp(const Instruction *I);
+    bool SelectFPExt(const Instruction *I);
+    bool SelectFPTrunc(const Instruction *I);
+    bool SelectIToFP(const Instruction *I, bool IsSigned);
+    bool SelectFPToI(const Instruction *I, bool IsSigned);
+    bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectCall(const Instruction *I);
+    bool SelectRet(const Instruction *I);
+    bool SelectTrunc(const Instruction *I);
+    bool SelectIntExt(const Instruction *I);
 
   // Utility routines.
   private:
+    bool isTypeLegal(Type *Ty, MVT &VT);
+    bool isLoadTypeLegal(Type *Ty, MVT &VT);
+    bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                    bool isZExt, unsigned DestReg);
+    bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                     const TargetRegisterClass *RC, bool IsZExt = true,
+                     unsigned FP64LoadOpc = PPC::LFD);
+    bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr);
+    bool PPCComputeAddress(const Value *Obj, Address &Addr);
+    void PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
+                            unsigned &IndexReg);
+    bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                           unsigned DestReg, bool IsZExt);
     unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT);
+    unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT);
     unsigned PPCMaterializeInt(const Constant *C, MVT VT);
     unsigned PPCMaterialize32BitInt(int64_t Imm,
                                     const TargetRegisterClass *RC);
     unsigned PPCMaterialize64BitInt(int64_t Imm,
                                     const TargetRegisterClass *RC);
+    unsigned PPCMoveToIntReg(const Instruction *I, MVT VT,
+                             unsigned SrcReg, bool IsSigned);
+    unsigned PPCMoveToFPReg(MVT VT, unsigned SrcReg, bool IsSigned);
+
+  // Call handling routines.
+  private:
+    bool processCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes,
+                         bool IsVarArg);
+    void finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                    const Instruction *I, CallingConv::ID CC,
+                    unsigned &NumBytes, bool IsVarArg);
+    CCAssignFn *usePPC32CCs(unsigned Flag);
 
   private:
   #include "PPCGenFastISel.inc"
@@ -106,10 +186,1601 @@ class PPCFastISel : public FastISel {
 
 } // end anonymous namespace
 
+#include "PPCGenCallingConv.inc"
+
+// Function whose sole purpose is to kill compiler warnings 
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
+  if (Flag == 1)
+    return CC_PPC32_SVR4;
+  else if (Flag == 2)
+    return CC_PPC32_SVR4_ByVal;
+  else if (Flag == 3)
+    return CC_PPC32_SVR4_VarArg;
+  else
+    return RetCC_PPC;
+}
+
+static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
+  switch (Pred) {
+    // These are not representable with any single compare.
+    case CmpInst::FCMP_FALSE:
+    case CmpInst::FCMP_UEQ:
+    case CmpInst::FCMP_UGT:
+    case CmpInst::FCMP_UGE:
+    case CmpInst::FCMP_ULT:
+    case CmpInst::FCMP_ULE:
+    case CmpInst::FCMP_UNE:
+    case CmpInst::FCMP_TRUE:
+    default:
+      return Optional<PPC::Predicate>();
+
+    case CmpInst::FCMP_OEQ:
+    case CmpInst::ICMP_EQ:
+      return PPC::PRED_EQ;
+
+    case CmpInst::FCMP_OGT:
+    case CmpInst::ICMP_UGT:
+    case CmpInst::ICMP_SGT:
+      return PPC::PRED_GT;
+
+    case CmpInst::FCMP_OGE:
+    case CmpInst::ICMP_UGE:
+    case CmpInst::ICMP_SGE:
+      return PPC::PRED_GE;
+
+    case CmpInst::FCMP_OLT:
+    case CmpInst::ICMP_ULT:
+    case CmpInst::ICMP_SLT:
+      return PPC::PRED_LT;
+
+    case CmpInst::FCMP_OLE:
+    case CmpInst::ICMP_ULE:
+    case CmpInst::ICMP_SLE:
+      return PPC::PRED_LE;
+
+    case CmpInst::FCMP_ONE:
+    case CmpInst::ICMP_NE:
+      return PPC::PRED_NE;
+
+    case CmpInst::FCMP_ORD:
+      return PPC::PRED_NU;
+
+    case CmpInst::FCMP_UNO:
+      return PPC::PRED_UN;
+  }
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel, and return its equivalent machine type in VT.
+// FIXME: Copied directly from ARM -- factor into base class?
+bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT Evt = TLI.getValueType(Ty, true);
+
+  // Only handle simple types.
+  if (Evt == MVT::Other || !Evt.isSimple()) return false;
+  VT = Evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel as a load target, and return its equivalent machine type in VT.
+bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT)) return true;
+
+  // If this is a type than can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  if (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) {
+    return true;
+  }
+
+  return false;
+}
+
+// Given a value Obj, create an Address object Addr that represents its
+// address.  Return false if we can't handle it.
+bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
+  const User *U = NULL;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+    default:
+      break;
+    case Instruction::BitCast:
+      // Look through bitcasts.
+      return PPCComputeAddress(U->getOperand(0), Addr);
+    case Instruction::IntToPtr:
+      // Look past no-op inttoptrs.
+      if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::PtrToInt:
+      // Look past no-op ptrtoints.
+      if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::GetElementPtr: {
+      Address SavedAddr = Addr;
+      long TmpOffset = Addr.Offset;
+
+      // Iterate through the GEP folding the constants into offsets where
+      // we can.
+      gep_type_iterator GTI = gep_type_begin(U);
+      for (User::const_op_iterator II = U->op_begin() + 1, IE = U->op_end();
+           II != IE; ++II, ++GTI) {
+        const Value *Op = *II;
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+          const StructLayout *SL = TD.getStructLayout(STy);
+          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+          TmpOffset += SL->getElementOffset(Idx);
+        } else {
+          uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+          for (;;) {
+            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+              // Constant-offset addressing.
+              TmpOffset += CI->getSExtValue() * S;
+              break;
+            }
+            if (canFoldAddIntoGEP(U, Op)) {
+              // A compatible add with a constant operand. Fold the constant.
+              ConstantInt *CI =
+              cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+              TmpOffset += CI->getSExtValue() * S;
+              // Iterate on the other operand.
+              Op = cast<AddOperator>(Op)->getOperand(0);
+              continue;
+            }
+            // Unsupported
+            goto unsupported_gep;
+          }
+        }
+      }
+
+      // Try to grab the base operand now.
+      Addr.Offset = TmpOffset;
+      if (PPCComputeAddress(U->getOperand(0), Addr)) return true;
+
+      // We failed, restore everything and try the other options.
+      Addr = SavedAddr;
+
+      unsupported_gep:
+      break;
+    }
+    case Instruction::Alloca: {
+      const AllocaInst *AI = cast<AllocaInst>(Obj);
+      DenseMap<const AllocaInst*, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+      if (SI != FuncInfo.StaticAllocaMap.end()) {
+        Addr.BaseType = Address::FrameIndexBase;
+        Addr.Base.FI = SI->second;
+        return true;
+      }
+      break;
+    }
+  }
+
+  // FIXME: References to parameters fall through to the behavior
+  // below.  They should be able to reference a frame index since
+  // they are stored to the stack, so we can get "ld rx, offset(r1)"
+  // instead of "addi ry, r1, offset / ld rx, 0(ry)".  Obj will
+  // just contain the parameter.  Try to handle this with a FI.
+
+  // Try to get this in a register if nothing else has worked.
+  if (Addr.Base.Reg == 0)
+    Addr.Base.Reg = getRegForValue(Obj);
+
+  // Prevent assignment of base register to X0, which is inappropriate
+  // for loads and stores alike.
+  if (Addr.Base.Reg != 0)
+    MRI.setRegClass(Addr.Base.Reg, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  return Addr.Base.Reg != 0;
+}
+
+// Fix up some addresses that can't be used directly.  For example, if
+// an offset won't fit in an instruction field, we may need to move it
+// into an index register.
+void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
+                                     unsigned &IndexReg) {
+
+  // Check whether the offset fits in the instruction field.
+  if (!isInt<16>(Addr.Offset))
+    UseOffset = false;
+
+  // If this is a stack pointer and the offset needs to be simplified then
+  // put the alloca address into a register, set the base type back to
+  // register and continue. This should almost never happen.
+  if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) {
+    unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDI8),
+            ResultReg).addFrameIndex(Addr.Base.FI).addImm(0);
+    Addr.Base.Reg = ResultReg;
+    Addr.BaseType = Address::RegBase;
+  }
+
+  if (!UseOffset) {
+    IntegerType *OffsetTy = ((VT == MVT::i32) ? Type::getInt32Ty(*Context)
+                             : Type::getInt64Ty(*Context));
+    const ConstantInt *Offset =
+      ConstantInt::getSigned(OffsetTy, (int64_t)(Addr.Offset));
+    IndexReg = PPCMaterializeInt(Offset, MVT::i64);
+    assert(IndexReg && "Unexpected error in PPCMaterializeInt!");
+  }
+}
+
+// Emit a load instruction if possible, returning true if we succeeded,
+// otherwise false.  See commentary below for how the register class of
+// the load is determined. 
+bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                              const TargetRegisterClass *RC,
+                              bool IsZExt, unsigned FP64LoadOpc) {
+  unsigned Opc;
+  bool UseOffset = true;
+
+  // If ResultReg is given, it determines the register class of the load.
+  // Otherwise, RC is the register class to use.  If the result of the
+  // load isn't anticipated in this block, both may be zero, in which
+  // case we must make a conservative guess.  In particular, don't assign
+  // R0 or X0 to the result register, as the result may be used in a load,
+  // store, add-immediate, or isel that won't permit this.  (Though
+  // perhaps the spill and reload of live-exit values would handle this?)
+  const TargetRegisterClass *UseRC =
+    (ResultReg ? MRI.getRegClass(ResultReg) :
+     (RC ? RC :
+      (VT == MVT::f64 ? &PPC::F8RCRegClass :
+       (VT == MVT::f32 ? &PPC::F4RCRegClass :
+        (VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+         &PPC::GPRC_and_GPRC_NOR0RegClass)))));
+
+  bool Is32BitInt = UseRC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  switch (VT.SimpleTy) {
+    default: // e.g., vector types not handled
+      return false;
+    case MVT::i8:
+      Opc = Is32BitInt ? PPC::LBZ : PPC::LBZ8;
+      break;
+    case MVT::i16:
+      Opc = (IsZExt ?
+             (Is32BitInt ? PPC::LHZ : PPC::LHZ8) : 
+             (Is32BitInt ? PPC::LHA : PPC::LHA8));
+      break;
+    case MVT::i32:
+      Opc = (IsZExt ? 
+             (Is32BitInt ? PPC::LWZ : PPC::LWZ8) :
+             (Is32BitInt ? PPC::LWA_32 : PPC::LWA));
+      if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0))
+        UseOffset = false;
+      break;
+    case MVT::i64:
+      Opc = PPC::LD;
+      assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) && 
+             "64-bit load with 32-bit target??");
+      UseOffset = ((Addr.Offset & 3) == 0);
+      break;
+    case MVT::f32:
+      Opc = PPC::LFS;
+      break;
+    case MVT::f64:
+      Opc = FP64LoadOpc;
+      break;
+  }
+
+  // If necessary, materialize the offset into a register and use
+  // the indexed form.  Also handle stack pointers with special needs.
+  unsigned IndexReg = 0;
+  PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+  if (ResultReg == 0)
+    ResultReg = createResultReg(UseRC);
+
+  // Note: If we still have a frame index here, we know the offset is
+  // in range, as otherwise PPCSimplifyAddress would have converted it
+  // into a RegBase.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+
+    MachineMemOperand *MMO =
+      FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+        MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
+        MFI.getObjectAlignment(Addr.Base.FI));
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+      .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+
+  // Base reg with offset in range.
+  } else if (UseOffset) {
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+      .addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+  // Indexed form.
+  } else {
+    // Get the RR opcode corresponding to the RI one.  FIXME: It would be
+    // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+    // is hard to get at.
+    switch (Opc) {
+      default:        llvm_unreachable("Unexpected opcode!");
+      case PPC::LBZ:    Opc = PPC::LBZX;    break;
+      case PPC::LBZ8:   Opc = PPC::LBZX8;   break;
+      case PPC::LHZ:    Opc = PPC::LHZX;    break;
+      case PPC::LHZ8:   Opc = PPC::LHZX8;   break;
+      case PPC::LHA:    Opc = PPC::LHAX;    break;
+      case PPC::LHA8:   Opc = PPC::LHAX8;   break;
+      case PPC::LWZ:    Opc = PPC::LWZX;    break;
+      case PPC::LWZ8:   Opc = PPC::LWZX8;   break;
+      case PPC::LWA:    Opc = PPC::LWAX;    break;
+      case PPC::LWA_32: Opc = PPC::LWAX_32; break;
+      case PPC::LD:     Opc = PPC::LDX;     break;
+      case PPC::LFS:    Opc = PPC::LFSX;    break;
+      case PPC::LFD:    Opc = PPC::LFDX;    break;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+      .addReg(Addr.Base.Reg).addReg(IndexReg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a load instruction.
+bool PPCFastISel::SelectLoad(const Instruction *I) {
+  // FIXME: No atomic loads are supported.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(0), Addr))
+    return false;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.  This is necessary
+  // to constrain RA from using R0/X0 when this is not legal.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ? MRI.getRegClass(AssignedReg) : 0;
+
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
+    return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Emit a store instruction to store SrcReg at Addr.
+bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
+  assert(SrcReg && "Nothing to store!");
+  unsigned Opc;
+  bool UseOffset = true;
+
+  const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
+  bool Is32BitInt = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  switch (VT.SimpleTy) {
+    default: // e.g., vector types not handled
+      return false;
+    case MVT::i8:
+      Opc = Is32BitInt ? PPC::STB : PPC::STB8;
+      break;
+    case MVT::i16:
+      Opc = Is32BitInt ? PPC::STH : PPC::STH8;
+      break;
+    case MVT::i32:
+      assert(Is32BitInt && "Not GPRC for i32??");
+      Opc = PPC::STW;
+      break;
+    case MVT::i64:
+      Opc = PPC::STD;
+      UseOffset = ((Addr.Offset & 3) == 0);
+      break;
+    case MVT::f32:
+      Opc = PPC::STFS;
+      break;
+    case MVT::f64:
+      Opc = PPC::STFD;
+      break;
+  }
+
+  // If necessary, materialize the offset into a register and use
+  // the indexed form.  Also handle stack pointers with special needs.
+  unsigned IndexReg = 0;
+  PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+
+  // Note: If we still have a frame index here, we know the offset is
+  // in range, as otherwise PPCSimplifyAddress would have converted it
+  // into a RegBase.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    MachineMemOperand *MMO =
+      FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+        MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
+        MFI.getObjectAlignment(Addr.Base.FI));
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)).addReg(SrcReg)
+      .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+
+  // Base reg with offset in range.
+  } else if (UseOffset)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
+      .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+  // Indexed form.
+  else {
+    // Get the RR opcode corresponding to the RI one.  FIXME: It would be
+    // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+    // is hard to get at.
+    switch (Opc) {
+      default:        llvm_unreachable("Unexpected opcode!");
+      case PPC::STB:  Opc = PPC::STBX;  break;
+      case PPC::STH : Opc = PPC::STHX;  break;
+      case PPC::STW : Opc = PPC::STWX;  break;
+      case PPC::STB8: Opc = PPC::STBX8; break;
+      case PPC::STH8: Opc = PPC::STHX8; break;
+      case PPC::STW8: Opc = PPC::STWX8; break;
+      case PPC::STD:  Opc = PPC::STDX;  break;
+      case PPC::STFS: Opc = PPC::STFSX; break;
+      case PPC::STFD: Opc = PPC::STFDX; break;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
+      .addReg(SrcReg).addReg(Addr.Base.Reg).addReg(IndexReg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a store instruction.
+bool PPCFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // FIXME: No atomics loads are supported.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(Op0->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!PPCEmitStore(VT, SrcReg, Addr))
+    return false;
+
+  return true;
+}
+
+// Attempt to fast-select a branch instruction.
+bool PPCFastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *BrBB = FuncInfo.MBB;
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // For now, just try the simplest case where it's fed by a compare.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    Optional<PPC::Predicate> OptPPCPred = getComparePred(CI->getPredicate());
+    if (!OptPPCPred)
+      return false;
+
+    PPC::Predicate PPCPred = OptPPCPred.getValue();
+
+    // Take advantage of fall-through opportunities.
+    if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+      std::swap(TBB, FBB);
+      PPCPred = PPC::InvertPredicate(PPCPred);
+    }
+
+    unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
+
+    if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+                    CondReg))
+      return false;
+
+    BuildMI(*BrBB, FuncInfo.InsertPt, DL, TII.get(PPC::BCC))
+      .addImm(PPCPred).addReg(CondReg).addMBB(TBB);
+    FastEmitBranch(FBB, DL);
+    FuncInfo.MBB->addSuccessor(TBB);
+    return true;
+
+  } else if (const ConstantInt *CI =
+             dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    FastEmitBranch(Target, DL);
+    return true;
+  }
+
+  // FIXME: ARM looks for a case where the block containing the compare
+  // has been split from the block containing the branch.  If this happens,
+  // there is a vreg available containing the result of the compare.  I'm
+  // not sure we can do much, as we've lost the predicate information with
+  // the compare instruction -- we have a 4-bit CR but don't know which bit
+  // to test here.
+  return false;
+}
+
+// Attempt to emit a compare of the two source values.  Signed and unsigned
+// comparisons are supported.  Return false if we can't handle it.
+bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
+                             bool IsZExt, unsigned DestReg) {
+  Type *Ty = SrcValue1->getType();
+  EVT SrcEVT = TLI.getValueType(Ty, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  // See if operand 2 is an immediate encodeable in the compare.
+  // FIXME: Operands are not in canonical order at -O0, so an immediate
+  // operand in position 1 is a lost opportunity for now.  We are
+  // similar to ARM in this regard.
+  long Imm = 0;
+  bool UseImm = false;
+
+  // Only 16-bit integer constants can be represented in compares for 
+  // PowerPC.  Others will be materialized into a register.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(SrcValue2)) {
+    if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
+        SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+      Imm = (IsZExt) ? (long)CIVal.getZExtValue() : (long)CIVal.getSExtValue();
+      if ((IsZExt && isUInt<16>(Imm)) || (!IsZExt && isInt<16>(Imm)))
+        UseImm = true;
+    }
+  }
+
+  unsigned CmpOpc;
+  bool NeedsExt = false;
+  switch (SrcVT.SimpleTy) {
+    default: return false;
+    case MVT::f32:
+      CmpOpc = PPC::FCMPUS;
+      break;
+    case MVT::f64:
+      CmpOpc = PPC::FCMPUD;
+      break;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      NeedsExt = true;
+      // Intentional fall-through.
+    case MVT::i32:
+      if (!UseImm)
+        CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW;
+      else
+        CmpOpc = IsZExt ? PPC::CMPLWI : PPC::CMPWI;
+      break;
+    case MVT::i64:
+      if (!UseImm)
+        CmpOpc = IsZExt ? PPC::CMPLD : PPC::CMPD;
+      else
+        CmpOpc = IsZExt ? PPC::CMPLDI : PPC::CMPDI;
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(SrcValue1);
+  if (SrcReg1 == 0)
+    return false;
+
+  unsigned SrcReg2 = 0;
+  if (!UseImm) {
+    SrcReg2 = getRegForValue(SrcValue2);
+    if (SrcReg2 == 0)
+      return false;
+  }
+
+  if (NeedsExt) {
+    unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+    if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg, IsZExt))
+      return false;
+    SrcReg1 = ExtReg;
+
+    if (!UseImm) {
+      unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+      if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg, IsZExt))
+        return false;
+      SrcReg2 = ExtReg;
+    }
+  }
+
+  if (!UseImm)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc), DestReg)
+      .addReg(SrcReg1).addReg(SrcReg2);
+  else
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc), DestReg)
+      .addReg(SrcReg1).addImm(Imm);
+
+  return true;
+}
+
+// Attempt to fast-select a floating-point extend instruction.
+bool PPCFastISel::SelectFPExt(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT  = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::f32 || DestVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // No code is generated for a FP extend.
+  UpdateValueMap(I, SrcReg);
+  return true;
+}
+
+// Attempt to fast-select a floating-point truncate instruction.
+bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT  = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::f64 || DestVT != MVT::f32)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // Round the result to single precision.
+  unsigned DestReg = createResultReg(&PPC::F4RCRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::FRSP), DestReg)
+    .addReg(SrcReg);
+
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
+// Move an i32 or i64 value in a GPR to an f64 value in an FPR.
+// FIXME: When direct register moves are implemented (see PowerISA 2.08),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+// FIXME: The code here is sloppy for the 4-byte case.  Can use a 4-byte
+// stack slot and 4-byte store/load sequence.  Or just sext the 4-byte
+// case to 8 bytes which produces tighter code but wastes stack space.
+unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
+                                     bool IsSigned) {
+
+  // If necessary, extend 32-bit int to 64-bit.
+  if (SrcVT == MVT::i32) {
+    unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+    if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned))
+      return 0;
+    SrcReg = TmpReg;
+  }
+
+  // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+  Address Addr;
+  Addr.BaseType = Address::FrameIndexBase;
+  Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+  // Store the value from the GPR.
+  if (!PPCEmitStore(MVT::i64, SrcReg, Addr))
+    return 0;
+
+  // Load the integer value into an FPR.  The kind of load used depends
+  // on a number of conditions.
+  unsigned LoadOpc = PPC::LFD;
+
+  if (SrcVT == MVT::i32) {
+    Addr.Offset = 4;
+    if (!IsSigned)
+      LoadOpc = PPC::LFIWZX;
+    else if (PPCSubTarget.hasLFIWAX())
+      LoadOpc = PPC::LFIWAX;
+  }
+
+  const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc))
+    return 0;
+
+  return ResultReg;
+}
+
+// Attempt to fast-select an integer-to-floating-point conversion.
+bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
+  MVT DstVT;
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::f32 && DstVT != MVT::f64)
+    return false;
+
+  Value *Src = I->getOperand(0);
+  EVT SrcEVT = TLI.getValueType(Src->getType(), true);
+  if (!SrcEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  if (SrcVT != MVT::i8  && SrcVT != MVT::i16 &&
+      SrcVT != MVT::i32 && SrcVT != MVT::i64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // We can only lower an unsigned convert if we have the newer
+  // floating-point conversion operations.
+  if (!IsSigned && !PPCSubTarget.hasFPCVT())
+    return false;
+
+  // FIXME: For now we require the newer floating-point conversion operations
+  // (which are present only on P7 and A2 server models) when converting
+  // to single-precision float.  Otherwise we have to generate a lot of
+  // fiddly code to avoid double rounding.  If necessary, the fiddly code
+  // can be found in PPCTargetLowering::LowerINT_TO_FP().
+  if (DstVT == MVT::f32 && !PPCSubTarget.hasFPCVT())
+    return false;
+
+  // Extend the input if necessary.
+  if (SrcVT == MVT::i8 || SrcVT == MVT::i16) {
+    unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+    if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, TmpReg, !IsSigned))
+      return false;
+    SrcVT = MVT::i64;
+    SrcReg = TmpReg;
+  }
+
+  // Move the integer value to an FPR.
+  unsigned FPReg = PPCMoveToFPReg(SrcVT, SrcReg, IsSigned);
+  if (FPReg == 0)
+    return false;
+
+  // Determine the opcode for the conversion.
+  const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+  unsigned DestReg = createResultReg(RC);
+  unsigned Opc;
+
+  if (DstVT == MVT::f32)
+    Opc = IsSigned ? PPC::FCFIDS : PPC::FCFIDUS;
+  else
+    Opc = IsSigned ? PPC::FCFID : PPC::FCFIDU;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+    .addReg(FPReg);
+
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
+// Move the floating-point value in SrcReg into an integer destination
+// register, and return the register (or zero if we can't handle it).
+// FIXME: When direct register moves are implemented (see PowerISA 2.08),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
+                                      unsigned SrcReg, bool IsSigned) {
+  // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+  // Note that if have STFIWX available, we could use a 4-byte stack
+  // slot for i32, but this being fast-isel we'll just go with the
+  // easiest code gen possible.
+  Address Addr;
+  Addr.BaseType = Address::FrameIndexBase;
+  Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+  // Store the value from the FPR.
+  if (!PPCEmitStore(MVT::f64, SrcReg, Addr))
+    return 0;
+
+  // Reload it into a GPR.  If we want an i32, modify the address
+  // to have a 4-byte offset so we load from the right place.
+  if (VT == MVT::i32)
+    Addr.Offset = 4;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ? MRI.getRegClass(AssignedReg) : 0;
+
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned))
+    return 0;
+
+  return ResultReg;
+}
+
+// Attempt to fast-select a floating-point-to-integer conversion.
+bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
+  MVT DstVT, SrcVT;
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::i32 && DstVT != MVT::i64)
+    return false;
+
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+  if (!isTypeLegal(SrcTy, SrcVT))
+    return false;
+
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // Convert f32 to f64 if necessary.  This is just a meaningless copy
+  // to get the register class right.  COPY_TO_REGCLASS is needed since
+  // a COPY from F4RC to F8RC is converted to a F4RC-F4RC copy downstream.
+  const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg);
+  if (InRC == &PPC::F4RCRegClass) {
+    unsigned TmpReg = createResultReg(&PPC::F8RCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(TargetOpcode::COPY_TO_REGCLASS), TmpReg)
+      .addReg(SrcReg).addImm(PPC::F8RCRegClassID);
+    SrcReg = TmpReg;
+  }
+
+  // Determine the opcode for the conversion, which takes place
+  // entirely within FPRs.
+  unsigned DestReg = createResultReg(&PPC::F8RCRegClass);
+  unsigned Opc;
+
+  if (DstVT == MVT::i32)
+    if (IsSigned)
+      Opc = PPC::FCTIWZ;
+    else
+      Opc = PPCSubTarget.hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
+  else
+    Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+    .addReg(SrcReg);
+
+  // Now move the integer value from a float register to an integer register.
+  unsigned IntReg = PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
+  if (IntReg == 0)
+    return false;
+
+  UpdateValueMap(I, IntReg);
+  return true;
+}
+
+// Attempt to fast-select a binary integer operation that isn't already
+// handled automatically.
+bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestVT  = TLI.getValueType(I->getType(), true);
+
+  // We can get here in the case when we have a binary operation on a non-legal
+  // type and the target independent selector doesn't know how to handle it.
+  if (DestVT != MVT::i16 && DestVT != MVT::i8)
+    return false;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.  If there is no register,
+  // make a conservative choice (don't assign R0).
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    (AssignedReg ? MRI.getRegClass(AssignedReg) :
+     &PPC::GPRC_and_GPRC_NOR0RegClass);
+  bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  unsigned Opc;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::ADD:
+      Opc = IsGPRC ? PPC::ADD4 : PPC::ADD8;
+      break;
+    case ISD::OR:
+      Opc = IsGPRC ? PPC::OR : PPC::OR8;
+      break;
+    case ISD::SUB:
+      Opc = IsGPRC ? PPC::SUBF : PPC::SUBF8;
+      break;
+  }
+
+  unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass);
+  unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+  if (SrcReg1 == 0) return false;
+
+  // Handle case of small immediate operand.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(1))) {
+    const APInt &CIVal = ConstInt->getValue();
+    int Imm = (int)CIVal.getSExtValue();
+    bool UseImm = true;
+    if (isInt<16>(Imm)) {
+      switch (Opc) {
+        default:
+          llvm_unreachable("Missing case!");
+        case PPC::ADD4:
+          Opc = PPC::ADDI;
+          MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
+          break;
+        case PPC::ADD8:
+          Opc = PPC::ADDI8;
+          MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
+          break;
+        case PPC::OR:
+          Opc = PPC::ORI;
+          break;
+        case PPC::OR8:
+          Opc = PPC::ORI8;
+          break;
+        case PPC::SUBF:
+          if (Imm == -32768)
+            UseImm = false;
+          else {
+            Opc = PPC::ADDI;
+            MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
+            Imm = -Imm;
+          }
+          break;
+        case PPC::SUBF8:
+          if (Imm == -32768)
+            UseImm = false;
+          else {
+            Opc = PPC::ADDI8;
+            MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
+            Imm = -Imm;
+          }
+          break;
+      }
+
+      if (UseImm) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+          .addReg(SrcReg1).addImm(Imm);
+        UpdateValueMap(I, ResultReg);
+        return true;
+      }
+    }
+  }
+
+  // Reg-reg case.
+  unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+  if (SrcReg2 == 0) return false;
+
+  // Reverse operands for subtract-from.
+  if (ISDOpcode == ISD::SUB)
+    std::swap(SrcReg1, SrcReg2);
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+    .addReg(SrcReg1).addReg(SrcReg2);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Handle arguments to a call that we're attempting to fast-select.
+// Return false if the arguments are too complex for us at the moment.
+bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool IsVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
+
+  // Bail out if we can't handle any of the arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Skip vector arguments for now, as well as long double and
+    // uint128_t, and anything that isn't passed in a register.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 ||
+        !VA.isRegLoc() || VA.needsCustom())
+      return false;
+
+    // Skip bit-converted arguments for now.
+    if (VA.getLocInfo() == CCValAssign::BCvt)
+      return false;
+  }
+
+  // Get a count of how many bytes are to be pushed onto the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+          TII.get(TII.getCallFrameSetupOpcode()))
+    .addImm(NumBytes);
+
+  // Prepare to assign register arguments.  Every argument uses up a
+  // GPR protocol register even if it's passed in a floating-point
+  // register.
+  unsigned NextGPR = PPC::X3;
+  unsigned NextFPR = PPC::F1;
+
+  // Process arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Handle argument promotion and bitcasts.
+    switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::SExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false))
+          llvm_unreachable("Failed to emit a sext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::AExt:
+      case CCValAssign::ZExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true))
+          llvm_unreachable("Failed to emit a zext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::BCvt: {
+        // FIXME: Not yet handled.
+        llvm_unreachable("Should have bailed before getting here!");
+        break;
+      }
+    }
+
+    // Copy this argument to the appropriate register.
+    unsigned ArgReg;
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64) {
+      ArgReg = NextFPR++;
+      ++NextGPR;
+    } else
+      ArgReg = NextGPR++;
+      
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ArgReg).addReg(Arg);
+    RegArgs.push_back(ArgReg);
+  }
+
+  return true;
+}
+
+// For a call that we've determined we can fast-select, finish the
+// call sequence and generate a copy to obtain the return value (if any).
+void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                             const Instruction *I, CallingConv::ID CC,
+                             unsigned &NumBytes, bool IsVarArg) {
+  // Issue CallSEQ_END.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+          TII.get(TII.getCallFrameDestroyOpcode()))
+    .addImm(NumBytes).addImm(0);
+
+  // Next, generate a copy to obtain the return value.
+  // FIXME: No multi-register return values yet, though I don't foresee
+  // any real difficulties there.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    CCValAssign &VA = RVLocs[0];
+    assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    MVT DestVT = VA.getValVT();
+    MVT CopyVT = DestVT;
+
+    // Ints smaller than a register still arrive in a full 64-bit
+    // register, so make sure we recognize this.
+    if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32)
+      CopyVT = MVT::i64;
+
+    unsigned SourcePhysReg = VA.getLocReg();
+    unsigned ResultReg = 0;
+
+    if (RetVT == CopyVT) {
+      const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
+      ResultReg = createResultReg(CpyRC);
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+
+    // If necessary, round the floating result to single precision.
+    } else if (CopyVT == MVT::f64) {
+      ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::FRSP),
+              ResultReg).addReg(SourcePhysReg);
+
+    // If only the low half of a general register is needed, generate
+    // a GPRC copy instead of a G8RC copy.  (EXTRACT_SUBREG can't be
+    // used along the fast-isel path (not lowered), and downstream logic
+    // also doesn't like a direct subreg copy on a physical reg.)
+    } else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) {
+      ResultReg = createResultReg(&PPC::GPRCRegClass);
+      // Convert physical register from G8RC to GPRC.
+      SourcePhysReg -= PPC::X0 - PPC::R0;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+    }
+
+    assert(ResultReg && "ResultReg unset!");
+    UsedRegs.push_back(SourcePhysReg);
+    UpdateValueMap(I, ResultReg);
+  }
+}
+
+// Attempt to fast-select a call instruction.
+bool PPCFastISel::SelectCall(const Instruction *I) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Can't handle inline asm.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (CI->isTailCall())
+    return false;
+
+  // Obtain calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  bool IsVarArg = FTy->isVarArg();
+
+  // Not ready for varargs yet.
+  if (IsVarArg)
+    return false;
+
+  // Handle simple calls for now, with legal return types and
+  // those that can be extended.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+           RetVT != MVT::i8)
+    return false;
+
+  // FIXME: No multi-register return values yet.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 &&
+      RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 &&
+      RetVT != MVT::f64) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    if (RVLocs.size() > 1)
+      return false;
+  }
+
+  // Bail early if more than 8 arguments, as we only currently
+  // handle arguments passed in registers.
+  unsigned NumArgs = CS.arg_size();
+  if (NumArgs > 8)
+    return false;
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+
+  Args.reserve(NumArgs);
+  ArgRegs.reserve(NumArgs);
+  ArgVTs.reserve(NumArgs);
+  ArgFlags.reserve(NumArgs);
+
+  for (ImmutableCallSite::arg_iterator II = CS.arg_begin(), IE = CS.arg_end();
+       II != IE; ++II) {
+    // FIXME: ARM does something for intrinsic calls here, check into that.
+
+    unsigned AttrIdx = II - CS.arg_begin() + 1;
+    
+    // Only handle easy calls for now.  It would be reasonably easy
+    // to handle <= 8-byte structures passed ByVal in registers, but we
+    // have to ensure they are right-justified in the register.
+    if (CS.paramHasAttr(AttrIdx, Attribute::InReg) ||
+        CS.paramHasAttr(AttrIdx, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrIdx, Attribute::Nest) ||
+        CS.paramHasAttr(AttrIdx, Attribute::ByVal))
+      return false;
+
+    ISD::ArgFlagsTy Flags;
+    if (CS.paramHasAttr(AttrIdx, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrIdx, Attribute::ZExt))
+      Flags.setZExt();
+
+    Type *ArgTy = (*II)->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
+      return false;
+
+    if (ArgVT.isVector())
+      return false;
+
+    unsigned Arg = getRegForValue(*II);
+    if (Arg == 0)
+      return false;
+
+    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*II);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Process the arguments.
+  SmallVector<unsigned, 8> RegArgs;
+  unsigned NumBytes;
+
+  if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, IsVarArg))
+    return false;
+
+  // FIXME: No handling for function pointers yet.  This requires
+  // implementing the function descriptor (OPD) setup.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV)
+    return false;
+
+  // Build direct call with NOP for TOC restore.
+  // FIXME: We can and should optimize away the NOP for local calls.
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(PPC::BL8_NOP));
+  // Add callee.
+  MIB.addGlobalAddress(GV);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
+    MIB.addReg(RegArgs[II], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.  Proper
+  // defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  finishCall(RetVT, UsedRegs, I, CC, NumBytes, IsVarArg);
+
+  // Set all unused physregs defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+// Attempt to fast-select a return instruction.
+bool PPCFastISel::SelectRet(const Instruction *I) {
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+  CallingConv::ID CC = F.getCallingConv();
+
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, *Context);
+    CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS);
+    const Value *RV = Ret->getOperand(0);
+    
+    // FIXME: Only one output register for now.
+    if (ValLocs.size() > 1)
+      return false;
+
+    // Special case for returning a constant integer of any size.
+    // Materialize the constant as an i64 and copy it to the return
+    // register.  This avoids an unnecessary extend or truncate.
+    if (isa<ConstantInt>(*RV)) {
+      const Constant *C = cast<Constant>(RV);
+      unsigned SrcReg = PPCMaterializeInt(C, MVT::i64);
+      unsigned RetReg = ValLocs[0].getLocReg();
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+              RetReg).addReg(SrcReg);
+      RetRegs.push_back(RetReg);
+
+    } else {
+      unsigned Reg = getRegForValue(RV);
+
+      if (Reg == 0)
+        return false;
+
+      // Copy the result values into the output registers.
+      for (unsigned i = 0; i < ValLocs.size(); ++i) {
+
+        CCValAssign &VA = ValLocs[i];
+        assert(VA.isRegLoc() && "Can only return in registers!");
+        RetRegs.push_back(VA.getLocReg());
+        unsigned SrcReg = Reg + VA.getValNo();
+
+        EVT RVEVT = TLI.getValueType(RV->getType());
+        if (!RVEVT.isSimple())
+          return false;
+        MVT RVVT = RVEVT.getSimpleVT();
+        MVT DestVT = VA.getLocVT();
+
+        if (RVVT != DestVT && RVVT != MVT::i8 &&
+            RVVT != MVT::i16 && RVVT != MVT::i32)
+          return false;
+      
+        if (RVVT != DestVT) {
+          switch (VA.getLocInfo()) {
+            default:
+              llvm_unreachable("Unknown loc info!");
+            case CCValAssign::Full:
+              llvm_unreachable("Full value assign but types don't match?");
+            case CCValAssign::AExt:
+            case CCValAssign::ZExt: {
+              const TargetRegisterClass *RC =
+                (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+              unsigned TmpReg = createResultReg(RC);
+              if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, true))
+                return false;
+              SrcReg = TmpReg;
+              break;
+            }
+            case CCValAssign::SExt: {
+              const TargetRegisterClass *RC =
+                (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+              unsigned TmpReg = createResultReg(RC);
+              if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, false))
+                return false;
+              SrcReg = TmpReg;
+              break;
+            }
+          }
+        }
+
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                TII.get(TargetOpcode::COPY), RetRegs[i])
+          .addReg(SrcReg);
+      }
+    }
+  }
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(PPC::BLR));
+
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+
+  return true;
+}
+
+// Attempt to emit an integer extend of SrcReg into DestReg.  Both
+// signed and zero extensions are supported.  Return false if we
+// can't handle it.
+bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                unsigned DestReg, bool IsZExt) {
+  if (DestVT != MVT::i32 && DestVT != MVT::i64)
+    return false;
+  if (SrcVT != MVT::i8 && SrcVT != MVT::i16 && SrcVT != MVT::i32)
+    return false;
+
+  // Signed extensions use EXTSB, EXTSH, EXTSW.
+  if (!IsZExt) {
+    unsigned Opc;
+    if (SrcVT == MVT::i8)
+      Opc = (DestVT == MVT::i32) ? PPC::EXTSB : PPC::EXTSB8_32_64;
+    else if (SrcVT == MVT::i16)
+      Opc = (DestVT == MVT::i32) ? PPC::EXTSH : PPC::EXTSH8_32_64;
+    else {
+      assert(DestVT == MVT::i64 && "Signed extend from i32 to i32??");
+      Opc = PPC::EXTSW_32_64;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+      .addReg(SrcReg);
+
+  // Unsigned 32-bit extensions use RLWINM.
+  } else if (DestVT == MVT::i32) {
+    unsigned MB;
+    if (SrcVT == MVT::i8)
+      MB = 24;
+    else {
+      assert(SrcVT == MVT::i16 && "Unsigned extend from i32 to i32??");
+      MB = 16;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::RLWINM),
+            DestReg)
+      .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB).addImm(/*ME=*/31);
+
+  // Unsigned 64-bit extensions use RLDICL (with a 32-bit source).
+  } else {
+    unsigned MB;
+    if (SrcVT == MVT::i8)
+      MB = 56;
+    else if (SrcVT == MVT::i16)
+      MB = 48;
+    else
+      MB = 32;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(PPC::RLDICL_32_64), DestReg)
+      .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select an indirect branch instruction.
+bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
+  unsigned AddrReg = getRegForValue(I->getOperand(0));
+  if (AddrReg == 0)
+    return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::MTCTR8))
+    .addReg(AddrReg);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::BCTR8));
+
+  const IndirectBrInst *IB = cast<IndirectBrInst>(I);
+  for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]);
+
+  return true;
+}
+
+// Attempt to fast-select an integer truncate instruction.
+bool PPCFastISel::SelectTrunc(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT  = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16)
+    return false;
+
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // The only interesting case is when we need to switch register classes.
+  if (SrcVT == MVT::i64) {
+    unsigned ResultReg = createResultReg(&PPC::GPRCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(SrcReg, 0, PPC::sub_32);
+    SrcReg = ResultReg;
+  }
+
+  UpdateValueMap(I, SrcReg);
+  return true;
+}
+
+// Attempt to fast-select an integer extend instruction.
+bool PPCFastISel::SelectIntExt(const Instruction *I) {
+  Type *DestTy = I->getType();
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+
+  bool IsZExt = isa<ZExtInst>(I);
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg) return false;
+
+  EVT SrcEVT, DestEVT;
+  SrcEVT = TLI.getValueType(SrcTy, true);
+  DestEVT = TLI.getValueType(DestTy, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+
+  // If we know the register class needed for the result of this
+  // instruction, use it.  Otherwise pick the register class of the
+  // correct size that does not contain X0/R0, since we don't know
+  // whether downstream uses permit that assignment.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    (AssignedReg ? MRI.getRegClass(AssignedReg) :
+     (DestVT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+      &PPC::GPRC_and_GPRC_NOR0RegClass));
+  unsigned ResultReg = createResultReg(RC);
+
+  if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt))
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
 // Attempt to fast-select an instruction that wasn't handled by
-// the table-generated machinery.  TBD.
+// the table-generated machinery.
 bool PPCFastISel::TargetSelectInstruction(const Instruction *I) {
-  return I && false;
+
+  switch (I->getOpcode()) {
+    case Instruction::Load:
+      return SelectLoad(I);
+    case Instruction::Store:
+      return SelectStore(I);
+    case Instruction::Br:
+      return SelectBranch(I);
+    case Instruction::IndirectBr:
+      return SelectIndirectBr(I);
+    case Instruction::FPExt:
+      return SelectFPExt(I);
+    case Instruction::FPTrunc:
+      return SelectFPTrunc(I);
+    case Instruction::SIToFP:
+      return SelectIToFP(I, /*IsSigned*/ true);
+    case Instruction::UIToFP:
+      return SelectIToFP(I, /*IsSigned*/ false);
+    case Instruction::FPToSI:
+      return SelectFPToI(I, /*IsSigned*/ true);
+    case Instruction::FPToUI:
+      return SelectFPToI(I, /*IsSigned*/ false);
+    case Instruction::Add:
+      return SelectBinaryIntOp(I, ISD::ADD);
+    case Instruction::Or:
+      return SelectBinaryIntOp(I, ISD::OR);
+    case Instruction::Sub:
+      return SelectBinaryIntOp(I, ISD::SUB);
+    case Instruction::Call:
+      if (dyn_cast<IntrinsicInst>(I))
+        return false;
+      return SelectCall(I);
+    case Instruction::Ret:
+      return SelectRet(I);
+    case Instruction::Trunc:
+      return SelectTrunc(I);
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return SelectIntExt(I);
+    // Here add other flavors of Instruction::XXX that automated
+    // cases don't catch.  For example, switches are terminators
+    // that aren't yet handled.
+    default:
+      break;
+  }
+  return false;
 }
 
 // Materialize a floating-point constant into a register, and return
@@ -131,21 +1802,94 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
       MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
       (VT == MVT::f32) ? 4 : 8, Align);
 
-  // For small code model, generate a LDtocCPT.
-  if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
+  unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
+  unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
+  if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocCPT),
-            DestReg)
-      .addConstantPoolIndex(Idx).addReg(PPC::X2).addMemOperand(MMO);
-  else {
+            TmpReg)
+      .addConstantPoolIndex(Idx).addReg(PPC::X2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+      .addImm(0).addReg(TmpReg).addMemOperand(MMO);
+  } else {
     // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)).
-    unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
-    unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDIStocHA),
             TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
-      .addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO)
-      .addReg(TmpReg)
-      .addMemOperand(MMO);
+    // But for large code model, we must generate a LDtocL followed
+    // by the LF[SD].
+    if (CModel == CodeModel::Large) {
+      unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocL),
+              TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+        .addImm(0).addReg(TmpReg2);
+    } else 
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+        .addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO)
+        .addReg(TmpReg)
+        .addMemOperand(MMO);
+  }
+
+  return DestReg;
+}
+
+// Materialize the address of a global value into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
+  assert(VT == MVT::i64 && "Non-address!");
+  const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
+  unsigned DestReg = createResultReg(RC);
+
+  // Global values may be plain old object addresses, TLS object
+  // addresses, constant pool entries, or jump tables.  How we generate
+  // code for these may depend on small, medium, or large code model.
+  CodeModel::Model CModel = TM.getCodeModel();
+
+  // FIXME: Jump tables are not yet required because fast-isel doesn't
+  // handle switches; if that changes, we need them as well.  For now,
+  // what follows assumes everything's a generic (or TLS) global address.
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar) {
+    // If GV is an alias, use the aliasee for determining thread-locality.
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
+  }
+
+  // FIXME: We don't yet handle the complexity of TLS.
+  bool IsTLS = GVar && GVar->isThreadLocal();
+  if (IsTLS)
+    return 0;
+
+  // For small code model, generate a simple TOC load.
+  if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtoc), DestReg)
+      .addGlobalAddress(GV).addReg(PPC::X2);
+  else {
+    // If the address is an externally defined symbol, a symbol with
+    // common or externally available linkage, a function address, or a
+    // jump table address (not yet needed), or if we are generating code
+    // for large code model, we generate:
+    //       LDtocL(GV, ADDIStocHA(%X2, GV))
+    // Otherwise we generate:
+    //       ADDItocL(ADDIStocHA(%X2, GV), GV)
+    // Either way, start with the ADDIStocHA:
+    unsigned HighPartReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDIStocHA),
+            HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
+
+    // !GVar implies a function address.  An external variable is one
+    // without an initializer.
+    // If/when switches are implemented, jump tables should be handled
+    // on the "if" path here.
+    if (CModel == CodeModel::Large || !GVar || !GVar->hasInitializer() ||
+        GVar->hasCommonLinkage() || GVar->hasAvailableExternallyLinkage())
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocL),
+              DestReg).addGlobalAddress(GV).addReg(HighPartReg);
+    else
+      // Otherwise generate the ADDItocL.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDItocL),
+              DestReg).addReg(HighPartReg).addGlobalAddress(GV);
   }
 
   return DestReg;
@@ -283,23 +2027,112 @@ unsigned PPCFastISel::TargetMaterializeConstant(const Constant *C) {
 
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
     return PPCMaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return PPCMaterializeGV(GV, VT);
   else if (isa<ConstantInt>(C))
     return PPCMaterializeInt(C, VT);
-  // TBD: Global values.
 
   return 0;
 }
 
 // Materialize the address created by an alloca into a register, and
-// return the register number (or zero if we failed to handle it).  TBD.
+// return the register number (or zero if we failed to handle it).
 unsigned PPCFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
-  return AI && 0;
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
+
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
+
+  DenseMap<const AllocaInst*, int>::iterator SI =
+    FuncInfo.StaticAllocaMap.find(AI);
+
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDI8),
+            ResultReg).addFrameIndex(SI->second).addImm(0);
+    return ResultReg;
+  }
+
+  return 0;
 }
 
-// Fold loads into extends when possible.  TBD.
+// Fold loads into extends when possible.
+// FIXME: We can have multiple redundant extend/trunc instructions
+// following a load.  The folding only picks up one.  Extend this
+// to check subsequent instructions for the same pattern and remove
+// them.  Thus ResultReg should be the def reg for the last redundant
+// instruction in a chain, and all intervening instructions can be
+// removed from parent.  Change test/CodeGen/PowerPC/fast-isel-fold.ll
+// to add ELF64-NOT: rldicl to the appropriate tests when this works.
 bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                                       const LoadInst *LI) {
-  return MI && OpNo && LI && false;
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(LI->getType(), VT))
+    return false;
+
+  // Combine load followed by zero- or sign-extend.
+  bool IsZExt = false;
+  switch(MI->getOpcode()) {
+    default:
+      return false;
+
+    case PPC::RLDICL:
+    case PPC::RLDICL_32_64: {
+      IsZExt = true;
+      unsigned MB = MI->getOperand(3).getImm();
+      if ((VT == MVT::i8 && MB <= 56) ||
+          (VT == MVT::i16 && MB <= 48) ||
+          (VT == MVT::i32 && MB <= 32))
+        break;
+      return false;
+    }
+
+    case PPC::RLWINM:
+    case PPC::RLWINM8: {
+      IsZExt = true;
+      unsigned MB = MI->getOperand(3).getImm();
+      if ((VT == MVT::i8 && MB <= 24) ||
+          (VT == MVT::i16 && MB <= 16))
+        break;
+      return false;
+    }
+
+    case PPC::EXTSB:
+    case PPC::EXTSB8:
+    case PPC::EXTSB8_32_64:
+      /* There is no sign-extending load-byte instruction. */
+      return false;
+
+    case PPC::EXTSH:
+    case PPC::EXTSH8:
+    case PPC::EXTSH8_32_64: {
+      if (VT != MVT::i16 && VT != MVT::i8)
+        return false;
+      break;
+    }
+
+    case PPC::EXTSW:
+    case PPC::EXTSW_32_64: {
+      if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
+        return false;
+      break;
+    }
+  }
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(LI->getOperand(0), Addr))
+    return false;
+
+  unsigned ResultReg = MI->getOperand(0).getReg();
+
+  if (!PPCEmitLoad(VT, ResultReg, Addr, 0, IsZExt))
+    return false;
+
+  MI->eraseFromParent();
+  return true;
 }
 
 // Attempt to lower call arguments in a faster way than done by
@@ -312,6 +2145,81 @@ bool PPCFastISel::FastLowerArguments() {
   return false;
 }
 
+// Handle materializing integer constants into a register.  This is not
+// automatically generated for PowerPC, so must be explicitly created here.
+unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
+  
+  if (Opc != ISD::Constant)
+    return 0;
+
+  if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
+      VT != MVT::i8 && VT != MVT::i1) 
+    return 0;
+
+  const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
+                                   &PPC::GPRCRegClass);
+  if (VT == MVT::i64)
+    return PPCMaterialize64BitInt(Imm, RC);
+  else
+    return PPCMaterialize32BitInt(Imm, RC);
+}
+
+// Override for ADDI and ADDI8 to set the correct register class
+// on RHS operand 0.  The automatic infrastructure naively assumes
+// GPRC for i32 and G8RC for i64; the concept of "no R0" is lost
+// for these cases.  At the moment, none of the other automatically
+// generated RI instructions require special treatment.  However, once
+// SelectSelect is implemented, "isel" requires similar handling.
+//
+// Also be conservative about the output register class.  Avoid
+// assigning R0 or X0 to the output register for GPRC and G8RC
+// register classes, as any such result could be used in ADDI, etc.,
+// where those regs have another meaning.
+unsigned PPCFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      uint64_t Imm) {
+  if (MachineInstOpcode == PPC::ADDI)
+    MRI.setRegClass(Op0, &PPC::GPRC_and_GPRC_NOR0RegClass);
+  else if (MachineInstOpcode == PPC::ADDI8)
+    MRI.setRegClass(Op0, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::FastEmitInst_ri(MachineInstOpcode, UseRC,
+                                   Op0, Op0IsKill, Imm);
+}
+
+// Override for instructions with one register operand to avoid use of
+// R0/X0.  The automatic infrastructure isn't aware of the context so
+// we must be conservative.
+unsigned PPCFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass* RC,
+                                     unsigned Op0, bool Op0IsKill) {
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::FastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill);
+}
+
+// Override for instructions with two register operands to avoid use
+// of R0/X0.  The automatic infrastructure isn't aware of the context
+// so we must be conservative.
+unsigned PPCFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass* RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::FastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill,
+                                   Op1, Op1IsKill);
+}
+
 namespace llvm {
   // Create the fast instruction selector for PowerPC64 ELF.
   FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo,
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 24d3a0b..0ac2ced 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -204,10 +204,9 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
   unsigned FrameSize =
     UseEstimate ? MFI->estimateStackSize(MF) : MFI->getStackSize();
 
-  // Get the alignments provided by the target, and the maximum alignment
-  // (if any) of the fixed frame objects.
-  unsigned TargetAlign = getStackAlignment();
-  unsigned MaxAlign = MFI->getMaxAlignment();
+  // Get stack alignments. The frame must be aligned to the greatest of these:
+  unsigned TargetAlign = getStackAlignment(); // alignment required per the ABI
+  unsigned MaxAlign = MFI->getMaxAlignment(); // algmt required by data in frame
   unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
 
   const PPCRegisterInfo *RegInfo =
@@ -346,12 +345,20 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
   bool needsFrameMoves = MMI.hasDebugInfo() ||
     MF.getFunction()->needsUnwindTableEntry();
 
+  // Get processor type.
+  bool isPPC64 = Subtarget.isPPC64();
+  // Get the ABI.
+  bool isDarwinABI = Subtarget.isDarwinABI();
+  bool isSVR4ABI = Subtarget.isSVR4ABI();
+  assert((isDarwinABI || isSVR4ABI) &&
+         "Currently only Darwin and SVR4 ABIs are supported for PowerPC.");
+
   // Prepare for frame info.
   MCSymbol *FrameLabel = 0;
 
   // Scan the prolog, looking for an UPDATE_VRSAVE instruction.  If we find it,
   // process it.
-  if (!Subtarget.isSVR4ABI())
+  if (!isSVR4ABI)
     for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
       if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
         HandleVRSaveUpdate(MBBI, TII);
@@ -371,23 +378,52 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
   if (MFI->isFrameAddressTaken())
     replaceFPWithRealFP(MF);
 
-  // Get processor type.
-  bool isPPC64 = Subtarget.isPPC64();
-  // Get operating system
-  bool isDarwinABI = Subtarget.isDarwinABI();
   // Check if the link register (LR) must be saved.
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
   bool MustSaveLR = FI->mustSaveLR();
   const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
-  // Do we have a frame pointer for this function?
+  // Do we have a frame pointer and/or base pointer for this function?
   bool HasFP = hasFP(MF);
   bool HasBP = RegInfo->hasBasePointer(MF);
 
+  unsigned SPReg       = isPPC64 ? PPC::X1  : PPC::R1;
+  unsigned BPReg       = isPPC64 ? PPC::X30 : PPC::R30;
+  unsigned FPReg       = isPPC64 ? PPC::X31 : PPC::R31;
+  unsigned LRReg       = isPPC64 ? PPC::LR8 : PPC::LR;
+  unsigned ScratchReg  = isPPC64 ? PPC::X0  : PPC::R0;
+  unsigned TempReg     = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+  //  ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.)
+  const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8
+                                                : PPC::MFLR );
+  const MCInstrDesc& StoreInst = TII.get(isPPC64 ? PPC::STD
+                                                 : PPC::STW );
+  const MCInstrDesc& StoreUpdtInst = TII.get(isPPC64 ? PPC::STDU
+                                                     : PPC::STWU );
+  const MCInstrDesc& StoreUpdtIdxInst = TII.get(isPPC64 ? PPC::STDUX
+                                                        : PPC::STWUX);
+  const MCInstrDesc& LoadImmShiftedInst = TII.get(isPPC64 ? PPC::LIS8
+                                                          : PPC::LIS );
+  const MCInstrDesc& OrImmInst = TII.get(isPPC64 ? PPC::ORI8
+                                                 : PPC::ORI );
+  const MCInstrDesc& OrInst = TII.get(isPPC64 ? PPC::OR8
+                                              : PPC::OR );
+  const MCInstrDesc& SubtractCarryingInst = TII.get(isPPC64 ? PPC::SUBFC8
+                                                            : PPC::SUBFC);
+  const MCInstrDesc& SubtractImmCarryingInst = TII.get(isPPC64 ? PPC::SUBFIC8
+                                                               : PPC::SUBFIC);
+
+  // Regarding this assert: Even though LR is saved in the caller's frame (i.e.,
+  // LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no
+  // Red Zone, an asynchronous event (a form of "callee") could claim a frame &
+  // overwrite it, so PPC32 SVR4 must claim at least a minimal frame to save LR.
+  assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) &&
+         "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
+
   int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
 
   int FPOffset = 0;
   if (HasFP) {
-    if (Subtarget.isSVR4ABI()) {
+    if (isSVR4ABI) {
       MachineFrameInfo *FFI = MF.getFrameInfo();
       int FPIndex = FI->getFramePointerSaveIndex();
       assert(FPIndex && "No Frame Pointer Save Slot!");
@@ -399,7 +435,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
 
   int BPOffset = 0;
   if (HasBP) {
-    if (Subtarget.isSVR4ABI()) {
+    if (isSVR4ABI) {
       MachineFrameInfo *FFI = MF.getFrameInfo();
       int BPIndex = FI->getBasePointerSaveIndex();
       assert(BPIndex && "No Base Pointer Save Slot!");
@@ -410,181 +446,116 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
     }
   }
 
-  if (isPPC64) {
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR8), PPC::X0);
+  // Get stack alignments.
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  if (HasBP && MaxAlign > 1)
+    assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
+           "Invalid alignment!");
+
+  // Frames of 32KB & larger require special handling because they cannot be
+  // indexed into with a simple STDU/STWU/STD/STW immediate offset operand.
+  bool isLargeFrame = !isInt<16>(NegFrameSize);
 
-    if (!MustSaveCRs.empty()) {
-      MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), PPC::X12);
-      for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
-        MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
-    }
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg);
 
-    if (HasFP)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
-        .addReg(PPC::X31)
-        .addImm(FPOffset)
-        .addReg(PPC::X1);
-
-    if (HasBP)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
-        .addReg(PPC::X30)
-        .addImm(BPOffset)
-        .addReg(PPC::X1);
-
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
-        .addReg(PPC::X0)
-        .addImm(LROffset)
-        .addReg(PPC::X1);
-
-    if (!MustSaveCRs.empty())
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
-        .addReg(PPC::X12, getKillRegState(true))
-        .addImm(8)
-        .addReg(PPC::X1);
-  } else {
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR), PPC::R0);
-
-    if (HasFP)
-      // FIXME: On PPC32 SVR4, FPOffset is negative and access to negative
-      // offsets of R1 is not allowed.
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
-        .addReg(PPC::R31)
-        .addImm(FPOffset)
-        .addReg(PPC::R1);
-
-    if (HasBP)
-      // FIXME: On PPC32 SVR4, FPOffset is negative and access to negative
-      // offsets of R1 is not allowed.
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
-        .addReg(PPC::R30)
-        .addImm(BPOffset)
-        .addReg(PPC::R1);
-
-    assert(MustSaveCRs.empty() &&
-           "Prologue CR saving supported only in 64-bit mode");
-
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
-        .addReg(PPC::R0)
-        .addImm(LROffset)
-        .addReg(PPC::R1);
+  assert((isPPC64 || MustSaveCRs.empty()) &&
+         "Prologue CR saving supported only in 64-bit mode");
+
+  if (!MustSaveCRs.empty()) { // will only occur for PPC64
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg);
+    for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+      MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
   }
 
-  // Skip if a leaf routine.
+  if (HasFP)
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(FPReg)
+      .addImm(FPOffset)
+      .addReg(SPReg);
+
+  if (HasBP)
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(BPReg)
+      .addImm(BPOffset)
+      .addReg(SPReg);
+
+  if (MustSaveLR)
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(ScratchReg)
+      .addImm(LROffset)
+      .addReg(SPReg);
+
+  if (!MustSaveCRs.empty()) // will only occur for PPC64
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
+      .addReg(TempReg, getKillRegState(true))
+      .addImm(8)
+      .addReg(SPReg);
+
+  // Skip the rest if this is a leaf function & all spills fit in the Red Zone.
   if (!FrameSize) return;
 
-  // Get stack alignments.
-  unsigned MaxAlign = MFI->getMaxAlignment();
-
   // Adjust stack pointer: r1 += NegFrameSize.
   // If there is a preferred stack alignment, align R1 now
-  if (!isPPC64) {
-    // PPC32.
-
-    if (HasBP) {
-      // Save a copy of r1 as the base pointer.
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::OR), PPC::R30)
-        .addReg(PPC::R1)
-        .addReg(PPC::R1);
-    }
 
-    if (HasBP && MaxAlign > 1) {
-      assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
-             "Invalid alignment!");
+  if (HasBP) {
+    // Save a copy of r1 as the base pointer.
+    BuildMI(MBB, MBBI, dl, OrInst, BPReg)
+      .addReg(SPReg)
+      .addReg(SPReg);
+  }
 
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), PPC::R0)
-        .addReg(PPC::R1)
+  if (HasBP && MaxAlign > 1) {
+    if (isPPC64)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg)
+        .addReg(SPReg)
+        .addImm(0)
+        .addImm(64 - Log2_32(MaxAlign));
+    else // PPC32...
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg)
+        .addReg(SPReg)
         .addImm(0)
         .addImm(32 - Log2_32(MaxAlign))
         .addImm(31);
-      if (isInt<16>(NegFrameSize)) {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC), PPC::R0)
-          .addReg(PPC::R0, RegState::Kill)
-          .addImm(NegFrameSize);
-      } else {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R12)
-          .addImm(NegFrameSize >> 16);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R12)
-          .addReg(PPC::R12, RegState::Kill)
-          .addImm(NegFrameSize & 0xFFFF);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFC), PPC::R0)
-          .addReg(PPC::R0, RegState::Kill)
-          .addReg(PPC::R12, RegState::Kill);
-      }
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1)
-        .addReg(PPC::R1, RegState::Kill)
-        .addReg(PPC::R1)
-        .addReg(PPC::R0);
-    } else if (isInt<16>(NegFrameSize)) {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1)
-        .addReg(PPC::R1)
-        .addImm(NegFrameSize)
-        .addReg(PPC::R1);
+    if (!isLargeFrame) {
+      BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg)
+        .addReg(ScratchReg, RegState::Kill)
+        .addImm(NegFrameSize);
     } else {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
+      BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg)
         .addImm(NegFrameSize >> 16);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
-        .addReg(PPC::R0, RegState::Kill)
+      BuildMI(MBB, MBBI, dl, OrImmInst, TempReg)
+        .addReg(TempReg, RegState::Kill)
         .addImm(NegFrameSize & 0xFFFF);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1)
-        .addReg(PPC::R1, RegState::Kill)
-        .addReg(PPC::R1)
-        .addReg(PPC::R0);
-    }
-  } else {    // PPC64.
-    if (HasBP) {
-      // Save a copy of r1 as the base pointer.
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::OR8), PPC::X30)
-        .addReg(PPC::X1)
-        .addReg(PPC::X1);
+      BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg)
+        .addReg(ScratchReg, RegState::Kill)
+        .addReg(TempReg, RegState::Kill);
     }
+    BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+      .addReg(SPReg, RegState::Kill)
+      .addReg(SPReg)
+      .addReg(ScratchReg);
 
-    if (HasBP && MaxAlign > 1) {
-      assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
-             "Invalid alignment!");
+  } else if (!isLargeFrame) {
+    BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg)
+      .addReg(SPReg)
+      .addImm(NegFrameSize)
+      .addReg(SPReg);
 
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), PPC::X0)
-        .addReg(PPC::X1)
-        .addImm(0)
-        .addImm(64 - Log2_32(MaxAlign));
-      if (isInt<16>(NegFrameSize)) {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC8), PPC::X0)
-          .addReg(PPC::X0, RegState::Kill)
-          .addImm(NegFrameSize);
-      } else {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X12)
-          .addImm(NegFrameSize >> 16);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X12)
-          .addReg(PPC::X12, RegState::Kill)
-          .addImm(NegFrameSize & 0xFFFF);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFC8), PPC::X0)
-          .addReg(PPC::X0, RegState::Kill)
-          .addReg(PPC::X12, RegState::Kill);
-      }
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1)
-        .addReg(PPC::X1, RegState::Kill)
-        .addReg(PPC::X1)
-        .addReg(PPC::X0);
-    } else if (isInt<16>(NegFrameSize)) {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1)
-        .addReg(PPC::X1)
-        .addImm(NegFrameSize)
-        .addReg(PPC::X1);
-    } else {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
-        .addImm(NegFrameSize >> 16);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
-        .addReg(PPC::X0, RegState::Kill)
-        .addImm(NegFrameSize & 0xFFFF);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1)
-        .addReg(PPC::X1, RegState::Kill)
-        .addReg(PPC::X1)
-        .addReg(PPC::X0);
-    }
+  } else {
+    BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+      .addImm(NegFrameSize >> 16);
+    BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+      .addReg(ScratchReg, RegState::Kill)
+      .addImm(NegFrameSize & 0xFFFF);
+    BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+      .addReg(SPReg, RegState::Kill)
+      .addReg(SPReg)
+      .addReg(ScratchReg);
   }
 
   // Add the "machine moves" for the instructions we generated above, but in
@@ -600,22 +571,19 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
         MCCFIInstruction::createDefCfaOffset(FrameLabel, NegFrameSize));
 
     if (HasFP) {
-      unsigned Reg = isPPC64 ? PPC::X31 : PPC::R31;
-      Reg = MRI->getDwarfRegNum(Reg, true);
+      unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
       MMI.addFrameInst(
           MCCFIInstruction::createOffset(FrameLabel, Reg, FPOffset));
     }
 
     if (HasBP) {
-      unsigned Reg = isPPC64 ? PPC::X30 : PPC::R30;
-      Reg = MRI->getDwarfRegNum(Reg, true);
+      unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
       MMI.addFrameInst(
           MCCFIInstruction::createOffset(FrameLabel, Reg, BPOffset));
     }
 
     if (MustSaveLR) {
-      unsigned Reg = isPPC64 ? PPC::LR8 : PPC::LR;
-      Reg = MRI->getDwarfRegNum(Reg, true);
+      unsigned Reg = MRI->getDwarfRegNum(LRReg, true);
       MMI.addFrameInst(
           MCCFIInstruction::createOffset(FrameLabel, Reg, LROffset));
     }
@@ -625,15 +593,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
 
   // If there is a frame pointer, copy R1 into R31
   if (HasFP) {
-    if (!isPPC64) {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::OR), PPC::R31)
-        .addReg(PPC::R1)
-        .addReg(PPC::R1);
-    } else {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::OR8), PPC::X31)
-        .addReg(PPC::X1)
-        .addReg(PPC::X1);
-    }
+    BuildMI(MBB, MBBI, dl, OrInst, FPReg)
+      .addReg(SPReg)
+      .addReg(SPReg);
 
     if (needsFrameMoves) {
       ReadyLabel = MMI.getContext().CreateTempSymbol();
@@ -641,9 +603,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       // Mark effective beginning of when frame pointer is ready.
       BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(ReadyLabel);
 
-      unsigned Reg = HasFP ? (isPPC64 ? PPC::X31 : PPC::R31)
-                           : (isPPC64 ? PPC::X1 : PPC::R1);
-      Reg = MRI->getDwarfRegNum(Reg, true);
+      unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
       MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(ReadyLabel, Reg));
     }
   }
@@ -664,19 +624,16 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
 
       // For SVR4, don't emit a move for the CR spill slot if we haven't
       // spilled CRs.
-      if (Subtarget.isSVR4ABI()
-	  && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
-	  && MustSaveCRs.empty())
-	continue;
+      if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
+          && MustSaveCRs.empty())
+        continue;
 
       // For 64-bit SVR4 when we have spilled CRs, the spill location
       // is SP+8, not a frame-relative slot.
-      if (Subtarget.isSVR4ABI()
-	  && Subtarget.isPPC64()
-	  && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+      if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
         MMI.addFrameInst(MCCFIInstruction::createOffset(
             Label, MRI->getDwarfRegNum(PPC::CR2, true), 8));
-	continue;
+        continue;
       }
 
       int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
@@ -707,7 +664,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
           RetOpcode == PPC::TCRETURNai8) &&
          "Can only insert epilog into returning blocks");
 
-  // Get alignment info so we know how to restore r1
+  // Get alignment info so we know how to restore the SP.
   const MachineFrameInfo *MFI = MF.getFrameInfo();
 
   // Get the number of bytes allocated from the FrameInfo.
@@ -715,21 +672,41 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
 
   // Get processor type.
   bool isPPC64 = Subtarget.isPPC64();
-  // Get operating system
+  // Get the ABI.
   bool isDarwinABI = Subtarget.isDarwinABI();
+  bool isSVR4ABI = Subtarget.isSVR4ABI();
+
   // Check if the link register (LR) has been saved.
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
   bool MustSaveLR = FI->mustSaveLR();
   const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
-  // Do we have a frame pointer for this function?
+  // Do we have a frame pointer and/or base pointer for this function?
   bool HasFP = hasFP(MF);
   bool HasBP = RegInfo->hasBasePointer(MF);
 
+  unsigned SPReg      = isPPC64 ? PPC::X1  : PPC::R1;
+  unsigned BPReg      = isPPC64 ? PPC::X30 : PPC::R30;
+  unsigned FPReg      = isPPC64 ? PPC::X31 : PPC::R31;
+  unsigned ScratchReg  = isPPC64 ? PPC::X0  : PPC::R0;
+  unsigned TempReg     = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+  const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8
+                                                 : PPC::MTLR );
+  const MCInstrDesc& LoadInst = TII.get( isPPC64 ? PPC::LD
+                                                 : PPC::LWZ );
+  const MCInstrDesc& LoadImmShiftedInst = TII.get( isPPC64 ? PPC::LIS8
+                                                           : PPC::LIS );
+  const MCInstrDesc& OrImmInst = TII.get( isPPC64 ? PPC::ORI8
+                                                  : PPC::ORI );
+  const MCInstrDesc& AddImmInst = TII.get( isPPC64 ? PPC::ADDI8
+                                                   : PPC::ADDI );
+  const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
+                                                : PPC::ADD4 );
+
   int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
 
   int FPOffset = 0;
   if (HasFP) {
-    if (Subtarget.isSVR4ABI()) {
+    if (isSVR4ABI) {
       MachineFrameInfo *FFI = MF.getFrameInfo();
       int FPIndex = FI->getFramePointerSaveIndex();
       assert(FPIndex && "No Frame Pointer Save Slot!");
@@ -741,7 +718,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
 
   int BPOffset = 0;
   if (HasBP) {
-    if (Subtarget.isSVR4ABI()) {
+    if (isSVR4ABI) {
       MachineFrameInfo *FFI = MF.getFrameInfo();
       int BPIndex = FI->getBasePointerSaveIndex();
       assert(BPIndex && "No Base Pointer Save Slot!");
@@ -773,106 +750,76 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
       FrameSize += StackAdj;
   }
 
+  // Frames of 32KB & larger require special handling because they cannot be
+  // indexed into with a simple LD/LWZ immediate offset operand.
+  bool isLargeFrame = !isInt<16>(FrameSize);
+
   if (FrameSize) {
-    // The loaded (or persistent) stack pointer value is offset by the 'stwu'
-    // on entry to the function.  Add this offset back now.
-    if (!isPPC64) {
-      // If this function contained a fastcc call and GuaranteedTailCallOpt is
-      // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
-      // call which invalidates the stack pointer value in SP(0). So we use the
-      // value of R31 in this case.
-      if (FI->hasFastCall() && isInt<16>(FrameSize)) {
-        assert(hasFP(MF) && "Expecting a valid the frame pointer.");
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
-          .addReg(PPC::R31).addImm(FrameSize);
-      } else if(FI->hasFastCall()) {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
-          .addImm(FrameSize >> 16);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
-          .addReg(PPC::R0, RegState::Kill)
-          .addImm(FrameSize & 0xFFFF);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD4))
-          .addReg(PPC::R1)
-          .addReg(PPC::R31)
-          .addReg(PPC::R0);
-      } else if (isInt<16>(FrameSize) &&
-                 !HasBP &&
-                 !MFI->hasVarSizedObjects()) {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
-          .addReg(PPC::R1).addImm(FrameSize);
+    // In the prologue, the loaded (or persistent) stack pointer value is offset
+    // by the STDU/STDUX/STWU/STWUX instruction.  Add this offset back now.
+
+    // If this function contained a fastcc call and GuaranteedTailCallOpt is
+    // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
+    // call which invalidates the stack pointer value in SP(0). So we use the
+    // value of R31 in this case.
+    if (FI->hasFastCall()) {
+      assert(HasFP && "Expecting a valid frame pointer.");
+      if (!isLargeFrame) {
+        BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+          .addReg(FPReg).addImm(FrameSize);
       } else {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ),PPC::R1)
-          .addImm(0).addReg(PPC::R1);
-      }
-    } else {
-      if (FI->hasFastCall() && isInt<16>(FrameSize)) {
-        assert(hasFP(MF) && "Expecting a valid the frame pointer.");
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
-          .addReg(PPC::X31).addImm(FrameSize);
-      } else if(FI->hasFastCall()) {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
+        BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
           .addImm(FrameSize >> 16);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
-          .addReg(PPC::X0, RegState::Kill)
+        BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+          .addReg(ScratchReg, RegState::Kill)
           .addImm(FrameSize & 0xFFFF);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD8))
-          .addReg(PPC::X1)
-          .addReg(PPC::X31)
-          .addReg(PPC::X0);
-      } else if (isInt<16>(FrameSize) && !HasBP &&
-            !MFI->hasVarSizedObjects()) {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
-           .addReg(PPC::X1).addImm(FrameSize);
-      } else {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X1)
-           .addImm(0).addReg(PPC::X1);
+        BuildMI(MBB, MBBI, dl, AddInst)
+          .addReg(SPReg)
+          .addReg(FPReg)
+          .addReg(ScratchReg);
       }
+    } else if (!isLargeFrame && !HasBP && !MFI->hasVarSizedObjects()) {
+      BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+        .addReg(SPReg)
+        .addImm(FrameSize);
+    } else {
+      BuildMI(MBB, MBBI, dl, LoadInst, SPReg)
+        .addImm(0)
+        .addReg(SPReg);
     }
-  }
 
-  if (isPPC64) {
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X0)
-        .addImm(LROffset).addReg(PPC::X1);
-
-    if (!MustSaveCRs.empty())
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), PPC::X12)
-        .addImm(8).addReg(PPC::X1);
+  }
 
-    if (HasFP)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X31)
-        .addImm(FPOffset).addReg(PPC::X1);
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+      .addImm(LROffset)
+      .addReg(SPReg);
 
-    if (HasBP)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X30)
-        .addImm(BPOffset).addReg(PPC::X1);
+  assert((isPPC64 || MustSaveCRs.empty()) &&
+         "Epilogue CR restoring supported only in 64-bit mode");
 
-    if (!MustSaveCRs.empty())
-      for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
-          .addReg(PPC::X12, getKillRegState(i == e-1));
+  if (!MustSaveCRs.empty()) // will only occur for PPC64
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
+      .addImm(8)
+      .addReg(SPReg);
 
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR8)).addReg(PPC::X0);
-  } else {
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R0)
-          .addImm(LROffset).addReg(PPC::R1);
+  if (HasFP)
+    BuildMI(MBB, MBBI, dl, LoadInst, FPReg)
+      .addImm(FPOffset)
+      .addReg(SPReg);
 
-    assert(MustSaveCRs.empty() &&
-           "Epilogue CR restoring supported only in 64-bit mode");
+  if (HasBP)
+    BuildMI(MBB, MBBI, dl, LoadInst, BPReg)
+      .addImm(BPOffset)
+      .addReg(SPReg);
 
-    if (HasFP)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R31)
-          .addImm(FPOffset).addReg(PPC::R1);
+  if (!MustSaveCRs.empty()) // will only occur for PPC64
+    for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+        .addReg(TempReg, getKillRegState(i == e-1));
 
-    if (HasBP)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R30)
-          .addImm(FPOffset).addReg(PPC::R1);
-
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR)).addReg(PPC::R0);
-  }
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg);
 
   // Callee pop calling convention. Pop parameter/linkage area. Used for tail
   // call optimization
@@ -880,27 +827,20 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
       MF.getFunction()->getCallingConv() == CallingConv::Fast) {
      PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
      unsigned CallerAllocatedAmt = FI->getMinReservedArea();
-     unsigned StackReg = isPPC64 ? PPC::X1 : PPC::R1;
-     unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
-     unsigned TmpReg = isPPC64 ? PPC::X0 : PPC::R0;
-     unsigned ADDIInstr = isPPC64 ? PPC::ADDI8 : PPC::ADDI;
-     unsigned ADDInstr = isPPC64 ? PPC::ADD8 : PPC::ADD4;
-     unsigned LISInstr = isPPC64 ? PPC::LIS8 : PPC::LIS;
-     unsigned ORIInstr = isPPC64 ? PPC::ORI8 : PPC::ORI;
 
      if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) {
-       BuildMI(MBB, MBBI, dl, TII.get(ADDIInstr), StackReg)
-         .addReg(StackReg).addImm(CallerAllocatedAmt);
+       BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+         .addReg(SPReg).addImm(CallerAllocatedAmt);
      } else {
-       BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+       BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
           .addImm(CallerAllocatedAmt >> 16);
-       BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
-          .addReg(TmpReg, RegState::Kill)
+       BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+          .addReg(ScratchReg, RegState::Kill)
           .addImm(CallerAllocatedAmt & 0xFFFF);
-       BuildMI(MBB, MBBI, dl, TII.get(ADDInstr))
-          .addReg(StackReg)
+       BuildMI(MBB, MBBI, dl, AddInst)
+          .addReg(SPReg)
           .addReg(FPReg)
-          .addReg(TmpReg);
+          .addReg(ScratchReg);
      }
   } else if (RetOpcode == PPC::TCRETURNdi) {
     MBBI = MBB.getLastNonDebugInstr();
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 475bde1..6ba6af6 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -876,8 +876,10 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
 // target-specific node if it hasn't already been changed.
 SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
   SDLoc dl(N);
-  if (N->isMachineOpcode())
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL;   // Already selected.
+  }
 
   switch (N->getOpcode()) {
   default: break;
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 664dd12..8da5f05 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -149,28 +149,24 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
         Subtarget->hasFRSQRTES() && Subtarget->hasFRES()))
     setOperationAction(ISD::FSQRT, MVT::f32, Expand);
 
-  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  if (Subtarget->hasFCPSGN()) {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
+  } else {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  }
 
   if (Subtarget->hasFPRND()) {
     setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
     setOperationAction(ISD::FCEIL,  MVT::f64, Legal);
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+    setOperationAction(ISD::FROUND, MVT::f64, Legal);
 
     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
     setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
-
-    // frin does not implement "ties to even." Thus, this is safe only in
-    // fast-math mode.
-    if (TM.Options.UnsafeFPMath) {
-      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
-      setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
-
-      // These need to set FE_INEXACT, and use a custom inserter.
-      setOperationAction(ISD::FRINT, MVT::f64, Legal);
-      setOperationAction(ISD::FRINT, MVT::f32, Legal);
-    }
+    setOperationAction(ISD::FROUND, MVT::f32, Legal);
   }
 
   // PowerPC does not have BSWAP, CTPOP or CTTZ
@@ -560,7 +556,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
 
   setInsertFencesForAtomic(true);
 
-  setSchedulingPreference(Sched::Hybrid);
+  if (Subtarget->enableMachineScheduler())
+    setSchedulingPreference(Sched::Source);
+  else
+    setSchedulingPreference(Sched::Hybrid);
 
   computeRegisterProperties();
 
@@ -579,24 +578,47 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   }
 }
 
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
+                             unsigned MaxMaxAlign) {
+  if (MaxAlign == MaxMaxAlign)
+    return;
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
+      MaxAlign = 32;
+    else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
+      MaxAlign = 16;
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    unsigned EltAlign = 0;
+    getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
+    if (EltAlign > MaxAlign)
+      MaxAlign = EltAlign;
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      unsigned EltAlign = 0;
+      getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign);
+      if (EltAlign > MaxAlign)
+        MaxAlign = EltAlign;
+      if (MaxAlign == MaxMaxAlign)
+        break;
+    }
+  }
+}
+
 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
 /// function arguments in the caller parameter area.
 unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
-  const TargetMachine &TM = getTargetMachine();
   // Darwin passes everything on 4 byte boundary.
-  if (TM.getSubtarget<PPCSubtarget>().isDarwin())
+  if (PPCSubTarget.isDarwin())
     return 4;
 
   // 16byte and wider vectors are passed on 16byte boundary.
-  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
-    if (VTy->getBitWidth() >= 128)
-      return 16;
-
   // The rest is 8 on PPC64 and 4 on PPC32 boundary.
-   if (PPCSubTarget.isPPC64())
-     return 8;
-
-  return 4;
+  unsigned Align = PPCSubTarget.isPPC64() ? 8 : 4;
+  if (PPCSubTarget.hasAltivec() || PPCSubTarget.hasQPX())
+    getMaxByValAlign(Ty, Align, PPCSubTarget.hasQPX() ? 32 : 16);
+  return Align;
 }
 
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -1386,6 +1408,10 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
 SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
 
+  // FIXME: TLS addresses currently use medium model code sequences,
+  // which is the most useful form.  Eventually support for small and
+  // large models could be added if users need it, at the cost of
+  // additional complexity.
   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
   SDLoc dl(GA);
   const GlobalValue *GV = GA->getGlobal();
@@ -1814,6 +1840,12 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
 
 #include "PPCGenCallingConv.inc"
 
+// Function whose sole purpose is to kill compiler warnings 
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
+  return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
+}
+
 bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                       CCValAssign::LocInfo &LocInfo,
                                       ISD::ArgFlagsTy &ArgFlags,
@@ -2276,6 +2308,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
         InVals.push_back(FIN);
         continue;
       }
+
+      unsigned BVAlign = Flags.getByValAlign();
+      if (BVAlign > 8) {
+        ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
+        CurArgOffset = ArgOffset;
+      }
+
       // All aggregates smaller than 8 bytes must be passed right-justified.
       if (ObjSize < PtrByteSize)
         CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
@@ -3448,7 +3487,9 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
       // from allocating it), resulting in an additional register being
       // allocated and an unnecessary move instruction being generated.
       needsTOCRestore = true;
-    } else if ((CallOpc == PPCISD::CALL) && !isLocalCall(Callee)) {
+    } else if ((CallOpc == PPCISD::CALL) &&
+               (!isLocalCall(Callee) ||
+                DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
       // Otherwise insert NOP for non-local calls.
       CallOpc = PPCISD::CALL_NOP;
     }
@@ -3865,6 +3906,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       if (Size == 0)
         continue;
 
+      unsigned BVAlign = Flags.getByValAlign();
+      if (BVAlign > 8) {
+        if (BVAlign % PtrByteSize != 0)
+          llvm_unreachable(
+            "ByVal alignment is not a multiple of the pointer size");
+
+        ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
+      }
+
       // All aggregates smaller than 8 bytes must be passed right-justified.
       if (Size==1 || Size==2 || Size==4) {
         EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
@@ -3956,7 +4006,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       continue;
     }
 
-    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+    switch (Arg.getSimpleValueType().SimpleTy) {
     default: llvm_unreachable("Unexpected ValueType for argument!");
     case MVT::i32:
     case MVT::i64:
@@ -3979,7 +4029,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
           // must be passed right-justified in the stack doubleword, and
           // in the GPR, if one is available.
           SDValue StoreOff;
-          if (Arg.getValueType().getSimpleVT().SimpleTy == MVT::f32) {
+          if (Arg.getSimpleValueType().SimpleTy == MVT::f32) {
             SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
             StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
           } else
@@ -4287,7 +4337,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
       continue;
     }
 
-    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+    switch (Arg.getSimpleValueType().SimpleTy) {
     default: llvm_unreachable("Unexpected ValueType for argument!");
     case MVT::i32:
     case MVT::i64:
@@ -4752,7 +4802,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
     Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
 
   SDValue Tmp;
-  switch (Op.getValueType().getSimpleVT().SimpleTy) {
+  switch (Op.getSimpleValueType().SimpleTy) {
   default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
   case MVT::i32:
     Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
@@ -6676,51 +6726,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 
     // Restore FPSCR value.
     BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg);
-  } else if (MI->getOpcode() == PPC::FRINDrint ||
-             MI->getOpcode() == PPC::FRINSrint) {
-    bool isf32 = MI->getOpcode() == PPC::FRINSrint;
-    unsigned Dest = MI->getOperand(0).getReg();
-    unsigned Src = MI->getOperand(1).getReg();
-    DebugLoc dl   = MI->getDebugLoc();
-
-    MachineRegisterInfo &RegInfo = F->getRegInfo();
-    unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
-
-    // Perform the rounding.
-    BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FRINS : PPC::FRIND), Dest)
-      .addReg(Src);
-
-    // Compare the results.
-    BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FCMPUS : PPC::FCMPUD), CRReg)
-      .addReg(Dest).addReg(Src);
-
-    // If the results were not equal, then set the FPSCR XX bit.
-    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
-    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
-    F->insert(It, midMBB);
-    F->insert(It, exitMBB);
-    exitMBB->splice(exitMBB->begin(), BB,
-                    llvm::next(MachineBasicBlock::iterator(MI)),
-                    BB->end());
-    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-    BuildMI(*BB, MI, dl, TII->get(PPC::BCC))
-      .addImm(PPC::PRED_EQ).addReg(CRReg).addMBB(exitMBB);
-
-    BB->addSuccessor(midMBB);
-    BB->addSuccessor(exitMBB);
-
-    BB = midMBB;
-
-    // Set the FPSCR XX bit (FE_INEXACT). Note that we cannot just set
-    // the FI bit here because that will not automatically set XX also,
-    // and XX is what libm interprets as the FE_INEXACT flag.
-    BuildMI(BB, dl, TII->get(PPC::MTFSB1)).addImm(/* 38 - 32 = */ 6);
-    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
-
-    BB->addSuccessor(exitMBB);
-
-    BB = exitMBB;
   } else {
     llvm_unreachable("Unexpected instr type to insert");
   }
@@ -7061,8 +7066,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     if (RV.getNode() != 0) {
       DCI.AddToWorklist(RV.getNode());
       RV = DAGCombineFastRecip(RV, DCI);
-      if (RV.getNode() != 0)
+      if (RV.getNode() != 0) {
+	// Unfortunately, RV is now NaN if the input was exactly 0. Select out
+	// this case and force the answer to 0.
+
+        EVT VT = RV.getValueType();
+
+        SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType());
+        if (VT.isVector()) {
+          assert(VT.getVectorNumElements() == 4 && "Unknown vector type");
+          Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero);
+        }
+
+        SDValue ZeroCmp =
+          DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT),
+                       N->getOperand(0), Zero, ISD::SETEQ);
+        DCI.AddToWorklist(ZeroCmp.getNode());
+        DCI.AddToWorklist(RV.getNode());
+
+        RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT,
+                         ZeroCmp, Zero, RV);
         return RV;
+      }
     }
 
     }
@@ -7158,7 +7183,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
     if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
         TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
-        DCI.getDAGCombineLevel() == AfterLegalizeTypes &&
+        (VT == MVT::v16i8 || VT == MVT::v8i16 ||
+         VT == MVT::v4i32 || VT == MVT::v4f32) &&
         LD->getAlignment() < ABIAlignment) {
       // This is a type-legal unaligned Altivec load.
       SDValue Chain = LD->getChain();
@@ -7302,6 +7328,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
         }
       }
     }
+
+    break;
   case ISD::BSWAP:
     // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
     if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
@@ -7645,7 +7673,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
       PPC::GPRCRegClass.contains(R.first)) {
     const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
     return std::make_pair(TRI->getMatchingSuperReg(R.first,
-                            PPC::sub_32, &PPC::GPRCRegClass),
+                            PPC::sub_32, &PPC::G8RCRegClass),
                           &PPC::G8RCRegClass);
   }
 
@@ -7896,7 +7924,7 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
 }
 
 Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
-  if (DisableILPPref)
+  if (DisableILPPref || PPCSubTarget.enableMachineScheduler())
     return TargetLowering::getSchedulingPreference(N);
 
   return Sched::ILP;
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index aa5e821..df3af35 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -627,6 +627,8 @@ namespace llvm {
 
     SDValue DAGCombineFastRecip(SDValue Op, DAGCombinerInfo &DCI) const;
     SDValue DAGCombineFastRecipFSQRT(SDValue Op, DAGCombinerInfo &DCI) const;
+
+    CCAssignFn *useFastISelCCs(unsigned Flag) const;
   };
 
   namespace PPC {
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index f78bb38..46db4fe 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -270,6 +270,7 @@ def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS),
                       "mtcrf $FXM, $rS", BrMCRX>,
             PPC970_MicroCode, PPC970_Unit_CRU;
 
+let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
 def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM),
                         "mfocrf $rT, $FXM", SprMFCR>,
              PPC970_DGroup_First, PPC970_Unit_CRU;
@@ -506,6 +507,14 @@ defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
                         [(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
 } // Interpretation64Bit
 
+// For fast-isel:
+let isCodeGenOnly = 1 in {
+def EXTSB8_32_64 : XForm_11<31, 954, (outs g8rc:$rA), (ins gprc:$rS),
+                           "extsb $rA, $rS", IntSimple, []>, isPPC64;
+def EXTSH8_32_64 : XForm_11<31, 922, (outs g8rc:$rA), (ins gprc:$rS),
+                           "extsh $rA, $rS", IntSimple, []>, isPPC64;
+} // isCodeGenOnly for fast-isel
+
 defm EXTSW  : XForm_11r<31, 986, (outs g8rc:$rA), (ins g8rc:$rS),
                         "extsw", "$rA, $rS", IntSimple,
                         [(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64;
@@ -520,16 +529,16 @@ defm SRADI  : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
 defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
                         "cntlzd", "$rA, $rS", IntGeneral,
                         [(set i64:$rA, (ctlz i64:$rS))]>;
-defm POPCNTD : XForm_11r<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
-                         "popcntd", "$rA, $rS", IntGeneral,
-                         [(set i64:$rA, (ctpop i64:$rS))]>;
+def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
+                       "popcntd $rA, $rS", IntGeneral,
+                       [(set i64:$rA, (ctpop i64:$rS))]>;
 
 // popcntw also does a population count on the high 32 bits (storing the
 // results in the high 32-bits of the output). We'll ignore that here (which is
 // safe because we never separately use the high part of the 64-bit registers).
-defm POPCNTW : XForm_11r<31, 378, (outs gprc:$rA), (ins gprc:$rS),
-                         "popcntw", "$rA, $rS", IntGeneral,
-                         [(set i32:$rA, (ctpop i32:$rS))]>;
+def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS),
+                       "popcntw $rA, $rS", IntGeneral,
+                       [(set i32:$rA, (ctpop i32:$rS))]>;
 
 defm DIVD  : XOForm_1r<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
                        "divd", "$rT, $rA, $rB", IntDivD,
@@ -569,6 +578,14 @@ defm RLDICL : MDForm_1r<30, 0,
                         (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
                         "rldicl", "$rA, $rS, $SH, $MBE", IntRotateDI,
                         []>, isPPC64;
+// For fast-isel:
+let isCodeGenOnly = 1 in
+def RLDICL_32_64 : MDForm_1<30, 0,
+                           (outs g8rc:$rA),
+                           (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+                           "rldicl $rA, $rS, $SH, $MBE", IntRotateDI,
+                           []>, isPPC64;
+// End fast-isel.
 defm RLDICR : MDForm_1r<30, 1,
                         (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
                         "rldicr", "$rA, $rS, $SH, $MBE", IntRotateDI,
@@ -620,6 +637,15 @@ def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
                    "lwax $rD, $src", LdStLHA,
                    [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
                    PPC970_DGroup_Cracked;
+// For fast-isel:
+let isCodeGenOnly = 1, mayLoad = 1 in {
+def LWA_32  : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
+                      "lwa $rD, $src", LdStLWA, []>, isPPC64,
+                      PPC970_DGroup_Cracked;
+def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
+                     "lwax $rD, $src", LdStLHA, []>, isPPC64,
+                     PPC970_DGroup_Cracked;
+} // end fast-isel isCodeGenOnly
 
 // Update forms.
 let mayLoad = 1, neverHasSideEffects = 1 in {
@@ -942,6 +968,9 @@ let PPC970_Unit = 3, neverHasSideEffects = 1,
 defm FCFID  : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB),
                         "fcfid", "$frD, $frB", FPGeneral,
                         [(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64;
+defm FCTID  : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fctid", "$frD, $frB", FPGeneral,
+                        []>, isPPC64;
 defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB),
                         "fctidz", "$frD, $frB", FPGeneral,
                         [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index fdea51d..a55abe3 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -229,35 +229,45 @@ let Predicates = [HasAltivec] in {
 let isCodeGenOnly = 1 in {
 def DSS      : DSS_Form<822, (outs),
                         (ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
-                        "dss $STRM", LdStLoad /*FIXME*/, []>;
+                        "dss $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSSALL   : DSS_Form<822, (outs),
                         (ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2),
-                        "dssall", LdStLoad /*FIXME*/, []>;
+                        "dssall", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DST      : DSS_Form<342, (outs),
                         (ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTT     : DSS_Form<342, (outs),
                         (ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTST    : DSS_Form<374, (outs),
                         (ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTSTT   : DSS_Form<374, (outs),
                         (ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 
 def DST64    : DSS_Form<342, (outs),
                         (ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTT64   : DSS_Form<342, (outs),
                         (ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTST64  : DSS_Form<374, (outs),
                         (ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTSTT64 : DSS_Form<374, (outs),
                         (ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 }
 
 def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index 42adc02..29233d4 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -398,6 +398,13 @@ class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let RST = 0;
 }
 
+class XForm_rs<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let B = 0;
+}
+
 class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
               InstrItinClass itin, list<dag> pattern> 
   : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
@@ -438,6 +445,17 @@ class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31}    = 0;
 }
 
+class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RS;
+  bits<1> L;
+
+  let Inst{6-10} = RS;
+  let Inst{15} = L;
+  let Inst{21-30} = xo;
+}
+
 class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                    InstrItinClass itin>
   : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
@@ -534,6 +552,21 @@ class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31}    = RC;
 }
 
+class XForm_0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let RST = 0;
+  let A = 0;
+  let B = 0;
+}
+
+class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let RST = 0;
+  let A = 0;
+}
+
 // DCB_Form - Form X instruction, used for dcb* instructions.
 class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr, 
                       InstrItinClass itin, list<dag> pattern>
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 375daee..315ad04 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -33,7 +33,7 @@
 #include "llvm/Support/raw_ostream.h"
 
 #define GET_INSTRMAP_INFO
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "PPCGenInstrInfo.inc"
 
 using namespace llvm;
@@ -45,6 +45,9 @@ opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
 static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt",
 cl::desc("Disable compare instruction optimization"), cl::Hidden);
 
+// Pin the vtable to this file.
+void PPCInstrInfo::anchor() {}
+
 PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm)
   : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
     TM(tm), RI(*TM.getSubtargetImpl()) {}
@@ -985,6 +988,10 @@ bool PPCInstrInfo::SubsumesPredicate(
   if (Pred2[1].getReg() == PPC::CTR8 || Pred2[1].getReg() == PPC::CTR)
     return false;
 
+  // P1 can only subsume P2 if they test the same condition register.
+  if (Pred1[1].getReg() != Pred2[1].getReg())
+    return false;
+
   PPC::Predicate P1 = (PPC::Predicate) Pred1[0].getImm();
   PPC::Predicate P2 = (PPC::Predicate) Pred2[0].getImm();
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index bd72a4d..f140c41 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -78,6 +78,7 @@ class PPCInstrInfo : public PPCGenInstrInfo {
                             const TargetRegisterClass *RC,
                             SmallVectorImpl<MachineInstr*> &NewMIs,
                             bool &NonRI, bool &SpillsVRS) const;
+  virtual void anchor();
 public:
   explicit PPCInstrInfo(PPCTargetMachine &TM);
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 398a11b..2bd3aad 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -785,6 +785,20 @@ multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
   }
 }
 
+multiclass XForm_28r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_28<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : XForm_28<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
 multiclass AForm_1r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
                     string asmbase, string asmstr, InstrItinClass itin,
                     list<dag> pattern> {
@@ -1678,6 +1692,9 @@ let isCompare = 1, neverHasSideEffects = 1 in {
 
 let Uses = [RM] in {
   let neverHasSideEffects = 1 in {
+  defm FCTIW  : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fctiw", "$frD, $frB", FPGeneral,
+                          []>;
   defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
                           "fctiwz", "$frD, $frB", FPGeneral,
                           [(set f64:$frD, (PPCfctiwz f64:$frB))]>;
@@ -1686,23 +1703,13 @@ let Uses = [RM] in {
                           "frsp", "$frD, $frB", FPGeneral,
                           [(set f32:$frD, (fround f64:$frB))]>;
 
-  // The frin -> nearbyint mapping is valid only in fast-math mode.
   let Interpretation64Bit = 1 in
   defm FRIND  : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
                           "frin", "$frD, $frB", FPGeneral,
-                          [(set f64:$frD, (fnearbyint f64:$frB))]>;
+                          [(set f64:$frD, (frnd f64:$frB))]>;
   defm FRINS  : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
                           "frin", "$frD, $frB", FPGeneral,
-                          [(set f32:$frD, (fnearbyint f32:$frB))]>;
-  }
-
-  // These pseudos expand to rint but also set FE_INEXACT when the result does
-  // not equal the argument.
-  let usesCustomInserter = 1, Defs = [RM] in { // FIXME: Model FPSCR!
-    def FRINDrint : Pseudo<(outs f8rc:$frD), (ins f8rc:$frB),
-                            "#FRINDrint", [(set f64:$frD, (frint f64:$frB))]>;
-    def FRINSrint : Pseudo<(outs f4rc:$frD), (ins f4rc:$frB),
-                            "#FRINSrint", [(set f32:$frD, (frint f32:$frB))]>;
+                          [(set f32:$frD, (frnd f32:$frB))]>;
   }
 
   let neverHasSideEffects = 1 in {
@@ -1772,6 +1779,14 @@ defm FNEGD  : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
                         "fneg", "$frD, $frB", FPGeneral,
                         [(set f64:$frD, (fneg f64:$frB))]>;
 
+defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
+                        "fcpsgn", "$frD, $frA, $frB", FPGeneral,
+                        [(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
+let Interpretation64Bit = 1 in
+defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
+                        "fcpsgn", "$frD, $frA, $frB", FPGeneral,
+                        [(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
+
 // Reciprocal estimates.
 defm FRE      : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
                           "fre", "$frD, $frB", FPGeneral,
@@ -1855,7 +1870,7 @@ def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT),
                       "mtspr $SPR, $RT", SprMTSPR>;
 
 def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR),
-                     "mftb $RT, $SPR", SprMFTB>;
+                     "mftb $RT, $SPR", SprMFTB>, Deprecated<DeprecatedMFTB>;
 
 let Uses = [CTR] in {
 def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins),
@@ -1927,6 +1942,7 @@ def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS),
                       "mtcrf $FXM, $rS", BrMCRX>,
             PPC970_MicroCode, PPC970_Unit_CRU;
 
+let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
 def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
                        "mfocrf $rT, $FXM", SprMFCR>,
             PPC970_DGroup_First, PPC970_Unit_CRU;
@@ -2280,6 +2296,12 @@ def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
 def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
           (FNMSUBS $A, $C, $B)>;
 
+// FCOPYSIGN's operand types need not agree.
+def : Pat<(fcopysign f64:$frB, f32:$frA),
+          (FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>;
+def : Pat<(fcopysign f32:$frB, f64:$frA),
+          (FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
+
 include "PPCInstrAltivec.td"
 include "PPCInstr64Bit.td"
 
@@ -2300,6 +2322,35 @@ def EIEIO : XForm_24_eieio<31, 854, (outs), (ins),
 def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
                          "wait $L", LdStLoad, []>;
 
+def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
+                    "mtmsr $RS, $L", SprMTMSR>;
+
+def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
+                  "mfmsr $RT", SprMFMSR, []>;
+
+def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
+                    "mtmsrd $RS, $L", SprMTMSRD>;
+
+def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
+                        "slbie $RB", SprSLBIE, []>;
+
+def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB),
+                    "slbmte $RS, $RB", SprSLBMTE, []>;
+
+def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB),
+                       "slbmfee $RT, $RB", SprSLBMFEE, []>;
+
+def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", SprSLBIA, []>;
+
+def TLBSYNC : XForm_0<31, 566, (outs), (ins),
+                        "tlbsync", SprTLBSYNC, []>;
+
+def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB),
+                          "tlbiel $RB", SprTLBIEL, []>;
+
+def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB),
+                          "tlbie $RB,$RS", SprTLBIE, []>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC Assembler Instruction Aliases
 //
@@ -2368,6 +2419,46 @@ def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
 def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
 def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
 
+def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
+def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
+
+def : InstAlias<"mfsprg $RT, 0", (MFSPR gprc:$RT, 272)>;
+def : InstAlias<"mfsprg $RT, 1", (MFSPR gprc:$RT, 273)>;
+def : InstAlias<"mfsprg $RT, 2", (MFSPR gprc:$RT, 274)>;
+def : InstAlias<"mfsprg $RT, 3", (MFSPR gprc:$RT, 275)>;
+
+def : InstAlias<"mfsprg0 $RT", (MFSPR gprc:$RT, 272)>;
+def : InstAlias<"mfsprg1 $RT", (MFSPR gprc:$RT, 273)>;
+def : InstAlias<"mfsprg2 $RT", (MFSPR gprc:$RT, 274)>;
+def : InstAlias<"mfsprg3 $RT", (MFSPR gprc:$RT, 275)>;
+
+def : InstAlias<"mtsprg 0, $RT", (MTSPR 272, gprc:$RT)>;
+def : InstAlias<"mtsprg 1, $RT", (MTSPR 273, gprc:$RT)>;
+def : InstAlias<"mtsprg 2, $RT", (MTSPR 274, gprc:$RT)>;
+def : InstAlias<"mtsprg 3, $RT", (MTSPR 275, gprc:$RT)>;
+
+def : InstAlias<"mtsprg0 $RT", (MTSPR 272, gprc:$RT)>;
+def : InstAlias<"mtsprg1 $RT", (MTSPR 273, gprc:$RT)>;
+def : InstAlias<"mtsprg2 $RT", (MTSPR 274, gprc:$RT)>;
+def : InstAlias<"mtsprg3 $RT", (MTSPR 275, gprc:$RT)>;
+
+def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>;
+
+def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>;
+def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>;
+
+def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>;
+
+def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>;
+def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>;
+
+def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>;
+def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>;
+def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>;
+def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>;
+
+def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
+
 def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b",
                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
 def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b",
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index d69aa4a..f61c8bf 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -69,7 +69,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
     if (MO.isGlobal()) {
       StubSym =
       MachineModuleInfoImpl::
-      StubValueTy(AP.Mang->getSymbol(MO.getGlobal()),
+      StubValueTy(AP.getSymbol(MO.getGlobal()),
                   !MO.getGlobal()->hasInternalLinkage());
     } else {
       Name.erase(Name.end()-5, Name.end());
@@ -95,7 +95,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
     if (StubSym.getPointer() == 0) {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym = MachineModuleInfoImpl::
-                   StubValueTy(AP.Mang->getSymbol(MO.getGlobal()),
+                   StubValueTy(AP.getSymbol(MO.getGlobal()),
                                !MO.getGlobal()->hasInternalLinkage());
     }
     return Sym;
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index adba613..19ccbfc 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -69,6 +69,7 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST)
   ImmToIdxMap[PPC::STH]  = PPC::STHX;   ImmToIdxMap[PPC::STW]  = PPC::STWX;
   ImmToIdxMap[PPC::STFS] = PPC::STFSX;  ImmToIdxMap[PPC::STFD] = PPC::STFDX;
   ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
+  ImmToIdxMap[PPC::LWA_32] = PPC::LWAX_32;
 
   // 64-bit
   ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
@@ -532,6 +533,7 @@ static bool usesIXAddr(const MachineInstr &MI) {
   default:
     return false;
   case PPC::LWA:
+  case PPC::LWA_32:
   case PPC::LD:
   case PPC::STD:
     return true;
@@ -689,14 +691,6 @@ unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
     return TFI->hasFP(MF) ? PPC::X31 : PPC::X1;
 }
 
-unsigned PPCRegisterInfo::getEHExceptionRegister() const {
-  return !Subtarget.isPPC64() ? PPC::R3 : PPC::X3;
-}
-
-unsigned PPCRegisterInfo::getEHHandlerRegister() const {
-  return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
-}
-
 unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
   if (!hasBasePointer(MF))
     return getFrameRegister(MF);
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index d02af9e..dd3bb40 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -97,10 +97,6 @@ public:
   bool hasBasePointer(const MachineFunction &MF) const;
   bool canRealignStack(const MachineFunction &MF) const;
   bool needsStackRealignment(const MachineFunction &MF) const;
-
-  // Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 660c0c3..92ba69c 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -108,6 +108,14 @@ def VecPerm      : InstrItinClass;
 def VecFPRound   : InstrItinClass;
 def VecVSL       : InstrItinClass;
 def VecVSR       : InstrItinClass;
+def SprMTMSRD    : InstrItinClass;
+def SprSLIE      : InstrItinClass;
+def SprSLBIE     : InstrItinClass;
+def SprSLBMTE    : InstrItinClass;
+def SprSLBMFEE   : InstrItinClass;
+def SprSLBIA     : InstrItinClass;
+def SprTLBIEL    : InstrItinClass;
+def SprTLBIE     : InstrItinClass;
 
 //===----------------------------------------------------------------------===//
 // Processor instruction itineraries.
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 8d5838e..1612cd2 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -14,39 +14,8 @@
 //===----------------------------------------------------------------------===//
 // Functional units on the PowerPC A2 chip sets
 //
-def IU0to3_0  : FuncUnit; // Fetch unit 1 to 4 slot 1
-def IU0to3_1  : FuncUnit; // Fetch unit 1 to 4 slot 2
-def IU0to3_2  : FuncUnit; // Fetch unit 1 to 4 slot 3
-def IU0to3_3  : FuncUnit; // Fetch unit 1 to 4 slot 4
-def IU4_0  : FuncUnit; // Instruction buffer slot 1
-def IU4_1  : FuncUnit; // Instruction buffer slot 2
-def IU4_2  : FuncUnit; // Instruction buffer slot 3
-def IU4_3  : FuncUnit; // Instruction buffer slot 4
-def IU4_4  : FuncUnit; // Instruction buffer slot 5
-def IU4_5  : FuncUnit; // Instruction buffer slot 6
-def IU4_6  : FuncUnit; // Instruction buffer slot 7
-def IU4_7  : FuncUnit; // Instruction buffer slot 8
-def IU5    : FuncUnit; // Dependency resolution
-def IU6    : FuncUnit; // Instruction issue
-def RF0    : FuncUnit;
-def XRF1   : FuncUnit;
-def XEX1   : FuncUnit; // Execution stage 1 for the XU pipeline
-def XEX2   : FuncUnit; // Execution stage 2 for the XU pipeline
-def XEX3   : FuncUnit; // Execution stage 3 for the XU pipeline
-def XEX4   : FuncUnit; // Execution stage 4 for the XU pipeline
-def XEX5   : FuncUnit; // Execution stage 5 for the XU pipeline
-def XEX6   : FuncUnit; // Execution stage 6 for the XU pipeline
-def FRF1   : FuncUnit;
-def FEX1   : FuncUnit; // Execution stage 1 for the FU pipeline
-def FEX2   : FuncUnit; // Execution stage 2 for the FU pipeline
-def FEX3   : FuncUnit; // Execution stage 3 for the FU pipeline
-def FEX4   : FuncUnit; // Execution stage 4 for the FU pipeline
-def FEX5   : FuncUnit; // Execution stage 5 for the FU pipeline
-def FEX6   : FuncUnit; // Execution stage 6 for the FU pipeline
-
-def CR_Bypass  : Bypass; // The bypass for condition regs.
-//def GPR_Bypass : Bypass; // The bypass for general-purpose regs.
-//def FPR_Bypass : Bypass; // The bypass for floating-point regs.
+def XU     : FuncUnit; // XU pipeline
+def FU     : FuncUnit; // FI pipeline
 
 //
 // This file defines the itinerary class data for the PPC A2 processor.
@@ -55,699 +24,119 @@ def CR_Bypass  : Bypass; // The bypass for condition regs.
 
 
 def PPCA2Itineraries : ProcessorItineraries<
-  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3,
-   IU4_0, IU4_1, IU4_2, IU4_3, IU4_4, IU4_5, IU4_6, IU4_7,
-   IU5, IU6, RF0, XRF1, XEX1, XEX2, XEX3, XEX4, XEX5, XEX6,
-   FRF1, FEX1, FEX2, FEX3, FEX4, FEX5, FEX6],
-  [CR_Bypass, GPR_Bypass, FPR_Bypass], [
-  InstrItinData<IntSimple   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntGeneral  , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntCompare  , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntDivW     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<38, [XEX6]>],
-                              [53, 7, 7],
-                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntMFFS     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntMTFSB0   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7], 
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntMulHW    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntMulHWU   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntMulLI    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntRotate   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntRotateD  , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntRotateDI , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                              
-  InstrItinData<IntShift    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntTrapW    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7], 
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntTrapD    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7], 
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<BrB         , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<BrCR        , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [CR_Bypass, CR_Bypass, CR_Bypass]>,
-  InstrItinData<BrMCR       , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [CR_Bypass, CR_Bypass, CR_Bypass]>,
-  InstrItinData<BrMCRX      , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStDCBA    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 11],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStDCBF    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 11],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStDCBI    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 11],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStLoad    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStLoadUpd , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [GPR_Bypass, GPR_Bypass]>,                              
-  InstrItinData<LdStLDU     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStStore   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 7],
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStStoreUpd, [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 7],
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStICBI    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStSTFD    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<LdStSTFDU   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [NoBypass, FPR_Bypass, FPR_Bypass]>,                              
-  InstrItinData<LdStLFD     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStLFDU    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStLHA     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStLHAU    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStLMW     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStLWARX   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [26, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStSTD     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 7],
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStSTDU    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 7],
-                              [GPR_Bypass, GPR_Bypass]>,                              
-  InstrItinData<LdStSTDCX   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [26, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStSTWCX   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [26, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStSync    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<12, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>]>,
-  InstrItinData<SprISYNC    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>]>,
-  InstrItinData<SprMFSR     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [GPR_Bypass, NoBypass]>,
-  InstrItinData<SprMTMSR    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprMTSR     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprTLBSYNC  , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>]>,
-  InstrItinData<SprMFCR     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7], 
-                              [GPR_Bypass, CR_Bypass]>,
-  InstrItinData<SprMFMSR    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [GPR_Bypass, NoBypass]>,
-  InstrItinData<SprMFSPR    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprMFTB     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
-                              [29, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprMTSPR    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprMTSRIN   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
-                              [29, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprRFI      , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
-                              [29, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprSC       , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
-                              [29, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<FPGeneral   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
-                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
-                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
-                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
-                              [15, 7, 7],
-                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPAddSub    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
-                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
-                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
-                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
-                              [15, 7, 7],
-                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPCompare   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
-                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
-                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
-                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
-                              [13, 7, 7],
-                              [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPDivD      , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<71, [FRF1], 0>,
-                               InstrStage<71, [FEX1], 0>,
-                                  InstrStage<71, [FEX2], 0>,
-                               InstrStage<71, [FEX3], 0>,
-                                  InstrStage<71, [FEX4], 0>,
-                               InstrStage<71, [FEX5], 0>,
-                                  InstrStage<71, [FEX6]>],
-                              [86, 7, 7],
-                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPDivS      , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<58, [FRF1], 0>,
-                               InstrStage<58, [FEX1], 0>,
-                                  InstrStage<58, [FEX2], 0>,
-                               InstrStage<58, [FEX3], 0>,
-                                  InstrStage<58, [FEX4], 0>,
-                               InstrStage<58, [FEX5], 0>,
-                                  InstrStage<58, [FEX6]>],
-                              [73, 7, 7],
-                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPSqrt      , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<68, [FRF1], 0>,
-                               InstrStage<68, [FEX1], 0>,
-                                  InstrStage<68, [FEX2], 0>,
-                               InstrStage<68, [FEX3], 0>,
-                                  InstrStage<68, [FEX4], 0>,
-                               InstrStage<68, [FEX5], 0>,
-                                  InstrStage<68, [FEX6]>],
-                              [86, 7], // FIXME: should be [86, 7] for double
-                                       // and [82, 7] for single. Likewise,
-                                       // the FEX? cycle count should be 68
-                                       // for double and 64 for single.
-                              [NoBypass, FPR_Bypass]>,
-  InstrItinData<FPFused     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
-                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
-                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
-                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
-                              [15, 7, 7, 7],
-                              [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPRes       , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
-                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
-                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
-                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
-                              [15, 7],
-                              [FPR_Bypass, FPR_Bypass]>
+  [XU, FU], [], [
+  InstrItinData<IntSimple   , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<IntGeneral  , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntDivW     , [InstrStage<1, [XU]>],
+                              [39, 1, 1]>,
+  InstrItinData<IntDivD     , [InstrStage<1, [XU]>],
+                              [71, 1, 1]>,
+  InstrItinData<IntMulHW    , [InstrStage<1, [XU]>],
+                              [5, 1, 1]>,
+  InstrItinData<IntMulHWU   , [InstrStage<1, [XU]>],
+                              [5, 1, 1]>,
+  InstrItinData<IntMulLI    , [InstrStage<1, [XU]>],
+                              [6, 1, 1]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntRotateD  , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntRotateDI , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntShift    , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntTrapW    , [InstrStage<1, [XU]>],
+                              [2, 1]>,
+  InstrItinData<IntTrapD    , [InstrStage<1, [XU]>],
+                              [2, 1]>,
+  InstrItinData<BrB         , [InstrStage<1, [XU]>],
+                              [6, 1, 1]>,
+  InstrItinData<BrCR        , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [XU]>],
+                              [5, 1, 1]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStDCBA    , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStDCBF    , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStDCBI    , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStLoad    , [InstrStage<1, [XU]>],
+                              [6, 1, 1]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<1, [XU]>],
+                              [6, 8, 1, 1]>,
+  InstrItinData<LdStLDU     , [InstrStage<1, [XU]>],
+                              [6, 1, 1]>,
+  InstrItinData<LdStStore   , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<1, [XU]>],
+                              [2, 1, 1, 1]>,
+  InstrItinData<LdStICBI,     [InstrStage<1, [XU]>],
+                              [16, 1, 1]>,
+  InstrItinData<LdStSTFD    , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<1, [XU]>],
+                              [2, 1, 1, 1]>,
+  InstrItinData<LdStLFD     , [InstrStage<1, [XU]>],
+                              [7, 1, 1]>,
+  InstrItinData<LdStLFDU    , [InstrStage<1, [XU]>],
+                              [7, 9, 1, 1]>,
+  InstrItinData<LdStLHA     , [InstrStage<1, [XU]>],
+                              [6, 1, 1]>,
+  InstrItinData<LdStLHAU    , [InstrStage<1, [XU]>],
+                              [6, 8, 1, 1]>,
+  InstrItinData<LdStLWARX   , [InstrStage<1, [XU]>],
+                              [82, 1, 1]>, // L2 latency
+  InstrItinData<LdStSTD     , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStSTDU    , [InstrStage<1, [XU]>],
+                              [2, 1, 1, 1]>,
+  InstrItinData<LdStSTDCX   , [InstrStage<1, [XU]>],
+                              [82, 1, 1]>, // L2 latency
+  InstrItinData<LdStSTWCX   , [InstrStage<1, [XU]>],
+                              [82, 1, 1]>, // L2 latency
+  InstrItinData<LdStSync    , [InstrStage<1, [XU]>],
+                              [6]>,
+  InstrItinData<SprISYNC    , [InstrStage<1, [XU]>],
+                              [16]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [XU]>],
+                              [16, 1]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [XU]>],
+                              [6, 1]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [XU]>],
+                              [4, 1]>,
+  InstrItinData<SprMFSPR    , [InstrStage<1, [XU]>],
+                              [6, 1]>,
+  InstrItinData<SprMFTB     , [InstrStage<1, [XU]>],
+                              [4, 1]>,
+  InstrItinData<SprMTSPR    , [InstrStage<1, [XU]>],
+                              [6, 1]>,
+  InstrItinData<SprRFI      , [InstrStage<1, [XU]>],
+                              [16]>,
+  InstrItinData<SprSC       , [InstrStage<1, [XU]>],
+                              [16]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [FU]>],
+                              [6, 1, 1]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [FU]>],
+                              [6, 1, 1]>,
+  InstrItinData<FPCompare   , [InstrStage<1, [FU]>],
+                              [5, 1, 1]>,
+  InstrItinData<FPDivD      , [InstrStage<1, [FU]>],
+                              [72, 1, 1]>,
+  InstrItinData<FPDivS      , [InstrStage<1, [FU]>],
+                              [59, 1, 1]>,
+  InstrItinData<FPSqrt      , [InstrStage<1, [FU]>],
+                              [69, 1, 1]>,
+  InstrItinData<FPFused     , [InstrStage<1, [FU]>],
+                              [6, 1, 1, 1]>,
+  InstrItinData<FPRes       , [InstrStage<1, [FU]>],
+                              [6, 1]>
 ]>;
 
 // ===---------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
index 9bb779a..c189b9e 100644
--- a/lib/Target/PowerPC/PPCScheduleE500mc.td
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -36,6 +36,8 @@ def CFX_0 : FuncUnit; // CFX pipeline
 def LSU_0 : FuncUnit; // LSU pipeline
 def FPU_0 : FuncUnit; // FPU pipeline
 
+def CR_Bypass : Bypass;
+
 def PPCE500mcItineraries : ProcessorItineraries<
   [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, LSU_0, FPU_0],
   [CR_Bypass, GPR_Bypass, FPR_Bypass], [
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
index d7e11ac..7a24d20 100644
--- a/lib/Target/PowerPC/PPCScheduleE5500.td
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -39,6 +39,7 @@ def CFX_1 : FuncUnit; // CFX pipeline stage 1
 // def LSU_0 : FuncUnit; // LSU pipeline
 // def FPU_0 : FuncUnit; // FPU pipeline
 
+// def CR_Bypass : Bypass;
 
 def PPCE5500Itineraries : ProcessorItineraries<
   [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, CFX_1,
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 12d0326..7231ab1 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -15,6 +15,7 @@
 #include "PPC.h"
 #include "PPCRegisterInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Function.h"
@@ -74,6 +75,7 @@ void PPCSubtarget::initializeEnvironment() {
   Use64BitRegs = false;
   HasAltivec = false;
   HasQPX = false;
+  HasFCPSGN = false;
   HasFSQRT = false;
   HasFRE = false;
   HasFRES = false;
@@ -88,6 +90,8 @@ void PPCSubtarget::initializeEnvironment() {
   HasPOPCNTD = false;
   HasLDBRX = false;
   IsBookE = false;
+  DeprecatedMFTB = false;
+  DeprecatedDST = false;
   HasLazyResolverStubs = false;
   IsJITCodeModel = false;
 }
@@ -163,14 +167,7 @@ bool PPCSubtarget::enablePostRAScheduler(
            CodeGenOpt::Level OptLevel,
            TargetSubtargetInfo::AntiDepBreakMode& Mode,
            RegClassVector& CriticalPathRCs) const {
-  // FIXME: It would be best to use TargetSubtargetInfo::ANTIDEP_ALL here,
-  // but we can't because we can't reassign the cr registers. There is a
-  // dependence between the cr register and the RLWINM instruction used
-  // to extract its value which the anti-dependency breaker can't currently
-  // see. Maybe we should make a late-expanded pseudo to encode this dependency.
-  // (the relevant code is in PPCDAGToDAGISel::SelectSETCC)
-
-  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  Mode = TargetSubtargetInfo::ANTIDEP_ALL;
 
   CriticalPathRCs.clear();
 
@@ -179,9 +176,44 @@ bool PPCSubtarget::enablePostRAScheduler(
   else
     CriticalPathRCs.push_back(&PPC::GPRCRegClass);
     
-  CriticalPathRCs.push_back(&PPC::F8RCRegClass);
-  CriticalPathRCs.push_back(&PPC::VRRCRegClass);
-
   return OptLevel >= CodeGenOpt::Default;
 }
 
+// Embedded cores need aggressive scheduling.
+static bool needsAggressiveScheduling(unsigned Directive) {
+  switch (Directive) {
+  default: return false;
+  case PPC::DIR_440:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+    return true;
+  }
+}
+
+bool PPCSubtarget::enableMachineScheduler() const {
+  // Enable MI scheduling for the embedded cores.
+  // FIXME: Enable this for all cores (some additional modeling
+  // may be necessary).
+  return needsAggressiveScheduling(DarwinDirective);
+}
+
+void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                       MachineInstr *begin,
+                                       MachineInstr *end,
+                                       unsigned NumRegionInstrs) const {
+  if (needsAggressiveScheduling(DarwinDirective)) {
+    Policy.OnlyTopDown = false;
+    Policy.OnlyBottomUp = false;
+  }
+
+  // Spilling is generally expensive on all PPC cores, so always enable
+  // register-pressure tracking.
+  Policy.ShouldTrackPressure = true;
+}
+
+bool PPCSubtarget::useAA() const {
+  // Use AA during code generation for the embedded cores.
+  return needsAggressiveScheduling(DarwinDirective);
+}
+
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 3f3fc0e..c863a6e 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -76,6 +76,8 @@ protected:
   bool IsPPC64;
   bool HasAltivec;
   bool HasQPX;
+  bool HasVSX;
+  bool HasFCPSGN;
   bool HasFSQRT;
   bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
   bool HasRecipPrec;
@@ -87,6 +89,8 @@ protected:
   bool HasPOPCNTD;
   bool HasLDBRX;
   bool IsBookE;
+  bool DeprecatedMFTB;
+  bool DeprecatedDST;
   bool HasLazyResolverStubs;
   bool IsJITCodeModel;
   bool IsLittleEndian;
@@ -171,6 +175,7 @@ public:
   bool isLittleEndian() const { return IsLittleEndian; }
 
   // Specific obvious features.
+  bool hasFCPSGN() const { return HasFCPSGN; }
   bool hasFSQRT() const { return HasFSQRT; }
   bool hasFRE() const { return HasFRE; }
   bool hasFRES() const { return HasFRES; }
@@ -188,6 +193,8 @@ public:
   bool hasPOPCNTD() const { return HasPOPCNTD; }
   bool hasLDBRX() const { return HasLDBRX; }
   bool isBookE() const { return IsBookE; }
+  bool isDeprecatedMFTB() const { return DeprecatedMFTB; }
+  bool isDeprecatedDST() const { return DeprecatedDST; }
 
   const Triple &getTargetTriple() const { return TargetTriple; }
 
@@ -205,6 +212,14 @@ public:
   bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
                              TargetSubtargetInfo::AntiDepBreakMode& Mode,
                              RegClassVector& CriticalPathRCs) const;
+
+  // Scheduling customization.
+  bool enableMachineScheduler() const;
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           MachineInstr *begin,
+                           MachineInstr *end,
+                           unsigned NumRegionInstrs) const;
+  bool useAA() const;
 };
 } // End llvm namespace
 
diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h
new file mode 100644
index 0000000..e876be1
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -0,0 +1,23 @@
+//===-- PPCTargetStreamer.h - PPC Target Streamer --s-----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCTARGETSTREAMER_H
+#define PPCTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class PPCTargetStreamer : public MCTargetStreamer {
+public:
+  virtual ~PPCTargetStreamer();
+  virtual void emitTCEntry(const MCSymbol &S) = 0;
+};
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2504ba7..8879630 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -77,6 +77,7 @@ public:
   /// \name Scalar TTI Implementations
   /// @{
   virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+  virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
 
   /// @}
 
@@ -129,6 +130,14 @@ PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
   return PSK_Software;
 }
 
+void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+  if (ST->getDarwinDirective() == PPC::DIR_A2) {
+    // The A2 is in-order with a deep pipeline, and concatenation unrolling
+    // helps expose latency-hiding opportunities to the instruction scheduler.
+    UP.Partial = UP.Runtime = true;
+  }
+}
+
 unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
   if (Vector && !ST->hasAltivec())
     return 0;
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 6b374cb..025b28e 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -29,11 +29,13 @@ FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
 FunctionPass *createR600TextureIntrinsicsReplacer();
 FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
 FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
+FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
 FunctionPass *createR600Packetizer(TargetMachine &tm);
 FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
 FunctionPass *createAMDGPUCFGStructurizerPass(TargetMachine &tm);
 
 // SI Passes
+FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
@@ -43,7 +45,6 @@ FunctionPass *createSIInsertWaits(TargetMachine &tm);
 // Passes common to R600 and SI
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
-FunctionPass *createAMDGPUIndirectAddressingPass(TargetMachine &tm);
 FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
 
 /// \brief Creates an AMDGPU-specific Target Transformation Info pass.
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 0048e25..182235b 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -21,8 +21,18 @@ def FeatureDumpCode : SubtargetFeature <"DumpCode",
         "true",
         "Dump MachineInstrs in the CodeEmitter">;
 
+def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer",
+        "EnableIRStructurizer",
+        "false",
+        "Disable IR Structurizer">;
+
 // Target features
 
+def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
+        "EnableIfCvt",
+        "false",
+        "Disable the if conversion pass">;
+
 def FeatureFP64     : SubtargetFeature<"fp64",
         "FP64",
         "true",
@@ -82,6 +92,8 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
 def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
         [Feature64BitPtr, FeatureFP64]>;
 
+def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
+        [Feature64BitPtr, FeatureFP64]>;
 //===----------------------------------------------------------------------===//
 
 def AMDGPUInstrInfo : InstrInfo {
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index e039b77..67bdba2 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -45,32 +45,60 @@ extern "C" void LLVMInitializeR600AsmPrinter() {
   TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
 }
 
+AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer)
+{
+  DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode() &&
+                  ! Streamer.hasRawTextSupport();
+}
+
 /// We need to override this function so we can avoid
 /// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle.
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
-  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
-  if (STM.dumpCode()) {
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-    MF.dump();
-#endif
-  }
   SetupMachineFunction(MF);
   if (OutStreamer.hasRawTextSupport()) {
     OutStreamer.EmitRawText("@" + MF.getName() + ":");
   }
 
-  const MCSectionELF *ConfigSection = getObjFileLowering().getContext()
-                                              .getELFSection(".AMDGPU.config",
+  MCContext &Context = getObjFileLowering().getContext();
+  const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
                                               ELF::SHT_PROGBITS, 0,
                                               SectionKind::getReadOnly());
   OutStreamer.SwitchSection(ConfigSection);
+  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
   if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
     EmitProgramInfoSI(MF);
   } else {
     EmitProgramInfoR600(MF);
   }
+
+  DisasmLines.clear();
+  HexLines.clear();
+  DisasmLineMaxLen = 0;
+
   OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
   EmitFunctionBody();
+
+  if (STM.dumpCode()) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    MF.dump();
+#endif
+
+    if (DisasmEnabled) {
+      OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
+                                                  ELF::SHT_NOTE, 0,
+                                                  SectionKind::getReadOnly()));
+
+      for (size_t i = 0; i < DisasmLines.size(); ++i) {
+        std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+        Comment += " ; " + HexLines[i] + "\n";
+
+        OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
+        OutStreamer.EmitBytes(StringRef(Comment));
+      }
+    }
+  }
+
   return false;
 }
 
@@ -139,6 +167,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
 }
 
 void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
+  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
   unsigned MaxSGPR = 0;
   unsigned MaxVGPR = 0;
   bool VCCUsed = false;
@@ -154,7 +183,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
 
       unsigned numOperands = MI.getNumOperands();
       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
-        MachineOperand & MO = MI.getOperand(op_idx);
+        MachineOperand &MO = MI.getOperand(op_idx);
         unsigned maxUsed;
         unsigned width = 0;
         bool isSGPR = false;
@@ -168,8 +197,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
           VCCUsed = true;
           continue;
         }
+
         switch (reg) {
         default: break;
+        case AMDGPU::SCC:
         case AMDGPU::EXEC:
         case AMDGPU::M0:
           continue;
@@ -202,6 +233,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
         } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
           isSGPR = false;
           width = 8;
+        } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 16;
         } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
           isSGPR = false;
           width = 16;
@@ -234,13 +268,24 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
   OutStreamer.EmitIntValue(RsrcReg, 4);
   OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4);
 
+  unsigned LDSAlignShift;
+  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+    // LDS is allocated in 64 dword blocks
+    LDSAlignShift = 8;
+  } else {
+    // LDS is allocated in 128 dword blocks
+    LDSAlignShift = 9;
+  }
+  unsigned LDSBlocks =
+          RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+
   if (MFI->ShaderType == ShaderType::COMPUTE) {
     OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
-    OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(RoundUpToAlignment(MFI->LDSSize, 256) >> 8), 4);
+    OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
   }
   if (MFI->ShaderType == ShaderType::PIXEL) {
     OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
-    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(RoundUpToAlignment(MFI->LDSSize, 256) >> 8), 4);
+    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
     OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
     OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
   }
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index f425ef4..05dc9bb 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===//
+//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,14 +16,15 @@
 #define AMDGPU_ASMPRINTER_H
 
 #include "llvm/CodeGen/AsmPrinter.h"
+#include <string>
+#include <vector>
 
 namespace llvm {
 
 class AMDGPUAsmPrinter : public AsmPrinter {
 
 public:
-  explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer) { }
+  explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
 
   virtual bool runOnMachineFunction(MachineFunction &MF);
 
@@ -38,6 +39,11 @@ public:
 
   /// Implemented in AMDGPUMCInstLower.cpp
   virtual void EmitInstruction(const MachineInstr *MI);
+
+protected:
+  bool DisasmEnabled;
+  std::vector<std::string> DisasmLines, HexLines;
+  size_t DisasmLineMaxLen;
 };
 
 } // End anonymous llvm
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index fc95d58..65cdb24 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -19,12 +19,13 @@ def CC_SI : CallingConv<[
 
   CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
-    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+    SGPR16
   ]>>>,
 
   CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
     [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
-    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR12, SGPR15 ]
+    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
   >>>,
 
   CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
@@ -32,21 +33,33 @@ def CC_SI : CallingConv<[
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
     VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
-  ]>>>
+  ]>>>,
+
+  CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
+    [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
+  >>>
 
 ]>;
 
+// Calling convention for R600
+def CC_R600 : CallingConv<[
+  CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
+    T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
+    T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
+    T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
+    T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
+    T30_XYZW, T31_XYZW, T32_XYZW
+  ]>>>
+]>;
+
 // Calling convention for compute kernels
 def CC_AMDGPU_Kernel : CallingConv<[
-  CCIfType<[v4i32, v4f32],               CCAssignToStack <16, 16>>,
-  CCIfType<[i64, f64, v2f32, v2i32],     CCAssignToStack < 8, 8>>,
-  CCIfType<[i32, f32],                   CCAssignToStack < 4, 4>>,
-  CCIfType<[i16],                        CCAssignToStack < 2, 4>>,
-  CCIfType<[i8],                         CCAssignToStack < 1, 4>>
+  CCCustom<"allocateStack">
 ]>;
 
 def CC_AMDGPU : CallingConv<[
-  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() == "
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() >= "
        "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
        "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()->"#
        "ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
@@ -55,5 +68,7 @@ def CC_AMDGPU : CallingConv<[
        "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()->"
        "ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
   CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
-       ".getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>
+       ".getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>,
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
+       ".getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_R600>>
 ]>;
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index f222901..a989135 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -77,6 +77,7 @@ private:
   bool isLocalLoad(const LoadSDNode *N) const;
   bool isRegionLoad(const LoadSDNode *N) const;
 
+  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
   bool SelectGlobalValueVariableOffset(SDValue Addr,
       SDValue &BaseReg, SDValue& Offset);
@@ -102,6 +103,37 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
 AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
 }
 
+/// \brief Determine the register class for \p OpNo
+/// \returns The register class of the virtual register that will be used for
+/// the given operand number \OpNo or NULL if the register class cannot be
+/// determined.
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
+                                                          unsigned OpNo) const {
+  if (!N->isMachineOpcode()) {
+    return NULL;
+  }
+  switch (N->getMachineOpcode()) {
+  default: {
+    const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode());
+    unsigned OpIdx = Desc.getNumDefs() + OpNo;
+    if (OpIdx >= Desc.getNumOperands())
+      return NULL;
+    int RegClass = Desc.OpInfo[OpIdx].RegClass;
+    if (RegClass == -1) {
+      return NULL;
+    }
+    return TM.getRegisterInfo()->getRegClass(RegClass);
+  }
+  case AMDGPU::REG_SEQUENCE: {
+    const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(
+                      cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+    unsigned SubRegIdx =
+            dyn_cast<ConstantSDNode>(N->getOperand(OpNo + 1))->getZExtValue();
+    return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx);
+  }
+  }
+}
+
 SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
   return CurDAG->getTargetConstant(Imm, MVT::i32);
 }
@@ -161,130 +193,94 @@ bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
 }
 
 SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
-  const R600InstrInfo *TII =
-                      static_cast<const R600InstrInfo*>(TM.getInstrInfo());
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL;   // Already selected.
   }
   switch (Opc) {
   default: break;
-  case AMDGPUISD::CONST_ADDRESS: {
-    for (SDNode::use_iterator I = N->use_begin(), Next = llvm::next(I);
-                              I != SDNode::use_end(); I = Next) {
-      Next = llvm::next(I);
-      if (!I->isMachineOpcode()) {
-        continue;
-      }
-      unsigned Opcode = I->getMachineOpcode();
-      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
-      int SrcIdx = I.getOperandNo();
-      int SelIdx;
-      // Unlike MachineInstrs, SDNodes do not have results in their operand
-      // list, so we need to increment the SrcIdx, since
-      // R600InstrInfo::getOperandIdx is based on the MachineInstr indices.
-      if (HasDst) {
-        SrcIdx++;
-      }
-
-      SelIdx = TII->getSelIdx(I->getMachineOpcode(), SrcIdx);
-      if (SelIdx < 0) {
-        continue;
-      }
-
-      SDValue CstOffset;
-      if (N->getValueType(0).isVector() ||
-          !SelectGlobalValueConstantOffset(N->getOperand(0), CstOffset))
-        continue;
-
-      // Gather constants values
-      int SrcIndices[] = {
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
-        TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
-      };
-      std::vector<unsigned> Consts;
-      for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
-        int OtherSrcIdx = SrcIndices[i];
-        int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
-        if (OtherSrcIdx < 0 || OtherSelIdx < 0) {
+  case ISD::BUILD_VECTOR: {
+    unsigned RegClassID;
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    const AMDGPURegisterInfo *TRI =
+                   static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
+    const SIRegisterInfo *SIRI =
+                   static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+    EVT VT = N->getValueType(0);
+    unsigned NumVectorElts = VT.getVectorNumElements();
+    assert(VT.getVectorElementType().bitsEq(MVT::i32));
+    if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      bool UseVReg = true;
+      for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+                                                    U != E; ++U) {
+        if (!U->isMachineOpcode()) {
           continue;
         }
-        if (HasDst) {
-          OtherSrcIdx--;
-          OtherSelIdx--;
+        const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+        if (!RC) {
+          continue;
         }
-        if (RegisterSDNode *Reg =
-                         dyn_cast<RegisterSDNode>(I->getOperand(OtherSrcIdx))) {
-          if (Reg->getReg() == AMDGPU::ALU_CONST) {
-            ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(I->getOperand(OtherSelIdx));
-            Consts.push_back(Cst->getZExtValue());
-          }
+        if (SIRI->isSGPRClass(RC)) {
+          UseVReg = false;
         }
       }
-
-      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
-      Consts.push_back(Cst->getZExtValue());
-      if (!TII->fitsConstReadLimitations(Consts))
-        continue;
-
-      // Convert back to SDNode indices
-      if (HasDst) {
-        SrcIdx--;
-        SelIdx--;
+      switch(NumVectorElts) {
+      case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
+                                     AMDGPU::SReg_32RegClassID;
+        break;
+      case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
+                                     AMDGPU::SReg_64RegClassID;
+        break;
+      case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
+                                     AMDGPU::SReg_128RegClassID;
+        break;
+      case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
+                                     AMDGPU::SReg_256RegClassID;
+        break;
+      case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
+                                      AMDGPU::SReg_512RegClassID;
+        break;
+      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
       }
-      std::vector<SDValue> Ops;
-      for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
-        if (i == SrcIdx) {
-          Ops.push_back(CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32));
-        } else if (i == SelIdx) {
-          Ops.push_back(CstOffset);
-        } else {
-          Ops.push_back(I->getOperand(i));
-        }
+    } else {
+      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+      // that adds a 128 bits reg copy when going through TwoAddressInstructions
+      // pass. We want to avoid 128 bits copies as much as possible because they
+      // can't be bundled by our scheduler.
+      switch(NumVectorElts) {
+      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+      case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
+      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
       }
-      CurDAG->UpdateNodeOperands(*I, Ops.data(), Ops.size());
-    }
-    break;
-  }
-  case ISD::BUILD_VECTOR: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
-      break;
     }
 
-    unsigned RegClassID;
-    switch(N->getValueType(0).getVectorNumElements()) {
-    case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
-    case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
-    default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+
+    if (NumVectorElts == 1) {
+      return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
+                                  VT.getVectorElementType(),
+                                  N->getOperand(0), RegClass);
     }
-    // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
-    // that adds a 128 bits reg copy when going through TwoAddressInstructions
-    // pass. We want to avoid 128 bits copies as much as possible because they
-    // can't be bundled by our scheduler.
-    SDValue RegSeqArgs[9] = {
-      CurDAG->getTargetConstant(RegClassID, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
-    };
+
+    assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+                                  "supported yet");
+    // 16 = Max Num Vector Elements
+    // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+    // 1 = Vector Register Class
+    SDValue RegSeqArgs[16 * 2 + 1];
+
+    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
     bool IsRegSeq = true;
     for (unsigned i = 0; i < N->getNumOperands(); i++) {
+      // XXX: Why is this here?
       if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
         IsRegSeq = false;
         break;
       }
-      RegSeqArgs[2 * i + 1] = N->getOperand(i);
+      RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+      RegSeqArgs[1 + (2 * i) + 1] =
+              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
     }
     if (!IsRegSeq)
       break;
@@ -313,285 +309,44 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
                                   SDLoc(N), N->getValueType(0), Ops);
   }
-
-  case ISD::ConstantFP:
-  case ISD::Constant: {
+  case AMDGPUISD::REGISTER_LOAD: {
     const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    // XXX: Custom immediate lowering not implemented yet.  Instead we use
-    // pseudo instructions defined in SIInstructions.td
-    if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
-    }
-
-    uint64_t ImmValue = 0;
-    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
-
-    if (N->getOpcode() == ISD::ConstantFP) {
-      // XXX: 64-bit Immediates not supported yet
-      assert(N->getValueType(0) != MVT::f64);
-
-      ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
-      APFloat Value = C->getValueAPF();
-      float FloatValue = Value.convertToFloat();
-      if (FloatValue == 0.0) {
-        ImmReg = AMDGPU::ZERO;
-      } else if (FloatValue == 0.5) {
-        ImmReg = AMDGPU::HALF;
-      } else if (FloatValue == 1.0) {
-        ImmReg = AMDGPU::ONE;
-      } else {
-        ImmValue = Value.bitcastToAPInt().getZExtValue();
-      }
-    } else {
-      // XXX: 64-bit Immediates not supported yet
-      assert(N->getValueType(0) != MVT::i64);
-
-      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
-      if (C->getZExtValue() == 0) {
-        ImmReg = AMDGPU::ZERO;
-      } else if (C->getZExtValue() == 1) {
-        ImmReg = AMDGPU::ONE_INT;
-      } else {
-        ImmValue = C->getZExtValue();
-      }
-    }
-
-    for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
-                              Use != SDNode::use_end(); Use = Next) {
-      Next = llvm::next(Use);
-      std::vector<SDValue> Ops;
-      for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
-        Ops.push_back(Use->getOperand(i));
-      }
-
-      if (!Use->isMachineOpcode()) {
-          if (ImmReg == AMDGPU::ALU_LITERAL_X) {
-            // We can only use literal constants (e.g. AMDGPU::ZERO,
-            // AMDGPU::ONE, etc) in machine opcodes.
-            continue;
-          }
-      } else {
-        if (!TII->isALUInstr(Use->getMachineOpcode()) ||
-            (TII->get(Use->getMachineOpcode()).TSFlags &
-            R600_InstFlag::VECTOR)) {
-          continue;
-        }
-
-        int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
-                                        AMDGPU::OpName::literal);
-        if (ImmIdx == -1) {
-          continue;
-        }
-
-        if (TII->getOperandIdx(Use->getMachineOpcode(),
-                               AMDGPU::OpName::dst) != -1) {
-          // subtract one from ImmIdx, because the DST operand is usually index
-          // 0 for MachineInstrs, but we have no DST in the Ops vector.
-          ImmIdx--;
-        }
-
-        // Check that we aren't already using an immediate.
-        // XXX: It's possible for an instruction to have more than one
-        // immediate operand, but this is not supported yet.
-        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
-          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
-          assert(C);
-
-          if (C->getZExtValue() != 0) {
-            // This instruction is already using an immediate.
-            continue;
-          }
-
-          // Set the immediate value
-          Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
-        }
-      }
-      // Set the immediate register
-      Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
-
-      CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
-    }
-    break;
-  }
-  }
-  SDNode *Result = SelectCode(N);
-
-  // Fold operands of selected node
-
-  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
-    const R600InstrInfo *TII =
-        static_cast<const R600InstrInfo*>(TM.getInstrInfo());
-    if (Result && Result->isMachineOpcode() && Result->getMachineOpcode() == AMDGPU::DOT_4) {
-      bool IsModified = false;
-      do {
-        std::vector<SDValue> Ops;
-        for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
-            I != E; ++I)
-          Ops.push_back(*I);
-        IsModified = FoldDotOperands(Result->getMachineOpcode(), TII, Ops);
-        if (IsModified) {
-          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
-        }
-      } while (IsModified);
-
-    }
-    if (Result && Result->isMachineOpcode() &&
-        !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
-        && TII->hasInstrModifiers(Result->getMachineOpcode())) {
-      // Fold FNEG/FABS
-      // TODO: Isel can generate multiple MachineInst, we need to recursively
-      // parse Result
-      bool IsModified = false;
-      do {
-        std::vector<SDValue> Ops;
-        for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
-            I != E; ++I)
-          Ops.push_back(*I);
-        IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
-        if (IsModified) {
-          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
-        }
-      } while (IsModified);
-
-      // If node has a single use which is CLAMP_R600, folds it
-      if (Result->hasOneUse() && Result->isMachineOpcode()) {
-        SDNode *PotentialClamp = *Result->use_begin();
-        if (PotentialClamp->isMachineOpcode() &&
-            PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
-          unsigned ClampIdx =
-            TII->getOperandIdx(Result->getMachineOpcode(), AMDGPU::OpName::clamp);
-          std::vector<SDValue> Ops;
-          unsigned NumOp = Result->getNumOperands();
-          for (unsigned i = 0; i < NumOp; ++i) {
-            Ops.push_back(Result->getOperand(i));
-          }
-          Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
-          Result = CurDAG->SelectNodeTo(PotentialClamp,
-              Result->getMachineOpcode(), PotentialClamp->getVTList(),
-              Ops.data(), NumOp);
-        }
-      }
-    }
+    SDValue Addr, Offset;
+
+    SelectADDRIndirect(N->getOperand(1), Addr, Offset);
+    const SDValue Ops[] = {
+      Addr,
+      Offset,
+      CurDAG->getTargetConstant(0, MVT::i32),
+      N->getOperand(0),
+    };
+    return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, SDLoc(N),
+                                  CurDAG->getVTList(MVT::i32, MVT::i64, MVT::Other),
+                                  Ops);
   }
-
-  return Result;
-}
-
-bool AMDGPUDAGToDAGISel::FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg,
-                                     SDValue &Abs, const R600InstrInfo *TII) {
-  switch (Src.getOpcode()) {
-  case ISD::FNEG:
-    Src = Src.getOperand(0);
-    Neg = CurDAG->getTargetConstant(1, MVT::i32);
-    return true;
-  case ISD::FABS:
-    if (!Abs.getNode())
-      return false;
-    Src = Src.getOperand(0);
-    Abs = CurDAG->getTargetConstant(1, MVT::i32);
-    return true;
-  case ISD::BITCAST:
-    Src = Src.getOperand(0);
-    return true;
-  default:
-    return false;
+  case AMDGPUISD::REGISTER_STORE: {
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+      break;
+    SDValue Addr, Offset;
+    SelectADDRIndirect(N->getOperand(2), Addr, Offset);
+    const SDValue Ops[] = {
+      N->getOperand(1),
+      Addr,
+      Offset,
+      CurDAG->getTargetConstant(0, MVT::i32),
+      N->getOperand(0),
+    };
+    return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, SDLoc(N),
+                                        CurDAG->getVTList(MVT::Other),
+                                        Ops);
   }
-}
-
-bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
-    const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
-  int OperandIdx[] = {
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
-  };
-  int SelIdx[] = {
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_sel)
-  };
-  int NegIdx[] = {
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
-  };
-  int AbsIdx[] = {
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
-    -1
-  };
-
-
-  for (unsigned i = 0; i < 3; i++) {
-    if (OperandIdx[i] < 0)
-      return false;
-    SDValue &Src = Ops[OperandIdx[i] - 1];
-    SDValue &Sel = Ops[SelIdx[i] - 1];
-    SDValue &Neg = Ops[NegIdx[i] - 1];
-    SDValue FakeAbs;
-    SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
-    if (FoldOperand(Src, Sel, Neg, Abs, TII))
-      return true;
   }
-  return false;
+  return SelectCode(N);
 }
 
-bool AMDGPUDAGToDAGISel::FoldDotOperands(unsigned Opcode,
-    const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
-  int OperandIdx[] = {
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
-  };
-  int SelIdx[] = {
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_X),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_Y),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_Z),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_W),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_X),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_Y),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_Z),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_W)
-  };
-  int NegIdx[] = {
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
-  };
-  int AbsIdx[] = {
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
-    TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
-  };
-
-  for (unsigned i = 0; i < 8; i++) {
-    if (OperandIdx[i] < 0)
-      return false;
-    SDValue &Src = Ops[OperandIdx[i] - 1];
-    SDValue &Sel = Ops[SelIdx[i] - 1];
-    SDValue &Neg = Ops[NegIdx[i] - 1];
-    SDValue &Abs = Ops[AbsIdx[i] - 1];
-    if (FoldOperand(Src, Sel, Neg, Abs, TII))
-      return true;
-  }
-  return false;
-}
 
 bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
   if (!ptr) {
@@ -804,26 +559,27 @@ bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) {
 }
 
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
-
-  if (Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-    return;
-  }
-
-  // Go over all selected nodes and try to fold them a bit more
   const AMDGPUTargetLowering& Lowering =
     (*(const AMDGPUTargetLowering*)getTargetLowering());
-  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
-       E = CurDAG->allnodes_end(); I != E; ++I) {
+  bool IsModified = false;
+  do {
+    IsModified = false;
+    // Go over all selected nodes and try to fold them a bit more
+    for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+         E = CurDAG->allnodes_end(); I != E; ++I) {
 
-    SDNode *Node = I;
+      SDNode *Node = I;
 
-    MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
-    if (!MachineNode)
-      continue;
+      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+      if (!MachineNode)
+        continue;
 
-    SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
-    if (ResNode != Node) {
-      ReplaceUses(Node, ResNode);
+      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
+      if (ResNode != Node) {
+        ReplaceUses(Node, ResNode);
+        IsModified = true;
+      }
     }
-  }
+    CurDAG->RemoveDeadNodes();
+  } while (IsModified);
 }
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index efd2756..c4d75ff 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -15,6 +15,7 @@
 
 #include "AMDGPUISelLowering.h"
 #include "AMDGPU.h"
+#include "AMDGPUFrameLowering.h"
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDILIntrinsicInfo.h"
@@ -28,6 +29,14 @@
 #include "llvm/IR/DataLayout.h"
 
 using namespace llvm;
+static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
+                      CCValAssign::LocInfo LocInfo,
+                      ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() / 8, ArgFlags.getOrigAlign());
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+
+  return true;
+}
 
 #include "AMDGPUGenCallingConv.inc"
 
@@ -49,6 +58,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::FABS,   MVT::f32, Legal);
   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
+  setOperationAction(ISD::FROUND, MVT::f32, Legal);
 
   // The hardware supports ROTR, but not ROTL
   setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -64,9 +74,29 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
+
+  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+
   setOperationAction(ISD::STORE, MVT::f64, Promote);
   AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
 
+  // Custom lowering of vector stores is required for local address space
+  // stores.
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  // XXX: Native v2i32 local address space stores are possible, but not
+  // currently implemented.
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+
+  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+  // XXX: This can be change to Custom, once ExpandVectorStores can
+  // handle 64-bit stores.
+  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 
@@ -76,15 +106,38 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+
+  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
+
   setOperationAction(ISD::LOAD, MVT::f64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
 
-  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Expand);
-  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Expand);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
 
   setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
   setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
 
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+
   setOperationAction(ISD::MUL, MVT::i64, Expand);
 
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
@@ -93,14 +146,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
   setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
 
-  static const int types[] = {
-    (int)MVT::v2i32,
-    (int)MVT::v4i32
+  static const MVT::SimpleValueType IntTypes[] = {
+    MVT::v2i32, MVT::v4i32
   };
-  const size_t NumTypes = array_lengthof(types);
+  const size_t NumIntTypes = array_lengthof(IntTypes);
 
-  for (unsigned int x  = 0; x < NumTypes; ++x) {
-    MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
+  for (unsigned int x  = 0; x < NumIntTypes; ++x) {
+    MVT::SimpleValueType VT = IntTypes[x];
     //Expand the following operations for the current type by default
     setOperationAction(ISD::ADD,  VT, Expand);
     setOperationAction(ISD::AND,  VT, Expand);
@@ -119,6 +171,23 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
     setOperationAction(ISD::VSELECT, VT, Expand);
     setOperationAction(ISD::XOR,  VT, Expand);
   }
+
+  static const MVT::SimpleValueType FloatTypes[] = {
+    MVT::v2f32, MVT::v4f32
+  };
+  const size_t NumFloatTypes = array_lengthof(FloatTypes);
+
+  for (unsigned int x = 0; x < NumFloatTypes; ++x) {
+    MVT::SimpleValueType VT = FloatTypes[x];
+    setOperationAction(ISD::FABS, VT, Expand);
+    setOperationAction(ISD::FADD, VT, Expand);
+    setOperationAction(ISD::FDIV, VT, Expand);
+    setOperationAction(ISD::FFLOOR, VT, Expand);
+    setOperationAction(ISD::FMUL, VT, Expand);
+    setOperationAction(ISD::FRINT, VT, Expand);
+    setOperationAction(ISD::FSQRT, VT, Expand);
+    setOperationAction(ISD::FSUB, VT, Expand);
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -129,6 +198,18 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const {
   return MVT::i32;
 }
 
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
+                                                   EVT CastTy) const {
+  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
+    return true;
+
+  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
+  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+
+  return ((LScalarSize <= CastScalarSize) ||
+          (CastScalarSize >= 32) ||
+          (LScalarSize < 32));
+}
 
 //===---------------------------------------------------------------------===//
 // Target Properties
@@ -182,8 +263,12 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   // AMDGPU DAG lowering
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
   }
   return Op;
 }
@@ -194,18 +279,82 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
 
   const DataLayout *TD = getTargetMachine().getDataLayout();
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+
+  assert(G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS);
   // XXX: What does the value of G->getOffset() mean?
   assert(G->getOffset() == 0 &&
          "Do not know what to do with an non-zero offset");
 
-  unsigned Offset = MFI->LDSSize;
   const GlobalValue *GV = G->getGlobal();
-  uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
 
-  // XXX: Account for alignment?
-  MFI->LDSSize += Size;
+  unsigned Offset;
+  if (MFI->LocalMemoryObjects.count(GV) == 0) {
+    uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+    Offset = MFI->LDSSize;
+    MFI->LocalMemoryObjects[GV] = Offset;
+    // XXX: Account for alignment?
+    MFI->LDSSize += Size;
+  } else {
+    Offset = MFI->LocalMemoryObjects[GV];
+  }
+
+  return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+}
+
+void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
+                                         SmallVectorImpl<SDValue> &Args,
+                                         unsigned Start,
+                                         unsigned Count) const {
+  EVT VT = Op.getValueType();
+  for (unsigned i = Start, e = Start + Count; i != e; ++i) {
+    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+                               VT.getVectorElementType(),
+                               Op, DAG.getConstant(i, MVT::i32)));
+  }
+}
+
+SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SmallVector<SDValue, 8> Args;
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  ExtractVectorElements(A, DAG, Args, 0,
+                        A.getValueType().getVectorNumElements());
+  ExtractVectorElements(B, DAG, Args, 0,
+                        B.getValueType().getVectorNumElements());
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
+                     &Args[0], Args.size());
+}
+
+SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+
+  SmallVector<SDValue, 8> Args;
+  EVT VT = Op.getValueType();
+  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  ExtractVectorElements(Op.getOperand(0), DAG, Args, Start,
+                        VT.getVectorNumElements());
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
+                     &Args[0], Args.size());
+}
+
+SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
+                                              SelectionDAG &DAG) const {
 
-  return DAG.getConstant(Offset, TD->getPointerSize() == 8 ? MVT::i64 : MVT::i32);
+  MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL =
+   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+
+  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
+  assert(FIN);
+
+  unsigned FrameIndex = FIN->getIndex();
+  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
+  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
+                         Op.getValueType());
 }
 
 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
@@ -335,7 +484,122 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
   return Op;
 }
 
+SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
+                                              SelectionDAG &DAG) const {
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+  EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
+  EVT EltVT = Op.getValueType().getVectorElementType();
+  EVT PtrVT = Load->getBasePtr().getValueType();
+  unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
+  SmallVector<SDValue, 8> Loads;
+  SDLoc SL(Op);
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
+                    DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
+    Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
+                        Load->getChain(), Ptr,
+                        MachinePointerInfo(Load->getMemOperand()->getValue()),
+                        MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
+                        Load->getAlignment()));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), &Loads[0],
+                     Loads.size());
+}
+
+SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
+  EVT MemVT = Store->getMemoryVT();
+  unsigned MemBits = MemVT.getSizeInBits();
+
+  // Byte stores are really expensive, so if possible, try to pack
+  // 32-bit vector truncatating store into an i32 store.
+  // XXX: We could also handle optimize other vector bitwidths
+  if (!MemVT.isVector() || MemBits > 32) {
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  const SDValue &Value = Store->getValue();
+  EVT VT = Value.getValueType();
+  const SDValue &Ptr = Store->getBasePtr();
+  EVT MemEltVT = MemVT.getVectorElementType();
+  unsigned MemEltBits = MemEltVT.getSizeInBits();
+  unsigned MemNumElements = MemVT.getVectorNumElements();
+  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+  SDValue Mask;
+  switch(MemEltBits) {
+  case 8:
+    Mask = DAG.getConstant(0xFF, PackedVT);
+    break;
+  case 16:
+    Mask = DAG.getConstant(0xFFFF, PackedVT);
+    break;
+  default:
+    llvm_unreachable("Cannot lower this vector store");
+  }
+  SDValue PackedValue;
+  for (unsigned i = 0; i < MemNumElements; ++i) {
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
+                              DAG.getConstant(i, MVT::i32));
+    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
+    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
+    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
+    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
+    if (i == 0) {
+      PackedValue = Elt;
+    } else {
+      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
+    }
+  }
+  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
+                      MachinePointerInfo(Store->getMemOperand()->getValue()),
+                      Store->isVolatile(),  Store->isNonTemporal(),
+                      Store->getAlignment());
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
+  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
+  EVT PtrVT = Store->getBasePtr().getValueType();
+  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
+  SDLoc SL(Op);
+
+  SmallVector<SDValue, 8> Chains;
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+                              Store->getValue(), DAG.getConstant(i, MVT::i32));
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
+                              Store->getBasePtr(),
+                            DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
+                                            PtrVT));
+    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
+                         MachinePointerInfo(Store->getMemOperand()->getValue()),
+                         MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
+                         Store->getAlignment()));
+  }
+  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
+}
 
+SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
+  if (Result.getNode()) {
+    return Result;
+  }
+
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+       Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+      Store->getValue().getValueType().isVector()) {
+    return SplitVectorStore(Op, DAG);
+  }
+  return SDValue();
+}
 
 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
     SelectionDAG &DAG) const {
@@ -392,13 +656,13 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
                                                  DAG.getConstant(-1, VT),
                                                  DAG.getConstant(0, VT),
-                                                 ISD::SETGE);
-  // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0)
-  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder,
-                                                  DAG.getConstant(0, VT),
+                                                 ISD::SETUGE);
+  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
+                                                  Num_S_Remainder,
                                                   DAG.getConstant(-1, VT),
                                                   DAG.getConstant(0, VT),
-                                                  ISD::SETGE);
+                                                  ISD::SETUGE);
   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
                                                Remainder_GE_Zero);
@@ -442,10 +706,62 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
+SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue S0 = Op.getOperand(0);
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
+    return SDValue();
+
+  // f32 uint_to_fp i64
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
+                           DAG.getConstant(0, MVT::i32));
+  SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
+                           DAG.getConstant(1, MVT::i32));
+  SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
+  FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
+                        DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
+  return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
+
+}
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//
 
+void AMDGPUTargetLowering::getOriginalFunctionArgs(
+                               SelectionDAG &DAG,
+                               const Function *F,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               SmallVectorImpl<ISD::InputArg> &OrigIns) const {
+
+  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
+    if (Ins[i].ArgVT == Ins[i].VT) {
+      OrigIns.push_back(Ins[i]);
+      continue;
+    }
+
+    EVT VT;
+    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
+      // Vector has been split into scalars.
+      VT = Ins[i].ArgVT.getVectorElementType();
+    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
+               Ins[i].ArgVT.getVectorElementType() !=
+               Ins[i].VT.getVectorElementType()) {
+      // Vector elements have been promoted
+      VT = Ins[i].ArgVT;
+    } else {
+      // Vector has been spilt into smaller vectors.
+      VT = Ins[i].VT;
+    }
+
+    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
+                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
+    OrigIns.push_back(Arg);
+  }
+}
+
 bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
     return CFP->isExactlyValue(1.0);
@@ -507,5 +823,13 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CONST_ADDRESS)
   NODE_NAME_CASE(REGISTER_LOAD)
   NODE_NAME_CASE(REGISTER_STORE)
+  NODE_NAME_CASE(LOAD_CONSTANT)
+  NODE_NAME_CASE(LOAD_INPUT)
+  NODE_NAME_CASE(SAMPLE)
+  NODE_NAME_CASE(SAMPLEB)
+  NODE_NAME_CASE(SAMPLED)
+  NODE_NAME_CASE(SAMPLEL)
+  NODE_NAME_CASE(STORE_MSKOR)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   }
 }
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index f614e23..2dfd3cf 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -25,8 +25,20 @@ class MachineRegisterInfo;
 
 class AMDGPUTargetLowering : public TargetLowering {
 private:
+  void ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &Args,
+                             unsigned Start, unsigned Count) const;
+  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  /// \brief Lower vector stores by merging the vector elements into an integer
+  /// of the same bitwidth.
+  SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
+  /// \brief Split a vector store into multiple scalar stores.
+  /// \returns The resulting chain. 
   SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
 
 protected:
 
@@ -39,10 +51,23 @@ protected:
                                        unsigned Reg, EVT VT) const;
   SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                              SelectionDAG &DAG) const;
-
+  /// \brief Split a vector load into multiple scalar loads.
+  SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
+  SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
 
+  /// The SelectionDAGBuilder will automatically promote function arguments
+  /// with illegal types.  However, this does not work for the AMDGPU targets
+  /// since the function arguments are stored in memory as these illegal types.
+  /// In order to handle this properly we need to get the origianl types sizes
+  /// from the LLVM IR Function and fixup the ISD:InputArg values before
+  /// passing them to AnalyzeFormalArguments()
+  void getOriginalFunctionArgs(SelectionDAG &DAG,
+                               const Function *F,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               SmallVectorImpl<ISD::InputArg> &OrigIns) const;
   void AnalyzeFormalArguments(CCState &State,
                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
 
@@ -52,6 +77,7 @@ public:
   virtual bool isFAbsFree(EVT VT) const;
   virtual bool isFNegFree(EVT VT) const;
   virtual MVT getVectorIdxTy() const;
+  virtual bool isLoadBitCastBeneficial(EVT, EVT) const LLVM_OVERRIDE;
   virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -139,6 +165,15 @@ enum {
   CONST_ADDRESS,
   REGISTER_LOAD,
   REGISTER_STORE,
+  LOAD_INPUT,
+  SAMPLE,
+  SAMPLEB,
+  SAMPLED,
+  SAMPLEL,
+  FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  STORE_MSKOR,
+  LOAD_CONSTANT,
+  TBUFFER_STORE_FORMAT,
   LAST_AMDGPU_ISD_NUMBER
 };
 
diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
deleted file mode 100644
index 3ce3ecf..0000000
--- a/lib/Target/R600/AMDGPUIndirectAddressing.cpp
+++ /dev/null
@@ -1,345 +0,0 @@
-//===-- AMDGPUIndirectAddressing.cpp - Indirect Adressing Support ---------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// Instructions can use indirect addressing to index the register file as if it
-/// were memory.  This pass lowers RegisterLoad and RegisterStore instructions
-/// to either a COPY or a MOV that uses indirect addressing.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-namespace {
-
-class AMDGPUIndirectAddressingPass : public MachineFunctionPass {
-
-private:
-  static char ID;
-  const AMDGPUInstrInfo *TII;
-
-  bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const;
-
-public:
-  AMDGPUIndirectAddressingPass(TargetMachine &tm) :
-    MachineFunctionPass(ID),
-    TII(0)
-    { }
-
-  virtual bool runOnMachineFunction(MachineFunction &MF);
-
-  const char *getPassName() const { return "R600 Handle indirect addressing"; }
-
-};
-
-} // End anonymous namespace
-
-char AMDGPUIndirectAddressingPass::ID = 0;
-
-FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) {
-  return new AMDGPUIndirectAddressingPass(tm);
-}
-
-bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  TII = static_cast<const AMDGPUInstrInfo*>(MF.getTarget().getInstrInfo());
-
-  int IndirectBegin = TII->getIndirectIndexBegin(MF);
-  int IndirectEnd = TII->getIndirectIndexEnd(MF);
-
-  if (IndirectBegin == -1) {
-    // No indirect addressing, we can skip this pass
-    assert(IndirectEnd == -1);
-    return false;
-  }
-
-  // The map keeps track of the indirect address that is represented by
-  // each virtual register. The key is the register and the value is the
-  // indirect address it uses.
-  std::map<unsigned, unsigned> RegisterAddressMap;
-
-  // First pass - Lower all of the RegisterStore instructions and track which
-  // registers are live.
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                      BB != BB_E; ++BB) {
-    // This map keeps track of the current live indirect registers.
-    // The key is the address and the value is the register
-    std::map<unsigned, unsigned> LiveAddressRegisterMap;
-    MachineBasicBlock &MBB = *BB;
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
-                               I != MBB.end(); I = Next) {
-      Next = llvm::next(I);
-      MachineInstr &MI = *I;
-
-      if (!TII->isRegisterStore(MI)) {
-        continue;
-      }
-
-      // Lower RegisterStore
-
-      unsigned RegIndex = MI.getOperand(2).getImm();
-      unsigned Channel = MI.getOperand(3).getImm();
-      unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
-      const TargetRegisterClass *IndirectStoreRegClass =
-                   TII->getIndirectAddrStoreRegClass(MI.getOperand(0).getReg());
-
-      if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
-        // Direct register access.
-        unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
-
-        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg)
-                .addOperand(MI.getOperand(0));
-
-        RegisterAddressMap[DstReg] = Address;
-        LiveAddressRegisterMap[Address] = DstReg;
-      } else {
-        // Indirect register access.
-        MachineInstrBuilder MOV = TII->buildIndirectWrite(BB, I,
-                                           MI.getOperand(0).getReg(), // Value
-                                           Address,
-                                           MI.getOperand(1).getReg()); // Offset
-        for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
-          unsigned Addr = TII->calculateIndirectAddress(i, Channel);
-          unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
-          MOV.addReg(DstReg, RegState::Define | RegState::Implicit);
-          RegisterAddressMap[DstReg] = Addr;
-          LiveAddressRegisterMap[Addr] = DstReg;
-        }
-      }
-      MI.eraseFromParent();
-    }
-
-    // Update the live-ins of the succesor blocks
-    for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(),
-                                          SuccEnd = MBB.succ_end();
-                                          SuccEnd != Succ; ++Succ) {
-      std::map<unsigned, unsigned>::const_iterator Key, KeyEnd;
-      for (Key = LiveAddressRegisterMap.begin(),
-           KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) {
-        (*Succ)->addLiveIn(Key->second);
-      }
-    }
-  }
-
-  // Second pass - Lower the RegisterLoad instructions
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                      BB != BB_E; ++BB) {
-    // Key is the address and the value is the register
-    std::map<unsigned, unsigned> LiveAddressRegisterMap;
-    MachineBasicBlock &MBB = *BB;
-
-    MachineBasicBlock::livein_iterator LI = MBB.livein_begin();
-    while (LI != MBB.livein_end()) {
-      std::vector<unsigned> PhiRegisters;
-
-      // Make sure this live in is used for indirect addressing
-      if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) {
-        ++LI;
-        continue;
-      }
-
-      unsigned Address = RegisterAddressMap[*LI];
-      LiveAddressRegisterMap[Address] = *LI;
-      PhiRegisters.push_back(*LI);
-
-      // Check if there are other live in registers which map to the same
-      // indirect address.
-      for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI),
-                                              LE = MBB.livein_end();
-                                              LJ != LE; ++LJ) {
-        unsigned Reg = *LJ;
-        if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) {
-          continue;
-        }
-
-        if (RegisterAddressMap[Reg] == Address) {
-          PhiRegisters.push_back(Reg);
-        }
-      }
-
-      if (PhiRegisters.size() == 1) {
-        // We don't need to insert a Phi instruction, so we can just add the
-        // registers to the live list for the block.
-        LiveAddressRegisterMap[Address] = *LI;
-        MBB.removeLiveIn(*LI);
-      } else {
-        // We need to insert a PHI, because we have the same address being
-        // written in multiple predecessor blocks.
-        const TargetRegisterClass *PhiDstClass =
-                   TII->getIndirectAddrStoreRegClass(*(PhiRegisters.begin()));
-        unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass);
-        MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(),
-                                          MBB.findDebugLoc(MBB.begin()),
-                                          TII->get(AMDGPU::PHI), PhiDstReg);
-
-        for (std::vector<unsigned>::const_iterator RI = PhiRegisters.begin(),
-                                                   RE = PhiRegisters.end();
-                                                   RI != RE; ++RI) {
-          unsigned Reg = *RI;
-          MachineInstr *DefInst = MRI.getVRegDef(Reg);
-          assert(DefInst);
-          MachineBasicBlock *RegBlock = DefInst->getParent();
-          Phi.addReg(Reg);
-          Phi.addMBB(RegBlock);
-          MBB.removeLiveIn(Reg);
-        }
-        RegisterAddressMap[PhiDstReg] = Address;
-        LiveAddressRegisterMap[Address] = PhiDstReg;
-      }
-      LI = MBB.livein_begin();
-    }
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
-                               I != MBB.end(); I = Next) {
-      Next = llvm::next(I);
-      MachineInstr &MI = *I;
-
-      if (!TII->isRegisterLoad(MI)) {
-        if (MI.getOpcode() == AMDGPU::PHI) {
-          continue;
-        }
-        // Check for indirect register defs
-        for (unsigned OpIdx = 0, NumOperands = MI.getNumOperands();
-                                 OpIdx < NumOperands; ++OpIdx) {
-          MachineOperand &MO = MI.getOperand(OpIdx);
-          if (MO.isReg() && MO.isDef() &&
-              RegisterAddressMap.find(MO.getReg()) != RegisterAddressMap.end()) {
-            unsigned Reg = MO.getReg();
-            unsigned LiveAddress = RegisterAddressMap[Reg];
-            // Chain the live-ins
-            if (LiveAddressRegisterMap.find(LiveAddress) !=
-                LiveAddressRegisterMap.end()) {
-              MI.addOperand(MachineOperand::CreateReg(
-                                  LiveAddressRegisterMap[LiveAddress],
-                                  false, // isDef
-                                  true,  // isImp
-                                  true));  // isKill
-            }
-            LiveAddressRegisterMap[LiveAddress] = Reg;
-          }
-        }
-        continue;
-      }
-
-      const TargetRegisterClass *SuperIndirectRegClass =
-                                                TII->getSuperIndirectRegClass();
-      const TargetRegisterClass *IndirectLoadRegClass =
-                                             TII->getIndirectAddrLoadRegClass();
-      unsigned IndirectReg = MRI.createVirtualRegister(SuperIndirectRegClass);
-
-      unsigned RegIndex = MI.getOperand(2).getImm();
-      unsigned Channel = MI.getOperand(3).getImm();
-      unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
-
-      if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
-        // Direct register access
-        unsigned Reg = LiveAddressRegisterMap[Address];
-        unsigned AddrReg = IndirectLoadRegClass->getRegister(Address);
-
-        if (regHasExplicitDef(MRI, Reg)) {
-          // If the register we are reading from has an explicit def, then that
-          // means it was written via a direct register access (i.e. COPY
-          // or other instruction that doesn't use indirect addressing).  In
-          // this case we know where the value has been stored, so we can just
-          // issue a copy.
-          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
-                  MI.getOperand(0).getReg())
-                  .addReg(Reg);
-        } else {
-          // If the register we are reading has an implicit def, then that
-          // means it was written by an indirect register access (i.e. An
-          // instruction that uses indirect addressing. 
-          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
-                   MI.getOperand(0).getReg())
-                   .addReg(AddrReg)
-                   .addReg(Reg, RegState::Implicit);
-        }
-      } else {
-        // Indirect register access
-
-        // Note on REQ_SEQUENCE instructons: You can't actually use the register
-        // it defines unless  you have an instruction that takes the defined
-        // register class as an operand.
-
-        MachineInstrBuilder Sequence = BuildMI(MBB, I, MBB.findDebugLoc(I),
-                                               TII->get(AMDGPU::REG_SEQUENCE),
-                                               IndirectReg);
-        for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
-          unsigned Addr = TII->calculateIndirectAddress(i, Channel);
-          if (LiveAddressRegisterMap.find(Addr) == LiveAddressRegisterMap.end()) {
-            continue;
-          }
-          unsigned Reg = LiveAddressRegisterMap[Addr];
-
-          // We only need to use REG_SEQUENCE for explicit defs, since the
-          // register coalescer won't do anything with the implicit defs.
-          if (!regHasExplicitDef(MRI, Reg)) {
-            continue;
-          }
-
-          // Insert a REQ_SEQUENCE instruction to force the register allocator
-          // to allocate the virtual register to the correct physical register.
-          Sequence.addReg(LiveAddressRegisterMap[Addr]);
-          Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(Addr));
-        }
-        MachineInstrBuilder Mov = TII->buildIndirectRead(BB, I,
-                                           MI.getOperand(0).getReg(), // Value
-                                           Address,
-                                           MI.getOperand(1).getReg()); // Offset
-
-
-
-        Mov.addReg(IndirectReg, RegState::Implicit | RegState::Kill);
-        Mov.addReg(LiveAddressRegisterMap[Address], RegState::Implicit);
-
-      }
-      MI.eraseFromParent();
-    }
-  }
-  return false;
-}
-
-bool AMDGPUIndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI,
-                                                  unsigned Reg) const {
-  MachineInstr *DefInstr = MRI.getVRegDef(Reg);
-
-  if (!DefInstr) {
-    return false;
-  }
-
-  if (DefInstr->getOpcode() == AMDGPU::PHI) {
-    bool Explicit = false;
-    for (MachineInstr::const_mop_iterator I = DefInstr->operands_begin(),
-                                          E = DefInstr->operands_end();
-                                          I != E; ++I) {
-      const MachineOperand &MO = *I;
-      if (!MO.isReg() || MO.isDef()) {
-        continue;
-      }
-
-      Explicit = Explicit || regHasExplicitDef(MRI, MO.getReg());
-    }
-    return Explicit;
-  }
-
-  return DefInstr->getOperand(0).isReg() &&
-         DefInstr->getOperand(0).getReg() == Reg;
-}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index 61437e9..4f7084b 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -20,15 +20,19 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #define GET_INSTRINFO_NAMED_OPS
 #define GET_INSTRMAP_INFO
 #include "AMDGPUGenInstrInfo.inc"
 
 using namespace llvm;
 
+
+// Pin the vtable to this file.
+void AMDGPUInstrInfo::anchor() {}
+
 AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm)
-  : AMDGPUGenInstrInfo(0,0), RI(tm), TM(tm) { }
+  : AMDGPUGenInstrInfo(-1,-1), RI(tm), TM(tm) { }
 
 const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
   return RI;
@@ -118,6 +122,55 @@ AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   assert(!"Not Implemented");
 }
 
+bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+   int OffsetOpIdx =
+       AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::addr);
+   // addr is a custom operand with multiple MI operands, and only the
+   // first MI operand is given a name.
+  int RegOpIdx = OffsetOpIdx + 1;
+  int ChanOpIdx =
+      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::chan);
+
+  if (isRegisterLoad(*MI)) {
+    int DstOpIdx =
+        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+    unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
+    unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
+    unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+    unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
+    if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+      buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
+                    getIndirectAddrRegClass()->getRegister(Address));
+    } else {
+      buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
+                        Address, OffsetReg);
+    }
+  } else if (isRegisterStore(*MI)) {
+    int ValOpIdx =
+        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::val);
+    AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+    unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
+    unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
+    unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+    unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
+    if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+      buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
+                    MI->getOperand(ValOpIdx).getReg());
+    } else {
+      buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(),
+                         calculateIndirectAddress(RegIndex, Channel),
+                         OffsetReg);
+    }
+  } else {
+    return false;
+  }
+
+  MBB->erase(MI);
+  return true;
+}
+
+
 MachineInstr *
 AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                       MachineInstr *MI,
@@ -223,6 +276,57 @@ bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
   return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
 }
 
+int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int Offset = -1;
+
+  if (MFI->getNumObjects() == 0) {
+    return -1;
+  }
+
+  if (MRI.livein_empty()) {
+    return 0;
+  }
+
+  const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
+  for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+                                            LE = MRI.livein_end();
+                                            LI != LE; ++LI) {
+    unsigned Reg = LI->first;
+    if (TargetRegisterInfo::isVirtualRegister(Reg) ||
+        !IndirectRC->contains(Reg))
+      continue;
+
+    unsigned RegIndex;
+    unsigned RegEnd;
+    for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
+                                                          ++RegIndex) {
+      if (IndirectRC->getRegister(RegIndex) == Reg)
+        break;
+    }
+    Offset = std::max(Offset, (int)RegIndex);
+  }
+
+  return Offset + 1;
+}
+
+int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+  int Offset = 0;
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Variable sized objects are not supported
+  assert(!MFI->hasVarSizedObjects());
+
+  if (MFI->getNumObjects() == 0) {
+    return -1;
+  }
+
+  Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
+
+  return getIndirectIndexBegin(MF) + Offset;
+}
+
 
 void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
     DebugLoc DL) const {
@@ -244,3 +348,12 @@ void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
     }
   }
 }
+
+int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
+  switch (Channels) {
+  default: return Opcode;
+  case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
+  case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
+  case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
+  }
+}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index 306f467..ce5b58c 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -43,6 +43,7 @@ private:
   const AMDGPURegisterInfo RI;
   bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
                           MachineBasicBlock &MBB) const;
+  virtual void anchor();
 protected:
   TargetMachine &TM;
 public:
@@ -87,6 +88,8 @@ public:
                             unsigned DestReg, int FrameIndex,
                             const TargetRegisterClass *RC,
                             const TargetRegisterInfo *TRI) const;
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
 
 protected:
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
@@ -97,6 +100,14 @@ protected:
                                       MachineInstr *MI,
                                       const SmallVectorImpl<unsigned> &Ops,
                                       MachineInstr *LoadMI) const;
+  /// \returns the smallest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
+
+  /// \returns the largest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+
 public:
   bool canFoldMemoryOperand(const MachineInstr *MI,
                             const SmallVectorImpl<unsigned> &Ops) const;
@@ -139,19 +150,9 @@ public:
 // Pure virtual funtions to be implemented by sub-classes.
 //===---------------------------------------------------------------------===//
 
-  virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg,
-                                       int64_t Imm) const = 0;
   virtual unsigned getIEQOpcode() const = 0;
   virtual bool isMov(unsigned opcode) const = 0;
 
-  /// \returns the smallest register index that will be accessed by an indirect
-  /// read or write or -1 if indirect addressing is not used by this program.
-  virtual int getIndirectIndexBegin(const MachineFunction &MF) const = 0;
-
-  /// \returns the largest register index that will be accessed by an indirect
-  /// read or write or -1 if indirect addressing is not used by this program.
-  virtual int getIndirectIndexEnd(const MachineFunction &MF) const = 0;
-
   /// \brief Calculate the "Indirect Address" for the given \p RegIndex and
   ///        \p Channel
   ///
@@ -162,14 +163,9 @@ public:
   virtual unsigned calculateIndirectAddress(unsigned RegIndex,
                                             unsigned Channel) const = 0;
 
-  /// \returns The register class to be used for storing values to an
-  /// "Indirect Address" .
-  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
-                                                  unsigned SourceReg) const = 0;
-
-  /// \returns The register class to be used for loading values from
-  /// an "Indirect Address" .
-  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const = 0;
+  /// \returns The register class to be used for loading and storing values
+  /// from an "Indirect Address" .
+  virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0;
 
   /// \brief Build instruction(s) for an indirect register write.
   ///
@@ -187,16 +183,21 @@ public:
                                     unsigned ValueReg, unsigned Address,
                                     unsigned OffsetReg) const = 0;
 
-  /// \returns the register class whose sub registers are the set of all
-  /// possible registers that can be used for indirect addressing.
-  virtual const TargetRegisterClass *getSuperIndirectRegClass() const = 0;
-
 
   /// \brief Convert the AMDIL MachineInstr to a supported ISA
   /// MachineInstr
   virtual void convertToISA(MachineInstr & MI, MachineFunction &MF,
     DebugLoc DL) const;
 
+  /// \brief Build a MOV instruction.
+  virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned DstReg, unsigned SrcReg) const = 0;
+
+  /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
+  /// equivalent opcode that writes \p Channels Channels.
+  int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
+
 };
 
 namespace AMDGPU {
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index 48d89dd..fccede0 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -72,3 +72,17 @@ def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
 def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE",
                            SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
                            [SDNPHasChain, SDNPMayStore]>;
+
+// MSKOR instructions are atomic memory instructions used mainly for storing
+// 8-bit and 16-bit values.  The definition is:
+//
+// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src)
+//
+// src0: vec4(src, 0, 0, mask)
+// src1: dst - rat offset (aka pointer) in dwords  
+def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
+                        SDTypeProfile<0, 2, []>,
+                        [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def AMDGPUround : SDNode<"ISD::FROUND",
+                         SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index d6a7759..3c5375d 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -35,46 +35,75 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
 }
 
 def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
 
-def COND_EQ : PatLeaf <
+//===----------------------------------------------------------------------===//
+// PatLeafs for floating-point comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_OEQ : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETOEQ: case ISD::SETUEQ:
-                     case ISD::SETEQ: return true;}}}]
+  [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
 >;
 
-def COND_NE : PatLeaf <
+def COND_OGT : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
+>;
+
+def COND_OGE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}]
+>;
+
+def COND_OLT : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETONE: case ISD::SETUNE:
-                     case ISD::SETNE: return true;}}}]
+  [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}]
 >;
-def COND_GT : PatLeaf <
+
+def COND_OLE : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETOGT: case ISD::SETUGT:
-                     case ISD::SETGT: return true;}}}]
+  [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
 >;
 
-def COND_GE : PatLeaf <
+def COND_UNE : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETOGE: case ISD::SETUGE:
-                     case ISD::SETGE: return true;}}}]
+  [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
 >;
 
-def COND_LT : PatLeaf <
+def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
+def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for unsigned comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
+def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
+def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
+def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for signed comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>;
+def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>;
+def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>;
+def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for integer equality
+//===----------------------------------------------------------------------===//
+
+def COND_EQ : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETOLT: case ISD::SETULT:
-                     case ISD::SETLT: return true;}}}]
+  [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}]
 >;
 
-def COND_LE : PatLeaf <
+def COND_NE : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETOLE: case ISD::SETULE:
-                     case ISD::SETLE: return true;}}}]
+  [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}]
 >;
 
 def COND_NULL : PatLeaf <
@@ -96,6 +125,10 @@ def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
   return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
 }]>;
 
+def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
 def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
     return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
@@ -108,8 +141,12 @@ def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
     return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
 }]>;
 
-def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
-    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
 def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
@@ -132,6 +169,14 @@ def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
     return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
 }]>;
 
+def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
 def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
   return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
 }]>;
@@ -146,20 +191,55 @@ def az_extloadi32_constant : PatFrag<(ops node:$ptr),
   return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
 }]>;
 
-def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei8 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei16 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
 }]>;
 
 def local_store : PatFrag<(ops node:$val, node:$ptr),
                              (store node:$val, node:$ptr), [{
-    return isLocalStore(dyn_cast<StoreSDNode>(N));
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei8 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei16 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def atomic_load_add_local : PatFrag<(ops node:$ptr, node:$value),
+                                    (atomic_load_add node:$ptr, node:$value), [{
+  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def atomic_load_sub_local : PatFrag<(ops node:$ptr, node:$value),
+                                    (atomic_load_sub node:$ptr, node:$value), [{
+  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def mskor_global : PatFrag<(ops node:$val, node:$ptr),
+                            (AMDGPUstore_mskor node:$val, node:$ptr), [{
+  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
 }]>;
 
 class Constants {
 int TWO_PI = 0x40c90fdb;
 int PI = 0x40490fdb;
 int TWO_PI_INV = 0x3e22f983;
-int FP_UINT_MAX_PLUS_1 = 0x4f800000;	// 1 << 32 in floating point encoding
+int FP_UINT_MAX_PLUS_1 = 0x4f800000;    // 1 << 32 in floating point encoding
 }
 def CONST : Constants;
 
@@ -205,6 +285,8 @@ class FNEG <RegisterClass rc> : AMDGPUShaderInst <
 
 multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
                     ComplexPattern addrPat> {
+let UseNamedOperandTable = 1 in {
+
   def RegisterLoad : AMDGPUShaderInst <
     (outs dstClass:$dst),
     (ins addrClass:$addr, i32imm:$chan),
@@ -223,6 +305,7 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
     let isRegisterStore = 1;
   }
 }
+}
 
 } // End isCodeGenOnly = 1, isPseudo = 1
 
@@ -254,61 +337,12 @@ class Insert_Element <ValueType elem_type, ValueType vec_type,
   (INSERT_SUBREG $vec, $elem, sub_reg)
 >;
 
-// Vector Build pattern
-class Vector1_Build <ValueType vecType, ValueType elemType,
-                     RegisterClass rc> : Pat <
-  (vecType (build_vector elemType:$src)),
-  (vecType (COPY_TO_REGCLASS $src, rc))
->;
-
-class Vector2_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1)),
-  (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1)
->;
-
 class Vector4_Build <ValueType vecType, ValueType elemType> : Pat <
   (vecType (build_vector elemType:$x, elemType:$y, elemType:$z, elemType:$w)),
   (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
     (vecType (IMPLICIT_DEF)), $x, sub0), $y, sub1), $z, sub2), $w, sub3)
 >;
 
-class Vector8_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1,
-                         elemType:$sub2, elemType:$sub3,
-                         elemType:$sub4, elemType:$sub5,
-                         elemType:$sub6, elemType:$sub7)),
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
-                              $sub2, sub2), $sub3, sub3),
-                              $sub4, sub4), $sub5, sub5),
-                              $sub6, sub6), $sub7, sub7)
->;
-
-class Vector16_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1,
-                         elemType:$sub2, elemType:$sub3,
-                         elemType:$sub4, elemType:$sub5,
-                         elemType:$sub6, elemType:$sub7,
-                         elemType:$sub8, elemType:$sub9,
-                         elemType:$sub10, elemType:$sub11,
-                         elemType:$sub12, elemType:$sub13,
-                         elemType:$sub14, elemType:$sub15)),
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
-                            $sub2, sub2), $sub3, sub3),
-                            $sub4, sub4), $sub5, sub5),
-                            $sub6, sub6), $sub7, sub7),
-                            $sub8, sub8), $sub9, sub9),
-                            $sub10, sub10), $sub11, sub11),
-                            $sub12, sub12), $sub13, sub13),
-                            $sub14, sub14), $sub15, sub15)
->;
-
 // XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
 // can handle COPY instructions.
 // bitconvert pattern
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index 1dc1c65..0ed598e 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -15,14 +15,19 @@
 
 #include "AMDGPUMCInstLower.h"
 #include "AMDGPUAsmPrinter.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
 #include "R600InstrInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include <algorithm>
 
 using namespace llvm;
 
@@ -69,15 +74,45 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     MachineBasicBlock::const_instr_iterator I = MI;
     ++I;
     while (I != MBB->end() && I->isInsideBundle()) {
-      MCInst MCBundleInst;
-      const MachineInstr *BundledInst = I;
-      MCInstLowering.lower(BundledInst, MCBundleInst);
-      OutStreamer.EmitInstruction(MCBundleInst);
+      EmitInstruction(I);
       ++I;
     }
   } else {
     MCInst TmpInst;
     MCInstLowering.lower(MI, TmpInst);
     OutStreamer.EmitInstruction(TmpInst);
+
+    if (DisasmEnabled) {
+      // Disassemble instruction/operands to text.
+      DisasmLines.resize(DisasmLines.size() + 1);
+      std::string &DisasmLine = DisasmLines.back();
+      raw_string_ostream DisasmStream(DisasmLine);
+
+      AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *TM.getInstrInfo(),
+                                    *TM.getRegisterInfo());
+      InstPrinter.printInst(&TmpInst, DisasmStream, StringRef());
+
+      // Disassemble instruction/operands to hex representation.
+      SmallVector<MCFixup, 4> Fixups;
+      SmallVector<char, 16> CodeBytes;
+      raw_svector_ostream CodeStream(CodeBytes);
+
+      MCObjectStreamer &ObjStreamer = (MCObjectStreamer &)OutStreamer;
+      MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
+      InstEmitter.EncodeInstruction(TmpInst, CodeStream, Fixups);
+      CodeStream.flush();
+
+      HexLines.resize(HexLines.size() + 1);
+      std::string &HexLine = HexLines.back();
+      raw_string_ostream HexStream(HexLine);
+
+      for (size_t i = 0; i < CodeBytes.size(); i += 4) {
+        unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i];
+        HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord);
+      }
+
+      DisasmStream.flush();
+      DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size());
+    }
   }
 }
diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp
index f2342b0..14171f4 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.cpp
+++ b/lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -6,6 +6,9 @@ using namespace llvm;
 
 static const char *const ShaderTypeAttribute = "ShaderType";
 
+// Pin the vtable to this file.
+void AMDGPUMachineFunction::anchor() {}
+
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
     MachineFunctionInfo() {
   ShaderType = ShaderType::COMPUTE;
diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h
index 789b96a..fea0b39 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.h
+++ b/lib/Target/R600/AMDGPUMachineFunction.h
@@ -14,13 +14,18 @@
 #define AMDGPUMACHINEFUNCTION_H
 
 #include "llvm/CodeGen/MachineFunction.h"
+#include <map>
 
 namespace llvm {
 
 class AMDGPUMachineFunction : public MachineFunctionInfo {
+  virtual void anchor();
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);
   unsigned ShaderType;
+  /// A map to keep track of local memory objects and their offsets within
+  /// the local memory space.
+  std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
   /// Number of bytes in the LDS that are being used.
   unsigned LDSSize;
 };
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 3402092..47617a7 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -46,27 +46,21 @@ unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return 0;
 }
 
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+  static const unsigned SubRegs[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
+    AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
+    AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
+    AMDGPU::sub15
+  };
+
+  assert (Channel < array_lengthof(SubRegs));
+  return SubRegs[Channel];
+}
+
 unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const {
 
-  switch(IndirectIndex) {
-  case 0: return AMDGPU::sub0;
-  case 1: return AMDGPU::sub1;
-  case 2: return AMDGPU::sub2;
-  case 3: return AMDGPU::sub3;
-  case 4: return AMDGPU::sub4;
-  case 5: return AMDGPU::sub5;
-  case 6: return AMDGPU::sub6;
-  case 7: return AMDGPU::sub7;
-  case 8: return AMDGPU::sub8;
-  case 9: return AMDGPU::sub9;
-  case 10: return AMDGPU::sub10;
-  case 11: return AMDGPU::sub11;
-  case 12: return AMDGPU::sub12;
-  case 13: return AMDGPU::sub13;
-  case 14: return AMDGPU::sub14;
-  case 15: return AMDGPU::sub15;
-  default: llvm_unreachable("indirect index out of range");
-  }
+  return getSubRegFromChannel(IndirectIndex);
 }
 
 #define GET_REGINFO_TARGET_DESC
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 7cbd34b..688e1a0 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -50,6 +50,14 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
     assert(!"Unimplemented"); return NULL;
   }
 
+  virtual unsigned getHWRegIndex(unsigned Reg) const {
+    assert(!"Unimplemented"); return 0;
+  }
+
+  /// \returns the sub reg enum value for the given \p Channel
+  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
+  unsigned getSubRegFromChannel(unsigned Channel) const;
+
   const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
   void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
                            unsigned FIOperandNum,
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 8ed5a74..061793a 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -13,7 +13,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUSubtarget.h"
-#include <stdio.h>
 
 using namespace llvm;
 
@@ -37,6 +36,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
   Gen = AMDGPUSubtarget::R600;
   FP64 = false;
   CaymanISA = false;
+  EnableIRStructurizer = true;
+  EnableIfCvt = true;
   ParseSubtargetFeatures(GPU, FS);
   DevName = GPU;
 }
@@ -66,6 +67,14 @@ AMDGPUSubtarget::hasCaymanISA() const {
   return CaymanISA;
 }
 bool
+AMDGPUSubtarget::IsIRStructurizerEnabled() const {
+  return EnableIRStructurizer;
+}
+bool
+AMDGPUSubtarget::isIfCvtEnabled() const {
+  return EnableIfCvt;
+}
+bool
 AMDGPUSubtarget::isTargetELF() const {
   return false;
 }
@@ -98,6 +107,10 @@ AMDGPUSubtarget::getDataLayout() const {
     DataLayout.append("-p:32:32:32");
   }
 
+  if (Gen >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    DataLayout.append("-p3:32:32:32");
+  }
+
   return DataLayout;
 }
 
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 8c65096..4288d27 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -33,7 +33,8 @@ public:
     R700,
     EVERGREEN,
     NORTHERN_ISLANDS,
-    SOUTHERN_ISLANDS
+    SOUTHERN_ISLANDS,
+    SEA_ISLANDS
   };
 
 private:
@@ -48,6 +49,8 @@ private:
   enum Generation Gen;
   bool FP64;
   bool CaymanISA;
+  bool EnableIRStructurizer;
+  bool EnableIfCvt;
 
   InstrItineraryData InstrItins;
 
@@ -63,6 +66,12 @@ public:
   enum Generation getGeneration() const;
   bool hasHWFP64() const;
   bool hasCaymanISA() const;
+  bool IsIRStructurizerEnabled() const;
+  bool isIfCvtEnabled() const;
+
+  virtual bool enableMachineScheduler() const {
+    return getGeneration() <= NORTHERN_ISLANDS;
+  }
 
   // Helper functions to simplify if statements
   bool isTargetELF() const;
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 5ebc5f2..bc4f5d7 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include <llvm/CodeGen/Passes.h>
 
+
 using namespace llvm;
 
 extern "C" void LLVMInitializeR600Target() {
@@ -58,8 +59,9 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
   LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
   Subtarget(TT, CPU, FS),
   Layout(Subtarget.getDataLayout()),
-  FrameLowering(TargetFrameLowering::StackGrowsUp, 16 // Stack Alignment
-                                                 , 0),
+  FrameLowering(TargetFrameLowering::StackGrowsUp,
+                64 * 16 // Maximum stack alignment (long16)
+               , 0),
   IntrinsicInfo(this),
   InstrItins(&Subtarget.getInstrItineraryData()) {
   // TLInfo uses InstrInfo so it must be initialized after.
@@ -80,17 +82,20 @@ namespace {
 class AMDGPUPassConfig : public TargetPassConfig {
 public:
   AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {
-    const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
-      enablePass(&MachineSchedulerID);
-      MachineSchedRegistry::setDefault(createR600MachineScheduler);
-    }
-  }
+    : TargetPassConfig(TM, PM) {}
 
   AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
     return getTM<AMDGPUTargetMachine>();
   }
+
+  virtual ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const {
+    const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+      return createR600MachineScheduler(C);
+    return 0;
+  }
+
   virtual bool addPreISel();
   virtual bool addInstSelector();
   virtual bool addPreRegAlloc();
@@ -120,8 +125,11 @@ bool
 AMDGPUPassConfig::addPreISel() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
   addPass(createFlattenCFGPass());
-  if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+  if (ST.IsIRStructurizerEnabled())
     addPass(createStructurizeCFGPass());
+  if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    addPass(createSinkingPass());
+    addPass(createSITypeRewriter());
     addPass(createSIAnnotateControlFlowPass());
   } else {
     addPass(createR600TextureIntrinsicsReplacer());
@@ -131,12 +139,6 @@ AMDGPUPassConfig::addPreISel() {
 
 bool AMDGPUPassConfig::addInstSelector() {
   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
-
-  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
-    // This callbacks this pass uses are not implemented yet on SI.
-    addPass(createAMDGPUIndirectAddressingPass(*TM));
-  }
   return false;
 }
 
@@ -164,10 +166,12 @@ bool AMDGPUPassConfig::addPostRegAlloc() {
 bool AMDGPUPassConfig::addPreSched2() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
 
-  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
     addPass(createR600EmitClauseMarkers(*TM));
-  }
-  addPass(&IfConverterID);
+  if (ST.isIfCvtEnabled())
+    addPass(&IfConverterID);
+  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+    addPass(createR600ClauseMergePass(*TM));
   return false;
 }
 
@@ -185,4 +189,3 @@ bool AMDGPUPassConfig::addPreEmitPass() {
 
   return false;
 }
-
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index 687eadb..507570f 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -1005,13 +1005,14 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
     return 0;
 
   assert(isCondBranch(BranchMI));
+  int NumMatch = 0;
 
   MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
-  serialPatternMatch(TrueMBB);
-  ifPatternMatch(TrueMBB);
+  NumMatch += serialPatternMatch(TrueMBB);
+  NumMatch += ifPatternMatch(TrueMBB);
   MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
-  serialPatternMatch(FalseMBB);
-  ifPatternMatch(FalseMBB);
+  NumMatch += serialPatternMatch(FalseMBB);
+  NumMatch += ifPatternMatch(FalseMBB);
   MachineBasicBlock *LandBlk;
   int Cloned = 0;
 
@@ -1040,7 +1041,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
     && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
     LandBlk = *TrueMBB->succ_begin();
   } else {
-    return handleJumpintoIf(MBB, TrueMBB, FalseMBB);
+    return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
   }
 
   // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
@@ -1068,7 +1069,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
 
   numClonedBlock += Cloned;
 
-  return 1 + Cloned;
+  return 1 + Cloned + NumMatch;
 }
 
 int AMDGPUCFGStructurizer::loopendPatternMatch() {
@@ -1233,7 +1234,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
 
       numClonedBlock += Num;
       Num += serialPatternMatch(*HeadMBB->succ_begin());
-      Num += serialPatternMatch(*(++HeadMBB->succ_begin()));
+      Num += serialPatternMatch(*llvm::next(HeadMBB->succ_begin()));
       Num += ifPatternMatch(HeadMBB);
       assert(Num > 0);
 
@@ -1335,32 +1336,77 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
   // add initReg = initVal to headBlk
 
   const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  unsigned InitReg =
-    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
-  if (!MigrateTrue || !MigrateFalse)
-    llvm_unreachable("Extra register needed to handle CFG");
+  if (!MigrateTrue || !MigrateFalse) {
+    // XXX: We have an opportunity here to optimize the "branch into if" case
+    // here.  Branch into if looks like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \           |
+    // diamond_false        diamond_true
+    //             \      /
+    //               done
+    //
+    // The diamond_head block begins the "if" and the diamond_true block
+    // is the block being "branched into".
+    //
+    // If MigrateTrue is true, then TrueBB is the block being "branched into"
+    // and if MigrateFalse is true, then FalseBB is the block being
+    // "branched into"
+    // 
+    // Here is the pseudo code for how I think the optimization should work:
+    // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
+    // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
+    // 3. Move the branch instruction from diamond_head into its own basic
+    //    block (new_block).
+    // 4. Add an unconditional branch from diamond_head to new_block
+    // 5. Replace the branch instruction in branch_from with an unconditional
+    //    branch to new_block.  If branch_from has multiple predecessors, then
+    //    we need to replace the True/False block in the branch
+    //    instruction instead of replacing it.
+    // 6. Change the condition of the branch instruction in new_block from
+    //    COND to (COND || GPR0)
+    //
+    // In order insert these MOV instruction, we will need to use the
+    // RegisterScavenger.  Usually liveness stops being tracked during
+    // the late machine optimization passes, however if we implement
+    // bool TargetRegisterInfo::requiresRegisterScavenging(
+    //                                                const MachineFunction &MF)
+    // and have it return true, liveness will be tracked correctly 
+    // by generic optimization passes.  We will also need to make sure that
+    // all of our target-specific passes that run after regalloc and before
+    // the CFGStructurizer track liveness and we will need to modify this pass
+    // to correctly track liveness.
+    //
+    // After the above changes, the new CFG should look like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //                       \     /
+    //                      new_block
+    //                      /      |
+    //         diamond_false        diamond_true
+    //                      \      /
+    //                        done
+    //
+    // Without this optimization, we are forced to duplicate the diamond_true
+    // block and we will end up with a CFG like this:
+    //
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \                   |
+    // diamond_false        diamond_true      diamond_true (duplicate)
+    //             \      /                   |
+    //               done --------------------|
+    //
+    // Duplicating diamond_true can be very costly especially if it has a
+    // lot of instructions.
+    return 0;
+  }
 
   int NumNewBlk = 0;
 
-  if (!LandBlk) {
-    LandBlk = HeadMBB->getParent()->CreateMachineBasicBlock();
-    HeadMBB->getParent()->push_back(LandBlk);  //insert to function
-
-    if (TrueMBB) {
-      TrueMBB->addSuccessor(LandBlk);
-    } else {
-      HeadMBB->addSuccessor(LandBlk);
-    }
-
-    if (FalseMBB) {
-      FalseMBB->addSuccessor(LandBlk);
-    } else {
-      HeadMBB->addSuccessor(LandBlk);
-    }
-
-    NumNewBlk ++;
-  }
-
   bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
 
   //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
@@ -1375,6 +1421,10 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
         CmpResReg, DebugLoc());
   }
 
+  // XXX: We are running this after RA, so creating virtual registers will
+  // cause an assertion failure in the PostRA scheduling pass.
+  unsigned InitReg =
+    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
   insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
       DebugLoc());
 
@@ -1713,7 +1763,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
   if (MBB->succ_size() != 2)
     return;
   MachineBasicBlock *MBB1 = *MBB->succ_begin();
-  MachineBasicBlock *MBB2 = *(++MBB->succ_begin());
+  MachineBasicBlock *MBB2 = *llvm::next(MBB->succ_begin());
   if (MBB1 != MBB2)
     return;
 
diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td
index f7d0bd5..0f0c88d 100644
--- a/lib/Target/R600/AMDILInstrInfo.td
+++ b/lib/Target/R600/AMDILInstrInfo.td
@@ -118,15 +118,15 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
 // Multiclass Instruction formats
 //===--------------------------------------------------------------------===//
 // Multiclass that handles branch instructions
-multiclass BranchConditional<SDNode Op> {
+multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
     def _i32 : ILFormat<(outs),
-  (ins brtarget:$target, GPRI32:$src0),
+  (ins brtarget:$target, rci:$src0),
         "; i32 Pseudo branch instruction",
-  [(Op bb:$target, GPRI32:$src0)]>;
+  [(Op bb:$target, (i32 rci:$src0))]>;
     def _f32 : ILFormat<(outs),
-  (ins brtarget:$target, GPRF32:$src0),
+  (ins brtarget:$target, rcf:$src0),
         "; f32 Pseudo branch instruction",
-  [(Op bb:$target, GPRF32:$src0)]>;
+  [(Op bb:$target, (f32 rcf:$src0))]>;
 }
 
 // Only scalar types should generate flow control
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index fde187b..9f8f6a8 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -17,7 +17,6 @@ add_llvm_target(R600CodeGen
   AMDILISelLowering.cpp
   AMDGPUAsmPrinter.cpp
   AMDGPUFrameLowering.cpp
-  AMDGPUIndirectAddressing.cpp
   AMDGPUISelDAGToDAG.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMachineFunction.cpp
@@ -28,6 +27,7 @@ add_llvm_target(R600CodeGen
   AMDGPUConvertToISA.cpp
   AMDGPUInstrInfo.cpp
   AMDGPURegisterInfo.cpp
+  R600ClauseMergePass.cpp
   R600ControlFlowFinalizer.cpp
   R600EmitClauseMarkers.cpp
   R600ExpandSpecialInstrs.cpp
@@ -47,6 +47,7 @@ add_llvm_target(R600CodeGen
   SILowerControlFlow.cpp
   SIMachineFunctionInfo.cpp
   SIRegisterInfo.cpp
+  SITypeRewriter.cpp
   )
 
 add_dependencies(LLVMR600CodeGen AMDGPUCommonTableGen intrinsics_gen)
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index fac3c39..99e1377 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -23,6 +23,63 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   printAnnotation(OS, Annot);
 }
 
+void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
+  switch (reg) {
+  case AMDGPU::VCC:
+    O << "vcc";
+    return;
+  case AMDGPU::SCC:
+    O << "scc";
+    return;
+  case AMDGPU::EXEC:
+    O << "exec";
+    return;
+  case AMDGPU::M0:
+    O << "m0";
+    return;
+  default:
+    break;
+  }
+
+  // It's seems there's no way to use SIRegisterInfo here, and dealing with the
+  // giant enum of all the different shifted sets of registers is pretty
+  // unmanagable, so parse the name and reformat it to be prettier.
+  StringRef Name(getRegisterName(reg));
+
+  std::pair<StringRef, StringRef> Split = Name.split('_');
+  StringRef SubRegName = Split.first;
+  StringRef Rest = Split.second;
+
+  if (SubRegName.size() <= 4) { // Must at least be as long as "SGPR"/"VGPR".
+    O << Name;
+    return;
+  }
+
+  unsigned RegIndex;
+  StringRef RegIndexStr = SubRegName.drop_front(4);
+
+  if (RegIndexStr.getAsInteger(10, RegIndex)) {
+    O << Name;
+    return;
+  }
+
+  if (SubRegName.front() == 'V')
+    O << 'v';
+  else if (SubRegName.front() == 'S')
+    O << 's';
+  else {
+    O << Name;
+    return;
+  }
+
+  if (Rest.empty()) // Only 1 32-bit register
+    O << RegIndex;
+  else {
+    unsigned NumReg = Rest.count('_') + 2;
+    O << '[' << RegIndex << ':' << (RegIndex + NumReg - 1) << ']';
+  }
+}
+
 void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                      raw_ostream &O) {
 
@@ -30,8 +87,12 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   if (Op.isReg()) {
     switch (Op.getReg()) {
     // This is the default predicate state, so we don't need to print it.
-    case AMDGPU::PRED_SEL_OFF: break;
-    default: O << getRegisterName(Op.getReg()); break;
+    case AMDGPU::PRED_SEL_OFF:
+      break;
+
+    default:
+      printRegOperand(Op.getReg(), O);
+      break;
     }
   } else if (Op.isImm()) {
     O << Op.getImm();
@@ -255,4 +316,21 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
   }
 }
 
+void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs
+  // SIInsertWaits.cpp bits usage does not match ISA docs description but it
+  // works so it might be a misprint in docs.
+  unsigned SImm16 = MI->getOperand(OpNo).getImm();
+  unsigned Vmcnt = SImm16 & 0xF;
+  unsigned Expcnt = (SImm16 >> 4) & 0xF;
+  unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;
+  if (Vmcnt != 0xF)
+    O << "vmcnt(" << Vmcnt << ") ";
+  if (Expcnt != 0x7)
+    O << "expcnt(" << Expcnt << ") ";
+  if (Lgkmcnt != 0x7)
+    O << "lgkmcnt(" << Lgkmcnt << ")";
+}
+
 #include "AMDGPUGenAsmWriter.inc"
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 4c1dfa6..77af942 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -32,6 +32,7 @@ public:
   virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
 
 private:
+  void printRegOperand(unsigned RegNo, raw_ostream &O);
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -52,6 +53,7 @@ private:
   void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index 9a36903..29d0acf 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -95,7 +95,9 @@ public:
 
 } // end anonymous namespace
 
-MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           StringRef TT,
                                            StringRef CPU) {
   return new ELFAMDGPUAsmBackend(T);
 }
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 59136f3..4a8e1b0 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -21,7 +21,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
   HasStaticCtorDtorReferenceInStaticMode = false;
   LinkerRequiresNonEmptyDwarfLines = true;
   MaxInstLength = 16;
-  PCSymbol = "$";
   SeparatorString = "\n";
   CommentColumn = 40;
   CommentString = ";";
@@ -32,9 +31,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
   InlineAsmStart = ";#ASMSTART";
   InlineAsmEnd = ";#ASMEND";
   AssemblerDialect = 0;
-  AllowQuotesInName = false;
-  AllowNameToStartWithDigit = false;
-  AllowPeriodsInName = false;
 
   //===--- Data Emission Directives -------------------------------------===//
   ZeroDirective = ".zero";
@@ -56,13 +52,11 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
 
   //===--- Global Variable Emission Directives --------------------------===//
   GlobalDirective = ".global";
-  ExternDirective = ".extern";
   HasSetDirective = false;
   HasAggressiveSymbolFolding = true;
   COMMDirectiveAlignmentIsInBytes = false;
   HasDotTypeDotSizeDirective = false;
   HasNoDeadStrip = true;
-  HasSymbolResolver = false;
   WeakRefDirective = ".weakref\t";
   LinkOnceDirective = 0;
   //===--- Dwarf Emission Directives -----------------------------------===//
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
new file mode 100644
index 0000000..521b3b3
--- /dev/null
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -0,0 +1,21 @@
+//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief CodeEmitter interface for R600 and SI codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCCodeEmitter.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+void AMDGPUMCCodeEmitter::anchor() {}
+
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
index cd3a7ce..d8cf64a 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -24,6 +24,7 @@ class MCInst;
 class MCOperand;
 
 class AMDGPUMCCodeEmitter : public MCCodeEmitter {
+  virtual void anchor();
 public:
 
   uint64_t getBinaryCodeForInstr(const MCInst &MI,
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 61d70bb..a1bec28 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -88,7 +88,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCCodeEmitter *_Emitter,
                                     bool RelaxAll,
                                     bool NoExecStack) {
-  return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false);
+  return createELFStreamer(Ctx, 0, MAB, _OS, _Emitter, false, false);
 }
 
 extern "C" void LLVMInitializeR600TargetMC() {
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
index abb0320..f6b3376 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -40,8 +40,8 @@ MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
                                      const MCSubtargetInfo &STI,
                                      MCContext &Ctx);
 
-MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT,
-                                     StringRef CPU);
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     StringRef TT, StringRef CPU);
 
 MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS);
 } // End llvm namespace
diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
index 3ccdf42..98f6925 100644
--- a/lib/Target/R600/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
@@ -2,6 +2,7 @@
 add_llvm_library(LLVMR600Desc
   AMDGPUAsmBackend.cpp
   AMDGPUELFObjectWriter.cpp
+  AMDGPUMCCodeEmitter.cpp
   AMDGPUMCTargetDesc.cpp
   AMDGPUMCAsmInfo.cpp
   R600MCCodeEmitter.cpp
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index f470783..dd8df65 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -24,7 +24,6 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/raw_ostream.h"
-#include <stdio.h>
 
 using namespace llvm;
 
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index 4631c04..ee190e4 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -48,6 +48,7 @@ def : Proc<"pitcairn",   SI_Itin, [FeatureSouthernIslands]>;
 def : Proc<"verde",      SI_Itin, [FeatureSouthernIslands]>;
 def : Proc<"oland",      SI_Itin, [FeatureSouthernIslands]>;
 def : Proc<"hainan",     SI_Itin, [FeatureSouthernIslands]>;
-def : Proc<"bonaire",    SI_Itin, [FeatureSouthernIslands]>;
-def : Proc<"kabini",     SI_Itin, [FeatureSouthernIslands]>;
-def : Proc<"kaveri",     SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"bonaire",    SI_Itin, [FeatureSeaIslands]>;
+def : Proc<"kabini",     SI_Itin, [FeatureSeaIslands]>;
+def : Proc<"kaveri",     SI_Itin, [FeatureSeaIslands]>;
+def : Proc<"hawaii",     SI_Itin, [FeatureSeaIslands]>;
diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp
new file mode 100644
index 0000000..33d2ca3
--- /dev/null
+++ b/lib/Target/R600/R600ClauseMergePass.cpp
@@ -0,0 +1,204 @@
+//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer.
+/// This pass is merging consecutive CFAlus where applicable.
+/// It needs to be called after IfCvt for best results.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "r600mergeclause"
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+static bool isCFAlu(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::CF_ALU:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+class R600ClauseMergePass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  unsigned getCFAluSize(const MachineInstr *MI) const;
+  bool isCFAluEnabled(const MachineInstr *MI) const;
+
+  /// IfCvt pass can generate "disabled" ALU clause marker that need to be
+  /// removed and their content affected to the previous alu clause.
+  /// This function parse instructions after CFAlu untill it find a disabled
+  /// CFAlu and merge the content, or an enabled CFAlu.
+  void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const;
+
+  /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if
+  /// it is the case.
+  bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu)
+      const;
+
+public:
+  R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const;
+};
+
+char R600ClauseMergePass::ID = 0;
+
+unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const {
+  assert(isCFAlu(MI));
+  return MI->getOperand(
+      TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm();
+}
+
+bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const {
+  assert(isCFAlu(MI));
+  return MI->getOperand(
+      TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm();
+}
+
+void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu)
+    const {
+  int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+  MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end();
+  I++;
+  do {
+    while (I!= E && !isCFAlu(I))
+      I++;
+    if (I == E)
+      return;
+    MachineInstr *MI = I++;
+    if (isCFAluEnabled(MI))
+      break;
+    CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI));
+    MI->eraseFromParent();
+  } while (I != E);
+}
+
+bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
+                                          const MachineInstr *LatrCFAlu) const {
+  assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
+  int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+  unsigned RootInstCount = getCFAluSize(RootCFAlu),
+      LaterInstCount = getCFAluSize(LatrCFAlu);
+  unsigned CumuledInsts = RootInstCount + LaterInstCount;
+  if (CumuledInsts >= TII->getMaxAlusPerClause()) {
+    DEBUG(dbgs() << "Excess inst counts\n");
+    return false;
+  }
+  if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+    return false;
+  // Is KCache Bank 0 compatible ?
+  int Mode0Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0);
+  int KBank0Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
+  int KBank0LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
+  if (LatrCFAlu->getOperand(Mode0Idx).getImm() &&
+      RootCFAlu->getOperand(Mode0Idx).getImm() &&
+      (LatrCFAlu->getOperand(KBank0Idx).getImm() !=
+       RootCFAlu->getOperand(KBank0Idx).getImm() ||
+      LatrCFAlu->getOperand(KBank0LineIdx).getImm() !=
+      RootCFAlu->getOperand(KBank0LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC0\n");
+    return false;
+  }
+  // Is KCache Bank 1 compatible ?
+  int Mode1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+  int KBank1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+  int KBank1LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+  if (LatrCFAlu->getOperand(Mode1Idx).getImm() &&
+      RootCFAlu->getOperand(Mode1Idx).getImm() &&
+      (LatrCFAlu->getOperand(KBank1Idx).getImm() !=
+      RootCFAlu->getOperand(KBank1Idx).getImm() ||
+      LatrCFAlu->getOperand(KBank1LineIdx).getImm() !=
+      RootCFAlu->getOperand(KBank1LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC0\n");
+    return false;
+  }
+  if (LatrCFAlu->getOperand(Mode0Idx).getImm()) {
+    RootCFAlu->getOperand(Mode0Idx).setImm(
+        LatrCFAlu->getOperand(Mode0Idx).getImm());
+    RootCFAlu->getOperand(KBank0Idx).setImm(
+        LatrCFAlu->getOperand(KBank0Idx).getImm());
+    RootCFAlu->getOperand(KBank0LineIdx).setImm(
+        LatrCFAlu->getOperand(KBank0LineIdx).getImm());
+  }
+  if (LatrCFAlu->getOperand(Mode1Idx).getImm()) {
+    RootCFAlu->getOperand(Mode1Idx).setImm(
+        LatrCFAlu->getOperand(Mode1Idx).getImm());
+    RootCFAlu->getOperand(KBank1Idx).setImm(
+        LatrCFAlu->getOperand(KBank1Idx).getImm());
+    RootCFAlu->getOperand(KBank1LineIdx).setImm(
+        LatrCFAlu->getOperand(KBank1LineIdx).getImm());
+  }
+  RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts);
+  RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode()));
+  return true;
+}
+
+bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    MachineBasicBlock::iterator I = MBB.begin(),  E = MBB.end();
+    MachineBasicBlock::iterator LatestCFAlu = E;
+    while (I != E) {
+      MachineInstr *MI = I++;
+      if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) ||
+          TII->mustBeLastInClause(MI->getOpcode()))
+        LatestCFAlu = E;
+      if (!isCFAlu(MI))
+        continue;
+      cleanPotentialDisabledCFAlu(MI);
+
+      if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) {
+        MI->eraseFromParent();
+      } else {
+        assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled");
+        LatestCFAlu = MI;
+      }
+    }
+  }
+  return false;
+}
+
+const char *R600ClauseMergePass::getPassName() const {
+  return "R600 Merge Clause Markers Pass";
+}
+
+} // end anonymous namespace
+
+
+llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
+  return new R600ClauseMergePass(TM);
+}
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 715be37..ac3d8f6 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -373,15 +373,6 @@ public:
         case AMDGPU::CF_ALU:
           I = MI;
           AluClauses.push_back(MakeALUClause(MBB, I));
-        case AMDGPU::EG_ExportBuf:
-        case AMDGPU::EG_ExportSwz:
-        case AMDGPU::R600_ExportBuf:
-        case AMDGPU::R600_ExportSwz:
-        case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
-        case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
-        case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
-        case AMDGPU::RAT_STORE_DWORD32_cm:
-        case AMDGPU::RAT_STORE_DWORD64_cm:
           DEBUG(dbgs() << CfCount << ":"; MI->dump(););
           CfCount++;
           break;
@@ -491,6 +482,10 @@ public:
             EmitALUClause(I, AluClauses[i], CfCount);
         }
         default:
+          if (TII->isExport(MI->getOpcode())) {
+            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+            CfCount++;
+          }
           break;
         }
       }
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index 90fc29c..1781f2a 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -44,7 +44,9 @@ namespace R600_InstFlag {
     TEX_INST = (1 << 13),
     ALU_INST = (1 << 14),
     LDS_1A = (1 << 15),
-    LDS_1A1D = (1 << 16)
+    LDS_1A1D = (1 << 16),
+    IS_EXPORT = (1 << 17),
+    LDS_1A2D = (1 << 18)
   };
 }
 
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index fac2b47..1bbfd2b 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -47,6 +47,11 @@ private:
       break;
     }
 
+    // These will be expanded to two ALU instructions in the
+    // ExpandSpecialInstructions pass.
+    if (TII->isLDSRetInstr(MI->getOpcode()))
+      return 2;
+
     if(TII->isVector(*MI) ||
         TII->isCubeOp(MI->getOpcode()) ||
         TII->isReductionOp(MI->getOpcode()))
@@ -84,6 +89,7 @@ private:
     switch (MI->getOpcode()) {
     case AMDGPU::KILL:
     case AMDGPU::RETURN:
+    case AMDGPU::IMPLICIT_DEF:
       return true;
     default:
       return false;
@@ -105,8 +111,13 @@ private:
   }
 
   bool SubstituteKCacheBank(MachineInstr *MI,
-      std::vector<std::pair<unsigned, unsigned> > &CachedConsts) const {
+      std::vector<std::pair<unsigned, unsigned> > &CachedConsts,
+      bool UpdateInstr = true) const {
     std::vector<std::pair<unsigned, unsigned> > UsedKCache;
+
+    if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4)
+      return true;
+
     const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts =
         TII->getSrcs(MI);
     assert((TII->isALUInstr(MI->getOpcode()) ||
@@ -139,6 +150,9 @@ private:
       return false;
     }
 
+    if (!UpdateInstr)
+      return true;
+
     for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
       if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
         continue;
@@ -159,6 +173,52 @@ private:
     return true;
   }
 
+  bool canClauseLocalKillFitInClause(
+                        unsigned AluInstCount,
+                        std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
+                        MachineBasicBlock::iterator Def,
+                        MachineBasicBlock::iterator BBEnd) {
+    const R600RegisterInfo &TRI = TII->getRegisterInfo();
+    for (MachineInstr::const_mop_iterator
+           MOI = Def->operands_begin(),
+           MOE = Def->operands_end(); MOI != MOE; ++MOI) {
+      if (!MOI->isReg() || !MOI->isDef() ||
+          TRI.isPhysRegLiveAcrossClauses(MOI->getReg()))
+        continue;
+
+      // Def defines a clause local register, so check that its use will fit
+      // in the clause.
+      unsigned LastUseCount = 0;
+      for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
+        AluInstCount += OccupiedDwords(UseI);
+        // Make sure we won't need to end the clause due to KCache limitations.
+        if (!SubstituteKCacheBank(UseI, KCacheBanks, false))
+          return false;
+
+        // We have reached the maximum instruction limit before finding the
+        // use that kills this register, so we cannot use this def in the
+        // current clause.
+        if (AluInstCount >= TII->getMaxAlusPerClause())
+          return false;
+
+        // Register kill flags have been cleared by the time we get to this
+        // pass, but it is safe to assume that all uses of this register
+        // occur in the same basic block as its definition, because
+        // it is illegal for the scheduler to schedule them in
+        // different blocks.
+        if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
+          LastUseCount = AluInstCount;
+
+        if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
+          break;
+      }
+      if (LastUseCount)
+        return LastUseCount <= TII->getMaxAlusPerClause();
+      llvm_unreachable("Clause local register live at end of clause.");
+    }
+    return true;
+  }
+
   MachineBasicBlock::iterator
   MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
     MachineBasicBlock::iterator ClauseHead = I;
@@ -173,6 +233,14 @@ private:
       if (AluInstCount > TII->getMaxAlusPerClause())
         break;
       if (I->getOpcode() == AMDGPU::PRED_X) {
+        // We put PRED_X in its own clause to ensure that ifcvt won't create
+        // clauses with more than 128 insts.
+        // IfCvt is indeed checking that "then" and "else" branches of an if
+        // statement have less than ~60 insts thus converted clauses can't be
+        // bigger than ~121 insts (predicate setter needs to be in the same
+        // clause as predicated alus).
+        if (AluInstCount > 0)
+          break;
         if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
           PushBeforeModifier = true;
         AluInstCount ++;
@@ -189,11 +257,13 @@ private:
         I++;
         break;
       }
-      if (TII->isALUInstr(I->getOpcode()) &&
-          !SubstituteKCacheBank(I, KCacheBanks))
+
+      // If this instruction defines a clause local register, make sure
+      // its use can fit in this clause.
+      if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
         break;
-      if (I->getOpcode() == AMDGPU::DOT_4 &&
-          !SubstituteKCacheBank(I, KCacheBanks))
+
+      if (!SubstituteKCacheBank(I, KCacheBanks))
         break;
       AluInstCount += OccupiedDwords(I);
     }
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index 67b42d7..aeee4aa 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -68,6 +68,23 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
       MachineInstr &MI = *I;
       I = llvm::next(I);
 
+      // Expand LDS_*_RET instructions
+      if (TII->isLDSRetInstr(MI.getOpcode())) {
+        int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+        assert(DstIdx != -1);
+        MachineOperand &DstOp = MI.getOperand(DstIdx);
+        MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
+                                               DstOp.getReg(), AMDGPU::OQAP);
+        DstOp.setReg(AMDGPU::OQAP);
+        int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
+                                           AMDGPU::OpName::pred_sel);
+        int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
+                                           AMDGPU::OpName::pred_sel);
+        // Copy the pred_sel bit
+        Mov->getOperand(MovPredSelIdx).setReg(
+            MI.getOperand(LDSPredSelIdx).getReg());
+      }
+
       switch (MI.getOpcode()) {
       default: break;
       // Expand PRED_X to one of the PRED_SET instructions.
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index ce6ac89..0fcb488 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -38,14 +38,24 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
 
   computeRegisterProperties();
 
-  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
-  setOperationAction(ISD::FADD, MVT::v2f32, Expand);
-  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
-  setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
-  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
-  setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
-  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
-  setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
+  // Set condition code actions
+  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+
+  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
 
   setOperationAction(ISD::FCOS, MVT::f32, Custom);
   setOperationAction(ISD::FSIN, MVT::f32, Custom);
@@ -69,21 +79,33 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::SETCC, MVT::f32, Expand);
   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
 
-  setOperationAction(ISD::SELECT, MVT::i32, Custom);
-  setOperationAction(ISD::SELECT, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
 
   // Legalize loads and stores to the private address space.
   setOperationAction(ISD::LOAD, MVT::i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
+  // spaces, so it is custom lowered to handle those where it isn't.
   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+
   setOperationAction(ISD::STORE, MVT::i8, Custom);
   setOperationAction(ISD::STORE, MVT::i32, Custom);
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 
   setOperationAction(ISD::LOAD, MVT::i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
@@ -99,7 +121,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
 
   setBooleanContents(ZeroOrNegativeOneBooleanContent);
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
-  setSchedulingPreference(Sched::VLIW);
+  setSchedulingPreference(Sched::Source);
 }
 
 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
@@ -111,7 +133,25 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
     static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
 
   switch (MI->getOpcode()) {
-  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  default:
+    // Replace LDS_*_RET instruction that don't have any uses with the
+    // equivalent LDS_*_NORET instruction.
+    if (TII->isLDSRetInstr(MI->getOpcode())) {
+      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+      assert(DstIdx != -1);
+      MachineInstrBuilder NewMI;
+      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
+        return BB;
+
+      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
+                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
+      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
+        NewMI.addOperand(MI->getOperand(i));
+      }
+    } else {
+      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+    }
+    break;
   case AMDGPU::CLAMP_R600: {
     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
@@ -147,19 +187,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
     break;
   }
 
-  case AMDGPU::LDS_READ_RET: {
-    MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
-                                        TII->get(MI->getOpcode()),
-                                        AMDGPU::OQAP);
-    for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
-      NewMI.addOperand(MI->getOperand(i));
-    }
-    TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
-                                 MI->getOperand(0).getReg(),
-                                 AMDGPU::OQAP);
-    break;
-  }
-
   case AMDGPU::MOV_IMM_F32:
     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                      MI->getOperand(1).getFPImm()->getValueAPF()
@@ -486,10 +513,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
   case ISD::FCOS:
   case ISD::FSIN: return LowerTrig(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
-  case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::LOAD: return LowerLOAD(Op, DAG);
-  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
   case ISD::INTRINSIC_VOID: {
     SDValue Chain = Op.getOperand(0);
@@ -554,7 +579,6 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
             DL, MVT::f32, SDValue(interp, 0));
       }
-
       MachineFunction &MF = DAG.getMachineFunction();
       MachineRegisterInfo &MRI = MF.getRegInfo();
       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
@@ -576,6 +600,24 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
             RegisterJNode, RegisterINode);
       return SDValue(interp, slot % 2);
     }
+    case AMDGPUIntrinsic::R600_interp_xy:
+    case AMDGPUIntrinsic::R600_interp_zw: {
+      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      MachineSDNode *interp;
+      SDValue RegisterINode = Op.getOperand(2);
+      SDValue RegisterJNode = Op.getOperand(3);
+
+      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
+        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
+            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
+            RegisterJNode, RegisterINode);
+      else
+        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
+            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
+            RegisterJNode, RegisterINode);
+      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
+          SDValue(interp, 0), SDValue(interp, 1));
+    }
     case AMDGPUIntrinsic::R600_tex:
     case AMDGPUIntrinsic::R600_texc:
     case AMDGPUIntrinsic::R600_txl:
@@ -585,7 +627,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
     case AMDGPUIntrinsic::R600_txf:
     case AMDGPUIntrinsic::R600_txq:
     case AMDGPUIntrinsic::R600_ddx:
-    case AMDGPUIntrinsic::R600_ddy: {
+    case AMDGPUIntrinsic::R600_ddy:
+    case AMDGPUIntrinsic::R600_ldptr: {
       unsigned TextureOp;
       switch (IntrinsicID) {
       case AMDGPUIntrinsic::R600_tex:
@@ -618,6 +661,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
       case AMDGPUIntrinsic::R600_ddy:
         TextureOp = 9;
         break;
+      case AMDGPUIntrinsic::R600_ldptr:
+        TextureOp = 10;
+        break;
       default:
         llvm_unreachable("Unknow Texture Operation");
       }
@@ -792,20 +838,6 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                      false, false, false, 0);
 }
 
-SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
-  MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL =
-   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
-
-  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
-  assert(FIN);
-
-  unsigned FrameIndex = FIN->getIndex();
-  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
-  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
-}
-
 bool R600TargetLowering::isZero(SDValue Op) const {
   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
     return Cst->isNullValue();
@@ -836,16 +868,27 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
   //
   // SET* can match the following patterns:
   //
-  // select_cc f32, f32, -1,  0, cc_any
-  // select_cc f32, f32, 1.0f, 0.0f, cc_any
-  // select_cc i32, i32, -1,  0, cc_any
+  // select_cc f32, f32, -1,  0, cc_supported
+  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
+  // select_cc i32, i32, -1,  0, cc_supported
   //
 
   // Move hardware True/False values to the correct operand.
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+  ISD::CondCode InverseCC =
+     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
   if (isHWTrueValue(False) && isHWFalseValue(True)) {
-    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
-    std::swap(False, True);
-    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
+    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
+      std::swap(False, True);
+      CC = DAG.getCondCode(InverseCC);
+    } else {
+      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
+      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
+        std::swap(False, True);
+        std::swap(LHS, RHS);
+        CC = DAG.getCondCode(SwapInvCC);
+      }
+    }
   }
 
   if (isHWTrueValue(True) && isHWFalseValue(False) &&
@@ -858,14 +901,34 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
   //
   // CND* can match the following patterns:
   //
-  // select_cc f32, 0.0, f32, f32, cc_any
-  // select_cc f32, 0.0, i32, i32, cc_any
-  // select_cc i32, 0,   f32, f32, cc_any
-  // select_cc i32, 0,   i32, i32, cc_any
+  // select_cc f32, 0.0, f32, f32, cc_supported
+  // select_cc f32, 0.0, i32, i32, cc_supported
+  // select_cc i32, 0,   f32, f32, cc_supported
+  // select_cc i32, 0,   i32, i32, cc_supported
   //
-  if (isZero(LHS) || isZero(RHS)) {
-    SDValue Cond = (isZero(LHS) ? RHS : LHS);
-    SDValue Zero = (isZero(LHS) ? LHS : RHS);
+
+  // Try to move the zero value to the RHS
+  if (isZero(LHS)) {
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    // Try swapping the operands
+    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
+    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+      std::swap(LHS, RHS);
+      CC = DAG.getCondCode(CCSwapped);
+    } else {
+      // Try inverting the conditon and then swapping the operands
+      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
+      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
+      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+        std::swap(True, False);
+        std::swap(LHS, RHS);
+        CC = DAG.getCondCode(CCSwapped);
+      }
+    }
+  }
+  if (isZero(RHS)) {
+    SDValue Cond = LHS;
+    SDValue Zero = RHS;
     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
     if (CompareVT != VT) {
       // Bitcast True / False to the correct types.  This will end up being
@@ -875,20 +938,11 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
     }
-    if (isZero(LHS)) {
-      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
-    }
 
     switch (CCOpcode) {
     case ISD::SETONE:
     case ISD::SETUNE:
     case ISD::SETNE:
-    case ISD::SETULE:
-    case ISD::SETULT:
-    case ISD::SETOLE:
-    case ISD::SETOLT:
-    case ISD::SETLE:
-    case ISD::SETLT:
       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
       Temp = True;
       True = False;
@@ -936,17 +990,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
       DAG.getCondCode(ISD::SETNE));
 }
 
-SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
-  return DAG.getNode(ISD::SELECT_CC,
-      SDLoc(Op),
-      Op.getValueType(),
-      Op.getOperand(0),
-      DAG.getConstant(0, MVT::i32),
-      Op.getOperand(1),
-      Op.getOperand(2),
-      DAG.getCondCode(ISD::SETNE));
-}
-
 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 /// convert these pointers to a register index.  Each register holds
 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
@@ -1009,19 +1052,59 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDValue Value = Op.getOperand(1);
   SDValue Ptr = Op.getOperand(2);
 
-  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
-      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
-    // Convert pointer from byte address to dword address.
-    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
-                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
-                                  Ptr, DAG.getConstant(2, MVT::i32)));
+  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Result.getNode()) {
+    return Result;
+  }
 
-    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
-      assert(!"Truncated and indexed stores not supported yet");
-    } else {
-      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
+  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
+    if (StoreNode->isTruncatingStore()) {
+      EVT VT = Value.getValueType();
+      assert(VT.bitsLE(MVT::i32));
+      EVT MemVT = StoreNode->getMemoryVT();
+      SDValue MaskConstant;
+      if (MemVT == MVT::i8) {
+        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
+      } else {
+        assert(MemVT == MVT::i16);
+        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
+      }
+      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
+                                      DAG.getConstant(2, MVT::i32));
+      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
+                                      DAG.getConstant(0x00000003, VT));
+      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
+      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+                                   DAG.getConstant(3, VT));
+      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
+      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
+      // vector instead.
+      SDValue Src[4] = {
+        ShiftedValue,
+        DAG.getConstant(0, MVT::i32),
+        DAG.getConstant(0, MVT::i32),
+        Mask
+      };
+      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
+      SDValue Args[3] = { Chain, Input, DWordAddr };
+      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
+                                     Op->getVTList(), Args, 3, MemVT,
+                                     StoreNode->getMemOperand());
+    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
+               Value.getValueType().bitsGE(MVT::i32)) {
+      // Convert pointer from byte address to dword address.
+      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
+                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
+                                    Ptr, DAG.getConstant(2, MVT::i32)));
+
+      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
+        assert(!"Truncated and indexed stores not supported yet");
+      } else {
+        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
+      }
+      return Chain;
     }
-    return Chain;
   }
 
   EVT ValueVT = Value.getValueType();
@@ -1121,12 +1204,22 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
   SDValue Ptr = Op.getOperand(1);
   SDValue LoweredLoad;
 
+  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
+    SDValue MergedValues[2] = {
+      SplitVectorLoad(Op, DAG),
+      Chain
+    };
+    return DAG.getMergeValues(MergedValues, 2, DL);
+  }
+
   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
-  if (ConstantBlock > -1) {
+  if (ConstantBlock > -1 &&
+      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
+       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
     SDValue Result;
-    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
-        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
-        dyn_cast<ConstantSDNode>(Ptr)) {
+    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
+        isa<Constant>(LoadNode->getSrcValue()) ||
+        isa<ConstantSDNode>(Ptr)) {
       SDValue Slots[4];
       for (unsigned i = 0; i < 4; i++) {
         // We want Const position encoded with the following formula :
@@ -1166,13 +1259,13 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
     return DAG.getMergeValues(MergedValues, 2, DL);
   }
 
-  // For most operations returning SDValue() will result int he node being
-  // expanded by the DAG Legalizer.  This is not the case for ISD::LOAD, so
-  // we need to manually expand loads that may be legal in some address spaces
-  // and illegal in others.  SEXT loads from CONSTANT_BUFFER_0 are supported
-  // for compute shaders, since the data is sign extended when it is uploaded
-  // to the buffer.  Howerver SEXT loads from other addresspaces are not
-  // supported, so we need to expand them here.
+  // For most operations returning SDValue() will result in the node being
+  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
+  // need to manually expand loads that may be legal in some address spaces and
+  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
+  // compute shaders, since the data is sign extended when it is uploaded to the
+  // buffer. However SEXT loads from other address spaces are not supported, so
+  // we need to expand them here.
   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
     EVT MemVT = LoadNode->getMemoryVT();
     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
@@ -1252,23 +1345,39 @@ SDValue R600TargetLowering::LowerFormalArguments(
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
+
+  SmallVector<ISD::InputArg, 8> LocalIns;
+
+  getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+                          LocalIns);
 
-  AnalyzeFormalArguments(CCInfo, Ins);
+  AnalyzeFormalArguments(CCInfo, LocalIns);
 
   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-    EVT VT = VA.getLocVT();
+    EVT VT = Ins[i].VT;
+    EVT MemVT = LocalIns[i].VT;
+
+    if (ShaderType != ShaderType::COMPUTE) {
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      InVals.push_back(Register);
+      continue;
+    }
 
     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                                    AMDGPUAS::CONSTANT_BUFFER_0);
 
     // The first 36 bytes of the input buffer contains information about
     // thread group and global sizes.
-    SDValue Arg = DAG.getLoad(VT, DL, Chain,
-                           DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
-                           MachinePointerInfo(UndefValue::get(PtrTy)), false,
-                           false, false, 4); // 4 is the prefered alignment for
-                                             // the CONSTANT memory space.
+    SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
+                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
+                                 MachinePointerInfo(UndefValue::get(PtrTy)),
+                                 MemVT, false, false, 4);
+                                 // 4 is the prefered alignment for
+                                 // the CONSTANT memory space.
     InVals.push_back(Arg);
   }
   return Chain;
@@ -1292,6 +1401,11 @@ CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
   };
 
   for (unsigned i = 0; i < 4; i++) {
+    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+      // We mask write here to teach later passes that the ith element of this
+      // vector is undef. Thus we can use it to reduce 128 bits reg usage,
+      // break false dependencies and additionnaly make assembly easier to read.
+      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
       if (C->isZero()) {
         RemapSwizzle[i] = 4; // SEL_0
@@ -1335,12 +1449,16 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
           ->getZExtValue();
-      if (!isUnmovable[Idx]) {
-        // Swap i and Idx
-        std::swap(NewBldVec[Idx], NewBldVec[i]);
-        std::swap(RemapSwizzle[RemapSwizzle[Idx]], RemapSwizzle[RemapSwizzle[i]]);
+      if (i == Idx) {
+        isUnmovable[Idx] = true;
+        continue;
       }
-      isUnmovable[Idx] = true;
+      if (isUnmovable[Idx])
+        continue;
+      // Swap i and Idx
+      std::swap(NewBldVec[Idx], NewBldVec[i]);
+      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
+      break;
     }
   }
 
@@ -1422,8 +1540,8 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
     break;
   }
 
-  // insert_vector_elt (build_vector elt0, …, eltN), NewEltIdx, idx
-  // => build_vector elt0, …, NewEltIdx, …, eltN
+  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
+  // => build_vector elt0, ... , NewEltIdx, ... , eltN
   case ISD::INSERT_VECTOR_ELT: {
     SDValue InVec = N->getOperand(0);
     SDValue InVal = N->getOperand(1);
@@ -1525,15 +1643,20 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
       LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
-      return DAG.getSelectCC(SDLoc(N),
-                             LHS.getOperand(0),
-                             LHS.getOperand(1),
-                             LHS.getOperand(2),
-                             LHS.getOperand(3),
-                             LHSCC);
+      if (DCI.isBeforeLegalizeOps() ||
+          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
+        return DAG.getSelectCC(SDLoc(N),
+                               LHS.getOperand(0),
+                               LHS.getOperand(1),
+                               LHS.getOperand(2),
+                               LHS.getOperand(3),
+                               LHSCC);
+      break;
     }
     }
+    return SDValue();
   }
+
   case AMDGPUISD::EXPORT: {
     SDValue Arg = N->getOperand(1);
     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
@@ -1586,3 +1709,253 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
   }
   return SDValue();
 }
+
+static bool
+FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
+            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
+  const R600InstrInfo *TII =
+      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+  if (!Src.isMachineOpcode())
+    return false;
+  switch (Src.getMachineOpcode()) {
+  case AMDGPU::FNEG_R600:
+    if (!Neg.getNode())
+      return false;
+    Src = Src.getOperand(0);
+    Neg = DAG.getTargetConstant(1, MVT::i32);
+    return true;
+  case AMDGPU::FABS_R600:
+    if (!Abs.getNode())
+      return false;
+    Src = Src.getOperand(0);
+    Abs = DAG.getTargetConstant(1, MVT::i32);
+    return true;
+  case AMDGPU::CONST_COPY: {
+    unsigned Opcode = ParentNode->getMachineOpcode();
+    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+
+    if (!Sel.getNode())
+      return false;
+
+    SDValue CstOffset = Src.getOperand(0);
+    if (ParentNode->getValueType(0).isVector())
+      return false;
+
+    // Gather constants values
+    int SrcIndices[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+    };
+    std::vector<unsigned> Consts;
+    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
+      int OtherSrcIdx = SrcIndices[i];
+      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
+      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
+        continue;
+      if (HasDst) {
+        OtherSrcIdx--;
+        OtherSelIdx--;
+      }
+      if (RegisterSDNode *Reg =
+          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
+        if (Reg->getReg() == AMDGPU::ALU_CONST) {
+          ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
+              ParentNode->getOperand(OtherSelIdx));
+          Consts.push_back(Cst->getZExtValue());
+        }
+      }
+    }
+
+    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
+    Consts.push_back(Cst->getZExtValue());
+    if (!TII->fitsConstReadLimitations(Consts)) {
+      return false;
+    }
+
+    Sel = CstOffset;
+    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
+    return true;
+  }
+  case AMDGPU::MOV_IMM_I32:
+  case AMDGPU::MOV_IMM_F32: {
+    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+    uint64_t ImmValue = 0;
+
+
+    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
+      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
+      float FloatValue = FPC->getValueAPF().convertToFloat();
+      if (FloatValue == 0.0) {
+        ImmReg = AMDGPU::ZERO;
+      } else if (FloatValue == 0.5) {
+        ImmReg = AMDGPU::HALF;
+      } else if (FloatValue == 1.0) {
+        ImmReg = AMDGPU::ONE;
+      } else {
+        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+      }
+    } else {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
+      uint64_t Value = C->getZExtValue();
+      if (Value == 0) {
+        ImmReg = AMDGPU::ZERO;
+      } else if (Value == 1) {
+        ImmReg = AMDGPU::ONE_INT;
+      } else {
+        ImmValue = Value;
+      }
+    }
+
+    // Check that we aren't already using an immediate.
+    // XXX: It's possible for an instruction to have more than one
+    // immediate operand, but this is not supported yet.
+    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+      if (!Imm.getNode())
+        return false;
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
+      assert(C);
+      if (C->getZExtValue())
+        return false;
+      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
+    }
+    Src = DAG.getRegister(ImmReg, MVT::i32);
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+
+/// \brief Fold the instructions after selecting them
+SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
+                                            SelectionDAG &DAG) const {
+  const R600InstrInfo *TII =
+      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+  if (!Node->isMachineOpcode())
+    return Node;
+  unsigned Opcode = Node->getMachineOpcode();
+  SDValue FakeOp;
+
+  std::vector<SDValue> Ops;
+  for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
+              I != E; ++I)
+          Ops.push_back(*I);
+
+  if (Opcode == AMDGPU::DOT_4) {
+    int OperandIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+        };
+    int NegIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
+    };
+    int AbsIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
+    };
+    for (unsigned i = 0; i < 8; i++) {
+      if (OperandIdx[i] < 0)
+        return Node;
+      SDValue &Src = Ops[OperandIdx[i] - 1];
+      SDValue &Neg = Ops[NegIdx[i] - 1];
+      SDValue &Abs = Ops[AbsIdx[i] - 1];
+      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+      if (HasDst)
+        SelIdx--;
+      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
+    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
+      SDValue &Src = Ops[i];
+      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  } else if (Opcode == AMDGPU::CLAMP_R600) {
+    SDValue Src = Node->getOperand(0);
+    if (!Src.isMachineOpcode() ||
+        !TII->hasInstrModifiers(Src.getMachineOpcode()))
+      return Node;
+    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
+        AMDGPU::OpName::clamp);
+    if (ClampIdx < 0)
+      return Node;
+    std::vector<SDValue> Ops;
+    unsigned NumOp = Src.getNumOperands();
+    for(unsigned i = 0; i < NumOp; ++i)
+          Ops.push_back(Src.getOperand(i));
+    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
+    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
+        Node->getVTList(), Ops);
+  } else {
+    if (!TII->hasInstrModifiers(Opcode))
+      return Node;
+    int OperandIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
+    };
+    int NegIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
+    };
+    int AbsIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
+      -1
+    };
+    for (unsigned i = 0; i < 3; i++) {
+      if (OperandIdx[i] < 0)
+        return Node;
+      SDValue &Src = Ops[OperandIdx[i] - 1];
+      SDValue &Neg = Ops[NegIdx[i] - 1];
+      SDValue FakeAbs;
+      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
+      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
+      if (HasDst) {
+        SelIdx--;
+        ImmIdx--;
+      }
+      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+      SDValue &Imm = Ops[ImmIdx];
+      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  }
+
+  return Node;
+}
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index a033fcb..c10257e 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -56,11 +56,9 @@ private:
   SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
@@ -68,6 +66,7 @@ private:
   void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
                        unsigned &Channel, unsigned &PtrIncr) const;
   bool isZero(SDValue Op) const;
+  virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
 };
 
 } // End namespace llvm;
diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td
index 2d72404..9428bab 100644
--- a/lib/Target/R600/R600InstrFormats.td
+++ b/lib/Target/R600/R600InstrFormats.td
@@ -16,7 +16,6 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
     : AMDGPUInst <outs, ins, asm, pattern> {
 
   field bits<64> Inst;
-  bit TransOnly = 0;
   bit Trig = 0;
   bit Op3 = 0;
   bit isVector = 0;
@@ -29,6 +28,8 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
   bit VTXInst = 0;
   bit TEXInst = 0;
   bit ALUInst = 0;
+  bit IsExport = 0;
+  bit LDS_1A2D = 0;
 
   let Namespace = "AMDGPU";
   let OutOperandList = outs;
@@ -37,7 +38,6 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
   let Pattern = pattern;
   let Itinerary = itin;
 
-  let TSFlags{0} = TransOnly;
   let TSFlags{4} = Trig;
   let TSFlags{5} = Op3;
 
@@ -53,6 +53,8 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
   let TSFlags{14} = ALUInst;
   let TSFlags{15} = LDS_1A;
   let TSFlags{16} = LDS_1A1D;
+  let TSFlags{17} = IsExport;
+  let TSFlags{18} = LDS_1A2D;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 4e7eff9..c0827fc 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -23,7 +23,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "AMDGPUGenDFAPacketizer.inc"
 
 using namespace llvm;
@@ -77,16 +77,16 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   }
 }
 
-MachineInstr * R600InstrInfo::getMovImmInstr(MachineFunction *MF,
-                                             unsigned DstReg, int64_t Imm) const {
-  MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::MOV), DebugLoc());
-  MachineInstrBuilder MIB(*MF, MI);
-  MIB.addReg(DstReg, RegState::Define);
-  MIB.addReg(AMDGPU::ALU_LITERAL_X);
-  MIB.addImm(Imm);
-  MIB.addReg(0); // PREDICATE_BIT
-
-  return MI;
+/// \returns true if \p MBBI can be moved into a new basic.
+bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI) const {
+  for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(),
+                                        E = MBBI->operands_end(); I != E; ++I) {
+    if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) &&
+        I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg()))
+      return false;
+  }
+  return true;
 }
 
 unsigned R600InstrInfo::getIEQOpcode() const {
@@ -149,17 +149,58 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
   unsigned TargetFlags = get(Opcode).TSFlags;
 
   return ((TargetFlags & R600_InstFlag::LDS_1A) |
-          (TargetFlags & R600_InstFlag::LDS_1A1D));
+          (TargetFlags & R600_InstFlag::LDS_1A1D) |
+          (TargetFlags & R600_InstFlag::LDS_1A2D));
+}
+
+bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const {
+  return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1;
+}
+
+bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
+  return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1;
+}
+
+bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const {
+  if (isALUInstr(MI->getOpcode()))
+    return true;
+  if (isVector(*MI) || isCubeOp(MI->getOpcode()))
+    return true;
+  switch (MI->getOpcode()) {
+  case AMDGPU::PRED_X:
+  case AMDGPU::INTERP_PAIR_XY:
+  case AMDGPU::INTERP_PAIR_ZW:
+  case AMDGPU::INTERP_VEC_LOAD:
+  case AMDGPU::COPY:
+  case AMDGPU::DOT_4:
+    return true;
+  default:
+    return false;
+  }
 }
 
 bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
-  return (get(Opcode).TSFlags & R600_InstFlag::TRANS_ONLY);
+  if (ST.hasCaymanISA())
+    return false;
+  return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU);
 }
 
 bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const {
   return isTransOnly(MI->getOpcode());
 }
 
+bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
+  return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU);
+}
+
+bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const {
+  return isVectorOnly(MI->getOpcode());
+}
+
+bool R600InstrInfo::isExport(unsigned Opcode) const {
+  return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT);
+}
+
 bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
   return ST.hasVertexCache() && IS_VTX(get(Opcode));
 }
@@ -189,6 +230,30 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
   }
 }
 
+bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const {
+  return  MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
+}
+
+bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const {
+  return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
+}
+
+bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const {
+  if (!isALUInstr(MI->getOpcode())) {
+    return false;
+  }
+  for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+                                        E = MI->operands_end(); I != E; ++I) {
+    if (!I->isReg() || !I->isUse() ||
+        TargetRegisterInfo::isVirtualRegister(I->getReg()))
+      continue;
+
+    if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
+      return true;
+  }
+  return false;
+}
+
 int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const {
   static const unsigned OpTable[] = {
     AMDGPU::OpName::src0,
@@ -321,6 +386,8 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
 static std::vector<std::pair<int, unsigned> >
 Swizzle(std::vector<std::pair<int, unsigned> > Src,
         R600InstrInfo::BankSwizzle Swz) {
+  if (Src[0] == Src[1])
+    Src[1].first = -1;
   switch (Swz) {
   case R600InstrInfo::ALU_VEC_012_SCL_210:
     break;
@@ -462,6 +529,9 @@ static bool
 isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
                   const std::vector<std::pair<int, unsigned> > &TransOps,
                   unsigned ConstCount) {
+  // TransALU can't read 3 constants
+  if (ConstCount > 2)
+    return false;
   for (unsigned i = 0, e = TransOps.size(); i < e; ++i) {
     const std::pair<int, unsigned> &Src = TransOps[i];
     unsigned Cycle = getTransSwizzle(TransSwz, i);
@@ -615,6 +685,11 @@ bool isJump(unsigned Opcode) {
   return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
 }
 
+static bool isBranch(unsigned Opcode) {
+  return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 ||
+      Opcode == AMDGPU::BRANCH_COND_f32;
+}
+
 bool
 R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                              MachineBasicBlock *&TBB,
@@ -633,6 +708,10 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       return false;
     --I;
   }
+  // AMDGPU::BRANCH* instructions are only available after isel and are not
+  // handled
+  if (isBranch(I->getOpcode()))
+    return true;
   if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) {
     return false;
   }
@@ -942,6 +1021,20 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
     return true;
   }
 
+  if (MI->getOpcode() == AMDGPU::DOT_4) {
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X))
+        .setReg(Pred[2].getReg());
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y))
+        .setReg(Pred[2].getReg());
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z))
+        .setReg(Pred[2].getReg());
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W))
+        .setReg(Pred[2].getReg());
+    MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+    MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+    return true;
+  }
+
   if (PIdx != -1) {
     MachineOperand &PMO = MI->getOperand(PIdx);
     PMO.setReg(Pred[2].getReg());
@@ -953,6 +1046,10 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
   return false;
 }
 
+unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const {
+  return 2;
+}
+
 unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
                                             const MachineInstr *MI,
                                             unsigned *PredCost) const {
@@ -961,67 +1058,25 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
   return 2;
 }
 
-int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  int Offset = 0;
-
-  if (MFI->getNumObjects() == 0) {
-    return -1;
-  }
-
-  if (MRI.livein_empty()) {
-    return 0;
-  }
-
-  for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
-                                            LE = MRI.livein_end();
-                                            LI != LE; ++LI) {
-    Offset = std::max(Offset,
-                      GET_REG_INDEX(RI.getEncodingValue(LI->first)));
-  }
-
-  return Offset + 1;
-}
-
-int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
-  int Offset = 0;
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  // Variable sized objects are not supported
-  assert(!MFI->hasVarSizedObjects());
-
-  if (MFI->getNumObjects() == 0) {
-    return -1;
-  }
-
-  Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
-
-  return getIndirectIndexBegin(MF) + Offset;
-}
-
-std::vector<unsigned> R600InstrInfo::getIndirectReservedRegs(
+void  R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
                                              const MachineFunction &MF) const {
   const AMDGPUFrameLowering *TFL =
                  static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering());
-  std::vector<unsigned> Regs;
 
   unsigned StackWidth = TFL->getStackWidth(MF);
   int End = getIndirectIndexEnd(MF);
 
-  if (End == -1) {
-    return Regs;
-  }
+  if (End == -1)
+    return;
 
   for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
     unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
-    Regs.push_back(SuperReg);
+    Reserved.set(SuperReg);
     for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
-      Regs.push_back(Reg);
+      Reserved.set(Reg);
     }
   }
-  return Regs;
 }
 
 unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
@@ -1031,13 +1086,8 @@ unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
   return RegIndex;
 }
 
-const TargetRegisterClass * R600InstrInfo::getIndirectAddrStoreRegClass(
-                                                     unsigned SourceReg) const {
-  return &AMDGPU::R600_TReg32RegClass;
-}
-
-const TargetRegisterClass *R600InstrInfo::getIndirectAddrLoadRegClass() const {
-  return &AMDGPU::TRegMemRegClass;
+const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
+  return &AMDGPU::R600_TReg32_XRegClass;
 }
 
 MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
@@ -1076,10 +1126,6 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
   return Mov;
 }
 
-const TargetRegisterClass *R600InstrInfo::getSuperIndirectRegClass() const {
-  return &AMDGPU::IndirectRegRegClass;
-}
-
 unsigned R600InstrInfo::getMaxAlusPerClause() const {
   return 115;
 }
@@ -1197,6 +1243,11 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
     AMDGPU::OpName::src1_sel,
   };
 
+  MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
+      getSlotedOps(AMDGPU::OpName::pred_sel, Slot)));
+  MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel))
+      .setReg(MO.getReg());
+
   for (unsigned i = 0; i < 14; i++) {
     MachineOperand &MO = MI->getOperand(
         getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot)));
@@ -1217,6 +1268,12 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
   return MovImm;
 }
 
+MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned DstReg, unsigned SrcReg) const {
+  return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg);
+}
+
 int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
   return getOperandIdx(MI.getOpcode(), Op);
 }
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index cdaa2fb..13d9810 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -55,6 +55,8 @@ namespace llvm {
                            MachineBasicBlock::iterator MI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
                            bool KillSrc) const;
+  bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI) const;
 
   bool isTrig(const MachineInstr &MI) const;
   bool isPlaceHolderOpcode(unsigned opcode) const;
@@ -65,9 +67,18 @@ namespace llvm {
   bool isALUInstr(unsigned Opcode) const;
   bool hasInstrModifiers(unsigned Opcode) const;
   bool isLDSInstr(unsigned Opcode) const;
+  bool isLDSNoRetInstr(unsigned Opcode) const;
+  bool isLDSRetInstr(unsigned Opcode) const;
+
+  /// \returns true if this \p Opcode represents an ALU instruction or an
+  /// instruction that will be lowered in ExpandSpecialInstrs Pass.
+  bool canBeConsideredALU(const MachineInstr *MI) const;
 
   bool isTransOnly(unsigned Opcode) const;
   bool isTransOnly(const MachineInstr *MI) const;
+  bool isVectorOnly(unsigned Opcode) const;
+  bool isVectorOnly(const MachineInstr *MI) const;
+  bool isExport(unsigned Opcode) const;
 
   bool usesVertexCache(unsigned Opcode) const;
   bool usesVertexCache(const MachineInstr *MI) const;
@@ -75,6 +86,9 @@ namespace llvm {
   bool usesTextureCache(const MachineInstr *MI) const;
 
   bool mustBeLastInClause(unsigned Opcode) const;
+  bool usesAddressRegister(MachineInstr *MI) const;
+  bool definesAddressRegister(MachineInstr *MI) const;
+  bool readsLDSSrcReg(const MachineInstr *MI) const;
 
   /// \returns The operand index for the given source number.  Legal values
   /// for SrcNum are 0, 1, and 2.
@@ -128,9 +142,6 @@ namespace llvm {
   /// instruction slots within an instruction group.
   bool isVector(const MachineInstr &MI) const;
 
-  virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
-                                        int64_t Imm) const;
-
   virtual unsigned getIEQOpcode() const;
   virtual bool isMov(unsigned Opcode) const;
 
@@ -177,6 +188,8 @@ namespace llvm {
   bool PredicateInstruction(MachineInstr *MI,
                         const SmallVectorImpl<MachineOperand> &Pred) const;
 
+  unsigned int getPredicationCost(const MachineInstr *) const;
+
   unsigned int getInstrLatency(const InstrItineraryData *ItinData,
                                const MachineInstr *MI,
                                unsigned *PredCost = 0) const;
@@ -184,22 +197,14 @@ namespace llvm {
   virtual int getInstrLatency(const InstrItineraryData *ItinData,
                               SDNode *Node) const { return 1;}
 
-  /// \returns a list of all the registers that may be accesed using indirect
-  /// addressing.
-  std::vector<unsigned> getIndirectReservedRegs(const MachineFunction &MF) const;
-
-  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
-
-  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
-
+  /// \brief Reserve the registers that may be accesed using indirect addressing.
+  void reserveIndirectRegisters(BitVector &Reserved,
+                                const MachineFunction &MF) const;
 
   virtual unsigned calculateIndirectAddress(unsigned RegIndex,
                                             unsigned Channel) const;
 
-  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
-                                                      unsigned SourceReg) const;
-
-  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
+  virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
 
   virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
                                   MachineBasicBlock::iterator I,
@@ -211,8 +216,6 @@ namespace llvm {
                                   unsigned ValueReg, unsigned Address,
                                   unsigned OffsetReg) const;
 
-  virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
-
   unsigned getMaxAlusPerClause() const;
 
   ///buildDefaultInstruction - This function returns a MachineInstr with
@@ -239,6 +242,10 @@ namespace llvm {
                                   unsigned DstReg,
                                   uint64_t Imm) const;
 
+  MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator I,
+                              unsigned DstReg, unsigned SrcReg) const;
+
   /// \brief Get the index of Op in the MachineInstr.
   ///
   /// \returns -1 if the Instruction does not contain the specified \p Op.
@@ -272,6 +279,12 @@ namespace llvm {
   void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
 };
 
+namespace AMDGPU {
+
+int getLDSNoRetOp(uint16_t Opcode);
+
+} //End namespace AMDGPU
+
 } // End llvm namespace
 
 #endif // R600INSTRINFO_H_
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 7e61b18..0346e24 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -75,7 +75,6 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
 def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
 def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
 def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
-def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
 
 
 def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
@@ -230,7 +229,7 @@ def TEX_RECT : PatLeaf<
 def TEX_ARRAY : PatLeaf<
   (imm),
   [{uint32_t TType = (uint32_t)N->getZExtValue();
-    return TType == 9 || TType == 10 || TType == 15 || TType == 16;
+    return TType == 9 || TType == 10 || TType == 16;
   }]
 >;
 
@@ -241,12 +240,26 @@ def TEX_SHADOW_ARRAY : PatLeaf<
   }]
 >;
 
-class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> mask, dag outs,
-                 dag ins, string asm, list<dag> pattern> :
+def TEX_MSAA : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 14;
+  }]
+>;
+
+def TEX_ARRAY_MSAA : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 15;
+  }]
+>;
+
+class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
+                 dag outs, dag ins, string asm, list<dag> pattern> :
     InstR600ISA <outs, ins, asm, pattern>,
     CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF  {
 
-  let rat_id = 0;
+  let rat_id = ratid;
   let rat_inst = ratinst;
   let rim         = 0;
   // XXX: Have a separate instruction for non-indexed writes.
@@ -264,6 +277,7 @@ class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> mask, dag outs,
 
   let Inst{31-0} = Word0;
   let Inst{63-32} = Word1;
+  let IsExport = 1;
 
 }
 
@@ -403,7 +417,7 @@ def INTERP_VEC_LOAD :  AMDGPUShaderInst <
   (outs R600_Reg128:$dst),
   (ins i32imm:$src0),
   "INTERP_LOAD $src0 : $dst",
-  []>;
+  [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>;
 
 def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
   let bank_swizzle = 5;
@@ -537,6 +551,7 @@ class ExportSwzInst : InstR600ISA<(
   let elem_size = 3;
   let Inst{31-0} = Word0;
   let Inst{63-32} = Word1;
+  let IsExport = 1;
 }
 
 } // End usesCustomInserter = 1
@@ -550,6 +565,7 @@ class ExportBufInst : InstR600ISA<(
   let elem_size = 0;
   let Inst{31-0} = Word0;
   let Inst{63-32} = Word1;
+  let IsExport = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -573,6 +589,7 @@ i32imm:$COUNT, i32imm:$Enabled),
   let ALT_CONST = 0;
   let WHOLE_QUAD_MODE = 0;
   let BARRIER = 1;
+  let UseNamedOperandTable = 1;
 
   let Inst{31-0} = Word0;
   let Inst{63-32} = Word1;
@@ -672,42 +689,42 @@ def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
 // XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
 def SETE : R600_2OP <
   0x08, "SETE",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_EQ))]
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))]
 >;
 
 def SGT : R600_2OP <
   0x09, "SETGT",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GT))]
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))]
 >;
 
 def SGE : R600_2OP <
   0xA, "SETGE",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GE))]
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))]
 >;
 
 def SNE : R600_2OP <
   0xB, "SETNE",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_NE))]
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE))]
 >;
 
 def SETE_DX10 : R600_2OP <
   0xC, "SETE_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_EQ))]
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))]
 >;
 
 def SETGT_DX10 : R600_2OP <
   0xD, "SETGT_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GT))]
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))]
 >;
 
 def SETGE_DX10 : R600_2OP <
   0xE, "SETGE_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GE))]
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))]
 >;
 
 def SETNE_DX10 : R600_2OP <
   0xF, "SETNE_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_NE))]
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE))]
 >;
 
 def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
@@ -805,12 +822,12 @@ def CNDE_INT : R600_3OP <
 
 def CNDGE_INT : R600_3OP <
   0x1E, "CNDGE_INT",
-  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GE))]
+  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))]
 >;
 
 def CNDGT_INT : R600_3OP <
   0x1D, "CNDGT_INT",
-  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GT))]
+  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))]
 >;
 
 //===----------------------------------------------------------------------===//
@@ -863,6 +880,9 @@ def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">;
 def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">;
 def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">;
 def TEX_LD : R600_TEX <0x03, "TEX_LD">;
+def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> {
+  let INST_MOD = 1;
+}
 def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">;
 def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">;
 def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">;
@@ -881,6 +901,7 @@ defm : TexPattern<6, TEX_LD, v4i32>;
 defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>;
 defm : TexPattern<8, TEX_GET_GRADIENTS_H>;
 defm : TexPattern<9, TEX_GET_GRADIENTS_V>;
+defm : TexPattern<10, TEX_LDPTR, v4i32>;
 
 //===----------------------------------------------------------------------===//
 // Helper classes for common instructions
@@ -903,18 +924,22 @@ class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
 
 class CNDE_Common <bits<5> inst> : R600_3OP <
   inst, "CNDE",
-  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_EQ))]
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))]
 >;
 
 class CNDGT_Common <bits<5> inst> : R600_3OP <
   inst, "CNDGT",
-  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GT))]
->;
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))]
+> {
+  let Itinerary = VecALU;
+}
 
 class CNDGE_Common <bits<5> inst> : R600_3OP <
   inst, "CNDGE",
-  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))]
->;
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))]
+> {
+  let Itinerary = VecALU;
+}
 
 
 let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
@@ -984,35 +1009,30 @@ multiclass CUBE_Common <bits<11> inst> {
 class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "EXP_IEEE", fexp2
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "FLT_TO_INT", fp_to_sint
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "INT_TO_FLT", sint_to_fp
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "FLT_TO_UINT", fp_to_uint
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "UINT_TO_FLT", uint_to_fp
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
@@ -1023,7 +1043,6 @@ class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
 class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "LOG_IEEE", flog2
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
@@ -1033,75 +1052,68 @@ class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
 class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
   inst, "MULHI_INT", mulhs
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
   inst, "MULHI", mulhu
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
   inst, "MULLO_INT", mul
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
   inst, "RECIP_CLAMPED", []
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
   inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "RECIP_UINT", AMDGPUurecip
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
   inst, "RECIPSQRT_IEEE", []
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class SIN_Common <bits<11> inst> : R600_1OP <
   inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{
   let Trig = 1;
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class COS_Common <bits<11> inst> : R600_1OP <
   inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> {
   let Trig = 1;
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
+def CLAMP_R600 :  CLAMP <R600_Reg32>;
+def FABS_R600 : FABS<R600_Reg32>;
+def FNEG_R600 : FNEG<R600_Reg32>;
+
 //===----------------------------------------------------------------------===//
 // Helper patterns for complex intrinsics
 //===----------------------------------------------------------------------===//
@@ -1124,6 +1136,13 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie
   (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
 >;
 
+// FROUND pattern
+class FROUNDPat<Instruction CNDGE> : Pat <
+  (AMDGPUround f32:$x),
+  (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
+>;
+
+
 //===----------------------------------------------------------------------===//
 // R600 / R700 Instructions
 //===----------------------------------------------------------------------===//
@@ -1165,11 +1184,12 @@ let Predicates = [isR600] in {
   def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
 
   def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
+  def : FROUNDPat <CNDGE_r600>;
 
   def R600_ExportSwz : ExportSwzInst {
     let Word1{20-17} = 0; // BURST_COUNT
     let Word1{21} = eop;
-    let Word1{22} = 1; // VALID_PIXEL_MODE
+    let Word1{22} = 0; // VALID_PIXEL_MODE
     let Word1{30-23} = inst;
     let Word1{31} = 1; // BARRIER
   }
@@ -1178,7 +1198,7 @@ let Predicates = [isR600] in {
   def R600_ExportBuf : ExportBufInst {
     let Word1{20-17} = 0; // BURST_COUNT
     let Word1{21} = eop;
-    let Word1{22} = 1; // VALID_PIXEL_MODE
+    let Word1{22} = 0; // VALID_PIXEL_MODE
     let Word1{30-23} = inst;
     let Word1{31} = 1; // BARRIER
   }
@@ -1247,6 +1267,33 @@ let Predicates = [isR700] in {
 }
 
 //===----------------------------------------------------------------------===//
+// Evergreen / Cayman store instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
+                           string name, list<dag> pattern>
+    : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins,
+                 "MEM_RAT_CACHELESS "#name, pattern>;
+
+class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
+                  list<dag> pattern>
+    : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
+                 "MEM_RAT "#name, pattern>;
+
+def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
+  (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+  "MSKOR $rw_gpr.XW, $index_gpr",
+  [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)]
+> {
+  let eop = 0;
+}
+
+} // End Predicates = [isEGorCayman]
+
+
+//===----------------------------------------------------------------------===//
 // Evergreen Only instructions
 //===----------------------------------------------------------------------===//
 
@@ -1274,36 +1321,32 @@ def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
 //===----------------------------------------------------------------------===//
 // Memory read/write instructions
 //===----------------------------------------------------------------------===//
-let usesCustomInserter = 1 in {
 
-class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> mask, string name,
-                              list<dag> pattern>
-    : EG_CF_RAT <0x57, 0x2, mask, (outs), ins, name, pattern> {
-}
-
-} // End usesCustomInserter = 1
+let usesCustomInserter = 1 in {
 
 // 32-bit store
-def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
+def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1,
   (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
-  0x1, "RAT_WRITE_CACHELESS_32_eg $rw_gpr, $index_gpr, $eop",
+  "STORE_RAW $rw_gpr, $index_gpr, $eop",
   [(global_store i32:$rw_gpr, i32:$index_gpr)]
 >;
 
 // 64-bit store
-def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg <
+def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3,
   (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
-  0x3, "RAT_WRITE_CACHELESS_64_eg $rw_gpr.XY, $index_gpr, $eop",
+  "STORE_RAW $rw_gpr.XY, $index_gpr, $eop",
   [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
 >;
 
 //128-bit store
-def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
+def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
   (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
-  0xf, "RAT_WRITE_CACHELESS_128 $rw_gpr.XYZW, $index_gpr, $eop",
+  "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop",
   [(global_store v4i32:$rw_gpr, i32:$index_gpr)]
 >;
 
+} // End usesCustomInserter = 1
+
 class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
     : VTX_WORD0_eg, VTX_READ<name, buffer_id, outs, pattern> {
 
@@ -1508,7 +1551,6 @@ let hasSideEffects = 1 in {
 
   def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
     let Pattern = [];
-    let TransOnly = 0;
     let Itinerary = AnyALU;
   }
 
@@ -1600,29 +1642,83 @@ class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
   let DisableEncoding = "$dst";
 }
 
-class R600_LDS_1A1D <bits<6> lds_op, string name, list<dag> pattern> :
+class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+                     string dst =""> :
     R600_LDS <
-  lds_op,
-  (outs),
+  lds_op, outs,
   (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
        R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
        LAST:$last, R600_Pred:$pred_sel,
        BANK_SWIZZLE:$bank_swizzle),
-  "  "#name#" $last $src0$src0_rel, $src1$src1_rel, $pred_sel",
+  "  "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel",
   pattern
   > {
 
+  field string BaseOp;
+
   let src2 = 0;
   let src2_rel = 0;
   let LDS_1A1D = 1;
 }
 
+class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A1D <lds_op, (outs), name, pattern> {
+  let BaseOp = name;
+}
+
+class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A1D <lds_op,  (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
+
+  let BaseOp = name;
+  let usesCustomInserter = 1;
+  let DisableEncoding = "$dst";
+  let Defs = [OQAP];
+}
+
+class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS <
+  lds_op,
+  (outs),
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+       R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel,
+       LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle),
+  "  "#name# "$last $src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
+  pattern> {
+  let LDS_1A2D = 1;
+}
+
+def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
+def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >;
+def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
+  [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+>;
+def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE",
+  [(truncstorei8_local i32:$src1, i32:$src0)]
+>;
+def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
+  [(truncstorei16_local i32:$src1, i32:$src0)]
+>;
+def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
+  [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+>;
+def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
+  [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
+>;
 def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
   [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
 >;
-
-def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE",
-  [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET",
+  [(set i32:$dst, (sextloadi8_local i32:$src0))]
+>;
+def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET",
+  [(set i32:$dst, (az_extloadi8_local i32:$src0))]
+>;
+def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET",
+  [(set i32:$dst, (sextloadi16_local i32:$src0))]
+>;
+def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
+  [(set i32:$dst, (az_extloadi16_local i32:$src0))]
 >;
 
   // TRUNC is used for the FLT_TO_INT instructions to work around a
@@ -1642,9 +1738,11 @@ def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE",
   // SHA-256 Patterns
   def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
 
+  def : FROUNDPat <CNDGE_eg>;
+
   def EG_ExportSwz : ExportSwzInst {
     let Word1{19-16} = 0; // BURST_COUNT
-    let Word1{20} = 1; // VALID_PIXEL_MODE
+    let Word1{20} = 0; // VALID_PIXEL_MODE
     let Word1{21} = eop;
     let Word1{29-22} = inst;
     let Word1{30} = 0; // MARK
@@ -1654,7 +1752,7 @@ def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE",
 
   def EG_ExportBuf : ExportBufInst {
     let Word1{19-16} = 0; // BURST_COUNT
-    let Word1{20} = 1; // VALID_PIXEL_MODE
+    let Word1{20} = 0; // VALID_PIXEL_MODE
     let Word1{21} = eop;
     let Word1{29-22} = inst;
     let Word1{30} = 0; // MARK
@@ -1771,23 +1869,17 @@ def : Pat <
 
 def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
 
-
-class  RAT_STORE_DWORD_cm <bits<4> mask, dag ins, list<dag> pat> : EG_CF_RAT <
-  0x57, 0x14, mask, (outs), ins,
-  "EXPORT_RAT_INST_STORE_DWORD $rw_gpr, $index_gpr", pat
-> {
+class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
+  CF_MEM_RAT_CACHELESS <0x14, 0, mask,
+                        (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
+                        "STORE_DWORD $rw_gpr, $index_gpr",
+                        [(global_store vt:$rw_gpr, i32:$index_gpr)]> {
   let eop = 0; // This bit is not used on Cayman.
 }
 
-def RAT_STORE_DWORD32_cm : RAT_STORE_DWORD_cm <0x1,
-  (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr),
-  [(global_store i32:$rw_gpr, i32:$index_gpr)]
->;
-
-def RAT_STORE_DWORD64_cm : RAT_STORE_DWORD_cm <0x3,
-  (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr),
-  [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
->;
+def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>;
+def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
+def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
 
 class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
     : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
@@ -2012,10 +2104,6 @@ def TXD_SHADOW: InstR600 <
 } // End isPseudo = 1
 } // End usesCustomInserter = 1
 
-def CLAMP_R600 :  CLAMP <R600_Reg32>;
-def FABS_R600 : FABS<R600_Reg32>;
-def FNEG_R600 : FNEG<R600_Reg32>;
-
 //===---------------------------------------------------------------------===//
 // Return instruction
 //===---------------------------------------------------------------------===//
@@ -2164,7 +2252,7 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
   def BRANCH : ILFormat<(outs), (ins brtarget:$target),
       "; Pseudo unconditional branch instruction",
       [(br bb:$target)]>;
-  defm BRANCH_COND : BranchConditional<IL_brcond>;
+  defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -2235,7 +2323,7 @@ def : CND_INT_f32 <CNDGE_INT, SETGE>;
 
 //CNDGE_INT extra pattern
 def : Pat <
-  (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_GT),
+  (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT),
   (CNDGE_INT $src0, $src1, $src2)
 >;
 
@@ -2250,86 +2338,6 @@ def KIL : Pat <
   (MASK_WRITE (KILLGT (f32 ZERO), $src0))
 >;
 
-// SGT Reverse args
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LT),
-  (SGT $src1, $src0)
->;
-
-// SGE Reverse args
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LE),
-  (SGE $src1, $src0)
->;
-
-// SETGT_DX10 reverse args
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, -1, 0, COND_LT),
-  (SETGT_DX10 $src1, $src0)
->;
-
-// SETGE_DX10 reverse args
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, -1, 0, COND_LE),
-  (SETGE_DX10 $src1, $src0)
->;
-
-// SETGT_INT reverse args
-def : Pat <
-  (selectcc i32:$src0, i32:$src1, -1, 0, SETLT),
-  (SETGT_INT $src1, $src0)
->;
-
-// SETGE_INT reverse args
-def : Pat <
-  (selectcc i32:$src0, i32:$src1, -1, 0, SETLE),
-  (SETGE_INT $src1, $src0)
->;
-
-// SETGT_UINT reverse args
-def : Pat <
-  (selectcc i32:$src0, i32:$src1, -1, 0, SETULT),
-  (SETGT_UINT $src1, $src0)
->;
-
-// SETGE_UINT reverse args
-def : Pat <
-  (selectcc i32:$src0, i32:$src1, -1, 0, SETULE),
-  (SETGE_UINT $src1, $src0)
->;
-
-// The next two patterns are special cases for handling 'true if ordered' and
-// 'true if unordered' conditionals.  The assumption here is that the behavior of
-// SETE and SNE conforms to the Direct3D 10 rules for floating point values
-// described here:
-// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308050.aspx#alpha_32_bit
-// We assume that  SETE returns false when one of the operands is NAN and
-// SNE returns true when on of the operands is NAN
-
-//SETE - 'true if ordered'
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETO),
-  (SETE $src0, $src1)
->;
-
-//SETE_DX10 - 'true if ordered'
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, -1, 0, SETO),
-  (SETE_DX10 $src0, $src1)
->;
-
-//SNE - 'true if unordered'
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETUO),
-  (SNE $src0, $src1)
->;
-
-//SETNE_DX10 - 'true if ordered'
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, -1, 0, SETUO),
-  (SETNE_DX10 $src0, $src1)
->;
-
 def : Extract_Element <f32, v4f32, 0, sub0>;
 def : Extract_Element <f32, v4f32, 1, sub1>;
 def : Extract_Element <f32, v4f32, 2, sub2>;
@@ -2378,3 +2386,11 @@ def : BitConvert <v4i32, v4f32, R600_Reg128>;
 def : DwordAddrPat  <i32, R600_Reg32>;
 
 } // End isR600toCayman Predicate
+
+def getLDSNoRetOp : InstrMapping {
+  let FilterClass = "R600_LDS_1A1D";
+  let RowFields = ["BaseOp"];
+  let ColFields = ["DisableEncoding"];
+  let KeyCol = ["$dst"];
+  let ValueCols = [[""""]];
+}
diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td
index 58d86b6..9681747 100644
--- a/lib/Target/R600/R600Intrinsics.td
+++ b/lib/Target/R600/R600Intrinsics.td
@@ -43,6 +43,12 @@ let TargetPrefix = "R600", isTarget = 1 in {
     Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
   def int_R600_interp_input :
     Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_interp_const :
+    Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_R600_interp_xy :
+    Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+def int_R600_interp_zw :
+    Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
   def int_R600_load_texbuf :
     Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_R600_tex : TextureIntrinsicFloatInput;
@@ -52,6 +58,7 @@ let TargetPrefix = "R600", isTarget = 1 in {
   def int_R600_txb : TextureIntrinsicFloatInput;
   def int_R600_txbc : TextureIntrinsicFloatInput;
   def int_R600_txf : TextureIntrinsicInt32Input;
+  def int_R600_ldptr : TextureIntrinsicInt32Input;
   def int_R600_txq : TextureIntrinsicInt32Input;
   def int_R600_ddx : TextureIntrinsicFloatInput;
   def int_R600_ddy : TextureIntrinsicFloatInput;
diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp
index 018b403..01105c6 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.cpp
+++ b/lib/Target/R600/R600MachineFunctionInfo.cpp
@@ -12,7 +12,9 @@
 
 using namespace llvm;
 
-R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
-  : AMDGPUMachineFunction(MF) { }
 
+// Pin the vtable to this file.
+void R600MachineFunctionInfo::anchor() {}
 
+R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
+  : AMDGPUMachineFunction(MF) { }
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index f23d9b7..c1bec0a 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
@@ -21,6 +21,7 @@
 namespace llvm {
 
 class R600MachineFunctionInfo : public AMDGPUMachineFunction {
+  virtual void anchor();
 public:
   R600MachineFunctionInfo(const MachineFunction &MF);
   SmallVector<unsigned, 4> LiveOuts;
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 0dc0365..da2a4d8 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -9,7 +9,6 @@
 //
 /// \file
 /// \brief R600 Machine Scheduler interface
-// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot
 //
 //===----------------------------------------------------------------------===//
 
@@ -29,6 +28,7 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
   DAG = dag;
   TII = static_cast<const R600InstrInfo*>(DAG->TII);
   TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
+  VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
   MRI = &DAG->MRI;
   CurInstKind = IDOther;
   CurEmitted = 0;
@@ -92,15 +92,6 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
       AllowSwitchFromAlu = true;
   }
 
-
-  // We want to scheduled AR defs as soon as possible to make sure they aren't
-  // put in a different ALU clause from their uses.
-  if (!SU && !UnscheduledARDefs.empty()) {
-      SU = UnscheduledARDefs[0];
-      UnscheduledARDefs.erase(UnscheduledARDefs.begin());
-      NextInstKind = IDAlu;
-  }
-
   if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
       (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
     // try to pick ALU
@@ -130,15 +121,6 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
       NextInstKind = IDOther;
   }
 
-  // We want to schedule the AR uses as late as possible to make sure that
-  // the AR defs have been released.
-  if (!SU && !UnscheduledARUses.empty()) {
-      SU = UnscheduledARUses[0];
-      UnscheduledARUses.erase(UnscheduledARUses.begin());
-      NextInstKind = IDAlu;
-  }
-
-
   DEBUG(
       if (SU) {
         dbgs() << " ** Pick node **\n";
@@ -217,20 +199,6 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
 
   int IK = getInstKind(SU);
 
-  // Check for AR register defines
-  for (MachineInstr::const_mop_iterator I = SU->getInstr()->operands_begin(),
-                                        E = SU->getInstr()->operands_end();
-                                        I != E; ++I) {
-    if (I->isReg() && I->getReg() == AMDGPU::AR_X) {
-      if (I->isDef()) {
-        UnscheduledARDefs.push_back(SU);
-      } else {
-        UnscheduledARUses.push_back(SU);
-      }
-      return;
-    }
-  }
-
   // There is no export clause, we can schedule one as soon as its ready
   if (IK == IDOther)
     Available[IDOther].push_back(SU);
@@ -314,6 +282,10 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
     if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
       return AluT_XYZW;
 
+    // LDS src registers cannot be used in the Trans slot.
+    if (TII->readsLDSSrcReg(MI))
+      return AluT_XYZW;
+
     return AluAny;
 
 }
@@ -342,14 +314,16 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
   }
 }
 
-SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
+SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
   if (Q.empty())
     return NULL;
   for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
       It != E; ++It) {
     SUnit *SU = *It;
     InstructionsGroupCandidate.push_back(SU->getInstr());
-    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)) {
+    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)
+        && (!AnyALU || !TII->isVectorOnly(SU->getInstr()))
+    ) {
       InstructionsGroupCandidate.pop_back();
       Q.erase((It + 1).base());
       return SU;
@@ -373,6 +347,8 @@ void R600SchedStrategy::PrepareNextSlot() {
   DEBUG(dbgs() << "New Slot\n");
   assert (OccupedSlotsMask && "Slot wasn't filled");
   OccupedSlotsMask = 0;
+//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
+//    OccupedSlotsMask |= 16;
   InstructionsGroupCandidate.clear();
   LoadAlu();
 }
@@ -409,12 +385,12 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
   }
 }
 
-SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
+SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
   static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
-  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
+  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
   if (SlotedSU)
     return SlotedSU;
-  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
+  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
   if (UnslotedSU)
     AssignSlot(UnslotedSU->getInstr(), Slot);
   return UnslotedSU;
@@ -434,30 +410,35 @@ SUnit* R600SchedStrategy::pickAlu() {
       // Bottom up scheduling : predX must comes first
       if (!AvailableAlus[AluPredX].empty()) {
         OccupedSlotsMask |= 31;
-        return PopInst(AvailableAlus[AluPredX]);
+        return PopInst(AvailableAlus[AluPredX], false);
       }
       // Flush physical reg copies (RA will discard them)
       if (!AvailableAlus[AluDiscarded].empty()) {
         OccupedSlotsMask |= 31;
-        return PopInst(AvailableAlus[AluDiscarded]);
+        return PopInst(AvailableAlus[AluDiscarded], false);
       }
       // If there is a T_XYZW alu available, use it
       if (!AvailableAlus[AluT_XYZW].empty()) {
         OccupedSlotsMask |= 15;
-        return PopInst(AvailableAlus[AluT_XYZW]);
+        return PopInst(AvailableAlus[AluT_XYZW], false);
       }
     }
     bool TransSlotOccuped = OccupedSlotsMask & 16;
-    if (!TransSlotOccuped) {
+    if (!TransSlotOccuped && VLIW5) {
       if (!AvailableAlus[AluTrans].empty()) {
         OccupedSlotsMask |= 16;
-        return PopInst(AvailableAlus[AluTrans]);
+        return PopInst(AvailableAlus[AluTrans], false);
+      }
+      SUnit *SU = AttemptFillSlot(3, true);
+      if (SU) {
+        OccupedSlotsMask |= 16;
+        return SU;
       }
     }
     for (int Chan = 3; Chan > -1; --Chan) {
       bool isOccupied = OccupedSlotsMask & (1 << Chan);
       if (!isOccupied) {
-        SUnit *SU = AttemptFillSlot(Chan);
+        SUnit *SU = AttemptFillSlot(Chan, false);
         if (SU) {
           OccupedSlotsMask |= (1 << Chan);
           InstructionsGroupCandidate.push_back(SU->getInstr());
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index f8965d8..97c8cde 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -53,8 +53,6 @@ class R600SchedStrategy : public MachineSchedStrategy {
 
   std::vector<SUnit *> Available[IDLast], Pending[IDLast];
   std::vector<SUnit *> AvailableAlus[AluLast];
-  std::vector<SUnit *> UnscheduledARDefs;
-  std::vector<SUnit *> UnscheduledARUses;
   std::vector<SUnit *> PhysicalRegCopy;
 
   InstKind CurInstKind;
@@ -84,15 +82,16 @@ public:
 
 private:
   std::vector<MachineInstr *> InstructionsGroupCandidate;
+  bool VLIW5;
 
   int getInstKind(SUnit *SU);
   bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
   AluKind getAluKind(SUnit *SU) const;
   void LoadAlu();
   unsigned AvailablesAluCount() const;
-  SUnit *AttemptFillSlot (unsigned Slot);
+  SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu);
   void PrepareNextSlot();
-  SUnit *PopInst(std::vector<SUnit*> &Q);
+  SUnit *PopInst(std::vector<SUnit*> &Q, bool AnyALU);
 
   void AssignSlot(MachineInstr *MI, unsigned Slot);
   SUnit* pickAlu();
diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
index acacffa..cf719c0 100644
--- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
@@ -50,6 +50,9 @@ isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
       E = MRI.def_end(); It != E; ++It) {
     return (*It).isImplicitDef();
   }
+  if (MRI.isReserved(Reg)) {
+    return false;
+  }
   llvm_unreachable("Reg without a def");
   return false;
 }
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index 5cf1fd3..cd9b6ea 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -58,6 +58,8 @@ class R600PacketizerList : public VLIWPacketizerList {
 private:
   const R600InstrInfo *TII;
   const R600RegisterInfo &TRI;
+  bool VLIW5;
+  bool ConsideredInstUsesAlreadyWrittenVectorElement;
 
   unsigned getSlot(const MachineInstr *MI) const {
     return TRI.getHWRegChan(MI->getOperand(0).getReg());
@@ -74,7 +76,13 @@ private:
     MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
     if (I->isBundle())
       BI++;
+    int LastDstChan = -1;
     do {
+      bool isTrans = false;
+      int BISlot = getSlot(BI);
+      if (LastDstChan >= BISlot)
+        isTrans = true;
+      LastDstChan = BISlot;
       if (TII->isPredicated(BI))
         continue;
       int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
@@ -85,7 +93,7 @@ private:
         continue;
       }
       unsigned Dst = BI->getOperand(DstIdx).getReg();
-      if (TII->isTransOnly(BI)) {
+      if (isTrans || TII->isTransOnly(BI)) {
         Result[Dst] = AMDGPU::PS;
         continue;
       }
@@ -142,10 +150,14 @@ public:
                         MachineDominatorTree &MDT)
   : VLIWPacketizerList(MF, MLI, MDT, true),
     TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())),
-    TRI(TII->getRegisterInfo()) { }
+    TRI(TII->getRegisterInfo()) {
+    VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+  }
 
   // initPacketizerState - initialize some internal flags.
-  void initPacketizerState() { }
+  void initPacketizerState() {
+    ConsideredInstUsesAlreadyWrittenVectorElement = false;
+  }
 
   // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
   bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) {
@@ -172,8 +184,8 @@ public:
   // together.
   bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
     MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
-    if (getSlot(MII) <= getSlot(MIJ) && !TII->isTransOnly(MII))
-      return false;
+    if (getSlot(MII) == getSlot(MIJ))
+      ConsideredInstUsesAlreadyWrittenVectorElement = true;
     // Does MII and MIJ share the same pred_sel ?
     int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
         OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel);
@@ -194,6 +206,14 @@ public:
         return false;
       }
     }
+
+    bool ARDef = TII->definesAddressRegister(MII) ||
+                 TII->definesAddressRegister(MIJ);
+    bool ARUse = TII->usesAddressRegister(MII) ||
+                 TII->usesAddressRegister(MIJ);
+    if (ARDef && ARUse)
+      return false;
+
     return true;
   }
 
@@ -211,6 +231,20 @@ public:
                                  std::vector<R600InstrInfo::BankSwizzle> &BS,
                                  bool &isTransSlot) {
     isTransSlot = TII->isTransOnly(MI);
+    assert (!isTransSlot || VLIW5);
+
+    // Is the dst reg sequence legal ?
+    if (!isTransSlot && !CurrentPacketMIs.empty()) {
+      if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) {
+        if (ConsideredInstUsesAlreadyWrittenVectorElement  &&
+            !TII->isVectorOnly(MI) && VLIW5) {
+          isTransSlot = true;
+          DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump(););
+        }
+        else
+          return false;
+      }
+    }
 
     // Are the Constants limitations met ?
     CurrentPacketMIs.push_back(MI);
@@ -246,6 +280,10 @@ public:
       return false;
     }
 
+    // We cannot read LDS source registrs from the Trans slot.
+    if (isTransSlot && TII->readsLDSSrcReg(MI))
+      return false;
+
     CurrentPacketMIs.pop_back();
     return true;
   }
@@ -278,6 +316,8 @@ public:
       return It;
     }
     endPacket(MI->getParent(), MI);
+    if (TII->isTransOnly(MI))
+      return MI;
     return VLIWPacketizerList::addToPacket(MI);
   }
 };
@@ -308,7 +348,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
     MachineBasicBlock::iterator End = MBB->end();
     MachineBasicBlock::iterator MI = MBB->begin();
     while (MI != End) {
-      if (MI->isKill() ||
+      if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF ||
           (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) {
         MachineBasicBlock::iterator DeleteMI = MI;
         ++MI;
diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp
index a42043b..f3bb88b 100644
--- a/lib/Target/R600/R600RegisterInfo.cpp
+++ b/lib/Target/R600/R600RegisterInfo.cpp
@@ -28,6 +28,8 @@ R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm)
 BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
 
+  const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+
   Reserved.set(AMDGPU::ZERO);
   Reserved.set(AMDGPU::HALF);
   Reserved.set(AMDGPU::ONE);
@@ -41,26 +43,15 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(AMDGPU::PRED_SEL_OFF);
   Reserved.set(AMDGPU::PRED_SEL_ZERO);
   Reserved.set(AMDGPU::PRED_SEL_ONE);
+  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
 
   for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(),
                         E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) {
     Reserved.set(*I);
   }
 
-  for (TargetRegisterClass::iterator I = AMDGPU::TRegMemRegClass.begin(),
-                                     E = AMDGPU::TRegMemRegClass.end();
-                                     I !=  E; ++I) {
-    Reserved.set(*I);
-  }
+  TII->reserveIndirectRegisters(Reserved, MF);
 
-  const R600InstrInfo *RII =
-    static_cast<const R600InstrInfo*>(TM.getInstrInfo());
-  std::vector<unsigned> IndirectRegs = RII->getIndirectReservedRegs(MF);
-  for (std::vector<unsigned>::iterator I = IndirectRegs.begin(),
-                                       E = IndirectRegs.end();
-                                       I != E; ++I) {
-    Reserved.set(*I);
-  }
   return Reserved;
 }
 
@@ -78,6 +69,10 @@ unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
   return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
 }
 
+unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const {
+  return GET_REG_INDEX(getEncodingValue(Reg));
+}
+
 const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
                                                                    MVT VT) const {
   switch(VT.SimpleTy) {
@@ -86,17 +81,20 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
   }
 }
 
-unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
-  switch (Channel) {
-    default: assert(!"Invalid channel index"); return 0;
-    case 0: return AMDGPU::sub0;
-    case 1: return AMDGPU::sub1;
-    case 2: return AMDGPU::sub2;
-    case 3: return AMDGPU::sub3;
-  }
-}
-
 const RegClassWeight &R600RegisterInfo::getRegClassWeight(
   const TargetRegisterClass *RC) const {
   return RCW;
 }
+
+bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
+  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+
+  switch (Reg) {
+  case AMDGPU::OQAP:
+  case AMDGPU::OQBP:
+  case AMDGPU::AR_X:
+    return false;
+  default:
+    return true;
+  }
+}
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index 9b286ee..c74c49e 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -39,16 +39,16 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
   /// \brief get the HW encoding for a register's channel.
   unsigned getHWRegChan(unsigned reg) const;
 
+  virtual unsigned getHWRegIndex(unsigned Reg) const;
+
   /// \brief get the register class of the specified type to use in the
   /// CFGStructurizer
   virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
 
-  /// \returns the sub reg enum value for the given \p Channel
-  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
-  unsigned getSubRegFromChannel(unsigned Channel) const;
-
   virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
 
+  // \returns true if \p Reg can be defined in one ALU caluse and used in another.
+  virtual bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index fa987cf..68bcd20 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -39,8 +39,6 @@ foreach Index = 0-127 in {
     // Indirect addressing offset registers
     def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan,
                                               Index, Chan>;
-    def TRegMem#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index,
-                                                Chan>;
   }
   // 128-bit Temporary Registers
   def T#Index#_XYZW : R600Reg_128 <"T"#Index#"",
@@ -95,6 +93,12 @@ foreach Index = 448-480 in {
 
 // Special Registers
 
+def OQA : R600Reg<"OQA", 219>;
+def OQB : R600Reg<"OQB", 220>;
+def OQAP : R600Reg<"OQAP", 221>;
+def OQBP : R600Reg<"OQAP", 222>;
+def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>;
+def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>;
 def ZERO : R600Reg<"0.0", 248>;
 def ONE : R600Reg<"1.0", 249>;
 def NEG_ONE : R600Reg<"-1.0", 249>;
@@ -115,7 +119,6 @@ def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
 def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
 def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
 def AR_X : R600Reg<"AR.x", 0>;
-def OQAP : R600Reg<"OQAP", 221>;
 
 def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
                           (add (sequence "ArrayBase%u", 448, 480))>;
@@ -130,7 +133,8 @@ let isAllocatable = 0 in {
 // XXX: Only use the X channel, until we support wider stack widths
 def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>;
 
-} // End isAllocatable = 0
+def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
+  (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
 
 def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32,
                               (add (sequence "KC0_%u_X", 128, 159))>;
@@ -164,6 +168,8 @@ def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32,
                                    (interleave R600_KC1_X, R600_KC1_Y,
                                                R600_KC1_Z, R600_KC1_W)>;
 
+} // End isAllocatable = 0
+
 def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
                                    (add (sequence "T%u_X", 0, 127), AR_X)>;
 
@@ -184,6 +190,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
     R600_TReg32,
     R600_ArrayBase,
     R600_Addr,
+    R600_KC0, R600_KC1,
     ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
     ALU_CONST, ALU_PARAM, OQAP
     )>;
@@ -201,33 +208,3 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
 
 def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
                                 (add (sequence "T%u_XY", 0, 63))>;
-
-//===----------------------------------------------------------------------===//
-// Register classes for indirect addressing
-//===----------------------------------------------------------------------===//
-
-// Super register for all the Indirect Registers.  This register class is used
-// by the REG_SEQUENCE instruction to specify the registers to use for direct
-// reads / writes which may be written / read by an indirect address.
-class IndirectSuper<string n, list<Register> subregs> :
-    RegisterWithSubRegs<n, subregs> {
-  let Namespace = "AMDGPU";
-  let SubRegIndices =
- [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
-  sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15];
-}
-
-def IndirectSuperReg : IndirectSuper<"Indirect",
-  [TRegMem0_X, TRegMem1_X, TRegMem2_X, TRegMem3_X, TRegMem4_X, TRegMem5_X,
-   TRegMem6_X, TRegMem7_X, TRegMem8_X, TRegMem9_X, TRegMem10_X, TRegMem11_X,
-   TRegMem12_X, TRegMem13_X, TRegMem14_X, TRegMem15_X]
->;
-
-def IndirectReg : RegisterClass<"AMDGPU", [f32, i32], 32, (add IndirectSuperReg)>;
-
-// This register class defines the registers that are the storage units for
-// the "Indirect Addressing" pseudo memory space.
-// XXX: Only use the X channel, until we support wider stack widths
-def TRegMem : RegisterClass<"AMDGPU", [f32, i32], 32,
-  (add (sequence "TRegMem%u_X", 0, 16))
->;
diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
index 3768ba0..3258894 100644
--- a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
+++ b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
@@ -35,9 +35,9 @@ class R600TextureIntrinsicsReplacer :
   FunctionType *TexSign;
   FunctionType *TexQSign;
 
-  void getAdjustementFromTextureTarget(unsigned TextureType, bool hasLOD,
-                                       unsigned SrcSelect[4], unsigned CT[4],
-                                       bool &useShadowVariant) {
+  void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD,
+                                      unsigned SrcSelect[4], unsigned CT[4],
+                                      bool &useShadowVariant) {
     enum TextureTypes {
       TEXTURE_1D = 1,
       TEXTURE_2D,
@@ -60,6 +60,7 @@ class R600TextureIntrinsicsReplacer :
 
     switch (TextureType) {
     case 0:
+      useShadowVariant = false;
       return;
     case TEXTURE_RECT:
     case TEXTURE_1D:
@@ -93,9 +94,8 @@ class R600TextureIntrinsicsReplacer :
     }
 
     if (TextureType == TEXTURE_CUBE_ARRAY ||
-        TextureType == TEXTURE_SHADOWCUBE_ARRAY) {
+        TextureType == TEXTURE_SHADOWCUBE_ARRAY)
       CT[2] = 0;
-    }
 
     if (TextureType == TEXTURE_1D_ARRAY ||
         TextureType == TEXTURE_SHADOW1D_ARRAY) {
@@ -114,9 +114,8 @@ class R600TextureIntrinsicsReplacer :
         TextureType == TEXTURE_SHADOW2D ||
         TextureType == TEXTURE_SHADOWRECT ||
         TextureType == TEXTURE_SHADOW1D_ARRAY) &&
-        !(hasLOD && useShadowVariant)) {
+        !(hasLOD && useShadowVariant))
       SrcSelect[3] = 2;
-    }
   }
 
   void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name,
@@ -174,8 +173,8 @@ class R600TextureIntrinsicsReplacer :
     };
     bool useShadowVariant;
 
-    getAdjustementFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
-                                    useShadowVariant);
+    getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
+                                   useShadowVariant);
 
     ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect,
                     Offset, ResourceId, SamplerId, CT, Coord);
@@ -198,8 +197,8 @@ class R600TextureIntrinsicsReplacer :
     };
     bool useShadowVariant;
 
-    getAdjustementFromTextureTarget(TextureType, false, SrcSelect, CT,
-                                    useShadowVariant);
+    getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT,
+                                   useShadowVariant);
 
     ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect,
                     Offset, ResourceId, SamplerId, CT, Coord);
@@ -259,6 +258,9 @@ public:
   }
 
   void visitCallInst(CallInst &I) {
+    if (!I.getCalledFunction())
+      return;
+
     StringRef Name = I.getCalledFunction()->getName();
     if (Name == "llvm.AMDGPU.tex") {
       ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc");
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 147578c..2cbce28 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -11,6 +11,18 @@
 #ifndef SIDEFINES_H_
 #define SIDEFINES_H_
 
+namespace SIInstrFlags {
+enum {
+  MIMG = 1 << 3,
+  SMRD = 1 << 4,
+  VOP1 = 1 << 5,
+  VOP2 = 1 << 6,
+  VOP3 = 1 << 7,
+  VOPC = 1 << 8,
+  SALU = 1 << 9
+};
+}
+
 #define R_00B028_SPI_SHADER_PGM_RSRC1_PS                                0x00B028
 #define R_00B02C_SPI_SHADER_PGM_RSRC2_PS                                0x00B02C
 #define   S_00B02C_EXTRA_LDS_SIZE(x)                                  (((x) & 0xFF) << 8)
diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
index 435172a..3370c79 100644
--- a/lib/Target/R600/SIFixSGPRCopies.cpp
+++ b/lib/Target/R600/SIFixSGPRCopies.cpp
@@ -23,9 +23,9 @@
 ///    %vreg3 <vsrc> = COPY %vreg2 <vgpr>
 ///  BB2:
 ///    %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vrsc>, <BB#1>
-///    %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc> 
+///    %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
+///
 ///
-/// 
 /// The coalescer will begin at BB0 and eliminate its copy, then the resulting
 /// code will look like this:
 ///
@@ -43,7 +43,7 @@
 /// Now that the result of the PHI instruction is an SGPR, the register
 /// allocator is now forced to constrain the register class of %vreg3 to
 /// <sgpr> so we end up with final code like this:
-/// 
+///
 /// BB0:
 ///   %vreg0 <sgpr> = SCALAR_INST
 ///    ...
@@ -55,7 +55,7 @@
 ///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
 ///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
 ///
-/// Now this code contains an illegal copy from a VGPR to an SGPR. 
+/// Now this code contains an illegal copy from a VGPR to an SGPR.
 ///
 /// In order to avoid this problem, this pass searches for PHI instructions
 /// which define a <vsrc> register and constrains its definition class to
@@ -65,10 +65,14 @@
 /// ultimately led to the creation of an illegal COPY.
 //===----------------------------------------------------------------------===//
 
+#define DEBUG_TYPE "sgpr-copies"
 #include "AMDGPU.h"
 #include "SIInstrInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
@@ -79,9 +83,16 @@ class SIFixSGPRCopies : public MachineFunctionPass {
 
 private:
   static char ID;
-  const TargetRegisterClass *inferRegClass(const TargetRegisterInfo *TRI,
+  const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI,
                                            const MachineRegisterInfo &MRI,
-                                           unsigned Reg) const;
+                                           unsigned Reg,
+                                           unsigned SubReg) const;
+  const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI,
+                                                 const MachineRegisterInfo &MRI,
+                                                 unsigned Reg,
+                                                 unsigned SubReg) const;
+  bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI,
+                        const MachineRegisterInfo &MRI) const;
 
 public:
   SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
@@ -102,25 +113,41 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) {
   return new SIFixSGPRCopies(tm);
 }
 
-/// This functions walks the use/def chains starting with the definition of
-/// \p Reg until it finds an Instruction that isn't a COPY returns
-/// the register class of that instruction.
-const TargetRegisterClass *SIFixSGPRCopies::inferRegClass(
-                                                 const TargetRegisterInfo *TRI,
+static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    if (!MI.getOperand(i).isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+      continue;
+
+    if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
+      return true;
+  }
+  return false;
+}
+
+/// This functions walks the use list of Reg until it finds an Instruction
+/// that isn't a COPY returns the register class of that instruction.
+/// \return The register defined by the first non-COPY instruction.
+const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses(
+                                                 const SIRegisterInfo *TRI,
                                                  const MachineRegisterInfo &MRI,
-                                                 unsigned Reg) const {
+                                                 unsigned Reg,
+                                                 unsigned SubReg) const {
   // The Reg parameter to the function must always be defined by either a PHI
   // or a COPY, therefore it cannot be a physical register.
   assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
          "Reg cannot be a physical register");
 
   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  RC = TRI->getSubRegClass(RC, SubReg);
   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
                                          E = MRI.use_end(); I != E; ++I) {
     switch (I->getOpcode()) {
     case AMDGPU::COPY:
-      RC = TRI->getCommonSubClass(RC, inferRegClass(TRI, MRI,
-                                                    I->getOperand(0).getReg()));
+      RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI,
+                                  I->getOperand(0).getReg(),
+                                  I->getOperand(0).getSubReg()));
       break;
     }
   }
@@ -128,9 +155,48 @@ const TargetRegisterClass *SIFixSGPRCopies::inferRegClass(
   return RC;
 }
 
+const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef(
+                                                 const SIRegisterInfo *TRI,
+                                                 const MachineRegisterInfo &MRI,
+                                                 unsigned Reg,
+                                                 unsigned SubReg) const {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+    const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg);
+    return TRI->getSubRegClass(RC, SubReg);
+  }
+  MachineInstr *Def = MRI.getVRegDef(Reg);
+  if (Def->getOpcode() != AMDGPU::COPY) {
+    return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg);
+  }
+
+  return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(),
+                                   Def->getOperand(1).getSubReg());
+}
+
+bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
+                                      const SIRegisterInfo *TRI,
+                                      const MachineRegisterInfo &MRI) const {
+
+  unsigned DstReg = Copy.getOperand(0).getReg();
+  unsigned SrcReg = Copy.getOperand(1).getReg();
+  unsigned SrcSubReg = Copy.getOperand(1).getSubReg();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+  const TargetRegisterClass *SrcRC;
+
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+      DstRC == &AMDGPU::M0RegRegClass)
+    return false;
+
+  SrcRC = inferRegClassFromDef(TRI, MRI, SrcReg, SrcSubReg);
+  return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC);
+}
+
 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      MF.getTarget().getInstrInfo());
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                   BI != BE; ++BI) {
 
@@ -138,13 +204,58 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
                                                       I != E; ++I) {
       MachineInstr &MI = *I;
-      if (MI.getOpcode() != AMDGPU::PHI) {
-        continue;
+      if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) {
+        DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n");
+        DEBUG(MI.print(dbgs()));
+        TII->moveToVALU(MI);
+
+      }
+
+      switch (MI.getOpcode()) {
+      default: continue;
+      case AMDGPU::PHI: {
+        DEBUG(dbgs() << " Fixing PHI:\n");
+        DEBUG(MI.print(dbgs()));
+
+        for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
+          unsigned Reg = MI.getOperand(i).getReg();
+          const TargetRegisterClass *RC = inferRegClassFromDef(TRI, MRI, Reg,
+                                                  MI.getOperand(0).getSubReg());
+          MRI.constrainRegClass(Reg, RC);
+        }
+        unsigned Reg = MI.getOperand(0).getReg();
+        const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg,
+                                                  MI.getOperand(0).getSubReg());
+        if (TRI->getCommonSubClass(RC, &AMDGPU::VReg_32RegClass)) {
+          MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass);
+        }
+
+        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
+          break;
+
+        // If a PHI node defines an SGPR and any of its operands are VGPRs,
+        // then we need to move it to the VALU.
+        for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
+          unsigned Reg = MI.getOperand(i).getReg();
+          if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
+            TII->moveToVALU(MI);
+            break;
+          }
+        }
+
+        break;
+      }
+      case AMDGPU::REG_SEQUENCE: {
+        if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
+            !hasVGPROperands(MI, TRI))
+          continue;
+
+        DEBUG(dbgs() << "Fixing REG_SEQUENCE:\n");
+        DEBUG(MI.print(dbgs()));
+
+        TII->moveToVALU(MI);
+        break;
       }
-      unsigned Reg = MI.getOperand(0).getReg();
-      const TargetRegisterClass *RC = inferRegClass(TRI, MRI, Reg);
-      if (RC == &AMDGPU::VSrc_32RegClass) {
-        MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass);
       }
     }
   }
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index c64027f..d5d2b68 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -34,18 +34,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
   addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);
 
-  addRegisterClass(MVT::v2i1, &AMDGPU::VReg_64RegClass);
-  addRegisterClass(MVT::v4i1, &AMDGPU::VReg_128RegClass);
-
-  addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
 
   addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
   addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);
 
-  addRegisterClass(MVT::v1i32, &AMDGPU::VSrc_32RegClass);
-
   addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
   addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
   addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);
@@ -62,6 +56,21 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 
   computeRegisterProperties();
 
+  // Condition Codes
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+
+  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
@@ -69,6 +78,32 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 
   setOperationAction(ISD::ADD, MVT::i64, Legal);
   setOperationAction(ISD::ADD, MVT::i32, Legal);
+  setOperationAction(ISD::ADDC, MVT::i32, Legal);
+  setOperationAction(ISD::ADDE, MVT::i32, Legal);
+
+  setOperationAction(ISD::BITCAST, MVT::i128, Legal);
+
+  // We need to custom lower vector stores from local memory
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+
+  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
+
+  // We need to custom lower loads/stores from private memory
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::i64, Custom);
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::i64, Custom);
+  setOperationAction(ISD::STORE, MVT::i128, Custom);
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+
 
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -78,14 +113,31 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
 
+  setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
   setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
   setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
 
-  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
+  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::FrameIndex, MVT::i64, Custom);
 
   setTargetDAGCombine(ISD::SELECT_CC);
 
@@ -102,24 +154,28 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT  VT,
                                                      bool *IsFast) const {
   // XXX: This depends on the address space and also we may want to revist
   // the alignment values we specify in the DataLayout.
+  if (!VT.isSimple() || VT == MVT::Other)
+    return false;
   return VT.bitsGT(MVT::i32);
 }
 
+bool SITargetLowering::shouldSplitVectorElementType(EVT VT) const {
+  return VT.bitsLE(MVT::i16);
+}
 
-SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT,
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          SDLoc DL, SDValue Chain,
                                          unsigned Offset) const {
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
   PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                             AMDGPUAS::CONSTANT_ADDRESS);
-  EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
   SDValue BasePtr =  DAG.getCopyFromReg(Chain, DL,
                            MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
   SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                              DAG.getConstant(Offset, MVT::i64));
-  return DAG.getLoad(VT, DL, Chain, Ptr,
-                            MachinePointerInfo(UndefValue::get(PtrTy)),
-                            false, false, false, ArgVT.getSizeInBits() >> 3);
+  return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr,
+                            MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
+                            false, false, MemVT.getSizeInBits() >> 3);
 
 }
 
@@ -146,7 +202,8 @@ SDValue SITargetLowering::LowerFormalArguments(
     const ISD::InputArg &Arg = Ins[i];
 
     // First check if it's a PS input addr
-    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {
+    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
+        !Arg.Flags.isByVal()) {
 
       assert((PSInputNum <= 15) && "Too many PS inputs!");
 
@@ -177,7 +234,7 @@ SDValue SITargetLowering::LowerFormalArguments(
         NewArg.PartOffset += NewArg.VT.getStoreSize();
       }
 
-    } else {
+    } else if (Info->ShaderType != ShaderType::COMPUTE) {
       Splits.push_back(Arg);
     }
   }
@@ -200,6 +257,11 @@ SDValue SITargetLowering::LowerFormalArguments(
     MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
   }
 
+  if (Info->ShaderType == ShaderType::COMPUTE) {
+    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+                            Splits);
+  }
+
   AnalyzeFormalArguments(CCInfo, Splits);
 
   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
@@ -214,9 +276,11 @@ SDValue SITargetLowering::LowerFormalArguments(
     EVT VT = VA.getLocVT();
 
     if (VA.isMemLoc()) {
+      VT = Ins[i].VT;
+      EVT MemVT = Splits[i].VT;
       // The first 36 bytes of the input buffer contains information about
       // thread group and global sizes.
-      SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(),
+      SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, DAG.getRoot(),
                                    36 + VA.getLocMemOffset());
       InVals.push_back(Arg);
       continue;
@@ -320,6 +384,19 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MI->eraseFromParent();
     break;
   }
+  case AMDGPU::SI_RegisterStorePseudo: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
+                Reg);
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
+      MIB.addOperand(MI->getOperand(i));
+
+    MI->eraseFromParent();
+  }
   }
   return BB;
 }
@@ -335,6 +412,24 @@ MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
   return MVT::i32;
 }
 
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+    return false; /* There is V_MAD_F32 for f32 */
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//
@@ -344,9 +439,27 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::ADD: return LowerADD(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::LOAD: {
+    LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+    if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+         Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+        Op.getValueType().isVector()) {
+      SDValue MergedValues[2] = {
+        SplitVectorLoad(Op, DAG),
+        Load->getChain()
+      };
+      return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
+    } else {
+      return LowerLOAD(Op, DAG);
+    }
+  }
+
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::ANY_EXTEND: // Fall-through
   case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: {
@@ -359,23 +472,23 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     switch (IntrinsicID) {
     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
     case Intrinsic::r600_read_ngroups_x:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0);
     case Intrinsic::r600_read_ngroups_y:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4);
     case Intrinsic::r600_read_ngroups_z:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8);
     case Intrinsic::r600_read_global_size_x:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12);
     case Intrinsic::r600_read_global_size_y:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16);
     case Intrinsic::r600_read_global_size_z:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20);
     case Intrinsic::r600_read_local_size_x:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24);
     case Intrinsic::r600_read_local_size_y:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28);
     case Intrinsic::r600_read_local_size_z:
-      return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32);
     case Intrinsic::r600_read_tgid_x:
       return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                      AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
@@ -394,13 +507,102 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     case Intrinsic::r600_read_tidig_z:
       return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                   AMDGPU::VGPR2, VT);
-
+    case AMDGPUIntrinsic::SI_load_const: {
+      SDValue Ops [] = {
+        ResourceDescriptorToi128(Op.getOperand(1), DAG),
+        Op.getOperand(2)
+      };
+
+      MachineMemOperand *MMO = MF.getMachineMemOperand(
+          MachinePointerInfo(),
+          MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
+          VT.getSizeInBits() / 8, 4);
+      return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+                                     Op->getVTList(), Ops, 2, VT, MMO);
+    }
+    case AMDGPUIntrinsic::SI_sample:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
+    case AMDGPUIntrinsic::SI_sampleb:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
+    case AMDGPUIntrinsic::SI_sampled:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
+    case AMDGPUIntrinsic::SI_samplel:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
+    case AMDGPUIntrinsic::SI_vs_load_input:
+      return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
+                         ResourceDescriptorToi128(Op.getOperand(1), DAG),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
     }
   }
+
+  case ISD::INTRINSIC_VOID:
+    SDValue Chain = Op.getOperand(0);
+    unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+    switch (IntrinsicID) {
+      case AMDGPUIntrinsic::SI_tbuffer_store: {
+        SDLoc DL(Op);
+        SDValue Ops [] = {
+          Chain,
+          ResourceDescriptorToi128(Op.getOperand(2), DAG),
+          Op.getOperand(3),
+          Op.getOperand(4),
+          Op.getOperand(5),
+          Op.getOperand(6),
+          Op.getOperand(7),
+          Op.getOperand(8),
+          Op.getOperand(9),
+          Op.getOperand(10),
+          Op.getOperand(11),
+          Op.getOperand(12),
+          Op.getOperand(13),
+          Op.getOperand(14)
+        };
+        EVT VT = Op.getOperand(3).getValueType();
+
+        MachineMemOperand *MMO = MF.getMachineMemOperand(
+            MachinePointerInfo(),
+            MachineMemOperand::MOStore,
+            VT.getSizeInBits() / 8, 4);
+        return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+                                       Op->getVTList(), Ops,
+                                       sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
+      }
+      default:
+        break;
+    }
   }
   return SDValue();
 }
 
+SDValue SITargetLowering::LowerADD(SDValue Op,
+                                   SelectionDAG &DAG) const {
+  if (Op.getValueType() != MVT::i64)
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue One = DAG.getConstant(1, MVT::i32);
+
+  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero);
+  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One);
+
+  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero);
+  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, One);
+
+  SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue);
+
+  SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1);
+  SDValue Carry = AddLo.getValue(1);
+  SDValue AddHi = DAG.getNode(ISD::ADDE, DL, VTList, Hi0, Hi1, Carry);
+
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddLo, AddHi.getValue(0));
+}
+
 /// \brief Helper function for LowerBRCOND
 static SDNode *findUser(SDValue Value, unsigned Opcode) {
 
@@ -495,6 +697,53 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   return Chain;
 }
 
+SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
+
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    return SDValue();
+
+  SDValue TruncPtr = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+                                 Load->getBasePtr(), DAG.getConstant(0, MVT::i32));
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+                            DAG.getConstant(2, MVT::i32));
+
+  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                            Load->getChain(), Ptr,
+                            DAG.getTargetConstant(0, MVT::i32),
+                            Op.getOperand(2));
+  SDValue MergedValues[2] = {
+    Ret,
+    Load->getChain()
+  };
+  return DAG.getMergeValues(MergedValues, 2, DL);
+
+}
+
+SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
+                                             SelectionDAG &DAG) const {
+
+  if (Op.getValueType() == MVT::i128) {
+    return Op;
+  }
+
+  assert(Op.getOpcode() == ISD::UNDEF);
+
+  return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128,
+                     DAG.getConstant(0, MVT::i64),
+                     DAG.getConstant(0, MVT::i64));
+}
+
+SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
+                                               const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
+                     Op.getOperand(2),
+                     ResourceDescriptorToi128(Op.getOperand(3), DAG),
+                     Op.getOperand(4));
+}
+
 SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
@@ -529,6 +778,56 @@ SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
   return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
 }
 
+SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT VT = Store->getMemoryVT();
+
+  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Ret.getNode())
+    return Ret;
+
+  if (VT.isVector() && VT.getVectorNumElements() >= 8)
+      return SplitVectorStore(Op, DAG);
+
+  if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    return SDValue();
+
+  SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+                            DAG.getConstant(2, MVT::i32));
+  SDValue Chain = Store->getChain();
+  SmallVector<SDValue, 8> Values;
+
+  if (VT == MVT::i64) {
+    for (unsigned i = 0; i < 2; ++i) {
+      Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+                       Store->getValue(), DAG.getConstant(i, MVT::i32)));
+    }
+  } else if (VT == MVT::i128) {
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned j = 0; j < 2; ++j) {
+        Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+                           DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
+                           Store->getValue(), DAG.getConstant(i, MVT::i32)),
+                         DAG.getConstant(j, MVT::i32)));
+      }
+    }
+  } else {
+    Values.push_back(Store->getValue());
+  }
+
+  for (unsigned i = 0; i < Values.size(); ++i) {
+    SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
+                                  Ptr, DAG.getConstant(i, MVT::i32));
+    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                        Chain, Values[i], PartPtr,
+                        DAG.getTargetConstant(0, MVT::i32));
+  }
+  return Chain;
+}
+
+
 SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
                                            SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
@@ -555,7 +854,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   switch (N->getOpcode()) {
     default: break;
     case ISD::SELECT_CC: {
-      N->dump();
       ConstantSDNode *True, *False;
       // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
       if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
@@ -759,8 +1057,8 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
     return;
   }
 
-  // This is a conservative aproach, it is possible that we can't determine
-  // the correct register class and copy too often, but better save than sorry.
+  // This is a conservative aproach. It is possible that we can't determine the
+  // correct register class and copy too often, but better safe than sorry.
   SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
   SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
                                     Operand.getValueType(), Operand, RC);
@@ -937,7 +1235,9 @@ static unsigned SubIdx2Lane(unsigned Idx) {
 void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                        SelectionDAG &DAG) const {
   SDNode *Users[4] = { };
-  unsigned Writemask = 0, Lane = 0;
+  unsigned Lane = 0;
+  unsigned OldDmask = Node->getConstantOperandVal(0);
+  unsigned NewDmask = 0;
 
   // Try to figure out the used register components
   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
@@ -948,29 +1248,42 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
       return;
 
+    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+    // Note that subregs are packed, i.e. Lane==0 is the first bit set
+    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
+    // set, etc.
     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
 
+    // Set which texture component corresponds to the lane.
+    unsigned Comp;
+    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+      assert(Dmask);
+      Comp = countTrailingZeros(Dmask);
+      Dmask &= ~(1 << Comp);
+    }
+
     // Abort if we have more than one user per component
     if (Users[Lane])
       return;
 
     Users[Lane] = *I;
-    Writemask |= 1 << Lane;
+    NewDmask |= 1 << Comp;
   }
 
-  // Abort if all components are used
-  if (Writemask == 0xf)
+  // Abort if there's no change
+  if (NewDmask == OldDmask)
     return;
 
   // Adjust the writemask in the node
   std::vector<SDValue> Ops;
-  Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
+  Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
   for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
     Ops.push_back(Node->getOperand(i));
   Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
 
   // If we only got one lane, replace it with a copy
-  if (Writemask == (1U << Lane)) {
+  // (if NewDmask has only one bit set...)
+  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
     SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                       SDLoc(), Users[Lane]->getValueType(0),
@@ -1001,9 +1314,11 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 /// \brief Fold the instructions after slecting them
 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                           SelectionDAG &DAG) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
   Node = AdjustRegClass(Node, DAG);
 
-  if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
+  if (TII->isMIMG(Node->getMachineOpcode()))
     adjustWritemask(Node, DAG);
 
   return foldOperands(Node, DAG);
@@ -1013,7 +1328,9 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
 /// bits set in the writemask
 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                      SDNode *Node) const {
-  if (AMDGPU::isMIMG(MI->getOpcode()) == -1)
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  if (!TII->isMIMG(MI->getOpcode()))
     return;
 
   unsigned VReg = MI->getOperand(0).getReg();
@@ -1030,6 +1347,8 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
   case 3:  RC = &AMDGPU::VReg_96RegClass; break;
   }
 
+  unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
+  MI->setDesc(TII->get(NewOpcode));
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   MRI.setRegClass(VReg, RC);
 }
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index b4202c4..9933ece 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -21,13 +21,19 @@
 namespace llvm {
 
 class SITargetLowering : public AMDGPUTargetLowering {
-  SDValue LowerParameter(SelectionDAG &DAG, EVT VT, SDLoc DL,
+  SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL,
                          SDValue Chain, unsigned Offset) const;
+  SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
+                               SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const;
   bool foldImm(SDValue &Operand, int32_t &Immediate,
                bool &ScalarSlotUsed) const;
   const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG,
@@ -44,6 +50,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
 public:
   SITargetLowering(TargetMachine &tm);
   bool allowsUnalignedMemoryAccesses(EVT  VT, bool *IsFast) const;
+  virtual bool shouldSplitVectorElementType(EVT VT) const;
 
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
@@ -55,6 +62,7 @@ public:
                                               MachineBasicBlock * BB) const;
   virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
   virtual MVT getScalarShiftAmountTy(EVT VT) const;
+  virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
   virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
   virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index ba202e3..7ef662e 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -134,14 +134,19 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
   // LGKM may uses larger values
   if (TSFlags & SIInstrFlags::LGKM_CNT) {
 
-    MachineOperand &Op = MI.getOperand(0);
-    if (!Op.isReg())
-      Op = MI.getOperand(1);
-    assert(Op.isReg() && "First LGKM operand must be a register!");
+    if (TII->isSMRD(MI.getOpcode())) {
 
-    unsigned Reg = Op.getReg();
-    unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
-    Result.Named.LGKM = Size > 4 ? 2 : 1;
+      MachineOperand &Op = MI.getOperand(0);
+      assert(Op.isReg() && "First LGKM operand must be a register!");
+
+      unsigned Reg = Op.getReg();
+      unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
+      Result.Named.LGKM = Size > 4 ? 2 : 1;
+
+    } else {
+      // DS
+      Result.Named.LGKM = 1;
+    }
 
   } else {
     Result.Named.LGKM = 0;
@@ -181,7 +186,7 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
 
 RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
 
-  if (!Op.isReg())
+  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
     return std::make_pair(0, 0);
 
   unsigned Reg = Op.getReg();
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 434aa7e..53ebaaf 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -17,10 +17,24 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
   field bits<1> VM_CNT = 0;
   field bits<1> EXP_CNT = 0;
   field bits<1> LGKM_CNT = 0;
+  field bits<1> MIMG = 0;
+  field bits<1> SMRD = 0;
+  field bits<1> VOP1 = 0;
+  field bits<1> VOP2 = 0;
+  field bits<1> VOP3 = 0;
+  field bits<1> VOPC = 0;
+  field bits<1> SALU = 0;
 
   let TSFlags{0} = VM_CNT;
   let TSFlags{1} = EXP_CNT;
   let TSFlags{2} = LGKM_CNT;
+  let TSFlags{3} = MIMG;
+  let TSFlags{4} = SMRD;
+  let TSFlags{5} = VOP1;
+  let TSFlags{6} = VOP2;
+  let TSFlags{7} = VOP3;
+  let TSFlags{8} = VOPC;
+  let TSFlags{9} = SALU;
 }
 
 class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -55,6 +69,7 @@ class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let SALU = 1;
 }
 
 class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -73,6 +88,7 @@ class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let SALU = 1;
 }
 
 class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -90,6 +106,7 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let SALU = 1;
 }
 
 class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -106,6 +123,7 @@ class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let SALU = 1;
 }
 
 class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
@@ -123,6 +141,7 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let SALU = 1;
 }
 
 class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
@@ -140,6 +159,7 @@ class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
   let Inst{31-27} = 0x18; //encoding
 
   let LGKM_CNT = 1;
+  let SMRD = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -162,6 +182,8 @@ class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP1 = 1;
 }
 
 class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -180,6 +202,8 @@ class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP2 = 1;
 }
 
 class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -208,6 +232,8 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP3 = 1;
 }
 
 class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -234,6 +260,8 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP3 = 1;
 }
 
 class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
@@ -251,6 +279,7 @@ class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let VOPC = 1;
 }
 
 class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -414,6 +443,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
 
   let VM_CNT = 1;
   let EXP_CNT = 1;
+  let MIMG = 1;
 }
 
 def EXP : Enc64<
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 551ae86..ab55c1b 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -15,10 +15,10 @@
 
 #include "SIInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "SIDefines.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
-#include <stdio.h>
 
 using namespace llvm;
 
@@ -31,6 +31,10 @@ const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
   return RI;
 }
 
+//===----------------------------------------------------------------------===//
+// TargetInstrInfo callbacks
+//===----------------------------------------------------------------------===//
+
 void
 SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MI, DebugLoc DL,
@@ -118,14 +122,14 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_32RegClass.contains(SrcReg));
+           AMDGPU::SReg_32RegClass.contains(SrcReg));
     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
             .addReg(SrcReg, getKillRegState(KillSrc));
     return;
 
   } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_64RegClass.contains(SrcReg));
+           AMDGPU::SReg_64RegClass.contains(SrcReg));
     Opcode = AMDGPU::V_MOV_B32_e32;
     SubIndices = Sub0_1;
 
@@ -136,19 +140,19 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_128RegClass.contains(SrcReg));
+           AMDGPU::SReg_128RegClass.contains(SrcReg));
     Opcode = AMDGPU::V_MOV_B32_e32;
     SubIndices = Sub0_3;
 
   } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_256RegClass.contains(SrcReg));
+           AMDGPU::SReg_256RegClass.contains(SrcReg));
     Opcode = AMDGPU::V_MOV_B32_e32;
     SubIndices = Sub0_7;
 
   } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_512RegClass.contains(SrcReg));
+           AMDGPU::SReg_512RegClass.contains(SrcReg));
     Opcode = AMDGPU::V_MOV_B32_e32;
     SubIndices = Sub0_15;
 
@@ -168,7 +172,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 }
 
 unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
-
   int NewOpc;
 
   // Try to map original to commuted opcode
@@ -185,11 +188,36 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
 MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                                               bool NewMI) const {
 
-  if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg() ||
-      !MI->getOperand(2).isReg())
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg())
     return 0;
 
-  MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+  // Cannot commute VOP2 if src0 is SGPR.
+  if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() &&
+      RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg())))
+   return 0;
+
+  if (!MI->getOperand(2).isReg()) {
+    // XXX: Commute instructions with FPImm operands
+    if (NewMI || MI->getOperand(2).isFPImm() ||
+       (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
+      return 0;
+    }
+
+    // XXX: Commute VOP3 instructions with abs and neg set.
+    if (isVOP3(MI->getOpcode()) &&
+        (MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                        AMDGPU::OpName::abs)).getImm() ||
+         MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                        AMDGPU::OpName::neg)).getImm()))
+      return 0;
+
+    unsigned Reg = MI->getOperand(1).getReg();
+    MI->getOperand(1).ChangeToImmediate(MI->getOperand(2).getImm());
+    MI->getOperand(2).ChangeToRegister(Reg, false);
+  } else {
+    MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
 
   if (MI)
     MI->setDesc(get(commuteOpcode(MI->getOpcode())));
@@ -197,15 +225,12 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
   return MI;
 }
 
-MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg,
-                                           int64_t Imm) const {
-  MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_B32_e32), DebugLoc());
-  MachineInstrBuilder MIB(*MF, MI);
-  MIB.addReg(DstReg, RegState::Define);
-  MIB.addImm(Imm);
-
-  return MI;
-
+MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned DstReg,
+                                         unsigned SrcReg) const {
+  return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
+                 DstReg) .addReg(SrcReg);
 }
 
 bool SIInstrInfo::isMov(unsigned Opcode) const {
@@ -224,32 +249,397 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
   return RC != &AMDGPU::EXECRegRegClass;
 }
 
-//===----------------------------------------------------------------------===//
-// Indirect addressing callbacks
-//===----------------------------------------------------------------------===//
+int SIInstrInfo::isMIMG(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+}
 
-unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
-                                                 unsigned Channel) const {
-  assert(Channel == 0);
-  return RegIndex;
+int SIInstrInfo::isSMRD(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::SMRD;
+}
+
+bool SIInstrInfo::isVOP1(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOP1;
+}
+
+bool SIInstrInfo::isVOP2(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOP2;
+}
+
+bool SIInstrInfo::isVOP3(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOP3;
+}
+
+bool SIInstrInfo::isVOPC(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOPC;
+}
+
+bool SIInstrInfo::isSALUInstr(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & SIInstrFlags::SALU;
+}
+
+bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const {
+  if(MO.isImm()) {
+    return MO.getImm() >= -16 && MO.getImm() <= 64;
+  }
+  if (MO.isFPImm()) {
+    return MO.getFPImm()->isExactlyValue(0.0)  ||
+           MO.getFPImm()->isExactlyValue(0.5)  ||
+           MO.getFPImm()->isExactlyValue(-0.5) ||
+           MO.getFPImm()->isExactlyValue(1.0)  ||
+           MO.getFPImm()->isExactlyValue(-1.0) ||
+           MO.getFPImm()->isExactlyValue(2.0)  ||
+           MO.getFPImm()->isExactlyValue(-2.0) ||
+           MO.getFPImm()->isExactlyValue(4.0)  ||
+           MO.getFPImm()->isExactlyValue(-4.0);
+  }
+  return false;
+}
+
+bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const {
+  return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
+}
+
+bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
+                                    StringRef &ErrInfo) const {
+  uint16_t Opcode = MI->getOpcode();
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+  // Verify VOP*
+  if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
+    unsigned ConstantBusCount = 0;
+    unsigned SGPRUsed = AMDGPU::NoRegister;
+    for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isUse() &&
+          !TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+
+        // EXEC register uses the constant bus.
+        if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
+          ++ConstantBusCount;
+
+        // SGPRs use the constant bus
+        if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
+            (!MO.isImplicit() &&
+            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
+            AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
+          if (SGPRUsed != MO.getReg()) {
+            ++ConstantBusCount;
+            SGPRUsed = MO.getReg();
+          }
+        }
+      }
+      // Literal constants use the constant bus.
+      if (isLiteralConstant(MO))
+        ++ConstantBusCount;
+    }
+    if (ConstantBusCount > 1) {
+      ErrInfo = "VOP* instruction uses the constant bus more than once";
+      return false;
+    }
+  }
+
+  // Verify SRC1 for VOP2 and VOPC
+  if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) {
+    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
+    if (Src1.isImm() || Src1.isFPImm()) {
+      ErrInfo = "VOP[2C] src1 cannot be an immediate.";
+      return false;
+    }
+  }
+
+  // Verify VOP3
+  if (isVOP3(Opcode)) {
+    if (Src0Idx != -1 && isLiteralConstant(MI->getOperand(Src0Idx))) {
+      ErrInfo = "VOP3 src0 cannot be a literal constant.";
+      return false;
+    }
+    if (Src1Idx != -1 && isLiteralConstant(MI->getOperand(Src1Idx))) {
+      ErrInfo = "VOP3 src1 cannot be a literal constant.";
+      return false;
+    }
+    if (Src2Idx != -1 && isLiteralConstant(MI->getOperand(Src2Idx))) {
+      ErrInfo = "VOP3 src2 cannot be a literal constant.";
+      return false;
+    }
+  }
+  return true;
+}
+
+unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default: return AMDGPU::INSTRUCTION_LIST_END;
+  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
+  case AMDGPU::COPY: return AMDGPU::COPY;
+  case AMDGPU::PHI: return AMDGPU::PHI;
+  case AMDGPU::S_ADD_I32: return AMDGPU::V_ADD_I32_e32;
+  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
+  case AMDGPU::S_SUB_I32: return AMDGPU::V_SUB_I32_e32;
+  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
+  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
+  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
+  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
+  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
+  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
+  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
+  }
+}
+
+bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
+  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
+}
+
+const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
+                                                      unsigned OpNo) const {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MCInstrDesc &Desc = get(MI.getOpcode());
+  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
+      Desc.OpInfo[OpNo].RegClass == -1)
+    return MRI.getRegClass(MI.getOperand(OpNo).getReg());
+
+  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+  return RI.getRegClass(RCID);
+}
+
+bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::COPY:
+  case AMDGPU::REG_SEQUENCE:
+    return RI.hasVGPRs(getOpRegClass(MI, 0));
+  default:
+    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
+  }
 }
 
+void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
+  MachineBasicBlock::iterator I = MI;
+  MachineOperand &MO = MI->getOperand(OpIdx);
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
+  const TargetRegisterClass *RC = RI.getRegClass(RCID);
+  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+  if (MO.isReg()) {
+    Opcode = AMDGPU::COPY;
+  } else if (RI.isSGPRClass(RC)) {
+    Opcode = AMDGPU::S_MOV_B32;
+  }
+
+  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
+  unsigned Reg = MRI.createVirtualRegister(VRC);
+  BuildMI(*MI->getParent(), I, MI->getParent()->findDebugLoc(I), get(Opcode),
+          Reg).addOperand(MO);
+  MO.ChangeToRegister(Reg, false);
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src1);
+  int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src2);
+
+  // Legalize VOP2
+  if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
+    MachineOperand &Src0 = MI->getOperand(Src0Idx);
+    MachineOperand &Src1 = MI->getOperand(Src1Idx);
+
+    // If the instruction implicitly reads VCC, we can't have any SGPR operands,
+    // so move any.
+    bool ReadsVCC = MI->readsRegister(AMDGPU::VCC, &RI);
+    if (ReadsVCC && Src0.isReg() &&
+        RI.isSGPRClass(MRI.getRegClass(Src0.getReg()))) {
+      legalizeOpWithMove(MI, Src0Idx);
+      return;
+    }
+
+    if (ReadsVCC && Src1.isReg() &&
+        RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
+      legalizeOpWithMove(MI, Src1Idx);
+      return;
+    }
+
+    // Legalize VOP2 instructions where src1 is not a VGPR. An SGPR input must
+    // be the first operand, and there can only be one.
+    if (Src1.isImm() || Src1.isFPImm() ||
+        (Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())))) {
+      if (MI->isCommutable()) {
+        if (commuteInstruction(MI))
+          return;
+      }
+      legalizeOpWithMove(MI, Src1Idx);
+    }
+  }
+
+  // XXX - Do any VOP3 instructions read VCC?
+  // Legalize VOP3
+  if (isVOP3(MI->getOpcode())) {
+    int VOP3Idx[3] = {Src0Idx, Src1Idx, Src2Idx};
+    unsigned SGPRReg = AMDGPU::NoRegister;
+    for (unsigned i = 0; i < 3; ++i) {
+      int Idx = VOP3Idx[i];
+      if (Idx == -1)
+        continue;
+      MachineOperand &MO = MI->getOperand(Idx);
+
+      if (MO.isReg()) {
+        if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+          continue; // VGPRs are legal
+
+        assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");
+
+        if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
+          SGPRReg = MO.getReg();
+          // We can use one SGPR in each VOP3 instruction.
+          continue;
+        }
+      } else if (!isLiteralConstant(MO)) {
+        // If it is not a register and not a literal constant, then it must be
+        // an inline constant which is always legal.
+        continue;
+      }
+      // If we make it this far, then the operand is not legal and we must
+      // legalize it.
+      legalizeOpWithMove(MI, Idx);
+    }
+  }
+
+  // Legalize REG_SEQUENCE
+  // The register class of the operands much be the same type as the register
+  // class of the output.
+  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+    const TargetRegisterClass *RC = NULL, *SRC = NULL, *VRC = NULL;
+    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
+      if (!MI->getOperand(i).isReg() ||
+          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+        continue;
+      const TargetRegisterClass *OpRC =
+              MRI.getRegClass(MI->getOperand(i).getReg());
+      if (RI.hasVGPRs(OpRC)) {
+        VRC = OpRC;
+      } else {
+        SRC = OpRC;
+      }
+    }
+
+    // If any of the operands are VGPR registers, then they all most be
+    // otherwise we will create illegal VGPR->SGPR copies when legalizing
+    // them.
+    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
+      if (!VRC) {
+        assert(SRC);
+        VRC = RI.getEquivalentVGPRClass(SRC);
+      }
+      RC = VRC;
+    } else {
+      RC = SRC;
+    }
 
-int SIInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
-  llvm_unreachable("Unimplemented");
+    // Update all the operands so they have the same type.
+    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
+      if (!MI->getOperand(i).isReg() ||
+          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+        continue;
+      unsigned DstReg = MRI.createVirtualRegister(RC);
+      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+              get(AMDGPU::COPY), DstReg)
+              .addOperand(MI->getOperand(i));
+      MI->getOperand(i).setReg(DstReg);
+    }
+  }
 }
 
-int SIInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
-  llvm_unreachable("Unimplemented");
+void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+  SmallVector<MachineInstr *, 128> Worklist;
+  Worklist.push_back(&TopInst);
+
+  while (!Worklist.empty()) {
+    MachineInstr *Inst = Worklist.pop_back_val();
+    unsigned NewOpcode = getVALUOp(*Inst);
+    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
+      continue;
+
+    MachineRegisterInfo &MRI = Inst->getParent()->getParent()->getRegInfo();
+
+    // Use the new VALU Opcode.
+    const MCInstrDesc &NewDesc = get(NewOpcode);
+    Inst->setDesc(NewDesc);
+
+    // Remove any references to SCC. Vector instructions can't read from it, and
+    // We're just about to add the implicit use / defs of VCC, and we don't want
+    // both.
+    for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
+      MachineOperand &Op = Inst->getOperand(i);
+      if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
+        Inst->RemoveOperand(i);
+    }
+
+    // Add the implict and explicit register definitions.
+    if (NewDesc.ImplicitUses) {
+      for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
+        unsigned Reg = NewDesc.ImplicitUses[i];
+        Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
+      }
+    }
+
+    if (NewDesc.ImplicitDefs) {
+      for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
+        unsigned Reg = NewDesc.ImplicitDefs[i];
+        Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
+      }
+    }
+
+    legalizeOperands(Inst);
+
+    // Update the destination register class.
+    const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
+
+    switch (Inst->getOpcode()) {
+      // For target instructions, getOpRegClass just returns the virtual
+      // register class associated with the operand, so we need to find an
+      // equivalent VGPR register class in order to move the instruction to the
+      // VALU.
+    case AMDGPU::COPY:
+    case AMDGPU::PHI:
+    case AMDGPU::REG_SEQUENCE:
+      if (RI.hasVGPRs(NewDstRC))
+        continue;
+      NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+      if (!NewDstRC)
+        continue;
+      break;
+    default:
+      break;
+    }
+
+    unsigned DstReg = Inst->getOperand(0).getReg();
+    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
+    MRI.replaceRegWith(DstReg, NewDstReg);
+
+    for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+           E = MRI.use_end(); I != E; ++I) {
+      MachineInstr &UseMI = *I;
+      if (!canReadVGPR(UseMI, I.getOperandNo())) {
+        Worklist.push_back(&UseMI);
+      }
+    }
+  }
 }
 
-const TargetRegisterClass *SIInstrInfo::getIndirectAddrStoreRegClass(
-                                                     unsigned SourceReg) const {
-  llvm_unreachable("Unimplemented");
+//===----------------------------------------------------------------------===//
+// Indirect addressing callbacks
+//===----------------------------------------------------------------------===//
+
+unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
+                                                 unsigned Channel) const {
+  assert(Channel == 0);
+  return RegIndex;
 }
 
-const TargetRegisterClass *SIInstrInfo::getIndirectAddrLoadRegClass() const {
-  llvm_unreachable("Unimplemented");
+const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
+  return &AMDGPU::VReg_32RegClass;
 }
 
 MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
@@ -257,7 +647,17 @@ MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
                                    MachineBasicBlock::iterator I,
                                    unsigned ValueReg,
                                    unsigned Address, unsigned OffsetReg) const {
-  llvm_unreachable("Unimplemented");
+  const DebugLoc &DL = MBB->findDebugLoc(I);
+  unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+                                      getIndirectIndexBegin(*MBB->getParent()));
+
+  return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
+          .addReg(IndirectBaseReg, RegState::Define)
+          .addOperand(I->getOperand(0))
+          .addReg(IndirectBaseReg)
+          .addReg(OffsetReg)
+          .addImm(0)
+          .addReg(ValueReg);
 }
 
 MachineInstrBuilder SIInstrInfo::buildIndirectRead(
@@ -265,9 +665,43 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead(
                                    MachineBasicBlock::iterator I,
                                    unsigned ValueReg,
                                    unsigned Address, unsigned OffsetReg) const {
-  llvm_unreachable("Unimplemented");
+  const DebugLoc &DL = MBB->findDebugLoc(I);
+  unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+                                      getIndirectIndexBegin(*MBB->getParent()));
+
+  return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
+          .addOperand(I->getOperand(0))
+          .addOperand(I->getOperand(1))
+          .addReg(IndirectBaseReg)
+          .addReg(OffsetReg)
+          .addImm(0);
+
 }
 
-const TargetRegisterClass *SIInstrInfo::getSuperIndirectRegClass() const {
-  llvm_unreachable("Unimplemented");
+void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
+                                            const MachineFunction &MF) const {
+  int End = getIndirectIndexEnd(MF);
+  int Begin = getIndirectIndexBegin(MF);
+
+  if (End == -1)
+    return;
+
+
+  for (int Index = Begin; Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_32RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
 }
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 87eff4d..4af6348 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
+//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -25,6 +25,14 @@ class SIInstrInfo : public AMDGPUInstrInfo {
 private:
   const SIRegisterInfo RI;
 
+  MachineInstrBuilder buildIndirectIndexLoop(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator I,
+                                             unsigned OffsetVGPR,
+                                             unsigned MovRelOp,
+                                             unsigned Dst,
+                                             unsigned Src0) const;
+  // If you add or remove instructions from this function, you will
+
 public:
   explicit SIInstrInfo(AMDGPUTargetMachine &tm);
 
@@ -40,25 +48,65 @@ public:
   virtual MachineInstr *commuteInstruction(MachineInstr *MI,
                                            bool NewMI=false) const;
 
-  virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
-                                        int64_t Imm) const;
-
   virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;}
+  MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator I,
+                              unsigned DstReg, unsigned SrcReg) const;
   virtual bool isMov(unsigned Opcode) const;
 
   virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
-
-  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
-
-  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+  int isMIMG(uint16_t Opcode) const;
+  int isSMRD(uint16_t Opcode) const;
+  bool isVOP1(uint16_t Opcode) const;
+  bool isVOP2(uint16_t Opcode) const;
+  bool isVOP3(uint16_t Opcode) const;
+  bool isVOPC(uint16_t Opcode) const;
+  bool isInlineConstant(const MachineOperand &MO) const;
+  bool isLiteralConstant(const MachineOperand &MO) const;
+
+  virtual bool verifyInstruction(const MachineInstr *MI,
+                                 StringRef &ErrInfo) const;
+
+  bool isSALUInstr(const MachineInstr &MI) const;
+  static unsigned getVALUOp(const MachineInstr &MI);
+  bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
+
+  /// \brief Return the correct register class for \p OpNo.  For target-specific
+  /// instructions, this will return the register class that has been defined
+  /// in tablegen.  For generic instructions, like REG_SEQUENCE it will return
+  /// the register class of its machine operand.
+  /// to infer the correct register class base on the other operands.
+  const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
+                                           unsigned OpNo) const;\
+
+  /// \returns true if it is legal for the operand at index \p OpNo
+  /// to read a VGPR.
+  bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
+
+  /// \brief Legalize the \p OpIndex operand of this instruction by inserting
+  /// a MOV.  For example:
+  /// ADD_I32_e32 VGPR0, 15
+  /// to
+  /// MOV VGPR1, 15
+  /// ADD_I32_e32 VGPR0, VGPR1
+  ///
+  /// If the operand being legalized is a register, then a COPY will be used
+  /// instead of MOV.
+  void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const;
+
+  /// \brief Legalize all operands in this instruction.  This function may
+  /// create new instruction and insert them before \p MI.
+  void legalizeOperands(MachineInstr *MI) const;
+
+  /// \brief Replace this instruction's opcode with the equivalent VALU
+  /// opcode.  This function will also move the users of \p MI to the
+  /// VALU if necessary.
+  void moveToVALU(MachineInstr &MI) const;
 
   virtual unsigned calculateIndirectAddress(unsigned RegIndex,
                                             unsigned Channel) const;
 
-  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
-                                                      unsigned SourceReg) const;
-
-  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
+  virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
 
   virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
                                                  MachineBasicBlock::iterator I,
@@ -71,16 +119,18 @@ public:
                                                 unsigned ValueReg,
                                                 unsigned Address,
                                                 unsigned OffsetReg) const;
+  void reserveIndirectRegisters(BitVector &Reserved,
+                                const MachineFunction &MF) const;
 
-  virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
-  };
+  void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
+              unsigned SavReg, unsigned IndexReg) const;
+};
 
 namespace AMDGPU {
 
   int getVOPe64(uint16_t Opcode);
   int getCommuteRev(uint16_t Opcode);
   int getCommuteOrig(uint16_t Opcode);
-  int isMIMG(uint16_t Opcode);
 
 } // End namespace AMDGPU
 
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 52af79c..4cd0daa 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -16,6 +16,45 @@ def SIadd64bit32bit : SDNode<"ISD::ADD",
   SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]>
 >;
 
+def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
+  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, i128>, SDTCisVT<2, i32>]>,
+                      [SDNPMayLoad, SDNPMemOperand]
+>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
+  SDTypeProfile<0, 13,
+    [SDTCisVT<0, i128>,   // rsrc(SGPR)
+     SDTCisVT<1, iAny>,   // vdata(VGPR)
+     SDTCisVT<2, i32>,    // num_channels(imm)
+     SDTCisVT<3, i32>,    // vaddr(VGPR)
+     SDTCisVT<4, i32>,    // soffset(SGPR)
+     SDTCisVT<5, i32>,    // inst_offset(imm)
+     SDTCisVT<6, i32>,    // dfmt(imm)
+     SDTCisVT<7, i32>,    // nfmt(imm)
+     SDTCisVT<8, i32>,    // offen(imm)
+     SDTCisVT<9, i32>,    // idxen(imm)
+     SDTCisVT<10, i32>,   // glc(imm)
+     SDTCisVT<11, i32>,   // slc(imm)
+     SDTCisVT<12, i32>    // tfe(imm)
+    ]>,
+  [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
+  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, i128>, SDTCisVT<2, i16>,
+                       SDTCisVT<3, i32>]>
+>;
+
+class SDSample<string opcode> : SDNode <opcode,
+  SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
+                       SDTCisVT<3, i128>, SDTCisVT<4, i32>]>
+>;
+
+def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
+def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
+def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
+def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
+
 // Transformation function, extract the lower 32bit of a 64bit immediate
 def LO32 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
@@ -45,6 +84,14 @@ def IMM8bitDWORD : ImmLeaf <
   }]>
 >;
 
+def as_i1imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i1);
+}]>;
+
+def as_i8imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i8);
+}]>;
+
 def as_i16imm : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i16);
 }]>;
@@ -58,6 +105,26 @@ class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
     (*(const SITargetLowering *)getTargetLowering()).analyzeImmediate(N) == 0;
 }]>;
 
+class SGPRImm <dag frag> : PatLeaf<frag, [{
+  if (TM.getSubtarget<AMDGPUSubtarget>().getGeneration() <
+      AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    return false;
+  }
+  const SIRegisterInfo *SIRI =
+                       static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+                                                U != E; ++U) {
+    if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) {
+      return true;
+    }
+  }
+  return false;
+}]>;
+
+def FRAMEri64 : Operand<iPTR> {
+  let MIOperandInfo = (ops SReg_32:$ptr, i32imm:$index);
+}
+
 //===----------------------------------------------------------------------===//
 // SI assembler operands
 //===----------------------------------------------------------------------===//
@@ -109,6 +176,11 @@ class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
   opName#" $dst, $src0, $src1", pattern
 >;
 
+class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
+  op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>;
+
 class SOPC_32 <bits<7> op, string opName, list<dag> pattern> : SOPC <
   op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
   opName#" $dst, $src0, $src1", pattern
@@ -184,6 +256,12 @@ multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern>
 multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern>
   : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>;
 
+multiclass VOP1_32_64 <bits<8> op, string opName, list<dag> pattern>
+  : VOP1_Helper <op, VReg_32, VSrc_64, opName, pattern>;
+
+multiclass VOP1_64_32 <bits<8> op, string opName, list<dag> pattern>
+  : VOP1_Helper <op, VReg_64, VSrc_32, opName, pattern>;
+
 multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
                         string opName, list<dag> pattern, string revOp> {
   def _e32 : VOP2 <
@@ -320,6 +398,18 @@ class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
   let vdst = 0;
 }
 
+class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS <
+  op,
+  (outs rc:$vdst),
+  (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i8imm:$offset0,
+       i8imm:$offset1),
+  asm#" $gds, $vdst, $addr, $data0, $offset0, $offset1, [M0]",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 1;
+  let data1 = 0;
+}
+
 class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
   op,
   (outs),
@@ -358,9 +448,9 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
   }
 }
 
-class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
-                         ValueType VT> :
-    MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
+class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
+    MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
+                            i16imm:$offset),
           name#" $vdata, $srsrc + $vaddr + $offset",
          []> {
 
@@ -391,11 +481,18 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF
   let mayStore = 0;
 }
 
-class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG <
+class MIMG_Mask <string op, int channels> {
+  string Op = op;
+  int Channels = channels;
+}
+
+class MIMG_NoSampler_Helper <bits<7> op, string asm,
+                             RegisterClass dst_rc,
+                             RegisterClass src_rc> : MIMG <
   op,
-  (outs VReg_128:$vdata),
+  (outs dst_rc:$vdata),
   (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
-       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
        SReg_256:$srsrc),
   asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
      #" $tfe, $lwe, $slc, $vaddr, $srsrc",
@@ -406,11 +503,31 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG <
   let hasPostISelHook = 1;
 }
 
-class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG <
+multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
+                                      RegisterClass dst_rc,
+                                      int channels> {
+  def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_32>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_NoSampler <bits<7> op, string asm> {
+  defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VReg_32, 1>;
+  defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Sampler_Helper <bits<7> op, string asm,
+                           RegisterClass dst_rc,
+                           RegisterClass src_rc> : MIMG <
   op,
-  (outs VReg_128:$vdata),
+  (outs dst_rc:$vdata),
   (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
-       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
        SReg_256:$srsrc, SReg_128:$ssamp),
   asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
      #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
@@ -420,6 +537,28 @@ class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG <
   let hasPostISelHook = 1;
 }
 
+multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+                                    RegisterClass dst_rc,
+                                    int channels> {
+  def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_32>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+  def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256>,
+            MIMG_Mask<asm#"_V8", channels>;
+  def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512>,
+            MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Sampler <bits<7> op, string asm> {
+  defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VReg_32, 1>;
+  defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
+}
+
 //===----------------------------------------------------------------------===//
 // Vector instruction mappings
 //===----------------------------------------------------------------------===//
@@ -442,6 +581,14 @@ def getCommuteRev : InstrMapping {
   let ValueCols = [["0"]];
 }
 
+def getMaskedMIMGOp : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["4"];
+  let ValueCols = [["1"], ["2"], ["3"] ];
+}
+
 // Maps an commuted opcode to its original version
 def getCommuteOrig : InstrMapping {
   let FilterClass = "VOP2_REV";
@@ -451,13 +598,4 @@ def getCommuteOrig : InstrMapping {
   let ValueCols = [["1"]];
 }
 
-// Test if the supplied opcode is an MIMG instruction
-def isMIMG : InstrMapping {
-  let FilterClass = "MIMG";
-  let RowFields = ["Inst"];
-  let ColFields = ["Size"];
-  let KeyCol = ["8"];
-  let ValueCols = [["8"]];
-}
-
 include "SIInstructions.td"
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 500d15e..76f05eb 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -23,7 +23,9 @@ def InterpSlot : Operand<i32> {
 }
 
 def isSI : Predicate<"Subtarget.getGeneration() "
-                      "== AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+                      ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+
+def WAIT_FLAG : InstFlag<"printWaitFlag">;
 
 let Predicates = [isSI] in {
 
@@ -126,8 +128,11 @@ def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
 def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
 } // End isCompare = 1
 
-def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
-def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
+let Defs = [SCC], isCommutable = 1 in {
+  def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
+  def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
+}
+
 //def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
 def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
 def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
@@ -138,19 +143,19 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
 let isCompare = 1 in {
 
 defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">;
-defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_LT>;
-defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_EQ>;
-defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_LE>;
-defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_GT>;
-defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", f32, COND_NE>;
-defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_GE>;
-defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32">;
-defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32">;
+defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_OLT>;
+defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_OLE>;
+defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32">;
+defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_OGE>;
+defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", f32, COND_O>;
+defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", f32, COND_UO>;
 defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">;
 defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">;
 defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">;
 defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">;
-defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_NE>;
+defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>;
 defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">;
 defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">;
 
@@ -176,19 +181,19 @@ defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">;
 } // End hasSideEffects = 1, Defs = [EXEC]
 
 defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">;
-defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_LT>;
-defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", f64, COND_EQ>;
-defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", f64, COND_LE>;
-defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", f64, COND_GT>;
+defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_OLT>;
+defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", f64, COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", f64, COND_OLE>;
+defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", f64, COND_OGT>;
 defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">;
-defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", f64, COND_GE>;
-defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64">;
-defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64">;
+defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", f64, COND_OGE>;
+defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", f64, COND_O>;
+defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", f64, COND_UO>;
 defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">;
 defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">;
 defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">;
 defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">;
-defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_NE>;
+defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>;
 defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">;
 defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">;
 
@@ -290,12 +295,12 @@ defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">;
 } // End hasSideEffects = 1, Defs = [EXEC]
 
 defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">;
-defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_LT>;
+defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_SLT>;
 defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>;
-defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_LE>;
-defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_GT>;
+defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_SLE>;
+defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_SGT>;
 defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>;
-defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_GE>;
+defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>;
 defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">;
 
 let hasSideEffects = 1, Defs = [EXEC] in {
@@ -312,12 +317,12 @@ defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">;
 } // End hasSideEffects = 1, Defs = [EXEC]
 
 defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">;
-defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64">;
-defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64">;
-defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64">;
-defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64">;
-defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64">;
-defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64">;
+defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>;
+defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", i64, COND_EQ>;
+defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", i64, COND_SLE>;
+defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", i64, COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>;
+defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>;
 defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">;
 
 let hasSideEffects = 1, Defs = [EXEC] in {
@@ -334,12 +339,12 @@ defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64">;
 } // End hasSideEffects = 1, Defs = [EXEC]
 
 defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">;
-defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32">;
-defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32">;
-defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32">;
-defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32">;
-defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32">;
-defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32">;
+defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>;
+defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", i32, COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", i32, COND_ULE>;
+defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", i32, COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>;
+defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>;
 defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">;
 
 let hasSideEffects = 1, Defs = [EXEC] in {
@@ -356,12 +361,12 @@ defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">;
 } // End hasSideEffects = 1, Defs = [EXEC]
 
 defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">;
-defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64">;
-defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64">;
-defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64">;
-defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64">;
-defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64">;
-defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64">;
+defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>;
+defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", i64, COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", i64, COND_ULE>;
+defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", i64, COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>;
+defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>;
 defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">;
 
 let hasSideEffects = 1, Defs = [EXEC] in {
@@ -391,8 +396,16 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
 
 } // End isCompare = 1
 
+def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>;
+def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>;
 def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
+def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
+def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
 def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>;
+def DS_READ_I8 : DS_Load_Helper <0x00000039, "DS_READ_I8", VReg_32>;
+def DS_READ_U8 : DS_Load_Helper <0x0000003a, "DS_READ_U8", VReg_32>;
+def DS_READ_I16 : DS_Load_Helper <0x0000003b, "DS_READ_I16", VReg_32>;
+def DS_READ_U16 : DS_Load_Helper <0x0000003c, "DS_READ_U16", VReg_32>;
 
 //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
 //def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
@@ -409,19 +422,25 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <0x0000000b, "BUFFER_LOAD_SSHORT", V
 defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
 defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
 defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
-//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
-//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
+
+def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
+  0x00000018, "BUFFER_STORE_BYTE", VReg_32
+>;
+
+def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
+  0x0000001a, "BUFFER_STORE_SHORT", VReg_32
+>;
 
 def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
-  0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32
+  0x0000001c, "BUFFER_STORE_DWORD", VReg_32
 >;
 
 def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
-  0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, i64
+  0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64
 >;
 
 def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
-  0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32
+  0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128
 >;
 //def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
 //def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
@@ -463,21 +482,24 @@ def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
 //def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
 //def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
 def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
-//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>;
-//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>;
-//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>;
-//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>;
+def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "TBUFFER_STORE_FORMAT_X", VReg_32>;
+def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FORMAT_XY", VReg_64>;
+def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>;
+def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>;
 
 let mayLoad = 1 in {
 
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SReg_32>;
+// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SGPR_32 register class does not include M0
+// and writing to M0 from an SMRD instruction will hang the GPU.
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>;
 defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
 defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
 defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
 defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
 
 defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
-  0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SReg_32
+  0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32
 >;
 
 defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
@@ -500,8 +522,8 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
 
 //def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
 //def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
-//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
-def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">;
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">;
 //def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
 //def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
 //def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
@@ -510,7 +532,7 @@ def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">;
 //def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
 //def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
 //def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
-def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO">;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "IMAGE_GET_RESINFO">;
 //def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
 //def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
 //def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
@@ -528,20 +550,20 @@ def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO">;
 //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
 //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
 //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
-def IMAGE_SAMPLE : MIMG_Sampler_Helper <0x00000020, "IMAGE_SAMPLE">; 
+defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">;
 //def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
-def IMAGE_SAMPLE_D : MIMG_Sampler_Helper <0x00000022, "IMAGE_SAMPLE_D">;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">;
 //def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
-def IMAGE_SAMPLE_L : MIMG_Sampler_Helper <0x00000024, "IMAGE_SAMPLE_L">;
-def IMAGE_SAMPLE_B : MIMG_Sampler_Helper <0x00000025, "IMAGE_SAMPLE_B">;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">;
+defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">;
 //def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
 //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
-def IMAGE_SAMPLE_C : MIMG_Sampler_Helper <0x00000028, "IMAGE_SAMPLE_C">;
+defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">;
 //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
-def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper <0x0000002a, "IMAGE_SAMPLE_C_D">;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">;
 //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
-def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
-def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
 //def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
 //def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
 //def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
@@ -603,15 +625,21 @@ defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
 } // End neverHasSideEffects = 1, isMoveImm = 1
 
 defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
-//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>;
-//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>;
+defm V_CVT_I32_F64 : VOP1_32_64 <0x00000003, "V_CVT_I32_F64",
+  [(set i32:$dst, (fp_to_sint f64:$src0))]
+>;
+defm V_CVT_F64_I32 : VOP1_64_32 <0x00000004, "V_CVT_F64_I32",
+  [(set f64:$dst, (sint_to_fp i32:$src0))]
+>;
 defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
   [(set f32:$dst, (sint_to_fp i32:$src0))]
 >;
 defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32",
   [(set f32:$dst, (uint_to_fp i32:$src0))]
 >;
-defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
+defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32",
+  [(set i32:$dst, (fp_to_uint f32:$src0))]
+>;
 defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
   [(set i32:$dst, (fp_to_sint f32:$src0))]
 >;
@@ -621,8 +649,12 @@ defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
 //defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
 //defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
 //defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
-//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>;
-//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>;
+defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",
+  [(set f32:$dst, (fround f64:$src0))]
+>;
+defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
+  [(set f64:$dst, (fextend f32:$src0))]
+>;
 //defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
 //defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
 //defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
@@ -791,7 +823,7 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
   let mayStore = 1;
 }
 
-def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16",
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
   []
 >;
 } // End hasSideEffects
@@ -827,6 +859,11 @@ def : Pat <
   (V_CNDMASK_B32_e64 $src0, $src1, $src2)
 >;
 
+def : Pat <
+  (i32 (trunc i64:$val)),
+  (EXTRACT_SUBREG $val, sub0)
+>;
+
 //use two V_CNDMASK_B32_e64 instructions for f64
 def : Pat <
   (f64 (select i1:$src2, f64:$src1, f64:$src0)),
@@ -910,9 +947,13 @@ defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
 >;
 defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
 
+let hasPostISelHook = 1 in {
+
 defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
   [(set i32:$dst, (shl i32:$src0, i32:$src1))]
 >;
+
+}
 defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
 
 defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
@@ -936,16 +977,13 @@ defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
 defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
 
 let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
-defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
-  [(set i32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
->;
-
-defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
-  [(set i32:$dst, (sub i32:$src0, i32:$src1))]
->;
+// No patterns so that the scalar instructions are always selected.
+// The scalar versions will be replaced with vector when needed later.
+defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", []>;
+defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", []>;
 defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], "V_SUB_I32">;
 
-let Uses = [VCC] in { // Carry-out comes from VCC
+let Uses = [VCC] in { // Carry-in comes from VCC
 defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", []>;
 defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", []>;
 defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], "V_SUBB_U32">;
@@ -999,8 +1037,12 @@ def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
 def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
 def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
 defm : BFIPatterns <V_BFI_B32>;
-def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
-def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
+def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
+  [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64",
+  [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
+>;
 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
 def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
 def : ROTRPattern <V_ALIGNBIT_B32>;
@@ -1087,12 +1129,31 @@ def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
 //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
 //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
 def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
+
+let Defs = [SCC] in { // Carry out goes to SCC
+let isCommutable = 1 in {
 def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
+def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32",
+  [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
+>;
+} // End isCommutable = 1
+
 def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
-def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>;
-def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>;
-def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>;
-def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>;
+def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32",
+  [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
+>;
+
+let Uses = [SCC] in { // Carry in comes from SCC
+let isCommutable = 1 in {
+def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32",
+  [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End isCommutable = 1
+
+def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
+  [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End Uses = [SCC]
+} // End Defs = [SCC]
+
 def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>;
 def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>;
 def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>;
@@ -1124,7 +1185,9 @@ def : Pat <
   (S_OR_B64 $src0, $src1)
 >;
 def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>;
-def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>;
+def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
+  [(set i1:$dst, (xor i1:$src0, i1:$src1))]
+>;
 def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
 def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
 def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
@@ -1135,12 +1198,31 @@ def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
 def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
 def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
 def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
-def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>;
-def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>;
-def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>;
-def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>;
-def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>;
-def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>;
+
+// Use added complexity so these patterns are preferred to the VALU patterns.
+let AddedComplexity = 1 in {
+
+def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32",
+  [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+>;
+def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64",
+  [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+>;
+def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32",
+  [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+>;
+def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64",
+  [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+>;
+def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32",
+  [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+>;
+def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64",
+  [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+>;
+
+} // End AddedComplexity = 1
+
 def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
 def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
 def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
@@ -1160,7 +1242,7 @@ def LOAD_CONST : AMDGPUShaderInst <
   [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
 >;
 
-// SI Psuedo instructions. These are used by the CFG structurizer pass
+// SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 
 let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
@@ -1233,6 +1315,36 @@ def SI_KILL : InstSI <
 
 let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
 
+//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri64, ADDRIndirect>;
+
+let UseNamedOperandTable = 1 in {
+
+def SI_RegisterLoad : AMDGPUShaderInst <
+  (outs VReg_32:$dst, SReg_64:$temp),
+  (ins FRAMEri64:$addr, i32imm:$chan),
+  "", []
+> {
+  let isRegisterLoad = 1;
+  let mayLoad = 1;
+}
+
+class SIRegStore<dag outs> : AMDGPUShaderInst <
+  outs,
+  (ins VReg_32:$val, FRAMEri64:$addr, i32imm:$chan),
+  "", []
+> {
+  let isRegisterStore = 1;
+  let mayStore = 1;
+}
+
+let usesCustomInserter = 1 in {
+def SI_RegisterStorePseudo : SIRegStore<(outs)>;
+} // End usesCustomInserter = 1
+def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
+
+
+} // End UseNamedOperandTable = 1
+
 def SI_INDIRECT_SRC : InstSI <
   (outs VReg_32:$dst, SReg_64:$temp),
   (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
@@ -1249,6 +1361,7 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
   let Constraints = "$src = $dst";
 }
 
+def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VReg_32>;
 def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
 def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
 def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
@@ -1258,7 +1371,7 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
 
 let usesCustomInserter = 1 in {
 
-// This psuedo instruction takes a pointer as input and outputs a resource
+// This pseudo instruction takes a pointer as input and outputs a resource
 // constant that can be used with the ADDR64 MUBUF instructions.
 def SI_ADDR64_RSRC : InstSI <
   (outs SReg_128:$srsrc),
@@ -1289,7 +1402,7 @@ def : Pat <
 
 /* int_SI_vs_load_input */
 def : Pat<
-  (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+  (SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
   (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset)
 >;
 
@@ -1310,67 +1423,85 @@ def : Pat <
 /********** Image sampling patterns **********/
 /********** ======================= **********/
 
-/* int_SI_sample for simple 1D texture lookup */
+/* SIsample for simple 1D texture lookup */
 def : Pat <
-  (int_SI_sample v1i32:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
-  (IMAGE_SAMPLE 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+  (SIsample i32:$addr, v32i8:$rsrc, i128:$sampler, imm),
+  (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SamplePattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
+class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, imm),
     (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleRectPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_RECT),
+class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_RECT),
     (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleArrayPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_ARRAY),
+class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_ARRAY),
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleShadowPattern<Intrinsic name, MIMG opcode,
+class SampleShadowPattern<SDNode name, MIMG opcode,
                           ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW),
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW),
     (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleShadowArrayPattern<Intrinsic name, MIMG opcode,
+class SampleShadowArrayPattern<SDNode name, MIMG opcode,
                                ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW_ARRAY),
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW_ARRAY),
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-/* int_SI_sample* for texture lookups consuming more address parameters */
-multiclass SamplePatterns<ValueType addr_type> {
-  def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
-  def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
-  def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
-  def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
-
-  def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
-  def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
-  def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
-
-  def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
-  def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
-  def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
-
-  def : SamplePattern <int_SI_sampled, IMAGE_SAMPLE_D, addr_type>;
-  def : SampleArrayPattern <int_SI_sampled, IMAGE_SAMPLE_D, addr_type>;
-  def : SampleShadowPattern <int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type>;
+/* SIsample* for texture lookups consuming more address parameters */
+multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
+                          MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
+MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
+  def : SamplePattern <SIsample, sample, addr_type>;
+  def : SampleRectPattern <SIsample, sample, addr_type>;
+  def : SampleArrayPattern <SIsample, sample, addr_type>;
+  def : SampleShadowPattern <SIsample, sample_c, addr_type>;
+  def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
+
+  def : SamplePattern <SIsamplel, sample_l, addr_type>;
+  def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
+  def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
+  def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
+
+  def : SamplePattern <SIsampleb, sample_b, addr_type>;
+  def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
+  def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
+  def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
+
+  def : SamplePattern <SIsampled, sample_d, addr_type>;
+  def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
+  def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
+  def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
 }
 
-defm : SamplePatterns<v2i32>;
-defm : SamplePatterns<v4i32>;
-defm : SamplePatterns<v8i32>;
-defm : SamplePatterns<v16i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
+                      IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
+                      IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
+                      IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
+                      v2i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
+                      IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
+                      IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
+                      IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
+                      v4i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
+                      IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
+                      IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
+                      IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
+                      v8i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
+                      IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
+                      IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
+                      IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
+                      v16i32>;
 
 /* int_SI_imageload for texture fetches consuming varying address parameters */
 class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
@@ -1383,23 +1514,46 @@ class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> :
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
 >;
 
-multiclass ImageLoadPatterns<ValueType addr_type> {
-  def : ImageLoadPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
-  def : ImageLoadArrayPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
+class ImageLoadMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+    (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA),
+    (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+class ImageLoadArrayMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+    (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA),
+    (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+multiclass ImageLoadPatterns<MIMG opcode, ValueType addr_type> {
+  def : ImageLoadPattern <int_SI_imageload, opcode, addr_type>;
+  def : ImageLoadArrayPattern <int_SI_imageload, opcode, addr_type>;
+}
+
+multiclass ImageLoadMSAAPatterns<MIMG opcode, ValueType addr_type> {
+  def : ImageLoadMSAAPattern <int_SI_imageload, opcode, addr_type>;
+  def : ImageLoadArrayMSAAPattern <int_SI_imageload, opcode, addr_type>;
 }
 
-defm : ImageLoadPatterns<v2i32>;
-defm : ImageLoadPatterns<v4i32>;
+defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V2, v2i32>;
+defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V4, v4i32>;
+
+defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V2, v2i32>;
+defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V4, v4i32>;
 
 /* Image resource information */
 def : Pat <
   (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm),
-  (IMAGE_GET_RESINFO 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+  (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
 >;
 
 def : Pat <
   (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY),
-  (IMAGE_GET_RESINFO 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+  (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+>;
+
+def : Pat <
+  (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA),
+  (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
 >;
 
 /********** ============================================ **********/
@@ -1470,16 +1624,6 @@ foreach Index = 0-15 in {
   >;
 }
 
-def : Vector1_Build <v1i32, i32, VReg_32>;
-def : Vector2_Build <v2i32, i32>;
-def : Vector2_Build <v2f32, f32>;
-def : Vector4_Build <v4i32, i32>;
-def : Vector4_Build <v4f32, f32>;
-def : Vector8_Build <v8i32, i32>;
-def : Vector8_Build <v8f32, f32>;
-def : Vector16_Build <v16i32, i32>;
-def : Vector16_Build <v16f32, f32>;
-
 def : BitConvert <i32, f32, SReg_32>;
 def : BitConvert <i32, f32, VReg_32>;
 
@@ -1492,9 +1636,17 @@ def : BitConvert <f64, i64, VReg_64>;
 
 def : BitConvert <v2f32, v2i32, VReg_64>;
 def : BitConvert <v2i32, v2f32, VReg_64>;
+def : BitConvert <v2i32, i64, VReg_64>;
 
 def : BitConvert <v4f32, v4i32, VReg_128>;
 def : BitConvert <v4i32, v4f32, VReg_128>;
+def : BitConvert <v4i32, i128,  VReg_128>;
+def : BitConvert <i128, v4i32,  VReg_128>;
+
+def : BitConvert <v8i32, v32i8, SReg_256>;
+def : BitConvert <v32i8, v8i32, SReg_256>;
+def : BitConvert <v8i32, v32i8, VReg_256>;
+def : BitConvert <v32i8, v8i32, VReg_256>;
 
 /********** =================== **********/
 /********** Src & Dst modifiers **********/
@@ -1523,6 +1675,16 @@ def : Pat <
 /********** ================== **********/
 
 def : Pat <
+  (SGPRImm<(i32 imm)>:$imm),
+  (S_MOV_B32 imm:$imm)
+>;
+
+def : Pat <
+  (SGPRImm<(f32 fpimm)>:$imm),
+  (S_MOV_B32 fpimm:$imm)
+>;
+
+def : Pat <
   (i32 imm:$imm),
   (V_MOV_B32_e32 imm:$imm)
 >;
@@ -1634,19 +1796,19 @@ def : Pat <
 
 // 1. Offset as 8bit DWORD immediate
 def : Pat <
-  (int_SI_load_const v16i8:$sbase, IMM8bitDWORD:$offset),
+  (SIload_constant i128:$sbase, IMM8bitDWORD:$offset),
   (S_BUFFER_LOAD_DWORD_IMM $sbase, IMM8bitDWORD:$offset)
 >;
 
 // 2. Offset loaded in an 32bit SGPR
 def : Pat <
-  (int_SI_load_const v16i8:$sbase, imm:$offset),
+  (SIload_constant i128:$sbase, imm:$offset),
   (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
 >;
 
 // 3. Offset in an 32Bit VGPR
 def : Pat <
-  (int_SI_load_const v16i8:$sbase, i32:$voff),
+  (SIload_constant i128:$sbase, i32:$voff),
   (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff)
 >;
 
@@ -1677,17 +1839,36 @@ def : Pat <
 /**********   Load/Store Patterns   **********/
 /********** ======================= **********/
 
-def : Pat <
-    (local_load i64:$src0),
-    (i32 (DS_READ_B32 0, (EXTRACT_SUBREG $src0, sub0),
-                      (EXTRACT_SUBREG $src0, sub0), (EXTRACT_SUBREG $src0, sub0), 0, 0))
+class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
+  (frag i32:$src0),
+  (vt (inst 0, $src0, $src0, $src0, 0, 0))
 >;
 
+def : DSReadPat <DS_READ_I8,  i32, sextloadi8_local>;
+def : DSReadPat <DS_READ_U8,  i32, az_extloadi8_local>;
+def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
+def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
+def : DSReadPat <DS_READ_B32, i32, local_load>;
 def : Pat <
-    (local_store i32:$src1, i64:$src0),
-    (DS_WRITE_B32 0, (EXTRACT_SUBREG $src0, sub0), $src1, $src1, 0, 0)
+    (local_load i32:$src0),
+    (i32 (DS_READ_B32 0, $src0, $src0, $src0, 0, 0))
+>;
+
+class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
+  (frag i32:$src1, i32:$src0),
+  (inst 0, $src0, $src1, $src1, 0, 0)
 >;
 
+def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
+def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
+def : DSWritePat <DS_WRITE_B32, i32, local_store>;
+
+def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
+           (DS_ADD_U32_RTN 0, $ptr, $val, 0, 0)>;
+
+def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val),
+           (DS_SUB_U32_RTN 0, $ptr, $val, 0, 0)>;
+
 /********** ================== **********/
 /**********   SMRD Patterns    **********/
 /********** ================== **********/
@@ -1717,8 +1898,11 @@ defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
 defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
 defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, i128>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
 defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
 
 //===----------------------------------------------------------------------===//
 // MUBUF Patterns
@@ -1766,23 +1950,46 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
 defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
                           global_load, constant_load>;
 
-multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt> {
+multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> {
 
   def : Pat <
-    (global_store vt:$value, i64:$ptr),
+    (st vt:$value, i64:$ptr),
     (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
   >;
 
   def : Pat <
-    (global_store vt:$value, (add i64:$ptr, i64:$offset)),
+    (st vt:$value, (add i64:$ptr, i64:$offset)),
     (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0)
    >;
 }
 
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, v2i32>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE, i32, truncstorei8_global>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT, i32, truncstorei16_global>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32, global_store>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64, global_store>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, v2i32, global_store>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32, global_store>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Patterns
+//===----------------------------------------------------------------------===//
+
+// TBUFFER_STORE_FORMAT_*, addr64=0
+class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat<
+  (SItbuffer_store i128:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
+                   i32:$soffset, imm:$inst_offset, imm:$dfmt,
+                   imm:$nfmt, imm:$offen, imm:$idxen,
+                   imm:$glc, imm:$slc, imm:$tfe),
+  (opcode
+    $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
+    (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
+    (as_i1imm $slc), (as_i1imm $tfe), $soffset)
+>;
+
+def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
+def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
+def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
+def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
 
 /********** ====================== **********/
 /**********   Indirect adressing   **********/
@@ -1834,6 +2041,37 @@ def : Pat<
   (V_CMP_U_F32_e64 $src0, $src1)
 >;
 
+//===----------------------------------------------------------------------===//
+// Miscellaneous Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+  (i64 (trunc i128:$x)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (i32 (EXTRACT_SUBREG $x, sub0)), sub0),
+    (i32 (EXTRACT_SUBREG $x, sub1)), sub1)
+>;
+
+def : Pat <
+  (i32 (trunc i64:$a)),
+  (EXTRACT_SUBREG $a, sub0)
+>;
+
+// V_ADD_I32_e32/S_ADD_I32 produces carry in VCC/SCC. For the vector
+// case, the sgpr-copies pass will fix this to use the vector version.
+def : Pat <
+  (i32 (addc i32:$src0, i32:$src1)),
+  (S_ADD_I32 $src0, $src1)
+>;
+
+def : Pat <
+  (or i64:$a, i64:$b),
+  (INSERT_SUBREG
+    (INSERT_SUBREG (IMPLICIT_DEF),
+      (V_OR_B32_e32 (EXTRACT_SUBREG $a, sub0), (EXTRACT_SUBREG $b, sub0)), sub0),
+    (V_OR_B32_e32 (EXTRACT_SUBREG $a, sub1), (EXTRACT_SUBREG $b, sub1)), sub1)
+>;
+
 //============================================================================//
 // Miscellaneous Optimization Patterns
 //============================================================================//
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 2fa073e..7fcc964 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -17,10 +17,28 @@ let TargetPrefix = "SI", isTarget = 1 in {
   def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
   def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
   def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
-  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
+  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
 
-  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
+  def int_SI_tbuffer_store : Intrinsic <
+    [],
+    [llvm_anyint_ty, // rsrc(SGPR)
+     llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+     llvm_i32_ty,    // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+     llvm_i32_ty,    // vaddr(VGPR)
+     llvm_i32_ty,    // soffset(SGPR)
+     llvm_i32_ty,    // inst_offset(imm)
+     llvm_i32_ty,    // dfmt(imm)
+     llvm_i32_ty,    // nfmt(imm)
+     llvm_i32_ty,    // offen(imm)
+     llvm_i32_ty,    // idxen(imm)
+     llvm_i32_ty,    // glc(imm)
+     llvm_i32_ty,    // slc(imm)
+     llvm_i32_ty],   // tfe(imm)
+    []>;
+
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
 
   def int_SI_sample : Sample;
   def int_SI_sampleb : Sample;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index c2e8f02..958763d 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -377,10 +377,13 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
   unsigned Dst = MI.getOperand(0).getReg();
   unsigned Vec = MI.getOperand(2).getReg();
   unsigned Off = MI.getOperand(4).getImm();
+  unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
+  if (!SubReg)
+    SubReg = Vec;
 
-  MachineInstr *MovRel = 
+  MachineInstr *MovRel =
     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
-            .addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off)
+            .addReg(SubReg + Off)
             .addReg(AMDGPU::M0, RegState::Implicit)
             .addReg(Vec, RegState::Implicit);
 
@@ -395,10 +398,13 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
   unsigned Dst = MI.getOperand(0).getReg();
   unsigned Off = MI.getOperand(4).getImm();
   unsigned Val = MI.getOperand(5).getReg();
+  unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
+  if (!SubReg)
+    SubReg = Dst;
 
   MachineInstr *MovRel = 
     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
-            .addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define)
+            .addReg(SubReg + Off, RegState::Define)
             .addReg(Val)
             .addReg(AMDGPU::M0, RegState::Implicit)
             .addReg(Dst, RegState::Implicit);
@@ -409,6 +415,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getTarget().getInstrInfo();
   TRI = MF.getTarget().getRegisterInfo();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   bool HaveKill = false;
   bool NeedM0 = false;
@@ -476,6 +483,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
           IndirectSrc(MI);
           break;
 
+        case AMDGPU::SI_INDIRECT_DST_V1:
         case AMDGPU::SI_INDIRECT_DST_V2:
         case AMDGPU::SI_INDIRECT_DST_V4:
         case AMDGPU::SI_INDIRECT_DST_V8:
@@ -487,6 +495,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
           NeedWQM = true;
           // Fall through
         case AMDGPU::DS_WRITE_B32:
+        case AMDGPU::DS_ADD_U32_RTN:
           NeedM0 = true;
           break;
 
@@ -508,7 +517,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
             AMDGPU::M0).addImm(0xffffffff);
   }
 
-  if (NeedWQM) {
+  if (NeedWQM && MFI->ShaderType != ShaderType::COMPUTE) {
     MachineBasicBlock &MBB = MF.front();
     BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
             AMDGPU::EXEC).addReg(AMDGPU::EXEC);
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index ee0e307..071f9fa 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -13,6 +13,10 @@
 
 using namespace llvm;
 
+
+// Pin the vtable to this file.
+void SIMachineFunctionInfo::anchor() {}
+
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
     PSInputAddr(0) { }
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 6da9f7f..2f1961c 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -22,6 +22,7 @@ namespace llvm {
 /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+  virtual void anchor();
 public:
   SIMachineFunctionInfo(const MachineFunction &MF);
   unsigned PSInputAddr;
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index 50fd4c7..ed0bbaf 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -15,6 +15,7 @@
 
 #include "SIRegisterInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "SIInstrInfo.h"
 
 using namespace llvm;
 
@@ -25,6 +26,10 @@ SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm)
 
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
+  Reserved.set(AMDGPU::EXEC);
+  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo());
+  TII->reserveIndirectRegisters(Reserved, MF);
   return Reserved;
 }
 
@@ -50,6 +55,10 @@ const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
   }
 }
 
+unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
+  return getEncodingValue(Reg);
+}
+
 const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
   assert(!TargetRegisterInfo::isVirtualRegister(Reg));
 
@@ -70,3 +79,53 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
   }
   return NULL;
 }
+
+bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const {
+  if (!RC) {
+    return false;
+  }
+  return !hasVGPRs(RC);
+}
+
+bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
+  return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_512RegClass, RC);
+}
+
+const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
+                                         const TargetRegisterClass *SRC) const {
+    if (hasVGPRs(SRC)) {
+      return SRC;
+    } else if (SRC == &AMDGPU::SCCRegRegClass) {
+      return &AMDGPU::VCCRegRegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
+      return &AMDGPU::VReg_32RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
+      return &AMDGPU::VReg_64RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
+      return &AMDGPU::VReg_128RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) {
+      return &AMDGPU::VReg_256RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
+      return &AMDGPU::VReg_512RegClass;
+    }
+    return NULL;
+}
+
+const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
+                         const TargetRegisterClass *RC, unsigned SubIdx) const {
+  if (SubIdx == AMDGPU::NoSubRegister)
+    return RC;
+
+  // If this register has a sub-register, we can safely assume it is a 32-bit
+  // register, becuase all of SI's sub-registers are 32-bit.
+  if (isSGPRClass(RC)) {
+    return &AMDGPU::SGPR_32RegClass;
+  } else {
+    return &AMDGPU::VGPR_32RegClass;
+  }
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index d0df4f9..8148f7f 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -42,9 +42,27 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   /// CFGStructurizer
   virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
 
+  virtual unsigned getHWRegIndex(unsigned Reg) const;
+
   /// \brief Return the 'base' register class for this register.
   /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc.
   const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
+
+  /// \returns true if this class contains only SGPR registers
+  bool isSGPRClass(const TargetRegisterClass *RC) const;
+
+  /// \returns true if this class contains VGPR registers.
+  bool hasVGPRs(const TargetRegisterClass *RC) const;
+
+  /// \returns A VGPR reg class with the same width as \p SRC
+  const TargetRegisterClass *getEquivalentVGPRClass(
+                                          const TargetRegisterClass *SRC) const;
+
+  /// \returns The register class that is used for a sub-register of \p RC for
+  /// the given \p SubIdx.  If \p SubIdx equals NoSubRegister, \p RC will
+  /// be returned.
+  const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
+                                            unsigned SubIdx) const;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 292b9d2..49bdbc9 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -43,7 +43,7 @@ def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
                             (add (sequence "SGPR%u", 0, 101))>;
 
 // SGPR 64-bit registers
-def SGPR_64 : RegisterTuples<[sub0, sub1],
+def SGPR_64Regs : RegisterTuples<[sub0, sub1],
                              [(add (decimate (trunc SGPR_32, 101), 2)),
                               (add (decimate (shl SGPR_32, 1), 2))]>;
 
@@ -153,15 +153,17 @@ def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
   (add SGPR_32, M0Reg)
 >;
 
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
+
 def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
-  (add SGPR_64, VCCReg, EXECReg)
+  (add SGPR_64Regs, VCCReg, EXECReg)
 >;
 
-def SReg_128 : RegisterClass<"AMDGPU", [v16i8, i128], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [i128, v4i32], 128, (add SGPR_128)>;
 
-def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
 
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8], 512, (add SGPR_512)>;
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
 
 // Register class for all vector registers (VGPRs + Interploation Registers)
 def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
@@ -172,9 +174,9 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
   let Size = 96;
 }
 
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, i128], 128, (add VGPR_128)>;
 
-def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;
+def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
 
 def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
 
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
new file mode 100644
index 0000000..f194d8b
--- /dev/null
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -0,0 +1,162 @@
+//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass removes performs the following type substitution on all
+/// non-compute shaders:
+///
+/// v16i8 => i128
+///   - v16i8 is used for constant memory resource descriptors.  This type is
+///      legal for some compute APIs, and we don't want to declare it as legal
+///      in the backend, because we want the legalizer to expand all v16i8
+///      operations.
+/// v1* => *
+///   - Having v1* types complicates the legalizer and we can easily replace
+///   - them with the element type.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InstVisitor.h"
+
+using namespace llvm;
+
+namespace {
+
+class SITypeRewriter : public FunctionPass,
+                       public InstVisitor<SITypeRewriter> {
+
+  static char ID;
+  Module *Mod;
+  Type *v16i8;
+  Type *i128;
+
+public:
+  SITypeRewriter() : FunctionPass(ID) { }
+  virtual bool doInitialization(Module &M);
+  virtual bool runOnFunction(Function &F);
+  virtual const char *getPassName() const {
+    return "SI Type Rewriter";
+  }
+  void visitLoadInst(LoadInst &I);
+  void visitCallInst(CallInst &I);
+  void visitBitCast(BitCastInst &I);
+};
+
+} // End anonymous namespace
+
+char SITypeRewriter::ID = 0;
+
+bool SITypeRewriter::doInitialization(Module &M) {
+  Mod = &M;
+  v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16);
+  i128 = Type::getIntNTy(M.getContext(), 128);
+  return false;
+}
+
+bool SITypeRewriter::runOnFunction(Function &F) {
+  AttributeSet Set = F.getAttributes();
+  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, "ShaderType");
+
+  unsigned ShaderType = ShaderType::COMPUTE;
+  if (A.isStringAttribute()) {
+    StringRef Str = A.getValueAsString();
+    Str.getAsInteger(0, ShaderType);
+  }
+  if (ShaderType != ShaderType::COMPUTE) {
+    visit(F);
+  }
+
+  visit(F);
+
+  return false;
+}
+
+void SITypeRewriter::visitLoadInst(LoadInst &I) {
+  Value *Ptr = I.getPointerOperand();
+  Type *PtrTy = Ptr->getType();
+  Type *ElemTy = PtrTy->getPointerElementType();
+  IRBuilder<> Builder(&I);
+  if (ElemTy == v16i8)  {
+    Value *BitCast = Builder.CreateBitCast(Ptr, Type::getIntNPtrTy(I.getContext(), 128, 2));
+    LoadInst *Load = Builder.CreateLoad(BitCast);
+    SmallVector <std::pair<unsigned, MDNode*>, 8> MD;
+    I.getAllMetadataOtherThanDebugLoc(MD);
+    for (unsigned i = 0, e = MD.size(); i != e; ++i) {
+      Load->setMetadata(MD[i].first, MD[i].second);
+    }
+    Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType());
+    I.replaceAllUsesWith(BitCastLoad);
+    I.eraseFromParent();
+  }
+}
+
+void SITypeRewriter::visitCallInst(CallInst &I) {
+  IRBuilder<> Builder(&I);
+  SmallVector <Value*, 8> Args;
+  SmallVector <Type*, 8> Types;
+  bool NeedToReplace = false;
+  Function *F = I.getCalledFunction();
+  std::string Name = F->getName().str();
+  for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+    Value *Arg = I.getArgOperand(i);
+    if (Arg->getType() == v16i8) {
+      Args.push_back(Builder.CreateBitCast(Arg, i128));
+      Types.push_back(i128);
+      NeedToReplace = true;
+      Name = Name + ".i128";
+    } else if (Arg->getType()->isVectorTy() &&
+               Arg->getType()->getVectorNumElements() == 1 &&
+               Arg->getType()->getVectorElementType() ==
+                                              Type::getInt32Ty(I.getContext())){
+      Type *ElementTy = Arg->getType()->getVectorElementType();
+      std::string TypeName = "i32";
+      InsertElementInst *Def = dyn_cast<InsertElementInst>(Arg);
+      assert(Def);
+      Args.push_back(Def->getOperand(1));
+      Types.push_back(ElementTy);
+      std::string VecTypeName = "v1" + TypeName;
+      Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName);
+      NeedToReplace = true;
+    } else {
+      Args.push_back(Arg);
+      Types.push_back(Arg->getType());
+    }
+  }
+
+  if (!NeedToReplace) {
+    return;
+  }
+  Function *NewF = Mod->getFunction(Name);
+  if (!NewF) {
+    NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod);
+    NewF->setAttributes(F->getAttributes());
+  }
+  I.replaceAllUsesWith(Builder.CreateCall(NewF, Args));
+  I.eraseFromParent();
+}
+
+void SITypeRewriter::visitBitCast(BitCastInst &I) {
+  IRBuilder<> Builder(&I);
+  if (I.getDestTy() != i128) {
+    return;
+  }
+
+  if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) {
+    if (Op->getSrcTy() == i128) {
+      I.replaceAllUsesWith(Op->getOperand(0));
+      I.eraseFromParent();
+    }
+  }
+}
+
+FunctionPass *llvm::createSITypeRewriter() {
+  return new SITypeRewriter();
+}
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index acf7496..6339394 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -2,6 +2,7 @@ set(LLVM_TARGET_DEFINITIONS Sparc.td)
 
 tablegen(LLVM SparcGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM SparcGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM SparcGenCodeEmitter.inc -gen-emitter)
 tablegen(LLVM SparcGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM SparcGenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM SparcGenSubtargetInfo.inc -gen-subtarget)
@@ -20,6 +21,8 @@ add_llvm_target(SparcCodeGen
   SparcSubtarget.cpp
   SparcTargetMachine.cpp
   SparcSelectionDAGInfo.cpp
+  SparcJITInfo.cpp
+  SparcCodeEmitter.cpp
   )
 
 add_dependencies(LLVMSparcCodeGen SparcCommonTableGen intrinsics_gen)
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index b101751..9a0466a 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -14,6 +14,7 @@
 
 #define DEBUG_TYPE "delay-slot-filler"
 #include "Sparc.h"
+#include "SparcSubtarget.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -39,10 +40,13 @@ namespace {
     /// layout, etc.
     ///
     TargetMachine &TM;
+    const SparcSubtarget *Subtarget;
 
     static char ID;
     Filler(TargetMachine &tm)
-      : MachineFunctionPass(ID), TM(tm) { }
+      : MachineFunctionPass(ID), TM(tm),
+        Subtarget(&TM.getSubtarget<SparcSubtarget>()) {
+    }
 
     virtual const char *getPassName() const {
       return "SPARC Delay Slot Filler";
@@ -102,6 +106,8 @@ FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
 bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   bool Changed = false;
 
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+
   for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
     MachineBasicBlock::iterator MI = I;
     ++I;
@@ -114,6 +120,14 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
       continue;
     }
 
+    if (!Subtarget->isV9() &&
+        (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD
+         || MI->getOpcode() == SP::FCMPQ)) {
+      BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
+      Changed = true;
+      continue;
+    }
+
     // If MI has no delay slot, skip.
     if (!MI->hasDelaySlot())
       continue;
@@ -126,7 +140,6 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
     ++FilledSlots;
     Changed = true;
 
-    const TargetInstrInfo *TII = TM.getInstrInfo();
     if (D == MBB.end())
       BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
     else
@@ -156,7 +169,7 @@ Filler::findDelayInstr(MachineBasicBlock &MBB,
   if (slot == MBB.begin())
     return MBB.end();
 
-  if (slot->getOpcode() == SP::RET)
+  if (slot->getOpcode() == SP::RET || slot->getOpcode() == SP::TLS_CALL)
     return MBB.end();
 
   if (slot->getOpcode() == SP::RETL) {
@@ -342,6 +355,7 @@ bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
   case SP::CALL: structSizeOpNum = 1; break;
   case SP::JMPLrr:
   case SP::JMPLri: structSizeOpNum = 2; break;
+  case SP::TLS_CALL: return false;
   }
 
   const MachineOperand &MO = I->getOperand(structSizeOpNum);
diff --git a/lib/Target/Sparc/LLVMBuild.txt b/lib/Target/Sparc/LLVMBuild.txt
index 7d54d32..fd8e5d9 100644
--- a/lib/Target/Sparc/LLVMBuild.txt
+++ b/lib/Target/Sparc/LLVMBuild.txt
@@ -23,6 +23,7 @@ type = TargetGroup
 name = Sparc
 parent = Target
 has_asmprinter = 1
+has_jit = 1
 
 [component_1]
 type = Library
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
index aac0e8d..f3caeaa 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
@@ -53,7 +53,27 @@ enum TOF {
 
   // Extract bits 41-32 of an address.
   // Assembler: %hm(addr)
-  MO_HM
+  MO_HM,
+
+  // TargetFlags for Thread Local Storage.
+  MO_TLS_GD_HI22,
+  MO_TLS_GD_LO10,
+  MO_TLS_GD_ADD,
+  MO_TLS_GD_CALL,
+  MO_TLS_LDM_HI22,
+  MO_TLS_LDM_LO10,
+  MO_TLS_LDM_ADD,
+  MO_TLS_LDM_CALL,
+  MO_TLS_LDO_HIX22,
+  MO_TLS_LDO_LOX10,
+  MO_TLS_LDO_ADD,
+  MO_TLS_IE_HI22,
+  MO_TLS_IE_LO10,
+  MO_TLS_IE_LD,
+  MO_TLS_IE_LDX,
+  MO_TLS_IE_ADD,
+  MO_TLS_LE_HIX22,
+  MO_TLS_LE_LOX10
 };
 
 } // end namespace SPII
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 5a52abe..baac36b 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -21,23 +21,26 @@ void SparcELFMCAsmInfo::anchor() { }
 SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) {
   IsLittleEndian = false;
   Triple TheTriple(TT);
-  if (TheTriple.getArch() == Triple::sparcv9) {
+  bool isV9 = (TheTriple.getArch() == Triple::sparcv9);
+
+  if (isV9) {
     PointerSize = CalleeSaveStackSlotSize = 8;
   }
 
   Data16bitsDirective = "\t.half\t";
   Data32bitsDirective = "\t.word\t";
-  Data64bitsDirective = 0;  // .xword is only supported by V9.
+  // .xword is only supported by V9.
+  Data64bitsDirective = (isV9) ? "\t.xword\t" : 0;
   ZeroDirective = "\t.skip\t";
   CommentString = "!";
   HasLEB128 = true;
   SupportsDebugInformation = true;
-  
+
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
   SunStyleELFSectionSwitchSyntax = true;
   UsesELFSectionDirectiveForBSS = true;
 
-  WeakRefDirective = "\t.weak\t";
-
   PrivateGlobalPrefix = ".L";
 }
 
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index 621e8ff..1e58e37 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -14,12 +14,12 @@
 #ifndef SPARCTARGETASMINFO_H
 #define SPARCTARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
   class StringRef;
 
-  class SparcELFMCAsmInfo : public MCAsmInfo {
+  class SparcELFMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
     explicit SparcELFMCAsmInfo(StringRef TT);
diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile
index 4b81ada..c171db7 100644
--- a/lib/Target/Sparc/Makefile
+++ b/lib/Target/Sparc/Makefile
@@ -14,7 +14,8 @@ TARGET = Sparc
 # Make sure that tblgen is run, first thing.
 BUILT_SOURCES = SparcGenRegisterInfo.inc SparcGenInstrInfo.inc \
 		SparcGenAsmWriter.inc SparcGenDAGISel.inc \
-		SparcGenSubtargetInfo.inc SparcGenCallingConv.inc
+		SparcGenSubtargetInfo.inc SparcGenCallingConv.inc \
+		SparcGenCodeEmitter.inc
 
 DIRS = TargetInfo MCTargetDesc
 
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
index 98563db..f44b604 100644
--- a/lib/Target/Sparc/Sparc.h
+++ b/lib/Target/Sparc/Sparc.h
@@ -26,6 +26,8 @@ namespace llvm {
 
   FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
   FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+  FunctionPass *createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
+                                              JITCodeEmitter &JCE);
 
 } // end namespace llvm;
 
@@ -103,5 +105,22 @@ namespace llvm {
     }
     llvm_unreachable("Invalid cond code");
   }
+
+  inline static unsigned HI22(int64_t imm) {
+    return (unsigned)((imm >> 10) & ((1 << 22)-1));
+  }
+
+  inline static unsigned LO10(int64_t imm) {
+    return (unsigned)(imm & 0x3FF);
+  }
+
+  inline static unsigned HIX22(int64_t imm) {
+    return HI22(~imm);
+  }
+
+  inline static unsigned LOX10(int64_t imm) {
+    return ~LO10(~imm);
+  }
+
 }  // end namespace llvm
 #endif
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
index d42c40f..0df48f6 100644
--- a/lib/Target/Sparc/Sparc.td
+++ b/lib/Target/Sparc/Sparc.td
@@ -30,6 +30,10 @@ def FeatureVIS
   : SubtargetFeature<"vis", "IsVIS", "true",
                      "Enable UltraSPARC Visual Instruction Set extensions">;
 
+def FeatureHardQuad
+  : SubtargetFeature<"hard-quad-float", "HasHardQuad", "true",
+                     "Enable quad-word floating point instructions">;
+
 //===----------------------------------------------------------------------===//
 // Register File, Calling Conv, Instruction Descriptions
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index 3fe2b44..d06c894 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -43,6 +44,7 @@ namespace {
                          const char *Modifier = 0);
     void printCCOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
 
+    virtual void EmitFunctionBodyStart();
     virtual void EmitInstruction(const MachineInstr *MI) {
       SmallString<128> Str;
       raw_svector_ostream OS(Str);
@@ -63,11 +65,35 @@ namespace {
 
     virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
                        const;
+    void EmitGlobalRegisterDecl(unsigned reg) {
+      SmallString<128> Str;
+      raw_svector_ostream OS(Str);
+      OS << "\t.register "
+         << "%" << StringRef(getRegisterName(reg)).lower()
+         << ", "
+         << ((reg == SP::G6 || reg == SP::G7)? "#ignore" : "#scratch");
+      OutStreamer.EmitRawText(OS.str());
+    }
+
   };
 } // end of anonymous namespace
 
 #include "SparcGenAsmWriter.inc"
 
+void SparcAsmPrinter::EmitFunctionBodyStart() {
+  if (!TM.getSubtarget<SparcSubtarget>().is64Bit())
+    return;
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const unsigned globalRegs[] = { SP::G2, SP::G3, SP::G6, SP::G7, 0 };
+  for (unsigned i = 0; globalRegs[i] != 0; ++i) {
+    unsigned reg = globalRegs[i];
+    if (MRI.use_empty(reg))
+      continue;
+    EmitGlobalRegisterDecl(reg);
+  }
+}
+
 void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
                                    raw_ostream &O) {
   const MachineOperand &MO = MI->getOperand (opNum);
@@ -79,11 +105,37 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
       assert(TF == SPII::MO_NO_FLAG &&
              "Cannot handle target flags on call address");
     else if (MI->getOpcode() == SP::SETHIi)
-      assert((TF == SPII::MO_HI || TF == SPII::MO_H44 || TF == SPII::MO_HH) &&
+      assert((TF == SPII::MO_HI || TF == SPII::MO_H44 || TF == SPII::MO_HH
+              || TF == SPII::MO_TLS_GD_HI22
+              || TF == SPII::MO_TLS_LDM_HI22
+              || TF == SPII::MO_TLS_LDO_HIX22
+              || TF == SPII::MO_TLS_IE_HI22
+              || TF == SPII::MO_TLS_LE_HIX22) &&
              "Invalid target flags for address operand on sethi");
+    else if (MI->getOpcode() == SP::TLS_CALL)
+      assert((TF == SPII::MO_NO_FLAG
+              || TF == SPII::MO_TLS_GD_CALL
+              || TF == SPII::MO_TLS_LDM_CALL) &&
+             "Cannot handle target flags on tls call address");
+    else if (MI->getOpcode() == SP::TLS_ADDrr)
+      assert((TF == SPII::MO_TLS_GD_ADD || TF == SPII::MO_TLS_LDM_ADD
+              || TF == SPII::MO_TLS_LDO_ADD || TF == SPII::MO_TLS_IE_ADD) &&
+             "Cannot handle target flags on add for TLS");
+    else if (MI->getOpcode() == SP::TLS_LDrr)
+      assert(TF == SPII::MO_TLS_IE_LD &&
+             "Cannot handle target flags on ld for TLS");
+    else if (MI->getOpcode() == SP::TLS_LDXrr)
+      assert(TF == SPII::MO_TLS_IE_LDX &&
+             "Cannot handle target flags on ldx for TLS");
+    else if (MI->getOpcode() == SP::XORri)
+      assert((TF == SPII::MO_TLS_LDO_LOX10 || TF == SPII::MO_TLS_LE_LOX10) &&
+             "Cannot handle target flags on xor for TLS");
     else
-      assert((TF == SPII::MO_LO || TF == SPII::MO_M44 || TF == SPII::MO_L44 ||
-              TF == SPII::MO_HM) &&
+      assert((TF == SPII::MO_LO || TF == SPII::MO_M44 || TF == SPII::MO_L44
+              || TF == SPII::MO_HM
+              || TF == SPII::MO_TLS_GD_LO10
+              || TF == SPII::MO_TLS_LDM_LO10
+              || TF == SPII::MO_TLS_IE_LO10 ) &&
              "Invalid target flags for small address operand");
   }
 #endif
@@ -102,6 +154,24 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
   case SPII::MO_L44: O << "%l44("; break;
   case SPII::MO_HH:  O << "%hh(";  break;
   case SPII::MO_HM:  O << "%hm(";  break;
+  case SPII::MO_TLS_GD_HI22:   O << "%tgd_hi22(";   break;
+  case SPII::MO_TLS_GD_LO10:   O << "%tgd_lo10(";   break;
+  case SPII::MO_TLS_GD_ADD:    O << "%tgd_add(";    break;
+  case SPII::MO_TLS_GD_CALL:   O << "%tgd_call(";   break;
+  case SPII::MO_TLS_LDM_HI22:  O << "%tldm_hi22(";  break;
+  case SPII::MO_TLS_LDM_LO10:  O << "%tldm_lo10(";  break;
+  case SPII::MO_TLS_LDM_ADD:   O << "%tldm_add(";   break;
+  case SPII::MO_TLS_LDM_CALL:  O << "%tldm_call(";  break;
+  case SPII::MO_TLS_LDO_HIX22: O << "%tldo_hix22("; break;
+  case SPII::MO_TLS_LDO_LOX10: O << "%tldo_lox10("; break;
+  case SPII::MO_TLS_LDO_ADD:   O << "%tldo_add(";   break;
+  case SPII::MO_TLS_IE_HI22:   O << "%tie_hi22(";   break;
+  case SPII::MO_TLS_IE_LO10:   O << "%tie_lo10(";   break;
+  case SPII::MO_TLS_IE_LD:     O << "%tie_ld(";     break;
+  case SPII::MO_TLS_IE_LDX:    O << "%tie_ldx(";    break;
+  case SPII::MO_TLS_IE_ADD:    O << "%tie_add(";    break;
+  case SPII::MO_TLS_LE_HIX22:  O << "%tle_hix22(";  break;
+  case SPII::MO_TLS_LE_LOX10:  O << "%tle_lox10(";   break;
   }
 
   switch (MO.getType()) {
@@ -116,7 +186,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
     O << *MO.getMBB()->getSymbol();
     return;
   case MachineOperand::MO_GlobalAddress:
-    O << *Mang->getSymbol(MO.getGlobal());
+    O << *getSymbol(MO.getGlobal());
     break;
   case MachineOperand::MO_BlockAddress:
     O <<  GetBlockAddressSymbol(MO.getBlockAddress())->getName();
diff --git a/lib/Target/Sparc/SparcCallingConv.td b/lib/Target/Sparc/SparcCallingConv.td
index a181bcf..acd4ec2 100644
--- a/lib/Target/Sparc/SparcCallingConv.td
+++ b/lib/Target/Sparc/SparcCallingConv.td
@@ -117,3 +117,14 @@ def CC_Sparc64 : CallingConv<[
   // arguments whether they are passed in registers or not.
   CCCustom<"CC_Sparc64_Full">
 ]>;
+
+// Callee-saved registers are handled by the register window mechanism.
+def CSR : CalleeSavedRegs<(add)> {
+  let OtherPreserved = (add (sequence "I%u", 0, 7),
+                            (sequence "L%u", 0, 7));
+}
+
+// Callee-saved registers for calls with ReturnsTwice attribute.
+def RTCSR : CalleeSavedRegs<(add)> {
+  let OtherPreserved = (add I6, I7);
+}
diff --git a/lib/Target/Sparc/SparcCodeEmitter.cpp b/lib/Target/Sparc/SparcCodeEmitter.cpp
new file mode 100644
index 0000000..9bfe31f
--- /dev/null
+++ b/lib/Target/Sparc/SparcCodeEmitter.cpp
@@ -0,0 +1,245 @@
+//===-- Sparc/SparcCodeEmitter.cpp - Convert Sparc Code to Machine Code ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Sparc machine instructions
+// into relocatable machine code.
+//
+//===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "Sparc.h"
+#include "MCTargetDesc/SparcBaseInfo.h"
+#include "SparcRelocations.h"
+#include "SparcTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+class SparcCodeEmitter : public MachineFunctionPass {
+  SparcJITInfo *JTI;
+  const SparcInstrInfo *II;
+  const DataLayout *TD;
+  const SparcSubtarget *Subtarget;
+  TargetMachine &TM;
+  JITCodeEmitter &MCE;
+  const std::vector<MachineConstantPoolEntry> *MCPEs;
+  bool IsPIC;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<MachineModuleInfo> ();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  static char ID;
+
+public:
+  SparcCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+    : MachineFunctionPass(ID), JTI(0), II(0), TD(0),
+      TM(tm), MCE(mce), MCPEs(0),
+      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+  bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual const char *getPassName() const {
+    return "Sparc Machine Code Emitter";
+  }
+
+  /// getBinaryCodeForInstr - This function, generated by the
+  /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+  /// machine instructions.
+  uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+  void emitInstruction(MachineBasicBlock::instr_iterator MI,
+                       MachineBasicBlock &MBB);
+
+private:
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MachineInstr &MI,
+                             const MachineOperand &MO) const;
+
+  void emitWord(unsigned Word);
+
+  unsigned getRelocation(const MachineInstr &MI,
+                         const MachineOperand &MO) const;
+
+  void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc) const;
+  void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+  void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+  void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const;
+};
+}  // end anonymous namespace.
+
+char SparcCodeEmitter::ID = 0;
+
+bool SparcCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  SparcTargetMachine &Target = static_cast<SparcTargetMachine &>(
+                                const_cast<TargetMachine &>(MF.getTarget()));
+
+  JTI = Target.getJITInfo();
+  II = Target.getInstrInfo();
+  TD = Target.getDataLayout();
+  Subtarget = &TM.getSubtarget<SparcSubtarget> ();
+  MCPEs = &MF.getConstantPool()->getConstants();
+  JTI->Initialize(MF, IsPIC);
+  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
+
+  do {
+    DEBUG(errs() << "JITTing function '"
+        << MF.getName() << "'\n");
+    MCE.startFunction(MF);
+
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+        MBB != E; ++MBB){
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+           E = MBB->instr_end(); I != E;)
+        emitInstruction(*I++, *MBB);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+void SparcCodeEmitter::emitInstruction(MachineBasicBlock::instr_iterator MI,
+                                      MachineBasicBlock &MBB) {
+  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << *MI);
+
+  MCE.processDebugLoc(MI->getDebugLoc(), true);
+
+  ++NumEmitted;
+
+  switch (MI->getOpcode()) {
+  default: {
+    emitWord(getBinaryCodeForInstr(*MI));
+    break;
+  }
+  case TargetOpcode::INLINEASM: {
+    // We allow inline assembler nodes with empty bodies - they can
+    // implicitly define registers, which is ok for JIT.
+    if (MI->getOperand(0).getSymbolName()[0]) {
+      report_fatal_error("JIT does not support inline asm!");
+    }
+    break;
+  }
+  case TargetOpcode::PROLOG_LABEL:
+  case TargetOpcode::EH_LABEL: {
+    MCE.emitLabel(MI->getOperand(0).getMCSymbol());
+    break;
+  }
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL: {
+    // Do nothing.
+    break;
+  }
+  case SP::GETPCX: {
+    report_fatal_error("JIT does not support pseudo instruction GETPCX yet!");
+    break;
+  }
+  }
+
+  MCE.processDebugLoc(MI->getDebugLoc(), false);
+}
+
+void SparcCodeEmitter::emitWord(unsigned Word) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Word) << "\n");
+  MCE.emitWordBE(Word);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned SparcCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                             const MachineOperand &MO) const {
+  if (MO.isReg())
+    return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+  else if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO));
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
+  else if (MO.isCPI())
+    emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+  else
+    llvm_unreachable("Unable to encode MachineOperand!");
+  return 0;
+}
+unsigned SparcCodeEmitter::getRelocation(const MachineInstr &MI,
+                                         const MachineOperand &MO) const {
+
+  unsigned TF = MO.getTargetFlags();
+  switch (TF) {
+  default:
+  case SPII::MO_NO_FLAG: break;
+  case SPII::MO_LO: return SP::reloc_sparc_lo;
+  case SPII::MO_HI: return SP::reloc_sparc_hi;
+  case SPII::MO_H44:
+  case SPII::MO_M44:
+  case SPII::MO_L44:
+  case SPII::MO_HH:
+  case SPII::MO_HM: assert(0 && "FIXME: Implement Medium/Large code model.");
+  }
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default: break;
+  case SP::CALL:    return SP::reloc_sparc_pc30;
+  case SP::BA:
+  case SP::BCOND:
+  case SP::FBCOND:  return SP::reloc_sparc_pc22;
+  case SP::BPXCC:   return SP::reloc_sparc_pc19;
+  }
+  llvm_unreachable("unknown reloc!");
+}
+
+void SparcCodeEmitter::emitGlobalAddress(const GlobalValue *GV,
+                                        unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                                             const_cast<GlobalValue *>(GV), 0,
+                                             true));
+}
+
+void SparcCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES, 0, 0));
+}
+
+void SparcCodeEmitter::
+emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, 0, false));
+}
+
+void SparcCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+                                            unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             Reloc, BB));
+}
+
+
+/// createSparcJITCodeEmitterPass - Return a pass that emits the collected Sparc
+/// code to the specified MCE object.
+FunctionPass *llvm::createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
+                                                 JITCodeEmitter &JCE) {
+  return new SparcCodeEmitter(TM, JCE);
+}
+
+#include "SparcGenCodeEmitter.inc"
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 536e466..c75998a 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -33,6 +33,51 @@ DisableLeafProc("disable-sparc-leaf-proc",
                 cl::Hidden);
 
 
+void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          int NumBytes,
+                                          unsigned ADDrr,
+                                          unsigned ADDri) const {
+
+  DebugLoc dl = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc();
+  const SparcInstrInfo &TII =
+    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+
+  if (NumBytes >= -4096 && NumBytes < 4096) {
+    BuildMI(MBB, MBBI, dl, TII.get(ADDri), SP::O6)
+      .addReg(SP::O6).addImm(NumBytes);
+    return;
+  }
+
+  // Emit this the hard way.  This clobbers G1 which we always know is
+  // available here.
+  if (NumBytes >= 0) {
+    // Emit nonnegative numbers with sethi + or.
+    // sethi %hi(NumBytes), %g1
+    // or %g1, %lo(NumBytes), %g1
+    // add %sp, %g1, %sp
+    BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+      .addImm(HI22(NumBytes));
+    BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
+      .addReg(SP::G1).addImm(LO10(NumBytes));
+    BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+      .addReg(SP::O6).addReg(SP::G1);
+    return ;
+  }
+
+  // Emit negative numbers with sethi + xor.
+  // sethi %hix(NumBytes), %g1
+  // xor %g1, %lox(NumBytes), %g1
+  // add %sp, %g1, %sp
+  BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+    .addImm(HIX22(NumBytes));
+  BuildMI(MBB, MBBI, dl, TII.get(SP::XORri), SP::G1)
+    .addReg(SP::G1).addImm(LOX10(NumBytes));
+  BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+    .addReg(SP::O6).addReg(SP::G1);
+}
+
 void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
   SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
 
@@ -55,21 +100,27 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
     SAVErr = SP::ADDrr;
   }
   NumBytes = - SubTarget.getAdjustedFrameSize(NumBytes);
-
-  if (NumBytes >= -4096) {
-    BuildMI(MBB, MBBI, dl, TII.get(SAVEri), SP::O6)
-      .addReg(SP::O6).addImm(NumBytes);
-  } else {
-    // Emit this the hard way.  This clobbers G1 which we always know is
-    // available here.
-    unsigned OffHi = (unsigned)NumBytes >> 10U;
-    BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
-    // Emit G1 = G1 + I6
-    BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
-      .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
-    BuildMI(MBB, MBBI, dl, TII.get(SAVErr), SP::O6)
-      .addReg(SP::O6).addReg(SP::G1);
-  }
+  emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri);
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+  BuildMI(MBB, MBBI, dl, TII.get(SP::PROLOG_LABEL)).addSym(FrameLabel);
+
+  unsigned regFP = MRI->getDwarfRegNum(SP::I6, true);
+
+  // Emit ".cfi_def_cfa_register 30".
+  MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(FrameLabel,
+                                                          regFP));
+  // Emit ".cfi_window_save".
+  MMI.addFrameInst(MCCFIInstruction::createWindowSave(FrameLabel));
+
+  unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true);
+  unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true);
+  // Emit ".cfi_register 15, 31".
+  MMI.addFrameInst(MCCFIInstruction::createRegister(FrameLabel,
+                                                    regOutRA,
+                                                    regInRA));
 }
 
 void SparcFrameLowering::
@@ -77,15 +128,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   if (!hasReservedCallFrame(MF)) {
     MachineInstr &MI = *I;
-    DebugLoc DL = MI.getDebugLoc();
     int Size = MI.getOperand(0).getImm();
     if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
       Size = -Size;
-    const SparcInstrInfo &TII =
-      *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+
     if (Size)
-      BuildMI(MBB, I, DL, TII.get(SP::ADDri), SP::O6).addReg(SP::O6)
-        .addImm(Size);
+      emitSPAdjustment(MF, MBB, I, Size, SP::ADDrr, SP::ADDri);
   }
   MBB.erase(I);
 }
@@ -112,21 +160,7 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
     return;
 
   NumBytes = SubTarget.getAdjustedFrameSize(NumBytes);
-
-  if (NumBytes < 4096) {
-    BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), SP::O6)
-      .addReg(SP::O6).addImm(NumBytes);
-  } else {
-    // Emit this the hard way.  This clobbers G1 which we always know is
-    // available here.
-    unsigned OffHi = (unsigned)NumBytes >> 10U;
-    BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
-    // Emit G1 = G1 + I6
-    BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
-      .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
-    BuildMI(MBB, MBBI, dl, TII.get(SP::ADDrr), SP::O6)
-      .addReg(SP::O6).addReg(SP::G1);
-  }
+  emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri);
 }
 
 bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index 8eaef59..072fde3 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -49,6 +49,14 @@ private:
 
   // Returns true if MF is a leaf procedure.
   bool isLeafProc(MachineFunction &MF) const;
+
+
+  // Emits code for adjusting SP in function prologue/epilogue.
+  void emitSPAdjustment(MachineFunction &MF,
+                        MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI,
+                        int NumBytes, unsigned ADDrr, unsigned ADDri) const;
+
 };
 
 } // End llvm namespace
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index db62151..b012bfd 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -80,7 +80,8 @@ bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
     return true;
   }
   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress)
+      Addr.getOpcode() == ISD::TargetGlobalAddress ||
+      Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
     return false;  // direct calls.
 
   if (Addr.getOpcode() == ISD::ADD) {
@@ -117,7 +118,8 @@ bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
 bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
   if (Addr.getOpcode() == ISD::FrameIndex) return false;
   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress)
+      Addr.getOpcode() == ISD::TargetGlobalAddress ||
+      Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
     return false;  // direct calls.
 
   if (Addr.getOpcode() == ISD::ADD) {
@@ -139,8 +141,10 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
 
 SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
   SDLoc dl(N);
-  if (N->isMachineOpcode())
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL;   // Already selected.
+  }
 
   switch (N->getOpcode()) {
   default: break;
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 4b0fa67..64625f7 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -14,6 +14,7 @@
 
 #include "SparcISelLowering.h"
 #include "SparcMachineFunctionInfo.h"
+#include "SparcRegisterInfo.h"
 #include "SparcTargetMachine.h"
 #include "MCTargetDesc/SparcBaseInfo.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -648,6 +649,27 @@ SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   return LowerCall_32(CLI, InVals);
 }
 
+static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee,
+                                     ImmutableCallSite *CS) {
+  if (CS)
+    return CS->hasFnAttr(Attribute::ReturnsTwice);
+
+  const Function *CalleeFn = 0;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    CalleeFn = dyn_cast<Function>(G->getGlobal());
+  } else if (ExternalSymbolSDNode *E =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const Function *Fn = DAG.getMachineFunction().getFunction();
+    const Module *M = Fn->getParent();
+    const char *CalleeName = E->getSymbol();
+    CalleeFn = M->getFunction(CalleeName);
+  }
+
+  if (!CalleeFn)
+    return false;
+  return CalleeFn->hasFnAttribute(Attribute::ReturnsTwice);
+}
+
 // Lower a call for the 32-bit ABI.
 SDValue
 SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
@@ -861,6 +883,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
   }
 
   unsigned SRetArgSize = (hasStructRetAttr)? getSRetArgSize(DAG, Callee):0;
+  bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
 
   // If the callee is a GlobalAddress node (quite common, every direct call is)
   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
@@ -880,6 +903,16 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
     Ops.push_back(DAG.getRegister(toCallerWindow(RegsToPass[i].first),
                                   RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const SparcRegisterInfo *TRI =
+    ((const SparcTargetMachine&)getTargetMachine()).getRegisterInfo();
+  const uint32_t *Mask = ((hasReturnsTwice)
+                          ? TRI->getRTCallPreservedMask(CallConv)
+                          : TRI->getCallPreservedMask(CallConv));
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 
@@ -908,6 +941,23 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
   return Chain;
 }
 
+// This functions returns true if CalleeName is a ABI function that returns
+// a long double (fp128).
+static bool isFP128ABICall(const char *CalleeName)
+{
+  static const char *const ABICalls[] =
+    {  "_Q_add", "_Q_sub", "_Q_mul", "_Q_div",
+       "_Q_sqrt", "_Q_neg",
+       "_Q_itoq", "_Q_stoq", "_Q_dtoq", "_Q_utoq",
+       "_Q_lltoq", "_Q_ulltoq",
+       0
+    };
+  for (const char * const *I = ABICalls; *I != 0; ++I)
+    if (strcmp(CalleeName, *I) == 0)
+      return true;
+  return false;
+}
+
 unsigned
 SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
 {
@@ -918,7 +968,10 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
              dyn_cast<ExternalSymbolSDNode>(Callee)) {
     const Function *Fn = DAG.getMachineFunction().getFunction();
     const Module *M = Fn->getParent();
-    CalleeFn = M->getFunction(E->getSymbol());
+    const char *CalleeName = E->getSymbol();
+    CalleeFn = M->getFunction(CalleeName);
+    if (!CalleeFn && isFP128ABICall(CalleeName))
+      return 16; // Return sizeof(fp128)
   }
 
   if (!CalleeFn)
@@ -983,6 +1036,9 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
   SDLoc DL = CLI.DL;
   SDValue Chain = CLI.Chain;
 
+  // Sparc target does not yet support tail call optimization.
+  CLI.IsTailCall = false;
+
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
@@ -1099,6 +1155,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
   // Likewise ExternalSymbol -> TargetExternalSymbol.
   SDValue Callee = CLI.Callee;
+  bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
     Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy());
   else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
@@ -1112,6 +1169,15 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                   RegsToPass[i].second.getValueType()));
 
+  // Add a register mask operand representing the call-preserved registers.
+  const SparcRegisterInfo *TRI =
+    ((const SparcTargetMachine&)getTargetMachine()).getRegisterInfo();
+  const uint32_t *Mask = ((hasReturnsTwice)
+                          ? TRI->getRTCallPreservedMask(CLI.CallConv)
+                          : TRI->getCallPreservedMask(CLI.CallConv));
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
   // Make sure the CopyToReg nodes are glued to the call instruction which
   // consumes the registers.
   if (InGlue.getNode())
@@ -1244,15 +1310,21 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
   addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
   addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
+  addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
   if (Subtarget->is64Bit())
     addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
 
   // Turn FP extload into load/fextend
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+
   // Sparc doesn't have i1 sign extending load
   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
   // Turn FP truncstore into trunc + store.
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
 
   // Custom legalize GlobalAddress nodes into LO/HI parts.
   setOperationAction(ISD::GlobalAddress, getPointerTy(), Custom);
@@ -1271,13 +1343,25 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
 
+  // ... nor does SparcV9.
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UREM, MVT::i64, Expand);
+    setOperationAction(ISD::SREM, MVT::i64, Expand);
+    setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  }
+
   // Custom expand fp<->sint
   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 
-  // Expand fp<->uint
-  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+  // Custom Expand fp<->uint
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 
   setOperationAction(ISD::BITCAST, MVT::f32, Expand);
   setOperationAction(ISD::BITCAST, MVT::i32, Expand);
@@ -1286,9 +1370,12 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::SELECT, MVT::i32, Expand);
   setOperationAction(ISD::SELECT, MVT::f32, Expand);
   setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT, MVT::f128, Expand);
+
   setOperationAction(ISD::SETCC, MVT::i32, Expand);
   setOperationAction(ISD::SETCC, MVT::f32, Expand);
   setOperationAction(ISD::SETCC, MVT::f64, Expand);
+  setOperationAction(ISD::SETCC, MVT::f128, Expand);
 
   // Sparc doesn't have BRCOND either, it has BR_CC.
   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
@@ -1297,18 +1384,34 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
 
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
 
   if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ADDC, MVT::i64, Custom);
+    setOperationAction(ISD::ADDE, MVT::i64, Custom);
+    setOperationAction(ISD::SUBC, MVT::i64, Custom);
+    setOperationAction(ISD::SUBE, MVT::i64, Custom);
     setOperationAction(ISD::BITCAST, MVT::f64, Expand);
     setOperationAction(ISD::BITCAST, MVT::i64, Expand);
     setOperationAction(ISD::SELECT, MVT::i64, Expand);
     setOperationAction(ISD::SETCC, MVT::i64, Expand);
     setOperationAction(ISD::BR_CC, MVT::i64, Custom);
     setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+
+    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+    setOperationAction(ISD::CTTZ , MVT::i64, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+    setOperationAction(ISD::CTLZ , MVT::i64, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+    setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+    setOperationAction(ISD::ROTL , MVT::i64, Expand);
+    setOperationAction(ISD::ROTR , MVT::i64, Expand);
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
   }
 
   // FIXME: There are instructions available for ATOMIC_FENCE
@@ -1321,6 +1424,11 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FABS, MVT::f64, Custom);
   }
 
+  setOperationAction(ISD::FSIN , MVT::f128, Expand);
+  setOperationAction(ISD::FCOS , MVT::f128, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FREM , MVT::f128, Expand);
+  setOperationAction(ISD::FMA  , MVT::f128, Expand);
   setOperationAction(ISD::FSIN , MVT::f64, Expand);
   setOperationAction(ISD::FCOS , MVT::f64, Expand);
   setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
@@ -1339,8 +1447,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::ROTL , MVT::i32, Expand);
   setOperationAction(ISD::ROTR , MVT::i32, Expand);
   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f128, Expand);
   setOperationAction(ISD::FPOW , MVT::f64, Expand);
   setOperationAction(ISD::FPOW , MVT::f32, Expand);
 
@@ -1352,7 +1462,12 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
 
-  setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::MULHU,     MVT::i64, Expand);
+    setOperationAction(ISD::MULHS,     MVT::i64, Expand);
+  }
 
   // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
@@ -1366,14 +1481,100 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
 
-  // No debug info support yet.
-  setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+  setExceptionPointerRegister(SP::I0);
+  setExceptionSelectorRegister(SP::I1);
 
   setStackPointerRegisterToSaveRestore(SP::O6);
 
   if (Subtarget->isV9())
     setOperationAction(ISD::CTPOP, MVT::i32, Legal);
 
+  if (Subtarget->isV9() && Subtarget->hasHardQuad()) {
+    setOperationAction(ISD::LOAD, MVT::f128, Legal);
+    setOperationAction(ISD::STORE, MVT::f128, Legal);
+  } else {
+    setOperationAction(ISD::LOAD, MVT::f128, Custom);
+    setOperationAction(ISD::STORE, MVT::f128, Custom);
+  }
+
+  if (Subtarget->hasHardQuad()) {
+    setOperationAction(ISD::FADD,  MVT::f128, Legal);
+    setOperationAction(ISD::FSUB,  MVT::f128, Legal);
+    setOperationAction(ISD::FMUL,  MVT::f128, Legal);
+    setOperationAction(ISD::FDIV,  MVT::f128, Legal);
+    setOperationAction(ISD::FSQRT, MVT::f128, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+    setOperationAction(ISD::FP_ROUND,  MVT::f64, Legal);
+    if (Subtarget->isV9()) {
+      setOperationAction(ISD::FNEG, MVT::f128, Legal);
+      setOperationAction(ISD::FABS, MVT::f128, Legal);
+    } else {
+      setOperationAction(ISD::FNEG, MVT::f128, Custom);
+      setOperationAction(ISD::FABS, MVT::f128, Custom);
+    }
+
+    if (!Subtarget->is64Bit()) {
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+    }
+
+  } else {
+    // Custom legalize f128 operations.
+
+    setOperationAction(ISD::FADD,  MVT::f128, Custom);
+    setOperationAction(ISD::FSUB,  MVT::f128, Custom);
+    setOperationAction(ISD::FMUL,  MVT::f128, Custom);
+    setOperationAction(ISD::FDIV,  MVT::f128, Custom);
+    setOperationAction(ISD::FSQRT, MVT::f128, Custom);
+    setOperationAction(ISD::FNEG,  MVT::f128, Custom);
+    setOperationAction(ISD::FABS,  MVT::f128, Custom);
+
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+    setOperationAction(ISD::FP_ROUND,  MVT::f64, Custom);
+    setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
+
+    // Setup Runtime library names.
+    if (Subtarget->is64Bit()) {
+      setLibcallName(RTLIB::ADD_F128,  "_Qp_add");
+      setLibcallName(RTLIB::SUB_F128,  "_Qp_sub");
+      setLibcallName(RTLIB::MUL_F128,  "_Qp_mul");
+      setLibcallName(RTLIB::DIV_F128,  "_Qp_div");
+      setLibcallName(RTLIB::SQRT_F128, "_Qp_sqrt");
+      setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Qp_qtoi");
+      setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Qp_qtoui");
+      setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Qp_itoq");
+      setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Qp_uitoq");
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Qp_qtox");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Qp_qtoux");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Qp_xtoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Qp_uxtoq");
+      setLibcallName(RTLIB::FPEXT_F32_F128, "_Qp_stoq");
+      setLibcallName(RTLIB::FPEXT_F64_F128, "_Qp_dtoq");
+      setLibcallName(RTLIB::FPROUND_F128_F32, "_Qp_qtos");
+      setLibcallName(RTLIB::FPROUND_F128_F64, "_Qp_qtod");
+    } else {
+      setLibcallName(RTLIB::ADD_F128,  "_Q_add");
+      setLibcallName(RTLIB::SUB_F128,  "_Q_sub");
+      setLibcallName(RTLIB::MUL_F128,  "_Q_mul");
+      setLibcallName(RTLIB::DIV_F128,  "_Q_div");
+      setLibcallName(RTLIB::SQRT_F128, "_Q_sqrt");
+      setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Q_qtoi");
+      setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Q_qtou");
+      setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Q_itoq");
+      setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Q_utoq");
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+      setLibcallName(RTLIB::FPEXT_F32_F128, "_Q_stoq");
+      setLibcallName(RTLIB::FPEXT_F64_F128, "_Q_dtoq");
+      setLibcallName(RTLIB::FPROUND_F128_F32, "_Q_qtos");
+      setLibcallName(RTLIB::FPROUND_F128_F64, "_Q_qtod");
+    }
+  }
+
   setMinFunctionAlignment(2);
 
   computeRegisterProperties();
@@ -1394,13 +1595,24 @@ const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case SPISD::Lo:         return "SPISD::Lo";
   case SPISD::FTOI:       return "SPISD::FTOI";
   case SPISD::ITOF:       return "SPISD::ITOF";
+  case SPISD::FTOX:       return "SPISD::FTOX";
+  case SPISD::XTOF:       return "SPISD::XTOF";
   case SPISD::CALL:       return "SPISD::CALL";
   case SPISD::RET_FLAG:   return "SPISD::RET_FLAG";
   case SPISD::GLOBAL_BASE_REG: return "SPISD::GLOBAL_BASE_REG";
   case SPISD::FLUSHW:     return "SPISD::FLUSHW";
+  case SPISD::TLS_ADD:    return "SPISD::TLS_ADD";
+  case SPISD::TLS_LD:     return "SPISD::TLS_LD";
+  case SPISD::TLS_CALL:   return "SPISD::TLS_CALL";
   }
 }
 
+EVT SparcTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
 /// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
 /// be zero. Op is expected to be a target specific node. Used by DAG
 /// combiner.
@@ -1505,6 +1717,10 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
     SDValue HiLo = makeHiLoPair(Op, SPII::MO_HI, SPII::MO_LO, DAG);
     SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, VT);
     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, VT, GlobalBase, HiLo);
+    // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
+    // function has calls.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI->setHasCalls(true);
     return DAG.getLoad(VT, DL, DAG.getEntryNode(), AbsAddr,
                        MachinePointerInfo::getGOT(), false, false, false, 0);
   }
@@ -1513,6 +1729,7 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
   switch(getTargetMachine().getCodeModel()) {
   default:
     llvm_unreachable("Unsupported absolute code model");
+  case CodeModel::JITDefault:
   case CodeModel::Small:
     // abs32.
     return makeHiLoPair(Op, SPII::MO_HI, SPII::MO_LO, DAG);
@@ -1549,23 +1766,430 @@ SDValue SparcTargetLowering::LowerBlockAddress(SDValue Op,
   return makeAddress(Op, DAG);
 }
 
-static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
+SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  SDLoc DL(GA);
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+  if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+    unsigned HiTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_HI22
+                     : SPII::MO_TLS_LDM_HI22);
+    unsigned LoTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_LO10
+                     : SPII::MO_TLS_LDM_LO10);
+    unsigned addTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_ADD
+                      : SPII::MO_TLS_LDM_ADD);
+    unsigned callTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_CALL
+                       : SPII::MO_TLS_LDM_CALL);
+
+    SDValue HiLo = makeHiLoPair(Op, HiTF, LoTF, DAG);
+    SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+    SDValue Argument = DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Base, HiLo,
+                               withTargetFlags(Op, addTF, DAG));
+
+    SDValue Chain = DAG.getEntryNode();
+    SDValue InFlag;
+
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, true), DL);
+    Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
+    InFlag = Chain.getValue(1);
+    SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
+    SDValue Symbol = withTargetFlags(Op, callTF, DAG);
+
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SmallVector<SDValue, 4> Ops;
+    Ops.push_back(Chain);
+    Ops.push_back(Callee);
+    Ops.push_back(Symbol);
+    Ops.push_back(DAG.getRegister(SP::O0, PtrVT));
+    const uint32_t *Mask = getTargetMachine()
+      .getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+    assert(Mask && "Missing call preserved mask for calling convention");
+    Ops.push_back(DAG.getRegisterMask(Mask));
+    Ops.push_back(InFlag);
+    Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, &Ops[0], Ops.size());
+    InFlag = Chain.getValue(1);
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, true),
+                               DAG.getIntPtrConstant(0, true), InFlag, DL);
+    InFlag = Chain.getValue(1);
+    SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InFlag);
+
+    if (model != TLSModel::LocalDynamic)
+      return Ret;
+
+    SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+                             withTargetFlags(Op, SPII::MO_TLS_LDO_HIX22, DAG));
+    SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+                             withTargetFlags(Op, SPII::MO_TLS_LDO_LOX10, DAG));
+    HiLo =  DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+    return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Ret, HiLo,
+                       withTargetFlags(Op, SPII::MO_TLS_LDO_ADD, DAG));
+  }
+
+  if (model == TLSModel::InitialExec) {
+    unsigned ldTF     = ((PtrVT == MVT::i64)? SPII::MO_TLS_IE_LDX
+                         : SPII::MO_TLS_IE_LD);
+
+    SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+
+    // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
+    // function has calls.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI->setHasCalls(true);
+
+    SDValue TGA = makeHiLoPair(Op,
+                               SPII::MO_TLS_IE_HI22, SPII::MO_TLS_IE_LO10, DAG);
+    SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, TGA);
+    SDValue Offset = DAG.getNode(SPISD::TLS_LD,
+                                 DL, PtrVT, Ptr,
+                                 withTargetFlags(Op, ldTF, DAG));
+    return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT,
+                       DAG.getRegister(SP::G7, PtrVT), Offset,
+                       withTargetFlags(Op, SPII::MO_TLS_IE_ADD, DAG));
+  }
+
+  assert(model == TLSModel::LocalExec);
+  SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+                           withTargetFlags(Op, SPII::MO_TLS_LE_HIX22, DAG));
+  SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+                           withTargetFlags(Op, SPII::MO_TLS_LE_LOX10, DAG));
+  SDValue Offset =  DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+
+  return DAG.getNode(ISD::ADD, DL, PtrVT,
+                     DAG.getRegister(SP::G7, PtrVT), Offset);
+}
+
+SDValue
+SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
+                                          SDValue Arg, SDLoc DL,
+                                          SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  ArgListEntry Entry;
+  Entry.Node = Arg;
+  Entry.Ty   = ArgTy;
+
+  if (ArgTy->isFP128Ty()) {
+    // Create a stack object and pass the pointer to the library function.
+    int FI = MFI->CreateStackObject(16, 8, false);
+    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+    Chain = DAG.getStore(Chain,
+                         DL,
+                         Entry.Node,
+                         FIPtr,
+                         MachinePointerInfo(),
+                         false,
+                         false,
+                         8);
+
+    Entry.Node = FIPtr;
+    Entry.Ty   = PointerType::getUnqual(ArgTy);
+  }
+  Args.push_back(Entry);
+  return Chain;
+}
+
+SDValue
+SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
+                                 const char *LibFuncName,
+                                 unsigned numArgs) const {
+
+  ArgListTy Args;
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+
+  SDValue Callee = DAG.getExternalSymbol(LibFuncName, getPointerTy());
+  Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
+  Type *RetTyABI = RetTy;
+  SDValue Chain = DAG.getEntryNode();
+  SDValue RetPtr;
+
+  if (RetTy->isFP128Ty()) {
+    // Create a Stack Object to receive the return value of type f128.
+    ArgListEntry Entry;
+    int RetFI = MFI->CreateStackObject(16, 8, false);
+    RetPtr = DAG.getFrameIndex(RetFI, getPointerTy());
+    Entry.Node = RetPtr;
+    Entry.Ty   = PointerType::getUnqual(RetTy);
+    if (!Subtarget->is64Bit())
+      Entry.isSRet = true;
+    Entry.isReturned = false;
+    Args.push_back(Entry);
+    RetTyABI = Type::getVoidTy(*DAG.getContext());
+  }
+
+  assert(Op->getNumOperands() >= numArgs && "Not enough operands!");
+  for (unsigned i = 0, e = numArgs; i != e; ++i) {
+    Chain = LowerF128_LibCallArg(Chain, Args, Op.getOperand(i), SDLoc(Op), DAG);
+  }
+  TargetLowering::
+    CallLoweringInfo CLI(Chain,
+                         RetTyABI,
+                         false, false, false, false,
+                         0, CallingConv::C,
+                         false, false, true,
+                         Callee, Args, DAG, SDLoc(Op));
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // chain is in second result.
+  if (RetTyABI == RetTy)
+    return CallInfo.first;
+
+  assert (RetTy->isFP128Ty() && "Unexpected return type!");
+
+  Chain = CallInfo.second;
+
+  // Load RetPtr to get the return value.
+  return DAG.getLoad(Op.getValueType(),
+                     SDLoc(Op),
+                     Chain,
+                     RetPtr,
+                     MachinePointerInfo(),
+                     false, false, false, 8);
+}
+
+SDValue
+SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
+                                      unsigned &SPCC,
+                                      SDLoc DL,
+                                      SelectionDAG &DAG) const {
+
+  const char *LibCall = 0;
+  bool is64Bit = Subtarget->is64Bit();
+  switch(SPCC) {
+  default: llvm_unreachable("Unhandled conditional code!");
+  case SPCC::FCC_E  : LibCall = is64Bit? "_Qp_feq" : "_Q_feq"; break;
+  case SPCC::FCC_NE : LibCall = is64Bit? "_Qp_fne" : "_Q_fne"; break;
+  case SPCC::FCC_L  : LibCall = is64Bit? "_Qp_flt" : "_Q_flt"; break;
+  case SPCC::FCC_G  : LibCall = is64Bit? "_Qp_fgt" : "_Q_fgt"; break;
+  case SPCC::FCC_LE : LibCall = is64Bit? "_Qp_fle" : "_Q_fle"; break;
+  case SPCC::FCC_GE : LibCall = is64Bit? "_Qp_fge" : "_Q_fge"; break;
+  case SPCC::FCC_UL :
+  case SPCC::FCC_ULE:
+  case SPCC::FCC_UG :
+  case SPCC::FCC_UGE:
+  case SPCC::FCC_U  :
+  case SPCC::FCC_O  :
+  case SPCC::FCC_LG :
+  case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break;
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(LibCall, getPointerTy());
+  Type *RetTy = Type::getInt32Ty(*DAG.getContext());
+  ArgListTy Args;
+  SDValue Chain = DAG.getEntryNode();
+  Chain = LowerF128_LibCallArg(Chain, Args, LHS, DL, DAG);
+  Chain = LowerF128_LibCallArg(Chain, Args, RHS, DL, DAG);
+
+  TargetLowering::
+    CallLoweringInfo CLI(Chain,
+                         RetTy,
+                         false, false, false, false,
+                         0, CallingConv::C,
+                         false, false, true,
+                         Callee, Args, DAG, DL);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // result is in first, and chain is in second result.
+  SDValue Result =  CallInfo.first;
+
+  switch(SPCC) {
+  default: {
+    SDValue RHS = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UL : {
+    SDValue Mask   = DAG.getTargetConstant(1, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_ULE: {
+    SDValue RHS = DAG.getTargetConstant(2, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UG :  {
+    SDValue RHS = DAG.getTargetConstant(1, Result.getValueType());
+    SPCC = SPCC::ICC_G;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UGE: {
+    SDValue RHS = DAG.getTargetConstant(1, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+
+  case SPCC::FCC_U  :  {
+    SDValue RHS = DAG.getTargetConstant(3, Result.getValueType());
+    SPCC = SPCC::ICC_E;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_O  :  {
+    SDValue RHS = DAG.getTargetConstant(3, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_LG :  {
+    SDValue Mask   = DAG.getTargetConstant(3, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UE : {
+    SDValue Mask   = DAG.getTargetConstant(3, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_E;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  }
+}
+
+static SDValue
+LowerF128_FPEXTEND(SDValue Op, SelectionDAG &DAG,
+                   const SparcTargetLowering &TLI) {
+
+  if (Op.getOperand(0).getValueType() == MVT::f64)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPEXT_F64_F128), 1);
+
+  if (Op.getOperand(0).getValueType() == MVT::f32)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPEXT_F32_F128), 1);
+
+  llvm_unreachable("fpextend with non-float operand!");
+  return SDValue(0, 0);
+}
+
+static SDValue
+LowerF128_FPROUND(SDValue Op, SelectionDAG &DAG,
+                  const SparcTargetLowering &TLI) {
+  // FP_ROUND on f64 and f32 are legal.
+  if (Op.getOperand(0).getValueType() != MVT::f128)
+    return Op;
+
+  if (Op.getValueType() == MVT::f64)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPROUND_F128_F64), 1);
+  if (Op.getValueType() == MVT::f32)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPROUND_F128_F32), 1);
+
+  llvm_unreachable("fpround to non-float!");
+  return SDValue(0, 0);
+}
+
+static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::i32 || VT == MVT::i64);
+
+  // Expand f128 operations to fp128 abi calls.
+  if (Op.getOperand(0).getValueType() == MVT::f128
+      && (!hasHardQuad || !TLI.isTypeLegal(VT))) {
+    const char *libName = TLI.getLibcallName(VT == MVT::i32
+                                             ? RTLIB::FPTOSINT_F128_I32
+                                             : RTLIB::FPTOSINT_F128_I64);
+    return TLI.LowerF128Op(Op, DAG, libName, 1);
+  }
+
+  // Expand if the resulting type is illegal.
+  if (!TLI.isTypeLegal(VT))
+    return SDValue(0, 0);
+
+  // Otherwise, Convert the fp value to integer in an FP register.
+  if (VT == MVT::i32)
+    Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
+  else
+    Op = DAG.getNode(SPISD::FTOX, dl, MVT::f64, Op.getOperand(0));
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+}
+
+static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT OpVT = Op.getOperand(0).getValueType();
+  assert(OpVT == MVT::i32 || (OpVT == MVT::i64));
+
+  EVT floatVT = (OpVT == MVT::i32) ? MVT::f32 : MVT::f64;
+
+  // Expand f128 operations to fp128 ABI calls.
+  if (Op.getValueType() == MVT::f128
+      && (!hasHardQuad || !TLI.isTypeLegal(OpVT))) {
+    const char *libName = TLI.getLibcallName(OpVT == MVT::i32
+                                             ? RTLIB::SINTTOFP_I32_F128
+                                             : RTLIB::SINTTOFP_I64_F128);
+    return TLI.LowerF128Op(Op, DAG, libName, 1);
+  }
+
+  // Expand if the operand type is illegal.
+  if (!TLI.isTypeLegal(OpVT))
+    return SDValue(0, 0);
+
+  // Otherwise, Convert the int value to FP in an FP register.
+  SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, floatVT, Op.getOperand(0));
+  unsigned opcode = (OpVT == MVT::i32)? SPISD::ITOF : SPISD::XTOF;
+  return DAG.getNode(opcode, dl, Op.getValueType(), Tmp);
+}
+
+static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
   SDLoc dl(Op);
-  // Convert the fp value to integer in an FP register.
-  assert(Op.getValueType() == MVT::i32);
-  Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
-  return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+  EVT VT = Op.getValueType();
+
+  // Expand if it does not involve f128 or the target has support for
+  // quad floating point instructions and the resulting type is legal.
+  if (Op.getOperand(0).getValueType() != MVT::f128 ||
+      (hasHardQuad && TLI.isTypeLegal(VT)))
+    return SDValue(0, 0);
+
+  assert(VT == MVT::i32 || VT == MVT::i64);
+
+  return TLI.LowerF128Op(Op, DAG,
+                         TLI.getLibcallName(VT == MVT::i32
+                                            ? RTLIB::FPTOUINT_F128_I32
+                                            : RTLIB::FPTOUINT_F128_I64),
+                         1);
 }
 
-static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
   SDLoc dl(Op);
-  assert(Op.getOperand(0).getValueType() == MVT::i32);
-  SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
-  // Convert the int value to FP in an FP register.
-  return DAG.getNode(SPISD::ITOF, dl, Op.getValueType(), Tmp);
+  EVT OpVT = Op.getOperand(0).getValueType();
+  assert(OpVT == MVT::i32 || OpVT == MVT::i64);
+
+  // Expand if it does not involve f128 or the target has support for
+  // quad floating point instructions and the operand type is legal.
+  if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT)))
+    return SDValue(0, 0);
+
+  return TLI.LowerF128Op(Op, DAG,
+                         TLI.getLibcallName(OpVT == MVT::i32
+                                            ? RTLIB::UINTTOFP_I32_F128
+                                            : RTLIB::UINTTOFP_I64_F128),
+                         1);
 }
 
-static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
+                          const SparcTargetLowering &TLI,
+                          bool hasHardQuad) {
   SDValue Chain = Op.getOperand(0);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
   SDValue LHS = Op.getOperand(2);
@@ -1586,15 +2210,23 @@ static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
     // 32-bit compares use the icc flags, 64-bit uses the xcc flags.
     Opc = LHS.getValueType() == MVT::i32 ? SPISD::BRICC : SPISD::BRXCC;
   } else {
-    CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
-    if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
-    Opc = SPISD::BRFCC;
+    if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
+      Opc = SPISD::BRICC;
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      Opc = SPISD::BRFCC;
+    }
   }
   return DAG.getNode(Opc, dl, MVT::Other, Chain, Dest,
                      DAG.getConstant(SPCC, MVT::i32), CompareFlag);
 }
 
-static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
+                              const SparcTargetLowering &TLI,
+                              bool hasHardQuad) {
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -1614,9 +2246,15 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
           SPISD::SELECT_ICC : SPISD::SELECT_XCC;
     if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
   } else {
-    CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
-    Opc = SPISD::SELECT_FCC;
-    if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+    if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
+      Opc = SPISD::SELECT_ICC;
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+      Opc = SPISD::SELECT_FCC;
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+    }
   }
   return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
                      DAG.getConstant(SPCC, MVT::i32), CompareFlag);
@@ -1665,20 +2303,25 @@ static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
                      std::min(PtrVT.getSizeInBits(), VT.getSizeInBits())/8);
 }
 
-static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
+                                       const SparcSubtarget *Subtarget) {
   SDValue Chain = Op.getOperand(0);  // Legalize the chain.
   SDValue Size  = Op.getOperand(1);  // Legalize the size.
+  EVT VT = Size->getValueType(0);
   SDLoc dl(Op);
 
   unsigned SPReg = SP::O6;
-  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, MVT::i32);
-  SDValue NewSP = DAG.getNode(ISD::SUB, dl, MVT::i32, SP, Size); // Value
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+  SDValue NewSP = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
   Chain = DAG.getCopyToReg(SP.getValue(1), dl, SPReg, NewSP);    // Output chain
 
   // The resultant pointer is actually 16 words from the bottom of the stack,
   // to provide a register spill area.
-  SDValue NewVal = DAG.getNode(ISD::ADD, dl, MVT::i32, NewSP,
-                                 DAG.getConstant(96, MVT::i32));
+  unsigned regSpillArea = Subtarget->is64Bit() ? 128 : 96;
+  regSpillArea += Subtarget->getStackPointerBias();
+
+  SDValue NewVal = DAG.getNode(ISD::ADD, dl, VT, NewSP,
+                               DAG.getConstant(regSpillArea, VT));
   SDValue Ops[2] = { NewVal, Chain };
   return DAG.getMergeValues(Ops, 2, dl);
 }
@@ -1759,12 +2402,12 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
   return RetAddr;
 }
 
-static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG)
+static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG, unsigned opcode)
 {
   SDLoc dl(Op);
 
   assert(Op.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
-  assert(Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS);
+  assert(opcode == ISD::FNEG || opcode == ISD::FABS);
 
   // Lower fneg/fabs on f64 to fneg/fabs on f32.
   // fneg f64 => fneg f32:sub_even, fmov f32:sub_odd.
@@ -1776,7 +2419,7 @@ static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG)
   SDValue Lo32 = DAG.getTargetExtractSubreg(SP::sub_odd, dl, MVT::f32,
                                             SrcReg64);
 
-  Hi32 = DAG.getNode(Op.getOpcode(), dl, MVT::f32, Hi32);
+  Hi32 = DAG.getNode(opcode, dl, MVT::f32, Hi32);
 
   SDValue DstReg64 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                 dl, MVT::f64), 0);
@@ -1787,28 +2430,244 @@ static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG)
   return DstReg64;
 }
 
+// Lower a f128 load into two f64 loads.
+static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
+{
+  SDLoc dl(Op);
+  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
+  assert(LdNode && LdNode->getOffset().getOpcode() == ISD::UNDEF
+         && "Unexpected node type");
+
+  unsigned alignment = LdNode->getAlignment();
+  if (alignment > 8)
+    alignment = 8;
+
+  SDValue Hi64 = DAG.getLoad(MVT::f64,
+                             dl,
+                             LdNode->getChain(),
+                             LdNode->getBasePtr(),
+                             LdNode->getPointerInfo(),
+                             false, false, false, alignment);
+  EVT addrVT = LdNode->getBasePtr().getValueType();
+  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+                              LdNode->getBasePtr(),
+                              DAG.getConstant(8, addrVT));
+  SDValue Lo64 = DAG.getLoad(MVT::f64,
+                             dl,
+                             LdNode->getChain(),
+                             LoPtr,
+                             LdNode->getPointerInfo(),
+                             false, false, false, alignment);
+
+  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, MVT::i32);
+  SDValue SubRegOdd  = DAG.getTargetConstant(SP::sub_odd64, MVT::i32);
+
+  SDNode *InFP128 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                       dl, MVT::f128);
+  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                               MVT::f128,
+                               SDValue(InFP128, 0),
+                               Hi64,
+                               SubRegEven);
+  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                               MVT::f128,
+                               SDValue(InFP128, 0),
+                               Lo64,
+                               SubRegOdd);
+  SDValue OutChains[2] = { SDValue(Hi64.getNode(), 1),
+                           SDValue(Lo64.getNode(), 1) };
+  SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                                 &OutChains[0], 2);
+  SDValue Ops[2] = {SDValue(InFP128,0), OutChain};
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+// Lower a f128 store into two f64 stores.
+static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
+  assert(StNode && StNode->getOffset().getOpcode() == ISD::UNDEF
+         && "Unexpected node type");
+  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, MVT::i32);
+  SDValue SubRegOdd  = DAG.getTargetConstant(SP::sub_odd64, MVT::i32);
+
+  SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                    dl,
+                                    MVT::f64,
+                                    StNode->getValue(),
+                                    SubRegEven);
+  SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                    dl,
+                                    MVT::f64,
+                                    StNode->getValue(),
+                                    SubRegOdd);
+
+  unsigned alignment = StNode->getAlignment();
+  if (alignment > 8)
+    alignment = 8;
+
+  SDValue OutChains[2];
+  OutChains[0] = DAG.getStore(StNode->getChain(),
+                              dl,
+                              SDValue(Hi64, 0),
+                              StNode->getBasePtr(),
+                              MachinePointerInfo(),
+                              false, false, alignment);
+  EVT addrVT = StNode->getBasePtr().getValueType();
+  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+                              StNode->getBasePtr(),
+                              DAG.getConstant(8, addrVT));
+  OutChains[1] = DAG.getStore(StNode->getChain(),
+                             dl,
+                             SDValue(Lo64, 0),
+                             LoPtr,
+                             MachinePointerInfo(),
+                             false, false, alignment);
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &OutChains[0], 2);
+}
+
+static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG,
+                         const SparcTargetLowering &TLI,
+                         bool is64Bit) {
+  if (Op.getValueType() == MVT::f64)
+    return LowerF64Op(Op, DAG, ISD::FNEG);
+  if (Op.getValueType() == MVT::f128)
+    return TLI.LowerF128Op(Op, DAG, ((is64Bit) ? "_Qp_neg" : "_Q_neg"), 1);
+  return Op;
+}
+
+static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
+  if (Op.getValueType() == MVT::f64)
+    return LowerF64Op(Op, DAG, ISD::FABS);
+  if (Op.getValueType() != MVT::f128)
+    return Op;
+
+  // Lower fabs on f128 to fabs on f64
+  // fabs f128 => fabs f64:sub_even64, fmov f64:sub_odd64
+
+  SDLoc dl(Op);
+  SDValue SrcReg128 = Op.getOperand(0);
+  SDValue Hi64 = DAG.getTargetExtractSubreg(SP::sub_even64, dl, MVT::f64,
+                                            SrcReg128);
+  SDValue Lo64 = DAG.getTargetExtractSubreg(SP::sub_odd64, dl, MVT::f64,
+                                            SrcReg128);
+  if (isV9)
+    Hi64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Hi64);
+  else
+    Hi64 = LowerF64Op(Hi64, DAG, ISD::FABS);
+
+  SDValue DstReg128 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                 dl, MVT::f128), 0);
+  DstReg128 = DAG.getTargetInsertSubreg(SP::sub_even64, dl, MVT::f128,
+                                        DstReg128, Hi64);
+  DstReg128 = DAG.getTargetInsertSubreg(SP::sub_odd64, dl, MVT::f128,
+                                        DstReg128, Lo64);
+  return DstReg128;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+
+  if (Op.getValueType() != MVT::i64)
+    return Op;
+
+  SDLoc dl(Op);
+  SDValue Src1 = Op.getOperand(0);
+  SDValue Src1Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1);
+  SDValue Src1Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src1,
+                               DAG.getConstant(32, MVT::i64));
+  Src1Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1Hi);
+
+  SDValue Src2 = Op.getOperand(1);
+  SDValue Src2Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2);
+  SDValue Src2Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src2,
+                               DAG.getConstant(32, MVT::i64));
+  Src2Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2Hi);
+
+
+  bool hasChain = false;
+  unsigned hiOpc = Op.getOpcode();
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode");
+  case ISD::ADDC: hiOpc = ISD::ADDE; break;
+  case ISD::ADDE: hasChain = true; break;
+  case ISD::SUBC: hiOpc = ISD::SUBE; break;
+  case ISD::SUBE: hasChain = true; break;
+  }
+  SDValue Lo;
+  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue);
+  if (hasChain) {
+    Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo,
+                     Op.getOperand(2));
+  } else {
+    Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo);
+  }
+  SDValue Hi = DAG.getNode(hiOpc, dl, VTs, Src1Hi, Src2Hi, Lo.getValue(1));
+  SDValue Carry = Hi.getValue(1);
+
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Lo);
+  Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Hi);
+  Hi = DAG.getNode(ISD::SHL, dl, MVT::i64, Hi,
+                   DAG.getConstant(32, MVT::i64));
+
+  SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo);
+  SDValue Ops[2] = { Dst, Carry };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
 SDValue SparcTargetLowering::
 LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+
+  bool hasHardQuad = Subtarget->hasHardQuad();
+  bool is64Bit     = Subtarget->is64Bit();
+  bool isV9        = Subtarget->isV9();
+
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Should not custom lower this!");
 
-  case ISD::FNEG:
-  case ISD::FABS:               return LowerF64Op(Op, DAG);
-
   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG, *this);
   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
-  case ISD::GlobalTLSAddress:
-    llvm_unreachable("TLS not implemented for Sparc.");
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
-  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
-  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
-  case ISD::BR_CC:              return LowerBR_CC(Op, DAG);
-  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::BR_CC:              return LowerBR_CC(Op, DAG, *this,
+                                                  hasHardQuad);
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG, *this,
+                                                      hasHardQuad);
   case ISD::VASTART:            return LowerVASTART(Op, DAG, *this);
   case ISD::VAARG:              return LowerVAARG(Op, DAG);
-  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG,
+                                                               Subtarget);
+
+  case ISD::LOAD:               return LowerF128Load(Op, DAG);
+  case ISD::STORE:              return LowerF128Store(Op, DAG);
+  case ISD::FADD:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::ADD_F128), 2);
+  case ISD::FSUB:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::SUB_F128), 2);
+  case ISD::FMUL:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::MUL_F128), 2);
+  case ISD::FDIV:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::DIV_F128), 2);
+  case ISD::FSQRT:              return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::SQRT_F128),1);
+  case ISD::FNEG:               return LowerFNEG(Op, DAG, *this, is64Bit);
+  case ISD::FABS:               return LowerFABS(Op, DAG, isV9);
+  case ISD::FP_EXTEND:          return LowerF128_FPEXTEND(Op, DAG, *this);
+  case ISD::FP_ROUND:           return LowerF128_FPROUND(Op, DAG, *this);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   }
 }
 
@@ -1825,11 +2684,13 @@ SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case SP::SELECT_CC_Int_ICC:
   case SP::SELECT_CC_FP_ICC:
   case SP::SELECT_CC_DFP_ICC:
+  case SP::SELECT_CC_QFP_ICC:
     BROpcode = SP::BCOND;
     break;
   case SP::SELECT_CC_Int_FCC:
   case SP::SELECT_CC_FP_FCC:
   case SP::SELECT_CC_DFP_FCC:
+  case SP::SELECT_CC_QFP_FCC:
     BROpcode = SP::FBCOND;
     break;
   }
@@ -1924,3 +2785,50 @@ SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The Sparc target isn't yet aware of offsets.
   return false;
 }
+
+void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>& Results,
+                                             SelectionDAG &DAG) const {
+
+  SDLoc dl(N);
+
+  RTLIB::Libcall libCall = RTLIB::UNKNOWN_LIBCALL;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    // Custom lower only if it involves f128 or i64.
+    if (N->getOperand(0).getValueType() != MVT::f128
+        || N->getValueType(0) != MVT::i64)
+      return;
+    libCall = ((N->getOpcode() == ISD::FP_TO_SINT)
+               ? RTLIB::FPTOSINT_F128_I64
+               : RTLIB::FPTOUINT_F128_I64);
+
+    Results.push_back(LowerF128Op(SDValue(N, 0),
+                                  DAG,
+                                  getLibcallName(libCall),
+                                  1));
+    return;
+
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    // Custom lower only if it involves f128 or i64.
+    if (N->getValueType(0) != MVT::f128
+        || N->getOperand(0).getValueType() != MVT::i64)
+      return;
+
+    libCall = ((N->getOpcode() == ISD::SINT_TO_FP)
+               ? RTLIB::SINTTOFP_I64_F128
+               : RTLIB::UINTTOFP_I64_F128);
+
+    Results.push_back(LowerF128Op(SDValue(N, 0),
+                                  DAG,
+                                  getLibcallName(libCall),
+                                  1));
+    return;
+  }
+}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 261c25a..2659fc8 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -37,11 +37,17 @@ namespace llvm {
 
       FTOI,        // FP to Int within a FP register.
       ITOF,        // Int to FP within a FP register.
+      FTOX,        // FP to Int64 within a FP register.
+      XTOF,        // Int64 to FP within a FP register.
 
       CALL,        // A call instruction.
       RET_FLAG,    // Return with a flag operand.
-      GLOBAL_BASE_REG, // Global base reg for PIC
-      FLUSHW       // FLUSH register windows to stack
+      GLOBAL_BASE_REG, // Global base reg for PIC.
+      FLUSHW,      // FLUSH register windows to stack.
+
+      TLS_ADD,     // For Thread Local Storage (TLS).
+      TLS_LD,
+      TLS_CALL
     };
   }
 
@@ -73,6 +79,9 @@ namespace llvm {
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
     virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
 
+    /// getSetCCResultType - Return the ISD::SETCC ValueType
+    virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
+
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv,
@@ -119,6 +128,7 @@ namespace llvm {
                            SDLoc DL, SelectionDAG &DAG) const;
 
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
 
@@ -127,6 +137,27 @@ namespace llvm {
     SDValue makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
                          SelectionDAG &DAG) const;
     SDValue makeAddress(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
+                                 SDValue Arg, SDLoc DL,
+                                 SelectionDAG &DAG) const;
+    SDValue LowerF128Op(SDValue Op, SelectionDAG &DAG,
+                        const char *LibFuncName,
+                        unsigned numArgs) const;
+    SDValue LowerF128Compare(SDValue LHS, SDValue RHS,
+                             unsigned &SPCC,
+                             SDLoc DL,
+                             SelectionDAG &DAG) const;
+
+    bool ShouldShrinkFPConstant(EVT VT) const {
+      // Do not shrink FP constpool if VT == MVT::f128.
+      // (ldd, call _Q_fdtoq) is more expensive than two ldds.
+      return VT != MVT::f128;
+    }
+
+    virtual void ReplaceNodeResults(SDNode *N,
+                                    SmallVectorImpl<SDValue>& Results,
+                                    SelectionDAG &DAG) const;
   };
 } // end namespace llvm
 
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td
index 47658ee..8656de5 100644
--- a/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/lib/Target/Sparc/SparcInstr64Bit.td
@@ -153,15 +153,11 @@ def : Pat<(xor i64:$a, (not i64:$b)), (XNORrr $a, $b)>;
 def : Pat<(add i64:$a, i64:$b), (ADDrr $a, $b)>;
 def : Pat<(sub i64:$a, i64:$b), (SUBrr $a, $b)>;
 
-// Add/sub with carry were renamed to addc/subc in SPARC v9.
-def : Pat<(adde i64:$a, i64:$b), (ADDXrr $a, $b)>;
-def : Pat<(sube i64:$a, i64:$b), (SUBXrr $a, $b)>;
-
-def : Pat<(addc i64:$a, i64:$b), (ADDCCrr $a, $b)>;
-def : Pat<(subc i64:$a, i64:$b), (SUBCCrr $a, $b)>;
-
 def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>;
 
+def : Pat<(tlsadd i64:$a, i64:$b, tglobaltlsaddr:$sym),
+          (TLS_ADDrr $a, $b, $sym)>;
+
 // Register-immediate instructions.
 
 def : Pat<(and i64:$a, (i64 simm13:$b)), (ANDri $a, (as_i32imm $b))>;
@@ -173,6 +169,14 @@ def : Pat<(sub i64:$a, (i64 simm13:$b)), (SUBri $a, (as_i32imm $b))>;
 
 def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>;
 
+def : Pat<(ctpop i64:$src), (POPCrr $src)>;
+
+// "LEA" form of add
+def LEAX_ADDri : F3_2<2, 0b000000,
+                     (outs I64Regs:$dst), (ins MEMri:$addr),
+                     "add ${addr:arith}, $dst",
+                     [(set iPTR:$dst, ADDRri:$addr)]>;
+
 } // Predicates = [Is64Bit]
 
 
@@ -237,6 +241,12 @@ def LDXri  : F3_2<3, 0b001011,
                   (outs I64Regs:$dst), (ins MEMri:$addr),
                   "ldx [$addr], $dst",
                   [(set i64:$dst, (load ADDRri:$addr))]>;
+let mayLoad = 1 in
+  def TLS_LDXrr : F3_1<3, 0b001011,
+                       (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+                       "ldx [$addr], $dst, $sym",
+                       [(set i64:$dst,
+                           (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
 
 // Extending loads to i64.
 def : Pat<(i64 (zextloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
@@ -312,9 +322,9 @@ def : Pat<(store (i64 0), ADDRri:$dst), (STXri ADDRri:$dst, (i64 G0))>;
 let Predicates = [Is64Bit] in {
 
 let Uses = [ICC] in
-def BPXCC : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
-                     "b$cc %xcc, $dst",
-                     [(SPbrxcc bb:$dst, imm:$cc)]>;
+def BPXCC : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                     "b$cond %xcc, $imm22",
+                     [(SPbrxcc bb:$imm22, imm:$cond)]>;
 
 // Conditional moves on %xcc.
 let Uses = [ICC], Constraints = "$f = $rd" in {
@@ -340,6 +350,42 @@ def FMOVD_XCC : Pseudo<(outs DFPRegs:$rd),
                        (SPselectxcc f64:$rs2, f64:$f, imm:$cond))]>;
 } // Uses, Constraints
 
+//===----------------------------------------------------------------------===//
+// 64-bit Floating Point Conversions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+def FXTOS : F3_3u<2, 0b110100, 0b010000100,
+                 (outs FPRegs:$dst), (ins DFPRegs:$src),
+                 "fxtos $src, $dst",
+                 [(set FPRegs:$dst, (SPxtof DFPRegs:$src))]>;
+def FXTOD : F3_3u<2, 0b110100, 0b010001000,
+                 (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                 "fxtod $src, $dst",
+                 [(set DFPRegs:$dst, (SPxtof DFPRegs:$src))]>;
+def FXTOQ : F3_3u<2, 0b110100, 0b010001100,
+                 (outs QFPRegs:$dst), (ins DFPRegs:$src),
+                 "fxtoq $src, $dst",
+                 [(set QFPRegs:$dst, (SPxtof DFPRegs:$src))]>,
+                 Requires<[HasHardQuad]>;
+
+def FSTOX : F3_3u<2, 0b110100, 0b010000001,
+                 (outs DFPRegs:$dst), (ins FPRegs:$src),
+                 "fstox $src, $dst",
+                 [(set DFPRegs:$dst, (SPftox FPRegs:$src))]>;
+def FDTOX : F3_3u<2, 0b110100, 0b010000010,
+                 (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                 "fdtox $src, $dst",
+                 [(set DFPRegs:$dst, (SPftox DFPRegs:$src))]>;
+def FQTOX : F3_3u<2, 0b110100, 0b010000011,
+                 (outs DFPRegs:$dst), (ins QFPRegs:$src),
+                 "fqtox $src, $dst",
+                 [(set DFPRegs:$dst, (SPftox QFPRegs:$src))]>,
+                 Requires<[HasHardQuad]>;
+
+} // Predicates = [Is64Bit]
+
 def : Pat<(SPselectxcc i64:$t, i64:$f, imm:$cond),
           (MOVXCCrr $t, $f, imm:$cond)>;
 def : Pat<(SPselectxcc (i64 simm11:$t), i64:$f, imm:$cond),
diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td
index 6cdf6bc..afa2874 100644
--- a/lib/Target/Sparc/SparcInstrFormats.td
+++ b/lib/Target/Sparc/SparcInstrFormats.td
@@ -47,12 +47,11 @@ class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{29-25} = rd;
 }
 
-class F2_2<bits<4> condVal, bits<3> op2Val, dag outs, dag ins, string asmstr,
+class F2_2<bits<3> op2Val, dag outs, dag ins, string asmstr,
            list<dag> pattern> : F2<outs, ins, asmstr, pattern> {
   bits<4>   cond;
   bit       annul = 0;     // currently unused
 
-  let cond        = condVal;
   let op2         = op2Val;
 
   let Inst{29}    = annul;
@@ -112,6 +111,32 @@ class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
   let Inst{4-0}  = rs2;
 }
 
+// floating-point unary operations.
+class F3_3u<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+  let rs1        = 0;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+// floating-point compares.
+class F3_3c<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+  let rd         = 0;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
 // Shift by register rs2.
 class F3_Sr<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
             string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
@@ -150,3 +175,59 @@ multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
                  !strconcat(OpcStr, " $rs, $shcnt, $rd"),
                  [(set VT:$rd, (OpNode VT:$rs, (i32 imm:$shcnt)))]>;
 }
+
+class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern>
+      : InstSP<outs, ins, asmstr, pattern> {
+  bits<5> rd;
+
+  let op          = 2;
+  let Inst{29-25} = rd;
+  let Inst{24-19} = op3;
+}
+
+
+class F4_1<bits<6> op3, dag outs, dag ins,
+            string asmstr, list<dag> pattern>
+      : F4<op3, outs, ins, asmstr, pattern> {
+
+  bits<3> cc;
+  bits<4> cond;
+  bits<5> rs2;
+
+  let Inst{4-0}   = rs2;
+  let Inst{11}    = cc{0};
+  let Inst{12}    = cc{1};
+  let Inst{13}    = 0;
+  let Inst{17-14} = cond;
+  let Inst{18}    = cc{2};
+
+}
+
+class F4_2<bits<6> op3, dag outs, dag ins,
+            string asmstr, list<dag> pattern>
+      : F4<op3, outs, ins, asmstr, pattern> {
+  bits<3>  cc;
+  bits<4>  cond;
+  bits<11> simm11;
+
+  let Inst{10-0}  = simm11;
+  let Inst{11}    = cc{0};
+  let Inst{12}    = cc{1};
+  let Inst{13}    = 1;
+  let Inst{17-14} = cond;
+  let Inst{18}    = cc{2};
+}
+
+class F4_3<bits<6> op3, bits<6> opf_low, dag outs, dag ins,
+           string asmstr, list<dag> pattern>
+      : F4<op3, outs, ins, asmstr, pattern> {
+  bits<4> cond;
+  bits<3> opf_cc;
+  bits<5> rs2;
+
+  let Inst{18}     = 0;
+  let Inst{17-14}  = cond;
+  let Inst{13-11}  = opf_cc;
+  let Inst{10-5}   = opf_low;
+  let Inst{4-0}    = rs2;
+}
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index 6c14bc9..c10b5b3 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -24,11 +24,15 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "SparcGenInstrInfo.inc"
 
 using namespace llvm;
 
+
+// Pin the vtable to this file.
+void SparcInstrInfo::anchor() {}
+
 SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
   : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP),
     RI(ST), Subtarget(ST) {
@@ -44,7 +48,8 @@ unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
   if (MI->getOpcode() == SP::LDri ||
       MI->getOpcode() == SP::LDXri ||
       MI->getOpcode() == SP::LDFri ||
-      MI->getOpcode() == SP::LDDFri) {
+      MI->getOpcode() == SP::LDDFri ||
+      MI->getOpcode() == SP::LDQFri) {
     if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
         MI->getOperand(2).getImm() == 0) {
       FrameIndex = MI->getOperand(1).getIndex();
@@ -64,7 +69,8 @@ unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
   if (MI->getOpcode() == SP::STri ||
       MI->getOpcode() == SP::STXri ||
       MI->getOpcode() == SP::STFri ||
-      MI->getOpcode() == SP::STDFri) {
+      MI->getOpcode() == SP::STDFri ||
+      MI->getOpcode() == SP::STQFri) {
     if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
         MI->getOperand(1).getImm() == 0) {
       FrameIndex = MI->getOperand(0).getIndex();
@@ -100,14 +106,14 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
 
   case SPCC::FCC_U:    return SPCC::FCC_O;
   case SPCC::FCC_O:    return SPCC::FCC_U;
-  case SPCC::FCC_G:    return SPCC::FCC_LE;
-  case SPCC::FCC_LE:   return SPCC::FCC_G;
-  case SPCC::FCC_UG:   return SPCC::FCC_ULE;
-  case SPCC::FCC_ULE:  return SPCC::FCC_UG;
-  case SPCC::FCC_L:    return SPCC::FCC_GE;
-  case SPCC::FCC_GE:   return SPCC::FCC_L;
-  case SPCC::FCC_UL:   return SPCC::FCC_UGE;
-  case SPCC::FCC_UGE:  return SPCC::FCC_UL;
+  case SPCC::FCC_G:    return SPCC::FCC_ULE;
+  case SPCC::FCC_LE:   return SPCC::FCC_UG;
+  case SPCC::FCC_UG:   return SPCC::FCC_LE;
+  case SPCC::FCC_ULE:  return SPCC::FCC_G;
+  case SPCC::FCC_L:    return SPCC::FCC_UGE;
+  case SPCC::FCC_GE:   return SPCC::FCC_UL;
+  case SPCC::FCC_UL:   return SPCC::FCC_GE;
+  case SPCC::FCC_UGE:  return SPCC::FCC_L;
   case SPCC::FCC_LG:   return SPCC::FCC_UE;
   case SPCC::FCC_UE:   return SPCC::FCC_LG;
   case SPCC::FCC_NE:   return SPCC::FCC_E;
@@ -273,6 +279,16 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I, DebugLoc DL,
                                  unsigned DestReg, unsigned SrcReg,
                                  bool KillSrc) const {
+  unsigned numSubRegs = 0;
+  unsigned movOpc     = 0;
+  const unsigned *subRegIdx = 0;
+
+  const unsigned DFP_FP_SubRegsIdx[]  = { SP::sub_even, SP::sub_odd };
+  const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 };
+  const unsigned QFP_FP_SubRegsIdx[]  = { SP::sub_even, SP::sub_odd,
+                                          SP::sub_odd64_then_sub_even,
+                                          SP::sub_odd64_then_sub_odd };
+
   if (SP::IntRegsRegClass.contains(DestReg, SrcReg))
     BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0)
       .addReg(SrcReg, getKillRegState(KillSrc));
@@ -285,23 +301,47 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
         .addReg(SrcReg, getKillRegState(KillSrc));
     } else {
       // Use two FMOVS instructions.
-      const TargetRegisterInfo *TRI = &getRegisterInfo();
-      MachineInstr *MovMI = 0;
-      unsigned subRegIdx[] = {SP::sub_even, SP::sub_odd};
-      for (unsigned i = 0; i != 2; ++i) {
-        unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
-        unsigned Src = TRI->getSubReg(SrcReg,  subRegIdx[i]);
-        assert(Dst && Src && "Bad sub-register");
-
-        MovMI = BuildMI(MBB, I, DL, get(SP::FMOVS), Dst).addReg(Src);
+      subRegIdx  = DFP_FP_SubRegsIdx;
+      numSubRegs = 2;
+      movOpc     = SP::FMOVS;
+    }
+  } else if (SP::QFPRegsRegClass.contains(DestReg, SrcReg)) {
+    if (Subtarget.isV9()) {
+      if (Subtarget.hasHardQuad()) {
+        BuildMI(MBB, I, DL, get(SP::FMOVQ), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+      } else {
+        // Use two FMOVD instructions.
+        subRegIdx  = QFP_DFP_SubRegsIdx;
+        numSubRegs = 2;
+        movOpc     = SP::FMOVD;
       }
-      // Add implicit super-register defs and kills to the last MovMI.
-      MovMI->addRegisterDefined(DestReg, TRI);
-      if (KillSrc)
-        MovMI->addRegisterKilled(SrcReg, TRI);
+    } else {
+      // Use four FMOVS instructions.
+      subRegIdx  = QFP_FP_SubRegsIdx;
+      numSubRegs = 4;
+      movOpc     = SP::FMOVS;
     }
   } else
     llvm_unreachable("Impossible reg-to-reg copy");
+
+  if (numSubRegs == 0 || subRegIdx == 0 || movOpc == 0)
+    return;
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  MachineInstr *MovMI = 0;
+
+  for (unsigned i = 0; i != numSubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
+    unsigned Src = TRI->getSubReg(SrcReg,  subRegIdx[i]);
+    assert(Dst && Src && "Bad sub-register");
+
+    MovMI = BuildMI(MBB, I, DL, get(movOpc), Dst).addReg(Src);
+  }
+  // Add implicit super-register defs and kills to the last MovMI.
+  MovMI->addRegisterDefined(DestReg, TRI);
+  if (KillSrc)
+    MovMI->addRegisterKilled(SrcReg, TRI);
 }
 
 void SparcInstrInfo::
@@ -321,7 +361,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                              MFI.getObjectAlignment(FI));
 
   // On the order of operands here: think "[FrameIdx + 0] = SrcReg".
-  if (RC == &SP::I64RegsRegClass)
+ if (RC == &SP::I64RegsRegClass)
     BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0)
       .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
   else if (RC == &SP::IntRegsRegClass)
@@ -330,9 +370,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   else if (RC == &SP::FPRegsRegClass)
     BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
       .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
-  else if (RC == &SP::DFPRegsRegClass)
+  else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
     BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0)
       .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
+  else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+    // Use STQFri irrespective of its legality. If STQ is not legal, it will be
+    // lowered into two STDs in eliminateFrameIndex.
+    BuildMI(MBB, I, DL, get(SP::STQFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
   else
     llvm_unreachable("Can't store this register to stack slot");
 }
@@ -362,9 +407,14 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   else if (RC == &SP::FPRegsRegClass)
     BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0)
       .addMemOperand(MMO);
-  else if (RC == &SP::DFPRegsRegClass)
+  else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
     BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0)
       .addMemOperand(MMO);
+  else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+    // Use LDQFri irrespective of its legality. If LDQ is not legal, it will be
+    // lowered into two LDDs in eliminateFrameIndex.
+    BuildMI(MBB, I, DL, get(SP::LDQFri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
   else
     llvm_unreachable("Can't load this register from stack slot");
 }
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index d0b220b..a86cbcb 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -37,6 +37,7 @@ namespace SPII {
 class SparcInstrInfo : public SparcGenInstrInfo {
   const SparcRegisterInfo RI;
   const SparcSubtarget& Subtarget;
+  virtual void anchor();
 public:
   explicit SparcInstrInfo(SparcSubtarget &ST);
 
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index d4cac4d..ef7a114 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -39,6 +39,10 @@ def HasNoV9 : Predicate<"!Subtarget.isV9()">;
 // HasVIS - This is true when the target processor has VIS extensions.
 def HasVIS : Predicate<"Subtarget.isVIS()">;
 
+// HasHardQuad - This is true when the target processor supports quad floating
+// point instructions.
+def HasHardQuad : Predicate<"Subtarget.hasHardQuad()">;
+
 // UseDeprecatedInsts - This predicate is true when the target processor is a
 // V8, or when it is V9 but the V8 deprecated instructions are efficient enough
 // to use when appropriate.  In either of these cases, the instruction selector
@@ -81,6 +85,8 @@ def MEMri : Operand<iPTR> {
   let MIOperandInfo = (ops ptr_rc, i32imm);
 }
 
+def TLSSym : Operand<iPTR>;
+
 // Branch targets have OtherVT type.
 def brtarget : Operand<OtherVT>;
 def calltarget : Operand<i32>;
@@ -101,6 +107,15 @@ def SDTSPFTOI :
 SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
 def SDTSPITOF :
 SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDTSPFTOX :
+SDTypeProfile<1, 1, [SDTCisVT<0, f64>, SDTCisFP<1>]>;
+def SDTSPXTOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f64>]>;
+
+def SDTSPtlsadd :
+SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDTSPtlsld :
+SDTypeProfile<1, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
 
 def SPcmpicc : SDNode<"SPISD::CMPICC", SDTSPcmpicc, [SDNPOutGlue]>;
 def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutGlue]>;
@@ -113,6 +128,8 @@ def SPlo    : SDNode<"SPISD::Lo", SDTIntUnaryOp>;
 
 def SPftoi  : SDNode<"SPISD::FTOI", SDTSPFTOI>;
 def SPitof  : SDNode<"SPISD::ITOF", SDTSPITOF>;
+def SPftox  : SDNode<"SPISD::FTOX", SDTSPFTOX>;
+def SPxtof  : SDNode<"SPISD::XTOF", SDTSPXTOF>;
 
 def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInGlue]>;
 def SPselectxcc : SDNode<"SPISD::SELECT_XCC", SDTSPselectcc, [SDNPInGlue]>;
@@ -140,6 +157,12 @@ def retflag       : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
 def flushw        : SDNode<"SPISD::FLUSHW", SDTNone,
                            [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
 
+def tlsadd        : SDNode<"SPISD::TLS_ADD", SDTSPtlsadd>;
+def tlsld         : SDNode<"SPISD::TLS_LD",  SDTSPtlsld>;
+def tlscall       : SDNode<"SPISD::TLS_CALL", SDT_SPCall,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                             SDNPVariadic]>;
+
 def getPCX        : Operand<i32> {
   let PrintMethod = "printGetPCX";
 }
@@ -242,8 +265,9 @@ let hasSideEffects = 1, mayStore = 1 in {
                    [(flushw)]>;
 }
 
-def UNIMP : F2_1<0b000, (outs), (ins i32imm:$val),
-                "unimp $val", []>;
+let rd = 0 in
+  def UNIMP : F2_1<0b000, (outs), (ins i32imm:$val),
+                  "unimp $val", []>;
 
 // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
 // instruction selection into a branch sequence.  This has to handle all
@@ -263,6 +287,11 @@ let Uses = [ICC], usesCustomInserter = 1 in {
    : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
             "; SELECT_CC_DFP_ICC PSEUDO!",
             [(set f64:$dst, (SPselecticc f64:$T, f64:$F, imm:$Cond))]>;
+
+  def SELECT_CC_QFP_ICC
+   : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_QFP_ICC PSEUDO!",
+            [(set f128:$dst, (SPselecticc f128:$T, f128:$F, imm:$Cond))]>;
 }
 
 let usesCustomInserter = 1, Uses = [FCC] in {
@@ -280,17 +309,21 @@ let usesCustomInserter = 1, Uses = [FCC] in {
    : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
             "; SELECT_CC_DFP_FCC PSEUDO!",
             [(set f64:$dst, (SPselectfcc f64:$T, f64:$F, imm:$Cond))]>;
+  def SELECT_CC_QFP_FCC
+   : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_QFP_FCC PSEUDO!",
+            [(set f128:$dst, (SPselectfcc f128:$T, f128:$F, imm:$Cond))]>;
 }
 
 
 // Section A.3 - Synthetic Instructions, p. 85
 // special cases of JMPL:
 let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1 in {
-  let rd = O7.Num, rs1 = G0.Num in
+  let rd = 0, rs1 = 15 in
     def RETL: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
                    "jmp %o7+$val", [(retflag simm13:$val)]>;
 
-  let rd = I7.Num, rs1 = G0.Num in
+  let rd = 0, rs1 = 31 in
     def RET: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
                   "jmp %i7+$val", []>;
 }
@@ -354,56 +387,76 @@ def LDDFri : F3_2<3, 0b100011,
                   (outs DFPRegs:$dst), (ins MEMri:$addr),
                   "ldd [$addr], $dst",
                   [(set f64:$dst, (load ADDRri:$addr))]>;
+def LDQFrr : F3_1<3, 0b100010,
+                  (outs QFPRegs:$dst), (ins MEMrr:$addr),
+                  "ldq [$addr], $dst",
+                  [(set f128:$dst, (load ADDRrr:$addr))]>,
+                  Requires<[HasV9, HasHardQuad]>;
+def LDQFri : F3_2<3, 0b100010,
+                  (outs QFPRegs:$dst), (ins MEMri:$addr),
+                  "ldq [$addr], $dst",
+                  [(set f128:$dst, (load ADDRri:$addr))]>,
+                  Requires<[HasV9, HasHardQuad]>;
 
 // Section B.4 - Store Integer Instructions, p. 95
 def STBrr : F3_1<3, 0b000101,
-                 (outs), (ins MEMrr:$addr, IntRegs:$src),
-                 "stb $src, [$addr]",
-                 [(truncstorei8 i32:$src, ADDRrr:$addr)]>;
+                 (outs), (ins MEMrr:$addr, IntRegs:$rd),
+                 "stb $rd, [$addr]",
+                 [(truncstorei8 i32:$rd, ADDRrr:$addr)]>;
 def STBri : F3_2<3, 0b000101,
-                 (outs), (ins MEMri:$addr, IntRegs:$src),
-                 "stb $src, [$addr]",
-                 [(truncstorei8 i32:$src, ADDRri:$addr)]>;
+                 (outs), (ins MEMri:$addr, IntRegs:$rd),
+                 "stb $rd, [$addr]",
+                 [(truncstorei8 i32:$rd, ADDRri:$addr)]>;
 def STHrr : F3_1<3, 0b000110,
-                 (outs), (ins MEMrr:$addr, IntRegs:$src),
-                 "sth $src, [$addr]",
-                 [(truncstorei16 i32:$src, ADDRrr:$addr)]>;
+                 (outs), (ins MEMrr:$addr, IntRegs:$rd),
+                 "sth $rd, [$addr]",
+                 [(truncstorei16 i32:$rd, ADDRrr:$addr)]>;
 def STHri : F3_2<3, 0b000110,
-                 (outs), (ins MEMri:$addr, IntRegs:$src),
-                 "sth $src, [$addr]",
-                 [(truncstorei16 i32:$src, ADDRri:$addr)]>;
+                 (outs), (ins MEMri:$addr, IntRegs:$rd),
+                 "sth $rd, [$addr]",
+                 [(truncstorei16 i32:$rd, ADDRri:$addr)]>;
 def STrr  : F3_1<3, 0b000100,
-                 (outs), (ins MEMrr:$addr, IntRegs:$src),
-                 "st $src, [$addr]",
-                 [(store i32:$src, ADDRrr:$addr)]>;
+                 (outs), (ins MEMrr:$addr, IntRegs:$rd),
+                 "st $rd, [$addr]",
+                 [(store i32:$rd, ADDRrr:$addr)]>;
 def STri  : F3_2<3, 0b000100,
-                 (outs), (ins MEMri:$addr, IntRegs:$src),
-                 "st $src, [$addr]",
-                 [(store i32:$src, ADDRri:$addr)]>;
+                 (outs), (ins MEMri:$addr, IntRegs:$rd),
+                 "st $rd, [$addr]",
+                 [(store i32:$rd, ADDRri:$addr)]>;
 
 // Section B.5 - Store Floating-point Instructions, p. 97
 def STFrr   : F3_1<3, 0b100100,
-                   (outs), (ins MEMrr:$addr, FPRegs:$src),
-                   "st $src, [$addr]",
-                   [(store f32:$src, ADDRrr:$addr)]>;
+                   (outs), (ins MEMrr:$addr, FPRegs:$rd),
+                   "st $rd, [$addr]",
+                   [(store f32:$rd, ADDRrr:$addr)]>;
 def STFri   : F3_2<3, 0b100100,
-                   (outs), (ins MEMri:$addr, FPRegs:$src),
-                   "st $src, [$addr]",
-                   [(store f32:$src, ADDRri:$addr)]>;
+                   (outs), (ins MEMri:$addr, FPRegs:$rd),
+                   "st $rd, [$addr]",
+                   [(store f32:$rd, ADDRri:$addr)]>;
 def STDFrr  : F3_1<3, 0b100111,
-                   (outs), (ins MEMrr:$addr, DFPRegs:$src),
-                   "std  $src, [$addr]",
-                   [(store f64:$src, ADDRrr:$addr)]>;
+                   (outs), (ins MEMrr:$addr, DFPRegs:$rd),
+                   "std  $rd, [$addr]",
+                   [(store f64:$rd, ADDRrr:$addr)]>;
 def STDFri  : F3_2<3, 0b100111,
-                   (outs), (ins MEMri:$addr, DFPRegs:$src),
-                   "std $src, [$addr]",
-                   [(store f64:$src, ADDRri:$addr)]>;
+                   (outs), (ins MEMri:$addr, DFPRegs:$rd),
+                   "std $rd, [$addr]",
+                   [(store f64:$rd, ADDRri:$addr)]>;
+def STQFrr  : F3_1<3, 0b100110,
+                   (outs), (ins MEMrr:$addr, QFPRegs:$rd),
+                   "stq  $rd, [$addr]",
+                   [(store f128:$rd, ADDRrr:$addr)]>,
+                   Requires<[HasV9, HasHardQuad]>;
+def STQFri  : F3_2<3, 0b100110,
+                   (outs), (ins MEMri:$addr, QFPRegs:$rd),
+                   "stq $rd, [$addr]",
+                   [(store f128:$rd, ADDRri:$addr)]>,
+                   Requires<[HasV9, HasHardQuad]>;
 
 // Section B.9 - SETHI Instruction, p. 104
 def SETHIi: F2_1<0b100,
-                 (outs IntRegs:$dst), (ins i32imm:$src),
-                 "sethi $src, $dst",
-                 [(set i32:$dst, SETHIimm:$src)]>;
+                 (outs IntRegs:$rd), (ins i32imm:$imm22),
+                 "sethi $imm22, $rd",
+                 [(set i32:$rd, SETHIimm:$imm22)]>;
 
 // Section B.10 - NOP Instruction, p. 105
 // (It's a special case of SETHI)
@@ -449,30 +502,32 @@ defm SRA : F3_12<"sra", 0b100111, sra>;
 defm ADD   : F3_12<"add", 0b000000, add>;
 
 // "LEA" forms of add (patterns to make tblgen happy)
-def LEA_ADDri   : F3_2<2, 0b000000,
-                   (outs IntRegs:$dst), (ins MEMri:$addr),
-                   "add ${addr:arith}, $dst",
-                   [(set iPTR:$dst, ADDRri:$addr)]>;
+let Predicates = [Is32Bit] in
+  def LEA_ADDri   : F3_2<2, 0b000000,
+                     (outs IntRegs:$dst), (ins MEMri:$addr),
+                     "add ${addr:arith}, $dst",
+                     [(set iPTR:$dst, ADDRri:$addr)]>;
 
 let Defs = [ICC] in
   defm ADDCC  : F3_12<"addcc", 0b010000, addc>;
 
-let Uses = [ICC] in
-  defm ADDX  : F3_12<"addx", 0b001000, adde>;
+let Uses = [ICC], Defs = [ICC] in
+  defm ADDX  : F3_12<"addxcc", 0b011000, adde>;
 
 // Section B.15 - Subtract Instructions, p. 110
 defm SUB    : F3_12  <"sub"  , 0b000100, sub>;
-let Uses = [ICC] in
-  defm SUBX   : F3_12  <"subx" , 0b001100, sube>;
+let Uses = [ICC], Defs = [ICC] in
+  defm SUBX   : F3_12  <"subxcc" , 0b011100, sube>;
 
-let Defs = [ICC] in {
+let Defs = [ICC] in
   defm SUBCC  : F3_12  <"subcc", 0b010100, subc>;
 
+let Defs = [ICC], rd = 0 in {
   def CMPrr   : F3_1<2, 0b010100,
                      (outs), (ins IntRegs:$b, IntRegs:$c),
                      "cmp $b, $c",
                      [(SPcmpicc i32:$b, i32:$c)]>;
-  def CMPri   : F3_1<2, 0b010100,
+  def CMPri   : F3_2<2, 0b010100,
                      (outs), (ins IntRegs:$b, i32imm:$c),
                      "cmp $b, $c",
                      [(SPcmpicc i32:$b, (i32 simm13:$c))]>;
@@ -502,23 +557,30 @@ defm RESTORE : F3_12np<"restore", 0b111101>;
 
 // Section B.21 - Branch on Integer Condition Codes Instructions, p. 119
 
+// unconditional branch class.
+class BranchAlways<dag ins, string asmstr, list<dag> pattern>
+  : F2_2<0b010, (outs), ins, asmstr, pattern> {
+  let isBranch     = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let isBarrier    = 1;
+}
+
+let cond = 8 in
+  def BA : BranchAlways<(ins brtarget:$imm22), "ba $imm22", [(br bb:$imm22)]>;
+
 // conditional branch class:
-class BranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
- : F2_2<cc, 0b010, (outs), ins, asmstr, pattern> {
+class BranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, (outs), ins, asmstr, pattern> {
   let isBranch = 1;
   let isTerminator = 1;
   let hasDelaySlot = 1;
 }
 
-let isBarrier = 1 in
-  def BA   : BranchSP<0b1000, (ins brtarget:$dst),
-                      "ba $dst",
-                      [(br bb:$dst)]>;
-
 // Indirect branch instructions.
 let isTerminator = 1, isBarrier = 1,
      hasDelaySlot = 1, isBranch =1,
-     isIndirectBranch = 1 in {
+     isIndirectBranch = 1, rd = 0 in {
   def BINDrr  : F3_1<2, 0b111000,
                    (outs), (ins MEMrr:$ptr),
                    "jmp $ptr",
@@ -529,37 +591,31 @@ let isTerminator = 1, isBarrier = 1,
                    [(brind ADDRri:$ptr)]>;
 }
 
-// FIXME: the encoding for the JIT should look at the condition field.
 let Uses = [ICC] in
-  def BCOND : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
-                         "b$cc $dst",
-                        [(SPbricc bb:$dst, imm:$cc)]>;
-
+  def BCOND : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                         "b$cond $imm22",
+                        [(SPbricc bb:$imm22, imm:$cond)]>;
 
 // Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
 
 // floating-point conditional branch class:
-class FPBranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
- : F2_2<cc, 0b110, (outs), ins, asmstr, pattern> {
+class FPBranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b110, (outs), ins, asmstr, pattern> {
   let isBranch = 1;
   let isTerminator = 1;
   let hasDelaySlot = 1;
 }
 
-// FIXME: the encoding for the JIT should look at the condition field.
 let Uses = [FCC] in
-  def FBCOND  : FPBranchSP<0, (ins brtarget:$dst, CCOp:$cc),
-                              "fb$cc $dst",
-                              [(SPbrfcc bb:$dst, imm:$cc)]>;
+  def FBCOND  : FPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                              "fb$cond $imm22",
+                              [(SPbrfcc bb:$imm22, imm:$cond)]>;
 
 
 // Section B.24 - Call and Link Instruction, p. 125
 // This is the only Format 1 instruction
 let Uses = [O6],
-    hasDelaySlot = 1, isCall = 1,
-    Defs = [O0, O1, O2, O3, O4, O5, O7, G1, G2, G3, G4, G5, G6, G7,
-    D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
-        ICC, FCC, Y] in {
+    hasDelaySlot = 1, isCall = 1 in {
   def CALL : InstSP<(outs), (ins calltarget:$dst, variable_ops),
                     "call $dst", []> {
     bits<30> disp;
@@ -571,21 +627,21 @@ let Uses = [O6],
   def JMPLrr : F3_1<2, 0b111000,
                     (outs), (ins MEMrr:$ptr, variable_ops),
                     "call $ptr",
-                    [(call ADDRrr:$ptr)]>;
+                    [(call ADDRrr:$ptr)]> { let rd = 15; }
   def JMPLri : F3_2<2, 0b111000,
                     (outs), (ins MEMri:$ptr, variable_ops),
                     "call $ptr",
-                    [(call ADDRri:$ptr)]>;
+                    [(call ADDRri:$ptr)]> { let rd = 15; }
 }
 
 // Section B.28 - Read State Register Instructions
-let Uses = [Y] in
+let Uses = [Y], rs1 = 0, rs2 = 0 in
   def RDY : F3_1<2, 0b101000,
                  (outs IntRegs:$dst), (ins),
                  "rd %y, $dst", []>;
 
 // Section B.29 - Write State Register Instructions
-let Defs = [Y] in {
+let Defs = [Y], rd = 0 in {
   def WRYrr : F3_1<2, 0b110000,
                    (outs), (ins IntRegs:$b, IntRegs:$c),
                    "wr $b, $c, %y", []>;
@@ -594,58 +650,93 @@ let Defs = [Y] in {
                    "wr $b, $c, %y", []>;
 }
 // Convert Integer to Floating-point Instructions, p. 141
-def FITOS : F3_3<2, 0b110100, 0b011000100,
+def FITOS : F3_3u<2, 0b110100, 0b011000100,
                  (outs FPRegs:$dst), (ins FPRegs:$src),
                  "fitos $src, $dst",
                  [(set FPRegs:$dst, (SPitof FPRegs:$src))]>;
-def FITOD : F3_3<2, 0b110100, 0b011001000,
+def FITOD : F3_3u<2, 0b110100, 0b011001000,
                  (outs DFPRegs:$dst), (ins FPRegs:$src),
                  "fitod $src, $dst",
                  [(set DFPRegs:$dst, (SPitof FPRegs:$src))]>;
+def FITOQ : F3_3u<2, 0b110100, 0b011001100,
+                 (outs QFPRegs:$dst), (ins FPRegs:$src),
+                 "fitoq $src, $dst",
+                 [(set QFPRegs:$dst, (SPitof FPRegs:$src))]>,
+                 Requires<[HasHardQuad]>;
 
 // Convert Floating-point to Integer Instructions, p. 142
-def FSTOI : F3_3<2, 0b110100, 0b011010001,
+def FSTOI : F3_3u<2, 0b110100, 0b011010001,
                  (outs FPRegs:$dst), (ins FPRegs:$src),
                  "fstoi $src, $dst",
                  [(set FPRegs:$dst, (SPftoi FPRegs:$src))]>;
-def FDTOI : F3_3<2, 0b110100, 0b011010010,
+def FDTOI : F3_3u<2, 0b110100, 0b011010010,
                  (outs FPRegs:$dst), (ins DFPRegs:$src),
                  "fdtoi $src, $dst",
                  [(set FPRegs:$dst, (SPftoi DFPRegs:$src))]>;
+def FQTOI : F3_3u<2, 0b110100, 0b011010011,
+                 (outs FPRegs:$dst), (ins QFPRegs:$src),
+                 "fqtoi $src, $dst",
+                 [(set FPRegs:$dst, (SPftoi QFPRegs:$src))]>,
+                 Requires<[HasHardQuad]>;
 
 // Convert between Floating-point Formats Instructions, p. 143
-def FSTOD : F3_3<2, 0b110100, 0b011001001,
+def FSTOD : F3_3u<2, 0b110100, 0b011001001,
                  (outs DFPRegs:$dst), (ins FPRegs:$src),
                  "fstod $src, $dst",
                  [(set f64:$dst, (fextend f32:$src))]>;
-def FDTOS : F3_3<2, 0b110100, 0b011000110,
+def FSTOQ : F3_3u<2, 0b110100, 0b011001101,
+                 (outs QFPRegs:$dst), (ins FPRegs:$src),
+                 "fstoq $src, $dst",
+                 [(set f128:$dst, (fextend f32:$src))]>,
+                 Requires<[HasHardQuad]>;
+def FDTOS : F3_3u<2, 0b110100, 0b011000110,
                  (outs FPRegs:$dst), (ins DFPRegs:$src),
                  "fdtos $src, $dst",
                  [(set f32:$dst, (fround f64:$src))]>;
+def FDTOQ : F3_3u<2, 0b110100, 0b01101110,
+                 (outs QFPRegs:$dst), (ins DFPRegs:$src),
+                 "fdtoq $src, $dst",
+                 [(set f128:$dst, (fextend f64:$src))]>,
+                 Requires<[HasHardQuad]>;
+def FQTOS : F3_3u<2, 0b110100, 0b011000111,
+                 (outs FPRegs:$dst), (ins QFPRegs:$src),
+                 "fqtos $src, $dst",
+                 [(set f32:$dst, (fround f128:$src))]>,
+                 Requires<[HasHardQuad]>;
+def FQTOD : F3_3u<2, 0b110100, 0b011001011,
+                 (outs DFPRegs:$dst), (ins QFPRegs:$src),
+                 "fqtod $src, $dst",
+                 [(set f64:$dst, (fround f128:$src))]>,
+                 Requires<[HasHardQuad]>;
 
 // Floating-point Move Instructions, p. 144
-def FMOVS : F3_3<2, 0b110100, 0b000000001,
+def FMOVS : F3_3u<2, 0b110100, 0b000000001,
                  (outs FPRegs:$dst), (ins FPRegs:$src),
                  "fmovs $src, $dst", []>;
-def FNEGS : F3_3<2, 0b110100, 0b000000101,
+def FNEGS : F3_3u<2, 0b110100, 0b000000101,
                  (outs FPRegs:$dst), (ins FPRegs:$src),
                  "fnegs $src, $dst",
                  [(set f32:$dst, (fneg f32:$src))]>;
-def FABSS : F3_3<2, 0b110100, 0b000001001,
+def FABSS : F3_3u<2, 0b110100, 0b000001001,
                  (outs FPRegs:$dst), (ins FPRegs:$src),
                  "fabss $src, $dst",
                  [(set f32:$dst, (fabs f32:$src))]>;
 
 
 // Floating-point Square Root Instructions, p.145
-def FSQRTS : F3_3<2, 0b110100, 0b000101001,
+def FSQRTS : F3_3u<2, 0b110100, 0b000101001,
                   (outs FPRegs:$dst), (ins FPRegs:$src),
                   "fsqrts $src, $dst",
                   [(set f32:$dst, (fsqrt f32:$src))]>;
-def FSQRTD : F3_3<2, 0b110100, 0b000101010,
+def FSQRTD : F3_3u<2, 0b110100, 0b000101010,
                   (outs DFPRegs:$dst), (ins DFPRegs:$src),
                   "fsqrtd $src, $dst",
                   [(set f64:$dst, (fsqrt f64:$src))]>;
+def FSQRTQ : F3_3u<2, 0b110100, 0b000101011,
+                  (outs QFPRegs:$dst), (ins QFPRegs:$src),
+                  "fsqrtq $src, $dst",
+                  [(set f128:$dst, (fsqrt f128:$src))]>,
+                  Requires<[HasHardQuad]>;
 
 
 
@@ -658,6 +749,12 @@ def FADDD  : F3_3<2, 0b110100, 0b001000010,
                   (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
                   "faddd $src1, $src2, $dst",
                   [(set f64:$dst, (fadd f64:$src1, f64:$src2))]>;
+def FADDQ  : F3_3<2, 0b110100, 0b001000011,
+                  (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+                  "faddq $src1, $src2, $dst",
+                  [(set f128:$dst, (fadd f128:$src1, f128:$src2))]>,
+                  Requires<[HasHardQuad]>;
+
 def FSUBS  : F3_3<2, 0b110100, 0b001000101,
                   (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
                   "fsubs $src1, $src2, $dst",
@@ -666,6 +763,12 @@ def FSUBD  : F3_3<2, 0b110100, 0b001000110,
                   (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
                   "fsubd $src1, $src2, $dst",
                   [(set f64:$dst, (fsub f64:$src1, f64:$src2))]>;
+def FSUBQ  : F3_3<2, 0b110100, 0b001000111,
+                  (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+                  "fsubq $src1, $src2, $dst",
+                  [(set f128:$dst, (fsub f128:$src1, f128:$src2))]>,
+                  Requires<[HasHardQuad]>;
+
 
 // Floating-point Multiply and Divide Instructions, p. 147
 def FMULS  : F3_3<2, 0b110100, 0b001001001,
@@ -676,11 +779,24 @@ def FMULD  : F3_3<2, 0b110100, 0b001001010,
                   (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
                   "fmuld $src1, $src2, $dst",
                   [(set f64:$dst, (fmul f64:$src1, f64:$src2))]>;
+def FMULQ  : F3_3<2, 0b110100, 0b001001011,
+                  (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+                  "fmulq $src1, $src2, $dst",
+                  [(set f128:$dst, (fmul f128:$src1, f128:$src2))]>,
+                  Requires<[HasHardQuad]>;
+
 def FSMULD : F3_3<2, 0b110100, 0b001101001,
                   (outs DFPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
                   "fsmuld $src1, $src2, $dst",
                   [(set f64:$dst, (fmul (fextend f32:$src1),
                                         (fextend f32:$src2)))]>;
+def FDMULQ : F3_3<2, 0b110100, 0b001101110,
+                  (outs QFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+                  "fdmulq $src1, $src2, $dst",
+                  [(set f128:$dst, (fmul (fextend f64:$src1),
+                                         (fextend f64:$src2)))]>,
+                  Requires<[HasHardQuad]>;
+
 def FDIVS  : F3_3<2, 0b110100, 0b001001101,
                  (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
                  "fdivs $src1, $src2, $dst",
@@ -689,21 +805,61 @@ def FDIVD  : F3_3<2, 0b110100, 0b001001110,
                  (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
                  "fdivd $src1, $src2, $dst",
                  [(set f64:$dst, (fdiv f64:$src1, f64:$src2))]>;
+def FDIVQ  : F3_3<2, 0b110100, 0b001001111,
+                 (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+                 "fdivq $src1, $src2, $dst",
+                 [(set f128:$dst, (fdiv f128:$src1, f128:$src2))]>,
+                 Requires<[HasHardQuad]>;
 
 // Floating-point Compare Instructions, p. 148
 // Note: the 2nd template arg is different for these guys.
 // Note 2: the result of a FCMP is not available until the 2nd cycle
-// after the instr is retired, but there is no interlock. This behavior
-// is modelled with a forced noop after the instruction.
+// after the instr is retired, but there is no interlock in Sparc V8.
+// This behavior is modeled with a forced noop after the instruction in
+// DelaySlotFiller.
+
 let Defs = [FCC] in {
-  def FCMPS  : F3_3<2, 0b110101, 0b001010001,
+  def FCMPS  : F3_3c<2, 0b110101, 0b001010001,
                    (outs), (ins FPRegs:$src1, FPRegs:$src2),
-                   "fcmps $src1, $src2\n\tnop",
+                   "fcmps $src1, $src2",
                    [(SPcmpfcc f32:$src1, f32:$src2)]>;
-  def FCMPD  : F3_3<2, 0b110101, 0b001010010,
+  def FCMPD  : F3_3c<2, 0b110101, 0b001010010,
                    (outs), (ins DFPRegs:$src1, DFPRegs:$src2),
-                   "fcmpd $src1, $src2\n\tnop",
+                   "fcmpd $src1, $src2",
                    [(SPcmpfcc f64:$src1, f64:$src2)]>;
+  def FCMPQ  : F3_3c<2, 0b110101, 0b001010011,
+                   (outs), (ins QFPRegs:$src1, QFPRegs:$src2),
+                   "fcmpq $src1, $src2",
+                   [(SPcmpfcc f128:$src1, f128:$src2)]>,
+                   Requires<[HasHardQuad]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions for Thread Local Storage(TLS).
+//===----------------------------------------------------------------------===//
+
+def TLS_ADDrr : F3_1<2, 0b000000,
+                    (outs IntRegs:$rd),
+                    (ins IntRegs:$rs1, IntRegs:$rs2, TLSSym:$sym),
+                    "add $rs1, $rs2, $rd, $sym",
+                    [(set i32:$rd,
+                        (tlsadd i32:$rs1, i32:$rs2, tglobaltlsaddr:$sym))]>;
+
+let mayLoad = 1 in
+  def TLS_LDrr : F3_1<3, 0b000000,
+                      (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+                      "ld [$addr], $dst, $sym",
+                      [(set i32:$dst,
+                          (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
+
+let Uses = [O6], isCall = 1, hasDelaySlot = 1 in
+  def TLS_CALL : InstSP<(outs),
+                        (ins calltarget:$disp, TLSSym:$sym, variable_ops),
+                        "call $disp, $sym",
+                        [(tlscall texternalsym:$disp, tglobaltlsaddr:$sym)]> {
+  bits<30> disp;
+  let op = 1;
+  let Inst{29-0} = disp;
 }
 
 //===----------------------------------------------------------------------===//
@@ -713,73 +869,108 @@ let Defs = [FCC] in {
 // V9 Conditional Moves.
 let Predicates = [HasV9], Constraints = "$f = $rd" in {
   // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
-  // FIXME: Add instruction encodings for the JIT some day.
-  let Uses = [ICC] in {
+  let Uses = [ICC], cc = 0b100 in {
     def MOVICCrr
-      : Pseudo<(outs IntRegs:$rd), (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cc),
-               "mov$cc %icc, $rs2, $rd",
-               [(set i32:$rd, (SPselecticc i32:$rs2, i32:$f, imm:$cc))]>;
+      : F4_1<0b101100, (outs IntRegs:$rd),
+             (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+             "mov$cond %icc, $rs2, $rd",
+             [(set i32:$rd, (SPselecticc i32:$rs2, i32:$f, imm:$cond))]>;
+
     def MOVICCri
-      : Pseudo<(outs IntRegs:$rd), (ins i32imm:$i, IntRegs:$f, CCOp:$cc),
-               "mov$cc %icc, $i, $rd",
-               [(set i32:$rd, (SPselecticc simm11:$i, i32:$f, imm:$cc))]>;
+      : F4_2<0b101100, (outs IntRegs:$rd),
+             (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+             "mov$cond %icc, $simm11, $rd",
+             [(set i32:$rd,
+                    (SPselecticc simm11:$simm11, i32:$f, imm:$cond))]>;
   }
 
-  let Uses = [FCC] in {
+  let Uses = [FCC], cc = 0b000 in {
     def MOVFCCrr
-      : Pseudo<(outs IntRegs:$rd), (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cc),
-               "mov$cc %fcc0, $rs2, $rd",
-               [(set i32:$rd, (SPselectfcc i32:$rs2, i32:$f, imm:$cc))]>;
+      : F4_1<0b101100, (outs IntRegs:$rd),
+             (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+             "mov$cond %fcc0, $rs2, $rd",
+             [(set i32:$rd, (SPselectfcc i32:$rs2, i32:$f, imm:$cond))]>;
     def MOVFCCri
-      : Pseudo<(outs IntRegs:$rd), (ins i32imm:$i, IntRegs:$f, CCOp:$cc),
-               "mov$cc %fcc0, $i, $rd",
-               [(set i32:$rd, (SPselectfcc simm11:$i, i32:$f, imm:$cc))]>;
+      : F4_2<0b101100, (outs IntRegs:$rd),
+             (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+             "mov$cond %fcc0, $simm11, $rd",
+             [(set i32:$rd,
+                    (SPselectfcc simm11:$simm11, i32:$f, imm:$cond))]>;
   }
 
-  let Uses = [ICC] in {
+  let Uses = [ICC], opf_cc = 0b100 in {
     def FMOVS_ICC
-      : Pseudo<(outs FPRegs:$rd), (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cc),
-               "fmovs$cc %icc, $rs2, $rd",
-               [(set f32:$rd, (SPselecticc f32:$rs2, f32:$f, imm:$cc))]>;
+      : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+             (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+             "fmovs$cond %icc, $rs2, $rd",
+             [(set f32:$rd, (SPselecticc f32:$rs2, f32:$f, imm:$cond))]>;
     def FMOVD_ICC
-      : Pseudo<(outs DFPRegs:$rd), (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cc),
-               "fmovd$cc %icc, $rs2, $rd",
-               [(set f64:$rd, (SPselecticc f64:$rs2, f64:$f, imm:$cc))]>;
+      : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+               (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+               "fmovd$cond %icc, $rs2, $rd",
+               [(set f64:$rd, (SPselecticc f64:$rs2, f64:$f, imm:$cond))]>;
+    def FMOVQ_ICC
+      : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+               (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+               "fmovd$cond %icc, $rs2, $rd",
+               [(set f128:$rd, (SPselecticc f128:$rs2, f128:$f, imm:$cond))]>;
   }
 
-  let Uses = [FCC] in {
+  let Uses = [FCC], opf_cc = 0b000 in {
     def FMOVS_FCC
-      : Pseudo<(outs FPRegs:$rd), (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cc),
-               "fmovs$cc %fcc0, $rs2, $rd",
-               [(set f32:$rd, (SPselectfcc f32:$rs2, f32:$f, imm:$cc))]>;
+      : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+             (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+             "fmovs$cond %fcc0, $rs2, $rd",
+             [(set f32:$rd, (SPselectfcc f32:$rs2, f32:$f, imm:$cond))]>;
     def FMOVD_FCC
-      : Pseudo<(outs DFPRegs:$rd), (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cc),
-               "fmovd$cc %fcc0, $rs2, $rd",
-               [(set f64:$rd, (SPselectfcc f64:$rs2, f64:$f, imm:$cc))]>;
+      : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+             (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+             "fmovd$cond %fcc0, $rs2, $rd",
+             [(set f64:$rd, (SPselectfcc f64:$rs2, f64:$f, imm:$cond))]>;
+    def FMOVQ_FCC
+      : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+             (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+             "fmovd$cond %fcc0, $rs2, $rd",
+             [(set f128:$rd, (SPselectfcc f128:$rs2, f128:$f, imm:$cond))]>;
   }
 
 }
 
 // Floating-Point Move Instructions, p. 164 of the V9 manual.
 let Predicates = [HasV9] in {
-  def FMOVD : F3_3<2, 0b110100, 0b000000010,
+  def FMOVD : F3_3u<2, 0b110100, 0b000000010,
                    (outs DFPRegs:$dst), (ins DFPRegs:$src),
                    "fmovd $src, $dst", []>;
-  def FNEGD : F3_3<2, 0b110100, 0b000000110,
+  def FMOVQ : F3_3u<2, 0b110100, 0b000000011,
+                   (outs QFPRegs:$dst), (ins QFPRegs:$src),
+                   "fmovq $src, $dst", []>,
+                   Requires<[HasHardQuad]>;
+  def FNEGD : F3_3u<2, 0b110100, 0b000000110,
                    (outs DFPRegs:$dst), (ins DFPRegs:$src),
                    "fnegd $src, $dst",
                    [(set f64:$dst, (fneg f64:$src))]>;
-  def FABSD : F3_3<2, 0b110100, 0b000001010,
+  def FNEGQ : F3_3u<2, 0b110100, 0b000000111,
+                   (outs QFPRegs:$dst), (ins QFPRegs:$src),
+                   "fnegq $src, $dst",
+                   [(set f128:$dst, (fneg f128:$src))]>,
+                   Requires<[HasHardQuad]>;
+  def FABSD : F3_3u<2, 0b110100, 0b000001010,
                    (outs DFPRegs:$dst), (ins DFPRegs:$src),
                    "fabsd $src, $dst",
                    [(set f64:$dst, (fabs f64:$src))]>;
+  def FABSQ : F3_3u<2, 0b110100, 0b000001011,
+                   (outs QFPRegs:$dst), (ins QFPRegs:$src),
+                   "fabsq $src, $dst",
+                   [(set f128:$dst, (fabs f128:$src))]>,
+                   Requires<[HasHardQuad]>;
 }
 
 // POPCrr - This does a ctpop of a 64-bit register.  As such, we have to clear
 // the top 32-bits before using it.  To do this clearing, we use a SLLri X,0.
-def POPCrr : F3_1<2, 0b101110,
-                  (outs IntRegs:$dst), (ins IntRegs:$src),
-                  "popc $src, $dst", []>, Requires<[HasV9]>;
+let rs1 = 0 in
+  def POPCrr : F3_1<2, 0b101110,
+                    (outs IntRegs:$dst), (ins IntRegs:$src),
+                    "popc $src, $dst", []>, Requires<[HasV9]>;
 def : Pat<(ctpop i32:$src),
           (POPCrr (SLLri $src, 0))>;
 
@@ -801,6 +992,14 @@ def : Pat<(SPlo tglobaladdr:$in), (ORri (i32 G0), tglobaladdr:$in)>;
 def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
 def : Pat<(SPlo tconstpool:$in), (ORri (i32 G0), tconstpool:$in)>;
 
+// GlobalTLS addresses
+def : Pat<(SPhi tglobaltlsaddr:$in), (SETHIi tglobaltlsaddr:$in)>;
+def : Pat<(SPlo tglobaltlsaddr:$in), (ORri (i32 G0), tglobaltlsaddr:$in)>;
+def : Pat<(add (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (ADDri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+def : Pat<(xor (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (XORri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+
 // Blockaddress
 def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
 def : Pat<(SPlo tblockaddress:$in), (ORri (i32 G0), tblockaddress:$in)>;
diff --git a/lib/Target/Sparc/SparcJITInfo.cpp b/lib/Target/Sparc/SparcJITInfo.cpp
new file mode 100644
index 0000000..6493c7d
--- /dev/null
+++ b/lib/Target/Sparc/SparcJITInfo.cpp
@@ -0,0 +1,165 @@
+//===-- SparcJITInfo.cpp - Implement the Sparc JIT Interface --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Sparc target.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "jit"
+#include "SparcJITInfo.h"
+#include "SparcRelocations.h"
+
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Support/Memory.h"
+
+using namespace llvm;
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+extern "C" void SparcCompilationCallback();
+
+extern "C" {
+#if defined (__sparc__)
+  asm(
+      ".text\n"
+      "\t.align 4\n"
+      "\t.global SparcCompilationCallback\n"
+      "\t.type SparcCompilationCallback, #function\n"
+      "SparcCompilationCallback:\n"
+      // Save current register window.
+      "\tsave %sp, -192, %sp\n"
+      // stubaddr+4 is in %g1.
+      "\tcall SparcCompilationCallbackC\n"
+      "\t  sub %g1, 4, %o0\n"
+      // restore original register window and
+      // copy %o0 to %g1
+      "\t  restore %o0, 0, %g1\n"
+      // call the new stub
+      "\tjmp %g1\n"
+      "\t  nop\n"
+      "\t.size   SparcCompilationCallback, .-SparcCompilationCallback"
+      );
+
+#else
+  void SparcCompilationCallback() {
+    llvm_unreachable(
+      "Cannot call SparcCompilationCallback() on a non-sparc arch!");
+  }
+#endif
+}
+
+#define HI(Val) (((unsigned)(Val)) >> 10)
+#define LO(Val) (((unsigned)(Val)) & 0x3FF)
+
+#define SETHI_INST(imm, rd)    (0x01000000 | ((rd) << 25) | ((imm) & 0x3FFFFF))
+#define JMP_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x38 << 19) \
+                                | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define NOP_INST               SETHI_INST(0, 0)
+
+extern "C" void *SparcCompilationCallbackC(intptr_t StubAddr) {
+  // Get the address of the compiled code for this function.
+  intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
+
+  // Rewrite the function stub so that we don't end up here every time we
+  // execute the call. We're replacing the first three instructions of the
+  // stub with code that jumps to the compiled function:
+  //   sethi %hi(NewVal), %g1
+  //   jmp %g1+%lo(NewVal)
+  //   nop
+
+  *(intptr_t *)(StubAddr)      = SETHI_INST(HI(NewVal), 1);
+  *(intptr_t *)(StubAddr + 4)  = JMP_INST(1, LO(NewVal), 0);
+  *(intptr_t *)(StubAddr + 8)  = NOP_INST;
+
+  sys::Memory::InvalidateInstructionCache((void*) StubAddr, 12);
+  return (void*)StubAddr;
+}
+
+void SparcJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  assert(0 && "FIXME: Implement SparcJITInfo::replaceMachineCodeForFunction");
+}
+
+
+TargetJITInfo::StubLayout SparcJITInfo::getStubLayout() {
+  // The stub contains 3 4-byte instructions, aligned at 4 bytes. See
+  // emitFunctionStub for details.
+
+  StubLayout Result = { 3*4, 4 };
+  return Result;
+}
+
+void *SparcJITInfo::emitFunctionStub(const Function *F, void *Fn,
+                                     JITCodeEmitter &JCE)
+{
+  JCE.emitAlignment(4);
+  void *Addr = (void*) (JCE.getCurrentPCValue());
+  if (!sys::Memory::setRangeWritable(Addr, 12))
+    llvm_unreachable("ERROR: Unable to mark stub writable.");
+
+  intptr_t EmittedAddr;
+  if (Fn != (void*)(intptr_t)SparcCompilationCallback)
+    EmittedAddr = (intptr_t)Fn;
+  else
+    EmittedAddr = (intptr_t)SparcCompilationCallback;
+
+  // sethi %hi(EmittedAddr), %g1
+  // jmp   %g1+%lo(EmittedAddr), %g1
+  // nop
+
+  JCE.emitWordBE(SETHI_INST(HI(EmittedAddr), 1));
+  JCE.emitWordBE(JMP_INST(1, LO(EmittedAddr), 1));
+  JCE.emitWordBE(NOP_INST);
+
+  sys::Memory::InvalidateInstructionCache(Addr, 12);
+  if (!sys::Memory::setRangeExecutable(Addr, 12))
+    llvm_unreachable("ERROR: Unable to mark stub executable.");
+
+  return Addr;
+}
+
+TargetJITInfo::LazyResolverFn
+SparcJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+  return SparcCompilationCallback;
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void SparcJITInfo::relocate(void *Function, MachineRelocation *MR,
+                            unsigned NumRelocs, unsigned char *GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t) MR->getResultPointer();
+
+    switch ((SP::RelocationType) MR->getRelocationType()) {
+    case SP::reloc_sparc_hi:
+      ResultPtr = (ResultPtr >> 10) & 0x3fffff;
+      break;
+
+    case SP::reloc_sparc_lo:
+      ResultPtr = (ResultPtr & 0x3ff);
+      break;
+
+    case SP::reloc_sparc_pc30:
+      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x3fffffff;
+      break;
+
+    case SP::reloc_sparc_pc22:
+      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x3fffff;
+      break;
+
+    case SP::reloc_sparc_pc19:
+      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x7ffff;
+      break;
+    }
+    *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+  }
+}
diff --git a/lib/Target/Sparc/SparcJITInfo.h b/lib/Target/Sparc/SparcJITInfo.h
new file mode 100644
index 0000000..9c6e488
--- /dev/null
+++ b/lib/Target/Sparc/SparcJITInfo.h
@@ -0,0 +1,67 @@
+//==- SparcJITInfo.h - Sparc Implementation of the JIT Interface -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCJITINFO_H
+#define SPARCJITINFO_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+class SparcTargetMachine;
+
+class SparcJITInfo : public TargetJITInfo {
+
+  bool IsPIC;
+
+  public:
+  explicit SparcJITInfo()
+    :  IsPIC(false) {}
+
+  /// replaceMachineCodeForFunction - Make it so that calling the function
+  /// whose machine code is at OLD turns into a call to NEW, perhaps by
+  /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+  /// code.
+  ///
+  virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+  // getStubLayout - Returns the size and alignment of the largest call stub
+  // on Sparc.
+  virtual StubLayout getStubLayout();
+
+
+  /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+  /// small native function that simply calls the function at the specified
+  /// address.
+  virtual void *emitFunctionStub(const Function *F, void *Fn,
+                                 JITCodeEmitter &JCE);
+
+  /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+  virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+  /// relocate - Before the JIT can run a block of code that has been emitted,
+  /// it must rewrite the code to contain the actual addresses of any
+  /// referenced global symbols.
+  virtual void relocate(void *Function, MachineRelocation *MR,
+                        unsigned NumRelocs, unsigned char *GOTBase);
+
+  /// Initialize - Initialize internal stage for the function being JITted.
+  void Initialize(const MachineFunction &MF, bool isPIC) {
+    IsPIC = isPIC;
+  }
+
+};
+}
+
+#endif
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index dc97f06..c98613a 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -40,8 +40,17 @@ SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st)
 
 const uint16_t* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
                                                                          const {
-  static const uint16_t CalleeSavedRegs[] = { 0 };
-  return CalleeSavedRegs;
+  return CSR_SaveList;
+}
+
+const uint32_t*
+SparcRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  return CSR_RegMask;
+}
+
+const uint32_t*
+SparcRegisterInfo::getRTCallPreservedMask(CallingConv::ID CC) const {
+  return RTCSR_RegMask;
 }
 
 BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -65,6 +74,15 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(SP::G0);
   Reserved.set(SP::G6);
   Reserved.set(SP::G7);
+
+  // Unaliased double registers are not available in non-V9 targets.
+  if (!Subtarget.isV9()) {
+    for (unsigned n = 0; n != 16; ++n) {
+      for (MCRegAliasIterator AI(SP::D16 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+
   return Reserved;
 }
 
@@ -74,6 +92,62 @@ SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF,
   return Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
 }
 
+static void replaceFI(MachineFunction &MF,
+                      MachineBasicBlock::iterator II,
+                      MachineInstr &MI,
+                      DebugLoc dl,
+                      unsigned FIOperandNum, int Offset,
+                      unsigned FramePtr)
+{
+  // Replace frame index with a frame pointer reference.
+  if (Offset >= -4096 && Offset <= 4095) {
+    // If the offset is small enough to fit in the immediate field, directly
+    // encode it.
+    MI.getOperand(FIOperandNum).ChangeToRegister(FramePtr, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  // FIXME: it would be better to scavenge a register here instead of
+  // reserving G1 all of the time.
+  if (Offset >= 0) {
+    // Emit nonnegaive immediates with sethi + or.
+    // sethi %hi(Offset), %g1
+    // add %g1, %fp, %g1
+    // Insert G1+%lo(offset) into the user.
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1)
+      .addImm(HI22(Offset));
+
+
+    // Emit G1 = G1 + I6
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+      .addReg(FramePtr);
+    // Insert: G1+%lo(offset) into the user.
+    MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(LO10(Offset));
+    return;
+  }
+
+  // Emit Negative numbers with sethi + xor
+  // sethi %hix(Offset), %g1
+  // xor  %g1, %lox(offset), %g1
+  // add %g1, %fp, %g1
+  // Insert: G1 + 0 into the user.
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1)
+    .addImm(HIX22(Offset));
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::XORri), SP::G1)
+    .addReg(SP::G1).addImm(LOX10(Offset));
+
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+    .addReg(FramePtr);
+  // Insert: G1+%lo(offset) into the user.
+  MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+}
+
+
 void
 SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                        int SPAdj, unsigned FIOperandNum,
@@ -98,35 +172,40 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     Offset += (stackSize) ? Subtarget.getAdjustedFrameSize(stackSize) : 0 ;
   }
 
-  // Replace frame index with a frame pointer reference.
-  if (Offset >= -4096 && Offset <= 4095) {
-    // If the offset is small enough to fit in the immediate field, directly
-    // encode it.
-    MI.getOperand(FIOperandNum).ChangeToRegister(FramePtr, false);
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
-  } else {
-    // Otherwise, emit a G1 = SETHI %hi(offset).  FIXME: it would be better to
-    // scavenge a register here instead of reserving G1 all of the time.
-    const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-    unsigned OffHi = (unsigned)Offset >> 10U;
-    BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
-    // Emit G1 = G1 + I6
-    BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
-      .addReg(FramePtr);
-    // Insert: G1+%lo(offset) into the user.
-    MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset & ((1 << 10)-1));
+  if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) {
+    if (MI.getOpcode() == SP::STQFri) {
+      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      unsigned SrcReg = MI.getOperand(2).getReg();
+      unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64);
+      unsigned SrcOddReg  = getSubReg(SrcReg, SP::sub_odd64);
+      MachineInstr *StMI =
+        BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri))
+        .addReg(FramePtr).addImm(0).addReg(SrcEvenReg);
+      replaceFI(MF, II, *StMI, dl, 0, Offset, FramePtr);
+      MI.setDesc(TII.get(SP::STDFri));
+      MI.getOperand(2).setReg(SrcOddReg);
+      Offset += 8;
+    } else if (MI.getOpcode() == SP::LDQFri) {
+      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      unsigned DestReg     = MI.getOperand(0).getReg();
+      unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64);
+      unsigned DestOddReg  = getSubReg(DestReg, SP::sub_odd64);
+      MachineInstr *StMI =
+        BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg)
+        .addReg(FramePtr).addImm(0);
+      replaceFI(MF, II, *StMI, dl, 1, Offset, FramePtr);
+
+      MI.setDesc(TII.get(SP::LDDFri));
+      MI.getOperand(0).setReg(DestOddReg);
+      Offset += 8;
+    }
   }
+
+  replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FramePtr);
+
 }
 
 unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return SP::I6;
 }
 
-unsigned SparcRegisterInfo::getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned SparcRegisterInfo::getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index 6b77d4e..00b5a98 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -32,6 +32,9 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
 
   /// Code Generation virtual methods...
   const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+  const uint32_t* getCallPreservedMask(CallingConv::ID CC) const;
+
+  const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const;
 
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
@@ -47,10 +50,6 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
 
   // Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const;
-
-  // Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index a59c442..2a575c0 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -11,8 +11,8 @@
 //  Declarations that describe the Sparc register file
 //===----------------------------------------------------------------------===//
 
-class SparcReg<string n> : Register<n> {
-  field bits<5> Num;
+class SparcReg<bits<16> Enc, string n> : Register<n> {
+  let HWEncoding = Enc;
   let Namespace = "SP";
 }
 
@@ -23,25 +23,31 @@ class SparcCtrlReg<string n>: Register<n> {
 let Namespace = "SP" in {
 def sub_even : SubRegIndex<32>;
 def sub_odd  : SubRegIndex<32, 32>;
+def sub_even64 : SubRegIndex<64>;
+def sub_odd64  : SubRegIndex<64, 64>;
 }
 
 // Registers are identified with 5-bit ID numbers.
 // Ri - 32-bit integer registers
-class Ri<bits<5> num, string n> : SparcReg<n> {
-  let Num = num;
-}
+class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
 // Rf - 32-bit floating-point registers
-class Rf<bits<5> num, string n> : SparcReg<n> {
-  let Num = num;
-}
+class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
 // Rd - Slots in the FP register file for 64-bit floating-point values.
-class Rd<bits<5> num, string n, list<Register> subregs> : SparcReg<n> {
-  let Num = num;
+class Rd<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
   let SubRegs = subregs;
   let SubRegIndices = [sub_even, sub_odd];
   let CoveredBySubRegs = 1;
 }
 
+// Rq - Slots in the FP register file for 128-bit floating-point values.
+class Rq<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+  let SubRegs = subregs;
+  let SubRegIndices = [sub_even64, sub_odd64];
+  let CoveredBySubRegs = 1;
+}
+
 // Control Registers
 def ICC : SparcCtrlReg<"ICC">; // This represents icc and xcc in 64-bit code.
 def FCC : SparcCtrlReg<"FCC">;
@@ -135,6 +141,43 @@ def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
 def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
 def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
 
+// Unaliased double precision floating point registers.
+// FIXME: Define DwarfRegNum for these registers.
+def D16 : SparcReg< 1, "F32">;
+def D17 : SparcReg< 3, "F34">;
+def D18 : SparcReg< 5, "F36">;
+def D19 : SparcReg< 7, "F38">;
+def D20 : SparcReg< 9, "F40">;
+def D21 : SparcReg<11, "F42">;
+def D22 : SparcReg<13, "F44">;
+def D23 : SparcReg<15, "F46">;
+def D24 : SparcReg<17, "F48">;
+def D25 : SparcReg<19, "F50">;
+def D26 : SparcReg<21, "F52">;
+def D27 : SparcReg<23, "F54">;
+def D28 : SparcReg<25, "F56">;
+def D29 : SparcReg<27, "F58">;
+def D30 : SparcReg<29, "F60">;
+def D31 : SparcReg<31, "F62">;
+
+// Aliases of the F* registers used to hold 128-bit for values (long doubles).
+def Q0  : Rq< 0,  "F0", [D0,   D1]>;
+def Q1  : Rq< 4,  "F4", [D2,   D3]>;
+def Q2  : Rq< 8,  "F8", [D4,   D5]>;
+def Q3  : Rq<12, "F12", [D6,   D7]>;
+def Q4  : Rq<16, "F16", [D8,   D9]>;
+def Q5  : Rq<20, "F20", [D10, D11]>;
+def Q6  : Rq<24, "F24", [D12, D13]>;
+def Q7  : Rq<28, "F28", [D14, D15]>;
+def Q8  : Rq< 1, "F32", [D16, D17]>;
+def Q9  : Rq< 5, "F36", [D18, D19]>;
+def Q10 : Rq< 9, "F40", [D20, D21]>;
+def Q11 : Rq<13, "F44", [D22, D23]>;
+def Q12 : Rq<17, "F48", [D24, D25]>;
+def Q13 : Rq<21, "F52", [D26, D27]>;
+def Q14 : Rq<25, "F56", [D28, D29]>;
+def Q15 : Rq<29, "F60", [D30, D31]>;
+
 // Register classes.
 //
 // FIXME: the register order should be defined in terms of the preferred
@@ -158,4 +201,6 @@ def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
 // Floating point register classes.
 def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
 
-def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>;
+def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
+
+def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
diff --git a/lib/Target/Sparc/SparcRelocations.h b/lib/Target/Sparc/SparcRelocations.h
new file mode 100644
index 0000000..388cfe7
--- /dev/null
+++ b/lib/Target/Sparc/SparcRelocations.h
@@ -0,0 +1,41 @@
+//===-- SparcRelocations.h - Sparc Code Relocations -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Sparc target-specific relocation types
+// (for relocation-model=static).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_RELOCATIONS_H
+#define SPARC_RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace SP {
+    enum RelocationType {
+      // reloc_sparc_hi - upper 22 bits
+      reloc_sparc_hi = 1,
+
+      // reloc_sparc_lo - lower 10 bits
+      reloc_sparc_lo = 2,
+
+      // reloc_sparc_pc30 - pc rel. 30 bits for call
+      reloc_sparc_pc30 = 3,
+
+     // reloc_sparc_pc22 - pc rel. 22 bits for branch
+      reloc_sparc_pc22 = 4,
+
+      // reloc_sparc_pc22 - pc rel. 19 bits for branch with icc/xcc
+      reloc_sparc_pc19 = 5
+    };
+  }
+}
+
+#endif
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index f9ce098..7d09d0e 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -30,7 +30,8 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
   IsV9(false),
   V8DeprecatedInsts(false),
   IsVIS(false),
-  Is64Bit(is64Bit) {
+  Is64Bit(is64Bit),
+  HasHardQuad(false) {
 
   // Determine default and user specified characteristics
   std::string CPUName = CPU;
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index 2bf599d..0f81cc9 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -29,6 +29,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
   bool V8DeprecatedInsts;
   bool IsVIS;
   bool Is64Bit;
+  bool HasHardQuad;
 
 public:
   SparcSubtarget(const std::string &TT, const std::string &CPU,
@@ -37,6 +38,7 @@ public:
   bool isV9() const { return IsV9; }
   bool isVIS() const { return IsVIS; }
   bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
+  bool hasHardQuad() const { return HasHardQuad; }
 
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index a7355f4..0f93674 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -65,6 +65,13 @@ bool SparcPassConfig::addInstSelector() {
   return false;
 }
 
+bool SparcTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                        JITCodeEmitter &JCE) {
+  // Machine code emitter pass for Sparc.
+  PM.add(createSparcJITCodeEmitterPass(*this, JCE));
+  return false;
+}
+
 /// addPreEmitPass - This pass may be implemented by targets that want to run
 /// passes immediately before machine code is emitted.  This should return
 /// true if -print-machineinstrs should print out the code after the passes.
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 081075d..8c9bcd3 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -17,6 +17,7 @@
 #include "SparcFrameLowering.h"
 #include "SparcISelLowering.h"
 #include "SparcInstrInfo.h"
+#include "SparcJITInfo.h"
 #include "SparcSelectionDAGInfo.h"
 #include "SparcSubtarget.h"
 #include "llvm/IR/DataLayout.h"
@@ -32,6 +33,7 @@ class SparcTargetMachine : public LLVMTargetMachine {
   SparcTargetLowering TLInfo;
   SparcSelectionDAGInfo TSInfo;
   SparcFrameLowering FrameLowering;
+  SparcJITInfo JITInfo;
 public:
   SparcTargetMachine(const Target &T, StringRef TT,
                      StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -52,10 +54,14 @@ public:
   virtual const SparcSelectionDAGInfo* getSelectionDAGInfo() const {
     return &TSInfo;
   }
+  virtual SparcJITInfo *getJITInfo() {
+    return &JITInfo;
+  }
   virtual const DataLayout       *getDataLayout() const { return &DL; }
 
   // Pass Pipeline Configuration
   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE);
 };
 
 /// SparcV8TargetMachine - Sparc 32-bit target machine
diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index bb71463..4eea163 100644
--- a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -15,7 +15,9 @@ using namespace llvm;
 Target llvm::TheSparcTarget;
 Target llvm::TheSparcV9Target;
 
-extern "C" void LLVMInitializeSparcTargetInfo() { 
-  RegisterTarget<Triple::sparc> X(TheSparcTarget, "sparc", "Sparc");
-  RegisterTarget<Triple::sparcv9> Y(TheSparcV9Target, "sparcv9", "Sparc V9");
+extern "C" void LLVMInitializeSparcTargetInfo() {
+  RegisterTarget<Triple::sparc, /*HasJIT=*/ true>
+    X(TheSparcTarget, "sparc", "Sparc");
+  RegisterTarget<Triple::sparcv9, /*HasJIT=*/ true>
+    Y(TheSparcV9Target, "sparcv9", "Sparc V9");
 }
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 58af2c4..763f40c 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -32,6 +32,7 @@ static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
 namespace {
 enum RegisterKind {
   GR32Reg,
+  GRH32Reg,
   GR64Reg,
   GR128Reg,
   ADDR32Reg,
@@ -262,6 +263,8 @@ public:
 
   // Used by the TableGen code to check for particular operand types.
   bool isGR32() const { return isReg(GR32Reg); }
+  bool isGRH32() const { return isReg(GRH32Reg); }
+  bool isGRX32() const { return false; }
   bool isGR64() const { return isReg(GR64Reg); }
   bool isGR128() const { return isReg(GR128Reg); }
   bool isADDR32() const { return isReg(ADDR32Reg); }
@@ -327,8 +330,9 @@ private:
                     StringRef Mnemonic);
 
 public:
-  SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
-    : MCTargetAsmParser(), STI(sti), Parser(parser) {
+  SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+                   const MCInstrInfo &MII)
+      : MCTargetAsmParser(), STI(sti), Parser(parser) {
     MCAsmParserExtension::Initialize(Parser);
 
     // Initialize the set of available features.
@@ -355,6 +359,14 @@ public:
     return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, GR32Reg);
   }
   OperandMatchResultTy
+  parseGRH32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GRH32Regs, GRH32Reg);
+  }
+  OperandMatchResultTy
+  parseGRX32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+    llvm_unreachable("GRX32 should only be used for pseudo instructions");
+  }
+  OperandMatchResultTy
   parseGR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
     return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, GR64Reg);
   }
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
index ab657f6..d21c0a8 100644
--- a/lib/Target/SystemZ/CMakeLists.txt
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -21,9 +21,11 @@ add_llvm_target(SystemZCodeGen
   SystemZISelLowering.cpp
   SystemZInstrInfo.cpp
   SystemZLongBranch.cpp
+  SystemZMachineFunctionInfo.cpp
   SystemZMCInstLower.cpp
   SystemZRegisterInfo.cpp
   SystemZSelectionDAGInfo.cpp
+  SystemZShortenInst.cpp
   SystemZSubtarget.cpp
   SystemZTargetMachine.cpp
   )
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index 79469b6..fc3c38d 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -48,14 +48,11 @@ extern "C" void LLVMInitializeSystemZDisassembler() {
 }
 
 static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
-                                        const unsigned *Regs,
-                                        bool isAddress = false) {
+                                        const unsigned *Regs) {
   assert(RegNo < 16 && "Invalid register");
-  if (!isAddress || RegNo) {
-    RegNo = Regs[RegNo];
-    if (RegNo == 0)
-      return MCDisassembler::Fail;
-  }
+  RegNo = Regs[RegNo];
+  if (RegNo == 0)
+    return MCDisassembler::Fail;
   Inst.addOperand(MCOperand::CreateReg(RegNo));
   return MCDisassembler::Success;
 }
@@ -66,6 +63,12 @@ static DecodeStatus DecodeGR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
   return decodeRegisterClass(Inst, RegNo, SystemZMC::GR32Regs);
 }
 
+static DecodeStatus DecodeGRH32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GRH32Regs);
+}
+
 static DecodeStatus DecodeGR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
                                                uint64_t Address,
                                                const void *Decoder) {
@@ -81,7 +84,7 @@ static DecodeStatus DecodeGR128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
 static DecodeStatus DecodeADDR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
                                                  uint64_t Address,
                                                  const void *Decoder) {
-  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, true);
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs);
 }
 
 static DecodeStatus DecodeFP32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
index 37ebff3..e1e64d3 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -124,18 +124,6 @@ void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum,
     O << *MO.getExpr();
 }
 
-void SystemZInstPrinter::printCallOperand(const MCInst *MI, int OpNum,
-                                          raw_ostream &O) {
-  const MCOperand &MO = MI->getOperand(OpNum);
-  if (MO.isImm()) {
-    O << "0x";
-    O.write_hex(MO.getImm());
-  } else {
-    O << *MO.getExpr();
-    O << "@PLT";
-  }
-}
-
 void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum,
                                       raw_ostream &O) {
   printOperand(MI->getOperand(OpNum), O);
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index 30cdee5..734ecf0 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -58,7 +58,6 @@ private:
   void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O);
-  void printCallOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printAccessRegOperand(const MCInst *MI, int OpNum, raw_ostream &O);
 
   // Print the mnemonic for a condition-code mask ("ne", "lh", etc.)
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 027db44..26a8fae 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -35,15 +35,6 @@ static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value) {
   llvm_unreachable("Unknown fixup kind!");
 }
 
-// If Opcode is a relaxable interprocedural reference, return the relaxed form,
-// otherwise return 0.
-static unsigned getRelaxedOpcode(unsigned Opcode) {
-  switch (Opcode) {
-  case SystemZ::BRAS: return SystemZ::BRASL;
-  }
-  return 0;
-}
-
 namespace {
 class SystemZMCAsmBackend : public MCAsmBackend {
   uint8_t OSABI;
@@ -59,14 +50,20 @@ public:
     LLVM_OVERRIDE;
   virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
                           uint64_t Value) const LLVM_OVERRIDE;
-  virtual bool mayNeedRelaxation(const MCInst &Inst) const LLVM_OVERRIDE;
+  virtual bool mayNeedRelaxation(const MCInst &Inst) const LLVM_OVERRIDE {
+    return false;
+  }
   virtual bool fixupNeedsRelaxation(const MCFixup &Fixup,
                                     uint64_t Value,
                                     const MCRelaxableFragment *Fragment,
                                     const MCAsmLayout &Layout) const
-    LLVM_OVERRIDE;
+    LLVM_OVERRIDE {
+    return false;
+  }
   virtual void relaxInstruction(const MCInst &Inst,
-                                MCInst &Res) const LLVM_OVERRIDE;
+                                MCInst &Res) const LLVM_OVERRIDE {
+    llvm_unreachable("SystemZ does do not have assembler relaxation");
+  }
   virtual bool writeNopData(uint64_t Count,
                             MCObjectWriter *OW) const LLVM_OVERRIDE;
   virtual MCObjectWriter *createObjectWriter(raw_ostream &OS) const
@@ -114,28 +111,6 @@ void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
   }
 }
 
-bool SystemZMCAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
-  return getRelaxedOpcode(Inst.getOpcode()) != 0;
-}
-
-bool
-SystemZMCAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
-                                          uint64_t Value,
-                                          const MCRelaxableFragment *Fragment,
-                                          const MCAsmLayout &Layout) const {
-  // At the moment we just need to relax 16-bit fields to wider fields.
-  Value = extractBitsForFixup(Fixup.getKind(), Value);
-  return (int16_t)Value != (int64_t)Value;
-}
-
-void SystemZMCAsmBackend::relaxInstruction(const MCInst &Inst,
-                                           MCInst &Res) const {
-  unsigned Opcode = getRelaxedOpcode(Inst.getOpcode());
-  assert(Opcode && "Unexpected insn to relax");
-  Res = Inst;
-  Res.setOpcode(Opcode);
-}
-
 bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
                                        MCObjectWriter *OW) const {
   for (uint64_t I = 0; I != Count; ++I)
@@ -143,8 +118,9 @@ bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
   return true;
 }
 
-MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T, StringRef TT,
-                                              StringRef CPU) {
+MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T,
+                                              const MCRegisterInfo &MRI,
+                                              StringRef TT, StringRef CPU) {
   uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
   return new SystemZMCAsmBackend(OSABI);
 }
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 9e27aa0..965c41e 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -19,10 +19,8 @@ SystemZMCAsmInfo::SystemZMCAsmInfo(StringRef TT) {
   IsLittleEndian = false;
 
   CommentString = "#";
-  PCSymbol = ".";
   GlobalPrefix = "";
   PrivateGlobalPrefix = ".L";
-  WeakRefDirective = "\t.weak\t";
   ZeroDirective = "\t.space\t";
   Data64bitsDirective = "\t.quad\t";
   UsesELFSectionDirectiveForBSS = true;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index d440787..b9ac92a 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -10,13 +10,13 @@
 #ifndef SystemZTARGETASMINFO_H
 #define SystemZTARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
 class StringRef;
 
-class SystemZMCAsmInfo : public MCAsmInfo {
+class SystemZMCAsmInfo : public MCAsmInfoELF {
 public:
   explicit SystemZMCAsmInfo(StringRef TT);
 
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index bda7714..f07ea7b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -79,14 +79,6 @@ private:
                               SmallVectorImpl<MCFixup> &Fixups) const {
     return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC32DBL, 2);
   }
-  uint64_t getPLT16DBLEncoding(const MCInst &MI, unsigned OpNum,
-                               SmallVectorImpl<MCFixup> &Fixups) const {
-    return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PLT16DBL, 2);
-  }
-  uint64_t getPLT32DBLEncoding(const MCInst &MI, unsigned OpNum,
-                               SmallVectorImpl<MCFixup> &Fixups) const {
-    return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PLT32DBL, 2);
-  }
 };
 }
 
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 3653192..9e1296b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -28,10 +28,17 @@
 using namespace llvm;
 
 const unsigned SystemZMC::GR32Regs[16] = {
-  SystemZ::R0W, SystemZ::R1W, SystemZ::R2W, SystemZ::R3W,
-  SystemZ::R4W, SystemZ::R5W, SystemZ::R6W, SystemZ::R7W,
-  SystemZ::R8W, SystemZ::R9W, SystemZ::R10W, SystemZ::R11W,
-  SystemZ::R12W, SystemZ::R13W, SystemZ::R14W, SystemZ::R15W
+  SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L,
+  SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L,
+  SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L,
+  SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L
+};
+
+const unsigned SystemZMC::GRH32Regs[16] = {
+  SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H,
+  SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H,
+  SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H,
+  SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H
 };
 
 const unsigned SystemZMC::GR64Regs[16] = {
@@ -69,6 +76,24 @@ const unsigned SystemZMC::FP128Regs[16] = {
   SystemZ::F12Q, SystemZ::F13Q, 0, 0
 };
 
+unsigned SystemZMC::getFirstReg(unsigned Reg) {
+  static unsigned Map[SystemZ::NUM_TARGET_REGS];
+  static bool Initialized = false;
+  if (!Initialized) {
+    for (unsigned I = 0; I < 16; ++I) {
+      Map[GR32Regs[I]] = I;
+      Map[GRH32Regs[I]] = I;
+      Map[GR64Regs[I]] = I;
+      Map[GR128Regs[I]] = I;
+      Map[FP32Regs[I]] = I;
+      Map[FP64Regs[I]] = I;
+      Map[FP128Regs[I]] = I;
+    }
+  }
+  assert(Reg < SystemZ::NUM_TARGET_REGS);
+  return Map[Reg];
+}
+
 static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
                                          StringRef TT) {
   MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
@@ -162,7 +187,7 @@ static MCStreamer *createSystemZMCObjectStreamer(const Target &T, StringRef TT,
                                                  MCCodeEmitter *Emitter,
                                                  bool RelaxAll,
                                                  bool NoExecStack) {
-  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+  return createELFStreamer(Ctx, 0, MAB, OS, Emitter, RelaxAll, NoExecStack);
 }
 
 extern "C" void LLVMInitializeSystemZTargetMC() {
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 3c9f0cb..97e325b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -42,11 +42,31 @@ namespace SystemZMC {
   // as %r0-%r15.  It seems better to provide the same interface for
   // all classes though.
   extern const unsigned GR32Regs[16];
+  extern const unsigned GRH32Regs[16];
   extern const unsigned GR64Regs[16];
   extern const unsigned GR128Regs[16];
   extern const unsigned FP32Regs[16];
   extern const unsigned FP64Regs[16];
   extern const unsigned FP128Regs[16];
+
+  // Return the 0-based number of the first architectural register that
+  // contains the given LLVM register.   E.g. R1D -> 1.
+  unsigned getFirstReg(unsigned Reg);
+
+  // Return the given register as a GR64.
+  inline unsigned getRegAsGR64(unsigned Reg) {
+    return GR64Regs[getFirstReg(Reg)];
+  }
+
+  // Return the given register as a low GR32.
+  inline unsigned getRegAsGR32(unsigned Reg) {
+    return GR32Regs[getFirstReg(Reg)];
+  }
+
+  // Return the given register as a high GR32.
+  inline unsigned getRegAsGRH32(unsigned Reg) {
+    return GRH32Regs[getFirstReg(Reg)];
+  }
 }
 
 MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
@@ -54,8 +74,9 @@ MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
                                           const MCSubtargetInfo &STI,
                                           MCContext &Ctx);
 
-MCAsmBackend *createSystemZMCAsmBackend(const Target &T, StringRef TT,
-                                        StringRef CPU);
+MCAsmBackend *createSystemZMCAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        StringRef TT, StringRef CPU);
 
 MCObjectWriter *createSystemZObjectWriter(raw_ostream &OS, uint8_t OSABI);
 } // end namespace llvm
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index 2782b63..afa6cf0 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -35,19 +35,11 @@ performance measurements.
 
 --
 
-We don't support tail calls at present.
-
---
-
-We don't support prefetching yet.
-
---
-
 There is no scheduling support.
 
 --
 
-We don't use the BRANCH ON COUNT or BRANCH ON INDEX families of instruction.
+We don't use the BRANCH ON INDEX instructions.
 
 --
 
@@ -56,18 +48,7 @@ and conditional returns.
 
 --
 
-We don't use the condition code results of anything except comparisons.
-
-Implementing this may need something more finely grained than the z_cmp
-and z_ucmp that we have now.  It might (or might not) also be useful to
-have a mask of "don't care" values in conditional branches.  For example,
-integer comparisons never set CC to 3, so the bottom bit of the CC mask
-isn't particularly relevant.  JNLH and JE are equally good for testing
-equality after an integer comparison, etc.
-
---
-
-We don't use the LOAD AND TEST or TEST DATA CLASS instructions.
+We don't use the TEST DATA CLASS instructions.
 
 --
 
@@ -77,20 +58,16 @@ condition codes.  For example, we could use LCDFR instead of LCDBR.
 
 --
 
-We don't optimize block memory operations.
+We only use MVC, XC and CLC for constant-length block operations.
+We could extend them to variable-length operations too,
+using EXECUTE RELATIVE LONG.
 
-It's definitely worth using things like MVC, CLC, NC, XC and OC with
-constant lengths.  MVCIN may be worthwhile too.
-
-We should probably implement things like memcpy using MVC with EXECUTE.
-Likewise memcmp and CLC.  MVCLE and CLCLE could be useful too.
+MVCIN, MVCLE and CLCLE may be worthwhile too.
 
 --
 
-We don't optimize string operations.
-
-MVST, CLST, SRST and CUSE could be useful here.  Some of the TRANSLATE
-family might be too, although they are probably more difficult to exploit.
+We don't use CUSE or the TRANSLATE family of instructions for string
+operations.  The TRANSLATE ones are probably more difficult to exploit.
 
 --
 
@@ -113,14 +90,7 @@ We don't use the halfword forms of LOAD REVERSED and STORE REVERSED
 
 --
 
-We could take advantage of the various ... UNDER MASK instructions,
-such as ICM and STCM.
-
---
-
-DAGCombiner can detect integer absolute, but there's not yet an associated
-ISD opcode.  We could add one and implement it using LOAD POSITIVE.
-Negated absolutes could use LOAD NEGATIVE.
+We don't use ICM or STCM.
 
 --
 
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index eccc2aa..dcebbad 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -52,6 +52,29 @@ namespace llvm {
     const unsigned CCMASK_CS_NE = CCMASK_1;
     const unsigned CCMASK_CS    = CCMASK_0 | CCMASK_1;
 
+    // Condition-code mask assignments for a completed SRST loop.
+    const unsigned CCMASK_SRST_FOUND    = CCMASK_1;
+    const unsigned CCMASK_SRST_NOTFOUND = CCMASK_2;
+    const unsigned CCMASK_SRST          = CCMASK_1 | CCMASK_2;
+
+    // Condition-code mask assignments for TEST UNDER MASK.
+    const unsigned CCMASK_TM_ALL_0       = CCMASK_0;
+    const unsigned CCMASK_TM_MIXED_MSB_0 = CCMASK_1;
+    const unsigned CCMASK_TM_MIXED_MSB_1 = CCMASK_2;
+    const unsigned CCMASK_TM_ALL_1       = CCMASK_3;
+    const unsigned CCMASK_TM_SOME_0      = CCMASK_TM_ALL_1 ^ CCMASK_ANY;
+    const unsigned CCMASK_TM_SOME_1      = CCMASK_TM_ALL_0 ^ CCMASK_ANY;
+    const unsigned CCMASK_TM_MSB_0       = CCMASK_0 | CCMASK_1;
+    const unsigned CCMASK_TM_MSB_1       = CCMASK_2 | CCMASK_3;
+    const unsigned CCMASK_TM             = CCMASK_ANY;
+
+    // The position of the low CC bit in an IPM result.
+    const unsigned IPM_CC = 28;
+
+    // Mask assignments for PFD.
+    const unsigned PFD_READ  = 1;
+    const unsigned PFD_WRITE = 2;
+
     // Return true if Val fits an LLILL operand.
     static inline bool isImmLL(uint64_t Val) {
       return (Val & ~0x000000000000ffffULL) == 0;
@@ -86,6 +109,7 @@ namespace llvm {
   FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
                                      CodeGenOpt::Level OptLevel);
   FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
+  FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
   FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
 } // end namespace llvm;
 #endif
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 3a57ea0..75cbda4 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -19,16 +19,142 @@
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstBuilder.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/Mangler.h"
 
 using namespace llvm;
 
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GR32s.
+static MCInst lowerRILow(const MachineInstr *MI, unsigned Opcode) {
+  if (MI->isCompare())
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(1).getImm());
+  else
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(1).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GRH32s.
+static MCInst lowerRIHigh(const MachineInstr *MI, unsigned Opcode) {
+  if (MI->isCompare())
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(1).getImm());
+  else
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(1).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// R2 register turned into a GR64.
+static MCInst lowerRIEfLow(const MachineInstr *MI, unsigned Opcode) {
+  return MCInstBuilder(Opcode)
+    .addReg(MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg())
+    .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()))
+    .addImm(MI->getOperand(3).getImm())
+    .addImm(MI->getOperand(4).getImm())
+    .addImm(MI->getOperand(5).getImm());
+}
+
 void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  SystemZMCInstLower Lower(Mang, MF->getContext(), *this);
+  SystemZMCInstLower Lower(MF->getContext(), *this);
   MCInst LoweredMI;
-  Lower.lower(MI, LoweredMI);
+  switch (MI->getOpcode()) {
+  case SystemZ::Return:
+    LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D);
+    break;
+
+  case SystemZ::CallBRASL:
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+      .addReg(SystemZ::R14D)
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+    break;
+
+  case SystemZ::CallBASR:
+    LoweredMI = MCInstBuilder(SystemZ::BASR)
+      .addReg(SystemZ::R14D)
+      .addReg(MI->getOperand(0).getReg());
+    break;
+
+  case SystemZ::CallJG:
+    LoweredMI = MCInstBuilder(SystemZ::JG)
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+    break;
+
+  case SystemZ::CallBR:
+    LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D);
+    break;
+
+  case SystemZ::IILF64:
+    LoweredMI = MCInstBuilder(SystemZ::IILF)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+    break;
+
+  case SystemZ::IIHF64:
+    LoweredMI = MCInstBuilder(SystemZ::IIHF)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+    break;
+
+  case SystemZ::RISBHH:
+  case SystemZ::RISBHL:
+    LoweredMI = lowerRIEfLow(MI, SystemZ::RISBHG);
+    break;
+
+  case SystemZ::RISBLH:
+  case SystemZ::RISBLL:
+    LoweredMI = lowerRIEfLow(MI, SystemZ::RISBLG);
+    break;
+
+#define LOWER_LOW(NAME)                                                 \
+  case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break
+
+  LOWER_LOW(IILL);
+  LOWER_LOW(IILH);
+  LOWER_LOW(TMLL);
+  LOWER_LOW(TMLH);
+  LOWER_LOW(NILL);
+  LOWER_LOW(NILH);
+  LOWER_LOW(NILF);
+  LOWER_LOW(OILL);
+  LOWER_LOW(OILH);
+  LOWER_LOW(OILF);
+  LOWER_LOW(XILF);
+
+#undef LOWER_LOW
+
+#define LOWER_HIGH(NAME) \
+  case SystemZ::NAME##64: LoweredMI = lowerRIHigh(MI, SystemZ::NAME); break
+
+  LOWER_HIGH(IIHL);
+  LOWER_HIGH(IIHH);
+  LOWER_HIGH(TMHL);
+  LOWER_HIGH(TMHH);
+  LOWER_HIGH(NIHL);
+  LOWER_HIGH(NIHH);
+  LOWER_HIGH(NIHF);
+  LOWER_HIGH(OIHL);
+  LOWER_HIGH(OIHH);
+  LOWER_HIGH(OIHF);
+  LOWER_HIGH(XIHF);
+
+#undef LOWER_HIGH
+
+  default:
+    Lower.lower(MI, LoweredMI);
+    break;
+  }
   OutStreamer.EmitInstruction(LoweredMI);
 }
 
@@ -48,7 +174,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
     static_cast<SystemZConstantPoolValue*>(MCPV);
 
   const MCExpr *Expr =
-    MCSymbolRefExpr::Create(Mang->getSymbol(ZCPV->getGlobalValue()),
+    MCSymbolRefExpr::Create(getSymbol(ZCPV->getGlobalValue()),
                             getModifierVariantKind(ZCPV->getModifier()),
                             OutContext);
   uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType());
@@ -66,7 +192,7 @@ bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
       return true;
     OS << -int64_t(MI->getOperand(OpNo).getImm());
   } else {
-    SystemZMCInstLower Lower(Mang, MF->getContext(), *this);
+    SystemZMCInstLower Lower(MF->getContext(), *this);
     MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
     SystemZInstPrinter::printOperand(MO, OS);
   }
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
index c2d727f..c4f641e 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -23,7 +23,7 @@ def RetCC_SystemZ : CallingConv<[
   // call-clobbered argument registers available for code that doesn't
   // care about the ABI.  (R6 is an argument register too, but is
   // call-saved and therefore not suitable for return values.)
-  CCIfType<[i32], CCAssignToReg<[R2W, R3W, R4W, R5W]>>,
+  CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L]>>,
   CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D]>>,
 
   // ABI-complaint code returns float and double in F0.  Make the
@@ -53,7 +53,7 @@ def CC_SystemZ : CallingConv<[
 
   // The first 5 integer arguments are passed in R2-R6.  Note that R6
   // is call-saved.
-  CCIfType<[i32], CCAssignToReg<[R2W, R3W, R4W, R5W, R6W]>>,
+  CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L, R6L]>>,
   CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
 
   // The first 4 float and double arguments are passed in even registers F0-F6.
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index e9c4f6d..6c70811 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -39,7 +39,7 @@ unsigned SystemZConstantPoolValue::getRelocationInfo() const {
 int SystemZConstantPoolValue::
 getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) {
   unsigned AlignMask = Alignment - 1;
-  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+  const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
   for (unsigned I = 0, E = Constants.size(); I != E; ++I) {
     if (Constants[I].isMachineConstantPoolEntry() &&
         (Constants[I].getAlignment() & AlignMask) == 0) {
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index a58da90..acfb491 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -111,7 +111,7 @@ static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
                         const SystemZTargetMachine &TM,
                         unsigned GPR64, bool IsImplicit) {
   const SystemZRegisterInfo *RI = TM.getRegisterInfo();
-  unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_32bit);
+  unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
   bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
   if (!IsLive || !IsImplicit) {
     MIB.addReg(GPR64, getImplRegState(IsImplicit) | getKillRegState(!IsLive));
@@ -420,8 +420,7 @@ void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
   SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
 
   // Skip the return instruction.
-  assert(MBBI->getOpcode() == SystemZ::RET &&
-         "Can only insert epilogue into returning blocks");
+  assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks");
 
   uint64_t StackSize = getAllocatedStackSize(MF);
   if (ZFI->getLowSavedGPR()) {
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index d9794b1..f4a2773 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -107,7 +107,8 @@ static uint64_t allOnes(unsigned int Count) {
 //
 //   (and (rotl Input, Rotate), Mask)
 //
-// otherwise.  The value has BitSize bits.
+// otherwise.  The output value has BitSize bits, although Input may be
+// narrower (in which case the upper bits are don't care).
 struct RxSBGOperands {
   RxSBGOperands(unsigned Op, SDValue N)
     : Opcode(Op), BitSize(N.getValueType().getSizeInBits()),
@@ -128,7 +129,7 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   const SystemZSubtarget &Subtarget;
 
   // Used by SystemZOperands.td to create integer constants.
-  inline SDValue getImm(const SDNode *Node, uint64_t Imm) {
+  inline SDValue getImm(const SDNode *Node, uint64_t Imm) const {
     return CurDAG->getTargetConstant(Imm, Node->getValueType(0));
   }
 
@@ -142,33 +143,39 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
 
   // Try to fold more of the base or index of AM into AM, where IsBase
   // selects between the base and index.
-  bool expandAddress(SystemZAddressingMode &AM, bool IsBase);
+  bool expandAddress(SystemZAddressingMode &AM, bool IsBase) const;
 
   // Try to describe N in AM, returning true on success.
-  bool selectAddress(SDValue N, SystemZAddressingMode &AM);
+  bool selectAddress(SDValue N, SystemZAddressingMode &AM) const;
 
   // Extract individual target operands from matched address AM.
   void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
-                          SDValue &Base, SDValue &Disp);
+                          SDValue &Base, SDValue &Disp) const;
   void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
-                          SDValue &Base, SDValue &Disp, SDValue &Index);
+                          SDValue &Base, SDValue &Disp, SDValue &Index) const;
 
   // Try to match Addr as a FormBD address with displacement type DR.
   // Return true on success, storing the base and displacement in
   // Base and Disp respectively.
   bool selectBDAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
-                    SDValue &Base, SDValue &Disp);
+                    SDValue &Base, SDValue &Disp) const;
+
+  // Try to match Addr as a FormBDX address with displacement type DR.
+  // Return true on success and if the result had no index.  Store the
+  // base and displacement in Base and Disp respectively.
+  bool selectMVIAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
+                     SDValue &Base, SDValue &Disp) const;
 
   // Try to match Addr as a FormBDX* address of form Form with
   // displacement type DR.  Return true on success, storing the base,
   // displacement and index in Base, Disp and Index respectively.
   bool selectBDXAddr(SystemZAddressingMode::AddrForm Form,
                      SystemZAddressingMode::DispRange DR, SDValue Addr,
-                     SDValue &Base, SDValue &Disp, SDValue &Index);
+                     SDValue &Base, SDValue &Disp, SDValue &Index) const;
 
   // PC-relative address matching routines used by SystemZOperands.td.
-  bool selectPCRelAddress(SDValue Addr, SDValue &Target) {
-    if (Addr.getOpcode() == SystemZISD::PCREL_WRAPPER) {
+  bool selectPCRelAddress(SDValue Addr, SDValue &Target) const {
+    if (SystemZISD::isPCREL(Addr.getOpcode())) {
       Target = Addr.getOperand(0);
       return true;
     }
@@ -176,64 +183,72 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   }
 
   // BD matching routines used by SystemZOperands.td.
-  bool selectBDAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp) {
+  bool selectBDAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
     return selectBDAddr(SystemZAddressingMode::Disp12Only, Addr, Base, Disp);
   }
-  bool selectBDAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) {
+  bool selectBDAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
     return selectBDAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
   }
-  bool selectBDAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp) {
+  bool selectBDAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
     return selectBDAddr(SystemZAddressingMode::Disp20Only, Addr, Base, Disp);
   }
-  bool selectBDAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) {
+  bool selectBDAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
     return selectBDAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
   }
 
+  // MVI matching routines used by SystemZOperands.td.
+  bool selectMVIAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectMVIAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
+  }
+  bool selectMVIAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectMVIAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
+  }
+
   // BDX matching routines used by SystemZOperands.td.
   bool selectBDXAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
-                           SDValue &Index) {
+                           SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
                          SystemZAddressingMode::Disp12Only,
                          Addr, Base, Disp, Index);
   }
   bool selectBDXAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
-                           SDValue &Index) {
+                           SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
                          SystemZAddressingMode::Disp12Pair,
                          Addr, Base, Disp, Index);
   }
   bool selectDynAlloc12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
-                            SDValue &Index) {
+                            SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXDynAlloc,
                          SystemZAddressingMode::Disp12Only,
                          Addr, Base, Disp, Index);
   }
   bool selectBDXAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp,
-                           SDValue &Index) {
+                           SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
                          SystemZAddressingMode::Disp20Only,
                          Addr, Base, Disp, Index);
   }
   bool selectBDXAddr20Only128(SDValue Addr, SDValue &Base, SDValue &Disp,
-                              SDValue &Index) {
+                              SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
                          SystemZAddressingMode::Disp20Only128,
                          Addr, Base, Disp, Index);
   }
   bool selectBDXAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
-                           SDValue &Index) {
+                           SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
                          SystemZAddressingMode::Disp20Pair,
                          Addr, Base, Disp, Index);
   }
   bool selectLAAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
-                          SDValue &Index) {
+                          SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
                          SystemZAddressingMode::Disp12Pair,
                          Addr, Base, Disp, Index);
   }
   bool selectLAAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
-                          SDValue &Index) {
+                          SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
                          SystemZAddressingMode::Disp20Pair,
                          Addr, Base, Disp, Index);
@@ -242,21 +257,21 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   // Check whether (or Op (and X InsertMask)) is effectively an insertion
   // of X into bits InsertMask of some Y != Op.  Return true if so and
   // set Op to that Y.
-  bool detectOrAndInsertion(SDValue &Op, uint64_t InsertMask);
+  bool detectOrAndInsertion(SDValue &Op, uint64_t InsertMask) const;
 
   // Try to update RxSBG so that only the bits of RxSBG.Input in Mask are used.
   // Return true on success.
-  bool refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask);
+  bool refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask) const;
 
   // Try to fold some of RxSBG.Input into other fields of RxSBG.
   // Return true on success.
-  bool expandRxSBG(RxSBGOperands &RxSBG);
+  bool expandRxSBG(RxSBGOperands &RxSBG) const;
 
-  // Return an undefined i64 value.
-  SDValue getUNDEF64(SDLoc DL);
+  // Return an undefined value of type VT.
+  SDValue getUNDEF(SDLoc DL, EVT VT) const;
 
   // Convert N to VT, if it isn't already.
-  SDValue convertTo(SDLoc DL, EVT VT, SDValue N);
+  SDValue convertTo(SDLoc DL, EVT VT, SDValue N) const;
 
   // Try to implement AND or shift node N using RISBG with the zero flag set.
   // Return the selected node on success, otherwise return null.
@@ -276,8 +291,26 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   SDNode *splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
                               uint64_t UpperVal, uint64_t LowerVal);
 
+  // Return true if Load and Store are loads and stores of the same size
+  // and are guaranteed not to overlap.  Such operations can be implemented
+  // using block (SS-format) instructions.
+  //
+  // Partial overlap would lead to incorrect code, since the block operations
+  // are logically bytewise, even though they have a fast path for the
+  // non-overlapping case.  We also need to avoid full overlap (i.e. two
+  // addresses that might be equal at run time) because although that case
+  // would be handled correctly, it might be implemented by millicode.
+  bool canUseBlockOperation(StoreSDNode *Store, LoadSDNode *Load) const;
+
+  // N is a (store (load Y), X) pattern.  Return true if it can use an MVC
+  // from Y to X.
   bool storeLoadCanUseMVC(SDNode *N) const;
 
+  // N is a (store (op (load A[0]), (load A[1])), X) pattern.  Return true
+  // if A[1 - I] == X and if N can use a block operation like NC from A[I]
+  // to X.
+  bool storeLoadCanUseBlockBinary(SDNode *N, unsigned I) const;
+
 public:
   SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(TM, OptLevel),
@@ -363,9 +396,9 @@ static bool expandIndex(SystemZAddressingMode &AM, SDValue Base,
 // The base or index of AM is equivalent to Op0 + Op1, where IsBase selects
 // between the base and index.  Try to fold Op1 into AM's displacement.
 static bool expandDisp(SystemZAddressingMode &AM, bool IsBase,
-                       SDValue Op0, ConstantSDNode *Op1) {
+                       SDValue Op0, uint64_t Op1) {
   // First try adjusting the displacement.
-  int64_t TestDisp = AM.Disp + Op1->getSExtValue();
+  int64_t TestDisp = AM.Disp + Op1;
   if (selectDisp(AM.DR, TestDisp)) {
     changeComponent(AM, IsBase, Op0);
     AM.Disp = TestDisp;
@@ -378,7 +411,7 @@ static bool expandDisp(SystemZAddressingMode &AM, bool IsBase,
 }
 
 bool SystemZDAGToDAGISel::expandAddress(SystemZAddressingMode &AM,
-                                        bool IsBase) {
+                                        bool IsBase) const {
   SDValue N = IsBase ? AM.Base : AM.Index;
   unsigned Opcode = N.getOpcode();
   if (Opcode == ISD::TRUNCATE) {
@@ -398,13 +431,23 @@ bool SystemZDAGToDAGISel::expandAddress(SystemZAddressingMode &AM,
       return expandAdjDynAlloc(AM, IsBase, Op0);
 
     if (Op0Code == ISD::Constant)
-      return expandDisp(AM, IsBase, Op1, cast<ConstantSDNode>(Op0));
+      return expandDisp(AM, IsBase, Op1,
+                        cast<ConstantSDNode>(Op0)->getSExtValue());
     if (Op1Code == ISD::Constant)
-      return expandDisp(AM, IsBase, Op0, cast<ConstantSDNode>(Op1));
+      return expandDisp(AM, IsBase, Op0,
+                        cast<ConstantSDNode>(Op1)->getSExtValue());
 
     if (IsBase && expandIndex(AM, Op0, Op1))
       return true;
   }
+  if (Opcode == SystemZISD::PCREL_OFFSET) {
+    SDValue Full = N.getOperand(0);
+    SDValue Base = N.getOperand(1);
+    SDValue Anchor = Base.getOperand(0);
+    uint64_t Offset = (cast<GlobalAddressSDNode>(Full)->getOffset() -
+                       cast<GlobalAddressSDNode>(Anchor)->getOffset());
+    return expandDisp(AM, IsBase, Base, Offset);
+  }
   return false;
 }
 
@@ -483,14 +526,15 @@ static bool shouldUseLA(SDNode *Base, int64_t Disp, SDNode *Index) {
 
 // Return true if Addr is suitable for AM, updating AM if so.
 bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
-                                        SystemZAddressingMode &AM) {
+                                        SystemZAddressingMode &AM) const {
   // Start out assuming that the address will need to be loaded separately,
   // then try to extend it as much as we can.
   AM.Base = Addr;
 
   // First try treating the address as a constant.
   if (Addr.getOpcode() == ISD::Constant &&
-      expandDisp(AM, true, SDValue(), cast<ConstantSDNode>(Addr)))
+      expandDisp(AM, true, SDValue(),
+                 cast<ConstantSDNode>(Addr)->getSExtValue()))
     ;
   else
     // Otherwise try expanding each component.
@@ -530,7 +574,7 @@ static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
 
 void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
                                              EVT VT, SDValue &Base,
-                                             SDValue &Disp) {
+                                             SDValue &Disp) const {
   Base = AM.Base;
   if (!Base.getNode())
     // Register 0 means "no base".  This is mostly useful for shifts.
@@ -555,7 +599,8 @@ void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
 
 void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
                                              EVT VT, SDValue &Base,
-                                             SDValue &Disp, SDValue &Index) {
+                                             SDValue &Disp,
+                                             SDValue &Index) const {
   getAddressOperands(AM, VT, Base, Disp);
 
   Index = AM.Index;
@@ -566,7 +611,7 @@ void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
 
 bool SystemZDAGToDAGISel::selectBDAddr(SystemZAddressingMode::DispRange DR,
                                        SDValue Addr, SDValue &Base,
-                                       SDValue &Disp) {
+                                       SDValue &Disp) const {
   SystemZAddressingMode AM(SystemZAddressingMode::FormBD, DR);
   if (!selectAddress(Addr, AM))
     return false;
@@ -575,10 +620,21 @@ bool SystemZDAGToDAGISel::selectBDAddr(SystemZAddressingMode::DispRange DR,
   return true;
 }
 
+bool SystemZDAGToDAGISel::selectMVIAddr(SystemZAddressingMode::DispRange DR,
+                                        SDValue Addr, SDValue &Base,
+                                        SDValue &Disp) const {
+  SystemZAddressingMode AM(SystemZAddressingMode::FormBDXNormal, DR);
+  if (!selectAddress(Addr, AM) || AM.Index.getNode())
+    return false;
+
+  getAddressOperands(AM, Addr.getValueType(), Base, Disp);
+  return true;
+}
+
 bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
                                         SystemZAddressingMode::DispRange DR,
                                         SDValue Addr, SDValue &Base,
-                                        SDValue &Disp, SDValue &Index) {
+                                        SDValue &Disp, SDValue &Index) const {
   SystemZAddressingMode AM(Form, DR);
   if (!selectAddress(Addr, AM))
     return false;
@@ -588,7 +644,7 @@ bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
 }
 
 bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
-                                               uint64_t InsertMask) {
+                                               uint64_t InsertMask) const {
   // We're only interested in cases where the insertion is into some operand
   // of Op, rather than into Op itself.  The only useful case is an AND.
   if (Op.getOpcode() != ISD::AND)
@@ -619,7 +675,8 @@ bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
   return true;
 }
 
-bool SystemZDAGToDAGISel::refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask) {
+bool SystemZDAGToDAGISel::refineRxSBGMask(RxSBGOperands &RxSBG,
+                                          uint64_t Mask) const {
   const SystemZInstrInfo *TII = getInstrInfo();
   if (RxSBG.Rotate != 0)
     Mask = (Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate));
@@ -631,26 +688,15 @@ bool SystemZDAGToDAGISel::refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask) {
   return false;
 }
 
-// RxSBG.Input is a shift of Count bits in the direction given by IsLeft.
-// Return true if the result depends on the signs or zeros that are
-// shifted in.
-static bool shiftedInBitsMatter(RxSBGOperands &RxSBG, uint64_t Count,
-                                bool IsLeft) {
-  // Work out which bits of the shift result are zeros or sign copies.
-  uint64_t ShiftedIn = allOnes(Count);
-  if (!IsLeft)
-    ShiftedIn <<= RxSBG.BitSize - Count;
-
-  // Rotate that mask in the same way as RxSBG.Input is rotated.
+// Return true if any bits of (RxSBG.Input & Mask) are significant.
+static bool maskMatters(RxSBGOperands &RxSBG, uint64_t Mask) {
+  // Rotate the mask in the same way as RxSBG.Input is rotated.
   if (RxSBG.Rotate != 0)
-    ShiftedIn = ((ShiftedIn << RxSBG.Rotate) |
-                 (ShiftedIn >> (64 - RxSBG.Rotate)));
-
-  // Fail if any of the zero or sign bits are used.
-  return (ShiftedIn & RxSBG.Mask) != 0;
+    Mask = ((Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate)));
+  return (Mask & RxSBG.Mask) != 0;
 }
 
-bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
+bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
   SDValue N = RxSBG.Input;
   unsigned Opcode = N.getOpcode();
   switch (Opcode) {
@@ -706,7 +752,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
 
   case ISD::ROTL: {
     // Any 64-bit rotate left can be merged into the RxSBG.
-    if (RxSBG.BitSize != 64)
+    if (RxSBG.BitSize != 64 || N.getValueType() != MVT::i64)
       return false;
     ConstantSDNode *CountNode
       = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
@@ -718,6 +764,19 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
     return true;
   }
       
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: {
+    // Check that the extension bits are don't-care (i.e. are masked out
+    // by the final mask).
+    unsigned InnerBitSize = N.getOperand(0).getValueType().getSizeInBits();
+    if (maskMatters(RxSBG, allOnes(RxSBG.BitSize) - allOnes(InnerBitSize)))
+      return false;
+
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+
   case ISD::SHL: {
     ConstantSDNode *CountNode =
       dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
@@ -725,17 +784,18 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
       return false;
 
     uint64_t Count = CountNode->getZExtValue();
-    if (Count < 1 || Count >= RxSBG.BitSize)
+    unsigned BitSize = N.getValueType().getSizeInBits();
+    if (Count < 1 || Count >= BitSize)
       return false;
 
     if (RxSBG.Opcode == SystemZ::RNSBG) {
       // Treat (shl X, count) as (rotl X, size-count) as long as the bottom
       // count bits from RxSBG.Input are ignored.
-      if (shiftedInBitsMatter(RxSBG, Count, true))
+      if (maskMatters(RxSBG, allOnes(Count)))
         return false;
     } else {
       // Treat (shl X, count) as (and (rotl X, count), ~0<<count).
-      if (!refineRxSBGMask(RxSBG, allOnes(RxSBG.BitSize - Count) << Count))
+      if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count) << Count))
         return false;
     }
 
@@ -752,18 +812,19 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
       return false;
 
     uint64_t Count = CountNode->getZExtValue();
-    if (Count < 1 || Count >= RxSBG.BitSize)
+    unsigned BitSize = N.getValueType().getSizeInBits();
+    if (Count < 1 || Count >= BitSize)
       return false;
 
     if (RxSBG.Opcode == SystemZ::RNSBG || Opcode == ISD::SRA) {
       // Treat (srl|sra X, count) as (rotl X, size-count) as long as the top
       // count bits from RxSBG.Input are ignored.
-      if (shiftedInBitsMatter(RxSBG, Count, false))
+      if (maskMatters(RxSBG, allOnes(Count) << (BitSize - Count)))
         return false;
     } else {
       // Treat (srl X, count), mask) as (and (rotl X, size-count), ~0>>count),
       // which is similar to SLL above.
-      if (!refineRxSBGMask(RxSBG, allOnes(RxSBG.BitSize - Count)))
+      if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count)))
         return false;
     }
 
@@ -776,24 +837,17 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
   }
 }
 
-SDValue SystemZDAGToDAGISel::getUNDEF64(SDLoc DL) {
-  SDNode *N = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64);
+SDValue SystemZDAGToDAGISel::getUNDEF(SDLoc DL, EVT VT) const {
+  SDNode *N = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
   return SDValue(N, 0);
 }
 
-SDValue SystemZDAGToDAGISel::convertTo(SDLoc DL, EVT VT, SDValue N) {
-  if (N.getValueType() == MVT::i32 && VT == MVT::i64) {
-    SDValue Index = CurDAG->getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
-    SDNode *Insert = CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG,
-                                            DL, VT, getUNDEF64(DL), N, Index);
-    return SDValue(Insert, 0);
-  }
-  if (N.getValueType() == MVT::i64 && VT == MVT::i32) {
-    SDValue Index = CurDAG->getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
-    SDNode *Extract = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                             DL, VT, N, Index);
-    return SDValue(Extract, 0);
-  }
+SDValue SystemZDAGToDAGISel::convertTo(SDLoc DL, EVT VT, SDValue N) const {
+  if (N.getValueType() == MVT::i32 && VT == MVT::i64)
+    return CurDAG->getTargetInsertSubreg(SystemZ::subreg_l32,
+                                         DL, VT, getUNDEF(DL, MVT::i64), N);
+  if (N.getValueType() == MVT::i64 && VT == MVT::i32)
+    return CurDAG->getTargetExtractSubreg(SystemZ::subreg_l32, DL, VT, N);
   assert(N.getValueType() == VT && "Unexpected value types");
   return N;
 }
@@ -803,7 +857,8 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
   RxSBGOperands RISBG(SystemZ::RISBG, SDValue(N, 0));
   unsigned Count = 0;
   while (expandRxSBG(RISBG))
-    Count += 1;
+    if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND)
+      Count += 1;
   if (Count == 0)
     return 0;
   if (Count == 1) {
@@ -831,14 +886,22 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
     }
   }  
 
+  unsigned Opcode = SystemZ::RISBG;
+  EVT OpcodeVT = MVT::i64;
+  if (VT == MVT::i32 && Subtarget.hasHighWord()) {
+    Opcode = SystemZ::RISBMux;
+    OpcodeVT = MVT::i32;
+    RISBG.Start &= 31;
+    RISBG.End &= 31;
+  }
   SDValue Ops[5] = {
-    getUNDEF64(SDLoc(N)),
-    convertTo(SDLoc(N), MVT::i64, RISBG.Input),
+    getUNDEF(SDLoc(N), OpcodeVT),
+    convertTo(SDLoc(N), OpcodeVT, RISBG.Input),
     CurDAG->getTargetConstant(RISBG.Start, MVT::i32),
     CurDAG->getTargetConstant(RISBG.End | 128, MVT::i32),
     CurDAG->getTargetConstant(RISBG.Rotate, MVT::i32)
   };
-  N = CurDAG->getMachineNode(SystemZ::RISBG, SDLoc(N), MVT::i64, Ops);
+  N = CurDAG->getMachineNode(Opcode, SDLoc(N), OpcodeVT, Ops);
   return convertTo(SDLoc(N), VT, SDValue(N, 0)).getNode();
 }
 
@@ -852,7 +915,8 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
   unsigned Count[] = { 0, 0 };
   for (unsigned I = 0; I < 2; ++I)
     while (expandRxSBG(RxSBG[I]))
-      Count[I] += 1;
+      if (RxSBG[I].Input.getOpcode() != ISD::ANY_EXTEND)
+        Count[I] += 1;
 
   // Do nothing if neither operand is suitable.
   if (Count[0] == 0 && Count[1] == 0)
@@ -900,49 +964,64 @@ SDNode *SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
   return Or.getNode();
 }
 
-// N is a (store (load ...), ...) pattern.  Return true if it can use MVC.
-bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
-  StoreSDNode *Store = cast<StoreSDNode>(N);
-  LoadSDNode *Load = cast<LoadSDNode>(Store->getValue().getNode());
+bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
+                                               LoadSDNode *Load) const {
+  // Check that the two memory operands have the same size.
+  if (Load->getMemoryVT() != Store->getMemoryVT())
+    return false;
 
-  // MVC is logically a bytewise copy, so can't be used for volatile accesses.
+  // Volatility stops an access from being decomposed.
   if (Load->isVolatile() || Store->isVolatile())
     return false;
 
-  // Prefer not to use MVC if either address can use ... RELATIVE LONG
-  // instructions.
-  assert(Load->getMemoryVT() == Store->getMemoryVT() &&
-         "Should already have checked that the types match");
-  uint64_t Size = Load->getMemoryVT().getStoreSize();
-  if (Size > 1 && Size <= 8) {
-    // Prefer LHRL, LRL and LGRL.
-    if (Load->getBasePtr().getOpcode() == SystemZISD::PCREL_WRAPPER)
-      return false;
-    // Prefer STHRL, STRL and STGRL.
-    if (Store->getBasePtr().getOpcode() == SystemZISD::PCREL_WRAPPER)
-      return false;
-  }
-
   // There's no chance of overlap if the load is invariant.
   if (Load->isInvariant())
     return true;
 
-  // If both operands are aligned, they must be equal or not overlap.
-  if (Load->getAlignment() >= Size && Store->getAlignment() >= Size)
-    return true;
-
   // Otherwise we need to check whether there's an alias.
   const Value *V1 = Load->getSrcValue();
   const Value *V2 = Store->getSrcValue();
   if (!V1 || !V2)
     return false;
 
+  // Reject equality.
+  uint64_t Size = Load->getMemoryVT().getStoreSize();
   int64_t End1 = Load->getSrcValueOffset() + Size;
   int64_t End2 = Store->getSrcValueOffset() + Size;
+  if (V1 == V2 && End1 == End2)
+    return false;
+
   return !AA->alias(AliasAnalysis::Location(V1, End1, Load->getTBAAInfo()),
                     AliasAnalysis::Location(V2, End2, Store->getTBAAInfo()));
 }
 
+bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
+  StoreSDNode *Store = cast<StoreSDNode>(N);
+  LoadSDNode *Load = cast<LoadSDNode>(Store->getValue());
+
+  // Prefer not to use MVC if either address can use ... RELATIVE LONG
+  // instructions.
+  uint64_t Size = Load->getMemoryVT().getStoreSize();
+  if (Size > 1 && Size <= 8) {
+    // Prefer LHRL, LRL and LGRL.
+    if (SystemZISD::isPCREL(Load->getBasePtr().getOpcode()))
+      return false;
+    // Prefer STHRL, STRL and STGRL.
+    if (SystemZISD::isPCREL(Store->getBasePtr().getOpcode()))
+      return false;
+  }
+
+  return canUseBlockOperation(Store, Load);
+}
+
+bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
+                                                     unsigned I) const {
+  StoreSDNode *StoreA = cast<StoreSDNode>(N);
+  LoadSDNode *LoadA = cast<LoadSDNode>(StoreA->getValue().getOperand(1 - I));
+  LoadSDNode *LoadB = cast<LoadSDNode>(StoreA->getValue().getOperand(I));
+  return !LoadA->isVolatile() && canUseBlockOperation(StoreA, LoadB);
+}
+
 SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
   // Dump information about the Node being selected
   DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
@@ -950,6 +1029,7 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
     DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
     return 0;
   }
 
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 6acdcd4..f6e1853 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -23,8 +23,23 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 
+#include <cctype>
+
 using namespace llvm;
 
+namespace {
+// Represents a sequence for extracting a 0/1 value from an IPM result:
+// (((X ^ XORValue) + AddValue) >> Bit)
+struct IPMConversion {
+  IPMConversion(unsigned xorValue, int64_t addValue, unsigned bit)
+    : XORValue(xorValue), AddValue(addValue), Bit(bit) {}
+
+  int64_t XORValue;
+  int64_t AddValue;
+  unsigned Bit;
+};
+}
+
 // Classify VT as either 32 or 64 bit.
 static bool is32Bit(EVT VT) {
   switch (VT.getSimpleVT().SimpleTy) {
@@ -51,7 +66,10 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
   MVT PtrVT = getPointerTy();
 
   // Set up the register classes.
-  addRegisterClass(MVT::i32,  &SystemZ::GR32BitRegClass);
+  if (Subtarget.hasHighWord())
+    addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
+  else
+    addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
   addRegisterClass(MVT::i64,  &SystemZ::GR64BitRegClass);
   addRegisterClass(MVT::f32,  &SystemZ::FP32BitRegClass);
   addRegisterClass(MVT::f64,  &SystemZ::FP64BitRegClass);
@@ -83,8 +101,8 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
        ++I) {
     MVT VT = MVT::SimpleValueType(I);
     if (isTypeLegal(VT)) {
-      // Expand SETCC(X, Y, COND) into SELECT_CC(X, Y, 1, 0, COND).
-      setOperationAction(ISD::SETCC, VT, Expand);
+      // Lower SET_CC into an IPM-based sequence.
+      setOperationAction(ISD::SETCC, VT, Custom);
 
       // Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
       setOperationAction(ISD::SELECT, VT, Expand);
@@ -128,9 +146,11 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
       setOperationAction(ISD::ROTR,            VT, Expand);
 
-      // Use *MUL_LOHI where possible and a wider multiplication otherwise.
+      // Use *MUL_LOHI where possible instead of MULH*.
       setOperationAction(ISD::MULHS, VT, Expand);
       setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Custom);
+      setOperationAction(ISD::UMUL_LOHI, VT, Custom);
 
       // We have instructions for signed but not unsigned FP conversion.
       setOperationAction(ISD::FP_TO_UINT, VT, Expand);
@@ -165,14 +185,6 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
   // Give LowerOperation the chance to replace 64-bit ORs with subregs.
   setOperationAction(ISD::OR, MVT::i64, Custom);
 
-  // The architecture has 32-bit SMUL_LOHI and UMUL_LOHI (MR and MLR),
-  // but they aren't really worth using.  There is no 64-bit SMUL_LOHI,
-  // but there is a 64-bit UMUL_LOHI: MLGR.
-  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
-
   // FIXME: Can we support these natively?
   setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
   setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
@@ -200,6 +212,9 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
   setOperationAction(ISD::STACKSAVE,    MVT::Other, Custom);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
 
+  // Handle prefetches with PFD or PFDRL.
+  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
   // Handle floating-point types.
   for (unsigned I = MVT::FIRST_FP_VALUETYPE;
        I <= MVT::LAST_FP_VALUETYPE;
@@ -209,6 +224,15 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
       // We can use FI for FRINT.
       setOperationAction(ISD::FRINT, VT, Legal);
 
+      // We can use the extended form of FI for other rounding operations.
+      if (Subtarget.hasFPExtension()) {
+        setOperationAction(ISD::FNEARBYINT, VT, Legal);
+        setOperationAction(ISD::FFLOOR, VT, Legal);
+        setOperationAction(ISD::FCEIL, VT, Legal);
+        setOperationAction(ISD::FTRUNC, VT, Legal);
+        setOperationAction(ISD::FROUND, VT, Legal);
+      }
+
       // No special instructions for these.
       setOperationAction(ISD::FSIN, VT, Expand);
       setOperationAction(ISD::FCOS, VT, Expand);
@@ -255,8 +279,13 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
   MaxStoresPerMemsetOptSize = 0;
 }
 
-bool
-SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+EVT SystemZTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   VT = VT.getScalarType();
 
   if (!VT.isSimple())
@@ -305,6 +334,22 @@ bool SystemZTargetLowering::isLegalAddressingMode(const AddrMode &AM,
   return AM.Scale == 0 || AM.Scale == 1;
 }
 
+bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
+  if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
+    return false;
+  unsigned FromBits = FromType->getPrimitiveSizeInBits();
+  unsigned ToBits = ToType->getPrimitiveSizeInBits();
+  return FromBits > ToBits;
+}
+
+bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
+  if (!FromVT.isInteger() || !ToVT.isInteger())
+    return false;
+  unsigned FromBits = FromVT.getSizeInBits();
+  unsigned ToBits = ToVT.getSizeInBits();
+  return FromBits > ToBits;
+}
+
 //===----------------------------------------------------------------------===//
 // Inline asm support
 //===----------------------------------------------------------------------===//
@@ -316,6 +361,7 @@ SystemZTargetLowering::getConstraintType(const std::string &Constraint) const {
     case 'a': // Address register
     case 'd': // Data register (equivalent to 'r')
     case 'f': // Floating-point register
+    case 'h': // High-part register
     case 'r': // General-purpose register
       return C_RegisterClass;
 
@@ -358,6 +404,7 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
 
   case 'a': // Address register
   case 'd': // Data register (equivalent to 'r')
+  case 'h': // High-part register
   case 'r': // General-purpose register
     if (CallOperandVal->getType()->isIntegerTy())
       weight = CW_Register;
@@ -438,6 +485,9 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const {
         return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
       return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
 
+    case 'h': // High-part register (an LLVM extension)
+      return std::make_pair(0U, &SystemZ::GRH32BitRegClass);
+
     case 'f': // Floating-point register
       if (VT == MVT::f64)
         return std::make_pair(0U, &SystemZ::FP64BitRegClass);
@@ -527,6 +577,17 @@ LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
 
 #include "SystemZGenCallingConv.inc"
 
+bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
+                                                     Type *ToType) const {
+  return isTruncateFree(FromType, ToType);
+}
+
+bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!CI->isTailCall())
+    return false;
+  return true;
+}
+
 // Value is a value that has been passed to us in the location described by VA
 // (and so has type VA.getLocVT()).  Convert Value to VA.getValVT(), chaining
 // any loads onto Chain.
@@ -689,6 +750,23 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
   return Chain;
 }
 
+static bool canUseSiblingCall(CCState ArgCCInfo,
+                              SmallVectorImpl<CCValAssign> &ArgLocs) {
+  // Punt if there are any indirect or stack arguments, or if the call
+  // needs the call-saved argument register R6.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    if (VA.getLocInfo() == CCValAssign::Indirect)
+      return false;
+    if (!VA.isRegLoc())
+      return false;
+    unsigned Reg = VA.getLocReg();
+    if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
+      return false;
+  }
+  return true;
+}
+
 SDValue
 SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                  SmallVectorImpl<SDValue> &InVals) const {
@@ -699,26 +777,29 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
   SDValue Chain = CLI.Chain;
   SDValue Callee = CLI.Callee;
-  bool &isTailCall = CLI.IsTailCall;
+  bool &IsTailCall = CLI.IsTailCall;
   CallingConv::ID CallConv = CLI.CallConv;
   bool IsVarArg = CLI.IsVarArg;
   MachineFunction &MF = DAG.getMachineFunction();
   EVT PtrVT = getPointerTy();
 
-  // SystemZ target does not yet support tail call optimization.
-  isTailCall = false;
-
   // Analyze the operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState ArgCCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext());
   ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
 
+  // We don't support GuaranteedTailCallOpt, only automatically-detected
+  // sibling calls.
+  if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs))
+    IsTailCall = false;
+
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = ArgCCInfo.getNextStackOffset();
 
   // Mark the start of the call.
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, PtrVT, true),
-                               DL);
+  if (!IsTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, PtrVT, true),
+                                 DL);
 
   // Copy argument values to their designated locations.
   SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
@@ -767,22 +848,27 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
-  // Build a sequence of copy-to-reg nodes, chained and glued together.
-  SDValue Glue;
-  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
-    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
-                             RegsToPass[I].second, Glue);
-    Glue = Chain.getValue(1);
-  }
-
   // Accept direct calls by converting symbolic call addresses to the
-  // associated Target* opcodes.
+  // associated Target* opcodes.  Force %r1 to be used for indirect
+  // tail calls.
+  SDValue Glue;
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
     Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
     Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+  } else if (IsTailCall) {
+    Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
+    Glue = Chain.getValue(1);
+    Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
+  }
+
+  // Build a sequence of copy-to-reg nodes, chained and glued together.
+  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+                             RegsToPass[I].second, Glue);
+    Glue = Chain.getValue(1);
   }
 
   // The first call operand is the chain and the second is the target address.
@@ -802,6 +888,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Emit the call.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  if (IsTailCall)
+    return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, &Ops[0], Ops.size());
   Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, &Ops[0], Ops.size());
   Glue = Chain.getValue(1);
 
@@ -910,6 +998,73 @@ static unsigned CCMaskForCondCode(ISD::CondCode CC) {
 #undef CONV
 }
 
+// Return a sequence for getting a 1 from an IPM result when CC has a
+// value in CCMask and a 0 when CC has a value in CCValid & ~CCMask.
+// The handling of CC values outside CCValid doesn't matter.
+static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
+  // Deal with cases where the result can be taken directly from a bit
+  // of the IPM result.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, 0, SystemZ::IPM_CC);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_2 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, 0, SystemZ::IPM_CC + 1);
+
+  // Deal with cases where we can add a value to force the sign bit
+  // to contain the right value.  Putting the bit in 31 means we can
+  // use SRL rather than RISBG(L), and also makes it easier to get a
+  // 0/-1 value, so it has priority over the other tests below.
+  //
+  // These sequences rely on the fact that the upper two bits of the
+  // IPM result are zero.
+  uint64_t TopBit = uint64_t(1) << 31;
+  if (CCMask == (CCValid & SystemZ::CCMASK_0))
+    return IPMConversion(0, -(1 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_1)))
+    return IPMConversion(0, -(2 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_2)))
+    return IPMConversion(0, -(3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & SystemZ::CCMASK_3))
+    return IPMConversion(0, TopBit - (3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_2
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(0, TopBit - (1 << SystemZ::IPM_CC), 31);
+
+  // Next try inverting the value and testing a bit.  0/1 could be
+  // handled this way too, but we dealt with that case above.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_2)))
+    return IPMConversion(-1, 0, SystemZ::IPM_CC);
+
+  // Handle cases where adding a value forces a non-sign bit to contain
+  // the right value.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_2)))
+    return IPMConversion(0, 1 << SystemZ::IPM_CC, SystemZ::IPM_CC + 1);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
+
+  // The remaing cases are 1, 2, 0/1/3 and 0/2/3.  All these are
+  // can be done by inverting the low CC bit and applying one of the
+  // sign-based extractions above.
+  if (CCMask == (CCValid & SystemZ::CCMASK_1))
+    return IPMConversion(1 << SystemZ::IPM_CC, -(1 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & SystemZ::CCMASK_2))
+    return IPMConversion(1 << SystemZ::IPM_CC,
+                         TopBit - (3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(1 << SystemZ::IPM_CC, -(3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_2
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(1 << SystemZ::IPM_CC,
+                         TopBit - (1 << SystemZ::IPM_CC), 31);
+
+  llvm_unreachable("Unexpected CC combination");
+}
+
 // If a comparison described by IsUnsigned, CCMask, CmpOp0 and CmpOp1
 // can be converted to a comparison against zero, adjust the operands
 // as necessary.
@@ -1009,74 +1164,309 @@ static void adjustSubwordCmp(SelectionDAG &DAG, bool &IsUnsigned,
     CmpOp1 = DAG.getConstant(Value, MVT::i32);
 }
 
-// Return true if a comparison described by CCMask, CmpOp0 and CmpOp1
-// is an equality comparison that is better implemented using unsigned
-// rather than signed comparison instructions.
-static bool preferUnsignedComparison(SelectionDAG &DAG, SDValue CmpOp0,
-                                     SDValue CmpOp1, unsigned CCMask) {
-  // The test must be for equality or inequality.
-  if (CCMask != SystemZ::CCMASK_CMP_EQ && CCMask != SystemZ::CCMASK_CMP_NE)
+// Return true if Op is either an unextended load, or a load suitable
+// for integer register-memory comparisons of type ICmpType.
+static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op.getNode());
+  if (Load) {
+    // There are no instructions to compare a register with a memory byte.
+    if (Load->getMemoryVT() == MVT::i8)
+      return false;
+    // Otherwise decide on extension type.
+    switch (Load->getExtensionType()) {
+    case ISD::NON_EXTLOAD:
+      return true;
+    case ISD::SEXTLOAD:
+      return ICmpType != SystemZICMP::UnsignedOnly;
+    case ISD::ZEXTLOAD:
+      return ICmpType != SystemZICMP::SignedOnly;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+// Return true if it is better to swap comparison operands Op0 and Op1.
+// ICmpType is the type of an integer comparison.
+static bool shouldSwapCmpOperands(SDValue Op0, SDValue Op1,
+                                  unsigned ICmpType) {
+  // Leave f128 comparisons alone, since they have no memory forms.
+  if (Op0.getValueType() == MVT::f128)
     return false;
 
-  if (CmpOp1.getOpcode() == ISD::Constant) {
-    uint64_t Value = cast<ConstantSDNode>(CmpOp1)->getSExtValue();
+  // Always keep a floating-point constant second, since comparisons with
+  // zero can use LOAD TEST and comparisons with other constants make a
+  // natural memory operand.
+  if (isa<ConstantFPSDNode>(Op1))
+    return false;
 
-    // If we're comparing with memory, prefer unsigned comparisons for
-    // values that are in the unsigned 16-bit range but not the signed
-    // 16-bit range.  We want to use CLFHSI and CLGHSI.
-    if (CmpOp0.hasOneUse() &&
-        ISD::isNormalLoad(CmpOp0.getNode()) &&
-        (Value >= 32768 && Value < 65536))
-      return true;
+  // Never swap comparisons with zero since there are many ways to optimize
+  // those later.
+  ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
+  if (COp1 && COp1->getZExtValue() == 0)
+    return false;
 
-    // Use unsigned comparisons for values that are in the CLGFI range
-    // but not in the CGFI range.
-    if (CmpOp0.getValueType() == MVT::i64 && (Value >> 31) == 1)
+  // Look for cases where Cmp0 is a single-use load and Cmp1 isn't.
+  // In that case we generally prefer the memory to be second.
+  if ((isNaturalMemoryOperand(Op0, ICmpType) && Op0.hasOneUse()) &&
+      !(isNaturalMemoryOperand(Op1, ICmpType) && Op1.hasOneUse())) {
+    // The only exceptions are when the second operand is a constant and
+    // we can use things like CHHSI.
+    if (!COp1)
       return true;
+    // The unsigned memory-immediate instructions can handle 16-bit
+    // unsigned integers.
+    if (ICmpType != SystemZICMP::SignedOnly &&
+        isUInt<16>(COp1->getZExtValue()))
+      return false;
+    // The signed memory-immediate instructions can handle 16-bit
+    // signed integers.
+    if (ICmpType != SystemZICMP::UnsignedOnly &&
+        isInt<16>(COp1->getSExtValue()))
+      return false;
+    return true;
+  }
+  return false;
+}
+
+// Return true if shift operation N has an in-range constant shift value.
+// Store it in ShiftVal if so.
+static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
+  ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!Shift)
+    return false;
 
+  uint64_t Amount = Shift->getZExtValue();
+  if (Amount >= N.getValueType().getSizeInBits())
     return false;
+
+  ShiftVal = Amount;
+  return true;
+}
+
+// Check whether an AND with Mask is suitable for a TEST UNDER MASK
+// instruction and whether the CC value is descriptive enough to handle
+// a comparison of type Opcode between the AND result and CmpVal.
+// CCMask says which comparison result is being tested and BitSize is
+// the number of bits in the operands.  If TEST UNDER MASK can be used,
+// return the corresponding CC mask, otherwise return 0.
+static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
+                                     uint64_t Mask, uint64_t CmpVal,
+                                     unsigned ICmpType) {
+  assert(Mask != 0 && "ANDs with zero should have been removed by now");
+
+  // Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
+  if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
+      !SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
+    return 0;
+
+  // Work out the masks for the lowest and highest bits.
+  unsigned HighShift = 63 - countLeadingZeros(Mask);
+  uint64_t High = uint64_t(1) << HighShift;
+  uint64_t Low = uint64_t(1) << countTrailingZeros(Mask);
+
+  // Signed ordered comparisons are effectively unsigned if the sign
+  // bit is dropped.
+  bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);
+
+  // Check for equality comparisons with 0, or the equivalent.
+  if (CmpVal == 0) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_NE)
+      return SystemZ::CCMASK_TM_SOME_1;
+  }
+  if (EffectivelyUnsigned && CmpVal <= Low) {
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_SOME_1;
+  }
+  if (EffectivelyUnsigned && CmpVal < Low) {
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_SOME_1;
   }
 
-  // Prefer CL for zero-extended loads.
-  if (CmpOp1.getOpcode() == ISD::ZERO_EXTEND ||
-      ISD::isZEXTLoad(CmpOp1.getNode()))
-    return true;
+  // Check for equality comparisons with the mask, or the equivalent.
+  if (CmpVal == Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_NE)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
+  if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
+  if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
 
-  // ...and for "in-register" zero extensions.
-  if (CmpOp1.getOpcode() == ISD::AND && CmpOp1.getValueType() == MVT::i64) {
-    SDValue Mask = CmpOp1.getOperand(1);
-    if (Mask.getOpcode() == ISD::Constant &&
-        cast<ConstantSDNode>(Mask)->getZExtValue() == 0xffffffff)
-      return true;
+  // Check for ordered comparisons with the top bit.
+  if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_MSB_1;
+  }
+  if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_MSB_1;
   }
 
-  return false;
+  // If there are just two bits, we can do equality checks for Low and High
+  // as well.
+  if (Mask == Low + High) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
+      return SystemZ::CCMASK_TM_MIXED_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
+      return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
+    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
+      return SystemZ::CCMASK_TM_MIXED_MSB_1;
+    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
+      return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
+  }
+
+  // Looks like we've exhausted our options.
+  return 0;
+}
+
+// See whether the comparison (Opcode CmpOp0, CmpOp1, ICmpType) can be
+// implemented as a TEST UNDER MASK instruction when the condition being
+// tested is as described by CCValid and CCMask.  Update the arguments
+// with the TM version if so.
+static void adjustForTestUnderMask(SelectionDAG &DAG, unsigned &Opcode,
+                                   SDValue &CmpOp0, SDValue &CmpOp1,
+                                   unsigned &CCValid, unsigned &CCMask,
+                                   unsigned &ICmpType) {
+  // Check that we have a comparison with a constant.
+  ConstantSDNode *ConstCmpOp1 = dyn_cast<ConstantSDNode>(CmpOp1);
+  if (!ConstCmpOp1)
+    return;
+  uint64_t CmpVal = ConstCmpOp1->getZExtValue();
+
+  // Check whether the nonconstant input is an AND with a constant mask.
+  if (CmpOp0.getOpcode() != ISD::AND)
+    return;
+  SDValue AndOp0 = CmpOp0.getOperand(0);
+  SDValue AndOp1 = CmpOp0.getOperand(1);
+  ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(AndOp1.getNode());
+  if (!Mask)
+    return;
+  uint64_t MaskVal = Mask->getZExtValue();
+
+  // Check whether the combination of mask, comparison value and comparison
+  // type are suitable.
+  unsigned BitSize = CmpOp0.getValueType().getSizeInBits();
+  unsigned NewCCMask, ShiftVal;
+  if (ICmpType != SystemZICMP::SignedOnly &&
+      AndOp0.getOpcode() == ISD::SHL &&
+      isSimpleShift(AndOp0, ShiftVal) &&
+      (NewCCMask = getTestUnderMaskCond(BitSize, CCMask, MaskVal >> ShiftVal,
+                                        CmpVal >> ShiftVal,
+                                        SystemZICMP::Any))) {
+    AndOp0 = AndOp0.getOperand(0);
+    AndOp1 = DAG.getConstant(MaskVal >> ShiftVal, AndOp0.getValueType());
+  } else if (ICmpType != SystemZICMP::SignedOnly &&
+             AndOp0.getOpcode() == ISD::SRL &&
+             isSimpleShift(AndOp0, ShiftVal) &&
+             (NewCCMask = getTestUnderMaskCond(BitSize, CCMask,
+                                               MaskVal << ShiftVal,
+                                               CmpVal << ShiftVal,
+                                               SystemZICMP::UnsignedOnly))) {
+    AndOp0 = AndOp0.getOperand(0);
+    AndOp1 = DAG.getConstant(MaskVal << ShiftVal, AndOp0.getValueType());
+  } else {
+    NewCCMask = getTestUnderMaskCond(BitSize, CCMask, MaskVal, CmpVal,
+                                     ICmpType);
+    if (!NewCCMask)
+      return;
+  }
+
+  // Go ahead and make the change.
+  Opcode = SystemZISD::TM;
+  CmpOp0 = AndOp0;
+  CmpOp1 = AndOp1;
+  ICmpType = (bool(NewCCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
+              bool(NewCCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
+  CCValid = SystemZ::CCMASK_TM;
+  CCMask = NewCCMask;
 }
 
 // Return a target node that compares CmpOp0 with CmpOp1 and stores a
 // 2-bit result in CC.  Set CCValid to the CCMASK_* of all possible
 // 2-bit results and CCMask to the subset of those results that are
 // associated with Cond.
-static SDValue emitCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
+static SDValue emitCmp(const SystemZTargetMachine &TM, SelectionDAG &DAG,
+                       SDLoc DL, SDValue CmpOp0, SDValue CmpOp1,
                        ISD::CondCode Cond, unsigned &CCValid,
                        unsigned &CCMask) {
   bool IsUnsigned = false;
   CCMask = CCMaskForCondCode(Cond);
-  if (CmpOp0.getValueType().isFloatingPoint())
+  unsigned Opcode, ICmpType = 0;
+  if (CmpOp0.getValueType().isFloatingPoint()) {
     CCValid = SystemZ::CCMASK_FCMP;
-  else {
+    Opcode = SystemZISD::FCMP;
+  } else {
     IsUnsigned = CCMask & SystemZ::CCMASK_CMP_UO;
     CCValid = SystemZ::CCMASK_ICMP;
     CCMask &= CCValid;
     adjustZeroCmp(DAG, IsUnsigned, CmpOp0, CmpOp1, CCMask);
     adjustSubwordCmp(DAG, IsUnsigned, CmpOp0, CmpOp1, CCMask);
-    if (preferUnsignedComparison(DAG, CmpOp0, CmpOp1, CCMask))
-      IsUnsigned = true;
+    Opcode = SystemZISD::ICMP;
+    // Choose the type of comparison.  Equality and inequality tests can
+    // use either signed or unsigned comparisons.  The choice also doesn't
+    // matter if both sign bits are known to be clear.  In those cases we
+    // want to give the main isel code the freedom to choose whichever
+    // form fits best.
+    if (CCMask == SystemZ::CCMASK_CMP_EQ ||
+        CCMask == SystemZ::CCMASK_CMP_NE ||
+        (DAG.SignBitIsZero(CmpOp0) && DAG.SignBitIsZero(CmpOp1)))
+      ICmpType = SystemZICMP::Any;
+    else if (IsUnsigned)
+      ICmpType = SystemZICMP::UnsignedOnly;
+    else
+      ICmpType = SystemZICMP::SignedOnly;
   }
 
-  SDLoc DL(CmpOp0);
-  return DAG.getNode((IsUnsigned ? SystemZISD::UCMP : SystemZISD::CMP),
-                     DL, MVT::Glue, CmpOp0, CmpOp1);
+  if (shouldSwapCmpOperands(CmpOp0, CmpOp1, ICmpType)) {
+    std::swap(CmpOp0, CmpOp1);
+    CCMask = ((CCMask & SystemZ::CCMASK_CMP_EQ) |
+              (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
+              (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
+              (CCMask & SystemZ::CCMASK_CMP_UO));
+  }
+
+  adjustForTestUnderMask(DAG, Opcode, CmpOp0, CmpOp1, CCValid, CCMask,
+                         ICmpType);
+  if (Opcode == SystemZISD::ICMP || Opcode == SystemZISD::TM)
+    return DAG.getNode(Opcode, DL, MVT::Glue, CmpOp0, CmpOp1,
+                       DAG.getConstant(ICmpType, MVT::i32));
+  return DAG.getNode(Opcode, DL, MVT::Glue, CmpOp0, CmpOp1);
+}
+
+// Implement a 32-bit *MUL_LOHI operation by extending both operands to
+// 64 bits.  Extend is the extension type to use.  Store the high part
+// in Hi and the low part in Lo.
+static void lowerMUL_LOHI32(SelectionDAG &DAG, SDLoc DL,
+                            unsigned Extend, SDValue Op0, SDValue Op1,
+                            SDValue &Hi, SDValue &Lo) {
+  Op0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
+  Op1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
+  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, Op0, Op1);
+  Hi = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, DAG.getConstant(32, MVT::i64));
+  Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Hi);
+  Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
 }
 
 // Lower a binary operation that produces two VT results, one in each
@@ -1092,14 +1482,38 @@ static void lowerGR128Binary(SelectionDAG &DAG, SDLoc DL, EVT VT,
   SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped,
                                SDValue(In128, 0), Op1);
   bool Is32Bit = is32Bit(VT);
-  SDValue SubReg0 = DAG.getTargetConstant(SystemZ::even128(Is32Bit), VT);
-  SDValue SubReg1 = DAG.getTargetConstant(SystemZ::odd128(Is32Bit), VT);
-  SDNode *Reg0 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                    VT, Result, SubReg0);
-  SDNode *Reg1 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                    VT, Result, SubReg1);
-  Even = SDValue(Reg0, 0);
-  Odd = SDValue(Reg1, 0);
+  Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result);
+  Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
+}
+
+SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDValue CmpOp0   = Op.getOperand(0);
+  SDValue CmpOp1   = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc DL(Op);
+
+  unsigned CCValid, CCMask;
+  SDValue Glue = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
+
+  IPMConversion Conversion = getIPMConversion(CCValid, CCMask);
+  SDValue Result = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+
+  if (Conversion.XORValue)
+    Result = DAG.getNode(ISD::XOR, DL, MVT::i32, Result,
+                         DAG.getConstant(Conversion.XORValue, MVT::i32));
+
+  if (Conversion.AddValue)
+    Result = DAG.getNode(ISD::ADD, DL, MVT::i32, Result,
+                         DAG.getConstant(Conversion.AddValue, MVT::i32));
+
+  // The SHR/AND sequence should get optimized to an RISBG.
+  Result = DAG.getNode(ISD::SRL, DL, MVT::i32, Result,
+                       DAG.getConstant(Conversion.Bit, MVT::i32));
+  if (Conversion.Bit != 31)
+    Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+                         DAG.getConstant(1, MVT::i32));
+  return Result;
 }
 
 SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -1111,7 +1525,7 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
 
   unsigned CCValid, CCMask;
-  SDValue Flags = emitCmp(DAG, CmpOp0, CmpOp1, CC, CCValid, CCMask);
+  SDValue Flags = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
   return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
                      Chain, DAG.getConstant(CCValid, MVT::i32),
                      DAG.getConstant(CCMask, MVT::i32), Dest, Flags);
@@ -1127,7 +1541,7 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
   SDLoc DL(Op);
 
   unsigned CCValid, CCMask;
-  SDValue Flags = emitCmp(DAG, CmpOp0, CmpOp1, CC, CCValid, CCMask);
+  SDValue Flags = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
 
   SmallVector<SDValue, 5> Ops;
   Ops.push_back(TrueOp);
@@ -1151,18 +1565,18 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
 
   SDValue Result;
   if (Subtarget.isPC32DBLSymbol(GV, RM, CM)) {
-    // Make sure that the offset is aligned to a halfword.  If it isn't,
-    // create an "anchor" at the previous 12-bit boundary.
-    // FIXME check whether there is a better way of handling this.
-    if (Offset & 1) {
-      Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
-                                          Offset & ~uint64_t(0xfff));
-      Offset &= 0xfff;
-    } else {
-      Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Offset);
+    // Assign anchors at 1<<12 byte boundaries.
+    uint64_t Anchor = Offset & ~uint64_t(0xfff);
+    Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
+    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+
+    // The offset can be folded into the address if it is aligned to a halfword.
+    Offset -= Anchor;
+    if (Offset != 0 && (Offset & 1) == 0) {
+      SDValue Full = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Offset);
+      Result = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Result);
       Offset = 0;
     }
-    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
   } else {
     Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
     Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
@@ -1264,24 +1678,33 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
   EVT InVT = In.getValueType();
   EVT ResVT = Op.getValueType();
 
-  SDValue SubReg32 = DAG.getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
-  SDValue Shift32 = DAG.getConstant(32, MVT::i64);
   if (InVT == MVT::i32 && ResVT == MVT::f32) {
-    SDValue In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
-    SDValue Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, In64, Shift32);
-    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Shift);
-    SDNode *Out = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                     MVT::f32, Out64, SubReg32);
-    return SDValue(Out, 0);
+    SDValue In64;
+    if (Subtarget.hasHighWord()) {
+      SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
+                                       MVT::i64);
+      In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
+                                       MVT::i64, SDValue(U64, 0), In);
+    } else {
+      In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
+      In64 = DAG.getNode(ISD::SHL, DL, MVT::i64, In64,
+                         DAG.getConstant(32, MVT::i64));
+    }
+    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, In64);
+    return DAG.getTargetExtractSubreg(SystemZ::subreg_h32,
+                                      DL, MVT::f32, Out64);
   }
   if (InVT == MVT::f32 && ResVT == MVT::i32) {
     SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
-    SDNode *In64 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
-                                      MVT::f64, SDValue(U64, 0), In, SubReg32);
-    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, SDValue(In64, 0));
-    SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64, Shift32);
-    SDValue Out = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
-    return Out;
+    SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
+                                             MVT::f64, SDValue(U64, 0), In);
+    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64);
+    if (Subtarget.hasHighWord())
+      return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL,
+                                        MVT::i32, Out64);
+    SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64,
+                                DAG.getConstant(32, MVT::i64));
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
   }
   llvm_unreachable("Unexpected bitcast combination");
 }
@@ -1364,18 +1787,64 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
-SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
+SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
                                               SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
-  assert(!is32Bit(VT) && "Only support 64-bit UMUL_LOHI");
+  SDValue Ops[2];
+  if (is32Bit(VT))
+    // Just do a normal 64-bit multiplication and extract the results.
+    // We define this so that it can be used for constant division.
+    lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
+                    Op.getOperand(1), Ops[1], Ops[0]);
+  else {
+    // Do a full 128-bit multiplication based on UMUL_LOHI64:
+    //
+    //   (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
+    //
+    // but using the fact that the upper halves are either all zeros
+    // or all ones:
+    //
+    //   (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
+    //
+    // and grouping the right terms together since they are quicker than the
+    // multiplication:
+    //
+    //   (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
+    SDValue C63 = DAG.getConstant(63, MVT::i64);
+    SDValue LL = Op.getOperand(0);
+    SDValue RL = Op.getOperand(1);
+    SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
+    SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
+    // UMUL_LOHI64 returns the low result in the odd register and the high
+    // result in the even register.  SMUL_LOHI is defined to return the
+    // low half first, so the results are in reverse order.
+    lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+                     LL, RL, Ops[1], Ops[0]);
+    SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
+    SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
+    SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
+    Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
+  }
+  return DAG.getMergeValues(Ops, 2, DL);
+}
 
-  // UMUL_LOHI64 returns the low result in the odd register and the high
-  // result in the even register.  UMUL_LOHI is defined to return the
-  // low half first, so the results are in reverse order.
+SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
   SDValue Ops[2];
-  lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
-                   Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+  if (is32Bit(VT))
+    // Just do a normal 64-bit multiplication and extract the results.
+    // We define this so that it can be used for constant division.
+    lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
+                    Op.getOperand(1), Ops[1], Ops[0]);
+  else
+    // UMUL_LOHI64 returns the low result in the odd register and the high
+    // result in the even register.  UMUL_LOHI is defined to return the
+    // low half first, so the results are in reverse order.
+    lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
@@ -1464,10 +1933,10 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
   // high 32 bits and just masks out low bits.  We can skip it if so.
   if (HighOp.getOpcode() == ISD::AND &&
       HighOp.getOperand(1).getOpcode() == ISD::Constant) {
-    ConstantSDNode *MaskNode = cast<ConstantSDNode>(HighOp.getOperand(1));
-    uint64_t Mask = MaskNode->getZExtValue() | Masks[High];
-    if ((Mask >> 32) == 0xffffffff)
-      HighOp = HighOp.getOperand(0);
+    SDValue HighOp0 = HighOp.getOperand(0);
+    uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
+    if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff))))
+      HighOp = HighOp0;
   }
 
   // Take advantage of the fact that all GR32 operations only change the
@@ -1476,10 +1945,8 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
   // can be folded.
   SDLoc DL(Op);
   SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
-  SDValue SubReg32 = DAG.getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
-  SDNode *Result = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
-                                      MVT::i64, HighOp, Low32, SubReg32);
-  return SDValue(Result, 0);
+  return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL,
+                                   MVT::i64, HighOp, Low32);
 }
 
 // Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation.  Lower the first
@@ -1618,6 +2085,26 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
                           SystemZ::R15D, Op.getOperand(1));
 }
 
+SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  bool IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  if (!IsData)
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
+  bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
+  MemIntrinsicSDNode *Node = cast<MemIntrinsicSDNode>(Op.getNode());
+  SDValue Ops[] = {
+    Op.getOperand(0),
+    DAG.getConstant(Code, MVT::i32),
+    Op.getOperand(1)
+  };
+  return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, SDLoc(Op),
+                                 Node->getVTList(), Ops, array_lengthof(Ops),
+                                 Node->getMemoryVT(), Node->getMemOperand());
+}
+
 SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -1625,6 +2112,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerBR_CC(Op, DAG);
   case ISD::SELECT_CC:
     return lowerSELECT_CC(Op, DAG);
+  case ISD::SETCC:
+    return lowerSETCC(Op, DAG);
   case ISD::GlobalAddress:
     return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
   case ISD::GlobalTLSAddress:
@@ -1643,6 +2132,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerVACOPY(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return lowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::SMUL_LOHI:
+    return lowerSMUL_LOHI(Op, DAG);
   case ISD::UMUL_LOHI:
     return lowerUMUL_LOHI(Op, DAG);
   case ISD::SDIVREM:
@@ -1679,6 +2170,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerSTACKSAVE(Op, DAG);
   case ISD::STACKRESTORE:
     return lowerSTACKRESTORE(Op, DAG);
+  case ISD::PREFETCH:
+    return lowerPREFETCH(Op, DAG);
   default:
     llvm_unreachable("Unexpected node to lower");
   }
@@ -1689,9 +2182,12 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
     OPCODE(RET_FLAG);
     OPCODE(CALL);
+    OPCODE(SIBCALL);
     OPCODE(PCREL_WRAPPER);
-    OPCODE(CMP);
-    OPCODE(UCMP);
+    OPCODE(PCREL_OFFSET);
+    OPCODE(ICMP);
+    OPCODE(FCMP);
+    OPCODE(TM);
     OPCODE(BR_CCMASK);
     OPCODE(SELECT_CCMASK);
     OPCODE(ADJDYNALLOC);
@@ -1701,6 +2197,19 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(UDIVREM32);
     OPCODE(UDIVREM64);
     OPCODE(MVC);
+    OPCODE(MVC_LOOP);
+    OPCODE(NC);
+    OPCODE(NC_LOOP);
+    OPCODE(OC);
+    OPCODE(OC_LOOP);
+    OPCODE(XC);
+    OPCODE(XC_LOOP);
+    OPCODE(CLC);
+    OPCODE(CLC_LOOP);
+    OPCODE(STRCMP);
+    OPCODE(STPCPY);
+    OPCODE(SEARCH_STRING);
+    OPCODE(IPM);
     OPCODE(ATOMIC_SWAPW);
     OPCODE(ATOMIC_LOADW_ADD);
     OPCODE(ATOMIC_LOADW_SUB);
@@ -1713,6 +2222,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(ATOMIC_LOADW_UMIN);
     OPCODE(ATOMIC_LOADW_UMAX);
     OPCODE(ATOMIC_CMP_SWAPW);
+    OPCODE(PREFETCH);
   }
   return NULL;
 #undef OPCODE
@@ -1742,6 +2252,31 @@ static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
   return NewMBB;
 }
 
+// Split MBB before MI and return the new block (the one that contains MI).
+static MachineBasicBlock *splitBlockBefore(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) {
+  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+  NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
+  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return NewMBB;
+}
+
+// Force base value Base into a register before MI.  Return the register.
+static unsigned forceReg(MachineInstr *MI, MachineOperand &Base,
+                         const SystemZInstrInfo *TII) {
+  if (Base.isReg())
+    return Base.getReg();
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LA), Reg)
+    .addOperand(Base).addImm(0).addReg(0);
+  return Reg;
+}
+
 // Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
 MachineBasicBlock *
 SystemZTargetLowering::emitSelect(MachineInstr *MI,
@@ -1756,7 +2291,7 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
   DebugLoc DL       = MI->getDebugLoc();
 
   MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *JoinMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *JoinMBB  = splitBlockBefore(MI, MBB);
   MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
 
   //  StartMBB:
@@ -1777,7 +2312,7 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
   //   %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
   //  ...
   MBB = JoinMBB;
-  BuildMI(*MBB, MBB->begin(), DL, TII->get(SystemZ::PHI), DestReg)
+  BuildMI(*MBB, MI, DL, TII->get(SystemZ::PHI), DestReg)
     .addReg(TrueReg).addMBB(StartMBB)
     .addReg(FalseReg).addMBB(FalseMBB);
 
@@ -1824,7 +2359,7 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
     CCMask ^= CCValid;
 
   MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *JoinMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *JoinMBB  = splitBlockBefore(MI, MBB);
   MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
 
   //  StartMBB:
@@ -1900,7 +2435,7 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
 
   // Insert a basic block for the main loop.
   MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *DoneMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *DoneMBB  = splitBlockBefore(MI, MBB);
   MachineBasicBlock *LoopMBB  = emitBlockAfter(StartMBB);
 
   //  StartMBB:
@@ -1934,11 +2469,11 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
       .addReg(RotatedOldVal).addOperand(Src2);
     if (BitSize < 32)
       // XILF with the upper BitSize bits set.
-      BuildMI(MBB, DL, TII->get(SystemZ::XILF32), RotatedNewVal)
+      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
         .addReg(Tmp).addImm(uint32_t(~0 << (32 - BitSize)));
     else if (BitSize == 32)
       // XILF with every bit set.
-      BuildMI(MBB, DL, TII->get(SystemZ::XILF32), RotatedNewVal)
+      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
         .addReg(Tmp).addImm(~uint32_t(0));
     else {
       // Use LCGR and add -1 to the result, which is more compact than
@@ -2022,7 +2557,7 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
 
   // Insert 3 basic blocks for the loop.
   MachineBasicBlock *StartMBB  = MBB;
-  MachineBasicBlock *DoneMBB   = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *DoneMBB   = splitBlockBefore(MI, MBB);
   MachineBasicBlock *LoopMBB   = emitBlockAfter(StartMBB);
   MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
   MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
@@ -2129,7 +2664,7 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
 
   // Insert 2 basic blocks for the loop.
   MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *DoneMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *DoneMBB  = splitBlockBefore(MI, MBB);
   MachineBasicBlock *LoopMBB  = emitBlockAfter(StartMBB);
   MachineBasicBlock *SetMBB   = emitBlockAfter(LoopMBB);
 
@@ -2205,8 +2740,8 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
 
 // Emit an extension from a GR32 or GR64 to a GR128.  ClearEven is true
 // if the high register of the GR128 value must be cleared or false if
-// it's "don't care".  SubReg is subreg_odd32 when extending a GR32
-// and subreg_odd when extending a GR64.
+// it's "don't care".  SubReg is subreg_l32 when extending a GR32
+// and subreg_l64 when extending a GR64.
 MachineBasicBlock *
 SystemZTargetLowering::emitExt128(MachineInstr *MI,
                                   MachineBasicBlock *MBB,
@@ -2228,7 +2763,7 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI,
     BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64)
       .addImm(0);
     BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewIn128)
-      .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_high);
+      .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_h64);
     In128 = NewIn128;
   }
   BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
@@ -2239,28 +2774,237 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI,
 }
 
 MachineBasicBlock *
-SystemZTargetLowering::emitMVCWrapper(MachineInstr *MI,
-                                      MachineBasicBlock *MBB) const {
+SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
+                                         MachineBasicBlock *MBB,
+                                         unsigned Opcode) const {
   const SystemZInstrInfo *TII = TM.getInstrInfo();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   DebugLoc DL = MI->getDebugLoc();
 
-  MachineOperand DestBase = MI->getOperand(0);
+  MachineOperand DestBase = earlyUseOperand(MI->getOperand(0));
   uint64_t       DestDisp = MI->getOperand(1).getImm();
-  MachineOperand SrcBase  = MI->getOperand(2);
+  MachineOperand SrcBase  = earlyUseOperand(MI->getOperand(2));
   uint64_t       SrcDisp  = MI->getOperand(3).getImm();
   uint64_t       Length   = MI->getOperand(4).getImm();
 
-  BuildMI(*MBB, MI, DL, TII->get(SystemZ::MVC))
-    .addOperand(DestBase).addImm(DestDisp).addImm(Length)
-    .addOperand(SrcBase).addImm(SrcDisp);
+  // When generating more than one CLC, all but the last will need to
+  // branch to the end when a difference is found.
+  MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
+                               splitBlockAfter(MI, MBB) : 0);
+
+  // Check for the loop form, in which operand 5 is the trip count.
+  if (MI->getNumExplicitOperands() > 5) {
+    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+
+    uint64_t StartCountReg = MI->getOperand(5).getReg();
+    uint64_t StartSrcReg   = forceReg(MI, SrcBase, TII);
+    uint64_t StartDestReg  = (HaveSingleBase ? StartSrcReg :
+                              forceReg(MI, DestBase, TII));
+
+    const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
+    uint64_t ThisSrcReg  = MRI.createVirtualRegister(RC);
+    uint64_t ThisDestReg = (HaveSingleBase ? ThisSrcReg :
+                            MRI.createVirtualRegister(RC));
+    uint64_t NextSrcReg  = MRI.createVirtualRegister(RC);
+    uint64_t NextDestReg = (HaveSingleBase ? NextSrcReg :
+                            MRI.createVirtualRegister(RC));
+
+    RC = &SystemZ::GR64BitRegClass;
+    uint64_t ThisCountReg = MRI.createVirtualRegister(RC);
+    uint64_t NextCountReg = MRI.createVirtualRegister(RC);
+
+    MachineBasicBlock *StartMBB = MBB;
+    MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+    MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+    MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB);
+
+    //  StartMBB:
+    //   # fall through to LoopMMB
+    MBB->addSuccessor(LoopMBB);
+
+    //  LoopMBB:
+    //   %ThisDestReg = phi [ %StartDestReg, StartMBB ],
+    //                      [ %NextDestReg, NextMBB ]
+    //   %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
+    //                     [ %NextSrcReg, NextMBB ]
+    //   %ThisCountReg = phi [ %StartCountReg, StartMBB ],
+    //                       [ %NextCountReg, NextMBB ]
+    //   ( PFD 2, 768+DestDisp(%ThisDestReg) )
+    //   Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
+    //   ( JLH EndMBB )
+    //
+    // The prefetch is used only for MVC.  The JLH is used only for CLC.
+    MBB = LoopMBB;
+
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
+      .addReg(StartDestReg).addMBB(StartMBB)
+      .addReg(NextDestReg).addMBB(NextMBB);
+    if (!HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
+        .addReg(StartSrcReg).addMBB(StartMBB)
+        .addReg(NextSrcReg).addMBB(NextMBB);
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
+      .addReg(StartCountReg).addMBB(StartMBB)
+      .addReg(NextCountReg).addMBB(NextMBB);
+    if (Opcode == SystemZ::MVC)
+      BuildMI(MBB, DL, TII->get(SystemZ::PFD))
+        .addImm(SystemZ::PFD_WRITE)
+        .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
+    BuildMI(MBB, DL, TII->get(Opcode))
+      .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
+      .addReg(ThisSrcReg).addImm(SrcDisp);
+    if (EndMBB) {
+      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+        .addMBB(EndMBB);
+      MBB->addSuccessor(EndMBB);
+      MBB->addSuccessor(NextMBB);
+    }
+
+    // NextMBB:
+    //   %NextDestReg = LA 256(%ThisDestReg)
+    //   %NextSrcReg = LA 256(%ThisSrcReg)
+    //   %NextCountReg = AGHI %ThisCountReg, -1
+    //   CGHI %NextCountReg, 0
+    //   JLH LoopMBB
+    //   # fall through to DoneMMB
+    //
+    // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
+    MBB = NextMBB;
+
+    BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
+      .addReg(ThisDestReg).addImm(256).addReg(0);
+    if (!HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
+        .addReg(ThisSrcReg).addImm(256).addReg(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
+      .addReg(ThisCountReg).addImm(-1);
+    BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+      .addReg(NextCountReg).addImm(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+      .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+      .addMBB(LoopMBB);
+    MBB->addSuccessor(LoopMBB);
+    MBB->addSuccessor(DoneMBB);
+
+    DestBase = MachineOperand::CreateReg(NextDestReg, false);
+    SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+    Length &= 255;
+    MBB = DoneMBB;
+  }
+  // Handle any remaining bytes with straight-line code.
+  while (Length > 0) {
+    uint64_t ThisLength = std::min(Length, uint64_t(256));
+    // The previous iteration might have created out-of-range displacements.
+    // Apply them using LAY if so.
+    if (!isUInt<12>(DestDisp)) {
+      unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+        .addOperand(DestBase).addImm(DestDisp).addReg(0);
+      DestBase = MachineOperand::CreateReg(Reg, false);
+      DestDisp = 0;
+    }
+    if (!isUInt<12>(SrcDisp)) {
+      unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+        .addOperand(SrcBase).addImm(SrcDisp).addReg(0);
+      SrcBase = MachineOperand::CreateReg(Reg, false);
+      SrcDisp = 0;
+    }
+    BuildMI(*MBB, MI, DL, TII->get(Opcode))
+      .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength)
+      .addOperand(SrcBase).addImm(SrcDisp);
+    DestDisp += ThisLength;
+    SrcDisp += ThisLength;
+    Length -= ThisLength;
+    // If there's another CLC to go, branch to the end if a difference
+    // was found.
+    if (EndMBB && Length > 0) {
+      MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB);
+      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+        .addMBB(EndMBB);
+      MBB->addSuccessor(EndMBB);
+      MBB->addSuccessor(NextMBB);
+      MBB = NextMBB;
+    }
+  }
+  if (EndMBB) {
+    MBB->addSuccessor(EndMBB);
+    MBB = EndMBB;
+    MBB->addLiveIn(SystemZ::CC);
+  }
 
   MI->eraseFromParent();
   return MBB;
 }
 
+// Decompose string pseudo-instruction MI into a loop that continually performs
+// Opcode until CC != 3.
+MachineBasicBlock *
+SystemZTargetLowering::emitStringWrapper(MachineInstr *MI,
+                                         MachineBasicBlock *MBB,
+                                         unsigned Opcode) const {
+  const SystemZInstrInfo *TII = TM.getInstrInfo();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  uint64_t End1Reg   = MI->getOperand(0).getReg();
+  uint64_t Start1Reg = MI->getOperand(1).getReg();
+  uint64_t Start2Reg = MI->getOperand(2).getReg();
+  uint64_t CharReg   = MI->getOperand(3).getReg();
+
+  const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
+  uint64_t This1Reg = MRI.createVirtualRegister(RC);
+  uint64_t This2Reg = MRI.createVirtualRegister(RC);
+  uint64_t End2Reg  = MRI.createVirtualRegister(RC);
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   # fall through to LoopMMB
+  MBB->addSuccessor(LoopMBB);
+
+  //  LoopMBB:
+  //   %This1Reg = phi [ %Start1Reg, StartMBB ], [ %End1Reg, LoopMBB ]
+  //   %This2Reg = phi [ %Start2Reg, StartMBB ], [ %End2Reg, LoopMBB ]
+  //   R0L = %CharReg
+  //   %End1Reg, %End2Reg = CLST %This1Reg, %This2Reg -- uses R0L
+  //   JO LoopMBB
+  //   # fall through to DoneMMB
+  //
+  // The load of R0L can be hoisted by post-RA LICM.
+  MBB = LoopMBB;
+
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This1Reg)
+    .addReg(Start1Reg).addMBB(StartMBB)
+    .addReg(End1Reg).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This2Reg)
+    .addReg(Start2Reg).addMBB(StartMBB)
+    .addReg(End2Reg).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(TargetOpcode::COPY), SystemZ::R0L).addReg(CharReg);
+  BuildMI(MBB, DL, TII->get(Opcode))
+    .addReg(End1Reg, RegState::Define).addReg(End2Reg, RegState::Define)
+    .addReg(This1Reg).addReg(This2Reg);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ANY).addImm(SystemZ::CCMASK_3).addMBB(LoopMBB);
+  MBB->addSuccessor(LoopMBB);
+  MBB->addSuccessor(DoneMBB);
+
+  DoneMBB->addLiveIn(SystemZ::CC);
+
+  MI->eraseFromParent();
+  return DoneMBB;
+}
+
 MachineBasicBlock *SystemZTargetLowering::
 EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
   switch (MI->getOpcode()) {
+  case SystemZ::Select32Mux:
   case SystemZ::Select32:
   case SystemZ::SelectF32:
   case SystemZ::Select64:
@@ -2268,18 +3012,14 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
   case SystemZ::SelectF128:
     return emitSelect(MI, MBB);
 
-  case SystemZ::CondStore8_32:
-    return emitCondStore(MI, MBB, SystemZ::STC32, 0, false);
-  case SystemZ::CondStore8_32Inv:
-    return emitCondStore(MI, MBB, SystemZ::STC32, 0, true);
-  case SystemZ::CondStore16_32:
-    return emitCondStore(MI, MBB, SystemZ::STH32, 0, false);
-  case SystemZ::CondStore16_32Inv:
-    return emitCondStore(MI, MBB, SystemZ::STH32, 0, true);
-  case SystemZ::CondStore32_32:
-    return emitCondStore(MI, MBB, SystemZ::ST32, SystemZ::STOC32, false);
-  case SystemZ::CondStore32_32Inv:
-    return emitCondStore(MI, MBB, SystemZ::ST32, SystemZ::STOC32, true);
+  case SystemZ::CondStore8Mux:
+    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
+  case SystemZ::CondStore8MuxInv:
+    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, true);
+  case SystemZ::CondStore16Mux:
+    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, false);
+  case SystemZ::CondStore16MuxInv:
+    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, true);
   case SystemZ::CondStore8:
     return emitCondStore(MI, MBB, SystemZ::STC, 0, false);
   case SystemZ::CondStore8Inv:
@@ -2306,11 +3046,11 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
     return emitCondStore(MI, MBB, SystemZ::STD, 0, true);
 
   case SystemZ::AEXT128_64:
-    return emitExt128(MI, MBB, false, SystemZ::subreg_low);
+    return emitExt128(MI, MBB, false, SystemZ::subreg_l64);
   case SystemZ::ZEXT128_32:
-    return emitExt128(MI, MBB, true, SystemZ::subreg_low32);
+    return emitExt128(MI, MBB, true, SystemZ::subreg_l32);
   case SystemZ::ZEXT128_64:
-    return emitExt128(MI, MBB, true, SystemZ::subreg_low);
+    return emitExt128(MI, MBB, true, SystemZ::subreg_l64);
 
   case SystemZ::ATOMIC_SWAPW:
     return emitAtomicLoadBinary(MI, MBB, 0, 0);
@@ -2346,98 +3086,98 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
   case SystemZ::ATOMIC_LOADW_NR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
   case SystemZ::ATOMIC_LOADW_NILH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 0);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0);
   case SystemZ::ATOMIC_LOAD_NR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
-  case SystemZ::ATOMIC_LOAD_NILL32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL32, 32);
-  case SystemZ::ATOMIC_LOAD_NILH32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 32);
-  case SystemZ::ATOMIC_LOAD_NILF32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF32, 32);
-  case SystemZ::ATOMIC_LOAD_NGR:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
   case SystemZ::ATOMIC_LOAD_NILL:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32);
   case SystemZ::ATOMIC_LOAD_NILH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 64);
-  case SystemZ::ATOMIC_LOAD_NIHL:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL, 64);
-  case SystemZ::ATOMIC_LOAD_NIHH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32);
   case SystemZ::ATOMIC_LOAD_NILF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 64);
-  case SystemZ::ATOMIC_LOAD_NIHF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32);
+  case SystemZ::ATOMIC_LOAD_NGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
+  case SystemZ::ATOMIC_LOAD_NILL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64);
+  case SystemZ::ATOMIC_LOAD_NILH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64);
+  case SystemZ::ATOMIC_LOAD_NILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64);
 
   case SystemZ::ATOMIC_LOADW_OR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
   case SystemZ::ATOMIC_LOADW_OILH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH32, 0);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0);
   case SystemZ::ATOMIC_LOAD_OR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
-  case SystemZ::ATOMIC_LOAD_OILL32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL32, 32);
-  case SystemZ::ATOMIC_LOAD_OILH32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH32, 32);
-  case SystemZ::ATOMIC_LOAD_OILF32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF32, 32);
-  case SystemZ::ATOMIC_LOAD_OGR:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
   case SystemZ::ATOMIC_LOAD_OILL:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32);
   case SystemZ::ATOMIC_LOAD_OILH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 64);
-  case SystemZ::ATOMIC_LOAD_OIHL:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL, 64);
-  case SystemZ::ATOMIC_LOAD_OIHH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32);
   case SystemZ::ATOMIC_LOAD_OILF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 64);
-  case SystemZ::ATOMIC_LOAD_OIHF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32);
+  case SystemZ::ATOMIC_LOAD_OGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
+  case SystemZ::ATOMIC_LOAD_OILL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64);
+  case SystemZ::ATOMIC_LOAD_OILH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64);
+  case SystemZ::ATOMIC_LOAD_OILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64);
 
   case SystemZ::ATOMIC_LOADW_XR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
   case SystemZ::ATOMIC_LOADW_XILF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF32, 0);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0);
   case SystemZ::ATOMIC_LOAD_XR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
-  case SystemZ::ATOMIC_LOAD_XILF32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF32, 32);
+  case SystemZ::ATOMIC_LOAD_XILF:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32);
   case SystemZ::ATOMIC_LOAD_XGR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
-  case SystemZ::ATOMIC_LOAD_XILF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 64);
-  case SystemZ::ATOMIC_LOAD_XIHF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF, 64);
+  case SystemZ::ATOMIC_LOAD_XILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64);
+  case SystemZ::ATOMIC_LOAD_XIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64);
 
   case SystemZ::ATOMIC_LOADW_NRi:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
   case SystemZ::ATOMIC_LOADW_NILHi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 0, true);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true);
   case SystemZ::ATOMIC_LOAD_NRi:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
-  case SystemZ::ATOMIC_LOAD_NILL32i:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL32, 32, true);
-  case SystemZ::ATOMIC_LOAD_NILH32i:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 32, true);
-  case SystemZ::ATOMIC_LOAD_NILF32i:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF32, 32, true);
-  case SystemZ::ATOMIC_LOAD_NGRi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
   case SystemZ::ATOMIC_LOAD_NILLi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 64, true);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true);
   case SystemZ::ATOMIC_LOAD_NILHi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 64, true);
-  case SystemZ::ATOMIC_LOAD_NIHLi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL, 64, true);
-  case SystemZ::ATOMIC_LOAD_NIHHi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH, 64, true);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true);
   case SystemZ::ATOMIC_LOAD_NILFi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 64, true);
-  case SystemZ::ATOMIC_LOAD_NIHFi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF, 64, true);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true);
+  case SystemZ::ATOMIC_LOAD_NGRi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILL64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILH64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHL64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHH64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILF64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHF64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true);
 
   case SystemZ::ATOMIC_LOADW_MIN:
     return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
@@ -2481,8 +3221,27 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
 
   case SystemZ::ATOMIC_CMP_SWAPW:
     return emitAtomicCmpSwapW(MI, MBB);
-  case SystemZ::MVCWrapper:
-    return emitMVCWrapper(MI, MBB);
+  case SystemZ::MVCSequence:
+  case SystemZ::MVCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
+  case SystemZ::NCSequence:
+  case SystemZ::NCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::NC);
+  case SystemZ::OCSequence:
+  case SystemZ::OCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::OC);
+  case SystemZ::XCSequence:
+  case SystemZ::XCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::XC);
+  case SystemZ::CLCSequence:
+  case SystemZ::CLCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
+  case SystemZ::CLSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::CLST);
+  case SystemZ::MVSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::MVST);
+  case SystemZ::SRSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::SRST);
   default:
     llvm_unreachable("Unexpected instr type to insert");
   }
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index c0dbe49..c6dcca6 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -32,17 +32,32 @@ namespace SystemZISD {
     // is the target address.  The arguments start at operand 2.
     // There is an optional glue operand at the end.
     CALL,
+    SIBCALL,
 
     // Wraps a TargetGlobalAddress that should be loaded using PC-relative
     // accesses (LARL).  Operand 0 is the address.
     PCREL_WRAPPER,
 
-    // Signed integer and floating-point comparisons.  The operands are the
-    // two values to compare.
-    CMP,
+    // Used in cases where an offset is applied to a TargetGlobalAddress.
+    // Operand 0 is the full TargetGlobalAddress and operand 1 is a
+    // PCREL_WRAPPER for an anchor point.  This is used so that we can
+    // cheaply refer to either the full address or the anchor point
+    // as a register base.
+    PCREL_OFFSET,
 
-    // Likewise unsigned integer comparison.
-    UCMP,
+    // Integer comparisons.  There are three operands: the two values
+    // to compare, and an integer of type SystemZICMP.
+    ICMP,
+
+    // Floating-point comparisons.  The two operands are the values to compare.
+    FCMP,
+
+    // Test under mask.  The first operand is ANDed with the second operand
+    // and the condition codes are set on the result.  The third operand is
+    // a boolean that is true if the condition codes need to distinguish
+    // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the
+    // register forms do but the memory forms don't).
+    TM,
 
     // Branches if a condition is true.  Operand 0 is the chain operand;
     // operand 1 is the 4-bit condition-code mask, with bit N in
@@ -73,13 +88,50 @@ namespace SystemZISD {
     UDIVREM32,
     UDIVREM64,
 
-    // Use MVC to copy bytes from one memory location to another.
-    // The first operand is the target address, the second operand is the
-    // source address, and the third operand is the constant length.
+    // Use a series of MVCs to copy bytes from one memory location to another.
+    // The operands are:
+    // - the target address
+    // - the source address
+    // - the constant length
+    //
     // This isn't a memory opcode because we'd need to attach two
     // MachineMemOperands rather than one.
     MVC,
 
+    // Like MVC, but implemented as a loop that handles X*256 bytes
+    // followed by straight-line code to handle the rest (if any).
+    // The value of X is passed as an additional operand.
+    MVC_LOOP,
+
+    // Similar to MVC and MVC_LOOP, but for logic operations (AND, OR, XOR).
+    NC,
+    NC_LOOP,
+    OC,
+    OC_LOOP,
+    XC,
+    XC_LOOP,
+
+    // Use CLC to compare two blocks of memory, with the same comments
+    // as for MVC and MVC_LOOP.
+    CLC,
+    CLC_LOOP,
+
+    // Use an MVST-based sequence to implement stpcpy().
+    STPCPY,
+
+    // Use a CLST-based sequence to implement strcmp().  The two input operands
+    // are the addresses of the strings to compare.
+    STRCMP,
+
+    // Use an SRST-based sequence to search a block of memory.  The first
+    // operand is the end address, the second is the start, and the third
+    // is the character to search for.  CC is set to 1 on success and 2
+    // on failure.
+    SEARCH_STRING,
+
+    // Store the CC value in bits 29 and 28 of an integer.
+    IPM,
+
     // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
     // ATOMIC_LOAD_<op>.
     //
@@ -111,7 +163,27 @@ namespace SystemZISD {
     //            operand into the high bits
     // Operand 4: the negative of operand 2, for rotating the other way
     // Operand 5: the width of the field in bits (8 or 16)
-    ATOMIC_CMP_SWAPW
+    ATOMIC_CMP_SWAPW,
+
+    // Prefetch from the second operand using the 4-bit control code in
+    // the first operand.  The code is 1 for a load prefetch and 2 for
+    // a store prefetch.
+    PREFETCH
+  };
+
+  // Return true if OPCODE is some kind of PC-relative address.
+  inline bool isPCREL(unsigned Opcode) {
+    return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET;
+  }
+}
+
+namespace SystemZICMP {
+  // Describes whether an integer comparison needs to be signed or unsigned,
+  // or whether either type is OK.
+  enum {
+    Any,
+    UnsignedOnly,
+    SignedOnly
   };
 }
 
@@ -126,15 +198,15 @@ public:
   virtual MVT getScalarShiftAmountTy(EVT LHSTy) const LLVM_OVERRIDE {
     return MVT::i32;
   }
-  virtual EVT getSetCCResultType(LLVMContext &, EVT) const LLVM_OVERRIDE {
-    return MVT::i32;
-  }
+  virtual EVT getSetCCResultType(LLVMContext &, EVT) const LLVM_OVERRIDE;
   virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const LLVM_OVERRIDE;
   virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const LLVM_OVERRIDE;
   virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const
      LLVM_OVERRIDE;
   virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const
     LLVM_OVERRIDE;
+  virtual bool isTruncateFree(Type *, Type *) const LLVM_OVERRIDE;
+  virtual bool isTruncateFree(EVT, EVT) const LLVM_OVERRIDE;
   virtual const char *getTargetNodeName(unsigned Opcode) const LLVM_OVERRIDE;
   virtual std::pair<unsigned, const TargetRegisterClass *>
     getRegForInlineAsmConstraint(const std::string &Constraint,
@@ -154,6 +226,8 @@ public:
                                 MachineBasicBlock *BB) const LLVM_OVERRIDE;
   virtual SDValue LowerOperation(SDValue Op,
                                  SelectionDAG &DAG) const LLVM_OVERRIDE;
+  virtual bool allowTruncateForTailCall(Type *, Type *) const LLVM_OVERRIDE;
+  virtual bool mayBeEmittedAsTailCall(CallInst *CI) const LLVM_OVERRIDE;
   virtual SDValue
     LowerFormalArguments(SDValue Chain,
                          CallingConv::ID CallConv, bool isVarArg,
@@ -176,6 +250,7 @@ private:
   const SystemZTargetMachine &TM;
 
   // Implement LowerOperation for individual opcodes.
+  SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGlobalAddress(GlobalAddressSDNode *Node,
@@ -189,6 +264,7 @@ private:
   SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
@@ -199,6 +275,7 @@ private:
   SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
 
   // If the last instruction before MBBI in MBB was some form of COMPARE,
   // try to replace it with a COMPARE AND BRANCH just before MBBI.
@@ -230,8 +307,12 @@ private:
                                           unsigned BitSize) const;
   MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr *MI,
                                         MachineBasicBlock *BB) const;
-  MachineBasicBlock *emitMVCWrapper(MachineInstr *MI,
-                                    MachineBasicBlock *BB) const;
+  MachineBasicBlock *emitMemMemWrapper(MachineInstr *MI,
+                                       MachineBasicBlock *BB,
+                                       unsigned Opcode) const;
+  MachineBasicBlock *emitStringWrapper(MachineInstr *MI,
+                                       MachineBasicBlock *BB,
+                                       unsigned Opcode) const;
 };
 } // end namespace llvm
 
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index b903b51..6080046 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -27,9 +27,9 @@ defm CondStoreF64 : CondStores<FP64, nonvolatile_store,
 
 // Load zero.
 let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
-  def LZER : InherentRRE<"lze", 0xB374, FP32,  (fpimm0)>;
-  def LZDR : InherentRRE<"lzd", 0xB375, FP64,  (fpimm0)>;
-  def LZXR : InherentRRE<"lzx", 0xB376, FP128, (fpimm0)>;
+  def LZER : InherentRRE<"lzer", 0xB374, FP32,  (fpimm0)>;
+  def LZDR : InherentRRE<"lzdr", 0xB375, FP64,  (fpimm0)>;
+  def LZXR : InherentRRE<"lzxr", 0xB376, FP128, (fpimm0)>;
 }
 
 // Moves between two floating-point registers.
@@ -62,7 +62,7 @@ let isCodeGenOnly = 1 in {
 
 // The sign of an FP128 is in the high register.
 def : Pat<(fcopysign FP32:$src1, FP128:$src2),
-          (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_high))>;
+          (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
 
 // fcopysign with an FP64 result.
 let isCodeGenOnly = 1 in
@@ -71,24 +71,24 @@ def CPSDRdd : BinaryRRF<"cpsd", 0xB372, fcopysign, FP64, FP64>;
 
 // The sign of an FP128 is in the high register.
 def : Pat<(fcopysign FP64:$src1, FP128:$src2),
-          (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_high))>;
+          (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
 
 // fcopysign with an FP128 result.  Use "upper" as the high half and leave
 // the low half as-is.
 class CopySign128<RegisterOperand cls, dag upper>
   : Pat<(fcopysign FP128:$src1, cls:$src2),
-        (INSERT_SUBREG FP128:$src1, upper, subreg_high)>;
+        (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>;
 
-def : CopySign128<FP32,  (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_high),
+def : CopySign128<FP32,  (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64),
                                   FP32:$src2)>;
-def : CopySign128<FP64,  (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_high),
+def : CopySign128<FP64,  (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
                                   FP64:$src2)>;
-def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_high),
-                                  (EXTRACT_SUBREG FP128:$src2, subreg_high))>;
+def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+                                  (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
 
-defm LoadStoreF32  : MVCLoadStore<load, store, f32,  MVCWrapper, 4>;
-defm LoadStoreF64  : MVCLoadStore<load, store, f64,  MVCWrapper, 8>;
-defm LoadStoreF128 : MVCLoadStore<load, store, f128, MVCWrapper, 16>;
+defm LoadStoreF32  : MVCLoadStore<load, f32,  MVCSequence, 4>;
+defm LoadStoreF64  : MVCLoadStore<load, f64,  MVCSequence, 8>;
+defm LoadStoreF128 : MVCLoadStore<load, f128, MVCSequence, 16>;
 
 //===----------------------------------------------------------------------===//
 // Load instructions
@@ -134,9 +134,9 @@ def LEXBR : UnaryRRE<"lexb", 0xB346, null_frag, FP128, FP128>;
 def LDXBR : UnaryRRE<"ldxb", 0xB345, null_frag, FP128, FP128>;
 
 def : Pat<(f32 (fround FP128:$src)),
-          (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_32bit)>;
+          (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hh32)>;
 def : Pat<(f64 (fround FP128:$src)),
-          (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_high)>;
+          (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
 
 // Extend register floating-point values to wider representations.
 def LDEBR : UnaryRRE<"ldeb", 0xB304, fextend, FP64,  FP32>;
@@ -212,21 +212,56 @@ def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<fsqrt>, FP32, 4>;
 def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<fsqrt>, FP64, 8>;
 
 // Round to an integer, with the second operand (modifier M3) specifying
-// the rounding mode.
-//
-// These forms always check for inexact conditions.  z196 added versions
-// that allow this to suppressed (as for fnearbyint), but we don't yet
-// support -march=z196.
+// the rounding mode.  These forms always check for inexact conditions.
 def FIEBR : UnaryRRF<"fieb", 0xB357, FP32,  FP32>;
 def FIDBR : UnaryRRF<"fidb", 0xB35F, FP64,  FP64>;
 def FIXBR : UnaryRRF<"fixb", 0xB347, FP128, FP128>;
 
+// Extended forms of the previous three instructions.  M4 can be set to 4
+// to suppress detection of inexact conditions.
+def FIEBRA : UnaryRRF4<"fiebra", 0xB357, FP32,  FP32>,
+             Requires<[FeatureFPExtension]>;
+def FIDBRA : UnaryRRF4<"fidbra", 0xB35F, FP64,  FP64>,
+             Requires<[FeatureFPExtension]>;
+def FIXBRA : UnaryRRF4<"fixbra", 0xB347, FP128, FP128>,
+             Requires<[FeatureFPExtension]>;
+
 // frint rounds according to the current mode (modifier 0) and detects
 // inexact conditions.
 def : Pat<(frint FP32:$src),  (FIEBR 0, FP32:$src)>;
 def : Pat<(frint FP64:$src),  (FIDBR 0, FP64:$src)>;
 def : Pat<(frint FP128:$src), (FIXBR 0, FP128:$src)>;
 
+let Predicates = [FeatureFPExtension] in {
+  // fnearbyint is like frint but does not detect inexact conditions.
+  def : Pat<(fnearbyint FP32:$src),  (FIEBRA 0, FP32:$src,  4)>;
+  def : Pat<(fnearbyint FP64:$src),  (FIDBRA 0, FP64:$src,  4)>;
+  def : Pat<(fnearbyint FP128:$src), (FIXBRA 0, FP128:$src, 4)>;
+
+  // floor is no longer allowed to raise an inexact condition,
+  // so restrict it to the cases where the condition can be suppressed.
+  // Mode 7 is round towards -inf.
+  def : Pat<(ffloor FP32:$src),  (FIEBRA 7, FP32:$src,  4)>;
+  def : Pat<(ffloor FP64:$src),  (FIDBRA 7, FP64:$src,  4)>;
+  def : Pat<(ffloor FP128:$src), (FIXBRA 7, FP128:$src, 4)>;
+
+  // Same idea for ceil, where mode 6 is round towards +inf.
+  def : Pat<(fceil FP32:$src),  (FIEBRA 6, FP32:$src,  4)>;
+  def : Pat<(fceil FP64:$src),  (FIDBRA 6, FP64:$src,  4)>;
+  def : Pat<(fceil FP128:$src), (FIXBRA 6, FP128:$src, 4)>;
+
+  // Same idea for trunc, where mode 5 is round towards zero.
+  def : Pat<(ftrunc FP32:$src),  (FIEBRA 5, FP32:$src,  4)>;
+  def : Pat<(ftrunc FP64:$src),  (FIDBRA 5, FP64:$src,  4)>;
+  def : Pat<(ftrunc FP128:$src), (FIXBRA 5, FP128:$src, 4)>;
+
+  // Same idea for round, where mode 1 is round towards nearest with
+  // ties away from zero.
+  def : Pat<(frnd FP32:$src),  (FIEBRA 1, FP32:$src,  4)>;
+  def : Pat<(frnd FP64:$src),  (FIDBRA 1, FP64:$src,  4)>;
+  def : Pat<(frnd FP128:$src), (FIXBRA 1, FP128:$src, 4)>;
+}
+
 //===----------------------------------------------------------------------===//
 // Binary arithmetic
 //===----------------------------------------------------------------------===//
@@ -265,26 +300,26 @@ def MDB  : BinaryRXE<"mdb",  0xED1C, fmul, FP64, load, 8>;
 def MDEBR : BinaryRRE<"mdeb", 0xB30C, null_frag, FP64, FP32>;
 def : Pat<(fmul (f64 (fextend FP32:$src1)), (f64 (fextend FP32:$src2))),
           (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                FP32:$src1, subreg_32bit), FP32:$src2)>;
+                                FP32:$src1, subreg_h32), FP32:$src2)>;
 
 // f64 multiplication of an FP32 register and an f32 memory.
 def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
 def : Pat<(fmul (f64 (fextend FP32:$src1)),
                 (f64 (extloadf32 bdxaddr12only:$addr))),
-          (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_32bit),
+          (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32),
                 bdxaddr12only:$addr)>;
 
 // f128 multiplication of two FP64 registers.
 def MXDBR : BinaryRRE<"mxdb", 0xB307, null_frag, FP128, FP64>;
 def : Pat<(fmul (f128 (fextend FP64:$src1)), (f128 (fextend FP64:$src2))),
           (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
-                                FP64:$src1, subreg_high), FP64:$src2)>;
+                                FP64:$src1, subreg_h64), FP64:$src2)>;
 
 // f128 multiplication of an FP64 register and an f64 memory.
 def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
 def : Pat<(fmul (f128 (fextend FP64:$src1)),
                 (f128 (extloadf64 bdxaddr12only:$addr))),
-          (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_high),
+          (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
                 bdxaddr12only:$addr)>;
 
 // Fused multiply-add.
@@ -314,12 +349,12 @@ def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
 //===----------------------------------------------------------------------===//
 
 let Defs = [CC], CCValues = 0xF in {
-  def CEBR : CompareRRE<"ceb", 0xB309, z_cmp, FP32,  FP32>;
-  def CDBR : CompareRRE<"cdb", 0xB319, z_cmp, FP64,  FP64>;
-  def CXBR : CompareRRE<"cxb", 0xB349, z_cmp, FP128, FP128>;
+  def CEBR : CompareRRE<"ceb", 0xB309, z_fcmp, FP32,  FP32>;
+  def CDBR : CompareRRE<"cdb", 0xB319, z_fcmp, FP64,  FP64>;
+  def CXBR : CompareRRE<"cxb", 0xB349, z_fcmp, FP128, FP128>;
 
-  def CEB : CompareRXE<"ceb", 0xED09, z_cmp, FP32, load, 4>;
-  def CDB : CompareRXE<"cdb", 0xED19, z_cmp, FP64, load, 8>;
+  def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>;
+  def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 954df11..a8efe16 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -308,10 +308,11 @@ class InstRRF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   bits<4> R1;
   bits<4> R2;
   bits<4> R3;
+  bits<4> R4;
 
   let Inst{31-16} = op;
   let Inst{15-12} = R3;
-  let Inst{11-8}  = 0;
+  let Inst{11-8}  = R4;
   let Inst{7-4}   = R1;
   let Inst{3-0}   = R2;
 }
@@ -539,6 +540,10 @@ class InstSS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 //     One output operand and five input operands.  The first two operands
 //     are registers and the other three are immediates.
 //
+//   Prefetch:
+//     One 4-bit immediate operand and one address operand.  The immediate
+//     operand is 1 for a load prefetch and 2 for a store prefetch.
+//
 // The format determines which input operands are tied to output operands,
 // and also determines the shape of any address operand.
 //
@@ -552,7 +557,7 @@ class InstSS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
                   dag src>
   : InstRRE<opcode, (outs cls:$R1), (ins),
-            mnemonic#"r\t$R1",
+            mnemonic#"\t$R1",
             [(set cls:$R1, src)]> {
   let R2 = 0;
 }
@@ -626,27 +631,33 @@ class StoreMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
   let mayStore = 1;
 }
 
+// StoreSI* instructions are used to store an integer to memory, but the
+// addresses are more restricted than for normal stores.  If we are in the
+// situation of having to force either the address into a register or the
+// constant into a register, it's usually better to do the latter.
+// We therefore match the address in the same way as a normal store and
+// only use the StoreSI* instruction if the matched address is suitable.
 class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
-              Immediate imm, AddressingMode mode = bdaddr12only>
-  : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+              Immediate imm>
+  : InstSI<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
            mnemonic#"\t$BD1, $I2",
-           [(operator imm:$I2, mode:$BD1)]> {
+           [(operator imm:$I2, mviaddr12pair:$BD1)]> {
   let mayStore = 1;
 }
 
 class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-               Immediate imm, AddressingMode mode = bdaddr20only>
-  : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+               Immediate imm>
+  : InstSIY<opcode, (outs), (ins mviaddr20pair:$BD1, imm:$I2),
             mnemonic#"\t$BD1, $I2",
-            [(operator imm:$I2, mode:$BD1)]> {
+            [(operator imm:$I2, mviaddr20pair:$BD1)]> {
   let mayStore = 1;
 }
 
 class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                Immediate imm>
-  : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+  : InstSIL<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
             mnemonic#"\t$BD1, $I2",
-            [(operator imm:$I2, bdaddr12only:$BD1)]> {
+            [(operator imm:$I2, mviaddr12pair:$BD1)]> {
   let mayStore = 1;
 }
 
@@ -654,9 +665,9 @@ multiclass StoreSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
                        SDPatternOperator operator, Immediate imm> {
   let DispKey = mnemonic in {
     let DispSize = "12" in
-      def "" : StoreSI<mnemonic, siOpcode, operator, imm, bdaddr12pair>;
+      def "" : StoreSI<mnemonic, siOpcode, operator, imm>;
     let DispSize = "20" in
-      def Y  : StoreSIY<mnemonic#"y", siyOpcode, operator, imm, bdaddr20pair>;
+      def Y  : StoreSIY<mnemonic#"y", siyOpcode, operator, imm>;
   }
 }
 
@@ -719,8 +730,14 @@ class UnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
             mnemonic#"r\t$R1, $R3, $R2", []> {
   let OpKey = mnemonic ## cls1;
   let OpType = "reg";
+  let R4 = 0;
 }
 
+class UnaryRRF4<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins uimm8zx4:$R3, cls2:$R2, uimm8zx4:$R4),
+            mnemonic#"\t$R1, $R3, $R2, $R4", []>;
+
 // These instructions are generated by if conversion.  The old value of R1
 // is added as an implicit use.
 class CondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
@@ -729,6 +746,7 @@ class CondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
             mnemonic#"r$R3\t$R1, $R2", []>,
     Requires<[FeatureLoadStoreOnCond]> {
   let CCMaskLast = 1;
+  let R4 = 0;
 }
 
 // Like CondUnaryRRF, but used for the raw assembly form.  The condition-code
@@ -740,6 +758,7 @@ class AsmCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
     Requires<[FeatureLoadStoreOnCond]> {
   let Constraints = "$R1 = $R1src";
   let DisableEncoding = "$R1src";
+  let R4 = 0;
 }
 
 // Like CondUnaryRRF, but with a fixed CC mask.
@@ -751,6 +770,7 @@ class FixedCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
   let Constraints = "$R1 = $R1src";
   let DisableEncoding = "$R1src";
   let R3 = ccmask;
+  let R4 = 0;
 }
 
 class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
@@ -898,13 +918,16 @@ class BinaryRRF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
             [(set cls1:$R1, (operator cls1:$R3, cls2:$R2))]> {
   let OpKey = mnemonic ## cls1;
   let OpType = "reg";
+  let R4 = 0;
 }
 
 class BinaryRRFK<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                  RegisterOperand cls1, RegisterOperand cls2>
   : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R2, cls2:$R3),
             mnemonic#"rk\t$R1, $R2, $R3",
-            [(set cls1:$R1, (operator cls1:$R2, cls2:$R3))]>;
+            [(set cls1:$R1, (operator cls1:$R2, cls2:$R3))]> {
+  let R4 = 0;
+}
 
 multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
                         SDPatternOperator operator, RegisterOperand cls1,
@@ -1285,6 +1308,22 @@ class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
   let DisableEncoding = "$R1src";
 }
 
+class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator>
+  : InstRXY<opcode, (outs), (ins uimm8zx4:$R1, bdxaddr20only:$XBD2),
+            mnemonic##"\t$R1, $XBD2",
+            [(operator uimm8zx4:$R1, bdxaddr20only:$XBD2)]>;
+
+class PrefetchRILPC<string mnemonic, bits<12> opcode,
+                    SDPatternOperator operator>
+  : InstRIL<opcode, (outs), (ins uimm8zx4:$R1, pcrel32:$I2),
+            mnemonic##"\t$R1, $I2",
+            [(operator uimm8zx4:$R1, pcrel32:$I2)]> {
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
 // A floating-point load-and test operation.  Create both a normal unary
 // operation and one that acts as a comparison against zero.
 multiclass LoadAndTestRRE<string mnemonic, bits<16> opcode,
@@ -1310,6 +1349,100 @@ class Pseudo<dag outs, dag ins, list<dag> pattern>
   let isCodeGenOnly = 1;
 }
 
+// Like UnaryRI, but expanded after RA depending on the choice of register.
+class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                    Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins imm:$I2),
+           [(set cls:$R1, (operator imm:$I2))]>;
+
+// Like UnaryRXY, but expanded after RA depending on the choice of register.
+class UnaryRXYPseudo<string key, SDPatternOperator operator,
+                     RegisterOperand cls, bits<5> bytes,
+                     AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs cls:$R1), (ins mode:$XBD2),
+           [(set cls:$R1, (operator mode:$XBD2))]> {
+  let OpKey = key ## cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like UnaryRR, but expanded after RA depending on the choice of registers.
+class UnaryRRPseudo<string key, SDPatternOperator operator,
+                    RegisterOperand cls1, RegisterOperand cls2>
+  : Pseudo<(outs cls1:$R1), (ins cls2:$R2),
+           [(set cls1:$R1, (operator cls2:$R2))]> {
+  let OpKey = key ## cls1;
+  let OpType = "reg";
+}
+
+// Like BinaryRI, but expanded after RA depending on the choice of register.
+class BinaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2),
+           [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// Like BinaryRIE, but expanded after RA depending on the choice of register.
+class BinaryRIEPseudo<SDPatternOperator operator, RegisterOperand cls,
+                      Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins cls:$R3, imm:$I2),
+           [(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
+
+// Like BinaryRIAndK, but expanded after RA depending on the choice of register.
+multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
+                              RegisterOperand cls, Immediate imm> {
+  let NumOpsKey = key in {
+    let NumOpsValue = "3" in
+      def K : BinaryRIEPseudo<null_frag, cls, imm>,
+              Requires<[FeatureHighWord, FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRIPseudo<operator, cls, imm>,
+               Requires<[FeatureHighWord]>;
+  }
+}
+
+// Like CompareRI, but expanded after RA depending on the choice of register.
+class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                      Immediate imm>
+  : Pseudo<(outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]>;
+
+// Like CompareRXY, but expanded after RA depending on the choice of register.
+class CompareRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+                       SDPatternOperator load, bits<5> bytes,
+                       AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+           [(operator cls:$R1, (load mode:$XBD2))]> {
+  let mayLoad = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like StoreRXY, but expanded after RA depending on the choice of register.
+class StoreRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+                     bits<5> bytes, AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+           [(operator cls:$R1, mode:$XBD2)]> {
+  let mayStore = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like RotateSelectRIEf, but expanded after RA depending on the choice
+// of registers.
+class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
+  : Pseudo<(outs cls1:$R1),
+           (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5),
+           []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
 // Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is
 // the value of the PSW's 2-bit condition code field.
 class SelectWrapper<RegisterOperand cls>
@@ -1386,3 +1519,85 @@ class AtomicLoadWBinaryReg<SDPatternOperator operator>
   : AtomicLoadWBinary<operator, (i32 GR32:$src2), GR32>;
 class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
   : AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
+
+// Define an instruction that operates on two fixed-length blocks of memory,
+// and associated pseudo instructions for operating on blocks of any size.
+// The Sequence form uses a straight-line sequence of instructions and
+// the Loop form uses a loop of length-256 instructions followed by
+// another instruction to handle the excess.
+multiclass MemorySS<string mnemonic, bits<8> opcode,
+                    SDPatternOperator sequence, SDPatternOperator loop> {
+  def "" : InstSS<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
+                                       bdaddr12only:$BD2),
+                  mnemonic##"\t$BDL1, $BD2", []>;
+  let usesCustomInserter = 1 in {
+    def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                       imm64:$length),
+                           [(sequence bdaddr12only:$dest, bdaddr12only:$src,
+                                      imm64:$length)]>;
+    def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                   imm64:$length, GR64:$count256),
+                      [(loop bdaddr12only:$dest, bdaddr12only:$src,
+                             imm64:$length, GR64:$count256)]>;
+  }
+}
+
+// Define an instruction that operates on two strings, both terminated
+// by the character in R0.  The instruction processes a CPU-determinated
+// number of bytes at a time and sets CC to 3 if the instruction needs
+// to be repeated.  Also define a pseudo instruction that represents
+// the full loop (the main instruction plus the branch on CC==3).
+multiclass StringRRE<string mnemonic, bits<16> opcode,
+                     SDPatternOperator operator> {
+  def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
+                   (ins GR64:$R1src, GR64:$R2src),
+                   mnemonic#"\t$R1, $R2", []> {
+    let Constraints = "$R1 = $R1src, $R2 = $R2src";
+    let DisableEncoding = "$R1src, $R2src";
+  }
+  let usesCustomInserter = 1 in
+    def Loop : Pseudo<(outs GR64:$end),
+                      (ins GR64:$start1, GR64:$start2, GR32:$char),
+                      [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
+                                                 GR32:$char))]>;
+}
+
+// A pseudo instruction that is a direct alias of a real instruction.
+// These aliases are used in cases where a particular register operand is
+// fixed or where the same instruction is used with different register sizes.
+// The size parameter is the size in bytes of the associated real instruction.
+class Alias<int size, dag outs, dag ins, list<dag> pattern>
+  : InstSystemZ<size, outs, ins, "", pattern> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+// An alias of a BinaryRI, but with different register sizes.
+class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
+                    Immediate imm>
+  : Alias<4, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+          [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a BinaryRIL, but with different register sizes.
+class BinaryAliasRIL<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Alias<6, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+          [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a CompareRI, but with different register sizes.
+class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Alias<4, (outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+// An alias of a RotateSelectRIEf, but with different register sizes.
+class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
+  : Alias<6, (outs cls1:$R1),
+          (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5), []> {
+  let Constraints = "$R1 = $R1src";
+}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 9ee60aa..acfeed8 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -17,7 +17,7 @@
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #define GET_INSTRMAP_INFO
 #include "SystemZGenInstrInfo.inc"
 
@@ -28,6 +28,18 @@ static uint64_t allOnes(unsigned int Count) {
   return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
 }
 
+// Reg should be a 32-bit GPR.  Return true if it is a high register rather
+// than a low register.
+static bool isHighReg(unsigned int Reg) {
+  if (SystemZ::GRH32BitRegClass.contains(Reg))
+    return true;
+  assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32");
+  return false;
+}
+
+// Pin the vtable to this file.
+void SystemZInstrInfo::anchor() {}
+
 SystemZInstrInfo::SystemZInstrInfo(SystemZTargetMachine &tm)
   : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP),
     RI(tm), TM(tm) {
@@ -48,8 +60,8 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
   // Set up the two 64-bit registers.
   MachineOperand &HighRegOp = EarlierMI->getOperand(0);
   MachineOperand &LowRegOp = MI->getOperand(0);
-  HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_high));
-  LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_low));
+  HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
+  LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64));
 
   // The address in the first (high) instruction is already correct.
   // Adjust the offset in the second (low) instruction.
@@ -82,6 +94,97 @@ void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const {
   OffsetMO.setImm(Offset);
 }
 
+// MI is an RI-style pseudo instruction.  Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32.  ConvertHigh is true if LowOpcode takes a signed operand
+// and HighOpcode takes an unsigned 32-bit operand.  In those cases,
+// MI has the same kind of operand as LowOpcode, so needs to be converted
+// if HighOpcode is used.
+void SystemZInstrInfo::expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                      unsigned HighOpcode,
+                                      bool ConvertHigh) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  bool IsHigh = isHighReg(Reg);
+  MI->setDesc(get(IsHigh ? HighOpcode : LowOpcode));
+  if (IsHigh && ConvertHigh)
+    MI->getOperand(1).setImm(uint32_t(MI->getOperand(1).getImm()));
+}
+
+// MI is a three-operand RIE-style pseudo instruction.  Replace it with
+// LowOpcode3 if the registers are both low GR32s, otherwise use a move
+// followed by HighOpcode or LowOpcode, depending on whether the target
+// is a high or low GR32.
+void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                       unsigned LowOpcodeK,
+                                       unsigned HighOpcode) const {
+  unsigned DestReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  bool DestIsHigh = isHighReg(DestReg);
+  bool SrcIsHigh = isHighReg(SrcReg);
+  if (!DestIsHigh && !SrcIsHigh)
+    MI->setDesc(get(LowOpcodeK));
+  else {
+    emitGRX32Move(*MI->getParent(), MI, MI->getDebugLoc(),
+                  DestReg, SrcReg, SystemZ::LR, 32,
+                  MI->getOperand(1).isKill());
+    MI->setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
+    MI->getOperand(1).setReg(DestReg);
+  }
+}
+
+// MI is an RXY-style pseudo instruction.  Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32.
+void SystemZInstrInfo::expandRXYPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                       unsigned HighOpcode) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  unsigned Opcode = getOpcodeForOffset(isHighReg(Reg) ? HighOpcode : LowOpcode,
+                                       MI->getOperand(2).getImm());
+  MI->setDesc(get(Opcode));
+}
+
+// MI is an RR-style pseudo instruction that zero-extends the low Size bits
+// of one GRX32 into another.  Replace it with LowOpcode if both operands
+// are low registers, otherwise use RISB[LH]G.
+void SystemZInstrInfo::expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                        unsigned Size) const {
+  emitGRX32Move(*MI->getParent(), MI, MI->getDebugLoc(),
+                MI->getOperand(0).getReg(), MI->getOperand(1).getReg(),
+                LowOpcode, Size, MI->getOperand(1).isKill());
+  MI->eraseFromParent();
+}
+
+// Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR
+// DestReg before MBBI in MBB.  Use LowLowOpcode when both DestReg and SrcReg
+// are low registers, otherwise use RISB[LH]G.  Size is the number of bits
+// taken from the low end of SrcReg (8 for LLCR, 16 for LLHR and 32 for LR).
+// KillSrc is true if this move is the last use of SrcReg.
+void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     DebugLoc DL, unsigned DestReg,
+                                     unsigned SrcReg, unsigned LowLowOpcode,
+                                     unsigned Size, bool KillSrc) const {
+  unsigned Opcode;
+  bool DestIsHigh = isHighReg(DestReg);
+  bool SrcIsHigh = isHighReg(SrcReg);
+  if (DestIsHigh && SrcIsHigh)
+    Opcode = SystemZ::RISBHH;
+  else if (DestIsHigh && !SrcIsHigh)
+    Opcode = SystemZ::RISBHL;
+  else if (!DestIsHigh && SrcIsHigh)
+    Opcode = SystemZ::RISBLH;
+  else {
+    BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  unsigned Rotate = (DestIsHigh != SrcIsHigh ? 32 : 0);
+  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+    .addReg(DestReg, RegState::Undef)
+    .addReg(SrcReg, getKillRegState(KillSrc))
+    .addImm(32 - Size).addImm(128 + 31).addImm(Rotate);
+}
+
 // If MI is a simple load or store for a frame object, return the register
 // it loads or stores and set FrameIndex to the index of the frame object.
 // Return 0 otherwise.
@@ -293,6 +396,103 @@ SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
   return Count;
 }
 
+bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI,
+                                      unsigned &SrcReg, unsigned &SrcReg2,
+                                      int &Mask, int &Value) const {
+  assert(MI->isCompare() && "Caller should have checked for a comparison");
+
+  if (MI->getNumExplicitOperands() == 2 &&
+      MI->getOperand(0).isReg() &&
+      MI->getOperand(1).isImm()) {
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    Value = MI->getOperand(1).getImm();
+    Mask = ~0;
+    return true;
+  }
+
+  return false;
+}
+
+// If Reg is a virtual register, return its definition, otherwise return null.
+static MachineInstr *getDef(unsigned Reg,
+                            const MachineRegisterInfo *MRI) {
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return 0;
+  return MRI->getUniqueVRegDef(Reg);
+}
+
+// Return true if MI is a shift of type Opcode by Imm bits.
+static bool isShift(MachineInstr *MI, int Opcode, int64_t Imm) {
+  return (MI->getOpcode() == Opcode &&
+          !MI->getOperand(2).getReg() &&
+          MI->getOperand(3).getImm() == Imm);
+}
+
+// If the destination of MI has no uses, delete it as dead.
+static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) {
+  if (MRI->use_nodbg_empty(MI->getOperand(0).getReg()))
+    MI->eraseFromParent();
+}
+
+// Compare compares SrcReg against zero.  Check whether SrcReg contains
+// the result of an IPM sequence whose input CC survives until Compare,
+// and whether Compare is therefore redundant.  Delete it and return
+// true if so.
+static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg,
+                                  const MachineRegisterInfo *MRI,
+                                  const TargetRegisterInfo *TRI) {
+  MachineInstr *LGFR = 0;
+  MachineInstr *RLL = getDef(SrcReg, MRI);
+  if (RLL && RLL->getOpcode() == SystemZ::LGFR) {
+    LGFR = RLL;
+    RLL = getDef(LGFR->getOperand(1).getReg(), MRI);
+  }
+  if (!RLL || !isShift(RLL, SystemZ::RLL, 31))
+    return false;
+
+  MachineInstr *SRL = getDef(RLL->getOperand(1).getReg(), MRI);
+  if (!SRL || !isShift(SRL, SystemZ::SRL, SystemZ::IPM_CC))
+    return false;
+
+  MachineInstr *IPM = getDef(SRL->getOperand(1).getReg(), MRI);
+  if (!IPM || IPM->getOpcode() != SystemZ::IPM)
+    return false;
+
+  // Check that there are no assignments to CC between the IPM and Compare,
+  if (IPM->getParent() != Compare->getParent())
+    return false;
+  MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare;
+  for (++MBBI; MBBI != MBBE; ++MBBI) {
+    MachineInstr *MI = MBBI;
+    if (MI->modifiesRegister(SystemZ::CC, TRI))
+      return false;
+  }
+
+  Compare->eraseFromParent();
+  if (LGFR)
+    eraseIfDead(LGFR, MRI);
+  eraseIfDead(RLL, MRI);
+  eraseIfDead(SRL, MRI);
+  eraseIfDead(IPM, MRI);
+
+  return true;
+}
+
+bool
+SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare,
+                                       unsigned SrcReg, unsigned SrcReg2,
+                                       int Mask, int Value,
+                                       const MachineRegisterInfo *MRI) const {
+  assert(!SrcReg2 && "Only optimizing constant comparisons so far");
+  bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0;
+  if (Value == 0 &&
+      !IsLogical &&
+      removeIPMBasedCompare(Compare, SrcReg, MRI, TM.getRegisterInfo()))
+    return true;
+  return false;
+}
+
 // If Opcode is a move that has a conditional variant, return that variant,
 // otherwise return 0.
 static unsigned getConditionalMove(unsigned Opcode) {
@@ -356,18 +556,21 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 			      bool KillSrc) const {
   // Split 128-bit GPR moves into two 64-bit moves.  This handles ADDR128 too.
   if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) {
-    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_high),
-                RI.getSubReg(SrcReg, SystemZ::subreg_high), KillSrc);
-    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_low),
-                RI.getSubReg(SrcReg, SystemZ::subreg_low), KillSrc);
+    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_h64),
+                RI.getSubReg(SrcReg, SystemZ::subreg_h64), KillSrc);
+    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_l64),
+                RI.getSubReg(SrcReg, SystemZ::subreg_l64), KillSrc);
+    return;
+  }
+
+  if (SystemZ::GRX32BitRegClass.contains(DestReg, SrcReg)) {
+    emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc);
     return;
   }
 
   // Everything else needs only one instruction.
   unsigned Opcode;
-  if (SystemZ::GR32BitRegClass.contains(DestReg, SrcReg))
-    Opcode = SystemZ::LR;
-  else if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
+  if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
     Opcode = SystemZ::LGR;
   else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
     Opcode = SystemZ::LER;
@@ -438,15 +641,15 @@ namespace {
 
 static LogicOp interpretAndImmediate(unsigned Opcode) {
   switch (Opcode) {
-  case SystemZ::NILL32: return LogicOp(32,  0, 16);
-  case SystemZ::NILH32: return LogicOp(32, 16, 16);
-  case SystemZ::NILL:   return LogicOp(64,  0, 16);
-  case SystemZ::NILH:   return LogicOp(64, 16, 16);
-  case SystemZ::NIHL:   return LogicOp(64, 32, 16);
-  case SystemZ::NIHH:   return LogicOp(64, 48, 16);
-  case SystemZ::NILF32: return LogicOp(32,  0, 32);
-  case SystemZ::NILF:   return LogicOp(64,  0, 32);
-  case SystemZ::NIHF:   return LogicOp(64, 32, 32);
+  case SystemZ::NILMux: return LogicOp(32,  0, 16);
+  case SystemZ::NIHMux: return LogicOp(32, 16, 16);
+  case SystemZ::NILL64: return LogicOp(64,  0, 16);
+  case SystemZ::NILH64: return LogicOp(64, 16, 16);
+  case SystemZ::NIHL64: return LogicOp(64, 32, 16);
+  case SystemZ::NIHH64: return LogicOp(64, 48, 16);
+  case SystemZ::NIFMux: return LogicOp(32,  0, 32);
+  case SystemZ::NILF64: return LogicOp(64,  0, 32);
+  case SystemZ::NIHF64: return LogicOp(64, 32, 32);
   default:              return LogicOp();
   }
 }
@@ -473,6 +676,7 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                                         LiveVariables *LV) const {
   MachineInstr *MI = MBBI;
   MachineBasicBlock *MBB = MI->getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
 
   unsigned Opcode = MI->getOpcode();
   unsigned NumOps = MI->getNumOperands();
@@ -482,10 +686,23 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   // because it tends to be shorter and because some instructions
   // have memory forms that can be used during spilling.
   if (TM.getSubtargetImpl()->hasDistinctOps()) {
+    MachineOperand &Dest = MI->getOperand(0);
+    MachineOperand &Src = MI->getOperand(1);
+    unsigned DestReg = Dest.getReg();
+    unsigned SrcReg = Src.getReg();
+    // AHIMux is only really a three-operand instruction when both operands
+    // are low registers.  Try to constrain both operands to be low if
+    // possible.
+    if (Opcode == SystemZ::AHIMux &&
+        TargetRegisterInfo::isVirtualRegister(DestReg) &&
+        TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+        MRI.getRegClass(DestReg)->contains(SystemZ::R1L) &&
+        MRI.getRegClass(SrcReg)->contains(SystemZ::R1L)) {
+      MRI.constrainRegClass(DestReg, &SystemZ::GR32BitRegClass);
+      MRI.constrainRegClass(SrcReg, &SystemZ::GR32BitRegClass);
+    }
     int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode);
     if (ThreeOperandOpcode >= 0) {
-      MachineOperand &Dest = MI->getOperand(0);
-      MachineOperand &Src = MI->getOperand(1);
       MachineInstrBuilder MIB =
         BuildMI(*MBB, MBBI, MI->getDebugLoc(), get(ThreeOperandOpcode))
         .addOperand(Dest);
@@ -500,34 +717,27 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
 
   // Try to convert an AND into an RISBG-type instruction.
   if (LogicOp And = interpretAndImmediate(Opcode)) {
-    unsigned NewOpcode;
-    if (And.RegSize == 64)
-      NewOpcode = SystemZ::RISBG;
-    else if (TM.getSubtargetImpl()->hasHighWord())
-      NewOpcode = SystemZ::RISBLG32;
-    else
-      // We can't use RISBG for 32-bit operations because it clobbers the
-      // high word of the destination too.
-      NewOpcode = 0;
-    if (NewOpcode) {
-      uint64_t Imm = MI->getOperand(2).getImm() << And.ImmLSB;
-      // AND IMMEDIATE leaves the other bits of the register unchanged.
-      Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
-      unsigned Start, End;
-      if (isRxSBGMask(Imm, And.RegSize, Start, End)) {
-        if (NewOpcode == SystemZ::RISBLG32) {
-          Start &= 31;
-          End &= 31;
-        }
-        MachineOperand &Dest = MI->getOperand(0);
-        MachineOperand &Src = MI->getOperand(1);
-        MachineInstrBuilder MIB =
-          BuildMI(*MBB, MI, MI->getDebugLoc(), get(NewOpcode))
-          .addOperand(Dest).addReg(0)
-          .addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg())
-          .addImm(Start).addImm(End + 128).addImm(0);
-        return finishConvertToThreeAddress(MI, MIB, LV);
+    uint64_t Imm = MI->getOperand(2).getImm() << And.ImmLSB;
+    // AND IMMEDIATE leaves the other bits of the register unchanged.
+    Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
+    unsigned Start, End;
+    if (isRxSBGMask(Imm, And.RegSize, Start, End)) {
+      unsigned NewOpcode;
+      if (And.RegSize == 64)
+        NewOpcode = SystemZ::RISBG;
+      else {
+        NewOpcode = SystemZ::RISBMux;
+        Start &= 31;
+        End &= 31;
       }
+      MachineOperand &Dest = MI->getOperand(0);
+      MachineOperand &Src = MI->getOperand(1);
+      MachineInstrBuilder MIB =
+        BuildMI(*MBB, MI, MI->getDebugLoc(), get(NewOpcode))
+        .addOperand(Dest).addReg(0)
+        .addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg())
+        .addImm(Start).addImm(End + 128).addImm(0);
+      return finishConvertToThreeAddress(MI, MIB, LV);
     }
   }
   return 0;
@@ -540,8 +750,21 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                         int FrameIndex) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   unsigned Size = MFI->getObjectSize(FrameIndex);
+  unsigned Opcode = MI->getOpcode();
+
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    if ((Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
+        isInt<8>(MI->getOperand(2).getImm()) &&
+        !MI->getOperand(3).getReg()) {
+      // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
+      return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::AGSI))
+        .addFrameIndex(FrameIndex).addImm(0)
+        .addImm(MI->getOperand(2).getImm());
+    }
+    return 0;
+  }
 
-  // Eary exit for cases we don't care about
+  // All other cases require a single operand.
   if (Ops.size() != 1)
     return 0;
 
@@ -550,7 +773,16 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
          .getRegClass(MI->getOperand(OpNum).getReg())->getSize() &&
          "Invalid size combination");
 
-  unsigned Opcode = MI->getOpcode();
+  if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) &&
+      OpNum == 0 &&
+      isInt<8>(MI->getOperand(2).getImm())) {
+    // A(G)HI %reg, CONST -> A(G)SI %mem, CONST
+    Opcode = (Opcode == SystemZ::AHI ? SystemZ::ASI : SystemZ::AGSI);
+    return BuildMI(MF, MI->getDebugLoc(), get(Opcode))
+      .addFrameIndex(FrameIndex).addImm(0)
+      .addImm(MI->getOperand(2).getImm());
+  }
+
   if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
     bool Op0IsGPR = (Opcode == SystemZ::LGDR);
     bool Op1IsGPR = (Opcode == SystemZ::LDGR);
@@ -577,10 +809,14 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   //
   // Although MVC is in practice a fast choice in these cases, it is still
   // logically a bytewise copy.  This means that we cannot use it if the
-  // load or store is volatile.  It also means that the transformation is
-  // not valid in cases where the two memories partially overlap; however,
-  // that is not a problem here, because we know that one of the memories
-  // is a full frame index.
+  // load or store is volatile.  We also wouldn't be able to use MVC if
+  // the two memories partially overlap, but that case cannot occur here,
+  // because we know that one of the memories is a full frame index.
+  //
+  // For performance reasons, we also want to avoid using MVC if the addresses
+  // might be equal.  We don't worry about that case here, because spill slot
+  // coloring happens later, and because we have special code to remove
+  // MVCs that turn out to be redundant.
   if (OpNum == 0 && MI->hasOneMemOperand()) {
     MachineMemOperand *MMO = *MI->memoperands_begin();
     if (MMO->getSize() == Size && !MMO->isVolatile()) {
@@ -651,6 +887,138 @@ SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
     splitMove(MI, SystemZ::STD);
     return true;
 
+  case SystemZ::LBMux:
+    expandRXYPseudo(MI, SystemZ::LB, SystemZ::LBH);
+    return true;
+
+  case SystemZ::LHMux:
+    expandRXYPseudo(MI, SystemZ::LH, SystemZ::LHH);
+    return true;
+
+  case SystemZ::LLCRMux:
+    expandZExtPseudo(MI, SystemZ::LLCR, 8);
+    return true;
+
+  case SystemZ::LLHRMux:
+    expandZExtPseudo(MI, SystemZ::LLHR, 16);
+    return true;
+
+  case SystemZ::LLCMux:
+    expandRXYPseudo(MI, SystemZ::LLC, SystemZ::LLCH);
+    return true;
+
+  case SystemZ::LLHMux:
+    expandRXYPseudo(MI, SystemZ::LLH, SystemZ::LLHH);
+    return true;
+
+  case SystemZ::LMux:
+    expandRXYPseudo(MI, SystemZ::L, SystemZ::LFH);
+    return true;
+
+  case SystemZ::STCMux:
+    expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH);
+    return true;
+
+  case SystemZ::STHMux:
+    expandRXYPseudo(MI, SystemZ::STH, SystemZ::STHH);
+    return true;
+
+  case SystemZ::STMux:
+    expandRXYPseudo(MI, SystemZ::ST, SystemZ::STFH);
+    return true;
+
+  case SystemZ::LHIMux:
+    expandRIPseudo(MI, SystemZ::LHI, SystemZ::IIHF, true);
+    return true;
+
+  case SystemZ::IIFMux:
+    expandRIPseudo(MI, SystemZ::IILF, SystemZ::IIHF, false);
+    return true;
+
+  case SystemZ::IILMux:
+    expandRIPseudo(MI, SystemZ::IILL, SystemZ::IIHL, false);
+    return true;
+
+  case SystemZ::IIHMux:
+    expandRIPseudo(MI, SystemZ::IILH, SystemZ::IIHH, false);
+    return true;
+
+  case SystemZ::NIFMux:
+    expandRIPseudo(MI, SystemZ::NILF, SystemZ::NIHF, false);
+    return true;
+
+  case SystemZ::NILMux:
+    expandRIPseudo(MI, SystemZ::NILL, SystemZ::NIHL, false);
+    return true;
+
+  case SystemZ::NIHMux:
+    expandRIPseudo(MI, SystemZ::NILH, SystemZ::NIHH, false);
+    return true;
+
+  case SystemZ::OIFMux:
+    expandRIPseudo(MI, SystemZ::OILF, SystemZ::OIHF, false);
+    return true;
+
+  case SystemZ::OILMux:
+    expandRIPseudo(MI, SystemZ::OILL, SystemZ::OIHL, false);
+    return true;
+
+  case SystemZ::OIHMux:
+    expandRIPseudo(MI, SystemZ::OILH, SystemZ::OIHH, false);
+    return true;
+
+  case SystemZ::XIFMux:
+    expandRIPseudo(MI, SystemZ::XILF, SystemZ::XIHF, false);
+    return true;
+
+  case SystemZ::TMLMux:
+    expandRIPseudo(MI, SystemZ::TMLL, SystemZ::TMHL, false);
+    return true;
+
+  case SystemZ::TMHMux:
+    expandRIPseudo(MI, SystemZ::TMLH, SystemZ::TMHH, false);
+    return true;
+
+  case SystemZ::AHIMux:
+    expandRIPseudo(MI, SystemZ::AHI, SystemZ::AIH, false);
+    return true;
+
+  case SystemZ::AHIMuxK:
+    expandRIEPseudo(MI, SystemZ::AHI, SystemZ::AHIK, SystemZ::AIH);
+    return true;
+
+  case SystemZ::AFIMux:
+    expandRIPseudo(MI, SystemZ::AFI, SystemZ::AIH, false);
+    return true;
+
+  case SystemZ::CFIMux:
+    expandRIPseudo(MI, SystemZ::CFI, SystemZ::CIH, false);
+    return true;
+
+  case SystemZ::CLFIMux:
+    expandRIPseudo(MI, SystemZ::CLFI, SystemZ::CLIH, false);
+    return true;
+
+  case SystemZ::CMux:
+    expandRXYPseudo(MI, SystemZ::C, SystemZ::CHF);
+    return true;
+
+  case SystemZ::CLMux:
+    expandRXYPseudo(MI, SystemZ::CL, SystemZ::CLHF);
+    return true;
+
+  case SystemZ::RISBMux: {
+    bool DestIsHigh = isHighReg(MI->getOperand(0).getReg());
+    bool SrcIsHigh = isHighReg(MI->getOperand(2).getReg());
+    if (SrcIsHigh == DestIsHigh)
+      MI->setDesc(get(DestIsHigh ? SystemZ::RISBHH : SystemZ::RISBLL));
+    else {
+      MI->setDesc(get(DestIsHigh ? SystemZ::RISBHL : SystemZ::RISBLH));
+      MI->getOperand(5).setImm(MI->getOperand(5).getImm() ^ 32);
+    }
+    return true;
+  }
+
   case SystemZ::ADJDYNALLOC:
     splitAdjDynAlloc(MI);
     return true;
@@ -697,11 +1065,21 @@ SystemZInstrInfo::getBranchInfo(const MachineInstr *MI) const {
     return SystemZII::Branch(SystemZII::BranchC, SystemZ::CCMASK_ICMP,
                              MI->getOperand(2).getImm(), &MI->getOperand(3));
 
+  case SystemZ::CLIJ:
+  case SystemZ::CLRJ:
+    return SystemZII::Branch(SystemZII::BranchCL, SystemZ::CCMASK_ICMP,
+                             MI->getOperand(2).getImm(), &MI->getOperand(3));
+
   case SystemZ::CGIJ:
   case SystemZ::CGRJ:
     return SystemZII::Branch(SystemZII::BranchCG, SystemZ::CCMASK_ICMP,
                              MI->getOperand(2).getImm(), &MI->getOperand(3));
 
+  case SystemZ::CLGIJ:
+  case SystemZ::CLGRJ:
+    return SystemZII::Branch(SystemZII::BranchCLG, SystemZ::CCMASK_ICMP,
+                             MI->getOperand(2).getImm(), &MI->getOperand(3));
+
   default:
     llvm_unreachable("Unrecognized branch opcode");
   }
@@ -712,7 +1090,13 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
                                            unsigned &StoreOpcode) const {
   if (RC == &SystemZ::GR32BitRegClass || RC == &SystemZ::ADDR32BitRegClass) {
     LoadOpcode = SystemZ::L;
-    StoreOpcode = SystemZ::ST32;
+    StoreOpcode = SystemZ::ST;
+  } else if (RC == &SystemZ::GRH32BitRegClass) {
+    LoadOpcode = SystemZ::LFH;
+    StoreOpcode = SystemZ::STFH;
+  } else if (RC == &SystemZ::GRX32BitRegClass) {
+    LoadOpcode = SystemZ::LMux;
+    StoreOpcode = SystemZ::STMux;
   } else if (RC == &SystemZ::GR64BitRegClass ||
              RC == &SystemZ::ADDR64BitRegClass) {
     LoadOpcode = SystemZ::LG;
@@ -830,6 +1214,14 @@ unsigned SystemZInstrInfo::getCompareAndBranch(unsigned Opcode,
     return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CIJ : 0;
   case SystemZ::CGHI:
     return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CGIJ : 0;
+  case SystemZ::CLR:
+    return SystemZ::CLRJ;
+  case SystemZ::CLGR:
+    return SystemZ::CLGRJ;
+  case SystemZ::CLFI:
+    return MI && isUInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CLIJ : 0;
+  case SystemZ::CLGFI:
+    return MI && isUInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CLGIJ : 0;
   default:
     return 0;
   }
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 276fd3b..be4c8fe 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -70,10 +70,18 @@ namespace SystemZII {
     // on the result.
     BranchC,
 
+    // An instruction that peforms a 32-bit unsigned comparison and branches
+    // on the result.
+    BranchCL,
+
     // An instruction that peforms a 64-bit signed comparison and branches
     // on the result.
     BranchCG,
 
+    // An instruction that peforms a 64-bit unsigned comparison and branches
+    // on the result.
+    BranchCLG,
+
     // An instruction that decrements a 32-bit register and branches if
     // the result is nonzero.
     BranchCT,
@@ -108,7 +116,19 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
 
   void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
   void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
-
+  void expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
+                      unsigned HighOpcode, bool ConvertHigh) const;
+  void expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
+                       unsigned LowOpcodeK, unsigned HighOpcode) const;
+  void expandRXYPseudo(MachineInstr *MI, unsigned LowOpcode,
+                       unsigned HighOpcode) const;
+  void expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
+                        unsigned Size) const;
+  void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                     unsigned LowLowOpcode, unsigned Size, bool KillSrc) const;
+  virtual void anchor();
+  
 public:
   explicit SystemZInstrInfo(SystemZTargetMachine &TM);
 
@@ -129,6 +149,12 @@ public:
                                 MachineBasicBlock *FBB,
                                 const SmallVectorImpl<MachineOperand> &Cond,
                                 DebugLoc DL) const LLVM_OVERRIDE;
+  bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &Mask, int &Value) const
+    LLVM_OVERRIDE;
+  bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int Mask, int Value,
+                            const MachineRegisterInfo *MRI) const LLVM_OVERRIDE;
   virtual bool isPredicable(MachineInstr *MI) const LLVM_OVERRIDE;
   virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                                    unsigned ExtraPredCycles,
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index b318d67..6524e44 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -32,12 +32,9 @@ let neverHasSideEffects = 1 in {
 // Control flow instructions
 //===----------------------------------------------------------------------===//
 
-// A return instruction.  R1 is the condition-code mask (all 1s)
-// and R2 is the target address, which is always stored in %r14.
-let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1,
-    R1 = 15, R2 = 14, isCodeGenOnly = 1 in {
-  def RET : InstRR<0x07, (outs), (ins), "br\t%r14", [(z_retflag)]>;
-}
+// A return instruction (br %r14).
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+  def Return : Alias<2, (outs), (ins), [(z_retflag)]>;
 
 // Unconditional branches.  R1 is the condition-code mask (all 1s).
 let isBranch = 1, isTerminator = 1, isBarrier = 1, R1 = 15 in {
@@ -70,6 +67,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
                       "brc\t$R1, $I2", []>;
   def AsmBRCL : InstRIL<0xC04, (outs), (ins uimm8zx4:$R1, brtarget32:$I2),
                         "brcl\t$R1, $I2", []>;
+  def AsmBCR : InstRR<0x07, (outs), (ins uimm8zx4:$R1, GR64:$R2),
+                      "bcr\t$R1, $R2", []>;
 }
 
 // Fused compare-and-branch instructions.  As for normal branches,
@@ -94,6 +93,18 @@ multiclass CompareBranches<Operand ccmask, string pos1, string pos2> {
     def GIJ : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2, ccmask:$M3,
                                             brtarget16:$RI4),
                        "cgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+    def LRJ  : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+    def LGRJ : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clgrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+    def LIJ  : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+    def LGIJ : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
   }
 }
 let isCodeGenOnly = 1 in
@@ -108,6 +119,7 @@ multiclass CondExtendedMnemonic<bits<4> ccmask, string name> {
                    "j"##name##"\t$I2", []>;
     def JG : InstRIL<0xC04, (outs), (ins brtarget32:$I2),
                      "jg"##name##"\t$I2", []>;
+    def BR : InstRR<0x07, (outs), (ins ADDR64:$R2), "b"##name##"r\t$R2", []>;
   }
   def LOCR  : FixedCondUnaryRRF<"locr"##name,  0xB9F2, GR32, GR32, ccmask>;
   def LOCGR : FixedCondUnaryRRF<"locgr"##name, 0xB9E2, GR64, GR64, ccmask>;
@@ -152,6 +164,18 @@ multiclass IntCondExtendedMnemonicA<bits<4> ccmask, string name> {
     def CGI : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2,
                                             brtarget16:$RI4),
                        "cgij"##name##"\t$R1, $I2, $RI4", []>;
+    def CLR  : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2,
+                                            brtarget16:$RI4),
+                        "clrj"##name##"\t$R1, $R2, $RI4", []>;
+    def CLGR : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2,
+                                             brtarget16:$RI4),
+                        "clgrj"##name##"\t$R1, $R2, $RI4", []>;
+    def CLI  : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2,
+                                             brtarget16:$RI4),
+                        "clij"##name##"\t$R1, $I2, $RI4", []>;
+    def CLGI : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2,
+                                             brtarget16:$RI4),
+                        "clgij"##name##"\t$R1, $I2, $RI4", []>;
   }
 }
 multiclass IntCondExtendedMnemonic<bits<4> ccmask, string name1, string name2>
@@ -177,22 +201,31 @@ let Defs = [CC] in {
 // Select instructions
 //===----------------------------------------------------------------------===//
 
-def Select32 : SelectWrapper<GR32>;
-def Select64 : SelectWrapper<GR64>;
-
-defm CondStore8_32  : CondStores<GR32, nonvolatile_truncstorei8,
+def Select32Mux : SelectWrapper<GRX32>, Requires<[FeatureHighWord]>;
+def Select32    : SelectWrapper<GR32>;
+def Select64    : SelectWrapper<GR64>;
+
+// We don't define 32-bit Mux stores because the low-only STOC should
+// always be used if possible.
+defm CondStore8Mux  : CondStores<GRX32, nonvolatile_truncstorei8,
+                                 nonvolatile_anyextloadi8, bdxaddr20only>,
+                      Requires<[FeatureHighWord]>;
+defm CondStore16Mux : CondStores<GRX32, nonvolatile_truncstorei16,
+                                 nonvolatile_anyextloadi16, bdxaddr20only>,
+                      Requires<[FeatureHighWord]>;
+defm CondStore8     : CondStores<GR32, nonvolatile_truncstorei8,
                                  nonvolatile_anyextloadi8, bdxaddr20only>;
-defm CondStore16_32 : CondStores<GR32, nonvolatile_truncstorei16,
+defm CondStore16    : CondStores<GR32, nonvolatile_truncstorei16,
                                  nonvolatile_anyextloadi16, bdxaddr20only>;
-defm CondStore32_32 : CondStores<GR32, nonvolatile_store,
+defm CondStore32    : CondStores<GR32, nonvolatile_store,
                                  nonvolatile_load, bdxaddr20only>;
 
-defm CondStore8  : CondStores<GR64, nonvolatile_truncstorei8,
-                              nonvolatile_anyextloadi8, bdxaddr20only>;
-defm CondStore16 : CondStores<GR64, nonvolatile_truncstorei16,
-                              nonvolatile_anyextloadi16, bdxaddr20only>;
-defm CondStore32 : CondStores<GR64, nonvolatile_truncstorei32,
-                              nonvolatile_anyextloadi32, bdxaddr20only>;
+defm : CondStores64<CondStore8, CondStore8Inv, nonvolatile_truncstorei8,
+                    nonvolatile_anyextloadi8, bdxaddr20only>;
+defm : CondStores64<CondStore16, CondStore16Inv, nonvolatile_truncstorei16,
+                    nonvolatile_anyextloadi16, bdxaddr20only>;
+defm : CondStores64<CondStore32, CondStore32Inv, nonvolatile_truncstorei32,
+                    nonvolatile_anyextloadi32, bdxaddr20only>;
 defm CondStore64 : CondStores<GR64, nonvolatile_store,
                               nonvolatile_load, bdxaddr20only>;
 
@@ -202,24 +235,30 @@ defm CondStore64 : CondStores<GR64, nonvolatile_store,
 
 // The definitions here are for the call-clobbered registers.
 let isCall = 1, Defs = [R0D, R1D, R2D, R3D, R4D, R5D, R14D,
-                        F0D, F1D, F2D, F3D, F4D, F5D, F6D, F7D, CC],
-    R1 = 14, isCodeGenOnly = 1 in {
-  def BRAS  : InstRI<0xA75, (outs), (ins pcrel16call:$I2, variable_ops),
-                     "bras\t%r14, $I2", []>;
-  def BRASL : InstRIL<0xC05, (outs), (ins pcrel32call:$I2, variable_ops),
-                      "brasl\t%r14, $I2", [(z_call pcrel32call:$I2)]>;
-  def BASR  : InstRR<0x0D, (outs), (ins ADDR64:$R2, variable_ops),
-                     "basr\t%r14, $R2", [(z_call ADDR64:$R2)]>;
+                        F0D, F1D, F2D, F3D, F4D, F5D, F6D, F7D, CC] in {
+  def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops),
+                        [(z_call pcrel32:$I2)]>;
+  def CallBASR  : Alias<2, (outs), (ins ADDR64:$R2, variable_ops),
+                        [(z_call ADDR64:$R2)]>;
+}
+
+// Sibling calls.  Indirect sibling calls must be via R1, since R2 upwards
+// are argument registers and since branching to R0 is a no-op.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  def CallJG : Alias<6, (outs), (ins pcrel32:$I2),
+                     [(z_sibcall pcrel32:$I2)]>;
+  let Uses = [R1D] in
+    def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>;
 }
 
 // Define the general form of the call instructions for the asm parser.
 // These instructions don't hard-code %r14 as the return address register.
-def AsmBRAS  : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16:$I2),
-                      "bras\t$R1, $I2", []>;
-def AsmBRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32:$I2),
-                       "brasl\t$R1, $I2", []>;
-def AsmBASR  : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
-                      "basr\t$R1, $R2", []>;
+def BRAS  : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16:$I2),
+                   "bras\t$R1, $I2", []>;
+def BRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32:$I2),
+                    "brasl\t$R1, $I2", []>;
+def BASR  : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
+                   "basr\t$R1, $R2", []>;
 
 //===----------------------------------------------------------------------===//
 // Move instructions
@@ -227,6 +266,9 @@ def AsmBASR  : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
 
 // Register moves.
 let neverHasSideEffects = 1 in {
+  // Expands to LR, RISBHG or RISBLG, depending on the choice of registers.
+  def LRMux : UnaryRRPseudo<"l", null_frag, GRX32, GRX32>,
+              Requires<[FeatureHighWord]>;
   def LR  : UnaryRR <"l",  0x18,   null_frag, GR32, GR32>;
   def LGR : UnaryRRE<"lg", 0xB904, null_frag, GR64, GR64>;
 }
@@ -248,7 +290,10 @@ let Uses = [CC] in {
 // Immediate moves.
 let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
     isReMaterializable = 1 in {
-  // 16-bit sign-extended immediates.
+  // 16-bit sign-extended immediates.  LHIMux expands to LHI or IIHF,
+  // deopending on the choice of register.
+  def LHIMux : UnaryRIPseudo<bitconvert, GRX32, imm32sx16>,
+               Requires<[FeatureHighWord]>;
   def LHI  : UnaryRI<"lhi",  0xA78, bitconvert, GR32, imm32sx16>;
   def LGHI : UnaryRI<"lghi", 0xA79, bitconvert, GR64, imm64sx16>;
 
@@ -266,7 +311,12 @@ let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
 
 // Register loads.
 let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+  // Expands to L, LY or LFH, depending on the choice of register.
+  def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>,
+             Requires<[FeatureHighWord]>;
   defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32, 4>;
+  def LFH : UnaryRXY<"lfh", 0xE3CA, load, GRH32, 4>,
+            Requires<[FeatureHighWord]>;
   def LG : UnaryRXY<"lg", 0xE304, load, GR64, 8>;
 
   // These instructions are split after register allocation, so we don't
@@ -298,8 +348,12 @@ let Uses = [CC] in {
 
 // Register stores.
 let SimpleBDXStore = 1 in {
-  let isCodeGenOnly = 1 in
-    defm ST32 : StoreRXPair<"st", 0x50, 0xE350, store, GR32, 4>;
+  // Expands to ST, STY or STFH, depending on the choice of register.
+  def STMux : StoreRXYPseudo<store, GRX32, 4>,
+              Requires<[FeatureHighWord]>;
+  defm ST : StoreRXPair<"st", 0x50, 0xE350, store, GR32, 4>;
+  def STFH : StoreRXY<"stfh", 0xE3CB, store, GRH32, 4>,
+             Requires<[FeatureHighWord]>;
   def STG : StoreRXY<"stg", 0xE324, store, GR64, 8>;
 
   // These instructions are split after register allocation, so we don't
@@ -309,15 +363,13 @@ let SimpleBDXStore = 1 in {
                        [(store GR128:$src, bdxaddr20only128:$dst)]>;
   }
 }
-let isCodeGenOnly = 1 in
-  def STRL32 : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
+def STRL  : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
 def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
 
 // Store on condition.
 let isCodeGenOnly = 1, Uses = [CC] in {
-  def STOC32 : CondStoreRSY<"stoc",  0xEBF3, GR32, 4>;
-  def STOC   : CondStoreRSY<"stoc",  0xEBF3, GR64, 4>;
-  def STOCG  : CondStoreRSY<"stocg", 0xEBE3, GR64, 8>;
+  def STOC  : CondStoreRSY<"stoc",  0xEBF3, GR32, 4>;
+  def STOCG : CondStoreRSY<"stocg", 0xEBE3, GR64, 8>;
 }
 let Uses = [CC] in {
   def AsmSTOC  : AsmCondStoreRSY<"stoc",  0xEBF3, GR32, 4>;
@@ -334,33 +386,22 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store,         imm64sx16>;
 
 // Memory-to-memory moves.
 let mayLoad = 1, mayStore = 1 in
-  def MVC : InstSS<0xD2, (outs), (ins bdladdr12onlylen8:$BDL1,
-                                      bdaddr12only:$BD2),
-                   "mvc\t$BDL1, $BD2", []>;
-
-let mayLoad = 1, mayStore = 1, usesCustomInserter = 1 in
-  def MVCWrapper : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                       imm32len8:$length),
-                          [(z_mvc bdaddr12only:$dest, bdaddr12only:$src,
-                                  imm32len8:$length)]>;
-
-defm LoadStore8_32  : MVCLoadStore<anyextloadi8, truncstorei8, i32,
-                                   MVCWrapper, 1>;
-defm LoadStore16_32 : MVCLoadStore<anyextloadi16, truncstorei16, i32,
-                                   MVCWrapper, 2>;
-defm LoadStore32_32 : MVCLoadStore<load, store, i32, MVCWrapper, 4>;
-
-defm LoadStore8  : MVCLoadStore<anyextloadi8, truncstorei8, i64,
-                                MVCWrapper, 1>;
-defm LoadStore16 : MVCLoadStore<anyextloadi16, truncstorei16, i64,
-                                MVCWrapper, 2>;
-defm LoadStore32 : MVCLoadStore<anyextloadi32, truncstorei32, i64,
-                                MVCWrapper, 4>;
-defm LoadStore64 : MVCLoadStore<load, store, i64, MVCWrapper, 8>;
+  defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+
+// String moves.
+let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L] in
+  defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
 
 //===----------------------------------------------------------------------===//
 // Sign extensions
 //===----------------------------------------------------------------------===//
+//
+// Note that putting these before zero extensions mean that we will prefer
+// them for anyextload*.  There's not really much to choose between the two
+// either way, but signed-extending loads have a short LH and a long LHY,
+// while zero-extending loads have only the long LLH.
+//
+//===----------------------------------------------------------------------===//
 
 // 32-bit extensions from registers.
 let neverHasSideEffects = 1 in {
@@ -380,40 +421,33 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
 // Match 32-to-64-bit sign extensions in which the source is already
 // in a 64-bit register.
 def : Pat<(sext_inreg GR64:$src, i32),
-          (LGFR (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
-
-// 32-bit extensions from memory.
-def  LB   : UnaryRXY<"lb", 0xE376, sextloadi8, GR32, 1>;
-defm LH   : UnaryRXPair<"lh", 0x48, 0xE378, sextloadi16, GR32, 2>;
-def  LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_sextloadi16, GR32>;
+          (LGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+
+// 32-bit extensions from 8-bit memory.  LBMux expands to LB or LBH,
+// depending on the choice of register.
+def LBMux : UnaryRXYPseudo<"lb", asextloadi8, GRX32, 1>,
+            Requires<[FeatureHighWord]>;
+def LB  : UnaryRXY<"lb", 0xE376, asextloadi8, GR32, 1>;
+def LBH : UnaryRXY<"lbh", 0xE3C0, asextloadi8, GRH32, 1>,
+          Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory.  LHMux expands to LH or LHH,
+// depending on the choice of register.
+def LHMux : UnaryRXYPseudo<"lh", asextloadi16, GRX32, 2>,
+            Requires<[FeatureHighWord]>;
+defm LH   : UnaryRXPair<"lh", 0x48, 0xE378, asextloadi16, GR32, 2>;
+def  LHH  : UnaryRXY<"lhh", 0xE3C4, asextloadi16, GRH32, 2>,
+            Requires<[FeatureHighWord]>;
+def  LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_asextloadi16, GR32>;
 
 // 64-bit extensions from memory.
-def LGB   : UnaryRXY<"lgb", 0xE377, sextloadi8,  GR64, 1>;
-def LGH   : UnaryRXY<"lgh", 0xE315, sextloadi16, GR64, 2>;
-def LGF   : UnaryRXY<"lgf", 0xE314, sextloadi32, GR64, 4>;
-def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_sextloadi16, GR64>;
-def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_sextloadi32, GR64>;
+def LGB   : UnaryRXY<"lgb", 0xE377, asextloadi8,  GR64, 1>;
+def LGH   : UnaryRXY<"lgh", 0xE315, asextloadi16, GR64, 2>;
+def LGF   : UnaryRXY<"lgf", 0xE314, asextloadi32, GR64, 4>;
+def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_asextloadi16, GR64>;
+def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_asextloadi32, GR64>;
 let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
-  def LTGF : UnaryRXY<"ltgf", 0xE332, sextloadi32, GR64, 4>;
-
-// If the sign of a load-extend operation doesn't matter, use the signed ones.
-// There's not really much to choose between the sign and zero extensions,
-// but LH is more compact than LLH for small offsets.
-def : Pat<(i32 (extloadi8  bdxaddr20only:$src)), (LB  bdxaddr20only:$src)>;
-def : Pat<(i32 (extloadi16 bdxaddr12pair:$src)), (LH  bdxaddr12pair:$src)>;
-def : Pat<(i32 (extloadi16 bdxaddr20pair:$src)), (LHY bdxaddr20pair:$src)>;
-
-def : Pat<(i64 (extloadi8  bdxaddr20only:$src)), (LGB bdxaddr20only:$src)>;
-def : Pat<(i64 (extloadi16 bdxaddr20only:$src)), (LGH bdxaddr20only:$src)>;
-def : Pat<(i64 (extloadi32 bdxaddr20only:$src)), (LGF bdxaddr20only:$src)>;
-
-// We want PC-relative addresses to be tried ahead of BD and BDX addresses.
-// However, BDXs have two extra operands and are therefore 6 units more
-// complex.
-let AddedComplexity = 7 in {
-  def : Pat<(i32 (extloadi16 pcrel32:$src)), (LHRL  pcrel32:$src)>;
-  def : Pat<(i64 (extloadi16 pcrel32:$src)), (LGHRL pcrel32:$src)>;
-}
+  def LTGF : UnaryRXY<"ltgf", 0xE332, asextloadi32, GR64, 4>;
 
 //===----------------------------------------------------------------------===//
 // Zero extensions
@@ -421,8 +455,14 @@ let AddedComplexity = 7 in {
 
 // 32-bit extensions from registers.
 let neverHasSideEffects = 1 in {
-  def LLCR : UnaryRRE<"llc", 0xB994, zext8,  GR32, GR32>;
-  def LLHR : UnaryRRE<"llh", 0xB995, zext16, GR32, GR32>;
+  // Expands to LLCR or RISB[LH]G, depending on the choice of registers.
+  def LLCRMux : UnaryRRPseudo<"llc", zext8, GRX32, GRX32>,
+                Requires<[FeatureHighWord]>;
+  def LLCR    : UnaryRRE<"llc", 0xB994, zext8,  GR32, GR32>;
+  // Expands to LLHR or RISB[LH]G, depending on the choice of registers.
+  def LLHRMux : UnaryRRPseudo<"llh", zext16, GRX32, GRX32>,
+                Requires<[FeatureHighWord]>;
+  def LLHR    : UnaryRRE<"llh", 0xB995, zext16, GR32, GR32>;
 }
 
 // 64-bit extensions from registers.
@@ -435,19 +475,31 @@ let neverHasSideEffects = 1 in {
 // Match 32-to-64-bit zero extensions in which the source is already
 // in a 64-bit register.
 def : Pat<(and GR64:$src, 0xffffffff),
-          (LLGFR (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
+          (LLGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
 
-// 32-bit extensions from memory.
-def LLC   : UnaryRXY<"llc", 0xE394, zextloadi8,  GR32, 1>;
-def LLH   : UnaryRXY<"llh", 0xE395, zextloadi16, GR32, 2>;
-def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_zextloadi16, GR32>;
+// 32-bit extensions from 8-bit memory.  LLCMux expands to LLC or LLCH,
+// depending on the choice of register.
+def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+def LLC  : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>;
+def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>,
+           Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory.  LLHMux expands to LLH or LLHH,
+// depending on the choice of register.
+def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>,
+             Requires<[FeatureHighWord]>;
+def LLH   : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>;
+def LLHH  : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GR32, 2>,
+            Requires<[FeatureHighWord]>;
+def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>;
 
 // 64-bit extensions from memory.
-def LLGC   : UnaryRXY<"llgc", 0xE390, zextloadi8,  GR64, 1>;
-def LLGH   : UnaryRXY<"llgh", 0xE391, zextloadi16, GR64, 2>;
-def LLGF   : UnaryRXY<"llgf", 0xE316, zextloadi32, GR64, 4>;
-def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_zextloadi16, GR64>;
-def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_zextloadi32, GR64>;
+def LLGC   : UnaryRXY<"llgc", 0xE390, azextloadi8,  GR64, 1>;
+def LLGH   : UnaryRXY<"llgh", 0xE391, azextloadi16, GR64, 2>;
+def LLGF   : UnaryRXY<"llgf", 0xE316, azextloadi32, GR64, 4>;
+def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_azextloadi16, GR64>;
+def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_azextloadi32, GR64>;
 
 //===----------------------------------------------------------------------===//
 // Truncations
@@ -455,21 +507,31 @@ def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_zextloadi32, GR64>;
 
 // Truncations of 64-bit registers to 32-bit registers.
 def : Pat<(i32 (trunc GR64:$src)),
-          (EXTRACT_SUBREG GR64:$src, subreg_32bit)>;
+          (EXTRACT_SUBREG GR64:$src, subreg_l32)>;
 
-// Truncations of 32-bit registers to memory.
-let isCodeGenOnly = 1 in {
-  defm STC32   : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8,  GR32, 1>;
-  defm STH32   : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR32, 2>;
-  def  STHRL32 : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR32>;
-}
+// Truncations of 32-bit registers to 8-bit memory.  STCMux expands to
+// STC, STCY or STCH, depending on the choice of register.
+def STCMux : StoreRXYPseudo<truncstorei8, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+defm STC : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8, GR32, 1>;
+def STCH : StoreRXY<"stch", 0xE3C3, truncstorei8, GRH32, 1>,
+           Requires<[FeatureHighWord]>;
+
+// Truncations of 32-bit registers to 16-bit memory.  STHMux expands to
+// STH, STHY or STHH, depending on the choice of register.
+def STHMux : StoreRXYPseudo<truncstorei16, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+defm STH : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR32, 2>;
+def STHH : StoreRXY<"sthh", 0xE3C7, truncstorei16, GRH32, 2>,
+           Requires<[FeatureHighWord]>;
+def STHRL : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR32>;
 
 // Truncations of 64-bit registers to memory.
-defm STC   : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8,  GR64, 1>;
-defm STH   : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR64, 2>;
-def  STHRL : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR64>;
-defm ST    : StoreRXPair<"st", 0x50, 0xE350, truncstorei32, GR64, 4>;
-def  STRL  : StoreRILPC<"strl", 0xC4F, aligned_truncstorei32, GR64>;
+defm : StoreGR64Pair<STC, STCY, truncstorei8>;
+defm : StoreGR64Pair<STH, STHY, truncstorei16>;
+def  : StoreGR64PC<STHRL, aligned_truncstorei16>;
+defm : StoreGR64Pair<ST, STY, truncstorei32>;
+def  : StoreGR64PC<STRL, aligned_truncstorei32>;
 
 //===----------------------------------------------------------------------===//
 // Multi-register moves
@@ -528,11 +590,31 @@ let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
 }
 
 //===----------------------------------------------------------------------===//
-// Negation
+// Absolute and Negation
 //===----------------------------------------------------------------------===//
 
 let Defs = [CC] in {
   let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LPR  : UnaryRR <"lp",  0x10,   z_iabs32, GR32, GR32>;
+    def LPGR : UnaryRRE<"lpg", 0xB900, z_iabs64, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LPGFR : UnaryRRE<"lpgf", 0xB910, null_frag, GR64, GR32>;
+}
+defm : SXU<z_iabs64, LPGFR>;
+
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LNR  : UnaryRR <"ln",  0x11,   z_inegabs32, GR32, GR32>;
+    def LNGR : UnaryRRE<"lng", 0xB901, z_inegabs64, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LNGFR : UnaryRRE<"lngf", 0xB911, null_frag, GR64, GR32>;
+}
+defm : SXU<z_inegabs64, LNGFR>;
+
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
     def LCR  : UnaryRR <"lc",  0x13,   ineg, GR32, GR32>;
     def LCGR : UnaryRRE<"lcg", 0xB903, ineg, GR64, GR64>;
   }
@@ -546,43 +628,51 @@ defm : SXU<ineg, LCGFR>;
 //===----------------------------------------------------------------------===//
 
 let isCodeGenOnly = 1 in
-  defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, zextloadi8, 1>;
-defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, zextloadi8, 1>;
+  defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, azextloadi8, 1>;
+defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, azextloadi8, 1>;
 
-defm : InsertMem<"inserti8", IC32,  GR32, zextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", IC32Y, GR32, zextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC32,  GR32, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
 
-defm : InsertMem<"inserti8", IC,  GR64, zextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", ICY, GR64, zextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC,  GR64, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
 
 // Insertions of a 16-bit immediate, leaving other bits unaffected.
 // We don't have or_as_insert equivalents of these operations because
 // OI is available instead.
-let isCodeGenOnly = 1 in {
-  def IILL32 : BinaryRI<"iill", 0xA53, insertll, GR32, imm32ll16>;
-  def IILH32 : BinaryRI<"iilh", 0xA52, insertlh, GR32, imm32lh16>;
-}
-def IILL : BinaryRI<"iill", 0xA53, insertll, GR64, imm64ll16>;
-def IILH : BinaryRI<"iilh", 0xA52, insertlh, GR64, imm64lh16>;
-def IIHL : BinaryRI<"iihl", 0xA51, inserthl, GR64, imm64hl16>;
-def IIHH : BinaryRI<"iihh", 0xA50, inserthh, GR64, imm64hh16>;
+//
+// IIxMux expands to II[LH]x, depending on the choice of register.
+def IILMux : BinaryRIPseudo<insertll, GRX32, imm32ll16>,
+             Requires<[FeatureHighWord]>;
+def IIHMux : BinaryRIPseudo<insertlh, GRX32, imm32lh16>,
+             Requires<[FeatureHighWord]>;
+def IILL : BinaryRI<"iill", 0xA53, insertll, GR32, imm32ll16>;
+def IILH : BinaryRI<"iilh", 0xA52, insertlh, GR32, imm32lh16>;
+def IIHL : BinaryRI<"iihl", 0xA51, insertll, GRH32, imm32ll16>;
+def IIHH : BinaryRI<"iihh", 0xA50, insertlh, GRH32, imm32lh16>;
+def IILL64 : BinaryAliasRI<insertll, GR64, imm64ll16>;
+def IILH64 : BinaryAliasRI<insertlh, GR64, imm64lh16>;
+def IIHL64 : BinaryAliasRI<inserthl, GR64, imm64hl16>;
+def IIHH64 : BinaryAliasRI<inserthh, GR64, imm64hh16>;
 
 // ...likewise for 32-bit immediates.  For GR32s this is a general
 // full-width move.  (We use IILF rather than something like LLILF
 // for 32-bit moves because IILF leaves the upper 32 bits of the
 // GR64 unchanged.)
-let isCodeGenOnly = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
-    isReMaterializable = 1 in {
-  def IILF32 : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>;
+let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
+  def IIFMux : UnaryRIPseudo<bitconvert, GRX32, uimm32>,
+               Requires<[FeatureHighWord]>;
+  def IILF : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>;
+  def IIHF : UnaryRIL<"iihf", 0xC08, bitconvert, GRH32, uimm32>;
 }
-def IILF : BinaryRIL<"iilf", 0xC09, insertlf, GR64, imm64lf32>;
-def IIHF : BinaryRIL<"iihf", 0xC08, inserthf, GR64, imm64hf32>;
+def IILF64 : BinaryAliasRIL<insertlf, GR64, imm64lf32>;
+def IIHF64 : BinaryAliasRIL<inserthf, GR64, imm64hf32>;
 
 // An alternative model of inserthf, with the first operand being
 // a zero-extended value.
 def : Pat<(or (zext32 GR32:$src), imm64hf32:$imm),
-          (IIHF (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_32bit),
-                imm64hf32:$imm)>;
+          (IIHF64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32),
+                  imm64hf32:$imm)>;
 
 //===----------------------------------------------------------------------===//
 // Addition
@@ -598,17 +688,22 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
   def AGFR : BinaryRRE<"agf", 0xB918, null_frag, GR64, GR32>;
 
   // Addition of signed 16-bit immediates.
+  defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>;
   defm AHI  : BinaryRIAndK<"ahi",  0xA7A, 0xECD8, add, GR32, imm32sx16>;
   defm AGHI : BinaryRIAndK<"aghi", 0xA7B, 0xECD9, add, GR64, imm64sx16>;
 
   // Addition of signed 32-bit immediates.
+  def AFIMux : BinaryRIPseudo<add, GRX32, simm32>,
+               Requires<[FeatureHighWord]>;
   def AFI  : BinaryRIL<"afi",  0xC29, add, GR32, simm32>;
+  def AIH  : BinaryRIL<"aih",  0xCC8, add, GRH32, simm32>,
+             Requires<[FeatureHighWord]>;
   def AGFI : BinaryRIL<"agfi", 0xC28, add, GR64, imm64sx32>;
 
   // Addition of memory.
-  defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, sextloadi16, 2>;
+  defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>;
   defm A   : BinaryRXPair<"a",  0x5A, 0xE35A, add, GR32, load, 4>;
-  def  AGF : BinaryRXY<"agf", 0xE318, add, GR64, sextloadi32, 4>;
+  def  AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>;
   def  AG  : BinaryRXY<"ag",  0xE308, add, GR64, load, 8>;
 
   // Addition to memory.
@@ -638,7 +733,7 @@ let Defs = [CC] in {
 
   // Addition of memory.
   defm AL   : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
-  def  ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, zextloadi32, 4>;
+  def  ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
   def  ALG  : BinaryRXY<"alg",  0xE30A, addc, GR64, load, 8>;
 }
 defm : ZXB<addc, GR64, ALGFR>;
@@ -667,9 +762,9 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
   defm SGR : BinaryRREAndK<"sg", 0xB909, 0xB9E9, sub, GR64, GR64>;
 
   // Subtraction of memory.
-  defm SH  : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, sextloadi16, 2>;
+  defm SH  : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
   defm S   : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
-  def  SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, sextloadi32, 4>;
+  def  SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>;
   def  SG  : BinaryRXY<"sg",  0xE309, sub, GR64, load, 8>;
 }
 defm : SXB<sub, GR64, SGFR>;
@@ -688,7 +783,7 @@ let Defs = [CC] in {
 
   // Subtraction of memory.
   defm SL   : BinaryRXPair<"sl", 0x5F, 0xE35F, subc, GR32, load, 4>;
-  def  SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, zextloadi32, 4>;
+  def  SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, azextloadi32, 4>;
   def  SLG  : BinaryRXY<"slg",  0xE30B, subc, GR64, load, 8>;
 }
 defm : ZXB<subc, GR64, SLGFR>;
@@ -718,22 +813,33 @@ let Defs = [CC] in {
   let isConvertibleToThreeAddress = 1 in {
     // ANDs of a 16-bit immediate, leaving other bits unaffected.
     // The CC result only reflects the 16-bit field, not the full register.
-    let isCodeGenOnly = 1 in {
-      def NILL32 : BinaryRI<"nill", 0xA57, and, GR32, imm32ll16c>;
-      def NILH32 : BinaryRI<"nilh", 0xA56, and, GR32, imm32lh16c>;
-    }
-    def NILL : BinaryRI<"nill", 0xA57, and, GR64, imm64ll16c>;
-    def NILH : BinaryRI<"nilh", 0xA56, and, GR64, imm64lh16c>;
-    def NIHL : BinaryRI<"nihl", 0xA55, and, GR64, imm64hl16c>;
-    def NIHH : BinaryRI<"nihh", 0xA54, and, GR64, imm64hh16c>;
+    //
+    // NIxMux expands to NI[LH]x, depending on the choice of register.
+    def NILMux : BinaryRIPseudo<and, GRX32, imm32ll16c>,
+                 Requires<[FeatureHighWord]>;
+    def NIHMux : BinaryRIPseudo<and, GRX32, imm32lh16c>,
+                 Requires<[FeatureHighWord]>;
+    def NILL : BinaryRI<"nill", 0xA57, and, GR32, imm32ll16c>;
+    def NILH : BinaryRI<"nilh", 0xA56, and, GR32, imm32lh16c>;
+    def NIHL : BinaryRI<"nihl", 0xA55, and, GRH32, imm32ll16c>;
+    def NIHH : BinaryRI<"nihh", 0xA54, and, GRH32, imm32lh16c>;
+    def NILL64 : BinaryAliasRI<and, GR64, imm64ll16c>;
+    def NILH64 : BinaryAliasRI<and, GR64, imm64lh16c>;
+    def NIHL64 : BinaryAliasRI<and, GR64, imm64hl16c>;
+    def NIHH64 : BinaryAliasRI<and, GR64, imm64hh16c>;
 
     // ANDs of a 32-bit immediate, leaving other bits unaffected.
     // The CC result only reflects the 32-bit field, which means we can
     // use it as a zero indicator for i32 operations but not otherwise.
-    let isCodeGenOnly = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in
-      def NILF32 : BinaryRIL<"nilf", 0xC0B, and, GR32, uimm32>;
-    def NILF : BinaryRIL<"nilf", 0xC0B, and, GR64, imm64lf32c>;
-    def NIHF : BinaryRIL<"nihf", 0xC0A, and, GR64, imm64hf32c>;
+    let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+      // Expands to NILF or NIHF, depending on the choice of register.
+      def NIFMux : BinaryRIPseudo<and, GRX32, uimm32>,
+                   Requires<[FeatureHighWord]>;
+      def NILF : BinaryRIL<"nilf", 0xC0B, and, GR32, uimm32>;
+      def NIHF : BinaryRIL<"nihf", 0xC0A, and, GRH32, uimm32>;
+    }
+    def NILF64 : BinaryAliasRIL<and, GR64, imm64lf32c>;
+    def NIHF64 : BinaryAliasRIL<and, GR64, imm64hf32c>;
   }
 
   // ANDs of memory.
@@ -744,6 +850,10 @@ let Defs = [CC] in {
 
   // AND to memory
   defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, uimm8>;
+
+  // Block AND.
+  let mayLoad = 1, mayStore = 1 in
+    defm NC : MemorySS<"nc", 0xD4, z_nc, z_nc_loop>;
 }
 defm : RMWIByte<and, bdaddr12pair, NI>;
 defm : RMWIByte<and, bdaddr20pair, NIY>;
@@ -761,22 +871,33 @@ let Defs = [CC] in {
 
   // ORs of a 16-bit immediate, leaving other bits unaffected.
   // The CC result only reflects the 16-bit field, not the full register.
-  let isCodeGenOnly = 1 in {
-    def OILL32 : BinaryRI<"oill", 0xA5B, or, GR32, imm32ll16>;
-    def OILH32 : BinaryRI<"oilh", 0xA5A, or, GR32, imm32lh16>;
-  }
-  def OILL : BinaryRI<"oill", 0xA5B, or, GR64, imm64ll16>;
-  def OILH : BinaryRI<"oilh", 0xA5A, or, GR64, imm64lh16>;
-  def OIHL : BinaryRI<"oihl", 0xA59, or, GR64, imm64hl16>;
-  def OIHH : BinaryRI<"oihh", 0xA58, or, GR64, imm64hh16>;
+  //
+  // OIxMux expands to OI[LH]x, depending on the choice of register.
+  def OILMux : BinaryRIPseudo<or, GRX32, imm32ll16>,
+               Requires<[FeatureHighWord]>;
+  def OIHMux : BinaryRIPseudo<or, GRX32, imm32lh16>,
+               Requires<[FeatureHighWord]>;
+  def OILL : BinaryRI<"oill", 0xA5B, or, GR32, imm32ll16>;
+  def OILH : BinaryRI<"oilh", 0xA5A, or, GR32, imm32lh16>;
+  def OIHL : BinaryRI<"oihl", 0xA59, or, GRH32, imm32ll16>;
+  def OIHH : BinaryRI<"oihh", 0xA58, or, GRH32, imm32lh16>;
+  def OILL64 : BinaryAliasRI<or, GR64, imm64ll16>;
+  def OILH64 : BinaryAliasRI<or, GR64, imm64lh16>;
+  def OIHL64 : BinaryAliasRI<or, GR64, imm64hl16>;
+  def OIHH64 : BinaryAliasRI<or, GR64, imm64hh16>;
 
   // ORs of a 32-bit immediate, leaving other bits unaffected.
   // The CC result only reflects the 32-bit field, which means we can
   // use it as a zero indicator for i32 operations but not otherwise.
-  let isCodeGenOnly = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in
-    def OILF32 : BinaryRIL<"oilf", 0xC0D, or, GR32, uimm32>;
-  def OILF : BinaryRIL<"oilf", 0xC0D, or, GR64, imm64lf32>;
-  def OIHF : BinaryRIL<"oihf", 0xC0C, or, GR64, imm64hf32>;
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    // Expands to OILF or OIHF, depending on the choice of register.
+    def OIFMux : BinaryRIPseudo<or, GRX32, uimm32>,
+                 Requires<[FeatureHighWord]>;
+    def OILF : BinaryRIL<"oilf", 0xC0D, or, GR32, uimm32>;
+    def OIHF : BinaryRIL<"oihf", 0xC0C, or, GRH32, uimm32>;
+  }
+  def OILF64 : BinaryAliasRIL<or, GR64, imm64lf32>;
+  def OIHF64 : BinaryAliasRIL<or, GR64, imm64hf32>;
 
   // ORs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
@@ -786,6 +907,10 @@ let Defs = [CC] in {
 
   // OR to memory
   defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, uimm8>;
+
+  // Block OR.
+  let mayLoad = 1, mayStore = 1 in
+    defm OC : MemorySS<"oc", 0xD6, z_oc, z_oc_loop>;
 }
 defm : RMWIByte<or, bdaddr12pair, OI>;
 defm : RMWIByte<or, bdaddr20pair, OIY>;
@@ -804,10 +929,15 @@ let Defs = [CC] in {
   // XORs of a 32-bit immediate, leaving other bits unaffected.
   // The CC result only reflects the 32-bit field, which means we can
   // use it as a zero indicator for i32 operations but not otherwise.
-  let isCodeGenOnly = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in
-    def XILF32 : BinaryRIL<"xilf", 0xC07, xor, GR32, uimm32>;
-  def XILF : BinaryRIL<"xilf", 0xC07, xor, GR64, imm64lf32>;
-  def XIHF : BinaryRIL<"xihf", 0xC06, xor, GR64, imm64hf32>;
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    // Expands to XILF or XIHF, depending on the choice of register.
+    def XIFMux : BinaryRIPseudo<xor, GRX32, uimm32>,
+                 Requires<[FeatureHighWord]>;
+    def XILF : BinaryRIL<"xilf", 0xC07, xor, GR32, uimm32>;
+    def XIHF : BinaryRIL<"xihf", 0xC06, xor, GRH32, uimm32>;
+  }
+  def XILF64 : BinaryAliasRIL<xor, GR64, imm64lf32>;
+  def XIHF64 : BinaryAliasRIL<xor, GR64, imm64hf32>;
 
   // XORs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
@@ -817,6 +947,10 @@ let Defs = [CC] in {
 
   // XOR to memory
   defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, uimm8>;
+
+  // Block XOR.
+  let mayLoad = 1, mayStore = 1 in
+    defm XC : MemorySS<"xc", 0xD7, z_xc, z_xc_loop>;
 }
 defm : RMWIByte<xor, bdaddr12pair, XI>;
 defm : RMWIByte<xor, bdaddr20pair, XIY>;
@@ -842,9 +976,9 @@ def MSFI  : BinaryRIL<"msfi",  0xC21, mul, GR32, simm32>;
 def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
 
 // Multiplication of memory.
-defm MH   : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, sextloadi16, 2>;
+defm MH   : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>;
 defm MS   : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>;
-def  MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, sextloadi32, 4>;
+def  MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
 def  MSG  : BinaryRXY<"msg",  0xE30C, mul, GR64, load, 8>;
 
 // Multiplication of a register, producing two results.
@@ -909,13 +1043,15 @@ let Defs = [CC] in {
 
 // Forms of RISBG that only affect one word of the destination register.
 // They do not set CC.
-let isCodeGenOnly = 1 in
-  def RISBLG32 : RotateSelectRIEf<"risblg", 0xEC51, GR32, GR32>,
-                 Requires<[FeatureHighWord]>;
-def RISBHG : RotateSelectRIEf<"risbhg", 0xEC5D, GR64, GR64>,
-             Requires<[FeatureHighWord]>;
-def RISBLG : RotateSelectRIEf<"risblg", 0xEC51, GR64, GR64>,
-             Requires<[FeatureHighWord]>;
+def RISBMux : RotateSelectRIEfPseudo<GRX32, GRX32>, Requires<[FeatureHighWord]>;
+def RISBLL  : RotateSelectAliasRIEf<GR32,  GR32>,  Requires<[FeatureHighWord]>;
+def RISBLH  : RotateSelectAliasRIEf<GR32,  GRH32>, Requires<[FeatureHighWord]>;
+def RISBHL  : RotateSelectAliasRIEf<GRH32, GR32>,  Requires<[FeatureHighWord]>;
+def RISBHH  : RotateSelectAliasRIEf<GRH32, GRH32>, Requires<[FeatureHighWord]>;
+def RISBLG  : RotateSelectRIEf<"risblg", 0xEC51, GR32, GR64>,
+              Requires<[FeatureHighWord]>;
+def RISBHG  : RotateSelectRIEf<"risbhg", 0xEC5D, GRH32, GR64>,
+              Requires<[FeatureHighWord]>;
 
 // Rotate second operand left and perform a logical operation with selected
 // bits of the first operand.  The CC result only describes the selected bits,
@@ -930,39 +1066,50 @@ let Defs = [CC] in {
 // Comparison
 //===----------------------------------------------------------------------===//
 
-// Signed comparisons.
+// Signed comparisons.  We put these before the unsigned comparisons because
+// some of the signed forms have COMPARE AND BRANCH equivalents whereas none
+// of the unsigned forms do.
 let Defs = [CC], CCValues = 0xE in {
   // Comparison with a register.
-  def CR   : CompareRR <"c",   0x19,   z_cmp,     GR32, GR32>;
+  def CR   : CompareRR <"c",   0x19,   z_scmp,    GR32, GR32>;
   def CGFR : CompareRRE<"cgf", 0xB930, null_frag, GR64, GR32>;
-  def CGR  : CompareRRE<"cg",  0xB920, z_cmp,     GR64, GR64>;
+  def CGR  : CompareRRE<"cg",  0xB920, z_scmp,    GR64, GR64>;
 
   // Comparison with a signed 16-bit immediate.
-  def CHI  : CompareRI<"chi",  0xA7E, z_cmp, GR32, imm32sx16>;
-  def CGHI : CompareRI<"cghi", 0xA7F, z_cmp, GR64, imm64sx16>;
-
-  // Comparison with a signed 32-bit immediate.
-  def CFI  : CompareRIL<"cfi",  0xC2D, z_cmp, GR32, simm32>;
-  def CGFI : CompareRIL<"cgfi", 0xC2C, z_cmp, GR64, imm64sx32>;
+  def CHI  : CompareRI<"chi",  0xA7E, z_scmp, GR32, imm32sx16>;
+  def CGHI : CompareRI<"cghi", 0xA7F, z_scmp, GR64, imm64sx16>;
+
+  // Comparison with a signed 32-bit immediate.  CFIMux expands to CFI or CIH,
+  // depending on the choice of register.
+  def CFIMux : CompareRIPseudo<z_scmp, GRX32, simm32>,
+               Requires<[FeatureHighWord]>;
+  def CFI  : CompareRIL<"cfi",  0xC2D, z_scmp, GR32, simm32>;
+  def CIH  : CompareRIL<"cih",  0xCCD, z_scmp, GRH32, simm32>,
+             Requires<[FeatureHighWord]>;
+  def CGFI : CompareRIL<"cgfi", 0xC2C, z_scmp, GR64, imm64sx32>;
 
   // Comparison with memory.
-  defm CH    : CompareRXPair<"ch", 0x49, 0xE379, z_cmp, GR32, sextloadi16, 2>;
-  defm C     : CompareRXPair<"c",  0x59, 0xE359, z_cmp, GR32, load, 4>;
-  def  CGH   : CompareRXY<"cgh", 0xE334, z_cmp, GR64, sextloadi16, 2>;
-  def  CGF   : CompareRXY<"cgf", 0xE330, z_cmp, GR64, sextloadi32, 4>;
-  def  CG    : CompareRXY<"cg",  0xE320, z_cmp, GR64, load, 8>;
-  def  CHRL  : CompareRILPC<"chrl",  0xC65, z_cmp, GR32, aligned_sextloadi16>;
-  def  CRL   : CompareRILPC<"crl",   0xC6D, z_cmp, GR32, aligned_load>;
-  def  CGHRL : CompareRILPC<"cghrl", 0xC64, z_cmp, GR64, aligned_sextloadi16>;
-  def  CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_cmp, GR64, aligned_sextloadi32>;
-  def  CGRL  : CompareRILPC<"cgrl",  0xC68, z_cmp, GR64, aligned_load>;
+  defm CH    : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, asextloadi16, 2>;
+  def  CMux  : CompareRXYPseudo<z_scmp, GRX32, load, 4>,
+               Requires<[FeatureHighWord]>;
+  defm C     : CompareRXPair<"c",  0x59, 0xE359, z_scmp, GR32, load, 4>;
+  def  CHF   : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, load, 4>,
+               Requires<[FeatureHighWord]>;
+  def  CGH   : CompareRXY<"cgh", 0xE334, z_scmp, GR64, asextloadi16, 2>;
+  def  CGF   : CompareRXY<"cgf", 0xE330, z_scmp, GR64, asextloadi32, 4>;
+  def  CG    : CompareRXY<"cg",  0xE320, z_scmp, GR64, load, 8>;
+  def  CHRL  : CompareRILPC<"chrl",  0xC65, z_scmp, GR32, aligned_asextloadi16>;
+  def  CRL   : CompareRILPC<"crl",   0xC6D, z_scmp, GR32, aligned_load>;
+  def  CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_asextloadi16>;
+  def  CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_asextloadi32>;
+  def  CGRL  : CompareRILPC<"cgrl",  0xC68, z_scmp, GR64, aligned_load>;
 
   // Comparison between memory and a signed 16-bit immediate.
-  def CHHSI : CompareSIL<"chhsi", 0xE554, z_cmp, sextloadi16, imm32sx16>;
-  def CHSI  : CompareSIL<"chsi",  0xE55C, z_cmp, load,        imm32sx16>;
-  def CGHSI : CompareSIL<"cghsi", 0xE558, z_cmp, load,        imm64sx16>;
+  def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, asextloadi16, imm32sx16>;
+  def CHSI  : CompareSIL<"chsi",  0xE55C, z_scmp, load, imm32sx16>;
+  def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, load, imm64sx16>;
 }
-defm : SXB<z_cmp, GR64, CGFR>;
+defm : SXB<z_scmp, GR64, CGFR>;
 
 // Unsigned comparisons.
 let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
@@ -971,35 +1118,79 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
   def CLGFR : CompareRRE<"clgf", 0xB931, null_frag, GR64, GR32>;
   def CLGR  : CompareRRE<"clg",  0xB921, z_ucmp,    GR64, GR64>;
 
-  // Comparison with a signed 32-bit immediate.
+  // Comparison with an unsigned 32-bit immediate.  CLFIMux expands to CLFI
+  // or CLIH, depending on the choice of register.
+  def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>,
+                Requires<[FeatureHighWord]>;
   def CLFI  : CompareRIL<"clfi",  0xC2F, z_ucmp, GR32, uimm32>;
+  def CLIH  : CompareRIL<"clih",  0xCCF, z_ucmp, GR32, uimm32>,
+              Requires<[FeatureHighWord]>;
   def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
 
   // Comparison with memory.
+  def  CLMux  : CompareRXYPseudo<z_ucmp, GRX32, load, 4>,
+                Requires<[FeatureHighWord]>;
   defm CL     : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load, 4>;
-  def  CLGF   : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, zextloadi32, 4>;
+  def  CLHF   : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, load, 4>,
+                Requires<[FeatureHighWord]>;
+  def  CLGF   : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, azextloadi32, 4>;
   def  CLG    : CompareRXY<"clg",  0xE321, z_ucmp, GR64, load, 8>;
   def  CLHRL  : CompareRILPC<"clhrl",  0xC67, z_ucmp, GR32,
-                             aligned_zextloadi16>;
+                             aligned_azextloadi16>;
   def  CLRL   : CompareRILPC<"clrl",   0xC6F, z_ucmp, GR32,
                              aligned_load>;
   def  CLGHRL : CompareRILPC<"clghrl", 0xC66, z_ucmp, GR64,
-                             aligned_zextloadi16>;
+                             aligned_azextloadi16>;
   def  CLGFRL : CompareRILPC<"clgfrl", 0xC6E, z_ucmp, GR64,
-                             aligned_zextloadi32>;
+                             aligned_azextloadi32>;
   def  CLGRL  : CompareRILPC<"clgrl",  0xC6A, z_ucmp, GR64,
                              aligned_load>;
 
   // Comparison between memory and an unsigned 8-bit immediate.
-  defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, zextloadi8, imm32zx8>;
+  defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, azextloadi8, imm32zx8>;
 
   // Comparison between memory and an unsigned 16-bit immediate.
-  def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, zextloadi16, imm32zx16>;
-  def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load,        imm32zx16>;
-  def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load,        imm64zx16>;
+  def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, azextloadi16, imm32zx16>;
+  def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
+  def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
 }
 defm : ZXB<z_ucmp, GR64, CLGFR>;
 
+// Memory-to-memory comparison.
+let mayLoad = 1, Defs = [CC] in
+  defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+
+// String comparison.
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+  defm CLST : StringRRE<"clst", 0xB25D, z_strcmp>;
+
+// Test under mask.
+let Defs = [CC] in {
+  // TMxMux expands to TM[LH]x, depending on the choice of register.
+  def TMLMux : CompareRIPseudo<z_tm_reg, GRX32, imm32ll16>,
+               Requires<[FeatureHighWord]>;
+  def TMHMux : CompareRIPseudo<z_tm_reg, GRX32, imm32lh16>,
+               Requires<[FeatureHighWord]>;
+  def TMLL : CompareRI<"tmll", 0xA71, z_tm_reg, GR32, imm32ll16>;
+  def TMLH : CompareRI<"tmlh", 0xA70, z_tm_reg, GR32, imm32lh16>;
+  def TMHL : CompareRI<"tmhl", 0xA73, z_tm_reg, GRH32, imm32ll16>;
+  def TMHH : CompareRI<"tmhh", 0xA72, z_tm_reg, GRH32, imm32lh16>;
+
+  def TMLL64 : CompareAliasRI<z_tm_reg, GR64, imm64ll16>;
+  def TMLH64 : CompareAliasRI<z_tm_reg, GR64, imm64lh16>;
+  def TMHL64 : CompareAliasRI<z_tm_reg, GR64, imm64hl16>;
+  def TMHH64 : CompareAliasRI<z_tm_reg, GR64, imm64hh16>;
+
+  defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, anyextloadi8, imm32zx8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Prefetch
+//===----------------------------------------------------------------------===//
+
+def PFD : PrefetchRXY<"pfd", 0xE336, z_prefetch>;
+def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>;
+
 //===----------------------------------------------------------------------===//
 // Atomic operations
 //===----------------------------------------------------------------------===//
@@ -1024,60 +1215,60 @@ def ATOMIC_LOAD_SGR     : AtomicLoadBinaryReg64<atomic_load_sub_64>;
 def ATOMIC_LOADW_NR     : AtomicLoadWBinaryReg<z_atomic_loadw_and>;
 def ATOMIC_LOADW_NILH   : AtomicLoadWBinaryImm<z_atomic_loadw_and, imm32lh16c>;
 def ATOMIC_LOAD_NR      : AtomicLoadBinaryReg32<atomic_load_and_32>;
-def ATOMIC_LOAD_NILL32  : AtomicLoadBinaryImm32<atomic_load_and_32, imm32ll16c>;
-def ATOMIC_LOAD_NILH32  : AtomicLoadBinaryImm32<atomic_load_and_32, imm32lh16c>;
-def ATOMIC_LOAD_NILF32  : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
+def ATOMIC_LOAD_NILL    : AtomicLoadBinaryImm32<atomic_load_and_32, imm32ll16c>;
+def ATOMIC_LOAD_NILH    : AtomicLoadBinaryImm32<atomic_load_and_32, imm32lh16c>;
+def ATOMIC_LOAD_NILF    : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
 def ATOMIC_LOAD_NGR     : AtomicLoadBinaryReg64<atomic_load_and_64>;
-def ATOMIC_LOAD_NILL    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64ll16c>;
-def ATOMIC_LOAD_NILH    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lh16c>;
-def ATOMIC_LOAD_NIHL    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hl16c>;
-def ATOMIC_LOAD_NIHH    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hh16c>;
-def ATOMIC_LOAD_NILF    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lf32c>;
-def ATOMIC_LOAD_NIHF    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hf32c>;
+def ATOMIC_LOAD_NILL64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64ll16c>;
+def ATOMIC_LOAD_NILH64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lh16c>;
+def ATOMIC_LOAD_NIHL64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hl16c>;
+def ATOMIC_LOAD_NIHH64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hh16c>;
+def ATOMIC_LOAD_NILF64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lf32c>;
+def ATOMIC_LOAD_NIHF64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hf32c>;
 
 def ATOMIC_LOADW_OR     : AtomicLoadWBinaryReg<z_atomic_loadw_or>;
 def ATOMIC_LOADW_OILH   : AtomicLoadWBinaryImm<z_atomic_loadw_or, imm32lh16>;
 def ATOMIC_LOAD_OR      : AtomicLoadBinaryReg32<atomic_load_or_32>;
-def ATOMIC_LOAD_OILL32  : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
-def ATOMIC_LOAD_OILH32  : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
-def ATOMIC_LOAD_OILF32  : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
+def ATOMIC_LOAD_OILL    : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
+def ATOMIC_LOAD_OILH    : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
+def ATOMIC_LOAD_OILF    : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
 def ATOMIC_LOAD_OGR     : AtomicLoadBinaryReg64<atomic_load_or_64>;
-def ATOMIC_LOAD_OILL    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
-def ATOMIC_LOAD_OILH    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
-def ATOMIC_LOAD_OIHL    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
-def ATOMIC_LOAD_OIHH    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
-def ATOMIC_LOAD_OILF    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
-def ATOMIC_LOAD_OIHF    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
+def ATOMIC_LOAD_OILL64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
+def ATOMIC_LOAD_OILH64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
+def ATOMIC_LOAD_OIHL64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
+def ATOMIC_LOAD_OIHH64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
+def ATOMIC_LOAD_OILF64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
+def ATOMIC_LOAD_OIHF64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
 
 def ATOMIC_LOADW_XR     : AtomicLoadWBinaryReg<z_atomic_loadw_xor>;
 def ATOMIC_LOADW_XILF   : AtomicLoadWBinaryImm<z_atomic_loadw_xor, uimm32>;
 def ATOMIC_LOAD_XR      : AtomicLoadBinaryReg32<atomic_load_xor_32>;
-def ATOMIC_LOAD_XILF32  : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
+def ATOMIC_LOAD_XILF    : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
 def ATOMIC_LOAD_XGR     : AtomicLoadBinaryReg64<atomic_load_xor_64>;
-def ATOMIC_LOAD_XILF    : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
-def ATOMIC_LOAD_XIHF    : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
+def ATOMIC_LOAD_XILF64  : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
+def ATOMIC_LOAD_XIHF64  : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
 
 def ATOMIC_LOADW_NRi    : AtomicLoadWBinaryReg<z_atomic_loadw_nand>;
 def ATOMIC_LOADW_NILHi  : AtomicLoadWBinaryImm<z_atomic_loadw_nand,
                                                imm32lh16c>;
 def ATOMIC_LOAD_NRi     : AtomicLoadBinaryReg32<atomic_load_nand_32>;
-def ATOMIC_LOAD_NILL32i : AtomicLoadBinaryImm32<atomic_load_nand_32,
+def ATOMIC_LOAD_NILLi   : AtomicLoadBinaryImm32<atomic_load_nand_32,
                                                 imm32ll16c>;
-def ATOMIC_LOAD_NILH32i : AtomicLoadBinaryImm32<atomic_load_nand_32,
+def ATOMIC_LOAD_NILHi   : AtomicLoadBinaryImm32<atomic_load_nand_32,
                                                 imm32lh16c>;
-def ATOMIC_LOAD_NILF32i : AtomicLoadBinaryImm32<atomic_load_nand_32, uimm32>;
+def ATOMIC_LOAD_NILFi   : AtomicLoadBinaryImm32<atomic_load_nand_32, uimm32>;
 def ATOMIC_LOAD_NGRi    : AtomicLoadBinaryReg64<atomic_load_nand_64>;
-def ATOMIC_LOAD_NILLi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NILL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64ll16c>;
-def ATOMIC_LOAD_NILHi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NILH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64lh16c>;
-def ATOMIC_LOAD_NIHLi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NIHL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64hl16c>;
-def ATOMIC_LOAD_NIHHi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NIHH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64hh16c>;
-def ATOMIC_LOAD_NILFi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NILF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64lf32c>;
-def ATOMIC_LOAD_NIHFi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NIHF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64hf32c>;
 
 def ATOMIC_LOADW_MIN    : AtomicLoadWBinaryReg<z_atomic_loadw_min>;
@@ -1119,6 +1310,10 @@ let Defs = [CC] in {
 // Miscellaneous Instructions.
 //===----------------------------------------------------------------------===//
 
+// Extract CC into bits 29 and 28 of a register.
+let Uses = [CC] in
+  def IPM : InherentRRE<"ipm", 0xB222, GR32, (z_ipm)>;
+
 // Read a 32-bit access register into a GR32.  As with all GR32 operations,
 // the upper 32 bits of the enclosing GR64 remain unchanged, which is useful
 // when a 64-bit address is stored in a pair of access registers.
@@ -1134,19 +1329,11 @@ let Defs = [CC] in {
   def FLOGR : UnaryRRE<"flog", 0xB983, null_frag, GR128, GR64>;
 }
 def : Pat<(ctlz GR64:$src),
-          (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_high)>;
+          (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
 
 // Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
 def : Pat<(i64 (anyext GR32:$src)),
-          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_32bit)>;
-
-// There are no 32-bit equivalents of LLILL and LLILH, so use a full
-// 64-bit move followed by a subreg.  This preserves the invariant that
-// all GR32 operations only modify the low 32 bits.
-def : Pat<(i32 imm32ll16:$src),
-          (EXTRACT_SUBREG (LLILL (LL16 imm:$src)), subreg_32bit)>;
-def : Pat<(i32 imm32lh16:$src),
-          (EXTRACT_SUBREG (LLILH (LH16 imm:$src)), subreg_32bit)>;
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
 
 // Extend GR32s and GR64s to GR128s.
 let usesCustomInserter = 1 in {
@@ -1155,6 +1342,10 @@ let usesCustomInserter = 1 in {
   def ZEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
 }
 
+// Search a block of memory for a character.
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+  defm SRST : StringRRE<"srst", 0xb25e, z_search_string>;
+
 //===----------------------------------------------------------------------===//
 // Peepholes.
 //===----------------------------------------------------------------------===//
@@ -1163,14 +1354,14 @@ let usesCustomInserter = 1 in {
 defm : ZXB<add, GR64, ALGFR>;
 def  : Pat<(add GR64:$src1, imm64zx32:$src2),
            (ALGFI GR64:$src1, imm64zx32:$src2)>;
-def  : Pat<(add GR64:$src1, (zextloadi32 bdxaddr20only:$addr)),
+def  : Pat<(add GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
            (ALGF GR64:$src1, bdxaddr20only:$addr)>;
 
 // Use SL* for GR64 subtractions of unsigned 32-bit values.
 defm : ZXB<sub, GR64, SLGFR>;
 def  : Pat<(add GR64:$src1, imm64zx32n:$src2),
            (SLGFI GR64:$src1, imm64zx32n:$src2)>;
-def  : Pat<(sub GR64:$src1, (zextloadi32 bdxaddr20only:$addr)),
+def  : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
            (SLGF GR64:$src1, bdxaddr20only:$addr)>;
 
 // Optimize sign-extended 1/0 selects to -1/0 selects.  This is important
@@ -1184,3 +1375,19 @@ def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, uimm8zx4:$valid,
                     (i32 63)),
                (i32 63)),
           (Select64 (LGHI -1), (LGHI 0), uimm8zx4:$valid, uimm8zx4:$cc)>;
+
+// Peepholes for turning scalar operations into block operations.
+defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 2>;
+defm : BlockLoadStore<load, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 4>;
+defm : BlockLoadStore<anyextloadi8, i64, MVCSequence, NCSequence,
+                      OCSequence, XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 2>;
+defm : BlockLoadStore<anyextloadi32, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 4>;
+defm : BlockLoadStore<load, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 8>;
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index 114f74e..ba027d4 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -225,11 +225,13 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
       Terminator.ExtraRelaxSize = 6;
       break;
     case SystemZ::CRJ:
-      // Relaxes to a CR/BRCL sequence, which is 2 bytes longer.
+    case SystemZ::CLRJ:
+      // Relaxes to a C(L)R/BRCL sequence, which is 2 bytes longer.
       Terminator.ExtraRelaxSize = 2;
       break;
     case SystemZ::CGRJ:
-      // Relaxes to a CGR/BRCL sequence, which is 4 bytes longer.
+    case SystemZ::CLGRJ:
+      // Relaxes to a C(L)GR/BRCL sequence, which is 4 bytes longer.
       Terminator.ExtraRelaxSize = 4;
       break;
     case SystemZ::CIJ:
@@ -237,6 +239,11 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
       // Relaxes to a C(G)HI/BRCL sequence, which is 4 bytes longer.
       Terminator.ExtraRelaxSize = 4;
       break;
+    case SystemZ::CLIJ:
+    case SystemZ::CLGIJ:
+      // Relaxes to a CL(G)FI/BRCL sequence, which is 6 bytes longer.
+      Terminator.ExtraRelaxSize = 6;
+      break;
     default:
       llvm_unreachable("Unrecognized branch instruction");
     }
@@ -401,6 +408,18 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
   case SystemZ::CGIJ:
     splitCompareBranch(Branch, SystemZ::CGHI);
     break;
+  case SystemZ::CLRJ:
+    splitCompareBranch(Branch, SystemZ::CLR);
+    break;
+  case SystemZ::CLGRJ:
+    splitCompareBranch(Branch, SystemZ::CLGR);
+    break;
+  case SystemZ::CLIJ:
+    splitCompareBranch(Branch, SystemZ::CLFI);
+    break;
+  case SystemZ::CLGIJ:
+    splitCompareBranch(Branch, SystemZ::CLGFI);
+    break;
   default:
     llvm_unreachable("Unrecognized branch");
   }
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.cpp b/lib/Target/SystemZ/SystemZMCInstLower.cpp
index 432a0d3..ff9a6c0 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -15,15 +15,6 @@
 
 using namespace llvm;
 
-// If Opcode is an interprocedural reference that can be shortened,
-// return the short form, otherwise return 0.
-static unsigned getShortenedInstr(unsigned Opcode) {
-  switch (Opcode) {
-  case SystemZ::BRASL: return SystemZ::BRAS;
-  }
-  return Opcode;
-}
-
 // Return the VK_* enumeration for MachineOperand target flags Flags.
 static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
   switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
@@ -35,70 +26,71 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
   llvm_unreachable("Unrecognised MO_ACCESS_MODEL");
 }
 
-SystemZMCInstLower::SystemZMCInstLower(Mangler *mang, MCContext &ctx,
+SystemZMCInstLower::SystemZMCInstLower(MCContext &ctx,
                                        SystemZAsmPrinter &asmprinter)
-  : Mang(mang), Ctx(ctx), AsmPrinter(asmprinter) {}
+  : Ctx(ctx), AsmPrinter(asmprinter) {}
 
-MCOperand SystemZMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
-                                                 const MCSymbol *Symbol,
-                                                 int64_t Offset) const {
-  MCSymbolRefExpr::VariantKind Kind = getVariantKind(MO.getTargetFlags());
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, Kind, Ctx);
-  if (Offset) {
-    const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
-    Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+const MCExpr *
+SystemZMCInstLower::getExpr(const MachineOperand &MO,
+                            MCSymbolRefExpr::VariantKind Kind) const {
+  const MCSymbol *Symbol;
+  bool HasOffset = true;
+  switch (MO.getType()) {
+  case MachineOperand::MO_MachineBasicBlock:
+    Symbol = MO.getMBB()->getSymbol();
+    HasOffset = false;
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    Symbol = AsmPrinter.getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+    Symbol = AsmPrinter.GetJTISymbol(MO.getIndex());
+    HasOffset = false;
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
+    break;
+
+  case MachineOperand::MO_BlockAddress:
+    Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+
+  default:
+    llvm_unreachable("unknown operand type");
   }
-  return MCOperand::CreateExpr(Expr);
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, Kind, Ctx);
+  if (HasOffset)
+    if (int64_t Offset = MO.getOffset()) {
+      const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
+      Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+    }
+  return Expr;
 }
 
 MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const {
   switch (MO.getType()) {
-  default:
-    llvm_unreachable("unknown operand type");
-
   case MachineOperand::MO_Register:
     return MCOperand::CreateReg(MO.getReg());
 
   case MachineOperand::MO_Immediate:
     return MCOperand::CreateImm(MO.getImm());
 
-  case MachineOperand::MO_MachineBasicBlock:
-    return lowerSymbolOperand(MO, MO.getMBB()->getSymbol(),
-                              /* MO has no offset field */0);
-
-  case MachineOperand::MO_GlobalAddress:
-    return lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal()),
-                              MO.getOffset());
-
-  case MachineOperand::MO_ExternalSymbol: {
-    StringRef Name = MO.getSymbolName();
-    return lowerSymbolOperand(MO, AsmPrinter.GetExternalSymbolSymbol(Name),
-                              MO.getOffset());
-  }
-
-  case MachineOperand::MO_JumpTableIndex:
-    return lowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()),
-                              /* MO has no offset field */0);
-
-  case MachineOperand::MO_ConstantPoolIndex:
-    return lowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()),
-                              MO.getOffset());
-
-  case MachineOperand::MO_BlockAddress: {
-    const BlockAddress *BA = MO.getBlockAddress();
-    return lowerSymbolOperand(MO, AsmPrinter.GetBlockAddressSymbol(BA),
-                              MO.getOffset());
+  default: {
+    MCSymbolRefExpr::VariantKind Kind = getVariantKind(MO.getTargetFlags());
+    return MCOperand::CreateExpr(getExpr(MO, Kind));
   }
   }
 }
 
 void SystemZMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
-  unsigned Opcode = MI->getOpcode();
-  // When emitting binary code, start with the shortest form of an instruction
-  // and then relax it where necessary.
-  if (!AsmPrinter.OutStreamer.hasRawTextSupport())
-    Opcode = getShortenedInstr(Opcode);
-  OutMI.setOpcode(Opcode);
+  OutMI.setOpcode(MI->getOpcode());
   for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
     const MachineOperand &MO = MI->getOperand(I);
     // Ignore all implicit register operands.
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.h b/lib/Target/SystemZ/SystemZMCInstLower.h
index db5bdb0..f6d5ac8 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.h
+++ b/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -10,27 +10,24 @@
 #ifndef LLVM_SYSTEMZMCINSTLOWER_H
 #define LLVM_SYSTEMZMCINSTLOWER_H
 
+#include "llvm/MC/MCExpr.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
-class MCContext;
 class MCInst;
 class MCOperand;
-class MCSymbol;
 class MachineInstr;
 class MachineOperand;
 class Mangler;
 class SystemZAsmPrinter;
 
 class LLVM_LIBRARY_VISIBILITY SystemZMCInstLower {
-  Mangler *Mang;
   MCContext &Ctx;
   SystemZAsmPrinter &AsmPrinter;
 
 public:
-  SystemZMCInstLower(Mangler *mang, MCContext &ctx,
-                     SystemZAsmPrinter &asmPrinter);
+  SystemZMCInstLower(MCContext &ctx, SystemZAsmPrinter &asmPrinter);
 
   // Lower MachineInstr MI to MCInst OutMI.
   void lower(const MachineInstr *MI, MCInst &OutMI) const;
@@ -38,9 +35,9 @@ public:
   // Return an MCOperand for MO.
   MCOperand lowerOperand(const MachineOperand& MO) const;
 
-  // Return an MCOperand for MO, given that it equals Symbol + Offset.
-  MCOperand lowerSymbolOperand(const MachineOperand &MO,
-                               const MCSymbol *Symbol, int64_t Offset) const;
+  // Return an MCExpr for symbolic operand MO with variant kind Kind.
+  const MCExpr *getExpr(const MachineOperand &MO,
+                        MCSymbolRefExpr::VariantKind Kind) const;
 };
 } // end namespace llvm
 
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
new file mode 100644
index 0000000..00572d0
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
@@ -0,0 +1,17 @@
+//== SystemZMachineFuctionInfo.cpp - SystemZ machine function info-*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineFunctionInfo.h"
+
+using namespace llvm;
+
+
+// pin vtable to this file
+void SystemZMachineFunctionInfo::anchor() {}
+
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index 69c2691..845291f 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -15,6 +15,7 @@
 namespace llvm {
 
 class SystemZMachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
   unsigned LowSavedGPR;
   unsigned HighSavedGPR;
   unsigned VarArgsFirstGPR;
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 9d79439..3ad146c 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -46,7 +46,8 @@ class PCRelOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
 // address with address size VT.  SELF is the name of the operand and
 // ASMOP is the associated asm operand.
 class PCRelAddress<ValueType vt, string self, AsmOperandClass asmop>
-  : ComplexPattern<vt, 1, "selectPCRelAddress", [z_pcrel_wrapper]>,
+  : ComplexPattern<vt, 1, "selectPCRelAddress",
+                   [z_pcrel_wrapper, z_pcrel_offset]>,
     PCRelOperand<vt, asmop> {
   let MIOperandInfo = (ops !cast<Operand>(self));
 }
@@ -219,11 +220,6 @@ def uimm8    : Immediate<i8, [{}], UIMM8, "U8Imm">;
 // i32 immediates
 //===----------------------------------------------------------------------===//
 
-// Immediates for 8-bit lengths.
-def imm32len8 : Immediate<i32, [{
-  return isUInt<8>(N->getZExtValue() - 1);
-}], NOOP_SDNodeXForm, "U32Imm">;
-
 // Immediates for the lower and upper 16 bits of an i32, with the other
 // bits of the i32 being zero.
 def imm32ll16 : Immediate<i32, [{
@@ -338,6 +334,10 @@ def imm64sx8 : Immediate<i64, [{
   return isInt<8>(N->getSExtValue());
 }], SIMM8, "S8Imm">;
 
+def imm64zx8 : Immediate<i64, [{
+  return isUInt<8>(N->getSExtValue());
+}], UIMM8, "U8Imm">;
+
 def imm64sx16 : Immediate<i64, [{
   return isInt<16>(N->getSExtValue());
 }], SIMM16, "S16Imm">;
@@ -358,7 +358,7 @@ def imm64zx32n : Immediate<i64, [{
   return isUInt<32>(-N->getSExtValue());
 }], NEGIMM32, "U32Imm">;
 
-def imm64 : ImmLeaf<i64, [{}]>;
+def imm64 : ImmLeaf<i64, [{}]>, Operand<i64>;
 
 //===----------------------------------------------------------------------===//
 // Floating-point immediates
@@ -396,19 +396,6 @@ def pcrel32 : PCRelAddress<i64, "pcrel32", PCRel32> {
   let DecoderMethod = "decodePC32DBLOperand";
 }
 
-// A PC-relative offset of a global value when the value is used as a
-// call target.  The offset is sign-extended and multiplied by 2.
-def pcrel16call : PCRelAddress<i64, "pcrel16call", PCRel16> {
-  let PrintMethod = "printCallOperand";
-  let EncoderMethod = "getPLT16DBLEncoding";
-  let DecoderMethod = "decodePC16DBLOperand";
-}
-def pcrel32call : PCRelAddress<i64, "pcrel32call", PCRel32> {
-  let PrintMethod = "printCallOperand";
-  let EncoderMethod = "getPLT32DBLEncoding";
-  let DecoderMethod = "decodePC32DBLOperand";
-}
-
 //===----------------------------------------------------------------------===//
 // Addressing modes
 //===----------------------------------------------------------------------===//
@@ -435,6 +422,7 @@ def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr",  "64", "12", "Len8">;
 // <type> is one of:
 //   shift    : base + displacement (32-bit)
 //   bdaddr   : base + displacement
+//   mviaddr  : like bdaddr, but reject cases with a natural index
 //   bdxaddr  : base + displacement + index
 //   laaddr   : like bdxaddr, but used for Load Address operations
 //   dynalloc : base + displacement + index + ADJDYNALLOC
@@ -460,6 +448,8 @@ def bdaddr12only      : BDMode <"BDAddr",   "64", "12", "Only">;
 def bdaddr12pair      : BDMode <"BDAddr",   "64", "12", "Pair">;
 def bdaddr20only      : BDMode <"BDAddr",   "64", "20", "Only">;
 def bdaddr20pair      : BDMode <"BDAddr",   "64", "20", "Pair">;
+def mviaddr12pair     : BDMode <"MVIAddr",  "64", "12", "Pair">;
+def mviaddr20pair     : BDMode <"MVIAddr",  "64", "20", "Pair">;
 def bdxaddr12only     : BDXMode<"BDXAddr",  "64", "12", "Only">;
 def bdxaddr12pair     : BDXMode<"BDXAddr",  "64", "12", "Pair">;
 def bdxaddr20only     : BDXMode<"BDXAddr",  "64", "20", "Only">;
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 6a3af2b..31cabaa 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -15,6 +15,9 @@ def SDT_CallSeqEnd          : SDCallSeqEnd<[SDTCisVT<0, i64>,
                                             SDTCisVT<1, i64>]>;
 def SDT_ZCall               : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
 def SDT_ZCmp                : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_ZICmp               : SDTypeProfile<0, 3,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisVT<2, i32>]>;
 def SDT_ZBRCCMask           : SDTypeProfile<0, 3,
                                             [SDTCisVT<0, i8>,
                                              SDTCisVT<1, i8>,
@@ -27,6 +30,10 @@ def SDT_ZSelectCCMask       : SDTypeProfile<1, 4,
 def SDT_ZWrapPtr            : SDTypeProfile<1, 1,
                                             [SDTCisSameAs<0, 1>,
                                              SDTCisPtrTy<0>]>;
+def SDT_ZWrapOffset         : SDTypeProfile<1, 2,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisPtrTy<0>]>;
 def SDT_ZAdjDynAlloc        : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
 def SDT_ZExtractAccess      : SDTypeProfile<1, 1,
                                             [SDTCisVT<0, i32>,
@@ -54,10 +61,24 @@ def SDT_ZAtomicCmpSwapW     : SDTypeProfile<1, 6,
                                              SDTCisVT<4, i32>,
                                              SDTCisVT<5, i32>,
                                              SDTCisVT<6, i32>]>;
-def SDT_ZCopy               : SDTypeProfile<0, 3,
+def SDT_ZMemMemLength       : SDTypeProfile<0, 3,
                                             [SDTCisPtrTy<0>,
                                              SDTCisPtrTy<1>,
-                                             SDTCisVT<2, i32>]>;
+                                             SDTCisVT<2, i64>]>;
+def SDT_ZMemMemLoop         : SDTypeProfile<0, 4,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i64>,
+                                             SDTCisVT<3, i64>]>;
+def SDT_ZString             : SDTypeProfile<1, 3,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisPtrTy<2>,
+                                             SDTCisVT<3, i32>]>;
+def SDT_ZI32Intrinsic       : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
+def SDT_ZPrefetch           : SDTypeProfile<0, 2,
+                                            [SDTCisVT<0, i8>,
+                                             SDTCisPtrTy<1>]>;
 
 //===----------------------------------------------------------------------===//
 // Node definitions
@@ -76,9 +97,15 @@ def z_retflag           : SDNode<"SystemZISD::RET_FLAG", SDTNone,
 def z_call              : SDNode<"SystemZISD::CALL", SDT_ZCall,
                                  [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
                                   SDNPVariadic]>;
+def z_sibcall           : SDNode<"SystemZISD::SIBCALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                                  SDNPVariadic]>;
 def z_pcrel_wrapper     : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
-def z_cmp               : SDNode<"SystemZISD::CMP", SDT_ZCmp, [SDNPOutGlue]>;
-def z_ucmp              : SDNode<"SystemZISD::UCMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_pcrel_offset      : SDNode<"SystemZISD::PCREL_OFFSET",
+                                 SDT_ZWrapOffset, []>;
+def z_icmp              : SDNode<"SystemZISD::ICMP", SDT_ZICmp, [SDNPOutGlue]>;
+def z_fcmp              : SDNode<"SystemZISD::FCMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_tm                : SDNode<"SystemZISD::TM", SDT_ZICmp, [SDNPOutGlue]>;
 def z_br_ccmask         : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
                                  [SDNPHasChain, SDNPInGlue]>;
 def z_select_ccmask     : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
@@ -109,13 +136,56 @@ def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">;
 def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
 def z_atomic_cmp_swapw  : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>;
 
-def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZCopy,
+def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_mvc_loop          : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc                : SDNode<"SystemZISD::NC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc_loop           : SDNode<"SystemZISD::NC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc                : SDNode<"SystemZISD::OC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc_loop           : SDNode<"SystemZISD::OC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc                : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc_loop           : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_clc               : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_clc_loop          : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_strcmp            : SDNode<"SystemZISD::STRCMP", SDT_ZString,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_stpcpy            : SDNode<"SystemZISD::STPCPY", SDT_ZString,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_search_string     : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZString,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_ipm               : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic,
+                                 [SDNPInGlue]>;
+def z_prefetch          : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch,
+                                 [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+                                  SDNPMemOperand]>;
 
 //===----------------------------------------------------------------------===//
 // Pattern fragments
 //===----------------------------------------------------------------------===//
 
+// Signed and unsigned comparisons.
+def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+  unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  return Type != SystemZICMP::UnsignedOnly;
+}]>;
+def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+  unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  return Type != SystemZICMP::SignedOnly;
+}]>;
+
+// Register- and memory-based TEST UNDER MASK.
+def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, imm)>;
+def z_tm_mem : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, 0)>;
+
 // Register sign-extend operations.  Sub-32-bit values are represented as i32s.
 def sext8  : PatFrag<(ops node:$src), (sext_inreg node:$src, i8)>;
 def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>;
@@ -130,6 +200,36 @@ def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>;
 def loadf32 : PatFrag<(ops node:$src), (f32 (load node:$src))>;
 def loadf64 : PatFrag<(ops node:$src), (f64 (load node:$src))>;
 
+// Extending loads in which the extension type can be signed.
+def asextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+  return Type == ISD::EXTLOAD || Type == ISD::SEXTLOAD;
+}]>;
+def asextloadi8 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def asextloadi16 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def asextloadi32 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending loads in which the extension type can be unsigned.
+def azextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+  return Type == ISD::EXTLOAD || Type == ISD::ZEXTLOAD;
+}]>;
+def azextloadi8 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def azextloadi16 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def azextloadi32 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
 // Extending loads in which the extension type doesn't matter.
 def anyextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
   return cast<LoadSDNode>(N)->getExtensionType() != ISD::NON_EXTLOAD;
@@ -150,11 +250,11 @@ class AlignedLoad<SDPatternOperator load>
   LoadSDNode *Load = cast<LoadSDNode>(N);
   return Load->getAlignment() >= Load->getMemoryVT().getStoreSize();
 }]>;
-def aligned_load        : AlignedLoad<load>;
-def aligned_sextloadi16 : AlignedLoad<sextloadi16>;
-def aligned_sextloadi32 : AlignedLoad<sextloadi32>;
-def aligned_zextloadi16 : AlignedLoad<zextloadi16>;
-def aligned_zextloadi32 : AlignedLoad<zextloadi32>;
+def aligned_load         : AlignedLoad<load>;
+def aligned_asextloadi16 : AlignedLoad<asextloadi16>;
+def aligned_asextloadi32 : AlignedLoad<asextloadi32>;
+def aligned_azextloadi16 : AlignedLoad<azextloadi16>;
+def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
 
 // Aligned stores.
 class AlignedStore<SDPatternOperator store>
@@ -189,6 +289,31 @@ def nonvolatile_truncstorei8  : NonvolatileStore<truncstorei8>;
 def nonvolatile_truncstorei16 : NonvolatileStore<truncstorei16>;
 def nonvolatile_truncstorei32 : NonvolatileStore<truncstorei32>;
 
+// A store of a load that can be implemented using MVC.
+def mvc_store : PatFrag<(ops node:$value, node:$addr),
+                        (unindexedstore node:$value, node:$addr),
+                        [{ return storeLoadCanUseMVC(N); }]>;
+
+// Binary read-modify-write operations on memory in which the other
+// operand is also memory and for which block operations like NC can
+// be used.  There are two patterns for each operator, depending on
+// which operand contains the "other" load.
+multiclass block_op<SDPatternOperator operator> {
+  def "1" : PatFrag<(ops node:$value, node:$addr),
+                    (unindexedstore (operator node:$value,
+                                              (unindexedload node:$addr)),
+                                    node:$addr),
+                    [{ return storeLoadCanUseBlockBinary(N, 0); }]>;
+  def "2" : PatFrag<(ops node:$value, node:$addr),
+                    (unindexedstore (operator (unindexedload node:$addr),
+                                              node:$value),
+                                    node:$addr),
+                    [{ return storeLoadCanUseBlockBinary(N, 1); }]>;
+}
+defm block_and : block_op<and>;
+defm block_or  : block_op<or>;
+defm block_xor : block_op<xor>;
+
 // Insertions.
 def inserti8 : PatFrag<(ops node:$src1, node:$src2),
                        (or (and node:$src1, -256), node:$src2)>;
@@ -221,6 +346,16 @@ def or_as_revinserti8 : PatFrag<(ops node:$src1, node:$src2),
                                    APInt::getLowBitsSet(BitWidth, 8));
 }]>;
 
+// Integer absolute, matching the canonical form generated by DAGCombiner.
+def z_iabs32 : PatFrag<(ops node:$src),
+                       (xor (add node:$src, (sra node:$src, (i32 31))),
+                            (sra node:$src, (i32 31)))>;
+def z_iabs64 : PatFrag<(ops node:$src),
+                       (xor (add node:$src, (sra node:$src, (i32 63))),
+                            (sra node:$src, (i32 63)))>;
+def z_inegabs32 : PatFrag<(ops node:$src), (ineg (z_iabs32 node:$src))>;
+def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>;
+
 // Fused multiply-add and multiply-subtract, but with the order of the
 // operands matching SystemZ's MA and MS instructions.
 def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td
index c442ae0..7706351 100644
--- a/lib/Target/SystemZ/SystemZPatterns.td
+++ b/lib/Target/SystemZ/SystemZPatterns.td
@@ -13,7 +13,7 @@ multiclass SXU<SDPatternOperator operator, Instruction insn> {
   def : Pat<(operator (sext (i32 GR32:$src))),
             (insn GR32:$src)>;
   def : Pat<(operator (sext_inreg GR64:$src, i32)),
-            (insn (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
+            (insn (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
 }
 
 // Record that INSN performs a 64-bit version of binary operator OPERATOR
@@ -24,7 +24,7 @@ multiclass SXB<SDPatternOperator operator, RegisterOperand cls,
   def : Pat<(operator cls:$src1, (sext GR32:$src2)),
             (insn cls:$src1, GR32:$src2)>;
   def : Pat<(operator cls:$src1, (sext_inreg GR64:$src2, i32)),
-            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_32bit))>;
+            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
 }
 
 // Like SXB, but for zero extension.
@@ -33,7 +33,7 @@ multiclass ZXB<SDPatternOperator operator, RegisterOperand cls,
   def : Pat<(operator cls:$src1, (zext GR32:$src2)),
             (insn cls:$src1, GR32:$src2)>;
   def : Pat<(operator cls:$src1, (and GR64:$src2, 0xffffffff)),
-            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_32bit))>;
+            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
 }
 
 // Record that INSN performs a binary read-modify-write operation,
@@ -66,22 +66,87 @@ multiclass InsertMem<string type, Instruction insn, RegisterOperand cls,
             (insn cls:$src1, mode:$src2)>;
 }
 
-// Use MVC instruction INSN for a load of type LOAD followed by a store
-// of type STORE.  VT is the type of the intermediate register and LENGTH
-// is the number of bytes to copy (which may be smaller than VT).
-multiclass MVCLoadStore<SDPatternOperator load, SDPatternOperator store,
-                        ValueType vt, Instruction insn, bits<5> length> {
-  def Pat : PatFrag<(ops node:$dest, node:$src),
-                    (store (vt (load node:$src)), node:$dest),
-                    [{ return storeLoadCanUseMVC(N); }]>;
+// INSN stores the low 32 bits of a GPR to a memory with addressing mode MODE.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64<Instruction insn, SDPatternOperator operator,
+                AddressingMode mode>
+  : Pat<(operator GR64:$R1, mode:$XBD2),
+        (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), mode:$XBD2)>;
 
-  def : Pat<(!cast<SDPatternOperator>(NAME##"Pat") bdaddr12only:$dest,
-                                                   bdaddr12only:$src),
+// INSN and INSNY are an RX/RXY pair of instructions that store the low
+// 32 bits of a GPR to memory.  Record that they are equivalent to using
+// OPERATOR to store a GR64.
+multiclass StoreGR64Pair<Instruction insn, Instruction insny,
+                         SDPatternOperator operator> {
+  def : StoreGR64<insn, operator, bdxaddr12pair>;
+  def : StoreGR64<insny, operator, bdxaddr20pair>;
+}
+
+// INSN stores the low 32 bits of a GPR using PC-relative addressing.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64PC<Instruction insn, SDPatternOperator operator>
+  : Pat<(operator GR64:$R1, pcrel32:$XBD2),
+        (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), pcrel32:$XBD2)> {
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+// INSN and INSNINV conditionally store the low 32 bits of a GPR to memory,
+// with INSN storing when the condition is true and INSNINV storing when the
+// condition is false.  Record that they are equivalent to a LOAD/select/STORE
+// sequence for GR64s.
+multiclass CondStores64<Instruction insn, Instruction insninv,
+                        SDPatternOperator store, SDPatternOperator load,
+                        AddressingMode mode> {
+  def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr),
+                                    uimm8zx4:$valid, uimm8zx4:$cc),
+                   mode:$addr),
+            (insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+                  uimm8zx4:$valid, uimm8zx4:$cc)>;
+  def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new,
+                                    uimm8zx4:$valid, uimm8zx4:$cc),
+                   mode:$addr),
+            (insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+                     uimm8zx4:$valid, uimm8zx4:$cc)>;
+}
+
+// Try to use MVC instruction INSN for a load of type LOAD followed by a store
+// of the same size.  VT is the type of the intermediate (legalized) value and
+// LENGTH is the number of bytes loaded by LOAD.
+multiclass MVCLoadStore<SDPatternOperator load, ValueType vt, Instruction insn,
+                        bits<5> length> {
+  def : Pat<(mvc_store (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
             (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
 }
 
+// Use NC-like instruction INSN for block_op operation OPERATOR.
+// The other operand is a load of type LOAD, which accesses LENGTH bytes.
+// VT is the intermediate legalized type in which the binary operation
+// is actually done.
+multiclass BinaryLoadStore<SDPatternOperator operator, SDPatternOperator load,
+                           ValueType vt, Instruction insn, bits<5> length> {
+  def : Pat<(operator (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
+            (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
+}
+
+// A convenient way of generating all block peepholes for a particular
+// LOAD/VT/LENGTH combination.
+multiclass BlockLoadStore<SDPatternOperator load, ValueType vt,
+                          Instruction mvc, Instruction nc, Instruction oc,
+                          Instruction xc, bits<5> length> {
+  defm : MVCLoadStore<load, vt, mvc, length>;
+  defm : BinaryLoadStore<block_and1, load, vt, nc, length>;
+  defm : BinaryLoadStore<block_and2, load, vt, nc, length>;
+  defm : BinaryLoadStore<block_or1,  load, vt, oc, length>;
+  defm : BinaryLoadStore<block_or2,  load, vt, oc, length>;
+  defm : BinaryLoadStore<block_xor1, load, vt, xc, length>;
+  defm : BinaryLoadStore<block_xor2, load, vt, xc, length>;
+}
+
 // Record that INSN is a LOAD AND TEST that can be used to compare
 // registers in CLS against zero.  The instruction has separate R1 and R2
 // operands, but they must be the same when the instruction is used like this.
 class CompareZeroFP<Instruction insn, RegisterOperand cls>
-  : Pat<(z_cmp cls:$reg, (fpimm0)), (insn cls:$reg, cls:$reg)>;
+  : Pat<(z_fcmp cls:$reg, (fpimm0)), (insn cls:$reg, cls:$reg)>;
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
index 7e14aa7..f241fb0 100644
--- a/lib/Target/SystemZ/SystemZProcessors.td
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -31,8 +31,16 @@ def FeatureHighWord : SystemZFeature<
   "Assume that the high-word facility is installed"
 >;
 
-def : Processor<"z10",   NoItineraries, []>;
-def : Processor<"z196",  NoItineraries,
-                [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord]>;
+def FeatureFPExtension : SystemZFeature<
+  "fp-extension", "FPExtension",
+  "Assume that the floating-point extension facility is installed"
+>;
+
+def : Processor<"generic", NoItineraries, []>;
+def : Processor<"z10", NoItineraries, []>;
+def : Processor<"z196", NoItineraries,
+                [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
+                 FeatureFPExtension]>;
 def : Processor<"zEC12", NoItineraries,
-                [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord]>;
+                [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
+                 FeatureFPExtension]>;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 8ce6d6a..b61ae88 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -42,13 +42,15 @@ SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   if (TFI->hasFP(MF)) {
     // R11D is the frame pointer.  Reserve all aliases.
     Reserved.set(SystemZ::R11D);
-    Reserved.set(SystemZ::R11W);
+    Reserved.set(SystemZ::R11L);
+    Reserved.set(SystemZ::R11H);
     Reserved.set(SystemZ::R10Q);
   }
 
   // R15D is the stack pointer.  Reserve all aliases.
   Reserved.set(SystemZ::R15D);
-  Reserved.set(SystemZ::R15W);
+  Reserved.set(SystemZ::R15L);
+  Reserved.set(SystemZ::R15H);
   Reserved.set(SystemZ::R14Q);
   return Reserved;
 }
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index c447e4d..13f45fa 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -22,10 +22,10 @@ namespace SystemZ {
   // Return the subreg to use for referring to the even and odd registers
   // in a GR128 pair.  Is32Bit says whether we want a GR32 or GR64.
   inline unsigned even128(bool Is32bit) {
-    return Is32bit ? subreg_32bit : subreg_high;
+    return Is32bit ? subreg_hl32 : subreg_h64;
   }
   inline unsigned odd128(bool Is32bit) {
-    return Is32bit ? subreg_low32 : subreg_low;
+    return Is32bit ? subreg_l32 : subreg_l64;
   }
 }
 
@@ -48,6 +48,10 @@ public:
     LLVM_OVERRIDE {
     return true;
   }
+  virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const
+    LLVM_OVERRIDE {
+    return true;
+  }
   virtual const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0)
     const LLVM_OVERRIDE;
   virtual BitVector getReservedRegs(const MachineFunction &MF)
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index ffffe72..93d7c83 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -21,11 +21,12 @@ class SystemZRegWithSubregs<string n, list<Register> subregs>
 }
 
 let Namespace = "SystemZ" in {
-def subreg_32bit  : SubRegIndex<32>; // could also be named "subreg_high32"
-// Indices are used in a variety of ways, so don't set an Offset.
-def subreg_high   : SubRegIndex<64, -1>;
-def subreg_low    : SubRegIndex<64, -1>;
-def subreg_low32  : ComposedSubRegIndex<subreg_low, subreg_32bit>;
+def subreg_l32   : SubRegIndex<32, 0>;  // Also acts as subreg_ll32.
+def subreg_h32   : SubRegIndex<32, 32>; // Also acts as subreg_lh32.
+def subreg_l64   : SubRegIndex<64, 0>;
+def subreg_h64   : SubRegIndex<64, 64>;
+def subreg_hh32  : ComposedSubRegIndex<subreg_h64, subreg_h32>;
+def subreg_hl32  : ComposedSubRegIndex<subreg_h64, subreg_l32>;
 }
 
 // Define a register class that contains values of type TYPE and an
@@ -55,36 +56,49 @@ class GPR32<bits<16> num, string n> : SystemZReg<n> {
 }
 
 // One of the 16 64-bit general-purpose registers.
-class GPR64<bits<16> num, string n, GPR32 low>
- : SystemZRegWithSubregs<n, [low]> {
+class GPR64<bits<16> num, string n, GPR32 low, GPR32 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
   let HWEncoding = num;
-  let SubRegIndices = [subreg_32bit];
+  let SubRegIndices = [subreg_l32, subreg_h32];
 }
 
 // 8 even-odd pairs of GPR64s.
-class GPR128<bits<16> num, string n, GPR64 high, GPR64 low>
- : SystemZRegWithSubregs<n, [high, low]> {
+class GPR128<bits<16> num, string n, GPR64 low, GPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
   let HWEncoding = num;
-  let SubRegIndices = [subreg_high, subreg_low];
+  let SubRegIndices = [subreg_l64, subreg_h64];
 }
 
 // General-purpose registers
 foreach I = 0-15 in {
-  def R#I#W : GPR32<I, "r"#I>;
-  def R#I#D : GPR64<I, "r"#I, !cast<GPR32>("R"#I#"W")>, DwarfRegNum<[I]>;
+  def R#I#L : GPR32<I, "r"#I>;
+  def R#I#H : GPR32<I, "r"#I>;
+  def R#I#D : GPR64<I, "r"#I, !cast<GPR32>("R"#I#"L"), !cast<GPR32>("R"#I#"H")>,
+                    DwarfRegNum<[I]>;
 }
 
 foreach I = [0, 2, 4, 6, 8, 10, 12, 14] in {
-  def R#I#Q : GPR128<I, "r"#I, !cast<GPR64>("R"#I#"D"),
-                     !cast<GPR64>("R"#!add(I, 1)#"D")>;
+  def R#I#Q : GPR128<I, "r"#I, !cast<GPR64>("R"#!add(I, 1)#"D"),
+                     !cast<GPR64>("R"#I#"D")>;
 }
 
 /// Allocate the callee-saved R6-R13 backwards. That way they can be saved
 /// together with R14 and R15 in one prolog instruction.
-defm GR32 : SystemZRegClass<"GR32", i32, 32, (add (sequence "R%uW",  0, 5),
-                                                  (sequence "R%uW", 15, 6))>;
-defm GR64 : SystemZRegClass<"GR64", i64, 64, (add (sequence "R%uD",  0, 5),
-                                                  (sequence "R%uD", 15, 6))>;
+defm GR32  : SystemZRegClass<"GR32",  i32, 32, (add (sequence "R%uL",  0, 5),
+                                                    (sequence "R%uL", 15, 6))>;
+defm GRH32 : SystemZRegClass<"GRH32", i32, 32, (add (sequence "R%uH",  0, 5),
+                                                    (sequence "R%uH", 15, 6))>;
+defm GR64  : SystemZRegClass<"GR64",  i64, 64, (add (sequence "R%uD",  0, 5),
+                                                    (sequence "R%uD", 15, 6))>;
+
+// Combine the low and high GR32s into a single class.  This can only be
+// used for virtual registers if the high-word facility is available.
+defm GRX32 : SystemZRegClass<"GRX32", i32, 32,
+                             (add (sequence "R%uL",  0, 5),
+                                  (sequence "R%uH",  0, 5),
+                                  R15L, R15H, R14L, R14H, R13L, R13H,
+                                  R12L, R12H, R11L, R11H, R10L, R10H,
+                                  R9L, R9H, R8L, R8H, R7L, R7H, R6L, R6H)>;
 
 // The architecture doesn't really have any i128 support, so model the
 // register pairs as untyped instead.
@@ -94,7 +108,7 @@ defm GR128 : SystemZRegClass<"GR128", untyped, 128, (add R0Q, R2Q, R4Q,
 
 // Base and index registers.  Everything except R0, which in an address
 // context evaluates as 0.
-defm ADDR32 : SystemZRegClass<"ADDR32", i32, 32, (sub GR32Bit, R0W)>;
+defm ADDR32 : SystemZRegClass<"ADDR32", i32, 32, (sub GR32Bit, R0L)>;
 defm ADDR64 : SystemZRegClass<"ADDR64", i64, 64, (sub GR64Bit, R0D)>;
 
 // Not used directly, but needs to exist for ADDR32 and ADDR64 subregs
@@ -114,14 +128,14 @@ class FPR32<bits<16> num, string n> : SystemZReg<n> {
 class FPR64<bits<16> num, string n, FPR32 low>
  : SystemZRegWithSubregs<n, [low]> {
   let HWEncoding = num;
-  let SubRegIndices = [subreg_32bit];
+  let SubRegIndices = [subreg_h32];
 }
 
 // 8 pairs of FPR64s, with a one-register gap inbetween.
-class FPR128<bits<16> num, string n, FPR64 high, FPR64 low>
- : SystemZRegWithSubregs<n, [high, low]> {
+class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
   let HWEncoding = num;
-  let SubRegIndices = [subreg_high, subreg_low];
+  let SubRegIndices = [subreg_l64, subreg_h64];
 }
 
 // Floating-point registers
@@ -132,8 +146,8 @@ foreach I = 0-15 in {
 }
 
 foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
-  def F#I#Q  : FPR128<I, "f"#I, !cast<FPR64>("F"#I#"D"),
-                     !cast<FPR64>("F"#!add(I, 2)#"D")>;
+  def F#I#Q  : FPR128<I, "f"#I, !cast<FPR64>("F"#!add(I, 2)#"D"),
+                     !cast<FPR64>("F"#I#"D")>;
 }
 
 // There's no store-multiple instruction for FPRs, so we're not fussy
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 4ca9292..c7ebb5d 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -25,6 +25,34 @@ SystemZSelectionDAGInfo(const SystemZTargetMachine &TM)
 SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() {
 }
 
+// Decide whether it is best to use a loop or straight-line code for
+// a block operation of Size bytes with source address Src and destination
+// address Dest.  Sequence is the opcode to use for straight-line code
+// (such as MVC) and Loop is the opcode to use for loops (such as MVC_LOOP).
+// Return the chain for the completed operation.
+static SDValue emitMemMem(SelectionDAG &DAG, SDLoc DL, unsigned Sequence,
+                          unsigned Loop, SDValue Chain, SDValue Dst,
+                          SDValue Src, uint64_t Size) {
+  EVT PtrVT = Src.getValueType();
+  // The heuristic we use is to prefer loops for anything that would
+  // require 7 or more MVCs.  With these kinds of sizes there isn't
+  // much to choose between straight-line code and looping code,
+  // since the time will be dominated by the MVCs themselves.
+  // However, the loop has 4 or 5 instructions (depending on whether
+  // the base addresses can be proved equal), so there doesn't seem
+  // much point using a loop for 5 * 256 bytes or fewer.  Anything in
+  // the range (5 * 256, 6 * 256) will need another instruction after
+  // the loop, so it doesn't seem worth using a loop then either.
+  // The next value up, 6 * 256, can be implemented in the same
+  // number of straight-line MVCs as 6 * 256 - 1.
+  if (Size > 6 * 256)
+    return DAG.getNode(Loop, DL, MVT::Other, Chain, Dst, Src,
+                       DAG.getConstant(Size, PtrVT),
+                       DAG.getConstant(Size / 256, PtrVT));
+  return DAG.getNode(Sequence, DL, MVT::Other, Chain, Dst, Src,
+                     DAG.getConstant(Size, PtrVT));
+}
+
 SDValue SystemZSelectionDAGInfo::
 EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                         SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
@@ -34,14 +62,9 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
   if (IsVolatile)
     return SDValue();
 
-  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
-    uint64_t Bytes = CSize->getZExtValue();
-    if (Bytes >= 1 && Bytes <= 0x100) {
-      // A single MVC.
-      return DAG.getNode(SystemZISD::MVC, DL, MVT::Other,
-                         Chain, Dst, Src, Size);
-    }
-  }
+  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size))
+    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+                      Chain, Dst, Src, CSize->getZExtValue());
   return SDValue();
 }
 
@@ -65,7 +88,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                         SDValue Dst, SDValue Byte, SDValue Size,
                         unsigned Align, bool IsVolatile,
                         MachinePointerInfo DstPtrInfo) const {
-  EVT DstVT = Dst.getValueType();
+  EVT PtrVT = Dst.getValueType();
 
   if (IsVolatile)
     return SDValue();
@@ -89,8 +112,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                                      Align, DstPtrInfo);
         if (Size2 == 0)
           return Chain1;
-        Dst = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
-                          DAG.getConstant(Size1, DstVT));
+        Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                          DAG.getConstant(Size1, PtrVT));
         DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
         SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
                                      std::min(Align, Size1), DstPtrInfo);
@@ -103,8 +126,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
                                       false, false, Align);
         if (Bytes == 1)
           return Chain1;
-        SDValue Dst2 = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
-                                   DAG.getConstant(1, DstVT));
+        SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                                   DAG.getConstant(1, PtrVT));
         SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2,
                                       DstPtrInfo.getWithOffset(1),
                                       false, false, 1);
@@ -112,16 +135,159 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
       }
     }
     assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
-    if (Bytes <= 0x101) {
-      // Copy the byte to the first location and then use MVC to copy
-      // it to the rest.
-      Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
-                           false, false, Align);
-      SDValue Dst2 = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
-                                 DAG.getConstant(1, DstVT));
-      return DAG.getNode(SystemZISD::MVC, DL, MVT::Other, Chain, Dst2, Dst,
-                         DAG.getConstant(Bytes - 1, MVT::i32));
-    }
+
+    // Handle the special case of a memset of 0, which can use XC.
+    ConstantSDNode *CByte = dyn_cast<ConstantSDNode>(Byte);
+    if (CByte && CByte->getZExtValue() == 0)
+      return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
+                        Chain, Dst, Dst, Bytes);
+
+    // Copy the byte to the first location and then use MVC to copy
+    // it to the rest.
+    Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
+                         false, false, Align);
+    SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                                   DAG.getConstant(1, PtrVT));
+    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+                      Chain, DstPlus1, Dst, Bytes - 1);
   }
   return SDValue();
 }
+
+// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size),
+// deciding whether to use a loop or straight-line code.
+static SDValue emitCLC(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                       SDValue Src1, SDValue Src2, uint64_t Size) {
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  EVT PtrVT = Src1.getValueType();
+  // A two-CLC sequence is a clear win over a loop, not least because it
+  // needs only one branch.  A three-CLC sequence needs the same number
+  // of branches as a loop (i.e. 2), but is shorter.  That brings us to
+  // lengths greater than 768 bytes.  It seems relatively likely that
+  // a difference will be found within the first 768 bytes, so we just
+  // optimize for the smallest number of branch instructions, in order
+  // to avoid polluting the prediction buffer too much.  A loop only ever
+  // needs 2 branches, whereas a straight-line sequence would need 3 or more.
+  if (Size > 3 * 256)
+    return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2,
+                       DAG.getConstant(Size, PtrVT),
+                       DAG.getConstant(Size / 256, PtrVT));
+  return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
+                     DAG.getConstant(Size, PtrVT));
+}
+
+// Convert the current CC value into an integer that is 0 if CC == 0,
+// less than zero if CC == 1 and greater than zero if CC >= 2.
+// The sequence starts with IPM, which puts CC into bits 29 and 28
+// of an integer and clears bits 30 and 31.
+static SDValue addIPMSequence(SDLoc DL, SDValue Glue, SelectionDAG &DAG) {
+  SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+  SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
+                            DAG.getConstant(SystemZ::IPM_CC, MVT::i32));
+  SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
+                             DAG.getConstant(31, MVT::i32));
+  return ROTL;
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src1, SDValue Src2, SDValue Size,
+                        MachinePointerInfo Op1PtrInfo,
+                        MachinePointerInfo Op2PtrInfo) const {
+  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
+    uint64_t Bytes = CSize->getZExtValue();
+    assert(Bytes > 0 && "Caller should have handled 0-size case");
+    Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
+    SDValue Glue = Chain.getValue(1);
+    return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+  }
+  return std::make_pair(SDValue(), SDValue());
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src, SDValue Char, SDValue Length,
+                        MachinePointerInfo SrcPtrInfo) const {
+  // Use SRST to find the character.  End is its address on success.
+  EVT PtrVT = Src.getValueType();
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+  Length = DAG.getZExtOrTrunc(Length, DL, PtrVT);
+  Char = DAG.getZExtOrTrunc(Char, DL, MVT::i32);
+  Char = DAG.getNode(ISD::AND, DL, MVT::i32, Char,
+                     DAG.getConstant(255, MVT::i32));
+  SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, Length);
+  SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
+                            Limit, Src, Char);
+  Chain = End.getValue(1);
+  SDValue Glue = End.getValue(2);
+
+  // Now select between End and null, depending on whether the character
+  // was found.
+  SmallVector<SDValue, 5> Ops;
+  Ops.push_back(End);
+  Ops.push_back(DAG.getConstant(0, PtrVT));
+  Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST, MVT::i32));
+  Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, MVT::i32));
+  Ops.push_back(Glue);
+  VTs = DAG.getVTList(PtrVT, MVT::Glue);
+  End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, &Ops[0], Ops.size());
+  return std::make_pair(End, Chain);
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Dest, SDValue Src,
+                        MachinePointerInfo DestPtrInfo,
+                        MachinePointerInfo SrcPtrInfo, bool isStpcpy) const {
+  SDVTList VTs = DAG.getVTList(Dest.getValueType(), MVT::Other);
+  SDValue EndDest = DAG.getNode(SystemZISD::STPCPY, DL, VTs, Chain, Dest, Src,
+                                DAG.getConstant(0, MVT::i32));
+  return std::make_pair(isStpcpy ? EndDest : Dest, EndDest.getValue(1));
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src1, SDValue Src2,
+                        MachinePointerInfo Op1PtrInfo,
+                        MachinePointerInfo Op2PtrInfo) const {
+  SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::Other, MVT::Glue);
+  SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src1, Src2,
+                               DAG.getConstant(0, MVT::i32));
+  Chain = Unused.getValue(1);
+  SDValue Glue = Chain.getValue(2);
+  return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+}
+
+// Search from Src for a null character, stopping once Src reaches Limit.
+// Return a pair of values, the first being the number of nonnull characters
+// and the second being the out chain.
+//
+// This can be used for strlen by setting Limit to 0.
+static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG, SDLoc DL,
+                                                    SDValue Chain, SDValue Src,
+                                                    SDValue Limit) {
+  EVT PtrVT = Src.getValueType();
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+  SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
+                            Limit, Src, DAG.getConstant(0, MVT::i32));
+  Chain = End.getValue(1);
+  SDValue Len = DAG.getNode(ISD::SUB, DL, PtrVT, End, Src);
+  return std::make_pair(Len, Chain);
+}    
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src, MachinePointerInfo SrcPtrInfo) const {
+  EVT PtrVT = Src.getValueType();
+  return getBoundedStrlen(DAG, DL, Chain, Src, DAG.getConstant(0, PtrVT));
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                         SDValue Src, SDValue MaxLength,
+                         MachinePointerInfo SrcPtrInfo) const {
+  EVT PtrVT = Src.getValueType();
+  MaxLength = DAG.getZExtOrTrunc(MaxLength, DL, PtrVT);
+  SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, MaxLength);
+  return getBoundedStrlen(DAG, DL, Chain, Src, Limit);
+}
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index 9138a9c..281d1e2 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -38,7 +38,41 @@ public:
   EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL,
                           SDValue Chain, SDValue Dst, SDValue Byte,
                           SDValue Size, unsigned Align, bool IsVolatile,
-                          MachinePointerInfo DstPtrInfo) const;
+                          MachinePointerInfo DstPtrInfo) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src1, SDValue Src2, SDValue Size,
+                          MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src, SDValue Char, SDValue Length,
+                          MachinePointerInfo SrcPtrInfo) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Dest, SDValue Src,
+                          MachinePointerInfo DestPtrInfo,
+                          MachinePointerInfo SrcPtrInfo,
+                          bool isStpcpy) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src1, SDValue Src2,
+                          MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src, MachinePointerInfo SrcPtrInfo) const
+    LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                           SDValue Src, SDValue MaxLength,
+                           MachinePointerInfo SrcPtrInfo) const LLVM_OVERRIDE;
 };
 
 }
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
new file mode 100644
index 0000000..537a545
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -0,0 +1,163 @@
+//===-- SystemZShortenInst.cpp - Instruction-shortening pass --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to replace instructions with shorter forms.  For example,
+// IILF can be replaced with LLILL or LLILH if the constant fits and if the
+// other 32 bits of the GR64 destination are not live.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "systemz-shorten-inst"
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+namespace {
+  class SystemZShortenInst : public MachineFunctionPass {
+  public:
+    static char ID;
+    SystemZShortenInst(const SystemZTargetMachine &tm);
+
+    virtual const char *getPassName() const {
+      return "SystemZ Instruction Shortening";
+    }
+
+    bool processBlock(MachineBasicBlock *MBB);
+    bool runOnMachineFunction(MachineFunction &F);
+
+  private:
+    bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
+                    unsigned LLIxL, unsigned LLIxH);
+
+    const SystemZInstrInfo *TII;
+
+    // LowGPRs[I] has bit N set if LLVM register I includes the low
+    // word of GPR N.  HighGPRs is the same for the high word.
+    unsigned LowGPRs[SystemZ::NUM_TARGET_REGS];
+    unsigned HighGPRs[SystemZ::NUM_TARGET_REGS];
+  };
+
+  char SystemZShortenInst::ID = 0;
+} // end of anonymous namespace
+
+FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) {
+  return new SystemZShortenInst(TM);
+}
+
+SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm)
+  : MachineFunctionPass(ID), TII(0), LowGPRs(), HighGPRs() {
+  // Set up LowGPRs and HighGPRs.
+  for (unsigned I = 0; I < 16; ++I) {
+    LowGPRs[SystemZMC::GR32Regs[I]] |= 1 << I;
+    LowGPRs[SystemZMC::GR64Regs[I]] |= 1 << I;
+    HighGPRs[SystemZMC::GRH32Regs[I]] |= 1 << I;
+    HighGPRs[SystemZMC::GR64Regs[I]] |= 1 << I;
+    if (unsigned GR128 = SystemZMC::GR128Regs[I]) {
+      LowGPRs[GR128] |= 3 << I;
+      HighGPRs[GR128] |= 3 << I;
+    }
+  }
+}
+
+// MI loads one word of a GPR using an IIxF instruction and LLIxL and LLIxH
+// are the halfword immediate loads for the same word.  Try to use one of them
+// instead of IIxF.  If MI loads the high word, GPRMap[X] is the set of high
+// words referenced by LLVM register X while LiveOther is the mask of low
+// words that are currently live, and vice versa.
+bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap,
+                                    unsigned LiveOther, unsigned LLIxL,
+                                    unsigned LLIxH) {
+  unsigned Reg = MI.getOperand(0).getReg();
+  assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+  unsigned GPRs = GPRMap[Reg];
+  assert(GPRs != 0 && "Register must be a GPR");
+  if (GPRs & LiveOther)
+    return false;
+
+  uint64_t Imm = MI.getOperand(1).getImm();
+  if (SystemZ::isImmLL(Imm)) {
+    MI.setDesc(TII->get(LLIxL));
+    MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(Reg));
+    return true;
+  }
+  if (SystemZ::isImmLH(Imm)) {
+    MI.setDesc(TII->get(LLIxH));
+    MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(Reg));
+    MI.getOperand(1).setImm(Imm >> 16);
+    return true;
+  }
+  return false;
+}
+
+// Process all instructions in MBB.  Return true if something changed.
+bool SystemZShortenInst::processBlock(MachineBasicBlock *MBB) {
+  bool Changed = false;
+
+  // Work out which words are live on exit from the block.
+  unsigned LiveLow = 0;
+  unsigned LiveHigh = 0;
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(),
+           LE = (*SI)->livein_end(); LI != LE; ++LI) {
+      unsigned Reg = *LI;
+      assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+      LiveLow |= LowGPRs[Reg];
+      LiveHigh |= HighGPRs[Reg];
+    }
+  }
+
+  // Iterate backwards through the block looking for instructions to change.
+  for (MachineBasicBlock::reverse_iterator MBBI = MBB->rbegin(),
+         MBBE = MBB->rend(); MBBI != MBBE; ++MBBI) {
+    MachineInstr &MI = *MBBI;
+    unsigned Opcode = MI.getOpcode();
+    if (Opcode == SystemZ::IILF)
+      Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL,
+                            SystemZ::LLILH);
+    else if (Opcode == SystemZ::IIHF)
+      Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL,
+                            SystemZ::LLIHH);
+    unsigned UsedLow = 0;
+    unsigned UsedHigh = 0;
+    for (MachineInstr::mop_iterator MOI = MI.operands_begin(),
+           MOE = MI.operands_end(); MOI != MOE; ++MOI) {
+      MachineOperand &MO = *MOI;
+      if (MO.isReg()) {
+        if (unsigned Reg = MO.getReg()) {
+          assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+          if (MO.isDef()) {
+            LiveLow &= ~LowGPRs[Reg];
+            LiveHigh &= ~HighGPRs[Reg];
+          } else if (!MO.isUndef()) {
+            UsedLow |= LowGPRs[Reg];
+            UsedHigh |= HighGPRs[Reg];
+          }
+        }
+      }
+    }
+    LiveLow |= UsedLow;
+    LiveHigh |= UsedHigh;
+  }
+
+  return Changed;
+}
+
+bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
+  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+
+  bool Changed = false;
+  for (MachineFunction::iterator MFI = F.begin(), MFE = F.end();
+       MFI != MFE; ++MFI)
+    Changed |= processBlock(MFI);
+
+  return Changed;
+}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index 036ec05..3971d5e 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -9,6 +9,7 @@
 
 #include "SystemZSubtarget.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Host.h"
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
@@ -17,14 +18,22 @@
 
 using namespace llvm;
 
+// Pin the vtabel to this file.
+void SystemZSubtarget::anchor() {}
+
 SystemZSubtarget::SystemZSubtarget(const std::string &TT,
                                    const std::string &CPU,
                                    const std::string &FS)
   : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
-    HasLoadStoreOnCond(false), HasHighWord(false), TargetTriple(TT) {
+    HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
+    TargetTriple(TT) {
   std::string CPUName = CPU;
   if (CPUName.empty())
-    CPUName = "z10";
+    CPUName = "generic";
+#if defined(__linux__) && defined(__s390x__)
+  if (CPUName == "generic")
+    CPUName = sys::getHostCPUName();
+#endif
 
   // Parse features string.
   ParseSubtargetFeatures(CPUName, FS);
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index 4efb58d..5817491 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -26,10 +26,12 @@ class GlobalValue;
 class StringRef;
 
 class SystemZSubtarget : public SystemZGenSubtargetInfo {
+  virtual void anchor();
 protected:
   bool HasDistinctOps;
   bool HasLoadStoreOnCond;
   bool HasHighWord;
+  bool HasFPExtension;
 
 private:
   Triple TargetTriple;
@@ -38,6 +40,9 @@ public:
   SystemZSubtarget(const std::string &TT, const std::string &CPU,
                    const std::string &FS);
 
+  // This is important for reducing register pressure in vector code.
+  virtual bool useAA() const LLVM_OVERRIDE { return true; }
+
   // Automatically generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
@@ -50,6 +55,9 @@ public:
   // Return true if the target has the high-word facility.
   bool hasHighWord() const { return HasHighWord; }
 
+  // Return true if the target has the floating-point extension facility.
+  bool hasFPExtension() const { return HasFPExtension; }
+
   // Return true if GV can be accessed using LARL for reloc model RM
   // and code model CM.
   bool isPC32DBLSymbol(const GlobalValue *GV, Reloc::Model RM,
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 856183c..dee92e9 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -10,6 +10,7 @@
 #include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
 
 using namespace llvm;
 
@@ -47,12 +48,18 @@ public:
     return getTM<SystemZTargetMachine>();
   }
 
+  virtual void addIRPasses() LLVM_OVERRIDE;
   virtual bool addInstSelector() LLVM_OVERRIDE;
   virtual bool addPreSched2() LLVM_OVERRIDE;
   virtual bool addPreEmitPass() LLVM_OVERRIDE;
 };
 } // end anonymous namespace
 
+void SystemZPassConfig::addIRPasses() {
+  TargetPassConfig::addIRPasses();
+  addPass(createPartiallyInlineLibCallsPass());
+}
+
 bool SystemZPassConfig::addInstSelector() {
   addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel()));
   return false;
@@ -90,6 +97,8 @@ bool SystemZPassConfig::addPreEmitPass() {
   // preventing that would be a win or not.
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createSystemZElimComparePass(getSystemZTargetMachine()));
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZShortenInstPass(getSystemZTargetMachine()));
   addPass(createSystemZLongBranchPass(getSystemZTargetMachine()));
   return true;
 }
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index 3d92f29..2190198 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -88,6 +88,14 @@ LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) {
   return wrap(unwrap(TD)->getIntPtrType(getGlobalContext(), AS));
 }
 
+LLVMTypeRef LLVMIntPtrTypeInContext(LLVMContextRef C, LLVMTargetDataRef TD) {
+  return wrap(unwrap(TD)->getIntPtrType(*unwrap(C)));
+}
+
+LLVMTypeRef LLVMIntPtrTypeForASInContext(LLVMContextRef C, LLVMTargetDataRef TD, unsigned AS) {
+  return wrap(unwrap(TD)->getIntPtrType(*unwrap(C), AS));
+}
+
 unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
   return unwrap(TD)->getTypeSizeInBits(unwrap(Ty));
 }
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 8696b57..3e68fe1 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -38,6 +38,8 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "_ZnwjRKSt9nothrow_t",
     "_Znwm",
     "_ZnwmRKSt9nothrow_t",
+    "__cospi",
+    "__cospif",
     "__cxa_atexit",
     "__cxa_guard_abort",
     "__cxa_guard_acquire",
@@ -45,6 +47,10 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "__isoc99_scanf",
     "__isoc99_sscanf",
     "__memcpy_chk",
+    "__sincospi_stret",
+    "__sincospi_stretf",
+    "__sinpi",
+    "__sinpif",
     "__sqrt_finite",
     "__sqrtf_finite",
     "__sqrtl_finite",
@@ -331,6 +337,24 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "write"
   };
 
+static bool hasSinCosPiStret(const Triple &T) {
+  // Only Darwin variants have _stret versions of combined trig functions.
+  if (!T.isMacOSX() && T.getOS() != Triple::IOS)
+    return false;
+
+  // The ABI is rather complicated on x86, so don't do anything special there.
+  if (T.getArch() == Triple::x86)
+    return false;
+
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 9))
+    return false;
+
+  if (T.getOS() == Triple::IOS && T.isOSVersionLT(7, 0))
+    return false;
+
+  return true;
+}
+
 /// initialize - Initialize the set of available library functions based on the
 /// specified target triple.  This should be carefully written so that a missing
 /// target triple gets a sane set of defaults.
@@ -350,13 +374,22 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
   if (T.isMacOSX()) {
     if (T.isMacOSXVersionLT(10, 5))
       TLI.setUnavailable(LibFunc::memset_pattern16);
-  } else if (T.getOS() == Triple::IOS) {
+  } else if (T.isiOS()) {
     if (T.isOSVersionLT(3, 0))
       TLI.setUnavailable(LibFunc::memset_pattern16);
   } else {
     TLI.setUnavailable(LibFunc::memset_pattern16);
   }
 
+  if (!hasSinCosPiStret(T)) {
+    TLI.setUnavailable(LibFunc::sinpi);
+    TLI.setUnavailable(LibFunc::sinpif);
+    TLI.setUnavailable(LibFunc::cospi);
+    TLI.setUnavailable(LibFunc::cospif);
+    TLI.setUnavailable(LibFunc::sincospi_stret);
+    TLI.setUnavailable(LibFunc::sincospi_stretf);
+  }
+
   if (T.isMacOSX() && T.getArch() == Triple::x86 &&
       !T.isMacOSXVersionLT(10, 7)) {
     // x86-32 OSX has a scheme where fwrite and fputs (and some other functions
@@ -562,7 +595,7 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
   }
 
   // The following functions are available on at least Linux:
-  if (T.getOS() != Triple::Linux) {
+  if (!T.isOSLinux()) {
     TLI.setUnavailable(LibFunc::dunder_strdup);
     TLI.setUnavailable(LibFunc::dunder_strtok_r);
     TLI.setUnavailable(LibFunc::dunder_isoc99_scanf);
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index cd810b6..7b8d110 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -97,10 +97,20 @@ static bool IsNullTerminatedString(const Constant *C) {
   return false;
 }
 
+/// Return the MCSymbol for the specified global value.  This
+/// symbol is the main label that is the address of the global.
+MCSymbol *TargetLoweringObjectFile::getSymbol(Mangler &M, 
+                                              const GlobalValue *GV) const {
+  SmallString<60> NameStr;
+  M.getNameWithPrefix(NameStr, GV, false);
+  return Ctx->GetOrCreateSymbol(NameStr.str());
+}
+
+
 MCSymbol *TargetLoweringObjectFile::
 getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
                         MachineModuleInfo *MMI) const {
-  return Mang->getSymbol(GV);
+  return getSymbol(*Mang, GV);
 }
 
 void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
@@ -293,7 +303,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
                         MachineModuleInfo *MMI, unsigned Encoding,
                         MCStreamer &Streamer) const {
   const MCSymbolRefExpr *Ref =
-    MCSymbolRefExpr::Create(Mang->getSymbol(GV), getContext());
+    MCSymbolRefExpr::Create(getSymbol(*Mang, GV), getContext());
 
   return getTTypeReference(Ref, Encoding, Streamer);
 }
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index df4a03c..cb42e83 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -164,6 +164,11 @@ CodeGenOpt::Level TargetMachine::getOptLevel() const {
   return CodeGenInfo->getOptLevel();
 }
 
+void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const {
+  if (CodeGenInfo)
+    CodeGenInfo->setOptLevel(Level);
+}
+
 bool TargetMachine::getAsmVerbosityDefault() {
   return AsmVerbosityDefault;
 }
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 7419122..3d5f827 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Host.h"
 #include "llvm/Target/TargetMachine.h"
 #include <cassert>
 #include <cstdlib>
@@ -60,13 +61,44 @@ inline LLVMTargetRef wrap(const Target * P) {
 }
 
 LLVMTargetRef LLVMGetFirstTarget() {
-   const Target* target = &*TargetRegistry::begin();
-   return wrap(target);
+  if(TargetRegistry::begin() == TargetRegistry::end()) {
+    return NULL;
+  }
+
+  const Target* target = &*TargetRegistry::begin();
+  return wrap(target);
 }
 LLVMTargetRef LLVMGetNextTarget(LLVMTargetRef T) {
   return wrap(unwrap(T)->getNext());
 }
 
+LLVMTargetRef LLVMGetTargetFromName(const char *Name) {
+  StringRef NameRef = Name;
+  for (TargetRegistry::iterator IT = TargetRegistry::begin(),
+                                IE = TargetRegistry::end(); IT != IE; ++IT) {
+    if (IT->getName() == NameRef)
+      return wrap(&*IT);
+  }
+  
+  return NULL;
+}
+
+LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T,
+                                 char **ErrorMessage) {
+  std::string Error;
+  
+  *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error));
+  
+  if (!*T) {
+    if (ErrorMessage)
+      *ErrorMessage = strdup(Error.c_str());
+
+    return 1;
+  }
+  
+  return 0;
+}
+
 const char * LLVMGetTargetName(LLVMTargetRef T) {
   return unwrap(T)->getName();
 }
@@ -87,9 +119,10 @@ LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) {
   return unwrap(T)->hasMCAsmBackend();
 }
 
-LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, char* Triple,
-  char* CPU, char* Features, LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
-  LLVMCodeModel CodeModel) {
+LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
+        const char* Triple, const char* CPU, const char* Features,
+        LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
+        LLVMCodeModel CodeModel) {
   Reloc::Model RM;
   switch (Reloc){
     case LLVMRelocStatic:
@@ -158,6 +191,11 @@ LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
   return wrap(unwrap(T)->getDataLayout());
 }
 
+void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
+                                      LLVMBool VerboseAsm) {
+  unwrap(T)->setAsmVerbosityDefault(VerboseAsm);
+}
+
 static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
   formatted_raw_ostream &OS, LLVMCodeGenFileType codegen, char **ErrorMessage) {
   TargetMachine* TM = unwrap(T);
@@ -201,11 +239,11 @@ LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
   char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) {
   std::string error;
   raw_fd_ostream dest(Filename, error, sys::fs::F_Binary);
-  formatted_raw_ostream destf(dest);
   if (!error.empty()) {
     *ErrorMessage = strdup(error.c_str());
     return true;
   }
+  formatted_raw_ostream destf(dest);
   bool Result = LLVMTargetMachineEmit(T, M, destf, codegen, ErrorMessage);
   dest.flush();
   return Result;
@@ -225,3 +263,7 @@ LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T,
                                                      Data.length(), "");
   return Result;
 }
+
+char *LLVMGetDefaultTargetTriple(void) {
+  return strdup(sys::getDefaultTargetTriple().c_str());
+}
diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp
index af0cef6..10e8db5 100644
--- a/lib/Target/TargetSubtargetInfo.cpp
+++ b/lib/Target/TargetSubtargetInfo.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/ADT/SmallVector.h"
 using namespace llvm;
@@ -22,6 +23,21 @@ TargetSubtargetInfo::TargetSubtargetInfo() {}
 
 TargetSubtargetInfo::~TargetSubtargetInfo() {}
 
+// Temporary option to compare overall performance change when moving from the
+// SD scheduler to the MachineScheduler pass pipeline. It should be removed
+// before 3.4. The normal way to enable/disable the MachineScheduling pass
+// itself is by using -enable-misched. For targets that already use MI sched
+// (via MySubTarget::enableMachineScheduler()) -misched-bench=false negates the
+// subtarget hook.
+static cl::opt<bool> BenchMachineSched("misched-bench", cl::Hidden,
+    cl::desc("Migrate from the target's default SD scheduler to MI scheduler"));
+
+bool TargetSubtargetInfo::useMachineScheduler() const {
+  if (BenchMachineSched.getNumOccurrences())
+    return BenchMachineSched;
+  return enableMachineScheduler();
+}
+
 bool TargetSubtargetInfo::enableMachineScheduler() const {
   return false;
 }
@@ -35,3 +51,6 @@ bool TargetSubtargetInfo::enablePostRAScheduler(
   return false;
 }
 
+bool TargetSubtargetInfo::useAA() const {
+  return false;
+}
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index ad83d97..bc8f367 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -80,7 +80,7 @@ private:
       PostfixStack.push_back(std::make_pair(Op, Val));
     }
     
-    void popOperator() { InfixOperatorStack.pop_back_val(); }
+    void popOperator() { InfixOperatorStack.pop_back(); }
     void pushOperator(InfixCalculatorTok Op) {
       // Push the new operator if the stack is empty.
       if (InfixOperatorStack.empty()) {
@@ -118,12 +118,12 @@ private:
         
         if (StackOp == IC_RPAREN) {
           ++ParenCount;
-          InfixOperatorStack.pop_back_val();
+          InfixOperatorStack.pop_back();
         } else if (StackOp == IC_LPAREN) {
           --ParenCount;
-          InfixOperatorStack.pop_back_val();
+          InfixOperatorStack.pop_back();
         } else {
-          InfixOperatorStack.pop_back_val();
+          InfixOperatorStack.pop_back();
           PostfixStack.push_back(std::make_pair(StackOp, 0));
         }
       }
@@ -495,16 +495,17 @@ private:
   X86Operand *ParseATTOperand();
   X86Operand *ParseIntelOperand();
   X86Operand *ParseIntelOffsetOfOperator();
-  X86Operand *ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
+  bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
   X86Operand *ParseIntelOperator(unsigned OpKind);
-  X86Operand *ParseIntelMemOperand(unsigned SegReg, int64_t ImmDisp,
-                                   SMLoc StartLoc);
-  X86Operand *ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
+  X86Operand *ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
+  X86Operand *ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc,
+                                   unsigned Size);
+  bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
   X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
                                        int64_t ImmDisp, unsigned Size);
-  X86Operand *ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
-                                   InlineAsmIdentifierInfo &Info,
-                                   bool IsUnevaluatedOperand, SMLoc &End);
+  bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
+                            InlineAsmIdentifierInfo &Info,
+                            bool IsUnevaluatedOperand, SMLoc &End);
 
   X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
 
@@ -555,8 +556,9 @@ private:
   /// }
 
 public:
-  X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
-    : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
+  X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+               const MCInstrInfo &MII)
+      : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -814,6 +816,9 @@ struct X86Operand : public MCParsedAsmOperand {
   bool isMem256() const {
     return Kind == Memory && (!Mem.Size || Mem.Size == 256);
   }
+  bool isMem512() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+  }
 
   bool isMemVX32() const {
     return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
@@ -840,17 +845,36 @@ struct X86Operand : public MCParsedAsmOperand {
       getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
   }
 
-  bool isMem512() const {
-    return Kind == Memory && (!Mem.Size || Mem.Size == 512);
-  }
-
   bool isAbsMem() const {
     return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
       !getMemIndexReg() && getMemScale() == 1;
   }
 
+  bool isMemOffs8() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs16() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs32() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs64() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 64);
+  }
+
   bool isReg() const { return Kind == Register; }
 
+  bool isGR32orGR64() const {
+    return Kind == Register &&
+      (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
+      X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
+  }
+
   void addExpr(MCInst &Inst, const MCExpr *Expr) const {
     // Add as immediates when possible.
     if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
@@ -864,53 +888,40 @@ struct X86Operand : public MCParsedAsmOperand {
     Inst.addOperand(MCOperand::CreateReg(getReg()));
   }
 
-  void addImmOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    addExpr(Inst, getImm());
+  static unsigned getGR32FromGR64(unsigned RegNo) {
+    switch (RegNo) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::RAX: return X86::EAX;
+    case X86::RCX: return X86::ECX;
+    case X86::RDX: return X86::EDX;
+    case X86::RBX: return X86::EBX;
+    case X86::RBP: return X86::EBP;
+    case X86::RSP: return X86::ESP;
+    case X86::RSI: return X86::ESI;
+    case X86::RDI: return X86::EDI;
+    case X86::R8: return X86::R8D;
+    case X86::R9: return X86::R9D;
+    case X86::R10: return X86::R10D;
+    case X86::R11: return X86::R11D;
+    case X86::R12: return X86::R12D;
+    case X86::R13: return X86::R13D;
+    case X86::R14: return X86::R14D;
+    case X86::R15: return X86::R15D;
+    case X86::RIP: return X86::EIP;
+    }
   }
 
-  void addMem8Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem16Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem32Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem64Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem80Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem128Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem256Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMemVX32Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMemVY32Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMemVX64Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMemVY64Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
+  void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    unsigned RegNo = getReg();
+    if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+      RegNo = getGR32FromGR64(RegNo);
+    Inst.addOperand(MCOperand::CreateReg(RegNo));
   }
 
-  void addMemVZ32Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMemVZ64Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem512Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
   }
 
   void addMemOperands(MCInst &Inst, unsigned N) const {
@@ -931,6 +942,15 @@ struct X86Operand : public MCParsedAsmOperand {
       Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
   }
 
+  void addMemOffsOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 1) && "Invalid number of operands!");
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+  }
+
   static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
     SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
     X86Operand *Res = new X86Operand(Token, Loc, EndLoc);
@@ -1249,8 +1269,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
   }
 }
 
-X86Operand *
-X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
+bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
   const AsmToken &Tok = Parser.getTok();
 
   bool Done = false;
@@ -1272,7 +1291,7 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
         Done = true;
         break;
       }
-      return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+      return Error(Tok.getLoc(), "unknown token in expression");
     }
     case AsmToken::EndOfStatement: {
       Done = true;
@@ -1291,18 +1310,18 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
       } else {
         if (!isParsingInlineAsm()) {
           if (getParser().parsePrimaryExpr(Val, End))
-            return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+            return Error(Tok.getLoc(), "Unexpected identifier!");
         } else {
           InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
-          if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
-                                                     /*Unevaluated*/ false, End))
-            return Err;
+          if (ParseIntelIdentifier(Val, Identifier, Info,
+                                   /*Unevaluated=*/false, End))
+            return true;
         }
         SM.onIdentifierExpr(Val, Identifier);
         UpdateLocLex = false;
         break;
       }
-      return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+      return Error(Tok.getLoc(), "Unexpected identifier!");
     }
     case AsmToken::Integer:
       if (isParsingInlineAsm() && SM.getAddImmPrefix())
@@ -1320,14 +1339,14 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
     case AsmToken::RParen:  SM.onRParen(); break;
     }
     if (SM.hadError())
-      return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+      return Error(Tok.getLoc(), "unknown token in expression");
 
     if (!Done && UpdateLocLex) {
       End = Tok.getLoc();
       Parser.Lex(); // Consume the token.
     }
   }
-  return 0;
+  return false;
 }
 
 X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
@@ -1344,8 +1363,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
   // may have already parsed an immediate displacement before the bracketed
   // expression.
   IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
-  if (X86Operand *Err = ParseIntelExpression(SM, End))
-    return Err;
+  if (ParseIntelExpression(SM, End))
+    return 0;
 
   const MCExpr *Disp;
   if (const MCExpr *Sym = SM.getSym()) {
@@ -1363,8 +1382,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
   // Parse the dot operator (e.g., [ebx].foo.bar).
   if (Tok.getString().startswith(".")) {
     const MCExpr *NewDisp;
-    if (X86Operand *Err = ParseIntelDotOperator(Disp, NewDisp))
-      return Err;
+    if (ParseIntelDotOperator(Disp, NewDisp))
+      return 0;
     
     End = Tok.getEndLoc();
     Parser.Lex();  // Eat the field.
@@ -1392,11 +1411,10 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
 }
 
 // Inline assembly may use variable names with namespace alias qualifiers.
-X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
-                                               StringRef &Identifier,
-                                               InlineAsmIdentifierInfo &Info,
-                                               bool IsUnevaluatedOperand,
-                                               SMLoc &End) {
+bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
+                                        StringRef &Identifier,
+                                        InlineAsmIdentifierInfo &Info,
+                                        bool IsUnevaluatedOperand, SMLoc &End) {
   assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
   Val = 0;
 
@@ -1421,68 +1439,89 @@ X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
   MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier);
   MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
   Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext());
-  return 0;
+  return false;
 }
 
-/// ParseIntelMemOperand - Parse intel style memory operand.
-X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg,
-                                               int64_t ImmDisp,
-                                               SMLoc Start) {
-  const AsmToken &Tok = Parser.getTok();
-  SMLoc End;
-
-  unsigned Size = getIntelMemOperandSize(Tok.getString());
-  if (Size) {
-    Parser.Lex(); // Eat operand size (e.g., byte, word).
-    if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
-      return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!");
-    Parser.Lex(); // Eat ptr.
-  }
-
-  // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+/// \brief Parse intel style segment override.
+X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg,
+                                                    SMLoc Start,
+                                                    unsigned Size) {
+  assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
+  const AsmToken &Tok = Parser.getTok(); // Eat colon.
+  if (Tok.isNot(AsmToken::Colon))
+    return ErrorOperand(Tok.getLoc(), "Expected ':' token!");
+  Parser.Lex(); // Eat ':'
+
+  int64_t ImmDisp = 0;
   if (getLexer().is(AsmToken::Integer)) {
+    ImmDisp = Tok.getIntVal();
+    AsmToken ImmDispToken = Parser.Lex(); // Eat the integer.
+
     if (isParsingInlineAsm())
-      InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
-                                                  Tok.getLoc()));
-    int64_t ImmDisp = Tok.getIntVal();
-    Parser.Lex(); // Eat the integer.
-    if (getLexer().isNot(AsmToken::LBrac))
-      return ErrorOperand(Start, "Expected '[' token!");
-    return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+      InstInfo->AsmRewrites->push_back(
+          AsmRewrite(AOK_ImmPrefix, ImmDispToken.getLoc()));
+
+    if (getLexer().isNot(AsmToken::LBrac)) {
+      // An immediate following a 'segment register', 'colon' token sequence can
+      // be followed by a bracketed expression.  If it isn't we know we have our
+      // final segment override.
+      const MCExpr *Disp = MCConstantExpr::Create(ImmDisp, getContext());
+      return X86Operand::CreateMem(SegReg, Disp, /*BaseReg=*/0, /*IndexReg=*/0,
+                                   /*Scale=*/1, Start, ImmDispToken.getEndLoc(),
+                                   Size);
+    }
   }
 
   if (getLexer().is(AsmToken::LBrac))
     return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
 
-  if (!ParseRegister(SegReg, Start, End)) {
-    // Handel SegReg : [ ... ]
-    if (getLexer().isNot(AsmToken::Colon))
-      return ErrorOperand(Start, "Expected ':' token!");
-    Parser.Lex(); // Eat :
-    if (getLexer().isNot(AsmToken::LBrac))
-      return ErrorOperand(Start, "Expected '[' token!");
-    return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+  const MCExpr *Val;
+  SMLoc End;
+  if (!isParsingInlineAsm()) {
+    if (getParser().parsePrimaryExpr(Val, End))
+      return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+
+    return X86Operand::CreateMem(Val, Start, End, Size);
   }
 
+  InlineAsmIdentifierInfo Info;
+  StringRef Identifier = Tok.getString();
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return 0;
+  return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+                               /*Scale=*/1, Start, End, Size, Identifier, Info);
+}
+
+/// ParseIntelMemOperand - Parse intel style memory operand.
+X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
+                                               unsigned Size) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc End;
+
+  // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+  if (getLexer().is(AsmToken::LBrac))
+    return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size);
+
   const MCExpr *Val;
   if (!isParsingInlineAsm()) {
     if (getParser().parsePrimaryExpr(Val, End))
-      return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+      return ErrorOperand(Tok.getLoc(), "unknown token in expression");
 
     return X86Operand::CreateMem(Val, Start, End, Size);
   }
 
   InlineAsmIdentifierInfo Info;
   StringRef Identifier = Tok.getString();
-  if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
-                                             /*Unevaluated*/ false, End))
-    return Err;
-  return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return 0;
+  return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
                                /*Scale=*/1, Start, End, Size, Identifier, Info);
 }
 
 /// Parse the '.' operator.
-X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
                                                 const MCExpr *&NewDisp) {
   const AsmToken &Tok = Parser.getTok();
   int64_t OrigDispVal, DotDispVal;
@@ -1491,7 +1530,7 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
   if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp))
     OrigDispVal = OrigDisp->getValue();
   else
-    return ErrorOperand(Tok.getLoc(), "Non-constant offsets are not supported!");
+    return Error(Tok.getLoc(), "Non-constant offsets are not supported!");
 
   // Drop the '.'.
   StringRef DotDispStr = Tok.getString().drop_front(1);
@@ -1506,10 +1545,10 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
     std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
     if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
                                            DotDisp))
-      return ErrorOperand(Tok.getLoc(), "Unable to lookup field reference!");
+      return Error(Tok.getLoc(), "Unable to lookup field reference!");
     DotDispVal = DotDisp;
   } else
-    return ErrorOperand(Tok.getLoc(), "Unexpected token type!");
+    return Error(Tok.getLoc(), "Unexpected token type!");
 
   if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
     SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
@@ -1520,7 +1559,7 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
   }
 
   NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
-  return 0;
+  return false;
 }
 
 /// Parse the 'offset' operator.  This operator is used to specify the
@@ -1534,9 +1573,9 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
   InlineAsmIdentifierInfo Info;
   SMLoc Start = Tok.getLoc(), End;
   StringRef Identifier = Tok.getString();
-  if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
-                                             /*Unevaluated*/ false, End))
-    return Err;
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return 0;
 
   // Don't emit the offset operator.
   InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
@@ -1570,9 +1609,12 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
   InlineAsmIdentifierInfo Info;
   SMLoc Start = Tok.getLoc(), End;
   StringRef Identifier = Tok.getString();
-  if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
-                                             /*Unevaluated*/ true, End))
-    return Err;
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/true, End))
+    return 0;
+
+  if (!Info.OpDecl)
+    return ErrorOperand(Start, "unable to lookup expression");
 
   unsigned CVal = 0;
   switch(OpKind) {
@@ -1593,7 +1635,7 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
 
 X86Operand *X86AsmParser::ParseIntelOperand() {
   const AsmToken &Tok = Parser.getTok();
-  SMLoc Start = Tok.getLoc(), End;
+  SMLoc Start, End;
 
   // Offset, length, type and size operators.
   if (isParsingInlineAsm()) {
@@ -1608,14 +1650,23 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
       return ParseIntelOperator(IOK_TYPE);
   }
 
+  unsigned Size = getIntelMemOperandSize(Tok.getString());
+  if (Size) {
+    Parser.Lex(); // Eat operand size (e.g., byte, word).
+    if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
+      return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!");
+    Parser.Lex(); // Eat ptr.
+  }
+  Start = Tok.getLoc();
+
   // Immediate.
   if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) ||
       getLexer().is(AsmToken::LParen)) {    
     AsmToken StartTok = Tok;
     IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
                              /*AddImmPrefix=*/false);
-    if (X86Operand *Err = ParseIntelExpression(SM, End))
-      return Err;
+    if (ParseIntelExpression(SM, End))
+      return 0;
 
     int64_t Imm = SM.getImm();
     if (isParsingInlineAsm()) {
@@ -1639,23 +1690,22 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
                           "before bracketed expr.");
 
     // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
-    return ParseIntelMemOperand(/*SegReg=*/0, Imm, Start);
+    return ParseIntelMemOperand(Imm, Start, Size);
   }
 
   // Register.
   unsigned RegNo = 0;
   if (!ParseRegister(RegNo, Start, End)) {
     // If this is a segment register followed by a ':', then this is the start
-    // of a memory reference, otherwise this is a normal register reference.
+    // of a segment override, otherwise this is a normal register reference.
     if (getLexer().isNot(AsmToken::Colon))
       return X86Operand::CreateReg(RegNo, Start, End);
 
-    getParser().Lex(); // Eat the colon.
-    return ParseIntelMemOperand(/*SegReg=*/RegNo, /*Disp=*/0, Start);
+    return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
   }
 
   // Memory operand.
-  return ParseIntelMemOperand(/*SegReg=*/0, /*Disp=*/0, Start);
+  return ParseIntelMemOperand(/*Disp=*/0, Start, Size);
 }
 
 X86Operand *X86AsmParser::ParseATTOperand() {
@@ -1967,6 +2017,47 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
       }
     }
 
+    if (STI.getFeatureBits() & X86::FeatureAVX512) {
+      // Parse mask register {%k1}
+      if (getLexer().is(AsmToken::LCurly)) {
+        SMLoc Loc = Parser.getTok().getLoc();
+        Operands.push_back(X86Operand::CreateToken("{", Loc));
+        Parser.Lex();  // Eat the {
+        if (X86Operand *Op = ParseOperand()) {
+          Operands.push_back(Op);
+          if (!getLexer().is(AsmToken::RCurly)) {
+            SMLoc Loc = getLexer().getLoc();
+            Parser.eatToEndOfStatement();
+            return Error(Loc, "Expected } at this point");
+          }
+          Loc = Parser.getTok().getLoc();
+          Operands.push_back(X86Operand::CreateToken("}", Loc));
+          Parser.Lex();  // Eat the }
+        } else {
+          Parser.eatToEndOfStatement();
+          return true;
+        }
+      }
+      // Parse "zeroing non-masked" semantic {z}
+      if (getLexer().is(AsmToken::LCurly)) {
+        SMLoc Loc = Parser.getTok().getLoc();
+        Operands.push_back(X86Operand::CreateToken("{z}", Loc));
+        Parser.Lex();  // Eat the {
+        if (!getLexer().is(AsmToken::Identifier) || getLexer().getTok().getIdentifier() != "z") {
+          SMLoc Loc = getLexer().getLoc();
+          Parser.eatToEndOfStatement();
+          return Error(Loc, "Expected z at this point");
+        }
+        Parser.Lex();  // Eat the z
+        if (!getLexer().is(AsmToken::RCurly)) {
+            SMLoc Loc = getLexer().getLoc();
+            Parser.eatToEndOfStatement();
+            return Error(Loc, "Expected } at this point");
+        }
+        Parser.Lex();  // Eat the }
+      }
+    }
+
     if (getLexer().isNot(AsmToken::EndOfStatement)) {
       SMLoc Loc = getLexer().getLoc();
       Parser.eatToEndOfStatement();
@@ -2218,6 +2309,55 @@ processInstruction(MCInst &Inst,
   case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8);
   case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8);
   case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8);
+  case X86::VMOVAPDrr:
+  case X86::VMOVAPDYrr:
+  case X86::VMOVAPSrr:
+  case X86::VMOVAPSYrr:
+  case X86::VMOVDQArr:
+  case X86::VMOVDQAYrr:
+  case X86::VMOVDQUrr:
+  case X86::VMOVDQUYrr:
+  case X86::VMOVUPDrr:
+  case X86::VMOVUPDYrr:
+  case X86::VMOVUPSrr:
+  case X86::VMOVUPSYrr: {
+    if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
+        !X86II::isX86_64ExtendedReg(Inst.getOperand(1).getReg()))
+      return false;
+
+    unsigned NewOpc;
+    switch (Inst.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::VMOVAPDrr:  NewOpc = X86::VMOVAPDrr_REV;  break;
+    case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+    case X86::VMOVAPSrr:  NewOpc = X86::VMOVAPSrr_REV;  break;
+    case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+    case X86::VMOVDQArr:  NewOpc = X86::VMOVDQArr_REV;  break;
+    case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+    case X86::VMOVDQUrr:  NewOpc = X86::VMOVDQUrr_REV;  break;
+    case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+    case X86::VMOVUPDrr:  NewOpc = X86::VMOVUPDrr_REV;  break;
+    case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+    case X86::VMOVUPSrr:  NewOpc = X86::VMOVUPSrr_REV;  break;
+    case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+    }
+    Inst.setOpcode(NewOpc);
+    return true;
+  }
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr: {
+    if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
+        !X86II::isX86_64ExtendedReg(Inst.getOperand(2).getReg()))
+      return false;
+    unsigned NewOpc;
+    switch (Inst.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV;   break;
+    case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV;   break;
+    }
+    Inst.setOpcode(NewOpc);
+    return true;
+  }
   }
 }
 
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 82af6fa..903e36c 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -231,16 +231,18 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
     default:
       break;
     case 1:
-      type = TYPE_MOFFS8;
+      if(immediate & 0x80)
+        immediate |= ~(0xffull);
       break;
     case 2:
-      type = TYPE_MOFFS16;
+      if(immediate & 0x8000)
+        immediate |= ~(0xffffull);
       break;
     case 4:
-      type = TYPE_MOFFS32;
+      if(immediate & 0x80000000)
+        immediate |= ~(0xffffffffull);
       break;
     case 8:
-      type = TYPE_MOFFS64;
       break;
     }
   }
@@ -263,16 +265,18 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
           Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
           Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
           Opcode != X86::VINSERTPSrr)
-        type = TYPE_MOFFS8;
+        if(immediate & 0x80)
+          immediate |= ~(0xffull);
       break;
     case ENCODING_IW:
-      type = TYPE_MOFFS16;
+      if(immediate & 0x8000)
+        immediate |= ~(0xffffull);
       break;
     case ENCODING_ID:
-      type = TYPE_MOFFS32;
+      if(immediate & 0x80000000)
+        immediate |= ~(0xffffffffull);
       break;
     case ENCODING_IO:
-      type = TYPE_MOFFS64;
       break;
     }
   }
@@ -292,30 +296,21 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
   case TYPE_REL8:
     isBranch = true;
     pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
-    // fall through to sign extend the immediate if needed.
-  case TYPE_MOFFS8:
     if(immediate & 0x80)
       immediate |= ~(0xffull);
     break;
-  case TYPE_MOFFS16:
-    if(immediate & 0x8000)
-      immediate |= ~(0xffffull);
-    break;
   case TYPE_REL32:
   case TYPE_REL64:
     isBranch = true;
     pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
-    // fall through to sign extend the immediate if needed.
-  case TYPE_MOFFS32:
     if(immediate & 0x80000000)
       immediate |= ~(0xffffffffull);
     break;
-  case TYPE_MOFFS64:
   default:
     // operand is 64 bits wide.  Do nothing.
     break;
   }
-    
+
   if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
                                insn.immediateOffset, insn.immediateSize,
                                mcInst, Dis))
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index bb195ee..c81a857 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -25,8 +25,6 @@
 #define TRUE  1
 #define FALSE 0
 
-typedef int8_t bool;
-
 #ifndef NDEBUG
 #define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
 #else
@@ -81,6 +79,15 @@ static int modRMRequired(OpcodeType type,
   case THREEBYTE_A7:
     decision = &THREEBYTEA7_SYM;
     break;
+  case XOP8_MAP:
+    decision = &XOP8_MAP_SYM;
+    break;
+  case XOP9_MAP:
+    decision = &XOP9_MAP_SYM;
+    break;
+  case XOPA_MAP:
+    decision = &XOPA_MAP_SYM;
+    break;
   }
 
   return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
@@ -122,6 +129,15 @@ static InstrUID decode(OpcodeType type,
   case THREEBYTE_A7:
     dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
     break;
+  case XOP8_MAP:
+    dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case XOP9_MAP:
+    dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case XOPA_MAP:
+    dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
   }
 
   switch (dec->modrm_type) {
@@ -305,6 +321,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
   BOOL prefixGroups[4] = { FALSE };
   uint64_t prefixLocation;
   uint8_t byte = 0;
+  uint8_t nextByte;
 
   BOOL hasAdSize = FALSE;
   BOOL hasOpSize = FALSE;
@@ -314,20 +331,21 @@ static int readPrefixes(struct InternalInstruction* insn) {
   while (isPrefix) {
     prefixLocation = insn->readerCursor;
 
+    /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */
     if (consumeByte(insn, &byte))
-      return -1;
+      break;
 
     /*
      * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
      * break and let it be disassembled as a normal "instruction".
      */
+    if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0)
+      break;
+
     if (insn->readerCursor - 1 == insn->startLocation
-        && (byte == 0xf0 || byte == 0xf2 || byte == 0xf3)) {
-      uint8_t nextByte;
-      if (byte == 0xf0)
-        break;
-      if (lookAtByte(insn, &nextByte))
-        return -1;
+        && (byte == 0xf2 || byte == 0xf3)
+        && !lookAtByte(insn, &nextByte))
+    {
       /*
        * If the byte is 0xf2 or 0xf3, and any of the following conditions are
        * met:
@@ -426,7 +444,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
       dbgprintf(insn, "Found prefix 0x%hhx", byte);
   }
 
-  insn->vexSize = 0;
+  insn->vexXopType = TYPE_NO_VEX_XOP;
 
   if (byte == 0xc4) {
     uint8_t byte1;
@@ -437,7 +455,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
     }
 
     if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
-      insn->vexSize = 3;
+      insn->vexXopType = TYPE_VEX_3B;
       insn->necessaryPrefixLocation = insn->readerCursor - 1;
     }
     else {
@@ -445,22 +463,22 @@ static int readPrefixes(struct InternalInstruction* insn) {
       insn->necessaryPrefixLocation = insn->readerCursor - 1;
     }
 
-    if (insn->vexSize == 3) {
-      insn->vexPrefix[0] = byte;
-      consumeByte(insn, &insn->vexPrefix[1]);
-      consumeByte(insn, &insn->vexPrefix[2]);
+    if (insn->vexXopType == TYPE_VEX_3B) {
+      insn->vexXopPrefix[0] = byte;
+      consumeByte(insn, &insn->vexXopPrefix[1]);
+      consumeByte(insn, &insn->vexXopPrefix[2]);
 
       /* We simulate the REX prefix for simplicity's sake */
 
       if (insn->mode == MODE_64BIT) {
         insn->rexPrefix = 0x40
-                        | (wFromVEX3of3(insn->vexPrefix[2]) << 3)
-                        | (rFromVEX2of3(insn->vexPrefix[1]) << 2)
-                        | (xFromVEX2of3(insn->vexPrefix[1]) << 1)
-                        | (bFromVEX2of3(insn->vexPrefix[1]) << 0);
+                        | (wFromVEX3of3(insn->vexXopPrefix[2]) << 3)
+                        | (rFromVEX2of3(insn->vexXopPrefix[1]) << 2)
+                        | (xFromVEX2of3(insn->vexXopPrefix[1]) << 1)
+                        | (bFromVEX2of3(insn->vexXopPrefix[1]) << 0);
       }
 
-      switch (ppFromVEX3of3(insn->vexPrefix[2]))
+      switch (ppFromVEX3of3(insn->vexXopPrefix[2]))
       {
       default:
         break;
@@ -469,7 +487,9 @@ static int readPrefixes(struct InternalInstruction* insn) {
         break;
       }
 
-      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]);
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
+                insn->vexXopPrefix[0], insn->vexXopPrefix[1],
+                insn->vexXopPrefix[2]);
     }
   }
   else if (byte == 0xc5) {
@@ -481,22 +501,22 @@ static int readPrefixes(struct InternalInstruction* insn) {
     }
 
     if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
-      insn->vexSize = 2;
+      insn->vexXopType = TYPE_VEX_2B;
     }
     else {
       unconsumeByte(insn);
     }
 
-    if (insn->vexSize == 2) {
-      insn->vexPrefix[0] = byte;
-      consumeByte(insn, &insn->vexPrefix[1]);
+    if (insn->vexXopType == TYPE_VEX_2B) {
+      insn->vexXopPrefix[0] = byte;
+      consumeByte(insn, &insn->vexXopPrefix[1]);
 
       if (insn->mode == MODE_64BIT) {
         insn->rexPrefix = 0x40
-                        | (rFromVEX2of2(insn->vexPrefix[1]) << 2);
+                        | (rFromVEX2of2(insn->vexXopPrefix[1]) << 2);
       }
 
-      switch (ppFromVEX2of2(insn->vexPrefix[1]))
+      switch (ppFromVEX2of2(insn->vexXopPrefix[1]))
       {
       default:
         break;
@@ -505,7 +525,53 @@ static int readPrefixes(struct InternalInstruction* insn) {
         break;
       }
 
-      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]);
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexXopPrefix[0], insn->vexXopPrefix[1]);
+    }
+  }
+  else if (byte == 0x8f) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of XOP");
+      return -1;
+    }
+
+    if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */
+      insn->vexXopType = TYPE_XOP;
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+    else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+
+    if (insn->vexXopType == TYPE_XOP) {
+      insn->vexXopPrefix[0] = byte;
+      consumeByte(insn, &insn->vexXopPrefix[1]);
+      consumeByte(insn, &insn->vexXopPrefix[2]);
+
+      /* We simulate the REX prefix for simplicity's sake */
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromXOP3of3(insn->vexXopPrefix[2]) << 3)
+                        | (rFromXOP2of3(insn->vexXopPrefix[1]) << 2)
+                        | (xFromXOP2of3(insn->vexXopPrefix[1]) << 1)
+                        | (bFromXOP2of3(insn->vexXopPrefix[1]) << 0);
+      }
+
+      switch (ppFromXOP3of3(insn->vexXopPrefix[2]))
+      {
+      default:
+        break;
+      case VEX_PREFIX_66:
+        hasOpSize = TRUE;
+        break;
+      }
+
+      dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
+                insn->vexXopPrefix[0], insn->vexXopPrefix[1],
+                insn->vexXopPrefix[2]);
     }
   }
   else {
@@ -580,37 +646,49 @@ static int readOpcode(struct InternalInstruction* insn) {
 
   insn->opcodeType = ONEBYTE;
 
-  if (insn->vexSize == 3)
+  if (insn->vexXopType == TYPE_VEX_3B)
   {
-    switch (mmmmmFromVEX2of3(insn->vexPrefix[1]))
+    switch (mmmmmFromVEX2of3(insn->vexXopPrefix[1]))
     {
     default:
-      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1]));
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vexXopPrefix[1]));
       return -1;
-    case 0:
-      break;
     case VEX_LOB_0F:
-      insn->twoByteEscape = 0x0f;
       insn->opcodeType = TWOBYTE;
       return consumeByte(insn, &insn->opcode);
     case VEX_LOB_0F38:
-      insn->twoByteEscape = 0x0f;
-      insn->threeByteEscape = 0x38;
       insn->opcodeType = THREEBYTE_38;
       return consumeByte(insn, &insn->opcode);
     case VEX_LOB_0F3A:
-      insn->twoByteEscape = 0x0f;
-      insn->threeByteEscape = 0x3a;
       insn->opcodeType = THREEBYTE_3A;
       return consumeByte(insn, &insn->opcode);
     }
   }
-  else if (insn->vexSize == 2)
+  else if (insn->vexXopType == TYPE_VEX_2B)
   {
-    insn->twoByteEscape = 0x0f;
     insn->opcodeType = TWOBYTE;
     return consumeByte(insn, &insn->opcode);
   }
+  else if (insn->vexXopType == TYPE_XOP)
+  {
+    switch (mmmmmFromXOP2of3(insn->vexXopPrefix[1]))
+    {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vexXopPrefix[1]));
+      return -1;
+    case XOP_MAP_SELECT_8:
+      insn->opcodeType = XOP8_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_9:
+      insn->opcodeType = XOP9_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_A:
+      insn->opcodeType = XOPA_MAP;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
 
   if (consumeByte(insn, &current))
     return -1;
@@ -618,16 +696,12 @@ static int readOpcode(struct InternalInstruction* insn) {
   if (current == 0x0f) {
     dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
 
-    insn->twoByteEscape = current;
-
     if (consumeByte(insn, &current))
       return -1;
 
     if (current == 0x38) {
       dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
 
-      insn->threeByteEscape = current;
-
       if (consumeByte(insn, &current))
         return -1;
 
@@ -635,8 +709,6 @@ static int readOpcode(struct InternalInstruction* insn) {
     } else if (current == 0x3a) {
       dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
 
-      insn->threeByteEscape = current;
-
       if (consumeByte(insn, &current))
         return -1;
 
@@ -644,8 +716,6 @@ static int readOpcode(struct InternalInstruction* insn) {
     } else if (current == 0xa6) {
       dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
 
-      insn->threeByteEscape = current;
-
       if (consumeByte(insn, &current))
         return -1;
 
@@ -653,8 +723,6 @@ static int readOpcode(struct InternalInstruction* insn) {
     } else if (current == 0xa7) {
       dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
 
-      insn->threeByteEscape = current;
-
       if (consumeByte(insn, &current))
         return -1;
 
@@ -768,11 +836,27 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
   if (insn->mode == MODE_64BIT)
     attrMask |= ATTR_64BIT;
 
-  if (insn->vexSize) {
+  if (insn->vexXopType != TYPE_NO_VEX_XOP) {
     attrMask |= ATTR_VEX;
 
-    if (insn->vexSize == 3) {
-      switch (ppFromVEX3of3(insn->vexPrefix[2])) {
+    if (insn->vexXopType == TYPE_VEX_3B) {
+      switch (ppFromVEX3of3(insn->vexXopPrefix[2])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (lFromVEX3of3(insn->vexXopPrefix[2]))
+        attrMask |= ATTR_VEXL;
+    }
+    else if (insn->vexXopType == TYPE_VEX_2B) {
+      switch (ppFromVEX2of2(insn->vexXopPrefix[1])) {
       case VEX_PREFIX_66:
         attrMask |= ATTR_OPSIZE;
         break;
@@ -784,11 +868,11 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
         break;
       }
 
-      if (lFromVEX3of3(insn->vexPrefix[2]))
+      if (lFromVEX2of2(insn->vexXopPrefix[1]))
         attrMask |= ATTR_VEXL;
     }
-    else if (insn->vexSize == 2) {
-      switch (ppFromVEX2of2(insn->vexPrefix[1])) {
+    else if (insn->vexXopType == TYPE_XOP) {
+      switch (ppFromXOP3of3(insn->vexXopPrefix[2])) {
       case VEX_PREFIX_66:
         attrMask |= ATTR_OPSIZE;
         break;
@@ -800,7 +884,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
         break;
       }
 
-      if (lFromVEX2of2(insn->vexPrefix[1]))
+      if (lFromXOP3of3(insn->vexXopPrefix[2]))
         attrMask |= ATTR_VEXL;
     }
     else {
@@ -826,42 +910,6 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
 
   /* The following clauses compensate for limitations of the tables. */
 
-  if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW) &&
-      !(attrMask & ATTR_OPSIZE)) {
-    /*
-     * Some VEX instructions ignore the L-bit, but use the W-bit. Normally L-bit
-     * has precedence since there are no L-bit with W-bit entries in the tables.
-     * So if the L-bit isn't significant we should use the W-bit instead.
-     * We only need to do this if the instruction doesn't specify OpSize since
-     * there is a VEX_L_W_OPSIZE table.
-     */
-
-    const struct InstructionSpecifier *spec;
-    uint16_t instructionIDWithWBit;
-    const struct InstructionSpecifier *specWithWBit;
-
-    spec = specifierForUID(instructionID);
-
-    if (getIDWithAttrMask(&instructionIDWithWBit,
-                          insn,
-                          (attrMask & (~ATTR_VEXL)) | ATTR_REXW)) {
-      insn->instructionID = instructionID;
-      insn->spec = spec;
-      return 0;
-    }
-
-    specWithWBit = specifierForUID(instructionIDWithWBit);
-
-    if (instructionID != instructionIDWithWBit) {
-      insn->instructionID = instructionIDWithWBit;
-      insn->spec = specWithWBit;
-    } else {
-      insn->instructionID = instructionID;
-      insn->spec = spec;
-    }
-    return 0;
-  }
-
   if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
     /*
      * The instruction tables make no distinction between instructions that
@@ -1502,10 +1550,12 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
 static int readVVVV(struct InternalInstruction* insn) {
   dbgprintf(insn, "readVVVV()");
 
-  if (insn->vexSize == 3)
-    insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]);
-  else if (insn->vexSize == 2)
-    insn->vvvv = vvvvFromVEX2of2(insn->vexPrefix[1]);
+  if (insn->vexXopType == TYPE_VEX_3B)
+    insn->vvvv = vvvvFromVEX3of3(insn->vexXopPrefix[2]);
+  else if (insn->vexXopType == TYPE_VEX_2B)
+    insn->vvvv = vvvvFromVEX2of2(insn->vexXopPrefix[1]);
+  else if (insn->vexXopType == TYPE_XOP)
+    insn->vvvv = vvvvFromXOP3of3(insn->vexXopPrefix[2]);
   else
     return -1;
 
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index dcb6aad..6d03d5c 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -59,6 +59,15 @@ extern "C" {
 #define lFromVEX2of2(vex)       (((vex) & 0x4) >> 2)
 #define ppFromVEX2of2(vex)      ((vex) & 0x3)
 
+#define rFromXOP2of3(xop)       (((~(xop)) & 0x80) >> 7)
+#define xFromXOP2of3(xop)       (((~(xop)) & 0x40) >> 6)
+#define bFromXOP2of3(xop)       (((~(xop)) & 0x20) >> 5)
+#define mmmmmFromXOP2of3(xop)   ((xop) & 0x1f)
+#define wFromXOP3of3(xop)       (((xop) & 0x80) >> 7)
+#define vvvvFromXOP3of3(vex)    (((~(vex)) & 0x78) >> 3)
+#define lFromXOP3of3(xop)       (((xop) & 0x4) >> 2)
+#define ppFromXOP3of3(xop)      ((xop) & 0x3)
+
 /*
  * These enums represent Intel registers for use by the decoder.
  */
@@ -447,6 +456,12 @@ typedef enum {
   VEX_LOB_0F3A = 0x3
 } VEXLeadingOpcodeByte;
 
+typedef enum {
+  XOP_MAP_SELECT_8 = 0x8,
+  XOP_MAP_SELECT_9 = 0x9,
+  XOP_MAP_SELECT_A = 0xA
+} XOPMapSelect;
+
 /*
  * VEXPrefixCode - Possible values for the VEX.pp field
  */
@@ -458,6 +473,13 @@ typedef enum {
   VEX_PREFIX_F2 = 0x3
 } VEXPrefixCode;
 
+typedef enum {
+  TYPE_NO_VEX_XOP = 0x0,
+  TYPE_VEX_2B = 0x1,
+  TYPE_VEX_3B = 0x2,
+  TYPE_XOP = 0x3
+} VEXXOPType;
+
 typedef uint8_t BOOL;
 
 /*
@@ -514,10 +536,10 @@ struct InternalInstruction {
   uint8_t prefixPresent[0x100];
   /* contains the location (for use with the reader) of the prefix byte */
   uint64_t prefixLocations[0x100];
-  /* The value of the VEX prefix, if present */
-  uint8_t vexPrefix[3];
+  /* The value of the VEX/XOP prefix, if present */
+  uint8_t vexXopPrefix[3];
   /* The length of the VEX prefix (0 if not present) */
-  uint8_t vexSize;
+  VEXXOPType vexXopType;
   /* The value of the REX prefix, if present */
   uint8_t rexPrefix;
   /* The location where a mandatory prefix would have to be (i.e., right before
@@ -541,10 +563,6 @@ struct InternalInstruction {
 
   /* opcode state */
 
-  /* The value of the two-byte escape prefix (usually 0x0f) */
-  uint8_t twoByteEscape;
-  /* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */
-  uint8_t threeByteEscape;
   /* The last byte of the opcode, not counting any ModR/M extension */
   uint8_t opcode;
   /* The ModR/M byte of the instruction, if it is an opcode extension */
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index d291441..dd1719c 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -32,6 +32,9 @@
 #define THREEBYTE3A_SYM   x86DisassemblerThreeByte3AOpcodes
 #define THREEBYTEA6_SYM   x86DisassemblerThreeByteA6Opcodes
 #define THREEBYTEA7_SYM   x86DisassemblerThreeByteA7Opcodes
+#define XOP8_MAP_SYM      x86DisassemblerXOP8Opcodes
+#define XOP9_MAP_SYM      x86DisassemblerXOP9Opcodes
+#define XOPA_MAP_SYM      x86DisassemblerXOPAOpcodes
 
 #define INSTRUCTIONS_STR  "x86DisassemblerInstrSpecifiers"
 #define CONTEXTS_STR      "x86DisassemblerContexts"
@@ -41,6 +44,9 @@
 #define THREEBYTE3A_STR   "x86DisassemblerThreeByte3AOpcodes"
 #define THREEBYTEA6_STR   "x86DisassemblerThreeByteA6Opcodes"
 #define THREEBYTEA7_STR   "x86DisassemblerThreeByteA7Opcodes"
+#define XOP8_MAP_STR      "x86DisassemblerXOP8Opcodes"
+#define XOP9_MAP_STR      "x86DisassemblerXOP9Opcodes"
+#define XOPA_MAP_STR      "x86DisassemblerXOPAOpcodes"
 
 /*
  * Attributes of an instruction that must be known before the opcode can be
@@ -116,10 +122,10 @@ enum attributeBits {
   ENUM_ENTRY(IC_VEX_L_XS,           4,  "requires VEX and the L and XS prefix")\
   ENUM_ENTRY(IC_VEX_L_XD,           4,  "requires VEX and the L and XD prefix")\
   ENUM_ENTRY(IC_VEX_L_OPSIZE,       4,  "requires VEX, L, and OpSize")         \
-  ENUM_ENTRY(IC_VEX_L_W,            3,  "requires VEX, L and W")               \
-  ENUM_ENTRY(IC_VEX_L_W_XS,         4,  "requires VEX, L, W and XS prefix")    \
-  ENUM_ENTRY(IC_VEX_L_W_XD,         4,  "requires VEX, L, W and XD prefix")    \
-  ENUM_ENTRY(IC_VEX_L_W_OPSIZE,     4,  "requires VEX, L, W and OpSize")       \
+  ENUM_ENTRY(IC_VEX_L_W,            4,  "requires VEX, L and W")               \
+  ENUM_ENTRY(IC_VEX_L_W_XS,         5,  "requires VEX, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_VEX_L_W_XD,         5,  "requires VEX, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_VEX_L_W_OPSIZE,     5,  "requires VEX, L, W and OpSize")       \
   ENUM_ENTRY(IC_EVEX,               1,  "requires an EVEX prefix")             \
   ENUM_ENTRY(IC_EVEX_XS,            2,  "requires EVEX and the XS prefix")     \
   ENUM_ENTRY(IC_EVEX_XD,            2,  "requires EVEX and the XD prefix")     \
@@ -215,7 +221,55 @@ enum attributeBits {
   ENUM_ENTRY(IC_EVEX_L2_W_K_B,        3,  "requires EVEX_B, EVEX_K, L2 and W")               \
   ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B,     4,  "requires EVEX_B, EVEX_K, L2, W and XS prefix")    \
   ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B,     4,  "requires EVEX_B, EVEX_K, L2, W and XD prefix")    \
-  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4,  "requires EVEX_B, EVEX_K, L2, W and OpSize") 
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4,  "requires EVEX_B, EVEX_K, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_KZ_B,             1,  "requires EVEX_B and EVEX_KZ prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_KZ_B,          2,  "requires EVEX_B, EVEX_KZ and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_KZ_B,          2,  "requires EVEX_B, EVEX_KZ and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B,      2,  "requires EVEX_B, EVEX_KZ and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_KZ_B,           3,  "requires EVEX_B, EVEX_KZ and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_KZ_B,        4,  "requires EVEX_B, EVEX_KZ, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_KZ_B,        4,  "requires EVEX_B, EVEX_KZ, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B,    4,  "requires EVEX_B, EVEX_KZ, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_KZ_B,           3,  "requires EVEX_B, EVEX_KZ and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_KZ_B,        4,  "requires EVEX_B, EVEX_KZ and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_KZ_B,        4,  "requires EVEX_B, EVEX_KZ and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B,    4,  "requires EVEX_B, EVEX_KZ, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_KZ_B,         3,  "requires EVEX_B, EVEX_KZ, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B,  4,  "requires EVEX_B, EVEX_KZ, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_KZ_B,          3,  "requires EVEX_B, EVEX_KZ and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B,       4,  "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B,       4,  "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B,   4,  "requires EVEX_B, EVEX_KZ, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_KZ_B,        3,  "requires EVEX_B, EVEX_KZ, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B,     4,  "requires EVEX_B, EVEX_KZ, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B,     4,  "requires EVEX_B, EVEX_KZ, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4,  "requires EVEX_B, EVEX_KZ, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_KZ,             1,  "requires an EVEX_KZ prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_KZ,          2,  "requires EVEX_KZ and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_KZ,          2,  "requires EVEX_KZ and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_KZ,      2,  "requires EVEX_KZ and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_KZ,           3,  "requires EVEX_KZ and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_KZ,        4,  "requires EVEX_KZ, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_KZ,        4,  "requires EVEX_KZ, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ,    4,  "requires EVEX_KZ, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_KZ,           3,  "requires EVEX_KZ and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_KZ,        4,  "requires EVEX_KZ and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_KZ,        4,  "requires EVEX_KZ and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ,    4,  "requires EVEX_KZ, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_KZ,         3,  "requires EVEX_KZ, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_KZ,      4,  "requires EVEX_KZ, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_KZ,      4,  "requires EVEX_KZ, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ,  4,  "requires EVEX_KZ, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_KZ,          3,  "requires EVEX_KZ and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_KZ,       4,  "requires EVEX_KZ and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_KZ,       4,  "requires EVEX_KZ and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ,   4,  "requires EVEX_KZ, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_KZ,        3,  "requires EVEX_KZ, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ,     4,  "requires EVEX_KZ, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ,     4,  "requires EVEX_KZ, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4,  "requires EVEX_KZ, L2, W and OpSize")     
 
 #define ENUM_ENTRY(n, r, d) n,
 typedef enum {
@@ -234,7 +288,10 @@ typedef enum {
   THREEBYTE_38  = 2,
   THREEBYTE_3A  = 3,
   THREEBYTE_A6  = 4,
-  THREEBYTE_A7  = 5
+  THREEBYTE_A7  = 5,
+  XOP8_MAP      = 6,
+  XOP9_MAP      = 7,
+  XOPA_MAP      = 8
 } OpcodeType;
 
 /*
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index b9d0082..4439311 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -215,3 +215,19 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
 
   O << markup(">");
 }
+
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                       raw_ostream &O) {
+  const MCOperand &DispSpec = MI->getOperand(Op);
+
+  O << markup("<mem:");
+
+  if (DispSpec.isImm()) {
+    O << formatImm(DispSpec.getImm());
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement?");
+    O << *DispSpec.getExpr();
+  }
+
+  O << markup(">");
+}
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 8d05256..a8fab72 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -42,7 +42,8 @@ public:
   void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
   void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
   void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
-  
+  void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+
   void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     printMemReference(MI, OpNo, O);
   }
@@ -86,6 +87,19 @@ public:
   void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     printMemReference(MI, OpNo, O);
   }
+
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
 };
   
 }
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 9dfc9a9..e7e7b15 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -200,3 +200,19 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
   
   O << ']';
 }
+
+void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                         raw_ostream &O) {
+  const MCOperand &DispSpec = MI->getOperand(Op);
+
+  O << '[';
+
+  if (DispSpec.isImm()) {
+    O << formatImm(DispSpec.getImm());
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement?");
+    O << *DispSpec.getExpr();
+  }
+
+  O << ']';
+}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index 45beeda..590bf68 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -39,7 +39,8 @@ public:
   void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O);
   void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
   void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  
+  void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
   void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     O << "opaque ptr ";
     printMemReference(MI, OpNo, O);
@@ -97,6 +98,23 @@ public:
     O << "zmmword ptr ";
     printMemReference(MI, OpNo, O);
   }
+
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "byte ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "word ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
 };
   
 }
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 598ddee..f8e359b 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -9,6 +9,7 @@
 
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -19,10 +20,10 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
@@ -67,9 +68,16 @@ public:
 
 class X86AsmBackend : public MCAsmBackend {
   StringRef CPU;
+  bool HasNopl;
 public:
   X86AsmBackend(const Target &T, StringRef _CPU)
-    : MCAsmBackend(), CPU(_CPU) {}
+    : MCAsmBackend(), CPU(_CPU) {
+    HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
+              CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
+              CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
+              CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
+              CPU != "c3" && CPU != "c3-2";
+  }
 
   unsigned getNumFixupKinds() const {
     return X86::NumTargetFixupKinds;
@@ -308,8 +316,8 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
 
   // This CPU doesnt support long nops. If needed add more.
   // FIXME: Can we get this from the subtarget somehow?
-  if (CPU == "generic" || CPU == "i386" || CPU == "i486" || CPU == "i586" ||
-      CPU == "pentium" || CPU == "pentium-mmx" || CPU == "geode") {
+  // FIXME: We could generated something better than plain 0x90.
+  if (!HasNopl) {
     for (uint64_t i = 0; i < Count; ++i)
       OW->Write8(0x90);
     return true;
@@ -334,6 +342,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
 /* *** */
 
 namespace {
+
 class ELFX86AsmBackend : public X86AsmBackend {
 public:
   uint8_t OSABI;
@@ -382,35 +391,368 @@ public:
   }
 };
 
+namespace CU {
+
+  /// Compact unwind encoding values.
+  enum CompactUnwindEncodings {
+    /// [RE]BP based frame where [RE]BP is pused on the stack immediately after
+    /// the return address, then [RE]SP is moved to [RE]BP.
+    UNWIND_MODE_BP_FRAME                   = 0x01000000,
+
+    /// A frameless function with a small constant stack size.
+    UNWIND_MODE_STACK_IMMD                 = 0x02000000,
+
+    /// A frameless function with a large constant stack size.
+    UNWIND_MODE_STACK_IND                  = 0x03000000,
+
+    /// No compact unwind encoding is available.
+    UNWIND_MODE_DWARF                      = 0x04000000,
+
+    /// Mask for encoding the frame registers.
+    UNWIND_BP_FRAME_REGISTERS              = 0x00007FFF,
+
+    /// Mask for encoding the frameless registers.
+    UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+  };
+
+} // end CU namespace
+
 class DarwinX86AsmBackend : public X86AsmBackend {
+  const MCRegisterInfo &MRI;
+
+  /// \brief Number of registers that can be saved in a compact unwind encoding.
+  enum { CU_NUM_SAVED_REGS = 6 };
+
+  mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
+  bool Is64Bit;
+
+  unsigned OffsetSize;                   ///< Offset of a "push" instruction.
+  unsigned PushInstrSize;                ///< Size of a "push" instruction.
+  unsigned MoveInstrSize;                ///< Size of a "move" instruction.
+  unsigned StackDivide;                  ///< Amount to adjust stack stize by.
+protected:
+  /// \brief Implementation of algorithm to generate the compact unwind encoding
+  /// for the CFI instructions.
+  uint32_t
+  generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
+    if (Instrs.empty()) return 0;
+
+    // Reset the saved registers.
+    unsigned SavedRegIdx = 0;
+    memset(SavedRegs, 0, sizeof(SavedRegs));
+
+    bool HasFP = false;
+
+    // Encode that we are using EBP/RBP as the frame pointer.
+    uint32_t CompactUnwindEncoding = 0;
+
+    unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
+    unsigned InstrOffset = 0;
+    unsigned StackAdjust = 0;
+    unsigned StackSize = 0;
+    unsigned PrevStackSize = 0;
+    unsigned NumDefCFAOffsets = 0;
+
+    for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+      const MCCFIInstruction &Inst = Instrs[i];
+
+      switch (Inst.getOperation()) {
+      default:
+        // Any other CFI directives indicate a frame that we aren't prepared
+        // to represent via compact unwind, so just bail out.
+        return 0;
+      case MCCFIInstruction::OpDefCfaRegister: {
+        // Defines a frame pointer. E.g.
+        //
+        //     movq %rsp, %rbp
+        //  L0:
+        //     .cfi_def_cfa_register %rbp
+        //
+        HasFP = true;
+        assert(MRI.getLLVMRegNum(Inst.getRegister(), true) ==
+               (Is64Bit ? X86::RBP : X86::EBP) && "Invalid frame pointer!");
+
+        // Reset the counts.
+        memset(SavedRegs, 0, sizeof(SavedRegs));
+        StackAdjust = 0;
+        SavedRegIdx = 0;
+        InstrOffset += MoveInstrSize;
+        break;
+      }
+      case MCCFIInstruction::OpDefCfaOffset: {
+        // Defines a new offset for the CFA. E.g.
+        //
+        //  With frame:
+        //  
+        //     pushq %rbp
+        //  L0:
+        //     .cfi_def_cfa_offset 16
+        //
+        //  Without frame:
+        //
+        //     subq $72, %rsp
+        //  L0:
+        //     .cfi_def_cfa_offset 80
+        //
+        PrevStackSize = StackSize;
+        StackSize = std::abs(Inst.getOffset()) / StackDivide;
+        ++NumDefCFAOffsets;
+        break;
+      }
+      case MCCFIInstruction::OpOffset: {
+        // Defines a "push" of a callee-saved register. E.g.
+        //
+        //     pushq %r15
+        //     pushq %r14
+        //     pushq %rbx
+        //  L0:
+        //     subq $120, %rsp
+        //  L1:
+        //     .cfi_offset %rbx, -40
+        //     .cfi_offset %r14, -32
+        //     .cfi_offset %r15, -24
+        //
+        if (SavedRegIdx == CU_NUM_SAVED_REGS)
+          // If there are too many saved registers, we cannot use a compact
+          // unwind encoding.
+          return CU::UNWIND_MODE_DWARF;
+
+        unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+        SavedRegs[SavedRegIdx++] = Reg;
+        StackAdjust += OffsetSize;
+        InstrOffset += PushInstrSize;
+        break;
+      }
+      }
+    }
+
+    StackAdjust /= StackDivide;
+
+    if (HasFP) {
+      if ((StackAdjust & 0xFF) != StackAdjust)
+        // Offset was too big for a compact unwind encoding.
+        return CU::UNWIND_MODE_DWARF;
+
+      // Get the encoding of the saved registers when we have a frame pointer.
+      uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame();
+      if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+      CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
+      CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+      CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
+    } else {
+      // If the amount of the stack allocation is the size of a register, then
+      // we "push" the RAX/EAX register onto the stack instead of adjusting the
+      // stack pointer with a SUB instruction. We don't support the push of the
+      // RAX/EAX register with compact unwind. So we check for that situation
+      // here.
+      if ((NumDefCFAOffsets == SavedRegIdx + 1 &&
+           StackSize - PrevStackSize == 1) ||
+          (Instrs.size() == 1 && NumDefCFAOffsets == 1 && StackSize == 2))
+        return CU::UNWIND_MODE_DWARF;
+
+      SubtractInstrIdx += InstrOffset;
+      ++StackAdjust;
+
+      if ((StackSize & 0xFF) == StackSize) {
+        // Frameless stack with a small stack size.
+        CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
+
+        // Encode the stack size.
+        CompactUnwindEncoding |= (StackSize & 0xFF) << 16;
+      } else {
+        if ((StackAdjust & 0x7) != StackAdjust)
+          // The extra stack adjustments are too big for us to handle.
+          return CU::UNWIND_MODE_DWARF;
+
+        // Frameless stack with an offset too large for us to encode compactly.
+        CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
+
+        // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+        // instruction.
+        CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+        // Encode any extra stack stack adjustments (done via push
+        // instructions).
+        CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+      }
+
+      // Encode the number of registers saved. (Reverse the list first.)
+      std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]);
+      CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+      // Get the encoding of the saved registers when we don't have a frame
+      // pointer.
+      uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx);
+      if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+      // Encode the register encoding.
+      CompactUnwindEncoding |=
+        RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
+    }
+
+    return CompactUnwindEncoding;
+  }
+
+private:
+  /// \brief Get the compact unwind number for a given register. The number
+  /// corresponds to the enum lists in compact_unwind_encoding.h.
+  int getCompactUnwindRegNum(unsigned Reg) const {
+    static const uint16_t CU32BitRegs[7] = {
+      X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+    };
+    static const uint16_t CU64BitRegs[] = {
+      X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+    };
+    const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+    for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+      if (*CURegs == Reg)
+        return Idx;
+
+    return -1;
+  }
+
+  /// \brief Return the registers encoded for a compact encoding with a frame
+  /// pointer.
+  uint32_t encodeCompactUnwindRegistersWithFrame() const {
+    // Encode the registers in the order they were saved --- 3-bits per
+    // register. The list of saved registers is assumed to be in reverse
+    // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
+    uint32_t RegEnc = 0;
+    for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
+      unsigned Reg = SavedRegs[i];
+      if (Reg == 0) break;
+
+      int CURegNum = getCompactUnwindRegNum(Reg);
+      if (CURegNum == -1) return ~0U;
+
+      // Encode the 3-bit register number in order, skipping over 3-bits for
+      // each register.
+      RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+    }
+
+    assert((RegEnc & 0x3FFFF) == RegEnc &&
+           "Invalid compact register encoding!");
+    return RegEnc;
+  }
+
+  /// \brief Create the permutation encoding used with frameless stacks. It is
+  /// passed the number of registers to be saved and an array of the registers
+  /// saved.
+  uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
+    // The saved registers are numbered from 1 to 6. In order to encode the
+    // order in which they were saved, we re-number them according to their
+    // place in the register order. The re-numbering is relative to the last
+    // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
+    // that order:
+    //
+    //    Orig  Re-Num
+    //    ----  ------
+    //     6       6
+    //     2       2
+    //     4       3
+    //     5       3
+    //
+    for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
+      int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
+      if (CUReg == -1) return ~0U;
+      SavedRegs[i] = CUReg;
+    }
+
+    // Reverse the list.
+    std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
+
+    uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+    for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
+      unsigned Countless = 0;
+      for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+        if (SavedRegs[j] < SavedRegs[i])
+          ++Countless;
+
+      RenumRegs[i] = SavedRegs[i] - Countless - 1;
+    }
+
+    // Take the renumbered values and encode them into a 10-bit number.
+    uint32_t permutationEncoding = 0;
+    switch (RegCount) {
+    case 6:
+      permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+                             + 6 * RenumRegs[2] +  2 * RenumRegs[3]
+                             +     RenumRegs[4];
+      break;
+    case 5:
+      permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+                             + 6 * RenumRegs[3] +  2 * RenumRegs[4]
+                             +     RenumRegs[5];
+      break;
+    case 4:
+      permutationEncoding |=  60 * RenumRegs[2] + 12 * RenumRegs[3]
+                             + 3 * RenumRegs[4] +      RenumRegs[5];
+      break;
+    case 3:
+      permutationEncoding |=  20 * RenumRegs[3] +  4 * RenumRegs[4]
+                             +     RenumRegs[5];
+      break;
+    case 2:
+      permutationEncoding |=   5 * RenumRegs[4] +      RenumRegs[5];
+      break;
+    case 1:
+      permutationEncoding |=       RenumRegs[5];
+      break;
+    }
+
+    assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+           "Invalid compact register encoding!");
+    return permutationEncoding;
+  }
+
 public:
-  DarwinX86AsmBackend(const Target &T, StringRef CPU)
-    : X86AsmBackend(T, CPU) { }
+  DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU,
+                      bool Is64Bit)
+    : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) {
+    memset(SavedRegs, 0, sizeof(SavedRegs));
+    OffsetSize = Is64Bit ? 8 : 4;
+    MoveInstrSize = Is64Bit ? 3 : 2;
+    StackDivide = Is64Bit ? 8 : 4;
+    PushInstrSize = 1;
+  }
 };
 
 class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+  bool SupportsCU;
 public:
-  DarwinX86_32AsmBackend(const Target &T, StringRef CPU)
-    : DarwinX86AsmBackend(T, CPU) {}
+  DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                         StringRef CPU, bool SupportsCU)
+    : DarwinX86AsmBackend(T, MRI, CPU, false), SupportsCU(SupportsCU) {}
 
   MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
     return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
-                                     object::mach::CTM_i386,
-                                     object::mach::CSX86_ALL);
+                                     MachO::CPU_TYPE_I386,
+                                     MachO::CPU_SUBTYPE_I386_ALL);
+  }
+
+  /// \brief Generate the compact unwind encoding for the CFI instructions.
+  virtual uint32_t
+  generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const {
+    return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
   }
 };
 
 class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+  bool SupportsCU;
+  const MachO::CPUSubTypeX86 Subtype;
 public:
-  DarwinX86_64AsmBackend(const Target &T, StringRef CPU)
-    : DarwinX86AsmBackend(T, CPU) {
+  DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                         StringRef CPU, bool SupportsCU,
+                         MachO::CPUSubTypeX86 st)
+    : DarwinX86AsmBackend(T, MRI, CPU, true), SupportsCU(SupportsCU),
+      Subtype(st) {
     HasReliableSymbolDifference = true;
   }
 
   MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
     return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
-                                     object::mach::CTM_x86_64,
-                                     object::mach::CSX86_ALL);
+                                     MachO::CPU_TYPE_X86_64, Subtype);
   }
 
   virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
@@ -445,15 +787,26 @@ public:
       return false;
     }
   }
+
+  /// \brief Generate the compact unwind encoding for the CFI instructions.
+  virtual uint32_t
+  generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const {
+    return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
+  }
 };
 
 } // end anonymous namespace
 
-MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           StringRef TT,
+                                           StringRef CPU) {
   Triple TheTriple(TT);
 
   if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
-    return new DarwinX86_32AsmBackend(T, CPU);
+    return new DarwinX86_32AsmBackend(T, MRI, CPU,
+                                      TheTriple.isMacOSX() &&
+                                      !TheTriple.isMacOSXVersionLT(10, 7));
 
   if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
     return new WindowsX86AsmBackend(T, false, CPU);
@@ -462,11 +815,21 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, String
   return new ELFX86_32AsmBackend(T, OSABI, CPU);
 }
 
-MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           StringRef TT,
+                                           StringRef CPU) {
   Triple TheTriple(TT);
 
-  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
-    return new DarwinX86_64AsmBackend(T, CPU);
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
+    MachO::CPUSubTypeX86 CS =
+        StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
+            .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
+            .Default(MachO::CPU_SUBTYPE_X86_64_ALL);
+    return new DarwinX86_64AsmBackend(T, MRI, CPU,
+                                      TheTriple.isMacOSX() &&
+                                      !TheTriple.isMacOSXVersionLT(10, 7), CS);
+  }
 
   if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
     return new WindowsX86AsmBackend(T, true, CPU);
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 25d1af3..1ef9814 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -354,6 +354,9 @@ namespace X86II {
     // XOP9 - Prefix to exclude use of imm byte.
     XOP9 = 21 << Op0Shift,
 
+    // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
+    XOPA = 22 << Op0Shift,
+
     //===------------------------------------------------------------------===//
     // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
     // They are used to specify GPRs and SSE registers, 64-bit operand size,
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index b400b87..3ddd865 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -108,6 +108,15 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
         case MCSymbolRefExpr::VK_None:
           Type = ELF::R_X86_64_64;
           break;
+        case MCSymbolRefExpr::VK_GOT:
+          Type = ELF::R_X86_64_GOT64;
+          break;
+        case MCSymbolRefExpr::VK_GOTOFF:
+          Type = ELF::R_X86_64_GOTOFF64;
+          break;
+        case MCSymbolRefExpr::VK_TPOFF:
+          Type = ELF::R_X86_64_TPOFF64;
+          break;
         case MCSymbolRefExpr::VK_DTPOFF:
           Type = ELF::R_X86_64_DTPOFF64;
           break;
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
index 8f4ab46..a3eb4fb 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -13,7 +13,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCRelocationInfo.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/ELF.h"
 
 using namespace llvm;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 7815ae9..3861e1c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -59,10 +59,8 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
   // for .S files on other systems.  Perhaps this is because the file system
   // wasn't always case preserving or something.
   CommentString = "##";
-  PCSymbol = ".";
 
   SupportsDebugInformation = true;
-  DwarfUsesInlineInfoSection = true;
   UseDataRegionDirectives = MarkedJTDataRegions;
 
   // Exceptions handling
@@ -92,8 +90,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
   TextAlignFillValue = 0x90;
 
   PrivateGlobalPrefix = ".L";
-  WeakRefDirective = "\t.weak\t";
-  PCSymbol = ".";
 
   // Set up DWARF directives
   HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
@@ -139,6 +135,8 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
   AssemblerDialect = AsmWriterFlavor;
 
   TextAlignFillValue = 0x90;
+
+  AllowAtInName = true;
 }
 
 void X86MCAsmInfoGNUCOFF::anchor() { }
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index b6b70fd..80979dd 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -17,6 +17,7 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
   class Triple;
@@ -35,7 +36,7 @@ namespace llvm {
                                 MCStreamer &Streamer) const;
   };
 
-  class X86ELFMCAsmInfo : public MCAsmInfo {
+  class X86ELFMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
     explicit X86ELFMCAsmInfo(const Triple &Triple);
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 8515879..7952607 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -564,7 +564,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   unsigned char VEX_W = 0;
 
   // XOP: Use XOP prefix byte 0x8f instead of VEX.
-  unsigned char XOP = 0;
+  bool XOP = false;
 
   // VEX_5M (VEX m-mmmmm field):
   //
@@ -574,7 +574,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   //  0b00011: implied 0F 3A leading opcode bytes
   //  0b00100-0b11111: Reserved for future use
   //  0b01000: XOP map select - 08h instructions with imm byte
-  //  0b10001: XOP map select - 09h instructions with no imm byte
+  //  0b01001: XOP map select - 09h instructions with no imm byte
+  //  0b01010: XOP map select - 0Ah instructions with imm dword
   unsigned char VEX_5M = 0x1;
 
   // VEX_4V (VEX vvvv field): a register specifier
@@ -620,7 +621,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     VEX_W = 1;
 
   if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
-    XOP = 1;
+    XOP = true;
 
   if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
     VEX_L = 1;
@@ -665,11 +666,11 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   case X86II::XOP9:
     VEX_5M = 0x9;
     break;
-  case X86II::A6:  // Bypass: Not used by VEX
-  case X86II::A7:  // Bypass: Not used by VEX
-  case X86II::TB:  // Bypass: Not used by VEX
-  case 0:
-    break;  // No prefix!
+  case X86II::XOPA:
+    VEX_5M = 0xA;
+    break;
+  case X86II::TB: // VEX_5M/VEX_PP already correct
+    break;
   }
 
 
@@ -786,8 +787,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
       VEX_4V = getVEXRegisterEncoding(MI, CurOp);
       if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
         EVEX_V2 = 0x0;
+      CurOp++;
     }
-    CurOp++;
 
     if (HasEVEX_K)
       EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
@@ -868,11 +869,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   case X86II::MRM6r: case X86II::MRM7r:
     // MRM0r-MRM7r instructions forms:
     //  dst(VEX_4V), src(ModR/M), imm8
-    VEX_4V = getVEXRegisterEncoding(MI, CurOp);
-    if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
-        EVEX_V2 = 0x0;
-    CurOp++;
-    
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+          EVEX_V2 = 0x0;
+      CurOp++;
+    }    
     if (HasEVEX_K)
       EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
 
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index bd23ce4..1cbdafd 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -368,7 +368,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
   if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
     return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
 
-  return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+  return createELFStreamer(Ctx, 0, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
 }
 
 static MCInstPrinter *createX86MCInstPrinter(const Target &T,
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 2f459b4..41ae435 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -79,8 +79,10 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
                                       const MCSubtargetInfo &STI,
                                       MCContext &Ctx);
 
-MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU);
-MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     StringRef TT, StringRef CPU);
 
 /// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
 MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
index 75b5acf..209b1d0 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -17,7 +17,7 @@
 
 using namespace llvm;
 using namespace object;
-using namespace macho;
+using namespace MachO;
 
 namespace {
 class X86_64MachORelocationInfo : public MCRelocationInfo {
@@ -33,7 +33,7 @@ public:
     StringRef SymName; SymI->getName(SymName);
     uint64_t  SymAddr; SymI->getAddress(SymAddr);
 
-    RelocationEntry RE = Obj->getRelocation(Rel.getRawDataRefImpl());
+    any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl());
     bool isPCRel = Obj->getAnyRelocationPCRel(RE);
 
     MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
@@ -43,44 +43,44 @@ public:
     const MCExpr *Expr = 0;
 
     switch(RelType) {
-    case RIT_X86_64_TLV:
+    case X86_64_RELOC_TLV:
       Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
       break;
-    case RIT_X86_64_Signed4:
+    case X86_64_RELOC_SIGNED_4:
       Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
                                      MCConstantExpr::Create(4, Ctx),
                                      Ctx);
       break;
-    case RIT_X86_64_Signed2:
+    case X86_64_RELOC_SIGNED_2:
       Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
                                      MCConstantExpr::Create(2, Ctx),
                                      Ctx);
       break;
-    case RIT_X86_64_Signed1:
+    case X86_64_RELOC_SIGNED_1:
       Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
                                      MCConstantExpr::Create(1, Ctx),
                                      Ctx);
       break;
-    case RIT_X86_64_GOTLoad:
+    case X86_64_RELOC_GOT_LOAD:
       Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
       break;
-    case RIT_X86_64_GOT:
+    case X86_64_RELOC_GOT:
       Expr = MCSymbolRefExpr::Create(Sym, isPCRel ?
                                      MCSymbolRefExpr::VK_GOTPCREL :
                                      MCSymbolRefExpr::VK_GOT,
                                      Ctx);
       break;
-    case RIT_X86_64_Subtractor:
+    case X86_64_RELOC_SUBTRACTOR:
       {
         RelocationRef RelNext;
         Obj->getRelocationNext(Rel.getRawDataRefImpl(), RelNext);
-        RelocationEntry RENext = Obj->getRelocation(RelNext.getRawDataRefImpl());
+        any_relocation_info RENext = Obj->getRelocation(RelNext.getRawDataRefImpl());
 
         // X86_64_SUBTRACTOR must be followed by a relocation of type
-        // X86_64_RELOC_UNSIGNED    .
+        // X86_64_RELOC_UNSIGNED.
         // NOTE: Scattered relocations don't exist on x86_64.
         unsigned RType = Obj->getAnyRelocationType(RENext);
-        if (RType != RIT_X86_64_Unsigned)
+        if (RType != X86_64_RELOC_UNSIGNED)
           report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
                              "X86_64_RELOC_SUBTRACTOR.");
 
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 64f005c..eb7c0b1 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -16,12 +16,11 @@
 #include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
 
 using namespace llvm;
-using namespace llvm::object;
 
 namespace {
 class X86MachObjectWriter : public MCMachObjectTargetWriter {
@@ -132,7 +131,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
 
   if (Target.isAbsolute()) { // constant
     // SymbolNum of 0 indicates the absolute section.
-    Type = macho::RIT_X86_64_Unsigned;
+    Type = MachO::X86_64_RELOC_UNSIGNED;
     Index = 0;
 
     // FIXME: I believe this is broken, I don't think the linker can understand
@@ -141,26 +140,31 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
     // is to use an absolute symbol (which we don't support yet).
     if (IsPCRel) {
       IsExtern = 1;
-      Type = macho::RIT_X86_64_Branch;
+      Type = MachO::X86_64_RELOC_BRANCH;
     }
   } else if (Target.getSymB()) { // A - B + constant
     const MCSymbol *A = &Target.getSymA()->getSymbol();
+    if (A->isTemporary())
+      A = &A->AliasedSymbol();
     MCSymbolData &A_SD = Asm.getSymbolData(*A);
     const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
 
     const MCSymbol *B = &Target.getSymB()->getSymbol();
+    if (B->isTemporary())
+      B = &B->AliasedSymbol();
     MCSymbolData &B_SD = Asm.getSymbolData(*B);
     const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
 
     // Neither symbol can be modified.
     if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
         Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
-      report_fatal_error("unsupported relocation of modified symbol");
+      report_fatal_error("unsupported relocation of modified symbol", false);
 
     // We don't support PCrel relocations of differences. Darwin 'as' doesn't
     // implement most of these correctly.
     if (IsPCRel)
-      report_fatal_error("unsupported pc-relative relocation of difference");
+      report_fatal_error("unsupported pc-relative relocation of difference",
+                         false);
 
     // The support for the situation where one or both of the symbols would
     // require a local relocation is handled just like if the symbols were
@@ -173,7 +177,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
     // single SIGNED relocation); reject it for now.  Except the case where both
     // symbols don't have a base, equal but both NULL.
     if (A_Base == B_Base && A_Base)
-      report_fatal_error("unsupported relocation with identical base");
+      report_fatal_error("unsupported relocation with identical base", false);
+
+    // A subtraction expression where both symbols are undefined is a
+    // non-relocatable expression.
+    if (A->isUndefined() && B->isUndefined())
+      report_fatal_error("unsupported relocation with subtraction expression",
+                         false);
 
     Value += Writer->getSymbolAddress(&A_SD, Layout) -
       (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout));
@@ -188,15 +198,15 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
       Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
       IsExtern = 0;
     }
-    Type = macho::RIT_X86_64_Unsigned;
-
-    macho::RelocationEntry MRE;
-    MRE.Word0 = FixupOffset;
-    MRE.Word1 = ((Index     <<  0) |
-                 (IsPCRel   << 24) |
-                 (Log2Size  << 25) |
-                 (IsExtern  << 27) |
-                 (Type      << 28));
+    Type = MachO::X86_64_RELOC_UNSIGNED;
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = FixupOffset;
+    MRE.r_word1 = ((Index     <<  0) |
+                   (IsPCRel   << 24) |
+                   (Log2Size  << 25) |
+                   (IsExtern  << 27) |
+                   (Type      << 28));
     Writer->addRelocation(Fragment->getParent(), MRE);
 
     if (B_Base) {
@@ -207,7 +217,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
       Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
       IsExtern = 0;
     }
-    Type = macho::RIT_X86_64_Subtractor;
+    Type = MachO::X86_64_RELOC_SUBTRACTOR;
   } else {
     const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
     MCSymbolData &SD = Asm.getSymbolData(*Symbol);
@@ -252,11 +262,11 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
         return;
       } else {
         report_fatal_error("unsupported relocation of variable '" +
-                           Symbol->getName() + "'");
+                           Symbol->getName() + "'", false);
       }
     } else {
       report_fatal_error("unsupported relocation of undefined symbol '" +
-                         Symbol->getName() + "'");
+                         Symbol->getName() + "'", false);
     }
 
     MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
@@ -267,15 +277,16 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
           // rewrite the movq to an leaq at link time if the symbol ends up in
           // the same linkage unit.
           if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
-            Type = macho::RIT_X86_64_GOTLoad;
+            Type = MachO::X86_64_RELOC_GOT_LOAD;
           else
-            Type = macho::RIT_X86_64_GOT;
+            Type = MachO::X86_64_RELOC_GOT;
         }  else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
-          Type = macho::RIT_X86_64_TLV;
+          Type = MachO::X86_64_RELOC_TLV;
         }  else if (Modifier != MCSymbolRefExpr::VK_None) {
-          report_fatal_error("unsupported symbol modifier in relocation");
+          report_fatal_error("unsupported symbol modifier in relocation",
+                             false);
         } else {
-          Type = macho::RIT_X86_64_Signed;
+          Type = MachO::X86_64_RELOC_SIGNED;
 
           // The Darwin x86_64 relocation format has a problem where it cannot
           // encode an address (L<foo> + <constant>) which is outside the atom
@@ -292,34 +303,40 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
           // (the additional bias), but instead appear to just look at the final
           // offset.
           switch (-(Target.getConstant() + (1LL << Log2Size))) {
-          case 1: Type = macho::RIT_X86_64_Signed1; break;
-          case 2: Type = macho::RIT_X86_64_Signed2; break;
-          case 4: Type = macho::RIT_X86_64_Signed4; break;
+          case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break;
+          case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break;
+          case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break;
           }
         }
       } else {
         if (Modifier != MCSymbolRefExpr::VK_None)
           report_fatal_error("unsupported symbol modifier in branch "
-                             "relocation");
+                             "relocation", false);
 
-        Type = macho::RIT_X86_64_Branch;
+        Type = MachO::X86_64_RELOC_BRANCH;
       }
     } else {
       if (Modifier == MCSymbolRefExpr::VK_GOT) {
-        Type = macho::RIT_X86_64_GOT;
+        Type = MachO::X86_64_RELOC_GOT;
       } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
         // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
         // case all we do is set the PCrel bit in the relocation entry; this is
         // used with exception handling, for example. The source is required to
         // include any necessary offset directly.
-        Type = macho::RIT_X86_64_GOT;
+        Type = MachO::X86_64_RELOC_GOT;
         IsPCRel = 1;
       } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
-        report_fatal_error("TLVP symbol modifier should have been rip-rel");
+        report_fatal_error("TLVP symbol modifier should have been rip-rel",
+                           false);
       } else if (Modifier != MCSymbolRefExpr::VK_None)
-        report_fatal_error("unsupported symbol modifier in relocation");
-      else
-        Type = macho::RIT_X86_64_Unsigned;
+        report_fatal_error("unsupported symbol modifier in relocation", false);
+      else {
+        Type = MachO::X86_64_RELOC_UNSIGNED;
+        unsigned Kind = Fixup.getKind();
+        if (Kind == X86::reloc_signed_4byte)
+          report_fatal_error("32-bit absolute addressing is not supported in "
+                             "64-bit mode", false);
+      }
     }
   }
 
@@ -327,13 +344,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
   FixedValue = Value;
 
   // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = FixupOffset;
-  MRE.Word1 = ((Index     <<  0) |
-               (IsPCRel   << 24) |
-               (Log2Size  << 25) |
-               (IsExtern  << 27) |
-               (Type      << 28));
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
@@ -347,7 +364,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
                                                     uint64_t &FixedValue) {
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Type = macho::RIT_Vanilla;
+  unsigned Type = MachO::GENERIC_RELOC_VANILLA;
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
@@ -355,7 +372,8 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
 
   if (!A_SD->getFragment())
     report_fatal_error("symbol '" + A->getName() +
-                       "' can not be undefined in a subtraction expression");
+                       "' can not be undefined in a subtraction expression",
+                       false);
 
   uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
   uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
@@ -367,22 +385,23 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
 
     if (!B_SD->getFragment())
       report_fatal_error("symbol '" + B->getSymbol().getName() +
-                         "' can not be undefined in a subtraction expression");
+                         "' can not be undefined in a subtraction expression",
+                         false);
 
     // Select the appropriate difference relocation type.
     //
     // Note that there is no longer any semantic difference between these two
     // relocation types from the linkers point of view, this is done solely for
     // pedantic compatibility with 'as'.
-    Type = A_SD->isExternal() ? (unsigned)macho::RIT_Difference :
-      (unsigned)macho::RIT_Generic_LocalDifference;
+    Type = A_SD->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF :
+      (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
     Value2 = Writer->getSymbolAddress(B_SD, Layout);
     FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
   }
 
   // Relocations are written out in reverse order, so the PAIR comes first.
-  if (Type == macho::RIT_Difference ||
-      Type == macho::RIT_Generic_LocalDifference) {
+  if (Type == MachO::GENERIC_RELOC_SECTDIFF ||
+      Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) {
     // If the offset is too large to fit in a scattered relocation,
     // we're hosed. It's an unfortunate limitation of the MachO format.
     if (FixupOffset > 0xffffff) {
@@ -396,13 +415,13 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
       llvm_unreachable("fatal error returned?!");
     }
 
-    macho::RelocationEntry MRE;
-    MRE.Word0 = ((0         <<  0) |
-                 (macho::RIT_Pair  << 24) |
-                 (Log2Size  << 28) |
-                 (IsPCRel   << 30) |
-                 macho::RF_Scattered);
-    MRE.Word1 = Value2;
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((0                         <<  0) | // r_address
+                   (MachO::GENERIC_RELOC_PAIR << 24) | // r_type
+                   (Log2Size                  << 28) |
+                   (IsPCRel                   << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
     Writer->addRelocation(Fragment->getParent(), MRE);
   } else {
     // If the offset is more than 24-bits, it won't fit in a scattered
@@ -416,13 +435,13 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
       return false;
   }
 
-  macho::RelocationEntry MRE;
-  MRE.Word0 = ((FixupOffset <<  0) |
-               (Type        << 24) |
-               (Log2Size    << 28) |
-               (IsPCRel     << 30) |
-               macho::RF_Scattered);
-  MRE.Word1 = Value;
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (Log2Size    << 28) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
   Writer->addRelocation(Fragment->getParent(), MRE);
   return true;
 }
@@ -464,13 +483,13 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
   }
 
   // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = Value;
-  MRE.Word1 = ((Index                  <<  0) |
-               (IsPCRel                << 24) |
-               (Log2Size               << 25) |
-               (1                      << 27) | // Extern
-               (macho::RIT_Generic_TLV << 28)); // Type
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = Value;
+  MRE.r_word1 = ((Index                    <<  0) |
+                 (IsPCRel                  << 24) |
+                 (Log2Size                 << 25) |
+                 (1                        << 27) | // r_extern
+                 (MachO::GENERIC_RELOC_TLV << 28)); // r_type
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
@@ -530,7 +549,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
     //
     // FIXME: Currently, these are never generated (see code below). I cannot
     // find a case where they are actually emitted.
-    Type = macho::RIT_Vanilla;
+    Type = MachO::GENERIC_RELOC_VANILLA;
   } else {
     // Resolve constant variables.
     if (SD->getSymbol().isVariable()) {
@@ -561,17 +580,17 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
     if (IsPCRel)
       FixedValue -= Writer->getSectionAddress(Fragment->getParent());
 
-    Type = macho::RIT_Vanilla;
+    Type = MachO::GENERIC_RELOC_VANILLA;
   }
 
   // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = FixupOffset;
-  MRE.Word1 = ((Index     <<  0) |
-               (IsPCRel   << 24) |
-               (Log2Size  << 25) |
-               (IsExtern  << 27) |
-               (Type      << 28));
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index ed64a32..6da4142 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -27,7 +27,7 @@ namespace {
 
   public:
     X86WinCOFFObjectWriter(bool Is64Bit_);
-    ~X86WinCOFFObjectWriter();
+    virtual ~X86WinCOFFObjectWriter();
 
     virtual unsigned getRelocType(const MCValue &Target,
                                   const MCFixup &Fixup,
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 461ea9b..65c5552 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -50,10 +50,10 @@ def FeatureSSE3    : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
 def FeatureSSSE3   : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
                                       "Enable SSSE3 instructions",
                                       [FeatureSSE3]>;
-def FeatureSSE41   : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
+def FeatureSSE41   : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
                                       "Enable SSE 4.1 instructions",
                                       [FeatureSSSE3]>;
-def FeatureSSE42   : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
+def FeatureSSE42   : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
                                       "Enable SSE 4.2 instructions",
                                       [FeatureSSE41]>;
 def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
@@ -68,7 +68,7 @@ def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
 def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",
                                       "Support 64-bit instructions",
                                       [FeatureCMOV]>;
-def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true",
+def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
                                       "64-bit with cmpxchg16b",
                                       [Feature64Bit]>;
 def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
@@ -86,16 +86,16 @@ def FeatureAVX     : SubtargetFeature<"avx", "X86SSELevel", "AVX",
 def FeatureAVX2    : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
                                       "Enable AVX2 instructions",
                                       [FeatureAVX]>;
-def FeatureAVX512   : SubtargetFeature<"avx-512", "X86SSELevel", "AVX512",
+def FeatureAVX512   : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
                                       "Enable AVX-512 instructions",
                                       [FeatureAVX2]>;
-def FeatureERI      : SubtargetFeature<"avx-512-eri", "HasERI", "true",
+def FeatureERI      : SubtargetFeature<"avx512er", "HasERI", "true",
                       "Enable AVX-512 Exponential and Reciprocal Instructions",
                                       [FeatureAVX512]>;
-def FeatureCDI      : SubtargetFeature<"avx-512-cdi", "HasCDI", "true",
+def FeatureCDI      : SubtargetFeature<"avx512cd", "HasCDI", "true",
                       "Enable AVX-512 Conflict Detection Instructions",
                                       [FeatureAVX512]>;
-def FeaturePFI      : SubtargetFeature<"avx-512-pfi", "HasPFI", "true",
+def FeaturePFI      : SubtargetFeature<"avx512pf", "HasPFI", "true",
                       "Enable AVX-512 PreFetch Instructions",
                                       [FeatureAVX512]>;
 
@@ -117,12 +117,15 @@ def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
 def FeatureAES     : SubtargetFeature<"aes", "HasAES", "true",
                                       "Enable AES instructions",
                                       [FeatureSSE2]>;
+def FeatureTBM     : SubtargetFeature<"tbm", "HasTBM", "true",
+                                      "Enable TBM instructions">;
 def FeatureMOVBE   : SubtargetFeature<"movbe", "HasMOVBE", "true",
                                       "Support MOVBE instruction">;
-def FeatureRDRAND  : SubtargetFeature<"rdrand", "HasRDRAND", "true",
+def FeatureRDRAND  : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
                                       "Support RDRAND instruction">;
 def FeatureF16C    : SubtargetFeature<"f16c", "HasF16C", "true",
-                       "Support 16-bit floating point conversion instructions">;
+                       "Support 16-bit floating point conversion instructions",
+                       [FeatureAVX]>;
 def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
                                        "Support FS/GS Base instructions">;
 def FeatureLZCNT   : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
@@ -137,6 +140,9 @@ def FeatureHLE     : SubtargetFeature<"hle", "HasHLE", "true",
                                       "Support HLE">;
 def FeatureADX     : SubtargetFeature<"adx", "HasADX", "true",
                                       "Support ADX instructions">;
+def FeatureSHA     : SubtargetFeature<"sha", "HasSHA", "true",
+                                      "Enable SHA instructions",
+                                      [FeatureSSE2]>;
 def FeaturePRFCHW  : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
                                       "Support PRFCHW instructions">;
 def FeatureRDSEED  : SubtargetFeature<"rdseed", "HasRDSEED", "true",
@@ -163,6 +169,8 @@ include "X86Schedule.td"
 
 def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
                     "Intel Atom processors">;
+def ProcIntelSLM  : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
+                    "Intel Silvermont processors">;
 
 class Proc<string Name, list<SubtargetFeature> Features>
  : ProcessorModel<Name, GenericModel, Features>;
@@ -206,6 +214,14 @@ def : ProcessorModel<"atom", AtomModel,
                       FeatureLEAUsesAG,
                       FeaturePadShortFunctions]>;
 
+// Atom Silvermont.
+def : ProcessorModel<"slm",  SLMModel, [ProcIntelSLM,
+                               FeatureSSE42, FeatureCMPXCHG16B,
+                               FeatureMOVBE, FeaturePOPCNT,
+                               FeaturePCLMUL, FeatureAES,
+                               FeatureCallRegIndirect,
+                               FeaturePRFCHW,
+                               FeatureSlowBTMem]>;
 // "Arrandale" along with corei3 and corei5
 def : ProcessorModel<"corei7", SandyBridgeModel,
                      [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
@@ -276,21 +292,30 @@ def : Proc<"amdfam10",        [FeatureSSE4A,
                                FeaturePOPCNT, FeatureSlowBTMem]>;
 // Bobcat
 def : Proc<"btver1",          [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
-                               FeatureLZCNT, FeaturePOPCNT]>;
+                               FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT]>;
 // Jaguar
 def : Proc<"btver2",          [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
-                               FeatureAES, FeaturePCLMUL, FeatureBMI,
-                               FeatureF16C, FeatureMOVBE, FeatureLZCNT,
-                               FeaturePOPCNT]>;
+                               FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
+                               FeatureBMI, FeatureF16C, FeatureMOVBE,
+                               FeatureLZCNT, FeaturePOPCNT]>;
 // Bulldozer
 def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
-                               FeatureAES, FeaturePCLMUL,
+                               FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
                                FeatureLZCNT, FeaturePOPCNT]>;
 // Piledriver
 def : Proc<"bdver2",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
-                               FeatureAES, FeaturePCLMUL,
+                               FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
                                FeatureF16C, FeatureLZCNT,
-                               FeaturePOPCNT, FeatureBMI, FeatureFMA]>;
+                               FeaturePOPCNT, FeatureBMI,  FeatureTBM,
+                               FeatureFMA]>;
+
+// Steamroller
+def : Proc<"bdver3",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+                               FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
+                               FeatureF16C, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureBMI,  FeatureTBM,
+                               FeatureFMA, FeatureFSGSBase]>;
+
 def : Proc<"geode",           [Feature3DNowA]>;
 
 def : Proc<"winchip-c6",      [FeatureMMX]>;
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 9e0ab82..1258441 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -96,7 +96,7 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
              MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
       GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
     else
-      GVSym = Mang->getSymbol(GV);
+      GVSym = getSymbol(GV);
 
     // Handle dllimport linkage.
     if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
@@ -109,21 +109,21 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
         MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
       if (StubSym.getPointer() == 0)
         StubSym = MachineModuleInfoImpl::
-          StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+          StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
     } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
       MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
       MachineModuleInfoImpl::StubValueTy &StubSym =
         MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym);
       if (StubSym.getPointer() == 0)
         StubSym = MachineModuleInfoImpl::
-          StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+          StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
     } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
       MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub");
       MachineModuleInfoImpl::StubValueTy &StubSym =
         MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
       if (StubSym.getPointer() == 0)
         StubSym = MachineModuleInfoImpl::
-          StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+          StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
     }
 
     // If the name begins with a dollar-sign, enclose it in parens.  We do this
@@ -333,21 +333,21 @@ void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
   const MachineOperand &IndexReg = MI->getOperand(Op+2);
   const MachineOperand &DispSpec = MI->getOperand(Op+3);
   const MachineOperand &SegReg   = MI->getOperand(Op+4);
-  
+
   // If this has a segment register, print it.
   if (SegReg.getReg()) {
     printOperand(MI, Op+4, O, Modifier, AsmVariant);
     O << ':';
   }
-  
+
   O << '[';
-  
+
   bool NeedPlus = false;
   if (BaseReg.getReg()) {
     printOperand(MI, Op, O, Modifier, AsmVariant);
     NeedPlus = true;
   }
-  
+
   if (IndexReg.getReg()) {
     if (NeedPlus) O << " + ";
     if (ScaleVal != 1)
@@ -394,6 +394,7 @@ bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
     Reg = getX86SubSuperRegister(Reg, MVT::i32);
     break;
   case 'q': // Print DImode register
+    // FIXME: gcc will actually print e instead of r for 32-bit.
     Reg = getX86SubSuperRegister(Reg, MVT::i64);
     break;
   }
@@ -518,6 +519,27 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
   if (Subtarget->isTargetEnvMacho())
     OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+
+  if (Subtarget->isTargetCOFF()) {
+    // Emit an absolute @feat.00 symbol.  This appears to be some kind of
+    // compiler features bitfield read by link.exe.
+    if (!Subtarget->is64Bit()) {
+      MCSymbol *S = MMI->getContext().GetOrCreateSymbol(StringRef("@feat.00"));
+      OutStreamer.BeginCOFFSymbolDef(S);
+      OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+      OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+      OutStreamer.EndCOFFSymbolDef();
+      // According to the PE-COFF spec, the LSB of this value marks the object
+      // for "registered SEH".  This means that all SEH handler entry points
+      // must be registered in .sxdata.  Use of any unregistered handlers will
+      // cause the process to terminate immediately.  LLVM does not know how to
+      // register any SEH handlers, so its object files should be safe.
+      S->setAbsolute();
+      OutStreamer.EmitSymbolAttribute(S, MCSA_Global);
+      OutStreamer.EmitAssignment(
+          S, MCConstantExpr::Create(int64_t(1), MMI->getContext()));
+    }
+  }
 }
 
 
@@ -606,6 +628,8 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
       OutStreamer.AddBlankLine();
     }
 
+    SM.serializeToStackMapSection();
+
     // Funny Darwin hack: This flag tells the linker that no global symbols
     // contain code that falls through to other global symbols (e.g. the obvious
     // implementation of multiple entry points).  If this doesn't occur, the
@@ -645,12 +669,12 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
 
     for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
       if (I->hasDLLExportLinkage())
-        DLLExportedFns.push_back(Mang->getSymbol(I));
+        DLLExportedFns.push_back(getSymbol(I));
 
     for (Module::const_global_iterator I = M.global_begin(),
            E = M.global_end(); I != E; ++I)
       if (I->hasDLLExportLinkage())
-        DLLExportedGlobals.push_back(Mang->getSymbol(I));
+        DLLExportedGlobals.push_back(getSymbol(I));
 
     // Output linker support code for dllexported globals on windows.
     if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 6eed5ce..24a768b 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -16,6 +16,7 @@
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
@@ -24,9 +25,21 @@ class MCStreamer;
 
 class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
   const X86Subtarget *Subtarget;
+  StackMaps SM;
+
+  // Parses operands of PATCHPOINT and STACKMAP to produce stack map Location
+  // structures. Returns a result location and an iterator to the operand
+  // immediately following the operands consumed.
+  //
+  // This method is implemented in X86MCInstLower.cpp.
+  static std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
+    stackmapOperandParser(MachineInstr::const_mop_iterator MOI,
+                          MachineInstr::const_mop_iterator MOE,
+                          const TargetMachine &TM);
+
  public:
   explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer) {
+    : AsmPrinter(TM, Streamer), SM(*this, stackmapOperandParser) {
     Subtarget = &TM.getSubtarget<X86Subtarget>();
   }
 
diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
new file mode 100644
index 0000000..e76f9fd
--- /dev/null
+++ b/lib/Target/X86/X86CallingConv.h
@@ -0,0 +1,35 @@
+//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the X86 Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86CALLINGCONV_H
+#define X86CALLINGCONV_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
+                                CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+                                CCState &) {
+  llvm_unreachable("The AnyReg calling convention is only supported by the " \
+                   "stackmap and patchpoint intrinsics.");
+  // gracefully fallback to X86 C calling convention on Release builds.
+  return false;
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 38e2591..a78b5c0 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -151,6 +151,26 @@ def RetCC_X86_64_HiPE : CallingConv<[
   CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
 ]>;
 
+// X86-64 WebKit_JS return-value convention.
+def RetCC_X86_64_WebKit_JS : CallingConv<[
+  // Promote all types to i64
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Return: RAX
+  CCIfType<[i64], CCAssignToReg<[RAX]>>
+]>;
+
+// X86-64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def RetCC_X86_64_AnyReg : CallingConv<[
+  CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
 // This is the root return-value convention for the X86-32 backend.
 def RetCC_X86_32 : CallingConv<[
   // If FastCC, use RetCC_X86_32_Fast.
@@ -167,6 +187,10 @@ def RetCC_X86_64 : CallingConv<[
   // HiPE uses RetCC_X86_64_HiPE
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
 
+  // Handle JavaScript calls.
+  CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>,
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>,
+
   // Handle explicit CC selection
   CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
   CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
@@ -279,10 +303,10 @@ def CC_X86_Win64_C : CallingConv<[
   // The first 4 integer arguments are passed in integer registers.
   CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
                                           [XMM0, XMM1, XMM2, XMM3]>>,
-  
+
   // Do not pass the sret argument in RCX, the Win64 thiscall calling
-  // convention requires "this" to be passed in RCX.                                        
-  CCIfCC<"CallingConv::X86_ThisCall", 
+  // convention requires "this" to be passed in RCX.
+  CCIfCC<"CallingConv::X86_ThisCall",
     CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8  , R9  ],
                                                      [XMM1, XMM2, XMM3]>>>>,
 
@@ -329,6 +353,25 @@ def CC_X86_64_HiPE : CallingConv<[
   CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
 ]>;
 
+def CC_X86_64_WebKit_JS : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Integer/FP values are always stored in stack slots that are 8 bytes in size
+  // and 8-byte aligned.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def CC_X86_64_AnyReg : CallingConv<[
+  CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
 //===----------------------------------------------------------------------===//
 // X86 C Calling Convention
 //===----------------------------------------------------------------------===//
@@ -354,7 +397,7 @@ def CC_X86_32_Common : CallingConv<[
   // Integer/Float values get stored in stack slots that are 4 bytes in
   // size and 4-byte aligned.
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
-  
+
   // Doubles get 8-byte slots that are 4-byte aligned.
   CCIfType<[f64], CCAssignToStack<8, 4>>,
 
@@ -520,6 +563,8 @@ def CC_X86_32 : CallingConv<[
 def CC_X86_64 : CallingConv<[
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+  CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>,
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
   CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
   CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
 
@@ -558,11 +603,11 @@ def CSR_MostRegs_64 : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
 
 // Standard C + YMM6-15
 def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
-                                                  R13, R14, R15, 
+                                                  R13, R14, R15,
                                                   (sequence "YMM%u", 6, 15))>;
 
 def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
-                                                     R12, R13, R14, R15, 
+                                                     R12, R13, R14, R15,
                                                      (sequence "ZMM%u", 6, 21),
                                                      K4, K5, K6, K7)>;
 //Standard C + XMM 8-15
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 5d72b44..14385ed 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -840,7 +840,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
   unsigned char VEX_W = 0;
 
   // XOP: Use XOP prefix byte 0x8f instead of VEX.
-  unsigned char XOP = 0;
+  bool XOP = false;
 
   // VEX_5M (VEX m-mmmmm field):
   //
@@ -850,7 +850,8 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
   //  0b00011: implied 0F 3A leading opcode bytes
   //  0b00100-0b11111: Reserved for future use
   //  0b01000: XOP map select - 08h instructions with imm byte
-  //  0b10001: XOP map select - 09h instructions with no imm byte
+  //  0b01001: XOP map select - 09h instructions with no imm byte
+  //  0b01010: XOP map select - 0Ah instructions with imm dword
   unsigned char VEX_5M = 0x1;
 
   // VEX_4V (VEX vvvv field): a register specifier
@@ -882,7 +883,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
     VEX_W = 1;
 
   if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
-    XOP = 1;
+    XOP = true;
 
   if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
     VEX_L = 1;
@@ -919,11 +920,11 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
     case X86II::XOP9:
       VEX_5M = 0x9;
       break;
-    case X86II::A6:  // Bypass: Not used by VEX
-    case X86II::A7:  // Bypass: Not used by VEX
-    case X86II::TB:  // Bypass: Not used by VEX
-    case 0:
-      break;  // No prefix!
+    case X86II::XOPA:
+      VEX_5M = 0xA;
+      break;
+    case X86II::TB: // VEX_5M/VEX_PP already correct
+      break;
   }
 
 
@@ -982,11 +983,14 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
       //  FMA4:
       //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
       //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
         VEX_R = 0x0;
+      CurOp++;
 
-      if (HasVEX_4V)
-        VEX_4V = getVEXRegisterEncoding(MI, 1);
+      if (HasVEX_4V) {
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+        CurOp++;
+      }
 
       if (X86II::isX86_64ExtendedReg(
                           MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -996,7 +1000,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
         VEX_X = 0x0;
 
       if (HasVEX_4VOp3)
-        VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
       break;
     case X86II::MRM0m: case X86II::MRM1m:
     case X86II::MRM2m: case X86II::MRM3m:
@@ -1006,7 +1010,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
       //  MemAddr
       //  src1(VEX_4V), MemAddr
       if (HasVEX_4V)
-        VEX_4V = getVEXRegisterEncoding(MI, 0);
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
 
       if (X86II::isX86_64ExtendedReg(
                           MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -1059,8 +1063,10 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
     case X86II::MRM6r: case X86II::MRM7r:
       // MRM0r-MRM7r instructions forms:
       //  dst(VEX_4V), src(ModR/M), imm8
-      VEX_4V = getVEXRegisterEncoding(MI, 0);
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      CurOp++;
+
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
         VEX_B = 0x0;
       break;
     default: // RawFrm
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 5bc3420..97f96ab 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86.h"
+#include "X86CallingConv.h"
 #include "X86ISelLowering.h"
 #include "X86InstrBuilder.h"
 #include "X86RegisterInfo.h"
@@ -125,6 +126,8 @@ private:
     return static_cast<const X86TargetMachine *>(&TM);
   }
 
+  bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
+
   unsigned TargetMaterializeConstant(const Constant *C);
 
   unsigned TargetMaterializeAlloca(const AllocaInst *C);
@@ -344,9 +347,126 @@ bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
   return true;
 }
 
+bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // Can't handle TLS yet.
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->isThreadLocal())
+        return false;
+
+    // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
+    // it works...).
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      if (const GlobalVariable *GVar =
+            dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
+        if (GVar->isThreadLocal())
+          return false;
+
+    // RIP-relative addresses can't have additional register operands, so if
+    // we've already folded stuff into the addressing mode, just force the
+    // global value into its own register, which we can use as the basereg.
+    if (!Subtarget->isPICStyleRIPRel() ||
+        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+      // Okay, we've committed to selecting this global. Set up the address.
+      AM.GV = GV;
+
+      // Allow the subtarget to classify the global.
+      unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+      // If this reference is relative to the pic base, set it now.
+      if (isGlobalRelativeToPICBase(GVFlags)) {
+        // FIXME: How do we know Base.Reg is free??
+        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+      }
+
+      // Unless the ABI requires an extra load, return a direct reference to
+      // the global.
+      if (!isGlobalStubReference(GVFlags)) {
+        if (Subtarget->isPICStyleRIPRel()) {
+          // Use rip-relative addressing if we can.  Above we verified that the
+          // base and index registers are unused.
+          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+          AM.Base.Reg = X86::RIP;
+        }
+        AM.GVOpFlags = GVFlags;
+        return true;
+      }
+
+      // Ok, we need to do a load from a stub.  If we've already loaded from
+      // this stub, reuse the loaded pointer, otherwise emit the load now.
+      DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+      unsigned LoadReg;
+      if (I != LocalValueMap.end() && I->second != 0) {
+        LoadReg = I->second;
+      } else {
+        // Issue load from stub.
+        unsigned Opc = 0;
+        const TargetRegisterClass *RC = NULL;
+        X86AddressMode StubAM;
+        StubAM.Base.Reg = AM.Base.Reg;
+        StubAM.GV = GV;
+        StubAM.GVOpFlags = GVFlags;
+
+        // Prepare for inserting code in the local-value area.
+        SavePoint SaveInsertPt = enterLocalValueArea();
+
+        if (TLI.getPointerTy() == MVT::i64) {
+          Opc = X86::MOV64rm;
+          RC  = &X86::GR64RegClass;
+
+          if (Subtarget->isPICStyleRIPRel())
+            StubAM.Base.Reg = X86::RIP;
+        } else {
+          Opc = X86::MOV32rm;
+          RC  = &X86::GR32RegClass;
+        }
+
+        LoadReg = createResultReg(RC);
+        MachineInstrBuilder LoadMI =
+          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
+        addFullAddress(LoadMI, StubAM);
+
+        // Ok, back to normal mode.
+        leaveLocalValueArea(SaveInsertPt);
+
+        // Prevent loading GV stub multiple times in same MBB.
+        LocalValueMap[V] = LoadReg;
+      }
+
+      // Now construct the final address. Note that the Disp, Scale,
+      // and Index values may already be set here.
+      AM.Base.Reg = LoadReg;
+      AM.GV = 0;
+      return true;
+    }
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+    if (AM.Base.Reg == 0) {
+      AM.Base.Reg = getRegForValue(V);
+      return AM.Base.Reg != 0;
+    }
+    if (AM.IndexReg == 0) {
+      assert(AM.Scale == 1 && "Scale with no index!");
+      AM.IndexReg = getRegForValue(V);
+      return AM.IndexReg != 0;
+    }
+  }
+
+  return false;
+}
+
 /// X86SelectAddress - Attempt to fill in an address from the given value.
 ///
 bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+  SmallVector<const Value *, 32> GEPs;
+redo_gep:
   const User *U = NULL;
   unsigned Opcode = Instruction::UserOp1;
   if (const Instruction *I = dyn_cast<Instruction>(V)) {
@@ -441,13 +561,8 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
           Disp += CI->getSExtValue() * S;
           break;
         }
-        if (isa<AddOperator>(Op) &&
-            (!isa<Instruction>(Op) ||
-             FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
-               == FuncInfo.MBB) &&
-            isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
-          // An add (in the same block) with a constant operand. Fold the
-          // constant.
+        if (canFoldAddIntoGEP(U, Op)) {
+          // A compatible add with a constant operand. Fold the constant.
           ConstantInt *CI =
             cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
           Disp += CI->getSExtValue() * S;
@@ -469,139 +584,43 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
         goto unsupported_gep;
       }
     }
+
     // Check for displacement overflow.
     if (!isInt<32>(Disp))
       break;
-    // Ok, the GEP indices were covered by constant-offset and scaled-index
-    // addressing. Update the address state and move on to examining the base.
+
     AM.IndexReg = IndexReg;
     AM.Scale = Scale;
     AM.Disp = (uint32_t)Disp;
-    if (X86SelectAddress(U->getOperand(0), AM))
+    GEPs.push_back(V);
+
+    if (const GetElementPtrInst *GEP =
+          dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
+      // Ok, the GEP indices were covered by constant-offset and scaled-index
+      // addressing. Update the address state and move on to examining the base.
+      V = GEP;
+      goto redo_gep;
+    } else if (X86SelectAddress(U->getOperand(0), AM)) {
       return true;
+    }
 
     // If we couldn't merge the gep value into this addr mode, revert back to
     // our address and just match the value instead of completely failing.
     AM = SavedAM;
-    break;
-  unsupported_gep:
-    // Ok, the GEP indices weren't all covered.
-    break;
-  }
-  }
-
-  // Handle constant address.
-  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
-    // Can't handle alternate code models yet.
-    if (TM.getCodeModel() != CodeModel::Small)
-      return false;
 
-    // Can't handle TLS yet.
-    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
-      if (GVar->isThreadLocal())
-        return false;
-
-    // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
-    // it works...).
-    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-      if (const GlobalVariable *GVar =
-            dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
-        if (GVar->isThreadLocal())
-          return false;
-
-    // RIP-relative addresses can't have additional register operands, so if
-    // we've already folded stuff into the addressing mode, just force the
-    // global value into its own register, which we can use as the basereg.
-    if (!Subtarget->isPICStyleRIPRel() ||
-        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
-      // Okay, we've committed to selecting this global. Set up the address.
-      AM.GV = GV;
-
-      // Allow the subtarget to classify the global.
-      unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
-
-      // If this reference is relative to the pic base, set it now.
-      if (isGlobalRelativeToPICBase(GVFlags)) {
-        // FIXME: How do we know Base.Reg is free??
-        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
-      }
-
-      // Unless the ABI requires an extra load, return a direct reference to
-      // the global.
-      if (!isGlobalStubReference(GVFlags)) {
-        if (Subtarget->isPICStyleRIPRel()) {
-          // Use rip-relative addressing if we can.  Above we verified that the
-          // base and index registers are unused.
-          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
-          AM.Base.Reg = X86::RIP;
-        }
-        AM.GVOpFlags = GVFlags;
+    for (SmallVectorImpl<const Value *>::reverse_iterator
+           I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
+      if (handleConstantAddresses(*I, AM))
         return true;
-      }
-
-      // Ok, we need to do a load from a stub.  If we've already loaded from
-      // this stub, reuse the loaded pointer, otherwise emit the load now.
-      DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
-      unsigned LoadReg;
-      if (I != LocalValueMap.end() && I->second != 0) {
-        LoadReg = I->second;
-      } else {
-        // Issue load from stub.
-        unsigned Opc = 0;
-        const TargetRegisterClass *RC = NULL;
-        X86AddressMode StubAM;
-        StubAM.Base.Reg = AM.Base.Reg;
-        StubAM.GV = GV;
-        StubAM.GVOpFlags = GVFlags;
-
-        // Prepare for inserting code in the local-value area.
-        SavePoint SaveInsertPt = enterLocalValueArea();
-
-        if (TLI.getPointerTy() == MVT::i64) {
-          Opc = X86::MOV64rm;
-          RC  = &X86::GR64RegClass;
 
-          if (Subtarget->isPICStyleRIPRel())
-            StubAM.Base.Reg = X86::RIP;
-        } else {
-          Opc = X86::MOV32rm;
-          RC  = &X86::GR32RegClass;
-        }
-
-        LoadReg = createResultReg(RC);
-        MachineInstrBuilder LoadMI =
-          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
-        addFullAddress(LoadMI, StubAM);
-
-        // Ok, back to normal mode.
-        leaveLocalValueArea(SaveInsertPt);
-
-        // Prevent loading GV stub multiple times in same MBB.
-        LocalValueMap[V] = LoadReg;
-      }
-
-      // Now construct the final address. Note that the Disp, Scale,
-      // and Index values may already be set here.
-      AM.Base.Reg = LoadReg;
-      AM.GV = 0;
-      return true;
-    }
+    return false;
+  unsupported_gep:
+    // Ok, the GEP indices weren't all covered.
+    break;
   }
-
-  // If all else fails, try to materialize the value in a register.
-  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
-    if (AM.Base.Reg == 0) {
-      AM.Base.Reg = getRegForValue(V);
-      return AM.Base.Reg != 0;
-    }
-    if (AM.IndexReg == 0) {
-      assert(AM.Scale == 1 && "Scale with no index!");
-      AM.IndexReg = getRegForValue(V);
-      return AM.IndexReg != 0;
-    }
   }
 
-  return false;
+  return handleConstantAddresses(V, AM);
 }
 
 /// X86SelectCallAddress - Attempt to fill in an address from the given value.
@@ -609,9 +628,35 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
 bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
   const User *U = NULL;
   unsigned Opcode = Instruction::UserOp1;
-  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+  const Instruction *I = dyn_cast<Instruction>(V);
+  // Record if the value is defined in the same basic block.
+  //
+  // This information is crucial to know whether or not folding an
+  // operand is valid.
+  // Indeed, FastISel generates or reuses a virtual register for all
+  // operands of all instructions it selects. Obviously, the definition and
+  // its uses must use the same virtual register otherwise the produced
+  // code is incorrect.
+  // Before instruction selection, FunctionLoweringInfo::set sets the virtual
+  // registers for values that are alive across basic blocks. This ensures
+  // that the values are consistently set between across basic block, even
+  // if different instruction selection mechanisms are used (e.g., a mix of
+  // SDISel and FastISel).
+  // For values local to a basic block, the instruction selection process
+  // generates these virtual registers with whatever method is appropriate
+  // for its needs. In particular, FastISel and SDISel do not share the way
+  // local virtual registers are set.
+  // Therefore, this is impossible (or at least unsafe) to share values
+  // between basic blocks unless they use the same instruction selection
+  // method, which is not guarantee for X86.
+  // Moreover, things like hasOneUse could not be used accurately, if we
+  // allow to reference values across basic blocks whereas they are not
+  // alive across basic blocks initially.
+  bool InMBB = true;
+  if (I) {
     Opcode = I->getOpcode();
     U = I;
+    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
     Opcode = C->getOpcode();
     U = C;
@@ -620,18 +665,22 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
   switch (Opcode) {
   default: break;
   case Instruction::BitCast:
-    // Look past bitcasts.
-    return X86SelectCallAddress(U->getOperand(0), AM);
+    // Look past bitcasts if its operand is in the same BB.
+    if (InMBB)
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
 
   case Instruction::IntToPtr:
-    // Look past no-op inttoptrs.
-    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+    // Look past no-op inttoptrs if its operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
       return X86SelectCallAddress(U->getOperand(0), AM);
     break;
 
   case Instruction::PtrToInt:
-    // Look past no-op ptrtoints.
-    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+    // Look past no-op ptrtoints if its operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(U->getType()) == TLI.getPointerTy())
       return X86SelectCallAddress(U->getOperand(0), AM);
     break;
   }
@@ -1026,7 +1075,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
     return false;
 
   // Handle zero-extension from i1 to i8, which is common.
-  MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()).getSimpleVT();
+  MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType());
   if (SrcVT.SimpleTy == MVT::i1) {
     // Set the high bits to zero.
     ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index d21cb8a..38a8351 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -125,6 +125,15 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
       // which requires isImm() to be true
       return 0;
     }
+    break;
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+    if (MI->getOperand(1).getReg() != MI->getOperand(2).getReg()) {
+      // if src1 != src2, then convertToThreeAddress will
+      // need to create a Virtual register, which we cannot do
+      // after register allocation.
+      return 0;
+    }
   }
   return TII->convertToThreeAddress(MFI, MBBI, 0);
 }
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index b994e67..a06ba9d 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -365,270 +365,13 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
   }
 }
 
-/// getCompactUnwindRegNum - Get the compact unwind number for a given
-/// register. The number corresponds to the enum lists in
-/// compact_unwind_encoding.h.
-static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) {
-  static const uint16_t CU32BitRegs[] = {
-    X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
-  };
-  static const uint16_t CU64BitRegs[] = {
-    X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
-  };
-  const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs;
-  for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
-    if (*CURegs == Reg)
-      return Idx;
-
-  return -1;
-}
-
-// Number of registers that can be saved in a compact unwind encoding.
-#define CU_NUM_SAVED_REGS 6
-
-/// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding
-/// used with frameless stacks. It is passed the number of registers to be saved
-/// and an array of the registers saved.
-static uint32_t
-encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
-                                         unsigned RegCount, bool Is64Bit) {
-  // The saved registers are numbered from 1 to 6. In order to encode the order
-  // in which they were saved, we re-number them according to their place in the
-  // register order. The re-numbering is relative to the last re-numbered
-  // register. E.g., if we have registers {6, 2, 4, 5} saved in that order:
-  //
-  //    Orig  Re-Num
-  //    ----  ------
-  //     6       6
-  //     2       2
-  //     4       3
-  //     5       3
-  //
-  for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
-    int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit);
-    if (CUReg == -1) return ~0U;
-    SavedRegs[i] = CUReg;
-  }
-
-  // Reverse the list.
-  std::swap(SavedRegs[0], SavedRegs[5]);
-  std::swap(SavedRegs[1], SavedRegs[4]);
-  std::swap(SavedRegs[2], SavedRegs[3]);
-
-  uint32_t RenumRegs[CU_NUM_SAVED_REGS];
-  for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i) {
-    unsigned Countless = 0;
-    for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
-      if (SavedRegs[j] < SavedRegs[i])
-        ++Countless;
-
-    RenumRegs[i] = SavedRegs[i] - Countless - 1;
-  }
-
-  // Take the renumbered values and encode them into a 10-bit number.
-  uint32_t permutationEncoding = 0;
-  switch (RegCount) {
-  case 6:
-    permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
-                           + 6 * RenumRegs[2] +  2 * RenumRegs[3]
-                           +     RenumRegs[4];
-    break;
-  case 5:
-    permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
-                           + 6 * RenumRegs[3] +  2 * RenumRegs[4]
-                           +     RenumRegs[5];
-    break;
-  case 4:
-    permutationEncoding |=  60 * RenumRegs[2] + 12 * RenumRegs[3]
-                           + 3 * RenumRegs[4] +      RenumRegs[5];
-    break;
-  case 3:
-    permutationEncoding |=  20 * RenumRegs[3] +  4 * RenumRegs[4]
-                           +     RenumRegs[5];
-    break;
-  case 2:
-    permutationEncoding |=   5 * RenumRegs[4] +      RenumRegs[5];
-    break;
-  case 1:
-    permutationEncoding |=       RenumRegs[5];
-    break;
-  }
-
-  assert((permutationEncoding & 0x3FF) == permutationEncoding &&
-         "Invalid compact register encoding!");
-  return permutationEncoding;
-}
-
-/// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a
-/// compact encoding with a frame pointer.
-static uint32_t
-encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
-                                      bool Is64Bit) {
-  // Encode the registers in the order they were saved, 3-bits per register. The
-  // registers are numbered from 1 to CU_NUM_SAVED_REGS.
-  uint32_t RegEnc = 0;
-  for (int I = CU_NUM_SAVED_REGS - 1, Idx = 0; I != -1; --I) {
-    unsigned Reg = SavedRegs[I];
-    if (Reg == 0) continue;
-
-    int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit);
-    if (CURegNum == -1) return ~0U;
-
-    // Encode the 3-bit register number in order, skipping over 3-bits for each
-    // register.
-    RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
-  }
-
-  assert((RegEnc & 0x3FFFF) == RegEnc && "Invalid compact register encoding!");
-  return RegEnc;
-}
-
-uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
-  const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
-  unsigned FramePtr = RegInfo->getFrameRegister(MF);
-  unsigned StackPtr = RegInfo->getStackRegister();
-
-  bool Is64Bit = STI.is64Bit();
-  bool HasFP = hasFP(MF);
-
-  unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 0, 0, 0, 0, 0, 0 };
-  unsigned SavedRegIdx = 0;
-
-  unsigned OffsetSize = (Is64Bit ? 8 : 4);
-
-  unsigned PushInstr = (Is64Bit ? X86::PUSH64r : X86::PUSH32r);
-  unsigned PushInstrSize = 1;
-  unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
-  unsigned MoveInstrSize = (Is64Bit ? 3 : 2);
-  unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2);
-
-  unsigned StackDivide = (Is64Bit ? 8 : 4);
-
-  unsigned InstrOffset = 0;
-  unsigned StackAdjust = 0;
-  unsigned StackSize = 0;
-
-  MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB.
-  bool ExpectEnd = false;
-  for (MachineBasicBlock::iterator
-         MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ++MBBI) {
-    MachineInstr &MI = *MBBI;
-    unsigned Opc = MI.getOpcode();
-    if (Opc == X86::PROLOG_LABEL) continue;
-    if (!MI.getFlag(MachineInstr::FrameSetup)) break;
-
-    // We don't exect any more prolog instructions.
-    if (ExpectEnd) return CU::UNWIND_MODE_DWARF;
-
-    if (Opc == PushInstr) {
-      // If there are too many saved registers, we cannot use compact encoding.
-      if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF;
-
-      unsigned Reg = MI.getOperand(0).getReg();
-      if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) {
-        ExpectEnd = true;
-        continue;
-      }
-
-      SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg();
-      StackAdjust += OffsetSize;
-      InstrOffset += PushInstrSize;
-    } else if (Opc == MoveInstr) {
-      unsigned SrcReg = MI.getOperand(1).getReg();
-      unsigned DstReg = MI.getOperand(0).getReg();
-
-      if (DstReg != FramePtr || SrcReg != StackPtr)
-        return CU::UNWIND_MODE_DWARF;
-
-      StackAdjust = 0;
-      memset(SavedRegs, 0, sizeof(SavedRegs));
-      SavedRegIdx = 0;
-      InstrOffset += MoveInstrSize;
-    } else if (Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
-               Opc == X86::SUB32ri || Opc == X86::SUB32ri8) {
-      if (StackSize)
-        // We already have a stack size.
-        return CU::UNWIND_MODE_DWARF;
-
-      if (!MI.getOperand(0).isReg() ||
-          MI.getOperand(0).getReg() != MI.getOperand(1).getReg() ||
-          MI.getOperand(0).getReg() != StackPtr || !MI.getOperand(2).isImm())
-        // We need this to be a stack adjustment pointer. Something like:
-        //
-        //   %RSP<def> = SUB64ri8 %RSP, 48
-        return CU::UNWIND_MODE_DWARF;
-
-      StackSize = MI.getOperand(2).getImm() / StackDivide;
-      SubtractInstrIdx += InstrOffset;
-      ExpectEnd = true;
-    }
-  }
-
-  // Encode that we are using EBP/RBP as the frame pointer.
-  uint32_t CompactUnwindEncoding = 0;
-  StackAdjust /= StackDivide;
-  if (HasFP) {
-    if ((StackAdjust & 0xFF) != StackAdjust)
-      // Offset was too big for compact encoding.
-      return CU::UNWIND_MODE_DWARF;
-
-    // Get the encoding of the saved registers when we have a frame pointer.
-    uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit);
-    if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
-
-    CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
-    CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
-    CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
-  } else {
-    ++StackAdjust;
-    uint32_t TotalStackSize = StackAdjust + StackSize;
-    if ((TotalStackSize & 0xFF) == TotalStackSize) {
-      // Frameless stack with a small stack size.
-      CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
-
-      // Encode the stack size.
-      CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16;
-    } else {
-      if ((StackAdjust & 0x7) != StackAdjust)
-        // The extra stack adjustments are too big for us to handle.
-        return CU::UNWIND_MODE_DWARF;
-
-      // Frameless stack with an offset too large for us to encode compactly.
-      CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
-
-      // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
-      // instruction.
-      CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
-
-      // Encode any extra stack stack adjustments (done via push instructions).
-      CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
-    }
-
-    // Encode the number of registers saved.
-    CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
-
-    // Get the encoding of the saved registers when we don't have a frame
-    // pointer.
-    uint32_t RegEnc =
-      encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx,
-                                               Is64Bit);
-    if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
-
-    // Encode the register encoding.
-    CompactUnwindEncoding |=
-      RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
-  }
-
-  return CompactUnwindEncoding;
-}
-
 /// usesTheStack - This function checks if any of the users of EFLAGS
 /// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
 /// to use the stack, and if we don't adjust the stack we clobber the first
 /// frame index.
 /// See X86InstrInfo::copyPhysReg.
-static bool usesTheStack(MachineFunction &MF) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
+static bool usesTheStack(const MachineFunction &MF) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
 
   for (MachineRegisterInfo::reg_iterator ri = MRI.reg_begin(X86::EFLAGS),
        re = MRI.reg_end(); ri != re; ++ri)
@@ -863,7 +606,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   // responsible for adjusting the stack pointer.  Touching the stack at 4K
   // increments is necessary to ensure that the guard pages used by the OS
   // virtual memory manager are allocated in correct sequence.
-  if (NumBytes >= 4096 && STI.isTargetCOFF() && !STI.isTargetEnvMacho()) {
+  if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetEnvMacho()) {
     const char *StackProbeSymbol;
     bool isSPUpdateNeeded = false;
 
@@ -964,11 +707,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
     if (PushedRegs)
       emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr);
   }
-
-  // Darwin 10.7 and greater has support for compact unwind encoding.
-  if (STI.getTargetTriple().isMacOSX() &&
-      !STI.getTargetTriple().isMacOSXVersionLT(10, 7))
-    MMI.setCompactUnwindEncoding(getCompactUnwindEncoding(MF));
 }
 
 void X86FrameLowering::emitEpilogue(MachineFunction &MF,
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 6e309d8..3d3b011 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -20,32 +20,6 @@
 
 namespace llvm {
 
-namespace CU {
-
-  /// Compact unwind encoding values.
-  enum CompactUnwindEncodings {
-    /// [RE]BP based frame where [RE]BP is pused on the stack immediately after
-    /// the return address, then [RE]SP is moved to [RE]BP.
-    UNWIND_MODE_BP_FRAME                   = 0x01000000,
-
-    /// A frameless function with a small constant stack size.
-    UNWIND_MODE_STACK_IMMD                 = 0x02000000,
-
-    /// A frameless function with a large constant stack size.
-    UNWIND_MODE_STACK_IND                  = 0x03000000,
-
-    /// No compact unwind encoding is available.
-    UNWIND_MODE_DWARF                      = 0x04000000,
-
-    /// Mask for encoding the frame registers.
-    UNWIND_BP_FRAME_REGISTERS              = 0x00007FFF,
-
-    /// Mask for encoding the frameless registers.
-    UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
-  };
-
-} // end CU namespace
-
 class MCSymbol;
 class X86TargetMachine;
 
@@ -91,7 +65,6 @@ public:
   int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const;
-  uint32_t getCompactUnwindEncoding(MachineFunction &MF) const;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 9465420..36d1690 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -79,7 +79,8 @@ namespace {
     }
 
     bool hasBaseOrIndexReg() const {
-      return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
+      return BaseType == FrameIndexBase ||
+             IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
     }
 
     /// isRIPRelative - Return true if this addressing mode is already RIP
@@ -183,7 +184,7 @@ namespace {
     SDNode *Select(SDNode *N);
     SDNode *SelectGather(SDNode *N, unsigned Opc);
     SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
-    SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
+    SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT);
 
     bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
     bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -491,8 +492,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
     if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
       continue;
 
-    EVT SrcVT = N->getOperand(0).getValueType();
-    EVT DstVT = N->getValueType(0);
+    MVT SrcVT = N->getOperand(0).getSimpleValueType();
+    MVT DstVT = N->getSimpleValueType(0);
 
     // If any of the sources are vectors, no fp stack involved.
     if (SrcVT.isVector() || DstVT.isVector())
@@ -519,7 +520,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
     // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
     // FPStack has extload and truncstore.  SSE can fold direct loads into other
     // operations.  Based on this, decide what we want to do.
-    EVT MemVT;
+    MVT MemVT;
     if (N->getOpcode() == ISD::FP_ROUND)
       MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
     else
@@ -783,7 +784,7 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
       Mask != (0xffu << ScaleLog))
     return true;
 
-  EVT VT = N.getValueType();
+  MVT VT = N.getSimpleValueType();
   SDLoc DL(N);
   SDValue Eight = DAG.getConstant(8, MVT::i8);
   SDValue NewMask = DAG.getConstant(0xff, VT);
@@ -831,7 +832,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
   if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
     return true;
 
-  EVT VT = N.getValueType();
+  MVT VT = N.getSimpleValueType();
   SDLoc DL(N);
   SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT);
   SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
@@ -904,7 +905,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
 
   // Scale the leading zero count down based on the actual size of the value.
   // Also scale it down based on the size of the shift.
-  MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt;
+  MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
 
   // The final check is to ensure that any masked out high bits of X are
   // already known to be zero. Otherwise, the mask has a semantic impact
@@ -914,23 +915,23 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
   // replace them with zero extensions cheaply if necessary.
   bool ReplacingAnyExtend = false;
   if (X.getOpcode() == ISD::ANY_EXTEND) {
-    unsigned ExtendBits =
-      X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits();
+    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
+                          X.getOperand(0).getSimpleValueType().getSizeInBits();
     // Assume that we'll replace the any-extend with a zero-extend, and
     // narrow the search to the extended value.
     X = X.getOperand(0);
     MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
     ReplacingAnyExtend = true;
   }
-  APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(),
-                                               MaskLZ);
+  APInt MaskedHighBits =
+    APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
   APInt KnownZero, KnownOne;
   DAG.ComputeMaskedBits(X, KnownZero, KnownOne);
   if (MaskedHighBits != KnownZero) return true;
 
   // We've identified a pattern that can be transformed into a single shift
   // and an addressing mode. Make it so.
-  EVT VT = N.getValueType();
+  MVT VT = N.getSimpleValueType();
   if (ReplacingAnyExtend) {
     assert(X.getValueType() != VT);
     // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
@@ -1059,7 +1060,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
 
     // We only handle up to 64-bit values here as those are what matter for
     // addressing mode optimizations.
-    if (X.getValueSizeInBits() > 64) break;
+    if (X.getSimpleValueType().getSizeInBits() > 64) break;
 
     // The mask used for the transform is expected to be post-shift, but we
     // found the shift first so just apply the shift to the mask before passing
@@ -1244,7 +1245,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
 
     // We only handle up to 64-bit values here as those are what matter for
     // addressing mode optimizations.
-    if (X.getValueSizeInBits() > 64) break;
+    if (X.getSimpleValueType().getSizeInBits() > 64) break;
 
     if (!isa<ConstantSDNode>(N.getOperand(1)))
       break;
@@ -1323,7 +1324,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
   if (MatchAddress(N, AM))
     return false;
 
-  EVT VT = N.getValueType();
+  MVT VT = N.getSimpleValueType();
   if (AM.BaseType == X86ISelAddressMode::RegBase) {
     if (!AM.Base_Reg.getNode())
       AM.Base_Reg = CurDAG->getRegister(0, VT);
@@ -1465,7 +1466,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
   assert (T == AM.Segment);
   AM.Segment = Copy;
 
-  EVT VT = N.getValueType();
+  MVT VT = N.getSimpleValueType();
   unsigned Complexity = 0;
   if (AM.BaseType == X86ISelAddressMode::RegBase)
     if (AM.Base_Reg.getNode())
@@ -1706,7 +1707,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
 // + non-empty, otherwise.
 static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
                                                 SDLoc dl,
-                                                enum AtomicOpc &Op, EVT NVT,
+                                                enum AtomicOpc &Op, MVT NVT,
                                                 SDValue Val) {
   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
     int64_t CNVal = CN->getSExtValue();
@@ -1753,7 +1754,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
   return Val;
 }
 
-SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
+SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
   if (Node->hasAnyUseOfValue(0))
     return 0;
 
@@ -1793,7 +1794,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
   bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
 
   unsigned Opc = 0;
-  switch (NVT.getSimpleVT().SimpleTy) {
+  switch (NVT.SimpleTy) {
     default: return 0;
     case MVT::i8:
       if (isCN)
@@ -2047,7 +2048,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
 }
 
 SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
-  EVT NVT = Node->getValueType(0);
+  MVT NVT = Node->getSimpleValueType(0);
   unsigned Opc, MOpc;
   unsigned Opcode = Node->getOpcode();
   SDLoc dl(Node);
@@ -2056,6 +2057,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
 
   if (Node->isMachineOpcode()) {
     DEBUG(dbgs() << "== ";  Node->dump(CurDAG); dbgs() << '\n');
+    Node->setNodeId(-1);
     return NULL;   // Already selected.
   }
 
@@ -2187,7 +2189,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       break;
 
     unsigned ShlOp, Op;
-    EVT CstVT = NVT;
+    MVT CstVT = NVT;
 
     // Check the minimum bitwidth for the new constant.
     // TODO: AND32ri is the same as AND64ri32 with zext imm.
@@ -2202,7 +2204,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     if (NVT == CstVT)
       break;
 
-    switch (NVT.getSimpleVT().SimpleTy) {
+    switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i32:
       assert(CstVT == MVT::i8);
@@ -2239,7 +2241,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     SDValue N1 = Node->getOperand(1);
 
     unsigned LoReg;
-    switch (NVT.getSimpleVT().SimpleTy) {
+    switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i8:  LoReg = X86::AL;  Opc = X86::MUL8r; break;
     case MVT::i16: LoReg = X86::AX;  Opc = X86::MUL16r; break;
@@ -2268,7 +2270,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     bool isSigned = Opcode == ISD::SMUL_LOHI;
     bool hasBMI2 = Subtarget->hasBMI2();
     if (!isSigned) {
-      switch (NVT.getSimpleVT().SimpleTy) {
+      switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
       case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
       case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
@@ -2278,7 +2280,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
                      MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
       }
     } else {
-      switch (NVT.getSimpleVT().SimpleTy) {
+      switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
       case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
       case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
@@ -2415,7 +2417,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
 
     bool isSigned = Opcode == ISD::SDIVREM;
     if (!isSigned) {
-      switch (NVT.getSimpleVT().SimpleTy) {
+      switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
       case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
       case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
@@ -2423,7 +2425,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
       }
     } else {
-      switch (NVT.getSimpleVT().SimpleTy) {
+      switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
       case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
       case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
@@ -2434,7 +2436,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
 
     unsigned LoReg, HiReg, ClrReg;
     unsigned SExtOpcode;
-    switch (NVT.getSimpleVT().SimpleTy) {
+    switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i8:
       LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
@@ -2489,7 +2491,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       } else {
         // Zero out the high part, effectively zero extending the input.
         SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);       
-        switch (NVT.getSimpleVT().SimpleTy) {
+        switch (NVT.SimpleTy) {
         case MVT::i16:
           ClrNode =
               SDValue(CurDAG->getMachineNode(
@@ -2609,7 +2611,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
         // On x86-32, only the ABCD registers have 8-bit subregisters.
         if (!Subtarget->is64Bit()) {
           const TargetRegisterClass *TRC;
-          switch (N0.getValueType().getSimpleVT().SimpleTy) {
+          switch (N0.getSimpleValueType().SimpleTy) {
           case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
           case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
           default: llvm_unreachable("Unsupported TEST operand type!");
@@ -2644,7 +2646,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
 
         // Put the value in an ABCD register.
         const TargetRegisterClass *TRC;
-        switch (N0.getValueType().getSimpleVT().SimpleTy) {
+        switch (N0.getSimpleValueType().SimpleTy) {
         case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
         case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
         case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9ffe29f..081c558 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16,6 +16,7 @@
 #include "X86ISelLowering.h"
 #include "Utils/X86ShuffleDecode.h"
 #include "X86.h"
+#include "X86CallingConv.h"
 #include "X86InstrBuilder.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
@@ -91,7 +92,7 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
                                VecIdx);
 
   return Result;
-  
+
 }
 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
@@ -631,7 +632,7 @@ void X86TargetLowering::resetOperationActions() {
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 
-  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+  if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                        MVT::i64 : MVT::i32, Custom);
   else if (TM.Options.EnableSegmentedStacks)
@@ -1150,9 +1151,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
 
-    setOperationAction(ISD::TRUNCATE,           MVT::v8i16, Custom);
-    setOperationAction(ISD::TRUNCATE,           MVT::v4i32, Custom);
-
     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
@@ -1160,7 +1158,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
 
-    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i32, Custom);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
 
@@ -1193,10 +1190,16 @@ void X86TargetLowering::resetOperationActions() {
 
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
 
     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
@@ -1330,7 +1333,16 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
     setOperationAction(ISD::SDIV,               MVT::v16i32, Custom);
 
-
+    setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
+      setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
+    }
     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
@@ -1390,6 +1402,9 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
+    setOperationAction(ISD::AND,                MVT::v16i32, Legal);
+    setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
+    setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
 
     // Custom lower several nodes.
     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1409,14 +1424,6 @@ void X86TargetLowering::resetOperationActions() {
       if (!VT.is512BitVector())
         continue;
 
-      if (VT != MVT::v8i64) {
-        setOperationAction(ISD::XOR,   VT, Promote);
-        AddPromotedToType (ISD::XOR,   VT, MVT::v8i64);
-        setOperationAction(ISD::OR,    VT, Promote);
-        AddPromotedToType (ISD::OR,    VT, MVT::v8i64);
-        setOperationAction(ISD::AND,   VT, Promote);
-        AddPromotedToType (ISD::AND,   VT, MVT::v8i64);
-      }
       if ( EltSize >= 32) {
         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
@@ -1434,8 +1441,6 @@ void X86TargetLowering::resetOperationActions() {
       if (!VT.is512BitVector())
         continue;
 
-      setOperationAction(ISD::LOAD,   VT, Promote);
-      AddPromotedToType (ISD::LOAD,   VT, MVT::v8i64);
       setOperationAction(ISD::SELECT, VT, Promote);
       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
     }
@@ -1452,6 +1457,7 @@ void X86TargetLowering::resetOperationActions() {
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   // handle type legalization for these operations here.
@@ -1541,7 +1547,16 @@ void X86TargetLowering::resetOperationActions() {
 }
 
 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
-  if (!VT.isVector()) return MVT::i8;
+  if (!VT.isVector())
+    return MVT::i8;
+
+  const TargetMachine &TM = getTargetMachine();
+  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
+    switch(VT.getVectorNumElements()) {
+    case  8: return MVT::v8i1;
+    case 16: return MVT::v16i1;
+    }
+
   return VT.changeVectorElementTypeToInteger();
 }
 
@@ -1750,6 +1765,13 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   return true;
 }
 
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                            unsigned DestAS) const {
+  assert(SrcAS != DestAS && "Expected different address spaces!");
+
+  return SrcAS < 256 && DestAS < 256;
+}
+
 //===----------------------------------------------------------------------===//
 //               Return Value Calling Convention Implementation
 //===----------------------------------------------------------------------===//
@@ -1767,6 +1789,11 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   return CCInfo.CheckReturn(Outs, RetCC_X86);
 }
 
+const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  static const uint16_t ScratchRegs[] = { X86::R11, 0 };
+  return ScratchRegs;
+}
+
 SDValue
 X86TargetLowering::LowerReturn(SDValue Chain,
                                CallingConv::ID CallConv, bool isVarArg,
@@ -3532,7 +3559,7 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
 /// the second operand.
-static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
+static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -3542,7 +3569,7 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
 
 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
     return false;
 
@@ -3571,7 +3598,7 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
 
 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
     return false;
 
@@ -3600,14 +3627,14 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
 
 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PALIGNR.
-static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
+static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
                           const X86Subtarget *Subtarget) {
   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
       (VT.is256BitVector() && !Subtarget->hasInt256()))
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   // Do not handle 64-bit element shuffles with palignr.
@@ -3690,10 +3717,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
 /// reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
-                        bool Commuted = false) {
-  if (!HasFp256 && VT.is256BitVector())
-    return false;
+static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
 
   unsigned NumElems = VT.getVectorNumElements();
   unsigned NumLanes = VT.getSizeInBits()/128;
@@ -3702,6 +3726,10 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
   if (NumLaneElems != 2 && NumLaneElems != 4)
     return false;
 
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  bool symetricMaskRequired =
+    (VT.getSizeInBits() >= 256) && (EltSize == 32);
+
   // VSHUFPSY divides the resulting vector into 4 chunks.
   // The sources are also splitted into 4 chunks, and each destination
   // chunk must come from a different source chunk.
@@ -3721,6 +3749,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
   //
   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   //
+  SmallVector<int, 4> MaskVal(NumLaneElems, -1);
   unsigned HalfLaneElems = NumLaneElems/2;
   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
     for (unsigned i = 0; i != NumLaneElems; ++i) {
@@ -3731,9 +3760,13 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
       // For VSHUFPSY, the mask of the second half must be the same as the
       // first but with the appropriate offsets. This works in the same way as
       // VPERMILPS works with masks.
-      if (NumElems != 8 || l == 0 || Mask[i] < 0)
+      if (!symetricMaskRequired || Idx < 0)
         continue;
-      if (!isUndefOrEqual(Idx, Mask[i]+l))
+      if (MaskVal[i] < 0) {
+        MaskVal[i] = Idx - l;
+        continue;
+      }
+      if ((signed)(Idx - l) != MaskVal[i])
         return false;
     }
   }
@@ -3743,7 +3776,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
 
 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
-static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3762,7 +3795,7 @@ static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
 /// <2, 3, 2, 3>
-static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3779,7 +3812,7 @@ static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
 
 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
-static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3801,7 +3834,7 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
 
 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
-static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3827,7 +3860,7 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
 static
 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
                                SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   SDLoc dl(SVOp);
 
   if (VT != MVT::v8i32 && VT != MVT::v8f32)
@@ -3870,70 +3903,92 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
 
 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
-static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
+static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
                          bool HasInt256, bool V2IsSplat = false) {
-  unsigned NumElts = VT.getVectorNumElements();
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "Unsupported vector type for unpckh");
+  assert(VT.getSizeInBits() >= 128 &&
+         "Unsupported vector type for unpckl");
 
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes;
+  unsigned NumOf256BitLanes;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.is256BitVector()) {
+    if (NumElts != 4 && NumElts != 8 &&
+        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
+    NumLanes = 2;
+    NumOf256BitLanes = 1;
+  } else if (VT.is512BitVector()) {
+    assert(VT.getScalarType().getSizeInBits() >= 32 &&
+           "Unsupported vector type for unpckh");
+    NumLanes = 2;
+    NumOf256BitLanes = 2;
+  } else {
+    NumLanes = 1;
+    NumOf256BitLanes = 1;
+  }
 
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
+  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+  unsigned NumLaneElts = NumEltsInStride/NumLanes;
 
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[l+i];
-      int BitI1 = Mask[l+i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (V2IsSplat) {
-        if (!isUndefOrEqual(BitI1, NumElts))
+  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+      for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+        int BitI  = Mask[l256*NumEltsInStride+l+i];
+        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+        if (!isUndefOrEqual(BitI, j+l256*NumElts))
           return false;
-      } else {
-        if (!isUndefOrEqual(BitI1, j + NumElts))
+        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+          return false;
+        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
           return false;
       }
     }
   }
-
   return true;
 }
 
 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
+static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
                          bool HasInt256, bool V2IsSplat = false) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+  assert(VT.getSizeInBits() >= 128 &&
          "Unsupported vector type for unpckh");
 
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes;
+  unsigned NumOf256BitLanes;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.is256BitVector()) {
+    if (NumElts != 4 && NumElts != 8 &&
+        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
+    NumLanes = 2;
+    NumOf256BitLanes = 1;
+  } else if (VT.is512BitVector()) {
+    assert(VT.getScalarType().getSizeInBits() >= 32 &&
+           "Unsupported vector type for unpckh");
+    NumLanes = 2;
+    NumOf256BitLanes = 2;
+  } else {
+    NumLanes = 1;
+    NumOf256BitLanes = 1;
+  }
 
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
+  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+  unsigned NumLaneElts = NumEltsInStride/NumLanes;
 
-  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
-    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[l+i];
-      int BitI1 = Mask[l+i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (V2IsSplat) {
-        if (isUndefOrEqual(BitI1, NumElts))
+  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+      for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+        int BitI  = Mask[l256*NumEltsInStride+l+i];
+        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+        if (!isUndefOrEqual(BitI, j+l256*NumElts))
           return false;
-      } else {
-        if (!isUndefOrEqual(BitI1, j+NumElts))
+        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+          return false;
+        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
           return false;
       }
     }
@@ -3944,10 +3999,12 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
 /// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   unsigned NumElts = VT.getVectorNumElements();
   bool Is256BitVec = VT.is256BitVector();
 
+  if (VT.is512BitVector())
+    return false;
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for unpckh");
 
@@ -3985,9 +4042,12 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
 /// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   unsigned NumElts = VT.getVectorNumElements();
 
+  if (VT.is512BitVector())
+    return false;
+
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for unpckh");
 
@@ -4040,7 +4100,7 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
 /// The first half comes from the second half of V1 and the second half from the
 /// the second half of V2.
-static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   if (!HasFp256 || !VT.is256BitVector())
     return false;
 
@@ -4072,7 +4132,7 @@ static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
 
   unsigned HalfSize = VT.getVectorNumElements()/2;
 
@@ -4093,6 +4153,44 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
   return (FstHalf | (SndHalf << 4));
 }
 
+// Symetric in-lane mask. Each lane has 4 elements (for imm8)
+static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize < 32)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  Imm8 = 0;
+  if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
+    for (unsigned i = 0; i != NumElts; ++i) {
+      if (Mask[i] < 0)
+        continue;
+      Imm8 |= Mask[i] << (i*2);
+    }
+    return true;
+  }
+
+  unsigned LaneSize = 4;
+  SmallVector<int, 4> MaskVal(LaneSize, -1);
+
+  for (unsigned l = 0; l != NumElts; l += LaneSize) {
+    for (unsigned i = 0; i != LaneSize; ++i) {
+      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+        return false;
+      if (Mask[i+l] < 0)
+        continue;
+      if (MaskVal[i] < 0) {
+        MaskVal[i] = Mask[i+l] - l;
+        Imm8 |= MaskVal[i] << (i*2);
+        continue;
+      }
+      if (Mask[i+l] != (signed)(MaskVal[i]+l))
+        return false;
+    }
+  }
+  return true;
+}
+
 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
 /// Note that VPERMIL mask matching is different depending whether theunderlying
@@ -4100,38 +4198,39 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
 /// to the same elements of the low, but to the higher half of the source.
 /// In VPERMILPD the two lanes could be shuffled independently of each other
 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
-  if (!HasFp256)
+static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (VT.getSizeInBits() < 256 || EltSize < 32)
     return false;
-
+  bool symetricMaskRequired = (EltSize == 32);
   unsigned NumElts = VT.getVectorNumElements();
-  // Only match 256-bit with 32/64-bit types
-  if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
-    return false;
 
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned LaneSize = NumElts/NumLanes;
+  // 2 or 4 elements in one lane
+  
+  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
   for (unsigned l = 0; l != NumElts; l += LaneSize) {
     for (unsigned i = 0; i != LaneSize; ++i) {
       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
         return false;
-      if (NumElts != 8 || l == 0)
-        continue;
-      // VPERMILPS handling
-      if (Mask[i] < 0)
-        continue;
-      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
-        return false;
+      if (symetricMaskRequired) {
+        if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
+          ExpectedMaskVal[i] = Mask[i+l] - l;
+          continue;
+        }
+        if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
+          return false;
+      }
     }
   }
-
   return true;
 }
 
 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
 /// element of vector 2 and the other elements to come from vector 1 in order.
-static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
                                bool V2IsSplat = false, bool V2IsUndef = false) {
   if (!VT.is128BitVector())
     return false;
@@ -4155,7 +4254,7 @@ static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
-static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
+static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
                            const X86Subtarget *Subtarget) {
   if (!Subtarget->hasSSE3())
     return false;
@@ -4163,7 +4262,8 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
   unsigned NumElems = VT.getVectorNumElements();
 
   if ((VT.is128BitVector() && NumElems != 4) ||
-      (VT.is256BitVector() && NumElems != 8))
+      (VT.is256BitVector() && NumElems != 8) ||
+      (VT.is512BitVector() && NumElems != 16))
     return false;
 
   // "i+1" is the value the indexed mask element must have
@@ -4178,7 +4278,7 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
-static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
+static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
                            const X86Subtarget *Subtarget) {
   if (!Subtarget->hasSSE3())
     return false;
@@ -4186,7 +4286,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
   unsigned NumElems = VT.getVectorNumElements();
 
   if ((VT.is128BitVector() && NumElems != 4) ||
-      (VT.is256BitVector() && NumElems != 8))
+      (VT.is256BitVector() && NumElems != 8) ||
+      (VT.is512BitVector() && NumElems != 16))
     return false;
 
   // "i" is the value the indexed mask element must have
@@ -4201,7 +4302,7 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to 256-bit
 /// version of MOVDDUP.
-static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   if (!HasFp256 || !VT.is256BitVector())
     return false;
 
@@ -4221,7 +4322,7 @@ static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to 128-bit
 /// version of MOVDDUP.
-static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -4247,7 +4348,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
 
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   bool Result = (Index * ElSize) % vecWidth == 0;
 
@@ -4265,7 +4366,7 @@ static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
 
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
   bool Result = (Index * ElSize) % vecWidth == 0;
 
@@ -4292,9 +4393,9 @@ bool X86::isVEXTRACT256Index(SDNode *N) {
 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
 /// Handles 128-bit and 256-bit.
 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+  assert((VT.getSizeInBits() >= 128) &&
          "Unsupported vector type for PSHUF/SHUFP");
 
   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
@@ -4303,10 +4404,10 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
-  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
-         "Only supports 2 or 4 elements per lane");
+  assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
+         "Only supports 2, 4 or 8 elements per lane");
 
-  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
+  unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
   unsigned Mask = 0;
   for (unsigned i = 0; i != NumElts; ++i) {
     int Elt = N->getMaskElt(i);
@@ -4322,7 +4423,7 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
 
   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
          "Unsupported vector type for PSHUFHW");
@@ -4346,7 +4447,7 @@ static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
 
   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
          "Unsupported vector type for PSHUFHW");
@@ -4370,11 +4471,12 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+  MVT VT = SVOp->getSimpleValueType(0);
+  unsigned EltSize = VT.is512BitVector() ? 1 :
+    VT.getVectorElementType().getSizeInBits() >> 3;
 
   unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   int Val = 0;
@@ -4399,7 +4501,7 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
 
-  MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
+  MVT VecVT = N->getOperand(0).getSimpleValueType();
   MVT ElVT = VecVT.getVectorElementType();
 
   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
@@ -4414,7 +4516,7 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
 
-  MVT VecVT = N->getValueType(0).getSimpleVT();
+  MVT VecVT = N->getSimpleValueType(0);
   MVT ElVT = VecVT.getVectorElementType();
 
   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
@@ -4449,27 +4551,6 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
   return getInsertVINSERTImmediate(N, 256);
 }
 
-/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
-/// Handles 256-bit.
-static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
-
-  unsigned NumElts = VT.getVectorNumElements();
-
-  assert((VT.is256BitVector() && NumElts == 4) &&
-         "Unsupported vector type for VPERMQ/VPERMPD");
-
-  unsigned Mask = 0;
-  for (unsigned i = 0; i != NumElts; ++i) {
-    int Elt = N->getMaskElt(i);
-    if (Elt < 0)
-      continue;
-    Mask |= Elt << (i*2);
-  }
-
-  return Mask;
-}
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
@@ -4484,7 +4565,7 @@ bool X86::isZeroNode(SDValue Elt) {
 /// their permute mask.
 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                     SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> MaskVec;
 
@@ -4506,7 +4587,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
 /// match movhlps. The lower half elements should come from upper half of
 /// V1 (and in order), and the upper half elements should come from the upper
 /// half of V2 (and in order).
-static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
   if (VT.getVectorNumElements() != 4)
@@ -4563,7 +4644,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) {
 /// half of V2 (and in order). And since V1 will become the source of the
 /// MOVLP, it must be either a vector load or a scalar load to vector.
 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
-                               ArrayRef<int> Mask, EVT VT) {
+                               ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -4659,6 +4740,11 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
                         array_lengthof(Ops));
     }
+  } else if (VT.is512BitVector()) { // AVX-512
+      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+                        Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
   } else
     llvm_unreachable("Unexpected vector type");
 
@@ -4715,7 +4801,7 @@ static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
 }
 
 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
                           SDValue V2) {
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> Mask;
@@ -4727,7 +4813,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
 }
 
 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
                           SDValue V2) {
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> Mask;
@@ -4743,7 +4829,7 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
 // Generate shuffles which repeat i16 and i8 several times until they can be
 // represented by v4f32 and then be manipulated by target suported shuffles.
 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
-  EVT VT = V.getValueType();
+  MVT VT = V.getSimpleValueType();
   int NumElems = VT.getVectorNumElements();
   SDLoc dl(V);
 
@@ -4761,7 +4847,7 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
 
 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
-  EVT VT = V.getValueType();
+  MVT VT = V.getSimpleValueType();
   SDLoc dl(V);
 
   if (VT.is128BitVector()) {
@@ -4787,7 +4873,7 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
 
 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
-  EVT SrcVT = SV->getValueType(0);
+  MVT SrcVT = SV->getSimpleValueType(0);
   SDValue V1 = SV->getOperand(0);
   SDLoc dl(SV);
 
@@ -4810,7 +4896,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   // instruction because the target has no such instruction. Generate shuffles
   // which repeat i16 and i8 several times until they fit in i32, and then can
   // be manipulated by target suported shuffles.
-  EVT EltVT = SrcVT.getVectorElementType();
+  MVT EltVT = SrcVT.getVectorElementType();
   if (EltVT == MVT::i8 || EltVT == MVT::i16)
     V1 = PromoteSplati8i16(V1, DAG, EltNo);
 
@@ -4832,7 +4918,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                            bool IsZero,
                                            const X86Subtarget *Subtarget,
                                            SelectionDAG &DAG) {
-  EVT VT = V2.getValueType();
+  MVT VT = V2.getSimpleValueType();
   SDValue V1 = IsZero
     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
   unsigned NumElems = VT.getVectorNumElements();
@@ -4950,7 +5036,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
 
   // Recurse into target specific vector shuffles to find scalars.
   if (isTargetShuffle(Opcode)) {
-    MVT ShufVT = V.getValueType().getSimpleVT();
+    MVT ShufVT = V.getSimpleValueType();
     unsigned NumElems = ShufVT.getVectorNumElements();
     SmallVector<int, 16> ShuffleMask;
     bool IsUnary;
@@ -5048,7 +5134,8 @@ bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
 /// logical left shift of a vector.
 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
-  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+  unsigned NumElems =
+    SVOp->getSimpleValueType(0).getVectorNumElements();
   unsigned NumZeros = getNumOfConsecutiveZeros(
       SVOp, NumElems, false /* check zeros from right */, DAG,
       SVOp->getMaskElt(0));
@@ -5082,7 +5169,8 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
 /// logical left shift of a vector.
 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
-  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+  unsigned NumElems =
+    SVOp->getSimpleValueType(0).getVectorNumElements();
   unsigned NumZeros = getNumOfConsecutiveZeros(
       SVOp, NumElems, true /* check zeros from left */, DAG,
       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
@@ -5118,7 +5206,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   // Although the logic below support any bitwidth size, there are no
   // shift instructions which handle more than 128-bit vectors.
-  if (!SVOp->getValueType(0).is128BitVector())
+  if (!SVOp->getSimpleValueType(0).is128BitVector())
     return false;
 
   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
@@ -5223,9 +5311,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
 }
 
-SDValue
-X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
-                                          SelectionDAG &DAG) const {
+static SDValue
+LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
 
   // Check if the scalar load can be widened into a vector load. And if
   // the address is "base + cst" see if the cst can be "absorbed" into
@@ -5288,7 +5375,10 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
                              LD->getPointerInfo().getWithOffset(StartOffset),
                              false, false, false, 0);
 
-    SmallVector<int, 8> Mask(NumElems, EltNo);
+    SmallVector<int, 8> Mask;
+    for (unsigned i = 0; i != NumElems; ++i)
+      Mask.push_back(EltNo);
+
     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
   }
 
@@ -5305,7 +5395,8 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
 /// There's even a handy isZeroNode for that purpose.
 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
-                                        SDLoc &DL, SelectionDAG &DAG) {
+                                        SDLoc &DL, SelectionDAG &DAG,
+                                        bool isAfterLegalize) {
   EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
@@ -5341,7 +5432,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   // load of the entire vector width starting at the base pointer.  If we found
   // consecutive loads for the low half, generate a vzext_load node.
   if (LastLoadedElt == NumElems - 1) {
+
+    if (isAfterLegalize &&
+        !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+      return SDValue();
+
     SDValue NewLd = SDValue();
+
     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                           LDBase->getPointerInfo(),
@@ -5398,12 +5495,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
 /// a scalar load, or a constant.
 /// The VBROADCAST node is returned when a pattern is found,
 /// or SDValue() otherwise.
-SDValue
-X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
+                                    SelectionDAG &DAG) {
   if (!Subtarget->hasFp256())
     return SDValue();
 
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
@@ -5450,7 +5547,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
           return SDValue();
 
         // Use the register form of the broadcast instruction available on AVX2.
-        if (VT.is256BitVector())
+        if (VT.getSizeInBits() >= 256)
           Sc = Extract128BitVector(Sc, 0, DAG, dl);
         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
       }
@@ -5492,7 +5589,8 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
 
       assert(C && "Invalid constant type");
 
-      SDValue CP = DAG.getConstantPool(C, getPointerTy());
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
                        MachinePointerInfo::getConstantPool(),
@@ -5528,12 +5626,12 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
-SDValue
-X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
+static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
 
   // Skip if insert_vec_elt is not supported.
-  if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
     return SDValue();
 
   SDLoc DL(Op);
@@ -5606,7 +5704,7 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
 SDValue
 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
 
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
          "Unexpected type in LowerBUILD_VECTORvXi1!");
 
@@ -5645,13 +5743,16 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
       DAG.getConstant(Immediate, MVT::i16));
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
-		       DAG.getIntPtrConstant(0));
+                       DAG.getIntPtrConstant(0));
   }
 
-  if (!isSplatVector(Op.getNode()))
-    llvm_unreachable("Unsupported predicate operation");
-
+  // Splat vector (with undefs)
   SDValue In = Op.getOperand(0);
+  for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) {
+    if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF)
+      llvm_unreachable("Unsupported predicate operation");
+  }
+
   SDValue EFLAGS, X86CC;
   if (In.getOpcode() == ISD::SETCC) {
     SDValue Op0 = In.getOperand(0);
@@ -5679,7 +5780,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
     //     res = allOnes ### CMOVNE -1, %res
     //   else
     //     res = allZero
-    MVT InVT = In.getValueType().getSimpleVT();
+    MVT InVT = In.getSimpleValueType();
     SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
     EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
     X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
@@ -5708,7 +5809,7 @@ SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT ExtVT = VT.getVectorElementType();
   unsigned NumElems = Op.getNumOperands();
 
@@ -5720,7 +5821,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
-    if (VT == MVT::v4i32 || VT == MVT::v8i32)
+    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
       return Op;
 
     return getZeroVector(VT, Subtarget, DAG, dl);
@@ -5733,10 +5834,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
       return Op;
 
-    return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+    if (!VT.is512BitVector())
+      return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
   }
 
-  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+  SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
   if (Broadcast.getNode())
     return Broadcast;
 
@@ -5815,7 +5917,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
-        if (VT.is256BitVector()) {
+        if (VT.is256BitVector() || VT.is512BitVector()) {
           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
                              Item, DAG.getIntPtrConstant(0));
@@ -5980,7 +6082,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       V[i] = Op.getOperand(i);
 
     // Check for elements which are consecutive loads.
-    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
     if (LD.getNode())
       return LD;
 
@@ -6044,7 +6146,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 // to create 256-bit vectors from two other 128-bit ones.
 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   SDLoc dl(Op);
-  MVT ResVT = Op.getValueType().getSimpleVT();
+  MVT ResVT = Op.getSimpleValueType();
 
   assert((ResVT.is256BitVector() ||
           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
@@ -6073,10 +6175,14 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   SDLoc dl(SVOp);
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   MVT EltVT = VT.getVectorElementType();
   unsigned NumElems = VT.getVectorNumElements();
 
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
     return SDValue();
   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
@@ -6382,10 +6488,10 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
 // 1. [ssse3] 1 x pshufb
 // 2. [ssse3] 2 x pshufb + 1 x por
 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
-static
-SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
-                                 SelectionDAG &DAG,
-                                 const X86TargetLowering &TLI) {
+static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+                                        const X86Subtarget* Subtarget,
+                                        SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   SDLoc dl(SVOp);
@@ -6401,7 +6507,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   // present, fall back to case 3.
 
   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
-  if (TLI.getSubtarget()->hasSSSE3()) {
+  if (Subtarget->hasSSSE3()) {
     SmallVector<SDValue,16> pshufbMask;
 
     // If all result elements are from one input vector, then only translate
@@ -6514,7 +6620,7 @@ static
 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
                                  const X86Subtarget *Subtarget,
                                  SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   SDLoc dl(SVOp);
@@ -6562,7 +6668,7 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
 static
 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                  SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   SDLoc dl(SVOp);
   unsigned NumElems = VT.getVectorNumElements();
   MVT NewVT;
@@ -6599,7 +6705,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
 
 /// getVZextMovL - Return a zero-extending vector move low node.
 ///
-static SDValue getVZextMovL(MVT VT, EVT OpVT,
+static SDValue getVZextMovL(MVT VT, MVT OpVT,
                             SDValue SrcOp, SelectionDAG &DAG,
                             const X86Subtarget *Subtarget, SDLoc dl) {
   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
@@ -6641,7 +6747,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   if (NewOp.getNode())
     return NewOp;
 
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
 
   unsigned NumElems = VT.getVectorNumElements();
   unsigned NumLaneElems = NumElems / 2;
@@ -6753,7 +6859,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
   SDLoc dl(SVOp);
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
 
   assert(VT.is128BitVector() && "Unsupported vector size");
 
@@ -6904,7 +7010,7 @@ static bool MayFoldVectorLoad(SDValue V) {
 
 static
 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   // Canonizalize to v2f64.
   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
@@ -6918,7 +7024,7 @@ SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
                         bool HasSSE2) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   assert(VT != MVT::v2i64 && "unsupported shuffle type");
 
@@ -6936,7 +7042,7 @@ static
 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
          "unsupported shuffle type");
@@ -6952,7 +7058,7 @@ static
 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
   unsigned NumElems = VT.getVectorNumElements();
 
   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
@@ -7006,13 +7112,13 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
 }
 
 // Reduce a vector shuffle to zext.
-SDValue
-X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
   // PMOVZX is only available from SSE41.
   if (!Subtarget->hasSSE41())
     return SDValue();
 
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   // Only AVX2 support 256-bit vector integer extending.
   if (!Subtarget->hasInt256() && VT.is256BitVector())
@@ -7051,12 +7157,11 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
       return SDValue();
   }
 
-  LLVMContext *Context = DAG.getContext();
   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
-  EVT NeVT = EVT::getIntegerVT(*Context, NBits);
-  EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
+  MVT NeVT = MVT::getIntegerVT(NBits);
+  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
 
-  if (!isTypeLegal(NVT))
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
     return SDValue();
 
   // Simplify the operand as it's prepared to be fed into shuffle.
@@ -7064,8 +7169,8 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
   if (V1.getOpcode() == ISD::BITCAST &&
       V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
       V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-      V1.getOperand(0)
-        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+      V1.getOperand(0).getOperand(0)
+        .getSimpleValueType().getSizeInBits() == SignificantBits) {
     // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
     SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
     ConstantSDNode *CIdx =
@@ -7074,19 +7179,19 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
     // selection to fold it. Otherwise, we will short the conversion sequence.
     if (CIdx && CIdx->getZExtValue() == 0 &&
         (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
-      if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
+      MVT FullVT = V.getSimpleValueType();
+      MVT V1VT = V1.getSimpleValueType();
+      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
         // The "ext_vec_elt" node is wider than the result node.
         // In this case we should extract subvector from V.
         // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
-        unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
-        EVT FullVT = V.getValueType();
-        EVT SubVecVT = EVT::getVectorVT(*Context,
-                                        FullVT.getVectorElementType(),
+        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
+        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
                                         FullVT.getVectorNumElements()/Ratio);
         V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
                         DAG.getIntPtrConstant(0));
       }
-      V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
     }
   }
 
@@ -7094,10 +7199,11 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
 }
 
-SDValue
-X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
+static SDValue
+NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+                       SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
@@ -7108,13 +7214,13 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
   // Handle splat operations
   if (SVOp->isSplat()) {
     // Use vbroadcast whenever the splat comes from a foldable load
-    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+    SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
     if (Broadcast.getNode())
       return Broadcast;
   }
 
   // Check integer expanding shuffles.
-  SDValue NewOp = LowerVectorIntExtend(Op, DAG);
+  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
   if (NewOp.getNode())
     return NewOp;
 
@@ -7132,7 +7238,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
       if (NewOp.getNode()) {
-        MVT NewVT = NewOp.getValueType().getSimpleVT();
+        MVT NewVT = NewOp.getSimpleValueType();
         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
                                NewVT, true, false))
           return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
@@ -7141,7 +7247,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
       if (NewOp.getNode()) {
-        MVT NewVT = NewOp.getValueType().getSimpleVT();
+        MVT NewVT = NewOp.getSimpleValueType();
         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
           return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
                               DAG, Subtarget, dl);
@@ -7156,7 +7262,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
   unsigned NumElems = VT.getVectorNumElements();
   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
@@ -7194,7 +7300,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   // Normalize the input vectors. Here splats, zeroed vectors, profitable
   // narrowing and commutation of operands should be handled. The actual code
   // doesn't include all of those, work in progress...
-  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
+  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
   if (NewOp.getNode())
     return NewOp;
 
@@ -7354,7 +7460,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
     return CommuteVectorShuffle(SVOp, DAG);
 
   // The checks below are all present in isShuffleMaskLegal, but they are
@@ -7377,7 +7483,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                 getShufflePSHUFLWImmediate(SVOp),
                                 DAG);
 
-  if (isSHUFPMask(M, VT, HasFp256))
+  if (isSHUFPMask(M, VT))
     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
                                 getShuffleSHUFImmediate(SVOp), DAG);
 
@@ -7396,8 +7502,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
 
   // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT, HasFp256)) {
-    if (HasInt256 && VT == MVT::v8i32)
+  if (isVPERMILPMask(M, VT)) {
+    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                   getShuffleSHUFImmediate(SVOp), DAG);
     return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
@@ -7413,21 +7519,28 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (BlendOp.getNode())
     return BlendOp;
 
-  if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
-    SmallVector<SDValue, 8> permclMask;
-    for (unsigned i = 0; i != 8; ++i) {
-      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
+  unsigned Imm8;
+  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
+    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
+
+  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
+      VT.is512BitVector()) {
+    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
+    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
+    SmallVector<SDValue, 16> permclMask;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
     }
-    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
-                               &permclMask[0], 8);
-    // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
-    return DAG.getNode(X86ISD::VPERMV, dl, VT,
-                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
-  }
 
-  if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
-    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
-                                getShuffleCLImmediate(SVOp), DAG);
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
+                                &permclMask[0], NumElems);
+    if (V2IsUndef)
+      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
+      return DAG.getNode(X86ISD::VPERMV, dl, VT,
+                          DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
+    return DAG.getNode(X86ISD::VPERMV3, dl, VT,
+                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
+  }
 
   //===--------------------------------------------------------------------===//
   // Since no target specific shuffle was selected for this generic one,
@@ -7443,7 +7556,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (VT == MVT::v16i8) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
+    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
     if (NewOp.getNode())
       return NewOp;
   }
@@ -7467,10 +7580,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
 }
 
 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
-  if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
+  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
     return SDValue();
 
   if (VT.getSizeInBits() == 8) {
@@ -7532,21 +7645,38 @@ SDValue
 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDLoc dl(Op);
-  if (!isa<ConstantSDNode>(Op.getOperand(1)))
-    return SDValue();
-
   SDValue Vec = Op.getOperand(0);
-  MVT VecVT = Vec.getValueType().getSimpleVT();
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = Op.getOperand(1);
+  if (!isa<ConstantSDNode>(Idx)) {
+    if (VecVT.is512BitVector() ||
+        (VecVT.is256BitVector() && Subtarget->hasInt256() &&
+         VecVT.getVectorElementType().getSizeInBits() == 32)) {
+
+      MVT MaskEltVT =
+        MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
+      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
+                                    MaskEltVT.getSizeInBits());
+      
+      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
+                                getZeroVector(MaskVT, Subtarget, DAG, dl),
+                                Idx, DAG.getConstant(0, getPointerTy()));
+      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
+                        Perm, DAG.getConstant(0, getPointerTy()));
+    }
+    return SDValue();
+  }
 
   // If this is a 256-bit vector result, first extract the 128-bit vector and
   // then extract the element from the 128-bit vector.
   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
-    SDValue Idx = Op.getOperand(1);
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
 
+    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
     // Get the 128-bit vector.
     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
-    EVT EltVT = VecVT.getVectorElementType();
+    MVT EltVT = VecVT.getVectorElementType();
 
     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
 
@@ -7565,7 +7695,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
       return Res;
   }
 
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   // TODO: handle v16i8.
   if (VT.getSizeInBits() == 16) {
     SDValue Vec = Op.getOperand(0);
@@ -7592,7 +7722,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 
     // SHUFPS the element to the lowest double word, then movss.
     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
-    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+    MVT VVT = Op.getOperand(0).getSimpleValueType();
     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                        DAG.getUNDEF(VVT), Mask);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
@@ -7611,7 +7741,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
     int Mask[2] = { 1, -1 };
-    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+    MVT VVT = Op.getOperand(0).getSimpleValueType();
     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                        DAG.getUNDEF(VVT), Mask);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
@@ -7622,7 +7752,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 }
 
 static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
   SDLoc dl(Op);
 
@@ -7676,7 +7806,7 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
 
 SDValue
 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
 
   SDLoc dl(Op);
@@ -7724,17 +7854,15 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
 }
 
 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
-  LLVMContext *Context = DAG.getContext();
   SDLoc dl(Op);
-  MVT OpVT = Op.getValueType().getSimpleVT();
+  MVT OpVT = Op.getSimpleValueType();
 
   // If this is a 256-bit vector result, first insert into a 128-bit
   // vector and then insert into the 256-bit vector.
   if (!OpVT.is128BitVector()) {
     // Insert into a 128-bit vector.
     unsigned SizeFactor = OpVT.getSizeInBits()/128;
-    EVT VT128 = EVT::getVectorVT(*Context,
-                                 OpVT.getVectorElementType(),
+    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                  OpVT.getVectorNumElements() / SizeFactor);
 
     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
@@ -7762,8 +7890,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
   SDValue In =  Op.getOperand(0);
   SDValue Idx = Op.getOperand(1);
   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-  EVT ResVT   = Op.getValueType();
-  EVT InVT    = In.getValueType();
+  MVT ResVT   = Op.getSimpleValueType();
+  MVT InVT    = In.getSimpleValueType();
 
   if (Subtarget->hasFp256()) {
     if (ResVT.is128BitVector() &&
@@ -7790,16 +7918,16 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
     SDValue SubVec = Op.getNode()->getOperand(1);
     SDValue Idx = Op.getNode()->getOperand(2);
 
-    if ((Op.getNode()->getValueType(0).is256BitVector() ||
-         Op.getNode()->getValueType(0).is512BitVector()) &&
-        SubVec.getNode()->getValueType(0).is128BitVector() &&
+    if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
+         Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
+        SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
         isa<ConstantSDNode>(Idx)) {
       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
       return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
     }
 
-    if (Op.getNode()->getValueType(0).is512BitVector() &&
-        SubVec.getNode()->getValueType(0).is256BitVector() &&
+    if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
+        SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
         isa<ConstantSDNode>(Idx)) {
       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
       return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
@@ -8108,10 +8236,9 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                          is64Bit ? 257 : 256));
 
-  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
-                                      DAG.getIntPtrConstant(0),
-                                      MachinePointerInfo(Ptr),
-                                      false, false, false, 0);
+  SDValue ThreadPointer =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
+                  MachinePointerInfo(Ptr), false, false, false, 0);
 
   unsigned char OperandFlags = 0;
   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
@@ -8133,21 +8260,20 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   // emit "addl x@ntpoff,%eax" (local exec)
   // or "addl x@indntpoff,%eax" (initial exec)
   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                           GA->getValueType(0),
-                                           GA->getOffset(), OperandFlags);
+  SDValue TGA =
+      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                 GA->getOffset(), OperandFlags);
   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
 
   if (model == TLSModel::InitialExec) {
     if (isPIC && !is64Bit) {
       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
-                          DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                            Offset);
     }
 
     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
-                         MachinePointerInfo::getGOT(), false, false, false,
-                         0);
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
   }
 
   // The address of the thread local variable is the add of the thread
@@ -8305,6 +8431,11 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt  = Op.getOperand(2);
+  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+  // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
+  // during isel.
+  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                  DAG.getConstant(VTBits - 1, MVT::i8));
   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                      DAG.getConstant(VTBits - 1, MVT::i8))
                        : DAG.getConstant(0, VT);
@@ -8312,12 +8443,15 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
   SDValue Tmp2, Tmp3;
   if (Op.getOpcode() == ISD::SHL_PARTS) {
     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
   } else {
     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
-    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
   }
 
+  // If the shift amount is larger or equal than the width of a part we can't
+  // rely on the results of shld/shrd. Insert a test and select the appropriate
+  // values for large shift amounts.
   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                 DAG.getConstant(VTBits, MVT::i8));
   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
@@ -8750,9 +8884,9 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
 
 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                               const X86Subtarget *Subtarget) {
-  MVT VT = Op->getValueType(0).getSimpleVT();
+  MVT VT = Op->getSimpleValueType(0);
   SDValue In = Op->getOperand(0);
-  MVT InVT = In.getValueType().getSimpleVT();
+  MVT InVT = In.getSimpleValueType();
   SDLoc dl(Op);
 
   // Optimize vectors in AVX mode:
@@ -8768,7 +8902,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   //   Concat upper and lower parts.
   //
 
-  if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
+      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
     return SDValue();
 
@@ -8790,8 +8925,39 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
 
-SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
-                                           SelectionDAG &DAG) const {
+static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
+                                        SelectionDAG &DAG) {
+  MVT VT = Op->getValueType(0).getSimpleVT();
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getValueType().getSimpleVT();
+  SDLoc DL(Op);
+  unsigned int NumElts = VT.getVectorNumElements();
+  if (NumElts != 8 && NumElts != 16)
+    return SDValue();
+
+  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // Now we have only mask extension
+  assert(InVT.getVectorElementType() == MVT::i1);
+  SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
+  const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, Alignment);
+
+  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
+  if (VT.is512BitVector())
+    return Brcst;
+  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
+}
+
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                               SelectionDAG &DAG) {
   if (Subtarget->hasFp256()) {
     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
     if (Res.getNode())
@@ -8800,12 +8966,16 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
 
   return SDValue();
 }
-SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
-                                            SelectionDAG &DAG) const {
+
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                                SelectionDAG &DAG) {
   SDLoc DL(Op);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   SDValue In = Op.getOperand(0);
-  MVT SVT = In.getValueType().getSimpleVT();
+  MVT SVT = In.getSimpleValueType();
+
+  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
+    return LowerZERO_EXTEND_AVX512(Op, DAG);
 
   if (Subtarget->hasFp256()) {
     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
@@ -8813,33 +8983,44 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
       return Res;
   }
 
-  if (!VT.is256BitVector() || !SVT.is128BitVector() ||
-      VT.getVectorNumElements() != SVT.getVectorNumElements())
-    return SDValue();
-
-  assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
-
-  // AVX2 has better support of integer extending.
-  if (Subtarget->hasInt256())
-    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
-
-  SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
-  static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
-  SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
-                           DAG.getVectorShuffle(MVT::v8i16, DL, In,
-                                                DAG.getUNDEF(MVT::v8i16),
-                                                &Mask[0]));
-
-  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
+  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
+         VT.getVectorNumElements() != SVT.getVectorNumElements());
+  return SDValue();
 }
 
 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();  
   SDValue In = Op.getOperand(0);
-  MVT SVT = In.getValueType().getSimpleVT();
-
-  if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
+  MVT InVT = In.getSimpleValueType();
+  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+         "Invalid TRUNCATE operation");
+
+  if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
+    if (VT.getVectorElementType().getSizeInBits() >=8)
+      return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+
+    assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+    unsigned NumElts = InVT.getVectorNumElements();
+    assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
+    if (InVT.getSizeInBits() < 512) {
+      MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
+      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+      InVT = ExtVT;
+    }
+    SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
+    const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+    SDValue CP = DAG.getConstantPool(C, getPointerTy());
+    unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+    SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, Alignment);
+    SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
+    SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
+    return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
+  }
+
+  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
     if (Subtarget->hasInt256()) {
       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
@@ -8870,7 +9051,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
   }
 
-  if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
+  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
     if (Subtarget->hasInt256()) {
       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
@@ -8928,11 +9109,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Handle truncation of V256 to V128 using shuffles.
-  if (!VT.is128BitVector() || !SVT.is256BitVector())
+  if (!VT.is128BitVector() || !InVT.is256BitVector())
     return SDValue();
 
-  assert(VT.getVectorNumElements() != SVT.getVectorNumElements() &&
-         "Invalid op");
   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
 
   unsigned NumElems = VT.getVectorNumElements();
@@ -8952,7 +9131,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
                                            SelectionDAG &DAG) const {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   if (VT.isVector()) {
     if (VT == MVT::v8i16)
       return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
@@ -8996,9 +9175,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
 
 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   SDValue In = Op.getOperand(0);
-  MVT SVT = In.getValueType().getSimpleVT();
+  MVT SVT = In.getSimpleValueType();
 
   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
 
@@ -9010,7 +9189,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
 SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
   LLVMContext *Context = DAG.getContext();
   SDLoc dl(Op);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT;
   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
   if (VT.isVector()) {
@@ -9044,7 +9223,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
 SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
   LLVMContext *Context = DAG.getContext();
   SDLoc dl(Op);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT;
   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
   if (VT.isVector()) {
@@ -9065,7 +9244,7 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, Alignment);
   if (VT.isVector()) {
-    MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+    MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
     return DAG.getNode(ISD::BITCAST, dl, VT,
                        DAG.getNode(ISD::XOR, dl, XORVT,
                                    DAG.getNode(ISD::BITCAST, dl, XORVT,
@@ -9081,8 +9260,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDLoc dl(Op);
-  MVT VT = Op.getValueType().getSimpleVT();
-  MVT SrcVT = Op1.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
+  MVT SrcVT = Op1.getSimpleValueType();
 
   // If second operand is smaller, extend it first.
   if (SrcVT.bitsLT(VT)) {
@@ -9158,7 +9337,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
   SDValue N0 = Op.getOperand(0);
   SDLoc dl(Op);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
 
   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
@@ -9168,8 +9347,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
 
 // LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
 //
-SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
-                                                  SelectionDAG &DAG) const {
+static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
 
   if (!Subtarget->hasSSE41())
@@ -9294,7 +9473,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
   unsigned NumOperands = 0;
 
   // Truncate operations may prevent the merge of the SETCC instruction
-  // and the arithmetic intruction before it. Attempt to truncate the operands
+  // and the arithmetic instruction before it. Attempt to truncate the operands
   // of the arithmetic instruction and use a reduced bit-width instruction.
   bool NeedTruncation = false;
   SDValue ArithOp = Op;
@@ -9402,7 +9581,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
     case ISD::AND: Opcode = X86ISD::AND; break;
     case ISD::OR: {
       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
-        SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+        SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
         if (EFLAGS.getNode())
           return EFLAGS;
       }
@@ -9638,7 +9817,7 @@ static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
 // ones, and then concatenate the result back.
 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
 
   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
          "Unsupported value type for operation");
@@ -9665,25 +9844,62 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
 }
 
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  MVT VT = Op.getSimpleValueType();
+
+  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
+         Op.getValueType().getScalarType() == MVT::i1 &&
+         "Cannot set masked compare for this operation");
+
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  SDLoc dl(Op);
+
+  bool Unsigned = false;
+  unsigned SSECC;
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:  SSECC = 4; break;
+  case ISD::SETEQ:  SSECC = 0; break;
+  case ISD::SETUGT: Unsigned = true;
+  case ISD::SETGT:  SSECC = 6; break; // NLE
+  case ISD::SETULT: Unsigned = true;
+  case ISD::SETLT:  SSECC = 1; break;
+  case ISD::SETUGE: Unsigned = true;
+  case ISD::SETGE:  SSECC = 5; break; // NLT
+  case ISD::SETULE: Unsigned = true;
+  case ISD::SETLE:  SSECC = 2; break;
+  }
+  unsigned  Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
+  return DAG.getNode(Opc, dl, VT, Op0, Op1,
+                     DAG.getConstant(SSECC, MVT::i8));
+
+}
+
 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                            SelectionDAG &DAG) {
-  SDValue Cond;
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDValue CC = Op.getOperand(2);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
-  bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint();
+  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
   SDLoc dl(Op);
 
   if (isFP) {
 #ifndef NDEBUG
-    MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT();
+    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
 #endif
 
     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
-
+    unsigned Opc = X86ISD::CMPP;
+    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+      assert(VT.getVectorNumElements() <= 16);
+      Opc = X86ISD::CMPM;
+    }
     // In the two special cases we can't handle, emit two comparisons.
     if (SSECC == 8) {
       unsigned CC0, CC1;
@@ -9695,14 +9911,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
       }
 
-      SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                  DAG.getConstant(CC0, MVT::i8));
-      SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                  DAG.getConstant(CC1, MVT::i8));
       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
     }
     // Handle all other FP comparisons here.
-    return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                        DAG.getConstant(SSECC, MVT::i8));
   }
 
@@ -9710,6 +9926,24 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
   if (VT.is256BitVector() && !Subtarget->hasInt256())
     return Lower256IntVSETCC(Op, DAG);
 
+  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
+  EVT OpVT = Op1.getValueType();
+  if (Subtarget->hasAVX512()) {
+    if (Op1.getValueType().is512BitVector() ||
+        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
+      return LowerIntVSETCC_AVX512(Op, DAG);
+
+    // In AVX-512 architecture setcc returns mask with i1 elements,
+    // But there is no compare instruction for i8 and i16 elements.
+    // We are not talking about 512-bit operands in this case, these
+    // types are illegal.
+    if (MaskResult &&
+        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
+         OpVT.getVectorElementType().getSizeInBits() >= 8))
+      return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+  }
+
   // We are handling one of the integer comparisons here.  Since SSE only has
   // GT and EQ comparisons for integer, swapping operands and multiple
   // operations may be required for some comparisons.
@@ -9719,15 +9953,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
   switch (SetCCOpcode) {
   default: llvm_unreachable("Unexpected SETCC condition");
   case ISD::SETNE:  Invert = true;
-  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
+  case ISD::SETEQ:  Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break;
   case ISD::SETLT:  Swap = true;
-  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
+  case ISD::SETGT:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break;
   case ISD::SETGE:  Swap = true;
-  case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
+  case ISD::SETLE:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+                    Invert = true; break;
   case ISD::SETULT: Swap = true;
-  case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
+  case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+                    FlipSigns = true; break;
   case ISD::SETUGE: Swap = true;
-  case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
+  case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+                    FlipSigns = true; Invert = true; break;
   }
   
   // Special case: Use min/max operations for SETULE/SETUGE
@@ -9841,7 +10078,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
 
 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
 
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
 
   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
 
@@ -9885,7 +10122,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
     }
   }
 
-  bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint();
+  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
   if (X86CC == X86::COND_INVALID)
     return SDValue();
@@ -10040,7 +10277,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 
     SDValue Cmp = Cond.getOperand(1);
     unsigned Opc = Cmp.getOpcode();
-    MVT VT = Op.getValueType().getSimpleVT();
+    MVT VT = Op.getSimpleValueType();
 
     bool IllegalFPCMov = false;
     if (VT.isFloatingPoint() && !VT.isVector() &&
@@ -10149,15 +10386,50 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
 }
 
-SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  MVT VT = Op->getValueType(0).getSimpleVT();
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
   SDValue In = Op->getOperand(0);
-  MVT InVT = In.getValueType().getSimpleVT();
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  unsigned int NumElts = VT.getVectorNumElements();
+  if (NumElts != 8 && NumElts != 16)
+    return SDValue();
+
+  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+
+  MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
+  Constant *C = ConstantInt::get(*DAG.getContext(),
+    APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
+
+  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+  SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
+                          MachinePointerInfo::getConstantPool(),
+                          false, false, false, Alignment);
+  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
+  if (VT.is512BitVector())
+    return Brcst;
+  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
+}
+
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                                SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
   SDLoc dl(Op);
 
+  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+    return LowerSIGN_EXTEND_AVX512(Op, DAG);
+
   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
-      (VT != MVT::v8i32 || InVT != MVT::v8i16))
+      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
+      (VT != MVT::v16i16 || InVT != MVT::v16i8))
     return SDValue();
 
   if (Subtarget->hasInt256())
@@ -10502,7 +10774,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   // Get the inputs.
   SDValue Chain = Op.getOperand(0);
   SDValue Size  = Op.getOperand(1);
-  // FIXME: Ensure alignment here
+  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  EVT VT = Op.getNode()->getValueType(0);
 
   bool Is64Bit = Subtarget->is64Bit();
   EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
@@ -10540,14 +10813,20 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
-    Flag = Chain.getValue(1);
 
     const X86RegisterInfo *RegInfo =
       static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
-    Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
-                               SPTy).getValue(1);
+    unsigned SPReg = RegInfo->getStackRegister();
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
+    Chain = SP.getValue(1);
 
-    SDValue Ops1[2] = { Chain.getValue(0), Chain };
+    if (Align) {
+      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                       DAG.getConstant(-(uint64_t)Align, VT));
+      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
+    }
+
+    SDValue Ops1[2] = { SP, Chain };
     return DAG.getMergeValues(Ops1, 2, dl);
   }
 }
@@ -10698,6 +10977,26 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
 }
 
+// getTargetVShiftByConstNode - Handle vector element shifts where the shift
+// amount is a constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
+                                          SDValue SrcOp, uint64_t ShiftAmt,
+                                          SelectionDAG &DAG) {
+
+  // Check for ShiftAmt >= element width
+  if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) {
+    if (Opc == X86ISD::VSRAI)
+      ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1;
+    else
+      return DAG.getConstant(0, VT);
+  }
+
+  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
+         && "Unknown target vector shift-by-constant node");
+
+  return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
+}
+
 // getTargetVShiftNode - Handle vector element shifts where the shift amount
 // may or may not be a constant. Takes immediate version of shift as input.
 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
@@ -10705,18 +11004,10 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
                                    SelectionDAG &DAG) {
   assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
 
-  if (isa<ConstantSDNode>(ShAmt)) {
-    // Constant may be a TargetConstant. Use a regular constant.
-    uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
-    switch (Opc) {
-      default: llvm_unreachable("Unknown target vector shift node");
-      case X86ISD::VSHLI:
-      case X86ISD::VSRLI:
-      case X86ISD::VSRAI:
-        return DAG.getNode(Opc, dl, VT, SrcOp,
-                           DAG.getConstant(ShiftAmt, MVT::i32));
-    }
-  }
+  // Catch shift-by-constant.
+  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
+    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
+                                      CShAmt->getZExtValue(), DAG);
 
   // Change opcode to non-immediate version
   switch (Opc) {
@@ -10919,24 +11210,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   case Intrinsic::x86_avx2_pmaxu_b:
   case Intrinsic::x86_avx2_pmaxu_w:
   case Intrinsic::x86_avx2_pmaxu_d:
+  case Intrinsic::x86_avx512_pmaxu_d:
+  case Intrinsic::x86_avx512_pmaxu_q:
   case Intrinsic::x86_sse2_pminu_b:
   case Intrinsic::x86_sse41_pminuw:
   case Intrinsic::x86_sse41_pminud:
   case Intrinsic::x86_avx2_pminu_b:
   case Intrinsic::x86_avx2_pminu_w:
   case Intrinsic::x86_avx2_pminu_d:
+  case Intrinsic::x86_avx512_pminu_d:
+  case Intrinsic::x86_avx512_pminu_q:
   case Intrinsic::x86_sse41_pmaxsb:
   case Intrinsic::x86_sse2_pmaxs_w:
   case Intrinsic::x86_sse41_pmaxsd:
   case Intrinsic::x86_avx2_pmaxs_b:
   case Intrinsic::x86_avx2_pmaxs_w:
   case Intrinsic::x86_avx2_pmaxs_d:
+  case Intrinsic::x86_avx512_pmaxs_d:
+  case Intrinsic::x86_avx512_pmaxs_q:
   case Intrinsic::x86_sse41_pminsb:
   case Intrinsic::x86_sse2_pmins_w:
   case Intrinsic::x86_sse41_pminsd:
   case Intrinsic::x86_avx2_pmins_b:
   case Intrinsic::x86_avx2_pmins_w:
-  case Intrinsic::x86_avx2_pmins_d: {
+  case Intrinsic::x86_avx2_pmins_d: 
+  case Intrinsic::x86_avx512_pmins_d:
+  case Intrinsic::x86_avx512_pmins_q: {
     unsigned Opcode;
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -10946,6 +11245,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pmaxu_b:
     case Intrinsic::x86_avx2_pmaxu_w:
     case Intrinsic::x86_avx2_pmaxu_d:
+    case Intrinsic::x86_avx512_pmaxu_d:
+    case Intrinsic::x86_avx512_pmaxu_q:
       Opcode = X86ISD::UMAX;
       break;
     case Intrinsic::x86_sse2_pminu_b:
@@ -10954,6 +11255,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pminu_b:
     case Intrinsic::x86_avx2_pminu_w:
     case Intrinsic::x86_avx2_pminu_d:
+    case Intrinsic::x86_avx512_pminu_d:
+    case Intrinsic::x86_avx512_pminu_q:
       Opcode = X86ISD::UMIN;
       break;
     case Intrinsic::x86_sse41_pmaxsb:
@@ -10962,6 +11265,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pmaxs_b:
     case Intrinsic::x86_avx2_pmaxs_w:
     case Intrinsic::x86_avx2_pmaxs_d:
+    case Intrinsic::x86_avx512_pmaxs_d:
+    case Intrinsic::x86_avx512_pmaxs_q:
       Opcode = X86ISD::SMAX;
       break;
     case Intrinsic::x86_sse41_pminsb:
@@ -10970,6 +11275,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pmins_b:
     case Intrinsic::x86_avx2_pmins_w:
     case Intrinsic::x86_avx2_pmins_d:
+    case Intrinsic::x86_avx512_pmins_d:
+    case Intrinsic::x86_avx512_pmins_q:
       Opcode = X86ISD::SMIN;
       break;
     }
@@ -10982,10 +11289,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   case Intrinsic::x86_sse2_max_pd:
   case Intrinsic::x86_avx_max_ps_256:
   case Intrinsic::x86_avx_max_pd_256:
+  case Intrinsic::x86_avx512_max_ps_512:
+  case Intrinsic::x86_avx512_max_pd_512:
   case Intrinsic::x86_sse_min_ps:
   case Intrinsic::x86_sse2_min_pd:
   case Intrinsic::x86_avx_min_ps_256:
-  case Intrinsic::x86_avx_min_pd_256: {
+  case Intrinsic::x86_avx_min_pd_256:
+  case Intrinsic::x86_avx512_min_ps_512:
+  case Intrinsic::x86_avx512_min_pd_512:  {
     unsigned Opcode;
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -10993,12 +11304,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_sse2_max_pd:
     case Intrinsic::x86_avx_max_ps_256:
     case Intrinsic::x86_avx_max_pd_256:
+    case Intrinsic::x86_avx512_max_ps_512:
+    case Intrinsic::x86_avx512_max_pd_512:
       Opcode = X86ISD::FMAX;
       break;
     case Intrinsic::x86_sse_min_ps:
     case Intrinsic::x86_sse2_min_pd:
     case Intrinsic::x86_avx_min_ps_256:
     case Intrinsic::x86_avx_min_pd_256:
+    case Intrinsic::x86_avx512_min_ps_512:
+    case Intrinsic::x86_avx512_min_pd_512:
       Opcode = X86ISD::FMIN;
       break;
     }
@@ -11069,7 +11384,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   case Intrinsic::x86_avx2_permd:
   case Intrinsic::x86_avx2_permps:
     // Operands intentionally swapped. Mask is last operand to intrinsic,
-    // but second operand for node/intruction.
+    // but second operand for node/instruction.
     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                        Op.getOperand(2), Op.getOperand(1));
 
@@ -11144,6 +11459,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
+  case Intrinsic::x86_avx512_kortestz:
+  case Intrinsic::x86_avx512_kortestc: {
+    unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B;
+    SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
+    SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
+    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
 
   // SSE/AVX shift intrinsics
   case Intrinsic::x86_sse2_psll_w:
@@ -11338,7 +11663,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   case Intrinsic::x86_fma_vfmaddsub_ps_256:
   case Intrinsic::x86_fma_vfmaddsub_pd_256:
   case Intrinsic::x86_fma_vfmsubadd_ps_256:
-  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+  case Intrinsic::x86_fma_vfmsubadd_pd_256:
+  case Intrinsic::x86_fma_vfmadd_ps_512:
+  case Intrinsic::x86_fma_vfmadd_pd_512:
+  case Intrinsic::x86_fma_vfmsub_ps_512:
+  case Intrinsic::x86_fma_vfmsub_pd_512:
+  case Intrinsic::x86_fma_vfnmadd_ps_512:
+  case Intrinsic::x86_fma_vfnmadd_pd_512:
+  case Intrinsic::x86_fma_vfnmsub_ps_512:
+  case Intrinsic::x86_fma_vfnmsub_pd_512:
+  case Intrinsic::x86_fma_vfmaddsub_ps_512:
+  case Intrinsic::x86_fma_vfmaddsub_pd_512:
+  case Intrinsic::x86_fma_vfmsubadd_ps_512:
+  case Intrinsic::x86_fma_vfmsubadd_pd_512: { 
     unsigned Opc;
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -11346,36 +11683,48 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_fma_vfmadd_pd:
     case Intrinsic::x86_fma_vfmadd_ps_256:
     case Intrinsic::x86_fma_vfmadd_pd_256:
+    case Intrinsic::x86_fma_vfmadd_ps_512:
+    case Intrinsic::x86_fma_vfmadd_pd_512:
       Opc = X86ISD::FMADD;
       break;
     case Intrinsic::x86_fma_vfmsub_ps:
     case Intrinsic::x86_fma_vfmsub_pd:
     case Intrinsic::x86_fma_vfmsub_ps_256:
     case Intrinsic::x86_fma_vfmsub_pd_256:
+    case Intrinsic::x86_fma_vfmsub_ps_512:
+    case Intrinsic::x86_fma_vfmsub_pd_512:
       Opc = X86ISD::FMSUB;
       break;
     case Intrinsic::x86_fma_vfnmadd_ps:
     case Intrinsic::x86_fma_vfnmadd_pd:
     case Intrinsic::x86_fma_vfnmadd_ps_256:
     case Intrinsic::x86_fma_vfnmadd_pd_256:
+    case Intrinsic::x86_fma_vfnmadd_ps_512:
+    case Intrinsic::x86_fma_vfnmadd_pd_512:
       Opc = X86ISD::FNMADD;
       break;
     case Intrinsic::x86_fma_vfnmsub_ps:
     case Intrinsic::x86_fma_vfnmsub_pd:
     case Intrinsic::x86_fma_vfnmsub_ps_256:
     case Intrinsic::x86_fma_vfnmsub_pd_256:
+    case Intrinsic::x86_fma_vfnmsub_ps_512:
+    case Intrinsic::x86_fma_vfnmsub_pd_512:
       Opc = X86ISD::FNMSUB;
       break;
     case Intrinsic::x86_fma_vfmaddsub_ps:
     case Intrinsic::x86_fma_vfmaddsub_pd:
     case Intrinsic::x86_fma_vfmaddsub_ps_256:
     case Intrinsic::x86_fma_vfmaddsub_pd_256:
+    case Intrinsic::x86_fma_vfmaddsub_ps_512:
+    case Intrinsic::x86_fma_vfmaddsub_pd_512:
       Opc = X86ISD::FMADDSUB;
       break;
     case Intrinsic::x86_fma_vfmsubadd_ps:
     case Intrinsic::x86_fma_vfmsubadd_pd:
     case Intrinsic::x86_fma_vfmsubadd_ps_256:
     case Intrinsic::x86_fma_vfmsubadd_pd_256:
+    case Intrinsic::x86_fma_vfmsubadd_ps_512:
+    case Intrinsic::x86_fma_vfmsubadd_pd_512:
       Opc = X86ISD::FMSUBADD;
       break;
     }
@@ -11386,7 +11735,87 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   }
 }
 
-static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
+static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                             SDValue Base, SDValue Index,
+                             SDValue ScaleOp, SDValue Chain,
+                             const X86Subtarget * Subtarget) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 
+  EVT MaskVT = MVT::getVectorVT(MVT::i1, 
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+}
+
+static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                              SDValue Src, SDValue Mask, SDValue Base,
+                              SDValue Index, SDValue ScaleOp, SDValue Chain,
+                              const X86Subtarget * Subtarget) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  if (Src.getOpcode() == ISD::UNDEF)
+    Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 
+  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+}
+
+static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                              SDValue Src, SDValue Base, SDValue Index,
+                              SDValue ScaleOp, SDValue Chain) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  return SDValue(Res, 1);
+}
+
+static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                               SDValue Src, SDValue Mask, SDValue Base,
+                               SDValue Index, SDValue ScaleOp, SDValue Chain) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  return SDValue(Res, 1);
+}
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
   SDLoc dl(Op);
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   switch (IntNo) {
@@ -11421,7 +11850,144 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                        SDValue(Result.getNode(), 2));
   }
-
+  //int_gather(index, base, scale);
+  case Intrinsic::x86_avx512_gather_qpd_512:
+  case Intrinsic::x86_avx512_gather_qps_512:
+  case Intrinsic::x86_avx512_gather_dpd_512:
+  case Intrinsic::x86_avx512_gather_qpi_512:
+  case Intrinsic::x86_avx512_gather_qpq_512:
+  case Intrinsic::x86_avx512_gather_dpq_512:
+  case Intrinsic::x86_avx512_gather_dps_512:
+  case Intrinsic::x86_avx512_gather_dpi_512: {
+    unsigned Opc;
+    switch (IntNo) {
+      default: llvm_unreachable("Unexpected intrinsic!");
+      case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
+      case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
+      case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
+      case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
+      case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
+      case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
+      case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
+      case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Index = Op.getOperand(2);
+    SDValue Base  = Op.getOperand(3);
+    SDValue Scale = Op.getOperand(4);
+    return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
+  }
+  //int_gather_mask(v1, mask, index, base, scale);
+  case Intrinsic::x86_avx512_gather_qps_mask_512:
+  case Intrinsic::x86_avx512_gather_qpd_mask_512:
+  case Intrinsic::x86_avx512_gather_dpd_mask_512:
+  case Intrinsic::x86_avx512_gather_dps_mask_512:
+  case Intrinsic::x86_avx512_gather_qpi_mask_512:
+  case Intrinsic::x86_avx512_gather_qpq_mask_512:
+  case Intrinsic::x86_avx512_gather_dpi_mask_512:
+  case Intrinsic::x86_avx512_gather_dpq_mask_512: {
+    unsigned Opc;
+    switch (IntNo) {
+      default: llvm_unreachable("Unexpected intrinsic!");
+      case Intrinsic::x86_avx512_gather_qps_mask_512: 
+        Opc = X86::VGATHERQPSZrm; break;
+      case Intrinsic::x86_avx512_gather_qpd_mask_512:
+        Opc = X86::VGATHERQPDZrm; break;
+      case Intrinsic::x86_avx512_gather_dpd_mask_512:
+        Opc = X86::VGATHERDPDZrm; break;
+      case Intrinsic::x86_avx512_gather_dps_mask_512:
+        Opc = X86::VGATHERDPSZrm; break;
+      case Intrinsic::x86_avx512_gather_qpi_mask_512:
+        Opc = X86::VPGATHERQDZrm; break;
+      case Intrinsic::x86_avx512_gather_qpq_mask_512:
+        Opc = X86::VPGATHERQQZrm; break;
+      case Intrinsic::x86_avx512_gather_dpi_mask_512:
+        Opc = X86::VPGATHERDDZrm; break;
+      case Intrinsic::x86_avx512_gather_dpq_mask_512:
+        Opc = X86::VPGATHERDQZrm; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Src   = Op.getOperand(2);
+    SDValue Mask  = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Base  = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+                          Subtarget);
+  }
+  //int_scatter(base, index, v1, scale);
+  case Intrinsic::x86_avx512_scatter_qpd_512:
+  case Intrinsic::x86_avx512_scatter_qps_512:
+  case Intrinsic::x86_avx512_scatter_dpd_512:
+  case Intrinsic::x86_avx512_scatter_qpi_512:
+  case Intrinsic::x86_avx512_scatter_qpq_512:
+  case Intrinsic::x86_avx512_scatter_dpq_512:
+  case Intrinsic::x86_avx512_scatter_dps_512:
+  case Intrinsic::x86_avx512_scatter_dpi_512: {
+    unsigned Opc;
+    switch (IntNo) {
+      default: llvm_unreachable("Unexpected intrinsic!");
+      case Intrinsic::x86_avx512_scatter_qpd_512: 
+        Opc = X86::VSCATTERQPDZmr; break;
+      case Intrinsic::x86_avx512_scatter_qps_512:
+        Opc = X86::VSCATTERQPSZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpd_512:
+        Opc = X86::VSCATTERDPDZmr; break;
+      case Intrinsic::x86_avx512_scatter_dps_512:
+        Opc = X86::VSCATTERDPSZmr; break;
+      case Intrinsic::x86_avx512_scatter_qpi_512:
+        Opc = X86::VPSCATTERQDZmr; break;
+      case Intrinsic::x86_avx512_scatter_qpq_512:
+        Opc = X86::VPSCATTERQQZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpq_512:
+        Opc = X86::VPSCATTERDQZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpi_512:
+        Opc = X86::VPSCATTERDDZmr; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Base  = Op.getOperand(2);
+    SDValue Index = Op.getOperand(3);
+    SDValue Src   = Op.getOperand(4);
+    SDValue Scale = Op.getOperand(5);
+    return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
+  }
+  //int_scatter_mask(base, mask, index, v1, scale);
+  case Intrinsic::x86_avx512_scatter_qps_mask_512:
+  case Intrinsic::x86_avx512_scatter_qpd_mask_512:
+  case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+  case Intrinsic::x86_avx512_scatter_dps_mask_512:
+  case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+  case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+  case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+  case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
+    unsigned Opc;
+    switch (IntNo) {
+      default: llvm_unreachable("Unexpected intrinsic!");
+      case Intrinsic::x86_avx512_scatter_qpd_mask_512: 
+        Opc = X86::VSCATTERQPDZmr; break;
+      case Intrinsic::x86_avx512_scatter_qps_mask_512:
+        Opc = X86::VSCATTERQPSZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+        Opc = X86::VSCATTERDPDZmr; break;
+      case Intrinsic::x86_avx512_scatter_dps_mask_512:
+        Opc = X86::VSCATTERDPSZmr; break;
+      case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+        Opc = X86::VPSCATTERQDZmr; break;
+      case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+        Opc = X86::VPSCATTERQQZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpq_mask_512:
+        Opc = X86::VPSCATTERDQZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+        Opc = X86::VPSCATTERDDZmr; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Base  = Op.getOperand(2);
+    SDValue Mask  = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Src   = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+  }
   // XTEST intrinsics.
   case Intrinsic::x86_xtest: {
     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
@@ -11913,8 +12479,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
   }
 
-  assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
-         "Only know how to lower V2I64/V4I64 multiply");
+  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
+         "Only know how to lower V2I64/V4I64/V8I64 multiply");
 
   //  Ahi = psrlqi(a, 32);
   //  Bhi = psrlqi(b, 32);
@@ -11927,13 +12493,12 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   //  AhiBlo = psllqi(AhiBlo, 32);
   //  return AloBlo + AloBhi + AhiBlo;
 
-  SDValue ShAmt = DAG.getConstant(32, MVT::i32);
-
-  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
-  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
+  SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
+  SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
 
   // Bit cast to 32-bit vectors for MULUDQ
-  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
+  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
+                                  (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
@@ -11943,14 +12508,14 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
 
-  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
-  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
+  AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
+  AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
 
   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
 }
 
-SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   EVT EltTy = VT.getVectorElementType();
   unsigned NumElts = VT.getVectorNumElements();
@@ -11972,16 +12537,26 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
 
   if ((SplatValue != 0) &&
       (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
-    unsigned lg2 = SplatValue.countTrailingZeros();
+    unsigned Lg2 = SplatValue.countTrailingZeros();
     // Splat the sign bit.
-    SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32);
-    SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG);
+    SmallVector<SDValue, 16> Sz(NumElts,
+                                DAG.getConstant(EltTy.getSizeInBits() - 1,
+                                                EltTy));
+    SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
+                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
+                                          NumElts));
     // Add (N0 < 0) ? abs2 - 1 : 0;
-    SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32);
-    SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG);
+    SmallVector<SDValue, 16> Amt(NumElts,
+                                 DAG.getConstant(EltTy.getSizeInBits() - Lg2,
+                                                 EltTy));
+    SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
+                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
+                                          NumElts));
     SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
-    SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32);
-    SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG);
+    SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy));
+    SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
+                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
+                                          NumElts));
 
     // If we're dividing by a positive value, we're done.  Otherwise, we must
     // negate the result.
@@ -12010,23 +12585,26 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
 
       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
           (Subtarget->hasInt256() &&
-           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+          (Subtarget->hasAVX512() &&
+           (VT == MVT::v8i64 || VT == MVT::v16i32))) {
         if (Op.getOpcode() == ISD::SHL)
-          return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
-                             DAG.getConstant(ShiftAmt, MVT::i32));
+          return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+                                            DAG);
         if (Op.getOpcode() == ISD::SRL)
-          return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
-                             DAG.getConstant(ShiftAmt, MVT::i32));
+          return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+                                            DAG);
         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
-          return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
-                             DAG.getConstant(ShiftAmt, MVT::i32));
+          return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+                                            DAG);
       }
 
       if (VT == MVT::v16i8) {
         if (Op.getOpcode() == ISD::SHL) {
           // Make a large shift.
-          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
-                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+                                                   MVT::v8i16, R, ShiftAmt,
+                                                   DAG); 
           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
           // Zero out the rightmost bits.
           SmallVector<SDValue, 16> V(16,
@@ -12037,8 +12615,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
         }
         if (Op.getOpcode() == ISD::SRL) {
           // Make a large shift.
-          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
-                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+                                                   MVT::v8i16, R, ShiftAmt,
+                                                   DAG);
           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
           // Zero out the leftmost bits.
           SmallVector<SDValue, 16> V(16,
@@ -12069,8 +12648,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
         if (Op.getOpcode() == ISD::SHL) {
           // Make a large shift.
-          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
-                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+                                                   MVT::v16i16, R, ShiftAmt,
+                                                   DAG);
           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
           // Zero out the rightmost bits.
           SmallVector<SDValue, 32> V(32,
@@ -12081,8 +12661,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
         }
         if (Op.getOpcode() == ISD::SRL) {
           // Make a large shift.
-          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
-                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+                                                   MVT::v16i16, R, ShiftAmt,
+                                                   DAG);
           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
           // Zero out the leftmost bits.
           SmallVector<SDValue, 32> V(32,
@@ -12147,14 +12728,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
     default:
       llvm_unreachable("Unknown shift opcode!");
     case ISD::SHL:
-      return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
-                         DAG.getConstant(ShiftAmt, MVT::i32));
+      return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+                                        DAG);
     case ISD::SRL:
-      return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
-                         DAG.getConstant(ShiftAmt, MVT::i32));
+      return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+                                        DAG);
     case ISD::SRA:
-      return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
-                         DAG.getConstant(ShiftAmt, MVT::i32));
+      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+                                        DAG);
     }
   }
 
@@ -12172,7 +12753,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
       VT == MVT::v4i32 || VT == MVT::v8i16 ||
       (Subtarget->hasInt256() &&
        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
-        VT == MVT::v8i32 || VT == MVT::v16i16))) {
+        VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+       (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
     SDValue BaseShAmt;
     EVT EltVT = VT.getVectorElementType();
 
@@ -12240,6 +12822,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
         case MVT::v4i64:
         case MVT::v8i32:
         case MVT::v16i16:
+        case MVT::v16i32:
+        case MVT::v8i64:
           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
         }
       case ISD::SRA:
@@ -12249,6 +12833,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
         case MVT::v8i16:
         case MVT::v8i32:
         case MVT::v16i16:
+        case MVT::v16i32:
+        case MVT::v8i64:
           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
         }
       case ISD::SRL:
@@ -12260,6 +12846,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
         case MVT::v4i64:
         case MVT::v8i32:
         case MVT::v16i16:
+        case MVT::v16i32:
+        case MVT::v8i64:
           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
         }
       }
@@ -12268,7 +12856,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
 
   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   if (!Subtarget->is64Bit() &&
-      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
+      (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
       Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
     Amt = Amt.getOperand(0);
@@ -12297,7 +12886,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
-SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
+                          SelectionDAG &DAG) {
 
   EVT VT = Op.getValueType();
   SDLoc dl(Op);
@@ -12316,6 +12906,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
   if (V.getNode())
       return V;
 
+  if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
+    return Op;
   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
   if (Subtarget->hasInt256()) {
     if (Op.getOpcode() == ISD::SRL &&
@@ -12356,8 +12948,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
 
     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
-    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
-                            DAG.getConstant(4, MVT::i32), DAG);
+    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
 
@@ -12368,8 +12959,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
 
     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
-    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
-                            DAG.getConstant(2, MVT::i32), DAG);
+    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
 
@@ -12512,7 +13102,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
 
   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
                       ExtraVT.getScalarType().getSizeInBits();
-  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
 
   switch (VT.getSimpleVT().SimpleTy) {
     default: return SDValue();
@@ -12546,24 +13135,34 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
       // fall through
     case MVT::v4i32:
     case MVT::v8i16: {
-      // (sext (vzext x)) -> (vsext x)
       SDValue Op0 = Op.getOperand(0);
       SDValue Op00 = Op0.getOperand(0);
       SDValue Tmp1;
       // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
       if (Op0.getOpcode() == ISD::BITCAST &&
-          Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
-        Tmp1 = LowerVectorIntExtend(Op00, DAG);
-      if (Tmp1.getNode()) {
-        SDValue Tmp1Op0 = Tmp1.getOperand(0);
-        assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
-               "This optimization is invalid without a VZEXT.");
-        return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+          Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
+        // (sext (vzext x)) -> (vsext x)
+        Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
+        if (Tmp1.getNode()) {
+          EVT ExtraEltVT = ExtraVT.getVectorElementType();
+          // This folding is only valid when the in-reg type is a vector of i8,
+          // i16, or i32.
+          if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
+              ExtraEltVT == MVT::i32) {
+            SDValue Tmp1Op0 = Tmp1.getOperand(0);
+            assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
+                   "This optimization is invalid without a VZEXT.");
+            return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+          }
+          Op0 = Tmp1;
+        }
       }
 
       // If the above didn't work, then just use Shift-Left + Shift-Right.
-      Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG);
-      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
+      Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
+                                        DAG);
+      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
+                                        DAG);
     }
   }
 }
@@ -12655,9 +13254,10 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
   return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
 }
 
-SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
-  EVT SrcVT = Op.getOperand(0).getValueType();
-  EVT DstVT = Op.getValueType();
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
+                            SelectionDAG &DAG) {
+  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+  MVT DstVT = Op.getSimpleValueType();
   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
          Subtarget->hasMMX() && "Unexpected custom BITCAST");
   assert((DstVT == MVT::i64 ||
@@ -12742,7 +13342,8 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
                      Op.getOperand(1), Op.getOperand(2));
 }
 
-SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
+                            SelectionDAG &DAG) {
   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
 
   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
@@ -12753,8 +13354,8 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   EVT ArgVT = Arg.getValueType();
   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
 
-  ArgListTy Args;
-  ArgListEntry Entry;
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
 
   Entry.Node = Arg;
   Entry.Ty = ArgTy;
@@ -12767,7 +13368,8 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   // the small struct {f32, f32} is returned in (eax, edx). For f64,
   // the results are returned via SRet in memory.
   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
-  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
 
   Type *RetTy = isF64
     ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
@@ -12778,7 +13380,7 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
                          CallingConv::C, /*isTaillCall=*/false,
                          /*doesNotRet=*/false, /*isReturnValueUsed*/true,
                          Callee, Args, DAG, dl);
-  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
 
   if (isF64)
     // Returned in xmm0 and xmm1.
@@ -12822,9 +13424,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
-  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, DAG);
-  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, DAG);
-  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, DAG);
+  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
+  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
+  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
@@ -12840,7 +13442,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
-  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   case ISD::FRAME_TO_ARGS_OFFSET:
@@ -12858,7 +13461,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   case ISD::SRA:
   case ISD::SRL:
-  case ISD::SHL:                return LowerShift(Op, DAG);
+  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
@@ -12866,7 +13469,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SMULO:
   case ISD::UMULO:              return LowerXALUO(Op, DAG);
   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
-  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
+  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
   case ISD::ADDC:
   case ISD::ADDE:
   case ISD::SUBC:
@@ -12874,7 +13477,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADD:                return LowerADD(Op, DAG);
   case ISD::SUB:                return LowerSUB(Op, DAG);
   case ISD::SDIV:               return LowerSDIV(Op, DAG);
-  case ISD::FSINCOS:            return LowerFSINCOS(Op, DAG);
+  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   }
 }
 
@@ -13128,6 +13731,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::CMP:                return "X86ISD::CMP";
   case X86ISD::COMI:               return "X86ISD::COMI";
   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
+  case X86ISD::CMPM:               return "X86ISD::CMPM";
+  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
   case X86ISD::SETCC:              return "X86ISD::SETCC";
   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
@@ -13187,6 +13792,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
+  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
+  case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
+  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
@@ -13200,6 +13808,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::CMPP:               return "X86ISD::CMPP";
   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
+  case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
+  case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
   case X86ISD::ADD:                return "X86ISD::ADD";
   case X86ISD::SUB:                return "X86ISD::SUB";
   case X86ISD::ADC:                return "X86ISD::ADC";
@@ -13214,9 +13824,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::BLSI:               return "X86ISD::BLSI";
   case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
   case X86ISD::BLSR:               return "X86ISD::BLSR";
+  case X86ISD::BZHI:               return "X86ISD::BZHI";
+  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   case X86ISD::PTEST:              return "X86ISD::PTEST";
   case X86ISD::TESTP:              return "X86ISD::TESTP";
+  case X86ISD::TESTM:              return "X86ISD::TESTM";
+  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
+  case X86ISD::KTEST:              return "X86ISD::KTEST";
   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
@@ -13239,6 +13854,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
+  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
@@ -13422,37 +14038,46 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
 bool
 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                       EVT VT) const {
+  if (!VT.isSimple())
+    return false;
+
+  MVT SVT = VT.getSimpleVT();
+
   // Very little shuffling can be done for 64-bit vectors right now.
   if (VT.getSizeInBits() == 64)
     return false;
 
   // FIXME: pshufb, blends, shifts.
-  return (VT.getVectorNumElements() == 2 ||
+  return (SVT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
-          isMOVLMask(M, VT) ||
-          isSHUFPMask(M, VT, Subtarget->hasFp256()) ||
-          isPSHUFDMask(M, VT) ||
-          isPSHUFHWMask(M, VT, Subtarget->hasInt256()) ||
-          isPSHUFLWMask(M, VT, Subtarget->hasInt256()) ||
-          isPALIGNRMask(M, VT, Subtarget) ||
-          isUNPCKLMask(M, VT, Subtarget->hasInt256()) ||
-          isUNPCKHMask(M, VT, Subtarget->hasInt256()) ||
-          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) ||
-          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256()));
+          isMOVLMask(M, SVT) ||
+          isSHUFPMask(M, SVT) ||
+          isPSHUFDMask(M, SVT) ||
+          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
+          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
+          isPALIGNRMask(M, SVT, Subtarget) ||
+          isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
 }
 
 bool
 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                           EVT VT) const {
-  unsigned NumElts = VT.getVectorNumElements();
+  if (!VT.isSimple())
+    return false;
+
+  MVT SVT = VT.getSimpleVT();
+  unsigned NumElts = SVT.getVectorNumElements();
   // FIXME: This collection of masks seems suspect.
   if (NumElts == 2)
     return true;
-  if (NumElts == 4 && VT.is128BitVector()) {
-    return (isMOVLMask(Mask, VT)  ||
-            isCommutedMOVLMask(Mask, VT, true) ||
-            isSHUFPMask(Mask, VT, Subtarget->hasFp256()) ||
-            isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true));
+  if (NumElts == 4 && SVT.is128BitVector()) {
+    return (isMOVLMask(Mask, SVT)  ||
+            isCommutedMOVLMask(Mask, SVT, true) ||
+            isSHUFPMask(Mask, SVT) ||
+            isSHUFPMask(Mask, SVT, /* Commuted */ true));
   }
   return false;
 }
@@ -15194,6 +15819,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::CMOV_V8F32:
   case X86::CMOV_V4F64:
   case X86::CMOV_V4I64:
+  case X86::CMOV_V16F32:
+  case X86::CMOV_V8F64:
+  case X86::CMOV_V8I64:
   case X86::CMOV_GR16:
   case X86::CMOV_GR32:
   case X86::CMOV_RFP32:
@@ -15642,7 +16270,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
 
-  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
 }
 
 /// PerformTruncateCombine - Converts truncate operation to
@@ -15749,6 +16377,44 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                      EltNo);
 }
 
+/// Extract one bit from mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) {
+  SDValue Vec = N->getOperand(0);
+  SDLoc dl(Vec);
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = N->getOperand(1);
+  MVT EltVT = N->getSimpleValueType(0);
+  
+  assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) ||
+         "Unexpected operands in ExtractBitFromMaskVector");
+
+  // variable index
+  if (!isa<ConstantSDNode>(Idx)) {
+    MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
+    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              ExtVT.getVectorElementType(), Ext);
+    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+  MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits());
+  unsigned MaxShift = VecVT.getSizeInBits() - 1;
+  Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec);
+  Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec, 
+              DAG.getConstant(MaxShift - IdxVal, ScalarVT));
+  Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec,
+    DAG.getConstant(MaxShift, ScalarVT));
+
+  if (VecVT == MVT::v16i1) {
+    Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec);
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec);
+  }
+  return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec);
+}
+
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
 /// to a simple store and scalar loads to extract the elements.
@@ -15759,6 +16425,11 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     return NewOp;
 
   SDValue InputVector = N->getOperand(0);
+
+  if (InputVector.getValueType().getVectorElementType() == MVT::i1 &&
+      !DCI.isBeforeLegalize())
+    return ExtractBitFromMaskVector(N, DAG);
+
   // Detect whether we are trying to convert from mmx to i32 and the bitcast
   // from mmx to v2i32 has a single usage.
   if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
@@ -15846,24 +16517,28 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
 }
 
 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
-static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
-                                   SDValue RHS, SelectionDAG &DAG,
-                                   const X86Subtarget *Subtarget) {
+static std::pair<unsigned, bool>
+matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
+                   SelectionDAG &DAG, const X86Subtarget *Subtarget) {
   if (!VT.isVector())
-    return 0;
+    return std::make_pair(0, false);
 
+  bool NeedSplit = false;
   switch (VT.getSimpleVT().SimpleTy) {
-  default: return 0;
+  default: return std::make_pair(0, false);
   case MVT::v32i8:
   case MVT::v16i16:
   case MVT::v8i32:
     if (!Subtarget->hasAVX2())
-      return 0;
+      NeedSplit = true;
+    if (!Subtarget->hasAVX())
+      return std::make_pair(0, false);
+    break;
   case MVT::v16i8:
   case MVT::v8i16:
   case MVT::v4i32:
     if (!Subtarget->hasSSE2())
-      return 0;
+      return std::make_pair(0, false);
   }
 
   // SSE2 has only a small subset of the operations.
@@ -15874,6 +16549,7 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
 
   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
 
+  unsigned Opc = 0;
   // Check for x CC y ? x : y.
   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
@@ -15881,16 +16557,16 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
     default: break;
     case ISD::SETULT:
     case ISD::SETULE:
-      return hasUnsigned ? X86ISD::UMIN : 0;
+      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
     case ISD::SETUGT:
     case ISD::SETUGE:
-      return hasUnsigned ? X86ISD::UMAX : 0;
+      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
     case ISD::SETLT:
     case ISD::SETLE:
-      return hasSigned ? X86ISD::SMIN : 0;
+      Opc = hasSigned ? X86ISD::SMIN : 0; break;
     case ISD::SETGT:
     case ISD::SETGE:
-      return hasSigned ? X86ISD::SMAX : 0;
+      Opc = hasSigned ? X86ISD::SMAX : 0; break;
     }
   // Check for x CC y ? y : x -- a min/max with reversed arms.
   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
@@ -15899,20 +16575,20 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
     default: break;
     case ISD::SETULT:
     case ISD::SETULE:
-      return hasUnsigned ? X86ISD::UMAX : 0;
+      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
     case ISD::SETUGT:
     case ISD::SETUGE:
-      return hasUnsigned ? X86ISD::UMIN : 0;
+      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
     case ISD::SETLT:
     case ISD::SETLE:
-      return hasSigned ? X86ISD::SMAX : 0;
+      Opc = hasSigned ? X86ISD::SMAX : 0; break;
     case ISD::SETGT:
     case ISD::SETGE:
-      return hasSigned ? X86ISD::SMIN : 0;
+      Opc = hasSigned ? X86ISD::SMIN : 0; break;
     }
   }
 
-  return 0;
+  return std::make_pair(Opc, NeedSplit);
 }
 
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
@@ -15926,13 +16602,14 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
   EVT VT = LHS.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
   // instructions match the semantics of the common C idiom x<y?x:y but not
   // x<=y?x:y, because of how they handle negative zero (which can be
   // ignored in unsafe-math mode).
   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+      VT != MVT::f80 && TLI.isTypeLegal(VT) &&
       (Subtarget->hasSSE2() ||
        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -16071,6 +16748,22 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   }
 
+  EVT CondVT = Cond.getValueType();
+  if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
+      CondVT.getVectorElementType() == MVT::i1) {
+    // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+    // lowering on AVX-512. In this case we convert it to
+    // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
+    // The same situation for all 128 and 256-bit vectors of i8 and i16
+    EVT OpVT = LHS.getValueType();
+    if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
+        (OpVT.getVectorElementType() == MVT::i8 ||
+         OpVT.getVectorElementType() == MVT::i16)) {
+      Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
+      DCI.AddToWorklist(Cond.getNode());
+      return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
+    }
+  }
   // If this is a select between two integer constants, try to do some
   // optimizations.
   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
@@ -16195,9 +16888,12 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Early exit check
+  if (!TLI.isTypeLegal(VT))
+    return SDValue();
+
   // Match VSELECTs into subs with unsigned saturation.
-  if (!DCI.isBeforeLegalize() &&
-      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
@@ -16251,14 +16947,35 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   }
 
   // Try to match a min/max vector operation.
-  if (!DCI.isBeforeLegalize() &&
-      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
-    if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
-      return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
+    std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
+    unsigned Opc = ret.first;
+    bool NeedSplit = ret.second;
+
+    if (Opc && NeedSplit) {
+      unsigned NumElems = VT.getVectorNumElements();
+      // Extract the LHS vectors
+      SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
+      SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
+
+      // Extract the RHS vectors
+      SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
+      SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
+
+      // Create min/max for each subvector
+      LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
+      RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
+
+      // Merge the result
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
+    } else if (Opc)
+      return DAG.getNode(Opc, DL, VT, LHS, RHS);
+  }
 
   // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
-  if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
-      Cond.getOpcode() == ISD::SETCC) {
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      // Check if SETCC has already been promoted
+      TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) {
 
     assert(Cond.getValueType().isVector() &&
            "vector select expects a vector selector!");
@@ -16305,7 +17022,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   // matched by one of the SSE/AVX BLEND instructions. These instructions only
   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
   // to simplify previous instructions.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
       !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
@@ -16314,6 +17030,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     if (BitWidth == 1)
       return SDValue();
 
+    // Check all uses of that condition operand to check whether it will be
+    // consumed by non-BLEND instructions, which may depend on all bits are set
+    // properly.
+    for (SDNode::use_iterator I = Cond->use_begin(),
+                              E = Cond->use_end(); I != E; ++I)
+      if (I->getOpcode() != ISD::VSELECT)
+        // TODO: Add other opcodes eventually lowered into BLEND.
+        return SDValue();
+
     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
 
@@ -16990,33 +17715,80 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   if (R.getNode())
     return R;
 
-  // Create BLSI, and BLSR instructions
+  // Create BLSI, BLSR, and BZHI instructions
   // BLSI is X & (-X)
   // BLSR is X & (X-1)
-  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+  // BZHI is X & ((1 << Y) - 1)
+  // BEXTR is ((X >> imm) & (2**size-1))
+  if (VT == MVT::i32 || VT == MVT::i64) {
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
     SDLoc DL(N);
 
-    // Check LHS for neg
-    if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
-        isZero(N0.getOperand(0)))
-      return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
-
-    // Check RHS for neg
-    if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
-        isZero(N1.getOperand(0)))
-      return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+    if (Subtarget->hasBMI()) {
+      // Check LHS for neg
+      if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
+          isZero(N0.getOperand(0)))
+        return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
+
+      // Check RHS for neg
+      if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
+          isZero(N1.getOperand(0)))
+        return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+
+      // Check LHS for X-1
+      if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+          isAllOnes(N0.getOperand(1)))
+        return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+
+      // Check RHS for X-1
+      if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+          isAllOnes(N1.getOperand(1)))
+        return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+    }
+
+    if (Subtarget->hasBMI2()) {
+      // Check for (and (add (shl 1, Y), -1), X)
+      if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::SHL) {
+          SDValue N001 = N00.getOperand(1);
+          assert(N001.getValueType() == MVT::i8 && "unexpected type");
+          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
+          if (C && C->getZExtValue() == 1)
+            return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
+        }
+      }
 
-    // Check LHS for X-1
-    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
-        isAllOnes(N0.getOperand(1)))
-      return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+      // Check for (and X, (add (shl 1, Y), -1))
+      if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
+        SDValue N10 = N1.getOperand(0);
+        if (N10.getOpcode() == ISD::SHL) {
+          SDValue N101 = N10.getOperand(1);
+          assert(N101.getValueType() == MVT::i8 && "unexpected type");
+          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
+          if (C && C->getZExtValue() == 1)
+            return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
+        }
+      }
+    }
 
-    // Check RHS for X-1
-    if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
-        isAllOnes(N1.getOperand(1)))
-      return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+    // Check for BEXTR.
+    if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
+        (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
+      ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
+      ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (MaskNode && ShiftNode) {
+        uint64_t Mask = MaskNode->getZExtValue();
+        uint64_t Shift = ShiftNode->getZExtValue();
+        if (isMask_64(Mask)) {
+          uint64_t MaskSize = CountPopulation_64(Mask);
+          if (Shift + MaskSize <= VT.getSizeInBits())
+            return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
+                               DAG.getConstant(Shift | (MaskSize << 8), VT));
+        }
+      }
+    } // BEXTR
 
     return SDValue();
   }
@@ -17732,7 +18504,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
     return false;
 
-  MVT VT = LHS.getValueType().getSimpleVT();
+  MVT VT = LHS.getSimpleValueType();
 
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for horizontal add/sub");
@@ -18205,7 +18977,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
         !XTLI->getSubtarget()->is64Bit() &&
-        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+        VT == MVT::i64) {
       SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
                                           Ld->getChain(), Op0, DAG);
       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
@@ -18531,6 +19303,22 @@ namespace {
   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
 }
 
+static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
+
+  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
+    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
+        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
+        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
+
+      if (AsmPieces.size() == 3)
+        return true;
+      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
+        return true;
+    }
+  }
+  return false;
+}
+
 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
 
@@ -18572,12 +19360,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
       const std::string &ConstraintsStr = IA->getConstraintString();
       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
-      if (AsmPieces.size() == 4 &&
-          AsmPieces[0] == "~{cc}" &&
-          AsmPieces[1] == "~{dirflag}" &&
-          AsmPieces[2] == "~{flags}" &&
-          AsmPieces[3] == "~{fpsr}")
-      return IntrinsicLowering::LowerToByteSwap(CI);
+      if (clobbersFlagRegisters(AsmPieces))
+        return IntrinsicLowering::LowerToByteSwap(CI);
     }
     break;
   case 3:
@@ -18590,11 +19374,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
       const std::string &ConstraintsStr = IA->getConstraintString();
       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
-      if (AsmPieces.size() == 4 &&
-          AsmPieces[0] == "~{cc}" &&
-          AsmPieces[1] == "~{dirflag}" &&
-          AsmPieces[2] == "~{flags}" &&
-          AsmPieces[3] == "~{fpsr}")
+      if (clobbersFlagRegisters(AsmPieces))
         return IntrinsicLowering::LowerToByteSwap(CI);
     }
 
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 2703274..bc3dd60 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -53,7 +53,7 @@ namespace llvm {
       /// to X86::XORPS or X86::XORPD.
       FXOR,
 
-      /// FAND - Bitwise logical ANDNOT of floating point values. This
+      /// FANDN - Bitwise logical ANDNOT of floating point values. This
       /// corresponds to X86::ANDNPS or X86::ANDNPD.
       FANDN,
 
@@ -254,6 +254,12 @@ namespace llvm {
       // VSEXT - Vector integer signed-extend.
       VSEXT,
 
+      // VTRUNC - Vector integer truncate.
+      VTRUNC,
+
+      // VTRUNC - Vector integer truncate with mask.
+      VTRUNCM,
+
       // VFPEXT - Vector FP extend.
       VFPEXT,
 
@@ -274,6 +280,13 @@ namespace llvm {
 
       // PCMP* - Vector integer comparisons.
       PCMPEQ, PCMPGT,
+      // PCMP*M - Vector integer comparisons, the result is in a mask vector.
+      PCMPEQM, PCMPGTM,
+
+      /// CMPM, CMPMU - Vector comparison generating mask bits for fp and
+      /// integer signed and unsigned data types.
+      CMPM,
+      CMPMU,
 
       // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
       ADD, SUB, ADC, SBB, SMUL,
@@ -282,18 +295,23 @@ namespace llvm {
       BLSI,   // BLSI - Extract lowest set isolated bit
       BLSMSK, // BLSMSK - Get mask up to lowest set bit
       BLSR,   // BLSR - Reset lowest set bit
+      BZHI,   // BZHI - Zero high bits
+      BEXTR,  // BEXTR - Bit field extract
 
       UMUL, // LOW, HI, FLAGS = umul LHS, RHS
 
       // MUL_IMM - X86 specific multiply by immediate.
       MUL_IMM,
 
-      // PTEST - Vector bitwise comparisons
+      // PTEST - Vector bitwise comparisons.
       PTEST,
 
-      // TESTP - Vector packed fp sign bitwise comparisons
+      // TESTP - Vector packed fp sign bitwise comparisons.
       TESTP,
 
+      // TESTM - Vector "test" in AVX-512, the result is in a mask vector.
+      TESTM,
+
       // OR/AND test for masks
       KORTEST,
       KTEST,
@@ -318,11 +336,13 @@ namespace llvm {
       UNPCKH,
       VPERMILP,
       VPERMV,
+      VPERMV3,
       VPERMI,
       VPERM2X128,
       VBROADCAST,
       // masked broadcast
       VBROADCASTM,
+      VINSERT,
 
       // PMULUDQ - Vector multiply packed unsigned doubleword integers
       PMULUDQ,
@@ -755,6 +775,8 @@ namespace llvm {
     SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
                       SelectionDAG &DAG) const;
 
+    virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const LLVM_OVERRIDE;
+
     /// \brief Reset the operation actions based on target options.
     virtual void resetOperationActions();
 
@@ -831,8 +853,6 @@ namespace llvm {
                                                bool isSigned,
                                                bool isReplace) const;
 
-    SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
-                                   SelectionDAG &DAG) const;
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
@@ -846,18 +866,12 @@ namespace llvm {
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const;
     SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
@@ -881,19 +895,7 @@ namespace llvm {
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
-
-    // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR
-    SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
-    SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
-    SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const;
-
-    SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
-
-    SDValue LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
 
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
@@ -925,6 +927,8 @@ namespace llvm {
                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                    LLVMContext &Context) const;
 
+    virtual const uint16_t *getScratchRegisters(CallingConv::ID CC) const;
+
     /// Utility function to emit atomic-load-arith operations (and, or, xor,
     /// nand, max, min, umax, umin). It takes the corresponding instruction to
     /// expand, the associated machine basic block, and the associated X86
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 8abae14..cb19fbd 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -80,6 +80,21 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))),  (v16i16 VR256X:$src)>;
 }
 
+//
+// AVX-512: VPXOR instruction writes zero to its upper part, it's safe build zeros.
+//
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+               [(set VR512:$dst, (v16f32 immAllZerosV))]>;
+}
+
+def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - VECTOR INSERT
 //
@@ -376,6 +391,11 @@ def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
 def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
           (VBROADCASTSDZrm addr:$src)>;
 
+def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
+          (VBROADCASTSSZrm addr:$src)>;
+def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
+          (VBROADCASTSDZrm addr:$src)>;
+
 multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
                           RegisterClass SrcRC, RegisterClass KRC> {
   def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src),
@@ -402,6 +422,13 @@ def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
         (VPBROADCASTDrZrr GR32:$src)>;
 def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
         (VPBROADCASTQrZrr GR64:$src)>;
+def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))),
+        (VPBROADCASTQrZkrr VK8WM:$mask, GR64:$src)>;
+
+def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
+        (VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))),
+        (VPBROADCASTQrZrr GR64:$src)>;
 
 multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, PatFrag ld_frag,
@@ -414,10 +441,11 @@ multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
   def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
                                                          VR128X:$src),
                     !strconcat(OpcodeStr, 
-                    "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"),
+                    "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
                     [(set DstRC:$dst,
                       (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>,
                     EVEX, EVEX_KZ;
+  let mayLoad = 1 in {
   def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set DstRC:$dst, 
@@ -425,9 +453,10 @@ multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
   def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
                                                          x86memop:$src),
                   !strconcat(OpcodeStr, 
-                      "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"),
+                      "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
                   [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, 
                                      (ld_frag addr:$src))))]>, EVEX, EVEX_KZ;
+  }
 }
 
 defm VPBROADCASTDZ  : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem,
@@ -437,10 +466,20 @@ defm VPBROADCASTQZ  : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
                       loadi64, VR512, v8i64, v2i64, VK8WM>,  EVEX_V512, VEX_W,
                       EVEX_CD8<64, CD8VT1>;
 
+def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))),
+          (VPBROADCASTDZrr VR128X:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))),
+          (VPBROADCASTQZrr VR128X:$src)>;
+
 def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
           (VBROADCASTSSZrr VR128X:$src)>;
 def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
           (VBROADCASTSDZrr VR128X:$src)>;
+
+def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))),
+          (VBROADCASTSSZrr VR128X:$src)>;
+def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))),
+          (VBROADCASTSDZrr VR128X:$src)>;
     
 // Provide fallback in case the load node that is used in the patterns above
 // is used by additional users, which prevents the pattern selection.
@@ -473,6 +512,306 @@ defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
 defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
                                             VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
 
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERM
+//
+// -- immediate form --
+multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                         SDNode OpNode, PatFrag mem_frag, 
+                         X86MemOperand x86memop, ValueType OpVT> {
+  def ri : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+                     EVEX;
+  def mi : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst),
+                     (ins x86memop:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode (mem_frag addr:$src1),
+                              (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPERMQZ  : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64,
+                        i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+let ExeDomain = SSEPackedDouble in 
+defm VPERMPDZ  : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, 
+                        f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// -- VPERM - register form --
+multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, 
+                     PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> {
+
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2),
+                   !strconcat(OpcodeStr,
+                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set RC:$dst,
+                     (OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V;
+
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2),
+                   !strconcat(OpcodeStr,
+                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set RC:$dst,
+                     (OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>,
+                     EVEX_4V;
+}
+
+defm VPERMDZ   : avx512_perm<0x36, "vpermd",  VR512,  memopv16i32, i512mem,
+                           v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMQZ   : avx512_perm<0x36, "vpermq",  VR512,  memopv8i64,  i512mem, 
+                           v8i64>,  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPSZ  : avx512_perm<0x16, "vpermps", VR512,  memopv16f32, f512mem,
+                           v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPDZ  : avx512_perm<0x16, "vpermpd", VR512,  memopv8f64, f512mem, 
+                           v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// -- VPERM2I - 3 source operands form --
+multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                          PatFrag mem_frag, X86MemOperand x86memop,
+                          ValueType OpVT> {
+let Constraints = "$src1 = $dst" in {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (X86VPermv3 RC:$src1, RC:$src2, RC:$src3)))]>,
+                    EVEX_4V;
+
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, x86memop:$src3),
+                   !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (X86VPermv3 RC:$src1, RC:$src2, 
+                      (mem_frag addr:$src3))))]>, EVEX_4V;
+  }
+}
+defm VPERMI2D  : avx512_perm_3src<0x76, "vpermi2d",  VR512, memopv16i32, i512mem, 
+                               v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q  : avx512_perm_3src<0x76, "vpermi2q",  VR512, memopv8i64, i512mem, 
+                               v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps",  VR512, memopv16f32, i512mem, 
+                               v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd",  VR512, memopv8f64, i512mem, 
+                               v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - BLEND using mask
+//
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, Intrinsic Int, 
+                          RegisterClass KRC, RegisterClass RC,
+                          X86MemOperand x86memop, PatFrag mem_frag,
+                          SDNode OpNode, ValueType vt> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+               (ins KRC:$mask, RC:$src1, RC:$src2),
+               !strconcat(OpcodeStr,
+                "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+               [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2), 
+                 (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
+  def rr_Int : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+               (ins KRC:$mask, RC:$src1, RC:$src2),
+               !strconcat(OpcodeStr,
+                "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+               [(set RC:$dst, (Int KRC:$mask, (vt RC:$src2),
+                 (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
+
+  let mayLoad = 1 in {
+    def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                 (ins KRC:$mask, RC:$src1, x86memop:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $mask, $dst|$dst, $mask, $src1, $src2}"),
+                 []>, 
+                 EVEX_4V, EVEX_K;
+
+    def rm_Int : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                 (ins KRC:$mask, RC:$src1, x86memop:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $mask, $dst|$dst, $mask, $src1, $src2}"),
+                 [(set RC:$dst, (Int KRC:$mask, (vt RC:$src1),
+                   (mem_frag addr:$src2)))]>,
+                 EVEX_4V, EVEX_K;
+  }
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", 
+                              int_x86_avx512_mskblend_ps_512,
+                              VK16WM, VR512, f512mem,
+                              memopv16f32, vselect, v16f32>, 
+                              EVEX_CD8<32, CD8VF>, EVEX_V512;
+let ExeDomain = SSEPackedDouble in
+defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", 
+                              int_x86_avx512_mskblend_pd_512,
+                              VK8WM, VR512, f512mem,
+                              memopv8f64, vselect, v8f64>, 
+                              VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+
+defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", 
+                              int_x86_avx512_mskblend_d_512,
+                              VK16WM, VR512, f512mem, 
+                              memopv16i32, vselect, v16i32>, 
+                              EVEX_CD8<32, CD8VF>, EVEX_V512;
+
+defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", 
+                              int_x86_avx512_mskblend_q_512, 
+                              VK8WM, VR512, f512mem, 
+                              memopv8i64, vselect, v8i64>, 
+                              VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+                            (v8f32 VR256X:$src2))),
+            (EXTRACT_SUBREG 
+              (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), 
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+                            (v8i32 VR256X:$src2))),
+            (EXTRACT_SUBREG 
+                (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), 
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC, 
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
+              SDNode OpNode, ValueType vt> {
+  def rr : AVX512BI<opc, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2), 
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))], 
+             IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+  def rm : AVX512BI<opc, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), 
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2)))],
+             IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+}
+
+defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem, 
+                           memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512;
+defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem, 
+                           memopv8i64, X86pcmpeqm, v8i64>, T8, EVEX_V512, VEX_W;
+
+defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem, 
+                           memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512;
+defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem, 
+                           memopv8i64, X86pcmpgtm, v8i64>, T8, EVEX_V512, VEX_W;
+
+def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (COPY_TO_REGCLASS (VPCMPGTDZrr 
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (COPY_TO_REGCLASS (VPCMPEQDZrr 
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
+              SDNode OpNode, ValueType vt, Operand CC, string asm,
+              string asm_alt> {
+  def rri : AVX512AIi8<opc, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], 
+             IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+  def rmi : AVX512AIi8<opc, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2),
+                              imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  // Accept explicit immediate argument form instead of comparison code.
+  let neverHasSideEffects = 1 in {
+    def rri_alt : AVX512AIi8<opc, MRMSrcReg,
+               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+    def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  }
+}
+
+defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32,
+                              X86cmpm, v16i32, AVXCC,
+              "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32,
+                               X86cmpmu, v16i32, AVXCC,
+              "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8, VR512, i512mem, memopv8i64,
+                              X86cmpm, v8i64, AVXCC,
+              "vpcmp${cc}q\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64,
+                               X86cmpmu, v8i64, AVXCC,
+              "vpcmp${cc}uq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpuq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+// avx512_cmp_packed - sse 1 & 2 compare packed instructions
+multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
+                           X86MemOperand x86memop, Operand CC,
+                           SDNode OpNode, ValueType vt, string asm,
+                           string asm_alt, Domain d> {
+  def rri : AVX512PIi8<0xC2, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>;
+  def rmi : AVX512PIi8<0xC2, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+             [(set KRC:$dst,
+              (OpNode (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let neverHasSideEffects = 1 in {
+    def rri_alt : AVX512PIi8<0xC2, MRMSrcReg,
+               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               asm_alt, [], d>;
+    def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem,
+               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               asm_alt, [], d>;
+  }
+}
+
+defm VCMPPSZ : avx512_cmp_packed<VK16, VR512, f512mem, AVXCC, X86cmpm, v16f32,
+               "vcmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "vcmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedSingle>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VCMPPDZ : avx512_cmp_packed<VK8, VR512, f512mem, AVXCC, X86cmpm, v8f64,
+               "vcmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "vcmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedDouble>, OpSize, EVEX_4V, VEX_W, EVEX_V512,
+               EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VCMPPSZrri
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VPCMPDZrri
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VPCMPUDZrri
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+               
 // Mask register copy, including
 // - copy between mask registers
 // - load/store mask registers
@@ -713,3 +1052,2475 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
 
 def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
           (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Aligned and unaligned load and store
+//
+
+multiclass avx512_mov_packed<bits<8> opc, RegisterClass RC, RegisterClass KRC,
+                            X86MemOperand x86memop, PatFrag ld_frag, 
+                            string asm, Domain d> {
+let neverHasSideEffects = 1 in
+  def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
+              EVEX;
+let canFoldAsLoad = 1 in
+  def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+               [(set RC:$dst, (ld_frag addr:$src))], d>, EVEX;
+let Constraints = "$src1 = $dst" in {
+  def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), 
+                                     (ins RC:$src1, KRC:$mask, RC:$src2),
+              !strconcat(asm, 
+              "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
+              EVEX, EVEX_K;
+  def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
+                                (ins RC:$src1, KRC:$mask, x86memop:$src2),
+              !strconcat(asm, 
+              "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+               [], d>, EVEX, EVEX_K;
+}
+}
+
+defm VMOVAPSZ : avx512_mov_packed<0x28, VR512, VK16WM, f512mem, alignedloadv16f32,
+                              "vmovaps", SSEPackedSingle>,
+                               EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVAPDZ : avx512_mov_packed<0x28, VR512, VK8WM, f512mem, alignedloadv8f64,
+                              "vmovapd", SSEPackedDouble>,
+                              OpSize, EVEX_V512, VEX_W,
+                              EVEX_CD8<64, CD8VF>;
+defm VMOVUPSZ : avx512_mov_packed<0x10, VR512, VK16WM, f512mem, loadv16f32,
+                              "vmovups", SSEPackedSingle>,
+                              EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVUPDZ : avx512_mov_packed<0x10, VR512, VK8WM, f512mem, loadv8f64,
+                              "vmovupd", SSEPackedDouble>,
+                               OpSize, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+def VMOVAPSZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+                    "vmovaps\t{$src, $dst|$dst, $src}",
+                    [(alignedstore512 (v16f32 VR512:$src), addr:$dst)],
+                    SSEPackedSingle>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+def VMOVAPDZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+                    "vmovapd\t{$src, $dst|$dst, $src}",
+                    [(alignedstore512 (v8f64 VR512:$src), addr:$dst)],
+                    SSEPackedDouble>, EVEX, EVEX_V512,
+                    OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+def VMOVUPSZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+                    "vmovups\t{$src, $dst|$dst, $src}",
+                    [(store (v16f32 VR512:$src), addr:$dst)],
+                    SSEPackedSingle>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+def VMOVUPDZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+                    "vmovupd\t{$src, $dst|$dst, $src}",
+                    [(store (v8f64 VR512:$src), addr:$dst)],
+                    SSEPackedDouble>, EVEX, EVEX_V512,
+                    OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+let neverHasSideEffects = 1 in {
+  def VMOVDQA32rr  : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
+                             (ins VR512:$src),
+                             "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+                             EVEX, EVEX_V512;
+  def VMOVDQA64rr  : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
+                             (ins VR512:$src),
+                             "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+                             EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in {
+  def VMOVDQA32mr  : AVX512BI<0x7F, MRMDestMem, (outs),
+                     (ins i512mem:$dst, VR512:$src),
+                     "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+                     EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+  def VMOVDQA64mr  : AVX512BI<0x7F, MRMDestMem, (outs),
+                     (ins i512mem:$dst, VR512:$src),
+                     "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+                     EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+let mayLoad = 1 in {
+def VMOVDQA32rm  : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst), 
+                           (ins i512mem:$src),
+                           "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+                           EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+def VMOVDQA64rm  : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
+                           (ins i512mem:$src),
+                           "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+                           EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+}
+
+// 512-bit aligned load/store
+def : Pat<(alignedloadv8i64 addr:$src),  (VMOVDQA64rm addr:$src)>;
+def : Pat<(alignedloadv16i32 addr:$src), (VMOVDQA32rm addr:$src)>;
+
+def : Pat<(alignedstore512 (v8i64  VR512:$src), addr:$dst),
+          (VMOVDQA64mr addr:$dst, VR512:$src)>;
+def : Pat<(alignedstore512 (v16i32 VR512:$src), addr:$dst),
+          (VMOVDQA32mr addr:$dst, VR512:$src)>;
+
+multiclass avx512_mov_int<bits<8> load_opc, bits<8> store_opc, string asm,
+                          RegisterClass RC, RegisterClass KRC,
+                          PatFrag ld_frag, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in
+  def rr : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                     !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>, EVEX;
+let canFoldAsLoad = 1 in
+  def rm : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                     !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+                     [(set RC:$dst, (ld_frag addr:$src))]>, EVEX;
+let mayStore = 1 in
+  def mr : AVX512XSI<store_opc, MRMDestMem, (outs),
+                     (ins x86memop:$dst, VR512:$src),
+                     !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>, EVEX;
+let Constraints = "$src1 = $dst" in {
+  def rrk : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst),
+                                      (ins RC:$src1, KRC:$mask, RC:$src2),
+              !strconcat(asm, 
+              "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>,
+              EVEX, EVEX_K;
+  def rmk : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst),
+                                  (ins RC:$src1, KRC:$mask, x86memop:$src2),
+              !strconcat(asm, 
+              "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+               []>, EVEX, EVEX_K;
+}
+}
+
+defm VMOVDQU32 : avx512_mov_int<0x6F, 0x7F, "vmovdqu32", VR512, VK16WM,
+                                memopv16i32, i512mem>,
+                                EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQU64 : avx512_mov_int<0x6F, 0x7F, "vmovdqu64", VR512, VK8WM,
+                                memopv8i64, i512mem>,
+                                EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// 512-bit unaligned load/store
+def : Pat<(loadv8i64 addr:$src),         (VMOVDQU64rm addr:$src)>;
+def : Pat<(loadv16i32 addr:$src),        (VMOVDQU32rm addr:$src)>;
+
+def : Pat<(store (v8i64  VR512:$src), addr:$dst),
+          (VMOVDQU64mr addr:$dst, VR512:$src)>;
+def : Pat<(store (v16i32 VR512:$src), addr:$dst),
+          (VMOVDQU32mr addr:$dst, VR512:$src)>;
+
+let AddedComplexity = 20 in {
+def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1), 
+                           (v16f32 VR512:$src2))),
+                  (VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
+def : Pat<(v8f64 (vselect VK8WM:$mask, (v8f64 VR512:$src1), 
+                           (v8f64 VR512:$src2))),
+                  (VMOVUPDZrrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src1), 
+                           (v16i32 VR512:$src2))),
+                  (VMOVDQU32rrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1), 
+                           (v8i64 VR512:$src2))),
+                  (VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+}
+// Move Int Doubleword to Packed Double Int
+//
+def VMOVDI2PDIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                        EVEX, VEX_LIG;
+def VMOVDI2PDIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+def VMOV64toPQIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
+                      "vmovq{z}\t{$src, $dst|$dst, $src}",
+                        [(set VR128X:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))],
+                          IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG;
+let isCodeGenOnly = 1 in {
+def VMOV64toSDZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "vmovq{z}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))],
+                       IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOVSDto64Zrr : AVX512SI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                         "vmovq{z}\t{$src, $dst|$dst, $src}",
+                         [(set GR64:$dst, (bitconvert FR64:$src))],
+                         IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+}
+def VMOVSDto64Zmr : AVX512SI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                         "vmovq{z}\t{$src, $dst|$dst, $src}",
+                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                         IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+                         EVEX_CD8<64, CD8VT1>;
+
+// Move Int Doubleword to Single Scalar
+//
+let isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr  : AVX512SI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert GR32:$src))],
+                      IIC_SSE_MOVDQ>, EVEX, VEX_LIG;
+
+def VMOVDI2SSZrm  : AVX512SI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+                      IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+}
+
+// Move Packed Doubleword Int to Packed Double Int
+//
+def VMOVPDI2DIZrr  : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+                       "vmovd{z}\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src),
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+                       EVEX, VEX_LIG;
+def VMOVPDI2DIZmr  : AVX512SI<0x7E, MRMDestMem, (outs),
+                       (ins i32mem:$dst, VR128X:$src),
+                       "vmovd{z}\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (vector_extract (v4i32 VR128X:$src),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+                       EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+
+// Move Packed Doubleword Int first element to Doubleword Int
+//
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+                      "vmovq{z}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+                                                   (iPTR 0)))],
+                      IIC_SSE_MOVD_ToGP>, TB, OpSize, EVEX, VEX_LIG, VEX_W,
+                      Requires<[HasAVX512, In64BitMode]>;
+
+def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs),
+                       (ins i64mem:$dst, VR128X:$src),
+                       "vmovq{z}\t{$src, $dst|$dst, $src}",
+                       [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
+                               addr:$dst)], IIC_SSE_MOVDQ>,
+                       EVEX, OpSize, VEX_LIG, VEX_W, TB, EVEX_CD8<64, CD8VT1>,
+                       Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+
+// Move Scalar Single to Double Int
+//
+let isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr  : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst),
+                      (ins FR32X:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (bitconvert FR32X:$src))],
+                      IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG;
+def VMOVSS2DIZmr  : AVX512SI<0x7E, MRMDestMem, (outs),
+                      (ins i32mem:$dst, FR32X:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+                      IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+}
+
+// Move Quadword Int to Packed Quadword Int
+//
+def VMOVQI2PQIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+                      (ins i64mem:$src),
+                      "vmovq{z}\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+                      EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  MOVSS, MOVSD
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_move_scalar <string asm, RegisterClass RC, 
+                              SDNode OpNode, ValueType vt,
+                              X86MemOperand x86memop, PatFrag mem_pat> {
+  def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), 
+              !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+              [(set VR128X:$dst, (vt (OpNode VR128X:$src1,
+                                      (scalar_to_vector RC:$src2))))],
+              IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG;
+  def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>,
+              EVEX, VEX_LIG;
+  def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+             !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+             [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+             EVEX, VEX_LIG;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VMOVSSZ : avx512_move_scalar<"movss{z}", FR32X, X86Movss, v4f32, f32mem,
+                                 loadf32>, XS, EVEX_CD8<32, CD8VT1>;
+
+let ExeDomain = SSEPackedDouble in
+defm VMOVSDZ : avx512_move_scalar<"movsd{z}", FR64X, X86Movsd, v2f64, f64mem,
+                                 loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+
+// For the disassembler
+let isCodeGenOnly = 1 in {
+  def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
+                        (ins VR128X:$src1, FR32X:$src2),
+                        "movss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+                        IIC_SSE_MOV_S_RR>,
+                        XS, EVEX_4V, VEX_LIG;
+  def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
+                        (ins VR128X:$src1, FR64X:$src2),
+                        "movsd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+                        IIC_SSE_MOV_S_RR>,
+                        XD, EVEX_4V, VEX_LIG, VEX_W;
+}
+
+let Predicates = [HasAVX512] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128X then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
+            (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
+            (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
+            (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
+            (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+
+  // Move low f32 and clear high bits.
+  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSZrr (v4f32 (V_SET0)), 
+              (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSZrr (v4i32 (V_SET0)),
+                       (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 256-bit types
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+  }
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
+                                            FR32X:$src)), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
+                                     FR64X:$src)), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSDZrr (v2f64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>;
+
+  // Shuffle with VMOVSS
+  def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
+            (VMOVSSZrr (v4i32 VR128X:$src1),
+                      (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>;
+  def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)),
+            (VMOVSSZrr (v4f32 VR128X:$src1),
+                      (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>;
+
+  // 256-bit variants
+  def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  // Shuffle with VMOVSD
+  def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+
+  // 256-bit variants
+  def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+}
+
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
+                                (ins VR128X:$src),
+                                "vmovq{z}\t{$src, $dst|$dst, $src}",
+                                [(set VR128X:$dst, (v2i64 (X86vzmovl 
+                                                   (v2i64 VR128X:$src))))],
+                                IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
+
+let AddedComplexity = 20 in
+def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+                                 (ins i128mem:$src),
+                                 "vmovq{z}\t{$src, $dst|$dst, $src}",
+                                 [(set VR128X:$dst, (v2i64 (X86vzmovl
+                                                     (loadv2i64 addr:$src))))],
+                                 IIC_SSE_MOVDQ>, EVEX, VEX_W,
+                                 EVEX_CD8<8, CD8VT8>;
+
+let Predicates = [HasAVX512] in {
+  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+              (VMOV64toPQIZrr GR64:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (VMOVDI2PDIZrr GR32:$src)>;
+              
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (VMOVZPQILo2PQIZrm addr:$src)>;
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+            (VMOVZPQILo2PQIZrr VR128X:$src)>;
+  }
+
+  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+}
+
+def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Integer arithmetic
+//
+multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                        X86MemOperand x86memop, PatFrag scalar_mfrag,
+                        X86MemOperand x86scalar_mop, string BrdcstStr,
+                        OpndItins itins, bit IsCommutable = 0> {
+  let isCommutable = IsCommutable in
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], 
+       itins.rr>, EVEX_4V;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))],
+                                     itins.rm>, EVEX_4V;
+  def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86scalar_mop:$src2),
+       !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+                  ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+       [(set RC:$dst, (OpNode RC:$src1, 
+                       (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+                        itins.rm>, EVEX_4V, EVEX_B;
+}
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
+                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+                         PatFrag memop_frag, X86MemOperand x86memop,
+                         OpndItins itins,
+                         bit IsCommutable = 0> {
+  let isCommutable = IsCommutable in
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       []>, EVEX_4V, VEX_W;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       []>, EVEX_4V, VEX_W;
+}
+
+defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 0>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 1>, 
+                   EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W;
+
+defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32,
+                VR512, memopv8i64, i512mem, SSE_INTALU_ITINS_P, 1>, T8,
+                EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32,
+                 VR512, memopv8i64, i512mem, SSE_INTMUL_ITINS_P, 1>, EVEX_V512,
+                 EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))),
+          (VPMULUDQZrr VR512:$src1, VR512:$src2)>;
+
+defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt,
+                                   PatFrag mem_frag, RegisterClass RC,
+                                   X86MemOperand x86memop, string asm,
+                                   Domain d> {
+    def rr : AVX512PI<opc, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1, RC:$src2)))],
+                           d>, EVEX_4V;
+    def rm : AVX512PI<opc, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+                asm, [(set RC:$dst,
+                       (vt (OpNode RC:$src1,
+                            (bitconvert (mem_frag addr:$src2)))))],
+                        d>, EVEX_4V;
+}
+
+defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopv8f64,
+      VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopv8f64,
+      VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopv8f64,
+      VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopv8f64,
+      VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                        X86MemOperand x86memop> {
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], 
+       IIC_SSE_UNPCK>, EVEX_4V;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1),
+                                     (bitconvert (memop_frag addr:$src2)))))],
+                                     IIC_SSE_UNPCK>, EVEX_4V;
+}
+defm VPUNPCKLDQZ  : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32,
+                                VR512, memopv16i32, i512mem>, EVEX_V512,
+                                EVEX_CD8<32, CD8VF>;
+defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64,
+                                VR512, memopv8i64, i512mem>, EVEX_V512,
+                                VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPUNPCKHDQZ  : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32,
+                                VR512, memopv16i32, i512mem>, EVEX_V512,
+                                EVEX_CD8<32, CD8VF>;
+defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
+                                VR512, memopv8i64, i512mem>, EVEX_V512,
+                                VEX_W, EVEX_CD8<64, CD8VF>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - PSHUFD
+//
+
+multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                         SDNode OpNode, PatFrag mem_frag, 
+                         X86MemOperand x86memop, ValueType OpVT> {
+  def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+                     EVEX;
+  def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
+                     (ins x86memop:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode (mem_frag addr:$src1),
+                              (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
+                      i512mem, v16i32>, OpSize, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+let ExeDomain = SSEPackedSingle in
+defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp,
+                      memopv16f32, i512mem, v16f32>, OpSize, TA, EVEX_V512,
+                      EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp,
+                      memopv8f64, i512mem, v8f64>, OpSize, TA, EVEX_V512,
+                      VEX_W, EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPSZri VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPDZri VR512:$src1, imm:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Logical Instructions
+//===----------------------------------------------------------------------===//
+
+defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VR512, memopv16i32,
+                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPORDZ  : avx512_binop_rm<0xEB, "vpord", or, v16i32, VR512, memopv16i32,
+                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPORQZ  : avx512_binop_rm<0xEB, "vporq", or, v8i64, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VR512, memopv16i32,
+                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VR512,
+                      memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                      SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 0>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  FP arithmetic
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SizeItins itins> {
+  defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss{z}"), OpNode, FR32X,
+                             f32mem, itins.s, 0>, XS, EVEX_4V, VEX_LIG,
+                             EVEX_CD8<32, CD8VT1>;
+  defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd{z}"), OpNode, FR64X,
+                             f64mem, itins.d, 0>, XD, VEX_W, EVEX_4V, VEX_LIG,
+                             EVEX_CD8<64, CD8VT1>;
+}
+
+let isCommutable = 1 in {
+defm VADD : avx512_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>;
+defm VMUL : avx512_binop_s<0x59, "mul", fmul, SSE_ALU_ITINS_S>;
+defm VMIN : avx512_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>;
+defm VMAX : avx512_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>;
+}
+let isCommutable = 0 in {
+defm VSUB : avx512_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>;
+defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>;
+}
+
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, ValueType vt,
+                           X86MemOperand x86memop, PatFrag mem_frag,
+                           X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+                           string BrdcstStr,
+                           Domain d, OpndItins itins, bit commutable> {
+  let isCommutable = commutable in
+    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+       EVEX_4V, TB;
+  let mayLoad = 1 in {
+    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+          itins.rm, d>, EVEX_4V, TB;
+    def rmb : PI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86scalar_mop:$src2),
+       !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+                  ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+       [(set RC:$dst, (OpNode RC:$src1, 
+                       (vt (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+       itins.rm, d>, EVEX_4V, EVEX_B, TB;
+    }
+}
+
+defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, 
+                   SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+                   
+defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VR512, v8f64, f512mem, 
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 0>, 
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem, 
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 0>, 
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  VPTESTM instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC, 
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
+              SDNode OpNode, ValueType vt> {
+  def rr : AVX5128I<opc, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2), 
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))]>, EVEX_4V;
+  def rm : AVX5128I<opc, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), 
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), 
+              (bitconvert (memop_frag addr:$src2))))]>, EVEX_4V;
+}
+
+defm VPTESTMDZ  : avx512_vptest<0x27, "vptestmd", VK16, VR512,  f512mem,
+                              memopv16i32, X86testm, v16i32>, EVEX_V512,
+                              EVEX_CD8<32, CD8VF>;
+defm VPTESTMQZ  : avx512_vptest<0x27, "vptestmq", VK8, VR512,  f512mem,
+                              memopv8i64, X86testm, v8i64>, EVEX_V512, VEX_W,
+                              EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Shift instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
+                         string OpcodeStr, SDNode OpNode, RegisterClass RC,
+                         ValueType vt, X86MemOperand x86memop, PatFrag mem_frag,
+                         RegisterClass KRC> {
+  def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
+       (ins RC:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1, (i8 imm:$src2))))],
+        SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
+  def rik : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr,
+                "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
+  def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
+       (ins x86memop:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpNode (mem_frag addr:$src1),
+                     (i8 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
+  def mik: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
+       (ins KRC:$mask, x86memop:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr,
+                "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
+}
+
+multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          RegisterClass RC, ValueType vt, ValueType SrcVT,
+                          PatFrag bc_frag, RegisterClass KRC> {
+  // src2 is always 128-bit
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, VR128X:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))],
+        SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
+  def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, VR128X:$src2),
+           !strconcat(OpcodeStr,
+                "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, i128mem:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1,
+                       (bc_frag (memopv2i64 addr:$src2)))))],
+                        SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
+  def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, i128mem:$src2),
+           !strconcat(OpcodeStr,
+                "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
+}
+
+defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli,
+                           VR512, v16i32, i512mem, memopv16i32, VK16WM>,
+                           EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl,
+                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VQ>;
+                           
+defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli,
+                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl,
+                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli,
+                           VR512, v16i32, i512mem, memopv16i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VF>;
+defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl,
+                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VQ>;
+                           
+defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli,
+                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl,
+                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai,
+                           VR512, v16i32, i512mem, memopv16i32, VK16WM>,
+                           EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra,
+                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VQ>;
+                           
+defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai,
+                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra,
+                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VQ>, VEX_W;
+
+//===-------------------------------------------------------------------===//
+// Variable Bit Shifts
+//===-------------------------------------------------------------------===//
+multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, ValueType vt,
+                           X86MemOperand x86memop, PatFrag mem_frag> {
+  def rr  : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+               (vt (OpNode RC:$src1, (vt RC:$src2))))]>,
+             EVEX_4V;
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+             (ins RC:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+               (vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>,
+             EVEX_4V;
+}
+
+defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32, 
+                               i512mem, memopv16i32>, EVEX_V512,
+                               EVEX_CD8<32, CD8VF>;
+defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64, 
+                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32, 
+                               i512mem, memopv16i32>, EVEX_V512,
+                               EVEX_CD8<32, CD8VF>;
+defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64, 
+                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32, 
+                               i512mem, memopv16i32>, EVEX_V512,
+                               EVEX_CD8<32, CD8VF>;
+defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64, 
+                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, 
+                        X86MemOperand x86memop, PatFrag memop_frag> {
+def rr  : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
+def rm  : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst,
+                      (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
+}
+
+defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>,
+                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
+          (VMOVDDUPZrm addr:$src)>;
+
+//===---------------------------------------------------------------------===//
+// Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
+                              X86MemOperand x86memop> {
+  def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX;
+  let mayLoad = 1 in
+  def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX;
+}
+
+defm VMOVSHDUPZ  : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                       v16f32, VR512, memopv16f32, f512mem>, EVEX_V512,
+                       EVEX_CD8<32, CD8VF>;
+defm VMOVSLDUPZ  : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                       v16f32, VR512, memopv16f32, f512mem>, EVEX_V512,
+                       EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>;
+def : Pat<(v16i32 (X86Movshdup (memopv16i32 addr:$src))),
+           (VMOVSHDUPZrm addr:$src)>;
+def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>;
+def : Pat<(v16i32 (X86Movsldup (memopv16i32 addr:$src))),
+           (VMOVSLDUPZrm addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+          (ins VR128X:$src1, VR128X:$src2),
+          "vmovlhps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
+           IIC_SSE_MOV_LH>, EVEX_4V;
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+          (ins VR128X:$src1, VR128X:$src2),
+          "vmovhlps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
+          IIC_SSE_MOV_LH>, EVEX_4V;
+
+let Predicates = [HasAVX512] in {
+  // MOVLHPS patterns
+  def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+            (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+            (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
+
+  // MOVHLPS patterns
+  def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
+            (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// FMA - Fused Multiply Operations
+//
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr,
+            RegisterClass RC, X86MemOperand x86memop,
+            PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+            string BrdcstStr, SDNode OpNode, ValueType OpVT> {
+  def r: AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
+          (ins RC:$src1, RC:$src2, RC:$src3),
+          !strconcat(OpcodeStr,"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+          [(set RC:$dst, (OpVT(OpNode RC:$src1, RC:$src2, RC:$src3)))]>;
+
+  let mayLoad = 1 in
+  def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+          (ins RC:$src1, RC:$src2, x86memop:$src3),
+          !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+          [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
+                                               (mem_frag addr:$src3))))]>;
+   def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, x86scalar_mop:$src3),
+           !strconcat(OpcodeStr, "\t{${src3}", BrdcstStr, 
+            ", $src2, $dst|$dst, $src2, ${src3}", BrdcstStr, "}"),
+           [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+           (OpVT (X86VBroadcast (scalar_mfrag addr:$src3)))))]>, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADD213PSZ    : avx512_fma3p_rm<0xA8, "vfmadd213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMSUB213PSZ    : avx512_fma3p_rm<0xAA, "vfmsub213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmaddsub, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsubadd, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFNMADD213PSZ   : avx512_fma3p_rm<0xAC, "vfnmadd213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFNMSUB213PSZ   : avx512_fma3p_rm<0xAE, "vfnmsub213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADD213PDZ    : avx512_fma3p_rm<0xA8, "vfmadd213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmadd, v8f64>, EVEX_V512,
+                                    VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFMSUB213PDZ    : avx512_fma3p_rm<0xAA, "vfmsub213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmaddsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsubadd, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmadd, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+  defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmsub, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr,
+            RegisterClass RC, X86MemOperand x86memop,
+            PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+            string BrdcstStr, SDNode OpNode, ValueType OpVT> {
+  let mayLoad = 1 in
+  def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+          (ins RC:$src1, RC:$src3, x86memop:$src2),
+          !strconcat(OpcodeStr, "\t{$src2, $src3, $dst|$dst, $src3, $src2}"),
+          [(set RC:$dst, (OpVT (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3)))]>;
+   def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, RC:$src3, x86scalar_mop:$src2),
+           !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, 
+            ", $src3, $dst|$dst, $src3, ${src2}", BrdcstStr, "}"),
+           [(set RC:$dst, (OpNode RC:$src1, 
+           (OpVT (X86VBroadcast (scalar_mfrag addr:$src2))), RC:$src3))]>, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADD132PSZ    : avx512_fma3p_m132<0x98, "vfmadd132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMSUB132PSZ    : avx512_fma3p_m132<0x9A, "vfmsub132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmaddsub, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsubadd, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFNMADD132PSZ   : avx512_fma3p_m132<0x9C, "vfnmadd132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFNMSUB132PSZ   : avx512_fma3p_m132<0x9E, "vfnmsub132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADD132PDZ    : avx512_fma3p_m132<0x98, "vfmadd132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmadd, v8f64>, EVEX_V512,
+                                    VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFMSUB132PDZ    : avx512_fma3p_m132<0x9A, "vfmsub132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmaddsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsubadd, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmadd, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+  defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmsub, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+}
+
+// Scalar FMA
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 
+                 RegisterClass RC, ValueType OpVT, 
+                 X86MemOperand x86memop, Operand memop, 
+                 PatFrag mem_frag> {
+  let isCommutable = 1 in
+  def r     : AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+  let mayLoad = 1 in
+  def m     : AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, f128mem:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1,
+                            (mem_frag addr:$src3))))]>;
+}
+
+} // Constraints = "$src1 = $dst"
+
+defm VFMADDSSZ  : avx512_fma3s_rm<0xA9, "vfmadd213ss{z}", X86Fmadd, FR32X, 
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFMADDSDZ  : avx512_fma3s_rm<0xA9, "vfmadd213sd{z}", X86Fmadd, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFMSUBSSZ  : avx512_fma3s_rm<0xAB, "vfmsub213ss{z}", X86Fmsub, FR32X, 
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFMSUBSDZ  : avx512_fma3s_rm<0xAB, "vfmsub213sd{z}", X86Fmsub, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFNMADDSSZ  : avx512_fma3s_rm<0xAD, "vfnmadd213ss{z}", X86Fnmadd, FR32X, 
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFNMADDSDZ  : avx512_fma3s_rm<0xAD, "vfnmadd213sd{z}", X86Fnmadd, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFNMSUBSSZ  : avx512_fma3s_rm<0xAF, "vfnmsub213ss{z}", X86Fnmsub, FR32X, 
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFNMSUBSDZ  : avx512_fma3s_rm<0xAF, "vfnmsub213sd{z}", X86Fnmsub, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Scalar convert from sign integer to float/double
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vcvtsi<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          X86MemOperand x86memop, string asm> {
+let neverHasSideEffects = 1 in {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              EVEX_4V;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins DstRC:$src1, x86memop:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              EVEX_4V;
+} // neverHasSideEffects = 1
+}
+let Predicates = [HasAVX512] in {
+defm VCVTSI2SSZ   : avx512_vcvtsi<0x2A, GR32, FR32X, i32mem, "cvtsi2ss{l}{z}">,
+                                  XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SSZ : avx512_vcvtsi<0x2A, GR64, FR32X, i64mem, "cvtsi2ss{q}{z}">,
+                                  XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+defm VCVTSI2SDZ   : avx512_vcvtsi<0x2A, GR32, FR64X, i32mem, "cvtsi2sd{l}{z}">,
+                                  XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ : avx512_vcvtsi<0x2A, GR64, FR64X, i64mem, "cvtsi2sd{q}{z}">,
+                                  XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+          (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+          (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+          (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+          (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (sint_to_fp GR32:$src)),
+          (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (sint_to_fp GR64:$src)),
+          (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (sint_to_fp GR32:$src)),
+          (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (sint_to_fp GR64:$src)),
+          (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+defm VCVTUSI2SSZ   : avx512_vcvtsi<0x7B, GR32, FR32X, i32mem, "cvtusi2ss{l}{z}">,
+                                  XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SSZ : avx512_vcvtsi<0x7B, GR64, FR32X, i64mem, "cvtusi2ss{q}{z}">,
+                                  XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+defm VCVTUSI2SDZ   : avx512_vcvtsi<0x7B, GR32, FR64X, i32mem, "cvtusi2sd{l}{z}">,
+                                  XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SDZ : avx512_vcvtsi<0x7B, GR64, FR64X, i64mem, "cvtusi2sd{q}{z}">,
+                                  XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
+          (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))),
+          (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))),
+          (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))),
+          (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (uint_to_fp GR32:$src)),
+          (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (uint_to_fp GR64:$src)),
+          (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (uint_to_fp GR32:$src)),
+          (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (uint_to_fp GR64:$src)),
+          (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Scalar convert from float/double to integer
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+                          string asm> {
+let neverHasSideEffects = 1 in {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG;
+} // neverHasSideEffects = 1
+}
+let Predicates = [HasAVX512] in {
+// Convert float/double to signed/unsigned int 32/64
+defm VCVTSS2SIZ:    avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si,
+                                   ssmem, sse_load_f32, "cvtss2si{z}">,
+                                   XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z:  avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64,
+                                   ssmem, sse_load_f32, "cvtss2si{z}">,
+                                   XS, VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USIZ:   avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi,
+                                   ssmem, sse_load_f32, "cvtss2usi{z}">,
+                                   XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
+                                   int_x86_avx512_cvtss2usi64, ssmem,
+                                   sse_load_f32, "cvtss2usi{z}">, XS, VEX_W,
+                                   EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ:    avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si,
+                                   sdmem, sse_load_f64, "cvtsd2si{z}">,
+                                   XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z:  avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64,
+                                   sdmem, sse_load_f64, "cvtsd2si{z}">,
+                                   XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USIZ:   avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi,
+                                   sdmem, sse_load_f64, "cvtsd2usi{z}">,
+                                   XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
+                                   int_x86_avx512_cvtsd2usi64, sdmem,
+                                   sse_load_f64, "cvtsd2usi{z}">, XD, VEX_W,
+                                   EVEX_CD8<64, CD8VT1>;
+
+defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+          int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}{z}",
+          SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+          int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}{z}",
+          SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+          int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}{z}",
+          SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+          int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}{z}",
+          SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+
+defm Int_VCVTUSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+          int_x86_avx512_cvtusi2ss, i32mem, loadi32, "cvtusi2ss{l}{z}",
+          SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+defm Int_VCVTUSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+          int_x86_avx512_cvtusi642ss, i64mem, loadi64, "cvtusi2ss{q}{z}",
+          SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+          int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}{z}",
+          SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+defm Int_VCVTUSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+          int_x86_avx512_cvtusi642sd, i64mem, loadi64, "cvtusi2sd{q}{z}",
+          SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+
+// Convert float/double to signed/unsigned int 32/64 with truncation
+defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si,
+                                   ssmem, sse_load_f32, "cvttss2si{z}">,
+                                   XS, EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
+                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+                                   "cvttss2si{z}">, XS, VEX_W,
+                                   EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si,
+                                   sdmem, sse_load_f64, "cvttsd2si{z}">, XD,
+                                   EVEX_CD8<64, CD8VT1>;
+defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
+                                   int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+                                   "cvttsd2si{z}">, XD, VEX_W,
+                                   EVEX_CD8<64, CD8VT1>;
+defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
+                                   int_x86_avx512_cvttss2usi, ssmem, sse_load_f32,
+                                   "cvttss2si{z}">, XS, EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
+                                   int_x86_avx512_cvttss2usi64, ssmem,
+                                   sse_load_f32, "cvttss2usi{z}">, XS, VEX_W,
+                                   EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
+                                   int_x86_avx512_cvttsd2usi,
+                                   sdmem, sse_load_f64, "cvttsd2usi{z}">, XD,
+                                   EVEX_CD8<64, CD8VT1>;
+defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
+                                   int_x86_avx512_cvttsd2usi64, sdmem,
+                                   sse_load_f64, "cvttsd2usi{z}">, XD, VEX_W,
+                                   EVEX_CD8<64, CD8VT1>;
+}
+
+multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                         SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+                         string asm> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX;
+}
+
+defm VCVTTSS2SIZ    : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem,
+                                  loadf32, "cvttss2si{z}">, XS,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USIZ   : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem,
+                                  loadf32, "cvttss2usi{z}">, XS,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2SI64Z  : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem,
+                                  loadf32, "cvttss2si{z}">, XS, VEX_W,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem,
+                                  loadf32, "cvttss2usi{z}">, XS, VEX_W,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2SIZ    : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem,
+                                  loadf64, "cvttsd2si{z}">, XD,
+                                  EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USIZ   : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem,
+                                  loadf64, "cvttsd2usi{z}">, XD,
+                                  EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2SI64Z  : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem,
+                                  loadf64, "cvttsd2si{z}">, XD, VEX_W,
+                                  EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem,
+                                  loadf64, "cvttsd2usi{z}">, XD, VEX_W,
+                                  EVEX_CD8<64, CD8VT1>;
+//===----------------------------------------------------------------------===//
+// AVX-512  Convert form float to double and back
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in {
+def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst),
+                    (ins FR32X:$src1, FR32X:$src2),
+                    "vcvtss2sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst),
+                    (ins FR32X:$src1, f32mem:$src2),
+                    "vcvtss2sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
+                    EVEX_CD8<32, CD8VT1>;
+
+// Convert scalar double to scalar single
+def VCVTSD2SSZrr  : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst),
+                      (ins FR64X:$src1, FR64X:$src2),
+                      "vcvtsd2ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      []>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSD2SSZrm  : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst),
+                      (ins FR64X:$src1, f64mem:$src2),
+                      "vcvtsd2ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      []>, EVEX_4V, VEX_LIG, VEX_W,
+                      Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>;
+}
+
+def : Pat<(f64 (fextend FR32X:$src)), (VCVTSS2SDZrr FR32X:$src, FR32X:$src)>,
+      Requires<[HasAVX512]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+    (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+      Requires<[HasAVX512, OptForSize]>;
+
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDZrr (f32 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
+    Requires<[HasAVX512, OptForSpeed]>;
+
+def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>,
+           Requires<[HasAVX512]>;
+
+multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC, 
+               RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, 
+               X86MemOperand x86memop, ValueType OpVT, ValueType InVT,
+               Domain d> {
+let neverHasSideEffects = 1 in {
+  def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"), 
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
+  let mayLoad = 1 in
+  def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"), 
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
+} // neverHasSideEffects = 1
+}
+
+defm VCVTPD2PSZ : avx512_vcvt_fp<0x5A, "vcvtpd2ps", VR512, VR256X, fround,
+                                memopv8f64, f512mem, v8f32, v8f64,
+                                SSEPackedSingle>, EVEX_V512, VEX_W, OpSize,
+                                EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend,
+                                memopv4f64, f256mem, v8f64, v8f32,
+                                SSEPackedDouble>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+            (VCVTPS2PDZrm addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Vector convert from sign integer to float/double
+//===----------------------------------------------------------------------===//
+
+defm VCVTDQ2PSZ : avx512_vcvt_fp<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp,
+                                memopv8i64, i512mem, v16f32, v16i32,
+                                SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp,
+                                memopv4i64, i256mem, v8f64, v8i32,
+                                SSEPackedDouble>, EVEX_V512, XS,
+                                EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint,
+                                 memopv16f32, f512mem, v16i32, v16f32,
+                                 SSEPackedSingle>, EVEX_V512, XS,
+                                 EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint,
+                                 memopv8f64, f512mem, v8i32, v8f64, 
+                                 SSEPackedDouble>, EVEX_V512, OpSize, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint,
+                                 memopv16f32, f512mem, v16i32, v16f32,
+                                 SSEPackedSingle>, EVEX_V512, 
+                                 EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint,
+                                 memopv8f64, f512mem, v8i32, v8f64,
+                                 SSEPackedDouble>, EVEX_V512, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;
+                                 
+defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp,
+                                 memopv4i64, f256mem, v8f64, v8i32,
+                                 SSEPackedDouble>, EVEX_V512, XS,
+                                 EVEX_CD8<32, CD8VH>;
+                                 
+defm VCVTUDQ2PSZ : avx512_vcvt_fp<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp,
+                                 memopv16i32, f512mem, v16f32, v16i32,
+                                 SSEPackedSingle>, EVEX_V512, XD,
+                                 EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
+          (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr 
+           (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+                                 
+
+def : Pat<(int_x86_avx512_cvtdq2_ps_512 VR512:$src),
+          (VCVTDQ2PSZrr VR512:$src)>;
+def : Pat<(int_x86_avx512_cvtdq2_ps_512 (bitconvert (memopv8i64 addr:$src))),
+          (VCVTDQ2PSZrm addr:$src)>;
+
+def VCVTPS2DQZrr : AVX512BI<0x5B, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                        "vcvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR512:$dst,
+                          (int_x86_avx512_cvt_ps2dq_512 VR512:$src))],
+                        IIC_SSE_CVT_PS_RR>, EVEX, EVEX_V512;
+def VCVTPS2DQZrm : AVX512BI<0x5B, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                        "vcvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR512:$dst,
+                          (int_x86_avx512_cvt_ps2dq_512 (memopv16f32 addr:$src)))],
+                        IIC_SSE_CVT_PS_RM>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(v8f32 (fround (loadv8f64 addr:$src))),
+            (VCVTPD2PSZrm addr:$src)>;
+  def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+            (VCVTPS2PDZrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_f16c_ph2ps<RegisterClass destRC, RegisterClass srcRC,
+                             X86MemOperand x86memop, Intrinsic Int> {
+  def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}",
+             [(set destRC:$dst, (Int srcRC:$src))]>, EVEX;
+  let neverHasSideEffects = 1, mayLoad = 1 in
+  def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX;
+}
+
+multiclass avx512_f16c_ps2ph<RegisterClass destRC, RegisterClass srcRC,
+                             X86MemOperand x86memop, Intrinsic Int> {
+  def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst),
+               (ins srcRC:$src1, i32i8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               [(set destRC:$dst, (Int srcRC:$src1, imm:$src2))]>, EVEX;
+  let neverHasSideEffects = 1, mayStore = 1 in
+  def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+               (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
+}
+
+defm VCVTPH2PSZ : avx512_f16c_ph2ps<VR512, VR256X, f256mem,
+                                    int_x86_avx512_vcvtph2ps_512>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VH>;
+defm VCVTPS2PHZ : avx512_f16c_ps2ph<VR256X, VR512, f256mem,
+                                    int_x86_avx512_vcvtps2ph_512>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VH>;
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+  defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
+                                 "ucomiss{z}">, TB, EVEX, VEX_LIG,
+                                 EVEX_CD8<32, CD8VT1>;
+  defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd{z}">, TB, OpSize, EVEX,
+                                  VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+  let Pattern = []<dag> in {
+    defm VCOMISSZ  : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load,
+                                   "comiss{z}">, TB, EVEX, VEX_LIG,
+                                   EVEX_CD8<32, CD8VT1>;
+    defm VCOMISDZ  : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load,
+                                   "comisd{z}">, TB, OpSize, EVEX,
+                                    VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+  defm Int_VUCOMISSZ  : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
+                            load, "ucomiss">, TB, EVEX, VEX_LIG,
+                            EVEX_CD8<32, CD8VT1>;
+  defm Int_VUCOMISDZ  : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
+                            load, "ucomisd">, TB, OpSize, EVEX,
+                            VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+  defm Int_VCOMISSZ  : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
+                            load, "comiss">, TB, EVEX, VEX_LIG,
+                            EVEX_CD8<32, CD8VT1>;
+  defm Int_VCOMISDZ  : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
+                            load, "comisd">, TB, OpSize, EVEX,
+                            VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+  
+/// avx512_unop_p - AVX-512 unops in packed form.
+multiclass avx512_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  def PSZr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                        !strconcat(OpcodeStr,
+                                   "ps\t{$src, $dst|$dst, $src}"),
+                        [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))]>,
+                        EVEX, EVEX_V512;
+  def PSZm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f256mem:$src),
+                        !strconcat(OpcodeStr,
+                                   "ps\t{$src, $dst|$dst, $src}"),
+                        [(set VR512:$dst, (OpNode (memopv16f32 addr:$src)))]>,
+                        EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+  def PDZr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                        !strconcat(OpcodeStr,
+                                   "pd\t{$src, $dst|$dst, $src}"),
+                        [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))]>,
+                        EVEX, EVEX_V512, VEX_W;
+  def PDZm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                        !strconcat(OpcodeStr,
+                                   "pd\t{$src, $dst|$dst, $src}"),
+                        [(set VR512:$dst, (OpNode (memopv16f32 addr:$src)))]>,
+                        EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+/// avx512_fp_unop_p_int - AVX-512 intrinsics unops in packed forms.
+multiclass avx512_fp_unop_p_int<bits<8> opc, string OpcodeStr,
+                              Intrinsic V16F32Int, Intrinsic V8F64Int> {
+  def PSZr_Int : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                           !strconcat(OpcodeStr,
+                                      "ps\t{$src, $dst|$dst, $src}"),
+                           [(set VR512:$dst, (V16F32Int VR512:$src))]>, 
+                           EVEX, EVEX_V512;
+  def PSZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                         !strconcat(OpcodeStr,
+                         "ps\t{$src, $dst|$dst, $src}"),
+                         [(set VR512:$dst, 
+                           (V16F32Int (memopv16f32 addr:$src)))]>, EVEX,
+                         EVEX_V512, EVEX_CD8<32, CD8VF>;
+  def PDZr_Int : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                           !strconcat(OpcodeStr,
+                                      "pd\t{$src, $dst|$dst, $src}"),
+                           [(set VR512:$dst, (V8F64Int VR512:$src))]>, 
+                           EVEX, EVEX_V512, VEX_W;
+  def PDZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                         !strconcat(OpcodeStr,
+                         "pd\t{$src, $dst|$dst, $src}"),
+                         [(set VR512:$dst, 
+                           (V8F64Int (memopv8f64 addr:$src)))]>,
+                            EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+/// avx512_fp_unop_s - AVX-512 unops in scalar form.
+multiclass avx512_fp_unop_s<bits<8> opc, string OpcodeStr> {
+  let hasSideEffects = 0 in {
+  def SSZr : AVX5128I<opc, MRMSrcReg, (outs FR32X:$dst),
+               (ins FR32X:$src1, FR32X:$src2),
+               !strconcat(OpcodeStr,
+                          "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, EVEX_4V;
+  let mayLoad = 1 in {
+  def SSZm : AVX5128I<opc, MRMSrcMem, (outs FR32X:$dst),
+               (ins FR32X:$src1, f32mem:$src2),
+               !strconcat(OpcodeStr,
+                          "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+  def SSZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR128X:$dst),
+                   (ins VR128X:$src1, ssmem:$src2),
+                   !strconcat(OpcodeStr,
+                              "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   []>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+  }
+  def SDZr : AVX5128I<opc, MRMSrcReg, (outs FR64X:$dst),
+               (ins FR64X:$src1, FR64X:$src2),
+               !strconcat(OpcodeStr,
+                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 
+                      EVEX_4V, VEX_W;
+  let mayLoad = 1 in {
+  def SDZm : AVX5128I<opc, MRMSrcMem, (outs FR64X:$dst),
+               (ins FR64X:$src1, f64mem:$src2),
+               !strconcat(OpcodeStr,
+                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 
+               EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+  def SDZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR128X:$dst),
+                  (ins VR128X:$src1, sdmem:$src2),
+                   !strconcat(OpcodeStr,
+                              "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  []>, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+}
+}
+
+defm VRCP14   : avx512_fp_unop_s<0x4D, "vrcp14">,
+                avx512_fp_unop_p<0x4C, "vrcp14", X86frcp>,
+                avx512_fp_unop_p_int<0x4C, "vrcp14", 
+                    int_x86_avx512_rcp14_ps_512, int_x86_avx512_rcp14_pd_512>;
+
+defm VRSQRT14  : avx512_fp_unop_s<0x4F, "vrsqrt14">,
+                avx512_fp_unop_p<0x4E, "vrsqrt14", X86frsqrt>,
+                avx512_fp_unop_p_int<0x4E, "vrsqrt14", 
+                    int_x86_avx512_rsqrt14_ps_512, int_x86_avx512_rsqrt14_pd_512>;
+
+def : Pat<(int_x86_avx512_rsqrt14_ss VR128X:$src),
+          (COPY_TO_REGCLASS (VRSQRT14SSZr (f32 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                            VR128X)>;
+def : Pat<(int_x86_avx512_rsqrt14_ss sse_load_f32:$src),
+          (VRSQRT14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+def : Pat<(int_x86_avx512_rcp14_ss VR128X:$src),
+          (COPY_TO_REGCLASS (VRCP14SSZr (f32 (IMPLICIT_DEF)),
+                                      (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                            VR128X)>;
+def : Pat<(int_x86_avx512_rcp14_ss sse_load_f32:$src),
+          (VRCP14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+let AddedComplexity = 20, Predicates = [HasERI] in {
+defm VRCP28   : avx512_fp_unop_s<0xCB, "vrcp28">,
+                avx512_fp_unop_p<0xCA, "vrcp28", X86frcp>,
+                avx512_fp_unop_p_int<0xCA, "vrcp28",
+                    int_x86_avx512_rcp28_ps_512, int_x86_avx512_rcp28_pd_512>;
+
+defm VRSQRT28  : avx512_fp_unop_s<0xCD, "vrsqrt28">,
+                avx512_fp_unop_p<0xCC, "vrsqrt28", X86frsqrt>,
+                avx512_fp_unop_p_int<0xCC, "vrsqrt28",
+                    int_x86_avx512_rsqrt28_ps_512, int_x86_avx512_rsqrt28_pd_512>;
+}
+
+let Predicates = [HasERI] in {
+  def : Pat<(int_x86_avx512_rsqrt28_ss VR128X:$src),
+            (COPY_TO_REGCLASS (VRSQRT28SSZr (f32 (IMPLICIT_DEF)),
+                                         (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                              VR128X)>;
+  def : Pat<(int_x86_avx512_rsqrt28_ss sse_load_f32:$src),
+            (VRSQRT28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_avx512_rcp28_ss VR128X:$src),
+            (COPY_TO_REGCLASS (VRCP28SSZr (f32 (IMPLICIT_DEF)),
+                                       (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                              VR128X)>;
+  def : Pat<(int_x86_avx512_rcp28_ss sse_load_f32:$src),
+            (VRCP28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+}
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              Intrinsic V16F32Int, Intrinsic V8F64Int,
+                              OpndItins itins_s, OpndItins itins_d> {
+  def PSZrr :AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+             [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))], itins_s.rr>,
+             EVEX, EVEX_V512;
+
+  let mayLoad = 1 in
+  def PSZrm : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+              [(set VR512:$dst, 
+                (OpNode (v16f32 (bitconvert (memopv16f32 addr:$src)))))],
+              itins_s.rm>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+  def PDZrr : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+              [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))], itins_d.rr>,
+              EVEX, EVEX_V512;
+
+  let mayLoad = 1 in
+    def PDZrm : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                [(set VR512:$dst, (OpNode
+                  (v8f64 (bitconvert (memopv16f32 addr:$src)))))],
+                itins_d.rm>, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+  def PSZr_Int : AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                           !strconcat(OpcodeStr,
+                                      "ps\t{$src, $dst|$dst, $src}"),
+                           [(set VR512:$dst, (V16F32Int VR512:$src))]>, 
+                           EVEX, EVEX_V512;
+  def PSZm_Int : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                          !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                          [(set VR512:$dst, 
+                           (V16F32Int (memopv16f32 addr:$src)))]>, EVEX,
+                          EVEX_V512, EVEX_CD8<32, CD8VF>;
+  def PDZr_Int : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                           !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                           [(set VR512:$dst, (V8F64Int VR512:$src))]>, 
+                           EVEX, EVEX_V512, VEX_W;
+  def PDZm_Int : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                         !strconcat(OpcodeStr,
+                         "pd\t{$src, $dst|$dst, $src}"),
+                         [(set VR512:$dst, (V8F64Int (memopv8f64 addr:$src)))]>,
+                         EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
+                          Intrinsic F32Int, Intrinsic F64Int,
+                          OpndItins itins_s, OpndItins itins_d> {
+  def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst),
+               (ins FR32X:$src1, FR32X:$src2),
+               !strconcat(OpcodeStr,
+                          "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [], itins_s.rr>, XS, EVEX_4V;
+  def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
+               (ins VR128X:$src1, VR128X:$src2),
+               !strconcat(OpcodeStr,
+                "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set VR128X:$dst, 
+                 (F32Int VR128X:$src1, VR128X:$src2))],
+               itins_s.rr>, XS, EVEX_4V;
+  let mayLoad = 1 in {
+  def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst),
+               (ins FR32X:$src1, f32mem:$src2),
+               !strconcat(OpcodeStr,
+                          "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+  def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
+                   (ins VR128X:$src1, ssmem:$src2),
+                   !strconcat(OpcodeStr,
+                 "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set VR128X:$dst, 
+                     (F32Int VR128X:$src1, sse_load_f32:$src2))],
+                   itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+  }
+  def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst),
+               (ins FR64X:$src1, FR64X:$src2),
+               !strconcat(OpcodeStr,
+                          "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 
+                      XD, EVEX_4V, VEX_W;
+  def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
+               (ins VR128X:$src1, VR128X:$src2),
+               !strconcat(OpcodeStr,
+                "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set VR128X:$dst, 
+                 (F64Int VR128X:$src1, VR128X:$src2))],
+               itins_s.rr>, XD, EVEX_4V, VEX_W;
+  let mayLoad = 1 in {
+  def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst),
+               (ins FR64X:$src1, f64mem:$src2),
+               !strconcat(OpcodeStr,
+                  "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 
+               XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+  def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
+                  (ins VR128X:$src1, sdmem:$src2),
+                   !strconcat(OpcodeStr,
+                  "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set VR128X:$dst, 
+                    (F64Int VR128X:$src1, sse_load_f64:$src2))]>, 
+                  XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+}
+
+
+defm VSQRT  : avx512_sqrt_scalar<0x51, "sqrt", 
+                int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, 
+                SSE_SQRTSS, SSE_SQRTSD>,
+              avx512_sqrt_packed<0x51, "vsqrt", fsqrt,
+                int_x86_avx512_sqrt_ps_512, int_x86_avx512_sqrt_pd_512,
+                SSE_SQRTPS, SSE_SQRTPD>;
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(f32 (fsqrt FR32X:$src)),
+            (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+  def : Pat<(f32 (fsqrt (load addr:$src))),
+            (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+  def : Pat<(f64 (fsqrt FR64X:$src)),
+            (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>;
+  def : Pat<(f64 (fsqrt (load addr:$src))),
+            (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+
+  def : Pat<(f32 (X86frsqrt FR32X:$src)),
+            (VRSQRT14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+  def : Pat<(f32 (X86frsqrt (load addr:$src))),
+            (VRSQRT14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+
+  def : Pat<(f32 (X86frcp FR32X:$src)),
+            (VRCP14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+  def : Pat<(f32 (X86frcp (load addr:$src))),
+            (VRCP14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+
+  def : Pat<(int_x86_sse_sqrt_ss VR128X:$src),
+            (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                              VR128X)>;
+  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
+            (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src),
+            (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128X:$src, FR64)),
+                              VR128X)>;
+  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
+            (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+}
+
+
+multiclass avx512_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+                            X86MemOperand x86memop, RegisterClass RC,
+                            PatFrag mem_frag32, PatFrag mem_frag64,
+                            Intrinsic V4F32Int, Intrinsic V2F64Int,
+                            CD8VForm VForm> {
+let ExeDomain = SSEPackedSingle in {
+  // Intrinsic operation, reg.
+  // Vector intrinsic operation, reg
+  def PSr : AVX512AIi8<opcps, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>;
+
+  // Vector intrinsic operation, mem
+  def PSm : AVX512AIi8<opcps, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
+                    EVEX_CD8<32, VForm>;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+  // Vector intrinsic operation, reg
+  def PDr : AVX512AIi8<opcpd, MRMSrcReg,
+                     (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>;
+
+  // Vector intrinsic operation, mem
+  def PDm : AVX512AIi8<opcpd, MRMSrcMem,
+                     (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
+                     EVEX_CD8<64, VForm>;
+} // ExeDomain = SSEPackedDouble
+}
+
+multiclass avx512_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+                            string OpcodeStr,
+                            Intrinsic F32Int,
+                            Intrinsic F64Int> {
+let ExeDomain = GenericDomain in {
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SSr : AVX512AIi8<opcss, MRMSrcReg,
+      (outs FR32X:$dst), (ins FR32X:$src1, FR32X:$src2, i32i8imm:$src3),
+      !strconcat(OpcodeStr,
+              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>;
+
+  // Intrinsic operation, reg.
+  def SSr_Int : AVX512AIi8<opcss, MRMSrcReg,
+        (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2, imm:$src3))]>;
+
+  // Intrinsic operation, mem.
+  def SSm : AVX512AIi8<opcss, MRMSrcMem, (outs VR128X:$dst),
+                     (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                   "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     [(set VR128X:$dst, (F32Int VR128X:$src1, 
+                                         sse_load_f32:$src2, imm:$src3))]>,
+                     EVEX_CD8<32, CD8VT1>;
+
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SDr : AVX512AIi8<opcsd, MRMSrcReg,
+        (outs FR64X:$dst), (ins FR64X:$src1, FR64X:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        []>, VEX_W;
+
+  // Intrinsic operation, reg.
+  def SDr_Int : AVX512AIi8<opcsd, MRMSrcReg,
+        (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+        VEX_W;
+
+  // Intrinsic operation, mem.
+  def SDm : AVX512AIi8<opcsd, MRMSrcMem,
+        (outs VR128X:$dst), (ins VR128X:$src1, sdmem:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set VR128X:$dst,
+              (F64Int VR128X:$src1, sse_load_f64:$src2, imm:$src3))]>,
+        VEX_W, EVEX_CD8<64, CD8VT1>;
+} // ExeDomain = GenericDomain
+}
+
+let Predicates = [HasAVX512] in {
+  defm VRNDSCALE  : avx512_fp_binop_rm<0x0A, 0x0B, "vrndscale",
+                              int_x86_avx512_rndscale_ss,
+                              int_x86_avx512_rndscale_sd>, EVEX_4V;
+
+  defm VRNDSCALEZ : avx512_fp_unop_rm<0x08, 0x09, "vrndscale", f256mem, VR512,
+                                  memopv16f32, memopv8f64,
+                                  int_x86_avx512_rndscale_ps_512,
+                                  int_x86_avx512_rndscale_pd_512, CD8VF>, 
+                                  EVEX, EVEX_V512;
+}
+
+def : Pat<(ffloor FR32X:$src),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>;
+def : Pat<(f64 (ffloor FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x1))>;
+def : Pat<(f32 (fnearbyint FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0xC))>;
+def : Pat<(f64 (fnearbyint FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0xC))>;
+def : Pat<(f32 (fceil FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x2))>;
+def : Pat<(f64 (fceil FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x2))>;
+def : Pat<(f32 (frint FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x4))>;
+def : Pat<(f64 (frint FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x4))>;
+def : Pat<(f32 (ftrunc FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x3))>;
+def : Pat<(f64 (ftrunc FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>;
+
+def : Pat<(v16f32 (ffloor VR512:$src)),
+          (VRNDSCALEZPSr VR512:$src, (i32 0x1))>;
+def : Pat<(v16f32 (fnearbyint VR512:$src)),
+          (VRNDSCALEZPSr VR512:$src, (i32 0xC))>;
+def : Pat<(v16f32 (fceil VR512:$src)),
+          (VRNDSCALEZPSr VR512:$src, (i32 0x2))>;
+def : Pat<(v16f32 (frint VR512:$src)),
+          (VRNDSCALEZPSr VR512:$src, (i32 0x4))>;
+def : Pat<(v16f32 (ftrunc VR512:$src)),
+          (VRNDSCALEZPSr VR512:$src, (i32 0x3))>;
+
+def : Pat<(v8f64 (ffloor VR512:$src)),
+          (VRNDSCALEZPDr VR512:$src, (i32 0x1))>;
+def : Pat<(v8f64 (fnearbyint VR512:$src)),
+          (VRNDSCALEZPDr VR512:$src, (i32 0xC))>;
+def : Pat<(v8f64 (fceil VR512:$src)),
+          (VRNDSCALEZPDr VR512:$src, (i32 0x2))>;
+def : Pat<(v8f64 (frint VR512:$src)),
+          (VRNDSCALEZPDr VR512:$src, (i32 0x4))>;
+def : Pat<(v8f64 (ftrunc VR512:$src)),
+          (VRNDSCALEZPDr VR512:$src, (i32 0x3))>;
+
+//-------------------------------------------------
+// Integer truncate and extend operations
+//-------------------------------------------------
+
+multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
+                          RegisterClass dstRC, RegisterClass srcRC,
+                          RegisterClass KRC, X86MemOperand x86memop> {
+  def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+               (ins srcRC:$src),
+               !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+               []>, EVEX;
+
+  def krr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+               (ins KRC:$mask, srcRC:$src),
+               !strconcat(OpcodeStr,
+                 "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+               []>, EVEX, EVEX_KZ;
+
+  def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+               []>, EVEX;
+}
+defm VPMOVQB    : avx512_trunc_sat<0x32, "vpmovqb",   VR128X, VR512, VK8WM, 
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVSQB   : avx512_trunc_sat<0x22, "vpmovsqb",  VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVUSQB  : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVQW    : avx512_trunc_sat<0x34, "vpmovqw",   VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVSQW   : avx512_trunc_sat<0x24, "vpmovsqw",  VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVUSQW  : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVQD    : avx512_trunc_sat<0x35, "vpmovqd",   VR256X, VR512, VK8WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVSQD   : avx512_trunc_sat<0x25, "vpmovsqd",  VR256X, VR512, VK8WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVUSQD  : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVDW    : avx512_trunc_sat<0x33, "vpmovdw",   VR256X, VR512, VK16WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVSDW   : avx512_trunc_sat<0x23, "vpmovsdw",  VR256X, VR512, VK16WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVUSDW  : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVDB    : avx512_trunc_sat<0x31, "vpmovdb",   VR128X, VR512, VK16WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+defm VPMOVSDB   : avx512_trunc_sat<0x21, "vpmovsdb",  VR128X, VR512, VK16WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+defm VPMOVUSDB  : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+
+def : Pat<(v16i8  (X86vtrunc (v8i64  VR512:$src))), (VPMOVQBrr  VR512:$src)>;
+def : Pat<(v8i16  (X86vtrunc (v8i64  VR512:$src))), (VPMOVQWrr  VR512:$src)>;
+def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr  VR512:$src)>;
+def : Pat<(v16i8  (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr  VR512:$src)>;
+def : Pat<(v8i32  (X86vtrunc (v8i64  VR512:$src))), (VPMOVQDrr  VR512:$src)>;
+
+def : Pat<(v16i8  (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
+                  (VPMOVDBkrr VK16WM:$mask, VR512:$src)>;
+def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
+                  (VPMOVDWkrr VK16WM:$mask, VR512:$src)>;
+def : Pat<(v8i16  (X86vtruncm VK8WM:$mask,  (v8i64 VR512:$src))),
+                  (VPMOVQWkrr  VK8WM:$mask, VR512:$src)>;
+def : Pat<(v8i32  (X86vtruncm VK8WM:$mask,  (v8i64 VR512:$src))),
+                  (VPMOVQDkrr  VK8WM:$mask, VR512:$src)>;
+
+
+multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass DstRC,
+                      RegisterClass SrcRC, SDNode OpNode, PatFrag mem_frag, 
+                      X86MemOperand x86memop, ValueType OpVT, ValueType InVT> {
+
+  def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+              (ins SrcRC:$src),
+              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX;
+  def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins x86memop:$src),
+              !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>,
+              EVEX;
+}
+
+defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VR512, VR128X, X86vzext, 
+                             memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VQ>;
+defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VR512, VR128X, X86vzext, 
+                             memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VO>;
+defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VR512, VR256X, X86vzext, 
+                             memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VH>;
+defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VR512, VR128X, X86vzext, 
+                             memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VQ>;
+defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VR512, VR256X, X86vzext, 
+                             memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
+                             EVEX_CD8<32, CD8VH>;
+                             
+defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VR512, VR128X, X86vsext, 
+                             memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VQ>;
+defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VR512, VR128X, X86vsext, 
+                             memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VO>;
+defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VR512, VR256X, X86vsext, 
+                             memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VH>;
+defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VR512, VR128X, X86vsext, 
+                             memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VQ>;
+defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VR512, VR256X, X86vsext, 
+                             memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
+                             EVEX_CD8<32, CD8VH>;
+
+//===----------------------------------------------------------------------===//
+// GATHER - SCATTER Operations
+
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                       RegisterClass RC, X86MemOperand memop> {
+let mayLoad = 1,
+  Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb),
+            (ins RC:$src1, KRC:$mask, memop:$src2),
+            !strconcat(OpcodeStr,
+            "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+            []>, EVEX, EVEX_K;
+}
+defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+  
+defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512,  vy64xmem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512,  vz64mem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X,  vz64mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+multiclass avx512_scatter<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                       RegisterClass RC, X86MemOperand memop> {
+let mayStore = 1, Constraints = "$mask = $mask_wb" in
+  def mr  : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb),
+            (ins memop:$dst, KRC:$mask, RC:$src2),
+            !strconcat(OpcodeStr,
+            "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+            []>, EVEX, EVEX_K;
+}
+
+defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>,
+                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>,
+                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>,
+                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>,
+                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+  
+defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>,
+                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>,
+                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>,
+                                  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>,
+                                  EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+
+multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
+                      ValueType vt, string OpcodeStr, PatFrag mem_frag,
+                      Domain d> {
+  def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+                                       (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+                   EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+                                       (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+                   EVEX_4V, Sched<[WriteShuffle]>;
+}
+
+defm VSHUFPSZ  : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32,
+                  SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VSHUFPDZ  : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64,
+                  SSEPackedDouble>, OpSize, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>;
+def : Pat<(v16i32 (X86Shufp VR512:$src1,
+                    (memopv16i32 addr:$src2), (i8 imm:$imm))),
+          (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>;
+
+def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>;
+def : Pat<(v8i64 (X86Shufp VR512:$src1,
+                            (memopv8i64 addr:$src2), (i8 imm:$imm))),
+          (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
+
+multiclass avx512_alignr<string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop> {
+  def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, RC:$src2, i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     []>, EVEX_4V;
+  let mayLoad = 1 in
+  def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst),
+                     (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     []>, EVEX_4V;
+}
+defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>, 
+                 EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>, 
+                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+
+multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop> {
+  def rr  : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+                    EVEX;
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), 
+                   (ins x86memop:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+                   EVEX;
+}
+
+defm VPABSD : avx512_vpabs<0x1E, "vpabsd", VR512, i512mem>, EVEX_V512,
+                        EVEX_CD8<32, CD8VF>;
+defm VPABSQ : avx512_vpabs<0x1F, "vpabsq", VR512, i512mem>, EVEX_V512, VEX_W,
+                        EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_conflict<bits<8> opc, string OpcodeStr, 
+                        RegisterClass RC, RegisterClass KRC, PatFrag memop_frag,
+                        X86MemOperand x86memop, PatFrag scalar_mfrag,
+                        X86MemOperand x86scalar_mop, string BrdcstStr,
+                        Intrinsic Int, Intrinsic maskInt, Intrinsic maskzInt> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src),
+       !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"),
+       [(set RC:$dst, (Int RC:$src))]>, EVEX;
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins x86memop:$src),
+       !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"),
+       [(set RC:$dst, (Int (memop_frag addr:$src)))]>, EVEX;
+  def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins x86scalar_mop:$src),
+       !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
+                  ", ${dst}|${dst}, ${src}", BrdcstStr, "}"),
+       []>, EVEX, EVEX_B;
+  def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src),
+       !strconcat(OpcodeStr,
+                  "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+       [(set RC:$dst, (maskzInt KRC:$mask, RC:$src))]>, EVEX, EVEX_KZ;
+  def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, x86memop:$src),
+       !strconcat(OpcodeStr,
+                  "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+       [(set RC:$dst, (maskzInt KRC:$mask, (memop_frag addr:$src)))]>,
+       EVEX, EVEX_KZ;
+  def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, x86scalar_mop:$src),
+       !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
+                  ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}",
+                  BrdcstStr, "}"),
+       []>, EVEX, EVEX_KZ, EVEX_B;
+       
+  let Constraints = "$src1 = $dst" in {
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, KRC:$mask, RC:$src2),
+       !strconcat(OpcodeStr,
+                  "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+       [(set RC:$dst, (maskInt RC:$src1, KRC:$mask, RC:$src2))]>, EVEX, EVEX_K;
+  def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, KRC:$mask, x86memop:$src2),
+       !strconcat(OpcodeStr,
+                  "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+       [(set RC:$dst, (maskInt RC:$src1, KRC:$mask, (memop_frag addr:$src2)))]>, EVEX, EVEX_K;
+  def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2),
+       !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+                  ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"),
+       []>, EVEX, EVEX_K, EVEX_B;
+   }
+}
+
+let Predicates = [HasCDI] in {
+defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM,
+                    memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                    int_x86_avx512_conflict_d_512,
+                    int_x86_avx512_conflict_d_mask_512,
+                    int_x86_avx512_conflict_d_maskz_512>,
+                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM,
+                    memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                    int_x86_avx512_conflict_q_512,
+                    int_x86_avx512_conflict_q_mask_512,
+                    int_x86_avx512_conflict_q_maskz_512>,
+                    EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 9ce02ba..7fc9c44 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -497,6 +497,21 @@ def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
                 Requires<[In64BitMode]>;
 } // isConvertibleToThreeAddress = 1, CodeSize = 2
 
+let isCodeGenOnly = 1, CodeSize = 2 in {
+def INC32_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                  "inc{w}\t$dst", [], IIC_UNARY_REG>,
+                OpSize, Requires<[In32BitMode]>;
+def INC32_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                  "inc{l}\t$dst", [], IIC_UNARY_REG>,
+                Requires<[In32BitMode]>;
+def DEC32_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                  "dec{w}\t$dst", [], IIC_UNARY_REG>,
+                OpSize, Requires<[In32BitMode]>;
+def DEC32_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                  "dec{l}\t$dst", [], IIC_UNARY_REG>,
+                Requires<[In32BitMode]>;
+} // isCodeGenOnly = 1, CodeSize = 2
+
 } // Constraints = "$src1 = $dst", SchedRW
 
 let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
@@ -578,7 +593,6 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
 } // CodeSize = 2, SchedRW
 } // Defs = [EFLAGS]
 
-
 /// X86TypeInfo - This is a bunch of information that describes relevant X86
 /// information about value types.  For example, it can tell you what the
 /// register class and preferred load to use.
@@ -726,20 +740,25 @@ class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
             [(set typeinfo.RegClass:$dst, EFLAGS,
                   (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
-                          EFLAGS))], IIC_BIN_NONMEM>;
+                          EFLAGS))], IIC_BIN_CARRY_NONMEM>;
 
 // BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
-class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 InstrItinClass itin = IIC_BIN_NONMEM>
   : ITy<opcode, MRMSrcReg, typeinfo,
         (outs typeinfo.RegClass:$dst),
         (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
-        mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM>,
+        mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>,
     Sched<[WriteALU]> {
   // The disassembler should know about this, but not the asmparser.
   let isCodeGenOnly = 1;
   let hasSideEffects = 0;
 }
 
+// BinOpRR_RDD_Rev - Instructions like "adc reg, reg, reg" (reversed encoding).
+class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>;
+
 // BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
 class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
   : ITy<opcode, MRMSrcReg, typeinfo, (outs),
@@ -753,10 +772,11 @@ class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
 
 // BinOpRM - Instructions like "add reg, reg, [mem]".
 class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-              dag outlist, list<dag> pattern>
+              dag outlist, list<dag> pattern,
+              InstrItinClass itin = IIC_BIN_MEM>
   : ITy<opcode, MRMSrcMem, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
     Sched<[WriteALULd, ReadAfterLd]>;
 
 // BinOpRM_R - Instructions like "add reg, reg, [mem]".
@@ -786,14 +806,15 @@ class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
             [(set typeinfo.RegClass:$dst, EFLAGS,
             (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
-                    EFLAGS))]>;
+                    EFLAGS))], IIC_BIN_CARRY_MEM>;
 
 // BinOpRI - Instructions like "add reg, reg, imm".
 class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-              Format f, dag outlist, list<dag> pattern>
+              Format f, dag outlist, list<dag> pattern,
+              InstrItinClass itin = IIC_BIN_NONMEM>
   : ITy<opcode, f, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
     Sched<[WriteALU]> {
   let ImmT = typeinfo.ImmEncoding;
 }
@@ -824,14 +845,15 @@ class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
             [(set typeinfo.RegClass:$dst, EFLAGS,
                 (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
-                        EFLAGS))]>;
+                        EFLAGS))], IIC_BIN_CARRY_NONMEM>;
 
 // BinOpRI8 - Instructions like "add reg, reg, imm8".
 class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-               Format f, dag outlist, list<dag> pattern>
+               Format f, dag outlist, list<dag> pattern,
+               InstrItinClass itin = IIC_BIN_NONMEM>
   : ITy<opcode, f, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
     Sched<[WriteALU]> {
   let ImmT = Imm8; // Always 8-bit immediate.
 }
@@ -863,14 +885,14 @@ class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
              [(set typeinfo.RegClass:$dst, EFLAGS,
                (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
-                       EFLAGS))]>;
+                       EFLAGS))], IIC_BIN_CARRY_NONMEM>;
 
 // BinOpMR - Instructions like "add [mem], reg".
 class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-              list<dag> pattern>
+              list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM>
   : ITy<opcode, MRMDestMem, typeinfo,
         (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
-        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
     Sched<[WriteALULd, WriteRMW]>;
 
 // BinOpMR_RMW - Instructions like "add [mem], reg".
@@ -886,7 +908,7 @@ class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpMR<opcode, mnemonic, typeinfo,
           [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
                   addr:$dst),
-           (implicit EFLAGS)]>;
+           (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
 
 // BinOpMR_F - Instructions like "cmp [mem], reg".
 class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -896,10 +918,11 @@ class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
 
 // BinOpMI - Instructions like "add [mem], imm".
 class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
-              Format f, list<dag> pattern, bits<8> opcode = 0x80>
+              Format f, list<dag> pattern, bits<8> opcode = 0x80,
+              InstrItinClass itin = IIC_BIN_MEM>
   : ITy<opcode, f, typeinfo,
         (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
-        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
     Sched<[WriteALULd, WriteRMW]> {
   let ImmT = typeinfo.ImmEncoding;
 }
@@ -917,7 +940,7 @@ class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
   : BinOpMI<mnemonic, typeinfo, f,
             [(store (opnode (typeinfo.VT (load addr:$dst)),
                             typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
-             (implicit EFLAGS)]>;
+             (implicit EFLAGS)], 0x80, IIC_BIN_CARRY_MEM>;
 
 // BinOpMI_F - Instructions like "cmp [mem], imm".
 class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
@@ -929,10 +952,11 @@ class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
 
 // BinOpMI8 - Instructions like "add [mem], imm8".
 class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
-               Format f, list<dag> pattern>
+               Format f, list<dag> pattern,
+               InstrItinClass itin = IIC_BIN_MEM>
   : ITy<0x82, f, typeinfo,
         (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
-        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
     Sched<[WriteALULd, WriteRMW]> {
   let ImmT = Imm8; // Always 8-bit immediate.
 }
@@ -951,7 +975,7 @@ class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
   : BinOpMI8<mnemonic, typeinfo, f,
              [(store (opnode (load addr:$dst),
                              typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
-              (implicit EFLAGS)]>;
+              (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
 
 // BinOpMI8_F - Instructions like "cmp [mem], imm8".
 class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
@@ -962,10 +986,11 @@ class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
 
 // BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
 class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-              Register areg, string operands>
+              Register areg, string operands,
+              InstrItinClass itin = IIC_BIN_NONMEM>
   : ITy<opcode, RawFrm, typeinfo,
         (outs), (ins typeinfo.ImmOperand:$src),
-        mnemonic, operands, []>, Sched<[WriteALU]> {
+        mnemonic, operands, [], itin>, Sched<[WriteALU]> {
   let ImmT = typeinfo.ImmEncoding;
   let Uses = [areg];
   let Defs = [areg, EFLAGS];
@@ -976,7 +1001,8 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
 // and use EFLAGS.
 class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
                 Register areg, string operands>
-  : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> {
+  : BinOpAI<opcode, mnemonic, typeinfo, areg, operands,
+            IIC_BIN_CARRY_NONMEM> {
   let Uses = [areg, EFLAGS];
 }
 
@@ -1070,10 +1096,10 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
         def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
       } // isCommutable
 
-      def NAME#8rr_REV  : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
-      def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
-      def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
-      def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+      def NAME#8rr_REV  : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
+      def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>;
+      def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>;
+      def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>;
 
       def NAME#8rm   : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
       def NAME#16rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 8969946..7d10b67 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -884,6 +884,24 @@ let Uses = [EFLAGS], usesCustomInserter = 1 in {
                     [(set VR256:$dst,
                       (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
                                           EFLAGS)))]>;
+  def CMOV_V8I64 : I<0, Pseudo,
+                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                    "#CMOV_V8I64 PSEUDO!",
+                    [(set VR512:$dst,
+                      (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V8F64 : I<0, Pseudo,
+                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                    "#CMOV_V8F64 PSEUDO!",
+                    [(set VR512:$dst,
+                      (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V16F32 : I<0, Pseudo,
+                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                    "#CMOV_V16F32 PSEUDO!",
+                    [(set VR512:$dst,
+                      (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+                                          EFLAGS)))]>;
 }
 
 
@@ -917,8 +935,6 @@ def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
 def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
           (MOV32mi addr:$dst, tblockaddress:$src)>;
 
-
-
 // ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
 // code model mode, should use 'movabs'.  FIXME: This is really a hack, the
 //  'movabs' predicate should handle this sort of thing.
@@ -966,14 +982,12 @@ def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
           (MOV64mi32 addr:$dst, tblockaddress:$src)>,
           Requires<[NearData, IsStatic]>;
 
-
-
 // Calls
 
 // tls has some funny stuff here...
 // This corresponds to movabs $foo@tpoff, %rax
 def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
-          (MOV64ri tglobaltlsaddr :$dst)>;
+          (MOV64ri32 tglobaltlsaddr :$dst)>;
 // This corresponds to add $foo@tpoff, %rax
 def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
           (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 0e69651..e4ccc06 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -49,10 +49,12 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
 let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
   def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
                         "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+  let hasSideEffects = 0 in
   def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
                        "jmp\t$dst", [], IIC_JMP_REL>;
   // FIXME : Intel syntax for JMP64pcrel32 such that it is not ambiguious
   // with JMP_1.
+  let hasSideEffects = 0 in
   def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
                        "jmpq\t$dst", [], IIC_JMP_REL>;
 }
@@ -60,6 +62,7 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
 // Conditional Branches.
 let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
   multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+    let hasSideEffects = 0 in
     def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
                        IIC_Jcc>;
     def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
@@ -85,7 +88,7 @@ defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
 defm JG  : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
 
 // jcx/jecx/jrcx instructions.
-let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
   // These are the 32-bit versions of this instruction for the asmparser.  In
   // 32-bit mode, the address size prefix is jcxz and the unprefixed version is
   // jecxz.
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 28954c6..4090550 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -14,26 +14,26 @@
 let neverHasSideEffects = 1 in {
   let Defs = [AX], Uses = [AL] in
   def CBW : I<0x98, RawFrm, (outs), (ins),
-              "{cbtw|cbw}", []>, OpSize;   // AX = signext(AL)
+              "{cbtw|cbw}", [], IIC_CBW>, OpSize;   // AX = signext(AL)
   let Defs = [EAX], Uses = [AX] in
   def CWDE : I<0x98, RawFrm, (outs), (ins),
-              "{cwtl|cwde}", []>;   // EAX = signext(AX)
+              "{cwtl|cwde}", [], IIC_CBW>;   // EAX = signext(AX)
 
   let Defs = [AX,DX], Uses = [AX] in
   def CWD : I<0x99, RawFrm, (outs), (ins),
-              "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX)
+              "{cwtd|cwd}", [], IIC_CBW>, OpSize; // DX:AX = signext(AX)
   let Defs = [EAX,EDX], Uses = [EAX] in
   def CDQ : I<0x99, RawFrm, (outs), (ins),
-              "{cltd|cdq}", []>; // EDX:EAX = signext(EAX)
+              "{cltd|cdq}", [], IIC_CBW>; // EDX:EAX = signext(EAX)
 
 
   let Defs = [RAX], Uses = [EAX] in
   def CDQE : RI<0x98, RawFrm, (outs), (ins),
-               "{cltq|cdqe}", []>;     // RAX = signext(EAX)
+               "{cltq|cdqe}", [], IIC_CBW>;     // RAX = signext(EAX)
 
   let Defs = [RAX,RDX], Uses = [RAX] in
   def CQO  : RI<0x99, RawFrm, (outs), (ins),
-                "{cqto|cqo}", []>; // RDX:RAX = signext(RAX)
+                "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX)
 }
 
 
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 7759a8a..69cd5a5 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -74,43 +74,43 @@ let neverHasSideEffects = 1 in {
 
 // Fused Multiply-Add
 let ExeDomain = SSEPackedSingle in {
-  defm VFMADDPS    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32,
-                                 memopv8f32, X86Fmadd, v4f32, v8f32>;
-  defm VFMSUBPS    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
-                                 memopv8f32, X86Fmsub, v4f32, v8f32>;
+  defm VFMADDPS    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32,
+                                 loadv8f32, X86Fmadd, v4f32, v8f32>;
+  defm VFMSUBPS    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32,
+                                 loadv8f32, X86Fmsub, v4f32, v8f32>;
   defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
-                                 memopv4f32, memopv8f32, X86Fmaddsub,
+                                 loadv4f32, loadv8f32, X86Fmaddsub,
                                  v4f32, v8f32>;
   defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
-                                 memopv4f32, memopv8f32, X86Fmsubadd,
+                                 loadv4f32, loadv8f32, X86Fmsubadd,
                                  v4f32, v8f32>;
 }
 
 let ExeDomain = SSEPackedDouble in {
-  defm VFMADDPD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64,
-                                 memopv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
-  defm VFMSUBPD    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64,
-                                 memopv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
+  defm VFMADDPD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", loadv2f64,
+                                 loadv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
+  defm VFMSUBPD    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", loadv2f64,
+                                 loadv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
   defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
-                                 memopv2f64, memopv4f64, X86Fmaddsub,
+                                 loadv2f64, loadv4f64, X86Fmaddsub,
                                  v2f64, v4f64>, VEX_W;
   defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
-                                 memopv2f64, memopv4f64, X86Fmsubadd,
+                                 loadv2f64, loadv4f64, X86Fmsubadd,
                                  v2f64, v4f64>, VEX_W;
 }
 
 // Fused Negative Multiply-Add
 let ExeDomain = SSEPackedSingle in {
-  defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps",  memopv4f32,
-                               memopv8f32, X86Fnmadd, v4f32, v8f32>;
-  defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps",  memopv4f32,
-                               memopv8f32, X86Fnmsub, v4f32, v8f32>;
+  defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps",  loadv4f32,
+                               loadv8f32, X86Fnmadd, v4f32, v8f32>;
+  defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps",  loadv4f32,
+                               loadv8f32, X86Fnmsub, v4f32, v8f32>;
 }
 let ExeDomain = SSEPackedDouble in {
-  defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64,
-                               memopv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+  defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", loadv2f64,
+                               loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
   defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
-                               memopv2f64, memopv4f64, X86Fnmsub, v2f64,
+                               loadv2f64, loadv4f64, X86Fnmsub, v2f64,
                                v4f64>, VEX_W;
 }
 
@@ -206,25 +206,26 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set RC:$dst,
-             (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4;
+             (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
   def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
            (ins RC:$src1, RC:$src2, x86memop:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
-                           (mem_frag addr:$src3)))]>, VEX_W, MemOp4;
+                           (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
   def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
            (ins RC:$src1, x86memop:$src2, RC:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set RC:$dst,
-             (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>;
+             (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG;
 // For disassembler
 let isCodeGenOnly = 1, hasSideEffects = 0 in
   def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
                (ins RC:$src1, RC:$src2, RC:$src3),
                !strconcat(OpcodeStr,
-               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+               VEX_LIG;
 }
 
 multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
@@ -235,19 +236,19 @@ multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                [(set VR128:$dst,
-                 (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+                 (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG, MemOp4;
   def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, memop:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
-                                  mem_cpat:$src3))]>, VEX_W, MemOp4;
+                                  mem_cpat:$src3))]>, VEX_W, VEX_LIG, MemOp4;
   def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, memop:$src2, VR128:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                [(set VR128:$dst,
-                 (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+                 (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
 }
 
 multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -338,31 +339,31 @@ defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
 
 let ExeDomain = SSEPackedSingle in {
   defm VFMADDPS4    : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
   defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
   defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
   defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
   defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
   defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
 }
 
 let ExeDomain = SSEPackedDouble in {
   defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
   defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
   defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
   defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
   defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
   defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
 }
 
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 64018b3..0fd9011 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -139,6 +139,7 @@ class T8XS   { bits<5> Prefix = 18; }
 class TAXD   { bits<5> Prefix = 19; }
 class XOP8   { bits<5> Prefix = 20; }
 class XOP9   { bits<5> Prefix = 21; }
+class XOPA   { bits<5> Prefix = 22; }
 class VEX    { bit hasVEXPrefix = 1; }
 class VEX_W  { bit hasVEX_WPrefix = 1; }
 class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; }
@@ -339,10 +340,11 @@ def __xd : XD;
 class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
          list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin> {
-  let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
+  let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+                   !if(hasVEXPrefix /* VEX */, [UseAVX],
                    !if(!eq(Prefix, __xs.Prefix), [UseSSE1],
                    !if(!eq(Prefix, __xd.Prefix), [UseSSE2],
-                   !if(hasOpSizePrefix, [UseSSE2], [UseSSE1]))));
+                   !if(hasOpSizePrefix, [UseSSE2], [UseSSE1])))));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -352,8 +354,9 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
 class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : Ii8<o, F, outs, ins, asm, pattern, itin> {
-  let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-            !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
+  let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+                   !if(hasVEXPrefix /* VEX */, [UseAVX],
+                   !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -363,8 +366,9 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
 class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
          InstrItinClass itin, Domain d>
       : I<o, F, outs, ins, asm, pattern, itin, d> {
-  let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-        !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
+  let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+                   !if(hasVEXPrefix /* VEX */, [HasAVX],
+                   !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -381,11 +385,12 @@ class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> patter
 class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin, Domain d>
       : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
-  let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
-        !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
+  let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+                   !if(hasVEXPrefix /* VEX */, [HasAVX],
+                   !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])));
 
   // AVX instructions have a 'v' prefix in the mnemonic
-  let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm);
+  let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
 }
 
 // SSE1 Instruction Templates:
@@ -460,7 +465,7 @@ class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
 class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
-        Requires<[HasAVX]>;
+        Requires<[UseAVX]>;
 class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
@@ -472,7 +477,7 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
 class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, TB,
-        OpSize, Requires<[HasAVX]>;
+        OpSize, Requires<[UseAVX]>;
 class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, TB,
@@ -641,7 +646,7 @@ class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
         Requires<[HasAVX512]>;
 class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
               list<dag> pattern, InstrItinClass itin = NoItinerary>
-      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB,
       Requires<[HasAVX512]>;
 class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -653,10 +658,10 @@ class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
         Requires<[HasAVX512]>;
 class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
               list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
-      : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+      : Ii8<o, F, outs, ins, asm, pattern, itin, d>, TB, Requires<[HasAVX512]>;
 class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
               list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
-      : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+      : I<o, F, outs, ins, asm, pattern, itin, d>, TB, Requires<[HasAVX512]>;
 class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag>pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, T8,
@@ -667,7 +672,7 @@ class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
 // AES8I
 // These use the same encoding as the SSE4.2 T8 and TA encodings.
 class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
-            list<dag>pattern, InstrItinClass itin = NoItinerary>
+            list<dag>pattern, InstrItinClass itin = IIC_AES>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
         Requires<[HasAES]>;
 
@@ -767,6 +772,7 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
 //
 
 // MMXI   - MMX instructions with TB prefix.
+// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode.
 // MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
 // MMX2I  - MMX / SSE2 instructions with TB and OpSize prefixes.
 // MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
@@ -776,6 +782,9 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
 class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, 
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
+class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, 
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In32BitMode]>;
 class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, 
              list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 0b51521..1fed424 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -105,6 +105,13 @@ def X86vsext   : SDNode<"X86ISD::VSEXT",
                          SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                               SDTCisInt<0>, SDTCisInt<1>]>>;
 
+def X86vtrunc   : SDNode<"X86ISD::VTRUNC",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>]>>;
+def X86vtruncm   : SDNode<"X86ISD::VTRUNCM",
+                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>,
+                                              SDTCisVec<2>, SDTCisInt<2>]>>;
 def X86vfpext  : SDNode<"X86ISD::VFPEXT",
                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                              SDTCisFP<0>, SDTCisFP<1>]>>;
@@ -118,6 +125,15 @@ def X86cmpp    : SDNode<"X86ISD::CMPP",      SDTX86VFCMP>;
 def X86pcmpeq  : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
 def X86pcmpgt  : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
 
+def X86IntCmpMask : SDTypeProfile<1, 2,
+    [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>;
+def X86pcmpeqm  : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
+def X86pcmpgtm  : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
+
+def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def X86cmpm   : SDNode<"X86ISD::CMPM",    X86CmpMaskCC>;
+def X86cmpmu  : SDNode<"X86ISD::CMPMU",   X86CmpMaskCC>;
+
 def X86vshl    : SDNode<"X86ISD::VSHL",
                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                       SDTCisVec<2>]>>;
@@ -140,6 +156,9 @@ def X86ptest   : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
 def X86testp   : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
 def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
 def X86ktest   : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
+def X86testm  : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                          SDTCisVec<1>,
+                                          SDTCisSameAs<2, 1>]>>;
 
 def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
@@ -151,6 +170,8 @@ def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
 def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
 def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                 SDTCisSameAs<0,2>]>;
+def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
 
 def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
                                  SDTCisSameAs<0,1>, SDTCisInt<2>]>;
@@ -194,11 +215,14 @@ def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
 def X86VPermilp  : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
 def X86VPermv    : SDNode<"X86ISD::VPERMV",   SDTShuff2Op>;
 def X86VPermi    : SDNode<"X86ISD::VPERMI",   SDTShuff2OpI>;
+def X86VPermv3   : SDNode<"X86ISD::VPERMV3",  SDTShuff3Op>;
 
 def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
 
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
 def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
+def X86Vinsert   : SDNode<"X86ISD::VINSERT",  SDTypeProfile<1, 3,
+                              [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
 
 def X86Blendi    : SDNode<"X86ISD::BLENDI",   SDTBlend>;
 def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
@@ -236,13 +260,13 @@ def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
 def ssmem : Operand<v4f32> {
   let PrintMethod = "printf32mem";
   let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
-  let ParserMatchClass = X86MemAsmOperand;
+  let ParserMatchClass = X86Mem32AsmOperand;
   let OperandType = "OPERAND_MEMORY";
 }
 def sdmem : Operand<v2f64> {
   let PrintMethod = "printf64mem";
   let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
-  let ParserMatchClass = X86MemAsmOperand;
+  let ParserMatchClass = X86Mem64AsmOperand;
   let OperandType = "OPERAND_MEMORY";
 }
 
@@ -262,9 +286,16 @@ def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
 def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
 def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
 
-// 128-/256-bit extload pattern fragments
+// 512-bit load pattern fragments
+def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv16i32   : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+
+// 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
 def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
 
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
@@ -278,6 +309,12 @@ def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
   return cast<StoreSDNode>(N)->getAlignment() >= 32;
 }]>;
 
+// Like 'store', but always requires 512-bit vector alignment.
+def alignedstore512 : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 64;
+}]>;
+
 // Like 'load', but always requires 128-bit vector alignment.
 def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 16;
@@ -293,6 +330,11 @@ def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 32;
 }]>;
 
+// Like 'load', but always requires 512-bit vector alignment.
+def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 64;
+}]>;
+
 def alignedloadfsf32 : PatFrag<(ops node:$ptr),
                                (f32 (alignedload node:$ptr))>;
 def alignedloadfsf64 : PatFrag<(ops node:$ptr),
@@ -316,6 +358,16 @@ def alignedloadv4f64 : PatFrag<(ops node:$ptr),
 def alignedloadv4i64 : PatFrag<(ops node:$ptr),
                                (v4i64 (alignedload256 node:$ptr))>;
 
+// 512-bit aligned load pattern fragments
+def alignedloadv16f32 : PatFrag<(ops node:$ptr),
+                                (v16f32 (alignedload512 node:$ptr))>;
+def alignedloadv16i32 : PatFrag<(ops node:$ptr),
+                                (v16i32 (alignedload512 node:$ptr))>;
+def alignedloadv8f64  : PatFrag<(ops node:$ptr),
+                                (v8f64  (alignedload512 node:$ptr))>;
+def alignedloadv8i64  : PatFrag<(ops node:$ptr),
+                                (v8i64  (alignedload512 node:$ptr))>;
+
 // Like 'load', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
 // be naturally aligned on some targets but not on others.  If the subtarget
@@ -327,6 +379,16 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
          || cast<LoadSDNode>(N)->getAlignment() >= 16;
 }]>;
 
+def memop4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return    Subtarget->hasVectorUAMem()
+         || cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def memop8 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return    Subtarget->hasVectorUAMem()
+         || cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
 def memopfsf32 : PatFrag<(ops node:$ptr), (f32   (memop node:$ptr))>;
 def memopfsf64 : PatFrag<(ops node:$ptr), (f64   (memop node:$ptr))>;
 
@@ -342,6 +404,12 @@ def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
 def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
 def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
 
+// 512-bit memop pattern fragments
+def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>;
+def memopv8f64  : PatFrag<(ops node:$ptr), (v8f64  (memop8 node:$ptr))>;
+def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>;
+def memopv8i64  : PatFrag<(ops node:$ptr), (v8i64  (memop8 node:$ptr))>;
+
 // SSSE3 uses MMX registers for some instructions. They aren't aligned on a
 // 16-byte boundary.
 // FIXME: 8 byte alignment for mmx reads is not required
@@ -391,6 +459,10 @@ def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
 def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
 def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
 
+// 512-bit bitconvert pattern fragments
+def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
+def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
+
 def vzmovl_v2i64 : PatFrag<(ops node:$src),
                            (bitconvert (v2i64 (X86vzmovl
                              (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 0443a93..2461773 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -35,7 +36,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include <limits>
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "X86GenInstrInfo.inc"
 
 using namespace llvm;
@@ -81,6 +82,7 @@ enum {
   TB_ALIGN_NONE  =    0 << TB_ALIGN_SHIFT,
   TB_ALIGN_16    =   16 << TB_ALIGN_SHIFT,
   TB_ALIGN_32    =   32 << TB_ALIGN_SHIFT,
+  TB_ALIGN_64    =   64 << TB_ALIGN_SHIFT,
   TB_ALIGN_MASK  = 0xff << TB_ALIGN_SHIFT
 };
 
@@ -90,6 +92,9 @@ struct X86OpTblEntry {
   uint16_t Flags;
 };
 
+// Pin the vtable to this file.
+void X86InstrInfo::anchor() {}
+
 X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
   : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
                      ? X86::ADJCALLSTACKDOWN64
@@ -298,8 +303,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::DIV64r,      X86::DIV64m,        TB_FOLDED_LOAD },
     { X86::DIV8r,       X86::DIV8m,         TB_FOLDED_LOAD },
     { X86::EXTRACTPSrr, X86::EXTRACTPSmr,   TB_FOLDED_STORE },
-    { X86::FsMOVAPDrr,  X86::MOVSDmr,       TB_FOLDED_STORE | TB_NO_REVERSE },
-    { X86::FsMOVAPSrr,  X86::MOVSSmr,       TB_FOLDED_STORE | TB_NO_REVERSE },
     { X86::IDIV16r,     X86::IDIV16m,       TB_FOLDED_LOAD },
     { X86::IDIV32r,     X86::IDIV32m,       TB_FOLDED_LOAD },
     { X86::IDIV64r,     X86::IDIV64m,       TB_FOLDED_LOAD },
@@ -356,8 +359,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::TEST8ri,     X86::TEST8mi,       TB_FOLDED_LOAD },
     // AVX 128-bit versions of foldable instructions
     { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr,  TB_FOLDED_STORE  },
-    { X86::FsVMOVAPDrr, X86::VMOVSDmr,      TB_FOLDED_STORE | TB_NO_REVERSE },
-    { X86::FsVMOVAPSrr, X86::VMOVSSmr,      TB_FOLDED_STORE | TB_NO_REVERSE },
     { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVAPDrr,   X86::VMOVAPDmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVAPSrr,   X86::VMOVAPSmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -374,7 +375,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VMOVAPSYrr,  X86::VMOVAPSYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVDQAYrr,  X86::VMOVDQAYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
-    { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE }
+    { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE },
+    // AVX-512 foldable instructions
+    { X86::VMOVPDI2DIZrr,X86::VMOVPDI2DIZmr,  TB_FOLDED_STORE }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
@@ -400,8 +403,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm,         0 },
     { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm,       0 },
     { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm,         0 },
-    { X86::FsMOVAPDrr,      X86::MOVSDrm,             TB_NO_REVERSE },
-    { X86::FsMOVAPSrr,      X86::MOVSSrm,             TB_NO_REVERSE },
     { X86::IMUL16rri,       X86::IMUL16rmi,           0 },
     { X86::IMUL16rri8,      X86::IMUL16rmi8,          0 },
     { X86::IMUL32rri,       X86::IMUL32rmi,           0 },
@@ -444,7 +445,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::MOVSX64rr8,      X86::MOVSX64rm8,          0 },
     { X86::MOVUPDrr,        X86::MOVUPDrm,            TB_ALIGN_16 },
     { X86::MOVUPSrr,        X86::MOVUPSrm,            0 },
-    { X86::MOVZDI2PDIrr,    X86::MOVZDI2PDIrm,        0 },
     { X86::MOVZQI2PQIrr,    X86::MOVZQI2PQIrm,        0 },
     { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm,     TB_ALIGN_16 },
     { X86::MOVZX16rr8,      X86::MOVZX16rm8,          0 },
@@ -493,8 +493,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VCVTSD2SIrr,     X86::VCVTSD2SIrm,         0 },
     { X86::VCVTSS2SI64rr,   X86::VCVTSS2SI64rm,       0 },
     { X86::VCVTSS2SIrr,     X86::VCVTSS2SIrm,         0 },
-    { X86::FsVMOVAPDrr,     X86::VMOVSDrm,            TB_NO_REVERSE },
-    { X86::FsVMOVAPSrr,     X86::VMOVSSrm,            TB_NO_REVERSE },
     { X86::VMOV64toPQIrr,   X86::VMOVQI2PQIrm,        0 },
     { X86::VMOV64toSDrr,    X86::VMOV64toSDrm,        0 },
     { X86::VMOVAPDrr,       X86::VMOVAPDrm,           TB_ALIGN_16 },
@@ -507,7 +505,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VMOVSHDUPrr,     X86::VMOVSHDUPrm,         TB_ALIGN_16 },
     { X86::VMOVUPDrr,       X86::VMOVUPDrm,           0 },
     { X86::VMOVUPSrr,       X86::VMOVUPSrm,           0 },
-    { X86::VMOVZDI2PDIrr,   X86::VMOVZDI2PDIrm,       0 },
     { X86::VMOVZQI2PQIrr,   X86::VMOVZQI2PQIrm,       0 },
     { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm,    TB_ALIGN_16 },
     { X86::VPABSBrr128,     X86::VPABSBrm128,         0 },
@@ -552,11 +549,27 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
     { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
 
-    // BMI/BMI2/LZCNT/POPCNT foldable instructions
+    // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
     { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
     { X86::BEXTR64rr,       X86::BEXTR64rm,           0 },
+    { X86::BEXTRI32ri,      X86::BEXTRI32mi,          0 },
+    { X86::BEXTRI64ri,      X86::BEXTRI64mi,          0 },
+    { X86::BLCFILL32rr,     X86::BLCFILL32rm,         0 },
+    { X86::BLCFILL64rr,     X86::BLCFILL64rm,         0 },
+    { X86::BLCI32rr,        X86::BLCI32rm,            0 },
+    { X86::BLCI64rr,        X86::BLCI64rm,            0 },
+    { X86::BLCIC32rr,       X86::BLCIC32rm,           0 },
+    { X86::BLCIC64rr,       X86::BLCIC64rm,           0 },
+    { X86::BLCMSK32rr,      X86::BLCMSK32rm,          0 },
+    { X86::BLCMSK64rr,      X86::BLCMSK64rm,          0 },
+    { X86::BLCS32rr,        X86::BLCS32rm,            0 },
+    { X86::BLCS64rr,        X86::BLCS64rm,            0 },
+    { X86::BLSFILL32rr,     X86::BLSFILL32rm,         0 },
+    { X86::BLSFILL64rr,     X86::BLSFILL64rm,         0 },
     { X86::BLSI32rr,        X86::BLSI32rm,            0 },
     { X86::BLSI64rr,        X86::BLSI64rm,            0 },
+    { X86::BLSIC32rr,       X86::BLSIC32rm,           0 },
+    { X86::BLSIC64rr,       X86::BLSIC64rm,           0 },
     { X86::BLSMSK32rr,      X86::BLSMSK32rm,          0 },
     { X86::BLSMSK64rr,      X86::BLSMSK64rm,          0 },
     { X86::BLSR32rr,        X86::BLSR32rm,            0 },
@@ -577,9 +590,27 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::SHRX64rr,        X86::SHRX64rm,            0 },
     { X86::SHLX32rr,        X86::SHLX32rm,            0 },
     { X86::SHLX64rr,        X86::SHLX64rm,            0 },
+    { X86::T1MSKC32rr,      X86::T1MSKC32rm,          0 },
+    { X86::T1MSKC64rr,      X86::T1MSKC64rm,          0 },
     { X86::TZCNT16rr,       X86::TZCNT16rm,           0 },
     { X86::TZCNT32rr,       X86::TZCNT32rm,           0 },
     { X86::TZCNT64rr,       X86::TZCNT64rm,           0 },
+    { X86::TZMSK32rr,       X86::TZMSK32rm,           0 },
+    { X86::TZMSK64rr,       X86::TZMSK64rm,           0 },
+
+    // AVX-512 foldable instructions
+    { X86::VMOV64toPQIZrr,  X86::VMOVQI2PQIZrm,       0 },
+    { X86::VMOVDI2SSZrr,    X86::VMOVDI2SSZrm,        0 },
+    { X86::VMOVDQA32rr,     X86::VMOVDQA32rm,         TB_ALIGN_64 },
+    { X86::VMOVDQA64rr,     X86::VMOVDQA64rm,         TB_ALIGN_64 },
+    { X86::VMOVDQU32rr,     X86::VMOVDQU32rm,         0 },
+    { X86::VMOVDQU64rr,     X86::VMOVDQU64rm,         0 },
+
+    // AES foldable instructions
+    { X86::AESIMCrr,              X86::AESIMCrm,              TB_ALIGN_16 },
+    { X86::AESKEYGENASSIST128rr,  X86::AESKEYGENASSIST128rm,  TB_ALIGN_16 },
+    { X86::VAESIMCrr,             X86::VAESIMCrm,             TB_ALIGN_16 },
+    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
@@ -1177,6 +1208,52 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::PDEP64rr,          X86::PDEP64rm,            0 },
     { X86::PEXT32rr,          X86::PEXT32rm,            0 },
     { X86::PEXT64rr,          X86::PEXT64rm,            0 },
+
+    // AVX-512 foldable instructions
+    { X86::VPADDDZrr,         X86::VPADDDZrm,           0 },
+    { X86::VPADDQZrr,         X86::VPADDQZrm,           0 },
+    { X86::VADDPSZrr,         X86::VADDPSZrm,           0 },
+    { X86::VADDPDZrr,         X86::VADDPDZrm,           0 },
+    { X86::VSUBPSZrr,         X86::VSUBPSZrm,           0 },
+    { X86::VSUBPDZrr,         X86::VSUBPDZrm,           0 },
+    { X86::VMULPSZrr,         X86::VMULPSZrm,           0 },
+    { X86::VMULPDZrr,         X86::VMULPDZrm,           0 },
+    { X86::VDIVPSZrr,         X86::VDIVPSZrm,           0 },
+    { X86::VDIVPDZrr,         X86::VDIVPDZrm,           0 },
+    { X86::VMINPSZrr,         X86::VMINPSZrm,           0 },
+    { X86::VMINPDZrr,         X86::VMINPDZrm,           0 },
+    { X86::VMAXPSZrr,         X86::VMAXPSZrm,           0 },
+    { X86::VMAXPDZrr,         X86::VMAXPDZrm,           0 },
+    { X86::VPERMPDZri,        X86::VPERMPDZmi,          0 },
+    { X86::VPERMPSZrr,        X86::VPERMPSZrm,          0 },
+    { X86::VPSLLVDZrr,        X86::VPSLLVDZrm,          0 },
+    { X86::VPSLLVQZrr,        X86::VPSLLVQZrm,          0 },
+    { X86::VPSRAVDZrr,        X86::VPSRAVDZrm,          0 },
+    { X86::VPSRLVDZrr,        X86::VPSRLVDZrm,          0 },
+    { X86::VPSRLVQZrr,        X86::VPSRLVQZrm,          0 },
+    { X86::VSHUFPDZrri,       X86::VSHUFPDZrmi,         0 },
+    { X86::VSHUFPSZrri,       X86::VSHUFPSZrmi,         0 },
+    { X86::VALIGNQrri,        X86::VALIGNQrmi,          0 },
+    { X86::VALIGNDrri,        X86::VALIGNDrmi,          0 },
+
+    // AES foldable instructions
+    { X86::AESDECLASTrr,      X86::AESDECLASTrm,        TB_ALIGN_16 },
+    { X86::AESDECrr,          X86::AESDECrm,            TB_ALIGN_16 },
+    { X86::AESENCLASTrr,      X86::AESENCLASTrm,        TB_ALIGN_16 },
+    { X86::AESENCrr,          X86::AESENCrm,            TB_ALIGN_16 },
+    { X86::VAESDECLASTrr,     X86::VAESDECLASTrm,       TB_ALIGN_16 },
+    { X86::VAESDECrr,         X86::VAESDECrm,           TB_ALIGN_16 },
+    { X86::VAESENCLASTrr,     X86::VAESENCLASTrm,       TB_ALIGN_16 },
+    { X86::VAESENCrr,         X86::VAESENCrm,           TB_ALIGN_16 },
+
+    // SHA foldable instructions
+    { X86::SHA1MSG1rr,        X86::SHA1MSG1rm,          TB_ALIGN_16 },
+    { X86::SHA1MSG2rr,        X86::SHA1MSG2rm,          TB_ALIGN_16 },
+    { X86::SHA1NEXTErr,       X86::SHA1NEXTErm,         TB_ALIGN_16 },
+    { X86::SHA1RNDS4rri,      X86::SHA1RNDS4rmi,        TB_ALIGN_16 },
+    { X86::SHA256MSG1rr,      X86::SHA256MSG1rm,        TB_ALIGN_16 },
+    { X86::SHA256MSG2rr,      X86::SHA256MSG2rm,        TB_ALIGN_16 },
+    { X86::SHA256RNDS2rr,     X86::SHA256RNDS2rm,       TB_ALIGN_16 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
@@ -1338,6 +1415,11 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_16 },
     { X86::VFMSUBADDPS4rrY,       X86::VFMSUBADDPS4rmY,       TB_ALIGN_32 },
     { X86::VFMSUBADDPD4rrY,       X86::VFMSUBADDPD4rmY,       TB_ALIGN_32 },
+    // AVX-512 VPERMI instructions with 3 source operands.
+    { X86::VPERMI2Drr,            X86::VPERMI2Drm,            0 },
+    { X86::VPERMI2Qrr,            X86::VPERMI2Qrm,            0 },
+    { X86::VPERMI2PSrr,           X86::VPERMI2PSrm,           0 },
+    { X86::VPERMI2PDrr,           X86::VPERMI2PDrm,           0 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
@@ -1454,6 +1536,8 @@ static bool isFrameLoadOpcode(int Opcode) {
   case X86::VMOVDQAYrm:
   case X86::MMX_MOVD64rm:
   case X86::MMX_MOVQ64rm:
+  case X86::VMOVDQA32rm:
+  case X86::VMOVDQA64rm:
     return true;
   }
 }
@@ -2890,23 +2974,29 @@ static bool isHReg(unsigned Reg) {
 
 // Try and copy between VR128/VR64 and GR64 registers.
 static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
-                                        bool HasAVX) {
+                                        const X86Subtarget& Subtarget) {
+
+
   // SrcReg(VR128) -> DestReg(GR64)
   // SrcReg(VR64)  -> DestReg(GR64)
   // SrcReg(GR64)  -> DestReg(VR128)
   // SrcReg(GR64)  -> DestReg(VR64)
 
+  bool HasAVX = Subtarget.hasAVX();
+  bool HasAVX512 = Subtarget.hasAVX512();
   if (X86::GR64RegClass.contains(DestReg)) {
-    if (X86::VR128RegClass.contains(SrcReg))
+    if (X86::VR128XRegClass.contains(SrcReg))
       // Copy from a VR128 register to a GR64 register.
-      return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
+      return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr :
+                                               X86::MOVPQIto64rr);
     if (X86::VR64RegClass.contains(SrcReg))
       // Copy from a VR64 register to a GR64 register.
       return X86::MOVSDto64rr;
   } else if (X86::GR64RegClass.contains(SrcReg)) {
     // Copy from a GR64 register to a VR128 register.
-    if (X86::VR128RegClass.contains(DestReg))
-      return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
+    if (X86::VR128XRegClass.contains(DestReg))
+      return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr :
+                                               X86::MOV64toPQIrr);
     // Copy from a GR64 register to a VR64 register.
     if (X86::VR64RegClass.contains(DestReg))
       return X86::MOV64toSDrr;
@@ -2915,14 +3005,30 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
   // SrcReg(FR32) -> DestReg(GR32)
   // SrcReg(GR32) -> DestReg(FR32)
 
-  if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
+  if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg))
     // Copy from a FR32 register to a GR32 register.
-    return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
+    return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr);
 
-  if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+  if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
     // Copy from a GR32 register to a FR32 register.
-    return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
+    return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr);
+  return 0;
+}
 
+static
+unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
+  if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
+      X86::VR256XRegClass.contains(DestReg, SrcReg) ||
+      X86::VR512RegClass.contains(DestReg, SrcReg)) {
+     DestReg = get512BitSuperRegister(DestReg);
+     SrcReg = get512BitSuperRegister(SrcReg);
+     return X86::VMOVAPSZrr;
+  }
+  if ((X86::VK8RegClass.contains(DestReg) ||
+       X86::VK16RegClass.contains(DestReg)) &&
+      (X86::VK8RegClass.contains(SrcReg) ||
+       X86::VK16RegClass.contains(SrcReg)))
+    return X86::KMOVWkk;
   return 0;
 }
 
@@ -2932,7 +3038,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                bool KillSrc) const {
   // First deal with the normal symmetric copies.
   bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
-  unsigned Opc;
+  bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+  unsigned Opc = 0;
   if (X86::GR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MOV64rr;
   else if (X86::GR32RegClass.contains(DestReg, SrcReg))
@@ -2950,14 +3057,17 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
              "8-bit H register can not be copied outside GR8_NOREX");
     } else
       Opc = X86::MOV8rr;
-  } else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+  }
+  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MMX_MOVQ64rr;
+  else if (HasAVX512)
+    Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg);
+  else if (X86::VR128RegClass.contains(DestReg, SrcReg))
     Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
   else if (X86::VR256RegClass.contains(DestReg, SrcReg))
     Opc = X86::VMOVAPSYrr;
-  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
-    Opc = X86::MMX_MOVQ64rr;
-  else
-    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX);
+  if (!Opc)
+    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget<X86Subtarget>());
 
   if (Opc) {
     BuildMI(MBB, MI, DL, get(Opc), DestReg)
@@ -3005,6 +3115,18 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
                                       bool isStackAligned,
                                       const TargetMachine &TM,
                                       bool load) {
+  if (TM.getSubtarget<X86Subtarget>().hasAVX512()) {
+    if (X86::VK8RegClass.hasSubClassEq(RC)  ||
+      X86::VK16RegClass.hasSubClassEq(RC))
+      return load ? X86::KMOVWkm : X86::KMOVWmk;
+    if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
+      return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
+    if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
+      return load ? X86::VMOVSDZrm : X86::VMOVSDZmr;
+    if (X86::VR512RegClass.hasSubClassEq(RC))
+      return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+  }
+
   bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
   switch (RC->getSize()) {
   default:
@@ -3046,7 +3168,8 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
     assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
     return load ? X86::LD_Fp80m : X86::ST_FpP80m;
   case 16: {
-    assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
+    assert((X86::VR128RegClass.hasSubClassEq(RC) ||
+            X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
       return load ?
@@ -3058,12 +3181,19 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
         (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
   }
   case 32:
-    assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
+    assert((X86::VR256RegClass.hasSubClassEq(RC) ||
+            X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
       return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
     else
       return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
+  case 64:
+    assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
+    if (isStackAligned)
+      return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+    else
+      return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
   }
 }
 
@@ -3090,7 +3220,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   const MachineFunction &MF = *MBB.getParent();
   assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
          "Stack slot too small for store");
-  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
   bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
     RI.canRealignStack(MF);
   unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
@@ -3106,7 +3236,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
                                   MachineInstr::mmo_iterator MMOBegin,
                                   MachineInstr::mmo_iterator MMOEnd,
                                   SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
   bool isAligned = MMOBegin != MMOEnd &&
                    (*MMOBegin)->getAlignment() >= Alignment;
   unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
@@ -3126,7 +3256,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                         const TargetRegisterClass *RC,
                                         const TargetRegisterInfo *TRI) const {
   const MachineFunction &MF = *MBB.getParent();
-  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
   bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
     RI.canRealignStack(MF);
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
@@ -3140,7 +3270,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
                                  MachineInstr::mmo_iterator MMOBegin,
                                  MachineInstr::mmo_iterator MMOEnd,
                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
   bool isAligned = MMOBegin != MMOEnd &&
                    (*MMOBegin)->getAlignment() >= Alignment;
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
@@ -3722,6 +3852,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   case X86::AVX_SET0:
     assert(HasAVX && "AVX not supported");
     return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+  case X86::AVX512_512_SET0:
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
   case X86::V_SETALLONES:
     return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
   case X86::AVX2_SETALLONES:
@@ -3729,6 +3861,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   case X86::TEST8ri_NOREX:
     MI->setDesc(get(X86::TEST8ri));
     return true;
+  case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr));
+  case X86::KSET1B:
+  case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr));
   }
   return false;
 }
@@ -3942,18 +4077,6 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
   case X86::RSQRTSSr_Int:
   case X86::SQRTSSr:
   case X86::SQRTSSr_Int:
-  // AVX encoded versions
-  case X86::VCVTSD2SSrr:
-  case X86::Int_VCVTSD2SSrr:
-  case X86::VCVTSS2SDrr:
-  case X86::Int_VCVTSS2SDrr:
-  case X86::VRCPSSr:
-  case X86::VROUNDSDr:
-  case X86::VROUNDSDr_Int:
-  case X86::VROUNDSSr:
-  case X86::VROUNDSSr_Int:
-  case X86::VRSQRTSSr:
-  case X86::VSQRTSSr:
     return true;
   }
 
@@ -3985,10 +4108,77 @@ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
   return 16;
 }
 
+// Return true for any instruction the copies the high bits of the first source
+// operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::VCVTSI2SSrr:
+  case X86::Int_VCVTSI2SSrr:
+  case X86::VCVTSI2SS64rr:
+  case X86::Int_VCVTSI2SS64rr:
+  case X86::VCVTSI2SDrr:
+  case X86::Int_VCVTSI2SDrr:
+  case X86::VCVTSI2SD64rr:
+  case X86::Int_VCVTSI2SD64rr:
+  case X86::VCVTSD2SSrr:
+  case X86::Int_VCVTSD2SSrr:
+  case X86::VCVTSS2SDrr:
+  case X86::Int_VCVTSS2SDrr:
+  case X86::VRCPSSr:
+  case X86::VROUNDSDr:
+  case X86::VROUNDSDr_Int:
+  case X86::VROUNDSSr:
+  case X86::VROUNDSSr_Int:
+  case X86::VRSQRTSSr:
+  case X86::VSQRTSSr:
+
+  // AVX-512
+  case X86::VCVTSD2SSZrr:
+  case X86::VCVTSS2SDZrr:
+    return true;
+  }
+
+  return false;
+}
+
+/// Inform the ExeDepsFix pass how many idle instructions we would like before
+/// certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+///
+/// We should to be careful *not* to catch VXOR idioms which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned X86InstrInfo::
+getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+                     const TargetRegisterInfo *TRI) const {
+  if (!hasUndefRegUpdate(MI->getOpcode()))
+    return 0;
+
+  // Set the OpNum parameter to the first source operand.
+  OpNum = 1;
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+    // Use the same magic number as getPartialRegUpdateClearance.
+    return 16;
+  }
+  return 0;
+}
+
 void X86InstrInfo::
 breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
                           const TargetRegisterInfo *TRI) const {
   unsigned Reg = MI->getOperand(OpNum).getReg();
+  // If MI kills this register, the false dependence is already broken.
+  if (MI->killsRegister(Reg, TRI))
+    return;
   if (X86::VR128RegClass.contains(Reg)) {
     // These instructions are all floating point domain, so xorps is the best
     // choice.
@@ -4008,10 +4198,75 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
   MI->addRegisterKilled(Reg, TRI, true);
 }
 
-MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                                  MachineInstr *MI,
-                                           const SmallVectorImpl<unsigned> &Ops,
-                                                  int FrameIndex) const {
+static MachineInstr* foldPatchpoint(MachineFunction &MF,
+                                    MachineInstr *MI,
+                                    const SmallVectorImpl<unsigned> &Ops,
+                                    int FrameIndex,
+                                    const TargetInstrInfo &TII) {
+  unsigned StartIdx = 0;
+  switch (MI->getOpcode()) {
+  case TargetOpcode::STACKMAP:
+    StartIdx = 2; // Skip ID, nShadowBytes.
+    break;
+  case TargetOpcode::PATCHPOINT: {
+    // For PatchPoint, the call args are not foldable.
+    PatchPointOpers opers(MI);
+    StartIdx = opers.getVarIdx();
+    break;
+  }
+  default:
+    llvm_unreachable("unexpected stackmap opcode");
+  }
+
+  // Return false if any operands requested for folding are not foldable (not
+  // part of the stackmap's live values).
+  for (SmallVectorImpl<unsigned>::const_iterator I = Ops.begin(), E = Ops.end();
+       I != E; ++I) {
+    if (*I < StartIdx)
+      return 0;
+  }
+
+  MachineInstr *NewMI =
+    MF.CreateMachineInstr(TII.get(MI->getOpcode()), MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, NewMI);
+
+  // No need to fold return, the meta data, and function arguments
+  for (unsigned i = 0; i < StartIdx; ++i)
+    MIB.addOperand(MI->getOperand(i));
+
+  for (unsigned i = StartIdx; i < MI->getNumOperands(); ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (std::find(Ops.begin(), Ops.end(), i) != Ops.end()) {
+      assert(MO.getReg() && "patchpoint can only fold a vreg operand");
+      // Compute the spill slot size and offset.
+      const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(MO.getReg());
+      unsigned SpillSize;
+      unsigned SpillOffset;
+      bool Valid = TII.getStackSlotRange(RC, MO.getSubReg(), SpillSize,
+                                         SpillOffset, &MF.getTarget());
+      if (!Valid)
+        report_fatal_error("cannot spill patchpoint subregister operand");
+
+      MIB.addOperand(MachineOperand::CreateImm(StackMaps::IndirectMemRefOp));
+      MIB.addOperand(MachineOperand::CreateImm(SpillSize));
+      MIB.addOperand(MachineOperand::CreateFI(FrameIndex));
+      addOffset(MIB, SpillOffset);
+    }
+    else
+      MIB.addOperand(MO);
+  }
+  return NewMI;
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                                    const SmallVectorImpl<unsigned> &Ops,
+                                    int FrameIndex) const {
+  // Special case stack map and patch point intrinsics.
+  if (MI->getOpcode() == TargetOpcode::STACKMAP
+      || MI->getOpcode() == TargetOpcode::PATCHPOINT) {
+    return foldPatchpoint(MF, MI, Ops, FrameIndex, *this);
+  }
   // Check switch flag
   if (NoFusing) return NULL;
 
@@ -4025,6 +4280,10 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   unsigned Size = MFI->getObjectSize(FrameIndex);
   unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+  // If the function stack isn't realigned we don't want to fold instructions
+  // that need increased alignment.
+  if (!RI.needsStackRealignment(MF))
+    Alignment = std::min(Alignment, TM.getFrameLowering()->getStackAlignment());
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     unsigned NewOpc = 0;
     unsigned RCSize = 0;
@@ -4054,6 +4313,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                                   MachineInstr *MI,
                                            const SmallVectorImpl<unsigned> &Ops,
                                                   MachineInstr *LoadMI) const {
+  // If loading from a FrameIndex, fold directly from the FrameIndex.
+  unsigned NumOps = LoadMI->getDesc().getNumOperands();
+  int FrameIndex;
+  if (isLoadFromStackSlot(LoadMI, FrameIndex))
+    return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+
   // Check switch flag
   if (NoFusing) return NULL;
 
@@ -4179,7 +4444,6 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       return NULL;
 
     // Folding a normal load. Just copy the load's address operands.
-    unsigned NumOps = LoadMI->getDesc().getNumOperands();
     for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
       MOs.push_back(LoadMI->getOperand(i));
     break;
@@ -5001,6 +5265,37 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
   case X86::VSQRTSSm:
   case X86::VSQRTSSm_Int:
   case X86::VSQRTSSr:
+  case X86::VSQRTPDZrm:
+  case X86::VSQRTPDZrr:
+  case X86::VSQRTPSZrm:
+  case X86::VSQRTPSZrr:
+  case X86::VSQRTSDZm:
+  case X86::VSQRTSDZm_Int:
+  case X86::VSQRTSDZr:
+  case X86::VSQRTSSZm_Int:
+  case X86::VSQRTSSZr:
+  case X86::VSQRTSSZm:
+  case X86::VDIVSDZrm:
+  case X86::VDIVSDZrr:
+  case X86::VDIVSSZrm:
+  case X86::VDIVSSZrr:
+
+  case X86::VGATHERQPSZrm:
+  case X86::VGATHERQPDZrm:
+  case X86::VGATHERDPDZrm:
+  case X86::VGATHERDPSZrm:
+  case X86::VPGATHERQDZrm:
+  case X86::VPGATHERQQZrm:
+  case X86::VPGATHERDDZrm:
+  case X86::VPGATHERDQZrm:
+  case X86::VSCATTERQPDZmr:
+  case X86::VSCATTERQPSZmr:
+  case X86::VSCATTERDPDZmr:
+  case X86::VSCATTERDPSZmr:
+  case X86::VPSCATTERQDZmr:
+  case X86::VPSCATTERQQZmr:
+  case X86::VPSCATTERDDZmr:
+  case X86::VPSCATTERDQZmr:
     return true;
   }
 }
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index a0d1ba7..600e392 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -152,6 +152,8 @@ class X86InstrInfo : public X86GenInstrInfo {
                             MemOp2RegOpTableType &M2RTable,
                             unsigned RegOp, unsigned MemOp, unsigned Flags);
 
+  virtual void anchor();
+
 public:
   explicit X86InstrInfo(X86TargetMachine &tm);
 
@@ -369,6 +371,8 @@ public:
 
   unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
                                         const TargetRegisterInfo *TRI) const;
+  unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+                                const TargetRegisterInfo *TRI) const;
   void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
                                  const TargetRegisterInfo *TRI) const;
 
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 0960a2a..6e5d543 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -248,11 +248,12 @@ def X86xor_flag  : SDNode<"X86ISD::XOR",  SDTBinaryArithWithFlags,
                           [SDNPCommutative]>;
 def X86and_flag  : SDNode<"X86ISD::AND",  SDTBinaryArithWithFlags,
                           [SDNPCommutative]>;
-def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>;
 
 def X86blsi   : SDNode<"X86ISD::BLSI",   SDTIntUnaryOp>;
 def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>;
 def X86blsr   : SDNode<"X86ISD::BLSR",   SDTIntUnaryOp>;
+def X86bzhi   : SDNode<"X86ISD::BZHI",   SDTIntShiftOp>;
+def X86bextr  : SDNode<"X86ISD::BEXTR",  SDTIntBinOp>;
 
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 
@@ -278,53 +279,52 @@ def ptr_rc_nosp : PointerLikeRegClass<1>;
 
 // *mem - Operand definitions for the funky X86 addressing mode operands.
 //
-def X86MemAsmOperand : AsmOperandClass { 
- let Name = "Mem"; let PredicateMethod = "isMem"; 
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
 }
-def X86Mem8AsmOperand : AsmOperandClass { 
-  let Name = "Mem8"; let PredicateMethod = "isMem8";
+def X86Mem8AsmOperand : AsmOperandClass {
+  let Name = "Mem8"; let RenderMethod = "addMemOperands";
 }
-def X86Mem16AsmOperand : AsmOperandClass { 
-  let Name = "Mem16"; let PredicateMethod = "isMem16";
+def X86Mem16AsmOperand : AsmOperandClass {
+  let Name = "Mem16"; let RenderMethod = "addMemOperands";
 }
-def X86Mem32AsmOperand : AsmOperandClass { 
-  let Name = "Mem32"; let PredicateMethod = "isMem32";
+def X86Mem32AsmOperand : AsmOperandClass {
+  let Name = "Mem32"; let RenderMethod = "addMemOperands";
 }
-def X86Mem64AsmOperand : AsmOperandClass { 
-  let Name = "Mem64"; let PredicateMethod = "isMem64";
+def X86Mem64AsmOperand : AsmOperandClass {
+  let Name = "Mem64"; let RenderMethod = "addMemOperands";
 }
-def X86Mem80AsmOperand : AsmOperandClass { 
-  let Name = "Mem80"; let PredicateMethod = "isMem80";
+def X86Mem80AsmOperand : AsmOperandClass {
+  let Name = "Mem80"; let RenderMethod = "addMemOperands";
 }
-def X86Mem128AsmOperand : AsmOperandClass { 
-  let Name = "Mem128"; let PredicateMethod = "isMem128";
+def X86Mem128AsmOperand : AsmOperandClass {
+  let Name = "Mem128"; let RenderMethod = "addMemOperands";
 }
-def X86Mem256AsmOperand : AsmOperandClass { 
-  let Name = "Mem256"; let PredicateMethod = "isMem256";
+def X86Mem256AsmOperand : AsmOperandClass {
+  let Name = "Mem256"; let RenderMethod = "addMemOperands";
+}
+def X86Mem512AsmOperand : AsmOperandClass {
+  let Name = "Mem512"; let RenderMethod = "addMemOperands";
 }
 
 // Gather mem operands
 def X86MemVX32Operand : AsmOperandClass {
-  let Name = "MemVX32"; let PredicateMethod = "isMemVX32";
+  let Name = "MemVX32"; let RenderMethod = "addMemOperands";
 }
 def X86MemVY32Operand : AsmOperandClass {
-  let Name = "MemVY32"; let PredicateMethod = "isMemVY32";
+  let Name = "MemVY32"; let RenderMethod = "addMemOperands";
+}
+def X86MemVZ32Operand : AsmOperandClass {
+  let Name = "MemVZ32"; let RenderMethod = "addMemOperands";
 }
 def X86MemVX64Operand : AsmOperandClass {
-  let Name = "MemVX64"; let PredicateMethod = "isMemVX64";
+  let Name = "MemVX64"; let RenderMethod = "addMemOperands";
 }
 def X86MemVY64Operand : AsmOperandClass {
-  let Name = "MemVY64"; let PredicateMethod = "isMemVY64";
+  let Name = "MemVY64"; let RenderMethod = "addMemOperands";
 }
-
 def X86MemVZ64Operand : AsmOperandClass {
-  let Name = "MemVZ64"; let PredicateMethod = "isMemVZ64";
-}
-def X86MemVZ32Operand : AsmOperandClass {
-  let Name = "MemVZ32"; let PredicateMethod = "isMemVZ32";
-}
-def X86Mem512AsmOperand : AsmOperandClass {
-  let Name = "Mem512"; let PredicateMethod = "isMem512";
+  let Name = "MemVZ64"; let RenderMethod = "addMemOperands";
 }
 
 def X86AbsMemAsmOperand : AsmOperandClass {
@@ -343,29 +343,29 @@ def opaque48mem : X86MemOperand<"printopaquemem">;
 def opaque80mem : X86MemOperand<"printopaquemem">;
 def opaque512mem : X86MemOperand<"printopaquemem">;
 
-def i8mem   : X86MemOperand<"printi8mem"> { 
+def i8mem   : X86MemOperand<"printi8mem"> {
   let ParserMatchClass = X86Mem8AsmOperand; }
-def i16mem  : X86MemOperand<"printi16mem"> { 
+def i16mem  : X86MemOperand<"printi16mem"> {
   let ParserMatchClass = X86Mem16AsmOperand; }
-def i32mem  : X86MemOperand<"printi32mem"> { 
+def i32mem  : X86MemOperand<"printi32mem"> {
   let ParserMatchClass = X86Mem32AsmOperand; }
-def i64mem  : X86MemOperand<"printi64mem"> { 
+def i64mem  : X86MemOperand<"printi64mem"> {
   let ParserMatchClass = X86Mem64AsmOperand; }
-def i128mem : X86MemOperand<"printi128mem"> { 
+def i128mem : X86MemOperand<"printi128mem"> {
   let ParserMatchClass = X86Mem128AsmOperand; }
-def i256mem : X86MemOperand<"printi256mem"> { 
+def i256mem : X86MemOperand<"printi256mem"> {
   let ParserMatchClass = X86Mem256AsmOperand; }
-def i512mem : X86MemOperand<"printi512mem"> { 
+def i512mem : X86MemOperand<"printi512mem"> {
   let ParserMatchClass = X86Mem512AsmOperand; }
-def f32mem  : X86MemOperand<"printf32mem"> { 
+def f32mem  : X86MemOperand<"printf32mem"> {
   let ParserMatchClass = X86Mem32AsmOperand; }
-def f64mem  : X86MemOperand<"printf64mem"> { 
+def f64mem  : X86MemOperand<"printf64mem"> {
   let ParserMatchClass = X86Mem64AsmOperand; }
-def f80mem  : X86MemOperand<"printf80mem"> { 
+def f80mem  : X86MemOperand<"printf80mem"> {
   let ParserMatchClass = X86Mem80AsmOperand; }
-def f128mem : X86MemOperand<"printf128mem"> { 
+def f128mem : X86MemOperand<"printf128mem"> {
   let ParserMatchClass = X86Mem128AsmOperand; }
-def f256mem : X86MemOperand<"printf256mem">{ 
+def f256mem : X86MemOperand<"printf256mem">{
   let ParserMatchClass = X86Mem256AsmOperand; }
 def f512mem : X86MemOperand<"printf512mem">{
   let ParserMatchClass = X86Mem512AsmOperand; }
@@ -439,17 +439,49 @@ let OperandType = "OPERAND_PCREL",
 def i32imm_pcrel : Operand<i32>;
 def i16imm_pcrel : Operand<i16>;
 
-def offset8 : Operand<i64>;
-def offset16 : Operand<i64>;
-def offset32 : Operand<i64>;
-def offset64 : Operand<i64>;
-
 // Branch targets have OtherVT type and print as pc-relative values.
 def brtarget : Operand<OtherVT>;
 def brtarget8 : Operand<OtherVT>;
 
 }
 
+def X86MemOffs8AsmOperand : AsmOperandClass {
+  let Name = "MemOffs8";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem8AsmOperand];
+}
+def X86MemOffs16AsmOperand : AsmOperandClass {
+  let Name = "MemOffs16";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem16AsmOperand];
+}
+def X86MemOffs32AsmOperand : AsmOperandClass {
+  let Name = "MemOffs32";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem32AsmOperand];
+}
+def X86MemOffs64AsmOperand : AsmOperandClass {
+  let Name = "MemOffs64";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem64AsmOperand];
+}
+
+let OperandType = "OPERAND_MEMORY" in {
+def offset8 : Operand<i64> {
+  let ParserMatchClass = X86MemOffs8AsmOperand;
+  let PrintMethod = "printMemOffs8"; }
+def offset16 : Operand<i64> {
+  let ParserMatchClass = X86MemOffs16AsmOperand;
+  let PrintMethod = "printMemOffs16"; }
+def offset32 : Operand<i64> {
+  let ParserMatchClass = X86MemOffs32AsmOperand;
+  let PrintMethod = "printMemOffs32"; }
+def offset64 : Operand<i64> {
+  let ParserMatchClass = X86MemOffs64AsmOperand;
+  let PrintMethod = "printMemOffs64"; }
+}
+
+
 def SSECC : Operand<i8> {
   let PrintMethod = "printSSECC";
   let OperandType = "OPERAND_IMMEDIATE";
@@ -470,6 +502,14 @@ class ImmZExtAsmOperandClass : AsmOperandClass {
   let RenderMethod = "addImmOperands";
 }
 
+def X86GR32orGR64AsmOperand : AsmOperandClass {
+  let Name = "GR32orGR64";
+}
+
+def GR32orGR64 : RegisterOperand<GR32> {
+  let ParserMatchClass = X86GR32orGR64AsmOperand;
+}
+
 // Sign-extended immediate classes. We don't need to define the full lattice
 // here because there is no instruction with an ambiguity between ImmSExti64i32
 // and ImmSExti32i8.
@@ -617,13 +657,13 @@ def HasSSE4A     : Predicate<"Subtarget->hasSSE4A()">;
 def HasAVX       : Predicate<"Subtarget->hasAVX()">;
 def HasAVX2      : Predicate<"Subtarget->hasAVX2()">;
 def HasAVX1Only  : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
-def HasAVX512      : Predicate<"Subtarget->hasAVX512()">;
+def HasAVX512    : Predicate<"Subtarget->hasAVX512()">;
 def UseAVX       : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
 def UseAVX2      : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
 def NoAVX512       : Predicate<"!Subtarget->hasAVX512()">;
 def HasCDI       : Predicate<"Subtarget->hasCDI()">;
 def HasPFI       : Predicate<"Subtarget->hasPFI()">;
-def HasEMI       : Predicate<"Subtarget->hasERI()">;
+def HasERI       : Predicate<"Subtarget->hasERI()">;
 
 def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;
 def HasAES       : Predicate<"Subtarget->hasAES()">;
@@ -632,6 +672,7 @@ def HasFMA       : Predicate<"Subtarget->hasFMA()">;
 def UseFMAOnAVX  : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
 def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
 def HasXOP       : Predicate<"Subtarget->hasXOP()">;
+def HasTBM       : Predicate<"Subtarget->hasTBM()">;
 def HasMOVBE     : Predicate<"Subtarget->hasMOVBE()">;
 def HasRDRAND    : Predicate<"Subtarget->hasRDRAND()">;
 def HasF16C      : Predicate<"Subtarget->hasF16C()">;
@@ -643,9 +684,10 @@ def HasRTM       : Predicate<"Subtarget->hasRTM()">;
 def HasHLE       : Predicate<"Subtarget->hasHLE()">;
 def HasTSX       : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
 def HasADX       : Predicate<"Subtarget->hasADX()">;
+def HasSHA       : Predicate<"Subtarget->hasSHA()">;
 def HasPRFCHW    : Predicate<"Subtarget->hasPRFCHW()">;
 def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">;
-def HasPrefetchW : Predicate<"Subtarget->has3DNow() || Subtarget->hasPRFCHW()">;
+def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
 def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
 def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
 def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
@@ -944,53 +986,56 @@ let Defs = [EFLAGS] in {
 def BSF16rr  : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                  "bsf{w}\t{$src, $dst|$dst, $src}",
                  [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
-                  IIC_BSF>, TB, OpSize, Sched<[WriteShift]>;
+                  IIC_BIT_SCAN_REG>, TB, OpSize, Sched<[WriteShift]>;
 def BSF16rm  : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                  "bsf{w}\t{$src, $dst|$dst, $src}",
                  [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
-                  IIC_BSF>, TB, OpSize, Sched<[WriteShiftLd]>;
+                  IIC_BIT_SCAN_MEM>, TB, OpSize, Sched<[WriteShiftLd]>;
 def BSF32rr  : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                  "bsf{l}\t{$src, $dst|$dst, $src}",
-                 [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB,
+                 [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
+                 IIC_BIT_SCAN_REG>, TB,
                Sched<[WriteShift]>;
 def BSF32rm  : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                  "bsf{l}\t{$src, $dst|$dst, $src}",
                  [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
-                 IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+                 IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
 def BSF64rr  : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                   "bsf{q}\t{$src, $dst|$dst, $src}",
                   [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
-                  IIC_BSF>, TB, Sched<[WriteShift]>;
+                  IIC_BIT_SCAN_REG>, TB, Sched<[WriteShift]>;
 def BSF64rm  : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                   "bsf{q}\t{$src, $dst|$dst, $src}",
                   [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
-                  IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+                  IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
 
 def BSR16rr  : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                  "bsr{w}\t{$src, $dst|$dst, $src}",
-                 [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>,
+                 [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
+                 IIC_BIT_SCAN_REG>,
                  TB, OpSize, Sched<[WriteShift]>;
 def BSR16rm  : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                  "bsr{w}\t{$src, $dst|$dst, $src}",
                  [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
-                 IIC_BSR>, TB,
+                 IIC_BIT_SCAN_MEM>, TB,
                  OpSize, Sched<[WriteShiftLd]>;
 def BSR32rr  : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                  "bsr{l}\t{$src, $dst|$dst, $src}",
-                 [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB,
+                 [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
+                 IIC_BIT_SCAN_REG>, TB,
                Sched<[WriteShift]>;
 def BSR32rm  : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                  "bsr{l}\t{$src, $dst|$dst, $src}",
                  [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
-                 IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+                 IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
 def BSR64rr  : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                   "bsr{q}\t{$src, $dst|$dst, $src}",
-                  [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB,
+                  [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BIT_SCAN_REG>, TB,
                Sched<[WriteShift]>;
 def BSR64rm  : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                   "bsr{q}\t{$src, $dst|$dst, $src}",
                   [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
-                  IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+                  IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
 } // Defs = [EFLAGS]
 
 let SchedRW = [WriteMicrocoded] in {
@@ -1072,9 +1117,12 @@ def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
                       [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
 } // SchedRW
 
+let hasSideEffects = 0 in {
+
 /// moffs8, moffs16 and moffs32 versions of moves.  The immediate is a
 /// 32-bit offset from the PC.  These are only valid in x86-32 mode.
 let SchedRW = [WriteALU] in {
+let mayLoad = 1 in {
 def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
                    "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
                    Requires<[In32BitMode]>;
@@ -1084,6 +1132,8 @@ def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
 def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
                       "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
                      Requires<[In32BitMode]>;
+}
+let mayStore = 1 in {
 def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
                    "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
                   Requires<[In32BitMode]>;
@@ -1094,34 +1144,40 @@ def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
                       "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
                      Requires<[In32BitMode]>;
 }
+}
 
 // These forms all have full 64-bit absolute addresses in their instructions
 // and use the movabs mnemonic to indicate this specific form.
-def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset64:$src),
+let mayLoad = 1 in {
+def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset8:$src),
                      "movabs{b}\t{$src, %al|al, $src}", []>,
                      Requires<[In64BitMode]>;
-def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src),
+def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset16:$src),
                      "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize,
                      Requires<[In64BitMode]>;
-def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src),
+def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset32:$src),
                      "movabs{l}\t{$src, %eax|eax, $src}", []>,
                      Requires<[In64BitMode]>;
 def MOV64o64a : RIi64<0xA1, RawFrm, (outs), (ins offset64:$src),
                      "movabs{q}\t{$src, %rax|rax, $src}", []>,
                      Requires<[In64BitMode]>;
+}
 
-def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset64:$dst), (ins),
+let mayStore = 1 in {
+def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset8:$dst), (ins),
                      "movabs{b}\t{%al, $dst|$dst, al}", []>,
                      Requires<[In64BitMode]>;
-def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins),
+def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset16:$dst), (ins),
                      "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize,
                      Requires<[In64BitMode]>;
-def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins),
+def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset32:$dst), (ins),
                      "movabs{l}\t{%eax, $dst|$dst, eax}", []>,
                      Requires<[In64BitMode]>;
 def MOV64ao64 : RIi64<0xA3, RawFrm, (outs offset64:$dst), (ins),
                      "movabs{q}\t{%rax, $dst|$dst, rax}", []>,
                      Requires<[In64BitMode]>;
+}
+} // hasSideEffects = 0
 
 let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
 def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
@@ -1173,7 +1229,7 @@ def MOV8rr_NOREX : I<0x88, MRMDestReg,
                      (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
                      "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [], IIC_MOV>,
                    Sched<[WriteMove]>;
-let mayStore = 1 in
+let mayStore = 1, neverHasSideEffects = 1 in
 def MOV8mr_NOREX : I<0x88, MRMDestMem,
                      (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
                      "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [],
@@ -1814,6 +1870,30 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in {
                                int_x86_bmi_bzhi_64, loadi64>, VEX_W;
 }
 
+def : Pat<(X86bzhi GR32:$src1, GR8:$src2),
+          (BZHI32rr GR32:$src1,
+                    (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+def : Pat<(X86bzhi (loadi32 addr:$src1), GR8:$src2),
+          (BZHI32rm addr:$src1,
+                    (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+def : Pat<(X86bzhi GR64:$src1, GR8:$src2),
+          (BZHI64rr GR64:$src1,
+                    (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+def : Pat<(X86bzhi (loadi64 addr:$src1), GR8:$src2),
+          (BZHI64rm addr:$src1,
+                    (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+let Predicates = [HasBMI] in {
+  def : Pat<(X86bextr GR32:$src1, GR32:$src2),
+            (BEXTR32rr GR32:$src1, GR32:$src2)>;
+  def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
+            (BEXTR32rm addr:$src1, GR32:$src2)>;
+  def : Pat<(X86bextr GR64:$src1, GR64:$src2),
+            (BEXTR64rr GR64:$src1, GR64:$src2)>;
+  def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
+            (BEXTR64rm addr:$src1, GR64:$src2)>;
+} // HasBMI
+
 multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
                          X86MemOperand x86memop, Intrinsic Int,
                          PatFrag ld_frag> {
@@ -1838,6 +1918,134 @@ let Predicates = [HasBMI2] in {
 }
 
 //===----------------------------------------------------------------------===//
+// TBM Instructions
+//
+let Predicates = [HasTBM], Defs = [EFLAGS] in {
+
+multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
+                                X86MemOperand x86memop, PatFrag ld_frag,
+                                Intrinsic Int, Operand immtype,
+                                SDPatternOperator immoperator> {
+  def ri : Ii32<opc,  MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
+                !strconcat(OpcodeStr,
+                           "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+                [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>,
+           XOP, XOPA, VEX;
+  def mi : Ii32<opc,  MRMSrcMem, (outs RC:$dst),
+                (ins x86memop:$src1, immtype:$cntl),
+                !strconcat(OpcodeStr,
+                           "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+                [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>,
+           XOP, XOPA, VEX;
+}
+
+defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
+                                     int_x86_tbm_bextri_u32, i32imm, imm>;
+defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
+                                     int_x86_tbm_bextri_u64, i64i32imm,
+                                     i64immSExt32>, VEX_W;
+
+multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
+                         RegisterClass RC, string OpcodeStr,
+                         X86MemOperand x86memop, PatFrag ld_frag> {
+let hasSideEffects = 0 in {
+  def rr : I<opc,  FormReg, (outs RC:$dst), (ins RC:$src),
+             !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+             []>, XOP, XOP9, VEX_4V;
+  let mayLoad = 1 in
+  def rm : I<opc,  FormMem, (outs RC:$dst), (ins x86memop:$src),
+             !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+             []>, XOP, XOP9, VEX_4V;
+}
+}
+
+multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
+                           Format FormReg, Format FormMem> {
+  defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem,
+                               loadi32>;
+  defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem,
+                               loadi64>, VEX_W;
+}
+
+defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>;
+defm BLCI    : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>;
+defm BLCIC   : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>;
+defm BLCMSK  : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>;
+defm BLCS    : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>;
+defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>;
+defm BLSIC   : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>;
+defm T1MSKC  : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
+defm TZMSK   : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
+} // HasTBM, EFLAGS
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate TBM instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasTBM] in {
+  def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
+            (BEXTRI32ri GR32:$src1, imm:$src2)>;
+  def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
+            (BEXTRI32mi addr:$src1, imm:$src2)>;
+  def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
+            (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
+  def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
+            (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
+
+  // FIXME: patterns for the load versions are not implemented
+  def : Pat<(and GR32:$src, (add GR32:$src, 1)),
+            (BLCFILL32rr GR32:$src)>;
+  def : Pat<(and GR64:$src, (add GR64:$src, 1)),
+            (BLCFILL64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (not (add GR32:$src, 1))),
+            (BLCI32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (not (add GR64:$src, 1))),
+            (BLCI64rr GR64:$src)>;
+
+  // Extra patterns because opt can optimize the above patterns to this.
+  def : Pat<(or GR32:$src, (sub -2, GR32:$src)),
+            (BLCI32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (sub -2, GR64:$src)),
+            (BLCI64rr GR64:$src)>;
+
+  def : Pat<(and (not GR32:$src), (add GR32:$src, 1)),
+            (BLCIC32rr GR32:$src)>;
+  def : Pat<(and (not GR64:$src), (add GR64:$src, 1)),
+            (BLCIC64rr GR64:$src)>;
+
+  def : Pat<(xor GR32:$src, (add GR32:$src, 1)),
+            (BLCMSK32rr GR32:$src)>;
+  def : Pat<(xor GR64:$src, (add GR64:$src, 1)),
+            (BLCMSK64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (add GR32:$src, 1)),
+            (BLCS32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (add GR64:$src, 1)),
+            (BLCS64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (add GR32:$src, -1)),
+            (BLSFILL32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (add GR64:$src, -1)),
+            (BLSFILL64rr GR64:$src)>;
+
+  def : Pat<(or (not GR32:$src), (add GR32:$src, -1)),
+            (BLSIC32rr GR32:$src)>;
+  def : Pat<(or (not GR64:$src), (add GR64:$src, -1)),
+            (BLSIC64rr GR64:$src)>;
+
+  def : Pat<(or (not GR32:$src), (add GR32:$src, 1)),
+            (T1MSKC32rr GR32:$src)>;
+  def : Pat<(or (not GR64:$src), (add GR64:$src, 1)),
+            (T1MSKC64rr GR64:$src)>;
+
+  def : Pat<(and (not GR32:$src), (add GR32:$src, -1)),
+            (TZMSK32rr GR32:$src)>;
+  def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
+            (TZMSK64rr GR64:$src)>;
+} // HasTBM
+
+//===----------------------------------------------------------------------===//
 // Subsystems.
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index cb12956..ba58143 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -204,7 +204,7 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
 //===----------------------------------------------------------------------===//
 
 def MMX_EMMS  : MMXI<0x77, RawFrm, (outs), (ins), "emms",
-                     [(int_x86_mmx_emms)]>;
+                     [(int_x86_mmx_emms)], IIC_MMX_EMMS>;
 
 //===----------------------------------------------------------------------===//
 // MMX Scalar Instructions
@@ -236,10 +236,10 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
                           (MMX_X86movd2w (x86mmx VR64:$src)))],
                           IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
 
-let neverHasSideEffects = 1 in
 def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
                              "movd\t{$src, $dst|$dst, $src}",
-                             [], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+                             [(set VR64:$dst, (bitconvert GR64:$src))],
+                             IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
 
 // These are 64 bit moves, but since the OS X assembler doesn't
 // recognize a register-register movq, we write them as
@@ -250,10 +250,6 @@ def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
                                "movd\t{$src, $dst|$dst, $src}", 
                              [(set GR64:$dst,
                               (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
-def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
-                             "movd\t{$src, $dst|$dst, $src}",
-                             [(set VR64:$dst,
-                              (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>;
 let neverHasSideEffects = 1 in
 def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
                         "movq\t{$src, $dst|$dst, $src}", [],
@@ -289,7 +285,7 @@ def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
                                     (i64 (bitconvert (x86mmx VR64:$src))))))],
                               IIC_MMX_MOVQ_RR>;
 
-let neverHasSideEffects = 1 in
+let isCodeGenOnly = 1, hasSideEffects = 1 in {
 def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
                                (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
                                [], IIC_MMX_MOVQ_RR>;
@@ -297,6 +293,7 @@ def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
 def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
                               (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
                               [], IIC_MMX_MOVQ_RR>;
+}
 } // SchedRW
 
 def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
@@ -304,21 +301,15 @@ def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                          [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
                          IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
 
-let AddedComplexity = 15 in
-// movd to MMX register zero-extends
-def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
-                             "movd\t{$src, $dst|$dst, $src}",
-              [(set VR64:$dst,
-                    (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))],
-                            IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
-let AddedComplexity = 20 in
-def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
-                           (ins i32mem:$src),
-                             "movd\t{$src, $dst|$dst, $src}",
-          [(set VR64:$dst,
-                (x86mmx (X86vzmovl (x86mmx
-                                   (scalar_to_vector (loadi32 addr:$src))))))],
-                            IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+let Predicates = [HasMMX] in {
+  let AddedComplexity = 15 in
+  // movd to MMX register zero-extends
+  def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
+            (MMX_MOVD64rr GR32:$src)>;
+  let AddedComplexity = 20 in
+  def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
+            (MMX_MOVD64rm addr:$src)>;
+}
 
 // Arithmetic Instructions
 defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
@@ -555,18 +546,18 @@ let Constraints = "$src1 = $dst" in {
 
 // Extract / Insert
 def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
-                           (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2),
-                           "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                           [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1,
-                                             (iPTR imm:$src2)))],
-                           IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+                       (outs GR32orGR64:$dst), (ins VR64:$src1, i32i8imm:$src2),
+                       "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+                                         (iPTR imm:$src2)))],
+                       IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
 let Constraints = "$src1 = $dst" in {
   def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
                       (outs VR64:$dst), 
-                      (ins VR64:$src1, GR32:$src2, i32i8imm:$src3),
+                      (ins VR64:$src1, GR32orGR64:$src2, i32i8imm:$src3),
                       "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                       [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
-                                        GR32:$src2, (iPTR imm:$src3)))],
+                                        GR32orGR64:$src2, (iPTR imm:$src3)))],
                       IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
 
   def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
@@ -580,9 +571,10 @@ let Constraints = "$src1 = $dst" in {
 }
 
 // Mask creation
-def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+                          (ins VR64:$src),
                           "pmovmskb\t{$src, $dst|$dst, $src}",
-                          [(set GR32:$dst, 
+                          [(set GR32orGR64:$dst,
                                 (int_x86_mmx_pmovmskb VR64:$src))]>;
 
 
@@ -599,10 +591,10 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
 // Misc.
 let SchedRW = [WriteShuffle] in {
 let Uses = [EDI] in
-def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
-                        "maskmovq\t{$mask, $src|$src, $mask}",
-                        [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
-                        IIC_MMX_MASKMOV>;
+def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+                          "maskmovq\t{$mask, $src|$src, $mask}",
+                          [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+                          IIC_MMX_MASKMOV>;
 let Uses = [RDI] in
 def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
                            "maskmovq\t{$mask, $src|$src, $mask}",
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index a86006a..a5debc0 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -151,6 +151,34 @@ def SSE_MOVU_ITINS : OpndItins<
   IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
 >;
 
+def SSE_DPPD_ITINS : OpndItins<
+  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
+>;
+
+def SSE_DPPS_ITINS : OpndItins<
+  IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM
+>;
+
+def DEFAULT_ITINS : OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+def SSE_EXTRACT_ITINS : OpndItins<
+  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
+>;
+
+def SSE_INSERT_ITINS : OpndItins<
+  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
+>;
+
+def SSE_MPSADBW_ITINS : OpndItins<
+  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
+>;
+
+def SSE_PMULLD_ITINS : OpndItins<
+  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
+>;
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 Instructions Classes
 //===----------------------------------------------------------------------===//
@@ -455,10 +483,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
 // SSE 1 & 2 - Move FP Scalar Instructions
 //
 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
-// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
-// is used instead. Register-to-register movss/movsd is not modeled as an
-// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
-// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
+// register copies because it's a partial register update; Register-to-register
+// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
+// that the insert be implementable in terms of a copy, and just mentioned, we
+// don't use movss/movsd for copies.
 //===----------------------------------------------------------------------===//
 
 multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
@@ -526,7 +554,7 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
 }
 
 // Patterns
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
   let AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVS{S,D} to the lower bits.
@@ -1074,23 +1102,6 @@ let Predicates = [UseSSE1] in {
             (MOVUPSmr addr:$dst, VR128:$src)>;
 }
 
-// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
-// bits are disregarded. FIXME: Set encoding to pseudo!
-let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
-def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
-                       "movaps\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_MOVA_P_RR>, VEX;
-def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
-                       "movapd\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_MOVA_P_RR>, VEX;
-def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
-                     "movaps\t{$src, $dst|$dst, $src}", [],
-                     IIC_SSE_MOVA_P_RR>;
-def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
-                     "movapd\t{$src, $dst|$dst, $src}", [],
-                     IIC_SSE_MOVA_P_RR>;
-}
-
 // Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
 // bits are disregarded. FIXME: Set encoding to pseudo!
 let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
@@ -1103,15 +1114,15 @@ let isCodeGenOnly = 1 in {
                          "movapd\t{$src, $dst|$dst, $src}",
                          [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                          IIC_SSE_MOVA_P_RM>, VEX;
+  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+                       "movaps\t{$src, $dst|$dst, $src}",
+                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+                       IIC_SSE_MOVA_P_RM>;
+  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+                       "movapd\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+                       IIC_SSE_MOVA_P_RM>;
 }
-def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
-                     "movaps\t{$src, $dst|$dst, $src}",
-                     [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
-                     IIC_SSE_MOVA_P_RM>;
-def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
-                     "movapd\t{$src, $dst|$dst, $src}",
-                     [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
-                     IIC_SSE_MOVA_P_RM>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1327,7 +1338,7 @@ let Predicates = [UseSSE2] in {
 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
 //===----------------------------------------------------------------------===//
 
-let AddedComplexity = 20 in {
+let AddedComplexity = 20, Predicates = [UseAVX] in {
   def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                        (ins VR128:$src1, VR128:$src2),
                       "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1358,7 +1369,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
                         IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
 }
 
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
   // MOVLHPS patterns
   def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
             (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
@@ -1440,7 +1451,7 @@ let neverHasSideEffects = 1 in {
 
 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                           X86MemOperand x86memop, string asm> {
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
   def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
               !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
            Sched<[WriteCvtI2F]>;
@@ -1452,6 +1463,7 @@ let neverHasSideEffects = 1 in {
 } // neverHasSideEffects = 1
 }
 
+let Predicates = [UseAVX] in {
 defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                 "cvttss2si\t{$src, $dst|$dst, $src}",
                                 SSE_CVT_SS2SI_32>,
@@ -1485,7 +1497,7 @@ def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                 (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                 (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
-
+}
 // The assembler can recognize rr 64-bit instructions by seeing a rxx
 // register, but the same isn't true when only using memory operands,
 // provide other assembly "l" and "q" forms to address this explicitly
@@ -1499,12 +1511,12 @@ defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
 defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
                                   XD, VEX_4V, VEX_W, VEX_LIG;
 
-def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+let Predicates = [UseAVX] in {
+  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                 (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
-def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                 (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
 
-let Predicates = [HasAVX] in {
   def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
             (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
   def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -1606,19 +1618,21 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
               itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
+let Predicates = [UseAVX] in {
 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                   int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                   SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
 defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                     int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                     SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
-
+}
 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                  sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                    sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
 
 
+let Predicates = [UseAVX] in {
 defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
           int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
           SSE_CVT_Scalar, 0>, XS, VEX_4V;
@@ -1633,7 +1647,7 @@ defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
           int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
           SSE_CVT_Scalar, 0>, XD,
           VEX_4V, VEX_W;
-
+}
 let Constraints = "$src1 = $dst" in {
   defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                         int_x86_sse_cvtsi2ss, i32mem, loadi32,
@@ -1652,6 +1666,7 @@ let Constraints = "$src1 = $dst" in {
 /// SSE 1 Only
 
 // Aliases for intrinsics
+let Predicates = [UseAVX] in {
 defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                     ssmem, sse_load_f32, "cvttss2si",
                                     SSE_CVT_SS2SI_32>, XS, VEX;
@@ -1666,6 +1681,7 @@ defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                   "cvttsd2si", SSE_CVT_SD2SI>,
                                   XD, VEX, VEX_W;
+}
 defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                     ssmem, sse_load_f32, "cvttss2si",
                                     SSE_CVT_SS2SI_32>, XS;
@@ -1679,13 +1695,14 @@ defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                   "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
 
+let Predicates = [UseAVX] in {
 defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                   ssmem, sse_load_f32, "cvtss2si",
                                   SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
 defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                   ssmem, sse_load_f32, "cvtss2si",
                                   SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
-
+}
 defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                ssmem, sse_load_f32, "cvtss2si",
                                SSE_CVT_SS2SI_32>, XS;
@@ -1707,6 +1724,7 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                             SSEPackedSingle, SSE_CVT_PS>,
                             TB, Requires<[UseSSE2]>;
 
+let Predicates = [UseAVX] in {
 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                 (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1723,6 +1741,7 @@ def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                 (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                 (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+}
 
 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                 (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
@@ -1744,7 +1763,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
 /// SSE 2 Only
 
 // Convert scalar double to scalar single
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
 def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR64:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
@@ -1760,7 +1779,7 @@ def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
 }
 
 def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
-          Requires<[HasAVX]>;
+          Requires<[UseAVX]>;
 
 def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                       "cvtsd2ss\t{$src, $dst|$dst, $src}",
@@ -1778,27 +1797,27 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
-                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
+                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>,
                        Sched<[WriteCvtF2F]>;
 def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                           VR128:$src1, sse_load_f64:$src2))],
-                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
+                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>,
                        Sched<[WriteCvtF2FLd, ReadAfterLd]>;
 
 let Constraints = "$src1 = $dst" in {
 def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                        IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
                        Sched<[WriteCvtF2F]>;
 def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
-                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                           VR128:$src1, sse_load_f64:$src2))],
                        IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
@@ -1807,7 +1826,7 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
 
 // Convert scalar single to scalar double
 // SSE2 instructions with XS prefix
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                     (ins FR32:$src1, FR32:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1824,16 +1843,16 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
 }
 
 def : Pat<(f64 (fextend FR32:$src)),
-    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
+    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
 def : Pat<(fextend (loadf32 addr:$src)),
-    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
 
 def : Pat<(extloadf32 addr:$src),
     (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
-    Requires<[HasAVX, OptForSize]>;
+    Requires<[UseAVX, OptForSize]>;
 def : Pat<(extloadf32 addr:$src),
     (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
-    Requires<[HasAVX, OptForSpeed]>;
+    Requires<[UseAVX, OptForSpeed]>;
 
 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                    "cvtss2sd\t{$src, $dst|$dst, $src}",
@@ -1861,14 +1880,14 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
-                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
+                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>,
                     Sched<[WriteCvtF2F]>;
 def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
-                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
+                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>,
                     Sched<[WriteCvtF2FLd, ReadAfterLd]>;
 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
 def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
@@ -1895,7 +1914,7 @@ def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
                        IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
@@ -1905,7 +1924,7 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
-                          (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
+                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
                         IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtps2dq\t{$src, $dst|$dst, $src}",
@@ -1934,7 +1953,7 @@ def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
 def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX,
+                         (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
                        Sched<[WriteCvtF2ILd]>;
 
 // YMM only
@@ -1946,7 +1965,7 @@ def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))]>,
+                         (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
 def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
                 (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
@@ -1972,7 +1991,7 @@ def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvttps2dq
-                                            (memopv4f32 addr:$src)))],
+                                            (loadv4f32 addr:$src)))],
                          IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                           "cvttps2dq\t{$src, $dst|$dst, $src}",
@@ -1982,7 +2001,7 @@ def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                           "cvttps2dq\t{$src, $dst|$dst, $src}",
                           [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
-                                             (memopv8f32 addr:$src)))],
+                                             (loadv8f32 addr:$src)))],
                           IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
                           Sched<[WriteCvtF2ILd]>;
 
@@ -1999,27 +2018,27 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 let Predicates = [HasAVX] in {
   def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
             (VCVTDQ2PSrr VR128:$src)>;
-  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
             (VCVTDQ2PSrm addr:$src)>;
 
   def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
             (VCVTDQ2PSrr VR128:$src)>;
-  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
             (VCVTDQ2PSrm addr:$src)>;
 
   def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
             (VCVTTPS2DQrr VR128:$src)>;
-  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
             (VCVTTPS2DQrm addr:$src)>;
 
   def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
             (VCVTDQ2PSYrr VR256:$src)>;
-  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))),
+  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
             (VCVTDQ2PSYrm addr:$src)>;
 
   def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
             (VCVTTPS2DQYrr VR256:$src)>;
-  def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))),
+  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
             (VCVTTPS2DQYrm addr:$src)>;
 }
 
@@ -2056,7 +2075,7 @@ def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
 def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvttpd2dqx\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
-                                            (memopv2f64 addr:$src)))],
+                                            (loadv2f64 addr:$src)))],
                          IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
 
 // YMM only
@@ -2068,7 +2087,7 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                          "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
-                          (int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))],
+                          (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
                          IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
 def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                 (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
@@ -2076,7 +2095,7 @@ def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
 let Predicates = [HasAVX] in {
   def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
             (VCVTTPD2DQYrr VR256:$src)>;
-  def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
+  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
             (VCVTTPD2DQYrm addr:$src)>;
 } // Predicates = [HasAVX]
 
@@ -2110,7 +2129,7 @@ def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                      "vcvtps2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst,
-                       (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
+                       (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
                      IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
 }
 
@@ -2140,7 +2159,7 @@ def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                      "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst,
                        (int_x86_avx_cvtdq2_pd_256
-                        (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L,
+                        (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
                     Sched<[WriteCvtI2FLd]>;
 def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                      "vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2162,7 +2181,7 @@ def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 let Predicates = [HasAVX] in {
   def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
             (VCVTDQ2PDYrr VR128:$src)>;
-  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
             (VCVTDQ2PDYrm addr:$src)>;
 } // Predicates = [HasAVX]
 
@@ -2181,7 +2200,7 @@ def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
 def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvtpd2psx\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                          (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
 
 // YMM only
@@ -2193,7 +2212,7 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                          (int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))],
+                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
 def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
                 (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
@@ -2215,13 +2234,13 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 let Predicates = [HasAVX] in {
   def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
             (VCVTDQ2PSYrr VR256:$src)>;
-  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
+  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
             (VCVTDQ2PSYrm addr:$src)>;
 
   // Match fround and fextend for 128/256-bit conversions
   def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
             (VCVTPD2PSrr VR128:$src)>;
-  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+  def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
             (VCVTPD2PSXrm addr:$src)>;
   def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
             (VCVTPD2PSYrr VR256:$src)>;
@@ -2299,7 +2318,7 @@ let Constraints = "$src1 = $dst" in {
   defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64,
                   "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                   "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                  SSE_ALU_F32S>, // same latency as 32 bit compare
+                  SSE_ALU_F64S>,
                   XD;
 }
 
@@ -2334,7 +2353,7 @@ let Constraints = "$src1 = $dst" in {
                        SSE_ALU_F32S>, XS;
   defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                        "cmp${cc}sd\t{$src, $dst|$dst, $src}",
-                       SSE_ALU_F32S>, // same latency as f32
+                       SSE_ALU_F64S>,
                        XD;
 }
 
@@ -2403,26 +2422,27 @@ let Defs = [EFLAGS] in {
 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                             Operand CC, Intrinsic Int, string asm,
-                            string asm_alt, Domain d> {
+                            string asm_alt, Domain d,
+                            OpndItins itins = SSE_ALU_F32P> {
   def rri : PIi8<0xC2, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
              [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
-             IIC_SSE_CMPP_RR, d>,
+             itins.rr, d>,
             Sched<[WriteFAdd]>;
   def rmi : PIi8<0xC2, MRMSrcMem,
              (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
              [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
-             IIC_SSE_CMPP_RM, d>,
+             itins.rm, d>,
             Sched<[WriteFAddLd, ReadAfterLd]>;
 
   // Accept explicit immediate argument form instead of comparison code.
   let neverHasSideEffects = 1 in {
     def rri_alt : PIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
-               asm_alt, [], IIC_SSE_CMPP_RR, d>, Sched<[WriteFAdd]>;
+               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
     def rmi_alt : PIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
-               asm_alt, [], IIC_SSE_CMPP_RM, d>,
+               asm_alt, [], itins.rm, d>,
                Sched<[WriteFAddLd, ReadAfterLd]>;
   }
 }
@@ -2447,11 +2467,11 @@ let Constraints = "$src1 = $dst" in {
   defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                  "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                  "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                 SSEPackedSingle>, TB;
+                 SSEPackedSingle, SSE_ALU_F32P>, TB;
   defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                  "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                  "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                 SSEPackedDouble>, TB, OpSize;
+                 SSEPackedDouble, SSE_ALU_F64P>, TB, OpSize;
 }
 
 let Predicates = [HasAVX] in {
@@ -2511,16 +2531,16 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
 
 defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
            "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           memopv4f32, SSEPackedSingle>, TB, VEX_4V;
+           loadv4f32, SSEPackedSingle>, TB, VEX_4V;
 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
            "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
+           loadv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
 defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
-           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
-           memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
-           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
-           memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
 
 let Constraints = "$src1 = $dst" in {
   defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2535,13 +2555,13 @@ let Constraints = "$src1 = $dst" in {
 
 let Predicates = [HasAVX] in {
   def : Pat<(v4i32 (X86Shufp VR128:$src1,
-                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+                       (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
             (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
   def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
             (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
 
   def : Pat<(v2i64 (X86Shufp VR128:$src1,
-                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
+                       (loadv2i64 addr:$src2), (i8 imm:$imm))),
             (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
   def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
             (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
@@ -2550,13 +2570,13 @@ let Predicates = [HasAVX] in {
   def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
             (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
   def : Pat<(v8i32 (X86Shufp VR256:$src1,
-                      (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+                      (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
             (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
 
   def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
             (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
   def : Pat<(v4i64 (X86Shufp VR256:$src1,
-                              (memopv4i64 addr:$src2), (i8 imm:$imm))),
+                              (loadv4i64 addr:$src2), (i8 imm:$imm))),
             (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
 }
 
@@ -2600,29 +2620,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
              Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
-defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
       VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedSingle>, TB, VEX_4V;
-defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
       VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedDouble>, TB, OpSize, VEX_4V;
-defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
       VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedSingle>, TB, VEX_4V;
-defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
       VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedDouble>, TB, OpSize, VEX_4V;
 
-defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32,
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
       VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedSingle>, TB, VEX_4V, VEX_L;
-defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64,
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
       VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
-defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32,
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
       VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedSingle>, TB, VEX_4V, VEX_L;
-defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64,
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
       VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
 
@@ -2642,20 +2662,20 @@ let Constraints = "$src1 = $dst" in {
 } // Constraints = "$src1 = $dst"
 
 let Predicates = [HasAVX1Only] in {
-  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
             (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
             (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
             (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
             (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
 
-  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
             (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
             (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
             (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
             (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
@@ -2686,13 +2706,10 @@ let Predicates = [UseSSE2] in {
 /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
 multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                                 Domain d> {
-  def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
-                !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
-                     [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
-             Sched<[WriteVecLogic]>;
-  def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src),
-                !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [],
-                IIC_SSE_MOVMSK, d>, REX_W, Sched<[WriteVecLogic]>;
+  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
+              Sched<[WriteVecLogic]>;
 }
 
 let Predicates = [HasAVX] in {
@@ -2709,29 +2726,15 @@ let Predicates = [HasAVX] in {
                                         OpSize, VEX, VEX_L;
 
   def : Pat<(i32 (X86fgetsign FR32:$src)),
-            (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>;
+            (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
   def : Pat<(i64 (X86fgetsign FR32:$src)),
-            (VMOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>;
+            (SUBREG_TO_REG (i64 0),
+             (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
   def : Pat<(i32 (X86fgetsign FR64:$src)),
-            (VMOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>;
+            (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
   def : Pat<(i64 (X86fgetsign FR64:$src)),
-            (VMOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
-  // Assembler Only
-  def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
-             "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
-             SSEPackedSingle>, TB, VEX, Sched<[WriteVecLogic]>;
-  def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
-             "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
-             SSEPackedDouble>, TB,
-             OpSize, VEX, Sched<[WriteVecLogic]>;
-  def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-             "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
-             SSEPackedSingle>, TB, VEX, VEX_L, Sched<[WriteVecLogic]>;
-  def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-             "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
-             SSEPackedDouble>, TB,
-             OpSize, VEX, VEX_L, Sched<[WriteVecLogic]>;
+            (SUBREG_TO_REG (i64 0),
+             (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
 }
 
 defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
@@ -2740,16 +2743,18 @@ defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                      SSEPackedDouble>, TB, OpSize;
 
 def : Pat<(i32 (X86fgetsign FR32:$src)),
-          (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+          (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
       Requires<[UseSSE1]>;
 def : Pat<(i64 (X86fgetsign FR32:$src)),
-          (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+          (SUBREG_TO_REG (i64 0),
+           (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
       Requires<[UseSSE1]>;
 def : Pat<(i32 (X86fgetsign FR64:$src)),
-          (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+          (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
       Requires<[UseSSE2]>;
 def : Pat<(i64 (X86fgetsign FR64:$src)),
-          (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+          (SUBREG_TO_REG (i64 0),
+           (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
       Requires<[UseSSE2]>;
 
 //===---------------------------------------------------------------------===//
@@ -2788,7 +2793,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                          OpndItins itins, bit IsCommutable = 0> {
 let Predicates = [HasAVX] in
   defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
-                    VR128, memopv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
 
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
@@ -2796,7 +2801,7 @@ let Constraints = "$src1 = $dst" in
 
 let Predicates = [HasAVX2] in
   defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
-                               OpVT256, VR256, memopv4i64, i256mem, itins,
+                               OpVT256, VR256, loadv4i64, i256mem, itins,
                                IsCommutable, 0>, VEX_4V, VEX_L;
 }
 
@@ -2836,16 +2841,18 @@ multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
 }
 
 // Alias bitwise logical operations using SSE logical ops on packed FP values.
-defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
-              SSE_BIT_ITINS_P>;
-defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
-              SSE_BIT_ITINS_P>;
-defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
-              SSE_BIT_ITINS_P>;
-
-let isCommutable = 0 in
-  defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
+let isCodeGenOnly = 1 in {
+  defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+                SSE_BIT_ITINS_P>;
+  defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
                 SSE_BIT_ITINS_P>;
+  defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+                SSE_BIT_ITINS_P>;
+
+  let isCommutable = 0 in
+    defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
+                  SSE_BIT_ITINS_P>;
+}
 
 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
 ///
@@ -2855,14 +2862,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
         !strconcat(OpcodeStr, "ps"), f256mem,
         [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
         [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
-                           (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
+                           (loadv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
 
   defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f256mem,
         [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                   (bc_v4i64 (v4f64 VR256:$src2))))],
         [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
-                                  (memopv4i64 addr:$src2)))], 0>,
+                                  (loadv4i64 addr:$src2)))], 0>,
                                   TB, OpSize, VEX_4V, VEX_L;
 
   // In AVX no need to add a pattern for 128-bit logical rr ps, because they
@@ -2872,14 +2879,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
   defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f128mem, [],
        [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
-                                 (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V;
+                                 (loadv2i64 addr:$src2)))], 0>, TB, VEX_4V;
 
   defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f128mem,
        [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                  (bc_v2i64 (v2f64 VR128:$src2))))],
        [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
-                                 (memopv2i64 addr:$src2)))], 0>,
+                                 (loadv2i64 addr:$src2)))], 0>,
                                                  TB, OpSize, VEX_4V;
 
   let Constraints = "$src1 = $dst" in {
@@ -2921,120 +2928,93 @@ let isCommutable = 0 in
 
 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
 /// classes below
-multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                  SizeItins itins,
-                                  bit Is2Addr = 1> {
-  defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
-                            OpNode, FR32, f32mem,
-                            itins.s, Is2Addr>, XS;
-  defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
-                            OpNode, FR64, f64mem,
-                            itins.d, Is2Addr>, XD;
-}
-
 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode, SizeItins itins> {
-let Predicates = [HasAVX] in {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-                               VR128, v4f32, f128mem, memopv4f32,
+                               VR128, v4f32, f128mem, loadv4f32,
                                SSEPackedSingle, itins.s, 0>, TB, VEX_4V;
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-                               VR128, v2f64, f128mem, memopv2f64,
+                               VR128, v2f64, f128mem, loadv2f64,
                                SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V;
 
   defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
-                        OpNode, VR256, v8f32, f256mem, memopv8f32,
+                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                         SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L;
   defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
-                        OpNode, VR256, v4f64, f256mem, memopv4f64,
+                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                         SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L;
-}
-
-let Constraints = "$src1 = $dst" in {
-  defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
-                            v4f32, f128mem, memopv4f32, SSEPackedSingle,
-                            itins.s, 1>, TB;
-  defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
-                            v2f64, f128mem, memopv2f64, SSEPackedDouble,
-                            itins.d, 1>, TB, OpSize;
-}
-}
 
-multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
-                                      SizeItins itins,
-                                      bit Is2Addr = 1> {
-  defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
-     !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
-     itins.s, Is2Addr>, XS;
-  defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
-     !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
-     itins.d, Is2Addr>, XD;
+  let Constraints = "$src1 = $dst" in {
+    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
+                              itins.s>, TB;
+    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
+                              itins.d>, TB, OpSize;
+  }
 }
 
-// Binary Arithmetic instructions
-defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>;
-defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>;
-let isCommutable = 0 in {
-  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>;
-  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>;
-  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>;
-  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>;
-}
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SizeItins itins> {
+  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+                         OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG;
+  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+                         OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG;
 
-let isCodeGenOnly = 1 in {
-  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>;
-  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>;
+  let Constraints = "$src1 = $dst" in {
+    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+                              OpNode, FR32, f32mem, itins.s>, XS;
+    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+                              OpNode, FR64, f64mem, itins.d>, XD;
+  }
 }
 
-defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>,
-            basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>,
-              VEX_4V, VEX_LIG;
-defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>,
-            basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>,
-              VEX_4V, VEX_LIG;
+multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+                                      SizeItins itins> {
+  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+                   itins.s, 0>, XS, VEX_4V, VEX_LIG;
+  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+                   itins.d, 0>, XD, VEX_4V, VEX_LIG;
 
-let isCommutable = 0 in {
-  defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>,
-              basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>,
-                VEX_4V, VEX_LIG;
-  defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>,
-              basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>,
-                VEX_4V, VEX_LIG;
-  defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>,
-              basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>,
-                VEX_4V, VEX_LIG;
-  defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>,
-              basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>,
-                VEX_4V, VEX_LIG;
+  let Constraints = "$src1 = $dst" in {
+    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+                   itins.s>, XS;
+    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+                   itins.d>, XD;
+  }
 }
 
-let Constraints = "$src1 = $dst" in {
-  defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
-             basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
-  defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
-             basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
-
-  let isCommutable = 0 in {
-    defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
-               basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
-    defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
-               basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
-    defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
-               basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
-    defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
-               basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
-  }
+// Binary Arithmetic instructions
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
+           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
+           basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
+           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
+           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
+let isCommutable = 0 in {
+  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
+  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
+  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
+  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
 }
 
 let isCodeGenOnly = 1 in {
-  defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
-       VEX_4V, VEX_LIG;
-  defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>,
-       VEX_4V, VEX_LIG;
-  let Constraints = "$src1 = $dst" in {
-    defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
-    defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
-  }
+  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
+  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
 }
 
 /// Unop Arithmetic
@@ -3180,7 +3160,7 @@ let Predicates = [HasAVX] in {
   def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
-                       [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))],
+                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
                        itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
   def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         !strconcat("v", OpcodeStr,
@@ -3190,7 +3170,7 @@ let Predicates = [HasAVX] in {
   def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
-                        [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
+                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
                         itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
 }
 
@@ -3217,7 +3197,7 @@ let Predicates = [HasAVX] in {
   def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                           !strconcat("v", OpcodeStr,
                           "ps\t{$src, $dst|$dst, $src}"),
-                          [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
+                          [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))],
                           itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
   def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                             !strconcat("v", OpcodeStr,
@@ -3228,7 +3208,7 @@ let Predicates = [HasAVX] in {
                           (ins f256mem:$src),
                           !strconcat("v", OpcodeStr,
                                     "ps\t{$src, $dst|$dst, $src}"),
-                          [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))],
+                          [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))],
                           itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
 }
 
@@ -3298,7 +3278,7 @@ let Predicates = [HasAVX] in {
   def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
-                       [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))],
+                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
                        itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
   def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         !strconcat("v", OpcodeStr,
@@ -3308,7 +3288,7 @@ let Predicates = [HasAVX] in {
   def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         !strconcat("v", OpcodeStr,
                                    "pd\t{$src, $dst|$dst, $src}"),
-                        [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
+                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
                         itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
 }
 
@@ -3341,30 +3321,31 @@ defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
              sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
                                 int_x86_avx_rcp_ps_256, SSE_RCPP>;
 
-def : Pat<(f32 (fsqrt FR32:$src)),
-          (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-def : Pat<(f32 (fsqrt (load addr:$src))),
-          (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
-          Requires<[HasAVX, OptForSize]>;
-def : Pat<(f64 (fsqrt FR64:$src)),
-          (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
-def : Pat<(f64 (fsqrt (load addr:$src))),
-          (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
-          Requires<[HasAVX, OptForSize]>;
-
-def : Pat<(f32 (X86frsqrt FR32:$src)),
-          (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-def : Pat<(f32 (X86frsqrt (load addr:$src))),
-          (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
-          Requires<[HasAVX, OptForSize]>;
-
-def : Pat<(f32 (X86frcp FR32:$src)),
-          (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-def : Pat<(f32 (X86frcp (load addr:$src))),
-          (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
-          Requires<[HasAVX, OptForSize]>;
-
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
+  def : Pat<(f32 (fsqrt FR32:$src)),
+            (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f32 (fsqrt (load addr:$src))),
+            (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+  def : Pat<(f64 (fsqrt FR64:$src)),
+            (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f64 (fsqrt (load addr:$src))),
+            (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+
+  def : Pat<(f32 (X86frsqrt FR32:$src)),
+            (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f32 (X86frsqrt (load addr:$src))),
+            (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+
+  def : Pat<(f32 (X86frcp FR32:$src)),
+            (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f32 (X86frcp (load addr:$src))),
+            (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+}
+let Predicates = [UseAVX] in {
   def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
             (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
@@ -3378,7 +3359,9 @@ let Predicates = [HasAVX] in {
                               VR128)>;
   def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
             (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+}
 
+let Predicates = [HasAVX] in {
   def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
             (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
                                          (COPY_TO_REGCLASS VR128:$src, FR32)),
@@ -3662,7 +3645,7 @@ def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                  XS, Requires<[UseSSE2]>;
 }
 
-let mayStore = 1, SchedRW = [WriteStore] in {
+let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}",
                    [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
@@ -3725,7 +3708,7 @@ multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                              bit IsCommutable = 0> {
 let Predicates = [HasAVX] in
   defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
-                                 VR128, memopv2i64, i128mem, itins,
+                                 VR128, loadv2i64, i128mem, itins,
                                  IsCommutable, 0>, VEX_4V;
 
 let Constraints = "$src1 = $dst" in
@@ -3734,7 +3717,7 @@ let Constraints = "$src1 = $dst" in
 
 let Predicates = [HasAVX2] in
   defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
-                                   VR256, memopv4i64, i256mem, itins,
+                                   VR256, loadv4i64, i256mem, itins,
                                    IsCommutable, 0>, VEX_4V, VEX_L;
 }
 
@@ -3761,11 +3744,11 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                        (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
       Sched<[WriteVecShiftLd, ReadAfterLd]>;
   def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
-       (ins RC:$src1, i32i8imm:$src2),
+       (ins RC:$src1, i8imm:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>,
+       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
        Sched<[WriteVecShift]>;
 }
 
@@ -3849,15 +3832,15 @@ defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
 defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
                                  int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
 defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
-                                 int_x86_avx2_psad_bw, SSE_INTALU_ITINS_P, 1>;
+                                 int_x86_avx2_psad_bw, SSE_PMADD, 1>;
 
 let Predicates = [HasAVX] in
 defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
-                              memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                               VEX_4V;
 let Predicates = [HasAVX2] in
 defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
-                               VR256, memopv4i64, i256mem,
+                               VR256, loadv4i64, i256mem,
                                SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 let Constraints = "$src1 = $dst" in
 defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
@@ -3993,12 +3976,14 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
                        (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                        "pslldq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>;
+                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))],
+                         IIC_SSE_INTSHDQ_P_RI>;
   def PSRLDQri : PDIi8<0x73, MRM3r,
                        (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                        "psrldq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>;
+                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))],
+                         IIC_SSE_INTSHDQ_P_RI>;
   // PSRADQri doesn't exist in SSE[1-3].
 }
 } // Constraints = "$src1 = $dst"
@@ -4082,14 +4067,14 @@ let Predicates = [HasAVX] in {
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR128:$dst,
                         (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
-                      IIC_SSE_PSHUF>, VEX, Sched<[WriteShuffle]>;
+                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
   def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                       (ins i128mem:$src1, i8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
-                       (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
-                        (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX,
+                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
+                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
                   Sched<[WriteShuffleLd]>;
 }
 
@@ -4100,14 +4085,14 @@ let Predicates = [HasAVX2] in {
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                        [(set VR256:$dst,
                          (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
-                       IIC_SSE_PSHUF>, VEX, VEX_L, Sched<[WriteShuffle]>;
+                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
   def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                        (ins i256mem:$src1, i8imm:$src2),
                        !strconcat("v", OpcodeStr,
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
-                        (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)),
-                         (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L,
+                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
+                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
                    Sched<[WriteShuffleLd]>;
 }
 
@@ -4118,14 +4103,14 @@ let Predicates = [UseSSE2] in {
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR128:$dst,
                   (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
-                IIC_SSE_PSHUF>, Sched<[WriteShuffle]>;
+                IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
   def mi : Ii8<0x70, MRMSrcMem,
                (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR128:$dst,
                   (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
-                          (i8 imm:$src2))))], IIC_SSE_PSHUF>,
+                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
            Sched<[WriteShuffleLd]>;
 }
 }
@@ -4136,7 +4121,7 @@ defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
 
 let Predicates = [HasAVX] in {
-  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
             (VPSHUFDmi addr:$src1, imm:$imm)>;
   def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
             (VPSHUFDri VR128:$src1, imm:$imm)>;
@@ -4259,13 +4244,13 @@ let ExeDomain = SSEPackedInt in {
 multiclass sse2_pinsrw<bit Is2Addr = 1> {
   def rri : Ii8<0xC4, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1,
-        GR32:$src2, i32i8imm:$src3),
+        GR32orGR64:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set VR128:$dst,
-         (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>,
-       Sched<[WriteShuffle]>;
+         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
+       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
   def rmi : Ii8<0xC4, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1,
                         i16mem:$src2, i32i8imm:$src3),
@@ -4281,29 +4266,24 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
 // Extract
 let Predicates = [HasAVX] in
 def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
-                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
-                                                imm:$src2))]>, TB, OpSize, VEX,
+                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                            imm:$src2))]>, TB, OpSize, VEX,
                 Sched<[WriteShuffle]>;
 def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
-                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
-                                                imm:$src2))], IIC_SSE_PEXTRW>,
+                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                            imm:$src2))], IIC_SSE_PEXTRW>,
                Sched<[WriteShuffleLd, ReadAfterLd]>;
 
 // Insert
-let Predicates = [HasAVX] in {
-  defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
-  def  VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
-       (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
-       "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-       []>, TB, OpSize, VEX_4V, Sched<[WriteShuffle]>;
-}
+let Predicates = [HasAVX] in
+defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
 
-let Constraints = "$src1 = $dst" in
-  defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>;
+let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
+defm PINSRW : sse2_pinsrw, TB, OpSize;
 
 } // ExeDomain = SSEPackedInt
 
@@ -4313,24 +4293,23 @@ let Constraints = "$src1 = $dst" in
 
 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
 
-def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+           (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
-           [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
            IIC_SSE_MOVMSK>, VEX;
-def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
-           "pmovmskb\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK>, VEX;
 
 let Predicates = [HasAVX2] in {
-def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
+def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+           (ins VR256:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
-           [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L;
-def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-           "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
+           [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
+           VEX, VEX_L;
 }
 
-def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
-           [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
            IIC_SSE_MOVMSK>;
 
 } // ExeDomain = SSEPackedInt
@@ -4341,25 +4320,25 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
 
 let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
 
-let Uses = [EDI] in
+let Uses = [EDI], Predicates = [HasAVX,In32BitMode] in
 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
            (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
            IIC_SSE_MASKMOV>, VEX;
-let Uses = [RDI] in
+let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
            (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
            IIC_SSE_MASKMOV>, VEX;
 
-let Uses = [EDI] in
+let Uses = [EDI], Predicates = [UseSSE2,In32BitMode] in
 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
            IIC_SSE_MASKMOV>;
-let Uses = [RDI] in
+let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
@@ -4386,12 +4365,13 @@ def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                         IIC_SSE_MOVDQ>,
                       VEX, Sched<[WriteLoad]>;
 def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
-                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        "movq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v2i64 (scalar_to_vector GR64:$src)))],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+let isCodeGenOnly = 1 in
 def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
-                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       "movq\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert GR64:$src))],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
 
@@ -4410,6 +4390,7 @@ def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                         [(set VR128:$dst,
                           (v2i64 (scalar_to_vector GR64:$src)))],
                           IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+let isCodeGenOnly = 1 in
 def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert GR64:$src))],
@@ -4418,25 +4399,27 @@ def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
 //===---------------------------------------------------------------------===//
 // Move Int Doubleword to Single Scalar
 //
-def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set FR32:$dst, (bitconvert GR32:$src))],
-                      IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-
-def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
-                      IIC_SSE_MOVDQ>,
-                      VEX, Sched<[WriteLoad]>;
-def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set FR32:$dst, (bitconvert GR32:$src))],
-                      IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
-
-def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
-                      IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+let isCodeGenOnly = 1 in {
+  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert GR32:$src))],
+                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+                        IIC_SSE_MOVDQ>,
+                        VEX, Sched<[WriteLoad]>;
+  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert GR32:$src))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+}
 
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int to Packed Double Int
@@ -4463,12 +4446,24 @@ def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                                      (iPTR 0))), addr:$dst)],
                                      IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
 
+def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int first element to Doubleword Int
 //
 let SchedRW = [WriteMove] in {
 def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
-                          "mov{d|q}\t{$src, $dst|$dst, $src}",
+                          "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                            (iPTR 0)))],
                                                            IIC_SSE_MOVD_ToGP>,
@@ -4484,121 +4479,112 @@ def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
 //===---------------------------------------------------------------------===//
 // Bitcast FR64 <-> GR64
 //
-let Predicates = [HasAVX] in
-def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
-                        "vmovq\t{$src, $dst|$dst, $src}",
-                        [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
-                        VEX, Sched<[WriteLoad]>;
-def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+let isCodeGenOnly = 1 in {
+  let Predicates = [UseAVX] in
+  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                          "movq\t{$src, $dst|$dst, $src}",
+                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+                          VEX, Sched<[WriteLoad]>;
+  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                           "movq\t{$src, $dst|$dst, $src}",
+                           [(set GR64:$dst, (bitconvert FR64:$src))],
+                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                           "movq\t{$src, $dst|$dst, $src}",
+                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+
+  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                         "movq\t{$src, $dst|$dst, $src}",
+                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
+                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                          "mov{d|q}\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (bitconvert FR64:$src))],
-                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
-                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
-
-def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
-                       "movq\t{$src, $dst|$dst, $src}",
-                       [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
-                       IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
-                       "mov{d|q}\t{$src, $dst|$dst, $src}",
-                       [(set GR64:$dst, (bitconvert FR64:$src))],
-                       IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
-def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
-                       "movq\t{$src, $dst|$dst, $src}",
-                       [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
-                       IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+}
 
 //===---------------------------------------------------------------------===//
 // Move Scalar Single to Double Int
 //
-def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set GR32:$dst, (bitconvert FR32:$src))],
-                      IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
-def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
-                      IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
-def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set GR32:$dst, (bitconvert FR32:$src))],
-                      IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
-def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
-                      IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+let isCodeGenOnly = 1 in {
+  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set GR32:$dst, (bitconvert FR32:$src))],
+                        IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set GR32:$dst, (bitconvert FR32:$src))],
+                        IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+                        IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+}
 
 //===---------------------------------------------------------------------===//
 // Patterns and instructions to describe movd/movq to XMM register zero-extends
 //
-let SchedRW = [WriteMove] in {
+let isCodeGenOnly = 1, SchedRW = [WriteMove] in {
 let AddedComplexity = 15 in {
-def VMOVZDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
-                       "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (v4i32 (X86vzmovl
-                                      (v4i32 (scalar_to_vector GR32:$src)))))],
-                                      IIC_SSE_MOVDQ>, VEX;
 def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
-                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
+                       "movq\t{$src, $dst|$dst, $src}", // X86-64 only
                        [(set VR128:$dst, (v2i64 (X86vzmovl
                                       (v2i64 (scalar_to_vector GR64:$src)))))],
                                       IIC_SSE_MOVDQ>,
                                       VEX, VEX_W;
-}
-let AddedComplexity = 15 in {
-def MOVZDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
-                       "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (v4i32 (X86vzmovl
-                                      (v4i32 (scalar_to_vector GR32:$src)))))],
-                                      IIC_SSE_MOVDQ>;
 def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
                        [(set VR128:$dst, (v2i64 (X86vzmovl
                                       (v2i64 (scalar_to_vector GR64:$src)))))],
                                       IIC_SSE_MOVDQ>;
 }
-} // SchedRW
+} // isCodeGenOnly, SchedRW
 
-let AddedComplexity = 20, SchedRW = [WriteLoad] in {
-def VMOVZDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
-                       "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst,
-                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
-                                                   (loadi32 addr:$src))))))],
-                                                   IIC_SSE_MOVDQ>, VEX;
-def MOVZDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
-                       "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst,
-                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
-                                                   (loadi32 addr:$src))))))],
-                                                   IIC_SSE_MOVDQ>;
-} // AddedComplexity, SchedRW
+let Predicates = [UseAVX] in {
+  let AddedComplexity = 15 in
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (VMOVDI2PDIrr GR32:$src)>;
 
-let Predicates = [HasAVX] in {
   // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
   let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (VMOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
-              (VMOVZDI2PDIrm addr:$src)>;
+              (VMOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
-              (VMOVZDI2PDIrm addr:$src)>;
+              (VMOVDI2PDIrm addr:$src)>;
   }
   // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                                (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                                (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
 }
 
-let Predicates = [UseSSE2], AddedComplexity = 20 in {
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
-            (MOVZDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
-            (MOVZDI2PDIrm addr:$src)>;
+let Predicates = [UseSSE2] in {
+  let AddedComplexity = 15 in
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (MOVDI2PDIrr GR32:$src)>;
+
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (MOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (MOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (MOVDI2PDIrm addr:$src)>;
+  }
 }
 
 // These are the correct encodings of the instructions so that we know how to
@@ -4607,15 +4593,12 @@ let Predicates = [UseSSE2], AddedComplexity = 20 in {
 def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
 def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
-                (MOV64toSDrr FR64:$dst, GR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
-                (MOVSDto64rr GR64:$dst, FR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
-                (VMOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
-                (MOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
+// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Move Quadword
@@ -4630,7 +4613,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
-                    VEX, Requires<[HasAVX]>;
+                    VEX, Requires<[UseAVX]>;
 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
@@ -4667,16 +4650,15 @@ def MOVLQ128mr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
                      IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
 
-let AddedComplexity = 20 in
+let isCodeGenOnly = 1, AddedComplexity = 20 in {
 def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "vmovq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                  (loadi64 addr:$src))))))],
                                                  IIC_SSE_MOVDQ>,
-                     XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>;
+                     XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;
 
-let AddedComplexity = 20 in
 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
@@ -4684,10 +4666,9 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                                                  (loadi64 addr:$src))))))],
                                                  IIC_SSE_MOVDQ>,
                      XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
+}
 
-let Predicates = [HasAVX], AddedComplexity = 20 in {
-  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-            (VMOVZQI2PQIrm addr:$src)>;
+let Predicates = [UseAVX], AddedComplexity = 20 in {
   def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
             (VMOVZQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)),
@@ -4695,8 +4676,6 @@ let Predicates = [HasAVX], AddedComplexity = 20 in {
 }
 
 let Predicates = [UseSSE2], AddedComplexity = 20 in {
-  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-            (MOVZQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
             (MOVZQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
@@ -4719,7 +4698,7 @@ def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                     IIC_SSE_MOVQ_RR>,
-                      XS, VEX, Requires<[HasAVX]>;
+                      XS, VEX, Requires<[UseAVX]>;
 let AddedComplexity = 15 in
 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "movq\t{$src, $dst|$dst, $src}",
@@ -4728,14 +4707,14 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       XS, Requires<[UseSSE2]>;
 } // SchedRW
 
-let SchedRW = [WriteVecLogicLd] in {
+let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
 let AddedComplexity = 20 in
 def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2i64 (X86vzmovl
                                              (loadv2i64 addr:$src))))],
                                              IIC_SSE_MOVDQ>,
-                      XS, VEX, Requires<[HasAVX]>;
+                      XS, VEX, Requires<[UseAVX]>;
 let AddedComplexity = 20 in {
 def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
@@ -4744,49 +4723,19 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                                              IIC_SSE_MOVDQ>,
                       XS, Requires<[UseSSE2]>;
 }
-} // SchedRW
+} // isCodeGenOnly, SchedRW
 
 let AddedComplexity = 20 in {
-  let Predicates = [HasAVX] in {
-    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-              (VMOVZPQILo2PQIrm addr:$src)>;
+  let Predicates = [UseAVX] in {
     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
               (VMOVZPQILo2PQIrr VR128:$src)>;
   }
   let Predicates = [UseSSE2] in {
-    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-              (MOVZPQILo2PQIrm addr:$src)>;
     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
               (MOVZPQILo2PQIrr VR128:$src)>;
   }
 }
 
-// Instructions to match in the assembler
-let SchedRW = [WriteMove] in {
-def VMOVQs64rr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
-                      "movq\t{$src, $dst|$dst, $src}", [],
-                      IIC_SSE_MOVDQ>, VEX, VEX_W;
-def VMOVQd64rr : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
-                      "movq\t{$src, $dst|$dst, $src}", [],
-                      IIC_SSE_MOVDQ>, VEX, VEX_W;
-// Recognize "movd" with GR64 destination, but encode as a "movq"
-def VMOVQd64rr_alt : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
-                          "movd\t{$src, $dst|$dst, $src}", [],
-                          IIC_SSE_MOVDQ>, VEX, VEX_W;
-} // SchedRW
-
-// Instructions for the disassembler
-// xr = XMM register
-// xm = mem64
-
-let SchedRW = [WriteMove] in {
-let Predicates = [HasAVX] in
-def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                 "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
-def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                 "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS;
-} // SchedRW
-
 //===---------------------------------------------------------------------===//
 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
 //===---------------------------------------------------------------------===//
@@ -4805,13 +4754,13 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
 
 let Predicates = [HasAVX] in {
   defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
-                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
+                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
   defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
-                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
+                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
   defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
-                                 v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
+                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
   defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
-                                 v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
+                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
 }
 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                    memopv4f32, f128mem>;
@@ -4821,19 +4770,19 @@ defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
 let Predicates = [HasAVX] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (VMOVSHDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
             (VMOVSHDUPrm addr:$src)>;
   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
             (VMOVSLDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
             (VMOVSLDUPrm addr:$src)>;
   def : Pat<(v8i32 (X86Movshdup VR256:$src)),
             (VMOVSHDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))),
+  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
             (VMOVSHDUPYrm addr:$src)>;
   def : Pat<(v8i32 (X86Movsldup VR256:$src)),
             (VMOVSLDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))),
+  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
             (VMOVSLDUPYrm addr:$src)>;
 }
 
@@ -4887,20 +4836,20 @@ let Predicates = [HasAVX] in {
 defm MOVDDUP : sse3_replicate_dfp<"movddup">;
 
 let Predicates = [HasAVX] in {
-  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
   def : Pat<(X86Movddup (bc_v2f64
                              (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
 
   // 256-bit version
-  def : Pat<(X86Movddup (memopv4f64 addr:$src)),
+  def : Pat<(X86Movddup (loadv4f64 addr:$src)),
             (VMOVDDUPYrm addr:$src)>;
-  def : Pat<(X86Movddup (memopv4i64 addr:$src)),
+  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
             (VMOVDDUPYrm addr:$src)>;
   def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
             (VMOVDDUPYrm addr:$src)>;
@@ -5102,12 +5051,12 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
 // Helper fragments to match sext vXi1 to vXiY.
 def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
                                                VR128:$src))>;
-def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i32 15)))>;
-def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i32 31)))>;
+def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
+def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
 def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
                                                VR256:$src))>;
-def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i32 15)))>;
-def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i32 31)))>;
+def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
+def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
 
 let Predicates = [HasAVX] in {
   defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb",
@@ -5263,34 +5212,34 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        [(set VR256:$dst,
          (IntId256 VR256:$src1,
-          (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
+          (bitconvert (loadv4i64 addr:$src2))))]>, OpSize;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX] in {
 let isCommutable = 0 in {
   defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V;
   defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PHADDSUBD, 0>, VEX_4V;
   defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V;
   defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PHADDSUBD, 0>, VEX_4V;
   defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PSIGN, 0>, VEX_4V;
   defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PSIGN, 0>, VEX_4V;
   defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PSIGN, 0>, VEX_4V;
   defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PSHUFB, 0>, VEX_4V;
   defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                       int_x86_ssse3_phadd_sw_128,
@@ -5310,28 +5259,28 @@ defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
 let ImmT = NoImm, Predicates = [HasAVX2] in {
 let isCommutable = 0 in {
   defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                         int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
@@ -5389,7 +5338,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-      [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffle]>;
+      [], IIC_SSE_PALIGNRR>, OpSize, Sched<[WriteShuffle]>;
   let mayLoad = 1 in
   def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
@@ -5397,7 +5346,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-      [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
+      [], IIC_SSE_PALIGNRM>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
   }
 }
 
@@ -5489,16 +5438,17 @@ def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
 // SSE4.1 - Packed Move with Sign/Zero Extend
 //===----------------------------------------------------------------------===//
 
-multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize;
 
   def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set VR128:$dst,
-         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
-       OpSize;
+         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))],
+         itins.rm>, OpSize;
 }
 
 multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
@@ -5509,22 +5459,23 @@ multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
 
   def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                  [(set VR256:$dst, (IntId (load addr:$src)))]>, OpSize;
+                  [(set VR256:$dst, (IntId (load addr:$src)))]>,
+                  OpSize;
 }
 
 let Predicates = [HasAVX] in {
-defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>,
-                                     VEX;
-defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>,
-                                     VEX;
-defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>,
-                                     VEX;
-defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>,
-                                     VEX;
-defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>,
-                                     VEX;
-defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>,
-                                     VEX;
+defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw",
+                                     int_x86_sse41_pmovsxbw>, VEX;
+defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd",
+                                     int_x86_sse41_pmovsxwd>, VEX;
+defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq",
+                                     int_x86_sse41_pmovsxdq>, VEX;
+defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw",
+                                     int_x86_sse41_pmovzxbw>, VEX;
+defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd",
+                                     int_x86_sse41_pmovzxwd>, VEX;
+defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq",
+                                     int_x86_sse41_pmovzxdq>, VEX;
 }
 
 let Predicates = [HasAVX2] in {
@@ -5542,12 +5493,12 @@ defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
                                         int_x86_avx2_pmovzxdq>, VEX, VEX_L;
 }
 
-defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
-defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
-defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
-defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
-defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
-defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
+defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw,                                       SSE_INTALU_ITINS_P>;
+defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd,                                       SSE_INTALU_ITINS_P>;
+defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq,                                       SSE_INTALU_ITINS_P>;
+defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw,                                       SSE_INTALU_ITINS_P>;
+defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd,                                       SSE_INTALU_ITINS_P>;
+defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq,                                       SSE_INTALU_ITINS_P>;
 
 let Predicates = [HasAVX] in {
   // Common patterns involving scalar load.
@@ -5645,32 +5596,39 @@ let Predicates = [HasAVX2] in {
               (VPMOVZXDQYrr VR128:$src)>;
     def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
               (VPMOVZXWDYrr VR128:$src)>;
+    def : Pat<(v16i16 (X86vzmovly (v16i8 VR128:$src))),
+              (VPMOVZXBWYrr VR128:$src)>;
   }
 
   def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
   def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+  def : Pat<(v16i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
 }
 
 let Predicates = [HasAVX] in {
   def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
   def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+  def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
 }
 
 let Predicates = [UseSSE41] in {
   def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
   def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+  def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
 }
 
 
-multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize;
 
   def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set VR128:$dst,
-         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))],
+         itins.rm>,
           OpSize;
 }
 
@@ -5709,10 +5667,14 @@ defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
                                        int_x86_avx2_pmovzxwq>, VEX, VEX_L;
 }
 
-defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
-defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
-defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
-defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
+defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd,
+                                      SSE_INTALU_ITINS_P>;
+defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq,
+                                      SSE_INTALU_ITINS_P>;
+defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd,
+                                      SSE_INTALU_ITINS_P>;
+defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq,
+                                      SSE_INTALU_ITINS_P>;
 
 let Predicates = [HasAVX] in {
   // Common patterns involving scalar load
@@ -5740,7 +5702,8 @@ let Predicates = [UseSSE41] in {
             (PMOVZXWQrm addr:$src)>;
 }
 
-multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
@@ -5779,8 +5742,10 @@ defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
 defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
                                        int_x86_avx2_pmovzxbq>, VEX, VEX_L;
 }
-defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
-defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
+defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq,
+                                      SSE_INTALU_ITINS_P>;
+defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq,
+                                      SSE_INTALU_ITINS_P>;
 
 let Predicates = [HasAVX2] in {
   def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
@@ -6032,35 +5997,39 @@ let Predicates = [UseSSE41] in {
 
 /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
-  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                  (ins VR128:$src1, i32i8imm:$src2),
                  !strconcat(OpcodeStr,
-                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                 [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
+                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
+                                         imm:$src2))]>,
                  OpSize;
   let neverHasSideEffects = 1, mayStore = 1 in
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
                  (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
                  !strconcat(OpcodeStr,
-                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>, OpSize;
 // FIXME:
 // There's an AssertZext in the way of writing the store pattern
 // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
 }
 
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in
   defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
-  def  VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst),
-         (ins VR128:$src1, i32i8imm:$src2),
-         "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX;
-}
 
 defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
 
 
 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+  let isCodeGenOnly = 1, hasSideEffects = 0 in
+  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+                   (ins VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   []>, OpSize;
+
   let neverHasSideEffects = 1, mayStore = 1 in
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
                  (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
@@ -6122,31 +6091,28 @@ defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
 
 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
 /// destination
-multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
-  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
+                            OpndItins itins = DEFAULT_ITINS> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                  (ins VR128:$src1, i32i8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                 [(set GR32:$dst,
-                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+                 [(set GR32orGR64:$dst,
+                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
+                    itins.rr>,
            OpSize;
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
                  (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
-                          addr:$dst)]>, OpSize;
+                          addr:$dst)], itins.rm>, OpSize;
 }
 
 let ExeDomain = SSEPackedSingle in {
-  let Predicates = [UseAVX] in {
+  let Predicates = [UseAVX] in
     defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
-    def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
-                    (ins VR128:$src1, i32i8imm:$src2),
-                    "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    []>, OpSize, VEX;
-  }
-  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
+  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
 }
 
 // Also match an EXTRACTPS store when the store is done as f32 instead of i32.
@@ -6167,13 +6133,13 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
 
 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
   def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
-      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+      (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
-        (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
+        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, OpSize;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
       !if(Is2Addr,
@@ -6246,7 +6212,8 @@ let Constraints = "$src1 = $dst" in
 // are optimized inserts that won't zero arbitrary elements in the destination
 // vector. The next one matches the intrinsic and could zero arbitrary elements
 // in the target vector.
-multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
+                           OpndItins itins = DEFAULT_ITINS> {
   def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
       !if(Is2Addr,
@@ -6254,7 +6221,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
         !strconcat(asm,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
-        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
+        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
       OpSize;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
@@ -6265,14 +6232,14 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
       [(set VR128:$dst,
         (X86insrtps VR128:$src1,
                    (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
-                    imm:$src3))]>, OpSize;
+                    imm:$src3))], itins.rm>, OpSize;
 }
 
 let ExeDomain = SSEPackedSingle in {
-  let Predicates = [HasAVX] in
+  let Predicates = [UseAVX] in
     defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
   let Constraints = "$src1 = $dst" in
-    defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
+    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -6290,7 +6257,8 @@ let ExeDomain = SSEPackedSingle in {
                     (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                     !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>,
+                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
+                    IIC_SSE_ROUNDPS_REG>,
                     OpSize;
 
   // Vector intrinsic operation, mem
@@ -6299,7 +6267,8 @@ let ExeDomain = SSEPackedSingle in {
                     !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
-                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
+                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
+                          IIC_SSE_ROUNDPS_MEM>,
                     OpSize;
 } // ExeDomain = SSEPackedSingle
 
@@ -6309,7 +6278,8 @@ let ExeDomain = SSEPackedDouble in {
                     (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                     !strconcat(OpcodeStr,
                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>,
+                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
+                    IIC_SSE_ROUNDPS_REG>,
                     OpSize;
 
   // Vector intrinsic operation, mem
@@ -6318,7 +6288,8 @@ let ExeDomain = SSEPackedDouble in {
                     !strconcat(OpcodeStr,
                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
-                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
+                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
+                          IIC_SSE_ROUNDPS_REG>,
                     OpSize;
 } // ExeDomain = SSEPackedDouble
 }
@@ -6402,11 +6373,11 @@ let ExeDomain = GenericDomain in {
 let Predicates = [HasAVX] in {
   // Intrinsic form
   defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
-                                  memopv4f32, memopv2f64,
+                                  loadv4f32, loadv2f64,
                                   int_x86_sse41_round_ps,
                                   int_x86_sse41_round_pd>, VEX;
   defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
-                                  memopv8f32, memopv4f64,
+                                  loadv8f32, loadv4f64,
                                   int_x86_avx_round_ps_256,
                                   int_x86_avx_round_pd_256>, VEX, VEX_L;
   defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
@@ -6544,7 +6515,7 @@ def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                 OpSize, VEX;
 def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
-                [(set EFLAGS,(X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                 OpSize, VEX;
 
 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
@@ -6553,7 +6524,7 @@ def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                 OpSize, VEX, VEX_L;
 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
-                [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
+                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                 OpSize, VEX, VEX_L;
 }
 
@@ -6582,13 +6553,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
 
 let Defs = [EFLAGS], Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedSingle in {
-defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
-defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>,
+defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
                             VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
-defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
-defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
+defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
                             VEX_L;
 }
 }
@@ -6600,30 +6571,33 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
   def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                      "popcnt{w}\t{$src, $dst|$dst, $src}",
-                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
+                     IIC_SSE_POPCNT_RR>,
                      OpSize, XS;
   def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                      "popcnt{w}\t{$src, $dst|$dst, $src}",
                      [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
-                      (implicit EFLAGS)]>, OpSize, XS;
+                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, OpSize, XS;
 
   def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                      "popcnt{l}\t{$src, $dst|$dst, $src}",
-                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
+                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
+                     IIC_SSE_POPCNT_RR>,
                      XS;
   def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                      "popcnt{l}\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
-                      (implicit EFLAGS)]>, XS;
+                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS;
 
   def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                       "popcnt{q}\t{$src, $dst|$dst, $src}",
-                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
+                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
+                      IIC_SSE_POPCNT_RR>,
                       XS;
   def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                       "popcnt{q}\t{$src, $dst|$dst, $src}",
                       [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
-                       (implicit EFLAGS)]>, XS;
+                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS;
 }
 
 
@@ -6651,14 +6625,16 @@ defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
 
 /// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
 multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
-                              Intrinsic IntId128, bit Is2Addr = 1> {
+                              Intrinsic IntId128, bit Is2Addr = 1,
+                              OpndItins itins = DEFAULT_ITINS> {
   let isCommutable = 1 in
   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src1, VR128:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize;
+       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
+        itins.rr>, OpSize;
   def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
        (ins VR128:$src1, i128mem:$src2),
        !if(Is2Addr,
@@ -6666,7 +6642,8 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set VR128:$dst,
          (IntId128 VR128:$src1,
-          (bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
+          (bitconvert (memopv2i64 addr:$src2))))],
+          itins.rm>, OpSize;
 }
 
 /// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
@@ -6682,14 +6659,15 @@ multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        [(set VR256:$dst,
          (IntId256 VR256:$src1,
-          (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
+          (bitconvert (loadv4i64 addr:$src2))))]>, OpSize;
 }
 
 
 /// SS48I_binop_rm - Simple SSE41 binary operator.
 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
-                          X86MemOperand x86memop, bit Is2Addr = 1> {
+                          X86MemOperand x86memop, bit Is2Addr = 1,
+                          OpndItins itins = DEFAULT_ITINS> {
   let isCommutable = 1 in
   def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2),
@@ -6712,21 +6690,21 @@ let Predicates = [HasAVX] in {
   defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
                                                          0>, VEX_4V;
   defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMULDQ   : SS41I_binop_rm_int<0x28, "vpmuldq",   int_x86_sse41_pmuldq,
                                                          0>, VEX_4V;
 }
@@ -6736,21 +6714,21 @@ let Predicates = [HasAVX2] in {
   defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
                                         int_x86_avx2_packusdw>, VEX_4V, VEX_L;
   defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMULDQ   : SS41I_binop_rm_int_y<0x28, "vpmuldq",
                                         int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
 }
@@ -6759,22 +6737,23 @@ let Constraints = "$src1 = $dst" in {
   let isCommutable = 0 in
   defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
   defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
-                                 memopv2i64, i128mem>;
-  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",   int_x86_sse41_pmuldq>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",   int_x86_sse41_pmuldq,
+                                     1, SSE_INTMUL_ITINS_P>;
 }
 
 let Predicates = [HasAVX] in {
@@ -6792,15 +6771,16 @@ let Predicates = [HasAVX2] in {
 
 let Constraints = "$src1 = $dst" in {
   defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
-                                memopv2i64, i128mem>;
+                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
   defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
-                                memopv2i64, i128mem>;
+                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
 }
 
 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                  Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
-                 X86MemOperand x86memop, bit Is2Addr = 1> {
+                 X86MemOperand x86memop, bit Is2Addr = 1,
+                 OpndItins itins = DEFAULT_ITINS> {
   let isCommutable = 1 in
   def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
         (ins RC:$src1, RC:$src2, u32u8imm:$src3),
@@ -6809,7 +6789,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
         OpSize;
   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
         (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
@@ -6820,7 +6800,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
           (IntId RC:$src1,
-           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
         OpSize;
 }
 
@@ -6828,40 +6808,40 @@ let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
     let ExeDomain = SSEPackedSingle in {
     defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
-                                        VR128, memopv4f32, f128mem, 0>, VEX_4V;
+                                        VR128, loadv4f32, f128mem, 0>, VEX_4V;
     defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
-                                    int_x86_avx_blend_ps_256, VR256, memopv8f32,
+                                    int_x86_avx_blend_ps_256, VR256, loadv8f32,
                                     f256mem, 0>, VEX_4V, VEX_L;
     }
     let ExeDomain = SSEPackedDouble in {
     defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
-                                        VR128, memopv2f64, f128mem, 0>, VEX_4V;
+                                        VR128, loadv2f64, f128mem, 0>, VEX_4V;
     defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
-                                     int_x86_avx_blend_pd_256,VR256, memopv4f64,
+                                     int_x86_avx_blend_pd_256,VR256, loadv4f64,
                                      f256mem, 0>, VEX_4V, VEX_L;
     }
   defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
-                                      VR128, memopv2i64, i128mem, 0>, VEX_4V;
+                                      VR128, loadv2i64, i128mem, 0>, VEX_4V;
   defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
-                                      VR128, memopv2i64, i128mem, 0>, VEX_4V;
+                                      VR128, loadv2i64, i128mem, 0>, VEX_4V;
   }
   let ExeDomain = SSEPackedSingle in
   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
-                                   VR128, memopv4f32, f128mem, 0>, VEX_4V;
+                                   VR128, loadv4f32, f128mem, 0>, VEX_4V;
   let ExeDomain = SSEPackedDouble in
   defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
-                                   VR128, memopv2f64, f128mem, 0>, VEX_4V;
+                                   VR128, loadv2f64, f128mem, 0>, VEX_4V;
   let ExeDomain = SSEPackedSingle in
   defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
-                                  VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L;
+                                  VR256, loadv8f32, i256mem, 0>, VEX_4V, VEX_L;
 }
 
 let Predicates = [HasAVX2] in {
   let isCommutable = 0 in {
   defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
-                                  VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
-                                  VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   }
 }
 
@@ -6869,21 +6849,27 @@ let Constraints = "$src1 = $dst" in {
   let isCommutable = 0 in {
   let ExeDomain = SSEPackedSingle in
   defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
-                                     VR128, memopv4f32, f128mem>;
+                                     VR128, memopv4f32, f128mem,
+                                     1, SSE_INTALU_ITINS_P>;
   let ExeDomain = SSEPackedDouble in
   defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
-                                     VR128, memopv2f64, f128mem>;
+                                     VR128, memopv2f64, f128mem,
+                                     1, SSE_INTALU_ITINS_P>;
   defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
-                                     VR128, memopv2i64, i128mem>;
+                                     VR128, memopv2i64, i128mem,
+                                     1, SSE_INTALU_ITINS_P>;
   defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
-                                     VR128, memopv2i64, i128mem>;
+                                     VR128, memopv2i64, i128mem,
+                                     1, SSE_INTMUL_ITINS_P>;
   }
   let ExeDomain = SSEPackedSingle in
   defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
-                                  VR128, memopv4f32, f128mem>;
+                                  VR128, memopv4f32, f128mem, 1,
+                                  SSE_DPPS_ITINS>;
   let ExeDomain = SSEPackedDouble in
   defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
-                                  VR128, memopv2f64, f128mem>;
+                                  VR128, memopv2f64, f128mem, 1,
+                                  SSE_DPPD_ITINS>;
 }
 
 /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
@@ -6910,23 +6896,23 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
 let Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedDouble in {
 defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
-                                           memopv2f64, int_x86_sse41_blendvpd>;
+                                           loadv2f64, int_x86_sse41_blendvpd>;
 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
-                                  memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
+                                  loadv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
 } // ExeDomain = SSEPackedDouble
 let ExeDomain = SSEPackedSingle in {
 defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
-                                           memopv4f32, int_x86_sse41_blendvps>;
+                                           loadv4f32, int_x86_sse41_blendvps>;
 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
-                                  memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
+                                  loadv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
 } // ExeDomain = SSEPackedSingle
 defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
-                                           memopv2i64, int_x86_sse41_pblendvb>;
+                                           loadv2i64, int_x86_sse41_pblendvb>;
 }
 
 let Predicates = [HasAVX2] in {
 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
-                                      memopv4i64, int_x86_avx2_pblendvb>, VEX_L;
+                                      loadv4i64, int_x86_avx2_pblendvb>, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
@@ -6979,7 +6965,7 @@ let Predicates = [HasAVX] in {
 let Predicates = [HasAVX2] in {
   def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                             (v32i8 VR256:$src2))),
-            (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>;
+            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
   def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
                                (imm:$mask))),
             (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
@@ -6988,13 +6974,14 @@ let Predicates = [HasAVX2] in {
 /// SS41I_ternary_int - SSE 4.1 ternary operator
 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
   multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
-                               X86MemOperand x86memop, Intrinsic IntId> {
+                               X86MemOperand x86memop, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
     def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr,
                      "\t{$src2, $dst|$dst, $src2}"),
-                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
-                    OpSize;
+                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
+                    itins.rr>, OpSize;
 
     def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, x86memop:$src2),
@@ -7002,7 +6989,8 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
                      "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst,
                       (IntId VR128:$src1,
-                       (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize;
+                       (bitconvert (mem_frag addr:$src2)), XMM0))],
+                       itins.rm>, OpSize;
   }
 }
 
@@ -7099,11 +7087,11 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 let Predicates = [HasAVX] in
   defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
-                                 memopv2i64, i128mem, 0>, VEX_4V;
+                                 loadv2i64, i128mem, 0>, VEX_4V;
 
 let Predicates = [HasAVX2] in
   defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
 
 let Constraints = "$src1 = $dst" in
   defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
@@ -7263,70 +7251,100 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
 // crc intrinsic instruction
 // This set of instructions are only rm, the only difference is the size
 // of r and m.
+class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
+                   RegisterClass RCIn, SDPatternOperator Int> :
+  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
+         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>;
+
+class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
+                   X86MemOperand x86memop, SDPatternOperator Int> :
+  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
+         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
+         IIC_CRC32_MEM>;
+
 let Constraints = "$src1 = $dst" in {
-  def CRC32r32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
-                      (ins GR32:$src1, i8mem:$src2),
-                      "crc32{b}\t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_8 GR32:$src1,
-                         (load addr:$src2)))]>;
-  def CRC32r32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
-                      (ins GR32:$src1, GR8:$src2),
-                      "crc32{b}\t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
-  def CRC32r32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
-                      (ins GR32:$src1, i16mem:$src2),
-                      "crc32{w}\t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_16 GR32:$src1,
-                         (load addr:$src2)))]>,
-                         OpSize;
-  def CRC32r32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
-                      (ins GR32:$src1, GR16:$src2),
-                      "crc32{w}\t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
-                         OpSize;
-  def CRC32r32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
-                      (ins GR32:$src1, i32mem:$src2),
-                      "crc32{l}\t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_32 GR32:$src1,
-                         (load addr:$src2)))]>;
-  def CRC32r32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
-                      (ins GR32:$src1, GR32:$src2),
-                      "crc32{l}\t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
-  def CRC32r64m8  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
-                      (ins GR64:$src1, i8mem:$src2),
-                      "crc32{b}\t{$src2, $src1|$src1, $src2}",
-                       [(set GR64:$dst,
-                         (int_x86_sse42_crc32_64_8 GR64:$src1,
-                         (load addr:$src2)))]>,
-                         REX_W;
-  def CRC32r64r8  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
-                      (ins GR64:$src1, GR8:$src2),
-                      "crc32{b}\t{$src2, $src1|$src1, $src2}",
-                       [(set GR64:$dst,
-                         (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
-                         REX_W;
-  def CRC32r64m64  : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
-                      (ins GR64:$src1, i64mem:$src2),
-                      "crc32{q}\t{$src2, $src1|$src1, $src2}",
-                       [(set GR64:$dst,
-                         (int_x86_sse42_crc32_64_64 GR64:$src1,
-                         (load addr:$src2)))]>,
-                         REX_W;
-  def CRC32r64r64  : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
-                      (ins GR64:$src1, GR64:$src2),
-                      "crc32{q}\t{$src2, $src1|$src1, $src2}",
-                       [(set GR64:$dst,
-                         (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
-                         REX_W;
+  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
+                                 int_x86_sse42_crc32_32_8>;
+  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
+                                 int_x86_sse42_crc32_32_8>;
+  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
+                                 int_x86_sse42_crc32_32_16>, OpSize;
+  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
+                                 int_x86_sse42_crc32_32_16>, OpSize;
+  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
+                                 int_x86_sse42_crc32_32_32>;
+  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
+                                 int_x86_sse42_crc32_32_32>;
+  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
+                                 int_x86_sse42_crc32_64_64>, REX_W;
+  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
+                                 int_x86_sse42_crc32_64_64>, REX_W;
+  let hasSideEffects = 0 in {
+    let mayLoad = 1 in
+    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
+                                   null_frag>, REX_W;
+    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
+                                   null_frag>, REX_W;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// SHA-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
+                      bit UsesXMM0 = 0> {
+  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
+             (ins VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+             [!if(UsesXMM0,
+                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
+                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
+
+  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+             [!if(UsesXMM0,
+                  (set VR128:$dst, (IntId VR128:$src1,
+                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
+                  (set VR128:$dst, (IntId VR128:$src1,
+                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
+}
+
+let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
+  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
+                         (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                         [(set VR128:$dst,
+                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
+                            (i8 imm:$src3)))]>, TA;
+  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
+                         (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                         [(set VR128:$dst,
+                           (int_x86_sha1rnds4 VR128:$src1,
+                            (bc_v4i32 (memopv2i64 addr:$src2)),
+                            (i8 imm:$src3)))]>, TA;
+
+  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
+  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
+  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;
+
+  let Uses=[XMM0] in
+  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;
+
+  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
+  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
 }
 
+// Aliases with explicit %xmm0
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
+
 //===----------------------------------------------------------------------===//
 // AES-NI Instructions
 //===----------------------------------------------------------------------===//
@@ -7383,7 +7401,7 @@ let Predicates = [HasAVX, HasAES] in {
   def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
       (ins i128mem:$src1),
       "vaesimc\t{$src1, $dst|$dst, $src1}",
-      [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
       OpSize, VEX;
 }
 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
@@ -7410,7 +7428,7 @@ let Predicates = [HasAVX, HasAES] in {
       (ins i128mem:$src1, i8imm:$src2),
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set VR128:$dst,
-        (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
       OpSize, VEX;
 }
 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
@@ -7441,7 +7459,7 @@ def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
-                              (memopv2i64 addr:$src2), imm:$src3))]>;
+                              (loadv2i64 addr:$src2), imm:$src3))]>;
 
 // Carry-less Multiplication instructions
 let Constraints = "$src1 = $dst" in {
@@ -7449,13 +7467,15 @@ def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, VR128:$src2, i8imm:$src3),
            "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            [(set VR128:$dst,
-             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
+             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
+             IIC_SSE_PCLMULQDQ_RR>;
 
 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
            "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
-                              (memopv2i64 addr:$src2), imm:$src3))]>;
+                              (memopv2i64 addr:$src2), imm:$src3))],
+                              IIC_SSE_PCLMULQDQ_RM>;
 } // Constraints = "$src1 = $dst"
 
 
@@ -7595,11 +7615,11 @@ def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
 
-def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
@@ -7623,22 +7643,22 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
 
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
 def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
-                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
 def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
-                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
 def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
-                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
@@ -7670,12 +7690,12 @@ def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
                     (v4f64 VR256:$src1),
                     (EXTRACT_get_vextract128_imm VR128:$ext)))>;
 
-def : Pat<(alignedstore (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTF128mr addr:$dst, VR256:$src1,
            (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTF128mr addr:$dst, VR256:$src1,
            (EXTRACT_get_vextract128_imm VR128:$ext))>;
 }
@@ -7785,15 +7805,15 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
 
 let ExeDomain = SSEPackedSingle in {
   defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
-                               memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
+                               loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
   defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
-                       memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
+                       loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
   defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
-                               memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
+                               loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
   defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
-                       memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
+                       loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
@@ -7801,15 +7821,15 @@ def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
           (VPERMILPSYri VR256:$src1, imm:$imm)>;
 def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
           (VPERMILPDYri VR256:$src1, imm:$imm)>;
-def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)),
+def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)),
                                (i8 imm:$imm))),
           (VPERMILPSYmi addr:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))),
+def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))),
           (VPERMILPDYmi addr:$src1, imm:$imm)>;
 
 def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
           (VPERMILPDri VR128:$src1, imm:$imm)>;
-def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))),
+def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))),
           (VPERMILPDmi addr:$src1, imm:$imm)>;
 }
 
@@ -7825,7 +7845,7 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2),
+          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
                              (i8 imm:$src3)))]>, VEX_4V, VEX_L;
 }
 
@@ -7833,7 +7853,7 @@ let Predicates = [HasAVX] in {
 def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
-                  (memopv4f64 addr:$src2), (i8 imm:$imm))),
+                  (loadv4f64 addr:$src2), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 }
 
@@ -7848,16 +7868,16 @@ def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 
 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
-                  (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
-                  (memopv4i64 addr:$src2), (i8 imm:$imm))),
+                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
-                  (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
-                  (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+                  (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 }
 
@@ -7901,7 +7921,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
                TA, OpSize, VEX;
 }
 
-let Predicates = [HasAVX, HasF16C] in {
+let Predicates = [HasF16C] in {
   defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
   defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
   defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
@@ -7935,9 +7955,9 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
 
 let isCommutable = 0 in {
 defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
-                                   VR128, memopv2i64, i128mem>;
+                                   VR128, loadv2i64, i128mem>;
 defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
-                                    VR256, memopv4i64, i256mem>, VEX_L;
+                                    VR256, loadv4i64, i256mem>, VEX_L;
 }
 
 def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
@@ -8115,9 +8135,9 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                    VEX_4V, VEX_L;
 }
 
-defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>;
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, v8f32>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>;
 
 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          ValueType OpVT> {
@@ -8137,9 +8157,9 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                               (i8 imm:$src2))))]>, VEX, VEX_L;
 }
 
-defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W;
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W;
 let ExeDomain = SSEPackedDouble in
-defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W;
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W;
 
 //===----------------------------------------------------------------------===//
 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
@@ -8152,7 +8172,7 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
+          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                              (i8 imm:$src3)))]>, VEX_4V, VEX_L;
 
 let Predicates = [HasAVX2] in {
@@ -8163,13 +8183,13 @@ def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 
-def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)),
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
                   (i8 imm:$imm))),
           (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
-                   (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)),
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
                   (i8 imm:$imm))),
           (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
 }
@@ -8208,22 +8228,22 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
 
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                    (iPTR imm)),
           (VINSERTI128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
 def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
-                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTI128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
 def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
-                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTI128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
 def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
-                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTI128rm VR256:$src1, addr:$src2,
                          (INSERT_get_vinsert128_imm VR256:$ins))>;
@@ -8262,20 +8282,20 @@ def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
                     (v32i8 VR256:$src1),
                     (EXTRACT_get_vextract128_imm VR128:$ext)))>;
 
-def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTI128mr addr:$dst, VR256:$src1,
            (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTI128mr addr:$dst, VR256:$src1,
            (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTI128mr addr:$dst, VR256:$src1,
            (EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTI128mr addr:$dst, VR256:$src1,
            (EXTRACT_get_vextract128_imm VR128:$ext))>;
 }
@@ -8333,7 +8353,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                (vt128 (OpNode VR128:$src1,
-                       (vt128 (bitconvert (memopv2i64 addr:$src2))))))]>,
+                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
              VEX_4V;
   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
              (ins VR256:$src1, VR256:$src2),
@@ -8346,7 +8366,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst,
                (vt256 (OpNode VR256:$src1,
-                       (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>,
+                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
              VEX_4V, VEX_L;
 }
 
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 2aa08fa..2b6ee5c 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -20,23 +20,21 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
            [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
-  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
-  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
-  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
-  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
-  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
-  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
-  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
-  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
-  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
-  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
-  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
-  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
-  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
-  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
-}
+defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
+defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
+defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
+defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
+defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
+defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
+defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
+defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
+defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
+defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
+defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
+defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
+defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
+defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
+defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
 
 // Scalar load 2 addr operand instructions
 multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
@@ -49,12 +47,10 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VFRCZSS   : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
-                   ssmem, sse_load_f32>;
-  defm VFRCZSD   : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
-                   sdmem, sse_load_f64>;
-}
+defm VFRCZSS   : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+                 ssmem, sse_load_f32>;
+defm VFRCZSD   : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+                 sdmem, sse_load_f64>;
 
 multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
                      PatFrag memop> {
@@ -66,10 +62,8 @@ multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
-  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
-}
+defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
+defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
 
 multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
                      PatFrag memop> {
@@ -81,12 +75,8 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
-                           memopv8f32>;
-  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
-                           memopv4f64>;
-}
+defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, memopv8f32>;
+defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, memopv4f64>;
 
 multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
@@ -107,20 +97,18 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
              VEX_4VOp3;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
-  defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
-  defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
-  defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
-  defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
-  defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
-  defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
-  defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
-  defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
-  defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
-  defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
-  defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
-}
+defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
+defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
+defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
+defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
+defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
+defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
+defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
+defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
+defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
+defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
+defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
+defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
 
 multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
@@ -134,12 +122,10 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
              (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
-  defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
-  defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
-  defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
-}
+defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
+defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
+defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
+defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
 
 // Instruction where second source can be memory, but third must be register
 multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -158,20 +144,18 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
               VR128:$src3))]>, VEX_4V, VEX_I8IMM;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPMADCSWD  : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
-  defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
-  defm VPMACSWW   : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
-  defm VPMACSWD   : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
-  defm VPMACSSWW  : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
-  defm VPMACSSWD  : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
-  defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
-  defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
-  defm VPMACSSDD  : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
-  defm VPMACSDQL  : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
-  defm VPMACSDQH  : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
-  defm VPMACSDD   : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
-}
+defm VPMADCSWD  : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+defm VPMACSWW   : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+defm VPMACSWD   : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+defm VPMACSSWW  : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+defm VPMACSSWD  : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+defm VPMACSSDD  : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+defm VPMACSDQL  : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+defm VPMACSDQH  : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+defm VPMACSDD   : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
 
 // Instruction where second source can be memory, third must be imm8
 multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -190,16 +174,14 @@ multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
               imm:$src3))]>, VEX_4V;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPCOMB  : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
-  defm VPCOMW  : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
-  defm VPCOMD  : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
-  defm VPCOMQ  : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
-  defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
-  defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
-  defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
-  defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
-}
+defm VPCOMB  : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
+defm VPCOMW  : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
+defm VPCOMD  : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
+defm VPCOMQ  : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
+defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
+defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
+defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
+defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
 
 // Instruction where either second or third source can be memory
 multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -227,10 +209,8 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
            VEX_4V, VEX_I8IMM;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
-  defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
-}
+defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
+defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
 
 multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
@@ -257,9 +237,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
            VEX_4V, VEX_I8IMM, VEX_L;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
-}
+defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
 
 multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
                   Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index fc86e1e..e99f2d9 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -127,7 +127,7 @@ extern "C" {
     "movaps  %xmm6, 96(%rsp)\n"
     "movaps  %xmm7, 112(%rsp)\n"
     // JIT callee
-#ifdef _WIN64
+#if defined(_WIN64) || defined(__CYGWIN__)
     "subq    $32, %rsp\n"
     "movq    %rbp, %rcx\n"    // Pass prev frame and return address
     "movq    8(%rbp), %rdx\n"
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index c7c00b5..6649c82 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -17,6 +17,7 @@
 #include "X86COFFMachineModuleInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -34,14 +35,12 @@ namespace {
 /// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
 class X86MCInstLower {
   MCContext &Ctx;
-  Mangler *Mang;
   const MachineFunction &MF;
   const TargetMachine &TM;
   const MCAsmInfo &MAI;
   X86AsmPrinter &AsmPrinter;
 public:
-  X86MCInstLower(Mangler *mang, const MachineFunction &MF,
-                 X86AsmPrinter &asmprinter);
+  X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
 
   void Lower(const MachineInstr *MI, MCInst &OutMI) const;
 
@@ -50,13 +49,16 @@ public:
 
 private:
   MachineModuleInfoMachO &getMachOMMI() const;
+  Mangler *getMang() const {
+    return AsmPrinter.Mang;
+  }
 };
 
 } // end anonymous namespace
 
-X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf,
+X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
                                X86AsmPrinter &asmprinter)
-: Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()),
+: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()),
   MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {}
 
 MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
@@ -81,7 +83,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
         MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
       isImplicitlyPrivate = true;
 
-    Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+    getMang()->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
   } else if (MO.isSymbol()) {
     Name += MAI.getGlobalPrefix();
     Name += MO.getSymbolName();
@@ -110,7 +112,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym =
         MachineModuleInfoImpl::
-        StubValueTy(Mang->getSymbol(MO.getGlobal()),
+        StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
                     !MO.getGlobal()->hasInternalLinkage());
     }
     return Sym;
@@ -124,7 +126,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym =
         MachineModuleInfoImpl::
-        StubValueTy(Mang->getSymbol(MO.getGlobal()),
+        StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
                     !MO.getGlobal()->hasInternalLinkage());
     }
     return Sym;
@@ -140,7 +142,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
     if (MO.isGlobal()) {
       StubSym =
         MachineModuleInfoImpl::
-        StubValueTy(Mang->getSymbol(MO.getGlobal()),
+        StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
                     !MO.getGlobal()->hasInternalLinkage());
     } else {
       Name.erase(Name.end()-5, Name.end());
@@ -591,18 +593,6 @@ ReSimplify:
   case X86::MOVSX64rr32:
     SimplifyMOVSX(OutMI);
     break;
-
-  case X86::MORESTACK_RET:
-    OutMI.setOpcode(X86::RET);
-    break;
-
-  case X86::MORESTACK_RET_RESTORE_R10:
-    OutMI.setOpcode(X86::MOV64rr);
-    OutMI.addOperand(MCOperand::CreateReg(X86::R10));
-    OutMI.addOperand(MCOperand::CreateReg(X86::RAX));
-
-    AsmPrinter.OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
-    break;
   }
 }
 
@@ -685,8 +675,140 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
     .addExpr(tlsRef));
 }
 
+static std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
+parseMemoryOperand(StackMaps::Location::LocationType LocTy, unsigned Size,
+                   MachineInstr::const_mop_iterator MOI,
+                   MachineInstr::const_mop_iterator MOE) {
+
+  typedef StackMaps::Location Location;
+
+  assert(std::distance(MOI, MOE) >= 5 && "Too few operands to encode mem op.");
+
+  const MachineOperand &Base = *MOI;
+  const MachineOperand &Scale = *(++MOI);
+  const MachineOperand &Index = *(++MOI);
+  const MachineOperand &Disp = *(++MOI);
+  const MachineOperand &ZeroReg = *(++MOI);
+
+  // Sanity check for supported operand format.
+  assert(Base.isReg() &&
+         Scale.isImm() && Scale.getImm() == 1 &&
+         Index.isReg() && Index.getReg() == 0 &&
+         Disp.isImm() && ZeroReg.isReg() && (ZeroReg.getReg() == 0) &&
+         "Unsupported x86 memory operand sequence.");
+  (void)Scale;
+  (void)Index;
+  (void)ZeroReg;
+
+  return std::make_pair(
+    Location(LocTy, Size, Base.getReg(), Disp.getImm()), ++MOI);
+}
+
+std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
+X86AsmPrinter::stackmapOperandParser(MachineInstr::const_mop_iterator MOI,
+                                     MachineInstr::const_mop_iterator MOE,
+                                     const TargetMachine &TM) {
+
+  typedef StackMaps::Location Location;
+
+  const MachineOperand &MOP = *MOI;
+  assert(!MOP.isRegMask() && (!MOP.isReg() || !MOP.isImplicit()) &&
+         "Register mask and implicit operands should not be processed.");
+
+  if (MOP.isImm()) {
+    // Verify anyregcc
+    // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+
+    switch (MOP.getImm()) {
+    default: llvm_unreachable("Unrecognized operand type.");
+    case StackMaps::DirectMemRefOp: {
+      unsigned Size = TM.getDataLayout()->getPointerSizeInBits();
+      assert((Size % 8) == 0 && "Need pointer size in bytes.");
+      Size /= 8;
+      return parseMemoryOperand(StackMaps::Location::Direct, Size,
+                                llvm::next(MOI), MOE);
+    }
+    case StackMaps::IndirectMemRefOp: {
+      ++MOI;
+      int64_t Size = MOI->getImm();
+      assert(Size > 0 && "Need a valid size for indirect memory locations.");
+      return parseMemoryOperand(StackMaps::Location::Indirect, Size,
+                                llvm::next(MOI), MOE);
+    }
+    case StackMaps::ConstantOp: {
+      ++MOI;
+      assert(MOI->isImm() && "Expected constant operand.");
+      int64_t Imm = MOI->getImm();
+      return std::make_pair(
+        Location(Location::Constant, sizeof(int64_t), 0, Imm), ++MOI);
+    }
+    }
+  }
+
+  // Otherwise this is a reg operand. The physical register number will
+  // ultimately be encoded as a DWARF regno. The stack map also records the size
+  // of a spill slot that can hold the register content. (The runtime can
+  // track the actual size of the data type if it needs to.)
+  assert(MOP.isReg() && "Expected register operand here.");
+  assert(TargetRegisterInfo::isPhysicalRegister(MOP.getReg()) &&
+         "Virtreg operands should have been rewritten before now.");
+  const TargetRegisterClass *RC =
+    TM.getRegisterInfo()->getMinimalPhysRegClass(MOP.getReg());
+  assert(!MOP.getSubReg() && "Physical subreg still around.");
+  return std::make_pair(
+    Location(Location::Register, RC->getSize(), MOP.getReg(), 0), ++MOI);
+}
+
+// Lower a stackmap of the form:
+// <id>, <shadowBytes>, ...
+static void LowerSTACKMAP(MCStreamer &OutStreamer,
+                          StackMaps &SM,
+                          const MachineInstr &MI)
+{
+  unsigned NumNOPBytes = MI.getOperand(1).getImm();
+  SM.recordStackMap(MI);
+  // Emit padding.
+  // FIXME: These nops ensure that the stackmap's shadow is covered by
+  // instructions from the same basic block, but the nops should not be
+  // necessary if instructions from the same block follow the stackmap.
+  for (unsigned i = 0; i < NumNOPBytes; ++i)
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+static void LowerPATCHPOINT(MCStreamer &OutStreamer,
+                            StackMaps &SM,
+                            const MachineInstr &MI) {
+  SM.recordPatchPoint(MI);
+
+  PatchPointOpers opers(&MI);
+  unsigned ScratchIdx = opers.getNextScratchIdx();
+  unsigned EncodedBytes = 0;
+  int64_t CallTarget = opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+  if (CallTarget) {
+    // Emit MOV to materialize the target address and the CALL to target.
+    // This is encoded with 12-13 bytes, depending on which register is used.
+    // We conservatively assume that it is 12 bytes and emit in worst case one
+    // extra NOP byte.
+    EncodedBytes = 12;
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64ri)
+                                .addReg(MI.getOperand(ScratchIdx).getReg())
+                                .addImm(CallTarget));
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::CALL64r)
+                                .addReg(MI.getOperand(ScratchIdx).getReg()));
+  }
+  // Emit padding.
+  unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+
+  for (unsigned i = EncodedBytes; i < NumBytes; ++i)
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP));
+}
+
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  X86MCInstLower MCInstLowering(Mang, *MF, *this);
+  X86MCInstLower MCInstLowering(*MF, *this);
   switch (MI->getOpcode()) {
   case TargetOpcode::DBG_VALUE:
     llvm_unreachable("Should be handled target independently");
@@ -774,6 +896,24 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addExpr(DotExpr));
     return;
   }
+
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(OutStreamer, SM, *MI);
+
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(OutStreamer, SM, *MI);
+
+  case X86::MORESTACK_RET:
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
+    return;
+
+  case X86::MORESTACK_RET_RESTORE_R10:
+    // Return, then restore R10.
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64rr)
+      .addReg(X86::R10)
+      .addReg(X86::RAX));
+    return;
   }
 
   MCInst TmpInst;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 0923310..dbda556 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -101,8 +101,8 @@ int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
 
 bool
 X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
-  // Only enable when post-RA scheduling is enabled and this is needed.
-  return TM.getSubtargetImpl()->postRAScheduler();
+  // ExeDepsFixer and PostRAScheduler require liveness.
+  return true;
 }
 
 int
@@ -239,6 +239,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   case CallingConv::HiPE:
     return CSR_NoRegs_SaveList;
 
+  case CallingConv::WebKit_JS:
+    return CSR_64_SaveList;
+  case CallingConv::AnyReg:
+    return CSR_MostRegs_64_SaveList;
+
   case CallingConv::Intel_OCL_BI: {
     bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
     bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
@@ -296,6 +301,8 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
   }
   if (CC == CallingConv::GHC || CC == CallingConv::HiPE)
     return CSR_NoRegs_RegMask;
+  if (CC == CallingConv::WebKit_JS || CC == CallingConv::AnyReg)
+    return CSR_MostRegs_64_RegMask;
   if (!Is64Bit)
     return CSR_32_RegMask;
   if (CC == CallingConv::Cold)
@@ -512,14 +519,6 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return TFI->hasFP(MF) ? FramePtr : StackPtr;
 }
 
-unsigned X86RegisterInfo::getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned X86RegisterInfo::getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
-
 namespace llvm {
 unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
                                 bool High) {
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index fb17682..22251b2 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -126,10 +126,6 @@ public:
   unsigned getBaseRegister() const { return BasePtr; }
   // FIXME: Move to FrameInfok
   unsigned getSlotSize() const { return SlotSize; }
-
-  // Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
 };
 
 // getX86SubSuperRegister - X86 utility function. It returns the sub or super
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 62ba2bc..9748261 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -19,6 +19,10 @@ def HaswellModel : SchedMachineModel {
   let MicroOpBufferSize = 192; // Based on the reorder buffer.
   let LoadLatency = 4;
   let MispredictPenalty = 16;
+
+  // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
 }
 
 let SchedModel = HaswellModel in {
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index 52ead94..3011c6d 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -20,6 +20,10 @@ def SandyBridgeModel : SchedMachineModel {
   let MicroOpBufferSize = 168; // Based on the reorder buffer.
   let LoadLatency = 4;
   let MispredictPenalty = 16;
+
+  // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
 }
 
 let SchedModel = SandyBridgeModel in {
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index ceb2e05..0556437 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -141,9 +141,12 @@ def IIC_IDIV64      : InstrItinClass;
 // neg/not/inc/dec
 def IIC_UNARY_REG   : InstrItinClass;
 def IIC_UNARY_MEM   : InstrItinClass;
-// add/sub/and/or/xor/adc/sbc/cmp/test
+// add/sub/and/or/xor/sbc/cmp/test
 def IIC_BIN_MEM     : InstrItinClass;
 def IIC_BIN_NONMEM  : InstrItinClass;
+// adc/sbc
+def IIC_BIN_CARRY_MEM     : InstrItinClass;
+def IIC_BIN_CARRY_NONMEM  : InstrItinClass;
 // shift/rotate
 def IIC_SR          : InstrItinClass;
 // shift double
@@ -250,11 +253,11 @@ def IIC_SSE_INTSH_P_RR : InstrItinClass;
 def IIC_SSE_INTSH_P_RM : InstrItinClass;
 def IIC_SSE_INTSH_P_RI : InstrItinClass;
 
-def IIC_SSE_CMPP_RR : InstrItinClass;
-def IIC_SSE_CMPP_RM : InstrItinClass;
+def IIC_SSE_INTSHDQ_P_RI : InstrItinClass;
 
 def IIC_SSE_SHUFP : InstrItinClass;
-def IIC_SSE_PSHUF : InstrItinClass;
+def IIC_SSE_PSHUF_RI : InstrItinClass;
+def IIC_SSE_PSHUF_MI : InstrItinClass;
 
 def IIC_SSE_UNPCK : InstrItinClass;
 
@@ -316,7 +319,8 @@ def IIC_SSE_PSIGN_RM : InstrItinClass;
 
 def IIC_SSE_PMADD : InstrItinClass;
 def IIC_SSE_PMULHRSW : InstrItinClass;
-def IIC_SSE_PALIGNR : InstrItinClass;
+def IIC_SSE_PALIGNRR : InstrItinClass;
+def IIC_SSE_PALIGNRM : InstrItinClass;
 def IIC_SSE_MWAIT : InstrItinClass;
 def IIC_SSE_MONITOR : InstrItinClass;
 
@@ -492,8 +496,8 @@ def IIC_PUSH_REG : InstrItinClass;
 def IIC_PUSH_F : InstrItinClass;
 def IIC_PUSH_A : InstrItinClass;
 def IIC_BSWAP : InstrItinClass;
-def IIC_BSF : InstrItinClass;
-def IIC_BSR : InstrItinClass;
+def IIC_BIT_SCAN_MEM : InstrItinClass;
+def IIC_BIT_SCAN_REG : InstrItinClass;
 def IIC_MOVS : InstrItinClass;
 def IIC_STOS : InstrItinClass;
 def IIC_SCAS : InstrItinClass;
@@ -540,6 +544,33 @@ def IIC_BOUND : InstrItinClass;
 def IIC_ARPL_REG : InstrItinClass;
 def IIC_ARPL_MEM : InstrItinClass;
 def IIC_MOVBE : InstrItinClass;
+def IIC_AES   : InstrItinClass;
+def IIC_BLEND_MEM : InstrItinClass;
+def IIC_BLEND_NOMEM : InstrItinClass;
+def IIC_CBW   : InstrItinClass;
+def IIC_CRC32_REG : InstrItinClass;
+def IIC_CRC32_MEM : InstrItinClass;
+def IIC_SSE_DPPD_RR : InstrItinClass;
+def IIC_SSE_DPPD_RM : InstrItinClass;
+def IIC_SSE_DPPS_RR : InstrItinClass;
+def IIC_SSE_DPPS_RM : InstrItinClass;
+def IIC_MMX_EMMS : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RR : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RM : InstrItinClass;
+def IIC_SSE_INSERTPS_RR : InstrItinClass;
+def IIC_SSE_INSERTPS_RM : InstrItinClass;
+def IIC_SSE_MPSADBW_RR : InstrItinClass;
+def IIC_SSE_MPSADBW_RM : InstrItinClass;
+def IIC_SSE_PMULLD_RR : InstrItinClass;
+def IIC_SSE_PMULLD_RM : InstrItinClass;
+def IIC_SSE_ROUNDPS_REG : InstrItinClass;
+def IIC_SSE_ROUNDPS_MEM : InstrItinClass;
+def IIC_SSE_ROUNDPD_REG : InstrItinClass;
+def IIC_SSE_ROUNDPD_MEM : InstrItinClass;
+def IIC_SSE_POPCNT_RR : InstrItinClass;
+def IIC_SSE_POPCNT_RM : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RR : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RM : InstrItinClass;
 
 def IIC_NOP : InstrItinClass;
 
@@ -561,7 +592,7 @@ def IIC_NOP : InstrItinClass;
 // latencies. Since these latencies are not used for pipeline hazards,
 // they do not need to be exact.
 //
-// The GenericModel contains no instruciton itineraries.
+// The GenericModel contains no instruction itineraries.
 def GenericModel : SchedMachineModel {
   let IssueWidth = 4;
   let MicroOpBufferSize = 32;
@@ -572,3 +603,4 @@ def GenericModel : SchedMachineModel {
 include "X86ScheduleAtom.td"
 include "X86SchedSandyBridge.td"
 include "X86SchedHaswell.td"
+include "X86ScheduleSLM.td"
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 14a1471..ba72f29 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the itinerary class data for the Intel Atom (Bonnell)
-// processors.
+// This file defines the itinerary class data for the Intel Atom
+// in order (Saltwell-32nm/Bonnell-45nm) processors.
 //
 //===----------------------------------------------------------------------===//
 
@@ -79,9 +79,12 @@ def AtomItineraries : ProcessorItineraries<
   // neg/not/inc/dec
   InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
   InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
-  // add/sub/and/or/xor/adc/sbc/cmp/test
+  // add/sub/and/or/xor/cmp/test
   InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
   InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
+  // adc/sbc
+  InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >,
   // shift/rotate
   InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
   // shift double
@@ -203,11 +206,11 @@ def AtomItineraries : ProcessorItineraries<
   InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
   InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
 
-  InstrItinData<IIC_SSE_CMPP_RR, [InstrStage<6, [Port0, Port1]>] >,
-  InstrItinData<IIC_SSE_CMPP_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >,
 
   InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
-  InstrItinData<IIC_SSE_PSHUF, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >,
 
   InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
 
@@ -278,7 +281,8 @@ def AtomItineraries : ProcessorItineraries<
 
   InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
   InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
-  InstrItinData<IIC_SSE_PALIGNR, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >,
   InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
   InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
 
@@ -470,8 +474,8 @@ def AtomItineraries : ProcessorItineraries<
   InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
 
   InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
-  InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >,
-  InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >,
   InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
   InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
   InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
@@ -518,6 +522,8 @@ def AtomItineraries : ProcessorItineraries<
   InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
   InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
   InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >,
 
   InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
   ]>;
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
new file mode 100644
index 0000000..6c2a304
--- /dev/null
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -0,0 +1,668 @@
+//===- X86ScheduleSLM.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom
+// (Silvermont) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def IEC_RSV0 : FuncUnit;
+def IEC_RSV1 : FuncUnit;
+def FPC_RSV0 : FuncUnit;
+def FPC_RSV1 : FuncUnit;
+def MEC_RSV : FuncUnit;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def SLMItineraries : ProcessorItineraries<
+  [ IEC_RSV0, IEC_RSV1, FPC_RSV0, FPC_RSV1, MEC_RSV ],
+  [], [
+  // [InstrStage<N, [FPC_RSV0, FPC_RSV1]>] 
+  // [InstrStage<N, [FPC_RSV0, FPC_RSV1], 0>, InstrStage<N, [MEC_RSV]>]
+  // [InstrStage<N, [IEC_RSV0, IEC_RSV1]>] 
+  // [InstrStage<N, [IEC_RSV0, IEC_RSV1], 0>,InstrStage<N,[MEC_RSV]>]
+  //
+  // Default is 1 cycle, IEC_RSV0 or IEC_RSV1
+  //InstrItinData<IIC_DEFAULT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_ALU_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LEA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LEA_16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // mul
+  InstrItinData<IIC_MUL8, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MUL16_MEM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_MUL16_REG, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MUL32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_MUL32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MUL64, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  // imul by al, ax, eax, rax
+  InstrItinData<IIC_IMUL8, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL16_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL16_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL32_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL64, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  // imul reg by reg|mem
+  InstrItinData<IIC_IMUL16_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL16_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL32_RM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL32_RR, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL64_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL64_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>]  >,
+  // imul reg = reg/mem * imm
+  InstrItinData<IIC_IMUL16_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL32_RRI, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL64_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL16_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL32_RMI, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL64_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  // idiv - min latency
+  InstrItinData<IIC_IDIV8, [InstrStage<34, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IDIV16, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IDIV32, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IDIV64, [InstrStage<49, [IEC_RSV0, IEC_RSV1]>] >,
+  // div - min latency
+  InstrItinData<IIC_DIV8_REG, [InstrStage<25, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_DIV8_MEM, [InstrStage<25, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<25, [MEC_RSV]>] >,
+  InstrItinData<IIC_DIV16, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_DIV32, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_DIV64, [InstrStage<38, [IEC_RSV0, IEC_RSV1]>] >,
+  // neg/not/inc/dec
+  InstrItinData<IIC_UNARY_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  // add/sub/and/or/xor/adc/sbc/cmp/test
+  InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BIN_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  // adc/sbb
+  InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  // shift/rotate
+  InstrItinData<IIC_SR, [InstrStage<1, [IEC_RSV0], 0>,
+                   InstrStage<1, [MEC_RSV]>] >,
+  // shift double
+  InstrItinData<IIC_SHD16_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD16_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
+                   InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
+                   InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD32_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
+                   InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
+                   InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SHD64_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD64_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
+                   InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
+                   InstrStage<4, [MEC_RSV]>] >,
+  // cmov
+  InstrItinData<IIC_CMOV16_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMOV16_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMOV32_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMOV32_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMOV64_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMOV64_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+  // set
+  InstrItinData<IIC_SET_M, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SET_R, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // jcc
+  InstrItinData<IIC_Jcc, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // jcxz/jecxz/jrcxz
+  InstrItinData<IIC_JCXZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // jmp rel
+  InstrItinData<IIC_JMP_REL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // jmp indirect
+  InstrItinData<IIC_JMP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_JMP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  // jmp far
+  InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // loop/loope/loopne
+  InstrItinData<IIC_LOOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LOOPE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LOOPNE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // call - all but reg/imm
+  InstrItinData<IIC_CALL_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CALL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  //ret
+  InstrItinData<IIC_RET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_RET_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  //sign extension movs
+  InstrItinData<IIC_MOVSX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  //zero extension movs
+  InstrItinData<IIC_MOVZX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_REP_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_REP_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  // SSE binary operations
+  // arithmetic fp scalar
+  InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<13, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<13, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+
+  // arithmetic fp parallel
+  InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<27, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<27, [MEC_RSV]>] >,
+
+  // bitwise parallel
+  InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  // arithmetic int parallel
+  InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+
+  // multiply int parallel
+  InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [FPC_RSV0], 0>,
+                   InstrStage<5, [MEC_RSV]>] >,
+
+  // shift parallel
+  InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<2, [FPC_RSV0], 0>,
+                   InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [FPC_RSV0]>] >,
+
+  InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [FPC_RSV0]>] >,
+
+  InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [FPC_RSV0], 0>,
+                   InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [FPC_RSV0]>] >,
+
+  InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<26, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<26, [FPC_RSV0], 0>,
+                   InstrStage<26, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<13, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<13, [FPC_RSV0], 0>,
+                   InstrStage<13, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<26, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<26, [FPC_RSV0], 0>,
+                   InstrStage<26, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<13, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<13, [FPC_RSV0], 0>,
+                   InstrStage<13, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<9, [FPC_RSV0], 0>,
+                   InstrStage<9, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [FPC_RSV0], 0>,
+                   InstrStage<4, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_MOVMSK, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MASKMOV, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_LDDQU, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PAUSE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_STMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<9, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<9, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<9, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<5, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MWAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MONITOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  // conversions
+  // to/from PD ...
+  InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<5, [MEC_RSV]>] >,
+  // to/from PS except to/from PD and PS2PI
+  InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+
+  // MMX MOVs
+  InstrItinData<IIC_MMX_MOV_MM_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // other MMX
+  InstrItinData<IIC_MMX_ALU_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_ALU_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PMUL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PSADBW,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PCK_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PCK_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PSHUF,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PEXTR,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PINSRW,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // conversions
+  // from/to PD
+  InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // from/to PI
+  InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_CMPX_LOCK, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_FILD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FLD,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FLD80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_FST,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FST80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FIST,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_FLDZ,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FUCOM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FUCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FCOMI,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FNSTSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FNSTCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FLDCW,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FNINIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FFREE,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FNCLEX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_WAIT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FXAM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FNOP,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FLDL,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_F2XM1,  [InstrStage<88, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FYL2X,  [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FPTAN,  [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FPATAN,  [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FXTRACT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FPREM1,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FPSTP,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FPREM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FYL2XP1,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FSINCOS,  [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FRNDINT,  [InstrStage<25, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FSCALE,  [InstrStage<74, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FCOMPP,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FXSAVE,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FXRSTOR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FXCH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  // System instructions
+  InstrItinData<IIC_CPUID, [InstrStage<60, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_INT,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_INT3,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_INVD,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_INVLPG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IRET,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_HLT,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LXS,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LTR,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_RDTSC, [InstrStage<30, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_RSM,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SIDT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SGDT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SLDT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_STR,    [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SWAPGS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SYSCALL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_IN_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IN_RI,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_OUT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_OUT_IR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_INS,    [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_MOV_REG_DR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_DR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // worst case for mov REG_CRx
+  InstrItinData<IIC_MOV_REG_CR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_CR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_MEM_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_SR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_SR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // LAR
+  InstrItinData<IIC_LAR_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LAR_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // LSL
+  InstrItinData<IIC_LSL_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LSL_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_LGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LLDT_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LLDT_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // push control register, segment registers
+  InstrItinData<IIC_PUSH_CS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_PUSH_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // pop control register, segment registers
+  InstrItinData<IIC_POP_SR,    [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_SR_SS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // VERR, VERW
+  InstrItinData<IIC_VERR,     [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_VERW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_VERW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // WRMSR, RDMSR
+  InstrItinData<IIC_WRMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_RDMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_RDPMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // SMSW, LMSW
+  InstrItinData<IIC_SMSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LMSW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LMSW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_ENTER, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LEAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_POP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_REG16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_FD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_PUSH_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_PUSH_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_PUSH_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_PUSH_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_BSWAP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<10, [MEC_RSV]>] >,
+  InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SCAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_AHF, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BT_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_BT_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_BT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BTX_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_BTX_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_BTX_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BTX_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_XCHG_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_XCHG_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<5, [MEC_RSV]>] >,
+  InstrItinData<IIC_XADD_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_XADD_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<5, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPXCHG_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMPXCHG_8B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPXCHG_16B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LODS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_OUTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CLC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CLI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CLTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_STC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_STI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_STD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_XLAT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_AAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_AAD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_AAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_AAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_DAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_DAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BOUND, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_ARPL_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_ARPL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_MOVBE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_AES, [InstrStage<8, [FPC_RSV0]>] >,
+  InstrItinData<IIC_BLEND_NOMEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_BLEND_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<10, [MEC_RSV]>] >,
+  InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CBW, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CRC32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CRC32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DPPD_RR, [InstrStage<12, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DPPD_RM, [InstrStage<12, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<12, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DPPS_RR, [InstrStage<15, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DPPS_RM, [InstrStage<15, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<15, [MEC_RSV]>] >,
+  InstrItinData<IIC_MMX_EMMS, [InstrStage<10, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_EXTRACTPS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_EXTRACTPS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_INSERTPS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_INSERTPS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MPSADBW_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MPSADBW_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PMULLD_RR, [InstrStage<11, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PMULLD_RM, [InstrStage<11, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<11, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_ROUNDPS_REG, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ROUNDPS_MEM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<5, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_ROUNDPD_REG, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ROUNDPD_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_POPCNT_RR, [InstrStage<4, [IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_POPCNT_RM, [InstrStage<4, [IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PCLMULQDQ_RR, [InstrStage<10, [IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PCLMULQDQ_RM, [InstrStage<10, [IEC_RSV1], 0>,
+                  InstrStage<10, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_NOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >
+  ]>;
+
+// Silvermont machine model.
+def SLMModel : SchedMachineModel {
+  let IssueWidth = 2;  // Allows 2 instructions per scheduling group.
+  let MinLatency = 1;  // InstrStage cycles overrides MinLatency.
+                       // OperandCycles may be used for expected latency.
+  let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
+  let HighLatency = 30;// Expected, may be overriden by OperandCycles.
+
+  let Itineraries = SLMItineraries;
+}
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index d1db79f..b9c620f 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -46,8 +46,6 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
       !ConstantSize ||
       ConstantSize->getZExtValue() >
         Subtarget->getMaxInlineSizeThreshold()) {
-    SDValue InFlag(0, 0);
-
     // Check to see if there is a specialized entry-point for memory zeroing.
     ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
 
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index fae90f2..01353b2 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -276,20 +276,29 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
          (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX
          (Family == 6 && Model == 0x2A) || // SandyBridge
          (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E*
-         (Family == 6 && Model == 0x3A))) {// IvyBridge
+         (Family == 6 && Model == 0x3A) || // IvyBridge
+         (Family == 6 && Model == 0x3E) || // IvyBridge EP
+         (Family == 6 && Model == 0x3C) || // Haswell
+         (Family == 6 && Model == 0x3F) || // ...
+         (Family == 6 && Model == 0x45) || // ...
+         (Family == 6 && Model == 0x46))) { // ...
       IsUAMemFast = true;
       ToggleFeature(X86::FeatureFastUAMem);
     }
 
-    // Set processor type. Currently only Atom is detected.
+    // Set processor type. Currently only Atom or Silvermont (SLM) is detected.
     if (Family == 6 &&
-        (Model == 28 || Model == 38 || Model == 39
-         || Model == 53 || Model == 54)) {
+        (Model == 28 || Model == 38 || Model == 39 ||
+         Model == 53 || Model == 54)) {
       X86ProcFamily = IntelAtom;
 
       UseLeaForSP = true;
       ToggleFeature(X86::FeatureLeaForSP);
     }
+    else if (Family == 6 &&
+        (Model == 55 || Model == 74 || Model == 77)) {
+      X86ProcFamily = IntelSLM;
+    }
 
     unsigned MaxExtLevel;
     X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
@@ -351,14 +360,38 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
         HasRTM = true;
         ToggleFeature(X86::FeatureRTM);
       }
-      if (IsIntel && ((EBX >> 19) & 0x1)) {
-        HasADX = true;
-        ToggleFeature(X86::FeatureADX);
+      if (IsIntel && ((EBX >> 16) & 0x1)) {
+        X86SSELevel = AVX512F;
+        ToggleFeature(X86::FeatureAVX512);
       }
       if (IsIntel && ((EBX >> 18) & 0x1)) {
         HasRDSEED = true;
         ToggleFeature(X86::FeatureRDSEED);
       }
+      if (IsIntel && ((EBX >> 19) & 0x1)) {
+        HasADX = true;
+        ToggleFeature(X86::FeatureADX);
+      }
+      if (IsIntel && ((EBX >> 26) & 0x1)) {
+        HasPFI = true;
+        ToggleFeature(X86::FeaturePFI);
+      }
+      if (IsIntel && ((EBX >> 27) & 0x1)) {
+        HasERI = true;
+        ToggleFeature(X86::FeatureERI);
+      }
+      if (IsIntel && ((EBX >> 28) & 0x1)) {
+        HasCDI = true;
+        ToggleFeature(X86::FeatureCDI);
+      }
+      if (IsIntel && ((EBX >> 29) & 0x1)) {
+        HasSHA = true;
+        ToggleFeature(X86::FeatureSHA);
+      }
+    }
+    if (IsAMD && ((ECX >> 21) & 0x1)) {
+      HasTBM = true;
+      ToggleFeature(X86::FeatureTBM);
     }
   }
 }
@@ -416,8 +449,8 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
 
     // Make sure 64-bit features are available in 64-bit mode.
     if (In64BitMode) {
-      HasX86_64 = true; ToggleFeature(X86::Feature64Bit);
-      HasCMov = true;   ToggleFeature(X86::FeatureCMOV);
+      if (!HasX86_64) { HasX86_64 = true; ToggleFeature(X86::Feature64Bit); }
+      if (!HasCMov)   { HasCMov   = true; ToggleFeature(X86::FeatureCMOV); }
 
       if (X86SSELevel < SSE2) {
         X86SSELevel = SSE2;
@@ -429,9 +462,9 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
 
   // CPUName may have been set by the CPU detection code. Make sure the
   // new MCSchedModel is used.
-  InitMCProcessorInfo(CPUName, FS);
+  InitCPUSchedModel(CPUName);
 
-  if (X86ProcFamily == IntelAtom)
+  if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM)
     PostRAScheduler = true;
 
   InstrItins = getInstrItineraryForCPU(CPUName);
@@ -468,6 +501,7 @@ void X86Subtarget::initializeEnvironment() {
   HasFMA = false;
   HasFMA4 = false;
   HasXOP = false;
+  HasTBM = false;
   HasMOVBE = false;
   HasRDRAND = false;
   HasF16C = false;
@@ -479,8 +513,9 @@ void X86Subtarget::initializeEnvironment() {
   HasHLE = false;
   HasERI = false;
   HasCDI = false;
-  HasPFI=false;
+  HasPFI = false;
   HasADX = false;
+  HasSHA = false;
   HasPRFCHW = false;
   HasRDSEED = false;
   IsBTMemSlow = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 8793238..dd8c081 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -42,7 +42,7 @@ enum Style {
 class X86Subtarget : public X86GenSubtargetInfo {
 protected:
   enum X86SSEEnum {
-    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512
+    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
   };
 
   enum X863DNowEnum {
@@ -50,7 +50,7 @@ protected:
   };
 
   enum X86ProcFamilyEnum {
-    Others, IntelAtom
+    Others, IntelAtom, IntelSLM
   };
 
   /// X86ProcFamily - X86 processor family: Intel Atom, and others
@@ -97,6 +97,9 @@ protected:
   /// HasXOP - Target has XOP instructions
   bool HasXOP;
 
+  /// HasTBM - Target has TBM instructions.
+  bool HasTBM;
+
   /// HasMOVBE - True if the processor has the MOVBE instruction.
   bool HasMOVBE;
 
@@ -127,6 +130,9 @@ protected:
   /// HasADX - Processor has ADX instructions.
   bool HasADX;
 
+  /// HasSHA - Processor has SHA instructions.
+  bool HasSHA;
+
   /// HasPRFCHW - Processor has PRFCHW instructions.
   bool HasPRFCHW;
 
@@ -258,7 +264,7 @@ public:
   bool hasSSE42() const { return X86SSELevel >= SSE42; }
   bool hasAVX() const { return X86SSELevel >= AVX; }
   bool hasAVX2() const { return X86SSELevel >= AVX2; }
-  bool hasAVX512() const { return X86SSELevel >= AVX512; }
+  bool hasAVX512() const { return X86SSELevel >= AVX512F; }
   bool hasFp256() const { return hasAVX(); }
   bool hasInt256() const { return hasAVX2(); }
   bool hasSSE4A() const { return HasSSE4A; }
@@ -271,6 +277,7 @@ public:
   // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
   bool hasFMA4() const { return HasFMA4 && !HasFMA; }
   bool hasXOP() const { return HasXOP; }
+  bool hasTBM() const { return HasTBM; }
   bool hasMOVBE() const { return HasMOVBE; }
   bool hasRDRAND() const { return HasRDRAND; }
   bool hasF16C() const { return HasF16C; }
@@ -281,6 +288,7 @@ public:
   bool hasRTM() const { return HasRTM; }
   bool hasHLE() const { return HasHLE; }
   bool hasADX() const { return HasADX; }
+  bool hasSHA() const { return HasSHA; }
   bool hasPRFCHW() const { return HasPRFCHW; }
   bool hasRDSEED() const { return HasRDSEED; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
@@ -311,10 +319,8 @@ public:
     return (TargetTriple.getEnvironment() == Triple::ELF ||
             TargetTriple.isOSBinFormatELF());
   }
-  bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
-  bool isTargetNaCl() const {
-    return TargetTriple.getOS() == Triple::NaCl;
-  }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
   bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
   bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
   bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; }
@@ -327,15 +333,14 @@ public:
   }
   bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); }
 
+  bool isOSWindows() const { return TargetTriple.isOSWindows(); }
+
   bool isTargetWin64() const {
-    // FIXME: x86_64-cygwin has not been released yet.
     return In64BitMode && TargetTriple.isOSWindows();
   }
 
   bool isTargetWin32() const {
-    // FIXME: Cygwin is included for isTargetWin64 -- should it be included
-    // here too?
-    return !In64BitMode && (isTargetMingw() || isTargetWindows());
+    return !In64BitMode && (isTargetCygMing() || isTargetWindows());
   }
 
   bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
@@ -380,11 +385,14 @@ public:
   /// memset with zero passed as the second argument. Otherwise it
   /// returns null.
   const char *getBZeroEntry() const;
-  
+
   /// This function returns true if the target has sincos() routine in its
   /// compiler runtime or math libraries.
   bool hasSinCos() const;
 
+  /// Enable the MachineScheduler pass for all X86 subtargets.
+  bool enableMachineScheduler() const LLVM_OVERRIDE { return true; }
+
   /// enablePostRAScheduler - run for Atom optimization.
   bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
                              TargetSubtargetInfo::AntiDepBreakMode& Mode,
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 49ebd1a..ddf580f 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -92,7 +92,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
   } else if (Subtarget.is64Bit()) {
     // PIC in 64 bit mode is always rip-rel.
     Subtarget.setPICStyle(PICStyles::RIPRel);
-  } else if (Subtarget.isTargetCygMing()) {
+  } else if (Subtarget.isTargetCOFF()) {
     Subtarget.setPICStyle(PICStyles::None);
   } else if (Subtarget.isTargetDarwin()) {
     if (getRelocationModel() == Reloc::PIC_)
@@ -114,14 +114,14 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
 // Command line options for x86
 //===----------------------------------------------------------------------===//
 static cl::opt<bool>
-UseVZeroUpper("x86-use-vzeroupper",
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
   cl::desc("Minimize AVX to SSE transition penalty"),
   cl::init(true));
 
 // Temporary option to control early if-conversion for x86 while adding machine
 // models.
 static cl::opt<bool>
-X86EarlyIfConv("x86-early-ifcvt",
+X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
 	       cl::desc("Enable early if-conversion on X86"));
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index a19c5a6..086cd4d 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -25,7 +25,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
   // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
   // is an indirect pc-relative reference.
   if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
-    const MCSymbol *Sym = Mang->getSymbol(GV);
+    const MCSymbol *Sym = getSymbol(*Mang, GV);
     const MCExpr *Res =
       MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
     const MCExpr *Four = MCConstantExpr::Create(4, getContext());
@@ -39,7 +39,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
 MCSymbol *X86_64MachoTargetObjectFile::
 getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
                         MachineModuleInfo *MMI) const {
-  return Mang->getSymbol(GV);
+  return getSymbol(*Mang, GV);
 }
 
 void
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 3bbddad..f88a666 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -101,6 +101,9 @@ public:
                                    unsigned AddressSpace) const;
 
   virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const;
+  
+  virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
+                                    bool IsPairwiseForm) const;
 
   /// @}
 };
@@ -127,8 +130,8 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
   // TODO: Currently the __builtin_popcount() implementation using SSE3
   //   instructions is inefficient. Once the problem is fixed, we should
-  //   call ST->hasSSE3() instead of ST->hasSSE4().
-  return ST->hasSSE41() ? PSK_FastHardware : PSK_Software;
+  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
+  return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
 }
 
 unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
@@ -174,7 +177,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
-  static const CostTblEntry<MVT> AVX2CostTable[] = {
+  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
     // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
     // customize them to detect the cases where shift amount is a scalar one.
     { ISD::SHL,     MVT::v4i32,    1 },
@@ -211,13 +214,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 
   // Look for AVX2 lowering tricks.
   if (ST->hasAVX2()) {
-    int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable),
-                                   ISD, LT.second);
+    int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * AVX2CostTable[Idx].Cost;
   }
 
-  static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = {
+  static const CostTblEntry<MVT::SimpleValueType>
+  SSE2UniformConstCostTable[] = {
     // We don't correctly identify costs of casts because they are marked as
     // custom.
     // Constant splats are cheaper for the following instructions.
@@ -238,15 +241,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
       ST->hasSSE2()) {
-    int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable,
-                                   array_lengthof(SSE2UniformConstCostTable),
-                                   ISD, LT.second);
+    int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * SSE2UniformConstCostTable[Idx].Cost;
   }
 
 
-  static const CostTblEntry<MVT> SSE2CostTable[] = {
+  static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
     // We don't correctly identify costs of casts because they are marked as
     // custom.
     // For some cases, where the shift amount is a scalar we would be able
@@ -287,13 +288,12 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   };
 
   if (ST->hasSSE2()) {
-    int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable),
-                                   ISD, LT.second);
+    int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * SSE2CostTable[Idx].Cost;
   }
 
-  static const CostTblEntry<MVT> AVX1CostTable[] = {
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = {
     // We don't have to scalarize unsupported ops. We can issue two half-sized
     // operations and we only need to extract the upper YMM half.
     // Two ops + 1 extract + 1 insert = 4.
@@ -312,21 +312,19 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 
   // Look for AVX1 lowering tricks.
   if (ST->hasAVX() && !ST->hasAVX2()) {
-    int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
-                                   ISD, LT.second);
+    int Idx = CostTableLookup(AVX1CostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * AVX1CostTable[Idx].Cost;
   }
 
   // Custom lowering of vectors.
-  static const CostTblEntry<MVT> CustomLowered[] = {
+  static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = {
     // A v2i64/v4i64 and multiply is custom lowered as a series of long
     // multiplies(3), shifts(4) and adds(2).
     { ISD::MUL,     MVT::v2i64,    9 },
     { ISD::MUL,     MVT::v4i64,    9 },
   };
-  int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
-                                 ISD, LT.second);
+  int Idx = CostTableLookup(CustomLowered, ISD, LT.second);
   if (Idx != -1)
     return LT.first * CustomLowered[Idx].Cost;
 
@@ -363,7 +361,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
   std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
   std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);
 
-  static const TypeConversionCostTblEntry<MVT> SSE2ConvTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  SSE2ConvTbl[] = {
     // These are somewhat magic numbers justified by looking at the output of
     // Intel's IACA, running some kernels and making sure when we take
     // legalization into account the throughput will be overestimated.
@@ -387,9 +386,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
   };
 
   if (ST->hasSSE2() && !ST->hasAVX()) {
-    int Idx = ConvertCostTableLookup<MVT>(SSE2ConvTbl,
-                                          array_lengthof(SSE2ConvTbl),
-                                          ISD, LTDest.second, LTSrc.second);
+    int Idx =
+        ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second);
     if (Idx != -1)
       return LTSrc.first * SSE2ConvTbl[Idx].Cost;
   }
@@ -401,13 +399,17 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
   if (!SrcTy.isSimple() || !DstTy.isSimple())
     return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
 
-  static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  AVXConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
     { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 1 },
     { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32, 1 },
+    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i16, 2 },
 
     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
@@ -446,9 +448,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
   };
 
   if (ST->hasAVX()) {
-    int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl,
-                                 array_lengthof(AVXConversionTbl),
-                                 ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
+                                     SrcTy.getSimpleVT());
     if (Idx != -1)
       return AVXConversionTbl[Idx].Cost;
   }
@@ -466,7 +467,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
-  static const CostTblEntry<MVT> SSE42CostTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = {
     { ISD::SETCC,   MVT::v2f64,   1 },
     { ISD::SETCC,   MVT::v4f32,   1 },
     { ISD::SETCC,   MVT::v2i64,   1 },
@@ -475,7 +476,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     { ISD::SETCC,   MVT::v16i8,   1 },
   };
 
-  static const CostTblEntry<MVT> AVX1CostTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = {
     { ISD::SETCC,   MVT::v4f64,   1 },
     { ISD::SETCC,   MVT::v8f32,   1 },
     // AVX1 does not support 8-wide integer compare.
@@ -485,7 +486,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     { ISD::SETCC,   MVT::v32i8,   4 },
   };
 
-  static const CostTblEntry<MVT> AVX2CostTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = {
     { ISD::SETCC,   MVT::v4i64,   1 },
     { ISD::SETCC,   MVT::v8i32,   1 },
     { ISD::SETCC,   MVT::v16i16,  1 },
@@ -493,22 +494,19 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   };
 
   if (ST->hasAVX2()) {
-    int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl),
-                                   ISD, MTy);
+    int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
     if (Idx != -1)
       return LT.first * AVX2CostTbl[Idx].Cost;
   }
 
   if (ST->hasAVX()) {
-    int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl),
-                                   ISD, MTy);
+    int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy);
     if (Idx != -1)
       return LT.first * AVX1CostTbl[Idx].Cost;
   }
 
   if (ST->hasSSE42()) {
-    int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl),
-                                   ISD, MTy);
+    int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy);
     if (Idx != -1)
       return LT.first * SSE42CostTbl[Idx].Cost;
   }
@@ -613,3 +611,84 @@ unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
 
   return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
 }
+
+unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
+                                  bool IsPairwise) const {
+  
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+  
+  MVT MTy = LT.second;
+  
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+ 
+  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 
+  // and make it as the cost. 
+ 
+  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
+    { ISD::FADD,  MVT::v2f64,   2 },
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
+    { ISD::ADD,   MVT::v8i16,   5 },
+  };
+ 
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::FADD,  MVT::v4f64,   5 },
+    { ISD::FADD,  MVT::v8f32,   7 },
+    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
+    { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
+    { ISD::ADD,   MVT::v8i16,   5 },
+    { ISD::ADD,   MVT::v8i32,   5 },
+  };
+
+  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = {
+    { ISD::FADD,  MVT::v2f64,   2 },
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
+    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
+  };
+  
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
+    { ISD::FADD,  MVT::v4f32,   3 },
+    { ISD::FADD,  MVT::v4f64,   3 },
+    { ISD::FADD,  MVT::v8f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
+    { ISD::ADD,   MVT::v4i64,   3 },
+    { ISD::ADD,   MVT::v8i16,   4 },
+    { ISD::ADD,   MVT::v8i32,   5 },
+  };
+  
+  if (IsPairwise) {
+    if (ST->hasAVX()) {
+      int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * AVX1CostTblPairWise[Idx].Cost;
+    }
+  
+    if (ST->hasSSE42()) {
+      int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * SSE42CostTblPairWise[Idx].Cost;
+    }
+  } else {
+    if (ST->hasAVX()) {
+      int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
+    }
+    
+    if (ST->hasSSE42()) {
+      int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * SSE42CostTblNoPairWise[Idx].Cost;
+    }
+  }
+
+  return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
+}
+
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 477f75a..66ae9c2 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -122,11 +122,11 @@ static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
 }
 
 static bool clobbersAllYmmRegs(const MachineOperand &MO) {
-  for (unsigned reg = X86::YMM0; reg < X86::YMM31; ++reg) {
+  for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) {
     if (!MO.clobbersPhysReg(reg))
       return false;
   }
-  for (unsigned reg = X86::ZMM0; reg < X86::ZMM31; ++reg) {
+  for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) {
     if (!MO.clobbersPhysReg(reg))
       return false;
   }
@@ -148,6 +148,25 @@ static bool hasYmmReg(MachineInstr *MI) {
   return false;
 }
 
+/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
+/// instruction.
+static bool clobbersAnyYmmReg(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isRegMask())
+      continue;
+    for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) {
+      if (MO.clobbersPhysReg(reg))
+        return true;
+    }
+    for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) {
+      if (MO.clobbersPhysReg(reg))
+        return true;
+    }
+  }
+  return false;
+}
+
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
 /// vzero upper instructions before function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
@@ -231,8 +250,9 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
   bool BBHasCall = false;
 
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
-    MachineInstr *MI = I;
     DebugLoc dl = I->getDebugLoc();
+    MachineInstr *MI = I;
+
     bool isControlFlow = MI->isCall() || MI->isReturn();
 
     // Shortcut: don't need to check regular instructions in dirty state.
@@ -251,6 +271,14 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
     if (!isControlFlow)
       continue;
 
+    // If the call won't clobber any YMM register, skip it as well. It usually
+    // happens on helper function calls (such as '_chkstk', '_ftol2') where
+    // standard calling convention is not used (RegMask is not used to mark
+    // register clobbered and register usage (def/imp-def/use) is well-dfined
+    // and explicitly specified.
+    if (MI->isCall() && !clobbersAnyYmmReg(MI))
+      continue;
+
     BBHasCall = true;
 
     // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index 85d2a1d..3fa3b34 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -22,6 +22,7 @@ add_llvm_target(XCoreCodeGen
   XCoreSubtarget.cpp
   XCoreTargetMachine.cpp
   XCoreTargetObjectFile.cpp
+  XCoreTargetTransformInfo.cpp
   XCoreSelectionDAGInfo.cpp
   )
 
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index 6f44551..3d1c474 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -23,10 +23,14 @@ XCoreMCAsmInfo::XCoreMCAsmInfo(StringRef TT) {
     
   PrivateGlobalPrefix = ".L";
   AscizDirective = ".asciiz";
-  WeakDefDirective = "\t.weak\t";
-  WeakRefDirective = "\t.weak\t";
+
+  HiddenVisibilityAttr = MCSA_Invalid;
+  HiddenDeclarationVisibilityAttr = MCSA_Invalid;
+  ProtectedVisibilityAttr = MCSA_Invalid;
 
   // Debug
   HasLEB128 = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+  DwarfRegNumForCFI = true;
 }
 
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
index b5a9660..e53c96b 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -14,13 +14,13 @@
 #ifndef XCORETARGETASMINFO_H
 #define XCORETARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
   class StringRef;
   class Target;
 
-  class XCoreMCAsmInfo : public MCAsmInfo {
+  class XCoreMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
     explicit XCoreMCAsmInfo(StringRef TT);
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
index 2f375fc..73c310b 100644
--- a/lib/Target/XCore/XCore.h
+++ b/lib/Target/XCore/XCore.h
@@ -31,6 +31,8 @@ namespace llvm {
                                    CodeGenOpt::Level OptLevel);
   ModulePass *createXCoreLowerThreadLocalPass();
 
+  ImmutablePass *createXCoreTargetTransformInfoPass(const XCoreTargetMachine *TM);
+
 } // end namespace llvm;
 
 #endif
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 35ba299..c03dfe6 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -36,6 +36,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
@@ -88,14 +89,12 @@ void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
     MCSymbol *SymGlob = OutContext.GetOrCreateSymbol(
                           Twine(Sym->getName() + StringRef(".globound")));
     OutStreamer.EmitSymbolAttribute(SymGlob, MCSA_Global);
-
-    OutStreamer.EmitRawText("\t.set\t" + Twine(Sym->getName()) +
-                            ".globound," + Twine(ATy->getNumElements()));
-
+    OutStreamer.EmitAssignment(SymGlob,
+                               MCConstantExpr::Create(ATy->getNumElements(),
+                                                      OutContext));
     if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage()) {
       // TODO Use COMDAT groups for LinkOnceLinkage
-      OutStreamer.EmitRawText(MAI->getWeakDefDirective() +Twine(Sym->getName())+
-                              ".globound");
+      OutStreamer.EmitSymbolAttribute(SymGlob, MCSA_Weak);
     }
   }
 }
@@ -110,7 +109,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
   OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GV, Mang,TM));
 
   
-  MCSymbol *GVSym = Mang->getSymbol(GV);
+  MCSymbol *GVSym = getSymbol(GV);
   const Constant *C = GV->getInitializer();
   unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType());
   
@@ -217,7 +216,7 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
     O << *MO.getMBB()->getSymbol();
     break;
   case MachineOperand::MO_GlobalAddress:
-    O << *Mang->getSymbol(MO.getGlobal());
+    O << *getSymbol(MO.getGlobal());
     break;
   case MachineOperand::MO_ExternalSymbol:
     O << MO.getSymbolName();
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index b57cf9d..c34b35c 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -30,10 +30,6 @@
 using namespace llvm;
 
 // helper functions. FIXME: Eliminate.
-static inline bool isImmUs(unsigned val) {
-  return val <= 11;
-}
-
 static inline bool isImmU6(unsigned val) {
   return val < (1 << 6);
 }
@@ -92,11 +88,16 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo *MMI = &MF.getMMI();
+  const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
   const XCoreInstrInfo &TII =
     *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
   XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
 
+  if (MFI->getMaxAlignment() > getStackAlignment())
+    report_fatal_error("emitPrologue unsupported alignment: "
+                       + Twine(MFI->getMaxAlignment()));
+
   bool FP = hasFP(MF);
   const AttributeSet &PAL = MF.getFunction()->getAttributes();
 
@@ -119,21 +120,28 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
   bool saveLR = XFI->getUsesLR();
   // Do we need to allocate space on the stack?
   if (FrameSize) {
+    bool LRSavedOnEntry = false;
     int Opcode;
     if (saveLR && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0)) {
       Opcode = (isU6) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
       MBB.addLiveIn(XCore::LR);
       saveLR = false;
+      LRSavedOnEntry = true;
     } else {
       Opcode = (isU6) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
     }
     BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
 
     if (emitFrameMoves) {
-
       // Show update of SP.
       MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
       BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
+      MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(FrameLabel,
+                                                             -FrameSize*4));
+      if (LRSavedOnEntry) {
+        unsigned Reg = MRI->getDwarfRegNum(XCore::LR, true);
+        MMI->addFrameInst(MCCFIInstruction::createOffset(FrameLabel, Reg, 0));
+      }
     }
   }
   if (saveLR) {
@@ -144,6 +152,9 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
     if (emitFrameMoves) {
       MCSymbol *SaveLRLabel = MMI->getContext().CreateTempSymbol();
       BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLRLabel);
+      unsigned Reg = MRI->getDwarfRegNum(XCore::LR, true);
+      MMI->addFrameInst(MCCFIInstruction::createOffset(SaveLRLabel, Reg,
+                                                       LRSpillOffset));
     }
   }
 
@@ -156,15 +167,34 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
     if (emitFrameMoves) {
       MCSymbol *SaveR10Label = MMI->getContext().CreateTempSymbol();
       BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveR10Label);
+      unsigned Reg = MRI->getDwarfRegNum(XCore::R10, true);
+      MMI->addFrameInst(MCCFIInstruction::createOffset(SaveR10Label, Reg,
+                                                       FPSpillOffset));
     }
     // Set the FP from the SP.
     unsigned FramePtr = XCore::R10;
-    BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr)
-      .addImm(0);
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr).addImm(0);
     if (emitFrameMoves) {
       // Show FP is now valid.
       MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
       BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
+      unsigned Reg = MRI->getDwarfRegNum(FramePtr, true);
+      MMI->addFrameInst(MCCFIInstruction::createDefCfaRegister(FrameLabel,
+                                                               Reg));
+    }
+  }
+
+  if (emitFrameMoves) {
+    // Frame moves for callee saved.
+    std::vector<std::pair<MCSymbol*, CalleeSavedInfo> >&SpillLabels =
+        XFI->getSpillLabels();
+    for (unsigned I = 0, E = SpillLabels.size(); I != E; ++I) {
+      MCSymbol *SpillLabel = SpillLabels[I].first;
+      CalleeSavedInfo &CSI = SpillLabels[I].second;
+      int Offset = MFI->getObjectOffset(CSI.getFrameIdx());
+      unsigned Reg = MRI->getDwarfRegNum(CSI.getReg(), true);
+      MMI->addFrameInst(MCCFIInstruction::createOffset(SpillLabel, Reg,
+                                                       Offset));
     }
   }
 }
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 6fc7eef5..89ad27d 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -59,6 +59,7 @@ getTargetNodeName(unsigned Opcode) const
     case XCoreISD::CRC8              : return "XCoreISD::CRC8";
     case XCoreISD::BR_JT             : return "XCoreISD::BR_JT";
     case XCoreISD::BR_JT32           : return "XCoreISD::BR_JT32";
+    case XCoreISD::MEMBARRIER        : return "XCoreISD::MEMBARRIER";
     default                          : return NULL;
   }
 }
@@ -79,7 +80,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
 
   setStackPointerRegisterToSaveRestore(XCore::SP);
 
-  setSchedulingPreference(Sched::RegPressure);
+  setSchedulingPreference(Sched::Source);
 
   // Use i32 for setcc operations results (slt, sgt, ...).
   setBooleanContents(ZeroOrOneBooleanContent);
@@ -148,6 +149,13 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 
+  // Exception handling
+  setExceptionPointerRegister(XCore::R0);
+  setExceptionSelectorRegister(XCore::R1);
+
+  // Atomic operations
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
   // TRAMPOLINE is custom lowered.
   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
@@ -166,6 +174,24 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
   setMinFunctionAlignment(1);
 }
 
+bool XCoreTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  EVT VT1 = Val.getValueType();
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i8:
+    return true;
+  }
+
+  return false;
+}
+
 SDValue XCoreTargetLowering::
 LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode())
@@ -188,6 +214,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, DAG);
   default:
     llvm_unreachable("unimplemented operand");
   }
@@ -259,11 +286,6 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
   return GA;
 }
 
-static inline SDValue BuildGetId(SelectionDAG &DAG, SDLoc dl) {
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
-                     DAG.getConstant(Intrinsic::xcore_getid, MVT::i32));
-}
-
 SDValue XCoreTargetLowering::
 LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
 {
@@ -834,6 +856,12 @@ LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
+SDValue XCoreTargetLowering::
+LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(XCoreISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
+}
+
 //===----------------------------------------------------------------------===//
 //                      Calling Convention Implementation
 //===----------------------------------------------------------------------===//
@@ -1200,7 +1228,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
        ArgDI != ArgDE; ++ArgDI) {
     if (ArgDI->Flags.isByVal() && ArgDI->Flags.getByValSize()) {
       unsigned Size = ArgDI->Flags.getByValSize();
-      unsigned Align = ArgDI->Flags.getByValAlign();
+      unsigned Align = std::max(StackSlotSize, ArgDI->Flags.getByValAlign());
       // Create a new object on the stack and copy the pointee into it.
       int FI = MFI->CreateStackObject(Size, Align, false, false);
       SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 7761b7c..bc08497 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -70,7 +70,10 @@ namespace llvm {
       BR_JT,
 
       // Jumptable branch using long branches for each entry.
-      BR_JT32
+      BR_JT32,
+
+      // Memory barrier.
+      MEMBARRIER
     };
   }
 
@@ -83,6 +86,10 @@ namespace llvm {
 
     explicit XCoreTargetLowering(XCoreTargetMachine &TM);
 
+    using TargetLowering::isZExtFree;
+    virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+
+
     virtual unsigned getJumpTableEncoding() const;
     virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
 
@@ -154,6 +161,7 @@ namespace llvm {
     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
 
     // Inline asm support
     std::pair<unsigned, const TargetRegisterClass*>
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index d6b8c2d..33c7f31 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -22,7 +22,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "XCoreGenInstrInfo.inc"
 
 namespace llvm {
@@ -39,6 +39,10 @@ namespace XCore {
 
 using namespace llvm;
 
+
+// Pin the vtable to this file.
+void XCoreInstrInfo::anchor() {}
+
 XCoreInstrInfo::XCoreInstrInfo()
   : XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
     RI() {
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index 51d66a1..4429b07 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -24,6 +24,7 @@ namespace llvm {
 
 class XCoreInstrInfo : public XCoreGenInstrInfo {
   const XCoreRegisterInfo RI;
+  virtual void anchor();
 public:
   XCoreInstrInfo();
 
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 81fa84d..934a707 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -70,6 +70,11 @@ def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
 def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_XCoreCallSeqEnd,
                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 
+def SDT_XCoreMEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def XCoreMemBarrier : SDNode<"XCoreISD::MEMBARRIER", SDT_XCoreMEMBARRIER,
+                             [SDNPHasChain]>;
+
 //===----------------------------------------------------------------------===//
 // Instruction Pattern Stuff
 //===----------------------------------------------------------------------===//
@@ -343,6 +348,10 @@ let usesCustomInserter = 1 in {
                                  (select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>;
 }
 
+let hasSideEffects = 1 in
+def Int_MemBarrier : PseudoInstXCore<(outs), (ins), "#MEMBARRIER",
+                                     [(XCoreMemBarrier)]>;
+
 //===----------------------------------------------------------------------===//
 // Instructions
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 2e328b4..afce753 100644
--- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -22,6 +22,9 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/NoFolder.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 #define DEBUG_TYPE "xcore-lower-thread-local"
 
@@ -71,13 +74,104 @@ createLoweredInitializer(ArrayType *NewType, Constant *OriginalInitializer) {
   return ConstantArray::get(NewType, Elements);
 }
 
-static bool hasNonInstructionUse(GlobalVariable *GV) {
-  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;
-       ++UI)
-    if (!isa<Instruction>(*UI))
-      return true;
+static Instruction *
+createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
+  IRBuilder<true,NoFolder> Builder(Instr);
+  unsigned OpCode = CE->getOpcode();
+  switch (OpCode) {
+    case Instruction::GetElementPtr: {
+      SmallVector<Value *,4> CEOpVec(CE->op_begin(), CE->op_end());
+      ArrayRef<Value *> CEOps(CEOpVec);
+      return dyn_cast<Instruction>(Builder.CreateInBoundsGEP(CEOps[0],
+                                                             CEOps.slice(1)));
+    }
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+      return dyn_cast<Instruction>(
+                  Builder.CreateBinOp((Instruction::BinaryOps)OpCode,
+                                      CE->getOperand(0), CE->getOperand(1),
+                                      CE->getName()));
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      return dyn_cast<Instruction>(
+                  Builder.CreateCast((Instruction::CastOps)OpCode,
+                                     CE->getOperand(0), CE->getType(),
+                                     CE->getName()));
+    default:
+      llvm_unreachable("Unhandled constant expression!\n");
+  }
+}
+
+static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
+  do {
+    SmallVector<WeakVH,8> WUsers;
+    for (Value::use_iterator I = CE->use_begin(), E = CE->use_end();
+         I != E; ++I)
+      WUsers.push_back(WeakVH(*I));
+    std::sort(WUsers.begin(), WUsers.end());
+    WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
+    while (!WUsers.empty())
+      if (WeakVH WU = WUsers.pop_back_val()) {
+        if (PHINode *PN = dyn_cast<PHINode>(WU)) {
+          for (int I = 0, E = PN->getNumIncomingValues(); I < E; ++I)
+            if (PN->getIncomingValue(I) == CE) {
+              BasicBlock *PredBB = PN->getIncomingBlock(I);
+              if (PredBB->getTerminator()->getNumSuccessors() > 1)
+                PredBB = SplitEdge(PredBB, PN->getParent(), P);
+              Instruction *InsertPos = PredBB->getTerminator();
+              Instruction *NewInst = createReplacementInstr(CE, InsertPos);
+              PN->setOperand(I, NewInst);
+            }
+        } else if (Instruction *Instr = dyn_cast<Instruction>(WU)) {
+          Instruction *NewInst = createReplacementInstr(CE, Instr);
+          Instr->replaceUsesOfWith(CE, NewInst);
+        } else {
+          ConstantExpr *CExpr = dyn_cast<ConstantExpr>(WU);
+          if (!CExpr || !replaceConstantExprOp(CExpr, P))
+            return false;
+        }
+      }
+  } while (CE->hasNUsesOrMore(1)); // We need to check becasue a recursive
+  // sibbling may have used 'CE' when createReplacementInstr was called.
+  CE->destroyConstant();
+  return true;
+}
 
-  return false;
+static bool rewriteNonInstructionUses(GlobalVariable *GV, Pass *P) {
+  SmallVector<WeakVH,8> WUsers;
+  for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I)
+    if (!isa<Instruction>(*I))
+      WUsers.push_back(WeakVH(*I));
+  while (!WUsers.empty())
+    if (WeakVH WU = WUsers.pop_back_val()) {
+      ConstantExpr *CE = dyn_cast<ConstantExpr>(WU);
+      if (!CE || !replaceConstantExprOp(CE, P))
+        return false;
+    }
+  return true;
 }
 
 static bool isZeroLengthArray(Type *Ty) {
@@ -92,14 +186,16 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
     return false;
 
   // Skip globals that we can't lower and leave it for the backend to error.
-  if (hasNonInstructionUse(GV) ||
+  if (!rewriteNonInstructionUses(GV, this) ||
       !GV->getType()->isSized() || isZeroLengthArray(GV->getType()))
     return false;
 
   // Create replacement global.
   ArrayType *NewType = createLoweredType(GV->getType()->getElementType());
-  Constant *NewInitializer = createLoweredInitializer(NewType,
-                                                      GV->getInitializer());
+  Constant *NewInitializer = 0;
+  if (GV->hasInitializer())
+    NewInitializer = createLoweredInitializer(NewType,
+                                              GV->getInitializer());
   GlobalVariable *NewGV =
     new GlobalVariable(*M, NewType, GV->isConstant(), GV->getLinkage(),
                        NewInitializer, "", 0, GlobalVariable::NotThreadLocal,
diff --git a/lib/Target/XCore/XCoreMCInstLower.cpp b/lib/Target/XCore/XCoreMCInstLower.cpp
index f96eda9..def2673 100644
--- a/lib/Target/XCore/XCoreMCInstLower.cpp
+++ b/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -43,7 +43,7 @@ MCOperand XCoreMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
       Symbol = MO.getMBB()->getSymbol();
       break;
     case MachineOperand::MO_GlobalAddress:
-      Symbol = Mang->getSymbol(MO.getGlobal());
+      Symbol = Printer.getSymbol(MO.getGlobal());
       Offset += MO.getOffset();
       break;
     case MachineOperand::MO_BlockAddress:
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 3ef1520..9ae0b86 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -70,3 +70,11 @@ bool XCorePassConfig::addInstSelector() {
 extern "C" void LLVMInitializeXCoreTarget() {
   RegisterTargetMachine<XCoreTargetMachine> X(TheXCoreTarget);
 }
+
+void XCoreTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our XCore pass. This
+  // allows the XCore pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createXCoreTargetTransformInfoPass(this));
+}
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index eb9a1aa..a19a677 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -57,6 +57,8 @@ public:
 
   // Pass Pipeline Configuration
   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+  virtual void addAnalysisPasses(PassManagerBase &PM);
 };
 
 } // end namespace llvm
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.cpp b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
new file mode 100644
index 0000000..cc165f7
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
@@ -0,0 +1,83 @@
+//===-- XCoreTargetTransformInfo.cpp - XCore specific TTI pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// XCore target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "xcoretti"
+#include "XCore.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't havve a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeXCoreTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class XCoreTTI : public ImmutablePass, public TargetTransformInfo {
+public:
+  XCoreTTI() : ImmutablePass(ID) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  XCoreTTI(const XCoreTargetMachine *TM)
+      : ImmutablePass(ID) {
+    initializeXCoreTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() {
+    pushTTIStack(this);
+  }
+
+  virtual void finalizePass() {
+    popTTIStack();
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  static char ID;
+
+  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  unsigned getNumberOfRegisters(bool Vector) const {
+    if (Vector) {
+       return 0;
+    }
+    return 12;
+  }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(XCoreTTI, TargetTransformInfo, "xcoretti",
+                   "XCore Target Transform Info", true, true, false)
+char XCoreTTI::ID = 0;
+
+
+ImmutablePass *
+llvm::createXCoreTargetTransformInfoPass(const XCoreTargetMachine *TM) {
+  return new XCoreTTI(TM);
+}
diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp
index 9f2343b..9251783 100644
--- a/lib/Transforms/Hello/Hello.cpp
+++ b/lib/Transforms/Hello/Hello.cpp
@@ -52,7 +52,7 @@ namespace {
       return false;
     }
 
-    // We don't modify the program, so we preserve all analyses
+    // We don't modify the program, so we preserve all analyses.
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.setPreservesAll();
     }
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index c42d506..df08091 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -88,7 +88,7 @@ char ArgPromotion::ID = 0;
 INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
                 "Promote 'by reference' arguments to scalars", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
                 "Promote 'by reference' arguments to scalars", false, false)
 
@@ -504,7 +504,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   // OriginalLoads - Keep track of a representative load instruction from the
   // original function so that we can tell the alias analysis implementation
   // what the new GEP/Load instructions we are inserting look like.
-  std::map<IndicesVector, LoadInst*> OriginalLoads;
+  // We need to keep the original loads for each argument and the elements
+  // of the argument that are accessed.
+  std::map<std::pair<Argument*, IndicesVector>, LoadInst*> OriginalLoads;
 
   // Attribute - Keep track of the parameter attributes for the arguments
   // that we are *not* promoting. For the ones that we do promote, the parameter
@@ -569,7 +571,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
         else
           // Take any load, we will use it only to update Alias Analysis
           OrigLoad = cast<LoadInst>(User->use_back());
-        OriginalLoads[Indices] = OrigLoad;
+        OriginalLoads[std::make_pair(I, Indices)] = OrigLoad;
       }
 
       // Add a parameter to the function for each element passed in.
@@ -676,7 +678,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
         for (ScalarizeTable::iterator SI = ArgIndices.begin(),
                E = ArgIndices.end(); SI != E; ++SI) {
           Value *V = *AI;
-          LoadInst *OrigLoad = OriginalLoads[*SI];
+          LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, *SI)];
           if (!SI->empty()) {
             Ops.reserve(SI->size());
             Type *ElTy = V->getType();
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index a7bf188..d94c0f4 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -93,9 +93,12 @@ bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const {
 }
 
 unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
+  unsigned Align = GV->getAlignment();
+  if (Align)
+    return Align;
   if (TD)
     return TD->getPreferredAlignment(GV);
-  return GV->getAlignment();
+  return 0;
 }
 
 bool ConstantMerge::runOnModule(Module &M) {
@@ -210,9 +213,9 @@ bool ConstantMerge::runOnModule(Module &M) {
       // Bump the alignment if necessary.
       if (Replacements[i].first->getAlignment() ||
           Replacements[i].second->getAlignment()) {
-        Replacements[i].second->setAlignment(std::max(
-            Replacements[i].first->getAlignment(),
-            Replacements[i].second->getAlignment()));
+        Replacements[i].second->setAlignment(
+            std::max(getAlignment(Replacements[i].first),
+                     getAlignment(Replacements[i].second)));
       }
 
       // Eliminate any uses of the dead global.
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 6ee6162..911c14e 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -357,6 +357,19 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
   if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
     return false;
 
+  // If a function seen at compile time is not necessarily the one linked to
+  // the binary being built, it is illegal to change the actual arguments
+  // passed to it. These functions can be captured by isWeakForLinker().
+  // *NOTE* that mayBeOverridden() is insufficient for this purpose as it
+  // doesn't include linkage types like AvailableExternallyLinkage and
+  // LinkOnceODRLinkage. Take link_odr* as an example, it indicates a set of
+  // *EQUIVALENT* globals that can be merged at link-time. However, the
+  // semantic of *EQUIVALENT*-functions includes parameters. Changing
+  // parameters breaks this assumption.
+  //
+  if (Fn.isWeakForLinker())
+    return false;
+
   if (Fn.use_empty())
     return false;
 
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index fa3d72d..50fb3e6 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -21,6 +21,38 @@
 #include <algorithm>
 using namespace llvm;
 
+/// Make sure GV is visible from both modules. Delete is true if it is
+/// being deleted from this module.
+/// This also makes sure GV cannot be dropped so that references from
+/// the split module remain valid.
+static void makeVisible(GlobalValue &GV, bool Delete) {
+  bool Local = GV.hasLocalLinkage();
+  if (Local)
+    GV.setVisibility(GlobalValue::HiddenVisibility);
+
+  if (Local || Delete) {
+    GV.setLinkage(GlobalValue::ExternalLinkage);
+    return;
+  }
+
+  if (!GV.hasLinkOnceLinkage()) {
+    assert(!GV.isDiscardableIfUnused());
+    return;
+  }
+
+  // Map linkonce* to weak* so that llvm doesn't drop this GV.
+  switch(GV.getLinkage()) {
+  default:
+    llvm_unreachable("Unexpected linkage");
+  case GlobalValue::LinkOnceAnyLinkage:
+    GV.setLinkage(GlobalValue::WeakAnyLinkage);
+    return;
+  case GlobalValue::LinkOnceODRLinkage:
+    GV.setLinkage(GlobalValue::WeakODRLinkage);
+    return;
+  }
+}
+
 namespace {
   /// @brief A pass to extract specific functions and their dependencies.
   class GVExtractorPass : public ModulePass {
@@ -60,12 +92,7 @@ namespace {
             continue;
         }
 
-        bool Local = I->isDiscardableIfUnused();
-        if (Local)
-          I->setVisibility(GlobalValue::HiddenVisibility);
-
-        if (Local || Delete)
-          I->setLinkage(GlobalValue::ExternalLinkage);
+	makeVisible(*I, Delete);
 
         if (Delete)
           I->setInitializer(0);
@@ -80,12 +107,7 @@ namespace {
             continue;
         }
 
-        bool Local = I->isDiscardableIfUnused();
-        if (Local)
-          I->setVisibility(GlobalValue::HiddenVisibility);
-
-        if (Local || Delete)
-          I->setLinkage(GlobalValue::ExternalLinkage);
+	makeVisible(*I, Delete);
 
         if (Delete)
           I->deleteBody();
@@ -97,12 +119,10 @@ namespace {
         Module::alias_iterator CurI = I;
         ++I;
 
-        if (CurI->isDiscardableIfUnused()) {
-          CurI->setVisibility(GlobalValue::HiddenVisibility);
-          CurI->setLinkage(GlobalValue::ExternalLinkage);
-        }
+	bool Delete = deleteStuff == (bool)Named.count(CurI);
+	makeVisible(*CurI, Delete);
 
-        if (deleteStuff == (bool)Named.count(CurI)) {
+        if (Delete) {
           Type *Ty =  CurI->getType()->getElementType();
 
           CurI->removeFromParent();
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 1366883..60e5f06 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -136,7 +136,8 @@ namespace {
 char FunctionAttrs::ID = 0;
 INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
@@ -366,6 +367,7 @@ namespace {
         }
       }
       assert(Found && "Capturing call-site captured nothing?");
+      (void)Found;
       return false;
     }
 
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 201f320..901295d 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -179,6 +179,9 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
     // any globals used will be marked as needed.
     Function *F = cast<Function>(G);
 
+    if (F->hasPrefixData())
+      MarkUsedGlobalsAsNeeded(F->getPrefixData());
+
     for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
         for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U)
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 64cd515..2ea89a1 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -37,7 +37,9 @@
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
 #include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <algorithm>
 using namespace llvm;
@@ -60,7 +62,6 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
 STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
 
 namespace {
-  struct GlobalStatus;
   struct GlobalOpt : public ModulePass {
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addRequired<TargetLibraryInfo>();
@@ -80,7 +81,6 @@ namespace {
     bool OptimizeGlobalCtorsList(GlobalVariable *&GCL);
     bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI);
     bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI,
-                               const SmallPtrSet<const PHINode*, 16> &PHIUsers,
                                const GlobalStatus &GS);
     bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
 
@@ -98,209 +98,6 @@ INITIALIZE_PASS_END(GlobalOpt, "globalopt",
 
 ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
 
-namespace {
-
-/// GlobalStatus - As we analyze each global, keep track of some information
-/// about it.  If we find out that the address of the global is taken, none of
-/// this info will be accurate.
-struct GlobalStatus {
-  /// isCompared - True if the global's address is used in a comparison.
-  bool isCompared;
-
-  /// isLoaded - True if the global is ever loaded.  If the global isn't ever
-  /// loaded it can be deleted.
-  bool isLoaded;
-
-  /// StoredType - Keep track of what stores to the global look like.
-  ///
-  enum StoredType {
-    /// NotStored - There is no store to this global.  It can thus be marked
-    /// constant.
-    NotStored,
-
-    /// isInitializerStored - This global is stored to, but the only thing
-    /// stored is the constant it was initialized with.  This is only tracked
-    /// for scalar globals.
-    isInitializerStored,
-
-    /// isStoredOnce - This global is stored to, but only its initializer and
-    /// one other value is ever stored to it.  If this global isStoredOnce, we
-    /// track the value stored to it in StoredOnceValue below.  This is only
-    /// tracked for scalar globals.
-    isStoredOnce,
-
-    /// isStored - This global is stored to by multiple values or something else
-    /// that we cannot track.
-    isStored
-  } StoredType;
-
-  /// StoredOnceValue - If only one value (besides the initializer constant) is
-  /// ever stored to this global, keep track of what value it is.
-  Value *StoredOnceValue;
-
-  /// AccessingFunction/HasMultipleAccessingFunctions - These start out
-  /// null/false.  When the first accessing function is noticed, it is recorded.
-  /// When a second different accessing function is noticed,
-  /// HasMultipleAccessingFunctions is set to true.
-  const Function *AccessingFunction;
-  bool HasMultipleAccessingFunctions;
-
-  /// HasNonInstructionUser - Set to true if this global has a user that is not
-  /// an instruction (e.g. a constant expr or GV initializer).
-  bool HasNonInstructionUser;
-
-  /// AtomicOrdering - Set to the strongest atomic ordering requirement.
-  AtomicOrdering Ordering;
-
-  GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored),
-                   StoredOnceValue(0), AccessingFunction(0),
-                   HasMultipleAccessingFunctions(false),
-                   HasNonInstructionUser(false), Ordering(NotAtomic) {}
-};
-
-}
-
-/// StrongerOrdering - Return the stronger of the two ordering. If the two
-/// orderings are acquire and release, then return AcquireRelease.
-///
-static AtomicOrdering StrongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
-  if (X == Acquire && Y == Release) return AcquireRelease;
-  if (Y == Acquire && X == Release) return AcquireRelease;
-  return (AtomicOrdering)std::max(X, Y);
-}
-
-/// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used
-/// by constants itself.  Note that constants cannot be cyclic, so this test is
-/// pretty easy to implement recursively.
-///
-static bool SafeToDestroyConstant(const Constant *C) {
-  if (isa<GlobalValue>(C)) return false;
-
-  for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E;
-       ++UI)
-    if (const Constant *CU = dyn_cast<Constant>(*UI)) {
-      if (!SafeToDestroyConstant(CU)) return false;
-    } else
-      return false;
-  return true;
-}
-
-
-/// AnalyzeGlobal - Look at all uses of the global and fill in the GlobalStatus
-/// structure.  If the global has its address taken, return true to indicate we
-/// can't do anything with it.
-///
-static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
-                          SmallPtrSet<const PHINode*, 16> &PHIUsers) {
-  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
-       ++UI) {
-    const User *U = *UI;
-    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
-      GS.HasNonInstructionUser = true;
-
-      // If the result of the constantexpr isn't pointer type, then we won't
-      // know to expect it in various places.  Just reject early.
-      if (!isa<PointerType>(CE->getType())) return true;
-
-      if (AnalyzeGlobal(CE, GS, PHIUsers)) return true;
-    } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
-      if (!GS.HasMultipleAccessingFunctions) {
-        const Function *F = I->getParent()->getParent();
-        if (GS.AccessingFunction == 0)
-          GS.AccessingFunction = F;
-        else if (GS.AccessingFunction != F)
-          GS.HasMultipleAccessingFunctions = true;
-      }
-      if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
-        GS.isLoaded = true;
-        // Don't hack on volatile loads.
-        if (LI->isVolatile()) return true;
-        GS.Ordering = StrongerOrdering(GS.Ordering, LI->getOrdering());
-      } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
-        // Don't allow a store OF the address, only stores TO the address.
-        if (SI->getOperand(0) == V) return true;
-
-        // Don't hack on volatile stores.
-        if (SI->isVolatile()) return true;
-
-        GS.Ordering = StrongerOrdering(GS.Ordering, SI->getOrdering());
-
-        // If this is a direct store to the global (i.e., the global is a scalar
-        // value, not an aggregate), keep more specific information about
-        // stores.
-        if (GS.StoredType != GlobalStatus::isStored) {
-          if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(
-                                                           SI->getOperand(1))) {
-            Value *StoredVal = SI->getOperand(0);
-
-            if (Constant *C = dyn_cast<Constant>(StoredVal)) {
-              if (C->isThreadDependent()) {
-                // The stored value changes between threads; don't track it.
-                return true;
-              }
-            }
-
-            if (StoredVal == GV->getInitializer()) {
-              if (GS.StoredType < GlobalStatus::isInitializerStored)
-                GS.StoredType = GlobalStatus::isInitializerStored;
-            } else if (isa<LoadInst>(StoredVal) &&
-                       cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
-              if (GS.StoredType < GlobalStatus::isInitializerStored)
-                GS.StoredType = GlobalStatus::isInitializerStored;
-            } else if (GS.StoredType < GlobalStatus::isStoredOnce) {
-              GS.StoredType = GlobalStatus::isStoredOnce;
-              GS.StoredOnceValue = StoredVal;
-            } else if (GS.StoredType == GlobalStatus::isStoredOnce &&
-                       GS.StoredOnceValue == StoredVal) {
-              // noop.
-            } else {
-              GS.StoredType = GlobalStatus::isStored;
-            }
-          } else {
-            GS.StoredType = GlobalStatus::isStored;
-          }
-        }
-      } else if (isa<BitCastInst>(I)) {
-        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (isa<GetElementPtrInst>(I)) {
-        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (isa<SelectInst>(I)) {
-        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
-        // PHI nodes we can check just like select or GEP instructions, but we
-        // have to be careful about infinite recursion.
-        if (PHIUsers.insert(PN))  // Not already visited.
-          if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (isa<CmpInst>(I)) {
-        GS.isCompared = true;
-      } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
-        if (MTI->isVolatile()) return true;
-        if (MTI->getArgOperand(0) == V)
-          GS.StoredType = GlobalStatus::isStored;
-        if (MTI->getArgOperand(1) == V)
-          GS.isLoaded = true;
-      } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
-        assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
-        if (MSI->isVolatile()) return true;
-        GS.StoredType = GlobalStatus::isStored;
-      } else {
-        return true;  // Any other non-load instruction might take address!
-      }
-    } else if (const Constant *C = dyn_cast<Constant>(U)) {
-      GS.HasNonInstructionUser = true;
-      // We might have a dead and dangling constant hanging off of here.
-      if (!SafeToDestroyConstant(C))
-        return true;
-    } else {
-      GS.HasNonInstructionUser = true;
-      // Otherwise must be some other user.
-      return true;
-    }
-  }
-
-  return false;
-}
-
 /// isLeakCheckerRoot - Is this global variable possibly used by a leak checker
 /// as a root?  If so, we might not really want to eliminate the stores to it.
 static bool isLeakCheckerRoot(GlobalVariable *GV) {
@@ -434,7 +231,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
         Changed = true;
       }
     } else if (Constant *C = dyn_cast<Constant>(U)) {
-      if (SafeToDestroyConstant(C)) {
+      if (isSafeToDestroyConstant(C)) {
         C->destroyConstant();
         // This could have invalidated UI, start over from scratch.
         Dead.clear();
@@ -471,9 +268,17 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
 static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
                                        DataLayout *TD, TargetLibraryInfo *TLI) {
   bool Changed = false;
-  SmallVector<User*, 8> WorkList(V->use_begin(), V->use_end());
+  // Note that we need to use a weak value handle for the worklist items. When
+  // we delete a constant array, we may also be holding pointer to one of its
+  // elements (or an element of one of its elements if we're dealing with an
+  // array of arrays) in the worklist.
+  SmallVector<WeakVH, 8> WorkList(V->use_begin(), V->use_end());
   while (!WorkList.empty()) {
-    User *U = WorkList.pop_back_val();
+    Value *UV = WorkList.pop_back_val();
+    if (!UV)
+      continue;
+
+    User *U = cast<User>(UV);
 
     if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
       if (Init) {
@@ -534,7 +339,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
     } else if (Constant *C = dyn_cast<Constant>(U)) {
       // If we have a chain of dead constantexprs or other things dangling from
       // us, and if they are all dead, nuke them without remorse.
-      if (SafeToDestroyConstant(C)) {
+      if (isSafeToDestroyConstant(C)) {
         C->destroyConstant();
         CleanupConstantGlobalUsers(V, Init, TD, TLI);
         return true;
@@ -549,7 +354,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
 static bool isSafeSROAElementUse(Value *V) {
   // We might have a dead and dangling constant hanging off of here.
   if (Constant *C = dyn_cast<Constant>(V))
-    return SafeToDestroyConstant(C);
+    return isSafeToDestroyConstant(C);
 
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I) return false;
@@ -1373,8 +1178,7 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
   } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
     // PN's type is pointer to struct.  Make a new PHI of pointer to struct
     // field.
-    StructType *ST =
-      cast<StructType>(cast<PointerType>(PN->getType())->getElementType());
+    StructType *ST = cast<StructType>(PN->getType()->getPointerElementType());
 
     PHINode *NewPN =
      PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)),
@@ -1505,7 +1309,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
     unsigned TypeSize = TD->getTypeAllocSize(FieldTy);
     if (StructType *ST = dyn_cast<StructType>(FieldTy))
       TypeSize = TD->getStructLayout(ST)->getSizeInBytes();
-    Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+    Type *IntPtrTy = TD->getIntPtrType(CI->getType());
     Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
                                         ConstantInt::get(IntPtrTy, TypeSize),
                                         NElems, 0,
@@ -1735,7 +1539,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
     // If this is a fixed size array, transform the Malloc to be an alloc of
     // structs.  malloc [100 x struct],1 -> malloc struct, 100
     if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
-      Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+      Type *IntPtrTy = TD->getIntPtrType(CI->getType());
       unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes();
       Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
       Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
@@ -1917,13 +1721,12 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
   if (!GV->hasLocalLinkage())
     return false;
 
-  SmallPtrSet<const PHINode*, 16> PHIUsers;
   GlobalStatus GS;
 
-  if (AnalyzeGlobal(GV, GS, PHIUsers))
+  if (GlobalStatus::analyzeGlobal(GV, GS))
     return false;
 
-  if (!GS.isCompared && !GV->hasUnnamedAddr()) {
+  if (!GS.IsCompared && !GV->hasUnnamedAddr()) {
     GV->setUnnamedAddr(true);
     NumUnnamed++;
   }
@@ -1931,14 +1734,13 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
   if (GV->isConstant() || !GV->hasInitializer())
     return false;
 
-  return ProcessInternalGlobal(GV, GVI, PHIUsers, GS);
+  return ProcessInternalGlobal(GV, GVI, GS);
 }
 
 /// ProcessInternalGlobal - Analyze the specified global variable and optimize
 /// it if possible.  If we make a change, return true.
 bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
                                       Module::global_iterator &GVI,
-                                const SmallPtrSet<const PHINode*, 16> &PHIUsers,
                                       const GlobalStatus &GS) {
   // If this is a first class global and has only one accessing function
   // and this function is main (which we know is not recursive), we replace
@@ -1971,7 +1773,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
 
   // If the global is never loaded (but may be stored to), it is dead.
   // Delete it now.
-  if (!GS.isLoaded) {
+  if (!GS.IsLoaded) {
     DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV);
 
     bool Changed;
@@ -1992,7 +1794,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
     }
     return Changed;
 
-  } else if (GS.StoredType <= GlobalStatus::isInitializerStored) {
+  } else if (GS.StoredType <= GlobalStatus::InitializerStored) {
     DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
     GV->setConstant(true);
 
@@ -2015,7 +1817,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
         GVI = FirstNewGV;  // Don't skip the newly produced globals!
         return true;
       }
-  } else if (GS.StoredType == GlobalStatus::isStoredOnce) {
+  } else if (GS.StoredType == GlobalStatus::StoredOnce) {
     // If the initial value for the global was an undef value, and if only
     // one other value was stored into it, we can just change the
     // initializer to be the stored value, then delete all stores to the
@@ -2048,11 +1850,14 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
 
     // Otherwise, if the global was not a boolean, we can shrink it to be a
     // boolean.
-    if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
-      if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
-        ++NumShrunkToBool;
-        return true;
+    if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
+      if (GS.Ordering == NotAtomic) {
+        if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+          ++NumShrunkToBool;
+          return true;
+        }
       }
+    }
   }
 
   return false;
@@ -2210,8 +2015,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
   CSVals[1] = 0;
 
   StructType *StructTy =
-    cast <StructType>(
-    cast<ArrayType>(GCL->getType()->getElementType())->getElementType());
+    cast<StructType>(GCL->getType()->getElementType()->getArrayElementType());
 
   // Create the new init list.
   std::vector<Constant*> CAList;
@@ -3041,14 +2845,8 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) {
   return true;
 }
 
-static int compareNames(const void *A, const void *B) {
-  const GlobalValue *VA = *reinterpret_cast<GlobalValue* const*>(A);
-  const GlobalValue *VB = *reinterpret_cast<GlobalValue* const*>(B);
-  if (VA->getName() < VB->getName())
-    return -1;
-  if (VB->getName() < VA->getName())
-    return 1;
-  return 0;
+static int compareNames(Constant *const *A, Constant *const *B) {
+  return (*A)->getName().compare((*B)->getName());
 }
 
 static void setUsedInitializer(GlobalVariable &V,
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index a0095da..437597e 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -63,7 +63,7 @@ public:
 char AlwaysInliner::ID = 0;
 INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",
                 "Inliner for always_inline functions", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
 INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
                 "Inliner for always_inline functions", false, false)
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index a4f7026..57379a3 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
 
 namespace {
 
-/// \brief Actaul inliner pass implementation.
+/// \brief Actual inliner pass implementation.
 ///
 /// The common implementation of the inlining logic is shared between this
 /// inliner pass and the always inliner pass. The two passes use different cost
@@ -61,7 +61,7 @@ public:
 char SimpleInliner::ID = 0;
 INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",
                 "Function Integration/Inlining", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
 INITIALIZE_PASS_END(SimpleInliner, "inline",
                 "Function Integration/Inlining", false, false)
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
index d56a06f..64e2ced 100644
--- a/lib/Transforms/IPO/Internalize.cpp
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -11,6 +11,12 @@
 // If the function or variable is not in the list of external names given to
 // the pass it is marked as internal.
 //
+// This transformation would not be legal in a regular compilation, but it gets
+// extra information from the linker about what is safe.
+//
+// For example: Internalizing a function with external linkage. Only if we are
+// told it is only used from within this module, it is safe to do it.
+//
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "internalize"
@@ -23,6 +29,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <fstream>
 #include <set>
@@ -50,10 +57,8 @@ namespace {
   public:
     static char ID; // Pass identification, replacement for typeid
     explicit InternalizePass();
-    explicit InternalizePass(ArrayRef<const char *> exportList);
+    explicit InternalizePass(ArrayRef<const char *> ExportList);
     void LoadFile(const char *Filename);
-    void ClearExportList();
-    void AddToExportList(const std::string &val);
     virtual bool runOnModule(Module &M);
 
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
@@ -72,15 +77,14 @@ InternalizePass::InternalizePass()
   initializeInternalizePassPass(*PassRegistry::getPassRegistry());
   if (!APIFile.empty())           // If a filename is specified, use it.
     LoadFile(APIFile.c_str());
-  if (!APIList.empty())           // If a list is specified, use it as well.
-    ExternalNames.insert(APIList.begin(), APIList.end());
+  ExternalNames.insert(APIList.begin(), APIList.end());
 }
 
-InternalizePass::InternalizePass(ArrayRef<const char *> exportList)
+InternalizePass::InternalizePass(ArrayRef<const char *> ExportList)
   : ModulePass(ID){
   initializeInternalizePassPass(*PassRegistry::getPassRegistry());
-  for(ArrayRef<const char *>::const_iterator itr = exportList.begin();
-        itr != exportList.end(); itr++) {
+  for(ArrayRef<const char *>::const_iterator itr = ExportList.begin();
+        itr != ExportList.end(); itr++) {
     ExternalNames.insert(*itr);
   }
 }
@@ -101,12 +105,25 @@ void InternalizePass::LoadFile(const char *Filename) {
   }
 }
 
-void InternalizePass::ClearExportList() {
-  ExternalNames.clear();
-}
+static bool shouldInternalize(const GlobalValue &GV,
+                              const std::set<std::string> &ExternalNames) {
+  // Function must be defined here
+  if (GV.isDeclaration())
+    return false;
+
+  // Available externally is really just a "declaration with a body".
+  if (GV.hasAvailableExternallyLinkage())
+    return false;
 
-void InternalizePass::AddToExportList(const std::string &val) {
-  ExternalNames.insert(val);
+  // Already has internal linkage
+  if (GV.hasLocalLinkage())
+    return false;
+
+  // Marked to keep external?
+  if (ExternalNames.count(GV.getName()))
+    return false;
+
+  return true;
 }
 
 bool InternalizePass::runOnModule(Module &M) {
@@ -114,11 +131,6 @@ bool InternalizePass::runOnModule(Module &M) {
   CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0;
   bool Changed = false;
 
-  // Never internalize functions which code-gen might insert.
-  // FIXME: We should probably add this (and the __stack_chk_guard) via some
-  // type of call-back in CodeGen.
-  ExternalNames.insert("__stack_chk_fail");
-
   SmallPtrSet<GlobalValue *, 8> Used;
   collectUsedGlobalVariables(M, Used, false);
 
@@ -139,19 +151,20 @@ bool InternalizePass::runOnModule(Module &M) {
 
   // Mark all functions not in the api as internal.
   // FIXME: maybe use private linkage?
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
-    if (!I->isDeclaration() &&         // Function must be defined here
-        // Available externally is really just a "declaration with a body".
-        !I->hasAvailableExternallyLinkage() &&
-        !I->hasLocalLinkage() &&  // Can't already have internal linkage
-        !ExternalNames.count(I->getName())) {// Not marked to keep external?
-      I->setLinkage(GlobalValue::InternalLinkage);
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    if (!shouldInternalize(*I, ExternalNames))
+      continue;
+
+    I->setLinkage(GlobalValue::InternalLinkage);
+
+    if (ExternalNode)
       // Remove a callgraph edge from the external node to this function.
-      if (ExternalNode) ExternalNode->removeOneAbstractEdgeTo((*CG)[I]);
-      Changed = true;
-      ++NumFunctions;
-      DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n");
-    }
+      ExternalNode->removeOneAbstractEdgeTo((*CG)[I]);
+
+    Changed = true;
+    ++NumFunctions;
+    DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n");
+  }
 
   // Never internalize the llvm.used symbol.  It is used to implement
   // attribute((used)).
@@ -166,35 +179,36 @@ bool InternalizePass::runOnModule(Module &M) {
   ExternalNames.insert("llvm.global.annotations");
 
   // Never internalize symbols code-gen inserts.
+  // FIXME: We should probably add this (and the __stack_chk_guard) via some
+  // type of call-back in CodeGen.
+  ExternalNames.insert("__stack_chk_fail");
   ExternalNames.insert("__stack_chk_guard");
 
   // Mark all global variables with initializers that are not in the api as
   // internal as well.
   // FIXME: maybe use private linkage?
   for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-       I != E; ++I)
-    if (!I->isDeclaration() && !I->hasLocalLinkage() &&
-        // Available externally is really just a "declaration with a body".
-        !I->hasAvailableExternallyLinkage() &&
-        !ExternalNames.count(I->getName())) {
-      I->setLinkage(GlobalValue::InternalLinkage);
-      Changed = true;
-      ++NumGlobals;
-      DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n");
-    }
+       I != E; ++I) {
+    if (!shouldInternalize(*I, ExternalNames))
+      continue;
+
+    I->setLinkage(GlobalValue::InternalLinkage);
+    Changed = true;
+    ++NumGlobals;
+    DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n");
+  }
 
   // Mark all aliases that are not in the api as internal as well.
   for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
-       I != E; ++I)
-    if (!I->isDeclaration() && !I->hasInternalLinkage() &&
-        // Available externally is really just a "declaration with a body".
-        !I->hasAvailableExternallyLinkage() &&
-        !ExternalNames.count(I->getName())) {
-      I->setLinkage(GlobalValue::InternalLinkage);
-      Changed = true;
-      ++NumAliases;
-      DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n");
-    }
+       I != E; ++I) {
+    if (!shouldInternalize(*I, ExternalNames))
+      continue;
+
+    I->setLinkage(GlobalValue::InternalLinkage);
+    Changed = true;
+    ++NumAliases;
+    DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n");
+  }
 
   return Changed;
 }
@@ -203,6 +217,6 @@ ModulePass *llvm::createInternalizePass() {
   return new InternalizePass();
 }
 
-ModulePass *llvm::createInternalizePass(ArrayRef<const char *> el) {
-  return new InternalizePass(el);
+ModulePass *llvm::createInternalizePass(ArrayRef<const char *> ExportList) {
+  return new InternalizePass(ExportList);
 }
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 4ce749c..3861421 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -210,16 +210,20 @@ private:
 // Any two pointers in the same address space are equivalent, intptr_t and
 // pointers are equivalent. Otherwise, standard type equivalence rules apply.
 bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const {
+
+  PointerType *PTy1 = dyn_cast<PointerType>(Ty1);
+  PointerType *PTy2 = dyn_cast<PointerType>(Ty2);
+
+  if (TD) {
+    if (PTy1 && PTy1->getAddressSpace() == 0) Ty1 = TD->getIntPtrType(Ty1);
+    if (PTy2 && PTy2->getAddressSpace() == 0) Ty2 = TD->getIntPtrType(Ty2);
+  }
+
   if (Ty1 == Ty2)
     return true;
-  if (Ty1->getTypeID() != Ty2->getTypeID()) {
-    if (TD) {
-      LLVMContext &Ctx = Ty1->getContext();
-      if (isa<PointerType>(Ty1) && Ty2 == TD->getIntPtrType(Ctx)) return true;
-      if (isa<PointerType>(Ty2) && Ty1 == TD->getIntPtrType(Ctx)) return true;
-    }
+
+  if (Ty1->getTypeID() != Ty2->getTypeID())
     return false;
-  }
 
   switch (Ty1->getTypeID()) {
   default:
@@ -241,8 +245,7 @@ bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const {
     return true;
 
   case Type::PointerTyID: {
-    PointerType *PTy1 = cast<PointerType>(Ty1);
-    PointerType *PTy2 = cast<PointerType>(Ty2);
+    assert(PTy1 && PTy2 && "Both types must be pointers here.");
     return PTy1->getAddressSpace() == PTy2->getAddressSpace();
   }
 
@@ -352,14 +355,19 @@ bool FunctionComparator::isEquivalentOperation(const Instruction *I1,
 // Determine whether two GEP operations perform the same underlying arithmetic.
 bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1,
                                          const GEPOperator *GEP2) {
-  // When we have target data, we can reduce the GEP down to the value in bytes
-  // added to the address.
-  unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 1;
-  APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0);
-  if (TD &&
-      GEP1->accumulateConstantOffset(*TD, Offset1) &&
-      GEP2->accumulateConstantOffset(*TD, Offset2)) {
-    return Offset1 == Offset2;
+  unsigned AS = GEP1->getPointerAddressSpace();
+  if (AS != GEP2->getPointerAddressSpace())
+    return false;
+
+  if (TD) {
+    // When we have target data, we can reduce the GEP down to the value in bytes
+    // added to the address.
+    unsigned BitWidth = TD ? TD->getPointerSizeInBits(AS) : 1;
+    APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0);
+    if (GEP1->accumulateConstantOffset(*TD, Offset1) &&
+        GEP2->accumulateConstantOffset(*TD, Offset2)) {
+      return Offset1 == Offset2;
+    }
   }
 
   if (GEP1->getPointerOperand()->getType() !=
@@ -713,6 +721,19 @@ void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
   writeThunk(F, G);
 }
 
+// Helper for writeThunk,
+// Selects proper bitcast operation,
+// but a bit simplier then CastInst::getCastOpcode.
+static Value* createCast(IRBuilder<false> &Builder, Value *V, Type *DestTy) {
+  Type *SrcTy = V->getType();
+  if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
+    return Builder.CreateIntToPtr(V, DestTy);
+  else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+    return Builder.CreatePtrToInt(V, DestTy);
+  else
+    return Builder.CreateBitCast(V, DestTy);
+}
+
 // Replace G with a simple tail call to bitcast(F). Also replace direct uses
 // of G with bitcast(F). Deletes G.
 void MergeFunctions::writeThunk(Function *F, Function *G) {
@@ -738,7 +759,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
   FunctionType *FFTy = F->getFunctionType();
   for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end();
        AI != AE; ++AI) {
-    Args.push_back(Builder.CreateBitCast(AI, FFTy->getParamType(i)));
+    Args.push_back(createCast(Builder, (Value*)AI, FFTy->getParamType(i)));
     ++i;
   }
 
@@ -748,13 +769,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
   if (NewG->getReturnType()->isVoidTy()) {
     Builder.CreateRetVoid();
   } else {
-    Type *RetTy = NewG->getReturnType();
-    if (CI->getType()->isIntegerTy() && RetTy->isPointerTy())
-      Builder.CreateRet(Builder.CreateIntToPtr(CI, RetTy));
-    else if (CI->getType()->isPointerTy() && RetTy->isIntegerTy())
-      Builder.CreateRet(Builder.CreatePtrToInt(CI, RetTy));
-    else
-      Builder.CreateRet(Builder.CreateBitCast(CI, RetTy));
+    Builder.CreateRet(createCast(Builder, CI, NewG->getReturnType()));
   }
 
   NewG->copyAttributesFrom(G);
@@ -829,6 +844,18 @@ bool MergeFunctions::insert(ComparableFunction &NewF) {
 
   const ComparableFunction &OldF = *Result.first;
 
+  // Don't merge tiny functions, since it can just end up making the function
+  // larger.
+  // FIXME: Should still merge them if they are unnamed_addr and produce an
+  // alias.
+  if (NewF.getFunc()->size() == 1) {
+    if (NewF.getFunc()->front().size() <= 2) {
+      DEBUG(dbgs() << NewF.getFunc()->getName()
+            << " is to small to bother merging\n");
+      return false;
+    }
+  }
+
   // Never thunk a strong function to a weak function.
   assert(!OldF.getFunc()->mayBeOverridden() ||
          NewF.getFunc()->mayBeOverridden());
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index a6b3f4e..24c5018 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -29,20 +29,20 @@
 using namespace llvm;
 
 static cl::opt<bool>
-RunLoopVectorization("vectorize-loops",
+RunLoopVectorization("vectorize-loops", cl::Hidden,
                      cl::desc("Run the Loop vectorization passes"));
 
 static cl::opt<bool>
-LateVectorization("late-vectorize", cl::init(false), cl::Hidden,
+LateVectorization("late-vectorize", cl::init(true), cl::Hidden,
                   cl::desc("Run the vectorization pasess late in the pass "
                            "pipeline (after the inliner)"));
 
 static cl::opt<bool>
-RunSLPVectorization("vectorize-slp",
+RunSLPVectorization("vectorize-slp", cl::Hidden,
                     cl::desc("Run the SLP vectorization passes"));
 
 static cl::opt<bool>
-RunBBVectorization("vectorize-slp-aggressive",
+RunBBVectorization("vectorize-slp-aggressive", cl::Hidden,
                     cl::desc("Run the BB vectorization passes"));
 
 static cl::opt<bool>
@@ -54,6 +54,10 @@ static cl::opt<bool> UseNewSROA("use-new-sroa",
   cl::init(true), cl::Hidden,
   cl::desc("Enable the new, experimental SROA pass"));
 
+static cl::opt<bool>
+RunLoopRerolling("reroll-loops", cl::Hidden,
+                 cl::desc("Run the loop rerolling pass"));
+
 PassManagerBuilder::PassManagerBuilder() {
     OptLevel = 2;
     SizeLevel = 0;
@@ -65,6 +69,7 @@ PassManagerBuilder::PassManagerBuilder() {
     SLPVectorize = RunSLPVectorization;
     LoopVectorize = RunLoopVectorization;
     LateVectorize = LateVectorization;
+    RerollLoops = RunLoopRerolling;
 }
 
 PassManagerBuilder::~PassManagerBuilder() {
@@ -195,8 +200,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
   MPM.add(createLoopDeletionPass());          // Delete dead loops
 
-  if (!LateVectorize && LoopVectorize && OptLevel > 1 && SizeLevel < 2)
-      MPM.add(createLoopVectorizePass());
+  if (!LateVectorize && LoopVectorize)
+      MPM.add(createLoopVectorizePass(DisableUnrollLoops));
 
   if (!DisableUnrollLoops)
     MPM.add(createLoopUnrollPass());          // Unroll small loops
@@ -216,22 +221,22 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
 
   addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
 
-  if (!LateVectorize) {
-    if (SLPVectorize)
-      MPM.add(createSLPVectorizerPass());   // Vectorize parallel scalar chains.
-
-    if (BBVectorize) {
-      MPM.add(createBBVectorizePass());
-      MPM.add(createInstructionCombiningPass());
-      if (OptLevel > 1 && UseGVNAfterVectorization)
-        MPM.add(createGVNPass());           // Remove redundancies
-      else
-        MPM.add(createEarlyCSEPass());      // Catch trivial redundancies
-
-      // BBVectorize may have significantly shortened a loop body; unroll again.
-      if (!DisableUnrollLoops)
-        MPM.add(createLoopUnrollPass());
-    }
+  if (RerollLoops)
+    MPM.add(createLoopRerollPass());
+  if (SLPVectorize)
+    MPM.add(createSLPVectorizerPass());   // Vectorize parallel scalar chains.
+
+  if (BBVectorize) {
+    MPM.add(createBBVectorizePass());
+    MPM.add(createInstructionCombiningPass());
+    if (OptLevel > 1 && UseGVNAfterVectorization)
+      MPM.add(createGVNPass());           // Remove redundancies
+    else
+      MPM.add(createEarlyCSEPass());      // Catch trivial redundancies
+
+    // BBVectorize may have significantly shortened a loop body; unroll again.
+    if (!DisableUnrollLoops)
+      MPM.add(createLoopUnrollPass());
   }
 
   MPM.add(createAggressiveDCEPass());         // Delete dead instructions
@@ -241,7 +246,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   // As an experimental mode, run any vectorization passes in a separate
   // pipeline from the CGSCC pass manager that runs iteratively with the
   // inliner.
-  if (LateVectorize) {
+  if (LateVectorize && LoopVectorize) {
     // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
     // pass manager that we are specifically trying to avoid. To prevent this
     // we must insert a no-op module pass to reset the pass manager.
@@ -249,35 +254,9 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
 
     // Add the various vectorization passes and relevant cleanup passes for
     // them since we are no longer in the middle of the main scalar pipeline.
-    if (LoopVectorize && OptLevel > 1 && SizeLevel < 2) {
-      MPM.add(createLoopVectorizePass());
-
-      if (!DisableUnrollLoops)
-        MPM.add(createLoopUnrollPass());    // Unroll small loops
-
-      // FIXME: Is this necessary/useful? Should we also do SimplifyCFG?
-      MPM.add(createInstructionCombiningPass());
-    }
-
-    if (SLPVectorize) {
-      MPM.add(createSLPVectorizerPass());   // Vectorize parallel scalar chains.
-
-      // FIXME: Is this necessary/useful? Should we also do SimplifyCFG?
-      MPM.add(createInstructionCombiningPass());
-    }
-
-    if (BBVectorize) {
-      MPM.add(createBBVectorizePass());
-      MPM.add(createInstructionCombiningPass());
-      if (OptLevel > 1 && UseGVNAfterVectorization)
-        MPM.add(createGVNPass());           // Remove redundancies
-      else
-        MPM.add(createEarlyCSEPass());      // Catch trivial redundancies
-
-      // BBVectorize may have significantly shortened a loop body; unroll again.
-      if (!DisableUnrollLoops)
-        MPM.add(createLoopUnrollPass());
-    }
+    MPM.add(createLoopVectorizePass(DisableUnrollLoops));
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createCFGSimplificationPass());
   }
 
   if (!DisableUnitAtATime) {
@@ -304,11 +283,8 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
   // Now that composite has been compiled, scan through the module, looking
   // for a main function.  If main is defined, mark all other functions
   // internal.
-  if (Internalize) {
-    std::vector<const char*> E;
-    E.push_back("main");
-    PM.add(createInternalizePass(E));
-  }
+  if (Internalize)
+    PM.add(createInternalizePass("main"));
 
   // Propagate constants at call sites into the functions they call.  This
   // opens opportunities for globalopt (and inlining) by substituting function
@@ -349,6 +325,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
   // The IPO passes may leave cruft around.  Clean up after them.
   PM.add(createInstructionCombiningPass());
   PM.add(createJumpThreadingPass());
+
   // Break up allocas
   if (UseNewSROA)
     PM.add(createSROAPass());
@@ -362,6 +339,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
   PM.add(createLICMPass());                 // Hoist loop invariants.
   PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
   PM.add(createMemCpyOptPass());            // Remove dead memcpys.
+
   // Nuke dead stores.
   PM.add(createDeadStoreEliminationPass());
 
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index 89529de..b160913 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -51,7 +51,7 @@ namespace {
 char PruneEH::ID = 0;
 INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
                 "Remove unused exception handling info", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_END(PruneEH, "prune-eh",
                 "Remove unused exception handling info", false, false)
 
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 2791106..c4f5cfc 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -9,7 +9,7 @@
 //
 // The StripSymbols transformation implements code stripping. Specifically, it
 // can delete:
-// 
+//
 //   * names for virtual registers
 //   * symbols for internal globals and functions
 //   * debug information
@@ -39,7 +39,7 @@ namespace {
     bool OnlyDebugInfo;
   public:
     static char ID; // Pass identification, replacement for typeid
-    explicit StripSymbols(bool ODI = false) 
+    explicit StripSymbols(bool ODI = false)
       : ModulePass(ID), OnlyDebugInfo(ODI) {
         initializeStripSymbolsPass(*PassRegistry::getPassRegistry());
       }
@@ -144,7 +144,7 @@ static void RemoveDeadConstant(Constant *C) {
   assert(C->use_empty() && "Constant is not dead!");
   SmallPtrSet<Constant*, 4> Operands;
   for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
-    if (OnlyUsedBy(C->getOperand(i), C)) 
+    if (OnlyUsedBy(C->getOperand(i), C))
       Operands.insert(cast<Constant>(C->getOperand(i)));
   if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
     if (!GV->hasLocalLinkage()) return;   // Don't delete non static globals.
@@ -182,7 +182,7 @@ static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
   for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
     StructType *STy = StructTypes[i];
     if (STy->isLiteral() || STy->getName().empty()) continue;
-    
+
     if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg"))
       continue;
 
@@ -199,7 +199,7 @@ static void findUsedValues(GlobalVariable *LLVMUsed,
   ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
 
   for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
-    if (GlobalValue *GV = 
+    if (GlobalValue *GV =
           dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
       UsedValues.insert(GV);
 }
@@ -217,71 +217,20 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
       if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
         I->setName("");     // Internal symbols can't participate in linkage
   }
-  
+
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
     if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
       if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
         I->setName("");     // Internal symbols can't participate in linkage
     StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo);
   }
-  
+
   // Remove all names from types.
   StripTypeNames(M, PreserveDbgInfo);
 
   return true;
 }
 
-// StripDebugInfo - Strip debug info in the module if it exists.  
-// To do this, we remove llvm.dbg.func.start, llvm.dbg.stoppoint, and 
-// llvm.dbg.region.end calls, and any globals they point to if now dead.
-static bool StripDebugInfo(Module &M) {
-
-  bool Changed = false;
-
-  // Remove all of the calls to the debugger intrinsics, and remove them from
-  // the module.
-  if (Function *Declare = M.getFunction("llvm.dbg.declare")) {
-    while (!Declare->use_empty()) {
-      CallInst *CI = cast<CallInst>(Declare->use_back());
-      CI->eraseFromParent();
-    }
-    Declare->eraseFromParent();
-    Changed = true;
-  }
-
-  if (Function *DbgVal = M.getFunction("llvm.dbg.value")) {
-    while (!DbgVal->use_empty()) {
-      CallInst *CI = cast<CallInst>(DbgVal->use_back());
-      CI->eraseFromParent();
-    }
-    DbgVal->eraseFromParent();
-    Changed = true;
-  }
-
-  for (Module::named_metadata_iterator NMI = M.named_metadata_begin(),
-         NME = M.named_metadata_end(); NMI != NME;) {
-    NamedMDNode *NMD = NMI;
-    ++NMI;
-    if (NMD->getName().startswith("llvm.dbg.")) {
-      NMD->eraseFromParent();
-      Changed = true;
-    }
-  }
-
-  for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI)
-    for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE;
-         ++FI)
-      for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE;
-           ++BI) {
-        if (!BI->getDebugLoc().isUnknown()) {
-          Changed = true;
-          BI->setDebugLoc(DebugLoc());
-        }
-      }
-
-  return Changed;
-}
-
 bool StripSymbols::runOnModule(Module &M) {
   bool Changed = false;
   Changed |= StripDebugInfo(M);
@@ -307,13 +256,13 @@ bool StripDebugDeclare::runOnModule(Module &M) {
       assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
       CI->eraseFromParent();
       if (Arg1->use_empty()) {
-        if (Constant *C = dyn_cast<Constant>(Arg1)) 
+        if (Constant *C = dyn_cast<Constant>(Arg1))
           DeadConstants.push_back(C);
-        else 
+        else
           RecursivelyDeleteTriviallyDeadInstructions(Arg1);
       }
       if (Arg2->use_empty())
-        if (Constant *C = dyn_cast<Constant>(Arg2)) 
+        if (Constant *C = dyn_cast<Constant>(Arg2))
           DeadConstants.push_back(C);
     }
     Declare->eraseFromParent();
@@ -332,76 +281,107 @@ bool StripDebugDeclare::runOnModule(Module &M) {
   return true;
 }
 
+/// Remove any debug info for global variables/functions in the given module for
+/// which said global variable/function no longer exists (i.e. is null).
+///
+/// Debugging information is encoded in llvm IR using metadata. This is designed
+/// such a way that debug info for symbols preserved even if symbols are
+/// optimized away by the optimizer. This special pass removes debug info for
+/// such symbols.
 bool StripDeadDebugInfo::runOnModule(Module &M) {
   bool Changed = false;
 
-  // Debugging infomration is encoded in llvm IR using metadata. This is designed
-  // such a way that debug info for symbols preserved even if symbols are
-  // optimized away by the optimizer. This special pass removes debug info for 
-  // such symbols.
-
-  // llvm.dbg.gv keeps track of debug info for global variables.
-  if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv")) {
-    SmallVector<MDNode *, 8> MDs;
-    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
-      if (NMD->getOperand(i)) {
-        assert(DIGlobalVariable(NMD->getOperand(i)).isGlobalVariable() &&
-          "A MDNode in llvm.dbg.gv should be a DIGlobalVariable.");
-        MDs.push_back(NMD->getOperand(i));
-      }
-      else
-        Changed = true;
-    NMD->eraseFromParent();
-    NMD = NULL;
-
-    for (SmallVectorImpl<MDNode *>::iterator I = MDs.begin(),
-           E = MDs.end(); I != E; ++I) {
-      GlobalVariable *GV = DIGlobalVariable(*I).getGlobal();
-      if (GV && M.getGlobalVariable(GV->getName(), true)) {
-        if (!NMD)
-          NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv");
-        NMD->addOperand(*I);
-      }
+  LLVMContext &C = M.getContext();
+
+  // Find all debug info in F. This is actually overkill in terms of what we
+  // want to do, but we want to try and be as resilient as possible in the face
+  // of potential debug info changes by using the formal interfaces given to us
+  // as much as possible.
+  DebugInfoFinder F;
+  F.processModule(M);
+
+  // For each compile unit, find the live set of global variables/functions and
+  // replace the current list of potentially dead global variables/functions
+  // with the live list.
+  SmallVector<Value *, 64> LiveGlobalVariables;
+  SmallVector<Value *, 64> LiveSubprograms;
+  DenseSet<const MDNode *> VisitedSet;
+
+  for (DebugInfoFinder::iterator CI = F.compile_unit_begin(),
+         CE = F.compile_unit_end(); CI != CE; ++CI) {
+    // Create our compile unit.
+    DICompileUnit DIC(*CI);
+    assert(DIC.Verify() && "DIC must verify as a DICompileUnit.");
+
+    // Create our live subprogram list.
+    DIArray SPs = DIC.getSubprograms();
+    bool SubprogramChange = false;
+    for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
+      DISubprogram DISP(SPs.getElement(i));
+      assert(DISP.Verify() && "DISP must verify as a DISubprogram.");
+
+      // Make sure we visit each subprogram only once.
+      if (!VisitedSet.insert(DISP).second)
+        continue;
+
+      // If the function referenced by DISP is not null, the function is live.
+      if (DISP.getFunction())
+        LiveSubprograms.push_back(DISP);
       else
-        Changed = true;
+        SubprogramChange = true;
     }
-  }
 
-  // llvm.dbg.sp keeps track of debug info for subprograms.
-  if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.sp")) {
-    SmallVector<MDNode *, 8> MDs;
-    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
-      if (NMD->getOperand(i)) {
-        assert(DISubprogram(NMD->getOperand(i)).isSubprogram() &&
-          "A MDNode in llvm.dbg.sp should be a DISubprogram.");
-        MDs.push_back(NMD->getOperand(i));
-      }
+    // Create our live global variable list.
+    DIArray GVs = DIC.getGlobalVariables();
+    bool GlobalVariableChange = false;
+    for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) {
+      DIGlobalVariable DIG(GVs.getElement(i));
+      assert(DIG.Verify() && "DIG must verify as DIGlobalVariable.");
+
+      // Make sure we only visit each global variable only once.
+      if (!VisitedSet.insert(DIG).second)
+        continue;
+
+      // If the global variable referenced by DIG is not null, the global
+      // variable is live.
+      if (DIG.getGlobal())
+        LiveGlobalVariables.push_back(DIG);
       else
-        Changed = true;
-    NMD->eraseFromParent();
-    NMD = NULL;
-
-    for (SmallVectorImpl<MDNode *>::iterator I = MDs.begin(),
-           E = MDs.end(); I != E; ++I) {
-      bool FnIsLive = false;
-      if (Function *F = DISubprogram(*I).getFunction())
-        if (M.getFunction(F->getName()))
-          FnIsLive = true;
-      if (FnIsLive) {
-          if (!NMD)
-            NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
-          NMD->addOperand(*I);
-      } else {
-        // Remove llvm.dbg.lv.fnname named mdnode which may have been used
-        // to hold debug info for dead function's local variables.
-        StringRef FName = DISubprogram(*I).getLinkageName();
-        if (FName.empty())
-          FName = DISubprogram(*I).getName();
-        if (NamedMDNode *LVNMD = M.getNamedMetadata(
-                "llvm.dbg.lv." + Function::getRealLinkageName(FName)))
-          LVNMD->eraseFromParent();
-      }
+        GlobalVariableChange = true;
+    }
+
+    // If we found dead subprograms or global variables, replace the current
+    // subprogram list/global variable list with our new live subprogram/global
+    // variable list.
+    if (SubprogramChange) {
+      // Make sure that 9 is still the index of the subprograms. This is to make
+      // sure that an assert is hit if the location of the subprogram array
+      // changes. This is just to make sure that this is updated if such an
+      // event occurs.
+      assert(DIC->getNumOperands() >= 10 &&
+             SPs == DIC->getOperand(9) &&
+             "DICompileUnits is expected to store Subprograms in operand "
+             "9.");
+      DIC->replaceOperandWith(9, MDNode::get(C, LiveSubprograms));
+      Changed = true;
     }
+
+    if (GlobalVariableChange) {
+      // Make sure that 10 is still the index of global variables. This is to
+      // make sure that an assert is hit if the location of the subprogram array
+      // changes. This is just to make sure that this index is updated if such
+      // an event occurs.
+      assert(DIC->getNumOperands() >= 11 &&
+             GVs == DIC->getOperand(10) &&
+             "DICompileUnits is expected to store Global Variables in operand "
+             "10.");
+      DIC->replaceOperandWith(10, MDNode::get(C, LiveGlobalVariables));
+      Changed = true;
+    }
+
+    // Reset lists for the next iteration.
+    LiveSubprograms.clear();
+    LiveGlobalVariables.clear();
   }
 
   return Changed;
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index b3084cc..a5eddc2 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -158,8 +158,8 @@ public:
                               ConstantInt *DivRHS);
   Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI,
                               ConstantInt *DivRHS);
-  Instruction *FoldICmpAddOpCst(ICmpInst &ICI, Value *X, ConstantInt *CI,
-                                ICmpInst::Predicate Pred, Value *TheAdd);
+  Instruction *FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI,
+                                ICmpInst::Predicate Pred);
   Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
                            ICmpInst::Predicate Cond, Instruction &I);
   Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
@@ -178,6 +178,7 @@ public:
   Instruction *visitPtrToInt(PtrToIntInst &CI);
   Instruction *visitIntToPtr(IntToPtrInst &CI);
   Instruction *visitBitCast(BitCastInst &CI);
+  Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
   Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI,
                               Instruction *FI);
   Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*);
@@ -212,8 +213,8 @@ private:
   bool ShouldChangeType(Type *From, Type *To) const;
   Value *dyn_castNegVal(Value *V) const;
   Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const;
-  Type *FindElementAtOffset(Type *Ty, int64_t Offset,
-                                  SmallVectorImpl<Value*> &NewIndices);
+  Type *FindElementAtOffset(Type *PtrTy, int64_t Offset,
+                            SmallVectorImpl<Value*> &NewIndices);
   Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
 
   /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
@@ -271,7 +272,7 @@ public:
     if (&I == V)
       V = UndefValue::get(I.getType());
 
-    DEBUG(errs() << "IC: Replacing " << I << "\n"
+    DEBUG(dbgs() << "IC: Replacing " << I << "\n"
                     "    with " << *V << '\n');
 
     I.replaceAllUsesWith(V);
@@ -283,7 +284,7 @@ public:
   // instruction.  Instead, visit methods should return the value returned by
   // this function.
   Instruction *EraseInstFromFunction(Instruction &I) {
-    DEBUG(errs() << "IC: ERASE " << I << '\n');
+    DEBUG(dbgs() << "IC: ERASE " << I << '\n');
 
     assert(I.use_empty() && "Cannot erase instruction that is used!");
     // Make sure that we reprocess all operands now that we reduced their
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b474bd8..88bb69b 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -488,6 +488,26 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
   return result;
 }
 
+/// Convert an analysis of a masked ICmp into its equivalent if all boolean
+/// operations had the opposite sense. Since each "NotXXX" flag (recording !=)
+/// is adjacent to the corresponding normal flag (recording ==), this just
+/// involves swapping those bits over.
+static unsigned conjugateICmpMask(unsigned Mask) {
+  unsigned NewMask;
+  NewMask = (Mask & (FoldMskICmp_AMask_AllOnes | FoldMskICmp_BMask_AllOnes |
+                     FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed |
+                     FoldMskICmp_BMask_Mixed))
+            << 1;
+
+  NewMask |=
+      (Mask & (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_BMask_NotAllOnes |
+               FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed |
+               FoldMskICmp_BMask_NotMixed))
+      >> 1;
+
+  return NewMask;
+}
+
 /// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z)
 /// if possible. The returned predicate is either == or !=. Returns false if
 /// decomposition fails.
@@ -548,14 +568,22 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
     L21 = L22 = L1 = 0;
   } else {
     // Look for ANDs in the LHS icmp.
-    if (match(L1, m_And(m_Value(L11), m_Value(L12)))) {
-      if (!match(L2, m_And(m_Value(L21), m_Value(L22))))
-        L21 = L22 = 0;
-    } else {
-      if (!match(L2, m_And(m_Value(L11), m_Value(L12))))
-        return 0;
-      std::swap(L1, L2);
+    if (!L1->getType()->isIntegerTy()) {
+      // You can icmp pointers, for example. They really aren't masks.
+      L11 = L12 = 0;
+    } else if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) {
+      // Any icmp can be viewed as being trivially masked; if it allows us to
+      // remove one, it's worth it.
+      L11 = L1;
+      L12 = Constant::getAllOnesValue(L1->getType());
+    }
+
+    if (!L2->getType()->isIntegerTy()) {
+      // You can icmp pointers, for example. They really aren't masks.
       L21 = L22 = 0;
+    } else if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) {
+      L21 = L2;
+      L22 = Constant::getAllOnesValue(L2->getType());
     }
   }
 
@@ -576,7 +604,14 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
       return 0;
     }
     E = R2; R1 = 0; ok = true;
-  } else if (match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+  } else if (R1->getType()->isIntegerTy()) {
+    if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+      // As before, model no mask as a trivial mask if it'll let us do an
+      // optimisation.
+      R11 = R1;
+      R12 = Constant::getAllOnesValue(R1->getType());
+    }
+
     if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
       A = R11; D = R12; E = R2; ok = true;
     } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
@@ -589,7 +624,12 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
     return 0;
 
   // Look for ANDs in on the right side of the RHS icmp.
-  if (!ok && match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+  if (!ok && R2->getType()->isIntegerTy()) {
+    if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+      R11 = R2;
+      R12 = Constant::getAllOnesValue(R2->getType());
+    }
+
     if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
       A = R11; D = R12; E = R1; ok = true;
     } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
@@ -618,8 +658,7 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
 /// foldLogOpOfMaskedICmps:
 /// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
 /// into a single (icmp(A & X) ==/!= Y)
-static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
-                                     ICmpInst::Predicate NEWCC,
+static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
                                      llvm::InstCombiner::BuilderTy* Builder) {
   Value *A = 0, *B = 0, *C = 0, *D = 0, *E = 0;
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
@@ -629,8 +668,24 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
   assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) &&
          "foldLogOpOfMaskedICmpsHelper must return an equality predicate.");
 
-  if (NEWCC == ICmpInst::ICMP_NE)
-    mask >>= 1; // treat "Not"-states as normal states
+  // In full generality:
+  //     (icmp (A & B) Op C) | (icmp (A & D) Op E)
+  // ==  ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ]
+  //
+  // If the latter can be converted into (icmp (A & X) Op Y) then the former is
+  // equivalent to (icmp (A & X) !Op Y).
+  //
+  // Therefore, we can pretend for the rest of this function that we're dealing
+  // with the conjunction, provided we flip the sense of any comparisons (both
+  // input and output).
+
+  // In most cases we're going to produce an EQ for the "&&" case.
+  ICmpInst::Predicate NEWCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+  if (!IsAnd) {
+    // Convert the masking analysis into its equivalent with negated
+    // comparisons.
+    mask = conjugateICmpMask(mask);
+  }
 
   if (mask & FoldMskICmp_Mask_AllZeroes) {
     // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
@@ -657,6 +712,40 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
     Value* newAnd = Builder->CreateAnd(A, newAnd1);
     return Builder->CreateICmp(NEWCC, newAnd, A);
   }
+
+  // Remaining cases assume at least that B and D are constant, and depend on
+  // their actual values. This isn't strictly, necessary, just a "handle the
+  // easy cases for now" decision.
+  ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+  if (BCst == 0) return 0;
+  ConstantInt *DCst = dyn_cast<ConstantInt>(D);
+  if (DCst == 0) return 0;
+
+  if (mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) {
+    // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
+    // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+    //     -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
+    // Only valid if one of the masks is a superset of the other (check "B&D" is
+    // the same as either B or D).
+    APInt NewMask = BCst->getValue() & DCst->getValue();
+
+    if (NewMask == BCst->getValue())
+      return LHS;
+    else if (NewMask == DCst->getValue())
+      return RHS;
+  }
+  if (mask & FoldMskICmp_AMask_NotAllOnes) {
+    // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+    //     -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
+    // Only valid if one of the masks is a superset of the other (check "B|D" is
+    // the same as either B or D).
+    APInt NewMask = BCst->getValue() | DCst->getValue();
+
+    if (NewMask == BCst->getValue())
+      return LHS;
+    else if (NewMask == DCst->getValue())
+      return RHS;
+  }
   if (mask & FoldMskICmp_BMask_Mixed) {
     // (icmp eq (A & B), C) & (icmp eq (A & D), E)
     // We already know that B & C == C && D & E == E.
@@ -665,14 +754,9 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
     // contradict, then we can transform to
     // -> (icmp eq (A & (B|D)), (C|E))
     // Currently, we only handle the case of B, C, D, and E being constant.
-    ConstantInt *BCst = dyn_cast<ConstantInt>(B);
-    if (BCst == 0) return 0;
-    ConstantInt *DCst = dyn_cast<ConstantInt>(D);
-    if (DCst == 0) return 0;
     // we can't simply use C and E, because we might actually handle
     //   (icmp ne (A & B), B) & (icmp eq (A & D), D)
     // with B and D, having a single bit set
-
     ConstantInt *CCst = dyn_cast<ConstantInt>(C);
     if (CCst == 0) return 0;
     if (LHSCC != NEWCC)
@@ -715,7 +799,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   }
 
   // handle (roughly):  (icmp eq (A & B), C) & (icmp eq (A & D), E)
-  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_EQ, Builder))
+  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
     return V;
 
   // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
@@ -849,10 +933,15 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
     case ICmpInst::ICMP_SGT:        // (X != 13 & X s> 15) -> X s> 15
       return RHS;
     case ICmpInst::ICMP_NE:
+      // Special case to get the ordering right when the values wrap around
+      // zero.
+      if (LHSCst->getValue() == 0 && RHSCst->getValue().isAllOnesValue())
+        std::swap(LHSCst, RHSCst);
       if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
         Constant *AddCST = ConstantExpr::getNeg(LHSCst);
         Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
-        return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1));
+        return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1),
+                                      Val->getName()+".cmp");
       }
       break;                        // (X != 13 & X != 15) -> no change
     }
@@ -1454,10 +1543,60 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
   return 0;
 }
 
+/// IsOneHotValue - Returns true for "one-hot" values (values where at most
+/// one bit can be set).
+static bool IsOneHotValue(Value *V) {
+  // Match 1<<K.
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
+    if (BO->getOpcode() == Instruction::Shl) {
+      ConstantInt *One = dyn_cast<ConstantInt>(BO->getOperand(0));
+      return One && One->isOne();
+    }
+
+  // Check for power of two integer constants.
+  if (ConstantInt *K = dyn_cast<ConstantInt>(V))
+    return K->getValue().isPowerOf2();
+
+  return false;
+}
+
 /// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
 Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
 
+  // Fold (iszero(A & K1) | iszero(A & K2)) ->  (A & (K1 | K2)) != (K1 | K2)
+  // if K1 and K2 are a one-bit mask.
+  ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+
+  if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero() &&
+      RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
+
+    BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0));
+    BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0));
+    if (LAnd && RAnd && LAnd->hasOneUse() && RHS->hasOneUse() &&
+        LAnd->getOpcode() == Instruction::And &&
+        RAnd->getOpcode() == Instruction::And) {
+
+      Value *Mask = 0;
+      Value *Masked = 0;
+      if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
+          IsOneHotValue(LAnd->getOperand(1)) &&
+          IsOneHotValue(RAnd->getOperand(1))) {
+        Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
+        Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
+      } else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
+                 IsOneHotValue(LAnd->getOperand(0)) &&
+                 IsOneHotValue(RAnd->getOperand(0))) {
+        Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
+        Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
+      }
+
+      if (Masked)
+        return Builder->CreateICmp(ICmpInst::ICMP_NE, Masked, Mask);
+    }
+  }
+
   // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
   if (PredicatesFoldable(LHSCC, RHSCC)) {
     if (LHS->getOperand(0) == RHS->getOperand(1) &&
@@ -1474,13 +1613,10 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
 
   // handle (roughly):
   // (icmp ne (A & B), C) | (icmp ne (A & D), E)
-  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_NE, Builder))
+  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
     return V;
 
   Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
-  ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
-  ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
-
   if (LHS->hasOneUse() || RHS->hasOneUse()) {
     // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
     // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 9f74fd6..0cd7b14 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -999,20 +999,15 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
 
   // Check to see if we are changing the return type...
   if (OldRetTy != NewRetTy) {
-    if (Callee->isDeclaration() &&
-        // Conversion is ok if changing from one pointer type to another or from
-        // a pointer to an integer of the same size.
-        !((OldRetTy->isPointerTy() || !TD ||
-           OldRetTy == TD->getIntPtrType(Caller->getContext())) &&
-          (NewRetTy->isPointerTy() || !TD ||
-           NewRetTy == TD->getIntPtrType(Caller->getContext()))))
-      return false;   // Cannot transform this return value.
+    if (!CastInst::isBitCastable(NewRetTy, OldRetTy)) {
+      if (Callee->isDeclaration())
+        return false;   // Cannot transform this return value.
 
-    if (!Caller->use_empty() &&
-        // void -> non-void is handled specially
-        !NewRetTy->isVoidTy() &&
-        !CastInst::isBitCastable(NewRetTy, OldRetTy))
+      if (!Caller->use_empty() &&
+          // void -> non-void is handled specially
+          !NewRetTy->isVoidTy())
       return false;   // Cannot transform this return value.
+    }
 
     if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
       AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
@@ -1045,9 +1040,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     Type *ParamTy = FT->getParamType(i);
     Type *ActTy = (*AI)->getType();
 
-    if (!CastInst::isBitCastable(ActTy, ParamTy)) {
+    if (!CastInst::isBitCastable(ActTy, ParamTy))
       return false;   // Cannot transform this parameter value.
-    }
 
     if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1).
           hasAttributes(AttributeFuncs::
@@ -1063,21 +1057,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
       if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
         return false;
 
-      Type *CurElTy = cast<PointerType>(ActTy)->getElementType();
+      Type *CurElTy = ActTy->getPointerElementType();
       if (TD->getTypeAllocSize(CurElTy) !=
           TD->getTypeAllocSize(ParamPTy->getElementType()))
         return false;
     }
-
-    // Converting from one pointer type to another or between a pointer and an
-    // integer of the same size is safe even if we do not have a body.
-    bool isConvertible = ActTy == ParamTy ||
-      (TD && ((ParamTy->isPointerTy() ||
-      ParamTy == TD->getIntPtrType(Caller->getContext())) &&
-              (ActTy->isPointerTy() ||
-              ActTy == TD->getIntPtrType(Caller->getContext()))));
-    if (Callee->isDeclaration() && !isConvertible)
-      return false;
   }
 
   if (Callee->isDeclaration()) {
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 361acdd..72377dc 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1229,6 +1229,19 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
     }
   }
 
+  // (fptrunc (select cond, R1, Cst)) -->
+  // (select cond, (fptrunc R1), (fptrunc Cst))
+  SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0));
+  if (SI &&
+      (isa<ConstantFP>(SI->getOperand(1)) ||
+       isa<ConstantFP>(SI->getOperand(2)))) {
+    Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1),
+                                             CI.getType());
+    Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2),
+                                             CI.getType());
+    return SelectInst::Create(SI->getOperand(0), LHSTrunc, RHSTrunc);
+  }
+
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
   if (II) {
     switch (II->getIntrinsicID()) {
@@ -1249,9 +1262,14 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
   }
 
   // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x)
+  // Note that we restrict this transformation based on
+  // TLI->has(LibFunc::sqrtf), even for the sqrt intrinsic, because
+  // TLI->has(LibFunc::sqrtf) is sufficient to guarantee that the
+  // single-precision intrinsic can be expanded in the backend.
   CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0));
   if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) &&
-      Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) &&
+      (Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) ||
+       Call->getCalledFunction()->getIntrinsicID() == Intrinsic::sqrt) &&
       Call->getNumArgOperands() == 1 &&
       Call->hasOneUse()) {
     CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0));
@@ -1262,11 +1280,11 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
         Arg->getOperand(0)->getType()->isFloatTy()) {
       Function *Callee = Call->getCalledFunction();
       Module *M = CI.getParent()->getParent()->getParent();
-      Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf",
-                                                   Callee->getAttributes(),
-                                                   Builder->getFloatTy(),
-                                                   Builder->getFloatTy(),
-                                                   NULL);
+      Constant *SqrtfFunc = (Callee->getIntrinsicID() == Intrinsic::sqrt) ?
+        Intrinsic::getDeclaration(M, Intrinsic::sqrt, Builder->getFloatTy()) :
+        M->getOrInsertFunction("sqrtf", Callee->getAttributes(),
+                               Builder->getFloatTy(), Builder->getFloatTy(),
+                               NULL);
       CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0),
                                        "sqrtfcall");
       ret->setAttributes(Callee->getAttributes());
@@ -1338,14 +1356,18 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
   // If the source integer type is not the intptr_t type for this target, do a
   // trunc or zext to the intptr_t type, then inttoptr of it.  This allows the
   // cast to be exposed to other transforms.
-  if (TD && CI.getOperand(0)->getType()->getScalarSizeInBits() !=
-      TD->getPointerSizeInBits()) {
-    Type *Ty = TD->getIntPtrType(CI.getContext());
-    if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
-      Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
-
-    Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
-    return new IntToPtrInst(P, CI.getType());
+
+  if (TD) {
+    unsigned AS = CI.getAddressSpace();
+    if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
+        TD->getPointerSizeInBits(AS)) {
+      Type *Ty = TD->getIntPtrType(CI.getContext(), AS);
+      if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+        Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+
+      Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
+      return new IntToPtrInst(P, CI.getType());
+    }
   }
 
   if (Instruction *I = commonCastTransforms(CI))
@@ -1370,25 +1392,32 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
       return &CI;
     }
 
+    if (!TD)
+      return commonCastTransforms(CI);
+
     // If the GEP has a single use, and the base pointer is a bitcast, and the
     // GEP computes a constant offset, see if we can convert these three
     // instructions into fewer.  This typically happens with unions and other
     // non-type-safe code.
-    APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
-    if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) &&
+    unsigned AS = GEP->getPointerAddressSpace();
+    unsigned OffsetBits = TD->getPointerSizeInBits(AS);
+    APInt Offset(OffsetBits, 0);
+    BitCastInst *BCI = dyn_cast<BitCastInst>(GEP->getOperand(0));
+    if (GEP->hasOneUse() &&
+        BCI &&
         GEP->accumulateConstantOffset(*TD, Offset)) {
       // Get the base pointer input of the bitcast, and the type it points to.
-      Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
-      Type *GEPIdxTy =
-      cast<PointerType>(OrigBase->getType())->getElementType();
+      Value *OrigBase = BCI->getOperand(0);
       SmallVector<Value*, 8> NewIndices;
-      if (FindElementAtOffset(GEPIdxTy, Offset.getSExtValue(), NewIndices)) {
+      if (FindElementAtOffset(OrigBase->getType(),
+                              Offset.getSExtValue(),
+                              NewIndices)) {
         // If we were able to index down into an element, create the GEP
         // and bitcast the result.  This eliminates one bitcast, potentially
         // two.
         Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ?
-        Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
-        Builder->CreateGEP(OrigBase, NewIndices);
+          Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
+          Builder->CreateGEP(OrigBase, NewIndices);
         NGEP->takeName(GEP);
 
         if (isa<BitCastInst>(CI))
@@ -1406,16 +1435,22 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
   // If the destination integer type is not the intptr_t type for this target,
   // do a ptrtoint to intptr_t then do a trunc or zext.  This allows the cast
   // to be exposed to other transforms.
-  if (TD && CI.getType()->getScalarSizeInBits() != TD->getPointerSizeInBits()) {
-    Type *Ty = TD->getIntPtrType(CI.getContext());
-    if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
-      Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
 
-    Value *P = Builder->CreatePtrToInt(CI.getOperand(0), Ty);
-    return CastInst::CreateIntegerCast(P, CI.getType(), /*isSigned=*/false);
-  }
+  if (!TD)
+    return commonPointerCastTransforms(CI);
+
+  Type *Ty = CI.getType();
+  unsigned AS = CI.getPointerAddressSpace();
+
+  if (Ty->getScalarSizeInBits() == TD->getPointerSizeInBits(AS))
+    return commonPointerCastTransforms(CI);
 
-  return commonPointerCastTransforms(CI);
+  Type *PtrTy = TD->getIntPtrType(CI.getContext(), AS);
+  if (Ty->isVectorTy()) // Handle vectors of pointers.
+    PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements());
+
+  Value *P = Builder->CreatePtrToInt(CI.getOperand(0), PtrTy);
+  return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
 }
 
 /// OptimizeVectorResize - This input value (which is known to have vector type)
@@ -1488,12 +1523,17 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
 /// insertions into the vector.  See the example in the comment for
 /// OptimizeIntegerToVectorInsertions for the pattern this handles.
 /// The type of V is always a non-zero multiple of VecEltTy's size.
+/// Shift is the number of bits between the lsb of V and the lsb of
+/// the vector.
 ///
 /// This returns false if the pattern can't be matched or true if it can,
 /// filling in Elements with the elements found here.
-static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
+static bool CollectInsertionElements(Value *V, unsigned Shift,
                                      SmallVectorImpl<Value*> &Elements,
-                                     Type *VecEltTy) {
+                                     Type *VecEltTy, InstCombiner &IC) {
+  assert(isMultipleOfTypeSize(Shift, VecEltTy) &&
+         "Shift should be a multiple of the element type size");
+
   // Undef values never contribute useful bits to the result.
   if (isa<UndefValue>(V)) return true;
 
@@ -1505,8 +1545,12 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
       if (C->isNullValue())
         return true;
 
+    unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy);
+    if (IC.getDataLayout()->isBigEndian())
+      ElementIndex = Elements.size() - ElementIndex - 1;
+
     // Fail if multiple elements are inserted into this slot.
-    if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0)
+    if (Elements[ElementIndex] != 0)
       return false;
 
     Elements[ElementIndex] = V;
@@ -1522,7 +1566,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
     // it to the right type so it gets properly inserted.
     if (NumElts == 1)
       return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
-                                      ElementIndex, Elements, VecEltTy);
+                                      Shift, Elements, VecEltTy, IC);
 
     // Okay, this is a constant that covers multiple elements.  Slice it up into
     // pieces and insert each element-sized piece into the vector.
@@ -1533,10 +1577,11 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
     Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
 
     for (unsigned i = 0; i != NumElts; ++i) {
+      unsigned ShiftI = Shift+i*ElementSize;
       Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
-                                                               i*ElementSize));
+                                                                  ShiftI));
       Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
-      if (!CollectInsertionElements(Piece, ElementIndex+i, Elements, VecEltTy))
+      if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, IC))
         return false;
     }
     return true;
@@ -1549,29 +1594,28 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
   switch (I->getOpcode()) {
   default: return false; // Unhandled case.
   case Instruction::BitCast:
-    return CollectInsertionElements(I->getOperand(0), ElementIndex,
-                                    Elements, VecEltTy);
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC);
   case Instruction::ZExt:
     if (!isMultipleOfTypeSize(
                           I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
                               VecEltTy))
       return false;
-    return CollectInsertionElements(I->getOperand(0), ElementIndex,
-                                    Elements, VecEltTy);
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC);
   case Instruction::Or:
-    return CollectInsertionElements(I->getOperand(0), ElementIndex,
-                                    Elements, VecEltTy) &&
-           CollectInsertionElements(I->getOperand(1), ElementIndex,
-                                    Elements, VecEltTy);
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC) &&
+           CollectInsertionElements(I->getOperand(1), Shift,
+                                    Elements, VecEltTy, IC);
   case Instruction::Shl: {
     // Must be shifting by a constant that is a multiple of the element size.
     ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
     if (CI == 0) return false;
-    if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false;
-    unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy);
-
-    return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift,
-                                    Elements, VecEltTy);
+    Shift += CI->getZExtValue();
+    if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false;
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC);
   }
 
   }
@@ -1594,12 +1638,15 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
 /// Into two insertelements that do "buildvector{%inc, %inc5}".
 static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
                                                 InstCombiner &IC) {
+  // We need to know the target byte order to perform this optimization.
+  if (!IC.getDataLayout()) return 0;
+
   VectorType *DestVecTy = cast<VectorType>(CI.getType());
   Value *IntInput = CI.getOperand(0);
 
   SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
   if (!CollectInsertionElements(IntInput, 0, Elements,
-                                DestVecTy->getElementType()))
+                                DestVecTy->getElementType(), IC))
     return 0;
 
   // If we succeeded, we know that all of the element are specified by Elements
@@ -1785,10 +1832,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
     // Okay, we have (bitcast (shuffle ..)).  Check to see if this is
     // a bitcast to a vector with the same # elts.
     if (SVI->hasOneUse() && DestTy->isVectorTy() &&
-        cast<VectorType>(DestTy)->getNumElements() ==
-              SVI->getType()->getNumElements() &&
+        DestTy->getVectorNumElements() == SVI->getType()->getNumElements() &&
         SVI->getType()->getNumElements() ==
-          cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements()) {
+        SVI->getOperand(0)->getType()->getVectorNumElements()) {
       BitCastInst *Tmp;
       // If either of the operands is a cast from CI.getType(), then
       // evaluating the shuffle in the casted destination's type will allow
@@ -1810,3 +1856,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
     return commonPointerCastTransforms(CI);
   return commonCastTransforms(CI);
 }
+
+Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
+  return commonCastTransforms(CI);
+}
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c0225ae..9bb65ef 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -227,7 +227,8 @@ Instruction *InstCombiner::
 FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
                              CmpInst &ICI, ConstantInt *AndCst) {
   // We need TD information to know the pointer size unless this is inbounds.
-  if (!GEP->isInBounds() && TD == 0) return 0;
+  if (!GEP->isInBounds() && TD == 0)
+    return 0;
 
   Constant *Init = GV->getInitializer();
   if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
@@ -393,9 +394,12 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
   // If the index is larger than the pointer size of the target, truncate the
   // index down like the GEP would do implicitly.  We don't have to do this for
   // an inbounds GEP because the index can't be out of range.
-  if (!GEP->isInBounds() &&
-      Idx->getType()->getPrimitiveSizeInBits() > TD->getPointerSizeInBits())
-    Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext()));
+  if (!GEP->isInBounds()) {
+    Type *IntPtrTy = TD->getIntPtrType(GEP->getType());
+    unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
+    if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize)
+      Idx = Builder->CreateTrunc(Idx, IntPtrTy);
+  }
 
   // If the comparison is only true for one or two elements, emit direct
   // comparisons.
@@ -562,16 +566,18 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
     }
   }
 
+
+
   // Okay, we know we have a single variable index, which must be a
   // pointer/array/vector index.  If there is no offset, life is simple, return
   // the index.
-  unsigned IntPtrWidth = TD.getPointerSizeInBits();
+  Type *IntPtrTy = TD.getIntPtrType(GEP->getOperand(0)->getType());
+  unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
   if (Offset == 0) {
     // Cast to intptrty in case a truncation occurs.  If an extension is needed,
     // we don't need to bother extending: the extension won't affect where the
     // computation crosses zero.
     if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
-      Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
       VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy);
     }
     return VariableIdx;
@@ -593,7 +599,6 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
     return 0;
 
   // Okay, we can do this evaluation.  Start by converting the index to intptr.
-  Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
   if (VariableIdx->getType() != IntPtrTy)
     VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy,
                                             true /*Signed*/);
@@ -737,10 +742,9 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
 }
 
 /// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X".
-Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
+Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI,
                                             Value *X, ConstantInt *CI,
-                                            ICmpInst::Predicate Pred,
-                                            Value *TheAdd) {
+                                            ICmpInst::Predicate Pred) {
   // If we have X+0, exit early (simplifying logic below) and let it get folded
   // elsewhere.   icmp X+0, X  -> icmp X, X
   if (CI->isZero()) {
@@ -1194,11 +1198,16 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
       Type *AndTy = AndCST->getType();          // Type of the and.
 
       // We can fold this as long as we can't shift unknown bits
-      // into the mask.  This can only happen with signed shift
-      // rights, as they sign-extend.
+      // into the mask. This can happen with signed shift
+      // rights, as they sign-extend. With logical shifts,
+      // we must still make sure the comparison is not signed
+      // because we are effectively changing the
+      // position of the sign bit (PR17827).
+      // TODO: We can relax these constraints a bit more.
       if (ShAmt) {
-        bool CanFold = Shift->isLogicalShift();
-        if (!CanFold) {
+        bool CanFold = false;
+        unsigned ShiftOpcode = Shift->getOpcode();
+        if (ShiftOpcode == Instruction::AShr) {
           // To test for the bad case of the signed shr, see if any
           // of the bits shifted in could be tested after the mask.
           uint32_t TyBits = Ty->getPrimitiveSizeInBits();
@@ -1208,6 +1217,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
           if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
                AndCST->getValue()) == 0)
             CanFold = true;
+        } else if (ShiftOpcode == Instruction::Shl ||
+                   ShiftOpcode == Instruction::LShr) {
+          CanFold = !ICI.isSigned();
         }
 
         if (CanFold) {
@@ -1781,8 +1793,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
   // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
   // integer type is the same size as the pointer type.
   if (TD && LHSCI->getOpcode() == Instruction::PtrToInt &&
-      TD->getPointerSizeInBits() ==
-         cast<IntegerType>(DestTy)->getBitWidth()) {
+      TD->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
     Value *RHSOp = 0;
     if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) {
       RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
@@ -2035,14 +2046,59 @@ static APInt DemandedBitsLHSMask(ICmpInst &I,
 
 }
 
+/// \brief Check if the order of \p Op0 and \p Op1 as operand in an ICmpInst
+/// should be swapped.
+/// The descision is based on how many times these two operands are reused
+/// as subtract operands and their positions in those instructions.
+/// The rational is that several architectures use the same instruction for
+/// both subtract and cmp, thus it is better if the order of those operands
+/// match.
+/// \return true if Op0 and Op1 should be swapped.
+static bool swapMayExposeCSEOpportunities(const Value * Op0,
+                                          const Value * Op1) {
+  // Filter out pointer value as those cannot appears directly in subtract.
+  // FIXME: we may want to go through inttoptrs or bitcasts.
+  if (Op0->getType()->isPointerTy())
+    return false;
+  // Count every uses of both Op0 and Op1 in a subtract.
+  // Each time Op0 is the first operand, count -1: swapping is bad, the
+  // subtract has already the same layout as the compare.
+  // Each time Op0 is the second operand, count +1: swapping is good, the
+  // subtract has a diffrent layout as the compare.
+  // At the end, if the benefit is greater than 0, Op0 should come second to
+  // expose more CSE opportunities.
+  int GlobalSwapBenefits = 0;
+  for (Value::const_use_iterator UI = Op0->use_begin(), UIEnd = Op0->use_end(); UI != UIEnd; ++UI) {
+    const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(*UI);
+    if (!BinOp || BinOp->getOpcode() != Instruction::Sub)
+      continue;
+    // If Op0 is the first argument, this is not beneficial to swap the
+    // arguments.
+    int LocalSwapBenefits = -1;
+    unsigned Op1Idx = 1;
+    if (BinOp->getOperand(Op1Idx) == Op0) {
+      Op1Idx = 0;
+      LocalSwapBenefits = 1;
+    }
+    if (BinOp->getOperand(Op1Idx) != Op1)
+      continue;
+    GlobalSwapBenefits += LocalSwapBenefits;
+  }
+  return GlobalSwapBenefits > 0;
+}
+
 Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
   bool Changed = false;
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+  unsigned Op0Cplxity = getComplexity(Op0);
+  unsigned Op1Cplxity = getComplexity(Op1);
 
   /// Orders the operands of the compare so that they are listed from most
   /// complex to least complex.  This puts constants before unary operators,
   /// before binary operators.
-  if (getComplexity(Op0) < getComplexity(Op1)) {
+  if (Op0Cplxity < Op1Cplxity ||
+        (Op0Cplxity == Op1Cplxity &&
+         swapMayExposeCSEOpportunities(Op0, Op1))) {
     I.swapOperands();
     std::swap(Op0, Op1);
     Changed = true;
@@ -2477,7 +2533,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
       case Instruction::IntToPtr:
         // icmp pred inttoptr(X), null -> icmp pred X, 0
         if (RHSC->isNullValue() && TD &&
-            TD->getIntPtrType(RHSC->getContext()) ==
+            TD->getIntPtrType(RHSC->getType()) ==
                LHSI->getOperand(0)->getType())
           return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
                         Constant::getNullValue(LHSI->getOperand(0)->getType()));
@@ -2900,6 +2956,24 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
                             Builder->CreateTrunc(B, A->getType()));
     }
 
+    // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
+    // For lshr and ashr pairs.
+    if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+         match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
+        (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+         match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
+      unsigned TypeBits = Cst1->getBitWidth();
+      unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+      if (ShAmt < TypeBits && ShAmt != 0) {
+        ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE
+                                       ? ICmpInst::ICMP_UGE
+                                       : ICmpInst::ICMP_ULT;
+        Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
+        APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
+        return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal));
+      }
+    }
+
     // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
     // "icmp (and X, mask), cst"
     uint64_t ShAmt = 0;
@@ -2930,20 +3004,15 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
     Value *X; ConstantInt *Cst;
     // icmp X+Cst, X
     if (match(Op0, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op1 == X)
-      return FoldICmpAddOpCst(I, X, Cst, I.getPredicate(), Op0);
+      return FoldICmpAddOpCst(I, X, Cst, I.getPredicate());
 
     // icmp X, X+Cst
     if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X)
-      return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate(), Op1);
+      return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate());
   }
   return Changed ? &I : 0;
 }
 
-
-
-
-
-
 /// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible.
 ///
 Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index e2d7966..4c861b3 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -154,7 +154,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
   // Ensure that the alloca array size argument has type intptr_t, so that
   // any casting is exposed early.
   if (TD) {
-    Type *IntPtrTy = TD->getIntPtrType(AI.getContext());
+    Type *IntPtrTy = TD->getIntPtrType(AI.getType());
     if (AI.getArraySize()->getType() != IntPtrTy) {
       Value *V = Builder->CreateIntCast(AI.getArraySize(),
                                         IntPtrTy, false);
@@ -180,12 +180,13 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
       // Now that I is pointing to the first non-allocation-inst in the block,
       // insert our getelementptr instruction...
       //
-      Value *NullIdx =Constant::getNullValue(Type::getInt32Ty(AI.getContext()));
-      Value *Idx[2];
-      Idx[0] = NullIdx;
-      Idx[1] = NullIdx;
+      Type *IdxTy = TD
+                  ? TD->getIntPtrType(AI.getType())
+                  : Type::getInt64Ty(AI.getContext());
+      Value *NullIdx = Constant::getNullValue(IdxTy);
+      Value *Idx[2] = { NullIdx, NullIdx };
       Instruction *GEP =
-           GetElementPtrInst::CreateInBounds(New, Idx, New->getName()+".sub");
+        GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub");
       InsertNewInstBefore(GEP, *It);
 
       // Now make everything use the getelementptr instead of the original
@@ -262,9 +263,9 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
         for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
           EraseInstFromFunction(*ToDelete[i]);
         Constant *TheSrc = cast<Constant>(Copy->getSource());
-        Instruction *NewI
-          = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc,
-                                                             AI.getType()));
+        Constant *Cast
+          = ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType());
+        Instruction *NewI = ReplaceInstUsesWith(AI, Cast);
         EraseInstFromFunction(*Copy);
         ++NumGlobalCopies;
         return NewI;
@@ -302,9 +303,11 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
       if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
         if (Constant *CSrc = dyn_cast<Constant>(CastOp))
           if (ASrcTy->getNumElements() != 0) {
-            Value *Idxs[2];
-            Idxs[0] = Constant::getNullValue(Type::getInt32Ty(LI.getContext()));
-            Idxs[1] = Idxs[0];
+            Type *IdxTy = TD
+                        ? TD->getIntPtrType(SrcTy)
+                        : Type::getInt64Ty(SrcTy->getContext());
+            Value *Idx = Constant::getNullValue(IdxTy);
+            Value *Idxs[2] = { Idx, Idx };
             CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs);
             SrcTy = cast<PointerType>(CastOp->getType());
             SrcPTy = SrcTy->getElementType();
@@ -315,7 +318,8 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
             SrcPTy->isVectorTy()) &&
           // Do not allow turning this into a load of an integer, which is then
           // casted to a pointer, this pessimizes pointer analysis a lot.
-          (SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) &&
+          (SrcPTy->isPtrOrPtrVectorTy() ==
+           LI.getType()->isPtrOrPtrVectorTy()) &&
           IC.getDataLayout()->getTypeSizeInBits(SrcPTy) ==
                IC.getDataLayout()->getTypeSizeInBits(DestPTy)) {
 
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index cc6a301..a759548 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -374,9 +374,12 @@ Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C,
   } else {
     if (C0) {
       // (C0 / X) * C => (C0 * C) / X
-      ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C));
-      if (isNormalFp(F))
-        R = BinaryOperator::CreateFDiv(F, Opnd1);
+      if (FMulOrDiv->hasOneUse()) {
+        // It would otherwise introduce another div.
+        ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C));
+        if (isNormalFp(F))
+          R = BinaryOperator::CreateFDiv(F, Opnd1);
+      }
     } else {
       // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal
       ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFDiv(C, C1));
@@ -460,10 +463,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
             if (Swap && FAddSub->getOpcode() == Instruction::FSub)
               std::swap(M0, M1);
 
-            Value *R = (FAddSub->getOpcode() == Instruction::FAdd) ?
-                        BinaryOperator::CreateFAdd(M0, M1) :
-                        BinaryOperator::CreateFSub(M0, M1);
-            Instruction *RI = cast<Instruction>(R);
+            Instruction *RI = (FAddSub->getOpcode() == Instruction::FAdd)
+                                  ? BinaryOperator::CreateFAdd(M0, M1)
+                                  : BinaryOperator::CreateFSub(M0, M1);
             RI->copyFastMathFlags(&I);
             return RI;
           }
@@ -490,13 +492,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
     }
     // if pattern detected emit alternate sequence
     if (OpX && OpY) {
+      BuilderTy::FastMathFlagGuard Guard(*Builder);
+      Builder->SetFastMathFlags(Log2->getFastMathFlags());
       Log2->setArgOperand(0, OpY);
       Value *FMulVal = Builder->CreateFMul(OpX, Log2);
-      Instruction *FMul = cast<Instruction>(FMulVal);
-      FMul->copyFastMathFlags(Log2);
-      Instruction *FSub = BinaryOperator::CreateFSub(FMulVal, OpX);
-      FSub->copyFastMathFlags(Log2);
-      return FSub;
+      Value *FSub = Builder->CreateFSub(FMulVal, OpX);
+      FSub->takeName(&I);
+      return ReplaceInstUsesWith(I, FSub);
     }
   }
 
@@ -506,6 +508,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
   for (int i = 0; i < 2; i++) {
     bool IgnoreZeroSign = I.hasNoSignedZeros();
     if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) {
+      BuilderTy::FastMathFlagGuard Guard(*Builder);
+      Builder->SetFastMathFlags(I.getFastMathFlags());
+
       Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign);
       Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign);
 
@@ -516,13 +521,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
       if (Opnd0->hasOneUse()) {
         // -X * Y => -(X*Y) (Promote negation as high as possible)
         Value *T = Builder->CreateFMul(N0, Opnd1);
-        cast<Instruction>(T)->setDebugLoc(I.getDebugLoc());
-        Instruction *Neg = BinaryOperator::CreateFNeg(T);
-        if (I.getFastMathFlags().any()) {
-          cast<Instruction>(T)->copyFastMathFlags(&I);
-          Neg->copyFastMathFlags(&I);
-        }
-        return Neg;
+        Value *Neg = Builder->CreateFNeg(T);
+        Neg->takeName(&I);
+        return ReplaceInstUsesWith(I, Neg);
       }
     }
 
@@ -545,13 +546,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
           Y = Opnd0_0;
 
         if (Y) {
-          Instruction *T = cast<Instruction>(Builder->CreateFMul(Opnd1, Opnd1));
-          T->copyFastMathFlags(&I);
-          T->setDebugLoc(I.getDebugLoc());
+          BuilderTy::FastMathFlagGuard Guard(*Builder);
+          Builder->SetFastMathFlags(I.getFastMathFlags());
+          Value *T = Builder->CreateFMul(Opnd1, Opnd1);
 
-          Instruction *R = BinaryOperator::CreateFMul(T, Y);
-          R->copyFastMathFlags(&I);
-          return R;
+          Value *R = Builder->CreateFMul(T, Y);
+          R->takeName(&I);
+          return ReplaceInstUsesWith(I, R);
         }
       }
     }
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index bd14e81..4c6d0c4 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -604,8 +604,6 @@ namespace llvm {
              LHS.Width == RHS.Width;
     }
   };
-  template <>
-  struct isPodLike<LoweredPHIRecord> { static const bool value = true; };
 }
 
 
@@ -688,10 +686,10 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
   // extracted out of it.  First, sort the users by their offset and size.
   array_pod_sort(PHIUsers.begin(), PHIUsers.end());
 
-  DEBUG(errs() << "SLICING UP PHI: " << FirstPhi << '\n';
-            for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
-              errs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] <<'\n';
-        );
+  DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
+        for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+          dbgs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';
+    );
 
   // PredValues - This is a temporary used when rewriting PHI nodes.  It is
   // hoisted out here to avoid construction/destruction thrashing.
@@ -772,7 +770,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
       }
       PredValues.clear();
 
-      DEBUG(errs() << "  Made element PHI for offset " << Offset << ": "
+      DEBUG(dbgs() << "  Made element PHI for offset " << Offset << ": "
                    << *EltPHI << '\n');
       ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
     }
@@ -792,7 +790,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
 // PHINode simplification
 //
 Instruction *InstCombiner::visitPHINode(PHINode &PN) {
-  if (Value *V = SimplifyInstruction(&PN, TD))
+  if (Value *V = SimplifyInstruction(&PN, TD, TLI))
     return ReplaceInstUsesWith(PN, V);
 
   // If all PHI operands are the same operation, pull them through the PHI,
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 7581dbe..283bec2 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -367,7 +367,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
                                   Value *FalseVal,
                                   InstCombiner::BuilderTy *Builder) {
   const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
-  if (!IC || !IC->isEquality())
+  if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy())
     return 0;
 
   Value *CmpLHS = IC->getOperand(0);
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index a7bfe09..c831ddd 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -808,7 +808,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         // TODO: Could compute known zero/one bits based on the input.
         break;
       }
-      case Intrinsic::x86_sse42_crc32_64_8:
       case Intrinsic::x86_sse42_crc32_64_64:
         KnownZero = APInt::getHighBitsSet(64, 32);
         return 0;
@@ -845,21 +844,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
 Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
   Instruction *Shl, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne) {
 
-  unsigned ShlAmt = cast<ConstantInt>(Shl->getOperand(1))->getZExtValue();
-  unsigned ShrAmt = cast<ConstantInt>(Shr->getOperand(1))->getZExtValue();
+  const APInt &ShlOp1 = cast<ConstantInt>(Shl->getOperand(1))->getValue();
+  const APInt &ShrOp1 = cast<ConstantInt>(Shr->getOperand(1))->getValue();
+  if (!ShlOp1 || !ShrOp1)
+      return 0; // Noop.
+
+  Value *VarX = Shr->getOperand(0);
+  Type *Ty = VarX->getType();
+  unsigned BitWidth = Ty->getIntegerBitWidth();
+  if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
+    return 0; // Undef.
+
+  unsigned ShlAmt = ShlOp1.getZExtValue();
+  unsigned ShrAmt = ShrOp1.getZExtValue();
 
   KnownOne.clearAllBits();
   KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1);
   KnownZero &= DemandedMask;
 
-  if (ShlAmt == 0 || ShrAmt == 0)
-    return 0;
-
-  Value *VarX = Shr->getOperand(0);
-  Type *Ty = VarX->getType();
-
-  APInt BitMask1(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
-  APInt BitMask2(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
+  APInt BitMask1(APInt::getAllOnesValue(BitWidth));
+  APInt BitMask2(APInt::getAllOnesValue(BitWidth));
 
   bool isLshr = (Shr->getOpcode() == Instruction::LShr);
   BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index f3de6e2..1e72410 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -106,8 +106,8 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) {
 }
 
 // If we have a PHI node with a vector type that has only 2 uses: feed
-// itself and be an operand of extractelemnt at a constant location,
-// try to replace the PHI of the vector type with a PHI of a scalar type
+// itself and be an operand of extractelement at a constant location,
+// try to replace the PHI of the vector type with a PHI of a scalar type.
 Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
   // Verify that the PHI node has exactly 2 uses. Otherwise return NULL.
   if (!PN->hasNUses(2))
@@ -282,6 +282,38 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
         Worklist.AddValue(EE);
         return CastInst::Create(CI->getOpcode(), EE, EI.getType());
       }
+    } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+      if (SI->hasOneUse()) {
+        // TODO: For a select on vectors, it might be useful to do this if it
+        // has multiple extractelement uses. For vector select, that seems to
+        // fight the vectorizer.
+
+        // If we are extracting an element from a vector select or a select on
+        // vectors, a select on the scalars extracted from the vector arguments.
+        Value *TrueVal = SI->getTrueValue();
+        Value *FalseVal = SI->getFalseValue();
+
+        Value *Cond = SI->getCondition();
+        if (Cond->getType()->isVectorTy()) {
+          Cond = Builder->CreateExtractElement(Cond,
+                                               EI.getIndexOperand(),
+                                               Cond->getName() + ".elt");
+        }
+
+        Value *V1Elem
+          = Builder->CreateExtractElement(TrueVal,
+                                          EI.getIndexOperand(),
+                                          TrueVal->getName() + ".elt");
+
+        Value *V2Elem
+          = Builder->CreateExtractElement(FalseVal,
+                                          EI.getIndexOperand(),
+                                          FalseVal->getName() + ".elt");
+        return SelectInst::Create(Cond,
+                                  V1Elem,
+                                  V2Elem,
+                                  SI->getName() + ".elt");
+      }
     }
   }
   return 0;
@@ -294,7 +326,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
                                          SmallVectorImpl<Constant*> &Mask) {
   assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() &&
          "Invalid CollectSingleShuffleElements");
-  unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+  unsigned NumElts = V->getType()->getVectorNumElements();
 
   if (isa<UndefValue>(V)) {
     Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h
index 19959c0..f84db27 100644
--- a/lib/Transforms/InstCombine/InstCombineWorklist.h
+++ b/lib/Transforms/InstCombine/InstCombineWorklist.h
@@ -37,7 +37,7 @@ public:
   /// in it.
   void Add(Instruction *I) {
     if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) {
-      DEBUG(errs() << "IC: ADD: " << *I << '\n');
+      DEBUG(dbgs() << "IC: ADD: " << *I << '\n');
       Worklist.push_back(I);
     }
   }
@@ -54,7 +54,7 @@ public:
     assert(Worklist.empty() && "Worklist must be empty to add initial group");
     Worklist.reserve(NumEntries+16);
     WorklistMap.resize(NumEntries);
-    DEBUG(errs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n");
+    DEBUG(dbgs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n");
     for (unsigned Idx = 0; NumEntries; --NumEntries) {
       Instruction *I = List[NumEntries-1];
       WorklistMap.insert(std::make_pair(I, Idx++));
@@ -74,8 +74,7 @@ public:
   }
 
   Instruction *RemoveOne() {
-    Instruction *I = Worklist.back();
-    Worklist.pop_back();
+    Instruction *I = Worklist.pop_back_val();
     WorklistMap.erase(I);
     return I;
   }
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index b34ae21..191a101 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -699,7 +699,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
       Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
       Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
       Value *InV = 0;
-      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+      // Beware of ConstantExpr:  it may eventually evaluate to getNullValue,
+      // even if currently isNullValue gives false.
+      Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
+      if (InC && !isa<ConstantExpr>(InC))
         InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
       else
         InV = Builder->CreateSelect(PN->getIncomingValue(i),
@@ -755,19 +758,25 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
   return ReplaceInstUsesWith(I, NewPN);
 }
 
-/// FindElementAtOffset - Given a type and a constant offset, determine whether
-/// or not there is a sequence of GEP indices into the type that will land us at
-/// the specified offset.  If so, fill them into NewIndices and return the
-/// resultant element type, otherwise return null.
-Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset,
-                                          SmallVectorImpl<Value*> &NewIndices) {
-  if (!TD) return 0;
-  if (!Ty->isSized()) return 0;
+/// FindElementAtOffset - Given a pointer type and a constant offset, determine
+/// whether or not there is a sequence of GEP indices into the pointed type that
+/// will land us at the specified offset.  If so, fill them into NewIndices and
+/// return the resultant element type, otherwise return null.
+Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset,
+                                        SmallVectorImpl<Value*> &NewIndices) {
+  assert(PtrTy->isPtrOrPtrVectorTy());
+
+  if (!TD)
+    return 0;
+
+  Type *Ty = PtrTy->getPointerElementType();
+  if (!Ty->isSized())
+    return 0;
 
   // Start with the index over the outer type.  Note that the type size
   // might be zero (even if the offset isn't zero) if the indexed type
   // is something like [0 x {int, int}]
-  Type *IntPtrTy = TD->getIntPtrType(Ty->getContext());
+  Type *IntPtrTy = TD->getIntPtrType(PtrTy);
   int64_t FirstIdx = 0;
   if (int64_t TySize = TD->getTypeAllocSize(Ty)) {
     FirstIdx = Offset/TySize;
@@ -1176,6 +1185,22 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
         GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName());
   }
 
+  // Canonicalize (gep i8* X, -(ptrtoint Y)) to (sub (ptrtoint X), (ptrtoint Y))
+  // The GEP pattern is emitted by the SCEV expander for certain kinds of
+  // pointer arithmetic.
+  if (TD && GEP.getNumIndices() == 1 &&
+      match(GEP.getOperand(1), m_Neg(m_PtrToInt(m_Value())))) {
+    unsigned AS = GEP.getPointerAddressSpace();
+    if (GEP.getType() == Builder->getInt8PtrTy(AS) &&
+        GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
+        TD->getPointerSizeInBits(AS)) {
+      Operator *Index = cast<Operator>(GEP.getOperand(1));
+      Value *PtrToInt = Builder->CreatePtrToInt(PtrOp, Index->getType());
+      Value *NewSub = Builder->CreateSub(PtrToInt, Index->getOperand(1));
+      return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType());
+    }
+  }
+
   // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
   Value *StrippedPtr = PtrOp->stripPointerCasts();
   PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType());
@@ -1231,13 +1256,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
       // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
       // into:  %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
       Type *SrcElTy = StrippedPtrTy->getElementType();
-      Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType();
+      Type *ResElTy = PtrOp->getType()->getPointerElementType();
       if (TD && SrcElTy->isArrayTy() &&
-          TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) ==
+          TD->getTypeAllocSize(SrcElTy->getArrayElementType()) ==
           TD->getTypeAllocSize(ResElTy)) {
-        Value *Idx[2];
-        Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
-        Idx[1] = GEP.getOperand(1);
+        Type *IdxType = TD->getIntPtrType(GEP.getType());
+        Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
         Value *NewGEP = GEP.isInBounds() ?
           Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) :
           Builder->CreateGEP(StrippedPtr, Idx, GEP.getName());
@@ -1261,7 +1285,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
 
           // Earlier transforms ensure that the index has type IntPtrType, which
           // considerably simplifies the logic by eliminating implicit casts.
-          assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) &&
+          assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) &&
                  "Index not cast to pointer width?");
 
           bool NSW;
@@ -1287,8 +1311,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
         // Check that changing to the array element type amounts to dividing the
         // index by a scale factor.
         uint64_t ResSize = TD->getTypeAllocSize(ResElTy);
-        uint64_t ArrayEltSize =
-          TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType());
+        uint64_t ArrayEltSize
+          = TD->getTypeAllocSize(SrcElTy->getArrayElementType());
         if (ResSize && ArrayEltSize % ResSize == 0) {
           Value *Idx = GEP.getOperand(1);
           unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
@@ -1296,7 +1320,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
 
           // Earlier transforms ensure that the index has type IntPtrType, which
           // considerably simplifies the logic by eliminating implicit casts.
-          assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) &&
+          assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) &&
                  "Index not cast to pointer width?");
 
           bool NSW;
@@ -1304,9 +1328,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
             // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
             // If the multiplication NewIdx * Scale may overflow then the new
             // GEP may not be "inbounds".
-            Value *Off[2];
-            Off[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
-            Off[1] = NewIdx;
+            Value *Off[2] = {
+              Constant::getNullValue(TD->getIntPtrType(GEP.getType())),
+              NewIdx
+            };
+
             Value *NewGEP = GEP.isInBounds() && NSW ?
               Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) :
               Builder->CreateGEP(StrippedPtr, Off, GEP.getName());
@@ -1318,15 +1344,20 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     }
   }
 
+  if (!TD)
+    return 0;
+
   /// See if we can simplify:
   ///   X = bitcast A* to B*
   ///   Y = gep X, <...constant indices...>
   /// into a gep of the original struct.  This is important for SROA and alias
   /// analysis of unions.  If "A" is also a bitcast, wait for A/X to be merged.
   if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
-    APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
-    if (TD &&
-        !isa<BitCastInst>(BCI->getOperand(0)) &&
+    Value *Operand = BCI->getOperand(0);
+    PointerType *OpType = cast<PointerType>(Operand->getType());
+    unsigned OffsetBits = TD->getPointerTypeSizeInBits(OpType);
+    APInt Offset(OffsetBits, 0);
+    if (!isa<BitCastInst>(Operand) &&
         GEP.accumulateConstantOffset(*TD, Offset) &&
         StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) {
 
@@ -1335,8 +1366,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
       if (!Offset) {
         // If the bitcast is of an allocation, and the allocation will be
         // converted to match the type of the cast, don't touch this.
-        if (isa<AllocaInst>(BCI->getOperand(0)) ||
-            isAllocationFn(BCI->getOperand(0), TLI)) {
+        if (isa<AllocaInst>(Operand) || isAllocationFn(Operand, TLI)) {
           // See if the bitcast simplifies, if so, don't nuke this GEP yet.
           if (Instruction *I = visitBitCast(*BCI)) {
             if (I != BCI) {
@@ -1347,19 +1377,17 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
             return &GEP;
           }
         }
-        return new BitCastInst(BCI->getOperand(0), GEP.getType());
+        return new BitCastInst(Operand, GEP.getType());
       }
 
       // Otherwise, if the offset is non-zero, we need to find out if there is a
       // field at Offset in 'A's type.  If so, we can pull the cast through the
       // GEP.
       SmallVector<Value*, 8> NewIndices;
-      Type *InTy =
-        cast<PointerType>(BCI->getOperand(0)->getType())->getElementType();
-      if (FindElementAtOffset(InTy, Offset.getSExtValue(), NewIndices)) {
+      if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) {
         Value *NGEP = GEP.isInBounds() ?
-          Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) :
-          Builder->CreateGEP(BCI->getOperand(0), NewIndices);
+          Builder->CreateInBoundsGEP(Operand, NewIndices) :
+          Builder->CreateGEP(Operand, NewIndices);
 
         if (NGEP->getType() == GEP.getType())
           return ReplaceInstUsesWith(GEP, NGEP);
@@ -1372,8 +1400,6 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
   return 0;
 }
 
-
-
 static bool
 isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
                      const TargetLibraryInfo *TLI) {
@@ -2209,7 +2235,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
       // DCE instruction if trivially dead.
       if (isInstructionTriviallyDead(Inst, TLI)) {
         ++NumDeadInst;
-        DEBUG(errs() << "IC: DCE: " << *Inst << '\n');
+        DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
         Inst->eraseFromParent();
         continue;
       }
@@ -2217,7 +2243,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
       // ConstantProp instruction if trivially constant.
       if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0)))
         if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) {
-          DEBUG(errs() << "IC: ConstFold to: " << *C << " from: "
+          DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: "
                        << *Inst << '\n');
           Inst->replaceAllUsesWith(C);
           ++NumConstProp;
@@ -2293,7 +2319,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
 bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
   MadeIRChange = false;
 
-  DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+  DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
                << F.getName() << "\n");
 
   {
@@ -2338,7 +2364,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
 
     // Check to see if we can DCE the instruction.
     if (isInstructionTriviallyDead(I, TLI)) {
-      DEBUG(errs() << "IC: DCE: " << *I << '\n');
+      DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
       EraseInstFromFunction(*I);
       ++NumDeadInst;
       MadeIRChange = true;
@@ -2348,7 +2374,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
     // Instruction isn't dead, see if we can constant propagate it.
     if (!I->use_empty() && isa<Constant>(I->getOperand(0)))
       if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) {
-        DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
+        DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
 
         // Add operands to the worklist.
         ReplaceInstUsesWith(*I, C);
@@ -2396,13 +2422,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
     std::string OrigI;
 #endif
     DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
-    DEBUG(errs() << "IC: Visiting: " << OrigI << '\n');
+    DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
 
     if (Instruction *Result = visit(*I)) {
       ++NumCombined;
       // Should we replace the old instruction with a new one?
       if (Result != I) {
-        DEBUG(errs() << "IC: Old = " << *I << '\n'
+        DEBUG(dbgs() << "IC: Old = " << *I << '\n'
                      << "    New = " << *Result << '\n');
 
         if (!I->getDebugLoc().isUnknown())
@@ -2431,7 +2457,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
         EraseInstFromFunction(*I);
       } else {
 #ifndef NDEBUG
-        DEBUG(errs() << "IC: Mod = " << OrigI << '\n'
+        DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
                      << "    New = " << *I << '\n');
 #endif
 
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index d77e20b..d731ec5 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/DIBuilder.h"
@@ -59,6 +60,7 @@ static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000;  // < 2G.
 static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
 static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa8000;
 
+static const size_t kMinStackMallocSize = 1 << 6;  // 64B
 static const size_t kMaxStackMallocSize = 1 << 16;  // 64K
 static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
 static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
@@ -75,21 +77,30 @@ static const char *const kAsanUnregisterGlobalsName =
 static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
 static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
 static const char *const kAsanInitName = "__asan_init_v3";
+static const char *const kAsanCovName = "__sanitizer_cov";
 static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
 static const char *const kAsanMappingOffsetName = "__asan_mapping_offset";
 static const char *const kAsanMappingScaleName = "__asan_mapping_scale";
-static const char *const kAsanStackMallocName = "__asan_stack_malloc";
-static const char *const kAsanStackFreeName = "__asan_stack_free";
+static const int         kMaxAsanStackMallocSizeClass = 10;
+static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
+static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
 static const char *const kAsanGenPrefix = "__asan_gen_";
 static const char *const kAsanPoisonStackMemoryName =
     "__asan_poison_stack_memory";
 static const char *const kAsanUnpoisonStackMemoryName =
     "__asan_unpoison_stack_memory";
 
+static const char *const kAsanOptionDetectUAR =
+    "__asan_option_detect_stack_use_after_return";
+
+// These constants must match the definitions in the run-time library.
 static const int kAsanStackLeftRedzoneMagic = 0xf1;
 static const int kAsanStackMidRedzoneMagic = 0xf2;
 static const int kAsanStackRightRedzoneMagic = 0xf3;
 static const int kAsanStackPartialRedzoneMagic = 0xf4;
+#ifndef NDEBUG
+static const int kAsanStackAfterReturnMagic = 0xf5;
+#endif
 
 // Accesses sizes are powers of two: 1, 2, 4, 8, 16.
 static const size_t kNumberOfAccessSizes = 5;
@@ -124,6 +135,8 @@ static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
 // This flag may need to be replaced with -f[no]asan-globals.
 static cl::opt<bool> ClGlobals("asan-globals",
        cl::desc("Handle global objects"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClCoverage("asan-coverage",
+       cl::desc("ASan coverage"), cl::Hidden, cl::init(false));
 static cl::opt<bool> ClInitializers("asan-initialization-order",
        cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false));
 static cl::opt<bool> ClMemIntrin("asan-memintrin",
@@ -184,6 +197,13 @@ static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
 static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"),
                                cl::Hidden, cl::init(-1));
 
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOptimizedAccessesToGlobalArray,
+          "Number of optimized accesses to global arrays");
+STATISTIC(NumOptimizedAccessesToGlobalVar,
+          "Number of optimized accesses to global vars");
+
 namespace {
 /// A set of dynamically initialized globals extracted from metadata.
 class SetOfDynamicallyInitializedGlobals {
@@ -306,6 +326,8 @@ struct AddressSanitizer : public FunctionPass {
   bool ShouldInstrumentGlobal(GlobalVariable *G);
   bool LooksLikeCodeInBug11395(Instruction *I);
   void FindDynamicInitializers(Module &M);
+  bool GlobalIsLinkerInitialized(GlobalVariable *G);
+  bool InjectCoverage(Function &F);
 
   bool CheckInitOrder;
   bool CheckUseAfterReturn;
@@ -321,6 +343,7 @@ struct AddressSanitizer : public FunctionPass {
   Function *AsanCtorFunction;
   Function *AsanInitFunction;
   Function *AsanHandleNoReturnFunc;
+  Function *AsanCovFunction;
   OwningPtr<SpecialCaseList> BL;
   // This array is indexed by AccessIsWrite and log2(AccessSize).
   Function *AsanErrorCallback[2][kNumberOfAccessSizes];
@@ -396,12 +419,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
   uint64_t TotalStackSize;
   unsigned StackAlignment;
 
-  Function *AsanStackMallocFunc, *AsanStackFreeFunc;
+  Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
+           *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
   Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc;
 
   // Stores a place and arguments of poisoning/unpoisoning call for alloca.
   struct AllocaPoisonCall {
     IntrinsicInst *InsBefore;
+    AllocaInst *AI;
     uint64_t Size;
     bool DoPoison;
   };
@@ -480,7 +505,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
     AllocaInst *AI = findAllocaForValue(II.getArgOperand(1));
     if (!AI) return;
     bool DoPoison = (ID == Intrinsic::lifetime_end);
-    AllocaPoisonCall APC = {&II, SizeValue, DoPoison};
+    AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
     AllocaPoisonCallVec.push_back(APC);
   }
 
@@ -488,7 +513,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
   void initializeCallbacks(Module &M);
 
   // Check if we want (and can) handle this alloca.
-  bool isInterestingAlloca(AllocaInst &AI) {
+  bool isInterestingAlloca(AllocaInst &AI) const {
     return (!AI.isArrayAllocation() &&
             AI.isStaticAlloca() &&
             AI.getAlignment() <= RedzoneSize() &&
@@ -498,24 +523,27 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
   size_t RedzoneSize() const {
     return RedzoneSizeForScale(Mapping.Scale);
   }
-  uint64_t getAllocaSizeInBytes(AllocaInst *AI) {
+  uint64_t getAllocaSizeInBytes(AllocaInst *AI) const {
     Type *Ty = AI->getAllocatedType();
     uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty);
     return SizeInBytes;
   }
-  uint64_t getAlignedSize(uint64_t SizeInBytes) {
+  uint64_t getAlignedSize(uint64_t SizeInBytes) const {
     size_t RZ = RedzoneSize();
     return ((SizeInBytes + RZ - 1) / RZ) * RZ;
   }
-  uint64_t getAlignedAllocaSize(AllocaInst *AI) {
+  uint64_t getAlignedAllocaSize(AllocaInst *AI) const {
     uint64_t SizeInBytes = getAllocaSizeInBytes(AI);
     return getAlignedSize(SizeInBytes);
   }
   /// Finds alloca where the value comes from.
   AllocaInst *findAllocaForValue(Value *V);
-  void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB,
+  void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB,
                       Value *ShadowBase, bool DoPoison);
-  void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> IRB, bool DoPoison);
+  void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
+
+  void SetShadowToStackAfterReturnInlined(IRBuilder<> &IRB, Value *ShadowBase,
+                                          int Size);
 };
 
 }  // namespace
@@ -642,6 +670,13 @@ static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) {
   return NULL;
 }
 
+bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
+  // If a global variable does not have dynamic initialization we don't
+  // have to instrument it.  However, if a global does not have initializer
+  // at all, we assume it has dynamic initializer (in other TU).
+  return G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G);
+}
+
 void AddressSanitizer::instrumentMop(Instruction *I) {
   bool IsWrite = false;
   Value *Addr = isInterestingMemoryAccess(I, &IsWrite);
@@ -650,13 +685,19 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
     if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) {
       // If initialization order checking is disabled, a simple access to a
       // dynamically initialized global is always valid.
-      if (!CheckInitOrder)
-        return;
-      // If a global variable does not have dynamic initialization we don't
-      // have to instrument it.  However, if a global does not have initailizer
-      // at all, we assume it has dynamic initializer (in other TU).
-      if (G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G))
+      if (!CheckInitOrder || GlobalIsLinkerInitialized(G)) {
+        NumOptimizedAccessesToGlobalVar++;
         return;
+      }
+    }
+    ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr);
+    if (CE && CE->isGEPWithNoNotionalOverIndexing()) {
+      if (GlobalVariable *G = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
+        if (CE->getOperand(1)->isNullValue() && GlobalIsLinkerInitialized(G)) {
+          NumOptimizedAccessesToGlobalArray++;
+          return;
+        }
+      }
     }
   }
 
@@ -668,6 +709,11 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
 
   assert((TypeSize % 8) == 0);
 
+  if (IsWrite)
+    NumInstrumentedWrites++;
+  else
+    NumInstrumentedReads++;
+
   // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check.
   if (TypeSize == 8  || TypeSize == 16 ||
       TypeSize == 32 || TypeSize == 64 || TypeSize == 128)
@@ -883,7 +929,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
   TD = getAnalysisIfAvailable<DataLayout>();
   if (!TD)
     return false;
-  BL.reset(new SpecialCaseList(BlacklistFile));
+  BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
   if (BL->isIn(M)) return false;
   C = &(M.getContext());
   int LongSize = TD->getPointerSizeInBits();
@@ -914,8 +960,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
   StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy,
                                                IntptrTy, IntptrTy,
                                                IntptrTy, IntptrTy, NULL);
-  SmallVector<Constant *, 16> Initializers(n), DynamicInit;
-
+  SmallVector<Constant *, 16> Initializers(n);
 
   Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
   assert(CtorFunc);
@@ -1046,6 +1091,8 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
 
   AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction(
       kAsanHandleNoReturnName, IRB.getVoidTy(), NULL));
+  AsanCovFunction = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanCovName, IRB.getVoidTy(), IntptrTy, NULL));
   // We insert an empty inline asm after __asan_report* to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
                             StringRef(""), StringRef(""),
@@ -1076,7 +1123,7 @@ bool AddressSanitizer::doInitialization(Module &M) {
 
   if (!TD)
     return false;
-  BL.reset(new SpecialCaseList(BlacklistFile));
+  BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
   DynamicallyInitializedGlobals.Init(M);
 
   C = &(M.getContext());
@@ -1117,6 +1164,47 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
   return false;
 }
 
+// Poor man's coverage that works with ASan.
+// We create a Guard boolean variable with the same linkage
+// as the function and inject this code into the entry block:
+// if (*Guard) {
+//    __sanitizer_cov(&F);
+//    *Guard = 1;
+// }
+// The accesses to Guard are atomic. The rest of the logic is
+// in __sanitizer_cov (it's fine to call it more than once).
+//
+// This coverage implementation provides very limited data:
+// it only tells if a given function was ever executed.
+// No counters, no per-basic-block or per-edge data.
+// But for many use cases this is what we need and the added slowdown
+// is negligible. This simple implementation will probably be obsoleted
+// by the upcoming Clang-based coverage implementation.
+// By having it here and now we hope to
+//  a) get the functionality to users earlier and
+//  b) collect usage statistics to help improve Clang coverage design.
+bool AddressSanitizer::InjectCoverage(Function &F) {
+  if (!ClCoverage) return false;
+  IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt());
+  Type *Int8Ty = IRB.getInt8Ty();
+  GlobalVariable *Guard = new GlobalVariable(
+      *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage,
+      Constant::getNullValue(Int8Ty), "__asan_gen_cov_" + F.getName());
+  LoadInst *Load = IRB.CreateLoad(Guard);
+  Load->setAtomic(Monotonic);
+  Load->setAlignment(1);
+  Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load);
+  Instruction *Ins = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+  IRB.SetInsertPoint(Ins);
+  // We pass &F to __sanitizer_cov. We could avoid this and rely on
+  // GET_CALLER_PC, but having the PC of the first instruction is just nice.
+  IRB.CreateCall(AsanCovFunction, IRB.CreatePointerCast(&F, IntptrTy));
+  StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard);
+  Store->setAtomic(Monotonic);
+  Store->setAlignment(1);
+  return true;
+}
+
 bool AddressSanitizer::runOnFunction(Function &F) {
   if (BL->isIn(F)) return false;
   if (&F == AsanCtorFunction) return false;
@@ -1212,6 +1300,10 @@ bool AddressSanitizer::runOnFunction(Function &F) {
   }
 
   bool res = NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty();
+
+  if (InjectCoverage(F))
+    res = true;
+
   DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n");
 
   if (ClKeepUninstrumented) {
@@ -1271,11 +1363,15 @@ bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
 
 void FunctionStackPoisoner::initializeCallbacks(Module &M) {
   IRBuilder<> IRB(*C);
-  AsanStackMallocFunc = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL));
-  AsanStackFreeFunc = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanStackFreeName, IRB.getVoidTy(),
-      IntptrTy, IntptrTy, IntptrTy, NULL));
+  for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
+    std::string Suffix = itostr(i);
+    AsanStackMallocFunc[i] = checkInterfaceFunction(
+        M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy,
+                              IntptrTy, IntptrTy, NULL));
+    AsanStackFreeFunc[i] = checkInterfaceFunction(M.getOrInsertFunction(
+        kAsanStackFreeNameTemplate + Suffix, IRB.getVoidTy(), IntptrTy,
+        IntptrTy, IntptrTy, NULL));
+  }
   AsanPoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
       kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
   AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
@@ -1283,7 +1379,7 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
 }
 
 void FunctionStackPoisoner::poisonRedZones(
-  const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase,
+  const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB, Value *ShadowBase,
   bool DoPoison) {
   size_t ShadowRZSize = RedzoneSize() >> Mapping.Scale;
   assert(ShadowRZSize >= 1 && ShadowRZSize <= 4);
@@ -1344,12 +1440,40 @@ void FunctionStackPoisoner::poisonRedZones(
   }
 }
 
+// Fake stack allocator (asan_fake_stack.h) has 11 size classes
+// for every power of 2 from kMinStackMallocSize to kMaxAsanStackMallocSizeClass
+static int StackMallocSizeClass(uint64_t LocalStackSize) {
+  assert(LocalStackSize <= kMaxStackMallocSize);
+  uint64_t MaxSize = kMinStackMallocSize;
+  for (int i = 0; ; i++, MaxSize *= 2)
+    if (LocalStackSize <= MaxSize)
+      return i;
+  llvm_unreachable("impossible LocalStackSize");
+}
+
+// Set Size bytes starting from ShadowBase to kAsanStackAfterReturnMagic.
+// We can not use MemSet intrinsic because it may end up calling the actual
+// memset. Size is a multiple of 8.
+// Currently this generates 8-byte stores on x86_64; it may be better to
+// generate wider stores.
+void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined(
+    IRBuilder<> &IRB, Value *ShadowBase, int Size) {
+  assert(!(Size % 8));
+  assert(kAsanStackAfterReturnMagic == 0xf5);
+  for (int i = 0; i < Size; i += 8) {
+    Value *p = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
+    IRB.CreateStore(ConstantInt::get(IRB.getInt64Ty(), 0xf5f5f5f5f5f5f5f5ULL),
+                    IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo()));
+  }
+}
+
 void FunctionStackPoisoner::poisonStack() {
   uint64_t LocalStackSize = TotalStackSize +
                             (AllocaVec.size() + 1) * RedzoneSize();
 
   bool DoStackMalloc = ASan.CheckUseAfterReturn
       && LocalStackSize <= kMaxStackMallocSize;
+  int StackMallocIdx = -1;
 
   assert(AllocaVec.size() > 0);
   Instruction *InsBefore = AllocaVec[0];
@@ -1367,8 +1491,28 @@ void FunctionStackPoisoner::poisonStack() {
   Value *LocalStackBase = OrigStackBase;
 
   if (DoStackMalloc) {
-    LocalStackBase = IRB.CreateCall2(AsanStackMallocFunc,
+    // LocalStackBase = OrigStackBase
+    // if (__asan_option_detect_stack_use_after_return)
+    //   LocalStackBase = __asan_stack_malloc_N(LocalStackBase, OrigStackBase);
+    StackMallocIdx = StackMallocSizeClass(LocalStackSize);
+    assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
+    Constant *OptionDetectUAR = F.getParent()->getOrInsertGlobal(
+        kAsanOptionDetectUAR, IRB.getInt32Ty());
+    Value *Cmp = IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR),
+                                  Constant::getNullValue(IRB.getInt32Ty()));
+    Instruction *Term =
+        SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+    BasicBlock *CmpBlock = cast<Instruction>(Cmp)->getParent();
+    IRBuilder<> IRBIf(Term);
+    LocalStackBase = IRBIf.CreateCall2(
+        AsanStackMallocFunc[StackMallocIdx],
         ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase);
+    BasicBlock *SetBlock = cast<Instruction>(LocalStackBase)->getParent();
+    IRB.SetInsertPoint(InsBefore);
+    PHINode *Phi = IRB.CreatePHI(IntptrTy, 2);
+    Phi->addIncoming(OrigStackBase, CmpBlock);
+    Phi->addIncoming(LocalStackBase, SetBlock);
+    LocalStackBase = Phi;
   }
 
   // This string will be parsed by the run-time (DescribeAddressIfStack).
@@ -1380,11 +1524,10 @@ void FunctionStackPoisoner::poisonStack() {
   bool HavePoisonedAllocas = false;
   for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) {
     const AllocaPoisonCall &APC = AllocaPoisonCallVec[i];
-    IntrinsicInst *II = APC.InsBefore;
-    AllocaInst *AI = findAllocaForValue(II->getArgOperand(1));
-    assert(AI);
-    IRBuilder<> IRB(II);
-    poisonAlloca(AI, APC.Size, IRB, APC.DoPoison);
+    assert(APC.InsBefore);
+    assert(APC.AI);
+    IRBuilder<> IRB(APC.InsBefore);
+    poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
     HavePoisonedAllocas |= APC.DoPoison;
   }
 
@@ -1442,10 +1585,35 @@ void FunctionStackPoisoner::poisonStack() {
     // Unpoison the stack.
     poisonRedZones(AllocaVec, IRBRet, ShadowBase, false);
     if (DoStackMalloc) {
+      assert(StackMallocIdx >= 0);
       // In use-after-return mode, mark the whole stack frame unaddressable.
-      IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase,
-                         ConstantInt::get(IntptrTy, LocalStackSize),
-                         OrigStackBase);
+      if (StackMallocIdx <= 4) {
+        // For small sizes inline the whole thing:
+        // if LocalStackBase != OrigStackBase:
+        //     memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
+        //     **SavedFlagPtr(LocalStackBase) = 0
+        // FIXME: if LocalStackBase != OrigStackBase don't call poisonRedZones.
+        Value *Cmp = IRBRet.CreateICmpNE(LocalStackBase, OrigStackBase);
+        TerminatorInst *PoisonTerm =
+            SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+        IRBuilder<> IRBPoison(PoisonTerm);
+        int ClassSize = kMinStackMallocSize << StackMallocIdx;
+        SetShadowToStackAfterReturnInlined(IRBPoison, ShadowBase,
+                                           ClassSize >> Mapping.Scale);
+        Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
+            LocalStackBase,
+            ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
+        Value *SavedFlagPtr = IRBPoison.CreateLoad(
+            IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
+        IRBPoison.CreateStore(
+            Constant::getNullValue(IRBPoison.getInt8Ty()),
+            IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
+      } else {
+        // For larger frames call __asan_stack_free_*.
+        IRBRet.CreateCall3(AsanStackFreeFunc[StackMallocIdx], LocalStackBase,
+                           ConstantInt::get(IntptrTy, LocalStackSize),
+                           OrigStackBase);
+      }
     } else if (HavePoisonedAllocas) {
       // If we poisoned some allocas in llvm.lifetime analysis,
       // unpoison whole stack frame now.
@@ -1460,7 +1628,7 @@ void FunctionStackPoisoner::poisonStack() {
 }
 
 void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
-                                         IRBuilder<> IRB, bool DoPoison) {
+                                         IRBuilder<> &IRB, bool DoPoison) {
   // For now just insert the call to ASan runtime.
   Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
   Value *SizeArg = ConstantInt::get(IntptrTy, Size);
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index b094d42..7a9f0f6 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -80,7 +80,7 @@ BasicBlock *BoundsChecking::getTrapBB() {
     return TrapBB;
 
   Function *Fn = Inst->getParent()->getParent();
-  BasicBlock::iterator PrevInsertPoint = Builder->GetInsertPoint();
+  IRBuilder<>::InsertPointGuard Guard(*Builder);
   TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
   Builder->SetInsertPoint(TrapBB);
 
@@ -91,7 +91,6 @@ BasicBlock *BoundsChecking::getTrapBB() {
   TrapCall->setDebugLoc(Inst->getDebugLoc());
   Builder->CreateUnreachable();
 
-  Builder->SetInsertPoint(PrevInsertPoint);
   return TrapBB;
 }
 
@@ -173,7 +172,8 @@ bool BoundsChecking::runOnFunction(Function &F) {
   TrapBB = 0;
   BuilderTy TheBuilder(F.getContext(), TargetFolder(TD));
   Builder = &TheBuilder;
-  ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext());
+  ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext(),
+                                           /*RoundToAlign=*/true);
   ObjSizeEval = &TheObjSizeEval;
 
   // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 5e34863..3563593 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,14 +1,11 @@
 add_llvm_library(LLVMInstrumentation
   AddressSanitizer.cpp
   BoundsChecking.cpp
+  DataFlowSanitizer.cpp
   DebugIR.cpp
-  EdgeProfiling.cpp
   GCOVProfiling.cpp
   MemorySanitizer.cpp
   Instrumentation.cpp
-  OptimalEdgeProfiling.cpp
-  PathProfiling.cpp
-  ProfilingUtils.cpp
   ThreadSanitizer.cpp
   )
 
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
new file mode 100644
index 0000000..9b9e725
--- /dev/null
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -0,0 +1,1397 @@
+//===-- DataFlowSanitizer.cpp - dynamic data flow analysis ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow
+/// analysis.
+///
+/// Unlike other Sanitizer tools, this tool is not designed to detect a specific
+/// class of bugs on its own.  Instead, it provides a generic dynamic data flow
+/// analysis framework to be used by clients to help detect application-specific
+/// issues within their own code.
+///
+/// The analysis is based on automatic propagation of data flow labels (also
+/// known as taint labels) through a program as it performs computation.  Each
+/// byte of application memory is backed by two bytes of shadow memory which
+/// hold the label.  On Linux/x86_64, memory is laid out as follows:
+///
+/// +--------------------+ 0x800000000000 (top of memory)
+/// | application memory |
+/// +--------------------+ 0x700000008000 (kAppAddr)
+/// |                    |
+/// |       unused       |
+/// |                    |
+/// +--------------------+ 0x200200000000 (kUnusedAddr)
+/// |    union table     |
+/// +--------------------+ 0x200000000000 (kUnionTableAddr)
+/// |   shadow memory    |
+/// +--------------------+ 0x000000010000 (kShadowAddr)
+/// | reserved by kernel |
+/// +--------------------+ 0x000000000000
+///
+/// To derive a shadow memory address from an application memory address,
+/// bits 44-46 are cleared to bring the address into the range
+/// [0x000000008000,0x100000000000).  Then the address is shifted left by 1 to
+/// account for the double byte representation of shadow labels and move the
+/// address into the shadow memory range.  See the function
+/// DataFlowSanitizer::getShadowAddress below.
+///
+/// For more information, please refer to the design document:
+/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SpecialCaseList.h"
+#include <iterator>
+
+using namespace llvm;
+
+// The -dfsan-preserve-alignment flag controls whether this pass assumes that
+// alignment requirements provided by the input IR are correct.  For example,
+// if the input IR contains a load with alignment 8, this flag will cause
+// the shadow load to have alignment 16.  This flag is disabled by default as
+// we have unfortunately encountered too much code (including Clang itself;
+// see PR14291) which performs misaligned access.
+static cl::opt<bool> ClPreserveAlignment(
+    "dfsan-preserve-alignment",
+    cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
+    cl::init(false));
+
+// The ABI list file controls how shadow parameters are passed.  The pass treats
+// every function labelled "uninstrumented" in the ABI list file as conforming
+// to the "native" (i.e. unsanitized) ABI.  Unless the ABI list contains
+// additional annotations for those functions, a call to one of those functions
+// will produce a warning message, as the labelling behaviour of the function is
+// unknown.  The other supported annotations are "functional" and "discard",
+// which are described below under DataFlowSanitizer::WrapperKind.
+static cl::opt<std::string> ClABIListFile(
+    "dfsan-abilist",
+    cl::desc("File listing native ABI functions and how the pass treats them"),
+    cl::Hidden);
+
+// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
+// functions (see DataFlowSanitizer::InstrumentedABI below).
+static cl::opt<bool> ClArgsABI(
+    "dfsan-args-abi",
+    cl::desc("Use the argument ABI rather than the TLS ABI"),
+    cl::Hidden);
+
+static cl::opt<bool> ClDebugNonzeroLabels(
+    "dfsan-debug-nonzero-labels",
+    cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
+             "load or return with a nonzero label"),
+    cl::Hidden);
+
+namespace {
+
+class DataFlowSanitizer : public ModulePass {
+  friend struct DFSanFunction;
+  friend class DFSanVisitor;
+
+  enum {
+    ShadowWidth = 16
+  };
+
+  /// Which ABI should be used for instrumented functions?
+  enum InstrumentedABI {
+    /// Argument and return value labels are passed through additional
+    /// arguments and by modifying the return type.
+    IA_Args,
+
+    /// Argument and return value labels are passed through TLS variables
+    /// __dfsan_arg_tls and __dfsan_retval_tls.
+    IA_TLS
+  };
+
+  /// How should calls to uninstrumented functions be handled?
+  enum WrapperKind {
+    /// This function is present in an uninstrumented form but we don't know
+    /// how it should be handled.  Print a warning and call the function anyway.
+    /// Don't label the return value.
+    WK_Warning,
+
+    /// This function does not write to (user-accessible) memory, and its return
+    /// value is unlabelled.
+    WK_Discard,
+
+    /// This function does not write to (user-accessible) memory, and the label
+    /// of its return value is the union of the label of its arguments.
+    WK_Functional,
+
+    /// Instead of calling the function, a custom wrapper __dfsw_F is called,
+    /// where F is the name of the function.  This function may wrap the
+    /// original function or provide its own implementation.  This is similar to
+    /// the IA_Args ABI, except that IA_Args uses a struct return type to
+    /// pass the return value shadow in a register, while WK_Custom uses an
+    /// extra pointer argument to return the shadow.  This allows the wrapped
+    /// form of the function type to be expressed in C.
+    WK_Custom
+  };
+
+  DataLayout *DL;
+  Module *Mod;
+  LLVMContext *Ctx;
+  IntegerType *ShadowTy;
+  PointerType *ShadowPtrTy;
+  IntegerType *IntptrTy;
+  ConstantInt *ZeroShadow;
+  ConstantInt *ShadowPtrMask;
+  ConstantInt *ShadowPtrMul;
+  Constant *ArgTLS;
+  Constant *RetvalTLS;
+  void *(*GetArgTLSPtr)();
+  void *(*GetRetvalTLSPtr)();
+  Constant *GetArgTLS;
+  Constant *GetRetvalTLS;
+  FunctionType *DFSanUnionFnTy;
+  FunctionType *DFSanUnionLoadFnTy;
+  FunctionType *DFSanUnimplementedFnTy;
+  FunctionType *DFSanSetLabelFnTy;
+  FunctionType *DFSanNonzeroLabelFnTy;
+  Constant *DFSanUnionFn;
+  Constant *DFSanUnionLoadFn;
+  Constant *DFSanUnimplementedFn;
+  Constant *DFSanSetLabelFn;
+  Constant *DFSanNonzeroLabelFn;
+  MDNode *ColdCallWeights;
+  OwningPtr<SpecialCaseList> ABIList;
+  DenseMap<Value *, Function *> UnwrappedFnMap;
+  AttributeSet ReadOnlyNoneAttrs;
+
+  Value *getShadowAddress(Value *Addr, Instruction *Pos);
+  Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
+  bool isInstrumented(const Function *F);
+  bool isInstrumented(const GlobalAlias *GA);
+  FunctionType *getArgsFunctionType(FunctionType *T);
+  FunctionType *getTrampolineFunctionType(FunctionType *T);
+  FunctionType *getCustomFunctionType(FunctionType *T);
+  InstrumentedABI getInstrumentedABI();
+  WrapperKind getWrapperKind(Function *F);
+  void addGlobalNamePrefix(GlobalValue *GV);
+  Function *buildWrapperFunction(Function *F, StringRef NewFName,
+                                 GlobalValue::LinkageTypes NewFLink,
+                                 FunctionType *NewFT);
+  Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
+
+ public:
+  DataFlowSanitizer(StringRef ABIListFile = StringRef(),
+                    void *(*getArgTLS)() = 0, void *(*getRetValTLS)() = 0);
+  static char ID;
+  bool doInitialization(Module &M);
+  bool runOnModule(Module &M);
+};
+
+struct DFSanFunction {
+  DataFlowSanitizer &DFS;
+  Function *F;
+  DataFlowSanitizer::InstrumentedABI IA;
+  bool IsNativeABI;
+  Value *ArgTLSPtr;
+  Value *RetvalTLSPtr;
+  AllocaInst *LabelReturnAlloca;
+  DenseMap<Value *, Value *> ValShadowMap;
+  DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
+  std::vector<std::pair<PHINode *, PHINode *> > PHIFixups;
+  DenseSet<Instruction *> SkipInsts;
+  DenseSet<Value *> NonZeroChecks;
+
+  DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
+      : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()),
+        IsNativeABI(IsNativeABI), ArgTLSPtr(0), RetvalTLSPtr(0),
+        LabelReturnAlloca(0) {}
+  Value *getArgTLSPtr();
+  Value *getArgTLS(unsigned Index, Instruction *Pos);
+  Value *getRetvalTLS();
+  Value *getShadow(Value *V);
+  void setShadow(Instruction *I, Value *Shadow);
+  Value *combineOperandShadows(Instruction *Inst);
+  Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
+                    Instruction *Pos);
+  void storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *Shadow,
+                   Instruction *Pos);
+};
+
+class DFSanVisitor : public InstVisitor<DFSanVisitor> {
+ public:
+  DFSanFunction &DFSF;
+  DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
+
+  void visitOperandShadowInst(Instruction &I);
+
+  void visitBinaryOperator(BinaryOperator &BO);
+  void visitCastInst(CastInst &CI);
+  void visitCmpInst(CmpInst &CI);
+  void visitGetElementPtrInst(GetElementPtrInst &GEPI);
+  void visitLoadInst(LoadInst &LI);
+  void visitStoreInst(StoreInst &SI);
+  void visitReturnInst(ReturnInst &RI);
+  void visitCallSite(CallSite CS);
+  void visitPHINode(PHINode &PN);
+  void visitExtractElementInst(ExtractElementInst &I);
+  void visitInsertElementInst(InsertElementInst &I);
+  void visitShuffleVectorInst(ShuffleVectorInst &I);
+  void visitExtractValueInst(ExtractValueInst &I);
+  void visitInsertValueInst(InsertValueInst &I);
+  void visitAllocaInst(AllocaInst &I);
+  void visitSelectInst(SelectInst &I);
+  void visitMemSetInst(MemSetInst &I);
+  void visitMemTransferInst(MemTransferInst &I);
+};
+
+}
+
+char DataFlowSanitizer::ID;
+INITIALIZE_PASS(DataFlowSanitizer, "dfsan",
+                "DataFlowSanitizer: dynamic data flow analysis.", false, false)
+
+ModulePass *llvm::createDataFlowSanitizerPass(StringRef ABIListFile,
+                                              void *(*getArgTLS)(),
+                                              void *(*getRetValTLS)()) {
+  return new DataFlowSanitizer(ABIListFile, getArgTLS, getRetValTLS);
+}
+
+DataFlowSanitizer::DataFlowSanitizer(StringRef ABIListFile,
+                                     void *(*getArgTLS)(),
+                                     void *(*getRetValTLS)())
+    : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS),
+      ABIList(SpecialCaseList::createOrDie(ABIListFile.empty() ? ClABIListFile
+                                                               : ABIListFile)) {
+}
+
+FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
+  llvm::SmallVector<Type *, 4> ArgTypes;
+  std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes));
+  for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+    ArgTypes.push_back(ShadowTy);
+  if (T->isVarArg())
+    ArgTypes.push_back(ShadowPtrTy);
+  Type *RetType = T->getReturnType();
+  if (!RetType->isVoidTy())
+    RetType = StructType::get(RetType, ShadowTy, (Type *)0);
+  return FunctionType::get(RetType, ArgTypes, T->isVarArg());
+}
+
+FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
+  assert(!T->isVarArg());
+  llvm::SmallVector<Type *, 4> ArgTypes;
+  ArgTypes.push_back(T->getPointerTo());
+  std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes));
+  for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+    ArgTypes.push_back(ShadowTy);
+  Type *RetType = T->getReturnType();
+  if (!RetType->isVoidTy())
+    ArgTypes.push_back(ShadowPtrTy);
+  return FunctionType::get(T->getReturnType(), ArgTypes, false);
+}
+
+FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
+  assert(!T->isVarArg());
+  llvm::SmallVector<Type *, 4> ArgTypes;
+  for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end();
+       i != e; ++i) {
+    FunctionType *FT;
+    if (isa<PointerType>(*i) && (FT = dyn_cast<FunctionType>(cast<PointerType>(
+                                     *i)->getElementType()))) {
+      ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
+      ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
+    } else {
+      ArgTypes.push_back(*i);
+    }
+  }
+  for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+    ArgTypes.push_back(ShadowTy);
+  Type *RetType = T->getReturnType();
+  if (!RetType->isVoidTy())
+    ArgTypes.push_back(ShadowPtrTy);
+  return FunctionType::get(T->getReturnType(), ArgTypes, false);
+}
+
+bool DataFlowSanitizer::doInitialization(Module &M) {
+  DL = getAnalysisIfAvailable<DataLayout>();
+  if (!DL)
+    return false;
+
+  Mod = &M;
+  Ctx = &M.getContext();
+  ShadowTy = IntegerType::get(*Ctx, ShadowWidth);
+  ShadowPtrTy = PointerType::getUnqual(ShadowTy);
+  IntptrTy = DL->getIntPtrType(*Ctx);
+  ZeroShadow = ConstantInt::getSigned(ShadowTy, 0);
+  ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
+  ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8);
+
+  Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy };
+  DFSanUnionFnTy =
+      FunctionType::get(ShadowTy, DFSanUnionArgs, /*isVarArg=*/ false);
+  Type *DFSanUnionLoadArgs[2] = { ShadowPtrTy, IntptrTy };
+  DFSanUnionLoadFnTy =
+      FunctionType::get(ShadowTy, DFSanUnionLoadArgs, /*isVarArg=*/ false);
+  DFSanUnimplementedFnTy = FunctionType::get(
+      Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+  Type *DFSanSetLabelArgs[3] = { ShadowTy, Type::getInt8PtrTy(*Ctx), IntptrTy };
+  DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx),
+                                        DFSanSetLabelArgs, /*isVarArg=*/false);
+  DFSanNonzeroLabelFnTy = FunctionType::get(
+      Type::getVoidTy(*Ctx), ArrayRef<Type *>(), /*isVarArg=*/false);
+
+  if (GetArgTLSPtr) {
+    Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
+    ArgTLS = 0;
+    GetArgTLS = ConstantExpr::getIntToPtr(
+        ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
+        PointerType::getUnqual(
+            FunctionType::get(PointerType::getUnqual(ArgTLSTy), (Type *)0)));
+  }
+  if (GetRetvalTLSPtr) {
+    RetvalTLS = 0;
+    GetRetvalTLS = ConstantExpr::getIntToPtr(
+        ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
+        PointerType::getUnqual(
+            FunctionType::get(PointerType::getUnqual(ShadowTy), (Type *)0)));
+  }
+
+  ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
+  return true;
+}
+
+bool DataFlowSanitizer::isInstrumented(const Function *F) {
+  return !ABIList->isIn(*F, "uninstrumented");
+}
+
+bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
+  return !ABIList->isIn(*GA, "uninstrumented");
+}
+
+DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
+  return ClArgsABI ? IA_Args : IA_TLS;
+}
+
+DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
+  if (ABIList->isIn(*F, "functional"))
+    return WK_Functional;
+  if (ABIList->isIn(*F, "discard"))
+    return WK_Discard;
+  if (ABIList->isIn(*F, "custom"))
+    return WK_Custom;
+
+  return WK_Warning;
+}
+
+void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) {
+  std::string GVName = GV->getName(), Prefix = "dfs$";
+  GV->setName(Prefix + GVName);
+
+  // Try to change the name of the function in module inline asm.  We only do
+  // this for specific asm directives, currently only ".symver", to try to avoid
+  // corrupting asm which happens to contain the symbol name as a substring.
+  // Note that the substitution for .symver assumes that the versioned symbol
+  // also has an instrumented name.
+  std::string Asm = GV->getParent()->getModuleInlineAsm();
+  std::string SearchStr = ".symver " + GVName + ",";
+  size_t Pos = Asm.find(SearchStr);
+  if (Pos != std::string::npos) {
+    Asm.replace(Pos, SearchStr.size(),
+                ".symver " + Prefix + GVName + "," + Prefix);
+    GV->getParent()->setModuleInlineAsm(Asm);
+  }
+}
+
+Function *
+DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
+                                        GlobalValue::LinkageTypes NewFLink,
+                                        FunctionType *NewFT) {
+  FunctionType *FT = F->getFunctionType();
+  Function *NewF = Function::Create(NewFT, NewFLink, NewFName,
+                                    F->getParent());
+  NewF->copyAttributesFrom(F);
+  NewF->removeAttributes(
+      AttributeSet::ReturnIndex,
+      AttributeFuncs::typeIncompatible(NewFT->getReturnType(),
+                                       AttributeSet::ReturnIndex));
+
+  BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
+  std::vector<Value *> Args;
+  unsigned n = FT->getNumParams();
+  for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
+    Args.push_back(&*ai);
+  CallInst *CI = CallInst::Create(F, Args, "", BB);
+  if (FT->getReturnType()->isVoidTy())
+    ReturnInst::Create(*Ctx, BB);
+  else
+    ReturnInst::Create(*Ctx, CI, BB);
+
+  return NewF;
+}
+
+Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
+                                                          StringRef FName) {
+  FunctionType *FTT = getTrampolineFunctionType(FT);
+  Constant *C = Mod->getOrInsertFunction(FName, FTT);
+  Function *F = dyn_cast<Function>(C);
+  if (F && F->isDeclaration()) {
+    F->setLinkage(GlobalValue::LinkOnceODRLinkage);
+    BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
+    std::vector<Value *> Args;
+    Function::arg_iterator AI = F->arg_begin(); ++AI;
+    for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
+      Args.push_back(&*AI);
+    CallInst *CI =
+        CallInst::Create(&F->getArgumentList().front(), Args, "", BB);
+    ReturnInst *RI;
+    if (FT->getReturnType()->isVoidTy())
+      RI = ReturnInst::Create(*Ctx, BB);
+    else
+      RI = ReturnInst::Create(*Ctx, CI, BB);
+
+    DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);
+    Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI;
+    for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N)
+      DFSF.ValShadowMap[ValAI] = ShadowAI;
+    DFSanVisitor(DFSF).visitCallInst(*CI);
+    if (!FT->getReturnType()->isVoidTy())
+      new StoreInst(DFSF.getShadow(RI->getReturnValue()),
+                    &F->getArgumentList().back(), RI);
+  }
+
+  return C;
+}
+
+bool DataFlowSanitizer::runOnModule(Module &M) {
+  if (!DL)
+    return false;
+
+  if (ABIList->isIn(M, "skip"))
+    return false;
+
+  if (!GetArgTLSPtr) {
+    Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
+    ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
+    if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS))
+      G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
+  }
+  if (!GetRetvalTLSPtr) {
+    RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy);
+    if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS))
+      G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
+  }
+
+  DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy);
+  if (Function *F = dyn_cast<Function>(DFSanUnionFn)) {
+    F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+    F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    F->addAttribute(1, Attribute::ZExt);
+    F->addAttribute(2, Attribute::ZExt);
+  }
+  DFSanUnionLoadFn =
+      Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy);
+  if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) {
+    F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+  }
+  DFSanUnimplementedFn =
+      Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
+  DFSanSetLabelFn =
+      Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy);
+  if (Function *F = dyn_cast<Function>(DFSanSetLabelFn)) {
+    F->addAttribute(1, Attribute::ZExt);
+  }
+  DFSanNonzeroLabelFn =
+      Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
+
+  std::vector<Function *> FnsToInstrument;
+  llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI;
+  for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) {
+    if (!i->isIntrinsic() &&
+        i != DFSanUnionFn &&
+        i != DFSanUnionLoadFn &&
+        i != DFSanUnimplementedFn &&
+        i != DFSanSetLabelFn &&
+        i != DFSanNonzeroLabelFn)
+      FnsToInstrument.push_back(&*i);
+  }
+
+  // Give function aliases prefixes when necessary, and build wrappers where the
+  // instrumentedness is inconsistent.
+  for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) {
+    GlobalAlias *GA = &*i;
+    ++i;
+    // Don't stop on weak.  We assume people aren't playing games with the
+    // instrumentedness of overridden weak aliases.
+    if (Function *F = dyn_cast<Function>(
+            GA->resolveAliasedGlobal(/*stopOnWeak=*/false))) {
+      bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
+      if (GAInst && FInst) {
+        addGlobalNamePrefix(GA);
+      } else if (GAInst != FInst) {
+        // Non-instrumented alias of an instrumented function, or vice versa.
+        // Replace the alias with a native-ABI wrapper of the aliasee.  The pass
+        // below will take care of instrumenting it.
+        Function *NewF =
+            buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
+        GA->replaceAllUsesWith(NewF);
+        NewF->takeName(GA);
+        GA->eraseFromParent();
+        FnsToInstrument.push_back(NewF);
+      }
+    }
+  }
+
+  AttrBuilder B;
+  B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
+  ReadOnlyNoneAttrs = AttributeSet::get(*Ctx, AttributeSet::FunctionIndex, B);
+
+  // First, change the ABI of every function in the module.  ABI-listed
+  // functions keep their original ABI and get a wrapper function.
+  for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
+                                         e = FnsToInstrument.end();
+       i != e; ++i) {
+    Function &F = **i;
+    FunctionType *FT = F.getFunctionType();
+
+    bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() &&
+                              FT->getReturnType()->isVoidTy());
+
+    if (isInstrumented(&F)) {
+      // Instrumented functions get a 'dfs$' prefix.  This allows us to more
+      // easily identify cases of mismatching ABIs.
+      if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) {
+        FunctionType *NewFT = getArgsFunctionType(FT);
+        Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M);
+        NewF->copyAttributesFrom(&F);
+        NewF->removeAttributes(
+            AttributeSet::ReturnIndex,
+            AttributeFuncs::typeIncompatible(NewFT->getReturnType(),
+                                             AttributeSet::ReturnIndex));
+        for (Function::arg_iterator FArg = F.arg_begin(),
+                                    NewFArg = NewF->arg_begin(),
+                                    FArgEnd = F.arg_end();
+             FArg != FArgEnd; ++FArg, ++NewFArg) {
+          FArg->replaceAllUsesWith(NewFArg);
+        }
+        NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList());
+
+        for (Function::use_iterator ui = F.use_begin(), ue = F.use_end();
+             ui != ue;) {
+          BlockAddress *BA = dyn_cast<BlockAddress>(ui.getUse().getUser());
+          ++ui;
+          if (BA) {
+            BA->replaceAllUsesWith(
+                BlockAddress::get(NewF, BA->getBasicBlock()));
+            delete BA;
+          }
+        }
+        F.replaceAllUsesWith(
+            ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)));
+        NewF->takeName(&F);
+        F.eraseFromParent();
+        *i = NewF;
+        addGlobalNamePrefix(NewF);
+      } else {
+        addGlobalNamePrefix(&F);
+      }
+               // Hopefully, nobody will try to indirectly call a vararg
+               // function... yet.
+    } else if (FT->isVarArg()) {
+      UnwrappedFnMap[&F] = &F;
+      *i = 0;
+    } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
+      // Build a wrapper function for F.  The wrapper simply calls F, and is
+      // added to FnsToInstrument so that any instrumentation according to its
+      // WrapperKind is done in the second pass below.
+      FunctionType *NewFT = getInstrumentedABI() == IA_Args
+                                ? getArgsFunctionType(FT)
+                                : FT;
+      Function *NewF = buildWrapperFunction(
+          &F, std::string("dfsw$") + std::string(F.getName()),
+          GlobalValue::LinkOnceODRLinkage, NewFT);
+      if (getInstrumentedABI() == IA_TLS)
+        NewF->removeAttributes(AttributeSet::FunctionIndex, ReadOnlyNoneAttrs);
+
+      Value *WrappedFnCst =
+          ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
+      F.replaceAllUsesWith(WrappedFnCst);
+      UnwrappedFnMap[WrappedFnCst] = &F;
+      *i = NewF;
+
+      if (!F.isDeclaration()) {
+        // This function is probably defining an interposition of an
+        // uninstrumented function and hence needs to keep the original ABI.
+        // But any functions it may call need to use the instrumented ABI, so
+        // we instrument it in a mode which preserves the original ABI.
+        FnsWithNativeABI.insert(&F);
+
+        // This code needs to rebuild the iterators, as they may be invalidated
+        // by the push_back, taking care that the new range does not include
+        // any functions added by this code.
+        size_t N = i - FnsToInstrument.begin(),
+               Count = e - FnsToInstrument.begin();
+        FnsToInstrument.push_back(&F);
+        i = FnsToInstrument.begin() + N;
+        e = FnsToInstrument.begin() + Count;
+      }
+    }
+  }
+
+  for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
+                                         e = FnsToInstrument.end();
+       i != e; ++i) {
+    if (!*i || (*i)->isDeclaration())
+      continue;
+
+    removeUnreachableBlocks(**i);
+
+    DFSanFunction DFSF(*this, *i, FnsWithNativeABI.count(*i));
+
+    // DFSanVisitor may create new basic blocks, which confuses df_iterator.
+    // Build a copy of the list before iterating over it.
+    llvm::SmallVector<BasicBlock *, 4> BBList;
+    std::copy(df_begin(&(*i)->getEntryBlock()), df_end(&(*i)->getEntryBlock()),
+              std::back_inserter(BBList));
+
+    for (llvm::SmallVector<BasicBlock *, 4>::iterator i = BBList.begin(),
+                                                      e = BBList.end();
+         i != e; ++i) {
+      Instruction *Inst = &(*i)->front();
+      while (1) {
+        // DFSanVisitor may split the current basic block, changing the current
+        // instruction's next pointer and moving the next instruction to the
+        // tail block from which we should continue.
+        Instruction *Next = Inst->getNextNode();
+        // DFSanVisitor may delete Inst, so keep track of whether it was a
+        // terminator.
+        bool IsTerminator = isa<TerminatorInst>(Inst);
+        if (!DFSF.SkipInsts.count(Inst))
+          DFSanVisitor(DFSF).visit(Inst);
+        if (IsTerminator)
+          break;
+        Inst = Next;
+      }
+    }
+
+    // We will not necessarily be able to compute the shadow for every phi node
+    // until we have visited every block.  Therefore, the code that handles phi
+    // nodes adds them to the PHIFixups list so that they can be properly
+    // handled here.
+    for (std::vector<std::pair<PHINode *, PHINode *> >::iterator
+             i = DFSF.PHIFixups.begin(),
+             e = DFSF.PHIFixups.end();
+         i != e; ++i) {
+      for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n;
+           ++val) {
+        i->second->setIncomingValue(
+            val, DFSF.getShadow(i->first->getIncomingValue(val)));
+      }
+    }
+
+    // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy
+    // places (i.e. instructions in basic blocks we haven't even begun visiting
+    // yet).  To make our life easier, do this work in a pass after the main
+    // instrumentation.
+    if (ClDebugNonzeroLabels) {
+      for (DenseSet<Value *>::iterator i = DFSF.NonZeroChecks.begin(),
+                                       e = DFSF.NonZeroChecks.end();
+           i != e; ++i) {
+        Instruction *Pos;
+        if (Instruction *I = dyn_cast<Instruction>(*i))
+          Pos = I->getNextNode();
+        else
+          Pos = DFSF.F->getEntryBlock().begin();
+        while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))
+          Pos = Pos->getNextNode();
+        IRBuilder<> IRB(Pos);
+        Instruction *NeInst = cast<Instruction>(
+            IRB.CreateICmpNE(*i, DFSF.DFS.ZeroShadow));
+        BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+            NeInst, /*Unreachable=*/ false, ColdCallWeights));
+        IRBuilder<> ThenIRB(BI);
+        ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn);
+      }
+    }
+  }
+
+  return false;
+}
+
+Value *DFSanFunction::getArgTLSPtr() {
+  if (ArgTLSPtr)
+    return ArgTLSPtr;
+  if (DFS.ArgTLS)
+    return ArgTLSPtr = DFS.ArgTLS;
+
+  IRBuilder<> IRB(F->getEntryBlock().begin());
+  return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS);
+}
+
+Value *DFSanFunction::getRetvalTLS() {
+  if (RetvalTLSPtr)
+    return RetvalTLSPtr;
+  if (DFS.RetvalTLS)
+    return RetvalTLSPtr = DFS.RetvalTLS;
+
+  IRBuilder<> IRB(F->getEntryBlock().begin());
+  return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS);
+}
+
+Value *DFSanFunction::getArgTLS(unsigned Idx, Instruction *Pos) {
+  IRBuilder<> IRB(Pos);
+  return IRB.CreateConstGEP2_64(getArgTLSPtr(), 0, Idx);
+}
+
+Value *DFSanFunction::getShadow(Value *V) {
+  if (!isa<Argument>(V) && !isa<Instruction>(V))
+    return DFS.ZeroShadow;
+  Value *&Shadow = ValShadowMap[V];
+  if (!Shadow) {
+    if (Argument *A = dyn_cast<Argument>(V)) {
+      if (IsNativeABI)
+        return DFS.ZeroShadow;
+      switch (IA) {
+      case DataFlowSanitizer::IA_TLS: {
+        Value *ArgTLSPtr = getArgTLSPtr();
+        Instruction *ArgTLSPos =
+            DFS.ArgTLS ? &*F->getEntryBlock().begin()
+                       : cast<Instruction>(ArgTLSPtr)->getNextNode();
+        IRBuilder<> IRB(ArgTLSPos);
+        Shadow = IRB.CreateLoad(getArgTLS(A->getArgNo(), ArgTLSPos));
+        break;
+      }
+      case DataFlowSanitizer::IA_Args: {
+        unsigned ArgIdx = A->getArgNo() + F->getArgumentList().size() / 2;
+        Function::arg_iterator i = F->arg_begin();
+        while (ArgIdx--)
+          ++i;
+        Shadow = i;
+        assert(Shadow->getType() == DFS.ShadowTy);
+        break;
+      }
+      }
+      NonZeroChecks.insert(Shadow);
+    } else {
+      Shadow = DFS.ZeroShadow;
+    }
+  }
+  return Shadow;
+}
+
+void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {
+  assert(!ValShadowMap.count(I));
+  assert(Shadow->getType() == DFS.ShadowTy);
+  ValShadowMap[I] = Shadow;
+}
+
+Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
+  assert(Addr != RetvalTLS && "Reinstrumenting?");
+  IRBuilder<> IRB(Pos);
+  return IRB.CreateIntToPtr(
+      IRB.CreateMul(
+          IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), ShadowPtrMask),
+          ShadowPtrMul),
+      ShadowPtrTy);
+}
+
+// Generates IR to compute the union of the two given shadows, inserting it
+// before Pos.  Returns the computed union Value.
+Value *DataFlowSanitizer::combineShadows(Value *V1, Value *V2,
+                                         Instruction *Pos) {
+  if (V1 == ZeroShadow)
+    return V2;
+  if (V2 == ZeroShadow)
+    return V1;
+  if (V1 == V2)
+    return V1;
+  IRBuilder<> IRB(Pos);
+  BasicBlock *Head = Pos->getParent();
+  Value *Ne = IRB.CreateICmpNE(V1, V2);
+  Instruction *NeInst = dyn_cast<Instruction>(Ne);
+  if (NeInst) {
+    BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+        NeInst, /*Unreachable=*/ false, ColdCallWeights));
+    IRBuilder<> ThenIRB(BI);
+    CallInst *Call = ThenIRB.CreateCall2(DFSanUnionFn, V1, V2);
+    Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    Call->addAttribute(1, Attribute::ZExt);
+    Call->addAttribute(2, Attribute::ZExt);
+
+    BasicBlock *Tail = BI->getSuccessor(0);
+    PHINode *Phi = PHINode::Create(ShadowTy, 2, "", Tail->begin());
+    Phi->addIncoming(Call, Call->getParent());
+    Phi->addIncoming(V1, Head);
+    Pos = Phi;
+    return Phi;
+  } else {
+    assert(0 && "todo");
+    return 0;
+  }
+}
+
+// A convenience function which folds the shadows of each of the operands
+// of the provided instruction Inst, inserting the IR before Inst.  Returns
+// the computed union Value.
+Value *DFSanFunction::combineOperandShadows(Instruction *Inst) {
+  if (Inst->getNumOperands() == 0)
+    return DFS.ZeroShadow;
+
+  Value *Shadow = getShadow(Inst->getOperand(0));
+  for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) {
+    Shadow = DFS.combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
+  }
+  return Shadow;
+}
+
+void DFSanVisitor::visitOperandShadowInst(Instruction &I) {
+  Value *CombinedShadow = DFSF.combineOperandShadows(&I);
+  DFSF.setShadow(&I, CombinedShadow);
+}
+
+// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
+// Addr has alignment Align, and take the union of each of those shadows.
+Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
+                                 Instruction *Pos) {
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+    llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i =
+        AllocaShadowMap.find(AI);
+    if (i != AllocaShadowMap.end()) {
+      IRBuilder<> IRB(Pos);
+      return IRB.CreateLoad(i->second);
+    }
+  }
+
+  uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
+  SmallVector<Value *, 2> Objs;
+  GetUnderlyingObjects(Addr, Objs, DFS.DL);
+  bool AllConstants = true;
+  for (SmallVector<Value *, 2>::iterator i = Objs.begin(), e = Objs.end();
+       i != e; ++i) {
+    if (isa<Function>(*i) || isa<BlockAddress>(*i))
+      continue;
+    if (isa<GlobalVariable>(*i) && cast<GlobalVariable>(*i)->isConstant())
+      continue;
+
+    AllConstants = false;
+    break;
+  }
+  if (AllConstants)
+    return DFS.ZeroShadow;
+
+  Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+  switch (Size) {
+  case 0:
+    return DFS.ZeroShadow;
+  case 1: {
+    LoadInst *LI = new LoadInst(ShadowAddr, "", Pos);
+    LI->setAlignment(ShadowAlign);
+    return LI;
+  }
+  case 2: {
+    IRBuilder<> IRB(Pos);
+    Value *ShadowAddr1 =
+        IRB.CreateGEP(ShadowAddr, ConstantInt::get(DFS.IntptrTy, 1));
+    return DFS.combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign),
+                              IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign),
+                              Pos);
+  }
+  }
+  if (Size % (64 / DFS.ShadowWidth) == 0) {
+    // Fast path for the common case where each byte has identical shadow: load
+    // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
+    // shadow is non-equal.
+    BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
+    IRBuilder<> FallbackIRB(FallbackBB);
+    CallInst *FallbackCall = FallbackIRB.CreateCall2(
+        DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size));
+    FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+
+    // Compare each of the shadows stored in the loaded 64 bits to each other,
+    // by computing (WideShadow rotl ShadowWidth) == WideShadow.
+    IRBuilder<> IRB(Pos);
+    Value *WideAddr =
+        IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+    Value *WideShadow = IRB.CreateAlignedLoad(WideAddr, ShadowAlign);
+    Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.ShadowTy);
+    Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidth);
+    Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidth);
+    Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
+    Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
+
+    BasicBlock *Head = Pos->getParent();
+    BasicBlock *Tail = Head->splitBasicBlock(Pos);
+    // In the following code LastBr will refer to the previous basic block's
+    // conditional branch instruction, whose true successor is fixed up to point
+    // to the next block during the loop below or to the tail after the final
+    // iteration.
+    BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq);
+    ReplaceInstWithInst(Head->getTerminator(), LastBr);
+
+    for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size;
+         Ofs += 64 / DFS.ShadowWidth) {
+      BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
+      IRBuilder<> NextIRB(NextBB);
+      WideAddr = NextIRB.CreateGEP(WideAddr, ConstantInt::get(DFS.IntptrTy, 1));
+      Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign);
+      ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
+      LastBr->setSuccessor(0, NextBB);
+      LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
+    }
+
+    LastBr->setSuccessor(0, Tail);
+    FallbackIRB.CreateBr(Tail);
+    PHINode *Shadow = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front());
+    Shadow->addIncoming(FallbackCall, FallbackBB);
+    Shadow->addIncoming(TruncShadow, LastBr->getParent());
+    return Shadow;
+  }
+
+  IRBuilder<> IRB(Pos);
+  CallInst *FallbackCall = IRB.CreateCall2(
+      DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size));
+  FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+  return FallbackCall;
+}
+
+void DFSanVisitor::visitLoadInst(LoadInst &LI) {
+  uint64_t Size = DFSF.DFS.DL->getTypeStoreSize(LI.getType());
+  uint64_t Align;
+  if (ClPreserveAlignment) {
+    Align = LI.getAlignment();
+    if (Align == 0)
+      Align = DFSF.DFS.DL->getABITypeAlignment(LI.getType());
+  } else {
+    Align = 1;
+  }
+  IRBuilder<> IRB(&LI);
+  Value *LoadedShadow =
+      DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI);
+  Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
+  Value *CombinedShadow = DFSF.DFS.combineShadows(LoadedShadow, PtrShadow, &LI);
+  if (CombinedShadow != DFSF.DFS.ZeroShadow)
+    DFSF.NonZeroChecks.insert(CombinedShadow);
+
+  DFSF.setShadow(&LI, CombinedShadow);
+}
+
+void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align,
+                                Value *Shadow, Instruction *Pos) {
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+    llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i =
+        AllocaShadowMap.find(AI);
+    if (i != AllocaShadowMap.end()) {
+      IRBuilder<> IRB(Pos);
+      IRB.CreateStore(Shadow, i->second);
+      return;
+    }
+  }
+
+  uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
+  IRBuilder<> IRB(Pos);
+  Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+  if (Shadow == DFS.ZeroShadow) {
+    IntegerType *ShadowTy = IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidth);
+    Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0);
+    Value *ExtShadowAddr =
+        IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy));
+    IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign);
+    return;
+  }
+
+  const unsigned ShadowVecSize = 128 / DFS.ShadowWidth;
+  uint64_t Offset = 0;
+  if (Size >= ShadowVecSize) {
+    VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize);
+    Value *ShadowVec = UndefValue::get(ShadowVecTy);
+    for (unsigned i = 0; i != ShadowVecSize; ++i) {
+      ShadowVec = IRB.CreateInsertElement(
+          ShadowVec, Shadow, ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i));
+    }
+    Value *ShadowVecAddr =
+        IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy));
+    do {
+      Value *CurShadowVecAddr = IRB.CreateConstGEP1_32(ShadowVecAddr, Offset);
+      IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign);
+      Size -= ShadowVecSize;
+      ++Offset;
+    } while (Size >= ShadowVecSize);
+    Offset *= ShadowVecSize;
+  }
+  while (Size > 0) {
+    Value *CurShadowAddr = IRB.CreateConstGEP1_32(ShadowAddr, Offset);
+    IRB.CreateAlignedStore(Shadow, CurShadowAddr, ShadowAlign);
+    --Size;
+    ++Offset;
+  }
+}
+
+void DFSanVisitor::visitStoreInst(StoreInst &SI) {
+  uint64_t Size =
+      DFSF.DFS.DL->getTypeStoreSize(SI.getValueOperand()->getType());
+  uint64_t Align;
+  if (ClPreserveAlignment) {
+    Align = SI.getAlignment();
+    if (Align == 0)
+      Align = DFSF.DFS.DL->getABITypeAlignment(SI.getValueOperand()->getType());
+  } else {
+    Align = 1;
+  }
+  DFSF.storeShadow(SI.getPointerOperand(), Size, Align,
+                   DFSF.getShadow(SI.getValueOperand()), &SI);
+}
+
+void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
+  visitOperandShadowInst(BO);
+}
+
+void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); }
+
+void DFSanVisitor::visitCmpInst(CmpInst &CI) { visitOperandShadowInst(CI); }
+
+void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+  visitOperandShadowInst(GEPI);
+}
+
+void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) {
+  visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) {
+  visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) {
+  visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
+  visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
+  visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitAllocaInst(AllocaInst &I) {
+  bool AllLoadsStores = true;
+  for (Instruction::use_iterator i = I.use_begin(), e = I.use_end(); i != e;
+       ++i) {
+    if (isa<LoadInst>(*i))
+      continue;
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(*i)) {
+      if (SI->getPointerOperand() == &I)
+        continue;
+    }
+
+    AllLoadsStores = false;
+    break;
+  }
+  if (AllLoadsStores) {
+    IRBuilder<> IRB(&I);
+    DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.ShadowTy);
+  }
+  DFSF.setShadow(&I, DFSF.DFS.ZeroShadow);
+}
+
+void DFSanVisitor::visitSelectInst(SelectInst &I) {
+  Value *CondShadow = DFSF.getShadow(I.getCondition());
+  Value *TrueShadow = DFSF.getShadow(I.getTrueValue());
+  Value *FalseShadow = DFSF.getShadow(I.getFalseValue());
+
+  if (isa<VectorType>(I.getCondition()->getType())) {
+    DFSF.setShadow(
+        &I, DFSF.DFS.combineShadows(
+                CondShadow,
+                DFSF.DFS.combineShadows(TrueShadow, FalseShadow, &I), &I));
+  } else {
+    Value *ShadowSel;
+    if (TrueShadow == FalseShadow) {
+      ShadowSel = TrueShadow;
+    } else {
+      ShadowSel =
+          SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I);
+    }
+    DFSF.setShadow(&I, DFSF.DFS.combineShadows(CondShadow, ShadowSel, &I));
+  }
+}
+
+void DFSanVisitor::visitMemSetInst(MemSetInst &I) {
+  IRBuilder<> IRB(&I);
+  Value *ValShadow = DFSF.getShadow(I.getValue());
+  IRB.CreateCall3(
+      DFSF.DFS.DFSanSetLabelFn, ValShadow,
+      IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(*DFSF.DFS.Ctx)),
+      IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy));
+}
+
+void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
+  IRBuilder<> IRB(&I);
+  Value *DestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
+  Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I);
+  Value *LenShadow = IRB.CreateMul(
+      I.getLength(),
+      ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8));
+  Value *AlignShadow;
+  if (ClPreserveAlignment) {
+    AlignShadow = IRB.CreateMul(I.getAlignmentCst(),
+                                ConstantInt::get(I.getAlignmentCst()->getType(),
+                                                 DFSF.DFS.ShadowWidth / 8));
+  } else {
+    AlignShadow = ConstantInt::get(I.getAlignmentCst()->getType(),
+                                   DFSF.DFS.ShadowWidth / 8);
+  }
+  Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
+  DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr);
+  SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
+  IRB.CreateCall5(I.getCalledValue(), DestShadow, SrcShadow, LenShadow,
+                  AlignShadow, I.getVolatileCst());
+}
+
+void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
+  if (!DFSF.IsNativeABI && RI.getReturnValue()) {
+    switch (DFSF.IA) {
+    case DataFlowSanitizer::IA_TLS: {
+      Value *S = DFSF.getShadow(RI.getReturnValue());
+      IRBuilder<> IRB(&RI);
+      IRB.CreateStore(S, DFSF.getRetvalTLS());
+      break;
+    }
+    case DataFlowSanitizer::IA_Args: {
+      IRBuilder<> IRB(&RI);
+      Type *RT = DFSF.F->getFunctionType()->getReturnType();
+      Value *InsVal =
+          IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0);
+      Value *InsShadow =
+          IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1);
+      RI.setOperand(0, InsShadow);
+      break;
+    }
+    }
+  }
+}
+
+void DFSanVisitor::visitCallSite(CallSite CS) {
+  Function *F = CS.getCalledFunction();
+  if ((F && F->isIntrinsic()) || isa<InlineAsm>(CS.getCalledValue())) {
+    visitOperandShadowInst(*CS.getInstruction());
+    return;
+  }
+
+  IRBuilder<> IRB(CS.getInstruction());
+
+  DenseMap<Value *, Function *>::iterator i =
+      DFSF.DFS.UnwrappedFnMap.find(CS.getCalledValue());
+  if (i != DFSF.DFS.UnwrappedFnMap.end()) {
+    Function *F = i->second;
+    switch (DFSF.DFS.getWrapperKind(F)) {
+    case DataFlowSanitizer::WK_Warning: {
+      CS.setCalledFunction(F);
+      IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn,
+                     IRB.CreateGlobalStringPtr(F->getName()));
+      DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
+      return;
+    }
+    case DataFlowSanitizer::WK_Discard: {
+      CS.setCalledFunction(F);
+      DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
+      return;
+    }
+    case DataFlowSanitizer::WK_Functional: {
+      CS.setCalledFunction(F);
+      visitOperandShadowInst(*CS.getInstruction());
+      return;
+    }
+    case DataFlowSanitizer::WK_Custom: {
+      // Don't try to handle invokes of custom functions, it's too complicated.
+      // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_
+      // wrapper.
+      if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
+        FunctionType *FT = F->getFunctionType();
+        FunctionType *CustomFT = DFSF.DFS.getCustomFunctionType(FT);
+        std::string CustomFName = "__dfsw_";
+        CustomFName += F->getName();
+        Constant *CustomF =
+            DFSF.DFS.Mod->getOrInsertFunction(CustomFName, CustomFT);
+        if (Function *CustomFn = dyn_cast<Function>(CustomF)) {
+          CustomFn->copyAttributesFrom(F);
+
+          // Custom functions returning non-void will write to the return label.
+          if (!FT->getReturnType()->isVoidTy()) {
+            CustomFn->removeAttributes(AttributeSet::FunctionIndex,
+                                       DFSF.DFS.ReadOnlyNoneAttrs);
+          }
+        }
+
+        std::vector<Value *> Args;
+
+        CallSite::arg_iterator i = CS.arg_begin();
+        for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) {
+          Type *T = (*i)->getType();
+          FunctionType *ParamFT;
+          if (isa<PointerType>(T) &&
+              (ParamFT = dyn_cast<FunctionType>(
+                   cast<PointerType>(T)->getElementType()))) {
+            std::string TName = "dfst";
+            TName += utostr(FT->getNumParams() - n);
+            TName += "$";
+            TName += F->getName();
+            Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName);
+            Args.push_back(T);
+            Args.push_back(
+                IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx)));
+          } else {
+            Args.push_back(*i);
+          }
+        }
+
+        i = CS.arg_begin();
+        for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+          Args.push_back(DFSF.getShadow(*i));
+
+        if (!FT->getReturnType()->isVoidTy()) {
+          if (!DFSF.LabelReturnAlloca) {
+            DFSF.LabelReturnAlloca =
+                new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn",
+                               DFSF.F->getEntryBlock().begin());
+          }
+          Args.push_back(DFSF.LabelReturnAlloca);
+        }
+
+        CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
+        CustomCI->setCallingConv(CI->getCallingConv());
+        CustomCI->setAttributes(CI->getAttributes());
+
+        if (!FT->getReturnType()->isVoidTy()) {
+          LoadInst *LabelLoad = IRB.CreateLoad(DFSF.LabelReturnAlloca);
+          DFSF.setShadow(CustomCI, LabelLoad);
+        }
+
+        CI->replaceAllUsesWith(CustomCI);
+        CI->eraseFromParent();
+        return;
+      }
+      break;
+    }
+    }
+  }
+
+  FunctionType *FT = cast<FunctionType>(
+      CS.getCalledValue()->getType()->getPointerElementType());
+  if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+    for (unsigned i = 0, n = FT->getNumParams(); i != n; ++i) {
+      IRB.CreateStore(DFSF.getShadow(CS.getArgument(i)),
+                      DFSF.getArgTLS(i, CS.getInstruction()));
+    }
+  }
+
+  Instruction *Next = 0;
+  if (!CS.getType()->isVoidTy()) {
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      if (II->getNormalDest()->getSinglePredecessor()) {
+        Next = II->getNormalDest()->begin();
+      } else {
+        BasicBlock *NewBB =
+            SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DFS);
+        Next = NewBB->begin();
+      }
+    } else {
+      Next = CS->getNextNode();
+    }
+
+    if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+      IRBuilder<> NextIRB(Next);
+      LoadInst *LI = NextIRB.CreateLoad(DFSF.getRetvalTLS());
+      DFSF.SkipInsts.insert(LI);
+      DFSF.setShadow(CS.getInstruction(), LI);
+      DFSF.NonZeroChecks.insert(LI);
+    }
+  }
+
+  // Do all instrumentation for IA_Args down here to defer tampering with the
+  // CFG in a way that SplitEdge may be able to detect.
+  if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
+    FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
+    Value *Func =
+        IRB.CreateBitCast(CS.getCalledValue(), PointerType::getUnqual(NewFT));
+    std::vector<Value *> Args;
+
+    CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+    for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+      Args.push_back(*i);
+
+    i = CS.arg_begin();
+    for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+      Args.push_back(DFSF.getShadow(*i));
+
+    if (FT->isVarArg()) {
+      unsigned VarArgSize = CS.arg_size() - FT->getNumParams();
+      ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);
+      AllocaInst *VarArgShadow =
+          new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin());
+      Args.push_back(IRB.CreateConstGEP2_32(VarArgShadow, 0, 0));
+      for (unsigned n = 0; i != e; ++i, ++n) {
+        IRB.CreateStore(DFSF.getShadow(*i),
+                        IRB.CreateConstGEP2_32(VarArgShadow, 0, n));
+        Args.push_back(*i);
+      }
+    }
+
+    CallSite NewCS;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      NewCS = IRB.CreateInvoke(Func, II->getNormalDest(), II->getUnwindDest(),
+                               Args);
+    } else {
+      NewCS = IRB.CreateCall(Func, Args);
+    }
+    NewCS.setCallingConv(CS.getCallingConv());
+    NewCS.setAttributes(CS.getAttributes().removeAttributes(
+        *DFSF.DFS.Ctx, AttributeSet::ReturnIndex,
+        AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType(),
+                                         AttributeSet::ReturnIndex)));
+
+    if (Next) {
+      ExtractValueInst *ExVal =
+          ExtractValueInst::Create(NewCS.getInstruction(), 0, "", Next);
+      DFSF.SkipInsts.insert(ExVal);
+      ExtractValueInst *ExShadow =
+          ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next);
+      DFSF.SkipInsts.insert(ExShadow);
+      DFSF.setShadow(ExVal, ExShadow);
+      DFSF.NonZeroChecks.insert(ExShadow);
+
+      CS.getInstruction()->replaceAllUsesWith(ExVal);
+    }
+
+    CS.getInstruction()->eraseFromParent();
+  }
+}
+
+void DFSanVisitor::visitPHINode(PHINode &PN) {
+  PHINode *ShadowPN =
+      PHINode::Create(DFSF.DFS.ShadowTy, PN.getNumIncomingValues(), "", &PN);
+
+  // Give the shadow phi node valid predecessors to fool SplitEdge into working.
+  Value *UndefShadow = UndefValue::get(DFSF.DFS.ShadowTy);
+  for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
+       ++i) {
+    ShadowPN->addIncoming(UndefShadow, *i);
+  }
+
+  DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
+  DFSF.setShadow(&PN, ShadowPN);
+}
diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp
index 651381d..f50a044 100644
--- a/lib/Transforms/Instrumentation/DebugIR.cpp
+++ b/lib/Transforms/Instrumentation/DebugIR.cpp
@@ -25,6 +25,7 @@
 #include "llvm/InstVisitor.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Utils/Cloning.h"
@@ -401,7 +402,7 @@ private:
       Type *PointeeTy = T->getPointerElementType();
       if (!(N = getType(PointeeTy)))
         N = Builder.createPointerType(
-            getOrCreateType(PointeeTy), Layout.getPointerSizeInBits(),
+            getOrCreateType(PointeeTy), Layout.getPointerTypeSizeInBits(T),
             Layout.getPrefTypeAlignment(T), getTypeName(T));
     } else if (T->isArrayTy()) {
       SmallVector<Value *, 1> Subrange;
diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
deleted file mode 100644
index a2459fb..0000000
--- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//===- EdgeProfiling.cpp - Insert counters for edge profiling -------------===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments the specified program with counters for edge profiling.
-// Edge profiling can give a reasonable approximation of the hot paths through a
-// program, and is used for a wide variety of program transformations.
-//
-// Note that this implementation is very naive.  We insert a counter for *every*
-// edge in the program, instead of using control flow information to prune the
-// number of counters inserted.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "insert-edge-profiling"
-
-#include "llvm/Transforms/Instrumentation.h"
-#include "ProfilingUtils.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <set>
-using namespace llvm;
-
-STATISTIC(NumEdgesInserted, "The # of edges inserted.");
-
-namespace {
-  class EdgeProfiler : public ModulePass {
-    bool runOnModule(Module &M);
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    EdgeProfiler() : ModulePass(ID) {
-      initializeEdgeProfilerPass(*PassRegistry::getPassRegistry());
-    }
-
-    virtual const char *getPassName() const {
-      return "Edge Profiler";
-    }
-  };
-}
-
-char EdgeProfiler::ID = 0;
-INITIALIZE_PASS(EdgeProfiler, "insert-edge-profiling",
-                "Insert instrumentation for edge profiling", false, false)
-
-ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); }
-
-bool EdgeProfiler::runOnModule(Module &M) {
-  Function *Main = M.getFunction("main");
-  if (Main == 0) {
-    errs() << "WARNING: cannot insert edge profiling into a module"
-           << " with no main function!\n";
-    return false;  // No main, no instrumentation!
-  }
-
-  std::set<BasicBlock*> BlocksToInstrument;
-  unsigned NumEdges = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    // Reserve space for (0,entry) edge.
-    ++NumEdges;
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      // Keep track of which blocks need to be instrumented.  We don't want to
-      // instrument blocks that are added as the result of breaking critical
-      // edges!
-      BlocksToInstrument.insert(BB);
-      NumEdges += BB->getTerminator()->getNumSuccessors();
-    }
-  }
-
-  Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges);
-  GlobalVariable *Counters =
-    new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
-                       Constant::getNullValue(ATy), "EdgeProfCounters");
-  NumEdgesInserted = NumEdges;
-
-  // Instrument all of the edges...
-  unsigned i = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    // Create counter for (0,entry) edge.
-    IncrementCounterInBlock(&F->getEntryBlock(), i++, Counters);
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
-      if (BlocksToInstrument.count(BB)) {  // Don't instrument inserted blocks
-        // Okay, we have to add a counter of each outgoing edge.  If the
-        // outgoing edge is not critical don't split it, just insert the counter
-        // in the source or destination of the edge.
-        TerminatorInst *TI = BB->getTerminator();
-        for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-          // If the edge is critical, split it.
-          SplitCriticalEdge(TI, s, this);
-
-          // Okay, we are guaranteed that the edge is no longer critical.  If we
-          // only have a single successor, insert the counter in this block,
-          // otherwise insert it in the successor block.
-          if (TI->getNumSuccessors() == 1) {
-            // Insert counter at the start of the block
-            IncrementCounterInBlock(BB, i++, Counters, false);
-          } else {
-            // Insert counter at the start of the block
-            IncrementCounterInBlock(TI->getSuccessor(s), i++, Counters);
-          }
-        }
-      }
-  }
-
-  // Add the initialization call to main.
-  InsertProfilingInitCall(Main, "llvm_start_edge_profiling", Counters);
-  return true;
-}
-
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 4c2681f..206bffb 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -17,7 +17,6 @@
 #define DEBUG_TYPE "insert-gcov-profiling"
 
 #include "llvm/Transforms/Instrumentation.h"
-#include "ProfilingUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
@@ -103,6 +102,7 @@ namespace {
     Constant *getIncrementIndirectCounterFunc();
     Constant *getEmitFunctionFunc();
     Constant *getEmitArcsFunc();
+    Constant *getSummaryInfoFunc();
     Constant *getDeleteWriteoutFunctionListFunc();
     Constant *getDeleteFlushFunctionListFunc();
     Constant *getEndFileFunc();
@@ -519,15 +519,15 @@ bool GCOVProfiler::emitProfileArcs() {
         TerminatorInst *TI = BB->getTerminator();
         int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
         if (Successors) {
-          IRBuilder<> Builder(TI);
-          
           if (Successors == 1) {
+            IRBuilder<> Builder(BB->getFirstInsertionPt());
             Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
                                                                 Edge);
             Value *Count = Builder.CreateLoad(Counter);
             Count = Builder.CreateAdd(Count, Builder.getInt64(1));
             Builder.CreateStore(Count, Counter);
           } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+            IRBuilder<> Builder(BI);
             Value *Sel = Builder.CreateSelect(BI->getCondition(),
                                               Builder.getInt64(Edge),
                                               Builder.getInt64(Edge + 1));
@@ -543,6 +543,7 @@ bool GCOVProfiler::emitProfileArcs() {
             for (int i = 0; i != Successors; ++i)
               ComplexEdgeSuccs.insert(TI->getSuccessor(i));
           }
+
           Edge += Successors;
         }
       }
@@ -554,14 +555,13 @@ bool GCOVProfiler::emitProfileArcs() {
         GlobalVariable *EdgeState = getEdgeStateValue();
         
         for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) {
-          IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator());
+          IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt());
           Builder.CreateStore(Builder.getInt32(i), EdgeState);
         }
+
         for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {
-          // call runtime to perform increment
-          BasicBlock::iterator InsertPt =
-            ComplexEdgeSuccs[i+1]->getFirstInsertionPt();
-          IRBuilder<> Builder(InsertPt);
+          // Call runtime to perform increment.
+          IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstInsertionPt());
           Value *CounterPtrArray =
             Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0,
                                                i * ComplexEdgePreds.size());
@@ -599,7 +599,7 @@ bool GCOVProfiler::emitProfileArcs() {
     };
     FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
 
-    // Inialize the environment and register the local writeout and flush
+    // Initialize the environment and register the local writeout and flush
     // functions.
     Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
     Builder.CreateCall2(GCOVInit, WriteoutF, FlushF);
@@ -701,6 +701,11 @@ Constant *GCOVProfiler::getEmitArcsFunc() {
   return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
 }
 
+Constant *GCOVProfiler::getSummaryInfoFunc() {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
+}
+
 Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() {
   FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
   return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy);
@@ -747,6 +752,7 @@ Function *GCOVProfiler::insertCounterWriteout(
   Constant *StartFile = getStartFileFunc();
   Constant *EmitFunction = getEmitFunctionFunc();
   Constant *EmitArcs = getEmitArcsFunc();
+  Constant *SummaryInfo = getSummaryInfoFunc();
   Constant *EndFile = getEndFileFunc();
 
   NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
@@ -773,6 +779,7 @@ Function *GCOVProfiler::insertCounterWriteout(
                             Builder.getInt32(Arcs),
                             Builder.CreateConstGEP2_64(GV, 0, 0));
       }
+      Builder.CreateCall(SummaryInfo);
       Builder.CreateCall(EndFile);
     }
   }
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 9f35396..b1bea38 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -24,12 +24,10 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeAddressSanitizerPass(Registry);
   initializeAddressSanitizerModulePass(Registry);
   initializeBoundsCheckingPass(Registry);
-  initializeEdgeProfilerPass(Registry);
   initializeGCOVProfilerPass(Registry);
-  initializeOptimalEdgeProfilerPass(Registry);
-  initializePathProfilerPass(Registry);
   initializeMemorySanitizerPass(Registry);
   initializeThreadSanitizerPass(Registry);
+  initializeDataFlowSanitizerPass(Registry);
 }
 
 /// LLVMInitializeInstrumentation - C binding for
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 0251f16..d547adc 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -66,6 +66,31 @@
 /// avoids storing origin to memory when a fully initialized value is stored.
 /// This way it avoids needless overwritting origin of the 4-byte region on
 /// a short (i.e. 1 byte) clean store, and it is also good for performance.
+///
+///                            Atomic handling.
+///
+/// Ideally, every atomic store of application value should update the
+/// corresponding shadow location in an atomic way. Unfortunately, atomic store
+/// of two disjoint locations can not be done without severe slowdown.
+///
+/// Therefore, we implement an approximation that may err on the safe side.
+/// In this implementation, every atomically accessed location in the program
+/// may only change from (partially) uninitialized to fully initialized, but
+/// not the other way around. We load the shadow _after_ the application load,
+/// and we store the shadow _before_ the app store. Also, we always store clean
+/// shadow (if the application store is atomic). This way, if the store-load
+/// pair constitutes a happens-before arc, shadow store and load are correctly
+/// ordered such that the load will get either the value that was stored, or
+/// some later value (which is always clean).
+///
+/// This does not work very well with Compare-And-Swap (CAS) and
+/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
+/// must store the new shadow before the app operation, and load the shadow
+/// after the app operation. Computers don't work this way. Current
+/// implementation ignores the load aspect of CAS/RMW, always returning a clean
+/// value. It implements the store part as a simple atomic store by storing a
+/// clean shadow.
+
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "msan"
@@ -157,6 +182,18 @@ static cl::opt<std::string>  ClBlacklistFile("msan-blacklist",
        cl::desc("File containing the list of functions where MemorySanitizer "
                 "should not report bugs"), cl::Hidden);
 
+// Experimental. Wraps all indirect calls in the instrumented code with
+// a call to the given function. This is needed to assist the dynamic
+// helper tool (MSanDR) to regain control on transition between instrumented and
+// non-instrumented code.
+static cl::opt<std::string> ClWrapIndirectCalls("msan-wrap-indirect-calls",
+       cl::desc("Wrap indirect calls with a given function"),
+       cl::Hidden);
+
+static cl::opt<bool> ClWrapIndirectCallsFast("msan-wrap-indirect-calls-fast",
+       cl::desc("Do not wrap indirect calls with target in the same module"),
+       cl::Hidden, cl::init(true));
+
 namespace {
 
 /// \brief An instrumentation pass implementing detection of uninitialized
@@ -168,12 +205,12 @@ class MemorySanitizer : public FunctionPass {
  public:
   MemorySanitizer(bool TrackOrigins = false,
                   StringRef BlacklistFile = StringRef())
-    : FunctionPass(ID),
-      TrackOrigins(TrackOrigins || ClTrackOrigins),
-      TD(0),
-      WarningFn(0),
-      BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
-                                          : BlacklistFile) { }
+      : FunctionPass(ID),
+        TrackOrigins(TrackOrigins || ClTrackOrigins),
+        TD(0),
+        WarningFn(0),
+        BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile : BlacklistFile),
+        WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {}
   const char *getPassName() const { return "MemorySanitizer"; }
   bool runOnFunction(Function &F);
   bool doInitialization(Module &M);
@@ -207,13 +244,16 @@ class MemorySanitizer : public FunctionPass {
   /// function.
   GlobalVariable *OriginTLS;
 
+  GlobalVariable *MsandrModuleStart;
+  GlobalVariable *MsandrModuleEnd;
+
   /// \brief The run-time callback to print a warning.
   Value *WarningFn;
   /// \brief Run-time helper that copies origin info for a memory range.
   Value *MsanCopyOriginFn;
   /// \brief Run-time helper that generates a new origin value for a stack
   /// allocation.
-  Value *MsanSetAllocaOriginFn;
+  Value *MsanSetAllocaOrigin4Fn;
   /// \brief Run-time helper that poisons stack on function entry.
   Value *MsanPoisonStackFn;
   /// \brief MSan runtime replacements for memmove, memcpy and memset.
@@ -236,6 +276,12 @@ class MemorySanitizer : public FunctionPass {
   /// \brief An empty volatile inline asm that prevents callback merge.
   InlineAsm *EmptyAsm;
 
+  bool WrapIndirectCalls;
+  /// \brief Run-time wrapper for indirect calls.
+  Value *IndirectCallWrapperFn;
+  // Argument and return type of IndirectCallWrapperFn: void (*f)(void).
+  Type *AnyFunctionPtrTy;
+
   friend struct MemorySanitizerVisitor;
   friend struct VarArgAMD64Helper;
 };
@@ -281,9 +327,9 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
   MsanCopyOriginFn = M.getOrInsertFunction(
     "__msan_copy_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(),
     IRB.getInt8PtrTy(), IntptrTy, NULL);
-  MsanSetAllocaOriginFn = M.getOrInsertFunction(
-    "__msan_set_alloca_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
-    IRB.getInt8PtrTy(), NULL);
+  MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
+    "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
+    IRB.getInt8PtrTy(), IntptrTy, NULL);
   MsanPoisonStackFn = M.getOrInsertFunction(
     "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL);
   MemmoveFn = M.getOrInsertFunction(
@@ -329,6 +375,24 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
                             StringRef(""), StringRef(""),
                             /*hasSideEffects=*/true);
+
+  if (WrapIndirectCalls) {
+    AnyFunctionPtrTy =
+        PointerType::getUnqual(FunctionType::get(IRB.getVoidTy(), false));
+    IndirectCallWrapperFn = M.getOrInsertFunction(
+        ClWrapIndirectCalls, AnyFunctionPtrTy, AnyFunctionPtrTy, NULL);
+  }
+
+  if (ClWrapIndirectCallsFast) {
+    MsandrModuleStart = new GlobalVariable(
+        M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage,
+        0, "__executable_start");
+    MsandrModuleStart->setVisibility(GlobalVariable::HiddenVisibility);
+    MsandrModuleEnd = new GlobalVariable(
+        M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage,
+        0, "_end");
+    MsandrModuleEnd->setVisibility(GlobalVariable::HiddenVisibility);
+  }
 }
 
 /// \brief Module-level initialization.
@@ -338,7 +402,7 @@ bool MemorySanitizer::doInitialization(Module &M) {
   TD = getAnalysisIfAvailable<DataLayout>();
   if (!TD)
     return false;
-  BL.reset(new SpecialCaseList(BlacklistFile));
+  BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
   C = &(M.getContext());
   unsigned PtrSize = TD->getPointerSizeInBits(/* AddressSpace */0);
   switch (PtrSize) {
@@ -423,22 +487,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   MemorySanitizer &MS;
   SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
   ValueMap<Value*, Value*> ShadowMap, OriginMap;
+  OwningPtr<VarArgHelper> VAHelper;
+
+  // The following flags disable parts of MSan instrumentation based on
+  // blacklist contents and command-line options.
   bool InsertChecks;
   bool LoadShadow;
   bool PoisonStack;
   bool PoisonUndef;
-  OwningPtr<VarArgHelper> VAHelper;
+  bool CheckReturnValue;
 
   struct ShadowOriginAndInsertPoint {
-    Instruction *Shadow;
-    Instruction *Origin;
+    Value *Shadow;
+    Value *Origin;
     Instruction *OrigIns;
-    ShadowOriginAndInsertPoint(Instruction *S, Instruction *O, Instruction *I)
+    ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I)
       : Shadow(S), Origin(O), OrigIns(I) { }
     ShadowOriginAndInsertPoint() : Shadow(0), Origin(0), OrigIns(0) { }
   };
   SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
   SmallVector<Instruction*, 16> StoreList;
+  SmallVector<CallSite, 16> IndirectCallList;
 
   MemorySanitizerVisitor(Function &F, MemorySanitizer &MS)
       : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) {
@@ -449,6 +518,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     LoadShadow = SanitizeFunction;
     PoisonStack = SanitizeFunction && ClPoisonStack;
     PoisonUndef = SanitizeFunction && ClPoisonUndef;
+    // FIXME: Consider using SpecialCaseList to specify a list of functions that
+    // must always return fully initialized values. For now, we hardcode "main".
+    CheckReturnValue = SanitizeFunction && (F.getName() == "main");
 
     DEBUG(if (!InsertChecks)
           dbgs() << "MemorySanitizer is not inserting checks into '"
@@ -462,7 +534,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       IRBuilder<> IRB(&I);
       Value *Val = I.getValueOperand();
       Value *Addr = I.getPointerOperand();
-      Value *Shadow = getShadow(Val);
+      Value *Shadow = I.isAtomic() ? getCleanShadow(Val) : getShadow(Val);
       Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
 
       StoreInst *NewSI =
@@ -471,7 +543,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       (void)NewSI;
 
       if (ClCheckAccessAddress)
-        insertCheck(Addr, &I);
+        insertShadowCheck(Addr, &I);
+
+      if (I.isAtomic())
+        I.setOrdering(addReleaseOrdering(I.getOrdering()));
 
       if (MS.TrackOrigins) {
         unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment());
@@ -481,11 +556,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         } else {
           Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
 
-          Constant *Cst = dyn_cast_or_null<Constant>(ConvertedShadow);
           // TODO(eugenis): handle non-zero constant shadow by inserting an
           // unconditional check (can not simply fail compilation as this could
           // be in the dead code).
-          if (Cst)
+          if (isa<Constant>(ConvertedShadow))
             continue;
 
           Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
@@ -503,12 +577,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   void materializeChecks() {
     for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) {
-      Instruction *Shadow = InstrumentationList[i].Shadow;
+      Value *Shadow = InstrumentationList[i].Shadow;
       Instruction *OrigIns = InstrumentationList[i].OrigIns;
       IRBuilder<> IRB(OrigIns);
       DEBUG(dbgs() << "  SHAD0 : " << *Shadow << "\n");
       Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
       DEBUG(dbgs() << "  SHAD1 : " << *ConvertedShadow << "\n");
+      // See the comment in materializeStores().
+      if (isa<Constant>(ConvertedShadow))
+        continue;
       Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
                                     getCleanShadow(ConvertedShadow), "_mscmp");
       Instruction *CheckTerm =
@@ -518,7 +595,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
       IRB.SetInsertPoint(CheckTerm);
       if (MS.TrackOrigins) {
-        Instruction *Origin = InstrumentationList[i].Origin;
+        Value *Origin = InstrumentationList[i].Origin;
         IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0),
                         MS.OriginTLS);
       }
@@ -530,6 +607,48 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     DEBUG(dbgs() << "DONE:\n" << F);
   }
 
+  void materializeIndirectCalls() {
+    for (size_t i = 0, n = IndirectCallList.size(); i < n; i++) {
+      CallSite CS = IndirectCallList[i];
+      Instruction *I = CS.getInstruction();
+      BasicBlock *B = I->getParent();
+      IRBuilder<> IRB(I);
+      Value *Fn0 = CS.getCalledValue();
+      Value *Fn = IRB.CreateBitCast(Fn0, MS.AnyFunctionPtrTy);
+
+      if (ClWrapIndirectCallsFast) {
+        // Check that call target is inside this module limits.
+        Value *Start =
+            IRB.CreateBitCast(MS.MsandrModuleStart, MS.AnyFunctionPtrTy);
+        Value *End = IRB.CreateBitCast(MS.MsandrModuleEnd, MS.AnyFunctionPtrTy);
+
+        Value *NotInThisModule = IRB.CreateOr(IRB.CreateICmpULT(Fn, Start),
+                                              IRB.CreateICmpUGE(Fn, End));
+
+        PHINode *NewFnPhi =
+            IRB.CreatePHI(Fn0->getType(), 2, "msandr.indirect_target");
+
+        Instruction *CheckTerm = SplitBlockAndInsertIfThen(
+            cast<Instruction>(NotInThisModule),
+            /* Unreachable */ false, MS.ColdCallWeights);
+
+        IRB.SetInsertPoint(CheckTerm);
+        // Slow path: call wrapper function to possibly transform the call
+        // target.
+        Value *NewFn = IRB.CreateBitCast(
+            IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType());
+
+        NewFnPhi->addIncoming(Fn0, B);
+        NewFnPhi->addIncoming(NewFn, dyn_cast<Instruction>(NewFn)->getParent());
+        CS.setCalledFunction(NewFnPhi);
+      } else {
+        Value *NewFn = IRB.CreateBitCast(
+            IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType());
+        CS.setCalledFunction(NewFn);
+      }
+    }
+  }
+
   /// \brief Add MemorySanitizer instrumentation to a function.
   bool runOnFunction() {
     MS.initializeCallbacks(*F.getParent());
@@ -572,6 +691,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // Insert shadow value checks.
     materializeChecks();
 
+    // Wrap indirect calls.
+    materializeIndirectCalls();
+
     return true;
   }
 
@@ -835,20 +957,63 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   /// \brief Remember the place where a shadow check should be inserted.
   ///
   /// This location will be later instrumented with a check that will print a
-  /// UMR warning in runtime if the value is not fully defined.
-  void insertCheck(Value *Val, Instruction *OrigIns) {
-    assert(Val);
+  /// UMR warning in runtime if the shadow value is not 0.
+  void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) {
+    assert(Shadow);
     if (!InsertChecks) return;
-    Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
-    if (!Shadow) return;
 #ifndef NDEBUG
     Type *ShadowTy = Shadow->getType();
     assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) &&
            "Can only insert checks for integer and vector shadow types");
 #endif
-    Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
     InstrumentationList.push_back(
-      ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+        ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+  }
+
+  /// \brief Remember the place where a shadow check should be inserted.
+  ///
+  /// This location will be later instrumented with a check that will print a
+  /// UMR warning in runtime if the value is not fully defined.
+  void insertShadowCheck(Value *Val, Instruction *OrigIns) {
+    assert(Val);
+    Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+    if (!Shadow) return;
+    Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+    insertShadowCheck(Shadow, Origin, OrigIns);
+  }
+
+  AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
+    switch (a) {
+      case NotAtomic:
+        return NotAtomic;
+      case Unordered:
+      case Monotonic:
+      case Release:
+        return Release;
+      case Acquire:
+      case AcquireRelease:
+        return AcquireRelease;
+      case SequentiallyConsistent:
+        return SequentiallyConsistent;
+    }
+    llvm_unreachable("Unknown ordering");
+  }
+
+  AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
+    switch (a) {
+      case NotAtomic:
+        return NotAtomic;
+      case Unordered:
+      case Monotonic:
+      case Acquire:
+        return Acquire;
+      case Release:
+      case AcquireRelease:
+        return AcquireRelease;
+      case SequentiallyConsistent:
+        return SequentiallyConsistent;
+    }
+    llvm_unreachable("Unknown ordering");
   }
 
   // ------------------- Visitors.
@@ -859,7 +1024,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   /// Optionally, checks that the load address is fully defined.
   void visitLoadInst(LoadInst &I) {
     assert(I.getType()->isSized() && "Load type must have size");
-    IRBuilder<> IRB(&I);
+    IRBuilder<> IRB(I.getNextNode());
     Type *ShadowTy = getShadowTy(&I);
     Value *Addr = I.getPointerOperand();
     if (LoadShadow) {
@@ -871,7 +1036,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     }
 
     if (ClCheckAccessAddress)
-      insertCheck(I.getPointerOperand(), &I);
+      insertShadowCheck(I.getPointerOperand(), &I);
+
+    if (I.isAtomic())
+      I.setOrdering(addAcquireOrdering(I.getOrdering()));
 
     if (MS.TrackOrigins) {
       if (LoadShadow) {
@@ -892,9 +1060,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     StoreList.push_back(&I);
   }
 
+  void handleCASOrRMW(Instruction &I) {
+    assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I));
+
+    IRBuilder<> IRB(&I);
+    Value *Addr = I.getOperand(0);
+    Value *ShadowPtr = getShadowPtr(Addr, I.getType(), IRB);
+
+    if (ClCheckAccessAddress)
+      insertShadowCheck(Addr, &I);
+
+    // Only test the conditional argument of cmpxchg instruction.
+    // The other argument can potentially be uninitialized, but we can not
+    // detect this situation reliably without possible false positives.
+    if (isa<AtomicCmpXchgInst>(I))
+      insertShadowCheck(I.getOperand(1), &I);
+
+    IRB.CreateStore(getCleanShadow(&I), ShadowPtr);
+
+    setShadow(&I, getCleanShadow(&I));
+  }
+
+  void visitAtomicRMWInst(AtomicRMWInst &I) {
+    handleCASOrRMW(I);
+    I.setOrdering(addReleaseOrdering(I.getOrdering()));
+  }
+
+  void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+    handleCASOrRMW(I);
+    I.setOrdering(addReleaseOrdering(I.getOrdering()));
+  }
+
   // Vector manipulation.
   void visitExtractElementInst(ExtractElementInst &I) {
-    insertCheck(I.getOperand(1), &I);
+    insertShadowCheck(I.getOperand(1), &I);
     IRBuilder<> IRB(&I);
     setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
               "_msprop"));
@@ -902,7 +1101,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   }
 
   void visitInsertElementInst(InsertElementInst &I) {
-    insertCheck(I.getOperand(2), &I);
+    insertShadowCheck(I.getOperand(2), &I);
     IRBuilder<> IRB(&I);
     setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
               I.getOperand(2), "_msprop"));
@@ -910,7 +1109,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   }
 
   void visitShuffleVectorInst(ShuffleVectorInst &I) {
-    insertCheck(I.getOperand(2), &I);
+    insertShadowCheck(I.getOperand(2), &I);
     IRBuilder<> IRB(&I);
     setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
               I.getOperand(2), "_msprop"));
@@ -1109,18 +1308,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   /// \brief Cast between two shadow types, extending or truncating as
   /// necessary.
-  Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy) {
+  Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
+                          bool Signed = false) {
     Type *srcTy = V->getType();
     if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
-      return IRB.CreateIntCast(V, dstTy, false);
+      return IRB.CreateIntCast(V, dstTy, Signed);
     if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
         dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
-      return IRB.CreateIntCast(V, dstTy, false);
+      return IRB.CreateIntCast(V, dstTy, Signed);
     size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
     size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
     Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
     Value *V2 =
-      IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), false);
+      IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
     return IRB.CreateBitCast(V2, dstTy);
     // TODO: handle struct types.
   }
@@ -1145,7 +1345,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   void handleDiv(Instruction &I) {
     IRBuilder<> IRB(&I);
     // Strict on the second argument.
-    insertCheck(I.getOperand(1), &I);
+    insertShadowCheck(I.getOperand(1), &I);
     setShadow(&I, getShadow(&I, 0));
     setOrigin(&I, getOrigin(&I, 0));
   }
@@ -1428,7 +1628,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
 
     if (ClCheckAccessAddress)
-      insertCheck(Addr, &I);
+      insertShadowCheck(Addr, &I);
 
     // FIXME: use ClStoreCleanOrigin
     // FIXME: factor out common code from materializeStores
@@ -1455,9 +1655,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       setShadow(&I, getCleanShadow(&I));
     }
 
-
     if (ClCheckAccessAddress)
-      insertCheck(Addr, &I);
+      insertShadowCheck(Addr, &I);
 
     if (MS.TrackOrigins) {
       if (LoadShadow)
@@ -1554,11 +1753,119 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOrigin(&I, getOrigin(Op));
   }
 
+  // \brief Instrument vector convert instrinsic.
+  //
+  // This function instruments intrinsics like cvtsi2ss:
+  // %Out = int_xxx_cvtyyy(%ConvertOp)
+  // or
+  // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
+  // Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
+  // number \p Out elements, and (if has 2 arguments) copies the rest of the
+  // elements from \p CopyOp.
+  // In most cases conversion involves floating-point value which may trigger a
+  // hardware exception when not fully initialized. For this reason we require
+  // \p ConvertOp[0:NumUsedElements] to be fully initialized and trap otherwise.
+  // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
+  // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
+  // return a fully initialized value.
+  void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements) {
+    IRBuilder<> IRB(&I);
+    Value *CopyOp, *ConvertOp;
+
+    switch (I.getNumArgOperands()) {
+    case 2:
+      CopyOp = I.getArgOperand(0);
+      ConvertOp = I.getArgOperand(1);
+      break;
+    case 1:
+      ConvertOp = I.getArgOperand(0);
+      CopyOp = NULL;
+      break;
+    default:
+      llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
+    }
+
+    // The first *NumUsedElements* elements of ConvertOp are converted to the
+    // same number of output elements. The rest of the output is copied from
+    // CopyOp, or (if not available) filled with zeroes.
+    // Combine shadow for elements of ConvertOp that are used in this operation,
+    // and insert a check.
+    // FIXME: consider propagating shadow of ConvertOp, at least in the case of
+    // int->any conversion.
+    Value *ConvertShadow = getShadow(ConvertOp);
+    Value *AggShadow = 0;
+    if (ConvertOp->getType()->isVectorTy()) {
+      AggShadow = IRB.CreateExtractElement(
+          ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+      for (int i = 1; i < NumUsedElements; ++i) {
+        Value *MoreShadow = IRB.CreateExtractElement(
+            ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+        AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
+      }
+    } else {
+      AggShadow = ConvertShadow;
+    }
+    assert(AggShadow->getType()->isIntegerTy());
+    insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
+
+    // Build result shadow by zero-filling parts of CopyOp shadow that come from
+    // ConvertOp.
+    if (CopyOp) {
+      assert(CopyOp->getType() == I.getType());
+      assert(CopyOp->getType()->isVectorTy());
+      Value *ResultShadow = getShadow(CopyOp);
+      Type *EltTy = ResultShadow->getType()->getVectorElementType();
+      for (int i = 0; i < NumUsedElements; ++i) {
+        ResultShadow = IRB.CreateInsertElement(
+            ResultShadow, ConstantInt::getNullValue(EltTy),
+            ConstantInt::get(IRB.getInt32Ty(), i));
+      }
+      setShadow(&I, ResultShadow);
+      setOrigin(&I, getOrigin(CopyOp));
+    } else {
+      setShadow(&I, getCleanShadow(&I));
+    }
+  }
+
   void visitIntrinsicInst(IntrinsicInst &I) {
     switch (I.getIntrinsicID()) {
     case llvm::Intrinsic::bswap:
       handleBswap(I);
       break;
+    case llvm::Intrinsic::x86_avx512_cvtsd2usi64:
+    case llvm::Intrinsic::x86_avx512_cvtsd2usi:
+    case llvm::Intrinsic::x86_avx512_cvtss2usi64:
+    case llvm::Intrinsic::x86_avx512_cvtss2usi:
+    case llvm::Intrinsic::x86_avx512_cvttss2usi64:
+    case llvm::Intrinsic::x86_avx512_cvttss2usi:
+    case llvm::Intrinsic::x86_avx512_cvttsd2usi64:
+    case llvm::Intrinsic::x86_avx512_cvttsd2usi:
+    case llvm::Intrinsic::x86_avx512_cvtusi2sd:
+    case llvm::Intrinsic::x86_avx512_cvtusi2ss:
+    case llvm::Intrinsic::x86_avx512_cvtusi642sd:
+    case llvm::Intrinsic::x86_avx512_cvtusi642ss:
+    case llvm::Intrinsic::x86_sse2_cvtsd2si64:
+    case llvm::Intrinsic::x86_sse2_cvtsd2si:
+    case llvm::Intrinsic::x86_sse2_cvtsd2ss:
+    case llvm::Intrinsic::x86_sse2_cvtsi2sd:
+    case llvm::Intrinsic::x86_sse2_cvtsi642sd:
+    case llvm::Intrinsic::x86_sse2_cvtss2sd:
+    case llvm::Intrinsic::x86_sse2_cvttsd2si64:
+    case llvm::Intrinsic::x86_sse2_cvttsd2si:
+    case llvm::Intrinsic::x86_sse_cvtsi2ss:
+    case llvm::Intrinsic::x86_sse_cvtsi642ss:
+    case llvm::Intrinsic::x86_sse_cvtss2si64:
+    case llvm::Intrinsic::x86_sse_cvtss2si:
+    case llvm::Intrinsic::x86_sse_cvttss2si64:
+    case llvm::Intrinsic::x86_sse_cvttss2si:
+      handleVectorConvertIntrinsic(I, 1);
+      break;
+    case llvm::Intrinsic::x86_sse2_cvtdq2pd:
+    case llvm::Intrinsic::x86_sse2_cvtps2pd:
+    case llvm::Intrinsic::x86_sse_cvtps2pi:
+    case llvm::Intrinsic::x86_sse_cvttps2pi:
+      handleVectorConvertIntrinsic(I, 2);
+      break;
     default:
       if (!handleUnknownIntrinsic(I))
         visitInstruction(I);
@@ -1604,6 +1911,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       }
     }
     IRBuilder<> IRB(&I);
+
+    if (MS.WrapIndirectCalls && !CS.getCalledFunction())
+      IndirectCallList.push_back(CS);
+
     unsigned ArgOffset = 0;
     DEBUG(dbgs() << "  CallSite: " << I << "\n");
     for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
@@ -1647,7 +1958,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     DEBUG(dbgs() << "  done with call args\n");
 
     FunctionType *FT =
-      cast<FunctionType>(CS.getCalledValue()->getType()-> getContainedType(0));
+      cast<FunctionType>(CS.getCalledValue()->getType()->getContainedType(0));
     if (FT->isVarArg()) {
       VAHelper->visitCallSite(CS, IRB);
     }
@@ -1686,12 +1997,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   void visitReturnInst(ReturnInst &I) {
     IRBuilder<> IRB(&I);
-    if (Value *RetVal = I.getReturnValue()) {
-      // Set the shadow for the RetVal.
+    Value *RetVal = I.getReturnValue();
+    if (!RetVal) return;
+    Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
+    if (CheckReturnValue) {
+      insertShadowCheck(RetVal, &I);
+      Value *Shadow = getCleanShadow(RetVal);
+      IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+    } else {
       Value *Shadow = getShadow(RetVal);
-      Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
-      DEBUG(dbgs() << "Return: " << *Shadow << "\n" << *ShadowPtr << "\n");
       IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+      // FIXME: make it conditional if ClStoreCleanOrigin==0
       if (MS.TrackOrigins)
         IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
     }
@@ -1734,18 +2050,34 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       Value *Descr =
           createPrivateNonConstGlobalForString(*F.getParent(),
                                                StackDescription.str());
-      IRB.CreateCall3(MS.MsanSetAllocaOriginFn,
+
+      IRB.CreateCall4(MS.MsanSetAllocaOrigin4Fn,
                       IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
                       ConstantInt::get(MS.IntptrTy, Size),
-                      IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()));
+                      IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
+                      IRB.CreatePointerCast(&F, MS.IntptrTy));
     }
   }
 
   void visitSelectInst(SelectInst& I) {
     IRBuilder<> IRB(&I);
-    setShadow(&I,  IRB.CreateSelect(I.getCondition(),
-              getShadow(I.getTrueValue()), getShadow(I.getFalseValue()),
-              "_msprop"));
+    // a = select b, c, d
+    Value *S = IRB.CreateSelect(I.getCondition(), getShadow(I.getTrueValue()),
+                                getShadow(I.getFalseValue()));
+    if (I.getType()->isAggregateType()) {
+      // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do
+      // an extra "select". This results in much more compact IR.
+      // Sa = select Sb, poisoned, (select b, Sc, Sd)
+      S = IRB.CreateSelect(getShadow(I.getCondition()),
+                           getPoisonedShadow(getShadowTy(I.getType())), S,
+                           "_msprop_select_agg");
+    } else {
+      // Sa = (sext Sb) | (select b, Sc, Sd)
+      S = IRB.CreateOr(S, CreateShadowCast(IRB, getShadow(I.getCondition()),
+                                           S->getType(), true),
+                       "_msprop_select");
+    }
+    setShadow(&I, S);
     if (MS.TrackOrigins) {
       // Origins are always i32, so any vector conditions must be flattened.
       // FIXME: consider tracking vector origins for app vectors?
@@ -1780,7 +2112,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
     DEBUG(dbgs() << "   ResShadow:  " << *ResShadow << "\n");
     setShadow(&I, ResShadow);
-    setOrigin(&I, getCleanOrigin());
+    setOriginForNaryOp(I);
   }
 
   void visitInsertValueInst(InsertValueInst &I) {
@@ -1793,7 +2125,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
     DEBUG(dbgs() << "   Res:        " << *Res << "\n");
     setShadow(&I, Res);
-    setOrigin(&I, getCleanOrigin());
+    setOriginForNaryOp(I);
   }
 
   void dumpInst(Instruction &I) {
@@ -1816,7 +2148,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       dumpInst(I);
     DEBUG(dbgs() << "DEFAULT: " << I << "\n");
     for (size_t i = 0, n = I.getNumOperands(); i < n; i++)
-      insertCheck(I.getOperand(i), &I);
+      insertShadowCheck(I.getOperand(i), &I);
     setShadow(&I, getCleanShadow(&I));
     setOrigin(&I, getCleanOrigin());
   }
@@ -1970,8 +2302,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
       Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr);
       Value *OverflowArgAreaShadowPtr =
         MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB);
-      Value *SrcPtr =
-        getShadowPtrForVAArgument(VAArgTLSCopy, IRB, AMD64FpEndOffset);
+      Value *SrcPtr = IRB.CreateConstGEP1_32(VAArgTLSCopy, AMD64FpEndOffset);
       IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16);
     }
   }
diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
deleted file mode 100644
index b45aef6..0000000
--- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-//===- OptimalEdgeProfiling.cpp - Insert counters for opt. edge profiling -===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments the specified program with counters for edge profiling.
-// Edge profiling can give a reasonable approximation of the hot paths through a
-// program, and is used for a wide variety of program transformations.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "insert-optimal-edge-profiling"
-#include "llvm/Transforms/Instrumentation.h"
-#include "MaximumSpanningTree.h"
-#include "ProfilingUtils.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-using namespace llvm;
-
-STATISTIC(NumEdgesInserted, "The # of edges inserted.");
-
-namespace {
-  class OptimalEdgeProfiler : public ModulePass {
-    bool runOnModule(Module &M);
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    OptimalEdgeProfiler() : ModulePass(ID) {
-      initializeOptimalEdgeProfilerPass(*PassRegistry::getPassRegistry());
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequiredID(ProfileEstimatorPassID);
-      AU.addRequired<ProfileInfo>();
-    }
-
-    virtual const char *getPassName() const {
-      return "Optimal Edge Profiler";
-    }
-  };
-}
-
-char OptimalEdgeProfiler::ID = 0;
-INITIALIZE_PASS_BEGIN(OptimalEdgeProfiler, "insert-optimal-edge-profiling",
-                "Insert optimal instrumentation for edge profiling",
-                false, false)
-INITIALIZE_PASS_DEPENDENCY(ProfileEstimatorPass)
-INITIALIZE_AG_DEPENDENCY(ProfileInfo)
-INITIALIZE_PASS_END(OptimalEdgeProfiler, "insert-optimal-edge-profiling",
-                "Insert optimal instrumentation for edge profiling",
-                false, false)
-
-ModulePass *llvm::createOptimalEdgeProfilerPass() {
-  return new OptimalEdgeProfiler();
-}
-
-inline static void printEdgeCounter(ProfileInfo::Edge e,
-                                    BasicBlock* b,
-                                    unsigned i) {
-  DEBUG(dbgs() << "--Edge Counter for " << (e) << " in " \
-               << ((b)?(b)->getName():"0") << " (# " << (i) << ")\n");
-}
-
-bool OptimalEdgeProfiler::runOnModule(Module &M) {
-  Function *Main = M.getFunction("main");
-  if (Main == 0) {
-    errs() << "WARNING: cannot insert edge profiling into a module"
-           << " with no main function!\n";
-    return false;  // No main, no instrumentation!
-  }
-
-  // NumEdges counts all the edges that may be instrumented. Later on its
-  // decided which edges to actually instrument, to achieve optimal profiling.
-  // For the entry block a virtual edge (0,entry) is reserved, for each block
-  // with no successors an edge (BB,0) is reserved. These edges are necessary
-  // to calculate a truly optimal maximum spanning tree and thus an optimal
-  // instrumentation.
-  unsigned NumEdges = 0;
-
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    // Reserve space for (0,entry) edge.
-    ++NumEdges;
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      // Keep track of which blocks need to be instrumented.  We don't want to
-      // instrument blocks that are added as the result of breaking critical
-      // edges!
-      if (BB->getTerminator()->getNumSuccessors() == 0) {
-        // Reserve space for (BB,0) edge.
-        ++NumEdges;
-      } else {
-        NumEdges += BB->getTerminator()->getNumSuccessors();
-      }
-    }
-  }
-
-  // In the profiling output a counter for each edge is reserved, but only few
-  // are used. This is done to be able to read back in the profile without
-  // calulating the maximum spanning tree again, instead each edge counter that
-  // is not used is initialised with -1 to signal that this edge counter has to
-  // be calculated from other edge counters on reading the profile info back
-  // in.
-
-  Type *Int32 = Type::getInt32Ty(M.getContext());
-  ArrayType *ATy = ArrayType::get(Int32, NumEdges);
-  GlobalVariable *Counters =
-    new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
-                       Constant::getNullValue(ATy), "OptEdgeProfCounters");
-  NumEdgesInserted = 0;
-
-  std::vector<Constant*> Initializer(NumEdges);
-  Constant *Zero = ConstantInt::get(Int32, 0);
-  Constant *Uncounted = ConstantInt::get(Int32, ProfileInfoLoader::Uncounted);
-
-  // Instrument all of the edges not in MST...
-  unsigned i = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-
-    // Calculate a Maximum Spanning Tree with the edge weights determined by
-    // ProfileEstimator. ProfileEstimator also assign weights to the virtual
-    // edges (0,entry) and (BB,0) (for blocks with no successors) and this
-    // edges also participate in the maximum spanning tree calculation.
-    // The third parameter of MaximumSpanningTree() has the effect that not the
-    // actual MST is returned but the edges _not_ in the MST.
-
-    ProfileInfo::EdgeWeights ECs =
-      getAnalysis<ProfileInfo>(*F).getEdgeWeights(F);
-    std::vector<ProfileInfo::EdgeWeight> EdgeVector(ECs.begin(), ECs.end());
-    MaximumSpanningTree<BasicBlock> MST(EdgeVector);
-    std::stable_sort(MST.begin(), MST.end());
-
-    // Check if (0,entry) not in the MST. If not, instrument edge
-    // (IncrementCounterInBlock()) and set the counter initially to zero, if
-    // the edge is in the MST the counter is initialised to -1.
-
-    BasicBlock *entry = &(F->getEntryBlock());
-    ProfileInfo::Edge edge = ProfileInfo::getEdge(0, entry);
-    if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-      printEdgeCounter(edge, entry, i);
-      IncrementCounterInBlock(entry, i, Counters); ++NumEdgesInserted;
-      Initializer[i++] = (Zero);
-    } else{
-      Initializer[i++] = (Uncounted);
-    }
-
-    // InsertedBlocks contains all blocks that were inserted for splitting an
-    // edge, this blocks do not have to be instrumented.
-    DenseSet<BasicBlock*> InsertedBlocks;
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      // Check if block was not inserted and thus does not have to be
-      // instrumented.
-      if (InsertedBlocks.count(BB)) continue;
-
-      // Okay, we have to add a counter of each outgoing edge not in MST. If
-      // the outgoing edge is not critical don't split it, just insert the
-      // counter in the source or destination of the edge. Also, if the block
-      // has no successors, the virtual edge (BB,0) is processed.
-      TerminatorInst *TI = BB->getTerminator();
-      if (TI->getNumSuccessors() == 0) {
-        ProfileInfo::Edge edge = ProfileInfo::getEdge(BB, 0);
-        if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-          printEdgeCounter(edge, BB, i);
-          IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
-          Initializer[i++] = (Zero);
-        } else{
-          Initializer[i++] = (Uncounted);
-        }
-      }
-      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-        BasicBlock *Succ = TI->getSuccessor(s);
-        ProfileInfo::Edge edge = ProfileInfo::getEdge(BB,Succ);
-        if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-
-          // If the edge is critical, split it.
-          bool wasInserted = SplitCriticalEdge(TI, s, this);
-          Succ = TI->getSuccessor(s);
-          if (wasInserted)
-            InsertedBlocks.insert(Succ);
-
-          // Okay, we are guaranteed that the edge is no longer critical.  If
-          // we only have a single successor, insert the counter in this block,
-          // otherwise insert it in the successor block.
-          if (TI->getNumSuccessors() == 1) {
-            // Insert counter at the start of the block
-            printEdgeCounter(edge, BB, i);
-            IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
-          } else {
-            // Insert counter at the start of the block
-            printEdgeCounter(edge, Succ, i);
-            IncrementCounterInBlock(Succ, i, Counters); ++NumEdgesInserted;
-          }
-          Initializer[i++] = (Zero);
-        } else {
-          Initializer[i++] = (Uncounted);
-        }
-      }
-    }
-  }
-
-  // Check if the number of edges counted at first was the number of edges we
-  // considered for instrumentation.
-  assert(i == NumEdges && "the number of edges in counting array is wrong");
-
-  // Assign the now completely defined initialiser to the array.
-  Constant *init = ConstantArray::get(ATy, Initializer);
-  Counters->setInitializer(init);
-
-  // Add the initialization call to main.
-  InsertProfilingInitCall(Main, "llvm_start_opt_edge_profiling", Counters);
-  return true;
-}
-
diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp
deleted file mode 100644
index 7de7326..0000000
--- a/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ /dev/null
@@ -1,1424 +0,0 @@
-//===- PathProfiling.cpp - Inserts counters for path profiling ------------===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments functions for Ball-Larus path profiling.  Ball-Larus
-// profiling converts the CFG into a DAG by replacing backedges with edges
-// from entry to the start block and from the end block to exit.  The paths
-// along the new DAG are enumrated, i.e. each path is given a path number.
-// Edges are instrumented to increment the path number register, such that the
-// path number register will equal the path number of the path taken at the
-// exit.
-//
-// This file defines classes for building a CFG for use with different stages
-// in the Ball-Larus path profiling instrumentation [Ball96].  The
-// requirements are formatting the llvm CFG into the Ball-Larus DAG, path
-// numbering, finding a spanning tree, moving increments from the spanning
-// tree to chords.
-//
-// Terms:
-// DAG            - Directed Acyclic Graph.
-// Ball-Larus DAG - A CFG with an entry node, an exit node, and backedges
-//                  removed in the following manner.  For every backedge
-//                  v->w, insert edge ENTRY->w and edge v->EXIT.
-// Path Number    - The number corresponding to a specific path through a
-//                  Ball-Larus DAG.
-// Spanning Tree  - A subgraph, S, is a spanning tree if S covers all
-//                  vertices and is a tree.
-// Chord          - An edge not in the spanning tree.
-//
-// [Ball96]
-//  T. Ball and J. R. Larus. "Efficient Path Profiling."
-//  International Symposium on Microarchitecture, pages 46-57, 1996.
-//  http://portal.acm.org/citation.cfm?id=243857
-//
-// [Ball94]
-//  Thomas Ball.  "Efficiently Counting Program Events with Support for
-//  On-line queries."
-//  ACM Transactions on Programmmg Languages and Systems, Vol 16, No 5,
-//  September 1994, Pages 1399-1410.
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "insert-path-profiling"
-
-#include "llvm/Transforms/Instrumentation.h"
-#include "ProfilingUtils.h"
-#include "llvm/Analysis/PathNumbering.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/TypeBuilder.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <vector>
-
-#define HASH_THRESHHOLD 100000
-
-using namespace llvm;
-
-namespace {
-class BLInstrumentationNode;
-class BLInstrumentationEdge;
-class BLInstrumentationDag;
-
-// ---------------------------------------------------------------------------
-// BLInstrumentationNode extends BallLarusNode with member used by the
-// instrumentation algortihms.
-// ---------------------------------------------------------------------------
-class BLInstrumentationNode : public BallLarusNode {
-public:
-  // Creates a new BLInstrumentationNode from a BasicBlock.
-  BLInstrumentationNode(BasicBlock* BB);
-
-  // Get/sets the Value corresponding to the pathNumber register,
-  // constant or phinode.  Used by the instrumentation code to remember
-  // path number Values.
-  Value* getStartingPathNumber();
-  void setStartingPathNumber(Value* pathNumber);
-
-  Value* getEndingPathNumber();
-  void setEndingPathNumber(Value* pathNumber);
-
-  // Get/set the PHINode Instruction for this node.
-  PHINode* getPathPHI();
-  void setPathPHI(PHINode* pathPHI);
-
-private:
-
-  Value* _startingPathNumber; // The Value for the current pathNumber.
-  Value* _endingPathNumber; // The Value for the current pathNumber.
-  PHINode* _pathPHI; // The PHINode for current pathNumber.
-};
-
-// --------------------------------------------------------------------------
-// BLInstrumentationEdge extends BallLarusEdge with data about the
-// instrumentation that will end up on each edge.
-// --------------------------------------------------------------------------
-class BLInstrumentationEdge : public BallLarusEdge {
-public:
-  BLInstrumentationEdge(BLInstrumentationNode* source,
-                        BLInstrumentationNode* target);
-
-  // Sets the target node of this edge.  Required to split edges.
-  void setTarget(BallLarusNode* node);
-
-  // Get/set whether edge is in the spanning tree.
-  bool isInSpanningTree() const;
-  void setIsInSpanningTree(bool isInSpanningTree);
-
-  // Get/ set whether this edge will be instrumented with a path number
-  // initialization.
-  bool isInitialization() const;
-  void setIsInitialization(bool isInitialization);
-
-  // Get/set whether this edge will be instrumented with a path counter
-  // increment.  Notice this is incrementing the path counter
-  // corresponding to the path number register.  The path number
-  // increment is determined by getIncrement().
-  bool isCounterIncrement() const;
-  void setIsCounterIncrement(bool isCounterIncrement);
-
-  // Get/set the path number increment that this edge will be instrumented
-  // with.  This is distinct from the path counter increment and the
-  // weight.  The counter increment counts the number of executions of
-  // some path, whereas the path number keeps track of which path number
-  // the program is on.
-  long getIncrement() const;
-  void setIncrement(long increment);
-
-  // Get/set whether the edge has been instrumented.
-  bool hasInstrumentation();
-  void setHasInstrumentation(bool hasInstrumentation);
-
-  // Returns the successor number of this edge in the source.
-  unsigned getSuccessorNumber();
-
-private:
-  // The increment that the code will be instrumented with.
-  long long _increment;
-
-  // Whether this edge is in the spanning tree.
-  bool _isInSpanningTree;
-
-  // Whether this edge is an initialiation of the path number.
-  bool _isInitialization;
-
-  // Whether this edge is a path counter increment.
-  bool _isCounterIncrement;
-
-  // Whether this edge has been instrumented.
-  bool _hasInstrumentation;
-};
-
-// ---------------------------------------------------------------------------
-// BLInstrumentationDag extends BallLarusDag with algorithms that
-// determine where instrumentation should be placed.
-// ---------------------------------------------------------------------------
-class BLInstrumentationDag : public BallLarusDag {
-public:
-  BLInstrumentationDag(Function &F);
-
-  // Returns the Exit->Root edge. This edge is required for creating
-  // directed cycles in the algorithm for moving instrumentation off of
-  // the spanning tree
-  BallLarusEdge* getExitRootEdge();
-
-  // Returns an array of phony edges which mark those nodes
-  // with function calls
-  BLEdgeVector getCallPhonyEdges();
-
-  // Gets/sets the path counter array
-  GlobalVariable* getCounterArray();
-  void setCounterArray(GlobalVariable* c);
-
-  // Calculates the increments for the chords, thereby removing
-  // instrumentation from the spanning tree edges. Implementation is based
-  // on the algorithm in Figure 4 of [Ball94]
-  void calculateChordIncrements();
-
-  // Updates the state when an edge has been split
-  void splitUpdate(BLInstrumentationEdge* formerEdge, BasicBlock* newBlock);
-
-  // Calculates a spanning tree of the DAG ignoring cycles.  Whichever
-  // edges are in the spanning tree will not be instrumented, but this
-  // implementation does not try to minimize the instrumentation overhead
-  // by trying to find hot edges.
-  void calculateSpanningTree();
-
-  // Pushes initialization further down in order to group the first
-  // increment and initialization.
-  void pushInitialization();
-
-  // Pushes the path counter increments up in order to group the last path
-  // number increment.
-  void pushCounters();
-
-  // Removes phony edges from the successor list of the source, and the
-  // predecessor list of the target.
-  void unlinkPhony();
-
-  // Generate dot graph for the function
-  void generateDotGraph();
-
-protected:
-  // BLInstrumentationDag creates BLInstrumentationNode objects in this
-  // method overriding the creation of BallLarusNode objects.
-  //
-  // Allows subclasses to determine which type of Node is created.
-  // Override this method to produce subclasses of BallLarusNode if
-  // necessary.
-  virtual BallLarusNode* createNode(BasicBlock* BB);
-
-  // BLInstrumentationDag create BLInstrumentationEdges.
-  //
-  // Allows subclasses to determine which type of Edge is created.
-  // Override this method to produce subclasses of BallLarusEdge if
-  // necessary.  Parameters source and target will have been created by
-  // createNode and can be cast to the subclass of BallLarusNode*
-  // returned by createNode.
-  virtual BallLarusEdge* createEdge(
-    BallLarusNode* source, BallLarusNode* target, unsigned edgeNumber);
-
-private:
-  BLEdgeVector _treeEdges; // All edges in the spanning tree.
-  BLEdgeVector _chordEdges; // All edges not in the spanning tree.
-  GlobalVariable* _counterArray; // Array to store path counters
-
-  // Removes the edge from the appropriate predecessor and successor lists.
-  void unlinkEdge(BallLarusEdge* edge);
-
-  // Makes an edge part of the spanning tree.
-  void makeEdgeSpanning(BLInstrumentationEdge* edge);
-
-  // Pushes initialization and calls itself recursively.
-  void pushInitializationFromEdge(BLInstrumentationEdge* edge);
-
-  // Pushes path counter increments up recursively.
-  void pushCountersFromEdge(BLInstrumentationEdge* edge);
-
-  // Depth first algorithm for determining the chord increments.f
-  void calculateChordIncrementsDfs(
-    long weight, BallLarusNode* v, BallLarusEdge* e);
-
-  // Determines the relative direction of two edges.
-  int calculateChordIncrementsDir(BallLarusEdge* e, BallLarusEdge* f);
-};
-
-// ---------------------------------------------------------------------------
-// PathProfiler is a module pass which instruments path profiling instructions
-// ---------------------------------------------------------------------------
-class PathProfiler : public ModulePass {
-private:
-  // Current context for multi threading support.
-  LLVMContext* Context;
-
-  // Which function are we currently instrumenting
-  unsigned currentFunctionNumber;
-
-  // The function prototype in the profiling runtime for incrementing a
-  // single path counter in a hash table.
-  Constant* llvmIncrementHashFunction;
-  Constant* llvmDecrementHashFunction;
-
-  // Instruments each function with path profiling.  'main' is instrumented
-  // with code to save the profile to disk.
-  bool runOnModule(Module &M);
-
-  // Analyzes the function for Ball-Larus path profiling, and inserts code.
-  void runOnFunction(std::vector<Constant*> &ftInit, Function &F, Module &M);
-
-  // Creates an increment constant representing incr.
-  ConstantInt* createIncrementConstant(long incr, int bitsize);
-
-  // Creates an increment constant representing the value in
-  // edge->getIncrement().
-  ConstantInt* createIncrementConstant(BLInstrumentationEdge* edge);
-
-  // Finds the insertion point after pathNumber in block.  PathNumber may
-  // be NULL.
-  BasicBlock::iterator getInsertionPoint(
-    BasicBlock* block, Value* pathNumber);
-
-  // Inserts source's pathNumber Value* into target.  Target may or may not
-  // have multiple predecessors, and may or may not have its phiNode
-  // initalized.
-  void pushValueIntoNode(
-    BLInstrumentationNode* source, BLInstrumentationNode* target);
-
-  // Inserts source's pathNumber Value* into the appropriate slot of
-  // target's phiNode.
-  void pushValueIntoPHI(
-    BLInstrumentationNode* target, BLInstrumentationNode* source);
-
-  // The Value* in node, oldVal,  is updated with a Value* correspodning to
-  // oldVal + addition.
-  void insertNumberIncrement(BLInstrumentationNode* node, Value* addition,
-                             bool atBeginning);
-
-  // Creates a counter increment in the given node.  The Value* in node is
-  // taken as the index into a hash table.
-  void insertCounterIncrement(
-    Value* incValue,
-    BasicBlock::iterator insertPoint,
-    BLInstrumentationDag* dag,
-    bool increment = true);
-
-  // A PHINode is created in the node, and its values initialized to -1U.
-  void preparePHI(BLInstrumentationNode* node);
-
-  // Inserts instrumentation for the given edge
-  //
-  // Pre: The edge's source node has pathNumber set if edge is non zero
-  // path number increment.
-  //
-  // Post: Edge's target node has a pathNumber set to the path number Value
-  // corresponding to the value of the path register after edge's
-  // execution.
-  void insertInstrumentationStartingAt(
-    BLInstrumentationEdge* edge,
-    BLInstrumentationDag* dag);
-
-  // If this edge is a critical edge, then inserts a node at this edge.
-  // This edge becomes the first edge, and a new BallLarusEdge is created.
-  bool splitCritical(BLInstrumentationEdge* edge, BLInstrumentationDag* dag);
-
-  // Inserts instrumentation according to the marked edges in dag.  Phony
-  // edges must be unlinked from the DAG, but accessible from the
-  // backedges.  Dag must have initializations, path number increments, and
-  // counter increments present.
-  //
-  // Counter storage is created here.
-  void insertInstrumentation( BLInstrumentationDag& dag, Module &M);
-
-public:
-  static char ID; // Pass identification, replacement for typeid
-  PathProfiler() : ModulePass(ID) {
-    initializePathProfilerPass(*PassRegistry::getPassRegistry());
-  }
-
-  virtual const char *getPassName() const {
-    return "Path Profiler";
-  }
-};
-} // end anonymous namespace
-
-// Should we print the dot-graphs
-static cl::opt<bool> DotPathDag("path-profile-pathdag", cl::Hidden,
-        cl::desc("Output the path profiling DAG for each function."));
-
-// Register the path profiler as a pass
-char PathProfiler::ID = 0;
-INITIALIZE_PASS(PathProfiler, "insert-path-profiling",
-                "Insert instrumentation for Ball-Larus path profiling",
-                false, false)
-
-ModulePass *llvm::createPathProfilerPass() { return new PathProfiler(); }
-
-namespace llvm {
-  class PathProfilingFunctionTable {};
-
-  // Type for global array storing references to hashes or arrays
-  template<bool xcompile> class TypeBuilder<PathProfilingFunctionTable,
-                                            xcompile> {
-  public:
-    static StructType *get(LLVMContext& C) {
-      return( StructType::get(
-                TypeBuilder<types::i<32>, xcompile>::get(C), // type
-                TypeBuilder<types::i<32>, xcompile>::get(C), // array size
-                TypeBuilder<types::i<8>*, xcompile>::get(C), // array/hash ptr
-                NULL));
-    }
-  };
-
-  typedef TypeBuilder<PathProfilingFunctionTable, true>
-  ftEntryTypeBuilder;
-
-  // BallLarusEdge << operator overloading
-  raw_ostream& operator<<(raw_ostream& os,
-                          const BLInstrumentationEdge& edge)
-      LLVM_ATTRIBUTE_USED;
-  raw_ostream& operator<<(raw_ostream& os,
-                          const BLInstrumentationEdge& edge) {
-    os << "[" << edge.getSource()->getName() << " -> "
-       << edge.getTarget()->getName() << "] init: "
-       << (edge.isInitialization() ? "yes" : "no")
-       << " incr:" << edge.getIncrement() << " cinc: "
-       << (edge.isCounterIncrement() ? "yes" : "no");
-    return(os);
-  }
-}
-
-// Creates a new BLInstrumentationNode from a BasicBlock.
-BLInstrumentationNode::BLInstrumentationNode(BasicBlock* BB) :
-  BallLarusNode(BB),
-  _startingPathNumber(NULL), _endingPathNumber(NULL), _pathPHI(NULL) {}
-
-// Constructor for BLInstrumentationEdge.
-BLInstrumentationEdge::BLInstrumentationEdge(BLInstrumentationNode* source,
-                                             BLInstrumentationNode* target)
-  : BallLarusEdge(source, target, 0),
-    _increment(0), _isInSpanningTree(false), _isInitialization(false),
-    _isCounterIncrement(false), _hasInstrumentation(false) {}
-
-// Sets the target node of this edge.  Required to split edges.
-void BLInstrumentationEdge::setTarget(BallLarusNode* node) {
-  _target = node;
-}
-
-// Returns whether this edge is in the spanning tree.
-bool BLInstrumentationEdge::isInSpanningTree() const {
-  return(_isInSpanningTree);
-}
-
-// Sets whether this edge is in the spanning tree.
-void BLInstrumentationEdge::setIsInSpanningTree(bool isInSpanningTree) {
-  _isInSpanningTree = isInSpanningTree;
-}
-
-// Returns whether this edge will be instrumented with a path number
-// initialization.
-bool BLInstrumentationEdge::isInitialization() const {
-  return(_isInitialization);
-}
-
-// Sets whether this edge will be instrumented with a path number
-// initialization.
-void BLInstrumentationEdge::setIsInitialization(bool isInitialization) {
-  _isInitialization = isInitialization;
-}
-
-// Returns whether this edge will be instrumented with a path counter
-// increment.  Notice this is incrementing the path counter
-// corresponding to the path number register.  The path number
-// increment is determined by getIncrement().
-bool BLInstrumentationEdge::isCounterIncrement() const {
-  return(_isCounterIncrement);
-}
-
-// Sets whether this edge will be instrumented with a path counter
-// increment.
-void BLInstrumentationEdge::setIsCounterIncrement(bool isCounterIncrement) {
-  _isCounterIncrement = isCounterIncrement;
-}
-
-// Gets the path number increment that this edge will be instrumented
-// with.  This is distinct from the path counter increment and the
-// weight.  The counter increment is counts the number of executions of
-// some path, whereas the path number keeps track of which path number
-// the program is on.
-long BLInstrumentationEdge::getIncrement() const {
-  return(_increment);
-}
-
-// Set whether this edge will be instrumented with a path number
-// increment.
-void BLInstrumentationEdge::setIncrement(long increment) {
-  _increment = increment;
-}
-
-// True iff the edge has already been instrumented.
-bool BLInstrumentationEdge::hasInstrumentation() {
-  return(_hasInstrumentation);
-}
-
-// Set whether this edge has been instrumented.
-void BLInstrumentationEdge::setHasInstrumentation(bool hasInstrumentation) {
-  _hasInstrumentation = hasInstrumentation;
-}
-
-// Returns the successor number of this edge in the source.
-unsigned BLInstrumentationEdge::getSuccessorNumber() {
-  BallLarusNode* sourceNode = getSource();
-  BallLarusNode* targetNode = getTarget();
-  BasicBlock* source = sourceNode->getBlock();
-  BasicBlock* target = targetNode->getBlock();
-
-  if(source == NULL || target == NULL)
-    return(0);
-
-  TerminatorInst* terminator = source->getTerminator();
-
-        unsigned i;
-  for(i=0; i < terminator->getNumSuccessors(); i++) {
-    if(terminator->getSuccessor(i) == target)
-      break;
-  }
-
-  return(i);
-}
-
-// BLInstrumentationDag constructor initializes a DAG for the given Function.
-BLInstrumentationDag::BLInstrumentationDag(Function &F) : BallLarusDag(F),
-                                                          _counterArray(0) {
-}
-
-// Returns the Exit->Root edge. This edge is required for creating
-// directed cycles in the algorithm for moving instrumentation off of
-// the spanning tree
-BallLarusEdge* BLInstrumentationDag::getExitRootEdge() {
-  BLEdgeIterator erEdge = getExit()->succBegin();
-  return(*erEdge);
-}
-
-BLEdgeVector BLInstrumentationDag::getCallPhonyEdges () {
-  BLEdgeVector callEdges;
-
-  for( BLEdgeIterator edge = _edges.begin(), end = _edges.end();
-       edge != end; edge++ ) {
-    if( (*edge)->getType() == BallLarusEdge::CALLEDGE_PHONY )
-      callEdges.push_back(*edge);
-  }
-
-  return callEdges;
-}
-
-// Gets the path counter array
-GlobalVariable* BLInstrumentationDag::getCounterArray() {
-  return _counterArray;
-}
-
-void BLInstrumentationDag::setCounterArray(GlobalVariable* c) {
-  _counterArray = c;
-}
-
-// Calculates the increment for the chords, thereby removing
-// instrumentation from the spanning tree edges. Implementation is based on
-// the algorithm in Figure 4 of [Ball94]
-void BLInstrumentationDag::calculateChordIncrements() {
-  calculateChordIncrementsDfs(0, getRoot(), NULL);
-
-  BLInstrumentationEdge* chord;
-  for(BLEdgeIterator chordEdge = _chordEdges.begin(),
-      end = _chordEdges.end(); chordEdge != end; chordEdge++) {
-    chord = (BLInstrumentationEdge*) *chordEdge;
-    chord->setIncrement(chord->getIncrement() + chord->getWeight());
-  }
-}
-
-// Updates the state when an edge has been split
-void BLInstrumentationDag::splitUpdate(BLInstrumentationEdge* formerEdge,
-                                       BasicBlock* newBlock) {
-  BallLarusNode* oldTarget = formerEdge->getTarget();
-  BallLarusNode* newNode = addNode(newBlock);
-  formerEdge->setTarget(newNode);
-  newNode->addPredEdge(formerEdge);
-
-  DEBUG(dbgs() << "  Edge split: " << *formerEdge << "\n");
-
-  oldTarget->removePredEdge(formerEdge);
-  BallLarusEdge* newEdge = addEdge(newNode, oldTarget,0);
-
-  if( formerEdge->getType() == BallLarusEdge::BACKEDGE ||
-                        formerEdge->getType() == BallLarusEdge::SPLITEDGE) {
-                newEdge->setType(formerEdge->getType());
-    newEdge->setPhonyRoot(formerEdge->getPhonyRoot());
-    newEdge->setPhonyExit(formerEdge->getPhonyExit());
-    formerEdge->setType(BallLarusEdge::NORMAL);
-                formerEdge->setPhonyRoot(NULL);
-    formerEdge->setPhonyExit(NULL);
-  }
-}
-
-// Calculates a spanning tree of the DAG ignoring cycles.  Whichever
-// edges are in the spanning tree will not be instrumented, but this
-// implementation does not try to minimize the instrumentation overhead
-// by trying to find hot edges.
-void BLInstrumentationDag::calculateSpanningTree() {
-  std::stack<BallLarusNode*> dfsStack;
-
-  for(BLNodeIterator nodeIt = _nodes.begin(), end = _nodes.end();
-      nodeIt != end; nodeIt++) {
-    (*nodeIt)->setColor(BallLarusNode::WHITE);
-  }
-
-  dfsStack.push(getRoot());
-  while(dfsStack.size() > 0) {
-    BallLarusNode* node = dfsStack.top();
-    dfsStack.pop();
-
-    if(node->getColor() == BallLarusNode::WHITE)
-      continue;
-
-    BallLarusNode* nextNode;
-    bool forward = true;
-    BLEdgeIterator succEnd = node->succEnd();
-
-    node->setColor(BallLarusNode::WHITE);
-    // first iterate over successors then predecessors
-    for(BLEdgeIterator edge = node->succBegin(), predEnd = node->predEnd();
-        edge != predEnd; edge++) {
-      if(edge == succEnd) {
-        edge = node->predBegin();
-        forward = false;
-      }
-
-      // Ignore split edges
-      if ((*edge)->getType() == BallLarusEdge::SPLITEDGE)
-        continue;
-
-      nextNode = forward? (*edge)->getTarget(): (*edge)->getSource();
-      if(nextNode->getColor() != BallLarusNode::WHITE) {
-        nextNode->setColor(BallLarusNode::WHITE);
-        makeEdgeSpanning((BLInstrumentationEdge*)(*edge));
-      }
-    }
-  }
-
-  for(BLEdgeIterator edge = _edges.begin(), end = _edges.end();
-      edge != end; edge++) {
-    BLInstrumentationEdge* instEdge = (BLInstrumentationEdge*) (*edge);
-      // safe since createEdge is overriden
-    if(!instEdge->isInSpanningTree() && (*edge)->getType()
-        != BallLarusEdge::SPLITEDGE)
-      _chordEdges.push_back(instEdge);
-  }
-}
-
-// Pushes initialization further down in order to group the first
-// increment and initialization.
-void BLInstrumentationDag::pushInitialization() {
-  BLInstrumentationEdge* exitRootEdge =
-                (BLInstrumentationEdge*) getExitRootEdge();
-  exitRootEdge->setIsInitialization(true);
-  pushInitializationFromEdge(exitRootEdge);
-}
-
-// Pushes the path counter increments up in order to group the last path
-// number increment.
-void BLInstrumentationDag::pushCounters() {
-  BLInstrumentationEdge* exitRootEdge =
-    (BLInstrumentationEdge*) getExitRootEdge();
-  exitRootEdge->setIsCounterIncrement(true);
-  pushCountersFromEdge(exitRootEdge);
-}
-
-// Removes phony edges from the successor list of the source, and the
-// predecessor list of the target.
-void BLInstrumentationDag::unlinkPhony() {
-  BallLarusEdge* edge;
-
-  for(BLEdgeIterator next = _edges.begin(),
-      end = _edges.end(); next != end; next++) {
-    edge = (*next);
-
-    if( edge->getType() == BallLarusEdge::BACKEDGE_PHONY ||
-        edge->getType() == BallLarusEdge::SPLITEDGE_PHONY ||
-        edge->getType() == BallLarusEdge::CALLEDGE_PHONY ) {
-      unlinkEdge(edge);
-    }
-  }
-}
-
-// Generate a .dot graph to represent the DAG and pathNumbers
-void BLInstrumentationDag::generateDotGraph() {
-  std::string errorInfo;
-  std::string functionName = getFunction().getName().str();
-  std::string filename = "pathdag." + functionName + ".dot";
-
-  DEBUG (dbgs() << "Writing '" << filename << "'...\n");
-  raw_fd_ostream dotFile(filename.c_str(), errorInfo);
-
-  if (!errorInfo.empty()) {
-    errs() << "Error opening '" << filename.c_str() <<"' for writing!";
-    errs() << "\n";
-    return;
-  }
-
-  dotFile << "digraph " << functionName << " {\n";
-
-  for( BLEdgeIterator edge = _edges.begin(), end = _edges.end();
-       edge != end; edge++) {
-    std::string sourceName = (*edge)->getSource()->getName();
-    std::string targetName = (*edge)->getTarget()->getName();
-
-    dotFile << "\t\"" << sourceName.c_str() << "\" -> \""
-            << targetName.c_str() << "\" ";
-
-    long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement();
-
-    switch( (*edge)->getType() ) {
-    case BallLarusEdge::NORMAL:
-      dotFile << "[label=" << inc << "] [color=black];\n";
-      break;
-
-    case BallLarusEdge::BACKEDGE:
-      dotFile << "[color=cyan];\n";
-      break;
-
-    case BallLarusEdge::BACKEDGE_PHONY:
-      dotFile << "[label=" << inc
-              << "] [color=blue];\n";
-      break;
-
-    case BallLarusEdge::SPLITEDGE:
-      dotFile << "[color=violet];\n";
-      break;
-
-    case BallLarusEdge::SPLITEDGE_PHONY:
-      dotFile << "[label=" << inc << "] [color=red];\n";
-      break;
-
-    case BallLarusEdge::CALLEDGE_PHONY:
-      dotFile << "[label=" << inc     << "] [color=green];\n";
-      break;
-    }
-  }
-
-  dotFile << "}\n";
-}
-
-// Allows subclasses to determine which type of Node is created.
-// Override this method to produce subclasses of BallLarusNode if
-// necessary. The destructor of BallLarusDag will call free on each pointer
-// created.
-BallLarusNode* BLInstrumentationDag::createNode(BasicBlock* BB) {
-  return( new BLInstrumentationNode(BB) );
-}
-
-// Allows subclasses to determine which type of Edge is created.
-// Override this method to produce subclasses of BallLarusEdge if
-// necessary. The destructor of BallLarusDag will call free on each pointer
-// created.
-BallLarusEdge* BLInstrumentationDag::createEdge(BallLarusNode* source,
-                                                BallLarusNode* target, unsigned edgeNumber) {
-  // One can cast from BallLarusNode to BLInstrumentationNode since createNode
-  // is overriden to produce BLInstrumentationNode.
-  return( new BLInstrumentationEdge((BLInstrumentationNode*)source,
-                                    (BLInstrumentationNode*)target) );
-}
-
-// Sets the Value corresponding to the pathNumber register, constant,
-// or phinode.  Used by the instrumentation code to remember path
-// number Values.
-Value* BLInstrumentationNode::getStartingPathNumber(){
-  return(_startingPathNumber);
-}
-
-// Sets the Value of the pathNumber.  Used by the instrumentation code.
-void BLInstrumentationNode::setStartingPathNumber(Value* pathNumber) {
-  DEBUG(dbgs() << "  SPN-" << getName() << " <-- " << (pathNumber ?
-                                                       pathNumber->getName() :
-                                                       "unused") << "\n");
-  _startingPathNumber = pathNumber;
-}
-
-Value* BLInstrumentationNode::getEndingPathNumber(){
-  return(_endingPathNumber);
-}
-
-void BLInstrumentationNode::setEndingPathNumber(Value* pathNumber) {
-  DEBUG(dbgs() << "  EPN-" << getName() << " <-- "
-               << (pathNumber ? pathNumber->getName() : "unused") << "\n");
-  _endingPathNumber = pathNumber;
-}
-
-// Get the PHINode Instruction for this node.  Used by instrumentation
-// code.
-PHINode* BLInstrumentationNode::getPathPHI() {
-  return(_pathPHI);
-}
-
-// Set the PHINode Instruction for this node.  Used by instrumentation
-// code.
-void BLInstrumentationNode::setPathPHI(PHINode* pathPHI) {
-  _pathPHI = pathPHI;
-}
-
-// Removes the edge from the appropriate predecessor and successor
-// lists.
-void BLInstrumentationDag::unlinkEdge(BallLarusEdge* edge) {
-  if(edge == getExitRootEdge())
-    DEBUG(dbgs() << " Removing exit->root edge\n");
-
-  edge->getSource()->removeSuccEdge(edge);
-  edge->getTarget()->removePredEdge(edge);
-}
-
-// Makes an edge part of the spanning tree.
-void BLInstrumentationDag::makeEdgeSpanning(BLInstrumentationEdge* edge) {
-  edge->setIsInSpanningTree(true);
-  _treeEdges.push_back(edge);
-}
-
-// Pushes initialization and calls itself recursively.
-void BLInstrumentationDag::pushInitializationFromEdge(
-  BLInstrumentationEdge* edge) {
-  BallLarusNode* target;
-
-  target = edge->getTarget();
-  if( target->getNumberPredEdges() > 1 || target == getExit() ) {
-    return;
-  } else {
-    for(BLEdgeIterator next = target->succBegin(),
-          end = target->succEnd(); next != end; next++) {
-      BLInstrumentationEdge* intoEdge = (BLInstrumentationEdge*) *next;
-
-      // Skip split edges
-      if (intoEdge->getType() == BallLarusEdge::SPLITEDGE)
-        continue;
-
-      intoEdge->setIncrement(intoEdge->getIncrement() +
-                             edge->getIncrement());
-      intoEdge->setIsInitialization(true);
-      pushInitializationFromEdge(intoEdge);
-    }
-
-    edge->setIncrement(0);
-    edge->setIsInitialization(false);
-  }
-}
-
-// Pushes path counter increments up recursively.
-void BLInstrumentationDag::pushCountersFromEdge(BLInstrumentationEdge* edge) {
-  BallLarusNode* source;
-
-  source = edge->getSource();
-  if(source->getNumberSuccEdges() > 1 || source == getRoot()
-     || edge->isInitialization()) {
-    return;
-  } else {
-    for(BLEdgeIterator previous = source->predBegin(),
-          end = source->predEnd(); previous != end; previous++) {
-      BLInstrumentationEdge* fromEdge = (BLInstrumentationEdge*) *previous;
-
-      // Skip split edges
-      if (fromEdge->getType() == BallLarusEdge::SPLITEDGE)
-        continue;
-
-      fromEdge->setIncrement(fromEdge->getIncrement() +
-                             edge->getIncrement());
-      fromEdge->setIsCounterIncrement(true);
-      pushCountersFromEdge(fromEdge);
-    }
-
-    edge->setIncrement(0);
-    edge->setIsCounterIncrement(false);
-  }
-}
-
-// Depth first algorithm for determining the chord increments.
-void BLInstrumentationDag::calculateChordIncrementsDfs(long weight,
-                                                       BallLarusNode* v, BallLarusEdge* e) {
-  BLInstrumentationEdge* f;
-
-  for(BLEdgeIterator treeEdge = _treeEdges.begin(),
-        end = _treeEdges.end(); treeEdge != end; treeEdge++) {
-    f = (BLInstrumentationEdge*) *treeEdge;
-    if(e != f && v == f->getTarget()) {
-      calculateChordIncrementsDfs(
-        calculateChordIncrementsDir(e,f)*(weight) +
-        f->getWeight(), f->getSource(), f);
-    }
-    if(e != f && v == f->getSource()) {
-      calculateChordIncrementsDfs(
-        calculateChordIncrementsDir(e,f)*(weight) +
-        f->getWeight(), f->getTarget(), f);
-    }
-  }
-
-  for(BLEdgeIterator chordEdge = _chordEdges.begin(),
-        end = _chordEdges.end(); chordEdge != end; chordEdge++) {
-    f = (BLInstrumentationEdge*) *chordEdge;
-    if(v == f->getSource() || v == f->getTarget()) {
-      f->setIncrement(f->getIncrement() +
-                      calculateChordIncrementsDir(e,f)*weight);
-    }
-  }
-}
-
-// Determines the relative direction of two edges.
-int BLInstrumentationDag::calculateChordIncrementsDir(BallLarusEdge* e,
-                                                      BallLarusEdge* f) {
-  if( e == NULL)
-    return(1);
-  else if(e->getSource() == f->getTarget()
-          || e->getTarget() == f->getSource())
-    return(1);
-
-  return(-1);
-}
-
-// Creates an increment constant representing incr.
-ConstantInt* PathProfiler::createIncrementConstant(long incr,
-                                                   int bitsize) {
-  return(ConstantInt::get(IntegerType::get(*Context, 32), incr));
-}
-
-// Creates an increment constant representing the value in
-// edge->getIncrement().
-ConstantInt* PathProfiler::createIncrementConstant(
-  BLInstrumentationEdge* edge) {
-  return(createIncrementConstant(edge->getIncrement(), 32));
-}
-
-// Finds the insertion point after pathNumber in block.  PathNumber may
-// be NULL.
-BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value*
-                                                     pathNumber) {
-  if(pathNumber == NULL || isa<ConstantInt>(pathNumber)
-     || (((Instruction*)(pathNumber))->getParent()) != block) {
-    return(block->getFirstInsertionPt());
-  } else {
-    Instruction* pathNumberInst = (Instruction*) (pathNumber);
-    BasicBlock::iterator insertPoint;
-    BasicBlock::iterator end = block->end();
-
-    for(insertPoint = block->begin();
-        insertPoint != end; insertPoint++) {
-      Instruction* insertInst = &(*insertPoint);
-
-      if(insertInst == pathNumberInst)
-        return(++insertPoint);
-    }
-
-    return(insertPoint);
-  }
-}
-
-// A PHINode is created in the node, and its values initialized to -1U.
-void PathProfiler::preparePHI(BLInstrumentationNode* node) {
-  BasicBlock* block = node->getBlock();
-  BasicBlock::iterator insertPoint = block->getFirstInsertionPt();
-  pred_iterator PB = pred_begin(node->getBlock()),
-          PE = pred_end(node->getBlock());
-  PHINode* phi = PHINode::Create(Type::getInt32Ty(*Context),
-                                 std::distance(PB, PE), "pathNumber",
-                                 insertPoint );
-  node->setPathPHI(phi);
-  node->setStartingPathNumber(phi);
-  node->setEndingPathNumber(phi);
-
-  for(pred_iterator predIt = PB; predIt != PE; predIt++) {
-    BasicBlock* pred = (*predIt);
-
-    if(pred != NULL)
-      phi->addIncoming(createIncrementConstant((long)-1, 32), pred);
-  }
-}
-
-// Inserts source's pathNumber Value* into target.  Target may or may not
-// have multiple predecessors, and may or may not have its phiNode
-// initalized.
-void PathProfiler::pushValueIntoNode(BLInstrumentationNode* source,
-                                     BLInstrumentationNode* target) {
-  if(target->getBlock() == NULL)
-    return;
-
-
-  if(target->getNumberPredEdges() <= 1) {
-    assert(target->getStartingPathNumber() == NULL &&
-           "Target already has path number");
-    target->setStartingPathNumber(source->getEndingPathNumber());
-    target->setEndingPathNumber(source->getEndingPathNumber());
-    DEBUG(dbgs() << "  Passing path number"
-          << (source->getEndingPathNumber() ? "" : " (null)")
-          << " value through.\n");
-  } else {
-    if(target->getPathPHI() == NULL) {
-      DEBUG(dbgs() << "  Initializing PHI node for block '"
-            << target->getName() << "'\n");
-      preparePHI(target);
-    }
-    pushValueIntoPHI(target, source);
-    DEBUG(dbgs() << "  Passing number value into PHI for block '"
-          << target->getName() << "'\n");
-  }
-}
-
-// Inserts source's pathNumber Value* into the appropriate slot of
-// target's phiNode.
-void PathProfiler::pushValueIntoPHI(BLInstrumentationNode* target,
-                                    BLInstrumentationNode* source) {
-  PHINode* phi = target->getPathPHI();
-  assert(phi != NULL && "  Tried to push value into node with PHI, but node"
-         " actually had no PHI.");
-  phi->removeIncomingValue(source->getBlock(), false);
-  phi->addIncoming(source->getEndingPathNumber(), source->getBlock());
-}
-
-// The Value* in node, oldVal,  is updated with a Value* correspodning to
-// oldVal + addition.
-void PathProfiler::insertNumberIncrement(BLInstrumentationNode* node,
-                                         Value* addition, bool atBeginning) {
-  BasicBlock* block = node->getBlock();
-  assert(node->getStartingPathNumber() != NULL);
-  assert(node->getEndingPathNumber() != NULL);
-
-  BasicBlock::iterator insertPoint;
-
-  if( atBeginning )
-    insertPoint = block->getFirstInsertionPt();
-  else
-    insertPoint = block->getTerminator();
-
-  DEBUG(errs() << "  Creating addition instruction.\n");
-  Value* newpn = BinaryOperator::Create(Instruction::Add,
-                                        node->getStartingPathNumber(),
-                                        addition, "pathNumber", insertPoint);
-
-  node->setEndingPathNumber(newpn);
-
-  if( atBeginning )
-    node->setStartingPathNumber(newpn);
-}
-
-// Creates a counter increment in the given node.  The Value* in node is
-// taken as the index into an array or hash table.  The hash table access
-// is a call to the runtime.
-void PathProfiler::insertCounterIncrement(Value* incValue,
-                                          BasicBlock::iterator insertPoint,
-                                          BLInstrumentationDag* dag,
-                                          bool increment) {
-  // Counter increment for array
-  if( dag->getNumberOfPaths() <= HASH_THRESHHOLD ) {
-    // Get pointer to the array location
-    std::vector<Value*> gepIndices(2);
-    gepIndices[0] = Constant::getNullValue(Type::getInt32Ty(*Context));
-    gepIndices[1] = incValue;
-
-    GetElementPtrInst* pcPointer =
-      GetElementPtrInst::Create(dag->getCounterArray(), gepIndices,
-                                "counterInc", insertPoint);
-
-    // Load from the array - call it oldPC
-    LoadInst* oldPc = new LoadInst(pcPointer, "oldPC", insertPoint);
-
-    // Test to see whether adding 1 will overflow the counter
-    ICmpInst* isMax = new ICmpInst(insertPoint, CmpInst::ICMP_ULT, oldPc,
-                                   createIncrementConstant(0xffffffff, 32),
-                                   "isMax");
-
-    // Select increment for the path counter based on overflow
-    SelectInst* inc =
-      SelectInst::Create( isMax, createIncrementConstant(increment?1:-1,32),
-                          createIncrementConstant(0,32),
-                          "pathInc", insertPoint);
-
-    // newPc = oldPc + inc
-    BinaryOperator* newPc = BinaryOperator::Create(Instruction::Add,
-                                                   oldPc, inc, "newPC",
-                                                   insertPoint);
-
-    // Store back in to the array
-    new StoreInst(newPc, pcPointer, insertPoint);
-  } else { // Counter increment for hash
-    std::vector<Value*> args(2);
-    args[0] = ConstantInt::get(Type::getInt32Ty(*Context),
-                               currentFunctionNumber);
-    args[1] = incValue;
-
-    CallInst::Create(
-      increment ? llvmIncrementHashFunction : llvmDecrementHashFunction,
-      args, "", insertPoint);
-  }
-}
-
-// Inserts instrumentation for the given edge
-//
-// Pre: The edge's source node has pathNumber set if edge is non zero
-// path number increment.
-//
-// Post: Edge's target node has a pathNumber set to the path number Value
-// corresponding to the value of the path register after edge's
-// execution.
-//
-// FIXME: This should be reworked so it's not recursive.
-void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge,
-                                                   BLInstrumentationDag* dag) {
-  // Mark the edge as instrumented
-  edge->setHasInstrumentation(true);
-  DEBUG(dbgs() << "\nInstrumenting edge: " << (*edge) << "\n");
-
-  // create a new node for this edge's instrumentation
-  splitCritical(edge, dag);
-
-  BLInstrumentationNode* sourceNode = (BLInstrumentationNode*)edge->getSource();
-  BLInstrumentationNode* targetNode = (BLInstrumentationNode*)edge->getTarget();
-  BLInstrumentationNode* instrumentNode;
-  BLInstrumentationNode* nextSourceNode;
-
-  bool atBeginning = false;
-
-  // Source node has only 1 successor so any information can be simply
-  // inserted in to it without splitting
-  if( sourceNode->getBlock() && sourceNode->getNumberSuccEdges() <= 1) {
-    DEBUG(dbgs() << "  Potential instructions to be placed in: "
-          << sourceNode->getName() << " (at end)\n");
-    instrumentNode = sourceNode;
-    nextSourceNode = targetNode; // ... since we never made any new nodes
-  }
-
-  // The target node only has one predecessor, so we can safely insert edge
-  // instrumentation into it. If there was splitting, it must have been
-  // successful.
-  else if( targetNode->getNumberPredEdges() == 1 ) {
-    DEBUG(dbgs() << "  Potential instructions to be placed in: "
-          << targetNode->getName() << " (at beginning)\n");
-    pushValueIntoNode(sourceNode, targetNode);
-    instrumentNode = targetNode;
-    nextSourceNode = NULL; // ... otherwise we'll just keep splitting
-    atBeginning = true;
-  }
-
-  // Somehow, splitting must have failed.
-  else {
-    errs() << "Instrumenting could not split a critical edge.\n";
-    DEBUG(dbgs() << "  Couldn't split edge " << (*edge) << ".\n");
-    return;
-  }
-
-  // Insert instrumentation if this is a back or split edge
-  if( edge->getType() == BallLarusEdge::BACKEDGE ||
-      edge->getType() == BallLarusEdge::SPLITEDGE ) {
-    BLInstrumentationEdge* top =
-      (BLInstrumentationEdge*) edge->getPhonyRoot();
-    BLInstrumentationEdge* bottom =
-      (BLInstrumentationEdge*) edge->getPhonyExit();
-
-    assert( top->isInitialization() && " Top phony edge did not"
-            " contain a path number initialization.");
-    assert( bottom->isCounterIncrement() && " Bottom phony edge"
-            " did not contain a path counter increment.");
-
-    // split edge has yet to be initialized
-    if( !instrumentNode->getEndingPathNumber() ) {
-      instrumentNode->setStartingPathNumber(createIncrementConstant(0,32));
-      instrumentNode->setEndingPathNumber(createIncrementConstant(0,32));
-    }
-
-    BasicBlock::iterator insertPoint = atBeginning ?
-      instrumentNode->getBlock()->getFirstInsertionPt() :
-      instrumentNode->getBlock()->getTerminator();
-
-    // add information from the bottom edge, if it exists
-    if( bottom->getIncrement() ) {
-      Value* newpn =
-        BinaryOperator::Create(Instruction::Add,
-                               instrumentNode->getStartingPathNumber(),
-                               createIncrementConstant(bottom),
-                               "pathNumber", insertPoint);
-      instrumentNode->setEndingPathNumber(newpn);
-    }
-
-    insertCounterIncrement(instrumentNode->getEndingPathNumber(),
-                           insertPoint, dag);
-
-    if( atBeginning )
-      instrumentNode->setStartingPathNumber(createIncrementConstant(top));
-
-    instrumentNode->setEndingPathNumber(createIncrementConstant(top));
-
-    // Check for path counter increments
-    if( top->isCounterIncrement() ) {
-      insertCounterIncrement(instrumentNode->getEndingPathNumber(),
-                             instrumentNode->getBlock()->getTerminator(),dag);
-      instrumentNode->setEndingPathNumber(0);
-    }
-  }
-
-  // Insert instrumentation if this is a normal edge
-  else {
-    BasicBlock::iterator insertPoint = atBeginning ?
-      instrumentNode->getBlock()->getFirstInsertionPt() :
-      instrumentNode->getBlock()->getTerminator();
-
-    if( edge->isInitialization() ) { // initialize path number
-      instrumentNode->setEndingPathNumber(createIncrementConstant(edge));
-    } else if( edge->getIncrement() )       {// increment path number
-      Value* newpn =
-        BinaryOperator::Create(Instruction::Add,
-                               instrumentNode->getStartingPathNumber(),
-                               createIncrementConstant(edge),
-                               "pathNumber", insertPoint);
-      instrumentNode->setEndingPathNumber(newpn);
-
-      if( atBeginning )
-        instrumentNode->setStartingPathNumber(newpn);
-    }
-
-    // Check for path counter increments
-    if( edge->isCounterIncrement() ) {
-      insertCounterIncrement(instrumentNode->getEndingPathNumber(),
-                             insertPoint, dag);
-      instrumentNode->setEndingPathNumber(0);
-    }
-  }
-
-  // Push it along
-  if (nextSourceNode && instrumentNode->getEndingPathNumber())
-    pushValueIntoNode(instrumentNode, nextSourceNode);
-
-  // Add all the successors
-  for( BLEdgeIterator next = targetNode->succBegin(),
-         end = targetNode->succEnd(); next != end; next++ ) {
-    // So long as it is un-instrumented, add it to the list
-    if( !((BLInstrumentationEdge*)(*next))->hasInstrumentation() )
-      insertInstrumentationStartingAt((BLInstrumentationEdge*)*next,dag);
-    else
-      DEBUG(dbgs() << "  Edge " << *(BLInstrumentationEdge*)(*next)
-            << " already instrumented.\n");
-  }
-}
-
-// Inserts instrumentation according to the marked edges in dag.  Phony edges
-// must be unlinked from the DAG, but accessible from the backedges.  Dag
-// must have initializations, path number increments, and counter increments
-// present.
-//
-// Counter storage is created here.
-void PathProfiler::insertInstrumentation(
-  BLInstrumentationDag& dag, Module &M) {
-
-  BLInstrumentationEdge* exitRootEdge =
-    (BLInstrumentationEdge*) dag.getExitRootEdge();
-  insertInstrumentationStartingAt(exitRootEdge, &dag);
-
-  // Iterate through each call edge and apply the appropriate hash increment
-  // and decrement functions
-  BLEdgeVector callEdges = dag.getCallPhonyEdges();
-  for( BLEdgeIterator edge = callEdges.begin(),
-         end = callEdges.end(); edge != end; edge++ ) {
-    BLInstrumentationNode* node =
-      (BLInstrumentationNode*)(*edge)->getSource();
-    BasicBlock::iterator insertPoint = node->getBlock()->getFirstInsertionPt();
-
-    // Find the first function call
-    while( ((Instruction&)(*insertPoint)).getOpcode() != Instruction::Call )
-      insertPoint++;
-
-    DEBUG(dbgs() << "\nInstrumenting method call block '"
-                 << node->getBlock()->getName() << "'\n");
-    DEBUG(dbgs() << "   Path number initialized: "
-                 << ((node->getStartingPathNumber()) ? "yes" : "no") << "\n");
-
-    Value* newpn;
-    if( node->getStartingPathNumber() ) {
-      long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement();
-      if ( inc )
-        newpn = BinaryOperator::Create(Instruction::Add,
-                                       node->getStartingPathNumber(),
-                                       createIncrementConstant(inc,32),
-                                       "pathNumber", insertPoint);
-      else
-        newpn = node->getStartingPathNumber();
-    } else {
-      newpn = (Value*)createIncrementConstant(
-        ((BLInstrumentationEdge*)(*edge))->getIncrement(), 32);
-    }
-
-    insertCounterIncrement(newpn, insertPoint, &dag);
-    insertCounterIncrement(newpn, node->getBlock()->getTerminator(),
-                           &dag, false);
-  }
-}
-
-// Entry point of the module
-void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit,
-                                 Function &F, Module &M) {
-  // Build DAG from CFG
-  BLInstrumentationDag dag = BLInstrumentationDag(F);
-  dag.init();
-
-  // give each path a unique integer value
-  dag.calculatePathNumbers();
-
-  // modify path increments to increase the efficiency
-  // of instrumentation
-  dag.calculateSpanningTree();
-  dag.calculateChordIncrements();
-  dag.pushInitialization();
-  dag.pushCounters();
-  dag.unlinkPhony();
-
-  // potentially generate .dot graph for the dag
-  if (DotPathDag)
-    dag.generateDotGraph ();
-
-  // Should we store the information in an array or hash
-  if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) {
-    Type* t = ArrayType::get(Type::getInt32Ty(*Context),
-                                   dag.getNumberOfPaths());
-
-    dag.setCounterArray(new GlobalVariable(M, t, false,
-                                           GlobalValue::InternalLinkage,
-                                           Constant::getNullValue(t), ""));
-  }
-
-  insertInstrumentation(dag, M);
-
-  // Add to global function reference table
-  unsigned type;
-  Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context);
-
-  if( dag.getNumberOfPaths() <= HASH_THRESHHOLD )
-    type = ProfilingArray;
-  else
-    type = ProfilingHash;
-
-  std::vector<Constant*> entryArray(3);
-  entryArray[0] = createIncrementConstant(type,32);
-  entryArray[1] = createIncrementConstant(dag.getNumberOfPaths(),32);
-  entryArray[2] = dag.getCounterArray() ?
-    ConstantExpr::getBitCast(dag.getCounterArray(), voidPtr) :
-    Constant::getNullValue(voidPtr);
-
-  StructType* at = ftEntryTypeBuilder::get(*Context);
-  ConstantStruct* functionEntry =
-    (ConstantStruct*)ConstantStruct::get(at, entryArray);
-  ftInit.push_back(functionEntry);
-}
-
-// Output the bitcode if we want to observe instrumentation changess
-#define PRINT_MODULE dbgs() <<                               \
-  "\n\n============= MODULE BEGIN ===============\n" << M << \
-  "\n============== MODULE END ================\n"
-
-bool PathProfiler::runOnModule(Module &M) {
-  Context = &M.getContext();
-
-  DEBUG(dbgs()
-        << "****************************************\n"
-        << "****************************************\n"
-        << "**                                    **\n"
-        << "**   PATH PROFILING INSTRUMENTATION   **\n"
-        << "**                                    **\n"
-        << "****************************************\n"
-        << "****************************************\n");
-
-  // No main, no instrumentation!
-  Function *Main = M.getFunction("main");
-
-  // Using fortran? ... this kind of works
-  if (!Main)
-    Main = M.getFunction("MAIN__");
-
-  if (!Main) {
-    errs() << "WARNING: cannot insert path profiling into a module"
-           << " with no main function!\n";
-    return false;
-  }
-
-  llvmIncrementHashFunction = M.getOrInsertFunction(
-    "llvm_increment_path_count",
-    Type::getVoidTy(*Context), // return type
-    Type::getInt32Ty(*Context), // function number
-    Type::getInt32Ty(*Context), // path number
-    NULL );
-
-  llvmDecrementHashFunction = M.getOrInsertFunction(
-    "llvm_decrement_path_count",
-    Type::getVoidTy(*Context), // return type
-    Type::getInt32Ty(*Context), // function number
-    Type::getInt32Ty(*Context), // path number
-    NULL );
-
-  std::vector<Constant*> ftInit;
-  unsigned functionNumber = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) {
-    if (F->isDeclaration())
-      continue;
-
-    DEBUG(dbgs() << "Function: " << F->getName() << "\n");
-    functionNumber++;
-
-    // set function number
-    currentFunctionNumber = functionNumber;
-    runOnFunction(ftInit, *F, M);
-  }
-
-  Type *t = ftEntryTypeBuilder::get(*Context);
-  ArrayType* ftArrayType = ArrayType::get(t, ftInit.size());
-  Constant* ftInitConstant = ConstantArray::get(ftArrayType, ftInit);
-
-  DEBUG(dbgs() << " ftArrayType:" << *ftArrayType << "\n");
-
-  GlobalVariable* functionTable =
-    new GlobalVariable(M, ftArrayType, false, GlobalValue::InternalLinkage,
-                       ftInitConstant, "functionPathTable");
-  Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0);
-  InsertProfilingInitCall(Main, "llvm_start_path_profiling", functionTable,
-                          PointerType::getUnqual(eltType));
-
-  DEBUG(PRINT_MODULE);
-
-  return true;
-}
-
-// If this edge is a critical edge, then inserts a node at this edge.
-// This edge becomes the first edge, and a new BallLarusEdge is created.
-// Returns true if the edge was split
-bool PathProfiler::splitCritical(BLInstrumentationEdge* edge,
-                                 BLInstrumentationDag* dag) {
-  unsigned succNum = edge->getSuccessorNumber();
-  BallLarusNode* sourceNode = edge->getSource();
-  BallLarusNode* targetNode = edge->getTarget();
-  BasicBlock* sourceBlock = sourceNode->getBlock();
-  BasicBlock* targetBlock = targetNode->getBlock();
-
-  if(sourceBlock == NULL || targetBlock == NULL
-     || sourceNode->getNumberSuccEdges() <= 1
-     || targetNode->getNumberPredEdges() == 1 ) {
-    return(false);
-  }
-
-  TerminatorInst* terminator = sourceBlock->getTerminator();
-
-  if( SplitCriticalEdge(terminator, succNum, this, false)) {
-    BasicBlock* newBlock = terminator->getSuccessor(succNum);
-    dag->splitUpdate(edge, newBlock);
-    return(true);
-  } else
-    return(false);
-}
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
deleted file mode 100644
index 4b3de6d..0000000
--- a/lib/Transforms/Instrumentation/ProfilingUtils.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-//===- ProfilingUtils.cpp - Helper functions shared by profilers ----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a few helper functions which are used by profile
-// instrumentation code to instrument the code.  This allows the profiler pass
-// to worry about *what* to insert, and these functions take care of *how* to do
-// it.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ProfilingUtils.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-
-void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
-                                   GlobalValue *Array,
-                                   PointerType *arrayType) {
-  LLVMContext &Context = MainFn->getContext();
-  Type *ArgVTy =
-    PointerType::getUnqual(Type::getInt8PtrTy(Context));
-  PointerType *UIntPtr = arrayType ? arrayType :
-    Type::getInt32PtrTy(Context);
-  Module &M = *MainFn->getParent();
-  Constant *InitFn = M.getOrInsertFunction(FnName, Type::getInt32Ty(Context),
-                                           Type::getInt32Ty(Context),
-                                           ArgVTy, UIntPtr,
-                                           Type::getInt32Ty(Context),
-                                           (Type *)0);
-
-  // This could force argc and argv into programs that wouldn't otherwise have
-  // them, but instead we just pass null values in.
-  std::vector<Value*> Args(4);
-  Args[0] = Constant::getNullValue(Type::getInt32Ty(Context));
-  Args[1] = Constant::getNullValue(ArgVTy);
-
-  // Skip over any allocas in the entry block.
-  BasicBlock *Entry = MainFn->begin();
-  BasicBlock::iterator InsertPos = Entry->begin();
-  while (isa<AllocaInst>(InsertPos)) ++InsertPos;
-
-  std::vector<Constant*> GEPIndices(2,
-                             Constant::getNullValue(Type::getInt32Ty(Context)));
-  unsigned NumElements = 0;
-  if (Array) {
-    Args[2] = ConstantExpr::getGetElementPtr(Array, GEPIndices);
-    NumElements =
-      cast<ArrayType>(Array->getType()->getElementType())->getNumElements();
-  } else {
-    // If this profiling instrumentation doesn't have a constant array, just
-    // pass null.
-    Args[2] = ConstantPointerNull::get(UIntPtr);
-  }
-  Args[3] = ConstantInt::get(Type::getInt32Ty(Context), NumElements);
-
-  CallInst *InitCall = CallInst::Create(InitFn, Args, "newargc", InsertPos);
-
-  // If argc or argv are not available in main, just pass null values in.
-  Function::arg_iterator AI;
-  switch (MainFn->arg_size()) {
-  default:
-  case 2:
-    AI = MainFn->arg_begin(); ++AI;
-    if (AI->getType() != ArgVTy) {
-      Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy,
-                                                            false);
-      InitCall->setArgOperand(1,
-          CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall));
-    } else {
-      InitCall->setArgOperand(1, AI);
-    }
-    /* FALL THROUGH */
-
-  case 1:
-    AI = MainFn->arg_begin();
-    // If the program looked at argc, have it look at the return value of the
-    // init call instead.
-    if (!AI->getType()->isIntegerTy(32)) {
-      Instruction::CastOps opcode;
-      if (!AI->use_empty()) {
-        opcode = CastInst::getCastOpcode(InitCall, true, AI->getType(), true);
-        AI->replaceAllUsesWith(
-          CastInst::Create(opcode, InitCall, AI->getType(), "", InsertPos));
-      }
-      opcode = CastInst::getCastOpcode(AI, true,
-                                       Type::getInt32Ty(Context), true);
-      InitCall->setArgOperand(0,
-          CastInst::Create(opcode, AI, Type::getInt32Ty(Context),
-                           "argc.cast", InitCall));
-    } else {
-      AI->replaceAllUsesWith(InitCall);
-      InitCall->setArgOperand(0, AI);
-    }
-
-  case 0: break;
-  }
-}
-
-void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
-                                   GlobalValue *CounterArray, bool beginning) {
-  // Insert the increment after any alloca or PHI instructions...
-  BasicBlock::iterator InsertPos = beginning ? BB->getFirstInsertionPt() :
-                                   BB->getTerminator();
-  while (isa<AllocaInst>(InsertPos))
-    ++InsertPos;
-
-  LLVMContext &Context = BB->getContext();
-
-  // Create the getelementptr constant expression
-  std::vector<Constant*> Indices(2);
-  Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context));
-  Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum);
-  Constant *ElementPtr =
-    ConstantExpr::getGetElementPtr(CounterArray, Indices);
-
-  // Load, increment and store the value back.
-  Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos);
-  Value *NewVal = BinaryOperator::Create(Instruction::Add, OldVal,
-                                 ConstantInt::get(Type::getInt32Ty(Context), 1),
-                                         "NewFuncCounter", InsertPos);
-  new StoreInst(NewVal, ElementPtr, InsertPos);
-}
-
-void llvm::InsertProfilingShutdownCall(Function *Callee, Module *Mod) {
-  // llvm.global_dtors is an array of type { i32, void ()* }. Prepare those
-  // types.
-  Type *GlobalDtorElems[2] = {
-    Type::getInt32Ty(Mod->getContext()),
-    FunctionType::get(Type::getVoidTy(Mod->getContext()), false)->getPointerTo()
-  };
-  StructType *GlobalDtorElemTy =
-      StructType::get(Mod->getContext(), GlobalDtorElems, false);
-
-  // Construct the new element we'll be adding.
-  Constant *Elem[2] = {
-    ConstantInt::get(Type::getInt32Ty(Mod->getContext()), 65535),
-    ConstantExpr::getBitCast(Callee, GlobalDtorElems[1])
-  };
-
-  // If llvm.global_dtors exists, make a copy of the things in its list and
-  // delete it, to replace it with one that has a larger array type.
-  std::vector<Constant *> dtors;
-  if (GlobalVariable *GlobalDtors = Mod->getNamedGlobal("llvm.global_dtors")) {
-    if (ConstantArray *InitList =
-        dyn_cast<ConstantArray>(GlobalDtors->getInitializer())) {
-      for (unsigned i = 0, e = InitList->getType()->getNumElements();
-           i != e; ++i)
-        dtors.push_back(cast<Constant>(InitList->getOperand(i)));
-    }
-    GlobalDtors->eraseFromParent();
-  }
-
-  // Build up llvm.global_dtors with our new item in it.
-  GlobalVariable *GlobalDtors = new GlobalVariable(
-      *Mod, ArrayType::get(GlobalDtorElemTy, 1), false,
-      GlobalValue::AppendingLinkage, NULL, "llvm.global_dtors");
-                                    
-  dtors.push_back(ConstantStruct::get(GlobalDtorElemTy, Elem));
-  GlobalDtors->setInitializer(ConstantArray::get(
-      cast<ArrayType>(GlobalDtors->getType()->getElementType()), dtors));
-}
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.h b/lib/Transforms/Instrumentation/ProfilingUtils.h
deleted file mode 100644
index 09b2217..0000000
--- a/lib/Transforms/Instrumentation/ProfilingUtils.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===- ProfilingUtils.h - Helper functions shared by profilers --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a few helper functions which are used by profile
-// instrumentation code to instrument the code.  This allows the profiler pass
-// to worry about *what* to insert, and these functions take care of *how* to do
-// it.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PROFILINGUTILS_H
-#define PROFILINGUTILS_H
-
-namespace llvm {
-  class BasicBlock;
-  class Function;
-  class GlobalValue;
-  class Module;
-  class PointerType;
-
-  void InsertProfilingInitCall(Function *MainFn, const char *FnName,
-                               GlobalValue *Arr = 0,
-                               PointerType *arrayType = 0);
-  void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
-                               GlobalValue *CounterArray,
-                               bool beginning = true);
-  void InsertProfilingShutdownCall(Function *Callee, Module *Mod);
-}
-
-#endif
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index cc971a3..89fb746 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -227,7 +227,7 @@ bool ThreadSanitizer::doInitialization(Module &M) {
   TD = getAnalysisIfAvailable<DataLayout>();
   if (!TD)
     return false;
-  BL.reset(new SpecialCaseList(BlacklistFile));
+  BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
 
   // Always insert a call to __tsan_init into the module's CTORs.
   IRBuilder<> IRB(M.getContext());
@@ -240,12 +240,8 @@ bool ThreadSanitizer::doInitialization(Module &M) {
 }
 
 static bool isVtableAccess(Instruction *I) {
-  if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) {
-    if (Tag->getNumOperands() < 1) return false;
-    if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) {
-      if (Tag1->getString() == "vtable pointer") return true;
-    }
-  }
+  if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
+    return Tag->isTBAAVtableAccess();
   return false;
 }
 
@@ -362,7 +358,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
   // (e.g. variables that do not escape, etc).
 
   // Instrument memory accesses.
-  if (ClInstrumentMemoryAccesses)
+  if (ClInstrumentMemoryAccesses && F.hasFnAttribute(Attribute::SanitizeThread))
     for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) {
       Res |= instrumentLoadOrStore(AllLoadsAndStores[i]);
     }
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 6f94a7c..2976df6 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -176,91 +176,6 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
   return 0;
 }
 
-/// \brief Test whether the given retainable object pointer escapes.
-///
-/// This differs from regular escape analysis in that a use as an
-/// argument to a call is not considered an escape.
-///
-static bool DoesRetainableObjPtrEscape(const User *Ptr) {
-  DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Target: " << *Ptr << "\n");
-
-  // Walk the def-use chains.
-  SmallVector<const Value *, 4> Worklist;
-  Worklist.push_back(Ptr);
-  // If Ptr has any operands add them as well.
-  for (User::const_op_iterator I = Ptr->op_begin(), E = Ptr->op_end(); I != E;
-       ++I) {
-    Worklist.push_back(*I);
-  }
-
-  // Ensure we do not visit any value twice.
-  SmallPtrSet<const Value *, 8> VisitedSet;
-
-  do {
-    const Value *V = Worklist.pop_back_val();
-
-    DEBUG(dbgs() << "Visiting: " << *V << "\n");
-
-    for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
-         UI != UE; ++UI) {
-      const User *UUser = *UI;
-
-      DEBUG(dbgs() << "User: " << *UUser << "\n");
-
-      // Special - Use by a call (callee or argument) is not considered
-      // to be an escape.
-      switch (GetBasicInstructionClass(UUser)) {
-      case IC_StoreWeak:
-      case IC_InitWeak:
-      case IC_StoreStrong:
-      case IC_Autorelease:
-      case IC_AutoreleaseRV: {
-        DEBUG(dbgs() << "User copies pointer arguments. Pointer Escapes!\n");
-        // These special functions make copies of their pointer arguments.
-        return true;
-      }
-      case IC_IntrinsicUser:
-        // Use by the use intrinsic is not an escape.
-        continue;
-      case IC_User:
-      case IC_None:
-        // Use by an instruction which copies the value is an escape if the
-        // result is an escape.
-        if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) ||
-            isa<PHINode>(UUser) || isa<SelectInst>(UUser)) {
-
-          if (VisitedSet.insert(UUser)) {
-            DEBUG(dbgs() << "User copies value. Ptr escapes if result escapes."
-                  " Adding to list.\n");
-            Worklist.push_back(UUser);
-          } else {
-            DEBUG(dbgs() << "Already visited node.\n");
-          }
-          continue;
-        }
-        // Use by a load is not an escape.
-        if (isa<LoadInst>(UUser))
-          continue;
-        // Use by a store is not an escape if the use is the address.
-        if (const StoreInst *SI = dyn_cast<StoreInst>(UUser))
-          if (V != SI->getValueOperand())
-            continue;
-        break;
-      default:
-        // Regular calls and other stuff are not considered escapes.
-        continue;
-      }
-      // Otherwise, conservatively assume an escape.
-      DEBUG(dbgs() << "Assuming ptr escapes.\n");
-      return true;
-    }
-  } while (!Worklist.empty());
-
-  // No escapes found.
-  DEBUG(dbgs() << "Ptr does not escape.\n");
-  return false;
-}
-
 /// This is a wrapper around getUnderlyingObjCPtr along the lines of
 /// GetUnderlyingObjects except that it returns early when it sees the first
 /// alloca.
@@ -517,7 +432,7 @@ namespace {
     bool Partial;
 
     /// The current position in the sequence.
-    Sequence Seq : 8;
+    unsigned char Seq : 8;
 
     /// Unidirectional information about the current sequence.
     RRInfo RRI;
@@ -583,7 +498,7 @@ namespace {
     }
 
     Sequence GetSeq() const {
-      return Seq;
+      return static_cast<Sequence>(Seq);
     }
 
     void ClearSequenceProgress() {
@@ -623,7 +538,8 @@ namespace {
 
 void
 PtrState::Merge(const PtrState &Other, bool TopDown) {
-  Seq = MergeSeqs(Seq, Other.Seq, TopDown);
+  Seq = MergeSeqs(static_cast<Sequence>(Seq), static_cast<Sequence>(Other.Seq),
+                  TopDown);
   KnownPositiveRefCount &= Other.KnownPositiveRefCount;
 
   // If we're not in a sequence (anymore), drop all associated state.
@@ -674,7 +590,9 @@ namespace {
     SmallVector<BasicBlock *, 2> Succs;
 
   public:
-    BBState() : TopDownPathCount(0), BottomUpPathCount(0) {}
+    static const unsigned OverflowOccurredValue;
+
+    BBState() : TopDownPathCount(0), BottomUpPathCount(0) { }
 
     typedef MapTy::iterator ptr_iterator;
     typedef MapTy::const_iterator ptr_const_iterator;
@@ -745,27 +663,31 @@ namespace {
     /// Returns true if overflow occured. Returns false if overflow did not
     /// occur.
     bool GetAllPathCountWithOverflow(unsigned &PathCount) const {
-      assert(TopDownPathCount != 0);
-      assert(BottomUpPathCount != 0);
+      if (TopDownPathCount == OverflowOccurredValue ||
+          BottomUpPathCount == OverflowOccurredValue)
+        return true;
       unsigned long long Product =
         (unsigned long long)TopDownPathCount*BottomUpPathCount;
-      PathCount = Product;
-      // Overflow occured if any of the upper bits of Product are set.
-      return Product >> 32;
+      // Overflow occured if any of the upper bits of Product are set or if all
+      // the lower bits of Product are all set.
+      return (Product >> 32) ||
+             ((PathCount = Product) == OverflowOccurredValue);
     }
 
     // Specialized CFG utilities.
     typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator;
-    edge_iterator pred_begin() { return Preds.begin(); }
-    edge_iterator pred_end() { return Preds.end(); }
-    edge_iterator succ_begin() { return Succs.begin(); }
-    edge_iterator succ_end() { return Succs.end(); }
+    edge_iterator pred_begin() const { return Preds.begin(); }
+    edge_iterator pred_end() const { return Preds.end(); }
+    edge_iterator succ_begin() const { return Succs.begin(); }
+    edge_iterator succ_end() const { return Succs.end(); }
 
     void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
     void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
 
     bool isExit() const { return Succs.empty(); }
   };
+
+  const unsigned BBState::OverflowOccurredValue = 0xffffffff;
 }
 
 void BBState::InitFromPred(const BBState &Other) {
@@ -781,13 +703,25 @@ void BBState::InitFromSucc(const BBState &Other) {
 /// The top-down traversal uses this to merge information about predecessors to
 /// form the initial state for a new block.
 void BBState::MergePred(const BBState &Other) {
+  if (TopDownPathCount == OverflowOccurredValue)
+    return;
+
   // Other.TopDownPathCount can be 0, in which case it is either dead or a
   // loop backedge. Loop backedges are special.
   TopDownPathCount += Other.TopDownPathCount;
 
+  // In order to be consistent, we clear the top down pointers when by adding
+  // TopDownPathCount becomes OverflowOccurredValue even though "true" overflow
+  // has not occured.
+  if (TopDownPathCount == OverflowOccurredValue) {
+    clearTopDownPointers();
+    return;
+  }
+
   // Check for overflow. If we have overflow, fall back to conservative
   // behavior.
   if (TopDownPathCount < Other.TopDownPathCount) {
+    TopDownPathCount = OverflowOccurredValue;
     clearTopDownPointers();
     return;
   }
@@ -813,13 +747,25 @@ void BBState::MergePred(const BBState &Other) {
 /// The bottom-up traversal uses this to merge information about successors to
 /// form the initial state for a new block.
 void BBState::MergeSucc(const BBState &Other) {
+  if (BottomUpPathCount == OverflowOccurredValue)
+    return;
+
   // Other.BottomUpPathCount can be 0, in which case it is either dead or a
   // loop backedge. Loop backedges are special.
   BottomUpPathCount += Other.BottomUpPathCount;
 
+  // In order to be consistent, we clear the top down pointers when by adding
+  // BottomUpPathCount becomes OverflowOccurredValue even though "true" overflow
+  // has not occured.
+  if (BottomUpPathCount == OverflowOccurredValue) {
+    clearBottomUpPointers();
+    return;
+  }
+
   // Check for overflow. If we have overflow, fall back to conservative
   // behavior.
   if (BottomUpPathCount < Other.BottomUpPathCount) {
+    BottomUpPathCount = OverflowOccurredValue;
     clearBottomUpPointers();
     return;
   }
@@ -1158,13 +1104,9 @@ namespace {
     unsigned ARCAnnotationProvenanceSourceMDKind;
 #endif // ARC_ANNOATIONS
 
-    bool IsRetainBlockOptimizable(const Instruction *Inst);
-
     bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
     void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
                                    InstructionClass &Class);
-    bool OptimizeRetainBlockCall(Function &F, Instruction *RetainBlock,
-                                 InstructionClass &Class);
     void OptimizeIndividualCalls(Function &F);
 
     void CheckForCFGHazards(const BasicBlock *BB,
@@ -1253,22 +1195,6 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
 }
 
-bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) {
-  // Without the magic metadata tag, we have to assume this might be an
-  // objc_retainBlock call inserted to convert a block pointer to an id,
-  // in which case it really is needed.
-  if (!Inst->getMetadata(CopyOnEscapeMDKind))
-    return false;
-
-  // If the pointer "escapes" (not including being used in a call),
-  // the copy may be needed.
-  if (DoesRetainableObjPtrEscape(Inst))
-    return false;
-
-  // Otherwise, it's not needed.
-  return true;
-}
-
 /// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
 /// not a return value.  Or, if it can be paired with an
 /// objc_autoreleaseReturnValue, delete the pair and return true.
@@ -1369,41 +1295,6 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
 
 }
 
-// \brief Attempt to strength reduce objc_retainBlock calls to objc_retain
-// calls.
-//
-// Specifically: If an objc_retainBlock call has the copy_on_escape metadata and
-// does not escape (following the rules of block escaping), strength reduce the
-// objc_retainBlock to an objc_retain.
-//
-// TODO: If an objc_retainBlock call is dominated period by a previous
-// objc_retainBlock call, strength reduce the objc_retainBlock to an
-// objc_retain.
-bool
-ObjCARCOpt::OptimizeRetainBlockCall(Function &F, Instruction *Inst,
-                                    InstructionClass &Class) {
-  assert(GetBasicInstructionClass(Inst) == Class);
-  assert(IC_RetainBlock == Class);
-
-  // If we can not optimize Inst, return false.
-  if (!IsRetainBlockOptimizable(Inst))
-    return false;
-
-  Changed = true;
-  ++NumPeeps;
-
-  DEBUG(dbgs() << "Strength reduced retainBlock => retain.\n");
-  DEBUG(dbgs() << "Old: " << *Inst << "\n");
-  CallInst *RetainBlock = cast<CallInst>(Inst);
-  Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
-  RetainBlock->setCalledFunction(NewDecl);
-  // Remove copy_on_escape metadata.
-  RetainBlock->setMetadata(CopyOnEscapeMDKind, 0);
-  Class = IC_Retain;
-  DEBUG(dbgs() << "New: " << *Inst << "\n");
-  return true;
-}
-
 /// Visit each call, one at a time, and make simplifications without doing any
 /// additional analysis.
 void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
@@ -1480,11 +1371,6 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
       }
       break;
     }
-    case IC_RetainBlock:
-      // If we strength reduce an objc_retainBlock to an objc_retain, continue
-      // onto the objc_retain peephole optimizations. Otherwise break.
-      OptimizeRetainBlockCall(F, Inst, Class);
-      break;
     case IC_RetainRV:
       if (OptimizeRetainRVCall(F, Inst))
         continue;
@@ -2520,15 +2406,26 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
         if (Jt == Releases.end())
           return false;
         const RRInfo &NewRetainReleaseRRI = Jt->second;
-        assert(NewRetainReleaseRRI.Calls.count(NewRetain));
+
+        // If the release does not have a reference to the retain as well,
+        // something happened which is unaccounted for. Do not do anything.
+        //
+        // This can happen if we catch an additive overflow during path count
+        // merging.
+        if (!NewRetainReleaseRRI.Calls.count(NewRetain))
+          return false;
+
         if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
 
           // If we overflow when we compute the path count, don't remove/move
           // anything.
           const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()];
-          unsigned PathCount;
+          unsigned PathCount = BBState::OverflowOccurredValue;
           if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
             return false;
+          assert(PathCount != BBState::OverflowOccurredValue &&
+                 "PathCount at this point can not be "
+                 "OverflowOccurredValue.");
           OldDelta -= PathCount;
 
           // Merge the ReleaseMetadata and IsTailCallRelease values.
@@ -2558,8 +2455,12 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
                 // If we overflow when we compute the path count, don't
                 // remove/move anything.
                 const BBState &RIPBBState = BBStates[RIP->getParent()];
+                PathCount = BBState::OverflowOccurredValue;
                 if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
                   return false;
+                assert(PathCount != BBState::OverflowOccurredValue &&
+                       "PathCount at this point can not be "
+                       "OverflowOccurredValue.");
                 NewDelta -= PathCount;
               }
             }
@@ -2589,15 +2490,25 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
         if (Jt == Retains.end())
           return false;
         const RRInfo &NewReleaseRetainRRI = Jt->second;
-        assert(NewReleaseRetainRRI.Calls.count(NewRelease));
-        if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
 
+        // If the retain does not have a reference to the release as well,
+        // something happened which is unaccounted for. Do not do anything.
+        //
+        // This can happen if we catch an additive overflow during path count
+        // merging.
+        if (!NewReleaseRetainRRI.Calls.count(NewRelease))
+          return false;
+
+        if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
           // If we overflow when we compute the path count, don't remove/move
           // anything.
           const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()];
-          unsigned PathCount;
+          unsigned PathCount = BBState::OverflowOccurredValue;
           if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
             return false;
+          assert(PathCount != BBState::OverflowOccurredValue &&
+                 "PathCount at this point can not be "
+                 "OverflowOccurredValue.");
           OldDelta += PathCount;
           OldCount += PathCount;
 
@@ -2612,8 +2523,13 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
                 // If we overflow when we compute the path count, don't
                 // remove/move anything.
                 const BBState &RIPBBState = BBStates[RIP->getParent()];
+
+                PathCount = BBState::OverflowOccurredValue;
                 if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
                   return false;
+                assert(PathCount != BBState::OverflowOccurredValue &&
+                       "PathCount at this point can not be "
+                       "OverflowOccurredValue.");
                 NewDelta += PathCount;
                 NewCount += PathCount;
               }
diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
deleted file mode 100644
index e755008..0000000
--- a/lib/Transforms/Scalar/BasicBlockPlacement.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a very simple profile guided basic block placement
-// algorithm.  The idea is to put frequently executed blocks together at the
-// start of the function, and hopefully increase the number of fall-through
-// conditional branches.  If there is no profile information for a particular
-// function, this pass basically orders blocks in depth-first order
-//
-// The algorithm implemented here is basically "Algo1" from "Profile Guided Code
-// Positioning" by Pettis and Hansen, except that it uses basic block counts
-// instead of edge counts.  This should be improved in many ways, but is very
-// simple for now.
-//
-// Basically we "place" the entry block, then loop over all successors in a DFO,
-// placing the most frequently executed successor until we run out of blocks.  I
-// told you this was _extremely_ simplistic. :) This is also much slower than it
-// could be.  When it becomes important, this pass will be rewritten to use a
-// better algorithm, and then we can worry about efficiency.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "block-placement"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include <set>
-using namespace llvm;
-
-STATISTIC(NumMoved, "Number of basic blocks moved");
-
-namespace {
-  struct BlockPlacement : public FunctionPass {
-    static char ID; // Pass identification, replacement for typeid
-    BlockPlacement() : FunctionPass(ID) {
-      initializeBlockPlacementPass(*PassRegistry::getPassRegistry());
-    }
-
-    virtual bool runOnFunction(Function &F);
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesCFG();
-      AU.addRequired<ProfileInfo>();
-      //AU.addPreserved<ProfileInfo>();  // Does this work?
-    }
-  private:
-    /// PI - The profile information that is guiding us.
-    ///
-    ProfileInfo *PI;
-
-    /// NumMovedBlocks - Every time we move a block, increment this counter.
-    ///
-    unsigned NumMovedBlocks;
-
-    /// PlacedBlocks - Every time we place a block, remember it so we don't get
-    /// into infinite loops.
-    std::set<BasicBlock*> PlacedBlocks;
-
-    /// InsertPos - This an iterator to the next place we want to insert a
-    /// block.
-    Function::iterator InsertPos;
-
-    /// PlaceBlocks - Recursively place the specified blocks and any unplaced
-    /// successors.
-    void PlaceBlocks(BasicBlock *BB);
-  };
-}
-
-char BlockPlacement::ID = 0;
-INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement",
-                "Profile Guided Basic Block Placement", false, false)
-INITIALIZE_AG_DEPENDENCY(ProfileInfo)
-INITIALIZE_PASS_END(BlockPlacement, "block-placement",
-                "Profile Guided Basic Block Placement", false, false)
-
-FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); }
-
-bool BlockPlacement::runOnFunction(Function &F) {
-  PI = &getAnalysis<ProfileInfo>();
-
-  NumMovedBlocks = 0;
-  InsertPos = F.begin();
-
-  // Recursively place all blocks.
-  PlaceBlocks(F.begin());
-
-  PlacedBlocks.clear();
-  NumMoved += NumMovedBlocks;
-  return NumMovedBlocks != 0;
-}
-
-
-/// PlaceBlocks - Recursively place the specified blocks and any unplaced
-/// successors.
-void BlockPlacement::PlaceBlocks(BasicBlock *BB) {
-  assert(!PlacedBlocks.count(BB) && "Already placed this block!");
-  PlacedBlocks.insert(BB);
-
-  // Place the specified block.
-  if (&*InsertPos != BB) {
-    // Use splice to move the block into the right place.  This avoids having to
-    // remove the block from the function then readd it, which causes a bunch of
-    // symbol table traffic that is entirely pointless.
-    Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList();
-    Blocks.splice(InsertPos, Blocks, BB);
-
-    ++NumMovedBlocks;
-  } else {
-    // This block is already in the right place, we don't have to do anything.
-    ++InsertPos;
-  }
-
-  // Keep placing successors until we run out of ones to place.  Note that this
-  // loop is very inefficient (N^2) for blocks with many successors, like switch
-  // statements.  FIXME!
-  while (1) {
-    // Okay, now place any unplaced successors.
-    succ_iterator SI = succ_begin(BB), E = succ_end(BB);
-
-    // Scan for the first unplaced successor.
-    for (; SI != E && PlacedBlocks.count(*SI); ++SI)
-      /*empty*/;
-    if (SI == E) return;  // No more successors to place.
-
-    double MaxExecutionCount = PI->getExecutionCount(*SI);
-    BasicBlock *MaxSuccessor = *SI;
-
-    // Scan for more frequently executed successors
-    for (; SI != E; ++SI)
-      if (!PlacedBlocks.count(*SI)) {
-        double Count = PI->getExecutionCount(*SI);
-        if (Count > MaxExecutionCount ||
-            // Prefer to not disturb the code.
-            (Count == MaxExecutionCount && *SI == &*InsertPos)) {
-          MaxExecutionCount = Count;
-          MaxSuccessor = *SI;
-        }
-      }
-
-    // Now that we picked the maximally executed successor, place it.
-    PlaceBlocks(MaxSuccessor);
-  }
-}
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index f5d1db1..626c810 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -1,6 +1,5 @@
 add_llvm_library(LLVMScalarOpts
   ADCE.cpp
-  BasicBlockPlacement.cpp
   CodeGenPrepare.cpp
   ConstantProp.cpp
   CorrelatedValuePropagation.cpp
@@ -17,12 +16,15 @@ add_llvm_library(LLVMScalarOpts
   LoopInstSimplify.cpp
   LoopRotation.cpp
   LoopStrengthReduce.cpp
+  LoopRerollPass.cpp
   LoopUnrollPass.cpp
   LoopUnswitch.cpp
   LowerAtomic.cpp
   MemCpyOptimizer.cpp
+  PartiallyInlineLibCalls.cpp
   Reassociate.cpp
   Reg2Mem.cpp
+  SampleProfile.cpp
   SCCP.cpp
   SROA.cpp
   Scalar.cpp
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 44804a2..007e9b7 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -22,7 +22,6 @@
 #include "llvm/Analysis/DominatorInternals.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/Assembly/Writer.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -80,7 +79,6 @@ namespace {
     const TargetLowering *TLI;
     const TargetLibraryInfo *TLInfo;
     DominatorTree *DT;
-    ProfileInfo *PFI;
 
     /// CurInstIterator - As we scan instructions optimizing them, this is the
     /// next instruction to optimize.  Xforms that can invalidate this should
@@ -111,7 +109,6 @@ namespace {
 
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addPreserved<DominatorTree>();
-      AU.addPreserved<ProfileInfo>();
       AU.addRequired<TargetLibraryInfo>();
     }
 
@@ -151,7 +148,6 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   if (TM) TLI = TM->getTargetLowering();
   TLInfo = &getAnalysis<TargetLibraryInfo>();
   DT = getAnalysisIfAvailable<DominatorTree>();
-  PFI = getAnalysisIfAvailable<ProfileInfo>();
   OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                            Attribute::OptimizeForSize);
 
@@ -442,10 +438,6 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
     DT->changeImmediateDominator(DestBB, NewIDom);
     DT->eraseNode(BB);
   }
-  if (PFI) {
-    PFI->replaceAllUses(BB, DestBB);
-    PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB));
-  }
   BB->eraseFromParent();
   ++NumBlocksElim;
 
@@ -840,10 +832,12 @@ struct ExtAddrMode : public TargetLowering::AddrMode {
   }
 };
 
+#ifndef NDEBUG
 static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
   AM.print(OS);
   return OS;
 }
+#endif
 
 void ExtAddrMode::print(raw_ostream &OS) const {
   bool NeedPlus = false;
@@ -1035,7 +1029,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
   case Instruction::IntToPtr:
     // This inttoptr is a no-op if the integer type is pointer sized.
     if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
-        TLI.getPointerTy())
+        TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace()))
       return MatchAddr(AddrInst->getOperand(0), Depth);
     return false;
   case Instruction::BitCast:
@@ -1418,8 +1412,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
     Value *Address = User->getOperand(OpNo);
     if (!Address->getType()->isPointerTy())
       return false;
-    Type *AddressAccessTy =
-      cast<PointerType>(Address->getType())->getElementType();
+    Type *AddressAccessTy = Address->getType()->getPointerElementType();
 
     // Do a match against the root of this address, ignoring profitability. This
     // will tell us if the addressing mode for the memory operation will
@@ -1573,9 +1566,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   } else {
     DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
                  << *MemoryInst);
-    Type *IntPtrTy =
-          TLI->getDataLayout()->getIntPtrType(AccessTy->getContext());
-
+    Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType());
     Value *Result = 0;
 
     // Start with the base register. Do this first so that subsequent address
@@ -1894,7 +1885,8 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
     // trivial PHI, go ahead and zap it here.
-    if (Value *V = SimplifyInstruction(P)) {
+    if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : 0,
+                                       TLInfo, DT)) {
       P->replaceAllUsesWith(V);
       P->eraseFromParent();
       ++NumPHIsElim;
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 3c08634..5266894 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -72,11 +72,6 @@ namespace {
 }
 
 namespace llvm {
-// SimpleValue is POD.
-template<> struct isPodLike<SimpleValue> {
-  static const bool value = true;
-};
-
 template<> struct DenseMapInfo<SimpleValue> {
   static inline SimpleValue getEmptyKey() {
     return DenseMapInfo<Instruction*>::getEmptyKey();
@@ -220,11 +215,6 @@ namespace {
 }
 
 namespace llvm {
-  // CallValue is POD.
-  template<> struct isPodLike<CallValue> {
-    static const bool value = true;
-  };
-
   template<> struct DenseMapInfo<CallValue> {
     static inline CallValue getEmptyKey() {
       return DenseMapInfo<Instruction*>::getEmptyKey();
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index bc418af..6af269d 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CFG.h"
@@ -507,7 +508,9 @@ namespace {
     enum ValType {
       SimpleVal,  // A simple offsetted value that is accessed.
       LoadVal,    // A value produced by a load.
-      MemIntrin   // A memory intrinsic which is loaded from.
+      MemIntrin,  // A memory intrinsic which is loaded from.
+      UndefVal    // A UndefValue representing a value from dead block (which
+                  // is not yet physically removed from the CFG). 
     };
   
     /// V - The value that is live out of the block.
@@ -545,10 +548,20 @@ namespace {
       Res.Offset = Offset;
       return Res;
     }
-  
+
+    static AvailableValueInBlock getUndef(BasicBlock *BB) {
+      AvailableValueInBlock Res;
+      Res.BB = BB;
+      Res.Val.setPointer(0);
+      Res.Val.setInt(UndefVal);
+      Res.Offset = 0;
+      return Res;
+    }
+
     bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
     bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
     bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+    bool isUndefValue() const { return Val.getInt() == UndefVal; }
   
     Value *getSimpleValue() const {
       assert(isSimpleValue() && "Wrong accessor");
@@ -576,6 +589,7 @@ namespace {
     DominatorTree *DT;
     const DataLayout *TD;
     const TargetLibraryInfo *TLI;
+    SetVector<BasicBlock *> DeadBlocks;
 
     ValueTable VN;
 
@@ -698,6 +712,9 @@ namespace {
     unsigned replaceAllDominatedUsesWith(Value *From, Value *To,
                                          const BasicBlockEdge &Root);
     bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root);
+    bool processFoldableCondBr(BranchInst *BI);
+    void addDeadBlock(BasicBlock *BB);
+    void assignValNumForDeadCode();
   };
 
   char GVN::ID = 0;
@@ -1071,14 +1088,15 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
   if (Offset == -1)
     return Offset;
 
+  unsigned AS = Src->getType()->getPointerAddressSpace();
   // Otherwise, see if we can constant fold a load from the constant with the
   // offset applied as appropriate.
   Src = ConstantExpr::getBitCast(Src,
-                                 llvm::Type::getInt8PtrTy(Src->getContext()));
+                                 Type::getInt8PtrTy(Src->getContext(), AS));
   Constant *OffsetCst =
     ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
   Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
-  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+  Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
   if (ConstantFoldLoadFromConstPtr(Src, &TD))
     return Offset;
   return -1;
@@ -1155,7 +1173,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
     Type *DestPTy =
       IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
     DestPTy = PointerType::get(DestPTy,
-                       cast<PointerType>(PtrVal->getType())->getAddressSpace());
+                               PtrVal->getType()->getPointerAddressSpace());
     Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
     PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
     LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
@@ -1230,15 +1248,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
   // Otherwise, this is a memcpy/memmove from a constant global.
   MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
   Constant *Src = cast<Constant>(MTI->getSource());
+  unsigned AS = Src->getType()->getPointerAddressSpace();
 
   // Otherwise, see if we can constant fold a load from the constant with the
   // offset applied as appropriate.
   Src = ConstantExpr::getBitCast(Src,
-                                 llvm::Type::getInt8PtrTy(Src->getContext()));
+                                 Type::getInt8PtrTy(Src->getContext(), AS));
   Constant *OffsetCst =
-  ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
   Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
-  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+  Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
   return ConstantFoldLoadFromConstPtr(Src, &TD);
 }
 
@@ -1253,8 +1272,10 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
   // just use the dominating value directly.
   if (ValuesPerBlock.size() == 1 &&
       gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
-                                               LI->getParent()))
+                                               LI->getParent())) {
+    assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block");
     return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn);
+  }
 
   // Otherwise, we have to construct SSA form.
   SmallVector<PHINode*, 8> NewPHIs;
@@ -1324,7 +1345,7 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
                    << *getCoercedLoadValue() << '\n'
                    << *Res << '\n' << "\n\n\n");
     }
-  } else {
+  } else if (isMemIntrinValue()) {
     const DataLayout *TD = gvn.getDataLayout();
     assert(TD && "Need target data to handle type mismatch case");
     Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
@@ -1332,6 +1353,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
     DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
                  << "  " << *getMemIntrinValue() << '\n'
                  << *Res << '\n' << "\n\n\n");
+  } else {
+    assert(isUndefValue() && "Should be UndefVal");
+    DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+    return UndefValue::get(LoadTy);
   }
   return Res;
 }
@@ -1355,6 +1380,13 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
     BasicBlock *DepBB = Deps[i].getBB();
     MemDepResult DepInfo = Deps[i].getResult();
 
+    if (DeadBlocks.count(DepBB)) {
+      // Dead dependent mem-op disguise as a load evaluating the same value
+      // as the load in question.
+      ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+      continue;
+    }
+
     if (!DepInfo.isDef() && !DepInfo.isClobber()) {
       UnavailableBlocks.push_back(DepBB);
       continue;
@@ -2191,11 +2223,13 @@ bool GVN::processInstruction(Instruction *I) {
   // For conditional branches, we can perform simple conditional propagation on
   // the condition value itself.
   if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
-    if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+    if (!BI->isConditional())
       return false;
 
-    Value *BranchCond = BI->getCondition();
+    if (isa<Constant>(BI->getCondition()))
+      return processFoldableCondBr(BI);
 
+    Value *BranchCond = BI->getCondition();
     BasicBlock *TrueSucc = BI->getSuccessor(0);
     BasicBlock *FalseSucc = BI->getSuccessor(1);
     // Avoid multiple edges early.
@@ -2312,6 +2346,9 @@ bool GVN::runOnFunction(Function& F) {
   }
 
   if (EnablePRE) {
+    // Fabricate val-num for dead-code in order to suppress assertion in
+    // performPRE().
+    assignValNumForDeadCode();
     bool PREChanged = true;
     while (PREChanged) {
       PREChanged = performPRE(F);
@@ -2325,6 +2362,9 @@ bool GVN::runOnFunction(Function& F) {
   // Actually, when this happens, we should just fully integrate PRE into GVN.
 
   cleanupGlobalSets();
+  // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each
+  // iteration. 
+  DeadBlocks.clear();
 
   return Changed;
 }
@@ -2335,6 +2375,9 @@ bool GVN::processBlock(BasicBlock *BB) {
   // (and incrementing BI before processing an instruction).
   assert(InstrsToErase.empty() &&
          "We expect InstrsToErase to be empty across iterations");
+  if (DeadBlocks.count(BB))
+    return false;
+
   bool ChangedFunction = false;
 
   for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
@@ -2628,3 +2671,133 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
     }
   }
 }
+
+// BB is declared dead, which implied other blocks become dead as well. This
+// function is to add all these blocks to "DeadBlocks". For the dead blocks'
+// live successors, update their phi nodes by replacing the operands
+// corresponding to dead blocks with UndefVal.
+//
+void GVN::addDeadBlock(BasicBlock *BB) {
+  SmallVector<BasicBlock *, 4> NewDead;
+  SmallSetVector<BasicBlock *, 4> DF;
+
+  NewDead.push_back(BB);
+  while (!NewDead.empty()) {
+    BasicBlock *D = NewDead.pop_back_val();
+    if (DeadBlocks.count(D))
+      continue;
+
+    // All blocks dominated by D are dead.
+    SmallVector<BasicBlock *, 8> Dom;
+    DT->getDescendants(D, Dom);
+    DeadBlocks.insert(Dom.begin(), Dom.end());
+    
+    // Figure out the dominance-frontier(D).
+    for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(),
+           E = Dom.end(); I != E; I++) {
+      BasicBlock *B = *I;
+      for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) {
+        BasicBlock *S = *SI;
+        if (DeadBlocks.count(S))
+          continue;
+
+        bool AllPredDead = true;
+        for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++)
+          if (!DeadBlocks.count(*PI)) {
+            AllPredDead = false;
+            break;
+          }
+
+        if (!AllPredDead) {
+          // S could be proved dead later on. That is why we don't update phi
+          // operands at this moment.
+          DF.insert(S);
+        } else {
+          // While S is not dominated by D, it is dead by now. This could take
+          // place if S already have a dead predecessor before D is declared
+          // dead.
+          NewDead.push_back(S);
+        }
+      }
+    }
+  }
+
+  // For the dead blocks' live successors, update their phi nodes by replacing
+  // the operands corresponding to dead blocks with UndefVal.
+  for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+        I != E; I++) {
+    BasicBlock *B = *I;
+    if (DeadBlocks.count(B))
+      continue;
+
+    SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B));
+    for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(),
+           PE = Preds.end(); PI != PE; PI++) {
+      BasicBlock *P = *PI;
+
+      if (!DeadBlocks.count(P))
+        continue;
+
+      if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) {
+        if (BasicBlock *S = splitCriticalEdges(P, B))
+          DeadBlocks.insert(P = S);
+      }
+
+      for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) {
+        PHINode &Phi = cast<PHINode>(*II);
+        Phi.setIncomingValue(Phi.getBasicBlockIndex(P),
+                             UndefValue::get(Phi.getType()));
+      }
+    }
+  }
+}
+
+// If the given branch is recognized as a foldable branch (i.e. conditional
+// branch with constant condition), it will perform following analyses and
+// transformation.
+//  1) If the dead out-coming edge is a critical-edge, split it. Let 
+//     R be the target of the dead out-coming edge.
+//  1) Identify the set of dead blocks implied by the branch's dead outcoming
+//     edge. The result of this step will be {X| X is dominated by R}
+//  2) Identify those blocks which haves at least one dead prodecessor. The
+//     result of this step will be dominance-frontier(R).
+//  3) Update the PHIs in DF(R) by replacing the operands corresponding to 
+//     dead blocks with "UndefVal" in an hope these PHIs will optimized away.
+//
+// Return true iff *NEW* dead code are found.
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+  if (!BI || BI->isUnconditional())
+    return false;
+
+  ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+  if (!Cond)
+    return false;
+
+  BasicBlock *DeadRoot = Cond->getZExtValue() ? 
+                         BI->getSuccessor(1) : BI->getSuccessor(0);
+  if (DeadBlocks.count(DeadRoot))
+    return false;
+
+  if (!DeadRoot->getSinglePredecessor())
+    DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+  addDeadBlock(DeadRoot);
+  return true;
+}
+
+// performPRE() will trigger assert if it come across an instruciton without
+// associated val-num. As it normally has far more live instructions than dead
+// instructions, it makes more sense just to "fabricate" a val-number for the
+// dead code than checking if instruction involved is dead or not.
+void GVN::assignValNumForDeadCode() {
+  for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(),
+        E = DeadBlocks.end(); I != E; I++) {
+    BasicBlock *BB = *I;
+    for (BasicBlock::iterator II = BB->begin(), EE = BB->end();
+          II != EE; II++) {
+      Instruction *Inst = &*II;
+      unsigned ValNum = VN.lookup_or_add(Inst);
+      addToLeaderTable(ValNum, Inst, BB);
+    }
+  }
+}
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index d51e034..235aaaa 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -532,7 +532,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
         // and varies predictably *inside* the loop.  Evaluate the value it
         // contains when the loop exits, if possible.
         const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
-        if (!SE->isLoopInvariant(ExitValue, L) || !isSafeToExpand(ExitValue))
+        if (!SE->isLoopInvariant(ExitValue, L) ||
+            !isSafeToExpand(ExitValue, *SE))
           continue;
 
         // Computing the value outside of the loop brings no benefit if :
@@ -1479,8 +1480,14 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
   if (IndVar->getType()->isPointerTy()
       && !IVCount->getType()->isPointerTy()) {
 
+    // IVOffset will be the new GEP offset that is interpreted by GEP as a
+    // signed value. IVCount on the other hand represents the loop trip count,
+    // which is an unsigned value. FindLoopCounter only allows induction
+    // variables that have a positive unit stride of one. This means we don't
+    // have to handle the case of negative offsets (yet) and just need to zero
+    // extend IVCount.
     Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
-    const SCEV *IVOffset = SE->getTruncateOrSignExtend(IVCount, OfsTy);
+    const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy);
 
     // Expand the code for the iteration count.
     assert(SE->isLoopInvariant(IVOffset, L) &&
@@ -1492,7 +1499,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
     assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter");
     // We could handle pointer IVs other than i8*, but we need to compensate for
     // gep index scaling. See canExpandBackedgeTakenCount comments.
-    assert(SE->getSizeOfExpr(
+    assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
              cast<PointerType>(GEPBase->getType())->getElementType())->isOne()
            && "unit stride pointer IV must be i8*");
 
@@ -1506,9 +1513,10 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
     // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
     //
     // Valid Cases: (1) both integers is most common; (2) both may be pointers
-    // for simple memset-style loops; (3) IVInit is an integer and IVCount is a
-    // pointer may occur when enable-iv-rewrite generates a canonical IV on top
-    // of case #2.
+    // for simple memset-style loops.
+    //
+    // IVInit integer and IVCount pointer would only occur if a canonical IV
+    // were generated on top of case #2, which is not expected.
 
     const SCEV *IVLimit = 0;
     // For unit stride, IVCount = Start + BECount with 2's complement overflow.
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 0b8906d..b3ec2fc 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -827,7 +827,6 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
   return false;
 }
 
-
 /// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
 /// load instruction, eliminate it by replacing it with a PHI node.  This is an
 /// important optimization that encourages jump threading, and needs to be run
@@ -842,6 +841,12 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
   if (LoadBB->getSinglePredecessor())
     return false;
 
+  // If the load is defined in a landing pad, it can't be partially redundant,
+  // because the edges between the invoke and the landing pad cannot have other
+  // instructions between them.
+  if (LoadBB->isLandingPad())
+    return false;
+
   Value *LoadedPtr = LI->getOperand(0);
 
   // If the loaded operand is defined in the LoadBB, it can't be available.
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 07d991b..952b76b 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -314,7 +314,7 @@ bool NclPopcountRecognize::preliminaryScreen() {
   if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
     return false;
 
-  // Counting population are usually conducted by few arithmetic instrutions.
+  // Counting population are usually conducted by few arithmetic instructions.
   // Such instructions can be easilly "absorbed" by vacant slots in a
   // non-compact loop. Therefore, recognizing popcount idiom only makes sense
   // in a compact loop.
@@ -953,6 +953,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
   Value *SplatValue = isBytewiseValue(StoredVal);
   Constant *PatternValue = 0;
 
+  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
+
   // If we're allowed to form a memset, and the stored value would be acceptable
   // for memset, use it.
   if (SplatValue && TLI->has(LibFunc::memset) &&
@@ -961,8 +963,10 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
       CurLoop->isLoopInvariant(SplatValue)) {
     // Keep and use SplatValue.
     PatternValue = 0;
-  } else if (TLI->has(LibFunc::memset_pattern16) &&
+  } else if (DestAS == 0 &&
+             TLI->has(LibFunc::memset_pattern16) &&
              (PatternValue = getMemSetPatternValue(StoredVal, *TD))) {
+    // Don't create memset_pattern16s with address spaces.
     // It looks like we can use PatternValue!
     SplatValue = 0;
   } else {
@@ -978,20 +982,20 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
   IRBuilder<> Builder(Preheader->getTerminator());
   SCEVExpander Expander(*SE, "loop-idiom");
 
+  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
+
   // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
   // this into a memset in the loop preheader now if we want.  However, this
   // would be unsafe to do if there is anything else in the loop that may read
   // or write to the aliased location.  Check for any overlap by generating the
   // base pointer and checking the region.
-  unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace();
   Value *BasePtr =
-    Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace),
+    Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy,
                            Preheader->getTerminator());
 
-
   if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef,
                             CurLoop, BECount,
-                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
+                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
     deleteIfDeadInstruction(BasePtr, *SE, TLI);
@@ -1002,27 +1006,35 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
-  Type *IntPtr = TD->getIntPtrType(DestPtr->getContext());
+  Type *IntPtr = Builder.getIntPtrTy(TD, DestAS);
   BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
 
   const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
                                          SCEV::FlagNUW);
-  if (StoreSize != 1)
+  if (StoreSize != 1) {
     NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
                                SCEV::FlagNUW);
+  }
 
   Value *NumBytes =
     Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
 
   CallInst *NewCall;
-  if (SplatValue)
-    NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment);
-  else {
+  if (SplatValue) {
+    NewCall = Builder.CreateMemSet(BasePtr,
+                                   SplatValue,
+                                   NumBytes,
+                                   StoreAlignment);
+  } else {
+    // Everything is emitted in default address space
+    Type *Int8PtrTy = DestInt8PtrTy;
+
     Module *M = TheStore->getParent()->getParent()->getParent();
     Value *MSP = M->getOrInsertFunction("memset_pattern16",
                                         Builder.getVoidTy(),
-                                        Builder.getInt8PtrTy(),
-                                        Builder.getInt8PtrTy(), IntPtr,
+                                        Int8PtrTy,
+                                        Int8PtrTy,
+                                        IntPtr,
                                         (void*)0);
 
     // Otherwise we should form a memset_pattern16.  PatternValue is known to be
@@ -1032,7 +1044,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                                             PatternValue, ".memset_pattern");
     GV->setUnnamedAddr(true); // Ok to merge these.
     GV->setAlignment(16);
-    Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy());
+    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
     NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes);
   }
 
@@ -1108,17 +1120,17 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
 
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
-  Type *IntPtr = TD->getIntPtrType(SI->getContext());
-  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+  Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace());
+  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
 
-  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
+  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1),
                                          SCEV::FlagNUW);
   if (StoreSize != 1)
-    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
                                SCEV::FlagNUW);
 
   Value *NumBytes =
-    Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+    Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
 
   CallInst *NewCall =
     Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
new file mode 100644
index 0000000..335af81
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -0,0 +1,1184 @@
+//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop reroller.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-reroll"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+STATISTIC(NumRerolledLoops, "Number of rerolled loops");
+
+static cl::opt<unsigned>
+MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
+  cl::desc("The maximum increment for loop rerolling"));
+
+// This loop re-rolling transformation aims to transform loops like this:
+//
+// int foo(int a);
+// void bar(int *x) {
+//   for (int i = 0; i < 500; i += 3) {
+//     foo(i);
+//     foo(i+1);
+//     foo(i+2);
+//   }
+// }
+//
+// into a loop like this:
+//
+// void bar(int *x) {
+//   for (int i = 0; i < 500; ++i)
+//     foo(i);
+// }
+//
+// It does this by looking for loops that, besides the latch code, are composed
+// of isomorphic DAGs of instructions, with each DAG rooted at some increment
+// to the induction variable, and where each DAG is isomorphic to the DAG
+// rooted at the induction variable (excepting the sub-DAGs which root the
+// other induction-variable increments). In other words, we're looking for loop
+// bodies of the form:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1                <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2                <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// where each f(i) is a set of instructions that, collectively, are a function
+// only of i (and other loop-invariant values).
+//
+// As a special case, we can also reroll loops like this:
+//
+// int foo(int);
+// void bar(int *x) {
+//   for (int i = 0; i < 500; ++i) {
+//     x[3*i] = foo(0);
+//     x[3*i+1] = foo(0);
+//     x[3*i+2] = foo(0);
+//   }
+// }
+//
+// into this:
+//
+// void bar(int *x) {
+//   for (int i = 0; i < 1500; ++i)
+//     x[i] = foo(0);
+// }
+//
+// in which case, we're looking for inputs like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+
+namespace {
+  class LoopReroll : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopReroll() : LoopPass(ID) {
+      initializeLoopRerollPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequired<DominatorTree>();
+      AU.addPreserved<DominatorTree>();
+      AU.addRequired<ScalarEvolution>();
+      AU.addRequired<TargetLibraryInfo>();
+    }
+
+protected:
+    AliasAnalysis *AA;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    DataLayout *DL;
+    TargetLibraryInfo *TLI;
+    DominatorTree *DT;
+
+    typedef SmallVector<Instruction *, 16> SmallInstructionVector;
+    typedef SmallSet<Instruction *, 16>   SmallInstructionSet;
+
+    // A chain of isomorphic instructions, indentified by a single-use PHI,
+    // representing a reduction. Only the last value may be used outside the
+    // loop.
+    struct SimpleLoopReduction {
+      SimpleLoopReduction(Instruction *P, Loop *L)
+        : Valid(false), Instructions(1, P) {
+        assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
+        add(L);
+      }
+
+      bool valid() const {
+        return Valid;
+      }
+
+      Instruction *getPHI() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.front();
+      }
+
+      Instruction *getReducedValue() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.back();
+      }
+
+      Instruction *get(size_t i) const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions[i+1];
+      }
+
+      Instruction *operator [] (size_t i) const { return get(i); }
+
+      // The size, ignoring the initial PHI.
+      size_t size() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.size()-1;
+      }
+
+      typedef SmallInstructionVector::iterator iterator;
+      typedef SmallInstructionVector::const_iterator const_iterator;
+
+      iterator begin() {
+        assert(Valid && "Using invalid reduction");
+        return llvm::next(Instructions.begin());
+      }
+
+      const_iterator begin() const {
+        assert(Valid && "Using invalid reduction");
+        return llvm::next(Instructions.begin());
+      }
+
+      iterator end() { return Instructions.end(); }
+      const_iterator end() const { return Instructions.end(); }
+
+    protected:
+      bool Valid;
+      SmallInstructionVector Instructions;
+
+      void add(Loop *L);
+    };
+
+    // The set of all reductions, and state tracking of possible reductions
+    // during loop instruction processing.
+    struct ReductionTracker {
+      typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
+
+      // Add a new possible reduction.
+      void addSLR(SimpleLoopReduction &SLR) {
+        PossibleReds.push_back(SLR);
+      }
+
+      // Setup to track possible reductions corresponding to the provided
+      // rerolling scale. Only reductions with a number of non-PHI instructions
+      // that is divisible by the scale are considered. Three instructions sets
+      // are filled in:
+      //   - A set of all possible instructions in eligible reductions.
+      //   - A set of all PHIs in eligible reductions
+      //   - A set of all reduced values (last instructions) in eligible reductions.
+      void restrictToScale(uint64_t Scale,
+                           SmallInstructionSet &PossibleRedSet,
+                           SmallInstructionSet &PossibleRedPHISet,
+                           SmallInstructionSet &PossibleRedLastSet) {
+        PossibleRedIdx.clear();
+        PossibleRedIter.clear();
+        Reds.clear();
+
+        for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
+          if (PossibleReds[i].size() % Scale == 0) {
+            PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
+            PossibleRedPHISet.insert(PossibleReds[i].getPHI());
+      
+            PossibleRedSet.insert(PossibleReds[i].getPHI());
+            PossibleRedIdx[PossibleReds[i].getPHI()] = i;
+            for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
+                 JE = PossibleReds[i].end(); J != JE; ++J) {
+              PossibleRedSet.insert(*J);
+              PossibleRedIdx[*J] = i;
+            }
+          }
+      }
+
+      // The functions below are used while processing the loop instructions.
+
+      // Are the two instructions both from reductions, and furthermore, from
+      // the same reduction?
+      bool isPairInSame(Instruction *J1, Instruction *J2) {
+        DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
+        if (J1I != PossibleRedIdx.end()) {
+          DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
+          if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
+            return true;
+        }
+
+        return false;
+      }
+
+      // The two provided instructions, the first from the base iteration, and
+      // the second from iteration i, form a matched pair. If these are part of
+      // a reduction, record that fact.
+      void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
+        if (PossibleRedIdx.count(J1)) {
+          assert(PossibleRedIdx.count(J2) &&
+                 "Recording reduction vs. non-reduction instruction?");
+
+          PossibleRedIter[J1] = 0;
+          PossibleRedIter[J2] = i;
+
+          int Idx = PossibleRedIdx[J1];
+          assert(Idx == PossibleRedIdx[J2] &&
+                 "Recording pair from different reductions?");
+          Reds.insert(Idx);
+        }
+      }
+
+      // The functions below can be called after we've finished processing all
+      // instructions in the loop, and we know which reductions were selected.
+
+      // Is the provided instruction the PHI of a reduction selected for
+      // rerolling?
+      bool isSelectedPHI(Instruction *J) {
+        if (!isa<PHINode>(J))
+          return false;
+
+        for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+             RI != RIE; ++RI) {
+          int i = *RI;
+          if (cast<Instruction>(J) == PossibleReds[i].getPHI())
+            return true;
+        }
+
+        return false;
+      }
+
+      bool validateSelected();
+      void replaceSelected();
+
+    protected:
+      // The vector of all possible reductions (for any scale).
+      SmallReductionVector PossibleReds;
+
+      DenseMap<Instruction *, int> PossibleRedIdx;
+      DenseMap<Instruction *, int> PossibleRedIter;
+      DenseSet<int> Reds;
+    };
+
+    void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
+    void collectPossibleReductions(Loop *L,
+           ReductionTracker &Reductions);
+    void collectInLoopUserSet(Loop *L,
+           const SmallInstructionVector &Roots,
+           const SmallInstructionSet &Exclude,
+           const SmallInstructionSet &Final,
+           DenseSet<Instruction *> &Users);
+    void collectInLoopUserSet(Loop *L,
+           Instruction * Root,
+           const SmallInstructionSet &Exclude,
+           const SmallInstructionSet &Final,
+           DenseSet<Instruction *> &Users);
+    bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
+                          Instruction *&IV,
+                          SmallInstructionVector &LoopIncs);
+    bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV,
+                         SmallVector<SmallInstructionVector, 32> &Roots,
+                         SmallInstructionSet &AllRoots,
+                         SmallInstructionVector &LoopIncs);
+    bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
+                ReductionTracker &Reductions);
+  };
+}
+
+char LoopReroll::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+
+Pass *llvm::createLoopRerollPass() {
+  return new LoopReroll;
+}
+
+// Returns true if the provided instruction is used outside the given loop.
+// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
+// non-loop blocks to be outside the loop.
+static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
+  for (Value::use_iterator UI = I->use_begin(),
+       UIE = I->use_end(); UI != UIE; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    if (!L->contains(User))
+      return true;
+  }
+
+  return false;
+}
+
+// Collect the list of loop induction variables with respect to which it might
+// be possible to reroll the loop.
+void LoopReroll::collectPossibleIVs(Loop *L,
+                                    SmallInstructionVector &PossibleIVs) {
+  BasicBlock *Header = L->getHeader();
+  for (BasicBlock::iterator I = Header->begin(),
+       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+    if (!isa<PHINode>(I))
+      continue;
+    if (!I->getType()->isIntegerTy())
+      continue;
+
+    if (const SCEVAddRecExpr *PHISCEV =
+        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) {
+      if (PHISCEV->getLoop() != L)
+        continue;
+      if (!PHISCEV->isAffine())
+        continue;
+      if (const SCEVConstant *IncSCEV =
+          dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) {
+        if (!IncSCEV->getValue()->getValue().isStrictlyPositive())
+          continue;
+        if (IncSCEV->getValue()->uge(MaxInc))
+          continue;
+
+        DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " <<
+              *PHISCEV << "\n");
+        PossibleIVs.push_back(I);
+      }
+    }
+  }
+}
+
+// Add the remainder of the reduction-variable chain to the instruction vector
+// (the initial PHINode has already been added). If successful, the object is
+// marked as valid.
+void LoopReroll::SimpleLoopReduction::add(Loop *L) {
+  assert(!Valid && "Cannot add to an already-valid chain");
+
+  // The reduction variable must be a chain of single-use instructions
+  // (including the PHI), except for the last value (which is used by the PHI
+  // and also outside the loop).
+  Instruction *C = Instructions.front();
+
+  do {
+    C = cast<Instruction>(*C->use_begin());
+    if (C->hasOneUse()) {
+      if (!C->isBinaryOp())
+        return;
+
+      if (!(isa<PHINode>(Instructions.back()) ||
+            C->isSameOperationAs(Instructions.back())))
+        return;
+
+      Instructions.push_back(C);
+    }
+  } while (C->hasOneUse());
+
+  if (Instructions.size() < 2 ||
+      !C->isSameOperationAs(Instructions.back()) ||
+      C->use_begin() == C->use_end())
+    return;
+
+  // C is now the (potential) last instruction in the reduction chain.
+  for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end();
+       UI != UIE; ++UI) {
+    // The only in-loop user can be the initial PHI.
+    if (L->contains(cast<Instruction>(*UI)))
+      if (cast<Instruction>(*UI ) != Instructions.front())
+        return;
+  }
+
+  Instructions.push_back(C);
+  Valid = true;
+}
+
+// Collect the vector of possible reduction variables.
+void LoopReroll::collectPossibleReductions(Loop *L,
+  ReductionTracker &Reductions) {
+  BasicBlock *Header = L->getHeader();
+  for (BasicBlock::iterator I = Header->begin(),
+       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+    if (!isa<PHINode>(I))
+      continue;
+    if (!I->getType()->isSingleValueType())
+      continue;
+
+    SimpleLoopReduction SLR(I, L);
+    if (!SLR.valid())
+      continue;
+
+    DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
+          SLR.size() << " chained instructions)\n");
+    Reductions.addSLR(SLR);
+  }
+}
+
+// Collect the set of all users of the provided root instruction. This set of
+// users contains not only the direct users of the root instruction, but also
+// all users of those users, and so on. There are two exceptions:
+//
+//   1. Instructions in the set of excluded instructions are never added to the
+//   use set (even if they are users). This is used, for example, to exclude
+//   including root increments in the use set of the primary IV.
+//
+//   2. Instructions in the set of final instructions are added to the use set
+//   if they are users, but their users are not added. This is used, for
+//   example, to prevent a reduction update from forcing all later reduction
+//   updates into the use set.
+void LoopReroll::collectInLoopUserSet(Loop *L,
+  Instruction *Root, const SmallInstructionSet &Exclude,
+  const SmallInstructionSet &Final,
+  DenseSet<Instruction *> &Users) {
+  SmallInstructionVector Queue(1, Root);
+  while (!Queue.empty()) {
+    Instruction *I = Queue.pop_back_val();
+    if (!Users.insert(I).second)
+      continue;
+
+    if (!Final.count(I))
+      for (Value::use_iterator UI = I->use_begin(),
+           UIE = I->use_end(); UI != UIE; ++UI) {
+        Instruction *User = cast<Instruction>(*UI);
+        if (PHINode *PN = dyn_cast<PHINode>(User)) {
+          // Ignore "wrap-around" uses to PHIs of this loop's header.
+          if (PN->getIncomingBlock(UI) == L->getHeader())
+            continue;
+        }
+  
+        if (L->contains(User) && !Exclude.count(User)) {
+          Queue.push_back(User);
+        }
+      }
+
+    // We also want to collect single-user "feeder" values.
+    for (User::op_iterator OI = I->op_begin(),
+         OIE = I->op_end(); OI != OIE; ++OI) {
+      if (Instruction *Op = dyn_cast<Instruction>(*OI))
+        if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
+            !Final.count(Op))
+          Queue.push_back(Op);
+    }
+  }
+}
+
+// Collect all of the users of all of the provided root instructions (combined
+// into a single set).
+void LoopReroll::collectInLoopUserSet(Loop *L,
+  const SmallInstructionVector &Roots,
+  const SmallInstructionSet &Exclude,
+  const SmallInstructionSet &Final,
+  DenseSet<Instruction *> &Users) {
+  for (SmallInstructionVector::const_iterator I = Roots.begin(),
+       IE = Roots.end(); I != IE; ++I)
+    collectInLoopUserSet(L, *I, Exclude, Final, Users);
+}
+
+static bool isSimpleLoadStore(Instruction *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isSimple();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isSimple();
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+    return !MI->isVolatile();
+  return false;
+}
+
+// Recognize loops that are setup like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs.
+bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
+                                  Instruction *&IV,
+                                  SmallInstructionVector &LoopIncs) {
+  // This is a special case: here we're looking for all uses (except for
+  // the increment) to be multiplied by a common factor. The increment must
+  // be by one. This is to capture loops like:
+  //   for (int i = 0; i < 500; ++i) {
+  //     foo(3*i); foo(3*i+1); foo(3*i+2);
+  //   }
+  if (RealIV->getNumUses() != 2)
+    return false;
+  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV));
+  Instruction *User1 = cast<Instruction>(*RealIV->use_begin()),
+              *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin()));
+  if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType()))
+    return false;
+  const SCEVAddRecExpr *User1SCEV =
+                         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)),
+                       *User2SCEV =
+                         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2));
+  if (!User1SCEV || !User1SCEV->isAffine() ||
+      !User2SCEV || !User2SCEV->isAffine())
+    return false;
+
+  // We assume below that User1 is the scale multiply and User2 is the
+  // increment. If this can't be true, then swap them.
+  if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) {
+    std::swap(User1, User2);
+    std::swap(User1SCEV, User2SCEV);
+  }
+
+  if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE))
+    return false;
+  assert(User2SCEV->getStepRecurrence(*SE)->isOne() &&
+         "Invalid non-unit step for multiplicative scaling");
+  LoopIncs.push_back(User2);
+
+  if (const SCEVConstant *MulScale =
+      dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) {
+    // Make sure that both the start and step have the same multiplier.
+    if (RealIVSCEV->getStart()->getType() != MulScale->getType())
+      return false;
+    if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) !=
+        User1SCEV->getStart())
+      return false;
+
+    ConstantInt *MulScaleCI = MulScale->getValue();
+    if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc))
+      return false;
+    Scale = MulScaleCI->getZExtValue();
+    IV = User1;
+  } else
+    return false;
+
+  DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n");
+  return true;
+}
+
+// Collect all root increments with respect to the provided induction variable
+// (normally the PHI, but sometimes a multiply). A root increment is an
+// instruction, normally an add, with a positive constant less than Scale. In a
+// rerollable loop, each of these increments is the root of an instruction
+// graph isomorphic to the others. Also, we collect the final induction
+// increment (the increment equal to the Scale), and its users in LoopIncs.
+bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale,
+                                 Instruction *IV,
+                                 SmallVector<SmallInstructionVector, 32> &Roots,
+                                 SmallInstructionSet &AllRoots,
+                                 SmallInstructionVector &LoopIncs) {
+  for (Value::use_iterator UI = IV->use_begin(),
+       UIE = IV->use_end(); UI != UIE; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    if (!SE->isSCEVable(User->getType()))
+      continue;
+    if (User->getType() != IV->getType())
+      continue;
+    if (!L->contains(User))
+      continue;
+    if (hasUsesOutsideLoop(User, L))
+      continue;
+
+    if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV(
+          SE->getSCEV(User), SE->getSCEV(IV)))) {
+      uint64_t Idx = Diff->getValue()->getValue().getZExtValue();
+      if (Idx > 0 && Idx < Scale) {
+        Roots[Idx-1].push_back(User);
+        AllRoots.insert(User);
+      } else if (Idx == Scale && Inc > 1) {
+        LoopIncs.push_back(User);
+      }
+    }
+  }
+
+  if (Roots[0].empty())
+    return false;
+  bool AllSame = true;
+  for (unsigned i = 1; i < Scale-1; ++i)
+    if (Roots[i].size() != Roots[0].size()) {
+      AllSame = false;
+      break;
+    }
+
+  if (!AllSame)
+    return false;
+
+  return true;
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+bool LoopReroll::ReductionTracker::validateSelected() {
+  // For a non-associative reduction, the chain entries must appear in order.
+  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+       RI != RIE; ++RI) {
+    int i = *RI;
+    int PrevIter = 0, BaseCount = 0, Count = 0;
+    for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
+         JE = PossibleReds[i].end(); J != JE; ++J) {
+	// Note that all instructions in the chain must have been found because
+	// all instructions in the function must have been assigned to some
+	// iteration.
+      int Iter = PossibleRedIter[*J];
+      if (Iter != PrevIter && Iter != PrevIter + 1 &&
+          !PossibleReds[i].getReducedValue()->isAssociative()) {
+        DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
+                        *J << "\n");
+        return false;
+      }
+
+      if (Iter != PrevIter) {
+        if (Count != BaseCount) {
+          DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
+                " reduction use count " << Count <<
+                " is not equal to the base use count " <<
+                BaseCount << "\n");
+          return false;
+        }
+
+        Count = 0;
+      }
+
+      ++Count;
+      if (Iter == 0)
+        ++BaseCount;
+
+      PrevIter = Iter;
+    }
+  }
+
+  return true;
+}
+
+// For all selected reductions, remove all parts except those in the first
+// iteration (and the PHI). Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+  // Fixup reductions to refer to the last instruction associated with the
+  // first iteration (not the last).
+  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+       RI != RIE; ++RI) {
+    int i = *RI;
+    int j = 0;
+    for (int e = PossibleReds[i].size(); j != e; ++j)
+      if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+        --j;
+        break;
+      }
+
+    // Replace users with the new end-of-chain value.
+    SmallInstructionVector Users;
+    for (Value::use_iterator UI =
+           PossibleReds[i].getReducedValue()->use_begin(),
+         UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI)
+      Users.push_back(cast<Instruction>(*UI));
+
+    for (SmallInstructionVector::iterator J = Users.begin(),
+         JE = Users.end(); J != JE; ++J)
+      (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+                              PossibleReds[i][j]);
+  }
+}
+
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1                <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2                <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with eachother. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+//   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+//     the next unmatched instruction in f(%iv.(i+1)).
+//   - Ensure that both matched instructions don't have any external users
+//     (with the exception of last-in-chain reduction instructions).
+//   - Track the (aliasing) write set, and other side effects, of all
+//     instructions that belong to future iterations that come before the matched
+//     instructions. If the matched instructions read from that write set, then
+//     f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+//     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+//     if any of these future instructions had side effects (could not be
+//     speculatively executed), and so do the matched instructions, when we
+//     cannot reorder those side-effect-producing instructions, and rerolling
+//     fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, belong to simple latch code, parts of validated reductions, part of
+// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+                        const SCEV *IterCount,
+                        ReductionTracker &Reductions) {
+  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV));
+  uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))->
+                   getValue()->getZExtValue();
+  // The collection of loop increment instructions.
+  SmallInstructionVector LoopIncs;
+  uint64_t Scale = Inc;
+
+  // The effective induction variable, IV, is normally also the real induction
+  // variable. When we're dealing with a loop like:
+  //   for (int i = 0; i < 500; ++i)
+  //     x[3*i] = ...;
+  //     x[3*i+1] = ...;
+  //     x[3*i+2] = ...;
+  // then the real IV is still i, but the effective IV is (3*i).
+  Instruction *RealIV = IV;
+  if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs))
+    return false;
+
+  assert(Scale <= MaxInc && "Scale is too large");
+  assert(Scale > 1 && "Scale must be at least 2");
+
+  // The set of increment instructions for each increment value.
+  SmallVector<SmallInstructionVector, 32> Roots(Scale-1);
+  SmallInstructionSet AllRoots;
+  if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs))
+    return false;
+
+  DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+                  *RealIV << "\n");
+
+  // An array of just the possible reductions for this scale factor. When we
+  // collect the set of all users of some root instructions, these reduction
+  // instructions are treated as 'final' (their uses are not considered).
+  // This is important because we don't want the root use set to search down
+  // the reduction chain.
+  SmallInstructionSet PossibleRedSet;
+  SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet;
+  Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet,
+                             PossibleRedLastSet);
+
+  // We now need to check for equivalence of the use graph of each root with
+  // that of the primary induction variable (excluding the roots). Our goal
+  // here is not to solve the full graph isomorphism problem, but rather to
+  // catch common cases without a lot of work. As a result, we will assume
+  // that the relative order of the instructions in each unrolled iteration
+  // is the same (although we will not make an assumption about how the
+  // different iterations are intermixed). Note that while the order must be
+  // the same, the instructions may not be in the same basic block.
+  SmallInstructionSet Exclude(AllRoots);
+  Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+  DenseSet<Instruction *> BaseUseSet;
+  collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet);
+
+  DenseSet<Instruction *> AllRootUses;
+  std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1);
+
+  bool MatchFailed = false;
+  for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) {
+    DenseSet<Instruction *> &RootUseSet = RootUseSets[i];
+    collectInLoopUserSet(L, Roots[i], SmallInstructionSet(),
+                         PossibleRedSet, RootUseSet);
+
+    DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() <<
+                    " vs. iteration increment " << (i+1) <<
+                    " use set size: " << RootUseSet.size() << "\n");
+
+    if (BaseUseSet.size() != RootUseSet.size()) {
+      MatchFailed = true;
+      break;
+    }
+
+    // In addition to regular aliasing information, we need to look for
+    // instructions from later (future) iterations that have side effects
+    // preventing us from reordering them past other instructions with side
+    // effects.
+    bool FutureSideEffects = false;
+    AliasSetTracker AST(*AA);
+
+    // The map between instructions in f(%iv.(i+1)) and f(%iv).
+    DenseMap<Value *, Value *> BaseMap;
+
+    assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops");
+    for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(),
+         JE = Header->end(); J1 != JE && !MatchFailed; ++J1) {
+      if (cast<Instruction>(J1) == RealIV)
+        continue;
+      if (cast<Instruction>(J1) == IV)
+        continue;
+      if (!BaseUseSet.count(J1))
+        continue;
+      if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs.
+        continue;
+
+      while (J2 != JE && (!RootUseSet.count(J2) ||
+             std::find(Roots[i].begin(), Roots[i].end(), J2) !=
+               Roots[i].end())) {
+        // As we iterate through the instructions, instructions that don't
+        // belong to previous iterations (or the base case), must belong to
+        // future iterations. We want to track the alias set of writes from
+        // previous iterations.
+        if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) &&
+            !AllRootUses.count(J2)) {
+          if (J2->mayWriteToMemory())
+            AST.add(J2);
+
+          // Note: This is specifically guarded by a check on isa<PHINode>,
+          // which while a valid (somewhat arbitrary) micro-optimization, is
+          // needed because otherwise isSafeToSpeculativelyExecute returns
+          // false on PHI nodes.
+          if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
+            FutureSideEffects = true; 
+        }
+
+        ++J2;
+      }
+
+      if (!J1->isSameOperationAs(J2)) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                        " vs. " << *J2 << "\n");
+        MatchFailed = true;
+        break;
+      }
+
+      // Make sure that this instruction, which is in the use set of this
+      // root instruction, does not also belong to the base set or the set of
+      // some previous root instruction.
+      if (BaseUseSet.count(J2) || AllRootUses.count(J2)) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                        " vs. " << *J2 << " (prev. case overlap)\n");
+        MatchFailed = true;
+        break;
+      }
+
+      // Make sure that we don't alias with any instruction in the alias set
+      // tracker. If we do, then we depend on a future iteration, and we
+      // can't reroll.
+      if (J2->mayReadFromMemory()) {
+        for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end();
+             K != KE && !MatchFailed; ++K) {
+          if (K->aliasesUnknownInst(J2, *AA)) {
+            DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                            " vs. " << *J2 << " (depends on future store)\n");
+            MatchFailed = true;
+            break;
+          }
+        }
+      }
+
+      // If we've past an instruction from a future iteration that may have
+      // side effects, and this instruction might also, then we can't reorder
+      // them, and this matching fails. As an exception, we allow the alias
+      // set tracker to handle regular (simple) load/store dependencies.
+      if (FutureSideEffects &&
+            ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) ||
+             (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                        " vs. " << *J2 <<
+                        " (side effects prevent reordering)\n");
+        MatchFailed = true;
+        break;
+      }
+
+      // For instructions that are part of a reduction, if the operation is
+      // associative, then don't bother matching the operands (because we
+      // already know that the instructions are isomorphic, and the order
+      // within the iteration does not matter). For non-associative reductions,
+      // we do need to match the operands, because we need to reject
+      // out-of-order instructions within an iteration!
+      // For example (assume floating-point addition), we need to reject this:
+      //   x += a[i]; x += b[i];
+      //   x += a[i+1]; x += b[i+1];
+      //   x += b[i+2]; x += a[i+2];
+      bool InReduction = Reductions.isPairInSame(J1, J2);
+
+      if (!(InReduction && J1->isAssociative())) {
+        bool Swapped = false, SomeOpMatched = false;;
+        for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
+          Value *Op2 = J2->getOperand(j);
+
+	  // If this is part of a reduction (and the operation is not
+	  // associatve), then we match all operands, but not those that are
+	  // part of the reduction.
+          if (InReduction)
+            if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+              if (Reductions.isPairInSame(J2, Op2I))
+                continue;
+
+          DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+          if (BMI != BaseMap.end())
+            Op2 = BMI->second;
+          else if (std::find(Roots[i].begin(), Roots[i].end(),
+                             (Instruction*) Op2) != Roots[i].end())
+            Op2 = IV;
+
+          if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+	    // If we've not already decided to swap the matched operands, and
+	    // we've not already matched our first operand (note that we could
+	    // have skipped matching the first operand because it is part of a
+	    // reduction above), and the instruction is commutative, then try
+	    // the swapped match.
+            if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
+                J1->getOperand(!j) == Op2) {
+              Swapped = true;
+            } else {
+              DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                              " vs. " << *J2 << " (operand " << j << ")\n");
+              MatchFailed = true;
+              break;
+            }
+          }
+
+          SomeOpMatched = true;
+        }
+      }
+
+      if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) ||
+          (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                        " vs. " << *J2 << " (uses outside loop)\n");
+        MatchFailed = true;
+        break;
+      }
+
+      if (!MatchFailed)
+        BaseMap.insert(std::pair<Value *, Value *>(J2, J1));
+
+      AllRootUses.insert(J2);
+      Reductions.recordPair(J1, J2, i+1);
+
+      ++J2;
+    }
+  }
+
+  if (MatchFailed)
+    return false;
+
+  DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
+                  *RealIV << "\n");
+
+  DenseSet<Instruction *> LoopIncUseSet;
+  collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(),
+                       SmallInstructionSet(), LoopIncUseSet);
+  DEBUG(dbgs() << "LRR: Loop increment set size: " <<
+                  LoopIncUseSet.size() << "\n");
+
+  // Make sure that all instructions in the loop have been included in some
+  // use set.
+  for (BasicBlock::iterator J = Header->begin(), JE = Header->end();
+       J != JE; ++J) {
+    if (isa<DbgInfoIntrinsic>(J))
+      continue;
+    if (cast<Instruction>(J) == RealIV)
+      continue;
+    if (cast<Instruction>(J) == IV)
+      continue;
+    if (BaseUseSet.count(J) || AllRootUses.count(J) ||
+        (LoopIncUseSet.count(J) && (J->isTerminator() ||
+                                    isSafeToSpeculativelyExecute(J, DL))))
+      continue;
+
+    if (AllRoots.count(J))
+      continue;
+
+    if (Reductions.isSelectedPHI(J))
+      continue;
+
+    DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV <<
+                    " unprocessed instruction found: " << *J << "\n");
+    MatchFailed = true;
+    break;
+  }
+
+  if (MatchFailed)
+    return false;
+
+  DEBUG(dbgs() << "LRR: all instructions processed from " <<
+                  *RealIV << "\n");
+
+  if (!Reductions.validateSelected())
+    return false;
+
+  // At this point, we've validated the rerolling, and we're committed to
+  // making changes!
+
+  Reductions.replaceSelected();
+
+  // Remove instructions associated with non-base iterations.
+  for (BasicBlock::reverse_iterator J = Header->rbegin();
+       J != Header->rend();) {
+    if (AllRootUses.count(&*J)) {
+      Instruction *D = &*J;
+      DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
+      D->eraseFromParent();
+      continue;
+    }
+
+    ++J; 
+  }
+
+  // Insert the new induction variable.
+  const SCEV *Start = RealIVSCEV->getStart();
+  if (Inc == 1)
+    Start = SE->getMulExpr(Start,
+                           SE->getConstant(Start->getType(), Scale));
+  const SCEVAddRecExpr *H =
+    cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start,
+                           SE->getConstant(RealIVSCEV->getType(), 1),
+                           L, SCEV::FlagAnyWrap));
+  { // Limit the lifetime of SCEVExpander.
+    SCEVExpander Expander(*SE, "reroll");
+    PHINode *NewIV =
+      cast<PHINode>(Expander.expandCodeFor(H, IV->getType(),
+                                           Header->begin()));
+    for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(),
+         JE = BaseUseSet.end(); J != JE; ++J)
+      (*J)->replaceUsesOfWith(IV, NewIV);
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
+      if (LoopIncUseSet.count(BI)) {
+        const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+        if (Inc == 1)
+          ICSCEV =
+            SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale));
+        Value *IC;
+        if (isa<SCEVConstant>(ICSCEV)) {
+          IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), BI);
+        } else {
+          BasicBlock *Preheader = L->getLoopPreheader();
+          if (!Preheader)
+            Preheader = InsertPreheaderForLoop(L, this);
+
+          IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(),
+                                      Preheader->getTerminator());
+        }
+ 
+        Value *NewIVNext = NewIV->getIncomingValueForBlock(Header); 
+        Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIVNext, IC,
+                                   "exitcond");
+        BI->setCondition(Cond);
+
+        if (BI->getSuccessor(1) != Header)
+          BI->swapSuccessors();
+      }
+    }
+  }
+
+  SimplifyInstructionsInBlock(Header, DL, TLI);
+  DeleteDeadPHIs(Header, TLI);
+  ++NumRerolledLoops;
+  return true;
+}
+
+bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+  AA = &getAnalysis<AliasAnalysis>();
+  LI = &getAnalysis<LoopInfo>();
+  SE = &getAnalysis<ScalarEvolution>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+  DL = getAnalysisIfAvailable<DataLayout>();
+  DT = &getAnalysis<DominatorTree>();
+
+  BasicBlock *Header = L->getHeader();
+  DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
+        "] Loop %" << Header->getName() << " (" <<
+        L->getNumBlocks() << " block(s))\n");
+
+  bool Changed = false;
+
+  // For now, we'll handle only single BB loops.
+  if (L->getNumBlocks() > 1)
+    return Changed;
+
+  if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+    return Changed;
+
+  const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
+  const SCEV *IterCount =
+    SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1));
+  DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
+
+  // First, we need to find the induction variable with respect to which we can
+  // reroll (there may be several possible options).
+  SmallInstructionVector PossibleIVs;
+  collectPossibleIVs(L, PossibleIVs);
+
+  if (PossibleIVs.empty()) {
+    DEBUG(dbgs() << "LRR: No possible IVs found\n");
+    return Changed;
+  }
+
+  ReductionTracker Reductions;
+  collectPossibleReductions(L, Reductions);
+
+  // For each possible IV, collect the associated possible set of 'root' nodes
+  // (i+1, i+2, etc.).
+  for (SmallInstructionVector::iterator I = PossibleIVs.begin(),
+       IE = PossibleIVs.end(); I != IE; ++I)
+    if (reroll(*I, L, Header, IterCount, Reductions)) {
+      Changed = true;
+      break;
+    }
+
+  return Changed;
+}
+
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 14cb979..eff5268 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1170,6 +1170,13 @@ public:
   /// may be used.
   bool AllFixupsOutsideLoop;
 
+  /// RigidFormula is set to true to guarantee that this use will be associated
+  /// with a single formula--the one that initially matched. Some SCEV
+  /// expressions cannot be expanded. This allows LSR to consider the registers
+  /// used by those expressions without the need to expand them later after
+  /// changing the formula.
+  bool RigidFormula;
+
   /// WidestFixupType - This records the widest use type for any fixup using
   /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
   /// max fixup widths to be equivalent, because the narrower one may be relying
@@ -1188,6 +1195,7 @@ public:
                                       MinOffset(INT64_MAX),
                                       MaxOffset(INT64_MIN),
                                       AllFixupsOutsideLoop(true),
+                                      RigidFormula(false),
                                       WidestFixupType(0) {}
 
   bool HasFormulaWithSameRegs(const Formula &F) const;
@@ -1214,6 +1222,9 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
 /// InsertFormula - If the given formula has not yet been inserted, add it to
 /// the list, and return true. Return false otherwise.
 bool LSRUse::InsertFormula(const Formula &F) {
+  if (!Formulae.empty() && RigidFormula)
+    return false;
+
   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
   if (F.ScaledReg) Key.push_back(F.ScaledReg);
   // Unstable sort by host order ok, because this is only used for uniquifying.
@@ -1433,7 +1444,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
   }
   case LSRUse::ICmpZero:
     // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg.
-    // Therefore, return 0 in case F.Scale == -1. 
+    // Therefore, return 0 in case F.Scale == -1.
     return F.Scale != -1;
 
   case LSRUse::Basic:
@@ -2943,7 +2954,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
 
         // x == y  -->  x - y == 0
         const SCEV *N = SE.getSCEV(NV);
-        if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) {
+        if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
           // S is normalized, so normalize N before folding it into S
           // to keep the result normalized.
           N = TransformForPostIncUse(Normalize, N, CI, 0,
@@ -2986,6 +2997,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
 /// and loop-computable portions.
 void
 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
+  // Mark uses whose expressions cannot be expanded.
+  if (!isSafeToExpand(S, SE))
+    LU.RigidFormula = true;
+
   Formula F;
   F.InitialMatch(S, L, SE);
   bool Inserted = InsertFormula(LU, LUIdx, F);
@@ -4353,6 +4368,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
                            SCEVExpander &Rewriter,
                            SmallVectorImpl<WeakVH> &DeadInsts) const {
   const LSRUse &LU = Uses[LF.LUIdx];
+  if (LU.RigidFormula)
+    return LF.OperandValToReplace;
 
   // Determine an input position which will be dominated by the operands and
   // which will dominate the result.
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 80d060b..08ac38d 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -49,12 +49,17 @@ namespace {
   class LoopUnroll : public LoopPass {
   public:
     static char ID; // Pass ID, replacement for typeid
-    LoopUnroll(int T = -1, int C = -1,  int P = -1) : LoopPass(ID) {
+    LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) {
       CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T);
       CurrentCount = (C == -1) ? UnrollCount : unsigned(C);
       CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;
+      CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R;
 
       UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0);
+      UserAllowPartial = (P != -1) ||
+                         (UnrollAllowPartial.getNumOccurrences() > 0);
+      UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0);
+      UserCount = (C != -1) || (UnrollCount.getNumOccurrences() > 0);
 
       initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
     }
@@ -75,7 +80,11 @@ namespace {
     unsigned CurrentCount;
     unsigned CurrentThreshold;
     bool     CurrentAllowPartial;
+    bool     CurrentRuntime;
+    bool     UserCount;            // CurrentCount is user-specified.
     bool     UserThreshold;        // CurrentThreshold is user-specified.
+    bool     UserAllowPartial;     // CurrentAllowPartial is user-specified.
+    bool     UserRuntime;          // CurrentRuntime is user-specified.
 
     bool runOnLoop(Loop *L, LPPassManager &LPM);
 
@@ -110,8 +119,9 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
 
-Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) {
-  return new LoopUnroll(Threshold, Count, AllowPartial);
+Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
+                                 int Runtime) {
+  return new LoopUnroll(Threshold, Count, AllowPartial, Runtime);
 }
 
 /// ApproximateLoopSize - Approximate the size of the loop.
@@ -145,16 +155,24 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
         << "] Loop %" << Header->getName() << "\n");
   (void)Header;
 
+  TargetTransformInfo::UnrollingPreferences UP;
+  UP.Threshold = CurrentThreshold;
+  UP.OptSizeThreshold = OptSizeUnrollThreshold;
+  UP.Count = CurrentCount;
+  UP.Partial = CurrentAllowPartial;
+  UP.Runtime = CurrentRuntime;
+  TTI.getUnrollingPreferences(L, UP);
+
   // Determine the current unrolling threshold.  While this is normally set
   // from UnrollThreshold, it is overridden to a smaller value if the current
   // function is marked as optimize-for-size, and the unroll threshold was
   // not user specified.
-  unsigned Threshold = CurrentThreshold;
+  unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
   if (!UserThreshold &&
       Header->getParent()->getAttributes().
         hasAttribute(AttributeSet::FunctionIndex,
                      Attribute::OptimizeForSize))
-    Threshold = OptSizeUnrollThreshold;
+    Threshold = UP.OptSizeThreshold;
 
   // Find trip count and trip multiple if count is not available
   unsigned TripCount = 0;
@@ -167,11 +185,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
     TripCount = SE->getSmallConstantTripCount(L, LatchBlock);
     TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock);
   }
+
+  bool Runtime = UserRuntime ? CurrentRuntime : UP.Runtime;
+
   // Use a default unroll-count if the user doesn't specify a value
   // and the trip count is a run-time value.  The default is different
   // for run-time or compile-time trip count loops.
-  unsigned Count = CurrentCount;
-  if (UnrollRuntime && CurrentCount == 0 && TripCount == 0)
+  unsigned Count = UserCount ? CurrentCount : UP.Count;
+  if (Runtime && Count == 0 && TripCount == 0)
     Count = UnrollRuntimeCount;
 
   if (Count == 0) {
@@ -204,7 +225,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
     if (TripCount != 1 && Size > Threshold) {
       DEBUG(dbgs() << "  Too large to fully unroll with count: " << Count
             << " because size: " << Size << ">" << Threshold << "\n");
-      if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) {
+      bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial;
+      if (!AllowPartial && !(Runtime && TripCount == 0)) {
         DEBUG(dbgs() << "  will not try to unroll partially because "
               << "-unroll-allow-partial not given\n");
         return false;
@@ -215,7 +237,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
         while (Count != 0 && TripCount%Count != 0)
           Count--;
       }
-      else if (UnrollRuntime) {
+      else if (Runtime) {
         // Reduce unroll count to be a lower power-of-two value
         while (Count != 0 && Size > Threshold) {
           Count >>= 1;
@@ -231,7 +253,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   }
 
   // Unroll the loop.
-  if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM))
+  if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, &LPM))
     return false;
 
   return true;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 59aff31..c4ebfd5 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -212,8 +212,6 @@ namespace {
                                         Instruction *InsertPt);
 
     void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
-    void RemoveBlockIfDead(BasicBlock *BB,
-                           std::vector<Instruction*> &Worklist, Loop *l);
     void RemoveLoopFromHierarchy(Loop *L);
     bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0,
                                     BasicBlock **LoopExit = 0);
@@ -946,114 +944,6 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V,
   ++NumSimplify;
 }
 
-/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop
-/// information, and remove any dead successors it has.
-///
-void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB,
-                                     std::vector<Instruction*> &Worklist,
-                                     Loop *L) {
-  if (pred_begin(BB) != pred_end(BB)) {
-    // This block isn't dead, since an edge to BB was just removed, see if there
-    // are any easy simplifications we can do now.
-    if (BasicBlock *Pred = BB->getSinglePredecessor()) {
-      // If it has one pred, fold phi nodes in BB.
-      while (PHINode *PN = dyn_cast<PHINode>(BB->begin()))
-        ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM);
-
-      // If this is the header of a loop and the only pred is the latch, we now
-      // have an unreachable loop.
-      if (Loop *L = LI->getLoopFor(BB))
-        if (loopHeader == BB && L->contains(Pred)) {
-          // Remove the branch from the latch to the header block, this makes
-          // the header dead, which will make the latch dead (because the header
-          // dominates the latch).
-          LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L);
-          Pred->getTerminator()->eraseFromParent();
-          new UnreachableInst(BB->getContext(), Pred);
-
-          // The loop is now broken, remove it from LI.
-          RemoveLoopFromHierarchy(L);
-
-          // Reprocess the header, which now IS dead.
-          RemoveBlockIfDead(BB, Worklist, L);
-          return;
-        }
-
-      // If pred ends in a uncond branch, add uncond branch to worklist so that
-      // the two blocks will get merged.
-      if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
-        if (BI->isUnconditional())
-          Worklist.push_back(BI);
-    }
-    return;
-  }
-
-  DEBUG(dbgs() << "Nuking dead block: " << *BB);
-
-  // Remove the instructions in the basic block from the worklist.
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-    RemoveFromWorklist(I, Worklist);
-
-    // Anything that uses the instructions in this basic block should have their
-    // uses replaced with undefs.
-    // If I is not void type then replaceAllUsesWith undef.
-    // This allows ValueHandlers and custom metadata to adjust itself.
-    if (!I->getType()->isVoidTy())
-      I->replaceAllUsesWith(UndefValue::get(I->getType()));
-  }
-
-  // If this is the edge to the header block for a loop, remove the loop and
-  // promote all subloops.
-  if (Loop *BBLoop = LI->getLoopFor(BB)) {
-    if (BBLoop->getLoopLatch() == BB) {
-      RemoveLoopFromHierarchy(BBLoop);
-      if (currentLoop == BBLoop) {
-        currentLoop = 0;
-        redoLoop = false;
-      }
-    }
-  }
-
-  // Remove the block from the loop info, which removes it from any loops it
-  // was in.
-  LI->removeBlock(BB);
-
-  // Remove phi node entries in successors for this block.
-  TerminatorInst *TI = BB->getTerminator();
-  SmallVector<BasicBlock*, 4> Succs;
-  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
-    Succs.push_back(TI->getSuccessor(i));
-    TI->getSuccessor(i)->removePredecessor(BB);
-  }
-
-  // Unique the successors, remove anything with multiple uses.
-  array_pod_sort(Succs.begin(), Succs.end());
-  Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end());
-
-  // Remove the basic block, including all of the instructions contained in it.
-  LPM->deleteSimpleAnalysisValue(BB, L);
-  BB->eraseFromParent();
-  // Remove successor blocks here that are not dead, so that we know we only
-  // have dead blocks in this list.  Nondead blocks have a way of becoming dead,
-  // then getting removed before we revisit them, which is badness.
-  //
-  for (unsigned i = 0; i != Succs.size(); ++i)
-    if (pred_begin(Succs[i]) != pred_end(Succs[i])) {
-      // One exception is loop headers.  If this block was the preheader for a
-      // loop, then we DO want to visit the loop so the loop gets deleted.
-      // We know that if the successor is a loop header, that this loop had to
-      // be the preheader: the case where this was the latch block was handled
-      // above and headers can only have two predecessors.
-      if (!LI->isLoopHeader(Succs[i])) {
-        Succs.erase(Succs.begin()+i);
-        --i;
-      }
-    }
-
-  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
-    RemoveBlockIfDead(Succs[i], Worklist, L);
-}
-
 /// RemoveLoopFromHierarchy - We have discovered that the specified loop has
 /// become unwrapped, either because the backedge was deleted, or because the
 /// edge into the header was removed.  If the edge into the header from the
@@ -1262,23 +1152,6 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
         continue;
       }
 
-      if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){
-        // Conditional branch.  Turn it into an unconditional branch, then
-        // remove dead blocks.
-        continue;  // FIXME: Enable.
-
-        DEBUG(dbgs() << "Folded branch: " << *BI);
-        BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue());
-        BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue());
-        DeadSucc->removePredecessor(BI->getParent(), true);
-        Worklist.push_back(BranchInst::Create(LiveSucc, BI));
-        LPM->deleteSimpleAnalysisValue(BI, L);
-        BI->eraseFromParent();
-        RemoveFromWorklist(BI, Worklist);
-        ++NumSimplify;
-
-        RemoveBlockIfDead(DeadSucc, Worklist, L);
-      }
       continue;
     }
   }
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 8f61ffd..9912d3d 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -170,14 +170,17 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const {
   // pessimize the llvm optimizer.
   //
   // Since we don't have perfect knowledge here, make some assumptions: assume
-  // the maximum GPR width is the same size as the pointer size and assume that
-  // this width can be stored.  If so, check to see whether we will end up
-  // actually reducing the number of stores used.
+  // the maximum GPR width is the same size as the largest legal integer
+  // size. If so, check to see whether we will end up actually reducing the
+  // number of stores used.
   unsigned Bytes = unsigned(End-Start);
-  unsigned NumPointerStores = Bytes/TD.getPointerSize();
+  unsigned MaxIntSize = TD.getLargestLegalIntTypeSize();
+  if (MaxIntSize == 0)
+    MaxIntSize = 1;
+  unsigned NumPointerStores = Bytes / MaxIntSize;
 
   // Assume the remaining bytes if any are done a byte at a time.
-  unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
+  unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize;
 
   // If we will reduce the # stores (according to this heuristic), do the
   // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
new file mode 100644
index 0000000..15cee44
--- /dev/null
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -0,0 +1,156 @@
+//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to partially inline the fast path of well-known library
+// functions, such as using square-root instructions for cases where sqrt()
+// does not need to set errno.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "partially-inline-libcalls"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+namespace {
+  class PartiallyInlineLibCalls : public FunctionPass {
+  public:
+    static char ID;
+
+    PartiallyInlineLibCalls() :
+      FunctionPass(ID) {
+      initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool runOnFunction(Function &F);
+
+  private:
+    /// Optimize calls to sqrt.
+    bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+                      BasicBlock &CurrBB, Function::iterator &BB);
+  };
+
+  char PartiallyInlineLibCalls::ID = 0;
+}
+
+INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls",
+                "Partially inline calls to library functions", false, false)
+
+void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetLibraryInfo>();
+  AU.addRequired<TargetTransformInfo>();
+  FunctionPass::getAnalysisUsage(AU);
+}
+
+bool PartiallyInlineLibCalls::runOnFunction(Function &F) {
+  bool Changed = false;
+  Function::iterator CurrBB;
+  TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>();
+  for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+    CurrBB = BB++;
+
+    for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+         II != IE; ++II) {
+      CallInst *Call = dyn_cast<CallInst>(&*II);
+      Function *CalledFunc;
+
+      if (!Call || !(CalledFunc = Call->getCalledFunction()))
+        continue;
+
+      // Skip if function either has local linkage or is not a known library
+      // function.
+      LibFunc::Func LibFunc;
+      if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
+          !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
+        continue;
+
+      switch (LibFunc) {
+      case LibFunc::sqrtf:
+      case LibFunc::sqrt:
+        if (TTI->haveFastSqrt(Call->getType()) &&
+            optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+          break;
+        continue;
+      default:
+        continue;
+      }
+
+      Changed = true;
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
+                                           Function *CalledFunc,
+                                           BasicBlock &CurrBB,
+                                           Function::iterator &BB) {
+  // There is no need to change the IR, since backend will emit sqrt
+  // instruction if the call has already been marked read-only.
+  if (Call->onlyReadsMemory())
+    return false;
+
+  // Do the following transformation:
+  //
+  // (before)
+  // dst = sqrt(src)
+  //
+  // (after)
+  // v0 = sqrt_noreadmem(src) # native sqrt instruction.
+  // if (v0 is a NaN)
+  //   v1 = sqrt(src)         # library call.
+  // dst = phi(v0, v1)
+  //
+
+  // Move all instructions following Call to newly created block JoinBB.
+  // Create phi and replace all uses.
+  BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this);
+  IRBuilder<> Builder(JoinBB, JoinBB->begin());
+  PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
+  Call->replaceAllUsesWith(Phi);
+
+  // Create basic block LibCallBB and insert a call to library function sqrt.
+  BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
+                                             CurrBB.getParent(), JoinBB);
+  Builder.SetInsertPoint(LibCallBB);
+  Instruction *LibCall = Call->clone();
+  Builder.Insert(LibCall);
+  Builder.CreateBr(JoinBB);
+
+  // Add attribute "readnone" so that backend can use a native sqrt instruction
+  // for this call. Insert a FP compare instruction and a conditional branch
+  // at the end of CurrBB.
+  Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+  CurrBB.getTerminator()->eraseFromParent();
+  Builder.SetInsertPoint(&CurrBB);
+  Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
+  Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
+
+  // Add phi operands.
+  Phi->addIncoming(Call, &CurrBB);
+  Phi->addIncoming(LibCall, LibCallBB);
+
+  BB = JoinBB;
+  return true;
+}
+
+FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
+  return new PartiallyInlineLibCalls();
+}
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 5c55143..9f3fc83 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -733,9 +733,9 @@ class AllocaPromoter : public LoadAndStorePromoter {
   SmallVector<DbgValueInst *, 4> DVIs;
 
 public:
-  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+  AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S,
                  AllocaInst &AI, DIBuilder &DIB)
-    : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
+      : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
 
   void run(const SmallVectorImpl<Instruction*> &Insts) {
     // Retain the debug information attached to the alloca for use when
@@ -762,9 +762,30 @@ public:
 
   virtual bool isInstInList(Instruction *I,
                             const SmallVectorImpl<Instruction*> &Insts) const {
+    Value *Ptr;
     if (LoadInst *LI = dyn_cast<LoadInst>(I))
-      return LI->getOperand(0) == &AI;
-    return cast<StoreInst>(I)->getPointerOperand() == &AI;
+      Ptr = LI->getOperand(0);
+    else
+      Ptr = cast<StoreInst>(I)->getPointerOperand();
+
+    // Only used to detect cycles, which will be rare and quickly found as
+    // we're walking up a chain of defs rather than down through uses.
+    SmallPtrSet<Value *, 4> Visited;
+
+    do {
+      if (Ptr == &AI)
+        return true;
+
+      if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr))
+        Ptr = BCI->getOperand(0);
+      else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
+        Ptr = GEPI->getPointerOperand();
+      else
+        return false;
+
+    } while (Visited.insert(Ptr));
+
+    return false;
   }
 
   virtual void updateDebugInfo(Instruction *Inst) const {
@@ -917,6 +938,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
                             AllocaSlices::const_iterator E,
                             uint64_t EndOffset) {
   Type *Ty = 0;
+  bool IgnoreNonIntegralTypes = false;
   for (AllocaSlices::const_iterator I = B; I != E; ++I) {
     Use *U = I->getUse();
     if (isa<IntrinsicInst>(*U->getUser()))
@@ -925,29 +947,37 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
       continue;
 
     Type *UserTy = 0;
-    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser()))
+    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
       UserTy = LI->getType();
-    else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser()))
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
       UserTy = SI->getValueOperand()->getType();
-    else
-      return 0; // Bail if we have weird uses.
+    } else {
+      IgnoreNonIntegralTypes = true; // Give up on anything but an iN type.
+      continue;
+    }
 
     if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
       // If the type is larger than the partition, skip it. We only encounter
       // this for split integer operations where we want to use the type of the
-      // entity causing the split.
-      if (ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+      // entity causing the split. Also skip if the type is not a byte width
+      // multiple.
+      if (ITy->getBitWidth() % 8 != 0 ||
+          ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
         continue;
 
       // If we have found an integer type use covering the alloca, use that
-      // regardless of the other types, as integers are often used for a
-      // "bucket
-      // of bits" type.
+      // regardless of the other types, as integers are often used for
+      // a "bucket of bits" type.
+      //
+      // NB: This *must* be the only return from inside the loop so that the
+      // order of slices doesn't impact the computed type.
       return ITy;
+    } else if (IgnoreNonIntegralTypes) {
+      continue;
     }
 
     if (Ty && Ty != UserTy)
-      return 0;
+      IgnoreNonIntegralTypes = true; // Give up on anything but an iN type.
 
     Ty = UserTy;
   }
@@ -1431,6 +1461,10 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
   if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
     return false;
 
+  // We can convert pointers to integers and vice-versa. Same for vectors
+  // of pointers and integers.
+  OldTy = OldTy->getScalarType();
+  NewTy = NewTy->getScalarType();
   if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
     if (NewTy->isPointerTy() && OldTy->isPointerTy())
       return true;
@@ -1449,21 +1483,53 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
 /// two types for viability with this routine.
 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
-                           Type *Ty) {
-  assert(canConvertValue(DL, V->getType(), Ty) &&
-         "Value not convertable to type");
-  if (V->getType() == Ty)
+                           Type *NewTy) {
+  Type *OldTy = V->getType();
+  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+  if (OldTy == NewTy)
     return V;
-  if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType()))
-    if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty))
+
+  if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
+    if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
       if (NewITy->getBitWidth() > OldITy->getBitWidth())
         return IRB.CreateZExt(V, NewITy);
-  if (V->getType()->isIntegerTy() && Ty->isPointerTy())
-    return IRB.CreateIntToPtr(V, Ty);
-  if (V->getType()->isPointerTy() && Ty->isIntegerTy())
-    return IRB.CreatePtrToInt(V, Ty);
 
-  return IRB.CreateBitCast(V, Ty);
+  // See if we need inttoptr for this type pair. A cast involving both scalars
+  // and vectors requires and additional bitcast.
+  if (OldTy->getScalarType()->isIntegerTy() &&
+      NewTy->getScalarType()->isPointerTy()) {
+    // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
+    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+
+    // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
+    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+
+    return IRB.CreateIntToPtr(V, NewTy);
+  }
+
+  // See if we need ptrtoint for this type pair. A cast involving both scalars
+  // and vectors requires and additional bitcast.
+  if (OldTy->getScalarType()->isPointerTy() &&
+      NewTy->getScalarType()->isIntegerTy()) {
+    // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
+    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+
+    // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
+    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+
+    return IRB.CreatePtrToInt(V, NewTy);
+  }
+
+  return IRB.CreateBitCast(V, NewTy);
 }
 
 /// \brief Test whether the given slice use can be promoted to a vector.
@@ -3364,12 +3430,12 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
 }
 
 static void enqueueUsersInWorklist(Instruction &I,
-                                   SmallVectorImpl<Use *> &UseWorklist,
-                                   SmallPtrSet<Use *, 8> &VisitedUses) {
+                                   SmallVectorImpl<Instruction *> &Worklist,
+                                   SmallPtrSet<Instruction *, 8> &Visited) {
   for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
        ++UI)
-    if (VisitedUses.insert(&UI.getUse()))
-      UseWorklist.push_back(&UI.getUse());
+    if (Visited.insert(cast<Instruction>(*UI)))
+      Worklist.push_back(cast<Instruction>(*UI));
 }
 
 /// \brief Promote the allocas, using the best available technique.
@@ -3388,7 +3454,7 @@ bool SROA::promoteAllocas(Function &F) {
 
   if (DT && !ForceSSAUpdater) {
     DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
-    PromoteMemToReg(PromotableAllocas, *DT, DL);
+    PromoteMemToReg(PromotableAllocas, *DT);
     PromotableAllocas.clear();
     return true;
   }
@@ -3396,29 +3462,29 @@ bool SROA::promoteAllocas(Function &F) {
   DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
   SSAUpdater SSA;
   DIBuilder DIB(*F.getParent());
-  SmallVector<Instruction*, 64> Insts;
+  SmallVector<Instruction *, 64> Insts;
 
   // We need a worklist to walk the uses of each alloca.
-  SmallVector<Use *, 8> UseWorklist;
-  SmallPtrSet<Use *, 8> VisitedUses;
+  SmallVector<Instruction *, 8> Worklist;
+  SmallPtrSet<Instruction *, 8> Visited;
   SmallVector<Instruction *, 32> DeadInsts;
 
   for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
     AllocaInst *AI = PromotableAllocas[Idx];
-    UseWorklist.clear();
-    VisitedUses.clear();
+    Insts.clear();
+    Worklist.clear();
+    Visited.clear();
 
-    enqueueUsersInWorklist(*AI, UseWorklist, VisitedUses);
+    enqueueUsersInWorklist(*AI, Worklist, Visited);
 
-    while (!UseWorklist.empty()) {
-      Use *U = UseWorklist.pop_back_val();
-      Instruction &I = *cast<Instruction>(U->getUser());
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.pop_back_val();
 
       // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
       // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
       // leading to them) here. Eventually it should use them to optimize the
       // scalar values produced.
-      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
         assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
                II->getIntrinsicID() == Intrinsic::lifetime_end);
         II->eraseFromParent();
@@ -3428,12 +3494,12 @@ bool SROA::promoteAllocas(Function &F) {
       // Push the loads and stores we find onto the list. SROA will already
       // have validated that all loads and stores are viable candidates for
       // promotion.
-      if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
         assert(LI->getType() == AI->getAllocatedType());
         Insts.push_back(LI);
         continue;
       }
-      if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
         assert(SI->getValueOperand()->getType() == AI->getAllocatedType());
         Insts.push_back(SI);
         continue;
@@ -3442,11 +3508,10 @@ bool SROA::promoteAllocas(Function &F) {
       // For everything else, we know that only no-op bitcasts and GEPs will
       // make it this far, just recurse through them and recall them for later
       // removal.
-      DeadInsts.push_back(&I);
-      enqueueUsersInWorklist(I, UseWorklist, VisitedUses);
+      DeadInsts.push_back(I);
+      enqueueUsersInWorklist(*I, Worklist, Visited);
     }
     AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
-    Insts.clear();
     while (!DeadInsts.empty())
       DeadInsts.pop_back_val()->eraseFromParent();
     AI->eraseFromParent();
diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
new file mode 100644
index 0000000..9bcd702
--- /dev/null
+++ b/lib/Transforms/Scalar/SampleProfile.cpp
@@ -0,0 +1,479 @@
+//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SampleProfileLoader transformation. This pass
+// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
+// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
+// profile information in the given profile.
+//
+// This pass generates branch weight annotations on the IR:
+//
+// - prof: Represents branch weights. This annotation is added to branches
+//      to indicate the weights of each edge coming out of the branch.
+//      The weight of each edge is the weight of the target block for
+//      that edge. The weight of a block B is computed as the maximum
+//      number of samples found in B.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sample-profile"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+// Command line option to specify the file to read samples from. This is
+// mainly used for debugging.
+static cl::opt<std::string> SampleProfileFile(
+    "sample-profile-file", cl::init(""), cl::value_desc("filename"),
+    cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
+
+namespace {
+/// \brief Sample-based profile reader.
+///
+/// Each profile contains sample counts for all the functions
+/// executed. Inside each function, statements are annotated with the
+/// collected samples on all the instructions associated with that
+/// statement.
+///
+/// For this to produce meaningful data, the program needs to be
+/// compiled with some debug information (at minimum, line numbers:
+/// -gline-tables-only). Otherwise, it will be impossible to match IR
+/// instructions to the line numbers collected by the profiler.
+///
+/// From the profile file, we are interested in collecting the
+/// following information:
+///
+/// * A list of functions included in the profile (mangled names).
+///
+/// * For each function F:
+///   1. The total number of samples collected in F.
+///
+///   2. The samples collected at each line in F. To provide some
+///      protection against source code shuffling, line numbers should
+///      be relative to the start of the function.
+class SampleProfile {
+public:
+  SampleProfile(StringRef F) : Profiles(0), Filename(F) {}
+
+  void dump();
+  void loadText();
+  void loadNative() { llvm_unreachable("not implemented"); }
+  bool emitAnnotations(Function &F);
+  void printFunctionProfile(raw_ostream &OS, StringRef FName);
+  void dumpFunctionProfile(StringRef FName);
+
+protected:
+  typedef DenseMap<uint32_t, uint32_t> BodySampleMap;
+  typedef DenseMap<BasicBlock *, uint32_t> BlockWeightMap;
+
+  /// \brief Representation of the runtime profile for a function.
+  ///
+  /// This data structure contains the runtime profile for a given
+  /// function. It contains the total number of samples collected
+  /// in the function and a map of samples collected in every statement.
+  struct FunctionProfile {
+    /// \brief Total number of samples collected inside this function.
+    ///
+    /// Samples are cumulative, they include all the samples collected
+    /// inside this function and all its inlined callees.
+    unsigned TotalSamples;
+
+    // \brief Total number of samples collected at the head of the function.
+    unsigned TotalHeadSamples;
+
+    /// \brief Map line offsets to collected samples.
+    ///
+    /// Each entry in this map contains the number of samples
+    /// collected at the corresponding line offset. All line locations
+    /// are an offset from the start of the function.
+    BodySampleMap BodySamples;
+
+    /// \brief Map basic blocks to their computed weights.
+    ///
+    /// The weight of a basic block is defined to be the maximum
+    /// of all the instruction weights in that block.
+    BlockWeightMap BlockWeights;
+  };
+
+  uint32_t getInstWeight(Instruction &I, unsigned FirstLineno,
+                         BodySampleMap &BodySamples);
+  uint32_t computeBlockWeight(BasicBlock *B, unsigned FirstLineno,
+                              BodySampleMap &BodySamples);
+
+  /// \brief Map every function to its associated profile.
+  ///
+  /// The profile of every function executed at runtime is collected
+  /// in the structure FunctionProfile. This maps function objects
+  /// to their corresponding profiles.
+  StringMap<FunctionProfile> Profiles;
+
+  /// \brief Path name to the file holding the profile data.
+  ///
+  /// The format of this file is defined by each profiler
+  /// independently. If possible, the profiler should have a text
+  /// version of the profile format to be used in constructing test
+  /// cases and debugging.
+  StringRef Filename;
+};
+
+/// \brief Loader class for text-based profiles.
+///
+/// This class defines a simple interface to read text files containing
+/// profiles. It keeps track of line number information and location of
+/// the file pointer. Users of this class are responsible for actually
+/// parsing the lines returned by the readLine function.
+///
+/// TODO - This does not really belong here. It is a generic text file
+/// reader. It should be moved to the Support library and made more general.
+class ExternalProfileTextLoader {
+public:
+  ExternalProfileTextLoader(StringRef F) : Filename(F) {
+    error_code EC;
+    EC = MemoryBuffer::getFile(Filename, Buffer);
+    if (EC)
+      report_fatal_error("Could not open profile file " + Filename + ": " +
+                         EC.message());
+    FP = Buffer->getBufferStart();
+    Lineno = 0;
+  }
+
+  /// \brief Read a line from the mapped file.
+  StringRef readLine() {
+    size_t Length = 0;
+    const char *start = FP;
+    while (FP != Buffer->getBufferEnd() && *FP != '\n') {
+      Length++;
+      FP++;
+    }
+    if (FP != Buffer->getBufferEnd())
+      FP++;
+    Lineno++;
+    return StringRef(start, Length);
+  }
+
+  /// \brief Return true, if we've reached EOF.
+  bool atEOF() const { return FP == Buffer->getBufferEnd(); }
+
+  /// \brief Report a parse error message and stop compilation.
+  void reportParseError(Twine Msg) const {
+    report_fatal_error(Filename + ":" + Twine(Lineno) + ": " + Msg + "\n");
+  }
+
+private:
+  /// \brief Memory buffer holding the text file.
+  OwningPtr<MemoryBuffer> Buffer;
+
+  /// \brief Current position into the memory buffer.
+  const char *FP;
+
+  /// \brief Current line number.
+  int64_t Lineno;
+
+  /// \brief Path name where to the profile file.
+  StringRef Filename;
+};
+
+/// \brief Sample profile pass.
+///
+/// This pass reads profile data from the file specified by
+/// -sample-profile-file and annotates every affected function with the
+/// profile information found in that file.
+class SampleProfileLoader : public FunctionPass {
+public:
+  // Class identification, replacement for typeinfo
+  static char ID;
+
+  SampleProfileLoader(StringRef Name = SampleProfileFile)
+      : FunctionPass(ID), Profiler(0), Filename(Name) {
+    initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual bool doInitialization(Module &M);
+
+  void dump() { Profiler->dump(); }
+
+  virtual const char *getPassName() const { return "Sample profile pass"; }
+
+  virtual bool runOnFunction(Function &F);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesCFG();
+  }
+
+protected:
+  /// \brief Profile reader object.
+  OwningPtr<SampleProfile> Profiler;
+
+  /// \brief Name of the profile file to load.
+  StringRef Filename;
+};
+}
+
+/// \brief Print the function profile for \p FName on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param FName Name of the function to print.
+void SampleProfile::printFunctionProfile(raw_ostream &OS, StringRef FName) {
+  FunctionProfile FProfile = Profiles[FName];
+  OS << "Function: " << FName << ", " << FProfile.TotalSamples << ", "
+     << FProfile.TotalHeadSamples << ", " << FProfile.BodySamples.size()
+     << " sampled lines\n";
+  for (BodySampleMap::const_iterator SI = FProfile.BodySamples.begin(),
+                                     SE = FProfile.BodySamples.end();
+       SI != SE; ++SI)
+    OS << "\tline offset: " << SI->first
+       << ", number of samples: " << SI->second << "\n";
+  OS << "\n";
+}
+
+/// \brief Dump the function profile for \p FName.
+///
+/// \param FName Name of the function to print.
+void SampleProfile::dumpFunctionProfile(StringRef FName) {
+  printFunctionProfile(dbgs(), FName);
+}
+
+/// \brief Dump all the function profiles found.
+void SampleProfile::dump() {
+  for (StringMap<FunctionProfile>::const_iterator I = Profiles.begin(),
+                                                  E = Profiles.end();
+       I != E; ++I)
+    dumpFunctionProfile(I->getKey());
+}
+
+/// \brief Load samples from a text file.
+///
+/// The file is divided in two segments:
+///
+/// Symbol table (represented with the string "symbol table")
+///    Number of symbols in the table
+///    symbol 1
+///    symbol 2
+///    ...
+///    symbol N
+///
+/// Function body profiles
+///    function1:total_samples:total_head_samples:number_of_locations
+///    location_offset_1: number_of_samples
+///    location_offset_2: number_of_samples
+///    ...
+///    location_offset_N: number_of_samples
+///
+/// Function names must be mangled in order for the profile loader to
+/// match them in the current translation unit.
+///
+/// Since this is a flat profile, a function that shows up more than
+/// once gets all its samples aggregated across all its instances.
+/// TODO - flat profiles are too imprecise to provide good optimization
+/// opportunities. Convert them to context-sensitive profile.
+///
+/// This textual representation is useful to generate unit tests and
+/// for debugging purposes, but it should not be used to generate
+/// profiles for large programs, as the representation is extremely
+/// inefficient.
+void SampleProfile::loadText() {
+  ExternalProfileTextLoader Loader(Filename);
+
+  // Read the symbol table.
+  StringRef Line = Loader.readLine();
+  if (Line != "symbol table")
+    Loader.reportParseError("Expected 'symbol table', found " + Line);
+  int NumSymbols;
+  Line = Loader.readLine();
+  if (Line.getAsInteger(10, NumSymbols))
+    Loader.reportParseError("Expected a number, found " + Line);
+  for (int I = 0; I < NumSymbols; I++) {
+    StringRef FName = Loader.readLine();
+    FunctionProfile &FProfile = Profiles[FName];
+    FProfile.BodySamples.clear();
+    FProfile.TotalSamples = 0;
+    FProfile.TotalHeadSamples = 0;
+  }
+
+  // Read the profile of each function. Since each function may be
+  // mentioned more than once, and we are collecting flat profiles,
+  // accumulate samples as we parse them.
+  Regex HeadRE("^([^:]+):([0-9]+):([0-9]+):([0-9]+)$");
+  Regex LineSample("^([0-9]+): ([0-9]+)$");
+  while (!Loader.atEOF()) {
+    SmallVector<StringRef, 4> Matches;
+    Line = Loader.readLine();
+    if (!HeadRE.match(Line, &Matches))
+      Loader.reportParseError("Expected 'mangled_name:NUM:NUM:NUM', found " +
+                              Line);
+    assert(Matches.size() == 5);
+    StringRef FName = Matches[1];
+    unsigned NumSamples, NumHeadSamples, NumSampledLines;
+    Matches[2].getAsInteger(10, NumSamples);
+    Matches[3].getAsInteger(10, NumHeadSamples);
+    Matches[4].getAsInteger(10, NumSampledLines);
+    FunctionProfile &FProfile = Profiles[FName];
+    FProfile.TotalSamples += NumSamples;
+    FProfile.TotalHeadSamples += NumHeadSamples;
+    BodySampleMap &SampleMap = FProfile.BodySamples;
+    unsigned I;
+    for (I = 0; I < NumSampledLines && !Loader.atEOF(); I++) {
+      Line = Loader.readLine();
+      if (!LineSample.match(Line, &Matches))
+        Loader.reportParseError("Expected 'NUM: NUM', found " + Line);
+      assert(Matches.size() == 3);
+      unsigned LineOffset, NumSamples;
+      Matches[1].getAsInteger(10, LineOffset);
+      Matches[2].getAsInteger(10, NumSamples);
+      SampleMap[LineOffset] += NumSamples;
+    }
+
+    if (I < NumSampledLines)
+      Loader.reportParseError("Unexpected end of file");
+  }
+}
+
+/// \brief Get the weight for an instruction.
+///
+/// The "weight" of an instruction \p Inst is the number of samples
+/// collected on that instruction at runtime. To retrieve it, we
+/// need to compute the line number of \p Inst relative to the start of its
+/// function. We use \p FirstLineno to compute the offset. We then
+/// look up the samples collected for \p Inst using \p BodySamples.
+///
+/// \param Inst Instruction to query.
+/// \param FirstLineno Line number of the first instruction in the function.
+/// \param BodySamples Map of relative source line locations to samples.
+///
+/// \returns The profiled weight of I.
+uint32_t SampleProfile::getInstWeight(Instruction &Inst, unsigned FirstLineno,
+                                      BodySampleMap &BodySamples) {
+  unsigned LOffset = Inst.getDebugLoc().getLine() - FirstLineno + 1;
+  return BodySamples.lookup(LOffset);
+}
+
+/// \brief Compute the weight of a basic block.
+///
+/// The weight of basic block \p B is the maximum weight of all the
+/// instructions in B.
+///
+/// \param B The basic block to query.
+/// \param FirstLineno The line number for the first line in the
+///     function holding B.
+/// \param BodySamples The map containing all the samples collected in that
+///     function.
+///
+/// \returns The computed weight of B.
+uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno,
+                                           BodySampleMap &BodySamples) {
+  // If we've computed B's weight before, return it.
+  Function *F = B->getParent();
+  FunctionProfile &FProfile = Profiles[F->getName()];
+  std::pair<BlockWeightMap::iterator, bool> Entry =
+      FProfile.BlockWeights.insert(std::make_pair(B, 0));
+  if (!Entry.second)
+    return Entry.first->second;
+
+  // Otherwise, compute and cache B's weight.
+  uint32_t Weight = 0;
+  for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) {
+    uint32_t InstWeight = getInstWeight(*I, FirstLineno, BodySamples);
+    if (InstWeight > Weight)
+      Weight = InstWeight;
+  }
+  Entry.first->second = Weight;
+  return Weight;
+}
+
+/// \brief Generate branch weight metadata for all branches in \p F.
+///
+/// For every branch instruction B in \p F, we compute the weight of the
+/// target block for each of the edges out of B. This is the weight
+/// that we associate with that branch.
+///
+/// TODO - This weight assignment will most likely be wrong if the
+/// target branch has more than two predecessors. This needs to be done
+/// using some form of flow propagation.
+///
+/// Once all the branch weights are computed, we emit the MD_prof
+/// metadata on B using the computed values.
+///
+/// \param F The function to query.
+bool SampleProfile::emitAnnotations(Function &F) {
+  bool Changed = false;
+  FunctionProfile &FProfile = Profiles[F.getName()];
+  unsigned FirstLineno = inst_begin(F)->getDebugLoc().getLine();
+  MDBuilder MDB(F.getContext());
+
+  // Clear the block weights cache.
+  FProfile.BlockWeights.clear();
+
+  // When we find a branch instruction: For each edge E out of the branch,
+  // the weight of E is the weight of the target block.
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    BasicBlock *B = I;
+    TerminatorInst *TI = B->getTerminator();
+    if (TI->getNumSuccessors() == 1)
+      continue;
+    if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
+      continue;
+
+    SmallVector<uint32_t, 4> Weights;
+    unsigned NSuccs = TI->getNumSuccessors();
+    for (unsigned I = 0; I < NSuccs; ++I) {
+      BasicBlock *Succ = TI->getSuccessor(I);
+      uint32_t Weight =
+          computeBlockWeight(Succ, FirstLineno, FProfile.BodySamples);
+      Weights.push_back(Weight);
+    }
+
+    TI->setMetadata(llvm::LLVMContext::MD_prof,
+                    MDB.createBranchWeights(Weights));
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+char SampleProfileLoader::ID = 0;
+INITIALIZE_PASS(SampleProfileLoader, "sample-profile", "Sample Profile loader",
+                false, false)
+
+bool SampleProfileLoader::runOnFunction(Function &F) {
+  return Profiler->emitAnnotations(F);
+}
+
+bool SampleProfileLoader::doInitialization(Module &M) {
+  Profiler.reset(new SampleProfile(Filename));
+  Profiler->loadText();
+  return true;
+}
+
+FunctionPass *llvm::createSampleProfileLoaderPass() {
+  return new SampleProfileLoader(SampleProfileFile);
+}
+
+FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) {
+  return new SampleProfileLoader(Name);
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 758334d..857597e 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
 /// ScalarOpts library.
 void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCEPass(Registry);
-  initializeBlockPlacementPass(Registry);
+  initializeSampleProfileLoaderPass(Registry);
   initializeCodeGenPreparePass(Registry);
   initializeConstantPropagationPass(Registry);
   initializeCorrelatedValuePropagationPass(Registry);
@@ -44,12 +44,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLoopInstSimplifyPass(Registry);
   initializeLoopRotatePass(Registry);
   initializeLoopStrengthReducePass(Registry);
+  initializeLoopRerollPass(Registry);
   initializeLoopUnrollPass(Registry);
   initializeLoopUnswitchPass(Registry);
   initializeLoopIdiomRecognizePass(Registry);
   initializeLowerAtomicPass(Registry);
   initializeLowerExpectIntrinsicPass(Registry);
   initializeMemCpyOptPass(Registry);
+  initializePartiallyInlineLibCallsPass(Registry);
   initializeReassociatePass(Registry);
   initializeRegToMemPass(Registry);
   initializeSCCPPass(Registry);
@@ -111,6 +113,10 @@ void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createLoopRotatePass());
 }
 
+void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopRerollPass());
+}
+
 void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createLoopUnrollPass());
 }
@@ -123,6 +129,10 @@ void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createMemCpyOptPass());
 }
 
+void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createPartiallyInlineLibCallsPass());
+}
+
 void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createPromoteMemoryToRegisterPass());
 }
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 73b2edf..57b290e 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -963,7 +963,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
   if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
     SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth));
   else if (SV->getType()->isPointerTy())
-    SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()));
+    SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType()));
 
   // Zero extend or truncate the value if needed.
   if (SV->getType() != AllocaType) {
@@ -1426,7 +1426,7 @@ bool SROA::performPromotion(Function &F) {
     if (Allocas.empty()) break;
 
     if (HasDomTree)
-      PromoteMemToReg(Allocas, *DT, TD);
+      PromoteMemToReg(Allocas, *DT);
     else {
       SSAUpdater SSA;
       for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 6d05640..8371f6d 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -66,161 +66,6 @@ FunctionPass *llvm::createCFGSimplificationPass() {
   return new CFGSimplifyPass();
 }
 
-/// changeToUnreachable - Insert an unreachable instruction before the specified
-/// instruction, making it and the rest of the code in the block dead.
-static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
-  BasicBlock *BB = I->getParent();
-  // Loop over all of the successors, removing BB's entry from any PHI
-  // nodes.
-  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-    (*SI)->removePredecessor(BB);
-
-  // Insert a call to llvm.trap right before this.  This turns the undefined
-  // behavior into a hard fail instead of falling through into random code.
-  if (UseLLVMTrap) {
-    Function *TrapFn =
-      Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
-    CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
-    CallTrap->setDebugLoc(I->getDebugLoc());
-  }
-  new UnreachableInst(I->getContext(), I);
-
-  // All instructions after this are dead.
-  BasicBlock::iterator BBI = I, BBE = BB->end();
-  while (BBI != BBE) {
-    if (!BBI->use_empty())
-      BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
-    BB->getInstList().erase(BBI++);
-  }
-}
-
-/// changeToCall - Convert the specified invoke into a normal call.
-static void changeToCall(InvokeInst *II) {
-  SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
-  CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
-  NewCall->takeName(II);
-  NewCall->setCallingConv(II->getCallingConv());
-  NewCall->setAttributes(II->getAttributes());
-  NewCall->setDebugLoc(II->getDebugLoc());
-  II->replaceAllUsesWith(NewCall);
-
-  // Follow the call by a branch to the normal destination.
-  BranchInst::Create(II->getNormalDest(), II);
-
-  // Update PHI nodes in the unwind destination
-  II->getUnwindDest()->removePredecessor(II->getParent());
-  II->eraseFromParent();
-}
-
-static bool markAliveBlocks(BasicBlock *BB,
-                            SmallPtrSet<BasicBlock*, 128> &Reachable) {
-
-  SmallVector<BasicBlock*, 128> Worklist;
-  Worklist.push_back(BB);
-  Reachable.insert(BB);
-  bool Changed = false;
-  do {
-    BB = Worklist.pop_back_val();
-
-    // Do a quick scan of the basic block, turning any obviously unreachable
-    // instructions into LLVM unreachable insts.  The instruction combining pass
-    // canonicalizes unreachable insts into stores to null or undef.
-    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
-      if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
-        if (CI->doesNotReturn()) {
-          // If we found a call to a no-return function, insert an unreachable
-          // instruction after it.  Make sure there isn't *already* one there
-          // though.
-          ++BBI;
-          if (!isa<UnreachableInst>(BBI)) {
-            // Don't insert a call to llvm.trap right before the unreachable.
-            changeToUnreachable(BBI, false);
-            Changed = true;
-          }
-          break;
-        }
-      }
-
-      // Store to undef and store to null are undefined and used to signal that
-      // they should be changed to unreachable by passes that can't modify the
-      // CFG.
-      if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
-        // Don't touch volatile stores.
-        if (SI->isVolatile()) continue;
-
-        Value *Ptr = SI->getOperand(1);
-
-        if (isa<UndefValue>(Ptr) ||
-            (isa<ConstantPointerNull>(Ptr) &&
-             SI->getPointerAddressSpace() == 0)) {
-          changeToUnreachable(SI, true);
-          Changed = true;
-          break;
-        }
-      }
-    }
-
-    // Turn invokes that call 'nounwind' functions into ordinary calls.
-    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
-      Value *Callee = II->getCalledValue();
-      if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
-        changeToUnreachable(II, true);
-        Changed = true;
-      } else if (II->doesNotThrow()) {
-        if (II->use_empty() && II->onlyReadsMemory()) {
-          // jump to the normal destination branch.
-          BranchInst::Create(II->getNormalDest(), II);
-          II->getUnwindDest()->removePredecessor(II->getParent());
-          II->eraseFromParent();
-        } else
-          changeToCall(II);
-        Changed = true;
-      }
-    }
-
-    Changed |= ConstantFoldTerminator(BB, true);
-    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-      if (Reachable.insert(*SI))
-        Worklist.push_back(*SI);
-  } while (!Worklist.empty());
-  return Changed;
-}
-
-/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
-/// if they are in a dead cycle.  Return true if a change was made, false
-/// otherwise.
-static bool removeUnreachableBlocksFromFn(Function &F) {
-  SmallPtrSet<BasicBlock*, 128> Reachable;
-  bool Changed = markAliveBlocks(F.begin(), Reachable);
-
-  // If there are unreachable blocks in the CFG...
-  if (Reachable.size() == F.size())
-    return Changed;
-
-  assert(Reachable.size() < F.size());
-  NumSimpl += F.size()-Reachable.size();
-
-  // Loop over all of the basic blocks that are not reachable, dropping all of
-  // their internal references...
-  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
-    if (Reachable.count(BB))
-      continue;
-
-    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-      if (Reachable.count(*SI))
-        (*SI)->removePredecessor(BB);
-    BB->dropAllReferences();
-  }
-
-  for (Function::iterator I = ++F.begin(); I != F.end();)
-    if (!Reachable.count(I))
-      I = F.getBasicBlockList().erase(I);
-    else
-      ++I;
-
-  return true;
-}
-
 /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
 /// node) return blocks, merge them together to promote recursive block merging.
 static bool mergeEmptyReturnBlocks(Function &F) {
@@ -325,7 +170,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
 bool CFGSimplifyPass::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
   const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
-  bool EverChanged = removeUnreachableBlocksFromFn(F);
+  bool EverChanged = removeUnreachableBlocks(F);
   EverChanged |= mergeEmptyReturnBlocks(F);
   EverChanged |= iterativelySimplifyCFG(F, TTI, TD);
 
@@ -333,16 +178,16 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
   if (!EverChanged) return false;
 
   // iterativelySimplifyCFG can (rarely) make some loops dead.  If this happens,
-  // removeUnreachableBlocksFromFn is needed to nuke them, which means we should
+  // removeUnreachableBlocks is needed to nuke them, which means we should
   // iterate between the two optimizations.  We structure the code like this to
   // avoid reruning iterativelySimplifyCFG if the second pass of
-  // removeUnreachableBlocksFromFn doesn't do anything.
-  if (!removeUnreachableBlocksFromFn(F))
+  // removeUnreachableBlocks doesn't do anything.
+  if (!removeUnreachableBlocks(F))
     return true;
 
   do {
     EverChanged = iterativelySimplifyCFG(F, TTI, TD);
-    EverChanged |= removeUnreachableBlocksFromFn(F);
+    EverChanged |= removeUnreachableBlocks(F);
   } while (EverChanged);
 
   return true;
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index bb6f163..5045ff8 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -231,7 +231,7 @@ public:
 
   StructurizeCFG() :
     RegionPass(ID) {
-    initializeRegionInfoPass(*PassRegistry::getPassRegistry());
+    initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
   }
 
   using Pass::doInitialization;
@@ -244,6 +244,7 @@ public:
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTree>();
     AU.addPreserved<DominatorTree>();
     RegionPass::getAnalysisUsage(AU);
@@ -256,6 +257,7 @@ char StructurizeCFG::ID = 0;
 
 INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
                       false, false)
+INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
 INITIALIZE_PASS_DEPENDENCY(RegionInfo)
 INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
@@ -321,21 +323,32 @@ Value *StructurizeCFG::invert(Value *Condition) {
   if (match(Condition, m_Not(m_Value(Condition))))
     return Condition;
 
-  // Third: Check all the users for an invert
-  BasicBlock *Parent = cast<Instruction>(Condition)->getParent();
-  for (Value::use_iterator I = Condition->use_begin(),
-       E = Condition->use_end(); I != E; ++I) {
+  if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
+    // Third: Check all the users for an invert
+    BasicBlock *Parent = Inst->getParent();
+    for (Value::use_iterator I = Condition->use_begin(),
+           E = Condition->use_end(); I != E; ++I) {
 
-    Instruction *User = dyn_cast<Instruction>(*I);
-    if (!User || User->getParent() != Parent)
-      continue;
+      Instruction *User = dyn_cast<Instruction>(*I);
+      if (!User || User->getParent() != Parent)
+        continue;
+
+      if (match(*I, m_Not(m_Specific(Condition))))
+        return *I;
+    }
 
-    if (match(*I, m_Not(m_Specific(Condition))))
-      return *I;
+    // Last option: Create a new instruction
+    return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
   }
 
-  // Last option: Create a new instruction
-  return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
+  if (Argument *Arg = dyn_cast<Argument>(Condition)) {
+    BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock();
+    return BinaryOperator::CreateNot(Condition,
+                                     Arg->getName() + ".inv",
+                                     EntryBlock.getTerminator());
+  }
+
+  llvm_unreachable("Unhandled condition to invert");
 }
 
 /// \brief Build the condition for one edge
@@ -766,6 +779,20 @@ void StructurizeCFG::handleLoops(bool ExitUseAllowed,
     handleLoops(false, LoopEnd);
   }
 
+  // If the start of the loop is the entry block, we can't branch to it so
+  // insert a new dummy entry block.
+  Function *LoopFunc = LoopStart->getParent();
+  if (LoopStart == &LoopFunc->getEntryBlock()) {
+    LoopStart->setName("entry.orig");
+
+    BasicBlock *NewEntry =
+      BasicBlock::Create(LoopStart->getContext(),
+                         "entry",
+                         LoopFunc,
+                         LoopStart);
+    BranchInst::Create(LoopStart, NewEntry);
+  }
+
   // Create an extra loop end node
   LoopEnd = needPrefix(false);
   BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index e17a416..12de9ee 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -248,7 +248,6 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
 
   // If the edge isn't critical, then BB has a single successor or Succ has a
   // single pred.  Split the block.
-  BasicBlock::iterator SplitPoint;
   if (BasicBlock *SP = Succ->getSinglePredecessor()) {
     // If the successor only has a single pred, split the top of the successor
     // block.
@@ -401,8 +400,12 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
       // If all incoming values for the new PHI would be the same, just don't
       // make a new PHI.  Instead, just remove the incoming values from the old
       // PHI.
-      for (unsigned i = 0, e = Preds.size(); i != e; ++i)
-        PN->removeIncomingValue(Preds[i], false);
+      for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+        // Explicitly check the BB index here to handle duplicates in Preds.
+        int Idx = PN->getBasicBlockIndex(Preds[i]);
+        if (Idx >= 0)
+          PN->removeIncomingValue(Idx, false);
+      }
     } else {
       // If the values coming into the block are not the same, we need a PHI.
       // Create the new PHI node, insert it into NewBB at the end of the block
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 8f3ff96..0e7f7f7 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -22,7 +22,6 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Type.h"
@@ -45,7 +44,6 @@ namespace {
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addPreserved<DominatorTree>();
       AU.addPreserved<LoopInfo>();
-      AU.addPreserved<ProfileInfo>();
 
       // No loop canonicalization guarantees are broken by this pass.
       AU.addPreservedID(LoopSimplifyID);
@@ -213,10 +211,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
 
   DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>();
   LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
-  ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();
 
   // If we have nothing to update, just return.
-  if (DT == 0 && LI == 0 && PI == 0)
+  if (DT == 0 && LI == 0)
     return NewBB;
 
   // Now update analysis information.  Since the only predecessor of NewBB is
@@ -369,9 +366,5 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
     }
   }
 
-  // Update ProfileInfo if it is around.
-  if (PI)
-    PI->splitEdge(TIBB, DestBB, NewBB, MergeIdenticalEdges);
-
   return NewBB;
 }
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 3648fd6..5afd6b8 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(LLVMTransformUtils
   CmpInstAnalysis.cpp
   CodeExtractor.cpp
   DemoteRegToStack.cpp
+  GlobalStatus.cpp
   InlineFunction.cpp
   InstructionNamer.cpp
   IntegerDivision.cpp
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 82013f9..6f00864 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -665,8 +665,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
     TheSwitch->setCondition(call);
     TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
     // Remove redundant case
-    SwitchInst::CaseIt ToBeRemoved(TheSwitch, NumExitBlocks-1);
-    TheSwitch->removeCase(ToBeRemoved);
+    TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1));
     break;
   }
 }
diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp
index 9cbe15d..1da226b 100644
--- a/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/lib/Transforms/Utils/FlattenCFG.cpp
@@ -266,8 +266,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder,
   BasicBlock *CB;
   BranchInst *PBI = dyn_cast<BranchInst>(FirstCondBlock->getTerminator());
   bool Iteration = true;
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  IRBuilder<>::InsertPointGuard Guard(Builder);
   Value *PC = PBI->getCondition();
 
   do {
@@ -298,7 +297,6 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder,
     new UnreachableInst(CB->getContext(), CB);
   } while (Iteration);
 
-  Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
   DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
   return true;
 }
@@ -372,7 +370,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
 
 /// Check whether \param BB is the merge block of a if-region.  If yes, check
 /// whether there exists an adjacent if-region upstream, the two if-regions
-/// contain identical instuctions and can be legally merged.  \returns true if
+/// contain identical instructions and can be legally merged.  \returns true if
 /// the two if-regions are merged.
 ///
 /// From:
diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp
new file mode 100644
index 0000000..5f0a563
--- /dev/null
+++ b/lib/Transforms/Utils/GlobalStatus.cpp
@@ -0,0 +1,183 @@
+//===-- GlobalStatus.cpp - Compute status info for globals -----------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+/// Return the stronger of the two ordering. If the two orderings are acquire
+/// and release, then return AcquireRelease.
+///
+static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
+  if (X == Acquire && Y == Release)
+    return AcquireRelease;
+  if (Y == Acquire && X == Release)
+    return AcquireRelease;
+  return (AtomicOrdering)std::max(X, Y);
+}
+
+/// It is safe to destroy a constant iff it is only used by constants itself.
+/// Note that constants cannot be cyclic, so this test is pretty easy to
+/// implement recursively.
+///
+bool llvm::isSafeToDestroyConstant(const Constant *C) {
+  if (isa<GlobalValue>(C))
+    return false;
+
+  for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E;
+       ++UI)
+    if (const Constant *CU = dyn_cast<Constant>(*UI)) {
+      if (!isSafeToDestroyConstant(CU))
+        return false;
+    } else
+      return false;
+  return true;
+}
+
+static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
+                             SmallPtrSet<const PHINode *, 16> &PhiUsers) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
+       ++UI) {
+    const User *U = *UI;
+    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+      GS.HasNonInstructionUser = true;
+
+      // If the result of the constantexpr isn't pointer type, then we won't
+      // know to expect it in various places.  Just reject early.
+      if (!isa<PointerType>(CE->getType()))
+        return true;
+
+      if (analyzeGlobalAux(CE, GS, PhiUsers))
+        return true;
+    } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
+      if (!GS.HasMultipleAccessingFunctions) {
+        const Function *F = I->getParent()->getParent();
+        if (GS.AccessingFunction == 0)
+          GS.AccessingFunction = F;
+        else if (GS.AccessingFunction != F)
+          GS.HasMultipleAccessingFunctions = true;
+      }
+      if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        GS.IsLoaded = true;
+        // Don't hack on volatile loads.
+        if (LI->isVolatile())
+          return true;
+        GS.Ordering = strongerOrdering(GS.Ordering, LI->getOrdering());
+      } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // Don't allow a store OF the address, only stores TO the address.
+        if (SI->getOperand(0) == V)
+          return true;
+
+        // Don't hack on volatile stores.
+        if (SI->isVolatile())
+          return true;
+
+        GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering());
+
+        // If this is a direct store to the global (i.e., the global is a scalar
+        // value, not an aggregate), keep more specific information about
+        // stores.
+        if (GS.StoredType != GlobalStatus::Stored) {
+          if (const GlobalVariable *GV =
+                  dyn_cast<GlobalVariable>(SI->getOperand(1))) {
+            Value *StoredVal = SI->getOperand(0);
+
+            if (Constant *C = dyn_cast<Constant>(StoredVal)) {
+              if (C->isThreadDependent()) {
+                // The stored value changes between threads; don't track it.
+                return true;
+              }
+            }
+
+            if (StoredVal == GV->getInitializer()) {
+              if (GS.StoredType < GlobalStatus::InitializerStored)
+                GS.StoredType = GlobalStatus::InitializerStored;
+            } else if (isa<LoadInst>(StoredVal) &&
+                       cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
+              if (GS.StoredType < GlobalStatus::InitializerStored)
+                GS.StoredType = GlobalStatus::InitializerStored;
+            } else if (GS.StoredType < GlobalStatus::StoredOnce) {
+              GS.StoredType = GlobalStatus::StoredOnce;
+              GS.StoredOnceValue = StoredVal;
+            } else if (GS.StoredType == GlobalStatus::StoredOnce &&
+                       GS.StoredOnceValue == StoredVal) {
+              // noop.
+            } else {
+              GS.StoredType = GlobalStatus::Stored;
+            }
+          } else {
+            GS.StoredType = GlobalStatus::Stored;
+          }
+        }
+      } else if (isa<BitCastInst>(I)) {
+        if (analyzeGlobalAux(I, GS, PhiUsers))
+          return true;
+      } else if (isa<GetElementPtrInst>(I)) {
+        if (analyzeGlobalAux(I, GS, PhiUsers))
+          return true;
+      } else if (isa<SelectInst>(I)) {
+        if (analyzeGlobalAux(I, GS, PhiUsers))
+          return true;
+      } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
+        // PHI nodes we can check just like select or GEP instructions, but we
+        // have to be careful about infinite recursion.
+        if (PhiUsers.insert(PN)) // Not already visited.
+          if (analyzeGlobalAux(I, GS, PhiUsers))
+            return true;
+      } else if (isa<CmpInst>(I)) {
+        GS.IsCompared = true;
+      } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+        if (MTI->isVolatile())
+          return true;
+        if (MTI->getArgOperand(0) == V)
+          GS.StoredType = GlobalStatus::Stored;
+        if (MTI->getArgOperand(1) == V)
+          GS.IsLoaded = true;
+      } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+        assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
+        if (MSI->isVolatile())
+          return true;
+        GS.StoredType = GlobalStatus::Stored;
+      } else if (ImmutableCallSite C = I) {
+        if (!C.isCallee(UI))
+          return true;
+        GS.IsLoaded = true;
+      } else {
+        return true; // Any other non-load instruction might take address!
+      }
+    } else if (const Constant *C = dyn_cast<Constant>(U)) {
+      GS.HasNonInstructionUser = true;
+      // We might have a dead and dangling constant hanging off of here.
+      if (!isSafeToDestroyConstant(C))
+        return true;
+    } else {
+      GS.HasNonInstructionUser = true;
+      // Otherwise must be some other user.
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
+  SmallPtrSet<const PHINode *, 16> PhiUsers;
+  return analyzeGlobalAux(V, GS, PhiUsers);
+}
+
+GlobalStatus::GlobalStatus()
+    : IsCompared(false), IsLoaded(false), StoredType(NotStored),
+      StoredOnceValue(0), AccessingFunction(0),
+      HasMultipleAccessingFunctions(false), HasNonInstructionUser(false),
+      Ordering(NotAtomic) {}
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index dabb67b..d021bce 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -193,7 +193,8 @@ static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
     CallInst *CI = dyn_cast<CallInst>(I);
 
     // If this call cannot unwind, don't convert it to an invoke.
-    if (!CI || CI->doesNotThrow())
+    // Inline asm calls cannot throw.
+    if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))
       continue;
 
     // Convert this function call into an invoke instruction.  First, split the
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index 2d1b166..f15e8d5 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -55,7 +55,6 @@ namespace {
     DominatorTree *DT;
     LoopInfo *LI;
     ScalarEvolution *SE;
-    std::vector<BasicBlock*> LoopBlocks;
     PredIteratorCache PredCache;
     Loop *L;
     
@@ -82,11 +81,6 @@ namespace {
       // Check the special guarantees that LCSSA makes.
       assert(L->isLCSSAForm(*DT) && "LCSSA form not preserved!");
     }
-
-    /// inLoop - returns true if the given block is within the current loop
-    bool inLoop(BasicBlock *B) const {
-      return std::binary_search(LoopBlocks.begin(), LoopBlocks.end(), B);
-    }
   };
 }
   
@@ -129,11 +123,6 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) {
   if (ExitBlocks.empty())
     return false;
   
-  // Speed up queries by creating a sorted vector of blocks.
-  LoopBlocks.clear();
-  LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
-  array_pod_sort(LoopBlocks.begin(), LoopBlocks.end());
-  
   // Look at all the instructions in the loop, checking to see if they have uses
   // outside the loop.  If so, rewrite those uses.
   bool MadeChange = false;
@@ -198,7 +187,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
     if (PHINode *PN = dyn_cast<PHINode>(U))
       UserBB = PN->getIncomingBlock(UI);
     
-    if (InstBB != UserBB && !inLoop(UserBB))
+    if (InstBB != UserBB && !L->contains(UserBB))
       UsesToRewrite.push_back(&UI.getUse());
   }
 
@@ -244,7 +233,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
       // If the exit block has a predecessor not within the loop, arrange for
       // the incoming value use corresponding to that predecessor to be
       // rewritten in terms of a different LCSSA PHI.
-      if (!inLoop(*PI))
+      if (!L->contains(*PI))
         UsesToRewrite.push_back(
           &PN->getOperandUse(
             PN->getOperandNumForIncomingValue(PN->getNumIncomingValues()-1)));
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 08e1808..2768041 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -16,10 +16,10 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/DIBuilder.h"
 #include "llvm/DebugInfo.h"
@@ -43,6 +43,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
+
 //===----------------------------------------------------------------------===//
 //  Local constant propagation.
 //
@@ -193,33 +195,28 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
       // Otherwise, we can fold this switch into a conditional branch
       // instruction if it has only one non-default destination.
       SwitchInst::CaseIt FirstCase = SI->case_begin();
-      IntegersSubset& Case = FirstCase.getCaseValueEx();
-      if (Case.isSingleNumber()) {
-        // FIXME: Currently work with ConstantInt based numbers.
-        Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
-             Case.getSingleNumber(0).toConstantInt(),
-            "cond");
-
-        // Insert the new branch.
-        BranchInst *NewBr = Builder.CreateCondBr(Cond,
-                                FirstCase.getCaseSuccessor(),
-                                SI->getDefaultDest());
-        MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
-        if (MD && MD->getNumOperands() == 3) {
-          ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
-          ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
-          assert(SICase && SIDef);
-          // The TrueWeight should be the weight for the single case of SI.
-          NewBr->setMetadata(LLVMContext::MD_prof,
-                 MDBuilder(BB->getContext()).
-                 createBranchWeights(SICase->getValue().getZExtValue(),
-                                     SIDef->getValue().getZExtValue()));
-        }
+      Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
+          FirstCase.getCaseValue(), "cond");
 
-        // Delete the old switch.
-        SI->eraseFromParent();
-        return true;
+      // Insert the new branch.
+      BranchInst *NewBr = Builder.CreateCondBr(Cond,
+                                               FirstCase.getCaseSuccessor(),
+                                               SI->getDefaultDest());
+      MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+      if (MD && MD->getNumOperands() == 3) {
+        ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
+        ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
+        assert(SICase && SIDef);
+        // The TrueWeight should be the weight for the single case of SI.
+        NewBr->setMetadata(LLVMContext::MD_prof,
+                        MDBuilder(BB->getContext()).
+                        createBranchWeights(SICase->getValue().getZExtValue(),
+                                            SIDef->getValue().getZExtValue()));
       }
+
+      // Delete the old switch.
+      SI->eraseFromParent();
+      return true;
     }
     return false;
   }
@@ -415,7 +412,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD,
     Instruction *Inst = BI++;
 
     WeakVH BIHandle(BI);
-    if (recursivelySimplifyInstruction(Inst, TD)) {
+    if (recursivelySimplifyInstruction(Inst, TD, TLI)) {
       MadeChange = true;
       if (BIHandle != BI)
         BI = BB->begin();
@@ -515,11 +512,6 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
       DT->changeImmediateDominator(DestBB, PredBBIDom);
       DT->eraseNode(PredBB);
     }
-    ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();
-    if (PI) {
-      PI->replaceAllUses(PredBB, DestBB);
-      PI->removeEdge(ProfileInfo::getEdge(PredBB, DestBB));
-    }
   }
   // Nuke BB.
   PredBB->eraseFromParent();
@@ -533,7 +525,7 @@ static bool CanMergeValues(Value *First, Value *Second) {
 }
 
 /// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
-/// almost-empty BB ending in an unconditional branch to Succ, into succ.
+/// almost-empty BB ending in an unconditional branch to Succ, into Succ.
 ///
 /// Assumption: Succ is the single successor for BB.
 ///
@@ -1053,7 +1045,11 @@ bool llvm::LowerDbgDeclare(Function &F) {
   for (SmallVectorImpl<DbgDeclareInst *>::iterator I = Dbgs.begin(),
          E = Dbgs.end(); I != E; ++I) {
     DbgDeclareInst *DDI = *I;
-    if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) {
+    AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+    // If this is an alloca for a scalar variable, insert a dbg.value
+    // at each load and store to the alloca and erase the dbg.declare.
+    if (AI && !AI->isArrayAllocation()) {
+
       // We only remove the dbg.declare intrinsic if all uses are
       // converted to dbg.value intrinsics.
       bool RemoveDDI = true;
@@ -1121,33 +1117,153 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
   return true;
 }
 
-bool llvm::removeUnreachableBlocks(Function &F) {
-  SmallPtrSet<BasicBlock*, 16> Reachable;
+/// changeToUnreachable - Insert an unreachable instruction before the specified
+/// instruction, making it and the rest of the code in the block dead.
+static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
+  BasicBlock *BB = I->getParent();
+  // Loop over all of the successors, removing BB's entry from any PHI
+  // nodes.
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+    (*SI)->removePredecessor(BB);
+
+  // Insert a call to llvm.trap right before this.  This turns the undefined
+  // behavior into a hard fail instead of falling through into random code.
+  if (UseLLVMTrap) {
+    Function *TrapFn =
+      Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
+    CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
+    CallTrap->setDebugLoc(I->getDebugLoc());
+  }
+  new UnreachableInst(I->getContext(), I);
+
+  // All instructions after this are dead.
+  BasicBlock::iterator BBI = I, BBE = BB->end();
+  while (BBI != BBE) {
+    if (!BBI->use_empty())
+      BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+    BB->getInstList().erase(BBI++);
+  }
+}
+
+/// changeToCall - Convert the specified invoke into a normal call.
+static void changeToCall(InvokeInst *II) {
+  SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
+  CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
+  NewCall->takeName(II);
+  NewCall->setCallingConv(II->getCallingConv());
+  NewCall->setAttributes(II->getAttributes());
+  NewCall->setDebugLoc(II->getDebugLoc());
+  II->replaceAllUsesWith(NewCall);
+
+  // Follow the call by a branch to the normal destination.
+  BranchInst::Create(II->getNormalDest(), II);
+
+  // Update PHI nodes in the unwind destination
+  II->getUnwindDest()->removePredecessor(II->getParent());
+  II->eraseFromParent();
+}
+
+static bool markAliveBlocks(BasicBlock *BB,
+                            SmallPtrSet<BasicBlock*, 128> &Reachable) {
+
   SmallVector<BasicBlock*, 128> Worklist;
-  Worklist.push_back(&F.getEntryBlock());
-  Reachable.insert(&F.getEntryBlock());
+  Worklist.push_back(BB);
+  Reachable.insert(BB);
+  bool Changed = false;
   do {
-    BasicBlock *BB = Worklist.pop_back_val();
+    BB = Worklist.pop_back_val();
+
+    // Do a quick scan of the basic block, turning any obviously unreachable
+    // instructions into LLVM unreachable insts.  The instruction combining pass
+    // canonicalizes unreachable insts into stores to null or undef.
+    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
+      if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
+        if (CI->doesNotReturn()) {
+          // If we found a call to a no-return function, insert an unreachable
+          // instruction after it.  Make sure there isn't *already* one there
+          // though.
+          ++BBI;
+          if (!isa<UnreachableInst>(BBI)) {
+            // Don't insert a call to llvm.trap right before the unreachable.
+            changeToUnreachable(BBI, false);
+            Changed = true;
+          }
+          break;
+        }
+      }
+
+      // Store to undef and store to null are undefined and used to signal that
+      // they should be changed to unreachable by passes that can't modify the
+      // CFG.
+      if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+        // Don't touch volatile stores.
+        if (SI->isVolatile()) continue;
+
+        Value *Ptr = SI->getOperand(1);
+
+        if (isa<UndefValue>(Ptr) ||
+            (isa<ConstantPointerNull>(Ptr) &&
+             SI->getPointerAddressSpace() == 0)) {
+          changeToUnreachable(SI, true);
+          Changed = true;
+          break;
+        }
+      }
+    }
+
+    // Turn invokes that call 'nounwind' functions into ordinary calls.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      Value *Callee = II->getCalledValue();
+      if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+        changeToUnreachable(II, true);
+        Changed = true;
+      } else if (II->doesNotThrow()) {
+        if (II->use_empty() && II->onlyReadsMemory()) {
+          // jump to the normal destination branch.
+          BranchInst::Create(II->getNormalDest(), II);
+          II->getUnwindDest()->removePredecessor(II->getParent());
+          II->eraseFromParent();
+        } else
+          changeToCall(II);
+        Changed = true;
+      }
+    }
+
+    Changed |= ConstantFoldTerminator(BB, true);
     for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
       if (Reachable.insert(*SI))
         Worklist.push_back(*SI);
   } while (!Worklist.empty());
+  return Changed;
+}
 
+/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// if they are in a dead cycle.  Return true if a change was made, false
+/// otherwise.
+bool llvm::removeUnreachableBlocks(Function &F) {
+  SmallPtrSet<BasicBlock*, 128> Reachable;
+  bool Changed = markAliveBlocks(F.begin(), Reachable);
+
+  // If there are unreachable blocks in the CFG...
   if (Reachable.size() == F.size())
-    return false;
+    return Changed;
 
   assert(Reachable.size() < F.size());
-  for (Function::iterator I = llvm::next(F.begin()), E = F.end(); I != E; ++I) {
-    if (Reachable.count(I))
+  NumRemoved += F.size()-Reachable.size();
+
+  // Loop over all of the basic blocks that are not reachable, dropping all of
+  // their internal references...
+  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
+    if (Reachable.count(BB))
       continue;
 
-    for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI)
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
       if (Reachable.count(*SI))
-        (*SI)->removePredecessor(I);
-    I->dropAllReferences();
+        (*SI)->removePredecessor(BB);
+    BB->dropAllReferences();
   }
 
-  for (Function::iterator I = llvm::next(F.begin()), E=F.end(); I != E;)
+  for (Function::iterator I = ++F.begin(); I != F.end();)
     if (!Reachable.count(I))
       I = F.getBasicBlockList().erase(I);
     else
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index cb581b3..162807d 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -90,7 +90,8 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
   // Move all definitions in the successor to the predecessor...
   OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
 
-  std::string OldName = BB->getName();
+  // OldName will be valid until erased.
+  StringRef OldName = BB->getName();
 
   // Erase basic block from the function...
 
@@ -102,12 +103,13 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
     }
   }
   LI->removeBlock(BB);
-  BB->eraseFromParent();
 
   // Inherit predecessor's name if it exists...
   if (!OldName.empty() && !OnlyPred->hasName())
     OnlyPred->setName(OldName);
 
+  BB->eraseFromParent();
+
   return OnlyPred;
 }
 
@@ -239,8 +241,6 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
     DEBUG(dbgs() << "!\n");
   }
 
-  std::vector<BasicBlock*> LoopBlocks = L->getBlocks();
-
   bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
   BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
 
diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
index 4aee8ff..e017f50 100644
--- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
@@ -29,7 +29,7 @@
 
 using namespace llvm;
 
-STATISTIC(IfHandled, "Number of 'expect' intrinsic intructions handled");
+STATISTIC(IfHandled, "Number of 'expect' intrinsic instructions handled");
 
 static cl::opt<uint32_t>
 LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64),
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
index f66b54d..9799a30 100644
--- a/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -346,7 +346,6 @@ splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) {
       // Scan all of the uses and see if the live range is live across an unwind
       // edge.  If we find a use live across an invoke edge, create an alloca
       // and spill the value.
-      std::set<InvokeInst*> InvokesWithStoreInserted;
 
       // Find all of the blocks that this value is live in.
       std::set<BasicBlock*> LiveBBs;
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 955b853..2d2a8a5 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -66,6 +66,18 @@ namespace {
                              BasicBlock* OrigBlock, BasicBlock* Default);
     unsigned Clusterify(CaseVector& Cases, SwitchInst *SI);
   };
+
+  /// The comparison function for sorting the switch case values in the vector.
+  /// WARNING: Case ranges should be disjoint!
+  struct CaseCmp {
+    bool operator () (const LowerSwitch::CaseRange& C1,
+                      const LowerSwitch::CaseRange& C2) {
+
+      const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+      const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+      return CI1->getValue().slt(CI2->getValue());
+    }
+  };
 }
 
 char LowerSwitch::ID = 0;
@@ -147,7 +159,7 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
   Function::iterator FI = OrigBlock;
   F->getBasicBlockList().insert(++FI, NewNode);
 
-  ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_ULT,
+  ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
                                 Val, Pivot.Low, "Pivot");
   NewNode->getInstList().push_back(Comp);
   BranchInst::Create(LBranch, RBranch, Comp, NewNode);
@@ -222,34 +234,40 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
 
 // Clusterify - Transform simple list of Cases into list of CaseRange's
 unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
-
-  IntegersSubsetToBB TheClusterifier;
+  unsigned numCmps = 0;
 
   // Start with "simple" cases
-  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
-       i != e; ++i) {
-    BasicBlock *SuccBB = i.getCaseSuccessor();
-    IntegersSubset CaseRanges = i.getCaseValueEx();
-    TheClusterifier.add(CaseRanges, SuccBB);
-  }
-  
-  TheClusterifier.optimize();
+  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
+    Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(),
+                              i.getCaseSuccessor()));
   
-  size_t numCmps = 0;
-  for (IntegersSubsetToBB::RangeIterator i = TheClusterifier.begin(),
-       e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
-    IntegersSubsetToBB::Cluster &C = *i;
-    
-    // FIXME: Currently work with ConstantInt based numbers.
-    // Changing it to APInt based is a pretty heavy for this commit.
-    Cases.push_back(CaseRange(C.first.getLow().toConstantInt(),
-                              C.first.getHigh().toConstantInt(), C.second));
-    if (C.first.isSingleNumber())
+  std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge case into clusters
+  if (Cases.size()>=2)
+    for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) {
+      int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
+      int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
+      BasicBlock* nextBB = J->BB;
+      BasicBlock* currentBB = I->BB;
+
+      // If the two neighboring cases go to the same destination, merge them
+      // into a single case.
+      if ((nextValue-currentValue==1) && (currentBB == nextBB)) {
+        I->High = J->High;
+        J = Cases.erase(J);
+      } else {
+        I = J++;
+      }
+    }
+
+  for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+    if (I->Low != I->High)
       // A range counts double, since it requires two compares.
       ++numCmps;
   }
 
-  return numCmps;  
+  return numCmps;
 }
 
 // processSwitchInst - Replace the specified switch instruction with a sequence
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
index ebd7db6..61b3965 100644
--- a/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -16,7 +16,6 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
-#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
@@ -28,7 +27,6 @@ STATISTIC(NumPromoted, "Number of alloca's promoted");
 namespace {
   struct PromotePass : public FunctionPass {
     static char ID; // Pass identification, replacement for typeid
-
     PromotePass() : FunctionPass(ID) {
       initializePromotePassPass(*PassRegistry::getPassRegistry());
     }
@@ -64,7 +62,6 @@ bool PromotePass::runOnFunction(Function &F) {
   bool Changed  = false;
 
   DominatorTree &DT = getAnalysis<DominatorTree>();
-  const DataLayout *DL = getAnalysisIfAvailable<DataLayout>();
 
   while (1) {
     Allocas.clear();
@@ -73,12 +70,12 @@ bool PromotePass::runOnFunction(Function &F) {
     // the entry node
     for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
       if (AllocaInst *AI = dyn_cast<AllocaInst>(I))       // Is it an alloca?
-        if (isAllocaPromotable(AI, DL))
+        if (isAllocaPromotable(AI))
           Allocas.push_back(AI);
 
     if (Allocas.empty()) break;
 
-    PromoteMemToReg(Allocas, DT, DL);
+    PromoteMemToReg(Allocas, DT);
     NumPromoted += Allocas.size();
     Changed = true;
   }
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 6910180..8f6eee3 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -30,7 +30,6 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -46,7 +45,6 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
-#include "llvm/InstVisitor.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
@@ -58,16 +56,56 @@ STATISTIC(NumSingleStore,   "Number of alloca's promoted with a single store");
 STATISTIC(NumDeadAlloca,    "Number of dead alloca's removed");
 STATISTIC(NumPHIInsert,     "Number of PHI nodes inserted");
 
-namespace {
+bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+  // FIXME: If the memory unit is of pointer or integer type, we can permit
+  // assignments to subsections of the memory unit.
+
+  // Only allow direct and non-volatile loads and stores...
+  for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
+       UI != UE; ++UI) { // Loop over all of the uses of the alloca
+    const User *U = *UI;
+    if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      // Note that atomic loads can be transformed; atomic semantics do
+      // not have any meaning for a local alloca.
+      if (LI->isVolatile())
+        return false;
+    } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      if (SI->getOperand(0) == AI)
+        return false; // Don't allow a store OF the AI, only INTO the AI.
+      // Note that atomic stores can be transformed; atomic semantics do
+      // not have any meaning for a local alloca.
+      if (SI->isVolatile())
+        return false;
+    } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+          II->getIntrinsicID() != Intrinsic::lifetime_end)
+        return false;
+    } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      if (BCI->getType() != Type::getInt8PtrTy(U->getContext()))
+        return false;
+      if (!onlyUsedByLifetimeMarkers(BCI))
+        return false;
+    } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+      if (GEPI->getType() != Type::getInt8PtrTy(U->getContext()))
+        return false;
+      if (!GEPI->hasAllZeroIndices())
+        return false;
+      if (!onlyUsedByLifetimeMarkers(GEPI))
+        return false;
+    } else {
+      return false;
+    }
+  }
 
-struct AllocaInfo : private InstVisitor<AllocaInfo, bool> {
-  const DataLayout *DL;
+  return true;
+}
+
+namespace {
 
+struct AllocaInfo {
   SmallVector<BasicBlock *, 32> DefiningBlocks;
   SmallVector<BasicBlock *, 32> UsingBlocks;
-  SmallVector<Instruction *, 8> DeadInsts;
 
-  Type *AllocaTy;
   StoreInst *OnlyStore;
   BasicBlock *OnlyBlock;
   bool OnlyUsedInOneBlock;
@@ -75,13 +113,9 @@ struct AllocaInfo : private InstVisitor<AllocaInfo, bool> {
   Value *AllocaPointerVal;
   DbgDeclareInst *DbgDeclare;
 
-  AllocaInfo(const DataLayout *DL) : DL(DL) {}
-
   void clear() {
     DefiningBlocks.clear();
     UsingBlocks.clear();
-    DeadInsts.clear();
-    AllocaTy = 0;
     OnlyStore = 0;
     OnlyBlock = 0;
     OnlyUsedInOneBlock = true;
@@ -91,116 +125,39 @@ struct AllocaInfo : private InstVisitor<AllocaInfo, bool> {
 
   /// Scan the uses of the specified alloca, filling in the AllocaInfo used
   /// by the rest of the pass to reason about the uses of this alloca.
-  bool analyzeAlloca(AllocaInst &AI) {
+  void AnalyzeAlloca(AllocaInst *AI) {
     clear();
 
-    AllocaTy = AI.getAllocatedType();
-    enqueueUsers(AI);
-
-    // Walk queued up uses in the worklist to handle nested uses.
-    while (!UseWorklist.empty()) {
-      U = UseWorklist.pop_back_val();
-      Instruction &I = *cast<Instruction>(U->getUser());
-      if (!visit(I))
-        return false; // Propagate failure to promote up.
+    // As we scan the uses of the alloca instruction, keep track of stores,
+    // and decide whether all of the loads and stores to the alloca are within
+    // the same basic block.
+    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+         UI != E;) {
+      Instruction *User = cast<Instruction>(*UI++);
+
+      if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+        // Remember the basic blocks which define new values for the alloca
+        DefiningBlocks.push_back(SI->getParent());
+        AllocaPointerVal = SI->getOperand(0);
+        OnlyStore = SI;
+      } else {
+        LoadInst *LI = cast<LoadInst>(User);
+        // Otherwise it must be a load instruction, keep track of variable
+        // reads.
+        UsingBlocks.push_back(LI->getParent());
+        AllocaPointerVal = LI;
+      }
 
       if (OnlyUsedInOneBlock) {
         if (OnlyBlock == 0)
-          OnlyBlock = I.getParent();
-        else if (OnlyBlock != I.getParent())
+          OnlyBlock = User->getParent();
+        else if (OnlyBlock != User->getParent())
           OnlyUsedInOneBlock = false;
       }
     }
 
-    DbgDeclare = FindAllocaDbgDeclare(&AI);
-    return true;
-  }
-
-private:
-  // Befriend the base class so it can call through private visitor methods.
-  friend class InstVisitor<AllocaInfo, bool>;
-
-  /// \brief A use pointer that is non-null when visiting uses.
-  Use *U;
-
-  /// \brief A worklist for recursively visiting all uses of an alloca.
-  SmallVector<Use *, 8> UseWorklist;
-
-  /// \brief A set for preventing cyclic visitation.
-  SmallPtrSet<Use *, 8> VisitedUses;
-
-  void enqueueUsers(Instruction &I) {
-    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
-         ++UI)
-      if (VisitedUses.insert(&UI.getUse()))
-        UseWorklist.push_back(&UI.getUse());
+    DbgDeclare = FindAllocaDbgDeclare(AI);
   }
-
-  bool visitLoadInst(LoadInst &LI) {
-    if (LI.isVolatile() || LI.getType() != AllocaTy)
-      return false;
-
-    // Keep track of variable reads.
-    UsingBlocks.push_back(LI.getParent());
-    AllocaPointerVal = &LI;
-    return true;
-  }
-
-  bool visitStoreInst(StoreInst &SI) {
-    if (SI.isVolatile() || SI.getValueOperand() == U->get() ||
-        SI.getValueOperand()->getType() != AllocaTy)
-      return false;
-
-    // Remember the basic blocks which define new values for the alloca
-    DefiningBlocks.push_back(SI.getParent());
-    AllocaPointerVal = SI.getOperand(0);
-    OnlyStore = &SI;
-    return true;
-  }
-
-  bool visitBitCastInst(BitCastInst &BC) {
-    if (BC.use_empty())
-      DeadInsts.push_back(&BC);
-    else
-      enqueueUsers(BC);
-    return true;
-  }
-
-  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
-    if (GEPI.use_empty()) {
-      DeadInsts.push_back(&GEPI);
-      return true;
-    }
-
-    enqueueUsers(GEPI);
-
-    return GEPI.hasAllZeroIndices();
-  }
-
-  // We can promote through debug info intrinsics as they don't alter the
-  // value stored in memory.
-  bool visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) {
-    DeadInsts.push_back(&I);
-    return true;
-  }
-
-  bool visitIntrinsicInst(IntrinsicInst &II) {
-    switch (II.getIntrinsicID()) {
-    default:
-      return false;
-
-      // Lifetime intrinsics don't preclude promoting the memory to a register.
-      // FIXME: We should use these to promote to undef when outside of a valid
-      // lifetime.
-    case Intrinsic::lifetime_start:
-    case Intrinsic::lifetime_end:
-      DeadInsts.push_back(&II);
-      return true;
-    }
-  }
-
-  // The fallback is that the alloca cannot be promoted.
-  bool visitInstruction(Instruction &I) { return false; }
 };
 
 // Data package used by RenamePass()
@@ -278,7 +235,6 @@ struct PromoteMem2Reg {
   std::vector<AllocaInst *> Allocas;
   DominatorTree &DT;
   DIBuilder DIB;
-  const DataLayout *DL;
 
   /// An AliasSetTracker object to update.  If null, don't update it.
   AliasSetTracker *AST;
@@ -324,9 +280,9 @@ struct PromoteMem2Reg {
 
 public:
   PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                 const DataLayout *DL, AliasSetTracker *AST)
+                 AliasSetTracker *AST)
       : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
-        DIB(*DT.getRoot()->getParent()->getParent()), DL(DL), AST(AST) {}
+        DIB(*DT.getRoot()->getParent()->getParent()), AST(AST) {}
 
   void run();
 
@@ -357,39 +313,27 @@ private:
 
 } // end of anonymous namespace
 
-/// \brief Walk a small vector of dead instructions and recursively remove them
-/// and subsequently dead instructions.
-///
-/// This is only valid to call on dead instructions using an alloca which is
-/// promotable, as we leverage that assumption to delete them faster.
-static void removeDeadInstructions(AllocaInst *AI,
-                                   SmallVectorImpl<Instruction *> &DeadInsts) {
-  while (!DeadInsts.empty()) {
-    Instruction *I = DeadInsts.pop_back_val();
-
-    // Don't delete the alloca itself.
-    if (I == AI)
-      continue;
+static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+  // Knowing that this alloca is promotable, we know that it's safe to kill all
+  // instructions except for load and store.
 
-    // Note that we open code the deletion algorithm here because we know
-    // apriori that all of the instructions using an alloca that reaches here
-    // are trivially dead when their use list becomes empty (The only risk are
-    // lifetime markers which we specifically want to nuke). By coding it here
-    // we can skip the triviality test and be more efficient.
-    //
-    // Null out all of the instruction's operands to see if any operand becomes
-    // dead as we go.
-    for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE;
-         ++OI) {
-      Instruction *Op = dyn_cast<Instruction>(*OI);
-      if (!Op)
-        continue;
-
-      OI->set(0);
-      if (!Op->use_empty())
-        continue;
+  for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
+       UI != UE;) {
+    Instruction *I = cast<Instruction>(*UI);
+    ++UI;
+    if (isa<LoadInst>(I) || isa<StoreInst>(I))
+      continue;
 
-      DeadInsts.push_back(Op);
+    if (!I->getType()->isVoidTy()) {
+      // The only users of this bitcast/GEP instruction are lifetime intrinsics.
+      // Follow the use/def chain to erase them now instead of leaving it for
+      // dead code elimination later.
+      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+           UI != UE;) {
+        Instruction *Inst = cast<Instruction>(*UI);
+        ++UI;
+        Inst->eraseFromParent();
+      }
     }
     I->eraseFromParent();
   }
@@ -474,6 +418,7 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
     DIBuilder DIB(*AI->getParent()->getParent()->getParent());
     ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB);
     DDI->eraseFromParent();
+    LBI.deleteValue(DDI);
   }
   // Remove the (now dead) store and alloca.
   Info.OnlyStore->eraseFromParent();
@@ -486,16 +431,6 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
   return true;
 }
 
-namespace {
-/// This is a helper predicate used to search by the first element of a pair.
-struct StoreIndexSearchPredicate {
-  bool operator()(const std::pair<unsigned, StoreInst *> &LHS,
-                  const std::pair<unsigned, StoreInst *> &RHS) {
-    return LHS.first < RHS.first;
-  }
-};
-}
-
 /// Many allocas are only used within a single basic block.  If this is the
 /// case, avoid traversing the CFG and inserting a lot of potentially useless
 /// PHI nodes by just performing a single linear pass over the basic block
@@ -528,8 +463,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
 
   // Sort the stores by their index, making it efficient to do a lookup with a
   // binary search.
-  std::sort(StoresByIndex.begin(), StoresByIndex.end(),
-            StoreIndexSearchPredicate());
+  std::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first());
 
   // Walk all of the loads from this alloca, replacing them with the nearest
   // store above them, if any.
@@ -544,7 +478,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
     StoresByIndexTy::iterator I =
         std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(),
                          std::make_pair(LoadIdx, static_cast<StoreInst *>(0)),
-                         StoreIndexSearchPredicate());
+                         less_first());
 
     if (I == StoresByIndex.begin())
       // If there is no store before this load, the load takes the undef value.
@@ -577,8 +511,10 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
   LBI.deleteValue(AI);
 
   // The alloca's debuginfo can be removed as well.
-  if (DbgDeclareInst *DDI = Info.DbgDeclare)
+  if (DbgDeclareInst *DDI = Info.DbgDeclare) {
     DDI->eraseFromParent();
+    LBI.deleteValue(DDI);
+  }
 
   ++NumLocalPromoted;
 }
@@ -590,23 +526,17 @@ void PromoteMem2Reg::run() {
     PointerAllocaValues.resize(Allocas.size());
   AllocaDbgDeclares.resize(Allocas.size());
 
-  AllocaInfo Info(DL);
+  AllocaInfo Info;
   LargeBlockInfo LBI;
 
   for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
     AllocaInst *AI = Allocas[AllocaNum];
 
+    assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
     assert(AI->getParent()->getParent() == &F &&
            "All allocas should be in the same function, which is same as DF!");
 
-    // Calculate the set of read and write-locations for each alloca.  This is
-    // analogous to finding the 'uses' and 'definitions' of each variable.
-    bool Good = Info.analyzeAlloca(*AI);
-    (void)Good;
-    assert(Good && "Cannot promote non-promotable alloca!");
-
-    // Nuke all of the dead instructions.
-    removeDeadInstructions(AI, Info.DeadInsts);
+    removeLifetimeIntrinsicUsers(AI);
 
     if (AI->use_empty()) {
       // If there are no uses of the alloca, just delete it now.
@@ -620,6 +550,10 @@ void PromoteMem2Reg::run() {
       continue;
     }
 
+    // Calculate the set of read and write-locations for each alloca.  This is
+    // analogous to finding the 'uses' and 'definitions' of each variable.
+    Info.AnalyzeAlloca(AI);
+
     // If there is only a single store to this value, replace any loads of
     // it that are directly dominated by the definition with the value stored.
     if (Info.DefiningBlocks.size() == 1) {
@@ -904,16 +838,6 @@ void PromoteMem2Reg::ComputeLiveInBlocks(
   }
 }
 
-namespace {
-typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair;
-
-struct DomTreeNodeCompare {
-  bool operator()(const DomTreeNodePair &LHS, const DomTreeNodePair &RHS) {
-    return LHS.second < RHS.second;
-  }
-};
-} // end anonymous namespace
-
 /// At this point, we're committed to promoting the alloca using IDF's, and the
 /// standard SSA construction algorithm.  Determine which blocks need phi nodes
 /// and see if we can optimize out some work by avoiding insertion of dead phi
@@ -931,9 +855,9 @@ void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
 
   // Use a priority queue keyed on dominator tree level so that inserted nodes
   // are handled from the bottom of the dominator tree upwards.
-  typedef std::priority_queue<DomTreeNodePair,
-                              SmallVector<DomTreeNodePair, 32>,
-                              DomTreeNodeCompare> IDFPriorityQueue;
+  typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair;
+  typedef std::priority_queue<DomTreeNodePair, SmallVector<DomTreeNodePair, 32>,
+                              less_second> IDFPriorityQueue;
   IDFPriorityQueue PQ;
 
   for (SmallPtrSet<BasicBlock *, 32>::const_iterator I = DefBlocks.begin(),
@@ -1145,19 +1069,11 @@ NextIteration:
   goto NextIteration;
 }
 
-bool llvm::isAllocaPromotable(const AllocaInst *AI, const DataLayout *DL) {
-  // We cast away constness because we re-use the non-const analysis that the
-  // actual promotion routine uses. While it is non-const, it doesn't actually
-  // mutate anything at this phase, and we discard the non-const results that
-  // promotion uses to mutate the alloca.
-  return AllocaInfo(DL).analyzeAlloca(*const_cast<AllocaInst *>(AI));
-}
-
 void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                           const DataLayout *DL, AliasSetTracker *AST) {
+                           AliasSetTracker *AST) {
   // If there is nothing to do, bail out...
   if (Allocas.empty())
     return;
 
-  PromoteMem2Reg(Allocas, DT, DL, AST).run();
+  PromoteMem2Reg(Allocas, DT, AST).run();
 }
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index fc85ef3..30adbfa 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -63,7 +63,7 @@ void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
 }
 
 static bool IsEquivalentPHI(PHINode *PHI,
-                            DenseMap<BasicBlock*, Value*> &ValueMapping) {
+                          SmallDenseMap<BasicBlock*, Value*, 8> &ValueMapping) {
   unsigned PHINumValues = PHI->getNumIncomingValues();
   if (PHINumValues != ValueMapping.size())
     return false;
@@ -136,8 +136,8 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
   // Otherwise, we do need a PHI: check to see if we already have one available
   // in this block that produces the right value.
   if (isa<PHINode>(BB->begin())) {
-    DenseMap<BasicBlock*, Value*> ValueMapping(PredValues.begin(),
-                                               PredValues.end());
+    SmallDenseMap<BasicBlock*, Value*, 8> ValueMapping(PredValues.begin(),
+                                                       PredValues.end());
     PHINode *SomePHI;
     for (BasicBlock::iterator It = BB->begin();
          (SomePHI = dyn_cast<PHINode>(It)); ++It) {
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index c4c1423..ff50b12 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -475,9 +476,13 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
           CV = ICI->getOperand(0);
 
   // Unwrap any lossless ptrtoint cast.
-  if (TD && CV && CV->getType() == TD->getIntPtrType(CV->getContext()))
-    if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV))
-      CV = PTII->getOperand(0);
+  if (TD && CV) {
+    if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) {
+      Value *Ptr = PTII->getPointerOperand();
+      if (PTII->getType() == TD->getIntPtrType(Ptr->getType()))
+        CV = Ptr;
+    }
+  }
   return CV;
 }
 
@@ -699,9 +704,10 @@ namespace {
   };
 }
 
-static int ConstantIntSortPredicate(const void *P1, const void *P2) {
-  const ConstantInt *LHS = *(const ConstantInt*const*)P1;
-  const ConstantInt *RHS = *(const ConstantInt*const*)P2;
+static int ConstantIntSortPredicate(ConstantInt *const *P1,
+                                    ConstantInt *const *P2) {
+  const ConstantInt *LHS = *P1;
+  const ConstantInt *RHS = *P2;
   if (LHS->getValue().ult(RHS->getValue()))
     return 1;
   if (LHS->getValue() == RHS->getValue())
@@ -924,7 +930,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
       // Convert pointer to int before we switch.
       if (CV->getType()->isPointerTy()) {
         assert(TD && "Cannot switch on pointer without DataLayout");
-        CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()),
+        CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getType()),
                                     "magicptr");
       }
 
@@ -1556,6 +1562,19 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
   return true;
 }
 
+/// \returns True if this block contains a CallInst with the NoDuplicate
+/// attribute.
+static bool HasNoDuplicateCall(const BasicBlock *BB) {
+  for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    const CallInst *CI = dyn_cast<CallInst>(I);
+    if (!CI)
+      continue;
+    if (CI->cannotDuplicate())
+      return true;
+  }
+  return false;
+}
+
 /// BlockIsSimpleEnoughToThreadThrough - Return true if we can thread a branch
 /// across this block.
 static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
@@ -1603,6 +1622,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) {
   // Now we know that this block has multiple preds and two succs.
   if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false;
 
+  if (HasNoDuplicateCall(BB)) return false;
+
   // Okay, this is a simple enough basic block.  See if any phi values are
   // constants.
   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
@@ -2069,14 +2090,19 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
     // Ensure that any values used in the bonus instruction are also used
     // by the terminator of the predecessor.  This means that those values
     // must already have been resolved, so we won't be inhibiting the
-    // out-of-order core by speculating them earlier.
-    if (BonusInst) {
+    // out-of-order core by speculating them earlier. We also allow
+    // instructions that are used by the terminator's condition because it
+    // exposes more merging opportunities.
+    bool UsedByBranch = (BonusInst && BonusInst->hasOneUse() &&
+                         *BonusInst->use_begin() == Cond);
+
+    if (BonusInst && !UsedByBranch) {
       // Collect the values used by the bonus inst
       SmallPtrSet<Value*, 4> UsedValues;
       for (Instruction::op_iterator OI = BonusInst->op_begin(),
            OE = BonusInst->op_end(); OI != OE; ++OI) {
         Value *V = *OI;
-        if (!isa<Constant>(V))
+        if (!isa<Constant>(V) && !isa<Argument>(V))
           UsedValues.insert(V);
       }
 
@@ -2787,7 +2813,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD,
   if (CompVal->getType()->isPointerTy()) {
     assert(TD && "Cannot switch on pointer without DataLayout");
     CompVal = Builder.CreatePtrToInt(CompVal,
-                                     TD->getIntPtrType(CompVal->getContext()),
+                                     TD->getIntPtrType(CompVal->getType()),
                                      "magicptr");
   }
 
@@ -3160,7 +3186,7 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
 /// and use it to remove dead cases.
 static bool EliminateDeadSwitchCases(SwitchInst *SI) {
   Value *Cond = SI->getCondition();
-  unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth();
+  unsigned Bits = Cond->getType()->getIntegerBitWidth();
   APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
   ComputeMaskedBits(Cond, KnownZero, KnownOne);
 
@@ -3303,28 +3329,10 @@ static Constant *LookupConstant(Value *V,
 /// simple instructions such as binary operations where both operands are
 /// constant or can be replaced by constants from the ConstantPool. Returns the
 /// resulting constant on success, 0 otherwise.
-static Constant *ConstantFold(Instruction *I,
-                         const SmallDenseMap<Value*, Constant*>& ConstantPool) {
-  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
-    Constant *A = LookupConstant(BO->getOperand(0), ConstantPool);
-    if (!A)
-      return 0;
-    Constant *B = LookupConstant(BO->getOperand(1), ConstantPool);
-    if (!B)
-      return 0;
-    return ConstantExpr::get(BO->getOpcode(), A, B);
-  }
-
-  if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
-    Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
-    if (!A)
-      return 0;
-    Constant *B = LookupConstant(I->getOperand(1), ConstantPool);
-    if (!B)
-      return 0;
-    return ConstantExpr::getCompare(Cmp->getPredicate(), A, B);
-  }
-
+static Constant *
+ConstantFold(Instruction *I,
+             const SmallDenseMap<Value *, Constant *> &ConstantPool,
+             const DataLayout *DL) {
   if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
     Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
     if (!A)
@@ -3336,14 +3344,19 @@ static Constant *ConstantFold(Instruction *I,
     return 0;
   }
 
-  if (CastInst *Cast = dyn_cast<CastInst>(I)) {
-    Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
-    if (!A)
+  SmallVector<Constant *, 4> COps;
+  for (unsigned N = 0, E = I->getNumOperands(); N != E; ++N) {
+    if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool))
+      COps.push_back(A);
+    else
       return 0;
-    return ConstantExpr::getCast(Cast->getOpcode(), A, Cast->getDestTy());
   }
 
-  return 0;
+  if (CmpInst *Cmp = dyn_cast<CmpInst>(I))
+    return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
+                                           COps[1], DL);
+
+  return ConstantFoldInstOperands(I->getOpcode(), I->getType(), COps, DL);
 }
 
 /// GetCaseResults - Try to determine the resulting constant values in phi nodes
@@ -3355,7 +3368,8 @@ GetCaseResults(SwitchInst *SI,
                ConstantInt *CaseVal,
                BasicBlock *CaseDest,
                BasicBlock **CommonDest,
-               SmallVectorImpl<std::pair<PHINode*,Constant*> > &Res) {
+               SmallVectorImpl<std::pair<PHINode *, Constant *> > &Res,
+               const DataLayout *DL) {
   // The block from which we enter the common destination.
   BasicBlock *Pred = SI->getParent();
 
@@ -3374,7 +3388,7 @@ GetCaseResults(SwitchInst *SI,
     } else if (isa<DbgInfoIntrinsic>(I)) {
       // Skip debug intrinsic.
       continue;
-    } else if (Constant *C = ConstantFold(I, ConstantPool)) {
+    } else if (Constant *C = ConstantFold(I, ConstantPool, DL)) {
       // Instruction is side-effect free and constant.
       ConstantPool.insert(std::make_pair(I, C));
     } else {
@@ -3698,7 +3712,7 @@ static bool SwitchToLookupTable(SwitchInst *SI,
     typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy;
     ResultsTy Results;
     if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
-                        Results))
+                        Results, TD))
       return false;
 
     // Append the result from this case to the list for each phi.
@@ -3712,7 +3726,7 @@ static bool SwitchToLookupTable(SwitchInst *SI,
   // Get the resulting values for the default case.
   SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
   if (!GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest,
-                      DefaultResultsList))
+                      DefaultResultsList, TD))
     return false;
   for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) {
     PHINode *PHI = DefaultResultsList[I].first;
@@ -3733,14 +3747,32 @@ static bool SwitchToLookupTable(SwitchInst *SI,
                                             CommonDest->getParent(),
                                             CommonDest);
 
-  // Check whether the condition value is within the case range, and branch to
-  // the new BB.
+  // Compute the table index value.
   Builder.SetInsertPoint(SI);
   Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
                                         "switch.tableidx");
-  Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
-      MinCaseVal->getType(), TableSize));
-  Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+
+  // Compute the maximum table size representable by the integer type we are
+  // switching upon.
+  unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
+  uint64_t MaxTableSize = CaseSize > 63? UINT64_MAX : 1ULL << CaseSize;
+  assert(MaxTableSize >= TableSize &&
+         "It is impossible for a switch to have more entries than the max "
+         "representable value of its input integer type's size.");
+
+  // If we have a fully covered lookup table, unconditionally branch to the
+  // lookup table BB. Otherwise, check if the condition value is within the case
+  // range. If it is so, branch to the new BB. Otherwise branch to SI's default
+  // destination.
+  const bool GeneratingCoveredLookupTable = MaxTableSize == TableSize;
+  if (GeneratingCoveredLookupTable) {
+    Builder.CreateBr(LookupBB);
+    SI->getDefaultDest()->removePredecessor(SI->getParent());
+  } else {
+    Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
+                                         MinCaseVal->getType(), TableSize));
+    Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+  }
 
   // Populate the BB that does the lookups.
   Builder.SetInsertPoint(LookupBB);
@@ -3769,9 +3801,11 @@ static bool SwitchToLookupTable(SwitchInst *SI,
     Builder.CreateBr(CommonDest);
 
   // Remove the switch.
-  for (unsigned i = 0; i < SI->getNumSuccessors(); ++i) {
+  for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
     BasicBlock *Succ = SI->getSuccessor(i);
-    if (Succ == SI->getDefaultDest()) continue;
+
+    if (Succ == SI->getDefaultDest())
+      continue;
     Succ->removePredecessor(SI->getParent());
   }
   SI->eraseFromParent();
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 094c201..15b3e66 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
@@ -26,11 +27,16 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 
 using namespace llvm;
 
+static cl::opt<bool>
+ColdErrorCalls("error-reporting-is-cold",  cl::init(true),
+  cl::Hidden, cl::desc("Treat error-reporting calls as cold"));
+
 /// This class is the abstract base class for the set of optimizations that
 /// corresponds to one library call.
 namespace {
@@ -118,6 +124,21 @@ static bool callHasFloatingPointArgument(const CallInst *CI) {
   return false;
 }
 
+/// \brief Check whether the overloaded unary floating point function
+/// corresponing to \a Ty is available.
+static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+                            LibFunc::Func DoubleFn, LibFunc::Func FloatFn,
+                            LibFunc::Func LongDoubleFn) {
+  switch (Ty->getTypeID()) {
+  case Type::FloatTyID:
+    return TLI->has(FloatFn);
+  case Type::DoubleTyID:
+    return TLI->has(DoubleFn);
+  default:
+    return TLI->has(LongDoubleFn);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Fortified Library Call Optimizations
 //===----------------------------------------------------------------------===//
@@ -477,7 +498,7 @@ struct StrChrOpt : public LibCallOptimization {
 
     // Compute the offset, make sure to handle the case when we're searching for
     // zero (a weird way to spell strlen).
-    size_t I = CharC->getSExtValue() == 0 ?
+    size_t I = (0xFF & CharC->getSExtValue()) == 0 ?
         Str.size() : Str.find(CharC->getSExtValue());
     if (I == StringRef::npos) // Didn't find the char.  strchr returns null.
       return Constant::getNullValue(CI->getType());
@@ -513,7 +534,7 @@ struct StrRChrOpt : public LibCallOptimization {
     }
 
     // Compute the offset.
-    size_t I = CharC->getSExtValue() == 0 ?
+    size_t I = (0xFF & CharC->getSExtValue()) == 0 ?
         Str.size() : Str.rfind(CharC->getSExtValue());
     if (I == StringRef::npos) // Didn't find the char. Return null.
       return Constant::getNullValue(CI->getType());
@@ -774,7 +795,7 @@ struct StrPBrkOpt : public LibCallOptimization {
     // Constant folding.
     if (HasS1 && HasS2) {
       size_t I = S1.find_first_of(S2);
-      if (I == std::string::npos) // No match.
+      if (I == StringRef::npos) // No match.
         return Constant::getNullValue(CI->getType());
 
       return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk");
@@ -912,7 +933,7 @@ struct StrStrOpt : public LibCallOptimization {
 
     // If both strings are known, constant fold it.
     if (HasStr1 && HasStr2) {
-      std::string::size_type Offset = SearchStr.find(ToFindStr);
+      size_t Offset = SearchStr.find(ToFindStr);
 
       if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
         return Constant::getNullValue(CI->getType());
@@ -1031,7 +1052,7 @@ struct MemSetOpt : public LibCallOptimization {
     if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
         !FT->getParamType(0)->isPointerTy() ||
         !FT->getParamType(1)->isIntegerTy() ||
-        FT->getParamType(2) != TD->getIntPtrType(*Context))
+        FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0)))
       return 0;
 
     // memset(p, v, n) -> llvm.memset(p, v, n, 1)
@@ -1133,9 +1154,13 @@ struct PowOpt : public UnsafeFPLibCallOptimization {
 
     Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
     if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
-      if (Op1C->isExactlyValue(1.0))  // pow(1.0, x) -> 1.0
+      // pow(1.0, x) -> 1.0
+      if (Op1C->isExactlyValue(1.0))
         return Op1C;
-      if (Op1C->isExactlyValue(2.0))  // pow(2.0, x) -> exp2(x)
+      // pow(2.0, x) -> exp2(x)
+      if (Op1C->isExactlyValue(2.0) &&
+          hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f,
+                          LibFunc::exp2l))
         return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes());
     }
 
@@ -1145,7 +1170,11 @@ struct PowOpt : public UnsafeFPLibCallOptimization {
     if (Op2C->getValueAPF().isZero())  // pow(x, 0.0) -> 1.0
       return ConstantFP::get(CI->getType(), 1.0);
 
-    if (Op2C->isExactlyValue(0.5)) {
+    if (Op2C->isExactlyValue(0.5) &&
+        hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
+                        LibFunc::sqrtl) &&
+        hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf,
+                        LibFunc::fabsl)) {
       // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).
       // This is faster than calling pow, and still handles negative zero
       // and negative infinity correctly.
@@ -1178,7 +1207,7 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization {
   virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
     Value *Ret = NULL;
     if (UnsafeFPShrink && Callee->getName() == "exp2" &&
-        TLI->has(LibFunc::exp2)) {
+        TLI->has(LibFunc::exp2f)) {
       UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
       Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
     }
@@ -1229,6 +1258,155 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization {
   }
 };
 
+struct SinCosPiOpt : public LibCallOptimization {
+  SinCosPiOpt() {}
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Make sure the prototype is as expected, otherwise the rest of the
+    // function is probably invalid and likely to abort.
+    if (!isTrigLibCall(CI))
+      return 0;
+
+    Value *Arg = CI->getArgOperand(0);
+    SmallVector<CallInst *, 1> SinCalls;
+    SmallVector<CallInst *, 1> CosCalls;
+    SmallVector<CallInst *, 1> SinCosCalls;
+
+    bool IsFloat = Arg->getType()->isFloatTy();
+
+    // Look for all compatible sinpi, cospi and sincospi calls with the same
+    // argument. If there are enough (in some sense) we can make the
+    // substitution.
+    for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+         UI != UE; ++UI)
+      classifyArgUse(*UI, CI->getParent(), IsFloat, SinCalls, CosCalls,
+                     SinCosCalls);
+
+    // It's only worthwhile if both sinpi and cospi are actually used.
+    if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty()))
+      return 0;
+
+    Value *Sin, *Cos, *SinCos;
+    insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos,
+                     SinCos);
+
+    replaceTrigInsts(SinCalls, Sin);
+    replaceTrigInsts(CosCalls, Cos);
+    replaceTrigInsts(SinCosCalls, SinCos);
+
+    return 0;
+  }
+
+  bool isTrigLibCall(CallInst *CI) {
+    Function *Callee = CI->getCalledFunction();
+    FunctionType *FT = Callee->getFunctionType();
+
+    // We can only hope to do anything useful if we can ignore things like errno
+    // and floating-point exceptions.
+    bool AttributesSafe = CI->hasFnAttr(Attribute::NoUnwind) &&
+                          CI->hasFnAttr(Attribute::ReadNone);
+
+    // Other than that we need float(float) or double(double)
+    return AttributesSafe && FT->getNumParams() == 1 &&
+           FT->getReturnType() == FT->getParamType(0) &&
+           (FT->getParamType(0)->isFloatTy() ||
+            FT->getParamType(0)->isDoubleTy());
+  }
+
+  void classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat,
+                      SmallVectorImpl<CallInst *> &SinCalls,
+                      SmallVectorImpl<CallInst *> &CosCalls,
+                      SmallVectorImpl<CallInst *> &SinCosCalls) {
+    CallInst *CI = dyn_cast<CallInst>(Val);
+
+    if (!CI)
+      return;
+
+    Function *Callee = CI->getCalledFunction();
+    StringRef FuncName = Callee->getName();
+    LibFunc::Func Func;
+    if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) ||
+        !isTrigLibCall(CI))
+      return;
+
+    if (IsFloat) {
+      if (Func == LibFunc::sinpif)
+        SinCalls.push_back(CI);
+      else if (Func == LibFunc::cospif)
+        CosCalls.push_back(CI);
+      else if (Func == LibFunc::sincospi_stretf)
+        SinCosCalls.push_back(CI);
+    } else {
+      if (Func == LibFunc::sinpi)
+        SinCalls.push_back(CI);
+      else if (Func == LibFunc::cospi)
+        CosCalls.push_back(CI);
+      else if (Func == LibFunc::sincospi_stret)
+        SinCosCalls.push_back(CI);
+    }
+  }
+
+  void replaceTrigInsts(SmallVectorImpl<CallInst*> &Calls, Value *Res) {
+    for (SmallVectorImpl<CallInst*>::iterator I = Calls.begin(),
+           E = Calls.end();
+         I != E; ++I) {
+      LCS->replaceAllUsesWith(*I, Res);
+    }
+  }
+
+  void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
+                        bool UseFloat, Value *&Sin, Value *&Cos,
+                        Value *&SinCos) {
+    Type *ArgTy = Arg->getType();
+    Type *ResTy;
+    StringRef Name;
+
+    Triple T(OrigCallee->getParent()->getTargetTriple());
+    if (UseFloat) {
+      Name = "__sincospi_stretf";
+
+      assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
+      // x86_64 can't use {float, float} since that would be returned in both
+      // xmm0 and xmm1, which isn't what a real struct would do.
+      ResTy = T.getArch() == Triple::x86_64
+                  ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+                  : static_cast<Type *>(StructType::get(ArgTy, ArgTy, NULL));
+    } else {
+      Name = "__sincospi_stret";
+      ResTy = StructType::get(ArgTy, ArgTy, NULL);
+    }
+
+    Module *M = OrigCallee->getParent();
+    Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
+                                           ResTy, ArgTy, NULL);
+
+    if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+      // If the argument is an instruction, it must dominate all uses so put our
+      // sincos call there.
+      BasicBlock::iterator Loc = ArgInst;
+      B.SetInsertPoint(ArgInst->getParent(), ++Loc);
+    } else {
+      // Otherwise (e.g. for a constant) the beginning of the function is as
+      // good a place as any.
+      BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
+      B.SetInsertPoint(&EntryBB, EntryBB.begin());
+    }
+
+    SinCos = B.CreateCall(Callee, Arg, "sincospi");
+
+    if (SinCos->getType()->isStructTy()) {
+      Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
+      Cos = B.CreateExtractValue(SinCos, 1, "cospi");
+    } else {
+      Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
+                                   "sinpi");
+      Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
+                                   "cospi");
+    }
+  }
+
+};
+
 //===----------------------------------------------------------------------===//
 // Integer Library Call Optimizations
 //===----------------------------------------------------------------------===//
@@ -1333,6 +1511,54 @@ struct ToAsciiOpt : public LibCallOptimization {
 // Formatting and IO Library Call Optimizations
 //===----------------------------------------------------------------------===//
 
+struct ErrorReportingOpt : public LibCallOptimization {
+  ErrorReportingOpt(int S = -1) : StreamArg(S) {}
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &) {
+    // Error reporting calls should be cold, mark them as such.
+    // This applies even to non-builtin calls: it is only a hint and applies to
+    // functions that the frontend might not understand as builtins.
+
+    // This heuristic was suggested in:
+    // Improving Static Branch Prediction in a Compiler
+    // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
+    // Proceedings of PACT'98, Oct. 1998, IEEE
+
+    if (!CI->hasFnAttr(Attribute::Cold) && isReportingError(Callee, CI)) {
+      CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold);
+    }
+
+    return 0;
+  }
+
+protected:
+  bool isReportingError(Function *Callee, CallInst *CI) {
+    if (!ColdErrorCalls)
+      return false;
+ 
+    if (!Callee || !Callee->isDeclaration())
+      return false;
+
+    if (StreamArg < 0)
+      return true;
+
+    // These functions might be considered cold, but only if their stream
+    // argument is stderr.
+
+    if (StreamArg >= (int) CI->getNumArgOperands())
+      return false;
+    LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
+    if (!LI)
+      return false;
+    GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
+    if (!GV || !GV->isDeclaration())
+      return false;
+    return GV->getName() == "stderr";
+  }
+
+  int StreamArg;
+};
+
 struct PrintFOpt : public LibCallOptimization {
   Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
                                    IRBuilder<> &B) {
@@ -1361,7 +1587,7 @@ struct PrintFOpt : public LibCallOptimization {
 
     // printf("foo\n") --> puts("foo")
     if (FormatStr[FormatStr.size()-1] == '\n' &&
-        FormatStr.find('%') == std::string::npos) {  // no format characters.
+        FormatStr.find('%') == StringRef::npos) { // No format characters.
       // Create a string literal with no \n on it.  We expect the constant merge
       // pass to be run after this pass, to merge duplicate strings.
       FormatStr = FormatStr.drop_back();
@@ -1513,6 +1739,9 @@ struct SPrintFOpt : public LibCallOptimization {
 struct FPrintFOpt : public LibCallOptimization {
   Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
                                    IRBuilder<> &B) {
+    ErrorReportingOpt ER(/* StreamArg = */ 0);
+    (void) ER.callOptimizer(Callee, CI, B);
+
     // All the optimizations depend on the format string.
     StringRef FormatStr;
     if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
@@ -1590,6 +1819,9 @@ struct FPrintFOpt : public LibCallOptimization {
 
 struct FWriteOpt : public LibCallOptimization {
   virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    ErrorReportingOpt ER(/* StreamArg = */ 3);
+    (void) ER.callOptimizer(Callee, CI, B);
+
     // Require a pointer, an integer, an integer, a pointer, returning integer.
     FunctionType *FT = Callee->getFunctionType();
     if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
@@ -1623,6 +1855,9 @@ struct FWriteOpt : public LibCallOptimization {
 
 struct FPutsOpt : public LibCallOptimization {
   virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    ErrorReportingOpt ER(/* StreamArg = */ 1);
+    (void) ER.callOptimizer(Callee, CI, B);
+
     // These optimizations require DataLayout.
     if (!TD) return 0;
 
@@ -1741,6 +1976,7 @@ static MemSetOpt MemSet;
 // Math library call optimizations.
 static UnaryDoubleFPOpt UnaryDoubleFP(false);
 static UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+static SinCosPiOpt SinCosPi;
 
   // Integer library call optimizations.
 static FFSOpt FFS;
@@ -1750,6 +1986,9 @@ static IsAsciiOpt IsAscii;
 static ToAsciiOpt ToAscii;
 
 // Formatting and IO library call optimizations.
+static ErrorReportingOpt ErrorReporting;
+static ErrorReportingOpt ErrorReporting0(0);
+static ErrorReportingOpt ErrorReporting1(1);
 static PrintFOpt PrintF;
 static SPrintFOpt SPrintF;
 static FPrintFOpt FPrintF;
@@ -1825,6 +2064,11 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) {
       case LibFunc::cos:
       case LibFunc::cosl:
         return &Cos;
+      case LibFunc::sinpif:
+      case LibFunc::sinpi:
+      case LibFunc::cospif:
+      case LibFunc::cospi:
+        return &SinCosPi;
       case LibFunc::powf:
       case LibFunc::pow:
       case LibFunc::powl:
@@ -1859,6 +2103,13 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) {
         return &FPuts;
       case LibFunc::puts:
         return &Puts;
+      case LibFunc::perror:
+        return &ErrorReporting;
+      case LibFunc::vfprintf:
+      case LibFunc::fiprintf:
+        return &ErrorReporting0;
+      case LibFunc::fputc:
+        return &ErrorReporting1;
       case LibFunc::ceil:
       case LibFunc::fabs:
       case LibFunc::floor:
diff --git a/lib/Transforms/Utils/SpecialCaseList.cpp b/lib/Transforms/Utils/SpecialCaseList.cpp
index b98cb5b..2ef692c 100644
--- a/lib/Transforms/Utils/SpecialCaseList.cpp
+++ b/lib/Transforms/Utils/SpecialCaseList.cpp
@@ -49,29 +49,45 @@ struct SpecialCaseList::Entry {
   }
 };
 
-SpecialCaseList::SpecialCaseList(const StringRef Path) {
-  // Validate and open blacklist file.
-  if (Path.empty()) return;
+SpecialCaseList::SpecialCaseList() : Entries() {}
+
+SpecialCaseList *SpecialCaseList::create(
+    const StringRef Path, std::string &Error) {
+  if (Path.empty())
+    return new SpecialCaseList();
   OwningPtr<MemoryBuffer> File;
   if (error_code EC = MemoryBuffer::getFile(Path, File)) {
-    report_fatal_error("Can't open blacklist file: " + Path + ": " +
-                       EC.message());
+    Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str();
+    return 0;
   }
+  return create(File.get(), Error);
+}
 
-  init(File.get());
+SpecialCaseList *SpecialCaseList::create(
+    const MemoryBuffer *MB, std::string &Error) {
+  OwningPtr<SpecialCaseList> SCL(new SpecialCaseList());
+  if (!SCL->parse(MB, Error))
+    return 0;
+  return SCL.take();
 }
 
-SpecialCaseList::SpecialCaseList(const MemoryBuffer *MB) {
-  init(MB);
+SpecialCaseList *SpecialCaseList::createOrDie(const StringRef Path) {
+  std::string Error;
+  if (SpecialCaseList *SCL = create(Path, Error))
+    return SCL;
+  report_fatal_error(Error);
 }
 
-void SpecialCaseList::init(const MemoryBuffer *MB) {
+bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
   // Iterate through each line in the blacklist file.
   SmallVector<StringRef, 16> Lines;
   SplitString(MB->getBuffer(), Lines, "\n\r");
   StringMap<StringMap<std::string> > Regexps;
+  assert(Entries.empty() &&
+         "parse() should be called on an empty SpecialCaseList");
+  int LineNo = 1;
   for (SmallVectorImpl<StringRef>::iterator I = Lines.begin(), E = Lines.end();
-       I != E; ++I) {
+       I != E; ++I, ++LineNo) {
     // Ignore empty lines and lines starting with "#"
     if (I->empty() || I->startswith("#"))
       continue;
@@ -80,7 +96,9 @@ void SpecialCaseList::init(const MemoryBuffer *MB) {
     StringRef Prefix = SplitLine.first;
     if (SplitLine.second.empty()) {
       // Missing ':' in the line.
-      report_fatal_error("malformed blacklist line: " + SplitLine.first);
+      Error = (Twine("Malformed line ") + Twine(LineNo) + ": '" +
+               SplitLine.first + "'").str();
+      return false;
     }
 
     std::pair<StringRef, StringRef> SplitRegexp = SplitLine.second.split("=");
@@ -113,10 +131,11 @@ void SpecialCaseList::init(const MemoryBuffer *MB) {
 
     // Check that the regexp is valid.
     Regex CheckRE(Regexp);
-    std::string Error;
-    if (!CheckRE.isValid(Error)) {
-      report_fatal_error("malformed blacklist regex: " + SplitLine.second +
-          ": " + Error);
+    std::string REError;
+    if (!CheckRE.isValid(REError)) {
+      Error = (Twine("Malformed regex in line ") + Twine(LineNo) + ": '" +
+               SplitLine.second + "': " + REError).str();
+      return false;
     }
 
     // Add this regexp into the proper group by its prefix.
@@ -135,6 +154,7 @@ void SpecialCaseList::init(const MemoryBuffer *MB) {
       Entries[I->getKey()][II->getKey()].RegEx = new Regex(II->getValue());
     }
   }
+  return true;
 }
 
 SpecialCaseList::~SpecialCaseList() {
@@ -149,18 +169,12 @@ SpecialCaseList::~SpecialCaseList() {
   }
 }
 
-bool SpecialCaseList::findCategory(const Function &F,
-                                   StringRef &Category) const {
-  return findCategory(*F.getParent(), Category) ||
-         findCategory("fun", F.getName(), Category);
-}
-
 bool SpecialCaseList::isIn(const Function& F, const StringRef Category) const {
   return isIn(*F.getParent(), Category) ||
          inSectionCategory("fun", F.getName(), Category);
 }
 
-static StringRef GetGVTypeString(const GlobalVariable &G) {
+static StringRef GetGlobalTypeString(const GlobalValue &G) {
   // Types of GlobalVariables are always pointer types.
   Type *GType = G.getType()->getElementType();
   // For now we support blacklisting struct types only.
@@ -171,46 +185,29 @@ static StringRef GetGVTypeString(const GlobalVariable &G) {
   return "<unknown type>";
 }
 
-bool SpecialCaseList::findCategory(const GlobalVariable &G,
-                                   StringRef &Category) const {
-  return findCategory(*G.getParent(), Category) ||
-         findCategory("global", G.getName(), Category) ||
-         findCategory("type", GetGVTypeString(G), Category);
-}
-
 bool SpecialCaseList::isIn(const GlobalVariable &G,
                            const StringRef Category) const {
   return isIn(*G.getParent(), Category) ||
          inSectionCategory("global", G.getName(), Category) ||
-         inSectionCategory("type", GetGVTypeString(G), Category);
+         inSectionCategory("type", GetGlobalTypeString(G), Category);
 }
 
-bool SpecialCaseList::findCategory(const Module &M, StringRef &Category) const {
-  return findCategory("src", M.getModuleIdentifier(), Category);
+bool SpecialCaseList::isIn(const GlobalAlias &GA,
+                           const StringRef Category) const {
+  if (isIn(*GA.getParent(), Category))
+    return true;
+
+  if (isa<FunctionType>(GA.getType()->getElementType()))
+    return inSectionCategory("fun", GA.getName(), Category);
+
+  return inSectionCategory("global", GA.getName(), Category) ||
+         inSectionCategory("type", GetGlobalTypeString(GA), Category);
 }
 
 bool SpecialCaseList::isIn(const Module &M, const StringRef Category) const {
   return inSectionCategory("src", M.getModuleIdentifier(), Category);
 }
 
-bool SpecialCaseList::findCategory(const StringRef Section,
-                                   const StringRef Query,
-                                   StringRef &Category) const {
-  StringMap<StringMap<Entry> >::const_iterator I = Entries.find(Section);
-  if (I == Entries.end()) return false;
-
-  for (StringMap<Entry>::const_iterator II = I->second.begin(),
-                                        IE = I->second.end();
-       II != IE; ++II) {
-    if (II->getValue().match(Query)) {
-      Category = II->first();
-      return true;
-    }
-  }
-
-  return false;
-}
-
 bool SpecialCaseList::inSectionCategory(const StringRef Section,
                                         const StringRef Query,
                                         const StringRef Category) const {
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index cbc1d63..c5e1dcb 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -533,7 +533,7 @@ namespace {
       default: break;
       case Instruction::GetElementPtr:
         // We mark this instruction as zero-cost because scalar GEPs are usually
-        // lowered to the intruction addressing mode. At the moment we don't
+        // lowered to the instruction addressing mode. At the moment we don't
         // generate vector GEPs.
         return 0;
       case Instruction::Br:
@@ -625,10 +625,10 @@ namespace {
         ConstantInt *IntOff = ConstOffSCEV->getValue();
         int64_t Offset = IntOff->getSExtValue();
 
-        Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
+        Type *VTy = IPtr->getType()->getPointerElementType();
         int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
 
-        Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType();
+        Type *VTy2 = JPtr->getType()->getPointerElementType();
         if (VTy != VTy2 && Offset < 0) {
           int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
           OffsetInElmts = Offset/VTy2TSS;
@@ -1182,6 +1182,8 @@ namespace {
       // Look for an instruction with which to pair instruction *I...
       DenseSet<Value *> Users;
       AliasSetTracker WriteSet(*AA);
+      if (I->mayWriteToMemory()) WriteSet.add(I);
+
       bool JAfterStart = IAfterStart;
       BasicBlock::iterator J = llvm::next(I);
       for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
@@ -1403,6 +1405,8 @@ namespace {
 
       DenseSet<Value *> Users;
       AliasSetTracker WriteSet(*AA);
+      if (I->mayWriteToMemory()) WriteSet.add(I);
+
       for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) {
         (void) trackUsesOfI(Users, WriteSet, I, J);
 
@@ -2227,11 +2231,12 @@ namespace {
     // The pointer value is taken to be the one with the lowest offset.
     Value *VPtr = IPtr;
 
-    Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
-    Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
+    Type *ArgTypeI = IPtr->getType()->getPointerElementType();
+    Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
     Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-    Type *VArgPtrType = PointerType::get(VArgType,
-      cast<PointerType>(IPtr->getType())->getAddressSpace());
+    Type *VArgPtrType
+      = PointerType::get(VArgType,
+                         IPtr->getType()->getPointerAddressSpace());
     return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
                         /* insert before */ I);
   }
@@ -2240,7 +2245,7 @@ namespace {
                      unsigned MaskOffset, unsigned NumInElem,
                      unsigned NumInElem1, unsigned IdxOffset,
                      std::vector<Constant*> &Mask) {
-    unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements();
+    unsigned NumElem1 = J->getType()->getVectorNumElements();
     for (unsigned v = 0; v < NumElem1; ++v) {
       int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
       if (m < 0) {
@@ -2267,18 +2272,18 @@ namespace {
     Type *ArgTypeJ = J->getType();
     Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
 
-    unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements();
+    unsigned NumElemI = ArgTypeI->getVectorNumElements();
 
     // Get the total number of elements in the fused vector type.
     // By definition, this must equal the number of elements in
     // the final mask.
-    unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
+    unsigned NumElem = VArgType->getVectorNumElements();
     std::vector<Constant*> Mask(NumElem);
 
     Type *OpTypeI = I->getOperand(0)->getType();
-    unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements();
+    unsigned NumInElemI = OpTypeI->getVectorNumElements();
     Type *OpTypeJ = J->getOperand(0)->getType();
-    unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements();
+    unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
 
     // The fused vector will be:
     // -----------------------------------------------------
@@ -2340,6 +2345,12 @@ namespace {
     return ExpandedIEChain;
   }
 
+  static unsigned getNumScalarElements(Type *Ty) {
+    if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+      return VecTy->getNumElements();
+    return 1;
+  }
+
   // Returns the value to be used as the specified operand of the vector
   // instruction that fuses I with J.
   Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
@@ -2355,17 +2366,8 @@ namespace {
     Instruction *L = I, *H = J;
     Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
 
-    unsigned numElemL;
-    if (ArgTypeL->isVectorTy())
-      numElemL = cast<VectorType>(ArgTypeL)->getNumElements();
-    else
-      numElemL = 1;
-
-    unsigned numElemH;
-    if (ArgTypeH->isVectorTy())
-      numElemH = cast<VectorType>(ArgTypeH)->getNumElements();
-    else
-      numElemH = 1;
+    unsigned numElemL = getNumScalarElements(ArgTypeL);
+    unsigned numElemH = getNumScalarElements(ArgTypeH);
 
     Value *LOp = L->getOperand(o);
     Value *HOp = H->getOperand(o);
@@ -2426,11 +2428,12 @@ namespace {
 
       if (CanUseInputs) {
         unsigned LOpElem =
-          cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType())
-            ->getNumElements();
+          cast<Instruction>(LOp)->getOperand(0)->getType()
+            ->getVectorNumElements();
+
         unsigned HOpElem =
-          cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType())
-            ->getNumElements();
+          cast<Instruction>(HOp)->getOperand(0)->getType()
+            ->getVectorNumElements();
 
         // We have one or two input vectors. We need to map each index of the
         // operands to the index of the original vector.
@@ -2646,14 +2649,14 @@ namespace {
                                            getReplacementName(IBeforeJ ? I : J,
                                                               true, o, 1));
         }
-  
+
         NHOp->insertBefore(IBeforeJ ? J : I);
         HOp = NHOp;
       }
     }
 
     if (ArgType->isVectorTy()) {
-      unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
+      unsigned numElem = VArgType->getVectorNumElements();
       std::vector<Constant*> Mask(numElem);
       for (unsigned v = 0; v < numElem; ++v) {
         unsigned Idx = v;
@@ -2746,16 +2749,8 @@ namespace {
       VectorType *VType = getVecTypeForPair(IType, JType);
       unsigned numElem = VType->getNumElements();
 
-      unsigned numElemI, numElemJ;
-      if (IType->isVectorTy())
-        numElemI = cast<VectorType>(IType)->getNumElements();
-      else
-        numElemI = 1;
-
-      if (JType->isVectorTy())
-        numElemJ = cast<VectorType>(JType)->getNumElements();
-      else
-        numElemJ = 1;
+      unsigned numElemI = getNumScalarElements(IType);
+      unsigned numElemJ = getNumScalarElements(JType);
 
       if (IType->isVectorTy()) {
         std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
@@ -2804,6 +2799,8 @@ namespace {
 
     DenseSet<Value *> Users;
     AliasSetTracker WriteSet(*AA);
+    if (I->mayWriteToMemory()) WriteSet.add(I);
+
     for (; cast<Instruction>(L) != J; ++L)
       (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
 
@@ -2824,6 +2821,8 @@ namespace {
 
     DenseSet<Value *> Users;
     AliasSetTracker WriteSet(*AA);
+    if (I->mayWriteToMemory()) WriteSet.add(I);
+
     for (; cast<Instruction>(L) != J;) {
       if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
         // Move this instruction
@@ -2853,6 +2852,7 @@ namespace {
 
     DenseSet<Value *> Users;
     AliasSetTracker WriteSet(*AA);
+    if (I->mayWriteToMemory()) WriteSet.add(I);
 
     // Note: We cannot end the loop when we reach J because J could be moved
     // farther down the use chain by another instruction pairing. Also, J
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index a62fedc..5e75871 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -126,6 +127,9 @@ static const unsigned MaxVectorWidth = 64;
 /// Maximum vectorization unroll count.
 static const unsigned MaxUnrollFactor = 16;
 
+/// The cost of a loop that is considered 'small' by the unroller.
+static const unsigned SmallLoopCost = 20;
+
 namespace {
 
 // Forward declarations.
@@ -167,7 +171,9 @@ public:
     updateAnalysis();
   }
 
-private:
+  virtual ~InnerLoopVectorizer() {}
+
+protected:
   /// A small list of PHINodes.
   typedef SmallVector<PHINode*, 4> PhiVector;
   /// When we unroll loops we have multiple vector values for each scalar.
@@ -187,7 +193,13 @@ private:
   /// Create an empty loop, based on the loop ranges of the old loop.
   void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
-  void vectorizeLoop(LoopVectorizationLegality *Legal);
+  virtual void vectorizeLoop(LoopVectorizationLegality *Legal);
+
+  /// \brief The Loop exit block may have single value PHI nodes where the
+  /// incoming value is 'Undef'. While vectorizing we only handled real values
+  /// that were defined inside the loop. Here we fix the 'undef case'.
+  /// See PR14725.
+  void fixLCSSAPHIs();
 
   /// A helper function that computes the predicate of the block BB, assuming
   /// that the header block of the loop is set to True. It returns the *entry*
@@ -201,16 +213,23 @@ private:
   void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
                             PhiVector *PV);
 
+  /// Vectorize a single PHINode in a block. This method handles the induction
+  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+  /// arbitrary length vectors.
+  void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
+                           LoopVectorizationLegality *Legal,
+                           unsigned UF, unsigned VF, PhiVector *PV);
+
   /// Insert the new loop to the loop hierarchy and pass manager
   /// and update the analysis passes.
   void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
   /// of scalars.
-  void scalarizeInstruction(Instruction *Instr);
+  virtual void scalarizeInstruction(Instruction *Instr);
 
   /// Vectorize Load and Store instructions,
-  void vectorizeMemoryInstruction(Instruction *Instr,
+  virtual void vectorizeMemoryInstruction(Instruction *Instr,
                                   LoopVectorizationLegality *Legal);
 
   /// Create a broadcast instruction. This method generates a broadcast
@@ -218,12 +237,12 @@ private:
   /// value. If this is the induction variable then we extend it to N, N+1, ...
   /// this is needed because each iteration in the loop corresponds to a SIMD
   /// element.
-  Value *getBroadcastInstrs(Value *V);
+  virtual Value *getBroadcastInstrs(Value *V);
 
   /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
   /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
   /// The sequence starts at StartIndex.
-  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
+  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
 
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
@@ -233,7 +252,7 @@ private:
   VectorParts &getVectorValue(Value *V);
 
   /// Generate a shuffle sequence that will reverse the vector Vec.
-  Value *reverseVector(Value *Vec);
+  virtual Value *reverseVector(Value *Vec);
 
   /// This is a helper class that holds the vectorizer state. It maps scalar
   /// instructions to vector instructions. When the code is 'unrolled' then
@@ -291,6 +310,8 @@ private:
   /// The vectorization SIMD factor to use. Each vector will have this many
   /// vector elements.
   unsigned VF;
+
+protected:
   /// The vectorization unroll factor to use. Each scalar is vectorized to this
   /// many different vector instructions.
   unsigned UF;
@@ -326,6 +347,22 @@ private:
   EdgeMaskCache MaskCache;
 };
 
+class InnerLoopUnroller : public InnerLoopVectorizer {
+public:
+  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
+                    DominatorTree *DT, DataLayout *DL,
+                    const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
+    InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
+
+private:
+  virtual void scalarizeInstruction(Instruction *Instr);
+  virtual void vectorizeMemoryInstruction(Instruction *Instr,
+                                          LoopVectorizationLegality *Legal);
+  virtual Value *getBroadcastInstrs(Value *V);
+  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
+  virtual Value *reverseVector(Value *Vec);
+};
+
 /// \brief Look for a meaningful debug location on the instruction or it's
 /// operands.
 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
@@ -409,7 +446,7 @@ public:
     MRK_FloatMax
   };
 
-  /// This POD struct holds information about reduction variables.
+  /// This struct holds information about reduction variables.
   struct ReductionDescriptor {
     ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
       Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
@@ -446,8 +483,8 @@ public:
     MinMaxReductionKind MinMaxKind;
   };
 
-  // This POD struct holds information about the memory runtime legality
-  // check that a group of pointers do not overlap.
+  /// This struct holds information about the memory runtime legality
+  /// check that a group of pointers do not overlap.
   struct RuntimePointerCheck {
     RuntimePointerCheck() : Need(false) {}
 
@@ -457,6 +494,8 @@ public:
       Pointers.clear();
       Starts.clear();
       Ends.clear();
+      IsWritePtr.clear();
+      DependencySetId.clear();
     }
 
     /// Insert a pointer and calculate the start and end SCEVs.
@@ -478,7 +517,7 @@ public:
     SmallVector<unsigned, 2> DependencySetId;
   };
 
-  /// A POD for saving information about induction variables.
+  /// A struct for saving information about induction variables.
   struct InductionInfo {
     InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
     InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
@@ -725,9 +764,9 @@ struct LoopVectorizeHints {
   /// Vectorization unroll factor.
   unsigned Unroll;
 
-  LoopVectorizeHints(const Loop *L)
+  LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
   : Width(VectorizationFactor)
-  , Unroll(VectorizationUnroll)
+  , Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
   , LoopID(L->getLoopID()) {
     getHints(L);
     // The command line options override any loop metadata except for when
@@ -736,6 +775,9 @@ struct LoopVectorizeHints {
       Width = VectorizationFactor;
     if (VectorizationUnroll.getNumOccurrences() > 0)
       Unroll = VectorizationUnroll;
+
+    DEBUG(if (DisableUnrolling && Unroll == 1)
+            dbgs() << "LV: Unrolling disabled by the pass manager\n");
   }
 
   /// Return the loop vectorizer metadata prefix.
@@ -762,6 +804,7 @@ struct LoopVectorizeHints {
         Vals.push_back(LoopID->getOperand(i));
 
     Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));
+    Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1));
 
     MDNode *NewLoopID = MDNode::get(Context, Vals);
     // Set operand 0 to refer to the loop id itself.
@@ -825,15 +868,18 @@ private:
     unsigned Val = C->getZExtValue();
 
     if (Hint == "width") {
-      assert(isPowerOf2_32(Val) && Val <= MaxVectorWidth &&
-             "Invalid width metadata");
-      Width = Val;
+      if (isPowerOf2_32(Val) && Val <= MaxVectorWidth)
+        Width = Val;
+      else
+        DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n");
     } else if (Hint == "unroll") {
-      assert(isPowerOf2_32(Val) && Val <= MaxUnrollFactor &&
-             "Invalid unroll metadata");
-      Unroll = Val;
-    } else
-      DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint);
+      if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor)
+        Unroll = Val;
+      else
+        DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
+    } else {
+      DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n');
+    }
   }
 };
 
@@ -842,7 +888,8 @@ struct LoopVectorize : public LoopPass {
   /// Pass identification, replacement for typeid
   static char ID;
 
-  explicit LoopVectorize() : LoopPass(ID) {
+  explicit LoopVectorize(bool NoUnrolling = false)
+    : LoopPass(ID), DisableUnrolling(NoUnrolling) {
     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
   }
 
@@ -852,6 +899,7 @@ struct LoopVectorize : public LoopPass {
   TargetTransformInfo *TTI;
   DominatorTree *DT;
   TargetLibraryInfo *TLI;
+  bool DisableUnrolling;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
     // We only vectorize innermost loops.
@@ -865,17 +913,22 @@ struct LoopVectorize : public LoopPass {
     DT = &getAnalysis<DominatorTree>();
     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
 
+    // If the target claims to have no vector registers don't attempt
+    // vectorization.
+    if (!TTI->getNumberOfRegisters(true))
+      return false;
+
     if (DL == NULL) {
-      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
+      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n");
       return false;
     }
 
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
           L->getHeader()->getParent()->getName() << "\"\n");
 
-    LoopVectorizeHints Hints(L);
+    LoopVectorizeHints Hints(L, DisableUnrolling);
 
-    if (Hints.Width == 1) {
+    if (Hints.Width == 1 && Hints.Unroll == 1) {
       DEBUG(dbgs() << "LV: Not vectorizing.\n");
       return false;
     }
@@ -912,19 +965,23 @@ struct LoopVectorize : public LoopPass {
     unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
                                         VF.Cost);
 
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
+          F->getParent()->getModuleIdentifier() << '\n');
+    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
+
     if (VF.Width == 1) {
       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
-      return false;
+      if (UF == 1)
+        return false;
+      // We decided not to vectorize, but we may want to unroll.
+      InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
+      Unroller.vectorize(&LVL);
+    } else {
+      // If we decided that it is *legal* to vectorize the loop then do it.
+      InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
+      LB.vectorize(&LVL);
     }
 
-    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
-          F->getParent()->getModuleIdentifier()<<"\n");
-    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
-
-    // If we decided that it is *legal* to vectorize the loop then do it.
-    InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
-    LB.vectorize(&LVL);
-
     // Mark the loop as already vectorized to avoid vectorizing again.
     Hints.setAlreadyVectorized(L);
 
@@ -971,25 +1028,19 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
 }
 
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
-  // Save the current insertion location.
-  Instruction *Loc = Builder.GetInsertPoint();
-
   // We need to place the broadcast of invariant variables outside the loop.
   Instruction *Instr = dyn_cast<Instruction>(V);
   bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
 
   // Place the code for broadcasting invariant variables in the new preheader.
+  IRBuilder<>::InsertPointGuard Guard(Builder);
   if (Invariant)
     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
 
   // Broadcast the scalar into all locations in the vector.
   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
 
-  // Restore the builder insertion point.
-  if (Invariant)
-    Builder.SetInsertPoint(Loc);
-
   return Shuf;
 }
 
@@ -1016,10 +1067,35 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
   return Builder.CreateAdd(Val, Cv, "induction");
 }
 
+/// \brief Find the operand of the GEP that should be checked for consecutive
+/// stores. This ignores trailing indices that have no effect on the final
+/// pointer.
+static unsigned getGEPInductionOperand(DataLayout *DL,
+                                       const GetElementPtrInst *Gep) {
+  unsigned LastOperand = Gep->getNumOperands() - 1;
+  unsigned GEPAllocSize = DL->getTypeAllocSize(
+      cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
+
+  // Walk backwards and try to peel off zeros.
+  while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
+    // Find the type we're currently indexing into.
+    gep_type_iterator GEPTI = gep_type_begin(Gep);
+    std::advance(GEPTI, LastOperand - 1);
+
+    // If it's a type with the same allocation size as the result of the GEP we
+    // can peel off the zero index.
+    if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
+      break;
+    --LastOperand;
+  }
+
+  return LastOperand;
+}
+
 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
   // Make sure that the pointer does not point to structs.
-  if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
+  if (Ptr->getType()->getPointerElementType()->isAggregateType())
     return 0;
 
   // If this value is a pointer induction variable we know it is consecutive.
@@ -1037,8 +1113,6 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
     return 0;
 
   unsigned NumOperands = Gep->getNumOperands();
-  Value *LastIndex = Gep->getOperand(NumOperands - 1);
-
   Value *GpPtr = Gep->getPointerOperand();
   // If this GEP value is a consecutive pointer induction variable and all of
   // the indices are constant then we know it is consecutive. We can
@@ -1062,14 +1136,18 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
       return -1;
   }
 
-  // Check that all of the gep indices are uniform except for the last.
-  for (unsigned i = 0; i < NumOperands - 1; ++i)
-    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+  unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+
+  // Check that all of the gep indices are uniform except for our induction
+  // operand.
+  for (unsigned i = 0; i != NumOperands; ++i)
+    if (i != InductionOperand &&
+        !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
       return 0;
 
-  // We can emit wide load/stores only if the last index is the induction
-  // variable.
-  const SCEV *Last = SE->getSCEV(LastIndex);
+  // We can emit wide load/stores only if the last non-zero index is the
+  // induction variable.
+  const SCEV *Last = SE->getSCEV(Gep->getOperand(InductionOperand));
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
     const SCEV *Step = AR->getStepRecurrence(*SE);
 
@@ -1127,6 +1205,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   Type *DataTy = VectorType::get(ScalarDataTy, VF);
   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+  // An alignment of 0 means target abi alignment. We need to use the scalar's
+  // target abi alignment in such a case.
+  if (!Alignment)
+    Alignment = DL->getABITypeAlignment(ScalarDataTy);
   unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
@@ -1166,7 +1248,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
     // The last index does not have to be the induction. It can be
     // consecutive and be a function of the index. For example A[I+1];
     unsigned NumOperands = Gep->getNumOperands();
-    unsigned LastOperand = NumOperands - 1;
+    unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
     // Create the new GEP with the new induction variable.
     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
 
@@ -1175,9 +1257,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
       Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
 
       // Update last index or loop invariant instruction anchored in loop.
-      if (i == LastOperand ||
+      if (i == InductionOperand ||
           (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
-        assert((i == LastOperand ||
+        assert((i == InductionOperand ||
                SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
                "Must be last index or loop invariant");
 
@@ -1301,7 +1383,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
       Instruction *Cloned = Instr->clone();
       if (!IsVoidRetTy)
         Cloned->setName(Instr->getName() + ".cloned");
-      // Replace the operands of the cloned instrucions with extracted scalars.
+      // Replace the operands of the cloned instructions with extracted scalars.
       for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
         Value *Op = Params[op][Part];
         // Param is a vector. Need to extract the right lane.
@@ -1335,11 +1417,9 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
   SmallVector<TrackingVH<Value> , 2> Starts;
   SmallVector<TrackingVH<Value> , 2> Ends;
 
+  LLVMContext &Ctx = Loc->getContext();
   SCEVExpander Exp(*SE, "induction");
 
-  // Use this type for pointer arithmetic.
-  Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
-
   for (unsigned i = 0; i < NumPointers; ++i) {
     Value *Ptr = PtrRtCheck->Pointers[i];
     const SCEV *Sc = SE->getSCEV(Ptr);
@@ -1350,7 +1430,11 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
       Starts.push_back(Ptr);
       Ends.push_back(Ptr);
     } else {
-      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
+      unsigned AS = Ptr->getType()->getPointerAddressSpace();
+
+      // Use this type for pointer arithmetic.
+      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
 
       Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
       Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
@@ -1372,10 +1456,20 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
       if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
        continue;
 
-      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
-      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
-      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy, "bc");
-      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy, "bc");
+      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
+      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
+
+      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
+             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
+             "Trying to bounds check pointers with different address spaces");
+
+      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
+      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
+      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy1, "bc");
+      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy0, "bc");
 
       Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
       Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
@@ -1390,9 +1484,8 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
   // We have to do this trickery because the IRBuilder might fold the check to a
   // constant expression in which case there is no Instruction anchored in a
   // the block.
-  LLVMContext &Ctx = Loc->getContext();
-  Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
-                                                  ConstantInt::getTrue(Ctx));
+  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
+                                                 ConstantInt::getTrue(Ctx));
   ChkBuilder.Insert(Check, "memcheck.conflict");
   return Check;
 }
@@ -1444,6 +1537,16 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
 
+  // The exit count might have the type of i64 while the phi is i32. This can
+  // happen if we have an induction variable that is sign extended before the
+  // compare. The only way that we get a backedge taken count is that the
+  // induction variable was signed and as such will not overflow. In such a case
+  // truncation is legal.
+  if (ExitCount->getType()->getPrimitiveSizeInBits() >
+      IdxTy->getPrimitiveSizeInBits())
+    ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
+
+  ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
   // Get the total trip count from the count by adding 1.
   ExitCount = SE->getAddExpr(ExitCount,
                              SE->getConstant(ExitCount->getType(), 1));
@@ -1496,7 +1599,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
-  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder.SetInsertPoint(VecBody->getFirstNonPHI());
 
   // Generate the induction variable.
   setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
@@ -1724,6 +1827,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   LoopExitBlock = ExitBlock;
   LoopVectorBody = VecBody;
   LoopScalarBody = OldBasicBlock;
+
+  LoopVectorizeHints Hints(Lp, true);
+  Hints.setAlreadyVectorized(Lp);
 }
 
 /// This function returns the identity element (or neutral element) for
@@ -1753,6 +1859,31 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
   }
 }
 
+static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I,
+                                              Intrinsic::ID ValidIntrinsicID) {
+  if (I.getNumArgOperands() != 1 ||
+      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+      I.getType() != I.getArgOperand(0)->getType() ||
+      !I.onlyReadsMemory())
+    return Intrinsic::not_intrinsic;
+
+  return ValidIntrinsicID;
+}
+
+static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I,
+                                               Intrinsic::ID ValidIntrinsicID) {
+  if (I.getNumArgOperands() != 2 ||
+      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+      !I.getArgOperand(1)->getType()->isFloatingPointTy() ||
+      I.getType() != I.getArgOperand(0)->getType() ||
+      I.getType() != I.getArgOperand(1)->getType() ||
+      !I.onlyReadsMemory())
+    return Intrinsic::not_intrinsic;
+
+  return ValidIntrinsicID;
+}
+
+
 static Intrinsic::ID
 getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
   // If we have an intrinsic call, check if it is trivially vectorizable.
@@ -1767,11 +1898,13 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
     case Intrinsic::log10:
     case Intrinsic::log2:
     case Intrinsic::fabs:
+    case Intrinsic::copysign:
     case Intrinsic::floor:
     case Intrinsic::ceil:
     case Intrinsic::trunc:
     case Intrinsic::rint:
     case Intrinsic::nearbyint:
+    case Intrinsic::round:
     case Intrinsic::pow:
     case Intrinsic::fma:
     case Intrinsic::fmuladd:
@@ -1789,8 +1922,9 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
   LibFunc::Func Func;
   Function *F = CI->getCalledFunction();
   // We're going to make assumptions on the semantics of the functions, check
-  // that the target knows that it's available in this environment.
-  if (!F || !TLI->getLibFunc(F->getName(), Func))
+  // that the target knows that it's available in this environment and it does
+  // not have local linkage.
+  if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func))
     return Intrinsic::not_intrinsic;
 
   // Otherwise check if we have a call to a function that can be turned into a
@@ -1801,59 +1935,67 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
   case LibFunc::sin:
   case LibFunc::sinf:
   case LibFunc::sinl:
-    return Intrinsic::sin;
+    return checkUnaryFloatSignature(*CI, Intrinsic::sin);
   case LibFunc::cos:
   case LibFunc::cosf:
   case LibFunc::cosl:
-    return Intrinsic::cos;
+    return checkUnaryFloatSignature(*CI, Intrinsic::cos);
   case LibFunc::exp:
   case LibFunc::expf:
   case LibFunc::expl:
-    return Intrinsic::exp;
+    return checkUnaryFloatSignature(*CI, Intrinsic::exp);
   case LibFunc::exp2:
   case LibFunc::exp2f:
   case LibFunc::exp2l:
-    return Intrinsic::exp2;
+    return checkUnaryFloatSignature(*CI, Intrinsic::exp2);
   case LibFunc::log:
   case LibFunc::logf:
   case LibFunc::logl:
-    return Intrinsic::log;
+    return checkUnaryFloatSignature(*CI, Intrinsic::log);
   case LibFunc::log10:
   case LibFunc::log10f:
   case LibFunc::log10l:
-    return Intrinsic::log10;
+    return checkUnaryFloatSignature(*CI, Intrinsic::log10);
   case LibFunc::log2:
   case LibFunc::log2f:
   case LibFunc::log2l:
-    return Intrinsic::log2;
+    return checkUnaryFloatSignature(*CI, Intrinsic::log2);
   case LibFunc::fabs:
   case LibFunc::fabsf:
   case LibFunc::fabsl:
-    return Intrinsic::fabs;
+    return checkUnaryFloatSignature(*CI, Intrinsic::fabs);
+  case LibFunc::copysign:
+  case LibFunc::copysignf:
+  case LibFunc::copysignl:
+    return checkBinaryFloatSignature(*CI, Intrinsic::copysign);
   case LibFunc::floor:
   case LibFunc::floorf:
   case LibFunc::floorl:
-    return Intrinsic::floor;
+    return checkUnaryFloatSignature(*CI, Intrinsic::floor);
   case LibFunc::ceil:
   case LibFunc::ceilf:
   case LibFunc::ceill:
-    return Intrinsic::ceil;
+    return checkUnaryFloatSignature(*CI, Intrinsic::ceil);
   case LibFunc::trunc:
   case LibFunc::truncf:
   case LibFunc::truncl:
-    return Intrinsic::trunc;
+    return checkUnaryFloatSignature(*CI, Intrinsic::trunc);
   case LibFunc::rint:
   case LibFunc::rintf:
   case LibFunc::rintl:
-    return Intrinsic::rint;
+    return checkUnaryFloatSignature(*CI, Intrinsic::rint);
   case LibFunc::nearbyint:
   case LibFunc::nearbyintf:
   case LibFunc::nearbyintl:
-    return Intrinsic::nearbyint;
+    return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint);
+  case LibFunc::round:
+  case LibFunc::roundf:
+  case LibFunc::roundl:
+    return checkUnaryFloatSignature(*CI, Intrinsic::round);
   case LibFunc::pow:
   case LibFunc::powf:
   case LibFunc::powl:
-    return Intrinsic::pow;
+    return checkBinaryFloatSignature(*CI, Intrinsic::pow);
   }
 
   return Intrinsic::not_intrinsic;
@@ -1925,6 +2067,54 @@ Value *createMinMaxOp(IRBuilder<> &Builder,
   return Select;
 }
 
+namespace {
+struct CSEDenseMapInfo {
+  static bool canHandle(Instruction *I) {
+    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+  }
+  static inline Instruction *getEmptyKey() {
+    return DenseMapInfo<Instruction *>::getEmptyKey();
+  }
+  static inline Instruction *getTombstoneKey() {
+    return DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+  static unsigned getHashValue(Instruction *I) {
+    assert(canHandle(I) && "Unknown instruction!");
+    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
+                                                           I->value_op_end()));
+  }
+  static bool isEqual(Instruction *LHS, Instruction *RHS) {
+    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
+        LHS == getTombstoneKey() || RHS == getTombstoneKey())
+      return LHS == RHS;
+    return LHS->isIdenticalTo(RHS);
+  }
+};
+}
+
+///\brief Perform cse of induction variable instructions.
+static void cse(BasicBlock *BB) {
+  // Perform simple cse.
+  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+    Instruction *In = I++;
+
+    if (!CSEDenseMapInfo::canHandle(In))
+      continue;
+
+    // Check if we can replace this instruction with any of the
+    // visited instructions.
+    if (Instruction *V = CSEMap.lookup(In)) {
+      In->replaceAllUsesWith(V);
+      In->eraseFromParent();
+      continue;
+    }
+
+    CSEMap[In] = In;
+  }
+}
+
 void
 InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //===------------------------------------------------===//
@@ -1995,18 +2185,31 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
         RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
       // MinMax reduction have the start value as their identify.
-      VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
-                                                         "minmax.ident");
+      if (VF == 1) {
+        VectorStart = Identity = RdxDesc.StartValue;
+      } else {
+        VectorStart = Identity = Builder.CreateVectorSplat(VF,
+                                                           RdxDesc.StartValue,
+                                                           "minmax.ident");
+      }
     } else {
+      // Handle other reduction kinds:
       Constant *Iden =
-        LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
-                                                        VecTy->getScalarType());
-      Identity = ConstantVector::getSplat(VF, Iden);
-
-      // This vector is the Identity vector where the first element is the
-      // incoming scalar reduction.
-      VectorStart = Builder.CreateInsertElement(Identity,
-                                                RdxDesc.StartValue, Zero);
+      LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
+                                                      VecTy->getScalarType());
+      if (VF == 1) {
+        Identity = Iden;
+        // This vector is the Identity vector where the first element is the
+        // incoming scalar reduction.
+        VectorStart = RdxDesc.StartValue;
+      } else {
+        Identity = ConstantVector::getSplat(VF, Iden);
+
+        // This vector is the Identity vector where the first element is the
+        // incoming scalar reduction.
+        VectorStart = Builder.CreateInsertElement(Identity,
+                                                  RdxDesc.StartValue, Zero);
+      }
     }
 
     // Fix the vector-loop phi.
@@ -2062,37 +2265,40 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
                                         ReducedPartRdx, RdxParts[part]);
     }
 
-    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
-    // and vector ops, reducing the set of values being computed by half each
-    // round.
-    assert(isPowerOf2_32(VF) &&
-           "Reduction emission only supported for pow2 vectors!");
-    Value *TmpVec = ReducedPartRdx;
-    SmallVector<Constant*, 32> ShuffleMask(VF, 0);
-    for (unsigned i = VF; i != 1; i >>= 1) {
-      // Move the upper half of the vector to the lower half.
-      for (unsigned j = 0; j != i/2; ++j)
-        ShuffleMask[j] = Builder.getInt32(i/2 + j);
-
-      // Fill the rest of the mask with undef.
-      std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
-                UndefValue::get(Builder.getInt32Ty()));
-
-      Value *Shuf =
+    if (VF > 1) {
+      // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+      // and vector ops, reducing the set of values being computed by half each
+      // round.
+      assert(isPowerOf2_32(VF) &&
+             "Reduction emission only supported for pow2 vectors!");
+      Value *TmpVec = ReducedPartRdx;
+      SmallVector<Constant*, 32> ShuffleMask(VF, 0);
+      for (unsigned i = VF; i != 1; i >>= 1) {
+        // Move the upper half of the vector to the lower half.
+        for (unsigned j = 0; j != i/2; ++j)
+          ShuffleMask[j] = Builder.getInt32(i/2 + j);
+
+        // Fill the rest of the mask with undef.
+        std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
+                  UndefValue::get(Builder.getInt32Ty()));
+
+        Value *Shuf =
         Builder.CreateShuffleVector(TmpVec,
                                     UndefValue::get(TmpVec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "rdx.shuf");
 
-      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
-                                     "bin.rdx");
-      else
-        TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
-    }
+        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+          TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
+                                       "bin.rdx");
+        else
+          TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
+      }
 
-    // The result is in the first element of the vector.
-    Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+      // The result is in the first element of the vector.
+      ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
+                                                    Builder.getInt32(0));
+    }
 
     // Now, we need to fix the users of the reduction variable
     // inside and outside of the scalar remainder loop.
@@ -2101,7 +2307,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
          LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
-      if (!LCSSAPhi) continue;
+      if (!LCSSAPhi) break;
 
       // All PHINodes need to have a single entry edge, or two if
       // we already fixed them.
@@ -2111,7 +2317,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
       // incoming bypass edge.
       if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
         // Add an edge coming from the bypass.
-        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
+        LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
         break;
       }
     }// end of the LCSSA phi scan.
@@ -2123,23 +2329,26 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
     // Pick the other block.
     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
+    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
   }// end of for each redux variable.
 
-  // The Loop exit block may have single value PHI nodes where the incoming
-  // value is 'undef'. While vectorizing we only handled real values that
-  // were defined inside the loop. Here we handle the 'undef case'.
-  // See PR14725.
+  fixLCSSAPHIs();
+
+  // Remove redundant induction instructions.
+  cse(LoopVectorBody);
+}
+
+void InnerLoopVectorizer::fixLCSSAPHIs() {
   for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
        LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
     PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
-    if (!LCSSAPhi) continue;
+    if (!LCSSAPhi) break;
     if (LCSSAPhi->getNumIncomingValues() == 1)
       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
                             LoopMiddleBlock);
   }
-}
+} 
 
 InnerLoopVectorizer::VectorParts
 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
@@ -2200,161 +2409,185 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
   return BlockMask;
 }
 
-void
-InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
-                                          BasicBlock *BB, PhiVector *PV) {
-  // For each instruction in the old loop.
-  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-    VectorParts &Entry = WidenMap.get(it);
-    switch (it->getOpcode()) {
-    case Instruction::Br:
-      // Nothing to do for PHIs and BR, since we already took care of the
-      // loop control flow instructions.
-      continue;
-    case Instruction::PHI:{
-      PHINode* P = cast<PHINode>(it);
-      // Handle reduction variables:
-      if (Legal->getReductionVars()->count(P)) {
-        for (unsigned part = 0; part < UF; ++part) {
-          // This is phase one of vectorizing PHIs.
-          Type *VecTy = VectorType::get(it->getType(), VF);
-          Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
-                                        LoopVectorBody-> getFirstInsertionPt());
-        }
-        PV->push_back(P);
-        continue;
-      }
+void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
+                                              InnerLoopVectorizer::VectorParts &Entry,
+                                              LoopVectorizationLegality *Legal,
+                                              unsigned UF, unsigned VF, PhiVector *PV) {
+  PHINode* P = cast<PHINode>(PN);
+  // Handle reduction variables:
+  if (Legal->getReductionVars()->count(P)) {
+    for (unsigned part = 0; part < UF; ++part) {
+      // This is phase one of vectorizing PHIs.
+      Type *VecTy = (VF == 1) ? PN->getType() :
+      VectorType::get(PN->getType(), VF);
+      Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
+                                    LoopVectorBody-> getFirstInsertionPt());
+    }
+    PV->push_back(P);
+    return;
+  }
 
-      setDebugLocFromInst(Builder, P);
-      // Check for PHI nodes that are lowered to vector selects.
-      if (P->getParent() != OrigLoop->getHeader()) {
-        // We know that all PHIs in non header blocks are converted into
-        // selects, so we don't have to worry about the insertion order and we
-        // can just use the builder.
-        // At this point we generate the predication tree. There may be
-        // duplications since this is a simple recursive scan, but future
-        // optimizations will clean it up.
-
-        unsigned NumIncoming = P->getNumIncomingValues();
-
-        // Generate a sequence of selects of the form:
-        // SELECT(Mask3, In3,
-        //      SELECT(Mask2, In2,
-        //                   ( ...)))
-        for (unsigned In = 0; In < NumIncoming; In++) {
-          VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
-                                            P->getParent());
-          VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
-
-          for (unsigned part = 0; part < UF; ++part) {
-            // We might have single edge PHIs (blocks) - use an identity
-            // 'select' for the first PHI operand.
-            if (In == 0)
-              Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
-                                                 In0[part]);
-            else
-              // Select between the current value and the previous incoming edge
-              // based on the incoming mask.
-              Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
-                                                 Entry[part], "predphi");
-          }
-        }
-        continue;
+  setDebugLocFromInst(Builder, P);
+  // Check for PHI nodes that are lowered to vector selects.
+  if (P->getParent() != OrigLoop->getHeader()) {
+    // We know that all PHIs in non header blocks are converted into
+    // selects, so we don't have to worry about the insertion order and we
+    // can just use the builder.
+    // At this point we generate the predication tree. There may be
+    // duplications since this is a simple recursive scan, but future
+    // optimizations will clean it up.
+
+    unsigned NumIncoming = P->getNumIncomingValues();
+
+    // Generate a sequence of selects of the form:
+    // SELECT(Mask3, In3,
+    //      SELECT(Mask2, In2,
+    //                   ( ...)))
+    for (unsigned In = 0; In < NumIncoming; In++) {
+      VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
+                                        P->getParent());
+      VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
+
+      for (unsigned part = 0; part < UF; ++part) {
+        // We might have single edge PHIs (blocks) - use an identity
+        // 'select' for the first PHI operand.
+        if (In == 0)
+          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+                                             In0[part]);
+        else
+          // Select between the current value and the previous incoming edge
+          // based on the incoming mask.
+          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+                                             Entry[part], "predphi");
       }
+    }
+    return;
+  }
 
-      // This PHINode must be an induction variable.
-      // Make sure that we know about it.
-      assert(Legal->getInductionVars()->count(P) &&
-             "Not an induction variable");
-
-      LoopVectorizationLegality::InductionInfo II =
-        Legal->getInductionVars()->lookup(P);
-
-      switch (II.IK) {
-      case LoopVectorizationLegality::IK_NoInduction:
-        llvm_unreachable("Unknown induction");
-      case LoopVectorizationLegality::IK_IntInduction: {
-        assert(P->getType() == II.StartValue->getType() && "Types must match");
-        Type *PhiTy = P->getType();
-        Value *Broadcasted;
-        if (P == OldInduction) {
-          // Handle the canonical induction variable. We might have had to
-          // extend the type.
-          Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
-        } else {
-          // Handle other induction variables that are now based on the
-          // canonical one.
-          Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
-                                                   "normalized.idx");
-          NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
-          Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
-                                          "offset.idx");
-        }
-        Broadcasted = getBroadcastInstrs(Broadcasted);
-        // After broadcasting the induction variable we need to make the vector
-        // consecutive by adding 0, 1, 2, etc.
+  // This PHINode must be an induction variable.
+  // Make sure that we know about it.
+  assert(Legal->getInductionVars()->count(P) &&
+         "Not an induction variable");
+
+  LoopVectorizationLegality::InductionInfo II =
+  Legal->getInductionVars()->lookup(P);
+
+  switch (II.IK) {
+    case LoopVectorizationLegality::IK_NoInduction:
+      llvm_unreachable("Unknown induction");
+    case LoopVectorizationLegality::IK_IntInduction: {
+      assert(P->getType() == II.StartValue->getType() && "Types must match");
+      Type *PhiTy = P->getType();
+      Value *Broadcasted;
+      if (P == OldInduction) {
+        // Handle the canonical induction variable. We might have had to
+        // extend the type.
+        Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
+      } else {
+        // Handle other induction variables that are now based on the
+        // canonical one.
+        Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
+                                                 "normalized.idx");
+        NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
+        Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
+                                        "offset.idx");
+      }
+      Broadcasted = getBroadcastInstrs(Broadcasted);
+      // After broadcasting the induction variable we need to make the vector
+      // consecutive by adding 0, 1, 2, etc.
+      for (unsigned part = 0; part < UF; ++part)
+        Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
+      return;
+    }
+    case LoopVectorizationLegality::IK_ReverseIntInduction:
+    case LoopVectorizationLegality::IK_PtrInduction:
+    case LoopVectorizationLegality::IK_ReversePtrInduction:
+      // Handle reverse integer and pointer inductions.
+      Value *StartIdx = ExtendedIdx;
+      // This is the normalized GEP that starts counting at zero.
+      Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                               "normalized.idx");
+
+      // Handle the reverse integer induction variable case.
+      if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
+        IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
+        Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
+                                               "resize.norm.idx");
+        Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
+                                               "reverse.idx");
+
+        // This is a new value so do not hoist it out.
+        Value *Broadcasted = getBroadcastInstrs(ReverseInd);
+        // After broadcasting the induction variable we need to make the
+        // vector consecutive by adding  ... -3, -2, -1, 0.
         for (unsigned part = 0; part < UF; ++part)
-          Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
-        continue;
+          Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
+                                             true);
+        return;
       }
-      case LoopVectorizationLegality::IK_ReverseIntInduction:
-      case LoopVectorizationLegality::IK_PtrInduction:
-      case LoopVectorizationLegality::IK_ReversePtrInduction:
-        // Handle reverse integer and pointer inductions.
-        Value *StartIdx = ExtendedIdx;
-        // This is the normalized GEP that starts counting at zero.
-        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
-                                                 "normalized.idx");
 
-        // Handle the reverse integer induction variable case.
-        if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
-          IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
-          Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
-                                                 "resize.norm.idx");
-          Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
-                                                 "reverse.idx");
-
-          // This is a new value so do not hoist it out.
-          Value *Broadcasted = getBroadcastInstrs(ReverseInd);
-          // After broadcasting the induction variable we need to make the
-          // vector consecutive by adding  ... -3, -2, -1, 0.
-          for (unsigned part = 0; part < UF; ++part)
-            Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
-                                               true);
+      // Handle the pointer induction variable case.
+      assert(P->getType()->isPointerTy() && "Unexpected type.");
+
+      // Is this a reverse induction ptr or a consecutive induction ptr.
+      bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
+                      II.IK);
+
+      // This is the vector of results. Notice that we don't generate
+      // vector geps because scalar geps result in better code.
+      for (unsigned part = 0; part < UF; ++part) {
+        if (VF == 1) {
+          int EltIndex = (part) * (Reverse ? -1 : 1);
+          Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
+          Value *GlobalIdx;
+          if (Reverse)
+            GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
+          else
+            GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+
+          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+                                             "next.gep");
+          Entry[part] = SclrGep;
           continue;
         }
 
-        // Handle the pointer induction variable case.
-        assert(P->getType()->isPointerTy() && "Unexpected type.");
-
-        // Is this a reverse induction ptr or a consecutive induction ptr.
-        bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
-                        II.IK);
-
-        // This is the vector of results. Notice that we don't generate
-        // vector geps because scalar geps result in better code.
-        for (unsigned part = 0; part < UF; ++part) {
-          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
-          for (unsigned int i = 0; i < VF; ++i) {
-            int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
-            Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
-            Value *GlobalIdx;
-            if (!Reverse)
-              GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
-            else
-              GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
-
-            Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
-                                               "next.gep");
-            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
-                                                 Builder.getInt32(i),
-                                                 "insert.gep");
-          }
-          Entry[part] = VecVal;
+        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+        for (unsigned int i = 0; i < VF; ++i) {
+          int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
+          Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
+          Value *GlobalIdx;
+          if (!Reverse)
+            GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+          else
+            GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
+
+          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+                                             "next.gep");
+          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                               Builder.getInt32(i),
+                                               "insert.gep");
         }
-        continue;
+        Entry[part] = VecVal;
       }
+      return;
+  }
+}
 
+void
+InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
+                                          BasicBlock *BB, PhiVector *PV) {
+  // For each instruction in the old loop.
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    VectorParts &Entry = WidenMap.get(it);
+    switch (it->getOpcode()) {
+    case Instruction::Br:
+      // Nothing to do for PHIs and BR, since we already took care of the
+      // loop control flow instructions.
+      continue;
+    case Instruction::PHI:{
+      // Vectorize PHINodes.
+      widenPHIInstruction(it, Entry, Legal, UF, VF, PV);
+      continue;
     }// End of PHI.
 
     case Instruction::Add:
@@ -2413,8 +2646,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       VectorParts &Cond = getVectorValue(it->getOperand(0));
       VectorParts &Op0  = getVectorValue(it->getOperand(1));
       VectorParts &Op1  = getVectorValue(it->getOperand(2));
-      Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
-                                                       Builder.getInt32(0));
+
+      Value *ScalarCond = (VF == 1) ? Cond[0] :
+        Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
+
       for (unsigned Part = 0; Part < UF; ++Part) {
         Entry[Part] = Builder.CreateSelect(
           InvariantCond ? ScalarCond : Cond[Part],
@@ -2475,7 +2710,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         break;
       }
       /// Vectorize casts.
-      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+      Type *DestTy = (VF == 1) ? CI->getType() :
+                                 VectorType::get(CI->getType(), VF);
 
       VectorParts &A = getVectorValue(it->getOperand(0));
       for (unsigned Part = 0; Part < UF; ++Part)
@@ -2505,7 +2741,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
             VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
             Args.push_back(Arg[Part]);
           }
-          Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
+          Type *Tys[] = {CI->getType()};
+          if (VF > 1)
+            Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+
           Function *F = Intrinsic::getDeclaration(M, ID, Tys);
           Entry[Part] = Builder.CreateCall(F, Args);
         }
@@ -2542,19 +2781,36 @@ void InnerLoopVectorizer::updateAnalysis() {
   DEBUG(DT->verifyAnalysis());
 }
 
+/// \brief Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to if
+/// convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    PHINode *Phi = dyn_cast<PHINode>(I);
+    if (!Phi)
+      return true;
+    for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
+      if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
+        if (C->canTrap())
+          return false;
+  }
+  return true;
+}
+
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   if (!EnableIfConversion)
     return false;
 
   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
-  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
 
   // A list of pointers that we can safely read and write to.
   SmallPtrSet<Value *, 8> SafePointes;
 
   // Collect safe addresses.
-  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
-    BasicBlock *BB = LoopBlocks[i];
+  for (Loop::block_iterator BI = TheLoop->block_begin(),
+         BE = TheLoop->block_end(); BI != BE; ++BI) {
+    BasicBlock *BB = *BI;
 
     if (blockNeedsPredication(BB))
       continue;
@@ -2568,16 +2824,22 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   }
 
   // Collect the blocks that need predication.
-  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
-    BasicBlock *BB = LoopBlocks[i];
+  BasicBlock *Header = TheLoop->getHeader();
+  for (Loop::block_iterator BI = TheLoop->block_begin(),
+         BE = TheLoop->block_end(); BI != BE; ++BI) {
+    BasicBlock *BB = *BI;
 
     // We don't support switch statements inside loops.
     if (!isa<BranchInst>(BB->getTerminator()))
       return false;
 
     // We must be able to predicate all blocks that need to be predicated.
-    if (blockNeedsPredication(BB) && !blockCanBePredicated(BB, SafePointes))
+    if (blockNeedsPredication(BB)) {
+      if (!blockCanBePredicated(BB, SafePointes))
+        return false;
+    } else if (BB != Header && !canIfConvertPHINodes(BB))
       return false;
+
   }
 
   // We can if-convert this loop.
@@ -2602,19 +2864,17 @@ bool LoopVectorizationLegality::canVectorize() {
   if (!TheLoop->getExitingBlock())
     return false;
 
-  unsigned NumBlocks = TheLoop->getNumBlocks();
+  // We need to have a loop header.
+  DEBUG(dbgs() << "LV: Found a loop: " <<
+        TheLoop->getHeader()->getName() << '\n');
 
   // Check if we can if-convert non single-bb loops.
+  unsigned NumBlocks = TheLoop->getNumBlocks();
   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
     return false;
   }
 
-  // We need to have a loop header.
-  BasicBlock *Latch = TheLoop->getLoopLatch();
-  DEBUG(dbgs() << "LV: Found a loop: " <<
-        TheLoop->getHeader()->getName() << "\n");
-
   // ScalarEvolution needs to be able to find the exit count.
   const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
   if (ExitCount == SE->getCouldNotCompute()) {
@@ -2623,6 +2883,7 @@ bool LoopVectorizationLegality::canVectorize() {
   }
 
   // Do not loop-vectorize loops with a tiny trip count.
+  BasicBlock *Latch = TheLoop->getLoopLatch();
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
   if (TC > 0u && TC < TinyTripCountVectorThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
@@ -2657,7 +2918,13 @@ bool LoopVectorizationLegality::canVectorize() {
 
 static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
   if (Ty->isPointerTy())
-    return DL.getIntPtrType(Ty->getContext());
+    return DL.getIntPtrType(Ty);
+
+  // It is possible that char's or short's overflow when we ask for the loop's
+  // trip count, work around this by changing the type size.
+  if (Ty->getScalarSizeInBits() < 32)
+    return Type::getInt32Ty(Ty->getContext());
+
   return Ty;
 }
 
@@ -2682,7 +2949,7 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
       Instruction *U = cast<Instruction>(*I);
       // This user may be a reduction exit value.
       if (!TheLoop->contains(U)) {
-        DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+        DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n');
         return true;
       }
     }
@@ -2758,6 +3025,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
           DEBUG(dbgs() << "LV: Found an induction variable.\n");
           Inductions[Phi] = InductionInfo(StartValue, IK);
+
+          // Until we explicitly handle the case of an induction variable with
+          // an outside loop user we have to give up vectorizing this loop.
+          if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+            return false;
+
           continue;
         }
 
@@ -2812,9 +3085,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       }
 
       // Check that the instruction return type is vectorizable.
-      if (!VectorType::isValidElementType(it->getType()) &&
-          !it->getType()->isVoidTy()) {
-        DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+      // Also, we can't vectorize extractelement instructions.
+      if ((!VectorType::isValidElementType(it->getType()) &&
+           !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
+        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
         return false;
       }
 
@@ -2904,7 +3178,7 @@ public:
   /// non-intersection.
   bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
                        unsigned &NumComparisons, ScalarEvolution *SE,
-                       Loop *TheLoop);
+                       Loop *TheLoop, bool ShouldCheckStride = false);
 
   /// \brief Goes over all memory accesses, checks whether a RT check is needed
   /// and builds sets of dependent accesses.
@@ -2918,6 +3192,7 @@ public:
   bool isRTCheckNeeded() { return IsRTCheckNeeded; }
 
   bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+  void resetDepChecks() { CheckDeps.clear(); }
 
   MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
 
@@ -2972,10 +3247,15 @@ static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) {
   return AR->isAffine();
 }
 
+/// \brief Check the stride of the pointer and ensure that it does not wrap in
+/// the address space.
+static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+                        const Loop *Lp);
+
 bool AccessAnalysis::canCheckPtrAtRT(
                        LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
                         unsigned &NumComparisons, ScalarEvolution *SE,
-                        Loop *TheLoop) {
+                        Loop *TheLoop, bool ShouldCheckStride) {
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   unsigned NumReadPtrChecks = 0;
@@ -3003,7 +3283,10 @@ bool AccessAnalysis::canCheckPtrAtRT(
     else
       ++NumReadPtrChecks;
 
-    if (hasComputableBounds(SE, Ptr)) {
+    if (hasComputableBounds(SE, Ptr) &&
+        // When we run after a failing dependency check we have to make sure we
+        // don't have wrapping pointers.
+        (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) {
       // The id of the dependence set.
       unsigned DepId;
 
@@ -3019,7 +3302,7 @@ bool AccessAnalysis::canCheckPtrAtRT(
 
       RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
 
-      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n");
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
     } else {
       CanDoRT = false;
     }
@@ -3027,9 +3310,36 @@ bool AccessAnalysis::canCheckPtrAtRT(
 
   if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
     NumComparisons = 0; // Only one dependence set.
-  else
+  else {
     NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
                                            NumWritePtrChecks - 1));
+  }
+
+  // If the pointers that we would use for the bounds comparison have different
+  // address spaces, assume the values aren't directly comparable, so we can't
+  // use them for the runtime check. We also have to assume they could
+  // overlap. In the future there should be metadata for whether address spaces
+  // are disjoint.
+  unsigned NumPointers = RtCheck.Pointers.size();
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i + 1; j < NumPointers; ++j) {
+      // Only need to check pointers between two different dependency sets.
+      if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
+       continue;
+
+      Value *PtrI = RtCheck.Pointers[i];
+      Value *PtrJ = RtCheck.Pointers[j];
+
+      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
+      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
+      if (ASi != ASj) {
+        DEBUG(dbgs() << "LV: Runtime check would require comparison between"
+                       " different address spaces\n");
+        return false;
+      }
+    }
+  }
+
   return CanDoRT;
 }
 
@@ -3084,7 +3394,7 @@ void AccessAnalysis::processMemAccesses(bool UseDeferred) {
                         !isa<Argument>(UnderlyingObj)) &&
            !isIdentifiedObject(UnderlyingObj))) {
         DEBUG(dbgs() << "LV: Found an unidentified " <<
-              (IsWrite ?  "write" : "read" ) << " ptr:" << *UnderlyingObj <<
+              (IsWrite ?  "write" : "read" ) << " ptr: " << *UnderlyingObj <<
               "\n");
         IsRTCheckNeeded = (IsRTCheckNeeded ||
                            !isIdentifiedObject(UnderlyingObj) ||
@@ -3158,8 +3468,9 @@ public:
   typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
   typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
 
-  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) :
-    SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {}
+  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L)
+      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
+        ShouldRetryWithRuntimeCheck(false) {}
 
   /// \brief Register the location (instructions are given increasing numbers)
   /// of a write access.
@@ -3189,6 +3500,10 @@ public:
   /// the accesses safely with.
   unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
 
+  /// \brief In same cases when the dependency check fails we can still
+  /// vectorize the loop with a dynamic array access check.
+  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
+
 private:
   ScalarEvolution *SE;
   DataLayout *DL;
@@ -3206,6 +3521,10 @@ private:
   // We can access this many bytes in parallel safely.
   unsigned MaxSafeDepDistBytes;
 
+  /// \brief If we see a non constant dependence distance we can still try to
+  /// vectorize this loop with runtime checks.
+  bool ShouldRetryWithRuntimeCheck;
+
   /// \brief Check whether there is a plausible dependence between the two
   /// accesses.
   ///
@@ -3403,6 +3722,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
   if (!C) {
     DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
+    ShouldRetryWithRuntimeCheck = true;
     return true;
   }
 
@@ -3428,7 +3748,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   if (Val == 0) {
     if (ATy == BTy)
       return false;
-    DEBUG(dbgs() << "LV: Zero dependence difference but different types");
+    DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
     return true;
   }
 
@@ -3437,7 +3757,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   // Positive distance bigger than max vectorization factor.
   if (ATy != BTy) {
     DEBUG(dbgs() <<
-          "LV: ReadWrite-Write positive dependency with different types");
+          "LV: ReadWrite-Write positive dependency with different types\n");
     return false;
   }
 
@@ -3454,7 +3774,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
       2*TypeByteSize > MaxSafeDepDistBytes ||
       Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
     DEBUG(dbgs() << "LV: Failure because of Positive distance "
-        << Val.getSExtValue() << "\n");
+        << Val.getSExtValue() << '\n');
     return true;
   }
 
@@ -3467,7 +3787,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
      return true;
 
   DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
-        " with max VF=" << MaxSafeDepDistBytes/TypeByteSize << "\n");
+        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
 
   return false;
 }
@@ -3571,8 +3891,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
         Stores.push_back(St);
         DepChecker.addAccess(St);
       }
-    } // next instr.
-  } // next block.
+    } // Next instr.
+  } // Next block.
 
   // Now we have two lists that hold the loads and the stores.
   // Next, we find the pointers that they use.
@@ -3619,7 +3939,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return true;
   }
 
-  SmallPtrSet<Value *, 16> ReadOnlyPtr;
   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
     LoadInst *LD = cast<LoadInst>(*I);
     Value* Ptr = LD->getPointerOperand();
@@ -3667,7 +3986,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   if (NumComparisons == 0 && NeedRTCheck)
     NeedRTCheck = false;
 
-  // Check that we did not collect too many pointers or found a unsizeable
+  // Check that we did not collect too many pointers or found an unsizeable
   // pointer.
   if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
     PtrRtCheck.reset();
@@ -3693,9 +4012,32 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
                                        Accesses.getDependenciesToCheck());
     MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
+
+    if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
+      DEBUG(dbgs() << "LV: Retrying with memory checks\n");
+      NeedRTCheck = true;
+
+      // Clear the dependency checks. We assume they are not needed.
+      Accesses.resetDepChecks();
+
+      PtrRtCheck.reset();
+      PtrRtCheck.Need = true;
+
+      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
+                                         TheLoop, true);
+      // Check that we did not collect too many pointers or found an unsizeable
+      // pointer.
+      if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+        DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
+        PtrRtCheck.reset();
+        return false;
+      }
+
+      CanVecMem = true;
+    }
   }
 
-  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
+  DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
         " need a runtime memory check.\n");
 
   return CanVecMem;
@@ -3839,6 +4181,12 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
         if (ExitInstruction != 0 || Cur == Phi)
           return false;
 
+        // The instruction used by an outside user must be the last instruction
+        // before we feed back to the reduction phi. Otherwise, we loose VF-1
+        // operations on the value.
+        if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
+         return false;
+
         ExitInstruction = Cur;
         continue;
       }
@@ -4045,6 +4393,14 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
     if (it->mayWriteToMemory() || it->mayThrow())
       return false;
 
+    // Check that we don't have a constant expression that can trap as operand.
+    for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
+         OI != OE; ++OI) {
+      if (Constant *C = dyn_cast<Constant>(*OI))
+        if (C->canTrap())
+          return false;
+    }
+
     // The instructions below can trap.
     switch (it->getOpcode()) {
     default: continue;
@@ -4071,7 +4427,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
 
   // Find the trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
-  DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
+  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
   unsigned WidestType = getWidestType();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
@@ -4082,7 +4438,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
                     WidestRegister : MaxSafeDepDist);
   unsigned MaxVectorSize = WidestRegister / WidestType;
   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
-  DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
+  DEBUG(dbgs() << "LV: The Widest register is: "
+          << WidestRegister << " bits.\n");
 
   if (MaxVectorSize == 0) {
     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
@@ -4118,7 +4475,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
 
   if (UserVF != 0) {
     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
-    DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
+    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
 
     Factor.Width = UserVF;
     return Factor;
@@ -4126,13 +4483,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
 
   float Cost = expectedCost(1);
   unsigned Width = 1;
-  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
   for (unsigned i=2; i <= VF; i*=2) {
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
     float VectorCost = expectedCost(i) / (float)i;
-    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+    DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
           (int)VectorCost << ".\n");
     if (VectorCost < Cost) {
       Cost = VectorCost;
@@ -4256,8 +4613,20 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   else if (UF < 1)
     UF = 1;
 
-  if (Legal->getReductionVars()->size()) {
-    DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
+  bool HasReductions = Legal->getReductionVars()->size();
+
+  // Decide if we want to unroll if we decided that it is legal to vectorize
+  // but not profitable.
+  if (VF == 1) {
+    if (TheLoop->getNumBlocks() > 1 || !HasReductions ||
+        LoopCost > SmallLoopCost)
+      return 1;
+
+    return UF;
+  }
+
+  if (HasReductions) {
+    DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
     return UF;
   }
 
@@ -4265,14 +4634,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   // We assume that the cost overhead is 1 and we use the cost model
   // to estimate the cost of the loop and unroll until the cost of the
   // loop overhead is about 5% of the cost of the loop.
-  DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
-  if (LoopCost < 20) {
-    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
-    unsigned NewUF = 20/LoopCost + 1;
+  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+  if (LoopCost < SmallLoopCost) {
+    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
+    unsigned NewUF = SmallLoopCost / (LoopCost + 1);
     return std::min(NewUF, UF);
   }
 
-  DEBUG(dbgs() << "LV: Not Unrolling. \n");
+  DEBUG(dbgs() << "LV: Not Unrolling.\n");
   return 1;
 }
 
@@ -4373,16 +4742,16 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
     MaxUsage = std::max(MaxUsage, OpenIntervals.size());
 
     DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
-          OpenIntervals.size() <<"\n");
+          OpenIntervals.size() << '\n');
 
     // Add the current instruction to the list of open intervals.
     OpenIntervals.insert(I);
   }
 
   unsigned Invariant = LoopInvariants.size();
-  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
-  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
-  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
+  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
+  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
 
   R.LoopInvariantRegs = Invariant;
   R.MaxLocalUsers = MaxUsage;
@@ -4406,8 +4775,8 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
 
       unsigned C = getInstructionCost(it, VF);
       BlockCost += C;
-      DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
-            VF << " For instruction: "<< *it << "\n");
+      DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
+            VF << " For instruction: " << *it << '\n');
     }
 
     // We assume that if-converted blocks have a 50% chance of being executed.
@@ -4669,13 +5038,16 @@ char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
 
 namespace llvm {
-  Pass *createLoopVectorizePass() {
-    return new LoopVectorize();
+  Pass *createLoopVectorizePass(bool NoUnrolling) {
+    return new LoopVectorize(NoUnrolling);
   }
 }
 
@@ -4690,3 +5062,96 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
 
   return false;
 }
+
+
+void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
+  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  // Holds vector parameters or scalars, in case of uniform vals.
+  SmallVector<VectorParts, 4> Params;
+
+  setDebugLocFromInst(Builder, Instr);
+
+  // Find all of the vectorized parameters.
+  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+    Value *SrcOp = Instr->getOperand(op);
+
+    // If we are accessing the old induction variable, use the new one.
+    if (SrcOp == OldInduction) {
+      Params.push_back(getVectorValue(SrcOp));
+      continue;
+    }
+
+    // Try using previously calculated values.
+    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+
+    // If the src is an instruction that appeared earlier in the basic block
+    // then it should already be vectorized.
+    if (SrcInst && OrigLoop->contains(SrcInst)) {
+      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
+      // The parameter is a vector value from earlier.
+      Params.push_back(WidenMap.get(SrcInst));
+    } else {
+      // The parameter is a scalar from outside the loop. Maybe even a constant.
+      VectorParts Scalars;
+      Scalars.append(UF, SrcOp);
+      Params.push_back(Scalars);
+    }
+  }
+
+  assert(Params.size() == Instr->getNumOperands() &&
+         "Invalid number of operands");
+
+  // Does this instruction return a value ?
+  bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+  Value *UndefVec = IsVoidRetTy ? 0 :
+  UndefValue::get(Instr->getType());
+  // Create a new entry in the WidenMap and initialize it to Undef or Null.
+  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+
+  // For each vector unroll 'part':
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    // For each scalar that we create:
+
+    Instruction *Cloned = Instr->clone();
+      if (!IsVoidRetTy)
+        Cloned->setName(Instr->getName() + ".cloned");
+      // Replace the operands of the cloned instructions with extracted scalars.
+      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+        Value *Op = Params[op][Part];
+        Cloned->setOperand(op, Op);
+      }
+
+      // Place the cloned scalar in the new loop.
+      Builder.Insert(Cloned);
+
+      // If the original scalar returns a value we need to place it in a vector
+      // so that future users will be able to use it.
+      if (!IsVoidRetTy)
+        VecResults[Part] = Cloned;
+  }
+}
+
+void
+InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr,
+                                              LoopVectorizationLegality*) {
+  return scalarizeInstruction(Instr);
+}
+
+Value *InnerLoopUnroller::reverseVector(Value *Vec) {
+  return Vec;
+}
+
+Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
+  return V;
+}
+
+Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
+                                               bool Negate) {
+  // When unrolling and the VF is 1, we only need to add a simple scalar.
+  Type *ITy = Val->getType();
+  assert(!ITy->isVectorTy() && "Val must be a scalar");
+  Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
+  return Builder.CreateAdd(Val, C, "induction");
+}
+
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9312b4b..c72b51f 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -25,8 +25,8 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/DataLayout.h"
@@ -49,34 +49,24 @@ static cl::opt<int>
     SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                      cl::desc("Only vectorize if you gain more than this "
                               "number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
+                   cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool> ShouldStartVectorizeHorAtStore(
+    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Attempt to vectorize horizontal reductions feeding into a store"));
+
 namespace {
 
 static const unsigned MinVecRegSize = 128;
 
 static const unsigned RecursionMaxDepth = 12;
 
-/// RAII pattern to save the insertion point of the IR builder.
-class BuilderLocGuard {
-public:
-  BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()),
-  DbgLoc(B.getCurrentDebugLocation()) {}
-  ~BuilderLocGuard() {
-    Builder.SetCurrentDebugLocation(DbgLoc);
-    if (Loc)
-      Builder.SetInsertPoint(Loc);
-  }
-
-private:
-  // Prevent copying.
-  BuilderLocGuard(const BuilderLocGuard &);
-  BuilderLocGuard &operator=(const BuilderLocGuard &);
-  IRBuilder<> &Builder;
-  AssertingVH<Instruction> Loc;
-  DebugLoc DbgLoc;
-};
-
-/// A helper class for numbering instructions in multible blocks.
-/// Numbers starts at zero for each basic block.
+/// A helper class for numbering instructions in multiple blocks.
+/// Numbers start at zero for each basic block.
 struct BlockNumbering {
 
   BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {}
@@ -173,6 +163,37 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
   return Opcode;
 }
 
+/// \returns \p I after propagating metadata from \p VL.
+static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
+  Instruction *I0 = cast<Instruction>(VL[0]);
+  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+  I0->getAllMetadataOtherThanDebugLoc(Metadata);
+
+  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
+    unsigned Kind = Metadata[i].first;
+    MDNode *MD = Metadata[i].second;
+
+    for (int i = 1, e = VL.size(); MD && i != e; i++) {
+      Instruction *I = cast<Instruction>(VL[i]);
+      MDNode *IMD = I->getMetadata(Kind);
+
+      switch (Kind) {
+      default:
+        MD = 0; // Remove unknown metadata
+        break;
+      case LLVMContext::MD_tbaa:
+        MD = MDNode::getMostGenericTBAA(MD, IMD);
+        break;
+      case LLVMContext::MD_fpmath:
+        MD = MDNode::getMostGenericFPMath(MD, IMD);
+        break;
+      }
+    }
+    I->setMetadata(Kind, MD);
+  }
+  return I;
+}
+
 /// \returns The type that all of the values in \p VL have or null if there
 /// are different types.
 static Type* getSameType(ArrayRef<Value *> VL) {
@@ -216,6 +237,104 @@ static bool CanReuseExtract(ArrayRef<Value *> VL) {
   return true;
 }
 
+static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+                                           SmallVectorImpl<Value *> &Left,
+                                           SmallVectorImpl<Value *> &Right) {
+
+  SmallVector<Value *, 16> OrigLeft, OrigRight;
+
+  bool AllSameOpcodeLeft = true;
+  bool AllSameOpcodeRight = true;
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    Instruction *I = cast<Instruction>(VL[i]);
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+
+    OrigLeft.push_back(V0);
+    OrigRight.push_back(V1);
+
+    Instruction *I0 = dyn_cast<Instruction>(V0);
+    Instruction *I1 = dyn_cast<Instruction>(V1);
+
+    // Check whether all operands on one side have the same opcode. In this case
+    // we want to preserve the original order and not make things worse by
+    // reordering.
+    AllSameOpcodeLeft = I0;
+    AllSameOpcodeRight = I1;
+
+    if (i && AllSameOpcodeLeft) {
+      if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
+        if(P0->getOpcode() != I0->getOpcode())
+          AllSameOpcodeLeft = false;
+      } else
+        AllSameOpcodeLeft = false;
+    }
+    if (i && AllSameOpcodeRight) {
+      if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
+        if(P1->getOpcode() != I1->getOpcode())
+          AllSameOpcodeRight = false;
+      } else
+        AllSameOpcodeRight = false;
+    }
+
+    // Sort two opcodes. In the code below we try to preserve the ability to use
+    // broadcast of values instead of individual inserts.
+    // vl1 = load
+    // vl2 = phi
+    // vr1 = load
+    // vr2 = vr2
+    //    = vl1 x vr1
+    //    = vl2 x vr2
+    // If we just sorted according to opcode we would leave the first line in
+    // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
+    //    = vl1 x vr1
+    //    = vr2 x vl2
+    // Because vr2 and vr1 are from the same load we loose the opportunity of a
+    // broadcast for the packed right side in the backend: we have [vr1, vl2]
+    // instead of [vr1, vr2=vr1].
+    if (I0 && I1) {
+       if(!i && I0->getOpcode() > I1->getOpcode()) {
+         Left.push_back(I1);
+         Right.push_back(I0);
+       } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
+         // Try not to destroy a broad cast for no apparent benefit.
+         Left.push_back(I1);
+         Right.push_back(I0);
+       } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] ==  I0) {
+         // Try preserve broadcasts.
+         Left.push_back(I1);
+         Right.push_back(I0);
+       } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
+         // Try preserve broadcasts.
+         Left.push_back(I1);
+         Right.push_back(I0);
+       } else {
+         Left.push_back(I0);
+         Right.push_back(I1);
+       }
+       continue;
+    }
+    // One opcode, put the instruction on the right.
+    if (I0) {
+      Left.push_back(V1);
+      Right.push_back(I0);
+      continue;
+    }
+    Left.push_back(V0);
+    Right.push_back(V1);
+  }
+
+  bool LeftBroadcast = isSplat(Left);
+  bool RightBroadcast = isSplat(Right);
+
+  // Don't reorder if the operands where good to begin with.
+  if (!(LeftBroadcast || RightBroadcast) &&
+      (AllSameOpcodeRight || AllSameOpcodeLeft)) {
+    Left = OrigLeft;
+    Right = OrigRight;
+  }
+}
+
 /// Bottom Up SLP Vectorizer.
 class BoUpSLP {
 public:
@@ -238,17 +357,20 @@ public:
     }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
-  void vectorizeTree();
+  /// Returns the vectorized root.
+  Value *vectorizeTree();
 
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
   int getTreeCost();
 
-  /// Construct a vectorizable tree that starts at \p Roots.
-  void buildTree(ArrayRef<Value *> Roots);
+  /// Construct a vectorizable tree that starts at \p Roots and is possibly
+  /// used by a reduction of \p RdxOps.
+  void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);
 
   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
+    RdxOps = 0;
     VectorizableTree.clear();
     ScalarToTreeEntry.clear();
     MustGather.clear();
@@ -278,7 +400,7 @@ private:
 
   /// \returns the pointer to the vectorized value if \p VL is already
   /// vectorized, or NULL. They may happen in cycles.
-  Value *alreadyVectorized(ArrayRef<Value *> VL);
+  Value *alreadyVectorized(ArrayRef<Value *> VL) const;
 
   /// \brief Take the pointer operand from the Load/Store instruction.
   /// \returns NULL if this is not a valid Load/Store instruction.
@@ -305,26 +427,31 @@ private:
   /// \returns the pointer to the barrier instruction if we can't sink.
   Value *getSinkBarrier(Instruction *Src, Instruction *Dst);
 
-  /// \returns the index of the last instrucion in the BB from \p VL.
+  /// \returns the index of the last instruction in the BB from \p VL.
   int getLastIndex(ArrayRef<Value *> VL);
 
-  /// \returns the Instrucion in the bundle \p VL.
+  /// \returns the Instruction in the bundle \p VL.
   Instruction *getLastInstruction(ArrayRef<Value *> VL);
 
+  /// \brief Set the Builder insert point to one after the last instruction in
+  /// the bundle
+  void setInsertPointAfterBundle(ArrayRef<Value *> VL);
+
   /// \returns a vector from a collection of scalars in \p VL.
   Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
 
+  /// \returns whether the VectorizableTree is fully vectoriable and will
+  /// be beneficial even the tree height is tiny.
+  bool isFullyVectorizableTinyTree(); 
+
   struct TreeEntry {
     TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0),
     NeedToGather(0) {}
 
     /// \returns true if the scalars in VL are equal to this entry.
-    bool isSame(ArrayRef<Value *> VL) {
+    bool isSame(ArrayRef<Value *> VL) const {
       assert(VL.size() == Scalars.size() && "Invalid size");
-      for (int i = 0, e = VL.size(); i != e; ++i)
-        if (VL[i] != Scalars[i])
-          return false;
-      return true;
+      return std::equal(VL.begin(), VL.end(), Scalars.begin());
     }
 
     /// A vector of scalars.
@@ -393,10 +520,15 @@ private:
 
   /// Holds all of the instructions that we gathered.
   SetVector<Instruction *> GatherSeq;
+  /// A list of blocks that we are going to CSE.
+  SmallSet<BasicBlock *, 8> CSEBlocks;
 
   /// Numbers instructions in different blocks.
   DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
 
+  /// Reduction operators.
+  ValueSet *RdxOps;
+
   // Analysis and block reference.
   Function *F;
   ScalarEvolution *SE;
@@ -409,8 +541,9 @@ private:
   IRBuilder<> Builder;
 };
 
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
   deleteTree();
+  RdxOps = Rdx;
   if (!getSameType(Roots))
     return;
   buildTree_rec(Roots, 0);
@@ -431,18 +564,20 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
            UE = Scalar->use_end(); User != UE; ++User) {
         DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n");
 
-        bool Gathered = MustGather.count(*User);
-
         // Skip in-tree scalars that become vectors.
-        if (ScalarToTreeEntry.count(*User) && !Gathered) {
+        if (ScalarToTreeEntry.count(*User)) {
           DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
                 **User << ".\n");
           int Idx = ScalarToTreeEntry[*User]; (void) Idx;
           assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
           continue;
         }
+        Instruction *UserInst = dyn_cast<Instruction>(*User);
+        if (!UserInst)
+          continue;
 
-        if (!isa<Instruction>(*User))
+        // Ignore uses that are part of the reduction.
+        if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
           continue;
 
         DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " <<
@@ -574,6 +709,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         continue;
       }
 
+      // This user is part of the reduction.
+      if (RdxOps && RdxOps->count(User))
+        continue;
+
       // Make sure that we can schedule this unknown user.
       BlockNumbering &BN = BlocksNumbers[BB];
       int UserIndex = BN.getIndex(User);
@@ -635,6 +774,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   switch (Opcode) {
     case Instruction::PHI: {
       PHINode *PH = dyn_cast<PHINode>(VL0);
+
+      // Check for terminator values (e.g. invoke).
+      for (unsigned j = 0; j < VL.size(); ++j)
+        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+          TerminatorInst *Term = dyn_cast<TerminatorInst>(cast<PHINode>(VL[j])->getIncomingValue(i));
+          if (Term) {
+            DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+            newTreeEntry(VL, false);
+            return;
+          }
+        }
+
       newTreeEntry(VL, true);
       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
 
@@ -658,13 +809,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     }
     case Instruction::Load: {
       // Check if the loads are consecutive or of we need to swizzle them.
-      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
-        if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
+        LoadInst *L = cast<LoadInst>(VL[i]);
+        if (!L->isSimple() || !isConsecutiveAccess(VL[i], VL[i + 1])) {
           newTreeEntry(VL, false);
           DEBUG(dbgs() << "SLP: Need to swizzle loads.\n");
           return;
         }
-
+      }
       newTreeEntry(VL, true);
       DEBUG(dbgs() << "SLP: added a vector of loads.\n");
       return;
@@ -753,6 +905,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       newTreeEntry(VL, true);
       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
 
+      // Sort operands of the instructions so that each side is more likely to
+      // have the same opcode.
+      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
+        ValueList Left, Right;
+        reorderInputsAccordingToOpcode(VL, Left, Right);
+        buildTree_rec(Left, Depth + 1);
+        buildTree_rec(Right, Depth + 1);
+        return;
+      }
+
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
@@ -874,9 +1036,24 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
         TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
         VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
       } else {
-        ScalarCost = VecTy->getNumElements() *
-        TTI->getArithmeticInstrCost(Opcode, ScalarTy);
-        VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+        // Certain instructions can be cheaper to vectorize if they have a
+        // constant second vector operand.
+        TargetTransformInfo::OperandValueKind Op1VK =
+            TargetTransformInfo::OK_AnyValue;
+        TargetTransformInfo::OperandValueKind Op2VK =
+            TargetTransformInfo::OK_UniformConstantValue;
+
+        // Check whether all second operands are constant.
+        for (unsigned i = 0; i < VL.size(); ++i)
+          if (!isa<ConstantInt>(cast<Instruction>(VL[i])->getOperand(1))) {
+            Op2VK = TargetTransformInfo::OK_AnyValue;
+            break;
+          }
+
+        ScalarCost =
+            VecTy->getNumElements() *
+            TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK);
+        VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK);
       }
       return VecCost - ScalarCost;
     }
@@ -884,14 +1061,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       // Cost of wide load - cost of scalar loads.
       int ScalarLdCost = VecTy->getNumElements() *
       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
       return VecLdCost - ScalarLdCost;
     }
     case Instruction::Store: {
       // We know that we can merge the stores. Calculate the cost.
       int ScalarStCost = VecTy->getNumElements() *
       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
-      int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+      int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
       return VecStCost - ScalarStCost;
     }
     default:
@@ -899,19 +1076,32 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
   }
 }
 
+bool BoUpSLP::isFullyVectorizableTinyTree() {
+  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
+        VectorizableTree.size() << " is fully vectorizable .\n");
+
+  // We only handle trees of height 2.
+  if (VectorizableTree.size() != 2)
+    return false;
+
+  // Gathering cost would be too much for tiny trees.
+  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) 
+    return false; 
+
+  return true; 
+}
+
 int BoUpSLP::getTreeCost() {
   int Cost = 0;
   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
         VectorizableTree.size() << ".\n");
 
-  // Don't vectorize tiny trees. Small load/store chains or consecutive stores
-  // of constants will be vectoried in SelectionDAG in MergeConsecutiveStores.
-  // The SelectionDAG vectorizer can only handle pairs (trees of height = 2).
-  if (VectorizableTree.size() < 3) {
+  // We only vectorize tiny trees if it is fully vectorizable.
+  if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
     if (!VectorizableTree.size()) {
       assert(!ExternalUses.size() && "We should not have any external users");
     }
-    return 0;
+    return INT_MAX;
   }
 
   unsigned BundleWidth = VectorizableTree[0].Scalars.size();
@@ -992,63 +1182,29 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
   if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
     return false;
 
-  // Calculate a constant offset from the base pointer without using SCEV
-  // in the supported cases.
-  // TODO: Add support for the case where one of the pointers is a GEP that
-  // uses the other pointer.
-  GetElementPtrInst *GepA = dyn_cast<GetElementPtrInst>(PtrA);
-  GetElementPtrInst *GepB = dyn_cast<GetElementPtrInst>(PtrB);
-
-  unsigned BW = DL->getPointerSizeInBits(ASA);
+  unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
   Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
-  int64_t Sz = DL->getTypeStoreSize(Ty);
+  APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
 
-  // Check if PtrA is the base and PtrB is a constant offset.
-  if (GepB && GepB->getPointerOperand() == PtrA) {
-    APInt Offset(BW, 0);
-    if (GepB->accumulateConstantOffset(*DL, Offset))
-      return Offset.getSExtValue() == Sz;
-    return false;
-  }
+  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
+  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
 
-  // Check if PtrB is the base and PtrA is a constant offset.
-  if (GepA && GepA->getPointerOperand() == PtrB) {
-    APInt Offset(BW, 0);
-    if (GepA->accumulateConstantOffset(*DL, Offset))
-      return Offset.getSExtValue() == -Sz;
-    return false;
-  }
+  APInt OffsetDelta = OffsetB - OffsetA;
 
-  // If both pointers are GEPs:
-  if (GepA && GepB) {
-    // Check that they have the same base pointer and number of indices.
-    if (GepA->getPointerOperand() != GepB->getPointerOperand() ||
-        GepA->getNumIndices() != GepB->getNumIndices())
-      return false;
+  // Check if they are based on the same pointer. That makes the offsets
+  // sufficient.
+  if (PtrA == PtrB)
+    return OffsetDelta == Size;
 
-    // Try to strip the geps. This makes SCEV faster.
-    // Make sure that all of the indices except for the last are identical.
-    int LastIdx = GepA->getNumIndices();
-    for (int i = 0; i < LastIdx - 1; i++) {
-      if (GepA->getOperand(i+1) != GepB->getOperand(i+1))
-          return false;
-    }
-
-    PtrA = GepA->getOperand(LastIdx);
-    PtrB = GepB->getOperand(LastIdx);
-    Sz = 1;
-  }
-
-  ConstantInt *CA = dyn_cast<ConstantInt>(PtrA);
-  ConstantInt *CB = dyn_cast<ConstantInt>(PtrB);
-  if (CA && CB) {
-    return (CA->getSExtValue() + Sz == CB->getSExtValue());
-  }
+  // Compute the necessary base pointer delta to have the necessary final delta
+  // equal to the size.
+  APInt BaseDelta = Size - OffsetDelta;
 
-  // Calculate the distance.
+  // Otherwise compute the distance with SCEV between the base pointers.
   const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
   const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
-  const SCEV *C = SE->getConstant(PtrSCEVA->getType(), Sz);
+  const SCEV *C = SE->getConstant(BaseDelta);
   const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
   return X == PtrSCEVB;
 }
@@ -1102,6 +1258,15 @@ Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) {
   return I;
 }
 
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
+  Instruction *VL0 = cast<Instruction>(VL[0]);
+  Instruction *LastInst = getLastInstruction(VL);
+  BasicBlock::iterator NextInst = LastInst;
+  ++NextInst;
+  Builder.SetInsertPoint(VL0->getParent(), NextInst);
+  Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+}
+
 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
   Value *Vec = UndefValue::get(Ty);
   // Generate the 'InsertElement' instruction.
@@ -1109,6 +1274,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
     Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
     if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
       GatherSeq.insert(Insrt);
+      CSEBlocks.insert(Insrt->getParent());
 
       // Add to our 'need-to-extract' list.
       if (ScalarToTreeEntry.count(VL[i])) {
@@ -1132,10 +1298,12 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
   return Vec;
 }
 
-Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) {
-  if (ScalarToTreeEntry.count(VL[0])) {
-    int Idx = ScalarToTreeEntry[VL[0]];
-    TreeEntry *En = &VectorizableTree[Idx];
+Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
+  SmallDenseMap<Value*, int>::const_iterator Entry
+    = ScalarToTreeEntry.find(VL[0]);
+  if (Entry != ScalarToTreeEntry.end()) {
+    int Idx = Entry->second;
+    const TreeEntry *En = &VectorizableTree[Idx];
     if (En->isSame(VL) && En->VectorizedValue)
       return En->VectorizedValue;
   }
@@ -1159,38 +1327,48 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
 }
 
 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
-  BuilderLocGuard Guard(Builder);
+  IRBuilder<>::InsertPointGuard Guard(Builder);
 
   if (E->VectorizedValue) {
     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
     return E->VectorizedValue;
   }
 
-  Type *ScalarTy = E->Scalars[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(E->Scalars[0]))
+  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+  Type *ScalarTy = VL0->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
     ScalarTy = SI->getValueOperand()->getType();
   VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
 
   if (E->NeedToGather) {
+    setInsertPointAfterBundle(E->Scalars);
     return Gather(E->Scalars, VecTy);
   }
 
-  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
   unsigned Opcode = VL0->getOpcode();
   assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
 
   switch (Opcode) {
     case Instruction::PHI: {
       PHINode *PH = dyn_cast<PHINode>(VL0);
-      Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt());
+      Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
       E->VectorizedValue = NewPhi;
 
+      // PHINodes may have multiple entries from the same block. We want to
+      // visit every block once.
+      SmallSet<BasicBlock*, 4> VisitedBBs;
+
       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
         ValueList Operands;
         BasicBlock *IBB = PH->getIncomingBlock(i);
 
+        if (!VisitedBBs.insert(IBB)) {
+          NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
+          continue;
+        }
+
         // Prepare the operand vector.
         for (unsigned j = 0; j < E->Scalars.size(); ++j)
           Operands.push_back(cast<PHINode>(E->Scalars[j])->
@@ -1231,8 +1409,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
         INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
 
-      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
-      Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+      setInsertPointAfterBundle(E->Scalars);
 
       Value *InVec = vectorizeTree(INVL);
 
@@ -1252,8 +1429,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
       }
 
-      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
-      Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+      setInsertPointAfterBundle(E->Scalars);
 
       Value *L = vectorizeTree(LHSV);
       Value *R = vectorizeTree(RHSV);
@@ -1279,8 +1455,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
       }
 
-      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
-      Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+      setInsertPointAfterBundle(E->Scalars);
 
       Value *Cond = vectorizeTree(CondVec);
       Value *True = vectorizeTree(TrueVec);
@@ -1288,7 +1463,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
 
       if (Value *V = alreadyVectorized(E->Scalars))
         return V;
-      
+
       Value *V = Builder.CreateSelect(Cond, True, False);
       E->VectorizedValue = V;
       return V;
@@ -1312,13 +1487,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     case Instruction::Or:
     case Instruction::Xor: {
       ValueList LHSVL, RHSVL;
-      for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
-        LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
-        RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
-      }
+      if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
+        reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+      else
+        for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+          LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+          RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+        }
 
-      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
-      Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+      setInsertPointAfterBundle(E->Scalars);
 
       Value *LHS = vectorizeTree(LHSVL);
       Value *RHS = vectorizeTree(RHSVL);
@@ -1333,41 +1510,46 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
       E->VectorizedValue = V;
+
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        return propagateMetadata(I, E->Scalars);
+
       return V;
     }
     case Instruction::Load: {
       // Loads are inserted at the head of the tree because we don't want to
       // sink them all the way down past store instructions.
-      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
-      Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+      setInsertPointAfterBundle(E->Scalars);
 
       LoadInst *LI = cast<LoadInst>(VL0);
-      Value *VecPtr =
-      Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
+      unsigned AS = LI->getPointerAddressSpace();
+
+      Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+                                            VecTy->getPointerTo(AS));
       unsigned Alignment = LI->getAlignment();
       LI = Builder.CreateLoad(VecPtr);
       LI->setAlignment(Alignment);
       E->VectorizedValue = LI;
-      return LI;
+      return propagateMetadata(LI, E->Scalars);
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(VL0);
       unsigned Alignment = SI->getAlignment();
+      unsigned AS = SI->getPointerAddressSpace();
 
       ValueList ValueOp;
       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
         ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
 
-      Builder.SetInsertPoint(getLastInstruction(E->Scalars));
-      Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+      setInsertPointAfterBundle(E->Scalars);
 
       Value *VecValue = vectorizeTree(ValueOp);
-      Value *VecPtr =
-      Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
+      Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+                                            VecTy->getPointerTo(AS));
       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
       S->setAlignment(Alignment);
       E->VectorizedValue = S;
-      return S;
+      return propagateMetadata(S, E->Scalars);
     }
     default:
     llvm_unreachable("unknown inst");
@@ -1375,7 +1557,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   return 0;
 }
 
-void BoUpSLP::vectorizeTree() {
+Value *BoUpSLP::vectorizeTree() {
   Builder.SetInsertPoint(F->getEntryBlock().begin());
   vectorizeTree(&VectorizableTree[0]);
 
@@ -1407,6 +1589,7 @@ void BoUpSLP::vectorizeTree() {
     if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
       Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt());
       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+      CSEBlocks.insert(PN->getParent());
       User->replaceUsesOfWith(Scalar, Ex);
     } else if (isa<Instruction>(Vec)){
       if (PHINode *PH = dyn_cast<PHINode>(User)) {
@@ -1414,17 +1597,20 @@ void BoUpSLP::vectorizeTree() {
           if (PH->getIncomingValue(i) == Scalar) {
             Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
             Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+            CSEBlocks.insert(PH->getIncomingBlock(i));
             PH->setOperand(i, Ex);
           }
         }
       } else {
         Builder.SetInsertPoint(cast<Instruction>(User));
         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+        CSEBlocks.insert(cast<Instruction>(User)->getParent());
         User->replaceUsesOfWith(Scalar, Ex);
      }
     } else {
       Builder.SetInsertPoint(F->getEntryBlock().begin());
       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+      CSEBlocks.insert(&F->getEntryBlock());
       User->replaceUsesOfWith(Scalar, Ex);
     }
 
@@ -1450,9 +1636,10 @@ void BoUpSLP::vectorizeTree() {
         for (Value::use_iterator User = Scalar->use_begin(),
              UE = Scalar->use_end(); User != UE; ++User) {
           DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
-          assert(!MustGather.count(*User) &&
-                 "Replacing gathered value with undef");
-          assert(ScalarToTreeEntry.count(*User) &&
+
+          assert((ScalarToTreeEntry.count(*User) ||
+                  // It is legal to replace the reduction users by undef.
+                  (RdxOps && RdxOps->count(*User))) &&
                  "Replacing out-of-tree value with undef");
         }
         Value *Undef = UndefValue::get(Ty);
@@ -1467,8 +1654,20 @@ void BoUpSLP::vectorizeTree() {
     BlocksNumbers[it].forget();
   }
   Builder.ClearInsertionPoint();
+
+  return VectorizableTree[0].VectorizedValue;
 }
 
+class DTCmp {
+  const DominatorTree *DT;
+
+public:
+  DTCmp(const DominatorTree *DT) : DT(DT) {}
+  bool operator()(const BasicBlock *A, const BasicBlock *B) const {
+    return DT->properlyDominates(A, B);
+  }
+};
+
 void BoUpSLP::optimizeGatherSequence() {
   DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
         << " gather sequences instructions.\n");
@@ -1504,45 +1703,48 @@ void BoUpSLP::optimizeGatherSequence() {
     Insert->moveBefore(PreHeader->getTerminator());
   }
 
+  // Sort blocks by domination. This ensures we visit a block after all blocks
+  // dominating it are visited.
+  SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end());
+  std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), DTCmp(DT));
+
   // Perform O(N^2) search over the gather sequences and merge identical
   // instructions. TODO: We can further optimize this scan if we split the
   // instructions into different buckets based on the insert lane.
-  SmallPtrSet<Instruction*, 16> Visited;
-  SmallVector<Instruction*, 16> ToRemove;
-  ReversePostOrderTraversal<Function*> RPOT(F);
-  for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
-       E = RPOT.end(); I != E; ++I) {
+  SmallVector<Instruction *, 16> Visited;
+  for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(),
+                                               E = CSEWorkList.end();
+       I != E; ++I) {
+    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *llvm::prior(I))) &&
+           "Worklist not sorted properly!");
     BasicBlock *BB = *I;
-    // For all instructions in the function:
-    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-      Instruction *In = it;
-      if ((!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) ||
-          !GatherSeq.count(In))
+    // For all instructions in blocks containing gather sequences:
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
+      Instruction *In = it++;
+      if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
         continue;
 
       // Check if we can replace this instruction with any of the
       // visited instructions.
-      for (SmallPtrSet<Instruction*, 16>::iterator v = Visited.begin(),
-           ve = Visited.end(); v != ve; ++v) {
+      for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
+                                                    ve = Visited.end();
+           v != ve; ++v) {
         if (In->isIdenticalTo(*v) &&
             DT->dominates((*v)->getParent(), In->getParent())) {
           In->replaceAllUsesWith(*v);
-          ToRemove.push_back(In);
+          In->eraseFromParent();
           In = 0;
           break;
         }
       }
-      if (In)
-        Visited.insert(In);
+      if (In) {
+        assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
+        Visited.push_back(In);
+      }
     }
   }
-
-  // Erase all of the instructions that we RAUWed.
-  for (SmallVectorImpl<Instruction *>::iterator v = ToRemove.begin(),
-       ve = ToRemove.end(); v != ve; ++v) {
-    assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses");
-    (*v)->eraseFromParent();
-  }
+  CSEBlocks.clear();
+  GatherSeq.clear();
 }
 
 /// The SLPVectorizer Pass.
@@ -1575,14 +1777,18 @@ struct SLPVectorizer : public FunctionPass {
     StoreRefs.clear();
     bool Changed = false;
 
+    // If the target claims to have no vector registers don't attempt
+    // vectorization.
+    if (!TTI->getNumberOfRegisters(true))
+      return false;
+
     // Must have DataLayout. We can't require it because some tests run w/o
     // triple.
     if (!DL)
       return false;
 
     // Don't vectorize when the attribute NoImplicitFloat is used.
-    if (F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat))
+    if (F.hasFnAttribute(Attribute::NoImplicitFloat))
       return false;
 
     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
@@ -1661,6 +1867,21 @@ private:
   StoreListMap StoreRefs;
 };
 
+/// \brief Check that the Values in the slice in VL array are still existant in
+/// the WeakVH array.
+/// Vectorization of part of the VL array may cause later values in the VL array
+/// to become invalid. We track when this has happened in the WeakVH array.
+static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
+                               SmallVectorImpl<WeakVH> &VH,
+                               unsigned SliceBegin,
+                               unsigned SliceSize) {
+  for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
+    if (VH[i] != VL[i])
+      return true;
+
+  return false;
+}
+
 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
                                           int CostThreshold, BoUpSLP &R) {
   unsigned ChainLen = Chain.size();
@@ -1673,11 +1894,19 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
   if (!isPowerOf2_32(Sz) || VF < 2)
     return false;
 
+  // Keep track of values that were delete by vectorizing in the loop below.
+  SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
+
   bool Changed = false;
   // Look for profitable vectorizable trees at all offsets, starting at zero.
   for (unsigned i = 0, e = ChainLen; i < e; ++i) {
     if (i + VF > e)
       break;
+
+    // Check that a previous iteration of this loop did not delete the Value.
+    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
+      continue;
+
     DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
           << "\n");
     ArrayRef<Value *> Operands = Chain.slice(i, VF);
@@ -1697,7 +1926,7 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
     }
   }
 
-    return Changed;
+  return Changed;
 }
 
 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
@@ -1764,15 +1993,17 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
     if (!SI)
       continue;
 
+    // Don't touch volatile stores.
+    if (!SI->isSimple())
+      continue;
+
     // Check that the pointer points to scalars.
     Type *Ty = SI->getValueOperand()->getType();
     if (Ty->isAggregateType() || Ty->isVectorTy())
       return 0;
 
-    // Find the base of the GEP.
-    Value *Ptr = SI->getPointerOperand();
-    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
-      Ptr = GEP->getPointerOperand();
+    // Find the base pointer.
+    Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
 
     // Save the store locations.
     StoreRefs[Ptr].push_back(SI);
@@ -1797,28 +2028,61 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
   // Check that all of the parts are scalar instructions of the same type.
   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
   if (!I0)
-    return 0;
+    return false;
 
   unsigned Opcode0 = I0->getOpcode();
 
+  Type *Ty0 = I0->getType();
+  unsigned Sz = DL->getTypeSizeInBits(Ty0);
+  unsigned VF = MinVecRegSize / Sz;
+
   for (int i = 0, e = VL.size(); i < e; ++i) {
     Type *Ty = VL[i]->getType();
     if (Ty->isAggregateType() || Ty->isVectorTy())
-      return 0;
+      return false;
     Instruction *Inst = dyn_cast<Instruction>(VL[i]);
     if (!Inst || Inst->getOpcode() != Opcode0)
-      return 0;
+      return false;
   }
 
-  R.buildTree(VL);
-  int Cost = R.getTreeCost();
+  bool Changed = false;
 
-  if (Cost >= -SLPCostThreshold)
-    return false;
+  // Keep track of values that were delete by vectorizing in the loop below.
+  SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
 
-  DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
-  R.vectorizeTree();
-  return true;
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    unsigned OpsWidth = 0;
+
+    if (i + VF > e)
+      OpsWidth = e - i;
+    else
+      OpsWidth = VF;
+
+    if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+      break;
+
+    // Check that a previous iteration of this loop did not delete the Value.
+    if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
+      continue;
+
+    DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+                 << "\n");
+    ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
+
+    R.buildTree(Ops);
+    int Cost = R.getTreeCost();
+
+    if (Cost < -SLPCostThreshold) {
+      DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
+      R.vectorizeTree();
+
+      // Move to the next bundle.
+      i += VF - 1;
+      Changed = true;
+    }
+  }
+
+  return Changed;
 }
 
 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
@@ -1861,30 +2125,405 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
   return 0;
 }
 
+/// \brief Generate a shuffle mask to be used in a reduction tree.
+///
+/// \param VecLen The length of the vector to be reduced.
+/// \param NumEltsToRdx The number of elements that should be reduced in the
+///        vector.
+/// \param IsPairwise Whether the reduction is a pairwise or splitting
+///        reduction. A pairwise reduction will generate a mask of 
+///        <0,2,...> or <1,3,..> while a splitting reduction will generate
+///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
+/// \param IsLeft True will generate a mask of even elements, odd otherwise.
+static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
+                                   bool IsPairwise, bool IsLeft,
+                                   IRBuilder<> &Builder) {
+  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
+
+  SmallVector<Constant *, 32> ShuffleMask(
+      VecLen, UndefValue::get(Builder.getInt32Ty()));
+
+  if (IsPairwise)
+    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
+    for (unsigned i = 0; i != NumEltsToRdx; ++i)
+      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+  else
+    // Move the upper half of the vector to the lower half.
+    for (unsigned i = 0; i != NumEltsToRdx; ++i)
+      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+
+  return ConstantVector::get(ShuffleMask);
+}
+
+
+/// Model horizontal reductions.
+///
+/// A horizontal reduction is a tree of reduction operations (currently add and
+/// fadd) that has operations that can be put into a vector as its leaf.
+/// For example, this tree:
+///
+/// mul mul mul mul
+///  \  /    \  /
+///   +       +
+///    \     /
+///       +
+/// This tree has "mul" as its reduced values and "+" as its reduction
+/// operations. A reduction might be feeding into a store or a binary operation
+/// feeding a phi.
+///    ...
+///    \  /
+///     +
+///     |
+///  phi +=
+///
+///  Or:
+///    ...
+///    \  /
+///     +
+///     |
+///   *p =
+///
+class HorizontalReduction {
+  SmallPtrSet<Value *, 16> ReductionOps;
+  SmallVector<Value *, 32> ReducedVals;
+
+  BinaryOperator *ReductionRoot;
+  PHINode *ReductionPHI;
+
+  /// The opcode of the reduction.
+  unsigned ReductionOpcode;
+  /// The opcode of the values we perform a reduction on.
+  unsigned ReducedValueOpcode;
+  /// The width of one full horizontal reduction operation.
+  unsigned ReduxWidth;
+  /// Should we model this reduction as a pairwise reduction tree or a tree that
+  /// splits the vector in halves and adds those halves.
+  bool IsPairwiseReduction;
+
+public:
+  HorizontalReduction()
+    : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
+    ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
+
+  /// \brief Try to find a reduction tree.
+  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
+                                 DataLayout *DL) {
+    assert((!Phi ||
+            std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
+           "Thi phi needs to use the binary operator");
+
+    // We could have a initial reductions that is not an add.
+    //  r *= v1 + v2 + v3 + v4
+    // In such a case start looking for a tree rooted in the first '+'.
+    if (Phi) {
+      if (B->getOperand(0) == Phi) {
+        Phi = 0;
+        B = dyn_cast<BinaryOperator>(B->getOperand(1));
+      } else if (B->getOperand(1) == Phi) {
+        Phi = 0;
+        B = dyn_cast<BinaryOperator>(B->getOperand(0));
+      }
+    }
+
+    if (!B)
+      return false;
+
+    Type *Ty = B->getType();
+    if (Ty->isVectorTy())
+      return false;
+
+    ReductionOpcode = B->getOpcode();
+    ReducedValueOpcode = 0;
+    ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+    ReductionRoot = B;
+    ReductionPHI = Phi;
+
+    if (ReduxWidth < 4)
+      return false;
+
+    // We currently only support adds.
+    if (ReductionOpcode != Instruction::Add &&
+        ReductionOpcode != Instruction::FAdd)
+      return false;
+
+    // Post order traverse the reduction tree starting at B. We only handle true
+    // trees containing only binary operators.
+    SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
+    Stack.push_back(std::make_pair(B, 0));
+    while (!Stack.empty()) {
+      BinaryOperator *TreeN = Stack.back().first;
+      unsigned EdgeToVist = Stack.back().second++;
+      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+
+      // Only handle trees in the current basic block.
+      if (TreeN->getParent() != B->getParent())
+        return false;
+
+      // Each tree node needs to have one user except for the ultimate
+      // reduction.
+      if (!TreeN->hasOneUse() && TreeN != B)
+        return false;
+
+      // Postorder vist.
+      if (EdgeToVist == 2 || IsReducedValue) {
+        if (IsReducedValue) {
+          // Make sure that the opcodes of the operations that we are going to
+          // reduce match.
+          if (!ReducedValueOpcode)
+            ReducedValueOpcode = TreeN->getOpcode();
+          else if (ReducedValueOpcode != TreeN->getOpcode())
+            return false;
+          ReducedVals.push_back(TreeN);
+        } else {
+          // We need to be able to reassociate the adds.
+          if (!TreeN->isAssociative())
+            return false;
+          ReductionOps.insert(TreeN);
+        }
+        // Retract.
+        Stack.pop_back();
+        continue;
+      }
+
+      // Visit left or right.
+      Value *NextV = TreeN->getOperand(EdgeToVist);
+      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
+      if (Next)
+        Stack.push_back(std::make_pair(Next, 0));
+      else if (NextV != Phi)
+        return false;
+    }
+    return true;
+  }
+
+  /// \brief Attempt to vectorize the tree found by
+  /// matchAssociativeReduction.
+  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+    if (ReducedVals.empty())
+      return false;
+
+    unsigned NumReducedVals = ReducedVals.size();
+    if (NumReducedVals < ReduxWidth)
+      return false;
+
+    Value *VectorizedTree = 0;
+    IRBuilder<> Builder(ReductionRoot);
+    FastMathFlags Unsafe;
+    Unsafe.setUnsafeAlgebra();
+    Builder.SetFastMathFlags(Unsafe);
+    unsigned i = 0;
+
+    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+      ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
+      V.buildTree(ValsToReduce, &ReductionOps);
+
+      // Estimate cost.
+      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+      if (Cost >= -SLPCostThreshold)
+        break;
+
+      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
+                   << ". (HorRdx)\n");
+
+      // Vectorize a tree.
+      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+      Value *VectorizedRoot = V.vectorizeTree();
+
+      // Emit a reduction.
+      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+      if (VectorizedTree) {
+        Builder.SetCurrentDebugLocation(Loc);
+        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+                                     ReducedSubTree, "bin.rdx");
+      } else
+        VectorizedTree = ReducedSubTree;
+    }
+
+    if (VectorizedTree) {
+      // Finish the reduction.
+      for (; i < NumReducedVals; ++i) {
+        Builder.SetCurrentDebugLocation(
+          cast<Instruction>(ReducedVals[i])->getDebugLoc());
+        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+                                     ReducedVals[i]);
+      }
+      // Update users.
+      if (ReductionPHI) {
+        assert(ReductionRoot != NULL && "Need a reduction operation");
+        ReductionRoot->setOperand(0, VectorizedTree);
+        ReductionRoot->setOperand(1, ReductionPHI);
+      } else
+        ReductionRoot->replaceAllUsesWith(VectorizedTree);
+    }
+    return VectorizedTree != 0;
+  }
+
+private:
+
+  /// \brief Calcuate the cost of a reduction.
+  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+    Type *ScalarTy = FirstReducedVal->getType();
+    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+
+    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
+    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+
+    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
+    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+
+    int ScalarReduxCost =
+        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+
+    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+                 << " for reduction that starts with " << *FirstReducedVal
+                 << " (It is a "
+                 << (IsPairwiseReduction ? "pairwise" : "splitting")
+                 << " reduction)\n");
+
+    return VecReduxCost - ScalarReduxCost;
+  }
+
+  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
+                            Value *R, const Twine &Name = "") {
+    if (Opcode == Instruction::FAdd)
+      return Builder.CreateFAdd(L, R, Name);
+    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
+  }
+
+  /// \brief Emit a horizontal reduction of the vectorized value.
+  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+    assert(VectorizedValue && "Need to have a vectorized tree node");
+    Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
+    assert(isPowerOf2_32(ReduxWidth) &&
+           "We only handle power-of-two reductions for now");
+
+    Value *TmpVec = ValToReduce;
+    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+      if (IsPairwiseReduction) {
+        Value *LeftMask =
+          createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+        Value *RightMask =
+          createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+        Value *LeftShuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+        Value *RightShuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
+          "rdx.shuf.r");
+        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
+                             "bin.rdx");
+      } else {
+        Value *UpperHalf =
+          createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
+        Value *Shuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
+        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+      }
+    }
+
+    // The result is in the first element of the vector.
+    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+  }
+};
+
+/// \brief Recognize construction of vectors like
+///  %ra = insertelement <4 x float> undef, float %s0, i32 0
+///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+///
+/// Returns true if it matches
+///
+static bool findBuildVector(InsertElementInst *IE,
+                            SmallVectorImpl<Value *> &Ops) {
+  if (!isa<UndefValue>(IE->getOperand(0)))
+    return false;
+
+  while (true) {
+    Ops.push_back(IE->getOperand(1));
+
+    if (IE->use_empty())
+      return false;
+
+    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->use_back());
+    if (!NextUse)
+      return true;
+
+    // If this isn't the final use, make sure the next insertelement is the only
+    // use. It's OK if the final constructed vector is used multiple times
+    if (!IE->hasOneUse())
+      return false;
+
+    IE = NextUse;
+  }
+
+  return false;
+}
+
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+  return V->getType() < V2->getType();
+}
+
 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Value *, 4> Incoming;
-  // Collect the incoming values from the PHIs.
-  for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
-       ++instr) {
-    PHINode *P = dyn_cast<PHINode>(instr);
-
-    if (!P)
-      break;
+  SmallSet<Value *, 16> VisitedInstrs;
+
+  bool HaveVectorizedPhiNodes = true;
+  while (HaveVectorizedPhiNodes) {
+    HaveVectorizedPhiNodes = false;
+
+    // Collect the incoming values from the PHIs.
+    Incoming.clear();
+    for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
+         ++instr) {
+      PHINode *P = dyn_cast<PHINode>(instr);
+      if (!P)
+        break;
 
-    // Stop constructing the list when you reach a different type.
-    if (Incoming.size() && P->getType() != Incoming[0]->getType()) {
-      Changed |= tryToVectorizeList(Incoming, R);
-      Incoming.clear();
+      if (!VisitedInstrs.count(P))
+        Incoming.push_back(P);
     }
 
-    Incoming.push_back(P);
+    // Sort by type.
+    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
+
+    // Try to vectorize elements base on their type.
+    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+                                           E = Incoming.end();
+         IncIt != E;) {
+
+      // Look for the next elements with the same type.
+      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+      while (SameTypeIt != E &&
+             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+        VisitedInstrs.insert(*SameTypeIt);
+        ++SameTypeIt;
+      }
+
+      // Try to vectorize them.
+      unsigned NumElts = (SameTypeIt - IncIt);
+      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+      if (NumElts > 1 &&
+          tryToVectorizeList(ArrayRef<Value *>(IncIt, NumElts), R)) {
+        // Success start over because instructions might have been changed.
+        HaveVectorizedPhiNodes = true;
+        Changed = true;
+        break;
+      }
+
+      // Start over at the next instruction of a differnt type (or the end).
+      IncIt = SameTypeIt;
+    }
   }
 
-  if (Incoming.size() > 1)
-    Changed |= tryToVectorizeList(Incoming, R);
+  VisitedInstrs.clear();
+
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
+    // We may go through BB multiple times so skip the one we have checked.
+    if (!VisitedInstrs.insert(it))
+      continue;
 
-  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     if (isa<DbgInfoIntrinsic>(it))
       continue;
 
@@ -1902,24 +2541,86 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       if (!BI)
         continue;
 
-      Value *Inst = BI->getOperand(0);
+      // Try to match and vectorize a horizontal reduction.
+      HorizontalReduction HorRdx;
+      if (ShouldVectorizeHor &&
+          HorRdx.matchAssociativeReduction(P, BI, DL) &&
+          HorRdx.tryToReduce(R, TTI)) {
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
+        continue;
+      }
+
+     Value *Inst = BI->getOperand(0);
       if (Inst == P)
         Inst = BI->getOperand(1);
 
-      Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
+      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+        // We would like to start over since some instructions are deleted
+        // and the iterator may become invalid value.
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
+        continue;
+      }
+
       continue;
     }
 
+    // Try to vectorize horizontal reductions feeding into a store.
+    if (ShouldStartVectorizeHorAtStore)
+      if (StoreInst *SI = dyn_cast<StoreInst>(it))
+        if (BinaryOperator *BinOp =
+                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+          HorizontalReduction HorRdx;
+          if (((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
+                HorRdx.tryToReduce(R, TTI)) ||
+               tryToVectorize(BinOp, R))) {
+            Changed = true;
+            it = BB->begin();
+            e = BB->end();
+            continue;
+          }
+        }
+
     // Try to vectorize trees that start at compare instructions.
     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
-        Changed |= true;
+        Changed = true;
+        // We would like to start over since some instructions are deleted
+        // and the iterator may become invalid value.
+        it = BB->begin();
+        e = BB->end();
+        continue;
+      }
+
+      for (int i = 0; i < 2; ++i) {
+         if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
+            if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
+              Changed = true;
+              // We would like to start over since some instructions are deleted
+              // and the iterator may become invalid value.
+              it = BB->begin();
+              e = BB->end();
+            }
+         }
+      }
+      continue;
+    }
+
+    // Try to vectorize trees that start at insertelement instructions.
+    if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) {
+      SmallVector<Value *, 8> Ops;
+      if (!findBuildVector(IE, Ops))
         continue;
+
+      if (tryToVectorizeList(Ops, R)) {
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
       }
-      for (int i = 0; i < 2; ++i)
-        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
-          Changed |=
-              tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R);
+
       continue;
     }
   }
diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index c19bb67..fc6483c 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt
@@ -11,9 +11,14 @@ foreach(entry ${entries})
   endif()
 endforeach(entry)
 
-# Also add in the compiler-rt tree if present and we have a sufficiently
-# recent version of CMake.
+# Also add in libc++ and compiler-rt trees if present (and we have
+# a sufficiently recent version of CMake where required).
 if(${LLVM_BUILD_RUNTIME})
+  # MSVC isn't quite working with libc++ yet, disable it until issues are
+  # fixed.
+  if(NOT MSVC)
+    add_llvm_external_project(libcxx)
+  endif()
   if(${CMAKE_VERSION} VERSION_GREATER 2.8.7)
     add_llvm_external_project(compiler-rt)
   else()
diff --git a/projects/sample/autoconf/configure.ac b/projects/sample/autoconf/configure.ac
index dc0acf3..03cd214 100644
--- a/projects/sample/autoconf/configure.ac
+++ b/projects/sample/autoconf/configure.ac
@@ -535,6 +535,21 @@ case "$enableval" in
   *) AC_MSG_ERROR([Invalid setting for --enable-pthreads. Use "yes" or "no"]) ;;
 esac
 
+dnl Allow disablement of zlib
+AC_ARG_ENABLE(zlib,
+              AS_HELP_STRING([--enable-zlib],
+                             [Use zlib for compression/decompression if
+                              available (default is YES)]),,
+                              enableval=default)
+case "$enableval" in
+  yes) AC_SUBST(LLVM_ENABLE_ZLIB,[1]) ;;
+  no)  AC_SUBST(LLVM_ENABLE_ZLIB,[0]) ;;
+  default) AC_SUBST(LLVM_ENABLE_ZLIB,[1]) ;;
+  *) AC_MSG_ERROR([Invalid setting for --enable-zlib. Use "yes" or "no"]) ;;
+esac
+AC_DEFINE_UNQUOTED([LLVM_ENABLE_ZLIB],$LLVM_ENABLE_ZLIB,
+                   [Define if zlib is enabled])
+
 dnl Allow building without position independent code
 AC_ARG_ENABLE(pic,
   AS_HELP_STRING([--enable-pic],
@@ -820,6 +835,17 @@ AC_ARG_WITH(bug-report-url,
 AC_DEFINE_UNQUOTED(BUG_REPORT_URL,"$withval",
                    [Bug report URL.])
 
+dnl --enable-terminfo: check whether the user wants to control use of terminfo:
+AC_ARG_ENABLE(terminfo,AS_HELP_STRING(
+  [--enable-terminfo],
+  [Query the terminfo database if available (default is YES)]),
+  [case "$enableval" in
+    yes) llvm_cv_enable_terminfo="yes" ;;
+    no)  llvm_cv_enable_terminfo="no"  ;;
+    *) AC_MSG_ERROR([Invalid setting for --enable-terminfo. Use "yes" or "no"]) ;;
+  esac],
+  llvm_cv_enable_terminfo="yes")
+
 dnl --enable-libffi : check whether the user wants to turn off libffi:
 AC_ARG_ENABLE(libffi,AS_HELP_STRING(
   --enable-libffi,[Check for the presence of libffi (default is NO)]),
@@ -1061,6 +1087,7 @@ AC_CHECK_LIB(m,sin)
 if test "$llvm_cv_os_type" = "MingW" ; then
   AC_CHECK_LIB(imagehlp, main)
   AC_CHECK_LIB(psapi, main)
+  AC_CHECK_LIB(shell32, main)
 fi
 
 dnl dlopen() is required for plugin support.
@@ -1068,6 +1095,19 @@ AC_SEARCH_LIBS(dlopen,dl,AC_DEFINE([HAVE_DLOPEN],[1],
                [Define if dlopen() is available on this platform.]),
                AC_MSG_WARN([dlopen() not found - disabling plugin support]))
 
+dnl Search for the clock_gettime() function. Note that we rely on the POSIX
+dnl macros to detect whether clock_gettime is available, this just finds the
+dnl right libraries to link with.
+AC_SEARCH_LIBS(clock_gettime,rt)
+
+dnl The curses library is optional; used for querying terminal info
+if test "$llvm_cv_enable_terminfo" = "yes" ; then
+  dnl We need the has_color functionality in curses for it to be useful.
+  AC_SEARCH_LIBS(setupterm,tinfo curses ncurses ncursesw,
+                 AC_DEFINE([HAVE_TERMINFO],[1],
+                           [Define if the setupterm() function is supported this platform.]))
+fi
+
 dnl libffi is optional; used to call external functions from the interpreter
 if test "$llvm_cv_enable_libffi" = "yes" ; then
   AC_SEARCH_LIBS(ffi_call,ffi,AC_DEFINE([HAVE_FFI_CALL],[1],
@@ -1094,6 +1134,11 @@ if test "$ENABLE_THREADS" -eq 1 && test "$ENABLE_PTHREADS" -eq 1 ; then
                  [Have pthread_getspecific]))
 fi
 
+dnl zlib is optional; used for compression/uncompression
+if test "$LLVM_ENABLE_ZLIB" -eq 1 ; then
+  AC_CHECK_LIB(z, compress2)
+fi
+
 dnl Allow extra x86-disassembler library
 AC_ARG_WITH(udis86,
   AS_HELP_STRING([--with-udis86=<path>],
@@ -1179,6 +1224,13 @@ if test "$ENABLE_THREADS" -eq 1 && test "$ENABLE_PTHREADS" -eq 1 ; then
 else
   AC_SUBST(HAVE_PTHREAD, 0)
 fi
+if test "$LLVM_ENABLE_ZLIB" -eq 1 ; then
+  AC_CHECK_HEADERS(zlib.h,
+                   AC_SUBST(HAVE_LIBZ, 1),
+                   AC_SUBST(HAVE_LIBZ, 0))
+else
+  AC_SUBST(HAVE_LIBZ, 0)
+fi
 
 dnl Try to find ffi.h.
 if test "$llvm_cv_enable_libffi" = "yes" ; then
diff --git a/projects/sample/configure b/projects/sample/configure
index 23e01c8..bc04c96 100755
--- a/projects/sample/configure
+++ b/projects/sample/configure
@@ -698,6 +698,7 @@ ENABLE_DOCS
 ENABLE_DOXYGEN
 ENABLE_THREADS
 ENABLE_PTHREADS
+LLVM_ENABLE_ZLIB
 ENABLE_PIC
 ENABLE_SHARED
 ENABLE_EMBED_STDCXX
@@ -766,6 +767,7 @@ COVERED_SWITCH_DEFAULT
 USE_UDIS86
 USE_OPROFILE
 HAVE_PTHREAD
+HAVE_LIBZ
 HUGE_VAL_SANITY
 MMAP_FILE
 SHLIBEXT
@@ -1394,6 +1396,8 @@ Optional Features:
   --enable-doxygen        Build doxygen documentation (default is NO)
   --enable-threads        Use threads if available (default is YES)
   --enable-pthreads       Use pthreads if available (default is YES)
+  --enable-zlib           Use zlib for compression/decompression if available
+                          (default is YES)
   --enable-pic            Build LLVM with Position Independent Code (default
                           is YES)
   --enable-shared         Build a shared library and link tools against it
@@ -1409,6 +1413,8 @@ Optional Features:
                           (default=all)
   --enable-bindings       Build specific language bindings:
                           all,auto,none,{binding-name} (default=auto)
+  --enable-terminfo       Query the terminfo database if available (default is
+                          YES)
   --enable-libffi         Check for the presence of libffi (default is NO)
   --enable-ltdl-install   install libltdl
 
@@ -5202,6 +5208,30 @@ echo "$as_me: error: Invalid setting for --enable-pthreads. Use \"yes\" or \"no\
    { (exit 1); exit 1; }; } ;;
 esac
 
+# Check whether --enable-zlib was given.
+if test "${enable_zlib+set}" = set; then
+  enableval=$enable_zlib;
+else
+  enableval=default
+fi
+
+case "$enableval" in
+  yes) LLVM_ENABLE_ZLIB=1
+ ;;
+  no)  LLVM_ENABLE_ZLIB=0
+ ;;
+  default) LLVM_ENABLE_ZLIB=1
+ ;;
+  *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-zlib. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-zlib. Use \"yes\" or \"no\"" >&2;}
+   { (exit 1); exit 1; }; } ;;
+esac
+
+cat >>confdefs.h <<_ACEOF
+#define LLVM_ENABLE_ZLIB $LLVM_ENABLE_ZLIB
+_ACEOF
+
+
 # Check whether --enable-pic was given.
 if test "${enable_pic+set}" = set; then
   enableval=$enable_pic;
@@ -5604,6 +5634,20 @@ cat >>confdefs.h <<_ACEOF
 _ACEOF
 
 
+# Check whether --enable-terminfo was given.
+if test "${enable_terminfo+set}" = set; then
+  enableval=$enable_terminfo; case "$enableval" in
+    yes) llvm_cv_enable_terminfo="yes" ;;
+    no)  llvm_cv_enable_terminfo="no"  ;;
+    *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-terminfo. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-terminfo. Use \"yes\" or \"no\"" >&2;}
+   { (exit 1); exit 1; }; } ;;
+  esac
+else
+  llvm_cv_enable_terminfo="yes"
+fi
+
+
 # Check whether --enable-libffi was given.
 if test "${enable_libffi+set}" = set; then
   enableval=$enable_libffi; case "$enableval" in
@@ -10339,7 +10383,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10342 "configure"
+#line 10386 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -12353,6 +12397,87 @@ _ACEOF
 
 fi
 
+
+{ echo "$as_me:$LINENO: checking for main in -lshell32" >&5
+echo $ECHO_N "checking for main in -lshell32... $ECHO_C" >&6; }
+if test "${ac_cv_lib_shell32_main+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lshell32  $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+
+int
+main ()
+{
+return main ();
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_lib_shell32_main=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_lib_shell32_main=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext \
+      conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_lib_shell32_main" >&5
+echo "${ECHO_T}$ac_cv_lib_shell32_main" >&6; }
+if test $ac_cv_lib_shell32_main = yes; then
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBSHELL32 1
+_ACEOF
+
+  LIBS="-lshell32 $LIBS"
+
+fi
+
 fi
 
 { echo "$as_me:$LINENO: checking for library containing dlopen" >&5
@@ -12462,6 +12587,212 @@ echo "$as_me: WARNING: dlopen() not found - disabling plugin support" >&2;}
 fi
 
 
+{ echo "$as_me:$LINENO: checking for library containing clock_gettime" >&5
+echo $ECHO_N "checking for library containing clock_gettime... $ECHO_C" >&6; }
+if test "${ac_cv_search_clock_gettime+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_func_search_save_LIBS=$LIBS
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char clock_gettime ();
+int
+main ()
+{
+return clock_gettime ();
+  ;
+  return 0;
+}
+_ACEOF
+for ac_lib in '' rt; do
+  if test -z "$ac_lib"; then
+    ac_res="none required"
+  else
+    ac_res=-l$ac_lib
+    LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
+  fi
+  rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_search_clock_gettime=$ac_res
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext \
+      conftest$ac_exeext
+  if test "${ac_cv_search_clock_gettime+set}" = set; then
+  break
+fi
+done
+if test "${ac_cv_search_clock_gettime+set}" = set; then
+  :
+else
+  ac_cv_search_clock_gettime=no
+fi
+rm conftest.$ac_ext
+LIBS=$ac_func_search_save_LIBS
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_search_clock_gettime" >&5
+echo "${ECHO_T}$ac_cv_search_clock_gettime" >&6; }
+ac_res=$ac_cv_search_clock_gettime
+if test "$ac_res" != no; then
+  test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
+
+fi
+
+
+if test "$llvm_cv_enable_terminfo" = "yes" ; then
+    { echo "$as_me:$LINENO: checking for library containing setupterm" >&5
+echo $ECHO_N "checking for library containing setupterm... $ECHO_C" >&6; }
+if test "${ac_cv_search_setupterm+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_func_search_save_LIBS=$LIBS
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char setupterm ();
+int
+main ()
+{
+return setupterm ();
+  ;
+  return 0;
+}
+_ACEOF
+for ac_lib in '' tinfo curses ncurses ncursesw; do
+  if test -z "$ac_lib"; then
+    ac_res="none required"
+  else
+    ac_res=-l$ac_lib
+    LIBS="-l$ac_lib  $ac_func_search_save_LIBS"
+  fi
+  rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_search_setupterm=$ac_res
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext \
+      conftest$ac_exeext
+  if test "${ac_cv_search_setupterm+set}" = set; then
+  break
+fi
+done
+if test "${ac_cv_search_setupterm+set}" = set; then
+  :
+else
+  ac_cv_search_setupterm=no
+fi
+rm conftest.$ac_ext
+LIBS=$ac_func_search_save_LIBS
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_search_setupterm" >&5
+echo "${ECHO_T}$ac_cv_search_setupterm" >&6; }
+ac_res=$ac_cv_search_setupterm
+if test "$ac_res" != no; then
+  test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_TERMINFO 1
+_ACEOF
+
+fi
+
+fi
+
 if test "$llvm_cv_enable_libffi" = "yes" ; then
   { echo "$as_me:$LINENO: checking for library containing ffi_call" >&5
 echo $ECHO_N "checking for library containing ffi_call... $ECHO_C" >&6; }
@@ -13075,6 +13406,96 @@ fi
 
 fi
 
+if test "$LLVM_ENABLE_ZLIB" -eq 1 ; then
+
+{ echo "$as_me:$LINENO: checking for compress2 in -lz" >&5
+echo $ECHO_N "checking for compress2 in -lz... $ECHO_C" >&6; }
+if test "${ac_cv_lib_z_compress2+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lz  $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char compress2 ();
+int
+main ()
+{
+return compress2 ();
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_lib_z_compress2=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_lib_z_compress2=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext \
+      conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ echo "$as_me:$LINENO: result: $ac_cv_lib_z_compress2" >&5
+echo "${ECHO_T}$ac_cv_lib_z_compress2" >&6; }
+if test $ac_cv_lib_z_compress2 = yes; then
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBZ 1
+_ACEOF
+
+  LIBS="-lz $LIBS"
+
+fi
+
+fi
+
 
 # Check whether --with-udis86 was given.
 if test "${with_udis86+set}" = set; then
@@ -15567,6 +15988,184 @@ else
   HAVE_PTHREAD=0
 
 fi
+if test "$LLVM_ENABLE_ZLIB" -eq 1 ; then
+
+for ac_header in zlib.h
+do
+as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh`
+if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
+  { echo "$as_me:$LINENO: checking for $ac_header" >&5
+echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; }
+if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+fi
+ac_res=`eval echo '${'$as_ac_Header'}'`
+	       { echo "$as_me:$LINENO: result: $ac_res" >&5
+echo "${ECHO_T}$ac_res" >&6; }
+else
+  # Is the header compilable?
+{ echo "$as_me:$LINENO: checking $ac_header usability" >&5
+echo $ECHO_N "checking $ac_header usability... $ECHO_C" >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+#include <$ac_header>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest.$ac_objext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_header_compiler=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_header_compiler=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
+echo "${ECHO_T}$ac_header_compiler" >&6; }
+
+# Is the header present?
+{ echo "$as_me:$LINENO: checking $ac_header presence" >&5
+echo $ECHO_N "checking $ac_header presence... $ECHO_C" >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <$ac_header>
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null; then
+  if test -s conftest.err; then
+    ac_cpp_err=$ac_c_preproc_warn_flag
+    ac_cpp_err=$ac_cpp_err$ac_c_werror_flag
+  else
+    ac_cpp_err=
+  fi
+else
+  ac_cpp_err=yes
+fi
+if test -z "$ac_cpp_err"; then
+  ac_header_preproc=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  ac_header_preproc=no
+fi
+
+rm -f conftest.err conftest.$ac_ext
+{ echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
+echo "${ECHO_T}$ac_header_preproc" >&6; }
+
+# So?  What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in
+  yes:no: )
+    { echo "$as_me:$LINENO: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&5
+echo "$as_me: WARNING: $ac_header: accepted by the compiler, rejected by the preprocessor!" >&2;}
+    { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the compiler's result" >&5
+echo "$as_me: WARNING: $ac_header: proceeding with the compiler's result" >&2;}
+    ac_header_preproc=yes
+    ;;
+  no:yes:* )
+    { echo "$as_me:$LINENO: WARNING: $ac_header: present but cannot be compiled" >&5
+echo "$as_me: WARNING: $ac_header: present but cannot be compiled" >&2;}
+    { echo "$as_me:$LINENO: WARNING: $ac_header:     check for missing prerequisite headers?" >&5
+echo "$as_me: WARNING: $ac_header:     check for missing prerequisite headers?" >&2;}
+    { echo "$as_me:$LINENO: WARNING: $ac_header: see the Autoconf documentation" >&5
+echo "$as_me: WARNING: $ac_header: see the Autoconf documentation" >&2;}
+    { echo "$as_me:$LINENO: WARNING: $ac_header:     section \"Present But Cannot Be Compiled\"" >&5
+echo "$as_me: WARNING: $ac_header:     section \"Present But Cannot Be Compiled\"" >&2;}
+    { echo "$as_me:$LINENO: WARNING: $ac_header: proceeding with the preprocessor's result" >&5
+echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result" >&2;}
+    { echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
+echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
+    ( cat <<\_ASBOX
+## ------------------------------ ##
+## Report this to bugs@yourdomain ##
+## ------------------------------ ##
+_ASBOX
+     ) | sed "s/^/$as_me: WARNING:     /" >&2
+    ;;
+esac
+{ echo "$as_me:$LINENO: checking for $ac_header" >&5
+echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; }
+if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  eval "$as_ac_Header=\$ac_header_preproc"
+fi
+ac_res=`eval echo '${'$as_ac_Header'}'`
+	       { echo "$as_me:$LINENO: result: $ac_res" >&5
+echo "${ECHO_T}$ac_res" >&6; }
+
+fi
+if test `eval echo '${'$as_ac_Header'}'` = yes; then
+  cat >>confdefs.h <<_ACEOF
+#define `echo "HAVE_$ac_header" | $as_tr_cpp` 1
+_ACEOF
+ HAVE_LIBZ=1
+
+else
+  HAVE_LIBZ=0
+
+fi
+
+done
+
+else
+  HAVE_LIBZ=0
+
+fi
 
 if test "$llvm_cv_enable_libffi" = "yes" ; then
 
@@ -21720,6 +22319,7 @@ ac_delim='%!_!# '
 for ac_last_try in false false false false false :; do
   cat >conf$$subs.sed <<_ACEOF
 ENABLE_PTHREADS!$ENABLE_PTHREADS$ac_delim
+LLVM_ENABLE_ZLIB!$LLVM_ENABLE_ZLIB$ac_delim
 ENABLE_PIC!$ENABLE_PIC$ac_delim
 ENABLE_SHARED!$ENABLE_SHARED$ac_delim
 ENABLE_EMBED_STDCXX!$ENABLE_EMBED_STDCXX$ac_delim
@@ -21788,6 +22388,7 @@ COVERED_SWITCH_DEFAULT!$COVERED_SWITCH_DEFAULT$ac_delim
 USE_UDIS86!$USE_UDIS86$ac_delim
 USE_OPROFILE!$USE_OPROFILE$ac_delim
 HAVE_PTHREAD!$HAVE_PTHREAD$ac_delim
+HAVE_LIBZ!$HAVE_LIBZ$ac_delim
 HUGE_VAL_SANITY!$HUGE_VAL_SANITY$ac_delim
 MMAP_FILE!$MMAP_FILE$ac_delim
 SHLIBEXT!$SHLIBEXT$ac_delim
@@ -21811,7 +22412,7 @@ LIBOBJS!$LIBOBJS$ac_delim
 LTLIBOBJS!$LTLIBOBJS$ac_delim
 _ACEOF
 
-  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 90; then
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 92; then
     break
   elif $ac_last_try; then
     { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
deleted file mode 100644
index 502b91d..0000000
--- a/runtime/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-if( NOT LLVM_BUILD_RUNTIME )
-  set(EXCLUDE_FROM_ALL ON)
-endif()
-
-add_subdirectory(libprofile)
diff --git a/runtime/LLVMBuild.txt b/runtime/LLVMBuild.txt
deleted file mode 100644
index 05334fd..0000000
--- a/runtime/LLVMBuild.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-;===- ./runtime/LLVMBuild.txt ----------------------------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Group
-name = Runtime
-parent = $ROOT
diff --git a/runtime/Makefile b/runtime/Makefile
deleted file mode 100644
index d0e85d5..0000000
--- a/runtime/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-##===- runtime/Makefile ------------------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-LEVEL = ..
-include $(LEVEL)/Makefile.config
-
-ifndef NO_RUNTIME_LIBS
-
-PARALLEL_DIRS  := libprofile
-
-# Disable libprofile: a faulty libtool is generated by autoconf which breaks the
-# build on Sparc
-ifeq ($(ARCH), Sparc)
-PARALLEL_DIRS := $(filter-out libprofile, $(PARALLEL_DIRS))
-endif
-
-ifeq ($(TARGET_OS), $(filter $(TARGET_OS), Cygwin MingW Minix))
-PARALLEL_DIRS := $(filter-out libprofile, $(PARALLEL_DIRS))
-endif
-
-endif
-
-include $(LEVEL)/Makefile.common
-
-install::
diff --git a/runtime/README.txt b/runtime/README.txt
deleted file mode 100644
index 2e2e547..0000000
--- a/runtime/README.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-This directory contains the various runtime libraries used by components of 
-the LLVM compiler.  For example, the automatic pool allocation transformation
-inserts calls to an external pool allocator library.  This runtime library is
-an example of the type of library that lives in these directories.
diff --git a/runtime/libprofile/BasicBlockTracing.c b/runtime/libprofile/BasicBlockTracing.c
deleted file mode 100644
index 0815e2e..0000000
--- a/runtime/libprofile/BasicBlockTracing.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*===-- BasicBlockTracing.c - Support library for basic block tracing -----===*\
-|*
-|*                     The LLVM Compiler Infrastructure
-|*
-|* This file is distributed under the University of Illinois Open Source
-|* License. See LICENSE.TXT for details.
-|* 
-|*===----------------------------------------------------------------------===*|
-|* 
-|* This file implements the call back routines for the basic block tracing
-|* instrumentation pass.  This should be used with the -trace-basic-blocks
-|* LLVM pass.
-|*
-\*===----------------------------------------------------------------------===*/
-
-#include "Profiling.h"
-#include <stdlib.h>
-#include <stdio.h>
-
-static unsigned *ArrayStart, *ArrayEnd, *ArrayCursor;
-
-/* WriteAndFlushBBTraceData - write out the currently accumulated trace data
- * and reset the cursor to point to the beginning of the buffer.
- */
-static void WriteAndFlushBBTraceData () {
-  write_profiling_data(BBTraceInfo, ArrayStart, (ArrayCursor - ArrayStart));
-  ArrayCursor = ArrayStart;
-}
-
-/* BBTraceAtExitHandler - When the program exits, just write out any remaining 
- * data and free the trace buffer.
- */
-static void BBTraceAtExitHandler(void) {
-  WriteAndFlushBBTraceData ();
-  free (ArrayStart);
-}
-
-/* llvm_trace_basic_block - called upon hitting a new basic block. */
-void llvm_trace_basic_block (unsigned BBNum) {
-  *ArrayCursor++ = BBNum;
-  if (ArrayCursor == ArrayEnd)
-    WriteAndFlushBBTraceData ();
-}
-
-/* llvm_start_basic_block_tracing - This is the main entry point of the basic
- * block tracing library.  It is responsible for setting up the atexit
- * handler and allocating the trace buffer.
- */
-int llvm_start_basic_block_tracing(int argc, const char **argv,
-                              unsigned *arrayStart, unsigned numElements) {
-  int Ret;
-  const unsigned BufferSize = 128 * 1024;
-  unsigned ArraySize;
-
-  Ret = save_arguments(argc, argv);
-
-  /* Allocate a buffer to contain BB tracing data */
-  ArraySize = BufferSize / sizeof (unsigned);
-  ArrayStart = malloc (ArraySize * sizeof (unsigned));
-  ArrayEnd = ArrayStart + ArraySize;
-  ArrayCursor = ArrayStart;
-
-  /* Set up the atexit handler. */
-  atexit (BBTraceAtExitHandler);
-
-  return Ret;
-}
diff --git a/runtime/libprofile/CMakeLists.txt b/runtime/libprofile/CMakeLists.txt
deleted file mode 100644
index 9044f76..0000000
--- a/runtime/libprofile/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-set(SOURCES
-  BasicBlockTracing.c
-  CommonProfiling.c
-  PathProfiling.c
-  EdgeProfiling.c
-  OptimalEdgeProfiling.c
-  Profiling.h
-  )
-
-add_llvm_library( profile_rt-static ${SOURCES} )
-set_target_properties( profile_rt-static
-  PROPERTIES
-  OUTPUT_NAME "profile_rt" )
-
-set(BUILD_SHARED_LIBS ON)
-add_llvm_library( profile_rt-shared ${SOURCES} )
-set_target_properties( profile_rt-shared
-  PROPERTIES
-  OUTPUT_NAME "profile_rt" )
diff --git a/runtime/libprofile/CommonProfiling.c b/runtime/libprofile/CommonProfiling.c
deleted file mode 100644
index 8f4119c..0000000
--- a/runtime/libprofile/CommonProfiling.c
+++ /dev/null
@@ -1,173 +0,0 @@
-/*===-- CommonProfiling.c - Profiling support library support -------------===*\
-|*
-|*                     The LLVM Compiler Infrastructure
-|*
-|* This file is distributed under the University of Illinois Open Source
-|* License. See LICENSE.TXT for details.
-|*
-|*===----------------------------------------------------------------------===*|
-|*
-|* This file implements functions used by the various different types of
-|* profiling implementations.
-|*
-\*===----------------------------------------------------------------------===*/
-
-#include "Profiling.h"
-#include <assert.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <string.h>
-#if !defined(_MSC_VER) && !defined(__MINGW32__)
-#include <unistd.h>
-#else
-#include <io.h>
-#endif
-#include <stdlib.h>
-
-static char *SavedArgs = 0;
-static unsigned SavedArgsLength = 0;
-static const char *SavedEnvVar = 0;
-
-static const char *OutputFilename = "llvmprof.out";
-
-/* check_environment_variable - Check to see if the LLVMPROF_OUTPUT environment
- * variable is set.  If it is then save it and set OutputFilename.
- */
-static void check_environment_variable(void) {
-  const char *EnvVar;
-  if (SavedEnvVar) return; /* Guarantee that we can't leak memory. */
-
-  if ((EnvVar = getenv("LLVMPROF_OUTPUT")) != NULL) {
-    /* The string that getenv returns is allowed to be statically allocated,
-     * which means it may be changed by future calls to getenv, so copy it.
-     */
-    SavedEnvVar = strdup(EnvVar);
-    OutputFilename = SavedEnvVar;
-  }
-}
-
-/* save_arguments - Save argc and argv as passed into the program for the file
- * we output.
- * If either the LLVMPROF_OUTPUT environment variable or the -llvmprof-output
- * command line argument are set then change OutputFilename to the provided
- * value.  The command line argument value overrides the environment variable.
- */
-int save_arguments(int argc, const char **argv) {
-  unsigned Length, i;
-  if (!SavedEnvVar && !SavedArgs) check_environment_variable();
-  if (SavedArgs || !argv) return argc;  /* This can be called multiple times */
-
-  /* Check to see if there are any arguments passed into the program for the
-   * profiler.  If there are, strip them off and remember their settings.
-   */
-  while (argc > 1 && !strncmp(argv[1], "-llvmprof-", 10)) {
-    /* Ok, we have an llvmprof argument.  Remove it from the arg list and decide
-     * what to do with it.
-     */
-    const char *Arg = argv[1];
-    memmove((char**)&argv[1], &argv[2], (argc-1)*sizeof(char*));
-    --argc;
-
-    if (!strcmp(Arg, "-llvmprof-output")) {
-      if (argc == 1)
-        puts("-llvmprof-output requires a filename argument!");
-      else {
-        OutputFilename = strdup(argv[1]);
-        if (SavedEnvVar) { free((void *)SavedEnvVar); SavedEnvVar = 0; }
-        memmove((char**)&argv[1], &argv[2], (argc-1)*sizeof(char*));
-        --argc;
-      }
-    } else {
-      printf("Unknown option to the profiler runtime: '%s' - ignored.\n", Arg);
-    }
-  }
-
-  for (Length = 0, i = 0; i != (unsigned)argc; ++i)
-    Length += strlen(argv[i])+1;
-
-  /* Defensively check for a zero length, even though this is unlikely
-   * to happen in practice.  This avoids calling malloc() below with a
-   * size of 0.
-   */
-  if (Length == 0) {
-    SavedArgs = 0;
-    SavedArgsLength = 0;
-    return argc;
-  }
-  
-  SavedArgs = (char*)malloc(Length);
-  for (Length = 0, i = 0; i != (unsigned)argc; ++i) {
-    unsigned Len = strlen(argv[i]);
-    memcpy(SavedArgs+Length, argv[i], Len);
-    Length += Len;
-    SavedArgs[Length++] = ' ';
-  }
-
-  SavedArgsLength = Length;
-
-  return argc;
-}
-
-
-/*
- * Retrieves the file descriptor for the profile file.
- */
-int getOutFile() {
-  static int OutFile = -1;
-
-  /* If this is the first time this function is called, open the output file
-   * for appending, creating it if it does not already exist.
-   */
-  if (OutFile == -1) {
-    OutFile = open(OutputFilename, O_CREAT | O_WRONLY, 0666);
-    lseek(OutFile, 0, SEEK_END); /* O_APPEND prevents seeking */
-    if (OutFile == -1) {
-      fprintf(stderr, "LLVM profiling runtime: while opening '%s': ",
-              OutputFilename);
-      perror("");
-      return(OutFile);
-    }
-
-    /* Output the command line arguments to the file. */
-    {
-      int PTy = ArgumentInfo;
-      int Zeros = 0;
-      if (write(OutFile, &PTy, sizeof(int)) < 0 ||
-          write(OutFile, &SavedArgsLength, sizeof(unsigned)) < 0 ||
-          write(OutFile, SavedArgs, SavedArgsLength) < 0 ) {
-        fprintf(stderr,"error: unable to write to output file.");
-        exit(0);
-      }
-      /* Pad out to a multiple of four bytes */
-      if (SavedArgsLength & 3) {
-        if (write(OutFile, &Zeros, 4-(SavedArgsLength&3)) < 0) {
-          fprintf(stderr,"error: unable to write to output file.");
-          exit(0);
-        }
-      }
-    }
-  }
-  return(OutFile);
-}
-
-/* write_profiling_data - Write a raw block of profiling counters out to the
- * llvmprof.out file.  Note that we allow programs to be instrumented with
- * multiple different kinds of instrumentation.  For this reason, this function
- * may be called more than once.
- */
-void write_profiling_data(enum ProfilingType PT, unsigned *Start,
-                          unsigned NumElements) {
-  int PTy;
-  int outFile = getOutFile();
-
-  /* Write out this record! */
-  PTy = PT;
-  if( write(outFile, &PTy, sizeof(int)) < 0 ||
-      write(outFile, &NumElements, sizeof(unsigned)) < 0 ||
-      write(outFile, Start, NumElements*sizeof(unsigned)) < 0 ) {
-    fprintf(stderr,"error: unable to write to output file.");
-    exit(0);
-  }
-}
diff --git a/runtime/libprofile/EdgeProfiling.c b/runtime/libprofile/EdgeProfiling.c
deleted file mode 100644
index f19e188..0000000
--- a/runtime/libprofile/EdgeProfiling.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*===-- EdgeProfiling.c - Support library for edge profiling --------------===*\
-|*
-|*                     The LLVM Compiler Infrastructure
-|*
-|* This file is distributed under the University of Illinois Open Source      
-|* License. See LICENSE.TXT for details.                                      
-|* 
-|*===----------------------------------------------------------------------===*|
-|* 
-|* This file implements the call back routines for the edge profiling
-|* instrumentation pass.  This should be used with the -insert-edge-profiling
-|* LLVM pass.
-|*
-\*===----------------------------------------------------------------------===*/
-
-#include "Profiling.h"
-#include <stdlib.h>
-
-static unsigned *ArrayStart;
-static unsigned NumElements;
-
-/* EdgeProfAtExitHandler - When the program exits, just write out the profiling
- * data.
- */
-static void EdgeProfAtExitHandler(void) {
-  /* Note that if this were doing something more intelligent with the
-   * instrumentation, we could do some computation here to expand what we
-   * collected into simple edge profiles.  Since we directly count each edge, we
-   * just write out all of the counters directly.
-   */
-  write_profiling_data(EdgeInfo, ArrayStart, NumElements);
-}
-
-
-/* llvm_start_edge_profiling - This is the main entry point of the edge
- * profiling library.  It is responsible for setting up the atexit handler.
- */
-int llvm_start_edge_profiling(int argc, const char **argv,
-                              unsigned *arrayStart, unsigned numElements) {
-  int Ret = save_arguments(argc, argv);
-  ArrayStart = arrayStart;
-  NumElements = numElements;
-  atexit(EdgeProfAtExitHandler);
-  return Ret;
-}
diff --git a/runtime/libprofile/Makefile b/runtime/libprofile/Makefile
deleted file mode 100644
index 2f061ad..0000000
--- a/runtime/libprofile/Makefile
+++ /dev/null
@@ -1,52 +0,0 @@
-##===- runtime/libprofile/Makefile -------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-include $(LEVEL)/Makefile.config
-
-LIBRARYNAME = profile_rt
-LINK_LIBS_IN_SHARED = 1
-SHARED_LIBRARY = 1
-
-# Build and install this archive.                                                                                                                  
-BUILD_ARCHIVE = 1
-override NO_INSTALL_ARCHIVES =
-
-include $(LEVEL)/Makefile.common
-
-ifeq ($(HOST_OS),Darwin)
-    # Special hack to allow libprofile_rt to have an offset version number.
-    PROFILE_RT_LIBRARY_VERSION := $(LLVM_SUBMIT_VERSION)
-
-    # Set dylib internal version number to llvmCore submission number.
-    ifdef LLVM_SUBMIT_VERSION
-        LLVMLibsOptions := $(LLVMLibsOptions) -Wl,-current_version \
-                        -Wl,$(PROFILE_RT_LIBRARY_VERSION).$(LLVM_SUBMIT_SUBVERSION) \
-                        -Wl,-compatibility_version -Wl,1
-    endif
-    # Extra options to override libtool defaults.
-    LLVMLibsOptions    := $(LLVMLibsOptions)  \
-                         -Wl,-dead_strip
-
-    # Mac OS X 10.4 and earlier tools do not allow a second -install_name on
-    # command line.
-    DARWIN_VERS := $(shell echo $(TARGET_TRIPLE) | sed 's/.*darwin\([0-9]*\).*/\1/')
-    ifneq ($(DARWIN_VERS),8)
-       LLVMLibsOptions    := $(LLVMLibsOptions) \
-                            -Wl,-install_name \
-                            -Wl,"@rpath/lib$(LIBRARYNAME)$(SHLIBEXT)"
-    endif
-
-    # If we're doing an Apple-style build, add the LTO object path.
-    ifeq ($(RC_XBS),YES)
-       TempFile           := $(shell mkdir -p ${OBJROOT}/dSYMs ; mktemp ${OBJROOT}/dSYMs/profile_rt-lto.XXXXXX)
-       LLVMLibsOptions    := $(LLVMLibsOptions) \
-                             -Wl,-object_path_lto -Wl,$(TempFile)
-    endif
-endif
diff --git a/runtime/libprofile/OptimalEdgeProfiling.c b/runtime/libprofile/OptimalEdgeProfiling.c
deleted file mode 100644
index 3a7631b..0000000
--- a/runtime/libprofile/OptimalEdgeProfiling.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*===-- OptimalEdgeProfiling.c - Support library for opt. edge profiling --===*\
-|*
-|*                     The LLVM Compiler Infrastructure
-|*
-|* This file is distributed under the University of Illinois Open Source      
-|* License. See LICENSE.TXT for details.                                      
-|* 
-|*===----------------------------------------------------------------------===*|
-|* 
-|* This file implements the call back routines for the edge profiling
-|* instrumentation pass.  This should be used with the
-|* -insert-opt-edge-profiling LLVM pass.
-|*
-\*===----------------------------------------------------------------------===*/
-
-#include "Profiling.h"
-#include <stdlib.h>
-
-static unsigned *ArrayStart;
-static unsigned NumElements;
-
-/* OptEdgeProfAtExitHandler - When the program exits, just write out the
- * profiling data.
- */
-static void OptEdgeProfAtExitHandler(void) {
-  /* Note that, although the array has a counter for each edge, not all
-   * counters are updated, the ones that are not used are initialised with -1.
-   * When loading this information the counters with value -1 have to be
-   * recalculated, it is guaranteed that this is possible.
-   */
-  write_profiling_data(OptEdgeInfo, ArrayStart, NumElements);
-}
-
-
-/* llvm_start_opt_edge_profiling - This is the main entry point of the edge
- * profiling library.  It is responsible for setting up the atexit handler.
- */
-int llvm_start_opt_edge_profiling(int argc, const char **argv,
-                                  unsigned *arrayStart, unsigned numElements) {
-  int Ret = save_arguments(argc, argv);
-  ArrayStart = arrayStart;
-  NumElements = numElements;
-  atexit(OptEdgeProfAtExitHandler);
-  return Ret;
-}
diff --git a/runtime/libprofile/PathProfiling.c b/runtime/libprofile/PathProfiling.c
deleted file mode 100644
index 71ee944..0000000
--- a/runtime/libprofile/PathProfiling.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*===-- PathProfiling.c - Support library for path profiling --------------===*\
-|*
-|*                     The LLVM Compiler Infrastructure
-|*
-|* This file is distributed under the University of Illinois Open Source
-|* License. See LICENSE.TXT for details.
-|*
-|*===----------------------------------------------------------------------===*|
-|*
-|* This file implements the call back routines for the path profiling
-|* instrumentation pass.  This should be used with the -insert-path-profiling
-|* LLVM pass.
-|*
-\*===----------------------------------------------------------------------===*/
-
-#include "Profiling.h"
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/Support/DataTypes.h"
-#include <sys/types.h>
-#if !defined(_MSC_VER) && !defined(__MINGW32__)
-#include <unistd.h>
-#else
-#include <io.h>
-#endif
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-/* note that this is used for functions with large path counts,
-         but it is unlikely those paths will ALL be executed */
-#define ARBITRARY_HASH_BIN_COUNT 100
-
-typedef struct pathHashEntry_s {
-  uint32_t pathNumber;
-  uint32_t pathCount;
-  struct pathHashEntry_s* next;
-} pathHashEntry_t;
-
-typedef struct pathHashTable_s {
-  pathHashEntry_t* hashBins[ARBITRARY_HASH_BIN_COUNT];
-  uint32_t pathCounts;
-} pathHashTable_t;
-
-typedef struct {
-  enum ProfilingStorageType type;
-  uint32_t size;
-  void* array;
-} ftEntry_t;
-
-/* pointer to the function table allocated in the instrumented program */
-ftEntry_t* ft;
-uint32_t ftSize;
-
-/* write an array table to file */
-void writeArrayTable(uint32_t fNumber, ftEntry_t* ft, uint32_t* funcCount) {
-  int outFile = getOutFile();
-  uint32_t arrayHeaderLocation = 0;
-  uint32_t arrayCurrentLocation = 0;
-  uint32_t arrayIterator = 0;
-  uint32_t functionUsed = 0;
-  uint32_t pathCounts = 0;
-
-  /* look through each entry in the array to determine whether the function
-     was executed at all */
-  for( arrayIterator = 0; arrayIterator < ft->size; arrayIterator++ ) {
-    uint32_t pc = ((uint32_t*)ft->array)[arrayIterator];
-
-    /* was this path executed? */
-    if( pc ) {
-      PathProfileTableEntry pte;
-      pte.pathNumber = arrayIterator;
-      pte.pathCounter = pc;
-      pathCounts++;
-
-      /* one-time initialization stuff */
-      if(!functionUsed) {
-        arrayHeaderLocation = lseek(outFile, 0, SEEK_CUR);
-        lseek(outFile, sizeof(PathProfileHeader), SEEK_CUR);
-        functionUsed = 1;
-        (*funcCount)++;
-      }
-
-      /* write path data */
-      if (write(outFile, &pte, sizeof(PathProfileTableEntry)) < 0) {
-        fprintf(stderr, "error: unable to write path entry to output file.\n");
-        return;
-      }
-    }
-  }
-
-  /* If this function was executed, write the header */
-  if( functionUsed ) {
-    PathProfileHeader fHeader;
-    fHeader.fnNumber = fNumber;
-    fHeader.numEntries = pathCounts;
-
-    arrayCurrentLocation = lseek(outFile, 0, SEEK_CUR);
-    lseek(outFile, arrayHeaderLocation, SEEK_SET);
-
-    if (write(outFile, &fHeader, sizeof(PathProfileHeader)) < 0) {
-      fprintf(stderr,
-              "error: unable to write function header to output file.\n");
-      return;
-    }
-
-    lseek(outFile, arrayCurrentLocation, SEEK_SET);
-  }
-}
-
-static uint32_t hash (uint32_t key) {
-  /* this may benefit from a proper hash function */
-  return key%ARBITRARY_HASH_BIN_COUNT;
-}
-
-/* output a specific function's hash table to the profile file */
-void writeHashTable(uint32_t functionNumber, pathHashTable_t* hashTable) {
-  int outFile = getOutFile();
-  PathProfileHeader header;
-  uint32_t i;
-
-  header.fnNumber = functionNumber;
-  header.numEntries = hashTable->pathCounts;
-
-  if (write(outFile, &header, sizeof(PathProfileHeader)) < 0) {
-    fprintf(stderr, "error: unable to write function header to output file.\n");
-    return;
-  }
-
-  for (i = 0; i < ARBITRARY_HASH_BIN_COUNT; i++) {
-    pathHashEntry_t* hashEntry = hashTable->hashBins[i];
-
-    while (hashEntry) {
-      pathHashEntry_t* temp;
-
-      PathProfileTableEntry pte;
-      pte.pathNumber = hashEntry->pathNumber;
-      pte.pathCounter = hashEntry->pathCount;
-
-      if (write(outFile, &pte, sizeof(PathProfileTableEntry)) < 0) {
-        fprintf(stderr, "error: unable to write path entry to output file.\n");
-        return;
-      }
-
-      temp = hashEntry;
-      hashEntry = hashEntry->next;
-      free (temp);
-
-    }
-  }
-}
-
-/* Return a pointer to this path's specific path counter */
-static uint32_t* getPathCounter(uint32_t functionNumber,
-                                       uint32_t pathNumber) {
-  pathHashTable_t* hashTable;
-  pathHashEntry_t* hashEntry;
-  uint32_t index = hash(pathNumber);
-
-  if( ft[functionNumber-1].array == 0)
-    ft[functionNumber-1].array = calloc(sizeof(pathHashTable_t), 1);
-
-  hashTable = (pathHashTable_t*)((ftEntry_t*)ft)[functionNumber-1].array;
-  hashEntry = hashTable->hashBins[index];
-
-  while (hashEntry) {
-    if (hashEntry->pathNumber == pathNumber) {
-      return &hashEntry->pathCount;
-    }
-
-    hashEntry = hashEntry->next;
-  }
-
-  hashEntry = malloc(sizeof(pathHashEntry_t));
-  hashEntry->pathNumber = pathNumber;
-  hashEntry->pathCount = 0;
-  hashEntry->next = hashTable->hashBins[index];
-  hashTable->hashBins[index] = hashEntry;
-  hashTable->pathCounts++;
-  return &hashEntry->pathCount;
-}
-
-/* Increment a specific path's count */
-void llvm_increment_path_count (uint32_t functionNumber, uint32_t pathNumber) {
-  uint32_t* pathCounter = getPathCounter(functionNumber, pathNumber);
-  if( *pathCounter < 0xffffffff )
-    (*pathCounter)++;
-}
-
-/* Increment a specific path's count */
-void llvm_decrement_path_count (uint32_t functionNumber, uint32_t pathNumber) {
-  uint32_t* pathCounter = getPathCounter(functionNumber, pathNumber);
-  (*pathCounter)--;
-}
-
-/*
- * Writes out a path profile given a function table, in the following format.
- *
- *
- *      | <-- 32 bits --> |
- *      +-----------------+-----------------+
- * 0x00 | profileType     | functionCount   |
- *      +-----------------+-----------------+
- * 0x08 | functionNum     | profileEntries  |  // function 1
- *      +-----------------+-----------------+
- * 0x10 | pathNumber      | pathCounter     |  // entry 1.1
- *      +-----------------+-----------------+
- * 0x18 | pathNumber      | pathCounter     |  // entry 1.2
- *      +-----------------+-----------------+
- *  ... |       ...       |       ...       |  // entry 1.n
- *      +-----------------+-----------------+
- *  ... | functionNum     | profileEntries  |  // function 2
- *      +-----------------+-----------------+
- *  ... | pathNumber      | pathCounter     |  // entry 2.1
- *      +-----------------+-----------------+
- *  ... | pathNumber      | pathCounter     |  // entry 2.2
- *      +-----------------+-----------------+
- *  ... |       ...       |       ...       |  // entry 2.n
- *      +-----------------+-----------------+
- *
- */
-static void pathProfAtExitHandler(void) {
-  int outFile = getOutFile();
-  uint32_t i;
-  uint32_t header[2] = { PathInfo, 0 };
-  uint32_t headerLocation;
-  uint32_t currentLocation;
-
-  /* skip over the header for now */
-  headerLocation = lseek(outFile, 0, SEEK_CUR);
-  lseek(outFile, 2*sizeof(uint32_t), SEEK_CUR);
-
-  /* Iterate through each function */
-  for( i = 0; i < ftSize; i++ ) {
-    if( ft[i].type == ProfilingArray ) {
-      writeArrayTable(i+1,&ft[i],header + 1);
-
-    } else if( ft[i].type == ProfilingHash ) {
-      /* If the hash exists, write it to file */
-      if( ft[i].array ) {
-        writeHashTable(i+1,ft[i].array);
-        header[1]++;
-        free(ft[i].array);
-      }
-    }
-  }
-
-  /* Setup and write the path profile header */
-  currentLocation = lseek(outFile, 0, SEEK_CUR);
-  lseek(outFile, headerLocation, SEEK_SET);
-
-  if (write(outFile, header, sizeof(header)) < 0) {
-    fprintf(stderr,
-            "error: unable to write path profile header to output file.\n");
-    return;
-  }
-
-  lseek(outFile, currentLocation, SEEK_SET);
-}
-/* llvm_start_path_profiling - This is the main entry point of the path
- * profiling library.  It is responsible for setting up the atexit handler.
- */
-int llvm_start_path_profiling(int argc, const char** argv,
-                              void* functionTable, uint32_t numElements) {
-  int Ret = save_arguments(argc, argv);
-  ft = functionTable;
-  ftSize = numElements;
-  atexit(pathProfAtExitHandler);
-
-  return Ret;
-}
diff --git a/runtime/libprofile/Profiling.h b/runtime/libprofile/Profiling.h
deleted file mode 100644
index acc6399..0000000
--- a/runtime/libprofile/Profiling.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*===-- Profiling.h - Profiling support library support routines ----------===*\
-|*
-|*                     The LLVM Compiler Infrastructure
-|*
-|* This file is distributed under the University of Illinois Open Source
-|* License. See LICENSE.TXT for details.
-|*
-|*===----------------------------------------------------------------------===*|
-|*
-|* This file defines functions shared by the various different profiling
-|* implementations.
-|*
-\*===----------------------------------------------------------------------===*/
-
-#ifndef PROFILING_H
-#define PROFILING_H
-
-#include "llvm/Analysis/ProfileDataTypes.h" /* for enum ProfilingType */
-
-/* save_arguments - Save argc and argv as passed into the program for the file
- * we output.
- */
-int save_arguments(int argc, const char **argv);
-
-/*
- * Retrieves the file descriptor for the profile file.
- */
-int getOutFile();
-
-/* write_profiling_data - Write out a typed packet of profiling data to the
- * current output file.
- */
-void write_profiling_data(enum ProfilingType PT, unsigned *Start,
-                          unsigned NumElements);
-
-#endif
diff --git a/test/.clang-format b/test/.clang-format
new file mode 100644
index 0000000..4799b66
--- /dev/null
+++ b/test/.clang-format
@@ -0,0 +1,2 @@
+BasedOnStyle: LLVM
+ColumnLimit: 0
diff --git a/test/Analysis/BasicAA/full-store-partial-alias.ll b/test/Analysis/BasicAA/full-store-partial-alias.ll
index 2c34fd5..4de2daf 100644
--- a/test/Analysis/BasicAA/full-store-partial-alias.ll
+++ b/test/Analysis/BasicAA/full-store-partial-alias.ll
@@ -29,7 +29,9 @@ entry:
   ret i32 %tmp5.lobit
 }
 
-!0 = metadata !{metadata !"double", metadata !1}
+!0 = metadata !{metadata !4, metadata !4, i64 0}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"int", metadata !1}
+!3 = metadata !{metadata !5, metadata !5, i64 0}
+!4 = metadata !{metadata !"double", metadata !1}
+!5 = metadata !{metadata !"int", metadata !1}
diff --git a/test/Analysis/BasicAA/gep-alias.ll b/test/Analysis/BasicAA/gep-alias.ll
index 9c2c7ee..2c0d467 100644
--- a/test/Analysis/BasicAA/gep-alias.ll
+++ b/test/Analysis/BasicAA/gep-alias.ll
@@ -1,6 +1,6 @@
 ; RUN: opt < %s -basicaa -gvn -instcombine -S 2>&1 | FileCheck %s
 
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target datalayout = "e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 
 ; Make sure that basicaa thinks R and r are must aliases.
 define i32 @test1(i8 * %P) {
@@ -15,7 +15,7 @@ entry:
 
 	%t = sub i32 %S, %s
 	ret i32 %t
-; CHECK: @test1
+; CHECK-LABEL: @test1(
 ; CHECK: ret i32 0
 }
 
@@ -32,7 +32,7 @@ entry:
 
 	%t = sub i32 %S, %s
 	ret i32 %t
-; CHECK: @test2
+; CHECK-LABEL: @test2(
 ; CHECK: ret i32 0
 }
 
@@ -51,7 +51,7 @@ entry:
 
 	%t = sub i32 %S, %s
 	ret i32 %t
-; CHECK: @test3
+; CHECK-LABEL: @test3(
 ; CHECK: ret i32 0
 }
 
@@ -68,7 +68,7 @@ entry:
   store i8* null, i8** %tmp3, align 8
   %tmp4 = load i32* %tmp2, align 8
 	ret i32 %tmp4
-; CHECK: @test4
+; CHECK-LABEL: @test4(
 ; CHECK: ret i32 64
 }
 
@@ -82,7 +82,34 @@ define i32 @test5(i32* %p, i64 %i) {
   %y = load i32* %pi
   %z = sub i32 %x, %y
   ret i32 %z
-; CHECK: @test5
+; CHECK-LABEL: @test5(
+; CHECK: ret i32 0
+}
+
+define i32 @test5_as1_smaller_size(i32 addrspace(1)* %p, i8 %i) {
+  %pi = getelementptr i32 addrspace(1)* %p, i8 %i
+  %i.next = add i8 %i, 1
+  %pi.next = getelementptr i32 addrspace(1)* %p, i8 %i.next
+  %x = load i32 addrspace(1)* %pi
+  store i32 42, i32 addrspace(1)* %pi.next
+  %y = load i32 addrspace(1)* %pi
+  %z = sub i32 %x, %y
+  ret i32 %z
+; CHECK-LABEL: @test5_as1_smaller_size(
+; CHECK: sext
+; CHECK: ret i32 0
+}
+
+define i32 @test5_as1_same_size(i32 addrspace(1)* %p, i16 %i) {
+  %pi = getelementptr i32 addrspace(1)* %p, i16 %i
+  %i.next = add i16 %i, 1
+  %pi.next = getelementptr i32 addrspace(1)* %p, i16 %i.next
+  %x = load i32 addrspace(1)* %pi
+  store i32 42, i32 addrspace(1)* %pi.next
+  %y = load i32 addrspace(1)* %pi
+  %z = sub i32 %x, %y
+  ret i32 %z
+; CHECK-LABEL: @test5_as1_same_size(
 ; CHECK: ret i32 0
 }
 
@@ -97,7 +124,7 @@ define i32 @test6(i32* %p, i64 %i1) {
   %y = load i32* %pi
   %z = sub i32 %x, %y
   ret i32 %z
-; CHECK: @test6
+; CHECK-LABEL: @test6(
 ; CHECK: ret i32 0
 }
 
@@ -111,7 +138,7 @@ define i32 @test7(i32* %p, i64 %i) {
   %y = load i32* %pi
   %z = sub i32 %x, %y
   ret i32 %z
-; CHECK: @test7
+; CHECK-LABEL: @test7(
 ; CHECK: ret i32 0
 }
 
@@ -128,7 +155,7 @@ define i32 @test8(i32* %p, i16 %i) {
   %y = load i32* %pi
   %z = sub i32 %x, %y
   ret i32 %z
-; CHECK: @test8
+; CHECK-LABEL: @test8(
 ; CHECK: ret i32 0
 }
 
@@ -139,7 +166,7 @@ define i8 @test9([4 x i8] *%P, i32 %i, i32 %j) {
   %P2 = getelementptr [4 x i8] *%P, i32 0, i32 %i3
 
   %j2 = shl i32 %j, 2
-  
+
   ; P4 = P + 4*j
   %P4 = getelementptr [4 x i8]* %P, i32 0, i32 %j2
 
@@ -148,7 +175,7 @@ define i8 @test9([4 x i8] *%P, i32 %i, i32 %j) {
   %y = load i8* %P2
   %z = sub i8 %x, %y
   ret i8 %z
-; CHECK: @test9
+; CHECK-LABEL: @test9(
 ; CHECK: ret i8 0
 }
 
@@ -157,7 +184,7 @@ define i8 @test10([4 x i8] *%P, i32 %i) {
   %i3 = add i32 %i2, 4
   ; P2 = P + 4 + 4*i
   %P2 = getelementptr [4 x i8] *%P, i32 0, i32 %i3
-  
+
   ; P4 = P + 4*i
   %P4 = getelementptr [4 x i8]* %P, i32 0, i32 %i2
 
@@ -166,7 +193,7 @@ define i8 @test10([4 x i8] *%P, i32 %i) {
   %y = load i8* %P2
   %z = sub i8 %x, %y
   ret i8 %z
-; CHECK: @test10
+; CHECK-LABEL: @test10(
 ; CHECK: ret i8 0
 }
 
@@ -182,8 +209,8 @@ define float @test11(i32 %indvar, [4 x [2 x float]]* %q) nounwind ssp {
   store i64 0, i64* %scevgep35, align 4
   %tmp30 = load float* %y29, align 4
   ret float %tmp30
-  ; CHECK: @test11
-  ; CHECK: ret float %tmp30
+; CHECK-LABEL: @test11(
+; CHECK: ret float %tmp30
 }
 
 ; (This was a miscompilation.)
@@ -198,6 +225,6 @@ define i32 @test12(i32 %x, i32 %y, i8* %p) nounwind {
   store i32 0, i32* %castd
   %r = load i32* %castp
   ret i32 %r
-  ; CHECK: @test12
-  ; CHECK: ret i32 %r
+; CHECK-LABEL: @test12(
+; CHECK: ret i32 %r
 }
diff --git a/test/Analysis/BasicAA/global-size.ll b/test/Analysis/BasicAA/global-size.ll
index a7e5aab..f081cb1 100644
--- a/test/Analysis/BasicAA/global-size.ll
+++ b/test/Analysis/BasicAA/global-size.ll
@@ -2,11 +2,11 @@
 ; the global.
 
 ; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+target datalayout = "E-p:64:64:64-p1:16:16:16-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
 @B = global i16 8
 
-; CHECK: @test1
+; CHECK-LABEL: @test1(
 define i16 @test1(i32* %P) {
         %X = load i16* @B
         store i32 7, i32* %P
@@ -16,11 +16,23 @@ define i16 @test1(i32* %P) {
 ; CHECK: ret i16 0
 }
 
+@B_as1 = addrspace(1) global i16 8
+
+define i16 @test1_as1(i32 addrspace(1)* %P) {
+; CHECK-LABEL: @test1_as1(
+; CHECK: ret i16 0
+  %X = load i16 addrspace(1)* @B_as1
+  store i32 7, i32 addrspace(1)* %P
+  %Y = load i16 addrspace(1)* @B_as1
+  %Z = sub i16 %Y, %X
+  ret i16 %Z
+}
+
 ; Cannot know anything about the size of this global.
 ; rdar://8813415
 @window = external global [0 x i8]
 
-; CHECK: @test2
+; CHECK-LABEL: @test2(
 define i8 @test2(i32 %tmp79, i32 %w.2, i32 %indvar89) nounwind {
   %tmp92 = add i32 %tmp79, %indvar89
   %arrayidx412 = getelementptr [0 x i8]* @window, i32 0, i32 %tmp92
diff --git a/test/Analysis/BasicAA/noalias-geps.ll b/test/Analysis/BasicAA/noalias-geps.ll
index a93d778..f9ec713 100644
--- a/test/Analysis/BasicAA/noalias-geps.ll
+++ b/test/Analysis/BasicAA/noalias-geps.ll
@@ -4,6 +4,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 
 ; Check that geps with equal base offsets of noalias base pointers stay noalias.
 define i32 @test(i32* %p, i16 %i) {
+; CHECK-LABEL: Function: test:
   %pi = getelementptr i32* %p, i32 0
   %pi.next = getelementptr i32* %p, i32 1
   %b = icmp eq i16 %i, 0
@@ -30,6 +31,7 @@ ret i32 0
 
 ; Check that geps with equal indices of noalias base pointers stay noalias.
 define i32 @test2([2 x i32]* %p, i32 %i) {
+; CHECK-LABEL: Function: test2:
   %pi = getelementptr [2 x i32]* %p, i32 0
   %pi.next = getelementptr [2 x i32]* %p, i32 1
   %b = icmp eq i32 %i, 0
diff --git a/test/Analysis/BlockFrequencyInfo/lit.local.cfg b/test/Analysis/BlockFrequencyInfo/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Analysis/BlockFrequencyInfo/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll
index c6e1bde..05cb31d 100644
--- a/test/Analysis/BranchProbabilityInfo/basic.ll
+++ b/test/Analysis/BranchProbabilityInfo/basic.ll
@@ -173,3 +173,42 @@ exit:
   %ret = phi i32 [ %val4, %then ], [ %val3, %else ]
   ret i32 %ret
 }
+
+define i32 @zero1(i32 %i, i32 %a, i32 %b) {
+; CHECK: Printing analysis {{.*}} for function 'zero1'
+entry:
+  %cond = icmp eq i32 %i, 0
+  br i1 %cond, label %then, label %else
+; CHECK: edge entry -> then probability is 12 / 32
+; CHECK: edge entry -> else probability is 20 / 32
+
+then:
+  br label %exit
+
+else:
+  br label %exit
+
+exit:
+  %result = phi i32 [ %a, %then ], [ %b, %else ]
+  ret i32 %result
+}
+
+define i32 @zero2(i32 %i, i32 %a, i32 %b) {
+; CHECK: Printing analysis {{.*}} for function 'zero2'
+entry:
+  %cond = icmp ne i32 %i, -1
+  br i1 %cond, label %then, label %else
+; CHECK: edge entry -> then probability is 20 / 32
+; CHECK: edge entry -> else probability is 12 / 32
+
+then:
+  br label %exit
+
+else:
+  br label %exit
+
+exit:
+  %result = phi i32 [ %a, %then ], [ %b, %else ]
+  ret i32 %result
+}
+
diff --git a/test/Analysis/BranchProbabilityInfo/lit.local.cfg b/test/Analysis/BranchProbabilityInfo/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Analysis/BranchProbabilityInfo/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Analysis/CallGraph/lit.local.cfg b/test/Analysis/CallGraph/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Analysis/CallGraph/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Analysis/CostModel/ARM/lit.local.cfg b/test/Analysis/CostModel/ARM/lit.local.cfg
index cb77b09..8a3ba96 100644
--- a/test/Analysis/CostModel/ARM/lit.local.cfg
+++ b/test/Analysis/CostModel/ARM/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'ARM' in targets:
     config.unsupported = True
diff --git a/test/Analysis/CostModel/PowerPC/lit.local.cfg b/test/Analysis/CostModel/PowerPC/lit.local.cfg
index 4019eca..2e46300 100644
--- a/test/Analysis/CostModel/PowerPC/lit.local.cfg
+++ b/test/Analysis/CostModel/PowerPC/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'PowerPC' in targets:
     config.unsupported = True
diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll
index b69b3bf..f3c1283 100644
--- a/test/Analysis/CostModel/X86/cast.ll
+++ b/test/Analysis/CostModel/X86/cast.ll
@@ -38,6 +38,10 @@ define i32 @zext_sext(<8 x i1> %in) {
   ;CHECK: cost of 9 {{.*}} sext
   %S = sext <8 x i1> %in to <8 x i32>
 
+  ;CHECK: cost of 1 {{.*}} zext
+  %A1 = zext <16 x i8> undef to <16 x i16>
+  ;CHECK: cost of 1 {{.*}} sext
+  %A2 = sext <16 x i8> undef to <16 x i16>
   ;CHECK: cost of 1 {{.*}} sext
   %A = sext <8 x i16> undef to <8 x i32>
   ;CHECK: cost of 1 {{.*}} zext
@@ -51,11 +55,13 @@ define i32 @zext_sext(<8 x i1> %in) {
 
   ;CHECK: cost of 1 {{.*}} zext
   %D = zext <4 x i32> undef to <4 x i64>
-  ;CHECK: cost of 1 {{.*}} trunc
 
+  ;CHECK: cost of 1 {{.*}} trunc
   %E = trunc <4 x i64> undef to <4 x i32>
   ;CHECK: cost of 1 {{.*}} trunc
   %F = trunc <8 x i32> undef to <8 x i16>
+  ;CHECK: cost of 2 {{.*}} trunc
+  %F1 = trunc <16 x i16> undef to <16 x i8>
 
   ;CHECK: cost of 3 {{.*}} trunc
   %G = trunc <8 x i64> undef to <8 x i32>
diff --git a/test/Analysis/CostModel/X86/lit.local.cfg b/test/Analysis/CostModel/X86/lit.local.cfg
index a8ad0f1..ba763cf 100644
--- a/test/Analysis/CostModel/X86/lit.local.cfg
+++ b/test/Analysis/CostModel/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll
new file mode 100644
index 0000000..78e65ae
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduction.ll
@@ -0,0 +1,365 @@
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2
+
+define fastcc float @reduction_cost_float(<4 x float> %rdx) {
+  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+
+; Check that we recognize the tree starting at the extractelement as a
+; reduction.
+; CHECK-LABEL: reduction_cost
+; CHECK:  cost of 9 {{.*}} extractelement
+
+  %r = extractelement <4 x float> %bin.rdx8, i32 0
+  ret float %r
+}
+
+define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
+  %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
+   <8 x i32> <i32 4    , i32     5, i32     6, i32     7,
+              i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
+  %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
+   <8 x i32> <i32 2    , i32 3,     i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
+  %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
+   <8 x i32> <i32 1    , i32 undef, i32 undef, i32 undef,
+              i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
+
+; CHECK-LABEL: reduction_cost_int
+; CHECK:  cost of 23 {{.*}} extractelement
+
+  %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+  ret i32 %r
+}
+
+define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
+  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+        <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; CHECK-LABEL: pairwise_hadd
+; CHECK: cost of 11 {{.*}} extractelement
+
+  %r = extractelement <4 x float> %bin.rdx.1, i32 0
+  %r2 = fadd float %r, %f1
+  ret float %r2
+}
+
+define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
+  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+        <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
+  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; CHECK-LABEL: pairwise_hadd_assoc
+; CHECK: cost of 11 {{.*}} extractelement
+
+  %r = extractelement <4 x float> %bin.rdx.1, i32 0
+  %r2 = fadd float %r, %f1
+  ret float %r2
+}
+
+define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
+  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+        <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
+
+; CHECK-LABEL: pairwise_hadd_skip_first
+; CHECK: cost of 11 {{.*}} extractelement
+
+  %r = extractelement <4 x float> %bin.rdx.1, i32 0
+  %r2 = fadd float %r, %f1
+  ret float %r2
+}
+
+define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
+  %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
+
+; SSE3:  cost of 2 {{.*}} extractelement
+; AVX:  cost of 2 {{.*}} extractelement
+; AVX2:  cost of 2 {{.*}} extractelement
+
+  %r = extractelement <2 x double> %bin.rdx, i32 0
+  ret double %r
+}
+
+define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
+  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+
+; SSE3:  cost of 4 {{.*}} extractelement
+; AVX:  cost of 3 {{.*}} extractelement
+; AVX2:  cost of 3 {{.*}} extractelement
+
+  %r = extractelement <4 x float> %bin.rdx8, i32 0
+  ret float %r
+}
+
+define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
+  %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
+  %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
+
+; AVX:  cost of 3 {{.*}} extractelement
+; AVX2:  cost of 3 {{.*}} extractelement
+
+  %r = extractelement <4 x double> %bin.rdx8, i32 0
+  ret double %r
+}
+
+define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
+  %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
+  %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
+  %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
+
+; AVX:  cost of 4 {{.*}} extractelement
+; AVX2:  cost of 4 {{.*}} extractelement
+
+  %r = extractelement <8 x float> %bin.rdx8, i32 0
+  ret float %r
+}
+
+define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
+  %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %bin.rdx = add <2 x i64> %rdx, %rdx.shuf
+
+; SSE3:  cost of 2 {{.*}} extractelement
+; AVX:  cost of 1 {{.*}} extractelement
+; AVX2:  cost of 1 {{.*}} extractelement
+
+  %r = extractelement <2 x i64> %bin.rdx, i32 0
+  ret i64 %r
+}
+
+define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
+  %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
+  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
+
+; SSE3:  cost of 3 {{.*}} extractelement
+; AVX:  cost of 3 {{.*}} extractelement
+; AVX2:  cost of 3 {{.*}} extractelement
+
+  %r = extractelement <4 x i32> %bin.rdx8, i32 0
+  ret i32 %r
+}
+
+define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
+  %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
+  %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
+
+; AVX:  cost of 3 {{.*}} extractelement
+; AVX2:  cost of 3 {{.*}} extractelement
+
+  %r = extractelement <4 x i64> %bin.rdx8, i32 0
+  ret i64 %r
+}
+
+define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
+  %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
+  %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
+  %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
+
+; SSE3:  cost of 4 {{.*}} extractelement
+; AVX:  cost of 4 {{.*}} extractelement
+; AVX2:  cost of 4 {{.*}} extractelement
+
+  %r = extractelement <8 x i16> %bin.rdx8, i32 0
+  ret i16 %r
+}
+
+define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
+  %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
+  %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
+  %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
+
+; AVX:  cost of 5 {{.*}} extractelement
+; AVX2:  cost of 5 {{.*}} extractelement
+
+  %r = extractelement <8 x i32> %bin.rdx8, i32 0
+  ret i32 %r
+}
+
+define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
+  %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; SSE3:  cost of 2 {{.*}} extractelement
+; AVX:  cost of 2 {{.*}} extractelement
+; AVX2:  cost of 2 {{.*}} extractelement
+
+  %r = extractelement <2 x double> %bin.rdx8, i32 0
+  ret double %r
+}
+
+define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
+  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; SSE3:  cost of 4 {{.*}} extractelement
+; AVX:  cost of 4 {{.*}} extractelement
+; AVX2:  cost of 4 {{.*}} extractelement
+
+  %r = extractelement <4 x float> %bin.rdx8, i32 0
+  ret float %r
+}
+
+define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
+  %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
+  %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; AVX:  cost of 5 {{.*}} extractelement
+; AVX2:  cost of 5 {{.*}} extractelement
+
+  %r = extractelement <4 x double> %bin.rdx8, i32 0
+  ret double %r
+}
+
+define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
+  %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+  %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+  %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
+
+; AVX:  cost of 7 {{.*}} extractelement
+; AVX2:  cost of 7 {{.*}} extractelement
+
+  %r = extractelement <8 x float> %bin.rdx9, i32 0
+  ret float %r
+}
+
+define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
+  %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; SSE3:  cost of 2 {{.*}} extractelement
+; AVX:  cost of 1 {{.*}} extractelement
+; AVX2:  cost of 1 {{.*}} extractelement
+
+  %r = extractelement <2 x i64> %bin.rdx8, i32 0
+  ret i64 %r
+}
+
+define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
+  %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
+  %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; SSE3:  cost of 3 {{.*}} extractelement
+; AVX:  cost of 3 {{.*}} extractelement
+; AVX2:  cost of 3 {{.*}} extractelement
+
+  %r = extractelement <4 x i32> %bin.rdx8, i32 0
+  ret i32 %r
+}
+
+define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
+  %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
+  %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; AVX:  cost of 5 {{.*}} extractelement
+; AVX2:  cost of 5 {{.*}} extractelement
+
+  %r = extractelement <4 x i64> %bin.rdx8, i32 0
+  ret i64 %r
+}
+
+define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
+  %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
+  %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
+  %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
+
+; SSE3:  cost of 5 {{.*}} extractelement
+; AVX:  cost of 5 {{.*}} extractelement
+; AVX2:  cost of 5 {{.*}} extractelement
+
+  %r = extractelement <8 x i16> %bin.rdx9, i32 0
+  ret i16 %r
+}
+
+define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
+  %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
+  %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
+  %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
+
+; AVX:  cost of 5 {{.*}} extractelement
+; AVX2:  cost of 5 {{.*}} extractelement
+
+  %r = extractelement <8 x i32> %bin.rdx9, i32 0
+  ret i32 %r
+}
diff --git a/test/Analysis/CostModel/lit.local.cfg b/test/Analysis/CostModel/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Analysis/CostModel/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Analysis/Delinearization/a.ll b/test/Analysis/Delinearization/a.ll
new file mode 100644
index 0000000..9308749
--- /dev/null
+++ b/test/Analysis/Delinearization/a.ll
@@ -0,0 +1,74 @@
+; RUN: opt < %s -analyze -delinearize | FileCheck %s
+;
+; void foo(long n, long m, long o, int A[n][m][o]) {
+;   for (long i = 0; i < n; i++)
+;     for (long j = 0; j < m; j++)
+;       for (long k = 0; k < o; k++)
+;         A[2*i+3][3*j-4][5*k+7] = 1;
+; }
+
+; AddRec: {{{(28 + (4 * (-4 + (3 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(12 * %o)}<%for.j>,+,20}<%for.k>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(i32) bytes.
+; CHECK: ArrayRef[{3,+,2}<%for.i>][{-4,+,3}<%for.j>][{7,+,5}<%for.k>]
+
+; AddRec: {{(8 + ((4 + (12 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(12 * %o)}<%for.j>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][%o] with elements of sizeof(i32) bytes.
+; CHECK: ArrayRef[{(1 + (3 * %m)),+,(2 * %m)}<%for.i>][{2,+,(3 * %o)}<%for.j>]
+
+; AddRec: {(8 + ((-8 + (24 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize] with elements of 2 bytes.
+; CHECK: ArrayRef[{((1 + ((-1 + (3 * %m)) * %o)) * sizeof(i32)),+,(%m * %o * sizeof(i32))}<%for.i>]
+
+; Function Attrs: nounwind uwtable
+define void @foo(i64 %n, i64 %m, i64 %o, i32* nocapture %A) #0 {
+entry:
+  %cmp32 = icmp sgt i64 %n, 0
+  br i1 %cmp32, label %for.cond1.preheader.lr.ph, label %for.end17
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp230 = icmp sgt i64 %m, 0
+  %cmp528 = icmp sgt i64 %o, 0
+  br i1 %cmp230, label %for.i, label %for.end17
+
+for.inc15.us:                                     ; preds = %for.inc12.us.us, %for.i
+  %inc16.us = add nsw i64 %i.033.us, 1
+  %exitcond55 = icmp eq i64 %inc16.us, %n
+  br i1 %exitcond55, label %for.end17, label %for.i
+
+for.i:                     ; preds = %for.cond1.preheader.lr.ph, %for.inc15.us
+  %i.033.us = phi i64 [ %inc16.us, %for.inc15.us ], [ 0, %for.cond1.preheader.lr.ph ]
+  %mul8.us = shl i64 %i.033.us, 1
+  %add9.us = add nsw i64 %mul8.us, 3
+  %0 = mul i64 %add9.us, %m
+  %sub.us = add i64 %0, -4
+  br i1 %cmp528, label %for.j, label %for.inc15.us
+
+for.inc12.us.us:                                  ; preds = %for.k
+  %inc13.us.us = add nsw i64 %j.031.us.us, 1
+  %exitcond54 = icmp eq i64 %inc13.us.us, %m
+  br i1 %exitcond54, label %for.inc15.us, label %for.j
+
+for.j:                            ; preds = %for.i, %for.inc12.us.us
+  %j.031.us.us = phi i64 [ %inc13.us.us, %for.inc12.us.us ], [ 0, %for.i ]
+  %mul7.us.us = mul nsw i64 %j.031.us.us, 3
+  %tmp.us.us = add i64 %sub.us, %mul7.us.us
+  %tmp27.us.us = mul i64 %tmp.us.us, %o
+  br label %for.k
+
+for.k:                                  ; preds = %for.k, %for.j
+  %k.029.us.us = phi i64 [ 0, %for.j ], [ %inc.us.us, %for.k ]
+  %mul.us.us = mul nsw i64 %k.029.us.us, 5
+  %arrayidx.sum.us.us = add i64 %mul.us.us, 7
+  %arrayidx10.sum.us.us = add i64 %arrayidx.sum.us.us, %tmp27.us.us
+  %arrayidx11.us.us = getelementptr inbounds i32* %A, i64 %arrayidx10.sum.us.us
+  store i32 1, i32* %arrayidx11.us.us, align 4
+  %inc.us.us = add nsw i64 %k.029.us.us, 1
+  %exitcond = icmp eq i64 %inc.us.us, %o
+  br i1 %exitcond, label %for.inc12.us.us, label %for.k
+
+for.end17:                                        ; preds = %for.inc15.us, %for.cond1.preheader.lr.ph, %entry
+  ret void
+}
diff --git a/test/Analysis/Delinearization/himeno_1.ll b/test/Analysis/Delinearization/himeno_1.ll
new file mode 100644
index 0000000..9458bd2
--- /dev/null
+++ b/test/Analysis/Delinearization/himeno_1.ll
@@ -0,0 +1,102 @@
+; RUN: opt < %s -analyze -delinearize | FileCheck %s
+
+; #define MR(mt,n,r,c,d)  mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)]
+;
+; struct Mat {
+;   float* m;
+;   int mnums;
+;   int mrows;
+;   int mcols;
+;   int mdeps;
+; };
+;
+; typedef struct Mat Matrix;
+;
+; void jacobi(int nn, Matrix* a, Matrix* p)
+; {
+;   long i, j, k, max,jmax,kmax;
+;
+;   p_rows_sub = p->mrows - 1;
+;   p_cols_sub = p->mcols - 1;
+;   p_deps_sub = p->mdeps - 1;
+;
+;     for(i = 1; i < p_rows_sub; i++)
+;       for(j = 1; j < p_cols_sub; j++)
+;         for(k = 1; k < p_deps_sub; k++)
+;           MR(a,0,i,j,k) = i + j + k;
+; }
+
+; AddRec: {{{(4 + (4 * (sext i32 %a.deps to i64) * (1 + (sext i32 %a.cols to i64))) + %a.base),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>,+,(4 * (sext i32 %a.deps to i64))}<%for.j>,+,4}<%for.k>
+; CHECK: Base offset: %a.base
+; CHECK: ArrayDecl[UnknownSize][(sext i32 %a.cols to i64)][(sext i32 %a.deps to i64)] with elements of sizeof(float) bytes.
+; CHECK: ArrayRef[{1,+,1}<nuw><nsw><%for.i>][{1,+,1}<nuw><nsw><%for.j>][{1,+,1}<nuw><nsw><%for.k>]
+
+; AddRec: {{(-4 + (4 * (sext i32 (-1 + %p.deps) to i64)) + (4 * (sext i32 %a.deps to i64) * (1 + (sext i32 %a.cols to i64))) + %a.base),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>,+,(4 * (sext i32 %a.deps to i64))}<%for.j>
+; CHECK: Base offset: %a.base
+; CHECK: ArrayDecl[UnknownSize][(sext i32 %a.deps to i64)] with elements of sizeof(float) bytes.
+; CHECK: ArrayRef[{(1 + (sext i32 %a.cols to i64)),+,(sext i32 %a.cols to i64)}<%for.i>][{(-1 + (sext i32 (-1 + %p.deps) to i64)),+,(sext i32 %a.deps to i64)}<%for.j>]
+
+; AddRec: {(-4 + (4 * (sext i32 (-1 + %p.deps) to i64)) + ((sext i32 %a.deps to i64) * (-4 + (4 * (sext i32 (-1 + %p.cols) to i64)) + (4 * (sext i32 %a.cols to i64)))) + %a.base),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>
+; CHECK: Base offset: %a.base
+; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(float) bytes.
+; CHECK: ArrayRef[{(-1 + (sext i32 (-1 + %p.deps) to i64) + ((sext i32 %a.deps to i64) * (-1 + (sext i32 (-1 + %p.cols) to i64) + (sext i32 %a.cols to i64)))),+,((sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>]
+
+%struct.Mat = type { float*, i32, i32, i32, i32 }
+
+define void @jacobi(i32 %nn, %struct.Mat* nocapture %a, %struct.Mat* nocapture %p) nounwind uwtable {
+entry:
+  %p.rows.ptr = getelementptr inbounds %struct.Mat* %p, i64 0, i32 2
+  %p.rows = load i32* %p.rows.ptr
+  %p.rows.sub = add i32 %p.rows, -1
+  %p.rows.sext = sext i32 %p.rows.sub to i64
+  %p.cols.ptr = getelementptr inbounds %struct.Mat* %p, i64 0, i32 3
+  %p.cols = load i32* %p.cols.ptr
+  %p.cols.sub = add i32 %p.cols, -1
+  %p.cols.sext = sext i32 %p.cols.sub to i64
+  %p.deps.ptr = getelementptr inbounds %struct.Mat* %p, i64 0, i32 4
+  %p.deps = load i32* %p.deps.ptr
+  %p.deps.sub = add i32 %p.deps, -1
+  %p.deps.sext = sext i32 %p.deps.sub to i64
+  %a.cols.ptr = getelementptr inbounds %struct.Mat* %a, i64 0, i32 3
+  %a.cols = load i32* %a.cols.ptr
+  %a.deps.ptr = getelementptr inbounds %struct.Mat* %a, i64 0, i32 4
+  %a.deps = load i32* %a.deps.ptr
+  %a.base.ptr = getelementptr inbounds %struct.Mat* %a, i64 0, i32 0
+  %a.base = load float** %a.base.ptr, align 8
+  br label %for.i
+
+for.i:                                            ; preds = %for.i.inc, %entry
+  %i = phi i64 [ %i.inc, %for.i.inc ], [ 1, %entry ]
+  br label %for.j
+
+for.j:                                            ; preds = %for.j.inc, %for.i
+  %j = phi i64 [ %j.inc, %for.j.inc ], [ 1, %for.i ]
+  %a.cols.sext = sext i32 %a.cols to i64
+  %a.deps.sext = sext i32 %a.deps to i64
+  br label %for.k
+
+for.k:                                            ; preds = %for.k, %for.j
+  %k = phi i64 [ 1, %for.j ], [ %k.inc, %for.k ]
+  %tmp1 = mul nsw i64 %a.cols.sext, %i
+  %tmp2 = add i64 %tmp1, %j
+  %tmp3 = mul i64 %tmp2, %a.deps.sext
+  %tmp4 = add nsw i64 %k, %tmp3
+  %arrayidx = getelementptr inbounds float* %a.base, i64 %tmp4
+  store float 1.000000e+00, float* %arrayidx
+  %k.inc = add nsw i64 %k, 1
+  %k.exitcond = icmp eq i64 %k.inc, %p.deps.sext
+  br i1 %k.exitcond, label %for.j.inc, label %for.k
+
+for.j.inc:                                        ; preds = %for.k
+  %j.inc = add nsw i64 %j, 1
+  %j.exitcond = icmp eq i64 %j.inc, %p.cols.sext
+  br i1 %j.exitcond, label %for.i.inc, label %for.j
+
+for.i.inc:                                        ; preds = %for.j.inc
+  %i.inc = add nsw i64 %i, 1
+  %i.exitcond = icmp eq i64 %i.inc, %p.rows.sext
+  br i1 %i.exitcond, label %end, label %for.i
+
+end:                                              ; preds = %for.i.inc
+  ret void
+}
diff --git a/test/Analysis/Delinearization/himeno_2.ll b/test/Analysis/Delinearization/himeno_2.ll
new file mode 100644
index 0000000..a290066
--- /dev/null
+++ b/test/Analysis/Delinearization/himeno_2.ll
@@ -0,0 +1,102 @@
+; RUN: opt < %s -analyze -delinearize | FileCheck %s
+
+; #define MR(mt,n,r,c,d)  mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)]
+;
+; struct Mat {
+;   float* m;
+;   int mnums;
+;   int mrows;
+;   int mcols;
+;   int mdeps;
+; };
+;
+; typedef struct Mat Matrix;
+;
+; void jacobi(int nn, Matrix* a, Matrix* p)
+; {
+;   long i, j, k, max,jmax,kmax;
+;
+;   p_rows_sub = p->mrows - 1;
+;   p_cols_sub = p->mcols - 1;
+;   p_deps_sub = p->mdeps - 1;
+;
+;     for(i = 1; i < p_rows_sub; i++)
+;       for(j = 1; j < p_cols_sub; j++)
+;         for(k = 1; k < p_deps_sub; k++)
+;           MR(a,0,i,j,k) = i + j + k;
+; }
+
+; AddRec: {{{(4 + (4 * (sext i32 %a.deps to i64) * (1 + (sext i32 %a.cols to i64))) + %a.base),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>,+,(4 * (sext i32 %a.deps to i64))}<%for.j>,+,4}<%for.k>
+; CHECK: Base offset: %a.base
+; CHECK: ArrayDecl[UnknownSize][(sext i32 %a.cols to i64)][(sext i32 %a.deps to i64)] with elements of sizeof(float) bytes.
+; CHECK: ArrayRef[{1,+,1}<nuw><nsw><%for.i>][{1,+,1}<nuw><nsw><%for.j>][{1,+,1}<nuw><nsw><%for.k>]
+
+; AddRec: {{(-4 + (4 * (sext i32 (-1 + %p.deps) to i64)) + (4 * (sext i32 %a.deps to i64) * (1 + (sext i32 %a.cols to i64))) + %a.base),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>,+,(4 * (sext i32 %a.deps to i64))}<%for.j>
+; CHECK: Base offset: %a.base
+; CHECK: ArrayDecl[UnknownSize][(sext i32 %a.deps to i64)] with elements of sizeof(float) bytes.
+; CHECK: ArrayRef[{(1 + (sext i32 %a.cols to i64)),+,(sext i32 %a.cols to i64)}<%for.i>][{(-1 + (sext i32 (-1 + %p.deps) to i64)),+,(sext i32 %a.deps to i64)}<%for.j>]
+
+; AddRec: {(-4 + (4 * (sext i32 (-1 + %p.deps) to i64)) + ((sext i32 %a.deps to i64) * (-4 + (4 * (sext i32 (-1 + %p.cols) to i64)) + (4 * (sext i32 %a.cols to i64)))) + %a.base),+,(4 * (sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>
+; CHECK: Base offset: %a.base
+; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(float) bytes.
+; CHECK: ArrayRef[{(-1 + (sext i32 (-1 + %p.deps) to i64) + ((sext i32 %a.deps to i64) * (-1 + (sext i32 (-1 + %p.cols) to i64) + (sext i32 %a.cols to i64)))),+,((sext i32 %a.deps to i64) * (sext i32 %a.cols to i64))}<%for.i>]
+
+%struct.Mat = type { float*, i32, i32, i32, i32 }
+
+define void @jacobi(i32 %nn, %struct.Mat* nocapture %a, %struct.Mat* nocapture %p) nounwind uwtable {
+entry:
+  %p.rows.ptr = getelementptr inbounds %struct.Mat* %p, i64 0, i32 2
+  %p.rows = load i32* %p.rows.ptr
+  %p.rows.sub = add i32 %p.rows, -1
+  %p.rows.sext = sext i32 %p.rows.sub to i64
+  %p.cols.ptr = getelementptr inbounds %struct.Mat* %p, i64 0, i32 3
+  %p.cols = load i32* %p.cols.ptr
+  %p.cols.sub = add i32 %p.cols, -1
+  %p.cols.sext = sext i32 %p.cols.sub to i64
+  %p.deps.ptr = getelementptr inbounds %struct.Mat* %p, i64 0, i32 4
+  %p.deps = load i32* %p.deps.ptr
+  %p.deps.sub = add i32 %p.deps, -1
+  %p.deps.sext = sext i32 %p.deps.sub to i64
+  %a.cols.ptr = getelementptr inbounds %struct.Mat* %a, i64 0, i32 3
+  %a.cols = load i32* %a.cols.ptr
+  %a.cols.sext = sext i32 %a.cols to i64
+  %a.deps.ptr = getelementptr inbounds %struct.Mat* %a, i64 0, i32 4
+  %a.deps = load i32* %a.deps.ptr
+  %a.deps.sext = sext i32 %a.deps to i64
+  %a.base.ptr = getelementptr inbounds %struct.Mat* %a, i64 0, i32 0
+  %a.base = load float** %a.base.ptr, align 8
+  br label %for.i
+
+for.i:                                            ; preds = %for.i.inc, %entry
+  %i = phi i64 [ %i.inc, %for.i.inc ], [ 1, %entry ]
+  br label %for.j
+
+for.j:                                            ; preds = %for.j.inc, %for.i
+  %j = phi i64 [ %j.inc, %for.j.inc ], [ 1, %for.i ]
+  br label %for.k
+
+for.k:                                            ; preds = %for.k, %for.j
+  %k = phi i64 [ 1, %for.j ], [ %k.inc, %for.k ]
+  %tmp1 = mul nsw i64 %a.cols.sext, %i
+  %tmp2 = add i64 %tmp1, %j
+  %tmp3 = mul i64 %tmp2, %a.deps.sext
+  %tmp4 = add nsw i64 %k, %tmp3
+  %arrayidx = getelementptr inbounds float* %a.base, i64 %tmp4
+  store float 1.000000e+00, float* %arrayidx
+  %k.inc = add nsw i64 %k, 1
+  %k.exitcond = icmp eq i64 %k.inc, %p.deps.sext
+  br i1 %k.exitcond, label %for.j.inc, label %for.k
+
+for.j.inc:                                        ; preds = %for.k
+  %j.inc = add nsw i64 %j, 1
+  %j.exitcond = icmp eq i64 %j.inc, %p.cols.sext
+  br i1 %j.exitcond, label %for.i.inc, label %for.j
+
+for.i.inc:                                        ; preds = %for.j.inc
+  %i.inc = add nsw i64 %i, 1
+  %i.exitcond = icmp eq i64 %i.inc, %p.rows.sext
+  br i1 %i.exitcond, label %end, label %for.i
+
+end:                                              ; preds = %for.i.inc
+  ret void
+}
diff --git a/test/Analysis/BasicAA/lit.local.cfg b/test/Analysis/Delinearization/lit.local.cfg
index 19eebc0..19eebc0 100644
--- a/test/Analysis/BasicAA/lit.local.cfg
+++ b/test/Analysis/Delinearization/lit.local.cfg
diff --git a/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll b/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
new file mode 100644
index 0000000..82cab16
--- /dev/null
+++ b/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_3d.ll
@@ -0,0 +1,68 @@
+; RUN: opt < %s -analyze -delinearize  | FileCheck %s
+
+; void foo(long n, long m, long o, double A[n][m][o]) {
+;
+;   for (long i = 0; i < n; i++)
+;     for (long j = 0; j < m; j++)
+;       for (long k = 0; k < o; k++)
+;         A[i+3][j-4][k+7] = 1.0;
+; }
+
+; AddRec: {{{(56 + (8 * (-4 + (3 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{3,+,1}<nw><%for.i>][{-4,+,1}<nw><%for.j>][{7,+,1}<nw><%for.k>]
+
+; AddRec: {{(48 + ((-24 + (24 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][%o] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{(-3 + (3 * %m)),+,%m}<%for.i>][{6,+,%o}<%for.j>]
+
+; AddRec: {(48 + ((-32 + (32 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{(6 + ((-4 + (4 * %m)) * %o)),+,(%m * %o)}<%for.i>]
+
+define void @foo(i64 %n, i64 %m, i64 %o, double* %A) {
+entry:
+  br label %for.i
+
+for.i:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
+  br label %for.j
+
+for.j:
+  %j = phi i64 [ 0, %for.i ], [ %j.inc, %for.j.inc ]
+  br label %for.k
+
+for.k:
+  %k = phi i64 [ 0, %for.j ], [ %k.inc, %for.k.inc ]
+  %offset0 = add nsw i64 %i, 3
+  %subscript0 = mul i64 %offset0, %m
+  %offset1 = add nsw i64 %j, -4
+  %subscript1 = add i64 %offset1, %subscript0
+  %subscript2 = mul i64 %subscript1, %o
+  %offset2 = add nsw i64 %k, 7
+  %subscript = add i64 %subscript2, %offset2
+  %idx = getelementptr inbounds double* %A, i64 %subscript
+  store double 1.0, double* %idx
+  br label %for.k.inc
+
+for.k.inc:
+  %k.inc = add nsw i64 %k, 1
+  %k.exitcond = icmp eq i64 %k.inc, %o
+  br i1 %k.exitcond, label %for.j.inc, label %for.k
+
+for.j.inc:
+  %j.inc = add nsw i64 %j, 1
+  %j.exitcond = icmp eq i64 %j.inc, %m
+  br i1 %j.exitcond, label %for.i.inc, label %for.j
+
+for.i.inc:
+  %i.inc = add nsw i64 %i, 1
+  %i.exitcond = icmp eq i64 %i.inc, %n
+  br i1 %i.exitcond, label %end, label %for.i
+
+end:
+  ret void
+}
diff --git a/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll b/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll
new file mode 100644
index 0000000..a1e779f
--- /dev/null
+++ b/test/Analysis/Delinearization/multidim_ivs_and_integer_offsets_nts_3d.ll
@@ -0,0 +1,72 @@
+; RUN: opt < %s -analyze -delinearize | FileCheck %s
+
+; void foo(long n, long m, long o, long p, double A[n][m][o+p]) {
+;
+;   for (long i = 0; i < n; i++)
+;     for (long j = 0; j < m; j++)
+;       for (long k = 0; k < o; k++)
+;         A[i+3][j-4][k+7] = 1.0;
+; }
+
+; AddRec: {{{(56 + (8 * (-4 + (3 * %m)) * (%o + %p)) + %A),+,(8 * (%o + %p) * %m)}<%for.cond4.preheader.lr.ph.us>,+,(8 * (%o + %p))}<%for.body6.lr.ph.us.us>,+,8}<%for.body6.us.us>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][%m][(%o + %p)] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{3,+,1}<nw><%for.cond4.preheader.lr.ph.us>][{-4,+,1}<nw><%for.body6.lr.ph.us.us>][{7,+,1}<nw><%for.body6.us.us>]
+
+; AddRec: {{(48 + (8 * %o) + (8 * (-4 + (3 * %m)) * (%o + %p)) + %A),+,(8 * (%o + %p) * %m)}<%for.cond4.preheader.lr.ph.us>,+,(8 * (%o + %p))}<%for.body6.lr.ph.us.us>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][(%o + %p)] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{(-4 + (3 * %m)),+,%m}<%for.cond4.preheader.lr.ph.us>][{(6 + %o),+,(%o + %p)}<%for.body6.lr.ph.us.us>]
+
+; AddRec: {(48 + (8 * %o) + ((-40 + (32 * %m)) * (%o + %p)) + %A),+,(8 * (%o + %p) * %m)}<%for.cond4.preheader.lr.ph.us>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{(6 + ((-5 + (4 * %m)) * (%o + %p)) + %o),+,((%o + %p) * %m)}<%for.cond4.preheader.lr.ph.us>]
+
+define void @foo(i64 %n, i64 %m, i64 %o, i64 %p, double* nocapture %A) nounwind uwtable {
+entry:
+  %add = add nsw i64 %p, %o
+  %cmp22 = icmp sgt i64 %n, 0
+  br i1 %cmp22, label %for.cond1.preheader.lr.ph, label %for.end16
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp220 = icmp sgt i64 %m, 0
+  %cmp518 = icmp sgt i64 %o, 0
+  br i1 %cmp220, label %for.cond4.preheader.lr.ph.us, label %for.end16
+
+for.inc14.us:                                     ; preds = %for.cond4.preheader.lr.ph.us, %for.inc11.us.us
+  %inc15.us = add nsw i64 %i.023.us, 1
+  %exitcond43 = icmp eq i64 %inc15.us, %n
+  br i1 %exitcond43, label %for.end16, label %for.cond4.preheader.lr.ph.us
+
+for.cond4.preheader.lr.ph.us:                     ; preds = %for.inc14.us, %for.cond1.preheader.lr.ph
+  %i.023.us = phi i64 [ %inc15.us, %for.inc14.us ], [ 0, %for.cond1.preheader.lr.ph ]
+  %add8.us = add nsw i64 %i.023.us, 3
+  %0 = mul i64 %add8.us, %m
+  %sub.us = add i64 %0, -4
+  br i1 %cmp518, label %for.body6.lr.ph.us.us, label %for.inc14.us
+
+for.inc11.us.us:                                  ; preds = %for.body6.us.us
+  %inc12.us.us = add nsw i64 %j.021.us.us, 1
+  %exitcond42 = icmp eq i64 %inc12.us.us, %m
+  br i1 %exitcond42, label %for.inc14.us, label %for.body6.lr.ph.us.us
+
+for.body6.lr.ph.us.us:                            ; preds = %for.cond4.preheader.lr.ph.us, %for.inc11.us.us
+  %j.021.us.us = phi i64 [ %inc12.us.us, %for.inc11.us.us ], [ 0, %for.cond4.preheader.lr.ph.us ]
+  %tmp.us.us = add i64 %sub.us, %j.021.us.us
+  %tmp17.us.us = mul i64 %tmp.us.us, %add
+  br label %for.body6.us.us
+
+for.body6.us.us:                                  ; preds = %for.body6.us.us, %for.body6.lr.ph.us.us
+  %k.019.us.us = phi i64 [ 0, %for.body6.lr.ph.us.us ], [ %inc.us.us, %for.body6.us.us ]
+  %arrayidx.sum.us.us = add i64 %k.019.us.us, 7
+  %arrayidx9.sum.us.us = add i64 %arrayidx.sum.us.us, %tmp17.us.us
+  %arrayidx10.us.us = getelementptr inbounds double* %A, i64 %arrayidx9.sum.us.us
+  store double 1.000000e+00, double* %arrayidx10.us.us, align 8
+  %inc.us.us = add nsw i64 %k.019.us.us, 1
+  %exitcond = icmp eq i64 %inc.us.us, %o
+  br i1 %exitcond, label %for.inc11.us.us, label %for.body6.us.us
+
+for.end16:                                        ; preds = %for.cond1.preheader.lr.ph, %for.inc14.us, %entry
+  ret void
+}
diff --git a/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll b/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
new file mode 100644
index 0000000..a52a4c9
--- /dev/null
+++ b/test/Analysis/Delinearization/multidim_ivs_and_parameteric_offsets_3d.ll
@@ -0,0 +1,68 @@
+; RUN: opt < %s -analyze -delinearize | FileCheck %s
+
+; void foo(long n, long m, long o, double A[n][m][o], long p, long q, long r) {
+;
+;   for (long i = 0; i < n; i++)
+;     for (long j = 0; j < m; j++)
+;       for (long k = 0; k < o; k++)
+;         A[i+p][j+q][k+r] = 1.0;
+; }
+
+; AddRec: {{{((8 * ((((%m * %p) + %q) * %o) + %r)) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{%p,+,1}<nw><%for.i>][{%q,+,1}<nw><%for.j>][{%r,+,1}<nw><%for.k>]
+
+; AddRec: {{(-8 + (8 * ((((%m * %p) + %q) * %o) + %r)) + (8 * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][%o] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{(1 + (%m * %p) + %q),+,%m}<%for.i>][{(-1 + %r),+,%o}<%for.j>]
+
+; AddRec: {(-8 + (8 * ((((%m * %p) + %q) * %o) + %r)) + (8 * %m * %o) + %A),+,(8 * %m * %o)}<%for.i>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{(-1 + ((((1 + %p) * %m) + %q) * %o) + %r),+,(%m * %o)}<%for.i>]
+
+define void @foo(i64 %n, i64 %m, i64 %o, double* %A, i64 %p, i64 %q, i64 %r) {
+entry:
+  br label %for.i
+
+for.i:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
+  br label %for.j
+
+for.j:
+  %j = phi i64 [ 0, %for.i ], [ %j.inc, %for.j.inc ]
+  br label %for.k
+
+for.k:
+  %k = phi i64 [ 0, %for.j ], [ %k.inc, %for.k.inc ]
+  %offset0 = add nsw i64 %i, %p
+  %subscript0 = mul i64 %offset0, %m
+  %offset1 = add nsw i64 %j, %q
+  %subscript1 = add i64 %offset1, %subscript0
+  %subscript2 = mul i64 %subscript1, %o
+  %offset2 = add nsw i64 %k, %r
+  %subscript = add i64 %subscript2, %offset2
+  %idx = getelementptr inbounds double* %A, i64 %subscript
+  store double 1.0, double* %idx
+  br label %for.k.inc
+
+for.k.inc:
+  %k.inc = add nsw i64 %k, 1
+  %k.exitcond = icmp eq i64 %k.inc, %o
+  br i1 %k.exitcond, label %for.j.inc, label %for.k
+
+for.j.inc:
+  %j.inc = add nsw i64 %j, 1
+  %j.exitcond = icmp eq i64 %j.inc, %m
+  br i1 %j.exitcond, label %for.i.inc, label %for.j
+
+for.i.inc:
+  %i.inc = add nsw i64 %i, 1
+  %i.exitcond = icmp eq i64 %i.inc, %n
+  br i1 %i.exitcond, label %end, label %for.i
+
+end:
+  ret void
+}
diff --git a/test/Analysis/Delinearization/multidim_only_ivs_2d.ll b/test/Analysis/Delinearization/multidim_only_ivs_2d.ll
new file mode 100644
index 0000000..d68a158
--- /dev/null
+++ b/test/Analysis/Delinearization/multidim_only_ivs_2d.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -analyze -delinearize | FileCheck %s
+
+; Derived from the following code:
+;
+; void foo(long n, long m, double A[n][m]) {
+;   for (long i = 0; i < n; i++)
+;     for (long j = 0; j < m; j++)
+;       A[i][j] = 1.0;
+; }
+
+; AddRec: {{%A,+,(8 * %m)}<%for.i>,+,8}<%for.j>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][%m] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{0,+,1}<nuw><nsw><%for.i>][{0,+,1}<nuw><nsw><%for.j>]
+
+; AddRec: {(-8 + (8 * %m) + %A),+,(8 * %m)}<%for.i>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{(-1 + %m),+,%m}<%for.i>]
+
+define void @foo(i64 %n, i64 %m, double* %A) {
+entry:
+  br label %for.i
+
+for.i:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
+  %tmp = mul nsw i64 %i, %m
+  br label %for.j
+
+for.j:
+  %j = phi i64 [ 0, %for.i ], [ %j.inc, %for.j ]
+  %vlaarrayidx.sum = add i64 %j, %tmp
+  %arrayidx = getelementptr inbounds double* %A, i64 %vlaarrayidx.sum
+  store double 1.0, double* %arrayidx
+  %j.inc = add nsw i64 %j, 1
+  %j.exitcond = icmp eq i64 %j.inc, %m
+  br i1 %j.exitcond, label %for.i.inc, label %for.j
+
+for.i.inc:
+  %i.inc = add nsw i64 %i, 1
+  %i.exitcond = icmp eq i64 %i.inc, %n
+  br i1 %i.exitcond, label %end, label %for.i
+
+end:
+  ret void
+}
diff --git a/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll b/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll
new file mode 100644
index 0000000..7207420
--- /dev/null
+++ b/test/Analysis/Delinearization/multidim_only_ivs_2d_nested.ll
@@ -0,0 +1,78 @@
+; RUN: opt < %s -analyze -delinearize | FileCheck %s
+
+; extern void bar(long n, long m, double A[n][m]);
+;
+; void foo(long a, long b) {
+;   for (long n = 1; n < a; ++n)
+;   for (long m = 1; m < b; ++m) {
+;     double A[n][m];
+;     for (long i = 0; i < n; i++)
+;       for (long j = 0; j < m; j++)
+;         A[i][j] = 1.0;
+;     bar(n, m, A);
+;   }
+; }
+
+; AddRec: {{%vla.us,+,{8,+,8}<%for.cond7.preheader.lr.ph.split.us.us>}<%for.body9.lr.ph.us.us>,+,8}<%for.body9.us.us>
+; CHECK: Base offset: %vla.us
+; CHECK: ArrayDecl[UnknownSize][{1,+,1}<%for.cond7.preheader.lr.ph.split.us.us>] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{0,+,1}<nuw><nsw><%for.body9.lr.ph.us.us>][{0,+,1}<nuw><nsw><%for.body9.us.us>]
+
+define void @foo(i64 %a, i64 %b) nounwind uwtable {
+entry:
+  %cmp43 = icmp sgt i64 %a, 1
+  br i1 %cmp43, label %for.cond1.preheader.lr.ph, label %for.end19
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp224 = icmp sgt i64 %b, 1
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc17, %for.cond1.preheader.lr.ph
+  %indvars.iv51 = phi i64 [ 1, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next52, %for.inc17 ]
+  br i1 %cmp224, label %for.cond7.preheader.lr.ph.split.us.us, label %for.inc17
+
+for.end13.us:                                     ; preds = %for.inc11.us.us
+  call void @bar(i64 %indvars.iv51, i64 %indvars.iv48, double* %vla.us) nounwind
+  call void @llvm.stackrestore(i8* %1)
+  %indvars.iv.next49 = add i64 %indvars.iv48, 1
+  %exitcond54 = icmp eq i64 %indvars.iv.next49, %b
+  br i1 %exitcond54, label %for.inc17, label %for.cond7.preheader.lr.ph.split.us.us
+
+for.inc11.us.us:                                  ; preds = %for.body9.us.us
+  %inc12.us.us = add nsw i64 %i.023.us.us, 1
+  %exitcond53 = icmp eq i64 %inc12.us.us, %indvars.iv51
+  br i1 %exitcond53, label %for.end13.us, label %for.body9.lr.ph.us.us
+
+for.body9.lr.ph.us.us:                            ; preds = %for.cond7.preheader.lr.ph.split.us.us, %for.inc11.us.us
+  %i.023.us.us = phi i64 [ 0, %for.cond7.preheader.lr.ph.split.us.us ], [ %inc12.us.us, %for.inc11.us.us ]
+  %0 = mul nsw i64 %i.023.us.us, %indvars.iv48
+  br label %for.body9.us.us
+
+for.body9.us.us:                                  ; preds = %for.body9.us.us, %for.body9.lr.ph.us.us
+  %j.021.us.us = phi i64 [ 0, %for.body9.lr.ph.us.us ], [ %inc.us.us, %for.body9.us.us ]
+  %arrayidx.sum.us.us = add i64 %j.021.us.us, %0
+  %arrayidx10.us.us = getelementptr inbounds double* %vla.us, i64 %arrayidx.sum.us.us
+  store double 1.000000e+00, double* %arrayidx10.us.us, align 8
+  %inc.us.us = add nsw i64 %j.021.us.us, 1
+  %exitcond50 = icmp eq i64 %inc.us.us, %indvars.iv48
+  br i1 %exitcond50, label %for.inc11.us.us, label %for.body9.us.us
+
+for.cond7.preheader.lr.ph.split.us.us:            ; preds = %for.cond1.preheader, %for.end13.us
+  %indvars.iv48 = phi i64 [ %indvars.iv.next49, %for.end13.us ], [ 1, %for.cond1.preheader ]
+  %1 = call i8* @llvm.stacksave()
+  %2 = mul nuw i64 %indvars.iv48, %indvars.iv51
+  %vla.us = alloca double, i64 %2, align 16
+  br label %for.body9.lr.ph.us.us
+
+for.inc17:                                        ; preds = %for.end13.us, %for.cond1.preheader
+  %indvars.iv.next52 = add i64 %indvars.iv51, 1
+  %exitcond55 = icmp eq i64 %indvars.iv.next52, %a
+  br i1 %exitcond55, label %for.end19, label %for.cond1.preheader
+
+for.end19:                                        ; preds = %for.inc17, %entry
+  ret void
+}
+
+declare i8* @llvm.stacksave() nounwind
+declare void @bar(i64, i64, double*)
+declare void @llvm.stackrestore(i8*) nounwind
diff --git a/test/Analysis/Delinearization/multidim_only_ivs_3d.ll b/test/Analysis/Delinearization/multidim_only_ivs_3d.ll
new file mode 100644
index 0000000..24f9583
--- /dev/null
+++ b/test/Analysis/Delinearization/multidim_only_ivs_3d.ll
@@ -0,0 +1,65 @@
+; RUN: opt < %s -analyze -delinearize | FileCheck %s
+
+; void foo(long n, long m, long o, double A[n][m][o]) {
+;
+;   for (long i = 0; i < n; i++)
+;     for (long j = 0; j < m; j++)
+;       for (long k = 0; k < o; k++)
+;         A[i][j][k] = 1.0;
+; }
+
+; AddRec: {{{%A,+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{0,+,1}<nuw><nsw><%for.i>][{0,+,1}<nuw><nsw><%for.j>][{0,+,1}<nuw><nsw><%for.k>]
+
+; AddRec: {{(-8 + (8 * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][(%m * %o)] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{0,+,1}<nuw><nsw><%for.i>][{(-1 + %o),+,%o}<%for.j>]
+
+; AddRec: {(-8 + (8 * %m * %o) + %A),+,(8 * %m * %o)}<%for.i>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize] with elements of sizeof(double) bytes.
+; CHECK: ArrayRef[{(-1 + (%m * %o)),+,(%m * %o)}<%for.i>]
+
+define void @foo(i64 %n, i64 %m, i64 %o, double* %A) {
+entry:
+  br label %for.i
+
+for.i:
+  %i = phi i64 [ 0, %entry ], [ %i.inc, %for.i.inc ]
+  br label %for.j
+
+for.j:
+  %j = phi i64 [ 0, %for.i ], [ %j.inc, %for.j.inc ]
+  br label %for.k
+
+for.k:
+  %k = phi i64 [ 0, %for.j ], [ %k.inc, %for.k.inc ]
+  %subscript0 = mul i64 %i, %m
+  %subscript1 = add i64 %j, %subscript0
+  %subscript2 = mul i64 %subscript1, %o
+  %subscript = add i64 %subscript2, %k
+  %idx = getelementptr inbounds double* %A, i64 %subscript
+  store double 1.0, double* %idx
+  br label %for.k.inc
+
+for.k.inc:
+  %k.inc = add nsw i64 %k, 1
+  %k.exitcond = icmp eq i64 %k.inc, %o
+  br i1 %k.exitcond, label %for.j.inc, label %for.k
+
+for.j.inc:
+  %j.inc = add nsw i64 %j, 1
+  %j.exitcond = icmp eq i64 %j.inc, %m
+  br i1 %j.exitcond, label %for.i.inc, label %for.j
+
+for.i.inc:
+  %i.inc = add nsw i64 %i, 1
+  %i.exitcond = icmp eq i64 %i.inc, %n
+  br i1 %i.exitcond, label %end, label %for.i
+
+end:
+  ret void
+}
diff --git a/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll b/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll
new file mode 100644
index 0000000..e151610
--- /dev/null
+++ b/test/Analysis/Delinearization/multidim_only_ivs_3d_cast.ll
@@ -0,0 +1,75 @@
+; RUN: opt < %s -analyze -delinearize | FileCheck %s
+; void foo(int n, int m, int o, double A[n][m][o]) {
+;
+;   for (int i = 0; i < n; i++)
+;     for (int j = 0; j < m; j++)
+;       for (int k = 0; k < o; k++)
+;         A[i][j][k] = 1.0;
+; }
+
+; AddRec: {{{%A,+,(8 * (zext i32 %m to i64) * (zext i32 %o to i64))}<%for.i>,+,(8 * (zext i32 %o to i64))}<%for.j>,+,8}<%for.k>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][(zext i32 %m to i64)][(zext i32 %o to i64)] with elements of 8 bytes.
+; CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>]
+
+; AddRec: {{((8 * (zext i32 (-1 + %o) to i64)) + %A),+,(8 * (zext i32 %m to i64) * (zext i32 %o to i64))}<%for.i>,+,(8 * (zext i32 %o to i64))}<%for.j>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize][((zext i32 %m to i64) * (zext i32 %o to i64))] with elements of 8 bytes.
+; CHECK: ArrayRef[{0,+,1}<%for.i>][{(zext i32 (-1 + %o) to i64),+,(zext i32 %o to i64)}<%for.j>]
+
+; AddRec: {((8 * (zext i32 (-1 + %o) to i64)) + (8 * (zext i32 (-1 + %m) to i64) * (zext i32 %o to i64)) + %A),+,(8 * (zext i32 %m to i64) * (zext i32 %o to i64))}<%for.i>
+; CHECK: Base offset: %A
+; CHECK: ArrayDecl[UnknownSize] with elements of 8 bytes.
+; CHECK: ArrayRef[{((zext i32 (-1 + %o) to i64) + ((zext i32 (-1 + %m) to i64) * (zext i32 %o to i64))),+,((zext i32 %m to i64) * (zext i32 %o to i64))}<%for.i>]
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i32 %n, i32 %m, i32 %o, double* %A) {
+entry:
+  %m_zext = zext i32 %m to i64
+  %n_zext = zext i32 %o to i64
+  br label %for.i
+
+for.i:
+  %i = phi i64 [ %i.inc, %for.i.inc ], [ 0, %entry ]
+  br label %for.j
+
+for.j:
+  %j = phi i64 [ %j.inc, %for.j.inc ], [ 0, %for.i ]
+  br label %for.k
+
+for.k:
+  %k = phi i64 [ %k.inc, %for.k.inc ], [ 0, %for.j ]
+  %tmp = mul i64 %i, %m_zext
+  %tmp1 = trunc i64 %j to i32
+  %tmp2 = trunc i64 %i to i32
+  %mul.us.us = mul nsw i32 %tmp1, %tmp2
+  %tmp.us.us = add i64 %j, %tmp
+  %tmp17.us.us = mul i64 %tmp.us.us, %n_zext
+  %subscript = add i64 %tmp17.us.us, %k
+  %idx = getelementptr inbounds double* %A, i64 %subscript
+  store double 1.0, double* %idx
+  br label %for.k.inc
+
+for.k.inc:
+  %k.inc = add i64 %k, 1
+  %k.inc.trunc = trunc i64 %k.inc to i32
+  %k.exitcond = icmp eq i32 %k.inc.trunc, %o
+  br i1 %k.exitcond, label %for.j.inc, label %for.k
+
+for.j.inc:
+  %j.inc = add i64 %j, 1
+  %j.inc.trunc = trunc i64 %j.inc to i32
+  %j.exitcond = icmp eq i32 %j.inc.trunc, %m
+  br i1 %j.exitcond, label %for.i.inc, label %for.j
+
+for.i.inc:
+  %i.inc = add i64 %i, 1
+  %i.inc.trunc = trunc i64 %i.inc to i32
+  %i.exitcond = icmp eq i32 %i.inc.trunc, %n
+  br i1 %i.exitcond, label %end, label %for.i
+
+end:
+  ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/Banerjee.ll b/test/Analysis/DependenceAnalysis/Banerjee.ll
index 003ee03..09e8fd2 100644
--- a/test/Analysis/DependenceAnalysis/Banerjee.ll
+++ b/test/Analysis/DependenceAnalysis/Banerjee.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-apple-macosx10.6.0"
 define void @banerjee0(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
-
+; CHECK: 'Dependence Analysis' for function 'banerjee0':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - flow [<= <>]!
 ; CHECK: da analyze - confused!
@@ -65,6 +65,7 @@ entry:
   %cmp4 = icmp sgt i64 %n, 0
   br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end9
 
+; CHECK: 'Dependence Analysis' for function 'banerjee1':
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - flow [* <>]!
 ; CHECK: da analyze - confused!
@@ -131,6 +132,7 @@ define void @banerjee2(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee2':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -181,6 +183,7 @@ define void @banerjee3(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee3':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - flow [> >]!
 ; CHECK: da analyze - confused!
@@ -231,6 +234,7 @@ define void @banerjee4(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee4':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -281,6 +285,7 @@ define void @banerjee5(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee5':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - flow [< <]!
 ; CHECK: da analyze - confused!
@@ -331,6 +336,7 @@ define void @banerjee6(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee6':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - flow [=> <>]!
 ; CHECK: da analyze - confused!
@@ -381,6 +387,7 @@ define void @banerjee7(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee7':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - flow [> <=]!
 ; CHECK: da analyze - confused!
@@ -431,6 +438,7 @@ define void @banerjee8(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee8':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - flow [> <>]!
 ; CHECK: da analyze - confused!
@@ -481,6 +489,7 @@ define void @banerjee9(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee9':
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - flow [<= =|<]!
 ; CHECK: da analyze - confused!
@@ -532,6 +541,7 @@ define void @banerjee10(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee10':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - flow [<> =]!
 ; CHECK: da analyze - confused!
@@ -582,6 +592,7 @@ define void @banerjee11(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee11':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - flow [<= <>]!
 ; CHECK: da analyze - confused!
@@ -632,6 +643,7 @@ define void @banerjee12(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'banerjee12':
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - flow [= <>]!
 ; CHECK: da analyze - confused!
diff --git a/test/Analysis/DependenceAnalysis/GCD.ll b/test/Analysis/DependenceAnalysis/GCD.ll
index a422124..bb31d11 100644
--- a/test/Analysis/DependenceAnalysis/GCD.ll
+++ b/test/Analysis/DependenceAnalysis/GCD.ll
@@ -14,6 +14,7 @@ define void @gcd0(i32* %A, i32* %B) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'gcd0'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - flow [=> *|<]!
 ; CHECK: da analyze - confused!
@@ -66,6 +67,7 @@ define void @gcd1(i32* %A, i32* %B) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'gcd1'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -119,6 +121,7 @@ define void @gcd2(i32* %A, i32* %B) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'gcd2'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -172,6 +175,7 @@ define void @gcd3(i32* %A, i32* %B) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'gcd3'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - flow [<> *]!
 ; CHECK: da analyze - confused!
@@ -223,6 +227,7 @@ define void @gcd4(i32* %A, i32* %B, i64 %M, i64 %N) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'gcd4'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -284,6 +289,7 @@ define void @gcd5(i32* %A, i32* %B, i64 %M, i64 %N) nounwind uwtable ssp {
 entry:
   br label %for.cond1.preheader
 
+; CHECK: 'Dependence Analysis' for function 'gcd5'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - flow [<> *]!
 ; CHECK: da analyze - confused!
@@ -346,6 +352,7 @@ entry:
   %cmp4 = icmp sgt i64 %n, 0
   br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end12
 
+; CHECK: 'Dependence Analysis' for function 'gcd6'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -417,6 +424,7 @@ entry:
   %cmp4 = icmp sgt i32 %n, 0
   br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end15
 
+; CHECK: 'Dependence Analysis' for function 'gcd7'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - flow [* *|<]!
 ; CHECK: da analyze - confused!
@@ -500,6 +508,7 @@ entry:
   %cmp4 = icmp sgt i32 %n, 0
   br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end15
 
+; CHECK: 'Dependence Analysis' for function 'gcd8'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -578,6 +587,7 @@ entry:
   %cmp4 = icmp eq i32 %n, 0
   br i1 %cmp4, label %for.end15, label %for.cond1.preheader.preheader
 
+; CHECK: 'Dependence Analysis' for function 'gcd9'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - flow [* *|<]!
 ; CHECK: da analyze - confused!
diff --git a/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll b/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll
index 81e6189..5443909 100644
--- a/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll
+++ b/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll
@@ -15,6 +15,7 @@ entry:
   %cmp4 = icmp eq i64 %n1, 0
   br i1 %cmp4, label %for.cond1.preheader, label %for.body.preheader
 
+; CHECK: 'Dependence Analysis' for function 'symbolicrdiv0'
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -77,6 +78,7 @@ entry:
   %cmp4 = icmp eq i64 %n1, 0
   br i1 %cmp4, label %for.cond2.preheader, label %for.body.preheader
 
+; CHECK: 'Dependence Analysis' for function 'symbolicrdiv1'
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -141,6 +143,7 @@ entry:
   %cmp4 = icmp eq i64 %n1, 0
   br i1 %cmp4, label %for.cond1.preheader, label %for.body.preheader
 
+; CHECK: 'Dependence Analysis' for function 'symbolicrdiv2'
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -203,6 +206,7 @@ entry:
   %cmp4 = icmp eq i64 %n1, 0
   br i1 %cmp4, label %for.cond1.preheader, label %for.body.preheader
 
+; CHECK: 'Dependence Analysis' for function 'symbolicrdiv3'
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -263,6 +267,7 @@ entry:
   %cmp4 = icmp eq i64 %n1, 0
   br i1 %cmp4, label %for.cond1.preheader, label %for.body.preheader
 
+; CHECK: 'Dependence Analysis' for function 'symbolicrdiv4'
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -324,6 +329,7 @@ entry:
   %cmp4 = icmp eq i64 %n1, 0
   br i1 %cmp4, label %for.cond1.preheader, label %for.body.preheader
 
+; CHECK: 'Dependence Analysis' for function 'symbolicrdiv5'
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
@@ -385,6 +391,7 @@ entry:
   %cmp4 = icmp eq i64 %n1, 0
   br i1 %cmp4, label %for.end7, label %for.cond1.preheader.preheader
 
+; CHECK: 'Dependence Analysis' for function 'symbolicrdiv6'
 ; CHECK: da analyze - output [* *]!
 ; CHECK: da analyze - none!
 ; CHECK: da analyze - confused!
diff --git a/test/Analysis/Dominators/lit.local.cfg b/test/Analysis/Dominators/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Analysis/Dominators/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Analysis/GlobalsModRef/lit.local.cfg b/test/Analysis/GlobalsModRef/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Analysis/GlobalsModRef/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Analysis/Lint/check-zero-divide.ll b/test/Analysis/Lint/check-zero-divide.ll
new file mode 100644
index 0000000..f4e79ed
--- /dev/null
+++ b/test/Analysis/Lint/check-zero-divide.ll
@@ -0,0 +1,78 @@
+; RUN: opt -lint -disable-output %s 2>&1 | FileCheck %s
+
+define <2 x i32> @use_vector_sdiv(<2 x i32> %a) nounwind {
+  %b = sdiv <2 x i32> %a, <i32 5, i32 8>
+  ret <2 x i32> %b
+}
+
+define <2 x i32> @use_vector_srem(<2 x i32> %a) nounwind {
+  %b = srem <2 x i32> %a, <i32 5, i32 8>
+  ret <2 x i32> %b
+}
+
+define <2 x i32> @use_vector_udiv(<2 x i32> %a) nounwind {
+  %b = udiv <2 x i32> %a, <i32 5, i32 8>
+  ret <2 x i32> %b
+}
+
+define <2 x i32> @use_vector_urem(<2 x i32> %a) nounwind {
+  %b = urem <2 x i32> %a, <i32 5, i32 8>
+  ret <2 x i32> %b
+}
+
+define i32 @use_sdiv_by_zero(i32 %a) nounwind {
+; CHECK: Undefined behavior: Division by zero
+; CHECK-NEXT: %b = sdiv i32 %a, 0
+  %b = sdiv i32 %a, 0
+  ret i32 %b
+}
+
+define i32 @use_sdiv_by_zeroinitializer(i32 %a) nounwind {
+; CHECK: Undefined behavior: Division by zero
+; CHECK-NEXT: %b = sdiv i32 %a, 0
+  %b = sdiv i32 %a, zeroinitializer
+   ret i32 %b
+}
+
+define <2 x i32> @use_vector_sdiv_by_zero_x(<2 x i32> %a) nounwind {
+; CHECK: Undefined behavior: Division by zero
+; CHECK-NEXT: %b = sdiv <2 x i32> %a, <i32 0, i32 5>
+  %b = sdiv <2 x i32> %a, <i32 0, i32 5>
+  ret <2 x i32> %b
+}
+
+define <2 x i32> @use_vector_sdiv_by_zero_y(<2 x i32> %a) nounwind {
+; CHECK: Undefined behavior: Division by zero
+; CHECK-NEXT:  %b = sdiv <2 x i32> %a, <i32 4, i32 0>
+  %b = sdiv <2 x i32> %a, <i32 4, i32 0>
+  ret <2 x i32> %b
+}
+
+define <2 x i32> @use_vector_sdiv_by_zero_xy(<2 x i32> %a) nounwind {
+; CHECK: Undefined behavior: Division by zero
+; CHECK-NEXT: %b = sdiv <2 x i32> %a, zeroinitializer
+  %b = sdiv <2 x i32> %a, <i32 0, i32 0>
+  ret <2 x i32> %b
+}
+
+define <2 x i32> @use_vector_sdiv_by_undef_x(<2 x i32> %a) nounwind {
+; CHECK: Undefined behavior: Division by zero
+; CHECK-NEXT: %b = sdiv <2 x i32> %a, <i32 undef, i32 5>
+  %b = sdiv <2 x i32> %a, <i32 undef, i32 5>
+  ret <2 x i32> %b
+}
+
+define <2 x i32> @use_vector_sdiv_by_undef_y(<2 x i32> %a) nounwind {
+; CHECK: Undefined behavior: Division by zero
+; CHECK-NEXT: %b = sdiv <2 x i32> %a, <i32 5, i32 undef>
+  %b = sdiv <2 x i32> %a, <i32 5, i32 undef>
+  ret <2 x i32> %b
+}
+
+define <2 x i32> @use_vector_sdiv_by_undef_xy(<2 x i32> %a) nounwind {
+; CHECK: Undefined behavior: Division by zero
+; CHECK-NEXT: %b = sdiv <2 x i32> %a, undef
+  %b = sdiv <2 x i32> %a, <i32 undef, i32 undef>
+  ret <2 x i32> %b
+}
+
diff --git a/test/Analysis/DependenceAnalysis/lit.local.cfg b/test/Analysis/Lint/lit.local.cfg
index c6106e4..c6106e4 100644
--- a/test/Analysis/DependenceAnalysis/lit.local.cfg
+++ b/test/Analysis/Lint/lit.local.cfg
diff --git a/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll b/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll
index 7119007..a87bab7 100644
--- a/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll
+++ b/test/Analysis/LoopInfo/2003-05-15-NestingProblem.ll
@@ -1,8 +1,9 @@
 ; This testcase was incorrectly computing that the loopentry.7 loop was
 ; not a child of the loopentry.6 loop.
 ;
-; RUN: opt < %s -analyze -loops | \
-; RUN:   grep "^            Loop at depth 4 containing: %loopentry.7<header><latch><exiting>"
+; RUN: opt < %s -analyze -loops | FileCheck %s
+
+; CHECK: Loop at depth 4 containing: %loopentry.7<header><latch><exiting>
 
 define void @getAndMoveToFrontDecode() {
 	br label %endif.2
diff --git a/test/Analysis/LoopInfo/lit.local.cfg b/test/Analysis/LoopInfo/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Analysis/LoopInfo/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Analysis/MemoryDependenceAnalysis/lit.local.cfg b/test/Analysis/MemoryDependenceAnalysis/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Analysis/MemoryDependenceAnalysis/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Analysis/PostDominators/lit.local.cfg b/test/Analysis/PostDominators/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Analysis/PostDominators/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Analysis/PostDominators/pr1098.ll b/test/Analysis/PostDominators/pr1098.ll
index afb4776..2eed213 100644
--- a/test/Analysis/PostDominators/pr1098.ll
+++ b/test/Analysis/PostDominators/pr1098.ll
@@ -1,7 +1,8 @@
-; RUN: opt < %s -postdomtree -analyze | grep entry
+; RUN: opt < %s -postdomtree -analyze | FileCheck %s
 ; PR932
 
 define void @foo(i1 %x) {
+; CHECK: entry
 entry:
         br i1 %x, label %bb1, label %bb0
 bb0:            ; preds = %entry, bb0
diff --git a/test/Analysis/Profiling/edge-profiling.ll b/test/Analysis/Profiling/edge-profiling.ll
deleted file mode 100644
index cbaf476..0000000
--- a/test/Analysis/Profiling/edge-profiling.ll
+++ /dev/null
@@ -1,139 +0,0 @@
-; Test the edge profiling instrumentation.
-; RUN: opt < %s -insert-edge-profiling -S | FileCheck %s
-
-; ModuleID = '<stdin>'
-
-@.str = private constant [12 x i8] c"hello world\00", align 1 ; <[12 x i8]*> [#uses=1]
-@.str1 = private constant [6 x i8] c"franz\00", align 1 ; <[6 x i8]*> [#uses=1]
-@.str2 = private constant [9 x i8] c"argc > 2\00", align 1 ; <[9 x i8]*> [#uses=1]
-@.str3 = private constant [9 x i8] c"argc = 1\00", align 1 ; <[9 x i8]*> [#uses=1]
-@.str4 = private constant [6 x i8] c"fritz\00", align 1 ; <[6 x i8]*> [#uses=1]
-@.str5 = private constant [10 x i8] c"argc <= 1\00", align 1 ; <[10 x i8]*> [#uses=1]
-; CHECK:@EdgeProfCounters
-; CHECK:[19 x i32] 
-; CHECK:zeroinitializer
-
-define void @oneblock() nounwind {
-entry:
-; CHECK:entry:
-; CHECK:%OldFuncCounter
-; CHECK:load 
-; CHECK:getelementptr
-; CHECK:@EdgeProfCounters
-; CHECK:i32 0
-; CHECK:i32 0
-; CHECK:%NewFuncCounter
-; CHECK:add
-; CHECK:%OldFuncCounter
-; CHECK:store 
-; CHECK:%NewFuncCounter
-; CHECK:getelementptr
-; CHECK:@EdgeProfCounters
-  %0 = call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.str, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  ret void
-}
-
-declare i32 @puts(i8*)
-
-define i32 @main(i32 %argc, i8** %argv) nounwind {
-entry:
-; CHECK:entry:
-  %argc_addr = alloca i32                         ; <i32*> [#uses=4]
-  %argv_addr = alloca i8**                        ; <i8***> [#uses=1]
-  %retval = alloca i32                            ; <i32*> [#uses=2]
-  %j = alloca i32                                 ; <i32*> [#uses=4]
-  %i = alloca i32                                 ; <i32*> [#uses=4]
-  %0 = alloca i32                                 ; <i32*> [#uses=2]
-; CHECK:call 
-; CHECK:@llvm_start_edge_profiling
-; CHECK:@EdgeProfCounters
-  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  store i32 %argc, i32* %argc_addr
-  store i8** %argv, i8*** %argv_addr
-  store i32 0, i32* %i, align 4
-  br label %bb10
-
-bb:                                               ; preds = %bb10
-; CHECK:bb:
-  %1 = load i32* %argc_addr, align 4              ; <i32> [#uses=1]
-  %2 = icmp sgt i32 %1, 1                         ; <i1> [#uses=1]
-  br i1 %2, label %bb1, label %bb8
-
-bb1:                                              ; preds = %bb
-; CHECK:bb1:
-  store i32 0, i32* %j, align 4
-  br label %bb6
-
-bb2:                                              ; preds = %bb6
-; CHECK:bb2:
-  %3 = call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str1, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  %4 = load i32* %argc_addr, align 4              ; <i32> [#uses=1]
-  %5 = icmp sgt i32 %4, 2                         ; <i1> [#uses=1]
-  br i1 %5, label %bb3, label %bb4
-
-bb3:                                              ; preds = %bb2
-; CHECK:bb3:
-  %6 = call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @.str2, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  br label %bb5
-
-bb4:                                              ; preds = %bb2
-; CHECK:bb4:
-  %7 = call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @.str3, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  br label %bb11
-
-bb5:                                              ; preds = %bb3
-; CHECK:bb5:
-  %8 = call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str4, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  %9 = load i32* %j, align 4                      ; <i32> [#uses=1]
-  %10 = add nsw i32 %9, 1                         ; <i32> [#uses=1]
-  store i32 %10, i32* %j, align 4
-  br label %bb6
-
-bb6:                                              ; preds = %bb5, %bb1
-; CHECK:bb6:
-  %11 = load i32* %j, align 4                     ; <i32> [#uses=1]
-  %12 = load i32* %argc_addr, align 4             ; <i32> [#uses=1]
-  %13 = icmp slt i32 %11, %12                     ; <i1> [#uses=1]
-  br i1 %13, label %bb2, label %bb7
-
-bb7:                                              ; preds = %bb6
-; CHECK:bb7:
-  br label %bb9
-
-bb8:                                              ; preds = %bb
-; CHECK:bb8:
-  %14 = call i32 @puts(i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  br label %bb9
-
-bb9:                                              ; preds = %bb8, %bb7
-; CHECK:bb9:
-  %15 = load i32* %i, align 4                     ; <i32> [#uses=1]
-  %16 = add nsw i32 %15, 1                        ; <i32> [#uses=1]
-  store i32 %16, i32* %i, align 4
-  br label %bb10
-
-bb10:                                             ; preds = %bb9, %entry
-; CHECK:bb10:
-  %17 = load i32* %i, align 4                     ; <i32> [#uses=1]
-  %18 = icmp ne i32 %17, 3                        ; <i1> [#uses=1]
-  br i1 %18, label %bb, label %bb11
-; CHECK:br
-; CHECK:label %bb10.bb11_crit_edge
-
-; CHECK:bb10.bb11_crit_edge:
-; CHECK:br
-; CHECK:label %bb11
-
-bb11:                                             ; preds = %bb10, %bb4
-; CHECK:bb11:
-  call void @oneblock() nounwind
-  store i32 0, i32* %0, align 4
-  %19 = load i32* %0, align 4                     ; <i32> [#uses=1]
-  store i32 %19, i32* %retval, align 4
-  br label %return
-
-return:                                           ; preds = %bb11
-; CHECK:return:
-  %retval12 = load i32* %retval                   ; <i32> [#uses=1]
-  ret i32 %retval12
-}
diff --git a/test/Analysis/Profiling/lit.local.cfg b/test/Analysis/Profiling/lit.local.cfg
deleted file mode 100644
index d40fa4f..0000000
--- a/test/Analysis/Profiling/lit.local.cfg
+++ /dev/null
@@ -1,11 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
-def getRoot(config):
-    if not config.parent:
-        return config
-    return getRoot(config.parent)
-
-root = getRoot(config)
-
-if 'hexagon' in root.target_triple:
-    config.unsupported = True
diff --git a/test/Analysis/Profiling/load-branch-weights-ifs.ll b/test/Analysis/Profiling/load-branch-weights-ifs.ll
deleted file mode 100644
index 7ed090b..0000000
--- a/test/Analysis/Profiling/load-branch-weights-ifs.ll
+++ /dev/null
@@ -1,122 +0,0 @@
-; RUN: opt -insert-edge-profiling -o %t1 < %s
-; RUN: rm -f %t1.prof_data
-; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
-; RUN:     -llvmprof-output %t1.prof_data
-; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
-; RUN:     | FileCheck %s
-; RUN: rm -f %t1.prof_data
-
-; FIXME: profile_rt.dll could be built on win32.
-; REQUIRES: loadable_module
-
-;; func_mod - Branch taken 6 times in 7.
-define i32 @func_mod(i32 %N) nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %N.addr = alloca i32, align 4
-  store i32 %N, i32* %N.addr, align 4
-  %0 = load i32* %N.addr, align 4
-  %rem = srem i32 %0, 7
-  %tobool = icmp ne i32 %rem, 0
-  br i1 %tobool, label %if.then, label %if.else
-; CHECK: br i1 %tobool, label %if.then, label %if.else, !prof !0
-
-if.then:
-  store i32 1, i32* %retval
-  br label %return
-
-if.else:
-  store i32 0, i32* %retval
-  br label %return
-
-return:
-  %1 = load i32* %retval
-  ret i32 %1
-}
-
-;; func_const_true - conditional branch which 100% taken probability.
-define i32 @func_const_true(i32 %N) nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %N.addr = alloca i32, align 4
-  store i32 %N, i32* %N.addr, align 4
-  %0 = load i32* %N.addr, align 4
-  %cmp = icmp eq i32 %0, 1
-  br i1 %cmp, label %if.then, label %if.end
-; CHECK: br i1 %cmp, label %if.then, label %if.end, !prof !1
-
-if.then:
-  store i32 1, i32* %retval
-  br label %return
-
-if.end:
-  store i32 0, i32* %retval
-  br label %return
-
-return:
-  %1 = load i32* %retval
-  ret i32 %1
-}
-
-;; func_const_true - conditional branch which 100% not-taken probability.
-define i32 @func_const_false(i32 %N) nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %N.addr = alloca i32, align 4
-  store i32 %N, i32* %N.addr, align 4
-  %0 = load i32* %N.addr, align 4
-  %cmp = icmp eq i32 %0, 1
-  br i1 %cmp, label %if.then, label %if.end
-; CHECK: br i1 %cmp, label %if.then, label %if.end, !prof !2
-
-if.then:
-  store i32 1, i32* %retval
-  br label %return
-
-if.end:
-  store i32 0, i32* %retval
-  br label %return
-
-return:
-  %1 = load i32* %retval
-  ret i32 %1
-}
-
-define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %argc.addr = alloca i32, align 4
-  %argv.addr = alloca i8**, align 8
-  %loop = alloca i32, align 4
-  store i32 0, i32* %retval
-  store i32 0, i32* %loop, align 4
-  br label %for.cond
-
-for.cond:
-  %0 = load i32* %loop, align 4
-  %cmp = icmp slt i32 %0, 7000
-  br i1 %cmp, label %for.body, label %for.end
-; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !3
-
-for.body:
-  %1 = load i32* %loop, align 4
-  %call = call i32 @func_mod(i32 %1)
-  br label %for.inc
-
-for.inc:
-  %2 = load i32* %loop, align 4
-  %inc = add nsw i32 %2, 1
-  store i32 %inc, i32* %loop, align 4
-  br label %for.cond
-
-for.end:
-  %call1 = call i32 @func_const_true(i32 1)
-  %call2 = call i32 @func_const_false(i32 0)
-  ret i32 0
-}
-
-; CHECK: !0 = metadata !{metadata !"branch_weights", i32 6000, i32 1000}
-; CHECK: !1 = metadata !{metadata !"branch_weights", i32 1, i32 0}
-; CHECK: !2 = metadata !{metadata !"branch_weights", i32 0, i32 1}
-; CHECK: !3 = metadata !{metadata !"branch_weights", i32 7000, i32 1}
-; CHECK-NOT: !4
diff --git a/test/Analysis/Profiling/load-branch-weights-loops.ll b/test/Analysis/Profiling/load-branch-weights-loops.ll
deleted file mode 100644
index 9d1925a..0000000
--- a/test/Analysis/Profiling/load-branch-weights-loops.ll
+++ /dev/null
@@ -1,188 +0,0 @@
-; RUN: opt -insert-edge-profiling -o %t1 < %s
-; RUN: rm -f %t1.prof_data
-; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
-; RUN:     -llvmprof-output %t1.prof_data
-; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
-; RUN:     | FileCheck %s
-; RUN: rm -f %t1.prof_data
-
-; FIXME: profile_rt.dll could be built on win32.
-; REQUIRES: loadable_module
-
-;; func_for - Test branch probabilities for a vanilla for loop.
-define i32 @func_for(i32 %N) nounwind uwtable {
-entry:
-  %N.addr = alloca i32, align 4
-  %ret = alloca i32, align 4
-  %loop = alloca i32, align 4
-  store i32 %N, i32* %N.addr, align 4
-  store i32 0, i32* %ret, align 4
-  store i32 0, i32* %loop, align 4
-  br label %for.cond
-
-for.cond:
-  %0 = load i32* %loop, align 4
-  %1 = load i32* %N.addr, align 4
-  %cmp = icmp slt i32 %0, %1
-  br i1 %cmp, label %for.body, label %for.end
-; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !0
-
-for.body:
-  %2 = load i32* %N.addr, align 4
-  %3 = load i32* %ret, align 4
-  %add = add nsw i32 %3, %2
-  store i32 %add, i32* %ret, align 4
-  br label %for.inc
-
-for.inc:
-  %4 = load i32* %loop, align 4
-  %inc = add nsw i32 %4, 1
-  store i32 %inc, i32* %loop, align 4
-  br label %for.cond
-
-for.end:
-  %5 = load i32* %ret, align 4
-  ret i32 %5
-}
-
-;; func_for_odd - Test branch probabilities for a for loop with a continue and
-;; a break.
-define i32 @func_for_odd(i32 %N) nounwind uwtable {
-entry:
-  %N.addr = alloca i32, align 4
-  %ret = alloca i32, align 4
-  %loop = alloca i32, align 4
-  store i32 %N, i32* %N.addr, align 4
-  store i32 0, i32* %ret, align 4
-  store i32 0, i32* %loop, align 4
-  br label %for.cond
-
-for.cond:
-  %0 = load i32* %loop, align 4
-  %1 = load i32* %N.addr, align 4
-  %cmp = icmp slt i32 %0, %1
-  br i1 %cmp, label %for.body, label %for.end
-; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !1
-
-for.body:
-  %2 = load i32* %loop, align 4
-  %rem = srem i32 %2, 10
-  %tobool = icmp ne i32 %rem, 0
-  br i1 %tobool, label %if.then, label %if.end
-; CHECK: br i1 %tobool, label %if.then, label %if.end, !prof !2
-
-if.then:
-  br label %for.inc
-
-if.end:
-  %3 = load i32* %loop, align 4
-  %cmp1 = icmp eq i32 %3, 500
-  br i1 %cmp1, label %if.then2, label %if.end3
-; CHECK: br i1 %cmp1, label %if.then2, label %if.end3, !prof !3
-
-if.then2:
-  br label %for.end
-
-if.end3:
-  %4 = load i32* %N.addr, align 4
-  %5 = load i32* %ret, align 4
-  %add = add nsw i32 %5, %4
-  store i32 %add, i32* %ret, align 4
-  br label %for.inc
-
-for.inc:
-  %6 = load i32* %loop, align 4
-  %inc = add nsw i32 %6, 1
-  store i32 %inc, i32* %loop, align 4
-  br label %for.cond
-
-for.end:
-  %7 = load i32* %ret, align 4
-  ret i32 %7
-}
-
-;; func_while - Test branch probability in a vanilla while loop.
-define i32 @func_while(i32 %N) nounwind uwtable {
-entry:
-  %N.addr = alloca i32, align 4
-  %ret = alloca i32, align 4
-  %loop = alloca i32, align 4
-  store i32 %N, i32* %N.addr, align 4
-  store i32 0, i32* %ret, align 4
-  store i32 0, i32* %loop, align 4
-  br label %while.cond
-
-while.cond:
-  %0 = load i32* %loop, align 4
-  %1 = load i32* %N.addr, align 4
-  %cmp = icmp slt i32 %0, %1
-  br i1 %cmp, label %while.body, label %while.end
-; CHECK: br i1 %cmp, label %while.body, label %while.end, !prof !0
-
-while.body:
-  %2 = load i32* %N.addr, align 4
-  %3 = load i32* %ret, align 4
-  %add = add nsw i32 %3, %2
-  store i32 %add, i32* %ret, align 4
-  %4 = load i32* %loop, align 4
-  %inc = add nsw i32 %4, 1
-  store i32 %inc, i32* %loop, align 4
-  br label %while.cond
-
-while.end:
-  %5 = load i32* %ret, align 4
-  ret i32 %5
-}
-
-;; func_while - Test branch probability in a vanilla do-while loop.
-define i32 @func_do_while(i32 %N) nounwind uwtable {
-entry:
-  %N.addr = alloca i32, align 4
-  %ret = alloca i32, align 4
-  %loop = alloca i32, align 4
-  store i32 %N, i32* %N.addr, align 4
-  store i32 0, i32* %ret, align 4
-  store i32 0, i32* %loop, align 4
-  br label %do.body
-
-do.body:
-  %0 = load i32* %N.addr, align 4
-  %1 = load i32* %ret, align 4
-  %add = add nsw i32 %1, %0
-  store i32 %add, i32* %ret, align 4
-  %2 = load i32* %loop, align 4
-  %inc = add nsw i32 %2, 1
-  store i32 %inc, i32* %loop, align 4
-  br label %do.cond
-
-do.cond:
-  %3 = load i32* %loop, align 4
-  %4 = load i32* %N.addr, align 4
-  %cmp = icmp slt i32 %3, %4
-  br i1 %cmp, label %do.body, label %do.end
-; CHECK: br i1 %cmp, label %do.body, label %do.end, !prof !4
-
-do.end:
-  %5 = load i32* %ret, align 4
-  ret i32 %5
-}
-
-define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %argc.addr = alloca i32, align 4
-  %argv.addr = alloca i8**, align 8
-  store i32 0, i32* %retval
-  %call = call i32 @func_for(i32 1000)
-  %call1 = call i32 @func_for_odd(i32 1000)
-  %call2 = call i32 @func_while(i32 1000)
-  %call3 = call i32 @func_do_while(i32 1000)
-  ret i32 0
-}
-
-!0 = metadata !{metadata !"branch_weights", i32 1000, i32 1}
-!1 = metadata !{metadata !"branch_weights", i32 501, i32 0}
-!2 = metadata !{metadata !"branch_weights", i32 450, i32 51}
-!3 = metadata !{metadata !"branch_weights", i32 1, i32 50}
-!4 = metadata !{metadata !"branch_weights", i32 999, i32 1}
-; CHECK-NOT: !5
diff --git a/test/Analysis/Profiling/load-branch-weights-switches.ll b/test/Analysis/Profiling/load-branch-weights-switches.ll
deleted file mode 100644
index 5587c71..0000000
--- a/test/Analysis/Profiling/load-branch-weights-switches.ll
+++ /dev/null
@@ -1,165 +0,0 @@
-; RUN: opt -insert-edge-profiling -o %t1 < %s
-; RUN: rm -f %t1.prof_data
-; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
-; RUN:     -llvmprof-output %t1.prof_data
-; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
-; RUN:     | FileCheck %s
-; RUN: rm -f %t1.prof_data
-
-; FIXME: profile_rt.dll could be built on win32.
-; REQUIRES: loadable_module
-
-;; func_switch - Test branch probabilities for a switch instruction with an
-;; even chance of taking each case (or no case).
-define i32 @func_switch(i32 %N) nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %N.addr = alloca i32, align 4
-  store i32 %N, i32* %N.addr, align 4
-  %0 = load i32* %N.addr, align 4
-  %rem = srem i32 %0, 4
-  switch i32 %rem, label %sw.epilog [
-    i32 0, label %sw.bb
-    i32 1, label %sw.bb1
-    i32 2, label %sw.bb2
-  ]
-; CHECK: ], !prof !0
-
-sw.bb:
-  store i32 5, i32* %retval
-  br label %return
-
-sw.bb1:
-  store i32 6, i32* %retval
-  br label %return
-
-sw.bb2:
-  store i32 7, i32* %retval
-  br label %return
-
-sw.epilog:
-  store i32 8, i32* %retval
-  br label %return
-
-return:
-  %1 = load i32* %retval
-  ret i32 %1
-}
-
-;; func_switch_switch - Test branch probabilities in a switch-instruction that
-;; leads to further switch instructions.  The first-tier switch occludes some
-;; possibilities in the second-tier switches, leading to some branches having a
-;; 0 probability.
-define i32 @func_switch_switch(i32 %N) nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %N.addr = alloca i32, align 4
-  store i32 %N, i32* %N.addr, align 4
-  %0 = load i32* %N.addr, align 4
-  %rem = srem i32 %0, 2
-  switch i32 %rem, label %sw.default11 [
-    i32 0, label %sw.bb
-    i32 1, label %sw.bb5
-  ]
-; CHECK: ], !prof !1
-
-sw.bb:
-  %1 = load i32* %N.addr, align 4
-  %rem1 = srem i32 %1, 4
-  switch i32 %rem1, label %sw.default [
-    i32 0, label %sw.bb2
-    i32 1, label %sw.bb3
-    i32 2, label %sw.bb4
-  ]
-; CHECK: ], !prof !2
-
-sw.bb2:
-  store i32 5, i32* %retval
-  br label %return
-
-sw.bb3:
-  store i32 6, i32* %retval
-  br label %return
-
-sw.bb4:
-  store i32 7, i32* %retval
-  br label %return
-
-sw.default:
-  store i32 8, i32* %retval
-  br label %return
-
-sw.bb5:
-  %2 = load i32* %N.addr, align 4
-  %rem6 = srem i32 %2, 4
-  switch i32 %rem6, label %sw.default10 [
-    i32 0, label %sw.bb7
-    i32 1, label %sw.bb8
-    i32 2, label %sw.bb9
-  ]
-; CHECK: ], !prof !3
-
-sw.bb7:
-  store i32 9, i32* %retval
-  br label %return
-
-sw.bb8:
-  store i32 10, i32* %retval
-  br label %return
-
-sw.bb9:
-  store i32 11, i32* %retval
-  br label %return
-
-sw.default10:
-  store i32 12, i32* %retval
-  br label %return
-
-sw.default11:
-  store i32 13, i32* %retval
-  br label %return
-
-return:
-  %3 = load i32* %retval
-  ret i32 %3
-}
-
-define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
-entry:
-  %retval = alloca i32, align 4
-  %argc.addr = alloca i32, align 4
-  %argv.addr = alloca i8**, align 8
-  %loop = alloca i32, align 4
-  store i32 0, i32* %retval
-  store i32 0, i32* %loop, align 4
-  br label %for.cond
-
-for.cond:
-  %0 = load i32* %loop, align 4
-  %cmp = icmp slt i32 %0, 4000
-  br i1 %cmp, label %for.body, label %for.end
-; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !4
-
-for.body:
-  %1 = load i32* %loop, align 4
-  %call = call i32 @func_switch(i32 %1)
-  %2 = load i32* %loop, align 4
-  %call1 = call i32 @func_switch_switch(i32 %2)
-  br label %for.inc
-
-for.inc:
-  %3 = load i32* %loop, align 4
-  %inc = add nsw i32 %3, 1
-  store i32 %inc, i32* %loop, align 4
-  br label %for.cond
-
-for.end:
-  ret i32 0
-}
-
-; CHECK: !0 = metadata !{metadata !"branch_weights", i32 1000, i32 1000, i32 1000, i32 1000}
-; CHECK: !1 = metadata !{metadata !"branch_weights", i32 0, i32 2000, i32 2000}
-; CHECK: !2 = metadata !{metadata !"branch_weights", i32 0, i32 1000, i32 0, i32 1000}
-; CHECK: !3 = metadata !{metadata !"branch_weights", i32 1000, i32 0, i32 1000, i32 0}
-; CHECK: !4 = metadata !{metadata !"branch_weights", i32 4000, i32 1}
-; CHECK-NOT: !5
diff --git a/test/Analysis/Profiling/profiling-tool-chain.ll b/test/Analysis/Profiling/profiling-tool-chain.ll
deleted file mode 100644
index 9135a85..0000000
--- a/test/Analysis/Profiling/profiling-tool-chain.ll
+++ /dev/null
@@ -1,212 +0,0 @@
-; RUN: llvm-as %s -o %t1
-
-; FIXME: The RUX parts of the test are disabled for now, they aren't working on
-; llvm-gcc-x86_64-darwin10-selfhost.
-
-; Test the edge optimal profiling instrumentation.
-; RUN: opt %t1 -insert-optimal-edge-profiling -o %t2
-; RUX: llvm-dis < %t2 | FileCheck --check-prefix=INST %s
-
-; Test the creation, reading and displaying of profile
-; RUX: rm -f llvmprof.out
-; RUX: lli -load %llvmshlibdir/profile_rt%shlibext %t2
-; RUX: lli -load %llvmshlibdir/profile_rt%shlibext %t2 1 2
-; RUX: llvm-prof -print-all-code %t1 | FileCheck --check-prefix=PROF %s
-
-; Test the loaded profile also with verifier.
-; RUX  opt %t1 -profile-loader -profile-verifier -o %t3
-
-; Test profile estimator.
-; RUN: opt %t1 -profile-estimator -profile-verifier -o %t3
-
-; PROF:  1.     2/4 oneblock
-; PROF:  2.     2/4 main
-; PROF:  1. 15.7895%    12/76	main() - bb6
-; PROF:  2. 11.8421%     9/76	main() - bb2
-; PROF:  3. 11.8421%     9/76	main() - bb3
-; PROF:  4. 11.8421%     9/76	main() - bb5
-; PROF:  5. 10.5263%     8/76	main() - bb10
-; PROF:  6. 7.89474%     6/76	main() - bb
-; PROF:  7. 7.89474%     6/76	main() - bb9
-; PROF:  8. 3.94737%     3/76	main() - bb1
-; PROF:  9. 3.94737%     3/76	main() - bb7
-; PROF: 10. 3.94737%     3/76	main() - bb8
-; PROF: 11. 2.63158%     2/76	oneblock() - entry
-; PROF: 12. 2.63158%     2/76	main() - entry
-; PROF: 13. 2.63158%     2/76	main() - bb11
-; PROF: 14. 2.63158%     2/76	main() - return
-
-; ModuleID = '<stdin>'
-
-@.str = private constant [12 x i8] c"hello world\00", align 1 ; <[12 x i8]*> [#uses=1]
-@.str1 = private constant [6 x i8] c"franz\00", align 1 ; <[6 x i8]*> [#uses=1]
-@.str2 = private constant [9 x i8] c"argc > 2\00", align 1 ; <[9 x i8]*> [#uses=1]
-@.str3 = private constant [9 x i8] c"argc = 1\00", align 1 ; <[9 x i8]*> [#uses=1]
-@.str4 = private constant [6 x i8] c"fritz\00", align 1 ; <[6 x i8]*> [#uses=1]
-@.str5 = private constant [10 x i8] c"argc <= 1\00", align 1 ; <[10 x i8]*> [#uses=1]
-; INST:@OptEdgeProfCounters
-; INST:[21 x i32]
-; INST:[i32 0,
-; INST:i32 -1,
-; INST:i32 -1,
-; INST:i32 -1,
-; INST:i32 -1,
-; INST:i32 -1,
-; INST:i32 -1,
-; INST:i32 -1,
-; INST:i32 -1,
-; INST:i32 0,
-; INST:i32 0,
-; INST:i32 -1,
-; INST:i32 -1,
-; INST:i32 -1,
-; INST:i32 0,
-; INST:i32 0,
-; INST:i32 -1,
-; INST:i32 -1,
-; INST:i32 0,
-; INST:i32 -1,
-; INST:i32 -1]
-
-; PROF:;;; %oneblock called 2 times.
-; PROF:;;;
-define void @oneblock() nounwind {
-entry:
-; PROF:entry:
-; PROF:	;;; Basic block executed 2 times.
-  %0 = call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.str, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  ret void
-}
-
-declare i32 @puts(i8*)
-
-; PROF:;;; %main called 2 times.
-; PROF:;;;
-define i32 @main(i32 %argc, i8** %argv) nounwind {
-entry:
-; PROF:entry:
-; PROF:	;;; Basic block executed 2 times.
-  %argc_addr = alloca i32                         ; <i32*> [#uses=4]
-  %argv_addr = alloca i8**                        ; <i8***> [#uses=1]
-  %retval = alloca i32                            ; <i32*> [#uses=2]
-  %j = alloca i32                                 ; <i32*> [#uses=4]
-  %i = alloca i32                                 ; <i32*> [#uses=4]
-  %0 = alloca i32                                 ; <i32*> [#uses=2]
-; INST:call 
-; INST:@llvm_start_opt_edge_profiling
-; INST:@OptEdgeProfCounters
-  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  store i32 %argc, i32* %argc_addr
-  store i8** %argv, i8*** %argv_addr
-  store i32 0, i32* %i, align 4
-  br label %bb10
-; PROF:	;;; Out-edge counts: [2.000000e+00 -> bb10]
-
-bb:                                               ; preds = %bb10
-; PROF:bb:
-; PROF:	;;; Basic block executed 6 times.
-  %1 = load i32* %argc_addr, align 4              ; <i32> [#uses=1]
-  %2 = icmp sgt i32 %1, 1                         ; <i1> [#uses=1]
-  br i1 %2, label %bb1, label %bb8
-; PROF:	;;; Out-edge counts: [3.000000e+00 -> bb1] [3.000000e+00 -> bb8]
-
-bb1:                                              ; preds = %bb
-; PROF:bb1:
-; PROF:	;;; Basic block executed 3 times.
-  store i32 0, i32* %j, align 4
-  br label %bb6
-; PROF:	;;; Out-edge counts: [3.000000e+00 -> bb6]
-
-bb2:                                              ; preds = %bb6
-; PROF:bb2:
-; PROF:	;;; Basic block executed 9 times.
-  %3 = call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str1, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  %4 = load i32* %argc_addr, align 4              ; <i32> [#uses=1]
-  %5 = icmp sgt i32 %4, 2                         ; <i1> [#uses=1]
-  br i1 %5, label %bb3, label %bb4
-; PROF:	;;; Out-edge counts: [9.000000e+00 -> bb3]
-
-bb3:                                              ; preds = %bb2
-; PROF:bb3:
-; PROF:	;;; Basic block executed 9 times.
-  %6 = call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @.str2, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  br label %bb5
-; PROF:	;;; Out-edge counts: [9.000000e+00 -> bb5]
-
-bb4:                                              ; preds = %bb2
-; PROF:bb4:
-; PROF:	;;; Never executed!
-  %7 = call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @.str3, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  br label %bb11
-
-bb5:                                              ; preds = %bb3
-; PROF:bb5:
-; PROF:	;;; Basic block executed 9 times.
-  %8 = call i32 @puts(i8* getelementptr inbounds ([6 x i8]* @.str4, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  %9 = load i32* %j, align 4                      ; <i32> [#uses=1]
-  %10 = add nsw i32 %9, 1                         ; <i32> [#uses=1]
-  store i32 %10, i32* %j, align 4
-  br label %bb6
-; PROF:	;;; Out-edge counts: [9.000000e+00 -> bb6]
-
-bb6:                                              ; preds = %bb5, %bb1
-; PROF:bb6:
-; PROF:	;;; Basic block executed 12 times.
-  %11 = load i32* %j, align 4                     ; <i32> [#uses=1]
-  %12 = load i32* %argc_addr, align 4             ; <i32> [#uses=1]
-  %13 = icmp slt i32 %11, %12                     ; <i1> [#uses=1]
-  br i1 %13, label %bb2, label %bb7
-; PROF:	;;; Out-edge counts: [9.000000e+00 -> bb2] [3.000000e+00 -> bb7]
-
-bb7:                                              ; preds = %bb6
-; PROF:bb7:
-; PROF:	;;; Basic block executed 3 times.
-  br label %bb9
-; PROF:	;;; Out-edge counts: [3.000000e+00 -> bb9]
-
-bb8:                                              ; preds = %bb
-; PROF:bb8:
-; PROF:	;;; Basic block executed 3 times.
-  %14 = call i32 @puts(i8* getelementptr inbounds ([10 x i8]* @.str5, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
-  br label %bb9
-; PROF:	;;; Out-edge counts: [3.000000e+00 -> bb9]
-
-bb9:                                              ; preds = %bb8, %bb7
-; PROF:bb9:
-; PROF:	;;; Basic block executed 6 times.
-  %15 = load i32* %i, align 4                     ; <i32> [#uses=1]
-  %16 = add nsw i32 %15, 1                        ; <i32> [#uses=1]
-  store i32 %16, i32* %i, align 4
-  br label %bb10
-; PROF:	;;; Out-edge counts: [6.000000e+00 -> bb10]
-
-bb10:                                             ; preds = %bb9, %entry
-; PROF:bb10:
-; PROF:	;;; Basic block executed 8 times.
-  %17 = load i32* %i, align 4                     ; <i32> [#uses=1]
-  %18 = icmp ne i32 %17, 3                        ; <i1> [#uses=1]
-  br i1 %18, label %bb, label %bb11
-; INST:br
-; INST:label %bb10.bb11_crit_edge
-; PROF:	;;; Out-edge counts: [6.000000e+00 -> bb] [2.000000e+00 -> bb11]
-
-; INST:bb10.bb11_crit_edge:
-; INST:br
-; INST:label %bb11
-
-bb11:                                             ; preds = %bb10, %bb4
-; PROF:bb11:
-; PROF:	;;; Basic block executed 2 times.
-  call void @oneblock() nounwind
-  store i32 0, i32* %0, align 4
-  %19 = load i32* %0, align 4                     ; <i32> [#uses=1]
-  store i32 %19, i32* %retval, align 4
-  br label %return
-; PROF:	;;; Out-edge counts: [2.000000e+00 -> return]
-
-return:                                           ; preds = %bb11
-; PROF:return:
-; PROF:	;;; Basic block executed 2 times.
-  %retval12 = load i32* %retval                   ; <i32> [#uses=1]
-  ret i32 %retval12
-}
diff --git a/test/Analysis/RegionInfo/lit.local.cfg b/test/Analysis/RegionInfo/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Analysis/RegionInfo/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Analysis/ScalarEvolution/2007-08-06-MisinterpretBranch.ll b/test/Analysis/ScalarEvolution/2007-08-06-MisinterpretBranch.ll
index e67e4d0..fd09fd5 100644
--- a/test/Analysis/ScalarEvolution/2007-08-06-MisinterpretBranch.ll
+++ b/test/Analysis/ScalarEvolution/2007-08-06-MisinterpretBranch.ll
@@ -1,6 +1,8 @@
-; RUN: opt < %s -indvars -adce -simplifycfg -S | grep "icmp s"
+; RUN: opt < %s -indvars -adce -simplifycfg -S | FileCheck %s
 ; PR1598
 
+; CHECK: icmp s
+
 define i32 @f(i32 %a, i32 %b, i32 %x, i32 %y) {
 entry:
 	%tmp3 = icmp eq i32 %a, %b		; <i1> [#uses=1]
diff --git a/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll b/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
index 036abf5..9e19cca 100644
--- a/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
+++ b/test/Analysis/ScalarEvolution/2007-08-06-Unsigned.ll
@@ -1,6 +1,8 @@
-; RUN: opt < %s -scalar-evolution -analyze | grep "Loop %bb: backedge-taken count is (-1 + (-1 \* %x) + %y)"
+; RUN: opt < %s -scalar-evolution -analyze | FileCheck %s
 ; PR1597
 
+; CHECK: Loop %bb: backedge-taken count is (-1 + (-1 * %x) + %y)
+
 define i32 @f(i32 %x, i32 %y) {
 entry:
         %tmp63 = icmp ult i32 %x, %y            ; <i1> [#uses=1]
diff --git a/test/Analysis/ScalarEvolution/2007-09-27-LargeStepping.ll b/test/Analysis/ScalarEvolution/2007-09-27-LargeStepping.ll
index a3192b9..b65a525 100644
--- a/test/Analysis/ScalarEvolution/2007-09-27-LargeStepping.ll
+++ b/test/Analysis/ScalarEvolution/2007-09-27-LargeStepping.ll
@@ -1,7 +1,8 @@
-; RUN: opt < %s -analyze -scalar-evolution \
-; RUN:   -scalar-evolution-max-iterations=0 | grep "backedge-taken count is 13"
+; RUN: opt < %s -analyze -scalar-evolution -scalar-evolution-max-iterations=0 | FileCheck %s
 ; PR1706
 
+; CHECK: backedge-taken count is 13
+
 define i32 @f() {
 entry:
 	br label %bb5
diff --git a/test/Analysis/ScalarEvolution/2007-11-14-SignedAddRec.ll b/test/Analysis/ScalarEvolution/2007-11-14-SignedAddRec.ll
index 514920f..a2850d8 100644
--- a/test/Analysis/ScalarEvolution/2007-11-14-SignedAddRec.ll
+++ b/test/Analysis/ScalarEvolution/2007-11-14-SignedAddRec.ll
@@ -1,6 +1,8 @@
-; RUN: opt < %s -indvars -S | grep printd | grep 1206807378
+; RUN: opt < %s -indvars -S | FileCheck %s
 ; PR1798
 
+; CHECK: printd(i32 1206807378)
+
 declare void @printd(i32)
 
 define i32 @test() {
diff --git a/test/Analysis/ScalarEvolution/2008-02-11-ReversedCondition.ll b/test/Analysis/ScalarEvolution/2008-02-11-ReversedCondition.ll
index d0644f7..6ebfa61 100644
--- a/test/Analysis/ScalarEvolution/2008-02-11-ReversedCondition.ll
+++ b/test/Analysis/ScalarEvolution/2008-02-11-ReversedCondition.ll
@@ -1,4 +1,6 @@
-; RUN: opt < %s -scalar-evolution -analyze | grep "Loop %header: backedge-taken count is (0 smax %n)"
+; RUN: opt < %s -scalar-evolution -analyze | FileCheck %s
+
+; CHECK: Loop %header: backedge-taken count is (0 smax %n)
 
 define void @foo(i32 %n) {
 entry:
diff --git a/test/Analysis/ScalarEvolution/2008-02-15-UMax.ll b/test/Analysis/ScalarEvolution/2008-02-15-UMax.ll
index 52c7985..527fd27 100644
--- a/test/Analysis/ScalarEvolution/2008-02-15-UMax.ll
+++ b/test/Analysis/ScalarEvolution/2008-02-15-UMax.ll
@@ -1,6 +1,8 @@
-; RUN: opt < %s -analyze -scalar-evolution | grep umax
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
 ; PR2003
 
+; CHECK: umax
+
 define i32 @foo(i32 %n) {
 entry:
         br label %header
diff --git a/test/Analysis/ScalarEvolution/2008-05-25-NegativeStepToZero.ll b/test/Analysis/ScalarEvolution/2008-05-25-NegativeStepToZero.ll
index 41734d7..9a05d88 100644
--- a/test/Analysis/ScalarEvolution/2008-05-25-NegativeStepToZero.ll
+++ b/test/Analysis/ScalarEvolution/2008-05-25-NegativeStepToZero.ll
@@ -1,7 +1,8 @@
-; RUN: opt < %s -analyze -scalar-evolution \
-; RUN:   -scalar-evolution-max-iterations=0 | grep "backedge-taken count is 61"
+; RUN: opt < %s -analyze -scalar-evolution -scalar-evolution-max-iterations=0 | FileCheck %s
 ; PR2364
 
+; CHECK: backedge-taken count is 61
+
 define i32 @func_6() nounwind  {
 entry:
 	br label %bb5
diff --git a/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect1.ll b/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect1.ll
index 5cf17a2..dcf8fc9 100644
--- a/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect1.ll
+++ b/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect1.ll
@@ -1,6 +1,9 @@
-; RUN: opt < %s -analyze -scalar-evolution 2>&1 | not grep smax
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | FileCheck %s
 ; PR2261
 
+; CHECK: Printing analysis 'Scalar Evolution Analysis' for function 'foo'
+; CHECK-NOT: smax
+
 @lut = common global [256 x i8] zeroinitializer, align 32		; <[256 x i8]*> [#uses=1]
 
 define void @foo(i32 %count, i32* %srcptr, i32* %dstptr) nounwind  {
diff --git a/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect2.ll b/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect2.ll
index 195dfaa..c804bd9 100644
--- a/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect2.ll
+++ b/test/Analysis/ScalarEvolution/2008-07-12-UnneededSelect2.ll
@@ -1,6 +1,9 @@
-; RUN: opt < %s -analyze -scalar-evolution 2>&1 | not grep smax
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | FileCheck %s
 ; PR2070
 
+; CHECK: Printing analysis 'Scalar Evolution Analysis' for function 'a'
+; CHECK-NOT: smax
+
 define i32 @a(i32 %x) nounwind  {
 entry:
 	icmp sgt i32 %x, 1		; <i1>:0 [#uses=1]
diff --git a/test/Analysis/ScalarEvolution/2008-07-19-InfiniteLoop.ll b/test/Analysis/ScalarEvolution/2008-07-19-InfiniteLoop.ll
index 1865c05..ad34f6c 100644
--- a/test/Analysis/ScalarEvolution/2008-07-19-InfiniteLoop.ll
+++ b/test/Analysis/ScalarEvolution/2008-07-19-InfiniteLoop.ll
@@ -1,7 +1,8 @@
-; RUN: opt < %s -analyze -scalar-evolution \
-; RUN:   -scalar-evolution-max-iterations=0 | grep Unpredictable
+; RUN: opt < %s -analyze -scalar-evolution -scalar-evolution-max-iterations=0 | FileCheck %s
 ; PR2088
 
+; CHECK: Unpredictable
+
 define void @fun() {
 entry:
         br label %loop
diff --git a/test/Analysis/ScalarEvolution/2008-07-19-WrappingIV.ll b/test/Analysis/ScalarEvolution/2008-07-19-WrappingIV.ll
index cbf200e..82b9d56 100644
--- a/test/Analysis/ScalarEvolution/2008-07-19-WrappingIV.ll
+++ b/test/Analysis/ScalarEvolution/2008-07-19-WrappingIV.ll
@@ -1,7 +1,8 @@
-; RUN: opt < %s -analyze -scalar-evolution \
-; RUN:   -scalar-evolution-max-iterations=0 | grep "backedge-taken count is 113"
+; RUN: opt < %s -analyze -scalar-evolution -scalar-evolution-max-iterations=0 | FileCheck %s
 ; PR2088
 
+; CHECK: backedge-taken count is 113
+
 define void @fun() {
 entry:
         br label %loop
diff --git a/test/Analysis/ScalarEvolution/2008-11-18-LessThanOrEqual.ll b/test/Analysis/ScalarEvolution/2008-11-18-LessThanOrEqual.ll
index c25e4a3..46c6c59 100644
--- a/test/Analysis/ScalarEvolution/2008-11-18-LessThanOrEqual.ll
+++ b/test/Analysis/ScalarEvolution/2008-11-18-LessThanOrEqual.ll
@@ -1,5 +1,6 @@
-; RUN: opt < %s -analyze -scalar-evolution 2>&1 | \
-; RUN: grep "Loop %bb: backedge-taken count is (7 + (-1 \* %argc))"
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | FileCheck %s
+
+; CHECK: Loop %bb: backedge-taken count is (7 + (-1 * %argc))
 
 define i32 @main(i32 %argc, i8** %argv) nounwind {
 entry:
diff --git a/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll b/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll
index 56a8343..7acf90c 100644
--- a/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll
+++ b/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll
@@ -1,5 +1,8 @@
-; RUN: opt < %s -analyze -scalar-evolution \
-; RUN:  | grep "Loop %bb: Unpredictable backedge-taken count\."
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
+
+; CHECK: Loop %bb: backedge-taken count is ((-5 + %x) /u 3)
+; CHECK: Loop %bb: max backedge-taken count is 1431655764
+
 
 ; ScalarEvolution can't compute a trip count because it doesn't know if
 ; dividing by the stride will have a remainder. This could theoretically
diff --git a/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll b/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll
index aaf6770..2b2296a 100644
--- a/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll
+++ b/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll
@@ -1,5 +1,8 @@
-; RUN: opt < %s -analyze -scalar-evolution 2>&1 | grep "/u 3"
-; XFAIL: *
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | FileCheck %s
+
+; CHECK: Loop %bb: backedge-taken count is ((999 + (-1 * %x)) /u 3)
+; CHECK: Loop %bb: max backedge-taken count is 334
+
 
 ; This is a tricky testcase for unsigned wrap detection which ScalarEvolution
 ; doesn't yet know how to do.
diff --git a/test/Analysis/ScalarEvolution/2008-12-08-FiniteSGE.ll b/test/Analysis/ScalarEvolution/2008-12-08-FiniteSGE.ll
index a1b3b71..7a7a640 100644
--- a/test/Analysis/ScalarEvolution/2008-12-08-FiniteSGE.ll
+++ b/test/Analysis/ScalarEvolution/2008-12-08-FiniteSGE.ll
@@ -1,4 +1,6 @@
-; RUN: opt < %s -analyze -scalar-evolution | grep "backedge-taken count is 255"
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
+
+; CHECK: backedge-taken count is 255
 
 define i32 @foo(i32 %x, i32 %y, i32* %lam, i32* %alp) nounwind {
 bb1.thread:
diff --git a/test/Analysis/ScalarEvolution/2008-12-14-StrideAndSigned.ll b/test/Analysis/ScalarEvolution/2008-12-14-StrideAndSigned.ll
index bb14919..95aa1fc 100644
--- a/test/Analysis/ScalarEvolution/2008-12-14-StrideAndSigned.ll
+++ b/test/Analysis/ScalarEvolution/2008-12-14-StrideAndSigned.ll
@@ -1,7 +1,8 @@
-; RUN: opt < %s -analyze -scalar-evolution 2>&1 | \
-; RUN: grep "(((-1 * %i0) + (100005 smax %i0)) /u 5)"
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 |  FileCheck %s
 ; XFAIL: *
 
+; CHECK: (((-1 * %i0) + (100005 smax %i0)) /u 5)
+
 define i32 @foo0(i32 %i0) nounwind {
 entry:
 	br label %bb1
diff --git a/test/Analysis/ScalarEvolution/2008-12-15-DontUseSDiv.ll b/test/Analysis/ScalarEvolution/2008-12-15-DontUseSDiv.ll
index 7000626..70588bc 100644
--- a/test/Analysis/ScalarEvolution/2008-12-15-DontUseSDiv.ll
+++ b/test/Analysis/ScalarEvolution/2008-12-15-DontUseSDiv.ll
@@ -1,5 +1,6 @@
-; RUN: opt < %s -analyze -scalar-evolution 2>&1 | grep "/u 5"
-; XFAIL: *
+; RUN: opt < %s -analyze -scalar-evolution 2>&1 | FileCheck %s
+
+; CHECK: /u 5
 
 define i8 @foo0(i8 %i0) nounwind {
 entry:
diff --git a/test/Analysis/ScalarEvolution/2009-01-02-SignedNegativeStride.ll b/test/Analysis/ScalarEvolution/2009-01-02-SignedNegativeStride.ll
index 82f2608..f19d18c 100644
--- a/test/Analysis/ScalarEvolution/2009-01-02-SignedNegativeStride.ll
+++ b/test/Analysis/ScalarEvolution/2009-01-02-SignedNegativeStride.ll
@@ -1,6 +1,9 @@
-; RUN: opt < %s -analyze -scalar-evolution | not grep "/u -1"
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
 ; PR3275
 
+; CHECK: Printing analysis 'Scalar Evolution Analysis' for function 'func_15'
+; CHECK-NOT: /u -1
+
 @g_16 = external global i16		; <i16*> [#uses=3]
 @.str = external constant [4 x i8]		; <[4 x i8]*> [#uses=0]
 
diff --git a/test/Analysis/ScalarEvolution/2009-04-22-TruncCast.ll b/test/Analysis/ScalarEvolution/2009-04-22-TruncCast.ll
index ebd9f73..3dacfbb 100644
--- a/test/Analysis/ScalarEvolution/2009-04-22-TruncCast.ll
+++ b/test/Analysis/ScalarEvolution/2009-04-22-TruncCast.ll
@@ -1,35 +1,53 @@
 ; RUN: opt < %s -analyze -scalar-evolution | grep "(trunc i" | not grep ext
 
+; CHECK: Printing analysis 'Scalar Evolution Analysis' for function 'test1'
+; CHECK-NOT: (trunc i{{.*}}ext
+
 define i16 @test1(i8 %x) {
   %A = sext i8 %x to i32
   %B = trunc i32 %A to i16
   ret i16 %B
 }
 
+; CHECK: Printing analysis 'Scalar Evolution Analysis' for function 'test2'
+; CHECK-NOT: (trunc i{{.*}}ext
+
 define i8 @test2(i16 %x) {
   %A = sext i16 %x to i32
   %B = trunc i32 %A to i8
   ret i8 %B
 }
 
+; CHECK: Printing analysis 'Scalar Evolution Analysis' for function 'test3'
+; CHECK-NOT: (trunc i{{.*}}ext
+
 define i16 @test3(i16 %x) {
   %A = sext i16 %x to i32
   %B = trunc i32 %A to i16
   ret i16 %B
 }
 
+; CHECK: Printing analysis 'Scalar Evolution Analysis' for function 'test4'
+; CHECK-NOT: (trunc i{{.*}}ext
+
 define i16 @test4(i8 %x) {
   %A = zext i8 %x to i32
   %B = trunc i32 %A to i16
   ret i16 %B
 }
 
+; CHECK: Printing analysis 'Scalar Evolution Analysis' for function 'test5'
+; CHECK-NOT: (trunc i{{.*}}ext
+
 define i8 @test5(i16 %x) {
   %A = zext i16 %x to i32
   %B = trunc i32 %A to i8
   ret i8 %B
 }
 
+; CHECK: Printing analysis 'Scalar Evolution Analysis' for function 'test6'
+; CHECK-NOT: (trunc i{{.*}}ext
+
 define i16 @test6(i16 %x) {
   %A = zext i16 %x to i32
   %B = trunc i32 %A to i16
diff --git a/test/Analysis/ScalarEvolution/2009-05-09-PointerEdgeCount.ll b/test/Analysis/ScalarEvolution/2009-05-09-PointerEdgeCount.ll
index 8a78043..5d1502d 100644
--- a/test/Analysis/ScalarEvolution/2009-05-09-PointerEdgeCount.ll
+++ b/test/Analysis/ScalarEvolution/2009-05-09-PointerEdgeCount.ll
@@ -1,5 +1,8 @@
-; RUN: opt < %s -analyze -scalar-evolution | grep "count is 2"
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
 ; PR3171
+
+; CHECK: count is 2
+
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
 	%struct.Foo = type { i32 }
diff --git a/test/Analysis/ScalarEvolution/and-xor.ll b/test/Analysis/ScalarEvolution/and-xor.ll
index 06f4a85..404ab91 100644
--- a/test/Analysis/ScalarEvolution/and-xor.ll
+++ b/test/Analysis/ScalarEvolution/and-xor.ll
@@ -1,5 +1,8 @@
-; RUN: opt < %s -scalar-evolution -analyze \
-; RUN:   | grep "\-->  (zext" | count 2
+; RUN: opt < %s -scalar-evolution -analyze | FileCheck %s
+
+; CHECK: -->  (zext
+; CHECK: -->  (zext
+; CHECK-NOT: -->  (zext
 
 define i32 @foo(i32 %x) {
   %n = and i32 %x, 255
diff --git a/test/Analysis/ScalarEvolution/avoid-smax-0.ll b/test/Analysis/ScalarEvolution/avoid-smax-0.ll
index 3d15c78..8abb430 100644
--- a/test/Analysis/ScalarEvolution/avoid-smax-0.ll
+++ b/test/Analysis/ScalarEvolution/avoid-smax-0.ll
@@ -1,4 +1,6 @@
-; RUN: opt < %s -scalar-evolution -analyze | grep "Loop %bb3: backedge-taken count is (-1 + %n)"
+; RUN: opt < %s -scalar-evolution -analyze | FileCheck %s
+
+; CHECK: Loop %bb3: backedge-taken count is (-1 + %n)
 
 ; We don't want to use a max in the trip count expression in
 ; this testcase.
diff --git a/test/Analysis/ScalarEvolution/div-overflow.ll b/test/Analysis/ScalarEvolution/div-overflow.ll
index 2846797..aca964a 100644
--- a/test/Analysis/ScalarEvolution/div-overflow.ll
+++ b/test/Analysis/ScalarEvolution/div-overflow.ll
@@ -1,5 +1,6 @@
-; RUN: opt < %s -scalar-evolution -analyze \
-; RUN:  | grep "\-->  ((-128 \* %a) /u -128)"
+; RUN: opt < %s -scalar-evolution -analyze | FileCheck %s
+
+; CHECK: -->  ((-128 * %a) /u -128)
 
 ; Don't let ScalarEvolution fold this div away.
 
diff --git a/test/Analysis/ScalarEvolution/do-loop.ll b/test/Analysis/ScalarEvolution/do-loop.ll
index 6e3295a..e35ea7d 100644
--- a/test/Analysis/ScalarEvolution/do-loop.ll
+++ b/test/Analysis/ScalarEvolution/do-loop.ll
@@ -1,6 +1,8 @@
-; RUN: opt < %s -analyze -scalar-evolution | grep smax
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
 ; PR1614
 
+; CHECK: smax
+
 define i32 @f(i32 %x, i32 %y) {
 entry:
 	br label %bb
diff --git a/test/Analysis/ScalarEvolution/lit.local.cfg b/test/Analysis/ScalarEvolution/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Analysis/ScalarEvolution/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Analysis/ScalarEvolution/max-trip-count-address-space.ll b/test/Analysis/ScalarEvolution/max-trip-count-address-space.ll
new file mode 100644
index 0000000..aa5254c
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/max-trip-count-address-space.ll
@@ -0,0 +1,68 @@
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
+
+; ScalarEvolution should be able to understand the loop and eliminate the casts.
+
+target datalayout = "e-p:32:32:32-p1:16:16:16-p2:8:8:8-p4:64:64:64-n16:32:64"
+
+; CHECK:  {%d,+,4}<%bb>		Exits: ((4 * (trunc i32 (-1 + %n) to i16)) + %d)
+
+
+define void @foo(i32 addrspace(1)* nocapture %d, i32 %n) nounwind {
+; CHECK: @foo
+entry:
+	%0 = icmp sgt i32 %n, 0		; <i1> [#uses=1]
+	br i1 %0, label %bb.nph, label %return
+
+bb.nph:		; preds = %entry
+	br label %bb
+
+bb:		; preds = %bb1, %bb.nph
+	%i.02 = phi i32 [ %5, %bb1 ], [ 0, %bb.nph ]		; <i32> [#uses=2]
+	%p.01 = phi i8 [ %4, %bb1 ], [ -1, %bb.nph ]		; <i8> [#uses=2]
+	%1 = sext i8 %p.01 to i32		; <i32> [#uses=1]
+	%2 = sext i32 %i.02 to i64		; <i64> [#uses=1]
+	%3 = getelementptr i32 addrspace(1)* %d, i64 %2		; <i32*> [#uses=1]
+	store i32 %1, i32 addrspace(1)* %3, align 4
+	%4 = add i8 %p.01, 1		; <i8> [#uses=1]
+	%5 = add i32 %i.02, 1		; <i32> [#uses=2]
+	br label %bb1
+
+bb1:		; preds = %bb
+	%6 = icmp slt i32 %5, %n		; <i1> [#uses=1]
+	br i1 %6, label %bb, label %bb1.return_crit_edge
+
+bb1.return_crit_edge:		; preds = %bb1
+	br label %return
+
+return:		; preds = %bb1.return_crit_edge, %entry
+	ret void
+}
+
+define void @test(i8 addrspace(1)* %a, i32 %n) nounwind {
+; CHECK: @test
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %tmp = zext i32 %n to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %for.body.lr.ph ]
+  %arrayidx = getelementptr i8 addrspace(1)* %a, i64 %indvar
+  store i8 0, i8 addrspace(1)* %arrayidx, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp ne i64 %indvar.next, %tmp
+  br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
+  ret void
+}
+
+; CHECK: Determining loop execution counts for: @test
+; CHECK-NEXT: backedge-taken count is
+; CHECK-NEXT: max backedge-taken count is -1
diff --git a/test/Analysis/ScalarEvolution/nsw.ll b/test/Analysis/ScalarEvolution/nsw.ll
index 659cf4f..05992ea 100644
--- a/test/Analysis/ScalarEvolution/nsw.ll
+++ b/test/Analysis/ScalarEvolution/nsw.ll
@@ -62,11 +62,11 @@ for.body.lr.ph.i.i:                               ; preds = %entry
 for.body.i.i:                                     ; preds = %for.body.i.i, %for.body.lr.ph.i.i
   %__first.addr.02.i.i = phi i32* [ %begin, %for.body.lr.ph.i.i ], [ %ptrincdec.i.i, %for.body.i.i ]
 ; CHECK: %__first.addr.02.i.i
-; CHECK-NEXT: -->  {%begin,+,4}<nw><%for.body.i.i>
+; CHECK-NEXT: -->  {%begin,+,4}<nuw><%for.body.i.i>
   store i32 0, i32* %__first.addr.02.i.i, align 4
   %ptrincdec.i.i = getelementptr inbounds i32* %__first.addr.02.i.i, i64 1
 ; CHECK: %ptrincdec.i.i
-; CHECK-NEXT: -->  {(4 + %begin),+,4}<nw><%for.body.i.i>
+; CHECK-NEXT: -->  {(4 + %begin),+,4}<nuw><%for.body.i.i>
   %cmp.i.i = icmp eq i32* %ptrincdec.i.i, %end
   br i1 %cmp.i.i, label %for.cond.for.end_crit_edge.i.i, label %for.body.i.i
 
@@ -122,3 +122,39 @@ exit:
   %result = phi i32 [ %a, %entry ], [ %tmp2, %greater ]
   ret i32 %result
 }
+
+; TODO: This could fold down to '1'
+; CHECK-LABEL: PR12375
+; CHECK: -->  {(4 + %arg),+,4}<nuw><%bb1>		Exits: (4 + (4 * ((-1 + (-1 * %arg) + ((4 + %arg) umax (8 + %arg)<nsw>)) /u 4)) + %arg)
+define i32 @PR12375(i32* readnone %arg) {
+bb:
+  %tmp = getelementptr inbounds i32* %arg, i64 2
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp2 = phi i32* [ %arg, %bb ], [ %tmp5, %bb1 ]
+  %tmp3 = phi i32 [ 0, %bb ], [ %tmp4, %bb1 ]
+  %tmp4 = add nsw i32 %tmp3, 1
+  %tmp5 = getelementptr inbounds i32* %tmp2, i64 1
+  %tmp6 = icmp ult i32* %tmp5, %tmp
+  br i1 %tmp6, label %bb1, label %bb7
+
+bb7:                                              ; preds = %bb1
+  ret i32 %tmp4
+}
+
+; CHECK-LABEL: PR12376
+; CHECK: -->  {(4 + %arg),+,4}<nuw><%bb2>		Exits: (4 + (4 * ((3 + (-1 * %arg) + (%arg umax %arg1)) /u 4)) + %arg)
+define void @PR12376(i32* nocapture %arg, i32* nocapture %arg1)  {
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb
+  %tmp = phi i32* [ %arg, %bb ], [ %tmp4, %bb2 ]
+  %tmp3 = icmp ult i32* %tmp, %arg1
+  %tmp4 = getelementptr inbounds i32* %tmp, i64 1
+  br i1 %tmp3, label %bb2, label %bb5
+
+bb5:                                              ; preds = %bb2
+  ret void
+}
diff --git a/test/Analysis/ScalarEvolution/trip-count11.ll b/test/Analysis/ScalarEvolution/trip-count11.ll
index 7191503..e14af08 100644
--- a/test/Analysis/ScalarEvolution/trip-count11.ll
+++ b/test/Analysis/ScalarEvolution/trip-count11.ll
@@ -1,9 +1,11 @@
 ; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
 
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 @foo.a = internal constant [8 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7], align 16
+@foo.a_as1 = internal addrspace(1) constant [8 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7], align 16
+
 
 define i32 @foo() nounwind uwtable noinline {
 entry:
@@ -27,3 +29,27 @@ for.inc:                                          ; preds = %for.cond
 for.end:                                          ; preds = %for.cond
   ret i32 %sum.0
 }
+
+define i32 @foo_as1() nounwind uwtable noinline {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %sum.0 = phi i32 [ 0, %entry ], [ %add, %for.inc ]
+; CHECK: --> %sum.0 Exits: 28
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp ult i32 %i.0, 8
+  br i1 %cmp, label %for.inc, label %for.end
+
+for.inc:                                          ; preds = %for.cond
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds [8 x i32] addrspace(1)* @foo.a_as1, i64 0, i64 %idxprom
+  %0 = load i32 addrspace(1)* %arrayidx, align 4
+  %add = add nsw i32 %sum.0, %0
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret i32 %sum.0
+}
+
diff --git a/test/Analysis/ScalarEvolution/trip-count3.ll b/test/Analysis/ScalarEvolution/trip-count3.ll
index 32c51bf..850e035 100644
--- a/test/Analysis/ScalarEvolution/trip-count3.ll
+++ b/test/Analysis/ScalarEvolution/trip-count3.ll
@@ -4,7 +4,8 @@
 ; dividing by the stride will have a remainder. This could theoretically
 ; be teaching it how to use a more elaborate trip count computation.
 
-; CHECK: Loop %bb3.i: Unpredictable backedge-taken count.
+; CHECK: Loop %bb3.i: backedge-taken count is ((64 + (-64 smax (-1 + (-1 * %0))) + %0) /u 64)
+; CHECK: Loop %bb3.i: max backedge-taken count is 33554431
 
 %struct.FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct.FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
 %struct.SHA_INFO = type { [5 x i32], i32, i32, [16 x i32] }
diff --git a/test/Analysis/ScalarEvolution/trip-count9.ll b/test/Analysis/ScalarEvolution/trip-count9.ll
index 9180f2b..9a080b3 100644
--- a/test/Analysis/ScalarEvolution/trip-count9.ll
+++ b/test/Analysis/ScalarEvolution/trip-count9.ll
@@ -25,8 +25,8 @@ exit:
 }
 
 ; CHECK: Determining loop execution counts for: @step2
-; CHECK: Loop %loop: Unpredictable backedge-taken count. 
-; CHECK: Loop %loop: Unpredictable max backedge-taken count. 
+; CHECK: Loop %loop: Unpredictable backedge-taken count.
+; CHECK: Loop %loop: Unpredictable max backedge-taken count.
 define void @step2(i4 %n) {
 entry:
   %s = icmp sgt i4 %n, 0
@@ -57,8 +57,8 @@ exit:
 }
 
 ; CHECK: Determining loop execution counts for: @start1_step2
-; CHECK: Loop %loop: Unpredictable backedge-taken count. 
-; CHECK: Loop %loop: Unpredictable max backedge-taken count. 
+; CHECK: Loop %loop: Unpredictable backedge-taken count.
+; CHECK: Loop %loop: Unpredictable max backedge-taken count.
 define void @start1_step2(i4 %n) {
 entry:
   %s = icmp sgt i4 %n, 0
@@ -89,8 +89,8 @@ exit:
 }
 
 ; CHECK: Determining loop execution counts for: @startx_step2
-; CHECK: Loop %loop: Unpredictable backedge-taken count. 
-; CHECK: Loop %loop: Unpredictable max backedge-taken count. 
+; CHECK: Loop %loop: Unpredictable backedge-taken count.
+; CHECK: Loop %loop: Unpredictable max backedge-taken count.
 define void @startx_step2(i4 %n, i4 %x) {
 entry:
   %s = icmp sgt i4 %n, 0
@@ -120,12 +120,18 @@ exit:
   ret void
 }
 
-; Be careful with this one. If %n is INT4_MAX, %i.next will wrap. The nsw bit
-; says that the result is undefined, but ScalarEvolution must respect that
-; subsequent passes may result the undefined behavior in predictable ways.
+; If %n is INT4_MAX, %i.next will wrap. The nsw bit says that the
+; result is undefined. Therefore, after the loop's second iteration,
+; we are free to assume that the loop exits. This is valid because:
+; (a) %i.next is a poison value after the second iteration, which can
+; also be considered an undef value.
+; (b) the return instruction enacts a side effect that is control
+; dependent on the poison value.
+;
+; CHECK-LABEL: nsw_step2
 ; CHECK: Determining loop execution counts for: @nsw_step2
-; CHECK: Loop %loop: Unpredictable backedge-taken count. 
-; CHECK: Loop %loop: Unpredictable max backedge-taken count. 
+; CHECK: Loop %loop: backedge-taken count is ((-1 + %n) /u 2)
+; CHECK: Loop %loop: max backedge-taken count is 2
 define void @nsw_step2(i4 %n) {
 entry:
   %s = icmp sgt i4 %n, 0
@@ -139,6 +145,7 @@ exit:
   ret void
 }
 
+; CHECK-LABEL: nsw_start1
 ; CHECK: Determining loop execution counts for: @nsw_start1
 ; CHECK: Loop %loop: backedge-taken count is (-2 + (2 smax %n))
 ; CHECK: Loop %loop: max backedge-taken count is 5
@@ -156,8 +163,8 @@ exit:
 }
 
 ; CHECK: Determining loop execution counts for: @nsw_start1_step2
-; CHECK: Loop %loop: Unpredictable backedge-taken count. 
-; CHECK: Loop %loop: Unpredictable max backedge-taken count. 
+; CHECK: Loop %loop: backedge-taken count is ((-2 + (3 smax %n)) /u 2)
+; CHECK: Loop %loop: max backedge-taken count is 2
 define void @nsw_start1_step2(i4 %n) {
 entry:
   %s = icmp sgt i4 %n, 0
@@ -188,8 +195,8 @@ exit:
 }
 
 ; CHECK: Determining loop execution counts for: @nsw_startx_step2
-; CHECK: Loop %loop: Unpredictable backedge-taken count. 
-; CHECK: Loop %loop: Unpredictable max backedge-taken count. 
+; CHECK: Loop %loop: backedge-taken count is ((-1 + (-1 * %x) + ((2 + %x) smax %n)) /u 2)
+; CHECK: Loop %loop: max backedge-taken count is 7
 define void @nsw_startx_step2(i4 %n, i4 %x) {
 entry:
   %s = icmp sgt i4 %n, 0
@@ -221,7 +228,7 @@ exit:
 }
 
 ; CHECK: Determining loop execution counts for: @even_step2
-; CHECK: Loop %loop: Unpredictable backedge-taken count. 
+; CHECK: Loop %loop: backedge-taken count is ((-1 + (2 * %n)) /u 2)
 ; CHECK: Loop %loop: max backedge-taken count is 2
 define void @even_step2(i4 %n) {
 entry:
@@ -255,7 +262,7 @@ exit:
 }
 
 ; CHECK: Determining loop execution counts for: @even_start1_step2
-; CHECK: Loop %loop: Unpredictable backedge-taken count. 
+; CHECK: Loop %loop: backedge-taken count is ((-2 + (3 smax (2 * %n))) /u 2)
 ; CHECK: Loop %loop: max backedge-taken count is 2
 define void @even_start1_step2(i4 %n) {
 entry:
@@ -273,7 +280,7 @@ exit:
 
 ; CHECK: Determining loop execution counts for: @even_startx
 ; CHECK: Loop %loop: backedge-taken count is (-1 + (-1 * %x) + ((1 + %x) smax (2 * %n)))
-; CHECK: Loop %loop: max backedge-taken count is -1
+; CHECK: Loop %loop: max backedge-taken count is -2
 define void @even_startx(i4 %n, i4 %x) {
 entry:
   %m = shl i4 %n, 1
@@ -289,7 +296,7 @@ exit:
 }
 
 ; CHECK: Determining loop execution counts for: @even_startx_step2
-; CHECK: Loop %loop: Unpredictable backedge-taken count. 
+; CHECK: Loop %loop: backedge-taken count is ((-1 + (-1 * %x) + ((2 + %x) smax (2 * %n))) /u 2)
 ; CHECK: Loop %loop: max backedge-taken count is 7
 define void @even_startx_step2(i4 %n, i4 %x) {
 entry:
@@ -375,7 +382,7 @@ exit:
 
 ; CHECK: Determining loop execution counts for: @even_nsw_startx
 ; CHECK: Loop %loop: backedge-taken count is (-1 + (-1 * %x) + ((1 + %x) smax (2 * %n)))
-; CHECK: Loop %loop: max backedge-taken count is -1
+; CHECK: Loop %loop: max backedge-taken count is -2
 define void @even_nsw_startx(i4 %n, i4 %x) {
 entry:
   %m = shl i4 %n, 1
diff --git a/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll b/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
new file mode 100644
index 0000000..9051139
--- /dev/null
+++ b/test/Analysis/TypeBasedAliasAnalysis/PR17620.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -tbaa -gvn -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+%structA = type { %structB }
+%structB = type { i32*, %classT }
+%classT = type { %classO, %classJ*, i8 }
+%classO = type { i32 }
+%classJ = type { i8 }
+%classA = type { %classB }
+%classB = type { i8 }
+%classC = type { %classD, %structA }
+%classD = type { %structA* }
+
+; Function Attrs: ssp uwtable
+define %structA** @test(%classA* %this, i32** %p1) #0 align 2 {
+entry:
+; CHECK-LABEL: @test
+; CHECK: load i32** %p1, align 8, !tbaa
+; CHECK: load i32** getelementptr (%classC* null, i32 0, i32 1, i32 0, i32 0), align 8, !tbaa
+; CHECK: call void @callee
+  %0 = load i32** %p1, align 8, !tbaa !1
+  %1 = load i32** getelementptr (%classC* null, i32 0, i32 1, i32 0, i32 0), align 8, !tbaa !5
+  call void @callee(i32* %0, i32* %1)
+  unreachable
+}
+
+declare void @callee(i32*, i32*) #1
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.4"}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"any pointer", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !2, i64 8}
+!6 = metadata !{metadata !"_ZTSN12_GLOBAL__N_11RINS_1FIPi8TreeIterN1I1S1LENS_1KINS_1DIKS2_S3_EEEEE1GEPSD_EE", metadata !7, i64 8}
+!7 = metadata !{metadata !"_ZTSN12_GLOBAL__N_11FIPi8TreeIterN1I1S1LENS_1KINS_1DIKS1_S2_EEEEE1GE", metadata !8, i64 0}
+!8 = metadata !{metadata !"_ZTSN12_GLOBAL__N_11DIKPi8TreeIterEE", metadata !2, i64 0, metadata !9, i64 8}
+!9 = metadata !{metadata !"_ZTS8TreeIter", metadata !2, i64 8, metadata !10, i64 16}
+!10 = metadata !{metadata !"bool", metadata !3, i64 0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/aliastest.ll b/test/Analysis/TypeBasedAliasAnalysis/aliastest.ll
index d59e392..76a88c8 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/aliastest.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/aliastest.ll
@@ -47,16 +47,21 @@ define i8 @test1_no(i8* %a, i8* %b) nounwind {
 ; Root note.
 !0 = metadata !{ }
 ; Some type.
-!1 = metadata !{ metadata !"foo", metadata !0 }
+!1 = metadata !{metadata !7, metadata !7, i64 0}
 ; Some other non-aliasing type.
-!2 = metadata !{ metadata !"bar", metadata !0 }
+!2 = metadata !{metadata !8, metadata !8, i64 0}
 
 ; Some type.
-!3 = metadata !{ metadata !"foo", metadata !0 }
+!3 = metadata !{metadata !9, metadata !9, i64 0}
 ; Some type in a different type system.
-!4 = metadata !{ metadata !"bar", metadata !"different" }
+!4 = metadata !{metadata !10, metadata !10, i64 0}
 
 ; Invariant memory.
-!5 = metadata !{ metadata !"qux", metadata !0, i1 1 }
+!5 = metadata !{metadata !11, metadata !11, i64 0, i1 1}
 ; Not invariant memory.
-!6 = metadata !{ metadata !"qux", metadata !0, i1 0 }
+!6 = metadata !{metadata !11, metadata !11, i64 0, i1 0}
+!7 = metadata !{ metadata !"foo", metadata !0 }
+!8 = metadata !{ metadata !"bar", metadata !0 }
+!9 = metadata !{ metadata !"foo", metadata !0 }
+!10 = metadata !{ metadata !"bar", metadata !"different" }
+!11 = metadata !{ metadata !"qux", metadata !0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/argument-promotion.ll b/test/Analysis/TypeBasedAliasAnalysis/argument-promotion.ll
index bb66e37..14bbeac 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/argument-promotion.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/argument-promotion.ll
@@ -33,5 +33,7 @@ define i32 @callercaller(i32* %Q) {
 }
 
 !0 = metadata !{metadata !"test"}
-!1 = metadata !{metadata !"green", metadata !0}
-!2 = metadata !{metadata !"blue", metadata !0}
+!1 = metadata !{metadata !3, metadata !3, i64 0}
+!2 = metadata !{metadata !4, metadata !4, i64 0}
+!3 = metadata !{metadata !"green", metadata !0}
+!4 = metadata !{metadata !"blue", metadata !0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/dse.ll b/test/Analysis/TypeBasedAliasAnalysis/dse.ll
index 6b44eb6..bcf1f2c 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/dse.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/dse.ll
@@ -51,16 +51,21 @@ define i8 @test1_no(i8* %a, i8* %b) nounwind {
 ; Root note.
 !0 = metadata !{ }
 ; Some type.
-!1 = metadata !{ metadata !"foo", metadata !0 }
+!1 = metadata !{metadata !7, metadata !7, i64 0}
 ; Some other non-aliasing type.
-!2 = metadata !{ metadata !"bar", metadata !0 }
+!2 = metadata !{metadata !8, metadata !8, i64 0}
 
 ; Some type.
-!3 = metadata !{ metadata !"foo", metadata !0 }
+!3 = metadata !{metadata !9, metadata !9, i64 0}
 ; Some type in a different type system.
-!4 = metadata !{ metadata !"bar", metadata !"different" }
+!4 = metadata !{metadata !10, metadata !10, i64 0}
 
 ; Invariant memory.
-!5 = metadata !{ metadata !"qux", metadata !0, i1 1 }
+!5 = metadata !{metadata !11, metadata !11, i64 0, i1 1}
 ; Not invariant memory.
-!6 = metadata !{ metadata !"qux", metadata !0, i1 0 }
+!6 = metadata !{metadata !11, metadata !11, i64 0, i1 0}
+!7 = metadata !{ metadata !"foo", metadata !0 }
+!8 = metadata !{ metadata !"bar", metadata !0 }
+!9 = metadata !{ metadata !"foo", metadata !0 }
+!10 = metadata !{ metadata !"bar", metadata !"different" }
+!11 = metadata !{ metadata !"qux", metadata !0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll b/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll
index 52e394b..4dc4073 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/dynamic-indices.ll
@@ -13,7 +13,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ; CHECK: for.end:
 ; CHECK:   %arrayidx31 = getelementptr inbounds %union.vector_t* %t, i64 0, i32 0, i64 1
-; CHECK:   %tmp32 = load i64* %arrayidx31, align 8, !tbaa !3
+; CHECK:   %tmp32 = load i64* %arrayidx31, align 8, !tbaa [[TAG:!.*]]
 
 define void @vrlh(%union.vector_t* %va, %union.vector_t* %vb, %union.vector_t* %vd) nounwind {
 entry:
@@ -123,9 +123,15 @@ for.end:                                          ; preds = %for.body
   ret float %tmp10
 }
 
-!0 = metadata !{metadata !"short", metadata !1}
+; CHECK: [[TAG]] = metadata !{metadata [[TYPE_LL:!.*]], metadata [[TYPE_LL]], i64 0}
+; CHECK: [[TYPE_LL]] = metadata !{metadata !"long long", metadata {{!.*}}}
+!0 = metadata !{metadata !6, metadata !6, i64 0}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"long long", metadata !1}
-!4 = metadata !{metadata !"int", metadata !1}
-!5 = metadata !{metadata !"float", metadata !1}
+!3 = metadata !{metadata !7, metadata !7, i64 0}
+!4 = metadata !{metadata !8, metadata !8, i64 0}
+!5 = metadata !{metadata !9, metadata !9, i64 0}
+!6 = metadata !{metadata !"short", metadata !1}
+!7 = metadata !{metadata !"long long", metadata !1}
+!8 = metadata !{metadata !"int", metadata !1}
+!9 = metadata !{metadata !"float", metadata !1}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll b/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll
index 0a30b30..e9fb941 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll
@@ -80,6 +80,7 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) nounwind
 !0 = metadata !{ }
 
 ; Invariant memory.
-!1 = metadata !{ metadata !"foo", metadata !0, i1 1 }
+!1 = metadata !{metadata !3, metadata !3, i64 0, i1 1 }
 ; Not invariant memory.
-!2 = metadata !{ metadata !"foo", metadata !0, i1 0 }
+!2 = metadata !{metadata !3, metadata !3, i64 0, i1 0 }
+!3 = metadata !{ metadata !"foo", metadata !0 }
diff --git a/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll b/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll
index eceaa2c..90e1abb 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/gvn-nonlocal-type-mismatch.ll
@@ -85,7 +85,11 @@ if.else:
 }
 
 !0 = metadata !{}
-!1 = metadata !{metadata !"red", metadata !0}
-!2 = metadata !{metadata !"blu", metadata !0}
-!3 = metadata !{metadata !"outer space"}
-!4 = metadata !{metadata !"brick red", metadata !1}
+!1 = metadata !{metadata !5, metadata !5, i64 0}
+!2 = metadata !{metadata !6, metadata !6, i64 0}
+!3 = metadata !{metadata !7, metadata !7, i64 0}
+!4 = metadata !{metadata !8, metadata !8, i64 0}
+!5 = metadata !{metadata !"red", metadata !0}
+!6 = metadata !{metadata !"blu", metadata !0}
+!7 = metadata !{metadata !"outer space"}
+!8 = metadata !{metadata !"brick red", metadata !5}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
index 6f1c22d..93b8e50 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
@@ -26,5 +26,7 @@ declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
 ; CHECK: attributes [[NUW]] = { nounwind }
 
 !0 = metadata !{metadata !"tbaa root", null}
-!1 = metadata !{metadata !"A", metadata !0}
-!2 = metadata !{metadata !"B", metadata !0}
+!1 = metadata !{metadata !3, metadata !3, i64 0}
+!2 = metadata !{metadata !4, metadata !4, i64 0}
+!3 = metadata !{metadata !"A", metadata !0}
+!4 = metadata !{metadata !"B", metadata !0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/licm.ll b/test/Analysis/TypeBasedAliasAnalysis/licm.ll
index 12a9c1d..e45fc85 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/licm.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/licm.ll
@@ -30,8 +30,8 @@ for.end:                                          ; preds = %for.body, %entry
 }
 
 !0 = metadata !{metadata !"root", null}
-!1 = metadata !{metadata !"pointer", metadata !0}
-!2 = metadata !{metadata !"double", metadata !0}
+!1 = metadata !{metadata !6, metadata !6, i64 0}
+!2 = metadata !{metadata !7, metadata !7, i64 0}
 
 ; LICM shouldn't hoist anything here.
 
@@ -56,6 +56,10 @@ loop:
   br label %loop
 }
 
-!3 = metadata !{metadata !"pointer", metadata !4}
-!4 = metadata !{metadata !"char", metadata !5}
-!5 = metadata !{metadata !"root", null}
+!3 = metadata !{metadata !"pointer", metadata !8}
+!4 = metadata !{metadata !8, metadata !8, i64 0}
+!5 = metadata !{metadata !9, metadata !9, i64 0}
+!6 = metadata !{metadata !"pointer", metadata !0}
+!7 = metadata !{metadata !"double", metadata !0}
+!8 = metadata !{metadata !"char", metadata !9}
+!9 = metadata !{metadata !"root", null}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/lit.local.cfg b/test/Analysis/TypeBasedAliasAnalysis/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Analysis/TypeBasedAliasAnalysis/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll b/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll
index c2407df..6fd6eac 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/memcpyopt.ll
@@ -7,7 +7,7 @@ target datalayout = "e-p:64:64:64"
 
 ; CHECK: @foo
 ; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p, i8* %q, i64 16, i32 1, i1 false), !tbaa !0
-; CHECK-NEXT: store i8 2, i8* %s, align 1, !tbaa !2
+; CHECK-NEXT: store i8 2, i8* %s, align 1, !tbaa [[TAGA:!.*]]
 ; CHECK-NEXT: ret void
 define void @foo(i8* nocapture %p, i8* nocapture %q, i8* nocapture %s) nounwind {
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %p, i8* %q, i64 16, i32 1, i1 false), !tbaa !2
@@ -18,6 +18,10 @@ define void @foo(i8* nocapture %p, i8* nocapture %q, i8* nocapture %s) nounwind
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
 
+; CHECK [[TAGA]] = metadata !{metadata [[TYPEA:!.*]], metadata [[TYPEA]], i64 0}
+; CHECK [[TYPEA]] = metadata !{metadata !"A", metadata !{{.*}}}
 !0 = metadata !{metadata !"tbaa root", null}
-!1 = metadata !{metadata !"A", metadata !0}
-!2 = metadata !{metadata !"B", metadata !0}
+!1 = metadata !{metadata !3, metadata !3, i64 0}
+!2 = metadata !{metadata !4, metadata !4, i64 0}
+!3 = metadata !{metadata !"A", metadata !0}
+!4 = metadata !{metadata !"B", metadata !0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll b/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll
index f1edb44..609e87c 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/placement-tbaa.ll
@@ -18,7 +18,7 @@
 
 ; Basic AA says MayAlias, TBAA says NoAlias
 ; CHECK: MayAlias: i64* %i5, i8** %p
-; CHECK: NoAlias: store i64 %conv, i64* %i5, align 8, !tbaa !4 <->   store i8* null, i8** %p, align 8, !tbaa !3
+; CHECK: NoAlias: store i64 %conv, i64* %i5, align 8, !tbaa !6 <->   store i8* null, i8** %p, align 8, !tbaa !9
 
 %struct.Foo = type { i64 }
 %struct.Bar = type { i8* }
@@ -32,10 +32,10 @@ entry:
   store i32 %n, i32* %n.addr, align 4, !tbaa !0
   %call = call noalias i8* @_Znwm(i64 8)
   %0 = bitcast i8* %call to %struct.Foo*
-  store %struct.Foo* %0, %struct.Foo** %f, align 8, !tbaa !3
-  %1 = load %struct.Foo** %f, align 8, !tbaa !3
+  store %struct.Foo* %0, %struct.Foo** %f, align 8, !tbaa !4
+  %1 = load %struct.Foo** %f, align 8, !tbaa !4
   %i = getelementptr inbounds %struct.Foo* %1, i32 0, i32 0
-  store i64 1, i64* %i, align 8, !tbaa !4
+  store i64 1, i64* %i, align 8, !tbaa !6
   store i32 0, i32* %i1, align 4, !tbaa !0
   br label %for.cond
 
@@ -46,7 +46,7 @@ for.cond:
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:
-  %4 = load %struct.Foo** %f, align 8, !tbaa !3
+  %4 = load %struct.Foo** %f, align 8, !tbaa !4
   %5 = bitcast %struct.Foo* %4 to i8*
   %new.isnull = icmp eq i8* %5, null
   br i1 %new.isnull, label %new.cont, label %new.notnull
@@ -57,11 +57,11 @@ new.notnull:
 
 new.cont:
   %7 = phi %struct.Bar* [ %6, %new.notnull ], [ null, %for.body ]
-  store %struct.Bar* %7, %struct.Bar** %b, align 8, !tbaa !3
-  %8 = load %struct.Bar** %b, align 8, !tbaa !3
+  store %struct.Bar* %7, %struct.Bar** %b, align 8, !tbaa !4
+  %8 = load %struct.Bar** %b, align 8, !tbaa !4
   %p = getelementptr inbounds %struct.Bar* %8, i32 0, i32 0
-  store i8* null, i8** %p, align 8, !tbaa !3
-  %9 = load %struct.Foo** %f, align 8, !tbaa !3
+  store i8* null, i8** %p, align 8, !tbaa !9
+  %9 = load %struct.Foo** %f, align 8, !tbaa !4
   %10 = bitcast %struct.Foo* %9 to i8*
   %new.isnull2 = icmp eq i8* %10, null
   br i1 %new.isnull2, label %new.cont4, label %new.notnull3
@@ -72,12 +72,12 @@ new.notnull3:
 
 new.cont4:
   %12 = phi %struct.Foo* [ %11, %new.notnull3 ], [ null, %new.cont ]
-  store %struct.Foo* %12, %struct.Foo** %f, align 8, !tbaa !3
+  store %struct.Foo* %12, %struct.Foo** %f, align 8, !tbaa !4
   %13 = load i32* %i1, align 4, !tbaa !0
   %conv = sext i32 %13 to i64
-  %14 = load %struct.Foo** %f, align 8, !tbaa !3
+  %14 = load %struct.Foo** %f, align 8, !tbaa !4
   %i5 = getelementptr inbounds %struct.Foo* %14, i32 0, i32 0
-  store i64 %conv, i64* %i5, align 8, !tbaa !4
+  store i64 %conv, i64* %i5, align 8, !tbaa !6
   br label %for.inc
 
 for.inc:
@@ -87,9 +87,9 @@ for.inc:
   br label %for.cond
 
 for.end:
-  %16 = load %struct.Foo** %f, align 8, !tbaa !3
+  %16 = load %struct.Foo** %f, align 8, !tbaa !4
   %i6 = getelementptr inbounds %struct.Foo* %16, i32 0, i32 0
-  %17 = load i64* %i6, align 8, !tbaa !4
+  %17 = load i64* %i6, align 8, !tbaa !6
   ret i64 %17
 }
 
@@ -97,8 +97,14 @@ declare noalias i8* @_Znwm(i64)
 
 attributes #0 = { nounwind }
 
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
-!4 = metadata !{metadata !"long", metadata !1}
+!0 = metadata !{metadata !1, metadata !1, i64 0}
+!1 = metadata !{metadata !"int", metadata !2, i64 0}
+!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
+!3 = metadata !{metadata !"Simple C/C++ TBAA"}
+!4 = metadata !{metadata !5, metadata !5, i64 0}
+!5 = metadata !{metadata !"any pointer", metadata !2, i64 0}
+!6 = metadata !{metadata !7, metadata !8, i64 0}
+!7 = metadata !{metadata !"_ZTS3Foo", metadata !8, i64 0}
+!8 = metadata !{metadata !"long", metadata !2, i64 0}
+!9 = metadata !{metadata !10, metadata !5, i64 0}
+!10 = metadata !{metadata !"_ZTS3Bar", metadata !5, i64 0}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/precedence.ll b/test/Analysis/TypeBasedAliasAnalysis/precedence.ll
index 47cb5f2..b219ef1 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/precedence.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/precedence.ll
@@ -39,8 +39,12 @@ entry:
   ret i64 %tmp3
 }
 
-!0 = metadata !{metadata !"int", metadata !1}
+!0 = metadata !{metadata !2, metadata !2, i64 0}
 !1 = metadata !{metadata !"simple"}
-!3 = metadata !{metadata !"float", metadata !1}
-!4 = metadata !{metadata !"long", metadata !1}
-!5 = metadata !{metadata !"small", metadata !1}
+!2 = metadata !{metadata !"int", metadata !1}
+!3 = metadata !{metadata !6, metadata !6, i64 0}
+!4 = metadata !{metadata !7, metadata !7, i64 0}
+!5 = metadata !{metadata !8, metadata !8, i64 0}
+!6 = metadata !{metadata !"float", metadata !1}
+!7 = metadata !{metadata !"long", metadata !1}
+!8 = metadata !{metadata !"small", metadata !1}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/sink.ll b/test/Analysis/TypeBasedAliasAnalysis/sink.ll
index fd32d6a..726da6c 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/sink.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/sink.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -tbaa -sink -S < %s | FileCheck %s
 
 ; CHECK: a:
-; CHECK:   %f = load float* %p, !tbaa !2
+; CHECK:   %f = load float* %p, !tbaa [[TAGA:!.*]]
 ; CHECK:   store float %f, float* %q
 
 define void @foo(float* %p, i1 %c, float* %q, float* %r) {
@@ -15,6 +15,10 @@ b:
   ret void
 }
 
-!0 = metadata !{metadata !"A", metadata !2}
-!1 = metadata !{metadata !"B", metadata !2}
+; CHECK: [[TAGA]] = metadata !{metadata [[TYPEA:!.*]], metadata [[TYPEA]], i64 0}
+; CHECK: [[TYPEA]] = metadata !{metadata !"A", metadata !{{.*}}}
+!0 = metadata !{metadata !3, metadata !3, i64 0}
+!1 = metadata !{metadata !4, metadata !4, i64 0}
 !2 = metadata !{metadata !"test"}
+!3 = metadata !{metadata !"A", metadata !2}
+!4 = metadata !{metadata !"B", metadata !2}
diff --git a/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll b/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
index ee52763..0cd5c30 100644
--- a/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
+++ b/test/Analysis/TypeBasedAliasAnalysis/tbaa-path.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -tbaa -basicaa -struct-path-tbaa -aa-eval -evaluate-tbaa -print-no-aliases -print-may-aliases -disable-output 2>&1 | FileCheck %s
-; RUN: opt < %s -tbaa -basicaa -struct-path-tbaa -gvn -S | FileCheck %s --check-prefix=OPT
+; RUN: opt < %s -tbaa -basicaa -aa-eval -evaluate-tbaa -print-no-aliases -print-may-aliases -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -tbaa -basicaa -gvn -S | FileCheck %s --check-prefix=OPT
 ; Generated from clang/test/CodeGen/tbaa.cpp with "-O1 -struct-path-tbaa -disable-llvm-optzns".
 
 %struct.StructA = type { i16, i32, i16, i32 }
diff --git a/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll b/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll
index a0d77fa..17dd745 100644
--- a/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll
+++ b/test/Assembler/2010-02-05-FunctionLocalMetadataBecomesNull.ll
@@ -25,9 +25,9 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !7 = metadata !{metadata !1}
 !6 = metadata !{i32 786449, metadata !8, i32 12, metadata !"clang version 3.0 (trunk 131941)", i1 true, metadata !"", i32 0, metadata !9, metadata !9, metadata !7, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !0 = metadata !{i32 786688, metadata !1, metadata !"c", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!1 = metadata !{i32 786478, metadata !8, metadata !2, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 ()* @main, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !8, metadata !2, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 786473, metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !8, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !8, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, null, metadata !6, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !"/d/j/debug-test.c", metadata !"/Volumes/Data/b"}
diff --git a/test/Assembler/ConstantExprFoldCast.ll b/test/Assembler/ConstantExprFoldCast.ll
index 0ce6e84..161a4ca 100644
--- a/test/Assembler/ConstantExprFoldCast.ll
+++ b/test/Assembler/ConstantExprFoldCast.ll
@@ -12,3 +12,5 @@
 @F = global i32* inttoptr (i32 add (i32 5, i32 -5) to i32*)
 @G = global i32* inttoptr (i32 sub (i32 5, i32 5) to i32*)
 
+; Address space cast AS0 null-> AS1 null
+@H = global i32 addrspace(1)* addrspacecast(i32* null to i32 addrspace(1)*)
diff --git a/test/Assembler/ConstantExprNoFold.ll b/test/Assembler/ConstantExprNoFold.ll
index 83e8909..b41959f 100644
--- a/test/Assembler/ConstantExprNoFold.ll
+++ b/test/Assembler/ConstantExprNoFold.ll
@@ -21,3 +21,6 @@ target datalayout = "p:32:32"
 
 ; CHECK: @D = global i1 icmp eq (i64* getelementptr inbounds (i64* @A, i64 1), i64* getelementptr inbounds (i64* @B, i64 2))
 @D = global i1 icmp eq (i64* getelementptr inbounds (i64* @A, i64 1), i64* getelementptr inbounds (i64* @B, i64 2))
+
+; CHECK: @E = global i64 addrspace(1)* addrspacecast (i64* @A to i64 addrspace(1)*)
+@E = global i64 addrspace(1)* addrspacecast(i64* @A to i64 addrspace(1)*)
diff --git a/test/Assembler/attribute-builtin.ll b/test/Assembler/attribute-builtin.ll
index a7799f0..01c8a6b 100644
--- a/test/Assembler/attribute-builtin.ll
+++ b/test/Assembler/attribute-builtin.ll
@@ -8,7 +8,7 @@
 ; RUN: llvm-dis | \
 ; RUN: llvm-as -disable-verify | \
 ; RUN: llvm-dis | \
-; RUN: FileCheck -check-prefix=ASSEMBLES %s
+; RUN: FileCheck -check-prefix=CHECK-ASSEMBLES %s
 
 ; CHECK-ASSEMBLES: declare i8* @foo(i8*) [[NOBUILTIN:#[0-9]+]]
 ; CHECK-ASSEMBLES: call i8* @foo(i8* %x) [[BUILTIN:#[0-9]+]]
@@ -26,10 +26,8 @@ define i8* @bar(i8* %x) {
 ; which do not have nobuiltin on them.
 ; rdar://13727199
 
-; RUN: not llvm-as <%s 2>&1  | FileCheck -check-prefix=BAD %s
+; RUN: not llvm-as <%s 2>&1  | FileCheck -check-prefix=CHECK-BAD %s
 
-; CHECK-BAD: Attribute 'builtin' can only be used in a call to a function with the 'nobuiltin' attribute.
-; CHECK-BAD-NEXT: %y = call i8* @lar(i8* %x) #1
 ; CHECK-BAD: Attribute 'builtin' can only be applied to a callsite.
 ; CHECK-BAD-NEXT: i8* (i8*)* @car
 ; CHECK-BAD: Attribute 'builtin' can only be applied to a callsite.
diff --git a/test/Assembler/auto_upgrade_intrinsics.ll b/test/Assembler/auto_upgrade_intrinsics.ll
index 7ad5cc3..8f655ce 100644
--- a/test/Assembler/auto_upgrade_intrinsics.ll
+++ b/test/Assembler/auto_upgrade_intrinsics.ll
@@ -6,6 +6,10 @@ declare i16 @llvm.ctlz.i16(i16)
 declare i32 @llvm.ctlz.i32(i32)
 declare i42 @llvm.ctlz.i42(i42)  ; Not a power-of-2
 
+
+declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
+
+
 define void @test.ctlz(i8 %a, i16 %b, i32 %c, i42 %d) {
 ; CHECK: @test.ctlz
 
@@ -42,3 +46,14 @@ entry:
 
   ret void
 }
+
+
+@a = private global [60 x i8] zeroinitializer, align 1
+
+define i32 @test.objectsize() {
+; CHECK-LABEL: @test.objectsize(
+; CHECK: @llvm.objectsize.i32.p0i8
+; CHECK-DAG: declare i32 @llvm.objectsize.i32.p0i8
+  %s = call i32 @llvm.objectsize.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
+  ret i32 %s
+}
diff --git a/test/Assembler/functionlocal-metadata.ll b/test/Assembler/functionlocal-metadata.ll
index 0f0ab4c..0d93bfd 100644
--- a/test/Assembler/functionlocal-metadata.ll
+++ b/test/Assembler/functionlocal-metadata.ll
@@ -3,7 +3,7 @@
 define void @Foo(i32 %a, i32 %b) {
 entry:
   call void @llvm.dbg.value(metadata !{ i32* %1 }, i64 16, metadata !2)
-; CHECK: call void @llvm.dbg.value(metadata !{i32* %1}, i64 16, metadata !2)
+; CHECK: call void @llvm.dbg.value(metadata !{i32* %1}, i64 16, metadata ![[ID2:[0-9]+]])
   %0 = add i32 %a, 1                              ; <i32> [#uses=1]
   %two = add i32 %b, %0                           ; <i32> [#uses=0]
   %1 = alloca i32                                 ; <i32*> [#uses=1]
@@ -19,28 +19,38 @@ entry:
   call void @llvm.dbg.declare(metadata !{i32 %a}, metadata !{i32 %a, metadata !"foo"})
 ; CHECK: metadata !{i32 %a}, metadata !{i32 %a, metadata !"foo"}
   call void @llvm.dbg.declare(metadata !{i32 %b}, metadata !{metadata !0, i32 %two})
-; CHECK: metadata !{i32 %b}, metadata !{metadata !0, i32 %two}
+; CHECK: metadata !{i32 %b}, metadata !{metadata ![[ID0:[0-9]+]], i32 %two}
 
   call void @llvm.dbg.value(metadata !{ i32 %a }, i64 0, metadata !1)
-; CHECK: metadata !{i32 %a}, i64 0, metadata !1
+; CHECK: metadata !{i32 %a}, i64 0, metadata ![[ID1:[0-9]+]]
   call void @llvm.dbg.value(metadata !{ i32 %0 }, i64 25, metadata !0)
-; CHECK: metadata !{i32 %0}, i64 25, metadata !0
+; CHECK: metadata !{i32 %0}, i64 25, metadata ![[ID0]]
   call void @llvm.dbg.value(metadata !{ i32* %1 }, i64 16, metadata !3)
-; CHECK: call void @llvm.dbg.value(metadata !{i32* %1}, i64 16, metadata !3)
+; CHECK: call void @llvm.dbg.value(metadata !{i32* %1}, i64 16, metadata ![[ID3:[0-9]+]])
   call void @llvm.dbg.value(metadata !3, i64 12, metadata !2)
-; CHECK: metadata !3, i64 12, metadata !2
+; CHECK: metadata ![[ID3]], i64 12, metadata ![[ID2]]
 
   ret void, !foo !0, !bar !1
-; CHECK: ret void, !foo !0, !bar !1
+; CHECK: ret void, !foo ![[FOO:[0-9]+]], !bar ![[BAR:[0-9]+]]
 }
 
+!llvm.module.flags = !{!4}
+
 !0 = metadata !{i32 662302, i32 26, metadata !1, null}
 !1 = metadata !{i32 4, metadata !"foo"}
 !2 = metadata !{metadata !"bar"}
 !3 = metadata !{metadata !"foo"}
+!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
 
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !foo = !{ !0 }
 !bar = !{ !1 }
+
+; CHECK: !foo = !{![[FOO]]}
+; CHECK: !bar = !{![[BAR]]}
+; CHECK: ![[ID0]] = metadata !{i32 662302, i32 26, metadata ![[ID1]], null}
+; CHECK: ![[ID1]] = metadata !{i32 4, metadata !"foo"}
+; CHECK: ![[ID2]] = metadata !{metadata !"bar"}
+; CHECK; ![[ID3]] = metadata !{metadata !"foo"}
diff --git a/test/Assembler/lit.local.cfg b/test/Assembler/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Assembler/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Bindings/Ocaml/analysis.ml b/test/Bindings/Ocaml/analysis.ml
index 7df8e21..c02645c 100644
--- a/test/Bindings/Ocaml/analysis.ml
+++ b/test/Bindings/Ocaml/analysis.ml
@@ -1,4 +1,7 @@
-(* RUN: %ocamlopt -warn-error A llvm.cmxa llvm_analysis.cmxa %s -o %t
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_analysis.cmxa %t.builddir/analysis.ml -o %t
  * RUN: %t
  * XFAIL: vg_leak
  *)
diff --git a/test/Bindings/Ocaml/bitreader.ml b/test/Bindings/Ocaml/bitreader.ml
index e5beccd..f1d202a 100644
--- a/test/Bindings/Ocaml/bitreader.ml
+++ b/test/Bindings/Ocaml/bitreader.ml
@@ -1,4 +1,7 @@
-(* RUN: %ocamlopt -warn-error A llvm.cmxa llvm_bitreader.cmxa llvm_bitwriter.cmxa %s -o %t
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_bitreader.cmxa llvm_bitwriter.cmxa %t.builddir/bitreader.ml -o %t
  * RUN: %t %t.bc
  * RUN: llvm-dis < %t.bc
  * XFAIL: vg_leak
diff --git a/test/Bindings/Ocaml/bitwriter.ml b/test/Bindings/Ocaml/bitwriter.ml
index 1388760..ae456cf 100644
--- a/test/Bindings/Ocaml/bitwriter.ml
+++ b/test/Bindings/Ocaml/bitwriter.ml
@@ -1,4 +1,7 @@
-(* RUN: %ocamlopt -warn-error A unix.cmxa llvm.cmxa llvm_bitwriter.cmxa %s -o %t
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A unix.cmxa llvm.cmxa llvm_bitwriter.cmxa %t.builddir/bitwriter.ml -o %t
  * RUN: %t %t.bc
  * RUN: llvm-dis < %t.bc
  * XFAIL: vg_leak
diff --git a/test/Bindings/Ocaml/executionengine.ml b/test/Bindings/Ocaml/executionengine.ml
index f7a49bb..8e24949 100644
--- a/test/Bindings/Ocaml/executionengine.ml
+++ b/test/Bindings/Ocaml/executionengine.ml
@@ -1,4 +1,7 @@
-(* RUN: %ocamlopt -warn-error A llvm.cmxa llvm_target.cmxa llvm_executionengine.cmxa %s -o %t
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_target.cmxa llvm_executionengine.cmxa %t.builddir/executionengine.ml -o %t
  * RUN: %t
  * XFAIL: vg_leak
  *)
@@ -100,11 +103,11 @@ let test_executionengine () =
   (* run_static_dtors *)
   ExecutionEngine.run_static_dtors ee;
 
-  (* Show that the target data binding links and runs.*)
-  let td = ExecutionEngine.target_data ee in
+  (* Show that the data layout binding links and runs.*)
+  let dl = ExecutionEngine.data_layout ee in
 
   (* Demonstrate that a garbage pointer wasn't returned. *)
-  let ty = intptr_type td in
+  let ty = DataLayout.intptr_type context dl in
   if ty != i32_type && ty != i64_type then bomb "target_data did not work";
   
   (* dispose *)
diff --git a/test/Bindings/Ocaml/ext_exc.ml b/test/Bindings/Ocaml/ext_exc.ml
index b4d2e6d..9afc3c3 100644
--- a/test/Bindings/Ocaml/ext_exc.ml
+++ b/test/Bindings/Ocaml/ext_exc.ml
@@ -1,4 +1,7 @@
-(* RUN: %ocamlopt -warn-error A llvm.cmxa llvm_bitreader.cmxa llvm_executionengine.cmxa %s -o %t
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_bitreader.cmxa llvm_executionengine.cmxa %t.builddir/ext_exc.ml -o %t
  * RUN: %t </dev/null
  * XFAIL: vg_leak
  *)
diff --git a/test/Bindings/Ocaml/ipo_opts.ml b/test/Bindings/Ocaml/ipo_opts.ml
index d4537e4..e0bcbe5 100644
--- a/test/Bindings/Ocaml/ipo_opts.ml
+++ b/test/Bindings/Ocaml/ipo_opts.ml
@@ -1,4 +1,7 @@
-(* RUN: %ocamlopt -warn-error A llvm.cmxa llvm_ipo.cmxa llvm_target.cmxa %s -o %t
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_ipo.cmxa llvm_target.cmxa %t.builddir/ipo_opts.ml -o %t
  * RUN: %t %t.bc
  * XFAIL: vg_leak
  *)
@@ -43,15 +46,13 @@ let test_transforms () =
       ignore (build_ret (build_call fn [| |] "" b) b);
   end;
 
-  let td = DataLayout.create (target_triple m) in
-  
   ignore (PassManager.create ()
-           ++ DataLayout.add td
            ++ add_argument_promotion
            ++ add_constant_merge
            ++ add_dead_arg_elimination
            ++ add_function_attrs
            ++ add_function_inlining
+           ++ add_always_inliner
            ++ add_global_dce
            ++ add_global_optimizer
            ++ add_ipc_propagation
@@ -61,9 +62,7 @@ let test_transforms () =
            ++ add_strip_dead_prototypes
            ++ add_strip_symbols
            ++ PassManager.run_module m
-           ++ PassManager.dispose);
-
-  DataLayout.dispose td
+           ++ PassManager.dispose)
 
 
 (*===-- Driver ------------------------------------------------------------===*)
diff --git a/test/Bindings/Ocaml/irreader.ml b/test/Bindings/Ocaml/irreader.ml
new file mode 100644
index 0000000..3511c2b
--- /dev/null
+++ b/test/Bindings/Ocaml/irreader.ml
@@ -0,0 +1,59 @@
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -g -warn-error A llvm.cmxa llvm_irreader.cmxa %t.builddir/irreader.ml -o %t
+ * RUN: %t
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_irreader
+
+let context = global_context ()
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false
+
+let suite name f =
+  if print_checkpoints then
+    prerr_endline (name ^ ":");
+  f ()
+
+let _ =
+  Printexc.record_backtrace true
+
+let insist cond =
+  if not cond then failwith "insist"
+
+
+(*===-- IR Reader ---------------------------------------------------------===*)
+
+let test_irreader () =
+  begin
+    let buf = MemoryBuffer.of_string "@foo = global i32 42" in
+    let m   = parse_ir context buf in
+    match lookup_global "foo" m with
+    | Some foo ->
+        insist ((global_initializer foo) = (const_int (i32_type context) 42))
+    | None ->
+        failwith "global"
+  end;
+
+  begin
+    let buf = MemoryBuffer.of_string "@foo = global garble" in
+    try
+      ignore (parse_ir context buf);
+      failwith "parsed"
+    with Llvm_irreader.Error _ ->
+      ()
+  end
+
+
+(*===-- Driver ------------------------------------------------------------===*)
+
+let _ =
+  suite "irreader" test_irreader
diff --git a/test/Bindings/Ocaml/linker.ml b/test/Bindings/Ocaml/linker.ml
new file mode 100644
index 0000000..9359ae9
--- /dev/null
+++ b/test/Bindings/Ocaml/linker.ml
@@ -0,0 +1,63 @@
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_linker.cmxa %t.builddir/linker.ml -o %t
+ * RUN: %t
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_linker
+
+let context = global_context ()
+let void_type = Llvm.void_type context
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false
+
+let suite name f =
+  if print_checkpoints then
+    prerr_endline (name ^ ":");
+  f ()
+
+
+(*===-- Linker -----------------------------------------------------------===*)
+
+let test_linker () =
+  let fty = function_type void_type [| |] in
+
+  let make_module name =
+    let m = create_module context name in
+    let fn = define_function ("fn_" ^ name) fty m in
+    ignore (build_ret_void (builder_at_end context (entry_block fn)));
+    m
+  in
+
+  let m1 = make_module "one"
+  and m2 = make_module "two" in
+  link_modules m1 m2 Mode.PreserveSource;
+  dispose_module m1;
+  dispose_module m2;
+
+  let m1 = make_module "one"
+  and m2 = make_module "two" in
+  link_modules m1 m2 Mode.DestroySource;
+  dispose_module m1;
+
+  let m1 = make_module "one"
+  and m2 = make_module "one" in
+  try
+    link_modules m1 m2 Mode.PreserveSource;
+    failwith "must raise"
+  with Error _ ->
+    dispose_module m1;
+    dispose_module m2
+
+(*===-- Driver ------------------------------------------------------------===*)
+
+let _ =
+  suite "linker" test_linker
diff --git a/test/Bindings/Ocaml/lit.local.cfg b/test/Bindings/Ocaml/lit.local.cfg
index 640c58d..c38d89a 100644
--- a/test/Bindings/Ocaml/lit.local.cfg
+++ b/test/Bindings/Ocaml/lit.local.cfg
@@ -1,6 +1,5 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.ml']
+config.suffixes = ['.ml']
 
 bindings = set([s.strip() for s in config.root.llvm_bindings.split(',')])
 if not 'ocaml' in bindings:
     config.unsupported = True
-
diff --git a/test/Bindings/Ocaml/passmgr_builder.ml b/test/Bindings/Ocaml/passmgr_builder.ml
new file mode 100644
index 0000000..1a3102f
--- /dev/null
+++ b/test/Bindings/Ocaml/passmgr_builder.ml
@@ -0,0 +1,64 @@
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_passmgr_builder.cmxa %t.builddir/passmgr_builder.ml -o %t
+ * RUN: %t %t.bc
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_passmgr_builder
+
+let context = global_context ()
+let void_type = Llvm.void_type context
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false
+
+let suite name f =
+  if print_checkpoints then
+    prerr_endline (name ^ ":");
+  f ()
+
+
+(*===-- Fixture -----------------------------------------------------------===*)
+
+let filename = Sys.argv.(1)
+let m = create_module context filename
+
+
+(*===-- Pass Manager Builder ----------------------------------------------===*)
+
+let test_pmbuilder () =
+  let (++) x f = ignore (f x); x in
+
+  let module_passmgr = PassManager.create () in
+  let func_passmgr   = PassManager.create_function m in
+  let lto_passmgr    = PassManager.create () in
+
+  ignore (Llvm_passmgr_builder.create ()
+           ++ set_opt_level 3
+           ++ set_size_level 1
+           ++ set_disable_unit_at_a_time false
+           ++ set_disable_unroll_loops false
+           ++ use_inliner_with_threshold 10
+           ++ populate_function_pass_manager func_passmgr
+           ++ populate_module_pass_manager module_passmgr
+           ++ populate_lto_pass_manager lto_passmgr
+                  ~internalize:false ~run_inliner:false);
+  Gc.compact ();
+
+  PassManager.dispose module_passmgr;
+  PassManager.dispose func_passmgr;
+  PassManager.dispose lto_passmgr
+
+
+(*===-- Driver ------------------------------------------------------------===*)
+
+let _ =
+  suite "pass manager builder" test_pmbuilder;
+  dispose_module m
diff --git a/test/Bindings/Ocaml/scalar_opts.ml b/test/Bindings/Ocaml/scalar_opts.ml
index 0760dad..39913e4 100644
--- a/test/Bindings/Ocaml/scalar_opts.ml
+++ b/test/Bindings/Ocaml/scalar_opts.ml
@@ -1,4 +1,7 @@
-(* RUN: %ocamlopt -warn-error A llvm.cmxa llvm_scalar_opts.cmxa llvm_target.cmxa %s -o %t
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_scalar_opts.cmxa llvm_target.cmxa %t.builddir/scalar_opts.ml -o %t
  * RUN: %t %t.bc
  * XFAIL: vg_leak
  *)
@@ -38,10 +41,7 @@ let test_transforms () =
   let fn = define_function "fn" fty m in
   ignore (build_ret_void (builder_at_end context (entry_block fn)));
   
-  let td = DataLayout.create (target_triple m) in
-  
   ignore (PassManager.create_function m
-           ++ DataLayout.add td
            ++ add_verifier
            ++ add_constant_propagation
            ++ add_sccp
@@ -72,13 +72,12 @@ let test_transforms () =
            ++ add_lower_expect_intrinsic
            ++ add_type_based_alias_analysis
            ++ add_basic_alias_analysis
+           ++ add_partially_inline_lib_calls
            ++ add_verifier
            ++ PassManager.initialize
            ++ PassManager.run_function fn
            ++ PassManager.finalize
-           ++ PassManager.dispose);
-  
-  DataLayout.dispose td
+           ++ PassManager.dispose)
 
 
 (*===-- Driver ------------------------------------------------------------===*)
diff --git a/test/Bindings/Ocaml/target.ml b/test/Bindings/Ocaml/target.ml
index 7a35a79..d69fb0e 100644
--- a/test/Bindings/Ocaml/target.ml
+++ b/test/Bindings/Ocaml/target.ml
@@ -1,5 +1,9 @@
-(* RUN: %ocamlopt -warn-error A llvm.cmxa llvm_target.cmxa %s -o %t
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -g -warn-error A llvm.cmxa llvm_target.cmxa llvm_executionengine.cmxa %t.builddir/target.ml -o %t
  * RUN: %t %t.bc
+ * REQUIRES: native, object-emission
  * XFAIL: vg_leak
  *)
 
@@ -10,6 +14,7 @@
 open Llvm
 open Llvm_target
 
+let _ = Llvm_executionengine.initialize_native_target ()
 
 let context = global_context ()
 let i32_type = Llvm.i32_type context
@@ -18,10 +23,11 @@ let i64_type = Llvm.i64_type context
 (* Tiny unit test framework - really just to help find which line is busted *)
 let print_checkpoints = false
 
-let suite name f =
-  if print_checkpoints then
-    prerr_endline (name ^ ":");
-  f ()
+let _ =
+  Printexc.record_backtrace true
+
+let assert_equal a b =
+  if a <> b then failwith "assert_equal"
 
 
 (*===-- Fixture -----------------------------------------------------------===*)
@@ -29,31 +35,83 @@ let suite name f =
 let filename = Sys.argv.(1)
 let m = create_module context filename
 
+let target = Target.by_triple (Target.default_triple ())
 
-(*===-- Target Data -------------------------------------------------------===*)
+let machine = TargetMachine.create (Target.default_triple ()) target
+
+(*===-- Data Layout -------------------------------------------------------===*)
 
 let test_target_data () =
-  let td = DataLayout.create (target_triple m) in
-  let sty = struct_type context [| i32_type; i64_type |] in
-  
-  ignore (DataLayout.as_string td);
-  ignore (byte_order td);
-  ignore (pointer_size td);
-  ignore (intptr_type td);
-  ignore (size_in_bits td sty);
-  ignore (store_size td sty);
-  ignore (abi_size td sty);
-  ignore (stack_align td sty);
-  ignore (preferred_align td sty);
-  ignore (preferred_align_of_global td (declare_global sty "g" m));
-  ignore (element_at_offset td sty (Int64.of_int 1));
-  ignore (offset_of_element td sty 1);
+  let module DL = DataLayout in
+  let layout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-" ^
+               "f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-" ^
+               "a0:0:64-n32" in
+  let dl     = DL.of_string layout in
+  let sty    = struct_type context [| i32_type; i64_type |] in
   
-  DataLayout.dispose td
+  assert_equal (DL.as_string dl) layout;
+  assert_equal (DL.byte_order dl) Endian.Little;
+  assert_equal (DL.pointer_size dl) 4;
+  assert_equal (DL.intptr_type context dl) i32_type;
+  assert_equal (DL.qualified_pointer_size 0 dl) 4;
+  assert_equal (DL.qualified_intptr_type context 0 dl) i32_type;
+  assert_equal (DL.size_in_bits sty dl) (Int64.of_int 96);
+  assert_equal (DL.store_size sty dl) (Int64.of_int 12);
+  assert_equal (DL.abi_size sty dl) (Int64.of_int 12);
+  assert_equal (DL.stack_align sty dl) 4;
+  assert_equal (DL.preferred_align sty dl) 8;
+  assert_equal (DL.preferred_align_of_global (declare_global sty "g" m) dl) 8;
+  assert_equal (DL.element_at_offset sty (Int64.of_int 1) dl) 0;
+  assert_equal (DL.offset_of_element sty 1 dl) (Int64.of_int 4);
+
+  let pm = PassManager.create () in
+  ignore (DL.add_to_pass_manager pm dl)
+
+
+(*===-- Target ------------------------------------------------------------===*)
+
+let test_target () =
+  let module T = Target in
+  ignore (T.succ target);
+  ignore (T.name target);
+  ignore (T.description target);
+  ignore (T.has_jit target);
+  ignore (T.has_target_machine target);
+  ignore (T.has_asm_backend target)
+
+
+(*===-- Target Machine ----------------------------------------------------===*)
+
+let test_target_machine () =
+  let module TM = TargetMachine in
+  assert_equal (TM.target machine) target;
+  assert_equal (TM.triple machine) (Target.default_triple ());
+  assert_equal (TM.cpu machine) "";
+  assert_equal (TM.features machine) "";
+  ignore (TM.data_layout machine)
+
+
+(*===-- Code Emission -----------------------------------------------------===*)
+
+let test_code_emission () =
+  TargetMachine.emit_to_file m CodeGenFileType.ObjectFile filename machine;
+  try
+    TargetMachine.emit_to_file m CodeGenFileType.ObjectFile
+                               "/nonexistent/file" machine;
+    failwith "must raise"
+  with Llvm_target.Error _ ->
+    ();
+
+  let buf = TargetMachine.emit_to_memory_buffer m CodeGenFileType.ObjectFile
+                                                machine in
+  Llvm.MemoryBuffer.dispose buf
 
 
 (*===-- Driver ------------------------------------------------------------===*)
 
 let _ =
-  suite "target data" test_target_data;
+  test_target_data ();
+  test_target ();
+  test_target_machine ();
+  (* test_code_emission (); *) (* broken without AsmParser support *)
   dispose_module m
diff --git a/test/Bindings/Ocaml/vectorize_opts.ml b/test/Bindings/Ocaml/vectorize_opts.ml
new file mode 100644
index 0000000..5ef985d
--- /dev/null
+++ b/test/Bindings/Ocaml/vectorize_opts.ml
@@ -0,0 +1,56 @@
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_vectorize.cmxa llvm_target.cmxa %t.builddir/vectorize_opts.ml -o %t
+ * RUN: %t %t.bc
+ * XFAIL: vg_leak
+ *)
+
+(* Note: It takes several seconds for ocamlopt to link an executable with
+         libLLVMCore.a, so it's better to write a big test than a bunch of
+         little ones. *)
+
+open Llvm
+open Llvm_vectorize
+open Llvm_target
+
+let context = global_context ()
+let void_type = Llvm.void_type context
+
+(* Tiny unit test framework - really just to help find which line is busted *)
+let print_checkpoints = false
+
+let suite name f =
+  if print_checkpoints then
+    prerr_endline (name ^ ":");
+  f ()
+
+
+(*===-- Fixture -----------------------------------------------------------===*)
+
+let filename = Sys.argv.(1)
+let m = create_module context filename
+
+
+(*===-- Transforms --------------------------------------------------------===*)
+
+let test_transforms () =
+  let (++) x f = ignore (f x); x in
+
+  let fty = function_type void_type [| |] in
+  let fn = define_function "fn" fty m in
+  ignore (build_ret_void (builder_at_end context (entry_block fn)));
+
+  ignore (PassManager.create ()
+           ++ add_bb_vectorize
+           ++ add_loop_vectorize
+           ++ add_slp_vectorize
+           ++ PassManager.run_module m
+           ++ PassManager.dispose)
+
+
+(*===-- Driver ------------------------------------------------------------===*)
+
+let _ =
+  suite "transforms" test_transforms;
+  dispose_module m
diff --git a/test/Bindings/Ocaml/vmcore.ml b/test/Bindings/Ocaml/vmcore.ml
index ccde1f0..167efce 100644
--- a/test/Bindings/Ocaml/vmcore.ml
+++ b/test/Bindings/Ocaml/vmcore.ml
@@ -1,4 +1,7 @@
-(* RUN: %ocamlopt -warn-error A llvm.cmxa llvm_analysis.cmxa llvm_bitwriter.cmxa %s -o %t
+(* RUN: rm -rf %t.builddir
+ * RUN: mkdir -p %t.builddir
+ * RUN: cp %s %t.builddir
+ * RUN: %ocamlopt -warn-error A llvm.cmxa llvm_analysis.cmxa llvm_bitwriter.cmxa %t.builddir/vmcore.ml -o %t
  * RUN: %t %t.bc
  * RUN: llvm-dis < %t.bc > %t.ll
  * RUN: FileCheck %s < %t.ll
@@ -64,6 +67,14 @@ let filename = Sys.argv.(1)
 let m = create_module context filename
 
 
+(*===-- Conversion --------------------------------------------------------===*)
+
+let test_conversion () =
+  insist ("i32" = (string_of_lltype i32_type));
+  let c = const_int i32_type 42 in
+  insist ("i32 42" = (string_of_llvalue c))
+
+
 (*===-- Target ------------------------------------------------------------===*)
 
 let test_target () =
@@ -283,6 +294,7 @@ let test_constants () =
    * CHECK: const_ptrtoint{{.*}}ptrtoint
    * CHECK: const_inttoptr{{.*}}inttoptr
    * CHECK: const_bitcast{{.*}}bitcast
+   * CHECK: const_intcast{{.*}}zext
    *)
   let i128_type = integer_type context 128 in
   ignore (define_global "const_trunc" (const_trunc (const_add foldbomb five)
@@ -302,6 +314,8 @@ let test_constants () =
   ignore (define_global "const_inttoptr" (const_inttoptr (const_add foldbomb five)
                                                   void_ptr) m);
   ignore (define_global "const_bitcast" (const_bitcast ffoldbomb i64_type) m);
+  ignore (define_global "const_intcast"
+          (const_intcast foldbomb i128_type ~is_signed:false) m);
   
   group "misc constants";
   (* CHECK: const_size_of{{.*}}getelementptr{{.*}}null
@@ -402,7 +416,8 @@ let test_global_variables () =
   let fourty_two32 = const_int i32_type 42 in
 
   group "declarations"; begin
-    (* CHECK: GVar01{{.*}}external
+    (* CHECK: @GVar01 = external global i32
+     * CHECK: @QGVar01 = external addrspace(3) global i32
      *)
     insist (None == lookup_global "GVar01" m);
     let g = declare_global i32_type "GVar01" m in
@@ -424,8 +439,10 @@ let test_global_variables () =
   end;
   
   group "definitions"; begin
-    (* CHECK: GVar02{{.*}}42
-     * CHECK: GVar03{{.*}}42
+    (* CHECK: @GVar02 = global i32 42
+     * CHECK: @GVar03 = global i32 42
+     * CHECK: @QGVar02 = addrspace(3) global i32 42
+     * CHECK: @QGVar03 = addrspace(3) global i32 42
      *)
     let g = define_global "GVar02" fourty_two32 m in
     let g2 = declare_global i32_type "GVar03" m ++
@@ -449,10 +466,24 @@ let test_global_variables () =
           set_thread_local true in
   insist (is_thread_local g);
 
-  (* CHECK-NOWHERE-NOT: GVar05
+  (* CHECK: GVar05{{.*}}thread_local(initialexec)
+   *)
+  group "threadlocal_mode";
+  let g = define_global "GVar05" fourty_two32 m ++
+          set_thread_local_mode ThreadLocalMode.InitialExec in
+  insist ((thread_local_mode g) = ThreadLocalMode.InitialExec);
+
+  (* CHECK: GVar06{{.*}}externally_initialized
+   *)
+  group "externally_initialized";
+  let g = define_global "GVar06" fourty_two32 m ++
+          set_externally_initialized true in
+  insist (is_externally_initialized g);
+
+  (* CHECK-NOWHERE-NOT: GVar07
    *)
   group "delete";
-  let g = define_global "GVar05" fourty_two32 m in
+  let g = define_global "GVar07" fourty_two32 m in
   delete_global g;
 
   (* CHECK: ConstGlobalVar{{.*}}constant
@@ -1000,8 +1031,8 @@ let test_builder () =
   end;
 
   group "metadata"; begin
-    (* CHECK: %metadata = add i32 %P1, %P2, !test !0
-     * !0 is metadata emitted at EOF.
+    (* CHECK: %metadata = add i32 %P1, %P2, !test !1
+     * !1 is metadata emitted at EOF.
      *)
     let i = build_add p1 p2 "metadata" atentry in
     insist ((has_metadata i) = false);
@@ -1024,9 +1055,19 @@ let test_builder () =
     set_metadata i kind md
   end;
 
+  group "named metadata"; begin
+    (* !llvm.module.flags is emitted at EOF. *)
+    let n1 = const_int i32_type 1 in
+    let n2 = mdstring context "Debug Info Version" in
+    let md = mdnode context [| n1; n2; n1 |] in
+    add_named_metadata_operand m "llvm.module.flags" md;
+
+    insist ((get_named_metadata m "llvm.module.flags") = [| md |])
+  end;
+
   group "dbg"; begin
-    (* CHECK: %dbg = add i32 %P1, %P2, !dbg !1
-     * !1 is metadata emitted at EOF.
+    (* CHECK: %dbg = add i32 %P1, %P2, !dbg !2
+     * !2 is metadata emitted at EOF.
      *)
     insist ((current_debug_location atentry) = None);
 
@@ -1234,16 +1275,27 @@ let test_builder () =
 
     (* CHECK: %build_alloca = alloca i32
      * CHECK: %build_array_alloca = alloca i32, i32 %P2
-     * CHECK: %build_load = load i32* %build_array_alloca
-     * CHECK: store i32 %P2, i32* %build_alloca
+     * CHECK: %build_load = load volatile i32* %build_array_alloca, align 4
+     * CHECK: store volatile i32 %P2, i32* %build_alloca, align 4
      * CHECK: %build_gep = getelementptr i32* %build_array_alloca, i32 %P2
      * CHECK: %build_in_bounds_gep = getelementptr inbounds i32* %build_array_alloca, i32 %P2
      * CHECK: %build_struct_gep = getelementptr inbounds{{.*}}%build_alloca2, i32 0, i32 1
+     * CHECK: %build_atomicrmw = atomicrmw xchg i8* %p, i8 42 seq_cst
      *)
     let alloca = build_alloca i32_type "build_alloca" b in
     let array_alloca = build_array_alloca i32_type p2 "build_array_alloca" b in
-    ignore(build_load array_alloca "build_load" b);
-    ignore(build_store p2 alloca b);
+
+    let load = build_load array_alloca "build_load" b in
+    ignore(set_alignment 4 load);
+    ignore(set_volatile true load);
+    insist(true = is_volatile load);
+    insist(4 = alignment load);
+
+    let store = build_store p2 alloca b in
+    ignore(set_volatile true store);
+    ignore(set_alignment 4 store);
+    insist(true = is_volatile store);
+    insist(4 = alignment store);
     ignore(build_gep array_alloca [| p2 |] "build_gep" b);
     ignore(build_in_bounds_gep array_alloca [| p2 |] "build_in_bounds_gep" b);
 
@@ -1251,6 +1303,11 @@ let test_builder () =
     let alloca2 = build_alloca sty "build_alloca2" b in
     ignore(build_struct_gep alloca2 1 "build_struct_gep" b);
 
+    let p = build_alloca i8_type "p" b in
+    ignore(build_atomicrmw AtomicRMWBinOp.Xchg p (const_int i8_type 42)
+              AtomicOrdering.SequentiallyConsistent false "build_atomicrmw"
+              b);
+
     ignore(build_unreachable b)
   end;
 
@@ -1289,8 +1346,10 @@ let test_builder () =
 
 (* End-of-file checks for things like metdata and attributes.
  * CHECK: attributes #0 = {{.*}}uwtable{{.*}}
- * CHECK: !0 = metadata !{i32 1, metadata !"metadata test"}
- * CHECK: !1 = metadata !{i32 2, i32 3, metadata !2, metadata !2}
+ * CHECK: !llvm.module.flags = !{!0}
+ * CHECK: !0 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+ * CHECK: !1 = metadata !{i32 1, metadata !"metadata test"}
+ * CHECK: !2 = metadata !{i32 2, i32 3, metadata !3, metadata !3}
  *)
 
 (*===-- Pass Managers -----------------------------------------------------===*)
@@ -1317,6 +1376,14 @@ let test_pass_manager () =
   end
 
 
+(*===-- Memory Buffer -----------------------------------------------------===*)
+
+let test_memory_buffer () =
+  group "memory buffer";
+  let buf = MemoryBuffer.of_string "foobar" in
+  insist ((MemoryBuffer.as_string buf) = "foobar")
+
+
 (*===-- Writer ------------------------------------------------------------===*)
 
 let test_writer () =
@@ -1334,6 +1401,7 @@ let test_writer () =
 (*===-- Driver ------------------------------------------------------------===*)
 
 let _ =
+  suite "conversion"       test_conversion;
   suite "target"           test_target;
   suite "constants"        test_constants;
   suite "global values"    test_global_values;
@@ -1347,5 +1415,6 @@ let _ =
   suite "instructions"     test_instructions;
   suite "builder"          test_builder;
   suite "pass manager"     test_pass_manager;
+  suite "memory buffer"    test_memory_buffer;
   suite "writer"           test_writer; (* Keep this last; it disposes m. *)
   exit !exit_status
diff --git a/test/Bindings/llvm-c/calc.test b/test/Bindings/llvm-c/calc.test
new file mode 100644
index 0000000..36a76e6
--- /dev/null
+++ b/test/Bindings/llvm-c/calc.test
@@ -0,0 +1,15 @@
+; RUN: llvm-c-test --calc <%s | FileCheck %s
+
+; constant folding
+test 100 200 +
+;CHECK: ModuleID = 'test'
+;CHECK: define i64 @test
+;CHECK: {
+;CHECK: ret i64 300
+;CHECK: }
+
+arg1 0 @ 0 @ * 1 @ 1 @ * +
+;CHECK: ModuleID = 'arg1'
+;CHECK: getelementptr
+;CHECK: load
+;CHECK: ret
diff --git a/test/Bindings/llvm-c/disassemble.test b/test/Bindings/llvm-c/disassemble.test
new file mode 100644
index 0000000..201e914
--- /dev/null
+++ b/test/Bindings/llvm-c/disassemble.test
@@ -0,0 +1,29 @@
+; RUN: llvm-c-test --disassemble < %s | FileCheck %s
+
+
+arm-linux-android    44 26 1f e5 0c 10 4b e2 02 20 81 e0
+;CHECK: triple: arm-linux-android
+;CHECK: ldr	r2, [pc, #-1604]
+;CHECK: sub	r1, r11, #12
+;CHECK: 02 20 81 e0
+;CHECK: add	r2, r1, r2
+
+x86_64-linux-unknown 48 83 c4 38 5b 5d 41 5c 41 5d 41 5e 41 5f c3
+;CHECK: triple: x86_64-linux-unknown
+;CHECK: addq	$56, %rsp
+;CHECK: popq	%rbx
+;CHECK: popq	%rbp
+;CHECK: popq	%r12
+;CHECK: popq	%r13
+;CHECK: popq	%r14
+;CHECK: popq	%r15
+;CHECK: ret
+
+i686-apple-darwin    0f b7 4c 24 0a e8 29 ce ff ff
+;CHECK: movzwl	10(%esp), %ecx
+;CHECK: calll	-12759
+
+i686-linux-unknown   dd 44 24 04 d9 e1 c3
+;CHECK: fldl	4(%esp)
+;CHECK: fabs
+;CHECK: ret
diff --git a/test/Bindings/llvm-c/functions.ll b/test/Bindings/llvm-c/functions.ll
new file mode 100644
index 0000000..4503fb1
--- /dev/null
+++ b/test/Bindings/llvm-c/functions.ll
@@ -0,0 +1,31 @@
+; RUN: llvm-as < %s | llvm-c-test --module-list-functions | FileCheck %s
+
+define i32 @X() {
+entry:
+  br label %l1
+
+l1:
+  br label %l2
+
+l2:
+  br label %l3
+
+l3:
+  ret i32 1234
+}
+;CHECK: FunctionDefinition: X [#bb=4]
+
+
+define i32 @Z(i32 %a) {
+entry:
+  %0 = tail call i32 @Y(i32 %a)
+  ret i32 %0
+}
+
+;CHECK: FunctionDefinition: Z [#bb=1]
+;CHECK:  calls: Y
+;CHECK:  #isn: 2
+
+declare i32 @Y(i32)
+;CHECK: FunctionDeclaration: Y
+
diff --git a/test/Bindings/llvm-c/globals.ll b/test/Bindings/llvm-c/globals.ll
new file mode 100644
index 0000000..a38f08b
--- /dev/null
+++ b/test/Bindings/llvm-c/globals.ll
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llvm-c-test --module-list-globals | FileCheck %s
+
+@foo = constant [7 x i8] c"foobar\00", align 1
+;CHECK: GlobalDefinition: foo [7 x i8]*
+
+@bar = common global i32 0, align 4
+;CHECK: GlobalDefinition: bar i32*
diff --git a/test/Bindings/llvm-c/lit.local.cfg b/test/Bindings/llvm-c/lit.local.cfg
new file mode 100644
index 0000000..d83ebee
--- /dev/null
+++ b/test/Bindings/llvm-c/lit.local.cfg
@@ -0,0 +1,5 @@
+targets = set(config.root.targets_to_build.split())
+if not "X86" in targets:
+    config.unsupported = True
+if not "ARM" in targets:
+    config.unsupported = True
diff --git a/test/Bitcode/2012-05-07-SwitchInstRangesSupport.ll b/test/Bitcode/2012-05-07-SwitchInstRangesSupport.ll
deleted file mode 100644
index 583b9a8..0000000
--- a/test/Bitcode/2012-05-07-SwitchInstRangesSupport.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; RUN: rm -f %t.bc
-; RUN: rm -f %t.ll
-; RUN: rm -f %t2.bc
-; RUN: rm -f %t2.ll
-; RUN: llvm-as %s -o %t.bc
-; RUN: llvm-dis %t.bc -o - | tail -n +2 > %t.ll
-; RUN: llvm-as %t.ll -o %t2.bc
-; RUN: llvm-dis %t2.bc -o - | tail -n +2 > %t2.ll
-; RUN: llvm-diff %t.ll %t2.ll
-
-define void @test() {
-  %mem = alloca i32
-  store i32 2, i32* %mem
-  %c = load i32* %mem
-  switch i32 %c, label %exit [
-      i32 1, label %exit
-      i32 2, label %exit
-  ]
-exit:
-  ret void
-}
-define void @test_wide() {
-  %mem = alloca i256
-  store i256 2, i256* %mem
-  %c = load i256* %mem
-  switch i256 %c, label %exit [
-      i256 123456789012345678901234567890, label %exit
-      i256 2, label %exit
-  ]
-exit:
-  ret void
-}
-
diff --git a/test/Bitcode/attributes.ll b/test/Bitcode/attributes.ll
index 92f892a..1789878 100644
--- a/test/Bitcode/attributes.ll
+++ b/test/Bitcode/attributes.ll
@@ -203,7 +203,13 @@ define void @f34()
 ; CHECK: define void @f34()
 {
         call void @nobuiltin() nobuiltin
-; CHECK: call void @nobuiltin() #23
+; CHECK: call void @nobuiltin() #24
+        ret void;
+}
+
+define void @f35() optnone noinline
+; CHECK: define void @f35() #23
+{
         ret void;
 }
 
@@ -230,4 +236,6 @@ define void @f34()
 ; CHECK: attributes #20 = { "cpu"="cortex-a8" }
 ; CHECK: attributes #21 = { sspstrong }
 ; CHECK: attributes #22 = { minsize }
-; CHECK: attributes #23 = { nobuiltin }
+; CHECK: attributes #23 = { noinline optnone }
+; CHECK: attributes #24 = { nobuiltin }
+
diff --git a/test/Bitcode/case-ranges-3.3.ll b/test/Bitcode/case-ranges-3.3.ll
new file mode 100644
index 0000000..6e1d0a6
--- /dev/null
+++ b/test/Bitcode/case-ranges-3.3.ll
@@ -0,0 +1,67 @@
+; RUN:  llvm-dis < %s.bc| FileCheck %s
+
+; case-ranges.ll.bc was generated by passing this file to llvm-as from the 3.3
+; release of LLVM. This tests that the bitcode for switches from that release
+; can still be read.
+
+define i32 @foo(i32 %x) nounwind ssp uwtable {
+; CHECK: define i32 @foo
+  %1 = alloca i32, align 4
+  %2 = alloca i32, align 4
+  store i32 %x, i32* %2, align 4
+  %3 = load i32* %2, align 4
+  switch i32 %3, label %9 [
+; CHECK: switch i32 %3, label %9
+    i32 -3, label %4
+; CHECK-NEXT: i32 -3, label %4
+    i32 -2, label %4
+; CHECK-NEXT: i32 -2, label %4
+    i32 -1, label %4
+; CHECK-NEXT: i32 -1, label %4
+    i32 0, label %4
+; CHECK-NEXT: i32 0, label %4
+    i32 1, label %4
+; CHECK-NEXT: i32 1, label %4
+    i32 2, label %4
+; CHECK-NEXT: i32 2, label %4
+    i32 4, label %5
+; CHECK-NEXT: i32 4, label %5
+    i32 5, label %6
+; CHECK-NEXT: i32 5, label %6
+    i32 6, label %7
+; CHECK-NEXT: i32 6, label %7
+    i32 7, label %8
+; CHECK-NEXT: i32 7, label %8
+  ]
+
+; <label>:4
+  store i32 -1, i32* %1
+  br label %11
+
+; <label>:5
+  store i32 2, i32* %1
+  br label %11
+
+; <label>:6
+  store i32 1, i32* %1
+  br label %11
+
+; <label>:7
+  store i32 4, i32* %1
+  br label %11
+
+; <label>:8
+  store i32 3, i32* %1
+  br label %11
+
+; <label>:9
+  br label %10
+
+; <label>:10
+  store i32 0, i32* %1
+  br label %11
+
+; <label>:11
+  %12 = load i32* %1
+  ret i32 %12
+}
diff --git a/test/Bitcode/case-ranges-3.3.ll.bc b/test/Bitcode/case-ranges-3.3.ll.bc
new file mode 100644
index 0000000..998f747
--- /dev/null
+++ b/test/Bitcode/case-ranges-3.3.ll.bc
diff --git a/test/Bitcode/drop-debug-info.ll b/test/Bitcode/drop-debug-info.ll
new file mode 100644
index 0000000..da4ae0c
--- /dev/null
+++ b/test/Bitcode/drop-debug-info.ll
@@ -0,0 +1,26 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+define i32 @main() {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  ret i32 0, !dbg !12
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.5 (trunk 195495) (llvm/trunk 195495:195504M)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/manmanren/llvm_gmail/release/../llvm/tools/clang/test/CodeGen/debug-info-version.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"../llvm/tools/clang/test/CodeGen/debug-info-version.c", metadata !"/Users/manmanren/llvm_gmail/release"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/Users/manmanren/llvm_gmail/release/../llvm/tools/clang/test/CodeGen/debug-info-version.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!12 = metadata !{i32 4, i32 0, metadata !4, null}
+
+; CHECK-NOT: !dbg
+; CHECK-NOT: !llvm.dbg.cu
diff --git a/test/Bitcode/extractelement.ll b/test/Bitcode/extractelement.ll
index d88f811..8999c65 100644
--- a/test/Bitcode/extractelement.ll
+++ b/test/Bitcode/extractelement.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -constprop | llvm-dis
+; RUN: opt < %s -constprop | llvm-dis -disable-output
 ; PR3465
 
 define double @test() {
diff --git a/test/Bitcode/invalid.ll b/test/Bitcode/invalid.ll
new file mode 100644
index 0000000..1d4a82b
--- /dev/null
+++ b/test/Bitcode/invalid.ll
@@ -0,0 +1,7 @@
+; RUN:  not llvm-dis < %s.bc 2>&1 | FileCheck %s
+
+; CHECK: llvm-dis{{(\.EXE|\.exe)?}}: Invalid value
+
+; invalid.ll.bc has an invalid attribute number.
+; The test checks that LLVM reports the error and doesn't access freed memory
+; in doing so.
diff --git a/test/Bitcode/invalid.ll.bc b/test/Bitcode/invalid.ll.bc
new file mode 100644
index 0000000..a85c364
--- /dev/null
+++ b/test/Bitcode/invalid.ll.bc
diff --git a/test/Bitcode/lit.local.cfg b/test/Bitcode/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Bitcode/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Bitcode/metadata-2.ll b/test/Bitcode/metadata-2.ll
index dbf46b0..4055f92 100644
--- a/test/Bitcode/metadata-2.ll
+++ b/test/Bitcode/metadata-2.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llvm-dis -o /dev/null
+; RUN: llvm-as < %s | llvm-dis -disable-output
 	%0 = type { %object.ModuleInfo.__vtbl*, i8*, %"byte[]", %1, %"ClassInfo[]", i32, void ()*, void ()*, void ()*, i8*, void ()* }		; type %0
 	%1 = type { i64, %object.ModuleInfo* }		; type %1
 	%2 = type { i32, void ()* }		; type %2
diff --git a/test/Bitcode/metadata.ll b/test/Bitcode/metadata.ll
index 19db3ea..fc8a622 100644
--- a/test/Bitcode/metadata.ll
+++ b/test/Bitcode/metadata.ll
@@ -1,6 +1,5 @@
-; RUN: llvm-as < %s | llvm-dis -o /dev/null
+; RUN: llvm-as < %s | llvm-dis -disable-output
 
 !llvm.foo = !{!0}
 !0 = metadata !{i32 42}
 @my.str = internal constant [4 x i8] c"foo\00"
-
diff --git a/test/Bitcode/null-type.ll b/test/Bitcode/null-type.ll
index 8502b0d..a620fab 100644
--- a/test/Bitcode/null-type.ll
+++ b/test/Bitcode/null-type.ll
@@ -1,5 +1,4 @@
-; RUN: not llvm-dis < %s.bc > /dev/null 2> %t
-; RUN: FileCheck %s < %t
+; RUN: not llvm-dis < %s.bc 2>&1 | FileCheck %s
 ; PR8494
 
-; CHECK: Invalid MODULE_CODE_FUNCTION record
+; CHECK: Invalid record
diff --git a/test/Bitcode/select.ll b/test/Bitcode/select.ll
new file mode 100644
index 0000000..71e669a
--- /dev/null
+++ b/test/Bitcode/select.ll
@@ -0,0 +1,9 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+define <2 x i32> @main() {
+  ret <2 x i32> select (<2 x i1> <i1 false, i1 undef>, <2 x i32> zeroinitializer, <2 x i32> <i32 0, i32 undef>)
+}
+
+; CHECK: define <2 x i32> @main() {
+; CHECK:   ret <2 x i32> select (<2 x i1> <i1 false, i1 undef>, <2 x i32> zeroinitializer, <2 x i32> <i32 0, i32 undef>)
+; CHECK: }
diff --git a/test/Bitcode/shuffle.ll b/test/Bitcode/shuffle.ll
index c3c01c6..1495d8e 100644
--- a/test/Bitcode/shuffle.ll
+++ b/test/Bitcode/shuffle.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llvm-dis
+; RUN: llvm-as < %s | llvm-dis -disable-output
 
 ; <rdar://problem/8622574>
 ; tests the bitcodereader can handle the case where the reader will initially
diff --git a/test/Bitcode/upgrade-tbaa.ll b/test/Bitcode/upgrade-tbaa.ll
new file mode 100644
index 0000000..e738909
--- /dev/null
+++ b/test/Bitcode/upgrade-tbaa.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+; Function Attrs: nounwind
+define void @_Z4testPiPf(i32* nocapture %pI, float* nocapture %pF) #0 {
+entry:
+  store i32 0, i32* %pI, align 4, !tbaa !{metadata !"int", metadata !0}
+  ; CHECK: store i32 0, i32* %pI, align 4, !tbaa [[TAG_INT:!.*]]
+  store float 1.000000e+00, float* %pF, align 4, !tbaa !2
+  ; CHECK: store float 1.000000e+00, float* %pF, align 4, !tbaa [[TAG_FLOAT:!.*]]
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA"}
+!2 = metadata !{metadata !"float", metadata !0}
+
+; CHECK: [[TAG_INT]] = metadata !{metadata [[TYPE_INT:!.*]], metadata [[TYPE_INT]], i64 0}
+; CHECK: [[TYPE_INT]] = metadata !{metadata !"int", metadata [[TYPE_CHAR:!.*]]}
+; CHECK: [[TYPE_CHAR]] = metadata !{metadata !"omnipotent char", metadata !{{.*}}
+; CHECK: [[TAG_FLOAT]] = metadata !{metadata [[TYPE_FLOAT:!.*]], metadata [[TYPE_FLOAT]], i64 0}
+; CHECK: [[TYPE_FLOAT]] = metadata !{metadata !"float", metadata [[TYPE_CHAR]]}
diff --git a/test/BugPoint/lit.local.cfg b/test/BugPoint/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/BugPoint/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/BugPoint/metadata.ll b/test/BugPoint/metadata.ll
index 2ba1a9f..cc043f0 100644
--- a/test/BugPoint/metadata.ll
+++ b/test/BugPoint/metadata.ll
@@ -4,12 +4,12 @@
 
 ; Bugpoint should keep the call's metadata attached to the call.
 
-; CHECK: call void @foo(), !dbg !0, !attach !4
-; CHECK: !0 = metadata !{i32 104, i32 105, metadata !1, metadata !1}
-; CHECK: !1 = metadata !{i32 458769, metadata !2, i32 0, metadata !"me", i1 true, metadata !"", i32 0, metadata !3, metadata !3, null, null, null, metadata !""}
-; CHECK: !2 = metadata !{metadata !"source.c", metadata !"/dir"}
-; CHECK: !3 = metadata !{i32 0}
-; CHECK: !4 = metadata !{metadata !"the call to foo"}
+; CHECK: call void @foo(), !dbg ![[LOC:[0-9]+]], !attach ![[CALL:[0-9]+]]
+; CHECK: ![[LOC]] = metadata !{i32 104, i32 105, metadata ![[SCOPE:[0-9]+]], metadata ![[SCOPE]]}
+; CHECK: ![[SCOPE]] = metadata !{i32 458769, metadata ![[FILE:[0-9]+]], i32 0, metadata !"me", i1 true, metadata !"", i32 0, metadata ![[LIST:[0-9]+]], metadata ![[LIST]], null, null, null, metadata !""}
+; CHECK: ![[FILE]] = metadata !{metadata !"source.c", metadata !"/dir"}
+; CHECK: ![[LIST]] = metadata !{i32 0}
+; CHECK: ![[CALL]] = metadata !{metadata !"the call to foo"}
 
 %rust_task = type {}
 define void @test(i32* %a, i8* %b) {
@@ -23,6 +23,8 @@ define void @test(i32* %a, i8* %b) {
 
 declare void @foo()
 
+!llvm.module.flags = !{!17}
+
 !0 = metadata !{metadata !"boring"}
 !1 = metadata !{metadata !"uninteresting"}
 !2 = metadata !{metadata !"the call to foo"}
@@ -37,3 +39,4 @@ declare void @foo()
 !14 = metadata !{i32 108, i32 109, metadata !9, metadata !9}
 !15 = metadata !{metadata !"source.c", metadata !"/dir"}
 !16 = metadata !{i32 0}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 392f5f2..d6f7dab 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -13,21 +13,25 @@ if(NOT LLVM_BUILD_TOOLS)
 endif()
 
 # Set the depends list as a variable so that it can grow conditionally.
+# NOTE: Sync the substitutions in test/lit.cfg when adding to this list.
 set(LLVM_TEST_DEPENDS
           UnitTests
           BugpointPasses
           LLVMHello
           llc
           lli
+          lli-child-target
           llvm-ar
           llvm-as
           llvm-bcanalyzer
+          llvm-c-test
           llvm-cov
           llvm-diff
           llvm-dis
           llvm-extract
           llvm-dwarfdump
           llvm-link
+          llvm-lto
           llvm-mc
           llvm-mcmarkup
           llvm-nm
diff --git a/test/CodeGen/AArch64/adrp-relocation.ll b/test/CodeGen/AArch64/adrp-relocation.ll
deleted file mode 100644
index 1e12d69..0000000
--- a/test/CodeGen/AArch64/adrp-relocation.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -filetype=obj < %s | llvm-readobj -s -r | FileCheck %s
-
-define i64 @testfn() nounwind {
-entry:
-  ret i64 0
-}
-
-define i64 @foo() nounwind {
-entry:
-  %bar = alloca i64 ()*, align 8
-  store i64 ()* @testfn, i64 ()** %bar, align 8
-  %call = call i64 @testfn()
-  ret i64 %call
-}
-
-; The above should produce an ADRP/ADD pair to calculate the address of
-; testfn. The important point is that LLVM shouldn't think it can deal with the
-; relocation on the ADRP itself (even though it knows everything about the
-; relative offsets of testfn and foo) because its value depends on where this
-; object file's .text section gets relocated in memory.
-
-; CHECK:      Relocations [
-; CHECK-NEXT:   Section (2) .rela.text {
-; CHECK-NEXT:     0x10 R_AARCH64_ADR_PREL_PG_HI21 testfn 0x0
-; CHECK-NEXT:     0x14 R_AARCH64_ADD_ABS_LO12_NC testfn 0x0
-; CHECK-NEXT:   }
-; CHECK-NEXT: ]
diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll
index a84217f..1d3c0a0 100644
--- a/test/CodeGen/AArch64/alloca.ll
+++ b/test/CodeGen/AArch64/alloca.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP %s
 
 declare void @use_addr(i8*)
 
@@ -7,13 +8,13 @@ define void @test_simple_alloca(i64 %n) {
 
   %buf = alloca i8, i64 %n
   ; Make sure we align the stack change to 16 bytes:
-; CHECK: add [[SPDELTA:x[0-9]+]], x0, #15
-; CHECK: and x0, [[SPDELTA]], #0xfffffffffffffff0
+; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15
+; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0
 
   ; Make sure we change SP. It would be surprising if anything but x0 were used
   ; for the final sp, but it could be if it was then moved into x0.
-; CHECK: mov [[TMP:x[0-9]+]], sp
-; CHECK: sub x0, [[TMP]], [[SPDELTA]]
+; CHECK-DAG: mov [[TMP:x[0-9]+]], sp
+; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]]
 ; CHECK: mov sp, x0
 
   call void @use_addr(i8* %buf)
@@ -37,13 +38,13 @@ define i64 @test_alloca_with_local(i64 %n) {
   %loc = alloca i64
   %buf = alloca i8, i64 %n
   ; Make sure we align the stack change to 16 bytes:
-; CHECK: add [[SPDELTA:x[0-9]+]], x0, #15
-; CHECK: and x0, [[SPDELTA]], #0xfffffffffffffff0
+; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15
+; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0
 
   ; Make sure we change SP. It would be surprising if anything but x0 were used
   ; for the final sp, but it could be if it was then moved into x0.
-; CHECK: mov [[TMP:x[0-9]+]], sp
-; CHECK: sub x0, [[TMP]], [[SPDELTA]]
+; CHECK-DAG: mov [[TMP:x[0-9]+]], sp
+; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]]
 ; CHECK: mov sp, x0
 
   ; Obviously suboptimal code here, but it to get &local in x1
@@ -66,16 +67,22 @@ define i64 @test_alloca_with_local(i64 %n) {
 }
 
 define void @test_variadic_alloca(i64 %n, ...) {
-; CHECK-LABEL: test_variadic_alloca:
+; CHECK: test_variadic_alloca:
 
 ; CHECK: sub     sp, sp, #208
 ; CHECK: stp     x29, x30, [sp, #192]
 ; CHECK: add     x29, sp, #192
 ; CHECK: sub     [[TMP:x[0-9]+]], x29, #192
 ; CHECK: add     x8, [[TMP]], #0
-; CHECK: str     q7, [x8, #112]
+; CHECK-FP: str     q7, [x8, #112]
 ; [...]
-; CHECK: str     q1, [x8, #16]
+; CHECK-FP: str     q1, [x8, #16]
+
+; CHECK-NOFP: sub     sp, sp, #80
+; CHECK-NOFP: stp     x29, x30, [sp, #64]
+; CHECK-NOFP: add     x29, sp, #64
+; CHECK-NOFP: sub     [[TMP:x[0-9]+]], x29, #64
+; CHECK-NOFP: add     x8, [[TMP]], #0
 
   %addr = alloca i8, i64 %n
 
@@ -86,6 +93,10 @@ define void @test_variadic_alloca(i64 %n, ...) {
 ; CHECK: sub sp, x29, #192
 ; CHECK: ldp x29, x30, [sp, #192]
 ; CHECK: add sp, sp, #208
+
+; CHECK-NOFP: sub sp, x29, #64
+; CHECK-NOFP: ldp x29, x30, [sp, #64]
+; CHECK-NOFP: add sp, sp, #80
 }
 
 define void @test_alloca_large_frame(i64 %n) {
@@ -112,16 +123,16 @@ declare i8* @llvm.stacksave()
 declare void @llvm.stackrestore(i8*)
 
 define void @test_scoped_alloca(i64 %n) {
-; CHECK: test_scoped_alloca
+; CHECK-LABEL: test_scoped_alloca:
 ; CHECK: sub sp, sp, #32
 
   %sp = call i8* @llvm.stacksave()
 ; CHECK: mov [[SAVED_SP:x[0-9]+]], sp
+; CHECK: mov [[OLDSP:x[0-9]+]], sp
 
   %addr = alloca i8, i64 %n
 ; CHECK: and [[SPDELTA:x[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
-; CHECK: mov [[OLDSP:x[0-9]+]], sp
-; CHECK: sub [[NEWSP:x[0-9]+]], [[OLDSP]], [[SPDELTA]]
+; CHECK-DAG: sub [[NEWSP:x[0-9]+]], [[OLDSP]], [[SPDELTA]]
 ; CHECK: mov sp, [[NEWSP]]
 
   call void @use_addr(i8* %addr)
diff --git a/test/CodeGen/AArch64/basic-pic.ll b/test/CodeGen/AArch64/basic-pic.ll
index 1b14be2..682b7ba 100644
--- a/test/CodeGen/AArch64/basic-pic.ll
+++ b/test/CodeGen/AArch64/basic-pic.ll
@@ -1,10 +1,7 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -filetype=obj %s -o -| llvm-objdump -r - | FileCheck --check-prefix=CHECK-ELF %s
 
 @var = global i32 0
 
-; CHECK-ELF: RELOCATION RECORDS FOR [.rela.text]
-
 define i32 @get_globalvar() {
 ; CHECK-LABEL: get_globalvar:
 
@@ -13,8 +10,6 @@ define i32 @get_globalvar() {
 ; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], #:got_lo12:var]
 ; CHECK: ldr w0, [x[[GOTLOC]]]
 
-; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var
-; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var
   ret i32 %val
 }
 
@@ -25,8 +20,6 @@ define i32* @get_globalvaraddr() {
 ; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var
 ; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:var]
 
-; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var
-; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var
   ret i32* @var
 }
 
@@ -39,8 +32,6 @@ define i32 @get_hiddenvar() {
 ; CHECK: adrp x[[HI:[0-9]+]], hiddenvar
 ; CHECK: ldr w0, [x[[HI]], #:lo12:hiddenvar]
 
-; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 hiddenvar
-; CHECK-ELF: R_AARCH64_LDST32_ABS_LO12_NC hiddenvar
   ret i32 %val
 }
 
@@ -51,8 +42,6 @@ define i32* @get_hiddenvaraddr() {
 ; CHECK: adrp [[HI:x[0-9]+]], hiddenvar
 ; CHECK: add x0, [[HI]], #:lo12:hiddenvar
 
-; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 hiddenvar
-; CHECK-ELF: R_AARCH64_ADD_ABS_LO12_NC hiddenvar
   ret i32* @hiddenvar
 }
 
@@ -62,9 +51,4 @@ define void()* @get_func() {
   ret void()* bitcast(void()*()* @get_func to void()*)
 ; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func
 ; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:get_func]
-
-  ; Particularly important that the ADRP gets a relocation, LLVM tends to think
-  ; it can relax it because it knows where get_func is. It can't!
-; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE get_func
-; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC get_func
 }
diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll
index 48c50a1..9c1dfeb 100644
--- a/test/CodeGen/AArch64/cond-sel.ll
+++ b/test/CodeGen/AArch64/cond-sel.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 @var32 = global i32 0
 @var64 = global i64 0
@@ -9,16 +10,16 @@ define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
   %tst1 = icmp ugt i32 %lhs32, %rhs32
   %val1 = select i1 %tst1, i32 42, i32 52
   store i32 %val1, i32* @var32
-; CHECK: movz [[W52:w[0-9]+]], #52
-; CHECK: movz [[W42:w[0-9]+]], #42
+; CHECK-DAG: movz [[W52:w[0-9]+]], #52
+; CHECK-DAG: movz [[W42:w[0-9]+]], #42
 ; CHECK: csel {{w[0-9]+}}, [[W42]], [[W52]], hi
 
   %rhs64 = sext i32 %rhs32 to i64
   %tst2 = icmp sle i64 %lhs64, %rhs64
   %val2 = select i1 %tst2, i64 %lhs64, i64 %rhs64
   store i64 %val2, i64* @var64
-; CHECK: cmp [[LHS:x[0-9]+]], [[RHS:w[0-9]+]], sxtw
-; CHECK: sxtw [[EXT_RHS:x[0-9]+]], [[RHS]]
+; CHECK-DAG: cmp [[LHS:x[0-9]+]], [[RHS:w[0-9]+]], sxtw
+; CHECK-DAG: sxtw [[EXT_RHS:x[0-9]+]], [[RHS]]
 ; CHECK: csel {{x[0-9]+}}, [[LHS]], [[EXT_RHS]], le
 
   ret void
@@ -30,6 +31,7 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r
 
   %tst1 = fcmp one float %lhs32, %rhs32
 ; CHECK: fcmp {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFP-NOT: fcmp
   %val1 = select i1 %tst1, i32 42, i32 52
   store i32 %val1, i32* @var32
 ; CHECK: movz [[W52:w[0-9]+]], #52
@@ -40,6 +42,7 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r
 
   %tst2 = fcmp ueq double %lhs64, %rhs64
 ; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NOFP-NOT: fcmp
   %val2 = select i1 %tst2, i64 9, i64 15
   store i64 %val2, i64* @var64
 ; CHECK: movz [[CONST15:x[0-9]+]], #15
diff --git a/test/CodeGen/AArch64/directcond.ll b/test/CodeGen/AArch64/directcond.ll
index 13f032d..12c7b6a 100644
--- a/test/CodeGen/AArch64/directcond.ll
+++ b/test/CodeGen/AArch64/directcond.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
 ; CHECK-LABEL: test_select_i32:
@@ -26,7 +27,7 @@ define float @test_select_float(i1 %bit, float %a, float %b) {
 ; CHECK: movz [[ONE:w[0-9]+]], #1
 ; CHECK: tst w0, [[ONE]]
 ; CHECK-NEXT: fcsel s0, s0, s1, ne
-
+; CHECK-NOFP-NOT: fcsel
   ret float %val
 }
 
@@ -36,6 +37,7 @@ define double @test_select_double(i1 %bit, double %a, double %b) {
 ; CHECK: movz [[ONE:w[0-9]+]], #1
 ; CHECK: tst w0, [[ONE]]
 ; CHECK-NEXT: fcsel d0, d0, d1, ne
+; CHECK-NOFP-NOT: fcsel
 
   ret double %val
 }
@@ -56,6 +58,7 @@ define i1 @test_setcc_float(float %lhs, float %rhs) {
   %val = fcmp oeq float %lhs, %rhs
 ; CHECK: fcmp s0, s1
 ; CHECK: csinc w0, wzr, wzr, ne
+; CHECK-NOFP-NOT: fcmp
   ret i1 %val
 }
 
@@ -64,6 +67,7 @@ define i1 @test_setcc_double(double %lhs, double %rhs) {
   %val = fcmp oeq double %lhs, %rhs
 ; CHECK: fcmp d0, d1
 ; CHECK: csinc w0, wzr, wzr, ne
+; CHECK-NOFP-NOT: fcmp
   ret i1 %val
 }
 
diff --git a/test/CodeGen/AArch64/elf-extern.ll b/test/CodeGen/AArch64/elf-extern.ll
deleted file mode 100644
index e09aa12..0000000
--- a/test/CodeGen/AArch64/elf-extern.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -filetype=obj | llvm-readobj -r | FileCheck %s
-
-; External symbols are a different concept to global variables but should still
-; get relocations and so on when used.
-
-declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
-
-define i32 @check_extern() {
-  call void @llvm.memcpy.p0i8.p0i8.i32(i8* undef, i8* undef, i32 undef, i32 4, i1 0)
-  ret i32 0
-}
-
-; CHECK: Relocations [
-; CHECK:   Section (2) .rela.text {
-; CHECK:     0x{{[0-9,A-F]+}} R_AARCH64_CALL26 memcpy
-; CHECK:   }
-; CHECK: ]
diff --git a/test/CodeGen/AArch64/fcvt-int.ll b/test/CodeGen/AArch64/fcvt-int.ll
index 9afcfc4..b28eb3e 100644
--- a/test/CodeGen/AArch64/fcvt-int.ll
+++ b/test/CodeGen/AArch64/fcvt-int.ll
@@ -5,8 +5,8 @@ define i32 @test_floattoi32(float %in) {
 
   %signed = fptosi float %in to i32
   %unsigned = fptoui float %in to i32
-; CHECK: fcvtzu [[UNSIG:w[0-9]+]], {{s[0-9]+}}
-; CHECK: fcvtzs [[SIG:w[0-9]+]], {{s[0-9]+}}
+; CHECK-DAG: fcvtzu [[UNSIG:w[0-9]+]], {{s[0-9]+}}
+; CHECK-DAG: fcvtzs [[SIG:w[0-9]+]], {{s[0-9]+}}
 
   %res = sub i32 %signed, %unsigned
 ; CHECK: sub {{w[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -20,8 +20,8 @@ define i32 @test_doubletoi32(double %in) {
 
   %signed = fptosi double %in to i32
   %unsigned = fptoui double %in to i32
-; CHECK: fcvtzu [[UNSIG:w[0-9]+]], {{d[0-9]+}}
-; CHECK: fcvtzs [[SIG:w[0-9]+]], {{d[0-9]+}}
+; CHECK-DAG: fcvtzu [[UNSIG:w[0-9]+]], {{d[0-9]+}}
+; CHECK-DAG: fcvtzs [[SIG:w[0-9]+]], {{d[0-9]+}}
 
   %res = sub i32 %signed, %unsigned
 ; CHECK: sub {{w[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -35,8 +35,8 @@ define i64 @test_floattoi64(float %in) {
 
   %signed = fptosi float %in to i64
   %unsigned = fptoui float %in to i64
-; CHECK: fcvtzu [[UNSIG:x[0-9]+]], {{s[0-9]+}}
-; CHECK: fcvtzs [[SIG:x[0-9]+]], {{s[0-9]+}}
+; CHECK-DAG: fcvtzu [[UNSIG:x[0-9]+]], {{s[0-9]+}}
+; CHECK-DAG: fcvtzs [[SIG:x[0-9]+]], {{s[0-9]+}}
 
   %res = sub i64 %signed, %unsigned
 ; CHECK: sub {{x[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -50,8 +50,8 @@ define i64 @test_doubletoi64(double %in) {
 
   %signed = fptosi double %in to i64
   %unsigned = fptoui double %in to i64
-; CHECK: fcvtzu [[UNSIG:x[0-9]+]], {{d[0-9]+}}
-; CHECK: fcvtzs [[SIG:x[0-9]+]], {{d[0-9]+}}
+; CHECK-DAG: fcvtzu [[UNSIG:x[0-9]+]], {{d[0-9]+}}
+; CHECK-DAG: fcvtzs [[SIG:x[0-9]+]], {{d[0-9]+}}
 
   %res = sub i64 %signed, %unsigned
 ; CHECK: sub {{x[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -65,8 +65,8 @@ define float @test_i32tofloat(i32 %in) {
 
   %signed = sitofp i32 %in to float
   %unsigned = uitofp i32 %in to float
-; CHECK: ucvtf [[UNSIG:s[0-9]+]], {{w[0-9]+}}
-; CHECK: scvtf [[SIG:s[0-9]+]], {{w[0-9]+}}
+; CHECK-DAG: ucvtf [[UNSIG:s[0-9]+]], {{w[0-9]+}}
+; CHECK-DAG: scvtf [[SIG:s[0-9]+]], {{w[0-9]+}}
 
   %res = fsub float %signed, %unsigned
 ; CHECL: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -79,8 +79,8 @@ define double @test_i32todouble(i32 %in) {
 
   %signed = sitofp i32 %in to double
   %unsigned = uitofp i32 %in to double
-; CHECK: ucvtf [[UNSIG:d[0-9]+]], {{w[0-9]+}}
-; CHECK: scvtf [[SIG:d[0-9]+]], {{w[0-9]+}}
+; CHECK-DAG: ucvtf [[UNSIG:d[0-9]+]], {{w[0-9]+}}
+; CHECK-DAG: scvtf [[SIG:d[0-9]+]], {{w[0-9]+}}
 
   %res = fsub double %signed, %unsigned
 ; CHECK: fsub {{d[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -93,8 +93,8 @@ define float @test_i64tofloat(i64 %in) {
 
   %signed = sitofp i64 %in to float
   %unsigned = uitofp i64 %in to float
-; CHECK: ucvtf [[UNSIG:s[0-9]+]], {{x[0-9]+}}
-; CHECK: scvtf [[SIG:s[0-9]+]], {{x[0-9]+}}
+; CHECK-DAG: ucvtf [[UNSIG:s[0-9]+]], {{x[0-9]+}}
+; CHECK-DAG: scvtf [[SIG:s[0-9]+]], {{x[0-9]+}}
 
   %res = fsub float %signed, %unsigned
 ; CHECK: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -107,8 +107,8 @@ define double @test_i64todouble(i64 %in) {
 
   %signed = sitofp i64 %in to double
   %unsigned = uitofp i64 %in to double
-; CHECK: ucvtf [[UNSIG:d[0-9]+]], {{x[0-9]+}}
-; CHECK: scvtf [[SIG:d[0-9]+]], {{x[0-9]+}}
+; CHECK-DAG: ucvtf [[UNSIG:d[0-9]+]], {{x[0-9]+}}
+; CHECK-DAG: scvtf [[SIG:d[0-9]+]], {{x[0-9]+}}
 
   %res = fsub double %signed, %unsigned
 ; CHECK: sub {{d[0-9]+}}, [[SIG]], [[UNSIG]]
diff --git a/test/CodeGen/AArch64/fp-dp3.ll b/test/CodeGen/AArch64/fp-dp3.ll
index 3a9a6fc..590557f 100644
--- a/test/CodeGen/AArch64/fp-dp3.ll
+++ b/test/CodeGen/AArch64/fp-dp3.ll
@@ -129,8 +129,9 @@ define float @test_fnmsub_unfused(float %a, float %b, float %c) {
   %diff = fsub float %nega, %prod
 ; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; CHECK-NOFAST-NOT: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NOFAST: fneg {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-DAG: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-DAG: fneg {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-DAG: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: ret
   ret float %diff
 }
diff --git a/test/CodeGen/AArch64/fp128.ll b/test/CodeGen/AArch64/fp128.ll
index 853c03d..c312bb1 100644
--- a/test/CodeGen/AArch64/fp128.ll
+++ b/test/CodeGen/AArch64/fp128.ll
@@ -150,14 +150,14 @@ define i1 @test_setcc2() {
 ; Technically, everything after the call to __letf2 is redundant, but we'll let
 ; LLVM have its fun for now.
   %val = fcmp ugt fp128 %lhs, %rhs
-; CHECK: bl      __unordtf2
-; CHECK: mov     x[[UNORDERED:[0-9]+]], x0
-
 ; CHECK: bl      __gttf2
 ; CHECK: cmp w0, #0
 ; CHECK: csinc   [[GT:w[0-9]+]], wzr, wzr, le
-; CHECK: cmp w[[UNORDERED]], #0
+
+; CHECK: bl      __unordtf2
+; CHECK: cmp w0, #0
 ; CHECK: csinc   [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+
 ; CHECK: orr     w0, [[UNORDERED]], [[GT]]
 
   ret i1 %val
@@ -174,15 +174,14 @@ define i32 @test_br_cc() {
 
   ; olt == !uge, which LLVM unfortunately "optimizes" this to.
   %cond = fcmp olt fp128 %lhs, %rhs
-; CHECK: bl      __unordtf2
-; CHECK: mov     x[[UNORDERED:[0-9]+]], x0
-
 ; CHECK: bl      __getf2
 ; CHECK: cmp w0, #0
-
 ; CHECK: csinc   [[OGE:w[0-9]+]], wzr, wzr, lt
-; CHECK: cmp w[[UNORDERED]], #0
+
+; CHECK: bl      __unordtf2
+; CHECK: cmp w0, #0
 ; CHECK: csinc   [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+
 ; CHECK: orr     [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]]
 ; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]]
   br i1 %cond, label %iftrue, label %iffalse
diff --git a/test/CodeGen/AArch64/fpimm.ll b/test/CodeGen/AArch64/fpimm.ll
index ccf7c8a..b8f7169 100644
--- a/test/CodeGen/AArch64/fpimm.ll
+++ b/test/CodeGen/AArch64/fpimm.ll
@@ -9,12 +9,13 @@ define void @check_float() {
   %val = load float* @varf32
   %newval1 = fadd float %val, 8.5
   store volatile float %newval1, float* @varf32
-; CHECK: fmov {{s[0-9]+}}, #8.5
+; CHECK-DAG: fmov [[EIGHT5:s[0-9]+]], #8.5
 
   %newval2 = fadd float %val, 128.0
   store volatile float %newval2, float* @varf32
-; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI0_0
+; CHECK-DAG: ldr [[HARD:s[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI0_0
 
+; CHECK: ret
   ret void
 }
 
@@ -24,11 +25,12 @@ define void @check_double() {
   %val = load double* @varf64
   %newval1 = fadd double %val, 8.5
   store volatile double %newval1, double* @varf64
-; CHECK: fmov {{d[0-9]+}}, #8.5
+; CHECK-DAG: fmov {{d[0-9]+}}, #8.5
 
   %newval2 = fadd double %val, 128.0
   store volatile double %newval2, double* @varf64
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI1_0
+; CHECK-DAG: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI1_0
 
+; CHECK: ret
   ret void
 }
diff --git a/test/CodeGen/AArch64/frameaddr.ll b/test/CodeGen/AArch64/frameaddr.ll
new file mode 100644
index 0000000..182704b
--- /dev/null
+++ b/test/CodeGen/AArch64/frameaddr.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu  | FileCheck %s
+
+define i8* @t() nounwind {
+entry:
+; CHECK-LABEL: t:
+; CHECK: mov x0, x29
+	%0 = call i8* @llvm.frameaddress(i32 0)
+        ret i8* %0
+}
+
+define i8* @t2() nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: ldr x[[reg:[0-9]+]], [x29]
+; CHECK: ldr x[[reg]], [x[[reg]]]
+	%0 = call i8* @llvm.frameaddress(i32 2)
+        ret i8* %0
+}
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll
index 15f8e76..430d77f 100644
--- a/test/CodeGen/AArch64/func-argpassing.ll
+++ b/test/CodeGen/AArch64/func-argpassing.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 %myStruct = type { i64 , i8, i32 }
 
@@ -23,6 +24,7 @@ define void @add_floats(float %val1, float %val2) {
 ; CHECK-LABEL: add_floats:
     %newval = fadd float %val1, %val2
 ; CHECK: fadd [[ADDRES:s[0-9]+]], s0, s1
+; CHECK-NOFP-NOT: fadd
     store float %newval, float* @varfloat
 ; CHECK: str [[ADDRES]], [{{x[0-9]+}}, #:lo12:varfloat]
     ret void
@@ -35,15 +37,15 @@ define void @take_struct(%myStruct* byval %structval) {
     %addr0 = getelementptr %myStruct* %structval, i64 0, i32 2
     %addr1 = getelementptr %myStruct* %structval, i64 0, i32 0
 
-    %val0 = load i32* %addr0
+    %val0 = load volatile i32* %addr0
     ; Some weird move means x0 is used for one access
 ; CHECK: ldr [[REG32:w[0-9]+]], [{{x[0-9]+|sp}}, #12]
-    store i32 %val0, i32* @var32
+    store volatile i32 %val0, i32* @var32
 ; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32]
 
-    %val1 = load i64* %addr1
+    %val1 = load volatile i64* %addr1
 ; CHECK: ldr [[REG64:x[0-9]+]], [{{x[0-9]+|sp}}]
-    store i64 %val1, i64* @var64
+    store volatile i64 %val1, i64* @var64
 ; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64]
 
     ret void
@@ -56,14 +58,14 @@ define void @check_byval_align(i32* byval %ignore, %myStruct* byval align 16 %st
     %addr0 = getelementptr %myStruct* %structval, i64 0, i32 2
     %addr1 = getelementptr %myStruct* %structval, i64 0, i32 0
 
-    %val0 = load i32* %addr0
+    %val0 = load volatile i32* %addr0
     ; Some weird move means x0 is used for one access
 ; CHECK: add x[[STRUCTVAL_ADDR:[0-9]+]], sp, #16
 ; CHECK: ldr [[REG32:w[0-9]+]], [x[[STRUCTVAL_ADDR]], #12]
     store i32 %val0, i32* @var32
 ; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32]
 
-    %val1 = load i64* %addr1
+    %val1 = load volatile i64* %addr1
 ; CHECK: ldr [[REG64:x[0-9]+]], [sp, #16]
     store i64 %val1, i64* @var64
 ; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64]
@@ -84,6 +86,7 @@ define double @return_double() {
 ; CHECK-LABEL: return_double:
     ret double 3.14
 ; CHECK: ldr d0, [{{x[0-9]+}}, #:lo12:.LCPI
+; CHECK-NOFP-NOT: ldr d0,
 }
 
 ; This is the kind of IR clang will produce for returning a struct
@@ -130,17 +133,18 @@ define i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var
                           double %notstacked) {
 ; CHECK-LABEL: struct_on_stack:
     %addr = getelementptr %myStruct* %struct, i64 0, i32 0
-    %val64 = load i64* %addr
-    store i64 %val64, i64* @var64
+    %val64 = load volatile i64* %addr
+    store volatile i64 %val64, i64* @var64
     ; Currently nothing on local stack, so struct should be at sp
 ; CHECK: ldr [[VAL64:x[0-9]+]], [sp]
 ; CHECK: str [[VAL64]], [{{x[0-9]+}}, #:lo12:var64]
 
-    store double %notstacked, double* @vardouble
+    store volatile double %notstacked, double* @vardouble
 ; CHECK-NOT: ldr d0
 ; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble
+; CHECK-NOFP-NOT: str d0,
 
-    %retval = load i32* %stacked
+    %retval = load volatile i32* %stacked
     ret i32 %retval
 ; CHECK: ldr w0, [sp, #16]
 }
@@ -176,10 +180,10 @@ define void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3,
 ; CHECK: check_i128_stackalign
     store i128 %stack2, i128* @var128
     ; Nothing local on stack in current codegen, so first stack is 16 away
-; CHECK: ldr {{x[0-9]+}}, [sp, #16]
+; CHECK: add     x[[REG:[0-9]+]], sp, #16
+; CHECK: ldr {{x[0-9]+}}, [x[[REG]], #8]
     ; Important point is that we address sp+24 for second dword
-; CHECK: add     [[REG:x[0-9]+]], sp, #16
-; CHECK: ldr     {{x[0-9]+}}, {{\[}}[[REG]], #8]
+; CHECK: ldr     {{x[0-9]+}}, [sp, #16]
     ret void
 }
 
diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index b12130b..ac188bb 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 %myStruct = type { i64 , i8, i32 }
 
@@ -21,16 +22,18 @@ define void @simple_args() {
   %char1 = load i8* @var8
   %char2 = load i8* @var8_2
   call void @take_i8s(i8 %char1, i8 %char2)
-; CHECK: ldrb w0, [{{x[0-9]+}}, #:lo12:var8]
-; CHECK: ldrb w1, [{{x[0-9]+}}, #:lo12:var8_2]
+; CHECK-DAG: ldrb w0, [{{x[0-9]+}}, #:lo12:var8]
+; CHECK-DAG: ldrb w1, [{{x[0-9]+}}, #:lo12:var8_2]
 ; CHECK: bl take_i8s
 
   %float1 = load float* @varfloat
   %float2 = load float* @varfloat_2
   call void @take_floats(float %float1, float %float2)
-; CHECK: ldr s1, [{{x[0-9]+}}, #:lo12:varfloat_2]
-; CHECK: ldr s0, [{{x[0-9]+}}, #:lo12:varfloat]
+; CHECK-DAG: ldr s1, [{{x[0-9]+}}, #:lo12:varfloat_2]
+; CHECK-DAG: ldr s0, [{{x[0-9]+}}, #:lo12:varfloat]
 ; CHECK: bl take_floats
+; CHECK-NOFP-NOT: ldr s1,
+; CHECK-NOFP-NOT: ldr s0,
 
   ret void
 }
@@ -52,6 +55,7 @@ define void @simple_rets() {
   store double %dbl, double* @vardouble
 ; CHECK: bl return_double
 ; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble]
+; CHECK-NOFP-NOT: str d0,
 
   %arr = call [2 x i64] @return_smallstruct()
   store [2 x i64] %arr, [2 x i64]* @varsmallstruct
@@ -75,17 +79,19 @@ declare void @stacked_fpu(float %var0, double %var1, float %var2, float %var3,
                           float %var8)
 
 define void @check_stack_args() {
+; CHECK-LABEL: check_stack_args:
   call i32 @struct_on_stack(i8 0, i16 12, i32 42, i64 99, i128 1,
                             i32* @var32, %myStruct* byval @varstruct,
                             i32 999, double 1.0)
   ; Want to check that the final double is passed in registers and
   ; that varstruct is passed on the stack. Rather dependent on how a
   ; memcpy gets created, but the following works for now.
-; CHECK: mov x0, sp
-; CHECK: str {{w[0-9]+}}, [x0]
-; CHECK: str {{w[0-9]+}}, [x0, #12]
-; CHECK: fmov d0,
+; CHECK: mov x[[SPREG:[0-9]+]], sp
+; CHECK-DAG: str {{w[0-9]+}}, [x[[SPREG]]]
+; CHECK-DAG: str {{w[0-9]+}}, [x[[SPREG]], #12]
+; CHECK-DAG: fmov d0,
 ; CHECK: bl struct_on_stack
+; CHECK-NOFP-NOT: fmov
 
   call void @stacked_fpu(float -1.0, double 1.0, float 4.0, float 2.0,
                          float -2.0, float -8.0, float 16.0, float 1.0,
diff --git a/test/CodeGen/AArch64/inline-asm-modifiers.ll b/test/CodeGen/AArch64/inline-asm-modifiers.ll
index d1b21f8..b7f4d3c 100644
--- a/test/CodeGen/AArch64/inline-asm-modifiers.ll
+++ b/test/CodeGen/AArch64/inline-asm-modifiers.ll
@@ -1,5 +1,4 @@
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-ELF %s
 
 @var_simple = hidden global i32 0
 @var_got = global i32 0
@@ -23,12 +22,10 @@ define void @test_inline_modifier_L() nounwind {
 ; CHECK: ldr x0, [x0, #:gottprel_lo12:var_tlsie]
 ; CHECK: add x0, x0, #:tprel_lo12:var_tlsle
 
-; CHECK-ELF: R_AARCH64_ADD_ABS_LO12_NC var_simple
-; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var_got
-; CHECK-ELF: R_AARCH64_TLSDESC_ADD_LO12_NC var_tlsgd
-; CHECK-ELF: R_AARCH64_TLSLD_ADD_DTPREL_LO12 var_tlsld
-; CHECK-ELF: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC var_tlsie
-; CHECK-ELF: R_AARCH64_TLSLE_ADD_TPREL_LO12 var_tlsle
+  call void asm sideeffect "add x0, x0, ${0:L}", "Si,~{x0}"(i32 64)
+  call void asm sideeffect "ldr x0, [x0, ${0:L}]", "Si,~{x0}"(i32 64)
+; CHECK: add x0, x0, #64
+; CHECK: ldr x0, [x0, #64]
 
   ret void
 }
@@ -40,9 +37,8 @@ define void @test_inline_modifier_G() nounwind {
 ; CHECK: add x0, x0, #:dtprel_hi12:var_tlsld, lsl #12
 ; CHECK: add x0, x0, #:tprel_hi12:var_tlsle, lsl #12
 
-; CHECK-ELF: R_AARCH64_TLSLD_ADD_DTPREL_HI12 var_tlsld
-; CHECK-ELF: R_AARCH64_TLSLE_ADD_TPREL_HI12 var_tlsle
-
+  call void asm sideeffect "add x0, x0, ${0:G}", "Si,~{x0}"(i32 42)
+; CHECK: add x0, x0, #42
   ret void
 }
 
@@ -58,10 +54,8 @@ define void @test_inline_modifier_A() nounwind {
 ; CHECK: adrp x0, :tlsdesc:var_tlsgd
 ; CHECK: adrp x0, :gottprel:var_tlsie
 
-; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 var_simple
-; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var_got
-; CHECK-ELF: R_AARCH64_TLSDESC_ADR_PAGE var_tlsgd
-; CHECK-ELF: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 var_tlsie
+  call void asm sideeffect "adrp x0, ${0:A}", "Si,~{x0}"(i32 40)
+; CHECK: adrp x0, #40
 
   ret void
 }
@@ -87,6 +81,12 @@ define void @test_inline_modifier_wx(i32 %small, i64 %big) nounwind {
   call i32 asm sideeffect "add ${0:x}, ${1:x}, ${1:x}", "=r,r"(i32 0)
 ; CHECK: add {{w[0-9]+}}, wzr, wzr
 ; CHECK: add {{x[0-9]+}}, xzr, xzr
+
+  call i32 asm sideeffect "add ${0:w}, ${0:w}, ${1:w}", "=r,Ir,0"(i32 123, i32 %small)
+  call i64 asm sideeffect "add ${0:x}, ${0:x}, ${1:x}", "=r,Ir,0"(i32 456, i64 %big)
+; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #123
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #456
+
   ret void
 }
 
@@ -113,6 +113,18 @@ define void @test_inline_modifier_bhsdq() nounwind {
 ; CHECK: ldr s0, [sp]
 ; CHECK: ldr d0, [sp]
 ; CHECK: ldr q0, [sp]
+
+  call void asm sideeffect "fcmp b0, ${0:b}", "Yw"(float 0.0)
+  call void asm sideeffect "fcmp h0, ${0:h}", "Yw"(float 0.0)
+  call void asm sideeffect "fcmp s0, ${0:s}", "Yw"(float 0.0)
+  call void asm sideeffect "fcmp d0, ${0:d}", "Yw"(float 0.0)
+  call void asm sideeffect "fcmp q0, ${0:q}", "Yw"(float 0.0)
+; CHECK: fcmp b0, #0
+; CHECK: fcmp h0, #0
+; CHECK: fcmp s0, #0
+; CHECK: fcmp d0, #0
+; CHECK: fcmp q0, #0
+
   ret void
 }
 
@@ -123,3 +135,13 @@ define void @test_inline_modifier_c() nounwind {
 
   ret void
 }
+
+define void @test_inline_modifier_a() nounwind {
+; CHECK-LABEL: test_inline_modifier_a:
+  call void asm sideeffect "prfm pldl1keep, ${0:a}", "r"(i32* @var_simple)
+; CHECK: adrp [[VARHI:x[0-9]+]], var_simple
+; CHECK: add x[[VARADDR:[0-9]+]], [[VARHI]], #:lo12:var_simple
+; CHECK: prfm pldl1keep, [x[[VARADDR]]]
+  ret void
+}
+
diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 0f1e760..4bb0942 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -1,6 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 ; RUN: llc -code-model=large -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -filetype=obj | llvm-readobj -r | FileCheck %s -check-prefix=CHECK-ELF
 
 define i32 @test_jumptable(i32 %in) {
 ; CHECK: test_jumptable
@@ -48,19 +47,3 @@ lbl4:
 ; CHECK-NEXT: .xword
 ; CHECK-NEXT: .xword
 ; CHECK-NEXT: .xword
-
-; ELF tests:
-
-; First make sure we get a page/lo12 pair in .text to pick up the jump-table
-
-; CHECK-ELF:      Relocations [
-; CHECK-ELF:        Section ({{[0-9]+}}) .rela.text {
-; CHECK-ELF-NEXT:     0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 .rodata
-; CHECK-ELF-NEXT:     0x{{[0-9,A-F]+}} R_AARCH64_ADD_ABS_LO12_NC .rodata
-; CHECK-ELF:        }
-
-; Also check the targets in .rodata are relocated
-; CHECK-ELF:        Section ({{[0-9]+}}) .rela.rodata {
-; CHECK-ELF-NEXT:     0x{{[0-9,A-F]+}} R_AARCH64_ABS64 .text
-; CHECK-ELF:        }
-; CHECK-ELF:      ]
diff --git a/test/CodeGen/AArch64/ldst-regoffset.ll b/test/CodeGen/AArch64/ldst-regoffset.ll
index c83fb52..db30fd9 100644
--- a/test/CodeGen/AArch64/ldst-regoffset.ll
+++ b/test/CodeGen/AArch64/ldst-regoffset.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 @var_8bit = global i8 0
 @var_16bit = global i16 0
@@ -197,11 +198,13 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
    %val_sxtwN = load volatile float* %addr_sxtwN
    store volatile float %val_sxtwN, float* @var_float
 ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #2]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
 
   %addr_lslN = getelementptr float* %base, i64 %off64
   %val_lslN = load volatile float* %addr_lslN
   store volatile float %val_lslN, float* @var_float
 ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #2]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
 
   %addrint_uxtw = ptrtoint float* %base to i64
   %offset_uxtw = zext i32 %off32 to i64
@@ -210,6 +213,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
   %val_uxtw = load volatile float* %addr_uxtw
   store volatile float %val_uxtw, float* @var_float
 ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
 
   %base_sxtw = ptrtoint float* %base to i64
   %offset_sxtw = sext i32 %off32 to i64
@@ -218,6 +222,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
   %val64_sxtw = load volatile float* %addr_sxtw
   store volatile float %val64_sxtw, float* @var_float
 ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
 
   %base_lsl = ptrtoint float* %base to i64
   %addrint_lsl = add i64 %base_lsl, %off64
@@ -225,6 +230,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
   %val64_lsl = load volatile float* %addr_lsl
   store volatile float %val64_lsl, float* @var_float
 ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
 
   %base_uxtwN = ptrtoint float* %base to i64
   %offset_uxtwN = zext i32 %off32 to i64
@@ -234,6 +240,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
   %val64 = load volatile float* @var_float
   store volatile float %val64, float* %addr_uxtwN
 ; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
    ret void
 }
 
@@ -244,11 +251,13 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
    %val_sxtwN = load volatile double* %addr_sxtwN
    store volatile double %val_sxtwN, double* @var_double
 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #3]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
 
   %addr_lslN = getelementptr double* %base, i64 %off64
   %val_lslN = load volatile double* %addr_lslN
   store volatile double %val_lslN, double* @var_double
 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #3]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
 
   %addrint_uxtw = ptrtoint double* %base to i64
   %offset_uxtw = zext i32 %off32 to i64
@@ -257,6 +266,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
   %val_uxtw = load volatile double* %addr_uxtw
   store volatile double %val_uxtw, double* @var_double
 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
 
   %base_sxtw = ptrtoint double* %base to i64
   %offset_sxtw = sext i32 %off32 to i64
@@ -265,6 +275,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
   %val64_sxtw = load volatile double* %addr_sxtw
   store volatile double %val64_sxtw, double* @var_double
 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
 
   %base_lsl = ptrtoint double* %base to i64
   %addrint_lsl = add i64 %base_lsl, %off64
@@ -272,6 +283,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
   %val64_lsl = load volatile double* %addr_lsl
   store volatile double %val64_lsl, double* @var_double
 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
 
   %base_uxtwN = ptrtoint double* %base to i64
   %offset_uxtwN = zext i32 %off32 to i64
@@ -281,6 +293,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
   %val64 = load volatile double* @var_double
   store volatile double %val64, double* %addr_uxtwN
 ; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
    ret void
 }
 
@@ -292,11 +305,13 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
    %val_sxtwN = load volatile fp128* %addr_sxtwN
    store volatile fp128 %val_sxtwN, fp128* %base
 ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
 
   %addr_lslN = getelementptr fp128* %base, i64 %off64
   %val_lslN = load volatile fp128* %addr_lslN
   store volatile fp128 %val_lslN, fp128* %base
 ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
 
   %addrint_uxtw = ptrtoint fp128* %base to i64
   %offset_uxtw = zext i32 %off32 to i64
@@ -305,6 +320,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
   %val_uxtw = load volatile fp128* %addr_uxtw
   store volatile fp128 %val_uxtw, fp128* %base
 ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
 
   %base_sxtw = ptrtoint fp128* %base to i64
   %offset_sxtw = sext i32 %off32 to i64
@@ -313,6 +329,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
   %val64_sxtw = load volatile fp128* %addr_sxtw
   store volatile fp128 %val64_sxtw, fp128* %base
 ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
 
   %base_lsl = ptrtoint fp128* %base to i64
   %addrint_lsl = add i64 %base_lsl, %off64
@@ -320,6 +337,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
   %val64_lsl = load volatile fp128* %addr_lsl
   store volatile fp128 %val64_lsl, fp128* %base
 ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
 
   %base_uxtwN = ptrtoint fp128* %base to i64
   %offset_uxtwN = zext i32 %off32 to i64
@@ -329,5 +347,6 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
   %val64 = load volatile fp128* %base
   store volatile fp128 %val64, fp128* %addr_uxtwN
 ; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
    ret void
 }
diff --git a/test/CodeGen/AArch64/ldst-unscaledimm.ll b/test/CodeGen/AArch64/ldst-unscaledimm.ll
index 03dedcc..bea5bb5 100644
--- a/test/CodeGen/AArch64/ldst-unscaledimm.ll
+++ b/test/CodeGen/AArch64/ldst-unscaledimm.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 @var_8bit = global i8 0
 @var_16bit = global i16 0
@@ -194,9 +195,11 @@ define void @ldst_float() {
 
   %valfp = load volatile float* %addrfp
 ; CHECK: ldur {{s[0-9]+}}, [{{x[0-9]+}}, #-5]
+; CHECK-NOFP-NOT: ldur {{s[0-9]+}},
 
   store volatile float %valfp, float* %addrfp
 ; CHECK: stur {{s[0-9]+}}, [{{x[0-9]+}}, #-5]
+; CHECK-NOFP-NOT: stur {{s[0-9]+}},
 
   ret void
 }
@@ -210,9 +213,11 @@ define void @ldst_double() {
 
   %valfp = load volatile double* %addrfp
 ; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #4]
+; CHECK-NOFP-NOT: ldur {{d[0-9]+}},
 
   store volatile double %valfp, double* %addrfp
 ; CHECK: stur {{d[0-9]+}}, [{{x[0-9]+}}, #4]
+; CHECK-NOFP-NOT: stur {{d[0-9]+}},
 
    ret void
 }
diff --git a/test/CodeGen/AArch64/ldst-unsignedimm.ll b/test/CodeGen/AArch64/ldst-unsignedimm.ll
index 77cef4e..44c1586 100644
--- a/test/CodeGen/AArch64/ldst-unsignedimm.ll
+++ b/test/CodeGen/AArch64/ldst-unsignedimm.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 @var_8bit = global i8 0
 @var_16bit = global i16 0
@@ -230,9 +231,11 @@ define void @ldst_float() {
    %valfp = load volatile float* @var_float
 ; CHECK: adrp {{x[0-9]+}}, var_float
 ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
 
   store volatile float %valfp, float* @var_float
 ; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float]
+; CHECK-NOFP-NOT: str {{s[0-9]+}},
 
    ret void
 }
@@ -243,9 +246,11 @@ define void @ldst_double() {
    %valfp = load volatile double* @var_double
 ; CHECK: adrp {{x[0-9]+}}, var_double
 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
 
   store volatile double %valfp, double* @var_double
 ; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double]
+; CHECK-NOFP-NOT: str {{d[0-9]+}},
 
    ret void
 }
diff --git a/test/CodeGen/AArch64/lit.local.cfg b/test/CodeGen/AArch64/lit.local.cfg
index c5ce241..9a66a00 100644
--- a/test/CodeGen/AArch64/lit.local.cfg
+++ b/test/CodeGen/AArch64/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'AArch64' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/AArch64/literal_pools.ll b/test/CodeGen/AArch64/literal_pools.ll
index b82f290..fc33aee 100644
--- a/test/CodeGen/AArch64/literal_pools.ll
+++ b/test/CodeGen/AArch64/literal_pools.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s
 
 @var32 = global i32 0
 @var64 = global i64 0
@@ -65,8 +67,8 @@ define void @floating_lits() {
   %floatval = load float* @varfloat
   %newfloat = fadd float %floatval, 128.0
 ; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]]
-; CHECK: ldr {{s[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-; CHECK: fadd
+; CHECK: ldr [[LIT128:s[0-9]+]], [x[[LITBASE]], #:lo12:[[CURLIT]]]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
 
 ; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI1_[0-9]+]]
 ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
@@ -74,20 +76,26 @@ define void @floating_lits() {
 ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
 ; CHECK-LARGE: ldr {{s[0-9]+}}, [x[[LITADDR]]]
 ; CHECK-LARGE: fadd
+; CHECK-NOFP-LARGE-NOT: ldr {{s[0-9]+}},
+; CHECK-NOFP-LARGE-NOT: fadd
 
   store float %newfloat, float* @varfloat
 
   %doubleval = load double* @vardouble
   %newdouble = fadd double %doubleval, 129.0
 ; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]]
-; CHECK: ldr {{d[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-; CHECK: fadd
+; CHECK: ldr [[LIT129:d[0-9]+]], [x[[LITBASE]], #:lo12:[[CURLIT]]]
+; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, [[LIT128]]
+; CHECK: fadd {{d[0-9]+}}, {{d[0-9]+}}, [[LIT129]]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
+; CHECK-NOFP-NOT: fadd
 
 ; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI1_[0-9]+]]
 ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
 ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
 ; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
 ; CHECK-LARGE: ldr {{d[0-9]+}}, [x[[LITADDR]]]
+; CHECK-NOFP-LARGE-NOT: ldr {{d[0-9]+}},
 
   store double %newdouble, double* @vardouble
 
diff --git a/test/CodeGen/AArch64/neon-2velem-high.ll b/test/CodeGen/AArch64/neon-2velem-high.ll
new file mode 100644
index 0000000..97031d9
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-2velem-high.ll
@@ -0,0 +1,331 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
+
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+
+declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+
+define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) {
+; CHECK: test_vmull_high_n_s16:
+; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
+  %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  ret <4 x i32> %vmull15.i.i
+}
+
+define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) {
+; CHECK: test_vmull_high_n_s32:
+; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
+  %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  ret <2 x i64> %vmull9.i.i
+}
+
+define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) {
+; CHECK: test_vmull_high_n_u16:
+; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
+  %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  ret <4 x i32> %vmull15.i.i
+}
+
+define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) {
+; CHECK: test_vmull_high_n_u32:
+; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
+  %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  ret <2 x i64> %vmull9.i.i
+}
+
+define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) {
+; CHECK: test_vqdmull_high_n_s16:
+; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
+  %vqdmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  ret <4 x i32> %vqdmull15.i.i
+}
+
+define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) {
+; CHECK: test_vqdmull_high_n_s32:
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
+  %vqdmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  ret <2 x i64> %vqdmull9.i.i
+}
+
+define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlal_high_n_s16:
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
+  ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlal_high_n_s32:
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
+  ret <2 x i64> %add.i.i
+}
+
+define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlal_high_n_u16:
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
+  ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlal_high_n_u32:
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
+  ret <2 x i64> %add.i.i
+}
+
+define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vqdmlal_high_n_s16:
+; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vqdmlal15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vqdmlal17.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
+  ret <4 x i32> %vqdmlal17.i.i
+}
+
+define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vqdmlal_high_n_s32:
+; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vqdmlal9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vqdmlal11.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
+  ret <2 x i64> %vqdmlal11.i.i
+}
+
+define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlsl_high_n_s16:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
+  ret <4 x i32> %sub.i.i
+}
+
+define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlsl_high_n_s32:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
+  ret <2 x i64> %sub.i.i
+}
+
+define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlsl_high_n_u16:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
+  ret <4 x i32> %sub.i.i
+}
+
+define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlsl_high_n_u32:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
+  ret <2 x i64> %sub.i.i
+}
+
+define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vqdmlsl_high_n_s16:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vqdmlsl15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vqdmlsl17.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
+  ret <4 x i32> %vqdmlsl17.i.i
+}
+
+define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vqdmlsl_high_n_s32:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vqdmlsl9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vqdmlsl11.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
+  ret <2 x i64> %vqdmlsl11.i.i
+}
+
+define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) {
+; CHECK: test_vmul_n_f32:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %vecinit.i = insertelement <2 x float> undef, float %b, i32 0
+  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1
+  %mul.i = fmul <2 x float> %vecinit1.i, %a
+  ret <2 x float> %mul.i
+}
+
+define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
+; CHECK: test_vmulq_n_f32:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %vecinit.i = insertelement <4 x float> undef, float %b, i32 0
+  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %b, i32 1
+  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %b, i32 2
+  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %b, i32 3
+  %mul.i = fmul <4 x float> %vecinit3.i, %a
+  ret <4 x float> %mul.i
+}
+
+define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) {
+; CHECK: test_vmulq_n_f64:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %vecinit.i = insertelement <2 x double> undef, double %b, i32 0
+  %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1
+  %mul.i = fmul <2 x double> %vecinit1.i, %a
+  ret <2 x double> %mul.i
+}
+
+define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) {
+; CHECK: test_vfma_n_f32:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
+  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %vecinit1.i, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) {
+; CHECK: test_vfmaq_n_f32:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
+  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
+  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
+  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %vecinit3.i, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) {
+; CHECK: test_vfms_n_f32:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
+  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
+  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+  %1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %vecinit1.i, <2 x float> %a)
+  ret <2 x float> %1
+}
+
+define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) {
+; CHECK: test_vfmsq_n_f32:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
+  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
+  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
+  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
+  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %vecinit3.i, <4 x float> %a)
+  ret <4 x float> %1
+}
diff --git a/test/CodeGen/AArch64/neon-2velem.ll b/test/CodeGen/AArch64/neon-2velem.ll
new file mode 100644
index 0000000..9d61842
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-2velem.ll
@@ -0,0 +1,2550 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>)
+
+declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>)
+
+declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>)
+
+declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+
+declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+
+declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+
+declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>)
+
+declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>)
+
+declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>)
+
+declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+
+declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmla_lane_s16:
+; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <4 x i16> %shuffle, %b
+  %add = add <4 x i16> %mul, %a
+  ret <4 x i16> %add
+}
+
+define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlaq_lane_s16:
+; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <8 x i16> %shuffle, %b
+  %add = add <8 x i16> %mul, %a
+  ret <8 x i16> %add
+}
+
+define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmla_lane_s32:
+; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %mul = mul <2 x i32> %shuffle, %b
+  %add = add <2 x i32> %mul, %a
+  ret <2 x i32> %add
+}
+
+define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlaq_lane_s32:
+; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %mul = mul <4 x i32> %shuffle, %b
+  %add = add <4 x i32> %mul, %a
+  ret <4 x i32> %add
+}
+
+define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmla_laneq_s16:
+; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <4 x i16> %shuffle, %b
+  %add = add <4 x i16> %mul, %a
+  ret <4 x i16> %add
+}
+
+define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlaq_laneq_s16:
+; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <8 x i16> %shuffle, %b
+  %add = add <8 x i16> %mul, %a
+  ret <8 x i16> %add
+}
+
+define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmla_laneq_s32:
+; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %mul = mul <2 x i32> %shuffle, %b
+  %add = add <2 x i32> %mul, %a
+  ret <2 x i32> %add
+}
+
+define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlaq_laneq_s32:
+; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <4 x i32> %shuffle, %b
+  %add = add <4 x i32> %mul, %a
+  ret <4 x i32> %add
+}
+
+define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmls_lane_s16:
+; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <4 x i16> %shuffle, %b
+  %sub = sub <4 x i16> %a, %mul
+  ret <4 x i16> %sub
+}
+
+define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsq_lane_s16:
+; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <8 x i16> %shuffle, %b
+  %sub = sub <8 x i16> %a, %mul
+  ret <8 x i16> %sub
+}
+
+define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmls_lane_s32:
+; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %mul = mul <2 x i32> %shuffle, %b
+  %sub = sub <2 x i32> %a, %mul
+  ret <2 x i32> %sub
+}
+
+define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsq_lane_s32:
+; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %mul = mul <4 x i32> %shuffle, %b
+  %sub = sub <4 x i32> %a, %mul
+  ret <4 x i32> %sub
+}
+
+define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmls_laneq_s16:
+; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <4 x i16> %shuffle, %b
+  %sub = sub <4 x i16> %a, %mul
+  ret <4 x i16> %sub
+}
+
+define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsq_laneq_s16:
+; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <8 x i16> %shuffle, %b
+  %sub = sub <8 x i16> %a, %mul
+  ret <8 x i16> %sub
+}
+
+define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmls_laneq_s32:
+; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %mul = mul <2 x i32> %shuffle, %b
+  %sub = sub <2 x i32> %a, %mul
+  ret <2 x i32> %sub
+}
+
+define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsq_laneq_s32:
+; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <4 x i32> %shuffle, %b
+  %sub = sub <4 x i32> %a, %mul
+  ret <4 x i32> %sub
+}
+
+define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmul_lane_s16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <4 x i16> %shuffle, %a
+  ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmulq_lane_s16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <8 x i16> %shuffle, %a
+  ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmul_lane_s32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %mul = mul <2 x i32> %shuffle, %a
+  ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmulq_lane_s32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %mul = mul <4 x i32> %shuffle, %a
+  ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmul_lane_u16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <4 x i16> %shuffle, %a
+  ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmulq_lane_u16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <8 x i16> %shuffle, %a
+  ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmul_lane_u32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %mul = mul <2 x i32> %shuffle, %a
+  ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmulq_lane_u32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %mul = mul <4 x i32> %shuffle, %a
+  ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmul_laneq_s16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <4 x i16> %shuffle, %a
+  ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmulq_laneq_s16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <8 x i16> %shuffle, %a
+  ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmul_laneq_s32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %mul = mul <2 x i32> %shuffle, %a
+  ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmulq_laneq_s32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <4 x i32> %shuffle, %a
+  ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmul_laneq_u16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <4 x i16> %shuffle, %a
+  ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmulq_laneq_u16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %mul = mul <8 x i16> %shuffle, %a
+  ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmul_laneq_u32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %mul = mul <2 x i32> %shuffle, %a
+  ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmulq_laneq_u32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = mul <4 x i32> %shuffle, %a
+  ret <4 x i32> %mul
+}
+
+define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfma_lane_f32:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
+
+define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
+; CHECK: test_vfmaq_lane_f32:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
+; CHECK: test_vfma_laneq_f32:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
+; CHECK: test_vfmaq_laneq_f32:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfms_lane_f32:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
+; CHECK: test_vfmsq_lane_f32:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
+; CHECK: test_vfms_laneq_f32:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
+; CHECK: test_vfmsq_laneq_f32:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
+; CHECK: test_vfmaq_lane_f64:
+; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
+  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+  ret <2 x double> %0
+}
+
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
+; CHECK: test_vfmaq_laneq_f64:
+; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+entry:
+  %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+  ret <2 x double> %0
+}
+
+define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
+; CHECK: test_vfmsq_lane_f64:
+; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %sub = fsub <1 x double> <double -0.000000e+00>, %v
+  %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
+  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+  ret <2 x double> %0
+}
+
+define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
+; CHECK: test_vfmsq_laneq_f64:
+; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+entry:
+  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+  %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+  ret <2 x double> %0
+}
+
+define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_lane_s16:
+; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_lane_s32:
+; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_laneq_s16:
+; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_laneq_s32:
+; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_high_lane_s16:
+; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_high_lane_s32:
+; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_high_laneq_s16:
+; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_high_laneq_s32:
+; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_lane_s16:
+; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_lane_s32:
+; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_laneq_s16:
+; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_laneq_s32:
+; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_high_lane_s16:
+; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_high_lane_s32:
+; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_high_laneq_s16:
+; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_high_laneq_s32:
+; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_lane_u16:
+; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_lane_u32:
+; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_laneq_u16:
+; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_laneq_u32:
+; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_high_lane_u16:
+; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_high_lane_u32:
+; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_high_laneq_u16:
+; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_high_laneq_u32:
+; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_lane_u16:
+; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_lane_u32:
+; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_laneq_u16:
+; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_laneq_u32:
+; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_high_lane_u16:
+; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_high_lane_u32:
+; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_high_laneq_u16:
+; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_high_laneq_u32:
+; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_lane_s16:
+; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_lane_s32:
+; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_lane_u16:
+; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_lane_u32:
+; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_high_lane_s16:
+; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_high_lane_s32:
+; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_high_lane_u16:
+; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_high_lane_u32:
+; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_laneq_s16:
+; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_laneq_s32:
+; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_laneq_u16:
+; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_laneq_u32:
+; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_high_laneq_s16:
+; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_high_laneq_s32:
+; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_high_laneq_u16:
+; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_high_laneq_u32:
+; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlal_lane_s16:
+; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+  ret <4 x i32> %vqdmlal4.i
+}
+
+define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlal_lane_s32:
+; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+  ret <2 x i64> %vqdmlal4.i
+}
+
+define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlal_high_lane_s16:
+; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+  ret <4 x i32> %vqdmlal4.i
+}
+
+define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlal_high_lane_s32:
+; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+  ret <2 x i64> %vqdmlal4.i
+}
+
+define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlsl_lane_s16:
+; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+  ret <4 x i32> %vqdmlsl4.i
+}
+
+define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlsl_lane_s32:
+; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+  ret <2 x i64> %vqdmlsl4.i
+}
+
+define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlsl_high_lane_s16:
+; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+  ret <4 x i32> %vqdmlsl4.i
+}
+
+define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlsl_high_lane_s32:
+; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+  ret <2 x i64> %vqdmlsl4.i
+}
+
+define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmull_lane_s16:
+; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmull_lane_s32:
+; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vqdmull_laneq_s16:
+; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vqdmull_laneq_s32:
+; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmull_high_lane_s16:
+; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmull_high_lane_s32:
+; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vqdmull_high_laneq_s16:
+; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vqdmull_high_laneq_s32:
+; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+  %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmulh_lane_s16:
+; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmulhq_lane_s16:
+; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+  ret <8 x i16> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmulh_lane_s32:
+; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmulhq_lane_s32:
+; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+  ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqrdmulh_lane_s16:
+; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqrdmulhq_lane_s16:
+; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqrdmulh_lane_s32:
+; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqrdmulhq_lane_s32:
+; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
+define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
+; CHECK: test_vmul_lane_f32:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+  %mul = fmul <2 x float> %shuffle, %a
+  ret <2 x float> %mul
+}
+
+define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
+; CHECK: test_vmul_lane_f64:
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+entry:
+  %0 = bitcast <1 x double> %a to <8 x i8>
+  %1 = bitcast <8 x i8> %0 to double
+  %extract = extractelement <1 x double> %v, i32 0
+  %2 = fmul double %1, %extract
+  %3 = insertelement <1 x double> undef, double %2, i32 0
+  ret <1 x double> %3
+}
+
+define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulq_lane_f32:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %mul = fmul <4 x float> %shuffle, %a
+  ret <4 x float> %mul
+}
+
+define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
+; CHECK: test_vmulq_lane_f64:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
+  %mul = fmul <2 x double> %shuffle, %a
+  ret <2 x double> %mul
+}
+
+define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
+; CHECK: test_vmul_laneq_f32:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+  %mul = fmul <2 x float> %shuffle, %a
+  ret <2 x float> %mul
+}
+
+define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
+; CHECK: test_vmul_laneq_f64:
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+entry:
+  %0 = bitcast <1 x double> %a to <8 x i8>
+  %1 = bitcast <8 x i8> %0 to double
+  %extract = extractelement <2 x double> %v, i32 1
+  %2 = fmul double %1, %extract
+  %3 = insertelement <1 x double> undef, double %2, i32 0
+  ret <1 x double> %3
+}
+
+define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulq_laneq_f32:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %mul = fmul <4 x float> %shuffle, %a
+  ret <4 x float> %mul
+}
+
+define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
+; CHECK: test_vmulq_laneq_f64:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %mul = fmul <2 x double> %shuffle, %a
+  ret <2 x double> %mul
+}
+
+define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulx_lane_f32:
+; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+  ret <2 x float> %vmulx2.i
+}
+
+define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulxq_lane_f32:
+; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+  ret <4 x float> %vmulx2.i
+}
+
+define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
+; CHECK: test_vmulxq_lane_f64:
+; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
+  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+  ret <2 x double> %vmulx2.i
+}
+
+define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulx_laneq_f32:
+; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+  ret <2 x float> %vmulx2.i
+}
+
+define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulxq_laneq_f32:
+; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+  ret <4 x float> %vmulx2.i
+}
+
+define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
+; CHECK: test_vmulxq_laneq_f64:
+; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+  ret <2 x double> %vmulx2.i
+}
+
+define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmla_lane_s16_0:
+; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i16> %shuffle, %b
+  %add = add <4 x i16> %mul, %a
+  ret <4 x i16> %add
+}
+
+define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlaq_lane_s16_0:
+; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+  %mul = mul <8 x i16> %shuffle, %b
+  %add = add <8 x i16> %mul, %a
+  ret <8 x i16> %add
+}
+
+define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmla_lane_s32_0:
+; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %mul = mul <2 x i32> %shuffle, %b
+  %add = add <2 x i32> %mul, %a
+  ret <2 x i32> %add
+}
+
+define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlaq_lane_s32_0:
+; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i32> %shuffle, %b
+  %add = add <4 x i32> %mul, %a
+  ret <4 x i32> %add
+}
+
+define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmla_laneq_s16_0:
+; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i16> %shuffle, %b
+  %add = add <4 x i16> %mul, %a
+  ret <4 x i16> %add
+}
+
+define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlaq_laneq_s16_0:
+; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
+  %mul = mul <8 x i16> %shuffle, %b
+  %add = add <8 x i16> %mul, %a
+  ret <8 x i16> %add
+}
+
+define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmla_laneq_s32_0:
+; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %mul = mul <2 x i32> %shuffle, %b
+  %add = add <2 x i32> %mul, %a
+  ret <2 x i32> %add
+}
+
+define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlaq_laneq_s32_0:
+; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i32> %shuffle, %b
+  %add = add <4 x i32> %mul, %a
+  ret <4 x i32> %add
+}
+
+define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmls_lane_s16_0:
+; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i16> %shuffle, %b
+  %sub = sub <4 x i16> %a, %mul
+  ret <4 x i16> %sub
+}
+
+define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsq_lane_s16_0:
+; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+  %mul = mul <8 x i16> %shuffle, %b
+  %sub = sub <8 x i16> %a, %mul
+  ret <8 x i16> %sub
+}
+
+define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmls_lane_s32_0:
+; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %mul = mul <2 x i32> %shuffle, %b
+  %sub = sub <2 x i32> %a, %mul
+  ret <2 x i32> %sub
+}
+
+define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsq_lane_s32_0:
+; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i32> %shuffle, %b
+  %sub = sub <4 x i32> %a, %mul
+  ret <4 x i32> %sub
+}
+
+define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmls_laneq_s16_0:
+; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i16> %shuffle, %b
+  %sub = sub <4 x i16> %a, %mul
+  ret <4 x i16> %sub
+}
+
+define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsq_laneq_s16_0:
+; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
+  %mul = mul <8 x i16> %shuffle, %b
+  %sub = sub <8 x i16> %a, %mul
+  ret <8 x i16> %sub
+}
+
+define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmls_laneq_s32_0:
+; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %mul = mul <2 x i32> %shuffle, %b
+  %sub = sub <2 x i32> %a, %mul
+  ret <2 x i32> %sub
+}
+
+define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsq_laneq_s32_0:
+; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i32> %shuffle, %b
+  %sub = sub <4 x i32> %a, %mul
+  ret <4 x i32> %sub
+}
+
+define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmul_lane_s16_0:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i16> %shuffle, %a
+  ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmulq_lane_s16_0:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+  %mul = mul <8 x i16> %shuffle, %a
+  ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmul_lane_s32_0:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %mul = mul <2 x i32> %shuffle, %a
+  ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmulq_lane_s32_0:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i32> %shuffle, %a
+  ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmul_lane_u16_0:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i16> %shuffle, %a
+  ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmulq_lane_u16_0:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+  %mul = mul <8 x i16> %shuffle, %a
+  ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmul_lane_u32_0:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %mul = mul <2 x i32> %shuffle, %a
+  ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmulq_lane_u32_0:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i32> %shuffle, %a
+  ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmul_laneq_s16_0:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i16> %shuffle, %a
+  ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmulq_laneq_s16_0:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
+  %mul = mul <8 x i16> %shuffle, %a
+  ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmul_laneq_s32_0:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %mul = mul <2 x i32> %shuffle, %a
+  ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmulq_laneq_s32_0:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i32> %shuffle, %a
+  ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmul_laneq_u16_0:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i16> %shuffle, %a
+  ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmulq_laneq_u16_0:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
+  %mul = mul <8 x i16> %shuffle, %a
+  ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmul_laneq_u32_0:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %mul = mul <2 x i32> %shuffle, %a
+  ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmulq_laneq_u32_0:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %mul = mul <4 x i32> %shuffle, %a
+  ret <4 x i32> %mul
+}
+
+define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfma_lane_f32_0:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
+; CHECK: test_vfmaq_lane_f32_0:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
+; CHECK: test_vfma_laneq_f32_0:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
+; CHECK: test_vfmaq_laneq_f32_0:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfms_lane_f32_0:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
+; CHECK: test_vfmsq_lane_f32_0:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
+; CHECK: test_vfms_laneq_f32_0:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
+; CHECK: test_vfmsq_laneq_f32_0:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
+; CHECK: test_vfmaq_laneq_f64_0:
+; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
+  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+  ret <2 x double> %0
+}
+
+define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
+; CHECK: test_vfmsq_laneq_f64_0:
+; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+  %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
+  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+  ret <2 x double> %0
+}
+
+define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_lane_s16_0:
+; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_lane_s32_0:
+; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_laneq_s16_0:
+; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_laneq_s32_0:
+; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_high_lane_s16_0:
+; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_high_lane_s32_0:
+; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_high_laneq_s16_0:
+; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_high_laneq_s32_0:
+; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_lane_s16_0:
+; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_lane_s32_0:
+; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_laneq_s16_0:
+; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_laneq_s32_0:
+; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_high_lane_s16_0:
+; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_high_lane_s32_0:
+; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_high_laneq_s16_0:
+; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_high_laneq_s32_0:
+; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_lane_u16_0:
+; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_lane_u32_0:
+; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_laneq_u16_0:
+; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_laneq_u32_0:
+; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_high_lane_u16_0:
+; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_high_lane_u32_0:
+; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_high_laneq_u16_0:
+; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %add = add <4 x i32> %vmull2.i, %a
+  ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_high_laneq_u32_0:
+; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %add = add <2 x i64> %vmull2.i, %a
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_lane_u16_0:
+; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_lane_u32_0:
+; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_laneq_u16_0:
+; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_laneq_u32_0:
+; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_high_lane_u16_0:
+; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_high_lane_u32_0:
+; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_high_laneq_u16_0:
+; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %sub = sub <4 x i32> %a, %vmull2.i
+  ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_high_laneq_u32_0:
+; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %sub = sub <2 x i64> %a, %vmull2.i
+  ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_lane_s16_0:
+; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_lane_s32_0:
+; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_lane_u16_0:
+; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_lane_u32_0:
+; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_high_lane_s16_0:
+; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_high_lane_s32_0:
+; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_high_lane_u16_0:
+; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_high_lane_u32_0:
+; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_laneq_s16_0:
+; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_laneq_s32_0:
+; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_laneq_u16_0:
+; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_laneq_u32_0:
+; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_high_laneq_s16_0:
+; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_high_laneq_s32_0:
+; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_high_laneq_u16_0:
+; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_high_laneq_u32_0:
+; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlal_lane_s16_0:
+; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+  ret <4 x i32> %vqdmlal4.i
+}
+
+define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlal_lane_s32_0:
+; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+  ret <2 x i64> %vqdmlal4.i
+}
+
+define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlal_high_lane_s16_0:
+; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+  ret <4 x i32> %vqdmlal4.i
+}
+
+define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlal_high_lane_s32_0:
+; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+  ret <2 x i64> %vqdmlal4.i
+}
+
+define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlsl_lane_s16_0:
+; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+  ret <4 x i32> %vqdmlsl4.i
+}
+
+define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlsl_lane_s32_0:
+; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+  ret <2 x i64> %vqdmlsl4.i
+}
+
+define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlsl_high_lane_s16_0:
+; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+  ret <4 x i32> %vqdmlsl4.i
+}
+
+define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlsl_high_lane_s32_0:
+; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+  ret <2 x i64> %vqdmlsl4.i
+}
+
+define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmull_lane_s16_0:
+; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmull_lane_s32_0:
+; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vqdmull_laneq_s16_0:
+; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vqdmull_laneq_s32_0:
+; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmull_high_lane_s16_0:
+; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmull_high_lane_s32_0:
+; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vqdmull_high_laneq_s16_0:
+; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+  %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vqdmull_high_laneq_s32_0:
+; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+  %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmulh_lane_s16_0:
+; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmulhq_lane_s16_0:
+; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+  %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+  ret <8 x i16> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmulh_lane_s32_0:
+; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmulhq_lane_s32_0:
+; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+  %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+  ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqrdmulh_lane_s16_0:
+; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqrdmulhq_lane_s16_0:
+; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqrdmulh_lane_s32_0:
+; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqrdmulhq_lane_s32_0:
+; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
+define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
+; CHECK: test_vmul_lane_f32_0:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
+  %mul = fmul <2 x float> %shuffle, %a
+  ret <2 x float> %mul
+}
+
+define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulq_lane_f32_0:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
+  %mul = fmul <4 x float> %shuffle, %a
+  ret <4 x float> %mul
+}
+
+define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
+; CHECK: test_vmul_laneq_f32_0:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
+  %mul = fmul <2 x float> %shuffle, %a
+  ret <2 x float> %mul
+}
+
+define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
+; CHECK: test_vmul_laneq_f64_0:
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+entry:
+  %0 = bitcast <1 x double> %a to <8 x i8>
+  %1 = bitcast <8 x i8> %0 to double
+  %extract = extractelement <2 x double> %v, i32 0
+  %2 = fmul double %1, %extract
+  %3 = insertelement <1 x double> undef, double %2, i32 0
+  ret <1 x double> %3
+}
+
+define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulq_laneq_f32_0:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+  %mul = fmul <4 x float> %shuffle, %a
+  ret <4 x float> %mul
+}
+
+define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
+; CHECK: test_vmulq_laneq_f64_0:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
+  %mul = fmul <2 x double> %shuffle, %a
+  ret <2 x double> %mul
+}
+
+define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulx_lane_f32_0:
+; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
+  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+  ret <2 x float> %vmulx2.i
+}
+
+define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulxq_lane_f32_0:
+; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
+  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+  ret <4 x float> %vmulx2.i
+}
+
+define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
+; CHECK: test_vmulxq_lane_f64_0:
+; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
+  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+  ret <2 x double> %vmulx2.i
+}
+
+define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulx_laneq_f32_0:
+; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
+  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+  ret <2 x float> %vmulx2.i
+}
+
+define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulxq_laneq_f32_0:
+; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+  ret <4 x float> %vmulx2.i
+}
+
+define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
+; CHECK: test_vmulxq_laneq_f64_0:
+; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
+  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+  ret <2 x double> %vmulx2.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-3vdiff.ll b/test/CodeGen/AArch64/neon-3vdiff.ll
new file mode 100644
index 0000000..171e2b2
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-3vdiff.ll
@@ -0,0 +1,1806 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>)
+
+declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+
+declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)
+
+declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+
+declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>)
+
+declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>)
+
+declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>)
+
+declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)
+
+declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>)
+
+declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>)
+
+declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>)
+
+declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>)
+
+declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>)
+
+declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>)
+
+declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>)
+
+declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>)
+
+declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vaddl_s8:
+; CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
+  %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
+  %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vaddl_s16:
+; CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vmovl.i.i = sext <4 x i16> %a to <4 x i32>
+  %vmovl.i2.i = sext <4 x i16> %b to <4 x i32>
+  %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vaddl_s32:
+; CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vmovl.i.i = sext <2 x i32> %a to <2 x i64>
+  %vmovl.i2.i = sext <2 x i32> %b to <2 x i64>
+  %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vaddl_u8:
+; CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
+  %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
+  %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vaddl_u16:
+; CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
+  %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
+  %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vaddl_u32:
+; CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
+  %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
+  %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vaddl_high_s8:
+; CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
+  %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16>
+  %add.i = add <8 x i16> %0, %1
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddl_high_s16:
+; CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
+  %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32>
+  %add.i = add <4 x i32> %0, %1
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddl_high_s32:
+; CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
+  %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64>
+  %add.i = add <2 x i64> %0, %1
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vaddl_high_u8:
+; CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
+  %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
+  %add.i = add <8 x i16> %0, %1
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddl_high_u16:
+; CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
+  %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
+  %add.i = add <4 x i32> %0, %1
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddl_high_u32:
+; CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
+  %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
+  %add.i = add <2 x i64> %0, %1
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) {
+; CHECK: test_vaddw_s8:
+; CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
+entry:
+  %vmovl.i.i = sext <8 x i8> %b to <8 x i16>
+  %add.i = add <8 x i16> %vmovl.i.i, %a
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) {
+; CHECK: test_vaddw_s16:
+; CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
+entry:
+  %vmovl.i.i = sext <4 x i16> %b to <4 x i32>
+  %add.i = add <4 x i32> %vmovl.i.i, %a
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) {
+; CHECK: test_vaddw_s32:
+; CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
+entry:
+  %vmovl.i.i = sext <2 x i32> %b to <2 x i64>
+  %add.i = add <2 x i64> %vmovl.i.i, %a
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) {
+; CHECK: test_vaddw_u8:
+; CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
+entry:
+  %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
+  %add.i = add <8 x i16> %vmovl.i.i, %a
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) {
+; CHECK: test_vaddw_u16:
+; CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
+entry:
+  %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
+  %add.i = add <4 x i32> %vmovl.i.i, %a
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) {
+; CHECK: test_vaddw_u32:
+; CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
+entry:
+  %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
+  %add.i = add <2 x i64> %vmovl.i.i, %a
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) {
+; CHECK: test_vaddw_high_s8:
+; CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
+  %add.i = add <8 x i16> %0, %a
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK: test_vaddw_high_s16:
+; CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
+  %add.i = add <4 x i32> %0, %a
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) {
+; CHECK: test_vaddw_high_s32:
+; CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
+  %add.i = add <2 x i64> %0, %a
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) {
+; CHECK: test_vaddw_high_u8:
+; CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
+  %add.i = add <8 x i16> %0, %a
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK: test_vaddw_high_u16:
+; CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
+  %add.i = add <4 x i32> %0, %a
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) {
+; CHECK: test_vaddw_high_u32:
+; CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
+  %add.i = add <2 x i64> %0, %a
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsubl_s8:
+; CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
+  %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
+  %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
+  ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsubl_s16:
+; CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vmovl.i.i = sext <4 x i16> %a to <4 x i32>
+  %vmovl.i2.i = sext <4 x i16> %b to <4 x i32>
+  %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsubl_s32:
+; CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vmovl.i.i = sext <2 x i32> %a to <2 x i64>
+  %vmovl.i2.i = sext <2 x i32> %b to <2 x i64>
+  %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
+  ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsubl_u8:
+; CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
+  %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
+  %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
+  ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsubl_u16:
+; CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
+  %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
+  %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsubl_u32:
+; CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
+  %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
+  %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
+  ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsubl_high_s8:
+; CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
+  %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16>
+  %sub.i = sub <8 x i16> %0, %1
+  ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubl_high_s16:
+; CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
+  %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32>
+  %sub.i = sub <4 x i32> %0, %1
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubl_high_s32:
+; CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
+  %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64>
+  %sub.i = sub <2 x i64> %0, %1
+  ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsubl_high_u8:
+; CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
+  %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
+  %sub.i = sub <8 x i16> %0, %1
+  ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubl_high_u16:
+; CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
+  %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
+  %sub.i = sub <4 x i32> %0, %1
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubl_high_u32:
+; CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
+  %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
+  %sub.i = sub <2 x i64> %0, %1
+  ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) {
+; CHECK: test_vsubw_s8:
+; CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
+entry:
+  %vmovl.i.i = sext <8 x i8> %b to <8 x i16>
+  %sub.i = sub <8 x i16> %a, %vmovl.i.i
+  ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) {
+; CHECK: test_vsubw_s16:
+; CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
+entry:
+  %vmovl.i.i = sext <4 x i16> %b to <4 x i32>
+  %sub.i = sub <4 x i32> %a, %vmovl.i.i
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) {
+; CHECK: test_vsubw_s32:
+; CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
+entry:
+  %vmovl.i.i = sext <2 x i32> %b to <2 x i64>
+  %sub.i = sub <2 x i64> %a, %vmovl.i.i
+  ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) {
+; CHECK: test_vsubw_u8:
+; CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
+entry:
+  %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
+  %sub.i = sub <8 x i16> %a, %vmovl.i.i
+  ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) {
+; CHECK: test_vsubw_u16:
+; CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
+entry:
+  %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
+  %sub.i = sub <4 x i32> %a, %vmovl.i.i
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) {
+; CHECK: test_vsubw_u32:
+; CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
+entry:
+  %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
+  %sub.i = sub <2 x i64> %a, %vmovl.i.i
+  ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) {
+; CHECK: test_vsubw_high_s8:
+; CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
+  %sub.i = sub <8 x i16> %a, %0
+  ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK: test_vsubw_high_s16:
+; CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
+  %sub.i = sub <4 x i32> %a, %0
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) {
+; CHECK: test_vsubw_high_s32:
+; CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
+  %sub.i = sub <2 x i64> %a, %0
+  ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) {
+; CHECK: test_vsubw_high_u8:
+; CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
+  %sub.i = sub <8 x i16> %a, %0
+  ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK: test_vsubw_high_u16:
+; CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
+  %sub.i = sub <4 x i32> %a, %0
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) {
+; CHECK: test_vsubw_high_u32:
+; CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
+  %sub.i = sub <2 x i64> %a, %0
+  ret <2 x i64> %sub.i
+}
+
+define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddhn_s16:
+; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vaddhn.i = add <8 x i16> %a, %b
+  %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8>
+  ret <8 x i8> %vaddhn2.i
+}
+
+define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddhn_s32:
+; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vaddhn.i = add <4 x i32> %a, %b
+  %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16>
+  %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16>
+  ret <4 x i16> %vaddhn2.i
+}
+
+define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vaddhn_s64:
+; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vaddhn.i = add <2 x i64> %a, %b
+  %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32>
+  %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32>
+  ret <2 x i32> %vaddhn2.i
+}
+
+define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddhn_u16:
+; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vaddhn.i = add <8 x i16> %a, %b
+  %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8>
+  ret <8 x i8> %vaddhn2.i
+}
+
+define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddhn_u32:
+; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vaddhn.i = add <4 x i32> %a, %b
+  %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16>
+  %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16>
+  ret <4 x i16> %vaddhn2.i
+}
+
+define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vaddhn_u64:
+; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vaddhn.i = add <2 x i64> %a, %b
+  %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32>
+  %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32>
+  ret <2 x i32> %vaddhn2.i
+}
+
+define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddhn_high_s16:
+; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vaddhn.i.i = add <8 x i16> %a, %b
+  %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8>
+  %0 = bitcast <8 x i8> %r to <1 x i64>
+  %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddhn_high_s32:
+; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vaddhn.i.i = add <4 x i32> %a, %b
+  %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16>
+  %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16>
+  %0 = bitcast <4 x i16> %r to <1 x i64>
+  %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vaddhn_high_s64:
+; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vaddhn.i.i = add <2 x i64> %a, %b
+  %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32>
+  %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32>
+  %0 = bitcast <2 x i32> %r to <1 x i64>
+  %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddhn_high_u16:
+; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vaddhn.i.i = add <8 x i16> %a, %b
+  %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8>
+  %0 = bitcast <8 x i8> %r to <1 x i64>
+  %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddhn_high_u32:
+; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vaddhn.i.i = add <4 x i32> %a, %b
+  %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16>
+  %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16>
+  %0 = bitcast <4 x i16> %r to <1 x i64>
+  %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vaddhn_high_u64:
+; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vaddhn.i.i = add <2 x i64> %a, %b
+  %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32>
+  %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32>
+  %0 = bitcast <2 x i32> %r to <1 x i64>
+  %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vraddhn_s16:
+; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i8> %vraddhn2.i
+}
+
+define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vraddhn_s32:
+; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i16> %vraddhn2.i
+}
+
+define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vraddhn_s64:
+; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i32> %vraddhn2.i
+}
+
+define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vraddhn_u16:
+; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i8> %vraddhn2.i
+}
+
+define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vraddhn_u32:
+; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i16> %vraddhn2.i
+}
+
+define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vraddhn_u64:
+; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i32> %vraddhn2.i
+}
+
+define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vraddhn_high_s16:
+; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %0 = bitcast <8 x i8> %r to <1 x i64>
+  %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vraddhn_high_s32:
+; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %0 = bitcast <4 x i16> %r to <1 x i64>
+  %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vraddhn_high_s64:
+; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %0 = bitcast <2 x i32> %r to <1 x i64>
+  %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vraddhn_high_u16:
+; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %0 = bitcast <8 x i8> %r to <1 x i64>
+  %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vraddhn_high_u32:
+; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %0 = bitcast <4 x i16> %r to <1 x i64>
+  %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vraddhn_high_u64:
+; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %0 = bitcast <2 x i32> %r to <1 x i64>
+  %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubhn_s16:
+; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vsubhn.i = sub <8 x i16> %a, %b
+  %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8>
+  ret <8 x i8> %vsubhn2.i
+}
+
+define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubhn_s32:
+; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vsubhn.i = sub <4 x i32> %a, %b
+  %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16>
+  %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16>
+  ret <4 x i16> %vsubhn2.i
+}
+
+define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsubhn_s64:
+; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vsubhn.i = sub <2 x i64> %a, %b
+  %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32>
+  %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32>
+  ret <2 x i32> %vsubhn2.i
+}
+
+define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubhn_u16:
+; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vsubhn.i = sub <8 x i16> %a, %b
+  %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8>
+  ret <8 x i8> %vsubhn2.i
+}
+
+define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubhn_u32:
+; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vsubhn.i = sub <4 x i32> %a, %b
+  %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16>
+  %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16>
+  ret <4 x i16> %vsubhn2.i
+}
+
+define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsubhn_u64:
+; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vsubhn.i = sub <2 x i64> %a, %b
+  %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32>
+  %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32>
+  ret <2 x i32> %vsubhn2.i
+}
+
+define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubhn_high_s16:
+; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vsubhn.i.i = sub <8 x i16> %a, %b
+  %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8>
+  %0 = bitcast <8 x i8> %r to <1 x i64>
+  %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubhn_high_s32:
+; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vsubhn.i.i = sub <4 x i32> %a, %b
+  %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16>
+  %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16>
+  %0 = bitcast <4 x i16> %r to <1 x i64>
+  %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsubhn_high_s64:
+; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vsubhn.i.i = sub <2 x i64> %a, %b
+  %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32>
+  %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32>
+  %0 = bitcast <2 x i32> %r to <1 x i64>
+  %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubhn_high_u16:
+; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vsubhn.i.i = sub <8 x i16> %a, %b
+  %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8>
+  %0 = bitcast <8 x i8> %r to <1 x i64>
+  %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubhn_high_u32:
+; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vsubhn.i.i = sub <4 x i32> %a, %b
+  %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16>
+  %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16>
+  %0 = bitcast <4 x i16> %r to <1 x i64>
+  %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsubhn_high_u64:
+; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vsubhn.i.i = sub <2 x i64> %a, %b
+  %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32>
+  %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32>
+  %0 = bitcast <2 x i32> %r to <1 x i64>
+  %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsubhn_s16:
+; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i8> %vrsubhn2.i
+}
+
+define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsubhn_s32:
+; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i16> %vrsubhn2.i
+}
+
+define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsubhn_s64:
+; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i32> %vrsubhn2.i
+}
+
+define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsubhn_u16:
+; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i8> %vrsubhn2.i
+}
+
+define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsubhn_u32:
+; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i16> %vrsubhn2.i
+}
+
+define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsubhn_u64:
+; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i32> %vrsubhn2.i
+}
+
+define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsubhn_high_s16:
+; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %0 = bitcast <8 x i8> %r to <1 x i64>
+  %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsubhn_high_s32:
+; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %0 = bitcast <4 x i16> %r to <1 x i64>
+  %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsubhn_high_s64:
+; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %0 = bitcast <2 x i32> %r to <1 x i64>
+  %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsubhn_high_u16:
+; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %0 = bitcast <8 x i8> %r to <1 x i64>
+  %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsubhn_high_u32:
+; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %0 = bitcast <4 x i16> %r to <1 x i64>
+  %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsubhn_high_u64:
+; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %0 = bitcast <2 x i32> %r to <1 x i64>
+  %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
+  %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+  %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vabdl_s8:
+; CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
+  ret <8 x i16> %vmovl.i.i
+}
+
+define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vabdl_s16:
+; CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
+  ret <4 x i32> %vmovl.i.i
+}
+
+define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vabdl_s32:
+; CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
+  %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
+  ret <2 x i64> %vmovl.i.i
+}
+
+define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vabdl_u8:
+; CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
+  ret <8 x i16> %vmovl.i.i
+}
+
+define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vabdl_u16:
+; CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
+  ret <4 x i32> %vmovl.i.i
+}
+
+define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vabdl_u32:
+; CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
+  %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
+  ret <2 x i64> %vmovl.i.i
+}
+
+define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vabal_s8:
+; CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
+  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
+  %add.i = add <8 x i16> %vmovl.i.i.i, %a
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vabal_s16:
+; CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
+  %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
+  %add.i = add <4 x i32> %vmovl.i.i.i, %a
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vabal_s32:
+; CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
+  %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
+  %add.i = add <2 x i64> %vmovl.i.i.i, %a
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vabal_u8:
+; CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
+  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
+  %add.i = add <8 x i16> %vmovl.i.i.i, %a
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vabal_u16:
+; CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
+  %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
+  %add.i = add <4 x i32> %vmovl.i.i.i, %a
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vabal_u32:
+; CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
+  %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
+  %add.i = add <2 x i64> %vmovl.i.i.i, %a
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vabdl_high_s8:
+; CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
+  ret <8 x i16> %vmovl.i.i.i
+}
+
+define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vabdl_high_s16:
+; CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
+  ret <4 x i32> %vmovl.i.i.i
+}
+
+define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vabdl_high_s32:
+; CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
+  ret <2 x i64> %vmovl.i.i.i
+}
+
+define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vabdl_high_u8:
+; CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
+  ret <8 x i16> %vmovl.i.i.i
+}
+
+define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vabdl_high_u16:
+; CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
+  ret <4 x i32> %vmovl.i.i.i
+}
+
+define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vabdl_high_u32:
+; CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
+  ret <2 x i64> %vmovl.i.i.i
+}
+
+define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vabal_high_s8:
+; CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
+  %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
+  ret <8 x i16> %add.i.i
+}
+
+define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vabal_high_s16:
+; CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
+  %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
+  ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vabal_high_s32:
+; CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
+  %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
+  ret <2 x i64> %add.i.i
+}
+
+define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vabal_high_u8:
+; CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
+  %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
+  ret <8 x i16> %add.i.i
+}
+
+define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vabal_high_u16:
+; CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
+  %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
+  ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vabal_high_u32:
+; CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
+  %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
+  ret <2 x i64> %add.i.i
+}
+
+define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vmull_s8:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
+  ret <8 x i16> %vmull.i
+}
+
+define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vmull_s16:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vmull_s32:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b)
+  ret <2 x i64> %vmull2.i
+}
+
+define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vmull_u8:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b)
+  ret <8 x i16> %vmull.i
+}
+
+define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vmull_u16:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b)
+  ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vmull_u32:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b)
+  ret <2 x i64> %vmull2.i
+}
+
+define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vmull_high_s8:
+; CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  ret <8 x i16> %vmull.i.i
+}
+
+define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vmull_high_s16:
+; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vmull_high_s32:
+; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  ret <2 x i64> %vmull2.i.i
+}
+
+define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vmull_high_u8:
+; CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  ret <8 x i16> %vmull.i.i
+}
+
+define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vmull_high_u16:
+; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vmull_high_u32:
+; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  ret <2 x i64> %vmull2.i.i
+}
+
+define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vmlal_s8:
+; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
+  %add.i = add <8 x i16> %vmull.i.i, %a
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vmlal_s16:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %add.i = add <4 x i32> %vmull2.i.i, %a
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vmlal_s32:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %add.i = add <2 x i64> %vmull2.i.i, %a
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vmlal_u8:
+; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
+  %add.i = add <8 x i16> %vmull.i.i, %a
+  ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vmlal_u16:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %add.i = add <4 x i32> %vmull2.i.i, %a
+  ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vmlal_u32:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %add.i = add <2 x i64> %vmull2.i.i, %a
+  ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vmlal_high_s8:
+; CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %add.i.i = add <8 x i16> %vmull.i.i.i, %a
+  ret <8 x i16> %add.i.i
+}
+
+define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vmlal_high_s16:
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
+  ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vmlal_high_s32:
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
+  ret <2 x i64> %add.i.i
+}
+
+define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vmlal_high_u8:
+; CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %add.i.i = add <8 x i16> %vmull.i.i.i, %a
+  ret <8 x i16> %add.i.i
+}
+
+define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vmlal_high_u16:
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
+  ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vmlal_high_u32:
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
+  ret <2 x i64> %add.i.i
+}
+
+define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vmlsl_s8:
+; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
+  %sub.i = sub <8 x i16> %a, %vmull.i.i
+  ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vmlsl_s16:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %sub.i = sub <4 x i32> %a, %vmull2.i.i
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vmlsl_s32:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %sub.i = sub <2 x i64> %a, %vmull2.i.i
+  ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vmlsl_u8:
+; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
+  %sub.i = sub <8 x i16> %a, %vmull.i.i
+  ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vmlsl_u16:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %sub.i = sub <4 x i32> %a, %vmull2.i.i
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vmlsl_u32:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %sub.i = sub <2 x i64> %a, %vmull2.i.i
+  ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vmlsl_high_s8:
+; CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
+  ret <8 x i16> %sub.i.i
+}
+
+define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vmlsl_high_s16:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
+  ret <4 x i32> %sub.i.i
+}
+
+define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vmlsl_high_s32:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
+  ret <2 x i64> %sub.i.i
+}
+
+define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vmlsl_high_u8:
+; CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
+  ret <8 x i16> %sub.i.i
+}
+
+define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vmlsl_high_u16:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
+  ret <4 x i32> %sub.i.i
+}
+
+define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vmlsl_high_u32:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
+  ret <2 x i64> %sub.i.i
+}
+
+define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vqdmull_s16:
+; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
+  ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vqdmull_s32:
+; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
+  ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vqdmlal_s16:
+; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+  ret <4 x i32> %vqdmlal4.i
+}
+
+define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vqdmlal_s32:
+; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+  ret <2 x i64> %vqdmlal4.i
+}
+
+define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vqdmlsl_s16:
+; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+  ret <4 x i32> %vqdmlsl4.i
+}
+
+define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vqdmlsl_s32:
+; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+  ret <2 x i64> %vqdmlsl4.i
+}
+
+define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vqdmull_high_s16:
+; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vqdmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  ret <4 x i32> %vqdmull2.i.i
+}
+
+define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vqdmull_high_s32:
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vqdmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  ret <2 x i64> %vqdmull2.i.i
+}
+
+define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vqdmlal_high_s16:
+; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vqdmlal2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vqdmlal4.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
+  ret <4 x i32> %vqdmlal4.i.i
+}
+
+define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vqdmlal_high_s32:
+; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vqdmlal2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vqdmlal4.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
+  ret <2 x i64> %vqdmlal4.i.i
+}
+
+define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vqdmlsl_high_s16:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vqdmlsl2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vqdmlsl4.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
+  ret <4 x i32> %vqdmlsl4.i.i
+}
+
+define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vqdmlsl_high_s32:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vqdmlsl2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vqdmlsl4.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
+  ret <2 x i64> %vqdmlsl4.i.i
+}
+
+define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vmull_p8:
+; CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b)
+  ret <8 x i16> %vmull.i
+}
+
+define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vmull_high_p8:
+; CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  ret <8 x i16> %vmull.i.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-aba-abd.ll b/test/CodeGen/AArch64/neon-aba-abd.ll
index b423666..5400984 100644
--- a/test/CodeGen/AArch64/neon-aba-abd.ll
+++ b/test/CodeGen/AArch64/neon-aba-abd.ll
@@ -157,6 +157,16 @@ define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
   ret <2 x i32> %abd
 }
 
+define <2 x i32> @test_sabd_v2i32_const() {
+; CHECK: test_sabd_v2i32_const:
+; CHECK: movi     d1, #0xffffffff0000
+; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s
+  %1 = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(
+    <2 x i32> <i32 -2147483648, i32 2147450880>,
+    <2 x i32> <i32 -65536, i32 65535>)
+  ret <2 x i32> %1
+}
+
 define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; CHECK: test_saba_v2i32:
   %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
@@ -223,4 +233,4 @@ define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
   %abd = call <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double> %lhs, <2 x double> %rhs)
 ; CHECK: fabd v0.2d, v0.2d, v1.2d
   ret <2 x double> %abd
-}
-\ No newline at end of file
+}
diff --git a/test/CodeGen/AArch64/neon-across.ll b/test/CodeGen/AArch64/neon-across.ll
new file mode 100644
index 0000000..733db97
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-across.ll
@@ -0,0 +1,476 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v4f32(<4 x float>)
+
+declare <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v4f32(<4 x float>)
+
+declare <1 x float> @llvm.aarch64.neon.vminv.v1f32.v4f32(<4 x float>)
+
+declare <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v4f32(<4 x float>)
+
+declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32>)
+
+declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8>)
+
+declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32>)
+
+declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32>)
+
+declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8>)
+
+declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8>)
+
+declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32>)
+
+declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32>)
+
+declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8>)
+
+declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8>)
+
+declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8>)
+
+declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32>)
+
+declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16>)
+
+declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8>)
+
+declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32>)
+
+declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16>)
+
+declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16>)
+
+declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16>)
+
+declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8>)
+
+define i16 @test_vaddlv_s8(<8 x i8> %a) {
+; CHECK: test_vaddlv_s8:
+; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8> %a)
+  %0 = extractelement <1 x i16> %saddlv.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vaddlv_s16(<4 x i16> %a) {
+; CHECK: test_vaddlv_s16:
+; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16> %a)
+  %0 = extractelement <1 x i32> %saddlv.i, i32 0
+  ret i32 %0
+}
+
+define i16 @test_vaddlv_u8(<8 x i8> %a) {
+; CHECK: test_vaddlv_u8:
+; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8> %a)
+  %0 = extractelement <1 x i16> %uaddlv.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vaddlv_u16(<4 x i16> %a) {
+; CHECK: test_vaddlv_u16:
+; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16> %a)
+  %0 = extractelement <1 x i32> %uaddlv.i, i32 0
+  ret i32 %0
+}
+
+define i16 @test_vaddlvq_s8(<16 x i8> %a) {
+; CHECK: test_vaddlvq_s8:
+; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8> %a)
+  %0 = extractelement <1 x i16> %saddlv.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vaddlvq_s16(<8 x i16> %a) {
+; CHECK: test_vaddlvq_s16:
+; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16> %a)
+  %0 = extractelement <1 x i32> %saddlv.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vaddlvq_s32(<4 x i32> %a) {
+; CHECK: test_vaddlvq_s32:
+; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %saddlv.i = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32> %a)
+  %0 = extractelement <1 x i64> %saddlv.i, i32 0
+  ret i64 %0
+}
+
+define i16 @test_vaddlvq_u8(<16 x i8> %a) {
+; CHECK: test_vaddlvq_u8:
+; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8> %a)
+  %0 = extractelement <1 x i16> %uaddlv.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vaddlvq_u16(<8 x i16> %a) {
+; CHECK: test_vaddlvq_u16:
+; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16> %a)
+  %0 = extractelement <1 x i32> %uaddlv.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vaddlvq_u32(<4 x i32> %a) {
+; CHECK: test_vaddlvq_u32:
+; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %uaddlv.i = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32> %a)
+  %0 = extractelement <1 x i64> %uaddlv.i, i32 0
+  ret i64 %0
+}
+
+define i8 @test_vmaxv_s8(<8 x i8> %a) {
+; CHECK: test_vmaxv_s8:
+; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8> %a)
+  %0 = extractelement <1 x i8> %smaxv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vmaxv_s16(<4 x i16> %a) {
+; CHECK: test_vmaxv_s16:
+; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16> %a)
+  %0 = extractelement <1 x i16> %smaxv.i, i32 0
+  ret i16 %0
+}
+
+define i8 @test_vmaxv_u8(<8 x i8> %a) {
+; CHECK: test_vmaxv_u8:
+; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8> %a)
+  %0 = extractelement <1 x i8> %umaxv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vmaxv_u16(<4 x i16> %a) {
+; CHECK: test_vmaxv_u16:
+; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16> %a)
+  %0 = extractelement <1 x i16> %umaxv.i, i32 0
+  ret i16 %0
+}
+
+define i8 @test_vmaxvq_s8(<16 x i8> %a) {
+; CHECK: test_vmaxvq_s8:
+; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8> %a)
+  %0 = extractelement <1 x i8> %smaxv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vmaxvq_s16(<8 x i16> %a) {
+; CHECK: test_vmaxvq_s16:
+; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16> %a)
+  %0 = extractelement <1 x i16> %smaxv.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vmaxvq_s32(<4 x i32> %a) {
+; CHECK: test_vmaxvq_s32:
+; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %smaxv.i = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32> %a)
+  %0 = extractelement <1 x i32> %smaxv.i, i32 0
+  ret i32 %0
+}
+
+define i8 @test_vmaxvq_u8(<16 x i8> %a) {
+; CHECK: test_vmaxvq_u8:
+; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8> %a)
+  %0 = extractelement <1 x i8> %umaxv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vmaxvq_u16(<8 x i16> %a) {
+; CHECK: test_vmaxvq_u16:
+; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16> %a)
+  %0 = extractelement <1 x i16> %umaxv.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vmaxvq_u32(<4 x i32> %a) {
+; CHECK: test_vmaxvq_u32:
+; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %umaxv.i = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32> %a)
+  %0 = extractelement <1 x i32> %umaxv.i, i32 0
+  ret i32 %0
+}
+
+define i8 @test_vminv_s8(<8 x i8> %a) {
+; CHECK: test_vminv_s8:
+; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8> %a)
+  %0 = extractelement <1 x i8> %sminv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vminv_s16(<4 x i16> %a) {
+; CHECK: test_vminv_s16:
+; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16> %a)
+  %0 = extractelement <1 x i16> %sminv.i, i32 0
+  ret i16 %0
+}
+
+define i8 @test_vminv_u8(<8 x i8> %a) {
+; CHECK: test_vminv_u8:
+; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8> %a)
+  %0 = extractelement <1 x i8> %uminv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vminv_u16(<4 x i16> %a) {
+; CHECK: test_vminv_u16:
+; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16> %a)
+  %0 = extractelement <1 x i16> %uminv.i, i32 0
+  ret i16 %0
+}
+
+define i8 @test_vminvq_s8(<16 x i8> %a) {
+; CHECK: test_vminvq_s8:
+; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8> %a)
+  %0 = extractelement <1 x i8> %sminv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vminvq_s16(<8 x i16> %a) {
+; CHECK: test_vminvq_s16:
+; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16> %a)
+  %0 = extractelement <1 x i16> %sminv.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vminvq_s32(<4 x i32> %a) {
+; CHECK: test_vminvq_s32:
+; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %sminv.i = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32> %a)
+  %0 = extractelement <1 x i32> %sminv.i, i32 0
+  ret i32 %0
+}
+
+define i8 @test_vminvq_u8(<16 x i8> %a) {
+; CHECK: test_vminvq_u8:
+; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8> %a)
+  %0 = extractelement <1 x i8> %uminv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vminvq_u16(<8 x i16> %a) {
+; CHECK: test_vminvq_u16:
+; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16> %a)
+  %0 = extractelement <1 x i16> %uminv.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vminvq_u32(<4 x i32> %a) {
+; CHECK: test_vminvq_u32:
+; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %uminv.i = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32> %a)
+  %0 = extractelement <1 x i32> %uminv.i, i32 0
+  ret i32 %0
+}
+
+define i8 @test_vaddv_s8(<8 x i8> %a) {
+; CHECK: test_vaddv_s8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a)
+  %0 = extractelement <1 x i8> %vaddv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vaddv_s16(<4 x i16> %a) {
+; CHECK: test_vaddv_s16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a)
+  %0 = extractelement <1 x i16> %vaddv.i, i32 0
+  ret i16 %0
+}
+
+define i8 @test_vaddv_u8(<8 x i8> %a) {
+; CHECK: test_vaddv_u8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a)
+  %0 = extractelement <1 x i8> %vaddv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vaddv_u16(<4 x i16> %a) {
+; CHECK: test_vaddv_u16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a)
+  %0 = extractelement <1 x i16> %vaddv.i, i32 0
+  ret i16 %0
+}
+
+define i8 @test_vaddvq_s8(<16 x i8> %a) {
+; CHECK: test_vaddvq_s8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a)
+  %0 = extractelement <1 x i8> %vaddv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vaddvq_s16(<8 x i16> %a) {
+; CHECK: test_vaddvq_s16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a)
+  %0 = extractelement <1 x i16> %vaddv.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vaddvq_s32(<4 x i32> %a) {
+; CHECK: test_vaddvq_s32:
+; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a)
+  %0 = extractelement <1 x i32> %vaddv.i, i32 0
+  ret i32 %0
+}
+
+define i8 @test_vaddvq_u8(<16 x i8> %a) {
+; CHECK: test_vaddvq_u8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a)
+  %0 = extractelement <1 x i8> %vaddv.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vaddvq_u16(<8 x i16> %a) {
+; CHECK: test_vaddvq_u16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a)
+  %0 = extractelement <1 x i16> %vaddv.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vaddvq_u32(<4 x i32> %a) {
+; CHECK: test_vaddvq_u32:
+; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a)
+  %0 = extractelement <1 x i32> %vaddv.i, i32 0
+  ret i32 %0
+}
+
+define float @test_vmaxvq_f32(<4 x float> %a) {
+; CHECK: test_vmaxvq_f32:
+; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %vmaxv.i = tail call <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v4f32(<4 x float> %a)
+  %0 = extractelement <1 x float> %vmaxv.i, i32 0
+  ret float %0
+}
+
+define float @test_vminvq_f32(<4 x float> %a) {
+; CHECK: test_vminvq_f32:
+; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %vminv.i = tail call <1 x float> @llvm.aarch64.neon.vminv.v1f32.v4f32(<4 x float> %a)
+  %0 = extractelement <1 x float> %vminv.i, i32 0
+  ret float %0
+}
+
+define float @test_vmaxnmvq_f32(<4 x float> %a) {
+; CHECK: test_vmaxnmvq_f32:
+; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %vmaxnmv.i = tail call <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v4f32(<4 x float> %a)
+  %0 = extractelement <1 x float> %vmaxnmv.i, i32 0
+  ret float %0
+}
+
+define float @test_vminnmvq_f32(<4 x float> %a) {
+; CHECK: test_vminnmvq_f32:
+; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %vminnmv.i = tail call <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v4f32(<4 x float> %a)
+  %0 = extractelement <1 x float> %vminnmv.i, i32 0
+  ret float %0
+}
+
diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/neon-add-sub.ll
index 65ec8a2..078ba14 100644
--- a/test/CodeGen/AArch64/neon-add-sub.ll
+++ b/test/CodeGen/AArch64/neon-add-sub.ll
@@ -118,15 +118,120 @@ define <2 x double> @sub2xdouble(<2 x double> %A, <2 x double> %B) {
 	ret <2 x double> %tmp3
 }
 
-define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
-	%tmp3 = add <1 x i64> %A, %B;
-	ret <1 x i64> %tmp3
+define <1 x double> @test_vadd_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vadd_f64
+; CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = fadd <1 x double> %a, %b
+  ret <1 x double> %1
 }
 
-define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
-	%tmp3 = sub <1 x i64> %A, %B;
-	ret <1 x i64> %tmp3
+define <1 x double> @test_vmul_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vmul_f64
+; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = fmul <1 x double> %a, %b
+  ret <1 x double> %1
 }
 
+define <1 x double> @test_vdiv_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vdiv_f64
+; CHECK: fdiv d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = fdiv <1 x double> %a, %b
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vmla_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) {
+; CHECK-LABEL: test_vmla_f64
+; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = fmul <1 x double> %b, %c
+  %2 = fadd <1 x double> %1, %a
+  ret <1 x double> %2
+}
+
+define <1 x double> @test_vmls_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) {
+; CHECK-LABEL: test_vmls_f64
+; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = fmul <1 x double> %b, %c
+  %2 = fsub <1 x double> %a, %1
+  ret <1 x double> %2
+}
+
+define <1 x double> @test_vfms_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) {
+; CHECK-LABEL: test_vfms_f64
+; CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = fsub <1 x double> <double -0.000000e+00>, %b
+  %2 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %1, <1 x double> %c, <1 x double> %a)
+  ret <1 x double> %2
+}
+
+define <1 x double> @test_vfma_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) {
+; CHECK-LABEL: test_vfma_f64
+; CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vsub_f64
+; CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = fsub <1 x double> %a, %b
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vabd_f64
+; CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double> %a, <1 x double> %b)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vmax_f64
+; CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double> %a, <1 x double> %b)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vmin_f64
+; CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double> %a, <1 x double> %b)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vmaxnm_f64
+; CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double> %a, <1 x double> %b)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vminnm_f64
+; CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double> %a, <1 x double> %b)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vabs_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vabs_f64
+; CHECK: fabs d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.fabs.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vneg_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vneg_f64
+; CHECK: fneg d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = fsub <1 x double> <double -0.000000e+00>, %a
+  ret <1 x double> %1
+}
+
+declare <1 x double> @llvm.fabs.v1f64(<1 x double>)
+declare <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
+\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-bsl.ll b/test/CodeGen/AArch64/neon-bsl.ll
new file mode 100644
index 0000000..6bd923d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-bsl.ll
@@ -0,0 +1,222 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+
+declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+
+declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
+
+declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
+
+declare <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double>, <1 x double>, <1 x double>)
+
+declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>)
+
+declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>)
+
+declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+
+define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+; CHECK-LABEL: test_vbsl_s8:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
+; CHECK-LABEL: test_vbsl_s16:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
+  %0 = bitcast <4 x i16> %vbsl3.i to <8 x i8>
+  ret <8 x i8> %0
+}
+
+define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+; CHECK-LABEL: test_vbsl_s32:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3)
+  ret <2 x i32> %vbsl3.i
+}
+
+define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
+; CHECK-LABEL: test_vbsl_s64:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3)
+  ret <1 x i64> %vbsl3.i
+}
+
+define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+; CHECK-LABEL: test_vbsl_u8:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
+  ret <8 x i8> %vbsl.i
+}
+
+define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
+; CHECK-LABEL: test_vbsl_u16:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
+  ret <4 x i16> %vbsl3.i
+}
+
+define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+; CHECK-LABEL: test_vbsl_u32:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3)
+  ret <2 x i32> %vbsl3.i
+}
+
+define <1 x i64> @test_vbsl_u64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
+; CHECK-LABEL: test_vbsl_u64:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3)
+  ret <1 x i64> %vbsl3.i
+}
+
+define <2 x float> @test_vbsl_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) {
+; CHECK-LABEL: test_vbsl_f32:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl3.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3)
+  ret <2 x float> %vbsl3.i
+}
+
+define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) {
+; CHECK-LABEL: test_vbsl_f64:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl.i = bitcast <1 x i64> %v1 to <1 x double>
+  %vbsl3.i = tail call <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double> %vbsl.i, <1 x double> %v2, <1 x double> %v3)
+  ret <1 x double> %vbsl3.i
+}
+
+define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+; CHECK-LABEL: test_vbsl_p8:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
+  ret <8 x i8> %vbsl.i
+}
+
+define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
+; CHECK-LABEL: test_vbsl_p16:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
+  ret <4 x i16> %vbsl3.i
+}
+
+define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
+; CHECK-LABEL: test_vbslq_s8:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
+  ret <16 x i8> %vbsl.i
+}
+
+define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
+; CHECK-LABEL: test_vbslq_s16:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
+  ret <8 x i16> %vbsl3.i
+}
+
+define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+; CHECK-LABEL: test_vbslq_s32:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3)
+  ret <4 x i32> %vbsl3.i
+}
+
+define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
+; CHECK-LABEL: test_vbslq_s64:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3)
+  ret <2 x i64> %vbsl3.i
+}
+
+define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
+; CHECK-LABEL: test_vbslq_u8:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
+  ret <16 x i8> %vbsl.i
+}
+
+define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
+; CHECK-LABEL: test_vbslq_u16:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
+  ret <8 x i16> %vbsl3.i
+}
+
+define <4 x i32> @test_vbslq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+; CHECK-LABEL: test_vbslq_u32:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3)
+  ret <4 x i32> %vbsl3.i
+}
+
+define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
+; CHECK-LABEL: test_vbslq_u64:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3)
+  ret <2 x i64> %vbsl3.i
+}
+
+define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) {
+; CHECK-LABEL: test_vbslq_f32:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl.i = bitcast <4 x i32> %v1 to <4 x float>
+  %vbsl3.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %vbsl.i, <4 x float> %v2, <4 x float> %v3)
+  ret <4 x float> %vbsl3.i
+}
+
+define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
+; CHECK-LABEL: test_vbslq_p8:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
+  ret <16 x i8> %vbsl.i
+}
+
+define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
+; CHECK-LABEL: test_vbslq_p16:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
+  ret <8 x i16> %vbsl3.i
+}
+
+define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) {
+; CHECK-LABEL: test_vbslq_f64:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vbsl.i = bitcast <2 x i64> %v1 to <2 x double>
+  %vbsl3.i = tail call <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double> %vbsl.i, <2 x double> %v2, <2 x double> %v3)
+  ret <2 x double> %vbsl3.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll
index 0848f9b..68f0342 100644
--- a/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -51,8 +51,7 @@ define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
 
 define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
 ;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 	%tmp3 = icmp ne <8 x i8> %A, %B;
    %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
 	ret <8 x i8> %tmp4
@@ -60,8 +59,7 @@ define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
 
 define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
 ;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 	%tmp3 = icmp ne <16 x i8> %A, %B;
    %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
 	ret <16 x i8> %tmp4
@@ -69,8 +67,7 @@ define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
 
 define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
 ;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 	%tmp3 = icmp ne <4 x i16> %A, %B;
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
 	ret <4 x i16> %tmp4
@@ -78,8 +75,7 @@ define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
 
 define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
 ;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 	%tmp3 = icmp ne <8 x i16> %A, %B;
    %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
 	ret <8 x i16> %tmp4
@@ -87,8 +83,7 @@ define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
 
 define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
 ;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 	%tmp3 = icmp ne <2 x i32> %A, %B;
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -96,8 +91,7 @@ define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
 
 define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
 ;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 	%tmp3 = icmp ne <4 x i32> %A, %B;
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -105,8 +99,7 @@ define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
 
 define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
 ;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 	%tmp3 = icmp ne <2 x i64> %A, %B;
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -867,8 +860,7 @@ define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
 
 define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
 ;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 	%tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
    %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
 	ret <8 x i8> %tmp4
@@ -876,8 +868,7 @@ define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
 
 define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
 ;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 	%tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
    %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
 	ret <16 x i8> %tmp4
@@ -885,8 +876,7 @@ define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
 
 define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
 ;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 	%tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
 	ret <4 x i16> %tmp4
@@ -894,8 +884,7 @@ define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
 
 define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
 ;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 	%tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
    %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
 	ret <8 x i16> %tmp4
@@ -903,8 +892,7 @@ define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
 
 define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
 ;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 	%tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -912,8 +900,7 @@ define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
 
 define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
 ;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 	%tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -921,8 +908,7 @@ define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
 
 define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
 ;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 	%tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1369,8 +1355,7 @@ define <2 x i32> @fcmuno2xfloat(<2 x float> %A, <2 x float> %B) {
 ;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
 ;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
 ;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp uno <2 x float> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1382,8 +1367,7 @@ define <4 x i32> @fcmuno4xfloat(<4 x float> %A, <4 x float> %B) {
 ;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
 ;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
 ;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp uno <4 x float> %A, %B
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1395,8 +1379,7 @@ define <2 x i64> @fcmuno2xdouble(<2 x double> %A, <2 x double> %B) {
 ;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
 ;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
 ;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp uno <2 x double> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1408,8 +1391,7 @@ define <2 x i32> @fcmueq2xfloat(<2 x float> %A, <2 x float> %B) {
 ;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
 ;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
 ;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp ueq <2 x float> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1421,8 +1403,7 @@ define <4 x i32> @fcmueq4xfloat(<4 x float> %A, <4 x float> %B) {
 ;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
 ;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
 ;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ueq <4 x float> %A, %B
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1434,8 +1415,7 @@ define <2 x i64> @fcmueq2xdouble(<2 x double> %A, <2 x double> %B) {
 ;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
 ;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
 ;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ueq <2 x double> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1445,8 +1425,7 @@ define <2 x i32> @fcmuge2xfloat(<2 x float> %A, <2 x float> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; UGE = ULE with swapped operands, ULE implemented as !OGT.
 ;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp uge <2 x float> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1456,8 +1435,7 @@ define <4 x i32> @fcmuge4xfloat(<4 x float> %A, <4 x float> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; UGE = ULE with swapped operands, ULE implemented as !OGT.
 ;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp uge <4 x float> %A, %B
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1467,8 +1445,7 @@ define <2 x i64> @fcmuge2xdouble(<2 x double> %A, <2 x double> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; UGE = ULE with swapped operands, ULE implemented as !OGT.
 ;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp uge <2 x double> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1478,8 +1455,7 @@ define <2 x i32> @fcmugt2xfloat(<2 x float> %A, <2 x float> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; UGT = ULT with swapped operands, ULT implemented as !OGE.
 ;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp ugt <2 x float> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1489,16 +1465,14 @@ define <4 x i32> @fcmugt4xfloat(<4 x float> %A, <4 x float> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; UGT = ULT with swapped operands, ULT implemented as !OGE.
 ;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ugt <4 x float> %A, %B
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
 }
 define <2 x i64> @fcmugt2xdouble(<2 x double> %A, <2 x double> %B) {
 ;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ugt <2 x double> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1508,8 +1482,7 @@ define <2 x i32> @fcmule2xfloat(<2 x float> %A, <2 x float> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; ULE implemented as !OGT.
 ;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp ule <2 x float> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1519,8 +1492,7 @@ define <4 x i32> @fcmule4xfloat(<4 x float> %A, <4 x float> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; ULE implemented as !OGT.
 ;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ule <4 x float> %A, %B
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1529,8 +1501,7 @@ define <2 x i64> @fcmule2xdouble(<2 x double> %A, <2 x double> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; ULE implemented as !OGT.
 ;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ule <2 x double> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1540,8 +1511,7 @@ define <2 x i32> @fcmult2xfloat(<2 x float> %A, <2 x float> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; ULT implemented as !OGE.
 ;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp ult <2 x float> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1551,8 +1521,7 @@ define <4 x i32> @fcmult4xfloat(<4 x float> %A, <4 x float> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; ULT implemented as !OGE.
 ;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ult <4 x float> %A, %B
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1561,8 +1530,7 @@ define <2 x i64> @fcmult2xdouble(<2 x double> %A, <2 x double> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; ULT implemented as !OGE.
 ;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ult <2 x double> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1572,8 +1540,7 @@ define <2 x i32> @fcmune2xfloat(<2 x float> %A, <2 x float> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; UNE = !OEQ.
 ;CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp une <2 x float> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1583,8 +1550,7 @@ define <4 x i32> @fcmune4xfloat(<4 x float> %A, <4 x float> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; UNE = !OEQ.
 ;CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp une <4 x float> %A, %B
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1593,8 +1559,7 @@ define <2 x i64> @fcmune2xdouble(<2 x double> %A, <2 x double> %B) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; UNE = !OEQ.
 ;CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp une <2 x double> %A, %B
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1766,8 +1731,7 @@ define <2 x i32> @fcmueqz2xfloat(<2 x float> %A) {
 ;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
 ;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
 ;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp ueq <2 x float> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1778,8 +1742,7 @@ define <4 x i32> @fcmueqz4xfloat(<4 x float> %A) {
 ;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
 ;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
 ;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ueq <4 x float> %A, zeroinitializer
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1790,8 +1753,7 @@ define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) {
 ;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
 ;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
 ;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ueq <2 x double> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1800,8 +1762,7 @@ define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) {
 define <2 x i32> @fcmugez2xfloat(<2 x float> %A) {
 ; UGE with zero = !OLT
 ;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp uge <2 x float> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1810,8 +1771,7 @@ define <2 x i32> @fcmugez2xfloat(<2 x float> %A) {
 define <4 x i32> @fcmugez4xfloat(<4 x float> %A) {
 ; UGE with zero = !OLT
 ;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp uge <4 x float> %A, zeroinitializer
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1819,8 +1779,7 @@ define <4 x i32> @fcmugez4xfloat(<4 x float> %A) {
 define <2 x i64> @fcmugez2xdouble(<2 x double> %A) {
 ; UGE with zero = !OLT
 ;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp uge <2 x double> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1829,8 +1788,7 @@ define <2 x i64> @fcmugez2xdouble(<2 x double> %A) {
 define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) {
 ; UGT with zero = !OLE
 ;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp ugt <2 x float> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1839,8 +1797,7 @@ define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) {
 define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) {
 ; UGT with zero = !OLE
 ;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ugt <4 x float> %A, zeroinitializer
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1848,8 +1805,7 @@ define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) {
 define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) {
 ; UGT with zero = !OLE
 ;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ugt <2 x double> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1858,8 +1814,7 @@ define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) {
 define <2 x i32> @fcmultz2xfloat(<2 x float> %A) {
 ; ULT with zero = !OGE
 ;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp ult <2 x float> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1867,8 +1822,7 @@ define <2 x i32> @fcmultz2xfloat(<2 x float> %A) {
 
 define <4 x i32> @fcmultz4xfloat(<4 x float> %A) {
 ;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ult <4 x float> %A, zeroinitializer
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1876,8 +1830,7 @@ define <4 x i32> @fcmultz4xfloat(<4 x float> %A) {
 
 define <2 x i64> @fcmultz2xdouble(<2 x double> %A) {
 ;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ult <2 x double> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1887,8 +1840,7 @@ define <2 x i64> @fcmultz2xdouble(<2 x double> %A) {
 define <2 x i32> @fcmulez2xfloat(<2 x float> %A) {
 ; ULE with zero = !OGT
 ;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp ule <2 x float> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1897,8 +1849,7 @@ define <2 x i32> @fcmulez2xfloat(<2 x float> %A) {
 define <4 x i32> @fcmulez4xfloat(<4 x float> %A) {
 ; ULE with zero = !OGT
 ;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ule <4 x float> %A, zeroinitializer
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1907,8 +1858,7 @@ define <4 x i32> @fcmulez4xfloat(<4 x float> %A) {
 define <2 x i64> @fcmulez2xdouble(<2 x double> %A) {
 ; ULE with zero = !OGT
 ;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp ule <2 x double> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1917,8 +1867,7 @@ define <2 x i64> @fcmulez2xdouble(<2 x double> %A) {
 define <2 x i32> @fcmunez2xfloat(<2 x float> %A) {
 ; UNE with zero = !OEQ with zero
 ;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp une <2 x float> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1927,8 +1876,7 @@ define <2 x i32> @fcmunez2xfloat(<2 x float> %A) {
 define <4 x i32> @fcmunez4xfloat(<4 x float> %A) {
 ; UNE with zero = !OEQ with zero
 ;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp une <4 x float> %A, zeroinitializer
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1936,8 +1884,7 @@ define <4 x i32> @fcmunez4xfloat(<4 x float> %A) {
 define <2 x i64> @fcmunez2xdouble(<2 x double> %A) {
 ; UNE with zero = !OEQ with zero
 ;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp une <2 x double> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
@@ -1949,8 +1896,7 @@ define <2 x i32> @fcmunoz2xfloat(<2 x float> %A) {
 ;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
 ;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
 ;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %tmp3 = fcmp uno <2 x float> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
 	ret <2 x i32> %tmp4
@@ -1961,8 +1907,7 @@ define <4 x i32> @fcmunoz4xfloat(<4 x float> %A) {
 ;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
 ;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
 ;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp uno <4 x float> %A, zeroinitializer
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
 	ret <4 x i32> %tmp4
@@ -1973,8 +1918,7 @@ define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) {
 ;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
 ;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
 ;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
    %tmp3 = fcmp uno <2 x double> %A, zeroinitializer
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
 	ret <2 x i64> %tmp4
diff --git a/test/CodeGen/AArch64/neon-copy.ll b/test/CodeGen/AArch64/neon-copy.ll
new file mode 100644
index 0000000..e18530e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-copy.ll
@@ -0,0 +1,615 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+
+define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) {
+;CHECK: ins {{v[0-31]+}}.b[15], {{w[0-31]+}}
+  %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) {
+;CHECK: ins {{v[0-31]+}}.h[6], {{w[0-31]+}}
+  %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6
+  ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[2], {{w[0-31]+}}
+  %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2
+  ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[1], {{x[0-31]+}}
+  %tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1
+  ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) {
+;CHECK: ins {{v[0-31]+}}.b[5], {{w[0-31]+}}
+  %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5
+  ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) {
+;CHECK: ins {{v[0-31]+}}.h[3], {{w[0-31]+}}
+  %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3
+  ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{w[0-31]+}}
+  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
+  ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.b[15], {{v[0-31]+}}.b[2]
+  %tmp3 = extractelement <16 x i8> %tmp1, i32 2
+  %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
+  ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.h[7], {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+  %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2]
+  %tmp3 = extractelement <4 x i32> %tmp1, i32 2
+  %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <2 x i64> %tmp1, i32 0
+  %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
+  ret <2 x i64> %tmp4
+}
+
+define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2]
+  %tmp3 = extractelement <4 x float> %tmp1, i32 2
+  %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
+  ret <4 x float> %tmp4
+}
+
+define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <2 x double> %tmp1, i32 0
+  %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
+  ret <2 x double> %tmp4
+}
+
+define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.b[15], {{v[0-31]+}}.b[2]
+  %tmp3 = extractelement <8 x i8> %tmp1, i32 2
+  %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
+  ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.h[7], {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+  %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1]
+  %tmp3 = extractelement <2 x i32> %tmp1, i32 1
+  %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <1 x i64> %tmp1, i32 0
+  %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
+  ret <2 x i64> %tmp4
+}
+
+define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1]
+  %tmp3 = extractelement <2 x float> %tmp1, i32 1
+  %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
+  ret <4 x float> %tmp4
+}
+
+define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <1 x double> %tmp1, i32 0
+  %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
+  ret <2 x double> %tmp4
+}
+
+define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.b[7], {{v[0-31]+}}.b[2]
+  %tmp3 = extractelement <16 x i8> %tmp1, i32 2
+  %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7
+  ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.h[3], {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+  %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
+  ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2]
+  %tmp3 = extractelement <4 x i32> %tmp1, i32 2
+  %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
+  ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <2 x i64> %tmp1, i32 0
+  %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
+  ret <1 x i64> %tmp4
+}
+
+define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2]
+  %tmp3 = extractelement <4 x float> %tmp1, i32 2
+  %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
+  ret <2 x float> %tmp4
+}
+
+define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <2 x double> %tmp1, i32 0
+  %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
+  ret <1 x double> %tmp4
+}
+
+define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.b[4], {{v[0-31]+}}.b[2]
+  %tmp3 = extractelement <8 x i8> %tmp1, i32 2
+  %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4
+  ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.h[3], {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+  %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
+  ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[0]
+  %tmp3 = extractelement <2 x i32> %tmp1, i32 0
+  %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
+  ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <1 x i64> %tmp1, i32 0
+  %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
+  ret <1 x i64> %tmp4
+}
+
+define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[0]
+  %tmp3 = extractelement <2 x float> %tmp1, i32 0
+  %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
+  ret <2 x float> %tmp4
+}
+
+define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <1 x double> %tmp1, i32 0
+  %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
+  ret <1 x double> %tmp4
+}
+
+define i32 @umovw16b(<16 x i8> %tmp1) {
+;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.b[8]
+  %tmp3 = extractelement <16 x i8> %tmp1, i32 8
+  %tmp4 = zext i8 %tmp3 to i32
+  ret i32 %tmp4
+}
+
+define i32 @umovw8h(<8 x i16> %tmp1) {
+;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+  %tmp4 = zext i16 %tmp3 to i32
+  ret i32 %tmp4
+}
+
+define i32 @umovw4s(<4 x i32> %tmp1) {
+;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.s[2]
+  %tmp3 = extractelement <4 x i32> %tmp1, i32 2
+  ret i32 %tmp3
+}
+
+define i64 @umovx2d(<2 x i64> %tmp1) {
+;CHECK: umov {{x[0-31]+}}, {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <2 x i64> %tmp1, i32 0
+  ret i64 %tmp3
+}
+
+define i32 @umovw8b(<8 x i8> %tmp1) {
+;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.b[7]
+  %tmp3 = extractelement <8 x i8> %tmp1, i32 7
+  %tmp4 = zext i8 %tmp3 to i32
+  ret i32 %tmp4
+}
+
+define i32 @umovw4h(<4 x i16> %tmp1) {
+;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+  %tmp4 = zext i16 %tmp3 to i32
+  ret i32 %tmp4
+}
+
+define i32 @umovw2s(<2 x i32> %tmp1) {
+;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.s[1]
+  %tmp3 = extractelement <2 x i32> %tmp1, i32 1
+  ret i32 %tmp3
+}
+
+define i64 @umovx1d(<1 x i64> %tmp1) {
+;CHECK: fmov {{x[0-31]+}}, {{d[0-31]+}}
+  %tmp3 = extractelement <1 x i64> %tmp1, i32 0
+  ret i64 %tmp3
+}
+
+define i32 @smovw16b(<16 x i8> %tmp1) {
+;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.b[8]
+  %tmp3 = extractelement <16 x i8> %tmp1, i32 8
+  %tmp4 = sext i8 %tmp3 to i32
+  %tmp5 = add i32 5, %tmp4
+  ret i32 %tmp5
+}
+
+define i32 @smovw8h(<8 x i16> %tmp1) {
+;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+  %tmp4 = sext i16 %tmp3 to i32
+  %tmp5 = add i32 5, %tmp4
+  ret i32 %tmp5
+}
+
+define i32 @smovx16b(<16 x i8> %tmp1) {
+;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.b[8]
+  %tmp3 = extractelement <16 x i8> %tmp1, i32 8
+  %tmp4 = sext i8 %tmp3 to i32
+  ret i32 %tmp4
+}
+
+define i32 @smovx8h(<8 x i16> %tmp1) {
+;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+  %tmp4 = sext i16 %tmp3 to i32
+  ret i32 %tmp4
+}
+
+define i64 @smovx4s(<4 x i32> %tmp1) {
+;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.s[2]
+  %tmp3 = extractelement <4 x i32> %tmp1, i32 2
+  %tmp4 = sext i32 %tmp3 to i64
+  ret i64 %tmp4
+}
+
+define i32 @smovw8b(<8 x i8> %tmp1) {
+;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.b[4]
+  %tmp3 = extractelement <8 x i8> %tmp1, i32 4
+  %tmp4 = sext i8 %tmp3 to i32
+  %tmp5 = add i32 5, %tmp4
+  ret i32 %tmp5
+}
+
+define i32 @smovw4h(<4 x i16> %tmp1) {
+;CHECK: smov {{w[0-31]+}}, {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+  %tmp4 = sext i16 %tmp3 to i32
+  %tmp5 = add i32 5, %tmp4
+  ret i32 %tmp5
+}
+
+define i32 @smovx8b(<8 x i8> %tmp1) {
+;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.b[6]
+  %tmp3 = extractelement <8 x i8> %tmp1, i32 6
+  %tmp4 = sext i8 %tmp3 to i32
+  ret i32 %tmp4
+}
+
+define i32 @smovx4h(<4 x i16> %tmp1) {
+;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+  %tmp4 = sext i16 %tmp3 to i32
+  ret i32 %tmp4
+}
+
+define i64 @smovx2s(<2 x i32> %tmp1) {
+;CHECK: smov {{x[0-31]+}}, {{v[0-31]+}}.s[1]
+  %tmp3 = extractelement <2 x i32> %tmp1, i32 1
+  %tmp4 = sext i32 %tmp3 to i64
+  ret i64 %tmp4
+}
+
+define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins  {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3]
+  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
+  ret <8 x i8> %vset_lane
+}
+
+define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins  {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6]
+  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
+  ret <16 x i8> %vset_lane
+}
+
+define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0]
+  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
+  ret <8 x i8> %vset_lane
+}
+
+define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15]
+  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vset_lane
+}
+
+define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+  %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0
+  %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1
+  %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2
+  %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 %v1, i32 3
+  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 %v1, i32 4
+  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 %v1, i32 5
+  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 %v1, i32 6
+  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 %v1, i32 7
+  ret <8 x i8> %vecinit7.i
+}
+
+define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+  %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0
+  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1
+  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2
+  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %v1, i32 3
+  ret <4 x i16> %vecinit3.i
+}
+
+define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
+  %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0
+  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1
+  ret <2 x i32> %vecinit1.i
+}
+
+define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 {
+;CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+  %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0
+  ret <1 x i64> %vecinit.i
+}
+
+define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+  %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0
+  %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1
+  %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2
+  %vecinit3.i = insertelement <16 x i8> %vecinit2.i, i8 %v1, i32 3
+  %vecinit4.i = insertelement <16 x i8> %vecinit3.i, i8 %v1, i32 4
+  %vecinit5.i = insertelement <16 x i8> %vecinit4.i, i8 %v1, i32 5
+  %vecinit6.i = insertelement <16 x i8> %vecinit5.i, i8 %v1, i32 6
+  %vecinit7.i = insertelement <16 x i8> %vecinit6.i, i8 %v1, i32 7
+  %vecinit8.i = insertelement <16 x i8> %vecinit7.i, i8 %v1, i32 8
+  %vecinit9.i = insertelement <16 x i8> %vecinit8.i, i8 %v1, i32 9
+  %vecinit10.i = insertelement <16 x i8> %vecinit9.i, i8 %v1, i32 10
+  %vecinit11.i = insertelement <16 x i8> %vecinit10.i, i8 %v1, i32 11
+  %vecinit12.i = insertelement <16 x i8> %vecinit11.i, i8 %v1, i32 12
+  %vecinit13.i = insertelement <16 x i8> %vecinit12.i, i8 %v1, i32 13
+  %vecinit14.i = insertelement <16 x i8> %vecinit13.i, i8 %v1, i32 14
+  %vecinit15.i = insertelement <16 x i8> %vecinit14.i, i8 %v1, i32 15
+  ret <16 x i8> %vecinit15.i
+}
+
+define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+  %vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %v1, i32 3
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %v1, i32 4
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %v1, i32 5
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %v1, i32 6
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %v1, i32 7
+  ret <8 x i16> %vecinit7.i
+}
+
+define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+  %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0
+  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1
+  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2
+  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %v1, i32 3
+  ret <4 x i32> %vecinit3.i
+}
+
+define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
+  %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0
+  %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1
+  ret <2 x i64> %vecinit1.i
+}
+
+define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
+;CHECK: {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+  %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %shuffle
+}
+
+define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+  %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %shuffle
+}
+
+define i64 @test_bitcastv8i8toi64(<8 x i8> %in) {
+; CHECK-LABEL: test_bitcastv8i8toi64:
+   %res = bitcast <8 x i8> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+   ret i64 %res
+}
+
+define i64 @test_bitcastv4i16toi64(<4 x i16> %in) {
+; CHECK-LABEL: test_bitcastv4i16toi64:
+   %res = bitcast <4 x i16> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+   ret i64 %res
+}
+
+define i64 @test_bitcastv2i32toi64(<2 x i32> %in) {
+; CHECK-LABEL: test_bitcastv2i32toi64:
+   %res = bitcast <2 x i32> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+   ret i64 %res
+}
+
+define i64 @test_bitcastv2f32toi64(<2 x float> %in) {
+; CHECK-LABEL: test_bitcastv2f32toi64:
+   %res = bitcast <2 x float> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+   ret i64 %res
+}
+
+define i64 @test_bitcastv1i64toi64(<1 x i64> %in) {
+; CHECK-LABEL: test_bitcastv1i64toi64:
+   %res = bitcast <1 x i64> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+   ret i64 %res
+}
+
+define i64 @test_bitcastv1f64toi64(<1 x double> %in) {
+; CHECK-LABEL: test_bitcastv1f64toi64:
+   %res = bitcast <1 x double> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+   ret i64 %res
+}
+
+define <8 x i8> @test_bitcasti64tov8i8(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov8i8:
+   %res = bitcast i64 %in to <8 x i8>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+   ret <8 x i8> %res
+}
+
+define <4 x i16> @test_bitcasti64tov4i16(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov4i16:
+   %res = bitcast i64 %in to <4 x i16>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+   ret <4 x i16> %res
+}
+
+define <2 x i32> @test_bitcasti64tov2i32(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov2i32:
+   %res = bitcast i64 %in to <2 x i32>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+   ret <2 x i32> %res
+}
+
+define <2 x float> @test_bitcasti64tov2f32(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov2f32:
+   %res = bitcast i64 %in to <2 x float>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+   ret <2 x float> %res
+}
+
+define <1 x i64> @test_bitcasti64tov1i64(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov1i64:
+   %res = bitcast i64 %in to <1 x i64>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+   ret <1 x i64> %res
+}
+
+define <1 x double> @test_bitcasti64tov1f64(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov1f64:
+   %res = bitcast i64 %in to <1 x double>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+   ret <1 x double> %res
+}
+\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-crypto.ll b/test/CodeGen/AArch64/neon-crypto.ll
new file mode 100644
index 0000000..0283e0e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-crypto.ll
@@ -0,0 +1,149 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -mattr=+crypto | FileCheck %s
+; RUN: not llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
+
+declare <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.aarch64.neon.sha1m(<4 x i32>, <1 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.aarch64.neon.sha1p(<4 x i32>, <1 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.aarch64.neon.sha1c(<4 x i32>, <1 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32>, <4 x i32>) #1
+
+declare <1 x i32> @llvm.arm.neon.sha1h.v1i32(<1 x i32>) #1
+
+declare <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8>) #1
+
+declare <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8>) #1
+
+declare <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8>, <16 x i8>) #1
+
+declare <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8>, <16 x i8>) #1
+
+define <16 x i8> @test_vaeseq_u8(<16 x i8> %data, <16 x i8> %key) {
+; CHECK: test_vaeseq_u8:
+; CHECK: aese {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NO-CRYPTO: Cannot select: intrinsic %llvm.arm.neon.aese
+entry:
+  %aese.i = tail call <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8> %data, <16 x i8> %key)
+  ret <16 x i8> %aese.i
+}
+
+define <16 x i8> @test_vaesdq_u8(<16 x i8> %data, <16 x i8> %key) {
+; CHECK: test_vaesdq_u8:
+; CHECK: aesd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %aesd.i = tail call <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8> %data, <16 x i8> %key)
+  ret <16 x i8> %aesd.i
+}
+
+define <16 x i8> @test_vaesmcq_u8(<16 x i8> %data) {
+; CHECK: test_vaesmcq_u8:
+; CHECK: aesmc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %aesmc.i = tail call <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8> %data)
+  ret <16 x i8> %aesmc.i
+}
+
+define <16 x i8> @test_vaesimcq_u8(<16 x i8> %data) {
+; CHECK: test_vaesimcq_u8:
+; CHECK: aesimc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %aesimc.i = tail call <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8> %data)
+  ret <16 x i8> %aesimc.i
+}
+
+define i32 @test_vsha1h_u32(i32 %hash_e) {
+; CHECK: test_vsha1h_u32:
+; CHECK: sha1h {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %sha1h.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
+  %sha1h1.i = tail call <1 x i32> @llvm.arm.neon.sha1h.v1i32(<1 x i32> %sha1h.i)
+  %0 = extractelement <1 x i32> %sha1h1.i, i32 0
+  ret i32 %0
+}
+
+define <4 x i32> @test_vsha1su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w12_15) {
+; CHECK: test_vsha1su1q_u32:
+; CHECK: sha1su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %sha1su12.i = tail call <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32> %tw0_3, <4 x i32> %w12_15)
+  ret <4 x i32> %sha1su12.i
+}
+
+define <4 x i32> @test_vsha256su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7) {
+; CHECK: test_vsha256su0q_u32:
+; CHECK: sha256su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %sha256su02.i = tail call <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32> %w0_3, <4 x i32> %w4_7)
+  ret <4 x i32> %sha256su02.i
+}
+
+define <4 x i32> @test_vsha1cq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK: test_vsha1cq_u32:
+; CHECK: sha1c {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %sha1c.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
+  %sha1c1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1c(<4 x i32> %hash_abcd, <1 x i32> %sha1c.i, <4 x i32> %wk)
+  ret <4 x i32> %sha1c1.i
+}
+
+define <4 x i32> @test_vsha1pq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK: test_vsha1pq_u32:
+; CHECK: sha1p {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %sha1p.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
+  %sha1p1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1p(<4 x i32> %hash_abcd, <1 x i32> %sha1p.i, <4 x i32> %wk)
+  ret <4 x i32> %sha1p1.i
+}
+
+define <4 x i32> @test_vsha1mq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK: test_vsha1mq_u32:
+; CHECK: sha1m {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %sha1m.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
+  %sha1m1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1m(<4 x i32> %hash_abcd, <1 x i32> %sha1m.i, <4 x i32> %wk)
+  ret <4 x i32> %sha1m1.i
+}
+
+define <4 x i32> @test_vsha1su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11) {
+; CHECK: test_vsha1su0q_u32:
+; CHECK: sha1su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %sha1su03.i = tail call <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11)
+  ret <4 x i32> %sha1su03.i
+}
+
+define <4 x i32> @test_vsha256hq_u32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) {
+; CHECK: test_vsha256hq_u32:
+; CHECK: sha256h {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %sha256h3.i = tail call <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+  ret <4 x i32> %sha256h3.i
+}
+
+define <4 x i32> @test_vsha256h2q_u32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) {
+; CHECK: test_vsha256h2q_u32:
+; CHECK: sha256h2 {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %sha256h23.i = tail call <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+  ret <4 x i32> %sha256h23.i
+}
+
+define <4 x i32> @test_vsha256su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) {
+; CHECK: test_vsha256su1q_u32:
+; CHECK: sha256su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %sha256su13.i = tail call <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+  ret <4 x i32> %sha256su13.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-diagnostics.ll b/test/CodeGen/AArch64/neon-diagnostics.ll
new file mode 100644
index 0000000..f546aa7
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-diagnostics.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfma_lane_f32:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+  %mul = fmul <2 x float> %shuffle, %b
+  %add = fadd <2 x float> %mul, %a
+  ret <2 x float> %add
+}
+
+define <4 x i32> @test_vshrn_not_match(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vshrn_not_match
+; CHECK-NOT: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #35
+  %1 = bitcast <2 x i32> %a to <1 x i64>
+  %2 = ashr <2 x i64> %b, <i64 35, i64 35>
+  %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
+  %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+  %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %4
+}
+\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-extract.ll b/test/CodeGen/AArch64/neon-extract.ll
new file mode 100644
index 0000000..5c52cd3
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-extract.ll
@@ -0,0 +1,190 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vext_s8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+entry:
+  %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  ret <8 x i8> %vext
+}
+
+define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vext_s16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+entry:
+  %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i16> %vext
+}
+
+define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vext_s32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+entry:
+  %vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2>
+  ret <2 x i32> %vext
+}
+
+define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK: test_vext_s64:
+entry:
+  %vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 0>
+  ret <1 x i64> %vext
+}
+
+define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vextq_s8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+entry:
+  %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+  ret <16 x i8> %vext
+}
+
+define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vextq_s16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+entry:
+  %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x i16> %vext
+}
+
+define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vextq_s32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+entry:
+  %vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i32> %vext
+}
+
+define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vextq_s64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+entry:
+  %vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
+  ret <2 x i64> %vext
+}
+
+define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vext_u8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+entry:
+  %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  ret <8 x i8> %vext
+}
+
+define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vext_u16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+entry:
+  %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i16> %vext
+}
+
+define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vext_u32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+entry:
+  %vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2>
+  ret <2 x i32> %vext
+}
+
+define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK: test_vext_u64:
+entry:
+  %vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 0>
+  ret <1 x i64> %vext
+}
+
+define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vextq_u8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+entry:
+  %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+  ret <16 x i8> %vext
+}
+
+define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vextq_u16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+entry:
+  %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x i16> %vext
+}
+
+define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vextq_u32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+entry:
+  %vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i32> %vext
+}
+
+define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vextq_u64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+entry:
+  %vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
+  ret <2 x i64> %vext
+}
+
+define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vext_f32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+entry:
+  %vext = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %vext
+}
+
+define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK: test_vext_f64:
+entry:
+  %vext = shufflevector <1 x double> %a, <1 x double> %b, <1 x i32> <i32 0>
+  ret <1 x double> %vext
+}
+
+define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vextq_f32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+entry:
+  %vext = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x float> %vext
+}
+
+define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vextq_f64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+entry:
+  %vext = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2>
+  ret <2 x double> %vext
+}
+
+define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vext_p8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+entry:
+  %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  ret <8 x i8> %vext
+}
+
+define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vext_p16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+entry:
+  %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i16> %vext
+}
+
+define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vextq_p8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+entry:
+  %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+  ret <16 x i8> %vext
+}
+
+define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vextq_p16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+entry:
+  %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+  ret <8 x i16> %vext
+}
diff --git a/test/CodeGen/AArch64/neon-misc-scalar.ll b/test/CodeGen/AArch64/neon-misc-scalar.ll
new file mode 100644
index 0000000..cca8deb
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-misc-scalar.ll
@@ -0,0 +1,60 @@
+;RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>)
+
+declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>)
+
+declare <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64>)
+
+declare <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>)
+
+declare <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) {
+entry:
+  ; CHECK: test_vuqadd_s64
+  %vuqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
+  ; CHECK: suqadd d{{[0-9]+}}, d{{[0-9]+}}
+  ret <1 x i64> %vuqadd2.i
+}
+
+define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) {
+entry:
+  ; CHECK: test_vsqadd_u64
+  %vsqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
+  ; CHECK: usqadd d{{[0-9]+}}, d{{[0-9]+}}
+  ret <1 x i64> %vsqadd2.i
+}
+
+define <1 x i64> @test_vabs_s64(<1 x i64> %a) {
+  ; CHECK: test_vabs_s64
+entry:
+  %vabs1.i = tail call <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64> %a)
+  ; CHECK: abs d{{[0-9]+}}, d{{[0-9]+}}
+  ret <1 x i64> %vabs1.i
+}
+
+define <1 x i64> @test_vqabs_s64(<1 x i64> %a) {
+  ; CHECK: test_vqabs_s64
+entry:
+  %vqabs1.i = tail call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %a)
+  ; CHECK: sqabs d{{[0-9]+}}, d{{[0-9]+}}
+  ret <1 x i64> %vqabs1.i
+}
+
+define <1 x i64> @test_vqneg_s64(<1 x i64> %a) {
+  ; CHECK: test_vqneg_s64
+entry:
+  %vqneg1.i = tail call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %a)
+  ; CHECK: sqneg d{{[0-9]+}}, d{{[0-9]+}}
+  ret <1 x i64> %vqneg1.i
+}
+
+define <1 x i64> @test_vneg_s64(<1 x i64> %a) {
+  ; CHECK: test_vneg_s64
+entry:
+  %sub.i = sub <1 x i64> zeroinitializer, %a
+  ; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
+  ret <1 x i64> %sub.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-misc.ll b/test/CodeGen/AArch64/neon-misc.ll
new file mode 100644
index 0000000..9660bf2
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-misc.ll
@@ -0,0 +1,1799 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+
+define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
+; CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
+; CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+  ret <16 x i8> %shuffle.i
+}
+
+define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
+; CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
+; CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x i16> %shuffle.i
+}
+
+define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
+; CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
+; CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x i32> %shuffle.i
+}
+
+define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %shuffle.i = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x float> %shuffle.i
+}
+
+define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+  ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x float> %shuffle.i
+}
+
+define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
+  %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4
+  ret <4 x i16> %vpaddl.i
+}
+
+define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
+  %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a) #4
+  ret <2 x i32> %vpaddl1.i
+}
+
+define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
+  %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a) #4
+  ret <1 x i64> %vpaddl1.i
+}
+
+define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
+  %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4
+  ret <4 x i16> %vpaddl.i
+}
+
+define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
+  %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a) #4
+  ret <2 x i32> %vpaddl1.i
+}
+
+define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
+  %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) #4
+  ret <1 x i64> %vpaddl1.i
+}
+
+define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
+  %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4
+  ret <8 x i16> %vpaddl.i
+}
+
+define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
+  %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) #4
+  ret <4 x i32> %vpaddl1.i
+}
+
+define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
+  %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) #4
+  ret <2 x i64> %vpaddl1.i
+}
+
+define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
+  %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4
+  ret <8 x i16> %vpaddl.i
+}
+
+define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
+  %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) #4
+  ret <4 x i32> %vpaddl1.i
+}
+
+define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
+  %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) #4
+  ret <2 x i64> %vpaddl1.i
+}
+
+define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
+  %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4
+  ret <4 x i16> %vpadal1.i
+}
+
+define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
+  %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4
+  ret <2 x i32> %vpadal2.i
+}
+
+define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
+  %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4
+  ret <1 x i64> %vpadal2.i
+}
+
+define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
+  %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4
+  ret <4 x i16> %vpadal1.i
+}
+
+define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
+  %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4
+  ret <2 x i32> %vpadal2.i
+}
+
+define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
+  %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4
+  ret <1 x i64> %vpadal2.i
+}
+
+define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
+  %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4
+  ret <8 x i16> %vpadal1.i
+}
+
+define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
+  %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4
+  ret <4 x i32> %vpadal2.i
+}
+
+define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
+  %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4
+  ret <2 x i64> %vpadal2.i
+}
+
+define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
+  %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4
+  ret <8 x i16> %vpadal1.i
+}
+
+define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
+  %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4
+  ret <4 x i32> %vpadal2.i
+}
+
+define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
+  %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4
+  ret <2 x i64> %vpadal2.i
+}
+
+define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %vqabs.i = tail call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4
+  ret <8 x i8> %vqabs.i
+}
+
+define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %vqabs.i = tail call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4
+  ret <16 x i8> %vqabs.i
+}
+
+define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+  %vqabs1.i = tail call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) #4
+  ret <4 x i16> %vqabs1.i
+}
+
+define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+  %vqabs1.i = tail call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) #4
+  ret <8 x i16> %vqabs1.i
+}
+
+define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vqabs1.i = tail call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) #4
+  ret <2 x i32> %vqabs1.i
+}
+
+define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vqabs1.i = tail call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) #4
+  ret <4 x i32> %vqabs1.i
+}
+
+define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vqabs1.i = tail call <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64> %a) #4
+  ret <2 x i64> %vqabs1.i
+}
+
+define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %vqneg.i = tail call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4
+  ret <8 x i8> %vqneg.i
+}
+
+define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %vqneg.i = tail call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4
+  ret <16 x i8> %vqneg.i
+}
+
+define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+  %vqneg1.i = tail call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a) #4
+  ret <4 x i16> %vqneg1.i
+}
+
+define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+  %vqneg1.i = tail call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a) #4
+  ret <8 x i16> %vqneg1.i
+}
+
+define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vqneg1.i = tail call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a) #4
+  ret <2 x i32> %vqneg1.i
+}
+
+define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vqneg1.i = tail call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a) #4
+  ret <4 x i32> %vqneg1.i
+}
+
+define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vqneg1.i = tail call <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64> %a) #4
+  ret <2 x i64> %vqneg1.i
+}
+
+define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %sub.i = sub <8 x i8> zeroinitializer, %a
+  ret <8 x i8> %sub.i
+}
+
+define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %sub.i = sub <16 x i8> zeroinitializer, %a
+  ret <16 x i8> %sub.i
+}
+
+define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+  %sub.i = sub <4 x i16> zeroinitializer, %a
+  ret <4 x i16> %sub.i
+}
+
+define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+  %sub.i = sub <8 x i16> zeroinitializer, %a
+  ret <8 x i16> %sub.i
+}
+
+define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %sub.i = sub <2 x i32> zeroinitializer, %a
+  ret <2 x i32> %sub.i
+}
+
+define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %sub.i = sub <4 x i32> zeroinitializer, %a
+  ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %sub.i = sub <2 x i64> zeroinitializer, %a
+  ret <2 x i64> %sub.i
+}
+
+define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
+; CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
+  ret <2 x float> %sub.i
+}
+
+define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
+; CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+  ret <4 x float> %sub.i
+}
+
+define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 {
+; CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+  ret <2 x double> %sub.i
+}
+
+define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %vabs.i = tail call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4
+  ret <8 x i8> %vabs.i
+}
+
+define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %vabs.i = tail call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4
+  ret <16 x i8> %vabs.i
+}
+
+define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+  %vabs1.i = tail call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a) #4
+  ret <4 x i16> %vabs1.i
+}
+
+define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+  %vabs1.i = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a) #4
+  ret <8 x i16> %vabs1.i
+}
+
+define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vabs1.i = tail call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a) #4
+  ret <2 x i32> %vabs1.i
+}
+
+define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vabs1.i = tail call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a) #4
+  ret <4 x i32> %vabs1.i
+}
+
+define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vabs1.i = tail call <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64> %a) #4
+  ret <2 x i64> %vabs1.i
+}
+
+define <2 x float> @test_vabs_f32(<2 x float> %a) #1 {
+; CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vabs1.i = tail call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vabs1.i
+}
+
+define <4 x float> @test_vabsq_f32(<4 x float> %a) #1 {
+; CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vabs1.i = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vabs1.i
+}
+
+define <2 x double> @test_vabsq_f64(<2 x double> %a) #1 {
+; CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vabs1.i = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vabs1.i
+}
+
+define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %vuqadd.i = tail call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+  ret <8 x i8> %vuqadd.i
+}
+
+define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %vuqadd.i = tail call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+  ret <16 x i8> %vuqadd.i
+}
+
+define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+  %vuqadd2.i = tail call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+  ret <4 x i16> %vuqadd2.i
+}
+
+define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+  %vuqadd2.i = tail call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+  ret <8 x i16> %vuqadd2.i
+}
+
+define <2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vuqadd2.i = tail call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+  ret <2 x i32> %vuqadd2.i
+}
+
+define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vuqadd2.i = tail call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+  ret <4 x i32> %vuqadd2.i
+}
+
+define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vuqadd2.i = tail call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+  ret <2 x i64> %vuqadd2.i
+}
+
+define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %vcls.i = tail call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4
+  ret <8 x i8> %vcls.i
+}
+
+define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %vcls.i = tail call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4
+  ret <16 x i8> %vcls.i
+}
+
+define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+  %vcls1.i = tail call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a) #4
+  ret <4 x i16> %vcls1.i
+}
+
+define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+  %vcls1.i = tail call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a) #4
+  ret <8 x i16> %vcls1.i
+}
+
+define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcls1.i = tail call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a) #4
+  ret <2 x i32> %vcls1.i
+}
+
+define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcls1.i = tail call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a) #4
+  ret <4 x i32> %vcls1.i
+}
+
+define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
+  ret <8 x i8> %vclz.i
+}
+
+define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
+  ret <16 x i8> %vclz.i
+}
+
+define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+  %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #4
+  ret <4 x i16> %vclz1.i
+}
+
+define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+  %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #4
+  ret <8 x i16> %vclz1.i
+}
+
+define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #4
+  ret <2 x i32> %vclz1.i
+}
+
+define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #4
+  ret <4 x i32> %vclz1.i
+}
+
+define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
+; CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %vctpop.i = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+  ret <8 x i8> %vctpop.i
+}
+
+define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
+; CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %vctpop.i = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+  ret <16 x i8> %vctpop.i
+}
+
+define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
+; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %neg.i = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  ret <8 x i8> %neg.i
+}
+
+define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
+; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %neg.i = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  ret <16 x i8> %neg.i
+}
+
+define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
+; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %neg.i = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+  ret <4 x i16> %neg.i
+}
+
+define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
+; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %neg.i = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  ret <8 x i16> %neg.i
+}
+
+define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
+; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %neg.i = xor <2 x i32> %a, <i32 -1, i32 -1>
+  ret <2 x i32> %neg.i
+}
+
+define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
+; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %neg.i = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %neg.i
+}
+
+define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 {
+; CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+  %vrbit.i = tail call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #4
+  ret <8 x i8> %vrbit.i
+}
+
+define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 {
+; CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+  %vrbit.i = tail call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #4
+  ret <16 x i8> %vrbit.i
+}
+
+define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
+; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+  %vmovn.i = trunc <8 x i16> %a to <8 x i8>
+  ret <8 x i8> %vmovn.i
+}
+
+define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
+; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+  %vmovn.i = trunc <4 x i32> %a to <4 x i16>
+  ret <4 x i16> %vmovn.i
+}
+
+define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
+; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+  %vmovn.i = trunc <2 x i64> %a to <2 x i32>
+  ret <2 x i32> %vmovn.i
+}
+
+define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+; CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
+  %vmovn.i.i = trunc <8 x i16> %b to <8 x i8>
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vmovn.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+; CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
+  %vmovn.i.i = trunc <4 x i32> %b to <4 x i16>
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vmovn.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+; CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
+  %vmovn.i.i = trunc <2 x i64> %b to <2 x i32>
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vmovn.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+; CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+  %vqdmull1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a) #4
+  ret <8 x i8> %vqdmull1.i
+}
+
+define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+; CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+  %vqdmull1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a) #4
+  ret <4 x i16> %vqdmull1.i
+}
+
+define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+; CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+  %vqdmull1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a) #4
+  ret <2 x i32> %vqdmull1.i
+}
+
+define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+; CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
+  %vqdmull1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %b) #4
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqdmull1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+; CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
+  %vqdmull1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %b) #4
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqdmull1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+; CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
+  %vqdmull1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %b) #4
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqdmull1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+; CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+  %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a) #4
+  ret <8 x i8> %vqmovn1.i
+}
+
+define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+; CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+  %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a) #4
+  ret <4 x i16> %vqmovn1.i
+}
+
+define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+; CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+  %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a) #4
+  ret <2 x i32> %vqmovn1.i
+}
+
+define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+; CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
+  %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %b) #4
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+; CHECK: test_vqmovn_high_s32
+  %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %b) #4
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+; CHECK: test_vqmovn_high_s64
+  %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %b) #4
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
+; CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+  %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a) #4
+  ret <8 x i8> %vqmovn1.i
+}
+
+define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
+; CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+  %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a) #4
+  ret <4 x i16> %vqmovn1.i
+}
+
+define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
+; CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+  %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a) #4
+  ret <2 x i32> %vqmovn1.i
+}
+
+define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+; CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
+  %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %b) #4
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+; CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
+  %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %b) #4
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+; CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
+  %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %b) #4
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuffle.i
+}
+
+define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
+  %1 = sext <8 x i8> %a to <8 x i16>
+  %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  ret <8 x i16> %vshll_n
+}
+
+define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
+  %1 = sext <4 x i16> %a to <4 x i32>
+  %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
+  ret <4 x i32> %vshll_n
+}
+
+define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
+  %1 = sext <2 x i32> %a to <2 x i64>
+  %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
+  ret <2 x i64> %vshll_n
+}
+
+define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
+  %1 = zext <8 x i8> %a to <8 x i16>
+  %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  ret <8 x i16> %vshll_n
+}
+
+define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
+  %1 = zext <4 x i16> %a to <4 x i32>
+  %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
+  ret <4 x i32> %vshll_n
+}
+
+define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
+  %1 = zext <2 x i32> %a to <2 x i64>
+  %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
+  ret <2 x i64> %vshll_n
+}
+
+define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %1 = sext <8 x i8> %shuffle.i to <8 x i16>
+  %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  ret <8 x i16> %vshll_n
+}
+
+define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = sext <4 x i16> %shuffle.i to <4 x i32>
+  %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
+  ret <4 x i32> %vshll_n
+}
+
+define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %1 = sext <2 x i32> %shuffle.i to <2 x i64>
+  %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
+  ret <2 x i64> %vshll_n
+}
+
+define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %1 = zext <8 x i8> %shuffle.i to <8 x i16>
+  %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  ret <8 x i16> %vshll_n
+}
+
+define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = zext <4 x i16> %shuffle.i to <4 x i32>
+  %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
+  ret <4 x i32> %vshll_n
+}
+
+define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %1 = zext <2 x i32> %shuffle.i to <2 x i64>
+  %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
+  ret <2 x i64> %vshll_n
+}
+
+define <4 x i16> @test_vcvt_f16_f32(<4 x float> %a) #0 {
+; CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+  %vcvt1.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a) #4
+  ret <4 x i16> %vcvt1.i
+}
+
+define <8 x i16> @test_vcvt_high_f16_f32(<4 x i16> %a, <4 x float> %b) #0 {
+; CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
+  %vcvt1.i.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %b) #4
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vcvt1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %shuffle.i
+}
+
+define <4 x float> @test_vcvt_f32_f16(<4 x i16> %a) #0 {
+; CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h
+  %vcvt1.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %a) #4
+  ret <4 x float> %vcvt1.i
+}
+
+define <4 x float> @test_vcvt_high_f32_f16(<8 x i16> %a) #0 {
+; CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
+  %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vcvt1.i.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %shuffle.i.i) #4
+  ret <4 x float> %vcvt1.i.i
+}
+
+define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 {
+; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+  %vcvt.i = fptrunc <2 x double> %a to <2 x float>
+  ret <2 x float> %vcvt.i
+}
+
+define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
+; CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
+  %vcvt.i.i = fptrunc <2 x double> %b to <2 x float>
+  %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvt.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuffle.i
+}
+
+define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 {
+; CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+  %vcvtx_f32_f641.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %a) #4
+  ret <2 x float> %vcvtx_f32_f641.i
+}
+
+define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
+; CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
+  %vcvtx_f32_f641.i.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %b) #4
+  %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvtx_f32_f641.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 {
+; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
+  %vcvt.i = fpext <2 x float> %a to <2 x double>
+  ret <2 x double> %vcvt.i
+}
+
+define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 {
+; CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
+  %shuffle.i.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  %vcvt.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
+  ret <2 x double> %vcvt.i.i
+}
+
+define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
+; CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vrndn1.i = tail call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vrndn1.i
+}
+
+define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
+; CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vrndn1.i = tail call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vrndn1.i
+}
+
+define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 {
+; CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vrndn1.i = tail call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vrndn1.i
+}
+
+define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
+; CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vrnda1.i = tail call <2 x float> @llvm.round.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vrnda1.i
+}
+
+define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
+; CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+   %vrnda1.i = tail call <4 x float> @llvm.round.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vrnda1.i
+}
+
+define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
+; CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vrnda1.i = tail call <2 x double> @llvm.round.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vrnda1.i
+}
+
+define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
+; CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vrndp1.i = tail call <2 x float> @llvm.ceil.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vrndp1.i
+}
+
+define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
+; CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrndp1.i = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vrndp1.i
+}
+
+define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
+; CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vrndp1.i = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vrndp1.i
+}
+
+define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
+; CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vrndm1.i = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vrndm1.i
+}
+
+define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
+; CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vrndm1.i = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vrndm1.i
+}
+
+define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 {
+; CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+   %vrndm1.i = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vrndm1.i
+}
+
+define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
+; CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vrndx1.i = tail call <2 x float> @llvm.rint.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vrndx1.i
+}
+
+define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
+; CHECK: frintx v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vrndx1.i = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vrndx1.i
+}
+
+define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 {
+; CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vrndx1.i = tail call <2 x double> @llvm.rint.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vrndx1.i
+}
+
+define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
+; CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+   %vrnd1.i = tail call <2 x float> @llvm.trunc.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vrnd1.i
+}
+
+define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
+; CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vrnd1.i = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vrnd1.i
+}
+
+define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 {
+; CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vrnd1.i = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vrnd1.i
+}
+
+define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 {
+; CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vrndi1.i = tail call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vrndi1.i
+}
+
+define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 {
+; CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vrndi1.i = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vrndi1.i
+}
+
+define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 {
+; CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vrndi1.i = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vrndi1.i
+}
+
+define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvt.i = fptosi <2 x float> %a to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvt.i = fptosi <4 x float> %a to <4 x i32>
+  ret <4 x i32> %vcvt.i
+}
+
+define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvt.i = fptosi <2 x double> %a to <2 x i64>
+  ret <2 x i64> %vcvt.i
+}
+
+define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvt.i = fptoui <2 x float> %a to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvt.i = fptoui <4 x float> %a to <4 x i32>
+  ret <4 x i32> %vcvt.i
+}
+
+define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvt.i = fptoui <2 x double> %a to <2 x i64>
+  ret <2 x i64> %vcvt.i
+}
+
+define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvtns_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %a) #4
+  ret <2 x i32> %vcvtns_f321.i
+}
+
+define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvtns_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %a) #4
+  ret <4 x i32> %vcvtns_f321.i
+}
+
+define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvtns_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %a) #4
+  ret <2 x i64> %vcvtns_f641.i
+}
+
+define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvtnu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %a) #4
+  ret <2 x i32> %vcvtnu_f321.i
+}
+
+define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvtnu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %a) #4
+  ret <4 x i32> %vcvtnu_f321.i
+}
+
+define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvtnu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %a) #4
+  ret <2 x i64> %vcvtnu_f641.i
+}
+
+define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvtps_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %a) #4
+  ret <2 x i32> %vcvtps_f321.i
+}
+
+define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvtps_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %a) #4
+  ret <4 x i32> %vcvtps_f321.i
+}
+
+define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvtps_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %a) #4
+  ret <2 x i64> %vcvtps_f641.i
+}
+
+define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvtpu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %a) #4
+  ret <2 x i32> %vcvtpu_f321.i
+}
+
+define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvtpu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %a) #4
+  ret <4 x i32> %vcvtpu_f321.i
+}
+
+define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvtpu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %a) #4
+  ret <2 x i64> %vcvtpu_f641.i
+}
+
+define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvtms_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %a) #4
+  ret <2 x i32> %vcvtms_f321.i
+}
+
+define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvtms_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %a) #4
+  ret <4 x i32> %vcvtms_f321.i
+}
+
+define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvtms_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %a) #4
+  ret <2 x i64> %vcvtms_f641.i
+}
+
+define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvtmu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %a) #4
+  ret <2 x i32> %vcvtmu_f321.i
+}
+
+define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvtmu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %a) #4
+  ret <4 x i32> %vcvtmu_f321.i
+}
+
+define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvtmu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %a) #4
+  ret <2 x i64> %vcvtmu_f641.i
+}
+
+define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvtas_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %a) #4
+  ret <2 x i32> %vcvtas_f321.i
+}
+
+define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvtas_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %a) #4
+  ret <4 x i32> %vcvtas_f321.i
+}
+
+define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvtas_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %a) #4
+  ret <2 x i64> %vcvtas_f641.i
+}
+
+define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvtau_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %a) #4
+  ret <2 x i32> %vcvtau_f321.i
+}
+
+define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvtau_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %a) #4
+  ret <4 x i32> %vcvtau_f321.i
+}
+
+define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvtau_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %a) #4
+  ret <2 x i64> %vcvtau_f641.i
+}
+
+define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
+; CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vrsqrte1.i = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vrsqrte1.i
+}
+
+define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
+; CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vrsqrte1.i = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vrsqrte1.i
+}
+
+define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 {
+; CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vrsqrte1.i = tail call <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vrsqrte1.i
+}
+
+define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
+; CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vrecpe1.i = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vrecpe1.i
+}
+
+define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
+; CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vrecpe1.i = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vrecpe1.i
+}
+
+define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 {
+; CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vrecpe1.i = tail call <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vrecpe1.i
+}
+
+define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
+; CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vrecpe1.i = tail call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a) #4
+  ret <2 x i32> %vrecpe1.i
+}
+
+define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
+; CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vrecpe1.i = tail call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a) #4
+  ret <4 x i32> %vrecpe1.i
+}
+
+define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 {
+; CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vsqrt1.i = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #4
+  ret <2 x float> %vsqrt1.i
+}
+
+define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
+; CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vsqrt1.i = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #4
+  ret <4 x float> %vsqrt1.i
+}
+
+define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
+; CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vsqrt1.i = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #4
+  ret <2 x double> %vsqrt1.i
+}
+
+define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
+; CHECK: scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvt.i = sitofp <2 x i32> %a to <2 x float>
+  ret <2 x float> %vcvt.i
+}
+
+define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
+; CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+  %vcvt.i = uitofp <2 x i32> %a to <2 x float>
+  ret <2 x float> %vcvt.i
+}
+
+define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
+; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvt.i = sitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
+; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+  %vcvt.i = uitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %vcvt.i
+}
+
+define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 {
+; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvt.i = sitofp <2 x i64> %a to <2 x double>
+  ret <2 x double> %vcvt.i
+}
+
+define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 {
+; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+  %vcvt.i = uitofp <2 x i64> %a to <2 x double>
+  ret <2 x double> %vcvt.i
+}
+
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #2
+
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2
+
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #2
+
+declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) #2
+
+declare <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double>) #2
+
+declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) #2
+
+declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) #2
+
+declare <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double>) #2
+
+declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) #2
+
+declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.trunc.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.rint.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.rint.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.ceil.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.round.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.round.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.round.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>) #2
+
+declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) #2
+
+declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) #2
+
+declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) #2
+
+declare <2 x float> @llvm.aarch64.neon.fcvtn.v2f32.v2f64(<2 x double>) #2
+
+declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) #2
+
+declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) #2
+
+declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) #2
+
+declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) #2
+
+declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) #2
+
+declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) #2
+
+declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) #2
+
+declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) #2
+
+declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) #2
+
+declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) #2
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) #2
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #2
+
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) #2
+
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) #2
+
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) #2
+
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) #2
+
+declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) #2
+
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) #2
+
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) #2
+
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) #2
+
+declare <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) #2
+
+declare <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) #2
+
+declare <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) #2
+
+declare <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) #2
+
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #3
+
+declare <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64>) #2
+
+declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) #2
+
+declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) #2
+
+declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64>) #2
+
+declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) #2
+
+declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) #2
+
+declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64>) #2
+
+declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) #2
+
+declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) #2
+
+declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64>, <4 x i32>) #2
+
+declare <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32>, <8 x i16>) #2
+
+declare <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16>, <16 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64>, <4 x i32>) #2
+
+declare <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32>, <8 x i16>) #2
+
+declare <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16>, <16 x i8>) #2
+
+declare <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64>, <2 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32>, <4 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16>, <8 x i8>) #2
+
+declare <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64>, <2 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32>, <4 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16>, <8 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) #2
+
+declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) #2
+
+declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) #2
+
+declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) #2
+
+declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) #2
+
+declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) #2
+
+declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) #2
+
+declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) #2
+
+
+define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_s64_f64
+; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = fptosi <1 x double> %a to <1 x i64>
+  ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_u64_f64
+; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = fptoui <1 x double> %a to <1 x i64>
+  ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtn_s64_f64
+; CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %a)
+  ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtn_u64_f64
+; CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %a)
+  ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtp_s64_f64
+; CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %a)
+  ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtp_u64_f64
+; CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %a)
+  ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtm_s64_f64
+; CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %a)
+  ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtm_u64_f64
+; CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %a)
+  ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvta_s64_f64
+; CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %a)
+  ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvta_u64_f64
+; CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %a)
+  ret <1 x i64> %1
+}
+
+define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_f64_s64
+; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = sitofp <1 x i64> %a to <1 x double>
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_f64_u64
+; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = uitofp <1 x i64> %a to <1 x double>
+  ret <1 x double> %1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>)
+
+define <1 x double> @test_vrndn_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrndn_f64
+; CHECK: frintn d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vrnda_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrnda_f64
+; CHECK: frinta d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.round.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vrndp_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrndp_f64
+; CHECK: frintp d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.ceil.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vrndm_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrndm_f64
+; CHECK: frintm d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.floor.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vrndx_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrndx_f64
+; CHECK: frintx d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.rint.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vrnd_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrnd_f64
+; CHECK: frintz d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.trunc.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vrndi_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrndi_f64
+; CHECK: frinti d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+declare <1 x double> @llvm.nearbyint.v1f64(<1 x double>)
+declare <1 x double> @llvm.trunc.v1f64(<1 x double>)
+declare <1 x double> @llvm.rint.v1f64(<1 x double>)
+declare <1 x double> @llvm.floor.v1f64(<1 x double>)
+declare <1 x double> @llvm.ceil.v1f64(<1 x double>)
+declare <1 x double> @llvm.round.v1f64(<1 x double>)
+declare <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double>)
+
+define <1 x double> @test_vrsqrte_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrsqrte_f64
+; CHECK: frsqrte d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vrecpe_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrecpe_f64
+; CHECK: frecpe d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vsqrt_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vsqrt_f64
+; CHECK: fsqrt d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vrecps_f64
+; CHECK: frecps d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %a, <1 x double> %b)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vrsqrts_f64
+; CHECK: frsqrts d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+  %1 = tail call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %a, <1 x double> %b)
+  ret <1 x double> %1
+}
+
+declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.sqrt.v1f64(<1 x double>)
+declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>)
+declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>)
+\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll
index 42f6a89..60b13b8 100644
--- a/test/CodeGen/AArch64/neon-mov.ll
+++ b/test/CodeGen/AArch64/neon-mov.ll
@@ -202,4 +202,16 @@ define <2 x double> @fmov2d() {
 	ret <2 x double> < double -1.2e1, double -1.2e1>
 }
 
+define <2 x i32> @movi1d_1() {
+; CHECK: movi    d0, #0xffffffff0000
+  ret <2 x i32> < i32  -65536, i32 65535>
+}
+
+
+declare <2 x i32> @test_movi1d(<2 x i32>, <2 x i32>)
+define <2 x i32> @movi1d() {
+; CHECK: movi     d1, #0xffffffff0000
+  %1 = tail call <2 x i32> @test_movi1d(<2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>)
+  ret <2 x i32> %1
+}
 
diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll
new file mode 100644
index 0000000..fa4d54d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-perm.ll
@@ -0,0 +1,1693 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
+%struct.uint16x4x2_t = type { [2 x <4 x i16>] }
+%struct.uint32x2x2_t = type { [2 x <2 x i32>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.poly8x8x2_t = type { [2 x <8 x i8>] }
+%struct.poly16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.uint8x16x2_t = type { [2 x <16 x i8>] }
+%struct.uint16x8x2_t = type { [2 x <8 x i16>] }
+%struct.uint32x4x2_t = type { [2 x <4 x i32>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.poly8x16x2_t = type { [2 x <16 x i8>] }
+%struct.poly16x8x2_t = type { [2 x <8 x i16>] }
+
+define <8 x i8> @test_vuzp1_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp1_s8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp1q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp1q_s8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp1_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp1_s16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp1q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp1q_s16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp1_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vuzp1q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzp1q_s32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vuzp1q_s64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vuzp1_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp1_u8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp1q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp1q_u8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp1_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp1_u16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp1q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp1q_u16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp1_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vuzp1q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzp1q_u32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vuzp1q_u64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vuzp1_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vuzp1q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vuzp1q_f32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vuzp1q_f64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+  %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vuzp1_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp1_p8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp1q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp1q_p8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp1_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp1_p16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp1q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp1q_p16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vuzp2_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp2_s8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp2q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp2q_s8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp2_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp2_s16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp2q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp2q_s16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp2_s32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vuzp2q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzp2q_s32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vuzp2q_s64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vuzp2_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp2_u8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp2q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp2q_u8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp2_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp2_u16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp2q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp2q_u16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp2_u32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vuzp2q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzp2q_u32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vuzp2q_u64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vuzp2_f32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vuzp2q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vuzp2q_f32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vuzp2q_f64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vuzp2_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp2_p8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp2q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp2q_p8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp2_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp2_p16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp2q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp2q_p16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vzip1_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip1_s8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip1q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip1q_s8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip1_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip1_s16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip1q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip1q_s16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip1_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vzip1q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzip1q_s32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vzip1q_s64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vzip1_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip1_u8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip1q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip1q_u8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip1_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip1_u16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip1q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip1q_u16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip1_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vzip1q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzip1q_u32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vzip1q_u64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vzip1_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vzip1q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vzip1q_f32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vzip1q_f64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+  %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vzip1_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip1_p8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip1q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip1q_p8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip1_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip1_p16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip1q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip1q_p16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vzip2_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip2_s8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip2q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip2q_s8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip2_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip2_s16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip2q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip2q_s16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip2_s32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vzip2q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzip2q_s32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vzip2q_s64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vzip2_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip2_u8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip2q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip2q_u8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip2_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip2_u16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip2q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip2q_u16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip2_u32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vzip2q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzip2q_u32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vzip2q_u64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vzip2_f32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vzip2q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vzip2q_f32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vzip2q_f64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vzip2_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip2_p8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip2q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip2q_p8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip2_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip2_p16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip2q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip2q_p16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn1_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn1_s8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn1q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn1q_s8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn1_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn1_s16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn1q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn1q_s16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn1_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vtrn1q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrn1q_s32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vtrn1q_s64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn1_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn1_u8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn1q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn1q_u8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn1_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn1_u16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn1q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn1q_u16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn1_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vtrn1q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrn1q_u32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vtrn1q_u64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vtrn1_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vtrn1q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vtrn1q_f32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vtrn1q_f64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+  %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+  ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn1_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn1_p8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn1q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn1q_p8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn1_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn1_p16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn1q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn1q_p16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn2_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn2_s8:
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn2q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn2q_s8:
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn2_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn2_s16:
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn2q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn2q_s16:
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn2_s32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vtrn2q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrn2q_s32:
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vtrn2q_s64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn2_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn2_u8:
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn2q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn2q_u8:
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn2_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn2_u16:
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn2q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn2q_u16:
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn2_u32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vtrn2q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrn2q_u32:
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vtrn2q_u64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vtrn2_f32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vtrn2q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vtrn2q_f32:
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vtrn2q_f64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn2_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn2_p8:
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn2q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn2q_p8:
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+  ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn2_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn2_p16:
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn2q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn2q_p16:
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x i16> %shuffle.i
+}
+
+define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp_s8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+  ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp_s16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vuzp1.i, 0, 1
+  ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vuzp1.i, 0, 1
+  ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp_u8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+  ret %struct.uint8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp_u16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.uint16x4x2_t undef, <4 x i16> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint16x4x2_t %.fca.0.0.insert, <4 x i16> %vuzp1.i, 0, 1
+  ret %struct.uint16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  %.fca.0.0.insert = insertvalue %struct.uint32x2x2_t undef, <2 x i32> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint32x2x2_t %.fca.0.0.insert, <2 x i32> %vuzp1.i, 0, 1
+  ret %struct.uint32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vuzp_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %vuzp.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+  %vuzp1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+  %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vuzp1.i, 0, 1
+  ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp_p8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.poly8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+  ret %struct.poly8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp_p16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.poly16x4x2_t undef, <4 x i16> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly16x4x2_t %.fca.0.0.insert, <4 x i16> %vuzp1.i, 0, 1
+  ret %struct.poly16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzpq_s8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vuzp1.i, 0, 1
+  ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzpq_s16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vuzp1.i, 0, 1
+  ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzpq_s32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vuzp.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %vuzp1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vuzp1.i, 0, 1
+  ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzpq_u8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vuzp1.i, 0, 1
+  ret %struct.uint8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzpq_u16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.uint16x8x2_t undef, <8 x i16> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint16x8x2_t %.fca.0.0.insert, <8 x i16> %vuzp1.i, 0, 1
+  ret %struct.uint16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzpq_u32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vuzp.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %vuzp1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.uint32x4x2_t undef, <4 x i32> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint32x4x2_t %.fca.0.0.insert, <4 x i32> %vuzp1.i, 0, 1
+  ret %struct.uint32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vuzpq_f32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vuzp.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %vuzp1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vuzp1.i, 0, 1
+  ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzpq_p8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vuzp1.i, 0, 1
+  ret %struct.poly8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzpq_p16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.poly16x8x2_t undef, <8 x i16> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vuzp1.i, 0, 1
+  ret %struct.poly16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip_s8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vzip1.i, 0, 1
+  ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip_s16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vzip1.i, 0, 1
+  ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vzip1.i, 0, 1
+  ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip_u8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vzip1.i, 0, 1
+  ret %struct.uint8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip_u16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.uint16x4x2_t undef, <4 x i16> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint16x4x2_t %.fca.0.0.insert, <4 x i16> %vzip1.i, 0, 1
+  ret %struct.uint16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  %.fca.0.0.insert = insertvalue %struct.uint32x2x2_t undef, <2 x i32> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint32x2x2_t %.fca.0.0.insert, <2 x i32> %vzip1.i, 0, 1
+  ret %struct.uint32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vzip_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %vzip.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+  %vzip1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+  %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vzip1.i, 0, 1
+  ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip_p8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.poly8x8x2_t undef, <8 x i8> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly8x8x2_t %.fca.0.0.insert, <8 x i8> %vzip1.i, 0, 1
+  ret %struct.poly8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip_p16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.poly16x4x2_t undef, <4 x i16> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly16x4x2_t %.fca.0.0.insert, <4 x i16> %vzip1.i, 0, 1
+  ret %struct.poly16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzipq_s8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vzip1.i, 0, 1
+  ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzipq_s16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vzip1.i, 0, 1
+  ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzipq_s32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vzip.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %vzip1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vzip1.i, 0, 1
+  ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzipq_u8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vzip1.i, 0, 1
+  ret %struct.uint8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzipq_u16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.uint16x8x2_t undef, <8 x i16> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint16x8x2_t %.fca.0.0.insert, <8 x i16> %vzip1.i, 0, 1
+  ret %struct.uint16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzipq_u32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vzip.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %vzip1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.uint32x4x2_t undef, <4 x i32> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint32x4x2_t %.fca.0.0.insert, <4 x i32> %vzip1.i, 0, 1
+  ret %struct.uint32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vzipq_f32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vzip.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %vzip1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vzip1.i, 0, 1
+  ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzipq_p8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vzip1.i, 0, 1
+  ret %struct.poly8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzipq_p16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.poly16x8x2_t undef, <8 x i16> %vzip.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vzip1.i, 0, 1
+  ret %struct.poly16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn_s8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vtrn1.i, 0, 1
+  ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn_s16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vtrn1.i, 0, 1
+  ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vtrn1.i, 0, 1
+  ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn_u8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vtrn1.i, 0, 1
+  ret %struct.uint8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn_u16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.uint16x4x2_t undef, <4 x i16> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint16x4x2_t %.fca.0.0.insert, <4 x i16> %vtrn1.i, 0, 1
+  ret %struct.uint16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+  %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+  %.fca.0.0.insert = insertvalue %struct.uint32x2x2_t undef, <2 x i32> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint32x2x2_t %.fca.0.0.insert, <2 x i32> %vtrn1.i, 0, 1
+  ret %struct.uint32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vtrn_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+  %vtrn.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+  %vtrn1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+  %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vtrn1.i, 0, 1
+  ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn_p8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+  %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.poly8x8x2_t undef, <8 x i8> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly8x8x2_t %.fca.0.0.insert, <8 x i8> %vtrn1.i, 0, 1
+  ret %struct.poly8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn_p16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+  %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.poly16x4x2_t undef, <4 x i16> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly16x4x2_t %.fca.0.0.insert, <4 x i16> %vtrn1.i, 0, 1
+  ret %struct.poly16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrnq_s8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+  %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vtrn1.i, 0, 1
+  ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrnq_s16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1
+  ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrnq_s32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vtrn.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %vtrn1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vtrn1.i, 0, 1
+  ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrnq_u8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+  %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vtrn1.i, 0, 1
+  ret %struct.uint8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrnq_u16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.uint16x8x2_t undef, <8 x i16> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1
+  ret %struct.uint16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrnq_u32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vtrn.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %vtrn1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.uint32x4x2_t undef, <4 x i32> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint32x4x2_t %.fca.0.0.insert, <4 x i32> %vtrn1.i, 0, 1
+  ret %struct.uint32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vtrnq_f32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+  %vtrn.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %vtrn1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vtrn1.i, 0, 1
+  ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrnq_p8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+  %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+  %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vtrn1.i, 0, 1
+  ret %struct.poly8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrnq_p16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+  %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.poly16x8x2_t undef, <8 x i16> %vtrn.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1
+  ret %struct.poly16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
+; CHECK: test_uzp:
+
+  %vuzp.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %vuzp1.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+  ret %struct.uint8x8x2_t %.fca.0.1.insert
+
+; CHECK: dup	{{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: uzp1	{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: uzp2	{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
diff --git a/test/CodeGen/AArch64/neon-rounding-shift.ll b/test/CodeGen/AArch64/neon-rounding-shift.ll
index 404e491..5b4ec28 100644
--- a/test/CodeGen/AArch64/neon-rounding-shift.ll
+++ b/test/CodeGen/AArch64/neon-rounding-shift.ll
@@ -102,23 +102,6 @@ define <4 x i32> @test_srshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
   ret <4 x i32> %tmp1
 }
 
-declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_urshl_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: urshl d0, d0, d1
-  ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_srshl_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: srshl d0, d0, d1
-  ret <1 x i64> %tmp1
-}
-
 declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>)
 declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>)
 
diff --git a/test/CodeGen/AArch64/neon-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-saturating-add-sub.ll
index b2fac1f..fc60d90 100644
--- a/test/CodeGen/AArch64/neon-saturating-add-sub.ll
+++ b/test/CodeGen/AArch64/neon-saturating-add-sub.ll
@@ -102,22 +102,7 @@ define <4 x i32> @test_sqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
   ret <4 x i32> %tmp1
 }
 
-declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqadd_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: uqadd d0, d0, d1
-  ret <1 x i64> %tmp1
-}
 
-define <1 x i64> @test_sqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqadd_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sqadd d0, d0, d1
-  ret <1 x i64> %tmp1
-}
 
 declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>)
 declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
@@ -254,21 +239,3 @@ define <2 x i64> @test_sqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; CHECK: sqsub v0.2d, v0.2d, v1.2d
   ret <2 x i64> %tmp1
 }
-
-declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqsub_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: uqsub d0, d0, d1
-  ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqsub_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sqsub d0, d0, d1
-  ret <1 x i64> %tmp1
-}
-
diff --git a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
index 05d8dfe..d89262c 100644
--- a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
+++ b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
@@ -102,23 +102,6 @@ define <4 x i32> @test_sqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
   ret <4 x i32> %tmp1
 }
 
-declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqrshl_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: uqrshl d0, d0, d1
-  ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqrshl_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sqrshl d0, d0, d1
-  ret <1 x i64> %tmp1
-}
-
 declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>)
 declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>)
 
diff --git a/test/CodeGen/AArch64/neon-saturating-shift.ll b/test/CodeGen/AArch64/neon-saturating-shift.ll
index 3b7f78c..11009fb 100644
--- a/test/CodeGen/AArch64/neon-saturating-shift.ll
+++ b/test/CodeGen/AArch64/neon-saturating-shift.ll
@@ -102,23 +102,6 @@ define <4 x i32> @test_sqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
   ret <4 x i32> %tmp1
 }
 
-declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqshl_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: uqshl d0, d0, d1
-  ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqshl_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sqshl d0, d0, d1
-  ret <1 x i64> %tmp1
-}
-
 declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
 declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
 
diff --git a/test/CodeGen/AArch64/neon-scalar-abs.ll b/test/CodeGen/AArch64/neon-scalar-abs.ll
new file mode 100644
index 0000000..03a89e04
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-abs.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define i64 @test_vabsd_s64(i64 %a) {
+; CHECK: test_vabsd_s64
+; CHECK: abs {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vabs.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vabs1.i = tail call <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64> %vabs.i)
+  %0 = extractelement <1 x i64> %vabs1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64>)
+
+define i8 @test_vqabsb_s8(i8 %a) {
+; CHECK: test_vqabsb_s8
+; CHECK: sqabs {{b[0-9]+}}, {{b[0-9]+}}
+entry:
+  %vqabs.i = insertelement <1 x i8> undef, i8 %a, i32 0
+  %vqabs1.i = call <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8> %vqabs.i)
+  %0 = extractelement <1 x i8> %vqabs1.i, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8>)
+
+define i16 @test_vqabsh_s16(i16 %a) {
+; CHECK: test_vqabsh_s16
+; CHECK: sqabs {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+  %vqabs.i = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vqabs1.i = call <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16> %vqabs.i)
+  %0 = extractelement <1 x i16> %vqabs1.i, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16>)
+
+define i32 @test_vqabss_s32(i32 %a) {
+; CHECK: test_vqabss_s32
+; CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vqabs.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vqabs1.i = call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %vqabs.i)
+  %0 = extractelement <1 x i32> %vqabs1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>)
+
+define i64 @test_vqabsd_s64(i64 %a) {
+; CHECK: test_vqabsd_s64
+; CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vqabs.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vqabs1.i = call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %vqabs.i)
+  %0 = extractelement <1 x i64> %vqabs1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-add-sub.ll
new file mode 100644
index 0000000..09ca880
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-add-sub.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+	%tmp3 = add <1 x i64> %A, %B;
+	ret <1 x i64> %tmp3
+}
+
+define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+	%tmp3 = sub <1 x i64> %A, %B;
+	ret <1 x i64> %tmp3
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_add_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_add_v1i64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_uadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uadd_v1i64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_sub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sub_v1i64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_usub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_usub_v1i64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+
+
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
new file mode 100644
index 0000000..8ce42de
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -0,0 +1,108 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+declare float @llvm.fma.f32(float, float, float)
+declare double @llvm.fma.f64(double, double, double)
+
+define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
+  ; CHECK: test_fmla_ss4S
+  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) {
+  ; CHECK: test_fmla_ss4S_swap
+  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) {
+  ; CHECK: test_fmla_ss2S
+  ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
+  ret float %tmp2
+}
+
+define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) {
+  ; CHECK: test_fmla_ddD
+  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[0]
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) {
+  ; CHECK: test_fmla_dd2D
+  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) {
+  ; CHECK: test_fmla_dd2D_swap
+  ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
+  ret double %tmp2
+}
+
+define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) {
+  ; CHECK: test_fmls_ss4S
+  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fsub float -0.0, %tmp1
+  %tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a)
+  ret float %tmp3
+}
+
+define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) {
+  ; CHECK: test_fmls_ss4S_swap
+  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fsub float -0.0, %tmp1
+  %tmp3 = call float @llvm.fma.f32(float %tmp1, float %tmp2, float %a)
+  ret float %tmp3
+}
+
+
+define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) {
+  ; CHECK: test_fmls_ss2S
+  ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = fsub float -0.0, %tmp1
+  %tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a)
+  ret float %tmp3
+}
+
+define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) {
+  ; CHECK: test_fmls_ddD
+  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[0]
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = fsub double -0.0, %tmp1
+  %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
+  ret double %tmp3
+}
+
+define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) {
+  ; CHECK: test_fmls_dd2D
+  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fsub double -0.0, %tmp1
+  %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
+  ret double %tmp3
+}
+
+define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) {
+  ; CHECK: test_fmls_dd2D_swap
+  ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fsub double -0.0, %tmp1
+  %tmp3 = call double @llvm.fma.f64(double %tmp1, double %tmp2, double %a)
+  ret double %tmp3
+}
+
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
new file mode 100644
index 0000000..968ad3e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
@@ -0,0 +1,124 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
+  ; CHECK: test_fmul_lane_ss2S
+  ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = fmul float %a, %tmp1;
+  ret float %tmp2;
+}
+
+define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
+  ; CHECK: test_fmul_lane_ss2S_swap
+  ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = fmul float %tmp1, %a;
+  ret float %tmp2;
+}
+
+
+define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
+  ; CHECK: test_fmul_lane_ss4S
+  ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fmul float %a, %tmp1;
+  ret float %tmp2;
+}
+
+define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
+  ; CHECK: test_fmul_lane_ss4S_swap
+  ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = fmul float %tmp1, %a;
+  ret float %tmp2;
+}
+
+
+define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
+  ; CHECK: test_fmul_lane_ddD
+  ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = fmul double %a, %tmp1;
+  ret double %tmp2;
+}
+
+
+
+define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
+  ; CHECK: test_fmul_lane_dd2D
+  ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fmul double %a, %tmp1;
+  ret double %tmp2;
+}
+
+
+define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
+  ; CHECK: test_fmul_lane_dd2D_swap
+  ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = fmul double %tmp1, %a;
+  ret double %tmp2;
+}
+
+declare float @llvm.aarch64.neon.vmulx.f32(float, float)
+
+define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
+  ; CHECK: test_fmulx_lane_f32
+  ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+  %tmp1 = extractelement <2 x float> %v, i32 1
+  %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
+  ret float %tmp2;
+}
+
+define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
+  ; CHECK: test_fmulx_laneq_f32
+  ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
+  ret float %tmp2;
+}
+
+define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) {
+  ; CHECK: test_fmulx_laneq_f32_swap
+  ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+  %tmp1 = extractelement <4 x float> %v, i32 3
+  %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %tmp1, float %a)
+  ret float %tmp2;
+}
+
+declare double @llvm.aarch64.neon.vmulx.f64(double, double)
+
+define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
+  ; CHECK: test_fmulx_lane_f64
+  ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+  ret double %tmp2;
+}
+
+define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
+  ; CHECK: test_fmulx_laneq_f64_0
+  ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+  %tmp1 = extractelement <2 x double> %v, i32 0
+  %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+  ret double %tmp2;
+}
+
+
+define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) {
+  ; CHECK: test_fmulx_laneq_f64_1
+  ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+  ret double %tmp2;
+}
+
+define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) {
+  ; CHECK: test_fmulx_laneq_f64_1_swap
+  ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+  %tmp1 = extractelement <2 x double> %v, i32 1
+  %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %tmp1, double %a)
+  ret double %tmp2;
+}
+
diff --git a/test/CodeGen/AArch64/neon-scalar-compare.ll b/test/CodeGen/AArch64/neon-scalar-compare.ll
new file mode 100644
index 0000000..5f10cbb
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-compare.ll
@@ -0,0 +1,343 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+;; Scalar Integer Compare
+
+define i64 @test_vceqd(i64 %a, i64 %b) {
+; CHECK: test_vceqd
+; CHECK: cmeq {{d[0-9]+}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vceq.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vceq1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceq.i, <1 x i64> %vceq1.i)
+  %0 = extractelement <1 x i64> %vceq2.i, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vceqzd(i64 %a) {
+; CHECK: test_vceqzd
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
+entry:
+  %vceqz.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vceqz1.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceqz.i, <1 x i64> zeroinitializer)
+  %0 = extractelement <1 x i64> %vceqz1.i, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vcged(i64 %a, i64 %b) {
+; CHECK: test_vcged
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcge.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vcge1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i)
+  %0 = extractelement <1 x i64> %vcge2.i, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vcgezd(i64 %a) {
+; CHECK: test_vcgezd
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0
+entry:
+  %vcgez.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vcgez1.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcgez.i, <1 x i64> zeroinitializer)
+  %0 = extractelement <1 x i64> %vcgez1.i, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vcgtd(i64 %a, i64 %b) {
+; CHECK: test_vcgtd
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcgt.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vcgt1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i)
+  %0 = extractelement <1 x i64> %vcgt2.i, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vcgtzd(i64 %a) {
+; CHECK: test_vcgtzd
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0
+entry:
+  %vcgtz.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vcgtz1.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgtz.i, <1 x i64> zeroinitializer)
+  %0 = extractelement <1 x i64> %vcgtz1.i, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vcled(i64 %a, i64 %b) {
+; CHECK: test_vcled
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcgt.i = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vcgt1.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i)
+  %0 = extractelement <1 x i64> %vcgt2.i, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vclezd(i64 %a) {
+; CHECK: test_vclezd
+; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0
+entry:
+  %vclez.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vclez1.i = call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64> %vclez.i, <1 x i64> zeroinitializer)
+  %0 = extractelement <1 x i64> %vclez1.i, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vcltd(i64 %a, i64 %b) {
+; CHECK: test_vcltd
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcge.i = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vcge1.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i)
+  %0 = extractelement <1 x i64> %vcge2.i, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vcltzd(i64 %a) {
+; CHECK: test_vcltzd
+; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0x0
+entry:
+  %vcltz.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vcltz1.i = call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64> %vcltz.i, <1 x i64> zeroinitializer)
+  %0 = extractelement <1 x i64> %vcltz1.i, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vtstd(i64 %a, i64 %b) {
+; CHECK: test_vtstd
+; CHECK: cmtst {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vtst.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vtst1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vtst2.i = call <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64> %vtst.i, <1 x i64> %vtst1.i)
+  %0 = extractelement <1 x i64> %vtst2.i, i32 0
+  ret i64 %0
+}
+
+
+define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcage_f64
+; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %vcage2.i = tail call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %a, <1 x double> %b) #2
+  ret <1 x i64> %vcage2.i
+}
+
+define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcagt_f64
+; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %vcagt2.i = tail call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %a, <1 x double> %b) #2
+  ret <1 x i64> %vcagt2.i
+}
+
+define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcale_f64
+; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %vcage2.i = tail call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %b, <1 x double> %a) #2
+  ret <1 x i64> %vcage2.i
+}
+
+define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcalt_f64
+; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %vcagt2.i = tail call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %b, <1 x double> %a) #2
+  ret <1 x i64> %vcagt2.i
+}
+
+define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vceq_s64
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = icmp eq <1 x i64> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vceq_u64
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = icmp eq <1 x i64> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vceq_f64
+; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = fcmp oeq <1 x double> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcge_s64
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = icmp sge <1 x i64> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcge_u64
+; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = icmp uge <1 x i64> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcge_f64
+; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = fcmp oge <1 x double> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcle_s64
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = icmp sle <1 x i64> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcle_u64
+; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = icmp ule <1 x i64> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcle_f64
+; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = fcmp ole <1 x double> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcgt_s64
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = icmp sgt <1 x i64> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcgt_u64
+; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = icmp ugt <1 x i64> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcgt_f64
+; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = fcmp ogt <1 x double> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vclt_s64
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = icmp slt <1 x i64> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vclt_u64
+; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = icmp ult <1 x i64> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vclt_f64
+; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+  %cmp.i = fcmp olt <1 x double> %a, %b
+  %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+  ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 {
+; CHECK: test_vceqz_s64
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
+  %1 = icmp eq <1 x i64> %a, zeroinitializer
+  %vceqz.i = zext <1 x i1> %1 to <1 x i64>
+  ret <1 x i64> %vceqz.i
+}
+
+define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 {
+; CHECK: test_vceqz_u64
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
+  %1 = icmp eq <1 x i64> %a, zeroinitializer
+  %vceqz.i = zext <1 x i1> %1 to <1 x i64>
+  ret <1 x i64> %vceqz.i
+}
+
+define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 {
+; CHECK: test_vceqz_p64
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
+  %1 = icmp eq <1 x i64> %a, zeroinitializer
+  %vceqz.i = zext <1 x i1> %1 to <1 x i64>
+  ret <1 x i64> %vceqz.i
+}
+
+define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 {
+; CHECK: test_vceqzq_p64
+; CHECK: cmeq  {{v[0-9]}}.2d, {{v[0-9]}}.2d, #0
+  %1 = icmp eq <2 x i64> %a, zeroinitializer
+  %vceqz.i = zext <2 x i1> %1 to <2 x i64>
+  ret <2 x i64> %vceqz.i
+}
+
+define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 {
+; CHECK: test_vcgez_s64
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0
+  %1 = icmp sge <1 x i64> %a, zeroinitializer
+  %vcgez.i = zext <1 x i1> %1 to <1 x i64>
+  ret <1 x i64> %vcgez.i
+}
+
+define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 {
+; CHECK: test_vclez_s64
+; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0
+  %1 = icmp sle <1 x i64> %a, zeroinitializer
+  %vclez.i = zext <1 x i1> %1 to <1 x i64>
+  ret <1 x i64> %vclez.i
+}
+
+define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 {
+; CHECK: test_vcgtz_s64
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0
+  %1 = icmp sgt <1 x i64> %a, zeroinitializer
+  %vcgtz.i = zext <1 x i1> %1 to <1 x i64>
+  ret <1 x i64> %vcgtz.i
+}
+
+define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 {
+; CHECK: test_vcltz_s64
+; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0
+  %1 = icmp slt <1 x i64> %a, zeroinitializer
+  %vcltz.i = zext <1 x i1> %1 to <1 x i64>
+  ret <1 x i64> %vcltz.i
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vchs.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vchi.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll
new file mode 100644
index 0000000..d433ff5
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -0,0 +1,88 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define float @test_dup_sv2S(<2 x float> %v) {
+ ;CHECK: test_dup_sv2S
+ ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ %tmp1 = extractelement <2 x float> %v, i32 1
+ ret float  %tmp1
+}
+
+define float @test_dup_sv4S(<4 x float> %v) {
+ ;CHECK: test_dup_sv4S
+ ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[0]
+ %tmp1 = extractelement <4 x float> %v, i32 0
+ ret float  %tmp1
+}
+
+define double @test_dup_dvD(<1 x double> %v) {
+ ;CHECK: test_dup_dvD
+ ;CHECK-NOT: dup {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+ ;CHECK: ret
+ %tmp1 = extractelement <1 x double> %v, i32 0
+ ret double  %tmp1
+}
+
+define double @test_dup_dv2D(<2 x double> %v) {
+ ;CHECK: test_dup_dv2D
+ ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ ret double  %tmp1
+}
+
+define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) {
+ ;CHECK: test_vector_dup_bv16B
+ ;CHECK: dup {{b[0-31]+}}, {{v[0-31]+}}.b[14]
+ %shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> <i32 14> 
+ ret <1 x i8> %shuffle.i
+}
+
+define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) {
+ ;CHECK: test_vector_dup_bv8B
+ ;CHECK: dup {{b[0-31]+}}, {{v[0-31]+}}.b[7]
+ %shuffle.i = shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> <i32 7> 
+ ret <1 x i8> %shuffle.i
+}
+
+define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) {
+ ;CHECK: test_vector_dup_hv8H
+ ;CHECK: dup {{h[0-31]+}}, {{v[0-31]+}}.h[7]
+ %shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> <i32 7> 
+ ret <1 x i16> %shuffle.i
+}
+
+define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) {
+ ;CHECK: test_vector_dup_hv4H
+ ;CHECK: dup {{h[0-31]+}}, {{v[0-31]+}}.h[3]
+ %shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> <i32 3> 
+ ret <1 x i16> %shuffle.i
+}
+
+define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) {
+ ;CHECK: test_vector_dup_sv4S
+ ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+ %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> <i32 3> 
+ ret <1 x i32> %shuffle
+}
+
+define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) {
+ ;CHECK: test_vector_dup_sv2S
+ ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> <i32 1> 
+ ret <1 x i32> %shuffle
+}
+
+define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) {
+ ;CHECK: test_vector_dup_dv2D
+ ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ %shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> <i32 1> 
+ ret <1 x i64> %shuffle.i
+}
+
+define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) {
+  ;CHECK: test_vector_copy_dup_dv2D
+  ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+  %vget_lane = extractelement <2 x i64> %c, i32 1
+  %vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0
+  ret <1 x i64> %vset_lane
+}
+
diff --git a/test/CodeGen/AArch64/neon-scalar-cvt.ll b/test/CodeGen/AArch64/neon-scalar-cvt.ll
new file mode 100644
index 0000000..a06d5d6
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-cvt.ll
@@ -0,0 +1,137 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define float @test_vcvts_f32_s32(i32 %a) {
+; CHECK: test_vcvts_f32_s32
+; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %0 = call float @llvm.aarch64.neon.vcvtf32.s32(<1 x i32> %vcvtf.i)
+  ret float %0
+}
+
+declare float @llvm.aarch64.neon.vcvtf32.s32(<1 x i32>)
+
+define double @test_vcvtd_f64_s64(i64 %a) {
+; CHECK: test_vcvtd_f64_s64
+; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %0 = call double @llvm.aarch64.neon.vcvtf64.s64(<1 x i64> %vcvtf.i)
+  ret double %0
+}
+
+declare double @llvm.aarch64.neon.vcvtf64.s64(<1 x i64>)
+
+define float @test_vcvts_f32_u32(i32 %a) {
+; CHECK: test_vcvts_f32_u32
+; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %0 = call float @llvm.aarch64.neon.vcvtf32.u32(<1 x i32> %vcvtf.i)
+  ret float %0
+}
+
+declare float @llvm.aarch64.neon.vcvtf32.u32(<1 x i32>)
+
+define double @test_vcvtd_f64_u64(i64 %a) {
+; CHECK: test_vcvtd_f64_u64
+; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %0 = call double @llvm.aarch64.neon.vcvtf64.u64(<1 x i64> %vcvtf.i)
+  ret double %0
+}
+
+declare double @llvm.aarch64.neon.vcvtf64.u64(<1 x i64>)
+
+define float @test_vcvts_n_f32_s32(i32 %a) {
+; CHECK: test_vcvts_n_f32_s32
+; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
+entry:
+  %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0
+  %0 = call float @llvm.aarch64.neon.vcvtf32.n.s32(<1 x i32> %vcvtf, i32 1)
+  ret float %0
+}
+
+declare float @llvm.aarch64.neon.vcvtf32.n.s32(<1 x i32>, i32)
+
+define double @test_vcvtd_n_f64_s64(i64 %a) {
+; CHECK: test_vcvtd_n_f64_s64
+; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
+entry:
+  %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0
+  %0 = call double @llvm.aarch64.neon.vcvtf64.n.s64(<1 x i64> %vcvtf, i32 1)
+  ret double %0
+}
+
+declare double @llvm.aarch64.neon.vcvtf64.n.s64(<1 x i64>, i32)
+
+define float @test_vcvts_n_f32_u32(i32 %a) {
+; CHECK: test_vcvts_n_f32_u32
+; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
+entry:
+  %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0
+  %0 = call float @llvm.aarch64.neon.vcvtf32.n.u32(<1 x i32> %vcvtf, i32 1)
+  ret float %0
+}
+
+declare float @llvm.aarch64.neon.vcvtf32.n.u32(<1 x i32>, i32)
+
+define double @test_vcvtd_n_f64_u64(i64 %a) {
+; CHECK: test_vcvtd_n_f64_u64
+; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
+entry:
+  %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0
+  %0 = call double @llvm.aarch64.neon.vcvtf64.n.u64(<1 x i64> %vcvtf, i32 1)
+  ret double %0
+}
+
+declare double @llvm.aarch64.neon.vcvtf64.n.u64(<1 x i64>, i32)
+
+define i32 @test_vcvts_n_s32_f32(float %a) {
+; CHECK: test_vcvts_n_s32_f32
+; CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1
+entry:
+  %fcvtzs = insertelement <1 x float> undef, float %a, i32 0
+  %fcvtzs1 = call <1 x i32> @llvm.aarch64.neon.vcvts.n.s32.f32(<1 x float> %fcvtzs, i32 1)
+  %0 = extractelement <1 x i32> %fcvtzs1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vcvts.n.s32.f32(<1 x float>, i32)
+
+define i64 @test_vcvtd_n_s64_f64(double %a) {
+; CHECK: test_vcvtd_n_s64_f64
+; CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1
+entry:
+  %fcvtzs = insertelement <1 x double> undef, double %a, i32 0
+  %fcvtzs1 = call <1 x i64> @llvm.aarch64.neon.vcvtd.n.s64.f64(<1 x double> %fcvtzs, i32 1)
+  %0 = extractelement <1 x i64> %fcvtzs1, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vcvtd.n.s64.f64(<1 x double>, i32)
+
+define i32 @test_vcvts_n_u32_f32(float %a) {
+; CHECK: test_vcvts_n_u32_f32
+; CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32
+entry:
+  %fcvtzu = insertelement <1 x float> undef, float %a, i32 0
+  %fcvtzu1 = call <1 x i32> @llvm.aarch64.neon.vcvts.n.u32.f32(<1 x float> %fcvtzu, i32 32)
+  %0 = extractelement <1 x i32> %fcvtzu1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vcvts.n.u32.f32(<1 x float>, i32)
+
+define i64 @test_vcvtd_n_u64_f64(double %a) {
+; CHECK: test_vcvtd_n_u64_f64
+; CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64
+entry:
+  %fcvtzu = insertelement <1 x double> undef, double %a, i32 0
+  %fcvtzu1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtd.n.u64.f64(<1 x double> %fcvtzu, i32 64)
+  %0 = extractelement <1 x i64> %fcvtzu1, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vcvtd.n.u64.f64(<1 x double>, i32)
diff --git a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
new file mode 100644
index 0000000..faf521b
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define i8 @test_vqmovunh_s16(i16 %a) {
+; CHECK: test_vqmovunh_s16
+; CHECK: sqxtun {{b[0-9]+}}, {{h[0-9]+}}
+entry:
+  %vqmovun.i = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vqmovun1.i = call <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16> %vqmovun.i)
+  %0 = extractelement <1 x i8> %vqmovun1.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vqmovuns_s32(i32 %a) {
+; CHECK: test_vqmovuns_s32
+; CHECK: sqxtun {{h[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vqmovun.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vqmovun1.i = call <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32> %vqmovun.i)
+  %0 = extractelement <1 x i16> %vqmovun1.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vqmovund_s64(i64 %a) {
+; CHECK: test_vqmovund_s64
+; CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vqmovun.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vqmovun1.i = call <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64> %vqmovun.i)
+  %0 = extractelement <1 x i32> %vqmovun1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16>)
+declare <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32>)
+declare <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64>)
+
+define i8 @test_vqmovnh_s16(i16 %a) {
+; CHECK: test_vqmovnh_s16
+; CHECK: sqxtn {{b[0-9]+}}, {{h[0-9]+}}
+entry:
+  %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16> %vqmovn.i)
+  %0 = extractelement <1 x i8> %vqmovn1.i, i32 0
+  ret i8 %0
+}
+
+define i16 @test_vqmovns_s32(i32 %a) {
+; CHECK: test_vqmovns_s32
+; CHECK: sqxtn {{h[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32> %vqmovn.i)
+  %0 = extractelement <1 x i16> %vqmovn1.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vqmovnd_s64(i64 %a) {
+; CHECK: test_vqmovnd_s64
+; CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64> %vqmovn.i)
+  %0 = extractelement <1 x i32> %vqmovn1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16>)
+declare <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32>)
+declare <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64>)
+
+define i8 @test_vqmovnh_u16(i16 %a) {
+; CHECK: test_vqmovnh_u16
+; CHECK: uqxtn {{b[0-9]+}}, {{h[0-9]+}}
+entry:
+  %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16> %vqmovn.i)
+  %0 = extractelement <1 x i8> %vqmovn1.i, i32 0
+  ret i8 %0
+}
+
+
+define i16 @test_vqmovns_u32(i32 %a) {
+; CHECK: test_vqmovns_u32
+; CHECK: uqxtn {{h[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32> %vqmovn.i)
+  %0 = extractelement <1 x i16> %vqmovn1.i, i32 0
+  ret i16 %0
+}
+
+define i32 @test_vqmovnd_u64(i64 %a) {
+; CHECK: test_vqmovnd_u64
+; CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64> %vqmovn.i)
+  %0 = extractelement <1 x i32> %vqmovn1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16>)
+declare <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32>)
+declare <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-fabd.ll b/test/CodeGen/AArch64/neon-scalar-fabd.ll
new file mode 100644
index 0000000..75686d3
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-fabd.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define float @test_vabds_f32(float %a, float %b) {
+; CHECK-LABEL: test_vabds_f32
+; CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vabd.i = insertelement <1 x float> undef, float %a, i32 0
+  %vabd1.i = insertelement <1 x float> undef, float %b, i32 0
+  %vabd2.i = call <1 x float> @llvm.aarch64.neon.vabd.v1f32(<1 x float> %vabd.i, <1 x float> %vabd1.i)
+  %0 = extractelement <1 x float> %vabd2.i, i32 0
+  ret float %0
+}
+
+define double @test_vabdd_f64(double %a, double %b) {
+; CHECK-LABEL: test_vabdd_f64
+; CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vabd.i = insertelement <1 x double> undef, double %a, i32 0
+  %vabd1.i = insertelement <1 x double> undef, double %b, i32 0
+  %vabd2.i = call <1 x double> @llvm.aarch64.neon.vabd.v1f64(<1 x double> %vabd.i, <1 x double> %vabd1.i)
+  %0 = extractelement <1 x double> %vabd2.i, i32 0
+  ret double %0
+}
+
+declare <1 x double> @llvm.aarch64.neon.vabd.v1f64(<1 x double>, <1 x double>)
+declare <1 x float> @llvm.aarch64.neon.vabd.v1f32(<1 x float>, <1 x float>)
diff --git a/test/CodeGen/AArch64/neon-scalar-fcvt.ll b/test/CodeGen/AArch64/neon-scalar-fcvt.ll
new file mode 100644
index 0000000..d7b84fa
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-fcvt.ll
@@ -0,0 +1,255 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+;; Scalar Floating-point Convert
+
+define float @test_vcvtxn(double %a) {
+; CHECK: test_vcvtxn
+; CHECK: fcvtxn {{s[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvtf.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvtf1.i = tail call <1 x float> @llvm.aarch64.neon.fcvtxn.v1f32.v1f64(<1 x double> %vcvtf.i)
+  %0 = extractelement <1 x float> %vcvtf1.i, i32 0
+  ret float %0
+}
+
+declare <1 x float> @llvm.aarch64.neon.fcvtxn.v1f32.v1f64(<1 x double>)
+
+define i32 @test_vcvtass(float %a) {
+; CHECK: test_vcvtass
+; CHECK: fcvtas {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcvtas.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcvtas1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.v1f32(<1 x float> %vcvtas.i)
+  %0 = extractelement <1 x i32> %vcvtas1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.v1f32(<1 x float>)
+
+define i64 @test_test_vcvtasd(double %a) {
+; CHECK: test_test_vcvtasd
+; CHECK: fcvtas {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvtas.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvtas1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %vcvtas.i)
+  %0 = extractelement <1 x i64> %vcvtas1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtaus(float %a) {
+; CHECK: test_vcvtaus
+; CHECK: fcvtau {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcvtau.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcvtau1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.v1f32(<1 x float> %vcvtau.i)
+  %0 = extractelement <1 x i32> %vcvtau1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtaud(double %a) {
+; CHECK: test_vcvtaud
+; CHECK: fcvtau {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvtau.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvtau1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %vcvtau.i)
+  %0 = extractelement <1 x i64> %vcvtau1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>) 
+
+define i32 @test_vcvtmss(float %a) {
+; CHECK: test_vcvtmss
+; CHECK: fcvtms {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcvtms.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcvtms1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.v1f32(<1 x float> %vcvtms.i)
+  %0 = extractelement <1 x i32> %vcvtms1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtmd_s64_f64(double %a) {
+; CHECK: test_vcvtmd_s64_f64
+; CHECK: fcvtms {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvtms.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvtms1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %vcvtms.i)
+  %0 = extractelement <1 x i64> %vcvtms1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtmus(float %a) {
+; CHECK: test_vcvtmus
+; CHECK: fcvtmu {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcvtmu.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcvtmu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.v1f32(<1 x float> %vcvtmu.i)
+  %0 = extractelement <1 x i32> %vcvtmu1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtmud(double %a) {
+; CHECK: test_vcvtmud
+; CHECK: fcvtmu {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvtmu.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvtmu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %vcvtmu.i)
+  %0 = extractelement <1 x i64> %vcvtmu1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtnss(float %a) {
+; CHECK: test_vcvtnss
+; CHECK: fcvtns {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcvtns.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcvtns1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.v1f32(<1 x float> %vcvtns.i)
+  %0 = extractelement <1 x i32> %vcvtns1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtnd_s64_f64(double %a) {
+; CHECK: test_vcvtnd_s64_f64
+; CHECK: fcvtns {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvtns.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvtns1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %vcvtns.i)
+  %0 = extractelement <1 x i64> %vcvtns1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtnus(float %a) {
+; CHECK: test_vcvtnus
+; CHECK: fcvtnu {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcvtnu.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcvtnu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.v1f32(<1 x float> %vcvtnu.i)
+  %0 = extractelement <1 x i32> %vcvtnu1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtnud(double %a) {
+; CHECK: test_vcvtnud
+; CHECK: fcvtnu {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvtnu.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvtnu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %vcvtnu.i)
+  %0 = extractelement <1 x i64> %vcvtnu1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtpss(float %a) {
+; CHECK: test_vcvtpss
+; CHECK: fcvtps {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcvtps.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcvtps1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.v1f32(<1 x float> %vcvtps.i)
+  %0 = extractelement <1 x i32> %vcvtps1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtpd_s64_f64(double %a) {
+; CHECK: test_vcvtpd_s64_f64
+; CHECK: fcvtps {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvtps.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvtps1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %vcvtps.i)
+  %0 = extractelement <1 x i64> %vcvtps1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtpus(float %a) {
+; CHECK: test_vcvtpus
+; CHECK: fcvtpu {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcvtpu.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcvtpu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.v1f32(<1 x float> %vcvtpu.i)
+  %0 = extractelement <1 x i32> %vcvtpu1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtpud(double %a) {
+; CHECK: test_vcvtpud
+; CHECK: fcvtpu {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvtpu.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvtpu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %vcvtpu.i)
+  %0 = extractelement <1 x i64> %vcvtpu1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtss(float %a) {
+; CHECK: test_vcvtss
+; CHECK: fcvtzs {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcvtzs.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcvtzs1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.v1f32(<1 x float> %vcvtzs.i)
+  %0 = extractelement <1 x i32> %vcvtzs1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtd_s64_f64(double %a) {
+; CHECK: test_vcvtd_s64_f64
+; CHECK: fcvtzs {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvzs.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvzs1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %vcvzs.i)
+  %0 = extractelement <1 x i64> %vcvzs1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtus(float %a) {
+; CHECK: test_vcvtus
+; CHECK: fcvtzu {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcvtzu.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcvtzu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.v1f32(<1 x float> %vcvtzu.i)
+  %0 = extractelement <1 x i32> %vcvtzu1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtud(double %a) {
+; CHECK: test_vcvtud
+; CHECK: fcvtzu {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcvtzu.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcvtzu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %vcvtzu.i)
+  %0 = extractelement <1 x i64> %vcvtzu1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double>)
diff --git a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
new file mode 100644
index 0000000..a6e5859
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
@@ -0,0 +1,328 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+;; Scalar Floating-point Compare
+
+define i32 @test_vceqs_f32(float %a, float %b) {
+; CHECK: test_vceqs_f32
+; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vceq.i = insertelement <1 x float> undef, float %a, i32 0
+  %vceq1.i = insertelement <1 x float> undef, float %b, i32 0
+  %vceq2.i = call <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float> %vceq.i, <1 x float> %vceq1.i)
+  %0 = extractelement <1 x i32> %vceq2.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vceqd_f64(double %a, double %b) {
+; CHECK: test_vceqd_f64
+; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vceq.i = insertelement <1 x double> undef, double %a, i32 0
+  %vceq1.i = insertelement <1 x double> undef, double %b, i32 0
+  %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f64(<1 x double> %vceq.i, <1 x double> %vceq1.i)
+  %0 = extractelement <1 x i64> %vceq2.i, i32 0
+  ret i64 %0
+}
+
+define <1 x i64> @test_vceqz_f64(<1 x double> %a) #0 {
+; CHECK: test_vceqz_f64
+; CHECK: fcmeq  {{d[0-9]+}}, {{d[0-9]+}}, #0.0
+entry:
+  %0 = fcmp oeq <1 x double> %a, zeroinitializer
+  %vceqz.i = zext <1 x i1> %0 to <1 x i64>
+  ret <1 x i64> %vceqz.i
+}
+
+define i32 @test_vceqzs_f32(float %a) {
+; CHECK: test_vceqzs_f32
+; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, #0.0
+entry:
+  %vceq.i = insertelement <1 x float> undef, float %a, i32 0
+  %vceq1.i = call <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float> %vceq.i, <1 x float> zeroinitializer)
+  %0 = extractelement <1 x i32> %vceq1.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vceqzd_f64(double %a) {
+; CHECK: test_vceqzd_f64
+; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, #0.0
+entry:
+  %vceq.i = insertelement <1 x double> undef, double %a, i32 0
+  %vceq1.i = tail call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f32(<1 x double> %vceq.i, <1 x float> zeroinitializer) #5
+  %0 = extractelement <1 x i64> %vceq1.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vcges_f32(float %a, float %b) {
+; CHECK: test_vcges_f32
+; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcge.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcge1.i = insertelement <1 x float> undef, float %b, i32 0
+  %vcge2.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> %vcge1.i)
+  %0 = extractelement <1 x i32> %vcge2.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcged_f64(double %a, double %b) {
+; CHECK: test_vcged_f64
+; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcge.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcge1.i = insertelement <1 x double> undef, double %b, i32 0
+  %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double> %vcge.i, <1 x double> %vcge1.i)
+  %0 = extractelement <1 x i64> %vcge2.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vcgezs_f32(float %a) {
+; CHECK: test_vcgezs_f32
+; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, #0.0
+entry:
+  %vcge.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcge1.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> zeroinitializer)
+  %0 = extractelement <1 x i32> %vcge1.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcgezd_f64(double %a) {
+; CHECK: test_vcgezd_f64
+; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, #0.0
+entry:
+  %vcge.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcge1.i = tail call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f32(<1 x double> %vcge.i, <1 x float> zeroinitializer) #5
+  %0 = extractelement <1 x i64> %vcge1.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vcgts_f32(float %a, float %b) {
+; CHECK: test_vcgts_f32
+; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcgt.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcgt1.i = insertelement <1 x float> undef, float %b, i32 0
+  %vcgt2.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> %vcgt1.i)
+  %0 = extractelement <1 x i32> %vcgt2.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcgtd_f64(double %a, double %b) {
+; CHECK: test_vcgtd_f64
+; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcgt.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcgt1.i = insertelement <1 x double> undef, double %b, i32 0
+  %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double> %vcgt.i, <1 x double> %vcgt1.i)
+  %0 = extractelement <1 x i64> %vcgt2.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vcgtzs_f32(float %a) {
+; CHECK: test_vcgtzs_f32
+; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, #0.0
+entry:
+  %vcgt.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcgt1.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> zeroinitializer)
+  %0 = extractelement <1 x i32> %vcgt1.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcgtzd_f64(double %a) {
+; CHECK: test_vcgtzd_f64
+; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, #0.0
+entry:
+  %vcgt.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcgt1.i = tail call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f32(<1 x double> %vcgt.i, <1 x float> zeroinitializer) #5
+  %0 = extractelement <1 x i64> %vcgt1.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vcles_f32(float %a, float %b) {
+; CHECK: test_vcles_f32
+; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcge.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcge1.i = insertelement <1 x float> undef, float %b, i32 0
+  %vcge2.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> %vcge1.i)
+  %0 = extractelement <1 x i32> %vcge2.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcled_f64(double %a, double %b) {
+; CHECK: test_vcled_f64
+; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcge.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcge1.i = insertelement <1 x double> undef, double %b, i32 0
+  %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double> %vcge.i, <1 x double> %vcge1.i)
+  %0 = extractelement <1 x i64> %vcge2.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vclezs_f32(float %a) {
+; CHECK: test_vclezs_f32
+; CHECK: fcmle {{s[0-9]}}, {{s[0-9]}}, #0.0
+entry:
+  %vcle.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcle1.i = call <1 x i32> @llvm.aarch64.neon.vclez.v1i32.v1f32.v1f32(<1 x float> %vcle.i, <1 x float> zeroinitializer)
+  %0 = extractelement <1 x i32> %vcle1.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vclezd_f64(double %a) {
+; CHECK: test_vclezd_f64
+; CHECK: fcmle {{d[0-9]}}, {{d[0-9]}}, #0.0
+entry:
+  %vcle.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcle1.i = tail call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1f64.v1f32(<1 x double> %vcle.i, <1 x float> zeroinitializer) #5
+  %0 = extractelement <1 x i64> %vcle1.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vclts_f32(float %a, float %b) {
+; CHECK: test_vclts_f32
+; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcgt.i = insertelement <1 x float> undef, float %b, i32 0
+  %vcgt1.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcgt2.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> %vcgt1.i)
+  %0 = extractelement <1 x i32> %vcgt2.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcltd_f64(double %a, double %b) {
+; CHECK: test_vcltd_f64
+; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcgt.i = insertelement <1 x double> undef, double %b, i32 0
+  %vcgt1.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double> %vcgt.i, <1 x double> %vcgt1.i)
+  %0 = extractelement <1 x i64> %vcgt2.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vcltzs_f32(float %a) {
+; CHECK: test_vcltzs_f32
+; CHECK: fcmlt {{s[0-9]}}, {{s[0-9]}}, #0.0
+entry:
+  %vclt.i = insertelement <1 x float> undef, float %a, i32 0
+  %vclt1.i = call <1 x i32> @llvm.aarch64.neon.vcltz.v1i32.v1f32.v1f32(<1 x float> %vclt.i, <1 x float> zeroinitializer)
+  %0 = extractelement <1 x i32> %vclt1.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcltzd_f64(double %a) {
+; CHECK: test_vcltzd_f64
+; CHECK: fcmlt {{d[0-9]}}, {{d[0-9]}}, #0.0
+entry:
+  %vclt.i = insertelement <1 x double> undef, double %a, i32 0
+  %vclt1.i = tail call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1f64.v1f32(<1 x double> %vclt.i, <1 x float> zeroinitializer) #5
+  %0 = extractelement <1 x i64> %vclt1.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vcages_f32(float %a, float %b) {
+; CHECK: test_vcages_f32
+; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcage.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcage1.i = insertelement <1 x float> undef, float %b, i32 0
+  %vcage2.i = call <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float> %vcage.i, <1 x float> %vcage1.i)
+  %0 = extractelement <1 x i32> %vcage2.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcaged_f64(double %a, double %b) {
+; CHECK: test_vcaged_f64
+; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcage.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcage1.i = insertelement <1 x double> undef, double %b, i32 0
+  %vcage2.i = call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %vcage.i, <1 x double> %vcage1.i)
+  %0 = extractelement <1 x i64> %vcage2.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vcagts_f32(float %a, float %b) {
+; CHECK: test_vcagts_f32
+; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcagt.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcagt1.i = insertelement <1 x float> undef, float %b, i32 0
+  %vcagt2.i = call <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float> %vcagt.i, <1 x float> %vcagt1.i)
+  %0 = extractelement <1 x i32> %vcagt2.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcagtd_f64(double %a, double %b) {
+; CHECK: test_vcagtd_f64
+; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcagt.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcagt1.i = insertelement <1 x double> undef, double %b, i32 0
+  %vcagt2.i = call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %vcagt.i, <1 x double> %vcagt1.i)
+  %0 = extractelement <1 x i64> %vcagt2.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vcales_f32(float %a, float %b) {
+; CHECK: test_vcales_f32
+; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcage.i = insertelement <1 x float> undef, float %b, i32 0
+  %vcage1.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcage2.i = call <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float> %vcage.i, <1 x float> %vcage1.i)
+  %0 = extractelement <1 x i32> %vcage2.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcaled_f64(double %a, double %b) {
+; CHECK: test_vcaled_f64
+; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcage.i = insertelement <1 x double> undef, double %b, i32 0
+  %vcage1.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcage2.i = call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %vcage.i, <1 x double> %vcage1.i)
+  %0 = extractelement <1 x i64> %vcage2.i, i32 0
+  ret i64 %0
+}
+
+define i32 @test_vcalts_f32(float %a, float %b) {
+; CHECK: test_vcalts_f32
+; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+  %vcalt.i = insertelement <1 x float> undef, float %b, i32 0
+  %vcalt1.i = insertelement <1 x float> undef, float %a, i32 0
+  %vcalt2.i = call <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float> %vcalt.i, <1 x float> %vcalt1.i)
+  %0 = extractelement <1 x i32> %vcalt2.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vcaltd_f64(double %a, double %b) {
+; CHECK: test_vcaltd_f64
+; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+  %vcalt.i = insertelement <1 x double> undef, double %b, i32 0
+  %vcalt1.i = insertelement <1 x double> undef, double %a, i32 0
+  %vcalt2.i = call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %vcalt.i, <1 x double> %vcalt1.i)
+  %0 = extractelement <1 x i64> %vcalt2.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i32> @llvm.aarch64.neon.vclez.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i32> @llvm.aarch64.neon.vcltz.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
diff --git a/test/CodeGen/AArch64/neon-scalar-mul.ll b/test/CodeGen/AArch64/neon-scalar-mul.ll
new file mode 100644
index 0000000..991037f
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-mul.ll
@@ -0,0 +1,143 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) {
+; CHECK: test_vqdmulhh_s16
+; CHECK: sqdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+  %1 = insertelement <1 x i16> undef, i16 %a, i32 0
+  %2 = insertelement <1 x i16> undef, i16 %b, i32 0
+  %3 = call <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16> %1, <1 x i16> %2)
+  %4 = extractelement <1 x i16> %3, i32 0
+  ret i16 %4
+}
+
+define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) {
+; CHECK: test_vqdmulhs_s32
+; CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+  %1 = insertelement <1 x i32> undef, i32 %a, i32 0
+  %2 = insertelement <1 x i32> undef, i32 %b, i32 0
+  %3 = call <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32> %1, <1 x i32> %2)
+  %4 = extractelement <1 x i32> %3, i32 0
+  ret i32 %4
+}
+
+declare <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32>, <1 x i32>)
+
+define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) {
+; CHECK: test_vqrdmulhh_s16
+; CHECK: sqrdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+  %1 = insertelement <1 x i16> undef, i16 %a, i32 0
+  %2 = insertelement <1 x i16> undef, i16 %b, i32 0
+  %3 = call <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16> %1, <1 x i16> %2)
+  %4 = extractelement <1 x i16> %3, i32 0
+  ret i16 %4
+}
+
+define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) {
+; CHECK: test_vqrdmulhs_s32
+; CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+  %1 = insertelement <1 x i32> undef, i32 %a, i32 0
+  %2 = insertelement <1 x i32> undef, i32 %b, i32 0
+  %3 = call <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32> %1, <1 x i32> %2)
+  %4 = extractelement <1 x i32> %3, i32 0
+  ret i32 %4
+}
+
+declare <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32>, <1 x i32>)
+
+define float @test_vmulxs_f32(float %a, float %b) {
+; CHECK: test_vmulxs_f32
+; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+  %1 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %b)
+  ret float %1
+}
+
+define double @test_vmulxd_f64(double %a, double %b) {
+; CHECK: test_vmulxd_f64
+; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+  %1 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %b)
+  ret double %1
+}
+
+declare float @llvm.aarch64.neon.vmulx.f32(float, float)
+declare double @llvm.aarch64.neon.vmulx.f64(double, double)
+
+define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) {
+; CHECK: test_vqdmlalh_s16
+; CHECK: sqdmlal {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+  %vqdmlal.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vqdmlal1.i = insertelement <1 x i16> undef, i16 %b, i32 0
+  %vqdmlal2.i = insertelement <1 x i16> undef, i16 %c, i32 0
+  %vqdmlal3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32> %vqdmlal.i, <1 x i16> %vqdmlal1.i, <1 x i16> %vqdmlal2.i)
+  %0 = extractelement <1 x i32> %vqdmlal3.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) {
+; CHECK: test_vqdmlals_s32
+; CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vqdmlal.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vqdmlal1.i = insertelement <1 x i32> undef, i32 %b, i32 0
+  %vqdmlal2.i = insertelement <1 x i32> undef, i32 %c, i32 0
+  %vqdmlal3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64> %vqdmlal.i, <1 x i32> %vqdmlal1.i, <1 x i32> %vqdmlal2.i)
+  %0 = extractelement <1 x i64> %vqdmlal3.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32>, <1 x i16>, <1 x i16>)
+declare <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64>, <1 x i32>, <1 x i32>)
+
+define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) {
+; CHECK: test_vqdmlslh_s16
+; CHECK: sqdmlsl {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+  %vqdmlsl.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vqdmlsl1.i = insertelement <1 x i16> undef, i16 %b, i32 0
+  %vqdmlsl2.i = insertelement <1 x i16> undef, i16 %c, i32 0
+  %vqdmlsl3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32> %vqdmlsl.i, <1 x i16> %vqdmlsl1.i, <1 x i16> %vqdmlsl2.i)
+  %0 = extractelement <1 x i32> %vqdmlsl3.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) {
+; CHECK: test_vqdmlsls_s32
+; CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vqdmlsl.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vqdmlsl1.i = insertelement <1 x i32> undef, i32 %b, i32 0
+  %vqdmlsl2.i = insertelement <1 x i32> undef, i32 %c, i32 0
+  %vqdmlsl3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64> %vqdmlsl.i, <1 x i32> %vqdmlsl1.i, <1 x i32> %vqdmlsl2.i)
+  %0 = extractelement <1 x i64> %vqdmlsl3.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32>, <1 x i16>, <1 x i16>)
+declare <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64>, <1 x i32>, <1 x i32>)
+
+define i32 @test_vqdmullh_s16(i16 %a, i16 %b) {
+; CHECK: test_vqdmullh_s16
+; CHECK: sqdmull {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+  %vqdmull.i = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vqdmull1.i = insertelement <1 x i16> undef, i16 %b, i32 0
+  %vqdmull2.i = call <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16> %vqdmull.i, <1 x i16> %vqdmull1.i)
+  %0 = extractelement <1 x i32> %vqdmull2.i, i32 0
+  ret i32 %0
+}
+
+define i64 @test_vqdmulls_s32(i32 %a, i32 %b) {
+; CHECK: test_vqdmulls_s32
+; CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vqdmull.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vqdmull1.i = insertelement <1 x i32> undef, i32 %b, i32 0
+  %vqdmull2.i = call <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32> %vqdmull.i, <1 x i32> %vqdmull1.i)
+  %0 = extractelement <1 x i64> %vqdmull2.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16>, <1 x i16>)
+declare <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32>, <1 x i32>)
diff --git a/test/CodeGen/AArch64/neon-scalar-neg.ll b/test/CodeGen/AArch64/neon-scalar-neg.ll
new file mode 100644
index 0000000..4dc9d51
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-neg.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define i64 @test_vnegd_s64(i64 %a) {
+; CHECK: test_vnegd_s64
+; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vneg.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vneg1.i = tail call <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64> %vneg.i)
+  %0 = extractelement <1 x i64> %vneg1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64>)
+
+define i8 @test_vqnegb_s8(i8 %a) {
+; CHECK: test_vqnegb_s8
+; CHECK: sqneg {{b[0-9]+}}, {{b[0-9]+}}
+entry:
+  %vqneg.i = insertelement <1 x i8> undef, i8 %a, i32 0
+  %vqneg1.i = call <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8> %vqneg.i)
+  %0 = extractelement <1 x i8> %vqneg1.i, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8>)
+
+define i16 @test_vqnegh_s16(i16 %a) {
+; CHECK: test_vqnegh_s16
+; CHECK: sqneg {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+  %vqneg.i = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vqneg1.i = call <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16> %vqneg.i)
+  %0 = extractelement <1 x i16> %vqneg1.i, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16>)
+
+define i32 @test_vqnegs_s32(i32 %a) {
+; CHECK: test_vqnegs_s32
+; CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vqneg.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vqneg1.i = call <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32> %vqneg.i)
+  %0 = extractelement <1 x i32> %vqneg1.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32>)
+
+define i64 @test_vqnegd_s64(i64 %a) {
+; CHECK: test_vqnegd_s64
+; CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vqneg.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vqneg1.i = call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %vqneg.i)
+  %0 = extractelement <1 x i64> %vqneg1.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>)
+\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-scalar-recip.ll b/test/CodeGen/AArch64/neon-scalar-recip.ll
new file mode 100644
index 0000000..f21c27b
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-recip.ll
@@ -0,0 +1,116 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define float @test_vrecpss_f32(float %a, float %b) {
+; CHECK: test_vrecpss_f32
+; CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+  %1 = insertelement <1 x float> undef, float %a, i32 0
+  %2 = insertelement <1 x float> undef, float %b, i32 0
+  %3 = call <1 x float> @llvm.arm.neon.vrecps.v1f32(<1 x float> %1, <1 x float> %2)
+  %4 = extractelement <1 x float> %3, i32 0
+  ret float %4
+}
+
+define double @test_vrecpsd_f64(double %a, double %b) {
+; CHECK: test_vrecpsd_f64
+; CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+  %1 = insertelement <1 x double> undef, double %a, i32 0
+  %2 = insertelement <1 x double> undef, double %b, i32 0
+  %3 = call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %1, <1 x double> %2)
+  %4 = extractelement <1 x double> %3, i32 0
+  ret double %4
+}
+
+declare <1 x float> @llvm.arm.neon.vrecps.v1f32(<1 x float>, <1 x float>)
+declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>)
+
+define float @test_vrsqrtss_f32(float %a, float %b) {
+; CHECK: test_vrsqrtss_f32
+; CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+  %1 = insertelement <1 x float> undef, float %a, i32 0
+  %2 = insertelement <1 x float> undef, float %b, i32 0
+  %3 = call <1 x float> @llvm.arm.neon.vrsqrts.v1f32(<1 x float> %1, <1 x float> %2)
+  %4 = extractelement <1 x float> %3, i32 0
+  ret float %4
+}
+
+define double @test_vrsqrtsd_f64(double %a, double %b) {
+; CHECK: test_vrsqrtsd_f64
+; CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+  %1 = insertelement <1 x double> undef, double %a, i32 0
+  %2 = insertelement <1 x double> undef, double %b, i32 0
+  %3 = call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %1, <1 x double> %2)
+  %4 = extractelement <1 x double> %3, i32 0
+  ret double %4
+}
+
+declare <1 x float> @llvm.arm.neon.vrsqrts.v1f32(<1 x float>, <1 x float>)
+declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>)
+
+define float @test_vrecpes_f32(float %a) {
+; CHECK: test_vrecpes_f32
+; CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vrecpe.i = insertelement <1 x float> undef, float %a, i32 0
+  %vrecpe1.i = tail call <1 x float> @llvm.arm.neon.vrecpe.v1f32(<1 x float> %vrecpe.i)
+  %0 = extractelement <1 x float> %vrecpe1.i, i32 0
+  ret float %0
+}
+
+define double @test_vrecped_f64(double %a) {
+; CHECK: test_vrecped_f64
+; CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vrecpe.i = insertelement <1 x double> undef, double %a, i32 0
+  %vrecpe1.i = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %vrecpe.i)
+  %0 = extractelement <1 x double> %vrecpe1.i, i32 0
+  ret double %0
+}
+
+declare <1 x float> @llvm.arm.neon.vrecpe.v1f32(<1 x float>)
+declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>)
+
+define float @test_vrecpxs_f32(float %a) {
+; CHECK: test_vrecpxs_f32
+; CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vrecpx.i = insertelement <1 x float> undef, float %a, i32 0
+  %vrecpx1.i = tail call <1 x float> @llvm.aarch64.neon.vrecpx.v1f32(<1 x float> %vrecpx.i)
+  %0 = extractelement <1 x float> %vrecpx1.i, i32 0
+  ret float %0
+}
+
+define double @test_vrecpxd_f64(double %a) {
+; CHECK: test_vrecpxd_f64
+; CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vrecpx.i = insertelement <1 x double> undef, double %a, i32 0
+  %vrecpx1.i = tail call <1 x double> @llvm.aarch64.neon.vrecpx.v1f64(<1 x double> %vrecpx.i)
+  %0 = extractelement <1 x double> %vrecpx1.i, i32 0
+  ret double %0
+}
+
+declare <1 x float> @llvm.aarch64.neon.vrecpx.v1f32(<1 x float>)
+declare <1 x double> @llvm.aarch64.neon.vrecpx.v1f64(<1 x double>)
+
+define float @test_vrsqrtes_f32(float %a) {
+; CHECK: test_vrsqrtes_f32
+; CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vrsqrte.i = insertelement <1 x float> undef, float %a, i32 0
+  %vrsqrte1.i = tail call <1 x float> @llvm.arm.neon.vrsqrte.v1f32(<1 x float> %vrsqrte.i)
+  %0 = extractelement <1 x float> %vrsqrte1.i, i32 0
+  ret float %0
+}
+
+define double @test_vrsqrted_f64(double %a) {
+; CHECK: test_vrsqrted_f64
+; CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vrsqrte.i = insertelement <1 x double> undef, double %a, i32 0
+  %vrsqrte1.i = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %vrsqrte.i)
+  %0 = extractelement <1 x double> %vrsqrte1.i, i32 0
+  ret double %0
+}
+
+declare <1 x float> @llvm.arm.neon.vrsqrte.v1f32(<1 x float>)
+declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>)
diff --git a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
new file mode 100644
index 0000000..80e8dc3
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
@@ -0,0 +1,247 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64>)
+
+define <1 x i64> @test_addp_v1i64(<2 x i64> %a) {
+; CHECK: test_addp_v1i64:
+        %val = call <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64> %a)
+; CHECK: addp d0, v0.2d
+        ret <1 x i64> %val
+}
+
+declare <1 x float> @llvm.aarch64.neon.vpfadd(<2 x float>)
+
+define <1 x float> @test_faddp_v1f32(<2 x float> %a) {
+; CHECK: test_faddp_v1f32:
+        %val = call <1 x float> @llvm.aarch64.neon.vpfadd(<2 x float> %a)
+; CHECK: faddp s0, v0.2s
+        ret <1 x float> %val
+}
+
+declare <1 x double> @llvm.aarch64.neon.vpfaddq(<2 x double>)
+
+define <1 x double> @test_faddp_v1f64(<2 x double> %a) {
+; CHECK: test_faddp_v1f64:
+        %val = call <1 x double> @llvm.aarch64.neon.vpfaddq(<2 x double> %a)
+; CHECK: faddp d0, v0.2d
+        ret <1 x double> %val
+}
+
+
+declare <1 x float> @llvm.aarch64.neon.vpmax(<2 x float>)
+
+define <1 x float> @test_fmaxp_v1f32(<2 x float> %a) {
+; CHECK: test_fmaxp_v1f32:
+        %val = call <1 x float> @llvm.aarch64.neon.vpmax(<2 x float> %a)
+; CHECK: fmaxp s0, v0.2s
+        ret <1 x float> %val
+}
+
+declare <1 x double> @llvm.aarch64.neon.vpmaxq(<2 x double>)
+
+define <1 x double> @test_fmaxp_v1f64(<2 x double> %a) {
+; CHECK: test_fmaxp_v1f64:
+        %val = call <1 x double> @llvm.aarch64.neon.vpmaxq(<2 x double> %a)
+; CHECK: fmaxp d0, v0.2d
+        ret <1 x double> %val
+}
+
+
+declare <1 x float> @llvm.aarch64.neon.vpmin(<2 x float>)
+
+define <1 x float> @test_fminp_v1f32(<2 x float> %a) {
+; CHECK: test_fminp_v1f32:
+        %val = call <1 x float> @llvm.aarch64.neon.vpmin(<2 x float> %a)
+; CHECK: fminp s0, v0.2s
+        ret <1 x float> %val
+}
+
+declare <1 x double> @llvm.aarch64.neon.vpminq(<2 x double>)
+
+define <1 x double> @test_fminp_v1f64(<2 x double> %a) {
+; CHECK: test_fminp_v1f64:
+        %val = call <1 x double> @llvm.aarch64.neon.vpminq(<2 x double> %a)
+; CHECK: fminp d0, v0.2d
+        ret <1 x double> %val
+}
+
+declare <1 x float> @llvm.aarch64.neon.vpfmaxnm(<2 x float>)
+
+define <1 x float> @test_fmaxnmp_v1f32(<2 x float> %a) {
+; CHECK: test_fmaxnmp_v1f32:
+        %val = call <1 x float> @llvm.aarch64.neon.vpfmaxnm(<2 x float> %a)
+; CHECK: fmaxnmp s0, v0.2s
+        ret <1 x float> %val
+}
+
+declare <1 x double> @llvm.aarch64.neon.vpfmaxnmq(<2 x double>)
+
+define <1 x double> @test_fmaxnmp_v1f64(<2 x double> %a) {
+; CHECK: test_fmaxnmp_v1f64:
+        %val = call <1 x double> @llvm.aarch64.neon.vpfmaxnmq(<2 x double> %a)
+; CHECK: fmaxnmp d0, v0.2d
+        ret <1 x double> %val
+}
+
+declare <1 x float> @llvm.aarch64.neon.vpfminnm(<2 x float>)
+
+define <1 x float> @test_fminnmp_v1f32(<2 x float> %a) {
+; CHECK: test_fminnmp_v1f32:
+        %val = call <1 x float> @llvm.aarch64.neon.vpfminnm(<2 x float> %a)
+; CHECK: fminnmp s0, v0.2s
+        ret <1 x float> %val
+}
+
+declare <1 x double> @llvm.aarch64.neon.vpfminnmq(<2 x double>)
+
+define <1 x double> @test_fminnmp_v1f64(<2 x double> %a) {
+; CHECK: test_fminnmp_v1f64:
+        %val = call <1 x double> @llvm.aarch64.neon.vpfminnmq(<2 x double> %a)
+; CHECK: fminnmp d0, v0.2d
+        ret <1 x double> %val
+}
+
+define float @test_vaddv_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vaddv_f32
+; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
+  %1 = tail call <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v2f32(<2 x float> %a)
+  %2 = extractelement <1 x float> %1, i32 0
+  ret float %2
+}
+
+define float @test_vaddvq_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vaddvq_f32
+; CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
+  %1 = tail call <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v4f32(<4 x float> %a)
+  %2 = extractelement <1 x float> %1, i32 0
+  ret float %2
+}
+
+define double @test_vaddvq_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vaddvq_f64
+; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
+  %1 = tail call <1 x double> @llvm.aarch64.neon.vaddv.v1f64.v2f64(<2 x double> %a)
+  %2 = extractelement <1 x double> %1, i32 0
+  ret double %2
+}
+
+define float @test_vmaxv_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vmaxv_f32
+; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
+  %1 = tail call <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v2f32(<2 x float> %a)
+  %2 = extractelement <1 x float> %1, i32 0
+  ret float %2
+}
+
+define double @test_vmaxvq_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vmaxvq_f64
+; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
+  %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxv.v1f64.v2f64(<2 x double> %a)
+  %2 = extractelement <1 x double> %1, i32 0
+  ret double %2
+}
+
+define float @test_vminv_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vminv_f32
+; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
+  %1 = tail call <1 x float> @llvm.aarch64.neon.vminv.v1f32.v2f32(<2 x float> %a)
+  %2 = extractelement <1 x float> %1, i32 0
+  ret float %2
+}
+
+define double @test_vminvq_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vminvq_f64
+; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
+  %1 = tail call <1 x double> @llvm.aarch64.neon.vminv.v1f64.v2f64(<2 x double> %a)
+  %2 = extractelement <1 x double> %1, i32 0
+  ret double %2
+}
+
+define double @test_vmaxnmvq_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vmaxnmvq_f64
+; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
+  %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnmv.v1f64.v2f64(<2 x double> %a)
+  %2 = extractelement <1 x double> %1, i32 0
+  ret double %2
+}
+
+define float @test_vmaxnmv_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vmaxnmv_f32
+; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
+  %1 = tail call <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v2f32(<2 x float> %a)
+  %2 = extractelement <1 x float> %1, i32 0
+  ret float %2
+}
+
+define double @test_vminnmvq_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vminnmvq_f64
+; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
+  %1 = tail call <1 x double> @llvm.aarch64.neon.vminnmv.v1f64.v2f64(<2 x double> %a)
+  %2 = extractelement <1 x double> %1, i32 0
+  ret double %2
+}
+
+define float @test_vminnmv_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vminnmv_f32
+; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
+  %1 = tail call <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v2f32(<2 x float> %a)
+  %2 = extractelement <1 x float> %1, i32 0
+  ret float %2
+}
+
+define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vpaddq_s64
+; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+  %1 = tail call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vpaddq_u64
+; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+  %1 = tail call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %1
+}
+
+define i64 @test_vaddvq_s64(<2 x i64> %a) {
+; CHECK-LABEL: test_vaddvq_s64
+; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
+  %2 = extractelement <1 x i64> %1, i32 0
+  ret i64 %2
+}
+
+define i64 @test_vaddvq_u64(<2 x i64> %a) {
+; CHECK-LABEL: test_vaddvq_u64
+; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
+  %2 = extractelement <1 x i64> %1, i32 0
+  ret i64 %2
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64>)
+
+declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>)
+
+declare <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v2f32(<2 x float>)
+
+declare <1 x double> @llvm.aarch64.neon.vminnmv.v1f64.v2f64(<2 x double>)
+
+declare <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v2f32(<2 x float>)
+
+declare <1 x double> @llvm.aarch64.neon.vmaxnmv.v1f64.v2f64(<2 x double>)
+
+declare <1 x double> @llvm.aarch64.neon.vminv.v1f64.v2f64(<2 x double>)
+
+declare <1 x float> @llvm.aarch64.neon.vminv.v1f32.v2f32(<2 x float>)
+
+declare <1 x double> @llvm.aarch64.neon.vmaxv.v1f64.v2f64(<2 x double>)
+
+declare <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v2f32(<2 x float>)
+
+declare <1 x double> @llvm.aarch64.neon.vaddv.v1f64.v2f64(<2 x double>)
+
+declare <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v4f32(<4 x float>)
+
+declare <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v2f32(<2 x float>)
+\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
new file mode 100644
index 0000000..83ceb4e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+
+declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_urshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: urshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_srshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: srshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_urshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_urshl_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: urshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_srshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_srshl_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: srshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+
+
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
new file mode 100644
index 0000000..bd66f80
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
@@ -0,0 +1,242 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8>, <1 x i8>)
+declare <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8>, <1 x i8>)
+
+define <1 x i8> @test_uqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_uqadd_v1i8_aarch64:
+  %tmp1 = call <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+;CHECK: uqadd {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+  ret <1 x i8> %tmp1
+}
+
+define <1 x i8> @test_sqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_sqadd_v1i8_aarch64:
+  %tmp1 = call <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+;CHECK: sqadd {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+  ret <1 x i8> %tmp1
+}
+
+declare <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8>, <1 x i8>)
+declare <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8>, <1 x i8>)
+
+define <1 x i8> @test_uqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_uqsub_v1i8_aarch64:
+  %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+;CHECK: uqsub {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+  ret <1 x i8> %tmp1
+}
+
+define <1 x i8> @test_sqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_sqsub_v1i8_aarch64:
+  %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+;CHECK: sqsub {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+  ret <1 x i8> %tmp1
+}
+
+declare <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16>, <1 x i16>)
+
+define <1 x i16> @test_uqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_uqadd_v1i16_aarch64:
+  %tmp1 = call <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+;CHECK: uqadd {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+  ret <1 x i16> %tmp1
+}
+
+define <1 x i16> @test_sqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_sqadd_v1i16_aarch64:
+  %tmp1 = call <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+;CHECK: sqadd {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+  ret <1 x i16> %tmp1
+}
+
+declare <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16>, <1 x i16>)
+
+define <1 x i16> @test_uqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_uqsub_v1i16_aarch64:
+  %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+;CHECK: uqsub {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+  ret <1 x i16> %tmp1
+}
+
+define <1 x i16> @test_sqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_sqsub_v1i16_aarch64:
+  %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+;CHECK: sqsub {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+  ret <1 x i16> %tmp1
+}
+
+declare <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32>, <1 x i32>)
+declare <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32>, <1 x i32>)
+
+define <1 x i32> @test_uqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_uqadd_v1i32_aarch64:
+  %tmp1 = call <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+;CHECK: uqadd {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+  ret <1 x i32> %tmp1
+}
+
+define <1 x i32> @test_sqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_sqadd_v1i32_aarch64:
+  %tmp1 = call <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+;CHECK: sqadd {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+  ret <1 x i32> %tmp1
+}
+
+declare <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32>, <1 x i32>)
+declare <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32>, <1 x i32>)
+
+define <1 x i32> @test_uqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_uqsub_v1i32_aarch64:
+  %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+;CHECK: uqsub {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+  ret <1 x i32> %tmp1
+}
+
+
+define <1 x i32> @test_sqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_sqsub_v1i32_aarch64:
+  %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+;CHECK: sqsub {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+  ret <1 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqadd_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: uqadd {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqadd_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: sqadd {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqsub_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: uqsub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqsub_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: sqsub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+define i8 @test_vuqaddb_s8(i8 %a, i8 %b) {
+; CHECK: test_vuqaddb_s8
+; CHECK: suqadd {{b[0-9]+}}, {{b[0-9]+}}
+entry:
+  %vuqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0
+  %vuqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0
+  %vuqadd2.i = call <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8> %vuqadd.i, <1 x i8> %vuqadd1.i)
+  %0 = extractelement <1 x i8> %vuqadd2.i, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>)
+
+define i16 @test_vuqaddh_s16(i16 %a, i16 %b) {
+; CHECK: test_vuqaddh_s16
+; CHECK: suqadd {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+  %vuqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vuqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0
+  %vuqadd2.i = call <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16> %vuqadd.i, <1 x i16> %vuqadd1.i)
+  %0 = extractelement <1 x i16> %vuqadd2.i, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>)
+
+define i32 @test_vuqadds_s32(i32 %a, i32 %b) {
+; CHECK: test_vuqadds_s32
+; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vuqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vuqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0
+  %vuqadd2.i = call <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32> %vuqadd.i, <1 x i32> %vuqadd1.i)
+  %0 = extractelement <1 x i32> %vuqadd2.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32>, <1 x i32>)
+
+define i64 @test_vuqaddd_s64(i64 %a, i64 %b) {
+; CHECK: test_vuqaddd_s64
+; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vuqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vuqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vuqadd2.i = call <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64> %vuqadd.i, <1 x i64> %vuqadd1.i)
+  %0 = extractelement <1 x i64> %vuqadd2.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64>, <1 x i64>)
+
+define i8 @test_vsqaddb_u8(i8 %a, i8 %b) {
+; CHECK: test_vsqaddb_u8
+; CHECK: usqadd {{b[0-9]+}}, {{b[0-9]+}}
+entry:
+  %vsqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0
+  %vsqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0
+  %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %vsqadd.i, <1 x i8> %vsqadd1.i)
+  %0 = extractelement <1 x i8> %vsqadd2.i, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8>, <1 x i8>)
+
+define i16 @test_vsqaddh_u16(i16 %a, i16 %b) {
+; CHECK: test_vsqaddh_u16
+; CHECK: usqadd {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+  %vsqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vsqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0
+  %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %vsqadd.i, <1 x i16> %vsqadd1.i)
+  %0 = extractelement <1 x i16> %vsqadd2.i, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16>, <1 x i16>)
+
+define i32 @test_vsqadds_u32(i32 %a, i32 %b) {
+; CHECK: test_vsqadds_u32
+; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+  %vsqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vsqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0
+  %vsqadd2.i = call <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32> %vsqadd.i, <1 x i32> %vsqadd1.i)
+  %0 = extractelement <1 x i32> %vsqadd2.i, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32>, <1 x i32>)
+
+define i64 @test_vsqaddd_u64(i64 %a, i64 %b) {
+; CHECK: test_vsqaddd_u64
+; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+  %vsqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vsqadd2.i = call <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64> %vsqadd.i, <1 x i64> %vsqadd1.i)
+  %0 = extractelement <1 x i64> %vsqadd2.i, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64>, <1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
new file mode 100644
index 0000000..0fd67df
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
@@ -0,0 +1,94 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqrshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: uqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqrshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: sqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8>, <1 x i8>)
+declare <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8>, <1 x i8>)
+
+define <1 x i8> @test_uqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_uqrshl_v1i8_aarch64:
+  %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+;CHECK: uqrshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+
+  ret <1 x i8> %tmp1
+}
+
+define <1 x i8> @test_sqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_sqrshl_v1i8_aarch64:
+  %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+;CHECK: sqrshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+  ret <1 x i8> %tmp1
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16>, <1 x i16>)
+
+define <1 x i16> @test_uqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_uqrshl_v1i16_aarch64:
+  %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+;CHECK: uqrshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+
+  ret <1 x i16> %tmp1
+}
+
+define <1 x i16> @test_sqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_sqrshl_v1i16_aarch64:
+  %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+;CHECK: sqrshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+  ret <1 x i16> %tmp1
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32>, <1 x i32>)
+declare <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32>, <1 x i32>)
+
+define <1 x i32> @test_uqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_uqrshl_v1i32_aarch64:
+  %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+;CHECK: uqrshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+
+  ret <1 x i32> %tmp1
+}
+
+define <1 x i32> @test_sqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_sqrshl_v1i32_aarch64:
+  %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+;CHECK: sqrshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+  ret <1 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqrshl_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: uqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqrshl_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: sqrshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+
+
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
new file mode 100644
index 0000000..8fdea24
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
@@ -0,0 +1,88 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: uqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: sqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8>, <1 x i8>)
+declare <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8>, <1 x i8>)
+
+define <1 x i8> @test_uqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_uqshl_v1i8_aarch64:
+  %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+;CHECK: uqshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+  ret <1 x i8> %tmp1
+}
+
+define <1 x i8> @test_sqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_sqshl_v1i8_aarch64:
+  %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+;CHECK: sqshl {{b[0-31]+}}, {{b[0-31]+}}, {{b[0-31]+}}
+  ret <1 x i8> %tmp1
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16>, <1 x i16>)
+
+define <1 x i16> @test_uqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_uqshl_v1i16_aarch64:
+  %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+;CHECK: uqshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+  ret <1 x i16> %tmp1
+}
+
+define <1 x i16> @test_sqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_sqshl_v1i16_aarch64:
+  %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+;CHECK: sqshl {{h[0-31]+}}, {{h[0-31]+}}, {{h[0-31]+}}
+  ret <1 x i16> %tmp1
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32>, <1 x i32>)
+declare <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32>, <1 x i32>)
+
+define <1 x i32> @test_uqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_uqshl_v1i32_aarch64:
+  %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+;CHECK: uqshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+  ret <1 x i32> %tmp1
+}
+
+define <1 x i32> @test_sqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_sqshl_v1i32_aarch64:
+  %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+;CHECK: sqshl {{s[0-31]+}}, {{s[0-31]+}}, {{s[0-31]+}}
+  ret <1 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqshl_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: uqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqshl_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: sqshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll
new file mode 100644
index 0000000..6224361
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll
@@ -0,0 +1,531 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define i64 @test_vshrd_n_s64(i64 %a) {
+; CHECK: test_vshrd_n_s64
+; CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vsshr = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsshr1 = call <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64> %vsshr, i32 63)
+  %0 = extractelement <1 x i64> %vsshr1, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64>, i32)
+
+define i64 @test_vshrd_n_u64(i64 %a) {
+; CHECK: test_vshrd_n_u64
+; CHECK: ushr {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vushr = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vushr1 = call <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64> %vushr, i32 63)
+  %0 = extractelement <1 x i64> %vushr1, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64>, i32)
+
+define i64 @test_vrshrd_n_s64(i64 %a) {
+; CHECK: test_vrshrd_n_s64
+; CHECK: srshr {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vsrshr = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsrshr1 = call <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64> %vsrshr, i32 63)
+  %0 = extractelement <1 x i64> %vsrshr1, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64>, i32)
+
+define i64 @test_vrshrd_n_u64(i64 %a) {
+; CHECK: test_vrshrd_n_u64
+; CHECK: urshr {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vurshr = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vurshr1 = call <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64> %vurshr, i32 63)
+  %0 = extractelement <1 x i64> %vurshr1, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64>, i32)
+
+define i64 @test_vsrad_n_s64(i64 %a, i64 %b) {
+; CHECK: test_vsrad_n_s64
+; CHECK: ssra {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vssra = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vssra1 = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vssra2 = call <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64> %vssra, <1 x i64> %vssra1, i32 63)
+  %0 = extractelement <1 x i64> %vssra2, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vsrad_n_u64(i64 %a, i64 %b) {
+; CHECK: test_vsrad_n_u64
+; CHECK: usra {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vusra = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vusra1 = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vusra2 = call <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64> %vusra, <1 x i64> %vusra1, i32 63)
+  %0 = extractelement <1 x i64> %vusra2, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) {
+; CHECK: test_vrsrad_n_s64
+; CHECK: srsra {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vsrsra = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsrsra1 = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vsrsra2 = call <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64> %vsrsra, <1 x i64> %vsrsra1, i32 63)
+  %0 = extractelement <1 x i64> %vsrsra2, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) {
+; CHECK: test_vrsrad_n_u64
+; CHECK: ursra {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vursra = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vursra1 = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vursra2 = call <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64> %vursra, <1 x i64> %vursra1, i32 63)
+  %0 = extractelement <1 x i64> %vursra2, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vshld_n_s64(i64 %a) {
+; CHECK: test_vshld_n_s64
+; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vshl = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63)
+  %0 = extractelement <1 x i64> %vshl1, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64>, i32)
+
+define i64 @test_vshld_n_u64(i64 %a) {
+; CHECK: test_vshld_n_u64
+; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vshl = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63)
+  %0 = extractelement <1 x i64> %vshl1, i32 0
+  ret i64 %0
+}
+
+define i8 @test_vqshlb_n_s8(i8 %a) {
+; CHECK: test_vqshlb_n_s8
+; CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, #7
+entry:
+  %vsqshl = insertelement <1 x i8> undef, i8 %a, i32 0
+  %vsqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8> %vsqshl, i32 7)
+  %0 = extractelement <1 x i8> %vsqshl1, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8>, i32)
+
+define i16 @test_vqshlh_n_s16(i16 %a) {
+; CHECK: test_vqshlh_n_s16
+; CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, #15
+entry:
+  %vsqshl = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vsqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16> %vsqshl, i32 15)
+  %0 = extractelement <1 x i16> %vsqshl1, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16>, i32)
+
+define i32 @test_vqshls_n_s32(i32 %a) {
+; CHECK: test_vqshls_n_s32
+; CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
+entry:
+  %vsqshl = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vsqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32> %vsqshl, i32 31)
+  %0 = extractelement <1 x i32> %vsqshl1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32>, i32)
+
+define i64 @test_vqshld_n_s64(i64 %a) {
+; CHECK: test_vqshld_n_s64
+; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vsqshl = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64> %vsqshl, i32 63)
+  %0 = extractelement <1 x i64> %vsqshl1, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64>, i32)
+
+define i8 @test_vqshlb_n_u8(i8 %a) {
+; CHECK: test_vqshlb_n_u8
+; CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, #7
+entry:
+  %vuqshl = insertelement <1 x i8> undef, i8 %a, i32 0
+  %vuqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8> %vuqshl, i32 7)
+  %0 = extractelement <1 x i8> %vuqshl1, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8>, i32)
+
+define i16 @test_vqshlh_n_u16(i16 %a) {
+; CHECK: test_vqshlh_n_u16
+; CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, #15
+entry:
+  %vuqshl = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vuqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16> %vuqshl, i32 15)
+  %0 = extractelement <1 x i16> %vuqshl1, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16>, i32)
+
+define i32 @test_vqshls_n_u32(i32 %a) {
+; CHECK: test_vqshls_n_u32
+; CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
+entry:
+  %vuqshl = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vuqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32> %vuqshl, i32 31)
+  %0 = extractelement <1 x i32> %vuqshl1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32>, i32)
+
+define i64 @test_vqshld_n_u64(i64 %a) {
+; CHECK: test_vqshld_n_u64
+; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vuqshl = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vuqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64> %vuqshl, i32 63)
+  %0 = extractelement <1 x i64> %vuqshl1, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64>, i32)
+
+define i8 @test_vqshlub_n_s8(i8 %a) {
+; CHECK: test_vqshlub_n_s8
+; CHECK: sqshlu {{b[0-9]+}}, {{b[0-9]+}}, #7
+entry:
+  %vsqshlu = insertelement <1 x i8> undef, i8 %a, i32 0
+  %vsqshlu1 = call <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8> %vsqshlu, i32 7)
+  %0 = extractelement <1 x i8> %vsqshlu1, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8>, i32)
+
+define i16 @test_vqshluh_n_s16(i16 %a) {
+; CHECK: test_vqshluh_n_s16
+; CHECK: sqshlu {{h[0-9]+}}, {{h[0-9]+}}, #15
+entry:
+  %vsqshlu = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vsqshlu1 = call <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16> %vsqshlu, i32 15)
+  %0 = extractelement <1 x i16> %vsqshlu1, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16>, i32)
+
+define i32 @test_vqshlus_n_s32(i32 %a) {
+; CHECK: test_vqshlus_n_s32
+; CHECK: sqshlu {{s[0-9]+}}, {{s[0-9]+}}, #31
+entry:
+  %vsqshlu = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vsqshlu1 = call <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32> %vsqshlu, i32 31)
+  %0 = extractelement <1 x i32> %vsqshlu1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32>, i32)
+
+define i64 @test_vqshlud_n_s64(i64 %a) {
+; CHECK: test_vqshlud_n_s64
+; CHECK: sqshlu {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vsqshlu = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsqshlu1 = call <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64> %vsqshlu, i32 63)
+  %0 = extractelement <1 x i64> %vsqshlu1, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64>, i32)
+
+define i64 @test_vsrid_n_s64(i64 %a, i64 %b) {
+; CHECK: test_vsrid_n_s64
+; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vsri = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63)
+  %0 = extractelement <1 x i64> %vsri2, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vsrid_n_u64(i64 %a, i64 %b) {
+; CHECK: test_vsrid_n_u64
+; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vsri = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63)
+  %0 = extractelement <1 x i64> %vsri2, i32 0
+  ret i64 %0
+}
+
+define i64 @test_vslid_n_s64(i64 %a, i64 %b) {
+; CHECK: test_vslid_n_s64
+; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vsli = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63)
+  %0 = extractelement <1 x i64> %vsli2, i32 0
+  ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vslid_n_u64(i64 %a, i64 %b) {
+; CHECK: test_vslid_n_u64
+; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+  %vsli = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63)
+  %0 = extractelement <1 x i64> %vsli2, i32 0
+  ret i64 %0
+}
+
+define i8 @test_vqshrnh_n_s16(i16 %a) {
+; CHECK: test_vqshrnh_n_s16
+; CHECK: sqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+  %vsqshrn = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vsqshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16> %vsqshrn, i32 8)
+  %0 = extractelement <1 x i8> %vsqshrn1, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqshrns_n_s32(i32 %a) {
+; CHECK: test_vqshrns_n_s32
+; CHECK: sqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+  %vsqshrn = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vsqshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32> %vsqshrn, i32 16)
+  %0 = extractelement <1 x i16> %vsqshrn1, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqshrnd_n_s64(i64 %a) {
+; CHECK: test_vqshrnd_n_s64
+; CHECK: sqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+  %vsqshrn = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsqshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64> %vsqshrn, i32 32)
+  %0 = extractelement <1 x i32> %vsqshrn1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64>, i32)
+
+define i8 @test_vqshrnh_n_u16(i16 %a) {
+; CHECK: test_vqshrnh_n_u16
+; CHECK: uqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+  %vuqshrn = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vuqshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16> %vuqshrn, i32 8)
+  %0 = extractelement <1 x i8> %vuqshrn1, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqshrns_n_u32(i32 %a) {
+; CHECK: test_vqshrns_n_u32
+; CHECK: uqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+  %vuqshrn = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vuqshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32> %vuqshrn, i32 16)
+  %0 = extractelement <1 x i16> %vuqshrn1, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqshrnd_n_u64(i64 %a) {
+; CHECK: test_vqshrnd_n_u64
+; CHECK: uqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+  %vuqshrn = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vuqshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64> %vuqshrn, i32 32)
+  %0 = extractelement <1 x i32> %vuqshrn1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64>, i32)
+
+define i8 @test_vqrshrnh_n_s16(i16 %a) {
+; CHECK: test_vqrshrnh_n_s16
+; CHECK: sqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+  %vsqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vsqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16> %vsqrshrn, i32 8)
+  %0 = extractelement <1 x i8> %vsqrshrn1, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqrshrns_n_s32(i32 %a) {
+; CHECK: test_vqrshrns_n_s32
+; CHECK: sqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+  %vsqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vsqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32> %vsqrshrn, i32 16)
+  %0 = extractelement <1 x i16> %vsqrshrn1, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqrshrnd_n_s64(i64 %a) {
+; CHECK: test_vqrshrnd_n_s64
+; CHECK: sqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+  %vsqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64> %vsqrshrn, i32 32)
+  %0 = extractelement <1 x i32> %vsqrshrn1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64>, i32)
+
+define i8 @test_vqrshrnh_n_u16(i16 %a) {
+; CHECK: test_vqrshrnh_n_u16
+; CHECK: uqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+  %vuqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vuqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16> %vuqrshrn, i32 8)
+  %0 = extractelement <1 x i8> %vuqrshrn1, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqrshrns_n_u32(i32 %a) {
+; CHECK: test_vqrshrns_n_u32
+; CHECK: uqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+  %vuqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vuqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %vuqrshrn, i32 16)
+  %0 = extractelement <1 x i16> %vuqrshrn1, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqrshrnd_n_u64(i64 %a) {
+; CHECK: test_vqrshrnd_n_u64
+; CHECK: uqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+  %vuqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vuqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64> %vuqrshrn, i32 32)
+  %0 = extractelement <1 x i32> %vuqrshrn1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64>, i32)
+
+define i8 @test_vqshrunh_n_s16(i16 %a) {
+; CHECK: test_vqshrunh_n_s16
+; CHECK: sqshrun {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+  %vsqshrun = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vsqshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16> %vsqshrun, i32 8)
+  %0 = extractelement <1 x i8> %vsqshrun1, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqshruns_n_s32(i32 %a) {
+; CHECK: test_vqshruns_n_s32
+; CHECK: sqshrun {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+  %vsqshrun = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vsqshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32> %vsqshrun, i32 16)
+  %0 = extractelement <1 x i16> %vsqshrun1, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqshrund_n_s64(i64 %a) {
+; CHECK: test_vqshrund_n_s64
+; CHECK: sqshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+  %vsqshrun = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsqshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64> %vsqshrun, i32 32)
+  %0 = extractelement <1 x i32> %vsqshrun1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64>, i32)
+
+define i8 @test_vqrshrunh_n_s16(i16 %a) {
+; CHECK: test_vqrshrunh_n_s16
+; CHECK: sqrshrun {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+  %vsqrshrun = insertelement <1 x i16> undef, i16 %a, i32 0
+  %vsqrshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16> %vsqrshrun, i32 8)
+  %0 = extractelement <1 x i8> %vsqrshrun1, i32 0
+  ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqrshruns_n_s32(i32 %a) {
+; CHECK: test_vqrshruns_n_s32
+; CHECK: sqrshrun {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+  %vsqrshrun = insertelement <1 x i32> undef, i32 %a, i32 0
+  %vsqrshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32> %vsqrshrun, i32 16)
+  %0 = extractelement <1 x i16> %vsqrshrun1, i32 0
+  ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqrshrund_n_s64(i64 %a) {
+; CHECK: test_vqrshrund_n_s64
+; CHECK: sqrshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+  %vsqrshrun = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vsqrshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64> %vsqrshrun, i32 32)
+  %0 = extractelement <1 x i32> %vsqrshrun1, i32 0
+  ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64>, i32)
diff --git a/test/CodeGen/AArch64/neon-scalar-shift.ll b/test/CodeGen/AArch64/neon-scalar-shift.ll
new file mode 100644
index 0000000..1222be5
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-shift.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_ushl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: ushl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_ushl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_ushl_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: ushl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sshl_v1i64_aarch64:
+  %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sshl {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+  ret <1 x i64> %tmp1
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-shift-left-long.ll b/test/CodeGen/AArch64/neon-shift-left-long.ll
new file mode 100644
index 0000000..d45c476
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-shift-left-long.ll
@@ -0,0 +1,193 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i16> @test_sshll_v8i8(<8 x i8> %a) {
+; CHECK: test_sshll_v8i8:
+; CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3
+  %1 = sext <8 x i8> %a to <8 x i16>
+  %tmp = shl <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_sshll_v4i16(<4 x i16> %a) {
+; CHECK: test_sshll_v4i16:
+; CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9
+  %1 = sext <4 x i16> %a to <4 x i32>
+  %tmp = shl <4 x i32> %1, <i32 9, i32 9, i32 9, i32 9>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_sshll_v2i32(<2 x i32> %a) {
+; CHECK: test_sshll_v2i32:
+; CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19
+  %1 = sext <2 x i32> %a to <2 x i64>
+  %tmp = shl <2 x i64> %1, <i64 19, i64 19>
+  ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_ushll_v8i8(<8 x i8> %a) {
+; CHECK: test_ushll_v8i8:
+; CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3
+  %1 = zext <8 x i8> %a to <8 x i16>
+  %tmp = shl <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_ushll_v4i16(<4 x i16> %a) {
+; CHECK: test_ushll_v4i16:
+; CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9
+  %1 = zext <4 x i16> %a to <4 x i32>
+  %tmp = shl <4 x i32> %1, <i32 9, i32 9, i32 9, i32 9>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_ushll_v2i32(<2 x i32> %a) {
+; CHECK: test_ushll_v2i32:
+; CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19
+  %1 = zext <2 x i32> %a to <2 x i64>
+  %tmp = shl <2 x i64> %1, <i64 19, i64 19>
+  ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_sshll2_v16i8(<16 x i8> %a) {
+; CHECK: test_sshll2_v16i8:
+; CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3
+  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %tmp = shl <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_sshll2_v8i16(<8 x i16> %a) {
+; CHECK: test_sshll2_v8i16:
+; CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %tmp = shl <4 x i32> %2, <i32 9, i32 9, i32 9, i32 9>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_sshll2_v4i32(<4 x i32> %a) {
+; CHECK: test_sshll2_v4i32:
+; CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %2 = sext <2 x i32> %1 to <2 x i64>
+  %tmp = shl <2 x i64> %2, <i64 19, i64 19>
+  ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_ushll2_v16i8(<16 x i8> %a) {
+; CHECK: test_ushll2_v16i8:
+; CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3
+  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %tmp = shl <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_ushll2_v8i16(<8 x i16> %a) {
+; CHECK: test_ushll2_v8i16:
+; CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %tmp = shl <4 x i32> %2, <i32 9, i32 9, i32 9, i32 9>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_ushll2_v4i32(<4 x i32> %a) {
+; CHECK: test_ushll2_v4i32:
+; CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %2 = zext <2 x i32> %1 to <2 x i64>
+  %tmp = shl <2 x i64> %2, <i64 19, i64 19>
+  ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_sshll_shl0_v8i8(<8 x i8> %a) {
+; CHECK: test_sshll_shl0_v8i8:
+; CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
+  %tmp = sext <8 x i8> %a to <8 x i16>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_sshll_shl0_v4i16(<4 x i16> %a) {
+; CHECK: test_sshll_shl0_v4i16:
+; CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
+  %tmp = sext <4 x i16> %a to <4 x i32>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_sshll_shl0_v2i32(<2 x i32> %a) {
+; CHECK: test_sshll_shl0_v2i32:
+; CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
+  %tmp = sext <2 x i32> %a to <2 x i64>
+  ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_ushll_shl0_v8i8(<8 x i8> %a) {
+; CHECK: test_ushll_shl0_v8i8:
+; CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
+  %tmp = zext <8 x i8> %a to <8 x i16>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_ushll_shl0_v4i16(<4 x i16> %a) {
+; CHECK: test_ushll_shl0_v4i16:
+; CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
+  %tmp = zext <4 x i16> %a to <4 x i32>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_ushll_shl0_v2i32(<2 x i32> %a) {
+; CHECK: test_ushll_shl0_v2i32:
+; CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
+  %tmp = zext <2 x i32> %a to <2 x i64>
+  ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_sshll2_shl0_v16i8(<16 x i8> %a) {
+; CHECK: test_sshll2_shl0_v16i8:
+; CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
+  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tmp = sext <8 x i8> %1 to <8 x i16>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_sshll2_shl0_v8i16(<8 x i16> %a) {
+; CHECK: test_sshll2_shl0_v8i16:
+; CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %tmp = sext <4 x i16> %1 to <4 x i32>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_sshll2_shl0_v4i32(<4 x i32> %a) {
+; CHECK: test_sshll2_shl0_v4i32:
+; CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %tmp = sext <2 x i32> %1 to <2 x i64>
+  ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_ushll2_shl0_v16i8(<16 x i8> %a) {
+; CHECK: test_ushll2_shl0_v16i8:
+; CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
+  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tmp = zext <8 x i8> %1 to <8 x i16>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_ushll2_shl0_v8i16(<8 x i16> %a) {
+; CHECK: test_ushll2_shl0_v8i16:
+; CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
+  %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %tmp = zext <4 x i16> %1 to <4 x i32>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_ushll2_shl0_v4i32(<4 x i32> %a) {
+; CHECK: test_ushll2_shl0_v4i32:
+; CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
+  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %tmp = zext <2 x i32> %1 to <2 x i64>
+  ret <2 x i64> %tmp
+}
diff --git a/test/CodeGen/AArch64/neon-shift.ll b/test/CodeGen/AArch64/neon-shift.ll
index 45a2605..33b04ce 100644
--- a/test/CodeGen/AArch64/neon-shift.ll
+++ b/test/CodeGen/AArch64/neon-shift.ll
@@ -102,23 +102,6 @@ define <4 x i32> @test_sshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
   ret <4 x i32> %tmp1
 }
 
-declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_ushl_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: ushl d0, d0, d1
-  ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sshl_v1i64:
-  %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sshl d0, d0, d1
-  ret <1 x i64> %tmp1
-}
-
 declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>)
 declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>)
 
@@ -137,4 +120,52 @@ define <2 x i64> @test_sshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 }
 
 
+define <8 x i8> @test_shl_v8i8(<8 x i8> %a) {
+; CHECK: test_shl_v8i8:
+; CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %tmp = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <8 x i8> %tmp
+}
+
+define <4 x i16> @test_shl_v4i16(<4 x i16> %a) {
+; CHECK: test_shl_v4i16:
+; CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %tmp = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+  ret <4 x i16> %tmp
+}
+
+define <2 x i32> @test_shl_v2i32(<2 x i32> %a) {
+; CHECK: test_shl_v2i32:
+; CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %tmp = shl <2 x i32> %a, <i32 3, i32 3>
+  ret <2 x i32> %tmp
+}
+
+define <16 x i8> @test_shl_v16i8(<16 x i8> %a) {
+; CHECK: test_shl_v16i8:
+; CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %tmp = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <16 x i8> %tmp
+}
+
+define <8 x i16> @test_shl_v8i16(<8 x i16> %a) {
+; CHECK: test_shl_v8i16:
+; CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %tmp = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_shl_v4i32(<4 x i32> %a) {
+; CHECK: test_shl_v4i32:
+; CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %tmp = shl <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_shl_v2i64(<2 x i64> %a) {
+; CHECK: test_shl_v2i64:
+; CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63
+  %tmp = shl <2 x i64> %a, <i64 63, i64 63>
+  ret <2 x i64> %tmp
+}
 
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
new file mode 100644
index 0000000..d5557c0
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
@@ -0,0 +1,2314 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define void @test_ldst1_v16i8(<16 x i8>* %ptr, <16 x i8>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v16i8:
+; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+  %tmp = load <16 x i8>* %ptr
+  store <16 x i8> %tmp, <16 x i8>* %ptr2
+  ret void
+}
+
+define void @test_ldst1_v8i16(<8 x i16>* %ptr, <8 x i16>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v8i16:
+; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+  %tmp = load <8 x i16>* %ptr
+  store <8 x i16> %tmp, <8 x i16>* %ptr2
+  ret void
+}
+
+define void @test_ldst1_v4i32(<4 x i32>* %ptr, <4 x i32>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v4i32:
+; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+  %tmp = load <4 x i32>* %ptr
+  store <4 x i32> %tmp, <4 x i32>* %ptr2
+  ret void
+}
+
+define void @test_ldst1_v2i64(<2 x i64>* %ptr, <2 x i64>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v2i64:
+; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+  %tmp = load <2 x i64>* %ptr
+  store <2 x i64> %tmp, <2 x i64>* %ptr2
+  ret void
+}
+
+define void @test_ldst1_v8i8(<8 x i8>* %ptr, <8 x i8>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v8i8:
+; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+  %tmp = load <8 x i8>* %ptr
+  store <8 x i8> %tmp, <8 x i8>* %ptr2
+  ret void
+}
+
+define void @test_ldst1_v4i16(<4 x i16>* %ptr, <4 x i16>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v4i16:
+; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+  %tmp = load <4 x i16>* %ptr
+  store <4 x i16> %tmp, <4 x i16>* %ptr2
+  ret void
+}
+
+define void @test_ldst1_v2i32(<2 x i32>* %ptr, <2 x i32>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v2i32:
+; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+  %tmp = load <2 x i32>* %ptr
+  store <2 x i32> %tmp, <2 x i32>* %ptr2
+  ret void
+}
+
+define void @test_ldst1_v1i64(<1 x i64>* %ptr, <1 x i64>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v1i64:
+; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+  %tmp = load <1 x i64>* %ptr
+  store <1 x i64> %tmp, <1 x i64>* %ptr2
+  ret void
+}
+
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int64x2x2_t = type { [2 x <2 x i64>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.float64x2x2_t = type { [2 x <2 x double>] }
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.int64x1x2_t = type { [2 x <1 x i64>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.float64x1x2_t = type { [2 x <1 x double>] }
+%struct.int8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int16x8x3_t = type { [3 x <8 x i16>] }
+%struct.int32x4x3_t = type { [3 x <4 x i32>] }
+%struct.int64x2x3_t = type { [3 x <2 x i64>] }
+%struct.float32x4x3_t = type { [3 x <4 x float>] }
+%struct.float64x2x3_t = type { [3 x <2 x double>] }
+%struct.int8x8x3_t = type { [3 x <8 x i8>] }
+%struct.int16x4x3_t = type { [3 x <4 x i16>] }
+%struct.int32x2x3_t = type { [3 x <2 x i32>] }
+%struct.int64x1x3_t = type { [3 x <1 x i64>] }
+%struct.float32x2x3_t = type { [3 x <2 x float>] }
+%struct.float64x1x3_t = type { [3 x <1 x double>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+%struct.int16x8x4_t = type { [4 x <8 x i16>] }
+%struct.int32x4x4_t = type { [4 x <4 x i32>] }
+%struct.int64x2x4_t = type { [4 x <2 x i64>] }
+%struct.float32x4x4_t = type { [4 x <4 x float>] }
+%struct.float64x2x4_t = type { [4 x <2 x double>] }
+%struct.int8x8x4_t = type { [4 x <8 x i8>] }
+%struct.int16x4x4_t = type { [4 x <4 x i16>] }
+%struct.int32x2x4_t = type { [4 x <2 x i32>] }
+%struct.int64x1x4_t = type { [4 x <1 x i64>] }
+%struct.float32x2x4_t = type { [4 x <2 x float>] }
+%struct.float64x1x4_t = type { [4 x <1 x double>] }
+
+
+define <16 x i8> @test_vld1q_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld1q_s8
+; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+  %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1)
+  ret <16 x i8> %vld1
+}
+
+define <8 x i16> @test_vld1q_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld1q_s16
+; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2)
+  ret <8 x i16> %vld1
+}
+
+define <4 x i32> @test_vld1q_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld1q_s32
+; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4)
+  ret <4 x i32> %vld1
+}
+
+define <2 x i64> @test_vld1q_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld1q_s64
+; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8)
+  ret <2 x i64> %vld1
+}
+
+define <4 x float> @test_vld1q_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld1q_f32
+; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4)
+  ret <4 x float> %vld1
+}
+
+define <2 x double> @test_vld1q_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld1q_f64
+; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %vld1 = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %1, i32 8)
+  ret <2 x double> %vld1
+}
+
+define <8 x i8> @test_vld1_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld1_s8
+; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+  %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
+  ret <8 x i8> %vld1
+}
+
+define <4 x i16> @test_vld1_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld1_s16
+; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
+  ret <4 x i16> %vld1
+}
+
+define <2 x i32> @test_vld1_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld1_s32
+; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4)
+  ret <2 x i32> %vld1
+}
+
+define <1 x i64> @test_vld1_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld1_s64
+; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8)
+  ret <1 x i64> %vld1
+}
+
+define <2 x float> @test_vld1_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld1_f32
+; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %1, i32 4)
+  ret <2 x float> %vld1
+}
+
+define <1 x double> @test_vld1_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld1_f64
+; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8)
+  ret <1 x double> %vld1
+}
+
+define <8 x i8> @test_vld1_p8(i8* readonly %a) {
+; CHECK-LABEL: test_vld1_p8
+; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+  %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
+  ret <8 x i8> %vld1
+}
+
+define <4 x i16> @test_vld1_p16(i16* readonly %a) {
+; CHECK-LABEL: test_vld1_p16
+; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
+  ret <4 x i16> %vld1
+}
+
+define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld2q_s8
+; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+  %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1)
+  %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1
+  ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld2q_s16
+; CHECK: ld2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2)
+  %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1
+  ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld2q_s32
+; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4)
+  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1
+  ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld2q_s64
+; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8)
+  %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1
+  ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld2q_f32
+; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
+  %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1
+  ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld2q_f64
+; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8)
+  %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1
+  ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld2_s8
+; CHECK: ld2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+  %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1)
+  %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1
+  ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld2_s16
+; CHECK: ld2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2)
+  %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1
+  ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld2_s32
+; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4)
+  %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1
+  ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld2_s64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, i32 8)
+  %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1
+  ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld2_f32
+; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4)
+  %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1
+  ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld2_f64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8)
+  %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1
+  %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1
+  ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld3q_s8
+; CHECK: ld3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+  %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1)
+  %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2
+  ret %struct.int8x16x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld3q_s16
+; CHECK: ld3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2)
+  %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2
+  ret %struct.int16x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld3q_s32
+; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4)
+  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2
+  ret %struct.int32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld3q_s64
+; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8)
+  %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2
+  ret %struct.int64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld3q_f32
+; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4)
+  %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2
+  ret %struct.float32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld3q_f64
+; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8)
+  %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2
+  ret %struct.float64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld3_s8
+; CHECK: ld3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+  %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1)
+  %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2
+  ret %struct.int8x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld3_s16
+; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2)
+  %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2
+  ret %struct.int16x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld3_s32
+; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4)
+  %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2
+  ret %struct.int32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld3_s64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8)
+  %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2
+  ret %struct.int64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld3_f32
+; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4)
+  %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2
+  ret %struct.float32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld3_f64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8)
+  %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2
+  %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2
+  ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld4q_s8
+; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+  %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1)
+  %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3
+  ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld4q_s16
+; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2)
+  %vld4.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3
+  ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld4q_s32
+; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4)
+  %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3
+  ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld4q_s64
+; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8)
+  %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3
+  ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld4q_f32
+; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
+  %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3
+  ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld4q_f64
+; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8)
+  %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3
+  ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld4_s8
+; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+  %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1)
+  %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3
+  ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld4_s16
+; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2)
+  %vld4.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3
+  ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld4_s32
+; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4)
+  %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3
+  ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld4_s64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8)
+  %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3
+  ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld4_f32
+; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8* %1, i32 4)
+  %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3
+  ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld4_f64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8)
+  %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3
+  %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3
+  ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32)
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32)
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32)
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32)
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32)
+declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32)
+declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32)
+declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
+declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
+declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32)
+declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32)
+declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
+
+define void @test_vst1q_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vst1q_s8
+; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+  tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1)
+  ret void
+}
+
+define void @test_vst1q_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vst1q_s16
+; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2)
+  ret void
+}
+
+define void @test_vst1q_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vst1q_s32
+; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4)
+  ret void
+}
+
+define void @test_vst1q_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vst1q_s64
+; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8)
+  ret void
+}
+
+define void @test_vst1q_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vst1q_f32
+; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4)
+  ret void
+}
+
+define void @test_vst1q_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vst1q_f64
+; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8)
+  ret void
+}
+
+define void @test_vst1_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vst1_s8
+; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+  tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1)
+  ret void
+}
+
+define void @test_vst1_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vst1_s16
+; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2)
+  ret void
+}
+
+define void @test_vst1_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vst1_s32
+; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4)
+  ret void
+}
+
+define void @test_vst1_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vst1_s64
+; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8)
+  ret void
+}
+
+define void @test_vst1_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vst1_f32
+; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4)
+  ret void
+}
+
+define void @test_vst1_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vst1_f64
+; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8)
+  ret void
+}
+
+define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_s8
+; CHECK: st2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1)
+  ret void
+}
+
+define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_s16
+; CHECK: st2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+  %1 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2)
+  ret void
+}
+
+define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_s32
+; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+  %1 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4)
+  ret void
+}
+
+define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_s64
+; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+  %1 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 8)
+  ret void
+}
+
+define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_f32
+; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+  %1 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4)
+  ret void
+}
+
+define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_f64
+; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+  %1 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8)
+  ret void
+}
+
+define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2_s8
+; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+  tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1)
+  ret void
+}
+
+define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2_s16
+; CHECK: st2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+  %1 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2)
+  ret void
+}
+
+define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2_s32
+; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+  %1 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4)
+  ret void
+}
+
+define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2_s64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+  %1 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8)
+  ret void
+}
+
+define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2_f32
+; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+  %1 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4)
+  ret void
+}
+
+define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2_f64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+  %1 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8)
+  ret void
+}
+
+define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_s8
+; CHECK: st3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1)
+  ret void
+}
+
+define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_s16
+; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
+  %1 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2)
+  ret void
+}
+
+define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_s32
+; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
+  %1 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4)
+  ret void
+}
+
+define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_s64
+; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
+  %1 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8)
+  ret void
+}
+
+define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_f32
+; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
+  %1 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4)
+  ret void
+}
+
+define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_f64
+; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
+  %1 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8)
+  ret void
+}
+
+define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3_s8
+; CHECK: st3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
+  tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1)
+  ret void
+}
+
+define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3_s16
+; CHECK: st3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
+  %1 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2)
+  ret void
+}
+
+define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3_s32
+; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
+  %1 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4)
+  ret void
+}
+
+define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3_s64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
+  %1 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8)
+  ret void
+}
+
+define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3_f32
+; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
+  %1 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4)
+  ret void
+}
+
+define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3_f64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
+  %1 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8)
+  ret void
+}
+
+define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_s8
+; CHECK: st4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1)
+  ret void
+}
+
+define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_s16
+; CHECK: st4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+  %1 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2)
+  ret void
+}
+
+define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_s32
+; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+  %1 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4)
+  ret void
+}
+
+define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_s64
+; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+  %1 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8)
+  ret void
+}
+
+define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_f32
+; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+  %1 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4)
+  ret void
+}
+
+define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_f64
+; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+  %1 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8)
+  ret void
+}
+
+define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4_s8
+; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+  tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1)
+  ret void
+}
+
+define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4_s16
+; CHECK: st4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+  %1 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2)
+  ret void
+}
+
+define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4_s32
+; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+  %1 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4)
+  ret void
+}
+
+define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4_s64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+  %1 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8)
+  ret void
+}
+
+define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4_f32
+; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+  %1 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4)
+  ret void
+}
+
+define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4_f64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+  %1 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8)
+  ret void
+}
+
+declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32)
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32)
+declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32)
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32)
+declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32)
+declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32)
+declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32)
+declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32)
+declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32)
+declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
+declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32)
+declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32)
+declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32)
+declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32)
+declare void @llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32)
+declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
+declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
+declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
+declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
+declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)
+
+define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a)  {
+; CHECK-LABEL: test_vld1q_s8_x2
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+  %1 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1)
+  %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
+  %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
+  %4 = insertvalue %struct.int8x16x2_t undef, <16 x i8> %2, 0, 0
+  %5 = insertvalue %struct.int8x16x2_t %4, <16 x i8> %3, 0, 1
+  ret %struct.int8x16x2_t %5
+}
+
+define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a)  {
+; CHECK-LABEL: test_vld1q_s16_x2
+; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2)
+  %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0
+  %4 = extractvalue { <8 x i16>, <8 x i16> } %2, 1
+  %5 = insertvalue %struct.int16x8x2_t undef, <8 x i16> %3, 0, 0
+  %6 = insertvalue %struct.int16x8x2_t %5, <8 x i16> %4, 0, 1
+  ret %struct.int16x8x2_t %6
+}
+
+define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a)  {
+; CHECK-LABEL: test_vld1q_s32_x2
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0
+  %4 = extractvalue { <4 x i32>, <4 x i32> } %2, 1
+  %5 = insertvalue %struct.int32x4x2_t undef, <4 x i32> %3, 0, 0
+  %6 = insertvalue %struct.int32x4x2_t %5, <4 x i32> %4, 0, 1
+  ret %struct.int32x4x2_t %6
+}
+
+define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a)  {
+; CHECK-LABEL: test_vld1q_s64_x2
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8* %1, i32 8)
+  %3 = extractvalue { <2 x i64>, <2 x i64> } %2, 0
+  %4 = extractvalue { <2 x i64>, <2 x i64> } %2, 1
+  %5 = insertvalue %struct.int64x2x2_t undef, <2 x i64> %3, 0, 0
+  %6 = insertvalue %struct.int64x2x2_t %5, <2 x i64> %4, 0, 1
+  ret %struct.int64x2x2_t %6
+}
+
+define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a)  {
+; CHECK-LABEL: test_vld1q_f32_x2
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x float>, <4 x float> } %2, 0
+  %4 = extractvalue { <4 x float>, <4 x float> } %2, 1
+  %5 = insertvalue %struct.float32x4x2_t undef, <4 x float> %3, 0, 0
+  %6 = insertvalue %struct.float32x4x2_t %5, <4 x float> %4, 0, 1
+  ret %struct.float32x4x2_t %6
+}
+
+
+define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a)  {
+; CHECK-LABEL: test_vld1q_f64_x2
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8* %1, i32 8)
+  %3 = extractvalue { <2 x double>, <2 x double> } %2, 0
+  %4 = extractvalue { <2 x double>, <2 x double> } %2, 1
+  %5 = insertvalue %struct.float64x2x2_t undef, <2 x double> %3, 0, 0
+  %6 = insertvalue %struct.float64x2x2_t %5, <2 x double> %4, 0, 1
+  ret %struct.float64x2x2_t %6
+}
+
+define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a)  {
+; CHECK-LABEL: test_vld1_s8_x2
+; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+  %1 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8* %a, i32 1)
+  %2 = extractvalue { <8 x i8>, <8 x i8> } %1, 0
+  %3 = extractvalue { <8 x i8>, <8 x i8> } %1, 1
+  %4 = insertvalue %struct.int8x8x2_t undef, <8 x i8> %2, 0, 0
+  %5 = insertvalue %struct.int8x8x2_t %4, <8 x i8> %3, 0, 1
+  ret %struct.int8x8x2_t %5
+}
+
+define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a)  {
+; CHECK-LABEL: test_vld1_s16_x2
+; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8* %1, i32 2)
+  %3 = extractvalue { <4 x i16>, <4 x i16> } %2, 0
+  %4 = extractvalue { <4 x i16>, <4 x i16> } %2, 1
+  %5 = insertvalue %struct.int16x4x2_t undef, <4 x i16> %3, 0, 0
+  %6 = insertvalue %struct.int16x4x2_t %5, <4 x i16> %4, 0, 1
+  ret %struct.int16x4x2_t %6
+}
+
+define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a)  {
+; CHECK-LABEL: test_vld1_s32_x2
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8* %1, i32 4)
+  %3 = extractvalue { <2 x i32>, <2 x i32> } %2, 0
+  %4 = extractvalue { <2 x i32>, <2 x i32> } %2, 1
+  %5 = insertvalue %struct.int32x2x2_t undef, <2 x i32> %3, 0, 0
+  %6 = insertvalue %struct.int32x2x2_t %5, <2 x i32> %4, 0, 1
+  ret %struct.int32x2x2_t %6
+}
+
+define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a)  {
+; CHECK-LABEL: test_vld1_s64_x2
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8* %1, i32 8)
+  %3 = extractvalue { <1 x i64>, <1 x i64> } %2, 0
+  %4 = extractvalue { <1 x i64>, <1 x i64> } %2, 1
+  %5 = insertvalue %struct.int64x1x2_t undef, <1 x i64> %3, 0, 0
+  %6 = insertvalue %struct.int64x1x2_t %5, <1 x i64> %4, 0, 1
+  ret %struct.int64x1x2_t %6
+}
+
+define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a)  {
+; CHECK-LABEL: test_vld1_f32_x2
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8* %1, i32 4)
+  %3 = extractvalue { <2 x float>, <2 x float> } %2, 0
+  %4 = extractvalue { <2 x float>, <2 x float> } %2, 1
+  %5 = insertvalue %struct.float32x2x2_t undef, <2 x float> %3, 0, 0
+  %6 = insertvalue %struct.float32x2x2_t %5, <2 x float> %4, 0, 1
+  ret %struct.float32x2x2_t %6
+}
+
+define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a)  {
+; CHECK-LABEL: test_vld1_f64_x2
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8* %1, i32 8)
+  %3 = extractvalue { <1 x double>, <1 x double> } %2, 0
+  %4 = extractvalue { <1 x double>, <1 x double> } %2, 1
+  %5 = insertvalue %struct.float64x1x2_t undef, <1 x double> %3, 0, 0
+  %6 = insertvalue %struct.float64x1x2_t %5, <1 x double> %4, 0, 1
+  ret %struct.float64x1x2_t %6
+}
+
+define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a)  {
+; CHECK-LABEL: test_vld1q_s8_x3
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b},
+; [{{x[0-9]+|sp}}]
+  %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8* %a, i32 1)
+  %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
+  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
+  %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
+  %5 = insertvalue %struct.int8x16x3_t undef, <16 x i8> %2, 0, 0
+  %6 = insertvalue %struct.int8x16x3_t %5, <16 x i8> %3, 0, 1
+  %7 = insertvalue %struct.int8x16x3_t %6, <16 x i8> %4, 0, 2
+  ret %struct.int8x16x3_t %7
+}
+
+define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a)  {
+; CHECK-LABEL: test_vld1q_s16_x3
+; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h},
+; [{{x[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2)
+  %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
+  %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 1
+  %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 2
+  %6 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %3, 0, 0
+  %7 = insertvalue %struct.int16x8x3_t %6, <8 x i16> %4, 0, 1
+  %8 = insertvalue %struct.int16x8x3_t %7, <8 x i16> %5, 0, 2
+  ret %struct.int16x8x3_t %8
+}
+
+define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a)  {
+; CHECK-LABEL: test_vld1q_s32_x3
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s},
+; [{{x[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 0
+  %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 1
+  %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 2
+  %6 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %3, 0, 0
+  %7 = insertvalue %struct.int32x4x3_t %6, <4 x i32> %4, 0, 1
+  %8 = insertvalue %struct.int32x4x3_t %7, <4 x i32> %5, 0, 2
+  ret %struct.int32x4x3_t %8
+}
+
+define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a)  {
+; CHECK-LABEL: test_vld1q_s64_x3
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d},
+; [{{x[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8)
+  %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
+  %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 1
+  %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 2
+  %6 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %3, 0, 0
+  %7 = insertvalue %struct.int64x2x3_t %6, <2 x i64> %4, 0, 1
+  %8 = insertvalue %struct.int64x2x3_t %7, <2 x i64> %5, 0, 2
+  ret %struct.int64x2x3_t %8
+}
+
+define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a)  {
+; CHECK-LABEL: test_vld1q_f32_x3
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s},
+; [{{x[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %2 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 0
+  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 1
+  %5 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 2
+  %6 = insertvalue %struct.float32x4x3_t undef, <4 x float> %3, 0, 0
+  %7 = insertvalue %struct.float32x4x3_t %6, <4 x float> %4, 0, 1
+  %8 = insertvalue %struct.float32x4x3_t %7, <4 x float> %5, 0, 2
+  ret %struct.float32x4x3_t %8
+}
+
+
+define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a)  {
+; CHECK-LABEL: test_vld1q_f64_x3
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d},
+; [{{x[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %2 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8* %1, i32 8)
+  %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 0
+  %4 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 1
+  %5 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 2
+  %6 = insertvalue %struct.float64x2x3_t undef, <2 x double> %3, 0, 0
+  %7 = insertvalue %struct.float64x2x3_t %6, <2 x double> %4, 0, 1
+  %8 = insertvalue %struct.float64x2x3_t %7, <2 x double> %5, 0, 2
+  ret %struct.float64x2x3_t %8
+}
+
+define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a)  {
+; CHECK-LABEL: test_vld1_s8_x3
+; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b},
+; [{{x[0-9]+|sp}}]
+  %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8* %a, i32 1)
+  %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
+  %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
+  %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
+  %5 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %2, 0, 0
+  %6 = insertvalue %struct.int8x8x3_t %5, <8 x i8> %3, 0, 1
+  %7 = insertvalue %struct.int8x8x3_t %6, <8 x i8> %4, 0, 2
+  ret %struct.int8x8x3_t %7
+}
+
+define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a)  {
+; CHECK-LABEL: test_vld1_s16_x3
+; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h},
+; [{{x[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8* %1, i32 2)
+  %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
+  %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
+  %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
+  %6 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %3, 0, 0
+  %7 = insertvalue %struct.int16x4x3_t %6, <4 x i16> %4, 0, 1
+  %8 = insertvalue %struct.int16x4x3_t %7, <4 x i16> %5, 0, 2
+  ret %struct.int16x4x3_t %8
+}
+
+define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a)  {
+  %1 = bitcast i32* %a to i8*
+; CHECK-LABEL: test_vld1_s32_x3
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s},
+; [{{x[0-9]+|sp}}]
+  %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8* %1, i32 4)
+  %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
+  %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
+  %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
+  %6 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %3, 0, 0
+  %7 = insertvalue %struct.int32x2x3_t %6, <2 x i32> %4, 0, 1
+  %8 = insertvalue %struct.int32x2x3_t %7, <2 x i32> %5, 0, 2
+  ret %struct.int32x2x3_t %8
+}
+
+define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a)  {
+; CHECK-LABEL: test_vld1_s64_x3
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d},
+; [{{x[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8* %1, i32 8)
+  %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 0
+  %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 1
+  %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 2
+  %6 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %3, 0, 0
+  %7 = insertvalue %struct.int64x1x3_t %6, <1 x i64> %4, 0, 1
+  %8 = insertvalue %struct.int64x1x3_t %7, <1 x i64> %5, 0, 2
+  ret %struct.int64x1x3_t %8
+}
+
+define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a)  {
+; CHECK-LABEL: test_vld1_f32_x3
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s},
+; [{{x[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %2 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8* %1, i32 4)
+  %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 0
+  %4 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 1
+  %5 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 2
+  %6 = insertvalue %struct.float32x2x3_t undef, <2 x float> %3, 0, 0
+  %7 = insertvalue %struct.float32x2x3_t %6, <2 x float> %4, 0, 1
+  %8 = insertvalue %struct.float32x2x3_t %7, <2 x float> %5, 0, 2
+  ret %struct.float32x2x3_t %8
+}
+
+
+define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a)  {
+; CHECK-LABEL: test_vld1_f64_x3
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d},
+; [{{x[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %2 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8* %1, i32 8)
+  %3 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 0
+  %4 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 1
+  %5 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 2
+  %6 = insertvalue %struct.float64x1x3_t undef, <1 x double> %3, 0, 0
+  %7 = insertvalue %struct.float64x1x3_t %6, <1 x double> %4, 0, 1
+  %8 = insertvalue %struct.float64x1x3_t %7, <1 x double> %5, 0, 2
+  ret %struct.float64x1x3_t %8
+}
+
+define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a)  {
+; CHECK-LABEL: test_vld1q_s8_x4
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b,
+; v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+  %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8* %a, i32 1)
+  %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
+  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
+  %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
+  %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3
+  %6 = insertvalue %struct.int8x16x4_t undef, <16 x i8> %2, 0, 0
+  %7 = insertvalue %struct.int8x16x4_t %6, <16 x i8> %3, 0, 1
+  %8 = insertvalue %struct.int8x16x4_t %7, <16 x i8> %4, 0, 2
+  %9 = insertvalue %struct.int8x16x4_t %8, <16 x i8> %5, 0, 3
+  ret %struct.int8x16x4_t %9
+}
+
+define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a)  {
+; CHECK-LABEL: test_vld1q_s16_x4
+; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h,
+; v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8* %1, i32 2)
+  %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
+  %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 1
+  %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 2
+  %6 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 3
+  %7 = insertvalue %struct.int16x8x4_t undef, <8 x i16> %3, 0, 0
+  %8 = insertvalue %struct.int16x8x4_t %7, <8 x i16> %4, 0, 1
+  %9 = insertvalue %struct.int16x8x4_t %8, <8 x i16> %5, 0, 2
+  %10 = insertvalue %struct.int16x8x4_t %9, <8 x i16> %6, 0, 3
+  ret %struct.int16x8x4_t %10
+}
+
+define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a)  {
+; CHECK-LABEL: test_vld1q_s32_x4
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s,
+; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 0
+  %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 1
+  %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 2
+  %6 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 3
+  %7 = insertvalue %struct.int32x4x4_t undef, <4 x i32> %3, 0, 0
+  %8 = insertvalue %struct.int32x4x4_t %7, <4 x i32> %4, 0, 1
+  %9 = insertvalue %struct.int32x4x4_t %8, <4 x i32> %5, 0, 2
+  %10 = insertvalue %struct.int32x4x4_t %9, <4 x i32> %6, 0, 3
+  ret %struct.int32x4x4_t %10
+}
+
+define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a)  {
+; CHECK-LABEL: test_vld1q_s64_x4
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d,
+; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8* %1, i32 8)
+  %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
+  %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 1
+  %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 2
+  %6 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 3
+  %7 = insertvalue %struct.int64x2x4_t undef, <2 x i64> %3, 0, 0
+  %8 = insertvalue %struct.int64x2x4_t %7, <2 x i64> %4, 0, 1
+  %9 = insertvalue %struct.int64x2x4_t %8, <2 x i64> %5, 0, 2
+  %10 = insertvalue %struct.int64x2x4_t %9, <2 x i64> %6, 0, 3
+  ret %struct.int64x2x4_t %10
+}
+
+define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a)  {
+; CHECK-LABEL: test_vld1q_f32_x4
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s,
+; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
+  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
+  %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2
+  %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
+  %7 = insertvalue %struct.float32x4x4_t undef, <4 x float> %3, 0, 0
+  %8 = insertvalue %struct.float32x4x4_t %7, <4 x float> %4, 0, 1
+  %9 = insertvalue %struct.float32x4x4_t %8, <4 x float> %5, 0, 2
+  %10 = insertvalue %struct.float32x4x4_t %9, <4 x float> %6, 0, 3
+  ret %struct.float32x4x4_t %10
+}
+
+define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a)  {
+; CHECK-LABEL: test_vld1q_f64_x4
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d,
+; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8* %1, i32 8)
+  %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0
+  %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1
+  %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2
+  %6 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3
+  %7 = insertvalue %struct.float64x2x4_t undef, <2 x double> %3, 0, 0
+  %8 = insertvalue %struct.float64x2x4_t %7, <2 x double> %4, 0, 1
+  %9 = insertvalue %struct.float64x2x4_t %8, <2 x double> %5, 0, 2
+  %10 = insertvalue %struct.float64x2x4_t %9, <2 x double> %6, 0, 3
+  ret %struct.float64x2x4_t %10
+}
+
+define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a)  {
+; CHECK-LABEL: test_vld1_s8_x4
+; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b,
+; v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+  %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1)
+  %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
+  %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
+  %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
+  %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3
+  %6 = insertvalue %struct.int8x8x4_t undef, <8 x i8> %2, 0, 0
+  %7 = insertvalue %struct.int8x8x4_t %6, <8 x i8> %3, 0, 1
+  %8 = insertvalue %struct.int8x8x4_t %7, <8 x i8> %4, 0, 2
+  %9 = insertvalue %struct.int8x8x4_t %8, <8 x i8> %5, 0, 3
+  ret %struct.int8x8x4_t %9
+}
+
+define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a)  {
+; CHECK-LABEL: test_vld1_s16_x4
+; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h,
+; v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i16* %a to i8*
+  %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8* %1, i32 2)
+  %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
+  %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
+  %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
+  %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 3
+  %7 = insertvalue %struct.int16x4x4_t undef, <4 x i16> %3, 0, 0
+  %8 = insertvalue %struct.int16x4x4_t %7, <4 x i16> %4, 0, 1
+  %9 = insertvalue %struct.int16x4x4_t %8, <4 x i16> %5, 0, 2
+  %10 = insertvalue %struct.int16x4x4_t %9, <4 x i16> %6, 0, 3
+  ret %struct.int16x4x4_t %10
+}
+
+define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a)  {
+; CHECK-LABEL: test_vld1_s32_x4
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s,
+; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i32* %a to i8*
+  %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8* %1, i32 4)
+  %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
+  %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
+  %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
+  %6 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3
+  %7 = insertvalue %struct.int32x2x4_t undef, <2 x i32> %3, 0, 0
+  %8 = insertvalue %struct.int32x2x4_t %7, <2 x i32> %4, 0, 1
+  %9 = insertvalue %struct.int32x2x4_t %8, <2 x i32> %5, 0, 2
+  %10 = insertvalue %struct.int32x2x4_t %9, <2 x i32> %6, 0, 3
+  ret %struct.int32x2x4_t %10
+}
+
+define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a)  {
+; CHECK-LABEL: test_vld1_s64_x4
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d,
+; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast i64* %a to i8*
+  %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8* %1, i32 8)
+  %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 0
+  %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 1
+  %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 2
+  %6 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 3
+  %7 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %3, 0, 0
+  %8 = insertvalue %struct.int64x1x4_t %7, <1 x i64> %4, 0, 1
+  %9 = insertvalue %struct.int64x1x4_t %8, <1 x i64> %5, 0, 2
+  %10 = insertvalue %struct.int64x1x4_t %9, <1 x i64> %6, 0, 3
+  ret %struct.int64x1x4_t %10
+}
+
+define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a)  {
+; CHECK-LABEL: test_vld1_f32_x4
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s,
+; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %1 = bitcast float* %a to i8*
+  %2 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8* %1, i32 4)
+  %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 0
+  %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 1
+  %5 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 2
+  %6 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 3
+  %7 = insertvalue %struct.float32x2x4_t undef, <2 x float> %3, 0, 0
+  %8 = insertvalue %struct.float32x2x4_t %7, <2 x float> %4, 0, 1
+  %9 = insertvalue %struct.float32x2x4_t %8, <2 x float> %5, 0, 2
+  %10 = insertvalue %struct.float32x2x4_t %9, <2 x float> %6, 0, 3
+  ret %struct.float32x2x4_t %10
+}
+
+
+define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a)  {
+; CHECK-LABEL: test_vld1_f64_x4
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d,
+; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %1 = bitcast double* %a to i8*
+  %2 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8* %1, i32 8)
+  %3 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 0
+  %4 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 1
+  %5 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 2
+  %6 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 3
+  %7 = insertvalue %struct.float64x1x4_t undef, <1 x double> %3, 0, 0
+  %8 = insertvalue %struct.float64x1x4_t %7, <1 x double> %4, 0, 1
+  %9 = insertvalue %struct.float64x1x4_t %8, <1 x double> %5, 0, 2
+  %10 = insertvalue %struct.float64x1x4_t %9, <1 x double> %6, 0, 3
+  ret %struct.float64x1x4_t %10
+}
+
+define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b)  {
+; CHECK-LABEL: test_vst1q_s8_x2
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <16 x i8>] %b, 0
+  %2 = extractvalue [2 x <16 x i8>] %b, 1
+  tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1)
+  ret void
+}
+
+define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b)  {
+; CHECK-LABEL: test_vst1q_s16_x2
+; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <8 x i16>] %b, 0
+  %2 = extractvalue [2 x <8 x i16>] %b, 1
+  %3 = bitcast i16* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2)
+  ret void
+}
+
+define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b)  {
+; CHECK-LABEL: test_vst1q_s32_x2
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <4 x i32>] %b, 0
+  %2 = extractvalue [2 x <4 x i32>] %b, 1
+  %3 = bitcast i32* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v4i32(i8* %3, <4 x i32> %1, <4 x i32> %2, i32 4)
+  ret void
+}
+
+define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b)  {
+; CHECK-LABEL: test_vst1q_s64_x2
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <2 x i64>] %b, 0
+  %2 = extractvalue [2 x <2 x i64>] %b, 1
+  %3 = bitcast i64* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v2i64(i8* %3, <2 x i64> %1, <2 x i64> %2, i32 8)
+  ret void
+}
+
+define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b)  {
+; CHECK-LABEL: test_vst1q_f32_x2
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <4 x float>] %b, 0
+  %2 = extractvalue [2 x <4 x float>] %b, 1
+  %3 = bitcast float* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v4f32(i8* %3, <4 x float> %1, <4 x float> %2, i32 4)
+  ret void
+}
+
+
+define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b)  {
+; CHECK-LABEL: test_vst1q_f64_x2
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <2 x double>] %b, 0
+  %2 = extractvalue [2 x <2 x double>] %b, 1
+  %3 = bitcast double* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v2f64(i8* %3, <2 x double> %1, <2 x double> %2, i32 8)
+  ret void
+}
+
+define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b)  {
+; CHECK-LABEL: test_vst1_s8_x2
+; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <8 x i8>] %b, 0
+  %2 = extractvalue [2 x <8 x i8>] %b, 1
+  tail call void @llvm.aarch64.neon.vst1x2.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 1)
+  ret void
+}
+
+define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b)  {
+; CHECK-LABEL: test_vst1_s16_x2
+; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <4 x i16>] %b, 0
+  %2 = extractvalue [2 x <4 x i16>] %b, 1
+  %3 = bitcast i16* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v4i16(i8* %3, <4 x i16> %1, <4 x i16> %2, i32 2)
+  ret void
+}
+
+define void @test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b)  {
+; CHECK-LABEL: test_vst1_s32_x2
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <2 x i32>] %b, 0
+  %2 = extractvalue [2 x <2 x i32>] %b, 1
+  %3 = bitcast i32* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 4)
+  ret void
+}
+
+define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b)  {
+; CHECK-LABEL: test_vst1_s64_x2
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <1 x i64>] %b, 0
+  %2 = extractvalue [2 x <1 x i64>] %b, 1
+  %3 = bitcast i64* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v1i64(i8* %3, <1 x i64> %1, <1 x i64> %2, i32 8)
+  ret void
+}
+
+define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b)  {
+; CHECK-LABEL: test_vst1_f32_x2
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <2 x float>] %b, 0
+  %2 = extractvalue [2 x <2 x float>] %b, 1
+  %3 = bitcast float* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v2f32(i8* %3, <2 x float> %1, <2 x float> %2, i32 4)
+  ret void
+}
+
+define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b)  {
+; CHECK-LABEL: test_vst1_f64_x2
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [2 x <1 x double>] %b, 0
+  %2 = extractvalue [2 x <1 x double>] %b, 1
+  %3 = bitcast double* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v1f64(i8* %3, <1 x double> %1, <1 x double> %2, i32 8)
+  ret void
+}
+
+define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b)  {
+; CHECK-LABEL: test_vst1q_s8_x3
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <16 x i8>] %b, 0
+  %2 = extractvalue [3 x <16 x i8>] %b, 1
+  %3 = extractvalue [3 x <16 x i8>] %b, 2
+  tail call void @llvm.aarch64.neon.vst1x3.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, i32 1)
+  ret void
+}
+
+define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b)  {
+; CHECK-LABEL: test_vst1q_s16_x3
+; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <8 x i16>] %b, 0
+  %2 = extractvalue [3 x <8 x i16>] %b, 1
+  %3 = extractvalue [3 x <8 x i16>] %b, 2
+  %4 = bitcast i16* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v8i16(i8* %4, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, i32 2)
+  ret void
+}
+
+define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b)  {
+; CHECK-LABEL: test_vst1q_s32_x3
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <4 x i32>] %b, 0
+  %2 = extractvalue [3 x <4 x i32>] %b, 1
+  %3 = extractvalue [3 x <4 x i32>] %b, 2
+  %4 = bitcast i32* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v4i32(i8* %4, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, i32 4)
+  ret void
+}
+
+define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b)  {
+; CHECK-LABEL: test_vst1q_s64_x3
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <2 x i64>] %b, 0
+  %2 = extractvalue [3 x <2 x i64>] %b, 1
+  %3 = extractvalue [3 x <2 x i64>] %b, 2
+  %4 = bitcast i64* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v2i64(i8* %4, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, i32 8)
+  ret void
+}
+
+define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b)  {
+; CHECK-LABEL: test_vst1q_f32_x3
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <4 x float>] %b, 0
+  %2 = extractvalue [3 x <4 x float>] %b, 1
+  %3 = extractvalue [3 x <4 x float>] %b, 2
+  %4 = bitcast float* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 4)
+  ret void
+}
+
+define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b)  {
+; CHECK-LABEL: test_vst1q_f64_x3
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <2 x double>] %b, 0
+  %2 = extractvalue [3 x <2 x double>] %b, 1
+  %3 = extractvalue [3 x <2 x double>] %b, 2
+  %4 = bitcast double* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v2f64(i8* %4, <2 x double> %1, <2 x double> %2, <2 x double> %3, i32 8)
+  ret void
+}
+
+define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b)  {
+; CHECK-LABEL: test_vst1_s8_x3
+; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <8 x i8>] %b, 0
+  %2 = extractvalue [3 x <8 x i8>] %b, 1
+  %3 = extractvalue [3 x <8 x i8>] %b, 2
+  tail call void @llvm.aarch64.neon.vst1x3.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 1)
+  ret void
+}
+
+define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b)  {
+; CHECK-LABEL: test_vst1_s16_x3
+; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <4 x i16>] %b, 0
+  %2 = extractvalue [3 x <4 x i16>] %b, 1
+  %3 = extractvalue [3 x <4 x i16>] %b, 2
+  %4 = bitcast i16* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 2)
+  ret void
+}
+
+define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b)  {
+; CHECK-LABEL: test_vst1_s32_x3
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <2 x i32>] %b, 0
+  %2 = extractvalue [3 x <2 x i32>] %b, 1
+  %3 = extractvalue [3 x <2 x i32>] %b, 2
+  %4 = bitcast i32* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
+  ret void
+}
+
+define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b)  {
+; CHECK-LABEL: test_vst1_s64_x3
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <1 x i64>] %b, 0
+  %2 = extractvalue [3 x <1 x i64>] %b, 1
+  %3 = extractvalue [3 x <1 x i64>] %b, 2
+  %4 = bitcast i64* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
+  ret void
+}
+
+define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b)  {
+; CHECK-LABEL: test_vst1_f32_x3
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <2 x float>] %b, 0
+  %2 = extractvalue [3 x <2 x float>] %b, 1
+  %3 = extractvalue [3 x <2 x float>] %b, 2
+  %4 = bitcast float* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 4)
+  ret void
+}
+
+define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b)  {
+; CHECK-LABEL: test_vst1_f64_x3
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d},
+; [{{x[0-9]+|sp}}]
+  %1 = extractvalue [3 x <1 x double>] %b, 0
+  %2 = extractvalue [3 x <1 x double>] %b, 1
+  %3 = extractvalue [3 x <1 x double>] %b, 2
+  %4 = bitcast double* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v1f64(i8* %4, <1 x double> %1, <1 x double> %2, <1 x double> %3, i32 8)
+  ret void
+}
+
+define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b)  {
+; CHECK-LABEL: test_vst1q_s8_x4
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b,
+; v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <16 x i8>] %b, 0
+  %2 = extractvalue [4 x <16 x i8>] %b, 1
+  %3 = extractvalue [4 x <16 x i8>] %b, 2
+  %4 = extractvalue [4 x <16 x i8>] %b, 3
+  tail call void @llvm.aarch64.neon.vst1x4.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, <16 x i8> %4, i32 1)
+  ret void
+}
+
+define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b)  {
+; CHECK-LABEL: test_vst1q_s16_x4
+; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h,
+; v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <8 x i16>] %b, 0
+  %2 = extractvalue [4 x <8 x i16>] %b, 1
+  %3 = extractvalue [4 x <8 x i16>] %b, 2
+  %4 = extractvalue [4 x <8 x i16>] %b, 3
+  %5 = bitcast i16* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v8i16(i8* %5, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, <8 x i16> %4, i32 2)
+  ret void
+}
+
+define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b)  {
+; CHECK-LABEL: test_vst1q_s32_x4
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s,
+; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <4 x i32>] %b, 0
+  %2 = extractvalue [4 x <4 x i32>] %b, 1
+  %3 = extractvalue [4 x <4 x i32>] %b, 2
+  %4 = extractvalue [4 x <4 x i32>] %b, 3
+  %5 = bitcast i32* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v4i32(i8* %5, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, i32 4)
+  ret void
+}
+
+define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b)  {
+; CHECK-LABEL: test_vst1q_s64_x4
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d,
+; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <2 x i64>] %b, 0
+  %2 = extractvalue [4 x <2 x i64>] %b, 1
+  %3 = extractvalue [4 x <2 x i64>] %b, 2
+  %4 = extractvalue [4 x <2 x i64>] %b, 3
+  %5 = bitcast i64* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v2i64(i8* %5, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, i32 8)
+  ret void
+}
+
+define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b)  {
+; CHECK-LABEL: test_vst1q_f32_x4
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s,
+; v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <4 x float>] %b, 0
+  %2 = extractvalue [4 x <4 x float>] %b, 1
+  %3 = extractvalue [4 x <4 x float>] %b, 2
+  %4 = extractvalue [4 x <4 x float>] %b, 3
+  %5 = bitcast float* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4)
+  ret void
+}
+
+define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b)  {
+; CHECK-LABEL: test_vst1q_f64_x4
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d,
+; v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <2 x double>] %b, 0
+  %2 = extractvalue [4 x <2 x double>] %b, 1
+  %3 = extractvalue [4 x <2 x double>] %b, 2
+  %4 = extractvalue [4 x <2 x double>] %b, 3
+  %5 = bitcast double* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8)
+  ret void
+}
+
+define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b)  {
+; CHECK-LABEL: test_vst1_s8_x4
+; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b,
+; v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <8 x i8>] %b, 0
+  %2 = extractvalue [4 x <8 x i8>] %b, 1
+  %3 = extractvalue [4 x <8 x i8>] %b, 2
+  %4 = extractvalue [4 x <8 x i8>] %b, 3
+  tail call void @llvm.aarch64.neon.vst1x4.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %4, i32 1)
+  ret void
+}
+
+define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b)  {
+; CHECK-LABEL: test_vst1_s16_x4
+; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h,
+; v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <4 x i16>] %b, 0
+  %2 = extractvalue [4 x <4 x i16>] %b, 1
+  %3 = extractvalue [4 x <4 x i16>] %b, 2
+  %4 = extractvalue [4 x <4 x i16>] %b, 3
+  %5 = bitcast i16* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v4i16(i8* %5, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, <4 x i16> %4, i32 2)
+  ret void
+}
+
+define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b)  {
+; CHECK-LABEL: test_vst1_s32_x4
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s,
+; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <2 x i32>] %b, 0
+  %2 = extractvalue [4 x <2 x i32>] %b, 1
+  %3 = extractvalue [4 x <2 x i32>] %b, 2
+  %4 = extractvalue [4 x <2 x i32>] %b, 3
+  %5 = bitcast i32* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 4)
+  ret void
+}
+
+define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b)  {
+; CHECK-LABEL: test_vst1_s64_x4
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d,
+; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <1 x i64>] %b, 0
+  %2 = extractvalue [4 x <1 x i64>] %b, 1
+  %3 = extractvalue [4 x <1 x i64>] %b, 2
+  %4 = extractvalue [4 x <1 x i64>] %b, 3
+  %5 = bitcast i64* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v1i64(i8* %5, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, <1 x i64> %4, i32 8)
+  ret void
+}
+
+define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b)  {
+; CHECK-LABEL: test_vst1_f32_x4
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s,
+; v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <2 x float>] %b, 0
+  %2 = extractvalue [4 x <2 x float>] %b, 1
+  %3 = extractvalue [4 x <2 x float>] %b, 2
+  %4 = extractvalue [4 x <2 x float>] %b, 3
+  %5 = bitcast float* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 4)
+  ret void
+}
+
+define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b)  {
+; CHECK-LABEL: test_vst1_f64_x4
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d,
+; v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+  %1 = extractvalue [4 x <1 x double>] %b, 0
+  %2 = extractvalue [4 x <1 x double>] %b, 1
+  %3 = extractvalue [4 x <1 x double>] %b, 2
+  %4 = extractvalue [4 x <1 x double>] %b, 3
+  %5 = bitcast double* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v1f64(i8* %5, <1 x double> %1, <1 x double> %2, <1 x double> %3, <1 x double> %4, i32 8)
+  ret void
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8*, i32)
+declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v4f32(i8*, <4 x float>, <4 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2f64(i8*, <2 x double>, <2 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2f32(i8*, <2 x float>, <2 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v1f64(i8*, <1 x double>, <1 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
new file mode 100644
index 0000000..3f28320
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
@@ -0,0 +1,2113 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int64x2x2_t = type { [2 x <2 x i64>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.float64x2x2_t = type { [2 x <2 x double>] }
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.int64x1x2_t = type { [2 x <1 x i64>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.float64x1x2_t = type { [2 x <1 x double>] }
+%struct.int8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int16x8x3_t = type { [3 x <8 x i16>] }
+%struct.int32x4x3_t = type { [3 x <4 x i32>] }
+%struct.int64x2x3_t = type { [3 x <2 x i64>] }
+%struct.float32x4x3_t = type { [3 x <4 x float>] }
+%struct.float64x2x3_t = type { [3 x <2 x double>] }
+%struct.int8x8x3_t = type { [3 x <8 x i8>] }
+%struct.int16x4x3_t = type { [3 x <4 x i16>] }
+%struct.int32x2x3_t = type { [3 x <2 x i32>] }
+%struct.int64x1x3_t = type { [3 x <1 x i64>] }
+%struct.float32x2x3_t = type { [3 x <2 x float>] }
+%struct.float64x1x3_t = type { [3 x <1 x double>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+%struct.int16x8x4_t = type { [4 x <8 x i16>] }
+%struct.int32x4x4_t = type { [4 x <4 x i32>] }
+%struct.int64x2x4_t = type { [4 x <2 x i64>] }
+%struct.float32x4x4_t = type { [4 x <4 x float>] }
+%struct.float64x2x4_t = type { [4 x <2 x double>] }
+%struct.int8x8x4_t = type { [4 x <8 x i8>] }
+%struct.int16x4x4_t = type { [4 x <4 x i16>] }
+%struct.int32x2x4_t = type { [4 x <2 x i32>] }
+%struct.int64x1x4_t = type { [4 x <1 x i64>] }
+%struct.float32x2x4_t = type { [4 x <2 x float>] }
+%struct.float64x1x4_t = type { [4 x <1 x double>] }
+
+define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1q_dup_s8
+; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
+entry:
+  %0 = load i8* %a, align 1
+  %1 = insertelement <16 x i8> undef, i8 %0, i32 0
+  %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  ret <16 x i8> %lane
+}
+
+define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1q_dup_s16
+; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
+entry:
+  %0 = load i16* %a, align 2
+  %1 = insertelement <8 x i16> undef, i16 %0, i32 0
+  %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+  ret <8 x i16> %lane
+}
+
+define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1q_dup_s32
+; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = load i32* %a, align 4
+  %1 = insertelement <4 x i32> undef, i32 %0, i32 0
+  %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+  ret <4 x i32> %lane
+}
+
+define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1q_dup_s64
+; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = load i64* %a, align 8
+  %1 = insertelement <2 x i64> undef, i64 %0, i32 0
+  %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %lane
+}
+
+define <4 x float> @test_vld1q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1q_dup_f32
+; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = load float* %a, align 4
+  %1 = insertelement <4 x float> undef, float %0, i32 0
+  %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+  ret <4 x float> %lane
+}
+
+define <2 x double> @test_vld1q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1q_dup_f64
+; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = load double* %a, align 8
+  %1 = insertelement <2 x double> undef, double %0, i32 0
+  %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  ret <2 x double> %lane
+}
+
+define <8 x i8> @test_vld1_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1_dup_s8
+; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
+entry:
+  %0 = load i8* %a, align 1
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  ret <8 x i8> %lane
+}
+
+define <4 x i16> @test_vld1_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1_dup_s16
+; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
+entry:
+  %0 = load i16* %a, align 2
+  %1 = insertelement <4 x i16> undef, i16 %0, i32 0
+  %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+  ret <4 x i16> %lane
+}
+
+define <2 x i32> @test_vld1_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1_dup_s32
+; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = load i32* %a, align 4
+  %1 = insertelement <2 x i32> undef, i32 %0, i32 0
+  %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+  ret <2 x i32> %lane
+}
+
+define <1 x i64> @test_vld1_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1_dup_s64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = load i64* %a, align 8
+  %1 = insertelement <1 x i64> undef, i64 %0, i32 0
+  ret <1 x i64> %1
+}
+
+define <2 x float> @test_vld1_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1_dup_f32
+; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = load float* %a, align 4
+  %1 = insertelement <2 x float> undef, float %0, i32 0
+  %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+  ret <2 x float> %lane
+}
+
+define <1 x double> @test_vld1_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1_dup_f64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = load double* %a, align 8
+  %1 = insertelement <1 x double> undef, double %0, i32 0
+  ret <1 x double> %1
+}
+
+define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld2q_dup_s8
+; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+  %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
+  %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+  %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+  ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld2q_dup_s16
+; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
+  %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+  ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld2q_dup_s32
+; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
+  %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+  ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld2q_dup_s64
+; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
+  %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+  ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld2q_dup_f32
+; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
+  %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
+  %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+  ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld2q_dup_f64
+; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
+  %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
+  %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+  ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld2_dup_s8
+; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+  %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
+  %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+  %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+  ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld2_dup_s16
+; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
+  %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+  ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld2_dup_s32
+; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
+  %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+  ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld2_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
+  %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+  ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld2_dup_f32
+; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
+  %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
+  %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+  ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld2_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
+  %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+  ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld3q_dup_s8
+; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+  %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
+  %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+  %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
+  %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
+  ret %struct.int8x16x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld3q_dup_s16
+; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
+  %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+  %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
+  %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
+  ret %struct.int16x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld3q_dup_s32
+; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
+  %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
+  %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
+  ret %struct.int32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld3q_dup_s64
+; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
+  %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
+  %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
+  ret %struct.int64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld3q_dup_f32
+; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
+  %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
+  %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
+  %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
+  ret %struct.float32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld3q_dup_f64
+; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
+  %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
+  %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
+  %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
+  ret %struct.float64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld3_dup_s8
+; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+  %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
+  %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+  %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
+  %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
+  ret %struct.int8x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld3_dup_s16
+; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
+  %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
+  %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
+  ret %struct.int16x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld3_dup_s32
+; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
+  %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
+  %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
+  ret %struct.int32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld3_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
+  %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
+  %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
+  ret %struct.int64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld3_dup_f32
+; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
+  %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
+  %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
+  %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
+  ret %struct.float32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld3_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
+  %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
+  %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
+  ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld4q_dup_s8
+; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+  %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
+  %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+  %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
+  %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
+  %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
+  ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld4q_dup_s16
+; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
+  %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+  %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
+  %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+  %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
+  %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
+  ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld4q_dup_s32
+; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
+  %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
+  %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+  %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
+  %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
+  ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld4q_dup_s64
+; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
+  %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
+  %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
+  %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
+  ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld4q_dup_f32
+; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
+  %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
+  %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
+  %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
+  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
+  %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
+  ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld4q_dup_f64
+; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+  %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
+  %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
+  %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
+  %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
+  %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
+  ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld4_dup_s8
+; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+  %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
+  %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+  %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
+  %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
+  %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
+  %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
+  ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld4_dup_s16
+; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+  %0 = bitcast i16* %a to i8*
+  %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+  %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
+  %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+  %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
+  %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+  %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
+  %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+  %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
+  %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
+  ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld4_dup_s32
+; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast i32* %a to i8*
+  %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
+  %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
+  %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
+  %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
+  %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
+  ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld4_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast i64* %a to i8*
+  %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
+  %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
+  %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
+  %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
+  ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld4_dup_f32
+; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+  %0 = bitcast float* %a to i8*
+  %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+  %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
+  %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+  %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
+  %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+  %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
+  %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
+  %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
+  %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
+  %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
+  ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld4_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = bitcast double* %a to i8*
+  %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
+  %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
+  %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
+  %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
+  %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
+  %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
+  ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
+define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vld1q_lane_s8
+; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i8* %a, align 1
+  %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
+  ret <16 x i8> %vld1_lane
+}
+
+define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vld1q_lane_s16
+; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i16* %a, align 2
+  %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
+  ret <8 x i16> %vld1_lane
+}
+
+define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vld1q_lane_s32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i32* %a, align 4
+  %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
+  ret <4 x i32> %vld1_lane
+}
+
+define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vld1q_lane_s64
+; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i64* %a, align 8
+  %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
+  ret <2 x i64> %vld1_lane
+}
+
+define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vld1q_lane_f32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load float* %a, align 4
+  %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
+  ret <4 x float> %vld1_lane
+}
+
+define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vld1q_lane_f64
+; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load double* %a, align 8
+  %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
+  ret <2 x double> %vld1_lane
+}
+
+define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vld1_lane_s8
+; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i8* %a, align 1
+  %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
+  ret <8 x i8> %vld1_lane
+}
+
+define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vld1_lane_s16
+; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i16* %a, align 2
+  %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
+  ret <4 x i16> %vld1_lane
+}
+
+define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vld1_lane_s32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load i32* %a, align 4
+  %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
+  ret <2 x i32> %vld1_lane
+}
+
+define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vld1_lane_s64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = load i64* %a, align 8
+  %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
+  ret <1 x i64> %vld1_lane
+}
+
+define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vld1_lane_f32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = load float* %a, align 4
+  %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
+  ret <2 x float> %vld1_lane
+}
+
+define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vld1_lane_f64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+  %0 = load double* %a, align 8
+  %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
+  ret <1 x double> %vld1_lane
+}
+
+define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s16
+; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+  %0 = bitcast i16* %a to i8*
+  %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
+  %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+  %0 = bitcast i32* %a to i8*
+  %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+  %0 = bitcast i64* %a to i8*
+  %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_f32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+  %0 = bitcast float* %a to i8*
+  %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_f64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+  %0 = bitcast double* %a to i8*
+  %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s8
+; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+  %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
+  %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s16
+; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+  %0 = bitcast i16* %a to i8*
+  %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
+  %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+  %0 = bitcast i32* %a to i8*
+  %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+  %0 = bitcast i64* %a to i8*
+  %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_f32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+  %0 = bitcast float* %a to i8*
+  %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
+  %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_f64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+  %0 = bitcast double* %a to i8*
+  %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
+  %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
+  %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
+  %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
+  ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s16
+; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
+  %0 = bitcast i16* %a to i8*
+  %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
+  %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int16x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
+  %0 = bitcast i32* %a to i8*
+  %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
+  %0 = bitcast i64* %a to i8*
+  %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_f32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
+  %0 = bitcast float* %a to i8*
+  %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.float32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_f64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
+  %0 = bitcast double* %a to i8*
+  %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.float64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s8
+; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
+  %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
+  %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int8x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s16
+; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
+  %0 = bitcast i16* %a to i8*
+  %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
+  %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int16x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
+  %0 = bitcast i32* %a to i8*
+  %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
+  %0 = bitcast i64* %a to i8*
+  %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.int64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_f32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
+  %0 = bitcast float* %a to i8*
+  %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.float32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_f64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
+  %0 = bitcast double* %a to i8*
+  %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
+  %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
+  ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s8
+; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
+  %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s16
+; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+  %0 = bitcast i16* %a to i8*
+  %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
+  %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+  %0 = bitcast i32* %a to i8*
+  %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+  %0 = bitcast i64* %a to i8*
+  %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_f32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+  %0 = bitcast float* %a to i8*
+  %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_f64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+  %0 = bitcast double* %a to i8*
+  %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s8
+; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+  %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
+  %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s16
+; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+  %0 = bitcast i16* %a to i8*
+  %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
+  %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+  %0 = bitcast i32* %a to i8*
+  %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+  %0 = bitcast i64* %a to i8*
+  %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_f32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+  %0 = bitcast float* %a to i8*
+  %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
+  %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_f64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+  %0 = bitcast double* %a to i8*
+  %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
+  %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
+  %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
+  %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
+  %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3
+  %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
+  %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
+  %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
+  %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3
+  ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
+define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vst1q_lane_s8
+; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <16 x i8> %b, i32 15
+  store i8 %0, i8* %a, align 1
+  ret void
+}
+
+define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vst1q_lane_s16
+; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <8 x i16> %b, i32 7
+  store i16 %0, i16* %a, align 2
+  ret void
+}
+
+define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vst1q_lane_s32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <4 x i32> %b, i32 3
+  store i32 %0, i32* %a, align 4
+  ret void
+}
+
+define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vst1q_lane_s64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <2 x i64> %b, i32 1
+  store i64 %0, i64* %a, align 8
+  ret void
+}
+
+define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vst1q_lane_f32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <4 x float> %b, i32 3
+  store float %0, float* %a, align 4
+  ret void
+}
+
+define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vst1q_lane_f64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <2 x double> %b, i32 1
+  store double %0, double* %a, align 8
+  ret void
+}
+
+define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vst1_lane_s8
+; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <8 x i8> %b, i32 7
+  store i8 %0, i8* %a, align 1
+  ret void
+}
+
+define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vst1_lane_s16
+; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <4 x i16> %b, i32 3
+  store i16 %0, i16* %a, align 2
+  ret void
+}
+
+define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vst1_lane_s32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <2 x i32> %b, i32 1
+  store i32 %0, i32* %a, align 4
+  ret void
+}
+
+define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vst1_lane_s64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <1 x i64> %b, i32 0
+  store i64 %0, i64* %a, align 8
+  ret void
+}
+
+define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vst1_lane_f32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <2 x float> %b, i32 1
+  store float %0, float* %a, align 4
+  ret void
+}
+
+define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vst1_lane_f64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %0 = extractelement <1 x double> %b, i32 0
+  store double %0, double* %a, align 8
+  ret void
+}
+
+define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s8
+; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
+  ret void
+}
+
+define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s16
+; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+  %0 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
+  ret void
+}
+
+define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+  %0 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
+  ret void
+}
+
+define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+  %0 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
+  ret void
+}
+
+define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_f32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+  %0 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
+  ret void
+}
+
+define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_f64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+  %0 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
+  ret void
+}
+
+define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s8
+; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+  tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
+  ret void
+}
+
+define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s16
+; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+  %0 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
+  ret void
+}
+
+define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+  %0 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
+  ret void
+}
+
+define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+  %0 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
+  ret void
+}
+
+define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_f32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+  %0 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
+  ret void
+}
+
+define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_f64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+  %0 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
+  ret void
+}
+
+define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s8
+; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
+  ret void
+}
+
+define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s16
+; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
+  %0 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
+  ret void
+}
+
+define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
+  %0 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
+  ret void
+}
+
+define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
+  %0 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
+  ret void
+}
+
+define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_f32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
+  %0 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
+  ret void
+}
+
+define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_f64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
+  %0 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
+  ret void
+}
+
+define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s8
+; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
+  tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
+  ret void
+}
+
+define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s16
+; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
+  %0 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
+  ret void
+}
+
+define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
+  %0 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
+  ret void
+}
+
+define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
+  %0 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
+  ret void
+}
+
+define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_f32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
+  %0 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
+  ret void
+}
+
+define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_f64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
+  %0 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
+  ret void
+}
+
+define void @test_vst4q_lane_s8(i16* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s8
+; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %0 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %0, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 2)
+  ret void
+}
+
+define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s16
+; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+  %0 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
+  ret void
+}
+
+define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+  %0 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
+  ret void
+}
+
+define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+  %0 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
+  ret void
+}
+
+define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_f32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+  %0 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
+  ret void
+}
+
+define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_f64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+  %0 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
+  ret void
+}
+
+define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s8
+; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+  tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
+  ret void
+}
+
+define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s16
+; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+  %0 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
+  ret void
+}
+
+define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+  %0 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
+  ret void
+}
+
+define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+  %0 = bitcast i64* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
+  ret void
+}
+
+define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_f32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+  %0 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
+  ret void
+}
+
+define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_f64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+  %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+  %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+  %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+  %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+  %0 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
+  ret void
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
+declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
+declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
+declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
+declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
+declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
+declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
+declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
+declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-simd-ldst.ll b/test/CodeGen/AArch64/neon-simd-ldst.ll
new file mode 100644
index 0000000..afc0901
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-ldst.ll
@@ -0,0 +1,164 @@
+; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define void @test_ldstq_4v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldstq_4v
+; CHECK: ld4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+; CHECK: st4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+entry:
+  %tobool62 = icmp eq i32 %count, 0
+  br i1 %tobool62, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %count.addr.063 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+  %dec = add i32 %count.addr.063, -1
+  %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %io, i32 1)
+  %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
+  tail call void @llvm.arm.neon.vst4.v16i8(i8* %io, <16 x i8> %vld4.fca.0.extract, <16 x i8> %vld4.fca.1.extract, <16 x i8> %vld4.fca.2.extract, <16 x i8> %vld4.fca.3.extract, i32 1)
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+
+define void @test_ldstq_3v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldstq_3v
+; CHECK: ld3     {v0.16b, v1.16b, v2.16b}, [x0]
+; CHECK: st3     {v0.16b, v1.16b, v2.16b}, [x0]
+entry:
+  %tobool47 = icmp eq i32 %count, 0
+  br i1 %tobool47, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %count.addr.048 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+  %dec = add i32 %count.addr.048, -1
+  %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %io, i32 1)
+  %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
+  tail call void @llvm.arm.neon.vst3.v16i8(i8* %io, <16 x i8> %vld3.fca.0.extract, <16 x i8> %vld3.fca.1.extract, <16 x i8> %vld3.fca.2.extract, i32 1)
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+
+define void @test_ldstq_2v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldstq_2v
+; CHECK: ld2     {v0.16b, v1.16b}, [x0]
+; CHECK: st2     {v0.16b, v1.16b}, [x0]
+entry:
+  %tobool22 = icmp eq i32 %count, 0
+  br i1 %tobool22, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+  %dec = add i32 %count.addr.023, -1
+  %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %io, i32 1)
+  %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
+  tail call void @llvm.arm.neon.vst2.v16i8(i8* %io, <16 x i8> %vld2.fca.0.extract, <16 x i8> %vld2.fca.1.extract, i32 1)
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
+
+define void @test_ldst_4v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldst_4v
+; CHECK: ld4     {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+; CHECK: st4     {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+entry:
+  %tobool42 = icmp eq i32 %count, 0
+  br i1 %tobool42, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %count.addr.043 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+  %dec = add i32 %count.addr.043, -1
+  %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %io, i32 1)
+  %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
+  %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
+  %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
+  %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
+  tail call void @llvm.arm.neon.vst4.v8i8(i8* %io, <8 x i8> %vld4.fca.0.extract, <8 x i8> %vld4.fca.1.extract, <8 x i8> %vld4.fca.2.extract, <8 x i8> %vld4.fca.3.extract, i32 1)
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+
+define void @test_ldst_3v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldst_3v
+; CHECK: ld3     {v0.8b, v1.8b, v2.8b}, [x0]
+; CHECK: st3     {v0.8b, v1.8b, v2.8b}, [x0]
+entry:
+  %tobool32 = icmp eq i32 %count, 0
+  br i1 %tobool32, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %count.addr.033 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+  %dec = add i32 %count.addr.033, -1
+  %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %io, i32 1)
+  %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
+  %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
+  %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
+  tail call void @llvm.arm.neon.vst3.v8i8(i8* %io, <8 x i8> %vld3.fca.0.extract, <8 x i8> %vld3.fca.1.extract, <8 x i8> %vld3.fca.2.extract, i32 1)
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+
+define void @test_ldst_2v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldst_2v
+; CHECK: ld2     {v0.8b, v1.8b}, [x0]
+; CHECK: st2     {v0.8b, v1.8b}, [x0]
+entry:
+  %tobool22 = icmp eq i32 %count, 0
+  br i1 %tobool22, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+  %dec = add i32 %count.addr.023, -1
+  %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %io, i32 1)
+  %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
+  %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
+  tail call void @llvm.arm.neon.vst2.v8i8(i8* %io, <8 x i8> %vld2.fca.0.extract, <8 x i8> %vld2.fca.1.extract, i32 1)
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
+
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
new file mode 100644
index 0000000..156fe1d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
@@ -0,0 +1,354 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+;Check for a post-increment updating load.
+define <4 x i16> @test_vld1_fx_update(i16** %ptr) nounwind {
+; CHECK: test_vld1_fx_update
+; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #8
+  %A = load i16** %ptr
+  %tmp0 = bitcast i16* %A to i8*
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 2)
+  %tmp2 = getelementptr i16* %A, i32 4
+  store i16* %tmp2, i16** %ptr
+  ret <4 x i16> %tmp1
+}
+
+;Check for a post-increment updating load with register increment.
+define <2 x i32> @test_vld1_reg_update(i32** %ptr, i32 %inc) nounwind {
+; CHECK: test_vld1_reg_update
+; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %A = load i32** %ptr
+  %tmp0 = bitcast i32* %A to i8*
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 4)
+  %tmp2 = getelementptr i32* %A, i32 %inc
+  store i32* %tmp2, i32** %ptr
+  ret <2 x i32> %tmp1
+}
+
+define <2 x float> @test_vld2_fx_update(float** %ptr) nounwind {
+; CHECK: test_vld2_fx_update
+; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16
+  %A = load float** %ptr
+  %tmp0 = bitcast float* %A to i8*
+  %tmp1 = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 4)
+  %tmp2 = extractvalue { <2 x float>, <2 x float> } %tmp1, 0
+  %tmp3 = getelementptr float* %A, i32 4
+  store float* %tmp3, float** %ptr
+  ret <2 x float> %tmp2
+}
+
+define <16 x i8> @test_vld2_reg_update(i8** %ptr, i32 %inc) nounwind {
+; CHECK: test_vld2_reg_update
+; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %A = load i8** %ptr
+  %tmp0 = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %A, i32 1)
+  %tmp1 = extractvalue { <16 x i8>, <16 x i8> } %tmp0, 0
+  %tmp2 = getelementptr i8* %A, i32 %inc
+  store i8* %tmp2, i8** %ptr
+  ret <16 x i8> %tmp1
+}
+
+define <4 x i32> @test_vld3_fx_update(i32** %ptr) nounwind {
+; CHECK: test_vld3_fx_update
+; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #48
+  %A = load i32** %ptr
+  %tmp0 = bitcast i32* %A to i8*
+  %tmp1 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 4)
+  %tmp2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %tmp1, 0
+  %tmp3 = getelementptr i32* %A, i32 12
+  store i32* %tmp3, i32** %ptr
+  ret <4 x i32> %tmp2
+}
+
+define <4 x i16> @test_vld3_reg_update(i16** %ptr, i32 %inc) nounwind {
+; CHECK: test_vld3_reg_update
+; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %A = load i16** %ptr
+  %tmp0 = bitcast i16* %A to i8*
+  %tmp1 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 2)
+  %tmp2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %tmp1, 0
+  %tmp3 = getelementptr i16* %A, i32 %inc
+  store i16* %tmp3, i16** %ptr
+  ret <4 x i16> %tmp2
+}
+
+define <8 x i16> @test_vld4_fx_update(i16** %ptr) nounwind {
+; CHECK: test_vld4_fx_update
+; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], #64
+  %A = load i16** %ptr
+  %tmp0 = bitcast i16* %A to i8*
+  %tmp1 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8)
+  %tmp2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %tmp1, 0
+  %tmp3 = getelementptr i16* %A, i32 32
+  store i16* %tmp3, i16** %ptr
+  ret <8 x i16> %tmp2
+}
+
+define <8 x i8> @test_vld4_reg_update(i8** %ptr, i32 %inc) nounwind {
+; CHECK: test_vld4_reg_update
+; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %A = load i8** %ptr
+  %tmp0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %A, i32 1)
+  %tmp1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %tmp0, 0
+  %tmp2 = getelementptr i8* %A, i32 %inc
+  store i8* %tmp2, i8** %ptr
+  ret <8 x i8> %tmp1
+}
+
+define void @test_vst1_fx_update(float** %ptr, <2 x float> %B) nounwind {
+; CHECK: test_vst1_fx_update
+; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}], #8
+  %A = load float** %ptr
+  %tmp0 = bitcast float* %A to i8*
+  call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %B, i32 4)
+  %tmp2 = getelementptr float* %A, i32 2
+  store float* %tmp2, float** %ptr
+  ret void
+}
+
+define void @test_vst1_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind {
+; CHECK: test_vst1_reg_update
+; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
+  %A = load i16** %ptr
+  %tmp0 = bitcast i16* %A to i8*
+  call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %B, i32 2)
+  %tmp1 = getelementptr i16* %A, i32 %inc
+  store i16* %tmp1, i16** %ptr
+  ret void
+}
+
+define void @test_vst2_fx_update(i64** %ptr, <1 x i64> %B) nounwind {
+; CHECK: test_vst2_fx_update
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}], #16
+  %A = load i64** %ptr
+  %tmp0 = bitcast i64* %A to i8*
+  call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %B, <1 x i64> %B, i32 8)
+  %tmp1 = getelementptr i64* %A, i32 2
+  store i64* %tmp1, i64** %ptr
+  ret void
+}
+
+define void @test_vst2_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind {
+; CHECK: test_vst2_reg_update
+; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
+  %A = load i8** %ptr
+  call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, i32 4)
+  %tmp0 = getelementptr i8* %A, i32 %inc
+  store i8* %tmp0, i8** %ptr
+  ret void
+}
+
+define void @test_vst3_fx_update(i32** %ptr, <2 x i32> %B) nounwind {
+; CHECK: test_vst3_fx_update
+; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}], #24
+  %A = load i32** %ptr
+  %tmp0 = bitcast i32* %A to i8*
+  call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %B, <2 x i32> %B, <2 x i32> %B, i32 4)
+  %tmp1 = getelementptr i32* %A, i32 6
+  store i32* %tmp1, i32** %ptr
+  ret void
+}
+
+define void @test_vst3_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind {
+; CHECK: test_vst3_reg_update
+; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
+  %A = load i16** %ptr
+  %tmp0 = bitcast i16* %A to i8*
+  call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %B, <8 x i16> %B, <8 x i16> %B, i32 2)
+  %tmp1 = getelementptr i16* %A, i32 %inc
+  store i16* %tmp1, i16** %ptr
+  ret void
+}
+
+define void @test_vst4_fx_update(float** %ptr, <4 x float> %B) nounwind {
+; CHECK: test_vst4_fx_update
+; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}], #64
+  %A = load float** %ptr
+  %tmp0 = bitcast float* %A to i8*
+  call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %B, <4 x float> %B, <4 x float> %B, <4 x float> %B, i32 4)
+  %tmp1 = getelementptr float* %A, i32 16
+  store float* %tmp1, float** %ptr
+  ret void
+}
+
+define void @test_vst4_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind {
+; CHECK: test_vst4_reg_update
+; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
+  %A = load i8** %ptr
+  call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, i32 1)
+  %tmp0 = getelementptr i8* %A, i32 %inc
+  store i8* %tmp0, i8** %ptr
+  ret void
+}
+
+
+declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
+declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
+declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+
+define <16 x i8> @test_vld1x2_fx_update(i8* %a, i8** %ptr) {
+; CHECK: test_vld1x2_fx_update
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #32
+  %1 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1)
+  %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
+  %tmp1 = getelementptr i8* %a, i32 32
+  store i8* %tmp1, i8** %ptr
+  ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vld1x2_reg_update(i16* %a, i16** %ptr, i32 %inc) {
+; CHECK: test_vld1x2_reg_update
+; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = bitcast i16* %a to i8*
+  %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2)
+  %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0
+  %tmp1 = getelementptr i16* %a, i32 %inc
+  store i16* %tmp1, i16** %ptr
+  ret <8 x i16> %3
+}
+
+define <2 x i64> @test_vld1x3_fx_update(i64* %a, i64** %ptr) {
+; CHECK: test_vld1x3_fx_update
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], #48
+  %1 = bitcast i64* %a to i8*
+  %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8)
+  %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
+  %tmp1 = getelementptr i64* %a, i32 6
+  store i64* %tmp1, i64** %ptr
+  ret  <2 x i64> %3
+}
+
+define <8 x i16> @test_vld1x3_reg_update(i16* %a, i16** %ptr, i32 %inc) {
+; CHECK: test_vld1x3_reg_update
+; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = bitcast i16* %a to i8*
+  %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2)
+  %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
+  %tmp1 = getelementptr i16* %a, i32 %inc
+  store i16* %tmp1, i16** %ptr
+  ret <8 x i16> %3
+}
+
+define <4 x float> @test_vld1x4_fx_update(float* %a, float** %ptr) {
+; CHECK: test_vld1x4_fx_update
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #64
+  %1 = bitcast float* %a to i8*
+  %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
+  %tmp1 = getelementptr float* %a, i32 16
+  store float* %tmp1, float** %ptr
+  ret <4 x float> %3
+}
+
+define <8 x i8> @test_vld1x4_reg_update(i8* readonly %a, i8** %ptr, i32 %inc) #0 {
+; CHECK: test_vld1x4_reg_update
+; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1)
+  %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
+  %tmp1 = getelementptr i8* %a, i32 %inc
+  store i8* %tmp1, i8** %ptr
+  ret <8 x i8> %2
+}
+
+define void @test_vst1x2_fx_update(i8* %a, [2 x <16 x i8>] %b.coerce, i8** %ptr) #2 {
+; CHECK: test_vst1x2_fx_update
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #32
+  %1 = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %2 = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1)
+  %tmp1 = getelementptr i8* %a, i32 32
+  store i8* %tmp1, i8** %ptr
+  ret void
+}
+
+define void @test_vst1x2_reg_update(i16* %a, [2 x <8 x i16>] %b.coerce, i16** %ptr, i32 %inc) #2 {
+; CHECK: test_vst1x2_reg_update
+; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = extractvalue [2 x <8 x i16>] %b.coerce, 0
+  %2 = extractvalue [2 x <8 x i16>] %b.coerce, 1
+  %3 = bitcast i16* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2)
+  %tmp1 = getelementptr i16* %a, i32 %inc
+  store i16* %tmp1, i16** %ptr
+  ret void
+}
+
+define void @test_vst1x3_fx_update(i32* %a, [3 x <2 x i32>] %b.coerce, i32** %ptr) #2 {
+; CHECK: test_vst1x3_fx_update
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #24
+  %1 = extractvalue [3 x <2 x i32>] %b.coerce, 0
+  %2 = extractvalue [3 x <2 x i32>] %b.coerce, 1
+  %3 = extractvalue [3 x <2 x i32>] %b.coerce, 2
+  %4 = bitcast i32* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
+  %tmp1 = getelementptr i32* %a, i32 6
+  store i32* %tmp1, i32** %ptr
+  ret void
+}
+
+define void @test_vst1x3_reg_update(i64* %a, [3 x <1 x i64>] %b.coerce, i64** %ptr, i32 %inc) #2 {
+; CHECK: test_vst1x3_reg_update
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = extractvalue [3 x <1 x i64>] %b.coerce, 0
+  %2 = extractvalue [3 x <1 x i64>] %b.coerce, 1
+  %3 = extractvalue [3 x <1 x i64>] %b.coerce, 2
+  %4 = bitcast i64* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
+  %tmp1 = getelementptr i64* %a, i32 %inc
+  store i64* %tmp1, i64** %ptr
+  ret void
+}
+
+define void @test_vst1x4_fx_update(float* %a, [4 x <4 x float>] %b.coerce, float** %ptr) #2 {
+; CHECK: test_vst1x4_fx_update
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #64
+  %1 = extractvalue [4 x <4 x float>] %b.coerce, 0
+  %2 = extractvalue [4 x <4 x float>] %b.coerce, 1
+  %3 = extractvalue [4 x <4 x float>] %b.coerce, 2
+  %4 = extractvalue [4 x <4 x float>] %b.coerce, 3
+  %5 = bitcast float* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4)
+  %tmp1 = getelementptr float* %a, i32 16
+  store float* %tmp1, float** %ptr
+  ret void
+}
+
+define void @test_vst1x4_reg_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr, i32 %inc) #2 {
+; CHECK: test_vst1x4_reg_update
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = extractvalue [4 x <2 x double>] %b.coerce, 0
+  %2 = extractvalue [4 x <2 x double>] %b.coerce, 1
+  %3 = extractvalue [4 x <2 x double>] %b.coerce, 2
+  %4 = extractvalue [4 x <2 x double>] %b.coerce, 3
+  %5 = bitcast double* %a to i8*
+  tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8)
+  %tmp1 = getelementptr double* %a, i32 %inc
+  store double* %tmp1, double** %ptr
+  ret void
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
+declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #3
+declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32) #3
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
new file mode 100644
index 0000000..80a9347
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
@@ -0,0 +1,319 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) {
+; CHECK-LABEL: test_vld2q_dup_fx_update
+; CHECK: ld2r  {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #2
+  %1 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+  %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
+  %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+  %4 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
+  %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer
+  %6 = insertvalue { [2 x <16 x i8>] } undef, <16 x i8> %3, 0, 0
+  %7 = insertvalue { [2 x <16 x i8>] } %6, <16 x i8> %5, 0, 1
+  %tmp1 = getelementptr i8* %a, i32 2
+  store i8* %tmp1, i8** %ptr
+  ret { [2 x <16 x i8>] } %7
+}
+
+define { [2 x <4 x i32>] } @test_vld2q_dup_reg_update(i32* %a, i32** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld2q_dup_reg_update
+; CHECK: ld2r  {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = bitcast i32* %a to i8*
+  %2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %1, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+  %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0
+  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+  %5 = extractvalue { <4 x i32>, <4 x i32> } %2, 1
+  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer
+  %7 = insertvalue { [2 x <4 x i32>] } undef, <4 x i32> %4, 0, 0
+  %8 = insertvalue { [2 x <4 x i32>] } %7, <4 x i32> %6, 0, 1
+  %tmp1 = getelementptr i32* %a, i32 %inc
+  store i32* %tmp1, i32** %ptr
+  ret { [2 x <4 x i32>] } %8
+}
+
+define { [3 x <4 x i16>] } @test_vld3_dup_fx_update(i16* %a, i16** %ptr) {
+; CHECK-LABEL: test_vld3_dup_fx_update
+; CHECK: ld3r  {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #6
+  %1 = bitcast i16* %a to i8*
+  %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %1, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+  %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
+  %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+  %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
+  %6 = shufflevector <4 x i16> %5, <4 x i16> undef, <4 x i32> zeroinitializer
+  %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
+  %8 = shufflevector <4 x i16> %7, <4 x i16> undef, <4 x i32> zeroinitializer
+  %9 = insertvalue { [3 x <4 x i16>] }  undef, <4 x i16> %4, 0, 0
+  %10 = insertvalue { [3 x <4 x i16>] }  %9, <4 x i16> %6, 0, 1
+  %11 = insertvalue { [3 x <4 x i16>] }  %10, <4 x i16> %8, 0, 2
+  %tmp1 = getelementptr i16* %a, i32 3
+  store i16* %tmp1, i16** %ptr
+  ret { [3 x <4 x i16>] }  %11
+}
+
+define { [3 x <8 x i8>] } @test_vld3_dup_reg_update(i8* %a, i8** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld3_dup_reg_update
+; CHECK: ld3r  {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+  %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
+  %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+  %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
+  %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <8 x i32> zeroinitializer
+  %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
+  %7 = shufflevector <8 x i8> %6, <8 x i8> undef, <8 x i32> zeroinitializer
+  %8 = insertvalue { [3 x <8 x i8>] } undef, <8 x i8> %3, 0, 0
+  %9 = insertvalue { [3 x <8 x i8>] } %8, <8 x i8> %5, 0, 1
+  %10 = insertvalue { [3 x <8 x i8>] } %9, <8 x i8> %7, 0, 2
+  %tmp1 = getelementptr i8* %a, i32 %inc
+  store i8* %tmp1, i8** %ptr
+  ret { [3 x <8 x i8>] }%10
+}
+
+define { [4 x <2 x i32>] } @test_vld4_dup_fx_update(i32* %a, i32** %ptr) #0 {
+; CHECK-LABEL: test_vld4_dup_fx_update
+; CHECK: ld4r  {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16
+  %1 = bitcast i32* %a to i8*
+  %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %1, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+  %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
+  %4 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+  %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
+  %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <2 x i32> zeroinitializer
+  %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
+  %8 = shufflevector <2 x i32> %7, <2 x i32> undef, <2 x i32> zeroinitializer
+  %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3
+  %10 = shufflevector <2 x i32> %9, <2 x i32> undef, <2 x i32> zeroinitializer
+  %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %4, 0, 0
+  %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %6, 0, 1
+  %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %8, 0, 2
+  %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3
+  %tmp1 = getelementptr i32* %a, i32 4
+  store i32* %tmp1, i32** %ptr
+  ret { [4 x <2 x i32>] } %14
+}
+
+define { [4 x <2 x double>] } @test_vld4_dup_reg_update(double* %a, double** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld4_dup_reg_update
+; CHECK: ld4r  {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = bitcast double* %a to i8*
+  %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %1, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+  %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0
+  %4 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+  %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1
+  %6 = shufflevector <2 x double> %5, <2 x double> undef, <2 x i32> zeroinitializer
+  %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2
+  %8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer
+  %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3
+  %10 = shufflevector <2 x double> %9, <2 x double> undef, <2 x i32> zeroinitializer
+  %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %4, 0, 0
+  %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %6, 0, 1
+  %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %8, 0, 2
+  %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3
+  %tmp1 = getelementptr double* %a, i32 %inc
+  store double* %tmp1, double** %ptr
+  ret { [4 x <2 x double>] } %14
+}
+
+define { [2 x <8 x i8>] } @test_vld2_lane_fx_update(i8*  %a, [2 x <8 x i8>] %b, i8** %ptr) {
+; CHECK-LABEL: test_vld2_lane_fx_update
+; CHECK: ld2  {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2
+  %1 = extractvalue [2 x <8 x i8>] %b, 0
+  %2 = extractvalue [2 x <8 x i8>] %b, 1
+  %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+  %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0
+  %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1
+  %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0
+  %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1
+  %tmp1 = getelementptr i8* %a, i32 2
+  store i8* %tmp1, i8** %ptr
+  ret { [2 x <8 x i8>] } %7
+}
+
+define { [2 x <8 x i8>] } @test_vld2_lane_reg_update(i8*  %a, [2 x <8 x i8>] %b, i8** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld2_lane_reg_update
+; CHECK: ld2  {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[6], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = extractvalue [2 x <8 x i8>] %b, 0
+  %2 = extractvalue [2 x <8 x i8>] %b, 1
+  %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 6, i32 1)
+  %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0
+  %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1
+  %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0
+  %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1
+  %tmp1 = getelementptr i8* %a, i32 %inc
+  store i8* %tmp1, i8** %ptr
+  ret { [2 x <8 x i8>] } %7
+}
+
+define { [3 x <2 x float>] } @test_vld3_lane_fx_update(float* %a, [3 x <2 x float>] %b, float** %ptr) {
+; CHECK-LABEL: test_vld3_lane_fx_update
+; CHECK: ld3  {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #12
+  %1 = extractvalue [3 x <2 x float>] %b, 0
+  %2 = extractvalue [3 x <2 x float>] %b, 1
+  %3 = extractvalue [3 x <2 x float>] %b, 2
+  %4 = bitcast float* %a to i8*
+  %5 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4)
+  %6 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 0
+  %7 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 1
+  %8 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 2
+  %9 = insertvalue { [3 x <2 x float>] } undef, <2 x float> %6, 0, 0
+  %10 = insertvalue { [3 x <2 x float>] } %9, <2 x float> %7, 0, 1
+  %11 = insertvalue { [3 x <2 x float>] } %10, <2 x float> %8, 0, 2
+  %tmp1 = getelementptr float* %a, i32 3
+  store float* %tmp1, float** %ptr
+  ret { [3 x <2 x float>] } %11
+}
+
+define { [3 x <4 x i16>] } @test_vld3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld3_lane_reg_update
+; CHECK: ld3  {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = extractvalue [3 x <4 x i16>] %b, 0
+  %2 = extractvalue [3 x <4 x i16>] %b, 1
+  %3 = extractvalue [3 x <4 x i16>] %b, 2
+  %4 = bitcast i16* %a to i8*
+  %5 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
+  %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 0
+  %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 1
+  %8 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 2
+  %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %6, 0, 0
+  %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %7, 0, 1
+  %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2
+  %tmp1 = getelementptr i16* %a, i32 %inc
+  store i16* %tmp1, i16** %ptr
+  ret { [3 x <4 x i16>] } %11
+}
+
+define { [4 x <2 x i32>] } @test_vld4_lane_fx_update(i32* readonly %a, [4 x <2 x i32>] %b, i32** %ptr) {
+; CHECK-LABEL: test_vld4_lane_fx_update
+; CHECK: ld4  {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #16
+  %1 = extractvalue [4 x <2 x i32>] %b, 0
+  %2 = extractvalue [4 x <2 x i32>] %b, 1
+  %3 = extractvalue [4 x <2 x i32>] %b, 2
+  %4 = extractvalue [4 x <2 x i32>] %b, 3
+  %5 = bitcast i32* %a to i8*
+  %6 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 1, i32 4)
+  %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 0
+  %8 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 1
+  %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 2
+  %10 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 3
+  %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %7, 0, 0
+  %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %8, 0, 1
+  %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %9, 0, 2
+  %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3
+  %tmp1 = getelementptr i32* %a, i32 4
+  store i32* %tmp1, i32** %ptr
+  ret { [4 x <2 x i32>] } %14
+}
+
+define { [4 x <2 x double>] } @test_vld4_lane_reg_update(double* readonly %a, [4 x <2 x double>] %b, double** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld4_lane_reg_update
+; CHECK: ld4  {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = extractvalue [4 x <2 x double>] %b, 0
+  %2 = extractvalue [4 x <2 x double>] %b, 1
+  %3 = extractvalue [4 x <2 x double>] %b, 2
+  %4 = extractvalue [4 x <2 x double>] %b, 3
+  %5 = bitcast double* %a to i8*
+  %6 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8)
+  %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 0
+  %8 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 1
+  %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 2
+  %10 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 3
+  %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %7, 0, 0
+  %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %8, 0, 1
+  %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %9, 0, 2
+  %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3
+  %tmp1 = getelementptr double* %a, i32 %inc
+  store double* %tmp1, double** %ptr
+  ret { [4 x <2 x double>] } %14
+}
+
+define void @test_vst2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) {
+; CHECK-LABEL: test_vst2_lane_fx_update
+; CHECK: st2  {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2
+  %1 = extractvalue [2 x <8 x i8>] %b, 0
+  %2 = extractvalue [2 x <8 x i8>] %b, 1
+  call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+  %tmp1 = getelementptr i8* %a, i32 2
+  store i8* %tmp1, i8** %ptr
+  ret void
+}
+
+define void @test_vst2_lane_reg_update(i32* %a, [2 x <2 x i32>] %b.coerce, i32** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vst2_lane_reg_update
+; CHECK: st2  {v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = extractvalue [2 x <2 x i32>] %b.coerce, 0
+  %2 = extractvalue [2 x <2 x i32>] %b.coerce, 1
+  %3 = bitcast i32* %a to i8*
+  tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4)
+  %tmp1 = getelementptr i32* %a, i32 %inc
+  store i32* %tmp1, i32** %ptr
+  ret void
+}
+
+define void @test_vst3_lane_fx_update(float* %a, [3 x <4 x float>] %b, float** %ptr) {
+; CHECK-LABEL: test_vst3_lane_fx_update
+; CHECK: st3  {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[3], [x{{[0-9]+|sp}}], #12
+  %1 = extractvalue [3 x <4 x float>] %b, 0
+  %2 = extractvalue [3 x <4 x float>] %b, 1
+  %3 = extractvalue [3 x <4 x float>] %b, 2
+  %4 = bitcast float* %a to i8*
+  call void @llvm.arm.neon.vst3lane.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 3, i32 4)
+  %tmp1 = getelementptr float* %a, i32 3
+  store float* %tmp1, float** %ptr
+  ret void
+}
+
+; Function Attrs: nounwind
+define void @test_vst3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vst3_lane_reg_update
+; CHECK: st3  {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = extractvalue [3 x <4 x i16>] %b, 0
+  %2 = extractvalue [3 x <4 x i16>] %b, 1
+  %3 = extractvalue [3 x <4 x i16>] %b, 2
+  %4 = bitcast i16* %a to i8*
+  tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
+  %tmp1 = getelementptr i16* %a, i32 %inc
+  store i16* %tmp1, i16** %ptr
+  ret void
+}
+
+define void @test_vst4_lane_fx_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr) {
+; CHECK-LABEL: test_vst4_lane_fx_update
+; CHECK: st4  {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], #32
+  %1 = extractvalue [4 x <2 x double>] %b.coerce, 0
+  %2 = extractvalue [4 x <2 x double>] %b.coerce, 1
+  %3 = extractvalue [4 x <2 x double>] %b.coerce, 2
+  %4 = extractvalue [4 x <2 x double>] %b.coerce, 3
+  %5 = bitcast double* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8)
+  %tmp1 = getelementptr double* %a, i32 4
+  store double* %tmp1, double** %ptr
+  ret void
+}
+
+
+define void @test_vst4_lane_reg_update(float* %a, [4 x <2 x float>] %b.coerce, float** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vst4_lane_reg_update
+; CHECK: st4  {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+  %1 = extractvalue [4 x <2 x float>] %b.coerce, 0
+  %2 = extractvalue [4 x <2 x float>] %b.coerce, 1
+  %3 = extractvalue [4 x <2 x float>] %b.coerce, 2
+  %4 = extractvalue [4 x <2 x float>] %b.coerce, 3
+  %5 = bitcast float* %a to i8*
+  tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 1, i32 4)
+  %tmp1 = getelementptr float* %a, i32 %inc
+  store float* %tmp1, float** %ptr
+  ret void
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-shift.ll b/test/CodeGen/AArch64/neon-simd-shift.ll
new file mode 100644
index 0000000..fd76265
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-shift.ll
@@ -0,0 +1,1556 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) {
+; CHECK: test_vshr_n_s8
+; CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vshr_n = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <8 x i8> %vshr_n
+}
+
+define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) {
+; CHECK: test_vshr_n_s16
+; CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vshr_n = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+  ret <4 x i16> %vshr_n
+}
+
+define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) {
+; CHECK: test_vshr_n_s32
+; CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vshr_n = ashr <2 x i32> %a, <i32 3, i32 3>
+  ret <2 x i32> %vshr_n
+}
+
+define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) {
+; CHECK: test_vshrq_n_s8
+; CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vshr_n = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <16 x i8> %vshr_n
+}
+
+define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) {
+; CHECK: test_vshrq_n_s16
+; CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vshr_n = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret <8 x i16> %vshr_n
+}
+
+define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) {
+; CHECK: test_vshrq_n_s32
+; CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vshr_n = ashr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %vshr_n
+}
+
+define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) {
+; CHECK: test_vshrq_n_s64
+; CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vshr_n = ashr <2 x i64> %a, <i64 3, i64 3>
+  ret <2 x i64> %vshr_n
+}
+
+define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) {
+; CHECK: test_vshr_n_u8
+; CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vshr_n = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <8 x i8> %vshr_n
+}
+
+define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) {
+; CHECK: test_vshr_n_u16
+; CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vshr_n = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+  ret <4 x i16> %vshr_n
+}
+
+define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) {
+; CHECK: test_vshr_n_u32
+; CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vshr_n = lshr <2 x i32> %a, <i32 3, i32 3>
+  ret <2 x i32> %vshr_n
+}
+
+define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) {
+; CHECK: test_vshrq_n_u8
+; CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vshr_n = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <16 x i8> %vshr_n
+}
+
+define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) {
+; CHECK: test_vshrq_n_u16
+; CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vshr_n = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret <8 x i16> %vshr_n
+}
+
+define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) {
+; CHECK: test_vshrq_n_u32
+; CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vshr_n = lshr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %vshr_n
+}
+
+define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) {
+; CHECK: test_vshrq_n_u64
+; CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vshr_n = lshr <2 x i64> %a, <i64 3, i64 3>
+  ret <2 x i64> %vshr_n
+}
+
+define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsra_n_s8
+; CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vsra_n = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  %1 = add <8 x i8> %vsra_n, %a
+  ret <8 x i8> %1
+}
+
+define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsra_n_s16
+; CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vsra_n = ashr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
+  %1 = add <4 x i16> %vsra_n, %a
+  ret <4 x i16> %1
+}
+
+define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsra_n_s32
+; CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vsra_n = ashr <2 x i32> %b, <i32 3, i32 3>
+  %1 = add <2 x i32> %vsra_n, %a
+  ret <2 x i32> %1
+}
+
+define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsraq_n_s8
+; CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vsra_n = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  %1 = add <16 x i8> %vsra_n, %a
+  ret <16 x i8> %1
+}
+
+define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsraq_n_s16
+; CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vsra_n = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %1 = add <8 x i16> %vsra_n, %a
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsraq_n_s32
+; CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vsra_n = ashr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
+  %1 = add <4 x i32> %vsra_n, %a
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsraq_n_s64
+; CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vsra_n = ashr <2 x i64> %b, <i64 3, i64 3>
+  %1 = add <2 x i64> %vsra_n, %a
+  ret <2 x i64> %1
+}
+
+define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsra_n_u8
+; CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vsra_n = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  %1 = add <8 x i8> %vsra_n, %a
+  ret <8 x i8> %1
+}
+
+define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsra_n_u16
+; CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vsra_n = lshr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
+  %1 = add <4 x i16> %vsra_n, %a
+  ret <4 x i16> %1
+}
+
+define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsra_n_u32
+; CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vsra_n = lshr <2 x i32> %b, <i32 3, i32 3>
+  %1 = add <2 x i32> %vsra_n, %a
+  ret <2 x i32> %1
+}
+
+define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsraq_n_u8
+; CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vsra_n = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  %1 = add <16 x i8> %vsra_n, %a
+  ret <16 x i8> %1
+}
+
+define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsraq_n_u16
+; CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vsra_n = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %1 = add <8 x i16> %vsra_n, %a
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsraq_n_u32
+; CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vsra_n = lshr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
+  %1 = add <4 x i32> %vsra_n, %a
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsraq_n_u64
+; CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vsra_n = lshr <2 x i64> %b, <i64 3, i64 3>
+  %1 = add <2 x i64> %vsra_n, %a
+  ret <2 x i64> %1
+}
+
+define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) {
+; CHECK: test_vrshr_n_s8
+; CHECK: srshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %a, i32 3)
+  ret <8 x i8> %vrshr_n
+}
+
+
+define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) {
+; CHECK: test_vrshr_n_s16
+; CHECK: srshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %a, i32 3)
+  ret <4 x i16> %vrshr_n
+}
+
+
+define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) {
+; CHECK: test_vrshr_n_s32
+; CHECK: srshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %a, i32 3)
+  ret <2 x i32> %vrshr_n
+}
+
+
+define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) {
+; CHECK: test_vrshrq_n_s8
+; CHECK: srshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %a, i32 3)
+  ret <16 x i8> %vrshr_n
+}
+
+
+define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) {
+; CHECK: test_vrshrq_n_s16
+; CHECK: srshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %a, i32 3)
+  ret <8 x i16> %vrshr_n
+}
+
+
+define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) {
+; CHECK: test_vrshrq_n_s32
+; CHECK: srshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %a, i32 3)
+  ret <4 x i32> %vrshr_n
+}
+
+
+define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) {
+; CHECK: test_vrshrq_n_s64
+; CHECK: srshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %a, i32 3)
+  ret <2 x i64> %vrshr_n
+}
+
+
+define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) {
+; CHECK: test_vrshr_n_u8
+; CHECK: urshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %a, i32 3)
+  ret <8 x i8> %vrshr_n
+}
+
+
+define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) {
+; CHECK: test_vrshr_n_u16
+; CHECK: urshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %a, i32 3)
+  ret <4 x i16> %vrshr_n
+}
+
+
+define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) {
+; CHECK: test_vrshr_n_u32
+; CHECK: urshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %a, i32 3)
+  ret <2 x i32> %vrshr_n
+}
+
+
+define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) {
+; CHECK: test_vrshrq_n_u8
+; CHECK: urshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %a, i32 3)
+  ret <16 x i8> %vrshr_n
+}
+
+
+define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) {
+; CHECK: test_vrshrq_n_u16
+; CHECK: urshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %a, i32 3)
+  ret <8 x i16> %vrshr_n
+}
+
+
+define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) {
+; CHECK: test_vrshrq_n_u32
+; CHECK: urshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %a, i32 3)
+  ret <4 x i32> %vrshr_n
+}
+
+
+define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) {
+; CHECK: test_vrshrq_n_u64
+; CHECK: urshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %a, i32 3)
+  ret <2 x i64> %vrshr_n
+}
+
+
+define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vrsra_n_s8
+; CHECK: srsra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %1 = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %b, i32 3)
+  %vrsra_n = add <8 x i8> %1, %a
+  ret <8 x i8> %vrsra_n
+}
+
+define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vrsra_n_s16
+; CHECK: srsra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %1 = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %b, i32 3)
+  %vrsra_n = add <4 x i16> %1, %a
+  ret <4 x i16> %vrsra_n
+}
+
+define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vrsra_n_s32
+; CHECK: srsra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %1 = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %b, i32 3)
+  %vrsra_n = add <2 x i32> %1, %a
+  ret <2 x i32> %vrsra_n
+}
+
+define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vrsraq_n_s8
+; CHECK: srsra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %1 = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %b, i32 3)
+  %vrsra_n = add <16 x i8> %1, %a
+  ret <16 x i8> %vrsra_n
+}
+
+define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsraq_n_s16
+; CHECK: srsra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %1 = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %b, i32 3)
+  %vrsra_n = add <8 x i16> %1, %a
+  ret <8 x i16> %vrsra_n
+}
+
+define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsraq_n_s32
+; CHECK: srsra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %1 = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %b, i32 3)
+  %vrsra_n = add <4 x i32> %1, %a
+  ret <4 x i32> %vrsra_n
+}
+
+define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsraq_n_s64
+; CHECK: srsra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %1 = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %b, i32 3)
+  %vrsra_n = add <2 x i64> %1, %a
+  ret <2 x i64> %vrsra_n
+}
+
+define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vrsra_n_u8
+; CHECK: ursra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %1 = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %b, i32 3)
+  %vrsra_n = add <8 x i8> %1, %a
+  ret <8 x i8> %vrsra_n
+}
+
+define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vrsra_n_u16
+; CHECK: ursra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %1 = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %b, i32 3)
+  %vrsra_n = add <4 x i16> %1, %a
+  ret <4 x i16> %vrsra_n
+}
+
+define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vrsra_n_u32
+; CHECK: ursra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %1 = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %b, i32 3)
+  %vrsra_n = add <2 x i32> %1, %a
+  ret <2 x i32> %vrsra_n
+}
+
+define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vrsraq_n_u8
+; CHECK: ursra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %1 = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %b, i32 3)
+  %vrsra_n = add <16 x i8> %1, %a
+  ret <16 x i8> %vrsra_n
+}
+
+define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsraq_n_u16
+; CHECK: ursra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %1 = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %b, i32 3)
+  %vrsra_n = add <8 x i16> %1, %a
+  ret <8 x i16> %vrsra_n
+}
+
+define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsraq_n_u32
+; CHECK: ursra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %1 = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %b, i32 3)
+  %vrsra_n = add <4 x i32> %1, %a
+  ret <4 x i32> %vrsra_n
+}
+
+define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsraq_n_u64
+; CHECK: ursra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %1 = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %b, i32 3)
+  %vrsra_n = add <2 x i64> %1, %a
+  ret <2 x i64> %vrsra_n
+}
+
+define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsri_n_s8
+; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+  ret <8 x i8> %vsri_n
+}
+
+
+define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsri_n_s16
+; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3)
+  ret <4 x i16> %vsri
+}
+
+
+define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsri_n_s32
+; CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vsri = tail call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3)
+  ret <2 x i32> %vsri
+}
+
+
+define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsriq_n_s8
+; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+  ret <16 x i8> %vsri_n
+}
+
+
+define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsriq_n_s16
+; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3)
+  ret <8 x i16> %vsri
+}
+
+
+define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsriq_n_s32
+; CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vsri = tail call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3)
+  ret <4 x i32> %vsri
+}
+
+
+define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsriq_n_s64
+; CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vsri = tail call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3)
+  ret <2 x i64> %vsri
+}
+
+define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsri_n_p8
+; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+  ret <8 x i8> %vsri_n
+}
+
+define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsri_n_p16
+; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
+  %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15)
+  ret <4 x i16> %vsri
+}
+
+define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsriq_n_p8
+; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+  ret <16 x i8> %vsri_n
+}
+
+define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsriq_n_p16
+; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
+  %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15)
+  ret <8 x i16> %vsri
+}
+
+define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsli_n_s8
+; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+  ret <8 x i8> %vsli_n
+}
+
+define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsli_n_s16
+; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3)
+  ret <4 x i16> %vsli
+}
+
+define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsli_n_s32
+; CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vsli = tail call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3)
+  ret <2 x i32> %vsli
+}
+
+define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsliq_n_s8
+; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+  ret <16 x i8> %vsli_n
+}
+
+define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsliq_n_s16
+; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3)
+  ret <8 x i16> %vsli
+}
+
+define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsliq_n_s32
+; CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vsli = tail call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3)
+  ret <4 x i32> %vsli
+}
+
+define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsliq_n_s64
+; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vsli = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3)
+  ret <2 x i64> %vsli
+}
+
+define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsli_n_p8
+; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+  ret <8 x i8> %vsli_n
+}
+
+define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsli_n_p16
+; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
+  %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15)
+  ret <4 x i16> %vsli
+}
+
+define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsliq_n_p8
+; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+  ret <16 x i8> %vsli_n
+}
+
+define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsliq_n_p16
+; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
+  %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15)
+  ret <8 x i16> %vsli
+}
+
+define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) {
+; CHECK: test_vqshl_n_s8
+; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vqshl = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  ret <8 x i8> %vqshl
+}
+
+
+define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) {
+; CHECK: test_vqshl_n_s16
+; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
+  ret <4 x i16> %vqshl
+}
+
+
+define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) {
+; CHECK: test_vqshl_n_s32
+; CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> <i32 3, i32 3>)
+  ret <2 x i32> %vqshl
+}
+
+
+define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) {
+; CHECK: test_vqshlq_n_s8
+; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  ret <16 x i8> %vqshl_n
+}
+
+
+define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) {
+; CHECK: test_vqshlq_n_s16
+; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  ret <8 x i16> %vqshl
+}
+
+
+define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) {
+; CHECK: test_vqshlq_n_s32
+; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+  ret <4 x i32> %vqshl
+}
+
+
+define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) {
+; CHECK: test_vqshlq_n_s64
+; CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> <i64 3, i64 3>)
+  ret <2 x i64> %vqshl
+}
+
+
+define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) {
+; CHECK: test_vqshl_n_u8
+; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  ret <8 x i8> %vqshl_n
+}
+
+
+define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) {
+; CHECK: test_vqshl_n_u16
+; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
+  ret <4 x i16> %vqshl
+}
+
+
+define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) {
+; CHECK: test_vqshl_n_u32
+; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> <i32 3, i32 3>)
+  ret <2 x i32> %vqshl
+}
+
+
+define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) {
+; CHECK: test_vqshlq_n_u8
+; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+  ret <16 x i8> %vqshl_n
+}
+
+
+define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) {
+; CHECK: test_vqshlq_n_u16
+; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+  ret <8 x i16> %vqshl
+}
+
+
+define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) {
+; CHECK: test_vqshlq_n_u32
+; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+  ret <4 x i32> %vqshl
+}
+
+
+define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) {
+; CHECK: test_vqshlq_n_u64
+; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> <i64 3, i64 3>)
+  ret <2 x i64> %vqshl
+}
+
+define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) {
+; CHECK: test_vqshlu_n_s8
+; CHECK: sqshlu {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+  %vqshlu = tail call <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8> %a, i32 3)
+  ret <8 x i8> %vqshlu
+}
+
+
+define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) {
+; CHECK: test_vqshlu_n_s16
+; CHECK: sqshlu {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+  %vqshlu = tail call <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16> %a, i32 3)
+  ret <4 x i16> %vqshlu
+}
+
+
+define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) {
+; CHECK: test_vqshlu_n_s32
+; CHECK: sqshlu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+  %vqshlu = tail call <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32> %a, i32 3)
+  ret <2 x i32> %vqshlu
+}
+
+
+define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) {
+; CHECK: test_vqshluq_n_s8
+; CHECK: sqshlu {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+  %vqshlu = tail call <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8> %a, i32 3)
+  ret <16 x i8> %vqshlu
+}
+
+
+define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) {
+; CHECK: test_vqshluq_n_s16
+; CHECK: sqshlu {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+  %vqshlu = tail call <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16> %a, i32 3)
+  ret <8 x i16> %vqshlu
+}
+
+
+define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) {
+; CHECK: test_vqshluq_n_s32
+; CHECK: sqshlu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+  %vqshlu = tail call <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32> %a, i32 3)
+  ret <4 x i32> %vqshlu
+}
+
+
+define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) {
+; CHECK: test_vqshluq_n_s64
+; CHECK: sqshlu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+  %vqshlu = tail call <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64> %a, i32 3)
+  ret <2 x i64> %vqshlu
+}
+
+
+define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) {
+; CHECK: test_vshrn_n_s16
+; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+  %1 = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+  ret <8 x i8> %vshrn_n
+}
+
+define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) {
+; CHECK: test_vshrn_n_s32
+; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+  %1 = ashr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
+  %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+  ret <4 x i16> %vshrn_n
+}
+
+define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) {
+; CHECK: test_vshrn_n_s64
+; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+  %1 = ashr <2 x i64> %a, <i64 19, i64 19>
+  %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
+  ret <2 x i32> %vshrn_n
+}
+
+define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) {
+; CHECK: test_vshrn_n_u16
+; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+  %1 = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+  ret <8 x i8> %vshrn_n
+}
+
+define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) {
+; CHECK: test_vshrn_n_u32
+; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+  %1 = lshr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
+  %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+  ret <4 x i16> %vshrn_n
+}
+
+define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) {
+; CHECK: test_vshrn_n_u64
+; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+  %1 = lshr <2 x i64> %a, <i64 19, i64 19>
+  %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
+  ret <2 x i32> %vshrn_n
+}
+
+define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vshrn_high_n_s16
+; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+  %1 = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+  %2 = bitcast <8 x i8> %a to <1 x i64>
+  %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+  %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  ret <16 x i8> %4
+}
+
+define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vshrn_high_n_s32
+; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+  %1 = ashr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
+  %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+  %2 = bitcast <4 x i16> %a to <1 x i64>
+  %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+  %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %4
+}
+
+define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vshrn_high_n_s64
+; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+  %1 = bitcast <2 x i32> %a to <1 x i64>
+  %2 = ashr <2 x i64> %b, <i64 19, i64 19>
+  %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
+  %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+  %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %4
+}
+
+define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vshrn_high_n_u16
+; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+  %1 = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+  %2 = bitcast <8 x i8> %a to <1 x i64>
+  %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+  %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  ret <16 x i8> %4
+}
+
+define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vshrn_high_n_u32
+; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+  %1 = lshr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
+  %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+  %2 = bitcast <4 x i16> %a to <1 x i64>
+  %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+  %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %4
+}
+
+define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vshrn_high_n_u64
+; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+  %1 = bitcast <2 x i32> %a to <1 x i64>
+  %2 = lshr <2 x i64> %b, <i64 19, i64 19>
+  %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
+  %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+  %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %4
+}
+
+define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) {
+; CHECK: test_vqshrun_n_s16
+; CHECK: sqshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+  %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %a, i32 3)
+  ret <8 x i8> %vqshrun
+}
+
+
+define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) {
+; CHECK: test_vqshrun_n_s32
+; CHECK: sqshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+  %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %a, i32 9)
+  ret <4 x i16> %vqshrun
+}
+
+define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) {
+; CHECK: test_vqshrun_n_s64
+; CHECK: sqshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+  %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %a, i32 19)
+  ret <2 x i32> %vqshrun
+}
+
+define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrun_high_n_s16
+; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+  %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %b, i32 3)
+  %1 = bitcast <8 x i8> %a to <1 x i64>
+  %2 = bitcast <8 x i8> %vqshrun to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrun_high_n_s32
+; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+  %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %b, i32 9)
+  %1 = bitcast <4 x i16> %a to <1 x i64>
+  %2 = bitcast <4 x i16> %vqshrun to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrun_high_n_s64
+; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+  %1 = bitcast <2 x i32> %a to <1 x i64>
+  %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %b, i32 19)
+  %2 = bitcast <2 x i32> %vqshrun to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) {
+; CHECK: test_vrshrn_n_s16
+; CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+  %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %a, i32 3)
+  ret <8 x i8> %vrshrn
+}
+
+
+define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) {
+; CHECK: test_vrshrn_n_s32
+; CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+  %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %a, i32 9)
+  ret <4 x i16> %vrshrn
+}
+
+
+define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) {
+; CHECK: test_vrshrn_n_s64
+; CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+  %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %a, i32 19)
+  ret <2 x i32> %vrshrn
+}
+
+define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vrshrn_high_n_s16
+; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+  %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %b, i32 3)
+  %1 = bitcast <8 x i8> %a to <1 x i64>
+  %2 = bitcast <8 x i8> %vrshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vrshrn_high_n_s32
+; CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+  %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %b, i32 9)
+  %1 = bitcast <4 x i16> %a to <1 x i64>
+  %2 = bitcast <4 x i16> %vrshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vrshrn_high_n_s64
+; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+  %1 = bitcast <2 x i32> %a to <1 x i64>
+  %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %b, i32 19)
+  %2 = bitcast <2 x i32> %vrshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) {
+; CHECK: test_vqrshrun_n_s16
+; CHECK: sqrshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+  %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %a, i32 3)
+  ret <8 x i8> %vqrshrun
+}
+
+define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) {
+; CHECK: test_vqrshrun_n_s32
+; CHECK: sqrshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+  %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %a, i32 9)
+  ret <4 x i16> %vqrshrun
+}
+
+define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) {
+; CHECK: test_vqrshrun_n_s64
+; CHECK: sqrshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+  %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %a, i32 19)
+  ret <2 x i32> %vqrshrun
+}
+
+define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrun_high_n_s16
+; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+  %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %b, i32 3)
+  %1 = bitcast <8 x i8> %a to <1 x i64>
+  %2 = bitcast <8 x i8> %vqrshrun to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrun_high_n_s32
+; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+  %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %b, i32 9)
+  %1 = bitcast <4 x i16> %a to <1 x i64>
+  %2 = bitcast <4 x i16> %vqrshrun to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrun_high_n_s64
+; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+  %1 = bitcast <2 x i32> %a to <1 x i64>
+  %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %b, i32 19)
+  %2 = bitcast <2 x i32> %vqrshrun to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) {
+; CHECK: test_vqshrn_n_s16
+; CHECK: sqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+  %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %a, i32 3)
+  ret <8 x i8> %vqshrn
+}
+
+
+define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) {
+; CHECK: test_vqshrn_n_s32
+; CHECK: sqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+  %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %a, i32 9)
+  ret <4 x i16> %vqshrn
+}
+
+
+define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) {
+; CHECK: test_vqshrn_n_s64
+; CHECK: sqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+  %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %a, i32 19)
+  ret <2 x i32> %vqshrn
+}
+
+
+define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) {
+; CHECK: test_vqshrn_n_u16
+; CHECK: uqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+  %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %a, i32 3)
+  ret <8 x i8> %vqshrn
+}
+
+
+define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) {
+; CHECK: test_vqshrn_n_u32
+; CHECK: uqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+  %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %a, i32 9)
+  ret <4 x i16> %vqshrn
+}
+
+
+define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) {
+; CHECK: test_vqshrn_n_u64
+; CHECK: uqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+  %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %a, i32 19)
+  ret <2 x i32> %vqshrn
+}
+
+
+define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrn_high_n_s16
+; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+  %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %b, i32 3)
+  %1 = bitcast <8 x i8> %a to <1 x i64>
+  %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrn_high_n_s32
+; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+  %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %b, i32 9)
+  %1 = bitcast <4 x i16> %a to <1 x i64>
+  %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrn_high_n_s64
+; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+  %1 = bitcast <2 x i32> %a to <1 x i64>
+  %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %b, i32 19)
+  %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrn_high_n_u16
+; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+  %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %b, i32 3)
+  %1 = bitcast <8 x i8> %a to <1 x i64>
+  %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrn_high_n_u32
+; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+  %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %b, i32 9)
+  %1 = bitcast <4 x i16> %a to <1 x i64>
+  %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrn_high_n_u64
+; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+  %1 = bitcast <2 x i32> %a to <1 x i64>
+  %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %b, i32 19)
+  %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) {
+; CHECK: test_vqrshrn_n_s16
+; CHECK: sqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+  %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %a, i32 3)
+  ret <8 x i8> %vqrshrn
+}
+
+
+define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) {
+; CHECK: test_vqrshrn_n_s32
+; CHECK: sqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+  %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %a, i32 9)
+  ret <4 x i16> %vqrshrn
+}
+
+
+define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) {
+; CHECK: test_vqrshrn_n_s64
+; CHECK: sqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+  %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %a, i32 19)
+  ret <2 x i32> %vqrshrn
+}
+
+
+define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) {
+; CHECK: test_vqrshrn_n_u16
+; CHECK: uqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+  %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %a, i32 3)
+  ret <8 x i8> %vqrshrn
+}
+
+
+define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) {
+; CHECK: test_vqrshrn_n_u32
+; CHECK: uqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+  %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %a, i32 9)
+  ret <4 x i16> %vqrshrn
+}
+
+
+define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) {
+; CHECK: test_vqrshrn_n_u64
+; CHECK: uqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+  %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %a, i32 19)
+  ret <2 x i32> %vqrshrn
+}
+
+
+define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrn_high_n_s16
+; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+  %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %b, i32 3)
+  %1 = bitcast <8 x i8> %a to <1 x i64>
+  %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrn_high_n_s32
+; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+  %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %b, i32 9)
+  %1 = bitcast <4 x i16> %a to <1 x i64>
+  %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrn_high_n_s64
+; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+  %1 = bitcast <2 x i32> %a to <1 x i64>
+  %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %b, i32 19)
+  %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrn_high_n_u16
+; CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+  %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %b, i32 3)
+  %1 = bitcast <8 x i8> %a to <1 x i64>
+  %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+  ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrn_high_n_u32
+; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+  %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %b, i32 9)
+  %1 = bitcast <4 x i16> %a to <1 x i64>
+  %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+  ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrn_high_n_u64
+; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+  %1 = bitcast <2 x i32> %a to <1 x i64>
+  %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %b, i32 19)
+  %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
+  %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+  ret <4 x i32> %3
+}
+
+define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) {
+; CHECK: test_vcvt_n_f32_s32
+; CHECK: scvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+  %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 31)
+  ret <2 x float> %vcvt
+}
+
+define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) {
+; CHECK: test_vcvtq_n_f32_s32
+; CHECK: scvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+  %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 31)
+  ret <4 x float> %vcvt
+}
+
+define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) {
+; CHECK: test_vcvtq_n_f64_s64
+; CHECK: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
+  %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 50)
+  ret <2 x double> %vcvt
+}
+
+define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) {
+; CHECK: test_vcvt_n_f32_u32
+; CHECK: ucvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+  %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 31)
+  ret <2 x float> %vcvt
+}
+
+define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) {
+; CHECK: test_vcvtq_n_f32_u32
+; CHECK: ucvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+  %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 31)
+  ret <4 x float> %vcvt
+}
+
+define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) {
+; CHECK: test_vcvtq_n_f64_u64
+; CHECK: ucvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
+  %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 50)
+  ret <2 x double> %vcvt
+}
+
+define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) {
+; CHECK: test_vcvt_n_s32_f32
+; CHECK: fcvtzs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+  %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %a, i32 31)
+  ret <2 x i32> %vcvt
+}
+
+define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) {
+; CHECK: test_vcvtq_n_s32_f32
+; CHECK: fcvtzs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+  %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %a, i32 31)
+  ret <4 x i32> %vcvt
+}
+
+define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) {
+; CHECK: test_vcvtq_n_s64_f64
+; CHECK: fcvtzs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
+  %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %a, i32 50)
+  ret <2 x i64> %vcvt
+}
+
+define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) {
+; CHECK: test_vcvt_n_u32_f32
+; CHECK: fcvtzu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+  %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %a, i32 31)
+  ret <2 x i32> %vcvt
+}
+
+define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) {
+; CHECK: test_vcvt_n_u32_f32
+; CHECK: fcvtzu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+  %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %a, i32 31)
+  ret <4 x i32> %vcvt
+}
+
+define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) {
+; CHECK: test_vcvtq_n_u64_f64
+; CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
+  %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %a, i32 50)
+  ret <2 x i64> %vcvt
+}
+
+declare <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32>, i32)
+
+declare <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8>, i32)
+
+declare <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32>, i32)
+
+declare <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8>, i32)
+
+declare <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8>, <8 x i8>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16>, <4 x i16>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32>, <2 x i32>, i32)
+
+declare <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8>, <16 x i8>, i32)
+
+declare <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16>, <8 x i16>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32>, <4 x i32>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64>, <2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32)
+
+declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32)
+
+declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32>, i32)
+
+declare <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8>, i32)
+
+declare <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
+
+declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
+
+declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
+
+declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
+
+declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
+
+declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
+
+declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
+
+declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
+
+declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) 
+
+declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) 
+
+declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) 
+
+declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
+
+declare <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64>, i32)
+
+declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32)
+
+declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32)
+
+declare <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32)
+
+declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32)
+
+declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32)
+
+declare <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32)
+
+declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32)
+
+declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32)
+
+declare <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32)
+
+declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32)
+
+declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32)
+
+declare <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32)
+
+define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_n_s64_f64
+; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64
+  %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64)
+  ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_n_u64_f64
+; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64
+  %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64)
+  ret <1 x i64> %1
+}
+
+define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_n_f64_s64
+; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
+  %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
+  ret <1 x double> %1
+}
+
+define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_n_f64_u64
+; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
+  %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
+  ret <1 x double> %1
+}
+
+declare <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32)
+declare <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32)
+declare <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32)
+declare <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32)
+\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-simd-tbl.ll b/test/CodeGen/AArch64/neon-simd-tbl.ll
new file mode 100644
index 0000000..8eac1e8
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-tbl.ll
@@ -0,0 +1,828 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8>, <8 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8>, <16 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtbl1_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl11.i
+}
+
+define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vqtbl1_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+  ret <8 x i8> %vtbl1.i
+}
+
+define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl2_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
+  %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl17.i
+}
+
+define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl2_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+  %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl2.i
+}
+
+define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl3_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
+  %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl212.i
+}
+
+define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl3_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+  %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl3.i
+}
+
+define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl4_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
+  %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl216.i
+}
+
+define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl4_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+  %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl4.i
+}
+
+define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vqtbl1q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %vtbl1.i
+}
+
+define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl2q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+  %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl2.i
+}
+
+define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl3q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+  %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl3.i
+}
+
+define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl4q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+  %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl4.i
+}
+
+define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vtbx1_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+  %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+  %1 = sext <8 x i1> %0 to <8 x i8>
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx2_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
+  %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx17.i
+}
+
+define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx3_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
+  %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+  %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+  %1 = sext <8 x i1> %0 to <8 x i8>
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx4_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
+  %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx216.i
+}
+
+define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vqtbx1_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+  ret <8 x i8> %vtbx1.i
+}
+
+define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx2_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx2.i
+}
+
+define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx3_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx3.i
+}
+
+define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx4_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx4.i
+}
+
+define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vqtbx1q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+  ret <16 x i8> %vtbx1.i
+}
+
+define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx2q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx2.i
+}
+
+define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx3q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx3.i
+}
+
+define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx4q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx4.i
+}
+
+define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtbl1_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl11.i
+}
+
+define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vqtbl1_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+  ret <8 x i8> %vtbl1.i
+}
+
+define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl2_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
+  %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl17.i
+}
+
+define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl2_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+  %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl2.i
+}
+
+define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl3_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
+  %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl212.i
+}
+
+define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl3_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+  %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl3.i
+}
+
+define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl4_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
+  %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl216.i
+}
+
+define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl4_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+  %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl4.i
+}
+
+define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vqtbl1q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %vtbl1.i
+}
+
+define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl2q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+  %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl2.i
+}
+
+define <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl3q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+  %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl3.i
+}
+
+define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl4q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+  %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl4.i
+}
+
+define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vtbx1_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+  %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+  %1 = sext <8 x i1> %0 to <8 x i8>
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx2_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
+  %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx17.i
+}
+
+define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx3_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
+  %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+  %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+  %1 = sext <8 x i1> %0 to <8 x i8>
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx4_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
+  %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx216.i
+}
+
+define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vqtbx1_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+  ret <8 x i8> %vtbx1.i
+}
+
+define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx2_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx2.i
+}
+
+define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx3_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx3.i
+}
+
+define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx4_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx4.i
+}
+
+define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vqtbx1q_u8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+  ret <16 x i8> %vtbx1.i
+}
+
+define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx2q_u8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx2.i
+}
+
+define <16 x i8> @test_vqtbx3q_u8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx3q_u8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx3.i
+}
+
+define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx4q_u8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx4.i
+}
+
+define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtbl1_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl11.i
+}
+
+define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vqtbl1_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+  ret <8 x i8> %vtbl1.i
+}
+
+define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl2_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
+  %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl17.i
+}
+
+define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl2_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+  %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl2.i
+}
+
+define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl3_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
+  %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl212.i
+}
+
+define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl3_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+  %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl3.i
+}
+
+define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl4_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
+  %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl216.i
+}
+
+define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl4_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+  %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+  ret <8 x i8> %vtbl4.i
+}
+
+define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vqtbl1q_p8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %vtbl1.i
+}
+
+define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl2q_p8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+  %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl2.i
+}
+
+define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl3q_p8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+  %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl3.i
+}
+
+define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl4q_p8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+  %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+  %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+  %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+  %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+  ret <16 x i8> %vtbl4.i
+}
+
+define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vtbx1_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+  %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+  %1 = sext <8 x i1> %0 to <8 x i8>
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx2_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
+  %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx17.i
+}
+
+define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx3_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
+  %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+  %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+  %1 = sext <8 x i1> %0 to <8 x i8>
+  %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
+  ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx4_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
+  %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx216.i
+}
+
+define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vqtbx1_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+  ret <8 x i8> %vtbx1.i
+}
+
+define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx2_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx2.i
+}
+
+define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx3_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx3.i
+}
+
+define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx4_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+  ret <8 x i8> %vtbx4.i
+}
+
+define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vqtbx1q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+  ret <16 x i8> %vtbx1.i
+}
+
+define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx2q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+  %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx2.i
+}
+
+define <16 x i8> @test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx3q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+  %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx3.i
+}
+
+define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx4q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+  %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+  %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+  %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+  %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+  %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+  ret <16 x i8> %vtbx4.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-simd-vget.ll b/test/CodeGen/AArch64/neon-simd-vget.ll
new file mode 100644
index 0000000..6474499
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-vget.ll
@@ -0,0 +1,225 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @test_vget_high_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_high_s8:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_s16:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_high_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_high_s32:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_high_s64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_high_s64:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+  ret <1 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vget_high_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_high_u8:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_u16:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_high_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_high_u32:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_high_u64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_high_u64:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+  ret <1 x i64> %shuffle.i
+}
+
+define <1 x i64> @test_vget_high_p64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_high_p64:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+  ret <1 x i64> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_f16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_f16:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x float> @test_vget_high_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vget_high_f32:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  ret <2 x float> %shuffle.i
+}
+
+define <8 x i8> @test_vget_high_p8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_high_p8:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_p16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_p16:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i16> %shuffle.i
+}
+
+define <1 x double> @test_vget_high_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vget_high_f64:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+  %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> <i32 1>
+  ret <1 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vget_low_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_low_s8:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_s16:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_low_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_low_s32:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_low_s64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_low_s64:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+  ret <1 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vget_low_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_low_u8:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_u16:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_low_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_low_u32:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_low_u64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_low_u64:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+  ret <1 x i64> %shuffle.i
+}
+
+define <1 x i64> @test_vget_low_p64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_low_p64:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+  ret <1 x i64> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_f16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_f16:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x float> @test_vget_low_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vget_low_f32:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuffle.i
+}
+
+define <8 x i8> @test_vget_low_p8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_low_p8:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_p16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_p16:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle.i
+}
+
+define <1 x double> @test_vget_low_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vget_low_f64:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> zeroinitializer
+  ret <1 x double> %shuffle.i
+}
diff --git a/test/CodeGen/AArch64/regress-fp128-livein.ll b/test/CodeGen/AArch64/regress-fp128-livein.ll
new file mode 100644
index 0000000..cb8432a
--- /dev/null
+++ b/test/CodeGen/AArch64/regress-fp128-livein.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s
+
+; Regression test for NZCV reg live-in not being added to fp128csel IfTrue BB,
+; causing a crash during live range calc.
+define void @fp128_livein(i64 %a) {
+  %tobool = icmp ne i64 %a, 0
+  %conv = zext i1 %tobool to i32
+  %conv2 = sitofp i32 %conv to fp128
+  %conv6 = sitofp i32 %conv to double
+  %call3 = tail call i32 @g(fp128 %conv2)
+  %call8 = tail call i32 @h(double %conv6)
+  ret void
+}
+
+declare i32 @f()
+declare i32 @g(fp128)
+declare i32 @h(double)
diff --git a/test/CodeGen/AArch64/returnaddr.ll b/test/CodeGen/AArch64/returnaddr.ll
new file mode 100644
index 0000000..c85f9ec
--- /dev/null
+++ b/test/CodeGen/AArch64/returnaddr.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu  | FileCheck %s
+
+define i8* @rt0(i32 %x) nounwind readnone {
+entry:
+; CHECK-LABEL: rt0:
+; CHECK: mov x0, x30
+  %0 = tail call i8* @llvm.returnaddress(i32 0)
+  ret i8* %0
+}
+
+define i8* @rt2() nounwind readnone {
+entry:
+; CHECK-LABEL: rt2:
+; CHECK: ldr x[[reg:[0-9]+]], [x29]
+; CHECK: ldr x[[reg]], [x[[reg]]]
+; CHECK: ldr x0, [x[[reg]], #8]
+  %0 = tail call i8* @llvm.returnaddress(i32 2)
+  ret i8* %0
+}
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/tls-dynamics.ll b/test/CodeGen/AArch64/tls-dynamics.ll
index 887d2f8..68c481c 100644
--- a/test/CodeGen/AArch64/tls-dynamics.ll
+++ b/test/CodeGen/AArch64/tls-dynamics.ll
@@ -10,8 +10,8 @@ define i32 @test_generaldynamic() {
   ret i32 %val
 
 ; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
+; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
+; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
 ; CHECK: .tlsdesccall general_dynamic_var
 ; CHECK-NEXT: blr [[CALLEE]]
 
@@ -19,8 +19,8 @@ define i32 @test_generaldynamic() {
 ; CHECK: ldr w0, [x[[TP]], x0]
 
 ; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
 
 }
@@ -31,8 +31,8 @@ define i32* @test_generaldynamic_addr() {
   ret i32* @general_dynamic_var
 
 ; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
+; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
+; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
 ; CHECK: .tlsdesccall general_dynamic_var
 ; CHECK-NEXT: blr [[CALLEE]]
 
@@ -40,8 +40,8 @@ define i32* @test_generaldynamic_addr() {
 ; CHECK: add x0, [[TP]], x0
 
 ; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
 
 }
@@ -55,8 +55,8 @@ define i32 @test_localdynamic() {
   ret i32 %val
 
 ; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
 ; CHECK: .tlsdesccall _TLS_MODULE_BASE_
 ; CHECK-NEXT: blr [[CALLEE]]
 
@@ -66,8 +66,8 @@ define i32 @test_localdynamic() {
 ; CHECK: ldr w0, [x0, [[DTP_OFFSET]]]
 
 ; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
 
 }
@@ -78,8 +78,8 @@ define i32* @test_localdynamic_addr() {
   ret i32* @local_dynamic_var
 
 ; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
 ; CHECK: .tlsdesccall _TLS_MODULE_BASE_
 ; CHECK-NEXT: blr [[CALLEE]]
 
@@ -89,8 +89,8 @@ define i32* @test_localdynamic_addr() {
 ; CHECK: add x0, x0, [[DTP_OFFSET]]
 
 ; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
 ; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
 
 }
@@ -110,8 +110,8 @@ define i32 @test_localdynamic_deduplicate() {
   ret i32 %sum
 
 ; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
 ; CHECK: .tlsdesccall _TLS_MODULE_BASE_
 ; CHECK-NEXT: blr [[CALLEE]]
 
diff --git a/test/CodeGen/AArch64/variadic.ll b/test/CodeGen/AArch64/variadic.ll
index 1c7e3bf..f3d376b 100644
--- a/test/CodeGen/AArch64/variadic.ll
+++ b/test/CodeGen/AArch64/variadic.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 < %s | FileCheck --check-prefix=CHECK-NOFP %s
 
 %va_list = type {i8*, i8*, i8*, i32, i32}
 
@@ -9,19 +10,28 @@ declare void @llvm.va_start(i8*)
 define void @test_simple(i32 %n, ...) {
 ; CHECK-LABEL: test_simple:
 ; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
 ; CHECK: mov x[[FPRBASE:[0-9]+]], sp
 ; CHECK: str q7, [x[[FPRBASE]], #112]
 ; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
 ; CHECK: str x7, [x[[GPRBASE]], #48]
 
+; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; CHECK-NOFP: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
+; CHECK-NOFP: str x7, [x[[GPRBASE]], #48]
+; CHECK-NOFP-NOT: str q7,
+; CHECK-NOFP: str x1, [sp, #[[GPRFROMSP]]]
+
 ; Omit the middle ones
 
 ; CHECK: str q0, [sp]
 ; CHECK: str x1, [sp, #[[GPRFROMSP]]]
 
+; CHECK-NOFP-NOT: str q0, [sp]
+
   %addr = bitcast %va_list* @var to i8*
   call void @llvm.va_start(i8* %addr)
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
 ; CHECK: movn [[VR_OFFS:w[0-9]+]], #127
 ; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
 ; CHECK: movn [[GR_OFFS:w[0-9]+]], #55
@@ -33,6 +43,14 @@ define void @test_simple(i32 %n, ...) {
 ; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
 ; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
 
+; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
+; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #55
+; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24]
+; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #56
+; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8]
+; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
+; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+
   ret void
 }
 
@@ -44,11 +62,19 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
 ; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
 ; CHECK: str x7, [x[[GPRBASE]], #32]
 
+; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK-NOFP-NOT: str q7,
+; CHECK-NOFP: mov x[[GPRBASE:[0-9]+]], sp
+; CHECK-NOFP: str x7, [x[[GPRBASE]], #24]
+
 ; Omit the middle ones
 
 ; CHECK: str q1, [sp]
 ; CHECK: str x3, [sp, #[[GPRFROMSP]]]
 
+; CHECK-NOFP-NOT: str q1, [sp]
+; CHECK-NOFP: str x4, [sp]
+
   %addr = bitcast %va_list* @var to i8*
   call void @llvm.va_start(i8* %addr)
 ; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
@@ -63,6 +89,15 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
 ; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
 ; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
 
+; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
+; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #31
+; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24]
+; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #32
+; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8]
+; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
+; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+
   ret void
 }
 
@@ -75,6 +110,9 @@ define void @test_nospare([8 x i64], [8 x float], ...) {
 ; CHECK: mov [[STACK:x[0-9]+]], sp
 ; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
 
+; CHECK-NOFP-NOT: sub sp, sp
+; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #64
+; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
   ret void
 }
 
@@ -87,6 +125,10 @@ define void @test_offsetstack([10 x i64], [3 x float], ...) {
 ; CHECK: str q7, [x[[FPRBASE]], #64]
 
 ; CHECK-NOT: str x{{[0-9]+}},
+
+; CHECK-NOFP-NOT: str q7,
+; CHECK-NOT: str x7,
+
 ; Omit the middle ones
 
 ; CHECK: str q3, [sp]
@@ -102,6 +144,11 @@ define void @test_offsetstack([10 x i64], [3 x float], ...) {
 ; CHECK: add [[STACK:x[0-9]+]], sp, #96
 ; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
 
+; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #40
+; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
+; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24]
   ret void
 }
 
@@ -110,12 +157,14 @@ declare void @llvm.va_end(i8*)
 define void @test_va_end() nounwind {
 ; CHECK-LABEL: test_va_end:
 ; CHECK-NEXT: BB#0
+; CHECK-NOFP: BB#0
 
   %addr = bitcast %va_list* @var to i8*
   call void @llvm.va_end(i8* %addr)
 
   ret void
 ; CHECK-NEXT: ret
+; CHECK-NOFP-NEXT: ret
 }
 
 declare void @llvm.va_copy(i8* %dest, i8* %src)
@@ -131,14 +180,25 @@ define void @test_va_copy() {
 ; Check beginning and end again:
 
 ; CHECK: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; CHECK-NOFP: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK-NOFP: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+
 ; CHECK: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list]
 
+; CHECK: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24]
 ; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
-; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
 
-; CHECK: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24]
 ; CHECK: str [[BLOCK]], [x[[DEST_LIST]], #24]
 
+; CHECK-NOFP: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list]
+
+; CHECK-NOFP: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24]
+; CHECK-NOFP: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
+
+; CHECK-NOFP: str [[BLOCK]], [x[[DEST_LIST]], #24]
+
   ret void
 ; CHECK: ret
+; CHECK-NOFP: ret
 }
diff --git a/test/CodeGen/ARM/2009-10-16-Scope.ll b/test/CodeGen/ARM/2009-10-16-Scope.ll
index dd08b56..570fcf9 100644
--- a/test/CodeGen/ARM/2009-10-16-Scope.ll
+++ b/test/CodeGen/ARM/2009-10-16-Scope.ll
@@ -24,8 +24,7 @@ declare i32 @foo(i32) ssp
 
 !0 = metadata !{i32 5, i32 2, metadata !1, null}
 !1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true,
-                i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0}; [DW_TAG_subprogram ]
+!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0}; [DW_TAG_subprogram ]
 !3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
 !4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ]
 !5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
diff --git a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
index 89f468a..35739d7 100644
--- a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
+++ b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
@@ -13,12 +13,13 @@ entry:
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!15}
 !0 = metadata !{i32 524545, metadata !1, metadata !"b", metadata !2, i32 93, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 524334, metadata !12, null, metadata !"__addvsi3", metadata !"__addvsi3", metadata !"__addvsi3", i32 94, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 524329, metadata !12} ; [ DW_TAG_file_type ]
 !12 = metadata !{metadata !"libgcc2.c", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc"}
 !3 = metadata !{i32 524305, metadata !12, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", i1 true, metadata !"", i32 0, metadata !13, metadata !13, metadata !14, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !12, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 524309, metadata !12, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !6, metadata !6}
 !6 = metadata !{i32 524310, metadata !12, null, metadata !"SItype", i32 152, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ]
 !7 = metadata !{i32 524329, metadata !"libgcc2.h", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc", metadata !3} ; [ DW_TAG_file_type ]
@@ -28,3 +29,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !11 = metadata !{i32 100, i32 0, metadata !10, null}
 !13 = metadata !{i32 0}
 !14 = metadata !{metadata !1}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
index f4ad4bc..7aacd1a 100644
--- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
@@ -75,9 +75,10 @@ return:                                           ; preds = %entry
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!49}
 
 !0 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786451, metadata !48, null, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_structure_type ]
+!1 = metadata !{i32 786451, metadata !48, null, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
 !2 = metadata !{i32 786473, metadata !48} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !48, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !47, metadata !47, metadata !46, metadata !47,  metadata !47, metadata !""} ; [ DW_TAG_compile_unit ]
 !4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
@@ -86,18 +87,18 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !7 = metadata !{i32 786445, metadata !48, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ]
 !8 = metadata !{i32 786468, metadata !48, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
 !9 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!10 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{null, metadata !12, metadata !13}
 !12 = metadata !{i32 786447, metadata !48, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
 !13 = metadata !{i32 786468, metadata !48, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!14 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !12}
 !16 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !17 = metadata !{i32 786478, metadata !48, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!18 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!18 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{metadata !13, metadata !13, metadata !1}
 !20 = metadata !{i32 786478, metadata !48, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!21 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!21 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !22 = metadata !{metadata !13}
 !23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
 !24 = metadata !{i32 16, i32 0, metadata !17, null}
@@ -125,3 +126,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !46 = metadata !{metadata !0, metadata !9, metadata !16, metadata !17, metadata !20}
 !47 = metadata !{i32 0}
 !48 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll b/test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll
index e6d1518..3053694 100644
--- a/test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll
+++ b/test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll
@@ -1,10 +1,79 @@
+; This tests that MC/asm header conversion is smooth and that the
+; build attributes are correct
+
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi | FileCheck %s --check-prefix=V6
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi | FileCheck %s --check-prefix=V6M
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s | FileCheck %s --check-prefix=ARM1156T2F-S
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi | FileCheck %s --check-prefix=V7M
 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=V7
 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8
 ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi | FileCheck %s --check-prefix=Vt8
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=+v8fp | FileCheck %s --check-prefix=V8-V8FP
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=+neon | FileCheck %s --check-prefix=V8-NEON
-; This tests that MC/asm header conversion is smooth
-;
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-neon,-crypto | FileCheck %s --check-prefix=V8-FPARMv8
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-fp-armv8,-crypto | FileCheck %s --check-prefix=V8-NEON
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-crypto | FileCheck %s --check-prefix=V8-FPARMv8-NEON
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8-FPARMv8-NEON-CRYPTO
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A9-HARD
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9-mp | FileCheck %s --check-prefix=CORTEX-A9-MP
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 | FileCheck %s --check-prefix=CORTEX-A15
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=CORTEX-M0
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-M4-SOFT
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-M4-HARD
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=CORTEX-A57
+
+; V6:   .eabi_attribute 6, 6
+; V6:   .eabi_attribute 8, 1
+; V6:   .eabi_attribute 24, 1
+; V6:   .eabi_attribute 25, 1
+; V6-NOT:   .eabi_attribute 27
+; V6-NOT:   .eabi_attribute 28
+; V6-NOT:    .eabi_attribute 36
+; V6-NOT:    .eabi_attribute 42
+; V6-NOT:    .eabi_attribute 68
+
+; V6M:  .eabi_attribute 6, 12
+; V6M:  .eabi_attribute 7, 77
+; V6M:  .eabi_attribute 8, 0
+; V6M:  .eabi_attribute 9, 1
+; V6M:  .eabi_attribute 24, 1
+; V6M:  .eabi_attribute 25, 1
+; V6M-NOT:  .eabi_attribute 27
+; V6M-NOT:  .eabi_attribute 28
+; V6M-NOT:  .eabi_attribute 36
+; V6M-NOT:  .eabi_attribute 42
+; V6M-NOT:  .eabi_attribute 68
+
+; ARM1156T2F-S: .cpu arm1156t2f-s
+; ARM1156T2F-S: .eabi_attribute 6, 8
+; ARM1156T2F-S: .eabi_attribute 8, 1
+; ARM1156T2F-S: .eabi_attribute 9, 2
+; ARM1156T2F-S: .fpu vfpv2
+; ARM1156T2F-S: .eabi_attribute 20, 1
+; ARM1156T2F-S: .eabi_attribute 21, 1
+; ARM1156T2F-S: .eabi_attribute 23, 3
+; ARM1156T2F-S: .eabi_attribute 24, 1
+; ARM1156T2F-S: .eabi_attribute 25, 1
+; ARM1156T2F-S-NOT: .eabi_attribute 27
+; ARM1156T2F-S-NOT: .eabi_attribute 28
+; ARM1156T2F-S-NOT: .eabi_attribute 36
+; ARM1156T2F-S-NOT:    .eabi_attribute 42
+; ARM1156T2F-S-NOT:    .eabi_attribute 68
+
+; V7M:  .eabi_attribute 6, 10
+; V7M:  .eabi_attribute 7, 77
+; V7M:  .eabi_attribute 8, 0
+; V7M:  .eabi_attribute 9, 2
+; V7M:  .eabi_attribute 24, 1
+; V7M:  .eabi_attribute 25, 1
+; V7M-NOT:  .eabi_attribute 27
+; V7M-NOT:  .eabi_attribute 28
+; V7M-NOT:  .eabi_attribute 36
+; V7M-NOT:  .eabi_attribute 42
+; V7M:  .eabi_attribute 44, 0
+; V7M-NOT:  .eabi_attribute 68
+
 ; V7:      .syntax unified
 ; V7: .eabi_attribute 6, 10
 ; V7: .eabi_attribute 20, 1
@@ -12,6 +81,11 @@
 ; V7: .eabi_attribute 23, 3
 ; V7: .eabi_attribute 24, 1
 ; V7: .eabi_attribute 25, 1
+; V7-NOT: .eabi_attribute 27
+; V7-NOT: .eabi_attribute 28
+; V7-NOT: .eabi_attribute 36
+; V7-NOT:    .eabi_attribute 42
+; V7-NOT:    .eabi_attribute 68
 
 ; V8:      .syntax unified
 ; V8: .eabi_attribute 6, 14
@@ -19,14 +93,193 @@
 ; Vt8:     .syntax unified
 ; Vt8: .eabi_attribute 6, 14
 
-; V8-V8FP:      .syntax unified
-; V8-V8FP: .eabi_attribute 6, 14
-; V8-V8FP: .eabi_attribute 10, 7
+; V8-FPARMv8:      .syntax unified
+; V8-FPARMv8: .eabi_attribute 6, 14
+; V8-FPARMv8: .fpu fp-armv8
 
 ; V8-NEON:      .syntax unified
 ; V8-NEON: .eabi_attribute 6, 14
+; V8-NEON: .fpu neon
 ; V8-NEON: .eabi_attribute 12, 3
 
+; V8-FPARMv8-NEON:      .syntax unified
+; V8-FPARMv8-NEON: .eabi_attribute 6, 14
+; V8-FPARMv8-NEON: .fpu neon-fp-armv8
+; V8-FPARMv8-NEON: .eabi_attribute 12, 3
+
+; V8-FPARMv8-NEON-CRYPTO:      .syntax unified
+; V8-FPARMv8-NEON-CRYPTO: .eabi_attribute 6, 14
+; V8-FPARMv8-NEON-CRYPTO: .fpu crypto-neon-fp-armv8
+; V8-FPARMv8-NEON-CRYPTO: .eabi_attribute 12, 3
+
+; CORTEX-A9-SOFT:  .cpu cortex-a9
+; CORTEX-A9-SOFT:  .eabi_attribute 6, 10
+; CORTEX-A9-SOFT:  .eabi_attribute 7, 65
+; CORTEX-A9-SOFT:  .eabi_attribute 8, 1
+; CORTEX-A9-SOFT:  .eabi_attribute 9, 2
+; CORTEX-A9-SOFT:  .fpu neon
+; CORTEX-A9-SOFT:  .eabi_attribute 20, 1
+; CORTEX-A9-SOFT:  .eabi_attribute 21, 1
+; CORTEX-A9-SOFT:  .eabi_attribute 23, 3
+; CORTEX-A9-SOFT:  .eabi_attribute 24, 1
+; CORTEX-A9-SOFT:  .eabi_attribute 25, 1
+; CORTEX-A9-SOFT-NOT:  .eabi_attribute 27
+; CORTEX-A9-SOFT-NOT:  .eabi_attribute 28
+; CORTEX-A9-SOFT:  .eabi_attribute 36, 1
+; CORTEX-A9-SOFT-NOT:  .eabi_attribute 42
+; CORTEX-A9-SOFT:  .eabi_attribute 68, 1
+
+; CORTEX-A9-HARD:  .cpu cortex-a9
+; CORTEX-A9-HARD:  .eabi_attribute 6, 10
+; CORTEX-A9-HARD:  .eabi_attribute 7, 65
+; CORTEX-A9-HARD:  .eabi_attribute 8, 1
+; CORTEX-A9-HARD:  .eabi_attribute 9, 2
+; CORTEX-A9-HARD:  .fpu neon
+; CORTEX-A9-HARD:  .eabi_attribute 20, 1
+; CORTEX-A9-HARD:  .eabi_attribute 21, 1
+; CORTEX-A9-HARD:  .eabi_attribute 23, 3
+; CORTEX-A9-HARD:  .eabi_attribute 24, 1
+; CORTEX-A9-HARD:  .eabi_attribute 25, 1
+; CORTEX-A9-HARD-NOT:  .eabi_attribute 27
+; CORTEX-A9-HARD:  .eabi_attribute 28, 1
+; CORTEX-A9-HARD:  .eabi_attribute 36, 1
+; CORTEX-A9-HARD-NOT:  .eabi_attribute 42
+; CORTEX-A9-HARD:  .eabi_attribute 68, 1
+
+; CORTEX-A9-MP:  .cpu cortex-a9-mp
+; CORTEX-A9-MP:  .eabi_attribute 6, 10
+; CORTEX-A9-MP:  .eabi_attribute 7, 65
+; CORTEX-A9-MP:  .eabi_attribute 8, 1
+; CORTEX-A9-MP:  .eabi_attribute 9, 2
+; CORTEX-A9-MP:  .fpu neon
+; CORTEX-A9-MP:  .eabi_attribute 20, 1
+; CORTEX-A9-MP:  .eabi_attribute 21, 1
+; CORTEX-A9-MP:  .eabi_attribute 23, 3
+; CORTEX-A9-MP:  .eabi_attribute 24, 1
+; CORTEX-A9-MP:  .eabi_attribute 25, 1
+; CORTEX-A9-NOT:  .eabi_attribute 27
+; CORTEX-A9-NOT:  .eabi_attribute 28
+; CORTEX-A9-MP:  .eabi_attribute 36, 1
+; CORTEX-A9-MP:  .eabi_attribute 42, 1
+; CORTEX-A9-MP:  .eabi_attribute 68, 1
+
+; CORTEX-A15: .cpu cortex-a15
+; CORTEX-A15: .eabi_attribute 6, 10
+; CORTEX-A15: .eabi_attribute 7, 65
+; CORTEX-A15: .eabi_attribute 8, 1
+; CORTEX-A15: .eabi_attribute 9, 2
+; CORTEX-A15: .fpu neon-vfpv4
+; CORTEX-A15: .eabi_attribute 20, 1
+; CORTEX-A15: .eabi_attribute 21, 1
+; CORTEX-A15: .eabi_attribute 23, 3
+; CORTEX-A15: .eabi_attribute 24, 1
+; CORTEX-A15: .eabi_attribute 25, 1
+; CORTEX-A15-NOT: .eabi_attribute 27
+; CORTEX-A15-NOT: .eabi_attribute 28
+; CORTEX-A15: .eabi_attribute 36, 1
+; CORTEX-A15: .eabi_attribute 42, 1
+; CORTEX-A15: .eabi_attribute 44, 2
+; CORTEX-A15: .eabi_attribute 68, 3
+
+; CORTEX-M0:  .cpu cortex-m0
+; CORTEX-M0:  .eabi_attribute 6, 12
+; CORTEX-M0:  .eabi_attribute 7, 77
+; CORTEX-M0:  .eabi_attribute 8, 0
+; CORTEX-M0:  .eabi_attribute 9, 1
+; CORTEX-M0:  .eabi_attribute 24, 1
+; CORTEX-M0:  .eabi_attribute 25, 1
+; CORTEX-M0-NOT:  .eabi_attribute 27
+; CORTEX-M0-NOT:  .eabi_attribute 28
+; CORTEX-M0-NOT:  .eabi_attribute 36
+; CORTEX-M0-NOT:  .eabi_attribute 42
+; CORTEX-M0-NOT:  .eabi_attribute 68
+
+; CORTEX-M4-SOFT:  .cpu cortex-m4
+; CORTEX-M4-SOFT:  .eabi_attribute 6, 13
+; CORTEX-M4-SOFT:  .eabi_attribute 7, 77
+; CORTEX-M4-SOFT:  .eabi_attribute 8, 0
+; CORTEX-M4-SOFT:  .eabi_attribute 9, 2
+; CORTEX-M4-SOFT:  .fpu vfpv4-d16
+; CORTEX-M4-SOFT:  .eabi_attribute 20, 1
+; CORTEX-M4-SOFT:  .eabi_attribute 21, 1
+; CORTEX-M4-SOFT:  .eabi_attribute 23, 3
+; CORTEX-M4-SOFT:  .eabi_attribute 24, 1
+; CORTEX-M4-SOFT:  .eabi_attribute 25, 1
+; CORTEX-M4-SOFT:  .eabi_attribute 27, 1
+; CORTEX-M4-SOFT-NOT:  .eabi_attribute 28
+; CORTEX-M4-SOFT:  .eabi_attribute 36, 1
+; CORTEX-M4-SOFT-NOT:  .eabi_attribute 42
+; CORTEX-M4-SOFT:  .eabi_attribute 44, 0
+; CORTEX-M4-SOFT-NOT:  .eabi_attribute 68
+
+; CORTEX-M4-HARD:  .cpu cortex-m4
+; CORTEX-M4-HARD:  .eabi_attribute 6, 13
+; CORTEX-M4-HARD:  .eabi_attribute 7, 77
+; CORTEX-M4-HARD:  .eabi_attribute 8, 0
+; CORTEX-M4-HARD:  .eabi_attribute 9, 2
+; CORTEX-M4-HARD:  .fpu vfpv4-d16
+; CORTEX-M4-HARD:  .eabi_attribute 20, 1
+; CORTEX-M4-HARD:  .eabi_attribute 21, 1
+; CORTEX-M4-HARD:  .eabi_attribute 23, 3
+; CORTEX-M4-HARD:  .eabi_attribute 24, 1
+; CORTEX-M4-HARD:  .eabi_attribute 25, 1
+; CORTEX-M4-HARD:  .eabi_attribute 27, 1
+; CORTEX-M4-HARD:  .eabi_attribute 28, 1
+; CORTEX-M4-HARD:  .eabi_attribute 36, 1
+; CORTEX-M4-HARD-NOT:  .eabi_attribute 42
+; CORTEX-M4-HARD:  .eabi_attribute 44, 0
+; CORTEX-M4-HRAD-NOT:  .eabi_attribute 68
+
+; CORTEX-R5:  .cpu cortex-r5
+; CORTEX-R5:  .eabi_attribute 6, 10
+; CORTEX-R5:  .eabi_attribute 7, 82
+; CORTEX-R5:  .eabi_attribute 8, 1
+; CORTEX-R5:  .eabi_attribute 9, 2
+; CORTEX-R5:  .fpu vfpv3-d16
+; CORTEX-R5:  .eabi_attribute 20, 1
+; CORTEX-R5:  .eabi_attribute 21, 1
+; CORTEX-R5:  .eabi_attribute 23, 3
+; CORTEX-R5:  .eabi_attribute 24, 1
+; CORTEX-R5:  .eabi_attribute 25, 1
+; CORTEX-R5:  .eabi_attribute 27, 1
+; CORTEX-R5-NOT:  .eabi_attribute 28
+; CORTEX-R5-NOT:  .eabi_attribute 36
+; CORTEX-R5-NOT:  .eabi_attribute 42
+; CORTEX-R5:  .eabi_attribute 44, 2
+; CORTEX-R5-NOT:  .eabi_attribute 68
+
+; CORTEX-A53:  .cpu cortex-a53
+; CORTEX-A53:  .eabi_attribute 6, 14
+; CORTEX-A53:  .eabi_attribute 7, 65
+; CORTEX-A53:  .eabi_attribute 8, 1
+; CORTEX-A53:  .eabi_attribute 9, 2
+; CORTEX-A53:  .fpu crypto-neon-fp-armv8
+; CORTEX-A53:  .eabi_attribute 12, 3
+; CORTEX-A53:  .eabi_attribute 24, 1
+; CORTEX-A53:  .eabi_attribute 25, 1
+; CORTEX-A53-NOT:  .eabi_attribute 27
+; CORTEX-A53-NOT:  .eabi_attribute 28
+; CORTEX-A53:  .eabi_attribute 36, 1
+; CORTEX-A53:  .eabi_attribute 42, 1
+; CORTEX-A53:  .eabi_attribute 44, 2
+; CORTEX-A53:  .eabi_attribute 68, 3
+
+; CORTEX-A57:  .cpu cortex-a57
+; CORTEX-A57:  .eabi_attribute 6, 14
+; CORTEX-A57:  .eabi_attribute 7, 65
+; CORTEX-A57:  .eabi_attribute 8, 1
+; CORTEX-A57:  .eabi_attribute 9, 2
+; CORTEX-A57:  .fpu crypto-neon-fp-armv8
+; CORTEX-A57:  .eabi_attribute 12, 3
+; CORTEX-A57:  .eabi_attribute 24, 1
+; CORTEX-A57:  .eabi_attribute 25, 1
+; CORTEX-A57-NOT:  .eabi_attribute 27
+; CORTEX-A57-NOT:  .eabi_attribute 28
+; CORTEX-A57:  .eabi_attribute 36, 1
+; CORTEX-A57:  .eabi_attribute 42, 1
+; CORTEX-A57:  .eabi_attribute 44, 2
+; CORTEX-A57:  .eabi_attribute 68, 3
+
 define i32 @f(i64 %z) {
 	ret i32 0
 }
diff --git a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll b/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
deleted file mode 100644
index d19adcc..0000000
--- a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc  %s -mtriple=arm-linux-gnueabi -filetype=obj -o - | \
-; RUN:    llvm-readobj -s -sd | FileCheck  -check-prefix=BASIC %s 
-; RUN: llc  %s -mtriple=armv7-linux-gnueabi -march=arm -mcpu=cortex-a8 \
-; RUN:    -mattr=-neon,-vfp3,+vfp2 \
-; RUN:    -arm-reserve-r9 -filetype=obj -o - | \
-; RUN:    llvm-readobj -s -sd | FileCheck  -check-prefix=CORTEXA8 %s
-
-
-; This tests that the extpected ARM attributes are emitted.
-;
-; BASIC:        Section {
-; BASIC:          Name: .ARM.attributes
-; BASIC-NEXT:     Type: SHT_ARM_ATTRIBUTES
-; BASIC-NEXT:     Flags [ (0x0)
-; BASIC-NEXT:     ]
-; BASIC-NEXT:     Address: 0x0
-; BASIC-NEXT:     Offset: 0x3C
-; BASIC-NEXT:     Size: 28
-; BASIC-NEXT:     Link: 0
-; BASIC-NEXT:     Info: 0
-; BASIC-NEXT:     AddressAlignment: 1
-; BASIC-NEXT:     EntrySize: 0
-; BASIC-NEXT:     SectionData (
-; BASIC-NEXT:       0000: 411B0000 00616561 62690001 11000000
-; BASIC-NEXT:       0010: 06011401 15011703 18011901
-; BASIC-NEXT:     )
-
-; CORTEXA8:        Name: .ARM.attributes
-; CORTEXA8-NEXT:     Type: SHT_ARM_ATTRIBUTES
-; CORTEXA8-NEXT:     Flags [ (0x0)
-; CORTEXA8-NEXT:     ]
-; CORTEXA8-NEXT:     Address: 0x0
-; CORTEXA8-NEXT:     Offset: 0x3C
-; CORTEXA8-NEXT:     Size: 47
-; CORTEXA8-NEXT:     Link: 0
-; CORTEXA8-NEXT:     Info: 0
-; CORTEXA8-NEXT:     AddressAlignment: 1
-; CORTEXA8-NEXT:     EntrySize: 0
-; CORTEXA8-NEXT:     SectionData (
-; CORTEXA8-NEXT:       0000: 412E0000 00616561 62690001 24000000
-; CORTEXA8-NEXT:       0010: 05434F52 5445582D 41380006 0A074108
-; CORTEXA8-NEXT:       0020: 0109020A 02140115 01170318 011901
-; CORTEXA8-NEXT:     )
-
-define i32 @f(i64 %z) {
-       ret i32 0
-}
diff --git a/test/CodeGen/ARM/2010-11-30-reloc-movt.ll b/test/CodeGen/ARM/2010-11-30-reloc-movt.ll
deleted file mode 100644
index 6bea7b8..0000000
--- a/test/CodeGen/ARM/2010-11-30-reloc-movt.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: llc  %s -mtriple=armv7-linux-gnueabi -filetype=obj -o - | \
-; RUN:    llvm-readobj -s -sr -sd | FileCheck  -check-prefix=OBJ %s
-
-target triple = "armv7-none-linux-gnueabi"
-
-@a = external global i8
-
-define arm_aapcs_vfpcc i32 @barf() nounwind {
-entry:
-  %0 = tail call arm_aapcs_vfpcc  i32 @foo(i8* @a) nounwind
-  ret i32 %0
-; OBJ:        Section {
-; OBJ:          Name: .text
-; OBJ:          SectionData (
-; OBJ-NEXT:       0000: 00482DE9 000000E3 000040E3 FEFFFFEB
-; OBJ-NEXT:       0010: 0088BDE8
-; OBJ-NEXT:     )
-; OBJ:          Relocations [
-; OBJ-NEXT:       0x4 R_ARM_MOVW_ABS_NC a
-; OBJ-NEXT:       0x8 R_ARM_MOVT_ABS
-; OBJ-NEXT:       0xC R_ARM_CALL foo
-; OBJ-NEXT:     ]
-
-}
-
-declare arm_aapcs_vfpcc i32 @foo(i8*)
-
diff --git a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
index 626d121..f57411b 100644
--- a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
@@ -76,11 +76,12 @@ entry:
 }
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!49}
 
 !0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get1", metadata !"get1", metadata !"get1", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get1, null, null, metadata !42, i32 4} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !47, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !41, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !47, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !47, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !5}
 !5 = metadata !{i32 786468, metadata !47, metadata !1, metadata !"_Bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get2", metadata !"get2", metadata !"get2", i32 7, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get2, null, null, metadata !43, i32 7} ; [ DW_TAG_subprogram ]
@@ -126,3 +127,4 @@ entry:
 !46 = metadata !{metadata !27, metadata !28}
 !47 = metadata !{metadata !"foo.c", metadata !"/tmp/"}
 !48 = metadata !{i32 0}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll b/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
index f689d49..bc72e12 100644
--- a/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
+++ b/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
@@ -21,8 +21,8 @@ for.body:                                         ; preds = %_Z14printIsNotZeroi
   %x = getelementptr %struct.Outer* @oStruct, i32 0, i32 1, i32 %i.022, i32 0
   %y = getelementptr %struct.Outer* @oStruct, i32 0, i32 1, i32 %i.022, i32 1
   %inc = add i32 %i.022, 1
-  %tmp8 = load i32* %x, align 4, !tbaa !0
-  %tmp11 = load i32* %y, align 4, !tbaa !0
+  %tmp8 = load i32* %x, align 4
+  %tmp11 = load i32* %y, align 4
   %mul = mul nsw i32 %tmp11, %tmp8
   %tobool.i14 = icmp eq i32 %mul, 0
   br i1 %tobool.i14, label %_Z14printIsNotZeroi.exit17, label %if.then.i16
@@ -35,15 +35,10 @@ _Z14printIsNotZeroi.exit17:                       ; preds = %_Z14printIsNotZeroi
 
 _Z14printIsNotZeroi.exit17.for.body_crit_edge:    ; preds = %_Z14printIsNotZeroi.exit17
   %b.phi.trans.insert = getelementptr %struct.Outer* @oStruct, i32 0, i32 1, i32 %inc, i32 3
-  %tmp3.pre = load i8* %b.phi.trans.insert, align 1, !tbaa !3
+  %tmp3.pre = load i8* %b.phi.trans.insert, align 1
   %phitmp27 = icmp eq i8 undef, 0
   br label %for.body
 
 for.end:                                          ; preds = %_Z14printIsNotZeroi.exit17
   ret void
 }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"bool", metadata !1}
diff --git a/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll b/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
index 348ec9f..e30c9c6 100644
--- a/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
+++ b/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
@@ -15,15 +15,14 @@ for.cond:
 
 for.body:
 ; CHECK: %for.
-; CHECK: movs r{{[0-9]+}}, #{{[01]}}
+; CHECK: mov{{.*}} r{{[0-9]+}}, #{{[01]}}
+; CHECK: mov{{.*}} r{{[0-9]+}}, #{{[01]}}
+; CHECK-NOT: mov r{{[0-9]+}}, #{{[01]}}
   %arrayidx = getelementptr i32* %A, i32 %0
   %tmp4 = load i32* %arrayidx, align 4
   %cmp6 = icmp eq i32 %tmp4, %value
   br i1 %cmp6, label %return, label %for.inc
 
-; CHECK: %for.
-; CHECK: movs r{{[0-9]+}}, #{{[01]}}
-
 for.inc:
   %inc = add i32 %0, 1
   br label %for.cond
diff --git a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
index 33826f8..bb78707 100644
--- a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
@@ -73,17 +73,18 @@ define i32 @get5(i32 %a) nounwind optsize ssp {
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!49}
 
 !0 = metadata !{i32 786449, metadata !47, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41,  metadata !41, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get1", metadata !"get1", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @get1, null, null, metadata !42, i32 5} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get1", metadata !"get1", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get1, null, null, metadata !42, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [get1]
 !2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get2", metadata !"get2", metadata !"", i32 8, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @get2, null, null, metadata !43, i32 8} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get3", metadata !"get3", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @get3, null, null, metadata !44, i32 11} ; [ DW_TAG_subprogram ]
-!8 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get4", metadata !"get4", metadata !"", i32 14, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @get4, null, null, metadata !45, i32 14} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get5", metadata !"get5", metadata !"", i32 17, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @get5, null, null, metadata !46, i32 17} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get2", metadata !"get2", metadata !"", i32 8, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get2, null, null, metadata !43, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [get2]
+!7 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get3", metadata !"get3", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get3, null, null, metadata !44, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [get3]
+!8 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get4", metadata !"get4", metadata !"", i32 14, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get4, null, null, metadata !45, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [get4]
+!9 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get5", metadata !"get5", metadata !"", i32 17, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get5, null, null, metadata !46, i32 17} ; [ DW_TAG_subprogram ] [line 17] [def] [get5]
 !10 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 16777221, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
 !11 = metadata !{i32 786688, metadata !12, metadata !"b", metadata !2, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
 !12 = metadata !{i32 786443, metadata !47, metadata !1, i32 5, i32 19, i32 0} ; [ DW_TAG_lexical_block ]
@@ -123,3 +124,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !46 = metadata !{metadata !27, metadata !28}
 !47 = metadata !{metadata !"ss3.c", metadata !"/private/tmp"}
 !48 = metadata !{i32 0}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll b/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll
index 91de08a..9163166 100644
--- a/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll
+++ b/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll
@@ -42,7 +42,7 @@ if.then:                                          ; preds = %land.lhs.true
 ; If-convert the return
 ; CHECK: it	ne
 ; Fold the CSR+return into a pop
-; CHECK: pop {r4, r5, r6, r7, pc}
+; CHECK: pop {r4, r5, r7, pc}
 sw.bb18:
   %call20 = tail call i32 @bar(i32 %in2) nounwind
   switch i32 %call20, label %sw.default56 [
diff --git a/test/CodeGen/ARM/2011-10-26-memset-inline.ll b/test/CodeGen/ARM/2011-10-26-memset-inline.ll
index ff049c8..03614ed 100644
--- a/test/CodeGen/ARM/2011-10-26-memset-inline.ll
+++ b/test/CodeGen/ARM/2011-10-26-memset-inline.ll
@@ -10,8 +10,8 @@ target triple = "thumbv7-apple-ios5.0.0"
 ; CHECK-GENERIT-NEXT: strb
 ; CHECK-GENERIT-NEXT: strb
 ; CHECK-GENERIT-NEXT: strb
-; CHECK-UNALIGNED:      strb
-; CHECK-UNALIGNED-NEXT: str 
+; CHECK-UNALIGNED:    strb
+; CHECK-UNALIGNED:    str
 define void @foo(i8* nocapture %c) nounwind optsize {
 entry:
   call void @llvm.memset.p0i8.i64(i8* %c, i8 -1, i64 5, i32 1, i1 false)
diff --git a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
index f563eee..850c511 100644
--- a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
+++ b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s
 
 ; Trigger multiple NEON stores.
-; CHECK:      vst1.64
-; CHECK-NEXT: vst1.64
+; CHECK: vst1.64
+; CHECK: vst1.64
 define void @f_0_40(i8* nocapture %c) nounwind optsize {
 entry:
   call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 40, i32 16, i1 false)
diff --git a/test/CodeGen/ARM/2012-08-30-select.ll b/test/CodeGen/ARM/2012-08-30-select.ll
index 2fd8df4..e78bbde 100644
--- a/test/CodeGen/ARM/2012-08-30-select.ll
+++ b/test/CodeGen/ARM/2012-08-30-select.ll
@@ -5,14 +5,11 @@
 ;CHECK: it  ne
 ;CHECK-NEXT: vmovne.i32
 ;CHECK: bx
-define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) {
+define <16 x i8> @select_s_v_v(<16 x i8> %vec, i32 %avail) {
 entry:
-  %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
   %and = and i32 %avail, 1
   %tobool = icmp eq i32 %and, 0
-  %vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer
-  ret <16 x i8> %vld1.
+  %ret = select i1 %tobool, <16 x i8> %vec, <16 x i8> zeroinitializer
+  ret <16 x i8> %ret
 }
 
-declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
-
diff --git a/test/CodeGen/ARM/2013-02-27-expand-vfma.ll b/test/CodeGen/ARM/2013-02-27-expand-vfma.ll
index 135b144..f812118 100644
--- a/test/CodeGen/ARM/2013-02-27-expand-vfma.ll
+++ b/test/CodeGen/ARM/2013-02-27-expand-vfma.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
-; RUN: llc < %s -mtriple=armv7s-apple-darwin | FileCheck %s -check-prefix=VFP4
+; RUN: llc < %s -mtriple=armv7s-apple-darwin | FileCheck %s -check-prefix=CHECK-VFP4
 
 define <4 x float> @muladd(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind {
 ; CHECK-LABEL: muladd:
diff --git a/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll b/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
index 2eeebac..c4f5f54 100644
--- a/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
+++ b/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck -check-prefix=CHECK-V8 %s
+; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck -check-prefix=CHECK-V8 %s
 ; rdar://13782395
 
 define i32 @t1(i32 %a, i32 %b, i8** %retaddr) {
@@ -81,7 +83,7 @@ KBBlockZero.exit:                                 ; preds = %bb2.i
 ; <rdar://problem/14379453>
 
 ; Hard-coded registers comes from the ABI.
-; CHECK: wrapDistance:
+; CHECK-LABEL: wrapDistance:
 ; CHECK: cmp r1, #59
 ; CHECK-NEXT: itt le
 ; CHECK-NEXT: suble r0, r2, #1
@@ -100,6 +102,27 @@ KBBlockZero.exit:                                 ; preds = %bb2.i
 ; CHECK: [[LABEL]]:
 ; CHECK-NEXT: subs r0, r1, r0
 ; CHECK-NEXT: bx lr
+
+; CHECK-V8-LABEL: wrapDistance:
+; CHECK-V8: cmp r1, #59
+; CHECK-V8-NEXT: bgt
+; CHECK-V8-NEXT: %if.then
+; CHECK-V8-NEXT: subs r0, r2, #1
+; CHECK-V8-NEXT: bx lr
+; CHECK-V8-NEXT: %if.else
+; CHECK-V8-NEXT: subs [[REG:r[0-9]+]], #120
+; CHECK-V8-NEXT: cmp [[REG]], r1
+; CHECK-V8-NEXT: bge
+; CHECK-V8-NEXT: %if.else
+; CHECK-V8-NEXT: cmp r0, #119
+; CHECK-V8-NEXT: bgt
+; CHECK-V8-NEXT: %if.then4
+; CHECK-V8-NEXT: adds r0, r1, #1
+; CHECK-V8-NEXT: bx lr
+; CHECK-V8-NEXT: %if.end5
+; CHECK-V8-NEXT: subs r0, r1, r0
+; CHECK-V8-NEXT: bx lr
+
 define i32 @wrapDistance(i32 %tx, i32 %sx, i32 %w) {
 entry:
   %cmp = icmp slt i32 %sx, 60
diff --git a/test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll b/test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll
new file mode 100644
index 0000000..defb946
--- /dev/null
+++ b/test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll
@@ -0,0 +1,64 @@
+; RUN: llc < %s -march=thumb -mattr=+v7,+thumb2 | FileCheck %s
+
+define i8 @f1(i8* %call1, i8* %call3, i32 %h, i32 %w, i32 %Width) {
+; CHECK: f1:
+entry:
+        %mul17 = mul nsw i32 %Width, %h
+        %add = add nsw i32 %mul17, %w
+        %sub19 = sub i32 %add, %Width
+        %sub20 = add i32 %sub19, -1
+        %arrayidx21 = getelementptr inbounds i8* %call1, i32 %sub20
+        %0 = load i8* %arrayidx21, align 1
+        %conv22 = zext i8 %0 to i32
+        %arrayidx25 = getelementptr inbounds i8* %call1, i32 %sub19
+        %1 = load i8* %arrayidx25, align 1
+        %conv26 = zext i8 %1 to i32
+        %mul23189 = add i32 %conv26, %conv22
+        %add30 = add i32 %sub19, 1
+        %arrayidx31 = getelementptr inbounds i8* %call1, i32 %add30
+        %2 = load i8* %arrayidx31, align 1
+        %conv32 = zext i8 %2 to i32
+; CHECK: ldrb r{{[0-9]*}}, [r{{[0-9]*}}, #-1]
+; CHECK-NEXT: ldrb r{{[0-9]*}}, [r{{[0-9]*}}, #1]
+        %add28190 = add i32 %mul23189, %conv32
+        %sub35 = add i32 %add, -1
+        %arrayidx36 = getelementptr inbounds i8* %call1, i32 %sub35
+        %3 = load i8* %arrayidx36, align 1
+        %conv37 = zext i8 %3 to i32
+        %add34191 = add i32 %add28190, %conv37
+        %arrayidx40 = getelementptr inbounds i8* %call1, i32 %add
+        %4 = load i8* %arrayidx40, align 1
+        %conv41 = zext i8 %4 to i32
+        %mul42 = mul nsw i32 %conv41, 255
+        %add44 = add i32 %add, 1
+        %arrayidx45 = getelementptr inbounds i8* %call1, i32 %add44
+        %5 = load i8* %arrayidx45, align 1
+        %conv46 = zext i8 %5 to i32
+; CHECK: ldrb r{{[0-9]*}}, [r{{[0-9]*}}, #-1]
+; CHECK-NEXT: ldrb r{{[0-9]*}}, [r{{[0-9]*}}, #1]
+        %add49 = add i32 %add, %Width
+        %sub50 = add i32 %add49, -1
+        %arrayidx51 = getelementptr inbounds i8* %call1, i32 %sub50
+        %6 = load i8* %arrayidx51, align 1
+        %conv52 = zext i8 %6 to i32
+        %arrayidx56 = getelementptr inbounds i8* %call1, i32 %add49
+        %7 = load i8* %arrayidx56, align 1
+        %conv57 = zext i8 %7 to i32
+        %add61 = add i32 %add49, 1
+        %arrayidx62 = getelementptr inbounds i8* %call1, i32 %add61
+        %8 = load i8* %arrayidx62, align 1
+        %conv63 = zext i8 %8 to i32
+; CHECK: ldrb r{{[0-9]*}}, [r{{[0-9]*}}, #-1]
+; CHECK-NEXT: ldrb{{[.w]*}} r{{[0-9]*}}, [r{{[0-9]*}}, #1]
+        %tmp = add i32 %add34191, %conv46
+        %tmp193 = add i32 %tmp, %conv52
+        %tmp194 = add i32 %tmp193, %conv57
+        %tmp195 = add i32 %tmp194, %conv63
+        %tmp196 = mul i32 %tmp195, -28
+        %add65 = add i32 %tmp196, %mul42
+        %9 = lshr i32 %add65, 8
+        %conv68 = trunc i32 %9 to i8
+        %arrayidx69 = getelementptr inbounds i8* %call3, i32 %add
+        store i8 %conv68, i8* %arrayidx69, align 1
+        ret i8 %conv68
+}
diff --git a/test/CodeGen/ARM/2013-10-11-select-stalls.ll b/test/CodeGen/ARM/2013-10-11-select-stalls.ll
new file mode 100644
index 0000000..33c0587
--- /dev/null
+++ b/test/CodeGen/ARM/2013-10-11-select-stalls.ll
@@ -0,0 +1,16 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -stats 2>&1 | not grep "Number of pipeline stalls"
+; Evaluate the two vld1.8 instructions in separate MBB's,
+; instead of stalling on one and conditionally overwriting its result.
+
+define <16 x i8> @multiselect(i32 %avail, i8* %foo, i8* %bar) {
+entry:
+  %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %foo, i32 1)
+  %vld2 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
+  %and = and i32 %avail, 1
+  %tobool = icmp eq i32 %and, 0
+  %retv = select i1 %tobool, <16 x i8> %vld1, <16 x i8> %vld2
+  ret <16 x i8> %retv
+}
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
diff --git a/test/CodeGen/ARM/2013-11-08-inline-asm-neon-array.ll b/test/CodeGen/ARM/2013-11-08-inline-asm-neon-array.ll
new file mode 100644
index 0000000..5a86477
--- /dev/null
+++ b/test/CodeGen/ARM/2013-11-08-inline-asm-neon-array.ll
@@ -0,0 +1,16 @@
+;RUN:  not llc -mtriple=arm-linux-gnueabihf < %s 2>&1 | FileCheck %s
+
+; ModuleID = 'bug.c'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7--"
+
+%struct.uint8x8x4_t = type { [4 x <8 x i8>] }
+
+define void @foo() #0 {
+  %vsrc = alloca %struct.uint8x8x4_t, align 8
+  %ptr = alloca i8;
+  %1 = call i8* asm sideeffect "vld4.u8 ${0:h}, [$1], $2", "=*w,=r,r,1"(%struct.uint8x8x4_t* %vsrc, i32 0, i8* %ptr)
+  ret void
+}
+
+; CHECK: error: couldn't allocate output register for constraint 'w'
diff --git a/test/CodeGen/ARM/a15-SD-dep.ll b/test/CodeGen/ARM/a15-SD-dep.ll
index df921e0..019ff61 100644
--- a/test/CodeGen/ARM/a15-SD-dep.ll
+++ b/test/CodeGen/ARM/a15-SD-dep.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -disable-a15-sd-optimization -verify-machineinstrs < %s  | FileCheck -check-prefix=DISABLED %s
-; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck -check-prefix=ENABLED %s
+; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -disable-a15-sd-optimization -verify-machineinstrs < %s  | FileCheck -check-prefix=CHECK-DISABLED %s
+; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-ENABLED %s
 
 ; CHECK-ENABLED-LABEL: t1:
 ; CHECK-DISABLED-LABEL: t1:
diff --git a/test/CodeGen/ARM/addrspacecast.ll b/test/CodeGen/ARM/addrspacecast.ll
new file mode 100644
index 0000000..2e98ba5
--- /dev/null
+++ b/test/CodeGen/ARM/addrspacecast.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=arm
+
+; Check that codegen for an addrspace cast succeeds without error.
+define <4 x i32 addrspace(1)*> @f (<4 x i32*> %x) {
+  %1 = addrspacecast <4 x i32*> %x to <4 x i32 addrspace(1)*>
+  ret <4 x i32 addrspace(1)*> %1
+}
diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll
index d668334..f55ae10 100644
--- a/test/CodeGen/ARM/aliases.ll
+++ b/test/CodeGen/ARM/aliases.ll
@@ -1,15 +1,30 @@
-; RUN: llc < %s -mtriple=arm-linux-gnueabi -o %t
-; RUN: grep " = " %t   | count 5
-; RUN: grep globl %t | count 4
-; RUN: grep weak %t  | count 1
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | FileCheck %s
 
-@bar = external global i32
+; CHECK: .globl	test
+
+; CHECK: .globl	foo1
+; CHECK: foo1 = bar
+
+; CHECK: .globl	foo2
+; CHECK: foo2 = bar
+
+; CHECK: .weak	bar_f
+; CHECK: bar_f = foo_f
+
+; CHECK: bar_i = bar
+
+; CHECK: .globl	A
+; CHECK: A = bar
+
+@bar = global i32 42
 @foo1 = alias i32* @bar
 @foo2 = alias i32* @bar
 
 %FunTy = type i32()
 
-declare i32 @foo_f()
+define i32 @foo_f() {
+  ret i32 0
+}
 @bar_f = alias weak %FunTy* @foo_f
 
 @bar_i = alias internal i32* @bar
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index 0762070..88d797e 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -march=arm | FileCheck -check-prefix=ARM %s
 ; RUN: llc < %s -march=thumb | FileCheck -check-prefix=THUMB %s
 ; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck -check-prefix=T2 %s
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck -check-prefix=V8 %s
 
 ; FIXME: The -march=thumb test doesn't change if -disable-peephole is specified.
 
@@ -39,6 +40,17 @@ tailrecurse:                                      ; preds = %sw.bb, %entry
   br i1 %tst, label %sw.bb, label %tailrecurse.switch
 
 tailrecurse.switch:                               ; preds = %tailrecurse
+; V8-LABEL: %tailrecurse.switch
+; V8: cmp
+; V8-NEXT: beq
+; V8-NEXT: %tailrecurse.switch
+; V8: cmp
+; V8-NEXT: beq
+; V8-NEXT: %tailrecurse.switch
+; V8: cmp
+; V8-NEXT: beq
+; V8-NEXT: b	
+; The trailing space in the last line checks that the branch is unconditional
   switch i32 %and, label %sw.epilog [
     i32 1, label %sw.bb
     i32 3, label %sw.bb6
@@ -73,6 +85,7 @@ sw.epilog:                                        ; preds = %tailrecurse.switch
 ; ARM: bar
 ; THUMB: bar
 ; T2: bar
+; V8-LABEL: bar:
 define internal zeroext i8 @bar(%struct.S* %x, %struct.S* nocapture %y) nounwind readonly {
 entry:
   %0 = getelementptr inbounds %struct.S* %x, i32 0, i32 1, i32 0
@@ -81,22 +94,32 @@ entry:
 ; ARM: ands
 ; THUMB: ands
 ; T2: ands
+; V8: ands
+; V8-NEXT: beq
   %3 = and i32 %2, 112
   %4 = icmp eq i32 %3, 0
   br i1 %4, label %return, label %bb
 
 bb:                                               ; preds = %entry
+; V8-NEXT: %bb
   %5 = getelementptr inbounds %struct.S* %y, i32 0, i32 1, i32 0
   %6 = load i8* %5, align 1
   %7 = zext i8 %6 to i32
 ; ARM: andsne
 ; THUMB: ands
 ; T2: andsne
+; V8: ands
+; V8-NEXT: beq
   %8 = and i32 %7, 112
   %9 = icmp eq i32 %8, 0
   br i1 %9, label %return, label %bb2
 
 bb2:                                              ; preds = %bb
+; V8-NEXT: %bb2
+; V8-NEXT: cmp
+; V8-NEXT: it	ne
+; V8-NEXT: cmpne
+; V8-NEXT: bne
   %10 = icmp eq i32 %3, 16
   %11 = icmp eq i32 %8, 16
   %or.cond = or i1 %10, %11
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll
index 8ec829c..0477d4f 100644
--- a/test/CodeGen/ARM/atomic-64bit.ll
+++ b/test/CodeGen/ARM/atomic-64bit.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB
+; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-THUMB
 
 define i64 @test1(i64* %ptr, i64 %val) {
 ; CHECK-LABEL: test1:
@@ -175,28 +175,14 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
   ret i64 %r
 }
 
-; Compiles down to cmpxchg
-; FIXME: Should compile to a single ldrexd
+; Compiles down to a single ldrexd
 define i64 @test8(i64* %ptr) {
 ; CHECK-LABEL: test8:
 ; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: cmp [[REG1]]
-; CHECK: cmpeq [[REG2]]
-; CHECK: bne
-; CHECK: strexd {{[a-z0-9]+}}, {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}
-; CHECK: cmp
-; CHECK: bne
 ; CHECK: dmb {{ish$}}
 
 ; CHECK-THUMB-LABEL: test8:
 ; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: cmp [[REG1]]
-; CHECK-THUMB: it eq
-; CHECK-THUMB: cmpeq [[REG2]]
-; CHECK-THUMB: bne
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
 ; CHECK-THUMB: dmb {{ish$}}
 
   %r = load atomic i64* %ptr seq_cst, align 8
diff --git a/test/CodeGen/ARM/atomic-load-store.ll b/test/CodeGen/ARM/atomic-load-store.ll
index 476b3dd..53c7184 100644
--- a/test/CodeGen/ARM/atomic-load-store.ll
+++ b/test/CodeGen/ARM/atomic-load-store.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefix=ARM
 ; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=ARM
-; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s -check-prefix=THUMBTWO
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefix=THUMBTWO
 ; RUN: llc < %s -mtriple=thumbv6-apple-ios | FileCheck %s -check-prefix=THUMBONE
 ; RUN  llc < %s -mtriple=armv4-apple-ios | FileCheck %s -check-prefix=ARMV4
 
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index 6e6b363..9a79c9f 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-T1
+; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs -mcpu=cortex-m0 | FileCheck %s --check-prefix=CHECK-T1
 
 define void @func(i32 %argc, i8** %argv) nounwind {
 entry:
@@ -24,78 +26,93 @@ entry:
   ; CHECK: ldrex
   ; CHECK: add
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_add_4
   %0 = atomicrmw add i32* %val1, i32 %tmp monotonic
 	store i32 %0, i32* %old
   ; CHECK: ldrex
   ; CHECK: sub
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_sub_4
   %1 = atomicrmw sub i32* %val2, i32 30 monotonic
 	store i32 %1, i32* %old
   ; CHECK: ldrex
   ; CHECK: add
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_add_4
   %2 = atomicrmw add i32* %val2, i32 1 monotonic
 	store i32 %2, i32* %old
   ; CHECK: ldrex
   ; CHECK: sub
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_sub_4
   %3 = atomicrmw sub i32* %val2, i32 1 monotonic
 	store i32 %3, i32* %old
   ; CHECK: ldrex
   ; CHECK: and
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_and_4
   %4 = atomicrmw and i32* %andt, i32 4080 monotonic
 	store i32 %4, i32* %old
   ; CHECK: ldrex
   ; CHECK: or
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_or_4
   %5 = atomicrmw or i32* %ort, i32 4080 monotonic
 	store i32 %5, i32* %old
   ; CHECK: ldrex
   ; CHECK: eor
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_xor_4
   %6 = atomicrmw xor i32* %xort, i32 4080 monotonic
 	store i32 %6, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_min_4
   %7 = atomicrmw min i32* %val2, i32 16 monotonic
 	store i32 %7, i32* %old
 	%neg = sub i32 0, 1
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_min_4
   %8 = atomicrmw min i32* %val2, i32 %neg monotonic
 	store i32 %8, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_max_4
   %9 = atomicrmw max i32* %val2, i32 1 monotonic
 	store i32 %9, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_max_4
   %10 = atomicrmw max i32* %val2, i32 0 monotonic
 	store i32 %10, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umin_4
   %11 = atomicrmw umin i32* %val2, i32 16 monotonic
 	store i32 %11, i32* %old
 	%uneg = sub i32 0, 1
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umin_4
   %12 = atomicrmw umin i32* %val2, i32 %uneg monotonic
 	store i32 %12, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umax_4
   %13 = atomicrmw umax i32* %val2, i32 1 monotonic
 	store i32 %13, i32* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umax_4
   %14 = atomicrmw umax i32* %val2, i32 0 monotonic
 	store i32 %14, i32* %old
 
@@ -110,22 +127,26 @@ entry:
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umin_2
   %0 = atomicrmw umin i16* %val, i16 16 monotonic
   store i16 %0, i16* %old
   %uneg = sub i16 0, 1
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umin_2
   %1 = atomicrmw umin i16* %val, i16 %uneg monotonic
   store i16 %1, i16* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umax_2
   %2 = atomicrmw umax i16* %val, i16 1 monotonic
   store i16 %2, i16* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umax_2
   %3 = atomicrmw umax i16* %val, i16 0 monotonic
   store i16 %3, i16* %old
   ret void
@@ -139,22 +160,26 @@ entry:
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umin_1
   %0 = atomicrmw umin i8* %val, i8 16 monotonic
   store i8 %0, i8* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umin_1
   %uneg = sub i8 0, 1
   %1 = atomicrmw umin i8* %val, i8 %uneg monotonic
   store i8 %1, i8* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umax_1
   %2 = atomicrmw umax i8* %val, i8 1 monotonic
   store i8 %2, i8* %old
   ; CHECK: ldrex
   ; CHECK: cmp
   ; CHECK: strex
+  ; CHECK-T1: blx ___sync_fetch_and_umax_1
   %3 = atomicrmw umax i8* %val, i8 0 monotonic
   store i8 %3, i8* %old
   ret void
diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll
new file mode 100644
index 0000000..3f93929
--- /dev/null
+++ b/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -0,0 +1,1344 @@
+; RUN: llc -mtriple=armv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+
+@var8 = global i8 0
+@var16 = global i16 0
+@var32 = global i32 0
+@var64 = global i64 0
+
+define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i8:
+   %old = atomicrmw add i8* @var8, i8 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i16:
+   %old = atomicrmw add i16* @var16, i16 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i32:
+   %old = atomicrmw add i32* @var32, i32 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i64:
+   %old = atomicrmw add i64* @var64, i64 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: adds [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: adc{{(\.w)?}}  [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_sub_i8:
+   %old = atomicrmw sub i8* @var8, i8 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_sub_i16:
+   %old = atomicrmw sub i16* @var16, i16 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_sub_i32:
+   %old = atomicrmw sub i32* @var32, i32 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_sub_i64:
+   %old = atomicrmw sub i64* @var64, i64 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: subs [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: sbc{{(\.w)?}}  [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_and_i8:
+   %old = atomicrmw and i8* @var8, i8 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_and_i16:
+   %old = atomicrmw and i16* @var16, i16 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_and_i32:
+   %old = atomicrmw and i32* @var32, i32 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_and_i64:
+   %old = atomicrmw and i64* @var64, i64 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: and{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: and{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_or_i8:
+   %old = atomicrmw or i8* @var8, i8 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_or_i16:
+   %old = atomicrmw or i16* @var16, i16 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_or_i32:
+   %old = atomicrmw or i32* @var32, i32 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_load_or_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_or_i64:
+   %old = atomicrmw or i64* @var64, i64 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: orr{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: orr{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xor_i8:
+   %old = atomicrmw xor i8* @var8, i8 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xor_i16:
+   %old = atomicrmw xor i16* @var16, i16 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xor_i32:
+   %old = atomicrmw xor i32* @var32, i32 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xor_i64:
+   %old = atomicrmw xor i64* @var64, i64 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: eor{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: eor{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xchg_i8:
+   %old = atomicrmw xchg i8* @var8, i8 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xchg_i16:
+   %old = atomicrmw xchg i16* @var16, i16 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xchg_i32:
+   %old = atomicrmw xchg i32* @var32, i32 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xchg_i64:
+   %old = atomicrmw xchg i64* @var64, i64 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_min_i8:
+   %old = atomicrmw min i8* @var8, i8 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK-NEXT: sxtb r[[OLDX:[0-9]+]], r[[OLD]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: cmp r[[OLDX]], r0
+; Thumb mode: it ge
+; CHECK:      movge r[[OLDX]], r0
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_load_min_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_min_i16:
+   %old = atomicrmw min i16* @var16, i16 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK-NEXT: sxth r[[OLDX:[0-9]+]], r[[OLD]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: cmp r[[OLDX]], r0
+; Thumb mode: it ge
+; CHECK:      movge r[[OLDX]], r0
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_min_i32:
+   %old = atomicrmw min i32* @var32, i32 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it lt
+; CHECK:      movlt r[[NEW]], r[[OLD]]
+; CHECK-NEXT: strex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_min_i64:
+   %old = atomicrmw min i64* @var64, i64 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
+; CHECK-NEXT: blt .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_max_i8:
+   %old = atomicrmw max i8* @var8, i8 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK-NEXT: sxtb r[[OLDX:[0-9]+]], r[[OLD]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: cmp r[[OLDX]], r0
+; Thumb mode: it le
+; CHECK:      movle r[[OLDX]], r0
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_max_i16:
+   %old = atomicrmw max i16* @var16, i16 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK-NEXT: sxth r[[OLDX:[0-9]+]], r[[OLD]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: cmp r[[OLDX]], r0
+; Thumb mode: it le
+; CHECK:      movle r[[OLDX]], r0
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_load_max_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_max_i32:
+   %old = atomicrmw max i32* @var32, i32 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it gt
+; CHECK:      movgt r[[NEW]], r[[OLD]]
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_max_i64:
+   %old = atomicrmw max i64* @var64, i64 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
+; CHECK-NEXT: bge .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umin_i8:
+   %old = atomicrmw umin i8* @var8, i8 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it lo
+; CHECK:      movlo r[[NEW]], r[[OLD]]
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umin_i16:
+   %old = atomicrmw umin i16* @var16, i16 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it lo
+; CHECK:      movlo r[[NEW]], r[[OLD]]
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umin_i32:
+   %old = atomicrmw umin i32* @var32, i32 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it lo
+; CHECK:      movlo r[[NEW]], r[[OLD]]
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umin_i64:
+   %old = atomicrmw umin i64* @var64, i64 %offset acq_rel
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
+; CHECK-NEXT: blo .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umax_i8:
+   %old = atomicrmw umax i8* @var8, i8 %offset acq_rel
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it hi
+; CHECK:      movhi r[[NEW]], r[[OLD]]
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umax_i16:
+   %old = atomicrmw umax i16* @var16, i16 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it hi
+; CHECK:      movhi r[[NEW]], r[[OLD]]
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umax_i32:
+   %old = atomicrmw umax i32* @var32, i32 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it hi
+; CHECK:      movhi r[[NEW]], r[[OLD]]
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umax_i64:
+   %old = atomicrmw umax i64* @var64, i64 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
+; CHECK-NEXT: bhs .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i8:
+   %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: cmp r[[OLD]], r0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+  ; As above, r1 is a reasonable guess.
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i8 %old
+}
+
+define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i16:
+   %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: cmp r[[OLD]], r0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+  ; As above, r1 is a reasonable guess.
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i16 %old
+}
+
+define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i32:
+   %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+  ; r0 below is a reasonable guess but could change: it certainly comes into the
+  ;  function there.
+; CHECK-NEXT: cmp r[[OLD]], r0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+  ; As above, r1 is a reasonable guess.
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+   ret i32 %old
+}
+
+define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i64:
+   %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK-NEXT: cmp   [[OLD1]], r0
+; Thumb mode: it eq
+; CHECK:      cmpeq [[OLD2]], r1
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+  ; As above, r2, r3 is a reasonable guess.
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, [[OLD1]]
+; CHECK-NEXT: mov r1, [[OLD2]]
+   ret i64 %old
+}
+
+define i8 @test_atomic_load_monotonic_i8() nounwind {
+; CHECK-LABEL: test_atomic_load_monotonic_i8:
+  %val = load atomic i8* @var8 monotonic, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK: ldrb r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+  ret i8 %val
+}
+
+define i8 @test_atomic_load_monotonic_regoff_i8(i64 %base, i64 %off) nounwind {
+; CHECK-LABEL: test_atomic_load_monotonic_regoff_i8:
+  %addr_int = add i64 %base, %off
+  %addr = inttoptr i64 %addr_int to i8*
+
+  %val = load atomic i8* %addr monotonic, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldrb r0, [r0, r2]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+  ret i8 %val
+}
+
+define i8 @test_atomic_load_acquire_i8() nounwind {
+; CHECK-LABEL: test_atomic_load_acquire_i8:
+  %val = load atomic i8* @var8 acquire, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldab r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+  ret i8 %val
+}
+
+define i8 @test_atomic_load_seq_cst_i8() nounwind {
+; CHECK-LABEL: test_atomic_load_seq_cst_i8:
+  %val = load atomic i8* @var8 seq_cst, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldab r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+  ret i8 %val
+}
+
+define i16 @test_atomic_load_monotonic_i16() nounwind {
+; CHECK-LABEL: test_atomic_load_monotonic_i16:
+  %val = load atomic i16* @var16 monotonic, align 2
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var16
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldrh r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+  ret i16 %val
+}
+
+define i32 @test_atomic_load_monotonic_regoff_i32(i64 %base, i64 %off) nounwind {
+; CHECK-LABEL: test_atomic_load_monotonic_regoff_i32:
+  %addr_int = add i64 %base, %off
+  %addr = inttoptr i64 %addr_int to i32*
+
+  %val = load atomic i32* %addr monotonic, align 4
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldr r0, [r0, r2]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+  ret i32 %val
+}
+
+define i64 @test_atomic_load_seq_cst_i64() nounwind {
+; CHECK-LABEL: test_atomic_load_seq_cst_i64:
+  %val = load atomic i64* @var64 seq_cst, align 8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var64
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldaexd r0, r1, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+  ret i64 %val
+}
+
+define void @test_atomic_store_monotonic_i8(i8 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_monotonic_i8:
+  store atomic i8 %val, i8* @var8 monotonic, align 1
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK: strb r0, [r[[ADDR]]]
+
+  ret void
+}
+
+define void @test_atomic_store_monotonic_regoff_i8(i64 %base, i64 %off, i8 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_monotonic_regoff_i8:
+
+  %addr_int = add i64 %base, %off
+  %addr = inttoptr i64 %addr_int to i8*
+
+  store atomic i8 %val, i8* %addr monotonic, align 1
+; CHECK: ldrb{{(\.w)?}} [[VAL:r[0-9]+]], [sp]
+; CHECK: strb [[VAL]], [r0, r2]
+
+  ret void
+}
+
+define void @test_atomic_store_release_i8(i8 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_release_i8:
+  store atomic i8 %val, i8* @var8 release, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: stlb r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+  ret void
+}
+
+define void @test_atomic_store_seq_cst_i8(i8 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_seq_cst_i8:
+  store atomic i8 %val, i8* @var8 seq_cst, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: stlb r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+  ret void
+}
+
+define void @test_atomic_store_monotonic_i16(i16 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_monotonic_i16:
+  store atomic i16 %val, i16* @var16 monotonic, align 2
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var16
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: strh r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+  ret void
+}
+
+define void @test_atomic_store_monotonic_regoff_i32(i64 %base, i64 %off, i32 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_monotonic_regoff_i32:
+
+  %addr_int = add i64 %base, %off
+  %addr = inttoptr i64 %addr_int to i32*
+
+  store atomic i32 %val, i32* %addr monotonic, align 4
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldr [[VAL:r[0-9]+]], [sp]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: str [[VAL]], [r0, r2]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+  ret void
+}
+
+define void @test_atomic_store_release_i64(i64 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_release_i64:
+  store atomic i64 %val, i64* @var64 release, align 8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+  ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+  ; function there.
+; CHECK: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+  ret void
+}
+
+define i32 @not.barriers(i32* %var, i1 %cond) {
+; CHECK-LABEL: not.barriers:
+  br i1 %cond, label %atomic_ver, label %simple_ver
+simple_ver:
+  %oldval = load i32* %var
+  %newval = add nsw i32 %oldval, -1
+  store i32 %newval, i32* %var
+  br label %somewhere
+atomic_ver:
+  fence seq_cst
+  %val = atomicrmw add i32* %var, i32 -1 monotonic
+  fence seq_cst
+  br label %somewhere
+; CHECK: dmb
+; CHECK: ldrex
+; CHECK: dmb
+  ; The key point here is that the second dmb isn't immediately followed by the
+  ; simple_ver basic block, which LLVM attempted to do when DMB had been marked
+  ; with isBarrier. For now, look for something that looks like "somewhere".
+; CHECK-NEXT: mov
+somewhere:
+  %combined = phi i32 [ %val, %atomic_ver ], [ %newval, %simple_ver]
+  ret i32 %combined
+}
diff --git a/test/CodeGen/ARM/build-attributes-encoding.s b/test/CodeGen/ARM/build-attributes-encoding.s
new file mode 100644
index 0000000..5ad51b2
--- /dev/null
+++ b/test/CodeGen/ARM/build-attributes-encoding.s
@@ -0,0 +1,85 @@
+// This tests that ARM attributes are properly encoded.
+
+// RUN: llvm-mc < %s -triple=arm-linux-gnueabi -filetype=obj -o - \
+// RUN:   | llvm-readobj -s -sd | FileCheck %s
+
+// Tag_CPU_name (=5)
+.cpu Cortex-A8
+
+// Tag_CPU_arch (=6)
+.eabi_attribute 6, 10
+
+// Tag_arch_profile (=7)
+.eabi_attribute 7, 'A'
+
+// Tag_ARM_ISA_use (=8)
+.eabi_attribute 8, 1
+
+// Tag_THUMB_ISA_use (=9)
+.eabi_attribute 9, 2
+
+// Tag_FP_arch (=10)
+.fpu vfpv3
+
+// Tag_Advanced_SIMD_arch (=12)
+.eabi_attribute 12, 2
+
+// Tag_ABI_FP_denormal (=20)
+.eabi_attribute 20, 1
+
+// Tag_ABI_FP_exceptions (=21)
+.eabi_attribute 21, 1
+
+// Tag_ABI_FP_number_model (=23)
+.eabi_attribute 23, 1
+
+// Tag_ABI_align_needed (=24)
+.eabi_attribute 24, 1
+
+// Tag_ABI_align_preserved (=25)
+.eabi_attribute 25, 1
+
+// Tag_ABI_HardFP_use (=27)
+.eabi_attribute 27, 0
+
+// Tag_ABI_VFP_args (=28)
+.eabi_attribute 28, 1
+
+// Tag_FP_HP_extension (=36)
+.eabi_attribute 36, 1
+
+// Tag_MPextension_use (=42)
+.eabi_attribute 42, 1
+
+// Tag_DIV_use (=44)
+.eabi_attribute 44, 2
+
+// Tag_Virtualization_use (=68)
+.eabi_attribute 68, 3
+
+// Check that values > 128 are encoded properly
+.eabi_attribute 110, 160
+
+// Check that tags > 128 are encoded properly
+.eabi_attribute 129, 1
+.eabi_attribute 250, 1
+
+// CHECK:        Section {
+// CHECK:          Name: .ARM.attributes
+// CHECK-NEXT:     Type: SHT_ARM_ATTRIBUTES
+// CHECK-NEXT:     Flags [ (0x0)
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Address: 0x0
+// CHECK-NEXT:     Offset: 0x34
+// CHECK-NEXT:     Size: 70
+// CHECK-NEXT:     Link: 0
+// CHECK-NEXT:     Info: 0
+// CHECK-NEXT:     AddressAlignment: 1
+// CHECK-NEXT:     EntrySize: 0
+// CHECK-NEXT:     SectionData (
+// CHECK-NEXT:       0000: 41450000 00616561 62690001 3B000000
+// CHECK-NEXT:       0010: 05434F52 5445582D 41380006 0A074108
+// CHECK-NEXT:       0020: 0109020A 030C0214 01150117 01180119
+// CHECK-NEXT:       0030: 011B001C 0124012A 012C0244 036EA001
+// CHECK-NEXT:       0040: 810101FA 0101
+// CHECK-NEXT:     )
diff --git a/test/CodeGen/ARM/byval_load_align.ll b/test/CodeGen/ARM/byval_load_align.ll
new file mode 100644
index 0000000..2c0910c
--- /dev/null
+++ b/test/CodeGen/ARM/byval_load_align.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple thumbv7-apple-ios -O1 | FileCheck %s
+
+; rdar://15144402
+; Make sure we don't assume 4-byte alignment when loading from a byval argument
+; with alignment of 2.
+; CHECK: ldr r1, [r[[REG:[0-9]+]]]
+; CHECK: ldr r2, [r[[REG]], #4]
+; CHECK: ldr r3, [r[[REG]], #8]
+; CHECK-NOT: ldm
+; CHECK: .align	1 @ @sID
+
+%struct.ModuleID = type { [32 x i8], [32 x i8], i16 }
+
+@sID = internal constant %struct.ModuleID { [32 x i8] c"TEST\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", [32 x i8] c"1.0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", i16 23 }, align 2
+
+; Function Attrs: nounwind ssp
+define void @Client() #0 {
+entry:
+  tail call void @Logger(i8 signext 97, %struct.ModuleID* byval @sID) #2
+  ret void
+}
+
+declare void @Logger(i8 signext, %struct.ModuleID* byval) #1
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/ARM/coalesce-dbgvalue.ll b/test/CodeGen/ARM/coalesce-dbgvalue.ll
index d4be6ee..86106a0 100644
--- a/test/CodeGen/ARM/coalesce-dbgvalue.ll
+++ b/test/CodeGen/ARM/coalesce-dbgvalue.ll
@@ -17,7 +17,7 @@ target triple = "thumbv7-apple-ios3.0.0"
 ; Function Attrs: nounwind ssp
 define i32 @pr16110() #0 {
 for.cond1.preheader:
-  store i32 0, i32* @c, align 4, !dbg !21, !tbaa !23
+  store i32 0, i32* @c, align 4, !dbg !21
   br label %for.cond1.outer, !dbg !26
 
 for.cond1:                                        ; preds = %for.end9, %for.cond1.outer
@@ -26,9 +26,9 @@ for.cond1:                                        ; preds = %for.end9, %for.cond
   br i1 %cmp, label %for.body2, label %for.end9, !dbg !26
 
 for.body2:                                        ; preds = %for.cond1
-  store i32 %storemerge11, i32* @b, align 4, !dbg !26, !tbaa !23
+  store i32 %storemerge11, i32* @b, align 4, !dbg !26
   tail call void @llvm.dbg.value(metadata !27, i64 0, metadata !11), !dbg !28
-  %0 = load i64* @a, align 8, !dbg !29, !tbaa !30
+  %0 = load i64* @a, align 8, !dbg !29
   %xor = xor i64 %0, %e.1.ph, !dbg !29
   %conv3 = trunc i64 %xor to i32, !dbg !29
   tail call void @llvm.dbg.value(metadata !{i32 %conv3}, i64 0, metadata !10), !dbg !29
@@ -44,7 +44,7 @@ land.end:                                         ; preds = %land.rhs, %for.body
   %1 = phi i1 [ false, %for.body2 ], [ %tobool5, %land.rhs ]
   %land.ext = zext i1 %1 to i32
   %call6 = tail call i32 bitcast (i32 (...)* @fn2 to i32 (i32, i32*)*)(i32 %land.ext, i32* null) #3
-  %2 = load i32* @b, align 4, !dbg !26, !tbaa !23
+  %2 = load i32* @b, align 4, !dbg !26
   %inc8 = add nsw i32 %2, 1, !dbg !26
   %phitmp = and i64 %xor, 4294967295, !dbg !26
   br label %for.cond1.outer, !dbg !26
@@ -52,7 +52,7 @@ land.end:                                         ; preds = %land.rhs, %for.body
 for.cond1.outer:                                  ; preds = %land.end, %for.cond1.preheader
   %storemerge11.ph = phi i32 [ %inc8, %land.end ], [ 0, %for.cond1.preheader ]
   %e.1.ph = phi i64 [ %phitmp, %land.end ], [ 0, %for.cond1.preheader ]
-  %3 = load i32* @d, align 4, !dbg !31, !tbaa !23
+  %3 = load i32* @d, align 4, !dbg !31
   %tobool10 = icmp eq i32 %3, 0, !dbg !31
   br label %for.cond1
 
@@ -60,7 +60,7 @@ for.end9:                                         ; preds = %for.cond1
   br i1 %tobool10, label %if.end, label %for.cond1, !dbg !31
 
 if.end:                                           ; preds = %for.end9
-  store i32 %storemerge11, i32* @b, align 4, !dbg !26, !tbaa !23
+  store i32 %storemerge11, i32* @b, align 4, !dbg !26
   ret i32 0, !dbg !32
 }
 
@@ -71,12 +71,13 @@ declare i32 @fn3(...) #1
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata) #2
 
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!33}
 
 !0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 182024) (llvm/trunk 182023)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/d/b/pr16110.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"pr16110.c", metadata !"/d/b"}
@@ -84,7 +85,7 @@ attributes #3 = { nounwind }
 !3 = metadata !{metadata !4}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"pr16110", metadata !"pr16110", metadata !"", i32 7, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @pr16110, null, null, metadata !9, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [pr16110]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/d/b/pr16110.c]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
 !8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10, metadata !11}
@@ -101,13 +102,10 @@ attributes #3 = { nounwind }
 !20 = metadata !{i32 786484, i32 0, null, metadata !"d", metadata !"d", metadata !"", metadata !5, i32 4, metadata !8, i32 0, i32 1, i32* @d, null} ; [ DW_TAG_variable ] [d] [line 4] [def]
 !21 = metadata !{i32 10, i32 0, metadata !22, null}
 !22 = metadata !{i32 786443, metadata !1, metadata !4, i32 10, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
-!23 = metadata !{metadata !"int", metadata !24}
-!24 = metadata !{metadata !"omnipotent char", metadata !25}
-!25 = metadata !{metadata !"Simple C/C++ TBAA"}
 !26 = metadata !{i32 12, i32 0, metadata !13, null}
 !27 = metadata !{i32* null}
 !28 = metadata !{i32 13, i32 0, metadata !12, null}
 !29 = metadata !{i32 14, i32 0, metadata !12, null}
-!30 = metadata !{metadata !"long long", metadata !24}
 !31 = metadata !{i32 16, i32 0, metadata !4, null}
 !32 = metadata !{i32 18, i32 0, metadata !4, null}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/constantfp.ll b/test/CodeGen/ARM/constantfp.ll
new file mode 100644
index 0000000..974bdd7
--- /dev/null
+++ b/test/CodeGen/ARM/constantfp.ll
@@ -0,0 +1,68 @@
+; RUN: llc -mtriple=armv7 -mattr=+neon -mcpu=swift %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv7 -mattr=+neon -mcpu=cortex-a8 %s -o - | FileCheck --check-prefix=CHECK-NONEONFP %s
+; RUN: llc -mtriple=armv7 -mattr=-neon -mcpu=cortex-a8 %s -o - | FileCheck --check-prefix=CHECK-NONEON %s
+
+define arm_aapcs_vfpcc float @test_vmov_f32() {
+; CHECK-LABEL: test_vmov_f32:
+; CHECK: vmov.f32 d0, #1.0
+
+; CHECK-NONEONFP: vmov.f32 s0, #1.0
+  ret float 1.0
+}
+
+define arm_aapcs_vfpcc float @test_vmov_imm() {
+; CHECK-LABEL: test_vmov_imm:
+; CHECK: vmov.i32 d0, #0
+
+; CHECK-NONEON-LABEL: test_vmov_imm:
+; CHECK_NONEON: vldr s0, {{.?LCPI[0-9]+_[0-9]+}}
+  ret float 0.0
+}
+
+define arm_aapcs_vfpcc float @test_vmvn_imm() {
+; CHECK-LABEL: test_vmvn_imm:
+; CHECK: vmvn.i32 d0, #0xb0000000
+
+; CHECK-NONEON-LABEL: test_vmvn_imm:
+; CHECK_NONEON: vldr s0, {{.?LCPI[0-9]+_[0-9]+}}
+  ret float 8589934080.0
+}
+
+define arm_aapcs_vfpcc double @test_vmov_f64() {
+; CHECK-LABEL: test_vmov_f64:
+; CHECK: vmov.f64 d0, #1.0
+
+; CHECK-NONEON-LABEL: test_vmov_f64:
+; CHECK_NONEON: vmov.f64 d0, #1.0
+
+  ret double 1.0
+}
+
+define arm_aapcs_vfpcc double @test_vmov_double_imm() {
+; CHECK-LABEL: test_vmov_double_imm:
+; CHECK: vmov.i32 d0, #0
+
+; CHECK-NONEON-LABEL: test_vmov_double_imm:
+; CHECK_NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+  ret double 0.0
+}
+
+define arm_aapcs_vfpcc double @test_vmvn_double_imm() {
+; CHECK-LABEL: test_vmvn_double_imm:
+; CHECK: vmvn.i32 d0, #0xb0000000
+
+; CHECK-NONEON-LABEL: test_vmvn_double_imm:
+; CHECK_NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+  ret double 0x4fffffff4fffffff
+}
+
+; Make sure we don't ignore the high half of 64-bit values when deciding whether
+; a vmov/vmvn is possible.
+define arm_aapcs_vfpcc double @test_notvmvn_double_imm() {
+; CHECK-LABEL: test_notvmvn_double_imm:
+; CHECK: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+
+; CHECK-NONEON-LABEL: test_notvmvn_double_imm:
+; CHECK_NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+  ret double 0x4fffffffffffffff
+}
diff --git a/test/CodeGen/ARM/dagcombine-concatvector.ll b/test/CodeGen/ARM/dagcombine-concatvector.ll
index d8c6c64..2927ea2 100644
--- a/test/CodeGen/ARM/dagcombine-concatvector.ll
+++ b/test/CodeGen/ARM/dagcombine-concatvector.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7s-apple-ios3.0.0 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7s-apple-ios3.0.0 -mcpu=generic | FileCheck %s
 
 ; PR15525
 ; CHECK-LABEL: test1:
diff --git a/test/CodeGen/ARM/darwin-eabi.ll b/test/CodeGen/ARM/darwin-eabi.ll
new file mode 100644
index 0000000..f2cde71
--- /dev/null
+++ b/test/CodeGen/ARM/darwin-eabi.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=thumbv7m-apple-darwin -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-M3
+; RUN: llc -mtriple=thumbv7em-apple-darwin -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-M4
+; RUN: llc -mtriple=thumbv7-apple-darwin -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-M3
+; RUN: llc -mtriple=thumbv7-apple-darwin -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-M4
+
+define float @float_op(float %lhs, float %rhs) {
+  %sum = fadd float %lhs, %rhs
+  ret float %sum
+; CHECK-M3-LABEL: float_op:
+; CHECK-M3: blx ___addsf3
+
+; CHECK-M4-LABEL: float_op:
+; CHECK-M4: vadd.f32
+}
+
+define double @double_op(double %lhs, double %rhs) {
+  %sum = fadd double %lhs, %rhs
+  ret double %sum
+; CHECK-M3-LABEL: double_op:
+; CHECK-M3: blx ___adddf3
+
+; CHECK-M4-LABEL: double_op:
+; CHECK-M4: blx ___adddf3
+}
diff --git a/test/CodeGen/ARM/debug-info-arg.ll b/test/CodeGen/ARM/debug-info-arg.ll
index 89ccb20..e8bf3ba 100644
--- a/test/CodeGen/ARM/debug-info-arg.ll
+++ b/test/CodeGen/ARM/debug-info-arg.ll
@@ -30,15 +30,16 @@ declare void @foobar(i64, i64)
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!33}
 
 !0 = metadata !{i32 786449, metadata !32, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !30, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !2, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, null, null, metadata !31, i32 11} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !2, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, null, null, metadata !31, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [foo]
 !2 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !32, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !32, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !2, i32 16777227, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
 !6 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786451, metadata !32, metadata !0, metadata !"tag_s", i32 5, i64 96, i64 32, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!7 = metadata !{i32 786451, metadata !32, metadata !0, metadata !"tag_s", i32 5, i64 96, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [tag_s] [line 5, size 96, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !11, metadata !12}
 !9 = metadata !{i32 786445, metadata !32, metadata !7, metadata !"x", i32 6, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
 !10 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
@@ -64,3 +65,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !30 = metadata !{metadata !1}
 !31 = metadata !{metadata !5, metadata !13, metadata !14, metadata !17, metadata !18, metadata!19}
 !32 = metadata !{metadata !"one.c", metadata !"/Volumes/Athwagate/R10048772"}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-blocks.ll b/test/CodeGen/ARM/debug-info-blocks.ll
index bd55786..6cbe4b4 100644
--- a/test/CodeGen/ARM/debug-info-blocks.ll
+++ b/test/CodeGen/ARM/debug-info-blocks.ll
@@ -93,37 +93,38 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!162}
 
 !0 = metadata !{i32 786449, metadata !153, i32 16, metadata !"Apple clang version 2.1", i1 false, metadata !"", i32 2, metadata !147, metadata !26, metadata !148, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786433, metadata !160, metadata !0, metadata !"", i32 248, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !3, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!1 = metadata !{i32 786436, metadata !160, metadata !0, metadata !"", i32 248, i64 32, i64 32, i32 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 248, size 32, align 32, offset 0] [def] [from ]
 !2 = metadata !{i32 786473, metadata !160} ; [ DW_TAG_file_type ]
 !3 = metadata !{metadata !4}
 !4 = metadata !{i32 786472, metadata !"Ver1", i64 0} ; [ DW_TAG_enumerator ]
-!5 = metadata !{i32 786433, metadata !160, metadata !0, metadata !"Mode", i32 79, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !7, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!5 = metadata !{i32 786436, metadata !160, metadata !0, metadata !"Mode", i32 79, i64 32, i64 32, i32 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [Mode] [line 79, size 32, align 32, offset 0] [def] [from ]
 !6 = metadata !{i32 786473, metadata !161} ; [ DW_TAG_file_type ]
 !7 = metadata !{metadata !8}
 !8 = metadata !{i32 786472, metadata !"One", i64 0} ; [ DW_TAG_enumerator ]
-!9 = metadata !{i32 786433, metadata !149, metadata !0, metadata !"", i32 15, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !11, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!9 = metadata !{i32 786436, metadata !149, metadata !0, metadata !"", i32 15, i64 32, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 15, size 32, align 32, offset 0] [def] [from ]
 !10 = metadata !{i32 786473, metadata !149} ; [ DW_TAG_file_type ]
 !11 = metadata !{metadata !12, metadata !13}
 !12 = metadata !{i32 786472, metadata !"Unknown", i64 0} ; [ DW_TAG_enumerator ]
 !13 = metadata !{i32 786472, metadata !"Known", i64 1} ; [ DW_TAG_enumerator ]
-!14 = metadata !{i32 786433, metadata !150, metadata !0, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !16, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!14 = metadata !{i32 786436, metadata !150, metadata !0, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
 !15 = metadata !{i32 786473, metadata !150} ; [ DW_TAG_file_type ]
 !16 = metadata !{metadata !17, metadata !18}
 !17 = metadata !{i32 786472, metadata !"Single", i64 0} ; [ DW_TAG_enumerator ]
 !18 = metadata !{i32 786472, metadata !"Double", i64 1} ; [ DW_TAG_enumerator ]
-!19 = metadata !{i32 786433, metadata !151, metadata !0, metadata !"", i32 14, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !21, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!19 = metadata !{i32 786436, metadata !151, metadata !0, metadata !"", i32 14, i64 32, i64 32, i32 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 14, size 32, align 32, offset 0] [def] [from ]
 !20 = metadata !{i32 786473, metadata !151} ; [ DW_TAG_file_type ]
 !21 = metadata !{metadata !22}
 !22 = metadata !{i32 786472, metadata !"Eleven", i64 0} ; [ DW_TAG_enumerator ]
-!23 = metadata !{i32 786478, metadata !152, metadata !24, metadata !"foobar_func_block_invoke_0", metadata !"foobar_func_block_invoke_0", metadata !"", i32 609, metadata !25, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (i8*, %0*, [4 x i32], [4 x i32])* @foobar_func_block_invoke_0, null, null, null, i32 609} ; [ DW_TAG_subprogram ]
+!23 = metadata !{i32 786478, metadata !152, metadata !24, metadata !"foobar_func_block_invoke_0", metadata !"foobar_func_block_invoke_0", metadata !"", i32 609, metadata !25, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*, %0*, [4 x i32], [4 x i32])* @foobar_func_block_invoke_0, null, null, null, i32 609} ; [ DW_TAG_subprogram ] [line 609] [local] [def] [foobar_func_block_invoke_0]
 !24 = metadata !{i32 786473, metadata !152} ; [ DW_TAG_file_type ]
-!25 = metadata !{i32 786453, metadata !152, metadata !24, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !26, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!25 = metadata !{i32 786453, metadata !152, metadata !24, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !26, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !26 = metadata !{null}
 !27 = metadata !{i32 786689, metadata !23, metadata !".block_descriptor", metadata !24, i32 16777825, metadata !28, i32 64, null} ; [ DW_TAG_arg_variable ]
 !28 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 0, i64 0, i32 0, metadata !29} ; [ DW_TAG_pointer_type ]
-!29 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"__block_literal_14", i32 609, i64 256, i64 32, i32 0, i32 0, i32 0, metadata !30, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!29 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"__block_literal_14", i32 609, i64 256, i64 32, i32 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_14] [line 609, size 256, align 32, offset 0] [def] [from ]
 !30 = metadata !{metadata !31, metadata !33, metadata !35, metadata !36, metadata !37, metadata !48, metadata !89, metadata !124}
 !31 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__isa", i32 609, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_member ]
 !32 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
@@ -133,7 +134,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !36 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__FuncPtr", i32 609, i64 32, i64 32, i64 96, i32 0, metadata !32} ; [ DW_TAG_member ]
 !37 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__descriptor", i32 609, i64 32, i64 32, i64 128, i32 0, metadata !38} ; [ DW_TAG_member ]
 !38 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !39} ; [ DW_TAG_pointer_type ]
-!39 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"__block_descriptor_withcopydispose", i32 307, i64 128, i64 32, i32 0, i32 0, i32 0, metadata !41, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!39 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"__block_descriptor_withcopydispose", i32 307, i64 128, i64 32, i32 0, i32 0, null, metadata !41, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 307, size 128, align 32, offset 0] [def] [from ]
 !40 = metadata !{i32 786473, metadata !153} ; [ DW_TAG_file_type ]
 !41 = metadata !{metadata !42, metadata !44, metadata !45, metadata !47}
 !42 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"reserved", i32 307, i64 32, i64 32, i64 0, i32 0, metadata !43} ; [ DW_TAG_member ]
@@ -144,7 +145,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !47 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"DestroyFuncPtr", i32 307, i64 32, i64 32, i64 96, i32 0, metadata !46} ; [ DW_TAG_member ]
 !48 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"mydata", i32 609, i64 32, i64 32, i64 160, i32 0, metadata !49} ; [ DW_TAG_member ]
 !49 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 0, i64 0, i32 0, metadata !50} ; [ DW_TAG_pointer_type ]
-!50 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"", i32 0, i64 224, i64 0, i32 0, i32 16, i32 0, metadata !51, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!50 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"", i32 0, i64 224, i64 0, i32 0, i32 16, null, metadata !51, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 0, size 224, align 0, offset 0] [def] [from ]
 !51 = metadata !{metadata !52, metadata !53, metadata !54, metadata !55, metadata !56, metadata !57, metadata !58}
 !52 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__isa", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_member ]
 !53 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__forwarding", i32 0, i64 32, i64 32, i64 32, i32 0, metadata !32} ; [ DW_TAG_member ]
@@ -154,17 +155,17 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !57 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__destroy_helper", i32 0, i64 32, i64 32, i64 160, i32 0, metadata !32} ; [ DW_TAG_member ]
 !58 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"mydata", i32 0, i64 32, i64 32, i64 192, i32 0, metadata !59} ; [ DW_TAG_member ]
 !59 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !60} ; [ DW_TAG_pointer_type ]
-!60 = metadata !{i32 786451, metadata !154, metadata !24, metadata !"UIMydata", i32 26, i64 128, i64 32, i32 0, i32 0, i32 0, metadata !62, i32 16, i32 0} ; [ DW_TAG_structure_type ]
+!60 = metadata !{i32 786451, metadata !154, metadata !24, metadata !"UIMydata", i32 26, i64 128, i64 32, i32 0, i32 0, null, metadata !62, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [UIMydata] [line 26, size 128, align 32, offset 0] [def] [from ]
 !61 = metadata !{i32 786473, metadata !154} ; [ DW_TAG_file_type ]
 !62 = metadata !{metadata !63, metadata !71, metadata !75, metadata !79}
 !63 = metadata !{i32 786460, metadata !60, null, metadata !61, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
-!64 = metadata !{i32 786451, metadata !155, metadata !40, metadata !"NSO", i32 66, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !66, i32 16, i32 0} ; [ DW_TAG_structure_type ]
+!64 = metadata !{i32 786451, metadata !155, metadata !40, metadata !"NSO", i32 66, i64 32, i64 32, i32 0, i32 0, null, metadata !66, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [NSO] [line 66, size 32, align 32, offset 0] [def] [from ]
 !65 = metadata !{i32 786473, metadata !155} ; [ DW_TAG_file_type ]
 !66 = metadata !{metadata !67}
 !67 = metadata !{i32 786445, metadata !155, metadata !65, metadata !"isa", i32 67, i64 32, i64 32, i64 0, i32 2, metadata !68, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
 !68 = metadata !{i32 786454, metadata !153, metadata !0, metadata !"Class", i32 197, i64 0, i64 0, i64 0, i32 0, metadata !69} ; [ DW_TAG_typedef ]
 !69 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !70} ; [ DW_TAG_pointer_type ]
-!70 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, i32 0, null, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!70 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
 !71 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"_mydataRef", i32 28, i64 32, i64 32, i64 32, i32 0, metadata !72, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
 !72 = metadata !{i32 786454, metadata !152, metadata !0, metadata !"CFTypeRef", i32 313, i64 0, i64 0, i64 0, i32 0, metadata !73} ; [ DW_TAG_typedef ]
 !73 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !74} ; [ DW_TAG_pointer_type ]
@@ -174,7 +175,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !77 = metadata !{i32 786473, metadata !156} ; [ DW_TAG_file_type ]
 !78 = metadata !{i32 786468, null, metadata !0, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
 !79 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"_mydataFlags", i32 37, i64 8, i64 8, i64 96, i32 0, metadata !80, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!80 = metadata !{i32 786451, metadata !154, metadata !0, metadata !"", i32 30, i64 8, i64 8, i32 0, i32 0, i32 0, metadata !81, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!80 = metadata !{i32 786451, metadata !154, metadata !0, metadata !"", i32 30, i64 8, i64 8, i32 0, i32 0, null, metadata !81, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 30, size 8, align 8, offset 0] [def] [from ]
 !81 = metadata !{metadata !82, metadata !84, metadata !85, metadata !86, metadata !87, metadata !88}
 !82 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"named", i32 31, i64 1, i64 32, i64 0, i32 0, metadata !83} ; [ DW_TAG_member ]
 !83 = metadata !{i32 786468, null, metadata !0, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
@@ -185,10 +186,10 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !88 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"isCIMydata", i32 36, i64 1, i64 32, i64 7, i32 0, metadata !83} ; [ DW_TAG_member ]
 !89 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"self", i32 609, i64 32, i64 32, i64 192, i32 0, metadata !90} ; [ DW_TAG_member ]
 !90 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !91} ; [ DW_TAG_pointer_type ]
-!91 = metadata !{i32 786451, metadata !152, metadata !40, metadata !"MyWork", i32 36, i64 384, i64 32, i32 0, i32 0, i32 0, metadata !92, i32 16, i32 0} ; [ DW_TAG_structure_type ]
+!91 = metadata !{i32 786451, metadata !152, metadata !40, metadata !"MyWork", i32 36, i64 384, i64 32, i32 0, i32 0, null, metadata !92, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [MyWork] [line 36, size 384, align 32, offset 0] [def] [from ]
 !92 = metadata !{metadata !93, metadata !98, metadata !101, metadata !107, metadata !123}
 !93 = metadata !{i32 786460, metadata !152, metadata !91, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !94} ; [ DW_TAG_inheritance ]
-!94 = metadata !{i32 786451, metadata !157, metadata !40, metadata !"twork", i32 43, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !96, i32 16, i32 0} ; [ DW_TAG_structure_type ]
+!94 = metadata !{i32 786451, metadata !157, metadata !40, metadata !"twork", i32 43, i64 32, i64 32, i32 0, i32 0, null, metadata !96, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [twork] [line 43, size 32, align 32, offset 0] [def] [from ]
 !95 = metadata !{i32 786473, metadata !157} ; [ DW_TAG_file_type ]
 !96 = metadata !{metadata !97}
 !97 = metadata !{i32 786460, metadata !94, null, metadata !95, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
@@ -197,23 +198,23 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !100 = metadata !{i32 786468, null, metadata !0, metadata !"long long unsigned int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
 !101 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_library", i32 39, i64 32, i64 32, i64 96, i32 1, metadata !102, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
 !102 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !103} ; [ DW_TAG_pointer_type ]
-!103 = metadata !{i32 786451, metadata !158, metadata !40, metadata !"MyLibrary2", i32 22, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !105, i32 16, i32 0} ; [ DW_TAG_structure_type ]
+!103 = metadata !{i32 786451, metadata !158, metadata !40, metadata !"MyLibrary2", i32 22, i64 32, i64 32, i32 0, i32 0, null, metadata !105, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [MyLibrary2] [line 22, size 32, align 32, offset 0] [def] [from ]
 !104 = metadata !{i32 786473, metadata !158} ; [ DW_TAG_file_type ]
 !105 = metadata !{metadata !106}
 !106 = metadata !{i32 786460, metadata !103, null, metadata !104, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
 !107 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_bounds", i32 40, i64 128, i64 32, i64 128, i32 1, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
 !108 = metadata !{i32 786454, metadata !153, metadata !0, metadata !"CR", i32 33, i64 0, i64 0, i64 0, i32 0, metadata !109} ; [ DW_TAG_typedef ]
-!109 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CR", i32 29, i64 128, i64 32, i32 0, i32 0, i32 0, metadata !110, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!109 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CR", i32 29, i64 128, i64 32, i32 0, i32 0, null, metadata !110, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [CR] [line 29, size 128, align 32, offset 0] [def] [from ]
 !110 = metadata !{metadata !111, metadata !117}
 !111 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"origin", i32 30, i64 64, i64 32, i64 0, i32 0, metadata !112} ; [ DW_TAG_member ]
 !112 = metadata !{i32 786454, metadata !156, metadata !0, metadata !"CP", i32 17, i64 0, i64 0, i64 0, i32 0, metadata !113} ; [ DW_TAG_typedef ]
-!113 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CP", i32 13, i64 64, i64 32, i32 0, i32 0, i32 0, metadata !114, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!113 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CP", i32 13, i64 64, i64 32, i32 0, i32 0, null, metadata !114, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [CP] [line 13, size 64, align 32, offset 0] [def] [from ]
 !114 = metadata !{metadata !115, metadata !116}
 !115 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"x", i32 14, i64 32, i64 32, i64 0, i32 0, metadata !76} ; [ DW_TAG_member ]
 !116 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"y", i32 15, i64 32, i64 32, i64 32, i32 0, metadata !76} ; [ DW_TAG_member ]
 !117 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"size", i32 31, i64 64, i64 32, i64 64, i32 0, metadata !118} ; [ DW_TAG_member ]
 !118 = metadata !{i32 786454, metadata !156, metadata !0, metadata !"Size", i32 25, i64 0, i64 0, i64 0, i32 0, metadata !119} ; [ DW_TAG_typedef ]
-!119 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"Size", i32 21, i64 64, i64 32, i32 0, i32 0, i32 0, metadata !120, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!119 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"Size", i32 21, i64 64, i64 32, i32 0, i32 0, null, metadata !120, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Size] [line 21, size 64, align 32, offset 0] [def] [from ]
 !120 = metadata !{metadata !121, metadata !122}
 !121 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"width", i32 22, i64 32, i64 32, i64 0, i32 0, metadata !76} ; [ DW_TAG_member ]
 !122 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"height", i32 23, i64 32, i64 32, i64 32, i32 0, metadata !76} ; [ DW_TAG_member ]
@@ -221,7 +222,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !124 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"semi", i32 609, i64 32, i64 32, i64 224, i32 0, metadata !125} ; [ DW_TAG_member ]
 !125 = metadata !{i32 786454, metadata !152, metadata !0, metadata !"d_t", i32 35, i64 0, i64 0, i64 0, i32 0, metadata !126} ; [ DW_TAG_typedef ]
 !126 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !127} ; [ DW_TAG_pointer_type ]
-!127 = metadata !{i32 786451, metadata !159, metadata !0, metadata !"my_struct", i32 49, i64 0, i64 0, i32 0, i32 4, i32 0, null, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!127 = metadata !{i32 786451, metadata !159, metadata !0, metadata !"my_struct", i32 49, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [my_struct] [line 49, size 0, align 0, offset 0] [decl] [from ]
 !128 = metadata !{i32 786473, metadata !159} ; [ DW_TAG_file_type ]
 !129 = metadata !{i32 609, i32 144, metadata !23, null}
 !130 = metadata !{i32 786689, metadata !23, metadata !"loadedMydata", metadata !24, i32 33555041, metadata !59, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -256,3 +257,4 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
 !159 = metadata !{metadata !"header15.h", metadata !"/Volumes/Sandbox/llvm"}
 !160 = metadata !{metadata !"header.h", metadata !"/Volumes/Sandbox/llvm"}
 !161 = metadata !{metadata !"header2.h", metadata !"/Volumes/Sandbox/llvm"}
+!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
index 052fd22..8505f53 100644
--- a/test/CodeGen/ARM/debug-info-branch-folding.ll
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -38,23 +38,25 @@ declare i32 @printf(i8* nocapture, ...) nounwind
 
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
+!llvm.module.flags = !{!56}
+
 !0 = metadata !{i32 786478, metadata !54, null, metadata !"test0001", metadata !"test0001", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, <4 x float> (float)* @test0001, null, null, metadata !51, i32 0} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 786473, metadata !54} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !54, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !17, metadata !17, metadata !50, null,  null, null} ; [ DW_TAG_compile_unit ]
 !3 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786454, metadata !54, metadata !2, metadata !"v4f32", i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786433, metadata !54, metadata !2, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_vector_type ]
+!6 = metadata !{i32 786433, metadata !54, metadata !2, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
 !7 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786465, i64 0, i64 4}         ; [ DW_TAG_subrange_type ]
-!10 = metadata !{i32 786478, metadata !54, null, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**, i1)* @main, null, null, metadata !52, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!10 = metadata !{i32 786478, metadata !54, null, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**, i1)* @main, null, null, metadata !52, i32 0} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 0] [main]
+!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
 !13 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786478, metadata !55, null, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null, null, metadata !53, i32 0} ; [ DW_TAG_subprogram ]
+!14 = metadata !{i32 786478, metadata !55, null, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !53, i32 0} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [scope 0] [printFV]
 !15 = metadata !{i32 786473, metadata !55} ; [ DW_TAG_file_type ]
-!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null}
 !18 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
 !19 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -94,3 +96,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !53 = metadata !{metadata !30}
 !54 = metadata !{metadata !"build2.c", metadata !"/private/tmp"}
 !55 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp"}
+!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-d16-reg.ll b/test/CodeGen/ARM/debug-info-d16-reg.ll
index 11631ae..30a3e2d 100644
--- a/test/CodeGen/ARM/debug-info-d16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-d16-reg.ll
@@ -57,11 +57,12 @@ entry:
 declare i32 @puts(i8* nocapture) nounwind
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!48}
 
 !0 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"printer", metadata !"printer", metadata !"printer", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, double, i8)* @printer, null, null, metadata !43, i32 12} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !46, i32 1, metadata !"(LLVM build 00)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !42, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !6, metadata !7, metadata !8}
 !5 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
@@ -69,7 +70,7 @@ declare i32 @puts(i8* nocapture) nounwind
 !8 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
 !9 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"inlineprinter", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, double, i8)* @inlineprinter, null, null, metadata !44, i32 5} ; [ DW_TAG_subprogram ]
 !10 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 18, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !45, i32 18} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!11 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !5, metadata !5, metadata !13}
 !13 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
 !14 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !15} ; [ DW_TAG_pointer_type ]
@@ -106,3 +107,4 @@ declare i32 @puts(i8* nocapture) nounwind
 !45 = metadata !{metadata !22, metadata !23, metadata !24}
 !46 = metadata !{metadata !"a.c", metadata !"/tmp/"}
 !47 = metadata !{i32 0}
+!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
index af61f6c..ee515fd5 100644
--- a/test/CodeGen/ARM/debug-info-qreg.ll
+++ b/test/CodeGen/ARM/debug-info-qreg.ll
@@ -36,24 +36,25 @@ declare i32 @printf(i8* nocapture, ...) nounwind
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!56}
 
-!0 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"test0001", metadata !"test0001", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, <4 x float> (float)* @test0001, null, null, metadata !51, i32 3} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"test0001", metadata !"test0001", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, <4 x float> (float)* @test0001, null, null, metadata !51, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [test0001]
 !1 = metadata !{i32 786473, metadata !54} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !54, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !17, metadata !17, metadata !50, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786454, metadata !54, metadata !2, metadata !"v4f32", i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786433, metadata !2, metadata !"", metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_vector_type ]
+!6 = metadata !{i32 786433, metadata !2, null, metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
 !7 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786465, i64 0, i64 4}         ; [ DW_TAG_subrange_type ]
-!10 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !52, i32 59} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!10 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !52, i32 59} ; [ DW_TAG_subprogram ] [line 59] [def] [main]
+!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
 !13 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786478, metadata !55, metadata !15, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null, null, metadata !53, i32 41} ; [ DW_TAG_subprogram ]
+!14 = metadata !{i32 786478, metadata !55, metadata !15, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !53, i32 41} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [printFV]
 !15 = metadata !{i32 786473, metadata !55} ; [ DW_TAG_file_type ]
-!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null}
 !18 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
 !19 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -70,7 +71,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !30 = metadata !{i32 786689, metadata !14, metadata !"F", metadata !15, i32 16777257, metadata !31, i32 0, null} ; [ DW_TAG_arg_variable ]
 !31 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
 !32 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"FV", i32 25, i64 0, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_typedef ]
-!33 = metadata !{i32 786455, metadata !55, metadata !2, metadata !"", i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, i32 0} ; [ DW_TAG_union_type ]
+!33 = metadata !{i32 786455, metadata !55, metadata !2, metadata !"", i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, null} ; [ DW_TAG_union_type ]
 !34 = metadata !{metadata !35, metadata !37}
 !35 = metadata !{i32 786445, metadata !55, metadata !15, metadata !"V", i32 23, i64 128, i64 128, i64 0, i32 0, metadata !36} ; [ DW_TAG_member ]
 !36 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"v4sf", i32 3, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
@@ -93,3 +94,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !53 = metadata !{metadata !30}
 !54 = metadata !{metadata !"build2.c", metadata !"/private/tmp"}
 !55 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp"}
+!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll
index 83e7dac..e92d977 100644
--- a/test/CodeGen/ARM/debug-info-s16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-s16-reg.ll
@@ -62,15 +62,16 @@ declare i32 @puts(i8* nocapture) nounwind optsize
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!53}
 
-!0 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i8*, float, i8)* @inlineprinter, null, null, metadata !48, i32 5} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, float, i8)* @inlineprinter, null, null, metadata !48, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [inlineprinter]
 !1 = metadata !{i32 786473, metadata !51} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !51, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !52, metadata !52, metadata !47, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !51, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !51, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"printer", metadata !"printer", metadata !"", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i8*, float, i8)* @printer, null, null, metadata !49, i32 12} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !50, i32 18} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"printer", metadata !"printer", metadata !"", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, float, i8)* @printer, null, null, metadata !49, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [printer]
+!7 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !50, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
 !8 = metadata !{i32 786689, metadata !0, metadata !"ptr", metadata !1, i32 16777220, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
 !9 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
 !10 = metadata !{i32 786689, metadata !0, metadata !"val", metadata !1, i32 33554436, metadata !11, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -116,3 +117,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !50 = metadata !{metadata !17, metadata !18, metadata !22}
 !51 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
 !52 = metadata !{i32 0}
+!53 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
index cc2e831..854fcab 100644
--- a/test/CodeGen/ARM/debug-info-sreg2.ll
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll
@@ -40,11 +40,12 @@ declare float @_Z2f3f(float) optsize
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!20}
 
 !0 = metadata !{i32 786449, metadata !18, i32 4, metadata !"clang version 3.0 (trunk 130845)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !16, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @_Z3foov, null, null, metadata !17, i32 5} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_Z3foov, null, null, metadata !17, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
 !2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 786688, metadata !6, metadata !"k", metadata !2, i32 6, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
 !6 = metadata !{i32 786443, metadata !18, metadata !1, i32 5, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
@@ -61,3 +62,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !17 = metadata !{metadata !5, metadata !8}
 !18 = metadata !{metadata !"k.cc", metadata !"/private/tmp"}
 !19 = metadata !{i32 0}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll
index 06d6172..7be0c79 100644
--- a/test/CodeGen/ARM/divmod.ll
+++ b/test/CodeGen/ARM/divmod.ll
@@ -60,7 +60,7 @@ bb:
   %3 = load i32* @tabsize, align 4
   %4 = srem i32 %cols, %3
   %5 = sdiv i32 %cols, %3
-  %6 = tail call i32 @llvm.objectsize.i32(i8* null, i1 false)
+  %6 = tail call i32 @llvm.objectsize.i32.p0i8(i8* null, i1 false)
   %7 = tail call i8* @__memset_chk(i8* null, i32 9, i32 %5, i32 %6) nounwind
   br label %bb1
 
@@ -71,7 +71,7 @@ bb1:
   ret void
 }
 
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readnone
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readnone
 declare i8* @__memset_chk(i8*, i32, i32, i32) nounwind
 
 ; rdar://11714607
diff --git a/test/CodeGen/ARM/fast-isel-align.ll b/test/CodeGen/ARM/fast-isel-align.ll
index 4e28a10..9c9a188 100644
--- a/test/CodeGen/ARM/fast-isel-align.ll
+++ b/test/CodeGen/ARM/fast-isel-align.ll
@@ -1,22 +1,22 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
-
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
-
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-
-; RUN: llc < %s -O0  -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-; RUN: llc < %s -O0  -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
-; RUN: llc < %s -O0 -arm-no-strict-align -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -arm-no-strict-align -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+
+; RUN: llc < %s -O0  -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0  -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-no-strict-align -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -arm-no-strict-align -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
 
 ; Check unaligned stores
 %struct.anon = type <{ float }>
diff --git a/test/CodeGen/ARM/fast-isel-binary.ll b/test/CodeGen/ARM/fast-isel-binary.ll
index 3159627..e1a2a4f 100644
--- a/test/CodeGen/ARM/fast-isel-binary.ll
+++ b/test/CodeGen/ARM/fast-isel-binary.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
 
 ; Test add with non-legal types
 
diff --git a/test/CodeGen/ARM/fast-isel-br-phi.ll b/test/CodeGen/ARM/fast-isel-br-phi.ll
index a0aba69..3b9d465 100644
--- a/test/CodeGen/ARM/fast-isel-br-phi.ll
+++ b/test/CodeGen/ARM/fast-isel-br-phi.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios
 
 ; This test ensures HandlePHINodesInSuccessorBlocks() is able to promote basic
 ; non-legal integer types (i.e., i1, i8, i16).
diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll
index d10a381..917a15d 100644
--- a/test/CodeGen/ARM/fast-isel-call.ll
+++ b/test/CodeGen/ARM/fast-isel-call.ll
@@ -1,12 +1,14 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=THUMB-NOVFP
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=THUMB-NOVFP
+
+; XFAIL: vg_leak
 
 ; Note that some of these tests assume that relocations are either
 ; movw/movt or constant pool loads. Different platforms will select
diff --git a/test/CodeGen/ARM/fast-isel-cmp-imm.ll b/test/CodeGen/ARM/fast-isel-cmp-imm.ll
index 45ef4ed..55baf48 100644
--- a/test/CodeGen/ARM/fast-isel-cmp-imm.ll
+++ b/test/CodeGen/ARM/fast-isel-cmp-imm.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
 
 define void @t1a(float %a) uwtable ssp {
 entry:
diff --git a/test/CodeGen/ARM/fast-isel-conversion.ll b/test/CodeGen/ARM/fast-isel-conversion.ll
index e40891a..5983493 100644
--- a/test/CodeGen/ARM/fast-isel-conversion.ll
+++ b/test/CodeGen/ARM/fast-isel-conversion.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -verify-machineinstrs -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -verify-machineinstrs -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -verify-machineinstrs -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
 
 ; Test sitofp
 
diff --git a/test/CodeGen/ARM/fast-isel-ext.ll b/test/CodeGen/ARM/fast-isel-ext.ll
index 15d0d3c..de0dd19 100644
--- a/test/CodeGen/ARM/fast-isel-ext.ll
+++ b/test/CodeGen/ARM/fast-isel-ext.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=v7
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=v7
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv4t-apple-ios | FileCheck %s --check-prefix=prev6
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv4t-linux-gnueabi | FileCheck %s --check-prefix=prev6
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv5-apple-ios | FileCheck %s --check-prefix=prev6
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv5-linux-gnueabi | FileCheck %s --check-prefix=prev6
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=v7
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=v7
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=v7
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv4t-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=prev6
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv4t-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=prev6
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv5-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=prev6
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv5-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=prev6
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=v7
 
 ; Can't test pre-ARMv6 Thumb because ARM FastISel currently only supports
 ; Thumb2. The ARMFastISel::ARMEmitIntExt code should work for Thumb by always
diff --git a/test/CodeGen/ARM/fast-isel-icmp.ll b/test/CodeGen/ARM/fast-isel-icmp.ll
index 3dc1109..85f449e 100644
--- a/test/CodeGen/ARM/fast-isel-icmp.ll
+++ b/test/CodeGen/ARM/fast-isel-icmp.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
 
 define i32 @icmp_i16_signed(i16 %a, i16 %b) nounwind {
 entry:
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index 572ac3a..b08b72b 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-LONG
+
+; XFAIL: vg_leak
 
 ; Note that some of these tests assume that relocations are either
 ; movw/movt or constant pool loads. Different platforms will select
diff --git a/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll b/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll
index 2a88678..d9c9cc4 100644
--- a/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll
+++ b/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
 
 define i32 @t1(i32* nocapture %ptr) nounwind readonly {
 entry:
 ; THUMB: t1
   %add.ptr = getelementptr inbounds i32* %ptr, i32 -1
-  %0 = load i32* %add.ptr, align 4, !tbaa !0
+  %0 = load i32* %add.ptr, align 4
 ; THUMB: ldr r{{[0-9]}}, [r0, #-4]
   ret i32 %0
 }
@@ -13,7 +13,7 @@ define i32 @t2(i32* nocapture %ptr) nounwind readonly {
 entry:
 ; THUMB: t2
   %add.ptr = getelementptr inbounds i32* %ptr, i32 -63
-  %0 = load i32* %add.ptr, align 4, !tbaa !0
+  %0 = load i32* %add.ptr, align 4
 ; THUMB: ldr r{{[0-9]}}, [r0, #-252]
   ret i32 %0
 }
@@ -22,7 +22,7 @@ define i32 @t3(i32* nocapture %ptr) nounwind readonly {
 entry:
 ; THUMB: t3
   %add.ptr = getelementptr inbounds i32* %ptr, i32 -64
-  %0 = load i32* %add.ptr, align 4, !tbaa !0
+  %0 = load i32* %add.ptr, align 4
 ; THUMB: ldr r{{[0-9]}}, [r0]
   ret i32 %0
 }
@@ -31,7 +31,7 @@ define zeroext i16 @t4(i16* nocapture %ptr) nounwind readonly {
 entry:
 ; THUMB: t4
   %add.ptr = getelementptr inbounds i16* %ptr, i32 -1
-  %0 = load i16* %add.ptr, align 2, !tbaa !3
+  %0 = load i16* %add.ptr, align 2
 ; THUMB: ldrh r{{[0-9]}}, [r0, #-2]
   ret i16 %0
 }
@@ -40,7 +40,7 @@ define zeroext i16 @t5(i16* nocapture %ptr) nounwind readonly {
 entry:
 ; THUMB: t5
   %add.ptr = getelementptr inbounds i16* %ptr, i32 -127
-  %0 = load i16* %add.ptr, align 2, !tbaa !3
+  %0 = load i16* %add.ptr, align 2
 ; THUMB: ldrh r{{[0-9]}}, [r0, #-254]
   ret i16 %0
 }
@@ -49,7 +49,7 @@ define zeroext i16 @t6(i16* nocapture %ptr) nounwind readonly {
 entry:
 ; THUMB: t6
   %add.ptr = getelementptr inbounds i16* %ptr, i32 -128
-  %0 = load i16* %add.ptr, align 2, !tbaa !3
+  %0 = load i16* %add.ptr, align 2
 ; THUMB: ldrh r{{[0-9]}}, [r0]
   ret i16 %0
 }
@@ -58,7 +58,7 @@ define zeroext i8 @t7(i8* nocapture %ptr) nounwind readonly {
 entry:
 ; THUMB: t7
   %add.ptr = getelementptr inbounds i8* %ptr, i32 -1
-  %0 = load i8* %add.ptr, align 1, !tbaa !1
+  %0 = load i8* %add.ptr, align 1
 ; THUMB: ldrb r{{[0-9]}}, [r0, #-1]
   ret i8 %0
 }
@@ -67,7 +67,7 @@ define zeroext i8 @t8(i8* nocapture %ptr) nounwind readonly {
 entry:
 ; THUMB: t8
   %add.ptr = getelementptr inbounds i8* %ptr, i32 -255
-  %0 = load i8* %add.ptr, align 1, !tbaa !1
+  %0 = load i8* %add.ptr, align 1
 ; THUMB: ldrb r{{[0-9]}}, [r0, #-255]
   ret i8 %0
 }
@@ -76,7 +76,7 @@ define zeroext i8 @t9(i8* nocapture %ptr) nounwind readonly {
 entry:
 ; THUMB: t9
   %add.ptr = getelementptr inbounds i8* %ptr, i32 -256
-  %0 = load i8* %add.ptr, align 1, !tbaa !1
+  %0 = load i8* %add.ptr, align 1
 ; THUMB: ldrb r{{[0-9]}}, [r0]
   ret i8 %0
 }
@@ -85,7 +85,7 @@ define void @t10(i32* nocapture %ptr) nounwind {
 entry:
 ; THUMB: t10
   %add.ptr = getelementptr inbounds i32* %ptr, i32 -1
-  store i32 0, i32* %add.ptr, align 4, !tbaa !0
+  store i32 0, i32* %add.ptr, align 4
 ; THUMB: str r{{[0-9]}}, [r0, #-4]
   ret void
 }
@@ -94,7 +94,7 @@ define void @t11(i32* nocapture %ptr) nounwind {
 entry:
 ; THUMB: t11
   %add.ptr = getelementptr inbounds i32* %ptr, i32 -63
-  store i32 0, i32* %add.ptr, align 4, !tbaa !0
+  store i32 0, i32* %add.ptr, align 4
 ; THUMB: str r{{[0-9]}}, [r0, #-252]
   ret void
 }
@@ -103,7 +103,7 @@ define void @t12(i32* nocapture %ptr) nounwind {
 entry:
 ; THUMB: t12
   %add.ptr = getelementptr inbounds i32* %ptr, i32 -64
-  store i32 0, i32* %add.ptr, align 4, !tbaa !0
+  store i32 0, i32* %add.ptr, align 4
 ; THUMB: str r{{[0-9]}}, [r0]
   ret void
 }
@@ -112,7 +112,7 @@ define void @t13(i16* nocapture %ptr) nounwind {
 entry:
 ; THUMB: t13
   %add.ptr = getelementptr inbounds i16* %ptr, i32 -1
-  store i16 0, i16* %add.ptr, align 2, !tbaa !3
+  store i16 0, i16* %add.ptr, align 2
 ; THUMB: strh r{{[0-9]}}, [r0, #-2]
   ret void
 }
@@ -121,7 +121,7 @@ define void @t14(i16* nocapture %ptr) nounwind {
 entry:
 ; THUMB: t14
   %add.ptr = getelementptr inbounds i16* %ptr, i32 -127
-  store i16 0, i16* %add.ptr, align 2, !tbaa !3
+  store i16 0, i16* %add.ptr, align 2
 ; THUMB: strh r{{[0-9]}}, [r0, #-254]
   ret void
 }
@@ -130,7 +130,7 @@ define void @t15(i16* nocapture %ptr) nounwind {
 entry:
 ; THUMB: t15
   %add.ptr = getelementptr inbounds i16* %ptr, i32 -128
-  store i16 0, i16* %add.ptr, align 2, !tbaa !3
+  store i16 0, i16* %add.ptr, align 2
 ; THUMB: strh r{{[0-9]}}, [r0]
   ret void
 }
@@ -139,7 +139,7 @@ define void @t16(i8* nocapture %ptr) nounwind {
 entry:
 ; THUMB: t16
   %add.ptr = getelementptr inbounds i8* %ptr, i32 -1
-  store i8 0, i8* %add.ptr, align 1, !tbaa !1
+  store i8 0, i8* %add.ptr, align 1
 ; THUMB: strb r{{[0-9]}}, [r0, #-1]
   ret void
 }
@@ -148,7 +148,7 @@ define void @t17(i8* nocapture %ptr) nounwind {
 entry:
 ; THUMB: t17
   %add.ptr = getelementptr inbounds i8* %ptr, i32 -255
-  store i8 0, i8* %add.ptr, align 1, !tbaa !1
+  store i8 0, i8* %add.ptr, align 1
 ; THUMB: strb r{{[0-9]}}, [r0, #-255]
   ret void
 }
@@ -157,12 +157,7 @@ define void @t18(i8* nocapture %ptr) nounwind {
 entry:
 ; THUMB: t18
   %add.ptr = getelementptr inbounds i8* %ptr, i32 -256
-  store i8 0, i8* %add.ptr, align 1, !tbaa !1
+  store i8 0, i8* %add.ptr, align 1
 ; THUMB: strb r{{[0-9]}}, [r0]
   ret void
 }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/ARM/fast-isel-pic.ll b/test/CodeGen/ARM/fast-isel-pic.ll
index ad0f159..838c103 100644
--- a/test/CodeGen/ARM/fast-isel-pic.ll
+++ b/test/CodeGen/ARM/fast-isel-pic.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=pic -mtriple=arm-apple-ios | FileCheck %s --check-prefix=ARM
 ; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARMv7
 ; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=pic -mtriple=thumbv7-none-linux-gnueabi | FileCheck %s --check-prefix=THUMB-ELF
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=armv7-none-linux-gnueabi | FileCheck %s --check-prefix=ARMv7-ELF
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=pic -mtriple=armv7-none-linux-gnueabi | FileCheck %s --check-prefix=ARMv7-ELF
 
 @g = global i32 0, align 4
 
@@ -25,6 +25,8 @@ entry:
 ; ARMv7: add  [[reg2]], pc, [[reg2]]
 ; ARMv7-ELF: LoadGV
 ; ARMv7-ELF: ldr r[[reg2:[0-9]+]],
+; ARMv7-ELF: .LPC
+; ARMv7-ELF-NEXT: add r[[reg2]], pc
 ; ARMv7-ELF: ldr r[[reg3:[0-9]+]],
 ; ARMv7-ELF: ldr r[[reg2]], [r[[reg3]], r[[reg2]]]
   %tmp = load i32* @g
@@ -54,6 +56,8 @@ entry:
 ; ARMv7: ldr  r[[reg5]], [r[[reg5]]]
 ; ARMv7-ELF: LoadIndirectSymbol
 ; ARMv7-ELF: ldr r[[reg5:[0-9]+]],
+; ARMv7-ELF: .LPC
+; ARMv7-ELF-NEXT: add r[[reg5]], pc
 ; ARMv7-ELF: ldr r[[reg6:[0-9]+]],
 ; ARMv7-ELF: ldr r[[reg5]], [r[[reg6]], r[[reg5]]]
   %tmp = load i32* @i
diff --git a/test/CodeGen/ARM/fast-isel-ret.ll b/test/CodeGen/ARM/fast-isel-ret.ll
index ba5412c..8a68309 100644
--- a/test/CodeGen/ARM/fast-isel-ret.ll
+++ b/test/CodeGen/ARM/fast-isel-ret.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s
 
 ; Sign-extend of i1 currently not supported by fast-isel
 ;define signext i1 @ret0(i1 signext %a) nounwind uwtable ssp {
diff --git a/test/CodeGen/ARM/fast-isel-select.ll b/test/CodeGen/ARM/fast-isel-select.ll
index bb88814..40f8807 100644
--- a/test/CodeGen/ARM/fast-isel-select.ll
+++ b/test/CodeGen/ARM/fast-isel-select.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
 ; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv8-apple-ios | FileCheck %s --check-prefix=THUMB
 
 define i32 @t1(i1 %c) nounwind readnone {
 entry:
@@ -39,15 +40,16 @@ define i32 @t3(i1 %c, i32 %a, i32 %b) nounwind readnone {
 entry:
 ; ARM: t3
 ; ARM: cmp r0, #0
-; ARM: movne r{{[1-9]}}, r{{[1-9]}}
-; ARM: mov r0, r{{[1-9]}}
+; ARM: movne r2, r1
+; ARM: add r0, r2, r1
 ; THUMB: t3
 ; THUMB: cmp r0, #0
 ; THUMB: it ne
-; THUMB: movne r{{[1-9]}}, r{{[1-9]}}
-; THUMB: mov r0, r{{[1-9]}}
+; THUMB: movne r2, r1
+; THUMB: add.w r0, r2, r1
   %0 = select i1 %c, i32 %a, i32 %b
-  ret i32 %0
+  %1 = add i32 %0, %a
+  ret i32 %1
 }
 
 define i32 @t4(i1 %c) nounwind readnone {
diff --git a/test/CodeGen/ARM/fast-isel-shifter.ll b/test/CodeGen/ARM/fast-isel-shifter.ll
index dbb1ce2..eb4b2b2 100644
--- a/test/CodeGen/ARM/fast-isel-shifter.ll
+++ b/test/CodeGen/ARM/fast-isel-shifter.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
 
 define i32 @shl() nounwind ssp {
 entry:
diff --git a/test/CodeGen/ARM/fast-isel-static.ll b/test/CodeGen/ARM/fast-isel-static.ll
index 7d86cb9..93c14a0 100644
--- a/test/CodeGen/ARM/fast-isel-static.ll
+++ b/test/CodeGen/ARM/fast-isel-static.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=LONG %s
-; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=LONG %s
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=NORM %s
-; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=NORM %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
 
 define void @myadd(float* %sum, float* %addend) nounwind {
 entry:
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index 0cebc90..5981cab 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
 
 ; Very basic fast-isel functionality.
 define i32 @test0(i32 %a, i32 %b) nounwind {
diff --git a/test/CodeGen/ARM/fastisel-gep-promote-before-add.ll b/test/CodeGen/ARM/fastisel-gep-promote-before-add.ll
new file mode 100644
index 0000000..a32ab6d
--- /dev/null
+++ b/test/CodeGen/ARM/fastisel-gep-promote-before-add.ll
@@ -0,0 +1,18 @@
+; fastisel should not fold add with non-pointer bitwidth
+; sext(a) + sext(b) != sext(a + b)
+; RUN: llc -mtriple=armv7-apple-ios %s -O0 -o - | FileCheck %s
+
+define zeroext i8 @gep_promotion(i8* %ptr) nounwind uwtable ssp {
+entry:
+  %ptr.addr = alloca i8*, align 8
+  %add = add i8 64, 64 ; 0x40 + 0x40
+  %0 = load i8** %ptr.addr, align 8
+
+  ; CHECK-LABEL: _gep_promotion:
+  ; CHECK: ldrb {{r[0-9]+}}, {{\[r[0-9]+\]}}
+  %arrayidx = getelementptr inbounds i8* %0, i8 %add
+
+  %1 = load i8* %arrayidx, align 1
+  ret i8 %1
+}
+
diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll
new file mode 100644
index 0000000..67fd129
--- /dev/null
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -0,0 +1,164 @@
+; RUN: llc -mtriple=thumbv7-apple-darwin-eabi < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv6m-apple-darwin-eabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1
+; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS
+
+
+declare void @bar(i8*)
+
+%bigVec = type [2 x double]
+
+@var = global %bigVec zeroinitializer
+
+define void @check_simple() minsize {
+; CHECK-LABEL: check_simple:
+; CHECK: push.w {r7, r8, r9, r10, r11, lr}
+; CHECK-NOT: sub sp, sp,
+; ...
+; CHECK-NOT: add sp, sp,
+; CHECK: pop.w {r0, r1, r2, r3, r11, pc}
+
+; CHECK-T1-LABEL: check_simple:
+; CHECK-T1: push {r3, r4, r5, r6, r7, lr}
+; CHECK-T1: add r7, sp, #16
+; CHECK-T1-NOT: sub sp, sp,
+; ...
+; CHECK-T1-NOT: add sp, sp,
+; CHECK-T1: pop {r0, r1, r2, r3, r7, pc}
+
+  ; iOS always has a frame pointer and messing with the push affects
+  ; how it's set in the prologue. Make sure we get that right.
+; CHECK-IOS-LABEL: check_simple:
+; CHECK-IOS: push {r3, r4, r5, r6, r7, lr}
+; CHECK-NOT: sub sp,
+; CHECK-IOS: add r7, sp, #16
+; CHECK-NOT: sub sp,
+; ...
+; CHECK-NOT: add sp,
+; CHEC: pop {r3, r4, r5, r6, r7, pc}
+
+  %var = alloca i8, i32 16
+  call void @bar(i8* %var)
+  ret void
+}
+
+define void @check_simple_too_big() minsize {
+; CHECK-LABEL: check_simple_too_big:
+; CHECK: push.w {r11, lr}
+; CHECK: sub sp,
+; ...
+; CHECK: add sp,
+; CHECK: pop.w {r11, pc}
+  %var = alloca i8, i32 64
+  call void @bar(i8* %var)
+  ret void
+}
+
+define void @check_vfp_fold() minsize {
+; CHECK-LABEL: check_vfp_fold:
+; CHECK: push {r[[GLOBREG:[0-9]+]], lr}
+; CHECK: vpush {d6, d7, d8, d9}
+; CHECK-NOT: sub sp,
+; ...
+; CHECK: vldmia r[[GLOBREG]], {d8, d9}
+; ...
+; CHECK-NOT: add sp,
+; CHECK: vpop {d6, d7, d8, d9}
+; CHECKL pop {r[[GLOBREG]], pc}
+
+  ; iOS uses aligned NEON stores here, which is convenient since we
+  ; want to make sure that works too.
+; CHECK-IOS-LABEL: check_vfp_fold:
+; CHECK-IOS: push {r0, r1, r2, r3, r4, r7, lr}
+; CHECK-IOS: sub.w r4, sp, #16
+; CHECK-IOS: bic r4, r4, #15
+; CHECK-IOS: mov sp, r4
+; CHECK-IOS: vst1.64 {d8, d9}, [r4:128]
+; ...
+; CHECK-IOS: add r4, sp, #16
+; CHECK-IOS: vld1.64 {d8, d9}, [r4:128]
+; CHECK-IOS: mov sp, r4
+; CHECK-IOS: pop {r4, r7, pc}
+
+  %var = alloca i8, i32 16
+
+  %tmp = load %bigVec* @var
+  call void @bar(i8* %var)
+  store %bigVec %tmp, %bigVec* @var
+
+  ret void
+}
+
+; This function should use just enough space that the "add sp, sp, ..." could be
+; folded in except that doing so would clobber the value being returned.
+define i64 @check_no_return_clobber() minsize {
+; CHECK-LABEL: check_no_return_clobber:
+; CHECK: push.w {r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NOT: sub sp,
+; ...
+; CHECK: add sp, #40
+; CHECK: pop.w {r11, pc}
+
+  ; Just to keep iOS FileCheck within previous function:
+; CHECK-IOS-LABEL: check_no_return_clobber:
+
+  %var = alloca i8, i32 40
+  call void @bar(i8* %var)
+  ret i64 0
+}
+
+define arm_aapcs_vfpcc double @check_vfp_no_return_clobber() minsize {
+; CHECK-LABEL: check_vfp_no_return_clobber:
+; CHECK: push {r[[GLOBREG:[0-9]+]], lr}
+; CHECK: vpush {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9}
+; CHECK-NOT: sub sp,
+; ...
+; CHECK: add sp, #64
+; CHECK: vpop {d8, d9}
+; CHECK: pop {r[[GLOBREG]], pc}
+
+  %var = alloca i8, i32 64
+
+  %tmp = load %bigVec* @var
+  call void @bar(i8* %var)
+  store %bigVec %tmp, %bigVec* @var
+
+  ret double 1.0
+}
+
+@dbl = global double 0.0
+
+; PR18136: there was a bug determining where the first eligible pop in a
+; basic-block was when the entire block was epilogue code.
+define void @test_fold_point(i1 %tst) minsize {
+; CHECK-LABEL: test_fold_point:
+
+  ; Important to check for beginning of basic block, because if it gets
+  ; if-converted the test is probably no longer checking what it should.
+; CHECK: {{LBB[0-9]+_2}}:
+; CHECK-NEXT: vpop {d7, d8}
+; CHECK-NEXT: pop {r4, pc}
+
+  ; With a guaranteed frame-pointer, we want to make sure that its offset in the
+  ; push block is correct, even if a few registers have been tacked onto a later
+  ; vpush (PR18160).
+; CHECK-IOS-LABEL: test_fold_point:
+; CHECK-IOS: push {r4, r7, lr}
+; CHECK-IOS-NEXT: add r7, sp, #4
+; CHECK-IOS-NEXT: vpush {d7, d8}
+
+  ; We want some memory so there's a stack adjustment to fold...
+  %var = alloca i8, i32 8
+
+  ; We want a long-lived floating register so that a callee-saved dN is used and
+  ; there's both a vpop and a pop.
+  %live_val = load double* @dbl
+  br i1 %tst, label %true, label %end
+true:
+  call void @bar(i8* %var)
+  store double %live_val, double* @dbl
+  br label %end
+end:
+  ; We want the epilogue to be the only thing in a basic block so that we hit
+  ; the correct edge-case (first inst in block is correct one to adjust).
+  ret void
+}
+\ No newline at end of file
diff --git a/test/CodeGen/ARM/ifconv-kills.ll b/test/CodeGen/ARM/ifconv-kills.ll
new file mode 100644
index 0000000..bf54ba2
--- /dev/null
+++ b/test/CodeGen/ARM/ifconv-kills.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -march arm -mcpu swift -verify-machineinstrs
+
+declare i32 @f(i32 %p0, i32 %p1)
+
+define i32 @foo(i32* %ptr) {
+entry:
+  %cmp = icmp ne i32* %ptr, null
+  br i1 %cmp, label %if.then, label %if.else
+
+; present something which can be easily if-converted
+if.then:
+  ; %R0 should be killed here
+  %valt = load i32* %ptr, align 4
+  br label %return
+
+if.else:
+  ; %R0 should be killed here, however after if-conversion the %R0 kill
+  ; has to be removed because if.then will follow after this and still
+  ; read it.
+  %addr = getelementptr inbounds i32* %ptr, i32 4
+  %vale = load i32* %addr, align 4
+  br label %return
+
+return:
+  %phival = phi i32 [ %valt, %if.then ], [ %vale, %if.else ]
+  ; suggest to bring %phival/%valt/%vale into %R1 (because otherwise there
+  ; will be no kills in if.then/if.else)
+  %retval = call i32 @f (i32 0, i32 %phival)
+  ret i32 %retval
+}
diff --git a/test/CodeGen/ARM/ifconv-regmask.ll b/test/CodeGen/ARM/ifconv-regmask.ll
new file mode 100644
index 0000000..d45f65f
--- /dev/null
+++ b/test/CodeGen/ARM/ifconv-regmask.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mtriple=thumbv7s-apple-ios6.0.0 -verify-machineinstrs
+
+%union.opcode = type { i32 }
+
+@opcode = external global %union.opcode, align 4
+
+; Function Attrs: nounwind ssp
+define i32 @sfu() {
+entry:
+  %bf.load = load i32* getelementptr inbounds (%union.opcode* @opcode, i32 0, i32 0), align 4
+  %bf.lshr = lshr i32 %bf.load, 26
+  %bf.clear = and i32 %bf.lshr, 7
+  switch i32 %bf.clear, label %return [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+  ]
+
+sw.bb:                                            ; preds = %entry
+  %call = tail call i32 @func0()
+  br label %return
+
+sw.bb1:                                           ; preds = %entry
+  %call2 = tail call i32 @func1()
+  br label %return
+
+return:                                           ; preds = %sw.bb1, %sw.bb, %entry
+  %retval.0 = phi i32 [ %call2, %sw.bb1 ], [ %call, %sw.bb ], [ -1, %entry ]
+  ret i32 %retval.0
+}
+
+; Function Attrs: nounwind ssp
+declare i32 @func0()
+
+; Function Attrs: nounwind ssp
+declare i32 @func1()
diff --git a/test/CodeGen/ARM/indirectbr.ll b/test/CodeGen/ARM/indirectbr.ll
index 99e84a6..1aeeb91 100644
--- a/test/CodeGen/ARM/indirectbr.ll
+++ b/test/CodeGen/ARM/indirectbr.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -relocation-model=pic -mtriple=armv6-apple-darwin | FileCheck %s -check-prefix=ARM
 ; RUN: llc < %s -relocation-model=pic -mtriple=thumbv6-apple-darwin | FileCheck %s -check-prefix=THUMB
 ; RUN: llc < %s -relocation-model=static -mtriple=thumbv7-apple-darwin | FileCheck %s -check-prefix=THUMB2
+; RUN: llc < %s -relocation-model=static -mtriple=thumbv8-apple-darwin | FileCheck %s -check-prefix=THUMB2
 
 @nextaddr = global i8* null                       ; <i8**> [#uses=2]
 @C.0.2070 = private constant [5 x i8*] [i8* blockaddress(@foo, %L1), i8* blockaddress(@foo, %L2), i8* blockaddress(@foo, %L3), i8* blockaddress(@foo, %L4), i8* blockaddress(@foo, %L5)] ; <[5 x i8*]*> [#uses=1]
@@ -48,14 +49,17 @@ L2:                                               ; preds = %L3, %bb2
 
 L1:                                               ; preds = %L2, %bb2
   %res.3 = phi i32 [ %phitmp, %L2 ], [ 2, %bb2 ]  ; <i32> [#uses=1]
+; ARM-LABEL: %L1
 ; ARM: ldr [[R1:r[0-9]+]], LCPI
 ; ARM: add [[R1b:r[0-9]+]], pc, [[R1]]
 ; ARM: str [[R1b]]
+; THUMB-LABEL: %L1
 ; THUMB: ldr
 ; THUMB: add
 ; THUMB: ldr [[R2:r[0-9]+]], LCPI
 ; THUMB: add [[R2]], pc
 ; THUMB: str [[R2]]
+; THUMB2-LABEL: %L1
 ; THUMB2: ldr [[R2:r[0-9]+]], LCPI
 ; THUMB2-NEXT: str{{(.w)?}} [[R2]]
   store i8* blockaddress(@foo, %L5), i8** @nextaddr, align 4
diff --git a/test/CodeGen/ARM/inlineasm-64bit.ll b/test/CodeGen/ARM/inlineasm-64bit.ll
index b23db10..683a0c4 100644
--- a/test/CodeGen/ARM/inlineasm-64bit.ll
+++ b/test/CodeGen/ARM/inlineasm-64bit.ll
@@ -85,3 +85,22 @@ define void @strd_test(i64* %p, i32 %lo, i32 %hi) nounwind {
   tail call void asm sideeffect "strd $0, ${0:H}, [$1]", "r,r"(i64 %4, i64* %p) nounwind
   ret void
 }
+
+; Make sure we don't untie operands by mistake.
+define i64 @tied_64bit_test(i64 %in) nounwind {
+; CHECK-LABEL: tied_64bit_test:
+; CHECK: OUT([[OUTREG:r[0-9]+]]), IN([[OUTREG]])
+  %addr = alloca i64
+  call void asm "OUT($0), IN($1)", "=*rm,0"(i64* %addr, i64 %in)
+  ret i64 %in
+}
+
+; If we explicitly name a tied operand, then the code should lookup the operand
+; we were tied to for information about register class and so on.
+define i64 @tied_64bit_lookback_test(i64 %in) nounwind {
+; CHECK-LABEL: tied_64bit_lookback_test:
+; CHECK: OUTLO([[LO:r[0-9]+]]) OUTHI([[HI:r[0-9]+]]) INLO([[LO]]) INHI([[HI]])
+  %vars = call {i64, i32, i64} asm "OUTLO(${2:Q}) OUTHI(${2:R}) INLO(${3:Q}) INHI(${3:R})", "=r,=r,=r,2"(i64 %in)
+  %res = extractvalue {i64, i32, i64} %vars, 2
+  ret i64 %res
+}
diff --git a/test/CodeGen/ARM/interrupt-attr.ll b/test/CodeGen/ARM/interrupt-attr.ll
new file mode 100644
index 0000000..217fd69
--- /dev/null
+++ b/test/CodeGen/ARM/interrupt-attr.ll
@@ -0,0 +1,130 @@
+; RUN: llc -mtriple=arm-none-none-eabi -mcpu=cortex-a15 -o - %s | FileCheck --check-prefix=CHECK-A %s
+; RUN: llc -mtriple=thumb-none-none-eabi -mcpu=cortex-a15 -o - %s | FileCheck --check-prefix=CHECK-A-THUMB %s
+; RUN: llc -mtriple=thumb-apple-darwin -mcpu=cortex-m3 -o - %s | FileCheck --check-prefix=CHECK-M %s
+
+declare arm_aapcscc void @bar()
+
+@bigvar = global [16 x i32] zeroinitializer
+
+define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
+  ; Must save all registers except banked sp and lr (we save lr anyway because
+  ; we actually need it at the end to execute the return ourselves).
+
+  ; Also need special function return setting pc and CPSR simultaneously.
+; CHECK-A-LABEL: irq_fn:
+; CHECK-A: push {r0, r1, r2, r3, r11, lr}
+; CHECK-A: add r11, sp, #16
+; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: bic sp, sp, #7
+; CHECK-A: bl bar
+; CHECK-A: sub sp, r11, #16
+; CHECK-A: pop {r0, r1, r2, r3, r11, lr}
+; CHECK-A: subs pc, lr, #4
+
+; CHECK-A-THUMB-LABEL: irq_fn:
+; CHECK-A-THUMB: push {r0, r1, r2, r3, r4, r7, lr}
+; CHECK-A-THUMB: mov r4, sp
+; CHECK-A-THUMB: add r7, sp, #20
+; CHECK-A-THUMB: bic r4, r4, #7
+; CHECK-A-THUMB: bl bar
+; CHECK-A-THUMB: sub.w r4, r7,  #20
+; CHECK-A-THUMB: mov sp, r4
+; CHECK-A-THUMB: pop.w {r0, r1, r2, r3, r4, r7, lr}
+; CHECK-A-THUMB: subs pc, lr, #4
+
+  ; Normal AAPCS function (r0-r3 pushed onto stack by hardware, lr set to
+  ; appropriate sentinel so no special return needed).
+; CHECK-M: push {r4, r7, lr}
+; CHECK-M: add r7, sp, #4
+; CHECK-M: sub sp, #4
+; CHECK-M: mov r4, sp
+; CHECK-M: mov sp, r4
+; CHECK-M: blx _bar
+; CHECK-M: subs r4, r7, #4
+; CHECK-M: mov sp, r4
+; CHECK-M: pop {r4, r7, pc}
+
+  call arm_aapcscc void @bar()
+  ret void
+}
+
+define arm_aapcscc void @fiq_fn() alignstack(8) "interrupt"="FIQ" {
+; CHECK-A-LABEL: fiq_fn:
+; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r11, lr}
+  ; 32 to get past r0, r1, ..., r7
+; CHECK-A: add r11, sp, #32
+; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: bic sp, sp, #7
+; [...]
+  ; 32 must match above
+; CHECK-A: sub sp, r11, #32
+; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r11, lr}
+; CHECK-A: subs pc, lr, #4
+
+  %val = load volatile [16 x i32]* @bigvar
+  store volatile [16 x i32] %val, [16 x i32]* @bigvar
+  ret void
+}
+
+define arm_aapcscc void @swi_fn() alignstack(8) "interrupt"="SWI" {
+; CHECK-A-LABEL: swi_fn:
+; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-A: add r11, sp, #44
+; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: bic sp, sp, #7
+; [...]
+; CHECK-A: sub sp, r11, #44
+; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-A: subs pc, lr, #0
+
+  %val = load volatile [16 x i32]* @bigvar
+  store volatile [16 x i32] %val, [16 x i32]* @bigvar
+  ret void
+}
+
+define arm_aapcscc void @undef_fn() alignstack(8) "interrupt"="UNDEF" {
+; CHECK-A-LABEL: undef_fn:
+; CHECK-A: push {r0, r1, r2, r3, r11, lr}
+; CHECK-A: add r11, sp, #16
+; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: bic sp, sp, #7
+; [...]
+; CHECK-A: sub sp, r11, #16
+; CHECK-A: pop {r0, r1, r2, r3, r11, lr}
+; CHECK-A: subs pc, lr, #0
+
+  call void @bar()
+  ret void
+}
+
+define arm_aapcscc void @abort_fn() alignstack(8) "interrupt"="ABORT" {
+; CHECK-A-LABEL: abort_fn:
+; CHECK-A: push {r0, r1, r2, r3, r11, lr}
+; CHECK-A: add r11, sp, #16
+; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: bic sp, sp, #7
+; [...]
+; CHECK-A: sub sp, r11, #16
+; CHECK-A: pop {r0, r1, r2, r3, r11, lr}
+; CHECK-A: subs pc, lr, #4
+
+  call void @bar()
+  ret void
+}
+
+@var = global double 0.0
+
+; We don't save VFP regs, since it would be a massive overhead in the general
+; case.
+define arm_aapcscc void @floating_fn() alignstack(8) "interrupt"="IRQ" {
+; CHECK-A-LABEL: floating_fn:
+; CHECK-A-NOT: vpush
+; CHECK-A-NOT: vstr
+; CHECK-A-NOT: vstm
+; CHECK-A: vadd.f64 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+  %lhs = load volatile double* @var
+  %rhs = load volatile double* @var
+  %sum = fadd double %lhs, %rhs
+  store double %sum, double* @var
+  ret void
+}
diff --git a/test/CodeGen/ARM/intrinsics-crypto.ll b/test/CodeGen/ARM/intrinsics-crypto.ll
new file mode 100644
index 0000000..c038fe6
--- /dev/null
+++ b/test/CodeGen/ARM/intrinsics-crypto.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=armv8 -mattr=+crypto | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @test_aesde(<16 x i8>* %a, <16 x i8> *%b) {
+  %tmp = load <16 x i8>* %a
+  %tmp2 = load <16 x i8>* %b
+  %tmp3 = call <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8> %tmp, <16 x i8> %tmp2)
+  ; CHECK: aesd.8 q{{[0-9]+}}, q{{[0-9]+}}
+  %tmp4 = call <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8> %tmp3, <16 x i8> %tmp2)
+  ; CHECK: aese.8 q{{[0-9]+}}, q{{[0-9]+}}
+  %tmp5 = call <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8> %tmp4)
+  ; CHECK: aesimc.8 q{{[0-9]+}}, q{{[0-9]+}}
+  %tmp6 = call <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8> %tmp5)
+  ; CHECK: aesmc.8 q{{[0-9]+}}, q{{[0-9]+}}
+  ret <16 x i8> %tmp6
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_sha(<4 x i32> *%a, <4 x i32> *%b, <4 x i32> *%c) {
+  %tmp = load <4 x i32>* %a
+  %tmp2 = load <4 x i32>* %b
+  %tmp3 = load <4 x i32>* %c
+  %res1 = call <4 x i32> @llvm.arm.neon.sha1h.v4i32(<4 x i32> %tmp)
+  ; CHECK: sha1h.32 q{{[0-9]+}}, q{{[0-9]+}}
+  %res2 = call <4 x i32> @llvm.arm.neon.sha1c.v4i32(<4 x i32> %tmp2, <4 x i32> %tmp3, <4 x i32> %res1)
+  ; CHECK: sha1c.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+  %res3 = call <4 x i32> @llvm.arm.neon.sha1m.v4i32(<4 x i32> %res2, <4 x i32> %tmp3, <4 x i32> %res1)
+  ; CHECK: sha1m.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+  %res4 = call <4 x i32> @llvm.arm.neon.sha1p.v4i32(<4 x i32> %res3, <4 x i32> %tmp3, <4 x i32> %res1)
+  ; CHECK: sha1p.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+  %res5 = call <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32> %res4, <4 x i32> %tmp3, <4 x i32> %res1)
+  ; CHECK: sha1su0.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+  %res6 = call <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32> %res5, <4 x i32> %res1)
+  ; CHECK: sha1su1.32 q{{[0-9]+}}, q{{[0-9]+}}
+  %res7 = call <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32> %res6, <4 x i32> %tmp3, <4 x i32> %res1)
+  ; CHECK: sha256h.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+  %res8 = call <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32> %res7, <4 x i32> %tmp3, <4 x i32> %res1)
+  ; CHECK: sha256h2.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+  %res9 = call <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32> %res8, <4 x i32> %tmp3, <4 x i32> %res1)
+  ; CHECK: sha256su1.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+  %res10 = call <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32> %res9, <4 x i32> %tmp3)
+  ; CHECK: sha256su0.32 q{{[0-9]+}}, q{{[0-9]+}}
+  ret <4 x i32> %res10
+}
+
+declare <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8>)
+declare <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8>)
+declare <4 x i32> @llvm.arm.neon.sha1h.v4i32(<4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1c.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1m.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1p.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32>, <4 x i32>)
diff --git a/test/CodeGen/ARM/intrinsics-v8.ll b/test/CodeGen/ARM/intrinsics-v8.ll
new file mode 100644
index 0000000..247bfc1
--- /dev/null
+++ b/test/CodeGen/ARM/intrinsics-v8.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=armv8 -mattr=+db | FileCheck %s
+
+define void @test() {
+  ; CHECK: dmb sy
+  call void @llvm.arm.dmb(i32 15)
+  ; CHECK: dmb osh
+  call void @llvm.arm.dmb(i32 3)
+  ; CHECK: dsb sy
+  call void @llvm.arm.dsb(i32 15)
+  ; CHECK: dsb ishld
+  call void @llvm.arm.dsb(i32 9)
+  ; CHECK: sevl
+  tail call void @llvm.arm.sevl() nounwind
+  ret void
+}
+
+declare void @llvm.arm.dmb(i32)
+declare void @llvm.arm.dsb(i32)
+declare void @llvm.arm.sevl() nounwind
diff --git a/test/CodeGen/ARM/lit.local.cfg b/test/CodeGen/ARM/lit.local.cfg
index 4d75f58..8a3ba96 100644
--- a/test/CodeGen/ARM/lit.local.cfg
+++ b/test/CodeGen/ARM/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
 targets = set(config.root.targets_to_build.split())
 if not 'ARM' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/ARM/long_shift.ll b/test/CodeGen/ARM/long_shift.ll
index a99a7ec..3e986d80 100644
--- a/test/CodeGen/ARM/long_shift.ll
+++ b/test/CodeGen/ARM/long_shift.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=arm | FileCheck %s
 
 define i64 @f0(i64 %A, i64 %B) {
-; CHECK: f0
+; CHECK-LABEL: f0:
 ; CHECK:      lsrs    r3, r3, #1
 ; CHECK-NEXT: rrx     r2, r2
 ; CHECK-NEXT: subs    r0, r0, r2
@@ -13,7 +13,7 @@ define i64 @f0(i64 %A, i64 %B) {
 }
 
 define i32 @f1(i64 %x, i64 %y) {
-; CHECK: f1
+; CHECK-LABEL: f1:
 ; CHECK: lsl{{.*}}r2
 	%a = shl i64 %x, %y
 	%b = trunc i64 %a to i32
@@ -21,7 +21,7 @@ define i32 @f1(i64 %x, i64 %y) {
 }
 
 define i32 @f2(i64 %x, i64 %y) {
-; CHECK: f2
+; CHECK-LABEL: f2:
 ; CHECK:      lsr{{.*}}r2
 ; CHECK-NEXT: rsb     r3, r2, #32
 ; CHECK-NEXT: sub     r2, r2, #32
@@ -34,7 +34,7 @@ define i32 @f2(i64 %x, i64 %y) {
 }
 
 define i32 @f3(i64 %x, i64 %y) {
-; CHECK: f3
+; CHECK-LABEL: f3:
 ; CHECK:      lsr{{.*}}r2
 ; CHECK-NEXT: rsb     r3, r2, #32
 ; CHECK-NEXT: sub     r2, r2, #32
diff --git a/test/CodeGen/ARM/misched-copy-arm.ll b/test/CodeGen/ARM/misched-copy-arm.ll
index c274545..5da335f 100644
--- a/test/CodeGen/ARM/misched-copy-arm.ll
+++ b/test/CodeGen/ARM/misched-copy-arm.ll
@@ -65,7 +65,7 @@ if.end28:                                         ; preds = %if.then24, %while.c
   %dst.1 = phi %struct.rtx_def* [ undef, %if.then24 ], [ %dst.0, %while.cond ], [ %dst.0, %while.cond ]
   %arrayidx30 = getelementptr inbounds %struct.rtx_def* %dst.1, i32 0, i32 1, i32 0
   %rtx31 = bitcast %union.rtunion_def* %arrayidx30 to %struct.rtx_def**
-  %0 = load %struct.rtx_def** %rtx31, align 4, !tbaa !0
+  %0 = load %struct.rtx_def** %rtx31, align 4
   br label %while.cond
 
 if.then46:                                        ; preds = %while.cond
@@ -77,7 +77,3 @@ if.end47:                                         ; preds = %while.cond
 }
 
 attributes #0 = { nounwind ssp }
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/ARM/neon-spfp.ll b/test/CodeGen/ARM/neon-spfp.ll
index 5385668..dd2e67f 100644
--- a/test/CodeGen/ARM/neon-spfp.ll
+++ b/test/CodeGen/ARM/neon-spfp.ll
@@ -1,20 +1,20 @@
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 | FileCheck %s -check-prefix=LINUXA5
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 | FileCheck %s -check-prefix=LINUXA8
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 | FileCheck %s -check-prefix=LINUXA9
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 | FileCheck %s -check-prefix=LINUXA15
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift | FileCheck %s -check-prefix=LINUXSWIFT
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 | FileCheck %s -check-prefix=CHECK-LINUXA5
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK-LINUXA8
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 | FileCheck %s -check-prefix=CHECK-LINUXA9
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 | FileCheck %s -check-prefix=CHECK-LINUXA15
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift | FileCheck %s -check-prefix=CHECK-LINUXSWIFT
 
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA5
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA8
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA9
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA15
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFESWIFT
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA5
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA8
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA9
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA15
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFESWIFT
 
-; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a5 | FileCheck %s -check-prefix=DARWINA5
-; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=DARWINA8
-; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a9 | FileCheck %s -check-prefix=DARWINA9
-; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a15 | FileCheck %s -check-prefix=DARWINA15
-; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=swift | FileCheck %s -check-prefix=DARWINSWIFT
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a5 | FileCheck %s -check-prefix=CHECK-DARWINA5
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK-DARWINA8
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a9 | FileCheck %s -check-prefix=CHECK-DARWINA9
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a15 | FileCheck %s -check-prefix=CHECK-DARWINA15
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=swift | FileCheck %s -check-prefix=CHECK-DARWINSWIFT
 
 ; This test makes sure we're not lowering VMUL.f32 D* (aka. NEON) for single-prec. FP ops, since
 ; NEON is not fully IEEE 754 compliant, unless unsafe-math is selected.
diff --git a/test/CodeGen/ARM/neon_spill.ll b/test/CodeGen/ARM/neon_spill.ll
index 277bd05..d286d16 100644
--- a/test/CodeGen/ARM/neon_spill.ll
+++ b/test/CodeGen/ARM/neon_spill.ll
@@ -24,7 +24,7 @@ declare arm_aapcs_vfpcc %2** @func4()
 define arm_aapcs_vfpcc void @foo(%3* nocapture) nounwind align 2 {
   call void @llvm.arm.neon.vst4.v4i32(i8* undef, <4 x i32> <i32 0, i32 1065353216, i32 1073741824, i32 1077936128>, <4 x i32> <i32 1082130432, i32 1084227584, i32 1086324736, i32 1088421888>, <4 x i32> <i32 1090519040, i32 1091567616, i32 1092616192, i32 1093664768>, <4 x i32> <i32 1094713344, i32 1095761920, i32 1096810496, i32 1097859072>, i32 16) nounwind
   %2 = call arm_aapcs_vfpcc  %0** @func2() nounwind
-  %3 = load %0** %2, align 4, !tbaa !0
+  %3 = load %0** %2, align 4
   store float 0.000000e+00, float* undef, align 4
   %4 = call arm_aapcs_vfpcc  %2* @func3(%2* undef, %2* undef, i32 2956) nounwind
   call arm_aapcs_vfpcc  void @func1(%0* %3, float* undef, float* undef, %2* undef)
@@ -35,11 +35,11 @@ define arm_aapcs_vfpcc void @foo(%3* nocapture) nounwind align 2 {
   %6 = call arm_aapcs_vfpcc  %2** @func4() nounwind
   %7 = call arm_aapcs_vfpcc  %2* @func3(%2* undef, %2* undef, i32 2971) nounwind
   %8 = fadd float undef, -1.000000e+05
-  store float %8, float* undef, align 16, !tbaa !3
+  store float %8, float* undef, align 16
   %9 = call arm_aapcs_vfpcc  i32 @rand() nounwind
   %10 = fmul float undef, 2.000000e+05
   %11 = fadd float %10, -1.000000e+05
-  store float %11, float* undef, align 4, !tbaa !3
+  store float %11, float* undef, align 4
   call void @llvm.arm.neon.vst4.v4i32(i8* undef, <4 x i32> <i32 0, i32 1065353216, i32 1073741824, i32 1077936128>, <4 x i32> <i32 1082130432, i32 1084227584, i32 1086324736, i32 1088421888>, <4 x i32> <i32 1090519040, i32 1091567616, i32 1092616192, i32 1093664768>, <4 x i32> <i32 1094713344, i32 1095761920, i32 1096810496, i32 1097859072>, i32 16) nounwind
   ret void
 }
@@ -47,8 +47,3 @@ define arm_aapcs_vfpcc void @foo(%3* nocapture) nounwind align 2 {
 declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
 
 declare arm_aapcs_vfpcc i32 @rand()
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"float", metadata !1}
diff --git a/test/CodeGen/ARM/no-fpu.ll b/test/CodeGen/ARM/no-fpu.ll
new file mode 100644
index 0000000..fff4bcc
--- /dev/null
+++ b/test/CodeGen/ARM/no-fpu.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-neon,-vfp2 | FileCheck --check-prefix=NONEON-NOVFP %s
+; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-neon | FileCheck --check-prefix=NONEON %s
+; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-vfp2 | FileCheck --check-prefix=NOVFP %s
+; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-neon,+vfp2 | FileCheck --check-prefix=NONEON-VFP %s
+
+; Check no NEON instructions are selected when feature is disabled.
+define void @neonop(i64* nocapture readonly %a, i64* nocapture %b) #0 {
+  %1 = bitcast i64* %a to <2 x i64>*
+  %wide.load = load <2 x i64>* %1, align 8
+  ; NONEON-NOVFP-NOT: vld1.64
+  ; NONEON-NOT: vld1.64
+  %add = add <2 x i64> %wide.load, %wide.load
+  ; NONEON-NOVFP-NOT: vadd.i64
+  ; NONEON-NOT: vadd.i64
+  %2 = bitcast i64* %b to <2 x i64>*
+  store <2 x i64> %add, <2 x i64>* %2, align 8
+  ; NONEON-NOVFP-NOT: vst1.64
+  ; NONEON-NOT: vst1.64
+  ret void
+}
+
+; Likewise with VFP instructions.
+define double @fpmult(double %a, double %b) {
+  %res = fmul double %a, %b
+  ; NONEON-NOVFP-NOT: vmov
+  ; NONEON-NOVFP-NOT: vmul.f64
+  ; NOVFP-NOT: vmov
+  ; NOVFP-NOT: vmul.f64
+  ; NONEON-VFP: vmov
+  ; NONEON-VFP: vmul.f64
+  ret double %res
+}
+
diff --git a/test/CodeGen/ARM/noreturn.ll b/test/CodeGen/ARM/noreturn.ll
new file mode 100644
index 0000000..4c876ce
--- /dev/null
+++ b/test/CodeGen/ARM/noreturn.ll
@@ -0,0 +1,50 @@
+; RUN: llc -O3 -o - %s | FileCheck %s
+; Test case from PR16882.
+target triple = "thumbv7s-apple-ios"
+
+define i32 @test1() {
+; CHECK-LABEL: @test1
+; CHECK-NOT: push
+entry:
+  tail call void @overflow() #0
+  unreachable
+}
+
+; Function Attrs: noreturn nounwind
+declare void @overflow() #0
+
+define i32 @test2(i32 %x, i32 %y) {
+; CHECK-LABEL: @test2
+; CHECK-NOT: push
+; CHECK-NOT: pop
+entry:
+  %conv = sext i32 %x to i64
+  %conv1 = sext i32 %y to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %conv2 = trunc i64 %mul to i32
+  %conv3 = sext i32 %conv2 to i64
+  %cmp = icmp eq i64 %mul, %conv3
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void @overflow() #0
+  unreachable
+
+if.end:                                           ; preds = %entry
+  ret i32 %conv2
+}
+
+; Test case for PR17825.
+define i32 @test3() {
+; CHECK-LABEL: @test3
+; CHECK: push
+entry:
+  tail call void @overflow_with_unwind() #1
+  unreachable
+}
+
+; Function Attrs: noreturn
+declare void @overflow_with_unwind() #1
+
+attributes #0 = { noreturn nounwind }
+attributes #1 = { noreturn }
diff --git a/test/CodeGen/ARM/optselect-regclass.ll b/test/CodeGen/ARM/optselect-regclass.ll
new file mode 100644
index 0000000..1aa4520
--- /dev/null
+++ b/test/CodeGen/ARM/optselect-regclass.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=arm -mcpu=swift -verify-machineinstrs
+%union.opcode.0.2.5.8.15.28 = type { i32 }
+
+@opcode = external global %union.opcode.0.2.5.8.15.28, align 4
+@operands = external hidden global [50 x i8], align 4
+@.str86 = external hidden unnamed_addr constant [13 x i8], align 1
+
+; Function Attrs: nounwind ssp
+define void @xfr() {
+entry:
+  %bf.load4 = load i32* getelementptr inbounds (%union.opcode.0.2.5.8.15.28* @opcode, i32 0, i32 0), align 4
+  %bf.clear10 = and i32 %bf.load4, 65535
+  %and11 = and i32 %bf.load4, 32768
+  %tobool12 = icmp ne i32 %and11, 0
+  %cond13 = select i1 %tobool12, i32 1073676288, i32 0
+  %or = or i32 %cond13, %bf.clear10
+  %shl = shl nuw i32 %or, 2
+  %add = add i32 0, %shl
+  tail call void (i8*, i32, i32, i8*, ...)* @__sprintf_chk(i8* getelementptr inbounds ([50 x i8]* @operands, i32 0, i32 0), i32 0, i32 50, i8* getelementptr inbounds ([13 x i8]* @.str86, i32 0, i32 0), i32 undef, i32 undef, i32 %add)
+  ret void
+}
+
+declare void @__sprintf_chk(i8*, i32, i32, i8*, ...)
diff --git a/test/CodeGen/ARM/pic.ll b/test/CodeGen/ARM/pic.ll
new file mode 100644
index 0000000..9fc7a63
--- /dev/null
+++ b/test/CodeGen/ARM/pic.ll
@@ -0,0 +1,23 @@
+; Check the function call in PIC relocation model.
+
+; If the relocation model is PIC, then the "bl" instruction for the function
+; call to the external function should come with PLT fixup type.
+
+; RUN:  llc < %s -mtriple=armv7-unknown-linux-gnueabi \
+; RUN:           -relocation-model=pic -fast-isel -verify-machineinstrs \
+; RUN:    | FileCheck %s
+
+define void @test() {
+entry:
+
+  %0 = call i32 @get()
+; CHECK: bl get(PLT)
+
+  call void @put(i32 %0)
+; CHECK: bl put(PLT)
+
+  ret void
+}
+
+declare i32 @get()
+declare void @put(i32)
diff --git a/test/CodeGen/ARM/prefetch-thumb.ll b/test/CodeGen/ARM/prefetch-thumb.ll
new file mode 100644
index 0000000..e6f6ae8
--- /dev/null
+++ b/test/CodeGen/ARM/prefetch-thumb.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=thumb -mattr=+v7         | FileCheck %s -check-prefix=THUMB2
+; TODO: This test case will be merged back into prefetch.ll when ARM mode issue is solved.
+
+declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
+
+define void @t6() {
+entry:
+;ARM: t6:
+;ARM: pld [sp]
+;ARM: pld [sp, #50]
+
+;THUMB2: t6:
+;THUMB2: pld [sp]
+;THUMB2: pld [sp, #50]
+
+%red = alloca [100 x i8], align 1
+%0 = getelementptr inbounds [100 x i8]* %red, i32 0, i32 0
+%1 = getelementptr inbounds [100 x i8]* %red, i32 0, i32 50
+call void @llvm.prefetch(i8* %0, i32 0, i32 3, i32 1)
+call void @llvm.prefetch(i8* %1, i32 0, i32 3, i32 1)
+ret void
+}
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 3fe2bb8..25484f4 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -239,10 +239,9 @@ bb14:                                             ; preds = %bb6
 ; PR7157
 define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
 ; CHECK-LABEL:        t9:
-; CHECK:        vldr
-; CHECK-NOT:    vmov d{{.*}}, d16
-; CHECK:        vmov.i32 d17
+; CHECK: vmov.i32 d16, #0x0
 ; CHECK-NEXT:   vst1.64 {d16, d17}, [r0:128]
+; CHECK-NEXT:   vorr d17, d16, d16
 ; CHECK-NEXT:   vst1.64 {d16, d17}, [r0:128]
   %3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
   %4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/ARM/select-imm.ll b/test/CodeGen/ARM/select-imm.ll
index 5e7506a..6f4bfb8 100644
--- a/test/CodeGen/ARM/select-imm.ll
+++ b/test/CodeGen/ARM/select-imm.ll
@@ -11,7 +11,7 @@ entry:
 
 ; ARMT2-LABEL: t1:
 ; ARMT2: movw [[R:r[0-1]]], #357
-; ARMT2: movgt [[R]], #123
+; ARMT2: movwgt [[R]], #123
 
 ; THUMB2-LABEL: t1:
 ; THUMB2: movw [[R:r[0-1]]], #357
@@ -25,9 +25,9 @@ entry:
 define i32 @t2(i32 %c) nounwind readnone {
 entry:
 ; ARM-LABEL: t2:
-; ARM: mov [[R:r[0-1]]], #123
-; ARM: movgt [[R]], #101
-; ARM: orrgt [[R]], [[R]], #256
+; ARM: mov [[R:r[0-9]+]], #101
+; ARM: orr [[R]], [[R]], #256
+; ARM: movle [[R]], #123
 
 ; ARMT2-LABEL: t2:
 ; ARMT2: mov [[R:r[0-1]]], #123
@@ -50,7 +50,7 @@ entry:
 
 ; ARMT2-LABEL: t3:
 ; ARMT2: mov [[R:r[0-1]]], #0
-; ARMT2: moveq [[R]], #1
+; ARMT2: movweq [[R]], #1
 
 ; THUMB2-LABEL: t3:
 ; THUMB2: mov{{(s|\.w)}} [[R:r[0-1]]], #0
diff --git a/test/CodeGen/ARM/select-undef.ll b/test/CodeGen/ARM/select-undef.ll
new file mode 100644
index 0000000..23f7eb8
--- /dev/null
+++ b/test/CodeGen/ARM/select-undef.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=arm -mcpu=swift -verify-machineinstrs
+define i32 @func(i32 %arg0, i32 %arg1) {
+entry:
+  %cmp = icmp slt i32 %arg0, 10
+  %v = select i1 %cmp, i32 undef, i32 %arg1
+  ret i32 %v
+}
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index d5c3a27..ed006d6 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -59,7 +59,7 @@ entry:
 define double @f7(double %a, double %b) {
 ;CHECK-LABEL: f7:
 ;CHECK: movlt
-;CHECK: movlt
+;CHECK: movge
 ;CHECK-VFP-LABEL: f7:
 ;CHECK-VFP: vmovmi
     %tmp = fcmp olt double %a, 1.234e+00
@@ -75,7 +75,7 @@ define double @f7(double %a, double %b) {
 ; into the constant pool based on the value of the "icmp". If we have one "it"
 ; block generated, odds are good that we have close to the ideal code for this:
 ;
-; CHECK-NEON:      _f8:
+; CHECK-NEON-LABEL: f8:
 ; CHECK-NEON:      movw    [[R3:r[0-9]+]], #1123
 ; CHECK-NEON:      adr     [[R2:r[0-9]+]], LCPI7_0
 ; CHECK-NEON-NEXT: cmp     r0, [[R3]]
@@ -113,7 +113,7 @@ entry:
   ret void
 }
 
-; CHECK: f10
+; CHECK-LABEL: f10:
 define float @f10(i32 %a, i32 %b) nounwind uwtable readnone ssp {
 ; CHECK-NOT: floatsisf
   %1 = icmp eq i32 %a, %b
@@ -122,7 +122,7 @@ define float @f10(i32 %a, i32 %b) nounwind uwtable readnone ssp {
   ret float %3
 }
 
-; CHECK: f11
+; CHECK-LABEL: f11:
 define float @f11(i32 %a, i32 %b) nounwind uwtable readnone ssp {
 ; CHECK-NOT: floatsisf
   %1 = icmp eq i32 %a, %b
@@ -130,7 +130,7 @@ define float @f11(i32 %a, i32 %b) nounwind uwtable readnone ssp {
   ret float %2
 }
 
-; CHECK: f12
+; CHECK-LABEL: f12:
 define float @f12(i32 %a, i32 %b) nounwind uwtable readnone ssp {
 ; CHECK-NOT: floatunsisf
   %1 = icmp eq i32 %a, %b
diff --git a/test/CodeGen/ARM/setcc-sentinals.ll b/test/CodeGen/ARM/setcc-sentinals.ll
new file mode 100644
index 0000000..8878f9b
--- /dev/null
+++ b/test/CodeGen/ARM/setcc-sentinals.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mcpu=cortex-a8 -march=arm -asm-verbose=false | FileCheck %s
+
+define zeroext i1 @test0(i32 %x) nounwind {
+; CHECK-LABEL: test0:
+; CHECK-NEXT: add [[REG:(r[0-9]+)|(lr)]], r0, #1
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: cmp [[REG]], #1
+; CHECK-NEXT: movwhi r0, #1
+; CHECK-NEXT: bx  lr
+  %cmp1 = icmp ne i32 %x, -1
+  %not.cmp = icmp ne i32 %x, 0
+  %.cmp1 = and i1 %cmp1, %not.cmp
+  ret i1 %.cmp1
+}
diff --git a/test/CodeGen/ARM/sincos.ll b/test/CodeGen/ARM/sincos.ll
new file mode 100644
index 0000000..30b2664
--- /dev/null
+++ b/test/CodeGen/ARM/sincos.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 | FileCheck %s --check-prefix=NOOPT
+; RUN: llc < %s -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS
+
+; Combine sin / cos into a single call.
+; rdar://12856873
+
+define float @test1(float %x) nounwind {
+entry:
+; SINCOS-LABEL: test1:
+; SINCOS: bl ___sincosf_stret
+
+; NOOPT-LABEL: test1:
+; NOOPT: bl _sinf
+; NOOPT: bl _cosf
+  %call = tail call float @sinf(float %x) nounwind readnone
+  %call1 = tail call float @cosf(float %x) nounwind readnone
+  %add = fadd float %call, %call1
+  ret float %add
+}
+
+define double @test2(double %x) nounwind {
+entry:
+; SINCOS-LABEL: test2:
+; SINCOS: bl ___sincos_stret
+
+; NOOPT-LABEL: test2:
+; NOOPT: bl _sin
+; NOOPT: bl _cos
+  %call = tail call double @sin(double %x) nounwind readnone
+  %call1 = tail call double @cos(double %x) nounwind readnone
+  %add = fadd double %call, %call1
+  ret double %add
+}
+
+declare float  @sinf(float) readonly
+declare double @sin(double) readonly
+declare float @cosf(float) readonly
+declare double @cos(double) readonly
diff --git a/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll b/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
new file mode 100644
index 0000000..f5cda14
--- /dev/null
+++ b/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
@@ -0,0 +1,32 @@
+; RUN: llc -O3 -mcpu=swift -mtriple=armv7s-apple-ios6.0.0 %s -o /dev/null
+; rdar://14811848
+
+; Make sure that we do not emit the BMOVPCB_CALL instruction for now or if we
+; fix the assumptions in its implementation that we do not crash when doing it.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "armv7s-apple-ios6.0.0"
+
+@main.title = private unnamed_addr constant [15 x i8] c"foo and stuff\0A\00", align 1
+@.str = private unnamed_addr constant [3 x i8] c"%s\00", align 1
+
+; Function Attrs: nounwind optsize ssp
+define i32 @main() #0 {
+entry:
+  %title = alloca [15 x i8], align 1
+  %0 = getelementptr inbounds [15 x i8]* %title, i32 0, i32 0
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* getelementptr inbounds ([15 x i8]* @main.title, i32 0, i32 0), i32 15, i32 1, i1 false)
+  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i8* %0) #3
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
+
+; Function Attrs: nounwind optsize
+declare i32 @printf(i8* nocapture readonly, ...) #2
+
+attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind optsize }
diff --git a/test/CodeGen/ARM/struct-byval-frame-index.ll b/test/CodeGen/ARM/struct-byval-frame-index.ll
index ae68ce5..465ee12 100644
--- a/test/CodeGen/ARM/struct-byval-frame-index.ll
+++ b/test/CodeGen/ARM/struct-byval-frame-index.ll
@@ -72,10 +72,10 @@ declare void @SetMotionVectorsMB(%structK* nocapture, i32) #1
 ; Function Attrs: nounwind
 define void @set_stored_macroblock_parameters() #1 {
 entry:
-  %0 = load %structB** @img, align 4, !tbaa !0
-  %1 = load i32* undef, align 4, !tbaa !3
+  %0 = load %structB** @img, align 4
+  %1 = load i32* undef, align 4
   %mb_data = getelementptr inbounds %structB* %0, i32 0, i32 61
-  %2 = load %structK** %mb_data, align 4, !tbaa !0
+  %2 = load %structK** %mb_data, align 4
   br label %for.body
 
 for.body:                                         ; preds = %for.body, %entry
@@ -109,7 +109,7 @@ for.body119:                                      ; preds = %for.body119, %for.c
   br i1 undef, label %for.body119, label %if.end164
 
 if.end164:                                        ; preds = %for.body119, %for.cond47.preheader, %if.end43
-  store i32*** null, i32**** @cofDC, align 4, !tbaa !0
+  store i32*** null, i32**** @cofDC, align 4
   %mb_type = getelementptr inbounds %structK* %2, i32 %1, i32 8
   br i1 undef, label %if.end230, label %if.then169
 
@@ -134,7 +134,7 @@ if.then233:                                       ; preds = %if.end230
 
 if.end236:                                        ; preds = %if.end230
   %cmp242 = icmp ne i16 undef, 8
-  %4 = load i32* @luma_transform_size_8x8_flag, align 4, !tbaa !3
+  %4 = load i32* @luma_transform_size_8x8_flag, align 4
   %tobool245 = icmp ne i32 %4, 0
   %or.cond812 = or i1 %cmp242, %tobool245
   br i1 %or.cond812, label %if.end249, label %land.lhs.true246
@@ -150,11 +150,11 @@ if.then248:                                       ; preds = %land.lhs.true246
   br label %if.end249
 
 if.end249:                                        ; preds = %if.then248, %land.lhs.true246, %if.end236
-  %5 = load i32* @luma_transform_size_8x8_flag, align 4, !tbaa !3
-  %6 = load %structA** @rdopt, align 4, !tbaa !0
+  %5 = load i32* @luma_transform_size_8x8_flag, align 4
+  %6 = load %structA** @rdopt, align 4
   %luma_transform_size_8x8_flag264 = getelementptr inbounds %structA* %6, i32 0, i32 21
-  store i32 %5, i32* %luma_transform_size_8x8_flag264, align 4, !tbaa !3
-  %7 = load i32* undef, align 4, !tbaa !3
+  store i32 %5, i32* %luma_transform_size_8x8_flag264, align 4
+  %7 = load i32* undef, align 4
   %add281 = add nsw i32 %7, 0
   br label %for.body285
 
@@ -162,36 +162,36 @@ for.body285:                                      ; preds = %for.inc503, %if.end
   %8 = phi %structB* [ undef, %if.end249 ], [ %.pre1155, %for.inc503 ]
   %i.21103 = phi i32 [ 0, %if.end249 ], [ %inc504, %for.inc503 ]
   %block_x286 = getelementptr inbounds %structB* %8, i32 0, i32 37
-  %9 = load i32* %block_x286, align 4, !tbaa !3
+  %9 = load i32* %block_x286, align 4
   %add287 = add nsw i32 %9, %i.21103
   %shr289 = ashr i32 %i.21103, 1
   %add290 = add nsw i32 %shr289, 0
   %arrayidx292 = getelementptr inbounds %structK* %2, i32 %1, i32 15, i32 %add290
-  %10 = load %structM** @enc_picture, align 4, !tbaa !0
+  %10 = load %structM** @enc_picture, align 4
   %ref_idx = getelementptr inbounds %structM* %10, i32 0, i32 35
-  %11 = load i8**** %ref_idx, align 4, !tbaa !0
-  %12 = load i8*** %11, align 4, !tbaa !0
+  %11 = load i8**** %ref_idx, align 4
+  %12 = load i8*** %11, align 4
   %arrayidx313 = getelementptr inbounds i8** %12, i32 %add281
-  %13 = load i8** %arrayidx313, align 4, !tbaa !0
+  %13 = load i8** %arrayidx313, align 4
   %arrayidx314 = getelementptr inbounds i8* %13, i32 %add287
-  store i8 -1, i8* %arrayidx314, align 1, !tbaa !1
-  %14 = load %structB** @img, align 4, !tbaa !0
+  store i8 -1, i8* %arrayidx314, align 1
+  %14 = load %structB** @img, align 4
   %MbaffFrameFlag327 = getelementptr inbounds %structB* %14, i32 0, i32 100
-  %15 = load i32* %MbaffFrameFlag327, align 4, !tbaa !3
+  %15 = load i32* %MbaffFrameFlag327, align 4
   %tobool328 = icmp eq i32 %15, 0
   br i1 %tobool328, label %if.end454, label %if.then329
 
 if.then329:                                       ; preds = %for.body285
-  %16 = load %structA** @rdopt, align 4, !tbaa !0
+  %16 = load %structA** @rdopt, align 4
   br label %if.end454
 
 if.end454:                                        ; preds = %if.then329, %for.body285
-  %17 = load i32* %arrayidx292, align 4, !tbaa !3
+  %17 = load i32* %arrayidx292, align 4
   %cmp457 = icmp eq i32 %17, 0
   br i1 %cmp457, label %if.then475, label %lor.lhs.false459
 
 lor.lhs.false459:                                 ; preds = %if.end454
-  %18 = load i32* %mb_type, align 4, !tbaa !3
+  %18 = load i32* %mb_type, align 4
   switch i32 %18, label %for.inc503 [
     i32 9, label %if.then475
     i32 10, label %if.then475
@@ -200,12 +200,12 @@ lor.lhs.false459:                                 ; preds = %if.end454
   ]
 
 if.then475:                                       ; preds = %lor.lhs.false459, %lor.lhs.false459, %lor.lhs.false459, %lor.lhs.false459, %if.end454
-  store i16 0, i16* undef, align 2, !tbaa !4
+  store i16 0, i16* undef, align 2
   br label %for.inc503
 
 for.inc503:                                       ; preds = %if.then475, %lor.lhs.false459
   %inc504 = add nsw i32 %i.21103, 1
-  %.pre1155 = load %structB** @img, align 4, !tbaa !0
+  %.pre1155 = load %structB** @img, align 4
   br label %for.body285
 }
 
@@ -216,10 +216,4 @@ declare void @update_offset_params(i32, i32) #1
 declare void @RestoreMVBlock8x8(i32, i32, %structN* byval nocapture, i32) #1
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"int", metadata !1}
-!4 = metadata !{metadata !"short", metadata !1}
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/struct_byval.ll b/test/CodeGen/ARM/struct_byval.ll
index 012b994..130925a 100644
--- a/test/CodeGen/ARM/struct_byval.ll
+++ b/test/CodeGen/ARM/struct_byval.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=armv7-apple-ios6.0 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios6.0 | FileCheck %s -check-prefix=THUMB
 
 ; rdar://9877866
 %struct.SmallStruct = type { i32, [8 x i32], [37 x i8] }
@@ -10,6 +11,10 @@ entry:
 ; CHECK: ldr
 ; CHECK: str
 ; CHECK-NOT:bne
+; THUMB-LABEL: f:
+; THUMB: ldr
+; THUMB: str
+; THUMB-NOT:bne
   %st = alloca %struct.SmallStruct, align 4
   %call = call i32 @e1(%struct.SmallStruct* byval %st)
   ret i32 0
@@ -23,6 +28,11 @@ entry:
 ; CHECK: sub
 ; CHECK: str
 ; CHECK: bne
+; THUMB-LABEL: g:
+; THUMB: ldr
+; THUMB: sub
+; THUMB: str
+; THUMB: bne
   %st = alloca %struct.LargeStruct, align 4
   %call = call i32 @e2(%struct.LargeStruct* byval %st)
   ret i32 0
@@ -36,6 +46,11 @@ entry:
 ; CHECK: sub
 ; CHECK: vst1
 ; CHECK: bne
+; THUMB-LABEL: h:
+; THUMB: vld1
+; THUMB: sub
+; THUMB: vst1
+; THUMB: bne
   %st = alloca %struct.LargeStruct, align 16
   %call = call i32 @e3(%struct.LargeStruct* byval align 16 %st)
   ret i32 0
@@ -49,8 +64,10 @@ declare i32 @e3(%struct.LargeStruct* nocapture byval align 16 %in) nounwind
 ; We can't do tail call since address of s is passed to the callee and part of
 ; s is in caller's local frame.
 define void @f3(%struct.SmallStruct* nocapture byval %s) nounwind optsize {
-; CHECK: f3
+; CHECK-LABEL: f3
 ; CHECK: bl _consumestruct
+; THUMB-LABEL: f3
+; THUMB: blx _consumestruct
 entry:
   %0 = bitcast %struct.SmallStruct* %s to i8*
   tail call void @consumestruct(i8* %0, i32 80) optsize
@@ -58,8 +75,10 @@ entry:
 }
 
 define void @f4(%struct.SmallStruct* nocapture byval %s) nounwind optsize {
-; CHECK: f4
+; CHECK-LABEL: f4
 ; CHECK: bl _consumestruct
+; THUMB-LABEL: f4
+; THUMB: blx _consumestruct
 entry:
   %addr = getelementptr inbounds %struct.SmallStruct* %s, i32 0, i32 0
   %0 = bitcast i32* %addr to i8*
@@ -69,8 +88,10 @@ entry:
 
 ; We can do tail call here since s is in the incoming argument area.
 define void @f5(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize {
-; CHECK: f5
+; CHECK-LABEL: f5
 ; CHECK: b _consumestruct
+; THUMB-LABEL: f5
+; THUMB: b.w _consumestruct
 entry:
   %0 = bitcast %struct.SmallStruct* %s to i8*
   tail call void @consumestruct(i8* %0, i32 80) optsize
@@ -78,8 +99,10 @@ entry:
 }
 
 define void @f6(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize {
-; CHECK: f6
+; CHECK-LABEL: f6
 ; CHECK: b _consumestruct
+; THUMB-LABEL: f6
+; THUMB: b.w _consumestruct
 entry:
   %addr = getelementptr inbounds %struct.SmallStruct* %s, i32 0, i32 0
   %0 = bitcast i32* %addr to i8*
@@ -88,3 +111,19 @@ entry:
 }
 
 declare void @consumestruct(i8* nocapture %structp, i32 %structsize) nounwind
+
+; PR17309
+%struct.I.8 = type { [10 x i32], [3 x i8] }
+
+declare void @use_I(%struct.I.8* byval)
+define void @test_I_16() {
+; CHECK-LABEL: test_I_16
+; CHECK: ldrb
+; CHECK: strb
+; THUMB-LABEL: test_I_16
+; THUMB: ldrb
+; THUMB: strb
+entry:
+  call void @use_I(%struct.I.8* byval align 16 undef)
+  ret void
+}
diff --git a/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll b/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll
new file mode 100644
index 0000000..1899269
--- /dev/null
+++ b/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll
@@ -0,0 +1,1523 @@
+;RUN: llc < %s -mtriple=armv7-none-linux-gnueabi   -mattr=+neon -verify-machineinstrs -filetype=obj | llvm-objdump -triple armv7-none-linux-gnueabi   -disassemble - | FileCheck %s --check-prefix=ARM
+;RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabi -mattr=+neon -verify-machineinstrs -filetype=obj | llvm-objdump -triple thumbv7-none-linux-gnueabi -disassemble - | FileCheck %s --check-prefix=THUMB2
+;RUN: llc < %s -mtriple=armv7-none-linux-gnueabi   -mattr=-neon -verify-machineinstrs -filetype=obj | llvm-objdump -triple armv7-none-linux-gnueabi   -disassemble - | FileCheck %s --check-prefix=NO_NEON
+;We want to have both positive and negative checks for thumb1. These checks
+;are not easy to do in a single pass so we generate the output once to a
+;temp file and run filecheck twice with different prefixes.
+;RUN: llc < %s -mtriple=thumbv5-none-linux-gnueabi              -verify-machineinstrs -filetype=obj | llvm-objdump -triple thumbv5-none-linux-gnueabi -disassemble - > %t
+;RUN: cat %t | FileCheck %s --check-prefix=THUMB1
+;RUN: cat %t | FileCheck %s --check-prefix=T1POST
+
+;This file contains auto generated tests for the lowering of passing structs
+;byval in the arm backend. We have tests for both packed and unpacked
+;structs at varying alignments. Each test is run for arm, thumb2 and thumb1.
+;We check for the strings in the generated object code using llvm-objdump
+;because it provides better assurance that we are generating instructions
+;for the correct architecture. Otherwise we could accidently generate an
+;ARM instruction for THUMB1 and wouldn't detect it because the assembly
+;code representation is the same, but the object code would be generated
+;incorrectly. For each test we check for the label, a load instruction of the
+;correct form, a branch if it will be generated with a loop, and the leftover
+;cleanup if the number of bytes does not divide evenly by the store size
+
+%struct.A = type <{ [ 10 x i32 ] }> ; 40 bytes
+declare void @use_A(%struct.A* byval)
+%struct.B = type <{ [ 10 x i32 ], i8 }> ; 41 bytes
+declare void @use_B(%struct.B* byval)
+%struct.C = type <{ [ 10 x i32 ], [ 3 x i8 ] }> ; 43 bytes
+declare void @use_C(%struct.C* byval)
+%struct.D = type <{ [ 100 x i32 ] }> ; 400 bytes
+declare void @use_D(%struct.D* byval)
+%struct.E = type <{ [ 100 x i32 ], i8 }> ; 401 bytes
+declare void @use_E(%struct.E* byval)
+%struct.F = type <{ [ 100 x i32 ], [ 3 x i8 ] }> ; 403 bytes
+declare void @use_F(%struct.F* byval)
+%struct.G = type  { [ 10 x i32 ] }  ; 40 bytes
+declare void @use_G(%struct.G* byval)
+%struct.H = type  { [ 10 x i32 ], i8 }  ; 41 bytes
+declare void @use_H(%struct.H* byval)
+%struct.I = type  { [ 10 x i32 ], [ 3 x i8 ] }  ; 43 bytes
+declare void @use_I(%struct.I* byval)
+%struct.J = type  { [ 100 x i32 ] }  ; 400 bytes
+declare void @use_J(%struct.J* byval)
+%struct.K = type  { [ 100 x i32 ], i8 }  ; 401 bytes
+declare void @use_K(%struct.K* byval)
+%struct.L = type  { [ 100 x i32 ], [ 3 x i8 ] }  ; 403 bytes
+declare void @use_L(%struct.L* byval)
+
+;ARM-LABEL:    test_A_1:
+;THUMB2-LABEL: test_A_1:
+;NO_NEON-LABEL:test_A_1:
+;THUMB1-LABEL: test_A_1:
+;T1POST-LABEL: test_A_1:
+  define void @test_A_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.A, align 1
+    call void @use_A(%struct.A* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_A_2:
+;THUMB2-LABEL: test_A_2:
+;NO_NEON-LABEL:test_A_2:
+;THUMB1-LABEL: test_A_2:
+;T1POST-LABEL: test_A_2:
+  define void @test_A_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.A, align 2
+    call void @use_A(%struct.A* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_A_4:
+;THUMB2-LABEL: test_A_4:
+;NO_NEON-LABEL:test_A_4:
+;THUMB1-LABEL: test_A_4:
+;T1POST-LABEL: test_A_4:
+  define void @test_A_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.A, align 4
+    call void @use_A(%struct.A* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_A_8:
+;THUMB2-LABEL: test_A_8:
+;NO_NEON-LABEL:test_A_8:
+;THUMB1-LABEL: test_A_8:
+;T1POST-LABEL: test_A_8:
+  define void @test_A_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.A, align 8
+    call void @use_A(%struct.A* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_A_16:
+;THUMB2-LABEL: test_A_16:
+;NO_NEON-LABEL:test_A_16:
+;THUMB1-LABEL: test_A_16:
+;T1POST-LABEL: test_A_16:
+  define void @test_A_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.A, align 16
+    call void @use_A(%struct.A* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_B_1:
+;THUMB2-LABEL: test_B_1:
+;NO_NEON-LABEL:test_B_1:
+;THUMB1-LABEL: test_B_1:
+;T1POST-LABEL: test_B_1:
+  define void @test_B_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.B, align 1
+    call void @use_B(%struct.B* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_B_2:
+;THUMB2-LABEL: test_B_2:
+;NO_NEON-LABEL:test_B_2:
+;THUMB1-LABEL: test_B_2:
+;T1POST-LABEL: test_B_2:
+  define void @test_B_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.B, align 2
+    call void @use_B(%struct.B* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_B_4:
+;THUMB2-LABEL: test_B_4:
+;NO_NEON-LABEL:test_B_4:
+;THUMB1-LABEL: test_B_4:
+;T1POST-LABEL: test_B_4:
+  define void @test_B_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.B, align 4
+    call void @use_B(%struct.B* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_B_8:
+;THUMB2-LABEL: test_B_8:
+;NO_NEON-LABEL:test_B_8:
+;THUMB1-LABEL: test_B_8:
+;T1POST-LABEL: test_B_8:
+  define void @test_B_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.B, align 8
+    call void @use_B(%struct.B* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_B_16:
+;THUMB2-LABEL: test_B_16:
+;NO_NEON-LABEL:test_B_16:
+;THUMB1-LABEL: test_B_16:
+;T1POST-LABEL: test_B_16:
+  define void @test_B_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.B, align 16
+    call void @use_B(%struct.B* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_C_1:
+;THUMB2-LABEL: test_C_1:
+;NO_NEON-LABEL:test_C_1:
+;THUMB1-LABEL: test_C_1:
+;T1POST-LABEL: test_C_1:
+  define void @test_C_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.C, align 1
+    call void @use_C(%struct.C* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_C_2:
+;THUMB2-LABEL: test_C_2:
+;NO_NEON-LABEL:test_C_2:
+;THUMB1-LABEL: test_C_2:
+;T1POST-LABEL: test_C_2:
+  define void @test_C_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.C, align 2
+    call void @use_C(%struct.C* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_C_4:
+;THUMB2-LABEL: test_C_4:
+;NO_NEON-LABEL:test_C_4:
+;THUMB1-LABEL: test_C_4:
+;T1POST-LABEL: test_C_4:
+  define void @test_C_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.C, align 4
+    call void @use_C(%struct.C* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_C_8:
+;THUMB2-LABEL: test_C_8:
+;NO_NEON-LABEL:test_C_8:
+;THUMB1-LABEL: test_C_8:
+;T1POST-LABEL: test_C_8:
+  define void @test_C_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.C, align 8
+    call void @use_C(%struct.C* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_C_16:
+;THUMB2-LABEL: test_C_16:
+;NO_NEON-LABEL:test_C_16:
+;THUMB1-LABEL: test_C_16:
+;T1POST-LABEL: test_C_16:
+  define void @test_C_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.C, align 16
+    call void @use_C(%struct.C* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_D_1:
+;THUMB2-LABEL: test_D_1:
+;NO_NEON-LABEL:test_D_1:
+;THUMB1-LABEL: test_D_1:
+;T1POST-LABEL: test_D_1:
+  define void @test_D_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;ARM:         bne
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2:      bne
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON:     bne
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+;THUMB1:      bne
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.D, align 1
+    call void @use_D(%struct.D* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_D_2:
+;THUMB2-LABEL: test_D_2:
+;NO_NEON-LABEL:test_D_2:
+;THUMB1-LABEL: test_D_2:
+;T1POST-LABEL: test_D_2:
+  define void @test_D_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;ARM:         bne
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2:      bne
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON:     bne
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+;THUMB1:      bne
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.D, align 2
+    call void @use_D(%struct.D* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_D_4:
+;THUMB2-LABEL: test_D_4:
+;NO_NEON-LABEL:test_D_4:
+;THUMB1-LABEL: test_D_4:
+;T1POST-LABEL: test_D_4:
+  define void @test_D_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+;ARM:         bne
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.D, align 4
+    call void @use_D(%struct.D* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_D_8:
+;THUMB2-LABEL: test_D_8:
+;NO_NEON-LABEL:test_D_8:
+;THUMB1-LABEL: test_D_8:
+;T1POST-LABEL: test_D_8:
+  define void @test_D_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.D, align 8
+    call void @use_D(%struct.D* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_D_16:
+;THUMB2-LABEL: test_D_16:
+;NO_NEON-LABEL:test_D_16:
+;THUMB1-LABEL: test_D_16:
+;T1POST-LABEL: test_D_16:
+  define void @test_D_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.D, align 16
+    call void @use_D(%struct.D* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_E_1:
+;THUMB2-LABEL: test_E_1:
+;NO_NEON-LABEL:test_E_1:
+;THUMB1-LABEL: test_E_1:
+;T1POST-LABEL: test_E_1:
+  define void @test_E_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;ARM:         bne
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2:      bne
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON:     bne
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+;THUMB1:      bne
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.E, align 1
+    call void @use_E(%struct.E* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_E_2:
+;THUMB2-LABEL: test_E_2:
+;NO_NEON-LABEL:test_E_2:
+;THUMB1-LABEL: test_E_2:
+;T1POST-LABEL: test_E_2:
+  define void @test_E_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;ARM:         bne
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2:      bne
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON:     bne
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+;THUMB1:      bne
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.E, align 2
+    call void @use_E(%struct.E* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_E_4:
+;THUMB2-LABEL: test_E_4:
+;NO_NEON-LABEL:test_E_4:
+;THUMB1-LABEL: test_E_4:
+;T1POST-LABEL: test_E_4:
+  define void @test_E_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+;ARM:         bne
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2:      bne
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.E, align 4
+    call void @use_E(%struct.E* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_E_8:
+;THUMB2-LABEL: test_E_8:
+;NO_NEON-LABEL:test_E_8:
+;THUMB1-LABEL: test_E_8:
+;T1POST-LABEL: test_E_8:
+  define void @test_E_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.E, align 8
+    call void @use_E(%struct.E* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_E_16:
+;THUMB2-LABEL: test_E_16:
+;NO_NEON-LABEL:test_E_16:
+;THUMB1-LABEL: test_E_16:
+;T1POST-LABEL: test_E_16:
+  define void @test_E_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.E, align 16
+    call void @use_E(%struct.E* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_F_1:
+;THUMB2-LABEL: test_F_1:
+;NO_NEON-LABEL:test_F_1:
+;THUMB1-LABEL: test_F_1:
+;T1POST-LABEL: test_F_1:
+  define void @test_F_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;ARM:         bne
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2:      bne
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON:     bne
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+;THUMB1:      bne
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.F, align 1
+    call void @use_F(%struct.F* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_F_2:
+;THUMB2-LABEL: test_F_2:
+;NO_NEON-LABEL:test_F_2:
+;THUMB1-LABEL: test_F_2:
+;T1POST-LABEL: test_F_2:
+  define void @test_F_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;ARM:         bne
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2:      bne
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON:     bne
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+;THUMB1:      bne
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.F, align 2
+    call void @use_F(%struct.F* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_F_4:
+;THUMB2-LABEL: test_F_4:
+;NO_NEON-LABEL:test_F_4:
+;THUMB1-LABEL: test_F_4:
+;T1POST-LABEL: test_F_4:
+  define void @test_F_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+;ARM:         bne
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2:      bne
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.F, align 4
+    call void @use_F(%struct.F* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_F_8:
+;THUMB2-LABEL: test_F_8:
+;NO_NEON-LABEL:test_F_8:
+;THUMB1-LABEL: test_F_8:
+;T1POST-LABEL: test_F_8:
+  define void @test_F_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.F, align 8
+    call void @use_F(%struct.F* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_F_16:
+;THUMB2-LABEL: test_F_16:
+;NO_NEON-LABEL:test_F_16:
+;THUMB1-LABEL: test_F_16:
+;T1POST-LABEL: test_F_16:
+  define void @test_F_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.F, align 16
+    call void @use_F(%struct.F* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_G_1:
+;THUMB2-LABEL: test_G_1:
+;NO_NEON-LABEL:test_G_1:
+;THUMB1-LABEL: test_G_1:
+;T1POST-LABEL: test_G_1:
+  define void @test_G_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.G, align 1
+    call void @use_G(%struct.G* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_G_2:
+;THUMB2-LABEL: test_G_2:
+;NO_NEON-LABEL:test_G_2:
+;THUMB1-LABEL: test_G_2:
+;T1POST-LABEL: test_G_2:
+  define void @test_G_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.G, align 2
+    call void @use_G(%struct.G* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_G_4:
+;THUMB2-LABEL: test_G_4:
+;NO_NEON-LABEL:test_G_4:
+;THUMB1-LABEL: test_G_4:
+;T1POST-LABEL: test_G_4:
+  define void @test_G_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.G, align 4
+    call void @use_G(%struct.G* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_G_8:
+;THUMB2-LABEL: test_G_8:
+;NO_NEON-LABEL:test_G_8:
+;THUMB1-LABEL: test_G_8:
+;T1POST-LABEL: test_G_8:
+  define void @test_G_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.G, align 8
+    call void @use_G(%struct.G* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_G_16:
+;THUMB2-LABEL: test_G_16:
+;NO_NEON-LABEL:test_G_16:
+;THUMB1-LABEL: test_G_16:
+;T1POST-LABEL: test_G_16:
+  define void @test_G_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.G, align 16
+    call void @use_G(%struct.G* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_H_1:
+;THUMB2-LABEL: test_H_1:
+;NO_NEON-LABEL:test_H_1:
+;THUMB1-LABEL: test_H_1:
+;T1POST-LABEL: test_H_1:
+  define void @test_H_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.H, align 1
+    call void @use_H(%struct.H* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_H_2:
+;THUMB2-LABEL: test_H_2:
+;NO_NEON-LABEL:test_H_2:
+;THUMB1-LABEL: test_H_2:
+;T1POST-LABEL: test_H_2:
+  define void @test_H_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.H, align 2
+    call void @use_H(%struct.H* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_H_4:
+;THUMB2-LABEL: test_H_4:
+;NO_NEON-LABEL:test_H_4:
+;THUMB1-LABEL: test_H_4:
+;T1POST-LABEL: test_H_4:
+  define void @test_H_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.H, align 4
+    call void @use_H(%struct.H* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_H_8:
+;THUMB2-LABEL: test_H_8:
+;NO_NEON-LABEL:test_H_8:
+;THUMB1-LABEL: test_H_8:
+;T1POST-LABEL: test_H_8:
+  define void @test_H_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.H, align 8
+    call void @use_H(%struct.H* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_H_16:
+;THUMB2-LABEL: test_H_16:
+;NO_NEON-LABEL:test_H_16:
+;THUMB1-LABEL: test_H_16:
+;T1POST-LABEL: test_H_16:
+  define void @test_H_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.H, align 16
+    call void @use_H(%struct.H* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_I_1:
+;THUMB2-LABEL: test_I_1:
+;NO_NEON-LABEL:test_I_1:
+;THUMB1-LABEL: test_I_1:
+;T1POST-LABEL: test_I_1:
+  define void @test_I_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.I, align 1
+    call void @use_I(%struct.I* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_I_2:
+;THUMB2-LABEL: test_I_2:
+;NO_NEON-LABEL:test_I_2:
+;THUMB1-LABEL: test_I_2:
+;T1POST-LABEL: test_I_2:
+  define void @test_I_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.I, align 2
+    call void @use_I(%struct.I* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_I_4:
+;THUMB2-LABEL: test_I_4:
+;NO_NEON-LABEL:test_I_4:
+;THUMB1-LABEL: test_I_4:
+;T1POST-LABEL: test_I_4:
+  define void @test_I_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.I, align 4
+    call void @use_I(%struct.I* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_I_8:
+;THUMB2-LABEL: test_I_8:
+;NO_NEON-LABEL:test_I_8:
+;THUMB1-LABEL: test_I_8:
+;T1POST-LABEL: test_I_8:
+  define void @test_I_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.I, align 8
+    call void @use_I(%struct.I* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_I_16:
+;THUMB2-LABEL: test_I_16:
+;NO_NEON-LABEL:test_I_16:
+;THUMB1-LABEL: test_I_16:
+;T1POST-LABEL: test_I_16:
+  define void @test_I_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.I, align 16
+    call void @use_I(%struct.I* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_J_1:
+;THUMB2-LABEL: test_J_1:
+;NO_NEON-LABEL:test_J_1:
+;THUMB1-LABEL: test_J_1:
+;T1POST-LABEL: test_J_1:
+  define void @test_J_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;ARM:         bne
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2:      bne
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON:     bne
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+;THUMB1:      bne
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.J, align 1
+    call void @use_J(%struct.J* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_J_2:
+;THUMB2-LABEL: test_J_2:
+;NO_NEON-LABEL:test_J_2:
+;THUMB1-LABEL: test_J_2:
+;T1POST-LABEL: test_J_2:
+  define void @test_J_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;ARM:         bne
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2:      bne
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON:     bne
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+;THUMB1:      bne
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.J, align 2
+    call void @use_J(%struct.J* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_J_4:
+;THUMB2-LABEL: test_J_4:
+;NO_NEON-LABEL:test_J_4:
+;THUMB1-LABEL: test_J_4:
+;T1POST-LABEL: test_J_4:
+  define void @test_J_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+;ARM:         bne
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.J, align 4
+    call void @use_J(%struct.J* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_J_8:
+;THUMB2-LABEL: test_J_8:
+;NO_NEON-LABEL:test_J_8:
+;THUMB1-LABEL: test_J_8:
+;T1POST-LABEL: test_J_8:
+  define void @test_J_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.J, align 8
+    call void @use_J(%struct.J* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_J_16:
+;THUMB2-LABEL: test_J_16:
+;NO_NEON-LABEL:test_J_16:
+;THUMB1-LABEL: test_J_16:
+;T1POST-LABEL: test_J_16:
+  define void @test_J_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.J, align 16
+    call void @use_J(%struct.J* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_K_1:
+;THUMB2-LABEL: test_K_1:
+;NO_NEON-LABEL:test_K_1:
+;THUMB1-LABEL: test_K_1:
+;T1POST-LABEL: test_K_1:
+  define void @test_K_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;ARM:         bne
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2:      bne
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON:     bne
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+;THUMB1:      bne
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.K, align 1
+    call void @use_K(%struct.K* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_K_2:
+;THUMB2-LABEL: test_K_2:
+;NO_NEON-LABEL:test_K_2:
+;THUMB1-LABEL: test_K_2:
+;T1POST-LABEL: test_K_2:
+  define void @test_K_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;ARM:         bne
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2:      bne
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON:     bne
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+;THUMB1:      bne
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.K, align 2
+    call void @use_K(%struct.K* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_K_4:
+;THUMB2-LABEL: test_K_4:
+;NO_NEON-LABEL:test_K_4:
+;THUMB1-LABEL: test_K_4:
+;T1POST-LABEL: test_K_4:
+  define void @test_K_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+;ARM:         bne
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.K, align 4
+    call void @use_K(%struct.K* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_K_8:
+;THUMB2-LABEL: test_K_8:
+;NO_NEON-LABEL:test_K_8:
+;THUMB1-LABEL: test_K_8:
+;T1POST-LABEL: test_K_8:
+  define void @test_K_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.K, align 8
+    call void @use_K(%struct.K* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_K_16:
+;THUMB2-LABEL: test_K_16:
+;NO_NEON-LABEL:test_K_16:
+;THUMB1-LABEL: test_K_16:
+;T1POST-LABEL: test_K_16:
+  define void @test_K_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.K, align 16
+    call void @use_K(%struct.K* byval align 16 %a)
+    ret void
+  }
+;ARM-LABEL:    test_L_1:
+;THUMB2-LABEL: test_L_1:
+;NO_NEON-LABEL:test_L_1:
+;THUMB1-LABEL: test_L_1:
+;T1POST-LABEL: test_L_1:
+  define void @test_L_1() {
+;ARM:         ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;ARM:         bne
+
+;THUMB2:      ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2:      bne
+
+;NO_NEON:     ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON:     bne
+
+;THUMB1:      ldrb    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #1
+;THUMB1:      bne
+
+;T1POST-NOT:  ldrb    r{{[0-9]+}}, [{{.*}}], #1
+  entry:
+    %a = alloca %struct.L, align 1
+    call void @use_L(%struct.L* byval align 1 %a)
+    ret void
+  }
+;ARM-LABEL:    test_L_2:
+;THUMB2-LABEL: test_L_2:
+;NO_NEON-LABEL:test_L_2:
+;THUMB1-LABEL: test_L_2:
+;T1POST-LABEL: test_L_2:
+  define void @test_L_2() {
+;ARM:         ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;ARM:         bne
+
+;THUMB2:      ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2:      bne
+
+;NO_NEON:     ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON:     bne
+
+;THUMB1:      ldrh    r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #2
+;THUMB1:      bne
+
+;T1POST-NOT:  ldrh    r{{[0-9]+}}, [{{.*}}], #2
+  entry:
+    %a = alloca %struct.L, align 2
+    call void @use_L(%struct.L* byval align 2 %a)
+    ret void
+  }
+;ARM-LABEL:    test_L_4:
+;THUMB2-LABEL: test_L_4:
+;NO_NEON-LABEL:test_L_4:
+;THUMB1-LABEL: test_L_4:
+;T1POST-LABEL: test_L_4:
+  define void @test_L_4() {
+;ARM:         ldr     r{{[0-9]+}}, [{{.*}}], #4
+;ARM:         bne
+
+;THUMB2:      ldr     r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  ldr     r{{[0-9]+}}, [{{.*}}], #4
+  entry:
+    %a = alloca %struct.L, align 4
+    call void @use_L(%struct.L* byval align 4 %a)
+    ret void
+  }
+;ARM-LABEL:    test_L_8:
+;THUMB2-LABEL: test_L_8:
+;NO_NEON-LABEL:test_L_8:
+;THUMB1-LABEL: test_L_8:
+;T1POST-LABEL: test_L_8:
+  define void @test_L_8() {
+;ARM:         vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.L, align 8
+    call void @use_L(%struct.L* byval align 8 %a)
+    ret void
+  }
+;ARM-LABEL:    test_L_16:
+;THUMB2-LABEL: test_L_16:
+;NO_NEON-LABEL:test_L_16:
+;THUMB1-LABEL: test_L_16:
+;T1POST-LABEL: test_L_16:
+  define void @test_L_16() {
+;ARM:         vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM:         bne
+
+;THUMB2:      vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2:      bne
+
+;NO_NEON:     ldr     r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON:     bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1:      ldr     r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1:      adds    [[BASE]], #4
+;THUMB1:      bne
+
+;T1POST-NOT:  vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+  entry:
+    %a = alloca %struct.L, align 16
+    call void @use_L(%struct.L* byval align 16 %a)
+    ret void
+  }
diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll
index 1b411e3..19727da 100644
--- a/test/CodeGen/ARM/sub-cmp-peephole.ll
+++ b/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -1,4 +1,7 @@
 ; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s --check-prefix=V7
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi | FileCheck %s -check-prefix=V8
+
 
 define i32 @f(i32 %a, i32 %b) nounwind ssp {
 entry:
@@ -84,3 +87,60 @@ land.lhs.true:                                    ; preds = %num2long.exit
 if.end11:                                         ; preds = %num2long.exit
   ret i32 23
 }
+
+define float @float_sel(i32 %a, i32 %b, float %x, float %y) {
+entry:
+; CHECK-LABEL: float_sel:
+; CHECK-NOT: cmp
+; V8-LABEL: float_sel:
+; V8-NOT: cmp
+; V8: vseleq.f32
+  %sub = sub i32 %a, %b
+  %cmp = icmp eq i32 %sub, 0
+  %ret = select i1 %cmp, float %x, float %y
+  ret float %ret
+}
+
+define double @double_sel(i32 %a, i32 %b, double %x, double %y) {
+entry:
+; CHECK-LABEL: double_sel:
+; CHECK-NOT: cmp
+; V8-LABEL: double_sel:
+; V8-NOT: cmp
+; V8: vseleq.f64
+  %sub = sub i32 %a, %b
+  %cmp = icmp eq i32 %sub, 0
+  %ret = select i1 %cmp, double %x, double %y
+  ret double %ret
+}
+
+@t = common global i32 0
+define double @double_sub(i32 %a, i32 %b, double %x, double %y) {
+entry:
+; CHECK-LABEL: double_sub:
+; CHECK: subs
+; CHECK-NOT: cmp
+; V8-LABEL: double_sub:
+; V8: vsel
+  %cmp = icmp sgt i32 %a, %b
+  %sub = sub i32 %a, %b
+  store i32 %sub, i32* @t
+  %ret = select i1 %cmp, double %x, double %y
+  ret double %ret
+}
+
+define double @double_sub_swap(i32 %a, i32 %b, double %x, double %y) {
+entry:
+; V7-LABEL: double_sub_swap:
+; V7-NOT: cmp
+; V7: subs
+; V8-LABEL: double_sub_swap:
+; V8-NOT: subs
+; V8: cmp
+; V8: vsel
+  %cmp = icmp sgt i32 %a, %b
+  %sub = sub i32 %b, %a
+  %ret = select i1 %cmp, double %x, double %y
+  store i32 %sub, i32* @t
+  ret double %ret
+}
diff --git a/test/CodeGen/ARM/swift-vldm.ll b/test/CodeGen/ARM/swift-vldm.ll
new file mode 100644
index 0000000..67ae00a
--- /dev/null
+++ b/test/CodeGen/ARM/swift-vldm.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
+
+; Check that we avoid producing vldm instructions using d registers that
+; begin in the most-significant half of a q register. These require more
+; micro-ops on swift and so aren't worth combining.
+
+; CHECK-LABEL: test_vldm
+; CHECK: vldmia r{{[0-9]+}}, {d2, d3, d4}
+; CHECK-NOT: vldmia r{{[0-9]+}}, {d1, d2, d3, d4}
+
+declare fastcc void @force_register(double %d0, double %d1, double %d2, double %d3, double %d4) 
+
+define void @test_vldm(double* %x, double * %y) {
+entry:
+  %addr1 = getelementptr double * %x, i32 1
+  %addr2 = getelementptr double * %x, i32 2
+  %addr3 = getelementptr double * %x, i32 3
+  %d0 = load double * %y
+  %d1 = load double * %x
+  %d2 = load double * %addr1
+  %d3 = load double * %addr2
+  %d4 = load double * %addr3
+  ; We are trying to force x[0-3] in registers d1 to d4 so that we can test we
+  ; don't form a "vldmia rX, {d1, d2, d3, d4}".
+  ; We are relying on the calling convention and that register allocation
+  ; properly coalesces registers.
+  call fastcc void @force_register(double %d0, double %d1, double %d2, double %d3, double %d4)
+  ret void
+}
diff --git a/test/CodeGen/ARM/thumb2-it-block.ll b/test/CodeGen/ARM/thumb2-it-block.ll
index a25352c..47c5dcc 100644
--- a/test/CodeGen/ARM/thumb2-it-block.ll
+++ b/test/CodeGen/ARM/thumb2-it-block.ll
@@ -1,14 +1,15 @@
 ; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
 ; PR11107
 
 define i32 @test(i32 %a, i32 %b) {
 entry:
 ; CHECK:        cmp
 ; CHECK-NEXT:   it    mi
-; CHECK-NEXT:   rsbmi
+; CHECK-NEXT:   rsb{{s?}}mi
 ; CHECK-NEXT:   cmp
 ; CHECK-NEXT:   it    mi
-; CHECK-NEXT:   rsbmi
+; CHECK-NEXT:   rsb{{s?}}mi
  %cmp1 = icmp slt i32 %a, 0
  %sub1 = sub nsw i32 0, %a
  %abs1 = select i1 %cmp1, i32 %sub1, i32 %a
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
index db88a03..6cb26e3 100644
--- a/test/CodeGen/ARM/trap.ll
+++ b/test/CodeGen/ARM/trap.ll
@@ -9,13 +9,13 @@
 ; RUN: llc -mtriple=armv7 -mattr=+nacl-trap -filetype=obj %s -o - \
 ; RUN:  | llvm-objdump -disassemble -triple armv7 -mattr=+nacl-trap - \
 ; RUN:  | FileCheck %s -check-prefix=ENCODING-NACL
-; RUN: llc -fast-isel -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \
+; RUN: llc -verify-machineinstrs -fast-isel -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \
 ; RUN:  | llvm-objdump -disassemble -triple armv7-unknown-nacl - \
 ; RUN:  | FileCheck %s -check-prefix=ENCODING-NACL
 ; RUN: llc -mtriple=armv7 -filetype=obj %s -o - \
 ; RUN:  | llvm-objdump -disassemble -triple armv7 - \
 ; RUN:  | FileCheck %s -check-prefix=ENCODING-ALL
-; RUN: llc -fast-isel -mtriple=armv7 -filetype=obj %s -o - \
+; RUN: llc -verify-machineinstrs -fast-isel -mtriple=armv7 -filetype=obj %s -o - \
 ; RUN:  | llvm-objdump -disassemble -triple armv7 - \
 ; RUN:  | FileCheck %s -check-prefix=ENCODING-ALL
 ; rdar://7961298
diff --git a/test/CodeGen/ARM/vadd.ll b/test/CodeGen/ARM/vadd.ll
index a1ad37b..fcb5408 100644
--- a/test/CodeGen/ARM/vadd.ll
+++ b/test/CodeGen/ARM/vadd.ll
@@ -90,37 +90,6 @@ define <4 x float> @vaddQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
 	ret <4 x float> %tmp3
 }
 
-define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: vaddhni16:
-;CHECK: vaddhn.i16
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: vaddhni32:
-;CHECK: vaddhn.i32
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: vaddhni64:
-;CHECK: vaddhn.i64
-	%tmp1 = load <2 x i64>* %A
-	%tmp2 = load <2 x i64>* %B
-	%tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-declare <8 x i8>  @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-
 define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: vraddhni16:
 ;CHECK: vraddhn.i16
@@ -152,6 +121,33 @@ declare <8 x i8>  @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind rea
 declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
 declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
 
+define <8 x i8> @vaddhni16_natural(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK-LABEL: vaddhni16_natural:
+; CHECK: vaddhn.i16
+  %sum = add <8 x i16> %A, %B
+  %shift = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %trunc = trunc <8 x i16> %shift to <8 x i8>
+  ret <8 x i8> %trunc
+}
+
+define <4 x i16> @vaddhni32_natural(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK-LABEL: vaddhni32_natural:
+; CHECK: vaddhn.i32
+  %sum = add <4 x i32> %A, %B
+  %shift = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+  %trunc = trunc <4 x i32> %shift to <4 x i16>
+  ret <4 x i16> %trunc
+}
+
+define <2 x i32> @vaddhni64_natural(<2 x i64> %A, <2 x i64> %B) nounwind {
+; CHECK-LABEL: vaddhni64_natural:
+; CHECK: vaddhn.i64
+  %sum = add <2 x i64> %A, %B
+  %shift = lshr <2 x i64> %sum, <i64 32, i64 32>
+  %trunc = trunc <2 x i64> %shift to <2 x i32>
+  ret <2 x i32> %trunc
+}
+
 define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: vaddls8:
 ;CHECK: vaddl.s8
diff --git a/test/CodeGen/ARM/vector-DAGCombine.ll b/test/CodeGen/ARM/vector-DAGCombine.ll
index 4221c98..759da22 100644
--- a/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -29,7 +29,7 @@ entry:
 
 ; Radar 8407927: Make sure that VMOVRRD gets optimized away when the result is
 ; converted back to be used as a vector type.
-; CHECK: test_vmovrrd_combine
+; CHECK-LABEL: test_vmovrrd_combine:
 define <4 x i32> @test_vmovrrd_combine() nounwind {
 entry:
   br i1 undef, label %bb1, label %bb2
@@ -136,7 +136,7 @@ define i16 @foldBuildVectors() {
 
 ; Test that we are generating vrev and vext for reverse shuffles of v8i16
 ; shuffles.
-; CHECK: reverse_v8i16
+; CHECK-LABEL: reverse_v8i16:
 define void @reverse_v8i16(<8 x i16>* %loadaddr, <8 x i16>* %storeaddr) {
   %v0 = load <8 x i16>* %loadaddr
   ; CHECK: vrev64.16
@@ -149,7 +149,7 @@ define void @reverse_v8i16(<8 x i16>* %loadaddr, <8 x i16>* %storeaddr) {
 
 ; Test that we are generating vrev and vext for reverse shuffles of v16i8
 ; shuffles.
-; CHECK: reverse_v16i8
+; CHECK-LABEL: reverse_v16i8:
 define void @reverse_v16i8(<16 x i8>* %loadaddr, <16 x i8>* %storeaddr) {
   %v0 = load <16 x i8>* %loadaddr
   ; CHECK: vrev64.8
@@ -165,7 +165,7 @@ define void @reverse_v16i8(<16 x i8>* %loadaddr, <16 x i8>* %storeaddr) {
 ; vldr cannot handle unaligned loads.
 ; Fall back to vld1.32, which can, instead of using the general purpose loads
 ; followed by a costly sequence of instructions to build the vector register.
-; CHECK: t3
+; CHECK-LABEL: t3:
 ; CHECK: vld1.32 {[[REG:d[0-9]+]][0]}
 ; CHECK: vld1.32 {[[REG]][1]}
 ; CHECK: vmull.u8 q{{[0-9]+}}, [[REG]], [[REG]]
@@ -188,7 +188,7 @@ declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)
 ; Check that (insert_vector_elt (load)) => (vector_load).
 ; Thus, check that scalar_to_vector do not interfer with that.
 define <8 x i16> @t4(i8* nocapture %sp0) {
-; CHECK: t4
+; CHECK-LABEL: t4:
 ; CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r0]
 entry:
   %pix_sp0.0.cast = bitcast i8* %sp0 to i32*
@@ -202,7 +202,7 @@ entry:
 ; Make sure vector load is used for all three loads.
 ; Lowering to build vector was breaking the single use property of the load of
 ;  %pix_sp0.0.copyload.
-; CHECK: t5
+; CHECK-LABEL: t5:
 ; CHECK: vld1.32 {[[REG1:d[0-9]+]][1]}, [r0]
 ; CHECK: vorr [[REG2:d[0-9]+]], [[REG1]], [[REG1]]
 ; CHECK: vld1.32 {[[REG1]][0]}, [r1]
@@ -224,3 +224,23 @@ entry:
   %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %0, <8 x i8> %1)
   ret <8 x i16> %vmull.i
 }
+
+; <rdar://problem/14989896> Make sure we manage to truncate a vector from an
+; illegal type to a legal type.
+define <2 x i8> @test_truncate(<2 x i128> %in) {
+; CHECK-LABEL: test_truncate:
+; CHECK: mov [[BASE:r[0-9]+]], sp
+; CHECK-NEXT: vld1.32 {[[REG1:d[0-9]+]][0]}, {{\[}}[[BASE]]:32]
+; CHECK-NEXT: add [[BASE2:r[0-9]+]], [[BASE]], #4
+; CHECK-NEXT: vld1.32 {[[REG1]][1]}, {{\[}}[[BASE2]]:32]
+; REG2 Should map on the same Q register as REG1, i.e., REG2 = REG1 - 1, but we
+; cannot express that.
+; CHECK-NEXT: vmov.32 [[REG2:d[0-9]+]][0], r0
+; CHECK-NEXT: vmov.32 [[REG2]][1], r1
+; The Q register used here should match floor(REG1/2), but we cannot express that.
+; CHECK-NEXT: vmovn.i64 [[RES:d[0-9]+]], q{{[0-9]+}}
+; CHECK-NEXT: vmov r0, r1, [[RES]]
+entry:
+  %res = trunc <2 x i128> %in to <2 x i8>
+  ret <2 x i8> %res
+}
diff --git a/test/CodeGen/ARM/vldm-liveness.ll b/test/CodeGen/ARM/vldm-liveness.ll
new file mode 100644
index 0000000..751f447
--- /dev/null
+++ b/test/CodeGen/ARM/vldm-liveness.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple thumbv7-apple-ios -verify-machineinstrs -o - %s | FileCheck %s
+
+; ARM load store optimizer was dealing with a sequence like:
+;     s1 = VLDRS [r0, 1], Q0<imp-def>
+;     s3 = VLDRS [r0, 2], Q0<imp-use,kill>, Q0<imp-def>
+;     s0 = VLDRS [r0, 0], Q0<imp-use,kill>, Q0<imp-def>
+;     s2 = VLDRS [r0, 4], Q0<imp-use,kill>, Q0<imp-def>
+;
+; It decided to combine the {s0, s1} loads into a single instruction in the
+; third position. However, this leaves the instruction defining s3 with a stray
+; imp-use of Q0, which is undefined.
+;
+; The verifier catches this, so this test just makes sure that appropriate
+; liveness flags are added.
+;
+; I believe the change will be tested as long as the vldmia is not the first of
+; the loads. Earlier optimisations may perturb the output over time, but
+; fiddling the indices should be sufficient to restore the test.
+
+define arm_aapcs_vfpcc <4 x float> @foo(float* %ptr) {
+; CHECK-LABEL: foo:
+; CHECK: vldr s3, [r0, #8]
+; CHECK: vldmia r0, {s0, s1}
+; CHECK: vldr s2, [r0, #16]
+   %off0 = getelementptr float* %ptr, i32 0
+   %val0 = load float* %off0
+   %off1 = getelementptr float* %ptr, i32 1
+   %val1 = load float* %off1
+   %off4 = getelementptr float* %ptr, i32 4
+   %val4 = load float* %off4
+   %off2 = getelementptr float* %ptr, i32 2
+   %val2 = load float* %off2
+
+   %vec1 = insertelement <4 x float> undef, float %val0, i32 0
+   %vec2 = insertelement <4 x float> %vec1, float %val1, i32 1
+   %vec3 = insertelement <4 x float> %vec2, float %val4, i32 2
+   %vec4 = insertelement <4 x float> %vec3, float %val2, i32 3
+
+   ret <4 x float> %vec4
+}
diff --git a/test/CodeGen/ARM/vldm-sched-a9.ll b/test/CodeGen/ARM/vldm-sched-a9.ll
new file mode 100644
index 0000000..d0a9ac6
--- /dev/null
+++ b/test/CodeGen/ARM/vldm-sched-a9.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
+
+; This test will generate spills/fills using vldmia instructions that access 64 bytes of memory.
+; Check that we don't crash when we generate these instructions on Cortex-A9.
+
+; CHECK: test:
+; CHECK: vstmia
+; CHECK: vldmia
+define void @test(i64* %src) #0 {
+entry:
+  %arrayidx39 = getelementptr inbounds i64* %src, i32 13
+  %vecinit285 = shufflevector <16 x i64> undef, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+  store <16 x i64> %vecinit285, <16 x i64>* undef, align 128
+  %0 = load i64* undef, align 8
+  %vecinit379 = insertelement <16 x i64> undef, i64 %0, i32 9
+  %1 = load i64* undef, align 8
+  %vecinit419 = insertelement <16 x i64> undef, i64 %1, i32 15
+  store <16 x i64> %vecinit419, <16 x i64>* undef, align 128
+  %vecinit579 = insertelement <16 x i64> undef, i64 0, i32 4
+  %vecinit582 = shufflevector <16 x i64> %vecinit579, <16 x i64> <i64 6, i64 7, i64 8, i64 9, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit584 = insertelement <16 x i64> %vecinit582, i64 undef, i32 9
+  %vecinit586 = insertelement <16 x i64> %vecinit584, i64 0, i32 10
+  %vecinit589 = shufflevector <16 x i64> %vecinit586, <16 x i64> <i64 12, i64 13, i64 14, i64 15, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 16, i32 17, i32 18, i32 19, i32 undef>
+  %2 = load i64* undef, align 8
+  %vecinit591 = insertelement <16 x i64> %vecinit589, i64 %2, i32 15
+  store <16 x i64> %vecinit591, <16 x i64>* undef, align 128
+  %vecinit694 = shufflevector <16 x i64> undef, <16 x i64> <i64 13, i64 14, i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+  store <16 x i64> %vecinit694, <16 x i64>* undef, align 128
+  %3 = load i64* undef, align 8
+  %vecinit1331 = insertelement <16 x i64> undef, i64 %3, i32 14
+  %4 = load i64* undef, align 8
+  %vecinit1468 = insertelement <16 x i64> undef, i64 %4, i32 11
+  %vecinit1471 = shufflevector <16 x i64> %vecinit1468, <16 x i64> <i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 undef, i32 undef>
+  %vecinit1474 = shufflevector <16 x i64> %vecinit1471, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+  store <16 x i64> %vecinit1474, <16 x i64>* undef, align 128
+  %vecinit1552 = shufflevector <16 x i64> undef, <16 x i64> <i64 10, i64 11, i64 12, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit1555 = shufflevector <16 x i64> %vecinit1552, <16 x i64> <i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 undef, i32 undef>
+  %vecinit1558 = shufflevector <16 x i64> %vecinit1555, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+  store <16 x i64> %vecinit1558, <16 x i64>* undef, align 128
+  %vecinit1591 = shufflevector <16 x i64> undef, <16 x i64> <i64 3, i64 4, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit1594 = shufflevector <16 x i64> %vecinit1591, <16 x i64> <i64 5, i64 6, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit1597 = shufflevector <16 x i64> %vecinit1594, <16 x i64> <i64 7, i64 8, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit1599 = insertelement <16 x i64> %vecinit1597, i64 undef, i32 8
+  %vecinit1601 = insertelement <16 x i64> %vecinit1599, i64 undef, i32 9
+  %vecinit1603 = insertelement <16 x i64> %vecinit1601, i64 undef, i32 10
+  %5 = load i64* undef, align 8
+  %vecinit1605 = insertelement <16 x i64> %vecinit1603, i64 %5, i32 11
+  %vecinit1608 = shufflevector <16 x i64> %vecinit1605, <16 x i64> <i64 13, i64 14, i64 15, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 undef>
+  %6 = load i64* undef, align 8
+  %vecinit1610 = insertelement <16 x i64> %vecinit1608, i64 %6, i32 15
+  store <16 x i64> %vecinit1610, <16 x i64>* undef, align 128
+  %vecinit2226 = shufflevector <16 x i64> undef, <16 x i64> <i64 6, i64 7, i64 8, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %7 = load i64* undef, align 8
+  %vecinit2228 = insertelement <16 x i64> %vecinit2226, i64 %7, i32 8
+  %vecinit2230 = insertelement <16 x i64> %vecinit2228, i64 undef, i32 9
+  %vecinit2233 = shufflevector <16 x i64> %vecinit2230, <16 x i64> <i64 11, i64 12, i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef>
+  %vecinit2236 = shufflevector <16 x i64> %vecinit2233, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+  store <16 x i64> %vecinit2236, <16 x i64>* undef, align 128
+  %vecinit2246 = shufflevector <16 x i64> undef, <16 x i64> <i64 4, i64 5, i64 6, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit2249 = shufflevector <16 x i64> %vecinit2246, <16 x i64> <i64 7, i64 8, i64 9, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit2252 = shufflevector <16 x i64> %vecinit2249, <16 x i64> <i64 10, i64 11, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %vecinit2255 = shufflevector <16 x i64> %vecinit2252, <16 x i64> <i64 12, i64 13, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 16, i32 17, i32 undef, i32 undef, i32 undef>
+  %8 = load i64* %arrayidx39, align 8
+  %vecinit2257 = insertelement <16 x i64> %vecinit2255, i64 %8, i32 13
+  %vecinit2260 = shufflevector <16 x i64> %vecinit2257, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+  store <16 x i64> %vecinit2260, <16 x i64>* null, align 128
+  ret void
+}
+attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/vminmaxnm.ll b/test/CodeGen/ARM/vminmaxnm.ll
index afa73b9..f6ce64c 100644
--- a/test/CodeGen/ARM/vminmaxnm.ll
+++ b/test/CodeGen/ARM/vminmaxnm.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple armv8 -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-FAST
 
 define <4 x float> @vmaxnmq(<4 x float>* %A, <4 x float>* %B) nounwind {
 ; CHECK: vmaxnmq
@@ -36,6 +37,51 @@ define <2 x float> @vminnmd(<2 x float>* %A, <2 x float>* %B) nounwind {
   ret <2 x float> %tmp3
 }
 
+define float @fp-armv8_vminnm_o(float %a, float %b) {
+; CHECK-FAST: fp-armv8_vminnm_o
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f32
+; CHECK: fp-armv8_vminnm_o
+; CHECK-NOT: vminnm.f32
+  %cmp = fcmp olt float %a, %b
+  %cond = select i1 %cmp, float %a, float %b
+  ret float %cond
+}
+
+define float @fp-armv8_vminnm_u(float %a, float %b) {
+; CHECK-FAST: fp-armv8_vminnm_u
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f32
+; CHECK: fp-armv8_vminnm_u
+; CHECK-NOT: vminnm.f32
+  %cmp = fcmp ult float %a, %b
+  %cond = select i1 %cmp, float %a, float %b
+  ret float %cond
+}
+
+define float @fp-armv8_vmaxnm_o(float %a, float %b) {
+; CHECK-FAST: fp-armv8_vmaxnm_o
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK: fp-armv8_vmaxnm_o
+; CHECK-NOT: vmaxnm.f32
+  %cmp = fcmp ogt float %a, %b
+  %cond = select i1 %cmp, float %a, float %b
+  ret float %cond
+}
+
+define float @fp-armv8_vmaxnm_u(float %a, float %b) {
+; CHECK-FAST: fp-armv8_vmaxnm_u
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK: fp-armv8_vmaxnm_u
+; CHECK-NOT: vmaxnm.f32
+  %cmp = fcmp ugt float %a, %b
+  %cond = select i1 %cmp, float %a, float %b
+  ret float %cond
+}
+
+
 declare <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
 declare <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
 declare <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 6210ad3..de329ac 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -515,6 +515,17 @@ entry:
   ret void
 }
 
+define <8 x i8> @no_distribute(<8 x i8> %a, <8 x i8> %b) nounwind {
+entry:
+; CHECK: no_distribute
+; CHECK: vadd.i8
+; CHECK: vmul.i8
+; CHECK-NOT: vmla.i8
+  %0 = add <8 x i8> %a, %b
+  %1 = mul <8x i8> %0, %0
+  ret <8 x i8> %1
+}
+
 ; If one operand has a zero-extend and the other a sign-extend, vmull
 ; cannot be used.
 define i16 @vmullWithInconsistentExtensions(<8 x i8> %vec) {
@@ -623,3 +634,21 @@ entry:
   store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4
   ret void
 }
+
+define void @foo(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
+;   Look for doing a normal scalar FP load rather than an to-all-lanes load.
+;   e.g., "ldr s0, [r2]" rathern than "vld1.32  {d18[], d19[]}, [r2:32]"
+;   Then check that the vector multiply has folded the splat to all lanes
+;   and used a vector * scalar instruction.
+; CHECK: vldr  {{s[0-9]+}}, [r2]
+; CHECK: vmul.f32  q8, q8, d0[0]
+  %tmp = load float* %src, align 4
+  %tmp5 = load <4 x float>* %a, align 4
+  %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
+  %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
+  %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
+  %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
+  %tmp10 = fmul <4 x float> %tmp9, %tmp5
+  store <4 x float> %tmp10, <4 x float>* %dst, align 4
+  ret void
+}
diff --git a/test/CodeGen/ARM/vqdmul.ll b/test/CodeGen/ARM/vqdmul.ll
index a28cae9..d298167 100644
--- a/test/CodeGen/ARM/vqdmul.ll
+++ b/test/CodeGen/ARM/vqdmul.ll
@@ -197,84 +197,92 @@ entry:
 declare <4 x i32>  @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
 declare <2 x i64>  @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
 
-define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
-;CHECK-LABEL: vqdmlals16:
+define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK-LABEL: vqdmlals16_natural:
 ;CHECK: vqdmlal.s16
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = load <4 x i16>* %C
-	%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
-	ret <4 x i32> %tmp4
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = load <4 x i16>* %C
+        %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
+        %tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
+        ret <4 x i32> %tmp5
 }
 
-define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
-;CHECK-LABEL: vqdmlals32:
+define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK-LABEL: vqdmlals32_natural:
 ;CHECK: vqdmlal.s32
-	%tmp1 = load <2 x i64>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = load <2 x i32>* %C
-	%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
-	ret <2 x i64> %tmp4
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = load <2 x i32>* %C
+        %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
+        %tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
+        ret <2 x i64> %tmp5
 }
 
-define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
 entry:
-; CHECK: test_vqdmlal_lanes16
+; CHECK-LABEL: test_vqdmlal_lanes16_natural:
 ; CHECK: vqdmlal.s16 q0, d2, d3[1]
   %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
-  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %1
+  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
+  %2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
+  ret <4 x i32> %2
 }
 
-define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
 entry:
-; CHECK: test_vqdmlal_lanes32
+; CHECK-LABEL: test_vqdmlal_lanes32_natural:
 ; CHECK: vqdmlal.s32 q0, d2, d3[1]
   %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
-  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
+  %2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
+  ret <2 x i64> %2
 }
 
-declare <4 x i32>  @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64>  @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32>  @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64>  @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
-define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
-;CHECK-LABEL: vqdmlsls16:
+define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK-LABEL: vqdmlsls16_natural:
 ;CHECK: vqdmlsl.s16
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = load <4 x i16>* %C
-	%tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
-	ret <4 x i32> %tmp4
+        %tmp1 = load <4 x i32>* %A
+        %tmp2 = load <4 x i16>* %B
+        %tmp3 = load <4 x i16>* %C
+        %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
+        %tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
+        ret <4 x i32> %tmp5
 }
 
-define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
-;CHECK-LABEL: vqdmlsls32:
+define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK-LABEL: vqdmlsls32_natural:
 ;CHECK: vqdmlsl.s32
-	%tmp1 = load <2 x i64>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = load <2 x i32>* %C
-	%tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
-	ret <2 x i64> %tmp4
+        %tmp1 = load <2 x i64>* %A
+        %tmp2 = load <2 x i32>* %B
+        %tmp3 = load <2 x i32>* %C
+        %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
+        %tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
+        ret <2 x i64> %tmp5
 }
 
-define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
 entry:
-; CHECK: test_vqdmlsl_lanes16
+; CHECK-LABEL: test_vqdmlsl_lanes16_natural:
 ; CHECK: vqdmlsl.s16 q0, d2, d3[1]
   %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
-  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %1
+  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
+  %2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
+  ret <4 x i32> %2
 }
 
-define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
 entry:
-; CHECK: test_vqdmlsl_lanes32
+; CHECK-LABEL: test_vqdmlsl_lanes32_natural:
 ; CHECK: vqdmlsl.s32 q0, d2, d3[1]
   %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
-  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %1
+  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
+  %2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
+  ret <2 x i64> %2
 }
 
-declare <4 x i32>  @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64>  @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32>  @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64>  @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsel.ll b/test/CodeGen/ARM/vsel.ll
new file mode 100644
index 0000000..7e1f714
--- /dev/null
+++ b/test/CodeGen/ARM/vsel.ll
@@ -0,0 +1,309 @@
+; RUN: llc < %s -mtriple=armv8-linux-gnueabihf -mattr=+fp-armv8 -float-abi=hard | FileCheck %s
+@varfloat = global float 0.0
+@vardouble = global double 0.0
+define void @test_vsel32sgt(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32sgt
+  %tst1 = icmp sgt i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f32 s0, s0, s1
+  ret void
+}
+define void @test_vsel64sgt(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64sgt
+  %tst1 = icmp sgt i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f64 d16, d0, d1
+  ret void
+}
+define void @test_vsel32sge(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32sge
+  %tst1 = icmp sge i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselge.f32 s0, s0, s1
+  ret void
+}
+define void @test_vsel64sge(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64sge
+  %tst1 = icmp sge i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselge.f64 d16, d0, d1
+  ret void
+}
+define void @test_vsel32eq(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32eq
+  %tst1 = icmp eq i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vseleq.f32 s0, s0, s1
+  ret void
+}
+define void @test_vsel64eq(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64eq
+  %tst1 = icmp eq i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vseleq.f64 d16, d0, d1
+  ret void
+}
+define void @test_vsel32slt(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32slt
+  %tst1 = icmp slt i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f32 s0, s1, s0
+  ret void
+}
+define void @test_vsel64slt(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64slt
+  %tst1 = icmp slt i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f64 d16, d1, d0
+  ret void
+}
+define void @test_vsel32sle(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32sle
+  %tst1 = icmp sle i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselge.f32 s0, s1, s0
+  ret void
+}
+define void @test_vsel64sle(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64sle
+  %tst1 = icmp sle i32 %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselge.f64 d16, d1, d0
+  ret void
+}
+define void @test_vsel32ogt(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ogt
+  %tst1 = fcmp ogt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64ogt(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ogt
+  %tst1 = fcmp ogt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f64 d16, d1, d2
+  ret void
+}
+define void @test_vsel32oge(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32oge
+  %tst1 = fcmp oge float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64oge(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64oge
+  %tst1 = fcmp oge float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f64 d16, d1, d2
+  ret void
+}
+define void @test_vsel32oeq(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32oeq
+  %tst1 = fcmp oeq float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64oeq(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64oeq
+  %tst1 = fcmp oeq float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f64 d16, d1, d2
+  ret void
+}
+define void @test_vsel32ugt(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ugt
+  %tst1 = fcmp ugt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64ugt(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ugt
+  %tst1 = fcmp ugt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32uge(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32uge
+  %tst1 = fcmp uge float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64uge(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64uge
+  %tst1 = fcmp uge float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32olt(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32olt
+  %tst1 = fcmp olt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64olt(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64olt
+  %tst1 = fcmp olt float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f64 d16, d1, d2
+  ret void
+}
+define void @test_vsel32ult(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ult
+  %tst1 = fcmp ult float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64ult(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ult
+  %tst1 = fcmp ult float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32ole(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ole
+  %tst1 = fcmp ole float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64ole(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ole
+  %tst1 = fcmp ole float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f64 d16, d1, d2
+  ret void
+}
+define void @test_vsel32ule(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ule
+  %tst1 = fcmp ule float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64ule(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ule
+  %tst1 = fcmp ule float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32ord(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ord
+  %tst1 = fcmp ord float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64ord(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ord
+  %tst1 = fcmp ord float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32une(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32une
+  %tst1 = fcmp une float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f32 s0, s3, s2
+  ret void
+}
+define void @test_vsel64une(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64une
+  %tst1 = fcmp une float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f64 d16, d2, d1
+  ret void
+}
+define void @test_vsel32uno(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32uno
+  %tst1 = fcmp uno float %lhs32, %rhs32
+  %val1 = select i1 %tst1, float %a, float %b
+  store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f32 s0, s2, s3
+  ret void
+}
+define void @test_vsel64uno(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64uno
+  %tst1 = fcmp uno float %lhs32, %rhs32
+  %val1 = select i1 %tst1, double %a, double %b
+  store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f64 d16, d1, d2
+  ret void
+}
diff --git a/test/CodeGen/ARM/vstlane.ll b/test/CodeGen/ARM/vstlane.ll
index 651b6d5..34c5c70 100644
--- a/test/CodeGen/ARM/vstlane.ll
+++ b/test/CodeGen/ARM/vstlane.ll
@@ -13,7 +13,7 @@ define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind {
 ;Check for a post-increment updating store.
 define void @vst1lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: vst1lanei8_update:
-;CHECK: vst1.8 {d16[3]}, [r2]!
+;CHECK: vst1.8 {d16[3]}, [{{r[0-9]}}]!
 	%A = load i8** %ptr
 	%tmp1 = load <8 x i8>* %B
 	%tmp2 = extractelement <8 x i8> %tmp1, i32 3
diff --git a/test/CodeGen/ARM/vsub.ll b/test/CodeGen/ARM/vsub.ll
index 89c3095..6b95b97 100644
--- a/test/CodeGen/ARM/vsub.ll
+++ b/test/CodeGen/ARM/vsub.ll
@@ -90,37 +90,33 @@ define <4 x float> @vsubQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
 	ret <4 x float> %tmp3
 }
 
-define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: vsubhni16:
-;CHECK: vsubhn.i16
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i8> %tmp3
+define <8 x i8> @vsubhni16_natural(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK-LABEL: vsubhni16_natural:
+; CHECK: vsubhn.i16
+  %sum = sub <8 x i16> %A, %B
+  %shift = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %trunc = trunc <8 x i16> %shift to <8 x i8>
+  ret <8 x i8> %trunc
 }
 
-define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: vsubhni32:
-;CHECK: vsubhn.i32
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i16> %tmp3
+define <4 x i16> @vsubhni32_natural(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK-LABEL: vsubhni32_natural:
+; CHECK: vsubhn.i32
+  %sum = sub <4 x i32> %A, %B
+  %shift = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+  %trunc = trunc <4 x i32> %shift to <4 x i16>
+  ret <4 x i16> %trunc
 }
 
-define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: vsubhni64:
-;CHECK: vsubhn.i64
-	%tmp1 = load <2 x i64>* %A
-	%tmp2 = load <2 x i64>* %B
-	%tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
-	ret <2 x i32> %tmp3
+define <2 x i32> @vsubhni64_natural(<2 x i64> %A, <2 x i64> %B) nounwind {
+; CHECK-LABEL: vsubhni64_natural:
+; CHECK: vsubhn.i64
+  %sum = sub <2 x i64> %A, %B
+  %shift = lshr <2 x i64> %sum, <i64 32, i64 32>
+  %trunc = trunc <2 x i64> %shift to <2 x i32>
+  ret <2 x i32> %trunc
 }
 
-declare <8 x i8>  @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-
 define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: vrsubhni16:
 ;CHECK: vrsubhn.i16
diff --git a/test/CodeGen/CPP/lit.local.cfg b/test/CodeGen/CPP/lit.local.cfg
index 4d4b4a4..4063dd1 100644
--- a/test/CodeGen/CPP/lit.local.cfg
+++ b/test/CodeGen/CPP/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'CppBackend' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll b/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
index 6281ada..3f17ce1 100644
--- a/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
+++ b/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
@@ -63,30 +63,58 @@ bb47:		; preds = %bb46, %bb44
 	br label %bb44
 }
 
-declare i32 @pthread_once(i32*, void ()*)
+define i32 @pthread_once(i32*, void ()*) {
+  ret i32 0
+}
 
-declare i8* @pthread_getspecific(i32)
+define i8* @pthread_getspecific(i32) {
+  ret i8* null
+}
 
-declare i32 @pthread_setspecific(i32, i8*)
+define i32 @pthread_setspecific(i32, i8*) {
+  ret i32 0
+}
 
-declare i32 @pthread_create(i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)
+define i32 @pthread_create(i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*) {
+  ret i32 0
+}
 
-declare i32 @pthread_cancel(i64)
+define i32 @pthread_cancel(i64) {
+  ret i32 0
+}
 
-declare i32 @pthread_mutex_lock(%struct.pthread_mutex_t*)
+define i32 @pthread_mutex_lock(%struct.pthread_mutex_t*) {
+  ret i32 0
+}
 
-declare i32 @pthread_mutex_trylock(%struct.pthread_mutex_t*)
+define i32 @pthread_mutex_trylock(%struct.pthread_mutex_t*) {
+  ret i32 0
+}
 
-declare i32 @pthread_mutex_unlock(%struct.pthread_mutex_t*)
+define i32 @pthread_mutex_unlock(%struct.pthread_mutex_t*) {
+  ret i32 0
+}
 
-declare i32 @pthread_mutex_init(%struct.pthread_mutex_t*, %struct.Alignment*)
+define i32 @pthread_mutex_init(%struct.pthread_mutex_t*, %struct.Alignment*) {
+  ret i32 0
+}
 
-declare i32 @pthread_key_create(i32*, void (i8*)*)
+define i32 @pthread_key_create(i32*, void (i8*)*) {
+  ret i32 0
+}
 
-declare i32 @pthread_key_delete(i32)
+define i32 @pthread_key_delete(i32) {
+  ret i32 0
+}
 
-declare i32 @pthread_mutexattr_init(%struct.Alignment*)
+define i32 @pthread_mutexattr_init(%struct.Alignment*) {
+  ret i32 0
+}
 
-declare i32 @pthread_mutexattr_settype(%struct.Alignment*, i32)
+define i32 @pthread_mutexattr_settype(%struct.Alignment*, i32) {
+  ret i32 0
+}
 
-declare i32 @pthread_mutexattr_destroy(%struct.Alignment*)
+define i32 @pthread_mutexattr_destroy(%struct.Alignment*) {
+  ret i32 0
+}
diff --git a/test/CodeGen/Generic/crash.ll b/test/CodeGen/Generic/crash.ll
index d3fc204..8de6b0d 100644
--- a/test/CodeGen/Generic/crash.ll
+++ b/test/CodeGen/Generic/crash.ll
@@ -23,7 +23,7 @@ bb32:                                             ; preds = %bb6
 %3 = load double* %1, align 4
 %4 = load double* %0, align 4
 call void @Parse_Vector(double* %0) nounwind
-%5 = call i32 @llvm.objectsize.i32(i8* undef, i1 false)
+%5 = call i32 @llvm.objectsize.i32.p0i8(i8* undef, i1 false)
 %6 = icmp eq i32 %5, -1
 br i1 %6, label %bb34, label %bb33
 
@@ -36,7 +36,7 @@ unreachable
 }
 
 declare void @Parse_Vector(double*)
-declare i32 @llvm.objectsize.i32(i8*, i1)
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1)
 
 
 ; PR9578
diff --git a/test/CodeGen/Generic/lit.local.cfg b/test/CodeGen/Generic/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/CodeGen/Generic/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/CodeGen/Hexagon/BranchPredict.ll b/test/CodeGen/Hexagon/BranchPredict.ll
index 716e85d..4ab1966 100644
--- a/test/CodeGen/Hexagon/BranchPredict.ll
+++ b/test/CodeGen/Hexagon/BranchPredict.ll
@@ -53,7 +53,7 @@ return:                                           ; preds = %if.else, %if.then
 define i32 @foo_bar(i32 %a, i16 signext %b) nounwind {
 ; CHECK: if{{ *}}(!cmp.eq(r{{[0-9]*}}.new, #0)) jump:nt
 entry:
-  %0 = load i32* @j, align 4, !tbaa !2
+  %0 = load i32* @j, align 4
   %tobool = icmp eq i32 %0, 0
   br i1 %tobool, label %if.else, label %if.then, !prof !0
 
@@ -74,6 +74,3 @@ return:                                           ; preds = %if.else, %if.then
 
 !0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
 !1 = metadata !{metadata !"branch_weights", i32 4, i32 64}
-!2 = metadata !{metadata !"int", metadata !3}
-!3 = metadata !{metadata !"omnipotent char", metadata !4}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/combine_ir.ll b/test/CodeGen/Hexagon/combine_ir.ll
index 8b99ef7..e100cf7 100644
--- a/test/CodeGen/Hexagon/combine_ir.ll
+++ b/test/CodeGen/Hexagon/combine_ir.ll
@@ -4,7 +4,7 @@
 
 define void @word(i32* nocapture %a) nounwind {
 entry:
-  %0 = load i32* %a, align 4, !tbaa !0
+  %0 = load i32* %a, align 4
   %1 = zext i32 %0 to i64
   tail call void @bar(i64 %1) nounwind
   ret void
@@ -17,10 +17,10 @@ declare void @bar(i64)
 
 define void @halfword(i16* nocapture %a) nounwind {
 entry:
-  %0 = load i16* %a, align 2, !tbaa !3
+  %0 = load i16* %a, align 2
   %1 = zext i16 %0 to i64
   %add.ptr = getelementptr inbounds i16* %a, i32 1
-  %2 = load i16* %add.ptr, align 2, !tbaa !3
+  %2 = load i16* %add.ptr, align 2
   %3 = zext i16 %2 to i64
   %4 = shl nuw nsw i64 %3, 16
   %ins = or i64 %4, %1
@@ -33,18 +33,13 @@ entry:
 
 define void @byte(i8* nocapture %a) nounwind {
 entry:
-  %0 = load i8* %a, align 1, !tbaa !1
+  %0 = load i8* %a, align 1
   %1 = zext i8 %0 to i64
   %add.ptr = getelementptr inbounds i8* %a, i32 1
-  %2 = load i8* %add.ptr, align 1, !tbaa !1
+  %2 = load i8* %add.ptr, align 1
   %3 = zext i8 %2 to i64
   %4 = shl nuw nsw i64 %3, 8
   %ins = or i64 %4, %1
   tail call void @bar(i64 %ins) nounwind
   ret void
 }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll
index fce6d19..bfdd813 100644
--- a/test/CodeGen/Hexagon/hwloop-dbg.ll
+++ b/test/CodeGen/Hexagon/hwloop-dbg.ll
@@ -35,13 +35,14 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!29}
 
 !0 = metadata !{i32 786449, metadata !28, i32 12, metadata !"QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] [DW_LANG_C99]
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !28, null, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32*)* @foo, null, null, metadata !11, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
 !6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9, metadata !9}
 !9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
 !10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
@@ -60,3 +61,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !26 = metadata !{i32 3, i32 23, metadata !20, null}
 !27 = metadata !{i32 6, i32 1, metadata !16, null}
 !28 = metadata !{metadata !"hwloop-dbg.c", metadata !"/usr2/kparzysz/s.hex/t"}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/Hexagon/lit.local.cfg b/test/CodeGen/Hexagon/lit.local.cfg
index 24324b2..e96bab8 100644
--- a/test/CodeGen/Hexagon/lit.local.cfg
+++ b/test/CodeGen/Hexagon/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'Hexagon' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/Hexagon/memops.ll b/test/CodeGen/Hexagon/memops.ll
index 5498848..fca1a73 100644
--- a/test/CodeGen/Hexagon/memops.ll
+++ b/test/CodeGen/Hexagon/memops.ll
@@ -4,11 +4,11 @@
 define void @memop_unsigned_char_add5(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %conv = zext i8 %0 to i32
   %add = add nsw i32 %conv, 5
   %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %p, align 1, !tbaa !0
+  store i8 %conv1, i8* %p, align 1
   ret void
 }
 
@@ -16,11 +16,11 @@ define void @memop_unsigned_char_add(i8* nocapture %p, i8 zeroext %x) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
   %conv = zext i8 %x to i32
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %conv1 = zext i8 %0 to i32
   %add = add nsw i32 %conv1, %conv
   %conv2 = trunc i32 %add to i8
-  store i8 %conv2, i8* %p, align 1, !tbaa !0
+  store i8 %conv2, i8* %p, align 1
   ret void
 }
 
@@ -28,51 +28,51 @@ define void @memop_unsigned_char_sub(i8* nocapture %p, i8 zeroext %x) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
   %conv = zext i8 %x to i32
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %conv1 = zext i8 %0 to i32
   %sub = sub nsw i32 %conv1, %conv
   %conv2 = trunc i32 %sub to i8
-  store i8 %conv2, i8* %p, align 1, !tbaa !0
+  store i8 %conv2, i8* %p, align 1
   ret void
 }
 
 define void @memop_unsigned_char_or(i8* nocapture %p, i8 zeroext %x) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %or3 = or i8 %0, %x
-  store i8 %or3, i8* %p, align 1, !tbaa !0
+  store i8 %or3, i8* %p, align 1
   ret void
 }
 
 define void @memop_unsigned_char_and(i8* nocapture %p, i8 zeroext %x) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %and3 = and i8 %0, %x
-  store i8 %and3, i8* %p, align 1, !tbaa !0
+  store i8 %and3, i8* %p, align 1
   ret void
 }
 
 define void @memop_unsigned_char_clrbit(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %conv = zext i8 %0 to i32
   %and = and i32 %conv, 223
   %conv1 = trunc i32 %and to i8
-  store i8 %conv1, i8* %p, align 1, !tbaa !0
+  store i8 %conv1, i8* %p, align 1
   ret void
 }
 
 define void @memop_unsigned_char_setbit(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %conv = zext i8 %0 to i32
   %or = or i32 %conv, 128
   %conv1 = trunc i32 %or to i8
-  store i8 %conv1, i8* %p, align 1, !tbaa !0
+  store i8 %conv1, i8* %p, align 1
   ret void
 }
 
@@ -80,11 +80,11 @@ define void @memop_unsigned_char_add5_index(i8* nocapture %p, i32 %i) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv = zext i8 %0 to i32
   %add = add nsw i32 %conv, 5
   %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
@@ -93,11 +93,11 @@ entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
   %conv = zext i8 %x to i32
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv1 = zext i8 %0 to i32
   %add = add nsw i32 %conv1, %conv
   %conv2 = trunc i32 %add to i8
-  store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv2, i8* %add.ptr, align 1
   ret void
 }
 
@@ -106,11 +106,11 @@ entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
   %conv = zext i8 %x to i32
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv1 = zext i8 %0 to i32
   %sub = sub nsw i32 %conv1, %conv
   %conv2 = trunc i32 %sub to i8
-  store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv2, i8* %add.ptr, align 1
   ret void
 }
 
@@ -118,9 +118,9 @@ define void @memop_unsigned_char_or_index(i8* nocapture %p, i32 %i, i8 zeroext %
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %or3 = or i8 %0, %x
-  store i8 %or3, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %or3, i8* %add.ptr, align 1
   ret void
 }
 
@@ -128,9 +128,9 @@ define void @memop_unsigned_char_and_index(i8* nocapture %p, i32 %i, i8 zeroext
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %and3 = and i8 %0, %x
-  store i8 %and3, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %and3, i8* %add.ptr, align 1
   ret void
 }
 
@@ -138,11 +138,11 @@ define void @memop_unsigned_char_clrbit_index(i8* nocapture %p, i32 %i) nounwind
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv = zext i8 %0 to i32
   %and = and i32 %conv, 223
   %conv1 = trunc i32 %and to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
@@ -150,11 +150,11 @@ define void @memop_unsigned_char_setbit_index(i8* nocapture %p, i32 %i) nounwind
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv = zext i8 %0 to i32
   %or = or i32 %conv, 128
   %conv1 = trunc i32 %or to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
@@ -162,11 +162,11 @@ define void @memop_unsigned_char_add5_index5(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv = zext i8 %0 to i32
   %add = add nsw i32 %conv, 5
   %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
@@ -175,11 +175,11 @@ entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}r{{[0-9]+}}
   %conv = zext i8 %x to i32
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv1 = zext i8 %0 to i32
   %add = add nsw i32 %conv1, %conv
   %conv2 = trunc i32 %add to i8
-  store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv2, i8* %add.ptr, align 1
   ret void
 }
 
@@ -188,11 +188,11 @@ entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}-={{ *}}r{{[0-9]+}}
   %conv = zext i8 %x to i32
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv1 = zext i8 %0 to i32
   %sub = sub nsw i32 %conv1, %conv
   %conv2 = trunc i32 %sub to i8
-  store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv2, i8* %add.ptr, align 1
   ret void
 }
 
@@ -200,9 +200,9 @@ define void @memop_unsigned_char_or_index5(i8* nocapture %p, i8 zeroext %x) noun
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %or3 = or i8 %0, %x
-  store i8 %or3, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %or3, i8* %add.ptr, align 1
   ret void
 }
 
@@ -210,9 +210,9 @@ define void @memop_unsigned_char_and_index5(i8* nocapture %p, i8 zeroext %x) nou
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %and3 = and i8 %0, %x
-  store i8 %and3, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %and3, i8* %add.ptr, align 1
   ret void
 }
 
@@ -220,11 +220,11 @@ define void @memop_unsigned_char_clrbit_index5(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv = zext i8 %0 to i32
   %and = and i32 %conv, 223
   %conv1 = trunc i32 %and to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
@@ -232,22 +232,22 @@ define void @memop_unsigned_char_setbit_index5(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv = zext i8 %0 to i32
   %or = or i32 %conv, 128
   %conv1 = trunc i32 %or to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
 define void @memop_signed_char_add5(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %conv2 = zext i8 %0 to i32
   %add = add nsw i32 %conv2, 5
   %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %p, align 1, !tbaa !0
+  store i8 %conv1, i8* %p, align 1
   ret void
 }
 
@@ -255,11 +255,11 @@ define void @memop_signed_char_add(i8* nocapture %p, i8 signext %x) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
   %conv4 = zext i8 %x to i32
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %conv13 = zext i8 %0 to i32
   %add = add nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %add to i8
-  store i8 %conv2, i8* %p, align 1, !tbaa !0
+  store i8 %conv2, i8* %p, align 1
   ret void
 }
 
@@ -267,51 +267,51 @@ define void @memop_signed_char_sub(i8* nocapture %p, i8 signext %x) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
   %conv4 = zext i8 %x to i32
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %conv13 = zext i8 %0 to i32
   %sub = sub nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %sub to i8
-  store i8 %conv2, i8* %p, align 1, !tbaa !0
+  store i8 %conv2, i8* %p, align 1
   ret void
 }
 
 define void @memop_signed_char_or(i8* nocapture %p, i8 signext %x) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %or3 = or i8 %0, %x
-  store i8 %or3, i8* %p, align 1, !tbaa !0
+  store i8 %or3, i8* %p, align 1
   ret void
 }
 
 define void @memop_signed_char_and(i8* nocapture %p, i8 signext %x) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %and3 = and i8 %0, %x
-  store i8 %and3, i8* %p, align 1, !tbaa !0
+  store i8 %and3, i8* %p, align 1
   ret void
 }
 
 define void @memop_signed_char_clrbit(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %conv2 = zext i8 %0 to i32
   %and = and i32 %conv2, 223
   %conv1 = trunc i32 %and to i8
-  store i8 %conv1, i8* %p, align 1, !tbaa !0
+  store i8 %conv1, i8* %p, align 1
   ret void
 }
 
 define void @memop_signed_char_setbit(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
-  %0 = load i8* %p, align 1, !tbaa !0
+  %0 = load i8* %p, align 1
   %conv2 = zext i8 %0 to i32
   %or = or i32 %conv2, 128
   %conv1 = trunc i32 %or to i8
-  store i8 %conv1, i8* %p, align 1, !tbaa !0
+  store i8 %conv1, i8* %p, align 1
   ret void
 }
 
@@ -319,11 +319,11 @@ define void @memop_signed_char_add5_index(i8* nocapture %p, i32 %i) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv2 = zext i8 %0 to i32
   %add = add nsw i32 %conv2, 5
   %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
@@ -332,11 +332,11 @@ entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
   %conv4 = zext i8 %x to i32
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv13 = zext i8 %0 to i32
   %add = add nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %add to i8
-  store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv2, i8* %add.ptr, align 1
   ret void
 }
 
@@ -345,11 +345,11 @@ entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
   %conv4 = zext i8 %x to i32
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv13 = zext i8 %0 to i32
   %sub = sub nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %sub to i8
-  store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv2, i8* %add.ptr, align 1
   ret void
 }
 
@@ -357,9 +357,9 @@ define void @memop_signed_char_or_index(i8* nocapture %p, i32 %i, i8 signext %x)
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %or3 = or i8 %0, %x
-  store i8 %or3, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %or3, i8* %add.ptr, align 1
   ret void
 }
 
@@ -367,9 +367,9 @@ define void @memop_signed_char_and_index(i8* nocapture %p, i32 %i, i8 signext %x
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %and3 = and i8 %0, %x
-  store i8 %and3, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %and3, i8* %add.ptr, align 1
   ret void
 }
 
@@ -377,11 +377,11 @@ define void @memop_signed_char_clrbit_index(i8* nocapture %p, i32 %i) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv2 = zext i8 %0 to i32
   %and = and i32 %conv2, 223
   %conv1 = trunc i32 %and to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
@@ -389,11 +389,11 @@ define void @memop_signed_char_setbit_index(i8* nocapture %p, i32 %i) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i8* %p, i32 %i
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv2 = zext i8 %0 to i32
   %or = or i32 %conv2, 128
   %conv1 = trunc i32 %or to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
@@ -401,11 +401,11 @@ define void @memop_signed_char_add5_index5(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv2 = zext i8 %0 to i32
   %add = add nsw i32 %conv2, 5
   %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
@@ -414,11 +414,11 @@ entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}r{{[0-9]+}}
   %conv4 = zext i8 %x to i32
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv13 = zext i8 %0 to i32
   %add = add nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %add to i8
-  store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv2, i8* %add.ptr, align 1
   ret void
 }
 
@@ -427,11 +427,11 @@ entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}-={{ *}}r{{[0-9]+}}
   %conv4 = zext i8 %x to i32
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv13 = zext i8 %0 to i32
   %sub = sub nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %sub to i8
-  store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv2, i8* %add.ptr, align 1
   ret void
 }
 
@@ -439,9 +439,9 @@ define void @memop_signed_char_or_index5(i8* nocapture %p, i8 signext %x) nounwi
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %or3 = or i8 %0, %x
-  store i8 %or3, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %or3, i8* %add.ptr, align 1
   ret void
 }
 
@@ -449,9 +449,9 @@ define void @memop_signed_char_and_index5(i8* nocapture %p, i8 signext %x) nounw
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %and3 = and i8 %0, %x
-  store i8 %and3, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %and3, i8* %add.ptr, align 1
   ret void
 }
 
@@ -459,11 +459,11 @@ define void @memop_signed_char_clrbit_index5(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv2 = zext i8 %0 to i32
   %and = and i32 %conv2, 223
   %conv1 = trunc i32 %and to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
@@ -471,22 +471,22 @@ define void @memop_signed_char_setbit_index5(i8* nocapture %p) nounwind {
 entry:
 ; CHECK:  memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i8* %p, i32 5
-  %0 = load i8* %add.ptr, align 1, !tbaa !0
+  %0 = load i8* %add.ptr, align 1
   %conv2 = zext i8 %0 to i32
   %or = or i32 %conv2, 128
   %conv1 = trunc i32 %or to i8
-  store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+  store i8 %conv1, i8* %add.ptr, align 1
   ret void
 }
 
 define void @memop_unsigned_short_add5(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %conv = zext i16 %0 to i32
   %add = add nsw i32 %conv, 5
   %conv1 = trunc i32 %add to i16
-  store i16 %conv1, i16* %p, align 2, !tbaa !2
+  store i16 %conv1, i16* %p, align 2
   ret void
 }
 
@@ -494,11 +494,11 @@ define void @memop_unsigned_short_add(i16* nocapture %p, i16 zeroext %x) nounwin
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
   %conv = zext i16 %x to i32
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %conv1 = zext i16 %0 to i32
   %add = add nsw i32 %conv1, %conv
   %conv2 = trunc i32 %add to i16
-  store i16 %conv2, i16* %p, align 2, !tbaa !2
+  store i16 %conv2, i16* %p, align 2
   ret void
 }
 
@@ -506,51 +506,51 @@ define void @memop_unsigned_short_sub(i16* nocapture %p, i16 zeroext %x) nounwin
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
   %conv = zext i16 %x to i32
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %conv1 = zext i16 %0 to i32
   %sub = sub nsw i32 %conv1, %conv
   %conv2 = trunc i32 %sub to i16
-  store i16 %conv2, i16* %p, align 2, !tbaa !2
+  store i16 %conv2, i16* %p, align 2
   ret void
 }
 
 define void @memop_unsigned_short_or(i16* nocapture %p, i16 zeroext %x) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %or3 = or i16 %0, %x
-  store i16 %or3, i16* %p, align 2, !tbaa !2
+  store i16 %or3, i16* %p, align 2
   ret void
 }
 
 define void @memop_unsigned_short_and(i16* nocapture %p, i16 zeroext %x) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %and3 = and i16 %0, %x
-  store i16 %and3, i16* %p, align 2, !tbaa !2
+  store i16 %and3, i16* %p, align 2
   ret void
 }
 
 define void @memop_unsigned_short_clrbit(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %conv = zext i16 %0 to i32
   %and = and i32 %conv, 65503
   %conv1 = trunc i32 %and to i16
-  store i16 %conv1, i16* %p, align 2, !tbaa !2
+  store i16 %conv1, i16* %p, align 2
   ret void
 }
 
 define void @memop_unsigned_short_setbit(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %conv = zext i16 %0 to i32
   %or = or i32 %conv, 128
   %conv1 = trunc i32 %or to i16
-  store i16 %conv1, i16* %p, align 2, !tbaa !2
+  store i16 %conv1, i16* %p, align 2
   ret void
 }
 
@@ -558,11 +558,11 @@ define void @memop_unsigned_short_add5_index(i16* nocapture %p, i32 %i) nounwind
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv = zext i16 %0 to i32
   %add = add nsw i32 %conv, 5
   %conv1 = trunc i32 %add to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
@@ -571,11 +571,11 @@ entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
   %conv = zext i16 %x to i32
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv1 = zext i16 %0 to i32
   %add = add nsw i32 %conv1, %conv
   %conv2 = trunc i32 %add to i16
-  store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv2, i16* %add.ptr, align 2
   ret void
 }
 
@@ -584,11 +584,11 @@ entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
   %conv = zext i16 %x to i32
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv1 = zext i16 %0 to i32
   %sub = sub nsw i32 %conv1, %conv
   %conv2 = trunc i32 %sub to i16
-  store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv2, i16* %add.ptr, align 2
   ret void
 }
 
@@ -596,9 +596,9 @@ define void @memop_unsigned_short_or_index(i16* nocapture %p, i32 %i, i16 zeroex
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %or3 = or i16 %0, %x
-  store i16 %or3, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %or3, i16* %add.ptr, align 2
   ret void
 }
 
@@ -606,9 +606,9 @@ define void @memop_unsigned_short_and_index(i16* nocapture %p, i32 %i, i16 zeroe
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %and3 = and i16 %0, %x
-  store i16 %and3, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %and3, i16* %add.ptr, align 2
   ret void
 }
 
@@ -616,11 +616,11 @@ define void @memop_unsigned_short_clrbit_index(i16* nocapture %p, i32 %i) nounwi
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv = zext i16 %0 to i32
   %and = and i32 %conv, 65503
   %conv1 = trunc i32 %and to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
@@ -628,11 +628,11 @@ define void @memop_unsigned_short_setbit_index(i16* nocapture %p, i32 %i) nounwi
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv = zext i16 %0 to i32
   %or = or i32 %conv, 128
   %conv1 = trunc i32 %or to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
@@ -640,11 +640,11 @@ define void @memop_unsigned_short_add5_index5(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv = zext i16 %0 to i32
   %add = add nsw i32 %conv, 5
   %conv1 = trunc i32 %add to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
@@ -653,11 +653,11 @@ entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}r{{[0-9]+}}
   %conv = zext i16 %x to i32
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv1 = zext i16 %0 to i32
   %add = add nsw i32 %conv1, %conv
   %conv2 = trunc i32 %add to i16
-  store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv2, i16* %add.ptr, align 2
   ret void
 }
 
@@ -666,11 +666,11 @@ entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}-={{ *}}r{{[0-9]+}}
   %conv = zext i16 %x to i32
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv1 = zext i16 %0 to i32
   %sub = sub nsw i32 %conv1, %conv
   %conv2 = trunc i32 %sub to i16
-  store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv2, i16* %add.ptr, align 2
   ret void
 }
 
@@ -678,9 +678,9 @@ define void @memop_unsigned_short_or_index5(i16* nocapture %p, i16 zeroext %x) n
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %or3 = or i16 %0, %x
-  store i16 %or3, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %or3, i16* %add.ptr, align 2
   ret void
 }
 
@@ -688,9 +688,9 @@ define void @memop_unsigned_short_and_index5(i16* nocapture %p, i16 zeroext %x)
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %and3 = and i16 %0, %x
-  store i16 %and3, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %and3, i16* %add.ptr, align 2
   ret void
 }
 
@@ -698,11 +698,11 @@ define void @memop_unsigned_short_clrbit_index5(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv = zext i16 %0 to i32
   %and = and i32 %conv, 65503
   %conv1 = trunc i32 %and to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
@@ -710,22 +710,22 @@ define void @memop_unsigned_short_setbit_index5(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv = zext i16 %0 to i32
   %or = or i32 %conv, 128
   %conv1 = trunc i32 %or to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
 define void @memop_signed_short_add5(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %conv2 = zext i16 %0 to i32
   %add = add nsw i32 %conv2, 5
   %conv1 = trunc i32 %add to i16
-  store i16 %conv1, i16* %p, align 2, !tbaa !2
+  store i16 %conv1, i16* %p, align 2
   ret void
 }
 
@@ -733,11 +733,11 @@ define void @memop_signed_short_add(i16* nocapture %p, i16 signext %x) nounwind
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
   %conv4 = zext i16 %x to i32
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %conv13 = zext i16 %0 to i32
   %add = add nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %add to i16
-  store i16 %conv2, i16* %p, align 2, !tbaa !2
+  store i16 %conv2, i16* %p, align 2
   ret void
 }
 
@@ -745,51 +745,51 @@ define void @memop_signed_short_sub(i16* nocapture %p, i16 signext %x) nounwind
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
   %conv4 = zext i16 %x to i32
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %conv13 = zext i16 %0 to i32
   %sub = sub nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %sub to i16
-  store i16 %conv2, i16* %p, align 2, !tbaa !2
+  store i16 %conv2, i16* %p, align 2
   ret void
 }
 
 define void @memop_signed_short_or(i16* nocapture %p, i16 signext %x) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %or3 = or i16 %0, %x
-  store i16 %or3, i16* %p, align 2, !tbaa !2
+  store i16 %or3, i16* %p, align 2
   ret void
 }
 
 define void @memop_signed_short_and(i16* nocapture %p, i16 signext %x) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %and3 = and i16 %0, %x
-  store i16 %and3, i16* %p, align 2, !tbaa !2
+  store i16 %and3, i16* %p, align 2
   ret void
 }
 
 define void @memop_signed_short_clrbit(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %conv2 = zext i16 %0 to i32
   %and = and i32 %conv2, 65503
   %conv1 = trunc i32 %and to i16
-  store i16 %conv1, i16* %p, align 2, !tbaa !2
+  store i16 %conv1, i16* %p, align 2
   ret void
 }
 
 define void @memop_signed_short_setbit(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
-  %0 = load i16* %p, align 2, !tbaa !2
+  %0 = load i16* %p, align 2
   %conv2 = zext i16 %0 to i32
   %or = or i32 %conv2, 128
   %conv1 = trunc i32 %or to i16
-  store i16 %conv1, i16* %p, align 2, !tbaa !2
+  store i16 %conv1, i16* %p, align 2
   ret void
 }
 
@@ -797,11 +797,11 @@ define void @memop_signed_short_add5_index(i16* nocapture %p, i32 %i) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv2 = zext i16 %0 to i32
   %add = add nsw i32 %conv2, 5
   %conv1 = trunc i32 %add to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
@@ -810,11 +810,11 @@ entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
   %conv4 = zext i16 %x to i32
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv13 = zext i16 %0 to i32
   %add = add nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %add to i16
-  store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv2, i16* %add.ptr, align 2
   ret void
 }
 
@@ -823,11 +823,11 @@ entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
   %conv4 = zext i16 %x to i32
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv13 = zext i16 %0 to i32
   %sub = sub nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %sub to i16
-  store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv2, i16* %add.ptr, align 2
   ret void
 }
 
@@ -835,9 +835,9 @@ define void @memop_signed_short_or_index(i16* nocapture %p, i32 %i, i16 signext
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %or3 = or i16 %0, %x
-  store i16 %or3, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %or3, i16* %add.ptr, align 2
   ret void
 }
 
@@ -845,9 +845,9 @@ define void @memop_signed_short_and_index(i16* nocapture %p, i32 %i, i16 signext
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %and3 = and i16 %0, %x
-  store i16 %and3, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %and3, i16* %add.ptr, align 2
   ret void
 }
 
@@ -855,11 +855,11 @@ define void @memop_signed_short_clrbit_index(i16* nocapture %p, i32 %i) nounwind
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv2 = zext i16 %0 to i32
   %and = and i32 %conv2, 65503
   %conv1 = trunc i32 %and to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
@@ -867,11 +867,11 @@ define void @memop_signed_short_setbit_index(i16* nocapture %p, i32 %i) nounwind
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i16* %p, i32 %i
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv2 = zext i16 %0 to i32
   %or = or i32 %conv2, 128
   %conv1 = trunc i32 %or to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
@@ -879,11 +879,11 @@ define void @memop_signed_short_add5_index5(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv2 = zext i16 %0 to i32
   %add = add nsw i32 %conv2, 5
   %conv1 = trunc i32 %add to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
@@ -892,11 +892,11 @@ entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}r{{[0-9]+}}
   %conv4 = zext i16 %x to i32
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv13 = zext i16 %0 to i32
   %add = add nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %add to i16
-  store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv2, i16* %add.ptr, align 2
   ret void
 }
 
@@ -905,11 +905,11 @@ entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}-={{ *}}r{{[0-9]+}}
   %conv4 = zext i16 %x to i32
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv13 = zext i16 %0 to i32
   %sub = sub nsw i32 %conv13, %conv4
   %conv2 = trunc i32 %sub to i16
-  store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv2, i16* %add.ptr, align 2
   ret void
 }
 
@@ -917,9 +917,9 @@ define void @memop_signed_short_or_index5(i16* nocapture %p, i16 signext %x) nou
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %or3 = or i16 %0, %x
-  store i16 %or3, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %or3, i16* %add.ptr, align 2
   ret void
 }
 
@@ -927,9 +927,9 @@ define void @memop_signed_short_and_index5(i16* nocapture %p, i16 signext %x) no
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %and3 = and i16 %0, %x
-  store i16 %and3, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %and3, i16* %add.ptr, align 2
   ret void
 }
 
@@ -937,11 +937,11 @@ define void @memop_signed_short_clrbit_index5(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv2 = zext i16 %0 to i32
   %and = and i32 %conv2, 65503
   %conv1 = trunc i32 %and to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
@@ -949,74 +949,74 @@ define void @memop_signed_short_setbit_index5(i16* nocapture %p) nounwind {
 entry:
 ; CHECK:  memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i16* %p, i32 5
-  %0 = load i16* %add.ptr, align 2, !tbaa !2
+  %0 = load i16* %add.ptr, align 2
   %conv2 = zext i16 %0 to i32
   %or = or i32 %conv2, 128
   %conv1 = trunc i32 %or to i16
-  store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+  store i16 %conv1, i16* %add.ptr, align 2
   ret void
 }
 
 define void @memop_signed_int_add5(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %add = add i32 %0, 5
-  store i32 %add, i32* %p, align 4, !tbaa !3
+  store i32 %add, i32* %p, align 4
   ret void
 }
 
 define void @memop_signed_int_add(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %add = add i32 %0, %x
-  store i32 %add, i32* %p, align 4, !tbaa !3
+  store i32 %add, i32* %p, align 4
   ret void
 }
 
 define void @memop_signed_int_sub(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %sub = sub i32 %0, %x
-  store i32 %sub, i32* %p, align 4, !tbaa !3
+  store i32 %sub, i32* %p, align 4
   ret void
 }
 
 define void @memop_signed_int_or(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %or = or i32 %0, %x
-  store i32 %or, i32* %p, align 4, !tbaa !3
+  store i32 %or, i32* %p, align 4
   ret void
 }
 
 define void @memop_signed_int_and(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %and = and i32 %0, %x
-  store i32 %and, i32* %p, align 4, !tbaa !3
+  store i32 %and, i32* %p, align 4
   ret void
 }
 
 define void @memop_signed_int_clrbit(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %and = and i32 %0, -33
-  store i32 %and, i32* %p, align 4, !tbaa !3
+  store i32 %and, i32* %p, align 4
   ret void
 }
 
 define void @memop_signed_int_setbit(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %or = or i32 %0, 128
-  store i32 %or, i32* %p, align 4, !tbaa !3
+  store i32 %or, i32* %p, align 4
   ret void
 }
 
@@ -1024,9 +1024,9 @@ define void @memop_signed_int_add5_index(i32* nocapture %p, i32 %i) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %add = add i32 %0, 5
-  store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %add, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1034,9 +1034,9 @@ define void @memop_signed_int_add_index(i32* nocapture %p, i32 %i, i32 %x) nounw
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %add = add i32 %0, %x
-  store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %add, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1044,9 +1044,9 @@ define void @memop_signed_int_sub_index(i32* nocapture %p, i32 %i, i32 %x) nounw
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %sub = sub i32 %0, %x
-  store i32 %sub, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %sub, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1054,9 +1054,9 @@ define void @memop_signed_int_or_index(i32* nocapture %p, i32 %i, i32 %x) nounwi
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %or = or i32 %0, %x
-  store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %or, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1064,9 +1064,9 @@ define void @memop_signed_int_and_index(i32* nocapture %p, i32 %i, i32 %x) nounw
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %and = and i32 %0, %x
-  store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %and, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1074,9 +1074,9 @@ define void @memop_signed_int_clrbit_index(i32* nocapture %p, i32 %i) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %and = and i32 %0, -33
-  store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %and, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1084,9 +1084,9 @@ define void @memop_signed_int_setbit_index(i32* nocapture %p, i32 %i) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %or = or i32 %0, 128
-  store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %or, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1094,9 +1094,9 @@ define void @memop_signed_int_add5_index5(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %add = add i32 %0, 5
-  store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %add, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1104,9 +1104,9 @@ define void @memop_signed_int_add_index5(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %add = add i32 %0, %x
-  store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %add, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1114,9 +1114,9 @@ define void @memop_signed_int_sub_index5(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}-={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %sub = sub i32 %0, %x
-  store i32 %sub, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %sub, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1124,9 +1124,9 @@ define void @memop_signed_int_or_index5(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %or = or i32 %0, %x
-  store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %or, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1134,9 +1134,9 @@ define void @memop_signed_int_and_index5(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %and = and i32 %0, %x
-  store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %and, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1144,9 +1144,9 @@ define void @memop_signed_int_clrbit_index5(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %and = and i32 %0, -33
-  store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %and, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1154,72 +1154,72 @@ define void @memop_signed_int_setbit_index5(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %or = or i32 %0, 128
-  store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %or, i32* %add.ptr, align 4
   ret void
 }
 
 define void @memop_unsigned_int_add5(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %add = add nsw i32 %0, 5
-  store i32 %add, i32* %p, align 4, !tbaa !3
+  store i32 %add, i32* %p, align 4
   ret void
 }
 
 define void @memop_unsigned_int_add(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %add = add nsw i32 %0, %x
-  store i32 %add, i32* %p, align 4, !tbaa !3
+  store i32 %add, i32* %p, align 4
   ret void
 }
 
 define void @memop_unsigned_int_sub(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %sub = sub nsw i32 %0, %x
-  store i32 %sub, i32* %p, align 4, !tbaa !3
+  store i32 %sub, i32* %p, align 4
   ret void
 }
 
 define void @memop_unsigned_int_or(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %or = or i32 %0, %x
-  store i32 %or, i32* %p, align 4, !tbaa !3
+  store i32 %or, i32* %p, align 4
   ret void
 }
 
 define void @memop_unsigned_int_and(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %and = and i32 %0, %x
-  store i32 %and, i32* %p, align 4, !tbaa !3
+  store i32 %and, i32* %p, align 4
   ret void
 }
 
 define void @memop_unsigned_int_clrbit(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %and = and i32 %0, -33
-  store i32 %and, i32* %p, align 4, !tbaa !3
+  store i32 %and, i32* %p, align 4
   ret void
 }
 
 define void @memop_unsigned_int_setbit(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
-  %0 = load i32* %p, align 4, !tbaa !3
+  %0 = load i32* %p, align 4
   %or = or i32 %0, 128
-  store i32 %or, i32* %p, align 4, !tbaa !3
+  store i32 %or, i32* %p, align 4
   ret void
 }
 
@@ -1227,9 +1227,9 @@ define void @memop_unsigned_int_add5_index(i32* nocapture %p, i32 %i) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %add = add nsw i32 %0, 5
-  store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %add, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1237,9 +1237,9 @@ define void @memop_unsigned_int_add_index(i32* nocapture %p, i32 %i, i32 %x) nou
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %add = add nsw i32 %0, %x
-  store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %add, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1247,9 +1247,9 @@ define void @memop_unsigned_int_sub_index(i32* nocapture %p, i32 %i, i32 %x) nou
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %sub = sub nsw i32 %0, %x
-  store i32 %sub, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %sub, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1257,9 +1257,9 @@ define void @memop_unsigned_int_or_index(i32* nocapture %p, i32 %i, i32 %x) noun
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %or = or i32 %0, %x
-  store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %or, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1267,9 +1267,9 @@ define void @memop_unsigned_int_and_index(i32* nocapture %p, i32 %i, i32 %x) nou
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %and = and i32 %0, %x
-  store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %and, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1277,9 +1277,9 @@ define void @memop_unsigned_int_clrbit_index(i32* nocapture %p, i32 %i) nounwind
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %and = and i32 %0, -33
-  store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %and, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1287,9 +1287,9 @@ define void @memop_unsigned_int_setbit_index(i32* nocapture %p, i32 %i) nounwind
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i32* %p, i32 %i
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %or = or i32 %0, 128
-  store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %or, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1297,9 +1297,9 @@ define void @memop_unsigned_int_add5_index5(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}#5
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %add = add nsw i32 %0, 5
-  store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %add, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1307,9 +1307,9 @@ define void @memop_unsigned_int_add_index5(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %add = add nsw i32 %0, %x
-  store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %add, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1317,9 +1317,9 @@ define void @memop_unsigned_int_sub_index5(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}-={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %sub = sub nsw i32 %0, %x
-  store i32 %sub, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %sub, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1327,9 +1327,9 @@ define void @memop_unsigned_int_or_index5(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}|={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %or = or i32 %0, %x
-  store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %or, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1337,9 +1337,9 @@ define void @memop_unsigned_int_and_index5(i32* nocapture %p, i32 %x) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}&={{ *}}r{{[0-9]+}}
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %and = and i32 %0, %x
-  store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %and, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1347,9 +1347,9 @@ define void @memop_unsigned_int_clrbit_index5(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %and = and i32 %0, -33
-  store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %and, i32* %add.ptr, align 4
   ret void
 }
 
@@ -1357,13 +1357,8 @@ define void @memop_unsigned_int_setbit_index5(i32* nocapture %p) nounwind {
 entry:
 ; CHECK:  memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
   %add.ptr = getelementptr inbounds i32* %p, i32 5
-  %0 = load i32* %add.ptr, align 4, !tbaa !3
+  %0 = load i32* %add.ptr, align 4
   %or = or i32 %0, 128
-  store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+  store i32 %or, i32* %add.ptr, align 4
   ret void
 }
-
-!0 = metadata !{metadata !"omnipotent char", metadata !1}
-!1 = metadata !{metadata !"Simple C/C++ TBAA"}
-!2 = metadata !{metadata !"short", metadata !0}
-!3 = metadata !{metadata !"int", metadata !0}
diff --git a/test/CodeGen/Hexagon/union-1.ll b/test/CodeGen/Hexagon/union-1.ll
index 7c6da74..fe79f95 100644
--- a/test/CodeGen/Hexagon/union-1.ll
+++ b/test/CodeGen/Hexagon/union-1.ll
@@ -5,10 +5,10 @@
 
 define void @word(i32* nocapture %a) nounwind {
 entry:
-  %0 = load i32* %a, align 4, !tbaa !0
+  %0 = load i32* %a, align 4
   %1 = zext i32 %0 to i64
   %add.ptr = getelementptr inbounds i32* %a, i32 1
-  %2 = load i32* %add.ptr, align 4, !tbaa !0
+  %2 = load i32* %add.ptr, align 4
   %3 = zext i32 %2 to i64
   %4 = shl nuw i64 %3, 32
   %ins = or i64 %4, %1
@@ -17,7 +17,3 @@ entry:
 }
 
 declare void @bar(i64)
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Inputs/DbgValueOtherTargets.ll b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
index f35a5d1..953e576 100644
--- a/test/CodeGen/Inputs/DbgValueOtherTargets.ll
+++ b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
@@ -12,11 +12,12 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!13}
 
 !0 = metadata !{i32 786478, metadata !12, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !12, i32 12, metadata !"clang version 2.9 (trunk 120996)", i1 false, metadata !"", i32 0, metadata !6, metadata !6, metadata !11, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, metadata !12, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 0}
@@ -26,3 +27,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !10 = metadata !{i32 4, i32 2, metadata !8, null}
 !11 = metadata !{metadata !0}
 !12 = metadata !{metadata !"/tmp/x.c", metadata !"/Users/manav"}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/MSP430/cc_args.ll b/test/CodeGen/MSP430/cc_args.ll
new file mode 100644
index 0000000..39e99e2
--- /dev/null
+++ b/test/CodeGen/MSP430/cc_args.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+define void @test() #0 {
+entry:
+; CHECK: test:
+
+; CHECK: mov.w #1, r15
+; CHECK: call #f_i16
+  call void @f_i16(i16 1)
+
+; CHECK: mov.w #772, r14
+; CHECK: mov.w #258, r15
+; CHECK: call #f_i32
+  call void @f_i32(i32 16909060)
+
+; CHECK: mov.w #1800, r12
+; CHECK: mov.w #1286, r13
+; CHECK: mov.w #772, r14
+; CHECK: mov.w #258, r15
+; CHECK: call #f_i64
+  call void @f_i64(i64 72623859790382856)
+
+; CHECK: mov.w #772, r14
+; CHECK: mov.w #258, r15
+; CHECK: mov.w #1800, r12
+; CHECK: mov.w #1286, r13
+; CHECK: call #f_i32_i32
+  call void @f_i32_i32(i32 16909060, i32 84281096)
+
+; CHECK: mov.w #1, r15
+; CHECK: mov.w #772, r13
+; CHECK: mov.w #258, r14
+; CHECK: mov.w #2, r12
+; CHECK: call #f_i16_i32_i16
+  call void @f_i16_i32_i16(i16 1, i32 16909060, i16 2)
+
+; CHECK: mov.w #2, 8(r1)
+; CHECK: mov.w #258, 6(r1)
+; CHECK: mov.w #772, 4(r1)
+; CHECK: mov.w #1286, 2(r1)
+; CHECK: mov.w #1800, 0(r1)
+; CHECK: mov.w #1, r15
+; CHECK: call #f_i16_i64_i16
+  call void @f_i16_i64_i16(i16 1, i64 72623859790382856, i16 2)
+
+  ret void
+}
+
+@g_i16 = common global i16 0, align 2
+@g_i32 = common global i32 0, align 2
+@g_i64 = common global i64 0, align 2
+
+define void @f_i16(i16 %a) #0 {
+; CHECK: f_i16:
+; CHECK: mov.w r15, &g_i16
+  store volatile i16 %a, i16* @g_i16, align 2
+  ret void
+}
+
+define void @f_i32(i32 %a) #0 {
+; CHECK: f_i32:
+; CHECK: mov.w r15, &g_i32+2
+; CHECK: mov.w r14, &g_i32
+  store volatile i32 %a, i32* @g_i32, align 2
+  ret void
+}
+
+define void @f_i64(i64 %a) #0 {
+; CHECK: f_i64:
+; CHECK: mov.w r15, &g_i64+6
+; CHECK: mov.w r14, &g_i64+4
+; CHECK: mov.w r13, &g_i64+2
+; CHECK: mov.w r12, &g_i64
+  store volatile i64 %a, i64* @g_i64, align 2
+  ret void
+}
+
+define void @f_i32_i32(i32 %a, i32 %b) #0 {
+; CHECK: f_i32_i32:
+; CHECK: mov.w r15, &g_i32+2
+; CHECK: mov.w r14, &g_i32
+  store volatile i32 %a, i32* @g_i32, align 2
+; CHECK: mov.w r13, &g_i32+2
+; CHECK: mov.w r12, &g_i32
+  store volatile i32 %b, i32* @g_i32, align 2
+  ret void
+}
+
+define void @f_i16_i32_i16(i16 %a, i32 %b, i16 %c) #0 {
+; CHECK: f_i16_i32_i16:
+; CHECK: mov.w r15, &g_i16
+  store volatile i16 %a, i16* @g_i16, align 2
+; CHECK: mov.w r14, &g_i32+2
+; CHECK: mov.w r13, &g_i32
+  store volatile i32 %b, i32* @g_i32, align 2
+; CHECK: mov.w r12, &g_i16
+  store volatile i16 %c, i16* @g_i16, align 2
+  ret void
+}
+
+define void @f_i16_i64_i16(i16 %a, i64 %b, i16 %c) #0 {
+; CHECK: f_i16_i64_i16:
+; CHECK: mov.w r15, &g_i16
+  store volatile i16 %a, i16* @g_i16, align 2
+;CHECK: mov.w 10(r4), &g_i64+6
+;CHECK: mov.w 8(r4), &g_i64+4
+;CHECK: mov.w 6(r4), &g_i64+2
+;CHECK: mov.w 4(r4), &g_i64
+  store volatile i64 %b, i64* @g_i64, align 2
+;CHECK: mov.w 12(r4), &g_i16
+  store volatile i16 %c, i16* @g_i16, align 2
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/MSP430/cc_ret.ll b/test/CodeGen/MSP430/cc_ret.ll
new file mode 100644
index 0000000..c2a9ae6
--- /dev/null
+++ b/test/CodeGen/MSP430/cc_ret.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+define void @test() #0 {
+entry:
+; CHECK: test:
+
+; CHECK: call #f_i16
+; CHECK: mov.w r15, &g_i16
+  %0 = call i16 @f_i16()
+  store volatile i16 %0, i16* @g_i16
+
+; CHECK: call #f_i32
+; CHECK: mov.w r15, &g_i32+2
+; CHECK: mov.w r14, &g_i32
+  %1 = call i32 @f_i32()
+  store volatile i32 %1, i32* @g_i32
+
+; CHECK: call #f_i64
+; CHECK: mov.w r15, &g_i64+6
+; CHECK: mov.w r14, &g_i64+4
+; CHECK: mov.w r13, &g_i64+2
+; CHECK: mov.w r12, &g_i64
+  %2 = call i64 @f_i64()
+  store volatile i64 %2, i64* @g_i64
+
+  ret void
+}
+
+@g_i16 = common global i16 0, align 2
+@g_i32 = common global i32 0, align 2
+@g_i64 = common global i64 0, align 2
+
+define i16 @f_i16() #0 {
+; CHECK: f_i16:
+; CHECK: mov.w #1, r15
+; CHECK: ret
+  ret i16 1
+}
+
+define i32 @f_i32() #0 {
+; CHECK: f_i32:
+; CHECK: mov.w #772, r14
+; CHECK: mov.w #258, r15
+; CHECK: ret
+  ret i32 16909060
+}
+
+define i64 @f_i64() #0 {
+; CHECK: f_i64:
+; CHECK: mov.w #1800, r12
+; CHECK: mov.w #1286, r13
+; CHECK: mov.w #772, r14
+; CHECK: mov.w #258, r15
+; CHECK: ret
+  ret i64 72623859790382856
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/MSP430/lit.local.cfg b/test/CodeGen/MSP430/lit.local.cfg
index 0ca9fc9..a18fe6f 100644
--- a/test/CodeGen/MSP430/lit.local.cfg
+++ b/test/CodeGen/MSP430/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
 targets = set(config.root.targets_to_build.split())
 if not 'MSP430' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/MSP430/transient-stack-alignment.ll b/test/CodeGen/MSP430/transient-stack-alignment.ll
new file mode 100644
index 0000000..cca8350
--- /dev/null
+++ b/test/CodeGen/MSP430/transient-stack-alignment.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+define void @test() #0 {
+; CHECK-LABEL: test:
+; CHECK: sub.w #2, r1
+  %1 = alloca i8, align 1
+; CHECK-NEXT: mov.b #0, 1(r1)
+  store i8 0, i8* %1, align 1
+; CHECK-NEXT: add.w #2, r1
+; CHECK-NEXT: ret
+  ret void
+}
+
+attributes #0 = { nounwind "no-frame-pointer-elim"="false" }
diff --git a/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll b/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
index 8479ad2..3381143 100644
--- a/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
+++ b/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s 
 ; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s 
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 -soft-float -mips16-hard-float   < %s | FileCheck %s 
 
 define signext i8 @A(i8 %e.0, i8 signext %sum)  nounwind {
 entry:
diff --git a/test/CodeGen/Mips/2008-08-01-AsmInline.ll b/test/CodeGen/Mips/2008-08-01-AsmInline.ll
index dbde742..e274bc0 100644
--- a/test/CodeGen/Mips/2008-08-01-AsmInline.ll
+++ b/test/CodeGen/Mips/2008-08-01-AsmInline.ll
@@ -51,3 +51,21 @@ entry:
   ret void
 }
 
+; Check that RA doesn't allocate registers in the clobber list.
+; CHECK-LABEL: foo4:
+; CHECK: #APP
+; CHECK-NOT: ulh $2
+; CHECK: #NO_APP
+; CHECK: #APP
+; CHECK-NOT: $f0
+; CHECK: #NO_APP
+
+define void @foo4() {
+entry:
+  %0 = tail call i32 asm sideeffect "ulh $0,16($$sp)\0A\09", "=r,~{$2}"()
+  store i32 %0, i32* @gi2, align 4
+  %1 = load float* @gf0, align 4
+  %2 = tail call double asm sideeffect "cvt.d.s $0, $1\0A\09", "=f,f,~{$f0}"(float %1)
+  store double %2, double* @gd0, align 8
+  ret void
+}
diff --git a/test/CodeGen/Mips/2013-11-18-fp64-const0.ll b/test/CodeGen/Mips/2013-11-18-fp64-const0.ll
new file mode 100644
index 0000000..f8390d9
--- /dev/null
+++ b/test/CodeGen/Mips/2013-11-18-fp64-const0.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=mips -mattr=-fp64 < %s | FileCheck -check-prefix=CHECK-FP32 %s
+; RUN: llc -march=mips -mattr=+fp64 < %s | FileCheck -check-prefix=CHECK-FP64 %s
+
+; This test case is a simplified version of an llvm-stress generated test with
+; seed=3718491962.
+; It originally failed on MIPS32 with FP64 with the following error:
+;     LLVM ERROR: ran out of registers during register allocation
+; This was caused by impossible register class restrictions caused by the use
+; of BuildPairF64 instead of BuildPairF64_64.
+
+define void @autogen_SD3718491962() {
+BB:
+  ; CHECK-FP32: mtc1 $zero, $f{{[0-3]*[02468]}}
+  ; CHECK-FP32: mtc1 $zero, $f{{[0-3]*[13579]}}
+
+  ; CHECK-FP64: mtc1 $zero, $f{{[0-9]+}}
+  ; CHECK-FP64-NOT: mtc1 $zero,
+  ; FIXME: A redundant mthc1 is currently emitted. Add a -NOT when it is
+  ;        eliminated
+
+  %Cmp = fcmp ule double 0.000000e+00, undef
+  %Cmp11 = fcmp ueq double 0xFDBD965CF1BB7FDA, undef
+  br label %CF88
+
+CF88:                                             ; preds = %CF86
+  %Sl18 = select i1 %Cmp, i1 %Cmp11, i1 %Cmp
+  br i1 %Sl18, label %CF88, label %CF85
+
+CF85:                                             ; preds = %CF88
+  ret void
+}
diff --git a/test/CodeGen/Mips/beqzc.ll b/test/CodeGen/Mips/beqzc.ll
new file mode 100644
index 0000000..4a294c2
--- /dev/null
+++ b/test/CodeGen/Mips/beqzc.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands   < %s | FileCheck %s -check-prefix=cond-b-short
+
+@i = global i32 0, align 4
+@j = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define i32 @main() #0 {
+entry:
+  %0 = load i32* @i, align 4
+  %cmp = icmp eq i32 %0, 0
+  %. = select i1 %cmp, i32 10, i32 55
+  store i32 %., i32* @j, align 4
+; cond-b-short: 	beqz	${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}}  # 16 bit inst
+  ret i32 0
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+
+
diff --git a/test/CodeGen/Mips/beqzc1.ll b/test/CodeGen/Mips/beqzc1.ll
new file mode 100644
index 0000000..8f929a8
--- /dev/null
+++ b/test/CodeGen/Mips/beqzc1.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands   < %s | FileCheck %s -check-prefix=cond-b-short
+
+@i = global i32 0, align 4
+@j = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define i32 @main() #0 {
+entry:
+  %0 = load i32* @i, align 4
+  %cmp = icmp eq i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+; cond-b-short: 	bnez	${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}}  # 16 bit inst
+if.then:                                          ; preds = %entry
+  store i32 10, i32* @j, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret i32 0
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+
diff --git a/test/CodeGen/Mips/blockaddr.ll b/test/CodeGen/Mips/blockaddr.ll
index 7de7fa6..beab65f 100644
--- a/test/CodeGen/Mips/blockaddr.ll
+++ b/test/CodeGen/Mips/blockaddr.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n32 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N32
 ; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
 ; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -soft-float -mips16-hard-float -relocation-model=static   < %s | FileCheck %s -check-prefix=STATIC-MIPS16-1
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -soft-float -mips16-hard-float -relocation-model=static   < %s | FileCheck %s -check-prefix=STATIC-MIPS16-2
 
 @reg = common global i8* null, align 4
 
@@ -36,6 +38,14 @@ entry:
 ; STATIC-N64: daddiu ${{[0-9]+}}, $[[R2]], %got_ofst($tmp[[T2]])
 ; STATIC-N64: ld  $[[R3:[0-9]+]], %got_page($tmp[[T3:[0-9]+]])
 ; STATIC-N64: daddiu ${{[0-9]+}}, $[[R3]], %got_ofst($tmp[[T3]])
+; STATIC-MIPS16-1: .ent	f
+; STATIC-MIPS16-2: .ent	f
+; STATIC-MIPS16-1: li  $[[R1_16:[0-9]+]], %hi($tmp[[TI_16:[0-9]+]])
+; STATIC-MIPS16-1: sll ${{[0-9]+}},  $[[R1_16]], 16
+; STATIC-MIPS16-2: li  ${{[0-9]+}}, %lo($tmp{{[0-9]+}})
+; STATIC-MIPS16-1 jal	dummy
+; STATIC-MIPS16-2 jal	dummy
+
 define void @f() nounwind {
 entry:
   %call = tail call i8* @dummy(i8* blockaddress(@f, %baz))
diff --git a/test/CodeGen/Mips/brdelayslot.ll b/test/CodeGen/Mips/brdelayslot.ll
index 869ecd9..68341c1 100644
--- a/test/CodeGen/Mips/brdelayslot.ll
+++ b/test/CodeGen/Mips/brdelayslot.ll
@@ -160,7 +160,14 @@ for.end:                                          ; preds = %for.body, %entry
 ;
 ; SUCCBB-LABEL:      succbbs_br1:
 ; SUCCBB:      beqz ${{[0-9]+}}, $BB
-; SUCCBB-NEXT: lw $25, %call16(foo100)
+; SUCCBB-NEXT: lw ${{[0-9]+}}, %got(foo101)(${{[0-9]+}})
+
+define internal fastcc void @foo101() {
+entry:
+  tail call void @foo100()
+  tail call void @foo100()
+  ret void
+}
 
 define void @succbbs_br1(i32 %a) {
 entry:
@@ -168,7 +175,7 @@ entry:
   br i1 %tobool, label %if.end, label %if.then
 
 if.then:                                          ; preds = %entry
-  tail call void @foo100() #1
+  tail call fastcc void @foo101()
   br label %if.end
 
 if.end:                                           ; preds = %entry, %if.then
diff --git a/test/CodeGen/Mips/brsize3.ll b/test/CodeGen/Mips/brsize3.ll
new file mode 100644
index 0000000..7b1f440
--- /dev/null
+++ b/test/CodeGen/Mips/brsize3.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands   < %s | FileCheck %s -check-prefix=b-no-short
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands   < %s | FileCheck %s -check-prefix=b-long
+
+; ModuleID = 'brsize3.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+; Function Attrs: noreturn nounwind optsize
+define void @foo() #0 {
+entry:
+  br label %x
+
+x:                                                ; preds = %x, %entry
+  tail call void asm sideeffect ".space 60000", ""() #1, !srcloc !1
+  br label %x
+; b-long: $BB0_1:
+; b-long:	#APP
+; b-long:	.space 60000
+; b-long:	#NO_APP
+; b-long:	b	$BB0_1
+; b-no-short: $BB0_1:
+; b-no-short:	#APP
+; b-no-short:	.space 60000
+; b-no-short:	#NO_APP
+; b-no-short-NOT:	b	$BB0_1 # 16 bit inst
+
+}
+
+attributes #0 = { noreturn nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind }
+
+!1 = metadata !{i32 45}
diff --git a/test/CodeGen/Mips/brsize3a.ll b/test/CodeGen/Mips/brsize3a.ll
new file mode 100644
index 0000000..6382fa2
--- /dev/null
+++ b/test/CodeGen/Mips/brsize3a.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands   < %s | FileCheck %s -check-prefix=b-short
+
+; ModuleID = 'brsize3.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+; Function Attrs: noreturn nounwind optsize
+define void @foo() #0 {
+entry:
+  br label %x
+
+x:                                                ; preds = %x, %entry
+  tail call void asm sideeffect ".space 200", ""() #1, !srcloc !1
+  br label %x
+; b-short: $BB0_1:
+; b-short:	#APP
+; b-short:	.space 200
+; b-short:	#NO_APP
+; b-short:	b	$BB0_1 # 16 bit inst
+
+}
+
+attributes #0 = { noreturn nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind }
+
+!1 = metadata !{i32 45}
diff --git a/test/CodeGen/Mips/bswap.ll b/test/CodeGen/Mips/bswap.ll
index 0da2d2b..f17b91a 100644
--- a/test/CodeGen/Mips/bswap.ll
+++ b/test/CodeGen/Mips/bswap.ll
@@ -1,11 +1,13 @@
 ; RUN: llc  < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=MIPS32
 ; RUN: llc  < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=MIPS64
+; RUN: llc  < %s -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 -soft-float -mips16-hard-float   | FileCheck %s -check-prefix=mips16 
 
 define i32 @bswap32(i32 %x) nounwind readnone {
 entry:
 ; MIPS32-LABEL: bswap32:
 ; MIPS32: wsbh $[[R0:[0-9]+]]
 ; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
+; mips16: .ent bswap32
   %or.3 = call i32 @llvm.bswap.i32(i32 %x)
   ret i32 %or.3
 }
@@ -15,6 +17,7 @@ entry:
 ; MIPS64-LABEL: bswap64:
 ; MIPS64: dsbh $[[R0:[0-9]+]]
 ; MIPS64: dshd ${{[0-9]+}}, $[[R0]]
+; mips16: .ent bswap64
   %or.7 = call i64 @llvm.bswap.i64(i64 %x)
   ret i64 %or.7
 }
diff --git a/test/CodeGen/Mips/buildpairextractelementf64.ll b/test/CodeGen/Mips/buildpairextractelementf64.ll
index 585bc25..490d427 100644
--- a/test/CodeGen/Mips/buildpairextractelementf64.ll
+++ b/test/CodeGen/Mips/buildpairextractelementf64.ll
@@ -1,20 +1,31 @@
-; RUN: llc  < %s -march=mipsel | FileCheck %s
-; RUN: llc  < %s -march=mips   | FileCheck %s
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=FP32
+; RUN: llc -march=mips  < %s | FileCheck %s -check-prefix=FP32
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64
+; RUN: llc -march=mips -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64
+
 @a = external global i32
 
+; CHECK-LABEL: f:
+; FP32: mtc1
+; FP32: mtc1
+; FP64-DAG: mtc1
+; FP64-DAG: mthc1
+
 define double @f(i32 %a1, double %d) nounwind {
 entry:
-; CHECK: mtc1
-; CHECK: mtc1
   store i32 %a1, i32* @a, align 4
   %add = fadd double %d, 2.000000e+00
   ret double %add
 }
 
+; CHECK-LABEL: f3:
+; FP32: mfc1
+; FP32: mfc1
+; FP64-DAG: mfc1
+; FP64-DAG: mfhc1
+
 define void @f3(double %d, i32 %a1) nounwind {
 entry:
-; CHECK: mfc1
-; CHECK: mfc1
   tail call void @f2(i32 %a1, double %d) nounwind
   ret void
 }
diff --git a/test/CodeGen/Mips/cmplarge.ll b/test/CodeGen/Mips/cmplarge.ll
index b082fa3..2a3d30a 100644
--- a/test/CodeGen/Mips/cmplarge.ll
+++ b/test/CodeGen/Mips/cmplarge.ll
@@ -10,7 +10,7 @@ target triple = "mipsel--linux-gnu"
 define void @getSubImagesLuma(%struct.StorablePicture* nocapture %s) #0 {
 entry:
   %size_y = getelementptr inbounds %struct.StorablePicture* %s, i32 0, i32 1
-  %0 = load i32* %size_y, align 4, !tbaa !0
+  %0 = load i32* %size_y, align 4
   %sub = add nsw i32 %0, -1
   %add5 = add nsw i32 %0, 20
   %cmp6 = icmp sgt i32 %add5, -20
@@ -20,7 +20,7 @@ for.body:                                         ; preds = %entry, %for.body
   %j.07 = phi i32 [ %inc, %for.body ], [ -20, %entry ]
   %call = tail call i32 bitcast (i32 (...)* @iClip3 to i32 (i32, i32, i32)*)(i32 0, i32 %sub, i32 %j.07) #2
   %inc = add nsw i32 %j.07, 1
-  %1 = load i32* %size_y, align 4, !tbaa !0
+  %1 = load i32* %size_y, align 4
   %add = add nsw i32 %1, 20
   %cmp = icmp slt i32 %inc, %add
   br i1 %cmp, label %for.body, label %for.end
@@ -33,10 +33,6 @@ for.end:                                          ; preds = %for.body, %entry
 ; cmp16:	.end	getSubImagesLuma
 declare i32 @iClip3(...) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Mips/const1.ll b/test/CodeGen/Mips/const1.ll
new file mode 100644
index 0000000..cb2baca
--- /dev/null
+++ b/test/CodeGen/Mips/const1.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips16-constant-islands < %s | FileCheck %s 
+
+; ModuleID = 'const1.c'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mipsel-unknown-linux"
+
+@i = common global i32 0, align 4
+@j = common global i32 0, align 4
+@k = common global i32 0, align 4
+@l = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @t() #0 {
+entry:
+  store i32 -559023410, i32* @i, align 4
+  store i32 -559023410, i32* @j, align 4
+  store i32 -87105875, i32* @k, align 4
+  store i32 262991277, i32* @l, align 4
+  ret void
+; CHECK: 	lw	${{[0-9]+}}, $CPI0_0
+; CHECK:	lw	${{[0-9]+}}, $CPI0_1
+; CHECK: 	lw	${{[0-9]+}}, $CPI0_2
+; CHECK: $CPI0_0:
+; CHECK:	.4byte	3735943886
+; CHECK: $CPI0_1:
+; CHECK:	.4byte	4207861421
+; CHECK: $CPI0_2:
+; CHECK:	.4byte	262991277
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b754974ec32ab712ea7d8b52cd8037b24e7d6ed3) (gitosis@dmz-portal.mips.com:llvm.git 8e211187b501bc73edb938fde0019c9a20bcffd5)"}
diff --git a/test/CodeGen/Mips/const4a.ll b/test/CodeGen/Mips/const4a.ll
new file mode 100644
index 0000000..0332327
--- /dev/null
+++ b/test/CodeGen/Mips/const4a.ll
@@ -0,0 +1,180 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands -mips-constant-islands-no-load-relaxation  < %s | FileCheck %s -check-prefix=no-load-relax
+
+; ModuleID = 'const4.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+@i = common global i32 0, align 4
+@b = common global i32 0, align 4
+@j = common global i32 0, align 4
+@k = common global i32 0, align 4
+@l = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @t() #0 {
+entry:
+  store i32 -559023410, i32* @i, align 4
+  %0 = load i32* @b, align 4
+; no-load-relax	lw	${{[0-9]+}}, $CPI0_1	# 16 bit inst
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.else
+; no-load-relax:	beqz	${{[0-9]+}}, $BB0_3
+; no-load-relax:	lw	${{[0-9]+}}, %call16(foo)(${{[0-9]+}})
+; no-load-relax:	b	$BB0_4
+; no-load-relax:	.align	2
+; no-load-relax: $CPI0_0:
+; no-load-relax:	.4byte	3735943886
+; no-load-relax: $BB0_3:
+; no-load-relax:	lw	${{[0-9]+}}, %call16(goo)(${{[0-9]+}})
+if.then:                                          ; preds = %entry
+  call void bitcast (void (...)* @foo to void ()*)()
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  call void bitcast (void (...)* @goo to void ()*)()
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  ret void
+}
+
+declare void @foo(...) #1
+
+declare void @goo(...) #1
+
+declare void @hoo(...) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b310439121c875937d78cc49cc969bc1197fc025) (gitosis@dmz-portal.mips.com:llvm.git 7fc0ca9656ebec8dad61f72f5a5ddfb232c070fd)"}
diff --git a/test/CodeGen/Mips/const6.ll b/test/CodeGen/Mips/const6.ll
new file mode 100644
index 0000000..20cdc09
--- /dev/null
+++ b/test/CodeGen/Mips/const6.ll
@@ -0,0 +1,164 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands   < %s | FileCheck %s -check-prefix=load-relax
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands -mips-constant-islands-no-load-relaxation  < %s | FileCheck %s -check-prefix=no-load-relax
+
+; ModuleID = 'const6.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+@i = common global i32 0, align 4
+@j = common global i32 0, align 4
+@k = common global i32 0, align 4
+@l = common global i32 0, align 4
+@b = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @t() #0 {
+entry:
+  store i32 -559023410, i32* @i, align 4
+; load-relax: 	lw	${{[0-9]+}}, $CPI0_0
+; load-relax:	jrc	 $ra
+; load-relax:	.align	2
+; load-relax: $CPI0_0:
+; load-relax:	.4byte	3735943886
+; load-relax:	.end	t
+
+; no-load-relax: lw	${{[0-9]+}}, $CPI0_1	# 16 bit inst
+; no-load-relax:	jalrc 	${{[0-9]+}}
+; no-load-relax:	b	$BB0_2
+; no-load-relax:	.align	2
+; no-load-relax: $CPI0_0:
+; no-load-relax:	.4byte	3735943886
+; no-load-relax: $BB0_2:
+
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  call void bitcast (void (...)* @hoo to void ()*)()
+  ret void
+}
+
+declare void @hoo(...) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b310439121c875937d78cc49cc969bc1197fc025) (gitosis@dmz-portal.mips.com:llvm.git 7fc0ca9656ebec8dad61f72f5a5ddfb232c070fd)"}
+
+
diff --git a/test/CodeGen/Mips/const6a.ll b/test/CodeGen/Mips/const6a.ll
new file mode 100644
index 0000000..8b402ac
--- /dev/null
+++ b/test/CodeGen/Mips/const6a.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands   < %s | FileCheck %s -check-prefix=load-relax1
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands   < %s | FileCheck %s -check-prefix=load-relax
+
+; ModuleID = 'const6a.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+@i = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @t() #0 {
+entry:
+  store i32 -559023410, i32* @i, align 4
+; load-relax-NOT: 	lw	${{[0-9]+}}, $CPI0_0 # 16 bit inst
+; load-relax1: lw	${{[0-9]+}}, $CPI0_0
+; load-relax:	jrc	 $ra
+; load-relax:	.align	2
+; load-relax: $CPI0_0:
+; load-relax:	.4byte	3735943886
+; load-relax:	.end	t
+  call void asm sideeffect ".space 40000", ""() #1, !srcloc !1
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind }
+
+!1 = metadata !{i32 121}
diff --git a/test/CodeGen/Mips/ctlz.ll b/test/CodeGen/Mips/ctlz.ll
new file mode 100644
index 0000000..2ddb727
--- /dev/null
+++ b/test/CodeGen/Mips/ctlz.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=static
+
+@x = global i32 28912, align 4
+@y = common global i32 0, align 4
+
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  %0 = load i32* @x, align 4
+  %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 true)
+  store i32 %1, i32* @y, align 4
+  ret i32 0
+}
+
+; static: .end main
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ctlz.i32(i32, i1) #1
+
+
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind readnone }
+
diff --git a/test/CodeGen/Mips/disable-tail-merge.ll b/test/CodeGen/Mips/disable-tail-merge.ll
new file mode 100644
index 0000000..b4c093a
--- /dev/null
+++ b/test/CodeGen/Mips/disable-tail-merge.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+@g0 = common global i32 0, align 4
+@g1 = common global i32 0, align 4
+
+; CHECK: addiu ${{[0-9]+}}, ${{[0-9]+}}, 23
+; CHECK: addiu ${{[0-9]+}}, ${{[0-9]+}}, 23
+
+define i32 @test1(i32 %a) {
+entry:
+  %tobool = icmp eq i32 %a, 0
+  %0 = load i32* @g0, align 4
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* @g0, align 4
+  %1 = load i32* @g1, align 4
+  %add1 = add nsw i32 %1, 23
+  br label %if.end
+
+if.else:
+  %add2 = add nsw i32 %0, 11
+  store i32 %add2, i32* @g0, align 4
+  %2 = load i32* @g1, align 4
+  %add3 = add nsw i32 %2, 23
+  br label %if.end
+
+if.end:
+  %storemerge = phi i32 [ %add3, %if.else ], [ %add1, %if.then ]
+  store i32 %storemerge, i32* @g1, align 4
+  ret i32 %storemerge
+}
diff --git a/test/CodeGen/Mips/divrem.ll b/test/CodeGen/Mips/divrem.ll
index a983c46..b631c3b 100644
--- a/test/CodeGen/Mips/divrem.ll
+++ b/test/CodeGen/Mips/divrem.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=TRAP
+; RUN: llc -march=mips -verify-machineinstrs < %s |\
+; RUN: FileCheck %s -check-prefix=TRAP
 ; RUN: llc -march=mips -mno-check-zero-division < %s |\
 ; RUN: FileCheck %s -check-prefix=NOCHECK
 
@@ -11,6 +12,9 @@
 ; NOCHECK-NOT: teq
 ; NOCHECK: .end sdiv1
 
+@g0 = common global i32 0, align 4
+@g1 = common global i32 0, align 4
+
 define i32 @sdiv1(i32 %a0, i32 %a1) nounwind readnone {
 entry:
   %div = sdiv i32 %a0, %a1
@@ -67,3 +71,11 @@ entry:
   %div = udiv i32 %a0, %a1
   ret i32 %div
 }
+
+define i32 @killFlags() {
+entry:
+  %0 = load i32* @g0, align 4
+  %1 = load i32* @g1, align 4
+  %div = sdiv i32 %0, %1
+  ret i32 %div
+}
diff --git a/test/CodeGen/Mips/extins.ll b/test/CodeGen/Mips/extins.ll
index a164f70..efaeeea 100644
--- a/test/CodeGen/Mips/extins.ll
+++ b/test/CodeGen/Mips/extins.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s
+; RUN: llc  < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2
+; RUN: llc  < %s -march=mips -mcpu=mips16 | FileCheck %s -check-prefix=16
 
 define i32 @ext0_5_9(i32 %s, i32 %pos, i32 %sz) nounwind readnone {
 entry:
-; CHECK: ext ${{[0-9]+}}, $4, 5, 9
+; 32R2: ext ${{[0-9]+}}, $4, 5, 9
+; 16-NOT: ext ${{[0-9]+}}
   %shr = lshr i32 %s, 5
   %and = and i32 %shr, 511
   ret i32 %and
@@ -10,7 +12,8 @@ entry:
 
 define void @ins2_5_9(i32 %s, i32* nocapture %d) nounwind {
 entry:
-; CHECK: ins ${{[0-9]+}}, $4, 5, 9
+; 32R2: ins ${{[0-9]+}}, $4, 5, 9
+; 16-NOT: ins ${{[0-9]+}}
   %and = shl i32 %s, 5
   %shl = and i32 %and, 16352
   %tmp3 = load i32* %d, align 4
diff --git a/test/CodeGen/Mips/f16abs.ll b/test/CodeGen/Mips/f16abs.ll
new file mode 100644
index 0000000..928914f
--- /dev/null
+++ b/test/CodeGen/Mips/f16abs.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=static
+
+@y = global double -1.450000e+00, align 8
+@x = common global double 0.000000e+00, align 8
+
+@y1 = common global float 0.000000e+00, align 4
+@x1 = common global float 0.000000e+00, align 4
+
+
+
+; Function Attrs: nounwind optsize
+define i32 @main() #0 {
+entry:
+  %0 = load double* @y, align 8
+  %call = tail call double @fabs(double %0) #2
+  store double %call, double* @x, align 8
+; static-NOT: 	.ent	__call_stub_fp_fabs
+; static-NOT: 	jal fabs
+  %1 = load float* @y1, align 4
+  %call2 = tail call float @fabsf(float %1) #2
+  store float %call2, float* @x1, align 4
+; static-NOT: 	.ent	__call_stub_fp_fabsf
+; static-NOT: 	jal fabsf
+  ret i32 0
+}
+
+; Function Attrs: nounwind optsize readnone
+declare double @fabs(double) #1
+
+declare float @fabsf(float) #1
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind optsize readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #2 = { nounwind optsize readnone }
+
+
+
diff --git a/test/CodeGen/Mips/fixdfsf.ll b/test/CodeGen/Mips/fixdfsf.ll
new file mode 100644
index 0000000..b08eefd
--- /dev/null
+++ b/test/CodeGen/Mips/fixdfsf.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=pic1
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=pic2
+
+@x = common global double 0.000000e+00, align 8
+@y = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define void @foo()  {
+entry:
+  %0 = load double* @x, align 8
+  %conv = fptoui double %0 to i32
+  store i32 %conv, i32* @y, align 4
+; pic1:	lw	${{[0-9]+}}, %call16(__fixunsdfsi)(${{[0-9]+}})
+; pic2:	lw	${{[0-9]+}}, %got(__mips16_call_stub_2)(${{[0-9]+}})
+  ret void
+}
+
+
diff --git a/test/CodeGen/Mips/fp16instrinsmc.ll b/test/CodeGen/Mips/fp16instrinsmc.ll
index 3c01d56..bb43d27 100644
--- a/test/CodeGen/Mips/fp16instrinsmc.ll
+++ b/test/CodeGen/Mips/fp16instrinsmc.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=pic
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=1010111 -mips-os16 < %s | FileCheck %s -check-prefix=fmask 
 
 @x = global float 1.500000e+00, align 4
 @xn = global float -1.900000e+01, align 4
@@ -13,6 +14,14 @@
 
 ; Function Attrs: nounwind
 define void @foo1() #0 {
+; fmask: .ent foo1
+; fmask: .set	noreorder
+; fmask: .set	nomacro
+; fmask: .set	noat
+; fmask: .set	at
+; fmask: .set	macro
+; fmask: .set	reorder
+; fmask: .end	foo1
 entry:
   %0 = load float* @x, align 4
   %1 = load float* @one, align 4
@@ -26,6 +35,9 @@ declare float @copysignf(float, float) #1
 
 ; Function Attrs: nounwind
 define void @foo2() #0 {
+; fmask:	.ent	foo2
+; fmask:	save	{{.*}}
+; fmask:	.end	foo2
 entry:
   %0 = load float* @x, align 4
   %1 = load float* @negone, align 4
@@ -37,6 +49,14 @@ entry:
 ; Function Attrs: nounwind
 define void @foo3() #0 {
 entry:
+; fmask: .ent foo3
+; fmask: .set	noreorder
+; fmask: .set	nomacro
+; fmask: .set	noat
+; fmask: .set	at
+; fmask: .set	macro
+; fmask: .set	reorder
+; fmask: .end	foo3
   %0 = load double* @xd, align 8
   %1 = load float* @oned, align 4
   %conv = fpext float %1 to double
@@ -51,6 +71,9 @@ declare double @copysign(double, double) #1
 ; Function Attrs: nounwind
 define void @foo4() #0 {
 entry:
+; fmask:	.ent	foo4
+; fmask:	save	{{.*}}
+; fmask:	.end	foo4
   %0 = load double* @xd, align 8
   %1 = load double* @negoned, align 8
   %call = call double @copysign(double %0, double %1) #2
@@ -362,7 +385,7 @@ entry:
 ; Function Attrs: nounwind
 declare double @exp2(double) #0
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }
diff --git a/test/CodeGen/Mips/fp16mix.ll b/test/CodeGen/Mips/fp16mix.ll
new file mode 100644
index 0000000..8d85099
--- /dev/null
+++ b/test/CodeGen/Mips/fp16mix.ll
@@ -0,0 +1,92 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=10 -mips-os16 < %s | FileCheck %s -check-prefix=fmask1
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=01 -mips-os16 < %s | FileCheck %s -check-prefix=fmask2 
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=10. -mips-os16 < %s | FileCheck %s -check-prefix=fmask1nr
+
+; Function Attrs: nounwind optsize readnone
+define void @foo1()  {
+entry:
+  ret void
+; fmask1: .ent foo1
+; fmask1: .set	noreorder
+; fmask1: .set	nomacro
+; fmask1: .set	noat
+; fmask1: .set	at
+; fmask1: .set	macro
+; fmask1: .set	reorder
+; fmask1: .end	foo1
+; fmask2: .ent	foo1
+; fmask2: save	{{.*}}
+; fmask2: .end	foo1
+; fmask1nr: .ent foo1
+; fmask1nr: .set	noreorder
+; fmask1nr: .set	nomacro
+; fmask1nr: .set	noat
+; fmask1nr: .set	at
+; fmask1nr: .set	macro
+; fmask1nr: .set	reorder
+; fmask1nr: .end	foo1
+}
+
+; Function Attrs: nounwind optsize readnone
+define void @foo2()  {
+entry:
+  ret void
+; fmask2: .ent foo2
+; fmask2: .set	noreorder
+; fmask2: .set	nomacro
+; fmask2: .set	noat
+; fmask2: .set	at
+; fmask2: .set	macro
+; fmask2: .set	reorder
+; fmask2: .end	foo2
+; fmask1: .ent	foo2
+; fmask1: save	{{.*}}
+; fmask1: .end	foo2
+; fmask1nr: .ent	foo2
+; fmask1nr: save	{{.*}}
+; fmask1nr: .end	foo2
+}
+
+; Function Attrs: nounwind optsize readnone
+define void @foo3()  {
+entry:
+  ret void
+; fmask1: .ent foo3
+; fmask1: .set	noreorder
+; fmask1: .set	nomacro
+; fmask1: .set	noat
+; fmask1: .set	at
+; fmask1: .set	macro
+; fmask1: .set	reorder
+; fmask1: .end	foo3
+; fmask2:  .ent	foo3
+; fmask2:  save	{{.*}}
+; fmask2:  .end	foo3
+; fmask1r:  .ent	foo3
+; fmask1r:  save	{{.*}}
+; fmask1r:  .end	foo3
+}
+
+; Function Attrs: nounwind optsize readnone
+define void @foo4()  {
+entry:
+  ret void
+; fmask2: .ent foo4
+; fmask2: .set	noreorder
+; fmask2: .set	nomacro
+; fmask2: .set	noat
+; fmask2: .set	at
+; fmask2: .set	macro
+; fmask2: .set	reorder
+; fmask2: .end	foo4
+; fmask1: .ent	foo4
+; fmask1: save	{{.*}}
+; fmask1: .end	foo4
+; fmask1nr: .ent	foo4
+; fmask1nr: save	{{.*}}
+; fmask1nr: .end	foo4
+}
+
+
diff --git a/test/CodeGen/Mips/fpneeded.ll b/test/CodeGen/Mips/fpneeded.ll
index 623883a..dcdebb9 100644
--- a/test/CodeGen/Mips/fpneeded.ll
+++ b/test/CodeGen/Mips/fpneeded.ll
@@ -131,7 +131,7 @@ entry:
 ; 32:	.set	reorder
 ; 32:	.end	foo3
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 define void @vv() #0 {
 entry:
diff --git a/test/CodeGen/Mips/fpnotneeded.ll b/test/CodeGen/Mips/fpnotneeded.ll
index dc2ec10..b4fab64 100644
--- a/test/CodeGen/Mips/fpnotneeded.ll
+++ b/test/CodeGen/Mips/fpnotneeded.ll
@@ -57,7 +57,7 @@ entry:
 ; 32:	restore	{{.+}} 
 ; 32:	.end	foo
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 
 define float @fv() #0 {
diff --git a/test/CodeGen/Mips/fptr2.ll b/test/CodeGen/Mips/fptr2.ll
new file mode 100644
index 0000000..77028db
--- /dev/null
+++ b/test/CodeGen/Mips/fptr2.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static  < %s | FileCheck %s -check-prefix=static16
+
+; Function Attrs: nounwind
+define double @my_mul(double %a, double %b) #0 {
+entry:
+  %a.addr = alloca double, align 8
+  %b.addr = alloca double, align 8
+  store double %a, double* %a.addr, align 8
+  store double %b, double* %b.addr, align 8
+  %0 = load double* %a.addr, align 8
+  %1 = load double* %b.addr, align 8
+  %mul = fmul double %0, %1
+  ret double %mul
+}
+
+; static16: 	        .ent	__fn_stub_my_mul
+; static16:     	.set reorder
+; static16-NEXT:	#NO_APP
+; static16: 	        .end __fn_stub_my_mul
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
diff --git a/test/CodeGen/Mips/helloworld.ll b/test/CodeGen/Mips/helloworld.ll
index 83c88ae..058a041 100644
--- a/test/CodeGen/Mips/helloworld.ll
+++ b/test/CodeGen/Mips/helloworld.ll
@@ -1,11 +1,11 @@
-; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C1
-; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C2
-; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PE
-; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=static -O3 < %s | FileCheck %s -check-prefix=ST1
-; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=static -O3 < %s | FileCheck %s -check-prefix=ST2
+; RUN: llc  -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C1
+; RUN: llc  -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C2
+; RUN: llc  -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PE
+; RUN: llc  -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static -O3 < %s | FileCheck %s -check-prefix=ST1
+; RUN: llc  -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static -O3 < %s | FileCheck %s -check-prefix=ST2
 ;
-; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR
-; RUN: llc  -march=mipsel -mcpu=mips32  -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR32
+; RUN: llc  -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR
+; RUN: llc  -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32  -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR32
 
 
 @.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1
@@ -26,9 +26,11 @@ entry:
 ; SR32:  .set nomacro
 ; SR32:  .set noat
 ; SR:	save 	$ra, $s0, $s1, $s2, [[FS:[0-9]+]]
-; PE:	li	$[[T1:[0-9]+]], %hi(_gp_disp)
-; PE: 	addiu	$[[T2:[0-9]+]], $pc, %lo(_gp_disp)
-; PE:	sll	$[[T3:[0-9]+]], $[[T1]], 16
+; PE:    .ent main
+; PE:    .align  2
+; PE-NEXT:	li	$[[T1:[0-9]+]], %hi(_gp_disp)
+; PE-NEXT: 	addiu	$[[T2:[0-9]+]], $pc, %lo(_gp_disp)
+; PE:	        sll	$[[T3:[0-9]+]], $[[T1]], 16
 ; C1:	lw	${{[0-9]+}}, %got($.str)(${{[0-9]+}})
 ; C2:	lw	${{[0-9]+}}, %call16(printf)(${{[0-9]+}})
 ; C1:	addiu	${{[0-9]+}}, %lo($.str)
diff --git a/test/CodeGen/Mips/hf16call32.ll b/test/CodeGen/Mips/hf16call32.ll
index 934cf06..461438e 100644
--- a/test/CodeGen/Mips/hf16call32.ll
+++ b/test/CodeGen/Mips/hf16call32.ll
@@ -1026,5 +1026,5 @@ declare { double, double } @dc_sf(float) #1
 ; stel:	jr $18
 ; stel:	.end	__call_stub_fp_dc_sf
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/hf16call32_body.ll b/test/CodeGen/Mips/hf16call32_body.ll
index 793b771..34bae26 100644
--- a/test/CodeGen/Mips/hf16call32_body.ll
+++ b/test/CodeGen/Mips/hf16call32_body.ll
@@ -20,7 +20,7 @@ entry:
 }
 ; stel: .section	.mips16.fn.v_sf,"ax",@progbits
 ; stel:	.ent	__fn_stub_v_sf
-; stel:		la $25, v_sf
+; stel:		la $25,v_sf
 ; stel:		mfc1 $4,$f12
 ; stel:		jr $25
 ; stel:		__fn_local_v_sf = v_sf
@@ -40,7 +40,7 @@ entry:
 
 ; stel: .section	.mips16.fn.v_df,"ax",@progbits
 ; stel:	.ent	__fn_stub_v_df
-; stel:		la $25, v_df
+; stel:		la $25,v_df
 ; stel:		mfc1 $4,$f12
 ; stel:		mfc1 $5,$f13
 ; stel:		jr $25
@@ -63,7 +63,7 @@ entry:
 
 ; stel: .section	.mips16.fn.v_sf_sf,"ax",@progbits
 ; stel:	.ent	__fn_stub_v_sf_sf
-; stel:		la $25, v_sf_sf
+; stel:		la $25,v_sf_sf
 ; stel:		mfc1 $4,$f12
 ; stel:		mfc1 $5,$f14
 ; stel:		jr $25
@@ -86,7 +86,7 @@ entry:
 
 ; stel: .section	.mips16.fn.v_sf_df,"ax",@progbits
 ; stel:	.ent	__fn_stub_v_sf_df
-; stel:		la $25, v_sf_df
+; stel:		la $25,v_sf_df
 ; stel:		mfc1 $4,$f12
 ; stel:		mfc1 $6,$f14
 ; stel:		mfc1 $7,$f15
@@ -110,7 +110,7 @@ entry:
 
 ; stel: .section	.mips16.fn.v_df_sf,"ax",@progbits
 ; stel:	.ent	__fn_stub_v_df_sf
-; stel:		la $25, v_df_sf
+; stel:		la $25,v_df_sf
 ; stel:		mfc1 $4,$f12
 ; stel:		mfc1 $5,$f13
 ; stel:		mfc1 $6,$f14
@@ -134,7 +134,7 @@ entry:
 
 ; stel: .section	.mips16.fn.v_df_df,"ax",@progbits
 ; stel:	.ent	__fn_stub_v_df_df
-; stel:		la $25, v_df_df
+; stel:		la $25,v_df_df
 ; stel:		mfc1 $4,$f12
 ; stel:		mfc1 $5,$f13
 ; stel:		mfc1 $6,$f14
@@ -164,7 +164,7 @@ entry:
 
 ; stel: .section	.mips16.fn.sf_sf,"ax",@progbits
 ; stel:	.ent	__fn_stub_sf_sf
-; stel:		la $25, sf_sf
+; stel:		la $25,sf_sf
 ; stel:		mfc1 $4,$f12
 ; stel:		jr $25
 ; stel:		__fn_local_sf_sf = sf_sf
@@ -184,7 +184,7 @@ entry:
 
 ; stel: .section	.mips16.fn.sf_df,"ax",@progbits
 ; stel:	.ent	__fn_stub_sf_df
-; stel:		la $25, sf_df
+; stel:		la $25,sf_df
 ; stel:		mfc1 $4,$f12
 ; stel:		mfc1 $5,$f13
 ; stel:		jr $25
@@ -208,7 +208,7 @@ entry:
 
 ; stel: .section	.mips16.fn.sf_sf_sf,"ax",@progbits
 ; stel:	.ent	__fn_stub_sf_sf_sf
-; stel:		la $25, sf_sf_sf
+; stel:		la $25,sf_sf_sf
 ; stel:		mfc1 $4,$f12
 ; stel:		mfc1 $5,$f14
 ; stel:		jr $25
@@ -232,7 +232,7 @@ entry:
 
 ; stel: .section	.mips16.fn.sf_sf_df,"ax",@progbits
 ; stel:	.ent	__fn_stub_sf_sf_df
-; stel:		la $25, sf_sf_df
+; stel:		la $25,sf_sf_df
 ; stel:		mfc1 $4,$f12
 ; stel:		mfc1 $6,$f14
 ; stel:		mfc1 $7,$f15
@@ -257,7 +257,7 @@ entry:
 
 ; stel: .section	.mips16.fn.sf_df_sf,"ax",@progbits
 ; stel:	.ent	__fn_stub_sf_df_sf
-; stel:		la $25, sf_df_sf
+; stel:		la $25,sf_df_sf
 ; stel:		mfc1 $4,$f12
 ; stel:		mfc1 $5,$f13
 ; stel:		mfc1 $6,$f14
@@ -282,7 +282,7 @@ entry:
 
 ; stel: .section	.mips16.fn.sf_df_df,"ax",@progbits
 ; stel:	.ent	__fn_stub_sf_df_df
-; stel:		la $25, sf_df_df
+; stel:		la $25,sf_df_df
 ; stel:		mfc1 $4,$f12
 ; stel:		mfc1 $5,$f13
 ; stel:		mfc1 $6,$f14
@@ -291,4 +291,4 @@ entry:
 ; stel:		__fn_local_sf_df_df = sf_df_df
 ; stel:	.end	__fn_stub_sf_df_df
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/hf1_body.ll b/test/CodeGen/Mips/hf1_body.ll
new file mode 100644
index 0000000..b2cce92
--- /dev/null
+++ b/test/CodeGen/Mips/hf1_body.ll
@@ -0,0 +1,21 @@
+; RUN: llc  -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16  -relocation-model=pic -soft-float -mips16-hard-float < %s | FileCheck %s -check-prefix=picfp16
+
+@x = external global float
+
+; Function Attrs: nounwind
+define void @v_sf(float %p) #0 {
+entry:
+  %p.addr = alloca float, align 4
+  store float %p, float* %p.addr, align 4
+  %0 = load float* %p.addr, align 4
+  store float %0, float* @x, align 4
+  ret void
+}
+; picfp16:	.ent	__fn_stub_v_sf
+; picfp16:	.cpload  $25
+; picfp16:	.set reorder
+; picfp16:	.reloc 0,R_MIPS_NONE,v_sf
+; picfp16: 	la $25,$__fn_local_v_sf
+; picfp16: 	mfc1 $4,$f12
+; picfp16: 	jr $25
+; picfp16: 	.end	__fn_stub_v_sf
diff --git a/test/CodeGen/Mips/hfptrcall.ll b/test/CodeGen/Mips/hfptrcall.ll
index b1d36c0..25639da 100644
--- a/test/CodeGen/Mips/hfptrcall.ll
+++ b/test/CodeGen/Mips/hfptrcall.ll
@@ -118,8 +118,8 @@ entry:
 
 declare i32 @printf(i8*, ...) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
 
 
 
diff --git a/test/CodeGen/Mips/i32k.ll b/test/CodeGen/Mips/i32k.ll
index c6da8b1..f4dd1eb 100644
--- a/test/CodeGen/Mips/i32k.ll
+++ b/test/CodeGen/Mips/i32k.ll
@@ -1,16 +1,23 @@
-; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16a
-; RUN: llc  -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16b
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
 
 @.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1
 
 define i32 @main() nounwind {
 entry:
   %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 1075344593) nounwind
-; 16a:	li	${{[0-9]+}}, 29905
-; 16b:	li	${{[0-9]+}}, 16408
+; 16:	lw	${{[0-9]+}}, 1f
+; 16:	b	2f
+; 16:	.align	2
+; 16: 1: 	.word	1075344593
+; 16: 2:
+
   %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 -1075344593) nounwind
-; 16a:	li	${{[0-9]+}}, 49127
-; 16b:	li	${{[0-9]+}}, 35631
+
+; 16:	lw	${{[0-9]+}}, 1f
+; 16:	b	2f
+; 16:	.align	2
+; 16: 1: 	.word	-1075344593
+; 16: 2:
   ret i32 0
 }
 
diff --git a/test/CodeGen/Mips/i64arg.ll b/test/CodeGen/Mips/i64arg.ll
index 0b16424..5b2d135 100644
--- a/test/CodeGen/Mips/i64arg.ll
+++ b/test/CodeGen/Mips/i64arg.ll
@@ -2,18 +2,18 @@
 
 define void @f1(i64 %ll1, float %f, i64 %ll, i32 %i, float %f2) nounwind {
 entry:
-; CHECK: move $[[R1:[0-9]+]], $5
-; CHECK: move $[[R0:[0-9]+]], $4
-; CHECK: ori $6, ${{[0-9]+}}, 3855
-; CHECK: ori $7, ${{[0-9]+}}, 22136
-; CHECK: lw  $25, %call16(ff1)
+; CHECK-DAG: lw $[[R2:[0-9]+]], 80($sp)
+; CHECK-DAG: lw $[[R3:[0-9]+]], 84($sp)
+; CHECK-DAG: move $[[R1:[0-9]+]], $5
+; CHECK-DAG: move $[[R0:[0-9]+]], $4
+; CHECK-DAG: ori $6, ${{[0-9]+}}, 3855
+; CHECK-DAG: ori $7, ${{[0-9]+}}, 22136
+; CHECK-DAG: lw  $25, %call16(ff1)
 ; CHECK: jalr
   tail call void @ff1(i32 %i, i64 1085102592623924856) nounwind
-; CHECK: lw $25, %call16(ff2)
-; CHECK: lw $[[R2:[0-9]+]], 80($sp)
-; CHECK: lw $[[R3:[0-9]+]], 84($sp)
-; CHECK: move $4, $[[R2]]
-; CHECK: move $5, $[[R3]]
+; CHECK-DAG: lw $25, %call16(ff2)
+; CHECK-DAG: move $4, $[[R2]]
+; CHECK-DAG: move $5, $[[R3]]
 ; CHECK: jalr $25
   tail call void @ff2(i64 %ll, double 3.000000e+00) nounwind
   %sub = add nsw i32 %i, -1
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
index 1e96346..09fee3d 100644
--- a/test/CodeGen/Mips/largeimmprinting.ll
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -18,11 +18,11 @@ entry:
 ; 64:  dsll  $[[R0]], $[[R0]], 48
 ; 64:  daddiu  $[[R0]], $[[R0]], -1
 ; 64:  dsll  $[[R0]], $[[R0]], 16
-; 64:  daddiu  $[[R0]], $[[R0]], -48
+; 64:  daddiu  $[[R0]], $[[R0]], -32
 ; 64:  daddu $sp, $sp, $[[R0]]
 ; 64:  lui $[[R1:[0-9]+]], 1
 ; 64:  daddu $[[R1]], $sp, $[[R1]]
-; 64:  sd  $ra, 40($[[R1]])
+; 64:  sd  $ra, 24($[[R1]])
 
   %agg.tmp = alloca %struct.S1, align 1
   %tmp = getelementptr inbounds %struct.S1* %agg.tmp, i32 0, i32 0, i32 0
diff --git a/test/CodeGen/Mips/lazy-binding.ll b/test/CodeGen/Mips/lazy-binding.ll
new file mode 100644
index 0000000..839155a
--- /dev/null
+++ b/test/CodeGen/Mips/lazy-binding.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+; CHECK-LABEL: foo6:
+; CHECK: %while.body
+; CHECK: lw  $25, %call16(foo2)(${{[0-9]+}})
+; CHECK: jalr $25
+; CHECK: %while.end
+
+define void @foo6(i32 %n) {
+entry:
+  %tobool1 = icmp eq i32 %n, 0
+  br i1 %tobool1, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %n.addr.02 = phi i32 [ %dec, %while.body ], [ %n, %entry ]
+  %dec = add nsw i32 %n.addr.02, -1
+  tail call void @foo2()
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+declare void @foo2()
+
+; CHECK-LABEL: foo1:
+; CHECK: lw $25, %call16(foo2)(${{[0-9]+}})
+; CHECK: jalr $25
+; CHECK: lw $25, %call16(foo2)(${{[0-9]+}})
+; CHECK: jalr $25
+; CHECK: lw $25, %call16(foo2)(${{[0-9]+}})
+; CHECK: jalr $25
+
+define void @foo1() {
+entry:
+  tail call void @foo2()
+  tail call void @foo2()
+  tail call void @foo2()
+  ret void
+}
diff --git a/test/CodeGen/Mips/lit.local.cfg b/test/CodeGen/Mips/lit.local.cfg
index e157c54..1fa54b4 100644
--- a/test/CodeGen/Mips/lit.local.cfg
+++ b/test/CodeGen/Mips/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
 targets = set(config.root.targets_to_build.split())
 if not 'Mips' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index 1a4f79c..af192d0 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -1,13 +1,17 @@
-; RUN: llc -march=mipsel -force-mips-long-branch < %s | FileCheck %s -check-prefix=O32
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64  -force-mips-long-branch < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mipsel -force-mips-long-branch -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=O32
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64  -force-mips-long-branch -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=N64
 
 @g0 = external global i32
 
 define void @foo1(i32 %s) nounwind {
 entry:
+; O32: nop
+; O32: addiu $sp, $sp, -8
 ; O32: bal
 ; O32: lui $1, 0
 ; O32: addiu $1, $1, {{[0-9]+}} 
+; N64: nop
+; N64: daddiu $sp, $sp, -16
 ; N64: lui $1, 0
 ; N64: daddiu $1, $1, 0
 ; N64: dsll $1, $1, 16
diff --git a/test/CodeGen/Mips/mips16_32_1.ll b/test/CodeGen/Mips/mips16_32_1.ll
index 6f4826e..e156641 100644
--- a/test/CodeGen/Mips/mips16_32_1.ll
+++ b/test/CodeGen/Mips/mips16_32_1.ll
@@ -11,4 +11,4 @@ entry:
 ; CHECK:	save	{{.+}}
 ; CHECK:	restore	{{.+}} 
 ; CHECK:	.end	foo
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_10.ll b/test/CodeGen/Mips/mips16_32_10.ll
index 330dbfe..7c017b8 100644
--- a/test/CodeGen/Mips/mips16_32_10.ll
+++ b/test/CodeGen/Mips/mips16_32_10.ll
@@ -54,6 +54,6 @@ entry:
 
 
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false"  "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false"  "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_3.ll b/test/CodeGen/Mips/mips16_32_3.ll
index 8874a88..dd94ec1 100644
--- a/test/CodeGen/Mips/mips16_32_3.ll
+++ b/test/CodeGen/Mips/mips16_32_3.ll
@@ -65,6 +65,6 @@ entry:
 ; 32:	.set	reorder
 ; 32:	.end	main
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_4.ll b/test/CodeGen/Mips/mips16_32_4.ll
index cdaed6c..5e49071 100644
--- a/test/CodeGen/Mips/mips16_32_4.ll
+++ b/test/CodeGen/Mips/mips16_32_4.ll
@@ -60,6 +60,6 @@ entry:
 ; 32:	.end	main
 
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_5.ll b/test/CodeGen/Mips/mips16_32_5.ll
index 45e0bf4..17900a2 100644
--- a/test/CodeGen/Mips/mips16_32_5.ll
+++ b/test/CodeGen/Mips/mips16_32_5.ll
@@ -75,6 +75,6 @@ entry:
 
 
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_6.ll b/test/CodeGen/Mips/mips16_32_6.ll
index f4b8e7a..a77031a 100644
--- a/test/CodeGen/Mips/mips16_32_6.ll
+++ b/test/CodeGen/Mips/mips16_32_6.ll
@@ -81,6 +81,6 @@ entry:
 
 
 
-attributes #0 = { nounwind "less-precise-fpmad"="false"  "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false"  "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_7.ll b/test/CodeGen/Mips/mips16_32_7.ll
index f8726ea..895b5d4 100644
--- a/test/CodeGen/Mips/mips16_32_7.ll
+++ b/test/CodeGen/Mips/mips16_32_7.ll
@@ -71,6 +71,6 @@ entry:
 
 
 
-attributes #0 = { nounwind "less-precise-fpmad"="false"  "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false"  "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_8.ll b/test/CodeGen/Mips/mips16_32_8.ll
index e51f296..4152d68 100644
--- a/test/CodeGen/Mips/mips16_32_8.ll
+++ b/test/CodeGen/Mips/mips16_32_8.ll
@@ -68,7 +68,7 @@ entry:
 ; 32:	.set	reorder
 ; 32:	.end	main
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_9.ll b/test/CodeGen/Mips/mips16_32_9.ll
index f5ff368..c9b494f 100644
--- a/test/CodeGen/Mips/mips16_32_9.ll
+++ b/test/CodeGen/Mips/mips16_32_9.ll
@@ -46,6 +46,6 @@ entry:
 
 
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false"  "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false"  "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips64instrs.ll b/test/CodeGen/Mips/mips64instrs.ll
index 7b06c2d..2894d69 100644
--- a/test/CodeGen/Mips/mips64instrs.ll
+++ b/test/CodeGen/Mips/mips64instrs.ll
@@ -1,4 +1,7 @@
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64 -verify-machineinstrs < %s | FileCheck %s
+
+@gll0 = common global i64 0, align 8
+@gll1 = common global i64 0, align 8
 
 define i64 @f0(i64 %a0, i64 %a1) nounwind readnone {
 entry:
@@ -90,17 +93,21 @@ entry:
 ; CHECK: ddiv $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
 ; CHECK: teq $[[R0]], $zero, 7
 ; CHECK: mflo
-  %div = sdiv i64 %a, %b
+  %0 = load i64* @gll0, align 8
+  %1 = load i64* @gll1, align 8
+  %div = sdiv i64 %0, %1
   ret i64 %div
 }
 
-define i64 @f15(i64 %a, i64 %b) nounwind readnone {
+define i64 @f15() nounwind readnone {
 entry:
 ; CHECK-LABEL: f15:
 ; CHECK: ddivu $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
 ; CHECK: teq $[[R0]], $zero, 7
 ; CHECK: mflo
-  %div = udiv i64 %a, %b
+  %0 = load i64* @gll0, align 8
+  %1 = load i64* @gll1, align 8
+  %div = udiv i64 %0, %1
   ret i64 %div
 }
 
@@ -148,4 +155,3 @@ entry:
   %neg = xor i64 %or, -1
   ret i64 %neg
 }
-
diff --git a/test/CodeGen/Mips/mno-ldc1-sdc1.ll b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
index be9d0b6..f4854f8 100644
--- a/test/CodeGen/Mips/mno-ldc1-sdc1.ll
+++ b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
@@ -1,22 +1,31 @@
-; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 < %s | \
-; RUN: FileCheck %s -check-prefix=LE-PIC
+; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 -mcpu=mips32r2 \
+; RUN: < %s | FileCheck %s -check-prefix=LE-PIC
 ; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 < %s | \
 ; RUN: FileCheck %s -check-prefix=LE-STATIC
 ; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 < %s | \
 ; RUN: FileCheck %s -check-prefix=BE-PIC
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=CHECK-LDC1-SDC1
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-LDC1-SDC1
 
 @g0 = common global double 0.000000e+00, align 8
 
 ; LE-PIC-LABEL: test_ldc1:
-; LE-PIC: lwc1 $f0, 0(${{[0-9]+}})
-; LE-PIC: lwc1 $f1, 4(${{[0-9]+}})
+; LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; LE-PIC-DAG: mtc1 $[[R0]], $f0
+; LE-PIC-DAG: mtc1 $[[R1]], $f1
 ; LE-STATIC-LABEL: test_ldc1:
-; LE-STATIC: lwc1 $f0, %lo(g0)(${{[0-9]+}})
-; LE-STATIC: lwc1 $f1, %lo(g0+4)(${{[0-9]+}})
+; LE-STATIC-DAG: lui $[[R0:[0-9]+]], %hi(g0)
+; LE-STATIC-DAG: lw $[[R1:[0-9]+]], %lo(g0)($[[R0]])
+; LE-STATIC-DAG: addiu $[[R2:[0-9]+]], $[[R0]], %lo(g0)
+; LE-STATIC-DAG: lw $[[R3:[0-9]+]], 4($[[R2]])
+; LE-STATIC-DAG: mtc1 $[[R1]], $f0
+; LE-STATIC-DAG: mtc1 $[[R3]], $f1
 ; BE-PIC-LABEL: test_ldc1:
-; BE-PIC: lwc1 $f1, 0(${{[0-9]+}})
-; BE-PIC: lwc1 $f0, 4(${{[0-9]+}})
+; BE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; BE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; BE-PIC-DAG: mtc1 $[[R1]], $f0
+; BE-PIC-DAG: mtc1 $[[R0]], $f1
 ; CHECK-LDC1-SDC1-LABEL: test_ldc1:
 ; CHECK-LDC1-SDC1: ldc1 $f{{[0-9]+}}
 
@@ -27,14 +36,22 @@ entry:
 }
 
 ; LE-PIC-LABEL: test_sdc1:
-; LE-PIC: swc1 $f12, 0(${{[0-9]+}})
-; LE-PIC: swc1 $f13, 4(${{[0-9]+}})
+; LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
 ; LE-STATIC-LABEL: test_sdc1:
-; LE-STATIC: swc1 $f12, %lo(g0)(${{[0-9]+}})
-; LE-STATIC: swc1 $f13, %lo(g0+4)(${{[0-9]+}})
+; LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; LE-STATIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
+; LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
+; LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
+; LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
 ; BE-PIC-LABEL: test_sdc1:
-; BE-PIC: swc1 $f13, 0(${{[0-9]+}})
-; BE-PIC: swc1 $f12, 4(${{[0-9]+}})
+; BE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; BE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; BE-PIC-DAG: sw $[[R1]], 0(${{[0-9]+}})
+; BE-PIC-DAG: sw $[[R0]], 4(${{[0-9]+}})
 ; CHECK-LDC1-SDC1-LABEL: test_sdc1:
 ; CHECK-LDC1-SDC1: sdc1 $f{{[0-9]+}}
 
@@ -43,3 +60,34 @@ entry:
   store double %a, double* @g0, align 8
   ret void
 }
+
+
+; LE-PIC-LABEL: test_ldxc1:
+; LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; LE-PIC-DAG: mtc1 $[[R0]], $f0
+; LE-PIC-DAG: mtc1 $[[R1]], $f1
+; CHECK-LDC1-SDC1-LABEL: test_ldxc1:
+; CHECK-LDC1-SDC1: ldxc1 $f{{[0-9]+}}
+
+define double @test_ldxc1(double* nocapture readonly %a, i32 %i) {
+entry:
+  %arrayidx = getelementptr inbounds double* %a, i32 %i
+  %0 = load double* %arrayidx, align 8
+  ret double %0
+}
+
+; LE-PIC-LABEL: test_sdxc1:
+; LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
+; CHECK-LDC1-SDC1-LABEL: test_sdxc1:
+; CHECK-LDC1-SDC1: sdxc1 $f{{[0-9]+}}
+
+define void @test_sdxc1(double %b, double* nocapture %a, i32 %i) {
+entry:
+  %arrayidx = getelementptr inbounds double* %a, i32 %i
+  store double %b, double* %arrayidx, align 8
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/2r.ll b/test/CodeGen/Mips/msa/2r.ll
new file mode 100644
index 0000000..da35ad8
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2r.ll
@@ -0,0 +1,257 @@
+; Test the MSA intrinsics that are encoded with the 2R instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_nloc_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_nloc_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_nloc_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_nloc_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.nloc.b(<16 x i8> %0)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_nloc_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.nloc.b(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_nloc_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nloc_b_ARG1)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nloc.b [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nloc_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nloc_b_test
+;
+@llvm_mips_nloc_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_nloc_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_nloc_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_nloc_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.nloc.h(<8 x i16> %0)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_nloc_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.nloc.h(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_nloc_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nloc_h_ARG1)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nloc.h [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nloc_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nloc_h_test
+;
+@llvm_mips_nloc_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_nloc_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_nloc_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_nloc_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.nloc.w(<4 x i32> %0)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_nloc_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.nloc.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_nloc_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nloc_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nloc.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nloc_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nloc_w_test
+;
+@llvm_mips_nloc_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_nloc_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_nloc_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_nloc_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.nloc.d(<2 x i64> %0)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_nloc_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.nloc.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_nloc_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nloc_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nloc.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nloc_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nloc_d_test
+;
+@llvm_mips_nlzc_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_nlzc_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_nlzc_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_nlzc_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.nlzc.b(<16 x i8> %0)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_nlzc_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.nlzc.b(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_nlzc_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nlzc_b_ARG1)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nlzc.b [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nlzc_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nlzc_b_test
+;
+@llvm_mips_nlzc_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_nlzc_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_nlzc_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_nlzc_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.nlzc.h(<8 x i16> %0)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_nlzc_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.nlzc.h(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_nlzc_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nlzc_h_ARG1)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nlzc.h [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nlzc_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nlzc_h_test
+;
+@llvm_mips_nlzc_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_nlzc_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_nlzc_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_nlzc_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.nlzc.w(<4 x i32> %0)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_nlzc_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.nlzc.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_nlzc_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nlzc_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nlzc.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nlzc_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nlzc_w_test
+;
+@llvm_mips_nlzc_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_nlzc_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_nlzc_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_nlzc_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.nlzc.d(<2 x i64> %0)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_nlzc_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.nlzc.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_nlzc_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nlzc_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nlzc.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nlzc_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nlzc_d_test
+;
+@llvm_mips_pcnt_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_pcnt_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_pcnt_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_pcnt_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.pcnt.b(<16 x i8> %0)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_pcnt_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.pcnt.b(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_pcnt_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_pcnt_b_ARG1)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: pcnt.b [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_pcnt_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_pcnt_b_test
+;
+@llvm_mips_pcnt_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_pcnt_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_pcnt_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_pcnt_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.pcnt.h(<8 x i16> %0)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_pcnt_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.pcnt.h(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_pcnt_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_pcnt_h_ARG1)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: pcnt.h [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_pcnt_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_pcnt_h_test
+;
+@llvm_mips_pcnt_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_pcnt_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_pcnt_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_pcnt_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.pcnt.w(<4 x i32> %0)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_pcnt_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.pcnt.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_pcnt_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_pcnt_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: pcnt.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_pcnt_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_pcnt_w_test
+;
+@llvm_mips_pcnt_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_pcnt_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_pcnt_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_pcnt_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.pcnt.d(<2 x i64> %0)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_pcnt_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.pcnt.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_pcnt_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_pcnt_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: pcnt.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_pcnt_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_pcnt_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2r_vector_scalar.ll b/test/CodeGen/Mips/msa/2r_vector_scalar.ll
new file mode 100644
index 0000000..6f6e1b9
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2r_vector_scalar.ll
@@ -0,0 +1,87 @@
+; Test the MSA intrinsics that are encoded with the 2R instruction format and
+; convert scalars to vectors.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fill_b_ARG1 = global i32 23, align 16
+@llvm_mips_fill_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_fill_b_test() nounwind {
+entry:
+  %0 = load i32* @llvm_mips_fill_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.fill.b(i32 %0)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_fill_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.fill.b(i32) nounwind
+
+; CHECK: llvm_mips_fill_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]],
+; CHECK-DAG: fill.b [[R2:\$w[0-9]+]], [[R1]]
+; CHECK-DAG: st.b [[R2]],
+; CHECK: .size llvm_mips_fill_b_test
+;
+@llvm_mips_fill_h_ARG1 = global i32 23, align 16
+@llvm_mips_fill_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_fill_h_test() nounwind {
+entry:
+  %0 = load i32* @llvm_mips_fill_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.fill.h(i32 %0)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_fill_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.fill.h(i32) nounwind
+
+; CHECK: llvm_mips_fill_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]],
+; CHECK-DAG: fill.h [[R2:\$w[0-9]+]], [[R1]]
+; CHECK-DAG: st.h [[R2]],
+; CHECK: .size llvm_mips_fill_h_test
+;
+@llvm_mips_fill_w_ARG1 = global i32 23, align 16
+@llvm_mips_fill_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fill_w_test() nounwind {
+entry:
+  %0 = load i32* @llvm_mips_fill_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.fill.w(i32 %0)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_fill_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fill.w(i32) nounwind
+
+; CHECK: llvm_mips_fill_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]],
+; CHECK-DAG: fill.w [[R2:\$w[0-9]+]], [[R1]]
+; CHECK-DAG: st.w [[R2]],
+; CHECK: .size llvm_mips_fill_w_test
+;
+@llvm_mips_fill_d_ARG1 = global i64 23, align 16
+@llvm_mips_fill_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fill_d_test() nounwind {
+entry:
+  %0 = load i64* @llvm_mips_fill_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.fill.d(i64 %0)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_fill_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fill.d(i64) nounwind
+
+; CHECK: llvm_mips_fill_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], 4(
+; CHECK-DAG: ldi.b [[R3:\$w[0-9]+]], 0
+; CHECK-DAG: insert.w [[R3]][0], [[R1]]
+; CHECK-DAG: insert.w [[R3]][1], [[R2]]
+; CHECK-DAG: insert.w [[R3]][2], [[R1]]
+; CHECK-DAG: insert.w [[R3]][3], [[R2]]
+; CHECK-DAG: st.w [[R3]],
+; CHECK: .size llvm_mips_fill_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf.ll b/test/CodeGen/Mips/msa/2rf.ll
new file mode 100644
index 0000000..b361ef5
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf.ll
@@ -0,0 +1,323 @@
+; Test the MSA intrinsics that are encoded with the 2RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_flog2_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_flog2_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_flog2_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_flog2_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.flog2.w(<4 x float> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_flog2_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.flog2.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_flog2_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_flog2_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: flog2.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_flog2_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_flog2_w_test
+;
+@llvm_mips_flog2_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_flog2_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_flog2_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_flog2_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.flog2.d(<2 x double> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_flog2_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.flog2.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_flog2_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_flog2_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: flog2.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_flog2_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_flog2_d_test
+
+define void @flog2_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_flog2_w_ARG1
+  %1 = tail call <4 x float> @llvm.log2.v4f32(<4 x float> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_flog2_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.log2.v4f32(<4 x float> %val)
+
+; CHECK: flog2_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_flog2_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: flog2.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_flog2_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size flog2_w_test
+
+define void @flog2_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_flog2_d_ARG1
+  %1 = tail call <2 x double> @llvm.log2.v2f64(<2 x double> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_flog2_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.log2.v2f64(<2 x double> %val)
+
+; CHECK: flog2_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_flog2_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: flog2.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_flog2_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size flog2_d_test
+;
+@llvm_mips_frint_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_frint_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_frint_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_frint_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.frint.w(<4 x float> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_frint_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.frint.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_frint_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frint_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frint.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frint_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frint_w_test
+;
+@llvm_mips_frint_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_frint_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_frint_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_frint_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.frint.d(<2 x double> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_frint_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.frint.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_frint_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frint_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frint.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frint_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frint_d_test
+
+define void @frint_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_frint_w_ARG1
+  %1 = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_frint_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.rint.v4f32(<4 x float>) nounwind
+
+; CHECK: frint_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frint_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frint.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frint_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size frint_w_test
+
+define void @frint_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_frint_d_ARG1
+  %1 = tail call <2 x double> @llvm.rint.v2f64(<2 x double> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_frint_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.rint.v2f64(<2 x double>) nounwind
+
+; CHECK: frint_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frint_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frint.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frint_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size frint_d_test
+;
+@llvm_mips_frcp_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_frcp_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_frcp_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_frcp_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.frcp.w(<4 x float> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_frcp_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.frcp.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_frcp_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frcp_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frcp.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frcp_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frcp_w_test
+;
+@llvm_mips_frcp_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_frcp_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_frcp_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_frcp_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.frcp.d(<2 x double> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_frcp_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.frcp.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_frcp_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frcp_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frcp.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frcp_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frcp_d_test
+;
+@llvm_mips_frsqrt_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_frsqrt_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_frsqrt_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_frsqrt_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.frsqrt.w(<4 x float> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_frsqrt_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.frsqrt.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_frsqrt_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frsqrt_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frsqrt.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frsqrt_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frsqrt_w_test
+;
+@llvm_mips_frsqrt_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_frsqrt_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_frsqrt_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_frsqrt_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.frsqrt.d(<2 x double> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_frsqrt_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.frsqrt.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_frsqrt_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frsqrt_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frsqrt.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frsqrt_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frsqrt_d_test
+;
+@llvm_mips_fsqrt_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsqrt_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fsqrt_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsqrt_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.fsqrt.w(<4 x float> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_fsqrt_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fsqrt.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_fsqrt_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fsqrt_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fsqrt.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fsqrt_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_fsqrt_w_test
+;
+@llvm_mips_fsqrt_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsqrt_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fsqrt_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsqrt_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.fsqrt.d(<2 x double> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_fsqrt_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fsqrt.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_fsqrt_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fsqrt_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fsqrt.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fsqrt_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_fsqrt_d_test
+
+define void @fsqrt_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsqrt_w_ARG1
+  %1 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_fsqrt_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind
+
+; CHECK: fsqrt_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fsqrt_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fsqrt.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fsqrt_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size fsqrt_w_test
+
+define void @fsqrt_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsqrt_d_ARG1
+  %1 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_fsqrt_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) nounwind
+
+; CHECK: fsqrt_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fsqrt_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fsqrt.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fsqrt_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size fsqrt_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf_exup.ll b/test/CodeGen/Mips/msa/2rf_exup.ll
new file mode 100644
index 0000000..8d7cc36
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf_exup.ll
@@ -0,0 +1,82 @@
+; Test the MSA floating point conversion intrinsics (e.g. float->double) that
+; are encoded with the 2RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fexupl_w_ARG1 = global <8 x half> <half 0.000000e+00, half 1.000000e+00, half 2.000000e+00, half 3.000000e+00, half 4.000000e+00, half 5.000000e+00, half 6.000000e+00, half 7.000000e+00>, align 16
+@llvm_mips_fexupl_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fexupl_w_test() nounwind {
+entry:
+  %0 = load <8 x half>* @llvm_mips_fexupl_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.fexupl.w(<8 x half> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_fexupl_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fexupl.w(<8 x half>) nounwind
+
+; CHECK: llvm_mips_fexupl_w_test:
+; CHECK: ld.h
+; CHECK: fexupl.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fexupl_w_test
+;
+@llvm_mips_fexupl_d_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fexupl_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fexupl_d_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fexupl_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.fexupl.d(<4 x float> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_fexupl_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fexupl.d(<4 x float>) nounwind
+
+; CHECK: llvm_mips_fexupl_d_test:
+; CHECK: ld.w
+; CHECK: fexupl.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fexupl_d_test
+;
+@llvm_mips_fexupr_w_ARG1 = global <8 x half> <half 0.000000e+00, half 1.000000e+00, half 2.000000e+00, half 3.000000e+00, half 4.000000e+00, half 5.000000e+00, half 6.000000e+00, half 7.000000e+00>, align 16
+@llvm_mips_fexupr_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fexupr_w_test() nounwind {
+entry:
+  %0 = load <8 x half>* @llvm_mips_fexupr_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.fexupr.w(<8 x half> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_fexupr_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fexupr.w(<8 x half>) nounwind
+
+; CHECK: llvm_mips_fexupr_w_test:
+; CHECK: ld.h
+; CHECK: fexupr.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fexupr_w_test
+;
+@llvm_mips_fexupr_d_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fexupr_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fexupr_d_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fexupr_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.fexupr.d(<4 x float> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_fexupr_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fexupr.d(<4 x float>) nounwind
+
+; CHECK: llvm_mips_fexupr_d_test:
+; CHECK: ld.w
+; CHECK: fexupr.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fexupr_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf_float_int.ll b/test/CodeGen/Mips/msa/2rf_float_int.ll
new file mode 100644
index 0000000..3b5dfda
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf_float_int.ll
@@ -0,0 +1,90 @@
+; Test the MSA integer to floating point conversion intrinsics that are encoded
+; with the 2RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_ffint_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ffint_s_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_ffint_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ffint_s_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.ffint.s.w(<4 x i32> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_ffint_s_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.ffint.s.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_ffint_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ffint_s_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ffint_s.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ffint_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ffint_s_w_test
+;
+@llvm_mips_ffint_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ffint_s_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_ffint_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_ffint_s_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.ffint.s.d(<2 x i64> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_ffint_s_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.ffint.s.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_ffint_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ffint_s_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ffint_s.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ffint_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ffint_s_d_test
+;
+@llvm_mips_ffint_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ffint_u_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_ffint_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ffint_u_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.ffint.u.w(<4 x i32> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_ffint_u_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.ffint.u.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_ffint_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ffint_u_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ffint_u.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ffint_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ffint_u_w_test
+;
+@llvm_mips_ffint_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ffint_u_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_ffint_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_ffint_u_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.ffint.u.d(<2 x i64> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_ffint_u_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.ffint.u.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_ffint_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ffint_u_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ffint_u.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ffint_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ffint_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf_fq.ll b/test/CodeGen/Mips/msa/2rf_fq.ll
new file mode 100644
index 0000000..021dd93
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf_fq.ll
@@ -0,0 +1,82 @@
+; Test the MSA fixed-point to floating point conversion intrinsics that are
+; encoded with the 2RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_ffql_w_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ffql_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_ffql_w_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_ffql_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.ffql.w(<8 x i16> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_ffql_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.ffql.w(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_ffql_w_test:
+; CHECK: ld.h
+; CHECK: ffql.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ffql_w_test
+;
+@llvm_mips_ffql_d_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ffql_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_ffql_d_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ffql_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.ffql.d(<4 x i32> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_ffql_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.ffql.d(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_ffql_d_test:
+; CHECK: ld.w
+; CHECK: ffql.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ffql_d_test
+;
+@llvm_mips_ffqr_w_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ffqr_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_ffqr_w_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_ffqr_w_ARG1
+  %1 = tail call <4 x float> @llvm.mips.ffqr.w(<8 x i16> %0)
+  store <4 x float> %1, <4 x float>* @llvm_mips_ffqr_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.ffqr.w(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_ffqr_w_test:
+; CHECK: ld.h
+; CHECK: ffqr.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ffqr_w_test
+;
+@llvm_mips_ffqr_d_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ffqr_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_ffqr_d_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ffqr_d_ARG1
+  %1 = tail call <2 x double> @llvm.mips.ffqr.d(<4 x i32> %0)
+  store <2 x double> %1, <2 x double>* @llvm_mips_ffqr_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.ffqr.d(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_ffqr_d_test:
+; CHECK: ld.w
+; CHECK: ffqr.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ffqr_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf_int_float.ll b/test/CodeGen/Mips/msa/2rf_int_float.ll
new file mode 100644
index 0000000..4665ae0
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf_int_float.ll
@@ -0,0 +1,217 @@
+; Test the MSA floating point to integer intrinsics that are encoded with the
+; 2RF instruction format. This includes conversions but other instructions such
+; as fclass are also here.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fclass_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fclass_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fclass_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fclass_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.fclass.w(<4 x float> %0)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_fclass_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fclass.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_fclass_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fclass_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fclass.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fclass_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_fclass_w_test
+;
+@llvm_mips_fclass_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fclass_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fclass_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fclass_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.fclass.d(<2 x double> %0)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_fclass_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fclass.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_fclass_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fclass_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fclass.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fclass_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_fclass_d_test
+;
+@llvm_mips_ftrunc_s_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_ftrunc_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ftrunc_s_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_ftrunc_s_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.ftrunc.s.w(<4 x float> %0)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_ftrunc_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ftrunc.s.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_ftrunc_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftrunc_s_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftrunc_s.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftrunc_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftrunc_s_w_test
+;
+@llvm_mips_ftrunc_s_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_ftrunc_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ftrunc_s_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_ftrunc_s_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.ftrunc.s.d(<2 x double> %0)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_ftrunc_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ftrunc.s.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_ftrunc_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftrunc_s_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftrunc_s.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftrunc_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftrunc_s_d_test
+;
+@llvm_mips_ftrunc_u_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_ftrunc_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ftrunc_u_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_ftrunc_u_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.ftrunc.u.w(<4 x float> %0)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_ftrunc_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ftrunc.u.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_ftrunc_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftrunc_u_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftrunc_u.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftrunc_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftrunc_u_w_test
+;
+@llvm_mips_ftrunc_u_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_ftrunc_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ftrunc_u_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_ftrunc_u_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.ftrunc.u.d(<2 x double> %0)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_ftrunc_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ftrunc.u.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_ftrunc_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftrunc_u_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftrunc_u.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftrunc_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftrunc_u_d_test
+;
+@llvm_mips_ftint_s_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_ftint_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ftint_s_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_ftint_s_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.ftint.s.w(<4 x float> %0)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_ftint_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ftint.s.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_ftint_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftint_s_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftint_s.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftint_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftint_s_w_test
+;
+@llvm_mips_ftint_s_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_ftint_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ftint_s_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_ftint_s_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.ftint.s.d(<2 x double> %0)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_ftint_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ftint.s.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_ftint_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftint_s_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftint_s.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftint_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftint_s_d_test
+;
+@llvm_mips_ftint_u_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_ftint_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ftint_u_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_ftint_u_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.ftint.u.w(<4 x float> %0)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_ftint_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ftint.u.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_ftint_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftint_u_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftint_u.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftint_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftint_u_w_test
+;
+@llvm_mips_ftint_u_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_ftint_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ftint_u_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_ftint_u_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.ftint.u.d(<2 x double> %0)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_ftint_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ftint.u.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_ftint_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftint_u_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftint_u.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftint_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftint_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf_tq.ll b/test/CodeGen/Mips/msa/2rf_tq.ll
new file mode 100644
index 0000000..6f3c508
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf_tq.ll
@@ -0,0 +1,50 @@
+; Test the MSA floating-point to fixed-point conversion intrinsics that are
+; encoded with the 2RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_ftq_h_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_ftq_h_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_ftq_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ftq_h_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_ftq_h_ARG1
+  %1 = load <4 x float>* @llvm_mips_ftq_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.ftq.h(<4 x float> %0, <4 x float> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_ftq_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ftq.h(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_ftq_h_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ftq.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ftq_h_test
+;
+@llvm_mips_ftq_w_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_ftq_w_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_ftq_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ftq_w_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_ftq_w_ARG1
+  %1 = load <2 x double>* @llvm_mips_ftq_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.ftq.w(<2 x double> %0, <2 x double> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_ftq_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ftq.w(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_ftq_w_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ftq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ftq_w_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-a.ll b/test/CodeGen/Mips/msa/3r-a.ll
new file mode 100644
index 0000000..dab15b6
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-a.ll
@@ -0,0 +1,1191 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these so this covers those beginning with 'a'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+; It should fail to compile without fp64.
+; RUN: not llc -march=mips -mattr=+msa < %s 2>&1 | \
+; RUN:    FileCheck -check-prefix=FP32ERROR %s
+; FP32ERROR: LLVM ERROR: MSA requires a 64-bit FPU register file (FR=1 mode).
+
+@llvm_mips_add_a_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_add_a_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_add_a_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_add_a_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_add_a_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_add_a_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.add.a.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_add_a_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.add.a.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_add_a_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_add_a_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_add_a_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: add_a.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_add_a_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_add_a_b_test
+;
+@llvm_mips_add_a_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_add_a_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_add_a_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_add_a_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_add_a_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_add_a_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.add.a.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_add_a_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.add.a.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_add_a_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_add_a_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_add_a_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: add_a.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_add_a_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_add_a_h_test
+;
+@llvm_mips_add_a_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_add_a_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_add_a_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_add_a_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_add_a_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_add_a_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.add.a.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_add_a_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.add.a.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_add_a_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_add_a_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_add_a_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: add_a.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_add_a_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_add_a_w_test
+;
+@llvm_mips_add_a_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_add_a_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_add_a_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_add_a_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_add_a_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_add_a_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.add.a.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_add_a_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.add.a.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_add_a_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_add_a_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_add_a_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: add_a.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_add_a_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_add_a_d_test
+;
+@llvm_mips_adds_a_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_adds_a_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_adds_a_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_adds_a_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_adds_a_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_adds_a_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.adds.a.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_adds_a_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.adds.a.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_adds_a_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_a_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_a_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_a.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_a_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_a_b_test
+;
+@llvm_mips_adds_a_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_adds_a_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_adds_a_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_adds_a_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_adds_a_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_adds_a_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.adds.a.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_adds_a_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.adds.a.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_adds_a_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_a_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_a_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_a.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_a_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_a_h_test
+;
+@llvm_mips_adds_a_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_adds_a_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_adds_a_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_adds_a_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_adds_a_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_adds_a_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.adds.a.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_adds_a_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.adds.a.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_adds_a_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_a_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_a_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_a.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_a_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_a_w_test
+;
+@llvm_mips_adds_a_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_adds_a_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_adds_a_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_adds_a_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_adds_a_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_adds_a_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.adds.a.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_adds_a_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.adds.a.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_adds_a_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_a_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_a_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_a.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_a_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_a_d_test
+;
+@llvm_mips_adds_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_adds_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_adds_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_adds_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_adds_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_adds_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.adds.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_adds_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.adds.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_adds_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_s_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_s_b_test
+;
+@llvm_mips_adds_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_adds_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_adds_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_adds_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_adds_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_adds_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.adds.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_adds_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.adds.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_adds_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_s_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_s_h_test
+;
+@llvm_mips_adds_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_adds_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_adds_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_adds_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_adds_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_adds_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.adds.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_adds_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.adds.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_adds_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_s_w_test
+;
+@llvm_mips_adds_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_adds_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_adds_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_adds_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_adds_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_adds_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.adds.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_adds_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.adds.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_adds_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_s_d_test
+;
+@llvm_mips_adds_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_adds_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_adds_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_adds_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_adds_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_adds_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.adds.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_adds_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.adds.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_adds_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_u_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_u_b_test
+;
+@llvm_mips_adds_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_adds_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_adds_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_adds_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_adds_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_adds_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.adds.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_adds_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.adds.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_adds_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_u_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_u_h_test
+;
+@llvm_mips_adds_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_adds_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_adds_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_adds_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_adds_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_adds_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.adds.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_adds_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.adds.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_adds_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_u_w_test
+;
+@llvm_mips_adds_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_adds_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_adds_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_adds_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_adds_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_adds_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.adds.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_adds_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.adds.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_adds_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_u_d_test
+;
+@llvm_mips_addv_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_addv_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_addv_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_addv_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_addv_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_addv_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_addv_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.addv.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_addv_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_addv_b_test
+;
+@llvm_mips_addv_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_addv_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_addv_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_addv_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_addv_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_addv_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_addv_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.addv.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_addv_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_addv_h_test
+;
+@llvm_mips_addv_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_addv_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_addv_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_addv_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_addv_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_addv_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_addv_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.addv.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_addv_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_addv_w_test
+;
+@llvm_mips_addv_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_addv_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_addv_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_addv_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_addv_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_addv_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_addv_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.addv.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_addv_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_addv_d_test
+;
+
+define void @addv_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_addv_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_addv_b_ARG2
+  %2 = add <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_addv_b_RES
+  ret void
+}
+
+; CHECK: addv_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size addv_b_test
+;
+
+define void @addv_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_addv_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_addv_h_ARG2
+  %2 = add <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_addv_h_RES
+  ret void
+}
+
+; CHECK: addv_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size addv_h_test
+;
+
+define void @addv_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_addv_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_addv_w_ARG2
+  %2 = add <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_addv_w_RES
+  ret void
+}
+
+; CHECK: addv_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size addv_w_test
+;
+
+define void @addv_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_addv_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_addv_d_ARG2
+  %2 = add <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_addv_d_RES
+  ret void
+}
+
+; CHECK: addv_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size addv_d_test
+;
+@llvm_mips_asub_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_asub_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_asub_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_asub_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_asub_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_asub_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.asub.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_asub_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.asub.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_asub_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_s_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_s_b_test
+;
+@llvm_mips_asub_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_asub_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_asub_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_asub_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_asub_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_asub_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.asub.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_asub_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.asub.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_asub_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_s_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_s_h_test
+;
+@llvm_mips_asub_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_asub_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_asub_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_asub_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_asub_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_asub_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.asub.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_asub_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.asub.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_asub_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_s_w_test
+;
+@llvm_mips_asub_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_asub_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_asub_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_asub_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_asub_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_asub_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.asub.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_asub_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.asub.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_asub_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_s_d_test
+;
+@llvm_mips_asub_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_asub_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_asub_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_asub_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_asub_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_asub_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.asub.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_asub_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.asub.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_asub_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_u_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_u_b_test
+;
+@llvm_mips_asub_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_asub_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_asub_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_asub_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_asub_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_asub_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.asub.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_asub_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.asub.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_asub_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_u_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_u_h_test
+;
+@llvm_mips_asub_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_asub_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_asub_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_asub_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_asub_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_asub_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.asub.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_asub_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.asub.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_asub_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_u_w_test
+;
+@llvm_mips_asub_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_asub_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_asub_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_asub_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_asub_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_asub_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.asub.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_asub_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.asub.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_asub_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_u_d_test
+;
+@llvm_mips_ave_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ave_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ave_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ave_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_ave_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_ave_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.ave.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_ave_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.ave.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ave_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_s_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_s_b_test
+;
+@llvm_mips_ave_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ave_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ave_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ave_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_ave_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_ave_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.ave.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_ave_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ave.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ave_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_s_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_s_h_test
+;
+@llvm_mips_ave_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ave_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ave_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ave_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ave_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_ave_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.ave.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_ave_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ave.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ave_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_s_w_test
+;
+@llvm_mips_ave_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ave_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ave_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ave_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_ave_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_ave_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.ave.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_ave_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ave.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ave_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_s_d_test
+;
+@llvm_mips_ave_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ave_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ave_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ave_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_ave_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_ave_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.ave.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_ave_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.ave.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ave_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_u_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_u_b_test
+;
+@llvm_mips_ave_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ave_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ave_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ave_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_ave_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_ave_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.ave.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_ave_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ave.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ave_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_u_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_u_h_test
+;
+@llvm_mips_ave_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ave_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ave_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ave_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ave_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_ave_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.ave.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_ave_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ave.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ave_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_u_w_test
+;
+@llvm_mips_ave_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ave_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ave_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ave_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_ave_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_ave_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.ave.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_ave_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ave.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ave_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_u_d_test
+;
+@llvm_mips_aver_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_aver_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_aver_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_aver_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_aver_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_aver_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.aver.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_aver_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.aver.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_aver_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_s_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_s_b_test
+;
+@llvm_mips_aver_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_aver_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_aver_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_aver_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_aver_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_aver_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.aver.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_aver_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.aver.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_aver_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_s_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_s_h_test
+;
+@llvm_mips_aver_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_aver_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_aver_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_aver_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_aver_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_aver_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.aver.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_aver_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.aver.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_aver_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_s_w_test
+;
+@llvm_mips_aver_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_aver_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_aver_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_aver_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_aver_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_aver_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.aver.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_aver_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.aver.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_aver_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_s_d_test
+;
+@llvm_mips_aver_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_aver_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_aver_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_aver_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_aver_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_aver_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.aver.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_aver_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.aver.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_aver_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_u_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_u_b_test
+;
+@llvm_mips_aver_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_aver_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_aver_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_aver_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_aver_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_aver_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.aver.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_aver_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.aver.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_aver_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_u_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_u_h_test
+;
+@llvm_mips_aver_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_aver_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_aver_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_aver_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_aver_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_aver_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.aver.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_aver_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.aver.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_aver_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_u_w_test
+;
+@llvm_mips_aver_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_aver_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_aver_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_aver_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_aver_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_aver_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.aver.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_aver_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.aver.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_aver_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-b.ll b/test/CodeGen/Mips/msa/3r-b.ll
new file mode 100644
index 0000000..a05d19b
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-b.ll
@@ -0,0 +1,494 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these so this covers those beginning with 'b'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_bclr_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bclr_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bclr_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bclr_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bclr_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_bclr_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.bclr.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_bclr_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.bclr.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_bclr_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: bclr.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_bclr_b_test
+;
+@llvm_mips_bclr_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bclr_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bclr_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bclr_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_bclr_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_bclr_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.bclr.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_bclr_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.bclr.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_bclr_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: bclr.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bclr_h_test
+;
+@llvm_mips_bclr_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bclr_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bclr_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bclr_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_bclr_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_bclr_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.bclr.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_bclr_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.bclr.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_bclr_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: bclr.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bclr_w_test
+;
+@llvm_mips_bclr_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bclr_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bclr_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bclr_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_bclr_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_bclr_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.bclr.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_bclr_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.bclr.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_bclr_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: bclr.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bclr_d_test
+
+@llvm_mips_binsl_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsl_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsl_b_ARG3 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_binsl_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_binsl_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_binsl_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_binsl_b_ARG2
+  %2 = load <16 x i8>* @llvm_mips_binsl_b_ARG3
+  %3 = tail call <16 x i8> @llvm.mips.binsl.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* @llvm_mips_binsl_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.binsl.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_binsl_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsl_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsl_b_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsl_b_ARG3)(
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsl.b [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.b [[R4]], 0(
+; CHECK: .size llvm_mips_binsl_b_test
+
+@llvm_mips_binsl_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsl_h_ARG2 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsl_h_ARG3 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_binsl_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_binsl_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_binsl_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_binsl_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_binsl_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.binsl.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_binsl_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.binsl.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_binsl_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsl_h_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsl_h_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsl_h_ARG3)(
+; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.h [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsl.h [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.h [[R4]], 0(
+; CHECK: .size llvm_mips_binsl_h_test
+
+@llvm_mips_binsl_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsl_w_ARG2 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsl_w_ARG3 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_binsl_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_binsl_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_binsl_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_binsl_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_binsl_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.binsl.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_binsl_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.binsl.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_binsl_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsl_w_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsl_w_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsl_w_ARG3)(
+; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.w [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsl.w [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.w [[R4]], 0(
+; CHECK: .size llvm_mips_binsl_w_test
+
+@llvm_mips_binsl_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsl_d_ARG2 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsl_d_ARG3 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_binsl_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_binsl_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_binsl_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_binsl_d_ARG2
+  %2 = load <2 x i64>* @llvm_mips_binsl_d_ARG3
+  %3 = tail call <2 x i64> @llvm.mips.binsl.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* @llvm_mips_binsl_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.binsl.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_binsl_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsl_d_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsl_d_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsl_d_ARG3)(
+; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.d [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsl.d [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.d [[R4]], 0(
+; CHECK: .size llvm_mips_binsl_d_test
+
+@llvm_mips_binsr_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsr_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsr_b_ARG3 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_binsr_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_binsr_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_binsr_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_binsr_b_ARG2
+  %2 = load <16 x i8>* @llvm_mips_binsr_b_ARG3
+  %3 = tail call <16 x i8> @llvm.mips.binsr.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* @llvm_mips_binsr_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.binsr.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_binsr_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsr_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsr_b_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsr_b_ARG3)(
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsr.b [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.b [[R4]], 0(
+; CHECK: .size llvm_mips_binsr_b_test
+
+@llvm_mips_binsr_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsr_h_ARG2 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsr_h_ARG3 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_binsr_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_binsr_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_binsr_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_binsr_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_binsr_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.binsr.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_binsr_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.binsr.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_binsr_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsr_h_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsr_h_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsr_h_ARG3)(
+; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.h [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsr.h [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.h [[R4]], 0(
+; CHECK: .size llvm_mips_binsr_h_test
+
+@llvm_mips_binsr_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsr_w_ARG2 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsr_w_ARG3 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_binsr_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_binsr_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_binsr_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_binsr_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_binsr_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.binsr.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_binsr_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.binsr.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_binsr_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsr_w_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsr_w_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsr_w_ARG3)(
+; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.w [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsr.w [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.w [[R4]], 0(
+; CHECK: .size llvm_mips_binsr_w_test
+
+@llvm_mips_binsr_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsr_d_ARG2 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsr_d_ARG3 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_binsr_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_binsr_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_binsr_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_binsr_d_ARG2
+  %2 = load <2 x i64>* @llvm_mips_binsr_d_ARG3
+  %3 = tail call <2 x i64> @llvm.mips.binsr.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* @llvm_mips_binsr_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.binsr.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_binsr_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsr_d_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsr_d_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsr_d_ARG3)(
+; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.d [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsr.d [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.d [[R4]], 0(
+; CHECK: .size llvm_mips_binsr_d_test
+
+@llvm_mips_bneg_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bneg_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bneg_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bneg_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bneg_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_bneg_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.bneg.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_bneg_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.bneg.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_bneg_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: bneg.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_bneg_b_test
+;
+@llvm_mips_bneg_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bneg_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bneg_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bneg_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_bneg_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_bneg_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.bneg.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_bneg_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.bneg.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_bneg_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: bneg.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bneg_h_test
+;
+@llvm_mips_bneg_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bneg_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bneg_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bneg_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_bneg_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_bneg_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.bneg.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_bneg_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.bneg.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_bneg_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: bneg.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bneg_w_test
+;
+@llvm_mips_bneg_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bneg_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bneg_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bneg_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_bneg_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_bneg_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.bneg.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_bneg_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.bneg.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_bneg_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: bneg.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bneg_d_test
+;
+@llvm_mips_bset_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bset_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bset_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bset_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bset_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_bset_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.bset.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_bset_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.bset.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_bset_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: bset.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_bset_b_test
+;
+@llvm_mips_bset_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bset_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bset_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bset_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_bset_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_bset_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.bset.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_bset_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.bset.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_bset_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: bset.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bset_h_test
+;
+@llvm_mips_bset_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bset_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bset_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bset_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_bset_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_bset_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.bset.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_bset_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.bset.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_bset_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: bset.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bset_w_test
+;
+@llvm_mips_bset_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bset_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bset_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bset_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_bset_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_bset_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.bset.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_bset_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.bset.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_bset_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: bset.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bset_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-c.ll b/test/CodeGen/Mips/msa/3r-c.ll
new file mode 100644
index 0000000..6ec92c2
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-c.ll
@@ -0,0 +1,446 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these so this covers those beginning with 'c'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_ceq_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ceq_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ceq_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ceq_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_ceq_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_ceq_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.ceq.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_ceq_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.ceq.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ceq_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ceq.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ceq_b_test
+;
+@llvm_mips_ceq_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ceq_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ceq_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ceq_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_ceq_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_ceq_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.ceq.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_ceq_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ceq.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ceq_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ceq.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ceq_h_test
+;
+@llvm_mips_ceq_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ceq_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ceq_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ceq_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ceq_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_ceq_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.ceq.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_ceq_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ceq.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ceq_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ceq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ceq_w_test
+;
+@llvm_mips_ceq_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ceq_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ceq_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ceq_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_ceq_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_ceq_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.ceq.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_ceq_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ceq.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ceq_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ceq.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ceq_d_test
+;
+@llvm_mips_cle_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_cle_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_cle_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_cle_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_cle_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_cle_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.cle.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_cle_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.cle.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_cle_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: cle_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_cle_s_b_test
+;
+@llvm_mips_cle_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_cle_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_cle_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_cle_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_cle_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_cle_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.cle.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_cle_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.cle.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_cle_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: cle_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_cle_s_h_test
+;
+@llvm_mips_cle_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_cle_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_cle_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_cle_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_cle_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_cle_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.cle.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_cle_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.cle.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_cle_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: cle_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_cle_s_w_test
+;
+@llvm_mips_cle_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_cle_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_cle_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_cle_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_cle_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_cle_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.cle.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_cle_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.cle.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_cle_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: cle_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_cle_s_d_test
+;
+@llvm_mips_cle_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_cle_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_cle_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_cle_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_cle_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_cle_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.cle.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_cle_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.cle.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_cle_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: cle_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_cle_u_b_test
+;
+@llvm_mips_cle_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_cle_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_cle_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_cle_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_cle_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_cle_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.cle.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_cle_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.cle.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_cle_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: cle_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_cle_u_h_test
+;
+@llvm_mips_cle_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_cle_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_cle_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_cle_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_cle_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_cle_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.cle.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_cle_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.cle.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_cle_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: cle_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_cle_u_w_test
+;
+@llvm_mips_cle_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_cle_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_cle_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_cle_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_cle_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_cle_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.cle.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_cle_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.cle.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_cle_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: cle_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_cle_u_d_test
+;
+@llvm_mips_clt_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clt_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_clt_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clt_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_clt_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_clt_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.clt.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_clt_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.clt.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_clt_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: clt_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clt_s_b_test
+;
+@llvm_mips_clt_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clt_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_clt_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clt_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_clt_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_clt_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.clt.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_clt_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.clt.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_clt_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: clt_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clt_s_h_test
+;
+@llvm_mips_clt_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clt_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_clt_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clt_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_clt_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_clt_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.clt.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_clt_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.clt.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_clt_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: clt_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clt_s_w_test
+;
+@llvm_mips_clt_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clt_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_clt_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clt_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_clt_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_clt_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.clt.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_clt_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.clt.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_clt_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: clt_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clt_s_d_test
+;
+@llvm_mips_clt_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clt_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_clt_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clt_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_clt_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_clt_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.clt.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_clt_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.clt.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_clt_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: clt_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clt_u_b_test
+;
+@llvm_mips_clt_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clt_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_clt_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clt_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_clt_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_clt_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.clt.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_clt_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.clt.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_clt_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: clt_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clt_u_h_test
+;
+@llvm_mips_clt_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clt_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_clt_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clt_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_clt_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_clt_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.clt.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_clt_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.clt.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_clt_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: clt_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clt_u_w_test
+;
+@llvm_mips_clt_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clt_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_clt_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clt_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_clt_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_clt_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.clt.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_clt_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.clt.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_clt_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: clt_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clt_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-d.ll b/test/CodeGen/Mips/msa/3r-d.ll
new file mode 100644
index 0000000..0099554
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-d.ll
@@ -0,0 +1,478 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these so this covers those beginning with 'd'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_div_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_div_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_div_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_div_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_div_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_div_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.div.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_div_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.div.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_div_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: div_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_div_s_b_test
+;
+@llvm_mips_div_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_div_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_div_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_div_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_div_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_div_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.div.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_div_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.div.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_div_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: div_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_div_s_h_test
+;
+@llvm_mips_div_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_div_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_div_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_div_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_div_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_div_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.div.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_div_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.div.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_div_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: div_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_div_s_w_test
+;
+@llvm_mips_div_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_div_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_div_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_div_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_div_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_div_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.div.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_div_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.div.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_div_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: div_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_div_s_d_test
+;
+
+define void @div_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_div_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_div_s_b_ARG2
+  %2 = sdiv <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_div_s_b_RES
+  ret void
+}
+
+; CHECK: div_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: div_s.b
+; CHECK: st.b
+; CHECK: .size div_s_b_test
+
+define void @div_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_div_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_div_s_h_ARG2
+  %2 = sdiv <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_div_s_h_RES
+  ret void
+}
+
+; CHECK: div_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: div_s.h
+; CHECK: st.h
+; CHECK: .size div_s_h_test
+
+define void @div_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_div_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_div_s_w_ARG2
+  %2 = sdiv <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_div_s_w_RES
+  ret void
+}
+
+; CHECK: div_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: div_s.w
+; CHECK: st.w
+; CHECK: .size div_s_w_test
+
+define void @div_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_div_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_div_s_d_ARG2
+  %2 = sdiv <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_div_s_d_RES
+  ret void
+}
+
+; CHECK: div_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: div_s.d
+; CHECK: st.d
+; CHECK: .size div_s_d_test
+;
+@llvm_mips_div_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_div_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_div_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_div_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_div_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_div_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.div.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_div_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.div.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_div_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: div_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_div_u_b_test
+;
+@llvm_mips_div_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_div_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_div_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_div_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_div_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_div_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.div.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_div_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.div.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_div_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: div_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_div_u_h_test
+;
+@llvm_mips_div_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_div_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_div_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_div_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_div_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_div_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.div.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_div_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.div.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_div_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: div_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_div_u_w_test
+;
+@llvm_mips_div_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_div_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_div_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_div_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_div_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_div_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.div.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_div_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.div.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_div_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: div_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_div_u_d_test
+;
+
+define void @div_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_div_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_div_u_b_ARG2
+  %2 = udiv <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_div_u_b_RES
+  ret void
+}
+
+; CHECK: div_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: div_u.b
+; CHECK: st.b
+; CHECK: .size div_u_b_test
+
+define void @div_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_div_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_div_u_h_ARG2
+  %2 = udiv <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_div_u_h_RES
+  ret void
+}
+
+; CHECK: div_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: div_u.h
+; CHECK: st.h
+; CHECK: .size div_u_h_test
+
+define void @div_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_div_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_div_u_w_ARG2
+  %2 = udiv <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_div_u_w_RES
+  ret void
+}
+
+; CHECK: div_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: div_u.w
+; CHECK: st.w
+; CHECK: .size div_u_w_test
+
+define void @div_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_div_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_div_u_d_ARG2
+  %2 = udiv <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_div_u_d_RES
+  ret void
+}
+
+; CHECK: div_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: div_u.d
+; CHECK: st.d
+; CHECK: .size div_u_d_test
+;
+@llvm_mips_dotp_s_h_ARG1 = global <16 x i8> <i8  0, i8  1, i8  2, i8  3,
+                                             i8  4, i8  5, i8  6, i8  7,
+                                             i8  8, i8  9, i8 10, i8 11,
+                                             i8 12, i8 13, i8 14, i8 15>,
+                                            align 16
+@llvm_mips_dotp_s_h_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19,
+                                             i8 20, i8 21, i8 22, i8 23,
+                                             i8 24, i8 25, i8 26, i8 27,
+                                             i8 28, i8 29, i8 30, i8 31>,
+                                            align 16
+@llvm_mips_dotp_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0,
+                                             i16 0, i16 0, i16 0, i16 0>,
+                                            align 16
+
+define void @llvm_mips_dotp_s_h_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_dotp_s_h_ARG1
+  %1 = load <16 x i8>* @llvm_mips_dotp_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.dotp.s.h(<16 x i8> %0, <16 x i8> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_dotp_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.dotp.s.h(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dotp_s_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: dotp_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dotp_s_h_test
+;
+@llvm_mips_dotp_s_w_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3,
+                                             i16 4, i16 5, i16 6, i16 7>,
+                                            align 16
+@llvm_mips_dotp_s_w_ARG2 = global <8 x i16> <i16  4, i16  5, i16  6, i16  7,
+                                             i16  8, i16  9, i16 10, i16 11>,
+                                            align 16
+@llvm_mips_dotp_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+                                            align 16
+
+define void @llvm_mips_dotp_s_w_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_dotp_s_w_ARG1
+  %1 = load <8 x i16>* @llvm_mips_dotp_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.dotp.s.w(<8 x i16> %0, <8 x i16> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_dotp_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.dotp.s.w(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dotp_s_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: dotp_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dotp_s_w_test
+;
+@llvm_mips_dotp_s_d_ARG1 = global <4 x i32> <i32 0, i32 1, i32 0, i32 1>,
+                                            align 16
+@llvm_mips_dotp_s_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 2, i32 3>,
+                                            align 16
+@llvm_mips_dotp_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dotp_s_d_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_dotp_s_d_ARG1
+  %1 = load <4 x i32>* @llvm_mips_dotp_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.dotp.s.d(<4 x i32> %0, <4 x i32> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_dotp_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.dotp.s.d(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dotp_s_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: dotp_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dotp_s_d_test
+;
+@llvm_mips_dotp_u_h_ARG1 = global <16 x i8> <i8  0, i8  1, i8  2, i8  3,
+                                             i8  4, i8  5, i8  6, i8  7,
+                                             i8  8, i8  9, i8 10, i8 11,
+                                             i8 12, i8 13, i8 14, i8 15>,
+                                            align 16
+@llvm_mips_dotp_u_h_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19,
+                                             i8 20, i8 21, i8 22, i8 23,
+                                             i8 24, i8 25, i8 26, i8 27,
+                                             i8 28, i8 29, i8 30, i8 31>,
+                                            align 16
+@llvm_mips_dotp_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0,
+                                             i16 0, i16 0, i16 0, i16 0>,
+                                            align 16
+
+define void @llvm_mips_dotp_u_h_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_dotp_u_h_ARG1
+  %1 = load <16 x i8>* @llvm_mips_dotp_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.dotp.u.h(<16 x i8> %0, <16 x i8> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_dotp_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.dotp.u.h(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dotp_u_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: dotp_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dotp_u_h_test
+;
+@llvm_mips_dotp_u_w_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3,
+                                             i16 4, i16 5, i16 6, i16 7>,
+                                            align 16
+@llvm_mips_dotp_u_w_ARG2 = global <8 x i16> <i16  4, i16  5, i16  6, i16  7,
+                                             i16  8, i16  9, i16 10, i16 11>,
+                                            align 16
+@llvm_mips_dotp_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+                                            align 16
+
+define void @llvm_mips_dotp_u_w_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_dotp_u_w_ARG1
+  %1 = load <8 x i16>* @llvm_mips_dotp_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.dotp.u.w(<8 x i16> %0, <8 x i16> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_dotp_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.dotp.u.w(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dotp_u_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: dotp_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dotp_u_w_test
+;
+@llvm_mips_dotp_u_d_ARG1 = global <4 x i32> <i32 0, i32 1, i32 0, i32 1>,
+                                            align 16
+@llvm_mips_dotp_u_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 2, i32 3>,
+                                            align 16
+@llvm_mips_dotp_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dotp_u_d_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_dotp_u_d_ARG1
+  %1 = load <4 x i32>* @llvm_mips_dotp_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.dotp.u.d(<4 x i32> %0, <4 x i32> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_dotp_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.dotp.u.d(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dotp_u_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: dotp_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dotp_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-i.ll b/test/CodeGen/Mips/msa/3r-i.ll
new file mode 100644
index 0000000..2ef3047
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-i.ll
@@ -0,0 +1,358 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these so this covers those beginning with 'i'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_ilvev_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ilvev_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ilvev_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ilvev_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_ilvev_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_ilvev_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.ilvev.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_ilvev_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.ilvev.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ilvev_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ilvev.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ilvev_b_test
+;
+@llvm_mips_ilvev_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ilvev_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ilvev_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ilvev_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_ilvev_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_ilvev_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.ilvev.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_ilvev_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ilvev.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ilvev_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ilvev.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ilvev_h_test
+;
+@llvm_mips_ilvev_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ilvev_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ilvev_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ilvev_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ilvev_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_ilvev_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.ilvev.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_ilvev_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ilvev.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ilvev_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ilvev.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ilvev_w_test
+;
+@llvm_mips_ilvev_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ilvev_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ilvev_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ilvev_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_ilvev_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_ilvev_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.ilvev.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_ilvev_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ilvev.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ilvev_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ilvev.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ilvev_d_test
+;
+@llvm_mips_ilvl_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ilvl_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ilvl_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ilvl_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_ilvl_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_ilvl_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.ilvl.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_ilvl_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.ilvl.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ilvl_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ilvl.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ilvl_b_test
+;
+@llvm_mips_ilvl_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ilvl_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ilvl_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ilvl_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_ilvl_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_ilvl_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.ilvl.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_ilvl_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ilvl.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ilvl_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ilvl.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ilvl_h_test
+;
+@llvm_mips_ilvl_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ilvl_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ilvl_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ilvl_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ilvl_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_ilvl_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.ilvl.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_ilvl_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ilvl.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ilvl_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ilvl.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ilvl_w_test
+;
+@llvm_mips_ilvl_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ilvl_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ilvl_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ilvl_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_ilvl_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_ilvl_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.ilvl.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_ilvl_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ilvl.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ilvl_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ilvl.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ilvl_d_test
+;
+@llvm_mips_ilvod_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ilvod_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ilvod_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ilvod_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_ilvod_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_ilvod_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.ilvod.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_ilvod_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.ilvod.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ilvod_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ilvod.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ilvod_b_test
+;
+@llvm_mips_ilvod_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ilvod_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ilvod_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ilvod_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_ilvod_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_ilvod_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.ilvod.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_ilvod_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ilvod.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ilvod_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ilvod.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ilvod_h_test
+;
+@llvm_mips_ilvod_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ilvod_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ilvod_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ilvod_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ilvod_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_ilvod_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.ilvod.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_ilvod_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ilvod.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ilvod_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ilvod.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ilvod_w_test
+;
+@llvm_mips_ilvod_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ilvod_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ilvod_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ilvod_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_ilvod_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_ilvod_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.ilvod.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_ilvod_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ilvod.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ilvod_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ilvod.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ilvod_d_test
+;
+@llvm_mips_ilvr_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ilvr_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ilvr_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ilvr_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_ilvr_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_ilvr_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.ilvr.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_ilvr_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.ilvr.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ilvr_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ilvr.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ilvr_b_test
+;
+@llvm_mips_ilvr_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ilvr_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ilvr_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ilvr_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_ilvr_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_ilvr_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.ilvr.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_ilvr_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ilvr.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ilvr_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ilvr.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ilvr_h_test
+;
+@llvm_mips_ilvr_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ilvr_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ilvr_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ilvr_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ilvr_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_ilvr_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.ilvr.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_ilvr_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ilvr.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ilvr_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ilvr.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ilvr_w_test
+;
+@llvm_mips_ilvr_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ilvr_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ilvr_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ilvr_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_ilvr_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_ilvr_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.ilvr.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_ilvr_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ilvr.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ilvr_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ilvr.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ilvr_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-m.ll b/test/CodeGen/Mips/msa/3r-m.ll
new file mode 100644
index 0000000..ddfd720
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-m.ll
@@ -0,0 +1,862 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these so this covers those beginning with 'm'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_max_a_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_max_a_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_max_a_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_max_a_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_max_a_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_max_a_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.max.a.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_max_a_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.max.a.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_max_a_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: max_a.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_max_a_b_test
+;
+@llvm_mips_max_a_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_max_a_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_max_a_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_max_a_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_max_a_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_max_a_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.max.a.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_max_a_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.max.a.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_max_a_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: max_a.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_max_a_h_test
+;
+@llvm_mips_max_a_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_max_a_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_max_a_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_max_a_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_max_a_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_max_a_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.max.a.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_max_a_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.max.a.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_max_a_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: max_a.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_max_a_w_test
+;
+@llvm_mips_max_a_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_max_a_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_max_a_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_max_a_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_max_a_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_max_a_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.max.a.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_max_a_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.max.a.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_max_a_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: max_a.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_max_a_d_test
+;
+@llvm_mips_max_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_max_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_max_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_max_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_max_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_max_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.max.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_max_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.max.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_max_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: max_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_max_s_b_test
+;
+@llvm_mips_max_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_max_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_max_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_max_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_max_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_max_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.max.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_max_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.max.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_max_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: max_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_max_s_h_test
+;
+@llvm_mips_max_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_max_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_max_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_max_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_max_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_max_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.max.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_max_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.max.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_max_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: max_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_max_s_w_test
+;
+@llvm_mips_max_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_max_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_max_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_max_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_max_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_max_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.max.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_max_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.max.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_max_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: max_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_max_s_d_test
+;
+@llvm_mips_max_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_max_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_max_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_max_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_max_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_max_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.max.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_max_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.max.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_max_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: max_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_max_u_b_test
+;
+@llvm_mips_max_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_max_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_max_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_max_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_max_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_max_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.max.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_max_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.max.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_max_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: max_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_max_u_h_test
+;
+@llvm_mips_max_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_max_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_max_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_max_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_max_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_max_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.max.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_max_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.max.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_max_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: max_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_max_u_w_test
+;
+@llvm_mips_max_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_max_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_max_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_max_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_max_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_max_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.max.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_max_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.max.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_max_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: max_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_max_u_d_test
+;
+@llvm_mips_min_a_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_min_a_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_min_a_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_min_a_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_min_a_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_min_a_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.min.a.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_min_a_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.min.a.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_min_a_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: min_a.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_min_a_b_test
+;
+@llvm_mips_min_a_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_min_a_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_min_a_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_min_a_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_min_a_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_min_a_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.min.a.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_min_a_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.min.a.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_min_a_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: min_a.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_min_a_h_test
+;
+@llvm_mips_min_a_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_min_a_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_min_a_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_min_a_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_min_a_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_min_a_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.min.a.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_min_a_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.min.a.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_min_a_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: min_a.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_min_a_w_test
+;
+@llvm_mips_min_a_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_min_a_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_min_a_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_min_a_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_min_a_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_min_a_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.min.a.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_min_a_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.min.a.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_min_a_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: min_a.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_min_a_d_test
+;
+@llvm_mips_min_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_min_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_min_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_min_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_min_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_min_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.min.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_min_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.min.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_min_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: min_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_min_s_b_test
+;
+@llvm_mips_min_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_min_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_min_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_min_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_min_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_min_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.min.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_min_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.min.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_min_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: min_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_min_s_h_test
+;
+@llvm_mips_min_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_min_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_min_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_min_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_min_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_min_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.min.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_min_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.min.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_min_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: min_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_min_s_w_test
+;
+@llvm_mips_min_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_min_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_min_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_min_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_min_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_min_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.min.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_min_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.min.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_min_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: min_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_min_s_d_test
+;
+@llvm_mips_min_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_min_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_min_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_min_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_min_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_min_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.min.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_min_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.min.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_min_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: min_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_min_u_b_test
+;
+@llvm_mips_min_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_min_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_min_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_min_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_min_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_min_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.min.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_min_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.min.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_min_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: min_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_min_u_h_test
+;
+@llvm_mips_min_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_min_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_min_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_min_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_min_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_min_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.min.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_min_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.min.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_min_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: min_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_min_u_w_test
+;
+@llvm_mips_min_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_min_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_min_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_min_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_min_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_min_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.min.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_min_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.min.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_min_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: min_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_min_u_d_test
+;
+@llvm_mips_mod_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_mod_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_mod_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_mod_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_mod_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_mod_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.mod.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_mod_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.mod.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_mod_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: mod_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_mod_s_b_test
+;
+@llvm_mips_mod_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mod_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_mod_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mod_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_mod_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_mod_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.mod.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_mod_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.mod.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_mod_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mod_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mod_s_h_test
+;
+@llvm_mips_mod_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mod_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_mod_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mod_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_mod_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_mod_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.mod.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_mod_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.mod.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_mod_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mod_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mod_s_w_test
+;
+@llvm_mips_mod_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_mod_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_mod_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_mod_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_mod_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_mod_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.mod.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_mod_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.mod.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_mod_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: mod_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_mod_s_d_test
+;
+@llvm_mips_mod_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_mod_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_mod_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_mod_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_mod_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_mod_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.mod.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_mod_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.mod.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_mod_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: mod_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_mod_u_b_test
+;
+@llvm_mips_mod_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mod_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_mod_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mod_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_mod_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_mod_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.mod.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_mod_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.mod.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_mod_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mod_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mod_u_h_test
+;
+@llvm_mips_mod_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mod_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_mod_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mod_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_mod_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_mod_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.mod.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_mod_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.mod.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_mod_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mod_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mod_u_w_test
+;
+@llvm_mips_mod_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_mod_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_mod_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_mod_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_mod_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_mod_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.mod.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_mod_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.mod.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_mod_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: mod_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_mod_u_d_test
+;
+@llvm_mips_mulv_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_mulv_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_mulv_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_mulv_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_mulv_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_mulv_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.mulv.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_mulv_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.mulv.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_mulv_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: mulv.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_mulv_b_test
+;
+@llvm_mips_mulv_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mulv_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_mulv_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mulv_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_mulv_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_mulv_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.mulv.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_mulv_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.mulv.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_mulv_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mulv.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mulv_h_test
+;
+@llvm_mips_mulv_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mulv_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_mulv_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mulv_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_mulv_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_mulv_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.mulv.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_mulv_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.mulv.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_mulv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mulv.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mulv_w_test
+;
+@llvm_mips_mulv_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_mulv_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_mulv_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_mulv_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_mulv_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_mulv_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.mulv.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_mulv_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.mulv.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_mulv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: mulv.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_mulv_d_test
+
+define void @mulv_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_mulv_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_mulv_b_ARG2
+  %2 = mul <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_mulv_b_RES
+  ret void
+}
+
+; CHECK: mulv_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: mulv.b
+; CHECK: st.b
+; CHECK: .size mulv_b_test
+
+define void @mulv_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_mulv_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_mulv_h_ARG2
+  %2 = mul <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_mulv_h_RES
+  ret void
+}
+
+; CHECK: mulv_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mulv.h
+; CHECK: st.h
+; CHECK: .size mulv_h_test
+
+define void @mulv_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_mulv_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_mulv_w_ARG2
+  %2 = mul <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_mulv_w_RES
+  ret void
+}
+
+; CHECK: mulv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mulv.w
+; CHECK: st.w
+; CHECK: .size mulv_w_test
+
+define void @mulv_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_mulv_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_mulv_d_ARG2
+  %2 = mul <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_mulv_d_RES
+  ret void
+}
+
+; CHECK: mulv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: mulv.d
+; CHECK: st.d
+; CHECK: .size mulv_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-p.ll b/test/CodeGen/Mips/msa/3r-p.ll
new file mode 100644
index 0000000..852023b
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-p.ll
@@ -0,0 +1,182 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these so this covers those beginning with 'p'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_pckev_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_pckev_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_pckev_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_pckev_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_pckev_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_pckev_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.pckev.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_pckev_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.pckev.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_pckev_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: pckev.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_pckev_b_test
+;
+@llvm_mips_pckev_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_pckev_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_pckev_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_pckev_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_pckev_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_pckev_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.pckev.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_pckev_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.pckev.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_pckev_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: pckev.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_pckev_h_test
+;
+@llvm_mips_pckev_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_pckev_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_pckev_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_pckev_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_pckev_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_pckev_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.pckev.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_pckev_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.pckev.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_pckev_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: pckev.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_pckev_w_test
+;
+@llvm_mips_pckev_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_pckev_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_pckev_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_pckev_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_pckev_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_pckev_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.pckev.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_pckev_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.pckev.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_pckev_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: pckev.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_pckev_d_test
+;
+@llvm_mips_pckod_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_pckod_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_pckod_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_pckod_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_pckod_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_pckod_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.pckod.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_pckod_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.pckod.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_pckod_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: pckod.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_pckod_b_test
+;
+@llvm_mips_pckod_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_pckod_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_pckod_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_pckod_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_pckod_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_pckod_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.pckod.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_pckod_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.pckod.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_pckod_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: pckod.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_pckod_h_test
+;
+@llvm_mips_pckod_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_pckod_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_pckod_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_pckod_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_pckod_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_pckod_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.pckod.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_pckod_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.pckod.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_pckod_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: pckod.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_pckod_w_test
+;
+@llvm_mips_pckod_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_pckod_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_pckod_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_pckod_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_pckod_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_pckod_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.pckod.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_pckod_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.pckod.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_pckod_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: pckod.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_pckod_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-s.ll b/test/CodeGen/Mips/msa/3r-s.ll
new file mode 100644
index 0000000..30cf265
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-s.ll
@@ -0,0 +1,1353 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these so this covers those beginning with 's'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_sld_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sld_b_ARG2 = global i32 10, align 16
+@llvm_mips_sld_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sld_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_sld_b_ARG1
+  %1 = load i32* @llvm_mips_sld_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.sld.b(<16 x i8> %0, i32 %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_sld_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.sld.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_sld_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sld_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
+; CHECK-DAG: sld.b [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_sld_b_test
+;
+@llvm_mips_sld_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sld_h_ARG2 = global i32 10, align 16
+@llvm_mips_sld_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sld_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_sld_h_ARG1
+  %1 = load i32* @llvm_mips_sld_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.sld.h(<8 x i16> %0, i32 %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_sld_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.sld.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_sld_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_h_ARG1)
+; CHECK-DAG: lw [[RT:\$[0-9]+]], %got(llvm_mips_sld_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
+; CHECK-DAG: sld.h [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_sld_h_test
+;
+@llvm_mips_sld_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sld_w_ARG2 = global i32 10, align 16
+@llvm_mips_sld_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sld_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_sld_w_ARG1
+  %1 = load i32* @llvm_mips_sld_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.sld.w(<4 x i32> %0, i32 %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_sld_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.sld.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_sld_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_w_ARG1)
+; CHECK-DAG: lw [[RT:\$[0-9]+]], %got(llvm_mips_sld_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
+; CHECK-DAG: sld.w [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_sld_w_test
+;
+@llvm_mips_sld_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sld_d_ARG2 = global i32 10, align 16
+@llvm_mips_sld_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sld_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_sld_d_ARG1
+  %1 = load i32* @llvm_mips_sld_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.sld.d(<2 x i64> %0, i32 %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_sld_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.sld.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_sld_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_d_ARG1)
+; CHECK-DAG: lw [[RT:\$[0-9]+]], %got(llvm_mips_sld_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
+; CHECK-DAG: sld.d [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_sld_d_test
+;
+@llvm_mips_sll_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sll_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_sll_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sll_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_sll_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_sll_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.sll.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_sll_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.sll.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_sll_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_sll_b_test
+;
+@llvm_mips_sll_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sll_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_sll_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sll_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_sll_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_sll_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.sll.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_sll_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.sll.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_sll_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_sll_h_test
+;
+@llvm_mips_sll_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sll_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_sll_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sll_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_sll_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_sll_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.sll.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_sll_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.sll.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_sll_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_sll_w_test
+;
+@llvm_mips_sll_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sll_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_sll_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sll_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_sll_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_sll_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.sll.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_sll_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.sll.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_sll_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_sll_d_test
+
+define void @sll_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_sll_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_sll_b_ARG2
+  %2 = shl <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_sll_b_RES
+  ret void
+}
+
+; CHECK: sll_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size sll_b_test
+
+define void @sll_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_sll_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_sll_h_ARG2
+  %2 = shl <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_sll_h_RES
+  ret void
+}
+
+; CHECK: sll_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size sll_h_test
+
+define void @sll_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_sll_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_sll_w_ARG2
+  %2 = shl <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_sll_w_RES
+  ret void
+}
+
+; CHECK: sll_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size sll_w_test
+
+define void @sll_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_sll_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_sll_d_ARG2
+  %2 = shl <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_sll_d_RES
+  ret void
+}
+
+; CHECK: sll_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size sll_d_test
+;
+@llvm_mips_sra_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sra_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_sra_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sra_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_sra_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_sra_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.sra.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_sra_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.sra.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_sra_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_sra_b_test
+;
+@llvm_mips_sra_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sra_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_sra_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sra_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_sra_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_sra_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.sra.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_sra_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.sra.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_sra_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_sra_h_test
+;
+@llvm_mips_sra_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sra_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_sra_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sra_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_sra_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_sra_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.sra.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_sra_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.sra.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_sra_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_sra_w_test
+;
+@llvm_mips_sra_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sra_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_sra_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sra_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_sra_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_sra_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.sra.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_sra_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.sra.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_sra_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_sra_d_test
+;
+
+define void @sra_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_sra_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_sra_b_ARG2
+  %2 = ashr <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_sra_b_RES
+  ret void
+}
+
+; CHECK: sra_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size sra_b_test
+
+define void @sra_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_sra_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_sra_h_ARG2
+  %2 = ashr <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_sra_h_RES
+  ret void
+}
+
+; CHECK: sra_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size sra_h_test
+
+define void @sra_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_sra_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_sra_w_ARG2
+  %2 = ashr <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_sra_w_RES
+  ret void
+}
+
+; CHECK: sra_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size sra_w_test
+
+define void @sra_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_sra_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_sra_d_ARG2
+  %2 = ashr <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_sra_d_RES
+  ret void
+}
+
+; CHECK: sra_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size sra_d_test
+
+@llvm_mips_srar_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srar_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_srar_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srar_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_srar_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_srar_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.srar.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_srar_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.srar.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_srar_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srar_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srar_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srar.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_srar_b_test
+;
+@llvm_mips_srar_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srar_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_srar_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srar_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_srar_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_srar_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.srar.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_srar_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.srar.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_srar_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srar_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srar_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srar.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_srar_h_test
+;
+@llvm_mips_srar_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srar_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_srar_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srar_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_srar_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_srar_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.srar.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_srar_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.srar.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_srar_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srar_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srar_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srar.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_srar_w_test
+;
+@llvm_mips_srar_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srar_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_srar_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srar_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_srar_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_srar_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.srar.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_srar_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.srar.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_srar_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srar_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srar_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srar.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_srar_d_test
+;
+@llvm_mips_srl_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srl_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_srl_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srl_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_srl_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_srl_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.srl.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_srl_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.srl.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_srl_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_srl_b_test
+;
+@llvm_mips_srl_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srl_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_srl_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srl_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_srl_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_srl_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.srl.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_srl_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.srl.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_srl_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_srl_h_test
+;
+@llvm_mips_srl_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srl_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_srl_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srl_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_srl_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_srl_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.srl.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_srl_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.srl.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_srl_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_srl_w_test
+;
+@llvm_mips_srl_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srl_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_srl_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srl_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_srl_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_srl_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.srl.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_srl_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.srl.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_srl_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_srl_d_test
+;
+@llvm_mips_srlr_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srlr_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_srlr_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srlr_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_srlr_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_srlr_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.srlr.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_srlr_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.srlr.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_srlr_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srlr_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srlr_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srlr.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_srlr_b_test
+;
+@llvm_mips_srlr_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srlr_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_srlr_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srlr_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_srlr_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_srlr_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.srlr.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_srlr_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.srlr.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_srlr_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srlr_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srlr_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srlr.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_srlr_h_test
+;
+@llvm_mips_srlr_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srlr_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_srlr_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srlr_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_srlr_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_srlr_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.srlr.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_srlr_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.srlr.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_srlr_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srlr_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srlr_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srlr.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_srlr_w_test
+;
+@llvm_mips_srlr_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srlr_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_srlr_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srlr_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_srlr_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_srlr_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.srlr.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_srlr_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.srlr.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_srlr_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srlr_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srlr_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srlr.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_srlr_d_test
+;
+
+define void @srl_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_srl_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_srl_b_ARG2
+  %2 = lshr <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_srl_b_RES
+  ret void
+}
+
+; CHECK: srl_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size srl_b_test
+
+define void @srl_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_srl_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_srl_h_ARG2
+  %2 = lshr <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_srl_h_RES
+  ret void
+}
+
+; CHECK: srl_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size srl_h_test
+
+define void @srl_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_srl_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_srl_w_ARG2
+  %2 = lshr <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_srl_w_RES
+  ret void
+}
+
+; CHECK: srl_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size srl_w_test
+
+define void @srl_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_srl_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_srl_d_ARG2
+  %2 = lshr <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_srl_d_RES
+  ret void
+}
+
+; CHECK: srl_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size srl_d_test
+
+@llvm_mips_subs_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subs_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_subs_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subs_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_subs_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_subs_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.subs.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_subs_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.subs.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_subs_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_subs_s_b_test
+;
+@llvm_mips_subs_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subs_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_subs_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subs_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_subs_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_subs_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.subs.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_subs_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.subs.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_subs_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_subs_s_h_test
+;
+@llvm_mips_subs_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subs_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_subs_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subs_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_subs_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_subs_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.subs.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_subs_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.subs.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_subs_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_subs_s_w_test
+;
+@llvm_mips_subs_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subs_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_subs_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subs_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_subs_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_subs_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.subs.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_subs_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.subs.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_subs_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_subs_s_d_test
+;
+@llvm_mips_subs_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subs_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_subs_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subs_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_subs_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_subs_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.subs.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_subs_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.subs.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_subs_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_subs_u_b_test
+;
+@llvm_mips_subs_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subs_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_subs_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subs_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_subs_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_subs_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.subs.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_subs_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.subs.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_subs_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_subs_u_h_test
+;
+@llvm_mips_subs_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subs_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_subs_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subs_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_subs_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_subs_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.subs.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_subs_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.subs.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_subs_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_subs_u_w_test
+;
+@llvm_mips_subs_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subs_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_subs_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subs_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_subs_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_subs_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.subs.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_subs_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.subs.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_subs_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_subs_u_d_test
+;
+@llvm_mips_subsus_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subsus_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_subsus_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subsus_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_subsus_u_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_subsus_u_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.subsus.u.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_subsus_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.subsus.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_subsus_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsus_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsus_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsus_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_subsus_u_b_test
+;
+@llvm_mips_subsus_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subsus_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_subsus_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subsus_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_subsus_u_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_subsus_u_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.subsus.u.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_subsus_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.subsus.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_subsus_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsus_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsus_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsus_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_subsus_u_h_test
+;
+@llvm_mips_subsus_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subsus_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_subsus_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subsus_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_subsus_u_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_subsus_u_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.subsus.u.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_subsus_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.subsus.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_subsus_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsus_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsus_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsus_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_subsus_u_w_test
+;
+@llvm_mips_subsus_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subsus_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_subsus_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subsus_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_subsus_u_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_subsus_u_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.subsus.u.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_subsus_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.subsus.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_subsus_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsus_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsus_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsus_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_subsus_u_d_test
+;
+@llvm_mips_subsuu_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subsuu_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_subsuu_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subsuu_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_subsuu_s_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_subsuu_s_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.subsuu.s.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_subsuu_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.subsuu.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_subsuu_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsuu_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsuu_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsuu_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_subsuu_s_b_test
+;
+@llvm_mips_subsuu_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subsuu_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_subsuu_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subsuu_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_subsuu_s_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_subsuu_s_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.subsuu.s.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_subsuu_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.subsuu.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_subsuu_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsuu_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsuu_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsuu_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_subsuu_s_h_test
+;
+@llvm_mips_subsuu_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subsuu_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_subsuu_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subsuu_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_subsuu_s_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_subsuu_s_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.subsuu.s.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_subsuu_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.subsuu.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_subsuu_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsuu_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsuu_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsuu_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_subsuu_s_w_test
+;
+@llvm_mips_subsuu_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subsuu_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_subsuu_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subsuu_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_subsuu_s_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_subsuu_s_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.subsuu.s.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_subsuu_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.subsuu.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_subsuu_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsuu_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsuu_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsuu_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_subsuu_s_d_test
+;
+@llvm_mips_subv_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subv_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_subv_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subv_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_subv_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_subv_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.subv.b(<16 x i8> %0, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_subv_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.subv.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_subv_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_subv_b_test
+;
+@llvm_mips_subv_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subv_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_subv_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subv_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_subv_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_subv_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.subv.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_subv_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.subv.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_subv_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_subv_h_test
+;
+@llvm_mips_subv_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subv_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_subv_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subv_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_subv_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_subv_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.subv.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_subv_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.subv.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_subv_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_subv_w_test
+;
+@llvm_mips_subv_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subv_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_subv_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subv_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_subv_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_subv_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.subv.d(<2 x i64> %0, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_subv_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.subv.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_subv_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_subv_d_test
+;
+
+define void @subv_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_subv_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_subv_b_ARG2
+  %2 = sub <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_subv_b_RES
+  ret void
+}
+
+; CHECK: subv_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size subv_b_test
+
+define void @subv_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_subv_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_subv_h_ARG2
+  %2 = sub <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_subv_h_RES
+  ret void
+}
+
+; CHECK: subv_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size subv_h_test
+
+define void @subv_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_subv_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_subv_w_ARG2
+  %2 = sub <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_subv_w_RES
+  ret void
+}
+
+; CHECK: subv_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size subv_w_test
+
+define void @subv_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_subv_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_subv_d_ARG2
+  %2 = sub <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_subv_d_RES
+  ret void
+}
+
+; CHECK: subv_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size subv_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-v.ll b/test/CodeGen/Mips/msa/3r-v.ll
new file mode 100644
index 0000000..c9693f9
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-v.ll
@@ -0,0 +1,105 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these so this covers those beginning with 'v'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_vshf_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_vshf_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_vshf_b_ARG3 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_vshf_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_vshf_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_vshf_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_vshf_b_ARG2
+  %2 = load <16 x i8>* @llvm_mips_vshf_b_ARG3
+  %3 = tail call <16 x i8> @llvm.mips.vshf.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* @llvm_mips_vshf_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.vshf.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_vshf_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: vshf.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_vshf_b_test
+;
+@llvm_mips_vshf_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_vshf_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_vshf_h_ARG3 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_vshf_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_vshf_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_vshf_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_vshf_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_vshf_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.vshf.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_vshf_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.vshf.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_vshf_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: vshf.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_vshf_h_test
+;
+@llvm_mips_vshf_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_vshf_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_vshf_w_ARG3 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_vshf_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_vshf_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_vshf_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_vshf_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_vshf_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.vshf.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_vshf_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.vshf.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_vshf_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: vshf.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_vshf_w_test
+;
+@llvm_mips_vshf_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_vshf_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_vshf_d_ARG3 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_vshf_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_vshf_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_vshf_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_vshf_d_ARG2
+  %2 = load <2 x i64>* @llvm_mips_vshf_d_ARG3
+  %3 = tail call <2 x i64> @llvm.mips.vshf.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* @llvm_mips_vshf_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.vshf.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_vshf_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: vshf.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_vshf_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r_4r.ll b/test/CodeGen/Mips/msa/3r_4r.ll
new file mode 100644
index 0000000..b7fd728
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r_4r.ll
@@ -0,0 +1,206 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format and
+; use the result as a third operand.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_maddv_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_maddv_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_maddv_b_ARG3 = global <16 x i8> <i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47>, align 16
+@llvm_mips_maddv_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_maddv_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_maddv_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_maddv_b_ARG2
+  %2 = load <16 x i8>* @llvm_mips_maddv_b_ARG3
+  %3 = tail call <16 x i8> @llvm.mips.maddv.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* @llvm_mips_maddv_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.maddv.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_maddv_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: maddv.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_maddv_b_test
+;
+@llvm_mips_maddv_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_maddv_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_maddv_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_maddv_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_maddv_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_maddv_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_maddv_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_maddv_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.maddv.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_maddv_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.maddv.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_maddv_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: maddv.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_maddv_h_test
+;
+@llvm_mips_maddv_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_maddv_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_maddv_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_maddv_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_maddv_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_maddv_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_maddv_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_maddv_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.maddv.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_maddv_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.maddv.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_maddv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: maddv.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_maddv_w_test
+;
+@llvm_mips_maddv_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_maddv_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_maddv_d_ARG3 = global <2 x i64> <i64 4, i64 5>, align 16
+@llvm_mips_maddv_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_maddv_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_maddv_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_maddv_d_ARG2
+  %2 = load <2 x i64>* @llvm_mips_maddv_d_ARG3
+  %3 = tail call <2 x i64> @llvm.mips.maddv.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* @llvm_mips_maddv_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.maddv.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_maddv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: maddv.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_maddv_d_test
+;
+@llvm_mips_msubv_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_msubv_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_msubv_b_ARG3 = global <16 x i8> <i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47>, align 16
+@llvm_mips_msubv_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_msubv_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_msubv_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_msubv_b_ARG2
+  %2 = load <16 x i8>* @llvm_mips_msubv_b_ARG3
+  %3 = tail call <16 x i8> @llvm.mips.msubv.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* @llvm_mips_msubv_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.msubv.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_msubv_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: msubv.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_msubv_b_test
+;
+@llvm_mips_msubv_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_msubv_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_msubv_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_msubv_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_msubv_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_msubv_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_msubv_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_msubv_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.msubv.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_msubv_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.msubv.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_msubv_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: msubv.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_msubv_h_test
+;
+@llvm_mips_msubv_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_msubv_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_msubv_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_msubv_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_msubv_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_msubv_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_msubv_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_msubv_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.msubv.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_msubv_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.msubv.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_msubv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: msubv.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_msubv_w_test
+;
+@llvm_mips_msubv_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_msubv_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_msubv_d_ARG3 = global <2 x i64> <i64 4, i64 5>, align 16
+@llvm_mips_msubv_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_msubv_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_msubv_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_msubv_d_ARG2
+  %2 = load <2 x i64>* @llvm_mips_msubv_d_ARG3
+  %3 = tail call <2 x i64> @llvm.mips.msubv.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* @llvm_mips_msubv_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.msubv.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_msubv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: msubv.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_msubv_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r_4r_widen.ll b/test/CodeGen/Mips/msa/3r_4r_widen.ll
new file mode 100644
index 0000000..7063e45
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r_4r_widen.ll
@@ -0,0 +1,307 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format and
+; use the result as a third operand and results in wider elements than the
+; operands had.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_dpadd_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_dpadd_s_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
+@llvm_mips_dpadd_s_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
+@llvm_mips_dpadd_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_dpadd_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_dpadd_s_h_ARG1
+  %1 = load <16 x i8>* @llvm_mips_dpadd_s_h_ARG2
+  %2 = load <16 x i8>* @llvm_mips_dpadd_s_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.dpadd.s.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_dpadd_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.dpadd.s.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dpadd_s_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.h
+; CHECK: dpadd_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dpadd_s_h_test
+;
+@llvm_mips_dpadd_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_dpadd_s_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
+@llvm_mips_dpadd_s_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
+@llvm_mips_dpadd_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_dpadd_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_dpadd_s_w_ARG1
+  %1 = load <8 x i16>* @llvm_mips_dpadd_s_w_ARG2
+  %2 = load <8 x i16>* @llvm_mips_dpadd_s_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.dpadd.s.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_dpadd_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.dpadd.s.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dpadd_s_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.w
+; CHECK: dpadd_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dpadd_s_w_test
+;
+@llvm_mips_dpadd_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_dpadd_s_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
+@llvm_mips_dpadd_s_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
+@llvm_mips_dpadd_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dpadd_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_dpadd_s_d_ARG1
+  %1 = load <4 x i32>* @llvm_mips_dpadd_s_d_ARG2
+  %2 = load <4 x i32>* @llvm_mips_dpadd_s_d_ARG3
+  %3 = tail call <2 x i64> @llvm.mips.dpadd.s.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
+  store <2 x i64> %3, <2 x i64>* @llvm_mips_dpadd_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.dpadd.s.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dpadd_s_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.d
+; CHECK: dpadd_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dpadd_s_d_test
+;
+@llvm_mips_dpadd_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_dpadd_u_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
+@llvm_mips_dpadd_u_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
+@llvm_mips_dpadd_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_dpadd_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_dpadd_u_h_ARG1
+  %1 = load <16 x i8>* @llvm_mips_dpadd_u_h_ARG2
+  %2 = load <16 x i8>* @llvm_mips_dpadd_u_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.dpadd.u.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_dpadd_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.dpadd.u.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dpadd_u_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.h
+; CHECK: dpadd_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dpadd_u_h_test
+;
+@llvm_mips_dpadd_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_dpadd_u_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
+@llvm_mips_dpadd_u_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
+@llvm_mips_dpadd_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_dpadd_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_dpadd_u_w_ARG1
+  %1 = load <8 x i16>* @llvm_mips_dpadd_u_w_ARG2
+  %2 = load <8 x i16>* @llvm_mips_dpadd_u_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.dpadd.u.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_dpadd_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.dpadd.u.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dpadd_u_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.w
+; CHECK: dpadd_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dpadd_u_w_test
+;
+@llvm_mips_dpadd_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_dpadd_u_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
+@llvm_mips_dpadd_u_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
+@llvm_mips_dpadd_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dpadd_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_dpadd_u_d_ARG1
+  %1 = load <4 x i32>* @llvm_mips_dpadd_u_d_ARG2
+  %2 = load <4 x i32>* @llvm_mips_dpadd_u_d_ARG3
+  %3 = tail call <2 x i64> @llvm.mips.dpadd.u.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
+  store <2 x i64> %3, <2 x i64>* @llvm_mips_dpadd_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.dpadd.u.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dpadd_u_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.d
+; CHECK: dpadd_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dpadd_u_d_test
+;
+@llvm_mips_dpsub_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_dpsub_s_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
+@llvm_mips_dpsub_s_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
+@llvm_mips_dpsub_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_dpsub_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_dpsub_s_h_ARG1
+  %1 = load <16 x i8>* @llvm_mips_dpsub_s_h_ARG2
+  %2 = load <16 x i8>* @llvm_mips_dpsub_s_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.dpsub.s.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_dpsub_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.dpsub.s.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dpsub_s_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.h
+; CHECK: dpsub_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dpsub_s_h_test
+;
+@llvm_mips_dpsub_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_dpsub_s_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
+@llvm_mips_dpsub_s_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
+@llvm_mips_dpsub_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_dpsub_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_dpsub_s_w_ARG1
+  %1 = load <8 x i16>* @llvm_mips_dpsub_s_w_ARG2
+  %2 = load <8 x i16>* @llvm_mips_dpsub_s_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.dpsub.s.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_dpsub_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.dpsub.s.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dpsub_s_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.w
+; CHECK: dpsub_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dpsub_s_w_test
+;
+@llvm_mips_dpsub_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_dpsub_s_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
+@llvm_mips_dpsub_s_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
+@llvm_mips_dpsub_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dpsub_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_dpsub_s_d_ARG1
+  %1 = load <4 x i32>* @llvm_mips_dpsub_s_d_ARG2
+  %2 = load <4 x i32>* @llvm_mips_dpsub_s_d_ARG3
+  %3 = tail call <2 x i64> @llvm.mips.dpsub.s.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
+  store <2 x i64> %3, <2 x i64>* @llvm_mips_dpsub_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.dpsub.s.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dpsub_s_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.d
+; CHECK: dpsub_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dpsub_s_d_test
+;
+@llvm_mips_dpsub_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_dpsub_u_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
+@llvm_mips_dpsub_u_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
+@llvm_mips_dpsub_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_dpsub_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_dpsub_u_h_ARG1
+  %1 = load <16 x i8>* @llvm_mips_dpsub_u_h_ARG2
+  %2 = load <16 x i8>* @llvm_mips_dpsub_u_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.dpsub.u.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_dpsub_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.dpsub.u.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dpsub_u_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.h
+; CHECK: dpsub_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dpsub_u_h_test
+;
+@llvm_mips_dpsub_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_dpsub_u_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
+@llvm_mips_dpsub_u_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
+@llvm_mips_dpsub_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_dpsub_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_dpsub_u_w_ARG1
+  %1 = load <8 x i16>* @llvm_mips_dpsub_u_w_ARG2
+  %2 = load <8 x i16>* @llvm_mips_dpsub_u_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.dpsub.u.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_dpsub_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.dpsub.u.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dpsub_u_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.w
+; CHECK: dpsub_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dpsub_u_w_test
+;
+@llvm_mips_dpsub_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_dpsub_u_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
+@llvm_mips_dpsub_u_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
+@llvm_mips_dpsub_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dpsub_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_dpsub_u_d_ARG1
+  %1 = load <4 x i32>* @llvm_mips_dpsub_u_d_ARG2
+  %2 = load <4 x i32>* @llvm_mips_dpsub_u_d_ARG3
+  %3 = tail call <2 x i64> @llvm.mips.dpsub.u.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
+  store <2 x i64> %3, <2 x i64>* @llvm_mips_dpsub_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.dpsub.u.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dpsub_u_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.d
+; CHECK: dpsub_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dpsub_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r_splat.ll b/test/CodeGen/Mips/msa/3r_splat.ll
new file mode 100644
index 0000000..6b0cb26
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r_splat.ll
@@ -0,0 +1,94 @@
+; Test the MSA splat intrinsics that are encoded with the 3R instruction
+; format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | \
+; RUN:     FileCheck -check-prefix=MIPS32 %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | \
+; RUN:     FileCheck -check-prefix=MIPS32 %s
+
+@llvm_mips_splat_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_splat_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_splat_b_test(i32 %a) nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_splat_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.splat.b(<16 x i8> %0, i32 %a)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_splat_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.splat.b(<16 x i8>, i32) nounwind
+
+; MIPS32: llvm_mips_splat_b_test:
+; MIPS32-DAG: lw   [[R1:\$[0-9]+]], %got(llvm_mips_splat_b_ARG1)(
+; MIPS32-DAG: lw   [[R2:\$[0-9]+]], %got(llvm_mips_splat_b_RES)(
+; MIPS32-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS32-DAG: splat.b [[R4:\$w[0-9]+]], [[R3]][$4]
+; MIPS32-DAG: st.b [[R4]], 0([[R2]])
+; MIPS32: .size llvm_mips_splat_b_test
+
+@llvm_mips_splat_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_splat_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_splat_h_test(i32 %a) nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_splat_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.splat.h(<8 x i16> %0, i32 %a)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_splat_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.splat.h(<8 x i16>, i32) nounwind
+
+; MIPS32: llvm_mips_splat_h_test:
+; MIPS32-DAG: lw   [[R1:\$[0-9]+]], %got(llvm_mips_splat_h_ARG1)(
+; MIPS32-DAG: lw   [[R2:\$[0-9]+]], %got(llvm_mips_splat_h_RES)(
+; MIPS32-DAG: ld.h [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS32-DAG: splat.h [[R4:\$w[0-9]+]], [[R3]][$4]
+; MIPS32-DAG: st.h [[R4]], 0([[R2]])
+; MIPS32: .size llvm_mips_splat_h_test
+
+@llvm_mips_splat_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_splat_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_splat_w_test(i32 %a) nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_splat_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.splat.w(<4 x i32> %0, i32 %a)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_splat_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.splat.w(<4 x i32>, i32) nounwind
+
+; MIPS32: llvm_mips_splat_w_test:
+; MIPS32-DAG: lw   [[R1:\$[0-9]+]], %got(llvm_mips_splat_w_ARG1)(
+; MIPS32-DAG: lw   [[R2:\$[0-9]+]], %got(llvm_mips_splat_w_RES)(
+; MIPS32-DAG: ld.w [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS32-DAG: splat.w [[R4:\$w[0-9]+]], [[R3]][$4]
+; MIPS32-DAG: st.w [[R4]], 0([[R2]])
+; MIPS32: .size llvm_mips_splat_w_test
+
+@llvm_mips_splat_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_splat_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_splat_d_test(i32 %a) nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_splat_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.splat.d(<2 x i64> %0, i32 %a)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_splat_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.splat.d(<2 x i64>, i32) nounwind
+
+; MIPS32: llvm_mips_splat_d_test:
+; FIXME: This test is currently disabled for MIPS32 because the indices are
+;        difficult to match. This is because 64-bit values cannot be stored in
+;        GPR32.
+; MIPS64-DAG: lw   [[R1:\$[0-9]+]], %got(llvm_mips_splat_d_ARG1)(
+; MIPS64-DAG: lw   [[R2:\$[0-9]+]], %got(llvm_mips_splat_d_RES)(
+; MIPS64-DAG: ld.d [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS64-DAG: splat.d [[R4:\$w[0-9]+]], [[R3]][$4]
+; MIPS64-DAG: st.d [[R4]], 0([[R2]])
+; MIPS32: .size llvm_mips_splat_d_test
diff --git a/test/CodeGen/Mips/msa/3rf.ll b/test/CodeGen/Mips/msa/3rf.ll
new file mode 100644
index 0000000..ae665af
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf.ll
@@ -0,0 +1,485 @@
+; Test the MSA intrinsics that are encoded with the 3RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fadd_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fadd_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fadd_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fadd_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fadd_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fadd_w_ARG2
+  %2 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %1)
+  store <4 x float> %2, <4 x float>* @llvm_mips_fadd_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fadd.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fadd_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fadd.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fadd_w_test
+;
+@llvm_mips_fadd_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fadd_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fadd_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fadd_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fadd_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fadd_d_ARG2
+  %2 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %1)
+  store <2 x double> %2, <2 x double>* @llvm_mips_fadd_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fadd.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fadd_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fadd.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fadd_d_test
+
+define void @fadd_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fadd_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fadd_w_ARG2
+  %2 = fadd <4 x float> %0, %1
+  store <4 x float> %2, <4 x float>* @llvm_mips_fadd_w_RES
+  ret void
+}
+
+; CHECK: fadd_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fadd.w
+; CHECK: st.w
+; CHECK: .size fadd_w_test
+
+define void @fadd_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fadd_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fadd_d_ARG2
+  %2 = fadd <2 x double> %0, %1
+  store <2 x double> %2, <2 x double>* @llvm_mips_fadd_d_RES
+  ret void
+}
+
+; CHECK: fadd_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fadd.d
+; CHECK: st.d
+; CHECK: .size fadd_d_test
+;
+@llvm_mips_fdiv_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fdiv_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fdiv_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fdiv_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fdiv_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fdiv_w_ARG2
+  %2 = tail call <4 x float> @llvm.mips.fdiv.w(<4 x float> %0, <4 x float> %1)
+  store <4 x float> %2, <4 x float>* @llvm_mips_fdiv_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fdiv.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fdiv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fdiv.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fdiv_w_test
+;
+@llvm_mips_fdiv_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fdiv_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fdiv_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fdiv_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fdiv_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fdiv_d_ARG2
+  %2 = tail call <2 x double> @llvm.mips.fdiv.d(<2 x double> %0, <2 x double> %1)
+  store <2 x double> %2, <2 x double>* @llvm_mips_fdiv_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fdiv.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fdiv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fdiv.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fdiv_d_test
+
+define void @fdiv_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fdiv_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fdiv_w_ARG2
+  %2 = fdiv <4 x float> %0, %1
+  store <4 x float> %2, <4 x float>* @llvm_mips_fdiv_w_RES
+  ret void
+}
+
+; CHECK: fdiv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fdiv.w
+; CHECK: st.w
+; CHECK: .size fdiv_w_test
+
+define void @fdiv_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fdiv_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fdiv_d_ARG2
+  %2 = fdiv <2 x double> %0, %1
+  store <2 x double> %2, <2 x double>* @llvm_mips_fdiv_d_RES
+  ret void
+}
+
+; CHECK: fdiv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fdiv.d
+; CHECK: st.d
+; CHECK: .size fdiv_d_test
+;
+@llvm_mips_fmin_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmin_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmin_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmin_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fmin_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fmin_w_ARG2
+  %2 = tail call <4 x float> @llvm.mips.fmin.w(<4 x float> %0, <4 x float> %1)
+  store <4 x float> %2, <4 x float>* @llvm_mips_fmin_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fmin.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmin_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmin.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmin_w_test
+;
+@llvm_mips_fmin_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmin_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmin_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmin_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fmin_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fmin_d_ARG2
+  %2 = tail call <2 x double> @llvm.mips.fmin.d(<2 x double> %0, <2 x double> %1)
+  store <2 x double> %2, <2 x double>* @llvm_mips_fmin_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fmin.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmin_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmin.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmin_d_test
+;
+@llvm_mips_fmin_a_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmin_a_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmin_a_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmin_a_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fmin_a_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fmin_a_w_ARG2
+  %2 = tail call <4 x float> @llvm.mips.fmin.a.w(<4 x float> %0, <4 x float> %1)
+  store <4 x float> %2, <4 x float>* @llvm_mips_fmin_a_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fmin.a.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmin_a_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmin_a.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmin_a_w_test
+;
+@llvm_mips_fmin_a_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmin_a_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmin_a_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmin_a_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fmin_a_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fmin_a_d_ARG2
+  %2 = tail call <2 x double> @llvm.mips.fmin.a.d(<2 x double> %0, <2 x double> %1)
+  store <2 x double> %2, <2 x double>* @llvm_mips_fmin_a_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fmin.a.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmin_a_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmin_a.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmin_a_d_test
+;
+@llvm_mips_fmax_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmax_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmax_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmax_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fmax_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fmax_w_ARG2
+  %2 = tail call <4 x float> @llvm.mips.fmax.w(<4 x float> %0, <4 x float> %1)
+  store <4 x float> %2, <4 x float>* @llvm_mips_fmax_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fmax.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmax_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmax.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmax_w_test
+;
+@llvm_mips_fmax_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmax_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmax_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmax_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fmax_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fmax_d_ARG2
+  %2 = tail call <2 x double> @llvm.mips.fmax.d(<2 x double> %0, <2 x double> %1)
+  store <2 x double> %2, <2 x double>* @llvm_mips_fmax_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fmax.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmax_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmax.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmax_d_test
+;
+@llvm_mips_fmax_a_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmax_a_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmax_a_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmax_a_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fmax_a_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fmax_a_w_ARG2
+  %2 = tail call <4 x float> @llvm.mips.fmax.a.w(<4 x float> %0, <4 x float> %1)
+  store <4 x float> %2, <4 x float>* @llvm_mips_fmax_a_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fmax.a.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmax_a_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmax_a.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmax_a_w_test
+;
+@llvm_mips_fmax_a_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmax_a_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmax_a_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmax_a_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fmax_a_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fmax_a_d_ARG2
+  %2 = tail call <2 x double> @llvm.mips.fmax.a.d(<2 x double> %0, <2 x double> %1)
+  store <2 x double> %2, <2 x double>* @llvm_mips_fmax_a_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fmax.a.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmax_a_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmax_a.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmax_a_d_test
+;
+@llvm_mips_fmul_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmul_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmul_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmul_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fmul_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fmul_w_ARG2
+  %2 = tail call <4 x float> @llvm.mips.fmul.w(<4 x float> %0, <4 x float> %1)
+  store <4 x float> %2, <4 x float>* @llvm_mips_fmul_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fmul.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmul_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmul.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmul_w_test
+;
+@llvm_mips_fmul_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmul_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmul_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmul_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fmul_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fmul_d_ARG2
+  %2 = tail call <2 x double> @llvm.mips.fmul.d(<2 x double> %0, <2 x double> %1)
+  store <2 x double> %2, <2 x double>* @llvm_mips_fmul_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fmul.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmul_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmul.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmul_d_test
+
+define void @fmul_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fmul_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fmul_w_ARG2
+  %2 = fmul <4 x float> %0, %1
+  store <4 x float> %2, <4 x float>* @llvm_mips_fmul_w_RES
+  ret void
+}
+
+; CHECK: fmul_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmul.w
+; CHECK: st.w
+; CHECK: .size fmul_w_test
+
+define void @fmul_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fmul_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fmul_d_ARG2
+  %2 = fmul <2 x double> %0, %1
+  store <2 x double> %2, <2 x double>* @llvm_mips_fmul_d_RES
+  ret void
+}
+
+; CHECK: fmul_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmul.d
+; CHECK: st.d
+; CHECK: .size fmul_d_test
+;
+@llvm_mips_fsub_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsub_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsub_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fsub_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsub_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsub_w_ARG2
+  %2 = tail call <4 x float> @llvm.mips.fsub.w(<4 x float> %0, <4 x float> %1)
+  store <4 x float> %2, <4 x float>* @llvm_mips_fsub_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fsub.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsub_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsub.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsub_w_test
+;
+@llvm_mips_fsub_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsub_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsub_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fsub_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsub_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsub_d_ARG2
+  %2 = tail call <2 x double> @llvm.mips.fsub.d(<2 x double> %0, <2 x double> %1)
+  store <2 x double> %2, <2 x double>* @llvm_mips_fsub_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fsub.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsub_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsub.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsub_d_test
+;
+
+define void @fsub_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsub_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsub_w_ARG2
+  %2 = fsub <4 x float> %0, %1
+  store <4 x float> %2, <4 x float>* @llvm_mips_fsub_w_RES
+  ret void
+}
+
+; CHECK: fsub_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsub.w
+; CHECK: st.w
+; CHECK: .size fsub_w_test
+
+define void @fsub_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsub_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsub_d_ARG2
+  %2 = fsub <2 x double> %0, %1
+  store <2 x double> %2, <2 x double>* @llvm_mips_fsub_d_RES
+  ret void
+}
+
+; CHECK: fsub_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsub.d
+; CHECK: st.d
+; CHECK: .size fsub_d_test
diff --git a/test/CodeGen/Mips/msa/3rf_4rf.ll b/test/CodeGen/Mips/msa/3rf_4rf.ll
new file mode 100644
index 0000000..67ef7fd
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_4rf.ll
@@ -0,0 +1,106 @@
+; Test the MSA intrinsics that are encoded with the 3RF instruction format and
+; use the result as a third operand.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fmadd_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmadd_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmadd_w_ARG3 = global <4 x float> <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>, align 16
+@llvm_mips_fmadd_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmadd_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fmadd_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fmadd_w_ARG2
+  %2 = load <4 x float>* @llvm_mips_fmadd_w_ARG3
+  %3 = tail call <4 x float> @llvm.mips.fmadd.w(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+  store <4 x float> %3, <4 x float>* @llvm_mips_fmadd_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fmadd.w(<4 x float>, <4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmadd_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmadd.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmadd_w_test
+;
+@llvm_mips_fmadd_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmadd_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmadd_d_ARG3 = global <2 x double> <double 4.000000e+00, double 5.000000e+00>, align 16
+@llvm_mips_fmadd_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmadd_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fmadd_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fmadd_d_ARG2
+  %2 = load <2 x double>* @llvm_mips_fmadd_d_ARG3
+  %3 = tail call <2 x double> @llvm.mips.fmadd.d(<2 x double> %0, <2 x double> %1, <2 x double> %2)
+  store <2 x double> %3, <2 x double>* @llvm_mips_fmadd_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fmadd.d(<2 x double>, <2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmadd_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmadd.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmadd_d_test
+;
+@llvm_mips_fmsub_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmsub_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmsub_w_ARG3 = global <4 x float> <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>, align 16
+@llvm_mips_fmsub_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmsub_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fmsub_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fmsub_w_ARG2
+  %2 = load <4 x float>* @llvm_mips_fmsub_w_ARG3
+  %3 = tail call <4 x float> @llvm.mips.fmsub.w(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+  store <4 x float> %3, <4 x float>* @llvm_mips_fmsub_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fmsub.w(<4 x float>, <4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmsub_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmsub.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmsub_w_test
+;
+@llvm_mips_fmsub_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmsub_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmsub_d_ARG3 = global <2 x double> <double 4.000000e+00, double 5.000000e+00>, align 16
+@llvm_mips_fmsub_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmsub_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fmsub_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fmsub_d_ARG2
+  %2 = load <2 x double>* @llvm_mips_fmsub_d_ARG3
+  %3 = tail call <2 x double> @llvm.mips.fmsub.d(<2 x double> %0, <2 x double> %1, <2 x double> %2)
+  store <2 x double> %3, <2 x double>* @llvm_mips_fmsub_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fmsub.d(<2 x double>, <2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmsub_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmsub.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmsub_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3rf_4rf_q.ll b/test/CodeGen/Mips/msa/3rf_4rf_q.ll
new file mode 100644
index 0000000..de28be0
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_4rf_q.ll
@@ -0,0 +1,206 @@
+; Test the MSA intrinsics that are encoded with the 3RF instruction format and
+; use the result as a third operand and perform fixed-point operations.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_madd_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_madd_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_madd_q_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_madd_q_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_madd_q_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_madd_q_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_madd_q_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_madd_q_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.madd.q.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_madd_q_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.madd.q.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_madd_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: madd_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_madd_q_h_test
+;
+@llvm_mips_madd_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_madd_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_madd_q_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_madd_q_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_madd_q_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_madd_q_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_madd_q_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_madd_q_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.madd.q.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_madd_q_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.madd.q.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_madd_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: madd_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_madd_q_w_test
+;
+@llvm_mips_maddr_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_maddr_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_maddr_q_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_maddr_q_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_maddr_q_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_maddr_q_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_maddr_q_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_maddr_q_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.maddr.q.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_maddr_q_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.maddr.q.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_maddr_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: maddr_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_maddr_q_h_test
+;
+@llvm_mips_maddr_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_maddr_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_maddr_q_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_maddr_q_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_maddr_q_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_maddr_q_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_maddr_q_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_maddr_q_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.maddr.q.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_maddr_q_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.maddr.q.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_maddr_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: maddr_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_maddr_q_w_test
+;
+@llvm_mips_msub_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_msub_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_msub_q_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_msub_q_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_msub_q_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_msub_q_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_msub_q_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_msub_q_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.msub.q.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_msub_q_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.msub.q.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_msub_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: msub_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_msub_q_h_test
+;
+@llvm_mips_msub_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_msub_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_msub_q_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_msub_q_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_msub_q_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_msub_q_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_msub_q_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_msub_q_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.msub.q.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_msub_q_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.msub.q.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_msub_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: msub_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_msub_q_w_test
+;
+@llvm_mips_msubr_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_msubr_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_msubr_q_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_msubr_q_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_msubr_q_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_msubr_q_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_msubr_q_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_msubr_q_h_ARG3
+  %3 = tail call <8 x i16> @llvm.mips.msubr.q.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* @llvm_mips_msubr_q_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.msubr.q.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_msubr_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: msubr_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_msubr_q_h_test
+;
+@llvm_mips_msubr_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_msubr_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_msubr_q_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_msubr_q_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_msubr_q_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_msubr_q_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_msubr_q_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_msubr_q_w_ARG3
+  %3 = tail call <4 x i32> @llvm.mips.msubr.q.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* @llvm_mips_msubr_q_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.msubr.q.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_msubr_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: msubr_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_msubr_q_w_test
+;
diff --git a/test/CodeGen/Mips/msa/3rf_exdo.ll b/test/CodeGen/Mips/msa/3rf_exdo.ll
new file mode 100644
index 0000000..8a7f268
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_exdo.ll
@@ -0,0 +1,50 @@
+; Test the MSA floating-point conversion intrinsics that are encoded with the
+; 3RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fexdo_h_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fexdo_h_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fexdo_h_RES  = global <8 x half> <half 0.000000e+00, half 0.000000e+00, half 0.000000e+00, half 0.000000e+00, half 0.000000e+00, half 0.000000e+00, half 0.000000e+00, half 0.000000e+00>, align 16
+
+define void @llvm_mips_fexdo_h_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fexdo_h_ARG1
+  %1 = load <4 x float>* @llvm_mips_fexdo_h_ARG2
+  %2 = tail call <8 x half> @llvm.mips.fexdo.h(<4 x float> %0, <4 x float> %1)
+  store <8 x half> %2, <8 x half>* @llvm_mips_fexdo_h_RES
+  ret void
+}
+
+declare <8 x half> @llvm.mips.fexdo.h(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fexdo_h_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fexdo.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_fexdo_h_test
+;
+@llvm_mips_fexdo_w_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fexdo_w_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fexdo_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fexdo_w_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fexdo_w_ARG1
+  %1 = load <2 x double>* @llvm_mips_fexdo_w_ARG2
+  %2 = tail call <4 x float> @llvm.mips.fexdo.w(<2 x double> %0, <2 x double> %1)
+  store <4 x float> %2, <4 x float>* @llvm_mips_fexdo_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fexdo.w(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fexdo_w_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fexdo.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fexdo_w_test
+;
diff --git a/test/CodeGen/Mips/msa/3rf_float_int.ll b/test/CodeGen/Mips/msa/3rf_float_int.ll
new file mode 100644
index 0000000..7b01e17
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_float_int.ll
@@ -0,0 +1,50 @@
+; Test the MSA intrinsics that are encoded with the 3RF instruction format and
+; take an integer as an operand.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fexp2_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fexp2_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_fexp2_w_RES  = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fexp2_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fexp2_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_fexp2_w_ARG2
+  %2 = tail call <4 x float> @llvm.mips.fexp2.w(<4 x float> %0, <4 x i32> %1)
+  store <4 x float> %2, <4 x float>* @llvm_mips_fexp2_w_RES
+  ret void
+}
+
+declare <4 x float> @llvm.mips.fexp2.w(<4 x float>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_fexp2_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fexp2.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fexp2_w_test
+;
+@llvm_mips_fexp2_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fexp2_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_fexp2_d_RES  = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fexp2_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fexp2_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_fexp2_d_ARG2
+  %2 = tail call <2 x double> @llvm.mips.fexp2.d(<2 x double> %0, <2 x i64> %1)
+  store <2 x double> %2, <2 x double>* @llvm_mips_fexp2_d_RES
+  ret void
+}
+
+declare <2 x double> @llvm.mips.fexp2.d(<2 x double>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_fexp2_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fexp2.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fexp2_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3rf_int_float.ll b/test/CodeGen/Mips/msa/3rf_int_float.ll
new file mode 100644
index 0000000..5624771
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_int_float.ll
@@ -0,0 +1,974 @@
+; Test the MSA intrinsics that are encoded with the 3RF instruction format and
+; produce an integer as a result.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fcaf_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcaf_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcaf_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcaf_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fcaf_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fcaf_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fcaf.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fcaf_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fcaf.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcaf_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcaf.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcaf_w_test
+;
+@llvm_mips_fcaf_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcaf_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcaf_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcaf_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fcaf_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fcaf_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fcaf.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fcaf_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fcaf.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcaf_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcaf.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcaf_d_test
+;
+@llvm_mips_fceq_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fceq_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fceq_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fceq_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fceq_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fceq_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fceq.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fceq_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fceq.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fceq_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fceq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fceq_w_test
+;
+@llvm_mips_fceq_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fceq_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fceq_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fceq_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fceq_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fceq_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fceq.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fceq_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fceq.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fceq_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fceq.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fceq_d_test
+;
+@llvm_mips_fcle_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcle_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcle_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcle_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fcle_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fcle_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fcle.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fcle_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fcle.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcle_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcle.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcle_w_test
+;
+@llvm_mips_fcle_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcle_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcle_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcle_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fcle_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fcle_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fcle.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fcle_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fcle.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcle_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcle.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcle_d_test
+;
+@llvm_mips_fclt_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fclt_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fclt_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fclt_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fclt_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fclt_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fclt.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fclt_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fclt.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fclt_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fclt.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fclt_w_test
+;
+@llvm_mips_fclt_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fclt_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fclt_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fclt_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fclt_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fclt_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fclt.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fclt_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fclt.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fclt_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fclt.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fclt_d_test
+;
+@llvm_mips_fcor_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcor_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcor_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcor_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fcor_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fcor_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fcor.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fcor_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fcor.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcor_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcor.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcor_w_test
+;
+@llvm_mips_fcor_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcor_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcor_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcor_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fcor_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fcor_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fcor.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fcor_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fcor.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcor_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcor.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcor_d_test
+;
+@llvm_mips_fcne_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcne_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcne_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcne_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fcne_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fcne_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fcne.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fcne_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fcne.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcne_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcne.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcne_w_test
+;
+@llvm_mips_fcne_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcne_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcne_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcne_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fcne_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fcne_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fcne.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fcne_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fcne.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcne_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcne.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcne_d_test
+;
+@llvm_mips_fcueq_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcueq_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcueq_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcueq_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fcueq_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fcueq_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fcueq.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fcueq_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fcueq.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcueq_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcueq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcueq_w_test
+;
+@llvm_mips_fcueq_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcueq_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcueq_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcueq_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fcueq_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fcueq_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fcueq.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fcueq_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fcueq.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcueq_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcueq.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcueq_d_test
+;
+@llvm_mips_fcult_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcult_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcult_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcult_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fcult_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fcult_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fcult.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fcult_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fcult.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcult_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcult.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcult_w_test
+;
+@llvm_mips_fcult_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcult_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcult_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcult_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fcult_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fcult_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fcult.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fcult_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fcult.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcult_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcult.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcult_d_test
+;
+@llvm_mips_fcule_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcule_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcule_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcule_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fcule_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fcule_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fcule.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fcule_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fcule.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcule_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcule.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcule_w_test
+;
+@llvm_mips_fcule_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcule_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcule_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcule_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fcule_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fcule_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fcule.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fcule_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fcule.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcule_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcule.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcule_d_test
+;
+@llvm_mips_fcun_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcun_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcun_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcun_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fcun_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fcun_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fcun.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fcun_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fcun.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcun_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcun.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcun_w_test
+;
+@llvm_mips_fcun_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcun_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcun_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcun_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fcun_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fcun_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fcun.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fcun_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fcun.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcun_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcun.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcun_d_test
+;
+@llvm_mips_fcune_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcune_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcune_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcune_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fcune_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fcune_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fcune.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fcune_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fcune.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcune_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcune.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcune_w_test
+;
+@llvm_mips_fcune_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcune_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcune_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcune_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fcune_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fcune_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fcune.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fcune_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fcune.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcune_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcune.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcune_d_test
+;
+@llvm_mips_fsaf_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsaf_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsaf_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsaf_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsaf_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsaf_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fsaf.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fsaf_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fsaf.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsaf_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsaf.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsaf_w_test
+;
+@llvm_mips_fsaf_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsaf_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsaf_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsaf_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsaf_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsaf_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fsaf.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fsaf_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fsaf.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsaf_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsaf.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsaf_d_test
+;
+@llvm_mips_fseq_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fseq_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fseq_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fseq_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fseq_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fseq_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fseq.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fseq_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fseq.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fseq_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fseq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fseq_w_test
+;
+@llvm_mips_fseq_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fseq_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fseq_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fseq_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fseq_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fseq_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fseq.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fseq_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fseq.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fseq_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fseq.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fseq_d_test
+;
+@llvm_mips_fsle_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsle_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsle_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsle_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsle_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsle_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fsle.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fsle_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fsle.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsle_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsle.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsle_w_test
+;
+@llvm_mips_fsle_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsle_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsle_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsle_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsle_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsle_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fsle.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fsle_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fsle.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsle_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsle.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsle_d_test
+;
+@llvm_mips_fslt_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fslt_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fslt_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fslt_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fslt_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fslt_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fslt.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fslt_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fslt.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fslt_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fslt.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fslt_w_test
+;
+@llvm_mips_fslt_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fslt_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fslt_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fslt_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fslt_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fslt_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fslt.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fslt_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fslt.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fslt_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fslt.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fslt_d_test
+;
+@llvm_mips_fsor_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsor_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsor_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsor_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsor_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsor_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fsor.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fsor_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fsor.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsor_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsor.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsor_w_test
+;
+@llvm_mips_fsor_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsor_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsor_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsor_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsor_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsor_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fsor.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fsor_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fsor.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsor_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsor.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsor_d_test
+;
+@llvm_mips_fsne_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsne_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsne_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsne_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsne_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsne_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fsne.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fsne_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fsne.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsne_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsne.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsne_w_test
+;
+@llvm_mips_fsne_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsne_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsne_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsne_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsne_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsne_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fsne.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fsne_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fsne.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsne_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsne.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsne_d_test
+;
+@llvm_mips_fsueq_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsueq_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsueq_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsueq_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsueq_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsueq_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fsueq.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fsueq_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fsueq.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsueq_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsueq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsueq_w_test
+;
+@llvm_mips_fsueq_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsueq_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsueq_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsueq_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsueq_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsueq_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fsueq.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fsueq_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fsueq.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsueq_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsueq.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsueq_d_test
+;
+@llvm_mips_fsult_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsult_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsult_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsult_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsult_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsult_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fsult.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fsult_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fsult.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsult_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsult.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsult_w_test
+;
+@llvm_mips_fsult_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsult_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsult_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsult_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsult_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsult_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fsult.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fsult_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fsult.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsult_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsult.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsult_d_test
+;
+@llvm_mips_fsule_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsule_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsule_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsule_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsule_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsule_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fsule.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fsule_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fsule.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsule_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsule.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsule_w_test
+;
+@llvm_mips_fsule_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsule_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsule_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsule_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsule_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsule_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fsule.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fsule_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fsule.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsule_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsule.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsule_d_test
+;
+@llvm_mips_fsun_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsun_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsun_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsun_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsun_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsun_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fsun.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fsun_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fsun.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsun_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsun.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsun_w_test
+;
+@llvm_mips_fsun_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsun_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsun_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsun_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsun_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsun_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fsun.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fsun_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fsun.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsun_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsun.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsun_d_test
+;
+@llvm_mips_fsune_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsune_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsune_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsune_w_test() nounwind {
+entry:
+  %0 = load <4 x float>* @llvm_mips_fsune_w_ARG1
+  %1 = load <4 x float>* @llvm_mips_fsune_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.fsune.w(<4 x float> %0, <4 x float> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_fsune_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.fsune.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsune_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsune.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsune_w_test
+;
+@llvm_mips_fsune_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsune_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsune_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsune_d_test() nounwind {
+entry:
+  %0 = load <2 x double>* @llvm_mips_fsune_d_ARG1
+  %1 = load <2 x double>* @llvm_mips_fsune_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.fsune.d(<2 x double> %0, <2 x double> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_fsune_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.fsune.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsune_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsune.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsune_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3rf_q.ll b/test/CodeGen/Mips/msa/3rf_q.ll
new file mode 100644
index 0000000..f7000ee
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_q.ll
@@ -0,0 +1,94 @@
+; Test the MSA fixed-point intrinsics that are encoded with the 3RF instruction
+; format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_mul_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mul_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_mul_q_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mul_q_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_mul_q_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_mul_q_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.mul.q.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_mul_q_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.mul.q.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_mul_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mul_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mul_q_h_test
+;
+@llvm_mips_mul_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mul_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_mul_q_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mul_q_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_mul_q_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_mul_q_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.mul.q.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_mul_q_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.mul.q.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_mul_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mul_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mul_q_w_test
+;
+@llvm_mips_mulr_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mulr_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_mulr_q_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mulr_q_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_mulr_q_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_mulr_q_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.mulr.q.h(<8 x i16> %0, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_mulr_q_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.mulr.q.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_mulr_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mulr_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mulr_q_h_test
+;
+@llvm_mips_mulr_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mulr_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_mulr_q_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mulr_q_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_mulr_q_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_mulr_q_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.mulr.q.w(<4 x i32> %0, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_mulr_q_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.mulr.q.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_mulr_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mulr_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mulr_q_w_test
+;
diff --git a/test/CodeGen/Mips/msa/arithmetic.ll b/test/CodeGen/Mips/msa/arithmetic.ll
new file mode 100644
index 0000000..09ee502
--- /dev/null
+++ b/test/CodeGen/Mips/msa/arithmetic.ll
@@ -0,0 +1,726 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @add_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: add_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = add <16 x i8> %1, %2
+  ; CHECK-DAG: addv.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size add_v16i8
+}
+
+define void @add_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: add_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = add <8 x i16> %1, %2
+  ; CHECK-DAG: addv.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size add_v8i16
+}
+
+define void @add_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: add_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = add <4 x i32> %1, %2
+  ; CHECK-DAG: addv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size add_v4i32
+}
+
+define void @add_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: add_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = add <2 x i64> %1, %2
+  ; CHECK-DAG: addv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size add_v2i64
+}
+
+define void @add_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: add_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = add <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+                          i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: addvi.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size add_v16i8_i
+}
+
+define void @add_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: add_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = add <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1,
+                          i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: addvi.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size add_v8i16_i
+}
+
+define void @add_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: add_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = add <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: addvi.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size add_v4i32_i
+}
+
+define void @add_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: add_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = add <2 x i64> %1, <i64 1, i64 1>
+  ; CHECK-DAG: addvi.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size add_v2i64_i
+}
+
+define void @sub_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: sub_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = sub <16 x i8> %1, %2
+  ; CHECK-DAG: subv.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sub_v16i8
+}
+
+define void @sub_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: sub_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = sub <8 x i16> %1, %2
+  ; CHECK-DAG: subv.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sub_v8i16
+}
+
+define void @sub_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: sub_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = sub <4 x i32> %1, %2
+  ; CHECK-DAG: subv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sub_v4i32
+}
+
+define void @sub_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: sub_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = sub <2 x i64> %1, %2
+  ; CHECK-DAG: subv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sub_v2i64
+}
+
+define void @sub_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: sub_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = sub <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+                          i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: subvi.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sub_v16i8_i
+}
+
+define void @sub_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: sub_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = sub <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1,
+                          i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: subvi.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sub_v8i16_i
+}
+
+define void @sub_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: sub_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = sub <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: subvi.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sub_v4i32_i
+}
+
+define void @sub_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: sub_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = sub <2 x i64> %1, <i64 1, i64 1>
+  ; CHECK-DAG: subvi.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sub_v2i64_i
+}
+
+define void @mul_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: mul_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = mul <16 x i8> %1, %2
+  ; CHECK-DAG: mulv.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mul_v16i8
+}
+
+define void @mul_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: mul_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = mul <8 x i16> %1, %2
+  ; CHECK-DAG: mulv.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mul_v8i16
+}
+
+define void @mul_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: mul_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = mul <4 x i32> %1, %2
+  ; CHECK-DAG: mulv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mul_v4i32
+}
+
+define void @mul_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: mul_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = mul <2 x i64> %1, %2
+  ; CHECK-DAG: mulv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mul_v2i64
+}
+
+define void @maddv_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+                         <16 x i8>* %c) nounwind {
+  ; CHECK: maddv_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <16 x i8>* %c
+  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
+  %4 = mul <16 x i8> %2, %3
+  %5 = add <16 x i8> %4, %1
+  ; CHECK-DAG: maddv.b [[R1]], [[R2]], [[R3]]
+  store <16 x i8> %5, <16 x i8>* %d
+  ; CHECK-DAG: st.b [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size maddv_v16i8
+}
+
+define void @maddv_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+                         <8 x i16>* %c) nounwind {
+  ; CHECK: maddv_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <8 x i16>* %c
+  ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0($7)
+  %4 = mul <8 x i16> %2, %3
+  %5 = add <8 x i16> %4, %1
+  ; CHECK-DAG: maddv.h [[R1]], [[R2]], [[R3]]
+  store <8 x i16> %5, <8 x i16>* %d
+  ; CHECK-DAG: st.h [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size maddv_v8i16
+}
+
+define void @maddv_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+                         <4 x i32>* %c) nounwind {
+  ; CHECK: maddv_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <4 x i32>* %c
+  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+  %4 = mul <4 x i32> %2, %3
+  %5 = add <4 x i32> %4, %1
+  ; CHECK-DAG: maddv.w [[R1]], [[R2]], [[R3]]
+  store <4 x i32> %5, <4 x i32>* %d
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size maddv_v4i32
+}
+
+define void @maddv_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+                         <2 x i64>* %c) nounwind {
+  ; CHECK: maddv_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <2 x i64>* %c
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+  %4 = mul <2 x i64> %2, %3
+  %5 = add <2 x i64> %4, %1
+  ; CHECK-DAG: maddv.d [[R1]], [[R2]], [[R3]]
+  store <2 x i64> %5, <2 x i64>* %d
+  ; CHECK-DAG: st.d [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size maddv_v2i64
+}
+
+define void @msubv_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+                         <16 x i8>* %c) nounwind {
+  ; CHECK: msubv_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <16 x i8>* %c
+  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
+  %4 = mul <16 x i8> %2, %3
+  %5 = sub <16 x i8> %1, %4
+  ; CHECK-DAG: msubv.b [[R1]], [[R2]], [[R3]]
+  store <16 x i8> %5, <16 x i8>* %d
+  ; CHECK-DAG: st.b [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size msubv_v16i8
+}
+
+define void @msubv_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+                         <8 x i16>* %c) nounwind {
+  ; CHECK: msubv_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <8 x i16>* %c
+  ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0($7)
+  %4 = mul <8 x i16> %2, %3
+  %5 = sub <8 x i16> %1, %4
+  ; CHECK-DAG: msubv.h [[R1]], [[R2]], [[R3]]
+  store <8 x i16> %5, <8 x i16>* %d
+  ; CHECK-DAG: st.h [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size msubv_v8i16
+}
+
+define void @msubv_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+                         <4 x i32>* %c) nounwind {
+  ; CHECK: msubv_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <4 x i32>* %c
+  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+  %4 = mul <4 x i32> %2, %3
+  %5 = sub <4 x i32> %1, %4
+  ; CHECK-DAG: msubv.w [[R1]], [[R2]], [[R3]]
+  store <4 x i32> %5, <4 x i32>* %d
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size msubv_v4i32
+}
+
+define void @msubv_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+                         <2 x i64>* %c) nounwind {
+  ; CHECK: msubv_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <2 x i64>* %c
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+  %4 = mul <2 x i64> %2, %3
+  %5 = sub <2 x i64> %1, %4
+  ; CHECK-DAG: msubv.d [[R1]], [[R2]], [[R3]]
+  store <2 x i64> %5, <2 x i64>* %d
+  ; CHECK-DAG: st.d [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size msubv_v2i64
+}
+
+define void @div_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: div_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = sdiv <16 x i8> %1, %2
+  ; CHECK-DAG: div_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size div_s_v16i8
+}
+
+define void @div_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: div_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = sdiv <8 x i16> %1, %2
+  ; CHECK-DAG: div_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size div_s_v8i16
+}
+
+define void @div_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: div_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = sdiv <4 x i32> %1, %2
+  ; CHECK-DAG: div_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size div_s_v4i32
+}
+
+define void @div_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: div_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = sdiv <2 x i64> %1, %2
+  ; CHECK-DAG: div_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size div_s_v2i64
+}
+
+define void @div_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: div_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = udiv <16 x i8> %1, %2
+  ; CHECK-DAG: div_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size div_u_v16i8
+}
+
+define void @div_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: div_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = udiv <8 x i16> %1, %2
+  ; CHECK-DAG: div_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size div_u_v8i16
+}
+
+define void @div_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: div_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = udiv <4 x i32> %1, %2
+  ; CHECK-DAG: div_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size div_u_v4i32
+}
+
+define void @div_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: div_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = udiv <2 x i64> %1, %2
+  ; CHECK-DAG: div_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size div_u_v2i64
+}
+
+define void @mod_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: mod_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = srem <16 x i8> %1, %2
+  ; CHECK-DAG: mod_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mod_s_v16i8
+}
+
+define void @mod_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: mod_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = srem <8 x i16> %1, %2
+  ; CHECK-DAG: mod_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mod_s_v8i16
+}
+
+define void @mod_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: mod_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = srem <4 x i32> %1, %2
+  ; CHECK-DAG: mod_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mod_s_v4i32
+}
+
+define void @mod_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: mod_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = srem <2 x i64> %1, %2
+  ; CHECK-DAG: mod_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mod_s_v2i64
+}
+
+define void @mod_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: mod_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = urem <16 x i8> %1, %2
+  ; CHECK-DAG: mod_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mod_u_v16i8
+}
+
+define void @mod_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: mod_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = urem <8 x i16> %1, %2
+  ; CHECK-DAG: mod_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mod_u_v8i16
+}
+
+define void @mod_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: mod_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = urem <4 x i32> %1, %2
+  ; CHECK-DAG: mod_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mod_u_v4i32
+}
+
+define void @mod_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: mod_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = urem <2 x i64> %1, %2
+  ; CHECK-DAG: mod_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mod_u_v2i64
+}
diff --git a/test/CodeGen/Mips/msa/arithmetic_float.ll b/test/CodeGen/Mips/msa/arithmetic_float.ll
new file mode 100644
index 0000000..dc38721
--- /dev/null
+++ b/test/CodeGen/Mips/msa/arithmetic_float.ll
@@ -0,0 +1,456 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @add_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: add_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fadd <4 x float> %1, %2
+  ; CHECK-DAG: fadd.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x float> %3, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size add_v4f32
+}
+
+define void @add_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: add_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fadd <2 x double> %1, %2
+  ; CHECK-DAG: fadd.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x double> %3, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size add_v2f64
+}
+
+define void @sub_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: sub_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fsub <4 x float> %1, %2
+  ; CHECK-DAG: fsub.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x float> %3, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sub_v4f32
+}
+
+define void @sub_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: sub_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fsub <2 x double> %1, %2
+  ; CHECK-DAG: fsub.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x double> %3, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sub_v2f64
+}
+
+define void @mul_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: mul_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fmul <4 x float> %1, %2
+  ; CHECK-DAG: fmul.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x float> %3, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mul_v4f32
+}
+
+define void @mul_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: mul_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fmul <2 x double> %1, %2
+  ; CHECK-DAG: fmul.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x double> %3, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mul_v2f64
+}
+
+define void @fma_v4f32(<4 x float>* %d, <4 x float>* %a, <4 x float>* %b,
+                       <4 x float>* %c) nounwind {
+  ; CHECK: fma_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <4 x float>* %c
+  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+  %4 = tail call <4 x float> @llvm.fma.v4f32 (<4 x float> %1, <4 x float> %2,
+                                              <4 x float> %3)
+  ; CHECK-DAG: fmadd.w [[R1]], [[R2]], [[R3]]
+  store <4 x float> %4, <4 x float>* %d
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size fma_v4f32
+}
+
+define void @fma_v2f64(<2 x double>* %d, <2 x double>* %a, <2 x double>* %b,
+                       <2 x double>* %c) nounwind {
+  ; CHECK: fma_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <2 x double>* %c
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+  %4 = tail call <2 x double> @llvm.fma.v2f64 (<2 x double> %1, <2 x double> %2,
+                                               <2 x double> %3)
+  ; CHECK-DAG: fmadd.d [[R1]], [[R2]], [[R3]]
+  store <2 x double> %4, <2 x double>* %d
+  ; CHECK-DAG: st.d [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size fma_v2f64
+}
+
+define void @fmsub_v4f32(<4 x float>* %d, <4 x float>* %a, <4 x float>* %b,
+                       <4 x float>* %c) nounwind {
+  ; CHECK: fmsub_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <4 x float>* %c
+  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+  %4 = fmul <4 x float> %2, %3
+  %5 = fsub <4 x float> %1, %4
+  ; CHECK-DAG: fmsub.w [[R1]], [[R2]], [[R3]]
+  store <4 x float> %5, <4 x float>* %d
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size fmsub_v4f32
+}
+
+define void @fmsub_v2f64(<2 x double>* %d, <2 x double>* %a, <2 x double>* %b,
+                       <2 x double>* %c) nounwind {
+  ; CHECK: fmsub_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <2 x double>* %c
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+  %4 = fmul <2 x double> %2, %3
+  %5 = fsub <2 x double> %1, %4
+  ; CHECK-DAG: fmsub.d [[R1]], [[R2]], [[R3]]
+  store <2 x double> %5, <2 x double>* %d
+  ; CHECK-DAG: st.d [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size fmsub_v2f64
+}
+
+define void @fdiv_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: fdiv_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fdiv <4 x float> %1, %2
+  ; CHECK-DAG: fdiv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x float> %3, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size fdiv_v4f32
+}
+
+define void @fdiv_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: fdiv_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fdiv <2 x double> %1, %2
+  ; CHECK-DAG: fdiv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x double> %3, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size fdiv_v2f64
+}
+
+define void @fabs_v4f32(<4 x float>* %c, <4 x float>* %a) nounwind {
+  ; CHECK: fabs_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <4 x float> @llvm.fabs.v4f32 (<4 x float> %1)
+  ; CHECK-DAG: fmax_a.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <4 x float> %2, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size fabs_v4f32
+}
+
+define void @fabs_v2f64(<2 x double>* %c, <2 x double>* %a) nounwind {
+  ; CHECK: fabs_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <2 x double> @llvm.fabs.v2f64 (<2 x double> %1)
+  ; CHECK-DAG: fmax_a.d [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <2 x double> %2, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size fabs_v2f64
+}
+
+define void @fexp2_v4f32(<4 x float>* %c, <4 x float>* %a) nounwind {
+  ; CHECK: fexp2_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <4 x float> @llvm.exp2.v4f32 (<4 x float> %1)
+  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: ffint_u.w [[R4:\$w[0-9]+]], [[R3]]
+  ; CHECK-DAG: fexp2.w [[R4:\$w[0-9]+]], [[R3]], [[R1]]
+  store <4 x float> %2, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size fexp2_v4f32
+}
+
+define void @fexp2_v2f64(<2 x double>* %c, <2 x double>* %a) nounwind {
+  ; CHECK: fexp2_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <2 x double> @llvm.exp2.v2f64 (<2 x double> %1)
+  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: ffint_u.d [[R4:\$w[0-9]+]], [[R3]]
+  ; CHECK-DAG: fexp2.d [[R4:\$w[0-9]+]], [[R3]], [[R1]]
+  store <2 x double> %2, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size fexp2_v2f64
+}
+
+define void @fexp2_v4f32_2(<4 x float>* %c, <4 x float>* %a) nounwind {
+  ; CHECK: fexp2_v4f32_2:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <4 x float> @llvm.exp2.v4f32 (<4 x float> %1)
+  %3 = fmul <4 x float> <float 2.0, float 2.0, float 2.0, float 2.0>, %2
+  ; CHECK-DAG: lui [[R3:\$[0-9]+]], 16384
+  ; CHECK-DAG: fill.w [[R4:\$w[0-9]+]], [[R3]]
+  ; CHECK-DAG: fexp2.w [[R5:\$w[0-9]+]], [[R4]], [[R1]]
+  store <4 x float> %3, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R5]], 0($4)
+
+  ret void
+  ; CHECK: .size fexp2_v4f32_2
+}
+
+define void @fexp2_v2f64_2(<2 x double>* %c, <2 x double>* %a) nounwind {
+  ; CHECK:      .8byte 4611686018427387904
+  ; CHECK-NEXT: .8byte 4611686018427387904
+  ; CHECK: fexp2_v2f64_2:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <2 x double> @llvm.exp2.v2f64 (<2 x double> %1)
+  %3 = fmul <2 x double> <double 2.0, double 2.0>, %2
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo(
+  ; CHECK-DAG: fexp2.d [[R4:\$w[0-9]+]], [[R3]], [[R1]]
+  store <2 x double> %3, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size fexp2_v2f64_2
+}
+
+define void @fsqrt_v4f32(<4 x float>* %c, <4 x float>* %a) nounwind {
+  ; CHECK: fsqrt_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %1)
+  ; CHECK-DAG: fsqrt.w [[R3:\$w[0-9]+]], [[R1]]
+  store <4 x float> %2, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size fsqrt_v4f32
+}
+
+define void @fsqrt_v2f64(<2 x double>* %c, <2 x double>* %a) nounwind {
+  ; CHECK: fsqrt_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %1)
+  ; CHECK-DAG: fsqrt.d [[R3:\$w[0-9]+]], [[R1]]
+  store <2 x double> %2, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size fsqrt_v2f64
+}
+
+define void @ffint_u_v4f32(<4 x float>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: ffint_u_v4f32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = uitofp <4 x i32> %1 to <4 x float>
+  ; CHECK-DAG: ffint_u.w [[R3:\$w[0-9]+]], [[R1]]
+  store <4 x float> %2, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ffint_u_v4f32
+}
+
+define void @ffint_u_v2f64(<2 x double>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: ffint_u_v2f64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = uitofp <2 x i64> %1 to <2 x double>
+  ; CHECK-DAG: ffint_u.d [[R3:\$w[0-9]+]], [[R1]]
+  store <2 x double> %2, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ffint_u_v2f64
+}
+
+define void @ffint_s_v4f32(<4 x float>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: ffint_s_v4f32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = sitofp <4 x i32> %1 to <4 x float>
+  ; CHECK-DAG: ffint_s.w [[R3:\$w[0-9]+]], [[R1]]
+  store <4 x float> %2, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ffint_s_v4f32
+}
+
+define void @ffint_s_v2f64(<2 x double>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: ffint_s_v2f64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = sitofp <2 x i64> %1 to <2 x double>
+  ; CHECK-DAG: ffint_s.d [[R3:\$w[0-9]+]], [[R1]]
+  store <2 x double> %2, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ffint_s_v2f64
+}
+
+define void @ftrunc_u_v4f32(<4 x i32>* %c, <4 x float>* %a) nounwind {
+  ; CHECK: ftrunc_u_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = fptoui <4 x float> %1 to <4 x i32>
+  ; CHECK-DAG: ftrunc_u.w [[R3:\$w[0-9]+]], [[R1]]
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ftrunc_u_v4f32
+}
+
+define void @ftrunc_u_v2f64(<2 x i64>* %c, <2 x double>* %a) nounwind {
+  ; CHECK: ftrunc_u_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = fptoui <2 x double> %1 to <2 x i64>
+  ; CHECK-DAG: ftrunc_u.d [[R3:\$w[0-9]+]], [[R1]]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ftrunc_u_v2f64
+}
+
+define void @ftrunc_s_v4f32(<4 x i32>* %c, <4 x float>* %a) nounwind {
+  ; CHECK: ftrunc_s_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = fptosi <4 x float> %1 to <4 x i32>
+  ; CHECK-DAG: ftrunc_s.w [[R3:\$w[0-9]+]], [[R1]]
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ftrunc_s_v4f32
+}
+
+define void @ftrunc_s_v2f64(<2 x i64>* %c, <2 x double>* %a) nounwind {
+  ; CHECK: ftrunc_s_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = fptosi <2 x double> %1 to <2 x i64>
+  ; CHECK-DAG: ftrunc_s.d [[R3:\$w[0-9]+]], [[R1]]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ftrunc_s_v2f64
+}
+
+declare <4 x float>  @llvm.fabs.v4f32(<4 x float>  %Val)
+declare <2 x double> @llvm.fabs.v2f64(<2 x double> %Val)
+declare <4 x float>  @llvm.exp2.v4f32(<4 x float>  %val)
+declare <2 x double> @llvm.exp2.v2f64(<2 x double> %val)
+declare <4 x float>  @llvm.fma.v4f32(<4 x float>  %a, <4 x float>  %b,
+                                     <4 x float>  %c)
+declare <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b,
+                                     <2 x double> %c)
+declare <4 x float>  @llvm.sqrt.v4f32(<4 x float>  %Val)
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double> %Val)
diff --git a/test/CodeGen/Mips/msa/basic_operations.ll b/test/CodeGen/Mips/msa/basic_operations.ll
new file mode 100644
index 0000000..0169a07
--- /dev/null
+++ b/test/CodeGen/Mips/msa/basic_operations.ll
@@ -0,0 +1,481 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32-AE -check-prefix=MIPS32-BE %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32-AE -check-prefix=MIPS32-LE %s
+
+@v4i8 = global <4 x i8> <i8 0, i8 0, i8 0, i8 0>
+@v16i8 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+@v8i16 = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+@v4i32 = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+@v2i64 = global <2 x i64> <i64 0, i64 0>
+@i64 = global i64 0
+
+define void @const_v16i8() nounwind {
+  ; MIPS32-AE: const_v16i8:
+
+  store volatile <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8>*@v16i8
+  ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
+
+  store volatile <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>*@v16i8
+  ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 1
+
+  store volatile <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 31>, <16 x i8>*@v16i8
+  ; MIPS32-AE: ld.b  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6>, <16 x i8>*@v16i8
+  ; MIPS32-AE: ld.b  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <16 x i8> <i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0>, <16 x i8>*@v16i8
+  ; MIPS32-BE: ldi.h [[R1:\$w[0-9]+]], 256
+  ; MIPS32-LE: ldi.h [[R1:\$w[0-9]+]], 1
+
+  store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4>, <16 x i8>*@v16i8
+  ; MIPS32-BE-DAG: lui [[R2:\$[0-9]+]], 258
+  ; MIPS32-LE-DAG: lui [[R2:\$[0-9]+]], 1027
+  ; MIPS32-BE-DAG: ori [[R2]], [[R2]], 772
+  ; MIPS32-LE-DAG: ori [[R2]], [[R2]], 513
+  ; MIPS32-AE-DAG: fill.w [[R1:\$w[0-9]+]], [[R2]]
+
+  store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, <16 x i8>*@v16i8
+  ; MIPS32-AE: ld.b  [[R1:\$w[0-9]+]], %lo(
+
+  ret void
+  ; MIPS32-AE: .size const_v16i8
+}
+
+define void @const_v8i16() nounwind {
+  ; MIPS32-AE: const_v8i16:
+
+  store volatile <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16>*@v8i16
+  ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
+
+  store volatile <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>*@v8i16
+  ; MIPS32-AE: ldi.h [[R1:\$w[0-9]+]], 1
+
+  store volatile <8 x i16> <i16 1, i16 1, i16 1, i16 2, i16 1, i16 1, i16 1, i16 31>, <8 x i16>*@v8i16
+  ; MIPS32-AE: ld.h  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <8 x i16> <i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028>, <8 x i16>*@v8i16
+  ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 4
+
+  store volatile <8 x i16> <i16 1, i16 2, i16 1, i16 2, i16 1, i16 2, i16 1, i16 2>, <8 x i16>*@v8i16
+  ; MIPS32-BE-DAG: lui [[R2:\$[0-9]+]], 1
+  ; MIPS32-LE-DAG: lui [[R2:\$[0-9]+]], 2
+  ; MIPS32-BE-DAG: ori [[R2]], [[R2]], 2
+  ; MIPS32-LE-DAG: ori [[R2]], [[R2]], 1
+  ; MIPS32-AE-DAG: fill.w [[R1:\$w[0-9]+]], [[R2]]
+
+  store volatile <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>, <8 x i16>*@v8i16
+  ; MIPS32-AE: ld.h  [[R1:\$w[0-9]+]], %lo(
+
+  ret void
+  ; MIPS32-AE: .size const_v8i16
+}
+
+define void @const_v4i32() nounwind {
+  ; MIPS32-AE: const_v4i32:
+
+  store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>*@v4i32
+  ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
+
+  store volatile <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>*@v4i32
+  ; MIPS32-AE: ldi.w [[R1:\$w[0-9]+]], 1
+
+  store volatile <4 x i32> <i32 1, i32 1, i32 1, i32 31>, <4 x i32>*@v4i32
+  ; MIPS32-AE: ld.w  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <4 x i32> <i32 16843009, i32 16843009, i32 16843009, i32 16843009>, <4 x i32>*@v4i32
+  ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 1
+
+  store volatile <4 x i32> <i32 65537, i32 65537, i32 65537, i32 65537>, <4 x i32>*@v4i32
+  ; MIPS32-AE: ldi.h [[R1:\$w[0-9]+]], 1
+
+  store volatile <4 x i32> <i32 1, i32 2, i32 1, i32 2>, <4 x i32>*@v4i32
+  ; MIPS32-AE: ld.w  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <4 x i32> <i32 3, i32 4, i32 5, i32 6>, <4 x i32>*@v4i32
+  ; MIPS32-AE: ld.w  [[R1:\$w[0-9]+]], %lo(
+
+  ret void
+  ; MIPS32-AE: .size const_v4i32
+}
+
+define void @const_v2i64() nounwind {
+  ; MIPS32-AE: const_v2i64:
+
+  store volatile <2 x i64> <i64 0, i64 0>, <2 x i64>*@v2i64
+  ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
+
+  store volatile <2 x i64> <i64 72340172838076673, i64 72340172838076673>, <2 x i64>*@v2i64
+  ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 1
+
+  store volatile <2 x i64> <i64 281479271743489, i64 281479271743489>, <2 x i64>*@v2i64
+  ; MIPS32-AE: ldi.h [[R1:\$w[0-9]+]], 1
+
+  store volatile <2 x i64> <i64 4294967297, i64 4294967297>, <2 x i64>*@v2i64
+  ; MIPS32-AE: ldi.w [[R1:\$w[0-9]+]], 1
+
+  store volatile <2 x i64> <i64 1, i64 1>, <2 x i64>*@v2i64
+  ; MIPS32-AE: ldi.d [[R1:\$w[0-9]+]], 1
+
+  store volatile <2 x i64> <i64 1, i64 31>, <2 x i64>*@v2i64
+  ; MIPS32-AE: ld.w  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <2 x i64> <i64 3, i64 4>, <2 x i64>*@v2i64
+  ; MIPS32-AE: ld.w  [[R1:\$w[0-9]+]], %lo(
+
+  ret void
+  ; MIPS32-AE: .size const_v2i64
+}
+
+define void @nonconst_v16i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h) nounwind {
+  ; MIPS32-AE: nonconst_v16i8:
+
+  %1 = insertelement <16 x i8> undef, i8 %a, i32 0
+  %2 = insertelement <16 x i8> %1, i8 %b, i32 1
+  %3 = insertelement <16 x i8> %2, i8 %c, i32 2
+  %4 = insertelement <16 x i8> %3, i8 %d, i32 3
+  %5 = insertelement <16 x i8> %4, i8 %e, i32 4
+  %6 = insertelement <16 x i8> %5, i8 %f, i32 5
+  %7 = insertelement <16 x i8> %6, i8 %g, i32 6
+  %8 = insertelement <16 x i8> %7, i8 %h, i32 7
+  %9 = insertelement <16 x i8> %8, i8 %h, i32 8
+  %10 = insertelement <16 x i8> %9, i8 %h, i32 9
+  %11 = insertelement <16 x i8> %10, i8 %h, i32 10
+  %12 = insertelement <16 x i8> %11, i8 %h, i32 11
+  %13 = insertelement <16 x i8> %12, i8 %h, i32 12
+  %14 = insertelement <16 x i8> %13, i8 %h, i32 13
+  %15 = insertelement <16 x i8> %14, i8 %h, i32 14
+  %16 = insertelement <16 x i8> %15, i8 %h, i32 15
+  ; MIPS32-AE-DAG: insert.b [[R1:\$w[0-9]+]][0], $4
+  ; MIPS32-AE-DAG: insert.b [[R1]][1], $5
+  ; MIPS32-AE-DAG: insert.b [[R1]][2], $6
+  ; MIPS32-AE-DAG: insert.b [[R1]][3], $7
+  ; MIPS32-BE-DAG: lbu [[R2:\$[0-9]+]], 19($sp)
+  ; MIPS32-LE-DAG: lbu [[R2:\$[0-9]+]], 16($sp)
+  ; MIPS32-AE-DAG: insert.b [[R1]][4], [[R2]]
+  ; MIPS32-BE-DAG: lbu [[R3:\$[0-9]+]], 23($sp)
+  ; MIPS32-LE-DAG: lbu [[R3:\$[0-9]+]], 20($sp)
+  ; MIPS32-AE-DAG: insert.b [[R1]][5], [[R3]]
+  ; MIPS32-BE-DAG: lbu [[R4:\$[0-9]+]], 27($sp)
+  ; MIPS32-LE-DAG: lbu [[R4:\$[0-9]+]], 24($sp)
+  ; MIPS32-AE-DAG: insert.b [[R1]][6], [[R4]]
+  ; MIPS32-BE-DAG: lbu [[R5:\$[0-9]+]], 31($sp)
+  ; MIPS32-LE-DAG: lbu [[R5:\$[0-9]+]], 28($sp)
+  ; MIPS32-AE-DAG: insert.b [[R1]][7], [[R5]]
+  ; MIPS32-AE-DAG: insert.b [[R1]][8], [[R5]]
+  ; MIPS32-AE-DAG: insert.b [[R1]][9], [[R5]]
+  ; MIPS32-AE-DAG: insert.b [[R1]][10], [[R5]]
+  ; MIPS32-AE-DAG: insert.b [[R1]][11], [[R5]]
+  ; MIPS32-AE-DAG: insert.b [[R1]][12], [[R5]]
+  ; MIPS32-AE-DAG: insert.b [[R1]][13], [[R5]]
+  ; MIPS32-AE-DAG: insert.b [[R1]][14], [[R5]]
+  ; MIPS32-AE-DAG: insert.b [[R1]][15], [[R5]]
+
+  store volatile <16 x i8> %16, <16 x i8>*@v16i8
+
+  ret void
+  ; MIPS32-AE: .size nonconst_v16i8
+}
+
+define void @nonconst_v8i16(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h) nounwind {
+  ; MIPS32-AE: nonconst_v8i16:
+
+  %1 = insertelement <8 x i16> undef, i16 %a, i32 0
+  %2 = insertelement <8 x i16> %1, i16 %b, i32 1
+  %3 = insertelement <8 x i16> %2, i16 %c, i32 2
+  %4 = insertelement <8 x i16> %3, i16 %d, i32 3
+  %5 = insertelement <8 x i16> %4, i16 %e, i32 4
+  %6 = insertelement <8 x i16> %5, i16 %f, i32 5
+  %7 = insertelement <8 x i16> %6, i16 %g, i32 6
+  %8 = insertelement <8 x i16> %7, i16 %h, i32 7
+  ; MIPS32-AE-DAG: insert.h [[R1:\$w[0-9]+]][0], $4
+  ; MIPS32-AE-DAG: insert.h [[R1]][1], $5
+  ; MIPS32-AE-DAG: insert.h [[R1]][2], $6
+  ; MIPS32-AE-DAG: insert.h [[R1]][3], $7
+  ; MIPS32-BE-DAG: lhu [[R2:\$[0-9]+]], 18($sp)
+  ; MIPS32-LE-DAG: lhu [[R2:\$[0-9]+]], 16($sp)
+  ; MIPS32-AE-DAG: insert.h [[R1]][4], [[R2]]
+  ; MIPS32-BE-DAG: lhu [[R2:\$[0-9]+]], 22($sp)
+  ; MIPS32-LE-DAG: lhu [[R2:\$[0-9]+]], 20($sp)
+  ; MIPS32-AE-DAG: insert.h [[R1]][5], [[R2]]
+  ; MIPS32-BE-DAG: lhu [[R2:\$[0-9]+]], 26($sp)
+  ; MIPS32-LE-DAG: lhu [[R2:\$[0-9]+]], 24($sp)
+  ; MIPS32-AE-DAG: insert.h [[R1]][6], [[R2]]
+  ; MIPS32-BE-DAG: lhu [[R2:\$[0-9]+]], 30($sp)
+  ; MIPS32-LE-DAG: lhu [[R2:\$[0-9]+]], 28($sp)
+  ; MIPS32-AE-DAG: insert.h [[R1]][7], [[R2]]
+
+  store volatile <8 x i16> %8, <8 x i16>*@v8i16
+
+  ret void
+  ; MIPS32-AE: .size nonconst_v8i16
+}
+
+define void @nonconst_v4i32(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
+  ; MIPS32-AE: nonconst_v4i32:
+
+  %1 = insertelement <4 x i32> undef, i32 %a, i32 0
+  %2 = insertelement <4 x i32> %1, i32 %b, i32 1
+  %3 = insertelement <4 x i32> %2, i32 %c, i32 2
+  %4 = insertelement <4 x i32> %3, i32 %d, i32 3
+  ; MIPS32-AE: insert.w [[R1:\$w[0-9]+]][0], $4
+  ; MIPS32-AE: insert.w [[R1]][1], $5
+  ; MIPS32-AE: insert.w [[R1]][2], $6
+  ; MIPS32-AE: insert.w [[R1]][3], $7
+
+  store volatile <4 x i32> %4, <4 x i32>*@v4i32
+
+  ret void
+  ; MIPS32-AE: .size nonconst_v4i32
+}
+
+define void @nonconst_v2i64(i64 %a, i64 %b) nounwind {
+  ; MIPS32-AE: nonconst_v2i64:
+
+  %1 = insertelement <2 x i64> undef, i64 %a, i32 0
+  %2 = insertelement <2 x i64> %1, i64 %b, i32 1
+  ; MIPS32-AE: insert.w [[R1:\$w[0-9]+]][0], $4
+  ; MIPS32-AE: insert.w [[R1]][1], $5
+  ; MIPS32-AE: insert.w [[R1]][2], $6
+  ; MIPS32-AE: insert.w [[R1]][3], $7
+
+  store volatile <2 x i64> %2, <2 x i64>*@v2i64
+
+  ret void
+  ; MIPS32-AE: .size nonconst_v2i64
+}
+
+define i32 @extract_sext_v16i8() nounwind {
+  ; MIPS32-AE: extract_sext_v16i8:
+
+  %1 = load <16 x i8>* @v16i8
+  ; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
+
+  %2 = add <16 x i8> %1, %1
+  ; MIPS32-AE-DAG: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <16 x i8> %2, i32 1
+  %4 = sext i8 %3 to i32
+  ; MIPS32-AE-DAG: copy_s.b [[R3:\$[0-9]+]], [[R1]][1]
+  ; MIPS32-AE-NOT: sll
+  ; MIPS32-AE-NOT: sra
+
+  ret i32 %4
+  ; MIPS32-AE: .size extract_sext_v16i8
+}
+
+define i32 @extract_sext_v8i16() nounwind {
+  ; MIPS32-AE: extract_sext_v8i16:
+
+  %1 = load <8 x i16>* @v8i16
+  ; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
+
+  %2 = add <8 x i16> %1, %1
+  ; MIPS32-AE-DAG: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <8 x i16> %2, i32 1
+  %4 = sext i16 %3 to i32
+  ; MIPS32-AE-DAG: copy_s.h [[R3:\$[0-9]+]], [[R1]][1]
+  ; MIPS32-AE-NOT: sll
+  ; MIPS32-AE-NOT: sra
+
+  ret i32 %4
+  ; MIPS32-AE: .size extract_sext_v8i16
+}
+
+define i32 @extract_sext_v4i32() nounwind {
+  ; MIPS32-AE: extract_sext_v4i32:
+
+  %1 = load <4 x i32>* @v4i32
+  ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  %2 = add <4 x i32> %1, %1
+  ; MIPS32-AE-DAG: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <4 x i32> %2, i32 1
+  ; MIPS32-AE-DAG: copy_s.w [[R3:\$[0-9]+]], [[R1]][1]
+
+  ret i32 %3
+  ; MIPS32-AE: .size extract_sext_v4i32
+}
+
+define i64 @extract_sext_v2i64() nounwind {
+  ; MIPS32-AE: extract_sext_v2i64:
+
+  %1 = load <2 x i64>* @v2i64
+  ; MIPS32-AE-DAG: ld.d [[R1:\$w[0-9]+]],
+
+  %2 = add <2 x i64> %1, %1
+  ; MIPS32-AE-DAG: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <2 x i64> %2, i32 1
+  ; MIPS32-AE-DAG: copy_s.w [[R3:\$[0-9]+]], [[R1]][2]
+  ; MIPS32-AE-DAG: copy_s.w [[R4:\$[0-9]+]], [[R1]][3]
+  ; MIPS32-AE-NOT: sll
+  ; MIPS32-AE-NOT: sra
+
+  ret i64 %3
+  ; MIPS32-AE: .size extract_sext_v2i64
+}
+
+define i32 @extract_zext_v16i8() nounwind {
+  ; MIPS32-AE: extract_zext_v16i8:
+
+  %1 = load <16 x i8>* @v16i8
+  ; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
+
+  %2 = add <16 x i8> %1, %1
+  ; MIPS32-AE-DAG: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <16 x i8> %2, i32 1
+  %4 = zext i8 %3 to i32
+  ; MIPS32-AE-DAG: copy_u.b [[R3:\$[0-9]+]], [[R1]][1]
+  ; MIPS32-AE-NOT: andi
+
+  ret i32 %4
+  ; MIPS32-AE: .size extract_zext_v16i8
+}
+
+define i32 @extract_zext_v8i16() nounwind {
+  ; MIPS32-AE: extract_zext_v8i16:
+
+  %1 = load <8 x i16>* @v8i16
+  ; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
+
+  %2 = add <8 x i16> %1, %1
+  ; MIPS32-AE-DAG: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <8 x i16> %2, i32 1
+  %4 = zext i16 %3 to i32
+  ; MIPS32-AE-DAG: copy_u.h [[R3:\$[0-9]+]], [[R1]][1]
+  ; MIPS32-AE-NOT: andi
+
+  ret i32 %4
+  ; MIPS32-AE: .size extract_zext_v8i16
+}
+
+define i32 @extract_zext_v4i32() nounwind {
+  ; MIPS32-AE: extract_zext_v4i32:
+
+  %1 = load <4 x i32>* @v4i32
+  ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  %2 = add <4 x i32> %1, %1
+  ; MIPS32-AE-DAG: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <4 x i32> %2, i32 1
+  ; MIPS32-AE-DAG: copy_{{[su]}}.w [[R3:\$[0-9]+]], [[R1]][1]
+
+  ret i32 %3
+  ; MIPS32-AE: .size extract_zext_v4i32
+}
+
+define i64 @extract_zext_v2i64() nounwind {
+  ; MIPS32-AE: extract_zext_v2i64:
+
+  %1 = load <2 x i64>* @v2i64
+  ; MIPS32-AE-DAG: ld.d [[R1:\$w[0-9]+]],
+
+  %2 = add <2 x i64> %1, %1
+  ; MIPS32-AE-DAG: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <2 x i64> %2, i32 1
+  ; MIPS32-AE-DAG: copy_{{[su]}}.w [[R3:\$[0-9]+]], [[R1]][2]
+  ; MIPS32-AE-DAG: copy_{{[su]}}.w [[R4:\$[0-9]+]], [[R1]][3]
+  ; MIPS32-AE-NOT: andi
+
+  ret i64 %3
+  ; MIPS32-AE: .size extract_zext_v2i64
+}
+
+define void @insert_v16i8(i32 %a) nounwind {
+  ; MIPS32-AE: insert_v16i8:
+
+  %1 = load <16 x i8>* @v16i8
+  ; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
+
+  %a2 = trunc i32 %a to i8
+  %a3 = sext i8 %a2 to i32
+  %a4 = trunc i32 %a3 to i8
+  ; MIPS32-AE-NOT: andi
+  ; MIPS32-AE-NOT: sra
+
+  %2 = insertelement <16 x i8> %1, i8 %a4, i32 1
+  ; MIPS32-AE-DAG: insert.b [[R1]][1], $4
+
+  store <16 x i8> %2, <16 x i8>* @v16i8
+  ; MIPS32-AE-DAG: st.b [[R1]]
+
+  ret void
+  ; MIPS32-AE: .size insert_v16i8
+}
+
+define void @insert_v8i16(i32 %a) nounwind {
+  ; MIPS32-AE: insert_v8i16:
+
+  %1 = load <8 x i16>* @v8i16
+  ; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
+
+  %a2 = trunc i32 %a to i16
+  %a3 = sext i16 %a2 to i32
+  %a4 = trunc i32 %a3 to i16
+  ; MIPS32-AE-NOT: andi
+  ; MIPS32-AE-NOT: sra
+
+  %2 = insertelement <8 x i16> %1, i16 %a4, i32 1
+  ; MIPS32-AE-DAG: insert.h [[R1]][1], $4
+
+  store <8 x i16> %2, <8 x i16>* @v8i16
+  ; MIPS32-AE-DAG: st.h [[R1]]
+
+  ret void
+  ; MIPS32-AE: .size insert_v8i16
+}
+
+define void @insert_v4i32(i32 %a) nounwind {
+  ; MIPS32-AE: insert_v4i32:
+
+  %1 = load <4 x i32>* @v4i32
+  ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  ; MIPS32-AE-NOT: andi
+  ; MIPS32-AE-NOT: sra
+
+  %2 = insertelement <4 x i32> %1, i32 %a, i32 1
+  ; MIPS32-AE-DAG: insert.w [[R1]][1], $4
+
+  store <4 x i32> %2, <4 x i32>* @v4i32
+  ; MIPS32-AE-DAG: st.w [[R1]]
+
+  ret void
+  ; MIPS32-AE: .size insert_v4i32
+}
+
+define void @insert_v2i64(i64 %a) nounwind {
+  ; MIPS32-AE: insert_v2i64:
+
+  %1 = load <2 x i64>* @v2i64
+  ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  ; MIPS32-AE-NOT: andi
+  ; MIPS32-AE-NOT: sra
+
+  %2 = insertelement <2 x i64> %1, i64 %a, i32 1
+  ; MIPS32-AE-DAG: insert.w [[R1]][2], $4
+  ; MIPS32-AE-DAG: insert.w [[R1]][3], $5
+
+  store <2 x i64> %2, <2 x i64>* @v2i64
+  ; MIPS32-AE-DAG: st.w [[R1]]
+
+  ret void
+  ; MIPS32-AE: .size insert_v2i64
+}
+
+define void @truncstore() nounwind {
+  ; MIPS32-AE: truncstore:
+
+  store volatile <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>, <4 x i8>*@v4i8
+  ; TODO: What code should be emitted?
+
+  ret void
+  ; MIPS32-AE: .size truncstore
+}
diff --git a/test/CodeGen/Mips/msa/basic_operations_float.ll b/test/CodeGen/Mips/msa/basic_operations_float.ll
new file mode 100644
index 0000000..1f53810
--- /dev/null
+++ b/test/CodeGen/Mips/msa/basic_operations_float.ll
@@ -0,0 +1,207 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32 %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32 %s
+
+@v4f32 = global <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
+@v2f64 = global <2 x double> <double 0.0, double 0.0>
+@f32 = global float 0.0
+@f64 = global double 0.0
+
+define void @const_v4f32() nounwind {
+  ; MIPS32: const_v4f32:
+
+  store volatile <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, <4 x float>*@v4f32
+  ; MIPS32: ldi.b  [[R1:\$w[0-9]+]], 0
+
+  store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float>*@v4f32
+  ; MIPS32: lui     [[R1:\$[0-9]+]], 16256
+  ; MIPS32: fill.w  [[R2:\$w[0-9]+]], [[R1]]
+
+  store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 31.0>, <4 x float>*@v4f32
+  ; MIPS32: ld.w  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <4 x float> <float 65537.0, float 65537.0, float 65537.0, float 65537.0>, <4 x float>*@v4f32
+  ; MIPS32: lui     [[R1:\$[0-9]+]], 18304
+  ; MIPS32: ori     [[R2:\$[0-9]+]], [[R1]], 128
+  ; MIPS32: fill.w  [[R3:\$w[0-9]+]], [[R2]]
+
+  store volatile <4 x float> <float 1.0, float 2.0, float 1.0, float 2.0>, <4 x float>*@v4f32
+  ; MIPS32: ld.w  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <4 x float> <float 3.0, float 4.0, float 5.0, float 6.0>, <4 x float>*@v4f32
+  ; MIPS32: ld.w  [[R1:\$w[0-9]+]], %lo(
+
+  ret void
+  ; MIPS32: .size const_v4f32
+}
+
+define void @const_v2f64() nounwind {
+  ; MIPS32: const_v2f64:
+
+  store volatile <2 x double> <double 0.0, double 0.0>, <2 x double>*@v2f64
+  ; MIPS32: ldi.b  [[R1:\$w[0-9]+]], 0
+
+  store volatile <2 x double> <double 72340172838076673.0, double 72340172838076673.0>, <2 x double>*@v2f64
+  ; MIPS32: ld.d  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <2 x double> <double 281479271743489.0, double 281479271743489.0>, <2 x double>*@v2f64
+  ; MIPS32: ld.d  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <2 x double> <double 4294967297.0, double 4294967297.0>, <2 x double>*@v2f64
+  ; MIPS32: ld.d  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <2 x double> <double 1.0, double 1.0>, <2 x double>*@v2f64
+  ; MIPS32: ld.d  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <2 x double> <double 1.0, double 31.0>, <2 x double>*@v2f64
+  ; MIPS32: ld.d  [[R1:\$w[0-9]+]], %lo(
+
+  store volatile <2 x double> <double 3.0, double 4.0>, <2 x double>*@v2f64
+  ; MIPS32: ld.d  [[R1:\$w[0-9]+]], %lo(
+
+  ret void
+  ; MIPS32: .size const_v2f64
+}
+
+define void @nonconst_v4f32() nounwind {
+  ; MIPS32: nonconst_v4f32:
+
+  %1 = load float *@f32
+  %2 = insertelement <4 x float> undef, float %1, i32 0
+  %3 = insertelement <4 x float> %2, float %1, i32 1
+  %4 = insertelement <4 x float> %3, float %1, i32 2
+  %5 = insertelement <4 x float> %4, float %1, i32 3
+  store volatile <4 x float> %5, <4 x float>*@v4f32
+  ; MIPS32: lwc1 $f[[R1:[0-9]+]], 0(
+  ; MIPS32: splati.w [[R2:\$w[0-9]+]], $w[[R1]]
+
+  ret void
+  ; MIPS32: .size nonconst_v4f32
+}
+
+define void @nonconst_v2f64() nounwind {
+  ; MIPS32: nonconst_v2f64:
+
+  %1 = load double *@f64
+  %2 = insertelement <2 x double> undef, double %1, i32 0
+  %3 = insertelement <2 x double> %2, double %1, i32 1
+  store volatile <2 x double> %3, <2 x double>*@v2f64
+  ; MIPS32: ldc1 $f[[R1:[0-9]+]], 0(
+  ; MIPS32: splati.d [[R2:\$w[0-9]+]], $w[[R1]]
+
+  ret void
+  ; MIPS32: .size nonconst_v2f64
+}
+
+define float @extract_v4f32() nounwind {
+  ; MIPS32: extract_v4f32:
+
+  %1 = load <4 x float>* @v4f32
+  ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  %2 = fadd <4 x float> %1, %1
+  ; MIPS32-DAG: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <4 x float> %2, i32 1
+  ; Element 1 can be obtained by splatting it across the vector and extracting
+  ; $w0:sub_lo
+  ; MIPS32-DAG: splati.w $w0, [[R1]][1]
+
+  ret float %3
+  ; MIPS32: .size extract_v4f32
+}
+
+define float @extract_v4f32_elt0() nounwind {
+  ; MIPS32: extract_v4f32_elt0:
+
+  %1 = load <4 x float>* @v4f32
+  ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  %2 = fadd <4 x float> %1, %1
+  ; MIPS32-DAG: fadd.w $w0, [[R1]], [[R1]]
+
+  %3 = extractelement <4 x float> %2, i32 0
+  ; Element 0 can be obtained by extracting $w0:sub_lo ($f0)
+  ; MIPS32-NOT: copy_u.w
+  ; MIPS32-NOT: mtc1
+
+  ret float %3
+  ; MIPS32: .size extract_v4f32_elt0
+}
+
+define double @extract_v2f64() nounwind {
+  ; MIPS32: extract_v2f64:
+
+  %1 = load <2 x double>* @v2f64
+  ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
+
+  %2 = fadd <2 x double> %1, %1
+  ; MIPS32-DAG: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+  %3 = extractelement <2 x double> %2, i32 1
+  ; Element 1 can be obtained by splatting it across the vector and extracting
+  ; $w0:sub_64
+  ; MIPS32-DAG: splati.d $w0, [[R1]][1]
+  ; MIPS32-NOT: copy_u.w
+  ; MIPS32-NOT: mtc1
+  ; MIPS32-NOT: mthc1
+  ; MIPS32-NOT: sll
+  ; MIPS32-NOT: sra
+
+  ret double %3
+  ; MIPS32: .size extract_v2f64
+}
+
+define double @extract_v2f64_elt0() nounwind {
+  ; MIPS32: extract_v2f64_elt0:
+
+  %1 = load <2 x double>* @v2f64
+  ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
+
+  %2 = fadd <2 x double> %1, %1
+  ; MIPS32-DAG: fadd.d $w0, [[R1]], [[R1]]
+
+  %3 = extractelement <2 x double> %2, i32 0
+  ; Element 0 can be obtained by extracting $w0:sub_64 ($f0)
+  ; MIPS32-NOT: copy_u.w
+  ; MIPS32-NOT: mtc1
+  ; MIPS32-NOT: mthc1
+  ; MIPS32-NOT: sll
+  ; MIPS32-NOT: sra
+
+  ret double %3
+  ; MIPS32: .size extract_v2f64_elt0
+}
+
+define void @insert_v4f32(float %a) nounwind {
+  ; MIPS32: insert_v4f32:
+
+  %1 = load <4 x float>* @v4f32
+  ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  %2 = insertelement <4 x float> %1, float %a, i32 1
+  ; float argument passed in $f12
+  ; MIPS32-DAG: insve.w [[R1]][1], $w12[0]
+
+  store <4 x float> %2, <4 x float>* @v4f32
+  ; MIPS32-DAG: st.w [[R1]]
+
+  ret void
+  ; MIPS32: .size insert_v4f32
+}
+
+define void @insert_v2f64(double %a) nounwind {
+  ; MIPS32: insert_v2f64:
+
+  %1 = load <2 x double>* @v2f64
+  ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
+
+  %2 = insertelement <2 x double> %1, double %a, i32 1
+  ; double argument passed in $f12
+  ; MIPS32-DAG: insve.d [[R1]][1], $w12[0]
+
+  store <2 x double> %2, <2 x double>* @v2f64
+  ; MIPS32-DAG: st.d [[R1]]
+
+  ret void
+  ; MIPS32: .size insert_v2f64
+}
diff --git a/test/CodeGen/Mips/msa/bit.ll b/test/CodeGen/Mips/msa/bit.ll
new file mode 100644
index 0000000..59ddbe1
--- /dev/null
+++ b/test/CodeGen/Mips/msa/bit.ll
@@ -0,0 +1,537 @@
+; Test the MSA intrinsics that are encoded with the BIT instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_sat_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sat_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sat_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_sat_s_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.sat.s.b(<16 x i8> %0, i32 7)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_sat_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.sat.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_sat_s_b_test:
+; CHECK: ld.b
+; CHECK: sat_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_sat_s_b_test
+;
+@llvm_mips_sat_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sat_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sat_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_sat_s_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.sat.s.h(<8 x i16> %0, i32 7)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_sat_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.sat.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_sat_s_h_test:
+; CHECK: ld.h
+; CHECK: sat_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_sat_s_h_test
+;
+@llvm_mips_sat_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sat_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sat_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_sat_s_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.sat.s.w(<4 x i32> %0, i32 7)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_sat_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.sat.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_sat_s_w_test:
+; CHECK: ld.w
+; CHECK: sat_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_sat_s_w_test
+;
+@llvm_mips_sat_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sat_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sat_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_sat_s_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.sat.s.d(<2 x i64> %0, i32 7)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_sat_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.sat.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_sat_s_d_test:
+; CHECK: ld.d
+; CHECK: sat_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_sat_s_d_test
+;
+@llvm_mips_sat_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sat_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sat_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_sat_u_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.sat.u.b(<16 x i8> %0, i32 7)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_sat_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.sat.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_sat_u_b_test:
+; CHECK: ld.b
+; CHECK: sat_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_sat_u_b_test
+;
+@llvm_mips_sat_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sat_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sat_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_sat_u_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.sat.u.h(<8 x i16> %0, i32 7)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_sat_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.sat.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_sat_u_h_test:
+; CHECK: ld.h
+; CHECK: sat_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_sat_u_h_test
+;
+@llvm_mips_sat_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sat_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sat_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_sat_u_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.sat.u.w(<4 x i32> %0, i32 7)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_sat_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.sat.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_sat_u_w_test:
+; CHECK: ld.w
+; CHECK: sat_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_sat_u_w_test
+;
+@llvm_mips_sat_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sat_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sat_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_sat_u_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.sat.u.d(<2 x i64> %0, i32 7)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_sat_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.sat.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_sat_u_d_test:
+; CHECK: ld.d
+; CHECK: sat_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_sat_u_d_test
+;
+@llvm_mips_slli_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_slli_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_slli_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_slli_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.slli.b(<16 x i8> %0, i32 7)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_slli_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.slli.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_slli_b_test:
+; CHECK: ld.b
+; CHECK: slli.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_slli_b_test
+;
+@llvm_mips_slli_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_slli_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_slli_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_slli_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.slli.h(<8 x i16> %0, i32 7)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_slli_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.slli.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_slli_h_test:
+; CHECK: ld.h
+; CHECK: slli.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_slli_h_test
+;
+@llvm_mips_slli_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_slli_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_slli_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_slli_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.slli.w(<4 x i32> %0, i32 7)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_slli_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.slli.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_slli_w_test:
+; CHECK: ld.w
+; CHECK: slli.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_slli_w_test
+;
+@llvm_mips_slli_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_slli_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_slli_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_slli_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.slli.d(<2 x i64> %0, i32 7)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_slli_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.slli.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_slli_d_test:
+; CHECK: ld.d
+; CHECK: slli.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_slli_d_test
+;
+@llvm_mips_srai_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srai_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srai_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_srai_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.srai.b(<16 x i8> %0, i32 7)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_srai_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.srai.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_srai_b_test:
+; CHECK: ld.b
+; CHECK: srai.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_srai_b_test
+;
+@llvm_mips_srai_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srai_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srai_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_srai_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.srai.h(<8 x i16> %0, i32 7)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_srai_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.srai.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_srai_h_test:
+; CHECK: ld.h
+; CHECK: srai.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_srai_h_test
+;
+@llvm_mips_srai_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srai_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srai_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_srai_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.srai.w(<4 x i32> %0, i32 7)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_srai_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.srai.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_srai_w_test:
+; CHECK: ld.w
+; CHECK: srai.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_srai_w_test
+;
+@llvm_mips_srai_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srai_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srai_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_srai_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.srai.d(<2 x i64> %0, i32 7)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_srai_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.srai.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_srai_d_test:
+; CHECK: ld.d
+; CHECK: srai.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_srai_d_test
+;
+@llvm_mips_srari_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srari_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srari_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_srari_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.srari.b(<16 x i8> %0, i32 7)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_srari_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.srari.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_srari_b_test:
+; CHECK: ld.b
+; CHECK: srari.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_srari_b_test
+;
+@llvm_mips_srari_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srari_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srari_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_srari_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.srari.h(<8 x i16> %0, i32 7)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_srari_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.srari.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_srari_h_test:
+; CHECK: ld.h
+; CHECK: srari.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_srari_h_test
+;
+@llvm_mips_srari_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srari_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srari_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_srari_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.srari.w(<4 x i32> %0, i32 7)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_srari_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.srari.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_srari_w_test:
+; CHECK: ld.w
+; CHECK: srari.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_srari_w_test
+;
+@llvm_mips_srari_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srari_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srari_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_srari_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.srari.d(<2 x i64> %0, i32 7)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_srari_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.srari.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_srari_d_test:
+; CHECK: ld.d
+; CHECK: srari.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_srari_d_test
+;
+@llvm_mips_srli_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srli_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srli_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_srli_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.srli.b(<16 x i8> %0, i32 7)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_srli_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.srli.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_srli_b_test:
+; CHECK: ld.b
+; CHECK: srli.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_srli_b_test
+;
+@llvm_mips_srli_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srli_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srli_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_srli_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.srli.h(<8 x i16> %0, i32 7)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_srli_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.srli.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_srli_h_test:
+; CHECK: ld.h
+; CHECK: srli.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_srli_h_test
+;
+@llvm_mips_srli_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srli_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srli_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_srli_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.srli.w(<4 x i32> %0, i32 7)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_srli_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.srli.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_srli_w_test:
+; CHECK: ld.w
+; CHECK: srli.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_srli_w_test
+;
+@llvm_mips_srli_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srli_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srli_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_srli_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.srli.d(<2 x i64> %0, i32 7)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_srli_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.srli.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_srli_d_test:
+; CHECK: ld.d
+; CHECK: srli.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_srli_d_test
+;
+@llvm_mips_srlri_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srlri_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srlri_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_srlri_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %0, i32 7)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_srlri_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.srlri.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_srlri_b_test:
+; CHECK: ld.b
+; CHECK: srlri.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_srlri_b_test
+;
+@llvm_mips_srlri_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srlri_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srlri_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_srlri_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %0, i32 7)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_srlri_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.srlri.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_srlri_h_test:
+; CHECK: ld.h
+; CHECK: srlri.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_srlri_h_test
+;
+@llvm_mips_srlri_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srlri_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srlri_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_srlri_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %0, i32 7)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_srlri_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.srlri.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_srlri_w_test:
+; CHECK: ld.w
+; CHECK: srlri.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_srlri_w_test
+;
+@llvm_mips_srlri_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srlri_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srlri_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_srlri_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %0, i32 7)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_srlri_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.srlri.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_srlri_d_test:
+; CHECK: ld.d
+; CHECK: srlri.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_srlri_d_test
+;
diff --git a/test/CodeGen/Mips/msa/bitcast.ll b/test/CodeGen/Mips/msa/bitcast.ll
new file mode 100644
index 0000000..8e880ec
--- /dev/null
+++ b/test/CodeGen/Mips/msa/bitcast.ll
@@ -0,0 +1,1210 @@
+; Test the bitcast operation for big-endian and little-endian.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=BIGENDIAN %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=LITENDIAN %s
+
+define void @v16i8_to_v16i8(<16 x i8>* %src, <16 x i8>* %dst) nounwind {
+entry:
+  %0 = load volatile <16 x i8>* %src
+  %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+  %2 = bitcast <16 x i8> %1 to <16 x i8>
+  %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* %dst
+  ret void
+}
+
+; LITENDIAN: v16i8_to_v16i8:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v16i8_to_v16i8
+
+; BIGENDIAN: v16i8_to_v16i8:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.b [[R3]],
+; BIGENDIAN: .size v16i8_to_v16i8
+
+define void @v16i8_to_v8i16(<16 x i8>* %src, <8 x i16>* %dst) nounwind {
+entry:
+  %0 = load volatile <16 x i8>* %src
+  %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+  %2 = bitcast <16 x i8> %1 to <8 x i16>
+  %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* %dst
+  ret void
+}
+
+; LITENDIAN: v16i8_to_v8i16:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v16i8_to_v8i16
+
+; BIGENDIAN: v16i8_to_v8i16:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.h [[R4]],
+; BIGENDIAN: .size v16i8_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v16i8_to_v8f16(<16 x i8>* %src, <8 x half>* %dst) nounwind {
+entry:
+  %0 = load volatile <16 x i8>* %src
+  %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+  %2 = bitcast <16 x i8> %1 to <8 x half>
+  store <8 x half> %2, <8 x half>* %dst
+  ret void
+}
+
+; LITENDIAN: v16i8_to_v8f16:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.b [[R2]],
+; LITENDIAN: .size v16i8_to_v8f16
+
+; BIGENDIAN: v16i8_to_v8f16:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.b [[R2]],
+; BIGENDIAN: .size v16i8_to_v8f16
+
+define void @v16i8_to_v4i32(<16 x i8>* %src, <4 x i32>* %dst) nounwind {
+entry:
+  %0 = load volatile <16 x i8>* %src
+  %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+  %2 = bitcast <16 x i8> %1 to <4 x i32>
+  %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* %dst
+  ret void
+}
+
+; LITENDIAN: v16i8_to_v4i32:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v16i8_to_v4i32
+
+; BIGENDIAN: v16i8_to_v4i32:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v16i8_to_v4i32
+
+define void @v16i8_to_v4f32(<16 x i8>* %src, <4 x float>* %dst) nounwind {
+entry:
+  %0 = load volatile <16 x i8>* %src
+  %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+  %2 = bitcast <16 x i8> %1 to <4 x float>
+  %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+  store <4 x float> %3, <4 x float>* %dst
+  ret void
+}
+
+; LITENDIAN: v16i8_to_v4f32:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v16i8_to_v4f32
+
+; BIGENDIAN: v16i8_to_v4f32:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v16i8_to_v4f32
+
+define void @v16i8_to_v2i64(<16 x i8>* %src, <2 x i64>* %dst) nounwind {
+entry:
+  %0 = load volatile <16 x i8>* %src
+  %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+  %2 = bitcast <16 x i8> %1 to <2 x i64>
+  %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* %dst
+  ret void
+}
+
+; LITENDIAN: v16i8_to_v2i64:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v16i8_to_v2i64
+
+; BIGENDIAN: v16i8_to_v2i64:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R3]], 177
+; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v16i8_to_v2i64
+
+define void @v16i8_to_v2f64(<16 x i8>* %src, <2 x double>* %dst) nounwind {
+entry:
+  %0 = load volatile <16 x i8>* %src
+  %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+  %2 = bitcast <16 x i8> %1 to <2 x double>
+  %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+  store <2 x double> %3, <2 x double>* %dst
+  ret void
+}
+
+; LITENDIAN: v16i8_to_v2f64:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v16i8_to_v2f64
+
+; BIGENDIAN: v16i8_to_v2f64:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R3]], 177
+; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v16i8_to_v2f64
+
+define void @v8i16_to_v16i8(<8 x i16>* %src, <16 x i8>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x i16>* %src
+  %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+  %2 = bitcast <8 x i16> %1 to <16 x i8>
+  %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* %dst
+  ret void
+}
+
+; LITENDIAN: v8i16_to_v16i8:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v8i16_to_v16i8
+
+; BIGENDIAN: v8i16_to_v16i8:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v8i16_to_v16i8
+
+define void @v8i16_to_v8i16(<8 x i16>* %src, <8 x i16>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x i16>* %src
+  %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+  %2 = bitcast <8 x i16> %1 to <8 x i16>
+  %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* %dst
+  ret void
+}
+
+; LITENDIAN: v8i16_to_v8i16:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v8i16_to_v8i16
+
+; BIGENDIAN: v8i16_to_v8i16:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.h [[R3]],
+; BIGENDIAN: .size v8i16_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8i16_to_v8f16(<8 x i16>* %src, <8 x half>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x i16>* %src
+  %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+  %2 = bitcast <8 x i16> %1 to <8 x half>
+  store <8 x half> %2, <8 x half>* %dst
+  ret void
+}
+
+; LITENDIAN: v8i16_to_v8f16:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.h [[R2]],
+; LITENDIAN: .size v8i16_to_v8f16
+
+; BIGENDIAN: v8i16_to_v8f16:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.h [[R2]],
+; BIGENDIAN: .size v8i16_to_v8f16
+
+define void @v8i16_to_v4i32(<8 x i16>* %src, <4 x i32>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x i16>* %src
+  %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+  %2 = bitcast <8 x i16> %1 to <4 x i32>
+  %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* %dst
+  ret void
+}
+
+; LITENDIAN: v8i16_to_v4i32:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v8i16_to_v4i32
+
+; BIGENDIAN: v8i16_to_v4i32:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v8i16_to_v4i32
+
+define void @v8i16_to_v4f32(<8 x i16>* %src, <4 x float>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x i16>* %src
+  %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+  %2 = bitcast <8 x i16> %1 to <4 x float>
+  %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+  store <4 x float> %3, <4 x float>* %dst
+  ret void
+}
+
+; LITENDIAN: v8i16_to_v4f32:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v8i16_to_v4f32
+
+; BIGENDIAN: v8i16_to_v4f32:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v8i16_to_v4f32
+
+define void @v8i16_to_v2i64(<8 x i16>* %src, <2 x i64>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x i16>* %src
+  %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+  %2 = bitcast <8 x i16> %1 to <2 x i64>
+  %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* %dst
+  ret void
+}
+
+; LITENDIAN: v8i16_to_v2i64:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v8i16_to_v2i64
+
+; BIGENDIAN: v8i16_to_v2i64:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v8i16_to_v2i64
+
+define void @v8i16_to_v2f64(<8 x i16>* %src, <2 x double>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x i16>* %src
+  %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+  %2 = bitcast <8 x i16> %1 to <2 x double>
+  %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+  store <2 x double> %3, <2 x double>* %dst
+  ret void
+}
+
+; LITENDIAN: v8i16_to_v2f64:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v8i16_to_v2f64
+
+; BIGENDIAN: v8i16_to_v2f64:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v8i16_to_v2f64
+
+;----
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v16i8(<8 x half>* %src, <16 x i8>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x half>* %src
+  %1 = bitcast <8 x half> %0 to <16 x i8>
+  %2 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %1, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* %dst
+  ret void
+}
+
+; LITENDIAN: v8f16_to_v16i8:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v8f16_to_v16i8
+
+; BIGENDIAN: v8f16_to_v16i8:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R1]], 177
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v8f16_to_v16i8
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v8i16(<8 x half>* %src, <8 x i16>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x half>* %src
+  %1 = bitcast <8 x half> %0 to <8 x i16>
+  %2 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %1, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* %dst
+  ret void
+}
+
+; LITENDIAN: v8f16_to_v8i16:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.h [[R2]],
+; LITENDIAN: .size v8f16_to_v8i16
+
+; BIGENDIAN: v8f16_to_v8i16:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.h [[R2]],
+; BIGENDIAN: .size v8f16_to_v8i16
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v8f16(<8 x half>* %src, <8 x half>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x half>* %src
+  %1 = bitcast <8 x half> %0 to <8 x half>
+  store <8 x half> %1, <8 x half>* %dst
+  ret void
+}
+
+; LITENDIAN: v8f16_to_v8f16:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: st.h [[R1]],
+; LITENDIAN: .size v8f16_to_v8f16
+
+; BIGENDIAN: v8f16_to_v8f16:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: st.h [[R1]],
+; BIGENDIAN: .size v8f16_to_v8f16
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v4i32(<8 x half>* %src, <4 x i32>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x half>* %src
+  %1 = bitcast <8 x half> %0 to <4 x i32>
+  %2 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %1, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* %dst
+  ret void
+}
+
+; LITENDIAN: v8f16_to_v4i32:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.w [[R2]],
+; LITENDIAN: .size v8f16_to_v4i32
+
+; BIGENDIAN: v8f16_to_v4i32:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 177
+; BIGENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v8f16_to_v4i32
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v4f32(<8 x half>* %src, <4 x float>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x half>* %src
+  %1 = bitcast <8 x half> %0 to <4 x float>
+  %2 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %1, <4 x float> %1)
+  store <4 x float> %2, <4 x float>* %dst
+  ret void
+}
+
+; LITENDIAN: v8f16_to_v4f32:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.w [[R2]],
+; LITENDIAN: .size v8f16_to_v4f32
+
+; BIGENDIAN: v8f16_to_v4f32:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 177
+; BIGENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v8f16_to_v4f32
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v2i64(<8 x half>* %src, <2 x i64>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x half>* %src
+  %1 = bitcast <8 x half> %0 to <2 x i64>
+  %2 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %1, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* %dst
+  ret void
+}
+
+; LITENDIAN: v8f16_to_v2i64:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.d [[R2]],
+; LITENDIAN: .size v8f16_to_v2i64
+
+; BIGENDIAN: v8f16_to_v2i64:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 27
+; BIGENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v8f16_to_v2i64
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v2f64(<8 x half>* %src, <2 x double>* %dst) nounwind {
+entry:
+  %0 = load volatile <8 x half>* %src
+  %1 = bitcast <8 x half> %0 to <2 x double>
+  %2 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %1, <2 x double> %1)
+  store <2 x double> %2, <2 x double>* %dst
+  ret void
+}
+
+; LITENDIAN: v8f16_to_v2f64:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.d [[R2]],
+; LITENDIAN: .size v8f16_to_v2f64
+
+; BIGENDIAN: v8f16_to_v2f64:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 27
+; BIGENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v8f16_to_v2f64
+;----
+
+define void @v4i32_to_v16i8(<4 x i32>* %src, <16 x i8>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x i32>* %src
+  %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+  %2 = bitcast <4 x i32> %1 to <16 x i8>
+  %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* %dst
+  ret void
+}
+
+; LITENDIAN: v4i32_to_v16i8:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v4i32_to_v16i8
+
+; BIGENDIAN: v4i32_to_v16i8:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v4i32_to_v16i8
+
+define void @v4i32_to_v8i16(<4 x i32>* %src, <8 x i16>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x i32>* %src
+  %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+  %2 = bitcast <4 x i32> %1 to <8 x i16>
+  %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* %dst
+  ret void
+}
+
+; LITENDIAN: v4i32_to_v8i16:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v4i32_to_v8i16
+
+; BIGENDIAN: v4i32_to_v8i16:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.h [[R4]],
+; BIGENDIAN: .size v4i32_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v4i32_to_v8f16(<4 x i32>* %src, <8 x half>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x i32>* %src
+  %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+  %2 = bitcast <4 x i32> %1 to <8 x half>
+  store <8 x half> %2, <8 x half>* %dst
+  ret void
+}
+
+; LITENDIAN: v4i32_to_v8f16:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.w [[R2]],
+; LITENDIAN: .size v4i32_to_v8f16
+
+; BIGENDIAN: v4i32_to_v8f16:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.w [[R2]],
+; BIGENDIAN: .size v4i32_to_v8f16
+
+define void @v4i32_to_v4i32(<4 x i32>* %src, <4 x i32>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x i32>* %src
+  %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+  %2 = bitcast <4 x i32> %1 to <4 x i32>
+  %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* %dst
+  ret void
+}
+
+; LITENDIAN: v4i32_to_v4i32:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v4i32_to_v4i32
+
+; BIGENDIAN: v4i32_to_v4i32:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v4i32_to_v4i32
+
+define void @v4i32_to_v4f32(<4 x i32>* %src, <4 x float>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x i32>* %src
+  %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+  %2 = bitcast <4 x i32> %1 to <4 x float>
+  %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+  store <4 x float> %3, <4 x float>* %dst
+  ret void
+}
+
+; LITENDIAN: v4i32_to_v4f32:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v4i32_to_v4f32
+
+; BIGENDIAN: v4i32_to_v4f32:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v4i32_to_v4f32
+
+define void @v4i32_to_v2i64(<4 x i32>* %src, <2 x i64>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x i32>* %src
+  %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+  %2 = bitcast <4 x i32> %1 to <2 x i64>
+  %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* %dst
+  ret void
+}
+
+; LITENDIAN: v4i32_to_v2i64:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v4i32_to_v2i64
+
+; BIGENDIAN: v4i32_to_v2i64:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v4i32_to_v2i64
+
+define void @v4i32_to_v2f64(<4 x i32>* %src, <2 x double>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x i32>* %src
+  %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+  %2 = bitcast <4 x i32> %1 to <2 x double>
+  %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+  store <2 x double> %3, <2 x double>* %dst
+  ret void
+}
+
+; LITENDIAN: v4i32_to_v2f64:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v4i32_to_v2f64
+
+; BIGENDIAN: v4i32_to_v2f64:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v4i32_to_v2f64
+
+define void @v4f32_to_v16i8(<4 x float>* %src, <16 x i8>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x float>* %src
+  %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+  %2 = bitcast <4 x float> %1 to <16 x i8>
+  %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* %dst
+  ret void
+}
+
+; LITENDIAN: v4f32_to_v16i8:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v4f32_to_v16i8
+
+; BIGENDIAN: v4f32_to_v16i8:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v4f32_to_v16i8
+
+define void @v4f32_to_v8i16(<4 x float>* %src, <8 x i16>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x float>* %src
+  %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+  %2 = bitcast <4 x float> %1 to <8 x i16>
+  %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* %dst
+  ret void
+}
+
+; LITENDIAN: v4f32_to_v8i16:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v4f32_to_v8i16
+
+; BIGENDIAN: v4f32_to_v8i16:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.h [[R4]],
+; BIGENDIAN: .size v4f32_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v4f32_to_v8f16(<4 x float>* %src, <8 x half>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x float>* %src
+  %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+  %2 = bitcast <4 x float> %1 to <8 x half>
+  store <8 x half> %2, <8 x half>* %dst
+  ret void
+}
+
+; LITENDIAN: v4f32_to_v8f16:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.w [[R2]],
+; LITENDIAN: .size v4f32_to_v8f16
+
+; BIGENDIAN: v4f32_to_v8f16:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.w [[R2]],
+; BIGENDIAN: .size v4f32_to_v8f16
+
+define void @v4f32_to_v4i32(<4 x float>* %src, <4 x i32>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x float>* %src
+  %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+  %2 = bitcast <4 x float> %1 to <4 x i32>
+  %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* %dst
+  ret void
+}
+
+; LITENDIAN: v4f32_to_v4i32:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v4f32_to_v4i32
+
+; BIGENDIAN: v4f32_to_v4i32:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v4f32_to_v4i32
+
+define void @v4f32_to_v4f32(<4 x float>* %src, <4 x float>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x float>* %src
+  %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+  %2 = bitcast <4 x float> %1 to <4 x float>
+  %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+  store <4 x float> %3, <4 x float>* %dst
+  ret void
+}
+
+; LITENDIAN: v4f32_to_v4f32:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v4f32_to_v4f32
+
+; BIGENDIAN: v4f32_to_v4f32:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v4f32_to_v4f32
+
+define void @v4f32_to_v2i64(<4 x float>* %src, <2 x i64>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x float>* %src
+  %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+  %2 = bitcast <4 x float> %1 to <2 x i64>
+  %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* %dst
+  ret void
+}
+
+; LITENDIAN: v4f32_to_v2i64:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v4f32_to_v2i64
+
+; BIGENDIAN: v4f32_to_v2i64:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v4f32_to_v2i64
+
+define void @v4f32_to_v2f64(<4 x float>* %src, <2 x double>* %dst) nounwind {
+entry:
+  %0 = load volatile <4 x float>* %src
+  %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+  %2 = bitcast <4 x float> %1 to <2 x double>
+  %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+  store <2 x double> %3, <2 x double>* %dst
+  ret void
+}
+
+; LITENDIAN: v4f32_to_v2f64:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v4f32_to_v2f64
+
+; BIGENDIAN: v4f32_to_v2f64:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v4f32_to_v2f64
+
+define void @v2i64_to_v16i8(<2 x i64>* %src, <16 x i8>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x i64>* %src
+  %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+  %2 = bitcast <2 x i64> %1 to <16 x i8>
+  %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* %dst
+  ret void
+}
+
+; LITENDIAN: v2i64_to_v16i8:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v2i64_to_v16i8
+
+; BIGENDIAN: v2i64_to_v16i8:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R3]], 177
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v2i64_to_v16i8
+
+define void @v2i64_to_v8i16(<2 x i64>* %src, <8 x i16>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x i64>* %src
+  %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+  %2 = bitcast <2 x i64> %1 to <8 x i16>
+  %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* %dst
+  ret void
+}
+
+; LITENDIAN: v2i64_to_v8i16:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v2i64_to_v8i16
+
+; BIGENDIAN: v2i64_to_v8i16:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.h [[R4]],
+; BIGENDIAN: .size v2i64_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v2i64_to_v8f16(<2 x i64>* %src, <8 x half>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x i64>* %src
+  %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+  %2 = bitcast <2 x i64> %1 to <8 x half>
+  store <8 x half> %2, <8 x half>* %dst
+  ret void
+}
+
+; LITENDIAN: v2i64_to_v8f16:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.d [[R2]],
+; LITENDIAN: .size v2i64_to_v8f16
+
+; BIGENDIAN: v2i64_to_v8f16:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.d [[R2]],
+; BIGENDIAN: .size v2i64_to_v8f16
+
+define void @v2i64_to_v4i32(<2 x i64>* %src, <4 x i32>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x i64>* %src
+  %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+  %2 = bitcast <2 x i64> %1 to <4 x i32>
+  %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* %dst
+  ret void
+}
+
+; LITENDIAN: v2i64_to_v4i32:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v2i64_to_v4i32
+
+; BIGENDIAN: v2i64_to_v4i32:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v2i64_to_v4i32
+
+define void @v2i64_to_v4f32(<2 x i64>* %src, <4 x float>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x i64>* %src
+  %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+  %2 = bitcast <2 x i64> %1 to <4 x float>
+  %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+  store <4 x float> %3, <4 x float>* %dst
+  ret void
+}
+
+; LITENDIAN: v2i64_to_v4f32:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v2i64_to_v4f32
+
+; BIGENDIAN: v2i64_to_v4f32:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v2i64_to_v4f32
+
+define void @v2i64_to_v2i64(<2 x i64>* %src, <2 x i64>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x i64>* %src
+  %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+  %2 = bitcast <2 x i64> %1 to <2 x i64>
+  %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* %dst
+  ret void
+}
+
+; LITENDIAN: v2i64_to_v2i64:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v2i64_to_v2i64
+
+; BIGENDIAN: v2i64_to_v2i64:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v2i64_to_v2i64
+
+define void @v2i64_to_v2f64(<2 x i64>* %src, <2 x double>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x i64>* %src
+  %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+  %2 = bitcast <2 x i64> %1 to <2 x double>
+  %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+  store <2 x double> %3, <2 x double>* %dst
+  ret void
+}
+
+; LITENDIAN: v2i64_to_v2f64:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v2i64_to_v2f64
+
+; BIGENDIAN: v2i64_to_v2f64:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v2i64_to_v2f64
+
+define void @v2f64_to_v16i8(<2 x double>* %src, <16 x i8>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x double>* %src
+  %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+  %2 = bitcast <2 x double> %1 to <16 x i8>
+  %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+  store <16 x i8> %3, <16 x i8>* %dst
+  ret void
+}
+
+; LITENDIAN: v2f64_to_v16i8:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v2f64_to_v16i8
+
+; BIGENDIAN: v2f64_to_v16i8:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R3]], 177
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v2f64_to_v16i8
+
+define void @v2f64_to_v8i16(<2 x double>* %src, <8 x i16>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x double>* %src
+  %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+  %2 = bitcast <2 x double> %1 to <8 x i16>
+  %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+  store <8 x i16> %3, <8 x i16>* %dst
+  ret void
+}
+
+; LITENDIAN: v2f64_to_v8i16:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v2f64_to_v8i16
+
+; BIGENDIAN: v2f64_to_v8i16:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.h [[R4]],
+; BIGENDIAN: .size v2f64_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v2f64_to_v8f16(<2 x double>* %src, <8 x half>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x double>* %src
+  %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+  %2 = bitcast <2 x double> %1 to <8 x half>
+  store <8 x half> %2, <8 x half>* %dst
+  ret void
+}
+
+; LITENDIAN: v2f64_to_v8f16:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.d [[R2]],
+; LITENDIAN: .size v2f64_to_v8f16
+
+; BIGENDIAN: v2f64_to_v8f16:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.d [[R2]],
+; BIGENDIAN: .size v2f64_to_v8f16
+
+define void @v2f64_to_v4i32(<2 x double>* %src, <4 x i32>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x double>* %src
+  %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+  %2 = bitcast <2 x double> %1 to <4 x i32>
+  %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+  store <4 x i32> %3, <4 x i32>* %dst
+  ret void
+}
+
+; LITENDIAN: v2f64_to_v4i32:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v2f64_to_v4i32
+
+; BIGENDIAN: v2f64_to_v4i32:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v2f64_to_v4i32
+
+define void @v2f64_to_v4f32(<2 x double>* %src, <4 x float>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x double>* %src
+  %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+  %2 = bitcast <2 x double> %1 to <4 x float>
+  %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+  store <4 x float> %3, <4 x float>* %dst
+  ret void
+}
+
+; LITENDIAN: v2f64_to_v4f32:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v2f64_to_v4f32
+
+; BIGENDIAN: v2f64_to_v4f32:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v2f64_to_v4f32
+
+define void @v2f64_to_v2i64(<2 x double>* %src, <2 x i64>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x double>* %src
+  %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+  %2 = bitcast <2 x double> %1 to <2 x i64>
+  %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+  store <2 x i64> %3, <2 x i64>* %dst
+  ret void
+}
+
+; LITENDIAN: v2f64_to_v2i64:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v2f64_to_v2i64
+
+; BIGENDIAN: v2f64_to_v2i64:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v2f64_to_v2i64
+
+define void @v2f64_to_v2f64(<2 x double>* %src, <2 x double>* %dst) nounwind {
+entry:
+  %0 = load volatile <2 x double>* %src
+  %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+  %2 = bitcast <2 x double> %1 to <2 x double>
+  %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+  store <2 x double> %3, <2 x double>* %dst
+  ret void
+}
+
+; LITENDIAN: v2f64_to_v2f64:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v2f64_to_v2f64
+
+; BIGENDIAN: v2f64_to_v2f64:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v2f64_to_v2f64
+
+declare <16 x i8> @llvm.mips.addv.b(<16 x i8>, <16 x i8>) nounwind
+declare <8 x i16> @llvm.mips.addv.h(<8 x i16>, <8 x i16>) nounwind
+declare <4 x i32> @llvm.mips.addv.w(<4 x i32>, <4 x i32>) nounwind
+declare <2 x i64> @llvm.mips.addv.d(<2 x i64>, <2 x i64>) nounwind
+declare <4 x float> @llvm.mips.fadd.w(<4 x float>, <4 x float>) nounwind
+declare <2 x double> @llvm.mips.fadd.d(<2 x double>, <2 x double>) nounwind
diff --git a/test/CodeGen/Mips/msa/bitwise.ll b/test/CodeGen/Mips/msa/bitwise.ll
new file mode 100644
index 0000000..9a88c47
--- /dev/null
+++ b/test/CodeGen/Mips/msa/bitwise.ll
@@ -0,0 +1,1639 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @and_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: and_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <16 x i8> %1, %2
+  ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size and_v16i8
+}
+
+define void @and_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: and_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <8 x i16> %1, %2
+  ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size and_v8i16
+}
+
+define void @and_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: and_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <4 x i32> %1, %2
+  ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size and_v4i32
+}
+
+define void @and_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: and_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <2 x i64> %1, %2
+  ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size and_v2i64
+}
+
+define void @and_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: and_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = and <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: andi.b [[R4:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size and_v16i8_i
+}
+
+define void @and_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: and_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = and <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: and.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size and_v8i16_i
+}
+
+define void @and_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: and_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = and <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: and.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size and_v4i32_i
+}
+
+define void @and_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: and_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = and <2 x i64> %1, <i64 1, i64 1>
+  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: and.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size and_v2i64_i
+}
+
+define void @or_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: or_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = or <16 x i8> %1, %2
+  ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size or_v16i8
+}
+
+define void @or_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: or_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = or <8 x i16> %1, %2
+  ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size or_v8i16
+}
+
+define void @or_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: or_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = or <4 x i32> %1, %2
+  ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size or_v4i32
+}
+
+define void @or_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: or_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = or <2 x i64> %1, %2
+  ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size or_v2i64
+}
+
+define void @or_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: or_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <16 x i8> %1, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ; CHECK-DAG: ori.b [[R4:\$w[0-9]+]], [[R1]], 3
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size or_v16i8_i
+}
+
+define void @or_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: or_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 3
+  ; CHECK-DAG: or.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size or_v8i16_i
+}
+
+define void @or_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: or_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 3
+  ; CHECK-DAG: or.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size or_v4i32_i
+}
+
+define void @or_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: or_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <2 x i64> %1, <i64 3, i64 3>
+  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 3
+  ; CHECK-DAG: or.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size or_v2i64_i
+}
+
+define void @nor_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: nor_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = or <16 x i8> %1, %2
+  %4 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size nor_v16i8
+}
+
+define void @nor_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: nor_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = or <8 x i16> %1, %2
+  %4 = xor <8 x i16> %3, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size nor_v8i16
+}
+
+define void @nor_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: nor_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = or <4 x i32> %1, %2
+  %4 = xor <4 x i32> %3, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size nor_v4i32
+}
+
+define void @nor_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: nor_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = or <2 x i64> %1, %2
+  %4 = xor <2 x i64> %3, <i64 -1, i64 -1>
+  ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size nor_v2i64
+}
+
+define void @nor_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: nor_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  ; CHECK-DAG: ori.b [[R4:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size nor_v16i8_i
+}
+
+define void @nor_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: nor_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = xor <8 x i16> %2, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: nor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size nor_v8i16_i
+}
+
+define void @nor_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: nor_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = xor <4 x i32> %2, <i32 -1, i32 -1, i32 -1, i32 -1>
+  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: nor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size nor_v4i32_i
+}
+
+define void @nor_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: nor_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <2 x i64> %1, <i64 1, i64 1>
+  %3 = xor <2 x i64> %2, <i64 -1, i64 -1>
+  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: nor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size nor_v2i64_i
+}
+
+define void @xor_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: xor_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = xor <16 x i8> %1, %2
+  ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size xor_v16i8
+}
+
+define void @xor_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: xor_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = xor <8 x i16> %1, %2
+  ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size xor_v8i16
+}
+
+define void @xor_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: xor_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = xor <4 x i32> %1, %2
+  ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size xor_v4i32
+}
+
+define void @xor_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: xor_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = xor <2 x i64> %1, %2
+  ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size xor_v2i64
+}
+
+define void @xor_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: xor_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <16 x i8> %1, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ; CHECK-DAG: xori.b [[R4:\$w[0-9]+]], [[R1]], 3
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size xor_v16i8_i
+}
+
+define void @xor_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: xor_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 3
+  ; CHECK-DAG: xor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size xor_v8i16_i
+}
+
+define void @xor_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: xor_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 3
+  ; CHECK-DAG: xor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size xor_v4i32_i
+}
+
+define void @xor_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: xor_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <2 x i64> %1, <i64 3, i64 3>
+  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 3
+  ; CHECK-DAG: xor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size xor_v2i64_i
+}
+
+define void @sll_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: sll_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <16 x i8> %1, %2
+  ; CHECK-DAG: sll.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sll_v16i8
+}
+
+define void @sll_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: sll_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <8 x i16> %1, %2
+  ; CHECK-DAG: sll.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sll_v8i16
+}
+
+define void @sll_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: sll_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <4 x i32> %1, %2
+  ; CHECK-DAG: sll.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sll_v4i32
+}
+
+define void @sll_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: sll_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <2 x i64> %1, %2
+  ; CHECK-DAG: sll.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sll_v2i64
+}
+
+define void @sll_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: sll_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = shl <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: slli.b [[R4:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size sll_v16i8_i
+}
+
+define void @sll_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: sll_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = shl <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: slli.h [[R4:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size sll_v8i16_i
+}
+
+define void @sll_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: sll_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: slli.w [[R4:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size sll_v4i32_i
+}
+
+define void @sll_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: sll_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = shl <2 x i64> %1, <i64 1, i64 1>
+  ; CHECK-DAG: slli.d [[R4:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size sll_v2i64_i
+}
+
+define void @sra_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: sra_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = ashr <16 x i8> %1, %2
+  ; CHECK-DAG: sra.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sra_v16i8
+}
+
+define void @sra_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: sra_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = ashr <8 x i16> %1, %2
+  ; CHECK-DAG: sra.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sra_v8i16
+}
+
+define void @sra_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: sra_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = ashr <4 x i32> %1, %2
+  ; CHECK-DAG: sra.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sra_v4i32
+}
+
+define void @sra_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: sra_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = ashr <2 x i64> %1, %2
+  ; CHECK-DAG: sra.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size sra_v2i64
+}
+
+define void @sra_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: sra_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = ashr <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: srai.b [[R4:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size sra_v16i8_i
+}
+
+define void @sra_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: sra_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = ashr <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: srai.h [[R4:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size sra_v8i16_i
+}
+
+define void @sra_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: sra_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = ashr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: srai.w [[R4:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size sra_v4i32_i
+}
+
+define void @sra_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: sra_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = ashr <2 x i64> %1, <i64 1, i64 1>
+  ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size sra_v2i64_i
+}
+
+define void @srl_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: srl_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = lshr <16 x i8> %1, %2
+  ; CHECK-DAG: srl.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size srl_v16i8
+}
+
+define void @srl_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: srl_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = lshr <8 x i16> %1, %2
+  ; CHECK-DAG: srl.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size srl_v8i16
+}
+
+define void @srl_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: srl_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = lshr <4 x i32> %1, %2
+  ; CHECK-DAG: srl.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size srl_v4i32
+}
+
+define void @srl_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: srl_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = lshr <2 x i64> %1, %2
+  ; CHECK-DAG: srl.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size srl_v2i64
+}
+
+define void @srl_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: srl_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = lshr <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: srli.b [[R4:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size srl_v16i8_i
+}
+
+define void @srl_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: srl_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = lshr <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: srli.h [[R4:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size srl_v8i16_i
+}
+
+define void @srl_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: srl_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: srli.w [[R4:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size srl_v4i32_i
+}
+
+define void @srl_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: srl_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = lshr <2 x i64> %1, <i64 1, i64 1>
+  ; CHECK-DAG: srli.d [[R4:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size srl_v2i64_i
+}
+
+define void @ctpop_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: ctpop_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <16 x i8> @llvm.ctpop.v16i8 (<16 x i8> %1)
+  ; CHECK-DAG: pcnt.b [[R3:\$w[0-9]+]], [[R1]]
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ctpop_v16i8
+}
+
+define void @ctpop_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: ctpop_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <8 x i16> @llvm.ctpop.v8i16 (<8 x i16> %1)
+  ; CHECK-DAG: pcnt.h [[R3:\$w[0-9]+]], [[R1]]
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ctpop_v8i16
+}
+
+define void @ctpop_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: ctpop_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <4 x i32> @llvm.ctpop.v4i32 (<4 x i32> %1)
+  ; CHECK-DAG: pcnt.w [[R3:\$w[0-9]+]], [[R1]]
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ctpop_v4i32
+}
+
+define void @ctpop_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: ctpop_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <2 x i64> @llvm.ctpop.v2i64 (<2 x i64> %1)
+  ; CHECK-DAG: pcnt.d [[R3:\$w[0-9]+]], [[R1]]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ctpop_v2i64
+}
+
+define void @ctlz_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: ctlz_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <16 x i8> @llvm.ctlz.v16i8 (<16 x i8> %1)
+  ; CHECK-DAG: nlzc.b [[R3:\$w[0-9]+]], [[R1]]
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ctlz_v16i8
+}
+
+define void @ctlz_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: ctlz_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <8 x i16> @llvm.ctlz.v8i16 (<8 x i16> %1)
+  ; CHECK-DAG: nlzc.h [[R3:\$w[0-9]+]], [[R1]]
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ctlz_v8i16
+}
+
+define void @ctlz_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: ctlz_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <4 x i32> @llvm.ctlz.v4i32 (<4 x i32> %1)
+  ; CHECK-DAG: nlzc.w [[R3:\$w[0-9]+]], [[R1]]
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ctlz_v4i32
+}
+
+define void @ctlz_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: ctlz_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = tail call <2 x i64> @llvm.ctlz.v2i64 (<2 x i64> %1)
+  ; CHECK-DAG: nlzc.d [[R3:\$w[0-9]+]], [[R1]]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ctlz_v2i64
+}
+
+define void @bsel_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b, <16 x i8>* %m) nounwind {
+  ; CHECK: bsel_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <16 x i8>* %m
+  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
+  %4 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1,
+                          i8 -1, i8 -1, i8 -1, i8 -1,
+                          i8 -1, i8 -1, i8 -1, i8 -1,
+                          i8 -1, i8 -1, i8 -1, i8 -1>
+  %5 = and <16 x i8> %1, %3
+  %6 = and <16 x i8> %2, %4
+  %7 = or <16 x i8> %5, %6
+  ; bmnz is the same operation
+  ; CHECK-DAG: bmnz.v [[R1]], [[R2]], [[R3]]
+  store <16 x i8> %7, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R1]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_v16i8
+}
+
+define void @bsel_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %m) nounwind {
+  ; CHECK: bsel_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %m
+  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($6)
+  %3 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1,
+                          i8 -1, i8 -1, i8 -1, i8 -1,
+                          i8 -1, i8 -1, i8 -1, i8 -1,
+                          i8 -1, i8 -1, i8 -1, i8 -1>
+  %4 = and <16 x i8> %1, %3
+  %5 = and <16 x i8> <i8 6, i8 6, i8 6, i8 6,
+                      i8 6, i8 6, i8 6, i8 6,
+                      i8 6, i8 6, i8 6, i8 6,
+                      i8 6, i8 6, i8 6, i8 6>, %2
+  %6 = or <16 x i8> %4, %5
+  ; CHECK-DAG: bseli.b [[R3]], [[R1]], 6
+  store <16 x i8> %6, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_v16i8_i
+}
+
+define void @bsel_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: bsel_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <8 x i16> %1, <i16 6, i16 6, i16 6, i16 6,
+                          i16 6, i16 6, i16 6, i16 6>
+  %4 = and <8 x i16> %2, <i16 65529, i16 65529, i16 65529, i16 65529,
+                          i16 65529, i16 65529, i16 65529, i16 65529>
+  %5 = or <8 x i16> %3, %4
+  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 6
+  ; CHECK-DAG: bsel.v [[R3]], [[R2]], [[R1]]
+  store <8 x i16> %5, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_v8i16
+}
+
+define void @bsel_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: bsel_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <4 x i32> %1, <i32 6, i32 6, i32 6, i32 6>
+  %4 = and <4 x i32> %2, <i32 4294967289, i32 4294967289, i32 4294967289, i32 4294967289>
+  %5 = or <4 x i32> %3, %4
+  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 6
+  ; CHECK-DAG: bsel.v [[R3]], [[R2]], [[R1]]
+  store <4 x i32> %5, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_v4i32
+}
+
+define void @bsel_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: bsel_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <2 x i64> %1, <i64 6, i64 6>
+  %4 = and <2 x i64> %2, <i64 18446744073709551609, i64 18446744073709551609>
+  %5 = or <2 x i64> %3, %4
+  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 6
+  ; CHECK-DAG: bsel.v [[R3]], [[R2]], [[R1]]
+  store <2 x i64> %5, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_v2i64
+}
+
+define void @binsl_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: binsl_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <16 x i8> %1, <i8 192, i8 192, i8 192, i8 192,
+                          i8 192, i8 192, i8 192, i8 192,
+                          i8 192, i8 192, i8 192, i8 192,
+                          i8 192, i8 192, i8 192, i8 192>
+  %4 = and <16 x i8> %2, <i8 63, i8 63, i8 63, i8 63,
+                          i8 63, i8 63, i8 63, i8 63,
+                          i8 63, i8 63, i8 63, i8 63,
+                          i8 63, i8 63, i8 63, i8 63>
+  %5 = or <16 x i8> %3, %4
+  ; CHECK-DAG: binsli.b [[R2]], [[R1]], 2
+  store <16 x i8> %5, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R2]], 0($4)
+
+  ret void
+  ; CHECK: .size binsl_v16i8_i
+}
+
+define void @binsl_v8i16_i(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: binsl_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <8 x i16> %1, <i16 49152, i16 49152, i16 49152, i16 49152,
+                          i16 49152, i16 49152, i16 49152, i16 49152>
+  %4 = and <8 x i16> %2, <i16 16383, i16 16383, i16 16383, i16 16383,
+                          i16 16383, i16 16383, i16 16383, i16 16383>
+  %5 = or <8 x i16> %3, %4
+  ; CHECK-DAG: binsli.h [[R2]], [[R1]], 2
+  store <8 x i16> %5, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R2]], 0($4)
+
+  ret void
+  ; CHECK: .size binsl_v8i16_i
+}
+
+define void @binsl_v4i32_i(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: binsl_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <4 x i32> %1, <i32 3221225472, i32 3221225472, i32 3221225472, i32 3221225472>
+  %4 = and <4 x i32> %2, <i32 1073741823, i32 1073741823, i32 1073741823, i32 1073741823>
+  %5 = or <4 x i32> %3, %4
+  ; CHECK-DAG: binsli.w [[R2]], [[R1]], 2
+  store <4 x i32> %5, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R2]], 0($4)
+
+  ret void
+  ; CHECK: .size binsl_v4i32_i
+}
+
+define void @binsl_v2i64_i(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: binsl_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <2 x i64> %1, <i64 18446744073709551608, i64 18446744073709551608>
+  %4 = and <2 x i64> %2, <i64 7, i64 7>
+  %5 = or <2 x i64> %3, %4
+  ; TODO: We use a particularly wide mask here to work around a legalization
+  ;       issue. If the mask doesn't fit within a 10-bit immediate, it gets
+  ;       legalized into a constant pool. We should add a test to cover the
+  ;       other cases once they correctly select binsli.d.
+  ; CHECK-DAG: binsli.d [[R2]], [[R1]], 61
+  store <2 x i64> %5, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R2]], 0($4)
+
+  ret void
+  ; CHECK: .size binsl_v2i64_i
+}
+
+define void @binsr_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: binsr_v16i8_i:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <16 x i8> %1, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+                          i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  %4 = and <16 x i8> %2, <i8 252, i8 252, i8 252, i8 252,
+                          i8 252, i8 252, i8 252, i8 252,
+                          i8 252, i8 252, i8 252, i8 252,
+                          i8 252, i8 252, i8 252, i8 252>
+  %5 = or <16 x i8> %3, %4
+  ; CHECK-DAG: binsri.b [[R2]], [[R1]], 2
+  store <16 x i8> %5, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R2]], 0($4)
+
+  ret void
+  ; CHECK: .size binsr_v16i8_i
+}
+
+define void @binsr_v8i16_i(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: binsr_v8i16_i:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3,
+                          i16 3, i16 3, i16 3, i16 3>
+  %4 = and <8 x i16> %2, <i16 65532, i16 65532, i16 65532, i16 65532,
+                          i16 65532, i16 65532, i16 65532, i16 65532>
+  %5 = or <8 x i16> %3, %4
+  ; CHECK-DAG: binsri.h [[R2]], [[R1]], 2
+  store <8 x i16> %5, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R2]], 0($4)
+
+  ret void
+  ; CHECK: .size binsr_v8i16_i
+}
+
+define void @binsr_v4i32_i(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: binsr_v4i32_i:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+  %4 = and <4 x i32> %2, <i32 4294967292, i32 4294967292, i32 4294967292, i32 4294967292>
+  %5 = or <4 x i32> %3, %4
+  ; CHECK-DAG: binsri.w [[R2]], [[R1]], 2
+  store <4 x i32> %5, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R2]], 0($4)
+
+  ret void
+  ; CHECK: .size binsr_v4i32_i
+}
+
+define void @binsr_v2i64_i(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: binsr_v2i64_i:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = and <2 x i64> %1, <i64 3, i64 3>
+  %4 = and <2 x i64> %2, <i64 18446744073709551612, i64 18446744073709551612>
+  %5 = or <2 x i64> %3, %4
+  ; CHECK-DAG: binsri.d [[R2]], [[R1]], 2
+  store <2 x i64> %5, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R2]], 0($4)
+
+  ret void
+  ; CHECK: .size binsr_v2i64_i
+}
+
+define void @bclr_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: bclr_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
+  %4 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %5 = and <16 x i8> %1, %4
+  ; CHECK-DAG: bclr.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %5, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bclr_v16i8
+}
+
+define void @bclr_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: bclr_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %2
+  %4 = xor <8 x i16> %3, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %5 = and <8 x i16> %1, %4
+  ; CHECK-DAG: bclr.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %5, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bclr_v8i16
+}
+
+define void @bclr_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: bclr_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %2
+  %4 = xor <4 x i32> %3, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %5 = and <4 x i32> %1, %4
+  ; CHECK-DAG: bclr.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %5, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bclr_v4i32
+}
+
+define void @bclr_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: bclr_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <2 x i64> <i64 1, i64 1>, %2
+  %4 = xor <2 x i64> %3, <i64 -1, i64 -1>
+  %5 = and <2 x i64> %1, %4
+  ; CHECK-DAG: bclr.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %5, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bclr_v2i64
+}
+
+define void @bset_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: bset_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
+  %4 = or <16 x i8> %1, %3
+  ; CHECK-DAG: bset.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bset_v16i8
+}
+
+define void @bset_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: bset_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %2
+  %4 = or <8 x i16> %1, %3
+  ; CHECK-DAG: bset.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bset_v8i16
+}
+
+define void @bset_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: bset_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %2
+  %4 = or <4 x i32> %1, %3
+  ; CHECK-DAG: bset.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bset_v4i32
+}
+
+define void @bset_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: bset_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <2 x i64> <i64 1, i64 1>, %2
+  %4 = or <2 x i64> %1, %3
+  ; CHECK-DAG: bset.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bset_v2i64
+}
+
+define void @bneg_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: bneg_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
+  %4 = xor <16 x i8> %1, %3
+  ; CHECK-DAG: bneg.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bneg_v16i8
+}
+
+define void @bneg_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: bneg_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %2
+  %4 = xor <8 x i16> %1, %3
+  ; CHECK-DAG: bneg.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bneg_v8i16
+}
+
+define void @bneg_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: bneg_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %2
+  %4 = xor <4 x i32> %1, %3
+  ; CHECK-DAG: bneg.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bneg_v4i32
+}
+
+define void @bneg_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: bneg_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shl <2 x i64> <i64 1, i64 1>, %2
+  %4 = xor <2 x i64> %1, %3
+  ; CHECK-DAG: bneg.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bneg_v2i64
+}
+
+define void @bclri_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: bclri_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <16 x i8> <i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8, i8  8>,
+                     <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %3 = and <16 x i8> %1, %2
+  ; bclri.b and andi.b are exactly equivalent.
+  ; CHECK-DAG: andi.b [[R3:\$w[0-9]+]], [[R1]], 247
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bclri_v16i8
+}
+
+define void @bclri_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: bclri_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <8 x i16> <i16  8, i16  8, i16  8, i16  8, i16  8, i16  8, i16  8, i16  8>,
+                     <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %3 = and <8 x i16> %1, %2
+  ; CHECK-DAG: bclri.h [[R3:\$w[0-9]+]], [[R1]], 3
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bclri_v8i16
+}
+
+define void @bclri_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: bclri_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <4 x i32> <i32  8, i32  8, i32  8, i32  8>,
+                     <i32 -1, i32 -1, i32 -1, i32 -1>
+  %3 = and <4 x i32> %1, %2
+  ; CHECK-DAG: bclri.w [[R3:\$w[0-9]+]], [[R1]], 3
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bclri_v4i32
+}
+
+define void @bclri_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: bclri_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <2 x i64> <i64  8, i64  8>,
+                     <i64 -1, i64 -1>
+  %3 = and <2 x i64> %1, %2
+  ; CHECK-DAG: bclri.d [[R3:\$w[0-9]+]], [[R1]], 3
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bclri_v2i64
+}
+
+define void @bseti_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: bseti_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <16 x i8> %1, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+  ; CHECK-DAG: bseti.b [[R3:\$w[0-9]+]], [[R1]], 3
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bseti_v16i8
+}
+
+define void @bseti_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: bseti_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  ; CHECK-DAG: bseti.h [[R3:\$w[0-9]+]], [[R1]], 3
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bseti_v8i16
+}
+
+define void @bseti_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: bseti_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+  ; CHECK-DAG: bseti.w [[R3:\$w[0-9]+]], [[R1]], 3
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bseti_v4i32
+}
+
+define void @bseti_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: bseti_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = or <2 x i64> %1, <i64 8, i64 8>
+  ; CHECK-DAG: bseti.d [[R3:\$w[0-9]+]], [[R1]], 3
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bseti_v2i64
+}
+
+define void @bnegi_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: bnegi_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <16 x i8> %1, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+  ; CHECK-DAG: bnegi.b [[R3:\$w[0-9]+]], [[R1]], 3
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bnegi_v16i8
+}
+
+define void @bnegi_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: bnegi_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  ; CHECK-DAG: bnegi.h [[R3:\$w[0-9]+]], [[R1]], 3
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bnegi_v8i16
+}
+
+define void @bnegi_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: bnegi_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+  ; CHECK-DAG: bnegi.w [[R3:\$w[0-9]+]], [[R1]], 3
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bnegi_v4i32
+}
+
+define void @bnegi_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: bnegi_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = xor <2 x i64> %1, <i64 8, i64 8>
+  ; CHECK-DAG: bnegi.d [[R3:\$w[0-9]+]], [[R1]], 3
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bnegi_v2i64
+}
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %val)
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val)
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val)
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val)
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %val)
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %val)
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val)
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %val)
diff --git a/test/CodeGen/Mips/msa/compare.ll b/test/CodeGen/Mips/msa/compare.ll
new file mode 100644
index 0000000..6408d7b
--- /dev/null
+++ b/test/CodeGen/Mips/msa/compare.ll
@@ -0,0 +1,2079 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @ceq_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: ceq_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp eq <16 x i8> %1, %2
+  %4 = sext <16 x i1> %3 to <16 x i8>
+  ; CHECK-DAG: ceq.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ceq_v16i8
+}
+
+define void @ceq_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: ceq_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp eq <8 x i16> %1, %2
+  %4 = sext <8 x i1> %3 to <8 x i16>
+  ; CHECK-DAG: ceq.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ceq_v8i16
+}
+
+define void @ceq_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: ceq_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp eq <4 x i32> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: ceq.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ceq_v4i32
+}
+
+define void @ceq_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: ceq_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp eq <2 x i64> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: ceq.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ceq_v2i64
+}
+
+define void @cle_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: cle_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sle <16 x i8> %1, %2
+  %4 = sext <16 x i1> %3 to <16 x i8>
+  ; CHECK-DAG: cle_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cle_s_v16i8
+}
+
+define void @cle_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: cle_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sle <8 x i16> %1, %2
+  %4 = sext <8 x i1> %3 to <8 x i16>
+  ; CHECK-DAG: cle_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cle_s_v8i16
+}
+
+define void @cle_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: cle_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sle <4 x i32> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: cle_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cle_s_v4i32
+}
+
+define void @cle_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: cle_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sle <2 x i64> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: cle_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cle_s_v2i64
+}
+
+define void @cle_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: cle_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ule <16 x i8> %1, %2
+  %4 = sext <16 x i1> %3 to <16 x i8>
+  ; CHECK-DAG: cle_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cle_u_v16i8
+}
+
+define void @cle_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: cle_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ule <8 x i16> %1, %2
+  %4 = sext <8 x i1> %3 to <8 x i16>
+  ; CHECK-DAG: cle_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cle_u_v8i16
+}
+
+define void @cle_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: cle_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ule <4 x i32> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: cle_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cle_u_v4i32
+}
+
+define void @cle_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: cle_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ule <2 x i64> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: cle_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cle_u_v2i64
+}
+
+define void @clt_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: clt_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp slt <16 x i8> %1, %2
+  %4 = sext <16 x i1> %3 to <16 x i8>
+  ; CHECK-DAG: clt_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clt_s_v16i8
+}
+
+define void @clt_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: clt_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp slt <8 x i16> %1, %2
+  %4 = sext <8 x i1> %3 to <8 x i16>
+  ; CHECK-DAG: clt_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clt_s_v8i16
+}
+
+define void @clt_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: clt_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp slt <4 x i32> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: clt_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clt_s_v4i32
+}
+
+define void @clt_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: clt_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp slt <2 x i64> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: clt_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clt_s_v2i64
+}
+
+define void @clt_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: clt_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ult <16 x i8> %1, %2
+  %4 = sext <16 x i1> %3 to <16 x i8>
+  ; CHECK-DAG: clt_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clt_u_v16i8
+}
+
+define void @clt_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: clt_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ult <8 x i16> %1, %2
+  %4 = sext <8 x i1> %3 to <8 x i16>
+  ; CHECK-DAG: clt_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clt_u_v8i16
+}
+
+define void @clt_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: clt_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ult <4 x i32> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: clt_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clt_u_v4i32
+}
+
+define void @clt_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: clt_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ult <2 x i64> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: clt_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clt_u_v2i64
+}
+
+; There is no != comparison, but test it anyway since we've had legalizer
+; issues in this area.
+define void @cne_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: cne_v16i8:
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ne <16 x i8> %1, %2
+  %4 = sext <16 x i1> %3 to <16 x i8>
+  ; CHECK-DAG: ceq.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: xori.b [[R3]], [[R3]], 255
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cne_v16i8
+}
+
+; There is no != comparison, but test it anyway since we've had legalizer
+; issues in this area.
+define void @cne_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: cne_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ne <8 x i16> %1, %2
+  %4 = sext <8 x i1> %3 to <8 x i16>
+  ; CHECK-DAG: ceq.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; TODO: This should be an 'xori.b [[R3]], [[R3]], 255' but thats an optimisation issue
+  ; CHECK-DAG: ldi.b [[R4:\$w[0-9]+]], -1
+  ; CHECK-DAG: xor.v [[R3]], [[R3]], [[R4]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cne_v8i16
+}
+
+; There is no != comparison, but test it anyway since we've had legalizer
+; issues in this area.
+define void @cne_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: cne_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ne <4 x i32> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: ceq.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; TODO: This should be an 'xori.b [[R3]], [[R3]], 255' but thats an optimisation issue
+  ; CHECK-DAG: ldi.b [[R4:\$w[0-9]+]], -1
+  ; CHECK-DAG: xor.v [[R3]], [[R3]], [[R4]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cne_v4i32
+}
+
+; There is no != comparison, but test it anyway since we've had legalizer
+; issues in this area.
+define void @cne_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: cne_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ne <2 x i64> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: ceq.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; TODO: This should be an 'xori.b [[R3]], [[R3]], 255' but thats an optimisation issue
+  ; CHECK-DAG: ldi.b [[R4:\$w[0-9]+]], -1
+  ; CHECK-DAG: xor.v [[R3]], [[R3]], [[R4]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size cne_v2i64
+}
+
+define void @ceqi_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: ceqi_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp eq <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = sext <16 x i1> %2 to <16 x i8>
+  ; CHECK-DAG: ceqi.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ceqi_v16i8
+}
+
+define void @ceqi_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: ceqi_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp eq <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = sext <8 x i1> %2 to <8 x i16>
+  ; CHECK-DAG: ceqi.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ceqi_v8i16
+}
+
+define void @ceqi_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: ceqi_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp eq <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ; CHECK-DAG: ceqi.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ceqi_v4i32
+}
+
+define void @ceqi_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: ceqi_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp eq <2 x i64> %1, <i64 1, i64 1>
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ; CHECK-DAG: ceqi.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ceqi_v2i64
+}
+
+define void @clei_s_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: clei_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sle <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = sext <16 x i1> %2 to <16 x i8>
+  ; CHECK-DAG: clei_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clei_s_v16i8
+}
+
+define void @clei_s_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: clei_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sle <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = sext <8 x i1> %2 to <8 x i16>
+  ; CHECK-DAG: clei_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clei_s_v8i16
+}
+
+define void @clei_s_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: clei_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sle <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ; CHECK-DAG: clei_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clei_s_v4i32
+}
+
+define void @clei_s_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: clei_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sle <2 x i64> %1, <i64 1, i64 1>
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ; CHECK-DAG: clei_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clei_s_v2i64
+}
+
+define void @clei_u_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: clei_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ule <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = sext <16 x i1> %2 to <16 x i8>
+  ; CHECK-DAG: clei_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clei_u_v16i8
+}
+
+define void @clei_u_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: clei_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ule <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = sext <8 x i1> %2 to <8 x i16>
+  ; CHECK-DAG: clei_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clei_u_v8i16
+}
+
+define void @clei_u_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: clei_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ule <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ; CHECK-DAG: clei_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clei_u_v4i32
+}
+
+define void @clei_u_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: clei_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ule <2 x i64> %1, <i64 1, i64 1>
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ; CHECK-DAG: clei_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clei_u_v2i64
+}
+
+define void @clti_s_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: clti_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp slt <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = sext <16 x i1> %2 to <16 x i8>
+  ; CHECK-DAG: clti_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clti_s_v16i8
+}
+
+define void @clti_s_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: clti_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp slt <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = sext <8 x i1> %2 to <8 x i16>
+  ; CHECK-DAG: clti_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clti_s_v8i16
+}
+
+define void @clti_s_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: clti_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp slt <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ; CHECK-DAG: clti_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clti_s_v4i32
+}
+
+define void @clti_s_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: clti_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp slt <2 x i64> %1, <i64 1, i64 1>
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ; CHECK-DAG: clti_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clti_s_v2i64
+}
+
+define void @clti_u_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: clti_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ult <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = sext <16 x i1> %2 to <16 x i8>
+  ; CHECK-DAG: clti_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clti_u_v16i8
+}
+
+define void @clti_u_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: clti_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ult <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = sext <8 x i1> %2 to <8 x i16>
+  ; CHECK-DAG: clti_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clti_u_v8i16
+}
+
+define void @clti_u_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: clti_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ult <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = sext <4 x i1> %2 to <4 x i32>
+  ; CHECK-DAG: clti_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clti_u_v4i32
+}
+
+define void @clti_u_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: clti_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ult <2 x i64> %1, <i64 1, i64 1>
+  %3 = sext <2 x i1> %2 to <2 x i64>
+  ; CHECK-DAG: clti_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size clti_u_v2i64
+}
+
+define void @bsel_s_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+                        <16 x i8>* %c) nounwind {
+  ; CHECK: bsel_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <16 x i8>* %c
+  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
+  %4 = icmp sgt <16 x i8> %1, %2
+  ; CHECK-DAG: clt_s.b [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %5 = select <16 x i1> %4, <16 x i8> %1, <16 x i8> %3
+  ; bmnz.v is the same operation
+  ; CHECK-DAG: bmnz.v [[R3]], [[R1]], [[R4]]
+  store <16 x i8> %5, <16 x i8>* %d
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_s_v16i8
+}
+
+define void @bsel_s_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+                        <8 x i16>* %c) nounwind {
+  ; CHECK: bsel_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <8 x i16>* %c
+  ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0($7)
+  %4 = icmp sgt <8 x i16> %1, %2
+  ; CHECK-DAG: clt_s.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %5 = select <8 x i1> %4, <8 x i16> %1, <8 x i16> %3
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <8 x i16> %5, <8 x i16>* %d
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_s_v8i16
+}
+
+define void @bsel_s_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+                        <4 x i32>* %c) nounwind {
+  ; CHECK: bsel_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <4 x i32>* %c
+  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+  %4 = icmp sgt <4 x i32> %1, %2
+  ; CHECK-DAG: clt_s.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %5 = select <4 x i1> %4, <4 x i32> %1, <4 x i32> %3
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <4 x i32> %5, <4 x i32>* %d
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_s_v4i32
+}
+
+define void @bsel_s_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+                        <2 x i64>* %c) nounwind {
+  ; CHECK: bsel_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <2 x i64>* %c
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+  %4 = icmp sgt <2 x i64> %1, %2
+  ; CHECK-DAG: clt_s.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %5 = select <2 x i1> %4, <2 x i64> %1, <2 x i64> %3
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <2 x i64> %5, <2 x i64>* %d
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_s_v2i64
+}
+
+define void @bsel_u_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+                        <16 x i8>* %c) nounwind {
+  ; CHECK: bsel_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <16 x i8>* %c
+  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
+  %4 = icmp ugt <16 x i8> %1, %2
+  ; CHECK-DAG: clt_u.b [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %5 = select <16 x i1> %4, <16 x i8> %1, <16 x i8> %3
+  ; bmnz.v is the same operation
+  ; CHECK-DAG: bmnz.v [[R3]], [[R1]], [[R4]]
+  store <16 x i8> %5, <16 x i8>* %d
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_u_v16i8
+}
+
+define void @bsel_u_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+                        <8 x i16>* %c) nounwind {
+  ; CHECK: bsel_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <8 x i16>* %c
+  ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0($7)
+  %4 = icmp ugt <8 x i16> %1, %2
+  ; CHECK-DAG: clt_u.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %5 = select <8 x i1> %4, <8 x i16> %1, <8 x i16> %3
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <8 x i16> %5, <8 x i16>* %d
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_u_v8i16
+}
+
+define void @bsel_u_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+                        <4 x i32>* %c) nounwind {
+  ; CHECK: bsel_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <4 x i32>* %c
+  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+  %4 = icmp ugt <4 x i32> %1, %2
+  ; CHECK-DAG: clt_u.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %5 = select <4 x i1> %4, <4 x i32> %1, <4 x i32> %3
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <4 x i32> %5, <4 x i32>* %d
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_u_v4i32
+}
+
+define void @bsel_u_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+                        <2 x i64>* %c) nounwind {
+  ; CHECK: bsel_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <2 x i64>* %c
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+  %4 = icmp ugt <2 x i64> %1, %2
+  ; CHECK-DAG: clt_u.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %5 = select <2 x i1> %4, <2 x i64> %1, <2 x i64> %3
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <2 x i64> %5, <2 x i64>* %d
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_u_v2i64
+}
+
+define void @bseli_s_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+                        <16 x i8>* %c) nounwind {
+  ; CHECK: bseli_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sgt <16 x i8> %1, %2
+  ; CHECK-DAG: clt_s.b [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: bseli.b [[R4]], [[R1]], 1
+  store <16 x i8> %4, <16 x i8>* %d
+  ; CHECK-DAG: st.b [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bseli_s_v16i8
+}
+
+define void @bseli_s_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+                        <8 x i16>* %c) nounwind {
+  ; CHECK: bseli_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sgt <8 x i16> %1, %2
+  ; CHECK-DAG: clt_s.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <8 x i16> %4, <8 x i16>* %d
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bseli_s_v8i16
+}
+
+define void @bseli_s_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+                        <4 x i32>* %c) nounwind {
+  ; CHECK: bseli_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sgt <4 x i32> %1, %2
+  ; CHECK-DAG: clt_s.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <4 x i32> %4, <4 x i32>* %d
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bseli_s_v4i32
+}
+
+define void @bseli_s_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+                        <2 x i64>* %c) nounwind {
+  ; CHECK: bseli_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sgt <2 x i64> %1, %2
+  ; CHECK-DAG: clt_s.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <2 x i64> %4, <2 x i64>* %d
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bseli_s_v2i64
+}
+
+define void @bseli_u_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+                        <16 x i8>* %c) nounwind {
+  ; CHECK: bseli_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ugt <16 x i8> %1, %2
+  ; CHECK-DAG: clt_u.b [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: bseli.b [[R4]], [[R1]], 1
+  store <16 x i8> %4, <16 x i8>* %d
+  ; CHECK-DAG: st.b [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bseli_u_v16i8
+}
+
+define void @bseli_u_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+                        <8 x i16>* %c) nounwind {
+  ; CHECK: bseli_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ugt <8 x i16> %1, %2
+  ; CHECK-DAG: clt_u.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <8 x i16> %4, <8 x i16>* %d
+  ; CHECK-DAG: st.h [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bseli_u_v8i16
+}
+
+define void @bseli_u_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+                        <4 x i32>* %c) nounwind {
+  ; CHECK: bseli_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ugt <4 x i32> %1, %2
+  ; CHECK-DAG: clt_u.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <4 x i32> %4, <4 x i32>* %d
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bseli_u_v4i32
+}
+
+define void @bseli_u_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+                        <2 x i64>* %c) nounwind {
+  ; CHECK: bseli_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ugt <2 x i64> %1, %2
+  ; CHECK-DAG: clt_u.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+  ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <2 x i64> %4, <2 x i64>* %d
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bseli_u_v2i64
+}
+
+define void @max_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: max_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sgt <16 x i8> %1, %2
+  %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+  ; CHECK-DAG: max_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_s_v16i8
+}
+
+define void @max_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: max_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sgt <8 x i16> %1, %2
+  %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+  ; CHECK-DAG: max_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_s_v8i16
+}
+
+define void @max_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: max_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sgt <4 x i32> %1, %2
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+  ; CHECK-DAG: max_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_s_v4i32
+}
+
+define void @max_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: max_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sgt <2 x i64> %1, %2
+  %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+  ; CHECK-DAG: max_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_s_v2i64
+}
+
+define void @max_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: max_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ugt <16 x i8> %1, %2
+  %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+  ; CHECK-DAG: max_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_u_v16i8
+}
+
+define void @max_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: max_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ugt <8 x i16> %1, %2
+  %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+  ; CHECK-DAG: max_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_u_v8i16
+}
+
+define void @max_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: max_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ugt <4 x i32> %1, %2
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+  ; CHECK-DAG: max_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_u_v4i32
+}
+
+define void @max_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: max_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ugt <2 x i64> %1, %2
+  %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+  ; CHECK-DAG: max_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_u_v2i64
+}
+
+define void @max_s_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: max_s_eq_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sge <16 x i8> %1, %2
+  %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+  ; CHECK-DAG: max_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_s_eq_v16i8
+}
+
+define void @max_s_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: max_s_eq_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sge <8 x i16> %1, %2
+  %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+  ; CHECK-DAG: max_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_s_eq_v8i16
+}
+
+define void @max_s_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: max_s_eq_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sge <4 x i32> %1, %2
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+  ; CHECK-DAG: max_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_s_eq_v4i32
+}
+
+define void @max_s_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: max_s_eq_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sge <2 x i64> %1, %2
+  %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+  ; CHECK-DAG: max_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_s_eq_v2i64
+}
+
+define void @max_u_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: max_u_eq_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp uge <16 x i8> %1, %2
+  %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+  ; CHECK-DAG: max_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_u_eq_v16i8
+}
+
+define void @max_u_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: max_u_eq_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp uge <8 x i16> %1, %2
+  %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+  ; CHECK-DAG: max_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_u_eq_v8i16
+}
+
+define void @max_u_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: max_u_eq_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp uge <4 x i32> %1, %2
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+  ; CHECK-DAG: max_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_u_eq_v4i32
+}
+
+define void @max_u_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: max_u_eq_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp uge <2 x i64> %1, %2
+  %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+  ; CHECK-DAG: max_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_u_eq_v2i64
+}
+
+define void @maxi_s_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: maxi_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sgt <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: maxi_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_s_v16i8
+}
+
+define void @maxi_s_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: maxi_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sgt <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: maxi_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_s_v8i16
+}
+
+define void @maxi_s_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: maxi_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sgt <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: maxi_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_s_v4i32
+}
+
+define void @maxi_s_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: maxi_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sgt <2 x i64> %1, <i64 1, i64 1>
+  %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+  ; CHECK-DAG: maxi_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_s_v2i64
+}
+
+define void @maxi_u_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: maxi_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ugt <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: maxi_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_u_v16i8
+}
+
+define void @maxi_u_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: maxi_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ugt <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: maxi_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_u_v8i16
+}
+
+define void @maxi_u_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: maxi_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ugt <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: maxi_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_u_v4i32
+}
+
+define void @maxi_u_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: maxi_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ugt <2 x i64> %1, <i64 1, i64 1>
+  %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+  ; CHECK-DAG: maxi_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_u_v2i64
+}
+
+define void @maxi_s_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: maxi_s_eq_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sge <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: maxi_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_s_eq_v16i8
+}
+
+define void @maxi_s_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: maxi_s_eq_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sge <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: maxi_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_s_eq_v8i16
+}
+
+define void @maxi_s_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: maxi_s_eq_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sge <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: maxi_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_s_eq_v4i32
+}
+
+define void @maxi_s_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: maxi_s_eq_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sge <2 x i64> %1, <i64 1, i64 1>
+  %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+  ; CHECK-DAG: maxi_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_s_eq_v2i64
+}
+
+define void @maxi_u_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: maxi_u_eq_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp uge <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: maxi_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_u_eq_v16i8
+}
+
+define void @maxi_u_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: maxi_u_eq_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp uge <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: maxi_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_u_eq_v8i16
+}
+
+define void @maxi_u_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: maxi_u_eq_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp uge <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: maxi_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_u_eq_v4i32
+}
+
+define void @maxi_u_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: maxi_u_eq_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp uge <2 x i64> %1, <i64 1, i64 1>
+  %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+  ; CHECK-DAG: maxi_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size maxi_u_eq_v2i64
+}
+
+define void @min_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: min_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sle <16 x i8> %1, %2
+  %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+  ; CHECK-DAG: min_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_s_v16i8
+}
+
+define void @min_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: min_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp slt <8 x i16> %1, %2
+  %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+  ; CHECK-DAG: min_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_s_v8i16
+}
+
+define void @min_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: min_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp slt <4 x i32> %1, %2
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+  ; CHECK-DAG: min_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_s_v4i32
+}
+
+define void @min_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: min_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp slt <2 x i64> %1, %2
+  %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+  ; CHECK-DAG: min_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_s_v2i64
+}
+
+define void @min_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: min_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ult <16 x i8> %1, %2
+  %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+  ; CHECK-DAG: min_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_u_v16i8
+}
+
+define void @min_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: min_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ult <8 x i16> %1, %2
+  %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+  ; CHECK-DAG: min_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_u_v8i16
+}
+
+define void @min_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: min_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ult <4 x i32> %1, %2
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+  ; CHECK-DAG: min_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_u_v4i32
+}
+
+define void @min_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: min_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ult <2 x i64> %1, %2
+  %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+  ; CHECK-DAG: min_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_u_v2i64
+}
+
+define void @min_s_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: min_s_eq_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sle <16 x i8> %1, %2
+  %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+  ; CHECK-DAG: min_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_s_eq_v16i8
+}
+
+define void @min_s_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: min_s_eq_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sle <8 x i16> %1, %2
+  %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+  ; CHECK-DAG: min_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_s_eq_v8i16
+}
+
+define void @min_s_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: min_s_eq_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sle <4 x i32> %1, %2
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+  ; CHECK-DAG: min_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_s_eq_v4i32
+}
+
+define void @min_s_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: min_s_eq_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp sle <2 x i64> %1, %2
+  %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+  ; CHECK-DAG: min_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_s_eq_v2i64
+}
+
+define void @min_u_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: min_u_eq_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ule <16 x i8> %1, %2
+  %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+  ; CHECK-DAG: min_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %4, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_u_eq_v16i8
+}
+
+define void @min_u_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: min_u_eq_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ule <8 x i16> %1, %2
+  %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+  ; CHECK-DAG: min_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %4, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_u_eq_v8i16
+}
+
+define void @min_u_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: min_u_eq_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ule <4 x i32> %1, %2
+  %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+  ; CHECK-DAG: min_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_u_eq_v4i32
+}
+
+define void @min_u_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: min_u_eq_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = icmp ule <2 x i64> %1, %2
+  %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+  ; CHECK-DAG: min_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_u_eq_v2i64
+}
+
+define void @mini_s_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: mini_s_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp slt <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: mini_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_s_v16i8
+}
+
+define void @mini_s_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: mini_s_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp slt <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: mini_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_s_v8i16
+}
+
+define void @mini_s_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: mini_s_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp slt <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: mini_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_s_v4i32
+}
+
+define void @mini_s_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: mini_s_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp slt <2 x i64> %1, <i64 1, i64 1>
+  %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+  ; CHECK-DAG: mini_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_s_v2i64
+}
+
+define void @mini_u_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: mini_u_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ult <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: mini_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_u_v16i8
+}
+
+define void @mini_u_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: mini_u_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ult <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: mini_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_u_v8i16
+}
+
+define void @mini_u_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: mini_u_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ult <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: mini_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_u_v4i32
+}
+
+define void @mini_u_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: mini_u_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ult <2 x i64> %1, <i64 1, i64 1>
+  %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+  ; CHECK-DAG: mini_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_u_v2i64
+}
+
+define void @mini_s_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: mini_s_eq_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sle <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: mini_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_s_eq_v16i8
+}
+
+define void @mini_s_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: mini_s_eq_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sle <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: mini_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_s_eq_v8i16
+}
+
+define void @mini_s_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: mini_s_eq_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sle <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: mini_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_s_eq_v4i32
+}
+
+define void @mini_s_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: mini_s_eq_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp sle <2 x i64> %1, <i64 1, i64 1>
+  %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+  ; CHECK-DAG: mini_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_s_eq_v2i64
+}
+
+define void @mini_u_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: mini_u_eq_v16i8:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ule <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ; CHECK-DAG: mini_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_u_eq_v16i8
+}
+
+define void @mini_u_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: mini_u_eq_v8i16:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ule <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ; CHECK-DAG: mini_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_u_eq_v8i16
+}
+
+define void @mini_u_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: mini_u_eq_v4i32:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ule <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+  %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: mini_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_u_eq_v4i32
+}
+
+define void @mini_u_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: mini_u_eq_v2i64:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = icmp ule <2 x i64> %1, <i64 1, i64 1>
+  %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+  ; CHECK-DAG: mini_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size mini_u_eq_v2i64
+}
diff --git a/test/CodeGen/Mips/msa/compare_float.ll b/test/CodeGen/Mips/msa/compare_float.ll
new file mode 100644
index 0000000..2fc61f8
--- /dev/null
+++ b/test/CodeGen/Mips/msa/compare_float.ll
@@ -0,0 +1,663 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+declare <4 x float> @llvm.mips.fmax.w(<4 x float>, <4 x float>) nounwind
+declare <2 x double> @llvm.mips.fmax.d(<2 x double>, <2 x double>) nounwind
+declare <4 x float> @llvm.mips.fmin.w(<4 x float>, <4 x float>) nounwind
+declare <2 x double> @llvm.mips.fmin.d(<2 x double>, <2 x double>) nounwind
+
+define void @false_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: false_v4f32:
+
+  %1 = load <4 x float>* %a
+  %2 = load <4 x float>* %b
+  %3 = fcmp false <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  store <4 x i32> %4, <4 x i32>* %c
+  ret void
+
+  ; (setcc $a, $b, SETFALSE) is always folded, so we won't get fcaf:
+  ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], 0
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+  ; CHECK: .size false_v4f32
+}
+
+define void @false_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: false_v2f64:
+
+  %1 = load <2 x double>* %a
+  %2 = load <2 x double>* %b
+  %3 = fcmp false <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  store <2 x i64> %4, <2 x i64>* %c
+  ret void
+
+  ; FIXME: This code is correct, but poor. Ideally it would be similar to
+  ;        the code in @false_v4f32
+  ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], 0
+  ; CHECK-DAG: slli.d [[R3:\$w[0-9]+]], [[R1]], 63
+  ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R3]], 63
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+  ; CHECK: .size false_v2f64
+}
+
+define void @oeq_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: oeq_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp oeq <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fceq.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size oeq_v4f32
+}
+
+define void @oeq_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: oeq_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp oeq <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fceq.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size oeq_v2f64
+}
+
+define void @oge_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: oge_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp oge <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fcle.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size oge_v4f32
+}
+
+define void @oge_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: oge_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp oge <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fcle.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size oge_v2f64
+}
+
+define void @ogt_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: ogt_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ogt <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fclt.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ogt_v4f32
+}
+
+define void @ogt_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: ogt_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ogt <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fclt.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ogt_v2f64
+}
+
+define void @ole_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: ole_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ole <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fcle.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ole_v4f32
+}
+
+define void @ole_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: ole_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ole <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fcle.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ole_v2f64
+}
+
+define void @olt_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: olt_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp olt <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fclt.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size olt_v4f32
+}
+
+define void @olt_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: olt_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp olt <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fclt.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size olt_v2f64
+}
+
+define void @one_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: one_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp one <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fcne.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size one_v4f32
+}
+
+define void @one_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: one_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp one <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fcne.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size one_v2f64
+}
+
+define void @ord_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: ord_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ord <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fcor.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ord_v4f32
+}
+
+define void @ord_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: ord_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ord <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fcor.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ord_v2f64
+}
+
+define void @ueq_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: ueq_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ueq <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fcueq.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ueq_v4f32
+}
+
+define void @ueq_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: ueq_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ueq <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fcueq.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ueq_v2f64
+}
+
+define void @uge_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: uge_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp uge <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fcule.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size uge_v4f32
+}
+
+define void @uge_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: uge_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp uge <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fcule.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size uge_v2f64
+}
+
+define void @ugt_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: ugt_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ugt <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fcult.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ugt_v4f32
+}
+
+define void @ugt_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: ugt_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ugt <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fcult.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ugt_v2f64
+}
+
+define void @ule_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: ule_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ule <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fcule.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ule_v4f32
+}
+
+define void @ule_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: ule_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ule <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fcule.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ule_v2f64
+}
+
+define void @ult_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: ult_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ult <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fcult.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ult_v4f32
+}
+
+define void @ult_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: ult_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ult <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fcult.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ult_v2f64
+}
+
+define void @uno_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: uno_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp uno <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  ; CHECK-DAG: fcun.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %4, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size uno_v4f32
+}
+
+define void @uno_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: uno_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp uno <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  ; CHECK-DAG: fcun.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %4, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size uno_v2f64
+}
+
+define void @true_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: true_v4f32:
+
+  %1 = load <4 x float>* %a
+  %2 = load <4 x float>* %b
+  %3 = fcmp true <4 x float> %1, %2
+  %4 = sext <4 x i1> %3 to <4 x i32>
+  store <4 x i32> %4, <4 x i32>* %c
+  ret void
+
+  ; (setcc $a, $b, SETTRUE) is always folded, so we won't get fcaf:
+  ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], -1
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+  ; CHECK: .size true_v4f32
+}
+
+define void @true_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: true_v2f64:
+
+  %1 = load <2 x double>* %a
+  %2 = load <2 x double>* %b
+  %3 = fcmp true <2 x double> %1, %2
+  %4 = sext <2 x i1> %3 to <2 x i64>
+  store <2 x i64> %4, <2 x i64>* %c
+  ret void
+
+  ; FIXME: This code is correct, but poor. Ideally it would be similar to
+  ;        the code in @true_v4f32
+  ; CHECK-DAG: ldi.d [[R1:\$w[0-9]+]], 1
+  ; CHECK-DAG: slli.d [[R3:\$w[0-9]+]], [[R1]], 63
+  ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R3]], 63
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+  ; CHECK: .size true_v2f64
+}
+
+define void @bsel_v4f32(<4 x float>* %d, <4 x float>* %a, <4 x float>* %b,
+                          <4 x float>* %c) nounwind {
+  ; CHECK: bsel_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <4 x float>* %c
+  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+  %4 = fcmp ogt <4 x float> %1, %2
+  ; CHECK-DAG: fclt.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %5 = select <4 x i1> %4, <4 x float> %1, <4 x float> %3
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <4 x float> %5, <4 x float>* %d
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_v4f32
+}
+
+define void @bsel_v2f64(<2 x double>* %d, <2 x double>* %a, <2 x double>* %b,
+                          <2 x double>* %c) nounwind {
+  ; CHECK: bsel_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = load <2 x double>* %c
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+  %4 = fcmp ogt <2 x double> %1, %2
+  ; CHECK-DAG: fclt.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %5 = select <2 x i1> %4, <2 x double> %1, <2 x double> %3
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+  store <2 x double> %5, <2 x double>* %d
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bsel_v2f64
+}
+
+define void @bseli_v4f32(<4 x float>* %d, <4 x float>* %a, <4 x float>* %b,
+                          <4 x float>* %c) nounwind {
+  ; CHECK: bseli_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ogt <4 x float> %1, %2
+  ; CHECK-DAG: fclt.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %4 = select <4 x i1> %3, <4 x float> %1, <4 x float> zeroinitializer
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3:\$w[0-9]+]]
+  store <4 x float> %4, <4 x float>* %d
+  ; CHECK-DAG: st.w [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bseli_v4f32
+}
+
+define void @bseli_v2f64(<2 x double>* %d, <2 x double>* %a, <2 x double>* %b,
+                          <2 x double>* %c) nounwind {
+  ; CHECK: bseli_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = fcmp ogt <2 x double> %1, %2
+  ; CHECK-DAG: fclt.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+  %4 = select <2 x i1> %3, <2 x double> %1, <2 x double> zeroinitializer
+  ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3:\$w[0-9]+]]
+  store <2 x double> %4, <2 x double>* %d
+  ; CHECK-DAG: st.d [[R4]], 0($4)
+
+  ret void
+  ; CHECK: .size bseli_v2f64
+}
+
+define void @max_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: max_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = tail call <4 x float> @llvm.mips.fmax.w(<4 x float> %1, <4 x float> %2)
+  ; CHECK-DAG: fmax.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x float> %3, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_v4f32
+}
+
+define void @max_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: max_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = tail call <2 x double> @llvm.mips.fmax.d(<2 x double> %1, <2 x double> %2)
+  ; CHECK-DAG: fmax.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x double> %3, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size max_v2f64
+}
+
+define void @min_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+  ; CHECK: min_v4f32:
+
+  %1 = load <4 x float>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x float>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = tail call <4 x float> @llvm.mips.fmin.w(<4 x float> %1, <4 x float> %2)
+  ; CHECK-DAG: fmin.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x float> %3, <4 x float>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_v4f32
+}
+
+define void @min_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+  ; CHECK: min_v2f64:
+
+  %1 = load <2 x double>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x double>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = tail call <2 x double> @llvm.mips.fmin.d(<2 x double> %1, <2 x double> %2)
+  ; CHECK-DAG: fmin.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x double> %3, <2 x double>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size min_v2f64
+}
diff --git a/test/CodeGen/Mips/msa/elm_copy.ll b/test/CodeGen/Mips/msa/elm_copy.ll
new file mode 100644
index 0000000..ed3e52c
--- /dev/null
+++ b/test/CodeGen/Mips/msa/elm_copy.ll
@@ -0,0 +1,162 @@
+; Test the MSA intrinsics that are encoded with the ELM instruction format and
+; are element extraction operations.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_copy_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_copy_s_b_RES  = global i32 0, align 16
+
+define void @llvm_mips_copy_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_copy_s_b_ARG1
+  %1 = tail call i32 @llvm.mips.copy.s.b(<16 x i8> %0, i32 1)
+  store i32 %1, i32* @llvm_mips_copy_s_b_RES
+  ret void
+}
+
+declare i32 @llvm.mips.copy.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_copy_s_b_test:
+; CHECK: ld.b
+; CHECK: copy_s.b
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_s_b_test
+;
+@llvm_mips_copy_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_copy_s_h_RES  = global i32 0, align 16
+
+define void @llvm_mips_copy_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_copy_s_h_ARG1
+  %1 = tail call i32 @llvm.mips.copy.s.h(<8 x i16> %0, i32 1)
+  store i32 %1, i32* @llvm_mips_copy_s_h_RES
+  ret void
+}
+
+declare i32 @llvm.mips.copy.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_copy_s_h_test:
+; CHECK: ld.h
+; CHECK: copy_s.h
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_s_h_test
+;
+@llvm_mips_copy_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_copy_s_w_RES  = global i32 0, align 16
+
+define void @llvm_mips_copy_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_copy_s_w_ARG1
+  %1 = tail call i32 @llvm.mips.copy.s.w(<4 x i32> %0, i32 1)
+  store i32 %1, i32* @llvm_mips_copy_s_w_RES
+  ret void
+}
+
+declare i32 @llvm.mips.copy.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_copy_s_w_test:
+; CHECK: ld.w
+; CHECK: copy_s.w
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_s_w_test
+;
+@llvm_mips_copy_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_copy_s_d_RES  = global i64 0, align 16
+
+define void @llvm_mips_copy_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_copy_s_d_ARG1
+  %1 = tail call i64 @llvm.mips.copy.s.d(<2 x i64> %0, i32 1)
+  store i64 %1, i64* @llvm_mips_copy_s_d_RES
+  ret void
+}
+
+declare i64 @llvm.mips.copy.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_copy_s_d_test:
+; CHECK: ld.w
+; CHECK: copy_s.w
+; CHECK: copy_s.w
+; CHECK: sw
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_s_d_test
+;
+@llvm_mips_copy_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_copy_u_b_RES  = global i32 0, align 16
+
+define void @llvm_mips_copy_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_copy_u_b_ARG1
+  %1 = tail call i32 @llvm.mips.copy.u.b(<16 x i8> %0, i32 1)
+  store i32 %1, i32* @llvm_mips_copy_u_b_RES
+  ret void
+}
+
+declare i32 @llvm.mips.copy.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_copy_u_b_test:
+; CHECK: ld.b
+; CHECK: copy_u.b
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_u_b_test
+;
+@llvm_mips_copy_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_copy_u_h_RES  = global i32 0, align 16
+
+define void @llvm_mips_copy_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_copy_u_h_ARG1
+  %1 = tail call i32 @llvm.mips.copy.u.h(<8 x i16> %0, i32 1)
+  store i32 %1, i32* @llvm_mips_copy_u_h_RES
+  ret void
+}
+
+declare i32 @llvm.mips.copy.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_copy_u_h_test:
+; CHECK: ld.h
+; CHECK: copy_u.h
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_u_h_test
+;
+@llvm_mips_copy_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_copy_u_w_RES  = global i32 0, align 16
+
+define void @llvm_mips_copy_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_copy_u_w_ARG1
+  %1 = tail call i32 @llvm.mips.copy.u.w(<4 x i32> %0, i32 1)
+  store i32 %1, i32* @llvm_mips_copy_u_w_RES
+  ret void
+}
+
+declare i32 @llvm.mips.copy.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_copy_u_w_test:
+; CHECK: ld.w
+; CHECK: copy_u.w
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_u_w_test
+;
+@llvm_mips_copy_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_copy_u_d_RES  = global i64 0, align 16
+
+define void @llvm_mips_copy_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_copy_u_d_ARG1
+  %1 = tail call i64 @llvm.mips.copy.u.d(<2 x i64> %0, i32 1)
+  store i64 %1, i64* @llvm_mips_copy_u_d_RES
+  ret void
+}
+
+declare i64 @llvm.mips.copy.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_copy_u_d_test:
+; CHECK: ld.w
+; CHECK: copy_s.w
+; CHECK: copy_s.w
+; CHECK: sw
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/elm_cxcmsa.ll b/test/CodeGen/Mips/msa/elm_cxcmsa.ll
new file mode 100644
index 0000000..8d6b0ee
--- /dev/null
+++ b/test/CodeGen/Mips/msa/elm_cxcmsa.ll
@@ -0,0 +1,168 @@
+; Test the MSA ctcmsa and cfcmsa intrinsics (which are encoded with the ELM
+; instruction format).
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define i32 @msa_ir_cfcmsa_test() nounwind {
+entry:
+  %0 = tail call i32 @llvm.mips.cfcmsa(i32 0)
+  ret i32 %0
+}
+
+; CHECK: msa_ir_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $0
+; CHECK: .size msa_ir_cfcmsa_test
+;
+define i32 @msa_csr_cfcmsa_test() nounwind {
+entry:
+  %0 = tail call i32 @llvm.mips.cfcmsa(i32 1)
+  ret i32 %0
+}
+
+; CHECK: msa_csr_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $1
+; CHECK: .size msa_csr_cfcmsa_test
+;
+define i32 @msa_access_cfcmsa_test() nounwind {
+entry:
+  %0 = tail call i32 @llvm.mips.cfcmsa(i32 2)
+  ret i32 %0
+}
+
+; CHECK: msa_access_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $2
+; CHECK: .size msa_access_cfcmsa_test
+;
+define i32 @msa_save_cfcmsa_test() nounwind {
+entry:
+  %0 = tail call i32 @llvm.mips.cfcmsa(i32 3)
+  ret i32 %0
+}
+
+; CHECK: msa_save_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $3
+; CHECK: .size msa_save_cfcmsa_test
+;
+define i32 @msa_modify_cfcmsa_test() nounwind {
+entry:
+  %0 = tail call i32 @llvm.mips.cfcmsa(i32 4)
+  ret i32 %0
+}
+
+; CHECK: msa_modify_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $4
+; CHECK: .size msa_modify_cfcmsa_test
+;
+define i32 @msa_request_cfcmsa_test() nounwind {
+entry:
+  %0 = tail call i32 @llvm.mips.cfcmsa(i32 5)
+  ret i32 %0
+}
+
+; CHECK: msa_request_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $5
+; CHECK: .size msa_request_cfcmsa_test
+;
+define i32 @msa_map_cfcmsa_test() nounwind {
+entry:
+  %0 = tail call i32 @llvm.mips.cfcmsa(i32 6)
+  ret i32 %0
+}
+
+; CHECK: msa_map_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $6
+; CHECK: .size msa_map_cfcmsa_test
+;
+define i32 @msa_unmap_cfcmsa_test() nounwind {
+entry:
+  %0 = tail call i32 @llvm.mips.cfcmsa(i32 7)
+  ret i32 %0
+}
+
+; CHECK: msa_unmap_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $7
+; CHECK: .size msa_unmap_cfcmsa_test
+;
+define void @msa_ir_ctcmsa_test() nounwind {
+entry:
+  tail call void @llvm.mips.ctcmsa(i32 0, i32 1)
+  ret void
+}
+
+; CHECK: msa_ir_ctcmsa_test:
+; CHECK: ctcmsa $0
+; CHECK: .size msa_ir_ctcmsa_test
+;
+define void @msa_csr_ctcmsa_test() nounwind {
+entry:
+  tail call void @llvm.mips.ctcmsa(i32 1, i32 1)
+  ret void
+}
+
+; CHECK: msa_csr_ctcmsa_test:
+; CHECK: ctcmsa $1
+; CHECK: .size msa_csr_ctcmsa_test
+;
+define void @msa_access_ctcmsa_test() nounwind {
+entry:
+  tail call void @llvm.mips.ctcmsa(i32 2, i32 1)
+  ret void
+}
+
+; CHECK: msa_access_ctcmsa_test:
+; CHECK: ctcmsa $2
+; CHECK: .size msa_access_ctcmsa_test
+;
+define void @msa_save_ctcmsa_test() nounwind {
+entry:
+  tail call void @llvm.mips.ctcmsa(i32 3, i32 1)
+  ret void
+}
+
+; CHECK: msa_save_ctcmsa_test:
+; CHECK: ctcmsa $3
+; CHECK: .size msa_save_ctcmsa_test
+;
+define void @msa_modify_ctcmsa_test() nounwind {
+entry:
+  tail call void @llvm.mips.ctcmsa(i32 4, i32 1)
+  ret void
+}
+
+; CHECK: msa_modify_ctcmsa_test:
+; CHECK: ctcmsa $4
+; CHECK: .size msa_modify_ctcmsa_test
+;
+define void @msa_request_ctcmsa_test() nounwind {
+entry:
+  tail call void @llvm.mips.ctcmsa(i32 5, i32 1)
+  ret void
+}
+
+; CHECK: msa_request_ctcmsa_test:
+; CHECK: ctcmsa $5
+; CHECK: .size msa_request_ctcmsa_test
+;
+define void @msa_map_ctcmsa_test() nounwind {
+entry:
+  tail call void @llvm.mips.ctcmsa(i32 6, i32 1)
+  ret void
+}
+
+; CHECK: msa_map_ctcmsa_test:
+; CHECK: ctcmsa $6
+; CHECK: .size msa_map_ctcmsa_test
+;
+define void @msa_unmap_ctcmsa_test() nounwind {
+entry:
+  tail call void @llvm.mips.ctcmsa(i32 7, i32 1)
+  ret void
+}
+
+; CHECK: msa_unmap_ctcmsa_test:
+; CHECK: ctcmsa $7
+; CHECK: .size msa_unmap_ctcmsa_test
+;
+declare i32 @llvm.mips.cfcmsa(i32) nounwind
+declare void @llvm.mips.ctcmsa(i32, i32) nounwind
diff --git a/test/CodeGen/Mips/msa/elm_insv.ll b/test/CodeGen/Mips/msa/elm_insv.ll
new file mode 100644
index 0000000..fa7ceaf
--- /dev/null
+++ b/test/CodeGen/Mips/msa/elm_insv.ll
@@ -0,0 +1,192 @@
+; Test the MSA element insertion intrinsics that are encoded with the ELM
+; instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_insert_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_insert_b_ARG3 = global i32 27, align 16
+@llvm_mips_insert_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_insert_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_insert_b_ARG1
+  %1 = load i32* @llvm_mips_insert_b_ARG3
+  %2 = tail call <16 x i8> @llvm.mips.insert.b(<16 x i8> %0, i32 1, i32 %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_insert_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.insert.b(<16 x i8>, i32, i32) nounwind
+
+; CHECK: llvm_mips_insert_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
+; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0(
+; CHECK-DAG: insert.b [[R2]][1], [[R1]]
+; CHECK-DAG: st.b [[R2]], 0(
+; CHECK: .size llvm_mips_insert_b_test
+;
+@llvm_mips_insert_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_insert_h_ARG3 = global i32 27, align 16
+@llvm_mips_insert_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_insert_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_insert_h_ARG1
+  %1 = load i32* @llvm_mips_insert_h_ARG3
+  %2 = tail call <8 x i16> @llvm.mips.insert.h(<8 x i16> %0, i32 1, i32 %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_insert_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.insert.h(<8 x i16>, i32, i32) nounwind
+
+; CHECK: llvm_mips_insert_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
+; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0(
+; CHECK-DAG: insert.h [[R2]][1], [[R1]]
+; CHECK-DAG: st.h [[R2]], 0(
+; CHECK: .size llvm_mips_insert_h_test
+;
+@llvm_mips_insert_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_insert_w_ARG3 = global i32 27, align 16
+@llvm_mips_insert_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_insert_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_insert_w_ARG1
+  %1 = load i32* @llvm_mips_insert_w_ARG3
+  %2 = tail call <4 x i32> @llvm.mips.insert.w(<4 x i32> %0, i32 1, i32 %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_insert_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.insert.w(<4 x i32>, i32, i32) nounwind
+
+; CHECK: llvm_mips_insert_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
+; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0(
+; CHECK-DAG: insert.w [[R2]][1], [[R1]]
+; CHECK-DAG: st.w [[R2]], 0(
+; CHECK: .size llvm_mips_insert_w_test
+;
+@llvm_mips_insert_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_insert_d_ARG3 = global i64 27, align 16
+@llvm_mips_insert_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_insert_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_insert_d_ARG1
+  %1 = load i64* @llvm_mips_insert_d_ARG3
+  %2 = tail call <2 x i64> @llvm.mips.insert.d(<2 x i64> %0, i32 1, i64 %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_insert_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.insert.d(<2 x i64>, i32, i64) nounwind
+
+; CHECK: llvm_mips_insert_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], 4(
+; CHECK-DAG: ld.w [[R3:\$w[0-9]+]],
+; CHECK-DAG: insert.w [[R3]][2], [[R1]]
+; CHECK-DAG: insert.w [[R3]][3], [[R2]]
+; CHECK-DAG: st.w [[R3]],
+; CHECK: .size llvm_mips_insert_d_test
+;
+@llvm_mips_insve_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_insve_b_ARG3 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_insve_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_insve_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_insve_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_insve_b_ARG3
+  %2 = tail call <16 x i8> @llvm.mips.insve.b(<16 x i8> %0, i32 1, <16 x i8> %1)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_insve_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.insve.b(<16 x i8>, i32, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_insve_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_b_ARG3)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: insve.b [[R3]][1], [[R4]][0]
+; CHECK-DAG: st.b [[R3]],
+; CHECK: .size llvm_mips_insve_b_test
+;
+@llvm_mips_insve_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_insve_h_ARG3 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_insve_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_insve_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_insve_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_insve_h_ARG3
+  %2 = tail call <8 x i16> @llvm.mips.insve.h(<8 x i16> %0, i32 1, <8 x i16> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_insve_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.insve.h(<8 x i16>, i32, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_insve_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_h_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_h_ARG3)(
+; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: insve.h [[R3]][1], [[R4]][0]
+; CHECK-DAG: st.h [[R3]],
+; CHECK: .size llvm_mips_insve_h_test
+;
+@llvm_mips_insve_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_insve_w_ARG3 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_insve_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_insve_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_insve_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_insve_w_ARG3
+  %2 = tail call <4 x i32> @llvm.mips.insve.w(<4 x i32> %0, i32 1, <4 x i32> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_insve_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.insve.w(<4 x i32>, i32, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_insve_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_w_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_w_ARG3)(
+; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: insve.w [[R3]][1], [[R4]][0]
+; CHECK-DAG: st.w [[R3]],
+; CHECK: .size llvm_mips_insve_w_test
+;
+@llvm_mips_insve_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_insve_d_ARG3 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_insve_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_insve_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_insve_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_insve_d_ARG3
+  %2 = tail call <2 x i64> @llvm.mips.insve.d(<2 x i64> %0, i32 1, <2 x i64> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_insve_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.insve.d(<2 x i64>, i32, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_insve_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_d_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_d_ARG3)(
+; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: insve.d [[R3]][1], [[R4]][0]
+; CHECK-DAG: st.d [[R3]],
+; CHECK: .size llvm_mips_insve_d_test
+;
diff --git a/test/CodeGen/Mips/msa/elm_move.ll b/test/CodeGen/Mips/msa/elm_move.ll
new file mode 100644
index 0000000..98c06c7
--- /dev/null
+++ b/test/CodeGen/Mips/msa/elm_move.ll
@@ -0,0 +1,25 @@
+; Test the MSA move intrinsics (which are encoded with the ELM instruction
+; format).
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_move_vb_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_move_vb_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_move_vb_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_move_vb_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.move.v(<16 x i8> %0)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_move_vb_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.move.v(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_move_vb_test:
+; CHECK: ld.b
+; CHECK: move.v
+; CHECK: st.b
+; CHECK: .size llvm_mips_move_vb_test
+;
diff --git a/test/CodeGen/Mips/msa/elm_shift_slide.ll b/test/CodeGen/Mips/msa/elm_shift_slide.ll
new file mode 100644
index 0000000..39d670d
--- /dev/null
+++ b/test/CodeGen/Mips/msa/elm_shift_slide.ll
@@ -0,0 +1,158 @@
+; Test the MSA intrinsics that are encoded with the ELM instruction format and
+; are either shifts or slides.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_sldi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sldi_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sldi_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_sldi_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %0, i32 1)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_sldi_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_sldi_b_test:
+; CHECK: ld.b
+; CHECK: sldi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_sldi_b_test
+;
+@llvm_mips_sldi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sldi_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sldi_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_sldi_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %0, i32 1)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_sldi_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_sldi_h_test:
+; CHECK: ld.h
+; CHECK: sldi.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_sldi_h_test
+;
+@llvm_mips_sldi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sldi_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sldi_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_sldi_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %0, i32 1)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_sldi_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_sldi_w_test:
+; CHECK: ld.w
+; CHECK: sldi.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_sldi_w_test
+;
+@llvm_mips_sldi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sldi_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sldi_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_sldi_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %0, i32 1)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_sldi_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.sldi.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_sldi_d_test:
+; CHECK: ld.d
+; CHECK: sldi.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_sldi_d_test
+;
+@llvm_mips_splati_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_splati_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_splati_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_splati_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.splati.b(<16 x i8> %0, i32 1)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_splati_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.splati.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_splati_b_test:
+; CHECK: ld.b
+; CHECK: splati.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_splati_b_test
+;
+@llvm_mips_splati_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_splati_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_splati_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_splati_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.splati.h(<8 x i16> %0, i32 1)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_splati_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.splati.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_splati_h_test:
+; CHECK: ld.h
+; CHECK: splati.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_splati_h_test
+;
+@llvm_mips_splati_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_splati_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_splati_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_splati_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.splati.w(<4 x i32> %0, i32 1)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_splati_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.splati.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_splati_w_test:
+; CHECK: ld.w
+; CHECK: splati.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_splati_w_test
+;
+@llvm_mips_splati_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_splati_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_splati_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_splati_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.splati.d(<2 x i64> %0, i32 1)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_splati_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.splati.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_splati_d_test:
+; CHECK: ld.d
+; CHECK: splati.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_splati_d_test
+;
diff --git a/test/CodeGen/Mips/msa/endian.ll b/test/CodeGen/Mips/msa/endian.ll
new file mode 100644
index 0000000..44d1925
--- /dev/null
+++ b/test/CodeGen/Mips/msa/endian.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=BIGENDIAN %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=LITENDIAN %s
+
+@v16i8 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+@v8i16 = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+@v4i32 = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+@v2i64 = global <2 x i64> <i64 0, i64 0>
+
+define void @const_v16i8() nounwind {
+  ; LITENDIAN: .byte 0
+  ; LITENDIAN: .byte 1
+  ; LITENDIAN: .byte 2
+  ; LITENDIAN: .byte 3
+  ; LITENDIAN: .byte 4
+  ; LITENDIAN: .byte 5
+  ; LITENDIAN: .byte 6
+  ; LITENDIAN: .byte 7
+  ; LITENDIAN: .byte 8
+  ; LITENDIAN: .byte 9
+  ; LITENDIAN: .byte 10
+  ; LITENDIAN: .byte 11
+  ; LITENDIAN: .byte 12
+  ; LITENDIAN: .byte 13
+  ; LITENDIAN: .byte 14
+  ; LITENDIAN: .byte 15
+  ; LITENDIAN: const_v16i8:
+  ; BIGENDIAN: .byte 0
+  ; BIGENDIAN: .byte 1
+  ; BIGENDIAN: .byte 2
+  ; BIGENDIAN: .byte 3
+  ; BIGENDIAN: .byte 4
+  ; BIGENDIAN: .byte 5
+  ; BIGENDIAN: .byte 6
+  ; BIGENDIAN: .byte 7
+  ; BIGENDIAN: .byte 8
+  ; BIGENDIAN: .byte 9
+  ; BIGENDIAN: .byte 10
+  ; BIGENDIAN: .byte 11
+  ; BIGENDIAN: .byte 12
+  ; BIGENDIAN: .byte 13
+  ; BIGENDIAN: .byte 14
+  ; BIGENDIAN: .byte 15
+  ; BIGENDIAN: const_v16i8:
+
+  store volatile <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8>*@v16i8
+
+  ret void
+}
+
+define void @const_v8i16() nounwind {
+  ; LITENDIAN: .2byte 0
+  ; LITENDIAN: .2byte 1
+  ; LITENDIAN: .2byte 2
+  ; LITENDIAN: .2byte 3
+  ; LITENDIAN: .2byte 4
+  ; LITENDIAN: .2byte 5
+  ; LITENDIAN: .2byte 6
+  ; LITENDIAN: .2byte 7
+  ; LITENDIAN: const_v8i16:
+  ; BIGENDIAN: .2byte 0
+  ; BIGENDIAN: .2byte 1
+  ; BIGENDIAN: .2byte 2
+  ; BIGENDIAN: .2byte 3
+  ; BIGENDIAN: .2byte 4
+  ; BIGENDIAN: .2byte 5
+  ; BIGENDIAN: .2byte 6
+  ; BIGENDIAN: .2byte 7
+  ; BIGENDIAN: const_v8i16:
+
+  store volatile <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, <8 x i16>*@v8i16
+
+  ret void
+}
+
+define void @const_v4i32() nounwind {
+  ; LITENDIAN: .4byte 0
+  ; LITENDIAN: .4byte 1
+  ; LITENDIAN: .4byte 2
+  ; LITENDIAN: .4byte 3
+  ; LITENDIAN: const_v4i32:
+  ; BIGENDIAN: .4byte 0
+  ; BIGENDIAN: .4byte 1
+  ; BIGENDIAN: .4byte 2
+  ; BIGENDIAN: .4byte 3
+  ; BIGENDIAN: const_v4i32:
+
+  store volatile <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>*@v4i32
+
+  ret void
+}
+
+define void @const_v2i64() nounwind {
+  ; LITENDIAN: .4byte 1
+  ; LITENDIAN: .4byte 0
+  ; LITENDIAN: .4byte 2
+  ; LITENDIAN: .4byte 0
+  ; LITENDIAN: const_v2i64:
+  ; BIGENDIAN: .4byte 0
+  ; BIGENDIAN: .4byte 1
+  ; BIGENDIAN: .4byte 0
+  ; BIGENDIAN: .4byte 2
+  ; BIGENDIAN: const_v2i64:
+
+  store volatile <2 x i64> <i64 1, i64 2>, <2 x i64>*@v2i64
+
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll
new file mode 100644
index 0000000..3088e1b
--- /dev/null
+++ b/test/CodeGen/Mips/msa/frameindex.ll
@@ -0,0 +1,85 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32-AE -check-prefix=MIPS32-BE %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32-AE -check-prefix=MIPS32-LE %s
+
+define void @loadstore_v16i8_near() nounwind {
+  ; MIPS32-AE: loadstore_v16i8_near:
+
+  %1 = alloca <16 x i8>
+  %2 = load volatile <16 x i8>* %1
+  ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0($sp)
+  store volatile <16 x i8> %2, <16 x i8>* %1
+  ; MIPS32-AE: st.b [[R1]], 0($sp)
+
+  ret void
+  ; MIPS32-AE: .size loadstore_v16i8_near
+}
+
+define void @loadstore_v16i8_just_under_simm10() nounwind {
+  ; MIPS32-AE: loadstore_v16i8_just_under_simm10:
+
+  %1 = alloca <16 x i8>
+  %2 = alloca [496 x i8] ; Push the frame right up to 512 bytes
+
+  %3 = load volatile <16 x i8>* %1
+  ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 496($sp)
+  store volatile <16 x i8> %3, <16 x i8>* %1
+  ; MIPS32-AE: st.b [[R1]], 496($sp)
+
+  ret void
+  ; MIPS32-AE: .size loadstore_v16i8_just_under_simm10
+}
+
+define void @loadstore_v16i8_just_over_simm10() nounwind {
+  ; MIPS32-AE: loadstore_v16i8_just_over_simm10:
+
+  %1 = alloca <16 x i8>
+  %2 = alloca [497 x i8] ; Push the frame just over 512 bytes
+
+  %3 = load volatile <16 x i8>* %1
+  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 512
+  ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
+  store volatile <16 x i8> %3, <16 x i8>* %1
+  ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 512
+  ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
+
+  ret void
+  ; MIPS32-AE: .size loadstore_v16i8_just_over_simm10
+}
+
+define void @loadstore_v16i8_just_under_simm16() nounwind {
+  ; MIPS32-AE: loadstore_v16i8_just_under_simm16:
+
+  %1 = alloca <16 x i8>
+  %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+
+  %3 = load volatile <16 x i8>* %1
+  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
+  store volatile <16 x i8> %3, <16 x i8>* %1
+  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
+
+  ret void
+  ; MIPS32-AE: .size loadstore_v16i8_just_under_simm16
+}
+
+define void @loadstore_v16i8_just_over_simm16() nounwind {
+  ; MIPS32-AE: loadstore_v16i8_just_over_simm16:
+
+  %1 = alloca <16 x i8>
+  %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+
+  %3 = load volatile <16 x i8>* %1
+  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
+  store volatile <16 x i8> %3, <16 x i8>* %1
+  ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+  ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+  ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
+
+  ret void
+  ; MIPS32-AE: .size loadstore_v16i8_just_over_simm16
+}
diff --git a/test/CodeGen/Mips/msa/i10.ll b/test/CodeGen/Mips/msa/i10.ll
new file mode 100644
index 0000000..c5a9617
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i10.ll
@@ -0,0 +1,89 @@
+; Test the MSA intrinsics that are encoded with the I10 instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_bnz_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+
+define i32 @llvm_mips_bnz_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bnz_b_ARG1
+  %1 = tail call i32 @llvm.mips.bnz.b(<16 x i8> %0)
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %true, label %false
+true:
+  ret i32 2
+false:
+  ret i32 3
+}
+
+declare i32 @llvm.mips.bnz.b(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_bnz_b_test:
+; CHECK-DAG: ld.b [[R0:\$w[0-9]+]]
+; CHECK-DAG: bnz.b [[R0]]
+; CHECK: .size llvm_mips_bnz_b_test
+
+@llvm_mips_bnz_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+
+define i32 @llvm_mips_bnz_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_bnz_h_ARG1
+  %1 = tail call i32 @llvm.mips.bnz.h(<8 x i16> %0)
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %true, label %false
+true:
+  ret i32 2
+false:
+  ret i32 3
+}
+
+declare i32 @llvm.mips.bnz.h(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_bnz_h_test:
+; CHECK-DAG: ld.h [[R0:\$w[0-9]+]]
+; CHECK-DAG: bnz.h [[R0]]
+; CHECK: .size llvm_mips_bnz_h_test
+
+@llvm_mips_bnz_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+
+define i32 @llvm_mips_bnz_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_bnz_w_ARG1
+  %1 = tail call i32 @llvm.mips.bnz.w(<4 x i32> %0)
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %true, label %false
+true:
+  ret i32 2
+false:
+  ret i32 3
+}
+
+declare i32 @llvm.mips.bnz.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_bnz_w_test:
+; CHECK-DAG: ld.w [[R0:\$w[0-9]+]]
+; CHECK-DAG: bnz.w [[R0]]
+; CHECK: .size llvm_mips_bnz_w_test
+
+@llvm_mips_bnz_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+
+define i32 @llvm_mips_bnz_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_bnz_d_ARG1
+  %1 = tail call i32 @llvm.mips.bnz.d(<2 x i64> %0)
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %true, label %false
+true:
+  ret i32 2
+false:
+  ret i32 3
+}
+
+declare i32 @llvm.mips.bnz.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_bnz_d_test:
+; CHECK-DAG: ld.d [[R0:\$w[0-9]+]]
+; CHECK-DAG: bnz.d [[R0]]
+; CHECK: .size llvm_mips_bnz_d_test
+
diff --git a/test/CodeGen/Mips/msa/i5-a.ll b/test/CodeGen/Mips/msa/i5-a.ll
new file mode 100644
index 0000000..0b50720
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5-a.ll
@@ -0,0 +1,82 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format.
+; There are lots of these so this covers those beginning with 'a'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_addvi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_addvi_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_addvi_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_addvi_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.addvi.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_addvi_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.addvi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_addvi_b_test:
+; CHECK: ld.b
+; CHECK: addvi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_addvi_b_test
+;
+@llvm_mips_addvi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_addvi_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_addvi_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_addvi_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_addvi_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.addvi.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_addvi_h_test:
+; CHECK: ld.h
+; CHECK: addvi.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_addvi_h_test
+;
+@llvm_mips_addvi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_addvi_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_addvi_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_addvi_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_addvi_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.addvi.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_addvi_w_test:
+; CHECK: ld.w
+; CHECK: addvi.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_addvi_w_test
+;
+@llvm_mips_addvi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_addvi_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_addvi_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_addvi_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_addvi_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.addvi.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_addvi_d_test:
+; CHECK: ld.d
+; CHECK: addvi.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_addvi_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i5-b.ll b/test/CodeGen/Mips/msa/i5-b.ll
new file mode 100644
index 0000000..da6be66
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5-b.ll
@@ -0,0 +1,439 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format.
+; There are lots of these so this covers those beginning with 'b'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_bclri_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bclri_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bclri_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bclri_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.bclri.b(<16 x i8> %0, i32 7)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_bclri_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.bclri.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bclri_b_test:
+; CHECK: ld.b
+; andi.b is equivalent to bclri.b
+; CHECK: andi.b {{\$w[0-9]}}, {{\$w[0-9]}}, 127
+; CHECK: st.b
+; CHECK: .size llvm_mips_bclri_b_test
+;
+@llvm_mips_bclri_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bclri_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bclri_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_bclri_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %0, i32 7)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_bclri_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.bclri.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_bclri_h_test:
+; CHECK: ld.h
+; CHECK: bclri.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bclri_h_test
+;
+@llvm_mips_bclri_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bclri_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bclri_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_bclri_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %0, i32 7)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_bclri_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.bclri.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_bclri_w_test:
+; CHECK: ld.w
+; CHECK: bclri.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bclri_w_test
+;
+@llvm_mips_bclri_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bclri_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bclri_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_bclri_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %0, i32 7)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_bclri_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.bclri.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_bclri_d_test:
+; CHECK: ld.d
+; CHECK: bclri.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bclri_d_test
+;
+@llvm_mips_binsli_b_ARG1 = global <16 x i8> zeroinitializer, align 16
+@llvm_mips_binsli_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsli_b_RES  = global <16 x i8> zeroinitializer, align 16
+
+define void @llvm_mips_binsli_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_binsli_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_binsli_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %0, <16 x i8> %1, i32 7)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_binsli_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.binsli.b(<16 x i8>, <16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_binsli_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsli_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsli_b_ARG2)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsli.b [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsli_b_RES)(
+; CHECK-DAG: st.b [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsli_b_test
+
+@llvm_mips_binsli_h_ARG1 = global <8 x i16> zeroinitializer, align 16
+@llvm_mips_binsli_h_ARG2 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsli_h_RES  = global <8 x i16> zeroinitializer, align 16
+
+define void @llvm_mips_binsli_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_binsli_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_binsli_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %0, <8 x i16> %1, i32 7)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_binsli_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.binsli.h(<8 x i16>, <8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_binsli_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsli_h_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsli_h_ARG2)(
+; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsli.h [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsli_h_RES)(
+; CHECK-DAG: st.h [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsli_h_test
+
+@llvm_mips_binsli_w_ARG1 = global <4 x i32> zeroinitializer, align 16
+@llvm_mips_binsli_w_ARG2 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsli_w_RES  = global <4 x i32> zeroinitializer, align 16
+
+define void @llvm_mips_binsli_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_binsli_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_binsli_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %0, <4 x i32> %1, i32 7)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_binsli_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.binsli.w(<4 x i32>, <4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_binsli_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsli_w_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsli_w_ARG2)(
+; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsli.w [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsli_w_RES)(
+; CHECK-DAG: st.w [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsli_w_test
+
+@llvm_mips_binsli_d_ARG1 = global <2 x i64> zeroinitializer, align 16
+@llvm_mips_binsli_d_ARG2 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsli_d_RES  = global <2 x i64> zeroinitializer, align 16
+
+define void @llvm_mips_binsli_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_binsli_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_binsli_d_ARG2
+  ; TODO: We use a particularly wide mask here to work around a legalization
+  ;       issue. If the mask doesn't fit within a 10-bit immediate, it gets
+  ;       legalized into a constant pool. We should add a test to cover the
+  ;       other cases once they correctly select binsli.d.
+  %2 = tail call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %0, <2 x i64> %1, i32 61)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_binsli_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.binsli.d(<2 x i64>, <2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_binsli_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsli_d_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsli_d_ARG2)(
+; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsli.d [[R3]], [[R4]], 61
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsli_d_RES)(
+; CHECK-DAG: st.d [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsli_d_test
+
+@llvm_mips_binsri_b_ARG1 = global <16 x i8> zeroinitializer, align 16
+@llvm_mips_binsri_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsri_b_RES  = global <16 x i8> zeroinitializer, align 16
+
+define void @llvm_mips_binsri_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_binsri_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_binsri_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %0, <16 x i8> %1, i32 7)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_binsri_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.binsri.b(<16 x i8>, <16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_binsri_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsri_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsri_b_ARG2)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsri.b [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsri_b_RES)(
+; CHECK-DAG: st.b [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsri_b_test
+
+@llvm_mips_binsri_h_ARG1 = global <8 x i16> zeroinitializer, align 16
+@llvm_mips_binsri_h_ARG2 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsri_h_RES  = global <8 x i16> zeroinitializer, align 16
+
+define void @llvm_mips_binsri_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_binsri_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_binsri_h_ARG2
+  %2 = tail call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %0, <8 x i16> %1, i32 7)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_binsri_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.binsri.h(<8 x i16>, <8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_binsri_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsri_h_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsri_h_ARG2)(
+; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsri.h [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsri_h_RES)(
+; CHECK-DAG: st.h [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsri_h_test
+
+@llvm_mips_binsri_w_ARG1 = global <4 x i32> zeroinitializer, align 16
+@llvm_mips_binsri_w_ARG2 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsri_w_RES  = global <4 x i32> zeroinitializer, align 16
+
+define void @llvm_mips_binsri_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_binsri_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_binsri_w_ARG2
+  %2 = tail call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %0, <4 x i32> %1, i32 7)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_binsri_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.binsri.w(<4 x i32>, <4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_binsri_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsri_w_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsri_w_ARG2)(
+; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsri.w [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsri_w_RES)(
+; CHECK-DAG: st.w [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsri_w_test
+
+@llvm_mips_binsri_d_ARG1 = global <2 x i64> zeroinitializer, align 16
+@llvm_mips_binsri_d_ARG2 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsri_d_RES  = global <2 x i64> zeroinitializer, align 16
+
+define void @llvm_mips_binsri_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_binsri_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_binsri_d_ARG2
+  %2 = tail call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %0, <2 x i64> %1, i32 7)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_binsri_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.binsri.d(<2 x i64>, <2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_binsri_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsri_d_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsri_d_ARG2)(
+; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsri.d [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsri_d_RES)(
+; CHECK-DAG: st.d [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsri_d_test
+
+@llvm_mips_bnegi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bnegi_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bnegi_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bnegi_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %0, i32 7)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_bnegi_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.bnegi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bnegi_b_test:
+; CHECK: ld.b
+; CHECK: bnegi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_bnegi_b_test
+;
+@llvm_mips_bnegi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bnegi_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bnegi_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_bnegi_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %0, i32 7)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_bnegi_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.bnegi.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_bnegi_h_test:
+; CHECK: ld.h
+; CHECK: bnegi.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bnegi_h_test
+;
+@llvm_mips_bnegi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bnegi_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bnegi_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_bnegi_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %0, i32 7)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_bnegi_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.bnegi.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_bnegi_w_test:
+; CHECK: ld.w
+; CHECK: bnegi.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bnegi_w_test
+;
+@llvm_mips_bnegi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bnegi_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bnegi_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_bnegi_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %0, i32 7)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_bnegi_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.bnegi.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_bnegi_d_test:
+; CHECK: ld.d
+; CHECK: bnegi.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bnegi_d_test
+;
+@llvm_mips_bseti_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bseti_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bseti_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bseti_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %0, i32 7)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_bseti_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.bseti.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bseti_b_test:
+; CHECK: ld.b
+; CHECK: bseti.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_bseti_b_test
+;
+@llvm_mips_bseti_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bseti_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bseti_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_bseti_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %0, i32 7)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_bseti_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.bseti.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_bseti_h_test:
+; CHECK: ld.h
+; CHECK: bseti.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bseti_h_test
+;
+@llvm_mips_bseti_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bseti_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bseti_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_bseti_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %0, i32 7)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_bseti_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.bseti.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_bseti_w_test:
+; CHECK: ld.w
+; CHECK: bseti.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bseti_w_test
+;
+@llvm_mips_bseti_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bseti_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bseti_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_bseti_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %0, i32 7)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_bseti_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.bseti.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_bseti_d_test:
+; CHECK: ld.d
+; CHECK: bseti.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bseti_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i5-c.ll b/test/CodeGen/Mips/msa/i5-c.ll
new file mode 100644
index 0000000..bf1578f
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5-c.ll
@@ -0,0 +1,386 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format.
+; There are lots of these so this covers those beginning with 'c'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_ceqi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ceqi_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ceqi_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_ceqi_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.ceqi.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_ceqi_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.ceqi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_ceqi_b_test:
+; CHECK: ld.b
+; CHECK: ceqi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ceqi_b_test
+;
+@llvm_mips_ceqi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ceqi_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ceqi_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_ceqi_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.ceqi.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_ceqi_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ceqi.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_ceqi_h_test:
+; CHECK: ld.h
+; CHECK: ceqi.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ceqi_h_test
+;
+@llvm_mips_ceqi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ceqi_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ceqi_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_ceqi_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.ceqi.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_ceqi_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ceqi.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_ceqi_w_test:
+; CHECK: ld.w
+; CHECK: ceqi.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ceqi_w_test
+;
+@llvm_mips_ceqi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ceqi_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ceqi_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_ceqi_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.ceqi.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_ceqi_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ceqi.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_ceqi_d_test:
+; CHECK: ld.d
+; CHECK: ceqi.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ceqi_d_test
+;
+@llvm_mips_clei_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clei_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clei_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_clei_s_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_clei_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.clei.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_clei_s_b_test:
+; CHECK: ld.b
+; CHECK: clei_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clei_s_b_test
+;
+@llvm_mips_clei_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clei_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clei_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_clei_s_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_clei_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.clei.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_clei_s_h_test:
+; CHECK: ld.h
+; CHECK: clei_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clei_s_h_test
+;
+@llvm_mips_clei_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clei_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clei_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_clei_s_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_clei_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.clei.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_clei_s_w_test:
+; CHECK: ld.w
+; CHECK: clei_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clei_s_w_test
+;
+@llvm_mips_clei_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clei_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clei_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_clei_s_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_clei_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.clei.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_clei_s_d_test:
+; CHECK: ld.d
+; CHECK: clei_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clei_s_d_test
+;
+@llvm_mips_clei_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clei_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clei_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_clei_u_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_clei_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.clei.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_clei_u_b_test:
+; CHECK: ld.b
+; CHECK: clei_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clei_u_b_test
+;
+@llvm_mips_clei_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clei_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clei_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_clei_u_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_clei_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.clei.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_clei_u_h_test:
+; CHECK: ld.h
+; CHECK: clei_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clei_u_h_test
+;
+@llvm_mips_clei_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clei_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clei_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_clei_u_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_clei_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.clei.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_clei_u_w_test:
+; CHECK: ld.w
+; CHECK: clei_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clei_u_w_test
+;
+@llvm_mips_clei_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clei_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clei_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_clei_u_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_clei_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.clei.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_clei_u_d_test:
+; CHECK: ld.d
+; CHECK: clei_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clei_u_d_test
+;
+@llvm_mips_clti_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clti_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clti_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_clti_s_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_clti_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.clti.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_clti_s_b_test:
+; CHECK: ld.b
+; CHECK: clti_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clti_s_b_test
+;
+@llvm_mips_clti_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clti_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clti_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_clti_s_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_clti_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.clti.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_clti_s_h_test:
+; CHECK: ld.h
+; CHECK: clti_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clti_s_h_test
+;
+@llvm_mips_clti_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clti_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clti_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_clti_s_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_clti_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.clti.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_clti_s_w_test:
+; CHECK: ld.w
+; CHECK: clti_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clti_s_w_test
+;
+@llvm_mips_clti_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clti_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clti_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_clti_s_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_clti_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.clti.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_clti_s_d_test:
+; CHECK: ld.d
+; CHECK: clti_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clti_s_d_test
+;
+@llvm_mips_clti_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clti_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clti_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_clti_u_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_clti_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.clti.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_clti_u_b_test:
+; CHECK: ld.b
+; CHECK: clti_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clti_u_b_test
+;
+@llvm_mips_clti_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clti_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clti_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_clti_u_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_clti_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.clti.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_clti_u_h_test:
+; CHECK: ld.h
+; CHECK: clti_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clti_u_h_test
+;
+@llvm_mips_clti_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clti_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clti_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_clti_u_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_clti_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.clti.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_clti_u_w_test:
+; CHECK: ld.w
+; CHECK: clti_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clti_u_w_test
+;
+@llvm_mips_clti_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clti_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clti_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_clti_u_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_clti_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.clti.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_clti_u_d_test:
+; CHECK: ld.d
+; CHECK: clti_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clti_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i5-m.ll b/test/CodeGen/Mips/msa/i5-m.ll
new file mode 100644
index 0000000..2766349
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5-m.ll
@@ -0,0 +1,310 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format.
+; There are lots of these so this covers those beginning with 'm'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_maxi_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_maxi_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_maxi_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_maxi_s_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_maxi_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.maxi.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_s_b_test:
+; CHECK: ld.b
+; CHECK: maxi_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_maxi_s_b_test
+;
+@llvm_mips_maxi_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_maxi_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_maxi_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_maxi_s_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_maxi_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.maxi.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_s_h_test:
+; CHECK: ld.h
+; CHECK: maxi_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_maxi_s_h_test
+;
+@llvm_mips_maxi_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_maxi_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_maxi_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_maxi_s_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_maxi_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.maxi.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_s_w_test:
+; CHECK: ld.w
+; CHECK: maxi_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_maxi_s_w_test
+;
+@llvm_mips_maxi_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_maxi_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_maxi_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_maxi_s_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_maxi_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.maxi.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_s_d_test:
+; CHECK: ld.d
+; CHECK: maxi_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_maxi_s_d_test
+;
+@llvm_mips_maxi_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_maxi_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_maxi_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_maxi_u_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_maxi_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.maxi.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_u_b_test:
+; CHECK: ld.b
+; CHECK: maxi_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_maxi_u_b_test
+;
+@llvm_mips_maxi_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_maxi_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_maxi_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_maxi_u_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_maxi_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.maxi.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_u_h_test:
+; CHECK: ld.h
+; CHECK: maxi_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_maxi_u_h_test
+;
+@llvm_mips_maxi_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_maxi_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_maxi_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_maxi_u_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_maxi_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.maxi.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_u_w_test:
+; CHECK: ld.w
+; CHECK: maxi_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_maxi_u_w_test
+;
+@llvm_mips_maxi_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_maxi_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_maxi_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_maxi_u_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_maxi_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.maxi.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_u_d_test:
+; CHECK: ld.d
+; CHECK: maxi_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_maxi_u_d_test
+;
+@llvm_mips_mini_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_mini_s_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_mini_s_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_mini_s_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_mini_s_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.mini.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_mini_s_b_test:
+; CHECK: ld.b
+; CHECK: mini_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_mini_s_b_test
+;
+@llvm_mips_mini_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mini_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mini_s_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_mini_s_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_mini_s_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.mini.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_mini_s_h_test:
+; CHECK: ld.h
+; CHECK: mini_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mini_s_h_test
+;
+@llvm_mips_mini_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mini_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mini_s_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_mini_s_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_mini_s_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.mini.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_mini_s_w_test:
+; CHECK: ld.w
+; CHECK: mini_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mini_s_w_test
+;
+@llvm_mips_mini_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_mini_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_mini_s_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_mini_s_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_mini_s_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.mini.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_mini_s_d_test:
+; CHECK: ld.d
+; CHECK: mini_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_mini_s_d_test
+;
+@llvm_mips_mini_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_mini_u_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_mini_u_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_mini_u_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_mini_u_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.mini.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_mini_u_b_test:
+; CHECK: ld.b
+; CHECK: mini_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_mini_u_b_test
+;
+@llvm_mips_mini_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mini_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mini_u_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_mini_u_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_mini_u_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.mini.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_mini_u_h_test:
+; CHECK: ld.h
+; CHECK: mini_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mini_u_h_test
+;
+@llvm_mips_mini_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mini_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mini_u_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_mini_u_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_mini_u_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.mini.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_mini_u_w_test:
+; CHECK: ld.w
+; CHECK: mini_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mini_u_w_test
+;
+@llvm_mips_mini_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_mini_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_mini_u_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_mini_u_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_mini_u_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.mini.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_mini_u_d_test:
+; CHECK: ld.d
+; CHECK: mini_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_mini_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i5-s.ll b/test/CodeGen/Mips/msa/i5-s.ll
new file mode 100644
index 0000000..184172f
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5-s.ll
@@ -0,0 +1,82 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format.
+; There are lots of these so this covers those beginning with 's'
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_subvi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subvi_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subvi_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_subvi_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.subvi.b(<16 x i8> %0, i32 14)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_subvi_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.subvi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_subvi_b_test:
+; CHECK: ld.b
+; CHECK: subvi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_subvi_b_test
+;
+@llvm_mips_subvi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subvi_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subvi_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_subvi_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.subvi.h(<8 x i16> %0, i32 14)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_subvi_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.subvi.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_subvi_h_test:
+; CHECK: ld.h
+; CHECK: subvi.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_subvi_h_test
+;
+@llvm_mips_subvi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subvi_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subvi_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_subvi_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.subvi.w(<4 x i32> %0, i32 14)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_subvi_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.subvi.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_subvi_w_test:
+; CHECK: ld.w
+; CHECK: subvi.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_subvi_w_test
+;
+@llvm_mips_subvi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subvi_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subvi_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_subvi_d_ARG1
+  %1 = tail call <2 x i64> @llvm.mips.subvi.d(<2 x i64> %0, i32 14)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_subvi_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.subvi.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_subvi_d_test:
+; CHECK: ld.d
+; CHECK: subvi.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_subvi_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i5_ld_st.ll b/test/CodeGen/Mips/msa/i5_ld_st.ll
new file mode 100644
index 0000000..7cc55f2
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5_ld_st.ll
@@ -0,0 +1,150 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format and
+; are loads or stores.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_ld_b_ARG = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ld_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ld_b_test() nounwind {
+entry:
+  %0 = bitcast <16 x i8>* @llvm_mips_ld_b_ARG to i8*
+  %1 = tail call <16 x i8> @llvm.mips.ld.b(i8* %0, i32 16)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_ld_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.ld.b(i8*, i32) nounwind
+
+; CHECK: llvm_mips_ld_b_test:
+; CHECK: ld.b [[R1:\$w[0-9]+]], 16(
+; CHECK: st.b
+; CHECK: .size llvm_mips_ld_b_test
+;
+@llvm_mips_ld_h_ARG = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ld_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ld_h_test() nounwind {
+entry:
+  %0 = bitcast <8 x i16>* @llvm_mips_ld_h_ARG to i8*
+  %1 = tail call <8 x i16> @llvm.mips.ld.h(i8* %0, i32 16)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_ld_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.ld.h(i8*, i32) nounwind
+
+; CHECK: llvm_mips_ld_h_test:
+; CHECK: ld.h [[R1:\$w[0-9]+]], 16(
+; CHECK: st.h
+; CHECK: .size llvm_mips_ld_h_test
+;
+@llvm_mips_ld_w_ARG = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ld_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ld_w_test() nounwind {
+entry:
+  %0 = bitcast <4 x i32>* @llvm_mips_ld_w_ARG to i8*
+  %1 = tail call <4 x i32> @llvm.mips.ld.w(i8* %0, i32 16)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_ld_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.ld.w(i8*, i32) nounwind
+
+; CHECK: llvm_mips_ld_w_test:
+; CHECK: ld.w [[R1:\$w[0-9]+]], 16(
+; CHECK: st.w
+; CHECK: .size llvm_mips_ld_w_test
+;
+@llvm_mips_ld_d_ARG = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ld_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ld_d_test() nounwind {
+entry:
+  %0 = bitcast <2 x i64>* @llvm_mips_ld_d_ARG to i8*
+  %1 = tail call <2 x i64> @llvm.mips.ld.d(i8* %0, i32 16)
+  store <2 x i64> %1, <2 x i64>* @llvm_mips_ld_d_RES
+  ret void
+}
+
+declare <2 x i64> @llvm.mips.ld.d(i8*, i32) nounwind
+
+; CHECK: llvm_mips_ld_d_test:
+; CHECK: ld.d [[R1:\$w[0-9]+]], 16(
+; CHECK: st.d
+; CHECK: .size llvm_mips_ld_d_test
+;
+@llvm_mips_st_b_ARG = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_st_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_st_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_st_b_ARG
+  %1 = bitcast <16 x i8>* @llvm_mips_st_b_RES to i8*
+  tail call void @llvm.mips.st.b(<16 x i8> %0, i8* %1, i32 16)
+  ret void
+}
+
+declare void @llvm.mips.st.b(<16 x i8>, i8*, i32) nounwind
+
+; CHECK: llvm_mips_st_b_test:
+; CHECK: ld.b
+; CHECK: st.b [[R1:\$w[0-9]+]], 16(
+; CHECK: .size llvm_mips_st_b_test
+;
+@llvm_mips_st_h_ARG = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_st_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_st_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_st_h_ARG
+  %1 = bitcast <8 x i16>* @llvm_mips_st_h_RES to i8*
+  tail call void @llvm.mips.st.h(<8 x i16> %0, i8* %1, i32 16)
+  ret void
+}
+
+declare void @llvm.mips.st.h(<8 x i16>, i8*, i32) nounwind
+
+; CHECK: llvm_mips_st_h_test:
+; CHECK: ld.h
+; CHECK: st.h [[R1:\$w[0-9]+]], 16(
+; CHECK: .size llvm_mips_st_h_test
+;
+@llvm_mips_st_w_ARG = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_st_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_st_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_st_w_ARG
+  %1 = bitcast <4 x i32>* @llvm_mips_st_w_RES to i8*
+  tail call void @llvm.mips.st.w(<4 x i32> %0, i8* %1, i32 16)
+  ret void
+}
+
+declare void @llvm.mips.st.w(<4 x i32>, i8*, i32) nounwind
+
+; CHECK: llvm_mips_st_w_test:
+; CHECK: ld.w
+; CHECK: st.w [[R1:\$w[0-9]+]], 16(
+; CHECK: .size llvm_mips_st_w_test
+;
+@llvm_mips_st_d_ARG = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_st_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_st_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_st_d_ARG
+  %1 = bitcast <2 x i64>* @llvm_mips_st_d_RES to i8*
+  tail call void @llvm.mips.st.d(<2 x i64> %0, i8* %1, i32 16)
+  ret void
+}
+
+declare void @llvm.mips.st.d(<2 x i64>, i8*, i32) nounwind
+
+; CHECK: llvm_mips_st_d_test:
+; CHECK: ld.d
+; CHECK: st.d [[R1:\$w[0-9]+]], 16(
+; CHECK: .size llvm_mips_st_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i8.ll b/test/CodeGen/Mips/msa/i8.ll
new file mode 100644
index 0000000..d2931a7
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i8.ll
@@ -0,0 +1,211 @@
+; Test the MSA intrinsics that are encoded with the I8 instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_andi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_andi_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_andi_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_andi_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.andi.b(<16 x i8> %0, i32 25)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_andi_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.andi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_andi_b_test:
+; CHECK: ld.b
+; CHECK: andi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_andi_b_test
+
+@llvm_mips_bmnzi_b_ARG1 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+@llvm_mips_bmnzi_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmnzi_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bmnzi_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bmnzi_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_bmnzi_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 25)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.bmnzi.b(<16 x i8>, <16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bmnzi_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnzi_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnzi_b_ARG2)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: bmnzi.b [[R3]], [[R4]], 25
+; CHECK-DAG: st.b [[R3]], 0(
+; CHECK: .size llvm_mips_bmnzi_b_test
+
+@llvm_mips_bmzi_b_ARG1 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+@llvm_mips_bmzi_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmzi_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bmzi_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bmzi_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_bmzi_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 25)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_bmzi_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.bmzi.b(<16 x i8>, <16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bmzi_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmzi_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmzi_b_ARG2)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; bmnzi.b is the same as bmzi.b with ws and wd_in swapped
+; CHECK-DAG: bmnzi.b [[R4]], [[R3]], 25
+; CHECK-DAG: st.b [[R4]], 0(
+; CHECK: .size llvm_mips_bmzi_b_test
+
+@llvm_mips_bseli_b_ARG1 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+@llvm_mips_bseli_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bseli_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bseli_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bseli_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_bseli_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %0, <16 x i8> %1, i32 25)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_bseli_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.bseli.b(<16 x i8>, <16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bseli_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bseli_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bseli_b_ARG2)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: bseli.b [[R3]], [[R4]], 25
+; CHECK-DAG: st.b [[R3]], 0(
+; CHECK: .size llvm_mips_bseli_b_test
+
+@llvm_mips_nori_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_nori_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_nori_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_nori_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.nori.b(<16 x i8> %0, i32 25)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_nori_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.nori.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_nori_b_test:
+; CHECK: ld.b
+; CHECK: nori.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_nori_b_test
+;
+@llvm_mips_ori_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ori_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ori_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_ori_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.ori.b(<16 x i8> %0, i32 25)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_ori_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.ori.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_ori_b_test:
+; CHECK: ld.b
+; CHECK: ori.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ori_b_test
+;
+@llvm_mips_shf_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_shf_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_shf_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_shf_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.shf.b(<16 x i8> %0, i32 25)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_shf_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.shf.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_shf_b_test:
+; CHECK: ld.b
+; CHECK: shf.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_shf_b_test
+;
+@llvm_mips_shf_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_shf_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_shf_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_shf_h_ARG1
+  %1 = tail call <8 x i16> @llvm.mips.shf.h(<8 x i16> %0, i32 25)
+  store <8 x i16> %1, <8 x i16>* @llvm_mips_shf_h_RES
+  ret void
+}
+
+declare <8 x i16> @llvm.mips.shf.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_shf_h_test:
+; CHECK: ld.h
+; CHECK: shf.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_shf_h_test
+;
+@llvm_mips_shf_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_shf_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_shf_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_shf_w_ARG1
+  %1 = tail call <4 x i32> @llvm.mips.shf.w(<4 x i32> %0, i32 25)
+  store <4 x i32> %1, <4 x i32>* @llvm_mips_shf_w_RES
+  ret void
+}
+
+declare <4 x i32> @llvm.mips.shf.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_shf_w_test:
+; CHECK: ld.w
+; CHECK: shf.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_shf_w_test
+;
+@llvm_mips_xori_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_xori_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_xori_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_xori_b_ARG1
+  %1 = tail call <16 x i8> @llvm.mips.xori.b(<16 x i8> %0, i32 25)
+  store <16 x i8> %1, <16 x i8>* @llvm_mips_xori_b_RES
+  ret void
+}
+
+declare <16 x i8> @llvm.mips.xori.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_xori_b_test:
+; CHECK: ld.b
+; CHECK: xori.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_xori_b_test
+;
diff --git a/test/CodeGen/Mips/msa/inline-asm.ll b/test/CodeGen/Mips/msa/inline-asm.ll
new file mode 100644
index 0000000..4a34273
--- /dev/null
+++ b/test/CodeGen/Mips/msa/inline-asm.ll
@@ -0,0 +1,34 @@
+; A basic inline assembly test
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@v4i32_r  = global <4 x i32> zeroinitializer, align 16
+
+define void @test1() nounwind {
+entry:
+  ; CHECK-LABEL: test1:
+  %0 = call <4 x i32> asm "ldi.w ${0:w}, 1", "=f"()
+  ; CHECK: ldi.w $w{{[1-3]?[0-9]}}, 1
+  store <4 x i32> %0, <4 x i32>* @v4i32_r
+  ret void
+}
+
+define void @test2() nounwind {
+entry:
+  ; CHECK-LABEL: test2:
+  %0 = load <4 x i32>* @v4i32_r
+  %1 = call <4 x i32> asm "addvi.w ${0:w}, ${1:w}, 1", "=f,f"(<4 x i32> %0)
+  ; CHECK: addvi.w $w{{[1-3]?[0-9]}}, $w{{[1-3]?[0-9]}}, 1
+  store <4 x i32> %1, <4 x i32>* @v4i32_r
+  ret void
+}
+
+define void @test3() nounwind {
+entry:
+  ; CHECK-LABEL: test3:
+  %0 = load <4 x i32>* @v4i32_r
+  %1 = call <4 x i32> asm sideeffect "addvi.w ${0:w}, ${1:w}, 1", "=f,f,~{$w0}"(<4 x i32> %0)
+  ; CHECK: addvi.w $w{{([1-9]|[1-3][0-9])}}, $w{{([1-9]|[1-3][0-9])}}, 1
+  store <4 x i32> %1, <4 x i32>* @v4i32_r
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll b/test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll
new file mode 100644
index 0000000..4beaaa9
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll
@@ -0,0 +1,134 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; "Unexpected illegal type!" assertion.
+; It should at least successfully build.
+
+define void @autogen_SD1704963983(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca <4 x double>
+  %A3 = alloca <8 x i64>
+  %A2 = alloca <1 x double>
+  %A1 = alloca double
+  %A = alloca i32
+  %L = load i8* %0
+  store i8 77, i8* %0
+  %E = extractelement <8 x i64> zeroinitializer, i32 2
+  %Shuff = shufflevector <8 x i64> zeroinitializer, <8 x i64> zeroinitializer, <8 x i32> <i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15, i32 1, i32 3>
+  %I = insertelement <8 x i64> zeroinitializer, i64 %E, i32 7
+  %Sl = select i1 false, i8* %0, i8* %0
+  %Cmp = icmp eq i32 434069, 272505
+  br label %CF
+
+CF:                                               ; preds = %CF, %CF78, %BB
+  %L5 = load i8* %Sl
+  store i8 %L, i8* %Sl
+  %E6 = extractelement <8 x i32> zeroinitializer, i32 2
+  %Shuff7 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff, <8 x i32> <i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 undef>
+  %I8 = insertelement <8 x i64> zeroinitializer, i64 %4, i32 7
+  %B = shl <1 x i16> zeroinitializer, zeroinitializer
+  %FC = sitofp <8 x i64> zeroinitializer to <8 x float>
+  %Sl9 = select i1 %Cmp, i8 77, i8 77
+  %Cmp10 = icmp uge <8 x i64> %Shuff, zeroinitializer
+  %L11 = load i8* %0
+  store i8 %Sl9, i8* %0
+  %E12 = extractelement <1 x i16> zeroinitializer, i32 0
+  %Shuff13 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff, <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 undef, i32 3, i32 5, i32 7>
+  %I14 = insertelement <4 x i32> zeroinitializer, i32 %3, i32 3
+  %B15 = udiv <1 x i16> %B, zeroinitializer
+  %Tr = trunc <8 x i64> %Shuff to <8 x i32>
+  %Sl16 = select i1 %Cmp, i8 77, i8 %5
+  %Cmp17 = icmp ult <8 x i1> %Cmp10, %Cmp10
+  %L18 = load i8* %Sl
+  store i8 -1, i8* %Sl
+  %E19 = extractelement <8 x i32> zeroinitializer, i32 3
+  %Shuff20 = shufflevector <8 x float> %FC, <8 x float> %FC, <8 x i32> <i32 6, i32 8, i32 undef, i32 12, i32 14, i32 0, i32 2, i32 undef>
+  %I21 = insertelement <8 x i64> %Shuff13, i64 %E, i32 0
+  %B22 = urem <8 x i64> %Shuff7, %I21
+  %FC23 = sitofp i32 50347 to float
+  %Sl24 = select i1 %Cmp, double 0.000000e+00, double 0.000000e+00
+  %Cmp25 = icmp ugt i32 465489, 47533
+  br i1 %Cmp25, label %CF, label %CF78
+
+CF78:                                             ; preds = %CF
+  %L26 = load i8* %Sl
+  store i32 50347, i32* %A
+  %E27 = extractelement <8 x i1> %Cmp10, i32 2
+  br i1 %E27, label %CF, label %CF77
+
+CF77:                                             ; preds = %CF77, %CF81, %CF78
+  %Shuff28 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff, <8 x i32> <i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 undef>
+  %I29 = insertelement <1 x i16> zeroinitializer, i16 -1, i32 0
+  %B30 = urem <8 x i32> %Tr, zeroinitializer
+  %Tr31 = trunc i32 0 to i16
+  %Sl32 = select i1 %Cmp, <2 x i1> zeroinitializer, <2 x i1> zeroinitializer
+  %L33 = load i8* %Sl
+  store i8 %L26, i8* %Sl
+  %E34 = extractelement <4 x i32> zeroinitializer, i32 0
+  %Shuff35 = shufflevector <1 x i16> zeroinitializer, <1 x i16> %B, <1 x i32> undef
+  %I36 = insertelement <8 x i64> %Shuff28, i64 %E, i32 7
+  %B37 = srem <1 x i16> %I29, zeroinitializer
+  %FC38 = sitofp <8 x i32> %B30 to <8 x double>
+  %Sl39 = select i1 %Cmp, double 0.000000e+00, double %Sl24
+  %L40 = load i8* %Sl
+  store i8 %Sl16, i8* %Sl
+  %E41 = extractelement <1 x i16> zeroinitializer, i32 0
+  %Shuff42 = shufflevector <8 x i1> %Cmp17, <8 x i1> %Cmp10, <8 x i32> <i32 14, i32 undef, i32 2, i32 4, i32 undef, i32 8, i32 10, i32 12>
+  %I43 = insertelement <4 x i32> zeroinitializer, i32 272505, i32 0
+  %B44 = urem <8 x i32> %B30, %Tr
+  %PC = bitcast i8* %0 to i64*
+  %Sl45 = select i1 %Cmp, <8 x i1> %Cmp10, <8 x i1> %Shuff42
+  %Cmp46 = fcmp ugt float 0xB856238A00000000, 0x47DA795E40000000
+  br i1 %Cmp46, label %CF77, label %CF80
+
+CF80:                                             ; preds = %CF80, %CF77
+  %L47 = load i64* %PC
+  store i8 77, i8* %Sl
+  %E48 = extractelement <8 x i64> zeroinitializer, i32 2
+  %Shuff49 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff7, <8 x i32> <i32 5, i32 7, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 3>
+  %I50 = insertelement <8 x i64> zeroinitializer, i64 %L47, i32 7
+  %B51 = fdiv float 0x46CC2D8000000000, %FC23
+  %PC52 = bitcast <8 x i64>* %A3 to i64*
+  %Sl53 = select i1 %Cmp, <8 x i64> %Shuff, <8 x i64> %Shuff
+  %Cmp54 = fcmp ole float 0x47DA795E40000000, 0xB856238A00000000
+  br i1 %Cmp54, label %CF80, label %CF81
+
+CF81:                                             ; preds = %CF80
+  %L55 = load i8* %Sl
+  store i8 %Sl16, i8* %Sl
+  %E56 = extractelement <1 x i16> %B, i32 0
+  %Shuff57 = shufflevector <1 x i16> zeroinitializer, <1 x i16> zeroinitializer, <1 x i32> <i32 1>
+  %I58 = insertelement <8 x i64> zeroinitializer, i64 %L47, i32 7
+  %B59 = srem i32 %E19, %E19
+  %Sl60 = select i1 %Cmp, i8 77, i8 77
+  %Cmp61 = icmp ult <1 x i16> zeroinitializer, %B
+  %L62 = load i8* %Sl
+  store i64 %L47, i64* %PC52
+  %E63 = extractelement <4 x i32> %I43, i32 2
+  %Shuff64 = shufflevector <4 x i1> zeroinitializer, <4 x i1> zeroinitializer, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
+  %I65 = insertelement <8 x i64> %B22, i64 %L47, i32 7
+  %B66 = add <8 x i64> %I50, %I65
+  %FC67 = uitofp i16 %E12 to float
+  %Sl68 = select i1 %Cmp, <8 x i32> %B30, <8 x i32> zeroinitializer
+  %Cmp69 = fcmp ord double 0.000000e+00, 0.000000e+00
+  br i1 %Cmp69, label %CF77, label %CF79
+
+CF79:                                             ; preds = %CF81
+  %L70 = load i32* %A
+  store i64 %4, i64* %PC
+  %E71 = extractelement <4 x i32> zeroinitializer, i32 0
+  %Shuff72 = shufflevector <8 x i32> zeroinitializer, <8 x i32> %B44, <8 x i32> <i32 11, i32 undef, i32 15, i32 1, i32 3, i32 undef, i32 7, i32 9>
+  %I73 = insertelement <8 x i16> zeroinitializer, i16 %E12, i32 5
+  %B74 = fsub double 0.000000e+00, 0.000000e+00
+  %Sl75 = select i1 %Cmp46, i32 %E6, i32 %E19
+  %Cmp76 = icmp ugt <4 x i32> %I43, zeroinitializer
+  store i8 %L, i8* %Sl
+  store i64 %L47, i64* %PC
+  store i64 %L47, i64* %PC
+  store i8 %L5, i8* %Sl
+  store i8 %L5, i8* %0
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s1935737938.ll b/test/CodeGen/Mips/msa/llvm-stress-s1935737938.ll
new file mode 100644
index 0000000..f9cab03
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s1935737938.ll
@@ -0,0 +1,138 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; `Opc && "Cannot copy registers"' assertion.
+; It should at least successfully build.
+
+define void @autogen_SD1935737938(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca i64
+  %A3 = alloca <4 x i32>
+  %A2 = alloca i64
+  %A1 = alloca i32
+  %A = alloca <2 x i64>
+  %L = load i8* %0
+  store i8 -1, i8* %0
+  %E = extractelement <2 x i32> zeroinitializer, i32 0
+  %Shuff = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %I = insertelement <1 x i64> <i64 -1>, i64 286689, i32 0
+  %B = lshr i8 %L, -69
+  %ZE = fpext float 0xBF2AA5FE80000000 to double
+  %Sl = select i1 true, <1 x i64> <i64 -1>, <1 x i64> <i64 -1>
+  %L5 = load i8* %0
+  store i8 -69, i8* %0
+  %E6 = extractelement <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, i32 14
+  %Shuff7 = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %I8 = insertelement <2 x i32> zeroinitializer, i32 135673, i32 1
+  %B9 = udiv i8 %B, %B
+  %FC = uitofp i32 %3 to double
+  %Sl10 = select i1 true, <1 x i1> zeroinitializer, <1 x i1> zeroinitializer
+  %Cmp = icmp ne <1 x i64> %I, <i64 -1>
+  %L11 = load i8* %0
+  store i8 %L11, i8* %0
+  %E12 = extractelement <1 x i64> <i64 -1>, i32 0
+  %Shuff13 = shufflevector <1 x i64> %Sl, <1 x i64> <i64 -1>, <1 x i32> <i32 1>
+  %I14 = insertelement <1 x i64> %I, i64 303290, i32 0
+  %B15 = frem float 0.000000e+00, 0.000000e+00
+  %Sl16 = select i1 true, <1 x i1> %Cmp, <1 x i1> zeroinitializer
+  %Cmp17 = fcmp one float 0xBD946F9840000000, %B15
+  br label %CF74
+
+CF74:                                             ; preds = %CF74, %CF80, %CF76, %BB
+  %L18 = load i8* %0
+  store i8 -69, i8* %0
+  %E19 = extractelement <1 x i64> %Sl, i32 0
+  %Shuff20 = shufflevector <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i32> <i32 12, i32 14, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
+  %I21 = insertelement <2 x i32> %Shuff, i32 135673, i32 0
+  %B22 = urem i32 135673, %3
+  %FC23 = sitofp i8 %L to float
+  %Sl24 = select i1 true, i8 %B, i8 %L18
+  %L25 = load i8* %0
+  store i8 %L, i8* %0
+  %E26 = extractelement <2 x i32> %Shuff, i32 1
+  %Shuff27 = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> <i32 2, i32 0>
+  %I28 = insertelement <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, i64 %E12, i32 8
+  %B29 = frem double %ZE, 0x235104F0E94F406E
+  %Tr = trunc i64 286689 to i8
+  %Sl30 = select i1 true, float 0x45B13EA500000000, float %B15
+  %Cmp31 = icmp eq i32 %B22, %B22
+  br i1 %Cmp31, label %CF74, label %CF80
+
+CF80:                                             ; preds = %CF74
+  %L32 = load i8* %0
+  store i8 -1, i8* %0
+  %E33 = extractelement <2 x i32> zeroinitializer, i32 1
+  %Shuff34 = shufflevector <1 x i64> %Shuff13, <1 x i64> <i64 -1>, <1 x i32> zeroinitializer
+  %I35 = insertelement <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 -1, i32 0
+  %FC36 = sitofp <1 x i1> %Cmp to <1 x float>
+  %Sl37 = select i1 true, <8 x i8> %Shuff20, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %Cmp38 = icmp sgt <2 x i32> %I21, %Shuff27
+  %L39 = load i8* %0
+  store i8 %Sl24, i8* %0
+  %E40 = extractelement <8 x i64> zeroinitializer, i32 1
+  %Shuff41 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Cmp38, <2 x i32> <i32 0, i32 2>
+  %I42 = insertelement <4 x i32> zeroinitializer, i32 414573, i32 2
+  %B43 = srem i8 %L5, %L39
+  %Sl44 = select i1 %Cmp17, i8 %L, i8 %L
+  %Cmp45 = fcmp une float 0x3AFCE1A0C0000000, 0.000000e+00
+  br i1 %Cmp45, label %CF74, label %CF76
+
+CF76:                                             ; preds = %CF80
+  %L46 = load i8* %0
+  store i8 %L39, i8* %0
+  %E47 = extractelement <2 x i32> %Shuff27, i32 0
+  %Shuff48 = shufflevector <1 x i1> %Sl10, <1 x i1> %Sl10, <1 x i32> <i32 1>
+  %I49 = insertelement <1 x i64> <i64 -1>, i64 %E12, i32 0
+  %FC50 = fptosi double 0x235104F0E94F406E to i32
+  %Sl51 = select i1 %Cmp17, <16 x i64> %I28, <16 x i64> %I28
+  %Cmp52 = icmp ne i8 %Tr, %Sl24
+  br i1 %Cmp52, label %CF74, label %CF75
+
+CF75:                                             ; preds = %CF75, %CF76
+  %L53 = load i8* %0
+  store i8 %L18, i8* %0
+  %E54 = extractelement <8 x i8> %Shuff20, i32 5
+  %Shuff55 = shufflevector <2 x i32> %Shuff, <2 x i32> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %I56 = insertelement <4 x i32> %I42, i32 %B22, i32 2
+  %B57 = sub i64 %E40, %E6
+  %Sl58 = select i1 true, i64 303290, i64 %E40
+  %Cmp59 = icmp slt i64 %E40, %E6
+  br i1 %Cmp59, label %CF75, label %CF78
+
+CF78:                                             ; preds = %CF75
+  %L60 = load i8* %0
+  store i8 -69, i8* %0
+  %E61 = extractelement <2 x i32> zeroinitializer, i32 0
+  %Shuff62 = shufflevector <2 x i32> %Shuff7, <2 x i32> %I21, <2 x i32> <i32 1, i32 3>
+  %I63 = insertelement <1 x i1> %Sl16, i1 %Cmp45, i32 0
+  %B64 = and i8 %Sl44, -69
+  %ZE65 = zext <1 x i1> %Shuff48 to <1 x i64>
+  %Sl66 = select i1 true, <1 x i64> %I, <1 x i64> %I49
+  %Cmp67 = icmp ugt i64 286689, %E40
+  br label %CF
+
+CF:                                               ; preds = %CF, %CF78
+  %L68 = load i8* %0
+  store i64 %B57, i64* %2
+  %E69 = extractelement <2 x i1> %Shuff41, i32 1
+  br i1 %E69, label %CF, label %CF77
+
+CF77:                                             ; preds = %CF77, %CF
+  %Shuff70 = shufflevector <1 x i64> %Shuff34, <1 x i64> <i64 -1>, <1 x i32> zeroinitializer
+  %I71 = insertelement <2 x i32> %Shuff, i32 %E26, i32 0
+  %Se = sext i8 %L60 to i32
+  %Sl72 = select i1 %Cmp45, <2 x i32> %Shuff62, <2 x i32> %I71
+  %Cmp73 = fcmp ugt double 0x235104F0E94F406E, 0x235104F0E94F406E
+  br i1 %Cmp73, label %CF77, label %CF79
+
+CF79:                                             ; preds = %CF77
+  store i8 %L18, i8* %0
+  store i8 %E54, i8* %0
+  store i8 %L39, i8* %0
+  store i8 %L39, i8* %0
+  store i8 %B, i8* %0
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s2090927243-simplified.ll b/test/CodeGen/Mips/msa/llvm-stress-s2090927243-simplified.ll
new file mode 100644
index 0000000..3811314
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s2090927243-simplified.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a "Cannot select ..." error.
+; This was because undef's are ignored when checking if a vector constant is a
+; splat, but are legalized to zero if left in the DAG which changes the constant
+; into a non-splat.
+;
+; It should at least successfully build.
+
+define void @autogen_SD2090927243() {
+BB:
+  br label %CF77
+
+CF77:                                             ; preds = %CF77, %CF80
+  %Shuff27 = shufflevector <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>,
+                           <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>,
+                           <16 x i32> <i32 27, i32 29, i32 31, i32 1, i32 3, i32 5, i32 undef, i32 9, i32 11, i32 13, i32 undef, i32 17, i32 19, i32 21, i32 23, i32 undef>
+  %ZE30 = zext <16 x i8> %Shuff27 to <16 x i32>
+  %Cmp32 = fcmp ueq float undef, 0x3CDA6E5E40000000
+  br i1 %Cmp32, label %CF77, label %CF
+
+CF:                                               ; preds = %CF, %CF81
+  %E48 = extractelement <16 x i32> %ZE30, i32 14
+  br i1 undef, label %CF, label %CF78
+
+CF78:                                             ; preds = %CF
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s2501752154-simplified.ll b/test/CodeGen/Mips/msa/llvm-stress-s2501752154-simplified.ll
new file mode 100644
index 0000000..564ad74
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s2501752154-simplified.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a "Cannot select ..." error.
+; This happened because the legalizer treated undef's in the <4 x float>
+; constant as equivalent to the defined elements when checking if it a constant
+; splat, but then proceeded to legalize the undef's to zero, leaving it as a
+; non-splat that cannot be selected. It should have eliminated the undef's by
+; rewriting the splat constant.
+
+; It should at least successfully build.
+
+define void @autogen_SD2501752154() {
+BB:
+  %BC = bitcast <4 x i32> <i32 -1, i32 -1, i32 undef, i32 undef> to <4 x float>
+  br label %CF74
+
+CF74:                                             ; preds = %CF74, %CF
+  %E54 = extractelement <1 x i1> undef, i32 0
+  br i1 %E54, label %CF74, label %CF79
+
+CF79:                                             ; preds = %CF75
+  %I63 = insertelement <4 x float> %BC, float undef, i32 0
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s2704903805.ll b/test/CodeGen/Mips/msa/llvm-stress-s2704903805.ll
new file mode 100644
index 0000000..e14f405
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s2704903805.ll
@@ -0,0 +1,141 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA after dereferencing a null this pointer.
+; It should at least successfully build.
+
+define void @autogen_SD2704903805(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca i32
+  %A3 = alloca i32
+  %A2 = alloca i8
+  %A1 = alloca i32
+  %A = alloca i8
+  %L = load i8* %0
+  store i8 %5, i8* %0
+  %E = extractelement <2 x i16> zeroinitializer, i32 0
+  %Shuff = shufflevector <1 x i8> <i8 -1>, <1 x i8> <i8 -1>, <1 x i32> undef
+  %I = insertelement <1 x i8> <i8 -1>, i8 85, i32 0
+  %B = lshr <2 x i16> zeroinitializer, zeroinitializer
+  %FC = sitofp <4 x i16> zeroinitializer to <4 x float>
+  %Sl = select i1 true, float 0.000000e+00, float 0x401E76A240000000
+  %Cmp = icmp ule i16 -25210, %E
+  br label %CF83
+
+CF83:                                             ; preds = %BB
+  %L5 = load i8* %0
+  store i8 85, i8* %0
+  %E6 = extractelement <1 x i8> <i8 -1>, i32 0
+  %Shuff7 = shufflevector <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %I8 = insertelement <4 x i16> zeroinitializer, i16 %E, i32 3
+  %B9 = ashr <2 x i16> %Shuff7, zeroinitializer
+  %FC10 = sitofp i32 -1 to float
+  %Sl11 = select i1 %Cmp, i32 -1, i32 -1
+  %Cmp12 = icmp sgt i32 -1, -1
+  br label %CF
+
+CF:                                               ; preds = %CF, %CF81, %CF83
+  %L13 = load i8* %0
+  store i8 0, i8* %0
+  %E14 = extractelement <2 x i64> zeroinitializer, i32 0
+  %Shuff15 = shufflevector <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
+  %I16 = insertelement <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i64 81222, i32 1
+  %B17 = lshr <2 x i16> zeroinitializer, %B
+  %Tr = trunc i32 272597 to i1
+  br i1 %Tr, label %CF, label %CF80
+
+CF80:                                             ; preds = %CF80, %CF
+  %Sl18 = select i1 %Cmp, <2 x i64> zeroinitializer, <2 x i64> zeroinitializer
+  %Cmp19 = icmp ne i1 %Cmp12, %Cmp
+  br i1 %Cmp19, label %CF80, label %CF81
+
+CF81:                                             ; preds = %CF80
+  %L20 = load i8* %0
+  store i8 85, i8* %0
+  %E21 = extractelement <1 x i8> <i8 -1>, i32 0
+  %Shuff22 = shufflevector <1 x i8> <i8 -1>, <1 x i8> %Shuff, <1 x i32> zeroinitializer
+  %I23 = insertelement <1 x i8> <i8 -1>, i8 %L5, i32 0
+  %FC24 = fptoui <4 x float> %FC to <4 x i16>
+  %Sl25 = select i1 %Cmp, <2 x i32> zeroinitializer, <2 x i32> <i32 -1, i32 -1>
+  %Cmp26 = icmp ult <4 x i64> %I16, %Shuff15
+  %L27 = load i8* %0
+  store i8 %L, i8* %0
+  %E28 = extractelement <1 x i8> <i8 -1>, i32 0
+  %Shuff29 = shufflevector <8 x i16> zeroinitializer, <8 x i16> zeroinitializer, <8 x i32> <i32 11, i32 undef, i32 15, i32 1, i32 3, i32 5, i32 undef, i32 9>
+  %I30 = insertelement <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i64 %E14, i32 1
+  %B31 = mul i8 %E28, 85
+  %PC = bitcast i32* %A3 to i32*
+  %Sl32 = select i1 %Cmp12, float %FC10, float 0x4712BFE680000000
+  %L33 = load i32* %PC
+  store i32 %L33, i32* %PC
+  %E34 = extractelement <2 x i16> zeroinitializer, i32 1
+  %Shuff35 = shufflevector <1 x i8> %Shuff, <1 x i8> <i8 -1>, <1 x i32> zeroinitializer
+  %I36 = insertelement <1 x i8> <i8 -1>, i8 %L13, i32 0
+  %B37 = xor i8 %L27, %L
+  %Sl38 = select i1 %Cmp, i16 %E34, i16 %E
+  %Cmp39 = icmp eq i1 %Cmp19, %Cmp
+  br i1 %Cmp39, label %CF, label %CF77
+
+CF77:                                             ; preds = %CF77, %CF81
+  %L40 = load i32* %PC
+  store i32 %3, i32* %PC
+  %E41 = extractelement <2 x i32> zeroinitializer, i32 0
+  %Shuff42 = shufflevector <2 x i32> <i32 -1, i32 -1>, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %I43 = insertelement <1 x i8> <i8 -1>, i8 0, i32 0
+  %B44 = or i16 %E, -25210
+  %Se = sext i32 %3 to i64
+  %Sl45 = select i1 true, <1 x i8> %Shuff, <1 x i8> %I43
+  %Cmp46 = icmp sge <1 x i8> %I36, %Shuff
+  %L47 = load i32* %PC
+  store i32 %L33, i32* %PC
+  %E48 = extractelement <2 x i16> zeroinitializer, i32 0
+  %Shuff49 = shufflevector <1 x i8> <i8 -1>, <1 x i8> <i8 -1>, <1 x i32> <i32 1>
+  %I50 = insertelement <2 x i32> %Sl25, i32 47963, i32 1
+  %B51 = srem <1 x i8> %I, %Shuff22
+  %FC52 = sitofp i8 %5 to double
+  %Sl53 = select i1 %Cmp39, i8 %L27, i8 85
+  %Cmp54 = icmp slt i16 %E34, %E34
+  br i1 %Cmp54, label %CF77, label %CF78
+
+CF78:                                             ; preds = %CF78, %CF77
+  %L55 = load i32* %PC
+  store i32 %L33, i32* %PC
+  %E56 = extractelement <8 x i16> %Shuff29, i32 4
+  %Shuff57 = shufflevector <1 x i8> <i8 -1>, <1 x i8> <i8 -1>, <1 x i32> <i32 1>
+  %I58 = insertelement <1 x i8> %B51, i8 %Sl53, i32 0
+  %ZE = fpext float %FC10 to double
+  %Sl59 = select i1 %Cmp12, <2 x i16> %B9, <2 x i16> zeroinitializer
+  %Cmp60 = fcmp ult double 0.000000e+00, 0.000000e+00
+  br i1 %Cmp60, label %CF78, label %CF79
+
+CF79:                                             ; preds = %CF79, %CF78
+  %L61 = load i32* %PC
+  store i32 %L33, i32* %A3
+  %E62 = extractelement <4 x i64> %Shuff15, i32 1
+  %Shuff63 = shufflevector <8 x i16> %Shuff29, <8 x i16> %Shuff29, <8 x i32> <i32 undef, i32 10, i32 12, i32 undef, i32 undef, i32 undef, i32 4, i32 6>
+  %I64 = insertelement <2 x i64> zeroinitializer, i64 %Se, i32 0
+  %B65 = shl i8 %5, 85
+  %ZE66 = zext <4 x i1> %Cmp26 to <4 x i32>
+  %Sl67 = select i1 %Tr, <1 x i8> %Shuff, <1 x i8> %I23
+  %Cmp68 = fcmp olt float 0x4712BFE680000000, 0x4712BFE680000000
+  br i1 %Cmp68, label %CF79, label %CF82
+
+CF82:                                             ; preds = %CF79
+  %L69 = load i32* %PC
+  store i32 %L33, i32* %PC
+  %E70 = extractelement <8 x i16> zeroinitializer, i32 3
+  %Shuff71 = shufflevector <4 x i64> %Shuff15, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i32> <i32 6, i32 undef, i32 2, i32 4>
+  %I72 = insertelement <1 x i8> <i8 -1>, i8 %L, i32 0
+  %B73 = srem i64 %E62, %Se
+  %ZE74 = zext <4 x i1> %Cmp26 to <4 x i32>
+  %Sl75 = select i1 %Cmp, i32 463279, i32 %L61
+  %Cmp76 = icmp sgt <1 x i8> %Shuff49, %Shuff22
+  store i8 %B31, i8* %0
+  store i8 85, i8* %0
+  store i32 %L33, i32* %PC
+  store i8 %B65, i8* %0
+  store i8 %L5, i8* %0
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s3861334421.ll b/test/CodeGen/Mips/msa/llvm-stress-s3861334421.ll
new file mode 100644
index 0000000..1a03e55
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s3861334421.ll
@@ -0,0 +1,149 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; "Don't know how to expand this condition!" unreachable.
+; It should at least successfully build.
+
+define void @autogen_SD3861334421(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca <2 x i32>
+  %A3 = alloca <2 x double>
+  %A2 = alloca i64
+  %A1 = alloca i64
+  %A = alloca double
+  %L = load i8* %0
+  store i8 -101, i8* %0
+  %E = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 0
+  %Shuff = shufflevector <8 x i64> zeroinitializer, <8 x i64> zeroinitializer, <8 x i32> <i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 undef, i32 1>
+  %I = insertelement <8 x i64> zeroinitializer, i64 %4, i32 5
+  %B = and i64 116376, 57247
+  %FC = uitofp i8 7 to double
+  %Sl = select i1 false, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %L5 = load i8* %0
+  store i8 %L, i8* %0
+  %E6 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 3
+  %Shuff7 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 6, i32 0>
+  %I8 = insertelement <8 x i8> %Sl, i8 7, i32 4
+  %B9 = or <8 x i64> zeroinitializer, zeroinitializer
+  %Sl10 = select i1 false, i64 116376, i64 380809
+  %Cmp = icmp sgt i32 394647, 17081
+  br label %CF
+
+CF:                                               ; preds = %CF, %BB
+  %L11 = load i8* %0
+  store i8 -87, i8* %0
+  %E12 = extractelement <4 x i64> zeroinitializer, i32 0
+  %Shuff13 = shufflevector <8 x i64> zeroinitializer, <8 x i64> zeroinitializer, <8 x i32> <i32 7, i32 9, i32 11, i32 13, i32 undef, i32 1, i32 3, i32 5>
+  %I14 = insertelement <4 x i64> zeroinitializer, i64 380809, i32 1
+  %B15 = srem i64 %Sl10, 380809
+  %FC16 = sitofp i64 57247 to float
+  %Sl17 = select i1 false, double 0x87A9374869A78EC6, double 0.000000e+00
+  %Cmp18 = icmp uge i8 %L, %5
+  br i1 %Cmp18, label %CF, label %CF80
+
+CF80:                                             ; preds = %CF80, %CF88, %CF
+  %L19 = load i8* %0
+  store i8 -101, i8* %0
+  %E20 = extractelement <4 x i64> zeroinitializer, i32 0
+  %Shuff21 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %Shuff7, <4 x i32> <i32 7, i32 1, i32 3, i32 5>
+  %I22 = insertelement <4 x i64> zeroinitializer, i64 127438, i32 1
+  %B23 = fdiv double %Sl17, 0.000000e+00
+  %Sl24 = select i1 %Cmp18, i32 420510, i32 492085
+  %Cmp25 = icmp ugt i1 %Cmp18, false
+  br i1 %Cmp25, label %CF80, label %CF83
+
+CF83:                                             ; preds = %CF83, %CF80
+  %L26 = load i8* %0
+  store i8 -87, i8* %0
+  %E27 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 0
+  %Shuff28 = shufflevector <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 7, i32 1, i32 3, i32 5>
+  %I29 = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 492085, i32 1
+  %B30 = lshr <8 x i8> %I8, %I8
+  %FC31 = sitofp <4 x i32> %Shuff28 to <4 x double>
+  %Sl32 = select i1 false, <8 x i8> %I8, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %Cmp33 = icmp eq i64 %B, 116376
+  br i1 %Cmp33, label %CF83, label %CF88
+
+CF88:                                             ; preds = %CF83
+  %L34 = load i8* %0
+  store i8 -87, i8* %0
+  %E35 = extractelement <8 x i64> %Shuff, i32 7
+  %Shuff36 = shufflevector <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %Shuff28, <4 x i32> <i32 2, i32 undef, i32 undef, i32 0>
+  %I37 = insertelement <4 x i64> zeroinitializer, i64 380809, i32 0
+  %B38 = xor <8 x i64> %B9, %B9
+  %ZE = zext i32 0 to i64
+  %Sl39 = select i1 %Cmp33, i8 %L11, i8 %L5
+  %Cmp40 = icmp sgt i1 %Cmp, false
+  br i1 %Cmp40, label %CF80, label %CF81
+
+CF81:                                             ; preds = %CF81, %CF85, %CF87, %CF88
+  %L41 = load i8* %0
+  store i8 %L34, i8* %0
+  %E42 = extractelement <8 x i64> %Shuff13, i32 6
+  %Shuff43 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 7>
+  %I44 = insertelement <4 x i64> zeroinitializer, i64 116376, i32 3
+  %B45 = fsub float %FC16, 0x3AC86DCC40000000
+  %Tr = trunc <4 x i64> %I14 to <4 x i32>
+  %Sl46 = select i1 false, <8 x i64> %B38, <8 x i64> zeroinitializer
+  %Cmp47 = icmp sgt i1 %Cmp18, %Cmp18
+  br i1 %Cmp47, label %CF81, label %CF85
+
+CF85:                                             ; preds = %CF81
+  %L48 = load i8* %0
+  store i8 -101, i8* %0
+  %E49 = extractelement <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i32 2
+  %Shuff50 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+  %I51 = insertelement <4 x i64> zeroinitializer, i64 %E20, i32 3
+  %B52 = or i32 336955, %Sl24
+  %FC53 = uitofp i8 %L48 to double
+  %Sl54 = select i1 %Cmp47, i32 %3, i32 %Sl24
+  %Cmp55 = icmp ne <8 x i64> %Shuff13, zeroinitializer
+  %L56 = load i8* %0
+  store i8 %L11, i8* %0
+  %E57 = extractelement <4 x i64> %Shuff21, i32 1
+  %Shuff58 = shufflevector <8 x i64> %Shuff, <8 x i64> zeroinitializer, <8 x i32> <i32 4, i32 6, i32 undef, i32 10, i32 12, i32 undef, i32 0, i32 2>
+  %I59 = insertelement <4 x i64> zeroinitializer, i64 %E42, i32 2
+  %B60 = udiv <8 x i8> %Sl, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %Tr61 = trunc i8 49 to i1
+  br i1 %Tr61, label %CF81, label %CF84
+
+CF84:                                             ; preds = %CF84, %CF85
+  %Sl62 = select i1 false, i8 %L, i8 %L48
+  %Cmp63 = icmp ne <8 x i64> %I, zeroinitializer
+  %L64 = load i8* %0
+  store i8 %5, i8* %0
+  %E65 = extractelement <8 x i1> %Cmp55, i32 0
+  br i1 %E65, label %CF84, label %CF87
+
+CF87:                                             ; preds = %CF84
+  %Shuff66 = shufflevector <4 x i64> %Shuff21, <4 x i64> %I14, <4 x i32> <i32 3, i32 undef, i32 7, i32 1>
+  %I67 = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 %Sl54, i32 1
+  %B68 = frem double %B23, %Sl17
+  %ZE69 = zext <8 x i8> %Sl32 to <8 x i64>
+  %Sl70 = select i1 %Tr61, i64 %E20, i64 %E12
+  %Cmp71 = icmp slt <8 x i64> %I, %Shuff
+  %L72 = load i8* %0
+  store i8 %L72, i8* %0
+  %E73 = extractelement <8 x i1> %Cmp55, i32 6
+  br i1 %E73, label %CF81, label %CF82
+
+CF82:                                             ; preds = %CF82, %CF87
+  %Shuff74 = shufflevector <4 x i32> %I67, <4 x i32> %I29, <4 x i32> <i32 1, i32 3, i32 undef, i32 7>
+  %I75 = insertelement <4 x i64> zeroinitializer, i64 380809, i32 3
+  %B76 = fsub double 0.000000e+00, %FC53
+  %Tr77 = trunc i32 %E to i8
+  %Sl78 = select i1 %Cmp18, i64* %A2, i64* %2
+  %Cmp79 = icmp eq i32 394647, 492085
+  br i1 %Cmp79, label %CF82, label %CF86
+
+CF86:                                             ; preds = %CF82
+  store i64 %Sl70, i64* %Sl78
+  store i64 %E57, i64* %Sl78
+  store i64 %Sl70, i64* %Sl78
+  store i64 %B, i64* %Sl78
+  store i64 %Sl10, i64* %Sl78
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s3926023935.ll b/test/CodeGen/Mips/msa/llvm-stress-s3926023935.ll
new file mode 100644
index 0000000..96547d9
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s3926023935.ll
@@ -0,0 +1,143 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; "Type for zero vector elements is not legal" assertion.
+; It should at least successfully build.
+
+define void @autogen_SD3926023935(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca i1
+  %A3 = alloca float
+  %A2 = alloca double
+  %A1 = alloca float
+  %A = alloca double
+  %L = load i8* %0
+  store i8 -123, i8* %0
+  %E = extractelement <4 x i64> zeroinitializer, i32 1
+  %Shuff = shufflevector <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %I = insertelement <2 x i1> zeroinitializer, i1 false, i32 0
+  %BC = bitcast i64 181325 to double
+  %Sl = select i1 false, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
+  %Cmp = icmp ne <4 x i64> zeroinitializer, zeroinitializer
+  %L5 = load i8* %0
+  store i8 %L, i8* %0
+  %E6 = extractelement <4 x i64> zeroinitializer, i32 3
+  %Shuff7 = shufflevector <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, <2 x i32> <i32 2, i32 0>
+  %I8 = insertelement <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, i64 498254, i32 4
+  %B = shl i32 0, 364464
+  %Sl9 = select i1 false, i64 %E, i64 498254
+  %Cmp10 = icmp sge i8 -123, %5
+  br label %CF80
+
+CF80:                                             ; preds = %BB
+  %L11 = load i8* %0
+  store i8 -123, i8* %0
+  %E12 = extractelement <2 x i16> zeroinitializer, i32 1
+  %Shuff13 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %I14 = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 %B, i32 2
+  %B15 = sdiv i64 334618, -1
+  %PC = bitcast i1* %A4 to i64*
+  %Sl16 = select i1 %Cmp10, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+  %Cmp17 = icmp ule <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %Sl16
+  %L18 = load double* %A2
+  store i64 498254, i64* %PC
+  %E19 = extractelement <4 x i64> zeroinitializer, i32 0
+  %Shuff20 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %I, <2 x i32> <i32 3, i32 1>
+  %I21 = insertelement <2 x i1> zeroinitializer, i1 false, i32 1
+  %B22 = fadd double 0.000000e+00, %BC
+  %ZE = zext <2 x i1> %Shuff20 to <2 x i32>
+  %Sl23 = select i1 %Cmp10, <2 x i1> %Shuff20, <2 x i1> zeroinitializer
+  %Cmp24 = icmp ult <2 x i32> zeroinitializer, zeroinitializer
+  %L25 = load i8* %0
+  store i8 %L25, i8* %0
+  %E26 = extractelement <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>, i32 3
+  %Shuff27 = shufflevector <4 x i32> %Shuff, <4 x i32> %I14, <4 x i32> <i32 6, i32 0, i32 undef, i32 4>
+  %I28 = insertelement <4 x i32> zeroinitializer, i32 %3, i32 0
+  %B29 = lshr i8 %E26, -43
+  %Tr = trunc i8 %L5 to i1
+  br label %CF79
+
+CF79:                                             ; preds = %CF80
+  %Sl30 = select i1 false, i8 %B29, i8 -123
+  %Cmp31 = icmp sge <2 x i1> %I, %I
+  %L32 = load i64* %PC
+  store i8 -123, i8* %0
+  %E33 = extractelement <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, i32 2
+  %Shuff34 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %Shuff13, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+  %I35 = insertelement <4 x i64> zeroinitializer, i64 498254, i32 3
+  %B36 = sub <8 x i64> %I8, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  %PC37 = bitcast i8* %0 to i1*
+  %Sl38 = select i1 %Cmp10, i8 -43, i8 %L5
+  %Cmp39 = icmp eq i64 498254, %B15
+  br label %CF
+
+CF:                                               ; preds = %CF, %CF79
+  %L40 = load double* %A
+  store i1 %Cmp39, i1* %PC37
+  %E41 = extractelement <4 x i64> zeroinitializer, i32 3
+  %Shuff42 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %ZE, <2 x i32> <i32 2, i32 undef>
+  %I43 = insertelement <4 x i32> %Shuff, i32 %3, i32 0
+  %B44 = shl i64 %E41, -1
+  %Se = sext <2 x i1> %I to <2 x i32>
+  %Sl45 = select i1 %Cmp10, i1 false, i1 false
+  br i1 %Sl45, label %CF, label %CF77
+
+CF77:                                             ; preds = %CF77, %CF
+  %Cmp46 = fcmp uno double 0.000000e+00, 0.000000e+00
+  br i1 %Cmp46, label %CF77, label %CF78
+
+CF78:                                             ; preds = %CF78, %CF83, %CF82, %CF77
+  %L47 = load i64* %PC
+  store i8 -123, i8* %0
+  %E48 = extractelement <4 x i64> zeroinitializer, i32 3
+  %Shuff49 = shufflevector <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 6, i32 undef>
+  %I50 = insertelement <2 x i1> zeroinitializer, i1 %Cmp10, i32 0
+  %B51 = sdiv i64 %E19, 463132
+  %Tr52 = trunc i64 %E48 to i32
+  %Sl53 = select i1 %Tr, i1 %Cmp46, i1 %Cmp10
+  br i1 %Sl53, label %CF78, label %CF83
+
+CF83:                                             ; preds = %CF78
+  %Cmp54 = fcmp uge double %L40, %L40
+  br i1 %Cmp54, label %CF78, label %CF82
+
+CF82:                                             ; preds = %CF83
+  %L55 = load i64* %PC
+  store i64 %L32, i64* %PC
+  %E56 = extractelement <2 x i16> %Shuff7, i32 1
+  %Shuff57 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 6, i32 0>
+  %I58 = insertelement <2 x i32> %Sl, i32 %Tr52, i32 0
+  %B59 = or i32 %B, %3
+  %FC = sitofp i64 498254 to double
+  %Sl60 = select i1 false, i64 %E6, i64 -1
+  %Cmp61 = icmp sgt <4 x i32> %Shuff27, %I43
+  %L62 = load i64* %PC
+  store i64 %Sl9, i64* %PC
+  %E63 = extractelement <2 x i32> %ZE, i32 0
+  %Shuff64 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %Shuff13, <4 x i32> <i32 1, i32 3, i32 undef, i32 7>
+  %I65 = insertelement <4 x i32> %Shuff, i32 %3, i32 3
+  %B66 = sub i64 %L47, 53612
+  %Tr67 = trunc i64 %4 to i32
+  %Sl68 = select i1 %Cmp39, i1 %Cmp39, i1 false
+  br i1 %Sl68, label %CF78, label %CF81
+
+CF81:                                             ; preds = %CF82
+  %Cmp69 = icmp ne <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, %B36
+  %L70 = load i8* %0
+  store i64 %L55, i64* %PC
+  %E71 = extractelement <4 x i32> %Shuff49, i32 1
+  %Shuff72 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %Shuff34, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %I73 = insertelement <4 x i64> %Shuff64, i64 %E, i32 2
+  %B74 = lshr <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, %B36
+  %Sl75 = select i1 %Sl68, i64 %B51, i64 %L55
+  %Cmp76 = icmp sgt <8 x i64> %B74, %B36
+  store i1 %Cmp39, i1* %PC37
+  store i64 %E41, i64* %PC
+  store i64 %L32, i64* %PC
+  store i64 %Sl75, i64* %2
+  store i64 %L32, i64* %PC
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s3997499501.ll b/test/CodeGen/Mips/msa/llvm-stress-s3997499501.ll
new file mode 100644
index 0000000..bef75f3
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s3997499501.ll
@@ -0,0 +1,152 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed to select instructions for extract_vector_elt for
+; v4f32 on MSA.
+; It should at least successfully build.
+
+define void @autogen_SD3997499501(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca <1 x double>
+  %A3 = alloca double
+  %A2 = alloca float
+  %A1 = alloca double
+  %A = alloca double
+  %L = load i8* %0
+  store i8 97, i8* %0
+  %E = extractelement <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, i32 14
+  %Shuff = shufflevector <2 x i1> zeroinitializer, <2 x i1> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %I = insertelement <4 x i64> zeroinitializer, i64 0, i32 3
+  %Tr = trunc <1 x i64> zeroinitializer to <1 x i8>
+  %Sl = select i1 false, double* %A1, double* %A
+  %Cmp = icmp ne <2 x i64> zeroinitializer, zeroinitializer
+  %L5 = load double* %Sl
+  store float -4.374162e+06, float* %A2
+  %E6 = extractelement <4 x i64> zeroinitializer, i32 3
+  %Shuff7 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %I, <4 x i32> <i32 2, i32 4, i32 6, i32 undef>
+  %I8 = insertelement <2 x i1> %Shuff, i1 false, i32 0
+  %B = ashr <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %PC = bitcast float* %A2 to float*
+  %Sl9 = select i1 false, i32 82299, i32 0
+  %Cmp10 = icmp slt i8 97, %5
+  br label %CF72
+
+CF72:                                             ; preds = %CF72, %CF80, %CF78, %BB
+  %L11 = load double* %Sl
+  store double 0.000000e+00, double* %Sl
+  %E12 = extractelement <2 x i1> zeroinitializer, i32 0
+  br i1 %E12, label %CF72, label %CF80
+
+CF80:                                             ; preds = %CF72
+  %Shuff13 = shufflevector <2 x i1> zeroinitializer, <2 x i1> zeroinitializer, <2 x i32> <i32 3, i32 1>
+  %I14 = insertelement <2 x i64> zeroinitializer, i64 %4, i32 1
+  %B15 = fadd double %L5, 0.000000e+00
+  %BC = bitcast i32 0 to float
+  %Sl16 = select i1 %E12, float 0xC7957ED940000000, float %BC
+  %Cmp17 = icmp eq i32 136082, 471909
+  br i1 %Cmp17, label %CF72, label %CF77
+
+CF77:                                             ; preds = %CF77, %CF80
+  %L18 = load double* %Sl
+  store double 0.000000e+00, double* %Sl
+  %E19 = extractelement <2 x i1> zeroinitializer, i32 0
+  br i1 %E19, label %CF77, label %CF78
+
+CF78:                                             ; preds = %CF77
+  %Shuff20 = shufflevector <2 x i1> zeroinitializer, <2 x i1> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %I21 = insertelement <8 x i1> zeroinitializer, i1 %Cmp10, i32 7
+  %B22 = sdiv <4 x i64> %Shuff7, zeroinitializer
+  %FC = uitofp i8 97 to double
+  %Sl23 = select i1 %Cmp10, <2 x i1> zeroinitializer, <2 x i1> zeroinitializer
+  %L24 = load double* %Sl
+  store float %Sl16, float* %PC
+  %E25 = extractelement <2 x i1> %Shuff, i32 1
+  br i1 %E25, label %CF72, label %CF76
+
+CF76:                                             ; preds = %CF78
+  %Shuff26 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %B22, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
+  %I27 = insertelement <4 x i64> zeroinitializer, i64 %E, i32 2
+  %B28 = mul <4 x i64> %I27, zeroinitializer
+  %ZE = zext <8 x i1> zeroinitializer to <8 x i64>
+  %Sl29 = select i1 %Cmp17, float -4.374162e+06, float -4.374162e+06
+  %L30 = load i8* %0
+  store double %L5, double* %Sl
+  %E31 = extractelement <8 x i1> zeroinitializer, i32 5
+  br label %CF
+
+CF:                                               ; preds = %CF, %CF81, %CF76
+  %Shuff32 = shufflevector <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <16 x i32> <i32 8, i32 undef, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 26, i32 28, i32 30, i32 undef, i32 2, i32 4, i32 6>
+  %I33 = insertelement <8 x i1> zeroinitializer, i1 false, i32 2
+  %BC34 = bitcast <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> to <4 x float>
+  %Sl35 = select i1 %E12, <2 x i1> %I8, <2 x i1> zeroinitializer
+  %Cmp36 = fcmp oge double 0xC2C3BAE2D5C18360, 0xC2C3BAE2D5C18360
+  br i1 %Cmp36, label %CF, label %CF74
+
+CF74:                                             ; preds = %CF74, %CF
+  %L37 = load float* %PC
+  store double 0.000000e+00, double* %Sl
+  %E38 = extractelement <2 x i1> %Sl23, i32 1
+  br i1 %E38, label %CF74, label %CF75
+
+CF75:                                             ; preds = %CF75, %CF82, %CF74
+  %Shuff39 = shufflevector <2 x i1> %Shuff13, <2 x i1> zeroinitializer, <2 x i32> <i32 undef, i32 2>
+  %I40 = insertelement <4 x i64> zeroinitializer, i64 %4, i32 2
+  %Sl41 = select i1 %Cmp10, i32 0, i32 %3
+  %Cmp42 = icmp ne <1 x i64> zeroinitializer, zeroinitializer
+  %L43 = load double* %Sl
+  store i64 %4, i64* %2
+  %E44 = extractelement <2 x i1> %Shuff20, i32 1
+  br i1 %E44, label %CF75, label %CF82
+
+CF82:                                             ; preds = %CF75
+  %Shuff45 = shufflevector <2 x i1> %Sl23, <2 x i1> %Sl23, <2 x i32> <i32 2, i32 0>
+  %I46 = insertelement <4 x i64> zeroinitializer, i64 0, i32 0
+  %B47 = sub i64 %E, %E6
+  %Sl48 = select i1 %Cmp10, double %L5, double %L43
+  %Cmp49 = icmp uge i64 %4, %B47
+  br i1 %Cmp49, label %CF75, label %CF81
+
+CF81:                                             ; preds = %CF82
+  %L50 = load i8* %0
+  store double %L43, double* %Sl
+  %E51 = extractelement <4 x i64> %Shuff7, i32 3
+  %Shuff52 = shufflevector <4 x float> %BC34, <4 x float> %BC34, <4 x i32> <i32 2, i32 4, i32 6, i32 0>
+  %I53 = insertelement <2 x i1> %Cmp, i1 %E25, i32 0
+  %B54 = fdiv double %L24, %L43
+  %BC55 = bitcast <4 x i64> zeroinitializer to <4 x double>
+  %Sl56 = select i1 false, i8 %5, i8 97
+  %L57 = load i8* %0
+  store i8 %L50, i8* %0
+  %E58 = extractelement <2 x i1> %Shuff20, i32 1
+  br i1 %E58, label %CF, label %CF73
+
+CF73:                                             ; preds = %CF73, %CF81
+  %Shuff59 = shufflevector <2 x i1> %Shuff13, <2 x i1> %Shuff45, <2 x i32> <i32 undef, i32 0>
+  %I60 = insertelement <4 x float> %Shuff52, float -4.374162e+06, i32 0
+  %B61 = mul <4 x i64> %I46, zeroinitializer
+  %PC62 = bitcast double* %A3 to float*
+  %Sl63 = select i1 %Cmp10, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer
+  %Cmp64 = icmp ne <2 x i1> %Cmp, %Shuff
+  %L65 = load double* %A1
+  store float -4.374162e+06, float* %PC62
+  %E66 = extractelement <8 x i1> %I21, i32 3
+  br i1 %E66, label %CF73, label %CF79
+
+CF79:                                             ; preds = %CF79, %CF73
+  %Shuff67 = shufflevector <8 x i1> %I21, <8 x i1> %I21, <8 x i32> <i32 6, i32 8, i32 10, i32 12, i32 14, i32 0, i32 undef, i32 4>
+  %I68 = insertelement <1 x i1> %Cmp42, i1 %E25, i32 0
+  %B69 = sdiv <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  %Sl70 = select i1 %Cmp49, <2 x i1> %Sl23, <2 x i1> %Shuff45
+  %Cmp71 = icmp ne i1 false, false
+  br i1 %Cmp71, label %CF79, label %CF83
+
+CF83:                                             ; preds = %CF79
+  store double 0.000000e+00, double* %Sl
+  store float %BC, float* %PC62
+  store double %Sl48, double* %Sl
+  store double %FC, double* %Sl
+  store float %BC, float* %PC62
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll b/test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll
new file mode 100644
index 0000000..24e27cb
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test is based on an llvm-stress generated test case with seed=449609655
+
+; This test originally failed for MSA with a
+; "Comparison requires equal bit widths" assertion.
+; The legalizer legalized ; the <4 x i8>'s into <4 x i32>'s, then a call to
+; isVSplat() returned the splat value for <i8 -1, i8 -1, ...> as a 32-bit APInt
+; (255), but the zeroinitializer splat value as an 8-bit APInt (0). The
+; assertion occured when trying to check the values were bitwise inverses of
+; each-other.
+;
+; It should at least successfully build.
+
+define void @autogen_SD449609655(i8) {
+BB:
+  %Cmp = icmp ult i8 -3, %0
+  br label %CF78
+
+CF78:                                             ; preds = %CF81, %CF78, %BB
+  %Sl31 = select i1 %Cmp, <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>, <4 x i8> zeroinitializer
+  br i1 undef, label %CF78, label %CF81
+
+CF81:                                             ; preds = %CF78
+  br i1 undef, label %CF78, label %CF80
+
+CF80:                                             ; preds = %CF81
+  %I59 = insertelement <4 x i8> %Sl31, i8 undef, i32 1
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s525530439.ll b/test/CodeGen/Mips/msa/llvm-stress-s525530439.ll
new file mode 100644
index 0000000..697871d
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s525530439.ll
@@ -0,0 +1,139 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; `Num < NumOperands && "Invalid child # of SDNode!"' assertion.
+; It should at least successfully build.
+
+define void @autogen_SD525530439(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca i32
+  %A3 = alloca double
+  %A2 = alloca <1 x double>
+  %A1 = alloca <8 x double>
+  %A = alloca i64
+  %L = load i8* %0
+  store i64 33695, i64* %A
+  %E = extractelement <4 x i32> zeroinitializer, i32 3
+  %Shuff = shufflevector <2 x i32> <i32 -1, i32 -1>, <2 x i32> <i32 -1, i32 -1>, <2 x i32> <i32 2, i32 0>
+  %I = insertelement <4 x i16> zeroinitializer, i16 -11642, i32 0
+  %B = lshr <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %ZE = fpext float 0x3B64A2B880000000 to double
+  %Sl = select i1 true, i16 -1, i16 -11642
+  %L5 = load i8* %0
+  store i8 0, i8* %0
+  %E6 = extractelement <4 x i32> zeroinitializer, i32 2
+  %Shuff7 = shufflevector <8 x i1> zeroinitializer, <8 x i1> zeroinitializer, <8 x i32> <i32 undef, i32 7, i32 9, i32 11, i32 13, i32 15, i32 1, i32 undef>
+  %I8 = insertelement <4 x i32> zeroinitializer, i32 %3, i32 3
+  %B9 = sub i32 71140, 439732
+  %BC = bitcast <2 x i32> <i32 -1, i32 -1> to <2 x float>
+  %Sl10 = select i1 true, i32* %1, i32* %1
+  %Cmp = icmp sge <8 x i64> zeroinitializer, zeroinitializer
+  %L11 = load i32* %Sl10
+  store <1 x double> zeroinitializer, <1 x double>* %A2
+  %E12 = extractelement <4 x i16> zeroinitializer, i32 0
+  %Shuff13 = shufflevector <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i32> undef
+  %I14 = insertelement <1 x i16> zeroinitializer, i16 %Sl, i32 0
+  %B15 = or i16 -1, %E12
+  %BC16 = bitcast <4 x i32> zeroinitializer to <4 x float>
+  %Sl17 = select i1 true, i64 %4, i64 %4
+  %Cmp18 = fcmp ugt float 0xC5ABB1BF80000000, 0x3EEF3D6300000000
+  br label %CF75
+
+CF75:                                             ; preds = %CF75, %BB
+  %L19 = load i32* %Sl10
+  store i32 %L11, i32* %Sl10
+  %E20 = extractelement <4 x i32> zeroinitializer, i32 1
+  %Shuff21 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %I8, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
+  %I22 = insertelement <4 x float> %BC16, float 0x3EEF3D6300000000, i32 2
+  %B23 = shl i32 71140, 439732
+  %ZE24 = fpext <4 x float> %I22 to <4 x double>
+  %Sl25 = select i1 %Cmp18, i32 %L11, i32 %L11
+  %Cmp26 = icmp ne i32 %E20, %L19
+  br i1 %Cmp26, label %CF75, label %CF76
+
+CF76:                                             ; preds = %CF75
+  %L27 = load i32* %Sl10
+  store i32 439732, i32* %Sl10
+  %E28 = extractelement <4 x i32> %Shuff21, i32 3
+  %Shuff29 = shufflevector <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 0>
+  %I30 = insertelement <8 x i1> %Shuff7, i1 %Cmp18, i32 4
+  %Sl31 = select i1 %Cmp18, i32 %3, i32 %B23
+  %Cmp32 = icmp ugt i32 0, %3
+  br label %CF74
+
+CF74:                                             ; preds = %CF74, %CF80, %CF78, %CF76
+  %L33 = load i64* %2
+  store i32 71140, i32* %Sl10
+  %E34 = extractelement <4 x i32> zeroinitializer, i32 1
+  %Shuff35 = shufflevector <1 x i16> zeroinitializer, <1 x i16> zeroinitializer, <1 x i32> undef
+  %I36 = insertelement <4 x i16> zeroinitializer, i16 -11642, i32 0
+  %B37 = mul <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %Shuff29
+  %Sl38 = select i1 %Cmp18, double 0.000000e+00, double 0x2BA9DB480DA732C6
+  %Cmp39 = icmp sgt i16 -11642, %Sl
+  br i1 %Cmp39, label %CF74, label %CF80
+
+CF80:                                             ; preds = %CF74
+  %L40 = load i8* %0
+  store i32 0, i32* %Sl10
+  %E41 = extractelement <8 x i64> zeroinitializer, i32 1
+  %Shuff42 = shufflevector <1 x i16> %I14, <1 x i16> %I14, <1 x i32> undef
+  %I43 = insertelement <4 x i16> %I36, i16 -11642, i32 0
+  %FC = fptoui float 0x455CA2B080000000 to i16
+  %Sl44 = select i1 %Cmp18, i1 %Cmp18, i1 %Cmp39
+  br i1 %Sl44, label %CF74, label %CF78
+
+CF78:                                             ; preds = %CF80
+  %L45 = load i32* %Sl10
+  store i8 %L5, i8* %0
+  %E46 = extractelement <8 x i1> %Shuff7, i32 2
+  br i1 %E46, label %CF74, label %CF77
+
+CF77:                                             ; preds = %CF77, %CF78
+  %Shuff47 = shufflevector <4 x i16> %I43, <4 x i16> zeroinitializer, <4 x i32> <i32 5, i32 undef, i32 1, i32 3>
+  %I48 = insertelement <1 x i16> %Shuff42, i16 %Sl, i32 0
+  %B49 = mul i8 0, %L40
+  %FC50 = uitofp i32 %3 to double
+  %Sl51 = select i1 %Sl44, i32 %L27, i32 0
+  %Cmp52 = icmp sge i8 %B49, 0
+  br i1 %Cmp52, label %CF77, label %CF79
+
+CF79:                                             ; preds = %CF77
+  %L53 = load i32* %Sl10
+  store i8 %L40, i8* %0
+  %E54 = extractelement <4 x i32> zeroinitializer, i32 1
+  %Shuff55 = shufflevector <4 x i32> %Shuff21, <4 x i32> %I8, <4 x i32> <i32 4, i32 6, i32 undef, i32 2>
+  %I56 = insertelement <4 x i32> zeroinitializer, i32 %Sl51, i32 2
+  %Tr = trunc <1 x i64> %Shuff13 to <1 x i16>
+  %Sl57 = select i1 %Cmp18, <2 x i32> <i32 -1, i32 -1>, <2 x i32> <i32 -1, i32 -1>
+  %Cmp58 = icmp uge <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %I56
+  %L59 = load i8* %0
+  store <1 x double> zeroinitializer, <1 x double>* %A2
+  %E60 = extractelement <4 x i32> zeroinitializer, i32 0
+  %Shuff61 = shufflevector <4 x i32> %I8, <4 x i32> %I8, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
+  %I62 = insertelement <4 x i16> zeroinitializer, i16 %E12, i32 1
+  %B63 = and <4 x i32> %Shuff61, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %PC = bitcast double* %A3 to i32*
+  %Sl64 = select i1 %Cmp18, <4 x i32> %Shuff61, <4 x i32> %Shuff55
+  %Cmp65 = icmp sgt i32 439732, %3
+  br label %CF
+
+CF:                                               ; preds = %CF79
+  %L66 = load i32* %Sl10
+  store i32 %E6, i32* %PC
+  %E67 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 2
+  %Shuff68 = shufflevector <4 x i32> %Sl64, <4 x i32> %I8, <4 x i32> <i32 5, i32 undef, i32 1, i32 undef>
+  %I69 = insertelement <4 x i16> %Shuff47, i16 %Sl, i32 3
+  %B70 = sdiv <4 x i64> zeroinitializer, zeroinitializer
+  %FC71 = sitofp i32 %L66 to double
+  %Sl72 = select i1 %Cmp18, i64 %4, i64 %4
+  %Cmp73 = icmp eq <4 x i64> zeroinitializer, %B70
+  store i32 %B23, i32* %PC
+  store i32 %3, i32* %PC
+  store i32 %3, i32* %Sl10
+  store i32 %L27, i32* %1
+  store i32 0, i32* %PC
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll b/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll
new file mode 100644
index 0000000..dc4200a
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll
@@ -0,0 +1,143 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed to select instructions for extract_vector_elt for
+; v2f64 on MSA.
+; It should at least successfully build.
+
+define void @autogen_SD997348632(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca <2 x i32>
+  %A3 = alloca <16 x i16>
+  %A2 = alloca <4 x i1>
+  %A1 = alloca <4 x i16>
+  %A = alloca <2 x i32>
+  %L = load i8* %0
+  store i8 %L, i8* %0
+  %E = extractelement <4 x i32> zeroinitializer, i32 0
+  %Shuff = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 1, i32 3, i32 5>
+  %I = insertelement <2 x i1> zeroinitializer, i1 false, i32 1
+  %FC = sitofp <4 x i32> zeroinitializer to <4 x double>
+  %Sl = select i1 false, <4 x i64> %Shuff, <4 x i64> %Shuff
+  %L5 = load i8* %0
+  store i8 %5, i8* %0
+  %E6 = extractelement <1 x i16> zeroinitializer, i32 0
+  %Shuff7 = shufflevector <2 x i1> %I, <2 x i1> %I, <2 x i32> <i32 1, i32 undef>
+  %I8 = insertelement <1 x i16> zeroinitializer, i16 0, i32 0
+  %B = xor i32 376034, %3
+  %FC9 = fptoui float 0x406DB70180000000 to i64
+  %Sl10 = select i1 false, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %Cmp = icmp ult <4 x i64> zeroinitializer, zeroinitializer
+  %L11 = load i8* %0
+  store i8 %L, i8* %0
+  %E12 = extractelement <4 x i64> zeroinitializer, i32 2
+  %Shuff13 = shufflevector <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> <i32 5, i32 7, i32 undef, i32 3>
+  %I14 = insertelement <8 x i32> zeroinitializer, i32 -1, i32 7
+  %B15 = fdiv <4 x double> %FC, %FC
+  %Tr = trunc i32 376034 to i16
+  %Sl16 = select i1 false, <8 x i32> %Sl10, <8 x i32> zeroinitializer
+  %Cmp17 = icmp uge i32 233658, %E
+  br label %CF
+
+CF:                                               ; preds = %CF, %CF79, %CF84, %BB
+  %L18 = load i8* %0
+  store i8 %L, i8* %0
+  %E19 = extractelement <4 x i64> %Sl, i32 3
+  %Shuff20 = shufflevector <2 x i1> %Shuff7, <2 x i1> %I, <2 x i32> <i32 2, i32 0>
+  %I21 = insertelement <4 x i64> zeroinitializer, i64 %FC9, i32 0
+  %B22 = xor <8 x i32> %I14, %I14
+  %Tr23 = trunc i16 0 to i8
+  %Sl24 = select i1 false, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
+  %Cmp25 = icmp eq i1 false, false
+  br i1 %Cmp25, label %CF, label %CF79
+
+CF79:                                             ; preds = %CF
+  %L26 = load i8* %0
+  store i8 %L26, i8* %0
+  %E27 = extractelement <1 x i16> zeroinitializer, i32 0
+  %Shuff28 = shufflevector <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+  %I29 = insertelement <16 x i32> %Shuff28, i32 %B, i32 15
+  %B30 = fdiv float 0.000000e+00, -6.749110e+06
+  %Sl31 = select i1 false, i32 %3, i32 %3
+  %Cmp32 = fcmp uno float 0.000000e+00, 0x406DB70180000000
+  br i1 %Cmp32, label %CF, label %CF78
+
+CF78:                                             ; preds = %CF78, %CF79
+  %L33 = load i8* %0
+  store i8 %L, i8* %0
+  %E34 = extractelement <16 x i32> %Shuff28, i32 1
+  %Shuff35 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %I21, <4 x i32> <i32 undef, i32 6, i32 0, i32 2>
+  %I36 = insertelement <4 x double> %FC, double 0xA4A57F449CA36CC2, i32 2
+  %Se = sext <4 x i1> %Cmp to <4 x i32>
+  %Sl37 = select i1 %Cmp17, i32 0, i32 0
+  %Cmp38 = icmp ne i32 440284, 376034
+  br i1 %Cmp38, label %CF78, label %CF80
+
+CF80:                                             ; preds = %CF80, %CF82, %CF78
+  %L39 = load i8* %0
+  store i8 %L, i8* %0
+  %E40 = extractelement <2 x i1> %Shuff20, i32 1
+  br i1 %E40, label %CF80, label %CF82
+
+CF82:                                             ; preds = %CF80
+  %Shuff41 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Shuff20, <2 x i32> <i32 2, i32 0>
+  %I42 = insertelement <2 x i1> %Shuff41, i1 false, i32 0
+  %B43 = sub i32 %E, 0
+  %Sl44 = select i1 %Cmp32, <16 x i32> %Shuff28, <16 x i32> %Shuff28
+  %Cmp45 = icmp sgt <4 x i64> zeroinitializer, %I21
+  %L46 = load i8* %0
+  store i8 %L11, i8* %0
+  %E47 = extractelement <8 x i32> %Sl16, i32 4
+  %Shuff48 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Shuff7, <2 x i32> <i32 undef, i32 1>
+  %I49 = insertelement <2 x i1> %Shuff48, i1 %Cmp17, i32 1
+  %B50 = and <8 x i32> %I14, %Sl10
+  %FC51 = fptoui float -6.749110e+06 to i1
+  br i1 %FC51, label %CF80, label %CF81
+
+CF81:                                             ; preds = %CF81, %CF82
+  %Sl52 = select i1 false, float -6.749110e+06, float 0x406DB70180000000
+  %Cmp53 = icmp uge <2 x i32> <i32 -1, i32 -1>, <i32 -1, i32 -1>
+  %L54 = load i8* %0
+  store i8 %L5, i8* %0
+  %E55 = extractelement <8 x i32> zeroinitializer, i32 7
+  %Shuff56 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 4, i32 6, i32 0>
+  %I57 = insertelement <2 x i1> %Shuff7, i1 false, i32 0
+  %B58 = fmul <4 x double> %FC, %FC
+  %FC59 = fptoui <4 x double> %I36 to <4 x i16>
+  %Sl60 = select i1 %Cmp17, <2 x i1> %I, <2 x i1> %I57
+  %Cmp61 = icmp ule <8 x i32> %B50, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %L62 = load i8* %0
+  store i8 %L33, i8* %0
+  %E63 = extractelement <4 x i64> %Shuff, i32 2
+  %Shuff64 = shufflevector <4 x i64> %Shuff56, <4 x i64> %Shuff56, <4 x i32> <i32 5, i32 7, i32 1, i32 undef>
+  %I65 = insertelement <2 x i1> zeroinitializer, i1 false, i32 1
+  %B66 = sdiv i32 %B, %E55
+  %Tr67 = trunc i8 %L54 to i1
+  br i1 %Tr67, label %CF81, label %CF83
+
+CF83:                                             ; preds = %CF83, %CF81
+  %Sl68 = select i1 %Cmp17, i1 %Cmp25, i1 %Tr67
+  br i1 %Sl68, label %CF83, label %CF84
+
+CF84:                                             ; preds = %CF83
+  %Cmp69 = icmp uge i32 %E, %E34
+  br i1 %Cmp69, label %CF, label %CF77
+
+CF77:                                             ; preds = %CF84
+  %L70 = load i8* %0
+  store i8 %L, i8* %0
+  %E71 = extractelement <4 x i64> %Shuff, i32 0
+  %Shuff72 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %I, <2 x i32> <i32 3, i32 1>
+  %I73 = insertelement <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i32 %B66, i32 1
+  %FC74 = uitofp i1 %Cmp32 to double
+  %Sl75 = select i1 %FC51, i16 9704, i16 0
+  %Cmp76 = icmp ugt <1 x i16> %I8, %I8
+  store i8 %L39, i8* %0
+  store i8 %5, i8* %0
+  store i8 %Tr23, i8* %0
+  store i8 %L, i8* %0
+  store i8 %5, i8* %0
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-sz1-s742806235.ll b/test/CodeGen/Mips/msa/llvm-stress-sz1-s742806235.ll
new file mode 100644
index 0000000..8c4fcba
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-sz1-s742806235.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed to select code for a truncstore of a
+; build_vector.
+; It should at least successfully build.
+
+define void @autogen_SD742806235(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+  %A4 = alloca double
+  %A3 = alloca double
+  %A2 = alloca <8 x i8>
+  %A1 = alloca <4 x float>
+  %A = alloca i1
+  store i8 %5, i8* %0
+  store i8 %5, i8* %0
+  store i8 %5, i8* %0
+  store <8 x i8> <i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 -1>, <8 x i8>* %A2
+  store i8 %5, i8* %0
+  ret void
+}
diff --git a/test/CodeGen/Mips/msa/shift-dagcombine.ll b/test/CodeGen/Mips/msa/shift-dagcombine.ll
new file mode 100644
index 0000000..0d809fb
--- /dev/null
+++ b/test/CodeGen/Mips/msa/shift-dagcombine.ll
@@ -0,0 +1,70 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @ashr_v4i32(<4 x i32>* %c) nounwind {
+  ; CHECK-LABEL: ashr_v4i32:
+
+  %1 = ashr <4 x i32> <i32 1, i32 2, i32 4, i32 8>,
+                      <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK-NOT: sra
+  ; CHECK-DAG: ldi.w [[R1:\$w[0-9]+]], 1
+  ; CHECK-NOT: sra
+  store volatile <4 x i32> %1, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+
+  %2 = ashr <4 x i32> <i32 -2, i32 -4, i32 -8, i32 -16>,
+                      <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK-NOT: sra
+  ; CHECK-DAG: ldi.w [[R1:\$w[0-9]+]], -2
+  ; CHECK-NOT: sra
+  store volatile <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+
+  ret void
+  ; CHECK-LABEL: .size ashr_v4i32
+}
+
+define void @lshr_v4i32(<4 x i32>* %c) nounwind {
+  ; CHECK-LABEL: lshr_v4i32:
+
+  %1 = lshr <4 x i32> <i32 1, i32 2, i32 4, i32 8>,
+                      <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK-NOT: srl
+  ; CHECK-DAG: ldi.w [[R1:\$w[0-9]+]], 1
+  ; CHECK-NOT: srl
+  store volatile <4 x i32> %1, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+
+  %2 = lshr <4 x i32> <i32 -2, i32 -4, i32 -8, i32 -16>,
+                      <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK-NOT: srl
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], %lo
+  ; CHECK-NOT: srl
+  store volatile <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+
+  ret void
+  ; CHECK-LABEL: .size lshr_v4i32
+}
+
+define void @shl_v4i32(<4 x i32>* %c) nounwind {
+  ; CHECK-LABEL: shl_v4i32:
+
+  %1 = shl <4 x i32> <i32 8, i32 4, i32 2, i32 1>,
+                     <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK-NOT: sll
+  ; CHECK-DAG: ldi.w [[R1:\$w[0-9]+]], 8
+  ; CHECK-NOT: sll
+  store volatile <4 x i32> %1, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+
+  %2 = shl <4 x i32> <i32 -8, i32 -4, i32 -2, i32 -1>,
+                     <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK-NOT: sll
+  ; CHECK-DAG: ldi.w [[R1:\$w[0-9]+]], -8
+  ; CHECK-NOT: sll
+  store volatile <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R1]], 0($4)
+
+  ret void
+  ; CHECK-LABEL: .size shl_v4i32
+}
diff --git a/test/CodeGen/Mips/msa/shuffle.ll b/test/CodeGen/Mips/msa/shuffle.ll
new file mode 100644
index 0000000..316c669
--- /dev/null
+++ b/test/CodeGen/Mips/msa/shuffle.ll
@@ -0,0 +1,803 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @vshf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: vshf_v16i8_0:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo
+  ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R1]]
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v16i8_0
+}
+
+define void @vshf_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: vshf_v16i8_1:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][1]
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v16i8_1
+}
+
+define void @vshf_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: vshf_v16i8_2:
+
+  %1 = load <16 x i8>* %a
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 16>
+  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo
+  ; CHECK-DAG: vshf.b [[R3]], [[R2]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v16i8_2
+}
+
+define void @vshf_v16i8_3(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: vshf_v16i8_3:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 17, i32 24, i32 25, i32 18, i32 19, i32 20, i32 28, i32 19, i32 1, i32 8, i32 9, i32 2, i32 3, i32 4, i32 12, i32 3>
+  ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo
+  ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v16i8_3
+}
+
+define void @vshf_v16i8_4(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: vshf_v16i8_4:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <16 x i8> %1, <16 x i8> %1, <16 x i32> <i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17>
+  ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][1]
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v16i8_4
+}
+
+define void @vshf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: vshf_v8i16_0:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo
+  ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R1]]
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v8i16_0
+}
+
+define void @vshf_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: vshf_v8i16_1:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][1]
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v8i16_1
+}
+
+define void @vshf_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: vshf_v8i16_2:
+
+  %1 = load <8 x i16>* %a
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8>
+  ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo
+  ; CHECK-DAG: vshf.h [[R3]], [[R2]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v8i16_2
+}
+
+define void @vshf_v8i16_3(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: vshf_v8i16_3:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 8, i32 9, i32 2, i32 3, i32 4, i32 12, i32 3>
+  ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo
+  ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v8i16_3
+}
+
+define void @vshf_v8i16_4(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: vshf_v8i16_4:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <8 x i16> %1, <8 x i16> %1, <8 x i32> <i32 1, i32 9, i32 1, i32 9, i32 1, i32 9, i32 1, i32 9>
+  ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][1]
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v8i16_4
+}
+
+; Note: v4i32 only has one 4-element set so it's impossible to get a vshf.w
+; instruction when using a single vector.
+
+define void @vshf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: vshf_v4i32_0:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 27
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v4i32_0
+}
+
+define void @vshf_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: vshf_v4i32_1:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v4i32_1
+}
+
+define void @vshf_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: vshf_v4i32_2:
+
+  %1 = load <4 x i32>* %a
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 4, i32 5, i32 6, i32 4>
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R2]], 36
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v4i32_2
+}
+
+define void @vshf_v4i32_3(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: vshf_v4i32_3:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 5, i32 6, i32 4>
+  ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], %lo
+  ; CHECK-DAG: vshf.w [[R3]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v4i32_3
+}
+
+define void @vshf_v4i32_4(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: vshf_v4i32_4:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <4 x i32> %1, <4 x i32> %1, <4 x i32> <i32 1, i32 5, i32 5, i32 1>
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v4i32_4
+}
+
+define void @vshf_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: vshf_v2i64_0:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo
+  ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R1]]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v2i64_0
+}
+
+define void @vshf_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: vshf_v2i64_1:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v2i64_1
+}
+
+define void @vshf_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: vshf_v2i64_2:
+
+  %1 = load <2 x i64>* %a
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 3, i32 2>
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo
+  ; CHECK-DAG: vshf.d [[R3]], [[R2]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v2i64_2
+}
+
+define void @vshf_v2i64_3(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: vshf_v2i64_3:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 2>
+  ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo
+  ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v2i64_3
+}
+
+define void @vshf_v2i64_4(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: vshf_v2i64_4:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <2 x i64> %1, <2 x i64> %1, <2 x i32> <i32 1, i32 3>
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size vshf_v2i64_4
+}
+
+define void @shf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: shf_v16i8_0:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 2, i32 0, i32 5, i32 7, i32 6, i32 4, i32 9, i32 11, i32 10, i32 8, i32 13, i32 15, i32 14, i32 12>
+  ; CHECK-DAG: shf.b [[R3:\$w[0-9]+]], [[R1]], 45
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size shf_v16i8_0
+}
+
+define void @shf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: shf_v8i16_0:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ; CHECK-DAG: shf.h [[R3:\$w[0-9]+]], [[R1]], 27
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size shf_v8i16_0
+}
+
+define void @shf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: shf_v4i32_0:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 27
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size shf_v4i32_0
+}
+
+; shf.d does not exist
+
+define void @ilvev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: ilvev_v16i8_0:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvev_v16i8_0
+}
+
+define void @ilvev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: ilvev_v8i16_0:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvev_v8i16_0
+}
+
+define void @ilvev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: ilvev_v4i32_0:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvev_v4i32_0
+}
+
+define void @ilvev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: ilvev_v2i64_0:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvev_v2i64_0
+}
+
+define void @ilvod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: ilvod_v16i8_0:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+  ; CHECK-DAG: ilvod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvod_v16i8_0
+}
+
+define void @ilvod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: ilvod_v8i16_0:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ; CHECK-DAG: ilvod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvod_v8i16_0
+}
+
+define void @ilvod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: ilvod_v4i32_0:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ; CHECK-DAG: ilvod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvod_v4i32_0
+}
+
+define void @ilvod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: ilvod_v2i64_0:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
+  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvod_v2i64_0
+}
+
+define void @ilvl_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: ilvl_v16i8_0:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvl_v16i8_0
+}
+
+define void @ilvl_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: ilvl_v8i16_0:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvl_v8i16_0
+}
+
+define void @ilvl_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: ilvl_v4i32_0:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvl_v4i32_0
+}
+
+define void @ilvl_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: ilvl_v2i64_0:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+  ; ilvl.d and ilvev.d are equivalent for v2i64
+  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvl_v2i64_0
+}
+
+define void @ilvr_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: ilvr_v16i8_0:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvr_v16i8_0
+}
+
+define void @ilvr_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: ilvr_v8i16_0:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvr_v8i16_0
+}
+
+define void @ilvr_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: ilvr_v4i32_0:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvr_v4i32_0
+}
+
+define void @ilvr_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: ilvr_v2i64_0:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
+  ; ilvr.d and ilvod.d are equivalent for v2i64
+  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size ilvr_v2i64_0
+}
+
+define void @pckev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: pckev_v16i8_0:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size pckev_v16i8_0
+}
+
+define void @pckev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: pckev_v8i16_0:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size pckev_v8i16_0
+}
+
+define void @pckev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: pckev_v4i32_0:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size pckev_v4i32_0
+}
+
+define void @pckev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: pckev_v2i64_0:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+  ; pckev.d and ilvev.d are equivalent for v2i64
+  ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size pckev_v2i64_0
+}
+
+define void @pckod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK: pckod_v16i8_0:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size pckod_v16i8_0
+}
+
+define void @pckod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK: pckod_v8i16_0:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size pckod_v8i16_0
+}
+
+define void @pckod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK: pckod_v4i32_0:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size pckod_v4i32_0
+}
+
+define void @pckod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK: pckod_v2i64_0:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
+  ; pckod.d and ilvod.d are equivalent for v2i64
+  ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size pckod_v2i64_0
+}
+
+define void @splati_v16i8_0(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+  ; CHECK: splati_v16i8_0:
+
+  %1 = load <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <16 x i8> %1, <16 x i8> undef,
+                     <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+  ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][4]
+  store <16 x i8> %2, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size splati_v16i8_0
+}
+
+define void @splati_v8i16_0(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+  ; CHECK: splati_v8i16_0:
+
+  %1 = load <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+  ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][4]
+  store <8 x i16> %2, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size splati_v8i16_0
+}
+
+define void @splati_v4i32_0(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+  ; CHECK: splati_v4i32_0:
+
+  %1 = load <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ; shf.w and splati.w are equivalent
+  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 255
+  store <4 x i32> %2, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size splati_v4i32_0
+}
+
+define void @splati_v2i64_0(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+  ; CHECK: splati_v2i64_0:
+
+  %1 = load <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
+  store <2 x i64> %2, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+  ; CHECK: .size splati_v2i64_0
+}
diff --git a/test/CodeGen/Mips/msa/special.ll b/test/CodeGen/Mips/msa/special.ll
new file mode 100644
index 0000000..60a4369
--- /dev/null
+++ b/test/CodeGen/Mips/msa/special.ll
@@ -0,0 +1,26 @@
+; Test the MSA intrinsics that are encoded with the SPECIAL instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define i32 @llvm_mips_lsa_test(i32 %a, i32 %b) nounwind {
+entry:
+  %0 = tail call i32 @llvm.mips.lsa(i32 %a, i32 %b, i32 2)
+  ret i32 %0
+}
+
+declare i32 @llvm.mips.lsa(i32, i32, i32) nounwind
+
+; CHECK: llvm_mips_lsa_test:
+; CHECK: lsa {{\$[0-9]+}}, {{\$[0-9]+}}, {{\$[0-9]+}}, 2
+; CHECK: .size llvm_mips_lsa_test
+
+define i32 @lsa_test(i32 %a, i32 %b) nounwind {
+entry:
+  %0 = shl i32 %b, 2
+  %1 = add i32 %a, %0
+  ret i32 %1
+}
+
+; CHECK: lsa_test:
+; CHECK: lsa {{\$[0-9]+}}, {{\$[0-9]+}}, {{\$[0-9]+}}, 2
+; CHECK: .size lsa_test
diff --git a/test/CodeGen/Mips/msa/spill.ll b/test/CodeGen/Mips/msa/spill.ll
new file mode 100644
index 0000000..66f896a
--- /dev/null
+++ b/test/CodeGen/Mips/msa/spill.ll
@@ -0,0 +1,601 @@
+; Test that the correct instruction is chosen for spill and reload by trying
+; to have 33 live MSA registers simultaneously
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define i32 @test_i8(<16 x i8>* %p0, <16 x i8>* %q1) nounwind {
+entry:
+  %p1  = getelementptr <16 x i8>* %p0, i32 1
+  %p2  = getelementptr <16 x i8>* %p0, i32 2
+  %p3  = getelementptr <16 x i8>* %p0, i32 3
+  %p4  = getelementptr <16 x i8>* %p0, i32 4
+  %p5  = getelementptr <16 x i8>* %p0, i32 5
+  %p6  = getelementptr <16 x i8>* %p0, i32 6
+  %p7  = getelementptr <16 x i8>* %p0, i32 7
+  %p8  = getelementptr <16 x i8>* %p0, i32 8
+  %p9  = getelementptr <16 x i8>* %p0, i32 9
+  %p10 = getelementptr <16 x i8>* %p0, i32 10
+  %p11 = getelementptr <16 x i8>* %p0, i32 11
+  %p12 = getelementptr <16 x i8>* %p0, i32 12
+  %p13 = getelementptr <16 x i8>* %p0, i32 13
+  %p14 = getelementptr <16 x i8>* %p0, i32 14
+  %p15 = getelementptr <16 x i8>* %p0, i32 15
+  %p16 = getelementptr <16 x i8>* %p0, i32 16
+  %p17 = getelementptr <16 x i8>* %p0, i32 17
+  %p18 = getelementptr <16 x i8>* %p0, i32 18
+  %p19 = getelementptr <16 x i8>* %p0, i32 19
+  %p20 = getelementptr <16 x i8>* %p0, i32 20
+  %p21 = getelementptr <16 x i8>* %p0, i32 21
+  %p22 = getelementptr <16 x i8>* %p0, i32 22
+  %p23 = getelementptr <16 x i8>* %p0, i32 23
+  %p24 = getelementptr <16 x i8>* %p0, i32 24
+  %p25 = getelementptr <16 x i8>* %p0, i32 25
+  %p26 = getelementptr <16 x i8>* %p0, i32 26
+  %p27 = getelementptr <16 x i8>* %p0, i32 27
+  %p28 = getelementptr <16 x i8>* %p0, i32 28
+  %p29 = getelementptr <16 x i8>* %p0, i32 29
+  %p30 = getelementptr <16 x i8>* %p0, i32 30
+  %p31 = getelementptr <16 x i8>* %p0, i32 31
+  %p32 = getelementptr <16 x i8>* %p0, i32 32
+  %p33 = getelementptr <16 x i8>* %p0, i32 33
+  %0  = load <16 x i8>* %p0, align 16
+  %1  = load <16 x i8>* %p1, align 16
+  %2  = load <16 x i8>* %p2, align 16
+  %3  = load <16 x i8>* %p3, align 16
+  %4  = load <16 x i8>* %p4, align 16
+  %5  = load <16 x i8>* %p5, align 16
+  %6  = load <16 x i8>* %p6, align 16
+  %7  = load <16 x i8>* %p7, align 16
+  %8  = load <16 x i8>* %p8, align 16
+  %9  = load <16 x i8>* %p9, align 16
+  %10 = load <16 x i8>* %p10, align 16
+  %11 = load <16 x i8>* %p11, align 16
+  %12 = load <16 x i8>* %p12, align 16
+  %13 = load <16 x i8>* %p13, align 16
+  %14 = load <16 x i8>* %p14, align 16
+  %15 = load <16 x i8>* %p15, align 16
+  %16 = load <16 x i8>* %p16, align 16
+  %17 = load <16 x i8>* %p17, align 16
+  %18 = load <16 x i8>* %p18, align 16
+  %19 = load <16 x i8>* %p19, align 16
+  %20 = load <16 x i8>* %p20, align 16
+  %21 = load <16 x i8>* %p21, align 16
+  %22 = load <16 x i8>* %p22, align 16
+  %23 = load <16 x i8>* %p23, align 16
+  %24 = load <16 x i8>* %p24, align 16
+  %25 = load <16 x i8>* %p25, align 16
+  %26 = load <16 x i8>* %p26, align 16
+  %27 = load <16 x i8>* %p27, align 16
+  %28 = load <16 x i8>* %p28, align 16
+  %29 = load <16 x i8>* %p29, align 16
+  %30 = load <16 x i8>* %p30, align 16
+  %31 = load <16 x i8>* %p31, align 16
+  %32 = load <16 x i8>* %p32, align 16
+  %33 = load <16 x i8>* %p33, align 16
+  %r1  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0,   <16 x i8> %1)
+  %r2  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r1,  <16 x i8> %2)
+  %r3  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r2,  <16 x i8> %3)
+  %r4  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r3,  <16 x i8> %4)
+  %r5  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r4,  <16 x i8> %5)
+  %r6  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r5,  <16 x i8> %6)
+  %r7  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r6,  <16 x i8> %7)
+  %r8  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r7,  <16 x i8> %8)
+  %r9  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r8,  <16 x i8> %9)
+  %r10 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r9,  <16 x i8> %10)
+  %r11 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r10, <16 x i8> %11)
+  %r12 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r11, <16 x i8> %12)
+  %r13 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r12, <16 x i8> %13)
+  %r14 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r13, <16 x i8> %14)
+  %r15 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r14, <16 x i8> %15)
+  %r16 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r15, <16 x i8> %16)
+  %r17 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r16, <16 x i8> %17)
+  %r18 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r17, <16 x i8> %18)
+  %r19 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r18, <16 x i8> %19)
+  %r20 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r19, <16 x i8> %20)
+  %r21 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r20, <16 x i8> %21)
+  %r22 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r21, <16 x i8> %22)
+  %r23 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r22, <16 x i8> %23)
+  %r24 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r23, <16 x i8> %24)
+  %r25 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r24, <16 x i8> %25)
+  %r26 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r25, <16 x i8> %26)
+  %r27 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r26, <16 x i8> %27)
+  %r28 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r27, <16 x i8> %28)
+  %r29 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r28, <16 x i8> %29)
+  %r30 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r29, <16 x i8> %30)
+  %r31 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r30, <16 x i8> %31)
+  %r32 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r31, <16 x i8> %32)
+  %r33 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r32, <16 x i8> %33)
+  %rx1  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r33,   <16 x i8> %1)
+  %rx2  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx1,  <16 x i8> %2)
+  %rx3  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx2,  <16 x i8> %3)
+  %rx4  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx3,  <16 x i8> %4)
+  %rx5  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx4,  <16 x i8> %5)
+  %rx6  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx5,  <16 x i8> %6)
+  %rx7  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx6,  <16 x i8> %7)
+  %rx8  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx7,  <16 x i8> %8)
+  %rx9  = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx8,  <16 x i8> %9)
+  %rx10 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx9,  <16 x i8> %10)
+  %rx11 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx10, <16 x i8> %11)
+  %rx12 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx11, <16 x i8> %12)
+  %rx13 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx12, <16 x i8> %13)
+  %rx14 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx13, <16 x i8> %14)
+  %rx15 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx14, <16 x i8> %15)
+  %rx16 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx15, <16 x i8> %16)
+  %rx17 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx16, <16 x i8> %17)
+  %rx18 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx17, <16 x i8> %18)
+  %rx19 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx18, <16 x i8> %19)
+  %rx20 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx19, <16 x i8> %20)
+  %rx21 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx20, <16 x i8> %21)
+  %rx22 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx21, <16 x i8> %22)
+  %rx23 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx22, <16 x i8> %23)
+  %rx24 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx23, <16 x i8> %24)
+  %rx25 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx24, <16 x i8> %25)
+  %rx26 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx25, <16 x i8> %26)
+  %rx27 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx26, <16 x i8> %27)
+  %rx28 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx27, <16 x i8> %28)
+  %rx29 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx28, <16 x i8> %29)
+  %rx30 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx29, <16 x i8> %30)
+  %rx31 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx30, <16 x i8> %31)
+  %rx32 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx31, <16 x i8> %32)
+  %rx33 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx32, <16 x i8> %33)
+  %res = call i32 @llvm.mips.copy.s.b(<16 x i8> %rx33, i32 0)
+  ret i32 %res
+}
+
+declare <16 x i8> @llvm.mips.addv.b(<16 x i8>, <16 x i8>) nounwind
+declare i32       @llvm.mips.copy.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: test_i8:
+; CHECK: st.b {{.*}} Spill
+; CHECK: st.b {{.*}} Spill
+; CHECK: ld.b {{.*}} Reload
+; CHECK: ld.b {{.*}} Reload
+; CHECK: .size
+
+define i32 @test_i16(<8 x i16>* %p0, <8 x i16>* %q1) nounwind {
+entry:
+  %p1  = getelementptr <8 x i16>* %p0, i32 1
+  %p2  = getelementptr <8 x i16>* %p0, i32 2
+  %p3  = getelementptr <8 x i16>* %p0, i32 3
+  %p4  = getelementptr <8 x i16>* %p0, i32 4
+  %p5  = getelementptr <8 x i16>* %p0, i32 5
+  %p6  = getelementptr <8 x i16>* %p0, i32 6
+  %p7  = getelementptr <8 x i16>* %p0, i32 7
+  %p8  = getelementptr <8 x i16>* %p0, i32 8
+  %p9  = getelementptr <8 x i16>* %p0, i32 9
+  %p10 = getelementptr <8 x i16>* %p0, i32 10
+  %p11 = getelementptr <8 x i16>* %p0, i32 11
+  %p12 = getelementptr <8 x i16>* %p0, i32 12
+  %p13 = getelementptr <8 x i16>* %p0, i32 13
+  %p14 = getelementptr <8 x i16>* %p0, i32 14
+  %p15 = getelementptr <8 x i16>* %p0, i32 15
+  %p16 = getelementptr <8 x i16>* %p0, i32 16
+  %p17 = getelementptr <8 x i16>* %p0, i32 17
+  %p18 = getelementptr <8 x i16>* %p0, i32 18
+  %p19 = getelementptr <8 x i16>* %p0, i32 19
+  %p20 = getelementptr <8 x i16>* %p0, i32 20
+  %p21 = getelementptr <8 x i16>* %p0, i32 21
+  %p22 = getelementptr <8 x i16>* %p0, i32 22
+  %p23 = getelementptr <8 x i16>* %p0, i32 23
+  %p24 = getelementptr <8 x i16>* %p0, i32 24
+  %p25 = getelementptr <8 x i16>* %p0, i32 25
+  %p26 = getelementptr <8 x i16>* %p0, i32 26
+  %p27 = getelementptr <8 x i16>* %p0, i32 27
+  %p28 = getelementptr <8 x i16>* %p0, i32 28
+  %p29 = getelementptr <8 x i16>* %p0, i32 29
+  %p30 = getelementptr <8 x i16>* %p0, i32 30
+  %p31 = getelementptr <8 x i16>* %p0, i32 31
+  %p32 = getelementptr <8 x i16>* %p0, i32 32
+  %p33 = getelementptr <8 x i16>* %p0, i32 33
+  %0  = load <8 x i16>* %p0, align 16
+  %1  = load <8 x i16>* %p1, align 16
+  %2  = load <8 x i16>* %p2, align 16
+  %3  = load <8 x i16>* %p3, align 16
+  %4  = load <8 x i16>* %p4, align 16
+  %5  = load <8 x i16>* %p5, align 16
+  %6  = load <8 x i16>* %p6, align 16
+  %7  = load <8 x i16>* %p7, align 16
+  %8  = load <8 x i16>* %p8, align 16
+  %9  = load <8 x i16>* %p9, align 16
+  %10 = load <8 x i16>* %p10, align 16
+  %11 = load <8 x i16>* %p11, align 16
+  %12 = load <8 x i16>* %p12, align 16
+  %13 = load <8 x i16>* %p13, align 16
+  %14 = load <8 x i16>* %p14, align 16
+  %15 = load <8 x i16>* %p15, align 16
+  %16 = load <8 x i16>* %p16, align 16
+  %17 = load <8 x i16>* %p17, align 16
+  %18 = load <8 x i16>* %p18, align 16
+  %19 = load <8 x i16>* %p19, align 16
+  %20 = load <8 x i16>* %p20, align 16
+  %21 = load <8 x i16>* %p21, align 16
+  %22 = load <8 x i16>* %p22, align 16
+  %23 = load <8 x i16>* %p23, align 16
+  %24 = load <8 x i16>* %p24, align 16
+  %25 = load <8 x i16>* %p25, align 16
+  %26 = load <8 x i16>* %p26, align 16
+  %27 = load <8 x i16>* %p27, align 16
+  %28 = load <8 x i16>* %p28, align 16
+  %29 = load <8 x i16>* %p29, align 16
+  %30 = load <8 x i16>* %p30, align 16
+  %31 = load <8 x i16>* %p31, align 16
+  %32 = load <8 x i16>* %p32, align 16
+  %33 = load <8 x i16>* %p33, align 16
+  %r1  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0,   <8 x i16> %1)
+  %r2  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r1,  <8 x i16> %2)
+  %r3  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r2,  <8 x i16> %3)
+  %r4  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r3,  <8 x i16> %4)
+  %r5  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r4,  <8 x i16> %5)
+  %r6  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r5,  <8 x i16> %6)
+  %r7  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r6,  <8 x i16> %7)
+  %r8  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r7,  <8 x i16> %8)
+  %r9  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r8,  <8 x i16> %9)
+  %r10 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r9,  <8 x i16> %10)
+  %r11 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r10, <8 x i16> %11)
+  %r12 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r11, <8 x i16> %12)
+  %r13 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r12, <8 x i16> %13)
+  %r14 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r13, <8 x i16> %14)
+  %r15 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r14, <8 x i16> %15)
+  %r16 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r15, <8 x i16> %16)
+  %r17 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r16, <8 x i16> %17)
+  %r18 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r17, <8 x i16> %18)
+  %r19 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r18, <8 x i16> %19)
+  %r20 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r19, <8 x i16> %20)
+  %r21 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r20, <8 x i16> %21)
+  %r22 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r21, <8 x i16> %22)
+  %r23 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r22, <8 x i16> %23)
+  %r24 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r23, <8 x i16> %24)
+  %r25 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r24, <8 x i16> %25)
+  %r26 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r25, <8 x i16> %26)
+  %r27 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r26, <8 x i16> %27)
+  %r28 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r27, <8 x i16> %28)
+  %r29 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r28, <8 x i16> %29)
+  %r30 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r29, <8 x i16> %30)
+  %r31 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r30, <8 x i16> %31)
+  %r32 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r31, <8 x i16> %32)
+  %r33 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r32, <8 x i16> %33)
+  %rx1  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r33,   <8 x i16> %1)
+  %rx2  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx1,  <8 x i16> %2)
+  %rx3  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx2,  <8 x i16> %3)
+  %rx4  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx3,  <8 x i16> %4)
+  %rx5  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx4,  <8 x i16> %5)
+  %rx6  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx5,  <8 x i16> %6)
+  %rx7  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx6,  <8 x i16> %7)
+  %rx8  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx7,  <8 x i16> %8)
+  %rx9  = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx8,  <8 x i16> %9)
+  %rx10 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx9,  <8 x i16> %10)
+  %rx11 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx10, <8 x i16> %11)
+  %rx12 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx11, <8 x i16> %12)
+  %rx13 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx12, <8 x i16> %13)
+  %rx14 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx13, <8 x i16> %14)
+  %rx15 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx14, <8 x i16> %15)
+  %rx16 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx15, <8 x i16> %16)
+  %rx17 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx16, <8 x i16> %17)
+  %rx18 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx17, <8 x i16> %18)
+  %rx19 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx18, <8 x i16> %19)
+  %rx20 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx19, <8 x i16> %20)
+  %rx21 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx20, <8 x i16> %21)
+  %rx22 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx21, <8 x i16> %22)
+  %rx23 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx22, <8 x i16> %23)
+  %rx24 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx23, <8 x i16> %24)
+  %rx25 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx24, <8 x i16> %25)
+  %rx26 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx25, <8 x i16> %26)
+  %rx27 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx26, <8 x i16> %27)
+  %rx28 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx27, <8 x i16> %28)
+  %rx29 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx28, <8 x i16> %29)
+  %rx30 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx29, <8 x i16> %30)
+  %rx31 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx30, <8 x i16> %31)
+  %rx32 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx31, <8 x i16> %32)
+  %rx33 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx32, <8 x i16> %33)
+  %res = call i32 @llvm.mips.copy.s.h(<8 x i16> %rx33, i32 0)
+  ret i32 %res
+}
+
+declare <8 x i16> @llvm.mips.addv.h(<8 x i16>, <8 x i16>) nounwind
+declare i32       @llvm.mips.copy.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: test_i16:
+; CHECK: st.h {{.*}} Spill
+; CHECK: st.h {{.*}} Spill
+; CHECK: ld.h {{.*}} Reload
+; CHECK: ld.h {{.*}} Reload
+; CHECK: .size
+
+define i32 @test_i32(<4 x i32>* %p0, <4 x i32>* %q1) nounwind {
+entry:
+  %p1  = getelementptr <4 x i32>* %p0, i32 1
+  %p2  = getelementptr <4 x i32>* %p0, i32 2
+  %p3  = getelementptr <4 x i32>* %p0, i32 3
+  %p4  = getelementptr <4 x i32>* %p0, i32 4
+  %p5  = getelementptr <4 x i32>* %p0, i32 5
+  %p6  = getelementptr <4 x i32>* %p0, i32 6
+  %p7  = getelementptr <4 x i32>* %p0, i32 7
+  %p8  = getelementptr <4 x i32>* %p0, i32 8
+  %p9  = getelementptr <4 x i32>* %p0, i32 9
+  %p10 = getelementptr <4 x i32>* %p0, i32 10
+  %p11 = getelementptr <4 x i32>* %p0, i32 11
+  %p12 = getelementptr <4 x i32>* %p0, i32 12
+  %p13 = getelementptr <4 x i32>* %p0, i32 13
+  %p14 = getelementptr <4 x i32>* %p0, i32 14
+  %p15 = getelementptr <4 x i32>* %p0, i32 15
+  %p16 = getelementptr <4 x i32>* %p0, i32 16
+  %p17 = getelementptr <4 x i32>* %p0, i32 17
+  %p18 = getelementptr <4 x i32>* %p0, i32 18
+  %p19 = getelementptr <4 x i32>* %p0, i32 19
+  %p20 = getelementptr <4 x i32>* %p0, i32 20
+  %p21 = getelementptr <4 x i32>* %p0, i32 21
+  %p22 = getelementptr <4 x i32>* %p0, i32 22
+  %p23 = getelementptr <4 x i32>* %p0, i32 23
+  %p24 = getelementptr <4 x i32>* %p0, i32 24
+  %p25 = getelementptr <4 x i32>* %p0, i32 25
+  %p26 = getelementptr <4 x i32>* %p0, i32 26
+  %p27 = getelementptr <4 x i32>* %p0, i32 27
+  %p28 = getelementptr <4 x i32>* %p0, i32 28
+  %p29 = getelementptr <4 x i32>* %p0, i32 29
+  %p30 = getelementptr <4 x i32>* %p0, i32 30
+  %p31 = getelementptr <4 x i32>* %p0, i32 31
+  %p32 = getelementptr <4 x i32>* %p0, i32 32
+  %p33 = getelementptr <4 x i32>* %p0, i32 33
+  %0  = load <4 x i32>* %p0, align 16
+  %1  = load <4 x i32>* %p1, align 16
+  %2  = load <4 x i32>* %p2, align 16
+  %3  = load <4 x i32>* %p3, align 16
+  %4  = load <4 x i32>* %p4, align 16
+  %5  = load <4 x i32>* %p5, align 16
+  %6  = load <4 x i32>* %p6, align 16
+  %7  = load <4 x i32>* %p7, align 16
+  %8  = load <4 x i32>* %p8, align 16
+  %9  = load <4 x i32>* %p9, align 16
+  %10 = load <4 x i32>* %p10, align 16
+  %11 = load <4 x i32>* %p11, align 16
+  %12 = load <4 x i32>* %p12, align 16
+  %13 = load <4 x i32>* %p13, align 16
+  %14 = load <4 x i32>* %p14, align 16
+  %15 = load <4 x i32>* %p15, align 16
+  %16 = load <4 x i32>* %p16, align 16
+  %17 = load <4 x i32>* %p17, align 16
+  %18 = load <4 x i32>* %p18, align 16
+  %19 = load <4 x i32>* %p19, align 16
+  %20 = load <4 x i32>* %p20, align 16
+  %21 = load <4 x i32>* %p21, align 16
+  %22 = load <4 x i32>* %p22, align 16
+  %23 = load <4 x i32>* %p23, align 16
+  %24 = load <4 x i32>* %p24, align 16
+  %25 = load <4 x i32>* %p25, align 16
+  %26 = load <4 x i32>* %p26, align 16
+  %27 = load <4 x i32>* %p27, align 16
+  %28 = load <4 x i32>* %p28, align 16
+  %29 = load <4 x i32>* %p29, align 16
+  %30 = load <4 x i32>* %p30, align 16
+  %31 = load <4 x i32>* %p31, align 16
+  %32 = load <4 x i32>* %p32, align 16
+  %33 = load <4 x i32>* %p33, align 16
+  %r1 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %1)
+  %r2 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r1, <4 x i32> %2)
+  %r3 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r2, <4 x i32> %3)
+  %r4 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r3, <4 x i32> %4)
+  %r5 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r4, <4 x i32> %5)
+  %r6 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r5, <4 x i32> %6)
+  %r7 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r6, <4 x i32> %7)
+  %r8 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r7, <4 x i32> %8)
+  %r9 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r8, <4 x i32> %9)
+  %r10 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r9, <4 x i32> %10)
+  %r11 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r10, <4 x i32> %11)
+  %r12 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r11, <4 x i32> %12)
+  %r13 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r12, <4 x i32> %13)
+  %r14 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r13, <4 x i32> %14)
+  %r15 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r14, <4 x i32> %15)
+  %r16 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r15, <4 x i32> %16)
+  %r17 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r16, <4 x i32> %17)
+  %r18 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r17, <4 x i32> %18)
+  %r19 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r18, <4 x i32> %19)
+  %r20 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r19, <4 x i32> %20)
+  %r21 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r20, <4 x i32> %21)
+  %r22 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r21, <4 x i32> %22)
+  %r23 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r22, <4 x i32> %23)
+  %r24 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r23, <4 x i32> %24)
+  %r25 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r24, <4 x i32> %25)
+  %r26 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r25, <4 x i32> %26)
+  %r27 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r26, <4 x i32> %27)
+  %r28 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r27, <4 x i32> %28)
+  %r29 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r28, <4 x i32> %29)
+  %r30 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r29, <4 x i32> %30)
+  %r31 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r30, <4 x i32> %31)
+  %r32 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r31, <4 x i32> %32)
+  %r33 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r32, <4 x i32> %33)
+  %rx1 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r33, <4 x i32> %1)
+  %rx2 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx1, <4 x i32> %2)
+  %rx3 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx2, <4 x i32> %3)
+  %rx4 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx3, <4 x i32> %4)
+  %rx5 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx4, <4 x i32> %5)
+  %rx6 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx5, <4 x i32> %6)
+  %rx7 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx6, <4 x i32> %7)
+  %rx8 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx7, <4 x i32> %8)
+  %rx9 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx8, <4 x i32> %9)
+  %rx10 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx9, <4 x i32> %10)
+  %rx11 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx10, <4 x i32> %11)
+  %rx12 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx11, <4 x i32> %12)
+  %rx13 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx12, <4 x i32> %13)
+  %rx14 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx13, <4 x i32> %14)
+  %rx15 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx14, <4 x i32> %15)
+  %rx16 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx15, <4 x i32> %16)
+  %rx17 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx16, <4 x i32> %17)
+  %rx18 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx17, <4 x i32> %18)
+  %rx19 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx18, <4 x i32> %19)
+  %rx20 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx19, <4 x i32> %20)
+  %rx21 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx20, <4 x i32> %21)
+  %rx22 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx21, <4 x i32> %22)
+  %rx23 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx22, <4 x i32> %23)
+  %rx24 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx23, <4 x i32> %24)
+  %rx25 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx24, <4 x i32> %25)
+  %rx26 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx25, <4 x i32> %26)
+  %rx27 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx26, <4 x i32> %27)
+  %rx28 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx27, <4 x i32> %28)
+  %rx29 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx28, <4 x i32> %29)
+  %rx30 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx29, <4 x i32> %30)
+  %rx31 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx30, <4 x i32> %31)
+  %rx32 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx31, <4 x i32> %32)
+  %rx33 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx32, <4 x i32> %33)
+  %res = call i32 @llvm.mips.copy.s.w(<4 x i32> %rx33, i32 0)
+  ret i32 %res
+}
+
+declare <4 x i32> @llvm.mips.addv.w(<4 x i32>, <4 x i32>) nounwind
+declare i32       @llvm.mips.copy.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: test_i32:
+; CHECK: st.w {{.*}} Spill
+; CHECK: st.w {{.*}} Spill
+; CHECK: ld.w {{.*}} Reload
+; CHECK: ld.w {{.*}} Reload
+; CHECK: .size
+
+define i32 @test_i64(<2 x i64>* %p0, <2 x i64>* %q1) nounwind {
+entry:
+  %p1  = getelementptr <2 x i64>* %p0, i32 1
+  %p2  = getelementptr <2 x i64>* %p0, i32 2
+  %p3  = getelementptr <2 x i64>* %p0, i32 3
+  %p4  = getelementptr <2 x i64>* %p0, i32 4
+  %p5  = getelementptr <2 x i64>* %p0, i32 5
+  %p6  = getelementptr <2 x i64>* %p0, i32 6
+  %p7  = getelementptr <2 x i64>* %p0, i32 7
+  %p8  = getelementptr <2 x i64>* %p0, i32 8
+  %p9  = getelementptr <2 x i64>* %p0, i32 9
+  %p10 = getelementptr <2 x i64>* %p0, i32 10
+  %p11 = getelementptr <2 x i64>* %p0, i32 11
+  %p12 = getelementptr <2 x i64>* %p0, i32 12
+  %p13 = getelementptr <2 x i64>* %p0, i32 13
+  %p14 = getelementptr <2 x i64>* %p0, i32 14
+  %p15 = getelementptr <2 x i64>* %p0, i32 15
+  %p16 = getelementptr <2 x i64>* %p0, i32 16
+  %p17 = getelementptr <2 x i64>* %p0, i32 17
+  %p18 = getelementptr <2 x i64>* %p0, i32 18
+  %p19 = getelementptr <2 x i64>* %p0, i32 19
+  %p20 = getelementptr <2 x i64>* %p0, i32 20
+  %p21 = getelementptr <2 x i64>* %p0, i32 21
+  %p22 = getelementptr <2 x i64>* %p0, i32 22
+  %p23 = getelementptr <2 x i64>* %p0, i32 23
+  %p24 = getelementptr <2 x i64>* %p0, i32 24
+  %p25 = getelementptr <2 x i64>* %p0, i32 25
+  %p26 = getelementptr <2 x i64>* %p0, i32 26
+  %p27 = getelementptr <2 x i64>* %p0, i32 27
+  %p28 = getelementptr <2 x i64>* %p0, i32 28
+  %p29 = getelementptr <2 x i64>* %p0, i32 29
+  %p30 = getelementptr <2 x i64>* %p0, i32 30
+  %p31 = getelementptr <2 x i64>* %p0, i32 31
+  %p32 = getelementptr <2 x i64>* %p0, i32 32
+  %p33 = getelementptr <2 x i64>* %p0, i32 33
+  %0  = load <2 x i64>* %p0, align 16
+  %1  = load <2 x i64>* %p1, align 16
+  %2  = load <2 x i64>* %p2, align 16
+  %3  = load <2 x i64>* %p3, align 16
+  %4  = load <2 x i64>* %p4, align 16
+  %5  = load <2 x i64>* %p5, align 16
+  %6  = load <2 x i64>* %p6, align 16
+  %7  = load <2 x i64>* %p7, align 16
+  %8  = load <2 x i64>* %p8, align 16
+  %9  = load <2 x i64>* %p9, align 16
+  %10 = load <2 x i64>* %p10, align 16
+  %11 = load <2 x i64>* %p11, align 16
+  %12 = load <2 x i64>* %p12, align 16
+  %13 = load <2 x i64>* %p13, align 16
+  %14 = load <2 x i64>* %p14, align 16
+  %15 = load <2 x i64>* %p15, align 16
+  %16 = load <2 x i64>* %p16, align 16
+  %17 = load <2 x i64>* %p17, align 16
+  %18 = load <2 x i64>* %p18, align 16
+  %19 = load <2 x i64>* %p19, align 16
+  %20 = load <2 x i64>* %p20, align 16
+  %21 = load <2 x i64>* %p21, align 16
+  %22 = load <2 x i64>* %p22, align 16
+  %23 = load <2 x i64>* %p23, align 16
+  %24 = load <2 x i64>* %p24, align 16
+  %25 = load <2 x i64>* %p25, align 16
+  %26 = load <2 x i64>* %p26, align 16
+  %27 = load <2 x i64>* %p27, align 16
+  %28 = load <2 x i64>* %p28, align 16
+  %29 = load <2 x i64>* %p29, align 16
+  %30 = load <2 x i64>* %p30, align 16
+  %31 = load <2 x i64>* %p31, align 16
+  %32 = load <2 x i64>* %p32, align 16
+  %33 = load <2 x i64>* %p33, align 16
+  %r1  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0,   <2 x i64> %1)
+  %r2  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r1,  <2 x i64> %2)
+  %r3  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r2,  <2 x i64> %3)
+  %r4  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r3,  <2 x i64> %4)
+  %r5  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r4,  <2 x i64> %5)
+  %r6  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r5,  <2 x i64> %6)
+  %r7  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r6,  <2 x i64> %7)
+  %r8  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r7,  <2 x i64> %8)
+  %r9  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r8,  <2 x i64> %9)
+  %r10 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r9,  <2 x i64> %10)
+  %r11 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r10, <2 x i64> %11)
+  %r12 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r11, <2 x i64> %12)
+  %r13 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r12, <2 x i64> %13)
+  %r14 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r13, <2 x i64> %14)
+  %r15 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r14, <2 x i64> %15)
+  %r16 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r15, <2 x i64> %16)
+  %r17 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r16, <2 x i64> %17)
+  %r18 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r17, <2 x i64> %18)
+  %r19 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r18, <2 x i64> %19)
+  %r20 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r19, <2 x i64> %20)
+  %r21 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r20, <2 x i64> %21)
+  %r22 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r21, <2 x i64> %22)
+  %r23 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r22, <2 x i64> %23)
+  %r24 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r23, <2 x i64> %24)
+  %r25 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r24, <2 x i64> %25)
+  %r26 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r25, <2 x i64> %26)
+  %r27 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r26, <2 x i64> %27)
+  %r28 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r27, <2 x i64> %28)
+  %r29 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r28, <2 x i64> %29)
+  %r30 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r29, <2 x i64> %30)
+  %r31 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r30, <2 x i64> %31)
+  %r32 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r31, <2 x i64> %32)
+  %r33 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r32, <2 x i64> %33)
+  %rx1  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r33,  <2 x i64> %1)
+  %rx2  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx1,  <2 x i64> %2)
+  %rx3  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx2,  <2 x i64> %3)
+  %rx4  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx3,  <2 x i64> %4)
+  %rx5  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx4,  <2 x i64> %5)
+  %rx6  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx5,  <2 x i64> %6)
+  %rx7  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx6,  <2 x i64> %7)
+  %rx8  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx7,  <2 x i64> %8)
+  %rx9  = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx8,  <2 x i64> %9)
+  %rx10 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx9,  <2 x i64> %10)
+  %rx11 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx10, <2 x i64> %11)
+  %rx12 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx11, <2 x i64> %12)
+  %rx13 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx12, <2 x i64> %13)
+  %rx14 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx13, <2 x i64> %14)
+  %rx15 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx14, <2 x i64> %15)
+  %rx16 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx15, <2 x i64> %16)
+  %rx17 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx16, <2 x i64> %17)
+  %rx18 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx17, <2 x i64> %18)
+  %rx19 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx18, <2 x i64> %19)
+  %rx20 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx19, <2 x i64> %20)
+  %rx21 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx20, <2 x i64> %21)
+  %rx22 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx21, <2 x i64> %22)
+  %rx23 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx22, <2 x i64> %23)
+  %rx24 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx23, <2 x i64> %24)
+  %rx25 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx24, <2 x i64> %25)
+  %rx26 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx25, <2 x i64> %26)
+  %rx27 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx26, <2 x i64> %27)
+  %rx28 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx27, <2 x i64> %28)
+  %rx29 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx28, <2 x i64> %29)
+  %rx30 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx29, <2 x i64> %30)
+  %rx31 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx30, <2 x i64> %31)
+  %rx32 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx31, <2 x i64> %32)
+  %rx33 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx32, <2 x i64> %33)
+  %res1 = bitcast <2 x i64> %rx33 to <4 x i32>
+  %res = call i32 @llvm.mips.copy.s.w(<4 x i32> %res1, i32 0)
+  ret i32 %res
+}
+
+declare <2 x i64> @llvm.mips.addv.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: test_i64:
+; CHECK: st.d {{.*}} Spill
+; CHECK: st.d {{.*}} Spill
+; CHECK: ld.d {{.*}} Reload
+; CHECK: ld.d {{.*}} Reload
+; CHECK: .size
diff --git a/test/CodeGen/Mips/msa/vec.ll b/test/CodeGen/Mips/msa/vec.ll
new file mode 100644
index 0000000..5bddf5a
--- /dev/null
+++ b/test/CodeGen/Mips/msa/vec.ll
@@ -0,0 +1,946 @@
+; Test the MSA intrinsics that are encoded with the VEC instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ANYENDIAN %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ANYENDIAN %s
+
+@llvm_mips_and_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_and_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_and_v_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_and_v_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_and_v_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_and_v_b_ARG2
+  %2 = bitcast <16 x i8> %0 to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.and.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <16 x i8>
+  store <16 x i8> %5, <16 x i8>* @llvm_mips_and_v_b_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_and_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: and.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_and_v_b_test
+;
+@llvm_mips_and_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_and_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_and_v_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_and_v_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_and_v_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_and_v_h_ARG2
+  %2 = bitcast <8 x i16> %0 to <16 x i8>
+  %3 = bitcast <8 x i16> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.and.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <8 x i16>
+  store <8 x i16> %5, <8 x i16>* @llvm_mips_and_v_h_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_and_v_h_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: and.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_and_v_h_test
+;
+@llvm_mips_and_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_and_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_and_v_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_and_v_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_and_v_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_and_v_w_ARG2
+  %2 = bitcast <4 x i32> %0 to <16 x i8>
+  %3 = bitcast <4 x i32> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.and.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <4 x i32>
+  store <4 x i32> %5, <4 x i32>* @llvm_mips_and_v_w_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_and_v_w_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: and.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_and_v_w_test
+;
+@llvm_mips_and_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_and_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_and_v_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_and_v_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_and_v_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_and_v_d_ARG2
+  %2 = bitcast <2 x i64> %0 to <16 x i8>
+  %3 = bitcast <2 x i64> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.and.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <2 x i64>
+  store <2 x i64> %5, <2 x i64>* @llvm_mips_and_v_d_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_and_v_d_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: and.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_and_v_d_test
+;
+define void @and_v_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_and_v_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_and_v_b_ARG2
+  %2 = and <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_and_v_b_RES
+  ret void
+}
+
+; CHECK: and_v_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: and.v
+; CHECK: st.b
+; CHECK: .size and_v_b_test
+;
+define void @and_v_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_and_v_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_and_v_h_ARG2
+  %2 = and <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_and_v_h_RES
+  ret void
+}
+
+; CHECK: and_v_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: and.v
+; CHECK: st.h
+; CHECK: .size and_v_h_test
+;
+
+define void @and_v_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_and_v_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_and_v_w_ARG2
+  %2 = and <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_and_v_w_RES
+  ret void
+}
+
+; CHECK: and_v_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: and.v
+; CHECK: st.w
+; CHECK: .size and_v_w_test
+;
+
+define void @and_v_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_and_v_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_and_v_d_ARG2
+  %2 = and <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_and_v_d_RES
+  ret void
+}
+
+; CHECK: and_v_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: and.v
+; CHECK: st.d
+; CHECK: .size and_v_d_test
+;
+@llvm_mips_bmnz_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmnz_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bmnz_v_b_ARG3 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmnz_v_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bmnz_v_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bmnz_v_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_bmnz_v_b_ARG2
+  %2 = load <16 x i8>* @llvm_mips_bmnz_v_b_ARG3
+  %3 = bitcast <16 x i8> %0 to <16 x i8>
+  %4 = bitcast <16 x i8> %1 to <16 x i8>
+  %5 = bitcast <16 x i8> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bmnz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <16 x i8>
+  store <16 x i8> %7, <16 x i8>* @llvm_mips_bmnz_v_b_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bmnz_v_b_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnz_v_b_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnz_v_b_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmnz_v_b_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; ANYENDIAN-DAG: bmnz.v [[R4]], [[R5]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R4]], 0(
+; ANYENDIAN: .size llvm_mips_bmnz_v_b_test
+
+@llvm_mips_bmnz_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bmnz_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bmnz_v_h_ARG3 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bmnz_v_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bmnz_v_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_bmnz_v_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_bmnz_v_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_bmnz_v_h_ARG3
+  %3 = bitcast <8 x i16> %0 to <16 x i8>
+  %4 = bitcast <8 x i16> %1 to <16 x i8>
+  %5 = bitcast <8 x i16> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bmnz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <8 x i16>
+  store <8 x i16> %7, <8 x i16>* @llvm_mips_bmnz_v_h_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bmnz_v_h_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnz_v_h_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnz_v_h_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmnz_v_h_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; ANYENDIAN-DAG: bmnz.v [[R4]], [[R5]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R4]], 0(
+; ANYENDIAN: .size llvm_mips_bmnz_v_h_test
+
+@llvm_mips_bmnz_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bmnz_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bmnz_v_w_ARG3 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bmnz_v_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bmnz_v_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_bmnz_v_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_bmnz_v_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_bmnz_v_w_ARG3
+  %3 = bitcast <4 x i32> %0 to <16 x i8>
+  %4 = bitcast <4 x i32> %1 to <16 x i8>
+  %5 = bitcast <4 x i32> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bmnz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <4 x i32>
+  store <4 x i32> %7, <4 x i32>* @llvm_mips_bmnz_v_w_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bmnz_v_w_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnz_v_w_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnz_v_w_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmnz_v_w_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; ANYENDIAN-DAG: bmnz.v [[R4]], [[R5]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R4]], 0(
+; ANYENDIAN: .size llvm_mips_bmnz_v_w_test
+
+@llvm_mips_bmnz_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bmnz_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bmnz_v_d_ARG3 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bmnz_v_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bmnz_v_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_bmnz_v_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_bmnz_v_d_ARG2
+  %2 = load <2 x i64>* @llvm_mips_bmnz_v_d_ARG3
+  %3 = bitcast <2 x i64> %0 to <16 x i8>
+  %4 = bitcast <2 x i64> %1 to <16 x i8>
+  %5 = bitcast <2 x i64> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bmnz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <2 x i64>
+  store <2 x i64> %7, <2 x i64>* @llvm_mips_bmnz_v_d_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bmnz_v_d_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnz_v_d_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnz_v_d_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmnz_v_d_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; ANYENDIAN-DAG: bmnz.v [[R4]], [[R5]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R4]], 0(
+; ANYENDIAN: .size llvm_mips_bmnz_v_d_test
+
+@llvm_mips_bmz_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmz_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bmz_v_b_ARG3 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmz_v_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bmz_v_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bmz_v_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_bmz_v_b_ARG2
+  %2 = load <16 x i8>* @llvm_mips_bmz_v_b_ARG3
+  %3 = bitcast <16 x i8> %0 to <16 x i8>
+  %4 = bitcast <16 x i8> %1 to <16 x i8>
+  %5 = bitcast <16 x i8> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bmz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <16 x i8>
+  store <16 x i8> %7, <16 x i8>* @llvm_mips_bmz_v_b_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bmz_v_b_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmz_v_b_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmz_v_b_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmz_v_b_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bmz.v with ws and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R4]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
+; ANYENDIAN: .size llvm_mips_bmz_v_b_test
+
+@llvm_mips_bmz_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bmz_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bmz_v_h_ARG3 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bmz_v_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bmz_v_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_bmz_v_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_bmz_v_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_bmz_v_h_ARG3
+  %3 = bitcast <8 x i16> %0 to <16 x i8>
+  %4 = bitcast <8 x i16> %1 to <16 x i8>
+  %5 = bitcast <8 x i16> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bmz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <8 x i16>
+  store <8 x i16> %7, <8 x i16>* @llvm_mips_bmz_v_h_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bmz_v_h_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmz_v_h_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmz_v_h_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmz_v_h_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bmz.v with ws and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R4]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
+; ANYENDIAN: .size llvm_mips_bmz_v_h_test
+
+@llvm_mips_bmz_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bmz_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bmz_v_w_ARG3 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bmz_v_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bmz_v_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_bmz_v_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_bmz_v_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_bmz_v_w_ARG3
+  %3 = bitcast <4 x i32> %0 to <16 x i8>
+  %4 = bitcast <4 x i32> %1 to <16 x i8>
+  %5 = bitcast <4 x i32> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bmz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <4 x i32>
+  store <4 x i32> %7, <4 x i32>* @llvm_mips_bmz_v_w_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bmz_v_w_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmz_v_w_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmz_v_w_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmz_v_w_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bmz.v with ws and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R4]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
+; ANYENDIAN: .size llvm_mips_bmz_v_w_test
+
+@llvm_mips_bmz_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bmz_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bmz_v_d_ARG3 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bmz_v_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bmz_v_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_bmz_v_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_bmz_v_d_ARG2
+  %2 = load <2 x i64>* @llvm_mips_bmz_v_d_ARG3
+  %3 = bitcast <2 x i64> %0 to <16 x i8>
+  %4 = bitcast <2 x i64> %1 to <16 x i8>
+  %5 = bitcast <2 x i64> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bmz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <2 x i64>
+  store <2 x i64> %7, <2 x i64>* @llvm_mips_bmz_v_d_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bmz_v_d_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmz_v_d_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmz_v_d_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmz_v_d_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bmz.v with ws and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R4]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
+; ANYENDIAN: .size llvm_mips_bmz_v_d_test
+
+@llvm_mips_bsel_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bsel_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bsel_v_b_ARG3 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bsel_v_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bsel_v_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bsel_v_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_bsel_v_b_ARG2
+  %2 = load <16 x i8>* @llvm_mips_bsel_v_b_ARG3
+  %3 = bitcast <16 x i8> %0 to <16 x i8>
+  %4 = bitcast <16 x i8> %1 to <16 x i8>
+  %5 = bitcast <16 x i8> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bsel.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <16 x i8>
+  store <16 x i8> %7, <16 x i8>* @llvm_mips_bsel_v_b_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bsel_v_b_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bsel_v_b_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bsel_v_b_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bsel_v_b_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bsel.v with wt and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R6]], 0(
+; ANYENDIAN: .size llvm_mips_bsel_v_b_test
+
+@llvm_mips_bsel_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bsel_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bsel_v_h_ARG3 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bsel_v_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bsel_v_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_bsel_v_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_bsel_v_h_ARG2
+  %2 = load <8 x i16>* @llvm_mips_bsel_v_h_ARG3
+  %3 = bitcast <8 x i16> %0 to <16 x i8>
+  %4 = bitcast <8 x i16> %1 to <16 x i8>
+  %5 = bitcast <8 x i16> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bsel.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <8 x i16>
+  store <8 x i16> %7, <8 x i16>* @llvm_mips_bsel_v_h_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bsel_v_h_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bsel_v_h_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bsel_v_h_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bsel_v_h_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bsel.v with wt and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R6]], 0(
+; ANYENDIAN: .size llvm_mips_bsel_v_h_test
+
+@llvm_mips_bsel_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bsel_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bsel_v_w_ARG3 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bsel_v_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bsel_v_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_bsel_v_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_bsel_v_w_ARG2
+  %2 = load <4 x i32>* @llvm_mips_bsel_v_w_ARG3
+  %3 = bitcast <4 x i32> %0 to <16 x i8>
+  %4 = bitcast <4 x i32> %1 to <16 x i8>
+  %5 = bitcast <4 x i32> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bsel.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <4 x i32>
+  store <4 x i32> %7, <4 x i32>* @llvm_mips_bsel_v_w_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bsel_v_w_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bsel_v_w_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bsel_v_w_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bsel_v_w_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bsel.v with wt and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R6]], 0(
+; ANYENDIAN: .size llvm_mips_bsel_v_w_test
+
+@llvm_mips_bsel_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bsel_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bsel_v_d_ARG3 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bsel_v_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bsel_v_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_bsel_v_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_bsel_v_d_ARG2
+  %2 = load <2 x i64>* @llvm_mips_bsel_v_d_ARG3
+  %3 = bitcast <2 x i64> %0 to <16 x i8>
+  %4 = bitcast <2 x i64> %1 to <16 x i8>
+  %5 = bitcast <2 x i64> %2 to <16 x i8>
+  %6 = tail call <16 x i8> @llvm.mips.bsel.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+  %7 = bitcast <16 x i8> %6 to <2 x i64>
+  store <2 x i64> %7, <2 x i64>* @llvm_mips_bsel_v_d_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_bsel_v_d_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bsel_v_d_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bsel_v_d_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bsel_v_d_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bsel.v with wt and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R6]], 0(
+; ANYENDIAN: .size llvm_mips_bsel_v_d_test
+
+@llvm_mips_nor_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_nor_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_nor_v_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_nor_v_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_nor_v_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_nor_v_b_ARG2
+  %2 = bitcast <16 x i8> %0 to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.nor.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <16 x i8>
+  store <16 x i8> %5, <16 x i8>* @llvm_mips_nor_v_b_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_nor_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: nor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_nor_v_b_test
+;
+@llvm_mips_nor_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_nor_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_nor_v_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_nor_v_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_nor_v_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_nor_v_h_ARG2
+  %2 = bitcast <8 x i16> %0 to <16 x i8>
+  %3 = bitcast <8 x i16> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.nor.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <8 x i16>
+  store <8 x i16> %5, <8 x i16>* @llvm_mips_nor_v_h_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_nor_v_h_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: nor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_nor_v_h_test
+;
+@llvm_mips_nor_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_nor_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_nor_v_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_nor_v_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_nor_v_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_nor_v_w_ARG2
+  %2 = bitcast <4 x i32> %0 to <16 x i8>
+  %3 = bitcast <4 x i32> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.nor.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <4 x i32>
+  store <4 x i32> %5, <4 x i32>* @llvm_mips_nor_v_w_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_nor_v_w_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: nor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_nor_v_w_test
+;
+@llvm_mips_nor_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_nor_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_nor_v_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_nor_v_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_nor_v_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_nor_v_d_ARG2
+  %2 = bitcast <2 x i64> %0 to <16 x i8>
+  %3 = bitcast <2 x i64> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.nor.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <2 x i64>
+  store <2 x i64> %5, <2 x i64>* @llvm_mips_nor_v_d_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_nor_v_d_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: nor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_nor_v_d_test
+;
+@llvm_mips_or_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_or_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_or_v_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_or_v_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_or_v_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_or_v_b_ARG2
+  %2 = bitcast <16 x i8> %0 to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.or.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <16 x i8>
+  store <16 x i8> %5, <16 x i8>* @llvm_mips_or_v_b_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_or_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: or.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_or_v_b_test
+;
+@llvm_mips_or_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_or_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_or_v_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_or_v_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_or_v_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_or_v_h_ARG2
+  %2 = bitcast <8 x i16> %0 to <16 x i8>
+  %3 = bitcast <8 x i16> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.or.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <8 x i16>
+  store <8 x i16> %5, <8 x i16>* @llvm_mips_or_v_h_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_or_v_h_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: or.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_or_v_h_test
+;
+@llvm_mips_or_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_or_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_or_v_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_or_v_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_or_v_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_or_v_w_ARG2
+  %2 = bitcast <4 x i32> %0 to <16 x i8>
+  %3 = bitcast <4 x i32> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.or.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <4 x i32>
+  store <4 x i32> %5, <4 x i32>* @llvm_mips_or_v_w_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_or_v_w_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: or.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_or_v_w_test
+;
+@llvm_mips_or_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_or_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_or_v_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_or_v_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_or_v_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_or_v_d_ARG2
+  %2 = bitcast <2 x i64> %0 to <16 x i8>
+  %3 = bitcast <2 x i64> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.or.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <2 x i64>
+  store <2 x i64> %5, <2 x i64>* @llvm_mips_or_v_d_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_or_v_d_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: or.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_or_v_d_test
+;
+define void @or_v_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_or_v_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_or_v_b_ARG2
+  %2 = or <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_or_v_b_RES
+  ret void
+}
+
+; CHECK: or_v_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: or.v
+; CHECK: st.b
+; CHECK: .size or_v_b_test
+;
+define void @or_v_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_or_v_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_or_v_h_ARG2
+  %2 = or <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_or_v_h_RES
+  ret void
+}
+
+; CHECK: or_v_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: or.v
+; CHECK: st.h
+; CHECK: .size or_v_h_test
+;
+
+define void @or_v_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_or_v_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_or_v_w_ARG2
+  %2 = or <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_or_v_w_RES
+  ret void
+}
+
+; CHECK: or_v_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: or.v
+; CHECK: st.w
+; CHECK: .size or_v_w_test
+;
+
+define void @or_v_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_or_v_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_or_v_d_ARG2
+  %2 = or <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_or_v_d_RES
+  ret void
+}
+
+; CHECK: or_v_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: or.v
+; CHECK: st.d
+; CHECK: .size or_v_d_test
+;
+@llvm_mips_xor_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_xor_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_xor_v_b_RES  = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_xor_v_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_xor_v_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_xor_v_b_ARG2
+  %2 = bitcast <16 x i8> %0 to <16 x i8>
+  %3 = bitcast <16 x i8> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.xor.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <16 x i8>
+  store <16 x i8> %5, <16 x i8>* @llvm_mips_xor_v_b_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_xor_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_xor_v_b_test
+;
+@llvm_mips_xor_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_xor_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_xor_v_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_xor_v_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_xor_v_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_xor_v_h_ARG2
+  %2 = bitcast <8 x i16> %0 to <16 x i8>
+  %3 = bitcast <8 x i16> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.xor.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <8 x i16>
+  store <8 x i16> %5, <8 x i16>* @llvm_mips_xor_v_h_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_xor_v_h_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_xor_v_h_test
+;
+@llvm_mips_xor_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_xor_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_xor_v_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_xor_v_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_xor_v_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_xor_v_w_ARG2
+  %2 = bitcast <4 x i32> %0 to <16 x i8>
+  %3 = bitcast <4 x i32> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.xor.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <4 x i32>
+  store <4 x i32> %5, <4 x i32>* @llvm_mips_xor_v_w_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_xor_v_w_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_xor_v_w_test
+;
+@llvm_mips_xor_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_xor_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_xor_v_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_xor_v_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_xor_v_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_xor_v_d_ARG2
+  %2 = bitcast <2 x i64> %0 to <16 x i8>
+  %3 = bitcast <2 x i64> %1 to <16 x i8>
+  %4 = tail call <16 x i8> @llvm.mips.xor.v(<16 x i8> %2, <16 x i8> %3)
+  %5 = bitcast <16 x i8> %4 to <2 x i64>
+  store <2 x i64> %5, <2 x i64>* @llvm_mips_xor_v_d_RES
+  ret void
+}
+
+; ANYENDIAN: llvm_mips_xor_v_d_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_xor_v_d_test
+;
+define void @xor_v_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_xor_v_b_ARG1
+  %1 = load <16 x i8>* @llvm_mips_xor_v_b_ARG2
+  %2 = xor <16 x i8> %0, %1
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_xor_v_b_RES
+  ret void
+}
+
+; CHECK: xor_v_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: xor.v
+; CHECK: st.b
+; CHECK: .size xor_v_b_test
+;
+define void @xor_v_h_test() nounwind {
+entry:
+  %0 = load <8 x i16>* @llvm_mips_xor_v_h_ARG1
+  %1 = load <8 x i16>* @llvm_mips_xor_v_h_ARG2
+  %2 = xor <8 x i16> %0, %1
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_xor_v_h_RES
+  ret void
+}
+
+; CHECK: xor_v_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: xor.v
+; CHECK: st.h
+; CHECK: .size xor_v_h_test
+;
+
+define void @xor_v_w_test() nounwind {
+entry:
+  %0 = load <4 x i32>* @llvm_mips_xor_v_w_ARG1
+  %1 = load <4 x i32>* @llvm_mips_xor_v_w_ARG2
+  %2 = xor <4 x i32> %0, %1
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_xor_v_w_RES
+  ret void
+}
+
+; CHECK: xor_v_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: xor.v
+; CHECK: st.w
+; CHECK: .size xor_v_w_test
+;
+
+define void @xor_v_d_test() nounwind {
+entry:
+  %0 = load <2 x i64>* @llvm_mips_xor_v_d_ARG1
+  %1 = load <2 x i64>* @llvm_mips_xor_v_d_ARG2
+  %2 = xor <2 x i64> %0, %1
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_xor_v_d_RES
+  ret void
+}
+
+; CHECK: xor_v_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: xor.v
+; CHECK: st.d
+; CHECK: .size xor_v_d_test
+;
+declare <16 x i8> @llvm.mips.and.v(<16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.bmnz.v(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.bmz.v(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.bsel.v(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.nor.v(<16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.or.v(<16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.xor.v(<16 x i8>, <16 x i8>) nounwind
diff --git a/test/CodeGen/Mips/msa/vecs10.ll b/test/CodeGen/Mips/msa/vecs10.ll
new file mode 100644
index 0000000..e22e075
--- /dev/null
+++ b/test/CodeGen/Mips/msa/vecs10.ll
@@ -0,0 +1,47 @@
+; Test the MSA intrinsics that are encoded with the VECS10 instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_bnz_v_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+
+define i32 @llvm_mips_bnz_v_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bnz_v_ARG1
+  %1 = tail call i32 @llvm.mips.bnz.v(<16 x i8> %0)
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %true, label %false
+true:
+  ret i32 2
+false:
+  ret i32 3
+}
+
+declare i32 @llvm.mips.bnz.v(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_bnz_v_test:
+; CHECK-DAG: ld.b [[R0:\$w[0-9]+]]
+; CHECK-DAG: bnz.v [[R0]]
+; CHECK: .size llvm_mips_bnz_v_test
+
+@llvm_mips_bz_v_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+
+define i32 @llvm_mips_bz_v_test() nounwind {
+entry:
+  %0 = load <16 x i8>* @llvm_mips_bz_v_ARG1
+  %1 = tail call i32 @llvm.mips.bz.v(<16 x i8> %0)
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %true, label %false
+true:
+  ret i32 2
+false:
+  ret i32 3
+}
+
+declare i32 @llvm.mips.bz.v(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_bz_v_test:
+; CHECK-DAG: ld.b [[R0:\$w[0-9]+]]
+; CHECK-DAG: bz.v [[R0]]
+; CHECK: .size llvm_mips_bz_v_test
+;
diff --git a/test/CodeGen/Mips/nomips16.ll b/test/CodeGen/Mips/nomips16.ll
new file mode 100644
index 0000000..bf7c667
--- /dev/null
+++ b/test/CodeGen/Mips/nomips16.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s 
+
+@x = global float 0.000000e+00, align 4
+@.str = private unnamed_addr constant [20 x i8] c"in main: mips16 %f\0A\00", align 1
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+  %0 = load float* @x, align 4
+  %conv = fpext float %0 to double
+  %add = fadd double %conv, 1.500000e+00
+  %conv1 = fptrunc double %add to float
+  store float %conv1, float* @x, align 4
+  ret void
+}
+; CHECK: 	.ent	foo
+; CHECK: 	jal	__mips16_extendsfdf2
+; CHECK:   	.end	foo
+
+; Function Attrs: nounwind
+define void @nofoo() #1 {
+entry:
+  %0 = load float* @x, align 4
+  %conv = fpext float %0 to double
+  %add = fadd double %conv, 3.900000e+00
+  %conv1 = fptrunc double %add to float
+  store float %conv1, float* @x, align 4
+  ret void
+}
+
+; CHECK: 	.ent	nofoo
+; CHECK: 	cvt.d.s	$f{{.+}}, $f{{.+}}
+; CHECK: 	.end	nofoo
+
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
diff --git a/test/CodeGen/Mips/o32_cc.ll b/test/CodeGen/Mips/o32_cc.ll
index 70b66ef..08e5aab 100644
--- a/test/CodeGen/Mips/o32_cc.ll
+++ b/test/CodeGen/Mips/o32_cc.ll
@@ -1,11 +1,12 @@
-; RUN: llc -march=mips < %s | FileCheck %s
-
-; FIXME: Disabled because it unpredictably fails on certain platforms.
-; REQUIRES: disabled
+; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=FP32EL %s
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck -check-prefix=FP64EL %s
 
 ; $f12, $f14
-; CHECK: ldc1 $f12, %lo
-; CHECK: ldc1 $f14, %lo
+; CHECK-LABEL: testlowercall0:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: ldc1 $f14, %lo
 define void @testlowercall0() nounwind {
 entry:
   tail call void @f0(double 5.000000e+00, double 6.000000e+00) nounwind
@@ -15,8 +16,9 @@ entry:
 declare void @f0(double, double)
 
 ; $f12, $f14
-; CHECK: lwc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
+; CHECK-LABEL: testlowercall1:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
 define void @testlowercall1() nounwind {
 entry:
   tail call void @f1(float 8.000000e+00, float 9.000000e+00) nounwind
@@ -26,8 +28,9 @@ entry:
 declare void @f1(float, float)
 
 ; $f12, $f14
-; CHECK: lwc1 $f12, %lo
-; CHECK: ldc1 $f14, %lo
+; CHECK-LABEL: testlowercall2:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: ldc1 $f14, %lo
 define void @testlowercall2() nounwind {
 entry:
   tail call void @f2(float 8.000000e+00, double 6.000000e+00) nounwind
@@ -37,8 +40,9 @@ entry:
 declare void @f2(float, double)
 
 ; $f12, $f14
-; CHECK: ldc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
+; CHECK-LABEL: testlowercall3:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
 define void @testlowercall3() nounwind {
 entry:
   tail call void @f3(double 5.000000e+00, float 9.000000e+00) nounwind
@@ -48,10 +52,11 @@ entry:
 declare void @f3(double, float)
 
 ; $4, $5, $6, $7
-; CHECK: addiu $4, $zero, 12
-; CHECK: addiu $5, $zero, 13
-; CHECK: addiu $6, $zero, 14
-; CHECK: addiu $7, $zero, 15
+; CHECK-LABEL: testlowercall4:
+; CHECK-DAG: addiu $4, $zero, 12
+; CHECK-DAG: addiu $5, $zero, 13
+; CHECK-DAG: addiu $6, $zero, 14
+; CHECK-DAG: addiu $7, $zero, 15
 define void @testlowercall4() nounwind {
 entry:
   tail call void @f4(i32 12, i32 13, i32 14, i32 15) nounwind
@@ -61,10 +66,11 @@ entry:
 declare void @f4(i32, i32, i32, i32)
 
 ; $f12, $6, stack
-; CHECK: sw
-; CHECK: sw
-; CHECK: ldc1 $f12, %lo
-; CHECK: addiu $6, $zero, 23
+; CHECK-LABEL: testlowercall5:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: addiu $6, $zero, 23
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
 define void @testlowercall5() nounwind {
 entry:
   tail call void @f5(double 1.500000e+01, i32 23, double 1.700000e+01) nounwind
@@ -74,9 +80,10 @@ entry:
 declare void @f5(double, i32, double)
 
 ; $f12, $6, $7
-; CHECK: ldc1 $f12, %lo
-; CHECK: addiu $6, $zero, 33
-; CHECK: addiu $7, $zero, 24
+; CHECK-LABEL: testlowercall6:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: addiu $6, $zero, 33
+; CHECK-DAG: addiu $7, $zero, 24
 define void @testlowercall6() nounwind {
 entry:
   tail call void @f6(double 2.500000e+01, i32 33, i32 24) nounwind
@@ -86,9 +93,10 @@ entry:
 declare void @f6(double, i32, i32)
 
 ; $f12, $5, $6
-; CHECK: lwc1 $f12, %lo
-; CHECK: addiu $5, $zero, 43
-; CHECK: addiu $6, $zero, 34
+; CHECK-LABEL: testlowercall7:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: addiu $5, $zero, 43
+; CHECK-DAG: addiu $6, $zero, 34
 define void @testlowercall7() nounwind {
 entry:
   tail call void @f7(float 1.800000e+01, i32 43, i32 34) nounwind
@@ -98,11 +106,12 @@ entry:
 declare void @f7(float, i32, i32)
 
 ; $4, $5, $6, stack
-; CHECK: sw
-; CHECK: sw
-; CHECK: addiu $4, $zero, 22
-; CHECK: addiu $5, $zero, 53
-; CHECK: addiu $6, $zero, 44
+; CHECK-LABEL: testlowercall8:
+; CHECK-DAG: addiu $4, $zero, 22
+; CHECK-DAG: addiu $5, $zero, 53
+; CHECK-DAG: addiu $6, $zero, 44
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
 define void @testlowercall8() nounwind {
 entry:
   tail call void @f8(i32 22, i32 53, i32 44, double 4.000000e+00) nounwind
@@ -112,10 +121,11 @@ entry:
 declare void @f8(i32, i32, i32, double)
 
 ; $4, $5, $6, $7
-; CHECK: addiu $4, $zero, 32
-; CHECK: addiu $5, $zero, 63
-; CHECK: addiu $6, $zero, 54
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall9:
+; CHECK-DAG: addiu $4, $zero, 32
+; CHECK-DAG: addiu $5, $zero, 63
+; CHECK-DAG: addiu $6, $zero, 54
+; CHECK-DAG: lui $7, 16688
 define void @testlowercall9() nounwind {
 entry:
   tail call void @f9(i32 32, i32 63, i32 54, float 1.100000e+01) nounwind
@@ -125,10 +135,15 @@ entry:
 declare void @f9(i32, i32, i32, float)
 
 ; $4, $5, ($6, $7)
-; CHECK: addiu $4, $zero, 42
-; CHECK: addiu $5, $zero, 73
-; CHECK: addiu $6, $zero, 0
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall10:
+; CHECK-DAG: addiu $4, $zero, 42
+; CHECK-DAG: addiu $5, $zero, 73
+; FP32EL-LABEL: testlowercall10:
+; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
+; FP64EL-LABEL: testlowercall10:
+; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall10() nounwind {
 entry:
   tail call void @f10(i32 42, i32 73, double 2.700000e+01) nounwind
@@ -138,9 +153,14 @@ entry:
 declare void @f10(i32, i32, double)
 
 ; $4, ($6, $7)
-; CHECK: addiu $4, $zero, 52
-; CHECK: addiu $6, $zero, 0
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall11:
+; CHECK-DAG: addiu $4, $zero, 52
+; FP32EL-LABEL: testlowercall11:
+; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
+; FP64EL-LABEL: testlowercall11:
+; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall11() nounwind {
 entry:
   tail call void @f11(i32 52, double 1.600000e+01) nounwind
@@ -150,10 +170,11 @@ entry:
 declare void @f11(i32, double)
 
 ; $f12, $f14, $6, $7
-; CHECK: lwc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
-; CHECK: ori $6
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall12:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
+; CHECK-DAG: lui $6, 16672
+; CHECK-DAG: lui $7, 16808
 define void @testlowercall12() nounwind {
 entry:
   tail call void @f12(float 2.800000e+01, float 1.900000e+01, float 1.000000e+01, float 2.100000e+01) nounwind
@@ -163,10 +184,11 @@ entry:
 declare void @f12(float, float, float, float)
 
 ; $f12, $5, $6, $7
-; CHECK: lwc1 $f12, %lo
-; CHECK: addiu $5, $zero, 83
-; CHECK: ori $6
-; CHECK: addiu $7, $zero, 25
+; CHECK-LABEL: testlowercall13:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: addiu $5, $zero, 83
+; CHECK-DAG: lui $6, 16800
+; CHECK-DAG: addiu $7, $zero, 25
 define void @testlowercall13() nounwind {
 entry:
   tail call void @f13(float 3.800000e+01, i32 83, float 2.000000e+01, i32 25) nounwind
@@ -177,9 +199,10 @@ entry:
 declare void @f13(float, i32, float, i32)
 
 ; $f12, $f14, $7
-; CHECK: ldc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall14:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
+; CHECK-DAG: lui $7, 16880
 define void @testlowercall14() nounwind {
 entry:
   tail call void @f14(double 3.500000e+01, float 2.900000e+01, float 3.000000e+01) nounwind
@@ -189,10 +212,15 @@ entry:
 declare void @f14(double, float, float)
 
 ; $f12, $f14, ($6, $7)
-; CHECK: lwc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
-; CHECK: addiu $6, $zero, 0
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall15:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
+; FP32EL-LABEL: testlowercall15:
+; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
+; FP64EL-LABEL: testlowercall15:
+; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall15() nounwind {
 entry:
   tail call void @f15(float 4.800000e+01, float 3.900000e+01, double 3.700000e+01) nounwind
@@ -202,10 +230,11 @@ entry:
 declare void @f15(float, float, double)
 
 ; $4, $5, $6, $7
-; CHECK: addiu $4, $zero, 62
-; CHECK: ori $5
-; CHECK: addiu $6, $zero, 64
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall16:
+; CHECK-DAG: addiu $4, $zero, 62
+; CHECK-DAG: lui $5, 16964
+; CHECK-DAG: addiu $6, $zero, 64
+; CHECK-DAG: lui $7, 16888
 define void @testlowercall16() nounwind {
 entry:
   tail call void @f16(i32 62, float 4.900000e+01, i32 64, float 3.100000e+01) nounwind
@@ -215,10 +244,11 @@ entry:
 declare void @f16(i32, float, i32, float)
 
 ; $4, $5, $6, $7
-; CHECK: addiu $4, $zero, 72
-; CHECK: ori $5
-; CHECK: addiu $6, $zero, 74
-; CHECK: addiu $7, $zero, 35
+; CHECK-LABEL: testlowercall17:
+; CHECK-DAG: addiu $4, $zero, 72
+; CHECK-DAG: lui $5, 17004
+; CHECK-DAG: addiu $6, $zero, 74
+; CHECK-DAG: addiu $7, $zero, 35
 define void @testlowercall17() nounwind {
 entry:
   tail call void @f17(i32 72, float 5.900000e+01, i32 74, i32 35) nounwind
@@ -228,10 +258,11 @@ entry:
 declare void @f17(i32, float, i32, i32)
 
 ; $4, $5, $6, $7
-; CHECK: addiu $4, $zero, 82
-; CHECK: addiu $5, $zero, 93
-; CHECK: ori $6
-; CHECK: addiu $7, $zero, 45
+; CHECK-LABEL: testlowercall18:
+; CHECK-DAG: addiu $4, $zero, 82
+; CHECK-DAG: addiu $5, $zero, 93
+; CHECK-DAG: lui $6, 16928
+; CHECK-DAG: addiu $7, $zero, 45
 define void @testlowercall18() nounwind {
 entry:
   tail call void @f18(i32 82, i32 93, float 4.000000e+01, i32 45) nounwind
@@ -242,11 +273,16 @@ declare void @f18(i32, i32, float, i32)
 
 
 ; $4, ($6, $7), stack
-; CHECK: sw
-; CHECK: sw
-; CHECK: addiu $4, $zero, 92
-; CHECK: addiu $6, $zero, 0
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall20:
+; CHECK-DAG: addiu $4, $zero, 92
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
+; FP32EL-LABEL: testlowercall20:
+; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
+; FP64EL-LABEL: testlowercall20:
+; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall20() nounwind {
 entry:
   tail call void @f20(i32 92, double 2.600000e+01, double 4.700000e+01) nounwind
@@ -256,8 +292,9 @@ entry:
 declare void @f20(i32, double, double)
 
 ; $f12, $5
-; CHECK: lwc1 $f12, %lo
-; CHECK: addiu $5, $zero, 103
+; CHECK-LABEL: testlowercall21:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: addiu $5, $zero, 103
 define void @testlowercall21() nounwind {
 entry:
   tail call void @f21(float 5.800000e+01, i32 103) nounwind
@@ -267,10 +304,15 @@ entry:
 declare void @f21(float, i32)
 
 ; $f12, $5, ($6, $7)
-; CHECK: lwc1 $f12, %lo
-; CHECK: addiu $5, $zero, 113
-; CHECK: addiu $6, $zero, 0
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall22:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: addiu $5, $zero, 113
+; FP32EL-LABEL: testlowercall22:
+; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
+; FP64EL-LABEL: testlowercall22:
+; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
 define void @testlowercall22() nounwind {
 entry:
   tail call void @f22(float 6.800000e+01, i32 113, double 5.700000e+01) nounwind
@@ -280,8 +322,9 @@ entry:
 declare void @f22(float, i32, double)
 
 ; $f12, f6
-; CHECK: ldc1 $f12, %lo
-; CHECK: addiu $6, $zero, 123
+; CHECK-LABEL: testlowercall23:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: addiu $6, $zero, 123
 define void @testlowercall23() nounwind {
 entry:
   tail call void @f23(double 4.500000e+01, i32 123) nounwind
@@ -291,10 +334,11 @@ entry:
 declare void @f23(double, i32)
 
 ; $f12,$6, stack
-; CHECK: sw
-; CHECK: sw
-; CHECK: ldc1 $f12, %lo
-; CHECK: addiu $6, $zero, 133
+; CHECK-LABEL: testlowercall24:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: addiu $6, $zero, 133
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
 define void @testlowercall24() nounwind {
 entry:
   tail call void @f24(double 5.500000e+01, i32 133, double 6.700000e+01) nounwind
@@ -303,19 +347,19 @@ entry:
 
 declare void @f24(double, i32, double)
 
-; CHECK: lwc1 $f12, %lo
-; lwc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
-; CHECK: ori $6
-; CHECK: ori $7
-; CHECK: lwc1 $f12, %lo
-; CHECK: addiu $5, $zero, 83
-; CHECK: ori $6
-; CHECK: addiu $7, $zero, 25
-; CHECK: addiu $4, $zero, 82
-; CHECK: addiu $5, $zero, 93
-; CHECK: ori $6
-; CHECK: addiu $7, $zero, 45
+; CHECK-LABEL: testlowercall25:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
+; CHECK-DAG: lui $6
+; CHECK-DAG: lui $7
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: addiu $5, $zero, 83
+; CHECK-DAG: lui $6
+; CHECK-DAG: addiu $7, $zero, 25
+; CHECK-DAG: addiu $4, $zero, 82
+; CHECK-DAG: addiu $5, $zero, 93
+; CHECK-DAG: lui $6
+; CHECK-DAG: addiu $7, $zero, 45
 define void @testlowercall25() nounwind {
 entry:
   tail call void @f12(float 2.800000e+01, float 1.900000e+01, float 1.000000e+01, float 2.100000e+01) nounwind
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
index 0a8f85f..5db47ac 100644
--- a/test/CodeGen/Mips/o32_cc_byval.ll
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -10,22 +10,23 @@
 
 define void @f1() nounwind {
 entry:
-; CHECK: lw  $[[R1:[0-9]+]], %got(f1.s1)
-; CHECK: addiu $[[R0:[0-9]+]], $[[R1]], %lo(f1.s1)
-; CHECK: lw  $[[R7:[0-9]+]], 12($[[R0]])
-; CHECK: lw  $[[R3:[0-9]+]], 16($[[R0]])
-; CHECK: lw  $[[R4:[0-9]+]], 20($[[R0]])
-; CHECK: lw  $[[R5:[0-9]+]], 24($[[R0]])
-; CHECK: lw  $[[R6:[0-9]+]], 28($[[R0]])
-; CHECK: sw  $[[R6]], 36($sp)
-; CHECK: sw  $[[R5]], 32($sp)
-; CHECK: sw  $[[R4]], 28($sp)
-; CHECK: sw  $[[R3]], 24($sp)
-; CHECK: sw  $[[R7]], 20($sp)
-; CHECK: lw  $[[R2:[0-9]+]], 8($[[R0]])
-; CHECK: sw  $[[R2]], 16($sp)
-; CHECK: lw  $6, %lo(f1.s1)($[[R1]])
-; CHECK: lw  $7, 4($[[R0]])
+; CHECK-LABEL: f1:
+; CHECK-DAG: lw  $[[R1:[0-9]+]], %got(f1.s1)
+; CHECK-DAG: addiu $[[R0:[0-9]+]], $[[R1]], %lo(f1.s1)
+; CHECK-DAG: lw  $[[R7:[0-9]+]], 12($[[R0]])
+; CHECK-DAG: lw  $[[R3:[0-9]+]], 16($[[R0]])
+; CHECK-DAG: lw  $[[R4:[0-9]+]], 20($[[R0]])
+; CHECK-DAG: lw  $[[R5:[0-9]+]], 24($[[R0]])
+; CHECK-DAG: lw  $[[R6:[0-9]+]], 28($[[R0]])
+; CHECK-DAG: sw  $[[R6]], 36($sp)
+; CHECK-DAG: sw  $[[R5]], 32($sp)
+; CHECK-DAG: sw  $[[R4]], 28($sp)
+; CHECK-DAG: sw  $[[R3]], 24($sp)
+; CHECK-DAG: sw  $[[R7]], 20($sp)
+; CHECK-DAG: lw  $[[R2:[0-9]+]], 8($[[R0]])
+; CHECK-DAG: sw  $[[R2]], 16($sp)
+; CHECK-DAG: lw  $6, %lo(f1.s1)($[[R1]])
+; CHECK-DAG: lw  $7, 4($[[R0]])
   %agg.tmp10 = alloca %struct.S3, align 4
   call void @callee1(float 2.000000e+01, %struct.S1* byval bitcast (%0* @f1.s1 to %struct.S1*)) nounwind
   call void @callee2(%struct.S2* byval @f1.s2) nounwind
@@ -61,17 +62,17 @@ entry:
 ; CHECK: mfc1 $6, $f[[F0]]
 
   %i2 = getelementptr inbounds %struct.S1* %s1, i32 0, i32 5
-  %tmp = load i32* %i2, align 4, !tbaa !0
+  %tmp = load i32* %i2, align 4
   %d = getelementptr inbounds %struct.S1* %s1, i32 0, i32 4
-  %tmp1 = load double* %d, align 8, !tbaa !3
+  %tmp1 = load double* %d, align 8
   %ll = getelementptr inbounds %struct.S1* %s1, i32 0, i32 3
-  %tmp2 = load i64* %ll, align 8, !tbaa !4
+  %tmp2 = load i64* %ll, align 8
   %i = getelementptr inbounds %struct.S1* %s1, i32 0, i32 2
-  %tmp3 = load i32* %i, align 4, !tbaa !0
+  %tmp3 = load i32* %i, align 4
   %s = getelementptr inbounds %struct.S1* %s1, i32 0, i32 1
-  %tmp4 = load i16* %s, align 2, !tbaa !5
+  %tmp4 = load i16* %s, align 2
   %c = getelementptr inbounds %struct.S1* %s1, i32 0, i32 0
-  %tmp5 = load i8* %c, align 1, !tbaa !1
+  %tmp5 = load i8* %c, align 1
   tail call void @callee4(i32 %tmp, double %tmp1, i64 %tmp2, i32 %tmp3, i16 signext %tmp4, i8 signext %tmp5, float %f) nounwind
   ret void
 }
@@ -90,9 +91,9 @@ entry:
 ; CHECK: sw  $[[R0]], 24($sp)
 
   %arrayidx = getelementptr inbounds %struct.S2* %s2, i32 0, i32 0, i32 0
-  %tmp = load i32* %arrayidx, align 4, !tbaa !0
+  %tmp = load i32* %arrayidx, align 4
   %arrayidx2 = getelementptr inbounds %struct.S2* %s2, i32 0, i32 0, i32 3
-  %tmp3 = load i32* %arrayidx2, align 4, !tbaa !0
+  %tmp3 = load i32* %arrayidx2, align 4
   tail call void @callee4(i32 %tmp, double 2.000000e+00, i64 3, i32 %tmp3, i16 signext 4, i8 signext 5, float 6.000000e+00) nounwind
   ret void
 }
@@ -110,11 +111,11 @@ entry:
 ; CHECK: sw  $[[R1]], 24($sp)
 
   %i = getelementptr inbounds %struct.S1* %s1, i32 0, i32 2
-  %tmp = load i32* %i, align 4, !tbaa !0
+  %tmp = load i32* %i, align 4
   %i2 = getelementptr inbounds %struct.S1* %s1, i32 0, i32 5
-  %tmp1 = load i32* %i2, align 4, !tbaa !0
+  %tmp1 = load i32* %i2, align 4
   %c = getelementptr inbounds %struct.S3* %s3, i32 0, i32 0
-  %tmp2 = load i8* %c, align 1, !tbaa !1
+  %tmp2 = load i8* %c, align 1
   tail call void @callee4(i32 %tmp, double 2.000000e+00, i64 3, i32 %tmp1, i16 signext 4, i8 signext %tmp2, float 6.000000e+00) nounwind
   ret void
 }
@@ -128,10 +129,3 @@ entry:
 }
 
 declare void @f6(%struct.S4* nocapture byval, i64)
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"double", metadata !1}
-!4 = metadata !{metadata !"long long", metadata !1}
-!5 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/Mips/powif64_16.ll b/test/CodeGen/Mips/powif64_16.ll
new file mode 100644
index 0000000..35a7ca9
--- /dev/null
+++ b/test/CodeGen/Mips/powif64_16.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s 
+
+declare float     @llvm.powi.f32(float  %Val, i32 %power)
+declare double    @llvm.powi.f64(double %Val, i32 %power)
+
+define float @foo_pow_f32(float %y, i32 %p)  {
+  %1 = tail call float @llvm.powi.f32(float %y, i32 %p)
+; CHECK-NOT: .ent	__call_stub_fp_llvm.powi.f32
+; CHECK-NOT: {{.*}} jal llvm.powi.f32
+  ret float %1
+} 
+
+define double @foo_pow_f64(double %y, i32 %p)  {
+  %1 = tail call double @llvm.powi.f64(double %y, i32 %p)
+; CHECK-NOT: .ent	__call_stub_fp_llvm.powi.f64
+; CHECK-NOT: {{.*}} jal llvm.powi.f64 
+  ret double %1
+} 
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind readonly }
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"int", metadata !1}
diff --git a/test/CodeGen/Mips/ra-allocatable.ll b/test/CodeGen/Mips/ra-allocatable.ll
index 7621788..afc5cb0 100644
--- a/test/CodeGen/Mips/ra-allocatable.ll
+++ b/test/CodeGen/Mips/ra-allocatable.ll
@@ -98,191 +98,186 @@ entry:
 ; CHECK: lw  $ra, {{[0-9]+}}($sp)            # 4-byte Folded Reload
 ; CHECK: jr  $ra
 
-  %0 = load i32* @a0, align 4, !tbaa !0
-  %1 = load i32** @b0, align 4, !tbaa !3
-  store i32 %0, i32* %1, align 4, !tbaa !0
-  %2 = load i32* @a1, align 4, !tbaa !0
-  %3 = load i32** @b1, align 4, !tbaa !3
-  store i32 %2, i32* %3, align 4, !tbaa !0
-  %4 = load i32* @a2, align 4, !tbaa !0
-  %5 = load i32** @b2, align 4, !tbaa !3
-  store i32 %4, i32* %5, align 4, !tbaa !0
-  %6 = load i32* @a3, align 4, !tbaa !0
-  %7 = load i32** @b3, align 4, !tbaa !3
-  store i32 %6, i32* %7, align 4, !tbaa !0
-  %8 = load i32* @a4, align 4, !tbaa !0
-  %9 = load i32** @b4, align 4, !tbaa !3
-  store i32 %8, i32* %9, align 4, !tbaa !0
-  %10 = load i32* @a5, align 4, !tbaa !0
-  %11 = load i32** @b5, align 4, !tbaa !3
-  store i32 %10, i32* %11, align 4, !tbaa !0
-  %12 = load i32* @a6, align 4, !tbaa !0
-  %13 = load i32** @b6, align 4, !tbaa !3
-  store i32 %12, i32* %13, align 4, !tbaa !0
-  %14 = load i32* @a7, align 4, !tbaa !0
-  %15 = load i32** @b7, align 4, !tbaa !3
-  store i32 %14, i32* %15, align 4, !tbaa !0
-  %16 = load i32* @a8, align 4, !tbaa !0
-  %17 = load i32** @b8, align 4, !tbaa !3
-  store i32 %16, i32* %17, align 4, !tbaa !0
-  %18 = load i32* @a9, align 4, !tbaa !0
-  %19 = load i32** @b9, align 4, !tbaa !3
-  store i32 %18, i32* %19, align 4, !tbaa !0
-  %20 = load i32* @a10, align 4, !tbaa !0
-  %21 = load i32** @b10, align 4, !tbaa !3
-  store i32 %20, i32* %21, align 4, !tbaa !0
-  %22 = load i32* @a11, align 4, !tbaa !0
-  %23 = load i32** @b11, align 4, !tbaa !3
-  store i32 %22, i32* %23, align 4, !tbaa !0
-  %24 = load i32* @a12, align 4, !tbaa !0
-  %25 = load i32** @b12, align 4, !tbaa !3
-  store i32 %24, i32* %25, align 4, !tbaa !0
-  %26 = load i32* @a13, align 4, !tbaa !0
-  %27 = load i32** @b13, align 4, !tbaa !3
-  store i32 %26, i32* %27, align 4, !tbaa !0
-  %28 = load i32* @a14, align 4, !tbaa !0
-  %29 = load i32** @b14, align 4, !tbaa !3
-  store i32 %28, i32* %29, align 4, !tbaa !0
-  %30 = load i32* @a15, align 4, !tbaa !0
-  %31 = load i32** @b15, align 4, !tbaa !3
-  store i32 %30, i32* %31, align 4, !tbaa !0
-  %32 = load i32* @a16, align 4, !tbaa !0
-  %33 = load i32** @b16, align 4, !tbaa !3
-  store i32 %32, i32* %33, align 4, !tbaa !0
-  %34 = load i32* @a17, align 4, !tbaa !0
-  %35 = load i32** @b17, align 4, !tbaa !3
-  store i32 %34, i32* %35, align 4, !tbaa !0
-  %36 = load i32* @a18, align 4, !tbaa !0
-  %37 = load i32** @b18, align 4, !tbaa !3
-  store i32 %36, i32* %37, align 4, !tbaa !0
-  %38 = load i32* @a19, align 4, !tbaa !0
-  %39 = load i32** @b19, align 4, !tbaa !3
-  store i32 %38, i32* %39, align 4, !tbaa !0
-  %40 = load i32* @a20, align 4, !tbaa !0
-  %41 = load i32** @b20, align 4, !tbaa !3
-  store i32 %40, i32* %41, align 4, !tbaa !0
-  %42 = load i32* @a21, align 4, !tbaa !0
-  %43 = load i32** @b21, align 4, !tbaa !3
-  store i32 %42, i32* %43, align 4, !tbaa !0
-  %44 = load i32* @a22, align 4, !tbaa !0
-  %45 = load i32** @b22, align 4, !tbaa !3
-  store i32 %44, i32* %45, align 4, !tbaa !0
-  %46 = load i32* @a23, align 4, !tbaa !0
-  %47 = load i32** @b23, align 4, !tbaa !3
-  store i32 %46, i32* %47, align 4, !tbaa !0
-  %48 = load i32* @a24, align 4, !tbaa !0
-  %49 = load i32** @b24, align 4, !tbaa !3
-  store i32 %48, i32* %49, align 4, !tbaa !0
-  %50 = load i32* @a25, align 4, !tbaa !0
-  %51 = load i32** @b25, align 4, !tbaa !3
-  store i32 %50, i32* %51, align 4, !tbaa !0
-  %52 = load i32* @a26, align 4, !tbaa !0
-  %53 = load i32** @b26, align 4, !tbaa !3
-  store i32 %52, i32* %53, align 4, !tbaa !0
-  %54 = load i32* @a27, align 4, !tbaa !0
-  %55 = load i32** @b27, align 4, !tbaa !3
-  store i32 %54, i32* %55, align 4, !tbaa !0
-  %56 = load i32* @a28, align 4, !tbaa !0
-  %57 = load i32** @b28, align 4, !tbaa !3
-  store i32 %56, i32* %57, align 4, !tbaa !0
-  %58 = load i32* @a29, align 4, !tbaa !0
-  %59 = load i32** @b29, align 4, !tbaa !3
-  store i32 %58, i32* %59, align 4, !tbaa !0
-  %60 = load i32* @a0, align 4, !tbaa !0
-  %61 = load i32** @c0, align 4, !tbaa !3
-  store i32 %60, i32* %61, align 4, !tbaa !0
-  %62 = load i32* @a1, align 4, !tbaa !0
-  %63 = load i32** @c1, align 4, !tbaa !3
-  store i32 %62, i32* %63, align 4, !tbaa !0
-  %64 = load i32* @a2, align 4, !tbaa !0
-  %65 = load i32** @c2, align 4, !tbaa !3
-  store i32 %64, i32* %65, align 4, !tbaa !0
-  %66 = load i32* @a3, align 4, !tbaa !0
-  %67 = load i32** @c3, align 4, !tbaa !3
-  store i32 %66, i32* %67, align 4, !tbaa !0
-  %68 = load i32* @a4, align 4, !tbaa !0
-  %69 = load i32** @c4, align 4, !tbaa !3
-  store i32 %68, i32* %69, align 4, !tbaa !0
-  %70 = load i32* @a5, align 4, !tbaa !0
-  %71 = load i32** @c5, align 4, !tbaa !3
-  store i32 %70, i32* %71, align 4, !tbaa !0
-  %72 = load i32* @a6, align 4, !tbaa !0
-  %73 = load i32** @c6, align 4, !tbaa !3
-  store i32 %72, i32* %73, align 4, !tbaa !0
-  %74 = load i32* @a7, align 4, !tbaa !0
-  %75 = load i32** @c7, align 4, !tbaa !3
-  store i32 %74, i32* %75, align 4, !tbaa !0
-  %76 = load i32* @a8, align 4, !tbaa !0
-  %77 = load i32** @c8, align 4, !tbaa !3
-  store i32 %76, i32* %77, align 4, !tbaa !0
-  %78 = load i32* @a9, align 4, !tbaa !0
-  %79 = load i32** @c9, align 4, !tbaa !3
-  store i32 %78, i32* %79, align 4, !tbaa !0
-  %80 = load i32* @a10, align 4, !tbaa !0
-  %81 = load i32** @c10, align 4, !tbaa !3
-  store i32 %80, i32* %81, align 4, !tbaa !0
-  %82 = load i32* @a11, align 4, !tbaa !0
-  %83 = load i32** @c11, align 4, !tbaa !3
-  store i32 %82, i32* %83, align 4, !tbaa !0
-  %84 = load i32* @a12, align 4, !tbaa !0
-  %85 = load i32** @c12, align 4, !tbaa !3
-  store i32 %84, i32* %85, align 4, !tbaa !0
-  %86 = load i32* @a13, align 4, !tbaa !0
-  %87 = load i32** @c13, align 4, !tbaa !3
-  store i32 %86, i32* %87, align 4, !tbaa !0
-  %88 = load i32* @a14, align 4, !tbaa !0
-  %89 = load i32** @c14, align 4, !tbaa !3
-  store i32 %88, i32* %89, align 4, !tbaa !0
-  %90 = load i32* @a15, align 4, !tbaa !0
-  %91 = load i32** @c15, align 4, !tbaa !3
-  store i32 %90, i32* %91, align 4, !tbaa !0
-  %92 = load i32* @a16, align 4, !tbaa !0
-  %93 = load i32** @c16, align 4, !tbaa !3
-  store i32 %92, i32* %93, align 4, !tbaa !0
-  %94 = load i32* @a17, align 4, !tbaa !0
-  %95 = load i32** @c17, align 4, !tbaa !3
-  store i32 %94, i32* %95, align 4, !tbaa !0
-  %96 = load i32* @a18, align 4, !tbaa !0
-  %97 = load i32** @c18, align 4, !tbaa !3
-  store i32 %96, i32* %97, align 4, !tbaa !0
-  %98 = load i32* @a19, align 4, !tbaa !0
-  %99 = load i32** @c19, align 4, !tbaa !3
-  store i32 %98, i32* %99, align 4, !tbaa !0
-  %100 = load i32* @a20, align 4, !tbaa !0
-  %101 = load i32** @c20, align 4, !tbaa !3
-  store i32 %100, i32* %101, align 4, !tbaa !0
-  %102 = load i32* @a21, align 4, !tbaa !0
-  %103 = load i32** @c21, align 4, !tbaa !3
-  store i32 %102, i32* %103, align 4, !tbaa !0
-  %104 = load i32* @a22, align 4, !tbaa !0
-  %105 = load i32** @c22, align 4, !tbaa !3
-  store i32 %104, i32* %105, align 4, !tbaa !0
-  %106 = load i32* @a23, align 4, !tbaa !0
-  %107 = load i32** @c23, align 4, !tbaa !3
-  store i32 %106, i32* %107, align 4, !tbaa !0
-  %108 = load i32* @a24, align 4, !tbaa !0
-  %109 = load i32** @c24, align 4, !tbaa !3
-  store i32 %108, i32* %109, align 4, !tbaa !0
-  %110 = load i32* @a25, align 4, !tbaa !0
-  %111 = load i32** @c25, align 4, !tbaa !3
-  store i32 %110, i32* %111, align 4, !tbaa !0
-  %112 = load i32* @a26, align 4, !tbaa !0
-  %113 = load i32** @c26, align 4, !tbaa !3
-  store i32 %112, i32* %113, align 4, !tbaa !0
-  %114 = load i32* @a27, align 4, !tbaa !0
-  %115 = load i32** @c27, align 4, !tbaa !3
-  store i32 %114, i32* %115, align 4, !tbaa !0
-  %116 = load i32* @a28, align 4, !tbaa !0
-  %117 = load i32** @c28, align 4, !tbaa !3
-  store i32 %116, i32* %117, align 4, !tbaa !0
-  %118 = load i32* @a29, align 4, !tbaa !0
-  %119 = load i32** @c29, align 4, !tbaa !3
-  store i32 %118, i32* %119, align 4, !tbaa !0
-  %120 = load i32* @a0, align 4, !tbaa !0
+  %0 = load i32* @a0, align 4
+  %1 = load i32** @b0, align 4
+  store i32 %0, i32* %1, align 4
+  %2 = load i32* @a1, align 4
+  %3 = load i32** @b1, align 4
+  store i32 %2, i32* %3, align 4
+  %4 = load i32* @a2, align 4
+  %5 = load i32** @b2, align 4
+  store i32 %4, i32* %5, align 4
+  %6 = load i32* @a3, align 4
+  %7 = load i32** @b3, align 4
+  store i32 %6, i32* %7, align 4
+  %8 = load i32* @a4, align 4
+  %9 = load i32** @b4, align 4
+  store i32 %8, i32* %9, align 4
+  %10 = load i32* @a5, align 4
+  %11 = load i32** @b5, align 4
+  store i32 %10, i32* %11, align 4
+  %12 = load i32* @a6, align 4
+  %13 = load i32** @b6, align 4
+  store i32 %12, i32* %13, align 4
+  %14 = load i32* @a7, align 4
+  %15 = load i32** @b7, align 4
+  store i32 %14, i32* %15, align 4
+  %16 = load i32* @a8, align 4
+  %17 = load i32** @b8, align 4
+  store i32 %16, i32* %17, align 4
+  %18 = load i32* @a9, align 4
+  %19 = load i32** @b9, align 4
+  store i32 %18, i32* %19, align 4
+  %20 = load i32* @a10, align 4
+  %21 = load i32** @b10, align 4
+  store i32 %20, i32* %21, align 4
+  %22 = load i32* @a11, align 4
+  %23 = load i32** @b11, align 4
+  store i32 %22, i32* %23, align 4
+  %24 = load i32* @a12, align 4
+  %25 = load i32** @b12, align 4
+  store i32 %24, i32* %25, align 4
+  %26 = load i32* @a13, align 4
+  %27 = load i32** @b13, align 4
+  store i32 %26, i32* %27, align 4
+  %28 = load i32* @a14, align 4
+  %29 = load i32** @b14, align 4
+  store i32 %28, i32* %29, align 4
+  %30 = load i32* @a15, align 4
+  %31 = load i32** @b15, align 4
+  store i32 %30, i32* %31, align 4
+  %32 = load i32* @a16, align 4
+  %33 = load i32** @b16, align 4
+  store i32 %32, i32* %33, align 4
+  %34 = load i32* @a17, align 4
+  %35 = load i32** @b17, align 4
+  store i32 %34, i32* %35, align 4
+  %36 = load i32* @a18, align 4
+  %37 = load i32** @b18, align 4
+  store i32 %36, i32* %37, align 4
+  %38 = load i32* @a19, align 4
+  %39 = load i32** @b19, align 4
+  store i32 %38, i32* %39, align 4
+  %40 = load i32* @a20, align 4
+  %41 = load i32** @b20, align 4
+  store i32 %40, i32* %41, align 4
+  %42 = load i32* @a21, align 4
+  %43 = load i32** @b21, align 4
+  store i32 %42, i32* %43, align 4
+  %44 = load i32* @a22, align 4
+  %45 = load i32** @b22, align 4
+  store i32 %44, i32* %45, align 4
+  %46 = load i32* @a23, align 4
+  %47 = load i32** @b23, align 4
+  store i32 %46, i32* %47, align 4
+  %48 = load i32* @a24, align 4
+  %49 = load i32** @b24, align 4
+  store i32 %48, i32* %49, align 4
+  %50 = load i32* @a25, align 4
+  %51 = load i32** @b25, align 4
+  store i32 %50, i32* %51, align 4
+  %52 = load i32* @a26, align 4
+  %53 = load i32** @b26, align 4
+  store i32 %52, i32* %53, align 4
+  %54 = load i32* @a27, align 4
+  %55 = load i32** @b27, align 4
+  store i32 %54, i32* %55, align 4
+  %56 = load i32* @a28, align 4
+  %57 = load i32** @b28, align 4
+  store i32 %56, i32* %57, align 4
+  %58 = load i32* @a29, align 4
+  %59 = load i32** @b29, align 4
+  store i32 %58, i32* %59, align 4
+  %60 = load i32* @a0, align 4
+  %61 = load i32** @c0, align 4
+  store i32 %60, i32* %61, align 4
+  %62 = load i32* @a1, align 4
+  %63 = load i32** @c1, align 4
+  store i32 %62, i32* %63, align 4
+  %64 = load i32* @a2, align 4
+  %65 = load i32** @c2, align 4
+  store i32 %64, i32* %65, align 4
+  %66 = load i32* @a3, align 4
+  %67 = load i32** @c3, align 4
+  store i32 %66, i32* %67, align 4
+  %68 = load i32* @a4, align 4
+  %69 = load i32** @c4, align 4
+  store i32 %68, i32* %69, align 4
+  %70 = load i32* @a5, align 4
+  %71 = load i32** @c5, align 4
+  store i32 %70, i32* %71, align 4
+  %72 = load i32* @a6, align 4
+  %73 = load i32** @c6, align 4
+  store i32 %72, i32* %73, align 4
+  %74 = load i32* @a7, align 4
+  %75 = load i32** @c7, align 4
+  store i32 %74, i32* %75, align 4
+  %76 = load i32* @a8, align 4
+  %77 = load i32** @c8, align 4
+  store i32 %76, i32* %77, align 4
+  %78 = load i32* @a9, align 4
+  %79 = load i32** @c9, align 4
+  store i32 %78, i32* %79, align 4
+  %80 = load i32* @a10, align 4
+  %81 = load i32** @c10, align 4
+  store i32 %80, i32* %81, align 4
+  %82 = load i32* @a11, align 4
+  %83 = load i32** @c11, align 4
+  store i32 %82, i32* %83, align 4
+  %84 = load i32* @a12, align 4
+  %85 = load i32** @c12, align 4
+  store i32 %84, i32* %85, align 4
+  %86 = load i32* @a13, align 4
+  %87 = load i32** @c13, align 4
+  store i32 %86, i32* %87, align 4
+  %88 = load i32* @a14, align 4
+  %89 = load i32** @c14, align 4
+  store i32 %88, i32* %89, align 4
+  %90 = load i32* @a15, align 4
+  %91 = load i32** @c15, align 4
+  store i32 %90, i32* %91, align 4
+  %92 = load i32* @a16, align 4
+  %93 = load i32** @c16, align 4
+  store i32 %92, i32* %93, align 4
+  %94 = load i32* @a17, align 4
+  %95 = load i32** @c17, align 4
+  store i32 %94, i32* %95, align 4
+  %96 = load i32* @a18, align 4
+  %97 = load i32** @c18, align 4
+  store i32 %96, i32* %97, align 4
+  %98 = load i32* @a19, align 4
+  %99 = load i32** @c19, align 4
+  store i32 %98, i32* %99, align 4
+  %100 = load i32* @a20, align 4
+  %101 = load i32** @c20, align 4
+  store i32 %100, i32* %101, align 4
+  %102 = load i32* @a21, align 4
+  %103 = load i32** @c21, align 4
+  store i32 %102, i32* %103, align 4
+  %104 = load i32* @a22, align 4
+  %105 = load i32** @c22, align 4
+  store i32 %104, i32* %105, align 4
+  %106 = load i32* @a23, align 4
+  %107 = load i32** @c23, align 4
+  store i32 %106, i32* %107, align 4
+  %108 = load i32* @a24, align 4
+  %109 = load i32** @c24, align 4
+  store i32 %108, i32* %109, align 4
+  %110 = load i32* @a25, align 4
+  %111 = load i32** @c25, align 4
+  store i32 %110, i32* %111, align 4
+  %112 = load i32* @a26, align 4
+  %113 = load i32** @c26, align 4
+  store i32 %112, i32* %113, align 4
+  %114 = load i32* @a27, align 4
+  %115 = load i32** @c27, align 4
+  store i32 %114, i32* %115, align 4
+  %116 = load i32* @a28, align 4
+  %117 = load i32** @c28, align 4
+  store i32 %116, i32* %117, align 4
+  %118 = load i32* @a29, align 4
+  %119 = load i32** @c29, align 4
+  store i32 %118, i32* %119, align 4
+  %120 = load i32* @a0, align 4
   ret i32 %120
 }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/test/CodeGen/Mips/rotate.ll b/test/CodeGen/Mips/rotate.ll
index 4f3cfb7..813bbdf 100644
--- a/test/CodeGen/Mips/rotate.ll
+++ b/test/CodeGen/Mips/rotate.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 -soft-float -mips16-hard-float   < %s | FileCheck %s -check-prefix=mips16 
 
 ; CHECK:  rotrv $2, $4
+; mips16: .ent rot0
 define i32 @rot0(i32 %a, i32 %b) nounwind readnone {
 entry:
   %shl = shl i32 %a, %b
@@ -11,6 +13,7 @@ entry:
 }
 
 ; CHECK:  rotr  $2, $4, 22
+; mips16: .ent rot1
 define i32 @rot1(i32 %a) nounwind readnone {
 entry:
   %shl = shl i32 %a, 10
@@ -20,6 +23,7 @@ entry:
 }
 
 ; CHECK:  rotrv $2, $4, $5
+; mips16: .ent rot2
 define i32 @rot2(i32 %a, i32 %b) nounwind readnone {
 entry:
   %shr = lshr i32 %a, %b
@@ -30,6 +34,7 @@ entry:
 }
 
 ; CHECK:  rotr  $2, $4, 10
+; mips16: .ent rot3
 define i32 @rot3(i32 %a) nounwind readnone {
 entry:
   %shr = lshr i32 %a, 10
diff --git a/test/CodeGen/Mips/sel1c.ll b/test/CodeGen/Mips/sel1c.ll
new file mode 100644
index 0000000..4c4784d
--- /dev/null
+++ b/test/CodeGen/Mips/sel1c.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands   < %s | FileCheck %s -check-prefix=cond-b-short
+
+@i = global i32 1, align 4
+@j = global i32 2, align 4
+@k = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define void @t() #0 {
+entry:
+  %0 = load i32* @i, align 4
+  %1 = load i32* @j, align 4
+  %cmp = icmp eq i32 %0, %1
+  %cond = select i1 %cmp, i32 1, i32 2
+  store i32 %cond, i32* @k, align 4
+  ret void
+; cond-b-short:	bteqz	$BB0_{{[0-9]+}}  # 16 bit inst
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+
diff --git a/test/CodeGen/Mips/sel2c.ll b/test/CodeGen/Mips/sel2c.ll
new file mode 100644
index 0000000..25dfaa9
--- /dev/null
+++ b/test/CodeGen/Mips/sel2c.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands   < %s | FileCheck %s -check-prefix=cond-b-short
+
+@i = global i32 1, align 4
+@j = global i32 2, align 4
+@k = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define void @t() #0 {
+entry:
+  %0 = load i32* @i, align 4
+  %1 = load i32* @j, align 4
+  %cmp = icmp ne i32 %0, %1
+  %cond = select i1 %cmp, i32 1, i32 2
+  store i32 %cond, i32* @k, align 4
+; cond-b-short:	btnez	$BB0_{{[0-9]+}}  # 16 bit inst
+  ret void
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+
diff --git a/test/CodeGen/Mips/simplebr.ll b/test/CodeGen/Mips/simplebr.ll
new file mode 100644
index 0000000..a1d6367
--- /dev/null
+++ b/test/CodeGen/Mips/simplebr.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s -check-prefix=CHECK-STATIC16
+
+; ModuleID = 'simplebr.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+@i = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+  %0 = load i32* @i, align 4
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  call void bitcast (void (...)* @goo to void ()*)()
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  call void bitcast (void (...)* @hoo to void ()*)()
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+; CHECK-STATIC16:	b	$BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst
+
+declare void @goo(...) #1
+
+declare void @hoo(...) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+
diff --git a/test/CodeGen/Mips/stack-alignment.ll b/test/CodeGen/Mips/stack-alignment.ll
new file mode 100644
index 0000000..b18f966
--- /dev/null
+++ b/test/CodeGen/Mips/stack-alignment.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=64
+
+; 32:      addiu  $sp, $sp, -8
+; 64:      addiu  $sp, $sp, -16
+
+define i32 @foo1() #0 {
+entry:
+  ret i32 14
+}
+
+attributes #0 = { "no-frame-pointer-elim"="true" }
diff --git a/test/CodeGen/Mips/tailcall.ll b/test/CodeGen/Mips/tailcall.ll
index bcd33fc..30f47ab 100644
--- a/test/CodeGen/Mips/tailcall.ll
+++ b/test/CodeGen/Mips/tailcall.ll
@@ -243,3 +243,16 @@ entry:
   ret i32 %call
 }
 
+; Check that there is a chain edge between the load and store nodes.
+;
+; PIC32-LABEL: caller14:
+; PIC32: lw ${{[0-9]+}}, 16($sp)
+; PIC32: sw $4, 16($sp)
+
+define void @caller14(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+entry:
+  tail call void @callee14(i32 %e, i32 %b, i32 %c, i32 %d, i32 %a)
+  ret void
+}
+
+declare void @callee14(i32, i32, i32, i32, i32)
diff --git a/test/CodeGen/Mips/tnaked.ll b/test/CodeGen/Mips/tnaked.ll
index edf1ecf..08f1ab5 100644
--- a/test/CodeGen/Mips/tnaked.ll
+++ b/test/CodeGen/Mips/tnaked.ll
@@ -25,5 +25,5 @@ entry:
 ; CHECK:	.fmask	0x00000000,0
 ; CHECK: 	addiu	$sp, $sp, -8
 
-attributes #0 = { naked noinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { naked noinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/NVPTX/bug17709.ll b/test/CodeGen/NVPTX/bug17709.ll
new file mode 100644
index 0000000..92f0fcb1
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug17709.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; ModuleID = '__kernelgen_main_module'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define linker_private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) {
+entry:
+  ;unreachable
+  %t0 = insertvalue {double, double} undef, double 1.0, 0
+  %t1 = insertvalue {double, double} %t0, double 1.0, 1
+  ret { double, double } %t1
+}
+
+%struct.descriptor_dimension.0.52 = type { i64, i64, i64 }
+%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
+%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
+@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096
+
+; CHECK: .visible .entry __kernelgen_main
+define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) {
+entry:
+  %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8)
+  ret void
+}
+
diff --git a/test/CodeGen/NVPTX/callchain.ll b/test/CodeGen/NVPTX/callchain.ll
new file mode 100644
index 0000000..60b118b
--- /dev/null
+++ b/test/CodeGen/NVPTX/callchain.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx"
+
+define void @foo(i8* %ptr) {
+  %fnptr = bitcast i8* %ptr to void ()*
+; CHECK: prototype_0 : .callprototype ()_ ()
+  tail call void %fnptr()
+  ret void
+}
diff --git a/test/CodeGen/NVPTX/constant-vectors.ll b/test/CodeGen/NVPTX/constant-vectors.ll
new file mode 100644
index 0000000..208c2d9
--- /dev/null
+++ b/test/CodeGen/NVPTX/constant-vectors.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-nvidia-cuda"
+
+; CHECK: .visible .global .align 16 .b8 testArray[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+@testArray = constant [2 x <4 x i8>] [<4 x i8> <i8 0, i8 1, i8 2, i8 3>, <4 x i8> <i8 4, i8 5, i8 6, i8 7>], align 16
diff --git a/test/CodeGen/NVPTX/implicit-def.ll b/test/CodeGen/NVPTX/implicit-def.ll
new file mode 100644
index 0000000..06d3d56
--- /dev/null
+++ b/test/CodeGen/NVPTX/implicit-def.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s
+
+; CHECK: // implicit-def: %f[[F0:[0-9]+]]
+; CHECK: add.f32         %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
+define float @foo(float %a) {
+  %ret = fadd float %a, undef
+  ret float %ret
+}
+
diff --git a/test/CodeGen/NVPTX/inline-asm.ll b/test/CodeGen/NVPTX/inline-asm.ll
new file mode 100644
index 0000000..d76eb42
--- /dev/null
+++ b/test/CodeGen/NVPTX/inline-asm.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+
+define float @test(float %x) {
+entry:
+; CHECK: ex2.approx.ftz.f32 %f{{[0-9]+}}, %f{{[0-9]+}}
+  %0 = call float asm "ex2.approx.ftz.f32 $0, $1;", "=f,f"(float %x)
+  ret float %0
+}
diff --git a/test/CodeGen/NVPTX/lit.local.cfg b/test/CodeGen/NVPTX/lit.local.cfg
index 7180c84..85cf8c2 100644
--- a/test/CodeGen/NVPTX/lit.local.cfg
+++ b/test/CodeGen/NVPTX/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'NVPTX' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/NVPTX/pr17529.ll b/test/CodeGen/NVPTX/pr17529.ll
new file mode 100644
index 0000000..a162142
--- /dev/null
+++ b/test/CodeGen/NVPTX/pr17529.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; Function Attrs: nounwind
+; CHECK: .func kernelgen_memcpy
+define ptx_device void @kernelgen_memcpy(i8* nocapture %dst) #0 {
+entry:
+  br i1 undef, label %for.end, label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %scevgep9 = getelementptr i8* %dst, i64 %index
+  %scevgep910 = bitcast i8* %scevgep9 to <4 x i8>*
+  store <4 x i8> undef, <4 x i8>* %scevgep910, align 1
+  %index.next = add i64 %index, 4
+  %0 = icmp eq i64 undef, %index.next
+  br i1 %0, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  br i1 undef, label %for.end, label %for.body.preheader1
+
+for.body.preheader1:                              ; preds = %middle.block
+  %scevgep2 = getelementptr i8* %dst, i64 0
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader1
+  %lsr.iv3 = phi i8* [ %scevgep2, %for.body.preheader1 ], [ %scevgep4, %for.body ]
+  store i8 undef, i8* %lsr.iv3, align 1
+  %scevgep4 = getelementptr i8* %lsr.iv3, i64 1
+  br label %for.body
+
+for.end:                                          ; preds = %middle.block, %entry
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll
new file mode 100644
index 0000000..03f5cfc
--- /dev/null
+++ b/test/CodeGen/NVPTX/vec8.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-unknown-cuda"
+
+; CHECK: .visible .func foo
+define void @foo(<8 x i8> %a, i8* %b) {
+  %t0 = extractelement <8 x i8> %a, i32 0
+; CHECK-DAG: ld.param.v4.u8
+; CHECK-DAG: ld.param.u32
+  store i8 %t0, i8* %b
+  ret void
+}
+
diff --git a/test/CodeGen/PowerPC/2010-02-12-saveCR.ll b/test/CodeGen/PowerPC/2010-02-12-saveCR.ll
index 097611a..b0c37b8 100644
--- a/test/CodeGen/PowerPC/2010-02-12-saveCR.ll
+++ b/test/CodeGen/PowerPC/2010-02-12-saveCR.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc-apple-darwin -mcpu=g4 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -mcpu=g4 -break-anti-dependencies=none | FileCheck %s
 ; ModuleID = 'hh.c'
 target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128-n32"
 target triple = "powerpc-apple-darwin9.6"
diff --git a/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll b/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll
index 635062b..9bf25c8 100644
--- a/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll
+++ b/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll
@@ -25,4 +25,4 @@ if.end1018:                                       ; preds = %for.end957, %for.en
   ret void
 }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/Frames-alloca.ll b/test/CodeGen/PowerPC/Frames-alloca.ll
index 28dd08c..4588bc0 100644
--- a/test/CodeGen/PowerPC/Frames-alloca.ll
+++ b/test/CodeGen/PowerPC/Frames-alloca.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PPC32
-; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PPC64
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=PPC32-NOFP
-; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=PPC64-NOFP
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PPC32
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PPC32-RS
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=PPC32-RS-NOFP
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=CHECK-PPC32
+; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=CHECK-PPC64
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=CHECK-PPC32-NOFP
+; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=CHECK-PPC64-NOFP
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=CHECK-PPC32
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=CHECK-PPC32-RS
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=CHECK-PPC32-RS-NOFP
 
 ; CHECK-PPC32: stw r31, -4(r1)
 ; CHECK-PPC32: lwz r1, 0(r1)
diff --git a/test/CodeGen/PowerPC/addrfuncstr.ll b/test/CodeGen/PowerPC/addrfuncstr.ll
index 60c02d4..6750b5c 100644
--- a/test/CodeGen/PowerPC/addrfuncstr.ll
+++ b/test/CodeGen/PowerPC/addrfuncstr.ll
@@ -23,5 +23,5 @@ declare i64 @fread(i8*, i64, i64, %struct._IO_FILE*) #1
 ; CHECK: .section .data.rel.ro
 ; CHECK: .quad fread
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/asym-regclass-copy.ll b/test/CodeGen/PowerPC/asym-regclass-copy.ll
index d04a6c9..b19125b 100644
--- a/test/CodeGen/PowerPC/asym-regclass-copy.ll
+++ b/test/CodeGen/PowerPC/asym-regclass-copy.ll
@@ -52,5 +52,5 @@ declare void @free(i8* nocapture) #0
 
 declare i64 @strtol(i8*, i8** nocapture, i32 signext) #0
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind }
diff --git a/test/CodeGen/PowerPC/bdzlr.ll b/test/CodeGen/PowerPC/bdzlr.ll
index 656a858..e487558 100644
--- a/test/CodeGen/PowerPC/bdzlr.ll
+++ b/test/CodeGen/PowerPC/bdzlr.ll
@@ -35,15 +35,15 @@ for.body:                                         ; preds = %for.body.for.body_c
   %0 = phi %struct.lua_TValue.17.692* [ undef, %for.body.lr.ph ], [ %.pre, %for.body.for.body_crit_edge ]
   %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body.for.body_crit_edge ]
   %tt = getelementptr inbounds %struct.lua_TValue.17.692* %0, i64 %indvars.iv, i32 1
-  %1 = load i32* %tt, align 4, !tbaa !0
-  store i32 %1, i32* undef, align 4, !tbaa !0
+  %1 = load i32* %tt, align 4
+  store i32 %1, i32* undef, align 4
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n
   br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge
 
 for.body.for.body_crit_edge:                      ; preds = %for.body
-  %.pre = load %struct.lua_TValue.17.692** undef, align 8, !tbaa !3
+  %.pre = load %struct.lua_TValue.17.692** undef, align 8
   br label %for.body
 
 for.end:                                          ; preds = %for.body, %if.end, %entry
@@ -57,8 +57,3 @@ for.end:                                          ; preds = %for.body, %if.end,
 }
 
 attributes #0 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/test/CodeGen/PowerPC/copysignl.ll b/test/CodeGen/PowerPC/copysignl.ll
new file mode 100644
index 0000000..4b801b7
--- /dev/null
+++ b/test/CodeGen/PowerPC/copysignl.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define double @foo_d_ll(ppc_fp128 %a, ppc_fp128 %b) #0 {
+entry:
+  %call = tail call ppc_fp128 @copysignl(ppc_fp128 %a, ppc_fp128 %b) #0
+  %conv = fptrunc ppc_fp128 %call to double
+  ret double %conv
+
+; CHECK-LABEL: @foo_d_ll
+; CHECK: fcpsgn 1, 3, 1
+; CHECK: blr
+}
+
+declare ppc_fp128 @copysignl(ppc_fp128, ppc_fp128) #0
+
+define double @foo_dl(double %a, ppc_fp128 %b) #0 {
+entry:
+  %conv = fptrunc ppc_fp128 %b to double
+  %call = tail call double @copysign(double %a, double %conv) #0
+  ret double %call
+
+; CHECK-LABEL: @foo_dl
+; CHECK: fcpsgn 1, 2, 1
+; CHECK: blr
+}
+
+declare double @copysign(double, double) #0
+
+define ppc_fp128 @foo_ll(double %a, ppc_fp128 %b) #0 {
+entry:
+  %conv = fpext double %a to ppc_fp128
+  %call = tail call ppc_fp128 @copysignl(ppc_fp128 %conv, ppc_fp128 %b) #0
+  ret ppc_fp128 %call
+
+; CHECK-LABEL: @foo_ll
+; CHECK: bl copysignl
+; CHECK: blr
+}
+
+define ppc_fp128 @foo_ld(double %a, double %b) #0 {
+entry:
+  %conv = fpext double %a to ppc_fp128
+  %conv1 = fpext double %b to ppc_fp128
+  %call = tail call ppc_fp128 @copysignl(ppc_fp128 %conv, ppc_fp128 %conv1) #0
+  ret ppc_fp128 %call
+
+; CHECK-LABEL: @foo_ld
+; CHECK: bl copysignl
+; CHECK: blr
+}
+
+define ppc_fp128 @foo_lf(double %a, float %b) #0 {
+entry:
+  %conv = fpext double %a to ppc_fp128
+  %conv1 = fpext float %b to ppc_fp128
+  %call = tail call ppc_fp128 @copysignl(ppc_fp128 %conv, ppc_fp128 %conv1) #0
+  ret ppc_fp128 %call
+
+; CHECK-LABEL: @foo_lf
+; CHECK: bl copysignl
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/cr-spills.ll b/test/CodeGen/PowerPC/cr-spills.ll
index d6df7a2..be0dbad 100644
--- a/test/CodeGen/PowerPC/cr-spills.ll
+++ b/test/CodeGen/PowerPC/cr-spills.ll
@@ -53,11 +53,11 @@ for.cond286.preheader:                            ; preds = %for.body252
 
 for.cond290.preheader:                            ; preds = %for.end520, %for.cond286.preheader
   %srcptr.31595 = phi i16* [ getelementptr inbounds ([768 x i16]* @SetupFastFullPelSearch.orig_pels, i64 0, i64 0), %for.cond286.preheader ], [ null, %for.end520 ]
-  %1 = load i32* undef, align 4, !tbaa !0
-  %2 = load i32* @weight_luma, align 4, !tbaa !0
-  %3 = load i32* @wp_luma_round, align 4, !tbaa !0
-  %4 = load i32* @luma_log_weight_denom, align 4, !tbaa !0
-  %5 = load i32* @offset_luma, align 4, !tbaa !0
+  %1 = load i32* undef, align 4
+  %2 = load i32* @weight_luma, align 4
+  %3 = load i32* @wp_luma_round, align 4
+  %4 = load i32* @luma_log_weight_denom, align 4
+  %5 = load i32* @offset_luma, align 4
   %incdec.ptr502.sum = add i64 undef, 16
   br label %for.body293
 
@@ -68,7 +68,7 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %LineSadBlk1.01587 = phi i32 [ 0, %for.cond290.preheader ], [ %add402, %for.body293 ]
   %LineSadBlk3.01586 = phi i32 [ 0, %for.cond290.preheader ], [ %add514, %for.body293 ]
   %LineSadBlk2.01585 = phi i32 [ 0, %for.cond290.preheader ], [ %add458, %for.body293 ]
-  %6 = load i16* %refptr.11590, align 2, !tbaa !3
+  %6 = load i16* %refptr.11590, align 2
   %conv294 = zext i16 %6 to i32
   %mul295 = mul nsw i32 %conv294, %2
   %add296 = add nsw i32 %mul295, %3
@@ -78,16 +78,16 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cond.i.i1514 = select i1 %cmp.i.i1513, i32 %add297, i32 0
   %cmp.i4.i1515 = icmp slt i32 %cond.i.i1514, %1
   %cond.i5.i1516 = select i1 %cmp.i4.i1515, i32 %cond.i.i1514, i32 %1
-  %7 = load i16* %srcptr.41591, align 2, !tbaa !3
+  %7 = load i16* %srcptr.41591, align 2
   %conv300 = zext i16 %7 to i32
   %sub301 = sub nsw i32 %cond.i5.i1516, %conv300
   %idxprom302 = sext i32 %sub301 to i64
   %arrayidx303 = getelementptr inbounds i32* %cond, i64 %idxprom302
-  %8 = load i32* %arrayidx303, align 4, !tbaa !0
+  %8 = load i32* %arrayidx303, align 4
   %add304 = add nsw i32 %8, %LineSadBlk0.01588
-  %9 = load i32* undef, align 4, !tbaa !0
+  %9 = load i32* undef, align 4
   %add318 = add nsw i32 %add304, %9
-  %10 = load i16* undef, align 2, !tbaa !3
+  %10 = load i16* undef, align 2
   %conv321 = zext i16 %10 to i32
   %mul322 = mul nsw i32 %conv321, %2
   %add323 = add nsw i32 %mul322, %3
@@ -100,22 +100,22 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %sub329 = sub nsw i32 %cond.i5.i1508, 0
   %idxprom330 = sext i32 %sub329 to i64
   %arrayidx331 = getelementptr inbounds i32* %cond, i64 %idxprom330
-  %11 = load i32* %arrayidx331, align 4, !tbaa !0
+  %11 = load i32* %arrayidx331, align 4
   %add332 = add nsw i32 %add318, %11
   %cmp.i.i1501 = icmp sgt i32 undef, 0
   %cond.i.i1502 = select i1 %cmp.i.i1501, i32 undef, i32 0
   %cmp.i4.i1503 = icmp slt i32 %cond.i.i1502, %1
   %cond.i5.i1504 = select i1 %cmp.i4.i1503, i32 %cond.i.i1502, i32 %1
   %incdec.ptr341 = getelementptr inbounds i16* %srcptr.41591, i64 4
-  %12 = load i16* null, align 2, !tbaa !3
+  %12 = load i16* null, align 2
   %conv342 = zext i16 %12 to i32
   %sub343 = sub nsw i32 %cond.i5.i1504, %conv342
   %idxprom344 = sext i32 %sub343 to i64
   %arrayidx345 = getelementptr inbounds i32* %cond, i64 %idxprom344
-  %13 = load i32* %arrayidx345, align 4, !tbaa !0
+  %13 = load i32* %arrayidx345, align 4
   %add346 = add nsw i32 %add332, %13
   %incdec.ptr348 = getelementptr inbounds i16* %refptr.11590, i64 5
-  %14 = load i16* null, align 2, !tbaa !3
+  %14 = load i16* null, align 2
   %conv349 = zext i16 %14 to i32
   %mul350 = mul nsw i32 %conv349, %2
   %add351 = add nsw i32 %mul350, %3
@@ -126,15 +126,15 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cmp.i4.i1499 = icmp slt i32 %cond.i.i1498, %1
   %cond.i5.i1500 = select i1 %cmp.i4.i1499, i32 %cond.i.i1498, i32 %1
   %incdec.ptr355 = getelementptr inbounds i16* %srcptr.41591, i64 5
-  %15 = load i16* %incdec.ptr341, align 2, !tbaa !3
+  %15 = load i16* %incdec.ptr341, align 2
   %conv356 = zext i16 %15 to i32
   %sub357 = sub nsw i32 %cond.i5.i1500, %conv356
   %idxprom358 = sext i32 %sub357 to i64
   %arrayidx359 = getelementptr inbounds i32* %cond, i64 %idxprom358
-  %16 = load i32* %arrayidx359, align 4, !tbaa !0
+  %16 = load i32* %arrayidx359, align 4
   %add360 = add nsw i32 %16, %LineSadBlk1.01587
   %incdec.ptr362 = getelementptr inbounds i16* %refptr.11590, i64 6
-  %17 = load i16* %incdec.ptr348, align 2, !tbaa !3
+  %17 = load i16* %incdec.ptr348, align 2
   %conv363 = zext i16 %17 to i32
   %mul364 = mul nsw i32 %conv363, %2
   %add365 = add nsw i32 %mul364, %3
@@ -145,15 +145,15 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cmp.i4.i1495 = icmp slt i32 %cond.i.i1494, %1
   %cond.i5.i1496 = select i1 %cmp.i4.i1495, i32 %cond.i.i1494, i32 %1
   %incdec.ptr369 = getelementptr inbounds i16* %srcptr.41591, i64 6
-  %18 = load i16* %incdec.ptr355, align 2, !tbaa !3
+  %18 = load i16* %incdec.ptr355, align 2
   %conv370 = zext i16 %18 to i32
   %sub371 = sub nsw i32 %cond.i5.i1496, %conv370
   %idxprom372 = sext i32 %sub371 to i64
   %arrayidx373 = getelementptr inbounds i32* %cond, i64 %idxprom372
-  %19 = load i32* %arrayidx373, align 4, !tbaa !0
+  %19 = load i32* %arrayidx373, align 4
   %add374 = add nsw i32 %add360, %19
   %incdec.ptr376 = getelementptr inbounds i16* %refptr.11590, i64 7
-  %20 = load i16* %incdec.ptr362, align 2, !tbaa !3
+  %20 = load i16* %incdec.ptr362, align 2
   %conv377 = zext i16 %20 to i32
   %mul378 = mul nsw i32 %conv377, %2
   %add379 = add nsw i32 %mul378, %3
@@ -164,14 +164,14 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cmp.i4.i1491 = icmp slt i32 %cond.i.i1490, %1
   %cond.i5.i1492 = select i1 %cmp.i4.i1491, i32 %cond.i.i1490, i32 %1
   %incdec.ptr383 = getelementptr inbounds i16* %srcptr.41591, i64 7
-  %21 = load i16* %incdec.ptr369, align 2, !tbaa !3
+  %21 = load i16* %incdec.ptr369, align 2
   %conv384 = zext i16 %21 to i32
   %sub385 = sub nsw i32 %cond.i5.i1492, %conv384
   %idxprom386 = sext i32 %sub385 to i64
   %arrayidx387 = getelementptr inbounds i32* %cond, i64 %idxprom386
-  %22 = load i32* %arrayidx387, align 4, !tbaa !0
+  %22 = load i32* %arrayidx387, align 4
   %add388 = add nsw i32 %add374, %22
-  %23 = load i16* %incdec.ptr376, align 2, !tbaa !3
+  %23 = load i16* %incdec.ptr376, align 2
   %conv391 = zext i16 %23 to i32
   %mul392 = mul nsw i32 %conv391, %2
   %add395 = add nsw i32 0, %5
@@ -180,25 +180,25 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cmp.i4.i1487 = icmp slt i32 %cond.i.i1486, %1
   %cond.i5.i1488 = select i1 %cmp.i4.i1487, i32 %cond.i.i1486, i32 %1
   %incdec.ptr397 = getelementptr inbounds i16* %srcptr.41591, i64 8
-  %24 = load i16* %incdec.ptr383, align 2, !tbaa !3
+  %24 = load i16* %incdec.ptr383, align 2
   %conv398 = zext i16 %24 to i32
   %sub399 = sub nsw i32 %cond.i5.i1488, %conv398
   %idxprom400 = sext i32 %sub399 to i64
   %arrayidx401 = getelementptr inbounds i32* %cond, i64 %idxprom400
-  %25 = load i32* %arrayidx401, align 4, !tbaa !0
+  %25 = load i32* %arrayidx401, align 4
   %add402 = add nsw i32 %add388, %25
   %incdec.ptr404 = getelementptr inbounds i16* %refptr.11590, i64 9
   %cmp.i4.i1483 = icmp slt i32 undef, %1
   %cond.i5.i1484 = select i1 %cmp.i4.i1483, i32 undef, i32 %1
-  %26 = load i16* %incdec.ptr397, align 2, !tbaa !3
+  %26 = load i16* %incdec.ptr397, align 2
   %conv412 = zext i16 %26 to i32
   %sub413 = sub nsw i32 %cond.i5.i1484, %conv412
   %idxprom414 = sext i32 %sub413 to i64
   %arrayidx415 = getelementptr inbounds i32* %cond, i64 %idxprom414
-  %27 = load i32* %arrayidx415, align 4, !tbaa !0
+  %27 = load i32* %arrayidx415, align 4
   %add416 = add nsw i32 %27, %LineSadBlk2.01585
   %incdec.ptr418 = getelementptr inbounds i16* %refptr.11590, i64 10
-  %28 = load i16* %incdec.ptr404, align 2, !tbaa !3
+  %28 = load i16* %incdec.ptr404, align 2
   %conv419 = zext i16 %28 to i32
   %mul420 = mul nsw i32 %conv419, %2
   %add421 = add nsw i32 %mul420, %3
@@ -212,10 +212,10 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %sub427 = sub nsw i32 %cond.i5.i1480, 0
   %idxprom428 = sext i32 %sub427 to i64
   %arrayidx429 = getelementptr inbounds i32* %cond, i64 %idxprom428
-  %29 = load i32* %arrayidx429, align 4, !tbaa !0
+  %29 = load i32* %arrayidx429, align 4
   %add430 = add nsw i32 %add416, %29
   %incdec.ptr432 = getelementptr inbounds i16* %refptr.11590, i64 11
-  %30 = load i16* %incdec.ptr418, align 2, !tbaa !3
+  %30 = load i16* %incdec.ptr418, align 2
   %conv433 = zext i16 %30 to i32
   %mul434 = mul nsw i32 %conv433, %2
   %add435 = add nsw i32 %mul434, %3
@@ -225,15 +225,15 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cond.i.i1474 = select i1 %cmp.i.i1473, i32 %add437, i32 0
   %cmp.i4.i1475 = icmp slt i32 %cond.i.i1474, %1
   %cond.i5.i1476 = select i1 %cmp.i4.i1475, i32 %cond.i.i1474, i32 %1
-  %31 = load i16* %incdec.ptr425, align 2, !tbaa !3
+  %31 = load i16* %incdec.ptr425, align 2
   %conv440 = zext i16 %31 to i32
   %sub441 = sub nsw i32 %cond.i5.i1476, %conv440
   %idxprom442 = sext i32 %sub441 to i64
   %arrayidx443 = getelementptr inbounds i32* %cond, i64 %idxprom442
-  %32 = load i32* %arrayidx443, align 4, !tbaa !0
+  %32 = load i32* %arrayidx443, align 4
   %add444 = add nsw i32 %add430, %32
   %incdec.ptr446 = getelementptr inbounds i16* %refptr.11590, i64 12
-  %33 = load i16* %incdec.ptr432, align 2, !tbaa !3
+  %33 = load i16* %incdec.ptr432, align 2
   %conv447 = zext i16 %33 to i32
   %mul448 = mul nsw i32 %conv447, %2
   %add449 = add nsw i32 %mul448, %3
@@ -244,15 +244,15 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cmp.i4.i1471 = icmp slt i32 %cond.i.i1470, %1
   %cond.i5.i1472 = select i1 %cmp.i4.i1471, i32 %cond.i.i1470, i32 %1
   %incdec.ptr453 = getelementptr inbounds i16* %srcptr.41591, i64 12
-  %34 = load i16* undef, align 2, !tbaa !3
+  %34 = load i16* undef, align 2
   %conv454 = zext i16 %34 to i32
   %sub455 = sub nsw i32 %cond.i5.i1472, %conv454
   %idxprom456 = sext i32 %sub455 to i64
   %arrayidx457 = getelementptr inbounds i32* %cond, i64 %idxprom456
-  %35 = load i32* %arrayidx457, align 4, !tbaa !0
+  %35 = load i32* %arrayidx457, align 4
   %add458 = add nsw i32 %add444, %35
   %incdec.ptr460 = getelementptr inbounds i16* %refptr.11590, i64 13
-  %36 = load i16* %incdec.ptr446, align 2, !tbaa !3
+  %36 = load i16* %incdec.ptr446, align 2
   %conv461 = zext i16 %36 to i32
   %mul462 = mul nsw i32 %conv461, %2
   %add463 = add nsw i32 %mul462, %3
@@ -263,12 +263,12 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cmp.i4.i1467 = icmp slt i32 %cond.i.i1466, %1
   %cond.i5.i1468 = select i1 %cmp.i4.i1467, i32 %cond.i.i1466, i32 %1
   %incdec.ptr467 = getelementptr inbounds i16* %srcptr.41591, i64 13
-  %37 = load i16* %incdec.ptr453, align 2, !tbaa !3
+  %37 = load i16* %incdec.ptr453, align 2
   %conv468 = zext i16 %37 to i32
   %sub469 = sub nsw i32 %cond.i5.i1468, %conv468
   %idxprom470 = sext i32 %sub469 to i64
   %arrayidx471 = getelementptr inbounds i32* %cond, i64 %idxprom470
-  %38 = load i32* %arrayidx471, align 4, !tbaa !0
+  %38 = load i32* %arrayidx471, align 4
   %add472 = add nsw i32 %38, %LineSadBlk3.01586
   %incdec.ptr474 = getelementptr inbounds i16* %refptr.11590, i64 14
   %add477 = add nsw i32 0, %3
@@ -279,15 +279,15 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cmp.i4.i1463 = icmp slt i32 %cond.i.i1462, %1
   %cond.i5.i1464 = select i1 %cmp.i4.i1463, i32 %cond.i.i1462, i32 %1
   %incdec.ptr481 = getelementptr inbounds i16* %srcptr.41591, i64 14
-  %39 = load i16* %incdec.ptr467, align 2, !tbaa !3
+  %39 = load i16* %incdec.ptr467, align 2
   %conv482 = zext i16 %39 to i32
   %sub483 = sub nsw i32 %cond.i5.i1464, %conv482
   %idxprom484 = sext i32 %sub483 to i64
   %arrayidx485 = getelementptr inbounds i32* %cond, i64 %idxprom484
-  %40 = load i32* %arrayidx485, align 4, !tbaa !0
+  %40 = load i32* %arrayidx485, align 4
   %add486 = add nsw i32 %add472, %40
   %incdec.ptr488 = getelementptr inbounds i16* %refptr.11590, i64 15
-  %41 = load i16* %incdec.ptr474, align 2, !tbaa !3
+  %41 = load i16* %incdec.ptr474, align 2
   %conv489 = zext i16 %41 to i32
   %mul490 = mul nsw i32 %conv489, %2
   %add491 = add nsw i32 %mul490, %3
@@ -298,14 +298,14 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cmp.i4.i1459 = icmp slt i32 %cond.i.i1458, %1
   %cond.i5.i1460 = select i1 %cmp.i4.i1459, i32 %cond.i.i1458, i32 %1
   %incdec.ptr495 = getelementptr inbounds i16* %srcptr.41591, i64 15
-  %42 = load i16* %incdec.ptr481, align 2, !tbaa !3
+  %42 = load i16* %incdec.ptr481, align 2
   %conv496 = zext i16 %42 to i32
   %sub497 = sub nsw i32 %cond.i5.i1460, %conv496
   %idxprom498 = sext i32 %sub497 to i64
   %arrayidx499 = getelementptr inbounds i32* %cond, i64 %idxprom498
-  %43 = load i32* %arrayidx499, align 4, !tbaa !0
+  %43 = load i32* %arrayidx499, align 4
   %add500 = add nsw i32 %add486, %43
-  %44 = load i16* %incdec.ptr488, align 2, !tbaa !3
+  %44 = load i16* %incdec.ptr488, align 2
   %conv503 = zext i16 %44 to i32
   %mul504 = mul nsw i32 %conv503, %2
   %add505 = add nsw i32 %mul504, %3
@@ -315,22 +315,22 @@ for.body293:                                      ; preds = %for.body293, %for.c
   %cond.i.i1454 = select i1 %cmp.i.i1453, i32 %add507, i32 0
   %cmp.i4.i1455 = icmp slt i32 %cond.i.i1454, %1
   %cond.i5.i1456 = select i1 %cmp.i4.i1455, i32 %cond.i.i1454, i32 %1
-  %45 = load i16* %incdec.ptr495, align 2, !tbaa !3
+  %45 = load i16* %incdec.ptr495, align 2
   %conv510 = zext i16 %45 to i32
   %sub511 = sub nsw i32 %cond.i5.i1456, %conv510
   %idxprom512 = sext i32 %sub511 to i64
   %arrayidx513 = getelementptr inbounds i32* %cond, i64 %idxprom512
-  %46 = load i32* %arrayidx513, align 4, !tbaa !0
+  %46 = load i32* %arrayidx513, align 4
   %add514 = add nsw i32 %add500, %46
   %add.ptr517 = getelementptr inbounds i16* %refptr.11590, i64 %incdec.ptr502.sum
   %exitcond1692 = icmp eq i32 undef, 4
   br i1 %exitcond1692, label %for.end520, label %for.body293
 
 for.end520:                                       ; preds = %for.body293
-  store i32 %add346, i32* undef, align 4, !tbaa !0
-  store i32 %add402, i32* undef, align 4, !tbaa !0
-  store i32 %add458, i32* undef, align 4, !tbaa !0
-  store i32 %add514, i32* null, align 4, !tbaa !0
+  store i32 %add346, i32* undef, align 4
+  store i32 %add402, i32* undef, align 4
+  store i32 %add458, i32* undef, align 4
+  store i32 %add514, i32* null, align 4
   br i1 undef, label %for.end543, label %for.cond290.preheader
 
 for.end543:                                       ; preds = %for.end520
@@ -400,10 +400,5 @@ for.end999:                                       ; preds = %for.inc997
   ret void
 }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/PowerPC/ctr-cleanup.ll b/test/CodeGen/PowerPC/ctr-cleanup.ll
index 04e4ffb..1a669eb 100644
--- a/test/CodeGen/PowerPC/ctr-cleanup.ll
+++ b/test/CodeGen/PowerPC/ctr-cleanup.ll
@@ -22,4 +22,4 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/ctrloop-cpsgn.ll b/test/CodeGen/PowerPC/ctrloop-cpsgn.ll
new file mode 100644
index 0000000..2f04409
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-cpsgn.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mcpu=ppc | FileCheck %s
+
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux-gnu"
+
+define ppc_fp128 @foo(ppc_fp128* nocapture %n, ppc_fp128 %d) nounwind readonly {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %x.05 = phi ppc_fp128 [ %d, %entry ], [ %conv, %for.body ]
+  %arrayidx = getelementptr inbounds ppc_fp128* %n, i32 %i.06
+  %0 = load ppc_fp128* %arrayidx, align 8
+  %conv = tail call ppc_fp128 @copysignl(ppc_fp128 %x.05, ppc_fp128 %d) nounwind readonly
+  %inc = add nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 2048
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret ppc_fp128 %conv
+}
+
+declare ppc_fp128 @copysignl(ppc_fp128, ppc_fp128) #0
+
+; CHECK: @foo
+; CHECK-NOT: mtctr
+
diff --git a/test/CodeGen/PowerPC/ctrloop-le.ll b/test/CodeGen/PowerPC/ctrloop-le.ll
index 21a6fab..7b8185e 100644
--- a/test/CodeGen/PowerPC/ctrloop-le.ll
+++ b/test/CodeGen/PowerPC/ctrloop-le.ll
@@ -32,8 +32,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 
 ; CHECK: test_pos2_ir_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos2_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -62,8 +61,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 
 ; CHECK: test_pos4_ir_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos4_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -92,8 +90,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 
 ; CHECK: test_pos8_ir_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos8_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -122,8 +119,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 
 ; CHECK: test_pos16_ir_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos16_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -443,4 +439,3 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 for.end:                                          ; preds = %for.body, %entry
   ret void
 }
-
diff --git a/test/CodeGen/PowerPC/ctrloop-lt.ll b/test/CodeGen/PowerPC/ctrloop-lt.ll
index 448716d..eaab61a 100644
--- a/test/CodeGen/PowerPC/ctrloop-lt.ll
+++ b/test/CodeGen/PowerPC/ctrloop-lt.ll
@@ -33,7 +33,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 ; CHECK: test_pos2_ir_slt
 ; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos2_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -63,7 +63,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 ; CHECK: test_pos4_ir_slt
 ; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos4_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -92,8 +92,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 
 ; CHECK: test_pos8_ir_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos8_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -122,8 +121,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 
 ; CHECK: test_pos16_ir_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos16_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -326,8 +324,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 
 ; CHECK: test_pos2_rr_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos2_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -356,8 +353,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 
 ; CHECK: test_pos4_rr_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos4_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -386,8 +382,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 
 ; CHECK: test_pos8_rr_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos8_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -416,8 +411,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 
 ; CHECK: test_pos16_rr_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
 ; a < b
 define void @test_pos16_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
 entry:
@@ -442,4 +436,3 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 for.end:                                          ; preds = %for.body, %entry
   ret void
 }
-
diff --git a/test/CodeGen/PowerPC/dbg.ll b/test/CodeGen/PowerPC/dbg.ll
index 30fe19e..cb93dec 100644
--- a/test/CodeGen/PowerPC/dbg.ll
+++ b/test/CodeGen/PowerPC/dbg.ll
@@ -15,13 +15,14 @@ entry:
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22}
 
 !0 = metadata !{i32 720913, metadata !21, i32 12, metadata !"clang version 3.1", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, metadata !"", metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 720942, metadata !21, null, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !13, i32 0} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 720937, metadata !21} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9, metadata !10}
 !9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
@@ -36,3 +37,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !19 = metadata !{i32 2, i32 3, metadata !20, null}
 !20 = metadata !{i32 720907, metadata !21, metadata !5, i32 1, i32 34, i32 0} ; [ DW_TAG_lexical_block ]
 !21 = metadata !{metadata !"dbg.c", metadata !"/src"}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/PowerPC/dyn-alloca-aligned.ll b/test/CodeGen/PowerPC/dyn-alloca-aligned.ll
index a18ada7..a5d45b8 100644
--- a/test/CodeGen/PowerPC/dyn-alloca-aligned.ll
+++ b/test/CodeGen/PowerPC/dyn-alloca-aligned.ll
@@ -12,12 +12,12 @@ entry:
   %vla = alloca i32, i64 %0, align 128
   %vla1 = alloca i32, i64 %0, align 128
   %a2 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %1 = load i32* %a2, align 4, !tbaa !0
-  store i32 %1, i32* %vla1, align 128, !tbaa !0
+  %1 = load i32* %a2, align 4
+  store i32 %1, i32* %vla1, align 128
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %2 = load i32* %b, align 4, !tbaa !0
+  %2 = load i32* %b, align 4
   %arrayidx3 = getelementptr inbounds i32* %vla1, i64 1
-  store i32 %2, i32* %arrayidx3, align 4, !tbaa !0
+  store i32 %2, i32* %arrayidx3, align 4
   call void @bar(i32* %vla1, i32* %vla) #0
   ret void
 
@@ -33,7 +33,3 @@ entry:
 }
 
 attributes #0 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/fast-isel-GEP-coalesce.ll b/test/CodeGen/PowerPC/fast-isel-GEP-coalesce.ll
new file mode 100644
index 0000000..7bdda04
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-GEP-coalesce.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+%struct.A = type { i32, [2 x [2 x i32]], i8, [3 x [3 x [3 x i32]]] }
+%struct.B = type { i32, [2 x [2 x [2 x %struct.A]]] }
+
+@arr = common global [2 x [2 x [2 x [2 x [2 x i32]]]]] zeroinitializer, align 4
+@A = common global [3 x [3 x %struct.A]] zeroinitializer, align 4
+@B = common global [2 x [2 x [2 x %struct.B]]] zeroinitializer, align 4
+
+define i32* @t1() nounwind {
+entry:
+; ELF64: t1
+  %addr = alloca i32*, align 4
+  store i32* getelementptr inbounds ([2 x [2 x [2 x [2 x [2 x i32]]]]]* @arr, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1), i32** %addr, align 4
+; ELF64: addi {{[0-9]+}}, {{[0-9]+}}, 124
+  %0 = load i32** %addr, align 4
+  ret i32* %0
+}
+
+define i32* @t2() nounwind {
+entry:
+; ELF64: t2
+  %addr = alloca i32*, align 4
+  store i32* getelementptr inbounds ([3 x [3 x %struct.A]]* @A, i32 0, i32 2, i32 2, i32 3, i32 1, i32 2, i32 2), i32** %addr, align 4
+; ELF64: addi {{[0-9]+}}, {{[0-9]+}}, 1148
+  %0 = load i32** %addr, align 4
+  ret i32* %0
+}
+
+define i32* @t3() nounwind {
+entry:
+; ELF64: t3
+  %addr = alloca i32*, align 4
+  store i32* getelementptr inbounds ([3 x [3 x %struct.A]]* @A, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1), i32** %addr, align 4
+; ELF64: addi {{[0-9]+}}, {{[0-9]+}}, 140
+  %0 = load i32** %addr, align 4
+  ret i32* %0
+}
+
+define i32* @t4() nounwind {
+entry:
+; ELF64: t4
+  %addr = alloca i32*, align 4
+  store i32* getelementptr inbounds ([2 x [2 x [2 x %struct.B]]]* @B, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 3, i32 1, i32 2, i32 1), i32** %addr, align 4
+; ELF64: addi {{[0-9]+}}, {{[0-9]+}}, 1284
+  %0 = load i32** %addr, align 4
+  ret i32* %0
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-binary.ll b/test/CodeGen/PowerPC/fast-isel-binary.ll
new file mode 100644
index 0000000..43a6cd0
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-binary.ll
@@ -0,0 +1,137 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+; Test add with non-legal types
+
+define void @add_i8(i8 %a, i8 %b) nounwind ssp {
+entry:
+; ELF64: add_i8
+  %a.addr = alloca i8, align 4
+  %0 = add i8 %a, %b
+; ELF64: add
+  store i8 %0, i8* %a.addr, align 4
+  ret void
+}
+
+define void @add_i8_imm(i8 %a) nounwind ssp {
+entry:
+; ELF64: add_i8_imm
+  %a.addr = alloca i8, align 4
+  %0 = add i8 %a, 22;
+; ELF64: addi
+  store i8 %0, i8* %a.addr, align 4
+  ret void
+}
+
+define void @add_i16(i16 %a, i16 %b) nounwind ssp {
+entry:
+; ELF64: add_i16
+  %a.addr = alloca i16, align 4
+  %0 = add i16 %a, %b
+; ELF64: add
+  store i16 %0, i16* %a.addr, align 4
+  ret void
+}
+
+define void @add_i16_imm(i16 %a, i16 %b) nounwind ssp {
+entry:
+; ELF64: add_i16_imm
+  %a.addr = alloca i16, align 4
+  %0 = add i16 %a, 243;
+; ELF64: addi
+  store i16 %0, i16* %a.addr, align 4
+  ret void
+}
+
+; Test or with non-legal types
+
+define void @or_i8(i8 %a, i8 %b) nounwind ssp {
+entry:
+; ELF64: or_i8
+  %a.addr = alloca i8, align 4
+  %0 = or i8 %a, %b
+; ELF64: or
+  store i8 %0, i8* %a.addr, align 4
+  ret void
+}
+
+define void @or_i8_imm(i8 %a) nounwind ssp {
+entry:
+; ELF64: or_i8_imm
+  %a.addr = alloca i8, align 4
+  %0 = or i8 %a, -13;
+; ELF64: ori
+  store i8 %0, i8* %a.addr, align 4
+  ret void
+}
+
+define void @or_i16(i16 %a, i16 %b) nounwind ssp {
+entry:
+; ELF64: or_i16
+  %a.addr = alloca i16, align 4
+  %0 = or i16 %a, %b
+; ELF64: or
+  store i16 %0, i16* %a.addr, align 4
+  ret void
+}
+
+define void @or_i16_imm(i16 %a) nounwind ssp {
+entry:
+; ELF64: or_i16_imm
+  %a.addr = alloca i16, align 4
+  %0 = or i16 %a, 273;
+; ELF64: ori
+  store i16 %0, i16* %a.addr, align 4
+  ret void
+}
+
+; Test sub with non-legal types
+
+define void @sub_i8(i8 %a, i8 %b) nounwind ssp {
+entry:
+; ELF64: sub_i8
+  %a.addr = alloca i8, align 4
+  %0 = sub i8 %a, %b
+; ELF64: subf
+  store i8 %0, i8* %a.addr, align 4
+  ret void
+}
+
+define void @sub_i8_imm(i8 %a) nounwind ssp {
+entry:
+; ELF64: sub_i8_imm
+  %a.addr = alloca i8, align 4
+  %0 = sub i8 %a, 22;
+; ELF64: addi
+  store i8 %0, i8* %a.addr, align 4
+  ret void
+}
+
+define void @sub_i16(i16 %a, i16 %b) nounwind ssp {
+entry:
+; ELF64: sub_i16
+  %a.addr = alloca i16, align 4
+  %0 = sub i16 %a, %b
+; ELF64: subf
+  store i16 %0, i16* %a.addr, align 4
+  ret void
+}
+
+define void @sub_i16_imm(i16 %a) nounwind ssp {
+entry:
+; ELF64: sub_i16_imm
+  %a.addr = alloca i16, align 4
+  %0 = sub i16 %a, 247;
+; ELF64: addi
+  store i16 %0, i16* %a.addr, align 4
+  ret void
+}
+
+define void @sub_i16_badimm(i16 %a) nounwind ssp {
+entry:
+; ELF64: sub_i16_imm
+  %a.addr = alloca i16, align 4
+  %0 = sub i16 %a, -32768;
+; ELF64: subf
+  store i16 %0, i16* %a.addr, align 4
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-br-const.ll b/test/CodeGen/PowerPC/fast-isel-br-const.ll
new file mode 100644
index 0000000..2cfb8a2
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-br-const.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define i32 @t1(i32 %a, i32 %b) nounwind uwtable ssp {
+entry:
+; ELF64: t1
+  %x = add i32 %a, %b  
+  br i1 1, label %if.then, label %if.else
+; ELF64-NOT: b {{\.?}}LBB0_1
+
+if.then:                                          ; preds = %entry
+  call void @foo1()
+  br label %if.end7
+
+if.else:                                          ; preds = %entry
+  br i1 0, label %if.then2, label %if.else3
+; ELF64: b {{\.?}}LBB0_4
+
+if.then2:                                         ; preds = %if.else
+  call void @foo2()
+  br label %if.end6
+
+if.else3:                                         ; preds = %if.else
+  %y = sub i32 %a, %b
+  br i1 1, label %if.then5, label %if.end
+; ELF64-NOT: b {{\.?}}LBB0_5
+
+if.then5:                                         ; preds = %if.else3
+  call void @foo1()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then5, %if.else3
+  br label %if.end6
+
+if.end6:                                          ; preds = %if.end, %if.then2
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.end6, %if.then
+  ret i32 0
+}
+
+declare void @foo1()
+
+declare void @foo2()
diff --git a/test/CodeGen/PowerPC/fast-isel-call.ll b/test/CodeGen/PowerPC/fast-isel-call.ll
new file mode 100644
index 0000000..33a8ba9
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-call.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define i32 @t1(i8 signext %a) nounwind {
+  %1 = sext i8 %a to i32
+  ret i32 %1
+}
+
+define i32 @t2(i8 zeroext %a) nounwind {
+  %1 = zext i8 %a to i32
+  ret i32 %1
+}
+
+define i32 @t3(i16 signext %a) nounwind {
+  %1 = sext i16 %a to i32
+  ret i32 %1
+}
+
+define i32 @t4(i16 zeroext %a) nounwind {
+  %1 = zext i16 %a to i32
+  ret i32 %1
+}
+
+define void @foo(i8 %a, i16 %b) nounwind {
+; ELF64: foo
+  %1 = call i32 @t1(i8 signext %a)
+; ELF64: extsb
+  %2 = call i32 @t2(i8 zeroext %a)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+  %3 = call i32 @t3(i16 signext %b)
+; ELF64: extsh
+  %4 = call i32 @t4(i16 zeroext %b)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+
+;; A few test to check materialization
+  %5 = call i32 @t2(i8 zeroext 255)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+  %6 = call i32 @t4(i16 zeroext 65535)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+  ret void
+}
+
+define void @foo2() nounwind {
+  %1 = call signext i16 @t5()
+  %2 = call zeroext i16 @t6()
+  %3 = call signext i8 @t7()
+  %4 = call zeroext i8 @t8()
+  ret void
+}
+
+declare signext i16 @t5();
+declare zeroext i16 @t6();
+declare signext i8 @t7();
+declare zeroext i8 @t8();
+
+define i32 @t10(i32 %argc, i8** nocapture %argv) {
+entry:
+; ELF64: t10
+  %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70)
+; ELF64: li 3, 0
+; ELF64: li 4, 248
+; ELF64: li 5, 187
+; ELF64: li 6, 28
+; ELF64: li 7, 40
+; ELF64: li 8, 186
+; ELF64: rldicl 3, 3, 0, 56
+; ELF64: rldicl 4, 4, 0, 56
+; ELF64: rldicl 5, 5, 0, 56
+; ELF64: rldicl 6, 6, 0, 56
+; ELF64: rldicl 7, 7, 0, 56
+; ELF64: rldicl 8, 8, 0, 56
+  ret i32 0
+}
+
+declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext)
+
+define i32 @bar0(i32 %i) nounwind {
+  ret i32 0
+}
+
+; Function pointers are not yet implemented.
+;define void @foo3() uwtable {
+;  %fptr = alloca i32 (i32)*, align 8
+;  store i32 (i32)* @bar0, i32 (i32)** %fptr, align 8
+;  %1 = load i32 (i32)** %fptr, align 8
+;  %call = call i32 %1(i32 0)
+;  ret void
+;}
+
+; Intrinsic calls not yet implemented, and udiv isn't one for PPC anyway.
+;define i32 @LibCall(i32 %a, i32 %b) {
+;entry:
+;        %tmp1 = udiv i32 %a, %b         ; <i32> [#uses=1]
+;        ret i32 %tmp1
+;}
+
+declare void @float_foo(float %f) ssp
+
+define void @float_const() ssp {
+entry:
+; ELF64: float_const
+  call void @float_foo(float 0x401C666660000000)
+; ELF64: addis [[REG:[0-9]+]], 2, .LCPI[[SUF:[0-9_]+]]@toc@ha
+; ELF64: lfs 1, .LCPI[[SUF]]@toc@l([[REG]])
+  ret void
+}
+
+define void @float_reg(float %dummy, float %f) ssp {
+entry:
+; ELF64: float_reg
+  call void @float_foo(float %f)
+; ELF64: fmr 1, 2
+  ret void
+}
+
+declare void @double_foo(double %d) ssp
+
+define void @double_const() ssp {
+entry:
+; ELF64: double_const
+  call void @double_foo(double 0x1397723CCABD0000401C666660000000)
+; ELF64: addis [[REG2:[0-9]+]], 2, .LCPI[[SUF2:[0-9_]+]]@toc@ha
+; ELF64: lfd 1, .LCPI[[SUF2]]@toc@l([[REG2]])
+  ret void
+}
+
+define void @double_reg(double %dummy, double %d) ssp {
+entry:
+; ELF64: double_reg
+  call void @double_foo(double %d)
+; ELF64: fmr 1, 2
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll b/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
new file mode 100644
index 0000000..33f7a79
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
@@ -0,0 +1,289 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define void @t1a(float %a) uwtable ssp {
+entry:
+; ELF64: t1a
+  %cmp = fcmp oeq float %a, 0.000000e+00
+; ELF64: addis
+; ELF64: lfs
+; ELF64: fcmpu
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+declare void @foo()
+
+define void @t1b(float %a) uwtable ssp {
+entry:
+; ELF64: t1b
+  %cmp = fcmp oeq float %a, -0.000000e+00
+; ELF64: addis
+; ELF64: lfs
+; ELF64: fcmpu
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t2a(double %a) uwtable ssp {
+entry:
+; ELF64: t2a
+  %cmp = fcmp oeq double %a, 0.000000e+00
+; ELF64: addis
+; ELF64: lfd
+; ELF64: fcmpu
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t2b(double %a) uwtable ssp {
+entry:
+; ELF64: t2b
+  %cmp = fcmp oeq double %a, -0.000000e+00
+; ELF64: addis
+; ELF64: lfd
+; ELF64: fcmpu
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t4(i8 signext %a) uwtable ssp {
+entry:
+; ELF64: t4
+  %cmp = icmp eq i8 %a, -1
+; ELF64: extsb
+; ELF64: cmpwi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t5(i8 zeroext %a) uwtable ssp {
+entry:
+; ELF64: t5
+  %cmp = icmp eq i8 %a, 1
+; ELF64: extsb
+; ELF64: cmpwi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t6(i16 signext %a) uwtable ssp {
+entry:
+; ELF64: t6
+  %cmp = icmp eq i16 %a, -1
+; ELF64: extsh
+; ELF64: cmpwi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t7(i16 zeroext %a) uwtable ssp {
+entry:
+; ELF64: t7
+  %cmp = icmp eq i16 %a, 1
+; ELF64: extsh
+; ELF64: cmpwi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t8(i32 %a) uwtable ssp {
+entry:
+; ELF64: t8
+  %cmp = icmp eq i32 %a, -1
+; ELF64: cmpwi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t9(i32 %a) uwtable ssp {
+entry:
+; ELF64: t9
+  %cmp = icmp eq i32 %a, 1
+; ELF64: cmpwi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t10(i32 %a) uwtable ssp {
+entry:
+; ELF64: t10
+  %cmp = icmp eq i32 %a, 384
+; ELF64: cmpwi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t11(i32 %a) uwtable ssp {
+entry:
+; ELF64: t11
+  %cmp = icmp eq i32 %a, 4096
+; ELF64: cmpwi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t12(i8 %a) uwtable ssp {
+entry:
+; ELF64: t12
+  %cmp = icmp ugt i8 %a, -113
+; ELF64: rlwinm
+; ELF64: cmplwi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t13() nounwind ssp {
+entry:
+; ELF64: t13
+  %cmp = icmp slt i32 -123, -2147483648
+; ELF64: li
+; ELF64: lis
+; ELF64: cmpw
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  ret void
+
+if.end:                                           ; preds = %entry
+  ret void
+}
+
+define void @t14(i64 %a) uwtable ssp {
+entry:
+; ELF64: t14
+  %cmp = icmp eq i64 %a, -1
+; ELF64: cmpdi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t15(i64 %a) uwtable ssp {
+entry:
+; ELF64: t15
+  %cmp = icmp eq i64 %a, 1
+; ELF64: cmpdi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t16(i64 %a) uwtable ssp {
+entry:
+; ELF64: t16
+  %cmp = icmp eq i64 %a, 384
+; ELF64: cmpdi
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define void @t17(i64 %a) uwtable ssp {
+entry:
+; ELF64: t17
+  %cmp = icmp eq i64 %a, 32768
+; Extra operand so we don't match on cmpdi.
+; ELF64: cmpd {{[0-9]+}}
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @foo()
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
diff --git a/test/CodeGen/PowerPC/fast-isel-conversion.ll b/test/CodeGen/PowerPC/fast-isel-conversion.ll
new file mode 100644
index 0000000..a31c312
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-conversion.ll
@@ -0,0 +1,305 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+; Test sitofp
+
+define void @sitofp_single_i64(i64 %a, float %b) nounwind ssp {
+entry:
+; ELF64: sitofp_single_i64
+  %b.addr = alloca float, align 4
+  %conv = sitofp i64 %a to float
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfids
+  store float %conv, float* %b.addr, align 4
+  ret void
+}
+
+define void @sitofp_single_i32(i32 %a, float %b) nounwind ssp {
+entry:
+; ELF64: sitofp_single_i32
+  %b.addr = alloca float, align 4
+  %conv = sitofp i32 %a to float
+; ELF64: std
+; ELF64: lfiwax
+; ELF64: fcfids
+  store float %conv, float* %b.addr, align 4
+  ret void
+}
+
+define void @sitofp_single_i16(i16 %a, float %b) nounwind ssp {
+entry:
+; ELF64: sitofp_single_i16
+  %b.addr = alloca float, align 4
+  %conv = sitofp i16 %a to float
+; ELF64: extsh
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfids
+  store float %conv, float* %b.addr, align 4
+  ret void
+}
+
+define void @sitofp_single_i8(i8 %a) nounwind ssp {
+entry:
+; ELF64: sitofp_single_i8
+  %b.addr = alloca float, align 4
+  %conv = sitofp i8 %a to float
+; ELF64: extsb
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfids
+  store float %conv, float* %b.addr, align 4
+  ret void
+}
+
+define void @sitofp_double_i32(i32 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i32
+  %b.addr = alloca double, align 8
+  %conv = sitofp i32 %a to double
+; ELF64: std
+; ELF64: lfiwax
+; ELF64: fcfid
+  store double %conv, double* %b.addr, align 8
+  ret void
+}
+
+define void @sitofp_double_i64(i64 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i64
+  %b.addr = alloca double, align 8
+  %conv = sitofp i64 %a to double
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfid
+  store double %conv, double* %b.addr, align 8
+  ret void
+}
+
+define void @sitofp_double_i16(i16 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i16
+  %b.addr = alloca double, align 8
+  %conv = sitofp i16 %a to double
+; ELF64: extsh
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfid
+  store double %conv, double* %b.addr, align 8
+  ret void
+}
+
+define void @sitofp_double_i8(i8 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i8
+  %b.addr = alloca double, align 8
+  %conv = sitofp i8 %a to double
+; ELF64: extsb
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfid
+  store double %conv, double* %b.addr, align 8
+  ret void
+}
+
+; Test uitofp
+
+define void @uitofp_single_i64(i64 %a, float %b) nounwind ssp {
+entry:
+; ELF64: uitofp_single_i64
+  %b.addr = alloca float, align 4
+  %conv = uitofp i64 %a to float
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidus
+  store float %conv, float* %b.addr, align 4
+  ret void
+}
+
+define void @uitofp_single_i32(i32 %a, float %b) nounwind ssp {
+entry:
+; ELF64: uitofp_single_i32
+  %b.addr = alloca float, align 4
+  %conv = uitofp i32 %a to float
+; ELF64: std
+; ELF64: lfiwzx
+; ELF64: fcfidus
+  store float %conv, float* %b.addr, align 4
+  ret void
+}
+
+define void @uitofp_single_i16(i16 %a, float %b) nounwind ssp {
+entry:
+; ELF64: uitofp_single_i16
+  %b.addr = alloca float, align 4
+  %conv = uitofp i16 %a to float
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidus
+  store float %conv, float* %b.addr, align 4
+  ret void
+}
+
+define void @uitofp_single_i8(i8 %a) nounwind ssp {
+entry:
+; ELF64: uitofp_single_i8
+  %b.addr = alloca float, align 4
+  %conv = uitofp i8 %a to float
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidus
+  store float %conv, float* %b.addr, align 4
+  ret void
+}
+
+define void @uitofp_double_i64(i64 %a, double %b) nounwind ssp {
+entry:
+; ELF64: uitofp_double_i64
+  %b.addr = alloca double, align 8
+  %conv = uitofp i64 %a to double
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidu
+  store double %conv, double* %b.addr, align 8
+  ret void
+}
+
+define void @uitofp_double_i32(i32 %a, double %b) nounwind ssp {
+entry:
+; ELF64: uitofp_double_i32
+  %b.addr = alloca double, align 8
+  %conv = uitofp i32 %a to double
+; ELF64: std
+; ELF64: lfiwzx
+; ELF64: fcfidu
+  store double %conv, double* %b.addr, align 8
+  ret void
+}
+
+define void @uitofp_double_i16(i16 %a, double %b) nounwind ssp {
+entry:
+; ELF64: uitofp_double_i16
+  %b.addr = alloca double, align 8
+  %conv = uitofp i16 %a to double
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidu
+  store double %conv, double* %b.addr, align 8
+  ret void
+}
+
+define void @uitofp_double_i8(i8 %a, double %b) nounwind ssp {
+entry:
+; ELF64: uitofp_double_i8
+  %b.addr = alloca double, align 8
+  %conv = uitofp i8 %a to double
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidu
+  store double %conv, double* %b.addr, align 8
+  ret void
+}
+
+; Test fptosi
+
+define void @fptosi_float_i32(float %a) nounwind ssp {
+entry:
+; ELF64: fptosi_float_i32
+  %b.addr = alloca i32, align 4
+  %conv = fptosi float %a to i32
+; ELF64: fctiwz
+; ELF64: stfd
+; ELF64: lwa
+  store i32 %conv, i32* %b.addr, align 4
+  ret void
+}
+
+define void @fptosi_float_i64(float %a) nounwind ssp {
+entry:
+; ELF64: fptosi_float_i64
+  %b.addr = alloca i64, align 4
+  %conv = fptosi float %a to i64
+; ELF64: fctidz
+; ELF64: stfd
+; ELF64: ld
+  store i64 %conv, i64* %b.addr, align 4
+  ret void
+}
+
+define void @fptosi_double_i32(double %a) nounwind ssp {
+entry:
+; ELF64: fptosi_double_i32
+  %b.addr = alloca i32, align 8
+  %conv = fptosi double %a to i32
+; ELF64: fctiwz
+; ELF64: stfd
+; ELF64: lwa
+  store i32 %conv, i32* %b.addr, align 8
+  ret void
+}
+
+define void @fptosi_double_i64(double %a) nounwind ssp {
+entry:
+; ELF64: fptosi_double_i64
+  %b.addr = alloca i64, align 8
+  %conv = fptosi double %a to i64
+; ELF64: fctidz
+; ELF64: stfd
+; ELF64: ld
+  store i64 %conv, i64* %b.addr, align 8
+  ret void
+}
+
+; Test fptoui
+
+define void @fptoui_float_i32(float %a) nounwind ssp {
+entry:
+; ELF64: fptoui_float_i32
+  %b.addr = alloca i32, align 4
+  %conv = fptoui float %a to i32
+; ELF64: fctiwuz
+; ELF64: stfd
+; ELF64: lwz
+  store i32 %conv, i32* %b.addr, align 4
+  ret void
+}
+
+define void @fptoui_float_i64(float %a) nounwind ssp {
+entry:
+; ELF64: fptoui_float_i64
+  %b.addr = alloca i64, align 4
+  %conv = fptoui float %a to i64
+; ELF64: fctiduz
+; ELF64: stfd
+; ELF64: ld
+  store i64 %conv, i64* %b.addr, align 4
+  ret void
+}
+
+define void @fptoui_double_i32(double %a) nounwind ssp {
+entry:
+; ELF64: fptoui_double_i32
+  %b.addr = alloca i32, align 8
+  %conv = fptoui double %a to i32
+; ELF64: fctiwuz
+; ELF64: stfd
+; ELF64: lwz
+  store i32 %conv, i32* %b.addr, align 8
+  ret void
+}
+
+define void @fptoui_double_i64(double %a) nounwind ssp {
+entry:
+; ELF64: fptoui_double_i64
+  %b.addr = alloca i64, align 8
+  %conv = fptoui double %a to i64
+; ELF64: fctiduz
+; ELF64: stfd
+; ELF64: ld
+  store i64 %conv, i64* %b.addr, align 8
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-crash.ll b/test/CodeGen/PowerPC/fast-isel-crash.ll
new file mode 100644
index 0000000..1813fc9
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-crash.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7
+
+; Ensure this doesn't crash.
+
+%union.anon = type { <16 x i32> }
+
+@__md0 = external global [137 x i8]
+
+define internal void @stretch(<4 x i8> addrspace(1)* %src, <4 x i8> addrspace(1)* %dst, i32 %width, i32 %height, i32 %iLS, i32 %oLS, <2 x float> %c, <4 x float> %param) nounwind {
+entry:
+  ret void
+}
+
+define internal i32 @_Z13get_global_idj(i32 %dim) nounwind ssp {
+entry:
+  ret i32 undef
+}
+
+define void @wrap(i8 addrspace(1)* addrspace(1)* %arglist, i32 addrspace(1)* %gtid) nounwind ssp {
+entry:
+  call void @stretch(<4 x i8> addrspace(1)* undef, <4 x i8> addrspace(1)* undef, i32 undef, i32 undef, i32 undef, i32 undef, <2 x float> undef, <4 x float> undef)
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-ext.ll b/test/CodeGen/PowerPC/fast-isel-ext.ll
new file mode 100644
index 0000000..753305a
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-ext.ll
@@ -0,0 +1,75 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+; zext
+
+define i32 @zext_8_32(i8 %a) nounwind ssp {
+; ELF64: zext_8_32
+  %r = zext i8 %a to i32
+; ELF64: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31
+  ret i32 %r
+}
+
+define i32 @zext_16_32(i16 %a) nounwind ssp {
+; ELF64: zext_16_32
+  %r = zext i16 %a to i32
+; ELF64: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 16, 31
+  ret i32 %r
+}
+
+define i64 @zext_8_64(i8 %a) nounwind ssp {
+; ELF64: zext_8_64
+  %r = zext i8 %a to i64
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+  ret i64 %r
+}
+
+define i64 @zext_16_64(i16 %a) nounwind ssp {
+; ELF64: zext_16_64
+  %r = zext i16 %a to i64
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+  ret i64 %r
+}
+
+define i64 @zext_32_64(i32 %a) nounwind ssp {
+; ELF64: zext_32_64
+  %r = zext i32 %a to i64
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
+  ret i64 %r
+}
+
+; sext
+
+define i32 @sext_8_32(i8 %a) nounwind ssp {
+; ELF64: sext_8_32
+  %r = sext i8 %a to i32
+; ELF64: extsb
+  ret i32 %r
+}
+
+define i32 @sext_16_32(i16 %a) nounwind ssp {
+; ELF64: sext_16_32
+  %r = sext i16 %a to i32
+; ELF64: extsh
+  ret i32 %r
+}
+
+define i64 @sext_8_64(i8 %a) nounwind ssp {
+; ELF64: sext_8_64
+  %r = sext i8 %a to i64
+; ELF64: extsb
+  ret i64 %r
+}
+
+define i64 @sext_16_64(i16 %a) nounwind ssp {
+; ELF64: sext_16_64
+  %r = sext i16 %a to i64
+; ELF64: extsh
+  ret i64 %r
+}
+
+define i64 @sext_32_64(i32 %a) nounwind ssp {
+; ELF64: sext_32_64
+  %r = sext i32 %a to i64
+; ELF64: extsw
+  ret i64 %r
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-fold.ll b/test/CodeGen/PowerPC/fast-isel-fold.ll
new file mode 100644
index 0000000..4de345f
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-fold.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+@a = global i8 1, align 1
+@b = global i16 2, align 2
+@c = global i32 4, align 4
+
+define void @t1() nounwind uwtable ssp {
+; ELF64: t1
+  %1 = load i8* @a, align 1
+  call void @foo1(i8 zeroext %1)
+; ELF64: lbz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+  ret void
+}
+
+define void @t2() nounwind uwtable ssp {
+; ELF64: t2
+  %1 = load i16* @b, align 2
+  call void @foo2(i16 zeroext %1)
+; ELF64: lhz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+  ret void
+}
+
+define void @t2a() nounwind uwtable ssp {
+; ELF64: t2a
+  %1 = load i32* @c, align 4
+  call void @foo3(i32 zeroext %1)
+; ELF64: lwz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+  ret void
+}
+
+declare void @foo1(i8 zeroext)
+declare void @foo2(i16 zeroext)
+declare void @foo3(i32 zeroext)
+
+define i32 @t3() nounwind uwtable ssp {
+; ELF64: t3
+  %1 = load i8* @a, align 1
+  %2 = zext i8 %1 to i32
+; ELF64: lbz
+; ELF64-NOT: rlwinm
+  ret i32 %2
+}
+
+define i32 @t4() nounwind uwtable ssp {
+; ELF64: t4
+  %1 = load i16* @b, align 2
+  %2 = zext i16 %1 to i32
+; ELF64: lhz
+; ELF64-NOT: rlwinm
+  ret i32 %2
+}
+
+define i32 @t5() nounwind uwtable ssp {
+; ELF64: t5
+  %1 = load i16* @b, align 2
+  %2 = sext i16 %1 to i32
+; ELF64: lha
+; ELF64-NOT: rlwinm
+  ret i32 %2
+}
+
+define i32 @t6() nounwind uwtable ssp {
+; ELF64: t6
+  %1 = load i8* @a, align 2
+  %2 = sext i8 %1 to i32
+; ELF64: lbz
+; ELF64-NOT: rlwinm
+  ret i32 %2
+}
+
+define i64 @t7() nounwind uwtable ssp {
+; ELF64: t7
+  %1 = load i8* @a, align 1
+  %2 = zext i8 %1 to i64
+; ELF64: lbz
+; ELF64-NOT: rldicl
+  ret i64 %2
+}
+
+define i64 @t8() nounwind uwtable ssp {
+; ELF64: t8
+  %1 = load i16* @b, align 2
+  %2 = zext i16 %1 to i64
+; ELF64: lhz
+; ELF64-NOT: rldicl
+  ret i64 %2
+}
+
+define i64 @t9() nounwind uwtable ssp {
+; ELF64: t9
+  %1 = load i16* @b, align 2
+  %2 = sext i16 %1 to i64
+; ELF64: lha
+; ELF64-NOT: extsh
+  ret i64 %2
+}
+
+define i64 @t10() nounwind uwtable ssp {
+; ELF64: t10
+  %1 = load i8* @a, align 2
+  %2 = sext i8 %1 to i64
+; ELF64: lbz
+; ELF64: extsb
+  ret i64 %2
+}
+
+define i64 @t11() nounwind uwtable ssp {
+; ELF64: t11
+  %1 = load i32* @c, align 4
+  %2 = zext i32 %1 to i64
+; ELF64: lwz
+; ELF64-NOT: rldicl
+  ret i64 %2
+}
+
+define i64 @t12() nounwind uwtable ssp {
+; ELF64: t12
+  %1 = load i32* @c, align 4
+  %2 = sext i32 %1 to i64
+; ELF64: lwa
+; ELF64-NOT: extsw
+  ret i64 %2
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-indirectbr.ll b/test/CodeGen/PowerPC/fast-isel-indirectbr.ll
new file mode 100644
index 0000000..88ccf91
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-indirectbr.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define void @t1(i8* %x) {
+entry:
+; ELF64: t1
+  br label %L0
+
+L0:
+  br label %L1
+
+L1:
+  indirectbr i8* %x, [ label %L0, label %L1 ]
+; ELF64: mtctr 3
+; ELF64: bctr
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-load-store.ll b/test/CodeGen/PowerPC/fast-isel-load-store.ll
new file mode 100644
index 0000000..026b15f
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-load-store.ll
@@ -0,0 +1,202 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+; This test verifies that load/store instructions are properly generated,
+; and that they pass MI verification.
+
+@a = global i8 1, align 1
+@b = global i16 2, align 2
+@c = global i32 4, align 4
+@d = global i64 8, align 8
+@e = global float 1.25, align 4
+@f = global double 3.5, align 8
+
+%struct.s = type<{ i8, i32 }>
+%struct.t = type<{ i8, i64 }>
+
+@g = global %struct.s <{ i8 1, i32 2 }>, align 1
+@h = global %struct.t <{ i8 1, i64 2 }>, align 1
+
+@i = common global [8192 x i64] zeroinitializer, align 8
+
+; load
+
+define i8 @t1() nounwind uwtable ssp {
+; ELF64: t1
+  %1 = load i8* @a, align 1
+; ELF64: lbz
+  %2 = add nsw i8 %1, 1
+; ELF64: addi
+  ret i8 %2
+}
+
+define i16 @t2() nounwind uwtable ssp {
+; ELF64: t2
+  %1 = load i16* @b, align 2
+; ELF64: lhz
+  %2 = add nsw i16 %1, 1
+; ELF64: addi
+  ret i16 %2
+}
+
+define i32 @t3() nounwind uwtable ssp {
+; ELF64: t3
+  %1 = load i32* @c, align 4
+; ELF64: lwz
+  %2 = add nsw i32 %1, 1
+; ELF64: addi
+  ret i32 %2
+}
+
+define i64 @t4() nounwind uwtable ssp {
+; ELF64: t4
+  %1 = load i64* @d, align 4
+; ELF64: ld
+  %2 = add nsw i64 %1, 1
+; ELF64: addi
+  ret i64 %2
+}
+
+define float @t5() nounwind uwtable ssp {
+; ELF64: t5
+  %1 = load float* @e, align 4
+; ELF64: lfs
+  %2 = fadd float %1, 1.0
+; ELF64: fadds
+  ret float %2
+}
+
+define double @t6() nounwind uwtable ssp {
+; ELF64: t6
+  %1 = load double* @f, align 8
+; ELF64: lfd
+  %2 = fadd double %1, 1.0
+; ELF64: fadd
+  ret double %2
+}
+
+; store
+
+define void @t7(i8 %v) nounwind uwtable ssp {
+; ELF64: t7
+  %1 = add nsw i8 %v, 1
+  store i8 %1, i8* @a, align 1
+; ELF64: addis
+; ELF64: addi
+; ELF64: addi
+; ELF64: stb
+  ret void
+}
+
+define void @t8(i16 %v) nounwind uwtable ssp {
+; ELF64: t8
+  %1 = add nsw i16 %v, 1
+  store i16 %1, i16* @b, align 2
+; ELF64: addis
+; ELF64: addi
+; ELF64: addi
+; ELF64: sth
+  ret void
+}
+
+define void @t9(i32 %v) nounwind uwtable ssp {
+; ELF64: t9
+  %1 = add nsw i32 %v, 1
+  store i32 %1, i32* @c, align 4
+; ELF64: addis
+; ELF64: addi
+; ELF64: addi
+; ELF64: stw
+  ret void
+}
+
+define void @t10(i64 %v) nounwind uwtable ssp {
+; ELF64: t10
+  %1 = add nsw i64 %v, 1
+  store i64 %1, i64* @d, align 4
+; ELF64: addis
+; ELF64: addi
+; ELF64: addi
+; ELF64: std
+  ret void
+}
+
+define void @t11(float %v) nounwind uwtable ssp {
+; ELF64: t11
+  %1 = fadd float %v, 1.0
+  store float %1, float* @e, align 4
+; ELF64: fadds
+; ELF64: stfs
+  ret void
+}
+
+define void @t12(double %v) nounwind uwtable ssp {
+; ELF64: t12
+  %1 = fadd double %v, 1.0
+  store double %1, double* @f, align 8
+; ELF64: fadd
+; ELF64: stfd
+  ret void
+}
+
+;; lwa requires an offset divisible by 4, so we need lwax here.
+define i64 @t13() nounwind uwtable ssp {
+; ELF64: t13
+  %1 = load i32* getelementptr inbounds (%struct.s* @g, i32 0, i32 1), align 1
+  %2 = sext i32 %1 to i64
+; ELF64: li
+; ELF64: lwax
+  %3 = add nsw i64 %2, 1
+; ELF64: addi
+  ret i64 %3
+}
+
+;; ld requires an offset divisible by 4, so we need ldx here.
+define i64 @t14() nounwind uwtable ssp {
+; ELF64: t14
+  %1 = load i64* getelementptr inbounds (%struct.t* @h, i32 0, i32 1), align 1
+; ELF64: li
+; ELF64: ldx
+  %2 = add nsw i64 %1, 1
+; ELF64: addi
+  ret i64 %2
+}
+
+;; std requires an offset divisible by 4, so we need stdx here.
+define void @t15(i64 %v) nounwind uwtable ssp {
+; ELF64: t15
+  %1 = add nsw i64 %v, 1
+  store i64 %1, i64* getelementptr inbounds (%struct.t* @h, i32 0, i32 1), align 1
+; ELF64: addis
+; ELF64: addi
+; ELF64: addi
+; ELF64: li
+; ELF64: stdx
+  ret void
+}
+
+;; ld requires an offset that fits in 16 bits, so we need ldx here.
+define i64 @t16() nounwind uwtable ssp {
+; ELF64: t16
+  %1 = load i64* getelementptr inbounds ([8192 x i64]* @i, i32 0, i64 5000), align 8
+; ELF64: lis
+; ELF64: ori
+; ELF64: ldx
+  %2 = add nsw i64 %1, 1
+; ELF64: addi
+  ret i64 %2
+}
+
+;; std requires an offset that fits in 16 bits, so we need stdx here.
+define void @t17(i64 %v) nounwind uwtable ssp {
+; ELF64: t17
+  %1 = add nsw i64 %v, 1
+  store i64 %1, i64* getelementptr inbounds ([8192 x i64]* @i, i32 0, i64 5000), align 8
+; ELF64: addis
+; ELF64: ld
+; ELF64: addi
+; ELF64: lis
+; ELF64: ori
+; ELF64: stdx
+  ret void
+}
+
diff --git a/test/CodeGen/PowerPC/fast-isel-redefinition.ll b/test/CodeGen/PowerPC/fast-isel-redefinition.ll
new file mode 100644
index 0000000..72422bd
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-redefinition.ll
@@ -0,0 +1,10 @@
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -optimize-regalloc -regalloc=basic -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s
+; This isn't exactly a useful set of command-line options, but check that it
+; doesn't crash.  (It crashed formerly on ARM, and proved useful in
+; discovering a bug on PowerPC as well.)
+
+define i32 @f(i32* %x) nounwind ssp {
+  %y = getelementptr inbounds i32* %x, i32 5000
+  %tmp103 = load i32* %y, align 4
+  ret i32 %tmp103
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-ret.ll b/test/CodeGen/PowerPC/fast-isel-ret.ll
new file mode 100644
index 0000000..fa19f8b
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-ret.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define signext i8 @ret2(i8 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret2
+; ELF64: extsb
+; ELF64: blr
+  ret i8 %a
+}
+
+define zeroext i8 @ret3(i8 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret3
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+; ELF64: blr
+  ret i8 %a
+}
+
+define signext i16 @ret4(i16 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret4
+; ELF64: extsh
+; ELF64: blr
+  ret i16 %a
+}
+
+define zeroext i16 @ret5(i16 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret5
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64: blr
+  ret i16 %a
+}
+
+define i16 @ret6(i16 %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret6
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64: blr
+  ret i16 %a
+}
+
+define signext i32 @ret7(i32 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret7
+; ELF64: extsw
+; ELF64: blr
+  ret i32 %a
+}
+
+define zeroext i32 @ret8(i32 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret8
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
+; ELF64: blr
+  ret i32 %a
+}
+
+define i32 @ret9(i32 %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret9
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
+; ELF64: blr
+  ret i32 %a
+}
+
+define i64 @ret10(i64 %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret10
+; ELF64-NOT: exts
+; ELF64-NOT: rldicl
+; ELF64: blr
+  ret i64 %a
+}
+
+define float @ret11(float %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret11
+; ELF64: blr
+  ret float %a
+}
+
+define double @ret12(double %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret12
+; ELF64: blr
+  ret double %a
+}
+
+define i8 @ret13() nounwind uwtable ssp {
+entry:
+; ELF64: ret13
+; ELF64: li
+; ELF64: blr
+  ret i8 15;
+}
+
+define i16 @ret14() nounwind uwtable ssp {
+entry:
+; ELF64: ret14
+; ELF64: li
+; ELF64: blr
+  ret i16 -225;
+}
+
+define i32 @ret15() nounwind uwtable ssp {
+entry:
+; ELF64: ret15
+; ELF64: lis
+; ELF64: ori
+; ELF64: blr
+  ret i32 278135;
+}
+
+define i64 @ret16() nounwind uwtable ssp {
+entry:
+; ELF64: ret16
+; ELF64: li
+; ELF64: sldi
+; ELF64: oris
+; ELF64: ori
+; ELF64: blr
+  ret i64 27813515225;
+}
+
+define float @ret17() nounwind uwtable ssp {
+entry:
+; ELF64: ret17
+; ELF64: addis
+; ELF64: lfs
+; ELF64: blr
+  ret float 2.5;
+}
+
+define double @ret18() nounwind uwtable ssp {
+entry:
+; ELF64: ret18
+; ELF64: addis
+; ELF64: lfd
+; ELF64: blr
+  ret double 2.5e-33;
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-shifter.ll b/test/CodeGen/PowerPC/fast-isel-shifter.ll
new file mode 100644
index 0000000..198bfbe
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-shifter.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define i32 @shl() nounwind ssp {
+entry:
+; ELF64: shl
+; ELF64: slw
+  %shl = shl i32 -1, 2
+  ret i32 %shl
+}
+
+define i32 @shl_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ELF64: shl_reg
+; ELF64: slw
+  %shl = shl i32 %src1, %src2
+  ret i32 %shl
+}
+
+define i32 @lshr() nounwind ssp {
+entry:
+; ELF64: lshr
+; ELF64: srw
+  %lshr = lshr i32 -1, 2
+  ret i32 %lshr
+}
+
+define i32 @lshr_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ELF64: lshr_reg
+; ELF64: srw
+  %lshr = lshr i32 %src1, %src2
+  ret i32 %lshr
+}
+
+define i32 @ashr() nounwind ssp {
+entry:
+; ELF64: ashr
+; ELF64: srawi
+  %ashr = ashr i32 -1, 2
+  ret i32 %ashr
+}
+
+define i32 @ashr_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ELF64: ashr_reg
+; ELF64: sraw
+  %ashr = ashr i32 %src1, %src2
+  ret i32 %ashr
+}
+
diff --git a/test/CodeGen/PowerPC/fastisel-gep-promote-before-add.ll b/test/CodeGen/PowerPC/fastisel-gep-promote-before-add.ll
new file mode 100644
index 0000000..4bcacf0
--- /dev/null
+++ b/test/CodeGen/PowerPC/fastisel-gep-promote-before-add.ll
@@ -0,0 +1,17 @@
+; fastisel should not fold add with non-pointer bitwidth
+; sext(a) + sext(b) != sext(a + b)
+; RUN: llc -mtriple=powerpc64-unknown-freebsd10.0 %s -O0 -o - | FileCheck %s
+
+define zeroext i8 @gep_promotion(i8* %ptr) nounwind uwtable ssp {
+entry:
+  %ptr.addr = alloca i8*, align 8
+  %add = add i8 64, 64 ; 0x40 + 0x40
+  %0 = load i8** %ptr.addr, align 8
+
+  ; CHECK-LABEL: gep_promotion:
+  ; CHECK: lbz {{[0-9]+}}, 0({{.*}})
+  %arrayidx = getelementptr inbounds i8* %0, i8 %add
+
+  %1 = load i8* %arrayidx, align 1
+  ret i8 %1
+}
diff --git a/test/CodeGen/PowerPC/fcpsgn.ll b/test/CodeGen/PowerPC/fcpsgn.ll
new file mode 100644
index 0000000..f469981
--- /dev/null
+++ b/test/CodeGen/PowerPC/fcpsgn.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define double @foo_dd(double %a, double %b) #0 {
+entry:
+  %call = tail call double @copysign(double %a, double %b) #0
+  ret double %call
+
+; CHECK-LABEL: @foo_dd
+; CHECK: fcpsgn 1, 2, 1
+; CHECK: blr
+}
+
+declare double @copysign(double, double) #0
+
+define float @foo_ss(float %a, float %b) #0 {
+entry:
+  %call = tail call float @copysignf(float %a, float %b) #0
+  ret float %call
+
+; CHECK-LABEL: @foo_ss
+; CHECK: fcpsgn 1, 2, 1
+; CHECK: blr
+}
+
+declare float @copysignf(float, float) #0
+
+define float @foo_sd(float %a, double %b) #0 {
+entry:
+  %conv = fptrunc double %b to float
+  %call = tail call float @copysignf(float %a, float %conv) #0
+  ret float %call
+
+; CHECK-LABEL: @foo_sd
+; CHECK: fcpsgn 1, 2, 1
+; CHECK: blr
+}
+
+define double @foo_ds(double %a, float %b) #0 {
+entry:
+  %conv = fpext float %b to double
+  %call = tail call double @copysign(double %a, double %conv) #0
+  ret double %call
+
+; CHECK-LABEL: @foo_ds
+; CHECK: fcpsgn 1, 2, 1
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/frameaddr.ll b/test/CodeGen/PowerPC/frameaddr.ll
index eabd4a6..4480273 100644
--- a/test/CodeGen/PowerPC/frameaddr.ll
+++ b/test/CodeGen/PowerPC/frameaddr.ll
@@ -40,8 +40,8 @@ declare void @use(i8*)
 
 declare i8* @llvm.frameaddress(i32) #2
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { noreturn nounwind }
 attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind naked "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind naked "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
diff --git a/test/CodeGen/PowerPC/glob-comp-aa-crash.ll b/test/CodeGen/PowerPC/glob-comp-aa-crash.ll
new file mode 100644
index 0000000..f97d0ff
--- /dev/null
+++ b/test/CodeGen/PowerPC/glob-comp-aa-crash.ll
@@ -0,0 +1,139 @@
+; RUN: llc -mtriple=powerpc64-bgq-linux -mcpu=a2 < %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+%"class.std::__1::__assoc_sub_state" = type { %"class.std::__1::__shared_count", %"class.std::__exception_ptr::exception_ptr", %"class.std::__1::mutex", %"class.std::__1::condition_variable", i32 }
+%"class.std::__1::__shared_count" = type { i32 (...)**, i64 }
+%"class.std::__exception_ptr::exception_ptr" = type { i8* }
+%"class.std::__1::mutex" = type { %union.pthread_mutex_t }
+%union.pthread_mutex_t = type { %"struct.<anonymous union>::__pthread_mutex_s" }
+%"struct.<anonymous union>::__pthread_mutex_s" = type { i32, i32, i32, i32, i32, i32, %struct.__pthread_internal_list }
+%struct.__pthread_internal_list = type { %struct.__pthread_internal_list*, %struct.__pthread_internal_list* }
+%"class.std::__1::condition_variable" = type { %union.pthread_cond_t }
+%union.pthread_cond_t = type { %struct.anon }
+%struct.anon = type { i32, i32, i64, i64, i64, i8*, i32, i32 }
+%"class.std::__1::unique_lock" = type { %"class.std::__1::mutex"*, i8 }
+
+declare i32 @__gxx_personality_v0(...)
+
+; Function Attrs: optsize
+define void @_ZNSt3__117__assoc_sub_state4copyEv(%"class.std::__1::__assoc_sub_state"* %this) #0 align 2 {
+entry:
+  %__lk = alloca %"class.std::__1::unique_lock", align 8
+  %ref.tmp = alloca %"class.std::__exception_ptr::exception_ptr", align 8
+  %tmp = alloca { i64, i64 }, align 8
+  %agg.tmp = alloca %"class.std::__exception_ptr::exception_ptr", align 8
+  %__mut_ = getelementptr inbounds %"class.std::__1::__assoc_sub_state"* %this, i64 0, i32 2
+  %__m_.i.i = getelementptr inbounds %"class.std::__1::unique_lock"* %__lk, i64 0, i32 0
+  store %"class.std::__1::mutex"* %__mut_, %"class.std::__1::mutex"** %__m_.i.i, align 8, !tbaa !5
+  %__owns_.i.i = getelementptr inbounds %"class.std::__1::unique_lock"* %__lk, i64 0, i32 1
+  store i8 1, i8* %__owns_.i.i, align 8, !tbaa !6
+  call void @_ZNSt3__15mutex4lockEv(%"class.std::__1::mutex"* %__mut_) #4
+  invoke void @_ZNSt3__117__assoc_sub_state10__sub_waitERNS_11unique_lockINS_5mutexEEE(%"class.std::__1::__assoc_sub_state"* %this, %"class.std::__1::unique_lock"* %__lk) #4
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  %__exception_ = getelementptr inbounds %"class.std::__1::__assoc_sub_state"* %this, i64 0, i32 1
+  %0 = bitcast { i64, i64 }* %tmp to i8*
+  call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 16, i32 8, i1 false)
+  call void @_ZNSt15__exception_ptr13exception_ptrC1EMS0_FvvE(%"class.std::__exception_ptr::exception_ptr"* %ref.tmp, { i64, i64 }* byval %tmp) #5
+  %call = call zeroext i1 @_ZNSt15__exception_ptrneERKNS_13exception_ptrES2_(%"class.std::__exception_ptr::exception_ptr"* %__exception_, %"class.std::__exception_ptr::exception_ptr"* %ref.tmp) #5
+  call void @_ZNSt15__exception_ptr13exception_ptrD1Ev(%"class.std::__exception_ptr::exception_ptr"* %ref.tmp) #5
+  br i1 %call, label %if.then, label %if.end
+
+if.then:                                          ; preds = %invoke.cont
+  call void @_ZNSt15__exception_ptr13exception_ptrC1ERKS0_(%"class.std::__exception_ptr::exception_ptr"* %agg.tmp, %"class.std::__exception_ptr::exception_ptr"* %__exception_) #5
+  invoke void @_ZSt17rethrow_exceptionNSt15__exception_ptr13exception_ptrE(%"class.std::__exception_ptr::exception_ptr"* %agg.tmp) #6
+          to label %invoke.cont4 unwind label %lpad3
+
+invoke.cont4:                                     ; preds = %if.then
+  unreachable
+
+lpad:                                             ; preds = %entry
+  %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  %2 = extractvalue { i8*, i32 } %1, 0
+  %3 = extractvalue { i8*, i32 } %1, 1
+  br label %ehcleanup
+
+lpad3:                                            ; preds = %if.then
+  %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  %5 = extractvalue { i8*, i32 } %4, 0
+  %6 = extractvalue { i8*, i32 } %4, 1
+  call void @_ZNSt15__exception_ptr13exception_ptrD1Ev(%"class.std::__exception_ptr::exception_ptr"* %agg.tmp) #5
+  br label %ehcleanup
+
+if.end:                                           ; preds = %invoke.cont
+  %7 = load i8* %__owns_.i.i, align 8, !tbaa !6, !range !4
+  %tobool.i.i = icmp eq i8 %7, 0
+  br i1 %tobool.i.i, label %_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit, label %if.then.i.i
+
+if.then.i.i:                                      ; preds = %if.end
+  %8 = load %"class.std::__1::mutex"** %__m_.i.i, align 8, !tbaa !5
+  call void @_ZNSt3__15mutex6unlockEv(%"class.std::__1::mutex"* %8) #5
+  br label %_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit
+
+_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit:      ; preds = %if.then.i.i, %if.end
+  ret void
+
+ehcleanup:                                        ; preds = %lpad3, %lpad
+  %exn.slot.0 = phi i8* [ %5, %lpad3 ], [ %2, %lpad ]
+  %ehselector.slot.0 = phi i32 [ %6, %lpad3 ], [ %3, %lpad ]
+  %9 = load i8* %__owns_.i.i, align 8, !tbaa !6, !range !4
+  %tobool.i.i9 = icmp eq i8 %9, 0
+  br i1 %tobool.i.i9, label %_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit12, label %if.then.i.i11
+
+if.then.i.i11:                                    ; preds = %ehcleanup
+  %10 = load %"class.std::__1::mutex"** %__m_.i.i, align 8, !tbaa !5
+  call void @_ZNSt3__15mutex6unlockEv(%"class.std::__1::mutex"* %10) #5
+  br label %_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit12
+
+_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit12:    ; preds = %if.then.i.i11, %ehcleanup
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0
+  %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1
+  resume { i8*, i32 } %lpad.val5
+}
+
+; Function Attrs: optsize
+declare void @_ZNSt3__117__assoc_sub_state10__sub_waitERNS_11unique_lockINS_5mutexEEE(%"class.std::__1::__assoc_sub_state"*, %"class.std::__1::unique_lock"*) #0 align 2
+
+; Function Attrs: nounwind optsize
+declare zeroext i1 @_ZNSt15__exception_ptrneERKNS_13exception_ptrES2_(%"class.std::__exception_ptr::exception_ptr"*, %"class.std::__exception_ptr::exception_ptr"*) #1
+
+; Function Attrs: nounwind optsize
+declare void @_ZNSt15__exception_ptr13exception_ptrC1EMS0_FvvE(%"class.std::__exception_ptr::exception_ptr"*, { i64, i64 }* byval) #1
+
+; Function Attrs: nounwind optsize
+declare void @_ZNSt15__exception_ptr13exception_ptrD1Ev(%"class.std::__exception_ptr::exception_ptr"*) #1
+
+; Function Attrs: noreturn optsize
+declare void @_ZSt17rethrow_exceptionNSt15__exception_ptr13exception_ptrE(%"class.std::__exception_ptr::exception_ptr"*) #2
+
+; Function Attrs: nounwind optsize
+declare void @_ZNSt15__exception_ptr13exception_ptrC1ERKS0_(%"class.std::__exception_ptr::exception_ptr"*, %"class.std::__exception_ptr::exception_ptr"*) #1
+
+; Function Attrs: nounwind optsize
+declare void @_ZNSt3__15mutex6unlockEv(%"class.std::__1::mutex"*) #1
+
+; Function Attrs: optsize
+declare void @_ZNSt3__15mutex4lockEv(%"class.std::__1::mutex"*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3
+
+attributes #0 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noreturn optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { optsize }
+attributes #5 = { nounwind optsize }
+attributes #6 = { noreturn optsize }
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"bool", metadata !1}
+!4 = metadata !{i8 0, i8 2}
+!5 = metadata !{metadata !0, metadata !0, i64 0}
+!6 = metadata !{metadata !3, metadata !3, i64 0}
diff --git a/test/CodeGen/PowerPC/hello-reloc.s b/test/CodeGen/PowerPC/hello-reloc.s
new file mode 100644
index 0000000..9bbfb38
--- /dev/null
+++ b/test/CodeGen/PowerPC/hello-reloc.s
@@ -0,0 +1,84 @@
+; This tests for the basic implementation of PPCMachObjectWriter.cpp,
+; which is responsible for writing mach-o relocation entries for (PIC)
+; PowerPC objects.
+; NOTE: Darwin PPC asm syntax is not yet supported by PPCAsmParser,
+; so this test case uses ELF PPC asm syntax to produce a mach-o object.
+; Once PPCAsmParser supports darwin asm syntax, this test case should
+; be updated accordingly.  
+
+; RUN: llvm-mc -filetype=obj -relocation-model=pic -mcpu=g4 -triple=powerpc-apple-darwin8 %s -o - | llvm-readobj -relocations | FileCheck -check-prefix=DARWIN-G4-DUMP %s
+
+;	.machine ppc7400
+	.section	__TEXT,__textcoal_nt,coalesced,pure_instructions
+	.section	__TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
+	.section	__TEXT,__text,regular,pure_instructions
+	.globl	_main
+	.align	4
+_main:                                  ; @main
+; BB#0:                                 ; %entry
+	mflr 0
+	stw 31, -4(1)
+	stw 0, 8(1)
+	stwu 1, -80(1)
+	bl L0$pb
+L0$pb:
+	mr 31, 1
+	li 5, 0
+	mflr 2
+	stw 3, 68(31)
+	stw 5, 72(31)
+	stw 4, 64(31)
+	addis 2, 2, (L_.str-L0$pb)@ha
+	la 3, (L_.str-L0$pb)@l(2)
+	bl L_puts$stub
+	li 3, 0
+	addi 1, 1, 80
+	lwz 0, 8(1)
+	lwz 31, -4(1)
+	mtlr 0
+	blr
+
+	.section	__TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
+	.align	4
+L_puts$stub:
+	.indirect_symbol	_puts
+	mflr 0
+	bcl 20, 31, L_puts$stub$tmp
+L_puts$stub$tmp:
+	mflr 11
+	addis 11, 11, (L_puts$lazy_ptr-L_puts$stub$tmp)@ha
+	mtlr 0
+	lwzu 12, (L_puts$lazy_ptr-L_puts$stub$tmp)@l(11)
+	mtctr 12
+	bctr
+	.section	__DATA,__la_symbol_ptr,lazy_symbol_pointers
+L_puts$lazy_ptr:
+	.indirect_symbol	_puts
+	.long	dyld_stub_binding_helper
+
+.subsections_via_symbols
+	.section	__TEXT,__cstring,cstring_literals
+L_.str:                                 ; @.str
+	.asciz	 "Hello, world!"
+
+; DARWIN-G4-DUMP:Format: Mach-O 32-bit ppc
+; DARWIN-G4-DUMP:Arch: powerpc
+; DARWIN-G4-DUMP:AddressSize: 32bit
+; DARWIN-G4-DUMP:Relocations [
+; DARWIN-G4-DUMP:  Section __text {
+; DARWIN-G4-DUMP:    0x34 1 2 0 PPC_RELOC_BR24 0 -
+; DARWIN-G4-DUMP:    0x30 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 _main
+; DARWIN-G4-DUMP:    0x0 0 2 n/a PPC_RELOC_PAIR 1 _main
+; DARWIN-G4-DUMP:    0x2C 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 _main
+; DARWIN-G4-DUMP:    0x60 0 2 n/a PPC_RELOC_PAIR 1 _main
+; DARWIN-G4-DUMP:  }
+; DARWIN-G4-DUMP:  Section __picsymbolstub1 {
+; DARWIN-G4-DUMP:    0x14 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 _main
+; DARWIN-G4-DUMP:    0x0 0 2 n/a PPC_RELOC_PAIR 1 _main
+; DARWIN-G4-DUMP:    0xC 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 _main
+; DARWIN-G4-DUMP:    0x18 0 2 n/a PPC_RELOC_PAIR 1 _main
+; DARWIN-G4-DUMP:  }
+; DARWIN-G4-DUMP:  Section __la_symbol_ptr {
+; DARWIN-G4-DUMP:    0x0 0 2 1 PPC_RELOC_VANILLA 0 dyld_stub_binding_helper
+; DARWIN-G4-DUMP:  }
+; DARWIN-G4-DUMP:]
diff --git a/test/CodeGen/PowerPC/i64_fp_round.ll b/test/CodeGen/PowerPC/i64_fp_round.ll
index d2a3239..5770d78 100644
--- a/test/CodeGen/PowerPC/i64_fp_round.ll
+++ b/test/CodeGen/PowerPC/i64_fp_round.ll
@@ -22,6 +22,6 @@ entry:
 ; Also check that with -enable-unsafe-fp-math we do not get that extra
 ; code sequence.  Simply verify that there is no "isel" present.
 
-; RUN: llc -mcpu=pwr7 -mattr=-fpcvt -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=UNSAFE
+; RUN: llc -mcpu=pwr7 -mattr=-fpcvt -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=CHECK-UNSAFE
 ; CHECK-UNSAFE-NOT: isel
 
diff --git a/test/CodeGen/PowerPC/inlineasm-i64-reg.ll b/test/CodeGen/PowerPC/inlineasm-i64-reg.ll
index fa9aa45..5e31cd5 100644
--- a/test/CodeGen/PowerPC/inlineasm-i64-reg.ll
+++ b/test/CodeGen/PowerPC/inlineasm-i64-reg.ll
@@ -59,6 +59,49 @@ entry:
   ret i32 %conv
 }
 
+declare void @mtrace()
+
+define signext i32 @main(i32 signext %argc, i8** %argv) {
+entry:
+  %argc.addr = alloca i32, align 4
+  store i32 %argc, i32* %argc.addr, align 4
+  %0 = call { i64, i64 } asm sideeffect "sc", "={r0},={r3},{r0},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1076)
+  %asmresult1.i = extractvalue { i64, i64 } %0, 1
+  %conv.i = trunc i64 %asmresult1.i to i32
+  %cmp = icmp eq i32 %conv.i, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+; CHECK-LABEL: @main
+
+; CHECK-DAG: mr [[REG:[0-9]+]], 3
+; CHECK-DAG: li 0, 1076
+; CHECK:     stw [[REG]],
+
+; CHECK:     #APP
+; CHECK:     sc
+; CHECK:     #NO_APP
+                                      
+; CHECK:     cmpwi {{[0-9]+}}, [[REG]], 1
+
+; CHECK: blr
+
+if.then:                                          ; preds = %entry
+  call void @mtrace()
+  %.pre = load i32* %argc.addr, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %1 = phi i32 [ %.pre, %if.then ], [ %argc, %entry ]
+  %cmp1 = icmp slt i32 %1, 2
+  br i1 %cmp1, label %usage, label %if.end40
+
+usage:    
+  ret i32 8
+
+if.end40:
+  ret i32 0
+}
+
 attributes #0 = { alwaysinline inlinehint nounwind }
 attributes #1 = { nounwind }
 
diff --git a/test/CodeGen/PowerPC/isel-rc-nox0.ll b/test/CodeGen/PowerPC/isel-rc-nox0.ll
index 7d425cc..ac99aa4 100644
--- a/test/CodeGen/PowerPC/isel-rc-nox0.ll
+++ b/test/CodeGen/PowerPC/isel-rc-nox0.ll
@@ -22,7 +22,7 @@ crc32_gentab.exit:                                ; preds = %for.cond1.preheader
 
 for.cond1.preheader.i2961.i:                      ; preds = %for.inc44.i2977.i, %crc32_gentab.exit
   call void @llvm.memset.p0i8.i64(i8* bitcast ([1 x [9 x i32]]* @g_62 to i8*), i8 -1, i64 36, i32 4, i1 false) #1
-  %0 = load i32* %retval.0.i.i.i, align 4, !tbaa !0
+  %0 = load i32* %retval.0.i.i.i, align 4
   %tobool.i2967.i = icmp eq i32 %0, 0
   br label %for.body21.i2968.i
 
@@ -42,9 +42,5 @@ func_80.exit2978.i:                               ; preds = %for.inc44.i2977.i
 ; Function Attrs: nounwind
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "ssp-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "ssp-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/lit.local.cfg b/test/CodeGen/PowerPC/lit.local.cfg
index aaa31d9..2e46300 100644
--- a/test/CodeGen/PowerPC/lit.local.cfg
+++ b/test/CodeGen/PowerPC/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
 targets = set(config.root.targets_to_build.split())
 if not 'PowerPC' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/PowerPC/mcm-2.ll b/test/CodeGen/PowerPC/mcm-2.ll
index d4f40f7..fee98d8 100644
--- a/test/CodeGen/PowerPC/mcm-2.ll
+++ b/test/CodeGen/PowerPC/mcm-2.ll
@@ -31,7 +31,9 @@ entry:
 ; LARGE: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]])
 ; LARGE: lwz {{[0-9]+}}, 0([[REG2]])
 ; LARGE: stw {{[0-9]+}}, 0([[REG2]])
-; LARGE: .type [[VAR]],@object
-; LARGE: .local [[VAR]]
-; LARGE: .comm [[VAR]],4,4
+; LARGE: [[VAR]]:
+; LARGE: .tc [[VAR2:[a-z0-9A-Z_.]+]][TC],[[VAR2]]
+; LARGE: .type [[VAR2]],@object
+; LARGE: .local [[VAR2]]
+; LARGE: .comm [[VAR2]],4,4
 
diff --git a/test/CodeGen/PowerPC/mcm-3.ll b/test/CodeGen/PowerPC/mcm-3.ll
index ce151fb..b6d681d 100644
--- a/test/CodeGen/PowerPC/mcm-3.ll
+++ b/test/CodeGen/PowerPC/mcm-3.ll
@@ -33,9 +33,11 @@ entry:
 ; LARGE: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]])
 ; LARGE: lwz {{[0-9]+}}, 0([[REG2]])
 ; LARGE: stw {{[0-9]+}}, 0([[REG2]])
-; LARGE: .type [[VAR]],@object
-; LARGE: .data
-; LARGE: .globl [[VAR]]
 ; LARGE: [[VAR]]:
+; LARGE: .tc [[VAR2:[a-z0-9A-Z_.]+]][TC],[[VAR2]]
+; LARGE: .type [[VAR2]],@object
+; LARGE: .data
+; LARGE: .globl [[VAR2]]
+; LARGE: [[VAR2]]:
 ; LARGE: .long 5
 
diff --git a/test/CodeGen/PowerPC/mcm-4.ll b/test/CodeGen/PowerPC/mcm-4.ll
index 7d7b132..73dd902 100644
--- a/test/CodeGen/PowerPC/mcm-4.ll
+++ b/test/CodeGen/PowerPC/mcm-4.ll
@@ -22,6 +22,6 @@ entry:
 ; LARGE: [[VAR:[a-z0-9A-Z_.]+]]:
 ; LARGE: .quad 4562098671269285104
 ; LARGE-LABEL: test_double_const:
-; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
-; LARGE: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]])
+; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR2:[a-z0-9A-Z_.]+]]@toc@ha
+; LARGE: ld [[REG2:[0-9]+]], [[VAR2]]@toc@l([[REG1]])
 ; LARGE: lfd {{[0-9]+}}, 0([[REG2]])
diff --git a/test/CodeGen/PowerPC/mcm-9.ll b/test/CodeGen/PowerPC/mcm-9.ll
index e587f61..7906b6a 100644
--- a/test/CodeGen/PowerPC/mcm-9.ll
+++ b/test/CodeGen/PowerPC/mcm-9.ll
@@ -7,8 +7,7 @@
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
-@ei = external global i32
-@a = alias i32* @ei
+@a = external global i32
 
 define signext i32 @test_external() nounwind {
 entry:
diff --git a/test/CodeGen/PowerPC/negctr.ll b/test/CodeGen/PowerPC/negctr.ll
index ef33bb7..2e64993 100644
--- a/test/CodeGen/PowerPC/negctr.ll
+++ b/test/CodeGen/PowerPC/negctr.ll
@@ -83,4 +83,4 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/optcmp.ll b/test/CodeGen/PowerPC/optcmp.ll
index 523f329..35aabfa 100644
--- a/test/CodeGen/PowerPC/optcmp.ll
+++ b/test/CodeGen/PowerPC/optcmp.ll
@@ -5,7 +5,7 @@ target triple = "powerpc64-unknown-linux-gnu"
 define signext i32 @foo(i32 signext %a, i32 signext %b, i32* nocapture %c) #0 {
 entry:
   %sub = sub nsw i32 %a, %b
-  store i32 %sub, i32* %c, align 4, !tbaa !0
+  store i32 %sub, i32* %c, align 4
   %cmp = icmp sgt i32 %a, %b
   %cond = select i1 %cmp, i32 %a, i32 %b
   ret i32 %cond
@@ -17,7 +17,7 @@ entry:
 define signext i32 @foo2(i32 signext %a, i32 signext %b, i32* nocapture %c) #0 {
 entry:
   %shl = shl i32 %a, %b
-  store i32 %shl, i32* %c, align 4, !tbaa !0
+  store i32 %shl, i32* %c, align 4
   %cmp = icmp sgt i32 %shl, 0
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -29,7 +29,7 @@ entry:
 define i64 @fool(i64 %a, i64 %b, i64* nocapture %c) #0 {
 entry:
   %sub = sub nsw i64 %a, %b
-  store i64 %sub, i64* %c, align 8, !tbaa !3
+  store i64 %sub, i64* %c, align 8
   %cmp = icmp sgt i64 %a, %b
   %cond = select i1 %cmp, i64 %a, i64 %b
   ret i64 %cond
@@ -43,7 +43,7 @@ entry:
 define i64 @foolb(i64 %a, i64 %b, i64* nocapture %c) #0 {
 entry:
   %sub = sub nsw i64 %a, %b
-  store i64 %sub, i64* %c, align 8, !tbaa !3
+  store i64 %sub, i64* %c, align 8
   %cmp = icmp sle i64 %a, %b
   %cond = select i1 %cmp, i64 %a, i64 %b
   ret i64 %cond
@@ -57,7 +57,7 @@ entry:
 define i64 @foolc(i64 %a, i64 %b, i64* nocapture %c) #0 {
 entry:
   %sub = sub nsw i64 %b, %a
-  store i64 %sub, i64* %c, align 8, !tbaa !3
+  store i64 %sub, i64* %c, align 8
   %cmp = icmp sgt i64 %a, %b
   %cond = select i1 %cmp, i64 %a, i64 %b
   ret i64 %cond
@@ -71,7 +71,7 @@ entry:
 define i64 @foold(i64 %a, i64 %b, i64* nocapture %c) #0 {
 entry:
   %sub = sub nsw i64 %b, %a
-  store i64 %sub, i64* %c, align 8, !tbaa !3
+  store i64 %sub, i64* %c, align 8
   %cmp = icmp eq i64 %a, %b
   %cond = select i1 %cmp, i64 %a, i64 %b
   ret i64 %cond
@@ -85,7 +85,7 @@ entry:
 define i64 @foold2(i64 %a, i64 %b, i64* nocapture %c) #0 {
 entry:
   %sub = sub nsw i64 %a, %b
-  store i64 %sub, i64* %c, align 8, !tbaa !3
+  store i64 %sub, i64* %c, align 8
   %cmp = icmp eq i64 %a, %b
   %cond = select i1 %cmp, i64 %a, i64 %b
   ret i64 %cond
@@ -99,7 +99,7 @@ entry:
 define i64 @foo2l(i64 %a, i64 %b, i64* nocapture %c) #0 {
 entry:
   %shl = shl i64 %a, %b
-  store i64 %shl, i64* %c, align 8, !tbaa !3
+  store i64 %shl, i64* %c, align 8
   %cmp = icmp sgt i64 %shl, 0
   %conv1 = zext i1 %cmp to i64
   ret i64 %conv1
@@ -112,7 +112,7 @@ entry:
 define double @food(double %a, double %b, double* nocapture %c) #0 {
 entry:
   %sub = fsub double %a, %b
-  store double %sub, double* %c, align 8, !tbaa !3
+  store double %sub, double* %c, align 8
   %cmp = fcmp ogt double %a, %b
   %cond = select i1 %cmp, double %a, double %b
   ret double %cond
@@ -125,7 +125,7 @@ entry:
 define float @foof(float %a, float %b, float* nocapture %c) #0 {
 entry:
   %sub = fsub float %a, %b
-  store float %sub, float* %c, align 4, !tbaa !3
+  store float %sub, float* %c, align 4
   %cmp = fcmp ogt float %a, %b
   %cond = select i1 %cmp, float %a, float %b
   ret float %cond
@@ -135,9 +135,18 @@ entry:
 ; CHECK: stfs 0, 0(5)
 }
 
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"long", metadata !1}
-!4 = metadata !{metadata !"any pointer", metadata !1}
+declare i64 @llvm.ctpop.i64(i64);
+
+define signext i64 @fooct(i64 signext %a, i64 signext %b, i64* nocapture %c) #0 {
+entry:
+  %sub = sub nsw i64 %a, %b
+  %subc = call i64 @llvm.ctpop.i64(i64 %sub)
+  store i64 %subc, i64* %c, align 4
+  %cmp = icmp sgt i64 %subc, 0
+  %cond = select i1 %cmp, i64 %a, i64 %b
+  ret i64 %cond
+
+; CHECK: @fooct
+; CHECK-NOT: popcntd.
+}
 
diff --git a/test/CodeGen/PowerPC/pr15031.ll b/test/CodeGen/PowerPC/pr15031.ll
index 5ccf941..e58ad80 100644
--- a/test/CodeGen/PowerPC/pr15031.ll
+++ b/test/CodeGen/PowerPC/pr15031.ll
@@ -317,54 +317,42 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %entry, %if.then
   %Reg.addr.0 = phi i32 [ %call3, %if.then ], [ %Reg, %entry ]
   %RegNo.i.i = getelementptr inbounds %"class.llvm::MachineOperand"* %this, i64 0, i32 2, i32 0
-  %1 = load i32* %RegNo.i.i, align 4, !tbaa !0
+  %1 = load i32* %RegNo.i.i, align 4
   %cmp.i = icmp eq i32 %1, %Reg.addr.0
   br i1 %cmp.i, label %_ZN4llvm14MachineOperand6setRegEj.exit, label %if.end.i
 
 if.end.i:                                         ; preds = %if.end
   %ParentMI.i.i = getelementptr inbounds %"class.llvm::MachineOperand"* %this, i64 0, i32 3
-  %2 = load %"class.llvm::MachineInstr"** %ParentMI.i.i, align 8, !tbaa !3
+  %2 = load %"class.llvm::MachineInstr"** %ParentMI.i.i, align 8
   %tobool.i = icmp eq %"class.llvm::MachineInstr"* %2, null
   br i1 %tobool.i, label %if.end13.i, label %if.then3.i
 
 if.then3.i:                                       ; preds = %if.end.i
   %Parent.i.i = getelementptr inbounds %"class.llvm::MachineInstr"* %2, i64 0, i32 2
-  %3 = load %"class.llvm::MachineBasicBlock"** %Parent.i.i, align 8, !tbaa !3
+  %3 = load %"class.llvm::MachineBasicBlock"** %Parent.i.i, align 8
   %tobool5.i = icmp eq %"class.llvm::MachineBasicBlock"* %3, null
   br i1 %tobool5.i, label %if.end13.i, label %if.then6.i
 
 if.then6.i:                                       ; preds = %if.then3.i
   %xParent.i.i = getelementptr inbounds %"class.llvm::MachineBasicBlock"* %3, i64 0, i32 4
-  %4 = load %"class.llvm::MachineFunction"** %xParent.i.i, align 8, !tbaa !3
+  %4 = load %"class.llvm::MachineFunction"** %xParent.i.i, align 8
   %tobool8.i = icmp eq %"class.llvm::MachineFunction"* %4, null
   br i1 %tobool8.i, label %if.end13.i, label %if.then9.i
 
 if.then9.i:                                       ; preds = %if.then6.i
   %RegInfo.i.i = getelementptr inbounds %"class.llvm::MachineFunction"* %4, i64 0, i32 5
-  %5 = load %"class.llvm::MachineRegisterInfo"** %RegInfo.i.i, align 8, !tbaa !3
+  %5 = load %"class.llvm::MachineRegisterInfo"** %RegInfo.i.i, align 8
   tail call void @_ZN4llvm19MachineRegisterInfo27removeRegOperandFromUseListEPNS_14MachineOperandE(%"class.llvm::MachineRegisterInfo"* %5, %"class.llvm::MachineOperand"* %this)
-  store i32 %Reg.addr.0, i32* %RegNo.i.i, align 4, !tbaa !0
+  store i32 %Reg.addr.0, i32* %RegNo.i.i, align 4
   tail call void @_ZN4llvm19MachineRegisterInfo22addRegOperandToUseListEPNS_14MachineOperandE(%"class.llvm::MachineRegisterInfo"* %5, %"class.llvm::MachineOperand"* %this)
   br label %_ZN4llvm14MachineOperand6setRegEj.exit
 
 if.end13.i:                                       ; preds = %if.then6.i, %if.then3.i, %if.end.i
-  store i32 %Reg.addr.0, i32* %RegNo.i.i, align 4, !tbaa !0
+  store i32 %Reg.addr.0, i32* %RegNo.i.i, align 4
   br label %_ZN4llvm14MachineOperand6setRegEj.exit
 
 _ZN4llvm14MachineOperand6setRegEj.exit:           ; preds = %if.end, %if.then9.i, %if.end13.i
   ret void
 }
 
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
-!4 = metadata !{metadata !"vtable pointer", metadata !2}
-!5 = metadata !{metadata !"long", metadata !1}
-!6 = metadata !{i64 0, i64 8, metadata !3, i64 8, i64 8, metadata !5}
-!7 = metadata !{metadata !"short", metadata !1}
-!8 = metadata !{i64 0, i64 1, metadata !1, i64 1, i64 4, metadata !0, i64 2, i64 1, metadata !1, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 4, i64 4, metadata !0, i64 4, i64 4, metadata !0, i64 8, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !5, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 24, i64 8, metadata !3, i64 16, i64 4, metadata !0, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 24, i64 4, metadata !0}
-!9 = metadata !{metadata !"bool", metadata !1}
-!10 = metadata !{i8 0, i8 2}
-
 ; CHECK-NOT: lbzu 3, 1(3)
diff --git a/test/CodeGen/PowerPC/pr17168.ll b/test/CodeGen/PowerPC/pr17168.ll
new file mode 100644
index 0000000..2848221
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr17168.ll
@@ -0,0 +1,521 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s
+
+; This test formerly failed due to a DBG_VALUE being placed prior to a PHI
+; when fast-isel is partially successful before punting to DAG-isel.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@grid_points = external global [3 x i32], align 4
+
+; Function Attrs: nounwind
+define fastcc void @compute_rhs() #0 {
+entry:
+  br i1 undef, label %for.cond871.preheader.for.inc960_crit_edge, label %for.end1042, !dbg !439
+
+for.cond871.preheader.for.inc960_crit_edge:       ; preds = %for.cond871.preheader.for.inc960_crit_edge, %entry
+  br i1 false, label %for.cond871.preheader.for.inc960_crit_edge, label %for.cond964.preheader, !dbg !439
+
+for.cond964.preheader:                            ; preds = %for.cond871.preheader.for.inc960_crit_edge
+  br i1 undef, label %for.cond968.preheader, label %for.end1042, !dbg !441
+
+for.cond968.preheader:                            ; preds = %for.cond968.preheader, %for.cond964.preheader
+  br i1 false, label %for.cond968.preheader, label %for.end1042, !dbg !441
+
+for.end1042:                                      ; preds = %for.cond968.preheader, %for.cond964.preheader, %entry
+  %0 = phi i32 [ undef, %for.cond964.preheader ], [ undef, %for.cond968.preheader ], [ undef, %entry ]
+  %1 = load i32* getelementptr inbounds ([3 x i32]* @grid_points, i64 0, i64 0), align 4, !dbg !443, !tbaa !444
+  tail call void @llvm.dbg.value(metadata !447, i64 0, metadata !119), !dbg !448
+  %sub10454270 = add nsw i32 %0, -1, !dbg !448
+  %cmp10464271 = icmp sgt i32 %sub10454270, 1, !dbg !448
+  %sub11134263 = add nsw i32 %1, -1, !dbg !450
+  %cmp11144264 = icmp sgt i32 %sub11134263, 1, !dbg !450
+  br i1 %cmp11144264, label %for.cond1116.preheader, label %for.cond1816.preheader.for.inc1898_crit_edge, !dbg !450
+
+for.cond1116.preheader:                           ; preds = %for.inc1658, %for.end1042
+  br i1 %cmp10464271, label %for.body1123, label %for.inc1658, !dbg !452
+
+for.body1123:                                     ; preds = %for.body1123, %for.cond1116.preheader
+  br label %for.body1123, !dbg !455
+
+for.inc1658:                                      ; preds = %for.cond1116.preheader
+  br i1 undef, label %for.cond1116.preheader, label %for.cond1816.preheader.for.inc1898_crit_edge, !dbg !450
+
+for.cond1816.preheader.for.inc1898_crit_edge:     ; preds = %for.cond1816.preheader.for.inc1898_crit_edge, %for.inc1658, %for.end1042
+  br label %for.cond1816.preheader.for.inc1898_crit_edge, !dbg !458
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!438, !464}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 190311)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !298, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"bt.c", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4, metadata !82, metadata !102, metadata !114, metadata !132, metadata !145, metadata !154, metadata !155, metadata !162, metadata !183, metadata !200, metadata !201, metadata !207, metadata !208, metadata !215, metadata !221, metadata !230, metadata !238, metadata !246, metadata !255, metadata !260, metadata !261, metadata !268, metadata !274, metadata !279, metadata !280, metadata !287, metadata !293}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 74, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !12, i32 74} ; [ DW_TAG_subprogram ] [line 74] [def] [main]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !8, metadata !9}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_unsigned_char]
+!12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16, metadata !17, metadata !18, metadata !19, metadata !21, metadata !22, metadata !23, metadata !25, metadata !26}
+!13 = metadata !{i32 786689, metadata !4, metadata !"argc", metadata !5, i32 16777290, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 74]
+!14 = metadata !{i32 786689, metadata !4, metadata !"argv", metadata !5, i32 33554506, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 74]
+!15 = metadata !{i32 786688, metadata !4, metadata !"niter", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [niter] [line 76]
+!16 = metadata !{i32 786688, metadata !4, metadata !"step", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [step] [line 76]
+!17 = metadata !{i32 786688, metadata !4, metadata !"n3", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n3] [line 76]
+!18 = metadata !{i32 786688, metadata !4, metadata !"nthreads", metadata !5, i32 77, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [nthreads] [line 77]
+!19 = metadata !{i32 786688, metadata !4, metadata !"navg", metadata !5, i32 78, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [navg] [line 78]
+!20 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!21 = metadata !{i32 786688, metadata !4, metadata !"mflops", metadata !5, i32 78, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [mflops] [line 78]
+!22 = metadata !{i32 786688, metadata !4, metadata !"tmax", metadata !5, i32 80, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [tmax] [line 80]
+!23 = metadata !{i32 786688, metadata !4, metadata !"verified", metadata !5, i32 81, metadata !24, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [verified] [line 81]
+!24 = metadata !{i32 786454, metadata !1, null, metadata !"boolean", i32 12, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ] [boolean] [line 12, size 0, align 0, offset 0] [from int]
+!25 = metadata !{i32 786688, metadata !4, metadata !"class", metadata !5, i32 82, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [class] [line 82]
+!26 = metadata !{i32 786688, metadata !4, metadata !"fp", metadata !5, i32 83, metadata !27, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [fp] [line 83]
+!27 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from FILE]
+!28 = metadata !{i32 786454, metadata !1, null, metadata !"FILE", i32 49, i64 0, i64 0, i64 0, i32 0, metadata !29} ; [ DW_TAG_typedef ] [FILE] [line 49, size 0, align 0, offset 0] [from _IO_FILE]
+!29 = metadata !{i32 786451, metadata !30, null, metadata !"_IO_FILE", i32 271, i64 1728, i64 64, i32 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [_IO_FILE] [line 271, size 1728, align 64, offset 0] [def] [from ]
+!30 = metadata !{metadata !"/usr/include/libio.h", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
+!31 = metadata !{metadata !32, metadata !33, metadata !34, metadata !35, metadata !36, metadata !37, metadata !38, metadata !39, metadata !40, metadata !41, metadata !42, metadata !43, metadata !44, metadata !52, metadata !53, metadata !54, metadata !55, metadata !58, metadata !60, metadata !62, metadata !66, metadata !68, metadata !70, metadata !71, metadata !72, metadata !73, metadata !74, metadata !77, metadata !78}
+!32 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_flags", i32 272, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [_flags] [line 272, size 32, align 32, offset 0] [from int]
+!33 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_ptr", i32 277, i64 64, i64 64, i64 64, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_ptr] [line 277, size 64, align 64, offset 64] [from ]
+!34 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_end", i32 278, i64 64, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_end] [line 278, size 64, align 64, offset 128] [from ]
+!35 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_base", i32 279, i64 64, i64 64, i64 192, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_base] [line 279, size 64, align 64, offset 192] [from ]
+!36 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_base", i32 280, i64 64, i64 64, i64 256, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_base] [line 280, size 64, align 64, offset 256] [from ]
+!37 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_ptr", i32 281, i64 64, i64 64, i64 320, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_ptr] [line 281, size 64, align 64, offset 320] [from ]
+!38 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_end", i32 282, i64 64, i64 64, i64 384, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_end] [line 282, size 64, align 64, offset 384] [from ]
+!39 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_buf_base", i32 283, i64 64, i64 64, i64 448, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_buf_base] [line 283, size 64, align 64, offset 448] [from ]
+!40 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_buf_end", i32 284, i64 64, i64 64, i64 512, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_buf_end] [line 284, size 64, align 64, offset 512] [from ]
+!41 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_save_base", i32 286, i64 64, i64 64, i64 576, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_save_base] [line 286, size 64, align 64, offset 576] [from ]
+!42 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_backup_base", i32 287, i64 64, i64 64, i64 640, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_backup_base] [line 287, size 64, align 64, offset 640] [from ]
+!43 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_save_end", i32 288, i64 64, i64 64, i64 704, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_save_end] [line 288, size 64, align 64, offset 704] [from ]
+!44 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_markers", i32 290, i64 64, i64 64, i64 768, i32 0, metadata !45} ; [ DW_TAG_member ] [_markers] [line 290, size 64, align 64, offset 768] [from ]
+!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !46} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_marker]
+!46 = metadata !{i32 786451, metadata !30, null, metadata !"_IO_marker", i32 186, i64 192, i64 64, i32 0, i32 0, null, metadata !47, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [_IO_marker] [line 186, size 192, align 64, offset 0] [def] [from ]
+!47 = metadata !{metadata !48, metadata !49, metadata !51}
+!48 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_next", i32 187, i64 64, i64 64, i64 0, i32 0, metadata !45} ; [ DW_TAG_member ] [_next] [line 187, size 64, align 64, offset 0] [from ]
+!49 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_sbuf", i32 188, i64 64, i64 64, i64 64, i32 0, metadata !50} ; [ DW_TAG_member ] [_sbuf] [line 188, size 64, align 64, offset 64] [from ]
+!50 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_FILE]
+!51 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_pos", i32 192, i64 32, i64 32, i64 128, i32 0, metadata !8} ; [ DW_TAG_member ] [_pos] [line 192, size 32, align 32, offset 128] [from int]
+!52 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_chain", i32 292, i64 64, i64 64, i64 832, i32 0, metadata !50} ; [ DW_TAG_member ] [_chain] [line 292, size 64, align 64, offset 832] [from ]
+!53 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_fileno", i32 294, i64 32, i64 32, i64 896, i32 0, metadata !8} ; [ DW_TAG_member ] [_fileno] [line 294, size 32, align 32, offset 896] [from int]
+!54 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_flags2", i32 298, i64 32, i64 32, i64 928, i32 0, metadata !8} ; [ DW_TAG_member ] [_flags2] [line 298, size 32, align 32, offset 928] [from int]
+!55 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_old_offset", i32 300, i64 64, i64 64, i64 960, i32 0, metadata !56} ; [ DW_TAG_member ] [_old_offset] [line 300, size 64, align 64, offset 960] [from __off_t]
+!56 = metadata !{i32 786454, metadata !30, null, metadata !"__off_t", i32 141, i64 0, i64 0, i64 0, i32 0, metadata !57} ; [ DW_TAG_typedef ] [__off_t] [line 141, size 0, align 0, offset 0] [from long int]
+!57 = metadata !{i32 786468, null, null, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!58 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_cur_column", i32 304, i64 16, i64 16, i64 1024, i32 0, metadata !59} ; [ DW_TAG_member ] [_cur_column] [line 304, size 16, align 16, offset 1024] [from unsigned short]
+!59 = metadata !{i32 786468, null, null, metadata !"unsigned short", i32 0, i64 16, i64 16, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
+!60 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_vtable_offset", i32 305, i64 8, i64 8, i64 1040, i32 0, metadata !61} ; [ DW_TAG_member ] [_vtable_offset] [line 305, size 8, align 8, offset 1040] [from signed char]
+!61 = metadata !{i32 786468, null, null, metadata !"signed char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!62 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_shortbuf", i32 306, i64 8, i64 8, i64 1048, i32 0, metadata !63} ; [ DW_TAG_member ] [_shortbuf] [line 306, size 8, align 8, offset 1048] [from ]
+!63 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 8, i64 8, i32 0, i32 0, metadata !11, metadata !64, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!64 = metadata !{metadata !65}
+!65 = metadata !{i32 786465, i64 0, i64 1}        ; [ DW_TAG_subrange_type ] [0, 0]
+!66 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_lock", i32 310, i64 64, i64 64, i64 1088, i32 0, metadata !67} ; [ DW_TAG_member ] [_lock] [line 310, size 64, align 64, offset 1088] [from ]
+!67 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!68 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_offset", i32 319, i64 64, i64 64, i64 1152, i32 0, metadata !69} ; [ DW_TAG_member ] [_offset] [line 319, size 64, align 64, offset 1152] [from __off64_t]
+!69 = metadata !{i32 786454, metadata !30, null, metadata !"__off64_t", i32 142, i64 0, i64 0, i64 0, i32 0, metadata !57} ; [ DW_TAG_typedef ] [__off64_t] [line 142, size 0, align 0, offset 0] [from long int]
+!70 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad1", i32 328, i64 64, i64 64, i64 1216, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad1] [line 328, size 64, align 64, offset 1216] [from ]
+!71 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad2", i32 329, i64 64, i64 64, i64 1280, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad2] [line 329, size 64, align 64, offset 1280] [from ]
+!72 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad3", i32 330, i64 64, i64 64, i64 1344, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad3] [line 330, size 64, align 64, offset 1344] [from ]
+!73 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad4", i32 331, i64 64, i64 64, i64 1408, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad4] [line 331, size 64, align 64, offset 1408] [from ]
+!74 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad5", i32 332, i64 64, i64 64, i64 1472, i32 0, metadata !75} ; [ DW_TAG_member ] [__pad5] [line 332, size 64, align 64, offset 1472] [from size_t]
+!75 = metadata !{i32 786454, metadata !30, null, metadata !"size_t", i32 42, i64 0, i64 0, i64 0, i32 0, metadata !76} ; [ DW_TAG_typedef ] [size_t] [line 42, size 0, align 0, offset 0] [from long unsigned int]
+!76 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!77 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_mode", i32 334, i64 32, i64 32, i64 1536, i32 0, metadata !8} ; [ DW_TAG_member ] [_mode] [line 334, size 32, align 32, offset 1536] [from int]
+!78 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_unused2", i32 336, i64 160, i64 8, i64 1568, i32 0, metadata !79} ; [ DW_TAG_member ] [_unused2] [line 336, size 160, align 8, offset 1568] [from ]
+!79 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !11, metadata !80, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!80 = metadata !{metadata !81}
+!81 = metadata !{i32 786465, i64 0, i64 20}       ; [ DW_TAG_subrange_type ] [0, 19]
+!82 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"verify", metadata !"verify", metadata !"", i32 2388, metadata !83, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !86, i32 2388} ; [ DW_TAG_subprogram ] [line 2388] [local] [def] [verify]
+!83 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !84, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!84 = metadata !{null, metadata !8, metadata !10, metadata !85}
+!85 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from boolean]
+!86 = metadata !{metadata !87, metadata !88, metadata !89, metadata !90, metadata !94, metadata !95, metadata !96, metadata !97, metadata !98, metadata !99, metadata !100, metadata !101}
+!87 = metadata !{i32 786689, metadata !82, metadata !"no_time_steps", metadata !5, i32 16779604, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [no_time_steps] [line 2388]
+!88 = metadata !{i32 786689, metadata !82, metadata !"class", metadata !5, i32 33556820, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [class] [line 2388]
+!89 = metadata !{i32 786689, metadata !82, metadata !"verified", metadata !5, i32 50334036, metadata !85, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [verified] [line 2388]
+!90 = metadata !{i32 786688, metadata !82, metadata !"xcrref", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcrref] [line 2397]
+!91 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 320, i64 64, i32 0, i32 0, metadata !20, metadata !92, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 64, offset 0] [from double]
+!92 = metadata !{metadata !93}
+!93 = metadata !{i32 786465, i64 0, i64 5}        ; [ DW_TAG_subrange_type ] [0, 4]
+!94 = metadata !{i32 786688, metadata !82, metadata !"xceref", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xceref] [line 2397]
+!95 = metadata !{i32 786688, metadata !82, metadata !"xcrdif", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcrdif] [line 2397]
+!96 = metadata !{i32 786688, metadata !82, metadata !"xcedif", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcedif] [line 2397]
+!97 = metadata !{i32 786688, metadata !82, metadata !"epsilon", metadata !5, i32 2398, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [epsilon] [line 2398]
+!98 = metadata !{i32 786688, metadata !82, metadata !"xce", metadata !5, i32 2398, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xce] [line 2398]
+!99 = metadata !{i32 786688, metadata !82, metadata !"xcr", metadata !5, i32 2398, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcr] [line 2398]
+!100 = metadata !{i32 786688, metadata !82, metadata !"dtref", metadata !5, i32 2398, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtref] [line 2398]
+!101 = metadata !{i32 786688, metadata !82, metadata !"m", metadata !5, i32 2399, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 2399]
+!102 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"rhs_norm", metadata !"rhs_norm", metadata !"", i32 266, metadata !103, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !106, i32 266} ; [ DW_TAG_subprogram ] [line 266] [local] [def] [rhs_norm]
+!103 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !104, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!104 = metadata !{null, metadata !105}
+!105 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
+!106 = metadata !{metadata !107, metadata !108, metadata !109, metadata !110, metadata !111, metadata !112, metadata !113}
+!107 = metadata !{i32 786689, metadata !102, metadata !"rms", metadata !5, i32 16777482, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [rms] [line 266]
+!108 = metadata !{i32 786688, metadata !102, metadata !"i", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 271]
+!109 = metadata !{i32 786688, metadata !102, metadata !"j", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 271]
+!110 = metadata !{i32 786688, metadata !102, metadata !"k", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 271]
+!111 = metadata !{i32 786688, metadata !102, metadata !"d", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 271]
+!112 = metadata !{i32 786688, metadata !102, metadata !"m", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 271]
+!113 = metadata !{i32 786688, metadata !102, metadata !"add", metadata !5, i32 272, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [add] [line 272]
+!114 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"compute_rhs", metadata !"compute_rhs", metadata !"", i32 1767, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @compute_rhs, null, null, metadata !117, i32 1767} ; [ DW_TAG_subprogram ] [line 1767] [local] [def] [compute_rhs]
+!115 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !116, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!116 = metadata !{null}
+!117 = metadata !{metadata !118, metadata !119, metadata !120, metadata !121, metadata !122, metadata !123, metadata !124, metadata !125, metadata !126, metadata !127, metadata !128, metadata !129, metadata !130, metadata !131}
+!118 = metadata !{i32 786688, metadata !114, metadata !"i", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1769]
+!119 = metadata !{i32 786688, metadata !114, metadata !"j", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1769]
+!120 = metadata !{i32 786688, metadata !114, metadata !"k", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1769]
+!121 = metadata !{i32 786688, metadata !114, metadata !"m", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 1769]
+!122 = metadata !{i32 786688, metadata !114, metadata !"rho_inv", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [rho_inv] [line 1770]
+!123 = metadata !{i32 786688, metadata !114, metadata !"uijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [uijk] [line 1770]
+!124 = metadata !{i32 786688, metadata !114, metadata !"up1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [up1] [line 1770]
+!125 = metadata !{i32 786688, metadata !114, metadata !"um1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [um1] [line 1770]
+!126 = metadata !{i32 786688, metadata !114, metadata !"vijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vijk] [line 1770]
+!127 = metadata !{i32 786688, metadata !114, metadata !"vp1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vp1] [line 1770]
+!128 = metadata !{i32 786688, metadata !114, metadata !"vm1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vm1] [line 1770]
+!129 = metadata !{i32 786688, metadata !114, metadata !"wijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wijk] [line 1770]
+!130 = metadata !{i32 786688, metadata !114, metadata !"wp1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wp1] [line 1770]
+!131 = metadata !{i32 786688, metadata !114, metadata !"wm1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wm1] [line 1770]
+!132 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"error_norm", metadata !"error_norm", metadata !"", i32 225, metadata !103, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !133, i32 225} ; [ DW_TAG_subprogram ] [line 225] [local] [def] [error_norm]
+!133 = metadata !{metadata !134, metadata !135, metadata !136, metadata !137, metadata !138, metadata !139, metadata !140, metadata !141, metadata !142, metadata !143, metadata !144}
+!134 = metadata !{i32 786689, metadata !132, metadata !"rms", metadata !5, i32 16777441, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [rms] [line 225]
+!135 = metadata !{i32 786688, metadata !132, metadata !"i", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 232]
+!136 = metadata !{i32 786688, metadata !132, metadata !"j", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 232]
+!137 = metadata !{i32 786688, metadata !132, metadata !"k", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 232]
+!138 = metadata !{i32 786688, metadata !132, metadata !"m", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 232]
+!139 = metadata !{i32 786688, metadata !132, metadata !"d", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 232]
+!140 = metadata !{i32 786688, metadata !132, metadata !"xi", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 233]
+!141 = metadata !{i32 786688, metadata !132, metadata !"eta", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 233]
+!142 = metadata !{i32 786688, metadata !132, metadata !"zeta", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 233]
+!143 = metadata !{i32 786688, metadata !132, metadata !"u_exact", metadata !5, i32 233, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [u_exact] [line 233]
+!144 = metadata !{i32 786688, metadata !132, metadata !"add", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [add] [line 233]
+!145 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"exact_solution", metadata !"exact_solution", metadata !"", i32 643, metadata !146, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !148, i32 644} ; [ DW_TAG_subprogram ] [line 643] [local] [def] [scope 644] [exact_solution]
+!146 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !147, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!147 = metadata !{null, metadata !20, metadata !20, metadata !20, metadata !105}
+!148 = metadata !{metadata !149, metadata !150, metadata !151, metadata !152, metadata !153}
+!149 = metadata !{i32 786689, metadata !145, metadata !"xi", metadata !5, i32 16777859, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [xi] [line 643]
+!150 = metadata !{i32 786689, metadata !145, metadata !"eta", metadata !5, i32 33555075, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [eta] [line 643]
+!151 = metadata !{i32 786689, metadata !145, metadata !"zeta", metadata !5, i32 50332291, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [zeta] [line 643]
+!152 = metadata !{i32 786689, metadata !145, metadata !"dtemp", metadata !5, i32 67109508, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [dtemp] [line 644]
+!153 = metadata !{i32 786688, metadata !145, metadata !"m", metadata !5, i32 653, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 653]
+!154 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"set_constants", metadata !"set_constants", metadata !"", i32 2191, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 2191} ; [ DW_TAG_subprogram ] [line 2191] [local] [def] [set_constants]
+!155 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsinit", metadata !"lhsinit", metadata !"", i32 855, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !156, i32 855} ; [ DW_TAG_subprogram ] [line 855] [local] [def] [lhsinit]
+!156 = metadata !{metadata !157, metadata !158, metadata !159, metadata !160, metadata !161}
+!157 = metadata !{i32 786688, metadata !155, metadata !"i", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 857]
+!158 = metadata !{i32 786688, metadata !155, metadata !"j", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 857]
+!159 = metadata !{i32 786688, metadata !155, metadata !"k", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 857]
+!160 = metadata !{i32 786688, metadata !155, metadata !"m", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 857]
+!161 = metadata !{i32 786688, metadata !155, metadata !"n", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 857]
+!162 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"initialize", metadata !"initialize", metadata !"", i32 669, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !163, i32 669} ; [ DW_TAG_subprogram ] [line 669] [local] [def] [initialize]
+!163 = metadata !{metadata !164, metadata !165, metadata !166, metadata !167, metadata !168, metadata !169, metadata !170, metadata !171, metadata !172, metadata !173, metadata !174, metadata !179, metadata !180, metadata !181, metadata !182}
+!164 = metadata !{i32 786688, metadata !162, metadata !"i", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 679]
+!165 = metadata !{i32 786688, metadata !162, metadata !"j", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 679]
+!166 = metadata !{i32 786688, metadata !162, metadata !"k", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 679]
+!167 = metadata !{i32 786688, metadata !162, metadata !"m", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 679]
+!168 = metadata !{i32 786688, metadata !162, metadata !"ix", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ix] [line 679]
+!169 = metadata !{i32 786688, metadata !162, metadata !"iy", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [iy] [line 679]
+!170 = metadata !{i32 786688, metadata !162, metadata !"iz", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [iz] [line 679]
+!171 = metadata !{i32 786688, metadata !162, metadata !"xi", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 680]
+!172 = metadata !{i32 786688, metadata !162, metadata !"eta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 680]
+!173 = metadata !{i32 786688, metadata !162, metadata !"zeta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 680]
+!174 = metadata !{i32 786688, metadata !162, metadata !"Pface", metadata !5, i32 680, metadata !175, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pface] [line 680]
+!175 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1920, i64 64, i32 0, i32 0, metadata !20, metadata !176, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1920, align 64, offset 0] [from double]
+!176 = metadata !{metadata !177, metadata !178, metadata !93}
+!177 = metadata !{i32 786465, i64 0, i64 2}       ; [ DW_TAG_subrange_type ] [0, 1]
+!178 = metadata !{i32 786465, i64 0, i64 3}       ; [ DW_TAG_subrange_type ] [0, 2]
+!179 = metadata !{i32 786688, metadata !162, metadata !"Pxi", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pxi] [line 680]
+!180 = metadata !{i32 786688, metadata !162, metadata !"Peta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Peta] [line 680]
+!181 = metadata !{i32 786688, metadata !162, metadata !"Pzeta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pzeta] [line 680]
+!182 = metadata !{i32 786688, metadata !162, metadata !"temp", metadata !5, i32 680, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [temp] [line 680]
+!183 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"exact_rhs", metadata !"exact_rhs", metadata !"", i32 301, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !184, i32 301} ; [ DW_TAG_subprogram ] [line 301] [local] [def] [exact_rhs]
+!184 = metadata !{metadata !185, metadata !186, metadata !187, metadata !188, metadata !189, metadata !190, metadata !191, metadata !192, metadata !193, metadata !194, metadata !195, metadata !196, metadata !197, metadata !198, metadata !199}
+!185 = metadata !{i32 786688, metadata !183, metadata !"dtemp", metadata !5, i32 310, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtemp] [line 310]
+!186 = metadata !{i32 786688, metadata !183, metadata !"xi", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 310]
+!187 = metadata !{i32 786688, metadata !183, metadata !"eta", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 310]
+!188 = metadata !{i32 786688, metadata !183, metadata !"zeta", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 310]
+!189 = metadata !{i32 786688, metadata !183, metadata !"dtpp", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtpp] [line 310]
+!190 = metadata !{i32 786688, metadata !183, metadata !"m", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 311]
+!191 = metadata !{i32 786688, metadata !183, metadata !"i", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 311]
+!192 = metadata !{i32 786688, metadata !183, metadata !"j", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 311]
+!193 = metadata !{i32 786688, metadata !183, metadata !"k", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 311]
+!194 = metadata !{i32 786688, metadata !183, metadata !"ip1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ip1] [line 311]
+!195 = metadata !{i32 786688, metadata !183, metadata !"im1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [im1] [line 311]
+!196 = metadata !{i32 786688, metadata !183, metadata !"jp1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jp1] [line 311]
+!197 = metadata !{i32 786688, metadata !183, metadata !"jm1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jm1] [line 311]
+!198 = metadata !{i32 786688, metadata !183, metadata !"km1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [km1] [line 311]
+!199 = metadata !{i32 786688, metadata !183, metadata !"kp1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [kp1] [line 311]
+!200 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"adi", metadata !"adi", metadata !"", i32 210, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 210} ; [ DW_TAG_subprogram ] [line 210] [local] [def] [adi]
+!201 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"add", metadata !"add", metadata !"", i32 187, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !202, i32 187} ; [ DW_TAG_subprogram ] [line 187] [local] [def] [add]
+!202 = metadata !{metadata !203, metadata !204, metadata !205, metadata !206}
+!203 = metadata !{i32 786688, metadata !201, metadata !"i", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 193]
+!204 = metadata !{i32 786688, metadata !201, metadata !"j", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 193]
+!205 = metadata !{i32 786688, metadata !201, metadata !"k", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 193]
+!206 = metadata !{i32 786688, metadata !201, metadata !"m", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 193]
+!207 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_solve", metadata !"z_solve", metadata !"", i32 3457, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 3457} ; [ DW_TAG_subprogram ] [line 3457] [local] [def] [z_solve]
+!208 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_backsubstitute", metadata !"z_backsubstitute", metadata !"", i32 3480, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !209, i32 3480} ; [ DW_TAG_subprogram ] [line 3480] [local] [def] [z_backsubstitute]
+!209 = metadata !{metadata !210, metadata !211, metadata !212, metadata !213, metadata !214}
+!210 = metadata !{i32 786688, metadata !208, metadata !"i", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3492]
+!211 = metadata !{i32 786688, metadata !208, metadata !"j", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3492]
+!212 = metadata !{i32 786688, metadata !208, metadata !"k", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3492]
+!213 = metadata !{i32 786688, metadata !208, metadata !"m", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 3492]
+!214 = metadata !{i32 786688, metadata !208, metadata !"n", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 3492]
+!215 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_solve_cell", metadata !"z_solve_cell", metadata !"", i32 3512, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !216, i32 3512} ; [ DW_TAG_subprogram ] [line 3512] [local] [def] [z_solve_cell]
+!216 = metadata !{metadata !217, metadata !218, metadata !219, metadata !220}
+!217 = metadata !{i32 786688, metadata !215, metadata !"i", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3527]
+!218 = metadata !{i32 786688, metadata !215, metadata !"j", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3527]
+!219 = metadata !{i32 786688, metadata !215, metadata !"k", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3527]
+!220 = metadata !{i32 786688, metadata !215, metadata !"ksize", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ksize] [line 3527]
+!221 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"binvrhs", metadata !"binvrhs", metadata !"", i32 3154, metadata !222, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !225, i32 3154} ; [ DW_TAG_subprogram ] [line 3154] [local] [def] [binvrhs]
+!222 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !223, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!223 = metadata !{null, metadata !224, metadata !105}
+!224 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !91} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!225 = metadata !{metadata !226, metadata !227, metadata !228, metadata !229}
+!226 = metadata !{i32 786689, metadata !221, metadata !"lhs", metadata !5, i32 16780370, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [lhs] [line 3154]
+!227 = metadata !{i32 786689, metadata !221, metadata !"r", metadata !5, i32 33557586, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 3154]
+!228 = metadata !{i32 786688, metadata !221, metadata !"pivot", metadata !5, i32 3159, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [pivot] [line 3159]
+!229 = metadata !{i32 786688, metadata !221, metadata !"coeff", metadata !5, i32 3159, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [coeff] [line 3159]
+!230 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"matmul_sub", metadata !"matmul_sub", metadata !"", i32 2841, metadata !231, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !233, i32 2842} ; [ DW_TAG_subprogram ] [line 2841] [local] [def] [scope 2842] [matmul_sub]
+!231 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !232, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!232 = metadata !{null, metadata !224, metadata !224, metadata !224}
+!233 = metadata !{metadata !234, metadata !235, metadata !236, metadata !237}
+!234 = metadata !{i32 786689, metadata !230, metadata !"ablock", metadata !5, i32 16780057, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ablock] [line 2841]
+!235 = metadata !{i32 786689, metadata !230, metadata !"bblock", metadata !5, i32 33557273, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bblock] [line 2841]
+!236 = metadata !{i32 786689, metadata !230, metadata !"cblock", metadata !5, i32 50334490, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [cblock] [line 2842]
+!237 = metadata !{i32 786688, metadata !230, metadata !"j", metadata !5, i32 2851, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2851]
+!238 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"matvec_sub", metadata !"matvec_sub", metadata !"", i32 2814, metadata !239, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !241, i32 2814} ; [ DW_TAG_subprogram ] [line 2814] [local] [def] [matvec_sub]
+!239 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !240, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!240 = metadata !{null, metadata !224, metadata !105, metadata !105}
+!241 = metadata !{metadata !242, metadata !243, metadata !244, metadata !245}
+!242 = metadata !{i32 786689, metadata !238, metadata !"ablock", metadata !5, i32 16780030, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ablock] [line 2814]
+!243 = metadata !{i32 786689, metadata !238, metadata !"avec", metadata !5, i32 33557246, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [avec] [line 2814]
+!244 = metadata !{i32 786689, metadata !238, metadata !"bvec", metadata !5, i32 50334462, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bvec] [line 2814]
+!245 = metadata !{i32 786688, metadata !238, metadata !"i", metadata !5, i32 2823, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2823]
+!246 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"binvcrhs", metadata !"binvcrhs", metadata !"", i32 2885, metadata !247, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !249, i32 2885} ; [ DW_TAG_subprogram ] [line 2885] [local] [def] [binvcrhs]
+!247 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !248, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!248 = metadata !{null, metadata !224, metadata !224, metadata !105}
+!249 = metadata !{metadata !250, metadata !251, metadata !252, metadata !253, metadata !254}
+!250 = metadata !{i32 786689, metadata !246, metadata !"lhs", metadata !5, i32 16780101, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [lhs] [line 2885]
+!251 = metadata !{i32 786689, metadata !246, metadata !"c", metadata !5, i32 33557317, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 2885]
+!252 = metadata !{i32 786689, metadata !246, metadata !"r", metadata !5, i32 50334533, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 2885]
+!253 = metadata !{i32 786688, metadata !246, metadata !"pivot", metadata !5, i32 2890, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [pivot] [line 2890]
+!254 = metadata !{i32 786688, metadata !246, metadata !"coeff", metadata !5, i32 2890, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [coeff] [line 2890]
+!255 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsz", metadata !"lhsz", metadata !"", i32 1475, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !256, i32 1475} ; [ DW_TAG_subprogram ] [line 1475] [local] [def] [lhsz]
+!256 = metadata !{metadata !257, metadata !258, metadata !259}
+!257 = metadata !{i32 786688, metadata !255, metadata !"i", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1484]
+!258 = metadata !{i32 786688, metadata !255, metadata !"j", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1484]
+!259 = metadata !{i32 786688, metadata !255, metadata !"k", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1484]
+!260 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_solve", metadata !"y_solve", metadata !"", i32 3299, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 3299} ; [ DW_TAG_subprogram ] [line 3299] [local] [def] [y_solve]
+!261 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_backsubstitute", metadata !"y_backsubstitute", metadata !"", i32 3323, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !262, i32 3323} ; [ DW_TAG_subprogram ] [line 3323] [local] [def] [y_backsubstitute]
+!262 = metadata !{metadata !263, metadata !264, metadata !265, metadata !266, metadata !267}
+!263 = metadata !{i32 786688, metadata !261, metadata !"i", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3335]
+!264 = metadata !{i32 786688, metadata !261, metadata !"j", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3335]
+!265 = metadata !{i32 786688, metadata !261, metadata !"k", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3335]
+!266 = metadata !{i32 786688, metadata !261, metadata !"m", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 3335]
+!267 = metadata !{i32 786688, metadata !261, metadata !"n", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 3335]
+!268 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_solve_cell", metadata !"y_solve_cell", metadata !"", i32 3355, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !269, i32 3355} ; [ DW_TAG_subprogram ] [line 3355] [local] [def] [y_solve_cell]
+!269 = metadata !{metadata !270, metadata !271, metadata !272, metadata !273}
+!270 = metadata !{i32 786688, metadata !268, metadata !"i", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3370]
+!271 = metadata !{i32 786688, metadata !268, metadata !"j", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3370]
+!272 = metadata !{i32 786688, metadata !268, metadata !"k", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3370]
+!273 = metadata !{i32 786688, metadata !268, metadata !"jsize", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jsize] [line 3370]
+!274 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsy", metadata !"lhsy", metadata !"", i32 1181, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !275, i32 1181} ; [ DW_TAG_subprogram ] [line 1181] [local] [def] [lhsy]
+!275 = metadata !{metadata !276, metadata !277, metadata !278}
+!276 = metadata !{i32 786688, metadata !274, metadata !"i", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1190]
+!277 = metadata !{i32 786688, metadata !274, metadata !"j", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1190]
+!278 = metadata !{i32 786688, metadata !274, metadata !"k", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1190]
+!279 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_solve", metadata !"x_solve", metadata !"", i32 2658, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 2658} ; [ DW_TAG_subprogram ] [line 2658] [local] [def] [x_solve]
+!280 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_backsubstitute", metadata !"x_backsubstitute", metadata !"", i32 2684, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !281, i32 2684} ; [ DW_TAG_subprogram ] [line 2684] [local] [def] [x_backsubstitute]
+!281 = metadata !{metadata !282, metadata !283, metadata !284, metadata !285, metadata !286}
+!282 = metadata !{i32 786688, metadata !280, metadata !"i", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2696]
+!283 = metadata !{i32 786688, metadata !280, metadata !"j", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2696]
+!284 = metadata !{i32 786688, metadata !280, metadata !"k", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 2696]
+!285 = metadata !{i32 786688, metadata !280, metadata !"m", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 2696]
+!286 = metadata !{i32 786688, metadata !280, metadata !"n", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 2696]
+!287 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_solve_cell", metadata !"x_solve_cell", metadata !"", i32 2716, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !288, i32 2716} ; [ DW_TAG_subprogram ] [line 2716] [local] [def] [x_solve_cell]
+!288 = metadata !{metadata !289, metadata !290, metadata !291, metadata !292}
+!289 = metadata !{i32 786688, metadata !287, metadata !"i", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2728]
+!290 = metadata !{i32 786688, metadata !287, metadata !"j", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2728]
+!291 = metadata !{i32 786688, metadata !287, metadata !"k", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 2728]
+!292 = metadata !{i32 786688, metadata !287, metadata !"isize", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [isize] [line 2728]
+!293 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsx", metadata !"lhsx", metadata !"", i32 898, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !294, i32 898} ; [ DW_TAG_subprogram ] [line 898] [local] [def] [lhsx]
+!294 = metadata !{metadata !295, metadata !296, metadata !297}
+!295 = metadata !{i32 786688, metadata !293, metadata !"i", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 907]
+!296 = metadata !{i32 786688, metadata !293, metadata !"j", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 907]
+!297 = metadata !{i32 786688, metadata !293, metadata !"k", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 907]
+!298 = metadata !{metadata !299, metadata !304, metadata !305, metadata !309, metadata !310, metadata !311, metadata !312, metadata !313, metadata !314, metadata !315, metadata !316, metadata !317, metadata !318, metadata !319, metadata !320, metadata !321, metadata !322, metadata !323, metadata !324, metadata !325, metadata !326, metadata !327, metadata !328, metadata !329, metadata !330, metadata !331, metadata !332, metadata !333, metadata !334, metadata !335, metadata !336, metadata !337, metadata !338, metadata !339, metadata !340, metadata !341, metadata !342, metadata !343, metadata !347, metadata !350, metadata !351, metadata !352, metadata !353, metadata !354, metadata !355, metadata !356, metadata !360, metadata !361, metadata !362, metadata !363, metadata !364, metadata !365, metadata !366, metadata !367, metadata !368, metadata !369, metadata !370, metadata !371, metadata !372, metadata !373, metadata !374, metadata !375, metadata !376, metadata !377, metadata !378, metadata !379, metadata !380, metadata !381, metadata !382, metadata !383, metadata !384, metadata !385, metadata !386, metadata !387, metadata !388, metadata !389, metadata !390, metadata !391, metadata !392, metadata !393, metadata !394, metadata !395, metadata !396, metadata !397, metadata !398, metadata !399, metadata !400, metadata !401, metadata !402, metadata !403, metadata !404, metadata !405, metadata !406, metadata !407, metadata !408, metadata !409, metadata !410, metadata !411, metadata !412, metadata !413, metadata !414, metadata !415, metadata !416, metadata !417, metadata !418, metadata !419, metadata !422, metadata !426, metadata !427, metadata !430, metadata !431, metadata !434, metadata !435, metadata !436, metadata !437}
+!299 = metadata !{i32 786484, i32 0, null, metadata !"grid_points", metadata !"grid_points", metadata !"", metadata !300, i32 28, metadata !302, i32 1, i32 1, [3 x i32]* @grid_points, null} ; [ DW_TAG_variable ] [grid_points] [line 28] [local] [def]
+!300 = metadata !{i32 786473, metadata !301}      ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/./header.h]
+!301 = metadata !{metadata !"./header.h", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
+!302 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 96, i64 32, i32 0, i32 0, metadata !8, metadata !303, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 96, align 32, offset 0] [from int]
+!303 = metadata !{metadata !178}
+!304 = metadata !{i32 786484, i32 0, null, metadata !"dt", metadata !"dt", metadata !"", metadata !300, i32 35, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dt] [line 35] [local] [def]
+!305 = metadata !{i32 786484, i32 0, null, metadata !"rhs", metadata !"rhs", metadata !"", metadata !300, i32 68, metadata !306, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [rhs] [line 68] [local] [def]
+!306 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1385839040, i64 64, i32 0, i32 0, metadata !20, metadata !307, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1385839040, align 64, offset 0] [from double]
+!307 = metadata !{metadata !308, metadata !308, metadata !308, metadata !93}
+!308 = metadata !{i32 786465, i64 0, i64 163}     ; [ DW_TAG_subrange_type ] [0, 162]
+!309 = metadata !{i32 786484, i32 0, null, metadata !"zzcon5", metadata !"zzcon5", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon5] [line 42] [local] [def]
+!310 = metadata !{i32 786484, i32 0, null, metadata !"zzcon4", metadata !"zzcon4", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon4] [line 42] [local] [def]
+!311 = metadata !{i32 786484, i32 0, null, metadata !"zzcon3", metadata !"zzcon3", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon3] [line 42] [local] [def]
+!312 = metadata !{i32 786484, i32 0, null, metadata !"dz5tz1", metadata !"dz5tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz5tz1] [line 43] [local] [def]
+!313 = metadata !{i32 786484, i32 0, null, metadata !"dz4tz1", metadata !"dz4tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz4tz1] [line 43] [local] [def]
+!314 = metadata !{i32 786484, i32 0, null, metadata !"dz3tz1", metadata !"dz3tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz3tz1] [line 43] [local] [def]
+!315 = metadata !{i32 786484, i32 0, null, metadata !"zzcon2", metadata !"zzcon2", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon2] [line 42] [local] [def]
+!316 = metadata !{i32 786484, i32 0, null, metadata !"dz2tz1", metadata !"dz2tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz2tz1] [line 43] [local] [def]
+!317 = metadata !{i32 786484, i32 0, null, metadata !"tz2", metadata !"tz2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz2] [line 31] [local] [def]
+!318 = metadata !{i32 786484, i32 0, null, metadata !"dz1tz1", metadata !"dz1tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz1tz1] [line 43] [local] [def]
+!319 = metadata !{i32 786484, i32 0, null, metadata !"yycon5", metadata !"yycon5", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon5] [line 40] [local] [def]
+!320 = metadata !{i32 786484, i32 0, null, metadata !"yycon4", metadata !"yycon4", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon4] [line 40] [local] [def]
+!321 = metadata !{i32 786484, i32 0, null, metadata !"yycon3", metadata !"yycon3", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon3] [line 40] [local] [def]
+!322 = metadata !{i32 786484, i32 0, null, metadata !"dy5ty1", metadata !"dy5ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy5ty1] [line 41] [local] [def]
+!323 = metadata !{i32 786484, i32 0, null, metadata !"dy4ty1", metadata !"dy4ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy4ty1] [line 41] [local] [def]
+!324 = metadata !{i32 786484, i32 0, null, metadata !"dy3ty1", metadata !"dy3ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy3ty1] [line 41] [local] [def]
+!325 = metadata !{i32 786484, i32 0, null, metadata !"yycon2", metadata !"yycon2", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon2] [line 40] [local] [def]
+!326 = metadata !{i32 786484, i32 0, null, metadata !"dy2ty1", metadata !"dy2ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy2ty1] [line 41] [local] [def]
+!327 = metadata !{i32 786484, i32 0, null, metadata !"ty2", metadata !"ty2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty2] [line 31] [local] [def]
+!328 = metadata !{i32 786484, i32 0, null, metadata !"dy1ty1", metadata !"dy1ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy1ty1] [line 41] [local] [def]
+!329 = metadata !{i32 786484, i32 0, null, metadata !"dssp", metadata !"dssp", metadata !"", metadata !300, i32 35, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dssp] [line 35] [local] [def]
+!330 = metadata !{i32 786484, i32 0, null, metadata !"c1", metadata !"c1", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1] [line 45] [local] [def]
+!331 = metadata !{i32 786484, i32 0, null, metadata !"xxcon5", metadata !"xxcon5", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon5] [line 38] [local] [def]
+!332 = metadata !{i32 786484, i32 0, null, metadata !"xxcon4", metadata !"xxcon4", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon4] [line 38] [local] [def]
+!333 = metadata !{i32 786484, i32 0, null, metadata !"xxcon3", metadata !"xxcon3", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon3] [line 38] [local] [def]
+!334 = metadata !{i32 786484, i32 0, null, metadata !"dx5tx1", metadata !"dx5tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx5tx1] [line 39] [local] [def]
+!335 = metadata !{i32 786484, i32 0, null, metadata !"dx4tx1", metadata !"dx4tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx4tx1] [line 39] [local] [def]
+!336 = metadata !{i32 786484, i32 0, null, metadata !"dx3tx1", metadata !"dx3tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx3tx1] [line 39] [local] [def]
+!337 = metadata !{i32 786484, i32 0, null, metadata !"c2", metadata !"c2", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2] [line 45] [local] [def]
+!338 = metadata !{i32 786484, i32 0, null, metadata !"con43", metadata !"con43", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [con43] [line 48] [local] [def]
+!339 = metadata !{i32 786484, i32 0, null, metadata !"xxcon2", metadata !"xxcon2", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon2] [line 38] [local] [def]
+!340 = metadata !{i32 786484, i32 0, null, metadata !"dx2tx1", metadata !"dx2tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx2tx1] [line 39] [local] [def]
+!341 = metadata !{i32 786484, i32 0, null, metadata !"tx2", metadata !"tx2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx2] [line 31] [local] [def]
+!342 = metadata !{i32 786484, i32 0, null, metadata !"dx1tx1", metadata !"dx1tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx1tx1] [line 39] [local] [def]
+!343 = metadata !{i32 786484, i32 0, null, metadata !"forcing", metadata !"forcing", metadata !"", metadata !300, i32 66, metadata !344, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [forcing] [line 66] [local] [def]
+!344 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1663006848, i64 64, i32 0, i32 0, metadata !20, metadata !345, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1663006848, align 64, offset 0] [from double]
+!345 = metadata !{metadata !308, metadata !308, metadata !308, metadata !346}
+!346 = metadata !{i32 786465, i64 0, i64 6}       ; [ DW_TAG_subrange_type ] [0, 5]
+!347 = metadata !{i32 786484, i32 0, null, metadata !"qs", metadata !"qs", metadata !"", metadata !300, i32 63, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [qs] [line 63] [local] [def]
+!348 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 277167808, i64 64, i32 0, i32 0, metadata !20, metadata !349, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 277167808, align 64, offset 0] [from double]
+!349 = metadata !{metadata !308, metadata !308, metadata !308}
+!350 = metadata !{i32 786484, i32 0, null, metadata !"square", metadata !"square", metadata !"", metadata !300, i32 65, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [square] [line 65] [local] [def]
+!351 = metadata !{i32 786484, i32 0, null, metadata !"ws", metadata !"ws", metadata !"", metadata !300, i32 62, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ws] [line 62] [local] [def]
+!352 = metadata !{i32 786484, i32 0, null, metadata !"vs", metadata !"vs", metadata !"", metadata !300, i32 61, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [vs] [line 61] [local] [def]
+!353 = metadata !{i32 786484, i32 0, null, metadata !"us", metadata !"us", metadata !"", metadata !300, i32 60, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [us] [line 60] [local] [def]
+!354 = metadata !{i32 786484, i32 0, null, metadata !"rho_i", metadata !"rho_i", metadata !"", metadata !300, i32 64, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [rho_i] [line 64] [local] [def]
+!355 = metadata !{i32 786484, i32 0, null, metadata !"u", metadata !"u", metadata !"", metadata !300, i32 67, metadata !306, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [u] [line 67] [local] [def]
+!356 = metadata !{i32 786484, i32 0, null, metadata !"ce", metadata !"ce", metadata !"", metadata !300, i32 36, metadata !357, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ce] [line 36] [local] [def]
+!357 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 4160, i64 64, i32 0, i32 0, metadata !20, metadata !358, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 4160, align 64, offset 0] [from double]
+!358 = metadata !{metadata !93, metadata !359}
+!359 = metadata !{i32 786465, i64 0, i64 13}      ; [ DW_TAG_subrange_type ] [0, 12]
+!360 = metadata !{i32 786484, i32 0, null, metadata !"dnzm1", metadata !"dnzm1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnzm1] [line 44] [local] [def]
+!361 = metadata !{i32 786484, i32 0, null, metadata !"dnym1", metadata !"dnym1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnym1] [line 44] [local] [def]
+!362 = metadata !{i32 786484, i32 0, null, metadata !"dnxm1", metadata !"dnxm1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnxm1] [line 44] [local] [def]
+!363 = metadata !{i32 786484, i32 0, null, metadata !"zzcon1", metadata !"zzcon1", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon1] [line 42] [local] [def]
+!364 = metadata !{i32 786484, i32 0, null, metadata !"yycon1", metadata !"yycon1", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon1] [line 40] [local] [def]
+!365 = metadata !{i32 786484, i32 0, null, metadata !"xxcon1", metadata !"xxcon1", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon1] [line 38] [local] [def]
+!366 = metadata !{i32 786484, i32 0, null, metadata !"con16", metadata !"con16", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [con16] [line 48] [local] [def]
+!367 = metadata !{i32 786484, i32 0, null, metadata !"c2iv", metadata !"c2iv", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2iv] [line 48] [local] [def]
+!368 = metadata !{i32 786484, i32 0, null, metadata !"c3c4tz3", metadata !"c3c4tz3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4tz3] [line 48] [local] [def]
+!369 = metadata !{i32 786484, i32 0, null, metadata !"c3c4ty3", metadata !"c3c4ty3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4ty3] [line 48] [local] [def]
+!370 = metadata !{i32 786484, i32 0, null, metadata !"c3c4tx3", metadata !"c3c4tx3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4tx3] [line 48] [local] [def]
+!371 = metadata !{i32 786484, i32 0, null, metadata !"comz6", metadata !"comz6", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz6] [line 47] [local] [def]
+!372 = metadata !{i32 786484, i32 0, null, metadata !"comz5", metadata !"comz5", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz5] [line 47] [local] [def]
+!373 = metadata !{i32 786484, i32 0, null, metadata !"comz4", metadata !"comz4", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz4] [line 47] [local] [def]
+!374 = metadata !{i32 786484, i32 0, null, metadata !"comz1", metadata !"comz1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz1] [line 47] [local] [def]
+!375 = metadata !{i32 786484, i32 0, null, metadata !"dtdssp", metadata !"dtdssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtdssp] [line 45] [local] [def]
+!376 = metadata !{i32 786484, i32 0, null, metadata !"c2dttz1", metadata !"c2dttz1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dttz1] [line 47] [local] [def]
+!377 = metadata !{i32 786484, i32 0, null, metadata !"c2dtty1", metadata !"c2dtty1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dtty1] [line 47] [local] [def]
+!378 = metadata !{i32 786484, i32 0, null, metadata !"c2dttx1", metadata !"c2dttx1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dttx1] [line 47] [local] [def]
+!379 = metadata !{i32 786484, i32 0, null, metadata !"dttz2", metadata !"dttz2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttz2] [line 46] [local] [def]
+!380 = metadata !{i32 786484, i32 0, null, metadata !"dttz1", metadata !"dttz1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttz1] [line 46] [local] [def]
+!381 = metadata !{i32 786484, i32 0, null, metadata !"dtty2", metadata !"dtty2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtty2] [line 46] [local] [def]
+!382 = metadata !{i32 786484, i32 0, null, metadata !"dtty1", metadata !"dtty1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtty1] [line 46] [local] [def]
+!383 = metadata !{i32 786484, i32 0, null, metadata !"dttx2", metadata !"dttx2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttx2] [line 46] [local] [def]
+!384 = metadata !{i32 786484, i32 0, null, metadata !"dttx1", metadata !"dttx1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttx1] [line 46] [local] [def]
+!385 = metadata !{i32 786484, i32 0, null, metadata !"c5dssp", metadata !"c5dssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c5dssp] [line 45] [local] [def]
+!386 = metadata !{i32 786484, i32 0, null, metadata !"c4dssp", metadata !"c4dssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c4dssp] [line 45] [local] [def]
+!387 = metadata !{i32 786484, i32 0, null, metadata !"dzmax", metadata !"dzmax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dzmax] [line 37] [local] [def]
+!388 = metadata !{i32 786484, i32 0, null, metadata !"dymax", metadata !"dymax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dymax] [line 37] [local] [def]
+!389 = metadata !{i32 786484, i32 0, null, metadata !"dxmax", metadata !"dxmax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dxmax] [line 37] [local] [def]
+!390 = metadata !{i32 786484, i32 0, null, metadata !"dz5", metadata !"dz5", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz5] [line 34] [local] [def]
+!391 = metadata !{i32 786484, i32 0, null, metadata !"dz4", metadata !"dz4", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz4] [line 34] [local] [def]
+!392 = metadata !{i32 786484, i32 0, null, metadata !"dz3", metadata !"dz3", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz3] [line 34] [local] [def]
+!393 = metadata !{i32 786484, i32 0, null, metadata !"dz2", metadata !"dz2", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz2] [line 34] [local] [def]
+!394 = metadata !{i32 786484, i32 0, null, metadata !"dz1", metadata !"dz1", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz1] [line 34] [local] [def]
+!395 = metadata !{i32 786484, i32 0, null, metadata !"dy5", metadata !"dy5", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy5] [line 33] [local] [def]
+!396 = metadata !{i32 786484, i32 0, null, metadata !"dy4", metadata !"dy4", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy4] [line 33] [local] [def]
+!397 = metadata !{i32 786484, i32 0, null, metadata !"dy3", metadata !"dy3", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy3] [line 33] [local] [def]
+!398 = metadata !{i32 786484, i32 0, null, metadata !"dy2", metadata !"dy2", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy2] [line 33] [local] [def]
+!399 = metadata !{i32 786484, i32 0, null, metadata !"dy1", metadata !"dy1", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy1] [line 33] [local] [def]
+!400 = metadata !{i32 786484, i32 0, null, metadata !"dx5", metadata !"dx5", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx5] [line 32] [local] [def]
+!401 = metadata !{i32 786484, i32 0, null, metadata !"dx4", metadata !"dx4", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx4] [line 32] [local] [def]
+!402 = metadata !{i32 786484, i32 0, null, metadata !"dx3", metadata !"dx3", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx3] [line 32] [local] [def]
+!403 = metadata !{i32 786484, i32 0, null, metadata !"dx2", metadata !"dx2", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx2] [line 32] [local] [def]
+!404 = metadata !{i32 786484, i32 0, null, metadata !"dx1", metadata !"dx1", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx1] [line 32] [local] [def]
+!405 = metadata !{i32 786484, i32 0, null, metadata !"tz3", metadata !"tz3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz3] [line 31] [local] [def]
+!406 = metadata !{i32 786484, i32 0, null, metadata !"tz1", metadata !"tz1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz1] [line 31] [local] [def]
+!407 = metadata !{i32 786484, i32 0, null, metadata !"ty3", metadata !"ty3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty3] [line 31] [local] [def]
+!408 = metadata !{i32 786484, i32 0, null, metadata !"ty1", metadata !"ty1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty1] [line 31] [local] [def]
+!409 = metadata !{i32 786484, i32 0, null, metadata !"tx3", metadata !"tx3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx3] [line 31] [local] [def]
+!410 = metadata !{i32 786484, i32 0, null, metadata !"tx1", metadata !"tx1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx1] [line 31] [local] [def]
+!411 = metadata !{i32 786484, i32 0, null, metadata !"conz1", metadata !"conz1", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [conz1] [line 45] [local] [def]
+!412 = metadata !{i32 786484, i32 0, null, metadata !"c1345", metadata !"c1345", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1345] [line 44] [local] [def]
+!413 = metadata !{i32 786484, i32 0, null, metadata !"c3c4", metadata !"c3c4", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4] [line 44] [local] [def]
+!414 = metadata !{i32 786484, i32 0, null, metadata !"c1c5", metadata !"c1c5", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1c5] [line 44] [local] [def]
+!415 = metadata !{i32 786484, i32 0, null, metadata !"c1c2", metadata !"c1c2", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1c2] [line 44] [local] [def]
+!416 = metadata !{i32 786484, i32 0, null, metadata !"c5", metadata !"c5", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c5] [line 45] [local] [def]
+!417 = metadata !{i32 786484, i32 0, null, metadata !"c4", metadata !"c4", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c4] [line 45] [local] [def]
+!418 = metadata !{i32 786484, i32 0, null, metadata !"c3", metadata !"c3", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3] [line 45] [local] [def]
+!419 = metadata !{i32 786484, i32 0, null, metadata !"lhs", metadata !"lhs", metadata !"", metadata !300, i32 69, metadata !420, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [lhs] [line 69] [local] [def]
+!420 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 20787585600, i64 64, i32 0, i32 0, metadata !20, metadata !421, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 20787585600, align 64, offset 0] [from double]
+!421 = metadata !{metadata !308, metadata !308, metadata !308, metadata !178, metadata !93, metadata !93}
+!422 = metadata !{i32 786484, i32 0, null, metadata !"q", metadata !"q", metadata !"", metadata !300, i32 73, metadata !423, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [q] [line 73] [local] [def]
+!423 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 10368, i64 64, i32 0, i32 0, metadata !20, metadata !424, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 10368, align 64, offset 0] [from double]
+!424 = metadata !{metadata !425}
+!425 = metadata !{i32 786465, i64 0, i64 162}     ; [ DW_TAG_subrange_type ] [0, 161]
+!426 = metadata !{i32 786484, i32 0, null, metadata !"cuf", metadata !"cuf", metadata !"", metadata !300, i32 72, metadata !423, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [cuf] [line 72] [local] [def]
+!427 = metadata !{i32 786484, i32 0, null, metadata !"buf", metadata !"buf", metadata !"", metadata !300, i32 75, metadata !428, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [buf] [line 75] [local] [def]
+!428 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 51840, i64 64, i32 0, i32 0, metadata !20, metadata !429, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 51840, align 64, offset 0] [from double]
+!429 = metadata !{metadata !425, metadata !93}
+!430 = metadata !{i32 786484, i32 0, null, metadata !"ue", metadata !"ue", metadata !"", metadata !300, i32 74, metadata !428, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ue] [line 74] [local] [def]
+!431 = metadata !{i32 786484, i32 0, null, metadata !"njac", metadata !"njac", metadata !"", metadata !300, i32 86, metadata !432, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [njac] [line 86] [local] [def]
+!432 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 6886684800, i64 64, i32 0, i32 0, metadata !20, metadata !433, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 6886684800, align 64, offset 0] [from double]
+!433 = metadata !{metadata !308, metadata !308, metadata !425, metadata !93, metadata !93}
+!434 = metadata !{i32 786484, i32 0, null, metadata !"fjac", metadata !"fjac", metadata !"", metadata !300, i32 84, metadata !432, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [fjac] [line 84] [local] [def]
+!435 = metadata !{i32 786484, i32 0, null, metadata !"tmp3", metadata !"tmp3", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp3] [line 88] [local] [def]
+!436 = metadata !{i32 786484, i32 0, null, metadata !"tmp2", metadata !"tmp2", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp2] [line 88] [local] [def]
+!437 = metadata !{i32 786484, i32 0, null, metadata !"tmp1", metadata !"tmp1", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp1] [line 88] [local] [def]
+!438 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!439 = metadata !{i32 1898, i32 0, metadata !440, null}
+!440 = metadata !{i32 786443, metadata !1, metadata !114, i32 1898, i32 0, i32 107} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!441 = metadata !{i32 1913, i32 0, metadata !442, null}
+!442 = metadata !{i32 786443, metadata !1, metadata !114, i32 1913, i32 0, i32 115} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!443 = metadata !{i32 1923, i32 0, metadata !114, null}
+!444 = metadata !{metadata !"int", metadata !445}
+!445 = metadata !{metadata !"omnipotent char", metadata !446}
+!446 = metadata !{metadata !"Simple C/C++ TBAA"}
+!447 = metadata !{i32 1}
+!448 = metadata !{i32 1925, i32 0, metadata !449, null}
+!449 = metadata !{i32 786443, metadata !1, metadata !114, i32 1925, i32 0, i32 121} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!450 = metadata !{i32 1939, i32 0, metadata !451, null}
+!451 = metadata !{i32 786443, metadata !1, metadata !114, i32 1939, i32 0, i32 127} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!452 = metadata !{i32 1940, i32 0, metadata !453, null}
+!453 = metadata !{i32 786443, metadata !1, metadata !454, i32 1940, i32 0, i32 129} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!454 = metadata !{i32 786443, metadata !1, metadata !451, i32 1939, i32 0, i32 128} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!455 = metadata !{i32 1941, i32 0, metadata !456, null}
+!456 = metadata !{i32 786443, metadata !1, metadata !457, i32 1941, i32 0, i32 131} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!457 = metadata !{i32 786443, metadata !1, metadata !453, i32 1940, i32 0, i32 130} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!458 = metadata !{i32 2020, i32 0, metadata !459, null}
+!459 = metadata !{i32 786443, metadata !1, metadata !460, i32 2020, i32 0, i32 149} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!460 = metadata !{i32 786443, metadata !1, metadata !461, i32 2019, i32 0, i32 148} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!461 = metadata !{i32 786443, metadata !1, metadata !462, i32 2019, i32 0, i32 147} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!462 = metadata !{i32 786443, metadata !1, metadata !463, i32 2018, i32 0, i32 146} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!463 = metadata !{i32 786443, metadata !1, metadata !114, i32 2018, i32 0, i32 145} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!464 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/PowerPC/pr17354.ll b/test/CodeGen/PowerPC/pr17354.ll
new file mode 100644
index 0000000..dca81b1
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr17354.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mcpu=pwr7 -relocation-model=pic <%s | FileCheck %s
+
+; Test that PR17354 is fixed.  We must generate a nop following even
+; local calls when generating code for shared libraries, to permit
+; TOC fixup.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.CS = type { i32 }
+
+@_ZL3glb = internal global [1 x %struct.CS] zeroinitializer, align 4
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
+
+define internal void @__cxx_global_var_init() section ".text.startup" {
+entry:
+  call void @_Z4funcv(%struct.CS* sret getelementptr inbounds ([1 x %struct.CS]* @_ZL3glb, i64 0, i64 0))
+  ret void
+}
+
+; CHECK-LABEL: __cxx_global_var_init:
+; CHECK: bl _Z4funcv
+; CHECK-NEXT: nop
+
+; Function Attrs: nounwind
+define void @_Z4funcv(%struct.CS* noalias sret %agg.result) #0 {
+entry:
+  %a_ = getelementptr inbounds %struct.CS* %agg.result, i32 0, i32 0
+  store i32 0, i32* %a_, align 4
+  ret void
+}
+
+define internal void @_GLOBAL__I_a() section ".text.startup" {
+entry:
+  call void @__cxx_global_var_init()
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/recipest.ll b/test/CodeGen/PowerPC/recipest.ll
index 38d7682..891e801 100644
--- a/test/CodeGen/PowerPC/recipest.ll
+++ b/test/CodeGen/PowerPC/recipest.ll
@@ -169,6 +169,7 @@ entry:
   ret double %r
 
 ; CHECK: @foo3
+; CHECK: fcmpu
 ; CHECK-DAG: frsqrte
 ; CHECK-DAG: fnmsub
 ; CHECK: fmul
@@ -195,6 +196,7 @@ entry:
   ret float %r
 
 ; CHECK: @goo3
+; CHECK: fcmpu
 ; CHECK-DAG: frsqrtes
 ; CHECK-DAG: fnmsubs
 ; CHECK: fmuls
@@ -217,7 +219,8 @@ entry:
 
 ; CHECK: @hoo3
 ; CHECK: vrsqrtefp
-; CHECK: vrefp
+; CHECK-DAG: vrefp
+; CHECK-DAG: vcmpeqfp
 
 ; CHECK-SAFE: @hoo3
 ; CHECK-SAFE-NOT: vrsqrtefp
diff --git a/test/CodeGen/PowerPC/reg-names.ll b/test/CodeGen/PowerPC/reg-names.ll
new file mode 100644
index 0000000..f8fa7e4
--- /dev/null
+++ b/test/CodeGen/PowerPC/reg-names.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -ppc-asm-full-reg-names < %s | FileCheck -check-prefix=CHECK-FN %s
+
+define i64 @test1(i64 %a, i64 %b) {
+; CHECK-LABEL: @test1
+; CHECK-FN-LABEL: @test1
+
+entry:
+  ret i64 %b
+
+; CHECK: mr 3, 4
+; CHECK-FN: mr r3, r4
+
+; CHECK: blr
+; CHECK-FN: blr
+}
+
diff --git a/test/CodeGen/PowerPC/reloc-align.ll b/test/CodeGen/PowerPC/reloc-align.ll
index bd5c4d6..13d6ada 100644
--- a/test/CodeGen/PowerPC/reloc-align.ll
+++ b/test/CodeGen/PowerPC/reloc-align.ll
@@ -31,4 +31,4 @@ entry:
   ret i32 %bf.cast
 }
 
-attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/rlwimi-and.ll b/test/CodeGen/PowerPC/rlwimi-and.ll
index e20a13f..7963249 100644
--- a/test/CodeGen/PowerPC/rlwimi-and.ll
+++ b/test/CodeGen/PowerPC/rlwimi-and.ll
@@ -28,12 +28,11 @@ codeRepl17:                                       ; preds = %codeRepl4
   store i16 %rvml38.sroa.0.0.insert.insert, i16* undef, align 2
   unreachable
 
+; FIXME: the SLWI could be folded into the RLWIMI to give a rotate of 8.
 ; CHECK: @test
-; CHECK-DAG: slwi [[R1:[0-9]+]],
-; CHECK-DAG: rlwinm [[R2:[0-9]+]],
-; CHECK-DAG: srawi [[R3:[0-9]+]], [[R1]]
-; CHECK-DAG: rlwinm [[R4:[0-9]+]], [[R3]], 0, 23, 23
-; CHECK: rlwimi [[R4]], [[R2]], 0,
+; CHECK-DAG: slwi [[R1:[0-9]+]], {{[0-9]+}}, 31
+; CHECK-DAG: rlwinm [[R2:[0-9]+]], {{[0-9]+}}, 0, 31, 31
+; CHECK: rlwimi [[R2]], [[R1]], 9, 23, 23
 
 codeRepl29:                                       ; preds = %codeRepl1
   unreachable
diff --git a/test/CodeGen/PowerPC/rounding-ops.ll b/test/CodeGen/PowerPC/rounding-ops.ll
index 2c02900..bf0a641 100644
--- a/test/CodeGen/PowerPC/rounding-ops.ll
+++ b/test/CodeGen/PowerPC/rounding-ops.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math | FileCheck -check-prefix=CHECK-FM %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -9,9 +8,6 @@ define float @test1(float %x) nounwind  {
 
 ; CHECK-LABEL: test1:
 ; CHECK: frim 1, 1
-
-; CHECK-FM-LABEL: test1:
-; CHECK-FM: frim 1, 1
 }
 
 declare float @floorf(float) nounwind readnone
@@ -22,38 +18,29 @@ define double @test2(double %x) nounwind  {
 
 ; CHECK-LABEL: test2:
 ; CHECK: frim 1, 1
-
-; CHECK-FM-LABEL: test2:
-; CHECK-FM: frim 1, 1
 }
 
 declare double @floor(double) nounwind readnone
 
 define float @test3(float %x) nounwind  {
-  %call = tail call float @nearbyintf(float %x) nounwind readnone
+  %call = tail call float @roundf(float %x) nounwind readnone
   ret float %call
 
 ; CHECK-LABEL: test3:
-; CHECK-NOT: frin
-
-; CHECK-FM-LABEL: test3:
-; CHECK-FM: frin 1, 1
+; CHECK: frin 1, 1
 }
 
-declare float @nearbyintf(float) nounwind readnone
+declare float @roundf(float) nounwind readnone
 
 define double @test4(double %x) nounwind  {
-  %call = tail call double @nearbyint(double %x) nounwind readnone
+  %call = tail call double @round(double %x) nounwind readnone
   ret double %call
 
 ; CHECK-LABEL: test4:
-; CHECK-NOT: frin
-
-; CHECK-FM-LABEL: test4:
-; CHECK-FM: frin 1, 1
+; CHECK: frin 1, 1
 }
 
-declare double @nearbyint(double) nounwind readnone
+declare double @round(double) nounwind readnone
 
 define float @test5(float %x) nounwind  {
   %call = tail call float @ceilf(float %x) nounwind readnone
@@ -61,9 +48,6 @@ define float @test5(float %x) nounwind  {
 
 ; CHECK-LABEL: test5:
 ; CHECK: frip 1, 1
-
-; CHECK-FM-LABEL: test5:
-; CHECK-FM: frip 1, 1
 }
 
 declare float @ceilf(float) nounwind readnone
@@ -74,9 +58,6 @@ define double @test6(double %x) nounwind  {
 
 ; CHECK-LABEL: test6:
 ; CHECK: frip 1, 1
-
-; CHECK-FM-LABEL: test6:
-; CHECK-FM: frip 1, 1
 }
 
 declare double @ceil(double) nounwind readnone
@@ -87,9 +68,6 @@ define float @test9(float %x) nounwind  {
 
 ; CHECK-LABEL: test9:
 ; CHECK: friz 1, 1
-
-; CHECK-FM-LABEL: test9:
-; CHECK-FM: friz 1, 1
 }
 
 declare float @truncf(float) nounwind readnone
@@ -100,48 +78,7 @@ define double @test10(double %x) nounwind  {
 
 ; CHECK-LABEL: test10:
 ; CHECK: friz 1, 1
-
-; CHECK-FM-LABEL: test10:
-; CHECK-FM: friz 1, 1
 }
 
 declare double @trunc(double) nounwind readnone
 
-define void @test11(float %x, float* %y) nounwind  {
-  %call = tail call float @rintf(float %x) nounwind readnone
-  store float %call, float* %y
-  ret void
-
-; CHECK-LABEL: test11:
-; CHECK-NOT: frin
-
-; CHECK-FM-LABEL: test11:
-; CHECK-FM: frin [[R2:[0-9]+]], [[R1:[0-9]+]]
-; CHECK-FM: fcmpu [[CR:[0-9]+]], [[R2]], [[R1]]
-; CHECK-FM: beq [[CR]], .LBB[[BB:[0-9]+]]_2
-; CHECK-FM: mtfsb1 6
-; CHECK-FM: .LBB[[BB]]_2:
-; CHECK-FM: blr
-}
-
-declare float @rintf(float) nounwind readnone
-
-define void @test12(double %x, double* %y) nounwind  {
-  %call = tail call double @rint(double %x) nounwind readnone
-  store double %call, double* %y
-  ret void
-
-; CHECK-LABEL: test12:
-; CHECK-NOT: frin
-
-; CHECK-FM-LABEL: test12:
-; CHECK-FM: frin [[R2:[0-9]+]], [[R1:[0-9]+]]
-; CHECK-FM: fcmpu [[CR:[0-9]+]], [[R2]], [[R1]]
-; CHECK-FM: beq [[CR]], .LBB[[BB:[0-9]+]]_2
-; CHECK-FM: mtfsb1 6
-; CHECK-FM: .LBB[[BB]]_2:
-; CHECK-FM: blr
-}
-
-declare double @rint(double) nounwind readnone
-
diff --git a/test/CodeGen/PowerPC/sjlj.ll b/test/CodeGen/PowerPC/sjlj.ll
index 571f3b2..414640b 100644
--- a/test/CodeGen/PowerPC/sjlj.ll
+++ b/test/CodeGen/PowerPC/sjlj.ll
@@ -64,15 +64,16 @@ return:                                           ; preds = %if.end, %if.then
 ; CHECK: std
 ; Make sure that we're not saving VRSAVE on non-Darwin:
 ; CHECK-NOT: mfspr
-; CHECK: stfd
-; CHECK: stvx
 
-; CHECK: addis [[REG:[0-9]+]], 2, env_sigill@toc@ha
-; CHECK: std 31, env_sigill@toc@l([[REG]])
-; CHECK: addi [[REG]], [[REG]], env_sigill@toc@l
-; CHECK: std [[REG]], [[OFF:[0-9]+]](31)                  # 8-byte Folded Spill
-; CHECK: std 1, 16([[REG]])
-; CHECK: std 2, 24([[REG]])
+; CHECK-DAG: stfd
+; CHECK-DAG: stvx
+
+; CHECK-DAG: addis [[REG:[0-9]+]], 2, env_sigill@toc@ha
+; CHECK-DAG: std 31, env_sigill@toc@l([[REG]])
+; CHECK-DAG: addi [[REGA:[0-9]+]], [[REG]], env_sigill@toc@l
+; CHECK-DAG: std [[REGA]], [[OFF:[0-9]+]](31)                  # 8-byte Folded Spill
+; CHECK-DAG: std 1, 16([[REGA]])
+; CHECK-DAG: std 2, 24([[REGA]])
 ; CHECK: bcl 20, 31, .LBB1_1
 ; CHECK: li 3, 1
 ; CHECK: #EH_SjLj_Setup	.LBB1_1
@@ -134,11 +135,11 @@ return:                                           ; preds = %if.end, %if.then
 
 ; CHECK: addis [[REG:[0-9]+]], 2, env_sigill@toc@ha
 ; CHECK: std 31, env_sigill@toc@l([[REG]])
-; CHECK: addi [[REG]], [[REG]], env_sigill@toc@l
-; CHECK: std [[REG]], [[OFF:[0-9]+]](31)                  # 8-byte Folded Spill
-; CHECK: std 1, 16([[REG]])
-; CHECK: std 2, 24([[REG]])
-; CHECK: std 30, 32([[REG]])
+; CHECK: addi [[REGB:[0-9]+]], [[REG]], env_sigill@toc@l
+; CHECK-DAG: std [[REGB]], [[OFF:[0-9]+]](31)                  # 8-byte Folded Spill
+; CHECK-DAG: std 1, 16([[REGB]])
+; CHECK-DAG: std 2, 24([[REGB]])
+; CHECK-DAG: std 30, 32([[REGB]])
 ; CHECK: bcl 20, 31,
 
 ; CHECK: blr
@@ -152,7 +153,7 @@ declare i8* @llvm.stacksave() #3
 
 declare i32 @llvm.eh.sjlj.setjmp(i8*) #3
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { noreturn nounwind }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }
diff --git a/test/CodeGen/PowerPC/stack-realign.ll b/test/CodeGen/PowerPC/stack-realign.ll
index f7b6d19..1c7a36a 100644
--- a/test/CodeGen/PowerPC/stack-realign.ll
+++ b/test/CodeGen/PowerPC/stack-realign.ll
@@ -11,13 +11,13 @@ define void @goo(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32], align 32
   %a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %0 = load i32* %a1, align 4, !tbaa !0
+  %0 = load i32* %a1, align 4
   %arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
-  store i32 %0, i32* %arrayidx, align 32, !tbaa !0
+  store i32 %0, i32* %arrayidx, align 32
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %1 = load i32* %b, align 4, !tbaa !0
+  %1 = load i32* %b, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+  store i32 %1, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx)
   ret void
 }
@@ -74,13 +74,13 @@ define void @hoo(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [200000 x i32], align 32
   %a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %0 = load i32* %a1, align 4, !tbaa !0
+  %0 = load i32* %a1, align 4
   %arrayidx = getelementptr inbounds [200000 x i32]* %x, i64 0, i64 0
-  store i32 %0, i32* %arrayidx, align 32, !tbaa !0
+  store i32 %0, i32* %arrayidx, align 32
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %1 = load i32* %b, align 4, !tbaa !0
+  %1 = load i32* %b, align 4
   %arrayidx2 = getelementptr inbounds [200000 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+  store i32 %1, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx)
   ret void
 }
@@ -105,13 +105,13 @@ define void @loo(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32], align 32
   %a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %0 = load i32* %a1, align 4, !tbaa !0
+  %0 = load i32* %a1, align 4
   %arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
-  store i32 %0, i32* %arrayidx, align 32, !tbaa !0
+  store i32 %0, i32* %arrayidx, align 32
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %1 = load i32* %b, align 4, !tbaa !0
+  %1 = load i32* %b, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+  store i32 %1, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx)
   call void asm sideeffect "", "~{f30}"() nounwind
   ret void
@@ -145,7 +145,3 @@ entry:
 ; CHECK-FP: stfd 30, -16(30)
 
 ; CHECK-FP: blr
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/subsumes-pred-regs.ll b/test/CodeGen/PowerPC/subsumes-pred-regs.ll
new file mode 100644
index 0000000..97ac788
--- /dev/null
+++ b/test/CodeGen/PowerPC/subsumes-pred-regs.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mcpu=ppc64 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define zeroext i1 @test1() unnamed_addr #0 align 2 {
+
+; CHECK-LABEL: @test1
+
+entry:
+  br i1 undef, label %lor.end, label %lor.rhs
+
+lor.rhs:                                          ; preds = %entry
+  unreachable
+
+lor.end:                                          ; preds = %entry
+  br i1 undef, label %land.rhs, label %if.then
+
+if.then:                                          ; preds = %lor.end
+  br i1 undef, label %return, label %if.end.i24
+
+if.end.i24:                                       ; preds = %if.then
+  %0 = load i32* undef, align 4
+  %lnot.i.i16.i23 = icmp eq i32 %0, 0
+  br i1 %lnot.i.i16.i23, label %if.end7.i37, label %test.exit27.i34
+
+test.exit27.i34: ; preds = %if.end.i24
+  br i1 undef, label %return, label %if.end7.i37
+
+if.end7.i37:                                      ; preds = %test.exit27.i34, %if.end.i24
+  %tobool.i.i36 = icmp eq i8 undef, 0
+  br i1 %tobool.i.i36, label %return, label %if.then9.i39
+
+if.then9.i39:                                     ; preds = %if.end7.i37
+  br i1 %lnot.i.i16.i23, label %return, label %lor.rhs.i.i49
+
+; CHECK: .LBB0_7:
+; CHECK:	beq 1, .LBB0_10
+; CHECK:	beq 0, .LBB0_10
+; CHECK: .LBB0_9:
+
+lor.rhs.i.i49:                                    ; preds = %if.then9.i39
+  %cmp.i.i.i.i48 = icmp ne i64 undef, 0
+  br label %return
+
+land.rhs:                                         ; preds = %lor.end
+  br i1 undef, label %return, label %if.end.i
+
+if.end.i:                                         ; preds = %land.rhs
+  br i1 undef, label %return, label %if.then9.i
+
+if.then9.i:                                       ; preds = %if.end.i
+  br i1 undef, label %return, label %lor.rhs.i.i
+
+lor.rhs.i.i:                                      ; preds = %if.then9.i
+  %cmp.i.i.i.i = icmp ne i64 undef, 0
+  br label %return
+
+return:                                           ; preds = %lor.rhs.i.i, %if.then9.i, %if.end.i, %land.rhs, %lor.rhs.i.i49, %if.then9.i39, %if.end7.i37, %test.exit27.i34, %if.then
+  %retval.0 = phi i1 [ false, %if.then ], [ false, %test.exit27.i34 ], [ true, %if.end7.i37 ], [ true, %if.then9.i39 ], [ %cmp.i.i.i.i48, %lor.rhs.i.i49 ], [ false, %land.rhs ], [ true, %if.end.i ], [ true, %if.then9.i ], [ %cmp.i.i.i.i, %lor.rhs.i.i ]
+  ret i1 %retval.0
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/tls-gd-obj.ll b/test/CodeGen/PowerPC/tls-gd-obj.ll
deleted file mode 100644
index 26cb6f2..0000000
--- a/test/CodeGen/PowerPC/tls-gd-obj.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc -mcpu=pwr7 -O0 -filetype=obj -relocation-model=pic %s -o - | \
-; RUN: llvm-readobj -r | FileCheck %s
-
-; Test correct relocation generation for thread-local storage using
-; the general dynamic model and integrated assembly.
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-@a = thread_local global i32 0, align 4
-
-define signext i32 @main() nounwind {
-entry:
-  %retval = alloca i32, align 4
-  store i32 0, i32* %retval
-  %0 = load i32* @a, align 4
-  ret i32 %0
-}
-
-; Verify generation of R_PPC64_GOT_TLSGD16_HA, R_PPC64_GOT_TLSGD16_LO,
-; and R_PPC64_TLSGD for accessing external variable a, and R_PPC64_REL24
-; for the call to __tls_get_addr.
-;
-; CHECK: Relocations [
-; CHECK:   Section (2) .rela.text {
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSGD16_HA a
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSGD16_LO a
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_TLSGD          a
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_REL24          __tls_get_addr
-; CHECK:   }
-; CHECK: ]
diff --git a/test/CodeGen/PowerPC/tls-ie-obj.ll b/test/CodeGen/PowerPC/tls-ie-obj.ll
deleted file mode 100644
index f24a94b..0000000
--- a/test/CodeGen/PowerPC/tls-ie-obj.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc -mcpu=pwr7 -O0 -filetype=obj %s -o - | \
-; RUN: llvm-readobj -r | FileCheck %s
-
-; Test correct relocation generation for thread-local storage
-; using the initial-exec model and integrated assembly.
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-@a = external thread_local global i32
-
-define signext i32 @main() nounwind {
-entry:
-  %retval = alloca i32, align 4
-  store i32 0, i32* %retval
-  %0 = load i32* @a, align 4
-  ret i32 %0
-}
-
-; Verify generation of R_PPC64_GOT_TPREL16_DS and R_PPC64_TLS for
-; accessing external variable a.
-;
-; CHECK: Relocations [
-; CHECK:   Section (2) .rela.text {
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TPREL16_HA    a
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TPREL16_LO_DS a
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_TLS               a
-; CHECK:   }
-; CHECK: ]
diff --git a/test/CodeGen/PowerPC/tls-ld-obj.ll b/test/CodeGen/PowerPC/tls-ld-obj.ll
deleted file mode 100644
index 4a7d7b3..0000000
--- a/test/CodeGen/PowerPC/tls-ld-obj.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: llc -mcpu=pwr7 -O0 -filetype=obj -relocation-model=pic %s -o - | \
-; RUN: llvm-readobj -r | FileCheck %s
-
-; Test correct relocation generation for thread-local storage using
-; the local dynamic model.
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-@a = hidden thread_local global i32 0, align 4
-
-define signext i32 @main() nounwind {
-entry:
-  %retval = alloca i32, align 4
-  store i32 0, i32* %retval
-  %0 = load i32* @a, align 4
-  ret i32 %0
-}
-
-; Verify generation of R_PPC64_GOT_TLSLD16_HA, R_PPC64_GOT_TLSLD16_LO,
-; R_PPC64_TLSLD, R_PPC64_DTPREL16_HA, and R_PPC64_DTPREL16_LO for
-; accessing external variable a, and R_PPC64_REL24 for the call to
-; __tls_get_addr.
-;
-; CHECK: Relocations [
-; CHECK:   Section (2) .rela.text {
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSLD16_HA a
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSLD16_LO a
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_TLSLD          a
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_REL24          __tls_get_addr
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_DTPREL16_HA    a
-; CHECK:     0x{{[0-9,A-F]+}} R_PPC64_DTPREL16_LO    a
-; CHECK:   }
-; CHECK: ]
diff --git a/test/CodeGen/PowerPC/unal-altivec2.ll b/test/CodeGen/PowerPC/unal-altivec2.ll
new file mode 100644
index 0000000..7464675
--- /dev/null
+++ b/test/CodeGen/PowerPC/unal-altivec2.ll
@@ -0,0 +1,166 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(float* noalias nocapture %x, float* noalias nocapture readonly %y) #0 {
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+; CHECK-LABEL: @foo
+; CHECK: lvsl
+; CHECK: blr
+  %index = phi i64 [ 0, %entry ], [ %index.next.15, %vector.body ]
+  %0 = getelementptr inbounds float* %y, i64 %index
+  %1 = bitcast float* %0 to <4 x float>*
+  %wide.load = load <4 x float>* %1, align 4
+  %2 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load)
+  %3 = getelementptr inbounds float* %x, i64 %index
+  %4 = bitcast float* %3 to <4 x float>*
+  store <4 x float> %2, <4 x float>* %4, align 4
+  %index.next = add i64 %index, 4
+  %5 = getelementptr inbounds float* %y, i64 %index.next
+  %6 = bitcast float* %5 to <4 x float>*
+  %wide.load.1 = load <4 x float>* %6, align 4
+  %7 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.1)
+  %8 = getelementptr inbounds float* %x, i64 %index.next
+  %9 = bitcast float* %8 to <4 x float>*
+  store <4 x float> %7, <4 x float>* %9, align 4
+  %index.next.1 = add i64 %index.next, 4
+  %10 = getelementptr inbounds float* %y, i64 %index.next.1
+  %11 = bitcast float* %10 to <4 x float>*
+  %wide.load.2 = load <4 x float>* %11, align 4
+  %12 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.2)
+  %13 = getelementptr inbounds float* %x, i64 %index.next.1
+  %14 = bitcast float* %13 to <4 x float>*
+  store <4 x float> %12, <4 x float>* %14, align 4
+  %index.next.2 = add i64 %index.next.1, 4
+  %15 = getelementptr inbounds float* %y, i64 %index.next.2
+  %16 = bitcast float* %15 to <4 x float>*
+  %wide.load.3 = load <4 x float>* %16, align 4
+  %17 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.3)
+  %18 = getelementptr inbounds float* %x, i64 %index.next.2
+  %19 = bitcast float* %18 to <4 x float>*
+  store <4 x float> %17, <4 x float>* %19, align 4
+  %index.next.3 = add i64 %index.next.2, 4
+  %20 = getelementptr inbounds float* %y, i64 %index.next.3
+  %21 = bitcast float* %20 to <4 x float>*
+  %wide.load.4 = load <4 x float>* %21, align 4
+  %22 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.4)
+  %23 = getelementptr inbounds float* %x, i64 %index.next.3
+  %24 = bitcast float* %23 to <4 x float>*
+  store <4 x float> %22, <4 x float>* %24, align 4
+  %index.next.4 = add i64 %index.next.3, 4
+  %25 = getelementptr inbounds float* %y, i64 %index.next.4
+  %26 = bitcast float* %25 to <4 x float>*
+  %wide.load.5 = load <4 x float>* %26, align 4
+  %27 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.5)
+  %28 = getelementptr inbounds float* %x, i64 %index.next.4
+  %29 = bitcast float* %28 to <4 x float>*
+  store <4 x float> %27, <4 x float>* %29, align 4
+  %index.next.5 = add i64 %index.next.4, 4
+  %30 = getelementptr inbounds float* %y, i64 %index.next.5
+  %31 = bitcast float* %30 to <4 x float>*
+  %wide.load.6 = load <4 x float>* %31, align 4
+  %32 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.6)
+  %33 = getelementptr inbounds float* %x, i64 %index.next.5
+  %34 = bitcast float* %33 to <4 x float>*
+  store <4 x float> %32, <4 x float>* %34, align 4
+  %index.next.6 = add i64 %index.next.5, 4
+  %35 = getelementptr inbounds float* %y, i64 %index.next.6
+  %36 = bitcast float* %35 to <4 x float>*
+  %wide.load.7 = load <4 x float>* %36, align 4
+  %37 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.7)
+  %38 = getelementptr inbounds float* %x, i64 %index.next.6
+  %39 = bitcast float* %38 to <4 x float>*
+  store <4 x float> %37, <4 x float>* %39, align 4
+  %index.next.7 = add i64 %index.next.6, 4
+  %40 = getelementptr inbounds float* %y, i64 %index.next.7
+  %41 = bitcast float* %40 to <4 x float>*
+  %wide.load.8 = load <4 x float>* %41, align 4
+  %42 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.8)
+  %43 = getelementptr inbounds float* %x, i64 %index.next.7
+  %44 = bitcast float* %43 to <4 x float>*
+  store <4 x float> %42, <4 x float>* %44, align 4
+  %index.next.8 = add i64 %index.next.7, 4
+  %45 = getelementptr inbounds float* %y, i64 %index.next.8
+  %46 = bitcast float* %45 to <4 x float>*
+  %wide.load.9 = load <4 x float>* %46, align 4
+  %47 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.9)
+  %48 = getelementptr inbounds float* %x, i64 %index.next.8
+  %49 = bitcast float* %48 to <4 x float>*
+  store <4 x float> %47, <4 x float>* %49, align 4
+  %index.next.9 = add i64 %index.next.8, 4
+  %50 = getelementptr inbounds float* %y, i64 %index.next.9
+  %51 = bitcast float* %50 to <4 x float>*
+  %wide.load.10 = load <4 x float>* %51, align 4
+  %52 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.10)
+  %53 = getelementptr inbounds float* %x, i64 %index.next.9
+  %54 = bitcast float* %53 to <4 x float>*
+  store <4 x float> %52, <4 x float>* %54, align 4
+  %index.next.10 = add i64 %index.next.9, 4
+  %55 = getelementptr inbounds float* %y, i64 %index.next.10
+  %56 = bitcast float* %55 to <4 x float>*
+  %wide.load.11 = load <4 x float>* %56, align 4
+  %57 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.11)
+  %58 = getelementptr inbounds float* %x, i64 %index.next.10
+  %59 = bitcast float* %58 to <4 x float>*
+  store <4 x float> %57, <4 x float>* %59, align 4
+  %index.next.11 = add i64 %index.next.10, 4
+  %60 = getelementptr inbounds float* %y, i64 %index.next.11
+  %61 = bitcast float* %60 to <4 x float>*
+  %wide.load.12 = load <4 x float>* %61, align 4
+  %62 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.12)
+  %63 = getelementptr inbounds float* %x, i64 %index.next.11
+  %64 = bitcast float* %63 to <4 x float>*
+  store <4 x float> %62, <4 x float>* %64, align 4
+  %index.next.12 = add i64 %index.next.11, 4
+  %65 = getelementptr inbounds float* %y, i64 %index.next.12
+  %66 = bitcast float* %65 to <4 x float>*
+  %wide.load.13 = load <4 x float>* %66, align 4
+  %67 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.13)
+  %68 = getelementptr inbounds float* %x, i64 %index.next.12
+  %69 = bitcast float* %68 to <4 x float>*
+  store <4 x float> %67, <4 x float>* %69, align 4
+  %index.next.13 = add i64 %index.next.12, 4
+  %70 = getelementptr inbounds float* %y, i64 %index.next.13
+  %71 = bitcast float* %70 to <4 x float>*
+  %wide.load.14 = load <4 x float>* %71, align 4
+  %72 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.14)
+  %73 = getelementptr inbounds float* %x, i64 %index.next.13
+  %74 = bitcast float* %73 to <4 x float>*
+  store <4 x float> %72, <4 x float>* %74, align 4
+  %index.next.14 = add i64 %index.next.13, 4
+  %75 = getelementptr inbounds float* %y, i64 %index.next.14
+  %76 = bitcast float* %75 to <4 x float>*
+  %wide.load.15 = load <4 x float>* %76, align 4
+  %77 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.15)
+  %78 = getelementptr inbounds float* %x, i64 %index.next.14
+  %79 = bitcast float* %78 to <4 x float>*
+  store <4 x float> %77, <4 x float>* %79, align 4
+  %index.next.15 = add i64 %index.next.14, 4
+  %80 = icmp eq i64 %index.next.15, 2048
+  br i1 %80, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm_cos_v4f32(<4 x float>) #1
+
+define <2 x double> @bar(double* %x) {
+entry:
+  %p = bitcast double* %x to <2 x double>*
+  %r = load <2 x double>* %p, align 8
+
+; CHECK-LABEL: @bar
+; CHECK-NOT: lvsl
+; CHECK: blr
+
+  ret <2 x double> %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/PowerPC/unal4-std.ll b/test/CodeGen/PowerPC/unal4-std.ll
index 169bd78..9f29e31 100644
--- a/test/CodeGen/PowerPC/unal4-std.ll
+++ b/test/CodeGen/PowerPC/unal4-std.ll
@@ -24,4 +24,4 @@ if.end210:                                        ; preds = %entry
 ; CHECK: stdx {{[0-9]+}}, 0,
 }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/unwind-dw2-g.ll b/test/CodeGen/PowerPC/unwind-dw2-g.ll
index 2baac76..260d036 100644
--- a/test/CodeGen/PowerPC/unwind-dw2-g.ll
+++ b/test/CodeGen/PowerPC/unwind-dw2-g.ll
@@ -19,7 +19,7 @@ declare void @llvm.eh.unwind.init() #0
 attributes #0 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!8}
+!llvm.module.flags = !{!8, !11}
 
 !0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/unwind-dw2.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"/tmp/unwind-dw2.c", metadata !"/tmp"}
@@ -27,8 +27,9 @@ attributes #0 = { nounwind }
 !3 = metadata !{metadata !4}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/unwind-dw2.c]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !9 = metadata !{i32 2, i32 0, metadata !4, null}
 !10 = metadata !{i32 3, i32 0, metadata !4, null}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/PowerPC/vec-abi-align.ll b/test/CodeGen/PowerPC/vec-abi-align.ll
new file mode 100644
index 0000000..3239cf6
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec-abi-align.ll
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.s2 = type { i64, <4 x float> }
+
+@ve = external global <4 x float>
+@n = external global i64
+
+; Function Attrs: nounwind
+define void @test1(i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, <4 x float> inreg %vs.coerce) #0 {
+entry:
+  store <4 x float> %vs.coerce, <4 x float>* @ve, align 16
+  ret void
+
+; CHECK-LABEL: @test1
+; CHECK: stvx 2,
+; CHECK: blr
+}
+
+; Function Attrs: nounwind
+define void @test2(i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, %struct.s2* byval nocapture readonly %vs) #0 {
+entry:
+  %m = getelementptr inbounds %struct.s2* %vs, i64 0, i32 0
+  %0 = load i64* %m, align 8
+  store i64 %0, i64* @n, align 8
+  %v = getelementptr inbounds %struct.s2* %vs, i64 0, i32 1
+  %1 = load <4 x float>* %v, align 16
+  store <4 x float> %1, <4 x float>* @ve, align 16
+  ret void
+
+; CHECK-LABEL: @test2
+; CHECK: ld {{[0-9]+}}, 112(1)
+; CHECK: li [[REG16:[0-9]+]], 16
+; CHECK: addi [[REGB:[0-9]+]], 1, 112
+; CHECK: lvx 2, [[REGB]], [[REG16]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind
+define void @test3(i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, %struct.s2* byval nocapture readonly %vs) #0 {
+entry:
+  %m = getelementptr inbounds %struct.s2* %vs, i64 0, i32 0
+  %0 = load i64* %m, align 8
+  store i64 %0, i64* @n, align 8
+  %v = getelementptr inbounds %struct.s2* %vs, i64 0, i32 1
+  %1 = load <4 x float>* %v, align 16
+  store <4 x float> %1, <4 x float>* @ve, align 16
+  ret void
+
+; CHECK-LABEL: @test3
+; CHECK: ld {{[0-9]+}}, 128(1)
+; CHECK: li [[REG16:[0-9]+]], 16
+; CHECK: addi [[REGB:[0-9]+]], 1, 128
+; CHECK: lvx 2, [[REGB]], [[REG16]]
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/vec_extload.ll b/test/CodeGen/PowerPC/vec_extload.ll
index 6373a26..8d16e15 100644
--- a/test/CodeGen/PowerPC/vec_extload.ll
+++ b/test/CodeGen/PowerPC/vec_extload.ll
@@ -5,7 +5,7 @@
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
-; Altivec does not provides an sext intruction, so it expands
+; Altivec does not provides an sext instruction, so it expands
 ; a set of vector stores (stvx), bytes load/sign expand/store
 ; (lbz/stb), and a final vector load (lvx) to load the result
 ; extended vector.
diff --git a/test/CodeGen/PowerPC/zero-not-run.ll b/test/CodeGen/PowerPC/zero-not-run.ll
index 04c4277..9df0d6e 100644
--- a/test/CodeGen/PowerPC/zero-not-run.ll
+++ b/test/CodeGen/PowerPC/zero-not-run.ll
@@ -24,4 +24,4 @@ for.end731:                                       ; preds = %entry
 ; Function Attrs: nounwind
 declare i64 @safe_mod_func_uint64_t_u_u(i64, i64) #0
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll
index 5c14270..3c4fcf7 100644
--- a/test/CodeGen/R600/128bit-kernel-args.ll
+++ b/test/CodeGen/R600/128bit-kernel-args.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
 ; R600-CHECK: @v4i32_kernel_arg
 ; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll
new file mode 100644
index 0000000..7a12687
--- /dev/null
+++ b/test/CodeGen/R600/32-bit-local-address-space.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
+; the global address space(1) uses 64-bit pointers.  These tests check to make sure
+; the correct pointer size is used for the local address space.
+
+; The e{{32|64}} suffix on the instructions refers to the encoding size and not
+; the size of the operands.  The operand size is denoted in the instruction name.
+; Instructions with B32, U32, and I32 in their name take 32-bit operands, while
+; instructions with B64, U64, and I64 take 64-bit operands.
+
+; CHECK-LABEL: @local_address_load
+; CHECK: V_MOV_B32_e{{32|64}} [[PTR:v[0-9]]]
+; CHECK: DS_READ_B32 [[PTR]]
+define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+  %0 = load i32 addrspace(3)* %in
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @local_address_gep
+; CHECK: S_ADD_I32 [[SPTR:s[0-9]]]
+; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; CHECK: DS_READ_B32 [[VPTR]]
+define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
+entry:
+  %0 = getelementptr i32 addrspace(3)* %in, i32 %offset
+  %1 = load i32 addrspace(3)* %0
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @local_address_gep_const_offset
+; CHECK: S_ADD_I32 [[SPTR:s[0-9]]]
+; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; CHECK: DS_READ_B32 [[VPTR]]
+define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+  %0 = getelementptr i32 addrspace(3)* %in, i32 1
+  %1 = load i32 addrspace(3)* %0
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @null_32bit_lds_ptr:
+; CHECK: V_CMP_NE_I32
+; CHECK-NOT: V_CMP_NE_I32
+; CHECK: V_CNDMASK_B32
+define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
+  %cmp = icmp ne i32 addrspace(3)* %lds, null
+  %x = select i1 %cmp, i32 123, i32 456
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @mul_32bit_ptr:
+; CHECK: V_MUL_LO_I32
+; CHECK-NEXT: V_ADD_I32_e32
+; CHECK-NEXT: DS_READ_B32
+define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
+  %ptr = getelementptr [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
+  %val = load float addrspace(3)* %ptr
+  store float %val, float addrspace(1)* %out
+  ret void
+}
+
+@g_lds = addrspace(3) global float zeroinitializer, align 4
+
+; CHECK-LABEL: @infer_ptr_alignment_global_offset:
+; CHECK: V_MOV_B32_e32 [[REG:v[0-9]+]], 0
+; CHECK: DS_READ_B32 v{{[0-9]+}}, 0, [[REG]]
+define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
+  %val = load float addrspace(3)* @g_lds
+  store float %val, float addrspace(1)* %out
+  ret void
+}
+
+
+@ptr = addrspace(3) global i32 addrspace(3)* null
+@dst = addrspace(3) global [16384 x i32] zeroinitializer
+
+; SI-LABEL: @global_ptr:
+; SI-CHECK: DS_WRITE_B32
+define void @global_ptr() nounwind {
+  store i32 addrspace(3)* getelementptr ([16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
+  ret void
+}
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
index 34a0a87..0d6bfb1 100644
--- a/test/CodeGen/R600/64bit-kernel-args.ll
+++ b/test/CodeGen/R600/64bit-kernel-args.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
 ; SI-CHECK: @f64_kernel_arg
-; SI-CHECK-DAG: S_LOAD_DWORDX2 SGPR{{[0-9]}}_SGPR{{[0-9]}}, SGPR0_SGPR1, 9
-; SI-CHECK-DAG: S_LOAD_DWORDX2 SGPR{{[0-9]}}_SGPR{{[0-9]}}, SGPR0_SGPR1, 11
+; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 9
+; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 11
 ; SI-CHECK: BUFFER_STORE_DWORDX2
 define void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
 entry:
diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll
index 16f7f97..3d5506b 100644
--- a/test/CodeGen/R600/add.ll
+++ b/test/CodeGen/R600/add.ll
@@ -1,39 +1,55 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK: @test2
+;EG-CHECK-LABEL: @test1:
+;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK-LABEL: @test1:
+;SI-CHECK: V_ADD_I32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}}
+;SI-CHECK-NOT: [[REG]]
+;SI-CHECK: BUFFER_STORE_DWORD [[REG]],
+define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %a = load i32 addrspace(1)* %in
+  %b = load i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+;EG-CHECK-LABEL: @test2:
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @test2
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @test2:
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
-  %a = load <2 x i32> addrspace(1) * %in
-  %b = load <2 x i32> addrspace(1) * %b_ptr
+  %a = load <2 x i32> addrspace(1)* %in
+  %b = load <2 x i32> addrspace(1)* %b_ptr
   %result = add <2 x i32> %a, %b
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-;EG-CHECK: @test4
+;EG-CHECK-LABEL: @test4:
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @test4
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @test4:
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
-  %a = load <4 x i32> addrspace(1) * %in
-  %b = load <4 x i32> addrspace(1) * %b_ptr
+  %a = load <4 x i32> addrspace(1)* %in
+  %b = load <4 x i32> addrspace(1)* %b_ptr
   %result = add <4 x i32> %a, %b
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll
new file mode 100644
index 0000000..303a1cb
--- /dev/null
+++ b/test/CodeGen/R600/add_i64.ll
@@ -0,0 +1,59 @@
+; XFAIL: *
+; This will fail until i64 add is enabled
+
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI %s
+
+
+declare i32 @llvm.SI.tid() readnone
+
+; SI-LABEL: @test_i64_vreg:
+define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
+  %tid = call i32 @llvm.SI.tid() readnone
+  %a_ptr = getelementptr i64 addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr i64 addrspace(1)* %inB, i32 %tid
+  %a = load i64 addrspace(1)* %a_ptr
+  %b = load i64 addrspace(1)* %b_ptr
+  %result = add i64 %a, %b
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; Check that the SGPR add operand is correctly moved to a VGPR.
+; SI-LABEL: @sgpr_operand:
+define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
+  %foo = load i64 addrspace(1)* %in, align 8
+  %result = add i64 %foo, %a
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; Swap the arguments. Check that the SGPR -> VGPR copy works with the
+; SGPR as other operand.
+;
+; SI-LABEL: @sgpr_operand_reversed:
+define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
+  %foo = load i64 addrspace(1)* %in, align 8
+  %result = add i64 %a, %foo
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+
+; SI-LABEL: @test_v2i64_sreg:
+define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
+  %result = add <2 x i64> %a, %b
+  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: @test_v2i64_vreg:
+define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+  %tid = call i32 @llvm.SI.tid() readnone
+  %a_ptr = getelementptr <2 x i64> addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr <2 x i64> addrspace(1)* %inB, i32 %tid
+  %a = load <2 x i64> addrspace(1)* %a_ptr
+  %b = load <2 x i64> addrspace(1)* %b_ptr
+  %result = add <2 x i64> %a, %b
+  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll
new file mode 100644
index 0000000..1fc616a
--- /dev/null
+++ b/test/CodeGen/R600/address-space.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
+
+; Test that codegenprepare understands address space sizes
+
+%struct.foo = type { [3 x float], [3 x float] }
+
+; CHECK-LABEL: @do_as_ptr_calcs:
+; CHECK: S_ADD_I32 {{s[0-9]+}},
+; CHECK: S_ADD_I32 [[SREG1:s[0-9]+]],
+; CHECK: V_MOV_B32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
+; CHECK: DS_READ_B32 [[VREG1]],
+define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
+entry:
+  %x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
+  %y = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
+  br label %bb32
+
+bb32:
+  %a = load float addrspace(3)* %x, align 4
+  %b = load float addrspace(3)* %y, align 4
+  %cmp = fcmp one float %a, %b
+  br i1 %cmp, label %bb34, label %bb33
+
+bb33:
+  unreachable
+
+bb34:
+  unreachable
+}
+
+
diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll
index 44c21bd..ee9bc83 100644
--- a/test/CodeGen/R600/and.ll
+++ b/test/CodeGen/R600/and.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ;EG-CHECK: @test2
 ;EG-CHECK: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @test2
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,16 +19,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 }
 
 ;EG-CHECK: @test4
-;EG-CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @test4
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/array-ptr-calc-i64.ll b/test/CodeGen/R600/array-ptr-calc-i64.ll
new file mode 100644
index 0000000..652bbfe
--- /dev/null
+++ b/test/CodeGen/R600/array-ptr-calc-i64.ll
@@ -0,0 +1,18 @@
+; XFAIL: *
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI %s
+
+declare i32 @llvm.SI.tid() readnone
+
+
+; SI-LABEL: @test_array_ptr_calc(
+define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [16 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+  %tid = call i32 @llvm.SI.tid() readnone
+  %a_ptr = getelementptr [16 x i32] addrspace(1)* %inA, i32 1, i32 %tid
+  %b_ptr = getelementptr i32 addrspace(1)* %inB, i32 %tid
+  %a = load i32 addrspace(1)* %a_ptr
+  %b = load i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll
new file mode 100644
index 0000000..0bc48a3
--- /dev/null
+++ b/test/CodeGen/R600/atomic_load_add.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK-LABEL: @atomic_add_local
+; R600-CHECK: LDS_ADD *
+; SI-CHECK-LABEL: @atomic_add_local
+; SI-CHECK: DS_ADD_U32_RTN 0
+define void @atomic_add_local(i32 addrspace(3)* %local) {
+entry:
+   %0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+   ret void
+}
+
+; R600-CHECK-LABEL: @atomic_add_ret_local
+; R600-CHECK: LDS_ADD_RET *
+; SI-CHECK-LABEL: @atomic_add_ret_local
+; SI-CHECK: DS_ADD_U32_RTN 0
+define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+entry:
+  %0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/atomic_load_sub.ll b/test/CodeGen/R600/atomic_load_sub.ll
new file mode 100644
index 0000000..e4a6829
--- /dev/null
+++ b/test/CodeGen/R600/atomic_load_sub.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK-LABEL: @atomic_sub_local
+; R600-CHECK: LDS_SUB *
+; SI-CHECK-LABEL: @atomic_sub_local
+; SI-CHECK: DS_SUB_U32_RTN 0
+define void @atomic_sub_local(i32 addrspace(3)* %local) {
+entry:
+   %0 = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
+   ret void
+}
+
+; R600-CHECK-LABEL: @atomic_sub_ret_local
+; R600-CHECK: LDS_SUB_RET *
+; SI-CHECK-LABEL: @atomic_sub_ret_local
+; SI-CHECK: DS_SUB_U32_RTN 0
+define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+entry:
+  %0 = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
index cdccdfa..bbfe856 100644
--- a/test/CodeGen/R600/bfi_int.ll
+++ b/test/CodeGen/R600/bfi_int.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ; BFI_INT Definition pattern from ISA docs
 ; (y & x) | (z & ~x)
@@ -38,8 +38,8 @@ entry:
 ; R600-CHECK: @bfi_sha256_ma
 ; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
 ; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
-; SI-CHECK: V_XOR_B32_e64 [[DST:VGPR[0-9]+]], {{[SV]GPR[0-9]+, VGPR[0-9]+}}
-; SI-CHECK: V_BFI_B32 {{VGPR[0-9]+}}, [[DST]], {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}
+; SI-CHECK: V_XOR_B32_e64 [[DST:v[0-9]+]], {{[sv][0-9]+, v[0-9]+}}
+; SI-CHECK: V_BFI_B32 {{v[0-9]+}}, [[DST]], {{[sv][0-9]+, [sv][0-9]+}}
 
 define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
diff --git a/test/CodeGen/R600/big_alu.ll b/test/CodeGen/R600/big_alu.ll
new file mode 100644
index 0000000..6b68376
--- /dev/null
+++ b/test/CodeGen/R600/big_alu.ll
@@ -0,0 +1,1174 @@
+;RUN: llc < %s -march=r600 -mcpu=cedar
+;REQUIRES: asserts
+
+;This test ensures that R600 backend can handle ifcvt properly
+;and do not generate ALU clauses with more than 128 instructions.
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 {
+main_body:
+  %0 = extractelement <4 x float> %reg0, i32 0
+  %1 = extractelement <4 x float> %reg0, i32 1
+  %2 = extractelement <4 x float> %reg0, i32 2
+  %3 = extractelement <4 x float> %reg0, i32 3
+  %4 = extractelement <4 x float> %reg1, i32 0
+  %5 = extractelement <4 x float> %reg9, i32 0
+  %6 = extractelement <4 x float> %reg8, i32 0
+  %7 = fcmp ugt float %6, 0.000000e+00
+  %8 = select i1 %7, float %4, float %5
+  %9 = extractelement <4 x float> %reg1, i32 1
+  %10 = extractelement <4 x float> %reg9, i32 1
+  %11 = extractelement <4 x float> %reg8, i32 0
+  %12 = fcmp ugt float %11, 0.000000e+00
+  %13 = select i1 %12, float %9, float %10
+  %14 = extractelement <4 x float> %reg1, i32 2
+  %15 = extractelement <4 x float> %reg9, i32 2
+  %16 = extractelement <4 x float> %reg8, i32 0
+  %17 = fcmp ugt float %16, 0.000000e+00
+  %18 = select i1 %17, float %14, float %15
+  %19 = extractelement <4 x float> %reg1, i32 3
+  %20 = extractelement <4 x float> %reg9, i32 3
+  %21 = extractelement <4 x float> %reg8, i32 0
+  %22 = extractelement <4 x float> %reg2, i32 0
+  %23 = extractelement <4 x float> %reg2, i32 1
+  %24 = extractelement <4 x float> %reg2, i32 2
+  %25 = extractelement <4 x float> %reg2, i32 3
+  %26 = extractelement <4 x float> %reg3, i32 0
+  %27 = extractelement <4 x float> %reg3, i32 1
+  %28 = extractelement <4 x float> %reg3, i32 2
+  %29 = extractelement <4 x float> %reg3, i32 3
+  %30 = extractelement <4 x float> %reg4, i32 0
+  %31 = extractelement <4 x float> %reg4, i32 1
+  %32 = extractelement <4 x float> %reg4, i32 2
+  %33 = extractelement <4 x float> %reg4, i32 3
+  %34 = extractelement <4 x float> %reg5, i32 0
+  %35 = extractelement <4 x float> %reg5, i32 1
+  %36 = extractelement <4 x float> %reg5, i32 2
+  %37 = extractelement <4 x float> %reg5, i32 3
+  %38 = extractelement <4 x float> %reg6, i32 0
+  %39 = extractelement <4 x float> %reg6, i32 1
+  %40 = extractelement <4 x float> %reg6, i32 2
+  %41 = extractelement <4 x float> %reg6, i32 3
+  %42 = extractelement <4 x float> %reg7, i32 0
+  %43 = extractelement <4 x float> %reg7, i32 1
+  %44 = extractelement <4 x float> %reg7, i32 2
+  %45 = extractelement <4 x float> %reg7, i32 3
+  %46 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+  %47 = extractelement <4 x float> %46, i32 0
+  %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+  %49 = extractelement <4 x float> %48, i32 1
+  %50 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+  %51 = extractelement <4 x float> %50, i32 2
+  %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+  %53 = extractelement <4 x float> %52, i32 0
+  %54 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %55 = extractelement <4 x float> %54, i32 0
+  %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %57 = extractelement <4 x float> %56, i32 1
+  %58 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %59 = extractelement <4 x float> %58, i32 2
+  %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+  %61 = extractelement <4 x float> %60, i32 3
+  %62 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+  %63 = extractelement <4 x float> %62, i32 0
+  %64 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+  %65 = extractelement <4 x float> %64, i32 1
+  %66 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+  %67 = extractelement <4 x float> %66, i32 2
+  %68 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %69 = extractelement <4 x float> %68, i32 0
+  %70 = fcmp oge float %69, 3.500000e+00
+  %71 = sext i1 %70 to i32
+  %72 = bitcast i32 %71 to float
+  %73 = bitcast float %72 to i32
+  %74 = icmp ne i32 %73, 0
+  %. = select i1 %74, float 0.000000e+00, float 0.000000e+00
+  %75 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %76 = extractelement <4 x float> %75, i32 0
+  %77 = fcmp oge float %76, 2.000000e+00
+  %78 = sext i1 %77 to i32
+  %79 = bitcast i32 %78 to float
+  %80 = bitcast float %79 to i32
+  %81 = icmp ne i32 %80, 0
+  br i1 %81, label %IF137, label %ENDIF136
+
+IF137:                                            ; preds = %main_body
+  %82 = insertelement <4 x float> undef, float %30, i32 0
+  %83 = insertelement <4 x float> %82, float %31, i32 1
+  %84 = insertelement <4 x float> %83, float %32, i32 2
+  %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
+  %86 = insertelement <4 x float> undef, float %30, i32 0
+  %87 = insertelement <4 x float> %86, float %31, i32 1
+  %88 = insertelement <4 x float> %87, float %32, i32 2
+  %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
+  %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89)
+  %91 = call float @llvm.AMDGPU.rsq(float %90)
+  %92 = fmul float %30, %91
+  %93 = fmul float %31, %91
+  %94 = fmul float %32, %91
+  %95 = insertelement <4 x float> undef, float %92, i32 0
+  %96 = insertelement <4 x float> %95, float %93, i32 1
+  %97 = insertelement <4 x float> %96, float %94, i32 2
+  %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3
+  %99 = insertelement <4 x float> undef, float %37, i32 0
+  %100 = insertelement <4 x float> %99, float %38, i32 1
+  %101 = insertelement <4 x float> %100, float %39, i32 2
+  %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3
+  %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102)
+  %104 = insertelement <4 x float> undef, float %92, i32 0
+  %105 = insertelement <4 x float> %104, float %93, i32 1
+  %106 = insertelement <4 x float> %105, float %94, i32 2
+  %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3
+  %108 = insertelement <4 x float> undef, float %40, i32 0
+  %109 = insertelement <4 x float> %108, float %41, i32 1
+  %110 = insertelement <4 x float> %109, float %42, i32 2
+  %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3
+  %112 = call float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111)
+  %113 = fsub float -0.000000e+00, %92
+  %114 = fsub float -0.000000e+00, %93
+  %115 = fsub float -0.000000e+00, %94
+  %116 = insertelement <4 x float> undef, float %34, i32 0
+  %117 = insertelement <4 x float> %116, float %35, i32 1
+  %118 = insertelement <4 x float> %117, float %36, i32 2
+  %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3
+  %120 = insertelement <4 x float> undef, float %113, i32 0
+  %121 = insertelement <4 x float> %120, float %114, i32 1
+  %122 = insertelement <4 x float> %121, float %115, i32 2
+  %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3
+  %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123)
+  %125 = fdiv float 1.000000e+00, %124
+  %126 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %127 = extractelement <4 x float> %126, i32 0
+  %128 = fmul float %127, %125
+  %129 = fmul float %103, %128
+  %130 = fmul float %112, %128
+  %131 = bitcast float %. to i32
+  %132 = sitofp i32 %131 to float
+  %133 = fdiv float 1.000000e+00, %132
+  %134 = bitcast float %. to i32
+  %135 = add i32 %134, -1
+  %136 = bitcast i32 %135 to float
+  %137 = bitcast float %136 to i32
+  br label %LOOP
+
+ENDIF136:                                         ; preds = %main_body, %ENDIF154
+  %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ]
+  %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ]
+  %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ]
+  %138 = fmul float %26, 0x3F847AE140000000
+  %139 = fmul float %27, 0x3F847AE140000000
+  %140 = fmul float %28, 0x3F847AE140000000
+  %141 = insertelement <4 x float> undef, float %138, i32 0
+  %142 = insertelement <4 x float> %141, float %139, i32 1
+  %143 = insertelement <4 x float> %142, float %140, i32 2
+  %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3
+  %145 = extractelement <4 x float> %144, i32 0
+  %146 = extractelement <4 x float> %144, i32 1
+  %147 = extractelement <4 x float> %144, i32 2
+  %148 = extractelement <4 x float> %144, i32 3
+  %149 = insertelement <4 x float> undef, float %145, i32 0
+  %150 = insertelement <4 x float> %149, float %146, i32 1
+  %151 = insertelement <4 x float> %150, float %147, i32 2
+  %152 = insertelement <4 x float> %151, float %148, i32 3
+  %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3)
+  %154 = extractelement <4 x float> %153, i32 0
+  %155 = extractelement <4 x float> %153, i32 1
+  %156 = extractelement <4 x float> %153, i32 2
+  %157 = extractelement <4 x float> %153, i32 3
+  %158 = fmul float %26, 0x3F45A07B40000000
+  %159 = fmul float %27, 0x3F45A07B40000000
+  %160 = fmul float %28, 0x3F45A07B40000000
+  %161 = insertelement <4 x float> undef, float %158, i32 0
+  %162 = insertelement <4 x float> %161, float %159, i32 1
+  %163 = insertelement <4 x float> %162, float %160, i32 2
+  %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3
+  %165 = extractelement <4 x float> %164, i32 0
+  %166 = extractelement <4 x float> %164, i32 1
+  %167 = extractelement <4 x float> %164, i32 2
+  %168 = extractelement <4 x float> %164, i32 3
+  %169 = insertelement <4 x float> undef, float %165, i32 0
+  %170 = insertelement <4 x float> %169, float %166, i32 1
+  %171 = insertelement <4 x float> %170, float %167, i32 2
+  %172 = insertelement <4 x float> %171, float %168, i32 3
+  %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3)
+  %174 = extractelement <4 x float> %173, i32 0
+  %175 = extractelement <4 x float> %173, i32 1
+  %176 = extractelement <4 x float> %173, i32 2
+  %177 = extractelement <4 x float> %173, i32 3
+  %178 = fmul float %176, 3.000000e+03
+  %179 = fadd float %178, %28
+  %180 = fdiv float 1.000000e+00, %33
+  %181 = fmul float %32, %180
+  %182 = call float @fabs(float %181)
+  %183 = fmul float %174, 0x3FD99999A0000000
+  %184 = fadd float %183, 0x3FAEB851E0000000
+  %185 = fmul float %175, 0x3FE3333340000000
+  %186 = fadd float %185, %184
+  %187 = fmul float %176, 2.000000e+00
+  %188 = fadd float %187, %186
+  %189 = fmul float %177, 4.000000e+00
+  %190 = fadd float %189, %188
+  %191 = fmul float %154, 0x3FB99999A0000000
+  %192 = fadd float %191, %190
+  %193 = fmul float %155, 0x3FD99999A0000000
+  %194 = fadd float %193, %192
+  %195 = fmul float %156, 0x3FE99999A0000000
+  %196 = fadd float %195, %194
+  %197 = fmul float %157, 0x4000CCCCC0000000
+  %198 = fadd float %197, %196
+  %199 = fmul float 0xBE5EFB4CC0000000, %182
+  %200 = fmul float %199, %182
+  %201 = call float @llvm.AMDIL.exp.(float %200)
+  %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000)
+  %203 = fadd float %202, 0x3FF4CCCCC0000000
+  %204 = fmul float %203, 0x3FE1C71C80000000
+  %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00)
+  %206 = fadd float %202, 0x3FF4CCCCC0000000
+  %207 = fmul float %206, 0x3FE1C71C80000000
+  %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00)
+  %209 = fadd float %202, 2.000000e+00
+  %210 = fmul float %209, 0x3FD611A7A0000000
+  %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00)
+  %212 = fmul float 2.000000e+00, %205
+  %213 = fsub float -0.000000e+00, %212
+  %214 = fadd float 3.000000e+00, %213
+  %215 = fmul float %205, %214
+  %216 = fmul float %205, %215
+  %217 = fmul float 2.000000e+00, %208
+  %218 = fsub float -0.000000e+00, %217
+  %219 = fadd float 3.000000e+00, %218
+  %220 = fmul float %208, %219
+  %221 = fmul float %208, %220
+  %222 = fmul float 2.000000e+00, %211
+  %223 = fsub float -0.000000e+00, %222
+  %224 = fadd float 3.000000e+00, %223
+  %225 = fmul float %211, %224
+  %226 = fmul float %211, %225
+  %227 = fmul float %26, 0x3F368B5CC0000000
+  %228 = fmul float %27, 0x3F368B5CC0000000
+  %229 = insertelement <4 x float> undef, float %227, i32 0
+  %230 = insertelement <4 x float> %229, float %228, i32 1
+  %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2
+  %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3
+  %233 = extractelement <4 x float> %232, i32 0
+  %234 = extractelement <4 x float> %232, i32 1
+  %235 = insertelement <4 x float> undef, float %233, i32 0
+  %236 = insertelement <4 x float> %235, float %234, i32 1
+  %237 = insertelement <4 x float> %236, float undef, i32 2
+  %238 = insertelement <4 x float> %237, float undef, i32 3
+  %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2)
+  %240 = extractelement <4 x float> %239, i32 0
+  %241 = insertelement <4 x float> undef, float %240, i32 0
+  %242 = insertelement <4 x float> %241, float %228, i32 1
+  %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2
+  %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3
+  %245 = extractelement <4 x float> %244, i32 0
+  %246 = insertelement <4 x float> undef, float %245, i32 0
+  %247 = insertelement <4 x float> %246, float undef, i32 1
+  %248 = insertelement <4 x float> %247, float undef, i32 2
+  %249 = insertelement <4 x float> %248, float undef, i32 3
+  %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1)
+  %251 = extractelement <4 x float> %250, i32 0
+  %252 = extractelement <4 x float> %250, i32 1
+  %253 = extractelement <4 x float> %250, i32 2
+  %254 = extractelement <4 x float> %250, i32 3
+  %255 = fmul float %251, %216
+  %256 = fmul float %252, %221
+  %257 = fmul float %253, %226
+  %258 = fmul float %254, 0.000000e+00
+  %259 = fadd float %202, 0x3FF4CCCCC0000000
+  %260 = fmul float %259, 0x3FE1C71C80000000
+  %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00)
+  %262 = fadd float %202, 0x3FF4CCCCC0000000
+  %263 = fmul float %262, 0x3FE1C71C80000000
+  %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00)
+  %265 = fadd float %202, 2.000000e+00
+  %266 = fmul float %265, 0x3FD611A7A0000000
+  %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00)
+  %268 = fmul float 2.000000e+00, %261
+  %269 = fsub float -0.000000e+00, %268
+  %270 = fadd float 3.000000e+00, %269
+  %271 = fmul float %261, %270
+  %272 = fmul float %261, %271
+  %273 = fmul float 2.000000e+00, %264
+  %274 = fsub float -0.000000e+00, %273
+  %275 = fadd float 3.000000e+00, %274
+  %276 = fmul float %264, %275
+  %277 = fmul float %264, %276
+  %278 = fmul float 2.000000e+00, %267
+  %279 = fsub float -0.000000e+00, %278
+  %280 = fadd float 3.000000e+00, %279
+  %281 = fmul float %267, %280
+  %282 = fmul float %267, %281
+  %283 = fmul float %26, 0x3F22DFD6A0000000
+  %284 = fmul float %27, 0x3F22DFD6A0000000
+  %285 = insertelement <4 x float> undef, float %283, i32 0
+  %286 = insertelement <4 x float> %285, float %284, i32 1
+  %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2
+  %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3
+  %289 = extractelement <4 x float> %288, i32 0
+  %290 = extractelement <4 x float> %288, i32 1
+  %291 = insertelement <4 x float> undef, float %289, i32 0
+  %292 = insertelement <4 x float> %291, float %290, i32 1
+  %293 = insertelement <4 x float> %292, float undef, i32 2
+  %294 = insertelement <4 x float> %293, float undef, i32 3
+  %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2)
+  %296 = extractelement <4 x float> %295, i32 0
+  %297 = extractelement <4 x float> %295, i32 1
+  %298 = extractelement <4 x float> %295, i32 2
+  %299 = extractelement <4 x float> %295, i32 3
+  %300 = fmul float %296, %272
+  %301 = fmul float %297, %277
+  %302 = fmul float %298, %282
+  %303 = fmul float %299, 0.000000e+00
+  %304 = fmul float %temp68.1, %37
+  %305 = fmul float %temp68.1, %38
+  %306 = fmul float %temp68.1, %39
+  %307 = fmul float %temp69.0, %40
+  %308 = fadd float %307, %304
+  %309 = fmul float %temp69.0, %41
+  %310 = fadd float %309, %305
+  %311 = fmul float %temp69.0, %42
+  %312 = fadd float %311, %306
+  %313 = fmul float %temp70.0, %34
+  %314 = fadd float %313, %308
+  %315 = fmul float %temp70.0, %35
+  %316 = fadd float %315, %310
+  %317 = fmul float %temp70.0, %36
+  %318 = fadd float %317, %312
+  %319 = insertelement <4 x float> undef, float %314, i32 0
+  %320 = insertelement <4 x float> %319, float %316, i32 1
+  %321 = insertelement <4 x float> %320, float %318, i32 2
+  %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3
+  %323 = insertelement <4 x float> undef, float %314, i32 0
+  %324 = insertelement <4 x float> %323, float %316, i32 1
+  %325 = insertelement <4 x float> %324, float %318, i32 2
+  %326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3
+  %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326)
+  %328 = call float @llvm.AMDGPU.rsq(float %327)
+  %329 = fmul float %314, %328
+  %330 = fmul float %316, %328
+  %331 = fmul float %318, %328
+  %332 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %333 = extractelement <4 x float> %332, i32 0
+  %334 = fsub float -0.000000e+00, %333
+  %335 = fadd float 1.000000e+00, %334
+  %336 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %337 = extractelement <4 x float> %336, i32 0
+  %338 = fsub float -0.000000e+00, %337
+  %339 = fadd float 1.000000e+00, %338
+  %340 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %341 = extractelement <4 x float> %340, i32 0
+  %342 = fsub float -0.000000e+00, %341
+  %343 = fadd float 1.000000e+00, %342
+  %344 = fsub float -0.000000e+00, %335
+  %345 = fadd float %202, %344
+  %346 = fsub float -0.000000e+00, %339
+  %347 = fadd float %202, %346
+  %348 = fadd float %347, 0xBFE3333340000000
+  %349 = fsub float -0.000000e+00, %202
+  %350 = fsub float -0.000000e+00, %343
+  %351 = fadd float %349, %350
+  %352 = insertelement <4 x float> undef, float %43, i32 0
+  %353 = insertelement <4 x float> %352, float %44, i32 1
+  %354 = insertelement <4 x float> %353, float %45, i32 2
+  %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3
+  %356 = insertelement <4 x float> undef, float %43, i32 0
+  %357 = insertelement <4 x float> %356, float %44, i32 1
+  %358 = insertelement <4 x float> %357, float %45, i32 2
+  %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3
+  %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359)
+  %361 = call float @llvm.AMDGPU.rsq(float %360)
+  %362 = fmul float %45, %361
+  %363 = call float @fabs(float %362)
+  %364 = fmul float %176, 0x3FECCCCCC0000000
+  %365 = fadd float %364, %363
+  %366 = fadd float %365, 0xBFEFAE1480000000
+  %367 = fmul float %366, 0xC023FFFFC0000000
+  %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00)
+  %369 = fsub float -0.000000e+00, %335
+  %370 = fadd float %202, %369
+  %371 = fadd float %370, 0x3FBEB851E0000000
+  %372 = fsub float -0.000000e+00, %339
+  %373 = fadd float %202, %372
+  %374 = fadd float %373, 0xBFE0A3D700000000
+  %375 = fsub float -0.000000e+00, %202
+  %376 = fsub float -0.000000e+00, %343
+  %377 = fadd float %375, %376
+  %378 = insertelement <4 x float> undef, float %43, i32 0
+  %379 = insertelement <4 x float> %378, float %44, i32 1
+  %380 = insertelement <4 x float> %379, float %45, i32 2
+  %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3
+  %382 = insertelement <4 x float> undef, float %43, i32 0
+  %383 = insertelement <4 x float> %382, float %44, i32 1
+  %384 = insertelement <4 x float> %383, float %45, i32 2
+  %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3
+  %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385)
+  %387 = call float @llvm.AMDGPU.rsq(float %386)
+  %388 = fmul float %45, %387
+  %389 = call float @fabs(float %388)
+  %390 = fmul float %176, 0x3FF51EB860000000
+  %391 = fadd float %390, %389
+  %392 = fadd float %391, 0xBFEFAE1480000000
+  %393 = fmul float %392, 0xC0490001A0000000
+  %394 = call float @llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00)
+  %395 = fmul float 2.000000e+00, %368
+  %396 = fsub float -0.000000e+00, %395
+  %397 = fadd float 3.000000e+00, %396
+  %398 = fmul float %368, %397
+  %399 = fmul float %368, %398
+  %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345)
+  %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348)
+  %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351)
+  %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00)
+  %404 = fmul float 2.000000e+00, %394
+  %405 = fsub float -0.000000e+00, %404
+  %406 = fadd float 3.000000e+00, %405
+  %407 = fmul float %394, %406
+  %408 = fmul float %394, %407
+  %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371)
+  %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374)
+  %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377)
+  %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000)
+  %413 = fcmp oge float 2.200000e+03, %179
+  %414 = sext i1 %413 to i32
+  %415 = bitcast i32 %414 to float
+  %416 = bitcast float %415 to i32
+  %417 = icmp ne i32 %416, 0
+  br i1 %417, label %IF161, label %ENDIF160
+
+LOOP:                                             ; preds = %ENDIF139, %IF137
+  %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ]
+  %temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ]
+  %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ]
+  %418 = bitcast float %temp96.0 to i32
+  %419 = icmp sge i32 %418, %137
+  %420 = sext i1 %419 to i32
+  %421 = bitcast i32 %420 to float
+  %422 = bitcast float %421 to i32
+  %423 = icmp ne i32 %422, 0
+  br i1 %423, label %IF140, label %ENDIF139
+
+IF140:                                            ; preds = %LOOP
+  %424 = fmul float %133, 5.000000e-01
+  %425 = fmul float %129, %temp92.0
+  %426 = fadd float %425, %22
+  %427 = fmul float %130, %temp92.0
+  %428 = fadd float %427, %23
+  %429 = insertelement <4 x float> undef, float %426, i32 0
+  %430 = insertelement <4 x float> %429, float %428, i32 1
+  %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2
+  %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3
+  %433 = extractelement <4 x float> %432, i32 0
+  %434 = extractelement <4 x float> %432, i32 1
+  %435 = insertelement <4 x float> undef, float %433, i32 0
+  %436 = insertelement <4 x float> %435, float %434, i32 1
+  %437 = insertelement <4 x float> %436, float undef, i32 2
+  %438 = insertelement <4 x float> %437, float undef, i32 3
+  %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2)
+  %440 = extractelement <4 x float> %439, i32 3
+  %441 = fcmp oge float %temp92.0, %440
+  %442 = sext i1 %441 to i32
+  %443 = bitcast i32 %442 to float
+  %444 = bitcast float %443 to i32
+  %445 = icmp ne i32 %444, 0
+  br i1 %445, label %IF146, label %ENDIF145
+
+ENDIF139:                                         ; preds = %LOOP
+  %446 = fadd float %temp88.0, %133
+  %447 = fmul float %129, %446
+  %448 = fadd float %447, %22
+  %449 = fmul float %130, %446
+  %450 = fadd float %449, %23
+  %451 = insertelement <4 x float> undef, float %448, i32 0
+  %452 = insertelement <4 x float> %451, float %450, i32 1
+  %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2
+  %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3
+  %455 = extractelement <4 x float> %454, i32 0
+  %456 = extractelement <4 x float> %454, i32 1
+  %457 = insertelement <4 x float> undef, float %455, i32 0
+  %458 = insertelement <4 x float> %457, float %456, i32 1
+  %459 = insertelement <4 x float> %458, float undef, i32 2
+  %460 = insertelement <4 x float> %459, float undef, i32 3
+  %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2)
+  %462 = extractelement <4 x float> %461, i32 3
+  %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0
+  %464 = sext i1 %463 to i32
+  %465 = bitcast i32 %464 to float
+  %466 = fcmp oge float %446, %462
+  %467 = sext i1 %466 to i32
+  %468 = bitcast i32 %467 to float
+  %469 = bitcast float %465 to i32
+  %470 = bitcast float %468 to i32
+  %471 = and i32 %469, %470
+  %472 = bitcast i32 %471 to float
+  %473 = bitcast float %472 to i32
+  %474 = icmp ne i32 %473, 0
+  %.temp92.0 = select i1 %474, float %446, float %temp92.0
+  %475 = bitcast float %temp96.0 to i32
+  %476 = add i32 %475, 1
+  %477 = bitcast i32 %476 to float
+  br label %LOOP
+
+IF146:                                            ; preds = %IF140
+  %478 = fmul float 2.000000e+00, %424
+  %479 = fsub float -0.000000e+00, %478
+  %480 = fadd float %temp92.0, %479
+  br label %ENDIF145
+
+ENDIF145:                                         ; preds = %IF140, %IF146
+  %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ]
+  %481 = fadd float %temp88.1, %424
+  %482 = fmul float %424, 5.000000e-01
+  %483 = fmul float %129, %481
+  %484 = fadd float %483, %22
+  %485 = fmul float %130, %481
+  %486 = fadd float %485, %23
+  %487 = insertelement <4 x float> undef, float %484, i32 0
+  %488 = insertelement <4 x float> %487, float %486, i32 1
+  %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2
+  %490 = insertelement <4 x float> %489, float %440, i32 3
+  %491 = extractelement <4 x float> %490, i32 0
+  %492 = extractelement <4 x float> %490, i32 1
+  %493 = insertelement <4 x float> undef, float %491, i32 0
+  %494 = insertelement <4 x float> %493, float %492, i32 1
+  %495 = insertelement <4 x float> %494, float undef, i32 2
+  %496 = insertelement <4 x float> %495, float undef, i32 3
+  %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2)
+  %498 = extractelement <4 x float> %497, i32 3
+  %499 = fcmp oge float %481, %498
+  %500 = sext i1 %499 to i32
+  %501 = bitcast i32 %500 to float
+  %502 = bitcast float %501 to i32
+  %503 = icmp ne i32 %502, 0
+  br i1 %503, label %IF149, label %ENDIF148
+
+IF149:                                            ; preds = %ENDIF145
+  %504 = fmul float 2.000000e+00, %482
+  %505 = fsub float -0.000000e+00, %504
+  %506 = fadd float %481, %505
+  br label %ENDIF148
+
+ENDIF148:                                         ; preds = %ENDIF145, %IF149
+  %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ]
+  %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ]
+  %507 = fadd float %temp88.2, %482
+  %508 = fmul float %482, 5.000000e-01
+  %509 = fmul float %129, %507
+  %510 = fadd float %509, %22
+  %511 = fmul float %130, %507
+  %512 = fadd float %511, %23
+  %513 = insertelement <4 x float> undef, float %510, i32 0
+  %514 = insertelement <4 x float> %513, float %512, i32 1
+  %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2
+  %516 = insertelement <4 x float> %515, float %498, i32 3
+  %517 = extractelement <4 x float> %516, i32 0
+  %518 = extractelement <4 x float> %516, i32 1
+  %519 = insertelement <4 x float> undef, float %517, i32 0
+  %520 = insertelement <4 x float> %519, float %518, i32 1
+  %521 = insertelement <4 x float> %520, float undef, i32 2
+  %522 = insertelement <4 x float> %521, float undef, i32 3
+  %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2)
+  %524 = extractelement <4 x float> %523, i32 3
+  %525 = fcmp oge float %507, %524
+  %526 = sext i1 %525 to i32
+  %527 = bitcast i32 %526 to float
+  %528 = bitcast float %527 to i32
+  %529 = icmp ne i32 %528, 0
+  br i1 %529, label %IF152, label %ENDIF151
+
+IF152:                                            ; preds = %ENDIF148
+  %530 = fmul float 2.000000e+00, %508
+  %531 = fsub float -0.000000e+00, %530
+  %532 = fadd float %507, %531
+  br label %ENDIF151
+
+ENDIF151:                                         ; preds = %ENDIF148, %IF152
+  %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ]
+  %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ]
+  %533 = fadd float %temp88.3, %508
+  %534 = fmul float %508, 5.000000e-01
+  %535 = fmul float %129, %533
+  %536 = fadd float %535, %22
+  %537 = fmul float %130, %533
+  %538 = fadd float %537, %23
+  %539 = insertelement <4 x float> undef, float %536, i32 0
+  %540 = insertelement <4 x float> %539, float %538, i32 1
+  %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2
+  %542 = insertelement <4 x float> %541, float %524, i32 3
+  %543 = extractelement <4 x float> %542, i32 0
+  %544 = extractelement <4 x float> %542, i32 1
+  %545 = insertelement <4 x float> undef, float %543, i32 0
+  %546 = insertelement <4 x float> %545, float %544, i32 1
+  %547 = insertelement <4 x float> %546, float undef, i32 2
+  %548 = insertelement <4 x float> %547, float undef, i32 3
+  %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2)
+  %550 = extractelement <4 x float> %549, i32 3
+  %551 = fcmp oge float %533, %550
+  %552 = sext i1 %551 to i32
+  %553 = bitcast i32 %552 to float
+  %554 = bitcast float %553 to i32
+  %555 = icmp ne i32 %554, 0
+  br i1 %555, label %IF155, label %ENDIF154
+
+IF155:                                            ; preds = %ENDIF151
+  %556 = fmul float 2.000000e+00, %534
+  %557 = fsub float -0.000000e+00, %556
+  %558 = fadd float %533, %557
+  br label %ENDIF154
+
+ENDIF154:                                         ; preds = %ENDIF151, %IF155
+  %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ]
+  %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ]
+  %559 = fadd float %temp88.4, %534
+  %560 = fmul float %129, %559
+  %561 = fadd float %560, %22
+  %562 = fmul float %130, %559
+  %563 = fadd float %562, %23
+  %564 = insertelement <4 x float> undef, float %561, i32 0
+  %565 = insertelement <4 x float> %564, float %563, i32 1
+  %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2
+  %567 = insertelement <4 x float> %566, float %550, i32 3
+  %568 = extractelement <4 x float> %567, i32 0
+  %569 = extractelement <4 x float> %567, i32 1
+  %570 = insertelement <4 x float> undef, float %568, i32 0
+  %571 = insertelement <4 x float> %570, float %569, i32 1
+  %572 = insertelement <4 x float> %571, float undef, i32 2
+  %573 = insertelement <4 x float> %572, float undef, i32 3
+  %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2)
+  %575 = extractelement <4 x float> %574, i32 3
+  %576 = fcmp oge float %559, %575
+  %577 = sext i1 %576 to i32
+  %578 = bitcast i32 %577 to float
+  %579 = bitcast float %578 to i32
+  %580 = icmp ne i32 %579, 0
+  %.temp92.4 = select i1 %580, float %559, float %temp92.4
+  %581 = fmul float %129, %.temp92.4
+  %582 = fadd float %581, %22
+  %583 = fmul float %130, %.temp92.4
+  %584 = fadd float %583, %23
+  %585 = insertelement <4 x float> undef, float %582, i32 0
+  %586 = insertelement <4 x float> %585, float %584, i32 1
+  %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2
+  %588 = insertelement <4 x float> %587, float %575, i32 3
+  %589 = extractelement <4 x float> %588, i32 0
+  %590 = extractelement <4 x float> %588, i32 1
+  %591 = insertelement <4 x float> undef, float %589, i32 0
+  %592 = insertelement <4 x float> %591, float %590, i32 1
+  %593 = insertelement <4 x float> %592, float undef, i32 2
+  %594 = insertelement <4 x float> %593, float undef, i32 3
+  %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2)
+  %596 = extractelement <4 x float> %595, i32 0
+  %597 = extractelement <4 x float> %595, i32 1
+  %598 = extractelement <4 x float> %595, i32 2
+  %599 = fmul float %596, 2.000000e+00
+  %600 = fadd float %599, -1.000000e+00
+  %601 = fmul float %597, 2.000000e+00
+  %602 = fadd float %601, -1.000000e+00
+  %603 = fmul float %598, 2.000000e+00
+  %604 = fadd float %603, -1.000000e+00
+  br label %ENDIF136
+
+IF161:                                            ; preds = %ENDIF136
+  %605 = fmul float %202, 0x3FB99999A0000000
+  %606 = fcmp uge float 0x3FE4CCCCC0000000, %605
+  %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605
+  %608 = fcmp uge float %607, 5.000000e-01
+  %609 = select i1 %608, float 5.000000e-01, float %607
+  %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300)
+  %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301)
+  %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302)
+  %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303)
+  %614 = insertelement <4 x float> undef, float %329, i32 0
+  %615 = insertelement <4 x float> %614, float %330, i32 1
+  %616 = insertelement <4 x float> %615, float %331, i32 2
+  %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3
+  %618 = insertelement <4 x float> undef, float %63, i32 0
+  %619 = insertelement <4 x float> %618, float %65, i32 1
+  %620 = insertelement <4 x float> %619, float %67, i32 2
+  %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3
+  %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621)
+  %623 = fcmp uge float 0x3FE6666660000000, %622
+  %624 = select i1 %623, float 0x3FE6666660000000, float %622
+  %625 = fmul float %8, %624
+  %626 = fmul float %13, %624
+  %627 = fmul float %18, %624
+  %628 = insertelement <4 x float> undef, float %34, i32 0
+  %629 = insertelement <4 x float> %628, float %35, i32 1
+  %630 = insertelement <4 x float> %629, float %36, i32 2
+  %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3
+  %632 = insertelement <4 x float> undef, float %63, i32 0
+  %633 = insertelement <4 x float> %632, float %65, i32 1
+  %634 = insertelement <4 x float> %633, float %67, i32 2
+  %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3
+  %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635)
+  %637 = fcmp uge float 0x3FECCCCCC0000000, %636
+  %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636
+  %639 = fmul float %625, %638
+  %640 = fmul float %626, %638
+  %641 = fmul float %627, %638
+  br label %ENDIF160
+
+ENDIF160:                                         ; preds = %ENDIF136, %IF161
+  %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ]
+  %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ]
+  %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ]
+  %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ]
+  %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ]
+  %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
+  %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
+  %642 = fcmp olt float 2.200000e+03, %179
+  %643 = sext i1 %642 to i32
+  %644 = bitcast i32 %643 to float
+  %645 = fcmp olt float %179, 2.300000e+03
+  %646 = sext i1 %645 to i32
+  %647 = bitcast i32 %646 to float
+  %648 = bitcast float %644 to i32
+  %649 = bitcast float %647 to i32
+  %650 = and i32 %648, %649
+  %651 = bitcast i32 %650 to float
+  %652 = bitcast float %651 to i32
+  %653 = icmp ne i32 %652, 0
+  br i1 %653, label %IF164, label %ENDIF163
+
+IF164:                                            ; preds = %ENDIF160
+  %654 = fmul float %202, 5.000000e-01
+  %655 = fcmp uge float 0x3FE4CCCCC0000000, %654
+  %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654
+  %657 = fcmp uge float %656, 0x3FD6666660000000
+  %658 = select i1 %657, float 0x3FD6666660000000, float %656
+  %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300)
+  %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301)
+  %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302)
+  %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303)
+  %663 = insertelement <4 x float> undef, float %329, i32 0
+  %664 = insertelement <4 x float> %663, float %330, i32 1
+  %665 = insertelement <4 x float> %664, float %331, i32 2
+  %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3
+  %667 = insertelement <4 x float> undef, float %63, i32 0
+  %668 = insertelement <4 x float> %667, float %65, i32 1
+  %669 = insertelement <4 x float> %668, float %67, i32 2
+  %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3
+  %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670)
+  %672 = fcmp uge float 0x3FE6666660000000, %671
+  %673 = select i1 %672, float 0x3FE6666660000000, float %671
+  %674 = fmul float %8, %673
+  %675 = fmul float %13, %673
+  %676 = fmul float %18, %673
+  %677 = insertelement <4 x float> undef, float %34, i32 0
+  %678 = insertelement <4 x float> %677, float %35, i32 1
+  %679 = insertelement <4 x float> %678, float %36, i32 2
+  %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3
+  %681 = insertelement <4 x float> undef, float %63, i32 0
+  %682 = insertelement <4 x float> %681, float %65, i32 1
+  %683 = insertelement <4 x float> %682, float %67, i32 2
+  %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3
+  %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684)
+  %686 = fcmp uge float 0x3FECCCCCC0000000, %685
+  %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685
+  %688 = fmul float %674, %687
+  %689 = fmul float %675, %687
+  %690 = fmul float %676, %687
+  br label %ENDIF163
+
+ENDIF163:                                         ; preds = %ENDIF160, %IF164
+  %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ]
+  %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ]
+  %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ]
+  %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ]
+  %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ]
+  %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ]
+  %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ]
+  %691 = fcmp oge float %179, 2.300000e+03
+  %692 = sext i1 %691 to i32
+  %693 = bitcast i32 %692 to float
+  %694 = fcmp olt float %179, 2.480000e+03
+  %695 = sext i1 %694 to i32
+  %696 = bitcast i32 %695 to float
+  %697 = bitcast float %693 to i32
+  %698 = bitcast float %696 to i32
+  %699 = and i32 %697, %698
+  %700 = bitcast i32 %699 to float
+  %701 = bitcast float %700 to i32
+  %702 = icmp ne i32 %701, 0
+  br i1 %702, label %IF167, label %ENDIF166
+
+IF167:                                            ; preds = %ENDIF163
+  %703 = fmul float %202, 5.000000e-01
+  %704 = fcmp uge float 0x3FE4CCCCC0000000, %703
+  %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703
+  %706 = fcmp uge float %705, 0x3FD3333340000000
+  %707 = select i1 %706, float 0x3FD3333340000000, float %705
+  %708 = call float @llvm.AMDGPU.lrp(float %707, float %409, float %300)
+  %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301)
+  %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302)
+  %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303)
+  %712 = insertelement <4 x float> undef, float %329, i32 0
+  %713 = insertelement <4 x float> %712, float %330, i32 1
+  %714 = insertelement <4 x float> %713, float %331, i32 2
+  %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3
+  %716 = insertelement <4 x float> undef, float %63, i32 0
+  %717 = insertelement <4 x float> %716, float %65, i32 1
+  %718 = insertelement <4 x float> %717, float %67, i32 2
+  %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3
+  %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719)
+  %721 = fcmp uge float 0x3FEB333340000000, %720
+  %722 = select i1 %721, float 0x3FEB333340000000, float %720
+  %723 = fmul float %8, %722
+  %724 = fmul float %13, %722
+  %725 = fmul float %18, %722
+  %726 = insertelement <4 x float> undef, float %34, i32 0
+  %727 = insertelement <4 x float> %726, float %35, i32 1
+  %728 = insertelement <4 x float> %727, float %36, i32 2
+  %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3
+  %730 = insertelement <4 x float> undef, float %63, i32 0
+  %731 = insertelement <4 x float> %730, float %65, i32 1
+  %732 = insertelement <4 x float> %731, float %67, i32 2
+  %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3
+  %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733)
+  %735 = fcmp uge float 0x3FECCCCCC0000000, %734
+  %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734
+  %737 = fmul float %723, %736
+  %738 = fmul float %724, %736
+  %739 = fmul float %725, %736
+  br label %ENDIF166
+
+ENDIF166:                                         ; preds = %ENDIF163, %IF167
+  %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ]
+  %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ]
+  %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ]
+  %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ]
+  %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ]
+  %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ]
+  %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ]
+  %740 = fcmp oge float %179, 2.480000e+03
+  %741 = sext i1 %740 to i32
+  %742 = bitcast i32 %741 to float
+  %743 = fcmp olt float %179, 2.530000e+03
+  %744 = sext i1 %743 to i32
+  %745 = bitcast i32 %744 to float
+  %746 = bitcast float %742 to i32
+  %747 = bitcast float %745 to i32
+  %748 = and i32 %746, %747
+  %749 = bitcast i32 %748 to float
+  %750 = bitcast float %749 to i32
+  %751 = icmp ne i32 %750, 0
+  br i1 %751, label %IF170, label %ENDIF169
+
+IF170:                                            ; preds = %ENDIF166
+  %752 = fmul float %202, 5.000000e-01
+  %753 = fcmp uge float 0x3FE4CCCCC0000000, %752
+  %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752
+  %755 = fcmp uge float %754, 0x3FC99999A0000000
+  %756 = select i1 %755, float 0x3FC99999A0000000, float %754
+  %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300)
+  %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301)
+  %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302)
+  %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303)
+  %761 = insertelement <4 x float> undef, float %329, i32 0
+  %762 = insertelement <4 x float> %761, float %330, i32 1
+  %763 = insertelement <4 x float> %762, float %331, i32 2
+  %764 = insertelement <4 x float> %763, float 0.000000e+00, i32 3
+  %765 = insertelement <4 x float> undef, float %63, i32 0
+  %766 = insertelement <4 x float> %765, float %65, i32 1
+  %767 = insertelement <4 x float> %766, float %67, i32 2
+  %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3
+  %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768)
+  %770 = fcmp uge float 0x3FEB333340000000, %769
+  %771 = select i1 %770, float 0x3FEB333340000000, float %769
+  %772 = fmul float %8, %771
+  %773 = fmul float %13, %771
+  %774 = fmul float %18, %771
+  %775 = insertelement <4 x float> undef, float %34, i32 0
+  %776 = insertelement <4 x float> %775, float %35, i32 1
+  %777 = insertelement <4 x float> %776, float %36, i32 2
+  %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3
+  %779 = insertelement <4 x float> undef, float %63, i32 0
+  %780 = insertelement <4 x float> %779, float %65, i32 1
+  %781 = insertelement <4 x float> %780, float %67, i32 2
+  %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3
+  %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782)
+  %784 = fcmp uge float 0x3FECCCCCC0000000, %783
+  %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783
+  %786 = fmul float %772, %785
+  %787 = fmul float %773, %785
+  %788 = fmul float %774, %785
+  br label %ENDIF169
+
+ENDIF169:                                         ; preds = %ENDIF166, %IF170
+  %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ]
+  %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ]
+  %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ]
+  %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ]
+  %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ]
+  %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ]
+  %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ]
+  %789 = fcmp oge float %179, 2.530000e+03
+  %790 = sext i1 %789 to i32
+  %791 = bitcast i32 %790 to float
+  %792 = fcmp olt float %179, 2.670000e+03
+  %793 = sext i1 %792 to i32
+  %794 = bitcast i32 %793 to float
+  %795 = bitcast float %791 to i32
+  %796 = bitcast float %794 to i32
+  %797 = and i32 %795, %796
+  %798 = bitcast i32 %797 to float
+  %799 = bitcast float %798 to i32
+  %800 = icmp ne i32 %799, 0
+  br i1 %800, label %IF173, label %ENDIF172
+
+IF173:                                            ; preds = %ENDIF169
+  %801 = fmul float %202, 5.000000e-01
+  %802 = fcmp uge float 0x3FE4CCCCC0000000, %801
+  %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801
+  %804 = fcmp uge float %803, 0x3FB99999A0000000
+  %805 = select i1 %804, float 0x3FB99999A0000000, float %803
+  %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300)
+  %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301)
+  %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302)
+  %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303)
+  %810 = insertelement <4 x float> undef, float %329, i32 0
+  %811 = insertelement <4 x float> %810, float %330, i32 1
+  %812 = insertelement <4 x float> %811, float %331, i32 2
+  %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3
+  %814 = insertelement <4 x float> undef, float %63, i32 0
+  %815 = insertelement <4 x float> %814, float %65, i32 1
+  %816 = insertelement <4 x float> %815, float %67, i32 2
+  %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3
+  %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817)
+  %819 = fcmp uge float 0x3FEB333340000000, %818
+  %820 = select i1 %819, float 0x3FEB333340000000, float %818
+  %821 = fmul float %8, %820
+  %822 = fmul float %13, %820
+  %823 = fmul float %18, %820
+  %824 = insertelement <4 x float> undef, float %34, i32 0
+  %825 = insertelement <4 x float> %824, float %35, i32 1
+  %826 = insertelement <4 x float> %825, float %36, i32 2
+  %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3
+  %828 = insertelement <4 x float> undef, float %63, i32 0
+  %829 = insertelement <4 x float> %828, float %65, i32 1
+  %830 = insertelement <4 x float> %829, float %67, i32 2
+  %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3
+  %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831)
+  %833 = fcmp uge float 0x3FECCCCCC0000000, %832
+  %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832
+  %835 = fmul float %821, %834
+  %836 = fmul float %822, %834
+  %837 = fmul float %823, %834
+  br label %ENDIF172
+
+ENDIF172:                                         ; preds = %ENDIF169, %IF173
+  %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ]
+  %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ]
+  %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ]
+  %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ]
+  %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ]
+  %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ]
+  %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ]
+  %838 = fcmp oge float %179, 2.670000e+03
+  %839 = sext i1 %838 to i32
+  %840 = bitcast i32 %839 to float
+  %841 = bitcast float %840 to i32
+  %842 = icmp ne i32 %841, 0
+  br i1 %842, label %IF176, label %ENDIF175
+
+IF176:                                            ; preds = %ENDIF172
+  %843 = fmul float %202, 0x3FB99999A0000000
+  %844 = fcmp uge float 0.000000e+00, %843
+  %845 = select i1 %844, float 0.000000e+00, float %843
+  %846 = fcmp uge float %845, 0x3FD99999A0000000
+  %847 = select i1 %846, float 0x3FD99999A0000000, float %845
+  %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300)
+  %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301)
+  %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302)
+  %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303)
+  %852 = insertelement <4 x float> undef, float %329, i32 0
+  %853 = insertelement <4 x float> %852, float %330, i32 1
+  %854 = insertelement <4 x float> %853, float %331, i32 2
+  %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3
+  %856 = insertelement <4 x float> undef, float %63, i32 0
+  %857 = insertelement <4 x float> %856, float %65, i32 1
+  %858 = insertelement <4 x float> %857, float %67, i32 2
+  %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3
+  %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859)
+  %861 = fcmp uge float 0x3FEB333340000000, %860
+  %862 = select i1 %861, float 0x3FEB333340000000, float %860
+  %863 = fmul float %8, %862
+  %864 = fmul float %13, %862
+  %865 = fmul float %18, %862
+  %866 = insertelement <4 x float> undef, float %34, i32 0
+  %867 = insertelement <4 x float> %866, float %35, i32 1
+  %868 = insertelement <4 x float> %867, float %36, i32 2
+  %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3
+  %870 = insertelement <4 x float> undef, float %63, i32 0
+  %871 = insertelement <4 x float> %870, float %65, i32 1
+  %872 = insertelement <4 x float> %871, float %67, i32 2
+  %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3
+  %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873)
+  %875 = fcmp uge float 0x3FECCCCCC0000000, %874
+  %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874
+  %877 = fmul float %863, %876
+  %878 = fmul float %864, %876
+  %879 = fmul float %865, %876
+  br label %ENDIF175
+
+ENDIF175:                                         ; preds = %ENDIF172, %IF176
+  %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ]
+  %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ]
+  %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ]
+  %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ]
+  %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ]
+  %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ]
+  %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ]
+  %880 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %881 = extractelement <4 x float> %880, i32 0
+  %882 = fcmp olt float %881, %179
+  %883 = sext i1 %882 to i32
+  %884 = bitcast i32 %883 to float
+  %885 = bitcast float %884 to i32
+  %886 = icmp ne i32 %885, 0
+  br i1 %886, label %IF179, label %ENDIF178
+
+IF179:                                            ; preds = %ENDIF175
+  %887 = fadd float %202, 1.000000e+00
+  %888 = fadd float %202, 1.000000e+00
+  %889 = fadd float %202, 1.000000e+00
+  %890 = insertelement <4 x float> undef, float %43, i32 0
+  %891 = insertelement <4 x float> %890, float %44, i32 1
+  %892 = insertelement <4 x float> %891, float %45, i32 2
+  %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3
+  %894 = insertelement <4 x float> undef, float %43, i32 0
+  %895 = insertelement <4 x float> %894, float %44, i32 1
+  %896 = insertelement <4 x float> %895, float %45, i32 2
+  %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3
+  %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897)
+  %899 = call float @llvm.AMDGPU.rsq(float %898)
+  %900 = fmul float %45, %899
+  %901 = call float @fabs(float %900)
+  %902 = fmul float %176, 0x3FECCCCCC0000000
+  %903 = fadd float %902, %901
+  %904 = fadd float %903, 0xBFEFAE1480000000
+  %905 = fmul float %904, 0xC043FFFE20000000
+  %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00)
+  %907 = fmul float 2.000000e+00, %906
+  %908 = fsub float -0.000000e+00, %907
+  %909 = fadd float 3.000000e+00, %908
+  %910 = fmul float %906, %909
+  %911 = fmul float %906, %910
+  %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887)
+  %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888)
+  %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889)
+  %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00)
+  %916 = fmul float %202, 5.000000e-01
+  %917 = fcmp uge float 0x3FE4CCCCC0000000, %916
+  %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916
+  %919 = fcmp uge float %918, 0x3FE3333340000000
+  %920 = select i1 %919, float 0x3FE3333340000000, float %918
+  %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5)
+  %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5)
+  %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5)
+  %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5)
+  %925 = insertelement <4 x float> undef, float %329, i32 0
+  %926 = insertelement <4 x float> %925, float %330, i32 1
+  %927 = insertelement <4 x float> %926, float %331, i32 2
+  %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3
+  %929 = insertelement <4 x float> undef, float %63, i32 0
+  %930 = insertelement <4 x float> %929, float %65, i32 1
+  %931 = insertelement <4 x float> %930, float %67, i32 2
+  %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3
+  %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932)
+  %934 = fcmp uge float 0x3FE99999A0000000, %933
+  %935 = select i1 %934, float 0x3FE99999A0000000, float %933
+  %936 = fmul float %8, %935
+  %937 = fmul float %13, %935
+  %938 = fmul float %18, %935
+  %939 = insertelement <4 x float> undef, float %34, i32 0
+  %940 = insertelement <4 x float> %939, float %35, i32 1
+  %941 = insertelement <4 x float> %940, float %36, i32 2
+  %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3
+  %943 = insertelement <4 x float> undef, float %63, i32 0
+  %944 = insertelement <4 x float> %943, float %65, i32 1
+  %945 = insertelement <4 x float> %944, float %67, i32 2
+  %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3
+  %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946)
+  %948 = fcmp uge float 0x3FECCCCCC0000000, %947
+  %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947
+  %950 = fmul float %936, %949
+  %951 = fmul float %937, %949
+  %952 = fmul float %938, %949
+  br label %ENDIF178
+
+ENDIF178:                                         ; preds = %ENDIF175, %IF179
+  %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ]
+  %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ]
+  %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ]
+  %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ]
+  %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ]
+  %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ]
+  %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ]
+  %953 = fmul float %55, %temp92.12
+  %954 = fmul float %57, %temp93.6
+  %955 = fmul float %59, %temp94.6
+  %956 = fmul float %61, 0.000000e+00
+  %957 = fmul float %temp84.6, %953
+  %958 = fmul float %temp85.6, %954
+  %959 = fmul float %temp86.6, %955
+  %960 = fmul float %temp87.6, %956
+  %961 = fmul float %2, -2.000000e+00
+  %962 = fadd float %961, 1.000000e+00
+  %963 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
+  %964 = extractelement <4 x float> %963, i32 2
+  %965 = fsub float -0.000000e+00, %964
+  %966 = fadd float %962, %965
+  %967 = fdiv float 1.000000e+00, %966
+  %968 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24)
+  %969 = extractelement <4 x float> %968, i32 2
+  %970 = fmul float %969, %967
+  %971 = fsub float -0.000000e+00, %53
+  %972 = fmul float %971, %53
+  %973 = fmul float %972, %970
+  %974 = fmul float %973, %970
+  %975 = fmul float %974, 0x3FF7154760000000
+  %976 = call float @llvm.AMDIL.exp.(float %975)
+  %977 = fcmp oeq float %53, 1.000000e+00
+  %978 = sext i1 %977 to i32
+  %979 = bitcast i32 %978 to float
+  %980 = bitcast float %979 to i32
+  %981 = icmp ne i32 %980, 0
+  %.184 = select i1 %981, float 1.000000e+00, float %976
+  %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47)
+  %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49)
+  %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51)
+  %985 = insertelement <4 x float> undef, float %982, i32 0
+  %986 = insertelement <4 x float> %985, float %983, i32 1
+  %987 = insertelement <4 x float> %986, float %984, i32 2
+  %988 = insertelement <4 x float> %987, float %960, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0)
+  ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq(float) #1
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+
+; Function Attrs: readonly
+declare float @fabs(float) #2
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.exp.(float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.lrp(float, float, float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.clamp.(float, float, float) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { readnone }
+attributes #2 = { readonly }
diff --git a/test/CodeGen/R600/bitcast.ll b/test/CodeGen/R600/bitcast.ll
new file mode 100644
index 0000000..bccc416
--- /dev/null
+++ b/test/CodeGen/R600/bitcast.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; This test just checks that the compiler doesn't crash.
+; CHECK-LABEL: @v32i8_to_v8i32
+; CHECK: S_ENDPGM
+
+define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
+entry:
+  %1 = load <32 x i8> addrspace(2)* %0
+  %2 = bitcast <32 x i8> %1 to <8 x i32>
+  %3 = extractelement <8 x i32> %2, i32 1
+  %4 = icmp ne i32 %3, 0
+  %5 = select i1 %4, float 0.0, float 1.0
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
+  ret void
+}
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+
diff --git a/test/CodeGen/R600/build_vector.ll b/test/CodeGen/R600/build_vector.ll
index 9b738a2..8179de1 100644
--- a/test/CodeGen/R600/build_vector.ll
+++ b/test/CodeGen/R600/build_vector.ll
@@ -1,14 +1,14 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
 ; R600-CHECK: @build_vector2
 ; R600-CHECK: MOV
 ; R600-CHECK: MOV
 ; R600-CHECK-NOT: MOV
 ; SI-CHECK: @build_vector2
-; SI-CHECK-DAG: V_MOV_B32_e32 [[X:VGPR[0-9]]], 5
-; SI-CHECK-DAG: V_MOV_B32_e32 [[Y:VGPR[0-9]]], 6
-; SI-CHECK: BUFFER_STORE_DWORDX2 [[X]]_[[Y]]
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[X:[0-9]]], 5
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[Y:[0-9]]], 6
+; SI-CHECK: BUFFER_STORE_DWORDX2 v{{\[}}[[X]]:[[Y]]{{\]}}
 define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
 entry:
   store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
@@ -22,11 +22,11 @@ entry:
 ; R600-CHECK: MOV
 ; R600-CHECK-NOT: MOV
 ; SI-CHECK: @build_vector4
-; SI-CHECK-DAG: V_MOV_B32_e32 [[X:VGPR[0-9]]], 5
-; SI-CHECK-DAG: V_MOV_B32_e32 [[Y:VGPR[0-9]]], 6
-; SI-CHECK-DAG: V_MOV_B32_e32 [[Z:VGPR[0-9]]], 7
-; SI-CHECK-DAG: V_MOV_B32_e32 [[W:VGPR[0-9]]], 8
-; SI-CHECK: BUFFER_STORE_DWORDX4 [[X]]_[[Y]]_[[Z]]_[[W]]
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[X:[0-9]]], 5
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[Y:[0-9]]], 6
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[Z:[0-9]]], 7
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[W:[0-9]]], 8
+; SI-CHECK: BUFFER_STORE_DWORDX4 v{{\[}}[[X]]:[[W]]{{\]}}
 define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
 entry:
   store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
diff --git a/test/CodeGen/R600/combine_vloads.ll b/test/CodeGen/R600/combine_vloads.ll
new file mode 100644
index 0000000..f8ec712
--- /dev/null
+++ b/test/CodeGen/R600/combine_vloads.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+;
+; kernel void combine_vloads(global char8* src, global char8* result) {
+;   for (int i = 0; i < 1024; ++i)
+;     result[i] = src[0] + src[1] + src[2] + src[3];
+; }
+;
+
+
+; 128-bit loads instead of many 8-bit
+; EG-LABEL: @combine_vloads:
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
+entry:
+  br label %for.body
+
+for.exit:                                         ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ]
+  %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)*
+  %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)*
+  %vecload2 = load <8 x i32> addrspace(1)* %0, align 32
+  %1 = bitcast <8 x i32> %vecload2 to <32 x i8>
+  %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tmp9 = add nsw <8 x i8> %tmp5, %tmp8
+  %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %tmp13 = add nsw <8 x i8> %tmp9, %tmp12
+  %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tmp17 = add nsw <8 x i8> %tmp13, %tmp16
+  %scevgep = getelementptr <8 x i8> addrspace(1)* %result, i32 %i.01
+  %2 = bitcast <8 x i8> %tmp17 to <2 x i32>
+  %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)*
+  store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8
+  %tmp19 = add nsw i32 %i.01, 1
+  %exitcond = icmp eq i32 %tmp19, 1024
+  br i1 %exitcond, label %for.exit, label %for.body
+}
diff --git a/test/CodeGen/R600/complex-folding.ll b/test/CodeGen/R600/complex-folding.ll
new file mode 100644
index 0000000..99f0d99
--- /dev/null
+++ b/test/CodeGen/R600/complex-folding.ll
@@ -0,0 +1,19 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @main
+; CHECK-NOT: MOV
+define void @main(<4 x float> inreg %reg0) #0 {
+entry:
+  %0 = extractelement <4 x float> %reg0, i32 0
+  %1 = call float @fabs(float %0)
+  %2 = fptoui float %1 to i32
+  %3 = bitcast i32 %2 to float
+  %4 = insertelement <4 x float> undef, float %3, i32 0
+  call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0)
+  ret void
+}
+
+declare float @fabs(float ) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+\ No newline at end of file
diff --git a/test/CodeGen/R600/elf.ll b/test/CodeGen/R600/elf.ll
index f460f13..9385150 100644
--- a/test/CodeGen/R600/elf.ll
+++ b/test/CodeGen/R600/elf.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI -o - | FileCheck --check-prefix=CONFIG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG-CHECK %s
 
 ; ELF-CHECK: Format: ELF32
 ; ELF-CHECK: Name: .AMDGPU.config
diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
new file mode 100644
index 0000000..aa660b3
--- /dev/null
+++ b/test/CodeGen/R600/extload.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+; EG-LABEL: @anyext_load_i8:
+; EG: AND_INT
+; EG-NEXT: 255
+define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
+  %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
+  %load = load i32 addrspace(1)* %cast, align 1
+  %x = bitcast i32 %load to <4 x i8>
+  %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)*
+  store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1
+  ret void
+}
+
+; EG-LABEL: @anyext_load_i16:
+; EG: AND_INT
+; EG: LSHL
+; EG: 65535
+define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
+  %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
+  %load = load i32 addrspace(1)* %cast, align 1
+  %x = bitcast i32 %load to <2 x i16>
+  %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)*
+  store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1
+  ret void
+}
+
+; EG-LABEL: @anyext_load_lds_i8:
+; EG: AND_INT
+; EG-NEXT: 255
+define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
+  %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
+  %load = load i32 addrspace(3)* %cast, align 1
+  %x = bitcast i32 %load to <4 x i8>
+  %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)*
+  store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1
+  ret void
+}
+
+; EG-LABEL: @anyext_load_lds_i16:
+; EG: AND_INT
+; EG: LSHL
+; EG: 65535
+define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
+  %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
+  %load = load i32 addrspace(3)* %cast, align 1
+  %x = bitcast i32 %load to <2 x i16>
+  %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)*
+  store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1
+  ret void
+}
diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
index 78ffd57..a5f5df9 100644
--- a/test/CodeGen/R600/fabs.ll
+++ b/test/CodeGen/R600/fabs.ll
@@ -1,15 +1,15 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
 ; DAGCombiner will transform:
 ; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
 ; unless isFabsFree returns true
 
-; R600-CHECK: @fabs_free
+; R600-CHECK-LABEL: @fabs_free
 ; R600-CHECK-NOT: AND
 ; R600-CHECK: |PV.{{[XYZW]}}|
-; SI-CHECK: @fabs_free
-; SI-CHECK: V_ADD_F32_e64 VGPR{{[0-9]}}, SGPR{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK-LABEL: @fabs_free
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
 
 define void @fabs_free(float addrspace(1)* %out, i32 %in) {
 entry:
@@ -19,4 +19,36 @@ entry:
   ret void
 }
 
+; R600-CHECK-LABEL: @fabs_v2
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; SI-CHECK-LABEL: @fabs_v2
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+define void @fabs_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+  %0 = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @fabs_v4
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; SI-CHECK-LABEL: @fabs_v4
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+define void @fabs_v4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+  %0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  ret void
+}
+
 declare float @fabs(float ) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float> ) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> ) readnone
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 97dbe44..f467bb7 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -1,23 +1,23 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; CHECK: @fadd_f32
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @fadd_f32() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
-   %r1 = call float @llvm.R600.load.input(i32 1)
-   %r2 = fadd float %r0, %r1
-   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+; R600-CHECK: @fadd_f32
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI-CHECK: @fadd_f32
+; SI-CHECK: V_ADD_F32
+define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+   %0 = fadd float %a, %b
+   store float %0, float addrspace(1)* %out
    ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
-; CHECK: @fadd_v2f32
-; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; R600-CHECK: @fadd_v2f32
+; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; SI-CHECK: @fadd_v2f32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
 define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fadd <2 x float> %a, %b
@@ -25,12 +25,16 @@ entry:
   ret void
 }
 
-; CHECK: @fadd_v4f32
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
+; R600-CHECK: @fadd_v4f32
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI-CHECK: @fadd_v4f32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
 define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fadd64.ll b/test/CodeGen/R600/fadd64.ll
index 130302f..48cd3cf 100644
--- a/test/CodeGen/R600/fadd64.ll
+++ b/test/CodeGen/R600/fadd64.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
 ; CHECK: @fadd_f64
-; CHECK: V_ADD_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_ADD_F64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
 
 define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll
index 7373a21..1d4e323 100644
--- a/test/CodeGen/R600/fcmp-cnd.ll
+++ b/test/CodeGen/R600/fcmp-cnd.ll
@@ -2,7 +2,7 @@
 
 ;Not checking arguments 2 and 3 to CNDE, because they may change between
 ;registers and literal.x depending on what the optimizer does.
-;CHECK: CNDE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: CNDE  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll
index dc3a779..c76a758 100644
--- a/test/CodeGen/R600/fcmp.ll
+++ b/test/CodeGen/R600/fcmp.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ; CHECK: @fcmp_sext
-; CHECK: SETE_DX10 * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: SETE_DX10  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll
index 8f2513b..bcc7a8c 100644
--- a/test/CodeGen/R600/fcmp64.ll
+++ b/test/CodeGen/R600/fcmp64.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
 ; CHECK: @flt_f64
-; CHECK: V_CMP_LT_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_LT_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 
 define void @flt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
@@ -14,7 +14,7 @@ define void @flt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 }
 
 ; CHECK: @fle_f64
-; CHECK: V_CMP_LE_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_LE_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 
 define void @fle_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
@@ -27,7 +27,7 @@ define void @fle_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 }
 
 ; CHECK: @fgt_f64
-; CHECK: V_CMP_GT_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_GT_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 
 define void @fgt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
@@ -40,7 +40,7 @@ define void @fgt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 }
 
 ; CHECK: @fge_f64
-; CHECK: V_CMP_GE_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_GE_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 
 define void @fge_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
@@ -53,7 +53,7 @@ define void @fge_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 }
 
 ; CHECK: @fne_f64
-; CHECK: V_CMP_NEQ_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_NEQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 
 define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
@@ -66,7 +66,7 @@ define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 }
 
 ; CHECK: @feq_f64
-; CHECK: V_CMP_EQ_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_EQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
 
 define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/fconst64.ll b/test/CodeGen/R600/fconst64.ll
index 2402a9c..5c5ee7e 100644
--- a/test/CodeGen/R600/fconst64.ll
+++ b/test/CodeGen/R600/fconst64.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
 ; CHECK: @fconst_f64
-; CHECK: V_MOV_B32_e32 {{VGPR[0-9]+}}, 0.000000e+00
-; CHECK-NEXT: V_MOV_B32_e32 {{VGPR[0-9]+}}, 2.312500e+00
+; CHECK: V_MOV_B32_e32 {{v[0-9]+}}, 0.000000e+00
+; CHECK-NEXT: V_MOV_B32_e32 {{v[0-9]+}}, 2.312500e+00
 
 define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
    %r1 = load double addrspace(1)* %in
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 6798eac..3d21524 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,14 +1,20 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
 ; These tests check that fdiv is expanded correctly and also test that the
 ; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
 ; instruction groups.
 
-; CHECK: @fdiv_v2f32
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
-; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
-; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+; R600-CHECK: @fdiv_v2f32
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+; SI-CHECK: @fdiv_v2f32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fdiv <2 x float> %a, %b
@@ -16,16 +22,24 @@ entry:
   ret void
 }
 
-; CHECK: @fdiv_v4f32
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-
+; R600-CHECK: @fdiv_v4f32
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; SI-CHECK: @fdiv_v4f32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fdiv64.ll b/test/CodeGen/R600/fdiv64.ll
index 76c5ca3..79b5c8b 100644
--- a/test/CodeGen/R600/fdiv64.ll
+++ b/test/CodeGen/R600/fdiv64.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
 ; CHECK: @fdiv_f64
-; CHECK: V_RCP_F64_e32 {{VGPR[0-9]+_VGPR[0-9]+}}
-; CHECK: V_MUL_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_RCP_F64_e32 {{v\[[0-9]+:[0-9]+\]}}
+; CHECK: V_MUL_F64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 
 define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll
index 877d69a..67e86c4 100644
--- a/test/CodeGen/R600/floor.ll
+++ b/test/CodeGen/R600/floor.ll
@@ -2,15 +2,15 @@
 
 ;CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
+define void @test(<4 x float> inreg %reg0) #0 {
+   %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = call float @floor(float %r0)
-   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+   %vec = insertelement <4 x float> undef, float %r1, i32 0
+   call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
    ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
 declare float @floor(float) readonly
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+\ No newline at end of file
diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll
new file mode 100644
index 0000000..51e9d29
--- /dev/null
+++ b/test/CodeGen/R600/fma.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; CHECK: @fma_f32
+; CHECK: V_FMA_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+                     float addrspace(1)* %in2, float addrspace(1)* %in3) {
+   %r0 = load float addrspace(1)* %in1
+   %r1 = load float addrspace(1)* %in2
+   %r2 = load float addrspace(1)* %in3
+   %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
+   store float %r3, float addrspace(1)* %out
+   ret void
+}
+
+declare float @llvm.fma.f32(float, float, float)
+
+; CHECK: @fma_f64
+; CHECK: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+
+define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                     double addrspace(1)* %in2, double addrspace(1)* %in3) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = load double addrspace(1)* %in3
+   %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2)
+   store double %r3, double addrspace(1)* %out
+   ret void
+}
+
+declare double @llvm.fma.f64(double, double, double)
diff --git a/test/CodeGen/R600/fmad.ll b/test/CodeGen/R600/fmad.ll
index 75e65d8..935e351 100644
--- a/test/CodeGen/R600/fmad.ll
+++ b/test/CodeGen/R600/fmad.ll
@@ -2,18 +2,18 @@
 
 ;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
-   %r1 = call float @llvm.R600.load.input(i32 1)
-   %r2 = call float @llvm.R600.load.input(i32 2)
+define void @test(<4 x float> inreg %reg0) #0 {
+   %r0 = extractelement <4 x float> %reg0, i32 0
+   %r1 = extractelement <4 x float> %reg0, i32 1
+   %r2 = extractelement <4 x float> %reg0, i32 2
    %r3 = fmul float %r0, %r1
-	%r4 = fadd float %r3, %r2
-   call void @llvm.AMDGPU.store.output(float %r4, i32 0)
+   %r4 = fadd float %r3, %r2
+   %vec = insertelement <4 x float> undef, float %r4, i32 0
+   call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
    ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
 declare float @fabs(float ) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+\ No newline at end of file
diff --git a/test/CodeGen/R600/fmax.ll b/test/CodeGen/R600/fmax.ll
index 8b704e5..d7127f4 100644
--- a/test/CodeGen/R600/fmax.ll
+++ b/test/CodeGen/R600/fmax.ll
@@ -2,15 +2,16 @@
 
 ;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
-   %r1 = call float @llvm.R600.load.input(i32 1)
-   %r2 = fcmp uge float %r0, %r1
+define void @test(<4 x float> inreg %reg0) #0 {
+   %r0 = extractelement <4 x float> %reg0, i32 0
+   %r1 = extractelement <4 x float> %reg0, i32 1
+   %r2 = fcmp oge float %r0, %r1
    %r3 = select i1 %r2, float %r0, float %r1
-   call void @llvm.AMDGPU.store.output(float %r3, i32 0)
+   %vec = insertelement <4 x float> undef, float %r3, i32 0
+   call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
    ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
-declare void @llvm.AMDGPU.store.output(float, i32)
+attributes #0 = { "ShaderType"="0" }
+\ No newline at end of file
diff --git a/test/CodeGen/R600/fmin.ll b/test/CodeGen/R600/fmin.ll
index 5e34b7c..defa8c0 100644
--- a/test/CodeGen/R600/fmin.ll
+++ b/test/CodeGen/R600/fmin.ll
@@ -2,15 +2,16 @@
 
 ;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
-   %r1 = call float @llvm.R600.load.input(i32 1)
+define void @test(<4 x float> inreg %reg0) #0 {
+   %r0 = extractelement <4 x float> %reg0, i32 0
+   %r1 = extractelement <4 x float> %reg0, i32 1
    %r2 = fcmp uge float %r0, %r1
    %r3 = select i1 %r2, float %r1, float %r0
-   call void @llvm.AMDGPU.store.output(float %r3, i32 0)
+   %vec = insertelement <4 x float> undef, float %r3, i32 0
+   call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
    ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
-declare void @llvm.AMDGPU.store.output(float, i32)
+attributes #0 = { "ShaderType"="0" }
+\ No newline at end of file
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index 6ef3a11..2a7825f 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll
@@ -1,23 +1,27 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; CHECK: @fmul_f32
-; CHECK: MUL_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @fmul_f32() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
-   %r1 = call float @llvm.R600.load.input(i32 1)
-   %r2 = fmul float %r0, %r1
-   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
-   ret void
+; R600-CHECK: @fmul_f32
+; R600-CHECK: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI-CHECK: @fmul_f32
+; SI-CHECK: V_MUL_F32
+define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fmul float %a, %b
+  store float %0, float addrspace(1)* %out
+  ret void
 }
 
 declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
-; CHECK: @fmul_v2f32
-; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
-; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
+; R600-CHECK: @fmul_v2f32
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
+; SI-CHECK: @fmul_v2f32
+; SI-CHECK: V_MUL_F32
+; SI-CHECK: V_MUL_F32
 define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fmul <2 x float> %a, %b
@@ -25,12 +29,16 @@ entry:
   ret void
 }
 
-; CHECK: @fmul_v4f32
-; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
+; R600-CHECK: @fmul_v4f32
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI-CHECK: @fmul_v4f32
+; SI-CHECK: V_MUL_F32
+; SI-CHECK: V_MUL_F32
+; SI-CHECK: V_MUL_F32
+; SI-CHECK: V_MUL_F32
 define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fmul.v4f32.ll b/test/CodeGen/R600/fmul.v4f32.ll
deleted file mode 100644
index 74a58f7..0000000
--- a/test/CodeGen/R600/fmul.v4f32.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
-  %a = load <4 x float> addrspace(1) * %in
-  %b = load <4 x float> addrspace(1) * %b_ptr
-  %result = fmul <4 x float> %a, %b
-  store <4 x float> %result, <4 x float> addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/fmul64.ll b/test/CodeGen/R600/fmul64.ll
index 8a57d4a..7c7bf04 100644
--- a/test/CodeGen/R600/fmul64.ll
+++ b/test/CodeGen/R600/fmul64.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
 ; CHECK: @fmul_f64
-; CHECK: V_MUL_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_MUL_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 
 define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/fmuladd.ll b/test/CodeGen/R600/fmuladd.ll
new file mode 100644
index 0000000..48944f6
--- /dev/null
+++ b/test/CodeGen/R600/fmuladd.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; CHECK: @fmuladd_f32
+; CHECK: V_MAD_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+                         float addrspace(1)* %in2, float addrspace(1)* %in3) {
+   %r0 = load float addrspace(1)* %in1
+   %r1 = load float addrspace(1)* %in2
+   %r2 = load float addrspace(1)* %in3
+   %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
+   store float %r3, float addrspace(1)* %out
+   ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
+; CHECK: @fmuladd_f64
+; CHECK: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+
+define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+                         double addrspace(1)* %in2, double addrspace(1)* %in3) {
+   %r0 = load double addrspace(1)* %in1
+   %r1 = load double addrspace(1)* %in2
+   %r2 = load double addrspace(1)* %in3
+   %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2)
+   store double %r3, double addrspace(1)* %out
+   ret void
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll
index 799db0c..9446aa8 100644
--- a/test/CodeGen/R600/fneg.ll
+++ b/test/CodeGen/R600/fneg.ll
@@ -1,8 +1,23 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; CHECK: @fneg_v2
-; CHECK: -PV
-; CHECK: -PV
+; R600-CHECK-LABEL: @fneg
+; R600-CHECK: -PV
+; SI-CHECK-LABEL: @fneg
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+define void @fneg(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = fsub float -0.000000e+00, %in
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @fneg_v2
+; R600-CHECK: -PV
+; R600-CHECK: -PV
+; SI-CHECK-LABEL: @fneg_v2
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
 define void @fneg_v2(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
 entry:
   %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
@@ -10,11 +25,16 @@ entry:
   ret void
 }
 
-; CHECK: @fneg_v4
-; CHECK: -PV
-; CHECK: -PV
-; CHECK: -PV
-; CHECK: -PV
+; R600-CHECK-LABEL: @fneg_v4
+; R600-CHECK: -PV
+; R600-CHECK: -T
+; R600-CHECK: -PV
+; R600-CHECK: -PV
+; SI-CHECK-LABEL: @fneg_v4
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
 define void @fneg_v4(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
 entry:
   %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
@@ -26,9 +46,12 @@ entry:
 ; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000))
 ; unless the target returns true for isNegFree()
 
-; CHECK-NOT: XOR
-; CHECK: -KC0[2].Z
-
+; R600-CHECK-LABEL: @fneg_free
+; R600-CHECK-NOT: XOR
+; R600-CHECK: -KC0[2].Z
+; SI-CHECK-LABEL: @fneg_free
+; XXX: We could use V_ADD_F32_e64 with the negate bit here instead.
+; SI-CHECK: V_SUB_F32_e64 v{{[0-9]}}, 0.000000e+00, s{{[0-9]}}, 0, 0, 0, 0
 define void @fneg_free(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = bitcast i32 %in to float
diff --git a/test/CodeGen/R600/fp64_to_sint.ll b/test/CodeGen/R600/fp64_to_sint.ll
new file mode 100644
index 0000000..185e21c
--- /dev/null
+++ b/test/CodeGen/R600/fp64_to_sint.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+
+; CHECK: @fp64_to_sint
+; CHECK: V_CVT_I32_F64_e32
+define void @fp64_to_sint(i32 addrspace(1)* %out, double %in) {
+  %result = fptosi double %in to i32
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index 6471270..8302b4f 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,9 +1,9 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
 ; R600-CHECK: @fp_to_sint_v2i32
-; R600-CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; SI-CHECK: @fp_to_sint_v2i32
 ; SI-CHECK: V_CVT_I32_F32_e32
 ; SI-CHECK: V_CVT_I32_F32_e32
@@ -14,10 +14,10 @@ define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
 }
 
 ; R600-CHECK: @fp_to_sint_v4i32
-; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; SI-CHECK: @fp_to_sint_v4i32
 ; SI-CHECK: V_CVT_I32_F32_e32
 ; SI-CHECK: V_CVT_I32_F32_e32
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index 2a365f9..77db43b 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,8 +1,12 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; CHECK: @fp_to_uint_v2i32
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: @fp_to_uint_v2i32
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI-CHECK: @fp_to_uint_v2i32
+; SI-CHECK: V_CVT_U32_F32_e32
+; SI-CHECK: V_CVT_U32_F32_e32
 
 define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   %result = fptoui <2 x float> %in to <2 x i32>
@@ -10,11 +14,16 @@ define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   ret void
 }
 
-; CHECK: @fp_to_uint_v4i32
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: @fp_to_uint_v4i32
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI-CHECK: @fp_to_uint_v4i32
+; SI-CHECK: V_CVT_U32_F32_e32
+; SI-CHECK: V_CVT_U32_F32_e32
+; SI-CHECK: V_CVT_U32_F32_e32
+; SI-CHECK: V_CVT_U32_F32_e32
 
 define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fpext.ll b/test/CodeGen/R600/fpext.ll
new file mode 100644
index 0000000..143ee79
--- /dev/null
+++ b/test/CodeGen/R600/fpext.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+
+; CHECK: @fpext
+; CHECK: V_CVT_F64_F32_e32
+define void @fpext(double addrspace(1)* %out, float %in) {
+  %result = fpext float %in to double
+  store double %result, double addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fptrunc.ll b/test/CodeGen/R600/fptrunc.ll
new file mode 100644
index 0000000..20a8c00
--- /dev/null
+++ b/test/CodeGen/R600/fptrunc.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+
+; CHECK: @fptrunc
+; CHECK: V_CVT_F32_F64_e32
+define void @fptrunc(float addrspace(1)* %out, double %in) {
+  %result = fptrunc double %in to float
+  store float %result, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll
index 2613805..ae50b17 100644
--- a/test/CodeGen/R600/fsqrt.ll
+++ b/test/CodeGen/R600/fsqrt.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
 ; CHECK: @fsqrt_f32
-; CHECK: V_SQRT_F32_e32 {{VGPR[0-9]+, VGPR[0-9]+}}
+; CHECK: V_SQRT_F32_e32 {{v[0-9]+, v[0-9]+}}
 
 define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
    %r0 = load float addrspace(1)* %in
@@ -11,7 +11,7 @@ define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
 }
 
 ; CHECK: @fsqrt_f64
-; CHECK: V_SQRT_F64_e32 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_SQRT_F64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 
 define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
    %r0 = load double addrspace(1)* %in
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index 0fc5860..4f74efb 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll
@@ -1,23 +1,27 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; CHECK: @fsub_f32
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-
-define void @fsub_f32() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
-   %r1 = call float @llvm.R600.load.input(i32 1)
-   %r2 = fsub float %r0, %r1
-   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
-   ret void
+; R600-CHECK: @fsub_f32
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
+; SI-CHECK: @fsub_f32
+; SI-CHECK: V_SUB_F32
+define void @fsub_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fsub float %a, %b
+  store float %0, float addrspace(1)* %out
+  ret void
 }
 
 declare float @llvm.R600.load.input(i32) readnone
 
 declare void @llvm.AMDGPU.store.output(float, i32)
 
-; CHECK: @fsub_v2f32
-; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
-; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
+; R600-CHECK: @fsub_v2f32
+; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
+; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
+; SI-CHECK: @fsub_v2f32
+; SI-CHECK: V_SUB_F32
+; SI-CHECK: V_SUB_F32
 define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fsub <2 x float> %a, %b
@@ -25,11 +29,16 @@ entry:
   ret void
 }
 
-; CHECK: @fsub_v4f32
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600-CHECK: @fsub_v4f32
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; SI-CHECK: @fsub_v4f32
+; SI-CHECK: V_SUB_F32
+; SI-CHECK: V_SUB_F32
+; SI-CHECK: V_SUB_F32
+; SI-CHECK: V_SUB_F32
 define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fsub64.ll b/test/CodeGen/R600/fsub64.ll
index fa59dcc..1445a20 100644
--- a/test/CodeGen/R600/fsub64.ll
+++ b/test/CodeGen/R600/fsub64.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
 ; CHECK: @fsub_f64
-; CHECK: V_ADD_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}, 0, 0, 0, 0, 2
+; CHECK: V_ADD_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}, 0, 0, 0, 0, 2
 
 define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll
new file mode 100644
index 0000000..4ea21dd
--- /dev/null
+++ b/test/CodeGen/R600/gep-address-space.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
+
+define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
+; CHECK-LABEL @use_gep_address_space:
+; CHECK: S_ADD_I32
+  %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16
+  store i32 99, i32 addrspace(3)* %p
+  ret void
+}
+
+define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
+; CHECK-LABEL: @gep_as_vector_v4:
+; CHECK: S_ADD_I32
+; CHECK: S_ADD_I32
+; CHECK: S_ADD_I32
+; CHECK: S_ADD_I32
+  %p = getelementptr <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
+  %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
+  %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
+  %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2
+  %p3 = extractelement <4 x i32 addrspace(3)*> %p, i32 3
+  store i32 99, i32 addrspace(3)* %p0
+  store i32 99, i32 addrspace(3)* %p1
+  store i32 99, i32 addrspace(3)* %p2
+  store i32 99, i32 addrspace(3)* %p3
+  ret void
+}
+
+define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
+; CHECK-LABEL: @gep_as_vector_v2:
+; CHECK: S_ADD_I32
+; CHECK: S_ADD_I32
+  %p = getelementptr <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
+  %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
+  %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
+  store i32 99, i32 addrspace(3)* %p0
+  store i32 99, i32 addrspace(3)* %p1
+  ret void
+}
+
diff --git a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
index e3005fe..71705a6 100644
--- a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
+++ b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
@@ -3,7 +3,7 @@
 ;Test that a select with reversed True/False values is correctly lowered
 ;to a SETNE_INT.  There should only be one SETNE_INT instruction.
 
-;CHECK: SETNE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;CHECK-NOT: SETNE_INT
 
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
index 979efb0..b047315 100644
--- a/test/CodeGen/R600/imm.ll
+++ b/test/CodeGen/R600/imm.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
 ; Use a 64-bit value with lo bits that can be represented as an inline constant
 ; CHECK: @i64_imm_inline_lo
-; CHECK: S_MOV_B32 [[LO:SGPR[0-9]+]], 5
-; CHECK: V_MOV_B32_e32 [[LO_VGPR:VGPR[0-9]+]], [[LO]]
-; CHECK: BUFFER_STORE_DWORDX2 [[LO_VGPR]]_
+; CHECK: S_MOV_B32 [[LO:s[0-9]+]], 5
+; CHECK: V_MOV_B32_e32 v[[LO_VGPR:[0-9]+]], [[LO]]
+; CHECK: BUFFER_STORE_DWORDX2 v{{\[}}[[LO_VGPR]]:
 define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
 entry:
   store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005
@@ -13,9 +13,9 @@ entry:
 
 ; Use a 64-bit value with hi bits that can be represented as an inline constant
 ; CHECK: @i64_imm_inline_hi
-; CHECK: S_MOV_B32 [[HI:SGPR[0-9]+]], 5
-; CHECK: V_MOV_B32_e32 [[HI_VGPR:VGPR[0-9]+]], [[HI]]
-; CHECK: BUFFER_STORE_DWORDX2 {{VGPR[0-9]+}}_[[HI_VGPR]]
+; CHECK: S_MOV_B32 [[HI:s[0-9]+]], 5
+; CHECK: V_MOV_B32_e32 v[[HI_VGPR:[0-9]+]], [[HI]]
+; CHECK: BUFFER_STORE_DWORDX2 v{{\[[0-9]+:}}[[HI_VGPR]]
 define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
 entry:
   store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678
diff --git a/test/CodeGen/R600/indirect-addressing-si.ll b/test/CodeGen/R600/indirect-addressing-si.ll
index ba5de22..169d69b 100644
--- a/test/CodeGen/R600/indirect-addressing-si.ll
+++ b/test/CodeGen/R600/indirect-addressing-si.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
 
 ; Tests for indirect addressing on SI, which is implemented using dynamic
 ; indexing of vectors.
 
 ; CHECK: extract_w_offset
-; CHECK: S_MOV_B32 M0
+; CHECK: S_MOV_B32 m0
 ; CHECK-NEXT: V_MOVRELS_B32_e32
 define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
 entry:
@@ -15,7 +15,7 @@ entry:
 }
 
 ; CHECK: extract_wo_offset
-; CHECK: S_MOV_B32 M0
+; CHECK: S_MOV_B32 m0
 ; CHECK-NEXT: V_MOVRELS_B32_e32
 define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
 entry:
@@ -25,7 +25,7 @@ entry:
 }
 
 ; CHECK: insert_w_offset
-; CHECK: S_MOV_B32 M0
+; CHECK: S_MOV_B32 m0
 ; CHECK-NEXT: V_MOVRELD_B32_e32
 define void @insert_w_offset(float addrspace(1)* %out, i32 %in) {
 entry:
@@ -37,7 +37,7 @@ entry:
 }
 
 ; CHECK: insert_wo_offset
-; CHECK: S_MOV_B32 M0
+; CHECK: S_MOV_B32 m0
 ; CHECK-NEXT: V_MOVRELD_B32_e32
 define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) {
 entry:
diff --git a/test/CodeGen/R600/indirect-addressing.ll b/test/CodeGen/R600/indirect-addressing.ll
deleted file mode 100644
index bd72cd9..0000000
--- a/test/CodeGen/R600/indirect-addressing.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; This test checks that uses and defs of the AR register happen in the same
-; instruction clause.
-
-; CHECK: @mova_same_clause
-; CHECK: MOVA_INT
-; CHECK-NOT: ALU clause
-; CHECK: 0 + AR.x
-; CHECK: MOVA_INT
-; CHECK-NOT: ALU clause
-; CHECK: 0 + AR.x
-
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
-entry:
-  %stack = alloca [5 x i32], align 4
-  %0 = load i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 %0
-  store i32 4, i32* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %in, i32 1
-  %1 = load i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 %1
-  store i32 5, i32* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 0
-  %2 = load i32* %arrayidx10, align 4
-  store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 1
-  %3 = load i32* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32 addrspace(1)* %out, i32 1
-  store i32 %3, i32 addrspace(1)* %arrayidx13
-  ret void
-}
-
-; This test checks that the stack offset is calculated correctly for structs.
-; All register loads/stores should be optimized away, so there shouldn't be
-; any MOVA instructions.
-;
-; XXX: This generated code has unnecessary MOVs, we should be able to optimize
-; this.
-
-; CHECK: @multiple_structs
-; CHECK-NOT: MOVA_INT
-
-%struct.point = type { i32, i32 }
-
-define void @multiple_structs(i32 addrspace(1)* %out) {
-entry:
-  %a = alloca %struct.point
-  %b = alloca %struct.point
-  %a.x.ptr = getelementptr %struct.point* %a, i32 0, i32 0
-  %a.y.ptr = getelementptr %struct.point* %a, i32 0, i32 1
-  %b.x.ptr = getelementptr %struct.point* %b, i32 0, i32 0
-  %b.y.ptr = getelementptr %struct.point* %b, i32 0, i32 1
-  store i32 0, i32* %a.x.ptr
-  store i32 1, i32* %a.y.ptr
-  store i32 2, i32* %b.x.ptr
-  store i32 3, i32* %b.y.ptr
-  %a.indirect.ptr = getelementptr %struct.point* %a, i32 0, i32 0
-  %b.indirect.ptr = getelementptr %struct.point* %b, i32 0, i32 0
-  %a.indirect = load i32* %a.indirect.ptr
-  %b.indirect = load i32* %b.indirect.ptr
-  %0 = add i32 %a.indirect, %b.indirect
-  store i32 %0, i32 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/R600/insert_vector_elt.ll b/test/CodeGen/R600/insert_vector_elt.ll
new file mode 100644
index 0000000..05aecce
--- /dev/null
+++ b/test/CodeGen/R600/insert_vector_elt.ll
@@ -0,0 +1,16 @@
+; XFAIL: *
+; RUN: llc < %s -march=r600 -mcpu=redwood -o %t
+
+define void @var_insert(<4 x i32> addrspace(1)* %out, <4 x i32> %x, i32 %val, i32 %idx) nounwind  {
+entry:
+  %tmp3 = insertelement <4 x i32> %x, i32 %val, i32 %idx		; <<4 x i32>> [#uses=1]
+  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+define void @var_extract(i32 addrspace(1)* %out, <4 x i32> %x, i32 %idx) nounwind  {
+entry:
+  %tmp3 = extractelement <4 x i32> %x, i32 %idx		; <<i32>> [#uses=1]
+  store i32 %tmp3, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/jump-address.ll b/test/CodeGen/R600/jump-address.ll
index 26c298b..ae9c8bb 100644
--- a/test/CodeGen/R600/jump-address.ll
+++ b/test/CodeGen/R600/jump-address.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
-; CHECK: JUMP @5
+; CHECK: JUMP @3
 ; CHECK: EXPORT
 ; CHECK-NOT: EXPORT
 
diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll
index 3d70e4b..0baa3cd 100644
--- a/test/CodeGen/R600/kcache-fold.ll
+++ b/test/CodeGen/R600/kcache-fold.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ; CHECK: @main1
-; CHECK: MOV T{{[0-9]+\.[XYZW], KC0}}
+; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}}
 define void @main1() {
 main_body:
   %0 = load <4 x float> addrspace(8)* null
@@ -10,7 +10,7 @@ main_body:
   %3 = extractelement <4 x float> %2, i32 0
   %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %5 = extractelement <4 x float> %4, i32 0
-  %6 = fcmp ult float %1, 0.000000e+00
+  %6 = fcmp ogt float %1, 0.000000e+00
   %7 = select i1 %6, float %3, float %5
   %8 = load <4 x float> addrspace(8)* null
   %9 = extractelement <4 x float> %8, i32 1
@@ -18,7 +18,7 @@ main_body:
   %11 = extractelement <4 x float> %10, i32 1
   %12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %13 = extractelement <4 x float> %12, i32 1
-  %14 = fcmp ult float %9, 0.000000e+00
+  %14 = fcmp ogt float %9, 0.000000e+00
   %15 = select i1 %14, float %11, float %13
   %16 = load <4 x float> addrspace(8)* null
   %17 = extractelement <4 x float> %16, i32 2
@@ -26,7 +26,7 @@ main_body:
   %19 = extractelement <4 x float> %18, i32 2
   %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %21 = extractelement <4 x float> %20, i32 2
-  %22 = fcmp ult float %17, 0.000000e+00
+  %22 = fcmp ogt float %17, 0.000000e+00
   %23 = select i1 %22, float %19, float %21
   %24 = load <4 x float> addrspace(8)* null
   %25 = extractelement <4 x float> %24, i32 3
@@ -34,7 +34,7 @@ main_body:
   %27 = extractelement <4 x float> %26, i32 3
   %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %29 = extractelement <4 x float> %28, i32 3
-  %30 = fcmp ult float %25, 0.000000e+00
+  %30 = fcmp ogt float %25, 0.000000e+00
   %31 = select i1 %30, float %27, float %29
   %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
   %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
@@ -58,7 +58,7 @@ main_body:
   %3 = extractelement <4 x float> %2, i32 0
   %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %5 = extractelement <4 x float> %4, i32 1
-  %6 = fcmp ult float %1, 0.000000e+00
+  %6 = fcmp ogt float %1, 0.000000e+00
   %7 = select i1 %6, float %3, float %5
   %8 = load <4 x float> addrspace(8)* null
   %9 = extractelement <4 x float> %8, i32 1
@@ -66,7 +66,7 @@ main_body:
   %11 = extractelement <4 x float> %10, i32 0
   %12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %13 = extractelement <4 x float> %12, i32 1
-  %14 = fcmp ult float %9, 0.000000e+00
+  %14 = fcmp ogt float %9, 0.000000e+00
   %15 = select i1 %14, float %11, float %13
   %16 = load <4 x float> addrspace(8)* null
   %17 = extractelement <4 x float> %16, i32 2
@@ -74,7 +74,7 @@ main_body:
   %19 = extractelement <4 x float> %18, i32 3
   %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %21 = extractelement <4 x float> %20, i32 2
-  %22 = fcmp ult float %17, 0.000000e+00
+  %22 = fcmp ogt float %17, 0.000000e+00
   %23 = select i1 %22, float %19, float %21
   %24 = load <4 x float> addrspace(8)* null
   %25 = extractelement <4 x float> %24, i32 3
@@ -82,7 +82,7 @@ main_body:
   %27 = extractelement <4 x float> %26, i32 3
   %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %29 = extractelement <4 x float> %28, i32 2
-  %30 = fcmp ult float %25, 0.000000e+00
+  %30 = fcmp ogt float %25, 0.000000e+00
   %31 = select i1 %30, float %27, float %29
   %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
   %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
diff --git a/test/CodeGen/R600/kernel-args.ll b/test/CodeGen/R600/kernel-args.ll
new file mode 100644
index 0000000..247e316
--- /dev/null
+++ b/test/CodeGen/R600/kernel-args.ll
@@ -0,0 +1,455 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; EG-CHECK-LABEL: @i8_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i8_arg
+; SI-CHECK: BUFFER_LOAD_UBYTE
+
+define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
+entry:
+  %0 = zext i8 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @i8_zext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i8_zext_arg
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+
+define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
+entry:
+  %0 = zext i8 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @i8_sext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i8_sext_arg
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+
+define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
+entry:
+  %0 = sext i8 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @i16_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i16_arg
+; SI-CHECK: BUFFER_LOAD_USHORT
+
+define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
+entry:
+  %0 = zext i16 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @i16_zext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i16_zext_arg
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+
+define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
+entry:
+  %0 = zext i16 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @i16_sext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i16_sext_arg
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+
+define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
+entry:
+  %0 = sext i16 %in to i32
+  store i32 %0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @i32_arg
+; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i32_arg
+; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
+entry:
+  store i32 %in, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @f32_arg
+; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @f32_arg
+; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
+entry:
+  store float %in, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v2i8_arg
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @v2i8_arg
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
+entry:
+  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @v2i16_arg
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @v2i16_arg
+; SI-CHECK-DAG: BUFFER_LOAD_USHORT
+; SI-CHECK-DAG: BUFFER_LOAD_USHORT
+define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
+entry:
+  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @v2i32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
+; SI-CHECK-LABEL: @v2i32_arg
+; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 11
+define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
+entry:
+  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v2f32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
+; SI-CHECK-LABEL: @v2f32_arg
+; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 11
+define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
+entry:
+  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v3i8_arg
+; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
+; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
+; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
+; SI-CHECK-LABEL: @v3i8_arg
+define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+entry:
+  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v3i16_arg
+; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
+; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
+; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
+; SI-CHECK-LABEL: @v3i16_arg
+define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
+entry:
+  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+; EG-CHECK-LABEL: @v3i32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; SI-CHECK-LABEL: @v3i32_arg
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 13
+define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
+entry:
+  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v3f32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; SI-CHECK-LABEL: @v3f32_arg
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 13
+define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
+entry:
+  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v4i8_arg
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @v4i8_arg
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
+entry:
+  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @v4i16_arg
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @v4i16_arg
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
+entry:
+  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @v4i32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
+; SI-CHECK-LABEL: @v4i32_arg
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 13
+define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
+entry:
+  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v4f32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
+; SI-CHECK-LABEL: @v4f32_arg
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 13
+define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
+entry:
+  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v8i8_arg
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @v8i8_arg
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
+entry:
+  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @v8i16_arg
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @v8i16_arg
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
+entry:
+  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @v8i32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
+; SI-CHECK-LABEL: @v8i32_arg
+; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 17
+define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
+entry:
+  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v8f32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
+; SI-CHECK-LABEL: @v8f32_arg
+; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 17
+define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
+entry:
+  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v16i8_arg
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @v16i8_arg
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
+entry:
+  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @v16i16_arg
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @v16i16_arg
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
+entry:
+  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @v16i32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
+; SI-CHECK-LABEL: @v16i32_arg
+; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 25
+define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
+entry:
+  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
+; EG-CHECK-LABEL: @v16f32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
+; SI-CHECK-LABEL: @v16f32_arg
+; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 25
+define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
+entry:
+  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/lds-output-queue.ll b/test/CodeGen/R600/lds-output-queue.ll
new file mode 100644
index 0000000..63a4332
--- /dev/null
+++ b/test/CodeGen/R600/lds-output-queue.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
+;
+; This test checks that the lds input queue will is empty at the end of
+; the ALU clause.
+
+; CHECK-LABEL: @lds_input_queue
+; CHECK: LDS_READ_RET * OQAP
+; CHECK-NOT: ALU clause
+; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
+
+@local_mem = internal addrspace(3) unnamed_addr global [2 x i32] [i32 1, i32 2], align 4
+
+define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
+entry:
+  %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
+  %1 = load i32 addrspace(3)* %0
+  call void @llvm.AMDGPU.barrier.local()
+
+  ; This will start a new clause for the vertex fetch
+  %2 = load i32 addrspace(1)* %in
+  %3 = add i32 %1, %2
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local()
+
+; The machine scheduler does not do proper alias analysis and assumes that
+; loads from global values (Note that a global value is different that a
+; value from global memory.  A global value is a value that is declared
+; outside of a function, it can reside in any address space) alias with
+; all other loads.
+;
+; This is a problem for scheduling the reads from the local data share (lds).
+; These reads are implemented using two instructions.  The first copies the
+; data from lds into the lds output queue, and the second moves the data from
+; the input queue into main memory.  These two instructions don't have to be
+; scheduled one after the other, but they do need to be scheduled in the same
+; clause.  The aliasing problem mentioned above causes problems when there is a
+; load from global memory which immediately follows a load from a global value that
+; has been declared in the local memory space:
+;
+;  %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
+;  %1 = load i32 addrspace(3)* %0
+;  %2 = load i32 addrspace(1)* %in
+;
+; The instruction selection phase will generate ISA that looks like this:
+; %OQAP = LDS_READ_RET
+; %vreg0 = MOV %OQAP
+; %vreg1 = VTX_READ_32
+; %vreg2 = ADD_INT %vreg1, %vreg0
+;
+; The bottom scheduler will schedule the two ALU instructions first:
+;
+; UNSCHEDULED:
+; %OQAP = LDS_READ_RET
+; %vreg1 = VTX_READ_32
+;
+; SCHEDULED:
+;
+; vreg0 = MOV %OQAP
+; vreg2 = ADD_INT %vreg1, %vreg2
+;
+; The lack of proper aliasing results in the local memory read (LDS_READ_RET)
+; to consider the global memory read (VTX_READ_32) has a chain dependency, so
+; the global memory read will always be scheduled first.  This will give us a
+; final program which looks like this:
+;
+; Alu clause:
+; %OQAP = LDS_READ_RET
+; VTX clause:
+; %vreg1 = VTX_READ_32
+; Alu clause:
+; vreg0 = MOV %OQAP
+; vreg2 = ADD_INT %vreg1, %vreg2
+;
+; This is an illegal program because the OQAP def and use know occur in
+; different ALU clauses.
+;
+; This test checks this scenario and makes sure it doesn't result in an
+; illegal program.  For now, we have fixed this issue by merging the
+; LDS_READ_RET and MOV together during instruction selection and then
+; expanding them after scheduling.  Once the scheduler has better alias
+; analysis, we should be able to keep these instructions sparate before
+; scheduling.
+;
+; CHECK-LABEL: @local_global_alias
+; CHECK: LDS_READ_RET
+; CHECK-NOT: ALU clause
+; CHECK MOV * T{{[0-9]\.[XYZW]}}, OQAP
+define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
+  %1 = load i32 addrspace(3)* %0
+  %2 = load i32 addrspace(1)* %in
+  %3 = add i32 %2, %1
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/lds-size.ll b/test/CodeGen/R600/lds-size.ll
new file mode 100644
index 0000000..2185180
--- /dev/null
+++ b/test/CodeGen/R600/lds-size.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test makes sure we do not double count global values when they are
+; used in different basic blocks.
+
+; CHECK-LABEL: @test
+; CHECK: .long   166120
+; CHECK-NEXT: .long   1
+@lds = internal addrspace(3) unnamed_addr global i32 zeroinitializer, align 4
+
+define void @test(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+  %0 = icmp eq i32 %cond, 0
+  br i1 %0, label %if, label %else
+
+if:
+  store i32 1, i32 addrspace(3)* @lds
+  br label %endif
+
+else:
+  store i32 2, i32 addrspace(3)* @lds
+  br label %endif
+
+endif:
+  ret void
+}
diff --git a/test/CodeGen/R600/lit.local.cfg b/test/CodeGen/R600/lit.local.cfg
index 36ee493..2d8930a 100644
--- a/test/CodeGen/R600/lit.local.cfg
+++ b/test/CodeGen/R600/lit.local.cfg
@@ -1,13 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
-def getRoot(config):
-    if not config.parent:
-        return config
-    return getRoot(config.parent)
-
-root = getRoot(config)
-
-targets = set(root.targets_to_build.split())
+targets = set(config.root.targets_to_build.split())
 if not 'R600' in targets:
     config.unsupported = True
-
diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
index 77b168e..47191e0 100644
--- a/test/CodeGen/R600/literals.ll
+++ b/test/CodeGen/R600/literals.ll
@@ -7,7 +7,8 @@
 ; ADD_INT literal.x KC0[2].Z, 5
 
 ; CHECK: @i32_literal
-; CHECK: ADD_INT * T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5
 define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -23,7 +24,8 @@ entry:
 ; ADD literal.x KC0[2].Z, 5.0
 
 ; CHECK: @float_literal
-; CHECK: ADD * T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.0
 define void @float_literal(float addrspace(1)* %out, float %in) {
 entry:
@@ -31,3 +33,32 @@ entry:
   store float %0, float addrspace(1)* %out
   ret void
 }
+
+; Make sure inline literals are folded into REG_SEQUENCE instructions.
+; CHECK: @inline_literal_reg_sequence
+; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0
+; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0
+; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0
+; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0
+
+define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) {
+entry:
+  store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; CHECK: @inline_literal_dot4
+; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0
+; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0
+; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0
+; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
+define void @inline_literal_dot4(float addrspace(1)* %out) {
+entry:
+  %0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+attributes #1 = { readnone }
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imax.ll b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
index 3e854c8..1336f4e 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
 ;CHECK: V_MAX_I32_e32
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imin.ll b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
index e227bf8..3435ea4 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
 ;CHECK: V_MIN_I32_e32
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.mul.ll b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
index cc0732b..83b56a5 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.mul.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
@@ -2,16 +2,16 @@
 
 ;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
-   %r1 = call float @llvm.R600.load.input(i32 1)
+define void @test(<4 x float> inreg %reg0) #0 {
+   %r0 = extractelement <4 x float> %reg0, i32 0
+   %r1 = extractelement <4 x float> %reg0, i32 1
    %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1)
-   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+   %vec = insertelement <4 x float> undef, float %r2, i32 0
+   call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
    ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
 declare float @llvm.AMDGPU.mul(float ,float ) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+\ No newline at end of file
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
index 7627783..e6bb2c4 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
@@ -1,8 +1,8 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ; R600-CHECK: @amdgpu_trunc
-; R600-CHECK: TRUNC * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; R600-CHECK: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI-CHECK: @amdgpu_trunc
 ; SI-CHECK: V_TRUNC_F32
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umax.ll b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
index 7699c04..4cfa133 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
 ;CHECK: V_MAX_U32_e32
 
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umin.ll b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
index a911ad9..14af051 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
 ;CHECK: V_MIN_U32_e32
 
diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
index e45722c..0438ecc 100644
--- a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
+++ b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
 ;CHECK: S_MOV_B32
 ;CHECK-NEXT: V_INTERP_MOV_F32
diff --git a/test/CodeGen/R600/llvm.SI.imageload.ll b/test/CodeGen/R600/llvm.SI.imageload.ll
index 0adcdfc..59e00f0 100644
--- a/test/CodeGen/R600/llvm.SI.imageload.ll
+++ b/test/CodeGen/R600/llvm.SI.imageload.ll
@@ -1,15 +1,15 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15, 0, 0, -1
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 3, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 2, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 1, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 4, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 8, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 5, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 12, 0, 0, -1
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 8, 0, 0, -1
+;CHECK-DAG: IMAGE_LOAD {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
+;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 2, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 1, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 4, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 8, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
+;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 8, 0, 0, -1
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ -23,25 +23,25 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
    %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
    %res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1,
-      <8 x i32> undef, i32 1)
+      <32 x i8> undef, i32 1)
    %res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2,
-      <8 x i32> undef, i32 2)
+      <32 x i8> undef, i32 2)
    %res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3,
-      <8 x i32> undef, i32 3)
+      <32 x i8> undef, i32 3)
    %res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4,
-      <8 x i32> undef, i32 4)
+      <32 x i8> undef, i32 4)
    %res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5,
-      <8 x i32> undef, i32 5)
+      <32 x i8> undef, i32 5)
    %res6 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v6,
-      <8 x i32> undef, i32 6)
+      <32 x i8> undef, i32 6)
    %res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10,
-      <8 x i32> undef, i32 10)
+      <32 x i8> undef, i32 10)
    %res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11,
-      <8 x i32> undef, i32 11)
+      <32 x i8> undef, i32 11)
    %res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15,
-      <8 x i32> undef, i32 15)
+      <32 x i8> undef, i32 15)
    %res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16,
-      <8 x i32> undef, i32 16)
+      <32 x i8> undef, i32 16)
    %e1 = extractelement <4 x i32> %res1, i32 0
    %e2 = extractelement <4 x i32> %res2, i32 1
    %e3 = extractelement <4 x i32> %res3, i32 2
@@ -82,6 +82,50 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    ret void
 }
 
-declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <8 x i32>, i32) readnone
+; Test that ccordinates are stored in vgprs and not sgprs
+; CHECK: vgpr_coords
+; CHECK: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}
+define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+  %20 = getelementptr float addrspace(2)* addrspace(2)* %0, i32 0
+  %21 = load float addrspace(2)* addrspace(2)* %20, !tbaa !2
+  %22 = getelementptr float addrspace(2)* %21, i32 0
+  %23 = load float addrspace(2)* %22, !tbaa !2, !invariant.load !1
+  %24 = getelementptr float addrspace(2)* %21, i32 1
+  %25 = load float addrspace(2)* %24, !tbaa !2, !invariant.load !1
+  %26 = getelementptr float addrspace(2)* %21, i32 4
+  %27 = load float addrspace(2)* %26, !tbaa !2, !invariant.load !1
+  %28 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
+  %29 = load <32 x i8> addrspace(2)* %28, !tbaa !2
+  %30 = bitcast float %27 to i32
+  %31 = bitcast float %23 to i32
+  %32 = bitcast float %25 to i32
+  %33 = insertelement <4 x i32> undef, i32 %31, i32 0
+  %34 = insertelement <4 x i32> %33, i32 %32, i32 1
+  %35 = insertelement <4 x i32> %34, i32 %30, i32 2
+  %36 = insertelement <4 x i32> %35, i32 undef, i32 3
+  %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2)
+  %38 = extractelement <4 x i32> %37, i32 0
+  %39 = extractelement <4 x i32> %37, i32 1
+  %40 = extractelement <4 x i32> %37, i32 2
+  %41 = extractelement <4 x i32> %37, i32 3
+  %42 = bitcast i32 %38 to float
+  %43 = bitcast i32 %39 to float
+  %44 = bitcast i32 %40 to float
+  %45 = bitcast i32 %41 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45)
+  ret void
+}
+
+declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+
+!0 = metadata !{metadata !"const", null}
+!1 = metadata !{}
+!2 = metadata !{metadata !0, metadata !0, i64 0, i32 1}
diff --git a/test/CodeGen/R600/llvm.SI.resinfo.ll b/test/CodeGen/R600/llvm.SI.resinfo.ll
index eb31514..af3afc1 100644
--- a/test/CodeGen/R600/llvm.SI.resinfo.ll
+++ b/test/CodeGen/R600/llvm.SI.resinfo.ll
@@ -1,40 +1,40 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 3, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+}}, 2, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+}}, 1, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+}}, 4, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+}}, 8, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 5, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 9, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 6, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 10, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 12, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+}}, 8, 0, 0, -1
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 2, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 1, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 4, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 8, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 8, 0, 0, -1
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8,
 		  i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) {
-   %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <8 x i32> undef, i32 1)
-   %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <8 x i32> undef, i32 2)
-   %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <8 x i32> undef, i32 3)
-   %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <8 x i32> undef, i32 4)
-   %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <8 x i32> undef, i32 5)
-   %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <8 x i32> undef, i32 6)
-   %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <8 x i32> undef, i32 7)
-   %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <8 x i32> undef, i32 8)
-   %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <8 x i32> undef, i32 9)
-   %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <8 x i32> undef, i32 10)
-   %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <8 x i32> undef, i32 11)
-   %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <8 x i32> undef, i32 12)
-   %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <8 x i32> undef, i32 13)
-   %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <8 x i32> undef, i32 14)
-   %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <8 x i32> undef, i32 15)
-   %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <8 x i32> undef, i32 16)
+   %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1)
+   %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2)
+   %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3)
+   %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4)
+   %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5)
+   %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6)
+   %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7)
+   %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8)
+   %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9)
+   %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10)
+   %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11)
+   %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12)
+   %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13)
+   %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14)
+   %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15)
+   %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16)
    %e1 = extractelement <4 x i32> %res1, i32 0
    %e2 = extractelement <4 x i32> %res2, i32 1
    %e3 = extractelement <4 x i32> %res3, i32 2
@@ -105,6 +105,6 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7,
    ret void
 }
 
-declare <4 x i32> @llvm.SI.resinfo(i32, <8 x i32>, i32) readnone
+declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.sample-masked.ll b/test/CodeGen/R600/llvm.SI.sample-masked.ll
new file mode 100644
index 0000000..e5e4ec4
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.sample-masked.ll
@@ -0,0 +1,93 @@
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+
+; CHECK-LABEL: @v1
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 13
+define void @v1(i32 %a1) {
+entry:
+  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+  %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = extractelement <4 x float> %1, i32 2
+  %4 = extractelement <4 x float> %1, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+  ret void
+}
+
+; CHECK-LABEL: @v2
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 11
+define void @v2(i32 %a1) {
+entry:
+  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+  %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = extractelement <4 x float> %1, i32 1
+  %4 = extractelement <4 x float> %1, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+  ret void
+}
+
+; CHECK-LABEL: @v3
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 14
+define void @v3(i32 %a1) {
+entry:
+  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+  %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+  %2 = extractelement <4 x float> %1, i32 1
+  %3 = extractelement <4 x float> %1, i32 2
+  %4 = extractelement <4 x float> %1, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+  ret void
+}
+
+; CHECK-LABEL: @v4
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 7
+define void @v4(i32 %a1) {
+entry:
+  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+  %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = extractelement <4 x float> %1, i32 1
+  %4 = extractelement <4 x float> %1, i32 2
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+  ret void
+}
+
+; CHECK-LABEL: @v5
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 10
+define void @v5(i32 %a1) {
+entry:
+  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+  %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+  %2 = extractelement <4 x float> %1, i32 1
+  %3 = extractelement <4 x float> %1, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
+  ret void
+}
+
+; CHECK-LABEL: @v6
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 6
+define void @v6(i32 %a1) {
+entry:
+  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+  %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+  %2 = extractelement <4 x float> %1, i32 1
+  %3 = extractelement <4 x float> %1, i32 2
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
+  ret void
+}
+
+; CHECK-LABEL: @v7
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 9
+define void @v7(i32 %a1) {
+entry:
+  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+  %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = extractelement <4 x float> %1, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
+  ret void
+}
+
+declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll
index 7655996..d41737c 100644
--- a/test/CodeGen/R600/llvm.SI.sample.ll
+++ b/test/CodeGen/R600/llvm.SI.sample.ll
@@ -1,21 +1,21 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 3
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 2
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 1
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 4
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 8
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 5
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 9
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 6
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 10
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 12
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 15
+;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 3
+;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 2
+;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 1
+;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 4
+;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 5
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 9
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 6
+;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 10
+;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 12
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 7
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 11
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 13
+;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 14
+;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 8
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ -35,37 +35,37 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
    %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
    %res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1,
-      <8 x i32> undef, <4 x i32> undef, i32 1)
+      <32 x i8> undef, <16 x i8> undef, i32 1)
    %res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2,
-      <8 x i32> undef, <4 x i32> undef, i32 2)
+      <32 x i8> undef, <16 x i8> undef, i32 2)
    %res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3,
-      <8 x i32> undef, <4 x i32> undef, i32 3)
+      <32 x i8> undef, <16 x i8> undef, i32 3)
    %res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4,
-      <8 x i32> undef, <4 x i32> undef, i32 4)
+      <32 x i8> undef, <16 x i8> undef, i32 4)
    %res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5,
-      <8 x i32> undef, <4 x i32> undef, i32 5)
+      <32 x i8> undef, <16 x i8> undef, i32 5)
    %res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6,
-      <8 x i32> undef, <4 x i32> undef, i32 6)
+      <32 x i8> undef, <16 x i8> undef, i32 6)
    %res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7,
-      <8 x i32> undef, <4 x i32> undef, i32 7)
+      <32 x i8> undef, <16 x i8> undef, i32 7)
    %res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8,
-      <8 x i32> undef, <4 x i32> undef, i32 8)
+      <32 x i8> undef, <16 x i8> undef, i32 8)
    %res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9,
-      <8 x i32> undef, <4 x i32> undef, i32 9)
+      <32 x i8> undef, <16 x i8> undef, i32 9)
    %res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10,
-      <8 x i32> undef, <4 x i32> undef, i32 10)
+      <32 x i8> undef, <16 x i8> undef, i32 10)
    %res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11,
-      <8 x i32> undef, <4 x i32> undef, i32 11)
+      <32 x i8> undef, <16 x i8> undef, i32 11)
    %res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12,
-      <8 x i32> undef, <4 x i32> undef, i32 12)
+      <32 x i8> undef, <16 x i8> undef, i32 12)
    %res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13,
-      <8 x i32> undef, <4 x i32> undef, i32 13)
+      <32 x i8> undef, <16 x i8> undef, i32 13)
    %res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14,
-      <8 x i32> undef, <4 x i32> undef, i32 14)
+      <32 x i8> undef, <16 x i8> undef, i32 14)
    %res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15,
-      <8 x i32> undef, <4 x i32> undef, i32 15)
+      <32 x i8> undef, <16 x i8> undef, i32 15)
    %res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16,
-      <8 x i32> undef, <4 x i32> undef, i32 16)
+      <32 x i8> undef, <16 x i8> undef, i32 16)
    %e1 = extractelement <4 x float> %res1, i32 0
    %e2 = extractelement <4 x float> %res2, i32 1
    %e3 = extractelement <4 x float> %res3, i32 2
@@ -135,6 +135,23 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    ret void
 }
 
-declare <4 x float> @llvm.SI.sample.(<4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
+; CHECK: @v1
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 15
+define void @v1(i32 %a1) {
+entry:
+  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+  %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+  %2 = extractelement <4 x float> %1, i32 0
+  %3 = extractelement <4 x float> %1, i32 1
+  %4 = extractelement <4 x float> %1, i32 2
+  %5 = extractelement <4 x float> %1, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5)
+  ret void
+}
+
+
+declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+
+declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll
index 3b05551..21ac725 100644
--- a/test/CodeGen/R600/llvm.SI.sampled.ll
+++ b/test/CodeGen/R600/llvm.SI.sampled.ll
@@ -1,21 +1,21 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 3
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 2
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 1
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 4
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 5
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 9
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 6
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 10
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 12
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 15
+;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 3
+;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 2
+;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 1
+;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 4
+;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 5
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 9
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 6
+;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 10
+;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 12
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 7
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 11
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 13
+;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 14
+;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 8
 
 define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ -35,37 +35,37 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    %v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
    %v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
    %res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1,
-      <8 x i32> undef, <4 x i32> undef, i32 1)
+      <32 x i8> undef, <16 x i8> undef, i32 1)
    %res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2,
-      <8 x i32> undef, <4 x i32> undef, i32 2)
+      <32 x i8> undef, <16 x i8> undef, i32 2)
    %res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3,
-      <8 x i32> undef, <4 x i32> undef, i32 3)
+      <32 x i8> undef, <16 x i8> undef, i32 3)
    %res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4,
-      <8 x i32> undef, <4 x i32> undef, i32 4)
+      <32 x i8> undef, <16 x i8> undef, i32 4)
    %res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5,
-      <8 x i32> undef, <4 x i32> undef, i32 5)
+      <32 x i8> undef, <16 x i8> undef, i32 5)
    %res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6,
-      <8 x i32> undef, <4 x i32> undef, i32 6)
+      <32 x i8> undef, <16 x i8> undef, i32 6)
    %res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7,
-      <8 x i32> undef, <4 x i32> undef, i32 7)
+      <32 x i8> undef, <16 x i8> undef, i32 7)
    %res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8,
-      <8 x i32> undef, <4 x i32> undef, i32 8)
+      <32 x i8> undef, <16 x i8> undef, i32 8)
    %res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9,
-      <8 x i32> undef, <4 x i32> undef, i32 9)
+      <32 x i8> undef, <16 x i8> undef, i32 9)
    %res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10,
-      <8 x i32> undef, <4 x i32> undef, i32 10)
+      <32 x i8> undef, <16 x i8> undef, i32 10)
    %res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11,
-      <8 x i32> undef, <4 x i32> undef, i32 11)
+      <32 x i8> undef, <16 x i8> undef, i32 11)
    %res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12,
-      <8 x i32> undef, <4 x i32> undef, i32 12)
+      <32 x i8> undef, <16 x i8> undef, i32 12)
    %res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13,
-      <8 x i32> undef, <4 x i32> undef, i32 13)
+      <32 x i8> undef, <16 x i8> undef, i32 13)
    %res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14,
-      <8 x i32> undef, <4 x i32> undef, i32 14)
+      <32 x i8> undef, <16 x i8> undef, i32 14)
    %res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15,
-      <8 x i32> undef, <4 x i32> undef, i32 15)
+      <32 x i8> undef, <16 x i8> undef, i32 15)
    %res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16,
-      <8 x i32> undef, <4 x i32> undef, i32 16)
+      <32 x i8> undef, <16 x i8> undef, i32 16)
    %e1 = extractelement <4 x float> %res1, i32 0
    %e2 = extractelement <4 x float> %res2, i32 1
    %e3 = extractelement <4 x float> %res3, i32 2
@@ -135,6 +135,6 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
    ret void
 }
 
-declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
+declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
new file mode 100644
index 0000000..fa7c3ca
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
@@ -0,0 +1,44 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+;CHECK_LABEL: @test1
+;CHECK: TBUFFER_STORE_FORMAT_XYZW {{v\[[0-9]+:[0-9]+\]}}, 32, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test1(i32 %a1, i32 %vaddr) {
+    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+    call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+        i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret void
+}
+
+;CHECK_LABEL: @test2
+;CHECK: TBUFFER_STORE_FORMAT_XYZ {{v\[[0-9]+:[0-9]+\]}}, 24, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test2(i32 %a1, i32 %vaddr) {
+    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+    call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+        i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret void
+}
+
+;CHECK_LABEL: @test3
+;CHECK: TBUFFER_STORE_FORMAT_XY {{v\[[0-9]+:[0-9]+\]}}, 16, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test3(i32 %a1, i32 %vaddr) {
+    %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
+    call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
+        i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret void
+}
+
+;CHECK_LABEL: @test4
+;CHECK: TBUFFER_STORE_FORMAT_X {{v[0-9]+}}, 8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test4(i32 %vdata, i32 %vaddr) {
+    call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
+        i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret void
+}
+
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
diff --git a/test/CodeGen/R600/llvm.SI.tid.ll b/test/CodeGen/R600/llvm.SI.tid.ll
index 238d9f2..fe17304 100644
--- a/test/CodeGen/R600/llvm.SI.tid.ll
+++ b/test/CodeGen/R600/llvm.SI.tid.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
 ;CHECK: V_MBCNT_LO_U32_B32_e64
 ;CHECK: V_MBCNT_HI_U32_B32_e32
diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
index 8fb4559..aaf2305 100644
--- a/test/CodeGen/R600/llvm.cos.ll
+++ b/test/CodeGen/R600/llvm.cos.ll
@@ -5,15 +5,15 @@
 ;CHECK: ADD *
 ;CHECK: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
-define void @test() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
+define void @test(<4 x float> inreg %reg0) #0 {
+   %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = call float @llvm.cos.f32(float %r0)
-   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+   %vec = insertelement <4 x float> undef, float %r1, i32 0
+   call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
    ret void
 }
 
 declare float @llvm.cos.f32(float) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.floor.ll b/test/CodeGen/R600/llvm.floor.ll
new file mode 100644
index 0000000..f7071cd
--- /dev/null
+++ b/test/CodeGen/R600/llvm.floor.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK: @f32
+; R600-CHECK: FLOOR
+; SI-CHECK: @f32
+; SI-CHECK: V_FLOOR_F32_e32
+define void @f32(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = call float @llvm.floor.f32(float %in)
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @v2f32
+; R600-CHECK: FLOOR
+; R600-CHECK: FLOOR
+; SI-CHECK: @v2f32
+; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: V_FLOOR_F32_e32
+define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+  %0 = call <2 x float> @llvm.floor.v2f32(<2 x float> %in)
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @v4f32
+; R600-CHECK: FLOOR
+; R600-CHECK: FLOOR
+; R600-CHECK: FLOOR
+; R600-CHECK: FLOOR
+; SI-CHECK: @v4f32
+; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: V_FLOOR_F32_e32
+define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+  %0 = call <4 x float> @llvm.floor.v4f32(<4 x float> %in)
+  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare float @llvm.floor.f32(float) #0
+
+; Function Attrs: nounwind readonly
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
index 0f51cf4..b587d2b 100644
--- a/test/CodeGen/R600/llvm.pow.ll
+++ b/test/CodeGen/R600/llvm.pow.ll
@@ -4,16 +4,16 @@
 ;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
 ;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
-define void @test() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
-   %r1 = call float @llvm.R600.load.input(i32 1)
+define void @test(<4 x float> inreg %reg0) #0 {
+   %r0 = extractelement <4 x float> %reg0, i32 0
+   %r1 = extractelement <4 x float> %reg0, i32 1
    %r2 = call float @llvm.pow.f32( float %r0, float %r1)
-   call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+   %vec = insertelement <4 x float> undef, float %r2, i32 0
+   call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
    ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
 declare float @llvm.pow.f32(float ,float ) readonly
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.rint.ll b/test/CodeGen/R600/llvm.rint.ll
new file mode 100644
index 0000000..c174b33
--- /dev/null
+++ b/test/CodeGen/R600/llvm.rint.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK: @f32
+; R600-CHECK: RNDNE
+; SI-CHECK: @f32
+; SI-CHECK: V_RNDNE_F32_e32
+define void @f32(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = call float @llvm.rint.f32(float %in)
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @v2f32
+; R600-CHECK: RNDNE
+; R600-CHECK: RNDNE
+; SI-CHECK: @v2f32
+; SI-CHECK: V_RNDNE_F32_e32
+; SI-CHECK: V_RNDNE_F32_e32
+define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+  %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in)
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK: @v4f32
+; R600-CHECK: RNDNE
+; R600-CHECK: RNDNE
+; R600-CHECK: RNDNE
+; R600-CHECK: RNDNE
+; SI-CHECK: @v4f32
+; SI-CHECK: V_RNDNE_F32_e32
+; SI-CHECK: V_RNDNE_F32_e32
+; SI-CHECK: V_RNDNE_F32_e32
+; SI-CHECK: V_RNDNE_F32_e32
+define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+  %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in)
+  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare float @llvm.rint.f32(float) #0
+
+; Function Attrs: nounwind readonly
+declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/R600/llvm.round.ll b/test/CodeGen/R600/llvm.round.ll
new file mode 100644
index 0000000..e06d45d
--- /dev/null
+++ b/test/CodeGen/R600/llvm.round.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
+
+; FUNC-LABEL: @f32
+; R600: FRACT
+; R600-DAG: ADD
+; R600-DAG: CEIL
+; R600-DAG: FLOOR
+; R600: CNDGE
+define void @f32(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = call float @llvm.round.f32(float %in)
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; The vector tests are really difficult to verify, since it can be hard to
+; predict how the scheduler will order the instructions.  We already have
+; a test for the scalar case, so the vector tests just check that the
+; compiler doesn't crash.
+
+; FUNC-LABEL: v2f32
+; R600: CF_END
+define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+  %0 = call <2 x float> @llvm.round.v2f32(<2 x float> %in)
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: v4f32
+; R600: CF_END
+define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+  %0 = call <4 x float> @llvm.round.v4f32(<4 x float> %in)
+  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.round.f32(float)
+declare <2 x float> @llvm.round.v2f32(<2 x float>)
+declare <4 x float> @llvm.round.v4f32(<4 x float>)
diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
index e94c2ba..9eb9983 100644
--- a/test/CodeGen/R600/llvm.sin.ll
+++ b/test/CodeGen/R600/llvm.sin.ll
@@ -5,15 +5,15 @@
 ;CHECK: ADD *
 ;CHECK: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 
-define void @test() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
+define void @test(<4 x float> inreg %reg0) #0 {
+   %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = call float @llvm.sin.f32( float %r0)
-   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+   %vec = insertelement <4 x float> undef, float %r1, i32 0
+   call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
    ret void
 }
 
 declare float @llvm.sin.f32(float) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.sqrt.ll b/test/CodeGen/R600/llvm.sqrt.ll
new file mode 100644
index 0000000..0d0d186
--- /dev/null
+++ b/test/CodeGen/R600/llvm.sqrt.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -march=r600 --mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 --mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK-LABEL: @sqrt_f32
+; R600-CHECK: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; R600-CHECK: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
+; SI-CHECK-LABEL: @sqrt_f32
+; SI-CHECK: V_SQRT_F32_e32
+define void @sqrt_f32(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = call float @llvm.sqrt.f32(float %in)
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @sqrt_v2f32
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
+; SI-CHECK-LABEL: @sqrt_v2f32
+; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK: V_SQRT_F32_e32
+define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+  %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @sqrt_v4f32
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
+; SI-CHECK-LABEL: @sqrt_v4f32
+; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK: V_SQRT_F32_e32
+define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+  %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
+  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.sqrt.f32(float %in)
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/load-input-fold.ll b/test/CodeGen/R600/load-input-fold.ll
index aff2a6e..ca86d0e 100644
--- a/test/CodeGen/R600/load-input-fold.ll
+++ b/test/CodeGen/R600/load-input-fold.ll
@@ -1,20 +1,20 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman
 ;REQUIRES: asserts
 
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
 main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 7)
-  %4 = call float @llvm.R600.load.input(i32 8)
-  %5 = call float @llvm.R600.load.input(i32 9)
-  %6 = call float @llvm.R600.load.input(i32 10)
-  %7 = call float @llvm.R600.load.input(i32 11)
-  %8 = call float @llvm.R600.load.input(i32 12)
-  %9 = call float @llvm.R600.load.input(i32 13)
-  %10 = call float @llvm.R600.load.input(i32 14)
-  %11 = call float @llvm.R600.load.input(i32 15)
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = extractelement <4 x float> %reg1, i32 2
+  %3 = extractelement <4 x float> %reg1, i32 3
+  %4 = extractelement <4 x float> %reg2, i32 0
+  %5 = extractelement <4 x float> %reg2, i32 1
+  %6 = extractelement <4 x float> %reg2, i32 2
+  %7 = extractelement <4 x float> %reg2, i32 3
+  %8 = extractelement <4 x float> %reg3, i32 0
+  %9 = extractelement <4 x float> %reg3, i32 1
+  %10 = extractelement <4 x float> %reg3, i32 2
+  %11 = extractelement <4 x float> %reg3, i32 3
   %12 = load <4 x float> addrspace(8)* null
   %13 = extractelement <4 x float> %12, i32 0
   %14 = fmul float %0, %13
@@ -96,9 +96,6 @@ main_body:
 }
 
 ; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
-; Function Attrs: readnone
 declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
 
 ; Function Attrs: readonly
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index f478ef5..e4492d7 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -1,17 +1,17 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK  %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK  %s
 
 ;===------------------------------------------------------------------------===;
 ; GLOBAL ADDRESS SPACE
 ;===------------------------------------------------------------------------===;
 
 ; Load an i8 value from the global address space.
-; R600-CHECK: @load_i8
+; R600-CHECK-LABEL: @load_i8
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
 
-; SI-CHECK: @load_i8
-; SI-CHECK: BUFFER_LOAD_UBYTE VGPR{{[0-9]+}},
+; SI-CHECK-LABEL: @load_i8
+; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
 define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %1 = load i8 addrspace(1)* %in
   %2 = zext i8 %1 to i32
@@ -19,13 +19,13 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   ret void
 }
 
-; R600-CHECK: @load_i8_sext
+; R600-CHECK-LABEL: @load_i8_sext
 ; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 24
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 24
-; SI-CHECK: @load_i8_sext
+; SI-CHECK-LABEL: @load_i8_sext
 ; SI-CHECK: BUFFER_LOAD_SBYTE
 define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
@@ -35,10 +35,98 @@ entry:
   ret void
 }
 
+; R600-CHECK-LABEL: @load_v2i8
+; R600-CHECK: VTX_READ_8
+; R600-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @load_v2i8
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+entry:
+  %0 = load <2 x i8> addrspace(1)* %in
+  %1 = zext <2 x i8> %0 to <2 x i32>
+  store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i8_sext
+; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-CHECK-DAG: 24
+; SI-CHECK-LABEL: @load_v2i8_sext
+; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: BUFFER_LOAD_SBYTE
+define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+entry:
+  %0 = load <2 x i8> addrspace(1)* %in
+  %1 = sext <2 x i8> %0 to <2 x i32>
+  store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i8
+; R600-CHECK: VTX_READ_8
+; R600-CHECK: VTX_READ_8
+; R600-CHECK: VTX_READ_8
+; R600-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @load_v4i8
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+entry:
+  %0 = load <4 x i8> addrspace(1)* %in
+  %1 = zext <4 x i8> %0 to <4 x i32>
+  store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i8_sext
+; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; R600-CHECK-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Z_CHAN:[XYZW]]], [[DST_Z]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Z_CHAN]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_W_CHAN:[XYZW]]], [[DST_W]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
+; R600-CHECK-DAG: 24
+; SI-CHECK-LABEL: @load_v4i8_sext
+; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: BUFFER_LOAD_SBYTE
+define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+entry:
+  %0 = load <4 x i8> addrspace(1)* %in
+  %1 = sext <4 x i8> %0 to <4 x i32>
+  store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; Load an i16 value from the global address space.
-; R600-CHECK: @load_i16
+; R600-CHECK-LABEL: @load_i16
 ; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_i16
+; SI-CHECK-LABEL: @load_i16
 ; SI-CHECK: BUFFER_LOAD_USHORT
 define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
@@ -48,13 +136,13 @@ entry:
   ret void
 }
 
-; R600-CHECK: @load_i16_sext
+; R600-CHECK-LABEL: @load_i16_sext
 ; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 16
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 16
-; SI-CHECK: @load_i16_sext
+; SI-CHECK-LABEL: @load_i16_sext
 ; SI-CHECK: BUFFER_LOAD_SSHORT
 define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
@@ -64,12 +152,100 @@ entry:
   ret void
 }
 
+; R600-CHECK-LABEL: @load_v2i16
+; R600-CHECK: VTX_READ_16
+; R600-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @load_v2i16
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+  %0 = load <2 x i16> addrspace(1)* %in
+  %1 = zext <2 x i16> %0 to <2 x i32>
+  store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i16_sext
+; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-CHECK-DAG: 16
+; SI-CHECK-LABEL: @load_v2i16_sext
+; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: BUFFER_LOAD_SSHORT
+define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+  %0 = load <2 x i16> addrspace(1)* %in
+  %1 = sext <2 x i16> %0 to <2 x i32>
+  store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i16
+; R600-CHECK: VTX_READ_16
+; R600-CHECK: VTX_READ_16
+; R600-CHECK: VTX_READ_16
+; R600-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @load_v4i16
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+entry:
+  %0 = load <4 x i16> addrspace(1)* %in
+  %1 = zext <4 x i16> %0 to <4 x i32>
+  store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i16_sext
+; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; R600-CHECK-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Z_CHAN:[XYZW]]], [[DST_Z]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Z_CHAN]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_W_CHAN:[XYZW]]], [[DST_W]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
+; R600-CHECK-DAG: 16
+; SI-CHECK-LABEL: @load_v4i16_sext
+; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: BUFFER_LOAD_SSHORT
+define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+entry:
+  %0 = load <4 x i16> addrspace(1)* %in
+  %1 = sext <4 x i16> %0 to <4 x i32>
+  store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
 ; load an i32 value from the global address space.
-; R600-CHECK: @load_i32
+; R600-CHECK-LABEL: @load_i32
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: @load_i32
-; SI-CHECK: BUFFER_LOAD_DWORD VGPR{{[0-9]+}}
+; SI-CHECK-LABEL: @load_i32
+; SI-CHECK: BUFFER_LOAD_DWORD v{{[0-9]+}}
 define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = load i32 addrspace(1)* %in
@@ -78,11 +254,11 @@ entry:
 }
 
 ; load a f32 value from the global address space.
-; R600-CHECK: @load_f32
+; R600-CHECK-LABEL: @load_f32
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: @load_f32
-; SI-CHECK: BUFFER_LOAD_DWORD VGPR{{[0-9]+}}
+; SI-CHECK-LABEL: @load_f32
+; SI-CHECK: BUFFER_LOAD_DWORD v{{[0-9]+}}
 define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float addrspace(1)* %in
@@ -91,10 +267,10 @@ entry:
 }
 
 ; load a v2f32 value from the global address space
-; R600-CHECK: @load_v2f32
+; R600-CHECK-LABEL: @load_v2f32
 ; R600-CHECK: VTX_READ_64
 
-; SI-CHECK: @load_v2f32
+; SI-CHECK-LABEL: @load_v2f32
 ; SI-CHECK: BUFFER_LOAD_DWORDX2
 define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 entry:
@@ -103,11 +279,11 @@ entry:
   ret void
 }
 
-; R600-CHECK: @load_i64
-; R600-CHECK: RAT
-; R600-CHECK: RAT
+; R600-CHECK-LABEL: @load_i64
+; R600-CHECK: MEM_RAT
+; R600-CHECK: MEM_RAT
 
-; SI-CHECK: @load_i64
+; SI-CHECK-LABEL: @load_i64
 ; SI-CHECK: BUFFER_LOAD_DWORDX2
 define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
@@ -116,15 +292,13 @@ entry:
   ret void
 }
 
-; R600-CHECK: @load_i64_sext
-; R600-CHECK: RAT
-; R600-CHECK: RAT
+; R600-CHECK-LABEL: @load_i64_sext
+; R600-CHECK: MEM_RAT
+; R600-CHECK: MEM_RAT
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.x
 ; R600-CHECK: 31
-; SI-CHECK: @load_i64_sext
-; SI-CHECK: BUFFER_LOAD_DWORDX2 [[VAL:VGPR[0-9]_VGPR[0-9]]]
-; SI-CHECK: V_LSHL_B64 [[LSHL:VGPR[0-9]_VGPR[0-9]]], [[VAL]], 32
-; SI-CHECK: V_ASHR_I64 VGPR{{[0-9]}}_VGPR{{[0-9]}}, [[LSHL]], 32
+; SI-CHECK-LABEL: @load_i64_sext
+; SI-CHECK: BUFFER_LOAD_DWORDX2 [[VAL:v\[[0-9]:[0-9]\]]]
 
 define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
@@ -134,9 +308,9 @@ entry:
   ret void
 }
 
-; R600-CHECK: @load_i64_zext
-; R600-CHECK: RAT
-; R600-CHECK: RAT
+; R600-CHECK-LABEL: @load_i64_zext
+; R600-CHECK: MEM_RAT
+; R600-CHECK: MEM_RAT
 define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = load i32 addrspace(1)* %in
@@ -150,14 +324,14 @@ entry:
 ;===------------------------------------------------------------------------===;
 
 ; Load a sign-extended i8 value
-; R600-CHECK: @load_const_i8_sext
+; R600-CHECK-LABEL: @load_const_i8_sext
 ; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 24
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 24
-; SI-CHECK: @load_const_i8_sext
-; SI-CHECK: BUFFER_LOAD_SBYTE VGPR{{[0-9]+}},
+; SI-CHECK-LABEL: @load_const_i8_sext
+; SI-CHECK: BUFFER_LOAD_SBYTE v{{[0-9]+}},
 define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = load i8 addrspace(2)* %in
@@ -167,10 +341,10 @@ entry:
 }
 
 ; Load an aligned i8 value
-; R600-CHECK: @load_const_i8_aligned
+; R600-CHECK-LABEL: @load_const_i8_aligned
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_const_i8_aligned
-; SI-CHECK: BUFFER_LOAD_UBYTE VGPR{{[0-9]+}},
+; SI-CHECK-LABEL: @load_const_i8_aligned
+; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
 define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = load i8 addrspace(2)* %in
@@ -180,10 +354,10 @@ entry:
 }
 
 ; Load an un-aligned i8 value
-; R600-CHECK: @load_const_i8_unaligned
+; R600-CHECK-LABEL: @load_const_i8_unaligned
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_const_i8_unaligned
-; SI-CHECK: BUFFER_LOAD_UBYTE VGPR{{[0-9]+}},
+; SI-CHECK-LABEL: @load_const_i8_unaligned
+; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
 define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %0 = getelementptr i8 addrspace(2)* %in, i32 1
@@ -194,13 +368,13 @@ entry:
 }
 
 ; Load a sign-extended i16 value
-; R600-CHECK: @load_const_i16_sext
+; R600-CHECK-LABEL: @load_const_i16_sext
 ; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 16
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 16
-; SI-CHECK: @load_const_i16_sext
+; SI-CHECK-LABEL: @load_const_i16_sext
 ; SI-CHECK: BUFFER_LOAD_SSHORT
 define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
@@ -211,9 +385,9 @@ entry:
 }
 
 ; Load an aligned i16 value
-; R600-CHECK: @load_const_i16_aligned
+; R600-CHECK-LABEL: @load_const_i16_aligned
 ; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_const_i16_aligned
+; SI-CHECK-LABEL: @load_const_i16_aligned
 ; SI-CHECK: BUFFER_LOAD_USHORT
 define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
@@ -224,9 +398,9 @@ entry:
 }
 
 ; Load an un-aligned i16 value
-; R600-CHECK: @load_const_i16_unaligned
+; R600-CHECK-LABEL: @load_const_i16_unaligned
 ; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_const_i16_unaligned
+; SI-CHECK-LABEL: @load_const_i16_unaligned
 ; SI-CHECK: BUFFER_LOAD_USHORT
 define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
@@ -238,11 +412,11 @@ entry:
 }
 
 ; Load an i32 value from the constant address space.
-; R600-CHECK: @load_const_addrspace_i32
+; R600-CHECK-LABEL: @load_const_addrspace_i32
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: @load_const_addrspace_i32
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]+}}
+; SI-CHECK-LABEL: @load_const_addrspace_i32
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]+}}
 define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %0 = load i32 addrspace(2)* %in
@@ -251,14 +425,259 @@ entry:
 }
 
 ; Load a f32 value from the constant address space.
-; R600-CHECK: @load_const_addrspace_f32
+; R600-CHECK-LABEL: @load_const_addrspace_f32
 ; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
 
-; SI-CHECK: @load_const_addrspace_f32
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]+}}
+; SI-CHECK-LABEL: @load_const_addrspace_f32
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]+}}
 define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
   %1 = load float addrspace(2)* %in
   store float %1, float addrspace(1)* %out
   ret void
 }
 
+;===------------------------------------------------------------------------===;
+; LOCAL ADDRESS SPACE
+;===------------------------------------------------------------------------===;
+
+; Load an i8 value from the local address space.
+; R600-CHECK-LABEL: @load_i8_local
+; R600-CHECK: LDS_UBYTE_READ_RET
+; SI-CHECK-LABEL: @load_i8_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U8
+define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
+  %1 = load i8 addrspace(3)* %in
+  %2 = zext i8 %1 to i32
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_i8_sext_local
+; R600-CHECK: LDS_UBYTE_READ_RET
+; R600-CHECK: ASHR
+; SI-CHECK-LABEL: @load_i8_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I8
+define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
+entry:
+  %0 = load i8 addrspace(3)* %in
+  %1 = sext i8 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i8_local
+; R600-CHECK: LDS_UBYTE_READ_RET
+; R600-CHECK: LDS_UBYTE_READ_RET
+; SI-CHECK-LABEL: @load_v2i8_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U8
+; SI-CHECK: DS_READ_U8
+define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
+entry:
+  %0 = load <2 x i8> addrspace(3)* %in
+  %1 = zext <2 x i8> %0 to <2 x i32>
+  store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i8_sext_local
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; SI-CHECK-LABEL: @load_v2i8_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I8
+; SI-CHECK: DS_READ_I8
+define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
+entry:
+  %0 = load <2 x i8> addrspace(3)* %in
+  %1 = sext <2 x i8> %0 to <2 x i32>
+  store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i8_local
+; R600-CHECK: LDS_UBYTE_READ_RET
+; R600-CHECK: LDS_UBYTE_READ_RET
+; R600-CHECK: LDS_UBYTE_READ_RET
+; R600-CHECK: LDS_UBYTE_READ_RET
+; SI-CHECK-LABEL: @load_v4i8_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U8
+; SI-CHECK: DS_READ_U8
+; SI-CHECK: DS_READ_U8
+; SI-CHECK: DS_READ_U8
+define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
+entry:
+  %0 = load <4 x i8> addrspace(3)* %in
+  %1 = zext <4 x i8> %0 to <4 x i32>
+  store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i8_sext_local
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; SI-CHECK-LABEL: @load_v4i8_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I8
+; SI-CHECK: DS_READ_I8
+; SI-CHECK: DS_READ_I8
+; SI-CHECK: DS_READ_I8
+define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
+entry:
+  %0 = load <4 x i8> addrspace(3)* %in
+  %1 = sext <4 x i8> %0 to <4 x i32>
+  store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; Load an i16 value from the local address space.
+; R600-CHECK-LABEL: @load_i16_local
+; R600-CHECK: LDS_USHORT_READ_RET
+; SI-CHECK-LABEL: @load_i16_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U16
+define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
+entry:
+  %0 = load i16	 addrspace(3)* %in
+  %1 = zext i16 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_i16_sext_local
+; R600-CHECK: LDS_USHORT_READ_RET
+; R600-CHECK: ASHR
+; SI-CHECK-LABEL: @load_i16_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I16
+define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
+entry:
+  %0 = load i16 addrspace(3)* %in
+  %1 = sext i16 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i16_local
+; R600-CHECK: LDS_USHORT_READ_RET
+; R600-CHECK: LDS_USHORT_READ_RET
+; SI-CHECK-LABEL: @load_v2i16_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U16
+; SI-CHECK: DS_READ_U16
+define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
+entry:
+  %0 = load <2 x i16> addrspace(3)* %in
+  %1 = zext <2 x i16> %0 to <2 x i32>
+  store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i16_sext_local
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; SI-CHECK-LABEL: @load_v2i16_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I16
+; SI-CHECK: DS_READ_I16
+define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
+entry:
+  %0 = load <2 x i16> addrspace(3)* %in
+  %1 = sext <2 x i16> %0 to <2 x i32>
+  store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i16_local
+; R600-CHECK: LDS_USHORT_READ_RET
+; R600-CHECK: LDS_USHORT_READ_RET
+; R600-CHECK: LDS_USHORT_READ_RET
+; R600-CHECK: LDS_USHORT_READ_RET
+; SI-CHECK-LABEL: @load_v4i16_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U16
+; SI-CHECK: DS_READ_U16
+; SI-CHECK: DS_READ_U16
+; SI-CHECK: DS_READ_U16
+define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
+entry:
+  %0 = load <4 x i16> addrspace(3)* %in
+  %1 = zext <4 x i16> %0 to <4 x i32>
+  store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i16_sext_local
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; SI-CHECK-LABEL: @load_v4i16_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I16
+; SI-CHECK: DS_READ_I16
+; SI-CHECK: DS_READ_I16
+; SI-CHECK: DS_READ_I16
+define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
+entry:
+  %0 = load <4 x i16> addrspace(3)* %in
+  %1 = sext <4 x i16> %0 to <4 x i32>
+  store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; load an i32 value from the glocal address space.
+; R600-CHECK-LABEL: @load_i32_local
+; R600-CHECK: LDS_READ_RET
+; SI-CHECK-LABEL: @load_i32_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_B32
+define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+  %0 = load i32 addrspace(3)* %in
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; load a f32 value from the global address space.
+; R600-CHECK-LABEL: @load_f32_local
+; R600-CHECK: LDS_READ_RET
+; SI-CHECK-LABEL: @load_f32_local
+; SI-CHECK: DS_READ_B32
+define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) {
+entry:
+  %0 = load float addrspace(3)* %in
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; load a v2f32 value from the local address space
+; R600-CHECK-LABEL: @load_v2f32_local
+; R600-CHECK: LDS_READ_RET
+; R600-CHECK: LDS_READ_RET
+; SI-CHECK-LABEL: @load_v2f32_local
+; SI-CHECK: DS_READ_B32
+; SI-CHECK: DS_READ_B32
+define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
+entry:
+  %0 = load <2 x float> addrspace(3)* %in
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/load.vec.ll b/test/CodeGen/R600/load.vec.ll
index 8cba0b6..81a6310 100644
--- a/test/CodeGen/R600/load.vec.ll
+++ b/test/CodeGen/R600/load.vec.ll
@@ -1,11 +1,11 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK  %s
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK  %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK  %s
 
 ; load a v2i32 value from the global address space.
 ; EG-CHECK: @load_v2i32
 ; EG-CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
 ; SI-CHECK: @load_v2i32
-; SI-CHECK: BUFFER_LOAD_DWORDX2 VGPR{{[0-9]+}}
+; SI-CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
 define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %a = load <2 x i32> addrspace(1) * %in
   store <2 x i32> %a, <2 x i32> addrspace(1)* %out
@@ -16,7 +16,7 @@ define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 ; EG-CHECK: @load_v4i32
 ; EG-CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0
 ; SI-CHECK: @load_v4i32
-; SI-CHECK: BUFFER_LOAD_DWORDX4 VGPR{{[0-9]+}}
+; SI-CHECK: BUFFER_LOAD_DWORDX4 v[{{[0-9]+:[0-9]+}}]
 define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %a = load <4 x i32> addrspace(1) * %in
   store <4 x i32> %a, <4 x i32> addrspace(1)* %out
diff --git a/test/CodeGen/R600/load64.ll b/test/CodeGen/R600/load64.ll
index 3b4a8f8..e351e41 100644
--- a/test/CodeGen/R600/load64.ll
+++ b/test/CodeGen/R600/load64.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
 
 ; load a f64 value from the global address space.
 ; CHECK: @load_f64
-; CHECK: BUFFER_LOAD_DWORDX2 VGPR{{[0-9]+}}
+; CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
 define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
 entry:
   %0 = load double addrspace(1)* %in
@@ -12,7 +12,7 @@ entry:
 
 ; Load a f64 value from the constant address space.
 ; CHECK: @load_const_addrspace_f64
-; CHECK: S_LOAD_DWORDX2 SGPR{{[0-9]+}}
+; CHECK: S_LOAD_DWORDX2 s[{{[0-9]+:[0-9]+}}]
 define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) {
   %1 = load double addrspace(2)* %in
   store double %1, double addrspace(1)* %out
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index 6d3610e..e2d8406 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -1,27 +1,34 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; TODO: Add RUN and CHECK lines for SI once this test works there
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 @local_memory_two_objects.local_mem0 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
 @local_memory_two_objects.local_mem1 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
 
-; CHECK: @local_memory_two_objects
+; EG-CHECK: @local_memory_two_objects
 
 ; Check that the LDS size emitted correctly
-; CHECK: .long 166120
-; CHECK-NEXT: .long 8
-
-; Make sure the lds writes are using different addresses.
-; CHECK: LDS_WRITE {{[*]*}} {{PV|T}}[[ADDRW:[0-9]*\.[XYZW]]]
-; CHECK-NOT: LDS_WRITE {{[*]*}} T[[ADDRW]]
+; EG-CHECK: .long 166120
+; EG-CHECK-NEXT: .long 8
+; SI-CHECK: .long 47180
+; SI-CHECK-NEXT: .long 32768
+
+; We would like to check the the lds writes are using different
+; addresses, but due to variations in the scheduler, we can't do
+; this consistently on evergreen GPUs.
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; SI-CHECK: DS_WRITE_B32 0, {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
+; SI-CHECK-NOT: DS_WRITE_B32 0, {{v[0-9]*}}, v[[ADDRW]]
 
 ; GROUP_BARRIER must be the last instruction in a clause
-; CHECK: GROUP_BARRIER
-; CHECK-NEXT: ALU clause
+; EG-CHECK: GROUP_BARRIER
+; EG-CHECK-NEXT: ALU clause
 
 ; Make sure the lds reads are using different addresses.
-; CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
-; CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+; EG-CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; EG-CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, 0, [[ADDRR:v[0-9]+]]
+; SI-CHECK-NOT: DS_READ_B32 {{v[0-9]+}}, 0, [[ADDRR]]
 
 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
 entry:
diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
index 5458fb9..2168a3d 100644
--- a/test/CodeGen/R600/local-memory.ll
+++ b/test/CodeGen/R600/local-memory.ll
@@ -1,19 +1,24 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=CI-CHECK %s
 
-@local_memory.local_mem = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 4
+@local_memory.local_mem = internal addrspace(3) unnamed_addr global [128 x i32] zeroinitializer, align 4
 
-; EG-CHECK: @local_memory
-; SI-CHECK: @local_memory
+; EG-CHECK-LABEL: @local_memory
+; SI-CHECK-LABEL: @local_memory
+; CI-CHECK-LABEL: @local_memory
 
 ; Check that the LDS size emitted correctly
 ; EG-CHECK: .long 166120
-; EG-CHECK-NEXT: .long 16
+; EG-CHECK-NEXT: .long 128
 ; SI-CHECK: .long 47180
-; SI-CHECK-NEXT: .long 32768
+; SI-CHECK-NEXT: .long 65536
+; CI-CHECK: .long 47180
+; CI-CHECK-NEXT: .long 32768
 
 ; EG-CHECK: LDS_WRITE
-; SI-CHECK: DS_WRITE_B32
+; SI-CHECK_NOT: S_WQM_B64
+; SI-CHECK: DS_WRITE_B32 0
 
 ; GROUP_BARRIER must be the last instruction in a clause
 ; EG-CHECK: GROUP_BARRIER
@@ -21,18 +26,18 @@
 ; SI-CHECK: S_BARRIER
 
 ; EG-CHECK: LDS_READ_RET
-; SI-CHECK: DS_READ_B32
+; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, 0
 
 define void @local_memory(i32 addrspace(1)* %out) {
 entry:
   %y.i = call i32 @llvm.r600.read.tidig.x() #0
-  %arrayidx = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+  %arrayidx = getelementptr inbounds [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
   store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
   %add = add nsw i32 %y.i, 1
   %cmp = icmp eq i32 %add, 16
   %.add = select i1 %cmp, i32 0, i32 %add
   call void @llvm.AMDGPU.barrier.local()
-  %arrayidx1 = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+  %arrayidx1 = getelementptr inbounds [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
   %0 = load i32 addrspace(3)* %arrayidx1, align 4
   %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i32 %y.i
   store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
diff --git a/test/CodeGen/R600/lshl.ll b/test/CodeGen/R600/lshl.ll
index 806e681..2162839 100644
--- a/test/CodeGen/R600/lshl.ll
+++ b/test/CodeGen/R600/lshl.ll
@@ -1,6 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_LSHL_B32_e64 VGPR{{[0-9]}}, SGPR{{[0-9]}}, 1
+;CHECK: S_LSHL_B32 s{{[0-9]}}, s{{[0-9]}}, 1
 
 define void @test(i32 %p) {
    %i = mul i32 %p, 2
diff --git a/test/CodeGen/R600/lshr.ll b/test/CodeGen/R600/lshr.ll
index cfbcc34..886d1c4 100644
--- a/test/CodeGen/R600/lshr.ll
+++ b/test/CodeGen/R600/lshr.ll
@@ -1,6 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_LSHR_B32_e64 {{VGPR[0-9]}}, SGPR{{[0-9]}}, 1
+;CHECK: S_LSHR_B32 s{{[0-9]}}, s{{[0-9]}}, 1
 
 define void @test(i32 %p) {
    %i = udiv i32 %p, 2
diff --git a/test/CodeGen/R600/mad_int24.ll b/test/CodeGen/R600/mad_int24.ll
index ce42ae7..df063ec 100644
--- a/test/CodeGen/R600/mad_int24.ll
+++ b/test/CodeGen/R600/mad_int24.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
 ; EG-CHECK: @i32_mad24
 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll
index 00aa64a..66a070e 100644
--- a/test/CodeGen/R600/mad_uint24.ll
+++ b/test/CodeGen/R600/mad_uint24.ll
@@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; EG-CHECK: @u32_mad24
+; EG-CHECK-LABEL: @u32_mad24
 ; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X
-; SI-CHECK: @u32_mad24
+; SI-CHECK-LABEL: @u32_mad24
 ; SI-CHECK: V_MAD_U32_U24
 
 define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
@@ -19,7 +19,7 @@ entry:
   ret void
 }
 
-; EG-CHECK: @i16_mad24
+; EG-CHECK-LABEL: @i16_mad24
 ; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
 ; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
 ; EG-CHECK-DAG: VTX_READ_16 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
@@ -30,10 +30,10 @@ entry:
 ; EG-CHECK: 16
 ; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
 ; EG-CHECK: 16
-; SI-CHECK: @i16_mad24
-; SI-CHECK: V_MAD_U32_U24 [[MAD:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 16, [[MAD]]
-; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 16, [[LSHL]]
+; SI-CHECK-LABEL: @i16_mad24
+; SI-CHECK: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 16, [[MAD]]
+; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 16, [[LSHL]]
 
 define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
 entry:
@@ -44,7 +44,7 @@ entry:
   ret void
 }
 
-; EG-CHECK: @i8_mad24
+; EG-CHECK-LABEL: @i8_mad24
 ; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
 ; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
 ; EG-CHECK-DAG: VTX_READ_8 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
@@ -55,10 +55,10 @@ entry:
 ; EG-CHECK: 24
 ; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
 ; EG-CHECK: 24
-; SI-CHECK: @i8_mad24
-; SI-CHECK: V_MAD_U32_U24 [[MUL:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 24, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 24, [[LSHL]]
+; SI-CHECK-LABEL: @i8_mad24
+; SI-CHECK: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 24, [[MUL]]
+; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 24, [[LSHL]]
 
 define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
 entry:
diff --git a/test/CodeGen/R600/max-literals.ll b/test/CodeGen/R600/max-literals.ll
index c31b7c0..65a6d2b 100644
--- a/test/CodeGen/R600/max-literals.ll
+++ b/test/CodeGen/R600/max-literals.ll
@@ -3,13 +3,13 @@
 ; CHECK: @main
 ; CHECK: ADD *
 
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
 main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 7)
-  %4 = call float @llvm.R600.load.input(i32 8)
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = extractelement <4 x float> %reg1, i32 2
+  %3 = extractelement <4 x float> %reg1, i32 3
+  %4 = extractelement <4 x float> %reg2, i32 0
   %5 = fadd float %0, 2.0
   %6 = fadd float %1, 3.0
   %7 = fadd float %2, 4.0
@@ -32,13 +32,13 @@ main_body:
 ; CHECK: @main
 ; CHECK-NOT: ADD *
 
-define void @main2() #0 {
+define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
 main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 7)
-  %4 = call float @llvm.R600.load.input(i32 8)
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = extractelement <4 x float> %reg1, i32 2
+  %3 = extractelement <4 x float> %reg1, i32 3
+  %4 = extractelement <4 x float> %reg2, i32 0
   %5 = fadd float %0, 2.0
   %6 = fadd float %1, 3.0
   %7 = fadd float %2, 4.0
@@ -59,7 +59,6 @@ main_body:
 }
 
 ; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
 declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
 
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
index 18a17b6..8c27e28 100644
--- a/test/CodeGen/R600/mul.ll
+++ b/test/CodeGen/R600/mul.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ; mul24 and mad24 are affected
 
@@ -8,8 +8,8 @@
 ;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @test2
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -27,10 +27,10 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 ;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @test4
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/mul_int24.ll b/test/CodeGen/R600/mul_int24.ll
index 16ae760..66a1a9e 100644
--- a/test/CodeGen/R600/mul_int24.ll
+++ b/test/CodeGen/R600/mul_int24.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
 ; EG-CHECK: @i32_mul24
 ; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll
index b1a7f94..6e6d549 100644
--- a/test/CodeGen/R600/mul_uint24.ll
+++ b/test/CodeGen/R600/mul_uint24.ll
@@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; EG-CHECK: @u32_mul24
+; EG-CHECK-LABEL: @u32_mul24
 ; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI-CHECK: @u32_mul24
+; SI-CHECK-LABEL: @u32_mul24
 ; SI-CHECK: V_MUL_U32_U24
 
 define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
@@ -18,7 +18,7 @@ entry:
   ret void
 }
 
-; EG-CHECK: @i16_mul24
+; EG-CHECK-LABEL: @i16_mul24
 ; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
 ; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
 ; The order of A and B does not matter.
@@ -28,10 +28,10 @@ entry:
 ; EG-CHECK: 16
 ; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
 ; EG-CHECK: 16
-; SI-CHECK: @i16_mul24
-; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 16, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 16, [[LSHL]]
+; SI-CHECK-LABEL: @i16_mul24
+; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 16, [[MUL]]
+; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 16, [[LSHL]]
 
 define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
@@ -41,7 +41,7 @@ entry:
   ret void
 }
 
-; EG-CHECK: @i8_mul24
+; EG-CHECK-LABEL: @i8_mul24
 ; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
 ; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
 ; The order of A and B does not matter.
@@ -51,10 +51,10 @@ entry:
 ; EG-CHECK: 24
 ; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
 ; EG-CHECK: 24
-; SI-CHECK: @i8_mul24
-; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 24, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 24, [[LSHL]]
+; SI-CHECK-LABEL: @i8_mul24
+; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 24, [[MUL]]
+; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 24, [[LSHL]]
 
 define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
 entry:
diff --git a/test/CodeGen/R600/mulhu.ll b/test/CodeGen/R600/mulhu.ll
index eb379d1..d5fc014 100644
--- a/test/CodeGen/R600/mulhu.ll
+++ b/test/CodeGen/R600/mulhu.ll
@@ -1,8 +1,8 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_MOV_B32_e32 VGPR{{[0-9]+}}, -1431655765
-;CHECK: V_MUL_HI_U32 VGPR0, {{[SV]GPR[0-9]+}}, {{VGPR[0-9]+}}
-;CHECK-NEXT: V_LSHRREV_B32_e32 VGPR0, 1, VGPR0
+;CHECK: V_MOV_B32_e32 v{{[0-9]+}}, -1431655765
+;CHECK: V_MUL_HI_U32 v0, {{[sv][0-9]+}}, {{v[0-9]+}}
+;CHECK-NEXT: V_LSHRREV_B32_e32 v0, 1, v0
 
 define void @test(i32 %p) {
    %i = udiv i32 %p, 3
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index 4a4e892..35d23b3 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-; EG-CHECK: @or_v2i32
+; EG-CHECK-LABEL: @or_v2i32
 ; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @or_v2i32
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @or_v2i32
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@ define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in)
   ret void
 }
 
-; EG-CHECK: @or_v4i32
+; EG-CHECK-LABEL: @or_v4i32
 ; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @or_v4i32
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @or_v4i32
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -38,3 +38,16 @@ define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in)
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+; EG-CHECK-LABEL: @or_i64
+; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; SI-CHECK-LABEL: @or_i64
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
+define void @or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+	%0 = or i64 %a, %b
+	store i64 %0, i64 addrspace(1)* %out
+	ret void
+}
diff --git a/test/CodeGen/R600/predicate-dp4.ll b/test/CodeGen/R600/predicate-dp4.ll
new file mode 100644
index 0000000..e48d6a7
--- /dev/null
+++ b/test/CodeGen/R600/predicate-dp4.ll
@@ -0,0 +1,27 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman
+
+; CHECK-LABEL: @main
+; CHECK: PRED_SETE_INT * Pred,
+; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one
+define void @main(<4 x float> inreg) #0 {
+main_body:
+  %1 = extractelement <4 x float> %0, i32 0
+  %2 = bitcast float %1 to i32
+  %3 = icmp eq i32 %2, 0
+  br i1 %3, label %IF, label %ENDIF
+
+IF:                                             ; preds = %main_body
+  %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0)
+  br label %ENDIF
+
+ENDIF:                                            ; preds = %IF, %main_body
+  %5 = phi float [%4, %IF], [0.000000e+00, %main_body]
+  %6 = insertelement <4 x float> undef, float %5, i32 0
+  call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0)
+  ret void
+}
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+attributes #1 = { readnone }
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/predicates.ll b/test/CodeGen/R600/predicates.ll
index 0d3eeef..902508f 100644
--- a/test/CodeGen/R600/predicates.ll
+++ b/test/CodeGen/R600/predicates.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s
 
 ; These tests make sure the compiler is optimizing branches using predicates
 ; when it is legal to do so.
diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
new file mode 100644
index 0000000..48a013c
--- /dev/null
+++ b/test/CodeGen/R600/private-memory.ll
@@ -0,0 +1,115 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+
+; This test checks that uses and defs of the AR register happen in the same
+; instruction clause.
+
+; R600-CHECK-LABEL: @mova_same_clause
+; R600-CHECK: MOVA_INT
+; R600-CHECK-NOT: ALU clause
+; R600-CHECK: 0 + AR.x
+; R600-CHECK: MOVA_INT
+; R600-CHECK-NOT: ALU clause
+; R600-CHECK: 0 + AR.x
+
+; SI-CHECK-LABEL: @mova_same_clause
+; SI-CHECK: V_READFIRSTLANE
+; SI-CHECK: V_MOVRELD
+; SI-CHECK: S_CBRANCH
+; SI-CHECK: V_READFIRSTLANE
+; SI-CHECK: V_MOVRELD
+; SI-CHECK: S_CBRANCH
+define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %in, i32 1
+  %1 = load i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+; This test checks that the stack offset is calculated correctly for structs.
+; All register loads/stores should be optimized away, so there shouldn't be
+; any MOVA instructions.
+;
+; XXX: This generated code has unnecessary MOVs, we should be able to optimize
+; this.
+
+; R600-CHECK-LABEL: @multiple_structs
+; R600-CHECK-NOT: MOVA_INT
+; SI-CHECK-LABEL: @multiple_structs
+; SI-CHECK-NOT: V_MOVREL
+%struct.point = type { i32, i32 }
+
+define void @multiple_structs(i32 addrspace(1)* %out) {
+entry:
+  %a = alloca %struct.point
+  %b = alloca %struct.point
+  %a.x.ptr = getelementptr %struct.point* %a, i32 0, i32 0
+  %a.y.ptr = getelementptr %struct.point* %a, i32 0, i32 1
+  %b.x.ptr = getelementptr %struct.point* %b, i32 0, i32 0
+  %b.y.ptr = getelementptr %struct.point* %b, i32 0, i32 1
+  store i32 0, i32* %a.x.ptr
+  store i32 1, i32* %a.y.ptr
+  store i32 2, i32* %b.x.ptr
+  store i32 3, i32* %b.y.ptr
+  %a.indirect.ptr = getelementptr %struct.point* %a, i32 0, i32 0
+  %b.indirect.ptr = getelementptr %struct.point* %b, i32 0, i32 0
+  %a.indirect = load i32* %a.indirect.ptr
+  %b.indirect = load i32* %b.indirect.ptr
+  %0 = add i32 %a.indirect, %b.indirect
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; Test direct access of a private array inside a loop.  The private array
+; loads and stores should be lowered to copies, so there shouldn't be any
+; MOVA instructions.
+
+; R600-CHECK-LABLE: @direct_loop
+; R600-CHECK-NOT: MOVA_INT
+; SI-CHECK-LABEL: @direct_loop
+; SI-CHECK-NOT: V_MOVREL
+
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %prv_array_const = alloca [2 x i32]
+  %prv_array = alloca [2 x i32]
+  %a = load i32 addrspace(1)* %in
+  %b_src_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %b = load i32 addrspace(1)* %b_src_ptr
+  %a_dst_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 0
+  store i32 %a, i32* %a_dst_ptr
+  %b_dst_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 1
+  store i32 %b, i32* %b_dst_ptr
+  br label %for.body
+
+for.body:
+  %inc = phi i32 [0, %entry], [%count, %for.body]
+  %x_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 0
+  %x = load i32* %x_ptr
+  %y_ptr = getelementptr [2 x i32]* %prv_array, i32 0, i32 0
+  %y = load i32* %y_ptr
+  %xy = add i32 %x, %y
+  store i32 %xy, i32* %y_ptr
+  %count = add i32 %inc, 1
+  %done = icmp eq i32 %count, 4095
+  br i1 %done, label %for.end, label %for.body
+
+for.end:
+  %value_ptr = getelementptr [2 x i32]* %prv_array, i32 0, i32 0
+  %value = load i32* %value_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/pv-packing.ll b/test/CodeGen/R600/pv-packing.ll
index 03fc204..e5615b9 100644
--- a/test/CodeGen/R600/pv-packing.ll
+++ b/test/CodeGen/R600/pv-packing.ll
@@ -3,17 +3,17 @@
 ;CHECK: DOT4  T{{[0-9]\.X}}
 ;CHECK: MULADD_IEEE * T{{[0-9]\.W}}
 
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
 main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 8)
-  %4 = call float @llvm.R600.load.input(i32 9)
-  %5 = call float @llvm.R600.load.input(i32 10)
-  %6 = call float @llvm.R600.load.input(i32 12)
-  %7 = call float @llvm.R600.load.input(i32 13)
-  %8 = call float @llvm.R600.load.input(i32 14)
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = extractelement <4 x float> %reg1, i32 2
+  %3 = extractelement <4 x float> %reg2, i32 0
+  %4 = extractelement <4 x float> %reg2, i32 1
+  %5 = extractelement <4 x float> %reg2, i32 2
+  %6 = extractelement <4 x float> %reg3, i32 0
+  %7 = extractelement <4 x float> %reg3, i32 1
+  %8 = extractelement <4 x float> %reg3, i32 2
   %9 = load <4 x float> addrspace(8)* null
   %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9)
@@ -36,9 +36,6 @@ main_body:
 }
 
 ; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
-; Function Attrs: readnone
 declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
 
 
@@ -46,5 +43,3 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
 attributes #0 = { "ShaderType"="1" }
 attributes #1 = { readnone }
-attributes #2 = { readonly }
-attributes #3 = { nounwind readonly }
diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
index 6e0b744..5a930b2 100644
--- a/test/CodeGen/R600/pv.ll
+++ b/test/CodeGen/R600/pv.ll
@@ -1,38 +1,38 @@
 ; RUN: llc < %s -march=r600 | FileCheck %s
 
 ;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
-;CHECK: CNDGE T{{[0-9].[XYZW]}}, PV.X
+;CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
 
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
 main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 7)
-  %4 = call float @llvm.R600.load.input(i32 8)
-  %5 = call float @llvm.R600.load.input(i32 9)
-  %6 = call float @llvm.R600.load.input(i32 10)
-  %7 = call float @llvm.R600.load.input(i32 11)
-  %8 = call float @llvm.R600.load.input(i32 12)
-  %9 = call float @llvm.R600.load.input(i32 13)
-  %10 = call float @llvm.R600.load.input(i32 14)
-  %11 = call float @llvm.R600.load.input(i32 15)
-  %12 = call float @llvm.R600.load.input(i32 16)
-  %13 = call float @llvm.R600.load.input(i32 17)
-  %14 = call float @llvm.R600.load.input(i32 18)
-  %15 = call float @llvm.R600.load.input(i32 19)
-  %16 = call float @llvm.R600.load.input(i32 20)
-  %17 = call float @llvm.R600.load.input(i32 21)
-  %18 = call float @llvm.R600.load.input(i32 22)
-  %19 = call float @llvm.R600.load.input(i32 23)
-  %20 = call float @llvm.R600.load.input(i32 24)
-  %21 = call float @llvm.R600.load.input(i32 25)
-  %22 = call float @llvm.R600.load.input(i32 26)
-  %23 = call float @llvm.R600.load.input(i32 27)
-  %24 = call float @llvm.R600.load.input(i32 28)
-  %25 = call float @llvm.R600.load.input(i32 29)
-  %26 = call float @llvm.R600.load.input(i32 30)
-  %27 = call float @llvm.R600.load.input(i32 31)
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = extractelement <4 x float> %reg1, i32 2
+  %3 = extractelement <4 x float> %reg1, i32 3
+  %4 = extractelement <4 x float> %reg2, i32 0
+  %5 = extractelement <4 x float> %reg2, i32 1
+  %6 = extractelement <4 x float> %reg2, i32 2
+  %7 = extractelement <4 x float> %reg2, i32 3
+  %8 = extractelement <4 x float> %reg3, i32 0
+  %9 = extractelement <4 x float> %reg3, i32 1
+  %10 = extractelement <4 x float> %reg3, i32 2
+  %11 = extractelement <4 x float> %reg3, i32 3
+  %12 = extractelement <4 x float> %reg4, i32 0
+  %13 = extractelement <4 x float> %reg4, i32 1
+  %14 = extractelement <4 x float> %reg4, i32 2
+  %15 = extractelement <4 x float> %reg4, i32 3
+  %16 = extractelement <4 x float> %reg5, i32 0
+  %17 = extractelement <4 x float> %reg5, i32 1
+  %18 = extractelement <4 x float> %reg5, i32 2
+  %19 = extractelement <4 x float> %reg5, i32 3
+  %20 = extractelement <4 x float> %reg6, i32 0
+  %21 = extractelement <4 x float> %reg6, i32 1
+  %22 = extractelement <4 x float> %reg6, i32 2
+  %23 = extractelement <4 x float> %reg6, i32 3
+  %24 = extractelement <4 x float> %reg7, i32 0
+  %25 = extractelement <4 x float> %reg7, i32 1
+  %26 = extractelement <4 x float> %reg7, i32 2
+  %27 = extractelement <4 x float> %reg7, i32 3
   %28 = load <4 x float> addrspace(8)* null
   %29 = extractelement <4 x float> %28, i32 0
   %30 = fmul float %0, %29
@@ -219,9 +219,6 @@ main_body:
 }
 
 ; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
-; Function Attrs: readnone
 declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
 
 ; Function Attrs: readonly
diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll
index 6ef3c31..b760c88 100644
--- a/test/CodeGen/R600/r600-encoding.ll
+++ b/test/CodeGen/R600/r600-encoding.ll
@@ -10,15 +10,16 @@
 ; R600-CHECK: @test
 ; R600-CHECK: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
 
-define void @test() {
+define void @test(<4 x float> inreg %reg0) #0 {
 entry:
-  %0 = call float @llvm.R600.load.input(i32 0)
-  %1 = call float @llvm.R600.load.input(i32 1)
-  %2 = fmul float %0, %1
-  call void @llvm.AMDGPU.store.output(float %2, i32 0)
+  %r0 = extractelement <4 x float> %reg0, i32 0
+  %r1 = extractelement <4 x float> %reg0, i32 1
+  %r2 = fmul float %r0, %r1
+  %vec = insertelement <4 x float> undef, float %r2, i32 0
+  call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
   ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
-declare void @llvm.AMDGPU.store.output(float, i32)
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/r600-export-fix.ll b/test/CodeGen/R600/r600-export-fix.ll
new file mode 100644
index 0000000..73bc063
--- /dev/null
+++ b/test/CodeGen/R600/r600-export-fix.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s
+
+;CHECK:	EXPORT T{{[0-9]}}.XYZW
+;CHECK:	EXPORT T{{[0-9]}}.0000
+;CHECK: EXPORT T{{[0-9]}}.0000
+;CHECK: EXPORT T{{[0-9]}}.0XZW
+;CHECK: EXPORT T{{[0-9]}}.XYZW
+;CHECK: EXPORT T{{[0-9]}}.YX00
+;CHECK: EXPORT T{{[0-9]}}.0000
+;CHECK: EXPORT T{{[0-9]}}.0000
+
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+main_body:
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = extractelement <4 x float> %reg1, i32 2
+  %3 = extractelement <4 x float> %reg1, i32 3
+  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %5 = extractelement <4 x float> %4, i32 0
+  %6 = fmul float %5, %0
+  %7 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %8 = extractelement <4 x float> %7, i32 1
+  %9 = fmul float %8, %0
+  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %11 = extractelement <4 x float> %10, i32 2
+  %12 = fmul float %11, %0
+  %13 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %14 = extractelement <4 x float> %13, i32 3
+  %15 = fmul float %14, %0
+  %16 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %17 = extractelement <4 x float> %16, i32 0
+  %18 = fmul float %17, %1
+  %19 = fadd float %18, %6
+  %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %21 = extractelement <4 x float> %20, i32 1
+  %22 = fmul float %21, %1
+  %23 = fadd float %22, %9
+  %24 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %25 = extractelement <4 x float> %24, i32 2
+  %26 = fmul float %25, %1
+  %27 = fadd float %26, %12
+  %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %29 = extractelement <4 x float> %28, i32 3
+  %30 = fmul float %29, %1
+  %31 = fadd float %30, %15
+  %32 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %33 = extractelement <4 x float> %32, i32 0
+  %34 = fmul float %33, %2
+  %35 = fadd float %34, %19
+  %36 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %37 = extractelement <4 x float> %36, i32 1
+  %38 = fmul float %37, %2
+  %39 = fadd float %38, %23
+  %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %41 = extractelement <4 x float> %40, i32 2
+  %42 = fmul float %41, %2
+  %43 = fadd float %42, %27
+  %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+  %45 = extractelement <4 x float> %44, i32 3
+  %46 = fmul float %45, %2
+  %47 = fadd float %46, %31
+  %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %49 = extractelement <4 x float> %48, i32 0
+  %50 = fmul float %49, %3
+  %51 = fadd float %50, %35
+  %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %53 = extractelement <4 x float> %52, i32 1
+  %54 = fmul float %53, %3
+  %55 = fadd float %54, %39
+  %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %57 = extractelement <4 x float> %56, i32 2
+  %58 = fmul float %57, %3
+  %59 = fadd float %58, %43
+  %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %61 = extractelement <4 x float> %60, i32 3
+  %62 = fmul float %61, %3
+  %63 = fadd float %62, %47
+  %64 = load <4 x float> addrspace(8)* null
+  %65 = extractelement <4 x float> %64, i32 0
+  %66 = load <4 x float> addrspace(8)* null
+  %67 = extractelement <4 x float> %66, i32 1
+  %68 = load <4 x float> addrspace(8)* null
+  %69 = extractelement <4 x float> %68, i32 2
+  %70 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %71 = extractelement <4 x float> %70, i32 0
+  %72 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %73 = extractelement <4 x float> %72, i32 1
+  %74 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %75 = extractelement <4 x float> %74, i32 2
+  %76 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %77 = extractelement <4 x float> %76, i32 0
+  %78 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %79 = extractelement <4 x float> %78, i32 1
+  %80 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %81 = extractelement <4 x float> %80, i32 2
+  %82 = insertelement <4 x float> undef, float %51, i32 0
+  %83 = insertelement <4 x float> %82, float %55, i32 1
+  %84 = insertelement <4 x float> %83, float %59, i32 2
+  %85 = insertelement <4 x float> %84, float %63, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1)
+  %86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+  %87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1
+  %88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2
+  %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2)
+  %90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+  %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1
+  %92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2
+  %93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2)
+  %94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+  %95 = insertelement <4 x float> %94, float %65, i32 1
+  %96 = insertelement <4 x float> %95, float %67, i32 2
+  %97 = insertelement <4 x float> %96, float %69, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2)
+  %98 = insertelement <4 x float> undef, float %77, i32 0
+  %99 = insertelement <4 x float> %98, float %79, i32 1
+  %100 = insertelement <4 x float> %99, float %81, i32 2
+  %101 = insertelement <4 x float> %100, float %71, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2)
+  %102 = insertelement <4 x float> undef, float %73, i32 0
+  %103 = insertelement <4 x float> %102, float %75, i32 1
+  %104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2
+  %105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2)
+  %106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+  %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1
+  %108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2
+  %109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2)
+  %110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+  %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1
+  %112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2
+  %113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2)
+  ret void
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/R600/r600cfg.ll b/test/CodeGen/R600/r600cfg.ll
index 895ad5e..6dee3ef 100644
--- a/test/CodeGen/R600/r600cfg.ll
+++ b/test/CodeGen/R600/r600cfg.ll
@@ -1,12 +1,12 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood
 ;REQUIRES: asserts
 
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 7)
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = extractelement <4 x float> %reg1, i32 2
+  %3 = extractelement <4 x float> %reg1, i32 3
   %4 = bitcast float %0 to i32
   %5 = icmp eq i32 %4, 0
   %6 = sext i1 %5 to i32
@@ -113,12 +113,8 @@ ENDIF48:                                          ; preds = %LOOP47
   br label %LOOP47
 }
 
-; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
 declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32)
 
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
 attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone }
diff --git a/test/CodeGen/R600/reciprocal.ll b/test/CodeGen/R600/reciprocal.ll
index 2783929..b4ac47a 100644
--- a/test/CodeGen/R600/reciprocal.ll
+++ b/test/CodeGen/R600/reciprocal.ll
@@ -2,15 +2,14 @@
 
 ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test() {
-   %r0 = call float @llvm.R600.load.input(i32 0)
+define void @test(<4 x float> inreg %reg0) #0  {
+   %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = fdiv float 1.0, %r0
-   call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+   %vec = insertelement <4 x float> undef, float %r1, i32 0
+   call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
    ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
-declare void @llvm.AMDGPU.store.output(float, i32)
-
-declare float @llvm.AMDGPU.rcp(float ) readnone
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll
index 5c4c4e9..edf7aee 100644
--- a/test/CodeGen/R600/rotr.ll
+++ b/test/CodeGen/R600/rotr.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI -o - | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-; R600-CHECK: @rotr
+; R600-CHECK-LABEL: @rotr:
 ; R600-CHECK: BIT_ALIGN_INT
 
-; SI-CHECK: @rotr
+; SI-CHECK-LABEL: @rotr:
 ; SI-CHECK: V_ALIGNBIT_B32
 define void @rotr(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
@@ -16,14 +16,16 @@ entry:
   ret void
 }
 
-; R600-CHECK: @rotl
+; R600-CHECK-LABEL: @rotl:
 ; R600-CHECK: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
 ; R600-CHECK-NEXT: 32
-; R600-CHECK: BIT_ALIGN_INT {{\** T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}}
+; R600-CHECK: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}}
 
-; SI-CHECK: @rotl
-; SI-CHECK: V_SUB_I32_e64 [[DST:VGPR[0-9]+]], 32, {{[SV]GPR[0-9]+}}
-; SI-CHECK: V_ALIGNBIT_B32 {{VGPR[0-9]+, [SV]GPR[0-9]+, VGPR[0-9]+}}, [[DST]]
+
+; SI-CHECK-LABEL: @rotl:
+; SI-CHECK: S_SUB_I32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
+; SI-CHECK: V_MOV_B32_e32 [[VDST:v[0-9]+]], [[SDST]]
+; SI-CHECK: V_ALIGNBIT_B32 {{v[0-9]+, [s][0-9]+, v[0-9]+}}, [[VDST]]
 define void @rotl(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %0 = shl i32 %x, %y
diff --git a/test/CodeGen/R600/rv7x0_count3.ll b/test/CodeGen/R600/rv7x0_count3.ll
index 474d6ba..c3fd923 100644
--- a/test/CodeGen/R600/rv7x0_count3.ll
+++ b/test/CodeGen/R600/rv7x0_count3.ll
@@ -1,12 +1,12 @@
 ; RUN: llc < %s -march=r600 -show-mc-encoding  -mcpu=rv710 | FileCheck %s
 
-; CHECK: TEX 9 @4 ;  encoding: [0x04,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
+; CHECK: TEX 9 @6 ;  encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
 
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
-   %1 = call float @llvm.R600.load.input(i32 4)
-   %2 = call float @llvm.R600.load.input(i32 5)
-   %3 = call float @llvm.R600.load.input(i32 6)
-   %4 = call float @llvm.R600.load.input(i32 7)
+define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+   %1 = extractelement <4 x float> %reg1, i32 0
+   %2 = extractelement <4 x float> %reg1, i32 1
+   %3 = extractelement <4 x float> %reg1, i32 2
+   %4 = extractelement <4 x float> %reg1, i32 3
    %5 = insertelement <4 x float> undef, float %1, i32 0
    %6 = insertelement <4 x float> %5, float %2, i32 1
    %7 = insertelement <4 x float> %6, float %3, i32 2
@@ -36,9 +36,6 @@ define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in)
 
 declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
 
-; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
-
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-attributes #1 = { readnone }
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/R600/schedule-fs-loop-nested-if.ll b/test/CodeGen/R600/schedule-fs-loop-nested-if.ll
index ba9620c..11e8f51 100644
--- a/test/CodeGen/R600/schedule-fs-loop-nested-if.ll
+++ b/test/CodeGen/R600/schedule-fs-loop-nested-if.ll
@@ -1,12 +1,12 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
 ;REQUIRES: asserts
 
-define void @main() {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 {
 main_body:
-  %0 = call float @llvm.R600.interp.input(i32 0, i32 0)
-  %1 = call float @llvm.R600.interp.input(i32 1, i32 0)
-  %2 = call float @llvm.R600.interp.input(i32 2, i32 0)
-  %3 = call float @llvm.R600.interp.input(i32 3, i32 0)
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = extractelement <4 x float> %reg1, i32 2
+  %3 = extractelement <4 x float> %reg1, i32 3
   %4 = fcmp ult float %1, 0.000000e+00
   %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00
   %6 = fsub float -0.000000e+00, %5
@@ -74,10 +74,9 @@ ELSE17:                                           ; preds = %ELSE
   br label %ENDIF
 }
 
-declare float @llvm.R600.interp.input(i32, i32) #0
-
 declare float @llvm.AMDIL.clamp.(float, float, float) #0
 
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
 attributes #0 = { readnone }
+attributes #1 = { "ShaderType"="1" }
diff --git a/test/CodeGen/R600/schedule-fs-loop-nested.ll b/test/CodeGen/R600/schedule-fs-loop-nested.ll
index 5e875c4..b917ec6 100644
--- a/test/CodeGen/R600/schedule-fs-loop-nested.ll
+++ b/test/CodeGen/R600/schedule-fs-loop-nested.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
 ;REQUIRES: asserts
 
 define void @main() {
diff --git a/test/CodeGen/R600/schedule-fs-loop.ll b/test/CodeGen/R600/schedule-fs-loop.ll
index d142cac..d6c194b 100644
--- a/test/CodeGen/R600/schedule-fs-loop.ll
+++ b/test/CodeGen/R600/schedule-fs-loop.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
 ;REQUIRES: asserts
 
 define void @main() {
diff --git a/test/CodeGen/R600/schedule-if-2.ll b/test/CodeGen/R600/schedule-if-2.ll
index 6afd677..38aad18 100644
--- a/test/CodeGen/R600/schedule-if-2.ll
+++ b/test/CodeGen/R600/schedule-if-2.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
 ;REQUIRES: asserts
 
 define void @main() {
diff --git a/test/CodeGen/R600/schedule-if.ll b/test/CodeGen/R600/schedule-if.ll
index 347d92f..f960c93 100644
--- a/test/CodeGen/R600/schedule-if.ll
+++ b/test/CodeGen/R600/schedule-if.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
 ;REQUIRES: asserts
 
 define void @main() {
diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
index 44b7c2f..33b20d3 100644
--- a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
+++ b/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
@@ -1,12 +1,12 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
 ;REQUIRES: asserts
 
-define void @main() {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 7)
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = extractelement <4 x float> %reg1, i32 2
+  %3 = extractelement <4 x float> %reg1, i32 3
   %4 = fcmp ult float %0, 0.000000e+00
   %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00
   %6 = fsub float -0.000000e+00, %5
@@ -127,8 +127,6 @@ ENDIF19:                                          ; preds = %ENDIF16
   br label %LOOP
 }
 
-declare float @llvm.R600.load.input(i32) #0
-
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
-attributes #0 = { readnone }
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/R600/select.ll b/test/CodeGen/R600/select.ll
new file mode 100644
index 0000000..f940142
--- /dev/null
+++ b/test/CodeGen/R600/select.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Normally icmp + select is optimized to select_cc, when this happens the
+; DAGLegalizer never sees the select and doesn't have a chance to leaglize it.
+;
+; In order to avoid the select_cc optimization, this test case calculates the
+; condition for the select in a separate basic block.
+
+; CHECK-LABEL: @select
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
+                     <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
+                     <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,
+                     i32 %cond) {
+entry:
+  br label %for
+body:
+  %inc = add i32 %i, 1
+  %br_cmp.i = icmp eq i1 %br_cmp, 0
+  br label %for
+for:
+  %i = phi i32 [ %inc, %body], [ 0, %entry ]
+  %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ]
+  %0 = icmp eq i32 %cond, %i
+  %1 = select i1 %br_cmp, i32 2, i32 3
+  %2 = select i1 %br_cmp, float 2.0 , float 5.0
+  %3 = select i1 %br_cmp, <2 x i32> <i32 2, i32 3>, <2 x i32> <i32 4, i32 5>
+  %4 = select i1 %br_cmp, <2 x float> <float 2.0, float 3.0>, <2 x float> <float 4.0, float 5.0>
+  %5 = select i1 %br_cmp, <4 x i32> <i32 2 , i32 3, i32 4, i32 5>, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+  %6 = select i1 %br_cmp, <4 x float> <float 2.0, float 3.0, float 4.0, float 5.0>, <4 x float> <float 6.0, float 7.0, float 8.0, float 9.0>
+  br i1 %0, label %body, label %done
+
+done:
+  store i32 %1, i32 addrspace(1)* %i32out
+  store float %2, float addrspace(1)* %f32out
+  store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out
+  store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out
+  store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out
+  store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out
+  ret void
+}
diff --git a/test/CodeGen/R600/selectcc-cnd.ll b/test/CodeGen/R600/selectcc-cnd.ll
index d7287b4..0bfca69 100644
--- a/test/CodeGen/R600/selectcc-cnd.ll
+++ b/test/CodeGen/R600/selectcc-cnd.ll
@@ -1,8 +1,8 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ;CHECK-NOT: SETE
-;CHECK: CNDE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x,
-;CHECK-NEXT: {{[-0-9]+\(2.0}}
+;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x,
+;CHECK: 1073741824
 define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
   %1 = load float addrspace(1)* %in
   %2 = fcmp oeq float %1, 0.0
diff --git a/test/CodeGen/R600/selectcc-cnde-int.ll b/test/CodeGen/R600/selectcc-cnde-int.ll
index 768dc7d..d568888 100644
--- a/test/CodeGen/R600/selectcc-cnde-int.ll
+++ b/test/CodeGen/R600/selectcc-cnde-int.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
 
 ;CHECK-NOT: SETE_INT
-;CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x,
+;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x,
 ;CHECK-NEXT: 2
 define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %1 = load i32 addrspace(1)* %in
diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll
index 7e2d559..834c030 100644
--- a/test/CodeGen/R600/selectcc-opt.ll
+++ b/test/CodeGen/R600/selectcc-opt.ll
@@ -6,7 +6,7 @@
 
 define void @test_a(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp ult float %in, 0.000000e+00
+  %0 = fcmp olt float %in, 0.000000e+00
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
   %2 = fsub float -0.000000e+00, %1
   %3 = fptosi float %2 to i32
@@ -34,7 +34,7 @@ ENDIF:
 ; CHECK-NEXT: ALU clause starting
 define void @test_b(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp ult float %in, 0.0
+  %0 = fcmp olt float %in, 0.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
   %2 = fsub float -0.000000e+00, %1
   %3 = fptosi float %2 to i32
diff --git a/test/CodeGen/R600/set-dx10.ll b/test/CodeGen/R600/set-dx10.ll
index 291a7bd..5c7d499 100644
--- a/test/CodeGen/R600/set-dx10.ll
+++ b/test/CodeGen/R600/set-dx10.ll
@@ -5,7 +5,8 @@
 ; SET*DX10 instructions.
 
 ; CHECK: @fcmp_une_select_fptosi
-; CHECK: SETNE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -18,7 +19,8 @@ entry:
 }
 
 ; CHECK: @fcmp_une_select_i32
-; CHECK: SETNE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
@@ -28,12 +30,13 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_ueq_select_fptosi
-; CHECK: SETE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_oeq_select_fptosi
+; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ueq_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp ueq float %in, 5.0
+  %0 = fcmp oeq float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
   %2 = fsub float -0.000000e+00, %1
   %3 = fptosi float %2 to i32
@@ -41,23 +44,25 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_ueq_select_i32
-; CHECK: SETE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_oeq_select_i32
+; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ueq_select_i32(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp ueq float %in, 5.0
+  %0 = fcmp oeq float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
 
-; CHECK: @fcmp_ugt_select_fptosi
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_ogt_select_fptosi
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ugt_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp ugt float %in, 5.0
+  %0 = fcmp ogt float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
   %2 = fsub float -0.000000e+00, %1
   %3 = fptosi float %2 to i32
@@ -65,23 +70,25 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_ugt_select_i32
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_ogt_select_i32
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ugt_select_i32(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp ugt float %in, 5.0
+  %0 = fcmp ogt float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
 
-; CHECK: @fcmp_uge_select_fptosi
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_oge_select_fptosi
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_uge_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp uge float %in, 5.0
+  %0 = fcmp oge float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
   %2 = fsub float -0.000000e+00, %1
   %3 = fptosi float %2 to i32
@@ -89,23 +96,25 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_uge_select_i32
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_oge_select_i32
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_uge_select_i32(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp uge float %in, 5.0
+  %0 = fcmp oge float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
 
-; CHECK: @fcmp_ule_select_fptosi
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK: @fcmp_ole_select_fptosi
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ule_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp ule float %in, 5.0
+  %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
   %2 = fsub float -0.000000e+00, %1
   %3 = fptosi float %2 to i32
@@ -113,23 +122,25 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_ule_select_i32
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK: @fcmp_ole_select_i32
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ule_select_i32(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp ule float %in, 5.0
+  %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
   store i32 %1, i32 addrspace(1)* %out
   ret void
 }
 
-; CHECK: @fcmp_ult_select_fptosi
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK: @fcmp_olt_select_fptosi
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ult_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp ult float %in, 5.0
+  %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
   %2 = fsub float -0.000000e+00, %1
   %3 = fptosi float %2 to i32
@@ -137,12 +148,13 @@ entry:
   ret void
 }
 
-; CHECK: @fcmp_ult_select_i32
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK: @fcmp_olt_select_i32
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ult_select_i32(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
-  %0 = fcmp ult float %in, 5.0
+  %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
   store i32 %1, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 992de70..8d34c4a 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -1,8 +1,9 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
-; CHECK: @setcc_v2i32
-; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; FUNC-LABEL: @setcc_v2i32
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
 
 define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
   %result = icmp eq <2 x i32> %a, %b
@@ -11,11 +12,11 @@ define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %
   ret void
 }
 
-; CHECK: @setcc_v4i32
-; EG-CHECK-DAG: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: @setcc_v4i32
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -26,3 +27,307 @@ define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
   store <4 x i32> %sext, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+;;;==========================================================================;;;
+;; Float comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: @f32_oeq
+; R600: SETE_DX10
+; SI: V_CMP_EQ_F32
+define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp oeq float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_ogt
+; R600: SETGT_DX10
+; SI: V_CMP_GT_F32
+define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp ogt float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_oge
+; R600: SETGE_DX10
+; SI: V_CMP_GE_F32
+define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp oge float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_olt
+; R600: SETGT_DX10
+; SI: V_CMP_LT_F32
+define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp olt float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_ole
+; R600: SETGE_DX10
+; SI: V_CMP_LE_F32
+define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp ole float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_one
+; R600-DAG: SETE_DX10
+; R600-DAG: SETE_DX10
+; R600-DAG: AND_INT
+; R600-DAG: SETNE_DX10
+; R600-DAG: AND_INT
+; R600-DAG: SETNE_INT
+; SI: V_CMP_O_F32
+; SI: V_CMP_NEQ_F32
+; SI: S_AND_B64
+define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp one float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_ord
+; R600-DAG: SETE_DX10
+; R600-DAG: SETE_DX10
+; R600-DAG: AND_INT
+; R600-DAG: SETNE_INT
+; SI: V_CMP_O_F32
+define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp ord float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_ueq
+; R600-DAG: SETNE_DX10
+; R600-DAG: SETNE_DX10
+; R600-DAG: OR_INT
+; R600-DAG: SETE_DX10
+; R600-DAG: OR_INT
+; R600-DAG: SETNE_INT
+; SI: V_CMP_U_F32
+; SI: V_CMP_EQ_F32
+; SI: S_OR_B64
+define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp ueq float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_ugt
+; R600: SETGE
+; R600: SETE_DX10
+; SI: V_CMP_U_F32
+; SI: V_CMP_GT_F32
+; SI: S_OR_B64
+define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp ugt float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_uge
+; R600: SETGT
+; R600: SETE_DX10
+; SI: V_CMP_U_F32
+; SI: V_CMP_GE_F32
+; SI: S_OR_B64
+define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp uge float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_ult
+; R600: SETGE
+; R600: SETE_DX10
+; SI: V_CMP_U_F32
+; SI: V_CMP_LT_F32
+; SI: S_OR_B64
+define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp ult float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_ule
+; R600: SETGT
+; R600: SETE_DX10
+; SI: V_CMP_U_F32
+; SI: V_CMP_LE_F32
+; SI: S_OR_B64
+define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp ule float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_une
+; R600: SETNE_DX10
+; SI: V_CMP_NEQ_F32
+define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp une float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f32_uno
+; R600: SETNE_DX10
+; R600: SETNE_DX10
+; R600: OR_INT
+; R600: SETNE_INT
+; SI: V_CMP_U_F32
+define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fcmp uno float %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+;;;==========================================================================;;;
+;; 32-bit integer comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: @i32_eq
+; R600: SETE_INT
+; SI: V_CMP_EQ_I32
+define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = icmp eq i32 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i32_ne
+; R600: SETNE_INT
+; SI: V_CMP_NE_I32
+define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = icmp ne i32 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i32_ugt
+; R600: SETGT_UINT
+; SI: V_CMP_GT_U32
+define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = icmp ugt i32 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i32_uge
+; R600: SETGE_UINT
+; SI: V_CMP_GE_U32
+define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = icmp uge i32 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i32_ult
+; R600: SETGT_UINT
+; SI: V_CMP_LT_U32
+define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = icmp ult i32 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i32_ule
+; R600: SETGE_UINT
+; SI: V_CMP_LE_U32
+define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = icmp ule i32 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i32_sgt
+; R600: SETGT_INT
+; SI: V_CMP_GT_I32
+define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = icmp sgt i32 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i32_sge
+; R600: SETGE_INT
+; SI: V_CMP_GE_I32
+define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = icmp sge i32 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i32_slt
+; R600: SETGT_INT
+; SI: V_CMP_LT_I32
+define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = icmp slt i32 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i32_sle
+; R600: SETGE_INT
+; SI: V_CMP_LE_I32
+define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+  %0 = icmp sle i32 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/setcc64.ll b/test/CodeGen/R600/setcc64.ll
new file mode 100644
index 0000000..9202fc0
--- /dev/null
+++ b/test/CodeGen/R600/setcc64.ll
@@ -0,0 +1,263 @@
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; XXX: Merge this into setcc, once R600 supports 64-bit operations
+
+;;;==========================================================================;;;
+;; Double comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: @f64_oeq
+; SI: V_CMP_EQ_F64
+define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp oeq double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_ogt
+; SI: V_CMP_GT_F64
+define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp ogt double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_oge
+; SI: V_CMP_GE_F64
+define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp oge double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_olt
+; SI: V_CMP_LT_F64
+define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp olt double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_ole
+; SI: V_CMP_LE_F64
+define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp ole double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_one
+; SI: V_CMP_O_F64
+; SI: V_CMP_NEQ_F64
+; SI: S_AND_B64
+define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp one double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_ord
+; SI: V_CMP_O_F64
+define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp ord double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_ueq
+; SI: V_CMP_U_F64
+; SI: V_CMP_EQ_F64
+; SI: S_OR_B64
+define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp ueq double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_ugt
+; SI: V_CMP_U_F64
+; SI: V_CMP_GT_F64
+; SI: S_OR_B64
+define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp ugt double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_uge
+; SI: V_CMP_U_F64
+; SI: V_CMP_GE_F64
+; SI: S_OR_B64
+define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp uge double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_ult
+; SI: V_CMP_U_F64
+; SI: V_CMP_LT_F64
+; SI: S_OR_B64
+define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp ult double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_ule
+; SI: V_CMP_U_F64
+; SI: V_CMP_LE_F64
+; SI: S_OR_B64
+define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp ule double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_une
+; SI: V_CMP_NEQ_F64
+define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp une double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @f64_uno
+; SI: V_CMP_U_F64
+define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+  %0 = fcmp uno double %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+;;;==========================================================================;;;
+;; 64-bit integer comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: @i64_eq
+; SI: V_CMP_EQ_I64
+define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %0 = icmp eq i64 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i64_ne
+; SI: V_CMP_NE_I64
+define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %0 = icmp ne i64 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i64_ugt
+; SI: V_CMP_GT_U64
+define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %0 = icmp ugt i64 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i64_uge
+; SI: V_CMP_GE_U64
+define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %0 = icmp uge i64 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i64_ult
+; SI: V_CMP_LT_U64
+define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %0 = icmp ult i64 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i64_ule
+; SI: V_CMP_LE_U64
+define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %0 = icmp ule i64 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i64_sgt
+; SI: V_CMP_GT_I64
+define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %0 = icmp sgt i64 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i64_sge
+; SI: V_CMP_GE_I64
+define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %0 = icmp sge i64 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i64_slt
+; SI: V_CMP_LT_I64
+define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %0 = icmp slt i64 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @i64_sle
+; SI: V_CMP_LE_I64
+define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %0 = icmp sle i64 %a, %b
+  %1 = sext i1 %0 to i32
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll
index 19716f8..8633a4b 100644
--- a/test/CodeGen/R600/seto.ll
+++ b/test/CodeGen/R600/seto.ll
@@ -1,6 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_CMP_O_F32_e64 SGPR0_SGPR1, {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}, 0, 0, 0, 0
+;CHECK: V_CMP_O_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0, 0, 0
 
 define void @main(float %p) {
 main_body:
diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll
index 929dbb1..c77a37e 100644
--- a/test/CodeGen/R600/setuo.ll
+++ b/test/CodeGen/R600/setuo.ll
@@ -1,6 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
-;CHECK: V_CMP_U_F32_e64 SGPR0_SGPR1, {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}, 0, 0, 0, 0
+;CHECK: V_CMP_U_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0, 0, 0
 
 define void @main(float %p) {
 main_body:
diff --git a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
new file mode 100644
index 0000000..d74161b
--- /dev/null
+++ b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; Copy VGPR -> SGPR used twice as an instruction operand, which is then
+; used in an REG_SEQUENCE that also needs to be handled.
+
+; SI-LABEL: @test_dup_operands:
+; SI: V_ADD_I32_e32
+define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
+  %a = load <2 x i32> addrspace(1)* %in
+  %lo = extractelement <2 x i32> %a, i32 0
+  %hi = extractelement <2 x i32> %a, i32 1
+  %add = add i32 %lo, %lo
+  %vec0 = insertelement <2 x i32> undef, i32 %add, i32 0
+  %vec1 = insertelement <2 x i32> %vec0, i32 %hi, i32 1
+  store <2 x i32> %vec1, <2 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll
index b0d4549..5472c1b 100644
--- a/test/CodeGen/R600/sgpr-copy.ll
+++ b/test/CodeGen/R600/sgpr-copy.ll
@@ -1,15 +1,15 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=SI  | FileCheck %s
 
 ; This test checks that no VGPR to SGPR copies are created by the register
 ; allocator.
-; CHECK: @main
-; CHECK: S_BUFFER_LOAD_DWORD [[DST:SGPR[0-9]]], {{[SGPR_[0-9]+}}, 0
-; CHECK: V_MOV_B32_e32 VGPR{{[0-9]}}, [[DST]]
+; CHECK-LABEL: @phi1
+; CHECK: S_BUFFER_LOAD_DWORD [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0
+; CHECK: V_MOV_B32_e32 v{{[0-9]}}, [[DST]]
 
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !0
+  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
   %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
   %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
@@ -28,13 +28,133 @@ ENDIF:                                            ; preds = %main_body, %ELSE
   ret void
 }
 
+; Make sure this program doesn't crash
+; CHECK-LABEL: @phi2
+define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
+  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+  %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
+  %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36)
+  %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40)
+  %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48)
+  %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52)
+  %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56)
+  %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64)
+  %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68)
+  %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72)
+  %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76)
+  %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80)
+  %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84)
+  %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88)
+  %36 = call float @llvm.SI.load.const(<16 x i8> %21, i32 92)
+  %37 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
+  %38 = load <32 x i8> addrspace(2)* %37, !tbaa !1
+  %39 = getelementptr <16 x i8> addrspace(2)* %1, i32 0
+  %40 = load <16 x i8> addrspace(2)* %39, !tbaa !1
+  %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
+  %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
+  %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5)
+  %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5)
+  %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5)
+  %46 = bitcast float %41 to i32
+  %47 = bitcast float %42 to i32
+  %48 = insertelement <2 x i32> undef, i32 %46, i32 0
+  %49 = insertelement <2 x i32> %48, i32 %47, i32 1
+  %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2)
+  %51 = extractelement <4 x float> %50, i32 2
+  %52 = call float @fabs(float %51)
+  %53 = fmul float %43, %43
+  %54 = fmul float %44, %44
+  %55 = fadd float %54, %53
+  %56 = fmul float %45, %45
+  %57 = fadd float %55, %56
+  %58 = call float @llvm.AMDGPU.rsq(float %57)
+  %59 = fmul float %43, %58
+  %60 = fmul float %44, %58
+  %61 = fmul float %45, %58
+  %62 = fmul float %59, %23
+  %63 = fmul float %60, %24
+  %64 = fadd float %63, %62
+  %65 = fmul float %61, %25
+  %66 = fadd float %64, %65
+  %67 = fsub float -0.000000e+00, %26
+  %68 = fmul float %66, %52
+  %69 = fadd float %68, %67
+  %70 = fmul float %27, %69
+  %71 = fmul float %28, %69
+  %72 = call float @fabs(float %70)
+  %73 = fcmp olt float 0x3EE4F8B580000000, %72
+  %74 = sext i1 %73 to i32
+  %75 = bitcast i32 %74 to float
+  %76 = bitcast float %75 to i32
+  %77 = icmp ne i32 %76, 0
+  br i1 %77, label %IF, label %ENDIF
+
+IF:                                               ; preds = %main_body
+  %78 = fsub float -0.000000e+00, %70
+  %79 = call float @llvm.AMDIL.exp.(float %78)
+  %80 = fsub float -0.000000e+00, %79
+  %81 = fadd float 1.000000e+00, %80
+  %82 = fdiv float 1.000000e+00, %70
+  %83 = fmul float %81, %82
+  %84 = fmul float %32, %83
+  br label %ENDIF
+
+ENDIF:                                            ; preds = %main_body, %IF
+  %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ]
+  %85 = call float @fabs(float %71)
+  %86 = fcmp olt float 0x3EE4F8B580000000, %85
+  %87 = sext i1 %86 to i32
+  %88 = bitcast i32 %87 to float
+  %89 = bitcast float %88 to i32
+  %90 = icmp ne i32 %89, 0
+  br i1 %90, label %IF25, label %ENDIF24
+
+IF25:                                             ; preds = %ENDIF
+  %91 = fsub float -0.000000e+00, %71
+  %92 = call float @llvm.AMDIL.exp.(float %91)
+  %93 = fsub float -0.000000e+00, %92
+  %94 = fadd float 1.000000e+00, %93
+  %95 = fdiv float 1.000000e+00, %71
+  %96 = fmul float %94, %95
+  %97 = fmul float %36, %96
+  br label %ENDIF24
+
+ENDIF24:                                          ; preds = %ENDIF, %IF25
+  %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ]
+  %98 = fmul float %29, %temp4.0
+  %99 = fmul float %30, %temp4.0
+  %100 = fmul float %31, %temp4.0
+  %101 = fmul float %33, %temp8.0
+  %102 = fadd float %101, %98
+  %103 = fmul float %34, %temp8.0
+  %104 = fadd float %103, %99
+  %105 = fmul float %35, %temp8.0
+  %106 = fadd float %105, %100
+  %107 = call float @llvm.pow.f32(float %52, float %22)
+  %108 = fsub float -0.000000e+00, %102
+  %109 = fmul float %108, %107
+  %110 = fsub float -0.000000e+00, %104
+  %111 = fmul float %110, %107
+  %112 = fsub float -0.000000e+00, %106
+  %113 = fmul float %112, %107
+  %114 = call i32 @llvm.SI.packf16(float %109, float %111)
+  %115 = bitcast i32 %114 to float
+  %116 = call i32 @llvm.SI.packf16(float %113, float 1.000000e+00)
+  %117 = bitcast i32 %116 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117)
+  ret void
+}
+
 ; We just want ot make sure the program doesn't crash
-; CHECK: @loop
+; CHECK-LABEL: @loop
 
 define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 main_body:
   %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !0
+  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
   %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4)
   %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8)
@@ -79,6 +199,129 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
 attributes #0 = { "ShaderType"="0" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { readonly }
+attributes #3 = { readnone }
+attributes #4 = { nounwind readonly }
+
+!0 = metadata !{metadata !"const", null}
+!1 = metadata !{metadata !0, metadata !0, i64 0, i32 1}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq(float) #3
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.exp.(float) #3
+
+; Function Attrs: nounwind readonly
+declare float @llvm.pow.f32(float, float) #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+; This checks for a bug in the FixSGPRCopies pass where VReg96
+; registers were being identified as an SGPR regclass which was causing
+; an assertion failure.
+
+; CHECK-LABEL: @sample_v3
+; CHECK: IMAGE_SAMPLE
+; CHECK: IMAGE_SAMPLE
+; CHECK: EXP
+; CHECK: S_ENDPGM
+define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
 
-!0 = metadata !{metadata !"const", null, i32 1}
+entry:
+  %21 = getelementptr [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+  %22 = load <16 x i8> addrspace(2)* %21, !tbaa !2
+  %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16)
+  %24 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+  %25 = load <32 x i8> addrspace(2)* %24, !tbaa !2
+  %26 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+  %27 = load <16 x i8> addrspace(2)* %26, !tbaa !2
+  %28 = fcmp oeq float %23, 0.0
+  br i1 %28, label %if, label %else
+
+if:
+  %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> <i32 0, i32 0>, <32 x i8> %25, <16 x i8> %27, i32 2)
+  %val.if.0 = extractelement <4 x float> %val.if, i32 0
+  %val.if.1 = extractelement <4 x float> %val.if, i32 1
+  %val.if.2 = extractelement <4 x float> %val.if, i32 2
+  br label %endif
+
+else:
+  %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> <i32 1, i32 0>, <32 x i8> %25, <16 x i8> %27, i32 2)
+  %val.else.0 = extractelement <4 x float> %val.else, i32 0
+  %val.else.1 = extractelement <4 x float> %val.else, i32 1
+  %val.else.2 = extractelement <4 x float> %val.else, i32 2
+  br label %endif
+
+endif:
+  %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else]
+  %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else]
+  %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else]
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0)
+  ret void
+}
+
+!2 = metadata !{metadata !"const", null, i32 1}
+
+; CHECK-LABEL: @copy1
+; CHECK: BUFFER_LOAD_DWORD
+; CHECK: V_ADD
+; CHECK: S_ENDPGM
+define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
+entry:
+  %0 = load float addrspace(1)* %in0
+  %1 = fcmp oeq float %0, 0.0
+  br i1 %1, label %if0, label %endif
+
+if0:
+  %2 = bitcast float %0 to i32
+  %3 = fcmp olt float %0, 0.0
+  br i1 %3, label %if1, label %endif
+
+if1:
+  %4 = add i32 %2, 1
+  br label %endif
+
+endif:
+  %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ]
+  %6 = bitcast i32 %5 to float
+  store float %6, float addrspace(1)* %out
+  ret void
+}
+
+; This test is just checking that we don't crash / assertion fail.
+; CHECK-LABEL: @copy2
+; CHECK: S_ENDPGM
+
+define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+entry:
+  br label %LOOP68
+
+LOOP68:
+  %temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ]
+  %t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ]
+  %g = icmp eq i32 0, %t
+  %l = bitcast float %temp4.7 to i32
+  br i1 %g, label %IF70, label %ENDIF69
+
+IF70:
+  %q = icmp ne i32 %l, 13
+  %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+  ret void
+
+ENDIF69:
+  %u = add i32 %l, %t
+  %v = bitcast i32 %u to float
+  %x = add i32 %t, -1
+  br label %LOOP68
+}
+
+attributes #0 = { "ShaderType"="0" }
 
diff --git a/test/CodeGen/R600/shared-op-cycle.ll b/test/CodeGen/R600/shared-op-cycle.ll
new file mode 100644
index 0000000..0484fc9
--- /dev/null
+++ b/test/CodeGen/R600/shared-op-cycle.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @main
+; CHECK: MULADD_IEEE *
+; CHECK-NOT: MULADD_IEEE *
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+   %w0 = extractelement <4 x float> %reg0, i32 3
+   %w1 = extractelement <4 x float> %reg1, i32 3
+   %w2 = extractelement <4 x float> %reg2, i32 3
+   %sq0 = fmul float %w0, %w0
+   %r0 = fadd float %sq0, 2.0
+   %sq1 = fmul float %w1, %w1
+   %r1 = fadd float %sq1, 2.0
+   %sq2 = fmul float %w2, %w2
+   %r2 = fadd float %sq2, 2.0
+   %v0 = insertelement <4 x float> undef, float %r0, i32 0
+   %v1 = insertelement <4 x float> %v0, float %r1, i32 1
+   %v2 = insertelement <4 x float> %v1, float %r2, i32 2
+   %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2)
+   %vecres = insertelement <4 x float> undef, float %res, i32 0
+   call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2)
+   ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
+\ No newline at end of file
diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
index d99e325..4a6aab4 100644
--- a/test/CodeGen/R600/shl.ll
+++ b/test/CodeGen/R600/shl.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ;EG-CHECK: @shl_v2i32
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @shl_v2i32
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -25,10 +25,10 @@ define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
 ;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @shl_v4i32
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/short-args.ll b/test/CodeGen/R600/short-args.ll
deleted file mode 100644
index 20d0ae4..0000000
--- a/test/CodeGen/R600/short-args.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
-
-; EG-CHECK: @i8_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: BUFFER_LOAD_UBYTE
-
-define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
-entry:
-  %0 = zext i8 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; EG-CHECK: @i8_zext_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
-
-define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
-entry:
-  %0 = zext i8 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; EG-CHECK: @i8_sext_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
-
-define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
-entry:
-  %0 = sext i8 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; EG-CHECK: @i16_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: BUFFER_LOAD_USHORT
-
-define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
-entry:
-  %0 = zext i16 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; EG-CHECK: @i16_zext_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
-
-define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
-entry:
-  %0 = zext i16 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; EG-CHECK: @i16_sext_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
-
-define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
-entry:
-  %0 = sext i16 %in to i32
-  store i32 %0, i32 addrspace(1)* %out, align 4
-  ret void
-}
diff --git a/test/CodeGen/R600/si-annotate-cf-assertion.ll b/test/CodeGen/R600/si-annotate-cf-assertion.ll
new file mode 100644
index 0000000..9886fe9
--- /dev/null
+++ b/test/CodeGen/R600/si-annotate-cf-assertion.ll
@@ -0,0 +1,23 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=SI -asm-verbose=false < %s | FileCheck %s
+
+
+define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+; CHECK-LABEL: @test:
+
+entry:
+  switch i32 %x, label %sw.default [
+    i32 0, label %sw.bb
+    i32 60, label %sw.bb
+  ]
+
+sw.bb:
+  unreachable
+
+sw.default:
+  unreachable
+
+sw.epilog:
+  ret void
+}
+
diff --git a/test/CodeGen/R600/si-lod-bias.ll b/test/CodeGen/R600/si-lod-bias.ll
new file mode 100644
index 0000000..8d7a79c
--- /dev/null
+++ b/test/CodeGen/R600/si-lod-bias.ll
@@ -0,0 +1,51 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+; This shader has the potential to generated illegal VGPR to SGPR copies if
+; the wrong register class is used for the REG_SEQUENCE instructions.
+
+; CHECK: @main
+; CHECK: IMAGE_SAMPLE_B v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}}
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+  %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
+  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+  %23 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
+  %24 = load <32 x i8> addrspace(2)* %23, !tbaa !1
+  %25 = getelementptr <16 x i8> addrspace(2)* %1, i32 0
+  %26 = load <16 x i8> addrspace(2)* %25, !tbaa !1
+  %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
+  %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
+  %29 = bitcast float %22 to i32
+  %30 = bitcast float %27 to i32
+  %31 = bitcast float %28 to i32
+  %32 = insertelement <4 x i32> undef, i32 %29, i32 0
+  %33 = insertelement <4 x i32> %32, i32 %30, i32 1
+  %34 = insertelement <4 x i32> %33, i32 %31, i32 2
+  %35 = insertelement <4 x i32> %34, i32 undef, i32 3
+  %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2)
+  %37 = extractelement <4 x float> %36, i32 0
+  %38 = extractelement <4 x float> %36, i32 1
+  %39 = extractelement <4 x float> %36, i32 2
+  %40 = extractelement <4 x float> %36, i32 3
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+
+!0 = metadata !{metadata !"const", null}
+!1 = metadata !{metadata !0, metadata !0, i64 0, i32 1}
diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll
new file mode 100644
index 0000000..05c5e31
--- /dev/null
+++ b/test/CodeGen/R600/si-sgpr-spill.ll
@@ -0,0 +1,692 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
+
+; XXX: Enable when spilling is supported
+; XFAIL: *
+
+; These tests check that the compiler won't crash when it needs to spill
+; SGPRs.
+
+; CHECK-LABEL: @main
+; Writing to M0 from an SMRD instruction will hang the GPU.
+; CHECK-NOT: S_BUFFER_LOAD_DWORD m0
+; CHECK: S_ENDPGM
+@ddxy_lds = external addrspace(3) global [64 x i32]
+
+define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+  %21 = getelementptr [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+  %22 = load <16 x i8> addrspace(2)* %21, !tbaa !0
+  %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96)
+  %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100)
+  %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104)
+  %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112)
+  %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116)
+  %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120)
+  %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128)
+  %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132)
+  %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140)
+  %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144)
+  %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160)
+  %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176)
+  %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180)
+  %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184)
+  %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192)
+  %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196)
+  %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200)
+  %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208)
+  %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212)
+  %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216)
+  %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224)
+  %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240)
+  %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244)
+  %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248)
+  %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256)
+  %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272)
+  %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276)
+  %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280)
+  %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288)
+  %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292)
+  %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296)
+  %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304)
+  %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308)
+  %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312)
+  %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368)
+  %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372)
+  %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376)
+  %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384)
+  %61 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+  %62 = load <32 x i8> addrspace(2)* %61, !tbaa !0
+  %63 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+  %64 = load <16 x i8> addrspace(2)* %63, !tbaa !0
+  %65 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
+  %66 = load <32 x i8> addrspace(2)* %65, !tbaa !0
+  %67 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
+  %68 = load <16 x i8> addrspace(2)* %67, !tbaa !0
+  %69 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
+  %70 = load <32 x i8> addrspace(2)* %69, !tbaa !0
+  %71 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
+  %72 = load <16 x i8> addrspace(2)* %71, !tbaa !0
+  %73 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
+  %74 = load <32 x i8> addrspace(2)* %73, !tbaa !0
+  %75 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
+  %76 = load <16 x i8> addrspace(2)* %75, !tbaa !0
+  %77 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
+  %78 = load <32 x i8> addrspace(2)* %77, !tbaa !0
+  %79 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
+  %80 = load <16 x i8> addrspace(2)* %79, !tbaa !0
+  %81 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
+  %82 = load <32 x i8> addrspace(2)* %81, !tbaa !0
+  %83 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
+  %84 = load <16 x i8> addrspace(2)* %83, !tbaa !0
+  %85 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
+  %86 = load <32 x i8> addrspace(2)* %85, !tbaa !0
+  %87 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
+  %88 = load <16 x i8> addrspace(2)* %87, !tbaa !0
+  %89 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
+  %90 = load <32 x i8> addrspace(2)* %89, !tbaa !0
+  %91 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
+  %92 = load <16 x i8> addrspace(2)* %91, !tbaa !0
+  %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
+  %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
+  %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
+  %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6)
+  %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6)
+  %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6)
+  %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6)
+  %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6)
+  %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6)
+  %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6)
+  %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6)
+  %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6)
+  %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6)
+  %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6)
+  %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6)
+  %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
+  %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
+  %110 = call i32 @llvm.SI.tid()
+  %111 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110
+  %112 = bitcast float %93 to i32
+  store i32 %112, i32 addrspace(3)* %111
+  %113 = bitcast float %94 to i32
+  store i32 %113, i32 addrspace(3)* %111
+  %114 = call i32 @llvm.SI.tid()
+  %115 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %114
+  %116 = and i32 %114, -4
+  %117 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116
+  %118 = add i32 %116, 1
+  %119 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118
+  %120 = bitcast float %93 to i32
+  store i32 %120, i32 addrspace(3)* %115
+  %121 = load i32 addrspace(3)* %117
+  %122 = bitcast i32 %121 to float
+  %123 = load i32 addrspace(3)* %119
+  %124 = bitcast i32 %123 to float
+  %125 = fsub float %124, %122
+  %126 = bitcast float %94 to i32
+  store i32 %126, i32 addrspace(3)* %115
+  %127 = load i32 addrspace(3)* %117
+  %128 = bitcast i32 %127 to float
+  %129 = load i32 addrspace(3)* %119
+  %130 = bitcast i32 %129 to float
+  %131 = fsub float %130, %128
+  %132 = insertelement <4 x float> undef, float %125, i32 0
+  %133 = insertelement <4 x float> %132, float %131, i32 1
+  %134 = insertelement <4 x float> %133, float %131, i32 2
+  %135 = insertelement <4 x float> %134, float %131, i32 3
+  %136 = extractelement <4 x float> %135, i32 0
+  %137 = extractelement <4 x float> %135, i32 1
+  %138 = fmul float %60, %93
+  %139 = fmul float %60, %94
+  %140 = fmul float %60, %94
+  %141 = fmul float %60, %94
+  %142 = call i32 @llvm.SI.tid()
+  %143 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142
+  %144 = bitcast float %138 to i32
+  store i32 %144, i32 addrspace(3)* %143
+  %145 = bitcast float %139 to i32
+  store i32 %145, i32 addrspace(3)* %143
+  %146 = bitcast float %140 to i32
+  store i32 %146, i32 addrspace(3)* %143
+  %147 = bitcast float %141 to i32
+  store i32 %147, i32 addrspace(3)* %143
+  %148 = call i32 @llvm.SI.tid()
+  %149 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148
+  %150 = and i32 %148, -4
+  %151 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150
+  %152 = add i32 %150, 2
+  %153 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152
+  %154 = bitcast float %138 to i32
+  store i32 %154, i32 addrspace(3)* %149
+  %155 = load i32 addrspace(3)* %151
+  %156 = bitcast i32 %155 to float
+  %157 = load i32 addrspace(3)* %153
+  %158 = bitcast i32 %157 to float
+  %159 = fsub float %158, %156
+  %160 = bitcast float %139 to i32
+  store i32 %160, i32 addrspace(3)* %149
+  %161 = load i32 addrspace(3)* %151
+  %162 = bitcast i32 %161 to float
+  %163 = load i32 addrspace(3)* %153
+  %164 = bitcast i32 %163 to float
+  %165 = fsub float %164, %162
+  %166 = bitcast float %140 to i32
+  store i32 %166, i32 addrspace(3)* %149
+  %167 = load i32 addrspace(3)* %151
+  %168 = bitcast i32 %167 to float
+  %169 = load i32 addrspace(3)* %153
+  %170 = bitcast i32 %169 to float
+  %171 = fsub float %170, %168
+  %172 = bitcast float %141 to i32
+  store i32 %172, i32 addrspace(3)* %149
+  %173 = load i32 addrspace(3)* %151
+  %174 = bitcast i32 %173 to float
+  %175 = load i32 addrspace(3)* %153
+  %176 = bitcast i32 %175 to float
+  %177 = fsub float %176, %174
+  %178 = insertelement <4 x float> undef, float %159, i32 0
+  %179 = insertelement <4 x float> %178, float %165, i32 1
+  %180 = insertelement <4 x float> %179, float %171, i32 2
+  %181 = insertelement <4 x float> %180, float %177, i32 3
+  %182 = extractelement <4 x float> %181, i32 0
+  %183 = extractelement <4 x float> %181, i32 1
+  %184 = fdiv float 1.000000e+00, %97
+  %185 = fmul float %33, %184
+  %186 = fcmp uge float 1.000000e+00, %185
+  %187 = select i1 %186, float %185, float 1.000000e+00
+  %188 = fmul float %187, %30
+  %189 = call float @ceil(float %188)
+  %190 = fcmp uge float 3.000000e+00, %189
+  %191 = select i1 %190, float 3.000000e+00, float %189
+  %192 = fdiv float 1.000000e+00, %191
+  %193 = fdiv float 1.000000e+00, %30
+  %194 = fmul float %191, %193
+  %195 = fmul float %31, %194
+  %196 = fmul float %95, %95
+  %197 = fmul float %96, %96
+  %198 = fadd float %197, %196
+  %199 = fmul float %97, %97
+  %200 = fadd float %198, %199
+  %201 = call float @llvm.AMDGPU.rsq(float %200)
+  %202 = fmul float %95, %201
+  %203 = fmul float %96, %201
+  %204 = fmul float %202, %29
+  %205 = fmul float %203, %29
+  %206 = fmul float %204, -1.000000e+00
+  %207 = fmul float %205, 1.000000e+00
+  %208 = fmul float %206, %32
+  %209 = fmul float %207, %32
+  %210 = fsub float -0.000000e+00, %208
+  %211 = fadd float %93, %210
+  %212 = fsub float -0.000000e+00, %209
+  %213 = fadd float %94, %212
+  %214 = fmul float %206, %192
+  %215 = fmul float %207, %192
+  %216 = fmul float -1.000000e+00, %192
+  %217 = bitcast float %136 to i32
+  %218 = bitcast float %182 to i32
+  %219 = bitcast float %137 to i32
+  %220 = bitcast float %183 to i32
+  %221 = insertelement <8 x i32> undef, i32 %217, i32 0
+  %222 = insertelement <8 x i32> %221, i32 %218, i32 1
+  %223 = insertelement <8 x i32> %222, i32 %219, i32 2
+  %224 = insertelement <8 x i32> %223, i32 %220, i32 3
+  br label %LOOP
+
+LOOP:                                             ; preds = %ENDIF, %main_body
+  %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ]
+  %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ]
+  %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ]
+  %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ]
+  %225 = fcmp oge float %temp24.0, %191
+  %226 = sext i1 %225 to i32
+  %227 = bitcast i32 %226 to float
+  %228 = bitcast float %227 to i32
+  %229 = icmp ne i32 %228, 0
+  br i1 %229, label %IF, label %ENDIF
+
+IF:                                               ; preds = %LOOP
+  %230 = bitcast float %136 to i32
+  %231 = bitcast float %182 to i32
+  %232 = bitcast float %137 to i32
+  %233 = bitcast float %183 to i32
+  %234 = insertelement <8 x i32> undef, i32 %230, i32 0
+  %235 = insertelement <8 x i32> %234, i32 %231, i32 1
+  %236 = insertelement <8 x i32> %235, i32 %232, i32 2
+  %237 = insertelement <8 x i32> %236, i32 %233, i32 3
+  br label %LOOP65
+
+ENDIF:                                            ; preds = %LOOP
+  %238 = bitcast float %temp28.0 to i32
+  %239 = bitcast float %temp29.0 to i32
+  %240 = insertelement <8 x i32> %224, i32 %238, i32 4
+  %241 = insertelement <8 x i32> %240, i32 %239, i32 5
+  %242 = insertelement <8 x i32> %241, i32 undef, i32 6
+  %243 = insertelement <8 x i32> %242, i32 undef, i32 7
+  %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2)
+  %245 = extractelement <4 x float> %244, i32 3
+  %246 = fcmp oge float %temp30.0, %245
+  %247 = sext i1 %246 to i32
+  %248 = bitcast i32 %247 to float
+  %249 = bitcast float %248 to i32
+  %250 = and i32 %249, 1065353216
+  %251 = bitcast i32 %250 to float
+  %252 = fmul float %214, %251
+  %253 = fadd float %252, %temp28.0
+  %254 = fmul float %215, %251
+  %255 = fadd float %254, %temp29.0
+  %256 = fmul float %216, %251
+  %257 = fadd float %256, %temp30.0
+  %258 = fadd float %temp24.0, 1.000000e+00
+  br label %LOOP
+
+LOOP65:                                           ; preds = %ENDIF66, %IF
+  %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ]
+  %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ]
+  %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ]
+  %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ]
+  %temp32.0 = phi float [ 1.000000e+00, %IF ], [ %611, %ENDIF66 ]
+  %259 = fcmp oge float %temp24.1, %195
+  %260 = sext i1 %259 to i32
+  %261 = bitcast i32 %260 to float
+  %262 = bitcast float %261 to i32
+  %263 = icmp ne i32 %262, 0
+  br i1 %263, label %IF67, label %ENDIF66
+
+IF67:                                             ; preds = %LOOP65
+  %264 = bitcast float %136 to i32
+  %265 = bitcast float %182 to i32
+  %266 = bitcast float %137 to i32
+  %267 = bitcast float %183 to i32
+  %268 = bitcast float %temp28.1 to i32
+  %269 = bitcast float %temp29.1 to i32
+  %270 = insertelement <8 x i32> undef, i32 %264, i32 0
+  %271 = insertelement <8 x i32> %270, i32 %265, i32 1
+  %272 = insertelement <8 x i32> %271, i32 %266, i32 2
+  %273 = insertelement <8 x i32> %272, i32 %267, i32 3
+  %274 = insertelement <8 x i32> %273, i32 %268, i32 4
+  %275 = insertelement <8 x i32> %274, i32 %269, i32 5
+  %276 = insertelement <8 x i32> %275, i32 undef, i32 6
+  %277 = insertelement <8 x i32> %276, i32 undef, i32 7
+  %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2)
+  %279 = extractelement <4 x float> %278, i32 0
+  %280 = extractelement <4 x float> %278, i32 1
+  %281 = extractelement <4 x float> %278, i32 2
+  %282 = extractelement <4 x float> %278, i32 3
+  %283 = fmul float %282, %47
+  %284 = bitcast float %136 to i32
+  %285 = bitcast float %182 to i32
+  %286 = bitcast float %137 to i32
+  %287 = bitcast float %183 to i32
+  %288 = bitcast float %temp28.1 to i32
+  %289 = bitcast float %temp29.1 to i32
+  %290 = insertelement <8 x i32> undef, i32 %284, i32 0
+  %291 = insertelement <8 x i32> %290, i32 %285, i32 1
+  %292 = insertelement <8 x i32> %291, i32 %286, i32 2
+  %293 = insertelement <8 x i32> %292, i32 %287, i32 3
+  %294 = insertelement <8 x i32> %293, i32 %288, i32 4
+  %295 = insertelement <8 x i32> %294, i32 %289, i32 5
+  %296 = insertelement <8 x i32> %295, i32 undef, i32 6
+  %297 = insertelement <8 x i32> %296, i32 undef, i32 7
+  %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2)
+  %299 = extractelement <4 x float> %298, i32 0
+  %300 = extractelement <4 x float> %298, i32 1
+  %301 = extractelement <4 x float> %298, i32 2
+  %302 = bitcast float %136 to i32
+  %303 = bitcast float %182 to i32
+  %304 = bitcast float %137 to i32
+  %305 = bitcast float %183 to i32
+  %306 = bitcast float %temp28.1 to i32
+  %307 = bitcast float %temp29.1 to i32
+  %308 = insertelement <8 x i32> undef, i32 %302, i32 0
+  %309 = insertelement <8 x i32> %308, i32 %303, i32 1
+  %310 = insertelement <8 x i32> %309, i32 %304, i32 2
+  %311 = insertelement <8 x i32> %310, i32 %305, i32 3
+  %312 = insertelement <8 x i32> %311, i32 %306, i32 4
+  %313 = insertelement <8 x i32> %312, i32 %307, i32 5
+  %314 = insertelement <8 x i32> %313, i32 undef, i32 6
+  %315 = insertelement <8 x i32> %314, i32 undef, i32 7
+  %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2)
+  %317 = extractelement <4 x float> %316, i32 0
+  %318 = extractelement <4 x float> %316, i32 1
+  %319 = extractelement <4 x float> %316, i32 2
+  %320 = fmul float %317, %23
+  %321 = fmul float %318, %24
+  %322 = fmul float %319, %25
+  %323 = fmul float %299, %26
+  %324 = fadd float %323, %320
+  %325 = fmul float %300, %27
+  %326 = fadd float %325, %321
+  %327 = fmul float %301, %28
+  %328 = fadd float %327, %322
+  %329 = fadd float %279, %324
+  %330 = fadd float %280, %326
+  %331 = fadd float %281, %328
+  %332 = bitcast float %136 to i32
+  %333 = bitcast float %182 to i32
+  %334 = bitcast float %137 to i32
+  %335 = bitcast float %183 to i32
+  %336 = bitcast float %temp28.1 to i32
+  %337 = bitcast float %temp29.1 to i32
+  %338 = insertelement <8 x i32> undef, i32 %332, i32 0
+  %339 = insertelement <8 x i32> %338, i32 %333, i32 1
+  %340 = insertelement <8 x i32> %339, i32 %334, i32 2
+  %341 = insertelement <8 x i32> %340, i32 %335, i32 3
+  %342 = insertelement <8 x i32> %341, i32 %336, i32 4
+  %343 = insertelement <8 x i32> %342, i32 %337, i32 5
+  %344 = insertelement <8 x i32> %343, i32 undef, i32 6
+  %345 = insertelement <8 x i32> %344, i32 undef, i32 7
+  %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2)
+  %347 = extractelement <4 x float> %346, i32 0
+  %348 = extractelement <4 x float> %346, i32 1
+  %349 = extractelement <4 x float> %346, i32 2
+  %350 = fadd float %347, -5.000000e-01
+  %351 = fadd float %348, -5.000000e-01
+  %352 = fadd float %349, -5.000000e-01
+  %353 = fmul float %350, %350
+  %354 = fmul float %351, %351
+  %355 = fadd float %354, %353
+  %356 = fmul float %352, %352
+  %357 = fadd float %355, %356
+  %358 = call float @llvm.AMDGPU.rsq(float %357)
+  %359 = fmul float %350, %358
+  %360 = fmul float %351, %358
+  %361 = fmul float %352, %358
+  %362 = bitcast float %136 to i32
+  %363 = bitcast float %182 to i32
+  %364 = bitcast float %137 to i32
+  %365 = bitcast float %183 to i32
+  %366 = bitcast float %temp28.1 to i32
+  %367 = bitcast float %temp29.1 to i32
+  %368 = insertelement <8 x i32> undef, i32 %362, i32 0
+  %369 = insertelement <8 x i32> %368, i32 %363, i32 1
+  %370 = insertelement <8 x i32> %369, i32 %364, i32 2
+  %371 = insertelement <8 x i32> %370, i32 %365, i32 3
+  %372 = insertelement <8 x i32> %371, i32 %366, i32 4
+  %373 = insertelement <8 x i32> %372, i32 %367, i32 5
+  %374 = insertelement <8 x i32> %373, i32 undef, i32 6
+  %375 = insertelement <8 x i32> %374, i32 undef, i32 7
+  %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2)
+  %377 = extractelement <4 x float> %376, i32 0
+  %378 = extractelement <4 x float> %376, i32 1
+  %379 = extractelement <4 x float> %376, i32 2
+  %380 = extractelement <4 x float> %376, i32 3
+  %381 = fsub float -0.000000e+00, %95
+  %382 = fsub float -0.000000e+00, %96
+  %383 = fsub float -0.000000e+00, %97
+  %384 = fmul float %359, %381
+  %385 = fmul float %360, %382
+  %386 = fadd float %385, %384
+  %387 = fmul float %361, %383
+  %388 = fadd float %386, %387
+  %389 = fmul float %388, %359
+  %390 = fmul float %388, %360
+  %391 = fmul float %388, %361
+  %392 = fmul float 2.000000e+00, %389
+  %393 = fmul float 2.000000e+00, %390
+  %394 = fmul float 2.000000e+00, %391
+  %395 = fsub float -0.000000e+00, %392
+  %396 = fadd float %381, %395
+  %397 = fsub float -0.000000e+00, %393
+  %398 = fadd float %382, %397
+  %399 = fsub float -0.000000e+00, %394
+  %400 = fadd float %383, %399
+  %401 = fmul float %396, %98
+  %402 = fmul float %396, %99
+  %403 = fmul float %396, %100
+  %404 = fmul float %398, %101
+  %405 = fadd float %404, %401
+  %406 = fmul float %398, %102
+  %407 = fadd float %406, %402
+  %408 = fmul float %398, %103
+  %409 = fadd float %408, %403
+  %410 = fmul float %400, %104
+  %411 = fadd float %410, %405
+  %412 = fmul float %400, %105
+  %413 = fadd float %412, %407
+  %414 = fmul float %400, %106
+  %415 = fadd float %414, %409
+  %416 = bitcast float %136 to i32
+  %417 = bitcast float %182 to i32
+  %418 = bitcast float %137 to i32
+  %419 = bitcast float %183 to i32
+  %420 = bitcast float %temp28.1 to i32
+  %421 = bitcast float %temp29.1 to i32
+  %422 = insertelement <8 x i32> undef, i32 %416, i32 0
+  %423 = insertelement <8 x i32> %422, i32 %417, i32 1
+  %424 = insertelement <8 x i32> %423, i32 %418, i32 2
+  %425 = insertelement <8 x i32> %424, i32 %419, i32 3
+  %426 = insertelement <8 x i32> %425, i32 %420, i32 4
+  %427 = insertelement <8 x i32> %426, i32 %421, i32 5
+  %428 = insertelement <8 x i32> %427, i32 undef, i32 6
+  %429 = insertelement <8 x i32> %428, i32 undef, i32 7
+  %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2)
+  %431 = extractelement <4 x float> %430, i32 0
+  %432 = extractelement <4 x float> %430, i32 1
+  %433 = extractelement <4 x float> %430, i32 2
+  %434 = fmul float %48, %411
+  %435 = fmul float %49, %411
+  %436 = fmul float %50, %411
+  %437 = fmul float %51, %413
+  %438 = fadd float %437, %434
+  %439 = fmul float %52, %413
+  %440 = fadd float %439, %435
+  %441 = fmul float %53, %413
+  %442 = fadd float %441, %436
+  %443 = fmul float %54, %415
+  %444 = fadd float %443, %438
+  %445 = fmul float %55, %415
+  %446 = fadd float %445, %440
+  %447 = fmul float %56, %415
+  %448 = fadd float %447, %442
+  %449 = insertelement <4 x float> undef, float %444, i32 0
+  %450 = insertelement <4 x float> %449, float %446, i32 1
+  %451 = insertelement <4 x float> %450, float %448, i32 2
+  %452 = insertelement <4 x float> %451, float %195, i32 3
+  %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452)
+  %454 = extractelement <4 x float> %453, i32 0
+  %455 = extractelement <4 x float> %453, i32 1
+  %456 = extractelement <4 x float> %453, i32 2
+  %457 = extractelement <4 x float> %453, i32 3
+  %458 = call float @fabs(float %456)
+  %459 = fdiv float 1.000000e+00, %458
+  %460 = fmul float %454, %459
+  %461 = fadd float %460, 1.500000e+00
+  %462 = fmul float %455, %459
+  %463 = fadd float %462, 1.500000e+00
+  %464 = bitcast float %463 to i32
+  %465 = bitcast float %461 to i32
+  %466 = bitcast float %457 to i32
+  %467 = insertelement <4 x i32> undef, i32 %464, i32 0
+  %468 = insertelement <4 x i32> %467, i32 %465, i32 1
+  %469 = insertelement <4 x i32> %468, i32 %466, i32 2
+  %470 = insertelement <4 x i32> %469, i32 undef, i32 3
+  %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4)
+  %472 = extractelement <4 x float> %471, i32 0
+  %473 = extractelement <4 x float> %471, i32 1
+  %474 = extractelement <4 x float> %471, i32 2
+  %475 = fmul float %431, %472
+  %476 = fadd float %475, %329
+  %477 = fmul float %432, %473
+  %478 = fadd float %477, %330
+  %479 = fmul float %433, %474
+  %480 = fadd float %479, %331
+  %481 = fmul float %107, %107
+  %482 = fmul float %108, %108
+  %483 = fadd float %482, %481
+  %484 = fmul float %109, %109
+  %485 = fadd float %483, %484
+  %486 = call float @llvm.AMDGPU.rsq(float %485)
+  %487 = fmul float %107, %486
+  %488 = fmul float %108, %486
+  %489 = fmul float %109, %486
+  %490 = fmul float %377, %40
+  %491 = fmul float %378, %41
+  %492 = fmul float %379, %42
+  %493 = fmul float %359, %487
+  %494 = fmul float %360, %488
+  %495 = fadd float %494, %493
+  %496 = fmul float %361, %489
+  %497 = fadd float %495, %496
+  %498 = fmul float %497, %359
+  %499 = fmul float %497, %360
+  %500 = fmul float %497, %361
+  %501 = fmul float 2.000000e+00, %498
+  %502 = fmul float 2.000000e+00, %499
+  %503 = fmul float 2.000000e+00, %500
+  %504 = fsub float -0.000000e+00, %501
+  %505 = fadd float %487, %504
+  %506 = fsub float -0.000000e+00, %502
+  %507 = fadd float %488, %506
+  %508 = fsub float -0.000000e+00, %503
+  %509 = fadd float %489, %508
+  %510 = fmul float %95, %95
+  %511 = fmul float %96, %96
+  %512 = fadd float %511, %510
+  %513 = fmul float %97, %97
+  %514 = fadd float %512, %513
+  %515 = call float @llvm.AMDGPU.rsq(float %514)
+  %516 = fmul float %95, %515
+  %517 = fmul float %96, %515
+  %518 = fmul float %97, %515
+  %519 = fmul float %505, %516
+  %520 = fmul float %507, %517
+  %521 = fadd float %520, %519
+  %522 = fmul float %509, %518
+  %523 = fadd float %521, %522
+  %524 = fsub float -0.000000e+00, %523
+  %525 = fcmp uge float %524, 0.000000e+00
+  %526 = select i1 %525, float %524, float 0.000000e+00
+  %527 = fmul float %43, %380
+  %528 = fadd float %527, 1.000000e+00
+  %529 = call float @llvm.pow.f32(float %526, float %528)
+  %530 = fmul float %476, %37
+  %531 = fmul float %478, %38
+  %532 = fmul float %480, %39
+  %533 = fmul float %359, %487
+  %534 = fmul float %360, %488
+  %535 = fadd float %534, %533
+  %536 = fmul float %361, %489
+  %537 = fadd float %535, %536
+  %538 = fcmp uge float %537, 0.000000e+00
+  %539 = select i1 %538, float %537, float 0.000000e+00
+  %540 = fmul float %530, %539
+  %541 = fmul float %531, %539
+  %542 = fmul float %532, %539
+  %543 = fmul float %490, %529
+  %544 = fadd float %543, %540
+  %545 = fmul float %491, %529
+  %546 = fadd float %545, %541
+  %547 = fmul float %492, %529
+  %548 = fadd float %547, %542
+  %549 = fmul float %476, %34
+  %550 = fmul float %478, %35
+  %551 = fmul float %480, %36
+  %552 = fmul float %544, %57
+  %553 = fadd float %552, %549
+  %554 = fmul float %546, %58
+  %555 = fadd float %554, %550
+  %556 = fmul float %548, %59
+  %557 = fadd float %556, %551
+  %558 = bitcast float %136 to i32
+  %559 = bitcast float %182 to i32
+  %560 = bitcast float %137 to i32
+  %561 = bitcast float %183 to i32
+  %562 = bitcast float %temp28.1 to i32
+  %563 = bitcast float %temp29.1 to i32
+  %564 = insertelement <8 x i32> undef, i32 %558, i32 0
+  %565 = insertelement <8 x i32> %564, i32 %559, i32 1
+  %566 = insertelement <8 x i32> %565, i32 %560, i32 2
+  %567 = insertelement <8 x i32> %566, i32 %561, i32 3
+  %568 = insertelement <8 x i32> %567, i32 %562, i32 4
+  %569 = insertelement <8 x i32> %568, i32 %563, i32 5
+  %570 = insertelement <8 x i32> %569, i32 undef, i32 6
+  %571 = insertelement <8 x i32> %570, i32 undef, i32 7
+  %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2)
+  %573 = extractelement <4 x float> %572, i32 0
+  %574 = extractelement <4 x float> %572, i32 1
+  %575 = extractelement <4 x float> %572, i32 2
+  %576 = fmul float %573, %44
+  %577 = fadd float %576, %553
+  %578 = fmul float %574, %45
+  %579 = fadd float %578, %555
+  %580 = fmul float %575, %46
+  %581 = fadd float %580, %557
+  %582 = call i32 @llvm.SI.packf16(float %577, float %579)
+  %583 = bitcast i32 %582 to float
+  %584 = call i32 @llvm.SI.packf16(float %581, float %283)
+  %585 = bitcast i32 %584 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585)
+  ret void
+
+ENDIF66:                                          ; preds = %LOOP65
+  %586 = bitcast float %temp28.1 to i32
+  %587 = bitcast float %temp29.1 to i32
+  %588 = insertelement <8 x i32> %237, i32 %586, i32 4
+  %589 = insertelement <8 x i32> %588, i32 %587, i32 5
+  %590 = insertelement <8 x i32> %589, i32 undef, i32 6
+  %591 = insertelement <8 x i32> %590, i32 undef, i32 7
+  %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2)
+  %593 = extractelement <4 x float> %592, i32 3
+  %594 = fcmp oge float %temp30.1, %593
+  %595 = sext i1 %594 to i32
+  %596 = bitcast i32 %595 to float
+  %597 = bitcast float %596 to i32
+  %598 = and i32 %597, 1065353216
+  %599 = bitcast i32 %598 to float
+  %600 = fmul float 5.000000e-01, %temp32.0
+  %601 = fsub float -0.000000e+00, %600
+  %602 = fmul float %599, %temp32.0
+  %603 = fadd float %602, %601
+  %604 = fmul float %214, %603
+  %605 = fadd float %604, %temp28.1
+  %606 = fmul float %215, %603
+  %607 = fadd float %606, %temp29.1
+  %608 = fmul float %216, %603
+  %609 = fadd float %608, %temp30.1
+  %610 = fadd float %temp24.1, 1.000000e+00
+  %611 = fmul float %temp32.0, 5.000000e-01
+  br label %LOOP65
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: readnone
+declare i32 @llvm.SI.tid() #2
+
+; Function Attrs: readonly
+declare float @ceil(float) #3
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq(float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2
+
+; Function Attrs: readnone
+declare float @fabs(float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: nounwind readonly
+declare float @llvm.pow.f32(float, float) #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
+attributes #3 = { readonly }
+attributes #4 = { nounwind readonly }
+
+!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll
new file mode 100644
index 0000000..093234f
--- /dev/null
+++ b/test/CodeGen/R600/si-vector-hang.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+; CHECK: @test_8_min_char
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; ModuleID = 'radeon'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+target triple = "r600--"
+
+; Function Attrs: nounwind
+define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
+entry:
+  %0 = load i8 addrspace(1)* %in0, align 1
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %arrayidx2.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 1
+  %2 = load i8 addrspace(1)* %arrayidx2.i.i, align 1
+  %3 = insertelement <8 x i8> %1, i8 %2, i32 1
+  %arrayidx6.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 2
+  %4 = load i8 addrspace(1)* %arrayidx6.i.i, align 1
+  %5 = insertelement <8 x i8> %3, i8 %4, i32 2
+  %arrayidx10.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 3
+  %6 = load i8 addrspace(1)* %arrayidx10.i.i, align 1
+  %7 = insertelement <8 x i8> %5, i8 %6, i32 3
+  %arrayidx.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 4
+  %8 = load i8 addrspace(1)* %arrayidx.i.i, align 1
+  %9 = insertelement <8 x i8> undef, i8 %8, i32 0
+  %arrayidx2.i9.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 5
+  %10 = load i8 addrspace(1)* %arrayidx2.i9.i, align 1
+  %11 = insertelement <8 x i8> %9, i8 %10, i32 1
+  %arrayidx6.i11.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 6
+  %12 = load i8 addrspace(1)* %arrayidx6.i11.i, align 1
+  %13 = insertelement <8 x i8> %11, i8 %12, i32 2
+  %arrayidx10.i13.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 7
+  %14 = load i8 addrspace(1)* %arrayidx10.i13.i, align 1
+  %15 = insertelement <8 x i8> %13, i8 %14, i32 3
+  %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  %16 = load i8 addrspace(1)* %in1, align 1
+  %17 = insertelement <8 x i8> undef, i8 %16, i32 0
+  %arrayidx2.i.i4 = getelementptr inbounds i8 addrspace(1)* %in1, i64 1
+  %18 = load i8 addrspace(1)* %arrayidx2.i.i4, align 1
+  %19 = insertelement <8 x i8> %17, i8 %18, i32 1
+  %arrayidx6.i.i5 = getelementptr inbounds i8 addrspace(1)* %in1, i64 2
+  %20 = load i8 addrspace(1)* %arrayidx6.i.i5, align 1
+  %21 = insertelement <8 x i8> %19, i8 %20, i32 2
+  %arrayidx10.i.i6 = getelementptr inbounds i8 addrspace(1)* %in1, i64 3
+  %22 = load i8 addrspace(1)* %arrayidx10.i.i6, align 1
+  %23 = insertelement <8 x i8> %21, i8 %22, i32 3
+  %arrayidx.i.i7 = getelementptr inbounds i8 addrspace(1)* %in1, i64 4
+  %24 = load i8 addrspace(1)* %arrayidx.i.i7, align 1
+  %25 = insertelement <8 x i8> undef, i8 %24, i32 0
+  %arrayidx2.i9.i8 = getelementptr inbounds i8 addrspace(1)* %in1, i64 5
+  %26 = load i8 addrspace(1)* %arrayidx2.i9.i8, align 1
+  %27 = insertelement <8 x i8> %25, i8 %26, i32 1
+  %arrayidx6.i11.i9 = getelementptr inbounds i8 addrspace(1)* %in1, i64 6
+  %28 = load i8 addrspace(1)* %arrayidx6.i11.i9, align 1
+  %29 = insertelement <8 x i8> %27, i8 %28, i32 2
+  %arrayidx10.i13.i10 = getelementptr inbounds i8 addrspace(1)* %in1, i64 7
+  %30 = load i8 addrspace(1)* %arrayidx10.i13.i10, align 1
+  %31 = insertelement <8 x i8> %29, i8 %30, i32 3
+  %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11
+  %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11
+  %32 = extractelement <8 x i8> %cond.i, i32 0
+  store i8 %32, i8 addrspace(1)* %out, align 1
+  %33 = extractelement <8 x i8> %cond.i, i32 1
+  %arrayidx2.i.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 1
+  store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1
+  %34 = extractelement <8 x i8> %cond.i, i32 2
+  %arrayidx.i.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 2
+  store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1
+  %35 = extractelement <8 x i8> %cond.i, i32 3
+  %arrayidx2.i6.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 3
+  store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1
+  %arrayidx.i.i3 = getelementptr inbounds i8 addrspace(1)* %out, i64 4
+  %36 = extractelement <8 x i8> %cond.i, i32 4
+  store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1
+  %37 = extractelement <8 x i8> %cond.i, i32 5
+  %arrayidx2.i.i6.i = getelementptr inbounds i8 addrspace(1)* %out, i64 5
+  store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1
+  %38 = extractelement <8 x i8> %cond.i, i32 6
+  %arrayidx.i.i7.i = getelementptr inbounds i8 addrspace(1)* %out, i64 6
+  store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1
+  %39 = extractelement <8 x i8> %cond.i, i32 7
+  %arrayidx2.i6.i8.i = getelementptr inbounds i8 addrspace(1)* %out, i64 7
+  store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
+
+!0 = metadata !{null}
+!1 = metadata !{null}
+!2 = metadata !{null}
+!3 = metadata !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char}
+!4 = metadata !{null}
+!5 = metadata !{null}
+!6 = metadata !{null}
+!7 = metadata !{null}
+!8 = metadata !{null}
diff --git a/test/CodeGen/R600/sign_extend.ll b/test/CodeGen/R600/sign_extend.ll
index e4ef534..1212cee 100644
--- a/test/CodeGen/R600/sign_extend.ll
+++ b/test/CodeGen/R600/sign_extend.ll
@@ -1,5 +1,5 @@
 
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
 
 ; CHECK: V_ASHR
 define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c)  {
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index 4e88494..9241799 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
 ; R600-CHECK: @sint_to_fp_v2i32
 ; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
diff --git a/test/CodeGen/R600/sint_to_fp64.ll b/test/CodeGen/R600/sint_to_fp64.ll
new file mode 100644
index 0000000..5abc9d1
--- /dev/null
+++ b/test/CodeGen/R600/sint_to_fp64.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+
+; CHECK: @sint_to_fp64
+; CHECK: V_CVT_F64_I32_e32
+define void @sint_to_fp64(double addrspace(1)* %out, i32 %in) {
+  %result = sitofp i32 %in to double
+  store double %result, double addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll
index 5220a96..fe9df10 100644
--- a/test/CodeGen/R600/sra.ll
+++ b/test/CodeGen/R600/sra.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
-;EG-CHECK: @ashr_v2i32
+;EG-CHECK-LABEL: @ashr_v2i32
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @ashr_v2i32
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @ashr_v2i32
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@ define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
   ret void
 }
 
-;EG-CHECK: @ashr_v4i32
+;EG-CHECK-LABEL: @ashr_v4i32
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-;SI-CHECK: @ashr_v4i32
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @ashr_v4i32
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,11 +39,11 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
   ret void
 }
 
-;EG-CHECK: @ashr_i64
+;EG-CHECK-LABEL: @ashr_i64
 ;EG-CHECK: ASHR
 
-;SI-CHECK: @ashr_i64
-;SI-CHECK: V_ASHR_I64
+;SI-CHECK-LABEL: @ashr_i64
+;SI-CHECK: S_ASHR_I64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
 define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = sext i32 %in to i64
diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll
index d1dcd7f..7637355 100644
--- a/test/CodeGen/R600/srl.ll
+++ b/test/CodeGen/R600/srl.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ;EG-CHECK: @lshr_v2i32
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @lshr_v2i32
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -26,10 +26,10 @@ define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 ;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @lshr_v4i32
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/store-vector-ptrs.ll b/test/CodeGen/R600/store-vector-ptrs.ll
new file mode 100644
index 0000000..01210ce
--- /dev/null
+++ b/test/CodeGen/R600/store-vector-ptrs.ll
@@ -0,0 +1,8 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=SI < %s
+
+define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
+  %p = getelementptr <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
+  store <4 x i32*> %p, <4 x i32*>* %out
+  ret void
+}
+\ No newline at end of file
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index 1bda5e6..5e51d56 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -1,13 +1,118 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+
+;===------------------------------------------------------------------------===;
+; Global Address Space
+;===------------------------------------------------------------------------===;
+
+; i8 store
+; EG-CHECK-LABEL: @store_i8
+; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+; EG-CHECK: VTX_READ_8 [[VAL:T[0-9]\.X]], [[VAL]]
+; IG 0: Get the byte index and truncate the value
+; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG-CHECK-NEXT: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
+; EG-CHECK-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; IG 1: Truncate the calculated the shift amount for the mask
+; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG-CHECK-NEXT: 3
+; IG 2: Shift the value and the mask
+; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
+; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
+; EG-CHECK-NEXT: 255
+; IG 3: Initialize the Y and Z channels to zero
+;       XXX: An optimal scheduler should merge this into one of the prevous IGs.
+; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
+; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0
+
+; SI-CHECK-LABEL: @store_i8
+; SI-CHECK: BUFFER_STORE_BYTE
+
+define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
+entry:
+  store i8 %in, i8 addrspace(1)* %out
+  ret void
+}
+
+; i16 store
+; EG-CHECK-LABEL: @store_i16
+; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+; EG-CHECK: VTX_READ_16 [[VAL:T[0-9]\.X]], [[VAL]]
+; IG 0: Get the byte index and truncate the value
+; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
+; EG-CHECK-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
+; IG 1: Truncate the calculated the shift amount for the mask
+; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG-CHECK: 3
+; IG 2: Shift the value and the mask
+; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
+; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
+; EG-CHECK-NEXT: 65535
+; IG 3: Initialize the Y and Z channels to zero
+;       XXX: An optimal scheduler should merge this into one of the prevous IGs.
+; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
+; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0
+
+; SI-CHECK-LABEL: @store_i16
+; SI-CHECK: BUFFER_STORE_SHORT
+define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
+entry:
+  store i16 %in, i16 addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @store_v2i8
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK-NOT: MEM_RAT MSKOR
+; SI-CHECK-LABEL: @store_v2i8
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+  %0 = trunc <2 x i32> %in to <2 x i8>
+  store <2 x i8> %0, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+
+; EG-CHECK-LABEL: @store_v2i16
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK-LABEL: @store_v2i16
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK-LABEL: @store_v2i16
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+  %0 = trunc <2 x i32> %in to <2 x i16>
+  store <2 x i16> %0, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @store_v4i8
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK-LABEL: @store_v4i8
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK-LABEL: @store_v4i8
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  %0 = trunc <4 x i32> %in to <4 x i8>
+  store <4 x i8> %0, <4 x i8> addrspace(1)* %out
+  ret void
+}
 
 ; floating-point store
-; EG-CHECK: @store_f32
-; EG-CHECK: RAT_WRITE_CACHELESS_32_eg T{{[0-9]+\.X, T[0-9]+\.X}}, 1
-; CM-CHECK: @store_f32
-; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @store_f32
+; EG-CHECK-LABEL: @store_f32
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
+; CM-CHECK-LABEL: @store_f32
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI-CHECK-LABEL: @store_f32
 ; SI-CHECK: BUFFER_STORE_DWORD
 
 define void @store_f32(float addrspace(1)* %out, float %in) {
@@ -15,22 +120,141 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
   ret void
 }
 
+; EG-CHECK-LABEL: @store_v4i16
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK-NOT: MEM_RAT MSKOR
+; SI-CHECK-LABEL: @store_v4i16
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK-NOT: BUFFER_STORE_BYTE
+define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  %0 = trunc <4 x i32> %in to <4 x i16>
+  store <4 x i16> %0, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
 ; vec2 floating-point stores
-; EG-CHECK: @store_v2f32
-; EG-CHECK: RAT_WRITE_CACHELESS_64_eg
-; CM-CHECK: @store_v2f32
-; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
-; SI-CHECK: @store_v2f32
+; EG-CHECK-LABEL: @store_v2f32
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK-LABEL: @store_v2f32
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK-LABEL: @store_v2f32
 ; SI-CHECK: BUFFER_STORE_DWORDX2
 
 define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
-  %1 = insertelement <2 x float> %0, float %b, i32 0
+  %1 = insertelement <2 x float> %0, float %b, i32 1
   store <2 x float> %1, <2 x float> addrspace(1)* %out
   ret void
 }
 
+; EG-CHECK-LABEL: @store_v4i32
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; EG-CHECK-NOT: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK-LABEL: @store_v4i32
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; CM-CHECK-NOT: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK-LABEL: @store_v4i32
+; SI-CHECK: BUFFER_STORE_DWORDX4
+define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  store <4 x i32> %in, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+;===------------------------------------------------------------------------===;
+; Local Address Space
+;===------------------------------------------------------------------------===;
+
+; EG-CHECK-LABEL: @store_local_i8
+; EG-CHECK: LDS_BYTE_WRITE
+; SI-CHECK-LABEL: @store_local_i8
+; SI-CHECK: DS_WRITE_B8
+define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
+  store i8 %in, i8 addrspace(3)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @store_local_i16
+; EG-CHECK: LDS_SHORT_WRITE
+; SI-CHECK-LABEL: @store_local_i16
+; SI-CHECK: DS_WRITE_B16
+define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
+  store i16 %in, i16 addrspace(3)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @store_local_v2i16
+; EG-CHECK: LDS_WRITE
+; CM-CHECK-LABEL: @store_local_v2i16
+; CM-CHECK: LDS_WRITE
+; SI-CHECK-LABEL: @store_local_v2i16
+; SI-CHECK: DS_WRITE_B16
+; SI-CHECK: DS_WRITE_B16
+define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
+entry:
+  store <2 x i16> %in, <2 x i16> addrspace(3)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @store_local_v4i8
+; EG-CHECK: LDS_WRITE
+; CM-CHECK-LABEL: @store_local_v4i8
+; CM-CHECK: LDS_WRITE
+; SI-CHECK-LABEL: @store_local_v4i8
+; SI-CHECK: DS_WRITE_B8
+; SI-CHECK: DS_WRITE_B8
+; SI-CHECK: DS_WRITE_B8
+; SI-CHECK: DS_WRITE_B8
+define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+entry:
+  store <4 x i8> %in, <4 x i8> addrspace(3)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @store_local_v2i32
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; CM-CHECK-LABEL: @store_local_v2i32
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; SI-CHECK-LABEL: @store_local_v2i32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
+entry:
+  store <2 x i32> %in, <2 x i32> addrspace(3)* %out
+  ret void
+}
+
+; EG-CHECK-LABEL: @store_local_v4i32
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; CM-CHECK-LABEL: @store_local_v4i32
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; SI-CHECK-LABEL: @store_local_v4i32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+entry:
+  store <4 x i32> %in, <4 x i32> addrspace(3)* %out
+  ret void
+}
+
 ; The stores in this function are combined by the optimizer to create a
 ; 64-bit store with 32-bit alignment.  This is legal for SI and the legalizer
 ; should not try to split the 64-bit store back into 2 32-bit stores.
@@ -38,25 +262,21 @@ entry:
 ; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
 ; be two 32-bit stores.
 
-; EG-CHECK: @vecload2
-; EG-CHECK: RAT_WRITE_CACHELESS_64_eg
-; CM-CHECK: @vecload2
-; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
-; SI-CHECK: @vecload2
+; EG-CHECK-LABEL: @vecload2
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK-LABEL: @vecload2
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK-LABEL: @vecload2
 ; SI-CHECK: BUFFER_STORE_DWORDX2
 define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
 entry:
-  %0 = load i32 addrspace(2)* %mem, align 4, !tbaa !5
+  %0 = load i32 addrspace(2)* %mem, align 4
   %arrayidx1.i = getelementptr inbounds i32 addrspace(2)* %mem, i64 1
-  %1 = load i32 addrspace(2)* %arrayidx1.i, align 4, !tbaa !5
-  store i32 %0, i32 addrspace(1)* %out, align 4, !tbaa !5
+  %1 = load i32 addrspace(2)* %arrayidx1.i, align 4
+  store i32 %0, i32 addrspace(1)* %out, align 4
   %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i64 1
-  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4, !tbaa !5
+  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
   ret void
 }
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!5 = metadata !{metadata !"int", metadata !6}
-!6 = metadata !{metadata !"omnipotent char", metadata !7}
-!7 = metadata !{metadata !"Simple C/C++ TBAA"}
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/R600/store.r600.ll b/test/CodeGen/R600/store.r600.ll
index 5ffb7f1..00589a0 100644
--- a/test/CodeGen/R600/store.r600.ll
+++ b/test/CodeGen/R600/store.r600.ll
@@ -4,7 +4,7 @@
 
 ; v4i32 store
 ; EG-CHECK: @store_v4i32
-; EG-CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
 
 define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %1 = load <4 x i32> addrspace(1) * %in
@@ -14,7 +14,7 @@ define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
 
 ; v4f32 store
 ; EG-CHECK: @store_v4f32
-; EG-CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
 define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %1 = load <4 x float> addrspace(1) * %in
   store <4 x float> %1, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/R600/structurize.ll b/test/CodeGen/R600/structurize.ll
new file mode 100644
index 0000000..c2acd93
--- /dev/null
+++ b/test/CodeGen/R600/structurize.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s
+; Test case for a crash in the AMDILCFGStructurizer from a CFG like this:
+;
+;                            entry
+;                           /     \
+;               diamond_head       branch_from
+;                 /      \           |
+;    diamond_false        diamond_true
+;                 \      /
+;                   done
+;
+; When the diamond_true branch had more than 100 instructions.
+;
+;
+
+; CHECK-LABEL: @branch_into_diamond
+; === entry block:
+; CHECK: ALU_PUSH_BEFORE
+; === Branch instruction (IF):
+; CHECK: JUMP
+  ; === branch_from block
+  ; CHECK: ALU
+  ; === Duplicated diamond_true block (There can be more than one ALU clause):
+  ; === XXX: We should be able to optimize this so the basic block is not
+  ; === duplicated.  See comments in
+  ; === AMDGPUCFGStructurizer::improveSimpleJumpintoIf()
+  ; CHECK: ALU
+; === Branch instruction (ELSE):
+; CHECK: ELSE
+  ; === diamond_head block:
+  ; CHECK: ALU_PUSH_BEFORE
+  ; === Branch instruction (IF):
+  ; CHECK: JUMP
+    ; === diamond_true block (There can be more than one ALU clause):
+    ; ALU
+  ; === Branch instruction (ELSE):
+  ; CHECK: ELSE
+    ; === diamond_false block plus implicit ENDIF
+    ; CHECK: ALU_POP_AFTER
+; === Branch instruction (ENDIF):
+; CHECK: POP
+; === done block:
+; CHECK: ALU
+; CHECK: MEM_RAT_CACHELESS
+; CHECK: CF_END
+
+
+define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+%0 = icmp ne i32 %a, 0
+  br i1 %0, label %diamond_head, label %branch_from
+
+diamond_head:
+  %1 = icmp ne i32 %a, 1
+  br i1 %1, label %diamond_true, label %diamond_false
+
+branch_from:
+  %2 = add i32 %a, 1
+  br label %diamond_true
+
+diamond_false:
+  %3 = add i32 %a, 2
+  br label %done
+
+diamond_true:
+  %4 = phi i32 [%2, %branch_from], [%a, %diamond_head]
+  ; This block needs to be > 100 ISA instructions to hit the bug,
+  ; so we'll use udiv instructions.
+  %div0 = udiv i32 %a, %b
+  %div1 = udiv i32 %div0, %4
+  %div2 = udiv i32 %div1, 11
+  %div3 = udiv i32 %div2, %a
+  %div4 = udiv i32 %div3, %b
+  %div5 = udiv i32 %div4, %c
+  %div6 = udiv i32 %div5, %div0
+  %div7 = udiv i32 %div6, %div1
+  br label %done
+
+done:
+  %5 = phi i32 [%3, %diamond_false], [%div7, %diamond_true]
+  store i32 %5, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/structurize1.ll b/test/CodeGen/R600/structurize1.ll
new file mode 100644
index 0000000..8c10301
--- /dev/null
+++ b/test/CodeGen/R600/structurize1.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s
+
+; This tests for abug where the AMDILCFGStructurizer was crashing on loops
+; like this:
+;
+; for (i = 0; i < x; i++) {
+;   if (cond0) {
+;     if (cond1) {
+;
+;     } else {
+;
+;     }
+;     if (cond2) {
+;
+;     }
+;   }
+; }
+
+; CHECK-LABEL: @if_inside_loop
+; CHECK: LOOP_START_DX10
+; CHECK: END_LOOP
+define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+  br label %for.body
+
+for.body:
+  %0 = phi i32 [0, %entry], [%inc, %for.inc]
+  %val = phi i32 [0, %entry], [%val.for.inc, %for.inc]
+  %inc = add i32 %0, 1
+  %1 = icmp ult i32 10, %a
+  br i1 %1, label %for.inc, label %if.then
+
+if.then:
+  %2 = icmp ne i32 0, %b
+  br i1 %2, label %if.then.true, label %if.then.false
+
+if.then.true:
+  %3 = add i32 %a, %val
+  br label %if
+
+if.then.false:
+  %4 = mul i32 %a, %val
+  br label %if
+
+if:
+  %val.if = phi i32 [%3, %if.then.true], [%4, %if.then.false]
+  %5 = icmp ne i32 0, %c
+  br i1 %5, label %if.true, label %for.inc
+
+if.true:
+  %6 = add i32 %a, %val.if
+  br label %for.inc
+
+for.inc:
+  %val.for.inc = phi i32 [%val, %for.body], [%val.if, %if], [%6, %if.true]
+  %7 = icmp ne i32 0, %d
+  br i1 %7, label %for.body, label %exit
+
+exit:
+  store i32 %val.for.inc, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll
index 3bd4cb8..5fdd2b8 100644
--- a/test/CodeGen/R600/sub.ll
+++ b/test/CodeGen/R600/sub.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ;EG-CHECK: @test2
-;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @test2
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,16 +19,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 }
 
 ;EG-CHECK: @test4
-;EG-CHECK: SUB_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @test4
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/swizzle-export.ll b/test/CodeGen/R600/swizzle-export.ll
index b2175af..16c3f19 100644
--- a/test/CodeGen/R600/swizzle-export.ll
+++ b/test/CodeGen/R600/swizzle-export.ll
@@ -6,12 +6,12 @@
 ;EG-CHECK: EXPORT T{{[0-9]+}}.XXWX
 ;EG-CHECK: EXPORT T{{[0-9]+}}.XXXW
 
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 7)
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = extractelement <4 x float> %reg1, i32 2
+  %3 = extractelement <4 x float> %reg1, i32 3
   %4 = load <4 x float> addrspace(8)* null
   %5 = extractelement <4 x float> %4, i32 1
   %6 = load <4 x float> addrspace(8)* null
@@ -93,14 +93,15 @@ main_body:
 }
 
 ; EG-CHECK: @main2
-; EG-CHECK: T{{[0-9]+}}.ZXY0
+; EG-CHECK: T{{[0-9]+}}.XY__
+; EG-CHECK: T{{[0-9]+}}.YXZ0
 
-define void @main2() #0 {
+define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
-  %0 = call float @llvm.R600.load.input(i32 4)
-  %1 = call float @llvm.R600.load.input(i32 5)
-  %2 = call float @llvm.R600.load.input(i32 6)
-  %3 = call float @llvm.R600.load.input(i32 7)
+  %0 = extractelement <4 x float> %reg1, i32 0
+  %1 = extractelement <4 x float> %reg1, i32 1
+  %2 = fadd float %0, 2.5
+  %3 = fmul float %1, 3.5
   %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %5 = extractelement <4 x float> %4, i32 0
   %6 = call float @llvm.cos.f32(float %5)
@@ -108,27 +109,21 @@ main_body:
   %8 = extractelement <4 x float> %7, i32 0
   %9 = load <4 x float> addrspace(8)* null
   %10 = extractelement <4 x float> %9, i32 1
-  %11 = insertelement <4 x float> undef, float %0, i32 0
-  %12 = insertelement <4 x float> %11, float %1, i32 1
-  %13 = insertelement <4 x float> %12, float %2, i32 2
-  %14 = insertelement <4 x float> %13, float %3, i32 3
-  call void @llvm.R600.store.swizzle(<4 x float> %14, i32 60, i32 1)
-  %15 = insertelement <4 x float> undef, float %6, i32 0
-  %16 = insertelement <4 x float> %15, float %8, i32 1
-  %17 = insertelement <4 x float> %16, float %10, i32 2
-  %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 3
-  call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+  %11 = insertelement <4 x float> undef, float %2, i32 0
+  %12 = insertelement <4 x float> %11, float %3, i32 1
+  call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1)
+  %13 = insertelement <4 x float> undef, float %6, i32 0
+  %14 = insertelement <4 x float> %13, float %8, i32 1
+  %15 = insertelement <4 x float> %14, float %10, i32 2
+  %16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2)
   ret void
 }
 
-; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
 ; Function Attrs: nounwind readonly
-declare float @llvm.cos.f32(float) #2
+declare float @llvm.cos.f32(float) #1
 
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
 
 attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone }
-attributes #2 = { nounwind readonly }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/R600/tex-clause-antidep.ll b/test/CodeGen/R600/tex-clause-antidep.ll
index 5979609..cbb9c50 100644
--- a/test/CodeGen/R600/tex-clause-antidep.ll
+++ b/test/CodeGen/R600/tex-clause-antidep.ll
@@ -3,11 +3,11 @@
 ;CHECK: TEX
 ;CHECK-NEXT: ALU
 
-define void @test() {
-  %1 = call float @llvm.R600.load.input(i32 0)
-  %2 = call float @llvm.R600.load.input(i32 1)
-  %3 = call float @llvm.R600.load.input(i32 2)
-  %4 = call float @llvm.R600.load.input(i32 3)
+define void @test(<4 x float> inreg %reg0) #0 {
+  %1 = extractelement <4 x float> %reg0, i32 0
+  %2 = extractelement <4 x float> %reg0, i32 1
+  %3 = extractelement <4 x float> %reg0, i32 2
+  %4 = extractelement <4 x float> %reg0, i32 3
   %5 = insertelement <4 x float> undef, float %1, i32 0
   %6 = insertelement <4 x float> %5, float %2, i32 1
   %7 = insertelement <4 x float> %6, float %3, i32 2
@@ -19,6 +19,7 @@ define void @test() {
   ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
 declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+\ No newline at end of file
diff --git a/test/CodeGen/R600/texture-input-merge.ll b/test/CodeGen/R600/texture-input-merge.ll
index 5d0ecef..789538a 100644
--- a/test/CodeGen/R600/texture-input-merge.ll
+++ b/test/CodeGen/R600/texture-input-merge.ll
@@ -2,11 +2,11 @@
 
 ;CHECK-NOT: MOV
 
-define void @test() {
-  %1 = call float @llvm.R600.load.input(i32 0)
-  %2 = call float @llvm.R600.load.input(i32 1)
-  %3 = call float @llvm.R600.load.input(i32 2)
-  %4 = call float @llvm.R600.load.input(i32 3)
+define void @test(<4 x float> inreg %reg0) #0 {
+  %1 = extractelement <4 x float> %reg0, i32 0
+  %2 = extractelement <4 x float> %reg0, i32 1
+  %3 = extractelement <4 x float> %reg0, i32 2
+  %4 = extractelement <4 x float> %reg0, i32 3
   %5 = fmul float %1, 3.0
   %6 = fmul float %2, 3.0
   %7 = fmul float %3, 3.0
@@ -25,6 +25,7 @@ define void @test() {
   ret void
 }
 
-declare float @llvm.R600.load.input(i32) readnone
 declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+\ No newline at end of file
diff --git a/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll b/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll
new file mode 100644
index 0000000..ec959c2
--- /dev/null
+++ b/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This tests for a bug in the SelectionDAG where custom lowered truncated
+; vector stores at the end of a basic block were not being added to the
+; LegalizedNodes list, which triggered an assertion failure.
+
+; CHECK-LABEL: @test
+; CHECK: MEM_RAT_CACHELESS STORE_RAW
+define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
+entry:
+  %0 = icmp eq i32 %cond, 0
+  br i1 %0, label %if, label %done
+
+if:
+  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+  br label %done
+
+done:
+  ret void
+}
diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll
new file mode 100644
index 0000000..0bd320a
--- /dev/null
+++ b/test/CodeGen/R600/trunc.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
+; SI-LABEL: @trunc_i64_to_i32_store
+; SI: S_LOAD_DWORD s0, s[0:1], 11
+; SI: V_MOV_B32_e32 v0, s0
+; SI: BUFFER_STORE_DWORD v0
+
+; EG-LABEL: @trunc_i64_to_i32_store
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG: LSHR
+; EG-NEXT: 2(
+
+  %result = trunc i64 %in to i32 store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; SI-LABEL: @trunc_shl_i64:
+; SI: S_LOAD_DWORDX2
+; SI: S_LOAD_DWORDX2 [[SREG:s\[[0-9]+:[0-9]+\]]]
+; SI: S_LSHL_B64 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, [[SREG]], 2
+; SI: MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; SI: BUFFER_STORE_DWORD v[[LO_VREG]],
+define void @trunc_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+  %b = shl i64 %a, 2
+  %result = trunc i64 %b to i32
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index 08fe2ef..5371321 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll
@@ -1,13 +1,26 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+
+;EG-CHECK-LABEL: @test
+;EG-CHECK-NOT: SETGE_INT
+;EG-CHECK: CF_END
+
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %a = load i32 addrspace(1) * %in
+  %b = load i32 addrspace(1) * %b_ptr
+  %result = udiv i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
 
 ;The code generated by udiv is long and complex and may frequently change.
 ;The goal of this test is to make sure the ISel doesn't fail when it gets
 ;a v4i32 udiv
 
-;EG-CHECK: @test2
+;EG-CHECK-LABEL: @test2
 ;EG-CHECK: CF_END
-;SI-CHECK: @test2
+;SI-CHECK-LABEL: @test2
 ;SI-CHECK: S_ENDPGM
 
 define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
@@ -19,9 +32,9 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   ret void
 }
 
-;EG-CHECK: @test4
+;EG-CHECK-LABEL: @test4
 ;EG-CHECK: CF_END
-;SI-CHECK: @test4
+;SI-CHECK-LABEL: @test4
 ;SI-CHECK: S_ENDPGM
 
 define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index faac77a..a5ac355 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -1,10 +1,10 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
-; R600-CHECK: @uint_to_fp_v2i32
+; R600-CHECK-LABEL: @uint_to_fp_v2i32
 ; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-; SI-CHECK: @uint_to_fp_v2i32
+; SI-CHECK-LABEL: @uint_to_fp_v2i32
 ; SI-CHECK: V_CVT_F32_U32_e32
 ; SI-CHECK: V_CVT_F32_U32_e32
 define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
@@ -13,12 +13,12 @@ define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
   ret void
 }
 
-; R600-CHECK: @uint_to_fp_v4i32
+; R600-CHECK-LABEL: @uint_to_fp_v4i32
 ; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: @uint_to_fp_v4i32
+; SI-CHECK-LABEL: @uint_to_fp_v4i32
 ; SI-CHECK: V_CVT_F32_U32_e32
 ; SI-CHECK: V_CVT_F32_U32_e32
 ; SI-CHECK: V_CVT_F32_U32_e32
@@ -29,3 +29,18 @@ define void @uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspac
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
 }
+
+; R600-CHECK-LABEL: @uint_to_fp_i64_f32
+; R600-CHECK: UINT_TO_FLT
+; R600-CHECK: UINT_TO_FLT
+; R600-CHECK: MULADD_IEEE
+; SI-CHECK-LABEL: @uint_to_fp_i64_f32
+; SI-CHECK: V_CVT_F32_U32_e32
+; SI-CHECK: V_CVT_F32_U32_e32
+; SI-CHECK: V_MAD_F32
+define void @uint_to_fp_i64_f32(float addrspace(1)* %out, i64 %in) {
+entry:
+  %0 = uitofp i64 %in to float
+  store float %0, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll
new file mode 100644
index 0000000..2824ff8
--- /dev/null
+++ b/test/CodeGen/R600/unaligned-load-store.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @unaligned_load_store_i32:
+; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
+define void @unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
+  %v = load i32 addrspace(3)* %p, align 1
+  store i32 %v, i32 addrspace(3)* %r, align 1
+  ret void
+}
+
+; SI-LABEL: @unaligned_load_store_v4i32:
+; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
+define void @unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
+  %v = load <4 x i32> addrspace(3)* %p, align 1
+  store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
+  ret void
+}
diff --git a/test/CodeGen/R600/unsupported-cc.ll b/test/CodeGen/R600/unsupported-cc.ll
index cf29833..f986a02 100644
--- a/test/CodeGen/R600/unsupported-cc.ll
+++ b/test/CodeGen/R600/unsupported-cc.ll
@@ -2,8 +2,9 @@
 
 ; These tests are for condition codes that are not supported by the hardware
 
-; CHECK: @slt
-; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @slt
+; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5(7.006492e-45)
 define void @slt(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -13,8 +14,9 @@ entry:
   ret void
 }
 
-; CHECK: @ult_i32
-; CHECK: SETGT_UINT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @ult_i32
+; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 5(7.006492e-45)
 define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -24,9 +26,11 @@ entry:
   ret void
 }
 
-; CHECK: @ult_float
-; CHECK: SETGT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @ult_float
+; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x
 ; CHECK-NEXT: 1084227584(5.000000e+00)
+; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
+; CHECK-NEXT: LSHR *
 define void @ult_float(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ult float %in, 5.0
@@ -35,9 +39,22 @@ entry:
   ret void
 }
 
-; CHECK: @olt
-; CHECK: SETGT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
-;CHECK-NEXT: 1084227584(5.000000e+00)
+; CHECK-LABEL: @ult_float_native
+; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR *
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @ult_float_native(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = fcmp ult float %in, 5.0
+  %1 = select i1 %0, float 0.0, float 1.0
+  store float %1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @olt
+; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR *
+; CHECK-NEXT: 1084227584(5.000000e+00)
 define void @olt(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 5.0
@@ -46,8 +63,9 @@ entry:
   ret void
 }
 
-; CHECK: @sle
-; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @sle
+; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 6(8.407791e-45)
 define void @sle(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -57,8 +75,9 @@ entry:
   ret void
 }
 
-; CHECK: @ule_i32
-; CHECK: SETGT_UINT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @ule_i32
+; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
 ; CHECK-NEXT: 6(8.407791e-45)
 define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
@@ -68,9 +87,11 @@ entry:
   ret void
 }
 
-; CHECK: @ule_float
-; CHECK: SETGE * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @ule_float
+; CHECK: SETGT * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x
 ; CHECK-NEXT: 1084227584(5.000000e+00)
+; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
+; CHECK-NEXT: LSHR *
 define void @ule_float(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ule float %in, 5.0
@@ -79,8 +100,21 @@ entry:
   ret void
 }
 
-; CHECK: @ole
-; CHECK: SETGE * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @ule_float_native
+; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR *
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @ule_float_native(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = fcmp ule float %in, 5.0
+  %1 = select i1 %0, float 0.0, float 1.0
+  store float %1, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @ole
+; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR *
 ; CHECK-NEXT:1084227584(5.000000e+00)
 define void @ole(float addrspace(1)* %out, float %in) {
 entry:
diff --git a/test/CodeGen/R600/urecip.ll b/test/CodeGen/R600/urecip.ll
index dad02dd..e808e3d 100644
--- a/test/CodeGen/R600/urecip.ll
+++ b/test/CodeGen/R600/urecip.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
 
 ;CHECK: V_RCP_IFLAG_F32_e32
 
diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index cf3474c..8045145 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll
@@ -1,5 +1,5 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ;The code generated by urem is long and complex and may frequently change.
 ;The goal of this test is to make sure the ISel doesn't fail when it gets
diff --git a/test/CodeGen/R600/vertex-fetch-encoding.ll b/test/CodeGen/R600/vertex-fetch-encoding.ll
index d892229..7ea7a5c 100644
--- a/test/CodeGen/R600/vertex-fetch-encoding.ll
+++ b/test/CodeGen/R600/vertex-fetch-encoding.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI-CHECK %s
-; RUN: not llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
 
 ; NI-CHECK: @vtx_fetch32
 ; NI-CHECK: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll
index 72a9084..dca7b06 100644
--- a/test/CodeGen/R600/vselect.ll
+++ b/test/CodeGen/R600/vselect.ll
@@ -1,9 +1,9 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ;EG-CHECK: @test_select_v2i32
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @test_select_v2i32
 ;SI-CHECK: V_CNDMASK_B32_e64
@@ -20,8 +20,8 @@ entry:
 }
 
 ;EG-CHECK: @test_select_v2f32
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @test_select_v2f32
 ;SI-CHECK: V_CNDMASK_B32_e64
@@ -31,17 +31,17 @@ define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrs
 entry:
   %0 = load <2 x float> addrspace(1)* %in0
   %1 = load <2 x float> addrspace(1)* %in1
-  %cmp = fcmp one <2 x float> %0, %1
+  %cmp = fcmp une <2 x float> %0, %1
   %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
 ;EG-CHECK: @test_select_v4i32
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @test_select_v4i32
 ;SI-CHECK: V_CNDMASK_B32_e64
@@ -60,16 +60,16 @@ entry:
 }
 
 ;EG-CHECK: @test_select_v4f32
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
 entry:
   %0 = load <4 x float> addrspace(1)* %in0
   %1 = load <4 x float> addrspace(1)* %in1
-  %cmp = fcmp one <4 x float> %0, %1
+  %cmp = fcmp une <4 x float> %0, %1
   %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/vselect64.ll b/test/CodeGen/R600/vselect64.ll
new file mode 100644
index 0000000..604695b
--- /dev/null
+++ b/test/CodeGen/R600/vselect64.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck  %s
+; XXX: Merge this test into vselect.ll once SI supports 64-bit select.
+
+; CHECK-LABEL: @test_select_v4i64
+; Make sure the vectors aren't being stored on the stack.  We know they are
+; being stored on the stack if the shaders uses at leat 10 registers.
+; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
+define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+entry:
+       %cmp = icmp ne  <4 x i32> %c, <i32 0, i32 0, i32 0, i32 0>
+       %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
+       store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+       ret void
+}
+
diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll
new file mode 100644
index 0000000..2cf88fe
--- /dev/null
+++ b/test/CodeGen/R600/wait.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=r600 -mcpu=SI --verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: @main
+;CHECK: S_WAITCNT lgkmcnt(0)
+;CHECK: S_WAITCNT vmcnt(0)
+;CHECK: S_WAITCNT expcnt(0) lgkmcnt(0)
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, i32 inreg, i32, i32, i32, i32) #0 {
+main_body:
+  %10 = getelementptr <16 x i8> addrspace(2)* %3, i32 0
+  %11 = load <16 x i8> addrspace(2)* %10, !tbaa !0
+  %12 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %11, i32 0, i32 %6)
+  %13 = extractelement <4 x float> %12, i32 0
+  %14 = extractelement <4 x float> %12, i32 1
+  %15 = extractelement <4 x float> %12, i32 2
+  %16 = extractelement <4 x float> %12, i32 3
+  %17 = getelementptr <16 x i8> addrspace(2)* %3, i32 1
+  %18 = load <16 x i8> addrspace(2)* %17, !tbaa !0
+  %19 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %18, i32 0, i32 %6)
+  %20 = extractelement <4 x float> %19, i32 0
+  %21 = extractelement <4 x float> %19, i32 1
+  %22 = extractelement <4 x float> %19, i32 2
+  %23 = extractelement <4 x float> %19, i32 3
+  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %20, float %21, float %22, float %23)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %13, float %14, float %15, float %16)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { nounwind readnone }
+
+!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
index 7998983..9618d7f 100644
--- a/test/CodeGen/R600/work-item-intrinsics.ll
+++ b/test/CodeGen/R600/work-item-intrinsics.ll
@@ -1,12 +1,12 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ; R600-CHECK: @ngroups_x
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[0].X
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[0].X
 ; SI-CHECK: @ngroups_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 0
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
@@ -16,11 +16,11 @@ entry:
 }
 
 ; R600-CHECK: @ngroups_y
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[0].Y
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[0].Y
 ; SI-CHECK: @ngroups_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 1
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 1
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
@@ -30,11 +30,11 @@ entry:
 }
 
 ; R600-CHECK: @ngroups_z
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[0].Z
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[0].Z
 ; SI-CHECK: @ngroups_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 2
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 2
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @ngroups_z (i32 addrspace(1)* %out) {
 entry:
@@ -44,11 +44,11 @@ entry:
 }
 
 ; R600-CHECK: @global_size_x
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[0].W
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[0].W
 ; SI-CHECK: @global_size_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 3
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 3
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @global_size_x (i32 addrspace(1)* %out) {
 entry:
@@ -58,11 +58,11 @@ entry:
 }
 
 ; R600-CHECK: @global_size_y
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[1].X
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[1].X
 ; SI-CHECK: @global_size_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 4
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 4
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @global_size_y (i32 addrspace(1)* %out) {
 entry:
@@ -72,11 +72,11 @@ entry:
 }
 
 ; R600-CHECK: @global_size_z
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[1].Y
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[1].Y
 ; SI-CHECK: @global_size_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 5
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 5
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @global_size_z (i32 addrspace(1)* %out) {
 entry:
@@ -86,11 +86,11 @@ entry:
 }
 
 ; R600-CHECK: @local_size_x
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[1].Z
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[1].Z
 ; SI-CHECK: @local_size_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 6
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 6
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @local_size_x (i32 addrspace(1)* %out) {
 entry:
@@ -100,11 +100,11 @@ entry:
 }
 
 ; R600-CHECK: @local_size_y
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[1].W
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[1].W
 ; SI-CHECK: @local_size_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 7
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 7
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @local_size_y (i32 addrspace(1)* %out) {
 entry:
@@ -114,11 +114,11 @@ entry:
 }
 
 ; R600-CHECK: @local_size_z
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[2].X
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[2].X
 ; SI-CHECK: @local_size_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 8
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 8
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @local_size_z (i32 addrspace(1)* %out) {
 entry:
@@ -127,12 +127,12 @@ entry:
   ret void
 }
 
-; The tgid values are stored in SGPRs offset by the number of user SGPRs.
-; Currently we always use exactly 2 user SGPRs for the pointer to the
+; The tgid values are stored in ss offset by the number of user ss.
+; Currently we always use exactly 2 user ss for the pointer to the
 ; kernel arguments, but this may change in the future.
 
 ; SI-CHECK: @tgid_x
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], SGPR2
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s2
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @tgid_x (i32 addrspace(1)* %out) {
 entry:
@@ -142,7 +142,7 @@ entry:
 }
 
 ; SI-CHECK: @tgid_y
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], SGPR3
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s3
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @tgid_y (i32 addrspace(1)* %out) {
 entry:
@@ -152,7 +152,7 @@ entry:
 }
 
 ; SI-CHECK: @tgid_z
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], SGPR4
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
 ; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
 define void @tgid_z (i32 addrspace(1)* %out) {
 entry:
@@ -162,7 +162,7 @@ entry:
 }
 
 ; SI-CHECK: @tidig_x
-; SI-CHECK: BUFFER_STORE_DWORD VGPR0
+; SI-CHECK: BUFFER_STORE_DWORD v0
 define void @tidig_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
@@ -171,7 +171,7 @@ entry:
 }
 
 ; SI-CHECK: @tidig_y
-; SI-CHECK: BUFFER_STORE_DWORD VGPR1
+; SI-CHECK: BUFFER_STORE_DWORD v1
 define void @tidig_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
@@ -180,7 +180,7 @@ entry:
 }
 
 ; SI-CHECK: @tidig_z
-; SI-CHECK: BUFFER_STORE_DWORD VGPR2
+; SI-CHECK: BUFFER_STORE_DWORD v2
 define void @tidig_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
diff --git a/test/CodeGen/R600/wrong-transalu-pos-fix.ll b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
new file mode 100644
index 0000000..b1cbe3f
--- /dev/null
+++ b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
@@ -0,0 +1,86 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; We want all MULLO_INT inst to be last in their instruction group
+;CHECK: @fill3d
+;CHECK-NOT: MULLO_INT T[0-9]+
+
+; ModuleID = 'radeon'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+target triple = "r600--"
+
+; Function Attrs: nounwind
+define void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
+entry:
+  %x.i = tail call i32 @llvm.r600.read.global.size.x() #1
+  %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1
+  %mul = mul i32 %y.i18, %x.i
+  %z.i17 = tail call i32 @llvm.r600.read.global.size.z() #1
+  %mul3 = mul i32 %mul, %z.i17
+  %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1
+  %x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1
+  %mul26.i = mul i32 %x.i12.i, %x.i.i
+  %x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1
+  %add.i16 = add i32 %x.i4.i, %mul26.i
+  %mul7 = mul i32 %add.i16, %y.i18
+  %y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1
+  %y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1
+  %mul30.i = mul i32 %y.i14.i, %y.i.i
+  %y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1
+  %add.i14 = add i32 %mul30.i, %mul7
+  %mul819 = add i32 %add.i14, %y.i6.i
+  %add = mul i32 %mul819, %z.i17
+  %z.i.i = tail call i32 @llvm.r600.read.tgid.z() #1
+  %z.i16.i = tail call i32 @llvm.r600.read.local.size.z() #1
+  %mul33.i = mul i32 %z.i16.i, %z.i.i
+  %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1
+  %add.i = add i32 %z.i8.i, %mul33.i
+  %add13 = add i32 %add.i, %add
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %out, i32 %add13
+  store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.z() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.z() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.z() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.global.size.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.global.size.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.global.size.z() #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!opencl.kernels = !{!0, !1, !2}
+
+!0 = metadata !{null}
+!1 = metadata !{null}
+!2 = metadata !{void (i32 addrspace(1)*)* @fill3d}
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index f52729d..c12b0c1 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -1,13 +1,13 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
 
 ;EG-CHECK: @xor_v2i32
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @xor_v2i32
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 
 define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
@@ -19,16 +19,16 @@ define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
 }
 
 ;EG-CHECK: @xor_v4i32
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ;SI-CHECK: @xor_v4i32
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
   %a = load <4 x i32> addrspace(1) * %in0
@@ -37,3 +37,20 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
+
+;EG-CHECK: @xor_i1
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
+
+;SI-CHECK: @xor_i1
+;SI-CHECK: S_XOR_B64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+
+define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+  %a = load float addrspace(1) * %in0
+  %b = load float addrspace(1) * %in1
+  %acmp = fcmp oge float %a, 0.000000e+00
+  %bcmp = fcmp oge float %b, 0.000000e+00
+  %xor = xor i1 %acmp, %bcmp
+  %result = select i1 %xor, float %a, float %b
+  store float %result, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
index 413b849..481b3b3 100644
--- a/test/CodeGen/R600/zero_extend.ll
+++ b/test/CodeGen/R600/zero_extend.ll
@@ -1,13 +1,13 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
 
 ; R600-CHECK: @test
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
 
 ; SI-CHECK: @test
-; SI-CHECK: V_MOV_B32_e32 [[ZERO:VGPR[0-9]]], 0
-; SI-CHECK: BUFFER_STORE_DWORDX2 VGPR0_[[ZERO]]
+; SI-CHECK: V_MOV_B32_e32 v[[ZERO:[0-9]]], 0
+; SI-CHECK: BUFFER_STORE_DWORDX2 v[0:[[ZERO]]{{\]}}
 define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = mul i32 %a, %b
diff --git a/test/CodeGen/SI/sanity.ll b/test/CodeGen/SI/sanity.ll
deleted file mode 100644
index 62cdcf5..0000000
--- a/test/CodeGen/SI/sanity.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
-
-; CHECK: S_ENDPGM
-
-define void @main() {
-main_body:
-  call void @llvm.AMDGPU.shader.type(i32 1)
-  %0 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
-  %1 = getelementptr <4 x i32> addrspace(2)* %0, i32 0
-  %2 = load <4 x i32> addrspace(2)* %1
-  %3 = call i32 @llvm.SI.vs.load.buffer.index()
-  %4 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %2, i32 0, i32 %3)
-  %5 = extractelement <4 x float> %4, i32 0
-  %6 = extractelement <4 x float> %4, i32 1
-  %7 = extractelement <4 x float> %4, i32 2
-  %8 = extractelement <4 x float> %4, i32 3
-  %9 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
-  %10 = getelementptr <4 x i32> addrspace(2)* %9, i32 1
-  %11 = load <4 x i32> addrspace(2)* %10
-  %12 = call i32 @llvm.SI.vs.load.buffer.index()
-  %13 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %11, i32 0, i32 %12)
-  %14 = extractelement <4 x float> %13, i32 0
-  %15 = extractelement <4 x float> %13, i32 1
-  %16 = extractelement <4 x float> %13, i32 2
-  %17 = extractelement <4 x float> %13, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %14, float %15, float %16, float %17)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %5, float %6, float %7, float %8)
-  ret void
-}
-
-declare void @llvm.AMDGPU.shader.type(i32)
-
-declare i32 @llvm.SI.vs.load.buffer.index() readnone
-
-declare <4 x float> @llvm.SI.vs.load.input(<4 x i32>, i32, i32)
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/SPARC/2011-01-11-CC.ll b/test/CodeGen/SPARC/2011-01-11-CC.ll
index edbcb49..50f3a65 100644
--- a/test/CodeGen/SPARC/2011-01-11-CC.ll
+++ b/test/CodeGen/SPARC/2011-01-11-CC.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=sparc <%s | FileCheck %s -check-prefix=V8
 ; RUN: llc -march=sparc -mattr=v9 <%s | FileCheck %s -check-prefix=V9
+; RUN: llc -mtriple=sparc64-unknown-linux <%s | FileCheck %s -check-prefix=SPARC64
 
 
 define i32 @test_addx(i64 %a, i64 %b, i64 %c) nounwind readnone noinline {
@@ -65,9 +66,11 @@ define i32 @test_select_int_fcc(float %f, i32 %a, i32 %b) nounwind readnone noin
 entry:
 ;V8-LABEL: test_select_int_fcc:
 ;V8: fcmps
+;V8-NEXT: nop
 ;V8: {{fbe|fbne}}
 ;V9-LABEL: test_select_int_fcc:
 ;V9: fcmps
+;V9-NEXT-NOT: nop
 ;V9-NOT: {{fbe|fbne}}
 ;V9: mov{{e|ne}} %fcc0
   %0 = fcmp une float %f, 0.000000e+00
@@ -94,12 +97,95 @@ define double @test_select_dfp_fcc(double %f, double %f1, double %f2) nounwind r
 entry:
 ;V8-LABEL: test_select_dfp_fcc:
 ;V8: fcmpd
+;V8-NEXT: nop
 ;V8: {{fbne|fbe}}
 ;V9-LABEL: test_select_dfp_fcc:
 ;V9: fcmpd
+;V9-NEXT-NOT: nop
 ;V9-NOT: {{fbne|fbe}}
 ;V9: fmovd{{e|ne}} %fcc0
   %0 = fcmp une double %f, 0.000000e+00
   %1 = select i1 %0, double %f1, double %f2
   ret double %1
 }
+
+define i32 @test_float_cc(double %a, double %b, i32 %c, i32 %d) {
+entry:
+; V8-LABEL: test_float_cc
+; V8:       fcmpd
+; V8:       {{fbl|fbuge}} .LBB
+; V8:       fcmpd
+; V8:       {{fbule|fbg}} .LBB
+
+; V9-LABEL: test_float_cc
+; V9:       fcmpd
+; V9:       {{fbl|fbuge}} .LBB
+; V9:       fcmpd
+; V9:       {{fbule|fbg}} .LBB
+
+   %0 = fcmp uge double %a, 0.000000e+00
+   br i1 %0, label %loop, label %loop.2
+
+loop:
+   %1 = icmp eq i32 %c, 10
+   br i1 %1, label %loop, label %exit.0
+
+loop.2:
+   %2 = fcmp ogt double %b, 0.000000e+00
+   br i1 %2, label %exit.1, label %loop
+
+exit.0:
+   ret i32 0
+
+exit.1:
+   ret i32 1
+}
+
+; V8-LABEL: test_adde_sube
+; V8:       addcc
+; V8:       addxcc
+; V8:       addxcc
+; V8:       addxcc
+; V8:       subcc
+; V8:       subxcc
+; V8:       subxcc
+; V8:       subxcc
+
+
+; V9-LABEL: test_adde_sube
+; V9:       addcc
+; V9:       addxcc
+; V9:       addxcc
+; V9:       addxcc
+; V9:       subcc
+; V9:       subxcc
+; V9:       subxcc
+; V9:       subxcc
+
+; SPARC64-LABEL: test_adde_sube
+; SPARC64:       addcc
+; SPARC64:       addxcc
+; SPARC64:       addxcc
+; SPARC64:       addxcc
+; SPARC64:       subcc
+; SPARC64:       subxcc
+; SPARC64:       subxcc
+; SPARC64:       subxcc
+
+
+define void @test_adde_sube(i8* %a, i8* %b, i8* %sum, i8* %diff) {
+entry:
+   %0 = bitcast i8* %a to i128*
+   %1 = bitcast i8* %b to i128*
+   %2 = load i128* %0
+   %3 = load i128* %1
+   %4 = add i128 %2, %3
+   %5 = bitcast i8* %sum to i128*
+   store i128 %4, i128* %5
+   tail call void asm sideeffect "", "=*m,*m"(i128 *%0, i128* %5) nounwind
+   %6 = load i128* %0
+   %7 = sub i128 %2, %6
+   %8 = bitcast i8* %diff to i128*
+   store i128 %7, i128* %8
+   ret void
+}
diff --git a/test/CodeGen/SPARC/2011-01-11-Call.ll b/test/CodeGen/SPARC/2011-01-11-Call.ll
index 7350e92..a0f478e 100644
--- a/test/CodeGen/SPARC/2011-01-11-Call.ll
+++ b/test/CodeGen/SPARC/2011-01-11-Call.ll
@@ -1,4 +1,24 @@
 ; RUN: llc -march=sparc -O0 <%s
+; RUN: llc -march=sparc   <%s | FileCheck %s --check-prefix=V8
+; RUN: llc -march=sparcv9 <%s | FileCheck %s --check-prefix=V9
+
+; V8-LABEL: test
+; V8:       save %sp
+; V8:       call foo
+; V8-NEXT:  nop
+; V8:       call bar
+; V8-NEXT:  nop
+; V8:       jmp %i7+8
+; V8-NEXT:  restore
+
+; V9-LABEL: test
+; V9:       save %sp
+; V9:       call foo
+; V9-NEXT:  nop
+; V9:       call bar
+; V9-NEXT:  nop
+; V9:       jmp %i7+8
+; V9-NEXT:  restore
 
 define void @test() nounwind {
 entry:
@@ -11,3 +31,23 @@ declare i32 @foo(...)
 
 declare void @bar(...)
 
+
+; V8-LABEL: test_tail_call_with_return
+; V8:       save %sp
+; V8:       call foo
+; V8-NEXT:  nop
+; V8:       jmp %i7+8
+; V8-NEXT:  restore %g0, %o0, %o0
+
+; V9-LABEL: test_tail_call_with_return
+; V9:       save %sp
+; V9:       call foo
+; V9-NEXT:  nop
+; V9:       jmp %i7+8
+; V9-NEXT:  restore %g0, %o0, %o0
+
+define i32 @test_tail_call_with_return() nounwind {
+entry:
+ %0 = tail call i32 (...)* @foo() nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/SPARC/2013-05-17-CallFrame.ll b/test/CodeGen/SPARC/2013-05-17-CallFrame.ll
index 9e9e821..81f586f 100644
--- a/test/CodeGen/SPARC/2013-05-17-CallFrame.ll
+++ b/test/CodeGen/SPARC/2013-05-17-CallFrame.ll
@@ -1,10 +1,20 @@
-; RUN: llc -march=sparc < %s | FileCheck %s
+; RUN: llc -march=sparc   < %s | FileCheck %s --check-prefix=V8
+; RUN: llc -march=sparcv9 < %s | FileCheck %s --check-prefix=SPARC64
+
+; V8-LABEL: variable_alloca_with_adj_call_stack
+; V8:       save %sp, -96, %sp
+; V8:       add {{.+}}, 96, %o0
+; V8:       add %sp, -16, %sp
+; V8:       call foo
+; V8:       add %sp, 16, %sp
+
+; SPARC64-LABEL: variable_alloca_with_adj_call_stack
+; SPARC64:       save %sp, -128, %sp
+; SPARC64:       add {{.+}}, 2175, %o0
+; SPARC64:       add %sp, -80, %sp
+; SPARC64:       call foo
+; SPARC64:       add %sp, 80, %sp
 
-; CHECK: variable_alloca_with_adj_call_stack
-; CHECK: save %sp, -96, %sp
-; CHECK: add %sp, -16, %sp
-; CHECK: call foo
-; CHECK: add %sp, 16, %sp
 define void @variable_alloca_with_adj_call_stack(i32 %num) {
 entry:
   %0 = alloca i8, i32 %num, align 8
diff --git a/test/CodeGen/SPARC/64abi.ll b/test/CodeGen/SPARC/64abi.ll
index 5a7eb40..8b752a1 100644
--- a/test/CodeGen/SPARC/64abi.ll
+++ b/test/CodeGen/SPARC/64abi.ll
@@ -376,3 +376,38 @@ define signext i32 @ret_nosext(i32 signext %a0) {
 define signext i32 @ret_nozext(i32 signext %a0) {
   ret i32 %a0
 }
+
+; CHECK-LABEL: test_register_directive
+; CHECK:       .register %g2, #scratch
+; CHECK:       .register %g3, #scratch
+; CHECK:       add %i0, 2, %g2
+; CHECK:       add %i0, 3, %g3
+define i32 @test_register_directive(i32 %i0) {
+entry:
+  %0 = add nsw i32 %i0, 2
+  %1 = add nsw i32 %i0, 3
+  tail call void asm sideeffect "", "r,r,~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6},~{o7},~{g1},~{g4},~{g5},~{g6},~{g7}"(i32 %0, i32 %1)
+  %2 = add nsw i32 %0, %1
+  ret i32 %2
+}
+
+; CHECK-LABEL: test_large_stack
+
+; CHECK:       sethi 16, %g1
+; CHECK:       xor %g1, -176, %g1
+; CHECK:       save %sp, %g1, %sp
+
+; CHECK:       sethi 14, %g1
+; CHECK:       xor %g1, -1, %g1
+; CHECK:       add %g1, %fp, %g1
+; CHECK:       call use_buf
+
+define i32 @test_large_stack() {
+entry:
+  %buffer1 = alloca [16384 x i8], align 8
+  %buffer1.sub = getelementptr inbounds [16384 x i8]* %buffer1, i32 0, i32 0
+  %0 = call i32 @use_buf(i32 16384, i8* %buffer1.sub)
+  ret i32 %0
+}
+
+declare i32 @use_buf(i32, i8*)
diff --git a/test/CodeGen/SPARC/64bit.ll b/test/CodeGen/SPARC/64bit.ll
index f778f9d..f5ed047 100644
--- a/test/CodeGen/SPARC/64bit.ll
+++ b/test/CodeGen/SPARC/64bit.ll
@@ -285,3 +285,26 @@ entry:
   store i64 0, i64* %0, align 8
   ret i64 0
 }
+
+; CHECK-LABEL: bit_ops
+; CHECK:       popc
+
+; OPT-LABEL: bit_ops
+; OPT:       popc
+
+define i64 @bit_ops(i64 %arg) {
+entry:
+  %0 = tail call i64 @llvm.ctpop.i64(i64 %arg)
+  %1 = tail call i64 @llvm.ctlz.i64(i64 %arg, i1 true)
+  %2 = tail call i64 @llvm.cttz.i64(i64 %arg, i1 true)
+  %3 = tail call i64 @llvm.bswap.i64(i64 %arg)
+  %4 = add i64 %0, %1
+  %5 = add i64 %2, %3
+  %6 = add i64 %4, %5
+  ret i64 %6
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
+declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
+declare i64 @llvm.bswap.i64(i64) nounwind readnone
diff --git a/test/CodeGen/SPARC/64cond.ll b/test/CodeGen/SPARC/64cond.ll
index bdc5e70..7451b04 100644
--- a/test/CodeGen/SPARC/64cond.ll
+++ b/test/CodeGen/SPARC/64cond.ll
@@ -109,3 +109,17 @@ entry:
   %rv = select i1 %tobool, i64 123, i64 0
   ret i64 %rv
 }
+
+; CHECK-LABEL: setcc_resultty
+; CHECK:       cmp
+; CHECK:       movne %xcc, 1, [[R:%[gilo][0-7]]]
+; CHECK:       or [[R]], %i1, %i0
+
+define i1 @setcc_resultty(i64 %a, i1 %b) {
+  %a0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a, i64 32)
+  %a1 = extractvalue { i64, i1 } %a0, 1
+  %a4 = or i1 %a1, %b
+  ret i1 %a4
+}
+
+declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
diff --git a/test/CodeGen/SPARC/constpool.ll b/test/CodeGen/SPARC/constpool.ll
index d8b7b15..b861676 100644
--- a/test/CodeGen/SPARC/constpool.ll
+++ b/test/CodeGen/SPARC/constpool.ll
@@ -39,8 +39,10 @@ entry:
 ; v8pic32: sethi %hi(.LCPI0_0), %[[R1:[gilo][0-7]]]
 ; v8pic32: add %[[R1]], %lo(.LCPI0_0), %[[Goffs:[gilo][0-7]]]
 ; v8pic32: ld [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
-; v8pic32: jmp %o7+8
 ; v8pic32: ld [%[[Gaddr]]], %f0
+; v8pic32: jmp %i7+8
+; v8pic32: restore
+
 
 
 ; v9pic32: floatCP
@@ -48,6 +50,8 @@ entry:
 ; v9pic32: sethi %hi(.LCPI0_0), %[[R1:[gilo][0-7]]]
 ; v9pic32: add %[[R1]], %lo(.LCPI0_0), %[[Goffs:[gilo][0-7]]]
 ; v9pic32: ldx [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
-; v9pic32: jmp %o7+8
 ; v9pic32: ld [%[[Gaddr]]], %f1
+; v9pic32: jmp %i7+8
+; v9pic32: restore
+
 
diff --git a/test/CodeGen/SPARC/exception.ll b/test/CodeGen/SPARC/exception.ll
new file mode 100644
index 0000000..cb5b6e5
--- /dev/null
+++ b/test/CodeGen/SPARC/exception.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -march=sparc | FileCheck %s
+
+
+%struct.__fundamental_type_info_pseudo = type { %struct.__type_info_pseudo }
+%struct.__type_info_pseudo = type { i8*, i8* }
+
+@_ZTIi = external constant %struct.__fundamental_type_info_pseudo
+@_ZTIf = external constant %struct.__fundamental_type_info_pseudo
+@.cst = linker_private unnamed_addr constant [12 x i8] c"catched int\00", align 64
+@.cst1 = linker_private unnamed_addr constant [14 x i8] c"catched float\00", align 64
+
+; CHECK-LABEL: main:
+; CHECK:       .cfi_startproc
+; CHECK:       .cfi_def_cfa_register 30
+; CHECK:       .cfi_window_save
+; CHECK:       .cfi_register 15, 31
+
+; CHECK:        call __cxa_throw
+; CHECK:        call __cxa_throw
+
+; CHECK:        call __cxa_begin_catch
+; CHECK:        call __cxa_end_catch
+
+; CHECK:        call __cxa_begin_catch
+; CHECK:        call __cxa_end_catch
+
+; CHECK:        .cfi_endproc
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) unnamed_addr #0 {
+entry:
+  %0 = icmp eq i32 %argc, 2
+  %1 = tail call i8* @__cxa_allocate_exception(i32 4) #1
+  br i1 %0, label %"3", label %"4"
+
+"3":                                              ; preds = %entry
+  %2 = bitcast i8* %1 to i32*
+  store i32 0, i32* %2, align 4
+  invoke void @__cxa_throw(i8* %1, i8* bitcast (%struct.__fundamental_type_info_pseudo* @_ZTIi to i8*), void (i8*)* null) #2
+          to label %3 unwind label %"8"
+
+; <label>:3                                       ; preds = %"3"
+  unreachable
+
+"4":                                              ; preds = %entry
+  %4 = bitcast i8* %1 to float*
+  store float 1.000000e+00, float* %4, align 4
+
+
+  invoke void @__cxa_throw(i8* %1, i8* bitcast (%struct.__fundamental_type_info_pseudo* @_ZTIf to i8*), void (i8*)* null) #2
+          to label %5 unwind label %"8"
+
+; <label>:5                                       ; preds = %"4"
+  unreachable
+
+"5":                                              ; preds = %"13", %"11"
+  %6 = phi i32 [ 2, %"13" ], [ 0, %"11" ]
+  ret i32 %6
+
+"8":                                              ; preds = %"4", %"3"
+  %exc = landingpad { i8*, i32 } personality i32 (i32, i64, i8*, i8*)* @__gxx_personality_v0
+          catch %struct.__fundamental_type_info_pseudo* @_ZTIi
+          catch %struct.__fundamental_type_info_pseudo* @_ZTIf
+  %exc_ptr12 = extractvalue { i8*, i32 } %exc, 0
+  %filter13 = extractvalue { i8*, i32 } %exc, 1
+  %typeid = tail call i32 @llvm.eh.typeid.for(i8* bitcast (%struct.__fundamental_type_info_pseudo* @_ZTIi to i8*))
+  %7 = icmp eq i32 %filter13, %typeid
+  br i1 %7, label %"11", label %8
+
+; <label>:8                                       ; preds = %"8"
+  %typeid8 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (%struct.__fundamental_type_info_pseudo* @_ZTIf to i8*))
+  %9 = icmp eq i32 %filter13, %typeid8
+  br i1 %9, label %"13", label %"9"
+
+"9":                                              ; preds = %8
+  resume { i8*, i32 } %exc
+
+"11":                                             ; preds = %"8"
+  %10 = tail call i8* @__cxa_begin_catch(i8* %exc_ptr12) #1
+  %11 = tail call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.cst, i32 0, i32 0))
+  tail call void @__cxa_end_catch() #1
+  br label %"5"
+
+"13":                                             ; preds = %8
+  %12 = tail call i8* @__cxa_begin_catch(i8* %exc_ptr12) #1
+  %13 = tail call i32 @puts(i8* getelementptr inbounds ([14 x i8]* @.cst1, i32 0, i32 0))
+  tail call void @__cxa_end_catch() #1
+  br label %"5"
+}
+
+; Function Attrs: nounwind
+declare i8* @__cxa_allocate_exception(i32) #1
+
+; Function Attrs: noreturn
+declare void @__cxa_throw(i8*, i8*, void (i8*)*) #2
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*) #3
+
+; Function Attrs: nounwind
+declare i8* @__cxa_begin_catch(i8*) #1
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly) #1
+
+declare i32 @__gxx_personality_v0(i32, i64, i8*, i8*)
+
+attributes #0 = { "no-frame-pointer-elim-non-leaf"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { noreturn }
+attributes #3 = { nounwind readnone }
diff --git a/test/CodeGen/SPARC/float.ll b/test/CodeGen/SPARC/float.ll
index 8dfd371..6636704 100644
--- a/test/CodeGen/SPARC/float.ll
+++ b/test/CodeGen/SPARC/float.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=sparc < %s | FileCheck %s -check-prefix=V8
 ; RUN: llc -march=sparc -O0 < %s | FileCheck %s -check-prefix=V8-UNOPT
 ; RUN: llc -march=sparc -mattr=v9 < %s | FileCheck %s -check-prefix=V9
-
+; RUN: llc -mtriple=sparc64-unknown-linux < %s | FileCheck %s -check-prefix=SPARC64
 
 ; V8-LABEL:     test_neg:
 ; V8:     call get_double
@@ -16,6 +16,9 @@
 ; V9-LABEL:     test_neg:
 ; V9:     fnegd %f0, %f0
 
+; SPARC64-LABEL: test_neg:
+; SPARC64:       fnegd %f0, %f0
+
 define double @test_neg() {
 entry:
   %0 = tail call double @get_double()
@@ -35,6 +38,10 @@ entry:
 ; V9-LABEL:     test_abs:
 ; V9:     fabsd %f0, %f0
 
+
+; SPARC64-LABEL:     test_abs:
+; SPARC64:     fabsd %f0, %f0
+
 define double @test_abs() {
 entry:
   %0 = tail call double @get_double()
@@ -45,3 +52,198 @@ entry:
 declare double @get_double()
 declare double @llvm.fabs.f64(double) nounwind readonly
 
+; V8-LABEL:    test_v9_floatreg:
+; V8:          fsubd {{.+}}, {{.+}}, {{.+}}
+; V8:          faddd {{.+}}, {{.+}}, [[R:%f(((1|2)?(0|2|4|6|8))|30)]]
+; V8:          std [[R]], [%{{.+}}]
+; V8:          ldd [%{{.+}}], %f0
+
+; V9-LABEL:    test_v9_floatreg:
+; V9:          fsubd {{.+}}, {{.+}}, {{.+}}
+; V9:          faddd {{.+}}, {{.+}}, [[R:%f((3(2|4|6|8))|((4|5)(0|2|4|6|8))|(60|62))]]
+; V9:          fmovd [[R]], %f0
+
+; SPARC64-LABEL:    test_v9_floatreg:
+; SPARC64:          fsubd {{.+}}, {{.+}}, {{.+}}
+; SPARC64:          faddd {{.+}}, {{.+}}, [[R:%f((3(2|4|6|8))|((4|5)(0|2|4|6|8))|(60|62))]]
+; SPARC64:          fmovd [[R]], %f0
+
+define double @test_v9_floatreg() {
+entry:
+  %0 = tail call double @get_double()
+  %1 = tail call double @get_double()
+  %2 = fsub double %0, %1
+  tail call void asm sideeffect "", "~{f0},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"()
+  %3 = fadd double %2, %2
+  ret double %3
+}
+
+; V8-LABEL:    test_xtos_stox
+; V8:          call __floatdisf
+; V8:          call __fixsfdi
+
+; V9-LABEL:    test_xtos_stox
+; V9:          call __floatdisf
+; V9:          call __fixsfdi
+
+; SPARC64-LABEL:    test_xtos_stox
+; SPARC64:          fxtos
+; SPARC64:          fstox
+
+define void @test_xtos_stox(i64 %a, i64* %ptr0, float* %ptr1) {
+entry:
+  %0 = sitofp i64 %a to float
+  store float %0, float* %ptr1, align 8
+  %1 = fptosi float %0 to i64
+  store i64 %1, i64* %ptr0, align 8
+  ret void
+}
+
+; V8-LABEL:    test_itos_stoi
+; V8:          fitos
+; V8:          fstoi
+
+; V9-LABEL:    test_itos_stoi
+; V9:          fitos
+; V9:          fstoi
+
+; SPARC64-LABEL:    test_itos_stoi
+; SPARC64:          fitos
+; SPARC64:          fstoi
+
+define void @test_itos_stoi(i32 %a, i32* %ptr0, float* %ptr1) {
+entry:
+  %0 = sitofp i32 %a to float
+  store float %0, float* %ptr1, align 8
+  %1 = fptosi float %0 to i32
+  store i32 %1, i32* %ptr0, align 8
+  ret void
+}
+
+
+; V8-LABEL:    test_xtod_dtox
+; V8:          call __floatdidf
+; V8:          call __fixdfdi
+
+; V9-LABEL:    test_xtod_dtox
+; V9:          call __floatdidf
+; V9:          call __fixdfdi
+
+; SPARC64-LABEL:    test_xtod_dtox
+; SPARC64:          fxtod
+; SPARC64:          fdtox
+
+define void @test_xtod_dtox(i64 %a, i64* %ptr0, double* %ptr1) {
+entry:
+  %0 = sitofp i64 %a to double
+  store double %0, double* %ptr1, align 8
+  %1 = fptosi double %0 to i64
+  store i64 %1, i64* %ptr0, align 8
+  ret void
+}
+
+; V8-LABEL:    test_itod_dtoi
+; V8:          fitod
+; V8:          fdtoi
+
+; V9-LABEL:    test_itod_dtoi
+; V9:          fitod
+; V9:          fdtoi
+
+; SPARC64-LABEL:    test_itod_dtoi
+; SPARC64:          fitod
+; SPARC64:          fdtoi
+
+define void @test_itod_dtoi(i32 %a, i32* %ptr0, double* %ptr1) {
+entry:
+  %0 = sitofp i32 %a to double
+  store double %0, double* %ptr1, align 8
+  %1 = fptosi double %0 to i32
+  store i32 %1, i32* %ptr0, align 8
+  ret void
+}
+
+; V8-LABEL:    test_uxtos_stoux
+; V8:          call __floatundisf
+; V8:          call __fixunssfdi
+
+; V9-LABEL:    test_uxtos_stoux
+; V9:          call __floatundisf
+; V9:          call __fixunssfdi
+
+; SPARC64-LABEL:   test_uxtos_stoux
+; SPARC64-NOT:     call __floatundisf
+; SPARC64-NOT:     call __fixunssfdi
+
+define void @test_uxtos_stoux(i64 %a, i64* %ptr0, float* %ptr1) {
+entry:
+  %0 = uitofp i64 %a to float
+  store float %0, float* %ptr1, align 8
+  %1 = fptoui float %0 to i64
+  store i64 %1, i64* %ptr0, align 8
+  ret void
+}
+
+; V8-LABEL:    test_utos_stou
+; V8:          fdtos
+; V8:          fstoi
+
+; V9-LABEL:    test_utos_stou
+; V9:          fdtos
+; V9:          fstoi
+
+; SPARC64-LABEL:    test_utos_stou
+; SPARC64:     fdtos
+; SPARC64:     fstoi
+
+define void @test_utos_stou(i32 %a, i32* %ptr0, float* %ptr1) {
+entry:
+  %0 = uitofp i32 %a to float
+  store float %0, float* %ptr1, align 8
+  %1 = fptoui float %0 to i32
+  store i32 %1, i32* %ptr0, align 8
+  ret void
+}
+
+
+; V8-LABEL:    test_uxtod_dtoux
+; V8:          call __floatundidf
+; V8:          call __fixunsdfdi
+
+; V9-LABEL:    test_uxtod_dtoux
+; V9:          call __floatundidf
+; V9:          call __fixunsdfdi
+
+; SPARC64-LABEL:    test_uxtod_dtoux
+; SPARC64-NOT:          call __floatundidf
+; SPARC64-NOT:          call __floatunsdfdi
+
+define void @test_uxtod_dtoux(i64 %a, i64* %ptr0, double* %ptr1) {
+entry:
+  %0 = uitofp i64 %a to double
+  store double %0, double* %ptr1, align 8
+  %1 = fptoui double %0 to i64
+  store i64 %1, i64* %ptr0, align 8
+  ret void
+}
+
+; V8-LABEL:    test_utod_dtou
+; V8-NOT:      fitod
+; V8:          fdtoi
+
+; V9-LABEL:    test_utod_dtou
+; V9-NOT:      fitod
+; V9:          fdtoi
+
+; SPARC64-LABEL:    test_utod_dtou
+; SPARC64-NOT:      fitod
+; SPARC64:          fdtoi
+
+define void @test_utod_dtou(i32 %a, double %b, i32* %ptr0, double* %ptr1) {
+entry:
+  %0 = uitofp i32 %a to double
+  store double %0, double* %ptr1, align 8
+  %1 = fptoui double %b to i32
+  store i32 %1, i32* %ptr0, align 8
+  ret void
+}
diff --git a/test/CodeGen/SPARC/fp128.ll b/test/CodeGen/SPARC/fp128.ll
new file mode 100644
index 0000000..c761361
--- /dev/null
+++ b/test/CodeGen/SPARC/fp128.ll
@@ -0,0 +1,234 @@
+; RUN: llc < %s -march=sparc -mattr=hard-quad-float | FileCheck %s --check-prefix=HARD
+; RUN: llc < %s -march=sparc -mattr=-hard-quad-float | FileCheck %s --check-prefix=SOFT
+
+
+; HARD-LABEL: f128_ops
+; HARD:       ldd
+; HARD:       ldd
+; HARD:       ldd
+; HARD:       ldd
+; HARD:       faddq [[R0:.+]],  [[R1:.+]],  [[R2:.+]]
+; HARD:       fsubq [[R2]], [[R3:.+]], [[R4:.+]]
+; HARD:       fmulq [[R4]], [[R5:.+]], [[R6:.+]]
+; HARD:       fdivq [[R6]], [[R2]]
+; HARD:       std
+; HARD:       std
+
+; SOFT-LABEL: f128_ops
+; SOFT:       ldd
+; SOFT:       ldd
+; SOFT:       ldd
+; SOFT:       ldd
+; SOFT:       call _Q_add
+; SOFT:       call _Q_sub
+; SOFT:       call _Q_mul
+; SOFT:       call _Q_div
+; SOFT:       std
+; SOFT:       std
+
+define void @f128_ops(fp128* noalias sret %scalar.result, fp128* byval %a, fp128* byval %b, fp128* byval %c, fp128* byval %d) {
+entry:
+  %0 = load fp128* %a, align 8
+  %1 = load fp128* %b, align 8
+  %2 = load fp128* %c, align 8
+  %3 = load fp128* %d, align 8
+  %4 = fadd fp128 %0, %1
+  %5 = fsub fp128 %4, %2
+  %6 = fmul fp128 %5, %3
+  %7 = fdiv fp128 %6, %4
+  store fp128 %7, fp128* %scalar.result, align 8
+  ret void
+}
+
+; HARD-LABEL: f128_spill
+; HARD:       std %f{{.+}}, [%[[S0:.+]]]
+; HARD:       std %f{{.+}}, [%[[S1:.+]]]
+; HARD-DAG:   ldd [%[[S0]]], %f{{.+}}
+; HARD-DAG:   ldd [%[[S1]]], %f{{.+}}
+; HARD:       jmp
+
+; SOFT-LABEL: f128_spill
+; SOFT:       std %f{{.+}}, [%[[S0:.+]]]
+; SOFT:       std %f{{.+}}, [%[[S1:.+]]]
+; SOFT-DAG:   ldd [%[[S0]]], %f{{.+}}
+; SOFT-DAG:   ldd [%[[S1]]], %f{{.+}}
+; SOFT:       jmp
+
+define void @f128_spill(fp128* noalias sret %scalar.result, fp128* byval %a) {
+entry:
+  %0 = load fp128* %a, align 8
+  call void asm sideeffect "", "~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"()
+  store fp128 %0, fp128* %scalar.result, align 8
+  ret void
+}
+
+; HARD-LABEL: f128_compare
+; HARD:       fcmpq
+; HARD-NEXT:  nop
+
+; SOFT-LABEL: f128_compare
+; SOFT:       _Q_cmp
+
+define i32 @f128_compare(fp128* byval %f0, fp128* byval %f1, i32 %a, i32 %b) {
+entry:
+   %0 = load fp128* %f0, align 8
+   %1 = load fp128* %f1, align 8
+   %cond = fcmp ult fp128 %0, %1
+   %ret = select i1 %cond, i32 %a, i32 %b
+   ret i32 %ret
+}
+
+; HARD-LABEL: f128_compare2
+; HARD:       fcmpq
+; HARD:       fb{{ule|g}}
+
+; SOFT-LABEL: f128_compare2
+; SOFT:       _Q_cmp
+; SOFT:       cmp
+
+define i32 @f128_compare2() {
+entry:
+  %0 = fcmp ogt fp128 undef, 0xL00000000000000000000000000000000
+  br i1 %0, label %"5", label %"7"
+
+"5":                                              ; preds = %entry
+  ret i32 0
+
+"7":                                              ; preds = %entry
+  ret i32 1
+}
+
+
+; HARD-LABEL: f128_abs
+; HARD:       fabss
+
+; SOFT-LABEL: f128_abs
+; SOFT:       fabss
+
+define void @f128_abs(fp128* noalias sret %scalar.result, fp128* byval %a) {
+entry:
+  %0 = load fp128* %a, align 8
+  %1 = tail call fp128 @llvm.fabs.f128(fp128 %0)
+  store fp128 %1, fp128* %scalar.result, align 8
+  ret void
+}
+
+declare fp128 @llvm.fabs.f128(fp128) nounwind readonly
+
+; HARD-LABEL: int_to_f128
+; HARD:       fitoq
+
+; SOFT-LABEL: int_to_f128
+; SOFT:       _Q_itoq
+
+define void @int_to_f128(fp128* noalias sret %scalar.result, i32 %i) {
+entry:
+  %0 = sitofp i32 %i to fp128
+  store fp128 %0, fp128* %scalar.result, align 8
+  ret void
+}
+
+; HARD-LABEL: fp128_unaligned
+; HARD:       ldub
+; HARD:       faddq
+; HARD:       stb
+; HARD:       jmp
+
+; SOFT-LABEL: fp128_unaligned
+; SOFT:       ldub
+; SOFT:       call _Q_add
+; SOFT:       stb
+; SOFT:       jmp
+
+define void @fp128_unaligned(fp128* %a, fp128* %b, fp128* %c) {
+entry:
+  %0 = load fp128* %a, align 1
+  %1 = load fp128* %b, align 1
+  %2 = fadd fp128 %0, %1
+  store fp128 %2, fp128* %c, align 1
+  ret void
+}
+
+; HARD-LABEL: uint_to_f128
+; HARD:       fdtoq
+
+; SOFT-LABEL: uint_to_f128
+; SOFT:       _Q_utoq
+
+define void @uint_to_f128(fp128* noalias sret %scalar.result, i32 %i) {
+entry:
+  %0 = uitofp i32 %i to fp128
+  store fp128 %0, fp128* %scalar.result, align 8
+  ret void
+}
+
+; HARD-LABEL: f128_to_i32
+; HARD:       fqtoi
+; HARD:       fqtoi
+
+; SOFT-LABEL: f128_to_i32
+; SOFT:       call _Q_qtou
+; SOFT:       call _Q_qtoi
+
+
+define i32 @f128_to_i32(fp128* %a, fp128* %b) {
+entry:
+  %0 = load fp128* %a, align 8
+  %1 = load fp128* %b, align 8
+  %2 = fptoui fp128 %0 to i32
+  %3 = fptosi fp128 %1 to i32
+  %4 = add i32 %2, %3
+  ret i32 %4
+}
+
+; HARD-LABEL:    test_itoq_qtoi
+; HARD:          call _Q_lltoq
+; HARD:          call _Q_qtoll
+; HARD:          fitoq
+; HARD:          fqtoi
+
+; SOFT-LABEL:    test_itoq_qtoi
+; SOFT:          call _Q_lltoq
+; SOFT:          call _Q_qtoll
+; SOFT:          call _Q_itoq
+; SOFT:          call _Q_qtoi
+
+define void @test_itoq_qtoi(i64 %a, i32 %b, i64* %ptr0, fp128* %ptr1) {
+entry:
+  %0 = sitofp i64 %a to fp128
+  store  fp128 %0, fp128* %ptr1, align 8
+  %1 = fptosi fp128 %0 to i64
+  store  i64 %1, i64* %ptr0, align 8
+  %2 = sitofp i32 %b to fp128
+  store  fp128 %2, fp128* %ptr1, align 8
+  %3 = fptosi fp128 %2 to i32
+  %4 = bitcast i64* %ptr0 to i32*
+  store  i32 %3, i32* %4, align 8
+  ret void
+}
+
+; HARD-LABEL:    test_utoq_qtou
+; HARD-DAG:      call _Q_ulltoq
+; HARD-DAG:      call _Q_qtoull
+; HARD-DAG:      fdtoq
+; HARD-DAG:      fqtoi
+
+; SOFT-LABEL:    test_utoq_qtou
+; SOFT-DAG:      call _Q_ulltoq
+; SOFT-DAG:      call _Q_qtoull
+; SOFT-DAG:      call _Q_utoq
+; SOFT-DAG:      call _Q_qtou
+
+define void @test_utoq_qtou(i64 %a, i32 %b, i64* %ptr0, fp128* %ptr1) {
+entry:
+  %0 = uitofp i64 %a to fp128
+  store  fp128 %0, fp128* %ptr1, align 8
+  %1 = fptoui fp128 %0 to i64
+  store  i64 %1, i64* %ptr0, align 8
+  %2 = uitofp i32 %b to fp128
+  store  fp128 %2, fp128* %ptr1, align 8
+  %3 = fptoui fp128 %2 to i32
+  %4 = bitcast i64* %ptr0 to i32*
+  store  i32 %3, i32* %4, align 8
+  ret void
+}
diff --git a/test/CodeGen/SPARC/globals.ll b/test/CodeGen/SPARC/globals.ll
index 0e0dfc8..7e3effe 100644
--- a/test/CodeGen/SPARC/globals.ll
+++ b/test/CodeGen/SPARC/globals.ll
@@ -41,8 +41,9 @@ define zeroext i8 @loadG() {
 ; v8pic32: sethi %hi(G), %[[R1:[gilo][0-7]]]
 ; v8pic32: add %[[R1]], %lo(G), %[[Goffs:[gilo][0-7]]]
 ; v8pic32: ld [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
-; v8pic32: jmp %o7+8
-; v8pic32: ldub [%[[Gaddr]]], %o0
+; v8pic32: ldub [%[[Gaddr]]], %i0
+; v8pic32: jmp %i7+8
+; v8pic32: restore
 
 
 ; v9pic32: loadG
@@ -50,6 +51,7 @@ define zeroext i8 @loadG() {
 ; v9pic32: sethi %hi(G), %[[R1:[gilo][0-7]]]
 ; v9pic32: add %[[R1]], %lo(G), %[[Goffs:[gilo][0-7]]]
 ; v9pic32: ldx [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
-; v9pic32: jmp %o7+8
-; v9pic32: ldub [%[[Gaddr]]], %o0
+; v9pic32: ldub [%[[Gaddr]]], %i0
+; v9pic32: jmp %i7+8
+; v9pic32: restore
 
diff --git a/test/CodeGen/SPARC/lit.local.cfg b/test/CodeGen/SPARC/lit.local.cfg
index 6f30a87..4d344fa 100644
--- a/test/CodeGen/SPARC/lit.local.cfg
+++ b/test/CodeGen/SPARC/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
 targets = set(config.root.targets_to_build.split())
 if not 'Sparc' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/SPARC/rem.ll b/test/CodeGen/SPARC/rem.ll
new file mode 100644
index 0000000..abef1fc
--- /dev/null
+++ b/test/CodeGen/SPARC/rem.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -march=sparcv9 | FileCheck %s
+
+; CHECK-LABEL: test1:
+; CHECK:        sdivx %o0, %o1, %o2
+; CHECK-NEXT:   mulx %o2, %o1, %o1
+; CHECK-NEXT:   jmp %o7+8
+; CHECK-NEXT:   sub %o0, %o1, %o0
+
+define i64 @test1(i64 %X, i64 %Y) {
+        %tmp1 = srem i64 %X, %Y
+        ret i64 %tmp1
+}
+
+; CHECK-LABEL: test2:
+; CHECK:        udivx %o0, %o1, %o2
+; CHECK-NEXT:   mulx %o2, %o1, %o1
+; CHECK-NEXT:   jmp %o7+8
+; CHECK-NEXT:   sub %o0, %o1, %o0
+
+define i64 @test2(i64 %X, i64 %Y) {
+        %tmp1 = urem i64 %X, %Y
+        ret i64 %tmp1
+}
+
+; PR18150
+; CHECK-LABEL: test3
+; CHECK:       sethi 2545, [[R0:%[gilo][0-7]]]
+; CHECK:       or    [[R0]], 379, [[R1:%[gilo][0-7]]]
+; CHECK:       mulx  %o0, [[R1]], [[R2:%[gilo][0-7]]]
+; CHECK:       udivx [[R2]], 1021, [[R3:%[gilo][0-7]]]
+; CHECK:       mulx  [[R3]], 1021, [[R4:%[gilo][0-7]]]
+; CHECK:       sub   [[R2]], [[R4]], %o0
+
+define i64 @test3(i64 %b) {
+entry:
+  %mul = mul i64 %b, 2606459
+  %rem = urem i64 %mul, 1021
+  ret i64 %rem
+}
diff --git a/test/CodeGen/SPARC/setjmp.ll b/test/CodeGen/SPARC/setjmp.ll
new file mode 100644
index 0000000..39984fb
--- /dev/null
+++ b/test/CodeGen/SPARC/setjmp.ll
@@ -0,0 +1,72 @@
+;RUN: llc -march=sparc   < %s | FileCheck %s
+;RUN: llc -march=sparcv9 < %s | FileCheck %s --check-prefix=V9
+
+
+%0 = type { [32 x i32] }
+%struct.jmpbuf_env = type { i32, i32, [1 x %struct.__jmp_buf_tag], i32 }
+%struct.__jmp_buf_tag = type { [3 x i32], i32, %0 }
+
+@jenv = common unnamed_addr global %struct.jmpbuf_env* null
+@.cst = linker_private unnamed_addr constant [30 x i8] c"in bar with jmp_buf's id: %d\0A\00", align 64
+
+; CHECK-LABEL: foo
+; CHECK-DAG:   st {{.+}}, [%i0]
+; CHECK-DAG:   st {{.+}}, [%i0+4]
+; CHECK:       call _setjmp
+; CHECK:       ld [%fp+{{.+}}], %[[R:[gilo][0-7]]]
+; CHECK:       st %o0, [%[[R]]+{{.+}}]
+
+; V9-LABEL:   foo
+; V9-DAG:     st {{.+}}, [%i0]
+; V9-DAG:     st {{.+}}, [%i0+4]
+; V9:         call _setjmp
+; V9:         ldx [%fp+{{.+}}], %[[R:[gilo][0-7]]]
+; V9:         st %o0, [%[[R]]+{{.+}}]
+
+; Function Attrs: nounwind
+define i32 @foo(%struct.jmpbuf_env* byval %inbuf) #0 {
+entry:
+  %0 = getelementptr inbounds %struct.jmpbuf_env* %inbuf, i32 0, i32 0
+  store i32 0, i32* %0, align 4, !tbaa !4
+  %1 = getelementptr inbounds %struct.jmpbuf_env* %inbuf, i32 0, i32 1
+  store i32 1, i32* %1, align 4, !tbaa !4
+  %2 = getelementptr inbounds %struct.jmpbuf_env* %inbuf, i32 0, i32 2, i32 0
+  %3 = call i32 @_setjmp(%struct.__jmp_buf_tag* %2) #2
+  %4 = getelementptr inbounds %struct.jmpbuf_env* %inbuf, i32 0, i32 3
+  store i32 %3, i32* %4, align 4, !tbaa !4
+  store %struct.jmpbuf_env* %inbuf, %struct.jmpbuf_env** @jenv, align 4, !tbaa !3
+  %5 = load i32* %1, align 4, !tbaa !4
+  %6 = icmp eq i32 %5, 1
+  %7 = icmp eq i32 %3, 0
+  %or.cond = and i1 %6, %7
+  br i1 %or.cond, label %"4.i", label %bar.exit
+
+"4.i":                                            ; preds = %entry
+  call void @longjmp(%struct.__jmp_buf_tag* %2, i32 0) #1
+  unreachable
+
+bar.exit:                                         ; preds = %entry
+  %8 = load i32* %0, align 4, !tbaa !4
+  %9 = call i32 (i8*, ...)* @printf(i8* noalias getelementptr inbounds ([30 x i8]* @.cst, i32 0, i32 0), i32 %8) #0
+  ret i32 0
+}
+
+; Function Attrs: nounwind returns_twice
+declare i32 @_setjmp(%struct.__jmp_buf_tag*) #2
+
+; Function Attrs: noreturn nounwind
+declare void @longjmp(%struct.__jmp_buf_tag*, i32) #1
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture, ...) #0
+
+
+attributes #0 = { nounwind }
+attributes #1 = { noreturn nounwind }
+attributes #2 = { nounwind returns_twice }
+
+!0 = metadata !{metadata !"alias set 6: struct.jmpbuf_env*", metadata !1}
+!1 = metadata !{metadata !1}
+!2 = metadata !{metadata !"alias set 3: int", metadata !1}
+!3 = metadata !{metadata !0, metadata !0, i64 0}
+!4 = metadata !{metadata !2, metadata !2, i64 0}
diff --git a/test/CodeGen/SPARC/tls.ll b/test/CodeGen/SPARC/tls.ll
new file mode 100644
index 0000000..660ddff
--- /dev/null
+++ b/test/CodeGen/SPARC/tls.ll
@@ -0,0 +1,73 @@
+; RUN: llc <%s -march=sparc   -relocation-model=static | FileCheck %s --check-prefix=v8abs
+; RUN: llc <%s -march=sparcv9 -relocation-model=static | FileCheck %s --check-prefix=v9abs
+; RUN: llc <%s -march=sparc   -relocation-model=pic    | FileCheck %s --check-prefix=pic
+; RUN: llc <%s -march=sparcv9 -relocation-model=pic    | FileCheck %s --check-prefix=pic
+
+
+@local_symbol = internal thread_local global i32 0
+@extern_symbol = external thread_local global i32
+
+; v8abs-LABEL:  test_tls_local
+; v8abs:        sethi  %tle_hix22(local_symbol), [[R0:%[goli][0-7]]]
+; v8abs:        xor    [[R0]], %tle_lox10(local_symbol), [[R1:%[goli][0-7]]]
+; v8abs:        ld     [%g7+[[R1]]]
+
+; v9abs-LABEL:  test_tls_local
+; v9abs:        sethi  %tle_hix22(local_symbol), [[R0:%[goli][0-7]]]
+; v9abs:        xor    [[R0]], %tle_lox10(local_symbol), [[R1:%[goli][0-7]]]
+; v9abs:        ld     [%g7+[[R1]]]
+
+; pic-LABEL:  test_tls_local
+; pic:        or     {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[PC:%[goli][0-7]]]
+; pic:        add    [[PC]], %o7, [[GOTBASE:%[goli][0-7]]]
+; pic-DAG:    sethi  %tldm_hi22(local_symbol), [[R0:%[goli][0-7]]]
+; pic-DAG:    add    [[R0]], %tldm_lo10(local_symbol), [[R1:%[goli][0-7]]]
+; pic-DAG:    add    [[GOTBASE]], [[R1]], %o0, %tldm_add(local_symbol)
+; pic-DAG:    call   __tls_get_addr, %tldm_call(local_symbol)
+; pic-DAG:    sethi  %tldo_hix22(local_symbol), [[R2:%[goli][0-7]]]
+; pic-DAG:    xor    [[R2]], %tldo_lox10(local_symbol), [[R3:%[goli][0-7]]]
+; pic:        add    %o0, [[R3]], {{.+}}, %tldo_add(local_symbol)
+
+define i32 @test_tls_local() {
+entry:
+  %0 = load i32* @local_symbol, align 4
+  %1 = add i32 %0, 1
+  store i32 %1, i32* @local_symbol, align 4
+  ret i32 %1
+}
+
+
+; v8abs-LABEL:  test_tls_extern
+; v8abs:        or     {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[PC:%[goli][0-7]]]
+; v8abs:        add    [[PC]], %o7, %[[GOTBASE:[goli][0-7]]]
+; v8abs:        sethi  %tie_hi22(extern_symbol), [[R1:%[goli][0-7]]]
+; v8abs:        add    [[R1]], %tie_lo10(extern_symbol), %[[R2:[goli][0-7]]]
+; v8abs:        ld     [%[[GOTBASE]]+%[[R2]]], [[R3:%[goli][0-7]]], %tie_ld(extern_symbol)
+; v8abs:        add    %g7, [[R3]], %[[R4:[goli][0-7]]], %tie_add(extern_symbol)
+; v8abs:        ld     [%[[R4]]]
+
+; v9abs-LABEL:  test_tls_extern
+; v9abs:        or     {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[PC:%[goli][0-7]]]
+; v9abs:        add    [[PC]], %o7, %[[GOTBASE:[goli][0-7]]]
+; v9abs:        sethi  %tie_hi22(extern_symbol), [[R1:%[goli][0-7]]]
+; v9abs:        add    [[R1]], %tie_lo10(extern_symbol), %[[R2:[goli][0-7]]]
+; v9abs:        ldx    [%[[GOTBASE]]+%[[R2]]], [[R3:%[goli][0-7]]], %tie_ldx(extern_symbol)
+; v9abs:        add    %g7, [[R3]], %[[R4:[goli][0-7]]], %tie_add(extern_symbol)
+; v9abs:        ld     [%[[R4]]]
+
+; pic-LABEL:  test_tls_extern
+; pic:        or     {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[PC:%[goli][0-7]]]
+; pic:        add    [[PC]], %o7, [[GOTBASE:%[goli][0-7]]]
+; pic:        sethi  %tgd_hi22(extern_symbol), [[R0:%[goli][0-7]]]
+; pic:        add    [[R0]], %tgd_lo10(extern_symbol), [[R1:%[goli][0-7]]]
+; pic:        add    [[GOTBASE]], [[R1]], %o0, %tgd_add(extern_symbol)
+; pic:        call   __tls_get_addr, %tgd_call(extern_symbol)
+; pic-NEXT:   nop
+
+define i32 @test_tls_extern() {
+entry:
+  %0 = load i32* @extern_symbol, align 4
+  %1 = add i32 %0, 1
+  store i32 %1, i32* @extern_symbol, align 4
+  ret i32 %1
+}
diff --git a/test/CodeGen/SystemZ/Large/branch-range-09.py b/test/CodeGen/SystemZ/Large/branch-range-09.py
new file mode 100644
index 0000000..b3fd813
--- /dev/null
+++ b/test/CodeGen/SystemZ/Large/branch-range-09.py
@@ -0,0 +1,107 @@
+# Test 32-bit COMPARE LOGICAL AND BRANCH in cases where the sheer number of
+# instructions causes some branches to be out of range.
+# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
+
+# Construct:
+#
+# before0:
+#   conditional branch to after0
+#   ...
+# beforeN:
+#   conditional branch to after0
+# main:
+#   0xffcc bytes, from MVIY instructions
+#   conditional branch to main
+# after0:
+#   ...
+#   conditional branch to main
+# afterN:
+#
+# Each conditional branch sequence occupies 12 bytes if it uses a short
+# branch and 14 if it uses a long one.  The ones before "main:" have to
+# take the branch length into account, which is 6 for short branches,
+# so the final (0x34 - 6) / 12 == 3 blocks can use short branches.
+# The ones after "main:" do not, so the first 0x34 / 12 == 4 blocks
+# can use short branches.
+#
+# CHECK: lb [[REG:%r[0-5]]], 0(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL:\.L[^ ]*]]
+# CHECK: lb [[REG:%r[0-5]]], 1(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 2(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 3(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 4(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 5(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 6(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 7(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# ...main goes here...
+# CHECK: lb [[REG:%r[0-5]]], 25(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL:\.L[^ ]*]]
+# CHECK: lb [[REG:%r[0-5]]], 26(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 27(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 28(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 29(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 30(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 31(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 32(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+
+branch_blocks = 8
+main_size = 0xffcc
+
+print 'define void @f1(i8 *%base, i8 *%stop, i32 %limit) {'
+print 'entry:'
+print '  br label %before0'
+print ''
+
+for i in xrange(branch_blocks):
+    next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
+    print 'before%d:' % i
+    print '  %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
+    print '  %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
+    print '  %%bext%d = sext i8 %%bcur%d to i32' % (i, i)
+    print '  %%btest%d = icmp ult i32 %%limit, %%bext%d' % (i, i)
+    print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
+    print ''
+
+print '%s:' % next
+a, b = 1, 1
+for i in xrange(0, main_size, 6):
+    a, b = b, a + b
+    offset = 4096 + b % 500000
+    value = a % 256
+    print '  %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset)
+    print '  store volatile i8 %d, i8 *%%ptr%d' % (value, i)
+
+for i in xrange(branch_blocks):
+    print '  %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
+    print '  %%acur%d = load volatile i8 *%%astop%d' % (i, i)
+    print '  %%aext%d = sext i8 %%acur%d to i32' % (i, i)
+    print '  %%atest%d = icmp ult i32 %%limit, %%aext%d' % (i, i)
+    print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
+    print ''
+    print 'after%d:' % i
+
+print '  ret void'
+print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-10.py b/test/CodeGen/SystemZ/Large/branch-range-10.py
new file mode 100644
index 0000000..3aeea3e
--- /dev/null
+++ b/test/CodeGen/SystemZ/Large/branch-range-10.py
@@ -0,0 +1,111 @@
+# Test 64-bit COMPARE LOGICAL AND BRANCH in cases where the sheer number of
+# instructions causes some branches to be out of range.
+# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
+
+# Construct:
+#
+# before0:
+#   conditional branch to after0
+#   ...
+# beforeN:
+#   conditional branch to after0
+# main:
+#   0xffcc bytes, from MVIY instructions
+#   conditional branch to main
+# after0:
+#   ...
+#   conditional branch to main
+# afterN:
+#
+# Each conditional branch sequence occupies 12 bytes if it uses a short
+# branch and 16 if it uses a long one.  The ones before "main:" have to
+# take the branch length into account, which is 6 for short branches,
+# so the final (0x34 - 6) / 12 == 3 blocks can use short branches.
+# The ones after "main:" do not, so the first 0x34 / 12 == 4 blocks
+# can use short branches.  The conservative algorithm we use makes
+# one of the forward branches unnecessarily long, as noted in the
+# check output below.
+#
+# CHECK: lgb [[REG:%r[0-5]]], 0(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL:\.L[^ ]*]]
+# CHECK: lgb [[REG:%r[0-5]]], 1(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 2(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 3(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 4(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# ...as mentioned above, the next one could be a CLGRJL instead...
+# CHECK: lgb [[REG:%r[0-5]]], 5(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 6(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 7(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL]]
+# ...main goes here...
+# CHECK: lgb [[REG:%r[0-5]]], 25(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL:\.L[^ ]*]]
+# CHECK: lgb [[REG:%r[0-5]]], 26(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 27(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 28(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 29(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 30(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 31(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 32(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+
+branch_blocks = 8
+main_size = 0xffcc
+
+print 'define void @f1(i8 *%base, i8 *%stop, i64 %limit) {'
+print 'entry:'
+print '  br label %before0'
+print ''
+
+for i in xrange(branch_blocks):
+    next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
+    print 'before%d:' % i
+    print '  %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
+    print '  %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
+    print '  %%bext%d = sext i8 %%bcur%d to i64' % (i, i)
+    print '  %%btest%d = icmp ult i64 %%limit, %%bext%d' % (i, i)
+    print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
+    print ''
+
+print '%s:' % next
+a, b = 1, 1
+for i in xrange(0, main_size, 6):
+    a, b = b, a + b
+    offset = 4096 + b % 500000
+    value = a % 256
+    print '  %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset)
+    print '  store volatile i8 %d, i8 *%%ptr%d' % (value, i)
+
+for i in xrange(branch_blocks):
+    print '  %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
+    print '  %%acur%d = load volatile i8 *%%astop%d' % (i, i)
+    print '  %%aext%d = sext i8 %%acur%d to i64' % (i, i)
+    print '  %%atest%d = icmp ult i64 %%limit, %%aext%d' % (i, i)
+    print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
+    print ''
+    print 'after%d:' % i
+
+print '  ret void'
+print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-11.py b/test/CodeGen/SystemZ/Large/branch-range-11.py
new file mode 100644
index 0000000..034902c
--- /dev/null
+++ b/test/CodeGen/SystemZ/Large/branch-range-11.py
@@ -0,0 +1,127 @@
+# Test 32-bit COMPARE LOGICAL IMMEDIATE AND BRANCH in cases where the sheer
+# number of instructions causes some branches to be out of range.
+# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
+
+# Construct:
+#
+# before0:
+#   conditional branch to after0
+#   ...
+# beforeN:
+#   conditional branch to after0
+# main:
+#   0xffc6 bytes, from MVIY instructions
+#   conditional branch to main
+# after0:
+#   ...
+#   conditional branch to main
+# afterN:
+#
+# Each conditional branch sequence occupies 14 bytes if it uses a short
+# branch and 20 if it uses a long one.  The ones before "main:" have to
+# take the branch length into account, which is 6 for short branches,
+# so the final (0x3a - 6) / 14 == 3 blocks can use short branches.
+# The ones after "main:" do not, so the first 0x3a / 14 == 4 blocks
+# can use short branches.  The conservative algorithm we use makes
+# one of the forward branches unnecessarily long, as noted in the
+# check output below.
+#
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 50
+# CHECK: jgl [[LABEL:\.L[^ ]*]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 51
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 52
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 53
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 54
+# CHECK: jgl [[LABEL]]
+# ...as mentioned above, the next one could be a CLIJL instead...
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 55
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 56, [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 57, [[LABEL]]
+# ...main goes here...
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 100, [[LABEL:\.L[^ ]*]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 101, [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 102, [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 103, [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 104
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 105
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 106
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 107
+# CHECK: jgl [[LABEL]]
+
+branch_blocks = 8
+main_size = 0xffc6
+
+print 'define void @f1(i8 *%base, i32 *%stopa, i32 *%stopb) {'
+print 'entry:'
+print '  br label %before0'
+print ''
+
+for i in xrange(branch_blocks):
+    next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
+    print 'before%d:' % i
+    print '  %%bcur%da = load volatile i32 *%%stopa' % i
+    print '  %%bcur%db = load volatile i32 *%%stopb' % i
+    print '  %%bsub%d = sub i32 %%bcur%da, %%bcur%db' % (i, i, i)
+    print '  %%btest%d = icmp ult i32 %%bsub%d, %d' % (i, i, i + 50)
+    print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
+    print ''
+
+print '%s:' % next
+a, b = 1, 1
+for i in xrange(0, main_size, 6):
+    a, b = b, a + b
+    offset = 4096 + b % 500000
+    value = a % 256
+    print '  %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset)
+    print '  store volatile i8 %d, i8 *%%ptr%d' % (value, i)
+
+for i in xrange(branch_blocks):
+    print '  %%acur%da = load volatile i32 *%%stopa' % i
+    print '  %%acur%db = load volatile i32 *%%stopb' % i
+    print '  %%asub%d = sub i32 %%acur%da, %%acur%db' % (i, i, i)
+    print '  %%atest%d = icmp ult i32 %%asub%d, %d' % (i, i, i + 100)
+    print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
+    print ''
+    print 'after%d:' % i
+
+print '  ret void'
+print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-12.py b/test/CodeGen/SystemZ/Large/branch-range-12.py
new file mode 100644
index 0000000..007d477
--- /dev/null
+++ b/test/CodeGen/SystemZ/Large/branch-range-12.py
@@ -0,0 +1,127 @@
+# Test 64-bit COMPARE LOGICAL IMMEDIATE AND BRANCH in cases where the sheer
+# number of instructions causes some branches to be out of range.
+# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
+
+# Construct:
+#
+# before0:
+#   conditional branch to after0
+#   ...
+# beforeN:
+#   conditional branch to after0
+# main:
+#   0xffb4 bytes, from MVIY instructions
+#   conditional branch to main
+# after0:
+#   ...
+#   conditional branch to main
+# afterN:
+#
+# Each conditional branch sequence occupies 18 bytes if it uses a short
+# branch and 24 if it uses a long one.  The ones before "main:" have to
+# take the branch length into account, which is 6 for short branches,
+# so the final (0x4c - 6) / 18 == 3 blocks can use short branches.
+# The ones after "main:" do not, so the first 0x4c / 18 == 4 blocks
+# can use short branches.  The conservative algorithm we use makes
+# one of the forward branches unnecessarily long, as noted in the
+# check output below.
+#
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 50
+# CHECK: jgl [[LABEL:\.L[^ ]*]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 51
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 52
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 53
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 54
+# CHECK: jgl [[LABEL]]
+# ...as mentioned above, the next one could be a CLGIJL instead...
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 55
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 56, [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 57, [[LABEL]]
+# ...main goes here...
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 100, [[LABEL:\.L[^ ]*]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 101, [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 102, [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 103, [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 104
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 105
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 106
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 107
+# CHECK: jgl [[LABEL]]
+
+branch_blocks = 8
+main_size = 0xffb4
+
+print 'define void @f1(i8 *%base, i64 *%stopa, i64 *%stopb) {'
+print 'entry:'
+print '  br label %before0'
+print ''
+
+for i in xrange(branch_blocks):
+    next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
+    print 'before%d:' % i
+    print '  %%bcur%da = load volatile i64 *%%stopa' % i
+    print '  %%bcur%db = load volatile i64 *%%stopb' % i
+    print '  %%bsub%d = sub i64 %%bcur%da, %%bcur%db' % (i, i, i)
+    print '  %%btest%d = icmp ult i64 %%bsub%d, %d' % (i, i, i + 50)
+    print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
+    print ''
+
+print '%s:' % next
+a, b = 1, 1
+for i in xrange(0, main_size, 6):
+    a, b = b, a + b
+    offset = 4096 + b % 500000
+    value = a % 256
+    print '  %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset)
+    print '  store volatile i8 %d, i8 *%%ptr%d' % (value, i)
+
+for i in xrange(branch_blocks):
+    print '  %%acur%da = load volatile i64 *%%stopa' % i
+    print '  %%acur%db = load volatile i64 *%%stopb' % i
+    print '  %%asub%d = sub i64 %%acur%da, %%acur%db' % (i, i, i)
+    print '  %%atest%d = icmp ult i64 %%asub%d, %d' % (i, i, i + 100)
+    print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
+    print ''
+    print 'after%d:' % i
+
+print '  ret void'
+print '}'
diff --git a/test/CodeGen/SystemZ/alias-01.ll b/test/CodeGen/SystemZ/alias-01.ll
new file mode 100644
index 0000000..8839aad
--- /dev/null
+++ b/test/CodeGen/SystemZ/alias-01.ll
@@ -0,0 +1,19 @@
+; Test 32-bit ANDs in which the second operand is variable.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Check that there are no spills.
+define void @f1(<16 x i32> *%src1, <16 x float> *%dest) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: %r15
+; CHECK: br %r14
+  %val = load <16 x i32> *%src1, !tbaa !1
+  %add = add <16 x i32> %val, %val
+  %res = bitcast <16 x i32> %add to <16 x float>
+  store <16 x float> %res, <16 x float> *%dest, !tbaa !2
+  ret void
+}
+
+!0 = metadata !{ metadata !"root" }
+!1 = metadata !{ metadata !"set1", metadata !0 }
+!2 = metadata !{ metadata !"set2", metadata !0 }
diff --git a/test/CodeGen/SystemZ/alloca-02.ll b/test/CodeGen/SystemZ/alloca-02.ll
index b6ed7f7..b5787b1 100644
--- a/test/CodeGen/SystemZ/alloca-02.ll
+++ b/test/CodeGen/SystemZ/alloca-02.ll
@@ -21,18 +21,21 @@ define i64 @f1(i64 %length, i64 %index) {
 ;
 ; CHECK-C-LABEL: f1:
 ; CHECK-C: lgr %r15, [[ADDR:%r[1-5]]]
-; CHECK-C: la [[TMP:%r[1-5]]], 160(%r3,[[ADDR]])
-; CHECK-C: mvi 0([[TMP]]), 2
+; CHECK-C-DAG: la %r2, 160([[ADDR]])
+; CHECK-C-DAG: lhi [[TMP:%r[0-5]]], 2
+; CHECK-C: stc [[TMP]], 0({{%r3,%r2|%r2,%r3}})
 ;
 ; CHECK-D-LABEL: f1:
 ; CHECK-D: lgr %r15, [[ADDR:%r[1-5]]]
-; CHECK-D: la [[TMP:%r[1-5]]], 160(%r3,[[ADDR]])
-; CHECK-D: mvi 4095([[TMP]]), 3
+; CHECK-D-DAG: la %r2, 160([[ADDR]])
+; CHECK-D-DAG: lhi [[TMP:%r[0-5]]], 3
+; CHECK-D: stc [[TMP]], 4095({{%r3,%r2|%r2,%r3}})
 ;
 ; CHECK-E-LABEL: f1:
 ; CHECK-E: lgr %r15, [[ADDR:%r[1-5]]]
-; CHECK-E: la [[TMP:%r[1-5]]], 160(%r3,[[ADDR]])
-; CHECK-E: mviy 4096([[TMP]]), 4
+; CHECK-E-DAG: la %r2, 160([[ADDR]])
+; CHECK-E-DAG: lhi [[TMP:%r[0-5]]], 4
+; CHECK-E: stcy [[TMP]], 4096({{%r3,%r2|%r2,%r3}})
   %a = alloca i8, i64 %length
   store volatile i8 0, i8 *%a
   %b = getelementptr i8 *%a, i64 4095
diff --git a/test/CodeGen/SystemZ/and-08.ll b/test/CodeGen/SystemZ/and-08.ll
new file mode 100644
index 0000000..7ded115
--- /dev/null
+++ b/test/CodeGen/SystemZ/and-08.ll
@@ -0,0 +1,378 @@
+; Test memory-to-memory ANDs.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+@g1src = global i8 1
+@g1dst = global i8 1
+@g2src = global i16 2
+@g2dst = global i16 2
+
+; Test the simple i8 case.
+define void @f1(i8 *%ptr1) {
+; CHECK-LABEL: f1:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i8 *%ptr1, i64 1
+  %val = load i8 *%ptr1
+  %old = load i8 *%ptr2
+  %and = and i8 %val, %old
+  store i8 %and, i8 *%ptr2
+  ret void
+}
+
+; ...and again in reverse.
+define void @f2(i8 *%ptr1) {
+; CHECK-LABEL: f2:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i8 *%ptr1, i64 1
+  %val = load i8 *%ptr1
+  %old = load i8 *%ptr2
+  %and = and i8 %old, %val
+  store i8 %and, i8 *%ptr2
+  ret void
+}
+
+; Test i8 cases where one value is zero-extended to 32 bits and the other
+; sign-extended.
+define void @f3(i8 *%ptr1) {
+; CHECK-LABEL: f3:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i8 *%ptr1, i64 1
+  %val = load i8 *%ptr1
+  %extval = zext i8 %val to i32
+  %old = load i8 *%ptr2
+  %extold = sext i8 %old to i32
+  %and = and i32 %extval, %extold
+  %trunc = trunc i32 %and to i8
+  store i8 %trunc, i8 *%ptr2
+  ret void
+}
+
+; ...and again with the extension types reversed.
+define void @f4(i8 *%ptr1) {
+; CHECK-LABEL: f4:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i8 *%ptr1, i64 1
+  %val = load i8 *%ptr1
+  %extval = sext i8 %val to i32
+  %old = load i8 *%ptr2
+  %extold = zext i8 %old to i32
+  %and = and i32 %extval, %extold
+  %trunc = trunc i32 %and to i8
+  store i8 %trunc, i8 *%ptr2
+  ret void
+}
+
+; ...and again with two sign extensions.
+define void @f5(i8 *%ptr1) {
+; CHECK-LABEL: f5:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i8 *%ptr1, i64 1
+  %val = load i8 *%ptr1
+  %extval = sext i8 %val to i32
+  %old = load i8 *%ptr2
+  %extold = sext i8 %old to i32
+  %and = and i32 %extval, %extold
+  %trunc = trunc i32 %and to i8
+  store i8 %trunc, i8 *%ptr2
+  ret void
+}
+
+; ...and again with two zero extensions.
+define void @f6(i8 *%ptr1) {
+; CHECK-LABEL: f6:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i8 *%ptr1, i64 1
+  %val = load i8 *%ptr1
+  %extval = zext i8 %val to i32
+  %old = load i8 *%ptr2
+  %extold = zext i8 %old to i32
+  %and = and i32 %extval, %extold
+  %trunc = trunc i32 %and to i8
+  store i8 %trunc, i8 *%ptr2
+  ret void
+}
+
+; Test i8 cases where the value is extended to 64 bits (just one case
+; this time).
+define void @f7(i8 *%ptr1) {
+; CHECK-LABEL: f7:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i8 *%ptr1, i64 1
+  %val = load i8 *%ptr1
+  %extval = sext i8 %val to i64
+  %old = load i8 *%ptr2
+  %extold = zext i8 %old to i64
+  %and = and i64 %extval, %extold
+  %trunc = trunc i64 %and to i8
+  store i8 %trunc, i8 *%ptr2
+  ret void
+}
+
+; Test the simple i16 case.
+define void @f8(i16 *%ptr1) {
+; CHECK-LABEL: f8:
+; CHECK: nc 2(2,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i16 *%ptr1, i64 1
+  %val = load i16 *%ptr1
+  %old = load i16 *%ptr2
+  %and = and i16 %val, %old
+  store i16 %and, i16 *%ptr2
+  ret void
+}
+
+; Test i16 cases where the value is extended to 32 bits.
+define void @f9(i16 *%ptr1) {
+; CHECK-LABEL: f9:
+; CHECK: nc 2(2,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i16 *%ptr1, i64 1
+  %val = load i16 *%ptr1
+  %extval = zext i16 %val to i32
+  %old = load i16 *%ptr2
+  %extold = sext i16 %old to i32
+  %and = and i32 %extval, %extold
+  %trunc = trunc i32 %and to i16
+  store i16 %trunc, i16 *%ptr2
+  ret void
+}
+
+; Test i16 cases where the value is extended to 64 bits.
+define void @f10(i16 *%ptr1) {
+; CHECK-LABEL: f10:
+; CHECK: nc 2(2,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i16 *%ptr1, i64 1
+  %val = load i16 *%ptr1
+  %extval = sext i16 %val to i64
+  %old = load i16 *%ptr2
+  %extold = zext i16 %old to i64
+  %and = and i64 %extval, %extold
+  %trunc = trunc i64 %and to i16
+  store i16 %trunc, i16 *%ptr2
+  ret void
+}
+
+; Test the simple i32 case.
+define void @f11(i32 *%ptr1) {
+; CHECK-LABEL: f11:
+; CHECK: nc 4(4,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i32 *%ptr1, i64 1
+  %val = load i32 *%ptr1
+  %old = load i32 *%ptr2
+  %and = and i32 %old, %val
+  store i32 %and, i32 *%ptr2
+  ret void
+}
+
+; Test i32 cases where the value is extended to 64 bits.
+define void @f12(i32 *%ptr1) {
+; CHECK-LABEL: f12:
+; CHECK: nc 4(4,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i32 *%ptr1, i64 1
+  %val = load i32 *%ptr1
+  %extval = sext i32 %val to i64
+  %old = load i32 *%ptr2
+  %extold = zext i32 %old to i64
+  %and = and i64 %extval, %extold
+  %trunc = trunc i64 %and to i32
+  store i32 %trunc, i32 *%ptr2
+  ret void
+}
+
+; Test the i64 case.
+define void @f13(i64 *%ptr1) {
+; CHECK-LABEL: f13:
+; CHECK: nc 8(8,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i64 *%ptr1, i64 1
+  %val = load i64 *%ptr1
+  %old = load i64 *%ptr2
+  %and = and i64 %old, %val
+  store i64 %and, i64 *%ptr2
+  ret void
+}
+
+; Make sure that we don't use NC if the first load is volatile.
+define void @f14(i64 *%ptr1) {
+; CHECK-LABEL: f14:
+; CHECK-NOT: nc
+; CHECK: br %r14
+  %ptr2 = getelementptr i64 *%ptr1, i64 1
+  %val = load volatile i64 *%ptr1
+  %old = load i64 *%ptr2
+  %and = and i64 %old, %val
+  store i64 %and, i64 *%ptr2
+  ret void
+}
+
+; ...likewise the second.
+define void @f15(i64 *%ptr1) {
+; CHECK-LABEL: f15:
+; CHECK-NOT: nc
+; CHECK: br %r14
+  %ptr2 = getelementptr i64 *%ptr1, i64 1
+  %val = load i64 *%ptr1
+  %old = load volatile i64 *%ptr2
+  %and = and i64 %old, %val
+  store i64 %and, i64 *%ptr2
+  ret void
+}
+
+; ...likewise the store.
+define void @f16(i64 *%ptr1) {
+; CHECK-LABEL: f16:
+; CHECK-NOT: nc
+; CHECK: br %r14
+  %ptr2 = getelementptr i64 *%ptr1, i64 1
+  %val = load i64 *%ptr1
+  %old = load i64 *%ptr2
+  %and = and i64 %old, %val
+  store volatile i64 %and, i64 *%ptr2
+  ret void
+}
+
+; Test that NC is not used for aligned loads and stores if there is
+; no way of telling whether they alias.  We don't want to use NC in
+; cases where the addresses could be equal.
+define void @f17(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f17:
+; CHECK-NOT: nc
+; CHECK: br %r14
+  %val = load i64 *%ptr1
+  %old = load i64 *%ptr2
+  %and = and i64 %old, %val
+  store i64 %and, i64 *%ptr2
+  ret void
+}
+
+; ...but if one of the loads isn't aligned, we can't be sure.
+define void @f18(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f18:
+; CHECK-NOT: nc
+; CHECK: br %r14
+  %val = load i64 *%ptr1, align 2
+  %old = load i64 *%ptr2
+  %and = and i64 %old, %val
+  store i64 %and, i64 *%ptr2
+  ret void
+}
+
+; Repeat the previous test with the operands in the opposite order.
+define void @f19(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f19:
+; CHECK-NOT: nc
+; CHECK: br %r14
+  %val = load i64 *%ptr1, align 2
+  %old = load i64 *%ptr2
+  %and = and i64 %val, %old
+  store i64 %and, i64 *%ptr2
+  ret void
+}
+
+; ...and again with the other operand being unaligned.
+define void @f20(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f20:
+; CHECK-NOT: nc
+; CHECK: br %r14
+  %val = load i64 *%ptr1
+  %old = load i64 *%ptr2, align 2
+  %and = and i64 %val, %old
+  store i64 %and, i64 *%ptr2, align 2
+  ret void
+}
+
+; Test a case where there is definite overlap.
+define void @f21(i64 %base) {
+; CHECK-LABEL: f21:
+; CHECK-NOT: nc
+; CHECK: br %r14
+  %add = add i64 %base, 1
+  %ptr1 = inttoptr i64 %base to i64 *
+  %ptr2 = inttoptr i64 %add to i64 *
+  %val = load i64 *%ptr1
+  %old = load i64 *%ptr2, align 1
+  %and = and i64 %old, %val
+  store i64 %and, i64 *%ptr2, align 1
+  ret void
+}
+
+; Test that we can use NC for global addresses for i8.
+define void @f22(i8 *%ptr) {
+; CHECK-LABEL: f22:
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g1src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g1dst
+; CHECK: nc 0(1,[[DST]]), 0([[SRC]])
+; CHECK: br %r14
+  %val = load i8 *@g1src
+  %old = load i8 *@g1dst
+  %and = and i8 %val, %old
+  store i8 %and, i8 *@g1dst
+  ret void
+}
+
+; Test that we use NC even where LHRL and STHRL are available.
+define void @f23(i16 *%ptr) {
+; CHECK-LABEL: f23:
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g2src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g2dst
+; CHECK: nc 0(2,[[DST]]), 0([[SRC]])
+; CHECK: br %r14
+  %val = load i16 *@g2src
+  %old = load i16 *@g2dst
+  %and = and i16 %val, %old
+  store i16 %and, i16 *@g2dst
+  ret void
+}
+
+; Test a case where offset disambiguation is enough.
+define void @f24(i64 *%ptr1) {
+; CHECK-LABEL: f24:
+; CHECK: nc 8(8,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i64 *%ptr1, i64 1
+  %val = load i64 *%ptr1, align 1
+  %old = load i64 *%ptr2, align 1
+  %and = and i64 %old, %val
+  store i64 %and, i64 *%ptr2, align 1
+  ret void
+}
+
+; Test a case where TBAA tells us there is no alias.
+define void @f25(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f25:
+; CHECK: nc 0(8,%r3), 0(%r2)
+; CHECK: br %r14
+  %val = load i64 *%ptr1, align 2, !tbaa !3
+  %old = load i64 *%ptr2, align 2, !tbaa !4
+  %and = and i64 %old, %val
+  store i64 %and, i64 *%ptr2, align 2, !tbaa !4
+  ret void
+}
+
+; Test a case where TBAA information is present but doesn't help.
+define void @f26(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f26:
+; CHECK-NOT: nc
+; CHECK: br %r14
+  %val = load i64 *%ptr1, align 2, !tbaa !3
+  %old = load i64 *%ptr2, align 2, !tbaa !3
+  %and = and i64 %old, %val
+  store i64 %and, i64 *%ptr2, align 2, !tbaa !3
+  ret void
+}
+
+!0 = metadata !{ metadata !"root" }
+!1 = metadata !{ metadata !"set1", metadata !0 }
+!2 = metadata !{ metadata !"set2", metadata !0 }
+!3 = metadata !{ metadata !1, metadata !1, i64 0}
+!4 = metadata !{ metadata !2, metadata !2, i64 0}
diff --git a/test/CodeGen/SystemZ/args-06.ll b/test/CodeGen/SystemZ/args-06.ll
index a89fe9b..644fcec9 100644
--- a/test/CodeGen/SystemZ/args-06.ll
+++ b/test/CodeGen/SystemZ/args-06.ll
@@ -27,8 +27,8 @@ define i16 @f2(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g) {
 ; CHECK: ar %r2, %r4
 ; CHECK: ar %r2, %r5
 ; CHECK: ar %r2, %r6
-; CHECK: lh {{%r[0-5]}}, 166(%r15)
-; CHECK: lh {{%r[0-5]}}, 174(%r15)
+; CHECK: ah %r2, 166(%r15)
+; CHECK: ah %r2, 174(%r15)
 ; CHECK: br %r14
   %addb = add i16 %a, %b
   %addc = add i16 %addb, %c
diff --git a/test/CodeGen/SystemZ/asm-17.ll b/test/CodeGen/SystemZ/asm-17.ll
index 33234fc..7bc9da3 100644
--- a/test/CodeGen/SystemZ/asm-17.ll
+++ b/test/CodeGen/SystemZ/asm-17.ll
@@ -80,3 +80,26 @@ define float @f7(float %in) {
   call void asm sideeffect "blah", "~{f0},~{cc}"()
   ret float %in
 }
+
+; Test that both registers in a GR128 pair get hoisted.
+define void @f8(i32 %count) {
+; CHECK-LABEL: f8
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lhi %r1, 1
+; CHECK: %loop
+; CHECK-NOT: %r
+; CHECK: blah %r0, %r1
+; CHECK: br %r14
+entry:
+  br label %loop
+
+loop:
+  %this = phi i32 [ %count, %entry ], [ %next, %loop ]
+  call void asm sideeffect "blah $0, $1", "{r0},{r1}" (i32 0, i32 1)
+  %next = sub i32 %this, 1
+  %cmp = icmp ne i32 %next, 0
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/asm-18.ll b/test/CodeGen/SystemZ/asm-18.ll
new file mode 100644
index 0000000..d60654b
--- /dev/null
+++ b/test/CodeGen/SystemZ/asm-18.ll
@@ -0,0 +1,745 @@
+; Test high-word operations, using "h" constraints to force a high
+; register and "r" constraints to force a low register.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Test loads and stores involving mixtures of high and low registers.
+define void @f1(i32 *%ptr1, i32 *%ptr2) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lfh [[REG1:%r[0-5]]], 0(%r2)
+; CHECK-DAG: l [[REG2:%r[0-5]]], 0(%r3)
+; CHECK-DAG: lfh [[REG3:%r[0-5]]], 4096(%r2)
+; CHECK-DAG: ly [[REG4:%r[0-5]]], 524284(%r3)
+; CHECK: blah [[REG1]], [[REG2]], [[REG3]], [[REG4]]
+; CHECK-DAG: stfh [[REG1]], 0(%r2)
+; CHECK-DAG: st [[REG2]], 0(%r3)
+; CHECK-DAG: stfh [[REG3]], 4096(%r2)
+; CHECK-DAG: sty [[REG4]], 524284(%r3)
+; CHECK: br %r14
+  %ptr3 = getelementptr i32 *%ptr1, i64 1024
+  %ptr4 = getelementptr i32 *%ptr2, i64 131071
+  %old1 = load i32 *%ptr1
+  %old2 = load i32 *%ptr2
+  %old3 = load i32 *%ptr3
+  %old4 = load i32 *%ptr4
+  %res = call { i32, i32, i32, i32 } asm "blah $0, $1, $2, $3",
+              "=h,=r,=h,=r,0,1,2,3"(i32 %old1, i32 %old2, i32 %old3, i32 %old4)
+  %new1 = extractvalue { i32, i32, i32, i32 } %res, 0
+  %new2 = extractvalue { i32, i32, i32, i32 } %res, 1
+  %new3 = extractvalue { i32, i32, i32, i32 } %res, 2
+  %new4 = extractvalue { i32, i32, i32, i32 } %res, 3
+  store i32 %new1, i32 *%ptr1
+  store i32 %new2, i32 *%ptr2
+  store i32 %new3, i32 *%ptr3
+  store i32 %new4, i32 *%ptr4
+  ret void
+}
+
+; Test moves involving mixtures of high and low registers.
+define i32 @f2(i32 %old) {
+; CHECK-LABEL: f2:
+; CHECK-DAG: risbhg [[REG1:%r[0-5]]], %r2, 0, 159, 32
+; CHECK-DAG: lr %r3, %r2
+; CHECK: stepa [[REG1]], %r2, %r3
+; CHECK: risbhg {{%r[0-5]}}, [[REG1]], 0, 159, 0
+; CHECK: stepb [[REG2:%r[0-5]]]
+; CHECK: risblg %r2, [[REG2]], 0, 159, 32
+; CHECK: br %r14
+  %tmp = call i32 asm "stepa $1, $2, $3",
+                      "=h,0,{r2},{r3}"(i32 %old, i32 %old, i32 %old)
+  %new = call i32 asm "stepb $1, $2", "=&h,0,h"(i32 %tmp, i32 %tmp)
+  ret i32 %new
+}
+
+; Test sign-extending 8-bit loads into mixtures of high and low registers.
+define void @f3(i8 *%ptr1, i8 *%ptr2) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: lbh [[REG1:%r[0-5]]], 0(%r2)
+; CHECK-DAG: lb [[REG2:%r[0-5]]], 0(%r3)
+; CHECK-DAG: lbh [[REG3:%r[0-5]]], 4096(%r2)
+; CHECK-DAG: lb [[REG4:%r[0-5]]], 524287(%r3)
+; CHECK: blah [[REG1]], [[REG2]]
+; CHECK: br %r14
+  %ptr3 = getelementptr i8 *%ptr1, i64 4096
+  %ptr4 = getelementptr i8 *%ptr2, i64 524287
+  %val1 = load i8 *%ptr1
+  %val2 = load i8 *%ptr2
+  %val3 = load i8 *%ptr3
+  %val4 = load i8 *%ptr4
+  %ext1 = sext i8 %val1 to i32
+  %ext2 = sext i8 %val2 to i32
+  %ext3 = sext i8 %val3 to i32
+  %ext4 = sext i8 %val4 to i32
+  call void asm sideeffect "blah $0, $1, $2, $3",
+                           "h,r,h,r"(i32 %ext1, i32 %ext2, i32 %ext3, i32 %ext4)
+  ret void
+}
+
+; Test sign-extending 16-bit loads into mixtures of high and low registers.
+define void @f4(i16 *%ptr1, i16 *%ptr2) {
+; CHECK-LABEL: f4:
+; CHECK-DAG: lhh [[REG1:%r[0-5]]], 0(%r2)
+; CHECK-DAG: lh [[REG2:%r[0-5]]], 0(%r3)
+; CHECK-DAG: lhh [[REG3:%r[0-5]]], 4096(%r2)
+; CHECK-DAG: lhy [[REG4:%r[0-5]]], 524286(%r3)
+; CHECK: blah [[REG1]], [[REG2]]
+; CHECK: br %r14
+  %ptr3 = getelementptr i16 *%ptr1, i64 2048
+  %ptr4 = getelementptr i16 *%ptr2, i64 262143
+  %val1 = load i16 *%ptr1
+  %val2 = load i16 *%ptr2
+  %val3 = load i16 *%ptr3
+  %val4 = load i16 *%ptr4
+  %ext1 = sext i16 %val1 to i32
+  %ext2 = sext i16 %val2 to i32
+  %ext3 = sext i16 %val3 to i32
+  %ext4 = sext i16 %val4 to i32
+  call void asm sideeffect "blah $0, $1, $2, $3",
+                           "h,r,h,r"(i32 %ext1, i32 %ext2, i32 %ext3, i32 %ext4)
+  ret void
+}
+
+; Test zero-extending 8-bit loads into mixtures of high and low registers.
+define void @f5(i8 *%ptr1, i8 *%ptr2) {
+; CHECK-LABEL: f5:
+; CHECK-DAG: llch [[REG1:%r[0-5]]], 0(%r2)
+; CHECK-DAG: llc [[REG2:%r[0-5]]], 0(%r3)
+; CHECK-DAG: llch [[REG3:%r[0-5]]], 4096(%r2)
+; CHECK-DAG: llc [[REG4:%r[0-5]]], 524287(%r3)
+; CHECK: blah [[REG1]], [[REG2]]
+; CHECK: br %r14
+  %ptr3 = getelementptr i8 *%ptr1, i64 4096
+  %ptr4 = getelementptr i8 *%ptr2, i64 524287
+  %val1 = load i8 *%ptr1
+  %val2 = load i8 *%ptr2
+  %val3 = load i8 *%ptr3
+  %val4 = load i8 *%ptr4
+  %ext1 = zext i8 %val1 to i32
+  %ext2 = zext i8 %val2 to i32
+  %ext3 = zext i8 %val3 to i32
+  %ext4 = zext i8 %val4 to i32
+  call void asm sideeffect "blah $0, $1, $2, $3",
+                           "h,r,h,r"(i32 %ext1, i32 %ext2, i32 %ext3, i32 %ext4)
+  ret void
+}
+
+; Test zero-extending 16-bit loads into mixtures of high and low registers.
+define void @f6(i16 *%ptr1, i16 *%ptr2) {
+; CHECK-LABEL: f6:
+; CHECK-DAG: llhh [[REG1:%r[0-5]]], 0(%r2)
+; CHECK-DAG: llh [[REG2:%r[0-5]]], 0(%r3)
+; CHECK-DAG: llhh [[REG3:%r[0-5]]], 4096(%r2)
+; CHECK-DAG: llh [[REG4:%r[0-5]]], 524286(%r3)
+; CHECK: blah [[REG1]], [[REG2]]
+; CHECK: br %r14
+  %ptr3 = getelementptr i16 *%ptr1, i64 2048
+  %ptr4 = getelementptr i16 *%ptr2, i64 262143
+  %val1 = load i16 *%ptr1
+  %val2 = load i16 *%ptr2
+  %val3 = load i16 *%ptr3
+  %val4 = load i16 *%ptr4
+  %ext1 = zext i16 %val1 to i32
+  %ext2 = zext i16 %val2 to i32
+  %ext3 = zext i16 %val3 to i32
+  %ext4 = zext i16 %val4 to i32
+  call void asm sideeffect "blah $0, $1, $2, $3",
+                           "h,r,h,r"(i32 %ext1, i32 %ext2, i32 %ext3, i32 %ext4)
+  ret void
+}
+
+; Test truncating stores of high and low registers into 8-bit memory.
+define void @f7(i8 *%ptr1, i8 *%ptr2) {
+; CHECK-LABEL: f7:
+; CHECK: blah [[REG1:%r[0-5]]], [[REG2:%r[0-5]]]
+; CHECK-DAG: stch [[REG1]], 0(%r2)
+; CHECK-DAG: stc [[REG2]], 0(%r3)
+; CHECK-DAG: stch [[REG1]], 4096(%r2)
+; CHECK-DAG: stcy [[REG2]], 524287(%r3)
+; CHECK: br %r14
+  %res = call { i32, i32 } asm "blah $0, $1", "=h,=r"()
+  %res1 = extractvalue { i32, i32 } %res, 0
+  %res2 = extractvalue { i32, i32 } %res, 1
+  %trunc1 = trunc i32 %res1 to i8
+  %trunc2 = trunc i32 %res2 to i8
+  %ptr3 = getelementptr i8 *%ptr1, i64 4096
+  %ptr4 = getelementptr i8 *%ptr2, i64 524287
+  store i8 %trunc1, i8 *%ptr1
+  store i8 %trunc2, i8 *%ptr2
+  store i8 %trunc1, i8 *%ptr3
+  store i8 %trunc2, i8 *%ptr4
+  ret void
+}
+
+; Test truncating stores of high and low registers into 16-bit memory.
+define void @f8(i16 *%ptr1, i16 *%ptr2) {
+; CHECK-LABEL: f8:
+; CHECK: blah [[REG1:%r[0-5]]], [[REG2:%r[0-5]]]
+; CHECK-DAG: sthh [[REG1]], 0(%r2)
+; CHECK-DAG: sth [[REG2]], 0(%r3)
+; CHECK-DAG: sthh [[REG1]], 4096(%r2)
+; CHECK-DAG: sthy [[REG2]], 524286(%r3)
+; CHECK: br %r14
+  %res = call { i32, i32 } asm "blah $0, $1", "=h,=r"()
+  %res1 = extractvalue { i32, i32 } %res, 0
+  %res2 = extractvalue { i32, i32 } %res, 1
+  %trunc1 = trunc i32 %res1 to i16
+  %trunc2 = trunc i32 %res2 to i16
+  %ptr3 = getelementptr i16 *%ptr1, i64 2048
+  %ptr4 = getelementptr i16 *%ptr2, i64 262143
+  store i16 %trunc1, i16 *%ptr1
+  store i16 %trunc2, i16 *%ptr2
+  store i16 %trunc1, i16 *%ptr3
+  store i16 %trunc2, i16 *%ptr4
+  ret void
+}
+
+; Test zero extensions from 8 bits between mixtures of high and low registers.
+define i32 @f9(i8 %val1, i8 %val2) {
+; CHECK-LABEL: f9:
+; CHECK-DAG: risbhg [[REG1:%r[0-5]]], %r2, 24, 159, 32
+; CHECK-DAG: llcr [[REG2:%r[0-5]]], %r3
+; CHECK: stepa [[REG1]], [[REG2]]
+; CHECK: risbhg [[REG3:%r[0-5]]], [[REG1]], 24, 159, 0
+; CHECK: stepb [[REG3]]
+; CHECK: risblg %r2, [[REG3]], 24, 159, 32
+; CHECK: br %r14
+  %ext1 = zext i8 %val1 to i32
+  %ext2 = zext i8 %val2 to i32
+  %val3 = call i8 asm sideeffect "stepa $0, $1", "=h,0,r"(i32 %ext1, i32 %ext2)
+  %ext3 = zext i8 %val3 to i32
+  %val4 = call i8 asm sideeffect "stepb $0", "=h,0"(i32 %ext3)
+  %ext4 = zext i8 %val4 to i32
+  ret i32 %ext4
+}
+
+; Test zero extensions from 16 bits between mixtures of high and low registers.
+define i32 @f10(i16 %val1, i16 %val2) {
+; CHECK-LABEL: f10:
+; CHECK-DAG: risbhg [[REG1:%r[0-5]]], %r2, 16, 159, 32
+; CHECK-DAG: llhr [[REG2:%r[0-5]]], %r3
+; CHECK: stepa [[REG1]], [[REG2]]
+; CHECK: risbhg [[REG3:%r[0-5]]], [[REG1]], 16, 159, 0
+; CHECK: stepb [[REG3]]
+; CHECK: risblg %r2, [[REG3]], 16, 159, 32
+; CHECK: br %r14
+  %ext1 = zext i16 %val1 to i32
+  %ext2 = zext i16 %val2 to i32
+  %val3 = call i16 asm sideeffect "stepa $0, $1", "=h,0,r"(i32 %ext1, i32 %ext2)
+  %ext3 = zext i16 %val3 to i32
+  %val4 = call i16 asm sideeffect "stepb $0", "=h,0"(i32 %ext3)
+  %ext4 = zext i16 %val4 to i32
+  ret i32 %ext4
+}
+
+; Test loads of 16-bit constants into mixtures of high and low registers.
+define void @f11() {
+; CHECK-LABEL: f11:
+; CHECK-DAG: iihf [[REG1:%r[0-5]]], 4294934529
+; CHECK-DAG: lhi [[REG2:%r[0-5]]], -32768
+; CHECK-DAG: llihl [[REG3:%r[0-5]]], 32766
+; CHECK-DAG: lhi [[REG4:%r[0-5]]], 32767
+; CHECK: blah [[REG1]], [[REG2]], [[REG3]], [[REG4]]
+; CHECK: br %r14
+  call void asm sideeffect "blah $0, $1, $2, $3",
+                           "h,r,h,r"(i32 -32767, i32 -32768,
+                                     i32 32766, i32 32767)
+  ret void
+}
+
+; Test loads of unsigned constants into mixtures of high and low registers.
+; For stepc, we expect the h and r operands to be paired by the register
+; allocator.  It doesn't really matter which comes first: LLILL/IIHF would
+; be just as good.
+define void @f12() {
+; CHECK-LABEL: f12:
+; CHECK-DAG: llihl [[REG1:%r[0-5]]], 32768
+; CHECK-DAG: llihl [[REG2:%r[0-5]]], 65535
+; CHECK-DAG: llihh [[REG3:%r[0-5]]], 1
+; CHECK-DAG: llihh [[REG4:%r[0-5]]], 65535
+; CHECK: stepa [[REG1]], [[REG2]], [[REG3]], [[REG4]]
+; CHECK-DAG: llill [[REG1:%r[0-5]]], 32769
+; CHECK-DAG: llill [[REG2:%r[0-5]]], 65534
+; CHECK-DAG: llilh [[REG3:%r[0-5]]], 2
+; CHECK-DAG: llilh [[REG4:%r[0-5]]], 65534
+; CHECK: stepb [[REG1]], [[REG2]], [[REG3]], [[REG4]]
+; CHECK-DAG: llihl [[REG1:%r[0-5]]], 32770
+; CHECK-DAG: iilf [[REG1]], 65533
+; CHECK-DAG: llihh [[REG2:%r[0-5]]], 4
+; CHECK-DAG: iilf [[REG2]], 524288
+; CHECK: stepc [[REG1]], [[REG1]], [[REG2]], [[REG2]]
+; CHECK-DAG: iihf [[REG1:%r[0-5]]], 3294967296
+; CHECK-DAG: iilf [[REG2:%r[0-5]]], 4294567296
+; CHECK-DAG: iihf [[REG3:%r[0-5]]], 1000000000
+; CHECK-DAG: iilf [[REG4:%r[0-5]]], 400000
+; CHECK: stepd [[REG1]], [[REG2]], [[REG3]], [[REG4]]
+; CHECK: br %r14
+  call void asm sideeffect "stepa $0, $1, $2, $3",
+                           "h,h,h,h"(i32 32768, i32 65535,
+                                     i32 65536, i32 -65536)
+  call void asm sideeffect "stepb $0, $1, $2, $3",
+                           "r,r,r,r"(i32 32769, i32 65534,
+                                     i32 131072, i32 -131072)
+  call void asm sideeffect "stepc $0, $1, $2, $3",
+                           "h,r,h,r"(i32 32770, i32 65533,
+                                     i32 262144, i32 524288)
+  call void asm sideeffect "stepd $0, $1, $2, $3",
+                           "h,r,h,r"(i32 -1000000000, i32 -400000,
+                                     i32 1000000000, i32 400000)
+  ret void
+}
+
+; Test selects involving high registers.
+define void @f13(i32 %x, i32 %y) {
+; CHECK-LABEL: f13:
+; CHECK: llihl [[REG:%r[0-5]]], 0
+; CHECK: cije %r2, 0
+; CHECK: iihf [[REG]], 2102030405
+; CHECK: blah [[REG]]
+; CHECK: br %r14
+  %cmp = icmp eq i32 %x, 0
+  %val = select i1 %cmp, i32 0, i32 2102030405
+  call void asm sideeffect "blah $0", "h"(i32 %val)
+  ret void
+}
+
+; Test selects involving low registers.
+define void @f14(i32 %x, i32 %y) {
+; CHECK-LABEL: f14:
+; CHECK: lhi [[REG:%r[0-5]]], 0
+; CHECK: cije %r2, 0
+; CHECK: iilf [[REG]], 2102030405
+; CHECK: blah [[REG]]
+; CHECK: br %r14
+  %cmp = icmp eq i32 %x, 0
+  %val = select i1 %cmp, i32 0, i32 2102030405
+  call void asm sideeffect "blah $0", "r"(i32 %val)
+  ret void
+}
+
+; Test immediate insertion involving high registers.
+define void @f15() {
+; CHECK-LABEL: f15:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: iihh [[REG]], 4660
+; CHECK: stepb [[REG]]
+; CHECK: iihl [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=h"()
+  %and1 = and i32 %res1, 65535
+  %or1 = or i32 %and1, 305397760
+  %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %or1)
+  %and2 = and i32 %res2, -65536
+  %or2 = or i32 %and2, 34661
+  call void asm sideeffect "stepc $0", "h"(i32 %or2)
+  ret void
+}
+
+; Test immediate insertion involving low registers.
+define void @f16() {
+; CHECK-LABEL: f16:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: iilh [[REG]], 4660
+; CHECK: stepb [[REG]]
+; CHECK: iill [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=r"()
+  %and1 = and i32 %res1, 65535
+  %or1 = or i32 %and1, 305397760
+  %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %or1)
+  %and2 = and i32 %res2, -65536
+  %or2 = or i32 %and2, 34661
+  call void asm sideeffect "stepc $0", "r"(i32 %or2)
+  ret void
+}
+
+; Test immediate OR involving high registers.
+define void @f17() {
+; CHECK-LABEL: f17:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: oihh [[REG]], 4660
+; CHECK: stepb [[REG]]
+; CHECK: oihl [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: oihf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=h"()
+  %or1 = or i32 %res1, 305397760
+  %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %or1)
+  %or2 = or i32 %res2, 34661
+  %res3 = call i32 asm "stepc $0, $1", "=h,h"(i32 %or2)
+  %or3 = or i32 %res3, 12345678
+  call void asm sideeffect "stepd $0", "h"(i32 %or3)
+  ret void
+}
+
+; Test immediate OR involving low registers.
+define void @f18() {
+; CHECK-LABEL: f18:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: oilh [[REG]], 4660
+; CHECK: stepb [[REG]]
+; CHECK: oill [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: oilf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=r"()
+  %or1 = or i32 %res1, 305397760
+  %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %or1)
+  %or2 = or i32 %res2, 34661
+  %res3 = call i32 asm "stepc $0, $1", "=r,r"(i32 %or2)
+  %or3 = or i32 %res3, 12345678
+  call void asm sideeffect "stepd $0", "r"(i32 %or3)
+  ret void
+}
+
+; Test immediate XOR involving high registers.
+define void @f19() {
+; CHECK-LABEL: f19:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: xihf [[REG]], 305397760
+; CHECK: stepb [[REG]]
+; CHECK: xihf [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: xihf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=h"()
+  %xor1 = xor i32 %res1, 305397760
+  %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %xor1)
+  %xor2 = xor i32 %res2, 34661
+  %res3 = call i32 asm "stepc $0, $1", "=h,h"(i32 %xor2)
+  %xor3 = xor i32 %res3, 12345678
+  call void asm sideeffect "stepd $0", "h"(i32 %xor3)
+  ret void
+}
+
+; Test immediate XOR involving low registers.
+define void @f20() {
+; CHECK-LABEL: f20:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: xilf [[REG]], 305397760
+; CHECK: stepb [[REG]]
+; CHECK: xilf [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: xilf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=r"()
+  %xor1 = xor i32 %res1, 305397760
+  %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %xor1)
+  %xor2 = xor i32 %res2, 34661
+  %res3 = call i32 asm "stepc $0, $1", "=r,r"(i32 %xor2)
+  %xor3 = xor i32 %res3, 12345678
+  call void asm sideeffect "stepd $0", "r"(i32 %xor3)
+  ret void
+}
+
+; Test two-operand immediate AND involving high registers.
+define void @f21() {
+; CHECK-LABEL: f21:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: nihh [[REG]], 4096
+; CHECK: stepb [[REG]]
+; CHECK: nihl [[REG]], 57536
+; CHECK: stepc [[REG]]
+; CHECK: nihf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=h"()
+  %and1 = and i32 %res1, 268500991
+  %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %and1)
+  %and2 = and i32 %res2, -8000
+  %res3 = call i32 asm "stepc $0, $1", "=h,h"(i32 %and2)
+  %and3 = and i32 %res3, 12345678
+  call void asm sideeffect "stepd $0", "h"(i32 %and3)
+  ret void
+}
+
+; Test two-operand immediate AND involving low registers.
+define void @f22() {
+; CHECK-LABEL: f22:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: nilh [[REG]], 4096
+; CHECK: stepb [[REG]]
+; CHECK: nill [[REG]], 57536
+; CHECK: stepc [[REG]]
+; CHECK: nilf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=r"()
+  %and1 = and i32 %res1, 268500991
+  %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %and1)
+  %and2 = and i32 %res2, -8000
+  %res3 = call i32 asm "stepc $0, $1", "=r,r"(i32 %and2)
+  %and3 = and i32 %res3, 12345678
+  call void asm sideeffect "stepd $0", "r"(i32 %and3)
+  ret void
+}
+
+; Test three-operand immediate AND involving mixtures of low and high registers.
+define i32 @f23(i32 %old) {
+; CHECK-LABEL: f23:
+; CHECK-DAG: risblg [[REG1:%r[0-5]]], %r2, 28, 158, 0
+; CHECK-DAG: risbhg [[REG2:%r[0-5]]], %r2, 24, 158, 32
+; CHECK: stepa %r2, [[REG1]], [[REG2]]
+; CHECK-DAG: risbhg [[REG3:%r[0-5]]], [[REG2]], 25, 159, 0
+; CHECK-DAG: risblg %r2, [[REG2]], 24, 152, 32
+; CHECK: stepb [[REG2]], [[REG3]], %r2
+; CHECK: br %r14
+  %and1 = and i32 %old, 14
+  %and2 = and i32 %old, 254
+  %res1 = call i32 asm "stepa $1, $2, $3",
+                       "=h,r,r,0"(i32 %old, i32 %and1, i32 %and2)
+  %and3 = and i32 %res1, 127
+  %and4 = and i32 %res1, 128
+  %res2 = call i32 asm "stepb $1, $2, $3",
+                       "=r,h,h,0"(i32 %res1, i32 %and3, i32 %and4)
+  ret i32 %res2
+}
+
+; Test RISB[LH]G insertions involving mixtures of high and low registers.
+define i32 @f24(i32 %old) {
+; CHECK-LABEL: f24:
+; CHECK-DAG: risblg [[REG1:%r[0-5]]], %r2, 28, 158, 1
+; CHECK-DAG: risbhg [[REG2:%r[0-5]]], %r2, 24, 158, 29
+; CHECK: stepa %r2, [[REG1]], [[REG2]]
+; CHECK-DAG: risbhg [[REG3:%r[0-5]]], [[REG2]], 25, 159, 62
+; CHECK-DAG: risblg %r2, [[REG2]], 24, 152, 37
+; CHECK: stepb [[REG2]], [[REG3]], %r2
+; CHECK: br %r14
+  %shift1 = shl i32 %old, 1
+  %and1 = and i32 %shift1, 14
+  %shift2 = lshr i32 %old, 3
+  %and2 = and i32 %shift2, 254
+  %res1 = call i32 asm "stepa $1, $2, $3",
+                       "=h,r,r,0"(i32 %old, i32 %and1, i32 %and2)
+  %shift3 = lshr i32 %res1, 2
+  %and3 = and i32 %shift3, 127
+  %shift4 = shl i32 %res1, 5
+  %and4 = and i32 %shift4, 128
+  %res2 = call i32 asm "stepb $1, $2, $3",
+                       "=r,h,h,0"(i32 %res1, i32 %and3, i32 %and4)
+  ret i32 %res2
+}
+
+; Test TMxx involving mixtures of high and low registers.
+define i32 @f25(i32 %old) {
+; CHECK-LABEL: f25:
+; CHECK-DAG: tmll %r2, 1
+; CHECK-DAG: tmlh %r2, 1
+; CHECK: stepa [[REG1:%r[0-5]]],
+; CHECK-DAG: tmhl [[REG1]], 1
+; CHECK-DAG: tmhh [[REG1]], 1
+; CHECK: stepb %r2,
+; CHECK: br %r14
+  %and1 = and i32 %old, 1
+  %and2 = and i32 %old, 65536
+  %cmp1 = icmp eq i32 %and1, 0
+  %cmp2 = icmp eq i32 %and2, 0
+  %sel1 = select i1 %cmp1, i32 100, i32 200
+  %sel2 = select i1 %cmp2, i32 100, i32 200
+  %res1 = call i32 asm "stepa $0, $1, $2",
+                       "=h,r,r"(i32 %sel1, i32 %sel2)
+  %and3 = and i32 %res1, 1
+  %and4 = and i32 %res1, 65536
+  %cmp3 = icmp eq i32 %and3, 0
+  %cmp4 = icmp eq i32 %and4, 0
+  %sel3 = select i1 %cmp3, i32 100, i32 200
+  %sel4 = select i1 %cmp4, i32 100, i32 200
+  %res2 = call i32 asm "stepb $0, $1, $2",
+                       "=r,h,h"(i32 %sel3, i32 %sel4)
+  ret i32 %res2
+}
+
+; Test two-operand halfword immediate addition involving high registers.
+define void @f26() {
+; CHECK-LABEL: f26:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: aih [[REG]], -32768
+; CHECK: stepb [[REG]]
+; CHECK: aih [[REG]], 1
+; CHECK: stepc [[REG]]
+; CHECK: aih [[REG]], 32767
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=h"()
+  %add1 = add i32 %res1, -32768
+  %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %add1)
+  %add2 = add i32 %res2, 1
+  %res3 = call i32 asm "stepc $0, $1", "=h,h"(i32 %add2)
+  %add3 = add i32 %res3, 32767
+  call void asm sideeffect "stepd $0", "h"(i32 %add3)
+  ret void
+}
+
+; Test two-operand halfword immediate addition involving low registers.
+define void @f27() {
+; CHECK-LABEL: f27:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: ahi [[REG]], -32768
+; CHECK: stepb [[REG]]
+; CHECK: ahi [[REG]], 1
+; CHECK: stepc [[REG]]
+; CHECK: ahi [[REG]], 32767
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=r"()
+  %add1 = add i32 %res1, -32768
+  %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %add1)
+  %add2 = add i32 %res2, 1
+  %res3 = call i32 asm "stepc $0, $1", "=r,r"(i32 %add2)
+  %add3 = add i32 %res3, 32767
+  call void asm sideeffect "stepd $0", "r"(i32 %add3)
+  ret void
+}
+
+; Test three-operand halfword immediate addition involving mixtures of low
+; and high registers.  RISBHG/AIH would be OK too, instead of AHIK/RISBHG.
+define i32 @f28(i32 %old) {
+; CHECK-LABEL: f28:
+; CHECK: ahik [[REG1:%r[0-5]]], %r2, 14
+; CHECK: stepa %r2, [[REG1]]
+; CHECK: ahik [[TMP:%r[0-5]]], [[REG1]], 254
+; CHECK: risbhg [[REG2:%r[0-5]]], [[TMP]], 0, 159, 32
+; CHECK: stepb [[REG1]], [[REG2]]
+; CHECK: risbhg [[REG3:%r[0-5]]], [[REG2]], 0, 159, 0
+; CHECK: aih [[REG3]], 127
+; CHECK: stepc [[REG2]], [[REG3]]
+; CHECK: risblg %r2, [[REG3]], 0, 159, 32
+; CHECK: ahi %r2, 128
+; CHECK: stepd [[REG3]], %r2
+; CHECK: br %r14
+  %add1 = add i32 %old, 14
+  %res1 = call i32 asm "stepa $1, $2",
+                       "=r,r,0"(i32 %old, i32 %add1)
+  %add2 = add i32 %res1, 254
+  %res2 = call i32 asm "stepb $1, $2",
+                       "=h,r,0"(i32 %res1, i32 %add2)
+  %add3 = add i32 %res2, 127
+  %res3 = call i32 asm "stepc $1, $2",
+                       "=h,h,0"(i32 %res2, i32 %add3)
+  %add4 = add i32 %res3, 128
+  %res4 = call i32 asm "stepd $1, $2",
+                       "=r,h,0"(i32 %res3, i32 %add4)
+  ret i32 %res4
+}
+
+; Test large immediate addition involving high registers.
+define void @f29() {
+; CHECK-LABEL: f29:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: aih [[REG]], -32769
+; CHECK: stepb [[REG]]
+; CHECK: aih [[REG]], 32768
+; CHECK: stepc [[REG]]
+; CHECK: aih [[REG]], 1000000000
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=h"()
+  %add1 = add i32 %res1, -32769
+  %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %add1)
+  %add2 = add i32 %res2, 32768
+  %res3 = call i32 asm "stepc $0, $1", "=h,h"(i32 %add2)
+  %add3 = add i32 %res3, 1000000000
+  call void asm sideeffect "stepd $0", "h"(i32 %add3)
+  ret void
+}
+
+; Test large immediate addition involving low registers.
+define void @f30() {
+; CHECK-LABEL: f30:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: afi [[REG]], -32769
+; CHECK: stepb [[REG]]
+; CHECK: afi [[REG]], 32768
+; CHECK: stepc [[REG]]
+; CHECK: afi [[REG]], 1000000000
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=r"()
+  %add1 = add i32 %res1, -32769
+  %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %add1)
+  %add2 = add i32 %res2, 32768
+  %res3 = call i32 asm "stepc $0, $1", "=r,r"(i32 %add2)
+  %add3 = add i32 %res3, 1000000000
+  call void asm sideeffect "stepd $0", "r"(i32 %add3)
+  ret void
+}
+
+; Test large immediate comparison involving high registers.
+define i32 @f31() {
+; CHECK-LABEL: f31:
+; CHECK: stepa [[REG1:%r[0-5]]]
+; CHECK: cih [[REG1]], 1000000000
+; CHECK: stepb [[REG2:%r[0-5]]]
+; CHECK: clih [[REG2]], 1000000000
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=h"()
+  %cmp1 = icmp sle i32 %res1, 1000000000
+  %sel1 = select i1 %cmp1, i32 0, i32 1
+  %res2 = call i32 asm "stepb $0, $1", "=h,r"(i32 %sel1)
+  %cmp2 = icmp ule i32 %res2, 1000000000
+  %sel2 = select i1 %cmp2, i32 0, i32 1
+  ret i32 %sel2
+}
+
+; Test large immediate comparison involving low registers.
+define i32 @f32() {
+; CHECK-LABEL: f32:
+; CHECK: stepa [[REG1:%r[0-5]]]
+; CHECK: cfi [[REG1]], 1000000000
+; CHECK: stepb [[REG2:%r[0-5]]]
+; CHECK: clfi [[REG2]], 1000000000
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=r"()
+  %cmp1 = icmp sle i32 %res1, 1000000000
+  %sel1 = select i1 %cmp1, i32 0, i32 1
+  %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %sel1)
+  %cmp2 = icmp ule i32 %res2, 1000000000
+  %sel2 = select i1 %cmp2, i32 0, i32 1
+  ret i32 %sel2
+}
+
+; Test memory comparison involving high registers.
+define void @f33(i32 *%ptr1, i32 *%ptr2) {
+; CHECK-LABEL: f33:
+; CHECK: stepa [[REG1:%r[0-5]]]
+; CHECK: chf [[REG1]], 0(%r2)
+; CHECK: stepb [[REG2:%r[0-5]]]
+; CHECK: clhf [[REG2]], 0(%r3)
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=h"()
+  %load1 = load i32 *%ptr1
+  %cmp1 = icmp sle i32 %res1, %load1
+  %sel1 = select i1 %cmp1, i32 0, i32 1
+  %res2 = call i32 asm "stepb $0, $1", "=h,r"(i32 %sel1)
+  %load2 = load i32 *%ptr2
+  %cmp2 = icmp ule i32 %res2, %load2
+  %sel2 = select i1 %cmp2, i32 0, i32 1
+  store i32 %sel2, i32 *%ptr1
+  ret void
+}
+
+; Test memory comparison involving low registers.
+define void @f34(i32 *%ptr1, i32 *%ptr2) {
+; CHECK-LABEL: f34:
+; CHECK: stepa [[REG1:%r[0-5]]]
+; CHECK: c [[REG1]], 0(%r2)
+; CHECK: stepb [[REG2:%r[0-5]]]
+; CHECK: cl [[REG2]], 0(%r3)
+; CHECK: br %r14
+  %res1 = call i32 asm "stepa $0", "=r"()
+  %load1 = load i32 *%ptr1
+  %cmp1 = icmp sle i32 %res1, %load1
+  %sel1 = select i1 %cmp1, i32 0, i32 1
+  %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %sel1)
+  %load2 = load i32 *%ptr2
+  %cmp2 = icmp ule i32 %res2, %load2
+  %sel2 = select i1 %cmp2, i32 0, i32 1
+  store i32 %sel2, i32 *%ptr1
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll
index a15fe57..2b750c4 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll
@@ -91,8 +91,7 @@ define i8 @f3(i8 *%src, i8 %b) {
 ; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
 ; CHECK: [[LOOP:\.[^:]*]]:
 ; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
-; CHECK: clr [[ROT]], %r3
-; CHECK: jle [[KEEP:\..*]]
+; CHECK: clrjle [[ROT]], %r3, [[KEEP:\..*]]
 ; CHECK: risbg [[ROT]], %r3, 32, 39, 0
 ; CHECK: [[KEEP]]:
 ; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
@@ -112,7 +111,7 @@ define i8 @f3(i8 *%src, i8 %b) {
 ; CHECK-SHIFT2-LABEL: f3:
 ; CHECK-SHIFT2: sll %r3, 24
 ; CHECK-SHIFT2: rll
-; CHECK-SHIFT2: clr {{%r[0-9]+}}, %r3
+; CHECK-SHIFT2: clrjle {{%r[0-9]+}}, %r3,
 ; CHECK-SHIFT2: rll
 ; CHECK-SHIFT2: rll
 ; CHECK-SHIFT2: br %r14
@@ -128,8 +127,7 @@ define i8 @f4(i8 *%src, i8 %b) {
 ; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
 ; CHECK: [[LOOP:\.[^:]*]]:
 ; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
-; CHECK: clr [[ROT]], %r3
-; CHECK: jhe [[KEEP:\..*]]
+; CHECK: clrjhe [[ROT]], %r3, [[KEEP:\..*]]
 ; CHECK: risbg [[ROT]], %r3, 32, 39, 0
 ; CHECK: [[KEEP]]:
 ; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
@@ -149,7 +147,7 @@ define i8 @f4(i8 *%src, i8 %b) {
 ; CHECK-SHIFT2-LABEL: f4:
 ; CHECK-SHIFT2: sll %r3, 24
 ; CHECK-SHIFT2: rll
-; CHECK-SHIFT2: clr {{%r[0-9]+}}, %r3
+; CHECK-SHIFT2: clrjhe {{%r[0-9]+}}, %r3,
 ; CHECK-SHIFT2: rll
 ; CHECK-SHIFT2: rll
 ; CHECK-SHIFT2: br %r14
@@ -196,7 +194,7 @@ define i8 @f6(i8 *%src) {
 define i8 @f7(i8 *%src) {
 ; CHECK-LABEL: f7:
 ; CHECK: llilh [[SRC2:%r[0-9]+]], 256
-; CHECK: clr [[ROT:%r[0-9]+]], [[SRC2]]
+; CHECK: clrjle [[ROT:%r[0-9]+]], [[SRC2]],
 ; CHECK: risbg [[ROT]], [[SRC2]], 32, 39, 0
 ; CHECK: br %r14
 ;
@@ -213,7 +211,7 @@ define i8 @f7(i8 *%src) {
 define i8 @f8(i8 *%src) {
 ; CHECK-LABEL: f8:
 ; CHECK: llilh [[SRC2:%r[0-9]+]], 65024
-; CHECK: clr [[ROT:%r[0-9]+]], [[SRC2]]
+; CHECK: clrjhe [[ROT:%r[0-9]+]], [[SRC2]],
 ; CHECK: risbg [[ROT]], [[SRC2]], 32, 39, 0
 ; CHECK: br %r14
 ;
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll
index c0ae883..98ffedf 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll
@@ -91,8 +91,7 @@ define i16 @f3(i16 *%src, i16 %b) {
 ; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
 ; CHECK: [[LOOP:\.[^:]*]]:
 ; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
-; CHECK: clr [[ROT]], %r3
-; CHECK: jle [[KEEP:\..*]]
+; CHECK: clrjle [[ROT]], %r3, [[KEEP:\..*]]
 ; CHECK: risbg [[ROT]], %r3, 32, 47, 0
 ; CHECK: [[KEEP]]:
 ; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
@@ -112,7 +111,7 @@ define i16 @f3(i16 *%src, i16 %b) {
 ; CHECK-SHIFT2-LABEL: f3:
 ; CHECK-SHIFT2: sll %r3, 16
 ; CHECK-SHIFT2: rll
-; CHECK-SHIFT2: clr {{%r[0-9]+}}, %r3
+; CHECK-SHIFT2: clrjle {{%r[0-9]+}}, %r3,
 ; CHECK-SHIFT2: rll
 ; CHECK-SHIFT2: rll
 ; CHECK-SHIFT2: br %r14
@@ -128,8 +127,7 @@ define i16 @f4(i16 *%src, i16 %b) {
 ; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
 ; CHECK: [[LOOP:\.[^:]*]]:
 ; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
-; CHECK: clr [[ROT]], %r3
-; CHECK: jhe [[KEEP:\..*]]
+; CHECK: clrjhe [[ROT]], %r3, [[KEEP:\..*]]
 ; CHECK: risbg [[ROT]], %r3, 32, 47, 0
 ; CHECK: [[KEEP]]:
 ; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
@@ -149,7 +147,7 @@ define i16 @f4(i16 *%src, i16 %b) {
 ; CHECK-SHIFT2-LABEL: f4:
 ; CHECK-SHIFT2: sll %r3, 16
 ; CHECK-SHIFT2: rll
-; CHECK-SHIFT2: clr {{%r[0-9]+}}, %r3
+; CHECK-SHIFT2: clrjhe {{%r[0-9]+}}, %r3,
 ; CHECK-SHIFT2: rll
 ; CHECK-SHIFT2: rll
 ; CHECK-SHIFT2: br %r14
@@ -196,7 +194,7 @@ define i16 @f6(i16 *%src) {
 define i16 @f7(i16 *%src) {
 ; CHECK-LABEL: f7:
 ; CHECK: llilh [[SRC2:%r[0-9]+]], 1
-; CHECK: clr [[ROT:%r[0-9]+]], [[SRC2]]
+; CHECK: clrjle [[ROT:%r[0-9]+]], [[SRC2]],
 ; CHECK: risbg [[ROT]], [[SRC2]], 32, 47, 0
 ; CHECK: br %r14
 ;
@@ -213,7 +211,7 @@ define i16 @f7(i16 *%src) {
 define i16 @f8(i16 *%src) {
 ; CHECK-LABEL: f8:
 ; CHECK: llilh [[SRC2:%r[0-9]+]], 65534
-; CHECK: clr [[ROT:%r[0-9]+]], [[SRC2]]
+; CHECK: clrjhe [[ROT:%r[0-9]+]], [[SRC2]],
 ; CHECK: risbg [[ROT]], [[SRC2]], 32, 47, 0
 ; CHECK: br %r14
 ;
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll
index 3a9485a..f2152c6 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll
@@ -1,6 +1,7 @@
-; Test 32-bit atomic minimum and maximum.
+; Test 32-bit atomic minimum and maximum.  Here we match the z10 versions,
+; which can't use LOCR.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 ; Check signed minium.
 define i32 @f1(i32 %dummy, i32 *%src, i32 %b) {
@@ -37,9 +38,8 @@ define i32 @f3(i32 %dummy, i32 *%src, i32 %b) {
 ; CHECK-LABEL: f3:
 ; CHECK: l %r2, 0(%r3)
 ; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: clr %r2, %r4
 ; CHECK: lr [[NEW:%r[0-9]+]], %r2
-; CHECK: jle [[KEEP:\..*]]
+; CHECK: clrjle %r2, %r4, [[KEEP:\..*]]
 ; CHECK: lr [[NEW]], %r4
 ; CHECK: cs %r2, [[NEW]], 0(%r3)
 ; CHECK: jl [[LOOP]]
@@ -53,9 +53,8 @@ define i32 @f4(i32 %dummy, i32 *%src, i32 %b) {
 ; CHECK-LABEL: f4:
 ; CHECK: l %r2, 0(%r3)
 ; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: clr %r2, %r4
 ; CHECK: lr [[NEW:%r[0-9]+]], %r2
-; CHECK: jhe [[KEEP:\..*]]
+; CHECK: clrjhe %r2, %r4, [[KEEP:\..*]]
 ; CHECK: lr [[NEW]], %r4
 ; CHECK: cs %r2, [[NEW]], 0(%r3)
 ; CHECK: jl [[LOOP]]
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll
index ebed147..037eb1a 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll
@@ -1,6 +1,7 @@
-; Test 64-bit atomic minimum and maximum.
+; Test 64-bit atomic minimum and maximum.  Here we match the z10 versions,
+; which can't use LOCGR.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 ; Check signed minium.
 define i64 @f1(i64 %dummy, i64 *%src, i64 %b) {
@@ -37,9 +38,8 @@ define i64 @f3(i64 %dummy, i64 *%src, i64 %b) {
 ; CHECK-LABEL: f3:
 ; CHECK: lg %r2, 0(%r3)
 ; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: clgr %r2, %r4
 ; CHECK: lgr [[NEW:%r[0-9]+]], %r2
-; CHECK: jle [[KEEP:\..*]]
+; CHECK: clgrjle %r2, %r4, [[KEEP:\..*]]
 ; CHECK: lgr [[NEW]], %r4
 ; CHECK: csg %r2, [[NEW]], 0(%r3)
 ; CHECK: jl [[LOOP]]
@@ -53,9 +53,8 @@ define i64 @f4(i64 %dummy, i64 *%src, i64 %b) {
 ; CHECK-LABEL: f4:
 ; CHECK: lg %r2, 0(%r3)
 ; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: clgr %r2, %r4
 ; CHECK: lgr [[NEW:%r[0-9]+]], %r2
-; CHECK: jhe [[KEEP:\..*]]
+; CHECK: clgrjhe %r2, %r4, [[KEEP:\..*]]
 ; CHECK: lgr [[NEW]], %r4
 ; CHECK: csg %r2, [[NEW]], 0(%r3)
 ; CHECK: jl [[LOOP]]
diff --git a/test/CodeGen/SystemZ/branch-05.ll b/test/CodeGen/SystemZ/branch-05.ll
index d657c9b..b2157b5 100644
--- a/test/CodeGen/SystemZ/branch-05.ll
+++ b/test/CodeGen/SystemZ/branch-05.ll
@@ -5,8 +5,7 @@
 define i32 @f1(i32 %x, i32 %y, i32 %op) {
 ; CHECK-LABEL: f1:
 ; CHECK: ahi %r4, -1
-; CHECK: clfi %r4, 5
-; CHECK-NEXT: jh
+; CHECK: clijh %r4, 5,
 ; CHECK: llgfr [[OP64:%r[0-5]]], %r4
 ; CHECK: sllg [[INDEX:%r[1-5]]], [[OP64]], 3
 ; CHECK: larl [[BASE:%r[1-5]]]
diff --git a/test/CodeGen/SystemZ/branch-06.ll b/test/CodeGen/SystemZ/branch-06.ll
index 13e5a84..2fa23b7 100644
--- a/test/CodeGen/SystemZ/branch-06.ll
+++ b/test/CodeGen/SystemZ/branch-06.ll
@@ -3,6 +3,7 @@
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
 declare i32 @foo()
+@g1 = global i16 0
 
 define void @f1(i32 %target) {
 ; CHECK-LABEL: f1:
@@ -87,3 +88,103 @@ loop:
 exit:
   ret void
 }
+
+; Check that CRJ is used for checking equality with a zero-extending
+; character load.
+define void @f7(i8 *%targetptr) {
+; CHECK-LABEL: f7:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: llc [[REG:%r[0-5]]],
+; CHECK: crje %r2, [[REG]], .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i32 @foo()
+  %byte = load i8 *%targetptr
+  %target = zext i8 %byte to i32
+  %cond = icmp eq i32 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
+
+; ...and zero-extending i16 loads.
+define void @f8(i16 *%targetptr) {
+; CHECK-LABEL: f8:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: llh [[REG:%r[0-5]]],
+; CHECK: crje %r2, [[REG]], .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i32 @foo()
+  %half = load i16 *%targetptr
+  %target = zext i16 %half to i32
+  %cond = icmp eq i32 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
+
+; ...unless the address is a global.
+define void @f9(i16 *%targetptr) {
+; CHECK-LABEL: f9:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clhrl %r2, g1
+; CHECK: je .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i32 @foo()
+  %half = load i16 *@g1
+  %target = zext i16 %half to i32
+  %cond = icmp eq i32 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
+
+; Check that CRJ is used for checking order between two zero-extending
+; byte loads, even if the original comparison was unsigned.
+define void @f10(i8 *%targetptr1) {
+; CHECK-LABEL: f10:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK-DAG: llc [[REG1:%r[0-5]]], 0(
+; CHECK-DAG: llc [[REG2:%r[0-5]]], 1(
+; CHECK: crjl [[REG1]], [[REG2]], .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i32 @foo()
+  %targetptr2 = getelementptr i8 *%targetptr1, i64 1
+  %byte1 = load i8 *%targetptr1
+  %byte2 = load i8 *%targetptr2
+  %ext1 = zext i8 %byte1 to i32
+  %ext2 = zext i8 %byte2 to i32
+  %cond = icmp ult i32 %ext1, %ext2
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
+
+; ...likewise halfword loads.
+define void @f11(i16 *%targetptr1) {
+; CHECK-LABEL: f11:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK-DAG: llh [[REG1:%r[0-5]]], 0(
+; CHECK-DAG: llh [[REG2:%r[0-5]]], 2(
+; CHECK: crjl [[REG1]], [[REG2]], .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i32 @foo()
+  %targetptr2 = getelementptr i16 *%targetptr1, i64 1
+  %half1 = load i16 *%targetptr1
+  %half2 = load i16 *%targetptr2
+  %ext1 = zext i16 %half1 to i32
+  %ext2 = zext i16 %half2 to i32
+  %cond = icmp ult i32 %ext1, %ext2
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/branch-07.ll b/test/CodeGen/SystemZ/branch-07.ll
index b715a05..bac6071 100644
--- a/test/CodeGen/SystemZ/branch-07.ll
+++ b/test/CodeGen/SystemZ/branch-07.ll
@@ -97,10 +97,9 @@ exit:
 ; Test a vector of 0/-1 results for i32 EQ.
 define i64 @f7(i64 %a, i64 %b) {
 ; CHECK-LABEL: f7:
-; CHECK: lhi [[REG:%r[0-5]]], -1
-; CHECK: crje {{%r[0-5]}}
-; CHECK: lhi [[REG]], 0
-; CHECK-NOT: sra
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: afi [[REG]], -268435456
+; CHECK: sra [[REG]], 31
 ; CHECK: br %r14
   %avec = bitcast i64 %a to <2 x i32>
   %bvec = bitcast i64 %b to <2 x i32>
@@ -113,10 +112,9 @@ define i64 @f7(i64 %a, i64 %b) {
 ; Test a vector of 0/-1 results for i32 NE.
 define i64 @f8(i64 %a, i64 %b) {
 ; CHECK-LABEL: f8:
-; CHECK: lhi [[REG:%r[0-5]]], -1
-; CHECK: crjlh {{%r[0-5]}}
-; CHECK: lhi [[REG]], 0
-; CHECK-NOT: sra
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: afi [[REG]], 1879048192
+; CHECK: sra [[REG]], 31
 ; CHECK: br %r14
   %avec = bitcast i64 %a to <2 x i32>
   %bvec = bitcast i64 %b to <2 x i32>
@@ -129,10 +127,10 @@ define i64 @f8(i64 %a, i64 %b) {
 ; Test a vector of 0/-1 results for i64 EQ.
 define void @f9(i64 %a, i64 %b, <2 x i64> *%dest) {
 ; CHECK-LABEL: f9:
-; CHECK: lghi [[REG:%r[0-5]]], -1
-; CHECK: crje {{%r[0-5]}}
-; CHECK: lghi [[REG]], 0
-; CHECK-NOT: sra
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: afi [[REG]], -268435456
+; CHECK: sllg [[REG2:%r[0-5]]], [[REG]], 32
+; CHECK: srag {{%r[0-5]}}, [[REG2]], 63
 ; CHECK: br %r14
   %avec = bitcast i64 %a to <2 x i32>
   %bvec = bitcast i64 %b to <2 x i32>
@@ -145,10 +143,10 @@ define void @f9(i64 %a, i64 %b, <2 x i64> *%dest) {
 ; Test a vector of 0/-1 results for i64 NE.
 define void @f10(i64 %a, i64 %b, <2 x i64> *%dest) {
 ; CHECK-LABEL: f10:
-; CHECK: lghi [[REG:%r[0-5]]], -1
-; CHECK: crjlh {{%r[0-5]}}
-; CHECK: lghi [[REG]], 0
-; CHECK-NOT: sra
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: afi [[REG]], 1879048192
+; CHECK: sllg [[REG2:%r[0-5]]], [[REG]], 32
+; CHECK: srag {{%r[0-5]}}, [[REG2]], 63
 ; CHECK: br %r14
   %avec = bitcast i64 %a to <2 x i32>
   %bvec = bitcast i64 %b to <2 x i32>
diff --git a/test/CodeGen/SystemZ/branch-08.ll b/test/CodeGen/SystemZ/branch-08.ll
index c4dc467..6741d29 100644
--- a/test/CodeGen/SystemZ/branch-08.ll
+++ b/test/CodeGen/SystemZ/branch-08.ll
@@ -6,14 +6,15 @@ declare void @foo() noreturn
 
 ; Check a case where a separate branch is needed and where the original
 ; order should be reversed.
-define i32 @f1(i32 %a, i32 %b) {
+define i32 @f1(i32 %a, i32 *%bptr) {
 ; CHECK-LABEL: f1:
-; CHECK: clr %r2, %r3
+; CHECK: cl %r2, 0(%r3)
 ; CHECK: jl .L[[LABEL:.*]]
 ; CHECK: br %r14
 ; CHECK: .L[[LABEL]]:
 ; CHECK: brasl %r14, foo@PLT
 entry:
+  %b = load i32 *%bptr
   %cmp = icmp ult i32 %a, %b
   br i1 %cmp, label %callit, label %return
 
diff --git a/test/CodeGen/SystemZ/branch-09.ll b/test/CodeGen/SystemZ/branch-09.ll
new file mode 100644
index 0000000..5591f5b
--- /dev/null
+++ b/test/CodeGen/SystemZ/branch-09.ll
@@ -0,0 +1,62 @@
+; Test all condition-code masks that are relevant for CLRJ.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i32 @foo()
+@g1 = global i16 0
+
+define void @f1(i32 %target) {
+; CHECK-LABEL: f1:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clrjle %r2, {{%r[0-9]+}}, .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i32 @foo()
+  %cond = icmp ule i32 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @f2(i32 %target) {
+; CHECK-LABEL: f2:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clrjl %r2, {{%r[0-9]+}}, .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i32 @foo()
+  %cond = icmp ult i32 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @f3(i32 %target) {
+; CHECK-LABEL: f3:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clrjh %r2, {{%r[0-9]+}}, .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i32 @foo()
+  %cond = icmp ugt i32 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @f4(i32 %target) {
+; CHECK-LABEL: f4:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clrjhe %r2, {{%r[0-9]+}}, .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i32 @foo()
+  %cond = icmp uge i32 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/branch-10.ll b/test/CodeGen/SystemZ/branch-10.ll
new file mode 100644
index 0000000..ec6e759
--- /dev/null
+++ b/test/CodeGen/SystemZ/branch-10.ll
@@ -0,0 +1,62 @@
+; Test all condition-code masks that are relevant for CLGRJ.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @foo()
+@g1 = global i16 0
+
+define void @f1(i64 %target) {
+; CHECK-LABEL: f1:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clgrjle %r2, {{%r[0-9]+}}, .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i64 @foo()
+  %cond = icmp ule i64 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @f2(i64 %target) {
+; CHECK-LABEL: f2:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clgrjl %r2, {{%r[0-9]+}}, .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i64 @foo()
+  %cond = icmp ult i64 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @f3(i64 %target) {
+; CHECK-LABEL: f3:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clgrjh %r2, {{%r[0-9]+}}, .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i64 @foo()
+  %cond = icmp ugt i64 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
+
+define void @f4(i64 %target) {
+; CHECK-LABEL: f4:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clgrjhe %r2, {{%r[0-9]+}}, .L[[LABEL]]
+  br label %loop
+loop:
+  %val = call i64 @foo()
+  %cond = icmp uge i64 %val, %target
+  br i1 %cond, label %loop, label %exit
+exit:
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/call-03.ll b/test/CodeGen/SystemZ/call-03.ll
new file mode 100644
index 0000000..1f314ea
--- /dev/null
+++ b/test/CodeGen/SystemZ/call-03.ll
@@ -0,0 +1,125 @@
+; Test sibling calls.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare void @ok(i8 %r2, i16 %r3, i32 %r4, i64 %r5, float %f0, double %f2,
+                 float %f4, double %f6)
+declare void @uses_r6(i8 %r2, i16 %r3, i32 %r4, i64 %r5, i64 %r6)
+declare void @uses_indirect(fp128 %r2)
+declare void @uses_stack(float %f0, float %f2, float %f4, float %f6,
+                         float %stack)
+declare i32 @returns_i32()
+declare i64 @returns_i64()
+
+; Check the maximum number of arguments that we can pass and still use
+; a sibling call.
+define void @f1() {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lzer %f0
+; CHECK-DAG: lzdr %f2
+; CHECK-DAG: lhi %r2, 1
+; CHECK-DAG: lhi %r3, 2
+; CHECK-DAG: lhi %r4, 3
+; CHECK-DAG: lghi %r5, 4
+; CHECK-DAG: {{ler %f4, %f0|lzer %f4}}
+; CHECK-DAG: {{ldr %f6, %f2|lzdr %f6}}
+; CHECK: jg ok@PLT
+  tail call void @ok(i8 1, i16 2, i32 3, i64 4, float 0.0, double 0.0,
+                     float 0.0, double 0.0)
+  ret void
+}
+
+; Check a call that uses %r6 to pass an argument.  At the moment we don't
+; use sibling calls in that case.
+define void @f2() {
+; CHECK-LABEL: f2:
+; CHECK: brasl %r14, uses_r6@PLT
+; CHECK: br %r14
+  tail call void @uses_r6(i8 1, i16 2, i32 3, i64 4, i64 5)
+  ret void
+}
+
+; Check a call that passes indirect arguments.  We can't use sibling
+; calls in that case.
+define void @f3() {
+; CHECK-LABEL: f3:
+; CHECK: brasl %r14, uses_indirect@PLT
+; CHECK: br %r14
+  tail call void @uses_indirect(fp128 0xL00000000000000000000000000000000)
+  ret void
+}
+
+; Check a call that uses direct stack arguments, which again prevents
+; sibling calls
+define void @f4() {
+; CHECK-LABEL: f4:
+; CHECK: brasl %r14, uses_stack@PLT
+; CHECK: br %r14
+  tail call void @uses_stack(float 0.0, float 0.0, float 0.0, float 0.0,
+                             float 0.0)
+  ret void
+}
+
+; Check an indirect call.  In this case the only acceptable choice for
+; the target register is %r1.
+define void @f5(void(i32, i32, i32, i32) *%foo) {
+; CHECK-LABEL: f5:
+; CHECK: lgr %r1, %r2
+; CHECK-DAG: lhi %r2, 1
+; CHECK-DAG: lhi %r3, 2
+; CHECK-DAG: lhi %r4, 3
+; CHECK-DAG: lhi %r5, 4
+; CHECK: br %r1
+  tail call void %foo(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Check an indirect call that will be forced into a call-saved GPR
+; (which should be %r13, the highest GPR not used for anything else).
+define void @f6(void(i32) *%foo) {
+; CHECK-LABEL: f6:
+; CHECK: stmg %r13, %r15, 104(%r15)
+; CHECK: lgr %r13, %r2
+; CHECK: brasl %r14, returns_i32
+; CHECK: lgr %r1, %r13
+; CHECK: lmg %r13, %r15, 264(%r15)
+; CHECK: br %r1
+  %arg = call i32 @returns_i32()
+  tail call void %foo(i32 %arg)
+  ret void
+}
+
+; Test a function that returns a value.
+define i64 @f7() {
+; CHECK-LABEL: f7:
+; CHECK: jg returns_i64@PLT
+  %res = tail call i64 @returns_i64()
+  ret i64 %res
+}
+
+; Test a function that returns a value truncated from i64 to i32.
+define i32 @f8() {
+; CHECK-LABEL: f8:
+; CHECK: jg returns_i64@PLT
+  %res = tail call i64 @returns_i64()
+  %trunc = trunc i64 %res to i32
+  ret i32 %trunc
+}
+
+; Test a function that returns a value truncated from i64 to i7.
+define i7 @f9() {
+; CHECK-LABEL: f9:
+; CHECK: jg returns_i64@PLT
+  %res = tail call i64 @returns_i64()
+  %trunc = trunc i64 %res to i7
+  ret i7 %trunc
+}
+
+; Test a function that returns a value truncated from i32 to i8.
+define i8 @f10() {
+; CHECK-LABEL: f10:
+; CHECK: jg returns_i32@PLT
+  %res = tail call i32 @returns_i32()
+  %trunc = trunc i32 %res to i8
+  ret i8 %trunc
+}
diff --git a/test/CodeGen/SystemZ/cond-store-01.ll b/test/CodeGen/SystemZ/cond-store-01.ll
index 80e6d91..d55ea21 100644
--- a/test/CodeGen/SystemZ/cond-store-01.ll
+++ b/test/CodeGen/SystemZ/cond-store-01.ll
@@ -1,6 +1,7 @@
-; Test 8-bit conditional stores that are presented as selects.
+; Test 8-bit conditional stores that are presented as selects.  The volatile
+; tests require z10, which use a branch instead of a LOCR.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 declare void @foo(i8 *)
 
@@ -13,7 +14,7 @@ define void @f1(i8 *%ptr, i8 %alt, i32 %limit) {
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
@@ -29,7 +30,7 @@ define void @f2(i8 *%ptr, i8 %alt, i32 %limit) {
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %alt, i8 %orig
   store i8 %res, i8 *%ptr
@@ -46,7 +47,7 @@ define void @f3(i8 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %ext = sext i8 %orig to i32
   %res = select i1 %cond, i32 %ext, i32 %alt
@@ -64,7 +65,7 @@ define void @f4(i8 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %ext = sext i8 %orig to i32
   %res = select i1 %cond, i32 %alt, i32 %ext
@@ -83,7 +84,7 @@ define void @f5(i8 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %ext = zext i8 %orig to i32
   %res = select i1 %cond, i32 %ext, i32 %alt
@@ -101,7 +102,7 @@ define void @f6(i8 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %ext = zext i8 %orig to i32
   %res = select i1 %cond, i32 %alt, i32 %ext
@@ -120,7 +121,7 @@ define void @f7(i8 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %ext = sext i8 %orig to i64
   %res = select i1 %cond, i64 %ext, i64 %alt
@@ -138,7 +139,7 @@ define void @f8(i8 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %ext = sext i8 %orig to i64
   %res = select i1 %cond, i64 %alt, i64 %ext
@@ -157,7 +158,7 @@ define void @f9(i8 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %ext = zext i8 %orig to i64
   %res = select i1 %cond, i64 %ext, i64 %alt
@@ -175,7 +176,7 @@ define void @f10(i8 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %ext = zext i8 %orig to i64
   %res = select i1 %cond, i64 %alt, i64 %ext
@@ -194,7 +195,7 @@ define void @f11(i8 *%base, i8 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i8 *%base, i64 4095
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
@@ -211,7 +212,7 @@ define void @f12(i8 *%base, i8 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i8 *%base, i64 4096
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
@@ -228,7 +229,7 @@ define void @f13(i8 *%base, i8 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i8 *%base, i64 524287
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
@@ -247,7 +248,7 @@ define void @f14(i8 *%base, i8 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i8 *%base, i64 524288
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
@@ -264,7 +265,7 @@ define void @f15(i8 *%base, i8 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i8 *%base, i64 -524288
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
@@ -283,7 +284,7 @@ define void @f16(i8 *%base, i8 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i8 *%base, i64 -524289
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
@@ -302,7 +303,7 @@ define void @f17(i64 %base, i64 %index, i8 %alt, i32 %limit) {
   %add1 = add i64 %base, %index
   %add2 = add i64 %add1, 4096
   %ptr = inttoptr i64 %add2 to i8 *
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
@@ -317,7 +318,7 @@ define void @f18(i8 *%ptr, i8 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: stc {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load volatile i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
@@ -332,7 +333,7 @@ define void @f19(i8 *%ptr, i8 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: stc %r3, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store volatile i8 %res, i8 *%ptr
@@ -352,7 +353,7 @@ define void @f20(i8 *%ptr, i8 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: stc {{%r[0-9]+}},
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load atomic i8 *%ptr unordered, align 1
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
@@ -368,7 +369,7 @@ define void @f21(i8 *%ptr, i8 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: cs {{%r[0-9]+}},
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store atomic i8 %res, i8 *%ptr unordered, align 1
@@ -388,7 +389,7 @@ define void @f22(i8 %alt, i32 %limit) {
 ; CHECK: br %r14
   %ptr = alloca i8
   call void @foo(i8 *%ptr)
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i8 *%ptr
   %res = select i1 %cond, i8 %orig, i8 %alt
   store i8 %res, i8 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-02.ll b/test/CodeGen/SystemZ/cond-store-02.ll
index e01a853..91bc486 100644
--- a/test/CodeGen/SystemZ/cond-store-02.ll
+++ b/test/CodeGen/SystemZ/cond-store-02.ll
@@ -1,6 +1,7 @@
-; Test 16-bit conditional stores that are presented as selects.
+; Test 16-bit conditional stores that are presented as selects.  The volatile
+; tests require z10, which use a branch instead of a LOCR.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 declare void @foo(i16 *)
 
@@ -13,7 +14,7 @@ define void @f1(i16 *%ptr, i16 %alt, i32 %limit) {
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
@@ -29,7 +30,7 @@ define void @f2(i16 *%ptr, i16 %alt, i32 %limit) {
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %alt, i16 %orig
   store i16 %res, i16 *%ptr
@@ -46,7 +47,7 @@ define void @f3(i16 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %ext = sext i16 %orig to i32
   %res = select i1 %cond, i32 %ext, i32 %alt
@@ -64,7 +65,7 @@ define void @f4(i16 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %ext = sext i16 %orig to i32
   %res = select i1 %cond, i32 %alt, i32 %ext
@@ -83,7 +84,7 @@ define void @f5(i16 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %ext = zext i16 %orig to i32
   %res = select i1 %cond, i32 %ext, i32 %alt
@@ -101,7 +102,7 @@ define void @f6(i16 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %ext = zext i16 %orig to i32
   %res = select i1 %cond, i32 %alt, i32 %ext
@@ -120,7 +121,7 @@ define void @f7(i16 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %ext = sext i16 %orig to i64
   %res = select i1 %cond, i64 %ext, i64 %alt
@@ -138,7 +139,7 @@ define void @f8(i16 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %ext = sext i16 %orig to i64
   %res = select i1 %cond, i64 %alt, i64 %ext
@@ -157,7 +158,7 @@ define void @f9(i16 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %ext = zext i16 %orig to i64
   %res = select i1 %cond, i64 %ext, i64 %alt
@@ -175,7 +176,7 @@ define void @f10(i16 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %ext = zext i16 %orig to i64
   %res = select i1 %cond, i64 %alt, i64 %ext
@@ -194,7 +195,7 @@ define void @f11(i16 *%base, i16 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i16 *%base, i64 2047
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
@@ -211,7 +212,7 @@ define void @f12(i16 *%base, i16 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i16 *%base, i64 2048
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
@@ -228,7 +229,7 @@ define void @f13(i16 *%base, i16 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i16 *%base, i64 262143
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
@@ -247,7 +248,7 @@ define void @f14(i16 *%base, i16 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i16 *%base, i64 262144
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
@@ -264,7 +265,7 @@ define void @f15(i16 *%base, i16 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i16 *%base, i64 -262144
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
@@ -283,7 +284,7 @@ define void @f16(i16 *%base, i16 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i16 *%base, i64 -262145
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
@@ -302,7 +303,7 @@ define void @f17(i64 %base, i64 %index, i16 %alt, i32 %limit) {
   %add1 = add i64 %base, %index
   %add2 = add i64 %add1, 4096
   %ptr = inttoptr i64 %add2 to i16 *
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
@@ -317,7 +318,7 @@ define void @f18(i16 *%ptr, i16 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: sth {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load volatile i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
@@ -332,7 +333,7 @@ define void @f19(i16 *%ptr, i16 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: sth %r3, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store volatile i16 %res, i16 *%ptr
@@ -352,7 +353,7 @@ define void @f20(i16 *%ptr, i16 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: sth {{%r[0-9]+}},
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load atomic i16 *%ptr unordered, align 2
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
@@ -368,7 +369,7 @@ define void @f21(i16 *%ptr, i16 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: cs {{%r[0-9]+}},
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store atomic i16 %res, i16 *%ptr unordered, align 2
@@ -388,7 +389,7 @@ define void @f22(i16 %alt, i32 %limit) {
 ; CHECK: br %r14
   %ptr = alloca i16
   call void @foo(i16 *%ptr)
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i16 *%ptr
   %res = select i1 %cond, i16 %orig, i16 %alt
   store i16 %res, i16 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-03.ll b/test/CodeGen/SystemZ/cond-store-03.ll
index e122bc2..d4fd48d 100644
--- a/test/CodeGen/SystemZ/cond-store-03.ll
+++ b/test/CodeGen/SystemZ/cond-store-03.ll
@@ -13,7 +13,7 @@ define void @f1(i32 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: st %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
@@ -29,7 +29,7 @@ define void @f2(i32 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: st %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %alt, i32 %orig
   store i32 %res, i32 *%ptr
@@ -46,7 +46,7 @@ define void @f3(i32 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: st %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %ext = sext i32 %orig to i64
   %res = select i1 %cond, i64 %ext, i64 %alt
@@ -64,7 +64,7 @@ define void @f4(i32 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: st %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %ext = sext i32 %orig to i64
   %res = select i1 %cond, i64 %alt, i64 %ext
@@ -83,7 +83,7 @@ define void @f5(i32 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: st %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %ext = zext i32 %orig to i64
   %res = select i1 %cond, i64 %ext, i64 %alt
@@ -101,7 +101,7 @@ define void @f6(i32 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: st %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %ext = zext i32 %orig to i64
   %res = select i1 %cond, i64 %alt, i64 %ext
@@ -120,7 +120,7 @@ define void @f7(i32 *%base, i32 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i32 *%base, i64 1023
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
@@ -137,7 +137,7 @@ define void @f8(i32 *%base, i32 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i32 *%base, i64 1024
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
@@ -154,7 +154,7 @@ define void @f9(i32 *%base, i32 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i32 *%base, i64 131071
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
@@ -173,7 +173,7 @@ define void @f10(i32 *%base, i32 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i32 *%base, i64 131072
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
@@ -190,7 +190,7 @@ define void @f11(i32 *%base, i32 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i32 *%base, i64 -131072
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
@@ -209,7 +209,7 @@ define void @f12(i32 *%base, i32 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i32 *%base, i64 -131073
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
@@ -228,7 +228,7 @@ define void @f13(i64 %base, i64 %index, i32 %alt, i32 %limit) {
   %add1 = add i64 %base, %index
   %add2 = add i64 %add1, 4096
   %ptr = inttoptr i64 %add2 to i32 *
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
@@ -243,7 +243,7 @@ define void @f14(i32 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: st {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load volatile i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
@@ -258,7 +258,7 @@ define void @f15(i32 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: st %r3, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store volatile i32 %res, i32 *%ptr
@@ -277,7 +277,7 @@ define void @f16(i32 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: st {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load atomic i32 *%ptr unordered, align 4
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
@@ -293,7 +293,7 @@ define void @f17(i32 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: cs {{%r[0-5]}}, %r3, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store atomic i32 %res, i32 *%ptr unordered, align 4
@@ -313,7 +313,7 @@ define void @f18(i32 %alt, i32 %limit) {
 ; CHECK: br %r14
   %ptr = alloca i32
   call void @foo(i32 *%ptr)
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i32 *%ptr
   %res = select i1 %cond, i32 %orig, i32 %alt
   store i32 %res, i32 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-04.ll b/test/CodeGen/SystemZ/cond-store-04.ll
index 4ed23a3..fc565c4 100644
--- a/test/CodeGen/SystemZ/cond-store-04.ll
+++ b/test/CodeGen/SystemZ/cond-store-04.ll
@@ -13,7 +13,7 @@ define void @f1(i64 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: stg %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
   %res = select i1 %cond, i64 %orig, i64 %alt
   store i64 %res, i64 *%ptr
@@ -29,7 +29,7 @@ define void @f2(i64 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: stg %r3, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
   %res = select i1 %cond, i64 %alt, i64 %orig
   store i64 %res, i64 *%ptr
@@ -46,7 +46,7 @@ define void @f3(i64 *%base, i64 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i64 *%base, i64 65535
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
   %res = select i1 %cond, i64 %orig, i64 %alt
   store i64 %res, i64 *%ptr
@@ -65,7 +65,7 @@ define void @f4(i64 *%base, i64 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i64 *%base, i64 65536
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
   %res = select i1 %cond, i64 %orig, i64 %alt
   store i64 %res, i64 *%ptr
@@ -82,7 +82,7 @@ define void @f5(i64 *%base, i64 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i64 *%base, i64 -65536
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
   %res = select i1 %cond, i64 %orig, i64 %alt
   store i64 %res, i64 *%ptr
@@ -101,7 +101,7 @@ define void @f6(i64 *%base, i64 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr i64 *%base, i64 -65537
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
   %res = select i1 %cond, i64 %orig, i64 %alt
   store i64 %res, i64 *%ptr
@@ -120,7 +120,7 @@ define void @f7(i64 %base, i64 %index, i64 %alt, i32 %limit) {
   %add1 = add i64 %base, %index
   %add2 = add i64 %add1, 524287
   %ptr = inttoptr i64 %add2 to i64 *
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
   %res = select i1 %cond, i64 %orig, i64 %alt
   store i64 %res, i64 *%ptr
@@ -135,7 +135,7 @@ define void @f8(i64 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: stg {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load volatile i64 *%ptr
   %res = select i1 %cond, i64 %orig, i64 %alt
   store i64 %res, i64 *%ptr
@@ -150,7 +150,7 @@ define void @f9(i64 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: stg %r3, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
   %res = select i1 %cond, i64 %orig, i64 %alt
   store volatile i64 %res, i64 *%ptr
@@ -169,7 +169,7 @@ define void @f10(i64 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: stg {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load atomic i64 *%ptr unordered, align 8
   %res = select i1 %cond, i64 %orig, i64 %alt
   store i64 %res, i64 *%ptr
@@ -185,7 +185,7 @@ define void @f11(i64 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: csg {{%r[0-5]}}, %r3, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
   %res = select i1 %cond, i64 %orig, i64 %alt
   store atomic i64 %res, i64 *%ptr unordered, align 8
@@ -205,7 +205,7 @@ define void @f12(i64 %alt, i32 %limit) {
 ; CHECK: br %r14
   %ptr = alloca i64
   call void @foo(i64 *%ptr)
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load i64 *%ptr
   %res = select i1 %cond, i64 %orig, i64 %alt
   store i64 %res, i64 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-05.ll b/test/CodeGen/SystemZ/cond-store-05.ll
index e41c8fe..f8056f7 100644
--- a/test/CodeGen/SystemZ/cond-store-05.ll
+++ b/test/CodeGen/SystemZ/cond-store-05.ll
@@ -13,7 +13,7 @@ define void @f1(float *%ptr, float %alt, i32 %limit) {
 ; CHECK: ste %f0, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store float %res, float *%ptr
@@ -29,7 +29,7 @@ define void @f2(float *%ptr, float %alt, i32 %limit) {
 ; CHECK: ste %f0, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %alt, float %orig
   store float %res, float *%ptr
@@ -46,7 +46,7 @@ define void @f3(float *%base, float %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr float *%base, i64 1023
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store float %res, float *%ptr
@@ -63,7 +63,7 @@ define void @f4(float *%base, float %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr float *%base, i64 1024
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store float %res, float *%ptr
@@ -80,7 +80,7 @@ define void @f5(float *%base, float %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr float *%base, i64 131071
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store float %res, float *%ptr
@@ -99,7 +99,7 @@ define void @f6(float *%base, float %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr float *%base, i64 131072
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store float %res, float *%ptr
@@ -116,7 +116,7 @@ define void @f7(float *%base, float %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr float *%base, i64 -131072
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store float %res, float *%ptr
@@ -135,7 +135,7 @@ define void @f8(float *%base, float %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr float *%base, i64 -131073
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store float %res, float *%ptr
@@ -154,7 +154,7 @@ define void @f9(i64 %base, i64 %index, float %alt, i32 %limit) {
   %add1 = add i64 %base, %index
   %add2 = add i64 %add1, 4096
   %ptr = inttoptr i64 %add2 to float *
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store float %res, float *%ptr
@@ -169,7 +169,7 @@ define void @f10(float *%ptr, float %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: ste {{%f[0-5]}}, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load volatile float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store float %res, float *%ptr
@@ -184,7 +184,7 @@ define void @f11(float *%ptr, float %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: ste %f0, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store volatile float %res, float *%ptr
@@ -204,7 +204,7 @@ define void @f12(float %alt, i32 %limit) {
 ; CHECK: br %r14
   %ptr = alloca float
   call void @foo(float *%ptr)
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load float *%ptr
   %res = select i1 %cond, float %orig, float %alt
   store float %res, float *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-06.ll b/test/CodeGen/SystemZ/cond-store-06.ll
index 759a3e0..6668195 100644
--- a/test/CodeGen/SystemZ/cond-store-06.ll
+++ b/test/CodeGen/SystemZ/cond-store-06.ll
@@ -13,7 +13,7 @@ define void @f1(double *%ptr, double %alt, i32 %limit) {
 ; CHECK: std %f0, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store double %res, double *%ptr
@@ -29,7 +29,7 @@ define void @f2(double *%ptr, double %alt, i32 %limit) {
 ; CHECK: std %f0, 0(%r2)
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %alt, double %orig
   store double %res, double *%ptr
@@ -46,7 +46,7 @@ define void @f3(double *%base, double %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr double *%base, i64 511
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store double %res, double *%ptr
@@ -63,7 +63,7 @@ define void @f4(double *%base, double %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr double *%base, i64 512
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store double %res, double *%ptr
@@ -80,7 +80,7 @@ define void @f5(double *%base, double %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr double *%base, i64 65535
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store double %res, double *%ptr
@@ -99,7 +99,7 @@ define void @f6(double *%base, double %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr double *%base, i64 65536
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store double %res, double *%ptr
@@ -116,7 +116,7 @@ define void @f7(double *%base, double %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr double *%base, i64 -65536
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store double %res, double *%ptr
@@ -135,7 +135,7 @@ define void @f8(double *%base, double %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: br %r14
   %ptr = getelementptr double *%base, i64 -65537
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store double %res, double *%ptr
@@ -154,7 +154,7 @@ define void @f9(i64 %base, i64 %index, double %alt, i32 %limit) {
   %add1 = add i64 %base, %index
   %add2 = add i64 %add1, 524287
   %ptr = inttoptr i64 %add2 to double *
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store double %res, double *%ptr
@@ -169,7 +169,7 @@ define void @f10(double *%ptr, double %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: std {{%f[0-5]}}, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load volatile double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store double %res, double *%ptr
@@ -184,7 +184,7 @@ define void @f11(double *%ptr, double %alt, i32 %limit) {
 ; CHECK: [[LABEL]]:
 ; CHECK: std %f0, 0(%r2)
 ; CHECK: br %r14
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store volatile double %res, double *%ptr
@@ -204,7 +204,7 @@ define void @f12(double %alt, i32 %limit) {
 ; CHECK: br %r14
   %ptr = alloca double
   call void @foo(double *%ptr)
-  %cond = icmp ult i32 %limit, 42
+  %cond = icmp ult i32 %limit, 420
   %orig = load double *%ptr
   %res = select i1 %cond, double %orig, double %alt
   store double %res, double *%ptr
diff --git a/test/CodeGen/SystemZ/fp-cmp-01.ll b/test/CodeGen/SystemZ/fp-cmp-01.ll
index 6a9598e..d7c0cce 100644
--- a/test/CodeGen/SystemZ/fp-cmp-01.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-01.ll
@@ -1,6 +1,7 @@
-; Test 32-bit floating-point comparison.
+; Test 32-bit floating-point comparison.  The tests assume a z10 implementation
+; of select, using conditional branches rather than LOCGR.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 declare float @foo()
 
@@ -159,3 +160,160 @@ define i64 @f8(i64 %a, i64 %b, float %f) {
   %res = select i1 %cond, i64 %a, i64 %b
   ret i64 %res
 }
+
+; Check the comparison can be reversed if that allows CEB to be used,
+; first with oeq.
+define i64 @f9(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: je {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp oeq float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then one.
+define i64 @f10(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f10:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jlh {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp one float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then olt.
+define i64 @f11(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f11:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp olt float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then ole.
+define i64 @f12(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f12:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jhe {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp ole float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then oge.
+define i64 @f13(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f13:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jle {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp oge float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then ogt.
+define i64 @f14(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f14:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jl {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp ogt float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then ueq.
+define i64 @f15(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f15:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jnlh {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp ueq float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then une.
+define i64 @f16(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f16:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jne {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp une float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then ult.
+define i64 @f17(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f17:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jnle {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp ult float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then ule.
+define i64 @f18(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f18:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jnl {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp ule float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then uge.
+define i64 @f19(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f19:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jnh {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp uge float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; ...then ugt.
+define i64 @f20(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f20:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jnhe {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load float *%ptr
+  %cond = fcmp ugt float %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/fp-cmp-02.ll b/test/CodeGen/SystemZ/fp-cmp-02.ll
index 309d12e..c61f04e 100644
--- a/test/CodeGen/SystemZ/fp-cmp-02.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-02.ll
@@ -1,6 +1,7 @@
-; Test 64-bit floating-point comparison.
+; Test 64-bit floating-point comparison.  The tests assume a z10 implementation
+; of select, using conditional branches rather than LOCGR.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 declare double @foo()
 
@@ -159,3 +160,16 @@ define i64 @f8(i64 %a, i64 %b, double %f) {
   %res = select i1 %cond, i64 %a, i64 %b
   ret i64 %res
 }
+
+; Check the comparison can be reversed if that allows CDB to be used,
+define i64 @f9(i64 %a, i64 %b, double %f2, double *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: cdb %f0, 0(%r4)
+; CHECK-NEXT: jl {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+  %f1 = load double *%ptr
+  %cond = fcmp ogt double %f1, %f2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/fp-cmp-03.ll b/test/CodeGen/SystemZ/fp-cmp-03.ll
index 0f71f4e..e777d00 100644
--- a/test/CodeGen/SystemZ/fp-cmp-03.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-03.ll
@@ -1,6 +1,7 @@
-; Test 128-bit floating-point comparison.
+; Test 128-bit floating-point comparison.  The tests assume a z10 implementation
+; of select, using conditional branches rather than LOCGR.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 ; There is no memory form of 128-bit comparison.
 define i64 @f1(i64 %a, i64 %b, fp128 *%ptr, float %f2) {
diff --git a/test/CodeGen/SystemZ/fp-move-02.ll b/test/CodeGen/SystemZ/fp-move-02.ll
index b4f0428..505ee8d 100644
--- a/test/CodeGen/SystemZ/fp-move-02.ll
+++ b/test/CodeGen/SystemZ/fp-move-02.ll
@@ -1,6 +1,7 @@
-; Test moves between FPRs and GPRs.
+; Test moves between FPRs and GPRs.  The 32-bit cases test the z10
+; implementation, which has no high-word support.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 declare i64 @foo()
 declare double @bar()
@@ -63,11 +64,11 @@ define double @f5(i64 %a) {
 
 ; Test 128-bit moves from GPRs to FPRs.  i128 isn't a legitimate type,
 ; so this goes through memory.
-; FIXME: it would be better to use one MVC here.
 define void @f6(fp128 *%a, i128 *%b) {
 ; CHECK-LABEL: f6:
 ; CHECK: lg
-; CHECK: mvc
+; CHECK: lg
+; CHECK: stg
 ; CHECK: stg
 ; CHECK: br %r14
   %val = load i128 *%b
diff --git a/test/CodeGen/SystemZ/fp-move-09.ll b/test/CodeGen/SystemZ/fp-move-09.ll
new file mode 100644
index 0000000..52b2ee2
--- /dev/null
+++ b/test/CodeGen/SystemZ/fp-move-09.ll
@@ -0,0 +1,62 @@
+; Test moves between FPRs and GPRs for z196 and above.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check that moves from i32s to floats can use high registers.
+define float @f1(i16 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: llhh [[REG:%r[0-5]]], 0(%r2)
+; CHECK: oihh [[REG]], 16256
+; CHECK: ldgr %f0, [[REG]]
+; CHECK: br %r14
+  %base = load i16 *%ptr
+  %ext = zext i16 %base to i32
+  %full = or i32 %ext, 1065353216
+  %res = bitcast i32 %full to float
+  ret float %res
+}
+
+; Check that moves from floats to i32s can use high registers.
+; This "store the low byte" technique is used by llvmpipe, for example.
+define void @f2(float %val, i8 *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: lgdr [[REG:%r[0-5]]], %f0
+; CHECK: stch [[REG]], 0(%r2)
+; CHECK: br %r14
+  %res = bitcast float %val to i32
+  %trunc = trunc i32 %res to i8
+  store i8 %trunc, i8 *%ptr
+  ret void
+}
+
+; Like f2, but with a conditional store.
+define void @f3(float %val, i8 *%ptr, i32 %which) {
+; CHECK-LABEL: f3:
+; CHECK: cijlh %r3, 0,
+; CHECK: lgdr [[REG:%r[0-5]]], %f0
+; CHECK: stch [[REG]], 0(%r2)
+; CHECK: br %r14
+  %int = bitcast float %val to i32
+  %trunc = trunc i32 %int to i8
+  %old = load i8 *%ptr
+  %cmp = icmp eq i32 %which, 0
+  %res = select i1 %cmp, i8 %trunc, i8 %old
+  store i8 %res, i8 *%ptr
+  ret void
+}
+
+; ...and again with 16-bit memory.
+define void @f4(float %val, i16 *%ptr, i32 %which) {
+; CHECK-LABEL: f4:
+; CHECK: cijlh %r3, 0,
+; CHECK: lgdr [[REG:%r[0-5]]], %f0
+; CHECK: sthh [[REG]], 0(%r2)
+; CHECK: br %r14
+  %int = bitcast float %val to i32
+  %trunc = trunc i32 %int to i16
+  %old = load i16 *%ptr
+  %cmp = icmp eq i32 %which, 0
+  %res = select i1 %cmp, i16 %trunc, i16 %old
+  store i16 %res, i16 *%ptr
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/fp-round-01.ll b/test/CodeGen/SystemZ/fp-round-01.ll
index f2530dc..565db5a 100644
--- a/test/CodeGen/SystemZ/fp-round-01.ll
+++ b/test/CodeGen/SystemZ/fp-round-01.ll
@@ -1,9 +1,8 @@
-; Test rint()-like rounding, with non-integer values triggering an
-; inexact condition.
+; Test rounding functions for z10.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
-; Test f32.
+; Test rint for f32.
 declare float @llvm.rint.f32(float %f)
 define float @f1(float %f) {
 ; CHECK-LABEL: f1:
@@ -13,7 +12,7 @@ define float @f1(float %f) {
   ret float %res
 }
 
-; Test f64.
+; Test rint for f64.
 declare double @llvm.rint.f64(double %f)
 define double @f2(double %f) {
 ; CHECK-LABEL: f2:
@@ -23,7 +22,7 @@ define double @f2(double %f) {
   ret double %res
 }
 
-; Test f128.
+; Test rint for f128.
 declare fp128 @llvm.rint.f128(fp128 %f)
 define void @f3(fp128 *%ptr) {
 ; CHECK-LABEL: f3:
@@ -34,3 +33,118 @@ define void @f3(fp128 *%ptr) {
   store fp128 %res, fp128 *%ptr
   ret void
 }
+
+; Test nearbyint for f32.
+declare float @llvm.nearbyint.f32(float %f)
+define float @f4(float %f) {
+; CHECK-LABEL: f4:
+; CHECK: brasl %r14, nearbyintf@PLT
+; CHECK: br %r14
+  %res = call float @llvm.nearbyint.f32(float %f)
+  ret float %res
+}
+
+; Test nearbyint for f64.
+declare double @llvm.nearbyint.f64(double %f)
+define double @f5(double %f) {
+; CHECK-LABEL: f5:
+; CHECK: brasl %r14, nearbyint@PLT
+; CHECK: br %r14
+  %res = call double @llvm.nearbyint.f64(double %f)
+  ret double %res
+}
+
+; Test nearbyint for f128: omitted for now because we cannot handle
+; indirect arguments.
+
+; Test floor for f32.
+declare float @llvm.floor.f32(float %f)
+define float @f7(float %f) {
+; CHECK-LABEL: f7:
+; CHECK: brasl %r14, floorf@PLT
+; CHECK: br %r14
+  %res = call float @llvm.floor.f32(float %f)
+  ret float %res
+}
+
+; Test floor for f64.
+declare double @llvm.floor.f64(double %f)
+define double @f8(double %f) {
+; CHECK-LABEL: f8:
+; CHECK: brasl %r14, floor@PLT
+; CHECK: br %r14
+  %res = call double @llvm.floor.f64(double %f)
+  ret double %res
+}
+
+; Test floor for f128: omitted for now because we cannot handle
+; indirect arguments.
+
+; Test ceil for f32.
+declare float @llvm.ceil.f32(float %f)
+define float @f10(float %f) {
+; CHECK-LABEL: f10:
+; CHECK: brasl %r14, ceilf@PLT
+; CHECK: br %r14
+  %res = call float @llvm.ceil.f32(float %f)
+  ret float %res
+}
+
+; Test ceil for f64.
+declare double @llvm.ceil.f64(double %f)
+define double @f11(double %f) {
+; CHECK-LABEL: f11:
+; CHECK: brasl %r14, ceil@PLT
+; CHECK: br %r14
+  %res = call double @llvm.ceil.f64(double %f)
+  ret double %res
+}
+
+; Test ceil for f128: omitted for now because we cannot handle
+; indirect arguments.
+
+; Test trunc for f32.
+declare float @llvm.trunc.f32(float %f)
+define float @f13(float %f) {
+; CHECK-LABEL: f13:
+; CHECK: brasl %r14, truncf@PLT
+; CHECK: br %r14
+  %res = call float @llvm.trunc.f32(float %f)
+  ret float %res
+}
+
+; Test trunc for f64.
+declare double @llvm.trunc.f64(double %f)
+define double @f14(double %f) {
+; CHECK-LABEL: f14:
+; CHECK: brasl %r14, trunc@PLT
+; CHECK: br %r14
+  %res = call double @llvm.trunc.f64(double %f)
+  ret double %res
+}
+
+; Test trunc for f128: omitted for now because we cannot handle
+; indirect arguments.
+
+; Test round for f32.
+declare float @llvm.round.f32(float %f)
+define float @f16(float %f) {
+; CHECK-LABEL: f16:
+; CHECK: brasl %r14, roundf@PLT
+; CHECK: br %r14
+  %res = call float @llvm.round.f32(float %f)
+  ret float %res
+}
+
+; Test round for f64.
+declare double @llvm.round.f64(double %f)
+define double @f17(double %f) {
+; CHECK-LABEL: f17:
+; CHECK: brasl %r14, round@PLT
+; CHECK: br %r14
+  %res = call double @llvm.round.f64(double %f)
+  ret double %res
+}
+
+; Test round for f128: omitted for now because we cannot handle
+; indirect arguments.
diff --git a/test/CodeGen/SystemZ/fp-round-02.ll b/test/CodeGen/SystemZ/fp-round-02.ll
new file mode 100644
index 0000000..d79c9c4
--- /dev/null
+++ b/test/CodeGen/SystemZ/fp-round-02.ll
@@ -0,0 +1,195 @@
+; Test rounding functions for z196 and above.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Test rint for f32.
+declare float @llvm.rint.f32(float %f)
+define float @f1(float %f) {
+; CHECK-LABEL: f1:
+; CHECK: fiebr %f0, 0, %f0
+; CHECK: br %r14
+  %res = call float @llvm.rint.f32(float %f)
+  ret float %res
+}
+
+; Test rint for f64.
+declare double @llvm.rint.f64(double %f)
+define double @f2(double %f) {
+; CHECK-LABEL: f2:
+; CHECK: fidbr %f0, 0, %f0
+; CHECK: br %r14
+  %res = call double @llvm.rint.f64(double %f)
+  ret double %res
+}
+
+; Test rint for f128.
+declare fp128 @llvm.rint.f128(fp128 %f)
+define void @f3(fp128 *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK: fixbr %f0, 0, %f0
+; CHECK: br %r14
+  %src = load fp128 *%ptr
+  %res = call fp128 @llvm.rint.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
+
+; Test nearbyint for f32.
+declare float @llvm.nearbyint.f32(float %f)
+define float @f4(float %f) {
+; CHECK-LABEL: f4:
+; CHECK: fiebra %f0, 0, %f0, 4
+; CHECK: br %r14
+  %res = call float @llvm.nearbyint.f32(float %f)
+  ret float %res
+}
+
+; Test nearbyint for f64.
+declare double @llvm.nearbyint.f64(double %f)
+define double @f5(double %f) {
+; CHECK-LABEL: f5:
+; CHECK: fidbra %f0, 0, %f0, 4
+; CHECK: br %r14
+  %res = call double @llvm.nearbyint.f64(double %f)
+  ret double %res
+}
+
+; Test nearbyint for f128.
+declare fp128 @llvm.nearbyint.f128(fp128 %f)
+define void @f6(fp128 *%ptr) {
+; CHECK-LABEL: f6:
+; CHECK: fixbra %f0, 0, %f0, 4
+; CHECK: br %r14
+  %src = load fp128 *%ptr
+  %res = call fp128 @llvm.nearbyint.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
+
+; Test floor for f32.
+declare float @llvm.floor.f32(float %f)
+define float @f7(float %f) {
+; CHECK-LABEL: f7:
+; CHECK: fiebra %f0, 7, %f0, 4
+; CHECK: br %r14
+  %res = call float @llvm.floor.f32(float %f)
+  ret float %res
+}
+
+; Test floor for f64.
+declare double @llvm.floor.f64(double %f)
+define double @f8(double %f) {
+; CHECK-LABEL: f8:
+; CHECK: fidbra %f0, 7, %f0, 4
+; CHECK: br %r14
+  %res = call double @llvm.floor.f64(double %f)
+  ret double %res
+}
+
+; Test floor for f128.
+declare fp128 @llvm.floor.f128(fp128 %f)
+define void @f9(fp128 *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: fixbra %f0, 7, %f0, 4
+; CHECK: br %r14
+  %src = load fp128 *%ptr
+  %res = call fp128 @llvm.floor.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
+
+; Test ceil for f32.
+declare float @llvm.ceil.f32(float %f)
+define float @f10(float %f) {
+; CHECK-LABEL: f10:
+; CHECK: fiebra %f0, 6, %f0, 4
+; CHECK: br %r14
+  %res = call float @llvm.ceil.f32(float %f)
+  ret float %res
+}
+
+; Test ceil for f64.
+declare double @llvm.ceil.f64(double %f)
+define double @f11(double %f) {
+; CHECK-LABEL: f11:
+; CHECK: fidbra %f0, 6, %f0, 4
+; CHECK: br %r14
+  %res = call double @llvm.ceil.f64(double %f)
+  ret double %res
+}
+
+; Test ceil for f128.
+declare fp128 @llvm.ceil.f128(fp128 %f)
+define void @f12(fp128 *%ptr) {
+; CHECK-LABEL: f12:
+; CHECK: fixbra %f0, 6, %f0, 4
+; CHECK: br %r14
+  %src = load fp128 *%ptr
+  %res = call fp128 @llvm.ceil.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
+
+; Test trunc for f32.
+declare float @llvm.trunc.f32(float %f)
+define float @f13(float %f) {
+; CHECK-LABEL: f13:
+; CHECK: fiebra %f0, 5, %f0, 4
+; CHECK: br %r14
+  %res = call float @llvm.trunc.f32(float %f)
+  ret float %res
+}
+
+; Test trunc for f64.
+declare double @llvm.trunc.f64(double %f)
+define double @f14(double %f) {
+; CHECK-LABEL: f14:
+; CHECK: fidbra %f0, 5, %f0, 4
+; CHECK: br %r14
+  %res = call double @llvm.trunc.f64(double %f)
+  ret double %res
+}
+
+; Test trunc for f128.
+declare fp128 @llvm.trunc.f128(fp128 %f)
+define void @f15(fp128 *%ptr) {
+; CHECK-LABEL: f15:
+; CHECK: fixbra %f0, 5, %f0, 4
+; CHECK: br %r14
+  %src = load fp128 *%ptr
+  %res = call fp128 @llvm.trunc.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
+
+; Test round for f32.
+declare float @llvm.round.f32(float %f)
+define float @f16(float %f) {
+; CHECK-LABEL: f16:
+; CHECK: fiebra %f0, 1, %f0, 4
+; CHECK: br %r14
+  %res = call float @llvm.round.f32(float %f)
+  ret float %res
+}
+
+; Test round for f64.
+declare double @llvm.round.f64(double %f)
+define double @f17(double %f) {
+; CHECK-LABEL: f17:
+; CHECK: fidbra %f0, 1, %f0, 4
+; CHECK: br %r14
+  %res = call double @llvm.round.f64(double %f)
+  ret double %res
+}
+
+; Test round for f128.
+declare fp128 @llvm.round.f128(fp128 %f)
+define void @f18(fp128 *%ptr) {
+; CHECK-LABEL: f18:
+; CHECK: fixbra %f0, 1, %f0, 4
+; CHECK: br %r14
+  %src = load fp128 *%ptr
+  %res = call fp128 @llvm.round.f128(fp128 %src)
+  store fp128 %res, fp128 *%ptr
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/fp-sqrt-01.ll b/test/CodeGen/SystemZ/fp-sqrt-01.ll
index b6568d6..7465af4 100644
--- a/test/CodeGen/SystemZ/fp-sqrt-01.ll
+++ b/test/CodeGen/SystemZ/fp-sqrt-01.ll
@@ -2,7 +2,8 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-declare float @llvm.sqrt.f32(float %f)
+declare float @llvm.sqrt.f32(float)
+declare float @sqrtf(float)
 
 ; Check register square root.
 define float @f1(float %val) {
@@ -152,3 +153,17 @@ define void @f7(float *%ptr) {
 
   ret void
 }
+
+; Check that a call to the normal sqrtf function is lowered.
+define float @f8(float %dummy, float %val) {
+; CHECK-LABEL: f8:
+; CHECK: sqebr %f0, %f2
+; CHECK: cebr %f0, %f0
+; CHECK: jo [[LABEL:\.L.*]]
+; CHECK: br %r14
+; CHECK: [[LABEL]]:
+; CHECK: ler %f0, %f2
+; CHECK: jg sqrtf@PLT
+  %res = tail call float @sqrtf(float %val)
+  ret float %res
+}
diff --git a/test/CodeGen/SystemZ/fp-sqrt-02.ll b/test/CodeGen/SystemZ/fp-sqrt-02.ll
index b07a2c6..66ffd19 100644
--- a/test/CodeGen/SystemZ/fp-sqrt-02.ll
+++ b/test/CodeGen/SystemZ/fp-sqrt-02.ll
@@ -3,6 +3,7 @@
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
 declare double @llvm.sqrt.f64(double %f)
+declare double @sqrt(double)
 
 ; Check register square root.
 define double @f1(double %val) {
@@ -152,3 +153,17 @@ define void @f7(double *%ptr) {
 
   ret void
 }
+
+; Check that a call to the normal sqrt function is lowered.
+define double @f8(double %dummy, double %val) {
+; CHECK-LABEL: f8:
+; CHECK: sqdbr %f0, %f2
+; CHECK: cdbr %f0, %f0
+; CHECK: jo [[LABEL:\.L.*]]
+; CHECK: br %r14
+; CHECK: [[LABEL]]:
+; CHECK: ldr %f0, %f2
+; CHECK: jg sqrt@PLT
+  %res = tail call double @sqrt(double %val)
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/frame-13.ll b/test/CodeGen/SystemZ/frame-13.ll
index 1d38354..393850f 100644
--- a/test/CodeGen/SystemZ/frame-13.ll
+++ b/test/CodeGen/SystemZ/frame-13.ll
@@ -1,8 +1,11 @@
 ; Test the handling of base + 12-bit displacement addresses for large frames,
-; in cases where no 20-bit form exists.
+; in cases where no 20-bit form exists.  The tests here assume z10 register
+; pressure, without the high words being available.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
+; RUN:   FileCheck -check-prefix=CHECK-NOFP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | \
+; RUN:   FileCheck -check-prefix=CHECK-FP %s
 
 ; This file tests what happens when a displacement is converted from
 ; being relative to the start of a frame object to being relative to
@@ -182,17 +185,16 @@ define void @f8() {
 }
 
 ; Check a case where the original displacement is out of range.  The backend
-; should force an LAY from the outset.  We don't yet do any kind of anchor
-; optimization, so there should be no offset on the MVHI itself.
+; should force STY to be used instead.
 define void @f9() {
 ; CHECK-NOFP-LABEL: f9:
-; CHECK-NOFP: lay %r1, 12296(%r15)
-; CHECK-NOFP: mvhi 0(%r1), 42
+; CHECK-NOFP: lhi [[TMP:%r[0-5]]], 42
+; CHECK-NOFP: sty [[TMP]], 12296(%r15)
 ; CHECK-NOFP: br %r14
 ;
 ; CHECK-FP-LABEL: f9:
-; CHECK-FP: lay %r1, 12296(%r11)
-; CHECK-FP: mvhi 0(%r1), 42
+; CHECK-FP: lhi [[TMP:%r[0-5]]], 42
+; CHECK-FP: sty [[TMP]], 12296(%r11)
 ; CHECK-FP: br %r14
   %region1 = alloca [2006 x i32], align 8
   %region2 = alloca [2006 x i32], align 8
diff --git a/test/CodeGen/SystemZ/frame-14.ll b/test/CodeGen/SystemZ/frame-14.ll
index 22a45ee..3b48179 100644
--- a/test/CodeGen/SystemZ/frame-14.ll
+++ b/test/CodeGen/SystemZ/frame-14.ll
@@ -1,9 +1,13 @@
 ; Test the handling of base + displacement addresses for large frames,
 ; in cases where both 12-bit and 20-bit displacements are allowed.
+; The tests here assume z10 register pressure, without the high words
+; being available.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
+; RUN:   FileCheck -check-prefix=CHECK-NOFP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | \
+; RUN:   FileCheck -check-prefix=CHECK-FP %s
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
-
 ; This file tests what happens when a displacement is converted from
 ; being relative to the start of a frame object to being relative to
 ; the frame itself.  In some cases the test is only possible if two
diff --git a/test/CodeGen/SystemZ/frame-15.ll b/test/CodeGen/SystemZ/frame-15.ll
index d8b291d..b3c95e7 100644
--- a/test/CodeGen/SystemZ/frame-15.ll
+++ b/test/CodeGen/SystemZ/frame-15.ll
@@ -1,8 +1,11 @@
 ; Test the handling of base + index + 12-bit displacement addresses for
-; large frames, in cases where no 20-bit form exists.
+; large frames, in cases where no 20-bit form exists.  The tests here
+; assume z10 register pressure, without the high words being available.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
+; RUN:   FileCheck -check-prefix=CHECK-NOFP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | \
+; RUN:   FileCheck -check-prefix=CHECK-FP %s
 
 declare void @foo(float *%ptr1, float *%ptr2)
 
diff --git a/test/CodeGen/SystemZ/frame-16.ll b/test/CodeGen/SystemZ/frame-16.ll
index 9f43b49..f7e2dfa 100644
--- a/test/CodeGen/SystemZ/frame-16.ll
+++ b/test/CodeGen/SystemZ/frame-16.ll
@@ -1,8 +1,12 @@
 ; Test the handling of base + index + displacement addresses for large frames,
 ; in cases where both 12-bit and 20-bit displacements are allowed.
+; The tests here assume z10 register pressure, without the high words
+; being available.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
+; RUN:   FileCheck -check-prefix=CHECK-NOFP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | \
+; RUN:   FileCheck -check-prefix=CHECK-FP %s
 
 ; This file tests what happens when a displacement is converted from
 ; being relative to the start of a frame object to being relative to
diff --git a/test/CodeGen/SystemZ/frame-18.ll b/test/CodeGen/SystemZ/frame-18.ll
index 57d6f7d..21dfc12 100644
--- a/test/CodeGen/SystemZ/frame-18.ll
+++ b/test/CodeGen/SystemZ/frame-18.ll
@@ -1,6 +1,7 @@
-; Test spilling of GPRs.
+; Test spilling of GPRs.  The tests here assume z10 register pressure,
+; without the high words being available.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 ; We need to allocate a 4-byte spill slot, rounded to 8 bytes.  The frame
 ; size should be exactly 160 + 8 = 168.
diff --git a/test/CodeGen/SystemZ/insert-06.ll b/test/CodeGen/SystemZ/insert-06.ll
index 8366b2c..edcd0c5 100644
--- a/test/CodeGen/SystemZ/insert-06.ll
+++ b/test/CodeGen/SystemZ/insert-06.ll
@@ -165,3 +165,16 @@ define i64 @f13(i64 %a, i32 %b) {
   %or = or i64 %shift, %low
   ret i64 %or
 }
+
+; We previously wrongly removed the upper AND as dead.
+define i64 @f14(i64 %a, i64 %b) {
+; CHECK-LABEL: f14:
+; CHECK: risbg {{%r[0-5]}}, %r2, 6, 134, 0
+; CHECK: br %r14
+  %and1 = and i64 %a, 144115188075855872
+  %and2 = and i64 %b, 15
+  %or = or i64 %and1, %and2
+  %res = icmp eq i64 %or, 0
+  %ext = sext i1 %res to i64
+  ret i64 %ext
+}
diff --git a/test/CodeGen/SystemZ/int-abs-01.ll b/test/CodeGen/SystemZ/int-abs-01.ll
new file mode 100644
index 0000000..40fb611
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-abs-01.ll
@@ -0,0 +1,83 @@
+; Test integer absolute.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test i32->i32 absolute using slt.
+define i32 @f1(i32 %val) {
+; CHECK-LABEL: f1:
+; CHECK: lpr %r2, %r2
+; CHECK: br %r14
+  %cmp = icmp slt i32 %val, 0
+  %neg = sub i32 0, %val
+  %res = select i1 %cmp, i32 %neg, i32 %val
+  ret i32 %res
+}
+
+; Test i32->i32 absolute using sle.
+define i32 @f2(i32 %val) {
+; CHECK-LABEL: f2:
+; CHECK: lpr %r2, %r2
+; CHECK: br %r14
+  %cmp = icmp sle i32 %val, 0
+  %neg = sub i32 0, %val
+  %res = select i1 %cmp, i32 %neg, i32 %val
+  ret i32 %res
+}
+
+; Test i32->i32 absolute using sgt.
+define i32 @f3(i32 %val) {
+; CHECK-LABEL: f3:
+; CHECK: lpr %r2, %r2
+; CHECK: br %r14
+  %cmp = icmp sgt i32 %val, 0
+  %neg = sub i32 0, %val
+  %res = select i1 %cmp, i32 %val, i32 %neg
+  ret i32 %res
+}
+
+; Test i32->i32 absolute using sge.
+define i32 @f4(i32 %val) {
+; CHECK-LABEL: f4:
+; CHECK: lpr %r2, %r2
+; CHECK: br %r14
+  %cmp = icmp sge i32 %val, 0
+  %neg = sub i32 0, %val
+  %res = select i1 %cmp, i32 %val, i32 %neg
+  ret i32 %res
+}
+
+; Test i32->i64 absolute.
+define i64 @f5(i32 %val) {
+; CHECK-LABEL: f5:
+; CHECK: lpgfr %r2, %r2
+; CHECK: br %r14
+  %ext = sext i32 %val to i64
+  %cmp = icmp slt i64 %ext, 0
+  %neg = sub i64 0, %ext
+  %res = select i1 %cmp, i64 %neg, i64 %ext
+  ret i64 %res
+}
+
+; Test i32->i64 absolute that uses an "in-register" form of sign extension.
+define i64 @f6(i64 %val) {
+; CHECK-LABEL: f6:
+; CHECK: lpgfr %r2, %r2
+; CHECK: br %r14
+  %trunc = trunc i64 %val to i32
+  %ext = sext i32 %trunc to i64
+  %cmp = icmp slt i64 %ext, 0
+  %neg = sub i64 0, %ext
+  %res = select i1 %cmp, i64 %neg, i64 %ext
+  ret i64 %res
+}
+
+; Test i64 absolute.
+define i64 @f7(i64 %val) {
+; CHECK-LABEL: f7:
+; CHECK: lpgr %r2, %r2
+; CHECK: br %r14
+  %cmp = icmp slt i64 %val, 0
+  %neg = sub i64 0, %val
+  %res = select i1 %cmp, i64 %neg, i64 %val
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-add-09.ll b/test/CodeGen/SystemZ/int-add-09.ll
index 717fed0..fd151a7 100644
--- a/test/CodeGen/SystemZ/int-add-09.ll
+++ b/test/CodeGen/SystemZ/int-add-09.ll
@@ -7,7 +7,7 @@
 define void @f1(i128 *%aptr) {
 ; CHECK-LABEL: f1:
 ; CHECK: algfi {{%r[0-5]}}, 1
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 128
@@ -20,7 +20,7 @@ define void @f1(i128 *%aptr) {
 define void @f2(i128 *%aptr) {
 ; CHECK-LABEL: f2:
 ; CHECK: algfi {{%r[0-5]}}, 4294967295
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 128
@@ -33,7 +33,7 @@ define void @f2(i128 *%aptr) {
 define void @f3(i128 *%aptr) {
 ; CHECK-LABEL: f3:
 ; CHECK: algr
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 128
@@ -46,7 +46,7 @@ define void @f3(i128 *%aptr) {
 define void @f4(i128 *%aptr) {
 ; CHECK-LABEL: f4:
 ; CHECK: algr
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 128
diff --git a/test/CodeGen/SystemZ/int-add-10.ll b/test/CodeGen/SystemZ/int-add-10.ll
index 66a275b..01d0a66 100644
--- a/test/CodeGen/SystemZ/int-add-10.ll
+++ b/test/CodeGen/SystemZ/int-add-10.ll
@@ -7,7 +7,7 @@
 define void @f1(i128 *%aptr, i32 %b) {
 ; CHECK-LABEL: f1:
 ; CHECK: algfr {{%r[0-5]}}, %r3
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 127
@@ -21,7 +21,7 @@ define void @f1(i128 *%aptr, i32 %b) {
 define void @f2(i128 *%aptr, i64 %b) {
 ; CHECK-LABEL: f2:
 ; CHECK: algfr {{%r[0-5]}}, %r3
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 127
@@ -37,7 +37,7 @@ define void @f2(i128 *%aptr, i64 %b) {
 define void @f3(i128 *%aptr, i64 %b) {
 ; CHECK-LABEL: f3:
 ; CHECK: algfr {{%r[0-5]}}, %r3
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 127
@@ -52,7 +52,7 @@ define void @f3(i128 *%aptr, i64 %b) {
 define void @f4(i128 *%aptr, i32 *%bsrc) {
 ; CHECK-LABEL: f4:
 ; CHECK: algf {{%r[0-5]}}, 0(%r3)
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 127
@@ -67,7 +67,7 @@ define void @f4(i128 *%aptr, i32 *%bsrc) {
 define void @f5(i128 *%aptr, i32 *%bsrc) {
 ; CHECK-LABEL: f5:
 ; CHECK: algf {{%r[0-5]}}, 524284(%r3)
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 127
@@ -85,7 +85,7 @@ define void @f6(i128 *%aptr, i32 *%bsrc) {
 ; CHECK-LABEL: f6:
 ; CHECK: agfi %r3, 524288
 ; CHECK: algf {{%r[0-5]}}, 0(%r3)
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 127
@@ -101,7 +101,7 @@ define void @f6(i128 *%aptr, i32 *%bsrc) {
 define void @f7(i128 *%aptr, i32 *%bsrc) {
 ; CHECK-LABEL: f7:
 ; CHECK: algf {{%r[0-5]}}, -4(%r3)
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 127
@@ -117,7 +117,7 @@ define void @f7(i128 *%aptr, i32 *%bsrc) {
 define void @f8(i128 *%aptr, i32 *%bsrc) {
 ; CHECK-LABEL: f8:
 ; CHECK: algf {{%r[0-5]}}, -524288(%r3)
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 127
@@ -135,7 +135,7 @@ define void @f9(i128 *%aptr, i32 *%bsrc) {
 ; CHECK-LABEL: f9:
 ; CHECK: agfi %r3, -524292
 ; CHECK: algf {{%r[0-5]}}, 0(%r3)
-; CHECK: alcgr
+; CHECK: alcg
 ; CHECK: br %r14
   %a = load i128 *%aptr
   %xor = xor i128 %a, 127
diff --git a/test/CodeGen/SystemZ/int-add-11.ll b/test/CodeGen/SystemZ/int-add-11.ll
index 6c617ba..679c206 100644
--- a/test/CodeGen/SystemZ/int-add-11.ll
+++ b/test/CodeGen/SystemZ/int-add-11.ll
@@ -1,6 +1,7 @@
-; Test 32-bit additions of constants to memory.
+; Test 32-bit additions of constants to memory.  The tests here
+; assume z10 register pressure, without the high words being available.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 ; Check additions of 1.
 define void @f1(i32 *%ptr) {
@@ -126,3 +127,169 @@ define void @f10(i64 %base, i64 %index) {
   store i32 %add, i32 *%ptr
   ret void
 }
+
+; Check that adding 127 to a spilled value can use ASI.
+define void @f11(i32 *%ptr, i32 %sel) {
+; CHECK-LABEL: f11:
+; CHECK: asi {{[0-9]+}}(%r15), 127
+; CHECK: br %r14
+entry:
+  %val0 = load volatile i32 *%ptr
+  %val1 = load volatile i32 *%ptr
+  %val2 = load volatile i32 *%ptr
+  %val3 = load volatile i32 *%ptr
+  %val4 = load volatile i32 *%ptr
+  %val5 = load volatile i32 *%ptr
+  %val6 = load volatile i32 *%ptr
+  %val7 = load volatile i32 *%ptr
+  %val8 = load volatile i32 *%ptr
+  %val9 = load volatile i32 *%ptr
+  %val10 = load volatile i32 *%ptr
+  %val11 = load volatile i32 *%ptr
+  %val12 = load volatile i32 *%ptr
+  %val13 = load volatile i32 *%ptr
+  %val14 = load volatile i32 *%ptr
+  %val15 = load volatile i32 *%ptr
+
+  %test = icmp ne i32 %sel, 0
+  br i1 %test, label %add, label %store
+
+add:
+  %add0 = add i32 %val0, 127
+  %add1 = add i32 %val1, 127
+  %add2 = add i32 %val2, 127
+  %add3 = add i32 %val3, 127
+  %add4 = add i32 %val4, 127
+  %add5 = add i32 %val5, 127
+  %add6 = add i32 %val6, 127
+  %add7 = add i32 %val7, 127
+  %add8 = add i32 %val8, 127
+  %add9 = add i32 %val9, 127
+  %add10 = add i32 %val10, 127
+  %add11 = add i32 %val11, 127
+  %add12 = add i32 %val12, 127
+  %add13 = add i32 %val13, 127
+  %add14 = add i32 %val14, 127
+  %add15 = add i32 %val15, 127
+  br label %store
+
+store:
+  %new0 = phi i32 [ %val0, %entry ], [ %add0, %add ]
+  %new1 = phi i32 [ %val1, %entry ], [ %add1, %add ]
+  %new2 = phi i32 [ %val2, %entry ], [ %add2, %add ]
+  %new3 = phi i32 [ %val3, %entry ], [ %add3, %add ]
+  %new4 = phi i32 [ %val4, %entry ], [ %add4, %add ]
+  %new5 = phi i32 [ %val5, %entry ], [ %add5, %add ]
+  %new6 = phi i32 [ %val6, %entry ], [ %add6, %add ]
+  %new7 = phi i32 [ %val7, %entry ], [ %add7, %add ]
+  %new8 = phi i32 [ %val8, %entry ], [ %add8, %add ]
+  %new9 = phi i32 [ %val9, %entry ], [ %add9, %add ]
+  %new10 = phi i32 [ %val10, %entry ], [ %add10, %add ]
+  %new11 = phi i32 [ %val11, %entry ], [ %add11, %add ]
+  %new12 = phi i32 [ %val12, %entry ], [ %add12, %add ]
+  %new13 = phi i32 [ %val13, %entry ], [ %add13, %add ]
+  %new14 = phi i32 [ %val14, %entry ], [ %add14, %add ]
+  %new15 = phi i32 [ %val15, %entry ], [ %add15, %add ]
+
+  store volatile i32 %new0, i32 *%ptr
+  store volatile i32 %new1, i32 *%ptr
+  store volatile i32 %new2, i32 *%ptr
+  store volatile i32 %new3, i32 *%ptr
+  store volatile i32 %new4, i32 *%ptr
+  store volatile i32 %new5, i32 *%ptr
+  store volatile i32 %new6, i32 *%ptr
+  store volatile i32 %new7, i32 *%ptr
+  store volatile i32 %new8, i32 *%ptr
+  store volatile i32 %new9, i32 *%ptr
+  store volatile i32 %new10, i32 *%ptr
+  store volatile i32 %new11, i32 *%ptr
+  store volatile i32 %new12, i32 *%ptr
+  store volatile i32 %new13, i32 *%ptr
+  store volatile i32 %new14, i32 *%ptr
+  store volatile i32 %new15, i32 *%ptr
+
+  ret void
+}
+
+; Check that adding -128 to a spilled value can use ASI.
+define void @f12(i32 *%ptr, i32 %sel) {
+; CHECK-LABEL: f12:
+; CHECK: asi {{[0-9]+}}(%r15), -128
+; CHECK: br %r14
+entry:
+  %val0 = load volatile i32 *%ptr
+  %val1 = load volatile i32 *%ptr
+  %val2 = load volatile i32 *%ptr
+  %val3 = load volatile i32 *%ptr
+  %val4 = load volatile i32 *%ptr
+  %val5 = load volatile i32 *%ptr
+  %val6 = load volatile i32 *%ptr
+  %val7 = load volatile i32 *%ptr
+  %val8 = load volatile i32 *%ptr
+  %val9 = load volatile i32 *%ptr
+  %val10 = load volatile i32 *%ptr
+  %val11 = load volatile i32 *%ptr
+  %val12 = load volatile i32 *%ptr
+  %val13 = load volatile i32 *%ptr
+  %val14 = load volatile i32 *%ptr
+  %val15 = load volatile i32 *%ptr
+
+  %test = icmp ne i32 %sel, 0
+  br i1 %test, label %add, label %store
+
+add:
+  %add0 = add i32 %val0, -128
+  %add1 = add i32 %val1, -128
+  %add2 = add i32 %val2, -128
+  %add3 = add i32 %val3, -128
+  %add4 = add i32 %val4, -128
+  %add5 = add i32 %val5, -128
+  %add6 = add i32 %val6, -128
+  %add7 = add i32 %val7, -128
+  %add8 = add i32 %val8, -128
+  %add9 = add i32 %val9, -128
+  %add10 = add i32 %val10, -128
+  %add11 = add i32 %val11, -128
+  %add12 = add i32 %val12, -128
+  %add13 = add i32 %val13, -128
+  %add14 = add i32 %val14, -128
+  %add15 = add i32 %val15, -128
+  br label %store
+
+store:
+  %new0 = phi i32 [ %val0, %entry ], [ %add0, %add ]
+  %new1 = phi i32 [ %val1, %entry ], [ %add1, %add ]
+  %new2 = phi i32 [ %val2, %entry ], [ %add2, %add ]
+  %new3 = phi i32 [ %val3, %entry ], [ %add3, %add ]
+  %new4 = phi i32 [ %val4, %entry ], [ %add4, %add ]
+  %new5 = phi i32 [ %val5, %entry ], [ %add5, %add ]
+  %new6 = phi i32 [ %val6, %entry ], [ %add6, %add ]
+  %new7 = phi i32 [ %val7, %entry ], [ %add7, %add ]
+  %new8 = phi i32 [ %val8, %entry ], [ %add8, %add ]
+  %new9 = phi i32 [ %val9, %entry ], [ %add9, %add ]
+  %new10 = phi i32 [ %val10, %entry ], [ %add10, %add ]
+  %new11 = phi i32 [ %val11, %entry ], [ %add11, %add ]
+  %new12 = phi i32 [ %val12, %entry ], [ %add12, %add ]
+  %new13 = phi i32 [ %val13, %entry ], [ %add13, %add ]
+  %new14 = phi i32 [ %val14, %entry ], [ %add14, %add ]
+  %new15 = phi i32 [ %val15, %entry ], [ %add15, %add ]
+
+  store volatile i32 %new0, i32 *%ptr
+  store volatile i32 %new1, i32 *%ptr
+  store volatile i32 %new2, i32 *%ptr
+  store volatile i32 %new3, i32 *%ptr
+  store volatile i32 %new4, i32 *%ptr
+  store volatile i32 %new5, i32 *%ptr
+  store volatile i32 %new6, i32 *%ptr
+  store volatile i32 %new7, i32 *%ptr
+  store volatile i32 %new8, i32 *%ptr
+  store volatile i32 %new9, i32 *%ptr
+  store volatile i32 %new10, i32 *%ptr
+  store volatile i32 %new11, i32 *%ptr
+  store volatile i32 %new12, i32 *%ptr
+  store volatile i32 %new13, i32 *%ptr
+  store volatile i32 %new14, i32 *%ptr
+  store volatile i32 %new15, i32 *%ptr
+
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/int-add-12.ll b/test/CodeGen/SystemZ/int-add-12.ll
index ef4dc39..741cce1 100644
--- a/test/CodeGen/SystemZ/int-add-12.ll
+++ b/test/CodeGen/SystemZ/int-add-12.ll
@@ -126,3 +126,169 @@ define void @f10(i64 %base, i64 %index) {
   store i64 %add, i64 *%ptr
   ret void
 }
+
+; Check that adding 127 to a spilled value can use AGSI.
+define void @f11(i64 *%ptr, i32 %sel) {
+; CHECK-LABEL: f11:
+; CHECK: agsi {{[0-9]+}}(%r15), 127
+; CHECK: br %r14
+entry:
+  %val0 = load volatile i64 *%ptr
+  %val1 = load volatile i64 *%ptr
+  %val2 = load volatile i64 *%ptr
+  %val3 = load volatile i64 *%ptr
+  %val4 = load volatile i64 *%ptr
+  %val5 = load volatile i64 *%ptr
+  %val6 = load volatile i64 *%ptr
+  %val7 = load volatile i64 *%ptr
+  %val8 = load volatile i64 *%ptr
+  %val9 = load volatile i64 *%ptr
+  %val10 = load volatile i64 *%ptr
+  %val11 = load volatile i64 *%ptr
+  %val12 = load volatile i64 *%ptr
+  %val13 = load volatile i64 *%ptr
+  %val14 = load volatile i64 *%ptr
+  %val15 = load volatile i64 *%ptr
+
+  %test = icmp ne i32 %sel, 0
+  br i1 %test, label %add, label %store
+
+add:
+  %add0 = add i64 %val0, 127
+  %add1 = add i64 %val1, 127
+  %add2 = add i64 %val2, 127
+  %add3 = add i64 %val3, 127
+  %add4 = add i64 %val4, 127
+  %add5 = add i64 %val5, 127
+  %add6 = add i64 %val6, 127
+  %add7 = add i64 %val7, 127
+  %add8 = add i64 %val8, 127
+  %add9 = add i64 %val9, 127
+  %add10 = add i64 %val10, 127
+  %add11 = add i64 %val11, 127
+  %add12 = add i64 %val12, 127
+  %add13 = add i64 %val13, 127
+  %add14 = add i64 %val14, 127
+  %add15 = add i64 %val15, 127
+  br label %store
+
+store:
+  %new0 = phi i64 [ %val0, %entry ], [ %add0, %add ]
+  %new1 = phi i64 [ %val1, %entry ], [ %add1, %add ]
+  %new2 = phi i64 [ %val2, %entry ], [ %add2, %add ]
+  %new3 = phi i64 [ %val3, %entry ], [ %add3, %add ]
+  %new4 = phi i64 [ %val4, %entry ], [ %add4, %add ]
+  %new5 = phi i64 [ %val5, %entry ], [ %add5, %add ]
+  %new6 = phi i64 [ %val6, %entry ], [ %add6, %add ]
+  %new7 = phi i64 [ %val7, %entry ], [ %add7, %add ]
+  %new8 = phi i64 [ %val8, %entry ], [ %add8, %add ]
+  %new9 = phi i64 [ %val9, %entry ], [ %add9, %add ]
+  %new10 = phi i64 [ %val10, %entry ], [ %add10, %add ]
+  %new11 = phi i64 [ %val11, %entry ], [ %add11, %add ]
+  %new12 = phi i64 [ %val12, %entry ], [ %add12, %add ]
+  %new13 = phi i64 [ %val13, %entry ], [ %add13, %add ]
+  %new14 = phi i64 [ %val14, %entry ], [ %add14, %add ]
+  %new15 = phi i64 [ %val15, %entry ], [ %add15, %add ]
+
+  store volatile i64 %new0, i64 *%ptr
+  store volatile i64 %new1, i64 *%ptr
+  store volatile i64 %new2, i64 *%ptr
+  store volatile i64 %new3, i64 *%ptr
+  store volatile i64 %new4, i64 *%ptr
+  store volatile i64 %new5, i64 *%ptr
+  store volatile i64 %new6, i64 *%ptr
+  store volatile i64 %new7, i64 *%ptr
+  store volatile i64 %new8, i64 *%ptr
+  store volatile i64 %new9, i64 *%ptr
+  store volatile i64 %new10, i64 *%ptr
+  store volatile i64 %new11, i64 *%ptr
+  store volatile i64 %new12, i64 *%ptr
+  store volatile i64 %new13, i64 *%ptr
+  store volatile i64 %new14, i64 *%ptr
+  store volatile i64 %new15, i64 *%ptr
+
+  ret void
+}
+
+; Check that adding -128 to a spilled value can use AGSI.
+define void @f12(i64 *%ptr, i32 %sel) {
+; CHECK-LABEL: f12:
+; CHECK: agsi {{[0-9]+}}(%r15), -128
+; CHECK: br %r14
+entry:
+  %val0 = load volatile i64 *%ptr
+  %val1 = load volatile i64 *%ptr
+  %val2 = load volatile i64 *%ptr
+  %val3 = load volatile i64 *%ptr
+  %val4 = load volatile i64 *%ptr
+  %val5 = load volatile i64 *%ptr
+  %val6 = load volatile i64 *%ptr
+  %val7 = load volatile i64 *%ptr
+  %val8 = load volatile i64 *%ptr
+  %val9 = load volatile i64 *%ptr
+  %val10 = load volatile i64 *%ptr
+  %val11 = load volatile i64 *%ptr
+  %val12 = load volatile i64 *%ptr
+  %val13 = load volatile i64 *%ptr
+  %val14 = load volatile i64 *%ptr
+  %val15 = load volatile i64 *%ptr
+
+  %test = icmp ne i32 %sel, 0
+  br i1 %test, label %add, label %store
+
+add:
+  %add0 = add i64 %val0, -128
+  %add1 = add i64 %val1, -128
+  %add2 = add i64 %val2, -128
+  %add3 = add i64 %val3, -128
+  %add4 = add i64 %val4, -128
+  %add5 = add i64 %val5, -128
+  %add6 = add i64 %val6, -128
+  %add7 = add i64 %val7, -128
+  %add8 = add i64 %val8, -128
+  %add9 = add i64 %val9, -128
+  %add10 = add i64 %val10, -128
+  %add11 = add i64 %val11, -128
+  %add12 = add i64 %val12, -128
+  %add13 = add i64 %val13, -128
+  %add14 = add i64 %val14, -128
+  %add15 = add i64 %val15, -128
+  br label %store
+
+store:
+  %new0 = phi i64 [ %val0, %entry ], [ %add0, %add ]
+  %new1 = phi i64 [ %val1, %entry ], [ %add1, %add ]
+  %new2 = phi i64 [ %val2, %entry ], [ %add2, %add ]
+  %new3 = phi i64 [ %val3, %entry ], [ %add3, %add ]
+  %new4 = phi i64 [ %val4, %entry ], [ %add4, %add ]
+  %new5 = phi i64 [ %val5, %entry ], [ %add5, %add ]
+  %new6 = phi i64 [ %val6, %entry ], [ %add6, %add ]
+  %new7 = phi i64 [ %val7, %entry ], [ %add7, %add ]
+  %new8 = phi i64 [ %val8, %entry ], [ %add8, %add ]
+  %new9 = phi i64 [ %val9, %entry ], [ %add9, %add ]
+  %new10 = phi i64 [ %val10, %entry ], [ %add10, %add ]
+  %new11 = phi i64 [ %val11, %entry ], [ %add11, %add ]
+  %new12 = phi i64 [ %val12, %entry ], [ %add12, %add ]
+  %new13 = phi i64 [ %val13, %entry ], [ %add13, %add ]
+  %new14 = phi i64 [ %val14, %entry ], [ %add14, %add ]
+  %new15 = phi i64 [ %val15, %entry ], [ %add15, %add ]
+
+  store volatile i64 %new0, i64 *%ptr
+  store volatile i64 %new1, i64 *%ptr
+  store volatile i64 %new2, i64 *%ptr
+  store volatile i64 %new3, i64 *%ptr
+  store volatile i64 %new4, i64 *%ptr
+  store volatile i64 %new5, i64 *%ptr
+  store volatile i64 %new6, i64 *%ptr
+  store volatile i64 %new7, i64 *%ptr
+  store volatile i64 %new8, i64 *%ptr
+  store volatile i64 %new9, i64 *%ptr
+  store volatile i64 %new10, i64 *%ptr
+  store volatile i64 %new11, i64 *%ptr
+  store volatile i64 %new12, i64 *%ptr
+  store volatile i64 %new13, i64 *%ptr
+  store volatile i64 %new14, i64 *%ptr
+  store volatile i64 %new15, i64 *%ptr
+
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-01.ll b/test/CodeGen/SystemZ/int-cmp-01.ll
index dbfe0df..6653b6f 100644
--- a/test/CodeGen/SystemZ/int-cmp-01.ll
+++ b/test/CodeGen/SystemZ/int-cmp-01.ll
@@ -149,3 +149,17 @@ define void @f10(i32 %lhs, i64 %base, i64 %index, i32 *%dst) {
   store i32 %res, i32 *%dst
   ret void
 }
+
+; Check the comparison can be reversed if that allows CH to be used.
+define double @f11(double %a, double %b, i32 %rhs, i16 *%src) {
+; CHECK-LABEL: f11:
+; CHECK: ch %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %half = load i16 *%src
+  %lhs = sext i16 %half to i32
+  %cond = icmp slt i32 %lhs, %rhs
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-02.ll b/test/CodeGen/SystemZ/int-cmp-02.ll
index 26e1391..4a8a1a9 100644
--- a/test/CodeGen/SystemZ/int-cmp-02.ll
+++ b/test/CodeGen/SystemZ/int-cmp-02.ll
@@ -181,3 +181,16 @@ while.body:
 while.end:
   ret void
 }
+
+; Check the comparison can be reversed if that allows C to be used.
+define double @f13(double %a, double %b, i32 %i2, i32 *%ptr) {
+; CHECK-LABEL: f13:
+; CHECK: c %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %i1 = load i32 *%ptr
+  %cond = icmp slt i32 %i1, %i2
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-03.ll b/test/CodeGen/SystemZ/int-cmp-03.ll
index 2d679cf..aa654e0 100644
--- a/test/CodeGen/SystemZ/int-cmp-03.ll
+++ b/test/CodeGen/SystemZ/int-cmp-03.ll
@@ -5,8 +5,7 @@
 ; Check register comparison.
 define double @f1(double %a, double %b, i32 %i1, i32 %i2) {
 ; CHECK-LABEL: f1:
-; CHECK: clr %r2, %r3
-; CHECK-NEXT: jl
+; CHECK: clrjl %r2, %r3
 ; CHECK: ldr %f0, %f2
 ; CHECK: br %r14
   %cond = icmp ult i32 %i1, %i2
@@ -160,3 +159,16 @@ define double @f11(double %a, double %b, i32 %i1, i64 %base, i64 %index) {
   %res = select i1 %cond, double %a, double %b
   ret double %res
 }
+
+; Check the comparison can be reversed if that allows CL to be used.
+define double @f12(double %a, double %b, i32 %i2, i32 *%ptr) {
+; CHECK-LABEL: f12:
+; CHECK: cl %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %i1 = load i32 *%ptr
+  %cond = icmp ult i32 %i1, %i2
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-04.ll b/test/CodeGen/SystemZ/int-cmp-04.ll
index 54c4b5b..a6606f3 100644
--- a/test/CodeGen/SystemZ/int-cmp-04.ll
+++ b/test/CodeGen/SystemZ/int-cmp-04.ll
@@ -105,3 +105,17 @@ define void @f7(i64 %lhs, i64 %base, i64 %index, i64 *%dst) {
   store i64 %res, i64 *%dst
   ret void
 }
+
+; Check the comparison can be reversed if that allows CGH to be used.
+define double @f8(double %a, double %b, i64 %rhs, i16 *%src) {
+; CHECK-LABEL: f8:
+; CHECK: cgh %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %half = load i16 *%src
+  %lhs = sext i16 %half to i64
+  %cond = icmp slt i64 %lhs, %rhs
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-05.ll b/test/CodeGen/SystemZ/int-cmp-05.ll
index 36d12a5..f15b76b 100644
--- a/test/CodeGen/SystemZ/int-cmp-05.ll
+++ b/test/CodeGen/SystemZ/int-cmp-05.ll
@@ -54,7 +54,7 @@ define double @f4(double %a, double %b, i64 %i1, i32 %unext) {
   ret double %res
 }
 
-; Check signed comparisonn with memory.
+; Check signed comparison with memory.
 define double @f5(double %a, double %b, i64 %i1, i32 *%ptr) {
 ; CHECK-LABEL: f5:
 ; CHECK: cgf %r2, 0(%r3)
@@ -290,3 +290,17 @@ define i64 @f15(i32 *%ptr0) {
 
   ret i64 %sel9
 }
+
+; Check the comparison can be reversed if that allows CGF to be used.
+define double @f16(double %a, double %b, i64 %i2, i32 *%ptr) {
+; CHECK-LABEL: f16:
+; CHECK: cgf %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %unext = load i32 *%ptr
+  %i1 = sext i32 %unext to i64
+  %cond = icmp slt i64 %i1, %i2
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-06.ll b/test/CodeGen/SystemZ/int-cmp-06.ll
index cdd6114..8ab62e8 100644
--- a/test/CodeGen/SystemZ/int-cmp-06.ll
+++ b/test/CodeGen/SystemZ/int-cmp-06.ll
@@ -104,7 +104,7 @@ define double @f8(double %a, double %b, i64 %i1, i64 %unext) {
   ret double %res
 }
 
-; Check unsigned comparisonn with memory.
+; Check unsigned comparison with memory.
 define double @f9(double %a, double %b, i64 %i1, i32 *%ptr) {
 ; CHECK-LABEL: f9:
 ; CHECK: clgf %r2, 0(%r3)
@@ -340,3 +340,17 @@ define i64 @f19(i32 *%ptr0) {
 
   ret i64 %sel9
 }
+
+; Check the comparison can be reversed if that allows CLGF to be used.
+define double @f20(double %a, double %b, i64 %i2, i32 *%ptr) {
+; CHECK-LABEL: f20:
+; CHECK: clgf %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %unext = load i32 *%ptr
+  %i1 = zext i32 %unext to i64
+  %cond = icmp ult i64 %i1, %i2
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-07.ll b/test/CodeGen/SystemZ/int-cmp-07.ll
index 3308cb0..530d178 100644
--- a/test/CodeGen/SystemZ/int-cmp-07.ll
+++ b/test/CodeGen/SystemZ/int-cmp-07.ll
@@ -115,3 +115,16 @@ define double @f8(double %a, double %b, i64 %i1, i64 %base, i64 %index) {
   %res = select i1 %cond, double %a, double %b
   ret double %res
 }
+
+; Check the comparison can be reversed if that allows CG to be used.
+define double @f9(double %a, double %b, i64 %i2, i64 *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: cg %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %i1 = load i64 *%ptr
+  %cond = icmp slt i64 %i1, %i2
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-08.ll b/test/CodeGen/SystemZ/int-cmp-08.ll
index e68a0fe..ebf158a 100644
--- a/test/CodeGen/SystemZ/int-cmp-08.ll
+++ b/test/CodeGen/SystemZ/int-cmp-08.ll
@@ -5,8 +5,7 @@
 ; Check CLGR.
 define double @f1(double %a, double %b, i64 %i1, i64 %i2) {
 ; CHECK-LABEL: f1:
-; CHECK: clgr %r2, %r3
-; CHECK-NEXT: jl
+; CHECK: clgrjl %r2, %r3
 ; CHECK: ldr %f0, %f2
 ; CHECK: br %r14
   %cond = icmp ult i64 %i1, %i2
@@ -116,3 +115,16 @@ define double @f8(double %a, double %b, i64 %i1, i64 %base, i64 %index) {
   %res = select i1 %cond, double %a, double %b
   ret double %res
 }
+
+; Check the comparison can be reversed if that allows CLG to be used.
+define double @f9(double %a, double %b, i64 %i2, i64 *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: clg %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %i1 = load i64 *%ptr
+  %cond = icmp ult i64 %i1, %i2
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-10.ll b/test/CodeGen/SystemZ/int-cmp-10.ll
index e30e014..4d4c4bb 100644
--- a/test/CodeGen/SystemZ/int-cmp-10.ll
+++ b/test/CodeGen/SystemZ/int-cmp-10.ll
@@ -2,12 +2,11 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-; Check a value near the low end of the range.  We use CFI for comparisons
-; with zero, or things that are equivalent to them.
+; Check a value near the low end of the range.  We use signed forms for
+; comparisons with zero, or things that are equivalent to them.
 define double @f1(double %a, double %b, i32 %i1) {
 ; CHECK-LABEL: f1:
-; CHECK: clfi %r2, 1
-; CHECK-NEXT: jh
+; CHECK: clijh %r2, 1
 ; CHECK: ldr %f0, %f2
 ; CHECK: br %r14
   %cond = icmp ugt i32 %i1, 1
@@ -15,9 +14,32 @@ define double @f1(double %a, double %b, i32 %i1) {
   ret double %res
 }
 
-; Check a value near the high end of the range.
+; Check the top of the CLIJ range.
 define double @f2(double %a, double %b, i32 %i1) {
 ; CHECK-LABEL: f2:
+; CHECK: clijl %r2, 255
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %cond = icmp ult i32 %i1, 255
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
+
+; Check the next value up, which needs a separate comparison.
+define double @f3(double %a, double %b, i32 %i1) {
+; CHECK-LABEL: f3:
+; CHECK: clfi %r2, 256
+; CHECK: jl
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %cond = icmp ult i32 %i1, 256
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
+
+; Check a value near the high end of the range.
+define double @f4(double %a, double %b, i32 %i1) {
+; CHECK-LABEL: f4:
 ; CHECK: clfi %r2, 4294967280
 ; CHECK-NEXT: jl
 ; CHECK: ldr %f0, %f2
diff --git a/test/CodeGen/SystemZ/int-cmp-12.ll b/test/CodeGen/SystemZ/int-cmp-12.ll
index f57f6ec..077b224 100644
--- a/test/CodeGen/SystemZ/int-cmp-12.ll
+++ b/test/CodeGen/SystemZ/int-cmp-12.ll
@@ -2,12 +2,11 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-; Check a value near the low end of the range.  We use CGFI for comparisons
-; with zero, or things that are equivalent to them.
+; Check a value near the low end of the range.  We use signed forms for
+; comparisons with zero, or things that are equivalent to them.
 define double @f1(double %a, double %b, i64 %i1) {
 ; CHECK-LABEL: f1:
-; CHECK: clgfi %r2, 1
-; CHECK-NEXT: jh
+; CHECK: clgijh %r2, 1
 ; CHECK: ldr %f0, %f2
 ; CHECK: br %r14
   %cond = icmp ugt i64 %i1, 1
@@ -15,9 +14,32 @@ define double @f1(double %a, double %b, i64 %i1) {
   ret double %res
 }
 
-; Check the high end of the CLGFI range.
+; Check the top of the CLGIJ range.
 define double @f2(double %a, double %b, i64 %i1) {
 ; CHECK-LABEL: f2:
+; CHECK: clgijl %r2, 255
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %cond = icmp ult i64 %i1, 255
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
+
+; Check the next value up, which needs a separate comparison.
+define double @f3(double %a, double %b, i64 %i1) {
+; CHECK-LABEL: f3:
+; CHECK: clgfi %r2, 256
+; CHECK: jl
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+  %cond = icmp ult i64 %i1, 256
+  %res = select i1 %cond, double %a, double %b
+  ret double %res
+}
+
+; Check the high end of the CLGFI range.
+define double @f4(double %a, double %b, i64 %i1) {
+; CHECK-LABEL: f4:
 ; CHECK: clgfi %r2, 4294967295
 ; CHECK-NEXT: jl
 ; CHECK: ldr %f0, %f2
@@ -28,10 +50,9 @@ define double @f2(double %a, double %b, i64 %i1) {
 }
 
 ; Check the next value up, which must use a register comparison.
-define double @f3(double %a, double %b, i64 %i1) {
-; CHECK-LABEL: f3:
-; CHECK: clgr %r2,
-; CHECK-NEXT: jl
+define double @f5(double %a, double %b, i64 %i1) {
+; CHECK-LABEL: f5:
+; CHECK: clgrjl %r2,
 ; CHECK: ldr %f0, %f2
 ; CHECK: br %r14
   %cond = icmp ult i64 %i1, 4294967296
diff --git a/test/CodeGen/SystemZ/int-cmp-20.ll b/test/CodeGen/SystemZ/int-cmp-20.ll
index 7ecde77..98c41cd 100644
--- a/test/CodeGen/SystemZ/int-cmp-20.ll
+++ b/test/CodeGen/SystemZ/int-cmp-20.ll
@@ -63,7 +63,7 @@ define double @f4(double %a, double %b, i8 *%ptr) {
 ; extension.  The condition is always true.
 define double @f5(double %a, double %b, i8 *%ptr) {
 ; CHECK-LABEL: f5:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
 ; CHECK: br %r14
   %val = load i8 *%ptr
   %ext = zext i8 %val to i32
@@ -79,7 +79,7 @@ define double @f5(double %a, double %b, i8 *%ptr) {
 ; and simply ignore CLI for this range.  First check the low end of the range.
 define double @f6(double %a, double %b, i8 *%ptr) {
 ; CHECK-LABEL: f6:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
 ; CHECK: br %r14
   %val = load i8 *%ptr
   %ext = sext i8 %val to i32
@@ -91,7 +91,7 @@ define double @f6(double %a, double %b, i8 *%ptr) {
 ; ...and then the high end.
 define double @f7(double %a, double %b, i8 *%ptr) {
 ; CHECK-LABEL: f7:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
 ; CHECK: br %r14
   %val = load i8 *%ptr
   %ext = sext i8 %val to i32
@@ -118,7 +118,7 @@ define double @f8(double %a, double %b, i8 *%ptr) {
 ; extension.  This cannot use CLI.
 define double @f9(double %a, double %b, i8 *%ptr) {
 ; CHECK-LABEL: f9:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
 ; CHECK: br %r14
   %val = load i8 *%ptr
   %ext = sext i8 %val to i32
@@ -145,7 +145,7 @@ define double @f10(double %a, double %b, i8 *%ptr) {
 ; extension.  This cannot use CLI.
 define double @f11(double %a, double %b, i8 *%ptr) {
 ; CHECK-LABEL: f11:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
 ; CHECK: br %r14
   %val = load i8 *%ptr
   %ext = sext i8 %val to i32
@@ -158,7 +158,7 @@ define double @f11(double %a, double %b, i8 *%ptr) {
 ; extension.  The condition is always true.
 define double @f12(double %a, double %b, i8 *%ptr) {
 ; CHECK-LABEL: f12:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
 ; CHECK: br %r14
   %val = load i8 *%ptr
   %ext = zext i8 %val to i32
diff --git a/test/CodeGen/SystemZ/int-cmp-36.ll b/test/CodeGen/SystemZ/int-cmp-36.ll
index 831b05f..fa2d4bf 100644
--- a/test/CodeGen/SystemZ/int-cmp-36.ll
+++ b/test/CodeGen/SystemZ/int-cmp-36.ll
@@ -100,3 +100,22 @@ exit:
   %res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
   ret i32 %res
 }
+
+; Check the comparison can be reversed if that allows CHRL to be used.
+define i32 @f6(i32 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: chrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %val = load i16 *@g
+  %src1 = sext i16 %val to i32
+  %cond = icmp slt i32 %src1, %src2
+  br i1 %cond, label %exit, label %mulb
+mulb:
+  %mul = mul i32 %src2, %src2
+  br label %exit
+exit:
+  %res = phi i32 [ %src2, %entry ], [ %mul, %mulb ]
+  ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-37.ll b/test/CodeGen/SystemZ/int-cmp-37.ll
index 97d210e..8095ed1 100644
--- a/test/CodeGen/SystemZ/int-cmp-37.ll
+++ b/test/CodeGen/SystemZ/int-cmp-37.ll
@@ -86,8 +86,7 @@ define i32 @f5(i32 %src1) {
 ; CHECK-LABEL: f5:
 ; CHECK: lgrl [[REG:%r[0-5]]], h@GOT
 ; CHECK: llh [[VAL:%r[0-5]]], 0([[REG]])
-; CHECK: clr %r2, [[VAL]]
-; CHECK-NEXT: jl
+; CHECK: clrjl %r2, [[VAL]],
 ; CHECK: br %r14
 entry:
   %val = load i16 *@h, align 1
@@ -101,3 +100,22 @@ exit:
   %res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
   ret i32 %res
 }
+
+; Check the comparison can be reversed if that allows CLHRL to be used.
+define i32 @f6(i32 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: clhrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %val = load i16 *@g
+  %src1 = zext i16 %val to i32
+  %cond = icmp ult i32 %src1, %src2
+  br i1 %cond, label %exit, label %mulb
+mulb:
+  %mul = mul i32 %src2, %src2
+  br label %exit
+exit:
+  %res = phi i32 [ %src2, %entry ], [ %mul, %mulb ]
+  ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-38.ll b/test/CodeGen/SystemZ/int-cmp-38.ll
index d5a852c..9017583 100644
--- a/test/CodeGen/SystemZ/int-cmp-38.ll
+++ b/test/CodeGen/SystemZ/int-cmp-38.ll
@@ -115,3 +115,21 @@ exit:
   %res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
   ret i32 %res
 }
+
+; Check the comparison can be reversed if that allows CRL to be used.
+define i32 @f7(i32 %src2) {
+; CHECK-LABEL: f7:
+; CHECK: crl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %src1 = load i32 *@g
+  %cond = icmp slt i32 %src1, %src2
+  br i1 %cond, label %exit, label %mulb
+mulb:
+  %mul = mul i32 %src2, %src2
+  br label %exit
+exit:
+  %res = phi i32 [ %src2, %entry ], [ %mul, %mulb ]
+  ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-39.ll b/test/CodeGen/SystemZ/int-cmp-39.ll
index d442058..fc9547d 100644
--- a/test/CodeGen/SystemZ/int-cmp-39.ll
+++ b/test/CodeGen/SystemZ/int-cmp-39.ll
@@ -100,3 +100,22 @@ exit:
   %res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
   ret i64 %res
 }
+
+; Check the comparison can be reversed if that allows CGHRL to be used.
+define i64 @f6(i64 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: cghrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %val = load i16 *@g
+  %src1 = sext i16 %val to i64
+  %cond = icmp slt i64 %src1, %src2
+  br i1 %cond, label %exit, label %mulb
+mulb:
+  %mul = mul i64 %src2, %src2
+  br label %exit
+exit:
+  %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-40.ll b/test/CodeGen/SystemZ/int-cmp-40.ll
index 6dab2db..9c532f1 100644
--- a/test/CodeGen/SystemZ/int-cmp-40.ll
+++ b/test/CodeGen/SystemZ/int-cmp-40.ll
@@ -86,8 +86,7 @@ define i64 @f5(i64 %src1) {
 ; CHECK-LABEL: f5:
 ; CHECK: lgrl [[REG:%r[0-5]]], h@GOT
 ; CHECK: llgh [[VAL:%r[0-5]]], 0([[REG]])
-; CHECK: clgr %r2, [[VAL]]
-; CHECK-NEXT: jl
+; CHECK: clgrjl %r2, [[VAL]],
 ; CHECK: br %r14
 entry:
   %val = load i16 *@h, align 1
@@ -101,3 +100,22 @@ exit:
   %res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
   ret i64 %res
 }
+
+; Check the comparison can be reversed if that allows CLGHRL to be used.
+define i64 @f6(i64 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: clghrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %val = load i16 *@g
+  %src1 = zext i16 %val to i64
+  %cond = icmp ult i64 %src1, %src2
+  br i1 %cond, label %exit, label %mulb
+mulb:
+  %mul = mul i64 %src2, %src2
+  br label %exit
+exit:
+  %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-41.ll b/test/CodeGen/SystemZ/int-cmp-41.ll
index 099681d..77f6e7d 100644
--- a/test/CodeGen/SystemZ/int-cmp-41.ll
+++ b/test/CodeGen/SystemZ/int-cmp-41.ll
@@ -100,3 +100,22 @@ exit:
   %res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
   ret i64 %res
 }
+
+; Check the comparison can be reversed if that allows CGFRL to be used.
+define i64 @f6(i64 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: cgfrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %val = load i32 *@g
+  %src1 = sext i32 %val to i64
+  %cond = icmp slt i64 %src1, %src2
+  br i1 %cond, label %exit, label %mulb
+mulb:
+  %mul = mul i64 %src2, %src2
+  br label %exit
+exit:
+  %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-42.ll b/test/CodeGen/SystemZ/int-cmp-42.ll
index 26a268d..94ef008 100644
--- a/test/CodeGen/SystemZ/int-cmp-42.ll
+++ b/test/CodeGen/SystemZ/int-cmp-42.ll
@@ -100,3 +100,22 @@ exit:
   %res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
   ret i64 %res
 }
+
+; Check the comparison can be reversed if that allows CLGFRL to be used.
+define i64 @f6(i64 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: clgfrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %val = load i32 *@g
+  %src1 = zext i32 %val to i64
+  %cond = icmp ult i64 %src1, %src2
+  br i1 %cond, label %exit, label %mulb
+mulb:
+  %mul = mul i64 %src2, %src2
+  br label %exit
+exit:
+  %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-43.ll b/test/CodeGen/SystemZ/int-cmp-43.ll
index e5e1390..1a62588 100644
--- a/test/CodeGen/SystemZ/int-cmp-43.ll
+++ b/test/CodeGen/SystemZ/int-cmp-43.ll
@@ -96,3 +96,21 @@ exit:
   %res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
   ret i64 %res
 }
+
+; Check the comparison can be reversed if that allows CGRL to be used.
+define i64 @f6(i64 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: cgrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %src1 = load i64 *@g
+  %cond = icmp slt i64 %src1, %src2
+  br i1 %cond, label %exit, label %mulb
+mulb:
+  %mul = mul i64 %src2, %src2
+  br label %exit
+exit:
+  %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-44.ll b/test/CodeGen/SystemZ/int-cmp-44.ll
index b94f482..ae0133f 100644
--- a/test/CodeGen/SystemZ/int-cmp-44.ll
+++ b/test/CodeGen/SystemZ/int-cmp-44.ll
@@ -203,11 +203,11 @@ exit:
 ; comparisons with zero if the immediate covers the whole register.
 define i32 @f11(i32 %a, i32 %b, i32 *%dest) {
 ; CHECK-LABEL: f11:
-; CHECK: nilf %r2, 100
+; CHECK: nilf %r2, 100000001
 ; CHECK-NEXT: jl .L{{.*}}
 ; CHECK: br %r14
 entry:
-  %res = and i32 %a, 100
+  %res = and i32 %a, 100000001
   %cmp = icmp ne i32 %res, 0
   br i1 %cmp, label %exit, label %store
 
diff --git a/test/CodeGen/SystemZ/int-cmp-46.ll b/test/CodeGen/SystemZ/int-cmp-46.ll
new file mode 100644
index 0000000..f311942
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-cmp-46.ll
@@ -0,0 +1,491 @@
+; Test the use of TEST UNDER MASK for 32-bit operations.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+@g = global i32 0
+
+; Check the lowest useful TMLL value.
+define void @f1(i32 %a) {
+; CHECK-LABEL: f1:
+; CHECK: tmll %r2, 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 1
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the high end of the TMLL range.
+define void @f2(i32 %a) {
+; CHECK-LABEL: f2:
+; CHECK: tmll %r2, 65535
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 65535
+  %cmp = icmp ne i32 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the lowest useful TMLH value, which is the next value up.
+define void @f3(i32 %a) {
+; CHECK-LABEL: f3:
+; CHECK: tmlh %r2, 1
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 65536
+  %cmp = icmp ne i32 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the next value up again, which cannot use TM.
+define void @f4(i32 %a) {
+; CHECK-LABEL: f4:
+; CHECK-NOT: {{tm[lh].}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 4294901759
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the high end of the TMLH range.
+define void @f5(i32 %a) {
+; CHECK-LABEL: f5:
+; CHECK: tmlh %r2, 65535
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 4294901760
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for LT comparisons that are equivalent to
+; an equality comparison with zero.
+define void @f6(i32 %a) {
+; CHECK-LABEL: f6:
+; CHECK: tmll %r2, 240
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 240
+  %cmp = icmp slt i32 %and, 16
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; ...same again with LE.
+define void @f7(i32 %a) {
+; CHECK-LABEL: f7:
+; CHECK: tmll %r2, 240
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 240
+  %cmp = icmp sle i32 %and, 15
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for GE comparisons that are equivalent to
+; an inequality comparison with zero.
+define void @f8(i32 %a) {
+; CHECK-LABEL: f8:
+; CHECK: tmll %r2, 240
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 240
+  %cmp = icmp uge i32 %and, 16
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; ...same again with GT.
+define void @f9(i32 %a) {
+; CHECK-LABEL: f9:
+; CHECK: tmll %r2, 240
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 240
+  %cmp = icmp ugt i32 %and, 15
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for LT comparisons that effectively
+; test whether the top bit is clear.
+define void @f10(i32 %a) {
+; CHECK-LABEL: f10:
+; CHECK: tmll %r2, 35
+; CHECK: jle {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 35
+  %cmp = icmp ult i32 %and, 8
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; ...same again with LE.
+define void @f11(i32 %a) {
+; CHECK-LABEL: f11:
+; CHECK: tmll %r2, 35
+; CHECK: jle {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 35
+  %cmp = icmp ule i32 %and, 31
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for GE comparisons that effectively test
+; whether the top bit is set.
+define void @f12(i32 %a) {
+; CHECK-LABEL: f12:
+; CHECK: tmll %r2, 140
+; CHECK: jnle {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 140
+  %cmp = icmp uge i32 %and, 128
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; ...same again for GT.
+define void @f13(i32 %a) {
+; CHECK-LABEL: f13:
+; CHECK: tmll %r2, 140
+; CHECK: jnle {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 140
+  %cmp = icmp ugt i32 %and, 126
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for equality comparisons with the mask.
+define void @f14(i32 %a) {
+; CHECK-LABEL: f14:
+; CHECK: tmll %r2, 101
+; CHECK: jo {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 101
+  %cmp = icmp eq i32 %and, 101
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for inequality comparisons with the mask.
+define void @f15(i32 %a) {
+; CHECK-LABEL: f15:
+; CHECK: tmll %r2, 65519
+; CHECK: jno {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 65519
+  %cmp = icmp ne i32 %and, 65519
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for LT comparisons that are equivalent
+; to inequality comparisons with the mask.
+define void @f16(i32 %a) {
+; CHECK-LABEL: f16:
+; CHECK: tmll %r2, 130
+; CHECK: jno {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 130
+  %cmp = icmp ult i32 %and, 129
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; ...same again with LE.
+define void @f17(i32 %a) {
+; CHECK-LABEL: f17:
+; CHECK: tmll %r2, 130
+; CHECK: jno {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 130
+  %cmp = icmp ule i32 %and, 128
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for GE comparisons that are equivalent
+; to equality comparisons with the mask.
+define void @f18(i32 %a) {
+; CHECK-LABEL: f18:
+; CHECK: tmll %r2, 194
+; CHECK: jo {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 194
+  %cmp = icmp uge i32 %and, 193
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; ...same again for GT.
+define void @f19(i32 %a) {
+; CHECK-LABEL: f19:
+; CHECK: tmll %r2, 194
+; CHECK: jo {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 194
+  %cmp = icmp ugt i32 %and, 192
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for equality comparisons for the low bit
+; when the mask has two bits.
+define void @f20(i32 %a) {
+; CHECK-LABEL: f20:
+; CHECK: tmll %r2, 20
+; CHECK: jl {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 20
+  %cmp = icmp eq i32 %and, 4
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for inequality comparisons for the low bit
+; when the mask has two bits.
+define void @f21(i32 %a) {
+; CHECK-LABEL: f21:
+; CHECK: tmll %r2, 20
+; CHECK: jnl {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 20
+  %cmp = icmp ne i32 %and, 4
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for equality comparisons for the high bit
+; when the mask has two bits.
+define void @f22(i32 %a) {
+; CHECK-LABEL: f22:
+; CHECK: tmll %r2, 20
+; CHECK: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 20
+  %cmp = icmp eq i32 %and, 16
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can use TMLL for inequality comparisons for the high bit
+; when the mask has two bits.
+define void @f23(i32 %a) {
+; CHECK-LABEL: f23:
+; CHECK: tmll %r2, 20
+; CHECK: jnh {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i32 %a, 20
+  %cmp = icmp ne i32 %and, 16
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can fold an SHL into a TMxx mask.
+define void @f24(i32 %a) {
+; CHECK-LABEL: f24:
+; CHECK: tmll %r2, 255
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %shl = shl i32 %a, 12
+  %and = and i32 %shl, 1044480
+  %cmp = icmp ne i32 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can fold an SHR into a TMxx mask.
+define void @f25(i32 %a) {
+; CHECK-LABEL: f25:
+; CHECK: tmlh %r2, 512
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %shr = lshr i32 %a, 25
+  %and = and i32 %shr, 1
+  %cmp = icmp ne i32 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-47.ll b/test/CodeGen/SystemZ/int-cmp-47.ll
new file mode 100644
index 0000000..9ebcbfe
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-cmp-47.ll
@@ -0,0 +1,234 @@
+; Test the use of TEST UNDER MASK for 64-bit operations.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+@g = global i32 0
+
+; Check the lowest useful TMLL value.
+define void @f1(i64 %a) {
+; CHECK-LABEL: f1:
+; CHECK: tmll %r2, 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 1
+  %cmp = icmp eq i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the high end of the TMLL range.
+define void @f2(i64 %a) {
+; CHECK-LABEL: f2:
+; CHECK: tmll %r2, 65535
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 65535
+  %cmp = icmp ne i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the lowest useful TMLH value, which is the next value up.
+define void @f3(i64 %a) {
+; CHECK-LABEL: f3:
+; CHECK: tmlh %r2, 1
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 65536
+  %cmp = icmp ne i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the next value up again, which cannot use TM.
+define void @f4(i64 %a) {
+; CHECK-LABEL: f4:
+; CHECK-NOT: {{tm[lh].}}
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 4294901759
+  %cmp = icmp eq i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the high end of the TMLH range.
+define void @f5(i64 %a) {
+; CHECK-LABEL: f5:
+; CHECK: tmlh %r2, 65535
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 4294901760
+  %cmp = icmp eq i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the lowest useful TMHL value.
+define void @f6(i64 %a) {
+; CHECK-LABEL: f6:
+; CHECK: tmhl %r2, 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 4294967296
+  %cmp = icmp eq i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the next value up again, which cannot use TM.
+define void @f7(i64 %a) {
+; CHECK-LABEL: f7:
+; CHECK-NOT: {{tm[lh].}}
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 4294967297
+  %cmp = icmp ne i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the high end of the TMHL range.
+define void @f8(i64 %a) {
+; CHECK-LABEL: f8:
+; CHECK: tmhl %r2, 65535
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 281470681743360
+  %cmp = icmp ne i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the lowest useful TMHH value.
+define void @f9(i64 %a) {
+; CHECK-LABEL: f9:
+; CHECK: tmhh %r2, 1
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 281474976710656
+  %cmp = icmp ne i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the high end of the TMHH range.
+define void @f10(i64 %a) {
+; CHECK-LABEL: f10:
+; CHECK: tmhh %r2, 65535
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 18446462598732840960
+  %cmp = icmp eq i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can fold an SHL into a TMxx mask.
+define void @f11(i64 %a) {
+; CHECK-LABEL: f11:
+; CHECK: tmhl %r2, 32768
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %shl = shl i64 %a, 1
+  %and = and i64 %shl, 281474976710656
+  %cmp = icmp ne i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check that we can fold an SHR into a TMxx mask.
+define void @f12(i64 %a) {
+; CHECK-LABEL: f12:
+; CHECK: tmhh %r2, 256
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %shr = lshr i64 %a, 56
+  %and = and i64 %shr, 1
+  %cmp = icmp ne i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-48.ll b/test/CodeGen/SystemZ/int-cmp-48.ll
new file mode 100644
index 0000000..d7c6370
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-cmp-48.ll
@@ -0,0 +1,245 @@
+; Test the use of TM and TMY.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+@g = global i32 0
+
+; Check a simple branching use of TM.
+define void @f1(i8 *%src) {
+; CHECK-LABEL: f1:
+; CHECK: tm 0(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %byte = load i8 *%src
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+
+; Check that we do not fold across an aliasing store.
+define void @f2(i8 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
+; CHECK: mvi 0(%r2), 0
+; CHECK: tmll [[REG]], 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+  %byte = load i8 *%src
+  store i8 0, i8 *%src
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a simple select-based use of TM.
+define double @f3(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f3:
+; CHECK: tm 0(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+  %byte = load i8 *%src
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check that we do not fold across an aliasing store.
+define double @f4(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f4:
+; CHECK: tm 0(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: mvi 0(%r2), 0
+; CHECK: br %r14
+  %byte = load i8 *%src
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  store i8 0, i8 *%src
+  ret double %res
+}
+
+; Check an inequality check.
+define double @f5(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f5:
+; CHECK: tm 0(%r2), 1
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+  %byte = load i8 *%src
+  %and = and i8 %byte, 1
+  %cmp = icmp ne i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check that we can also use TM for equality comparisons with the mask.
+define double @f6(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f6:
+; CHECK: tm 0(%r2), 254
+; CHECK: jo {{\.L.*}}
+; CHECK: br %r14
+  %byte = load i8 *%src
+  %and = and i8 %byte, 254
+  %cmp = icmp eq i8 %and, 254
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check inequality comparisons with the mask.
+define double @f7(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f7:
+; CHECK: tm 0(%r2), 254
+; CHECK: jno {{\.L.*}}
+; CHECK: br %r14
+  %byte = load i8 *%src
+  %and = and i8 %byte, 254
+  %cmp = icmp ne i8 %and, 254
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check that we do not use the memory TM instruction when CC is being tested
+; for 2.
+define double @f8(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f8:
+; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
+; CHECK: tmll [[REG]], 3
+; CHECK: jh {{\.L.*}}
+; CHECK: br %r14
+  %byte = load i8 *%src
+  %and = and i8 %byte, 3
+  %cmp = icmp eq i8 %and, 2
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; ...likewise 1.
+define double @f9(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f9:
+; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
+; CHECK: tmll [[REG]], 3
+; CHECK: jl {{\.L.*}}
+; CHECK: br %r14
+  %byte = load i8 *%src
+  %and = and i8 %byte, 3
+  %cmp = icmp eq i8 %and, 1
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check the high end of the TM range.
+define double @f10(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f10:
+; CHECK: tm 4095(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+  %ptr = getelementptr i8 *%src, i64 4095
+  %byte = load i8 *%ptr
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check the low end of the positive TMY range.
+define double @f11(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f11:
+; CHECK: tmy 4096(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+  %ptr = getelementptr i8 *%src, i64 4096
+  %byte = load i8 *%ptr
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check the high end of the TMY range.
+define double @f12(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f12:
+; CHECK: tmy 524287(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+  %ptr = getelementptr i8 *%src, i64 524287
+  %byte = load i8 *%ptr
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check the next byte up, which needs separate address logic.
+define double @f13(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f13:
+; CHECK: agfi %r2, 524288
+; CHECK: tm 0(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+  %ptr = getelementptr i8 *%src, i64 524288
+  %byte = load i8 *%ptr
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check the low end of the TMY range.
+define double @f14(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f14:
+; CHECK: tmy -524288(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+  %ptr = getelementptr i8 *%src, i64 -524288
+  %byte = load i8 *%ptr
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check the next byte down, which needs separate address logic.
+define double @f15(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f15:
+; CHECK: agfi %r2, -524289
+; CHECK: tm 0(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+  %ptr = getelementptr i8 *%src, i64 -524289
+  %byte = load i8 *%ptr
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
+
+; Check that TM(Y) does not allow an index
+define double @f16(i8 *%src, i64 %index, double %a, double %b) {
+; CHECK-LABEL: f16:
+; CHECK: tm 0({{%r[1-5]}}), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+  %ptr = getelementptr i8 *%src, i64 %index
+  %byte = load i8 *%ptr
+  %and = and i8 %byte, 1
+  %cmp = icmp eq i8 %and, 0
+  %res = select i1 %cmp, double %b, double %a
+  ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-49.ll b/test/CodeGen/SystemZ/int-cmp-49.ll
new file mode 100644
index 0000000..83f18a2
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-cmp-49.ll
@@ -0,0 +1,49 @@
+; That that we don't try to use z196 instructions on z10 for TMHH and TMHL.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -O0 | FileCheck %s
+
+@g = global i32 0
+
+; Check the lowest useful TMHL value.
+define void @f1(i64 %a) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: risblg
+; CHECK-NOT: risbhg
+; CHECK: tmhl {{%r[0-5]}}, 1
+; CHECK-NOT: risblg
+; CHECK-NOT: risbhg
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 4294967296
+  %cmp = icmp eq i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the lowest useful TMHH value.
+define void @f2(i64 %a) {
+; CHECK-LABEL: f2:
+; CHECK-NOT: risblg
+; CHECK-NOT: risbhg
+; CHECK: tmhh {{%r[0-5]}}, 1
+; CHECK-NOT: risblg
+; CHECK-NOT: risbhg
+; CHECK: br %r14
+entry:
+  %and = and i64 %a, 281474976710656
+  %cmp = icmp ne i64 %and, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 1, i32 *@g
+  br label %exit
+
+exit:
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/int-const-03.ll b/test/CodeGen/SystemZ/int-const-03.ll
index 78db963..af1cef2 100644
--- a/test/CodeGen/SystemZ/int-const-03.ll
+++ b/test/CodeGen/SystemZ/int-const-03.ll
@@ -139,11 +139,11 @@ define void @f14(i8 *%src) {
   ret void
 }
 
-; Check that MVI does not allow an index
+; Check that MVI does not allow an index.  We prefer STC in that case.
 define void @f15(i64 %src, i64 %index) {
 ; CHECK-LABEL: f15:
-; CHECK: agr %r2, %r3
-; CHECK: mvi 4095(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: stc [[TMP]], 4095({{%r2,%r3|%r3,%r2}}
 ; CHECK: br %r14
   %add1 = add i64 %src, %index
   %add2 = add i64 %add1, 4095
@@ -152,11 +152,11 @@ define void @f15(i64 %src, i64 %index) {
   ret void
 }
 
-; Check that MVIY does not allow an index
+; Check that MVIY does not allow an index.  We prefer STCY in that case.
 define void @f16(i64 %src, i64 %index) {
 ; CHECK-LABEL: f16:
-; CHECK: agr %r2, %r3
-; CHECK: mviy 4096(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: stcy [[TMP]], 4096({{%r2,%r3|%r3,%r2}}
 ; CHECK: br %r14
   %add1 = add i64 %src, %index
   %add2 = add i64 %add1, 4096
diff --git a/test/CodeGen/SystemZ/int-const-04.ll b/test/CodeGen/SystemZ/int-const-04.ll
index c109faa..aced50b 100644
--- a/test/CodeGen/SystemZ/int-const-04.ll
+++ b/test/CodeGen/SystemZ/int-const-04.ll
@@ -75,34 +75,34 @@ define void @f8(i16 *%a) {
   ret void
 }
 
-; Check the next halfword up, which needs separate address logic.
-; Other sequences besides this one would be OK.
+; Check the next halfword up, which is out of range.  We prefer STHY
+; in that case.
 define void @f9(i16 *%a) {
 ; CHECK-LABEL: f9:
-; CHECK: aghi %r2, 4096
-; CHECK: mvhhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: sthy [[TMP]], 4096(%r2)
 ; CHECK: br %r14
   %ptr = getelementptr i16 *%a, i64 2048
   store i16 42, i16 *%ptr
   ret void
 }
 
-; Check negative displacements, which also need separate address logic.
+; Check negative displacements, for which we again prefer STHY.
 define void @f10(i16 *%a) {
 ; CHECK-LABEL: f10:
-; CHECK: aghi %r2, -2
-; CHECK: mvhhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: sthy [[TMP]], -2(%r2)
 ; CHECK: br %r14
   %ptr = getelementptr i16 *%a, i64 -1
   store i16 42, i16 *%ptr
   ret void
 }
 
-; Check that MVHHI does not allow an index
+; Check that MVHHI does not allow an index.
 define void @f11(i64 %src, i64 %index) {
 ; CHECK-LABEL: f11:
-; CHECK: agr %r2, %r3
-; CHECK: mvhhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: sth [[TMP]], 0({{%r2,%r3|%r3,%r2}})
 ; CHECK: br %r14
   %add = add i64 %src, %index
   %ptr = inttoptr i64 %add to i16 *
diff --git a/test/CodeGen/SystemZ/int-const-05.ll b/test/CodeGen/SystemZ/int-const-05.ll
index d0c8569..98d6851 100644
--- a/test/CodeGen/SystemZ/int-const-05.ll
+++ b/test/CodeGen/SystemZ/int-const-05.ll
@@ -66,34 +66,33 @@ define void @f7(i32 *%a) {
   ret void
 }
 
-; Check the next word up, which needs separate address logic.
-; Other sequences besides this one would be OK.
+; Check the next word up, which is out of range.  We prefer STY in that case.
 define void @f8(i32 *%a) {
 ; CHECK-LABEL: f8:
-; CHECK: aghi %r2, 4096
-; CHECK: mvhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: sty [[TMP]], 4096(%r2)
 ; CHECK: br %r14
   %ptr = getelementptr i32 *%a, i64 1024
   store i32 42, i32 *%ptr
   ret void
 }
 
-; Check negative displacements, which also need separate address logic.
+; Check negative displacements, for which we again prefer STY.
 define void @f9(i32 *%a) {
 ; CHECK-LABEL: f9:
-; CHECK: aghi %r2, -4
-; CHECK: mvhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: sty [[TMP]], -4(%r2)
 ; CHECK: br %r14
   %ptr = getelementptr i32 *%a, i64 -1
   store i32 42, i32 *%ptr
   ret void
 }
 
-; Check that MVHI does not allow an index
+; Check that MVHI does not allow an index.
 define void @f10(i64 %src, i64 %index) {
 ; CHECK-LABEL: f10:
-; CHECK: agr %r2, %r3
-; CHECK: mvhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: st [[TMP]], 0({{%r2,%r3|%r3,%r2}})
 ; CHECK: br %r14
   %add = add i64 %src, %index
   %ptr = inttoptr i64 %add to i32 *
diff --git a/test/CodeGen/SystemZ/int-const-06.ll b/test/CodeGen/SystemZ/int-const-06.ll
index 12a555c..cf07c66 100644
--- a/test/CodeGen/SystemZ/int-const-06.ll
+++ b/test/CodeGen/SystemZ/int-const-06.ll
@@ -66,34 +66,34 @@ define void @f7(i64 *%a) {
   ret void
 }
 
-; Check the next doubleword up, which needs separate address logic.
-; Other sequences besides this one would be OK.
+; Check the next doubleword up, which is out of range.  We prefer STG
+; in that case.
 define void @f8(i64 *%a) {
 ; CHECK-LABEL: f8:
-; CHECK: aghi %r2, 4096
-; CHECK: mvghi 0(%r2), 42
+; CHECK: lghi [[TMP:%r[0-5]]], 42
+; CHECK: stg [[TMP]], 4096(%r2)
 ; CHECK: br %r14
   %ptr = getelementptr i64 *%a, i64 512
   store i64 42, i64 *%ptr
   ret void
 }
 
-; Check negative displacements, which also need separate address logic.
+; Check negative displacements, for which we again prefer STG.
 define void @f9(i64 *%a) {
 ; CHECK-LABEL: f9:
-; CHECK: aghi %r2, -8
-; CHECK: mvghi 0(%r2), 42
+; CHECK: lghi [[TMP:%r[0-5]]], 42
+; CHECK: stg [[TMP]], -8(%r2)
 ; CHECK: br %r14
   %ptr = getelementptr i64 *%a, i64 -1
   store i64 42, i64 *%ptr
   ret void
 }
 
-; Check that MVGHI does not allow an index
+; Check that MVGHI does not allow an index.
 define void @f10(i64 %src, i64 %index) {
 ; CHECK-LABEL: f10:
-; CHECK: agr %r2, %r3
-; CHECK: mvghi 0(%r2), 42
+; CHECK: lghi [[TMP:%r[0-5]]], 42
+; CHECK: stg [[TMP]], 0({{%r2,%r3|%r3,%r2}})
 ; CHECK: br %r14
   %add = add i64 %src, %index
   %ptr = inttoptr i64 %add to i64 *
diff --git a/test/CodeGen/SystemZ/int-conv-02.ll b/test/CodeGen/SystemZ/int-conv-02.ll
index 18cfd4a..dd7760d 100644
--- a/test/CodeGen/SystemZ/int-conv-02.ll
+++ b/test/CodeGen/SystemZ/int-conv-02.ll
@@ -1,6 +1,7 @@
-; Test zero extensions from a byte to an i32.
+; Test zero extensions from a byte to an i32.    The tests here
+; assume z10 register pressure, without the high words being available.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 ; Test register extension, starting with an i32.
 define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/SystemZ/int-conv-06.ll b/test/CodeGen/SystemZ/int-conv-06.ll
index 9c95bad..33860d1 100644
--- a/test/CodeGen/SystemZ/int-conv-06.ll
+++ b/test/CodeGen/SystemZ/int-conv-06.ll
@@ -1,6 +1,7 @@
-; Test zero extensions from a halfword to an i32.
+; Test zero extensions from a halfword to an i32.  The tests here
+; assume z10 register pressure, without the high words being available.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 ; Test register extension, starting with an i32.
 define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/SystemZ/int-conv-09.ll b/test/CodeGen/SystemZ/int-conv-09.ll
index db4c333..b9c5089 100644
--- a/test/CodeGen/SystemZ/int-conv-09.ll
+++ b/test/CodeGen/SystemZ/int-conv-09.ll
@@ -102,80 +102,3 @@ define i64 @f9(i64 %src, i64 %index) {
   %ext = sext i32 %word to i64
   ret i64 %ext
 }
-
-; Test a case where we spill the source of at least one LGFR.  We want
-; to use LGF if possible.
-define void @f10(i64 *%ptr1, i32 *%ptr2) {
-; CHECK-LABEL: f10:
-; CHECK: lgf {{%r[0-9]+}}, 16{{[04]}}(%r15)
-; CHECK: br %r14
-  %val0 = load volatile i32 *%ptr2
-  %val1 = load volatile i32 *%ptr2
-  %val2 = load volatile i32 *%ptr2
-  %val3 = load volatile i32 *%ptr2
-  %val4 = load volatile i32 *%ptr2
-  %val5 = load volatile i32 *%ptr2
-  %val6 = load volatile i32 *%ptr2
-  %val7 = load volatile i32 *%ptr2
-  %val8 = load volatile i32 *%ptr2
-  %val9 = load volatile i32 *%ptr2
-  %val10 = load volatile i32 *%ptr2
-  %val11 = load volatile i32 *%ptr2
-  %val12 = load volatile i32 *%ptr2
-  %val13 = load volatile i32 *%ptr2
-  %val14 = load volatile i32 *%ptr2
-  %val15 = load volatile i32 *%ptr2
-
-  %ext0 = sext i32 %val0 to i64
-  %ext1 = sext i32 %val1 to i64
-  %ext2 = sext i32 %val2 to i64
-  %ext3 = sext i32 %val3 to i64
-  %ext4 = sext i32 %val4 to i64
-  %ext5 = sext i32 %val5 to i64
-  %ext6 = sext i32 %val6 to i64
-  %ext7 = sext i32 %val7 to i64
-  %ext8 = sext i32 %val8 to i64
-  %ext9 = sext i32 %val9 to i64
-  %ext10 = sext i32 %val10 to i64
-  %ext11 = sext i32 %val11 to i64
-  %ext12 = sext i32 %val12 to i64
-  %ext13 = sext i32 %val13 to i64
-  %ext14 = sext i32 %val14 to i64
-  %ext15 = sext i32 %val15 to i64
-
-  store volatile i32 %val0, i32 *%ptr2
-  store volatile i32 %val1, i32 *%ptr2
-  store volatile i32 %val2, i32 *%ptr2
-  store volatile i32 %val3, i32 *%ptr2
-  store volatile i32 %val4, i32 *%ptr2
-  store volatile i32 %val5, i32 *%ptr2
-  store volatile i32 %val6, i32 *%ptr2
-  store volatile i32 %val7, i32 *%ptr2
-  store volatile i32 %val8, i32 *%ptr2
-  store volatile i32 %val9, i32 *%ptr2
-  store volatile i32 %val10, i32 *%ptr2
-  store volatile i32 %val11, i32 *%ptr2
-  store volatile i32 %val12, i32 *%ptr2
-  store volatile i32 %val13, i32 *%ptr2
-  store volatile i32 %val14, i32 *%ptr2
-  store volatile i32 %val15, i32 *%ptr2
-
-  store volatile i64 %ext0, i64 *%ptr1
-  store volatile i64 %ext1, i64 *%ptr1
-  store volatile i64 %ext2, i64 *%ptr1
-  store volatile i64 %ext3, i64 *%ptr1
-  store volatile i64 %ext4, i64 *%ptr1
-  store volatile i64 %ext5, i64 *%ptr1
-  store volatile i64 %ext6, i64 *%ptr1
-  store volatile i64 %ext7, i64 *%ptr1
-  store volatile i64 %ext8, i64 *%ptr1
-  store volatile i64 %ext9, i64 *%ptr1
-  store volatile i64 %ext10, i64 *%ptr1
-  store volatile i64 %ext11, i64 *%ptr1
-  store volatile i64 %ext12, i64 *%ptr1
-  store volatile i64 %ext13, i64 *%ptr1
-  store volatile i64 %ext14, i64 *%ptr1
-  store volatile i64 %ext15, i64 *%ptr1
-
-  ret void
-}
diff --git a/test/CodeGen/SystemZ/int-conv-10.ll b/test/CodeGen/SystemZ/int-conv-10.ll
index f2f71d9..781c74c 100644
--- a/test/CodeGen/SystemZ/int-conv-10.ll
+++ b/test/CodeGen/SystemZ/int-conv-10.ll
@@ -111,80 +111,3 @@ define i64 @f10(i64 %src, i64 %index) {
   %ext = zext i32 %word to i64
   ret i64 %ext
 }
-
-; Test a case where we spill the source of at least one LLGFR.  We want
-; to use LLGF if possible.
-define void @f11(i64 *%ptr1, i32 *%ptr2) {
-; CHECK-LABEL: f11:
-; CHECK: llgf {{%r[0-9]+}}, 16{{[04]}}(%r15)
-; CHECK: br %r14
-  %val0 = load volatile i32 *%ptr2
-  %val1 = load volatile i32 *%ptr2
-  %val2 = load volatile i32 *%ptr2
-  %val3 = load volatile i32 *%ptr2
-  %val4 = load volatile i32 *%ptr2
-  %val5 = load volatile i32 *%ptr2
-  %val6 = load volatile i32 *%ptr2
-  %val7 = load volatile i32 *%ptr2
-  %val8 = load volatile i32 *%ptr2
-  %val9 = load volatile i32 *%ptr2
-  %val10 = load volatile i32 *%ptr2
-  %val11 = load volatile i32 *%ptr2
-  %val12 = load volatile i32 *%ptr2
-  %val13 = load volatile i32 *%ptr2
-  %val14 = load volatile i32 *%ptr2
-  %val15 = load volatile i32 *%ptr2
-
-  %ext0 = zext i32 %val0 to i64
-  %ext1 = zext i32 %val1 to i64
-  %ext2 = zext i32 %val2 to i64
-  %ext3 = zext i32 %val3 to i64
-  %ext4 = zext i32 %val4 to i64
-  %ext5 = zext i32 %val5 to i64
-  %ext6 = zext i32 %val6 to i64
-  %ext7 = zext i32 %val7 to i64
-  %ext8 = zext i32 %val8 to i64
-  %ext9 = zext i32 %val9 to i64
-  %ext10 = zext i32 %val10 to i64
-  %ext11 = zext i32 %val11 to i64
-  %ext12 = zext i32 %val12 to i64
-  %ext13 = zext i32 %val13 to i64
-  %ext14 = zext i32 %val14 to i64
-  %ext15 = zext i32 %val15 to i64
-
-  store volatile i32 %val0, i32 *%ptr2
-  store volatile i32 %val1, i32 *%ptr2
-  store volatile i32 %val2, i32 *%ptr2
-  store volatile i32 %val3, i32 *%ptr2
-  store volatile i32 %val4, i32 *%ptr2
-  store volatile i32 %val5, i32 *%ptr2
-  store volatile i32 %val6, i32 *%ptr2
-  store volatile i32 %val7, i32 *%ptr2
-  store volatile i32 %val8, i32 *%ptr2
-  store volatile i32 %val9, i32 *%ptr2
-  store volatile i32 %val10, i32 *%ptr2
-  store volatile i32 %val11, i32 *%ptr2
-  store volatile i32 %val12, i32 *%ptr2
-  store volatile i32 %val13, i32 *%ptr2
-  store volatile i32 %val14, i32 *%ptr2
-  store volatile i32 %val15, i32 *%ptr2
-
-  store volatile i64 %ext0, i64 *%ptr1
-  store volatile i64 %ext1, i64 *%ptr1
-  store volatile i64 %ext2, i64 *%ptr1
-  store volatile i64 %ext3, i64 *%ptr1
-  store volatile i64 %ext4, i64 *%ptr1
-  store volatile i64 %ext5, i64 *%ptr1
-  store volatile i64 %ext6, i64 *%ptr1
-  store volatile i64 %ext7, i64 *%ptr1
-  store volatile i64 %ext8, i64 *%ptr1
-  store volatile i64 %ext9, i64 *%ptr1
-  store volatile i64 %ext10, i64 *%ptr1
-  store volatile i64 %ext11, i64 *%ptr1
-  store volatile i64 %ext12, i64 *%ptr1
-  store volatile i64 %ext13, i64 *%ptr1
-  store volatile i64 %ext14, i64 *%ptr1
-  store volatile i64 %ext15, i64 *%ptr1
-
-  ret void
-}
diff --git a/test/CodeGen/SystemZ/int-conv-11.ll b/test/CodeGen/SystemZ/int-conv-11.ll
new file mode 100644
index 0000000..3076962
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-conv-11.ll
@@ -0,0 +1,350 @@
+; Test spills of zero extensions when high GR32s are available.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Test a case where we spill the source of at least one LLCRMux.  We want
+; to use LLC(H) if possible.
+define void @f1(i32 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: llc{{h?}} {{%r[0-9]+}}, 16{{[37]}}(%r15)
+; CHECK: br %r14
+  %val0 = load volatile i32 *%ptr
+  %val1 = load volatile i32 *%ptr
+  %val2 = load volatile i32 *%ptr
+  %val3 = load volatile i32 *%ptr
+  %val4 = load volatile i32 *%ptr
+  %val5 = load volatile i32 *%ptr
+  %val6 = load volatile i32 *%ptr
+  %val7 = load volatile i32 *%ptr
+  %val8 = load volatile i32 *%ptr
+  %val9 = load volatile i32 *%ptr
+  %val10 = load volatile i32 *%ptr
+  %val11 = load volatile i32 *%ptr
+  %val12 = load volatile i32 *%ptr
+  %val13 = load volatile i32 *%ptr
+  %val14 = load volatile i32 *%ptr
+  %val15 = load volatile i32 *%ptr
+  %val16 = load volatile i32 *%ptr
+  %val17 = load volatile i32 *%ptr
+  %val18 = load volatile i32 *%ptr
+  %val19 = load volatile i32 *%ptr
+  %val20 = load volatile i32 *%ptr
+  %val21 = load volatile i32 *%ptr
+  %val22 = load volatile i32 *%ptr
+  %val23 = load volatile i32 *%ptr
+  %val24 = load volatile i32 *%ptr
+  %val25 = load volatile i32 *%ptr
+  %val26 = load volatile i32 *%ptr
+  %val27 = load volatile i32 *%ptr
+  %val28 = load volatile i32 *%ptr
+  %val29 = load volatile i32 *%ptr
+  %val30 = load volatile i32 *%ptr
+  %val31 = load volatile i32 *%ptr
+
+  %trunc0 = trunc i32 %val0 to i8
+  %trunc1 = trunc i32 %val1 to i8
+  %trunc2 = trunc i32 %val2 to i8
+  %trunc3 = trunc i32 %val3 to i8
+  %trunc4 = trunc i32 %val4 to i8
+  %trunc5 = trunc i32 %val5 to i8
+  %trunc6 = trunc i32 %val6 to i8
+  %trunc7 = trunc i32 %val7 to i8
+  %trunc8 = trunc i32 %val8 to i8
+  %trunc9 = trunc i32 %val9 to i8
+  %trunc10 = trunc i32 %val10 to i8
+  %trunc11 = trunc i32 %val11 to i8
+  %trunc12 = trunc i32 %val12 to i8
+  %trunc13 = trunc i32 %val13 to i8
+  %trunc14 = trunc i32 %val14 to i8
+  %trunc15 = trunc i32 %val15 to i8
+  %trunc16 = trunc i32 %val16 to i8
+  %trunc17 = trunc i32 %val17 to i8
+  %trunc18 = trunc i32 %val18 to i8
+  %trunc19 = trunc i32 %val19 to i8
+  %trunc20 = trunc i32 %val20 to i8
+  %trunc21 = trunc i32 %val21 to i8
+  %trunc22 = trunc i32 %val22 to i8
+  %trunc23 = trunc i32 %val23 to i8
+  %trunc24 = trunc i32 %val24 to i8
+  %trunc25 = trunc i32 %val25 to i8
+  %trunc26 = trunc i32 %val26 to i8
+  %trunc27 = trunc i32 %val27 to i8
+  %trunc28 = trunc i32 %val28 to i8
+  %trunc29 = trunc i32 %val29 to i8
+  %trunc30 = trunc i32 %val30 to i8
+  %trunc31 = trunc i32 %val31 to i8
+
+  %ext0 = zext i8 %trunc0 to i32
+  %ext1 = zext i8 %trunc1 to i32
+  %ext2 = zext i8 %trunc2 to i32
+  %ext3 = zext i8 %trunc3 to i32
+  %ext4 = zext i8 %trunc4 to i32
+  %ext5 = zext i8 %trunc5 to i32
+  %ext6 = zext i8 %trunc6 to i32
+  %ext7 = zext i8 %trunc7 to i32
+  %ext8 = zext i8 %trunc8 to i32
+  %ext9 = zext i8 %trunc9 to i32
+  %ext10 = zext i8 %trunc10 to i32
+  %ext11 = zext i8 %trunc11 to i32
+  %ext12 = zext i8 %trunc12 to i32
+  %ext13 = zext i8 %trunc13 to i32
+  %ext14 = zext i8 %trunc14 to i32
+  %ext15 = zext i8 %trunc15 to i32
+  %ext16 = zext i8 %trunc16 to i32
+  %ext17 = zext i8 %trunc17 to i32
+  %ext18 = zext i8 %trunc18 to i32
+  %ext19 = zext i8 %trunc19 to i32
+  %ext20 = zext i8 %trunc20 to i32
+  %ext21 = zext i8 %trunc21 to i32
+  %ext22 = zext i8 %trunc22 to i32
+  %ext23 = zext i8 %trunc23 to i32
+  %ext24 = zext i8 %trunc24 to i32
+  %ext25 = zext i8 %trunc25 to i32
+  %ext26 = zext i8 %trunc26 to i32
+  %ext27 = zext i8 %trunc27 to i32
+  %ext28 = zext i8 %trunc28 to i32
+  %ext29 = zext i8 %trunc29 to i32
+  %ext30 = zext i8 %trunc30 to i32
+  %ext31 = zext i8 %trunc31 to i32
+
+  store volatile i32 %val0, i32 *%ptr
+  store volatile i32 %val1, i32 *%ptr
+  store volatile i32 %val2, i32 *%ptr
+  store volatile i32 %val3, i32 *%ptr
+  store volatile i32 %val4, i32 *%ptr
+  store volatile i32 %val5, i32 *%ptr
+  store volatile i32 %val6, i32 *%ptr
+  store volatile i32 %val7, i32 *%ptr
+  store volatile i32 %val8, i32 *%ptr
+  store volatile i32 %val9, i32 *%ptr
+  store volatile i32 %val10, i32 *%ptr
+  store volatile i32 %val11, i32 *%ptr
+  store volatile i32 %val12, i32 *%ptr
+  store volatile i32 %val13, i32 *%ptr
+  store volatile i32 %val14, i32 *%ptr
+  store volatile i32 %val15, i32 *%ptr
+  store volatile i32 %val16, i32 *%ptr
+  store volatile i32 %val17, i32 *%ptr
+  store volatile i32 %val18, i32 *%ptr
+  store volatile i32 %val19, i32 *%ptr
+  store volatile i32 %val20, i32 *%ptr
+  store volatile i32 %val21, i32 *%ptr
+  store volatile i32 %val22, i32 *%ptr
+  store volatile i32 %val23, i32 *%ptr
+  store volatile i32 %val24, i32 *%ptr
+  store volatile i32 %val25, i32 *%ptr
+  store volatile i32 %val26, i32 *%ptr
+  store volatile i32 %val27, i32 *%ptr
+  store volatile i32 %val28, i32 *%ptr
+  store volatile i32 %val29, i32 *%ptr
+  store volatile i32 %val30, i32 *%ptr
+  store volatile i32 %val31, i32 *%ptr
+
+  store volatile i32 %ext0, i32 *%ptr
+  store volatile i32 %ext1, i32 *%ptr
+  store volatile i32 %ext2, i32 *%ptr
+  store volatile i32 %ext3, i32 *%ptr
+  store volatile i32 %ext4, i32 *%ptr
+  store volatile i32 %ext5, i32 *%ptr
+  store volatile i32 %ext6, i32 *%ptr
+  store volatile i32 %ext7, i32 *%ptr
+  store volatile i32 %ext8, i32 *%ptr
+  store volatile i32 %ext9, i32 *%ptr
+  store volatile i32 %ext10, i32 *%ptr
+  store volatile i32 %ext11, i32 *%ptr
+  store volatile i32 %ext12, i32 *%ptr
+  store volatile i32 %ext13, i32 *%ptr
+  store volatile i32 %ext14, i32 *%ptr
+  store volatile i32 %ext15, i32 *%ptr
+  store volatile i32 %ext16, i32 *%ptr
+  store volatile i32 %ext17, i32 *%ptr
+  store volatile i32 %ext18, i32 *%ptr
+  store volatile i32 %ext19, i32 *%ptr
+  store volatile i32 %ext20, i32 *%ptr
+  store volatile i32 %ext21, i32 *%ptr
+  store volatile i32 %ext22, i32 *%ptr
+  store volatile i32 %ext23, i32 *%ptr
+  store volatile i32 %ext24, i32 *%ptr
+  store volatile i32 %ext25, i32 *%ptr
+  store volatile i32 %ext26, i32 *%ptr
+  store volatile i32 %ext27, i32 *%ptr
+  store volatile i32 %ext28, i32 *%ptr
+  store volatile i32 %ext29, i32 *%ptr
+  store volatile i32 %ext30, i32 *%ptr
+  store volatile i32 %ext31, i32 *%ptr
+
+  ret void
+}
+
+; Same again with i16, which should use LLH(H).
+define void @f2(i32 *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: llh{{h?}} {{%r[0-9]+}}, 16{{[26]}}(%r15)
+; CHECK: br %r14
+  %val0 = load volatile i32 *%ptr
+  %val1 = load volatile i32 *%ptr
+  %val2 = load volatile i32 *%ptr
+  %val3 = load volatile i32 *%ptr
+  %val4 = load volatile i32 *%ptr
+  %val5 = load volatile i32 *%ptr
+  %val6 = load volatile i32 *%ptr
+  %val7 = load volatile i32 *%ptr
+  %val8 = load volatile i32 *%ptr
+  %val9 = load volatile i32 *%ptr
+  %val10 = load volatile i32 *%ptr
+  %val11 = load volatile i32 *%ptr
+  %val12 = load volatile i32 *%ptr
+  %val13 = load volatile i32 *%ptr
+  %val14 = load volatile i32 *%ptr
+  %val15 = load volatile i32 *%ptr
+  %val16 = load volatile i32 *%ptr
+  %val17 = load volatile i32 *%ptr
+  %val18 = load volatile i32 *%ptr
+  %val19 = load volatile i32 *%ptr
+  %val20 = load volatile i32 *%ptr
+  %val21 = load volatile i32 *%ptr
+  %val22 = load volatile i32 *%ptr
+  %val23 = load volatile i32 *%ptr
+  %val24 = load volatile i32 *%ptr
+  %val25 = load volatile i32 *%ptr
+  %val26 = load volatile i32 *%ptr
+  %val27 = load volatile i32 *%ptr
+  %val28 = load volatile i32 *%ptr
+  %val29 = load volatile i32 *%ptr
+  %val30 = load volatile i32 *%ptr
+  %val31 = load volatile i32 *%ptr
+
+  %trunc0 = trunc i32 %val0 to i16
+  %trunc1 = trunc i32 %val1 to i16
+  %trunc2 = trunc i32 %val2 to i16
+  %trunc3 = trunc i32 %val3 to i16
+  %trunc4 = trunc i32 %val4 to i16
+  %trunc5 = trunc i32 %val5 to i16
+  %trunc6 = trunc i32 %val6 to i16
+  %trunc7 = trunc i32 %val7 to i16
+  %trunc8 = trunc i32 %val8 to i16
+  %trunc9 = trunc i32 %val9 to i16
+  %trunc10 = trunc i32 %val10 to i16
+  %trunc11 = trunc i32 %val11 to i16
+  %trunc12 = trunc i32 %val12 to i16
+  %trunc13 = trunc i32 %val13 to i16
+  %trunc14 = trunc i32 %val14 to i16
+  %trunc15 = trunc i32 %val15 to i16
+  %trunc16 = trunc i32 %val16 to i16
+  %trunc17 = trunc i32 %val17 to i16
+  %trunc18 = trunc i32 %val18 to i16
+  %trunc19 = trunc i32 %val19 to i16
+  %trunc20 = trunc i32 %val20 to i16
+  %trunc21 = trunc i32 %val21 to i16
+  %trunc22 = trunc i32 %val22 to i16
+  %trunc23 = trunc i32 %val23 to i16
+  %trunc24 = trunc i32 %val24 to i16
+  %trunc25 = trunc i32 %val25 to i16
+  %trunc26 = trunc i32 %val26 to i16
+  %trunc27 = trunc i32 %val27 to i16
+  %trunc28 = trunc i32 %val28 to i16
+  %trunc29 = trunc i32 %val29 to i16
+  %trunc30 = trunc i32 %val30 to i16
+  %trunc31 = trunc i32 %val31 to i16
+
+  %ext0 = zext i16 %trunc0 to i32
+  %ext1 = zext i16 %trunc1 to i32
+  %ext2 = zext i16 %trunc2 to i32
+  %ext3 = zext i16 %trunc3 to i32
+  %ext4 = zext i16 %trunc4 to i32
+  %ext5 = zext i16 %trunc5 to i32
+  %ext6 = zext i16 %trunc6 to i32
+  %ext7 = zext i16 %trunc7 to i32
+  %ext8 = zext i16 %trunc8 to i32
+  %ext9 = zext i16 %trunc9 to i32
+  %ext10 = zext i16 %trunc10 to i32
+  %ext11 = zext i16 %trunc11 to i32
+  %ext12 = zext i16 %trunc12 to i32
+  %ext13 = zext i16 %trunc13 to i32
+  %ext14 = zext i16 %trunc14 to i32
+  %ext15 = zext i16 %trunc15 to i32
+  %ext16 = zext i16 %trunc16 to i32
+  %ext17 = zext i16 %trunc17 to i32
+  %ext18 = zext i16 %trunc18 to i32
+  %ext19 = zext i16 %trunc19 to i32
+  %ext20 = zext i16 %trunc20 to i32
+  %ext21 = zext i16 %trunc21 to i32
+  %ext22 = zext i16 %trunc22 to i32
+  %ext23 = zext i16 %trunc23 to i32
+  %ext24 = zext i16 %trunc24 to i32
+  %ext25 = zext i16 %trunc25 to i32
+  %ext26 = zext i16 %trunc26 to i32
+  %ext27 = zext i16 %trunc27 to i32
+  %ext28 = zext i16 %trunc28 to i32
+  %ext29 = zext i16 %trunc29 to i32
+  %ext30 = zext i16 %trunc30 to i32
+  %ext31 = zext i16 %trunc31 to i32
+
+  store volatile i32 %val0, i32 *%ptr
+  store volatile i32 %val1, i32 *%ptr
+  store volatile i32 %val2, i32 *%ptr
+  store volatile i32 %val3, i32 *%ptr
+  store volatile i32 %val4, i32 *%ptr
+  store volatile i32 %val5, i32 *%ptr
+  store volatile i32 %val6, i32 *%ptr
+  store volatile i32 %val7, i32 *%ptr
+  store volatile i32 %val8, i32 *%ptr
+  store volatile i32 %val9, i32 *%ptr
+  store volatile i32 %val10, i32 *%ptr
+  store volatile i32 %val11, i32 *%ptr
+  store volatile i32 %val12, i32 *%ptr
+  store volatile i32 %val13, i32 *%ptr
+  store volatile i32 %val14, i32 *%ptr
+  store volatile i32 %val15, i32 *%ptr
+  store volatile i32 %val16, i32 *%ptr
+  store volatile i32 %val17, i32 *%ptr
+  store volatile i32 %val18, i32 *%ptr
+  store volatile i32 %val19, i32 *%ptr
+  store volatile i32 %val20, i32 *%ptr
+  store volatile i32 %val21, i32 *%ptr
+  store volatile i32 %val22, i32 *%ptr
+  store volatile i32 %val23, i32 *%ptr
+  store volatile i32 %val24, i32 *%ptr
+  store volatile i32 %val25, i32 *%ptr
+  store volatile i32 %val26, i32 *%ptr
+  store volatile i32 %val27, i32 *%ptr
+  store volatile i32 %val28, i32 *%ptr
+  store volatile i32 %val29, i32 *%ptr
+  store volatile i32 %val30, i32 *%ptr
+  store volatile i32 %val31, i32 *%ptr
+
+  store volatile i32 %ext0, i32 *%ptr
+  store volatile i32 %ext1, i32 *%ptr
+  store volatile i32 %ext2, i32 *%ptr
+  store volatile i32 %ext3, i32 *%ptr
+  store volatile i32 %ext4, i32 *%ptr
+  store volatile i32 %ext5, i32 *%ptr
+  store volatile i32 %ext6, i32 *%ptr
+  store volatile i32 %ext7, i32 *%ptr
+  store volatile i32 %ext8, i32 *%ptr
+  store volatile i32 %ext9, i32 *%ptr
+  store volatile i32 %ext10, i32 *%ptr
+  store volatile i32 %ext11, i32 *%ptr
+  store volatile i32 %ext12, i32 *%ptr
+  store volatile i32 %ext13, i32 *%ptr
+  store volatile i32 %ext14, i32 *%ptr
+  store volatile i32 %ext15, i32 *%ptr
+  store volatile i32 %ext16, i32 *%ptr
+  store volatile i32 %ext17, i32 *%ptr
+  store volatile i32 %ext18, i32 *%ptr
+  store volatile i32 %ext19, i32 *%ptr
+  store volatile i32 %ext20, i32 *%ptr
+  store volatile i32 %ext21, i32 *%ptr
+  store volatile i32 %ext22, i32 *%ptr
+  store volatile i32 %ext23, i32 *%ptr
+  store volatile i32 %ext24, i32 *%ptr
+  store volatile i32 %ext25, i32 *%ptr
+  store volatile i32 %ext26, i32 *%ptr
+  store volatile i32 %ext27, i32 *%ptr
+  store volatile i32 %ext28, i32 *%ptr
+  store volatile i32 %ext29, i32 *%ptr
+  store volatile i32 %ext30, i32 *%ptr
+  store volatile i32 %ext31, i32 *%ptr
+
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/int-div-06.ll b/test/CodeGen/SystemZ/int-div-06.ll
new file mode 100644
index 0000000..8576b1b
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-div-06.ll
@@ -0,0 +1,56 @@
+; Test that divisions by constants are implemented as multiplications.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Check signed 32-bit division.
+define i32 @f1(i32 %a) {
+; CHECK-LABEL: f1:
+; CHECK: lgfr [[REG:%r[0-5]]], %r2
+; CHECK: msgfi [[REG]], 502748801
+; CHECK-DAG: srlg [[RES1:%r[0-5]]], [[REG]], 63
+; CHECK-DAG: srag %r2, [[REG]], 46
+; CHECK: ar %r2, [[RES1]]
+; CHECK: br %r14
+  %b = sdiv i32 %a, 139968
+  ret i32 %b
+}
+
+; Check unsigned 32-bit division.
+define i32 @f2(i32 %a) {
+; CHECK-LABEL: f2:
+; CHECK: llgfr [[REG:%r[0-5]]], %r2
+; CHECK: msgfi [[REG]], 502748801
+; CHECK: srlg %r2, [[REG]], 46
+; CHECK: br %r14
+  %b = udiv i32 %a, 139968
+  ret i32 %b
+}
+
+; Check signed 64-bit division.
+define i64 @f3(i64 %dummy, i64 %a) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: llihf [[CONST:%r[0-5]]], 1005497601
+; CHECK-DAG: oilf [[CONST]], 4251762321
+; CHECK-DAG: srag [[REG:%r[0-5]]], %r3, 63
+; CHECK-DAG: ngr [[REG]], [[CONST]]
+; CHECK-DAG: mlgr %r2, [[CONST]]
+; CHECK: sgr %r2, [[REG]]
+; CHECK: srlg [[RES1:%r[0-5]]], %r2, 63
+; CHECK: srag %r2, %r2, 15
+; CHECK: agr %r2, [[RES1]]
+; CHECK: br %r14
+  %b = sdiv i64 %a, 139968
+  ret i64 %b
+}
+
+; Check unsigned 64-bit division.
+define i64 @f4(i64 %dummy, i64 %a) {
+; CHECK-LABEL: f4:
+; CHECK: llihf [[CONST:%r[0-5]]], 1005497601
+; CHECK: oilf [[CONST]], 4251762321
+; CHECK: mlgr %r2, [[CONST]]
+; CHECK: srlg %r2, %r2, 15
+; CHECK: br %r14
+  %b = udiv i64 %a, 139968
+  ret i64 %b
+}
diff --git a/test/CodeGen/SystemZ/int-move-08.ll b/test/CodeGen/SystemZ/int-move-08.ll
index f16dd8e..56fcbc6 100644
--- a/test/CodeGen/SystemZ/int-move-08.ll
+++ b/test/CodeGen/SystemZ/int-move-08.ll
@@ -10,6 +10,8 @@
 @gsrc32u = global i32 1, align 2, section "foo"
 @gdst16u = global i16 2, align 1, section "foo"
 @gdst32u = global i32 2, align 2, section "foo"
+@garray8 = global [2 x i8] [i8 100, i8 101]
+@garray16 = global [2 x i16] [i16 102, i16 103]
 
 ; Check sign-extending loads from i16.
 define i32 @f1() {
@@ -97,3 +99,36 @@ define void @f8() {
   store i32 %val, i32 *@gdst32u, align 2
   ret void
 }
+
+; Test a case where we want to use one LARL for accesses to two different
+; parts of a variable.
+define void @f9() {
+; CHECK-LABEL: f9:
+; CHECK: larl [[REG:%r[0-5]]], garray8
+; CHECK: llc [[VAL:%r[0-5]]], 0([[REG]])
+; CHECK: srl [[VAL]], 1
+; CHECK: stc [[VAL]], 1([[REG]])
+; CHECK: br %r14
+  %ptr1 = getelementptr [2 x i8] *@garray8, i64 0, i64 0
+  %ptr2 = getelementptr [2 x i8] *@garray8, i64 0, i64 1
+  %val = load i8 *%ptr1
+  %shr = lshr i8 %val, 1
+  store i8 %shr, i8 *%ptr2
+  ret void
+}
+
+; Test a case where we want to use separate relative-long addresses for
+; two different parts of a variable.
+define void @f10() {
+; CHECK-LABEL: f10:
+; CHECK: llhrl [[VAL:%r[0-5]]], garray16
+; CHECK: srl [[VAL]], 1
+; CHECK: sthrl [[VAL]], garray16+2
+; CHECK: br %r14
+  %ptr1 = getelementptr [2 x i16] *@garray16, i64 0, i64 0
+  %ptr2 = getelementptr [2 x i16] *@garray16, i64 0, i64 1
+  %val = load i16 *%ptr1
+  %shr = lshr i16 %val, 1
+  store i16 %shr, i16 *%ptr2
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/int-mul-08.ll b/test/CodeGen/SystemZ/int-mul-08.ll
index a245760..90b26a4 100644
--- a/test/CodeGen/SystemZ/int-mul-08.ll
+++ b/test/CodeGen/SystemZ/int-mul-08.ll
@@ -22,9 +22,13 @@ define i64 @f1(i64 %dummy, i64 %a, i64 %b) {
 ; This needs a rather convoluted sequence.
 define i64 @f2(i64 %dummy, i64 %a, i64 %b) {
 ; CHECK-LABEL: f2:
-; CHECK: mlgr
-; CHECK: agr
-; CHECK: agr
+; CHECK-DAG: srag [[RES1:%r[0-5]]], %r3, 63
+; CHECK-DAG: srag [[RES2:%r[0-5]]], %r4, 63
+; CHECK-DAG: ngr [[RES1]], %r4
+; CHECK-DAG: ngr [[RES2]], %r3
+; CHECK-DAG: agr [[RES2]], [[RES1]]
+; CHECK-DAG: mlgr %r2, %r4
+; CHECK: sgr %r2, [[RES2]]
 ; CHECK: br %r14
   %ax = sext i64 %a to i128
   %bx = sext i64 %b to i128
diff --git a/test/CodeGen/SystemZ/int-neg-02.ll b/test/CodeGen/SystemZ/int-neg-02.ll
new file mode 100644
index 0000000..e26194c
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-neg-02.ll
@@ -0,0 +1,91 @@
+; Test negative integer absolute.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test i32->i32 negative absolute using slt.
+define i32 @f1(i32 %val) {
+; CHECK-LABEL: f1:
+; CHECK: lnr %r2, %r2
+; CHECK: br %r14
+  %cmp = icmp slt i32 %val, 0
+  %neg = sub i32 0, %val
+  %abs = select i1 %cmp, i32 %neg, i32 %val
+  %res = sub i32 0, %abs
+  ret i32 %res
+}
+
+; Test i32->i32 negative absolute using sle.
+define i32 @f2(i32 %val) {
+; CHECK-LABEL: f2:
+; CHECK: lnr %r2, %r2
+; CHECK: br %r14
+  %cmp = icmp sle i32 %val, 0
+  %neg = sub i32 0, %val
+  %abs = select i1 %cmp, i32 %neg, i32 %val
+  %res = sub i32 0, %abs
+  ret i32 %res
+}
+
+; Test i32->i32 negative absolute using sgt.
+define i32 @f3(i32 %val) {
+; CHECK-LABEL: f3:
+; CHECK: lnr %r2, %r2
+; CHECK: br %r14
+  %cmp = icmp sgt i32 %val, 0
+  %neg = sub i32 0, %val
+  %abs = select i1 %cmp, i32 %val, i32 %neg
+  %res = sub i32 0, %abs
+  ret i32 %res
+}
+
+; Test i32->i32 negative absolute using sge.
+define i32 @f4(i32 %val) {
+; CHECK-LABEL: f4:
+; CHECK: lnr %r2, %r2
+; CHECK: br %r14
+  %cmp = icmp sge i32 %val, 0
+  %neg = sub i32 0, %val
+  %abs = select i1 %cmp, i32 %val, i32 %neg
+  %res = sub i32 0, %abs
+  ret i32 %res
+}
+
+; Test i32->i64 negative absolute.
+define i64 @f5(i32 %val) {
+; CHECK-LABEL: f5:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+  %ext = sext i32 %val to i64
+  %cmp = icmp slt i64 %ext, 0
+  %neg = sub i64 0, %ext
+  %abs = select i1 %cmp, i64 %neg, i64 %ext
+  %res = sub i64 0, %abs
+  ret i64 %res
+}
+
+; Test i32->i64 negative absolute that uses an "in-register" form of
+; sign extension.
+define i64 @f6(i64 %val) {
+; CHECK-LABEL: f6:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+  %trunc = trunc i64 %val to i32
+  %ext = sext i32 %trunc to i64
+  %cmp = icmp slt i64 %ext, 0
+  %neg = sub i64 0, %ext
+  %abs = select i1 %cmp, i64 %neg, i64 %ext
+  %res = sub i64 0, %abs
+  ret i64 %res
+}
+
+; Test i64 negative absolute.
+define i64 @f7(i64 %val) {
+; CHECK-LABEL: f7:
+; CHECK: lngr %r2, %r2
+; CHECK: br %r14
+  %cmp = icmp slt i64 %val, 0
+  %neg = sub i64 0, %val
+  %abs = select i1 %cmp, i64 %neg, i64 %val
+  %res = sub i64 0, %abs
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/lit.local.cfg b/test/CodeGen/SystemZ/lit.local.cfg
index 79528d1..b12af09 100644
--- a/test/CodeGen/SystemZ/lit.local.cfg
+++ b/test/CodeGen/SystemZ/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'SystemZ' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/SystemZ/memchr-01.ll b/test/CodeGen/SystemZ/memchr-01.ll
new file mode 100644
index 0000000..c51690b
--- /dev/null
+++ b/test/CodeGen/SystemZ/memchr-01.ll
@@ -0,0 +1,21 @@
+; Test memchr using SRST, with a weird but usable prototype.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i8 *@memchr(i8 *%src, i16 %char, i32 %len)
+
+; Test a simple forwarded call.
+define i8 *@f1(i8 *%src, i16 %char, i32 %len) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lgr [[REG:%r[1-5]]], %r2
+; CHECK-DAG: algfr %r2, %r4
+; CHECK-DAG: llcr %r0, %r3
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: srst %r2, [[REG]]
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK: jl {{\.L.*}}
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+  %res = call i8 *@memchr(i8 *%src, i16 %char, i32 %len)
+  ret i8 *%res
+}
diff --git a/test/CodeGen/SystemZ/memchr-02.ll b/test/CodeGen/SystemZ/memchr-02.ll
new file mode 100644
index 0000000..982b396
--- /dev/null
+++ b/test/CodeGen/SystemZ/memchr-02.ll
@@ -0,0 +1,57 @@
+; Test memchr using SRST, with the correct prototype.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i8 *@memchr(i8 *%src, i32 %char, i64 %len)
+
+; Test a simple forwarded call.
+define i8 *@f1(i64 %len, i8 *%src, i32 %char) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: agr %r2, %r3
+; CHECK-DAG: llcr %r0, %r4
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: srst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK: jl {{\.L.*}}
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+  %res = call i8 *@memchr(i8 *%src, i32 %char, i64 %len)
+  ret i8 *%res
+}
+
+; Test a doubled call with no use of %r0 in between.  There should be a
+; single load of %r0.
+define i8 *@f2(i8 *%src, i8 *%charptr, i64 %len) {
+; CHECK-LABEL: f2:
+; CHECK: llc %r0, 0(%r3)
+; CHECK-NOT: %r0
+; CHECK: srst [[RES1:%r[1-5]]], %r2
+; CHECK-NOT: %r0
+; CHECK: srst %r2, [[RES1]]
+; CHECK: br %r14
+  %char = load volatile i8 *%charptr
+  %charext = zext i8 %char to i32
+  %res1 = call i8 *@memchr(i8 *%src, i32 %charext, i64 %len)
+  %res2 = call i8 *@memchr(i8 *%res1, i32 %charext, i64 %len)
+  ret i8 *%res2
+}
+
+; Test a doubled call with a use of %r0 in between.  %r0 must be loaded
+; for each loop.
+define i8 *@f3(i8 *%src, i8 *%charptr, i64 %len) {
+; CHECK-LABEL: f3:
+; CHECK: llc [[CHAR:%r[1-5]]], 0(%r3)
+; CHECK: lr %r0, [[CHAR]]
+; CHECK: srst [[RES1:%r[1-5]]], %r2
+; CHECK: lhi %r0, 0
+; CHECK: blah %r0
+; CHECK: lr %r0, [[CHAR]]
+; CHECK: srst %r2, [[RES1]]
+; CHECK: br %r14
+  %char = load volatile i8 *%charptr
+  %charext = zext i8 %char to i32
+  %res1 = call i8 *@memchr(i8 *%src, i32 %charext, i64 %len)
+  call void asm sideeffect "blah $0", "{r0}" (i32 0)
+  %res2 = call i8 *@memchr(i8 *%res1, i32 %charext, i64 %len)
+  ret i8 *%res2
+}
diff --git a/test/CodeGen/SystemZ/memcmp-01.ll b/test/CodeGen/SystemZ/memcmp-01.ll
new file mode 100644
index 0000000..a014419
--- /dev/null
+++ b/test/CodeGen/SystemZ/memcmp-01.ll
@@ -0,0 +1,221 @@
+; Test memcmp using CLC, with i32 results.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare signext i32 @memcmp(i8 *%src1, i8 *%src2, i64 %size)
+
+; Zero-length comparisons should be optimized away.
+define i32 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lhi %r2, 0
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 0)
+  ret i32 %res
+}
+
+; Check a case where the result is used as an integer.
+define i32 @f2(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f2:
+; CHECK: clc 0(2,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll %r2, [[REG]], 31
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 2)
+  ret i32 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: clc 0(3,%r2), 0(%r3)
+; CHECK-NEXT: je {{\..*}}
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 3)
+  %cmp = icmp eq i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for inequality.
+define void @f4(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f4:
+; CHECK: clc 0(4,%r2), 0(%r3)
+; CHECK-NEXT: jlh {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 4)
+  %cmp = icmp ne i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested via slt.
+define void @f5(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f5:
+; CHECK: clc 0(5,%r2), 0(%r3)
+; CHECK-NEXT: jl {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 5)
+  %cmp = icmp slt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for sgt.
+define void @f6(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f6:
+; CHECK: clc 0(6,%r2), 0(%r3)
+; CHECK-NEXT: jh {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 6)
+  %cmp = icmp sgt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the upper end of the CLC range.  Here the result is used both as
+; an integer and for branching.
+define i32 @f7(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f7:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll %r2, [[REG]], 31
+; CHECK: jl {{.L*}}
+; CHECK: br %r14
+entry:
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 256)
+  %cmp = icmp slt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret i32 %res
+}
+
+; 257 bytes needs two CLCs.
+define i32 @f8(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f8:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(1,%r2), 256(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)
+  ret i32 %res
+}
+
+; Test a comparison of 258 bytes in which the CC result can be used directly.
+define void @f9(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f9:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(1,%r2), 256(%r3)
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: jl .L
+; CHECK: br %r14
+entry:
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)
+  %cmp = icmp slt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Test the largest size that can use two CLCs.
+define i32 @f10(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f10:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(256,%r2), 256(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 512)
+  ret i32 %res
+}
+
+; Test the smallest size that needs 3 CLCs.
+define i32 @f11(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f11:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(256,%r2), 256(%r3)
+; CHECK: jlh [[LABEL]]
+; CHECK: clc 512(1,%r2), 512(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 513)
+  ret i32 %res
+}
+
+; Test the largest size than can use 3 CLCs.
+define i32 @f12(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f12:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(256,%r2), 256(%r3)
+; CHECK: jlh [[LABEL]]
+; CHECK: clc 512(256,%r2), 512(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 768)
+  ret i32 %res
+}
+
+; The next size up uses a loop instead.  We leave the more complicated
+; loop tests to memcpy-01.ll, which shares the same form.
+define i32 @f13(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f13:
+; CHECK: lghi [[COUNT:%r[0-5]]], 3
+; CHECK: [[LOOP:.L[^:]*]]:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK-DAG: la %r2, 256(%r2)
+; CHECK-DAG: la %r3, 256(%r3)
+; CHECK: brctg [[COUNT]], [[LOOP]]
+; CHECK: clc 0(1,%r2), 0(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+  %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 769)
+  ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/memcmp-02.ll b/test/CodeGen/SystemZ/memcmp-02.ll
new file mode 100644
index 0000000..74b090d
--- /dev/null
+++ b/test/CodeGen/SystemZ/memcmp-02.ll
@@ -0,0 +1,139 @@
+; Test memcmp using CLC, with i64 results.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @memcmp(i8 *%src1, i8 *%src2, i64 %size)
+
+; Zero-length comparisons should be optimized away.
+define i64 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 0)
+  ret i64 %res
+}
+
+; Check a case where the result is used as an integer.
+define i64 @f2(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f2:
+; CHECK: clc 0(2,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll [[REG]], [[REG]], 31
+; CHECK: lgfr %r2, [[REG]]
+; CHECK: br %r14
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 2)
+  ret i64 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f3(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: clc 0(3,%r2), 0(%r3)
+; CHECK-NEXT: je {{\..*}}
+; CHECK: br %r14
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 3)
+  %cmp = icmp eq i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for inequality.
+define void @f4(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f4:
+; CHECK: clc 0(4,%r2), 0(%r3)
+; CHECK-NEXT: jlh {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 4)
+  %cmp = icmp ne i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested via slt.
+define void @f5(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f5:
+; CHECK: clc 0(5,%r2), 0(%r3)
+; CHECK-NEXT: jl {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 5)
+  %cmp = icmp slt i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for sgt.
+define void @f6(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f6:
+; CHECK: clc 0(6,%r2), 0(%r3)
+; CHECK-NEXT: jh {{\..*}}
+; CHECK: br %r14
+entry:
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 6)
+  %cmp = icmp sgt i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check the upper end of the CLC range.  Here the result is used both as
+; an integer and for branching.
+define i64 @f7(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f7:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll [[REG]], [[REG]], 31
+; CHECK: lgfr %r2, [[REG]]
+; CHECK: jl {{.L*}}
+; CHECK: br %r14
+entry:
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 256)
+  %cmp = icmp slt i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret i64 %res
+}
+
+; 257 bytes needs two CLCs.
+define i64 @f8(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f8:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(1,%r2), 256(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 257)
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/memcpy-01.ll b/test/CodeGen/SystemZ/memcpy-01.ll
index 7cb58b3..b53ec54 100644
--- a/test/CodeGen/SystemZ/memcpy-01.ll
+++ b/test/CodeGen/SystemZ/memcpy-01.ll
@@ -4,7 +4,9 @@
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *nocapture, i8 *nocapture, i32, i32, i1) nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8 *nocapture, i8 *nocapture, i64, i32, i1) nounwind
+declare void @foo(i8 *, i8 *)
 
+; Test a no-op move, i32 version.
 define void @f1(i8 *%dest, i8 *%src) {
 ; CHECK-LABEL: f1:
 ; CHECK-NOT: %r2
@@ -15,6 +17,7 @@ define void @f1(i8 *%dest, i8 *%src) {
   ret void
 }
 
+; Test a no-op move, i64 version.
 define void @f2(i8 *%dest, i8 *%src) {
 ; CHECK-LABEL: f2:
 ; CHECK-NOT: %r2
@@ -25,6 +28,7 @@ define void @f2(i8 *%dest, i8 *%src) {
   ret void
 }
 
+; Test a 1-byte move, i32 version.
 define void @f3(i8 *%dest, i8 *%src) {
 ; CHECK-LABEL: f3:
 ; CHECK: mvc 0(1,%r2), 0(%r3)
@@ -34,6 +38,7 @@ define void @f3(i8 *%dest, i8 *%src) {
   ret void
 }
 
+; Test a 1-byte move, i64 version.
 define void @f4(i8 *%dest, i8 *%src) {
 ; CHECK-LABEL: f4:
 ; CHECK: mvc 0(1,%r2), 0(%r3)
@@ -43,6 +48,7 @@ define void @f4(i8 *%dest, i8 *%src) {
   ret void
 }
 
+; Test the upper range of a single MVC, i32 version.
 define void @f5(i8 *%dest, i8 *%src) {
 ; CHECK-LABEL: f5:
 ; CHECK: mvc 0(256,%r2), 0(%r3)
@@ -52,6 +58,7 @@ define void @f5(i8 *%dest, i8 *%src) {
   ret void
 }
 
+; Test the upper range of a single MVC, i64 version.
 define void @f6(i8 *%dest, i8 *%src) {
 ; CHECK-LABEL: f6:
 ; CHECK: mvc 0(256,%r2), 0(%r3)
@@ -61,22 +68,168 @@ define void @f6(i8 *%dest, i8 *%src) {
   ret void
 }
 
-; 257 bytes is too big for a single MVC.  For now expect none, so that
-; the test fails and gets updated when large copies are implemented.
+; Test the first case that needs two MVCs.
 define void @f7(i8 *%dest, i8 *%src) {
 ; CHECK-LABEL: f7:
-; CHECK-NOT: mvc
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(1,%r2), 256(%r3)
 ; CHECK: br %r14
   call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%dest, i8 *%src, i32 257, i32 1,
                                        i1 false)
   ret void
 }
 
+; Test the last-but-one case that needs two MVCs.
 define void @f8(i8 *%dest, i8 *%src) {
 ; CHECK-LABEL: f8:
-; CHECK-NOT: mvc
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(255,%r2), 256(%r3)
+; CHECK: br %r14
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 511, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; Test the last case that needs two MVCs.
+define void @f9(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f9:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: br %r14
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 512, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; Test an arbitrary value that uses straight-line code.
+define void @f10(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f10:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: mvc 512(256,%r2), 512(%r3)
+; CHECK: mvc 768(256,%r2), 768(%r3)
+; CHECK: mvc 1024(255,%r2), 1024(%r3)
+; CHECK: br %r14
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; ...and again in cases where not all parts are in range of MVC.
+define void @f11(i8 *%srcbase, i8 *%destbase) {
+; CHECK-LABEL: f11:
+; CHECK: mvc 4000(256,%r2), 3500(%r3)
+; CHECK: lay [[NEWDEST:%r[1-5]]], 4256(%r2)
+; CHECK: mvc 0(256,[[NEWDEST]]), 3756(%r3)
+; CHECK: mvc 256(256,[[NEWDEST]]), 4012(%r3)
+; CHECK: lay [[NEWSRC:%r[1-5]]], 4268(%r3)
+; CHECK: mvc 512(256,[[NEWDEST]]), 0([[NEWSRC]])
+; CHECK: mvc 768(255,[[NEWDEST]]), 256([[NEWSRC]])
+; CHECK: br %r14
+  %dest = getelementptr i8 *%srcbase, i64 4000
+  %src = getelementptr i8* %destbase, i64 3500
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; ...and again with a destination frame base that goes out of range.
+define void @f12() {
+; CHECK-LABEL: f12:
+; CHECK: brasl %r14, foo@PLT
+; CHECK: mvc 4076(256,%r15), 2100(%r15)
+; CHECK: lay [[NEWDEST:%r[1-5]]], 4332(%r15)
+; CHECK: mvc 0(256,[[NEWDEST]]), 2356(%r15)
+; CHECK: mvc 256(256,[[NEWDEST]]), 2612(%r15)
+; CHECK: mvc 512(256,[[NEWDEST]]), 2868(%r15)
+; CHECK: mvc 768(255,[[NEWDEST]]), 3124(%r15)
+; CHECK: brasl %r14, foo@PLT
+; CHECK: br %r14
+  %arr = alloca [6000 x i8]
+  %dest = getelementptr [6000 x i8] *%arr, i64 0, i64 3900
+  %src = getelementptr [6000 x i8] *%arr, i64 0, i64 1924
+  call void @foo(i8 *%dest, i8 *%src)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+                                       i1 false)
+  call void @foo(i8 *%dest, i8 *%src)
+  ret void
+}
+
+; ...and again with a source frame base that goes out of range.
+define void @f13() {
+; CHECK-LABEL: f13:
+; CHECK: brasl %r14, foo@PLT
+; CHECK: mvc 200(256,%r15), 3826(%r15)
+; CHECK: mvc 456(256,%r15), 4082(%r15)
+; CHECK: lay [[NEWSRC:%r[1-5]]], 4338(%r15)
+; CHECK: mvc 712(256,%r15), 0([[NEWSRC]])
+; CHECK: mvc 968(256,%r15), 256([[NEWSRC]])
+; CHECK: mvc 1224(255,%r15), 512([[NEWSRC]])
+; CHECK: brasl %r14, foo@PLT
+; CHECK: br %r14
+  %arr = alloca [6000 x i8]
+  %dest = getelementptr [6000 x i8] *%arr, i64 0, i64 24
+  %src = getelementptr [6000 x i8] *%arr, i64 0, i64 3650
+  call void @foo(i8 *%dest, i8 *%src)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+                                       i1 false)
+  call void @foo(i8 *%dest, i8 *%src)
+  ret void
+}
+
+; Test the last case that is done using straight-line code.
+define void @f14(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f14:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: mvc 512(256,%r2), 512(%r3)
+; CHECK: mvc 768(256,%r2), 768(%r3)
+; CHECK: mvc 1024(256,%r2), 1024(%r3)
+; CHECK: mvc 1280(256,%r2), 1280(%r3)
+; CHECK: br %r14
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1536, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; Test the first case that is done using a loop.
+define void @f15(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f15:
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 768(%r2)
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: la %r2, 256(%r2)
+; CHECK: la %r3, 256(%r3)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 0(1,%r2), 0(%r3)
+; CHECK: br %r14
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1537, i32 1,
+                                       i1 false)
+  ret void
+}
+
+; ...and again with frame bases, where the base must be loaded into a
+; register before the loop.
+define void @f16() {
+; CHECK-LABEL: f16:
+; CHECK: brasl %r14, foo@PLT
+; CHECK-DAG: lghi [[COUNT:%r[0-5]]], 6
+; CHECK-DAG: la [[BASE:%r[0-5]]], 160(%r15)
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 2368([[BASE]])
+; CHECK: mvc 1600(256,[[BASE]]), 0([[BASE]])
+; CHECK: la [[BASE]], 256([[BASE]])
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 1600(1,[[BASE]]), 0([[BASE]])
+; CHECK: brasl %r14, foo@PLT
 ; CHECK: br %r14
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 257, i32 1,
+  %arr = alloca [3200 x i8]
+  %dest = getelementptr [3200 x i8] *%arr, i64 0, i64 1600
+  %src = getelementptr [3200 x i8] *%arr, i64 0, i64 0
+  call void @foo(i8 *%dest, i8 *%src)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1537, i32 1,
                                        i1 false)
+  call void @foo(i8 *%dest, i8 *%src)
   ret void
 }
diff --git a/test/CodeGen/SystemZ/memcpy-02.ll b/test/CodeGen/SystemZ/memcpy-02.ll
index 83b2cd8..2b01091 100644
--- a/test/CodeGen/SystemZ/memcpy-02.ll
+++ b/test/CodeGen/SystemZ/memcpy-02.ll
@@ -2,11 +2,14 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
-@g1 = global i8 1
-@g2 = global i16 2
+@g1src = global i8 1
+@g1dst = global i8 1
+@g2src = global i16 2
+@g2dst = global i16 2
 @g3 = global i32 3
 @g4 = global i64 4
-@g5 = external global fp128, align 16
+@g5src = external global fp128, align 16
+@g5dst = external global fp128, align 16
 
 ; Test the simple i8 case.
 define void @f1(i8 *%ptr1) {
@@ -237,18 +240,19 @@ define void @f19(i64 *%ptr1) {
   ret void
 }
 
-; Test that MVC is used for aligned loads and stores, even if there is
-; no way of telling whether they alias.
+; Test that MVC is not used for aligned loads and stores if there is
+; no way of telling whether they alias.  We don't want to use MVC in
+; cases where the addresses could be equal.
 define void @f20(i64 *%ptr1, i64 *%ptr2) {
 ; CHECK-LABEL: f20:
-; CHECK: mvc 0(8,%r3), 0(%r2)
+; CHECK-NOT: mvc
 ; CHECK: br %r14
   %val = load i64 *%ptr1
   store i64 %val, i64 *%ptr2
   ret void
 }
 
-; ...but if the loads aren't aligned, we can't be sure.
+; ...and again for unaligned loads and stores.
 define void @f21(i64 *%ptr1, i64 *%ptr2) {
 ; CHECK-LABEL: f21:
 ; CHECK-NOT: mvc
@@ -274,50 +278,29 @@ define void @f22(i64 %base) {
 ; Test that we can use MVC for global addresses for i8.
 define void @f23(i8 *%ptr) {
 ; CHECK-LABEL: f23:
-; CHECK: larl [[REG:%r[0-5]]], g1
-; CHECK: mvc 0(1,%r2), 0([[REG]])
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g1src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g1dst
+; CHECK: mvc 0(1,[[DST]]), 0([[SRC]])
 ; CHECK: br %r14
-  %val = load i8 *@g1
-  store i8 %val, i8 *%ptr
+  %val = load i8 *@g1src
+  store i8 %val, i8 *@g1dst
   ret void
 }
 
-; ...and again with the global on the store.
-define void @f24(i8 *%ptr) {
+; Test that we use LHRL and STHRL for i16.
+define void @f24(i16 *%ptr) {
 ; CHECK-LABEL: f24:
-; CHECK: larl [[REG:%r[0-5]]], g1
-; CHECK: mvc 0(1,[[REG]]), 0(%r2)
-; CHECK: br %r14
-  %val = load i8 *%ptr
-  store i8 %val, i8 *@g1
-  ret void
-}
-
-; Test that we use LHRL for i16.
-define void @f25(i16 *%ptr) {
-; CHECK-LABEL: f25:
-; CHECK: lhrl [[REG:%r[0-5]]], g2
-; CHECK: sth [[REG]], 0(%r2)
+; CHECK: lhrl [[REG:%r[0-5]]], g2src
+; CHECK: sthrl [[REG]], g2dst
 ; CHECK: br %r14
-  %val = load i16 *@g2
-  store i16 %val, i16 *%ptr
-  ret void
-}
-
-; ...likewise STHRL.
-define void @f26(i16 *%ptr) {
-; CHECK-LABEL: f26:
-; CHECK: lh [[REG:%r[0-5]]], 0(%r2)
-; CHECK: sthrl [[REG]], g2
-; CHECK: br %r14
-  %val = load i16 *%ptr
-  store i16 %val, i16 *@g2
+  %val = load i16 *@g2src
+  store i16 %val, i16 *@g2dst
   ret void
 }
 
 ; Test that we use LRL for i32.
-define void @f27(i32 *%ptr) {
-; CHECK-LABEL: f27:
+define void @f25(i32 *%ptr) {
+; CHECK-LABEL: f25:
 ; CHECK: lrl [[REG:%r[0-5]]], g3
 ; CHECK: st [[REG]], 0(%r2)
 ; CHECK: br %r14
@@ -327,8 +310,8 @@ define void @f27(i32 *%ptr) {
 }
 
 ; ...likewise STRL.
-define void @f28(i32 *%ptr) {
-; CHECK-LABEL: f28:
+define void @f26(i32 *%ptr) {
+; CHECK-LABEL: f26:
 ; CHECK: l [[REG:%r[0-5]]], 0(%r2)
 ; CHECK: strl [[REG]], g3
 ; CHECK: br %r14
@@ -338,8 +321,8 @@ define void @f28(i32 *%ptr) {
 }
 
 ; Test that we use LGRL for i64.
-define void @f29(i64 *%ptr) {
-; CHECK-LABEL: f29:
+define void @f27(i64 *%ptr) {
+; CHECK-LABEL: f27:
 ; CHECK: lgrl [[REG:%r[0-5]]], g4
 ; CHECK: stg [[REG]], 0(%r2)
 ; CHECK: br %r14
@@ -349,8 +332,8 @@ define void @f29(i64 *%ptr) {
 }
 
 ; ...likewise STGRL.
-define void @f30(i64 *%ptr) {
-; CHECK-LABEL: f30:
+define void @f28(i64 *%ptr) {
+; CHECK-LABEL: f28:
 ; CHECK: lg [[REG:%r[0-5]]], 0(%r2)
 ; CHECK: stgrl [[REG]], g4
 ; CHECK: br %r14
@@ -360,30 +343,20 @@ define void @f30(i64 *%ptr) {
 }
 
 ; Test that we can use MVC for global addresses for fp128.
-define void @f31(fp128 *%ptr) {
-; CHECK-LABEL: f31:
-; CHECK: larl [[REG:%r[0-5]]], g5
-; CHECK: mvc 0(16,%r2), 0([[REG]])
-; CHECK: br %r14
-  %val = load fp128 *@g5, align 16
-  store fp128 %val, fp128 *%ptr, align 16
-  ret void
-}
-
-; ...and again with the global on the store.
-define void @f32(fp128 *%ptr) {
-; CHECK-LABEL: f32:
-; CHECK: larl [[REG:%r[0-5]]], g5
-; CHECK: mvc 0(16,[[REG]]), 0(%r2)
+define void @f29(fp128 *%ptr) {
+; CHECK-LABEL: f29:
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g5src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g5dst
+; CHECK: mvc 0(16,[[DST]]), 0([[SRC]])
 ; CHECK: br %r14
-  %val = load fp128 *%ptr, align 16
-  store fp128 %val, fp128 *@g5, align 16
+  %val = load fp128 *@g5src, align 16
+  store fp128 %val, fp128 *@g5dst, align 16
   ret void
 }
 
 ; Test a case where offset disambiguation is enough.
-define void @f33(i64 *%ptr1) {
-; CHECK-LABEL: f33:
+define void @f30(i64 *%ptr1) {
+; CHECK-LABEL: f30:
 ; CHECK: mvc 8(8,%r2), 0(%r2)
 ; CHECK: br %r14
   %ptr2 = getelementptr i64 *%ptr1, i64 1
@@ -393,8 +366,8 @@ define void @f33(i64 *%ptr1) {
 }
 
 ; Test f21 in cases where TBAA tells us there is no alias.
-define void @f34(i64 *%ptr1, i64 *%ptr2) {
-; CHECK-LABEL: f34:
+define void @f31(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f31:
 ; CHECK: mvc 0(8,%r3), 0(%r2)
 ; CHECK: br %r14
   %val = load i64 *%ptr1, align 2, !tbaa !1
@@ -403,8 +376,8 @@ define void @f34(i64 *%ptr1, i64 *%ptr2) {
 }
 
 ; Test f21 in cases where TBAA is present but doesn't help.
-define void @f35(i64 *%ptr1, i64 *%ptr2) {
-; CHECK-LABEL: f35:
+define void @f32(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f32:
 ; CHECK-NOT: mvc
 ; CHECK: br %r14
   %val = load i64 *%ptr1, align 2, !tbaa !1
@@ -413,5 +386,7 @@ define void @f35(i64 *%ptr1, i64 *%ptr2) {
 }
 
 !0 = metadata !{ metadata !"root" }
-!1 = metadata !{ metadata !"set1", metadata !0 }
-!2 = metadata !{ metadata !"set2", metadata !0 }
+!1 = metadata !{ metadata !3, metadata !3, i64 0 }
+!2 = metadata !{ metadata !4, metadata !4, i64 0 }
+!3 = metadata !{ metadata !"set1", metadata !0 }
+!4 = metadata !{ metadata !"set2", metadata !0 }
diff --git a/test/CodeGen/SystemZ/memset-01.ll b/test/CodeGen/SystemZ/memset-01.ll
index b272a5b..f17901c 100644
--- a/test/CodeGen/SystemZ/memset-01.ll
+++ b/test/CodeGen/SystemZ/memset-01.ll
@@ -103,22 +103,58 @@ define void @f10(i8 *%dest, i8 %val) {
   ret void
 }
 
-; 258 bytes, i32 version.  258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version.  We need two MVCs.
 define void @f11(i8 *%dest, i8 %val) {
 ; CHECK-LABEL: f11:
-; CHECK-NOT: mvc
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8 *%dest, i8 %val, i32 258, i32 1, i1 false)
   ret void
 }
 
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
 define void @f12(i8 *%dest, i8 %val) {
 ; CHECK-LABEL: f12:
-; CHECK-NOT: mvc
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 258, i32 1, i1 false)
   ret void
 }
+
+; Test the largest case for which straight-line code is used.
+define void @f13(i8 *%dest, i8 %val) {
+; CHECK-LABEL: f13:
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(256,%r2), 256(%r2)
+; CHECK: mvc 513(256,%r2), 512(%r2)
+; CHECK: mvc 769(256,%r2), 768(%r2)
+; CHECK: mvc 1025(256,%r2), 1024(%r2)
+; CHECK: mvc 1281(256,%r2), 1280(%r2)
+; CHECK: br %r14
+  call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 1537, i32 1,
+                                  i1 false)
+  ret void
+}
+
+; Test the next size up, which uses a loop.  We leave the other corner
+; cases to memcpy-01.ll.
+define void @f14(i8 *%dest, i8 %val) {
+; CHECK-LABEL: f14:
+; CHECK: stc %r3, 0(%r2)
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 769(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: la %r2, 256(%r2)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 1538, i32 1,
+                                  i1 false)
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/memset-02.ll b/test/CodeGen/SystemZ/memset-02.ll
index b74d907..b4724c0 100644
--- a/test/CodeGen/SystemZ/memset-02.ll
+++ b/test/CodeGen/SystemZ/memset-02.ll
@@ -139,21 +139,23 @@ define void @f14(i8 *%dest) {
   ret void
 }
 
-; 258 bytes, i32 version.  258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version.  We need two MVCs.
 define void @f15(i8 *%dest) {
 ; CHECK-LABEL: f15:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 128
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8 *%dest, i8 128, i32 258, i32 1, i1 false)
   ret void
 }
 
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
 define void @f16(i8 *%dest) {
 ; CHECK-LABEL: f16:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 128
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8 *%dest, i8 128, i64 258, i32 1, i1 false)
   ret void
diff --git a/test/CodeGen/SystemZ/memset-03.ll b/test/CodeGen/SystemZ/memset-03.ll
index 1d48f1a..a95f89f 100644
--- a/test/CodeGen/SystemZ/memset-03.ll
+++ b/test/CodeGen/SystemZ/memset-03.ll
@@ -140,8 +140,7 @@ define void @f14(i8 *%dest) {
 ; 7 bytes, i32 version.
 define void @f15(i8 *%dest) {
 ; CHECK-LABEL: f15:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(6,%r2), 0(%r2)
+; CHECK: xc 0(7,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 7, i32 1, i1 false)
   ret void
@@ -150,8 +149,7 @@ define void @f15(i8 *%dest) {
 ; 7 bytes, i64 version.
 define void @f16(i8 *%dest) {
 ; CHECK-LABEL: f16:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(6,%r2), 0(%r2)
+; CHECK: xc 0(7,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 7, i32 1, i1 false)
   ret void
@@ -218,8 +216,7 @@ define void @f22(i8 *%dest) {
 ; 11 bytes, i32 version.
 define void @f23(i8 *%dest) {
 ; CHECK-LABEL: f23:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(10,%r2), 0(%r2)
+; CHECK: xc 0(11,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 11, i32 1, i1 false)
   ret void
@@ -228,8 +225,7 @@ define void @f23(i8 *%dest) {
 ; 11 bytes, i64 version.
 define void @f24(i8 *%dest) {
 ; CHECK-LABEL: f24:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(10,%r2), 0(%r2)
+; CHECK: xc 0(11,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 11, i32 1, i1 false)
   ret void
@@ -258,8 +254,7 @@ define void @f26(i8 *%dest) {
 ; 13 bytes, i32 version.
 define void @f27(i8 *%dest) {
 ; CHECK-LABEL: f27:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(12,%r2), 0(%r2)
+; CHECK: xc 0(13,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 13, i32 1, i1 false)
   ret void
@@ -268,8 +263,7 @@ define void @f27(i8 *%dest) {
 ; 13 bytes, i64 version.
 define void @f28(i8 *%dest) {
 ; CHECK-LABEL: f28:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(12,%r2), 0(%r2)
+; CHECK: xc 0(13,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 13, i32 1, i1 false)
   ret void
@@ -278,8 +272,7 @@ define void @f28(i8 *%dest) {
 ; 14 bytes, i32 version.
 define void @f29(i8 *%dest) {
 ; CHECK-LABEL: f29:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(13,%r2), 0(%r2)
+; CHECK: xc 0(14,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 14, i32 1, i1 false)
   ret void
@@ -288,8 +281,7 @@ define void @f29(i8 *%dest) {
 ; 14 bytes, i64 version.
 define void @f30(i8 *%dest) {
 ; CHECK-LABEL: f30:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(13,%r2), 0(%r2)
+; CHECK: xc 0(14,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 14, i32 1, i1 false)
   ret void
@@ -298,8 +290,7 @@ define void @f30(i8 *%dest) {
 ; 15 bytes, i32 version.
 define void @f31(i8 *%dest) {
 ; CHECK-LABEL: f31:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(14,%r2), 0(%r2)
+; CHECK: xc 0(15,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 15, i32 1, i1 false)
   ret void
@@ -308,8 +299,7 @@ define void @f31(i8 *%dest) {
 ; 15 bytes, i64 version.
 define void @f32(i8 *%dest) {
 ; CHECK-LABEL: f32:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(14,%r2), 0(%r2)
+; CHECK: xc 0(15,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 15, i32 1, i1 false)
   ret void
@@ -338,8 +328,7 @@ define void @f34(i8 *%dest) {
 ; 17 bytes, i32 version.
 define void @f35(i8 *%dest) {
 ; CHECK-LABEL: f35:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(16,%r2), 0(%r2)
+; CHECK: xc 0(17,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 17, i32 1, i1 false)
   ret void
@@ -348,49 +337,46 @@ define void @f35(i8 *%dest) {
 ; 17 bytes, i64 version.
 define void @f36(i8 *%dest) {
 ; CHECK-LABEL: f36:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(16,%r2), 0(%r2)
+; CHECK: xc 0(17,%r2), 0(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 17, i32 1, i1 false)
   ret void
 }
 
-; 257 bytes, i32 version.
+; 256 bytes, i32 version.
 define void @f37(i8 *%dest) {
 ; CHECK-LABEL: f37:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: xc 0(256,%r2), 0(%r2)
 ; CHECK: br %r14
-  call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 257, i32 1, i1 false)
+  call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 256, i32 1, i1 false)
   ret void
 }
 
-; 257 bytes, i64 version.
+; 256 bytes, i64 version.
 define void @f38(i8 *%dest) {
 ; CHECK-LABEL: f38:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: xc 0(256,%r2), 0(%r2)
 ; CHECK: br %r14
-  call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 257, i32 1, i1 false)
+  call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 256, i32 1, i1 false)
   ret void
 }
 
-; 258 bytes, i32 version.  258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 257 bytes, i32 version.  We need two MVCs.
 define void @f39(i8 *%dest) {
 ; CHECK-LABEL: f39:
-; CHECK-NOT: mvc
+; CHECK: xc 0(256,%r2), 0(%r2)
+; CHECK: xc 256(1,%r2), 256(%r2)
 ; CHECK: br %r14
-  call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 258, i32 1, i1 false)
+  call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 257, i32 1, i1 false)
   ret void
 }
 
-; 258 bytes, i64 version, with the same comments as above.
+; 257 bytes, i64 version.
 define void @f40(i8 *%dest) {
 ; CHECK-LABEL: f40:
-; CHECK-NOT: mvc
+; CHECK: xc 0(256,%r2), 0(%r2)
+; CHECK: xc 256(1,%r2), 256(%r2)
 ; CHECK: br %r14
-  call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 258, i32 1, i1 false)
+  call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 257, i32 1, i1 false)
   ret void
 }
diff --git a/test/CodeGen/SystemZ/memset-04.ll b/test/CodeGen/SystemZ/memset-04.ll
index 9288692..7906e8d 100644
--- a/test/CodeGen/SystemZ/memset-04.ll
+++ b/test/CodeGen/SystemZ/memset-04.ll
@@ -375,21 +375,23 @@ define void @f38(i8 *%dest) {
   ret void
 }
 
-; 258 bytes, i32 version.  258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version.  We need two MVCs.
 define void @f39(i8 *%dest) {
 ; CHECK-LABEL: f39:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 255
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i32(i8 *%dest, i8 -1, i32 258, i32 1, i1 false)
   ret void
 }
 
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
 define void @f40(i8 *%dest) {
 ; CHECK-LABEL: f40:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 255
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
 ; CHECK: br %r14
   call void @llvm.memset.p0i8.i64(i8 *%dest, i8 -1, i64 258, i32 1, i1 false)
   ret void
diff --git a/test/CodeGen/SystemZ/or-08.ll b/test/CodeGen/SystemZ/or-08.ll
new file mode 100644
index 0000000..8f5bf31
--- /dev/null
+++ b/test/CodeGen/SystemZ/or-08.ll
@@ -0,0 +1,57 @@
+; Test memory-to-memory ORs.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test the simple i8 case.
+define void @f1(i8 *%ptr1) {
+; CHECK-LABEL: f1:
+; CHECK: oc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i8 *%ptr1, i64 1
+  %val = load i8 *%ptr1
+  %old = load i8 *%ptr2
+  %or = or i8 %val, %old
+  store i8 %or, i8 *%ptr2
+  ret void
+}
+
+; Test the simple i16 case.
+define void @f2(i16 *%ptr1) {
+; CHECK-LABEL: f2:
+; CHECK: oc 2(2,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i16 *%ptr1, i64 1
+  %val = load i16 *%ptr1
+  %old = load i16 *%ptr2
+  %or = or i16 %val, %old
+  store i16 %or, i16 *%ptr2
+  ret void
+}
+
+; Test the simple i32 case.
+define void @f3(i32 *%ptr1) {
+; CHECK-LABEL: f3:
+; CHECK: oc 4(4,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i32 *%ptr1, i64 1
+  %val = load i32 *%ptr1
+  %old = load i32 *%ptr2
+  %or = or i32 %old, %val
+  store i32 %or, i32 *%ptr2
+  ret void
+}
+
+; Test the i64 case.
+define void @f4(i64 *%ptr1) {
+; CHECK-LABEL: f4:
+; CHECK: oc 8(8,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i64 *%ptr1, i64 1
+  %val = load i64 *%ptr1
+  %old = load i64 *%ptr2
+  %or = or i64 %old, %val
+  store i64 %or, i64 *%ptr2
+  ret void
+}
+
+; Leave other more complicated tests to and-08.ll.
diff --git a/test/CodeGen/SystemZ/prefetch-01.ll b/test/CodeGen/SystemZ/prefetch-01.ll
new file mode 100644
index 0000000..bb7fea9
--- /dev/null
+++ b/test/CodeGen/SystemZ/prefetch-01.ll
@@ -0,0 +1,87 @@
+; Test data prefetching.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare void @llvm.prefetch(i8*, i32, i32, i32)
+
+@g = global [4096 x i8] zeroinitializer
+
+; Check that instruction read prefetches are ignored.
+define void @f1(i8 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: %r2
+; CHECK: br %r14
+  call void @llvm.prefetch(i8 *%ptr, i32 0, i32 0, i32 0)
+  ret void
+}
+
+; Check that instruction write prefetches are ignored.
+define void @f2(i8 *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK-NOT: %r2
+; CHECK: br %r14
+  call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 0)
+  ret void
+}
+
+; Check data read prefetches.
+define void @f3(i8 *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK: pfd 1, 0(%r2)
+; CHECK: br %r14
+  call void @llvm.prefetch(i8 *%ptr, i32 0, i32 0, i32 1)
+  ret void
+}
+
+; Check data write prefetches.
+define void @f4(i8 *%ptr) {
+; CHECK-LABEL: f4:
+; CHECK: pfd 2, 0(%r2)
+; CHECK: br %r14
+  call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 1)
+  ret void
+}
+
+; Check an address at the negative end of the range.
+define void @f5(i8 *%base, i64 %index) {
+; CHECK-LABEL: f5:
+; CHECK: pfd 2, -524288({{%r2,%r3|%r3,%r2}})
+; CHECK: br %r14
+  %add = add i64 %index, -524288
+  %ptr = getelementptr i8 *%base, i64 %add
+  call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 1)
+  ret void
+}
+
+; Check an address at the positive end of the range.
+define void @f6(i8 *%base, i64 %index) {
+; CHECK-LABEL: f6:
+; CHECK: pfd 2, 524287({{%r2,%r3|%r3,%r2}})
+; CHECK: br %r14
+  %add = add i64 %index, 524287
+  %ptr = getelementptr i8 *%base, i64 %add
+  call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 1)
+  ret void
+}
+
+; Check that the next address up still compiles.
+define void @f7(i8 *%base, i64 %index) {
+; CHECK-LABEL: f7:
+; CHECK: 524288
+; CHECK: pfd 2,
+; CHECK: br %r14
+  %add = add i64 %index, 524288
+  %ptr = getelementptr i8 *%base, i64 %add
+  call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 1)
+  ret void
+}
+
+; Check pc-relative prefetches.
+define void @f8() {
+; CHECK-LABEL: f8:
+; CHECK: pfdrl 2, g
+; CHECK: br %r14
+  %ptr = getelementptr [4096 x i8] *@g, i64 0, i64 0
+  call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 1)
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/risbg-01.ll b/test/CodeGen/SystemZ/risbg-01.ll
index 85de6dc..a4d11fd 100644
--- a/test/CodeGen/SystemZ/risbg-01.ll
+++ b/test/CodeGen/SystemZ/risbg-01.ll
@@ -1,6 +1,7 @@
 ; Test sequences that can use RISBG with a zeroed first operand.
+; The tests here assume that RISBLG isn't available.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 ; Test an extraction of bit 0 from a right-shifted value.
 define i32 @f1(i32 %foo) {
@@ -455,3 +456,17 @@ define i64 @f40(i64 %foo, i64 *%dest) {
   %and = and i64 %shl, 2147483647
   ret i64 %and
 }
+
+; In this case the sign extension is converted to a pair of 32-bit shifts,
+; which is then extended to 64 bits.  We previously used the wrong bit size
+; when testing whether the shifted-in bits of the shift right were significant.
+define i64 @f41(i1 %x) {
+; CHECK-LABEL: f41:
+; CHECK: sll %r2, 31
+; CHECK: sra %r2, 31
+; CHECK: llgcr %r2, %r2
+; CHECK: br %r14
+  %ext = sext i1 %x to i8
+  %ext2 = zext i8 %ext to i64
+  ret i64 %ext2
+}
diff --git a/test/CodeGen/SystemZ/setcc-01.ll b/test/CodeGen/SystemZ/setcc-01.ll
new file mode 100644
index 0000000..4626760
--- /dev/null
+++ b/test/CodeGen/SystemZ/setcc-01.ll
@@ -0,0 +1,74 @@
+; Test SETCC for every integer condition.  The tests here assume that
+; RISBLG isn't available.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+
+; Test CC in { 0 }, with 3 don't care.
+define i32 @f1(i32 %a, i32 %b) {
+; CHECK-LABEL: f1:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = icmp eq i32 %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 1 }, with 3 don't care.
+define i32 @f2(i32 %a, i32 %b) {
+; CHECK-LABEL: f2:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 36
+; CHECK: br %r14
+  %cond = icmp slt i32 %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 0, 1 }, with 3 don't care.
+define i32 @f3(i32 %a, i32 %b) {
+; CHECK-LABEL: f3:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -536870912
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = icmp sle i32 %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 2 }, with 3 don't care.
+define i32 @f4(i32 %a, i32 %b) {
+; CHECK-LABEL: f4:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 35
+; CHECK: br %r14
+  %cond = icmp sgt i32 %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 0, 2 }, with 3 don't care.
+define i32 @f5(i32 %a, i32 %b) {
+; CHECK-LABEL: f5:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: xilf [[REG]], 4294967295
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 36
+; CHECK: br %r14
+  %cond = icmp sge i32 %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 1, 2 }, with 3 don't care.
+define i32 @f6(i32 %a, i32 %b) {
+; CHECK-LABEL: f6:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 1879048192
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = icmp ne i32 %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/setcc-02.ll b/test/CodeGen/SystemZ/setcc-02.ll
new file mode 100644
index 0000000..6a7be47
--- /dev/null
+++ b/test/CodeGen/SystemZ/setcc-02.ll
@@ -0,0 +1,174 @@
+; Test SETCC for every floating-point condition.  The tests here assume that
+; RISBLG isn't available.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+
+; Test CC in { 0 }
+define i32 @f1(float %a, float %b) {
+; CHECK-LABEL: f1:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = fcmp oeq float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 1 }
+define i32 @f2(float %a, float %b) {
+; CHECK-LABEL: f2:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = fcmp olt float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 0, 1 }
+define i32 @f3(float %a, float %b) {
+; CHECK-LABEL: f3:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -536870912
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = fcmp ole float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 2 }
+define i32 @f4(float %a, float %b) {
+; CHECK-LABEL: f4:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, 1342177280
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = fcmp ogt float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 0, 2 }
+define i32 @f5(float %a, float %b) {
+; CHECK-LABEL: f5:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: xilf [[REG]], 4294967295
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 36
+; CHECK: br %r14
+  %cond = fcmp oge float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 1, 2 }
+define i32 @f6(float %a, float %b) {
+; CHECK-LABEL: f6:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], 268435456
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 35
+; CHECK: br %r14
+  %cond = fcmp one float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 0, 1, 2 }
+define i32 @f7(float %a, float %b) {
+; CHECK-LABEL: f7:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -805306368
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = fcmp ord float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 3 }
+define i32 @f8(float %a, float %b) {
+; CHECK-LABEL: f8:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 1342177280
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = fcmp uno float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 0, 3 }
+define i32 @f9(float %a, float %b) {
+; CHECK-LABEL: f9:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], -268435456
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 35
+; CHECK: br %r14
+  %cond = fcmp ueq float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 1, 3 }
+define i32 @f10(float %a, float %b) {
+; CHECK-LABEL: f10:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 36
+; CHECK: br %r14
+  %cond = fcmp ult float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 0, 1, 3 }
+define i32 @f11(float %a, float %b) {
+; CHECK-LABEL: f11:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, -805306368
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = fcmp ule float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 2, 3 }
+define i32 @f12(float %a, float %b) {
+; CHECK-LABEL: f12:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 35
+; CHECK: br %r14
+  %cond = fcmp ugt float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 0, 2, 3 }
+define i32 @f13(float %a, float %b) {
+; CHECK-LABEL: f13:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, 1879048192
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = fcmp uge float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
+
+; Test CC in { 1, 2, 3 }
+define i32 @f14(float %a, float %b) {
+; CHECK-LABEL: f14:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 1879048192
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+  %cond = fcmp une float %a, %b
+  %res = zext i1 %cond to i32
+  ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/shift-10.ll b/test/CodeGen/SystemZ/shift-10.ll
new file mode 100644
index 0000000..46ed218
--- /dev/null
+++ b/test/CodeGen/SystemZ/shift-10.ll
@@ -0,0 +1,78 @@
+; Test compound shifts.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test a shift right followed by a sign extension.  This can use two shifts.
+define i64 @f1(i32 %a) {
+; CHECK-LABEL: f1:
+; CHECK: sllg [[REG:%r[0-5]]], %r2, 62
+; CHECK: srag %r2, [[REG]], 63
+; CHECK: br %r14
+  %shr = lshr i32 %a, 1
+  %trunc = trunc i32 %shr to i1
+  %ext = sext i1 %trunc to i64
+  ret i64 %ext
+}
+
+; ...and again with the highest shift count.
+define i64 @f2(i32 %a) {
+; CHECK-LABEL: f2:
+; CHECK: sllg [[REG:%r[0-5]]], %r2, 32
+; CHECK: srag %r2, [[REG]], 63
+; CHECK: br %r14
+  %shr = lshr i32 %a, 31
+  %trunc = trunc i32 %shr to i1
+  %ext = sext i1 %trunc to i64
+  ret i64 %ext
+}
+
+; Test a left shift that of an extended right shift in a case where folding
+; is possible.
+define i64 @f3(i32 %a) {
+; CHECK-LABEL: f3:
+; CHECK: risbg %r2, %r2, 27, 181, 9
+; CHECK: br %r14
+  %shr = lshr i32 %a, 1
+  %ext = zext i32 %shr to i64
+  %shl = shl i64 %ext, 10
+  %and = and i64 %shl, 137438952960
+  ret i64 %and
+}
+
+; ...and again with a larger right shift.
+define i64 @f4(i32 %a) {
+; CHECK-LABEL: f4:
+; CHECK: risbg %r2, %r2, 30, 158, 3
+; CHECK: br %r14
+  %shr = lshr i32 %a, 30
+  %ext = sext i32 %shr to i64
+  %shl = shl i64 %ext, 33
+  %and = and i64 %shl, 8589934592
+  ret i64 %and
+}
+
+; Repeat the previous test in a case where all bits outside the
+; bottom 3 matter.
+define i64 @f5(i32 %a) {
+; CHECK-LABEL: f5:
+; CHECK: risbg %r2, %r2, 29, 158, 3
+; CHECK: lhi %r2, 7
+; CHECK: br %r14
+  %shr = lshr i32 %a, 30
+  %ext = sext i32 %shr to i64
+  %shl = shl i64 %ext, 33
+  %or = or i64 %shl, 7
+  ret i64 %or
+}
+
+; Test that SRA gets replaced with SRL if the sign bit is the only one
+; that matters.
+define i64 @f6(i64 %a) {
+; CHECK-LABEL: f6:
+; CHECK: risbg %r2, %r2, 55, 183, 19
+; CHECK: br %r14
+  %shl = shl i64 %a, 10
+  %shr = ashr i64 %shl, 60
+  %and = and i64 %shr, 256
+  ret i64 %and
+}
diff --git a/test/CodeGen/SystemZ/spill-01.ll b/test/CodeGen/SystemZ/spill-01.ll
index 9de89d6..ca64a88 100644
--- a/test/CodeGen/SystemZ/spill-01.ll
+++ b/test/CodeGen/SystemZ/spill-01.ll
@@ -1,6 +1,7 @@
-; Test spilling using MVC.
+; Test spilling using MVC.  The tests here assume z10 register pressure,
+; without the high words being available.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
 
 declare void @foo()
 
diff --git a/test/CodeGen/SystemZ/strcmp-01.ll b/test/CodeGen/SystemZ/strcmp-01.ll
new file mode 100644
index 0000000..122c160
--- /dev/null
+++ b/test/CodeGen/SystemZ/strcmp-01.ll
@@ -0,0 +1,70 @@
+; Test strcmp using CLST, i32 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare signext i32 @strcmp(i8 *%src1, i8 *%src2)
+
+; Check a case where the result is used as an integer.
+define i32 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll %r2, [[REG]], 31
+; CHECK: br %r14
+  %res = call i32 @strcmp(i8 *%src1, i8 *%src2)
+  ret i32 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f2(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f2:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: je {{\.L.*}}
+; CHECK: br %r14
+  %res = call i32 @strcmp(i8 *%src1, i8 *%src2)
+  %cmp = icmp eq i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Test a case where the result is used both as an integer and for
+; branching.
+define i32 @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll %r2, [[REG]], 31
+; CHECK: jl {{\.L*}}
+; CHECK: br %r14
+entry:
+  %res = call i32 @strcmp(i8 *%src1, i8 *%src2)
+  %cmp = icmp slt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, i32 *%dest
+  br label %exit
+
+exit:
+  ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/strcmp-02.ll b/test/CodeGen/SystemZ/strcmp-02.ll
new file mode 100644
index 0000000..27bd00b
--- /dev/null
+++ b/test/CodeGen/SystemZ/strcmp-02.ll
@@ -0,0 +1,72 @@
+; Test strcmp using CLST, i64 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @strcmp(i8 *%src1, i8 *%src2)
+
+; Check a case where the result is used as an integer.
+define i64 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll [[REG]], [[REG]], 31
+; CHECK: lgfr %r2, [[REG]]
+; CHECK: br %r14
+  %res = call i64 @strcmp(i8 *%src1, i8 *%src2)
+  ret i64 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f2(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f2:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: je {{\.L.*}}
+; CHECK: br %r14
+  %res = call i64 @strcmp(i8 *%src1, i8 *%src2)
+  %cmp = icmp eq i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Test a case where the result is used both as an integer and for
+; branching.
+define i64 @f3(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll [[REG]], [[REG]], 31
+; CHECK: lgfr %r2, [[REG]]
+; CHECK: jl {{\.L*}}
+; CHECK: br %r14
+entry:
+  %res = call i64 @strcmp(i8 *%src1, i8 *%src2)
+  %cmp = icmp slt i64 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i64 0, i64 *%dest
+  br label %exit
+
+exit:
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/strcpy-01.ll b/test/CodeGen/SystemZ/strcpy-01.ll
new file mode 100644
index 0000000..29bab62
--- /dev/null
+++ b/test/CodeGen/SystemZ/strcpy-01.ll
@@ -0,0 +1,50 @@
+; Test strcpy using MVST.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i8 *@strcpy(i8 *%dest, i8 *%src)
+declare i8 *@stpcpy(i8 *%dest, i8 *%src)
+
+; Check strcpy.
+define i8 *@f1(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lgr [[REG:%r[145]]], %r2
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: mvst [[REG]], %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NOT: %r2
+; CHECK: br %r14
+  %res = call i8 *@strcpy(i8 *%dest, i8 *%src)
+  ret i8 *%res
+}
+
+; Check stpcpy.
+define i8 *@f2(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: mvst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NOT: %r2
+; CHECK: br %r14
+  %res = call i8 *@stpcpy(i8 *%dest, i8 *%src)
+  ret i8 *%res
+}
+
+; Check correct operation with other loads and stores.  The load must
+; come before the loop and the store afterwards.
+define i32 @f3(i32 %dummy, i8 *%dest, i8 *%src, i32 *%resptr, i32 *%storeptr) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: l %r2, 0(%r5)
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: mvst %r3, %r4
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK: mvhi 0(%r6), 0
+; CHECK: br %r14
+  %res = load i32 *%resptr
+  %unused = call i8 *@strcpy(i8 *%dest, i8 *%src)
+  store i32 0, i32 *%storeptr
+  ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/strlen-01.ll b/test/CodeGen/SystemZ/strlen-01.ll
new file mode 100644
index 0000000..16161d4
--- /dev/null
+++ b/test/CodeGen/SystemZ/strlen-01.ll
@@ -0,0 +1,39 @@
+; Test strlen using SRST, i64 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @strlen(i8 *%src)
+declare i64 @strnlen(i8 *%src, i64 %len)
+
+; Test strlen with its proper i64 prototype.  It would also be valid for
+; the uses of %r3 and REG after the LGR to be swapped.
+define i64 @f1(i32 %dummy, i8 *%src) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lghi %r2, 0
+; CHECK-DAG: lgr [[REG:%r[145]]], %r3
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: srst %r2, [[REG]]
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: sgr %r2, %r3
+; CHECK: br %r14
+  %res = call i64 @strlen(i8 *%src)
+  ret i64 %res
+}
+
+; Test strnlen with its proper i64 prototype.
+define i64 @f2(i64 %len, i8 *%src) {
+; CHECK-LABEL: f2:
+; CHECK-DAG: agr %r2, %r3
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lgr [[REG:%r[145]]], %r3
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: srst %r2, [[REG]]
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: sgr %r2, %r3
+; CHECK: br %r14
+  %res = call i64 @strnlen(i8 *%src, i64 %len)
+  ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/strlen-02.ll b/test/CodeGen/SystemZ/strlen-02.ll
new file mode 100644
index 0000000..e1abbff
--- /dev/null
+++ b/test/CodeGen/SystemZ/strlen-02.ll
@@ -0,0 +1,39 @@
+; Test strlen using SRST, i32 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i32 @strlen(i8 *%src)
+declare i32 @strnlen(i8 *%src, i32 %len)
+
+; Test strlen with an i32-based prototype.  It would also be valid for
+; the uses of %r3 and REG after the LGR to be swapped.
+define i32 @f1(i32 %dummy, i8 *%src) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lghi %r2, 0
+; CHECK-DAG: lgr [[REG:%r[145]]], %r3
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: srst %r2, [[REG]]
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: sgr %r2, %r3
+; CHECK: br %r14
+  %res = call i32 @strlen(i8 *%src)
+  ret i32 %res
+}
+
+; Test strnlen with an i32-based prototype.
+define i32 @f2(i32 zeroext %len, i8 *%src) {
+; CHECK-LABEL: f2:
+; CHECK-DAG: agr %r2, %r3
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lgr [[REG:%r[145]]], %r3
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: srst %r2, [[REG]]
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: sgr %r2, %r3
+; CHECK: br %r14
+  %res = call i32 @strnlen(i8 *%src, i32 %len)
+  ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/unaligned-01.ll b/test/CodeGen/SystemZ/unaligned-01.ll
index 621069d..526a068 100644
--- a/test/CodeGen/SystemZ/unaligned-01.ll
+++ b/test/CodeGen/SystemZ/unaligned-01.ll
@@ -1,7 +1,10 @@
 ; Check that unaligned accesses are allowed in general.  We check the
 ; few exceptions (like CRL) in their respective test files.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; FIXME: -combiner-alias-analysis (the default for SystemZ) stops
+;        f1 from being optimized.
+; RUN: llc < %s -mtriple=s390x-linux-gnu -combiner-alias-analysis=false \
+; RUN:   | FileCheck %s
 
 ; Check that these four byte stores become a single word store.
 define void @f1(i8 *%ptr) {
diff --git a/test/CodeGen/SystemZ/xor-08.ll b/test/CodeGen/SystemZ/xor-08.ll
new file mode 100644
index 0000000..8cba41e
--- /dev/null
+++ b/test/CodeGen/SystemZ/xor-08.ll
@@ -0,0 +1,57 @@
+; Test memory-to-memory XORs.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test the simple i8 case.
+define void @f1(i8 *%ptr1) {
+; CHECK-LABEL: f1:
+; CHECK: xc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i8 *%ptr1, i64 1
+  %val = load i8 *%ptr1
+  %old = load i8 *%ptr2
+  %xor = xor i8 %val, %old
+  store i8 %xor, i8 *%ptr2
+  ret void
+}
+
+; Test the simple i16 case.
+define void @f2(i16 *%ptr1) {
+; CHECK-LABEL: f2:
+; CHECK: xc 2(2,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i16 *%ptr1, i64 1
+  %val = load i16 *%ptr1
+  %old = load i16 *%ptr2
+  %xor = xor i16 %val, %old
+  store i16 %xor, i16 *%ptr2
+  ret void
+}
+
+; Test the simple i32 case.
+define void @f3(i32 *%ptr1) {
+; CHECK-LABEL: f3:
+; CHECK: xc 4(4,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i32 *%ptr1, i64 1
+  %val = load i32 *%ptr1
+  %old = load i32 *%ptr2
+  %xor = xor i32 %old, %val
+  store i32 %xor, i32 *%ptr2
+  ret void
+}
+
+; Test the i64 case.
+define void @f4(i64 *%ptr1) {
+; CHECK-LABEL: f4:
+; CHECK: xc 8(8,%r2), 0(%r2)
+; CHECK: br %r14
+  %ptr2 = getelementptr i64 *%ptr1, i64 1
+  %val = load i64 *%ptr1
+  %old = load i64 *%ptr2
+  %xor = xor i64 %old, %val
+  store i64 %xor, i64 *%ptr2
+  ret void
+}
+
+; Leave other more complicated tests to and-08.ll.
diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index f5b3739..b87bf24 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
@@ -47,25 +47,26 @@ declare double @sqrt(double) nounwind readonly
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!5}
+!llvm.module.flags = !{!104}
 !0 = metadata !{i32 46, i32 0, metadata !1, null}
 !1 = metadata !{i32 524299, metadata !101, metadata !2, i32 44, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !2 = metadata !{i32 524299, metadata !101, metadata !3, i32 44, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !3 = metadata !{i32 524334, metadata !101, null, metadata !"getClosestDiagonal3", metadata !"getClosestDiagonal3", metadata !"_Z19getClosestDiagonal3ii", i32 44, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !4 = metadata !{i32 524329, metadata !101} ; [ DW_TAG_file_type ]
 !5 = metadata !{i32 524305, metadata !101, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", i1 true, metadata !"", i32 0, metadata !102, metadata !102, metadata !103, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !22, metadata !22}
-!8 = metadata !{i32 524307, metadata !99, null, metadata !"ggVector3", i32 66, i64 192, i64 32, i64 0, i32 0, null, metadata !10, i32 0, null} ; [ DW_TAG_structure_type ]
+!8 = metadata !{i32 524307, metadata !99, null, metadata !"ggVector3", i32 66, i64 192, i64 32, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [ggVector3] [line 66, size 192, align 32, offset 0] [def] [from ]
 !9 = metadata !{i32 524329, metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src", metadata !5} ; [ DW_TAG_file_type ]
 !99 = metadata !{metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
 !10 = metadata !{metadata !11, metadata !16, metadata !23, metadata !26, metadata !29, metadata !30, metadata !35, metadata !36, metadata !37, metadata !41, metadata !42, metadata !43, metadata !46, metadata !47, metadata !48, metadata !52, metadata !53, metadata !54, metadata !57, metadata !60, metadata !63, metadata !66, metadata !70, metadata !71, metadata !74, metadata !75, metadata !76, metadata !77, metadata !78, metadata !81, metadata !82, metadata !83, metadata !84, metadata !85, metadata !88, metadata !89, metadata !90}
 !11 = metadata !{i32 524301, metadata !99, metadata !8, metadata !"e", i32 160, i64 192, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ]
-!12 = metadata !{i32 524289, metadata !101, metadata !4, metadata !"", i32 0, i64 192, i64 32, i64 0, i32 0, metadata !13, metadata !14, i32 0, null} ; [ DW_TAG_array_type ]
+!12 = metadata !{i32 524289, metadata !101, metadata !4, metadata !"", i32 0, i64 192, i64 32, i64 0, i32 0, metadata !13, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 192, align 32, offset 0] [from double]
 !13 = metadata !{i32 524324, metadata !101, metadata !4, metadata !"double", i32 0, i64 64, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
 !14 = metadata !{metadata !15}
 !15 = metadata !{i32 524321, i64 0, i64 3}        ; [ DW_TAG_subrange_type ]
 !16 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 72, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!17 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null, metadata !19, metadata !20}
 !19 = metadata !{i32 524303, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 64, metadata !8} ; [ DW_TAG_pointer_type ]
 !20 = metadata !{i32 524310, metadata !100, null, metadata !"ggBoolean", i32 478, i64 0, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_typedef ]
@@ -73,69 +74,69 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !100 = metadata !{metadata !"math.h", metadata !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm"}
 !22 = metadata !{i32 524324, metadata !101, metadata !4, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !23 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 73, metadata !24, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!24 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!24 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{null, metadata !19}
 !26 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 74, metadata !27, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!27 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!27 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !28 = metadata !{null, metadata !19, metadata !13, metadata !13, metadata !13}
 !29 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"Set", metadata !"Set", metadata !"_ZN9ggVector33SetEddd", i32 81, metadata !27, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !30 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"x", metadata !"x", metadata !"_ZNK9ggVector31xEv", i32 82, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!31 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !32, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!31 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !32, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !32 = metadata !{metadata !13, metadata !33}
 !33 = metadata !{i32 524303, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 64, metadata !34} ; [ DW_TAG_pointer_type ]
 !34 = metadata !{i32 524326, metadata !101, metadata !4, metadata !"", i32 0, i64 192, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ]
 !35 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"y", metadata !"y", metadata !"_ZNK9ggVector31yEv", i32 83, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !36 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"z", metadata !"z", metadata !"_ZNK9ggVector31zEv", i32 84, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !37 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"x", metadata !"x", metadata !"_ZN9ggVector31xEv", i32 85, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!38 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !39, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!38 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !39, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !39 = metadata !{metadata !40, metadata !19}
 !40 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"double", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_reference_type ]
 !41 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"y", metadata !"y", metadata !"_ZN9ggVector31yEv", i32 86, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !42 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"z", metadata !"z", metadata !"_ZN9ggVector31zEv", i32 87, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !43 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetX", metadata !"SetX", metadata !"_ZN9ggVector34SetXEd", i32 88, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!44 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !45, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!44 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !45 = metadata !{null, metadata !19, metadata !13}
 !46 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetY", metadata !"SetY", metadata !"_ZN9ggVector34SetYEd", i32 89, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !47 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetZ", metadata !"SetZ", metadata !"_ZN9ggVector34SetZEd", i32 90, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !48 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 92, metadata !49, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!49 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !50, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!49 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !50, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !50 = metadata !{null, metadata !19, metadata !51}
 !51 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !34} ; [ DW_TAG_reference_type ]
 !52 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"tolerance", metadata !"tolerance", metadata !"_ZNK9ggVector39toleranceEv", i32 100, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !53 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"tolerance", metadata !"tolerance", metadata !"_ZN9ggVector39toleranceEv", i32 101, metadata !38, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !54 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator+", metadata !"operator+", metadata !"_ZNK9ggVector3psEv", i32 107, metadata !55, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!55 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !56, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!55 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !56, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !56 = metadata !{metadata !51, metadata !33}
 !57 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator-", metadata !"operator-", metadata !"_ZNK9ggVector3ngEv", i32 108, metadata !58, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!58 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !59, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!58 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !59, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !59 = metadata !{metadata !8, metadata !33}
 !60 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator[]", metadata !"operator[]", metadata !"_ZNK9ggVector3ixEi", i32 290, metadata !61, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!61 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !62, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!61 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !62, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !62 = metadata !{metadata !13, metadata !33, metadata !22}
 !63 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator[]", metadata !"operator[]", metadata !"_ZN9ggVector3ixEi", i32 278, metadata !64, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!64 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !65, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!64 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !65, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !65 = metadata !{metadata !40, metadata !19, metadata !22}
 !66 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator+=", metadata !"operator+=", metadata !"_ZN9ggVector3pLERKS_", i32 303, metadata !67, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!67 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !68, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!67 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !68, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !68 = metadata !{metadata !69, metadata !19, metadata !51}
 !69 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"ggVector3", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_reference_type ]
 !70 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator-=", metadata !"operator-=", metadata !"_ZN9ggVector3mIERKS_", i32 310, metadata !67, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !71 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator*=", metadata !"operator*=", metadata !"_ZN9ggVector3mLEd", i32 317, metadata !72, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!72 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !73, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!72 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !73, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !73 = metadata !{metadata !69, metadata !19, metadata !13}
 !74 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator/=", metadata !"operator/=", metadata !"_ZN9ggVector3dVEd", i32 324, metadata !72, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !75 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"length", metadata !"length", metadata !"_ZNK9ggVector36lengthEv", i32 121, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !76 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"squaredLength", metadata !"squaredLength", metadata !"_ZNK9ggVector313squaredLengthEv", i32 122, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !77 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"MakeUnitVector", metadata !"MakeUnitVector", metadata !"_ZN9ggVector314MakeUnitVectorEv", i32 217, metadata !24, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !78 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"Perturb", metadata !"Perturb", metadata !"_ZNK9ggVector37PerturbEdd", i32 126, metadata !79, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!79 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !80, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!79 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !80, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !80 = metadata !{metadata !8, metadata !33, metadata !13, metadata !13}
 !81 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"maxComponent", metadata !"maxComponent", metadata !"_ZNK9ggVector312maxComponentEv", i32 128, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !82 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"minComponent", metadata !"minComponent", metadata !"_ZNK9ggVector312minComponentEv", i32 129, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !83 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"maxAbsComponent", metadata !"maxAbsComponent", metadata !"_ZNK9ggVector315maxAbsComponentEv", i32 131, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !84 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"minAbsComponent", metadata !"minAbsComponent", metadata !"_ZNK9ggVector315minAbsComponentEv", i32 132, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !85 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMinComponent", metadata !"indexOfMinComponent", metadata !"_ZNK9ggVector319indexOfMinComponentEv", i32 133, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!86 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !87, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!86 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !87, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !87 = metadata !{metadata !22, metadata !33}
 !88 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMinAbsComponent", metadata !"indexOfMinAbsComponent", metadata !"_ZNK9ggVector322indexOfMinAbsComponentEv", i32 137, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !89 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMaxComponent", metadata !"indexOfMaxComponent", metadata !"_ZNK9ggVector319indexOfMaxComponentEv", i32 146, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
@@ -151,3 +152,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !101 = metadata !{metadata !"ggEdgeDiscrepancy.cc", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
 !102 = metadata !{i32 0}
 !103 = metadata !{metadata !3}
+!104 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/Thumb/PR17309.ll b/test/CodeGen/Thumb/PR17309.ll
new file mode 100644
index 0000000..b7b08e9
--- /dev/null
+++ b/test/CodeGen/Thumb/PR17309.ll
@@ -0,0 +1,57 @@
+; RUN: llc -mtriple thumbv5-none-linux-gnueabi < %s | FileCheck %s
+
+%struct.C = type { [1000 x i8] }
+%struct.S = type { [1000 x i16] }
+%struct.I = type { [1000 x i32] }
+
+;CHECK-LABEL: pass_C:
+;CHECK-NOT: ldrb    r{{[0-9]+}}, [{{.*}}], #1
+;CHECK-NOT: strb    r{{[0-9]+}}, [{{.*}}], #1
+define void @pass_C() #0 {
+entry:
+  %c = alloca %struct.C, align 1
+  %0 = getelementptr inbounds %struct.C* %c, i32 0, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 1000, i8* %0) #1
+  call void @use_C(%struct.C* byval %c) #3
+  call void @llvm.lifetime.end(i64 1000, i8* %0) #1
+  ret void
+}
+
+;CHECK-LABEL: pass_S:
+;CHECK-NOT: ldrh    r{{[0-9]+}}, [{{.*}}], #2
+;CHECK-NOT: strh    r{{[0-9]+}}, [{{.*}}], #2
+define void @pass_S() #0 {
+entry:
+  %s = alloca %struct.S, align 2
+  %0 = bitcast %struct.S* %s to i8*
+  call void @llvm.lifetime.start(i64 2000, i8* %0) #1
+  call void @use_S(%struct.S* byval %s) #3
+  call void @llvm.lifetime.end(i64 2000, i8* %0) #1
+  ret void
+}
+
+;CHECK-LABEL: pass_I:
+;CHECK-NOT: ldr     r{{[0-9]+}}, [{{.*}}], #4
+;CHECK-NOT: str     r{{[0-9]+}}, [{{.*}}], #4
+define void @pass_I() #0 {
+entry:
+  %i = alloca %struct.I, align 4
+  %0 = bitcast %struct.I* %i to i8*
+  call void @llvm.lifetime.start(i64 4000, i8* %0) #1
+  call void @use_I(%struct.I* byval %i) #3
+  call void @llvm.lifetime.end(i64 4000, i8* %0) #1
+  ret void
+}
+
+declare void @use_C(%struct.C* byval) #2
+declare void @use_S(%struct.S* byval) #2
+declare void @use_I(%struct.I* byval) #2
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind optsize }
diff --git a/test/CodeGen/Thumb/barrier.ll b/test/CodeGen/Thumb/barrier.ll
index 8fca273..1c27fa0 100644
--- a/test/CodeGen/Thumb/barrier.ll
+++ b/test/CodeGen/Thumb/barrier.ll
@@ -7,7 +7,7 @@ define void @t1() {
 ; V6: blx {{_*}}sync_synchronize
 
 ; V6M-LABEL: t1:
-; V6M: dmb ish
+; V6M: dmb sy
   fence seq_cst
   ret void
 }
diff --git a/test/CodeGen/Thumb/lit.local.cfg b/test/CodeGen/Thumb/lit.local.cfg
index 4d75f58..8a3ba96 100644
--- a/test/CodeGen/Thumb/lit.local.cfg
+++ b/test/CodeGen/Thumb/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
 targets = set(config.root.targets_to_build.split())
 if not 'ARM' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll b/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
index 486c064..1b8bdb1 100644
--- a/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
+++ b/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
@@ -8,10 +8,10 @@ define void @t() nounwind ssp {
 entry:
 ; CHECK-LABEL: t:
   %size = mul i32 8, 2
-; CHECK:  subs  r0, #16
+; CHECK:  sub.w  r0, sp, #16
 ; CHECK:  mov sp, r0
   %vla_a = alloca i8, i32 %size, align 8
-; CHECK:  subs  r0, #16
+; CHECK:  sub.w  r0, sp, #16
 ; CHECK:  mov sp, r0
   %vla_b = alloca i8, i32 %size, align 8
   unreachable
diff --git a/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll b/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
index 244d0bb..810bfb7 100644
--- a/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
+++ b/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
@@ -40,7 +40,7 @@ entry:
 ; CHECK: pop
 ; CHECK: pop
 ; Do not convert into single stream code. BranchProbability Analysis assumes
-; that branches which goes to "ret" intruction have lower probabilities.
+; that branches which goes to "ret" instruction have lower probabilities.
   switch i32 undef, label %bb7 [
     i32 37, label %bb43
     i32 48, label %bb5
diff --git a/test/CodeGen/Thumb2/lit.local.cfg b/test/CodeGen/Thumb2/lit.local.cfg
index cb77b09..8a3ba96 100644
--- a/test/CodeGen/Thumb2/lit.local.cfg
+++ b/test/CodeGen/Thumb2/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'ARM' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/Thumb2/tail-call-r9.ll b/test/CodeGen/Thumb2/tail-call-r9.ll
new file mode 100644
index 0000000..24c76c9
--- /dev/null
+++ b/test/CodeGen/Thumb2/tail-call-r9.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 | FileCheck %s
+
+@foo = common global void ()* null, align 4
+
+; Make sure in the presence of a tail call, r9 doesn't get used to hold
+; the destination address. It's callee-saved in AAPCS.
+define arm_aapcscc void @test(i32 %a) nounwind {
+; CHECK-LABEL: test:
+; CHECK-NOT bx r9
+  %tmp = load void ()** @foo, align 4
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r12}"() nounwind
+  tail call arm_aapcscc void %tmp() nounwind
+  ret void
+}
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
index 85943cf..13a1ca2 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
-
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-default-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 -arm-no-restrict-it |FileCheck %s
 define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
 ; CHECK-LABEL: t1:
 ; CHECK: ittt ne
@@ -74,7 +75,7 @@ entry:
 ; CHECK-LABEL: t3:
 ; CHECK: itt ge
 ; CHECK: movge r0, r1
-; CHECK: blge  _foo
+; CHECK: blge  {{_?}}foo
 	%tmp1 = icmp sgt i32 %a, 10		; <i1> [#uses=1]
 	br i1 %tmp1, label %cond_true, label %UnifiedReturnBlock
 
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt2.ll b/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
index 788fa06..403cd48 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-default-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8-apple-ios -arm-no-restrict-it | FileCheck %s
 
 define void @foo(i32 %X, i32 %Y) {
 entry:
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt3.ll b/test/CodeGen/Thumb2/thumb2-ifcvt3.ll
index bcf10ef..a71aa3f 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt3.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt3.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-default-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8-apple-darwin -arm-no-restrict-it | FileCheck %s
 
 ; There shouldn't be a unconditional branch at end of bb52.
 ; rdar://7184787
diff --git a/test/CodeGen/Thumb2/thumb2-select.ll b/test/CodeGen/Thumb2/thumb2-select.ll
index 0feaf95..5f5fa19 100644
--- a/test/CodeGen/Thumb2/thumb2-select.ll
+++ b/test/CodeGen/Thumb2/thumb2-select.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -show-mc-encoding | FileCheck %s
 
 define i32 @f1(i32 %a.s) {
 entry:
@@ -66,7 +66,7 @@ define i32 @f7(i32 %a, i32 %b, i32 %c) {
 entry:
 ; CHECK-LABEL: f7:
 ; CHECK: it hi
-; CHECK: lsrhi.w
+; CHECK: lsrhi {{r[0-9]+}}
     %tmp1 = icmp ugt i32 %a, %b
     %tmp2 = udiv i32 %c, 3
     %tmp3 = select i1 %tmp1, i32 %tmp2, i32 3
@@ -77,7 +77,7 @@ define i32 @f8(i32 %a, i32 %b, i32 %c) {
 entry:
 ; CHECK-LABEL: f8:
 ; CHECK: it lo
-; CHECK: lsllo.w
+; CHECK: lsllo {{r[0-9]+}}
     %tmp1 = icmp ult i32 %a, %b
     %tmp2 = mul i32 %c, 4
     %tmp3 = select i1 %tmp1, i32 %tmp2, i32 3
@@ -96,3 +96,20 @@ entry:
     %tmp5 = select i1 %tmp1, i32 %tmp4, i32 3
     ret i32 %tmp5
 }
+
+define i32 @f10(i32 %a, i32 %b) {
+; CHECK-LABEL: f10:
+; CHECK: movwne {{r[0-9]+}}, #1234    @ encoding: [0x40,0xf2,0xd2,0x4{{[0-9a-f]+}}]
+    %tst = icmp ne i32 %a, %b
+    %val = select i1 %tst, i32 1234, i32 12345
+    ret i32 %val
+}
+
+; Make sure we pick the Thumb encoding for movw/movt
+define i32 @f11(i32 %a, i32 %b) {
+; CHECK-LABEL: f11:
+; CHECK: movwne {{r[0-9]+}}, #50033         @ encoding: [0x4c,0xf2,0x71,0x3{{[0-9a-f]+}}]
+    %tst = icmp ne i32 %a, %b
+    %val = select i1 %tst, i32 123454321, i32 543212345
+    ret i32 %val
+}
diff --git a/test/CodeGen/Thumb2/v8_IT_1.ll b/test/CodeGen/Thumb2/v8_IT_1.ll
new file mode 100644
index 0000000..30dbb48
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_1.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=thumbv8 -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -mattr=+neon -arm-restrict-it | FileCheck %s
+
+;CHECK-LABEL: select_s_v_v:
+;CHECK-NOT: it
+;CHECK: bx
+define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) {
+entry:
+  %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
+  %and = and i32 %avail, 1
+  %tobool = icmp eq i32 %and, 0
+  %vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer
+  ret <16 x i8> %vld1.
+}
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
+
diff --git a/test/CodeGen/Thumb2/v8_IT_2.ll b/test/CodeGen/Thumb2/v8_IT_2.ll
new file mode 100644
index 0000000..170b413
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_2.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
+
+	%struct.quad_struct = type { i32, i32, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct* }
+
+define fastcc i32 @CountTree(%struct.quad_struct* %tree) {
+entry:
+; CHECK-LABEL: CountTree:
+; CHECK: bne
+; CHECK: cmp
+; CHECK: it eq
+; CHECK: cmpeq
+; CHECK: bne
+; CHECK: mov
+; CHECK: pop
+	br label %tailrecurse
+
+tailrecurse:		; preds = %bb, %entry
+	%tmp6 = load %struct.quad_struct** null		; <%struct.quad_struct*> [#uses=1]
+	%tmp9 = load %struct.quad_struct** null		; <%struct.quad_struct*> [#uses=2]
+	%tmp12 = load %struct.quad_struct** null		; <%struct.quad_struct*> [#uses=1]
+	%tmp14 = icmp eq %struct.quad_struct* null, null		; <i1> [#uses=1]
+	%tmp17 = icmp eq %struct.quad_struct* %tmp6, null		; <i1> [#uses=1]
+	%tmp23 = icmp eq %struct.quad_struct* %tmp9, null		; <i1> [#uses=1]
+	%tmp29 = icmp eq %struct.quad_struct* %tmp12, null		; <i1> [#uses=1]
+	%bothcond = and i1 %tmp17, %tmp14		; <i1> [#uses=1]
+	%bothcond1 = and i1 %bothcond, %tmp23		; <i1> [#uses=1]
+	%bothcond2 = and i1 %bothcond1, %tmp29		; <i1> [#uses=1]
+	br i1 %bothcond2, label %return, label %bb
+
+bb:		; preds = %tailrecurse
+	%tmp41 = tail call fastcc i32 @CountTree( %struct.quad_struct* %tmp9 )		; <i32> [#uses=0]
+	br label %tailrecurse
+
+return:		; preds = %tailrecurse
+	ret i32 0
+}
+
diff --git a/test/CodeGen/Thumb2/v8_IT_3.ll b/test/CodeGen/Thumb2/v8_IT_3.ll
new file mode 100644
index 0000000..4dca246
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_3.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
+; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
+
+%struct.FF = type { i32 (i32*)*, i32 (i32*, i32*, i32, i32, i32, i32)*, i32 (i32, i32, i8*)*, void ()*, i32 (i32, i8*, i32*)*, i32 ()* }
+%struct.BD = type { %struct.BD*, i32, i32, i32, i32, i64, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i64, i32)*, [16 x i8], i64, i64 }
+
+@FuncPtr = external hidden unnamed_addr global %struct.FF*
+@.str1 = external hidden unnamed_addr constant [6 x i8], align 4
+@G = external unnamed_addr global i32
+@.str2 = external hidden unnamed_addr constant [58 x i8], align 4
+@.str3 = external hidden unnamed_addr constant [58 x i8], align 4
+
+define i32 @test() nounwind optsize ssp {
+entry:
+; CHECK-LABEL: test:
+; CHECK: push
+; CHECK-NOT: push
+  %block_size = alloca i32, align 4
+  %block_count = alloca i32, align 4
+  %index_cache = alloca i32, align 4
+  store i32 0, i32* %index_cache, align 4
+  %tmp = load i32* @G, align 4
+  %tmp1 = call i32 @bar(i32 0, i32 0, i32 %tmp) nounwind
+  switch i32 %tmp1, label %bb8 [
+    i32 0, label %bb
+    i32 536870913, label %bb4
+    i32 536870914, label %bb6
+  ]
+
+bb:
+  %tmp2 = load i32* @G, align 4
+  %tmp4 = icmp eq i32 %tmp2, 0
+  br i1 %tmp4, label %bb1, label %bb8
+
+bb1:
+; CHECK: %bb6
+; CHECK: it	eq
+; CHECK-NEXT: ldreq
+; CHECK-NEXT: it	eq
+; CHECK-NEXT: cmpeq
+; CHECK: %bb1
+  %tmp5 = load i32* %block_size, align 4
+  %tmp6 = load i32* %block_count, align 4
+  %tmp7 = call %struct.FF* @Get() nounwind
+  store %struct.FF* %tmp7, %struct.FF** @FuncPtr, align 4
+  %tmp10 = zext i32 %tmp6 to i64
+  %tmp11 = zext i32 %tmp5 to i64
+  %tmp12 = mul nsw i64 %tmp10, %tmp11
+  %tmp13 = call i32 @foo(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), i64 %tmp12, i32 %tmp5) nounwind
+  br label %bb8
+
+bb4:
+; CHECK-PIC: cmp
+; CHECK-PIC: cmp
+; CHECK-PIC-NEXT: bne
+; CHECK-PIC-NEXT: %bb4
+; CHECK-PIC-NEXT: movs
+; CHECK-PIC-NEXT: add
+; CHECK-PIC-NEXT: pop
+  ret i32 0
+
+bb6:
+  ret i32 1
+
+bb8:
+  ret i32 -1
+}
+
+declare i32 @printf(i8*, ...)
+
+declare %struct.FF* @Get()
+
+declare i32 @foo(i8*, i64, i32)
+
+declare i32 @bar(i32, i32, i32)
diff --git a/test/CodeGen/Thumb2/v8_IT_4.ll b/test/CodeGen/Thumb2/v8_IT_4.ll
new file mode 100644
index 0000000..5a80d8c
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_4.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=thumbv8-eabi -float-abi=hard | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-eabi -float-abi=hard -arm-restrict-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8-eabi -float-abi=hard -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-eabi -float-abi=hard -regalloc=basic -arm-restrict-it | FileCheck %s
+
+%"struct.__gnu_cxx::__normal_iterator<char*,std::basic_string<char, std::char_traits<char>, std::allocator<char> > >" = type { i8* }
+%"struct.__gnu_cxx::new_allocator<char>" = type <{ i8 }>
+%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >" = type { %"struct.__gnu_cxx::__normal_iterator<char*,std::basic_string<char, std::char_traits<char>, std::allocator<char> > >" }
+%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >::_Rep" = type { %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >::_Rep_base" }
+%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >::_Rep_base" = type { i32, i32, i32 }
+
+
+define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) {
+; CHECK-LABEL: _ZNKSs7compareERKSs:
+; CHECK:      cbnz	r0,
+; CHECK-NEXT: %bb
+; CHECK-NEXT: sub{{(.w)?}} r0, r{{[0-9]+}}, r{{[0-9]+}}
+; CHECK-NEXT: %bb1
+; CHECK-NEXT: pop.w
+entry:
+  %0 = tail call arm_aapcs_vfpcc  i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this) ; <i32> [#uses=3]
+  %1 = tail call arm_aapcs_vfpcc  i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) ; <i32> [#uses=3]
+  %2 = icmp ult i32 %1, %0                        ; <i1> [#uses=1]
+  %3 = select i1 %2, i32 %1, i32 %0               ; <i32> [#uses=1]
+  %4 = tail call arm_aapcs_vfpcc  i8* @_ZNKSs7_M_dataEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this) ; <i8*> [#uses=1]
+  %5 = tail call arm_aapcs_vfpcc  i8* @_ZNKSs4dataEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) ; <i8*> [#uses=1]
+  %6 = tail call arm_aapcs_vfpcc  i32 @memcmp(i8* %4, i8* %5, i32 %3) nounwind readonly ; <i32> [#uses=2]
+  %7 = icmp eq i32 %6, 0                          ; <i1> [#uses=1]
+  br i1 %7, label %bb, label %bb1
+
+bb:                                               ; preds = %entry
+  %8 = sub i32 %0, %1                             ; <i32> [#uses=1]
+  ret i32 %8
+
+bb1:                                              ; preds = %entry
+  ret i32 %6
+}
+
+declare arm_aapcs_vfpcc i32 @memcmp(i8* nocapture, i8* nocapture, i32) nounwind readonly
+
+declare arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this)
+
+declare arm_aapcs_vfpcc i8* @_ZNKSs7_M_dataEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this)
+
+declare arm_aapcs_vfpcc i8* @_ZNKSs4dataEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this)
diff --git a/test/CodeGen/Thumb2/v8_IT_5.ll b/test/CodeGen/Thumb2/v8_IT_5.ll
new file mode 100644
index 0000000..30250c8
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_5.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
+; CHECK: it	ne
+; CHECK-NEXT: cmpne
+; CHECK-NEXT: beq
+; CHECK: cmp
+; CHECK-NEXT: beq
+; CHECK-NEXT: %if.else163
+; CHECK-NEXT: mov.w
+; CHECK-NEXT: b
+; CHECK-NEXT: %if.else145
+; CHECK-NEXT: mov.w
+
+%struct.hc = type { i32, i32, i32, i32 }
+
+define i32 @t(i32 %type) optsize {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  unreachable
+
+if.else:
+  br i1 undef, label %if.then15, label %if.else18
+
+if.then15:
+  unreachable
+
+if.else18:
+  switch i32 %type, label %if.else173 [
+    i32 3, label %if.then115
+    i32 1, label %if.then102
+  ]
+
+if.then102:
+  br i1 undef, label %cond.true10.i, label %t.exit
+
+cond.true10.i:
+  br label %t.exit
+
+t.exit:
+  unreachable
+
+if.then115:
+  br i1 undef, label %if.else163, label %if.else145
+
+if.else145:
+  %call150 = call fastcc %struct.hc* @foo(%struct.hc* undef, i32 34865152) optsize
+  br label %while.body172
+
+if.else163:
+  %call168 = call fastcc %struct.hc* @foo(%struct.hc* undef, i32 34078720) optsize
+  br label %while.body172
+
+while.body172:
+  br label %while.body172
+
+if.else173:
+  ret i32 -1
+}
+
+declare hidden fastcc %struct.hc* @foo(%struct.hc* nocapture, i32) nounwind optsize
+
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched1.ll b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
index 0afddd8..69266dc 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched1.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
@@ -1,7 +1,10 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -march=x86 -relocation-model=static -stats 2>&1 | \
-; RUN:   grep asm-printer | grep 14
+; RUN:   grep asm-printer | grep 16
 ;
+; It's possible to schedule this in 14 instructions by avoiding
+; callee-save registers, but the scheduler isn't currently that
+; conervative with registers.
 @size20 = external global i32		; <i32*> [#uses=1]
 @in5 = external global i8*		; <i8**> [#uses=1]
 
@@ -21,4 +24,3 @@ define i32 @compare(i8* %a, i8* %b) nounwind {
 }
 
 declare i32 @memcmp(i8*, i8*, i32)
-
diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll
index 24aa5b9..4ec7039 100644
--- a/test/CodeGen/X86/2007-01-08-InstrSched.ll
+++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll
@@ -13,10 +13,10 @@ define float @foo(float %x) nounwind {
 
 ; CHECK: mulss
 ; CHECK: mulss
-; CHECK: addss
 ; CHECK: mulss
-; CHECK: addss
 ; CHECK: mulss
 ; CHECK: addss
+; CHECK: addss
+; CHECK: addss
 ; CHECK: ret
 }
diff --git a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
index c5d2a46..638d399 100644
--- a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
+++ b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
@@ -1,4 +1,13 @@
-; RUN: llc < %s -march=x86 | grep weak | count 2
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu | FileCheck %s
+
 @__gthrw_pthread_once = alias weak i32 (i32*, void ()*)* @pthread_once		; <i32 (i32*, void ()*)*> [#uses=0]
 
-declare extern_weak i32 @pthread_once(i32*, void ()*)
+define weak i32 @pthread_once(i32*, void ()*) {
+  ret i32 0
+}
+
+; CHECK: .weak   pthread_once
+; CHECK: pthread_once:
+
+; CHECK: .weak   __gthrw_pthread_once
+; CHECK: __gthrw_pthread_once = pthread_once
diff --git a/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll b/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
index 7a3d72d..1ec9c70 100644
--- a/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
+++ b/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep sarl | not grep esp
+; RUN: llc < %s -march=x86 -mcpu=corei7 | grep sarl | not grep esp
 
 define signext   i16 @t(i16* %qmatrix, i16* %dct, i16* %acBaseTable, i16* %acExtTable, i16 signext  %acBaseRes, i16 signext  %acMaskRes, i16 signext  %acExtRes, i32* %bitptr, i32* %source, i32 %markerPrefix, i8** %byteptr, i32 %scale, i32 %round, i32 %bits) {
 entry:
diff --git a/test/CodeGen/X86/2008-03-14-SpillerCrash.ll b/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
index 8946415..18b3714 100644
--- a/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
+++ b/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
@@ -45,4 +45,6 @@ bb383:		; preds = %bb374.us, %bb311.split
 	ret i64 0
 }
 
-declare i64 @__wcstoll_l(i32*, i32**, i32, %struct.__locale_struct*) nounwind 
+define i64 @__wcstoll_l(i32*, i32**, i32, %struct.__locale_struct*) nounwind {
+  ret i64 0
+}
diff --git a/test/CodeGen/X86/2008-04-24-pblendw-fold-crash.ll b/test/CodeGen/X86/2008-04-24-pblendw-fold-crash.ll
index 4eaca17..86bce8e 100644
--- a/test/CodeGen/X86/2008-04-24-pblendw-fold-crash.ll
+++ b/test/CodeGen/X86/2008-04-24-pblendw-fold-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mattr=+sse41
+; RUN: llc < %s -mattr=+sse4.1
 ; rdar://5886601
 ; gcc testsuite:  gcc.target/i386/sse4_1-pblendw.c
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index ecd8663..296f0ca 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -67,17 +67,16 @@ declare i64 @strlen(i8*) nounwind readonly
 declare void @llvm.stackrestore(i8*) nounwind
 
 !0 = metadata !{i32 459009, metadata !1, metadata !"s1", metadata !2, i32 2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 458798, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true,
-                i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 458798, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 458769, metadata !17, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 458773, null, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 458773, null, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !6}
 !5 = metadata !{i32 458788, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
 !7 = metadata !{i32 2, i32 0, metadata !1, null}
 !8 = metadata !{i32 459008, metadata !1, metadata !"str.0", metadata !2, i32 3, metadata !9} ; [ DW_TAG_auto_variable ]
 !9 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 458753, null, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata !11, i32 0, null} ; [ DW_TAG_array_type ]
+!10 = metadata !{i32 458753, null, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
 !11 = metadata !{metadata !12}
 !12 = metadata !{i32 458785, i64 0, i64 1}        ; [ DW_TAG_subrange_type ]
 !13 = metadata !{i32 3, i32 0, metadata !14, null}
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 8174fbd..764c2cd 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -mcpu=penryn -stats 2>&1 | grep "4 machine-licm"
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -mcpu=penryn | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "4 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037
 
@@ -17,9 +17,9 @@ bb4:		; preds = %bb.i, %bb26, %bb4, %entry
 ; CHECK: %bb4
 ; CHECK: xorl
 ; CHECK: callq
-; CHECK: movq
 ; CHECK: xorl
 ; CHECK: xorl
+; CHECK: movq
 
 	%0 = call i32 (...)* @xxGetOffsetForCode(i32 undef) nounwind		; <i32> [#uses=0]
 	%ins = or i64 %p, 2097152		; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
index 5cb05e8..e1930e0 100644
--- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
+++ b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=i386-apple-darwin10.0 -relocation-model=pic -asm-verbose=false \
-; RUN:     -mcpu=generic -disable-fp-elim -mattr=-sse41,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
+; RUN:     -mcpu=generic -disable-fp-elim -mattr=-sse4.1,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
 ; RUN:   FileCheck %s
 ; rdar://6808032
 
diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index ae2e9ac..a936edc 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll
@@ -24,8 +24,7 @@ declare i32 @foo(i32) ssp
 
 !0 = metadata !{i32 5, i32 2, metadata !1, null}
 !1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true,
-               i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
 !4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ]
 !5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index c54f030..f99e682 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -29,18 +29,19 @@ return:                                           ; preds = %entry
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!21}
 
 !0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !7}
 !6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null} ; [ DW_TAG_structure_type ]
+!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !14}
 !9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_structure_type ]
+!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
 !11 = metadata !{metadata !12, metadata !13}
 !12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
 !13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
@@ -51,3 +52,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !18 = metadata !{metadata !1}
 !19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
 !20 = metadata !{i32 0}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index 71c7b65..4d4e8c1 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
@@ -18,7 +18,7 @@ declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.
 
 !0 = metadata !{i32 458769, metadata !15, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, null, null, null, i32 0} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 458790, metadata !15, metadata !0, metadata !"", i32 0, i64 192, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_const_type ]
-!2 = metadata !{i32 458771, metadata !15, metadata !0, metadata !"C", i32 1, i64 192, i64 64, i64 0, i32 0, null, metadata !3, i32 0, null} ; [ DW_TAG_structure_type ]
+!2 = metadata !{i32 458771, metadata !15, metadata !0, metadata !"C", i32 1, i64 192, i64 64, i64 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
 !3 = metadata !{metadata !4, metadata !6, metadata !7}
 !4 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"x", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
 !5 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
@@ -27,7 +27,7 @@ declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.
 !8 = metadata !{i32 459008, metadata !9, metadata !"t", metadata !0, i32 5, metadata !2} ; [ DW_TAG_auto_variable ]
 !9 = metadata !{i32 458763, null, metadata !10, i32 0, i32 0, i32 0}        ; [ DW_TAG_lexical_block ]
 !10 = metadata !{i32 458798, i32 0, metadata !0, metadata !"foo", metadata !"foo", metadata !"foo", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 458773, metadata !15, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!11 = metadata !{i32 458773, metadata !15, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13}
 !13 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !14 = metadata !{%tart.reflect.ComplexType* @.type.SwitchStmtTest}
diff --git a/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll b/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
index d4a74c9..060c535 100644
--- a/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
+++ b/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mcpu=generic -mtriple=i386-apple-darwin -tailcallopt < %s | FileCheck %s
+; RUN: llc -mcpu=generic -mtriple=i386-apple-darwin -tailcallopt -enable-misched=false < %s | FileCheck %s
 ; Check that lowered argumens do not overwrite the return address before it is moved.
 ; Bug 6225
 ;
 ; If a call is a fastcc tail call and tail call optimization is enabled, the
-; caller frame is replaced by the callee frame. This can require that arguments are 
+; caller frame is replaced by the callee frame. This can require that arguments are
 ; placed on the former return address stack slot. Special care needs to be taken
 ; taken that the return address is moved / or stored in a register before
 ; lowering of arguments potentially overwrites the value.
@@ -51,5 +51,3 @@ false:
   tail call fastcc void @l298(i32 %r10, i32 %r9, i32 %r4) noreturn nounwind
   ret void
 }
-
-
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 00ac71a..7faee99 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -199,12 +199,13 @@ declare float @copysignf(float, float) nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!48}
 
 !0 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 786478, metadata !45, metadata !2, metadata !"__divsc3", metadata !"__divsc3", metadata !"__divsc3", i32 1922, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43, i32 1922} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 786473, metadata !45} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !45, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !44, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !45, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !45, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !9, metadata !9, metadata !9, metadata !9}
 !6 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SCtype", i32 170, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ]
 !7 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ]
@@ -248,3 +249,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !45 = metadata !{metadata !"libgcc2.c", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
 !46 = metadata !{metadata !"libgcc2.h", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
 !47 = metadata !{i32 0}
+!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index 4b1dfb3..c5736eb 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -22,6 +22,7 @@ declare void @foo(i32) nounwind optsize noinline ssp
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!38}
 
 !0 = metadata !{i32 786484, i32 0, metadata !1, metadata !"ret", metadata !"ret", metadata !"", metadata !1, i32 7, metadata !3, i1 false, i1 true, null, null} ; [ DW_TAG_variable ]
 !1 = metadata !{i32 786473, metadata !36} ; [ DW_TAG_file_type ]
@@ -29,21 +30,21 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !3 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !4 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !1, i32 12, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
 !5 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void (i32)* @foo, null, null, metadata !33, i32 13} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !3}
 !8 = metadata !{i32 786689, metadata !9, metadata !"myvar", metadata !1, i32 17, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
 !9 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 17, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i8* (%struct.a*)* @bar, null, null, metadata !34, i32 17} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!10 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{metadata !12, metadata !13}
 !12 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
 !13 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786451, metadata !36, metadata !1, metadata !"a", i32 2, i64 128, i64 64, i64 0, i32 0, null, metadata !15, i32 0, null} ; [ DW_TAG_structure_type ]
+!14 = metadata !{i32 786451, metadata !36, metadata !1, metadata !"a", i32 2, i64 128, i64 64, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
 !15 = metadata !{metadata !16, metadata !17}
 !16 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"c", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !3} ; [ DW_TAG_member ]
 !17 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"d", i32 4, i64 64, i64 64, i64 64, i32 0, metadata !13} ; [ DW_TAG_member ]
 !18 = metadata !{i32 786689, metadata !19, metadata !"argc", metadata !1, i32 22, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
 !19 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 22, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, metadata !35, i32 22} ; [ DW_TAG_subprogram ]
-!20 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!20 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !21 = metadata !{metadata !3, metadata !3, metadata !22}
 !22 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
 !23 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ]
@@ -86,3 +87,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 ; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte   83
 ; CHECK-NEXT: Ltmp{{.*}}:
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index d5c0ead..1114c8d 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -23,12 +23,13 @@ entry:
 }
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!20}
 
 !0 = metadata !{i32 786689, metadata !1, metadata !"y", metadata !2, i32 2, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @foo, null, null, metadata !15, i32 2} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !6}
 !6 = metadata !{i32 786468, metadata !18, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !2, i32 6, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -48,3 +49,4 @@ entry:
 ;CHECK: DEBUG_VALUE: bar:x <- E
 ;CHECK: Ltmp
 ;CHECK:	DEBUG_VALUE: foo:y <- 1{{$}}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index 1571a58..b45ac22 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -20,18 +20,19 @@ entry:
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!4}
+!llvm.module.flags = !{!34}
 !llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28}
 
 !0 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !3, i32 11, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEi", i32 11, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786451, metadata !31, metadata !3, metadata !"foo", i32 3, i64 32, i64 32, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_structure_type ]
+!2 = metadata !{i32 786451, metadata !31, metadata !3, metadata !"foo", i32 3, i64 32, i64 32, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
 !3 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
 !4 = metadata !{i32 786449, metadata !31, i32 4, metadata !"4.2.1 LLVM build", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !33, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !5 = metadata !{metadata !6, metadata !1, metadata !8}
 !6 = metadata !{i32 786445, metadata !31, metadata !2, metadata !"y", i32 8, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ]
 !7 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !8 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"baz", metadata !"baz", metadata !"_ZN3foo3bazEi", i32 15, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 15} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!9 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{metadata !7, metadata !11, metadata !7}
 !11 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !2} ; [ DW_TAG_pointer_type ]
 !12 = metadata !{i32 786470, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !13} ; [ DW_TAG_const_type ]
@@ -41,7 +42,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !16 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !3, i32 15, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
 !17 = metadata !{i32 786689, metadata !18, metadata !"argc", metadata !3, i32 19, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
 !18 = metadata !{i32 786478, metadata !31, metadata !3, metadata !"main", metadata !"main", metadata !"main", i32 19, metadata !19, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, null, null, null, null, i32 19} ; [ DW_TAG_subprogram ]
-!19 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!19 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !20 = metadata !{metadata !7, metadata !7, metadata !21}
 !21 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
 !22 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
@@ -56,3 +57,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !31 = metadata !{metadata !"foo.cp", metadata !"/tmp/"}
 !32 = metadata !{i32 0}
 !33 = metadata !{metadata !1, metadata !8, metadata !18}
+!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index e91cd76..b49aec3 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
@@ -7,15 +7,14 @@
 !39 = metadata !{i32 524305, metadata !109, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !108, metadata !108, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !46 = metadata !{i32 524303, metadata !109, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !47} ; [ DW_TAG_pointer_type ]
 !47 = metadata !{i32 524324, metadata !109, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!97 = metadata !{i32 524334, i32 0, metadata !39, metadata !"main", metadata !"main", metadata !"main", i32 73, metadata !98, i1 false, i1 true,
-                i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!98 = metadata !{i32 524309, metadata !109, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !99, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!97 = metadata !{i32 524334, i32 0, metadata !39, metadata !"main", metadata !"main", metadata !"main", i32 73, metadata !98, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!98 = metadata !{i32 524309, metadata !109, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !99, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !99 = metadata !{metadata !100}
 !100 = metadata !{i32 524324, metadata !109, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !101 = metadata !{[2 x i8*]* @C.9.2167}
 !102 = metadata !{i32 524544, metadata !103, metadata !"find_strings", metadata !38, i32 75, metadata !104, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
 !103 = metadata !{i32 524299, null, metadata !97, i32 73, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!104 = metadata !{i32 524289, metadata !109, null, metadata !"", i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null} ; [ DW_TAG_array_type ]
+!104 = metadata !{i32 524289, metadata !109, null, metadata !"", i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
 !105 = metadata !{metadata !106}
 !106 = metadata !{i32 524321, i64 0, i64 1333}    ; [ DW_TAG_subrange_type ]
 !107 = metadata !{i32 73, i32 0, metadata !103, null}
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index c6e1654..91fec3b 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -75,10 +75,11 @@ return:                                           ; preds = %entry
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!49}
 !46 = metadata !{metadata !0, metadata !9, metadata !16, metadata !17, metadata !20}
 
 !0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_structure_type ]
+!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
 !2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !47, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !48, metadata !48, metadata !46, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
 !4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
@@ -87,18 +88,18 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !7 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ]
 !8 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
 !9 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 12} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!10 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{null, metadata !12, metadata !13}
 !12 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
 !13 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!14 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !12}
 !16 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
 !17 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 16} ; [ DW_TAG_subprogram ]
-!18 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!18 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{metadata !13, metadata !13, metadata !1}
 !20 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 23} ; [ DW_TAG_subprogram ]
-!21 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!21 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !22 = metadata !{metadata !13}
 !23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
 !24 = metadata !{i32 16, i32 0, metadata !17, null}
@@ -125,3 +126,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !45 = metadata !{i32 27, i32 0, metadata !39, null}
 !47 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
 !48 = metadata !{i32 0}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index 831fe66..9aa41c3 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -13,11 +13,12 @@ entry:
 }
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!17}
 
 !0 = metadata !{i32 786478, metadata !14, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 53, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114084)", i1 false, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, metadata !13, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, metadata !14, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 786478, metadata !15, metadata !7, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
@@ -31,3 +32,4 @@ entry:
 !14 = metadata !{metadata !"", metadata !"/private/tmp"}
 !15 = metadata !{metadata !"bug.c", metadata !"/private/tmp"}
 !16 = metadata !{i32 0}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
index 1b33977..39d89e3 100644
--- a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
+++ b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
@@ -19,8 +19,8 @@ entry:
 }
 
 ; CHECK: movq	___stack_chk_guard@GOTPCREL(%rip)
-; CHECK: movb   38(%rsp), [[R0:%.+]]
-; CHECK: movb   8(%rsp), [[R1:%.+]]
-; CHECK: movb   [[R1]], 8(%rsp)
-; CHECK: movb   [[R0]], 38(%rsp)
+; CHECK: movb   (%rsp), [[R1:%.+]]
+; CHECK: movb   30(%rsp), [[R0:%.+]]
+; CHECK: movb   [[R1]], (%rsp)
+; CHECK: movb   [[R0]], 30(%rsp)
 ; CHECK: callq	___stack_chk_fail
diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
index e118e80..21ac7c9 100644
--- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll
+++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
@@ -16,16 +16,17 @@ entry:
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!19}
 
 !0 = metadata !{i32 786478, metadata !17, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.bar*)* @foo, null, null, metadata !16, i32 3} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 786473, metadata !17} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !17, i32 12, metadata !"clang version 2.9 (trunk 117922)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, metadata !15, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !17, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !17, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, metadata !17, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 786689, metadata !0, metadata !"i", metadata !1, i32 3, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
 !7 = metadata !{i32 786447, metadata !17, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !17, metadata !1, metadata !"bar", i32 2, i64 64, i64 32, i64 0, i32 0, null, metadata !9, i32 0, null} ; [ DW_TAG_structure_type ]
+!8 = metadata !{i32 786451, metadata !17, metadata !1, metadata !"bar", i32 2, i64 64, i64 32, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !11}
 !10 = metadata !{i32 786445, metadata !17,  metadata !1, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
 !11 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !5} ; [ DW_TAG_member ]
@@ -36,3 +37,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !16 = metadata !{metadata !6}
 !17 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
 !18 = metadata !{i32 0}
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-12-02-MC-Set.ll b/test/CodeGen/X86/2010-12-02-MC-Set.ll
index 1a4c586..5a407d3 100644
--- a/test/CodeGen/X86/2010-12-02-MC-Set.ll
+++ b/test/CodeGen/X86/2010-12-02-MC-Set.ll
@@ -7,12 +7,13 @@ entry:
 }
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!10}
 !7 = metadata !{metadata !0}
 
 !0 = metadata !{i32 786478, metadata !9, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 786473, metadata !9} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !9, i32 12, metadata !"clang version 2.9 (trunk 120563)", i1 false, metadata !"", i32 0, metadata !8, metadata !8, metadata !7, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !9, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !9, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 5, i32 1, metadata !6, null}
 !6 = metadata !{i32 786443, metadata !9, metadata !0, i32 3, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
@@ -23,3 +24,4 @@ entry:
 ; CHECK-NEXT: __debug_line
 ; CHECK-NEXT: Lline_table_start0
 ; CHECK-NEXT: Ltmp{{[0-9]}} = (Ltmp
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index 3e0fbca..d534030 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -70,15 +70,16 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 declare i32 @puts(i8* nocapture) nounwind
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!33}
 
-!0 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"gcd", metadata !"gcd", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i64 (i64, i64)* @gcd, null, null, metadata !29, i32 0} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"gcd", metadata !"gcd", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i64 (i64, i64)* @gcd, null, null, metadata !29, i32 0} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd]
 !1 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !31, i32 12, metadata !"clang version 2.9 (trunk 124117)", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !28, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, null, metadata !2, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 true, i32 ()* @main, null, null, metadata !30, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !30, i32 0} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main]
+!7 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -104,3 +105,4 @@ declare i32 @puts(i8* nocapture) nounwind
 !30 = metadata !{metadata !14, metadata !17}
 !31 = metadata !{metadata !"rem_small.c", metadata !"/private/tmp"}
 !32 = metadata !{i32 0}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll b/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll
index 0f18f09..91cd208 100644
--- a/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll
+++ b/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-apple-macosx10.6.0"
 
 @aux_temp = external global %struct.dfa, align 8
 
-declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) nounwind readnone
 
 declare void @__memset_chk() nounwind
 
@@ -21,12 +21,12 @@ if.end.i:                                         ; preds = %entry
   br i1 undef, label %land.end.thread.i, label %land.end.i
 
 land.end.thread.i:                                ; preds = %if.end.i
-  %0 = call i64 @llvm.objectsize.i64(i8* undef, i1 false) nounwind
+  %0 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 false) nounwind
   %cmp1710.i = icmp eq i64 %0, -1
   br i1 %cmp1710.i, label %cond.false156.i, label %cond.true138.i
 
 land.end.i:                                       ; preds = %if.end.i
-  %1 = call i64 @llvm.objectsize.i64(i8* undef, i1 false) nounwind
+  %1 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 false) nounwind
   %cmp17.i = icmp eq i64 %1, -1
   br i1 %cmp17.i, label %cond.false156.i, label %cond.true138.i
 
@@ -41,13 +41,8 @@ cond.false156.i:                                  ; preds = %for.end.i, %land.en
 
 cond.end166.i:                                    ; preds = %cond.false156.i, %cond.true138.i
   %idxprom1113.i = phi i64 [ %idxprom1114.i, %cond.false156.i ], [ undef, %cond.true138.i ]
-  %tmp235.i = load %struct.state** getelementptr inbounds (%struct.dfa* @aux_temp, i64 0, i32 2), align 8, !tbaa !0
+  %tmp235.i = load %struct.state** getelementptr inbounds (%struct.dfa* @aux_temp, i64 0, i32 2), align 8
   %att.i = getelementptr inbounds %struct.state* %tmp235.i, i64 %idxprom1113.i, i32 0
-  store i32 0, i32* %att.i, align 4, !tbaa !3
+  store i32 0, i32* %att.i, align 4
   ret void
 }
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"int", metadata !1}
diff --git a/test/CodeGen/X86/2011-06-03-x87chain.ll b/test/CodeGen/X86/2011-06-03-x87chain.ll
index ce63c74..5275b68 100644
--- a/test/CodeGen/X86/2011-06-03-x87chain.ll
+++ b/test/CodeGen/X86/2011-06-03-x87chain.ll
@@ -29,3 +29,21 @@ entry:
   store float %conv, float* %f, align 4
   ret float %conv
 }
+
+define void @PR17495() {
+entry:
+  br i1 undef, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.body, %entry
+  %x.1.copyload = load i24* undef, align 1
+  %conv = sitofp i24 %x.1.copyload to float
+  %div = fmul float %conv, 0x3E80000000000000
+  store float %div, float* undef, align 4
+  br i1 false, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+
+; CHECK-LABEL: @PR17495
+; CHECK-NOT: fildll
+}
diff --git a/test/CodeGen/X86/2011-09-18-sse2cmp.ll b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
index a6f428f..89de648 100644
--- a/test/CodeGen/X86/2011-09-18-sse2cmp.ll
+++ b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse41 | FileCheck %s
+;RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
 
 ;CHECK: @max
 ;CHECK: cmplepd
diff --git a/test/CodeGen/X86/2011-09-21-setcc-bug.ll b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
index 4daf678..a67c3f3 100644
--- a/test/CodeGen/X86/2011-09-21-setcc-bug.ll
+++ b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1
 
 ; Make sure we are not crashing on this code.
 
diff --git a/test/CodeGen/X86/2011-10-11-srl.ll b/test/CodeGen/X86/2011-10-11-srl.ll
index 6c6d340..434f88c 100644
--- a/test/CodeGen/X86/2011-10-11-srl.ll
+++ b/test/CodeGen/X86/2011-10-11-srl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=-sse41 
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=-sse4.1
 
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/X86/2011-10-12-MachineCSE.ll b/test/CodeGen/X86/2011-10-12-MachineCSE.ll
index cd15f84..72e672a 100644
--- a/test/CodeGen/X86/2011-10-12-MachineCSE.ll
+++ b/test/CodeGen/X86/2011-10-12-MachineCSE.ll
@@ -20,11 +20,11 @@ entry:
   %2 = lshr i32 %1, 16
   %bf.clear = and i32 %2, 255
   %idxprom = sext i32 %bf.clear to i64
-  %3 = load %struct.optab** getelementptr inbounds ([49 x %struct.optab*]* @optab_table, i32 0, i64 0), align 8, !tbaa !0
+  %3 = load %struct.optab** getelementptr inbounds ([49 x %struct.optab*]* @optab_table, i32 0, i64 0), align 8
   %handlers = getelementptr inbounds %struct.optab* %3, i32 0, i32 1
   %arrayidx = getelementptr inbounds [59 x %struct.anon.3]* %handlers, i32 0, i64 %idxprom
   %insn_code = getelementptr inbounds %struct.anon.3* %arrayidx, i32 0, i32 0
-  %4 = load i32* %insn_code, align 4, !tbaa !3
+  %4 = load i32* %insn_code, align 4
   %cmp = icmp eq i32 %4, 1317
   br i1 %cmp, label %if.then, label %lor.lhs.false
 
@@ -32,14 +32,14 @@ lor.lhs.false:                                    ; preds = %entry
   %idxprom1 = sext i32 %4 to i64
   %arrayidx2 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom1
   %operand = getelementptr inbounds %struct.insn_data* %arrayidx2, i32 0, i32 3
-  %5 = load %struct.insn_operand_data** %operand, align 8, !tbaa !0
+  %5 = load %struct.insn_operand_data** %operand, align 8
   %arrayidx3 = getelementptr inbounds %struct.insn_operand_data* %5, i64 0
   %predicate = getelementptr inbounds %struct.insn_operand_data* %arrayidx3, i32 0, i32 0
-  %6 = load i32 (%struct.rtx_def*, i32)** %predicate, align 8, !tbaa !0
+  %6 = load i32 (%struct.rtx_def*, i32)** %predicate, align 8
   %idxprom4 = sext i32 %4 to i64
   %arrayidx5 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom4
   %operand6 = getelementptr inbounds %struct.insn_data* %arrayidx5, i32 0, i32 3
-  %7 = load %struct.insn_operand_data** %operand6, align 8, !tbaa !0
+  %7 = load %struct.insn_operand_data** %operand6, align 8
   %arrayidx7 = getelementptr inbounds %struct.insn_operand_data* %7, i64 0
   %8 = bitcast %struct.insn_operand_data* %arrayidx7 to i8*
   %bf.field.offs = getelementptr i8* %8, i32 16
@@ -54,14 +54,14 @@ lor.lhs.false9:                                   ; preds = %lor.lhs.false
   %idxprom10 = sext i32 %4 to i64
   %arrayidx11 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom10
   %operand12 = getelementptr inbounds %struct.insn_data* %arrayidx11, i32 0, i32 3
-  %11 = load %struct.insn_operand_data** %operand12, align 8, !tbaa !0
+  %11 = load %struct.insn_operand_data** %operand12, align 8
   %arrayidx13 = getelementptr inbounds %struct.insn_operand_data* %11, i64 1
   %predicate14 = getelementptr inbounds %struct.insn_operand_data* %arrayidx13, i32 0, i32 0
-  %12 = load i32 (%struct.rtx_def*, i32)** %predicate14, align 8, !tbaa !0
+  %12 = load i32 (%struct.rtx_def*, i32)** %predicate14, align 8
   %idxprom15 = sext i32 %4 to i64
   %arrayidx16 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom15
   %operand17 = getelementptr inbounds %struct.insn_data* %arrayidx16, i32 0, i32 3
-  %13 = load %struct.insn_operand_data** %operand17, align 8, !tbaa !0
+  %13 = load %struct.insn_operand_data** %operand17, align 8
   %arrayidx18 = getelementptr inbounds %struct.insn_operand_data* %13, i64 1
   %14 = bitcast %struct.insn_operand_data* %arrayidx18 to i8*
   %bf.field.offs19 = getelementptr i8* %14, i32 16
@@ -76,14 +76,14 @@ lor.lhs.false23:                                  ; preds = %lor.lhs.false9
   %idxprom24 = sext i32 %4 to i64
   %arrayidx25 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom24
   %operand26 = getelementptr inbounds %struct.insn_data* %arrayidx25, i32 0, i32 3
-  %17 = load %struct.insn_operand_data** %operand26, align 8, !tbaa !0
+  %17 = load %struct.insn_operand_data** %operand26, align 8
   %arrayidx27 = getelementptr inbounds %struct.insn_operand_data* %17, i64 2
   %predicate28 = getelementptr inbounds %struct.insn_operand_data* %arrayidx27, i32 0, i32 0
-  %18 = load i32 (%struct.rtx_def*, i32)** %predicate28, align 8, !tbaa !0
+  %18 = load i32 (%struct.rtx_def*, i32)** %predicate28, align 8
   %idxprom29 = sext i32 %4 to i64
   %arrayidx30 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom29
   %operand31 = getelementptr inbounds %struct.insn_data* %arrayidx30, i32 0, i32 3
-  %19 = load %struct.insn_operand_data** %operand31, align 8, !tbaa !0
+  %19 = load %struct.insn_operand_data** %operand31, align 8
   %arrayidx32 = getelementptr inbounds %struct.insn_operand_data* %19, i64 2
   %20 = bitcast %struct.insn_operand_data* %arrayidx32 to i8*
   %bf.field.offs33 = getelementptr i8* %20, i32 16
@@ -101,7 +101,7 @@ if.end:                                           ; preds = %lor.lhs.false23
   %idxprom37 = sext i32 %4 to i64
   %arrayidx38 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom37
   %genfun = getelementptr inbounds %struct.insn_data* %arrayidx38, i32 0, i32 2
-  %23 = load %struct.rtx_def* (%struct.rtx_def*, ...)** %genfun, align 8, !tbaa !0
+  %23 = load %struct.rtx_def* (%struct.rtx_def*, ...)** %genfun, align 8
   %call39 = tail call %struct.rtx_def* (%struct.rtx_def*, ...)* %23(%struct.rtx_def* %r0, %struct.rtx_def* %r1, %struct.rtx_def* %c)
   br label %return
 
@@ -109,8 +109,3 @@ return:                                           ; preds = %if.end, %if.then
   %24 = phi %struct.rtx_def* [ %call39, %if.end ], [ null, %if.then ]
   ret %struct.rtx_def* %24
 }
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"_ZTS9insn_code", metadata !1}
diff --git a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
index da734d4..07a6910 100644
--- a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
+++ b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
@@ -16,8 +16,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: main
 define i32 @main() nounwind uwtable {
 entry:
-; CHECK: pmovsxbq  j(%rip), %
 ; CHECK: pmovsxbq  i(%rip), %
+; CHECK: pmovsxbq  j(%rip), %
   %0 = load <2 x i8>* @i, align 8
   %1 = load <2 x i8>* @j, align 8
   %div = sdiv <2 x i8> %1, %0
@@ -25,4 +25,3 @@ entry:
   ret i32 0
 ; CHECK: ret
 }
-
diff --git a/test/CodeGen/X86/2011-12-15-vec_shift.ll b/test/CodeGen/X86/2011-12-15-vec_shift.ll
index dc3a08b..0183e10 100644
--- a/test/CodeGen/X86/2011-12-15-vec_shift.ll
+++ b/test/CodeGen/X86/2011-12-15-vec_shift.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86-64 -mattr=+sse41 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
-; RUN: llc -march=x86-64 -mattr=-sse41 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
+; RUN: llc -march=x86-64 -mattr=+sse4.1 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
+; RUN: llc -march=x86-64 -mattr=-sse4.1 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
 ; Test case for r146671
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.7"
diff --git a/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll b/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
index 7515e80..14643e4 100644
--- a/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
+++ b/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mattr=-sse42,+sse41 < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mattr=-sse4.2,+sse4.1 < %s | FileCheck %s
 ; Make sure we don't load from the location pointed to by %p
 ; twice: it has non-obvious performance implications, and
 ; the relevant transformation doesn't know how to update
diff --git a/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll b/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll
index a883d79..cd8a16f 100644
--- a/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll
+++ b/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll
@@ -15,7 +15,7 @@ entry:
 
 ; CHECK: lock
 ; CHECK-NEXT: orl {{.*}}, (%esp)
-; CHECK-NEXT: cmpl $0
+; CHECK-NEXT: testl [[REG:%e[a-z]+]], [[REG]]
 
 if.then:                                          ; preds = %entry
   tail call void bitcast (void (...)* @foo to void ()*)() nounwind
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 186fafb..16706ae 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,8 +5,8 @@
 ; It's hard to test for the ISEL condition because CodeGen optimizes
 ; away the bugpointed code. Just ensure the basics are still there.
 ;CHECK-LABEL: func:
-;CHECK: vxorps
-;CHECK: vinsertf128
+;CHECK: vpxor
+;CHECK: vinserti128
 ;CHECK: vpshufd
 ;CHECK: vpshufd
 ;CHECK: vmulps
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index 503aab4..d41b432 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -6,7 +6,7 @@
 ;
 ; CHECK: %entry
 ; CHECK: DEBUG_VALUE: hg
-; CHECK: je
+; CHECK: j
 
 %struct.node.0.27 = type { i16, double, [3 x double], i32, i32 }
 %struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] }
@@ -36,6 +36,7 @@ return:                                           ; preds = %for.cond.preheader,
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12}
 
 !0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
 !1 = metadata !{metadata !2}
@@ -44,5 +45,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725]
 !5 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
 !6 = metadata !{i32 786454, metadata !11, null, metadata !"hgstruct", i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 786451, metadata !11, null, metadata !"", i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, i32 0, i32 0} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [from ]
+!7 = metadata !{i32 786451, metadata !11, null, metadata !"", i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ]
 !11 = metadata !{metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh"}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 21e105d..7befa6b 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -63,6 +63,7 @@ if.else4114:                                      ; preds = %if.then4073
 declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!35}
 
 !0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
 !1 = metadata !{metadata !2}
@@ -79,7 +80,7 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
 !12 = metadata !{i32 786443, metadata !13, i32 249, i32 0, metadata !14, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
 !13 = metadata !{i32 786443, metadata !3, i32 221, i32 0, metadata !14, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
 !14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
 !16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !17 = metadata !{metadata !18}
 !18 = metadata !{i32 786465, i64 0, i64 20}       ; [ DW_TAG_subrange_type ] [0, 19]
@@ -134,3 +135,4 @@ declare void @_Znwm()
 !32 = metadata !{i32 786454, metadata !34, null, metadata !"HM", i32 28, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
 !33 = metadata !{i32 786473, metadata !34} ; [ DW_TAG_file_type ]
 !34 = metadata !{metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++"}
+!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
index dcbe109..5aec3d9 100644
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -34,11 +34,13 @@ invoke.cont44:                                    ; preds = %if.end
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8}
 
 !0 = metadata !{i32 786449, metadata !6, i32 4, metadata !"clang version 3.3 (trunk 168984) (llvm/trunk 168983)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
 !2 = metadata !{null}
 !3 = metadata !{i32 786688, null, metadata !"callback", null, i32 214, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [callback] [line 214]
-!4 = metadata !{i32 786451, metadata !6, null, metadata !"btCompoundLeafCallback", i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [from ]
+!4 = metadata !{i32 786451, metadata !6, null, metadata !"btCompoundLeafCallback", i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ]
 !5 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ]
 !6 = metadata !{metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet"}
 !7 = metadata !{i32 0}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll b/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
index f0c7781..0ff9d39 100644
--- a/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
+++ b/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
@@ -23,6 +23,6 @@ entry:
 
 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) #1
 
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
diff --git a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
new file mode 100644
index 0000000..3455b68
--- /dev/null
+++ b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
@@ -0,0 +1,132 @@
+; RUN: llc -mtriple x86_64-apple-darwin -O0 < %s -o - | FileCheck %s
+;
+; During X86 fastisel, the address of indirect call was resolved
+; through bitcast, ptrtoint, and inttoptr instructions. This is valid
+; only if the related instructions are in that same basic block, otherwise
+; we may reference variables that were not live accross basic blocks
+; resulting in undefined virtual registers.
+;
+; In this example, this is illustrated by a the spill/reload of the
+; LOADED_PTR_SLOT.
+;
+; Before this patch, the compiler was accessing two different spill
+; slots.
+; <rdar://problem/15192473>
+
+; CHECK-LABEL: @test_bitcast
+; Load the value of the function pointer: %loaded_ptr
+; CHECK: movq (%rdi), [[LOADED_PTR:%[a-z]+]]
+; Spill %arg2.
+; CHECK: movq %rdx, [[ARG2_SLOT:[0-9]*\(%[a-z]+\)]]
+; Spill %loaded_ptr.
+; CHECK: movq [[LOADED_PTR]], [[LOADED_PTR_SLOT:[0-9]*\(%[a-z]+\)]]
+; Perform the indirect call.
+; Load the first argument
+; CHECK: movq [[ARG2_SLOT]], %rdi
+; Load the second argument
+; CHECK: movq [[ARG2_SLOT]], %rsi
+; Load the thrid argument
+; CHECK: movq [[ARG2_SLOT]], %rdx
+; Load the function pointer.
+; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
+; Call.
+; CHECK: callq *[[FCT_PTR]]
+; CHECK: ret
+define i64 @test_bitcast(i64 (i64, i64, i64)** %arg, i1 %bool, i64 %arg2) {
+entry:
+  %loaded_ptr = load i64 (i64, i64, i64)** %arg, align 8
+  %raw = bitcast i64 (i64, i64, i64)* %loaded_ptr to i8*
+  switch i1 %bool, label %default [
+    i1 true, label %label_true
+    i1 false, label %label_end
+  ]
+default:
+  unreachable
+
+label_true:
+  br label %label_end
+
+label_end:
+  %fct_ptr = bitcast i8* %raw to i64 (i64, i64, i64)*
+  %res = call i64 %fct_ptr(i64 %arg2, i64 %arg2, i64 %arg2)
+  ret i64 %res
+}
+
+; CHECK-LABEL: @test_inttoptr
+; Load the value of the function pointer: %loaded_ptr
+; CHECK: movq (%rdi), [[LOADED_PTR:%[a-z]+]]
+; Spill %arg2.
+; CHECK: movq %rdx, [[ARG2_SLOT:[0-9]*\(%[a-z]+\)]]
+; Spill %loaded_ptr.
+; CHECK: movq [[LOADED_PTR]], [[LOADED_PTR_SLOT:[0-9]*\(%[a-z]+\)]]
+; Perform the indirect call.
+; Load the first argument
+; CHECK: movq [[ARG2_SLOT]], %rdi
+; Load the second argument
+; CHECK: movq [[ARG2_SLOT]], %rsi
+; Load the thrid argument
+; CHECK: movq [[ARG2_SLOT]], %rdx
+; Load the function pointer.
+; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
+; Call.
+; CHECK: callq *[[FCT_PTR]]
+; CHECK: ret
+define i64 @test_inttoptr(i64 (i64, i64, i64)** %arg, i1 %bool, i64 %arg2) {
+entry:
+  %loaded_ptr = load i64 (i64, i64, i64)** %arg, align 8
+  %raw = ptrtoint i64 (i64, i64, i64)* %loaded_ptr to i64
+  switch i1 %bool, label %default [
+    i1 true, label %label_true
+    i1 false, label %label_end
+  ]
+default:
+  unreachable
+
+label_true:
+  br label %label_end
+
+label_end:
+  %fct_ptr = inttoptr i64 %raw to i64 (i64, i64, i64)*
+  %res = call i64 %fct_ptr(i64 %arg2, i64 %arg2, i64 %arg2)
+  ret i64 %res
+}
+
+; CHECK-LABEL: @test_ptrtoint
+; Load the value of the function pointer: %loaded_ptr
+; CHECK: movq (%rdi), [[LOADED_PTR:%[a-z]+]]
+; Spill %arg2.
+; CHECK: movq %rdx, [[ARG2_SLOT:[0-9]*\(%[a-z]+\)]]
+; Spill %loaded_ptr.
+; CHECK: movq [[LOADED_PTR]], [[LOADED_PTR_SLOT:[0-9]*\(%[a-z]+\)]]
+; Perform the indirect call.
+; Load the first argument
+; CHECK: movq [[ARG2_SLOT]], %rdi
+; Load the second argument
+; CHECK: movq [[ARG2_SLOT]], %rsi
+; Load the thrid argument
+; CHECK: movq [[ARG2_SLOT]], %rdx
+; Load the function pointer.
+; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
+; Call.
+; CHECK: callq *[[FCT_PTR]]
+; CHECK: ret
+define i64 @test_ptrtoint(i64 (i64, i64, i64)** %arg, i1 %bool, i64 %arg2) {
+entry:
+  %loaded_ptr = load i64 (i64, i64, i64)** %arg, align 8
+  %raw = bitcast i64 (i64, i64, i64)* %loaded_ptr to i8*
+  switch i1 %bool, label %default [
+    i1 true, label %label_true
+    i1 false, label %label_end
+  ]
+default:
+  unreachable
+
+label_true:
+  br label %label_end
+
+label_end:
+  %fct_int = ptrtoint i8* %raw to i64
+  %fct_ptr = inttoptr i64 %fct_int to i64 (i64, i64, i64)*
+  %res = call i64 %fct_ptr(i64 %arg2, i64 %arg2, i64 %arg2)
+  ret i64 %res
+}
diff --git a/test/CodeGen/X86/3addr-16bit.ll b/test/CodeGen/X86/3addr-16bit.ll
index 77c3c16..fafdfdb 100644
--- a/test/CodeGen/X86/3addr-16bit.ll
+++ b/test/CodeGen/X86/3addr-16bit.ll
@@ -34,7 +34,8 @@ entry:
 
 ; 64BIT-LABEL:     t2:
 ; 64BIT-NOT: movw %si, %ax
-; 64BIT:     leal -1(%rsi), %eax
+; 64BIT:     decl %eax
+; 64BIT:     movzwl %ax
   %0 = icmp eq i16 %k, %c                         ; <i1> [#uses=1]
   %1 = add i16 %k, -1                             ; <i16> [#uses=3]
   br i1 %0, label %bb, label %bb1
@@ -58,7 +59,7 @@ entry:
 
 ; 64BIT-LABEL:     t3:
 ; 64BIT-NOT: movw %si, %ax
-; 64BIT:     leal 2(%rsi), %eax
+; 64BIT:     addl $2, %eax
   %0 = add i16 %k, 2                              ; <i16> [#uses=3]
   %1 = icmp eq i16 %k, %c                         ; <i1> [#uses=1]
   br i1 %1, label %bb, label %bb1
@@ -81,7 +82,7 @@ entry:
 
 ; 64BIT-LABEL:     t4:
 ; 64BIT-NOT: movw %si, %ax
-; 64BIT:     leal (%rsi,%rdi), %eax
+; 64BIT:     addl %edi, %eax
   %0 = add i16 %k, %c                             ; <i16> [#uses=3]
   %1 = icmp eq i16 %k, %c                         ; <i1> [#uses=1]
   br i1 %1, label %bb, label %bb1
diff --git a/test/CodeGen/X86/GC/lit.local.cfg b/test/CodeGen/X86/GC/lit.local.cfg
index a8ad0f1..ba763cf 100644
--- a/test/CodeGen/X86/GC/lit.local.cfg
+++ b/test/CodeGen/X86/GC/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/X86/GC/ocaml-gc-assert.ll b/test/CodeGen/X86/GC/ocaml-gc-assert.ll
new file mode 100644
index 0000000..b32ceca
--- /dev/null
+++ b/test/CodeGen/X86/GC/ocaml-gc-assert.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; PR3168
+
+; CHECK-LABEL: append
+
+define i32* @append() gc "ocaml" {
+entry:
+  switch i32 0, label %L2 [i32 0, label %L1]
+L1:
+  %var8 = alloca i8*
+  call void @llvm.gcroot(i8** %var8,i8* null)
+  br label %L3
+L2:
+  call ccc void @oread_runtime_casenotcovered()
+  unreachable
+L3:
+  ret i32* null
+}
+
+declare ccc void @oread_runtime_casenotcovered()
+declare void @llvm.gcroot(i8**,i8*)
diff --git a/test/CodeGen/X86/GC/ocaml-gc.ll b/test/CodeGen/X86/GC/ocaml-gc.ll
index 44241a9..6d5f8ae 100644
--- a/test/CodeGen/X86/GC/ocaml-gc.ll
+++ b/test/CodeGen/X86/GC/ocaml-gc.ll
@@ -2,23 +2,23 @@
 
 define i32 @main(i32 %x) nounwind gc "ocaml" {
 ; CHECK:        .text
-; CHECK-NEXT:   .globl  caml_3C_stdin_3E___code_begin
-; CHECK-NEXT: caml_3C_stdin_3E___code_begin:
+; CHECK-NEXT:   .globl "caml<stdin>__code_begin"
+; CHECK-NEXT: "caml<stdin>__code_begin":
 ; CHECK-NEXT:   .data
-; CHECK-NEXT:   .globl  caml_3C_stdin_3E___data_begin
-; CHECK-NEXT: caml_3C_stdin_3E___data_begin:
+; CHECK-NEXT:   .globl  "caml<stdin>__data_begin"
+; CHECK-NEXT: "caml<stdin>__data_begin":
 
   %puts = tail call i32 @foo(i32 %x)
   ret i32 0
 
-; CHECK:        .globl  caml_3C_stdin_3E___code_end
-; CHECK-NEXT: caml_3C_stdin_3E___code_end:
+; CHECK:        .globl "caml<stdin>__code_end"
+; CHECK-NEXT: "caml<stdin>__code_end":
 ; CHECK-NEXT:   .data
-; CHECK-NEXT:   .globl  caml_3C_stdin_3E___data_end
-; CHECK-NEXT: caml_3C_stdin_3E___data_end:
+; CHECK-NEXT:   .globl "caml<stdin>__data_end"
+; CHECK-NEXT: "caml<stdin>__data_end":
 ; CHECK-NEXT:   .quad   0
-; CHECK-NEXT:   .globl  caml_3C_stdin_3E___frametable
-; CHECK-NEXT: caml_3C_stdin_3E___frametable:
+; CHECK-NEXT:   .globl "caml<stdin>__frametable"
+; CHECK-NEXT: "caml<stdin>__frametable":
 ; CHECK-NEXT:   .short  1
 ; CHECK-NEXT:   .align  8
 ; CHECK-NEXT:                # live roots for main
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index df9580c..584e644 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -26,11 +26,12 @@ bb2:
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22}
 
 !0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i32*)* @foo, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32*)* @foo, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
 !2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -49,3 +50,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !19 = metadata !{metadata !6, metadata !7, metadata !10}
 !20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
 !21 = metadata !{i32 0}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index 8b67a44..51d0d17 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -25,6 +25,11 @@ declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
 
 declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
 
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!23}
+!0 = metadata !{i32 524305, metadata !1, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"t.c", metadata !""}
 !16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6}
 !2 = metadata !{i32 0}
 !22 = metadata !{i32 786688, null, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0}
+!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
index f1d9296..a8e3537 100644
--- a/test/CodeGen/X86/StackColoring.ll
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -4,8 +4,8 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-;YESCOLOR: subq  $136, %rsp
-;NOCOLOR: subq  $264, %rsp
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
 
 define i32 @myCall_w2(i32 %in) {
 entry:
diff --git a/test/CodeGen/X86/abi-isel.ll b/test/CodeGen/X86/abi-isel.ll
index 3b84231..633e70f 100644
--- a/test/CodeGen/X86/abi-isel.ll
+++ b/test/CodeGen/X86/abi-isel.ll
@@ -1,16 +1,16 @@
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-32-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-PIC
 
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=LINUX-64-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-PIC
 
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=DARWIN-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-32-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC
 
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=DARWIN-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-PIC
 
 @src = external global [131072 x i32]
 @dst = external global [131072 x i32]
diff --git a/test/CodeGen/X86/add.ll b/test/CodeGen/X86/add.ll
index f36577b..62a62a4 100644
--- a/test/CodeGen/X86/add.ll
+++ b/test/CodeGen/X86/add.ll
@@ -9,7 +9,7 @@ define i32 @test1(i32 inreg %a) nounwind {
   %b = add i32 %a, 128
   ret i32 %b
 ; X32: subl	$-128, %eax
-; X64: subl $-128, 
+; X64: subl $-128,
 }
 define i64 @test2(i64 inreg %a) nounwind {
   %b = add i64 %a, 2147483648
@@ -20,7 +20,7 @@ define i64 @test2(i64 inreg %a) nounwind {
 define i64 @test3(i64 inreg %a) nounwind {
   %b = add i64 %a, 128
   ret i64 %b
-  
+
 ; X32: addl $128, %eax
 ; X64: subq	$-128,
 }
@@ -38,7 +38,7 @@ normal:
 
 overflow:
   ret i1 false
-  
+
 ; X32-LABEL: test4:
 ; X32: addl
 ; X32-NEXT: jo
@@ -82,11 +82,11 @@ define i64 @test6(i64 %A, i32 %B) nounwind {
         ret i64 %tmp5
 
 ; X32-LABEL: test6:
-; X32:	    movl 12(%esp), %edx
+; X32:	    movl 4(%esp), %eax
+; X32-NEXT: movl 12(%esp), %edx
 ; X32-NEXT: addl 8(%esp), %edx
-; X32-NEXT: movl 4(%esp), %eax
 ; X32-NEXT: ret
-        
+
 ; X64-LABEL: test6:
 ; X64:	shlq	$32, %r[[A1]]
 ; X64:	leaq	(%r[[A1]],%r[[A0]]), %rax
diff --git a/test/CodeGen/X86/aes_intrinsics.ll b/test/CodeGen/X86/aes_intrinsics.ll
new file mode 100644
index 0000000..fc1a2cc
--- /dev/null
+++ b/test/CodeGen/X86/aes_intrinsics.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+aes,-avx | FileCheck %s
+
+define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: aesdec
+  %res = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_aesni_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: aesdeclast
+  %res = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_aesni_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: aesenc
+  %res = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_aesni_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: aesenclast
+  %res = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_aesni_aesimc(<2 x i64> %a0) {
+  ; CHECK: aesimc
+  %res = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_aesni_aeskeygenassist(<2 x i64> %a0) {
+  ; CHECK: aeskeygenassist
+  %res = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/alias-error.ll b/test/CodeGen/X86/alias-error.ll
new file mode 100644
index 0000000..8f01dcf
--- /dev/null
+++ b/test/CodeGen/X86/alias-error.ll
@@ -0,0 +1,5 @@
+; RUN: not llc -mtriple=i686-pc-linux-gnu %s -o /dev/null 2>&1 | FileCheck %s
+
+@a = external global i32
+@b = alias i32* @a
+; CHECK: b: Target doesn't support aliases to declarations
diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index f920279..d0a262d 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll
@@ -1,26 +1,38 @@
-; RUN: llc < %s -mtriple=i686-pc-linux-gnu -asm-verbose=false -o %t
-; RUN: grep globl %t | count 6
-; RUN: grep weak %t  | count 1
-; RUN: grep hidden %t | count 1
-; RUN: grep protected %t | count 1
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -asm-verbose=false | FileCheck %s
 
-@bar = external global i32
+@bar = global i32 42
+
+; CHECK-DAG: .globl	foo1
 @foo1 = alias i32* @bar
+
+; CHECK-DAG: .globl	foo2
 @foo2 = alias i32* @bar
 
 %FunTy = type i32()
 
-declare i32 @foo_f()
+define i32 @foo_f() {
+  ret i32 0
+}
+; CHECK-DAG: .weak	bar_f
 @bar_f = alias weak %FunTy* @foo_f
 
+@bar_l = alias linkonce_odr i32* @bar
+; CHECK-DAG: .weak	bar_l
+
 @bar_i = alias internal i32* @bar
 
+; CHECK-DAG: .globl	A
 @A = alias bitcast (i32* @bar to i64*)
 
+; CHECK-DAG: .globl	bar_h
+; CHECK-DAG: .hidden	bar_h
 @bar_h = hidden alias i32* @bar
 
+; CHECK-DAG: .globl	bar_p
+; CHECK-DAG: .protected	bar_p
 @bar_p = protected alias i32* @bar
 
+; CHECK-DAG: .globl	test
 define i32 @test() {
 entry:
    %tmp = load i32* @foo1
diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll
index 3d76fb0..74b9470 100644
--- a/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/test/CodeGen/X86/alloca-align-rounding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux -enable-misched=false | FileCheck %s
 
 declare void @bar(<2 x i64>* %n)
 
diff --git a/test/CodeGen/X86/anyregcc-crash.ll b/test/CodeGen/X86/anyregcc-crash.ll
new file mode 100644
index 0000000..cf6f6ed
--- /dev/null
+++ b/test/CodeGen/X86/anyregcc-crash.ll
@@ -0,0 +1,17 @@
+; RUN: not llc < %s -mtriple=x86_64-apple-darwin 2>&1 | FileCheck %s
+;
+; Check that misuse of anyregcc results in a compile time error.
+
+; CHECK: LLVM ERROR: ran out of registers during register allocation
+define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
+                        i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
+                        i64 %v13, i64 %v14, i64 %v15, i64 %v16) {
+entry:
+  %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 16,
+                i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
+                i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
+                i64 %v13, i64 %v14, i64 %v15, i64 %v16)
+  ret i64 %result
+}
+
+declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/anyregcc.ll b/test/CodeGen/X86/anyregcc.ll
new file mode 100644
index 0000000..8109f87
--- /dev/null
+++ b/test/CodeGen/X86/anyregcc.ll
@@ -0,0 +1,348 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+; Stackmap Header: no constants - 6 callsites
+; CHECK-LABEL: .section	__LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT:  __LLVM_StackMaps:
+; Header
+; CHECK-NEXT:   .long   0
+; Num Constants
+; CHECK-NEXT:   .long   0
+; Num Callsites
+; CHECK-NEXT:   .long   8
+
+; test
+; CHECK-NEXT:   .long   0
+; CHECK-LABEL:  .long   L{{.*}}-_test
+; CHECK-NEXT:   .short  0
+; 3 locations
+; CHECK-NEXT:   .short  3
+; Loc 0: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 4
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 4
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 2: Constant 3
+; CHECK-NEXT:   .byte 4
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long 3
+define i64 @test() nounwind ssp uwtable {
+entry:
+  call anyregcc void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 0, i32 15, i8* null, i32 2, i32 1, i32 2, i64 3)
+  ret i64 0
+}
+
+; property access 1 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-NEXT:   .long   1
+; CHECK-LABEL:  .long   L{{.*}}-_property_access1
+; CHECK-NEXT:   .short  0
+; 2 locations
+; CHECK-NEXT:   .short  2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
+entry:
+  %f = inttoptr i64 12297829382473034410 to i8*
+  %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 1, i32 15, i8* %f, i32 1, i8* %obj)
+  ret i64 %ret
+}
+
+; property access 2 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-NEXT:   .long   2
+; CHECK-LABEL:  .long   L{{.*}}-_property_access2
+; CHECK-NEXT:   .short  0
+; 2 locations
+; CHECK-NEXT:   .short  2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @property_access2() nounwind ssp uwtable {
+entry:
+  %obj = alloca i64, align 8
+  %f = inttoptr i64 12297829382473034410 to i8*
+  %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 2, i32 15, i8* %f, i32 1, i64* %obj)
+  ret i64 %ret
+}
+
+; property access 3 - %obj is a frame index
+; CHECK-NEXT:   .long   3
+; CHECK-LABEL:  .long   L{{.*}}-_property_access3
+; CHECK-NEXT:   .short  0
+; 2 locations
+; CHECK-NEXT:   .short  2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register <-- this will be folded once folding for FI is implemented
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @property_access3() nounwind ssp uwtable {
+entry:
+  %obj = alloca i64, align 8
+  %f = inttoptr i64 12297829382473034410 to i8*
+  %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 3, i32 15, i8* %f, i32 0, i64* %obj)
+  ret i64 %ret
+}
+
+; anyreg_test1
+; CHECK-NEXT:   .long   4
+; CHECK-LABEL:  .long   L{{.*}}-_anyreg_test1
+; CHECK-NEXT:   .short  0
+; 14 locations
+; CHECK-NEXT:   .short  14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 2: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 3: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 4: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 5: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 6: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 7: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 8: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 9: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 10: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 11: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 12: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 13: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+  %f = inttoptr i64 12297829382473034410 to i8*
+  %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 4, i32 15, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  ret i64 %ret
+}
+
+; anyreg_test2
+; CHECK-NEXT:   .long   5
+; CHECK-LABEL:  .long   L{{.*}}-_anyreg_test2
+; CHECK-NEXT:   .short  0
+; 14 locations
+; CHECK-NEXT:   .short  14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 1: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 2: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 3: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 4: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 5: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 6: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 7: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 8: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 9: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 10: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 11: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 12: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+; Loc 13: Register
+; CHECK-NEXT:   .byte 1
+; CHECK-NEXT:   .byte 8
+; CHECK-NEXT:   .short {{[0-9]+}}
+; CHECK-NEXT:   .long 0
+define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+  %f = inttoptr i64 12297829382473034410 to i8*
+  %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+  ret i64 %ret
+}
+
+; Test spilling the return value of an anyregcc call.
+;
+; <rdar://problem/15432754> [JS] Assertion: "Folded a def to a non-store!"
+;
+; CHECK-LABEL: .long 12
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 3
+; Loc 0: Register (some register that will be spilled to the stack)
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 1: Register RDI
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 5
+; CHECK-NEXT: .long  0
+; Loc 1: Register RSI
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .long  0
+define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+  %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+  tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
+  ret i64 %result
+}
+
+; Test spilling the arguments of an anyregcc call.
+;
+; <rdar://problem/15487687> [JS] AnyRegCC argument ends up being spilled
+;
+; CHECK-LABEL: .long 13
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 5
+; Loc 0: Return a register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 1: Arg0 in a Register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 2: Arg1 in a Register
+; CHECK-NEXT: .byte  1
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long  0
+; Loc 3: Arg2 spilled to RBP +
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 7
+; CHECK-NEXT: .long  {{[0-9]+}}
+; Loc 4: Arg3 spilled to RBP +
+; CHECK-NEXT: .byte  3
+; CHECK-NEXT: .byte  8
+; CHECK-NEXT: .short 7
+; CHECK-NEXT: .long  {{[0-9]+}}
+define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+  tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
+  %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 13, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  ret i64 %result
+}
+
+declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/atom-call-reg-indirect.ll b/test/CodeGen/X86/atom-call-reg-indirect.ll
index 933b98b..48f2d4c 100644
--- a/test/CodeGen/X86/atom-call-reg-indirect.ll
+++ b/test/CodeGen/X86/atom-call-reg-indirect.ll
@@ -2,6 +2,8 @@
 ; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux | FileCheck -check-prefix=ATOM-NOT32 %s
 ; RUN: llc < %s -mcpu=atom -mtriple=x86_64-linux  | FileCheck -check-prefix=ATOM64 %s
 ; RUN: llc < %s -mcpu=core2 -mtriple=x86_64-linux | FileCheck -check-prefix=ATOM-NOT64 %s
+; RUN: llc < %s -mcpu=slm -mtriple=i686-linux  | FileCheck -check-prefix=SLM32 %s
+; RUN: llc < %s -mcpu=slm -mtriple=x86_64-linux  | FileCheck -check-prefix=SLM64 %s
 
 
 ; fn_ptr.ll
@@ -20,6 +22,10 @@ entry:
   ;ATOM64: movq (%rcx), %rcx
   ;ATOM64: callq *%rcx
   ;ATOM-NOT64: callq *(%rcx)
+  ;SLM32: movl (%ecx), %ecx
+  ;SLM32: calll *%ecx
+  ;SLM64: movq (%rcx), %rcx
+  ;SLM64: callq *%rcx
   tail call void %1(%class.A* %call)
   ret i32 0
 }
@@ -40,6 +46,10 @@ entry:
   ;ATOM64: movq (%rax), %rax
   ;ATOM64: callq *%rax
   ;ATOM-NOT64: callq *(%rax)
+  ;SLM32: movl (%eax), %eax
+  ;SLM32: calll *%eax
+  ;SLM64: movq (%rax), %rax
+  ;SLM64: callq *%rax
   tail call void %1(i32 2)
   ret i32 0
 }
diff --git a/test/CodeGen/X86/atom-lea-addw-bug.ll b/test/CodeGen/X86/atom-lea-addw-bug.ll
new file mode 100644
index 0000000..5cda2df
--- /dev/null
+++ b/test/CodeGen/X86/atom-lea-addw-bug.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mcpu=atom | FileCheck %s
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target triple = "x86_64-apple-darwin12.5.0"
+
+define i32 @DoLayout() {
+entry:
+  %tmp1 = load i16* undef, align 2
+  %tmp17 = load i16* null, align 2
+  %tmp19 = load i16* undef, align 2
+  %shl = shl i16 %tmp19, 1
+  %add55 = add i16 %tmp17, %tmp1
+  %add57 = add i16 %add55, %shl
+  %conv60 = zext i16 %add57 to i32
+  %add61 = add nsw i32 %conv60, 0
+  %conv63 = and i32 %add61, 65535
+  ret i32 %conv63
+; CHECK: addw
+}
diff --git a/test/CodeGen/X86/atom-sched.ll b/test/CodeGen/X86/atom-sched.ll
index 0d97e85..fd18472 100644
--- a/test/CodeGen/X86/atom-sched.ll
+++ b/test/CodeGen/X86/atom-sched.ll
@@ -1,4 +1,5 @@
 ; RUN: llc <%s -O2 -mcpu=atom -march=x86 -relocation-model=static | FileCheck -check-prefix=atom %s
+; RUN: llc <%s -O2 -mcpu=slm -march=x86 -relocation-model=static | FileCheck -check-prefix=slm %s
 ; RUN: llc <%s -O2 -mcpu=core2 -march=x86 -relocation-model=static | FileCheck %s
 ;
 
@@ -13,6 +14,9 @@ define void @func() nounwind uwtable {
 ; atom: imull
 ; atom-NOT: movl
 ; atom: imull
+; slm: imull
+; slm-NOT: movl
+; slm: imull
 ; CHECK: imull
 ; CHECK: movl
 ; CHECK: imull
diff --git a/test/CodeGen/X86/atomic-dagsched.ll b/test/CodeGen/X86/atomic-dagsched.ll
index 05e630b..aa05757 100644
--- a/test/CodeGen/X86/atomic-dagsched.ll
+++ b/test/CodeGen/X86/atomic-dagsched.ll
@@ -34,8 +34,8 @@ dim_0_vector_pre_head.i:                          ; preds = %loop
 vector_kernel_entry.i:                            ; preds = %vector_kernel_entry.i, %dim_0_vector_pre_head.i
   %asr.iv9 = phi i8* [ %scevgep10, %vector_kernel_entry.i ], [ %asr.iv6, %dim_0_vector_pre_head.i ]
   %asr.iv = phi i64 [ %asr.iv.next, %vector_kernel_entry.i ], [ %vector.size.i, %dim_0_vector_pre_head.i ]
-  %8 = bitcast i8* %ptrtoarg4 to i32 addrspace(1)*
-  %asr.iv911 = bitcast i8* %asr.iv9 to <8 x i32> addrspace(1)*
+  %8 = addrspacecast i8* %ptrtoarg4 to i32 addrspace(1)*
+  %asr.iv911 = addrspacecast i8* %asr.iv9 to <8 x i32> addrspace(1)*
   %9 = load <8 x i32> addrspace(1)* %asr.iv911, align 4
   %extract8vector_func.i = extractelement <8 x i32> %9, i32 0
   %extract9vector_func.i = extractelement <8 x i32> %9, i32 1
@@ -73,8 +73,8 @@ dim_0_pre_head.i:                                 ; preds = %scalarIf.i
 
 scalar_kernel_entry.i:                            ; preds = %scalar_kernel_entry.i, %dim_0_pre_head.i
   %asr.iv12 = phi i64 [ %asr.iv.next13, %scalar_kernel_entry.i ], [ %22, %dim_0_pre_head.i ]
-  %23 = bitcast i8* %asr.iv6 to i32 addrspace(1)*
-  %24 = bitcast i8* %ptrtoarg4 to i32 addrspace(1)*
+  %23 = addrspacecast i8* %asr.iv6 to i32 addrspace(1)*
+  %24 = addrspacecast i8* %ptrtoarg4 to i32 addrspace(1)*
   %scevgep16 = getelementptr i32 addrspace(1)* %23, i64 %asr.iv12
   %25 = load i32 addrspace(1)* %scevgep16, align 4
   %26 = atomicrmw min i32 addrspace(1)* %24, i32 %25 seq_cst
diff --git a/test/CodeGen/X86/avx-arith.ll b/test/CodeGen/X86/avx-arith.ll
index 4aa3370..a9da1ec 100644
--- a/test/CodeGen/X86/avx-arith.ll
+++ b/test/CodeGen/X86/avx-arith.ll
@@ -240,15 +240,15 @@ define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
 ; CHECK-NEXT: vpmuludq %xmm
 ; CHECK-NEXT: vpsllq $32, %xmm
 ; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vpmuludq %xmm
 ; CHECK-NEXT: vpsrlq $32, %xmm
 ; CHECK-NEXT: vpmuludq %xmm
 ; CHECK-NEXT: vpsllq $32, %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpmuludq %xmm
 ; CHECK-NEXT: vpsrlq $32, %xmm
 ; CHECK-NEXT: vpmuludq %xmm
 ; CHECK-NEXT: vpsllq $32, %xmm
 ; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vpaddq %xmm
 ; CHECK-NEXT: vpsrlq $32, %xmm
 ; CHECK-NEXT: vpmuludq %xmm
 ; CHECK-NEXT: vpsllq $32, %xmm
@@ -269,4 +269,3 @@ define <4 x float> @int_sqrt_ss() {
  %x2 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %x1) nounwind
  ret <4 x float> %x2
 }
-
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 64c4627..1fd9085 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -122,10 +122,10 @@ define <16 x i16> @build_vec_16x16(i16 %a) nounwind readonly {
   ret <16 x i16> %res
 }
 
-;;; Check that VMOVPQIto64rr generates the assembly string "vmovd".  Previously
+;;; Check that VMOVPQIto64rr generates the assembly string "vmovq".  Previously
 ;;; an incorrect mnemonic of "movd" was printed for this instruction.
 ; CHECK: VMOVPQIto64rr
-; CHECK: vmovd
+; CHECK: vmovq
 define i64 @VMOVPQIto64rr(<2 x i64> %a) {
 entry:
   %vecext.i = extractelement <2 x i64> %a, i32 0
diff --git a/test/CodeGen/X86/avx-bitcast.ll b/test/CodeGen/X86/avx-bitcast.ll
index ecc71be..c9d828c 100644
--- a/test/CodeGen/X86/avx-bitcast.ll
+++ b/test/CodeGen/X86/avx-bitcast.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -O0 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
 ; CHECK: vmovsd (%
-; CHECK-NEXT: vmovd %xmm
+; CHECK-NEXT: vmovq %xmm
 define i64 @bitcasti64tof64() {
   %a = load double* undef
   %b = bitcast double %a to i64
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
index 0550720..7337815 100644
--- a/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -32,7 +32,7 @@ declare i32 @func_int(i32, i32)
 define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
   %y = alloca <16 x float>, align 16
   %x = fadd <16 x float> %a, %b
-  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
   %2 = load <16 x float>* %y, align 16
   %3 = fadd <16 x float> %2, %1
   ret <16 x float> %3
@@ -43,21 +43,21 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
 ; preserved ymm6-ymm15
 ; WIN64: testf16_regs
 ; WIN64: call
-; WIN64: vaddps  {{%ymm[6-7]}}, %ymm0, %ymm0
-; WIN64: vaddps  {{%ymm[6-7]}}, %ymm1, %ymm1
+; WIN64: vaddps  {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
+; WIN64: vaddps  {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
 ; WIN64: ret
 
 ; preserved ymm8-ymm15
 ; X64: testf16_regs
 ; X64: call
-; X64: vaddps  {{%ymm[8-9]}}, %ymm0, %ymm0
-; X64: vaddps  {{%ymm[8-9]}}, %ymm1, %ymm1
+; X64: vaddps  {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
+; X64: vaddps  {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
 ; X64: ret
 
 define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
   %y = alloca <16 x float>, align 16
   %x = fadd <16 x float> %a, %b
-  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
   %2 = load <16 x float>* %y, align 16
   %3 = fadd <16 x float> %1, %b
   %4 = fadd <16 x float> %2, %3
@@ -166,4 +166,3 @@ entry:
   %8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %8
 }
-
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
index b9c7000..fb2287f 100644
--- a/test/CodeGen/X86/avx-sext.ll
+++ b/test/CodeGen/X86/avx-sext.ll
@@ -154,6 +154,17 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
   ret <4 x i64> %extmask
 }
 
+; AVX-LABEL: sext_16i8_to_16i16
+; AVX: vpmovsxbw
+; AVX: vmovhlps
+; AVX: vpmovsxbw
+; AVX: ret
+define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
+ %X = load <16 x i8>* %ptr
+ %Y = sext <16 x i8> %X to <16 x i16>
+ ret <16 x i16> %Y
+}
+
 ; AVX: sext_4i8_to_4i64
 ; AVX: vpslld  $24
 ; AVX: vpsrad  $24
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index a625601..0956361 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -81,7 +81,7 @@ entry:
 define i32 @test9(<4 x i32> %a) nounwind {
 ; CHECK: test9
 ; CHECK: vpextrd
-  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4> 
+  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4>
   %r = extractelement <8 x i32> %b, i32 2
 ; CHECK: ret
   ret i32 %r
@@ -251,8 +251,8 @@ define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
 ; CHECK: swap8doubles
 ; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
 ; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
+; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
+; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
 ; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
 ; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
 ; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 5c01c2c..5d07815 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -20,7 +20,7 @@ entry:
   ret <16 x i16> %shuffle
 }
 
-; CHECK: vmovd
+; CHECK: vmovq
 ; CHECK-NEXT: vmovlhps %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index d007736..58d0a35 100644
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -12,4 +12,9 @@ define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
   %B = trunc <8 x i32> %A to <8 x i16>
   ret <8 x i16>%B
 }
-
+define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{
+; CHECK-LABEL: trunc_16_8
+; CHECK: pshufb
+  %B = trunc <16 x i16> %A to <16 x i8>
+  ret <16 x i8> %B
+}
diff --git a/test/CodeGen/X86/avx-zext.ll b/test/CodeGen/X86/avx-zext.ll
index e2b6c55..7511746 100644
--- a/test/CodeGen/X86/avx-zext.ll
+++ b/test/CodeGen/X86/avx-zext.ll
@@ -27,3 +27,15 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
   %t = zext <8 x i8> %z to <8 x i32>
   ret <8 x i32> %t
 }
+
+; PR17654
+define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
+; CHECK-LABEL: zext_16i8_to_16i16:
+; CHECK: vpxor
+; CHECK: vpunpckhbw
+; CHECK: vpunpcklbw
+; CHECK: vinsertf128
+; CHECK: ret
+  %t = zext <16 x i8> %z to <16 x i16>
+  ret <16 x i16> %t
+}
diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll
index 997fa19..72bdd9d 100644
--- a/test/CodeGen/X86/avx2-arith.ll
+++ b/test/CodeGen/X86/avx2-arith.ll
@@ -148,3 +148,21 @@ define <8 x i32> @mul_const9(<8 x i32> %x) {
   %y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   ret <8 x i32> %y
 }
+
+; CHECK: mul_const10
+; CHECK: vpmulld
+; CHECK: ret
+define <4 x i32> @mul_const10(<4 x i32> %x) {
+  ; %x * 0x01010101
+  %m = mul <4 x i32> %x, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
+  ret <4 x i32> %m
+}
+
+; CHECK: mul_const11
+; CHECK: vpmulld
+; CHECK: ret
+define <4 x i32> @mul_const11(<4 x i32> %x) {
+  ; %x * 0x80808080
+  %m = mul <4 x i32> %x, <i32 2155905152, i32 2155905152, i32 2155905152, i32 2155905152>
+  ret <4 x i32> %m
+}
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index 3ce08dc..f49718e 100644
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -63,6 +63,34 @@ define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
   ret <8 x i32>%B
 }
 
+; CHECK-LABEL: zext_16i8_16i16:
+; CHECK: vpmovzxbw
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <16 x i16> @zext_16i8_16i16(<16 x i8> %z) {
+  %t = zext <16 x i8> %z to <16 x i16>
+  ret <16 x i16> %t
+}
+
+; CHECK-LABEL: sext_16i8_16i16:
+; CHECK: vpmovsxbw
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <16 x i16> @sext_16i8_16i16(<16 x i8> %z) {
+  %t = sext <16 x i8> %z to <16 x i16>
+  ret <16 x i16> %t
+}
+
+; CHECK-LABEL: trunc_16i16_16i8:
+; CHECK: vpshufb
+; CHECK: vpshufb
+; CHECK: vpor
+; CHECK: ret
+define <16 x i8> @trunc_16i16_16i8(<16 x i16> %z) {
+  %t = trunc <16 x i16> %z to <16 x i8>
+  ret <16 x i8> %t
+}
+
 ; CHECK: load_sext_test1
 ; CHECK: vpmovsxdq (%r{{[^,]*}}), %ymm{{.*}}
 ; CHECK: ret 
diff --git a/test/CodeGen/X86/avx2-palignr.ll b/test/CodeGen/X86/avx2-palignr.ll
index 176e02c..83573dc 100644
--- a/test/CodeGen/X86/avx2-palignr.ll
+++ b/test/CodeGen/X86/avx2-palignr.ll
@@ -51,7 +51,7 @@ define <16 x i16> @test7(<16 x i16> %A, <16 x i16> %B) nounwind {
 
 define <32 x i8> @test8(<32 x i8> %A, <32 x i8> %B) nounwind {
 ; CHECK-LABEL: test8:
-; CHECK: palignr $5
+; CHECK: vpalignr $5
   %C = shufflevector <32 x i8> %A, <32 x i8> %B, <32 x i32> <i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
   ret <32 x i8> %C
 }
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index a978d93..5592e6c 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -121,7 +121,7 @@ entry:
 }
 
 ; CHECK-LABEL: test_sraw_3:
-; CHECK: vpsraw  $16, %ymm0, %ymm0
+; CHECK: vpsraw  $15, %ymm0, %ymm0
 ; CHECK: ret
 
 define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
@@ -151,7 +151,7 @@ entry:
 }
 
 ; CHECK-LABEL: test_srad_3:
-; CHECK: vpsrad  $32, %ymm0, %ymm0
+; CHECK: vpsrad  $31, %ymm0, %ymm0
 ; CHECK: ret
 
 ; SSE Logical Shift Right
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
new file mode 100644
index 0000000..e27600e
--- /dev/null
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -0,0 +1,271 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: addpd512
+; CHECK: vaddpd
+; CHECK: ret
+define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
+entry:
+  %add.i = fadd <8 x double> %x, %y
+  ret <8 x double> %add.i
+}
+
+; CHECK-LABEL: addpd512fold
+; CHECK: vaddpd LCP{{.*}}(%rip)
+; CHECK: ret
+define <8 x double> @addpd512fold(<8 x double> %y) {
+entry:
+  %add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00>
+  ret <8 x double> %add.i
+}
+
+; CHECK-LABEL: addps512
+; CHECK: vaddps
+; CHECK: ret
+define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
+entry:
+  %add.i = fadd <16 x float> %x, %y
+  ret <16 x float> %add.i
+}
+
+; CHECK-LABEL: addps512fold
+; CHECK: vaddps LCP{{.*}}(%rip)
+; CHECK: ret
+define <16 x float> @addps512fold(<16 x float> %y) {
+entry:
+  %add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000,  float 0x4002666660000000, float 0x3FF3333340000000>
+  ret <16 x float> %add.i
+}
+
+; CHECK-LABEL: subpd512
+; CHECK: vsubpd
+; CHECK: ret
+define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
+entry:
+  %sub.i = fsub <8 x double> %x, %y
+  ret <8 x double> %sub.i
+}
+
+; CHECK-LABEL: @subpd512fold
+; CHECK: vsubpd (%
+; CHECK: ret
+define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) {
+entry:
+  %tmp2 = load <8 x double>* %x, align 8
+  %sub.i = fsub <8 x double> %y, %tmp2
+  ret <8 x double> %sub.i
+}
+
+; CHECK-LABEL: @subps512
+; CHECK: vsubps
+; CHECK: ret
+define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
+entry:
+  %sub.i = fsub <16 x float> %x, %y
+  ret <16 x float> %sub.i
+}
+
+; CHECK-LABEL: subps512fold
+; CHECK: vsubps (%
+; CHECK: ret
+define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) {
+entry:
+  %tmp2 = load <16 x float>* %x, align 4
+  %sub.i = fsub <16 x float> %y, %tmp2
+  ret <16 x float> %sub.i
+}
+
+; CHECK-LABEL: imulq512
+; CHECK: vpmuludq
+; CHECK: vpmuludq
+; CHECK: ret
+define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
+  %z = mul <8 x i64>%x, %y
+  ret <8 x i64>%z
+}
+
+; CHECK-LABEL: mulpd512
+; CHECK: vmulpd
+; CHECK: ret
+define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
+entry:
+  %mul.i = fmul <8 x double> %x, %y
+  ret <8 x double> %mul.i
+}
+
+; CHECK-LABEL: mulpd512fold
+; CHECK: vmulpd LCP{{.*}}(%rip)
+; CHECK: ret
+define <8 x double> @mulpd512fold(<8 x double> %y) {
+entry:
+  %mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
+  ret <8 x double> %mul.i
+}
+
+; CHECK-LABEL: mulps512
+; CHECK: vmulps
+; CHECK: ret
+define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
+entry:
+  %mul.i = fmul <16 x float> %x, %y
+  ret <16 x float> %mul.i
+}
+
+; CHECK-LABEL: mulps512fold
+; CHECK: vmulps LCP{{.*}}(%rip)
+; CHECK: ret
+define <16 x float> @mulps512fold(<16 x float> %y) {
+entry:
+  %mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
+  ret <16 x float> %mul.i
+}
+
+; CHECK-LABEL: divpd512
+; CHECK: vdivpd
+; CHECK: ret
+define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
+entry:
+  %div.i = fdiv <8 x double> %x, %y
+  ret <8 x double> %div.i
+}
+
+; CHECK-LABEL: divpd512fold
+; CHECK: vdivpd LCP{{.*}}(%rip)
+; CHECK: ret
+define <8 x double> @divpd512fold(<8 x double> %y) {
+entry:
+  %div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
+  ret <8 x double> %div.i
+}
+
+; CHECK-LABEL: divps512
+; CHECK: vdivps
+; CHECK: ret
+define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
+entry:
+  %div.i = fdiv <16 x float> %x, %y
+  ret <16 x float> %div.i
+}
+
+; CHECK-LABEL: divps512fold
+; CHECK: vdivps LCP{{.*}}(%rip)
+; CHECK: ret
+define <16 x float> @divps512fold(<16 x float> %y) {
+entry:
+  %div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000>
+  ret <16 x float> %div.i
+}
+
+; CHECK-LABEL: vpaddq_test
+; CHECK: vpaddq %zmm
+; CHECK: ret
+define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+  %x = add <8 x i64> %i, %j
+  ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vpaddd_test
+; CHECK: vpaddd %zmm
+; CHECK: ret
+define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+  %x = add <16 x i32> %i, %j
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpsubq_test
+; CHECK: vpsubq %zmm
+; CHECK: ret
+define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+  %x = sub <8 x i64> %i, %j
+  ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vpsubd_test
+; CHECK: vpsubd
+; CHECK: ret
+define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+  %x = sub <16 x i32> %i, %j
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpmulld_test
+; CHECK: vpmulld %zmm
+; CHECK: ret
+define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
+  %x = mul <16 x i32> %i, %j
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: sqrtA
+; CHECK: vsqrtssz
+; CHECK: ret
+declare float @sqrtf(float) readnone
+define float @sqrtA(float %a) nounwind uwtable readnone ssp {
+entry:
+  %conv1 = tail call float @sqrtf(float %a) nounwind readnone
+  ret float %conv1
+}
+
+; CHECK-LABEL: sqrtB
+; CHECK: vsqrtsdz
+; CHECK: ret
+declare double @sqrt(double) readnone
+define double @sqrtB(double %a) nounwind uwtable readnone ssp {
+entry:
+  %call = tail call double @sqrt(double %a) nounwind readnone
+  ret double %call
+}
+
+; CHECK-LABEL: sqrtC
+; CHECK: vsqrtssz
+; CHECK: ret
+declare float @llvm.sqrt.f32(float)
+define float @sqrtC(float %a) nounwind {
+  %b = call float @llvm.sqrt.f32(float %a)
+  ret float %b
+}
+
+; CHECK-LABEL: fadd_broadcast
+; CHECK: LCP{{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK: ret
+define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
+  %b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+  ret <16 x float> %b
+}
+
+; CHECK-LABEL: addq_broadcast
+; CHECK: vpaddq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK: ret
+define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
+  %b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  ret <8 x i64> %b
+}
+
+; CHECK-LABEL: orq_broadcast
+; CHECK: vporq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK: ret
+define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
+  %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  ret <8 x i64> %b
+}
+
+; CHECK-LABEL: andd512fold
+; CHECK: vpandd (%
+; CHECK: ret
+define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
+entry:
+  %a = load <16 x i32>* %x, align 4
+  %b = and <16 x i32> %y, %a
+  ret <16 x i32> %b
+}
+
+; CHECK-LABEL: andqbrst
+; CHECK: vpandq  (%rdi){1to8}, %zmm
+; CHECK: ret
+define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
+entry:
+  %a = load i64* %ap, align 8
+  %b = insertelement <8 x i64> undef, i64 %a, i32 0
+  %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
+  %d = and <8 x i64> %p1, %c
+  ret <8 x i64>%d
+}
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
new file mode 100644
index 0000000..bc4560b
--- /dev/null
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vpxord
+; CHECK: ret
+define <16 x i32> @test1(i32* %x) {
+   %y = load i32* %x, align 4
+   %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4
+   ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test2
+; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
+; CHECK: ret
+define <16 x i32> @test2(<16 x i32> %x) {
+   %res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
+   ret <16 x i32>%res
+}
+\ No newline at end of file
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
new file mode 100644
index 0000000..ba52745
--- /dev/null
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK: vucomisdz
+define double @test1(double %a, double %b) nounwind {
+  %tobool = fcmp une double %a, %b
+  br i1 %tobool, label %l1, label %l2
+
+l1:
+  %c = fsub double %a, %b
+  ret double %c
+l2:
+  %c1 = fadd double %a, %b
+  ret double %c1
+}
+
+; CHECK: vucomissz
+define float @test2(float %a, float %b) nounwind {
+  %tobool = fcmp olt float %a, %b
+  br i1 %tobool, label %l1, label %l2
+
+l1:
+  %c = fsub float %a, %b
+  ret float %c
+l2:
+  %c1 = fadd float %a, %b
+  ret float %c1
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
new file mode 100644
index 0000000..ed68ff7
--- /dev/null
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -0,0 +1,217 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: sitof32
+; CHECK: vcvtdq2ps %zmm
+; CHECK: ret
+define <16 x float> @sitof32(<16 x i32> %a) nounwind {
+  %b = sitofp <16 x i32> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+; CHECK-LABEL: fptosi00
+; CHECK: vcvttps2dq %zmm
+; CHECK: ret
+define <16 x i32> @fptosi00(<16 x float> %a) nounwind {
+  %b = fptosi <16 x float> %a to <16 x i32>
+  ret <16 x i32> %b
+}
+
+; CHECK-LABEL: fptoui00
+; CHECK: vcvttps2udq
+; CHECK: ret
+define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
+  %b = fptoui <16 x float> %a to <16 x i32>
+  ret <16 x i32> %b
+}
+
+; CHECK-LABEL: fptoui01
+; CHECK: vcvttpd2udq
+; CHECK: ret
+define <8 x i32> @fptoui01(<8 x double> %a) nounwind {
+  %b = fptoui <8 x double> %a to <8 x i32>
+  ret <8 x i32> %b
+}
+
+; CHECK-LABEL: sitof64
+; CHECK: vcvtdq2pd %ymm
+; CHECK: ret
+define <8 x double> @sitof64(<8 x i32> %a) {
+  %b = sitofp <8 x i32> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+; CHECK-LABEL: fptosi01
+; CHECK: vcvttpd2dq %zmm
+; CHECK: ret
+define <8 x i32> @fptosi01(<8 x double> %a) {
+  %b = fptosi <8 x double> %a to <8 x i32>
+  ret <8 x i32> %b
+}
+
+; CHECK-LABEL: fptrunc00
+; CHECK: vcvtpd2ps %zmm
+; CHECK-NEXT: vcvtpd2ps %zmm
+; CHECK-NEXT: vinsertf64x4    $1
+; CHECK: ret
+define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
+  %a = fptrunc <16 x double> %b to <16 x float>
+  ret <16 x float> %a
+}
+
+; CHECK-LABEL: fpext00
+; CHECK: vcvtps2pd %ymm0, %zmm0
+; CHECK: ret
+define <8 x double> @fpext00(<8 x float> %b) nounwind {
+  %a = fpext <8 x float> %b to <8 x double>
+  ret <8 x double> %a
+}
+
+; CHECK-LABEL: funcA
+; CHECK: vcvtsi2sdqz (%
+; CHECK: ret
+define double @funcA(i64* nocapture %e) {
+entry:
+  %tmp1 = load i64* %e, align 8
+  %conv = sitofp i64 %tmp1 to double
+  ret double %conv
+}
+
+; CHECK-LABEL: funcB
+; CHECK: vcvtsi2sdlz (%
+; CHECK: ret
+define double @funcB(i32* %e) {
+entry:
+  %tmp1 = load i32* %e, align 4
+  %conv = sitofp i32 %tmp1 to double
+  ret double %conv
+}
+
+; CHECK-LABEL: funcC
+; CHECK: vcvtsi2sslz (%
+; CHECK: ret
+define float @funcC(i32* %e) {
+entry:
+  %tmp1 = load i32* %e, align 4
+  %conv = sitofp i32 %tmp1 to float
+  ret float %conv
+}
+
+; CHECK-LABEL: i64tof32
+; CHECK: vcvtsi2ssqz  (%
+; CHECK: ret
+define float @i64tof32(i64* %e) {
+entry:
+  %tmp1 = load i64* %e, align 8
+  %conv = sitofp i64 %tmp1 to float
+  ret float %conv
+}
+
+; CHECK-LABEL: fpext
+; CHECK: vcvtss2sdz
+; CHECK: ret
+define void @fpext() {
+entry:
+  %f = alloca float, align 4
+  %d = alloca double, align 8
+  %tmp = load float* %f, align 4
+  %conv = fpext float %tmp to double
+  store double %conv, double* %d, align 8
+  ret void
+}
+
+; CHECK-LABEL: fpround_scalar
+; CHECK: vmovsdz
+; CHECK: vcvtsd2ssz
+; CHECK: vmovssz
+; CHECK: ret
+define void @fpround_scalar() nounwind uwtable {
+entry:
+  %f = alloca float, align 4
+  %d = alloca double, align 8
+  %tmp = load double* %d, align 8
+  %conv = fptrunc double %tmp to float
+  store float %conv, float* %f, align 4
+  ret void
+}
+
+; CHECK-LABEL: long_to_double
+; CHECK: vmovqz
+; CHECK: ret
+define double @long_to_double(i64 %x) {
+   %res = bitcast i64 %x to double
+   ret double %res
+}
+
+; CHECK-LABEL: double_to_long
+; CHECK: vmovqz
+; CHECK: ret
+define i64 @double_to_long(double %x) {
+   %res = bitcast double %x to i64
+   ret i64 %res
+}
+
+; CHECK-LABEL: int_to_float
+; CHECK: vmovdz
+; CHECK: ret
+define float @int_to_float(i32 %x) {
+   %res = bitcast i32 %x to float
+   ret float %res
+}
+
+; CHECK-LABEL: float_to_int
+; CHECK: vmovdz
+; CHECK: ret
+define i32 @float_to_int(float %x) {
+   %res = bitcast float %x to i32
+   ret i32 %res
+}
+
+; CHECK-LABEL: uitof64
+; CHECK: vcvtudq2pd
+; CHECK: vextracti64x4
+; CHECK: vcvtudq2pd
+; CHECK: ret
+define <16 x double> @uitof64(<16 x i32> %a) nounwind {
+  %b = uitofp <16 x i32> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+; CHECK-LABEL: uitof32
+; CHECK: vcvtudq2ps
+; CHECK: ret
+define <16 x float> @uitof32(<16 x i32> %a) nounwind {
+  %b = uitofp <16 x i32> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+; CHECK-LABEL: @fptosi02
+; CHECK vcvttss2siz
+; CHECK: ret
+define i32 @fptosi02(float %a) nounwind {
+  %b = fptosi float %a to i32
+  ret i32 %b
+}
+
+; CHECK-LABEL: @fptoui02
+; CHECK vcvttss2usiz
+; CHECK: ret
+define i32 @fptoui02(float %a) nounwind {
+  %b = fptoui float %a to i32
+  ret i32 %b
+}
+
+; CHECK-LABEL: @uitofp02
+; CHECK vcvtusi2ss
+; CHECK: ret
+define float @uitofp02(i32 %a) nounwind {
+  %b = uitofp i32 %a to float
+  ret float %b
+}
+
+; CHECK-LABEL: @uitofp03
+; CHECK vcvtusi2sd
+; CHECK: ret
+define double @uitofp03(i32 %a) nounwind {
+  %b = uitofp i32 %a to double
+  ret double %b
+}
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
new file mode 100644
index 0000000..ce3d759
--- /dev/null
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -0,0 +1,97 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_x86_vfmadd_ps_z
+  ; CHECK: vfmadd213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_x86_vfmadd_pd_z
+  ; CHECK: vfmadd213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_x86_vfmsubps_z
+  ; CHECK: vfmsub213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_x86_vfmsubpd_z
+  ; CHECK: vfmsub213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_x86_vfnmadd_ps_z
+  ; CHECK: vfnmadd213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_x86_vfnmadd_pd_z
+  ; CHECK: vfnmadd213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_x86_vfnmsubps_z
+  ; CHECK: vfnmsub213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_x86_vfnmsubpd_z
+  ; CHECK: vfnmsub213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_x86_vfmaddsubps_z
+  ; CHECK: vfmaddsub213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_x86_vfmaddsubpd_z
+  ; CHECK: vfmaddsub213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK-LABEL: test_x86_vfmsubaddps_z
+  ; CHECK: vfmsubadd213ps %zmm
+  %res = call <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK-LABEL: test_x86_vfmsubaddpd_z
+  ; CHECK: vfmsubadd213pd %zmm
+  %res = call <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/avx512-fma.ll b/test/CodeGen/X86/avx512-fma.ll
new file mode 100644
index 0000000..d6926e2
--- /dev/null
+++ b/test/CodeGen/X86/avx512-fma.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -fp-contract=fast | FileCheck %s
+
+; CHECK-LABEL: test_x86_fmadd_ps_z
+; CHECK: vfmadd213ps     %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  %x = fmul <16 x float> %a0, %a1
+  %res = fadd <16 x float> %x, %a2
+  ret <16 x float> %res
+}
+
+; CHECK-LABEL: test_x86_fmsub_ps_z
+; CHECK: vfmsub213ps     %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  %x = fmul <16 x float> %a0, %a1
+  %res = fsub <16 x float> %x, %a2
+  ret <16 x float> %res
+}
+
+; CHECK-LABEL: test_x86_fnmadd_ps_z
+; CHECK: vfnmadd213ps     %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  %x = fmul <16 x float> %a0, %a1
+  %res = fsub <16 x float> %a2, %x
+  ret <16 x float> %res
+}
+
+; CHECK-LABEL: test_x86_fnmsub_ps_z
+; CHECK: vfnmsub213ps     %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+  %x = fmul <16 x float> %a0, %a1
+  %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, 
+                          float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+						  float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, 
+						  float -0.000000e+00>, %x
+  %res = fsub <16 x float> %y, %a2
+  ret <16 x float> %res
+}
+
+; CHECK-LABEL: test_x86_fmadd_pd_z
+; CHECK: vfmadd213pd     %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  %x = fmul <8 x double> %a0, %a1
+  %res = fadd <8 x double> %x, %a2
+  ret <8 x double> %res
+}
+
+; CHECK-LABEL: test_x86_fmsub_pd_z
+; CHECK: vfmsub213pd     %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+  %x = fmul <8 x double> %a0, %a1
+  %res = fsub <8 x double> %x, %a2
+  ret <8 x double> %res
+}
+
+define double @test_x86_fmsub_sd_z(double %a0, double %a1, double %a2) {
+  %x = fmul double %a0, %a1
+  %res = fsub double %x, %a2
+  ret double %res
+}
+
+;CHECK-LABEL: test132_br
+;CHECK: vfmadd132ps  LCP{{.*}}(%rip){1to16}
+;CHECK: ret
+define <16 x float> @test132_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+  %b1 = fmul <16 x float> %a1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+  %b2 = fadd <16 x float> %b1, %a2
+  ret <16 x float> %b2
+}
+
+;CHECK-LABEL: test213_br
+;CHECK: vfmadd213ps  LCP{{.*}}(%rip){1to16}
+;CHECK: ret
+define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+  %b1 = fmul <16 x float> %a1, %a2
+  %b2 = fadd <16 x float> %b1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+  ret <16 x float> %b2
+}
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
new file mode 100644
index 0000000..0321e95
--- /dev/null
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -0,0 +1,225 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float>, i16, <16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dps.mask.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double>, i8, <8 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpd.mask.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
+
+declare <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qps.mask.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
+
+;CHECK-LABEL: gather_mask_dps
+;CHECK: kmovw
+;CHECK: vgatherdps
+;CHECK: vpadd
+;CHECK: vscatterdps
+;CHECK: ret
+define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf)  {
+  %x = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
+  ret void
+}
+
+;CHECK-LABEL: gather_mask_dpd
+;CHECK: kmovw
+;CHECK: vgatherdpd
+;CHECK: vpadd
+;CHECK: vscatterdpd
+;CHECK: ret
+define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf)  {
+  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
+  ret void
+}
+
+;CHECK-LABEL: gather_mask_qps
+;CHECK: kmovw
+;CHECK: vgatherqps
+;CHECK: vpadd
+;CHECK: vscatterqps
+;CHECK: ret
+define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf)  {
+  %x = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
+  ret void
+}
+
+;CHECK-LABEL: gather_mask_qpd
+;CHECK: kmovw
+;CHECK: vgatherqpd
+;CHECK: vpadd
+;CHECK: vscatterqpd
+;CHECK: ret
+define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf)  {
+  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
+  ret void
+}
+;;
+;; Integer Gather/Scatter
+;;
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32>, i16, <16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpi.mask.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64>, i8, <8 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpq.mask.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
+
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpi.mask.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
+
+;CHECK-LABEL: gather_mask_dd
+;CHECK: kmovw
+;CHECK: vpgatherdd
+;CHECK: vpadd
+;CHECK: vpscatterdd
+;CHECK: ret
+define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf)  {
+  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpi.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
+  ret void
+}
+
+;CHECK-LABEL: gather_mask_qd
+;CHECK: kmovw
+;CHECK: vpgatherqd
+;CHECK: vpadd
+;CHECK: vpscatterqd
+;CHECK: ret
+define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf)  {
+  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpi.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
+  ret void
+}
+
+;CHECK-LABEL: gather_mask_qq
+;CHECK: kmovw
+;CHECK: vpgatherqq
+;CHECK: vpadd
+;CHECK: vpscatterqq
+;CHECK: ret
+define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf)  {
+  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
+  ret void
+}
+
+;CHECK-LABEL: gather_mask_dq
+;CHECK: kmovw
+;CHECK: vpgatherdq
+;CHECK: vpadd
+;CHECK: vpscatterdq
+;CHECK: ret
+define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf)  {
+  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
+  ret void
+}
+
+;; FP Intinsics without masks
+
+declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dps.512 (i8*, <16 x i32>, <16 x float>, i32)
+declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qps.512 (i8*, <8 x i64>, <8 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, <8 x i64>, <8 x double>, i32)
+
+;CHECK-LABEL: gather_dps
+;CHECK: kxnorw
+;CHECK: vgatherdps
+;CHECK: vscatterdps
+;CHECK: ret
+define void @gather_dps(<16 x i32> %ind, i8* %base, i8* %stbuf)  {
+  %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, <16 x i32>%ind2, <16 x float> %x, i32 4)
+  ret void
+}
+
+;CHECK-LABEL: gather_qps
+;CHECK: kxnorw
+;CHECK: vgatherqps
+;CHECK: vscatterqps
+;CHECK: ret
+define void @gather_qps(<8 x i64> %ind, i8* %base, i8* %stbuf)  {
+  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, <8 x i64>%ind2, <8 x float> %x, i32 4)
+  ret void
+}
+
+;CHECK-LABEL: gather_qpd
+;CHECK: kxnorw
+;CHECK: vgatherqpd
+;CHECK: vpadd
+;CHECK: vscatterqpd
+;CHECK: ret
+define void @gather_qpd(<8 x i64> %ind, i8* %base, i8* %stbuf)  {
+  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, <8 x i64>%ind2, <8 x double> %x, i32 4)
+  ret void
+}
+
+;; Integer Intinsics without masks
+
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, <16 x i32>, <16 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, <8 x i32>, <8 x i64>, i32)
+
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, <8 x i64>, <8 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, <8 x i64>, <8 x i64>, i32)
+
+;CHECK-LABEL: gather_dpi
+;CHECK: kxnorw
+;CHECK: vpgatherdd
+;CHECK: vpscatterdd
+;CHECK: ret
+define void @gather_dpi(<16 x i32> %ind, i8* %base, i8* %stbuf)  {
+  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>%ind, i8* %base, i32 4)
+  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, <16 x i32>%ind2, <16 x i32> %x, i32 4)
+  ret void
+}
+
+;CHECK-LABEL: gather_qpq
+;CHECK: vpxord  %zmm
+;CHECK: kxnorw
+;CHECK: vpgatherqq
+;CHECK: vpadd
+;CHECK: vpscatterqq
+;CHECK: ret
+define void @gather_qpq(<8 x i64> %ind, i8* %base, i8* %stbuf)  {
+  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i64> %x, i32 4)
+  ret void
+}
+
+;CHECK-LABEL: gather_qpi
+;CHECK: vpxor %ymm
+;CHECK: kxnorw
+;CHECK: vpgatherqd
+;CHECK: vpadd
+;CHECK: vpscatterqd
+;CHECK: ret
+define void @gather_qpi(<8 x i64> %ind, i8* %base, i8* %stbuf)  {
+  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>%ind, i8* %base, i32 4)
+  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+  call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i32> %x, i32 4)
+  ret void
+}
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 189bdd7..3f06740 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-;CHECK: test1
+;CHECK-LABEL: test1:
 ;CHECK: vinsertps
 ;CHECK: vinsertf32x4
 ;CHECK: ret
@@ -11,7 +11,7 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
   ret <16 x float> %rrr3
 }
 
-;CHECK: test2
+;CHECK-LABEL: test2:
 ;CHECK: vinsertf32x4
 ;CHECK: vextractf32x4
 ;CHECK: vinsertf32x4
@@ -23,7 +23,7 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
   ret <8 x double> %rrr3
 }
 
-;CHECK: test3
+;CHECK-LABEL: test3:
 ;CHECK: vextractf32x4
 ;CHECK: vinsertf32x4
 ;CHECK: ret
@@ -33,7 +33,7 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
   ret <16 x float> %rrr2
 }
 
-;CHECK: test4
+;CHECK-LABEL: test4:
 ;CHECK: vextracti32x4
 ;CHECK: vinserti32x4
 ;CHECK: ret
@@ -43,7 +43,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
   ret <8 x i64> %rrr2
 }
 
-;CHECK: test5
+;CHECK-LABEL: test5:
 ;CHECK: vextractpsz
 ;CHECK: ret
 define i32 @test5(<4 x float> %x) nounwind {
@@ -52,7 +52,7 @@ define i32 @test5(<4 x float> %x) nounwind {
   ret i32 %ei
 }
 
-;CHECK: test6
+;CHECK-LABEL: test6:
 ;CHECK: vextractpsz {{.*}}, (%rdi)
 ;CHECK: ret
 define void @test6(<4 x float> %x, float* %out) nounwind {
@@ -61,3 +61,65 @@ define void @test6(<4 x float> %x, float* %out) nounwind {
   ret void
 }
 
+;CHECK-LABEL: test7
+;CHECK: vmovdz
+;CHECK: vpermps %zmm
+;CHECK: ret
+define float @test7(<16 x float> %x, i32 %ind) nounwind {
+  %e = extractelement <16 x float> %x, i32 %ind
+  ret float %e
+}
+
+;CHECK-LABEL: test8
+;CHECK: vmovqz
+;CHECK: vpermpd %zmm
+;CHECK: ret
+define double @test8(<8 x double> %x, i32 %ind) nounwind {
+  %e = extractelement <8 x double> %x, i32 %ind
+  ret double %e
+}
+
+;CHECK-LABEL: test9
+;CHECK: vmovd
+;CHECK: vpermps %ymm
+;CHECK: ret
+define float @test9(<8 x float> %x, i32 %ind) nounwind {
+  %e = extractelement <8 x float> %x, i32 %ind
+  ret float %e
+}
+
+;CHECK-LABEL: test10
+;CHECK: vmovdz
+;CHECK: vpermd %zmm
+;CHEKK: vmovdz  %xmm0, %eax
+;CHECK: ret
+define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
+  %e = extractelement <16 x i32> %x, i32 %ind
+  ret i32 %e
+}
+
+;CHECK-LABEL: test11
+;CHECK: movl    $260
+;CHECK: bextrl
+;CHECK: movl    $268
+;CHECK: bextrl
+;CHECK: ret
+define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
+  %cmp_res = icmp ult <16 x i32> %a, %b
+  %ia = extractelement <16 x i1> %cmp_res, i32 4
+  %ib = extractelement <16 x i1> %cmp_res, i32 12
+
+  br i1 %ia, label %A, label %B
+
+  A:
+    ret <16 x i32>%b
+  B:
+   %c = add <16 x i32>%b, %a
+  br i1 %ib, label %C, label %D
+  C:
+   %c1 = sub <16 x i32>%c, %a
+   ret <16 x i32>%c1
+  D:
+   %c2 = mul <16 x i32>%c, %a
+   ret <16 x i32>%c2
+}
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
new file mode 100644
index 0000000..5bdabf2
--- /dev/null
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -0,0 +1,374 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare i32 @llvm.x86.avx512.kortestz(i16, i16) nounwind readnone
+; CHECK: test_kortestz
+; CHECK: kortestw
+; CHECK: sete
+define i32 @test_kortestz(i16 %a0, i16 %a1) {
+  %res = call i32 @llvm.x86.avx512.kortestz(i16 %a0, i16 %a1) 
+  ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.kortestc(i16, i16) nounwind readnone
+; CHECK: test_kortestc
+; CHECK: kortestw
+; CHECK: sbbl
+define i32 @test_kortestc(i16 %a0, i16 %a1) {
+  %res = call i32 @llvm.x86.avx512.kortestc(i16 %a0, i16 %a1) 
+  ret i32 %res
+}
+
+define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
+  ; CHECK: vrcp14ps
+  %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>) nounwind readnone
+
+define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
+  ; CHECK: vrcp14pd
+  %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>) nounwind readnone
+
+define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
+  ; CHECK: vrcp28ps
+  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rcp28.ps.512(<16 x float>) nounwind readnone
+
+define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
+  ; CHECK: vrcp28pd
+  %res = call <8 x double> @llvm.x86.avx512.rcp28.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rcp28.pd.512(<8 x double>) nounwind readnone
+
+define <8 x double> @test_rndscale_pd_512(<8 x double> %a0) {
+  ; CHECK: vrndscale
+  %res = call <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double> %a0, i32 7) ; <<8 x double>> [#uses=1]
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double>, i32) nounwind readnone
+
+
+define <16 x float> @test_rndscale_ps_512(<16 x float> %a0) {
+  ; CHECK: vrndscale
+  %res = call <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float> %a0, i32 7) ; <<16 x float>> [#uses=1]
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float>, i32) nounwind readnone
+
+
+define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
+  ; CHECK: vrsqrt14ps
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>) nounwind readnone
+
+define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) {
+  ; CHECK: vrsqrt28ps
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rsqrt28.ps.512(<16 x float>) nounwind readnone
+
+define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
+  ; CHECK: vrsqrt14ss
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>) nounwind readnone
+
+define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
+  ; CHECK: vrsqrt28ss
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>) nounwind readnone
+
+define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
+  ; CHECK: vrcp14ss
+  %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>) nounwind readnone
+
+define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
+  ; CHECK: vrcp28ss
+  %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>) nounwind readnone
+
+define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
+  ; CHECK: vsqrtpd
+  %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>) nounwind readnone
+
+define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
+  ; CHECK: vsqrtps
+  %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone
+
+define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: vsqrtssz
+  %res = call <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: vsqrtsdz
+  %res = call <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
+  ; CHECK: vcvtsd2siz
+  %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
+  ; CHECK: vcvtsi2sdqz
+  %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
+
+define <2 x double> @test_x86_avx512_cvtusi642sd(<2 x double> %a0, i64 %a1) {
+  ; CHECK: vcvtusi2sdqz
+  %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double>, i64) nounwind readnone
+
+define i64 @test_x86_sse2_cvttsd2si64(<2 x double> %a0) {
+  ; CHECK: vcvttsd2siz
+  %res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
+
+
+define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
+  ; CHECK: vcvtss2siz
+  %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
+  ; CHECK: vcvtsi2ssqz
+  %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
+
+
+define i64 @test_x86_sse_cvttss2si64(<4 x float> %a0) {
+  ; CHECK: vcvttss2siz
+  %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ; <i64> [#uses=1]
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
+
+define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
+  ; CHECK: vcvtsd2usiz
+  %res = call i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double> %a0) ; <i64> [#uses=1]
+  ret i64 %res
+}
+declare i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
+  ; CHECK: vcvtph2ps
+  %res = call <16 x float> @llvm.x86.avx512.vcvtph2ps.512(<16 x i16> %a0)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.vcvtph2ps.512(<16 x i16>) nounwind readonly
+
+
+define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
+  ; CHECK: vcvtps2ph
+  %res = call <16 x i16> @llvm.x86.avx512.vcvtps2ph.512(<16 x float> %a0, i32 0)
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx512.vcvtps2ph.512(<16 x float>, i32) nounwind readonly
+
+define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
+  ; CHECK: vbroadcastss
+  %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8* %a0) ; <<16 x float>> [#uses=1]
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8*) nounwind readonly
+
+define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
+  ; CHECK: vbroadcastsd
+  %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1]
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
+
+define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0) {
+  ; CHECK: vbroadcastss
+  %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.ps.512(<4 x float> %a0) ; <<16 x float>> [#uses=1]
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.ps.512(<4 x float>) nounwind readonly
+
+define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0) {
+  ; CHECK: vbroadcastsd
+  %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.pd.512(<2 x double> %a0) ; <<8 x double>> [#uses=1]
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.pd.512(<2 x double>) nounwind readonly
+
+define <16 x i32> @test_x86_pbroadcastd_512(<4 x i32>  %a0) {
+  ; CHECK: vpbroadcastd
+  %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %a0) ; <<16 x i32>> [#uses=1]
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>) nounwind readonly
+
+define <16 x i32> @test_x86_pbroadcastd_i32_512(i32  %a0) {
+  ; CHECK: vpbroadcastd
+  %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32 %a0) ; <<16 x i32>> [#uses=1]
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32) nounwind readonly
+
+define <8 x i64> @test_x86_pbroadcastq_512(<2 x i64> %a0) {
+  ; CHECK: vpbroadcastq
+  %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %a0) ; <<8 x i64>> [#uses=1]
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>) nounwind readonly
+
+define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
+  ; CHECK: vpbroadcastq
+  %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64 %a0) ; <<8 x i64>> [#uses=1]
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly
+
+define <16 x i32> @test_x86_pmaxu_d(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK: vpmaxud 
+  %res = call <16 x i32> @llvm.x86.avx512.pmaxu.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pmaxu.d(<16 x i32>, <16 x i32>) nounwind readonly
+
+define <8 x i64> @test_x86_pmaxu_q(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK: vpmaxuq
+  %res = call <8 x i64> @llvm.x86.avx512.pmaxu.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pmaxu.q(<8 x i64>, <8 x i64>) nounwind readonly
+
+define <16 x i32> @test_x86_pmaxs_d(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK: vpmaxsd
+  %res = call <16 x i32> @llvm.x86.avx512.pmaxs.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pmaxs.d(<16 x i32>, <16 x i32>) nounwind readonly
+
+define <8 x i64> @test_x86_pmaxs_q(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK: vpmaxsq
+  %res = call <8 x i64> @llvm.x86.avx512.pmaxs.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pmaxs.q(<8 x i64>, <8 x i64>) nounwind readonly
+
+define <16 x i32> @test_x86_pminu_d(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK: vpminud
+  %res = call <16 x i32> @llvm.x86.avx512.pminu.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pminu.d(<16 x i32>, <16 x i32>) nounwind readonly
+
+define <8 x i64> @test_x86_pminu_q(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK: vpminuq
+  %res = call <8 x i64> @llvm.x86.avx512.pminu.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pminu.q(<8 x i64>, <8 x i64>) nounwind readonly
+
+define <16 x i32> @test_x86_pmins_d(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK: vpminsd
+  %res = call <16 x i32> @llvm.x86.avx512.pmins.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pmins.d(<16 x i32>, <16 x i32>) nounwind readonly
+
+define <8 x i64> @test_x86_pmins_q(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK: vpminsq
+  %res = call <8 x i64> @llvm.x86.avx512.pmins.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pmins.q(<8 x i64>, <8 x i64>) nounwind readonly
+
+define <16 x i32> @test_conflict_d(<16 x i32> %a) {
+  ; CHECK: vpconflictd
+  %res = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a)
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>) nounwind readonly
+
+define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
+  ; CHECK: vpconflictd %zmm0, %zmm0 {%k1} {z}
+  %vmask = bitcast i16 %mask to <16 x i1>
+  %res = call <16 x i32> @llvm.x86.avx512.conflict.d.maskz.512(<16 x i1> %vmask, <16 x i32> %a)
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.conflict.d.maskz.512(<16 x i1>,<16 x i32>) nounwind readonly
+
+define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+  ; CHECK: vpconflictq {{.*}} {%k1}
+  %vmask = bitcast i8 %mask to <8 x i1>
+  %res = call <8 x i64> @llvm.x86.avx512.conflict.q.mask.512(<8 x i64> %b, <8 x i1> %vmask, <8 x i64> %a)
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.conflict.q.mask.512(<8 x i64>, <8 x i1>,<8 x i64>) nounwind readonly
+
+define <16 x float> @test_x86_mskblend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
+  ; CHECK: vblendmps
+  %m0 = bitcast i16 %a0 to <16 x i1>
+  %res = call <16 x float> @llvm.x86.avx512.mskblend.ps.512(<16 x i1> %m0, <16 x float> %a1, <16 x float> %a2) ; <<16 x float>> [#uses=1]
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mskblend.ps.512(<16 x i1> %a0, <16 x float> %a1, <16 x float> %a2) nounwind readonly
+
+define <8 x double> @test_x86_mskblend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+  ; CHECK: vblendmpd
+  %m0 = bitcast i8 %a0 to <8 x i1>
+  %res = call <8 x double> @llvm.x86.avx512.mskblend.pd.512(<8 x i1> %m0, <8 x double> %a1, <8 x double> %a2) ; <<8 x double>> [#uses=1]
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mskblend.pd.512(<8 x i1> %a0, <8 x double> %a1, <8 x double> %a2) nounwind readonly
+
+define <16 x i32> @test_x86_mskblend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
+  ; CHECK: vpblendmd
+  %m0 = bitcast i16 %a0 to <16 x i1>
+  %res = call <16 x i32> @llvm.x86.avx512.mskblend.d.512(<16 x i1> %m0, <16 x i32> %a1, <16 x i32> %a2) ; <<16 x i32>> [#uses=1]
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.mskblend.d.512(<16 x i1> %a0, <16 x i32> %a1, <16 x i32> %a2) nounwind readonly
+
+define <8 x i64> @test_x86_mskblend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+  ; CHECK: vpblendmq
+  %m0 = bitcast i8 %a0 to <8 x i1>
+  %res = call <8 x i64> @llvm.x86.avx512.mskblend.q.512(<8 x i1> %m0, <8 x i64> %a1, <8 x i64> %a2) ; <<8 x i64>> [#uses=1]
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.mskblend.q.512(<8 x i1> %a0, <8 x i64> %a1, <8 x i64> %a2) nounwind readonly
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index eec8873..ef5cb56 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -27,8 +27,8 @@ define i16 @mand16(i16 %x, i16 %y) {
   %md = xor <16 x i1> %ma, %mb
   %me = or <16 x i1> %mc, %md
   %ret = bitcast <16 x i1> %me to i16
-; CHECK: kxorw
 ; CHECK: kandw
+; CHECK: kxorw
 ; CHECK: korw
   ret i16 %ret
 }
@@ -55,4 +55,3 @@ define i8 @shuf_test1(i16 %v) nounwind {
    %mask1 = bitcast <8 x i1> %mask to i8
    ret i8 %mask1
 }
-
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
new file mode 100644
index 0000000..91242b1
--- /dev/null
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: @test1
+; CHECK: vmovdz  %xmm0, %eax
+; CHECK: ret
+define i32 @test1(float %x) {
+   %res = bitcast float %x to i32
+   ret i32 %res
+}
+
+; CHECK-LABEL: @test2
+; CHECK: vmovdz  %edi
+; CHECK: ret
+define <4 x i32> @test2(i32 %x) {
+   %res = insertelement <4 x i32>undef, i32 %x, i32 0
+   ret <4 x i32>%res
+}
+
+; CHECK-LABEL: @test3
+; CHECK: vmovqz  %rdi
+; CHECK: ret
+define <2 x i64> @test3(i64 %x) {
+   %res = insertelement <2 x i64>undef, i64 %x, i32 0
+   ret <2 x i64>%res
+}
+
+; CHECK-LABEL: @test4
+; CHECK: vmovdz  (%rdi)
+; CHECK: ret
+define <4 x i32> @test4(i32* %x) {
+   %y = load i32* %x
+   %res = insertelement <4 x i32>undef, i32 %y, i32 0
+   ret <4 x i32>%res
+}
+
+; CHECK-LABEL: @test5
+; CHECK: vmovssz  %xmm0, (%rdi)
+; CHECK: ret
+define void @test5(float %x, float* %y) {
+   store float %x, float* %y, align 4
+   ret void
+}
+
+; CHECK-LABEL: @test6
+; CHECK: vmovsdz  %xmm0, (%rdi)
+; CHECK: ret
+define void @test6(double %x, double* %y) {
+   store double %x, double* %y, align 8
+   ret void
+}
+
+; CHECK-LABEL: @test7
+; CHECK: vmovssz  (%rdi), %xmm0
+; CHECK: ret
+define float @test7(i32* %x) {
+   %y = load i32* %x
+   %res = bitcast i32 %y to float
+   ret float %res
+}
+
+; CHECK-LABEL: @test8
+; CHECK: vmovdz %xmm0, %eax
+; CHECK: ret
+define i32 @test8(<4 x i32> %x) {
+   %res = extractelement <4 x i32> %x, i32 0
+   ret i32 %res
+}
+
+; CHECK-LABEL: @test9
+; CHECK: vmovqz %xmm0, %rax
+; CHECK: ret
+define i64 @test9(<2 x i64> %x) {
+   %res = extractelement <2 x i64> %x, i32 0
+   ret i64 %res
+}
+
+; CHECK-LABEL: @test10
+; CHECK: vmovdz  (%rdi)
+; CHECK: ret
+define <4 x i32> @test10(i32* %x) {
+   %y = load i32* %x, align 4
+   %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
+   ret <4 x i32>%res
+}
+
+; CHECK-LABEL: @test11
+; CHECK: vmovssz  (%rdi)
+; CHECK: ret
+define <4 x float> @test11(float* %x) {
+   %y = load float* %x, align 4
+   %res = insertelement <4 x float>zeroinitializer, float %y, i32 0
+   ret <4 x float>%res
+}
+
+; CHECK-LABEL: @test12
+; CHECK: vmovsdz  (%rdi)
+; CHECK: ret
+define <2 x double> @test12(double* %x) {
+   %y = load double* %x, align 8
+   %res = insertelement <2 x double>zeroinitializer, double %y, i32 0
+   ret <2 x double>%res
+}
+
+; CHECK-LABEL: @test13
+; CHECK: vmovqz  %rdi
+; CHECK: ret
+define <2 x i64> @test13(i64 %x) {
+   %res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0
+   ret <2 x i64>%res
+}
+
+; CHECK-LABEL: @test14
+; CHECK: vmovdz  %edi
+; CHECK: ret
+define <4 x i32> @test14(i32 %x) {
+   %res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0
+   ret <4 x i32>%res
+}
+
+; CHECK-LABEL: @test15
+; CHECK: vmovdz  (%rdi)
+; CHECK: ret
+define <4 x i32> @test15(i32* %x) {
+   %y = load i32* %x, align 4
+   %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
+   ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test16
+; CHECK: vmovdqu32
+; CHECK: ret
+define <16 x i32> @test16(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %res = load <16 x i32>* %vaddr, align 1
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test17
+; CHECK: vmovdqa32
+; CHECK: ret
+define <16 x i32> @test17(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %res = load <16 x i32>* %vaddr, align 64
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test18
+; CHECK: vmovdqa64
+; CHECK: ret
+define void @test18(i8 * %addr, <8 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  store <8 x i64>%data, <8 x i64>* %vaddr, align 64
+  ret void
+}
+
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
new file mode 100644
index 0000000..d2d6681
--- /dev/null
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl  | FileCheck %s
+
+; CHECK-LABEL: select00
+; CHECK: vmovaps
+; CHECK-NEXT: LBB
+define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
+  %cmpres = icmp eq i32 %a, 255
+  %selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b
+  %res = xor <16 x i32> %b, %selres
+  ret <16 x i32> %res
+}
+
+; CHECK-LABEL: select01
+; CHECK: vmovaps
+; CHECK-NEXT: LBB
+define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
+  %cmpres = icmp eq i32 %a, 255
+  %selres = select i1 %cmpres, <8 x i64> zeroinitializer, <8 x i64> %b
+  %res = xor <8 x i64> %b, %selres
+  ret <8 x i64> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-shift.ll b/test/CodeGen/X86/avx512-shift.ll
new file mode 100644
index 0000000..8cdcf8a
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shift.ll
@@ -0,0 +1,108 @@
+;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+;CHECK-LABEL: shift_16_i32
+;CHECK: vpsrld
+;CHECK: vpslld
+;CHECK: vpsrad
+;CHECK: ret
+define <16 x i32> @shift_16_i32(<16 x i32> %a) {
+   %b = lshr <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+   %c = shl <16 x i32> %b, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
+   %d = ashr <16 x i32> %c, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
+   ret <16 x i32> %d;
+}
+
+;CHECK-LABEL: shift_8_i64
+;CHECK: vpsrlq
+;CHECK: vpsllq
+;CHECK: vpsraq
+;CHECK: ret
+define <8 x i64> @shift_8_i64(<8 x i64> %a) {
+   %b = lshr <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+   %c = shl <8 x i64> %b,  <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
+   %d = ashr <8 x i64> %c, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
+   ret <8 x i64> %d;
+}
+
+; CHECK-LABEL: variable_shl4
+; CHECK: vpsllvq %zmm
+; CHECK: ret
+define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) {
+  %k = shl <8 x i64> %x, %y
+  ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_shl5
+; CHECK: vpsllvd %zmm
+; CHECK: ret
+define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) {
+  %k = shl <16 x i32> %x, %y
+  ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_srl0
+; CHECK: vpsrlvd
+; CHECK: ret
+define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) {
+  %k = lshr <16 x i32> %x, %y
+  ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_srl2
+; CHECK: psrlvq
+; CHECK: ret
+define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) {
+  %k = lshr <8 x i64> %x, %y
+  ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_sra1
+; CHECK: vpsravd
+; CHECK: ret
+define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) {
+  %k = ashr <16 x i32> %x, %y
+  ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_sra2
+; CHECK: vpsravq %zmm
+; CHECK: ret
+define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) {
+  %k = ashr <8 x i64> %x, %y
+  ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_sra01_load
+; CHECK: vpsravd (%
+; CHECK: ret
+define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) {
+  %y1 = load <16 x i32>* %y
+  %k = ashr <16 x i32> %x, %y1
+  ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_shl1_load
+; CHECK: vpsllvd (%
+; CHECK: ret
+define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) {
+  %y1 = load <16 x i32>* %y
+  %k = shl <16 x i32> %x, %y1
+  ret <16 x i32> %k
+}
+; CHECK: variable_srl0_load
+; CHECK: vpsrlvd (%
+; CHECK: ret
+define <16 x i32> @variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) {
+  %y1 = load <16 x i32>* %y
+  %k = lshr <16 x i32> %x, %y1
+  ret <16 x i32> %k
+}
+
+; CHECK: variable_srl3_load
+; CHECK: vpsrlvq (%
+; CHECK: ret
+define <8 x i64> @variable_srl3_load(<8 x i64> %x, <8 x i64>* %y) {
+  %y1 = load <8 x i64>* %y
+  %k = lshr <8 x i64> %x, %y1
+  ret <8 x i64> %k
+}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
new file mode 100644
index 0000000..c9e0c2b
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -0,0 +1,226 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; CHECK: LCP
+; CHECK: .long 2
+; CHECK: .long 5
+; CHECK: .long 0
+; CHECK: .long 0
+; CHECK: .long 7
+; CHECK: .long 0
+; CHECK: .long 10
+; CHECK: .long 1
+; CHECK: .long 0
+; CHECK: .long 5
+; CHECK: .long 0
+; CHECK: .long 4
+; CHECK: .long 7
+; CHECK: .long 0
+; CHECK: .long 10
+; CHECK: .long 1
+; CHECK-LABEL: test1:
+; CHECK: vpermps
+; CHECK: ret
+define <16 x float> @test1(<16 x float> %a) nounwind {
+  %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
+  ret <16 x float> %c
+}
+
+; CHECK-LABEL: test2:
+; CHECK: vpermd
+; CHECK: ret
+define <16 x i32> @test2(<16 x i32> %a) nounwind {
+  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1,  i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
+  ret <16 x i32> %c
+}
+
+; CHECK-LABEL: test3:
+; CHECK: vpermq
+; CHECK: ret
+define <8 x i64> @test3(<8 x i64> %a) nounwind {
+  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1>
+  ret <8 x i64> %c
+}
+
+; CHECK-LABEL: test4:
+; CHECK: vpermpd
+; CHECK: ret
+define <8 x double> @test4(<8 x double> %a) nounwind {
+  %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x double> %c
+}
+
+; CHECK-LABEL: test5:
+; CHECK: vpermi2pd
+; CHECK: ret
+define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
+  ret <8 x double> %c
+}
+
+; CHECK-LABEL: test6:
+; CHECK: vpermq $30
+; CHECK: ret
+define <8 x i64> @test6(<8 x i64> %a) nounwind {
+  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+  ret <8 x i64> %c
+}
+
+; CHECK-LABEL: test7:
+; CHECK: vpermi2q
+; CHECK: ret
+define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
+  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
+  ret <8 x i64> %c
+}
+
+; CHECK-LABEL: test8:
+; CHECK: vpermi2d
+; CHECK: ret
+define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+  ret <16 x i32> %c
+}
+
+; CHECK-LABEL: test9:
+; CHECK: vpermi2ps
+; CHECK: ret
+define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
+  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+  ret <16 x float> %c
+}
+
+; CHECK-LABEL: test10:
+; CHECK: vpermi2ps (
+; CHECK: ret
+define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
+  %c = load <16 x float>* %b
+  %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+  ret <16 x float> %d
+}
+
+; CHECK-LABEL: test11:
+; CHECK: vpermi2d (
+; CHECK: ret
+define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
+  %c = load <16 x i32>* %b
+  %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+  ret <16 x i32> %d
+}
+
+; CHECK-LABEL: test12
+; CHECK: vmovlhpsz %xmm
+; CHECK: ret
+define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
+  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %c
+}
+
+; CHECK-LABEL: test13
+; CHECK: vpermilps $-79, %zmm
+; CHECK: ret
+define <16 x float> @test13(<16 x float> %a) {
+ %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: test14
+; CHECK: vpermilpd $-53, %zmm
+; CHECK: ret
+define <8 x double> @test14(<8 x double> %a) {
+ %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
+ ret <8 x double> %b
+}
+
+; CHECK-LABEL: test15
+; CHECK: vpshufd $-79, %zmm
+; CHECK: ret
+define <16 x i32> @test15(<16 x i32> %a) {
+ %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+ ret <16 x i32> %b
+}
+; CHECK-LABEL: test16
+; CHECK: valignq $2, %zmm0, %zmm1
+; CHECK: ret
+define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+  ret <8 x double> %c
+}
+
+; CHECK-LABEL: test17
+; CHECK: vshufpd $19, %zmm1, %zmm0
+; CHECK: ret
+define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
+  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
+  ret <8 x double> %c
+}
+
+; CHECK-LABEL: test18
+; CHECK: vpunpckhdq %zmm
+; CHECK: ret
+define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
+ %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15, i32 18, i32 26, i32 19, i32 27, i32 22, i32 30, i32 23, i32 31>
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: test19
+; CHECK: vpunpckldq %zmm
+; CHECK: ret
+define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
+ %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: test20
+; CHECK: vpunpckhqdq  %zmm
+; CHECK: ret
+define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
+ %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15>
+ ret <8 x i64> %b
+}
+
+; CHECK-LABEL: test21
+; CHECK: vunpcklps %zmm
+; CHECK: ret
+define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
+ %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: test22
+; CHECK: vmovhlpsz %xmm
+; CHECK: ret
+define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
+  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x i32> %c
+}
+
+; CHECK-LABEL: @test23
+; CHECK: vshufps $-112, %zmm
+; CHECK: ret
+define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
+ %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: @test24
+; CHECK: vpermi2d
+; CHECK: ret
+define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i32> %c
+}
+
+; CHECK-LABEL: @test25
+; CHECK: vshufps  $52
+; CHECK: ret
+define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
+  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 undef, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef>
+  ret <16 x i32> %c
+}
+
+; CHECK-LABEL: @test26
+; CHECK: vmovshdup
+; CHECK: ret
+define <16 x i32> @test26(<16 x i32> %a) nounwind {
+  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
+  ret <16 x i32> %c
+}
+\ No newline at end of file
diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-trunc-ext.ll
new file mode 100644
index 0000000..31db68c
--- /dev/null
+++ b/test/CodeGen/X86/avx512-trunc-ext.ll
@@ -0,0 +1,127 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: trunc_16x32_to_16x8
+; CHECK: vpmovdb
+; CHECK: ret
+define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) nounwind readnone {
+  %x = trunc <16 x i32> %i to <16 x i8>
+  ret <16 x i8> %x
+}
+
+; CHECK-LABEL: trunc_8x64_to_8x16
+; CHECK: vpmovqw
+; CHECK: ret
+define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) nounwind readnone {
+  %x = trunc <8 x i64> %i to <8 x i16>
+  ret <8 x i16> %x
+}
+
+
+; CHECK-LABEL: zext_16x8_to_16x32
+; CHECK; vpmovzxbd {{.*}}%zmm
+; CHECK: ret
+define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
+  %x = zext <16 x i8> %i to <16 x i32>
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: sext_16x8_to_16x32
+; CHECK; vpmovsxbd {{.*}}%zmm
+; CHECK: ret
+define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
+  %x = sext <16 x i8> %i to <16 x i32>
+  ret <16 x i32> %x
+}
+
+
+; CHECK-LABEL: zext_16x16_to_16x32
+; CHECK; vpmovzxwd {{.*}}%zmm
+; CHECK: ret
+define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %i) nounwind readnone {
+  %x = zext <16 x i16> %i to <16 x i32>
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: zext_8x16_to_8x64
+; CHECK; vpmovzxwq
+; CHECK: ret
+define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %i) nounwind readnone {
+  %x = zext <8 x i16> %i to <8 x i64>
+  ret <8 x i64> %x
+}
+
+;CHECK-LABEL: fptrunc_test
+;CHECK: vcvtpd2ps {{.*}}%zmm
+;CHECK: ret
+define <8 x float> @fptrunc_test(<8 x double> %a) nounwind readnone {
+  %b = fptrunc <8 x double> %a to <8 x float>
+  ret <8 x float> %b
+}
+
+;CHECK-LABEL: fpext_test
+;CHECK: vcvtps2pd {{.*}}%zmm
+;CHECK: ret
+define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone {
+  %b = fpext <8 x float> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+; CHECK-LABEL: zext_16i1_to_16xi32
+; CHECK: vpbroadcastd LCP{{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK: ret
+define   <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
+  %a = bitcast i16 %b to <16 x i1>
+  %c = zext <16 x i1> %a to <16 x i32>
+  ret <16 x i32> %c
+}
+
+; CHECK-LABEL: zext_8i1_to_8xi64
+; CHECK: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK: ret
+define   <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
+  %a = bitcast i8 %b to <8 x i1>
+  %c = zext <8 x i1> %a to <8 x i64>
+  ret <8 x i64> %c
+}
+
+; CHECK-LABEL: trunc_16i8_to_16i1
+; CHECK: vpmovsxbd
+; CHECK: vpandd
+; CHECK: vptestmd
+; CHECK: ret
+define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
+  %mask_b = trunc <16 x i8>%a to <16 x i1>
+  %mask = bitcast <16 x i1> %mask_b to i16
+  ret i16 %mask
+}
+
+; CHECK-LABEL: trunc_16i32_to_16i1
+; CHECK: vpandd
+; CHECK: vptestmd
+; CHECK: ret
+define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
+  %mask_b = trunc <16 x i32>%a to <16 x i1>
+  %mask = bitcast <16 x i1> %mask_b to i16
+  ret i16 %mask
+}
+
+; CHECK-LABEL: trunc_8i16_to_8i1
+; CHECK: vpmovsxwq
+; CHECK: vpandq LCP{{.*}}(%rip){1to8}
+; CHECK: vptestmq
+; CHECK: ret
+define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
+  %mask_b = trunc <8 x i16>%a to <8 x i1>
+  %mask = bitcast <8 x i1> %mask_b to i8
+  ret i8 %mask
+}
+
+; CHECK: sext_8i1_8i32
+; CHECK: vpbroadcastq  LCP{{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK: ret
+define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+  %x = icmp slt <8 x i32> %a1, %a2
+  %x1 = xor <8 x i1>%x, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+  %y = sext <8 x i1> %x1 to <8 x i32>
+  ret <8 x i32> %y
+}
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 4f07f94..6f89d6c 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-;CHECK: _inreg16xi32
+;CHECK-LABEL: _inreg16xi32:
 ;CHECK: vpbroadcastd {{.*}}, %zmm
 ;CHECK: ret
 define   <16 x i32> @_inreg16xi32(i32 %a) {
@@ -9,7 +9,7 @@ define   <16 x i32> @_inreg16xi32(i32 %a) {
   ret <16 x i32> %c
 }
 
-;CHECK: _inreg8xi64
+;CHECK-LABEL: _inreg8xi64:
 ;CHECK: vpbroadcastq {{.*}}, %zmm
 ;CHECK: ret
 define   <8 x i64> @_inreg8xi64(i64 %a) {
@@ -18,7 +18,7 @@ define   <8 x i64> @_inreg8xi64(i64 %a) {
   ret <8 x i64> %c
 }
 
-;CHECK: _inreg16xfloat
+;CHECK-LABEL: _inreg16xfloat:
 ;CHECK: vbroadcastssz {{.*}}, %zmm
 ;CHECK: ret
 define   <16 x float> @_inreg16xfloat(float %a) {
@@ -27,7 +27,7 @@ define   <16 x float> @_inreg16xfloat(float %a) {
   ret <16 x float> %c
 }
 
-;CHECK: _inreg8xdouble
+;CHECK-LABEL: _inreg8xdouble:
 ;CHECK: vbroadcastsdz {{.*}}, %zmm
 ;CHECK: ret
 define   <8 x double> @_inreg8xdouble(double %a) {
@@ -35,3 +35,19 @@ define   <8 x double> @_inreg8xdouble(double %a) {
   %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
   ret <8 x double> %c
 }
+
+;CHECK-LABEL: _xmm16xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define   <16 x i32> @_xmm16xi32(<16 x i32> %a) {
+  %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
+  ret <16 x i32> %b
+}
+
+;CHECK-LABEL: _xmm16xfloat
+;CHECK: vbroadcastssz
+;CHECK: ret
+define   <16 x float> @_xmm16xfloat(<16 x float> %a) {
+  %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %b
+}
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
new file mode 100644
index 0000000..6ca5bcc
--- /dev/null
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -0,0 +1,113 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vcmpleps
+; CHECK: vmovups
+; CHECK: ret
+define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
+	%mask = fcmp ole <16 x float> %x, %y
+	%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
+	ret <16 x float> %max
+}
+
+; CHECK-LABEL: test2
+; CHECK: vcmplepd
+; CHECK: vmovupd
+; CHECK: ret
+define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
+	%mask = fcmp ole <8 x double> %x, %y
+	%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
+	ret <8 x double> %max
+}
+
+; CHECK-LABEL: test3
+; CHECK: vpcmpeqd  (%rdi)
+; CHECK: vmovdqu32
+; CHECK: ret
+define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
+  %y = load <16 x i32>* %yp, align 4
+	%mask = icmp eq <16 x i32> %x, %y
+	%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+	ret <16 x i32> %max
+}
+
+; CHECK-LABEL: @test4_unsigned
+; CHECK: vpcmpnltud
+; CHECK: vmovdqu32
+; CHECK: ret
+define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
+	%mask = icmp uge <16 x i32> %x, %y
+	%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+	ret <16 x i32> %max
+}
+
+; CHECK-LABEL: test5
+; CHECK: vpcmpeqq {{.*}}%k1
+; CHECK: vmovdqu64 {{.*}}%k1
+; CHECK: ret
+define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
+	%mask = icmp eq <8 x i64> %x, %y
+	%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+	ret <8 x i64> %max
+}
+
+; CHECK-LABEL: test6_unsigned
+; CHECK: vpcmpnleuq {{.*}}%k1
+; CHECK: vmovdqu64 {{.*}}%k1
+; CHECK: ret
+define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind {
+	%mask = icmp ugt <8 x i64> %x, %y
+	%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+	ret <8 x i64> %max
+}
+
+; CHECK-LABEL: test7
+; CHECK: xor
+; CHECK: vcmpltps
+; CHECK: vblendvps
+; CHECK: ret
+define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
+  %mask = fcmp olt <4 x float> %a, zeroinitializer
+  %c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b
+  ret <4 x float>%c
+}
+
+; CHECK-LABEL: test8
+; CHECK: xor
+; CHECK: vcmpltpd
+; CHECK: vblendvpd
+; CHECK: ret
+define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
+  %mask = fcmp olt <2 x double> %a, zeroinitializer
+  %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
+  ret <2 x double>%c
+}
+
+; CHECK-LABEL: test9
+; CHECK: vpcmpeqd
+; CHECK: vpblendmd
+; CHECK: ret
+define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
+  %mask = icmp eq <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test10
+; CHECK: vcmpeqps
+; CHECK: vblendmps
+; CHECK: ret
+define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
+  %mask = fcmp oeq <8 x float> %x, %y
+  %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+  ret <8 x float> %max
+}
+
+; CHECK-LABEL: test11_unsigned
+; CHECK: vpmaxud
+; CHECK: ret
+define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
+  %mask = icmp ugt <8 x i32> %x, %y
+  %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %max
+}
diff --git a/test/CodeGen/X86/bc-extract.ll b/test/CodeGen/X86/bc-extract.ll
index ceabcb7..a1c0f5a 100644
--- a/test/CodeGen/X86/bc-extract.ll
+++ b/test/CodeGen/X86/bc-extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42 |  FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.2 |  FileCheck %s
 
 
 define float @extractFloat1() nounwind {
diff --git a/test/CodeGen/X86/bitcast2.ll b/test/CodeGen/X86/bitcast2.ll
index 48922b5..12aa863 100644
--- a/test/CodeGen/X86/bitcast2.ll
+++ b/test/CodeGen/X86/bitcast2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 | grep movd | count 2
-; RUN: llc < %s -march=x86-64 | not grep rsp
+; RUN: llc < %s -march=x86-64 -mattr=-avx | grep movd | count 2
+; RUN: llc < %s -march=x86-64 -mattr=-avx | not grep rsp
 
 define i64 @test1(double %A) {
    %B = bitcast double %A to i64
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index fa775bd..4f2060f 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
 
 
 ; In this test we check that sign-extend of the mask bit is performed by
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index 4eda888..242075a 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -111,6 +111,23 @@ define i32 @bextr32_load(i32* %x, i32 %y) nounwind readnone {
 
 declare i32 @llvm.x86.bmi.bextr.32(i32, i32) nounwind readnone
 
+define i32 @bextr32b(i32 %x) nounwind uwtable readnone ssp {
+  %1 = lshr i32 %x, 4
+  %2 = and i32 %1, 4095
+  ret i32 %2
+; CHECK-LABEL: bextr32b:
+; CHECK: bextrl
+}
+
+define i32 @bextr32b_load(i32* %x) nounwind uwtable readnone ssp {
+  %1 = load i32* %x
+  %2 = lshr i32 %1, 4
+  %3 = and i32 %2, 4095
+  ret i32 %3
+; CHECK-LABEL: bextr32b_load:
+; CHECK: bextrl {{.*}}, ({{.*}}), {{.*}}
+}
+
 define i64 @bextr64(i64 %x, i64 %y) nounwind readnone {
   %tmp = tail call i64 @llvm.x86.bmi.bextr.64(i64 %x, i64 %y)
   ret i64 %tmp
@@ -120,6 +137,14 @@ define i64 @bextr64(i64 %x, i64 %y) nounwind readnone {
 
 declare i64 @llvm.x86.bmi.bextr.64(i64, i64) nounwind readnone
 
+define i64 @bextr64b(i64 %x) nounwind uwtable readnone ssp {
+  %1 = lshr i64 %x, 4
+  %2 = and i64 %1, 4095
+  ret i64 %2
+; CHECK-LABEL: bextr64b:
+; CHECK: bextrq
+}
+
 define i32 @bzhi32(i32 %x, i32 %y) nounwind readnone {
   %tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x, i32 %y)
   ret i32 %tmp
@@ -146,6 +171,51 @@ define i64 @bzhi64(i64 %x, i64 %y) nounwind readnone {
 
 declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) nounwind readnone
 
+define i32 @bzhi32b(i32 %x, i8 zeroext %index) #0 {
+entry:
+  %conv = zext i8 %index to i32
+  %shl = shl i32 1, %conv
+  %sub = add nsw i32 %shl, -1
+  %and = and i32 %sub, %x
+  ret i32 %and
+; CHECK-LABEL: bzhi32b:
+; CHECK: bzhil
+}
+
+define i32 @bzhi32b_load(i32* %w, i8 zeroext %index) #0 {
+entry:
+  %x = load i32* %w
+  %conv = zext i8 %index to i32
+  %shl = shl i32 1, %conv
+  %sub = add nsw i32 %shl, -1
+  %and = and i32 %sub, %x
+  ret i32 %and
+; CHECK-LABEL: bzhi32b_load:
+; CHECK: bzhil {{.*}}, ({{.*}}), {{.*}}
+}
+
+define i32 @bzhi32c(i32 %x, i8 zeroext %index) #0 {
+entry:
+  %conv = zext i8 %index to i32
+  %shl = shl i32 1, %conv
+  %sub = add nsw i32 %shl, -1
+  %and = and i32 %x, %sub
+  ret i32 %and
+; CHECK-LABEL: bzhi32c:
+; CHECK: bzhil
+}
+
+define i64 @bzhi64b(i64 %x, i8 zeroext %index) #0 {
+entry:
+  %conv = zext i8 %index to i64
+  %shl = shl i64 1, %conv
+  %sub = add nsw i64 %shl, -1
+  %and = and i64 %x, %sub
+  ret i64 %and
+; CHECK-LABEL: bzhi64b:
+; CHECK: bzhiq
+}
+
 define i32 @blsi32(i32 %x) nounwind readnone {
   %tmp = sub i32 0, %x
   %tmp2 = and i32 %x, %tmp
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index fa6f6e8..a0a1c36 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx,+rdrand,+rdseed | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx,+rdrnd,+rdseed | FileCheck %s
 
 define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
   %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
diff --git a/test/CodeGen/X86/break-anti-dependencies.ll b/test/CodeGen/X86/break-anti-dependencies.ll
index c942614..614d0ad 100644
--- a/test/CodeGen/X86/break-anti-dependencies.ll
+++ b/test/CodeGen/X86/break-anti-dependencies.ll
@@ -1,7 +1,7 @@
 ; Without list-burr scheduling we may not see the difference in codegen here.
 ; Use a subtarget that has post-RA scheduling enabled because the anti-dependency
 ; breaker requires liveness information to be kept.
-; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
+; RUN: llc < %s -march=x86-64 -mcpu=atom -enable-misched=false -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
 ; RUN:   grep "%xmm0" %t | count 14
 ; RUN:   not grep "%xmm1" %t
 ; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -break-anti-dependencies=critical > %t
diff --git a/test/CodeGen/X86/break-avx-dep.ll b/test/CodeGen/X86/break-avx-dep.ll
new file mode 100644
index 0000000..210bda1
--- /dev/null
+++ b/test/CodeGen/X86/break-avx-dep.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
+;
+; rdar:15221834 False AVX register dependencies cause 5x slowdown on
+; flops-6. Make sure the unused register read by vcvtsi2sdq is zeroed
+; to avoid cyclic dependence on a write to the same register in a
+; previous iteration.
+
+; CHECK-LABEL: t1:
+; CHECK-LABEL: %loop
+; CHECK: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}}
+; CHECK: vcvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}}
+define i64 @t1(i64* nocapture %x, double* nocapture %y) nounwind {
+entry:
+  %vx = load i64* %x
+  br label %loop
+loop:
+  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
+  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
+  %fi = sitofp i64 %i to double
+  %vy = load double* %y
+  %fipy = fadd double %fi, %vy
+  %iipy = fptosi double %fipy to i64
+  %s2 = add i64 %s1, %iipy
+  %inc = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %inc, 156250000
+  br i1 %exitcond, label %ret, label %loop
+ret:
+  ret i64 %s2
+}
diff --git a/test/CodeGen/X86/bswap.ll b/test/CodeGen/X86/bswap.ll
index 9e46592..e6a456c 100644
--- a/test/CodeGen/X86/bswap.ll
+++ b/test/CodeGen/X86/bswap.ll
@@ -1,6 +1,7 @@
 ; bswap should be constant folded when it is passed a constant argument
 
 ; RUN: llc < %s -march=x86 -mcpu=i686 | FileCheck %s
+; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=CHECK64
 
 declare i16 @llvm.bswap.i16(i16)
 
@@ -11,6 +12,9 @@ declare i64 @llvm.bswap.i64(i64)
 define i16 @W(i16 %A) {
 ; CHECK-LABEL: W:
 ; CHECK: rolw $8, %ax
+
+; CHECK64-LABEL: W:
+; CHECK64: rolw $8, %
         %Z = call i16 @llvm.bswap.i16( i16 %A )         ; <i16> [#uses=1]
         ret i16 %Z
 }
@@ -18,6 +22,9 @@ define i16 @W(i16 %A) {
 define i32 @X(i32 %A) {
 ; CHECK-LABEL: X:
 ; CHECK: bswapl %eax
+
+; CHECK64-LABEL: X:
+; CHECK64: bswapl %
         %Z = call i32 @llvm.bswap.i32( i32 %A )         ; <i32> [#uses=1]
         ret i32 %Z
 }
@@ -26,6 +33,9 @@ define i64 @Y(i64 %A) {
 ; CHECK-LABEL: Y:
 ; CHECK: bswapl %eax
 ; CHECK: bswapl %edx
+
+; CHECK64-LABEL: Y:
+; CHECK64: bswapq %
         %Z = call i64 @llvm.bswap.i64( i64 %A )         ; <i64> [#uses=1]
         ret i64 %Z
 }
@@ -33,9 +43,13 @@ define i64 @Y(i64 %A) {
 ; rdar://9164521
 define i32 @test1(i32 %a) nounwind readnone {
 entry:
-; CHECK: test1
-; CHECK: bswapl %eax
-; CHECK: shrl $16, %eax
+; CHECK-LABEL: test1:
+; CHECK: bswapl [[REG:%.*]]
+; CHECK: shrl $16, [[REG]]
+
+; CHECK64-LABEL: test1:
+; CHECK64: bswapl [[REG:%.*]]
+; CHECK64: shrl $16, [[REG]]
   %and = lshr i32 %a, 8
   %shr3 = and i32 %and, 255
   %and2 = shl i32 %a, 8
@@ -46,9 +60,13 @@ entry:
 
 define i32 @test2(i32 %a) nounwind readnone {
 entry:
-; CHECK: test2
-; CHECK: bswapl %eax
-; CHECK: sarl $16, %eax
+; CHECK-LABEL: test2:
+; CHECK: bswapl [[REG:%.*]]
+; CHECK: sarl $16, [[REG]]
+
+; CHECK64-LABEL: test2:
+; CHECK64: bswapl [[REG:%.*]]
+; CHECK64: sarl $16, [[REG]]
   %and = lshr i32 %a, 8
   %shr4 = and i32 %and, 255
   %and2 = shl i32 %a, 8
@@ -57,3 +75,80 @@ entry:
   %conv3 = ashr exact i32 %sext, 16
   ret i32 %conv3
 }
+
+@var8 = global i8 0
+@var16 = global i16 0
+
+; The "shl" below can move bits into the high parts of the value, so the
+; operation is not a "bswap, shr" pair.
+
+; rdar://problem/14814049
+define i64 @not_bswap() {
+; CHECK-LABEL: not_bswap:
+; CHECK-NOT: bswapl
+; CHECK: ret
+
+; CHECK64-LABEL: not_bswap:
+; CHECK64-NOT: bswapq
+; CHECK64: ret
+  %init = load i16* @var16
+  %big = zext i16 %init to i64
+
+  %hishifted = lshr i64 %big, 8
+  %loshifted = shl i64 %big, 8
+
+  %notswapped = or i64 %hishifted, %loshifted
+
+  ret i64 %notswapped
+}
+
+; This time, the lshr (and subsequent or) is completely useless. While it's
+; technically correct to convert this into a "bswap, shr", it's suboptimal. A
+; simple shl works better.
+
+define i64 @not_useful_bswap() {
+; CHECK-LABEL: not_useful_bswap:
+; CHECK-NOT: bswapl
+; CHECK: ret
+
+; CHECK64-LABEL: not_useful_bswap:
+; CHECK64-NOT: bswapq
+; CHECK64: ret
+
+  %init = load i8* @var8
+  %big = zext i8 %init to i64
+
+  %hishifted = lshr i64 %big, 8
+  %loshifted = shl i64 %big, 8
+
+  %notswapped = or i64 %hishifted, %loshifted
+
+  ret i64 %notswapped
+}
+
+; Finally, it *is* OK to just mask off the shl if we know that the value is zero
+; beyond 16 bits anyway. This is a legitimate bswap.
+
+define i64 @finally_useful_bswap() {
+; CHECK-LABEL: finally_useful_bswap:
+; CHECK: bswapl [[REG:%.*]]
+; CHECK: shrl $16, [[REG]]
+; CHECK: ret
+
+; CHECK64-LABEL: finally_useful_bswap:
+; CHECK64: bswapq [[REG:%.*]]
+; CHECK64: shrq $48, [[REG]]
+; CHECK64: ret
+
+  %init = load i16* @var16
+  %big = zext i16 %init to i64
+
+  %hishifted = lshr i64 %big, 8
+  %lomasked = and i64 %big, 255
+  %loshifted = shl i64 %lomasked, 8
+
+  %swapped = or i64 %hishifted, %loshifted
+
+  ret i64 %swapped
+}
+
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index e28923b..f12a354 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -38,7 +38,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @test2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: test2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@@ -56,7 +56,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @atest2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: atest2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@@ -74,7 +74,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @atest2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: atest2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
 	%tmp4 = icmp eq i32 %tmp3, 0		; <i1> [#uses=1]
@@ -91,7 +91,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @test3(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: test3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@@ -109,7 +109,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @test3b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: test3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
@@ -127,7 +127,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @testne2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: testne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@@ -145,7 +145,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @testne2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: testne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@@ -163,7 +163,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @atestne2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: atestne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@@ -181,7 +181,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @atestne2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: atestne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@@ -199,7 +199,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @testne3(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: testne3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@@ -217,7 +217,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @testne3b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: testne3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
@@ -235,7 +235,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@@ -253,7 +253,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@@ -271,7 +271,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @aquery2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: aquery2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@@ -289,7 +289,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @aquery2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: aquery2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@@ -307,7 +307,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query3(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@@ -325,7 +325,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query3b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
@@ -343,7 +343,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query3x(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query3x
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@@ -361,7 +361,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @query3bx(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: query3bx
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jae
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
@@ -379,7 +379,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@@ -397,7 +397,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = lshr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@@ -415,7 +415,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @aqueryne2(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: aqueryne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, 1		; <i32> [#uses=1]
@@ -433,7 +433,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @aqueryne2b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: aqueryne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = ashr i32 %x, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 1, %tmp29
@@ -451,7 +451,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne3(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@@ -469,7 +469,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne3b(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
@@ -487,7 +487,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne3x(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne3x
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %tmp29, %x		; <i32> [#uses=1]
@@ -505,7 +505,7 @@ UnifiedReturnBlock:		; preds = %entry
 define void @queryne3bx(i32 %x, i32 %n) nounwind {
 entry:
 ; CHECK: queryne3bx
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
 ; CHECK: jb
 	%tmp29 = shl i32 1, %n		; <i32> [#uses=1]
 	%tmp3 = and i32 %x, %tmp29
diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll
index 8a96e41..42751d7 100644
--- a/test/CodeGen/X86/byval7.ll
+++ b/test/CodeGen/X86/byval7.ll
@@ -7,14 +7,14 @@
 define i32 @main() nounwind  {
 entry:
 ; CHECK-LABEL: main:
-; CHECK: movl $1, (%esp)
 ; CHECK: leal 16(%esp), %edi
 ; CHECK: leal 160(%esp), %esi
 ; CHECK: rep;movsl
+; CHECK: movl $1, (%esp)
 	%s = alloca %struct.S		; <%struct.S*> [#uses=2]
 	%tmp15 = getelementptr %struct.S* %s, i32 0, i32 0		; <<2 x i64>*> [#uses=1]
 	store <2 x i64> < i64 8589934595, i64 1 >, <2 x i64>* %tmp15, align 16
-	call void @t( i32 1, %struct.S* byval  %s ) nounwind 
+	call void @t( i32 1, %struct.S* byval  %s ) nounwind
 	ret i32 0
 }
 
diff --git a/test/CodeGen/X86/chain_order.ll b/test/CodeGen/X86/chain_order.ll
index 8c1c864..c88726e 100644
--- a/test/CodeGen/X86/chain_order.ll
+++ b/test/CodeGen/X86/chain_order.ll
@@ -3,8 +3,8 @@
 ;CHECK-LABEL: cftx020:
 ;CHECK: vmovsd  (%rdi), %xmm{{.*}}
 ;CHECK: vmovsd  16(%rdi), %xmm{{.*}}
-;CHECK: vmovhpd  8(%rdi), %xmm{{.*}}
 ;CHECK: vmovsd  24(%rdi), %xmm{{.*}}
+;CHECK: vmovhpd  8(%rdi), %xmm{{.*}}
 ;CHECK: vmovupd %xmm{{.*}}, (%rdi)
 ;CHECK: vmovupd %xmm{{.*}}, 16(%rdi)
 ;CHECK: ret
@@ -35,4 +35,3 @@ entry:
   store <2 x double> %14, <2 x double>* %15, align 8
   ret void
 }
-
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index 92c0445..215b862 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -4,8 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
 entry:
 ; CHECK-LABEL: test1:
-; CHECK: movl	$12, %eax
-; CHECK-NEXT: btl
+; CHECK: btl
+; CHECK-NEXT: movl	$12, %eax
 ; CHECK-NEXT: cmovael	(%rcx), %eax
 ; CHECK-NEXT: ret
 
@@ -19,8 +19,8 @@ entry:
 define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
 entry:
 ; CHECK-LABEL: test2:
-; CHECK: movl	$12, %eax
-; CHECK-NEXT: btl
+; CHECK: btl
+; CHECK-NEXT: movl	$12, %eax
 ; CHECK-NEXT: cmovbl	(%rcx), %eax
 ; CHECK-NEXT: ret
 
@@ -92,7 +92,7 @@ bb.i.i.i:                                         ; preds = %entry
 ; CHECK: testb
 ; CHECK-NOT: xor
 ; CHECK: setne
-; CHECK-NEXT: testb
+; CHECK: testb
 
 func_4.exit.i:                                    ; preds = %bb.i.i.i, %entry
   %.not.i = xor i1 %2, true                       ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/coalesce-implicitdef.ll b/test/CodeGen/X86/coalesce-implicitdef.ll
index 19cd08c..9be0452 100644
--- a/test/CodeGen/X86/coalesce-implicitdef.ll
+++ b/test/CodeGen/X86/coalesce-implicitdef.ll
@@ -26,7 +26,7 @@ for.cond:                                         ; preds = %for.inc34, %entry
   br i1 %tobool, label %for.end36, label %for.body
 
 for.body:                                         ; preds = %for.cond
-  store i32 0, i32* @c, align 4, !tbaa !0
+  store i32 0, i32* @c, align 4
   br label %for.body2
 
 for.body2:                                        ; preds = %for.body, %for.inc
@@ -35,7 +35,7 @@ for.body2:                                        ; preds = %for.body, %for.inc
   br i1 %tobool3, label %if.then10, label %if.then
 
 if.then:                                          ; preds = %for.body2
-  store i32 0, i32* %i, align 4, !tbaa !0
+  store i32 0, i32* %i, align 4
   br label %for.body6
 
 for.body6:                                        ; preds = %if.then, %for.body6
@@ -43,7 +43,7 @@ for.body6:                                        ; preds = %if.then, %for.body6
   br i1 true, label %for.body6, label %for.inc
 
 if.then10:                                        ; preds = %for.body2
-  store i32 1, i32* @b, align 4, !tbaa !0
+  store i32 1, i32* @b, align 4
   ret void
 
 for.inc:                                          ; preds = %for.body6
@@ -66,30 +66,30 @@ while.end:                                        ; preds = %while.cond
 
 for.inc27.backedge:                               ; preds = %while.end, %if.then22
   %inc28 = add nsw i32 %0, 1
-  store i32 %inc28, i32* @b, align 4, !tbaa !0
+  store i32 %inc28, i32* @b, align 4
   %tobool17 = icmp eq i32 %inc28, 0
   br i1 %tobool17, label %for.inc27.if.end30.loopexit56_crit_edge, label %while.condthread-pre-split
 
 if.then22:                                        ; preds = %while.end
-  %1 = load i16* %p2.1, align 2, !tbaa !3
+  %1 = load i16* %p2.1, align 2
   %tobool23 = icmp eq i16 %1, 0
   br i1 %tobool23, label %for.inc27.backedge, label %label.loopexit
 
 label.loopexit:                                   ; preds = %if.then22
-  store i32 %inc20, i32* @a, align 4, !tbaa !0
+  store i32 %inc20, i32* @a, align 4
   %inc2858 = add nsw i32 %0, 1
-  store i32 %inc2858, i32* @b, align 4, !tbaa !0
+  store i32 %inc2858, i32* @b, align 4
   %tobool1759 = icmp eq i32 %inc2858, 0
   br i1 %tobool1759, label %if.end30, label %while.condthread-pre-split
 
 for.inc27.if.end30.loopexit56_crit_edge:          ; preds = %for.inc27.backedge
-  store i32 %inc20, i32* @a, align 4, !tbaa !0
+  store i32 %inc20, i32* @a, align 4
   br label %if.end30
 
 if.end30:                                         ; preds = %for.inc27.if.end30.loopexit56_crit_edge, %label.loopexit, %label.preheader, %for.inc
   %i.0.load46 = phi i32 [ 0, %for.inc ], [ %i.0.load4669, %label.preheader ], [ %i.0.load4669, %label.loopexit ], [ %i.0.load4669, %for.inc27.if.end30.loopexit56_crit_edge ]
   %pi.4 = phi i32* [ %i, %for.inc ], [ %pi.3.ph, %label.preheader ], [ %pi.3.ph, %label.loopexit ], [ %pi.3.ph, %for.inc27.if.end30.loopexit56_crit_edge ]
-  %2 = load i32* %pi.4, align 4, !tbaa !0
+  %2 = load i32* %pi.4, align 4
   %tobool31 = icmp eq i32 %2, 0
   br i1 %tobool31, label %for.inc34, label %label.preheader
 
@@ -100,31 +100,26 @@ for.inc34:                                        ; preds = %if.end30
 
 for.end36:                                        ; preds = %for.cond
   store i32 1, i32* %i, align 4
-  %3 = load i32* @c, align 4, !tbaa !0
+  %3 = load i32* @c, align 4
   %tobool37 = icmp eq i32 %3, 0
   br i1 %tobool37, label %label.preheader, label %land.rhs
 
 land.rhs:                                         ; preds = %for.end36
-  store i32 0, i32* @a, align 4, !tbaa !0
+  store i32 0, i32* @a, align 4
   br label %label.preheader
 
 label.preheader:                                  ; preds = %for.end36, %if.end30, %land.rhs
   %i.0.load4669 = phi i32 [ 1, %land.rhs ], [ %i.0.load46, %if.end30 ], [ 1, %for.end36 ]
   %pi.3.ph = phi i32* [ %pi.0, %land.rhs ], [ %pi.4, %if.end30 ], [ %pi.0, %for.end36 ]
-  %4 = load i32* @b, align 4, !tbaa !0
+  %4 = load i32* @b, align 4
   %inc285863 = add nsw i32 %4, 1
-  store i32 %inc285863, i32* @b, align 4, !tbaa !0
+  store i32 %inc285863, i32* @b, align 4
   %tobool175964 = icmp eq i32 %inc285863, 0
   br i1 %tobool175964, label %if.end30, label %while.condthread-pre-split.lr.ph.lr.ph
 
 while.condthread-pre-split.lr.ph.lr.ph:           ; preds = %label.preheader
-  %.pr50 = load i32* @d, align 4, !tbaa !0
+  %.pr50 = load i32* @d, align 4
   %tobool19 = icmp eq i32 %.pr50, 0
-  %a.promoted.pre = load i32* @a, align 4, !tbaa !0
+  %a.promoted.pre = load i32* @a, align 4
   br label %while.condthread-pre-split
 }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/X86/coff-feat00.ll b/test/CodeGen/X86/coff-feat00.ll
new file mode 100644
index 0000000..1dcd427
--- /dev/null
+++ b/test/CodeGen/X86/coff-feat00.ll
@@ -0,0 +1,7 @@
+; RUN: llc -O0 -mtriple=i386-pc-win32 -filetype=asm -o - %s | FileCheck %s
+
+define i32 @foo() {
+  ret i32 0
+}
+
+; CHECK: @feat.00 = 1
diff --git a/test/CodeGen/X86/commute-two-addr.ll b/test/CodeGen/X86/commute-two-addr.ll
index eb44e08..656c385 100644
--- a/test/CodeGen/X86/commute-two-addr.ll
+++ b/test/CodeGen/X86/commute-two-addr.ll
@@ -38,10 +38,10 @@ define i32 @t2(i32 %X, i32 %Y) nounwind {
 define %0 @t3(i32 %lb, i8 zeroext %has_lb, i8 zeroext %lb_inclusive, i32 %ub, i8 zeroext %has_ub, i8 zeroext %ub_inclusive) nounwind {
 entry:
 ; DARWIN-LABEL: t3:
-; DARWIN: shll $16
 ; DARWIN: shlq $32, %rcx
+; DARWIN-NEXT: orq %rcx, %rax
+; DARWIN-NEXT: shll $8
 ; DARWIN-NOT: leaq
-; DARWIN: orq %rcx, %rax
   %tmp21 = zext i32 %lb to i64
   %tmp23 = zext i32 %ub to i64
   %tmp24 = shl i64 %tmp23, 32
diff --git a/test/CodeGen/X86/compact-unwind.ll b/test/CodeGen/X86/compact-unwind.ll
index 8c4fa27..9d3a125 100644
--- a/test/CodeGen/X86/compact-unwind.ll
+++ b/test/CodeGen/X86/compact-unwind.ll
@@ -1,18 +1,29 @@
-; RUN: llc < %s -disable-cfi -disable-fp-elim -mtriple x86_64-apple-darwin11 | FileCheck %s
+; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 | FileCheck -check-prefix=ASM %s
+; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 -filetype=obj -o - \
+; RUN:  | llvm-objdump -triple x86_64-apple-darwin11 -s - \
+; RUN:  | FileCheck -check-prefix=CU %s
+; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 \
+; RUN:  | llvm-mc -triple x86_64-apple-darwin11 -filetype=obj -o - \
+; RUN:  | llvm-objdump -triple x86_64-apple-darwin11 -s - \
+; RUN:  | FileCheck -check-prefix=FROM-ASM %s
 
 %ty = type { i8* }
 
 @gv = external global i32
 
 ; This is aligning the stack with a push of a random register.
-; CHECK: pushq %rax
+; ASM: pushq %rax
 
 ; Even though we can't encode %rax into the compact unwind, We still want to be
 ; able to generate a compact unwind encoding in this particular case.
-;
-; CHECK: __LD,__compact_unwind
-; CHECK: _foo ## Range Start
-; CHECK: 16842753 ## Compact Unwind Encoding: 0x1010001
+
+; CU:      Contents of section __compact_unwind:
+; CU-NEXT: 0020 00000000 00000000 1e000000 01000101
+; CU-NEXT: 0030 00000000 00000000 00000000 00000000
+
+; FROM-ASM:      Contents of section __compact_unwind:
+; FROM-ASM-NEXT: 0020 00000000 00000000 1e000000 01000101
+; FROM-ASM-NEXT: 0030 00000000 00000000 00000000 00000000
 
 define i8* @foo(i64 %size) {
   %addr = alloca i64, align 8
diff --git a/test/CodeGen/X86/crash-nosse.ll b/test/CodeGen/X86/crash-nosse.ll
index 7a15a47..b1e01f9 100644
--- a/test/CodeGen/X86/crash-nosse.ll
+++ b/test/CodeGen/X86/crash-nosse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=corei7 -mattr=-sse2,-sse41 -verify-machineinstrs
+; RUN: llc < %s -mcpu=corei7 -mattr=-sse2,-sse4.1 -verify-machineinstrs
 target triple = "x86_64-unknown-linux-gnu"
 
 ; PR10503
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index b0a0e24..051150e 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -204,7 +204,7 @@ entry:
 ; <rdar://problem/9187792>
 define fastcc void @func_61() nounwind sspreq {
 entry:
-  %t1 = tail call i64 @llvm.objectsize.i64(i8* undef, i1 false)
+  %t1 = tail call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 false)
   %t2 = icmp eq i64 %t1, -1
   br i1 %t2, label %bb2, label %bb1
 
@@ -215,7 +215,7 @@ bb2:
   ret void
 }
 
-declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) nounwind readnone
 
 ; PR10277
 ; This test has dead code elimination caused by remat during spilling.
diff --git a/test/CodeGen/X86/dagcombine-shifts.ll b/test/CodeGen/X86/dagcombine-shifts.ll
new file mode 100644
index 0000000..905cf05
--- /dev/null
+++ b/test/CodeGen/X86/dagcombine-shifts.ll
@@ -0,0 +1,209 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
+
+; fold (shl (zext (lshr (A, X))), X) -> (zext (shl (lshr (A, X)), X))
+
+; Canolicalize the sequence shl/zext/lshr performing the zeroextend
+; as the last instruction of the sequence.
+; This will help DAGCombiner to identify and then fold the sequence
+; of shifts into a single AND.
+; This transformation is profitable if the shift amounts are the same
+; and if there is only one use of the zext.
+
+define i16 @fun1(i8 zeroext %v) {
+entry:
+  %shr = lshr i8 %v, 4
+  %ext = zext i8 %shr to i16
+  %shl = shl i16 %ext, 4
+  ret i16 %shl
+}
+
+; CHECK-LABEL: @fun1
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+define i32 @fun2(i8 zeroext %v) {
+entry:
+  %shr = lshr i8 %v, 4
+  %ext = zext i8 %shr to i32
+  %shl = shl i32 %ext, 4
+  ret i32 %shl
+}
+
+; CHECK-LABEL: @fun2
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+define i32 @fun3(i16 zeroext %v) {
+entry:
+  %shr = lshr i16 %v, 4
+  %ext = zext i16 %shr to i32
+  %shl = shl i32 %ext, 4
+  ret i32 %shl
+}
+
+; CHECK-LABEL: @fun3
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+define i64 @fun4(i8 zeroext %v) {
+entry:
+  %shr = lshr i8 %v, 4
+  %ext = zext i8 %shr to i64
+  %shl = shl i64 %ext, 4
+  ret i64 %shl
+}
+
+; CHECK-LABEL: @fun4
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+define i64 @fun5(i16 zeroext %v) {
+entry:
+  %shr = lshr i16 %v, 4
+  %ext = zext i16 %shr to i64
+  %shl = shl i64 %ext, 4
+  ret i64 %shl
+}
+
+; CHECK-LABEL: @fun5
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+define i64 @fun6(i32 zeroext %v) {
+entry:
+  %shr = lshr i32 %v, 4
+  %ext = zext i32 %shr to i64
+  %shl = shl i64 %ext, 4
+  ret i64 %shl
+}
+
+; CHECK-LABEL: @fun6
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+; Don't fold the pattern if we use arithmetic shifts.
+
+define i64 @fun7(i8 zeroext %v) {
+entry:
+  %shr = ashr i8 %v, 4
+  %ext = zext i8 %shr to i64
+  %shl = shl i64 %ext, 4
+  ret i64 %shl
+}
+
+; CHECK-LABEL: @fun7
+; CHECK: sar
+; CHECK: shl
+; CHECK: ret
+
+define i64 @fun8(i16 zeroext %v) {
+entry:
+  %shr = ashr i16 %v, 4
+  %ext = zext i16 %shr to i64
+  %shl = shl i64 %ext, 4
+  ret i64 %shl
+}
+
+; CHECK-LABEL: @fun8
+; CHECK: sar
+; CHECK: shl
+; CHECK: ret
+
+define i64 @fun9(i32 zeroext %v) {
+entry:
+  %shr = ashr i32 %v, 4
+  %ext = zext i32 %shr to i64
+  %shl = shl i64 %ext, 4
+  ret i64 %shl
+}
+
+; CHECK-LABEL: @fun9
+; CHECK: sar
+; CHECK: shl
+; CHECK: ret
+
+; Don't fold the pattern if there is more than one use of the
+; operand in input to the shift left.
+
+define i64 @fun10(i8 zeroext %v) {
+entry:
+  %shr = lshr i8 %v, 4
+  %ext = zext i8 %shr to i64
+  %shl = shl i64 %ext, 4
+  %add = add i64 %shl, %ext
+  ret i64 %add
+}
+
+; CHECK-LABEL: @fun10
+; CHECK: shr
+; CHECK: shl
+; CHECK: ret
+
+define i64 @fun11(i16 zeroext %v) {
+entry:
+  %shr = lshr i16 %v, 4
+  %ext = zext i16 %shr to i64
+  %shl = shl i64 %ext, 4
+  %add = add i64 %shl, %ext
+  ret i64 %add
+}
+
+; CHECK-LABEL: @fun11
+; CHECK: shr
+; CHECK: shl
+; CHECK: ret
+
+define i64 @fun12(i32 zeroext %v) {
+entry:
+  %shr = lshr i32 %v, 4
+  %ext = zext i32 %shr to i64
+  %shl = shl i64 %ext, 4
+  %add = add i64 %shl, %ext
+  ret i64 %add
+}
+
+; CHECK-LABEL: @fun12
+; CHECK: shr
+; CHECK: shl
+; CHECK: ret
+
+; PR17380
+; Make sure that the combined dags are legal if we run the DAGCombiner after
+; Legalization took place. The add instruction is redundant and increases by 
+; one the number of uses of the zext. This prevents the transformation from
+; firing before dags are legalized and optimized.
+; Once the add is removed, the number of uses becomes one and therefore the
+; dags are canonicalized. After Legalization, we need to make sure that the
+; valuetype for the shift count is legal.
+; Verify also that we correctly fold the shl-shr sequence into an 
+; AND with bitmask.
+
+define void @g(i32 %a) {
+  %b = lshr i32 %a, 2
+  %c = zext i32 %b to i64
+  %d = add i64 %c, 1
+  %e = shl i64 %c, 2
+  tail call void @f(i64 %e)
+  ret void
+}
+
+; CHECK-LABEL: @g
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: and
+; CHECK-NEXT: jmp
+
+declare void @f(i64)
+
diff --git a/test/CodeGen/X86/dagcombine-unsafe-math.ll b/test/CodeGen/X86/dagcombine-unsafe-math.ll
new file mode 100644
index 0000000..f06d9f1
--- /dev/null
+++ b/test/CodeGen/X86/dagcombine-unsafe-math.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -enable-unsafe-fp-math -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s 
+
+
+; rdar://13126763
+; Expression "x + x*x" was mistakenly transformed into "x * 3.0f".
+
+define float @test1(float %x) {
+  %t1 = fmul fast float %x, %x
+  %t2 = fadd fast float %t1, %x
+  ret float %t2
+; CHECK: test1
+; CHECK: vaddss
+}
+
+; (x + x) + x => x * 3.0
+define float @test2(float %x) {
+  %t1 = fadd fast float %x, %x
+  %t2 = fadd fast float %t1, %x
+  ret float %t2
+; CHECK: .long  1077936128
+; CHECK: test2
+; CHECK: vmulss LCPI1_0(%rip), %xmm0, %xmm0
+}
+
+; x + (x + x) => x * 3.0
+define float @test3(float %x) {
+  %t1 = fadd fast float %x, %x
+  %t2 = fadd fast float %t1, %x
+  ret float %t2
+; CHECK: .long  1077936128
+; CHECK: test3
+; CHECK: vmulss LCPI2_0(%rip), %xmm0, %xmm0
+}
+
+; (y + x) + x != x * 3.0
+define float @test4(float %x, float %y) {
+  %t1 = fadd fast float %x, %y
+  %t2 = fadd fast float %t1, %x
+  ret float %t2
+; CHECK: test4
+; CHECK: vaddss
+}
+
+; rdar://13445387
+; "x + x + x => 3.0 * x" should be disabled after legalization because 
+; Instruction-Selection doesn't know how to handle "3.0"
+; 
+define float @test5() {
+  %mul.i.i151 = fmul <4 x float> zeroinitializer, zeroinitializer
+  %vecext.i8.i152 = extractelement <4 x float> %mul.i.i151, i32 1
+  %vecext1.i9.i153 = extractelement <4 x float> %mul.i.i151, i32 0
+  %add.i10.i154 = fadd float %vecext1.i9.i153, %vecext.i8.i152
+  %vecext.i7.i155 = extractelement <4 x float> %mul.i.i151, i32 2
+  %add.i.i156 = fadd float %vecext.i7.i155, %add.i10.i154
+  ret float %add.i.i156
+}
diff --git a/test/CodeGen/X86/dagcombine_unsafe_math.ll b/test/CodeGen/X86/dagcombine_unsafe_math.ll
deleted file mode 100644
index 592cf1b..0000000
--- a/test/CodeGen/X86/dagcombine_unsafe_math.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc < %s -enable-unsafe-fp-math -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s 
-
-
-; rdar://13126763
-; Expression "x + x*x" was mistakenly transformed into "x * 3.0f".
-
-define float @test1(float %x) {
-  %t1 = fmul fast float %x, %x
-  %t2 = fadd fast float %t1, %x
-  ret float %t2
-; CHECK: test1
-; CHECK: vaddss
-}
-
-; (x + x) + x => x * 3.0
-define float @test2(float %x) {
-  %t1 = fadd fast float %x, %x
-  %t2 = fadd fast float %t1, %x
-  ret float %t2
-; CHECK: .long  1077936128
-; CHECK: test2
-; CHECK: vmulss LCPI1_0(%rip), %xmm0, %xmm0
-}
-
-; x + (x + x) => x * 3.0
-define float @test3(float %x) {
-  %t1 = fadd fast float %x, %x
-  %t2 = fadd fast float %t1, %x
-  ret float %t2
-; CHECK: .long  1077936128
-; CHECK: test3
-; CHECK: vmulss LCPI2_0(%rip), %xmm0, %xmm0
-}
-
-; (y + x) + x != x * 3.0
-define float @test4(float %x, float %y) {
-  %t1 = fadd fast float %x, %y
-  %t2 = fadd fast float %t1, %x
-  ret float %t2
-; CHECK: test4
-; CHECK: vaddss
-}
-
-; rdar://13445387
-; "x + x + x => 3.0 * x" should be disabled after legalization because 
-; Instruction-Selection dosen't know how to handle "3.0"
-; 
-define float @test5() {
-  %mul.i.i151 = fmul <4 x float> zeroinitializer, zeroinitializer
-  %vecext.i8.i152 = extractelement <4 x float> %mul.i.i151, i32 1
-  %vecext1.i9.i153 = extractelement <4 x float> %mul.i.i151, i32 0
-  %add.i10.i154 = fadd float %vecext1.i9.i153, %vecext.i8.i152
-  %vecext.i7.i155 = extractelement <4 x float> %mul.i.i151, i32 2
-  %add.i.i156 = fadd float %vecext.i7.i155, %add.i10.i154
-  ret float %add.i.i156
-}
diff --git a/test/CodeGen/X86/dbg-at-specficiation.ll b/test/CodeGen/X86/dbg-at-specficiation.ll
deleted file mode 100644
index a6eebcb..0000000
--- a/test/CodeGen/X86/dbg-at-specficiation.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc  < %s | FileCheck %s
-; Radar 10147769
-; Do not unnecessarily use AT_specification DIE.
-; CHECK-NOT: AT_specification
-
-@a = common global [10 x i32] zeroinitializer, align 16
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 720913, metadata !11, i32 12, metadata !"clang version 3.0 (trunk 140253)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, null, i32 0} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{i32 720948, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, [10 x i32]* @a, null} ; [ DW_TAG_variable ]
-!6 = metadata !{i32 720937, metadata !11} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 320, i64 32, i32 0, i32 0, metadata !8, metadata !9, i32 0, i32 0} ; [ DW_TAG_array_type ]
-!8 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !10}
-!10 = metadata !{i32 720929, i64 0, i64 10}        ; [ DW_TAG_subrange_type ]
-!11 = metadata !{metadata !"x.c", metadata !"/private/tmp"}
diff --git a/test/CodeGen/X86/dbg-byval-parameter.ll b/test/CodeGen/X86/dbg-byval-parameter.ll
deleted file mode 100644
index ef9e03c..0000000
--- a/test/CodeGen/X86/dbg-byval-parameter.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc -march=x86 -asm-verbose < %s | grep DW_TAG_formal_parameter
-
-
-%struct.Pt = type { double, double }
-%struct.Rect = type { %struct.Pt, %struct.Pt }
-
-define double @foo(%struct.Rect* byval %my_r0) nounwind ssp {
-entry:
-  %retval = alloca double                         ; <double*> [#uses=2]
-  %0 = alloca double                              ; <double*> [#uses=2]
-  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0), !dbg !15
-  %1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1]
-  %2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1]
-  %3 = load double* %2, align 8, !dbg !16         ; <double> [#uses=1]
-  store double %3, double* %0, align 8, !dbg !16
-  %4 = load double* %0, align 8, !dbg !16         ; <double> [#uses=1]
-  store double %4, double* %retval, align 8, !dbg !16
-  br label %return, !dbg !16
-
-return:                                           ; preds = %entry
-  %retval1 = load double* %retval, !dbg !16       ; <double> [#uses=1]
-  ret double %retval1, !dbg !16
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!3}
-
-!0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!5 = metadata !{metadata !6, metadata !7}
-!6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null} ; [ DW_TAG_structure_type ]
-!8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_structure_type ]
-!11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
-!14 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P2", i32 8, i64 128, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ]
-!15 = metadata !{i32 11, i32 0, metadata !1, null}
-!16 = metadata !{i32 12, i32 0, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !19, metadata !1, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{metadata !1}
-!19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
-!20 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-const-int.ll b/test/CodeGen/X86/dbg-const-int.ll
deleted file mode 100644
index fc4ff6d..0000000
--- a/test/CodeGen/X86/dbg-const-int.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc < %s - | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-macosx10.6.7"
-; Radar 9511391
-
-;CHECK:         .byte   4                       ## DW_AT_const_value
-define i32 @foo() nounwind uwtable readnone optsize ssp {
-entry:
-  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
-  ret i32 42, !dbg !10
-}
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 786449, metadata !13, i32 12, metadata !"clang version 3.0 (trunk 132191)", i1 true, metadata !"", i32 0, metadata !14, metadata !14, metadata !11, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !13, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 true, i32 ()* @foo, null, null, metadata !12, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !13, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786688, metadata !7, metadata !"i", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!7 = metadata !{i32 786443, metadata !13, metadata !1, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 42}
-!9 = metadata !{i32 2, i32 12, metadata !7, null}
-!10 = metadata !{i32 3, i32 2, metadata !7, null}
-!11 = metadata !{metadata !1}
-!12 = metadata !{metadata !6}
-!13 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
-!14 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-const.ll b/test/CodeGen/X86/dbg-const.ll
deleted file mode 100644
index b37eb0a..0000000
--- a/test/CodeGen/X86/dbg-const.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; RUN: llc < %s - | FileCheck %s
-;
-; FIXME: A potentially more interesting test case would be:
-; %call = @bar()
-; dbg.value j=0
-; %call2 = @bar()
-; dbg.value j=%call
-;
-; We cannot current handle the above sequence because codegenprepare
-; hoists the second dbg.value above %call2, which then appears to
-; conflict with j=0. It does this because SelectionDAG cannot handle
-; global debug values.
-
-target triple = "x86_64-apple-darwin10.0.0"
-
-;CHECK:        ## DW_OP_constu
-;CHECK-NEXT:  .byte	42
-define i32 @foobar() nounwind readonly noinline ssp {
-entry:
-  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
-  %call = tail call i32 @bar(), !dbg !11
-  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !6), !dbg !11
-  %call2 = tail call i32 @bar(), !dbg !11
-  %add = add nsw i32 %call2, %call, !dbg !12
-  ret i32 %add, !dbg !10
-}
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-declare i32 @bar() nounwind readnone
-
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !15, metadata !1, metadata !"foobar", metadata !"foobar", metadata !"foobar", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @foobar, null, null, metadata !14, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114183)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !15, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null}
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !15, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
-!6 = metadata !{i32 786688, metadata !7, metadata !"j", metadata !1, i32 15, metadata !5, i32 0, null}
-!7 = metadata !{i32 786443, metadata !15, metadata !0, i32 12, i32 52, i32 0} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 42}
-!9 = metadata !{i32 15, i32 12, metadata !7, null}
-!10 = metadata !{i32 23, i32 3, metadata !7, null}
-!11 = metadata !{i32 17, i32 3, metadata !7, null}
-!12 = metadata !{i32 18, i32 3, metadata !7, null}
-!13 = metadata !{metadata !0}
-!14 = metadata !{metadata !6}
-!15 = metadata !{metadata !"mu.c", metadata !"/private/tmp"}
-!16 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-declare-arg.ll b/test/CodeGen/X86/dbg-declare-arg.ll
deleted file mode 100644
index 55b4238..0000000
--- a/test/CodeGen/X86/dbg-declare-arg.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; RUN: llc -O0 -fast-isel=false < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-macosx10.6.7"
-;Radar 9321650
-
-;CHECK: ##DEBUG_VALUE: my_a 
-
-%class.A = type { i32, i32, i32, i32 }
-
-define void @_Z3fooi(%class.A* sret %agg.result, i32 %i) ssp {
-entry:
-  %i.addr = alloca i32, align 4
-  %j = alloca i32, align 4
-  %nrvo = alloca i1
-  %cleanup.dest.slot = alloca i32
-  store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !26), !dbg !27
-  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !28), !dbg !30
-  store i32 0, i32* %j, align 4, !dbg !31
-  %tmp = load i32* %i.addr, align 4, !dbg !32
-  %cmp = icmp eq i32 %tmp, 42, !dbg !32
-  br i1 %cmp, label %if.then, label %if.end, !dbg !32
-
-if.then:                                          ; preds = %entry
-  %tmp1 = load i32* %i.addr, align 4, !dbg !33
-  %add = add nsw i32 %tmp1, 1, !dbg !33
-  store i32 %add, i32* %j, align 4, !dbg !33
-  br label %if.end, !dbg !35
-
-if.end:                                           ; preds = %if.then, %entry
-  store i1 false, i1* %nrvo, !dbg !36
-  call void @llvm.dbg.declare(metadata !{%class.A* %agg.result}, metadata !37), !dbg !39
-  %tmp2 = load i32* %j, align 4, !dbg !40
-  %x = getelementptr inbounds %class.A* %agg.result, i32 0, i32 0, !dbg !40
-  store i32 %tmp2, i32* %x, align 4, !dbg !40
-  store i1 true, i1* %nrvo, !dbg !41
-  store i32 1, i32* %cleanup.dest.slot
-  %nrvo.val = load i1* %nrvo, !dbg !42
-  br i1 %nrvo.val, label %nrvo.skipdtor, label %nrvo.unused, !dbg !42
-
-nrvo.unused:                                      ; preds = %if.end
-  call void @_ZN1AD1Ev(%class.A* %agg.result), !dbg !42
-  br label %nrvo.skipdtor, !dbg !42
-
-nrvo.skipdtor:                                    ; preds = %nrvo.unused, %if.end
-  ret void, !dbg !42
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-define linkonce_odr void @_ZN1AD1Ev(%class.A* %this) unnamed_addr ssp align 2 {
-entry:
-  %this.addr = alloca %class.A*, align 8
-  store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !43), !dbg !44
-  %this1 = load %class.A** %this.addr
-  call void @_ZN1AD2Ev(%class.A* %this1)
-  ret void, !dbg !45
-}
-
-define linkonce_odr void @_ZN1AD2Ev(%class.A* %this) unnamed_addr nounwind ssp align 2 {
-entry:
-  %this.addr = alloca %class.A*, align 8
-  store %class.A* %this, %class.A** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !46), !dbg !47
-  %this1 = load %class.A** %this.addr
-  %x = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !48
-  store i32 1, i32* %x, align 4, !dbg !48
-  ret void, !dbg !48
-}
-
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"~A", metadata !"~A", metadata !"", i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 589826, metadata !51, metadata !2, metadata !"A", i32 2, i64 128, i64 32, i32 0, i32 0, null, metadata !4, i32 0, null, null} ; [ DW_TAG_class_type ]
-!2 = metadata !{i32 786449, metadata !51, i32 4, metadata !"clang version 3.0 (trunk 130127)", i1 false, metadata !"", i32 0, metadata !24, metadata !24, metadata !50, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786473, metadata !51} ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !5, metadata !7, metadata !8, metadata !9, metadata !0, metadata !10, metadata !14}
-!5 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !6} ; [ DW_TAG_member ]
-!8 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"z", i32 2, i64 32, i64 32, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
-!9 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"o", i32 2, i64 32, i64 32, i64 96, i32 0, metadata !6} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"A", metadata !"A", metadata !"", i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!12 = metadata !{null, metadata !13}
-!13 = metadata !{i32 786447, metadata !2, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"A", metadata !"A", metadata !"", i32 2, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!15 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!16 = metadata !{null, metadata !13, metadata !17}
-!17 = metadata !{i32 589840, null, metadata !2, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_reference_type ]
-!18 = metadata !{i32 786470, metadata !2, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !1} ; [ DW_TAG_const_type ]
-!19 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", i32 4, metadata !20, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%class.A*, i32)* @_Z3fooi, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!20 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !21, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!21 = metadata !{metadata !1}
-!22 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"~A", metadata !"~A", metadata !"_ZN1AD1Ev", i32 2, metadata !23, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%class.A*)* @_ZN1AD1Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!23 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !24, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!24 = metadata !{null}
-!25 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"~A", metadata !"~A", metadata !"_ZN1AD2Ev", i32 2, metadata !23, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%class.A*)* @_ZN1AD2Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!26 = metadata !{i32 786689, metadata !19, metadata !"i", metadata !3, i32 16777220, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!27 = metadata !{i32 4, i32 11, metadata !19, null}
-!28 = metadata !{i32 786688, metadata !29, metadata !"j", metadata !3, i32 5, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786443, metadata !51, metadata !19, i32 4, i32 14, i32 0} ; [ DW_TAG_lexical_block ]
-!30 = metadata !{i32 5, i32 7, metadata !29, null}
-!31 = metadata !{i32 5, i32 12, metadata !29, null}
-!32 = metadata !{i32 6, i32 3, metadata !29, null}
-!33 = metadata !{i32 7, i32 5, metadata !34, null}
-!34 = metadata !{i32 786443, metadata !51, metadata !29, i32 6, i32 16, i32 1} ; [ DW_TAG_lexical_block ]
-!35 = metadata !{i32 8, i32 3, metadata !34, null}
-!36 = metadata !{i32 9, i32 9, metadata !29, null}
-!37 = metadata !{i32 786688, metadata !29, metadata !"my_a", metadata !3, i32 9, metadata !38, i32 0, null} ; [ DW_TAG_auto_variable ]
-!38 = metadata !{i32 589840, metadata !2, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ]
-!39 = metadata !{i32 9, i32 5, metadata !29, null}
-!40 = metadata !{i32 10, i32 3, metadata !29, null}
-!41 = metadata !{i32 11, i32 3, metadata !29, null}
-!42 = metadata !{i32 12, i32 1, metadata !29, null}
-!43 = metadata !{i32 786689, metadata !22, metadata !"this", metadata !3, i32 16777218, metadata !13, i32 64, null} ; [ DW_TAG_arg_variable ]
-!44 = metadata !{i32 2, i32 47, metadata !22, null}
-!45 = metadata !{i32 2, i32 61, metadata !22, null}
-!46 = metadata !{i32 786689, metadata !25, metadata !"this", metadata !3, i32 16777218, metadata !13, i32 64, null} ; [ DW_TAG_arg_variable ]
-!47 = metadata !{i32 2, i32 47, metadata !25, null}
-!48 = metadata !{i32 2, i32 54, metadata !49, null}
-!49 = metadata !{i32 786443, metadata !51, metadata !25, i32 2, i32 52, i32 2} ; [ DW_TAG_lexical_block ]
-!50 = metadata !{metadata !0, metadata !10, metadata !14, metadata !19, metadata !22, metadata !25}
-!51 = metadata !{metadata !"a.cc", metadata !"/private/tmp"}
diff --git a/test/CodeGen/X86/dbg-declare.ll b/test/CodeGen/X86/dbg-declare.ll
deleted file mode 100644
index d74e270..0000000
--- a/test/CodeGen/X86/dbg-declare.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc < %s -O0 -mtriple x86_64-apple-darwin
-; <rdar://problem/11134152>
-
-define i32 @foo(i32* %x) nounwind uwtable ssp {
-entry:
-  %x.addr = alloca i32*, align 8
-  %saved_stack = alloca i8*
-  %cleanup.dest.slot = alloca i32
-  store i32* %x, i32** %x.addr, align 8
-  call void @llvm.dbg.declare(metadata !{i32** %x.addr}, metadata !14), !dbg !15
-  %0 = load i32** %x.addr, align 8, !dbg !16
-  %1 = load i32* %0, align 4, !dbg !16
-  %2 = zext i32 %1 to i64, !dbg !16
-  %3 = call i8* @llvm.stacksave(), !dbg !16
-  store i8* %3, i8** %saved_stack, !dbg !16
-  %vla = alloca i8, i64 %2, align 16, !dbg !16
-  call void @llvm.dbg.declare(metadata !{i8* %vla}, metadata !18), !dbg !23
-  store i32 1, i32* %cleanup.dest.slot
-  %4 = load i8** %saved_stack, !dbg !24
-  call void @llvm.stackrestore(i8* %4), !dbg !24
-  ret i32 0, !dbg !25
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-declare i8* @llvm.stacksave() nounwind
-
-declare void @llvm.stackrestore(i8*) nounwind
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 786449, metadata !26, i32 12, metadata !"clang version 3.1 (trunk 153698)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !26, metadata !0, metadata !"foo", metadata !"foo", metadata !"", i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32*)* @foo, null, null, metadata !12, i32 0} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!8 = metadata !{metadata !9, metadata !10}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_const_type ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !6, i32 16777221, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 5, i32 21, metadata !5, null}
-!16 = metadata !{i32 7, i32 13, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !26, metadata !5, i32 6, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{i32 786688, metadata !17, metadata !"a", metadata !6, i32 7, metadata !19, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!19 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 0, i64 8, i32 0, i32 0, metadata !20, metadata !21, i32 0, i32 0} ; [ DW_TAG_array_type ]
-!20 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!21 = metadata !{metadata !22}
-!22 = metadata !{i32 786465, i64 0, i64 -1}        ; [ DW_TAG_subrange_type ]
-!23 = metadata !{i32 7, i32 8, metadata !17, null}
-!24 = metadata !{i32 9, i32 1, metadata !17, null}
-!25 = metadata !{i32 8, i32 3, metadata !17, null}
-!26 = metadata !{metadata !"20020104-2.c", metadata !"/Volumes/Sandbox/llvm"}
diff --git a/test/CodeGen/X86/dbg-file-name.ll b/test/CodeGen/X86/dbg-file-name.ll
deleted file mode 100644
index 797b4b5..0000000
--- a/test/CodeGen/X86/dbg-file-name.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc -enable-dwarf-directory -mtriple x86_64-apple-darwin10.0.0  < %s | FileCheck %s
-
-; Radar 8884898
-; CHECK: file	1 "simple.c"
-
-declare i32 @printf(i8*, ...) nounwind
-
-define i32 @main() nounwind {
-  ret i32 0
-}
-
-!llvm.dbg.cu = !{!2}
-
-!1 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !10, i32 1, metadata !"LLVM build 00", i1 true, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 786468, metadata !10, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !10, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 9, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786453, metadata !10, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!8 = metadata !{metadata !5}
-!9 = metadata !{metadata !6}
-!10 = metadata !{metadata !"simple.c", metadata !"/Users/manav/one/two"}
-!11 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-i128-const.ll b/test/CodeGen/X86/dbg-i128-const.ll
deleted file mode 100644
index f413909..0000000
--- a/test/CodeGen/X86/dbg-i128-const.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
-
-; CHECK: DW_AT_const_value
-; CHECK-NEXT: 42
-
-define i128 @__foo(i128 %a, i128 %b) nounwind {
-entry:
-  tail call void @llvm.dbg.value(metadata !0, i64 0, metadata !1), !dbg !11
-  %add = add i128 %a, %b, !dbg !11
-  ret i128 %add, !dbg !11
-}
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!5}
-
-!0 = metadata !{i128 42 }
-!1 = metadata !{i32 786688, metadata !2, metadata !"MAX", metadata !4, i32 29, metadata !8, i32 0, null} ; [ DW_TAG_auto_variable ]
-!2 = metadata !{i32 786443, metadata !13, metadata !3, i32 26, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{i32 786478, metadata !13, metadata !4, metadata !"__foo", metadata !"__foo", metadata !"__foo", i32 26, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, i128 (i128, i128)* @__foo, null, null, null, i32 26} ; [ DW_TAG_subprogram ]
-!4 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 786449, metadata !13, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !15, metadata !15, metadata !12, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 786453, metadata !13, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!7 = metadata !{metadata !8, metadata !8, metadata !8}
-!8 = metadata !{i32 786454, metadata !14, metadata !4, metadata !"ti_int", i32 78, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ]
-!9 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
-!10 = metadata !{i32 786468, metadata !13, metadata !4, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 29, i32 0, metadata !2, null}
-!12 = metadata !{metadata !3}
-!13 = metadata !{metadata !"foo.c", metadata !"/tmp"}
-!14 = metadata !{metadata !"myint.h", metadata !"/tmp"}
-!15 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-large-unsigned-const.ll b/test/CodeGen/X86/dbg-large-unsigned-const.ll
deleted file mode 100644
index c5cbf06..0000000
--- a/test/CodeGen/X86/dbg-large-unsigned-const.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: llc -filetype=obj %s -o /dev/null
-; Hanle large unsigned constant values.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-target triple = "i386-apple-macosx10.7.0"
-
-define zeroext i1 @_Z3iseRKxS0_(i64* nocapture %LHS, i64* nocapture %RHS) nounwind readonly optsize ssp {
-entry:
-  tail call void @llvm.dbg.value(metadata !{i64* %LHS}, i64 0, metadata !7), !dbg !13
-  tail call void @llvm.dbg.value(metadata !{i64* %RHS}, i64 0, metadata !11), !dbg !14
-  %tmp1 = load i64* %LHS, align 4, !dbg !15
-  %tmp3 = load i64* %RHS, align 4, !dbg !15
-  %cmp = icmp eq i64 %tmp1, %tmp3, !dbg !15
-  ret i1 %cmp, !dbg !15
-}
-
-define zeroext i1 @_Z2fnx(i64 %a) nounwind readnone optsize ssp {
-entry:
-  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !12), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !12), !dbg !20
-  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !21), !dbg !24
-  tail call void @llvm.dbg.value(metadata !25, i64 0, metadata !26), !dbg !27
-  %cmp.i = icmp eq i64 %a, 9223372036854775807, !dbg !28
-  ret i1 %cmp.i, !dbg !22
-}
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!0}
-!29 = metadata !{metadata !1, metadata !6}
-!30 = metadata !{metadata !7, metadata !11}
-!31 = metadata !{metadata !12}
-
-!0 = metadata !{i32 786449, metadata !32, i32 4, metadata !"clang version 3.0 (trunk 135593)", i1 true, metadata !"", i32 0, metadata !33, metadata !33, metadata !29, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !32, null, metadata !"ise", metadata !"ise", metadata !"_Z3iseRKxS0_", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i1 (i64*, i64*)* @_Z3iseRKxS0_, null, null, metadata !30, i32 2} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !32, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !32, null, metadata !"fn", metadata !"fn", metadata !"_Z2fnx", i32 6, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i1 (i64)* @_Z2fnx, null, null, metadata !31, i32 6} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"LHS", metadata !2, i32 16777218, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786448, metadata !0, null, null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_reference_type ]
-!9 = metadata !{i32 786470, metadata !0, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_const_type ]
-!10 = metadata !{i32 786468, null, metadata !0, metadata !"long long int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 786689, metadata !1, metadata !"RHS", metadata !2, i32 33554434, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 786689, metadata !6, metadata !"a", metadata !2, i32 16777222, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 2, i32 27, metadata !1, null}
-!14 = metadata !{i32 2, i32 49, metadata !1, null}
-!15 = metadata !{i32 3, i32 3, metadata !16, null}
-!16 = metadata !{i32 786443, metadata !32, metadata !1, i32 2, i32 54, i32 0} ; [ DW_TAG_lexical_block ]
-!20 = metadata !{i32 6, i32 19, metadata !6, null}
-!21 = metadata !{i32 786689, metadata !1, metadata !"LHS", metadata !2, i32 16777218, metadata !8, i32 0, metadata !22} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{i32 7, i32 10, metadata !23, null}
-!23 = metadata !{i32 786443, metadata !32, metadata !6, i32 6, i32 22, i32 1} ; [ DW_TAG_lexical_block ]
-!24 = metadata !{i32 2, i32 27, metadata !1, metadata !22}
-!25 = metadata !{i64 9223372036854775807}         
-!26 = metadata !{i32 786689, metadata !1, metadata !"RHS", metadata !2, i32 33554434, metadata !8, i32 0, metadata !22} ; [ DW_TAG_arg_variable ]
-!27 = metadata !{i32 2, i32 49, metadata !1, metadata !22}
-!28 = metadata !{i32 3, i32 3, metadata !16, metadata !22}
-!32 = metadata !{metadata !"lli.cc", metadata !"/private/tmp"}
-!33 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-merge-loc-entry.ll b/test/CodeGen/X86/dbg-merge-loc-entry.ll
deleted file mode 100644
index ccf4808..0000000
--- a/test/CodeGen/X86/dbg-merge-loc-entry.ll
+++ /dev/null
@@ -1,73 +0,0 @@
-; RUN: llc < %s -o %t -filetype=obj
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
-; RUN: llc < %s -o %t -filetype=obj -regalloc=basic
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin8"
-
-;CHECK: DW_AT_location{{.*}}(<0x01> 55 )
-
-%0 = type { i64, i1 }
-
-@__clz_tab = external constant [256 x i8]
-
-define hidden i128 @__divti3(i128 %u, i128 %v) nounwind readnone {
-entry:
-  tail call void @llvm.dbg.value(metadata !{i128 %u}, i64 0, metadata !14), !dbg !15
-  tail call void @llvm.dbg.value(metadata !16, i64 0, metadata !17), !dbg !21
-  br i1 undef, label %bb2, label %bb4, !dbg !22
-
-bb2:                                              ; preds = %entry
-  br label %bb4, !dbg !23
-
-bb4:                                              ; preds = %bb2, %entry
-  br i1 undef, label %__udivmodti4.exit, label %bb82.i, !dbg !24
-
-bb82.i:                                           ; preds = %bb4
-  unreachable
-
-__udivmodti4.exit:                                ; preds = %bb4
-  ret i128 undef, !dbg !27
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-declare %0 @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
-
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !29, metadata !1, metadata !"__udivmodti4", metadata !"__udivmodti4", metadata !"", i32 879, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, null, i32 879} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !29} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !29, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !31, metadata !31, metadata !28, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !29, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5, metadata !5, metadata !5, metadata !8}
-!5 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"UTItype", i32 166, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786473, metadata !30} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786447, metadata !29, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{i32 786478, metadata !29, metadata !1, metadata !"__divti3", metadata !"__divti3", metadata !"__divti3", i32 1094, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i128 (i128, i128)* @__divti3, null, null, null, i32 1094} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !29, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!11 = metadata !{metadata !12, metadata !12, metadata !12}
-!12 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"TItype", i32 160, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_typedef ]
-!13 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786689, metadata !9, metadata !"u", metadata !1, i32 1093, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 1093, i32 0, metadata !9, null}
-!16 = metadata !{i64 0}
-!17 = metadata !{i32 786688, metadata !18, metadata !"c", metadata !1, i32 1095, metadata !19, i32 0, null} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 786443, metadata !29, metadata !9, i32 1094, i32 0, i32 13} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"word_type", i32 424, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_typedef ]
-!20 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!21 = metadata !{i32 1095, i32 0, metadata !18, null}
-!22 = metadata !{i32 1103, i32 0, metadata !18, null}
-!23 = metadata !{i32 1104, i32 0, metadata !18, null}
-!24 = metadata !{i32 1003, i32 0, metadata !25, metadata !26}
-!25 = metadata !{i32 786443, metadata !29, metadata !0, i32 879, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{i32 1107, i32 0, metadata !18, null}
-!27 = metadata !{i32 1111, i32 0, metadata !18, null}
-!28 = metadata !{metadata !0, metadata !9}
-!29 = metadata !{metadata !"foobar.c", metadata !"/tmp"}
-!30 = metadata !{metadata !"foobar.h", metadata !"/tmp"}
-!31 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-prolog-end.ll b/test/CodeGen/X86/dbg-prolog-end.ll
deleted file mode 100644
index c8d8499..0000000
--- a/test/CodeGen/X86/dbg-prolog-end.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: llc -O0 < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-macosx10.6.7"
-
-;CHECK: .loc	1 2 11 prologue_end
-define i32 @foo(i32 %i) nounwind ssp {
-entry:
-  %i.addr = alloca i32, align 4
-  %j = alloca i32, align 4
-  store i32 %i, i32* %i.addr, align 4
-  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !7), !dbg !8
-  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !9), !dbg !11
-  store i32 2, i32* %j, align 4, !dbg !12
-  %tmp = load i32* %j, align 4, !dbg !13
-  %inc = add nsw i32 %tmp, 1, !dbg !13
-  store i32 %inc, i32* %j, align 4, !dbg !13
-  %tmp1 = load i32* %j, align 4, !dbg !14
-  %tmp2 = load i32* %i.addr, align 4, !dbg !14
-  %add = add nsw i32 %tmp1, %tmp2, !dbg !14
-  store i32 %add, i32* %j, align 4, !dbg !14
-  %tmp3 = load i32* %j, align 4, !dbg !15
-  ret i32 %tmp3, !dbg !15
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-define i32 @main() nounwind ssp {
-entry:
-  %retval = alloca i32, align 4
-  store i32 0, i32* %retval
-  %call = call i32 @foo(i32 21), !dbg !16
-  ret i32 %call, !dbg !16
-}
-
-!llvm.dbg.cu = !{!0}
-!18 = metadata !{metadata !1, metadata !6}
-
-!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.0 (trunk 131100)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @main, null, null, null, i32 7} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777217, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 1, i32 13, metadata !1, null}
-!9 = metadata !{i32 786688, metadata !10, metadata !"j", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{i32 786443, metadata !19, metadata !1, i32 1, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 2, i32 6, metadata !10, null}
-!12 = metadata !{i32 2, i32 11, metadata !10, null}
-!13 = metadata !{i32 3, i32 2, metadata !10, null}
-!14 = metadata !{i32 4, i32 2, metadata !10, null}
-!15 = metadata !{i32 5, i32 2, metadata !10, null}
-!16 = metadata !{i32 8, i32 2, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !19, metadata !6, i32 7, i32 12, i32 1} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{metadata !"/tmp/a.c", metadata !"/private/tmp"}
-!20 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-subrange.ll b/test/CodeGen/X86/dbg-subrange.ll
deleted file mode 100644
index ffb5f2d..0000000
--- a/test/CodeGen/X86/dbg-subrange.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc -O0 < %s | FileCheck %s
-; Radar 10464995
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.7.2"
-
-@s = common global [4294967296 x i8] zeroinitializer, align 16
-;CHECK: .long	4294967295
-
-define void @bar() nounwind uwtable ssp {
-entry:
-  store i8 97, i8* getelementptr inbounds ([4294967296 x i8]* @s, i32 0, i64 0), align 1, !dbg !18
-  ret void, !dbg !20
-}
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 786449, metadata !21, i32 12, metadata !"clang version 3.1 (trunk 144833)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !11,  metadata !11, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !21, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void ()* @bar, null, null, metadata !9, i32 0} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 720937, metadata !21} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!8 = metadata !{null}
-!9 = metadata !{metadata !10}
-!10 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!11 = metadata !{metadata !13}
-!13 = metadata !{i32 720948, i32 0, null, metadata !"s", metadata !"s", metadata !"", metadata !6, i32 2, metadata !14, i32 0, i32 1, [4294967296 x i8]* @s, null} ; [ DW_TAG_variable ]
-!14 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 34359738368, i64 8, i32 0, i32 0, metadata !15, metadata !16, i32 0, i32 0} ; [ DW_TAG_array_type ]
-!15 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!16 = metadata !{metadata !17}
-!17 = metadata !{i32 720929, i64 0, i64 4294967296} ; [ DW_TAG_subrange_type ]
-!18 = metadata !{i32 5, i32 3, metadata !19, null}
-!19 = metadata !{i32 786443, metadata !21, metadata !5, i32 4, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!20 = metadata !{i32 6, i32 1, metadata !19, null}
-!21 = metadata !{metadata !"small.c", metadata !"/private/tmp"}
diff --git a/test/CodeGen/X86/dbg-value-dag-combine.ll b/test/CodeGen/X86/dbg-value-dag-combine.ll
deleted file mode 100644
index e281493..0000000
--- a/test/CodeGen/X86/dbg-value-dag-combine.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin10.0.0"
-; PR 9817
-
-
-declare  <4 x i32> @__amdil_get_global_id_int()
-declare  void @llvm.dbg.value(metadata , i64 , metadata )
-define void @__OpenCL_test_kernel(i32 addrspace(1)* %ip) nounwind {
-entry:
-  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata
-!7), !dbg !8
-  %0 = call <4 x i32> @__amdil_get_global_id_int() nounwind
-  %1 = extractelement <4 x i32> %0, i32 0
-  call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !9), !dbg !11
-  call void @llvm.dbg.value(metadata !12, i64 0, metadata !13), !dbg !14
-  %tmp2 = load i32 addrspace(1)* %ip, align 4, !dbg !15
-  %tmp3 = add i32 0, %tmp2, !dbg !15
-; CHECK:  ##DEBUG_VALUE: idx <- EAX{{$}}
-  call void @llvm.dbg.value(metadata !{i32 %tmp3}, i64 0, metadata !13), !dbg
-!15
-  %arrayidx = getelementptr i32 addrspace(1)* %ip, i32 %1, !dbg !16
-  store i32 %tmp3, i32 addrspace(1)* %arrayidx, align 4, !dbg !16
-  ret void, !dbg !17
-}
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !19, metadata !1, metadata !"__OpenCL_test_kernel", metadata !"__OpenCL_test_kernel", metadata !"__OpenCL_test_kernel", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !19, i32 1, metadata !"clc", i1 false, metadata !"", i32 0, metadata !12, metadata !12, metadata !18, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !19, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{null, metadata !5}
-!5 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
-!6 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786689, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 1, i32 42, metadata !0, null}
-!9 = metadata !{i32 786688, metadata !10, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{i32 786443, metadata !19, metadata !0, i32 2, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 3, i32 41, metadata !10, null}
-!12 = metadata !{i32 0}
-!13 = metadata !{i32 786688, metadata !10, metadata !"idx", metadata !1, i32 4, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!14 = metadata !{i32 4, i32 20, metadata !10, null}
-!15 = metadata !{i32 5, i32 15, metadata !10, null}
-!16 = metadata !{i32 6, i32 18, metadata !10, null}
-!17 = metadata !{i32 7, i32 1, metadata !0, null}
-!18 = metadata !{metadata !0}
-!19 = metadata !{metadata !"OCL6368.tmp.cl", metadata !"E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp"}
diff --git a/test/CodeGen/X86/dbg-value-isel.ll b/test/CodeGen/X86/dbg-value-isel.ll
deleted file mode 100644
index 0013385..0000000
--- a/test/CodeGen/X86/dbg-value-isel.ll
+++ /dev/null
@@ -1,104 +0,0 @@
-; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin10.0.0"
-; PR 9879
-
-; CHECK: ##DEBUG_VALUE: tid <-
-%0 = type { i8*, i8*, i8*, i8*, i32 }
-
-@sgv = internal addrspace(2) constant [1 x i8] zeroinitializer
-@fgv = internal addrspace(2) constant [1 x i8] zeroinitializer
-@lvgv = internal constant [0 x i8*] zeroinitializer
-@llvm.global.annotations = appending global [1 x %0] [%0 { i8* bitcast (void (i32 addrspace(1)*)* @__OpenCL_nbt02_kernel to i8*), i8* bitcast ([1 x i8] addrspace(2)* @sgv to i8*), i8* bitcast ([1 x i8] addrspace(2)* @fgv to i8*), i8* bitcast ([0 x i8*]* @lvgv to i8*), i32 0 }], section "llvm.metadata"
-
-define void @__OpenCL_nbt02_kernel(i32 addrspace(1)* %ip) nounwind {
-entry:
-  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata !8), !dbg !9
-  %0 = call <4 x i32> @__amdil_get_local_id_int() nounwind
-  %1 = extractelement <4 x i32> %0, i32 0
-  br label %2
-
-; <label>:2                                       ; preds = %entry
-  %3 = phi i32 [ %1, %entry ]
-  br label %4
-
-; <label>:4                                       ; preds = %2
-  %5 = phi i32 [ %3, %2 ]
-  br label %get_local_id.exit
-
-get_local_id.exit:                                ; preds = %4
-  %6 = phi i32 [ %5, %4 ]
-  call void @llvm.dbg.value(metadata !{i32 %6}, i64 0, metadata !10), !dbg !12
-  %7 = call <4 x i32> @__amdil_get_global_id_int() nounwind, !dbg !12
-  %8 = extractelement <4 x i32> %7, i32 0, !dbg !12
-  br label %9
-
-; <label>:9                                       ; preds = %get_local_id.exit
-  %10 = phi i32 [ %8, %get_local_id.exit ]
-  br label %11
-
-; <label>:11                                      ; preds = %9
-  %12 = phi i32 [ %10, %9 ]
-  br label %get_global_id.exit
-
-get_global_id.exit:                               ; preds = %11
-  %13 = phi i32 [ %12, %11 ]
-  call void @llvm.dbg.value(metadata !{i32 %13}, i64 0, metadata !13), !dbg !14
-  %14 = call <4 x i32> @__amdil_get_local_size_int() nounwind
-  %15 = extractelement <4 x i32> %14, i32 0
-  br label %16
-
-; <label>:16                                      ; preds = %get_global_id.exit
-  %17 = phi i32 [ %15, %get_global_id.exit ]
-  br label %18
-
-; <label>:18                                      ; preds = %16
-  %19 = phi i32 [ %17, %16 ]
-  br label %get_local_size.exit
-
-get_local_size.exit:                              ; preds = %18
-  %20 = phi i32 [ %19, %18 ]
-  call void @llvm.dbg.value(metadata !{i32 %20}, i64 0, metadata !15), !dbg !16
-  %tmp5 = add i32 %6, %13, !dbg !17
-  %tmp7 = add i32 %tmp5, %20, !dbg !17
-  store i32 %tmp7, i32 addrspace(1)* %ip, align 4, !dbg !17
-  br label %return, !dbg !17
-
-return:                                           ; preds = %get_local_size.exit
-  ret void, !dbg !18
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-declare <4 x i32> @__amdil_get_local_size_int() nounwind
-
-declare <4 x i32> @__amdil_get_local_id_int() nounwind
-
-declare <4 x i32> @__amdil_get_global_id_int() nounwind
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !20, metadata !1, metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !20, i32 1, metadata !"clc", i1 false, metadata !"", i32 0, metadata !21, metadata !21, metadata !19, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !20, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{null, metadata !5}
-!5 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
-!6 = metadata !{i32 589846, metadata !20, metadata !2, metadata !"uint", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
-!7 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786689, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 1, i32 32, metadata !0, null}
-!10 = metadata !{i32 786688, metadata !11, metadata !"tid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !0, i32 2, i32 1, metadata !1, i32 1} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 5, i32 24, metadata !11, null}
-!13 = metadata !{i32 786688, metadata !11, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!14 = metadata !{i32 6, i32 25, metadata !11, null}
-!15 = metadata !{i32 786688, metadata !11, metadata !"lsz", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!16 = metadata !{i32 7, i32 26, metadata !11, null}
-!17 = metadata !{i32 9, i32 24, metadata !11, null}
-!18 = metadata !{i32 10, i32 1, metadata !0, null}
-!19 = metadata !{metadata !0}
-!20 = metadata !{metadata !"OCLlLwTXZ.cl", metadata !"/tmp"}
-!21 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-value-location.ll b/test/CodeGen/X86/dbg-value-location.ll
deleted file mode 100644
index f896e58..0000000
--- a/test/CodeGen/X86/dbg-value-location.ll
+++ /dev/null
@@ -1,75 +0,0 @@
-; RUN: llc < %s | FileCheck %s
-; RUN: llc < %s -regalloc=basic | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin10.0.0"
-;Radar 8950491
-
-;CHECK: .long Lset5
-;CHECK-NEXT:        ## DW_AT_decl_file
-;CHECK-NEXT:        ## DW_AT_decl_line
-;CHECK-NEXT:        ## DW_AT_type
-;CHECK-NEXT:        ## DW_AT_location
-
-@dfm = external global i32, align 4
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-define i32 @foo(i32 %dev, i64 %cmd, i8* %data, i32 %data2) nounwind optsize ssp {
-entry:
-  call void @llvm.dbg.value(metadata !{i32 %dev}, i64 0, metadata !12), !dbg !13
-  %tmp.i = load i32* @dfm, align 4, !dbg !14
-  %cmp.i = icmp eq i32 %tmp.i, 0, !dbg !14
-  br i1 %cmp.i, label %if.else, label %if.end.i, !dbg !14
-
-if.end.i:                                         ; preds = %entry
-  switch i64 %cmd, label %if.then [
-    i64 2147772420, label %bb.i
-    i64 536897538, label %bb116.i
-  ], !dbg !22
-
-bb.i:                                             ; preds = %if.end.i
-  unreachable
-
-bb116.i:                                          ; preds = %if.end.i
-  unreachable
-
-if.then:                                          ; preds = %if.end.i
-  ret i32 undef, !dbg !23
-
-if.else:                                          ; preds = %entry
-  ret i32 0
-}
-
-declare hidden fastcc i32 @bar(i32, i32* nocapture) nounwind optsize ssp
-declare hidden fastcc i32 @bar2(i32) nounwind optsize ssp
-declare hidden fastcc i32 @bar3(i32) nounwind optsize ssp
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 19510, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i64, i8*, i32)* @foo, null, null, null, i32 19510} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !27, i32 12, metadata !"clang version 2.9 (trunk 124753)", i1 true, metadata !"", i32 0, metadata !28, metadata !28, metadata !24, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !26, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar3", metadata !"bar3", metadata !"", i32 14827, metadata !3, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @bar3, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar2", metadata !"bar2", metadata !"", i32 15397, metadata !3, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @bar2, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!8 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar", metadata !"bar", metadata !"", i32 12382, metadata !9, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i32*)* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786453, metadata !26, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !10, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!10 = metadata !{metadata !11}
-!11 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!12 = metadata !{i32 786689, metadata !0, metadata !"var", metadata !1, i32 19509, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 19509, i32 20, metadata !0, null}
-!14 = metadata !{i32 18091, i32 2, metadata !15, metadata !17}
-!15 = metadata !{i32 786443, metadata !26, metadata !16, i32 18086, i32 1, i32 748} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"foo_bar", metadata !"foo_bar", metadata !"", i32 18086, metadata !3, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 19514, i32 2, metadata !18, null}
-!18 = metadata !{i32 786443, metadata !26, metadata !0, i32 19510, i32 1, i32 99} ; [ DW_TAG_lexical_block ]
-!22 = metadata !{i32 18094, i32 2, metadata !15, metadata !17}
-!23 = metadata !{i32 19524, i32 1, metadata !18, null}
-!24 = metadata !{metadata !0, metadata !6, metadata !7, metadata !8}
-!25 = metadata !{i32 786473, metadata !27} ; [ DW_TAG_file_type ]
-!26 = metadata !{metadata !"/tmp/f.c", metadata !"/tmp"}
-!27 = metadata !{metadata !"f.i", metadata !"/tmp"}
-!28 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-value-terminator.ll b/test/CodeGen/X86/dbg-value-terminator.ll
deleted file mode 100644
index e8d70de..0000000
--- a/test/CodeGen/X86/dbg-value-terminator.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; RUN: llc -mtriple=x86_64-apple-macosx < %s -verify-machineinstrs | FileCheck %s
-;
-; PR16143: MachineOperand::setIsKill(bool): Assertion
-;
-; verify-machineinstrs should ensure that DEBUG_VALUEs go before the
-; terminator.
-;
-; CHECK-LABEL: test:
-; CHECK: ##DEBUG_VALUE: i
-%a = type { i32, i32 }
-
-define hidden fastcc %a* @test() #1 {
-entry:
-  %0 = icmp eq %a* undef, null, !dbg !1
-  br i1 %0, label %"14", label %return, !dbg !1
-
-"14":                                             ; preds = %"8"
-  br i1 undef, label %"25", label %"21", !dbg !1
-
-"21":                                             ; preds = %"14"
-  br i1 undef, label %may_unswitch_on.exit, label %"6.i", !dbg !1
-
-"6.i":                                            ; preds = %"21"
-  br i1 undef, label %"10.i", label %may_unswitch_on.exit, !dbg !1
-
-"10.i":                                           ; preds = %"6.i"
-  br i1 undef, label %may_unswitch_on.exit, label %"12.i", !dbg !1
-
-"12.i":                                           ; preds = %"10.i"
-  br i1 undef, label %"4.i.i", label %"3.i.i", !dbg !1
-
-"3.i.i":                                          ; preds = %"12.i"
-  br i1 undef, label %"4.i.i", label %VEC_edge_base_index.exit.i, !dbg !1
-
-"4.i.i":                                          ; preds = %"3.i.i", %"12.i"
-  unreachable, !dbg !1
-
-VEC_edge_base_index.exit.i:                       ; preds = %"3.i.i"
-  br i1 undef, label %may_unswitch_on.exit, label %"16.i", !dbg !1
-
-"16.i":                                           ; preds = %VEC_edge_base_index.exit.i
-  br i1 undef, label %"4.i6.i", label %"3.i5.i", !dbg !1
-
-"3.i5.i":                                         ; preds = %"16.i"
-  br i1 undef, label %VEC_edge_base_index.exit7.i, label %"4.i6.i", !dbg !1
-
-"4.i6.i":                                         ; preds = %"3.i5.i", %"16.i"
-  unreachable, !dbg !1
-
-VEC_edge_base_index.exit7.i:                      ; preds = %"3.i5.i"
-  br i1 undef, label %may_unswitch_on.exit, label %"21.i", !dbg !1
-
-"21.i":                                           ; preds = %VEC_edge_base_index.exit7.i
-  br i1 undef, label %may_unswitch_on.exit, label %"23.i", !dbg !1
-
-"23.i":                                           ; preds = %"21.i"
-  br i1 undef, label %may_unswitch_on.exit, label %"26.i", !dbg !1
-
-"26.i":                                           ; preds = %"34.i", %"23.i"
-  %1 = icmp eq i32 undef, 9, !dbg !1
-  br i1 %1, label %"34.i", label %"28.i", !dbg !1
-
-"28.i":                                           ; preds = %"26.i"
-  unreachable
-
-"34.i":                                           ; preds = %"26.i"
-  br i1 undef, label %"26.i", label %"36.i", !dbg !1
-
-"36.i":                                           ; preds = %"34.i"
-  br i1 undef, label %"37.i", label %"38.i", !dbg !1
-
-"37.i":                                           ; preds = %"36.i"
-  br label %"38.i", !dbg !1
-
-"38.i":                                           ; preds = %"37.i", %"36.i"
-  br i1 undef, label %"39.i", label %"45.i", !dbg !1
-
-"39.i":                                           ; preds = %"38.i"
-  br i1 undef, label %"41.i", label %may_unswitch_on.exit, !dbg !1
-
-"41.i":                                           ; preds = %"39.i"
-  br i1 undef, label %may_unswitch_on.exit, label %"42.i", !dbg !1
-
-"42.i":                                           ; preds = %"41.i"
-  br i1 undef, label %may_unswitch_on.exit, label %"44.i", !dbg !1
-
-"44.i":                                           ; preds = %"42.i"
-  %2 = load %a** undef, align 8, !dbg !1
-  %3 = bitcast %a* %2 to %a*, !dbg !1
-  call void @llvm.dbg.value(metadata !{%a* %3}, i64 0, metadata !6), !dbg !12
-  br label %may_unswitch_on.exit, !dbg !1
-
-"45.i":                                           ; preds = %"38.i"
-  unreachable
-
-may_unswitch_on.exit:                             ; preds = %"44.i", %"42.i", %"41.i", %"39.i", %"23.i", %"21.i", %VEC_edge_base_index.exit7.i, %VEC_edge_base_index.exit.i, %"10.i", %"6.i", %"21"
-  %4 = phi %a* [ %3, %"44.i" ], [ null, %"6.i" ], [ null, %"10.i" ], [ null, %VEC_edge_base_index.exit7.i ], [ null, %VEC_edge_base_index.exit.i ], [ null, %"21.i" ], [ null, %"23.i" ], [ null, %"39.i" ], [ null, %"42.i" ], [ null, %"41.i" ], [ null, %"21" ]
-  br label %return
-
-"25":                                             ; preds = %"14"
-  unreachable
-
-"return":
-  %result = phi %a* [ null, %entry ], [ %4, %may_unswitch_on.exit ]
-  ret %a* %result, !dbg !1
-}
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind uwtable }
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"Apple clang version", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null,  null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, %a* ()* @test, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554434, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{i32 786468, null, metadata !0, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786688, metadata !11, metadata !"a", metadata !2, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !20, metadata !1, i32 2, i32 25, i32 0} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 2, i32 13, metadata !1, null}
-!18 = metadata !{metadata !1}
-!19 = metadata !{metadata !6, metadata !7, metadata !10}
-!20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
-!21 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index b746dec..3b4a868 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll
@@ -5,6 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-unknown-linux-gnu"
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!5}
 
 !0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !2 = metadata !{i32 0}
@@ -15,3 +16,4 @@ target triple = "x86_64-unknown-linux-gnu"
 ;                        Dir  Mod Time   File Len   File Name
 ;                        ---- ---------- ---------- ---------------------------
 ; CHECK: file_names[  1]    0 0x00000000 0x00000000 empty.c
+!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/dyn_alloca_aligned.ll b/test/CodeGen/X86/dyn_alloca_aligned.ll
new file mode 100644
index 0000000..993f4d2
--- /dev/null
+++ b/test/CodeGen/X86/dyn_alloca_aligned.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+define i32 @A(i32 %Size) {
+; CHECK:  subq    %rcx, %rax
+; CHECK:  andq    $-128, %rax
+; CHECK:  movq    %rax, %rsp
+  %A = alloca i8, i32 %Size, align 128
+  %A_addr = ptrtoint i8* %A to i32
+  ret i32 %A_addr
+}
diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll
index 7fcef03..cd2dc1d 100644
--- a/test/CodeGen/X86/fast-isel-mem.ll
+++ b/test/CodeGen/X86/fast-isel-mem.ll
@@ -40,7 +40,7 @@ entry:
 ; CHECK:	movl	L_LotsStuff$non_lazy_ptr, %ecx
 
 ; ATOM: _t:
-; ATOM:         movl    L_LotsStuff$non_lazy_ptr, %ecx
-; ATOM:         movl    $0, %eax
+; ATOM:         movl    L_LotsStuff$non_lazy_ptr, %e{{..}}
+; ATOM:         movl    $0, %e{{..}}
 
 }
diff --git a/test/CodeGen/X86/fastcc.ll b/test/CodeGen/X86/fastcc.ll
index 705ab7b..a362f8d 100644
--- a/test/CodeGen/X86/fastcc.ll
+++ b/test/CodeGen/X86/fastcc.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 -post-RA-scheduler=false | FileCheck %s
-; CHECK: movsd %xmm0, 8(%esp)
-; CHECK: xorl %ecx, %ecx
+; CHECK: movsd %xmm{{[0-9]}}, 8(%esp)
+; CHECK: xorl %eax, %eax
 
 @d = external global double		; <double*> [#uses=1]
 @c = external global double		; <double*> [#uses=1]
diff --git a/test/CodeGen/X86/fastisel-gep-promote-before-add.ll b/test/CodeGen/X86/fastisel-gep-promote-before-add.ll
new file mode 100644
index 0000000..f87a34c
--- /dev/null
+++ b/test/CodeGen/X86/fastisel-gep-promote-before-add.ll
@@ -0,0 +1,37 @@
+; fastisel should not fold add with non-pointer bitwidth
+; sext(a) + sext(b) != sext(a + b)
+; RUN: llc -mtriple=x86_64-apple-darwin %s -O0 -o - | FileCheck %s
+
+define zeroext i8 @gep_promotion(i8* %ptr) nounwind uwtable ssp {
+entry:
+  %ptr.addr = alloca i8*, align 8
+  %add = add i8 64, 64 ; 0x40 + 0x40
+  %0 = load i8** %ptr.addr, align 8
+
+  ; CHECK-LABEL: _gep_promotion:
+  ; CHECK: movzbl ({{.*}})
+  %arrayidx = getelementptr inbounds i8* %0, i8 %add
+
+  %1 = load i8* %arrayidx, align 1
+  ret i8 %1
+}
+
+define zeroext i8 @gep_promotion_nonconst(i8 %i, i8* %ptr) nounwind uwtable ssp {
+entry:
+  %i.addr = alloca i8, align 4
+  %ptr.addr = alloca i8*, align 8
+  store i8 %i, i8* %i.addr, align 4
+  store i8* %ptr, i8** %ptr.addr, align 8
+  %0 = load i8* %i.addr, align 4
+  ; CHECK-LABEL: _gep_promotion_nonconst:
+  ; CHECK: movzbl ({{.*}})
+  %xor = xor i8 %0, -128   ; %0   ^ 0x80
+  %add = add i8 %xor, -127 ; %xor + 0x81
+  %1 = load i8** %ptr.addr, align 8
+
+  %arrayidx = getelementptr inbounds i8* %1, i8 %add
+
+  %2 = load i8* %arrayidx, align 1
+  ret i8 %2
+}
+
diff --git a/test/CodeGen/X86/floor-soft-float.ll b/test/CodeGen/X86/floor-soft-float.ll
index 8e7ee09..5644509 100644
--- a/test/CodeGen/X86/floor-soft-float.ll
+++ b/test/CodeGen/X86/floor-soft-float.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx -soft-float=0 | FileCheck %s --check-prefix=CHECK-HARD-FLOAT
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx -soft-float=1 | FileCheck %s --check-prefix=CHECK-SOFT-FLOAT
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx -soft-float=0 | FileCheck %s --check-prefix=CHECK-HARD-FLOAT
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx -soft-float=1 | FileCheck %s --check-prefix=CHECK-SOFT-FLOAT
 
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll
index 47100be..e85d8f7 100644
--- a/test/CodeGen/X86/fold-load-vec.ll
+++ b/test/CodeGen/X86/fold-load-vec.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
 
 ; rdar://12721174
 ; We should not fold movss into pshufd since pshufd expects m128 while movss
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index 495acd9..dde0a2d 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -38,10 +38,10 @@ L:
 
   store i16 %A, i16* %Q
   ret i32 %D
-  
+
 ; CHECK-LABEL: test2:
 ; CHECK: 	movl	4(%esp), %eax
-; CHECK-NEXT:	movzwl	(%eax), %ecx
+; CHECK-NEXT:	movzwl	(%eax), %e{{..}}
 
 }
 
@@ -49,10 +49,10 @@ L:
 ; xor in exit block will be CSE'ed and load will be folded to xor in entry.
 define i1 @test3(i32* %P, i32* %Q) nounwind {
 ; CHECK-LABEL: test3:
-; CHECK: movl 8(%esp), %eax
-; CHECK: xorl (%eax),
+; CHECK: movl 8(%esp), %e
+; CHECK: movl 4(%esp), %e
+; CHECK: xorl (%e
 ; CHECK: j
-; CHECK-NOT: xor
 entry:
   %0 = load i32* %P, align 4
   %1 = load i32* %Q, align 4
diff --git a/test/CodeGen/X86/fold-pcmpeqd-2.ll b/test/CodeGen/X86/fold-pcmpeqd-2.ll
index 0a3afb7..60a6844 100644
--- a/test/CodeGen/X86/fold-pcmpeqd-2.ll
+++ b/test/CodeGen/X86/fold-pcmpeqd-2.ll
@@ -54,22 +54,27 @@ forbody:		; preds = %forcond
 	%mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer		; <<4 x float>> [#uses=2]
 	%mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer		; <<4 x float>> [#uses=1]
 	%cmpunord.i11 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i8 3) nounwind		; <<4 x float>> [#uses=1]
+	%tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
+	%bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32>		; <<4 x i32>> [#uses=1]
+	%andps.i5 = and <4 x i32> %bitcast.i3, zeroinitializer		; <<4 x i32>> [#uses=1]
+
+	call void null(<4 x float> %mul313, <4 x float> %cmpunord.i11, <4 x float> %tmp83, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
+
+	%tmp84 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul313, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
+
 	%bitcast6.i13 = bitcast <4 x float> %cmpunord.i11 to <4 x i32>		; <<4 x i32>> [#uses=2]
 	%andps.i14 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %bitcast6.i13		; <<4 x i32>> [#uses=1]
 	%not.i16 = xor <4 x i32> %bitcast6.i13, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
 	%andnps.i17 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %not.i16		; <<4 x i32>> [#uses=1]
 	%orps.i18 = or <4 x i32> %andnps.i17, %andps.i14		; <<4 x i32>> [#uses=1]
 	%bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
-	%bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32>		; <<4 x i32>> [#uses=1]
-	%andps.i5 = and <4 x i32> %bitcast.i3, zeroinitializer		; <<4 x i32>> [#uses=1]
+
 	%bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32>		; <<4 x i32>> [#uses=1]
 	%not.i7 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 >		; <<4 x i32>> [#uses=1]
 	%andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7		; <<4 x i32>> [#uses=1]
-	call void null(<4 x float> %mul313, <4 x float> %cmpunord.i11, <4 x float> %tmp83, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
 	%orps.i9 = or <4 x i32> %andnps.i8, %andps.i5		; <<4 x i32>> [#uses=1]
 	%bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float>		; <<4 x float>> [#uses=1]
-	%tmp84 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul313, <4 x float> zeroinitializer) nounwind		; <<4 x float>> [#uses=1]
+
 	%bitcast6.i = bitcast <4 x float> zeroinitializer to <4 x i32>		; <<4 x i32>> [#uses=2]
 	%andps.i = and <4 x i32> zeroinitializer, %bitcast6.i		; <<4 x i32>> [#uses=1]
 	%bitcast11.i = bitcast <4 x float> %tmp84 to <4 x i32>		; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/fp-elim.ll b/test/CodeGen/X86/fp-elim.ll
index 583388c..2c50bd1 100644
--- a/test/CodeGen/X86/fp-elim.ll
+++ b/test/CodeGen/X86/fp-elim.ll
@@ -4,7 +4,7 @@
 ; Implement -momit-leaf-frame-pointer
 ; rdar://7886181
 
-define i32 @t1() "no-frame-pointer-elim-non-leaf"="false" nounwind readnone {
+define i32 @t1() nounwind readnone {
 entry:
 ; FP-ELIM-LABEL:  t1:
 ; FP-ELIM-NEXT:     movl
@@ -17,7 +17,7 @@ entry:
   ret i32 10
 }
 
-define void @t2() "no-frame-pointer-elim-non-leaf"="false" nounwind {
+define void @t2() nounwind {
 entry:
 ; FP-ELIM-LABEL:  t2:
 ; FP-ELIM-NOT:      pushl %ebp
@@ -31,7 +31,7 @@ entry:
   ret void
 }
 
-define i32 @t3() "no-frame-pointer-elim-non-leaf"="true" nounwind readnone {
+define i32 @t3() "no-frame-pointer-elim-non-leaf" nounwind readnone {
 entry:
 ; FP-ELIM-LABEL:  t3:
 ; FP-ELIM-NEXT:     movl
@@ -44,7 +44,7 @@ entry:
   ret i32 10
 }
 
-define void @t4() "no-frame-pointer-elim-non-leaf"="true" nounwind {
+define void @t4() "no-frame-pointer-elim-non-leaf" nounwind {
 entry:
 ; FP-ELIM-LABEL:  t4:
 ; FP-ELIM-NEXT:     pushl %ebp
diff --git a/test/CodeGen/X86/fp-une-cmp.ll b/test/CodeGen/X86/fp-une-cmp.ll
new file mode 100644
index 0000000..7f772d1
--- /dev/null
+++ b/test/CodeGen/X86/fp-une-cmp.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=x86 -mattr=sse4.1 | FileCheck %s
+; <rdar://problem/7859988>
+
+; Make sure we don't generate more jumps than we need to. We used to generate
+; something like this:
+;
+;       jne  LBB0_1
+;       jnp  LBB0_2
+;   LBB0_1:
+;       jmp  LBB0_3
+;   LBB0_2:
+;       addsd ...
+;   LBB0_3:
+;
+; Now we generate this:
+;
+;       jne  LBB0_2
+;       jp   LBB0_2
+;       addsd ...
+;   LBB0_2:
+
+; CHECK:       func
+; CHECK:       jne [[LABEL:.*]]
+; CHECK-NEXT:  jp  [[LABEL]]
+; CHECK-NOT:   jmp
+
+define float @func(float %x, float %y) nounwind readnone optsize ssp {
+entry:
+  %0 = fpext float %x to double
+  %1 = fpext float %y to double
+  %2 = fmul double %0, %1
+  %3 = fcmp une double %2, 0.000000e+00
+  br i1 %3, label %bb2, label %bb1
+
+bb1:
+  %4 = fadd double %2, -1.000000e+00
+  br label %bb2
+
+bb2:
+  %.0.in = phi double [ %4, %bb1 ], [ %2, %entry ]
+  %.0 = fptrunc double %.0.in to float
+  ret float %.0
+}
diff --git a/test/CodeGen/X86/frame-base.ll b/test/CodeGen/X86/frame-base.ll
new file mode 100644
index 0000000..a6bd2a5
--- /dev/null
+++ b/test/CodeGen/X86/frame-base.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=x86_64-apple-macosx -o - %s | FileCheck %s
+
+; The issue here was a conflict between forming a %rip-relative lea and a
+; FrameIndex lea. The %rip sanity-checks didn't consider that a base register
+; had been set if we'd already matched a FrameIndex, when it has in reality.
+
+@var = global i32 0
+
+define void @test_frame_rip_conflict() {
+; CHECK-LABEL: test_frame_rip_conflict:
+; CHECK: leaq _var(%rip), [[TMPADDR:%r.*]]
+; CHECK: leaq {{-?[0-9]+}}(%rsp,[[TMPADDR]]),
+  %stackvar = alloca i32
+
+  %stackint = ptrtoint i32* %stackvar to i64
+  %addr = add i64 ptrtoint(i32* @var to i64), %stackint
+
+  call void @eat_i64(i64 %addr)
+  ret void
+}
+
+declare void @eat_i64(i64)
diff --git a/test/CodeGen/X86/full-lsr.ll b/test/CodeGen/X86/full-lsr.ll
index 0729dda..cbcc62a 100644
--- a/test/CodeGen/X86/full-lsr.ll
+++ b/test/CodeGen/X86/full-lsr.ll
@@ -4,7 +4,7 @@
 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
 ; ATOM: foo
 ; ATOM: addl
-; ATOM: leal
+; ATOM: addl
 ; ATOM: leal
 
 ; CHECK: foo
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index 72a5096..5f48b1e 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -1,21 +1,35 @@
-; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN
+; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN
 ; rdar://7398554
 
 ; When doing vector gather-scatter index calculation with 32-bit indices,
 ; bounce the vector off of cache rather than shuffling each individual
 ; element out of the index vector.
 
-; CHECK: andps    ([[H:%rdx|%r8]]), %xmm0
-; CHECK: movaps   %xmm0, {{(-24)?}}(%rsp)
-; CHECK: movslq   {{(-24)?}}(%rsp), %rax
-; CHECK: movsd    ([[P:%rdi|%rcx]],%rax,8), %xmm0
-; CHECK: movslq   {{-20|4}}(%rsp), %rax
-; CHECK: movhpd   ([[P]],%rax,8), %xmm0
-; CHECK: movslq   {{-16|8}}(%rsp), %rax
-; CHECK: movsd    ([[P]],%rax,8), %xmm1
-; CHECK: movslq   {{-12|12}}(%rsp), %rax
-; CHECK: movhpd   ([[P]],%rax,8), %xmm1
+; CHECK: foo:
+; LIN: movaps	(%rsi), %xmm0
+; LIN: andps	(%rdx), %xmm0
+; LIN: movaps	%xmm0, -24(%rsp)
+; LIN: movslq	-24(%rsp), %[[REG1:r.+]]
+; LIN: movslq	-20(%rsp), %[[REG2:r.+]]
+; LIN: movslq	-16(%rsp), %[[REG3:r.+]]
+; LIN: movslq	-12(%rsp), %[[REG4:r.+]]
+; LIN: movsd	(%rdi,%[[REG1]],8), %xmm0
+; LIN: movhpd	(%rdi,%[[REG2]],8), %xmm0
+; LIN: movsd	(%rdi,%[[REG3]],8), %xmm1
+; LIN: movhpd	(%rdi,%[[REG4]],8), %xmm1
+
+; WIN: movaps	(%rdx), %xmm0
+; WIN: andps	(%r8), %xmm0
+; WIN: movaps	%xmm0, (%rsp)
+; WIN: movslq	(%rsp), %[[REG1:r.+]]
+; WIN: movslq	4(%rsp), %[[REG2:r.+]]
+; WIN: movslq	8(%rsp), %[[REG3:r.+]]
+; WIN: movslq	12(%rsp), %[[REG4:r.+]]
+; WIN: movsd	(%rcx,%[[REG1]],8), %xmm0
+; WIN: movhpd	(%rcx,%[[REG2]],8), %xmm0
+; WIN: movsd	(%rcx,%[[REG3]],8), %xmm1
+; WIN: movhpd	(%rcx,%[[REG4]],8), %xmm1
 
 define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
   %a = load <4 x i32>* %i
diff --git a/test/CodeGen/X86/ghc-cc.ll b/test/CodeGen/X86/ghc-cc.ll
index 0e65cfd..4dba2c0 100644
--- a/test/CodeGen/X86/ghc-cc.ll
+++ b/test/CodeGen/X86/ghc-cc.ll
@@ -28,10 +28,10 @@ entry:
 
 define cc 10 void @foo() nounwind {
 entry:
-  ; CHECK: movl base, %ebx
-  ; CHECK-NEXT: movl sp, %ebp
+  ; CHECK:      movl r1, %esi
   ; CHECK-NEXT: movl hp, %edi
-  ; CHECK-NEXT: movl r1, %esi
+  ; CHECK-NEXT: movl sp, %ebp
+  ; CHECK-NEXT: movl base, %ebx
   %0 = load i32* @r1
   %1 = load i32* @hp
   %2 = load i32* @sp
@@ -42,4 +42,3 @@ entry:
 }
 
 declare cc 10 void @bar(i32, i32, i32, i32)
-
diff --git a/test/CodeGen/X86/ghc-cc64.ll b/test/CodeGen/X86/ghc-cc64.ll
index fcf7e17..403391e 100644
--- a/test/CodeGen/X86/ghc-cc64.ll
+++ b/test/CodeGen/X86/ghc-cc64.ll
@@ -41,22 +41,22 @@ entry:
 
 define cc 10 void @foo() nounwind {
 entry:
-  ; CHECK: movq base(%rip), %r13
-  ; CHECK-NEXT: movq sp(%rip), %rbp
-  ; CHECK-NEXT: movq hp(%rip), %r12
-  ; CHECK-NEXT: movq r1(%rip), %rbx
-  ; CHECK-NEXT: movq r2(%rip), %r14
-  ; CHECK-NEXT: movq r3(%rip), %rsi
-  ; CHECK-NEXT: movq r4(%rip), %rdi
-  ; CHECK-NEXT: movq r5(%rip), %r8
-  ; CHECK-NEXT: movq r6(%rip), %r9
-  ; CHECK-NEXT: movq splim(%rip), %r15
-  ; CHECK-NEXT: movss f1(%rip), %xmm1
-  ; CHECK-NEXT: movss f2(%rip), %xmm2
-  ; CHECK-NEXT: movss f3(%rip), %xmm3
-  ; CHECK-NEXT: movss f4(%rip), %xmm4
+  ; CHECK:      movsd d2(%rip), %xmm6
   ; CHECK-NEXT: movsd d1(%rip), %xmm5
-  ; CHECK-NEXT: movsd d2(%rip), %xmm6
+  ; CHECK-NEXT: movss f4(%rip), %xmm4
+  ; CHECK-NEXT: movss f3(%rip), %xmm3
+  ; CHECK-NEXT: movss f2(%rip), %xmm2
+  ; CHECK-NEXT: movss f1(%rip), %xmm1
+  ; CHECK-NEXT: movq splim(%rip), %r15
+  ; CHECK-NEXT: movq r6(%rip), %r9
+  ; CHECK-NEXT: movq r5(%rip), %r8
+  ; CHECK-NEXT: movq r4(%rip), %rdi
+  ; CHECK-NEXT: movq r3(%rip), %rsi
+  ; CHECK-NEXT: movq r2(%rip), %r14
+  ; CHECK-NEXT: movq r1(%rip), %rbx
+  ; CHECK-NEXT: movq hp(%rip), %r12
+  ; CHECK-NEXT: movq sp(%rip), %rbp
+  ; CHECK-NEXT: movq base(%rip), %r13
   %0 = load double* @d2
   %1 = load double* @d1
   %2 = load float* @f4
@@ -83,4 +83,3 @@ entry:
 
 declare cc 10 void @bar(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64,
                         float, float, float, float, double, double)
-
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index 194f597..d8743ac 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -65,10 +65,10 @@
 ; PR4584
 @"foo bar" = linkonce global i32 42
 
-; LINUX: .type	foo_20_bar,@object
-; LINUX: .section .data.foo_20_bar,"aGw",@progbits,foo_20_bar,comdat
-; LINUX: .weak	foo_20_bar
-; LINUX: foo_20_bar:
+; LINUX: .type	"foo bar",@object
+; LINUX: .section ".data.foo bar","aGw",@progbits,"foo bar",comdat
+; LINUX: .weak	"foo bar"
+; LINUX: "foo bar":
 
 ; DARWIN: .section		__DATA,__datacoal_nt,coalesced
 ; DARWIN: .globl	"_foo bar"
diff --git a/test/CodeGen/X86/h-register-addressing-32.ll b/test/CodeGen/X86/h-register-addressing-32.ll
index 968a9e8..68e8c60 100644
--- a/test/CodeGen/X86/h-register-addressing-32.ll
+++ b/test/CodeGen/X86/h-register-addressing-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "movzbl	%[abcd]h," | count 7
+; RUN: llc < %s -march=x86 -mattr=-bmi | FileCheck %s
 
 ; Use h-register extract and zero-extend.
 
@@ -9,6 +9,9 @@ define double @foo8(double* nocapture inreg %p, i32 inreg %x) nounwind readonly
   %t3 = load double* %t2, align 8
   ret double %t3
 }
+; CHECK: foo8:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define float @foo4(float* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t0 = lshr i32 %x, 8
   %t1 = and i32 %t0, 255
@@ -16,6 +19,9 @@ define float @foo4(float* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t3 = load float* %t2, align 8
   ret float %t3
 }
+; CHECK: foo4:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define i16 @foo2(i16* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t0 = lshr i32 %x, 8
   %t1 = and i32 %t0, 255
@@ -23,6 +29,9 @@ define i16 @foo2(i16* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t3 = load i16* %t2, align 8
   ret i16 %t3
 }
+; CHECK: foo2:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define i8 @foo1(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t0 = lshr i32 %x, 8
   %t1 = and i32 %t0, 255
@@ -30,6 +39,9 @@ define i8 @foo1(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t3 = load i8* %t2, align 8
   ret i8 %t3
 }
+; CHECK: foo1:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define i8 @bar8(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t0 = lshr i32 %x, 5
   %t1 = and i32 %t0, 2040
@@ -37,6 +49,9 @@ define i8 @bar8(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t3 = load i8* %t2, align 8
   ret i8 %t3
 }
+; CHECK: bar8:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define i8 @bar4(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t0 = lshr i32 %x, 6
   %t1 = and i32 %t0, 1020
@@ -44,6 +59,9 @@ define i8 @bar4(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t3 = load i8* %t2, align 8
   ret i8 %t3
 }
+; CHECK: bar4:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define i8 @bar2(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t0 = lshr i32 %x, 7
   %t1 = and i32 %t0, 510
@@ -51,3 +69,6 @@ define i8 @bar2(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
   %t3 = load i8* %t2, align 8
   ret i8 %t3
 }
+; CHECK: bar2:
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: ret
diff --git a/test/CodeGen/X86/h-register-addressing-64.ll b/test/CodeGen/X86/h-register-addressing-64.ll
index a19fca5..3f549d2 100644
--- a/test/CodeGen/X86/h-register-addressing-64.ll
+++ b/test/CodeGen/X86/h-register-addressing-64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep "movzbl	%[abcd]h," | count 7
+; RUN: llc < %s -march=x86-64 -mattr=-bmi | FileCheck %s
 
 ; Use h-register extract and zero-extend.
 
@@ -9,6 +9,9 @@ define double @foo8(double* nocapture inreg %p, i64 inreg %x) nounwind readonly
   %t3 = load double* %t2, align 8
   ret double %t3
 }
+; CHECK: foo8:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define float @foo4(float* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t0 = lshr i64 %x, 8
   %t1 = and i64 %t0, 255
@@ -16,6 +19,9 @@ define float @foo4(float* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t3 = load float* %t2, align 8
   ret float %t3
 }
+; CHECK: foo4:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define i16 @foo2(i16* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t0 = lshr i64 %x, 8
   %t1 = and i64 %t0, 255
@@ -23,6 +29,9 @@ define i16 @foo2(i16* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t3 = load i16* %t2, align 8
   ret i16 %t3
 }
+; CHECK: foo2:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define i8 @foo1(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t0 = lshr i64 %x, 8
   %t1 = and i64 %t0, 255
@@ -30,6 +39,9 @@ define i8 @foo1(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t3 = load i8* %t2, align 8
   ret i8 %t3
 }
+; CHECK: foo1:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define i8 @bar8(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t0 = lshr i64 %x, 5
   %t1 = and i64 %t0, 2040
@@ -37,6 +49,9 @@ define i8 @bar8(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t3 = load i8* %t2, align 8
   ret i8 %t3
 }
+; CHECK: bar8:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define i8 @bar4(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t0 = lshr i64 %x, 6
   %t1 = and i64 %t0, 1020
@@ -44,6 +59,9 @@ define i8 @bar4(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t3 = load i8* %t2, align 8
   ret i8 %t3
 }
+; CHECK: bar4:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
 define i8 @bar2(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t0 = lshr i64 %x, 7
   %t1 = and i64 %t0, 510
@@ -51,3 +69,6 @@ define i8 @bar2(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
   %t3 = load i8* %t2, align 8
   ret i8 %t3
 }
+; CHECK: bar2:
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: ret
diff --git a/test/CodeGen/X86/h-registers-0.ll b/test/CodeGen/X86/h-registers-0.ll
index 71b3b43..6a5ccaa 100644
--- a/test/CodeGen/X86/h-registers-0.ll
+++ b/test/CodeGen/X86/h-registers-0.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86-64
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=WIN64
-; RUN: llc < %s -march=x86    | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -mattr=-bmi -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86-64
+; RUN: llc < %s -mattr=-bmi -mtriple=x86_64-win32 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mattr=-bmi -march=x86    | FileCheck %s -check-prefix=X86-32
 
 ; Use h registers. On x86-64, codegen doesn't support general allocation
 ; of h registers yet, due to x86 encoding complications.
diff --git a/test/CodeGen/X86/h-registers-1.ll b/test/CodeGen/X86/h-registers-1.ll
index 903c453..7254325 100644
--- a/test/CodeGen/X86/h-registers-1.ll
+++ b/test/CodeGen/X86/h-registers-1.ll
@@ -1,12 +1,21 @@
-; RUN: llc < %s -mtriple=x86_64-linux > %t
-; RUN: grep "movzbl	%[abcd]h," %t | count 8
-; RUN: grep "%[abcd]h" %t | not grep "%r[[:digit:]]*d"
+; RUN: llc -mattr=-bmi < %s -mtriple=x86_64-linux | FileCheck %s
 
 ; LLVM creates virtual registers for values live across blocks
 ; based on the type of the value. Make sure that the extracts
 ; here use the GR64_NOREX register class for their result,
 ; instead of plain GR64.
 
+; CHECK: foo:
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: ret
+
 define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d,
                 i64 %e, i64 %f, i64 %g, i64 %h) {
   %sa = lshr i64 %a, 8
diff --git a/test/CodeGen/X86/hipe-cc.ll b/test/CodeGen/X86/hipe-cc.ll
index 76d17a0..b34417e 100644
--- a/test/CodeGen/X86/hipe-cc.ll
+++ b/test/CodeGen/X86/hipe-cc.ll
@@ -49,10 +49,10 @@ entry:
   store i32 %arg1, i32* %arg1_var
   store i32 %arg2, i32* %arg2_var
 
-  ; CHECK:      movl   4(%esp), %edx
-  ; CHECK-NEXT: movl   8(%esp), %eax
+  ; CHECK:      movl  16(%esp), %esi
   ; CHECK-NEXT: movl  12(%esp), %ebp
-  ; CHECK-NEXT: movl  16(%esp), %esi
+  ; CHECK-NEXT: movl   8(%esp), %eax
+  ; CHECK-NEXT: movl   4(%esp), %edx
   %0 = load i32* %hp_var
   %1 = load i32* %p_var
   %2 = load i32* %arg0_var
diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll
index 5dbb5a2..27e1c72 100644
--- a/test/CodeGen/X86/hipe-cc64.ll
+++ b/test/CodeGen/X86/hipe-cc64.ll
@@ -5,10 +5,10 @@
 define void @zap(i64 %a, i64 %b) nounwind {
 entry:
   ; CHECK:      movq %rsi, %rax
-  ; CHECK-NEXT: movq %rdi, %rsi
-  ; CHECK-NEXT: movq %rax, %rdx
   ; CHECK-NEXT: movl $8, %ecx
   ; CHECK-NEXT: movl $9, %r8d
+  ; CHECK-NEXT: movq %rdi, %rsi
+  ; CHECK-NEXT: movq %rax, %rdx
   ; CHECK-NEXT: callq addfour
   %0 = call cc 11 {i64, i64, i64} @addfour(i64 undef, i64 undef, i64 %a, i64 %b, i64 8, i64 9)
   %res = extractvalue {i64, i64, i64} %0, 2
@@ -57,11 +57,11 @@ entry:
   store i64 %arg2, i64* %arg2_var
   store i64 %arg3, i64* %arg3_var
 
-  ; CHECK:      movq  8(%rsp), %rcx
-  ; CHECK-NEXT: movq  16(%rsp), %rdx
-  ; CHECK-NEXT: movq  24(%rsp), %rsi
+  ; CHECK:      movq  40(%rsp), %r15
   ; CHECK-NEXT: movq  32(%rsp), %rbp
-  ; CHECK-NEXT: movq  40(%rsp), %r15
+  ; CHECK-NEXT: movq  24(%rsp), %rsi
+  ; CHECK-NEXT: movq  16(%rsp), %rdx
+  ; CHECK-NEXT: movq  8(%rsp), %rcx
   %0 = load i64* %hp_var
   %1 = load i64* %p_var
   %2 = load i64* %arg0_var
diff --git a/test/CodeGen/X86/hoist-common.ll b/test/CodeGen/X86/hoist-common.ll
index 6b26876..01d1b8c 100644
--- a/test/CodeGen/X86/hoist-common.ll
+++ b/test/CodeGen/X86/hoist-common.ll
@@ -1,4 +1,14 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx  | FileCheck %s
+; This is supposed to be testing BranchFolding's common
+; code hoisting logic, but has been erroneously passing due
+; to there being a redundant xorl in the entry block
+; and no common code to hoist.
+; However, now that MachineSink sinks the redundant xor
+; hoist-common looks at it and rejects it for hoisting,
+; which causes this test to fail.
+; Since it seems this test is broken, marking XFAIL for now
+; until someone decides to remove it or fix what it tests.
+; XFAIL: *
 
 ; Common "xorb al, al" instruction in the two successor blocks should be
 ; moved to the entry block above the test + je.
diff --git a/test/CodeGen/X86/i128-mul.ll b/test/CodeGen/X86/i128-mul.ll
index c0b85df..8cfda85 100644
--- a/test/CodeGen/X86/i128-mul.ll
+++ b/test/CodeGen/X86/i128-mul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
 ; PR1198
 
 define i64 @foo(i64 %x, i64 %y) {
diff --git a/test/CodeGen/X86/i486-fence-loop.ll b/test/CodeGen/X86/i486-fence-loop.ll
new file mode 100644
index 0000000..d809619
--- /dev/null
+++ b/test/CodeGen/X86/i486-fence-loop.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=x86 -mcpu=i486 -o - %s | FileCheck %s
+
+; Main test here was that ISelDAG could cope with a MachineNode in the chain
+; from the first load to the "X86ISD::SUB". Previously it thought that meant no
+; cycle could be formed so it tried to use "sub (%eax), [[RHS]]".
+
+define void @gst_atomic_queue_push(i32* %addr) {
+; CHECK-LABEL: gst_atomic_queue_push:
+; CHECK: movl (%eax), [[LHS:%e[a-z]+]]
+; CHECK: lock
+; CHECK-NEXT: orl
+; CHECK: movl (%eax), [[RHS:%e[a-z]+]]
+; CHECK: cmpl [[LHS]], [[RHS]]
+
+entry:
+  br label %while.body
+
+while.body:
+  %0 = load volatile i32* %addr, align 4
+  fence seq_cst
+  %1 = load volatile i32* %addr, align 4
+  %cmp = icmp sgt i32 %1, %0
+  br i1 %cmp, label %while.body, label %if.then
+
+if.then:
+  ret void
+}
+\ No newline at end of file
diff --git a/test/CodeGen/X86/ident-metadata.ll b/test/CodeGen/X86/ident-metadata.ll
new file mode 100644
index 0000000..a568673
--- /dev/null
+++ b/test/CodeGen/X86/ident-metadata.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+; Verify that llvm.ident metadata is emitted as .ident
+; directives in assembly files, and in the .comment section in ELF object files.
+
+; CHECK: .ident  "clang version x.x"
+; CHECK-NEXT: .ident  "something else"
+!llvm.ident = !{!0, !1}
+!0 = metadata !{metadata !"clang version x.x"}
+!1 = metadata !{metadata !"something else"}
diff --git a/test/CodeGen/X86/inline-asm-error.ll b/test/CodeGen/X86/inline-asm-error.ll
index 747a589..31fb190 100644
--- a/test/CodeGen/X86/inline-asm-error.ll
+++ b/test/CodeGen/X86/inline-asm-error.ll
@@ -6,7 +6,7 @@
 ; RUN: FileCheck %s < %t3
 
 ; The register allocator must fail on this function.
-; CHECK: error: ran out of registers during register allocation
+; CHECK: error: inline assembly requires more registers than available
 
 define void @f(i32 %x0, i32 %x1, i32 %x2, i32 %x3, i32 %x4, i32 %x5, i32 %x6, i32 %x7, i32 %x8, i32 %x9) nounwind ssp {
 entry:
diff --git a/test/CodeGen/X86/inline-asm-flag-clobber.ll b/test/CodeGen/X86/inline-asm-flag-clobber.ll
index 51ea843..45f4d2f 100644
--- a/test/CodeGen/X86/inline-asm-flag-clobber.ll
+++ b/test/CodeGen/X86/inline-asm-flag-clobber.ll
@@ -2,18 +2,31 @@
 ; PR3701
 
 define i64 @t(i64* %arg) nounwind {
-	br i1 true, label %1, label %5
+        br i1 true, label %1, label %5
 
-; <label>:1		; preds = %0
-	%2 = icmp eq i64* null, %arg		; <i1> [#uses=1]
-	%3 = tail call i64* asm sideeffect "movl %fs:0,$0", "=r,~{dirflag},~{fpsr},~{flags}"() nounwind		; <%struct.thread*> [#uses=0]
+; <label>:1             ; preds = %0
+        %2 = icmp eq i64* null, %arg            ; <i1> [#uses=1]
+        %3 = tail call i64* asm sideeffect "movl %fs:0,$0", "=r,~{dirflag},~{fpsr},~{flags}"() nounwind         ; <%struct.thread*> [#uses=0]
 ; CHECK: test
 ; CHECK-NEXT: j
-	br i1 %2, label %4, label %5
+        br i1 %2, label %4, label %5
 
-; <label>:4		; preds = %1
-	ret i64 1
+; <label>:4             ; preds = %1
+        ret i64 1
 
-; <label>:5		; preds = %1
-	ret i64 0
+; <label>:5             ; preds = %1
+        ret i64 0
 }
+
+; Make sure that we translate this to the bswap intrinsic which lowers down without the
+; inline assembly.
+; CHECK-NOT: #APP
+define i32 @s(i32 %argc, i8** nocapture %argv) unnamed_addr nounwind {
+entry:
+  %0 = trunc i32 %argc to i16
+  %asmtmp = tail call i16 asm "rorw $$8, ${0:w}", "=r,0,~{fpsr},~{flags},~{cc}"(i16 %0) nounwind, !srcloc !0
+  %1 = zext i16 %asmtmp to i32
+  ret i32 %1
+}
+
+!0 = metadata !{i64 935930}
diff --git a/test/CodeGen/X86/ins_subreg_coalesce-1.ll b/test/CodeGen/X86/ins_subreg_coalesce-1.ll
index bec98a2..a74e3f2 100644
--- a/test/CodeGen/X86/ins_subreg_coalesce-1.ll
+++ b/test/CodeGen/X86/ins_subreg_coalesce-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=-bmi | FileCheck %s
 
 define fastcc i32 @t() nounwind  {
 entry:
diff --git a/test/CodeGen/X86/isel-optnone.ll b/test/CodeGen/X86/isel-optnone.ll
new file mode 100644
index 0000000..d2f0628
--- /dev/null
+++ b/test/CodeGen/X86/isel-optnone.ll
@@ -0,0 +1,42 @@
+; RUN: llc -O2 -march=x86 < %s | FileCheck %s
+
+define i32* @fooOptnone(i32* %p, i32* %q, i32** %z) #0 {
+entry:
+  %r = load i32* %p
+  %s = load i32* %q
+  %y = load i32** %z
+
+  %t0 = add i32 %r, %s
+  %t1 = add i32 %t0, 1
+  %t2 = getelementptr i32* %y, i32 1
+  %t3 = getelementptr i32* %t2, i32 %t1
+
+  ret i32* %t3
+
+; 'optnone' should use fast-isel which will not produce 'lea'.
+; CHECK-LABEL: fooOptnone:
+; CHECK-NOT:   lea
+; CHECK:       ret
+}
+
+define i32* @fooNormal(i32* %p, i32* %q, i32** %z) #1 {
+entry:
+  %r = load i32* %p
+  %s = load i32* %q
+  %y = load i32** %z
+
+  %t0 = add i32 %r, %s
+  %t1 = add i32 %t0, 1
+  %t2 = getelementptr i32* %y, i32 1
+  %t3 = getelementptr i32* %t2, i32 %t1
+
+  ret i32* %t3
+
+; Normal ISel will produce 'lea'.
+; CHECK-LABEL: fooNormal:
+; CHECK:       lea
+; CHECK:       ret
+}
+
+attributes #0 = { nounwind optnone noinline }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/X86/large-gep-chain.ll b/test/CodeGen/X86/large-gep-chain.ll
new file mode 100644
index 0000000..5cf4661
--- /dev/null
+++ b/test/CodeGen/X86/large-gep-chain.ll
@@ -0,0 +1,25607 @@
+; RUN: llc < %s -O0 -march x86 -o /dev/null
+; <rdar://problem/12445434>
+
+%0 = type { i32, float* }
+
+@0 = external unnamed_addr constant [27 x i8], align 1
+@1 = external unnamed_addr constant [26 x i8], align 1
+@2 = external unnamed_addr constant [18 x i8], align 1
+@3 = external unnamed_addr constant [15 x i8], align 1
+@4 = external unnamed_addr constant [20 x i8], align 1
+@5 = external unnamed_addr constant [21 x i8], align 1
+@6 = external unnamed_addr constant [12 x i8], align 1
+@7 = external unnamed_addr constant [27 x i8], align 1
+@8 = external unnamed_addr constant [63 x i8], align 1
+
+define void @main() uwtable ssp {
+bb:
+  br i1 undef, label %bb1, label %bb2
+
+bb1:                                              ; preds = %bb
+  br label %bb25362
+
+bb2:                                              ; preds = %bb
+  %tmp = getelementptr inbounds float* null, i64 1
+  %tmp3 = getelementptr inbounds float* %tmp, i64 1
+  %tmp4 = getelementptr inbounds float* %tmp3, i64 1
+  %tmp5 = getelementptr inbounds float* %tmp4, i64 1
+  %tmp6 = getelementptr inbounds float* %tmp5, i64 1
+  %tmp7 = getelementptr inbounds float* %tmp6, i64 1
+  %tmp8 = getelementptr inbounds float* %tmp7, i64 1
+  %tmp9 = getelementptr inbounds float* %tmp8, i64 1
+  %tmp10 = getelementptr inbounds float* %tmp9, i64 1
+  %tmp11 = getelementptr inbounds float* %tmp10, i64 1
+  %tmp12 = getelementptr inbounds float* %tmp11, i64 1
+  %tmp13 = getelementptr inbounds float* %tmp12, i64 1
+  %tmp14 = getelementptr inbounds float* %tmp13, i64 1
+  %tmp15 = getelementptr inbounds float* %tmp14, i64 1
+  %tmp16 = getelementptr inbounds float* %tmp15, i64 1
+  %tmp17 = getelementptr inbounds float* %tmp16, i64 1
+  %tmp18 = getelementptr inbounds float* %tmp17, i64 1
+  %tmp19 = getelementptr inbounds float* %tmp18, i64 1
+  %tmp20 = getelementptr inbounds float* %tmp19, i64 1
+  %tmp21 = getelementptr inbounds float* %tmp20, i64 1
+  %tmp22 = getelementptr inbounds float* %tmp21, i64 1
+  %tmp23 = getelementptr inbounds float* %tmp22, i64 1
+  %tmp24 = getelementptr inbounds float* %tmp23, i64 1
+  %tmp25 = getelementptr inbounds float* %tmp24, i64 1
+  %tmp26 = getelementptr inbounds float* %tmp25, i64 1
+  %tmp27 = getelementptr inbounds float* %tmp26, i64 1
+  %tmp28 = getelementptr inbounds float* %tmp27, i64 1
+  %tmp29 = getelementptr inbounds float* %tmp28, i64 1
+  %tmp30 = getelementptr inbounds float* %tmp29, i64 1
+  %tmp31 = getelementptr inbounds float* %tmp30, i64 1
+  %tmp32 = getelementptr inbounds float* %tmp31, i64 1
+  %tmp33 = getelementptr inbounds float* %tmp32, i64 1
+  %tmp34 = getelementptr inbounds float* %tmp33, i64 1
+  %tmp35 = getelementptr inbounds float* %tmp34, i64 1
+  %tmp36 = getelementptr inbounds float* %tmp35, i64 1
+  %tmp37 = getelementptr inbounds float* %tmp36, i64 1
+  %tmp38 = getelementptr inbounds float* %tmp37, i64 1
+  %tmp39 = getelementptr inbounds float* %tmp38, i64 1
+  %tmp40 = getelementptr inbounds float* %tmp39, i64 1
+  %tmp41 = getelementptr inbounds float* %tmp40, i64 1
+  %tmp42 = getelementptr inbounds float* %tmp41, i64 1
+  %tmp43 = getelementptr inbounds float* %tmp42, i64 1
+  %tmp44 = getelementptr inbounds float* %tmp43, i64 1
+  %tmp45 = getelementptr inbounds float* %tmp44, i64 1
+  %tmp46 = getelementptr inbounds float* %tmp45, i64 1
+  %tmp47 = getelementptr inbounds float* %tmp46, i64 1
+  %tmp48 = getelementptr inbounds float* %tmp47, i64 1
+  %tmp49 = getelementptr inbounds float* %tmp48, i64 1
+  %tmp50 = getelementptr inbounds float* %tmp49, i64 1
+  %tmp51 = getelementptr inbounds float* %tmp50, i64 1
+  %tmp52 = getelementptr inbounds float* %tmp51, i64 1
+  %tmp53 = getelementptr inbounds float* %tmp52, i64 1
+  %tmp54 = getelementptr inbounds float* %tmp53, i64 1
+  %tmp55 = getelementptr inbounds float* %tmp54, i64 1
+  %tmp56 = getelementptr inbounds float* %tmp55, i64 1
+  %tmp57 = getelementptr inbounds float* %tmp56, i64 1
+  %tmp58 = getelementptr inbounds float* %tmp57, i64 1
+  %tmp59 = getelementptr inbounds float* %tmp58, i64 1
+  %tmp60 = getelementptr inbounds float* %tmp59, i64 1
+  %tmp61 = getelementptr inbounds float* %tmp60, i64 1
+  %tmp62 = getelementptr inbounds float* %tmp61, i64 1
+  %tmp63 = getelementptr inbounds float* %tmp62, i64 1
+  %tmp64 = getelementptr inbounds float* %tmp63, i64 1
+  %tmp65 = getelementptr inbounds float* %tmp64, i64 1
+  %tmp66 = getelementptr inbounds float* %tmp65, i64 1
+  %tmp67 = getelementptr inbounds float* %tmp66, i64 1
+  %tmp68 = getelementptr inbounds float* %tmp67, i64 1
+  %tmp69 = getelementptr inbounds float* %tmp68, i64 1
+  %tmp70 = getelementptr inbounds float* %tmp69, i64 1
+  %tmp71 = getelementptr inbounds float* %tmp70, i64 1
+  %tmp72 = getelementptr inbounds float* %tmp71, i64 1
+  %tmp73 = getelementptr inbounds float* %tmp72, i64 1
+  %tmp74 = getelementptr inbounds float* %tmp73, i64 1
+  %tmp75 = getelementptr inbounds float* %tmp74, i64 1
+  %tmp76 = getelementptr inbounds float* %tmp75, i64 1
+  %tmp77 = getelementptr inbounds float* %tmp76, i64 1
+  %tmp78 = getelementptr inbounds float* %tmp77, i64 1
+  %tmp79 = getelementptr inbounds float* %tmp78, i64 1
+  %tmp80 = getelementptr inbounds float* %tmp79, i64 1
+  %tmp81 = getelementptr inbounds float* %tmp80, i64 1
+  %tmp82 = getelementptr inbounds float* %tmp81, i64 1
+  %tmp83 = getelementptr inbounds float* %tmp82, i64 1
+  %tmp84 = getelementptr inbounds float* %tmp83, i64 1
+  %tmp85 = getelementptr inbounds float* %tmp84, i64 1
+  %tmp86 = getelementptr inbounds float* %tmp85, i64 1
+  %tmp87 = getelementptr inbounds float* %tmp86, i64 1
+  %tmp88 = getelementptr inbounds float* %tmp87, i64 1
+  %tmp89 = getelementptr inbounds float* %tmp88, i64 1
+  %tmp90 = getelementptr inbounds float* %tmp89, i64 1
+  %tmp91 = getelementptr inbounds float* %tmp90, i64 1
+  %tmp92 = getelementptr inbounds float* %tmp91, i64 1
+  %tmp93 = getelementptr inbounds float* %tmp92, i64 1
+  %tmp94 = getelementptr inbounds float* %tmp93, i64 1
+  %tmp95 = getelementptr inbounds float* %tmp94, i64 1
+  %tmp96 = getelementptr inbounds float* %tmp95, i64 1
+  %tmp97 = getelementptr inbounds float* %tmp96, i64 1
+  %tmp98 = getelementptr inbounds float* %tmp97, i64 1
+  %tmp99 = getelementptr inbounds float* %tmp98, i64 1
+  %tmp100 = getelementptr inbounds float* %tmp99, i64 1
+  %tmp101 = getelementptr inbounds float* %tmp100, i64 1
+  %tmp102 = getelementptr inbounds float* %tmp101, i64 1
+  %tmp103 = getelementptr inbounds float* %tmp102, i64 1
+  %tmp104 = getelementptr inbounds float* %tmp103, i64 1
+  %tmp105 = getelementptr inbounds float* %tmp104, i64 1
+  %tmp106 = getelementptr inbounds float* %tmp105, i64 1
+  %tmp107 = getelementptr inbounds float* %tmp106, i64 1
+  %tmp108 = getelementptr inbounds float* %tmp107, i64 1
+  %tmp109 = getelementptr inbounds float* %tmp108, i64 1
+  %tmp110 = getelementptr inbounds float* %tmp109, i64 1
+  %tmp111 = getelementptr inbounds float* %tmp110, i64 1
+  %tmp112 = getelementptr inbounds float* %tmp111, i64 1
+  %tmp113 = getelementptr inbounds float* %tmp112, i64 1
+  %tmp114 = getelementptr inbounds float* %tmp113, i64 1
+  %tmp115 = getelementptr inbounds float* %tmp114, i64 1
+  %tmp116 = getelementptr inbounds float* %tmp115, i64 1
+  %tmp117 = getelementptr inbounds float* %tmp116, i64 1
+  %tmp118 = getelementptr inbounds float* %tmp117, i64 1
+  %tmp119 = getelementptr inbounds float* %tmp118, i64 1
+  %tmp120 = getelementptr inbounds float* %tmp119, i64 1
+  %tmp121 = getelementptr inbounds float* %tmp120, i64 1
+  %tmp122 = getelementptr inbounds float* %tmp121, i64 1
+  %tmp123 = getelementptr inbounds float* %tmp122, i64 1
+  %tmp124 = getelementptr inbounds float* %tmp123, i64 1
+  %tmp125 = getelementptr inbounds float* %tmp124, i64 1
+  %tmp126 = getelementptr inbounds float* %tmp125, i64 1
+  %tmp127 = getelementptr inbounds float* %tmp126, i64 1
+  %tmp128 = getelementptr inbounds float* %tmp127, i64 1
+  %tmp129 = getelementptr inbounds float* %tmp128, i64 1
+  %tmp130 = getelementptr inbounds float* %tmp129, i64 1
+  %tmp131 = getelementptr inbounds float* %tmp130, i64 1
+  %tmp132 = getelementptr inbounds float* %tmp131, i64 1
+  %tmp133 = getelementptr inbounds float* %tmp132, i64 1
+  %tmp134 = getelementptr inbounds float* %tmp133, i64 1
+  %tmp135 = getelementptr inbounds float* %tmp134, i64 1
+  %tmp136 = getelementptr inbounds float* %tmp135, i64 1
+  %tmp137 = getelementptr inbounds float* %tmp136, i64 1
+  %tmp138 = getelementptr inbounds float* %tmp137, i64 1
+  %tmp139 = getelementptr inbounds float* %tmp138, i64 1
+  %tmp140 = getelementptr inbounds float* %tmp139, i64 1
+  %tmp141 = getelementptr inbounds float* %tmp140, i64 1
+  %tmp142 = getelementptr inbounds float* %tmp141, i64 1
+  %tmp143 = getelementptr inbounds float* %tmp142, i64 1
+  %tmp144 = getelementptr inbounds float* %tmp143, i64 1
+  %tmp145 = getelementptr inbounds float* %tmp144, i64 1
+  %tmp146 = getelementptr inbounds float* %tmp145, i64 1
+  %tmp147 = getelementptr inbounds float* %tmp146, i64 1
+  %tmp148 = getelementptr inbounds float* %tmp147, i64 1
+  %tmp149 = getelementptr inbounds float* %tmp148, i64 1
+  %tmp150 = getelementptr inbounds float* %tmp149, i64 1
+  %tmp151 = getelementptr inbounds float* %tmp150, i64 1
+  %tmp152 = getelementptr inbounds float* %tmp151, i64 1
+  %tmp153 = getelementptr inbounds float* %tmp152, i64 1
+  %tmp154 = getelementptr inbounds float* %tmp153, i64 1
+  %tmp155 = getelementptr inbounds float* %tmp154, i64 1
+  %tmp156 = getelementptr inbounds float* %tmp155, i64 1
+  %tmp157 = getelementptr inbounds float* %tmp156, i64 1
+  %tmp158 = getelementptr inbounds float* %tmp157, i64 1
+  %tmp159 = getelementptr inbounds float* %tmp158, i64 1
+  %tmp160 = getelementptr inbounds float* %tmp159, i64 1
+  %tmp161 = getelementptr inbounds float* %tmp160, i64 1
+  %tmp162 = getelementptr inbounds float* %tmp161, i64 1
+  %tmp163 = getelementptr inbounds float* %tmp162, i64 1
+  %tmp164 = getelementptr inbounds float* %tmp163, i64 1
+  %tmp165 = getelementptr inbounds float* %tmp164, i64 1
+  %tmp166 = getelementptr inbounds float* %tmp165, i64 1
+  %tmp167 = getelementptr inbounds float* %tmp166, i64 1
+  %tmp168 = getelementptr inbounds float* %tmp167, i64 1
+  %tmp169 = getelementptr inbounds float* %tmp168, i64 1
+  %tmp170 = getelementptr inbounds float* %tmp169, i64 1
+  %tmp171 = getelementptr inbounds float* %tmp170, i64 1
+  %tmp172 = getelementptr inbounds float* %tmp171, i64 1
+  %tmp173 = getelementptr inbounds float* %tmp172, i64 1
+  %tmp174 = getelementptr inbounds float* %tmp173, i64 1
+  %tmp175 = getelementptr inbounds float* %tmp174, i64 1
+  %tmp176 = getelementptr inbounds float* %tmp175, i64 1
+  %tmp177 = getelementptr inbounds float* %tmp176, i64 1
+  %tmp178 = getelementptr inbounds float* %tmp177, i64 1
+  %tmp179 = getelementptr inbounds float* %tmp178, i64 1
+  %tmp180 = getelementptr inbounds float* %tmp179, i64 1
+  %tmp181 = getelementptr inbounds float* %tmp180, i64 1
+  %tmp182 = getelementptr inbounds float* %tmp181, i64 1
+  %tmp183 = getelementptr inbounds float* %tmp182, i64 1
+  %tmp184 = getelementptr inbounds float* %tmp183, i64 1
+  %tmp185 = getelementptr inbounds float* %tmp184, i64 1
+  %tmp186 = getelementptr inbounds float* %tmp185, i64 1
+  %tmp187 = getelementptr inbounds float* %tmp186, i64 1
+  %tmp188 = getelementptr inbounds float* %tmp187, i64 1
+  %tmp189 = getelementptr inbounds float* %tmp188, i64 1
+  %tmp190 = getelementptr inbounds float* %tmp189, i64 1
+  %tmp191 = getelementptr inbounds float* %tmp190, i64 1
+  %tmp192 = getelementptr inbounds float* %tmp191, i64 1
+  %tmp193 = getelementptr inbounds float* %tmp192, i64 1
+  %tmp194 = getelementptr inbounds float* %tmp193, i64 1
+  %tmp195 = getelementptr inbounds float* %tmp194, i64 1
+  %tmp196 = getelementptr inbounds float* %tmp195, i64 1
+  %tmp197 = getelementptr inbounds float* %tmp196, i64 1
+  %tmp198 = getelementptr inbounds float* %tmp197, i64 1
+  %tmp199 = getelementptr inbounds float* %tmp198, i64 1
+  %tmp200 = getelementptr inbounds float* %tmp199, i64 1
+  %tmp201 = getelementptr inbounds float* %tmp200, i64 1
+  %tmp202 = getelementptr inbounds float* %tmp201, i64 1
+  %tmp203 = getelementptr inbounds float* %tmp202, i64 1
+  %tmp204 = getelementptr inbounds float* %tmp203, i64 1
+  %tmp205 = getelementptr inbounds float* %tmp204, i64 1
+  %tmp206 = getelementptr inbounds float* %tmp205, i64 1
+  %tmp207 = getelementptr inbounds float* %tmp206, i64 1
+  %tmp208 = getelementptr inbounds float* %tmp207, i64 1
+  %tmp209 = getelementptr inbounds float* %tmp208, i64 1
+  %tmp210 = getelementptr inbounds float* %tmp209, i64 1
+  %tmp211 = getelementptr inbounds float* %tmp210, i64 1
+  %tmp212 = getelementptr inbounds float* %tmp211, i64 1
+  %tmp213 = getelementptr inbounds float* %tmp212, i64 1
+  %tmp214 = getelementptr inbounds float* %tmp213, i64 1
+  %tmp215 = getelementptr inbounds float* %tmp214, i64 1
+  %tmp216 = getelementptr inbounds float* %tmp215, i64 1
+  %tmp217 = getelementptr inbounds float* %tmp216, i64 1
+  %tmp218 = getelementptr inbounds float* %tmp217, i64 1
+  %tmp219 = getelementptr inbounds float* %tmp218, i64 1
+  %tmp220 = getelementptr inbounds float* %tmp219, i64 1
+  %tmp221 = getelementptr inbounds float* %tmp220, i64 1
+  %tmp222 = getelementptr inbounds float* %tmp221, i64 1
+  %tmp223 = getelementptr inbounds float* %tmp222, i64 1
+  %tmp224 = getelementptr inbounds float* %tmp223, i64 1
+  %tmp225 = getelementptr inbounds float* %tmp224, i64 1
+  %tmp226 = getelementptr inbounds float* %tmp225, i64 1
+  %tmp227 = getelementptr inbounds float* %tmp226, i64 1
+  %tmp228 = getelementptr inbounds float* %tmp227, i64 1
+  %tmp229 = getelementptr inbounds float* %tmp228, i64 1
+  %tmp230 = getelementptr inbounds float* %tmp229, i64 1
+  %tmp231 = getelementptr inbounds float* %tmp230, i64 1
+  %tmp232 = getelementptr inbounds float* %tmp231, i64 1
+  %tmp233 = getelementptr inbounds float* %tmp232, i64 1
+  %tmp234 = getelementptr inbounds float* %tmp233, i64 1
+  %tmp235 = getelementptr inbounds float* %tmp234, i64 1
+  %tmp236 = getelementptr inbounds float* %tmp235, i64 1
+  %tmp237 = getelementptr inbounds float* %tmp236, i64 1
+  %tmp238 = getelementptr inbounds float* %tmp237, i64 1
+  %tmp239 = getelementptr inbounds float* %tmp238, i64 1
+  %tmp240 = getelementptr inbounds float* %tmp239, i64 1
+  %tmp241 = getelementptr inbounds float* %tmp240, i64 1
+  %tmp242 = getelementptr inbounds float* %tmp241, i64 1
+  %tmp243 = getelementptr inbounds float* %tmp242, i64 1
+  %tmp244 = getelementptr inbounds float* %tmp243, i64 1
+  %tmp245 = getelementptr inbounds float* %tmp244, i64 1
+  %tmp246 = getelementptr inbounds float* %tmp245, i64 1
+  %tmp247 = getelementptr inbounds float* %tmp246, i64 1
+  %tmp248 = getelementptr inbounds float* %tmp247, i64 1
+  %tmp249 = getelementptr inbounds float* %tmp248, i64 1
+  %tmp250 = getelementptr inbounds float* %tmp249, i64 1
+  %tmp251 = getelementptr inbounds float* %tmp250, i64 1
+  %tmp252 = getelementptr inbounds float* %tmp251, i64 1
+  %tmp253 = getelementptr inbounds float* %tmp252, i64 1
+  %tmp254 = getelementptr inbounds float* %tmp253, i64 1
+  %tmp255 = getelementptr inbounds float* %tmp254, i64 1
+  %tmp256 = getelementptr inbounds float* %tmp255, i64 1
+  %tmp257 = getelementptr inbounds float* %tmp256, i64 1
+  %tmp258 = getelementptr inbounds float* %tmp257, i64 1
+  %tmp259 = getelementptr inbounds float* %tmp258, i64 1
+  %tmp260 = getelementptr inbounds float* %tmp259, i64 1
+  %tmp261 = getelementptr inbounds float* %tmp260, i64 1
+  %tmp262 = getelementptr inbounds float* %tmp261, i64 1
+  %tmp263 = getelementptr inbounds float* %tmp262, i64 1
+  %tmp264 = getelementptr inbounds float* %tmp263, i64 1
+  %tmp265 = getelementptr inbounds float* %tmp264, i64 1
+  %tmp266 = getelementptr inbounds float* %tmp265, i64 1
+  %tmp267 = getelementptr inbounds float* %tmp266, i64 1
+  %tmp268 = getelementptr inbounds float* %tmp267, i64 1
+  %tmp269 = getelementptr inbounds float* %tmp268, i64 1
+  %tmp270 = getelementptr inbounds float* %tmp269, i64 1
+  %tmp271 = getelementptr inbounds float* %tmp270, i64 1
+  %tmp272 = getelementptr inbounds float* %tmp271, i64 1
+  %tmp273 = getelementptr inbounds float* %tmp272, i64 1
+  %tmp274 = getelementptr inbounds float* %tmp273, i64 1
+  %tmp275 = getelementptr inbounds float* %tmp274, i64 1
+  %tmp276 = getelementptr inbounds float* %tmp275, i64 1
+  %tmp277 = getelementptr inbounds float* %tmp276, i64 1
+  %tmp278 = getelementptr inbounds float* %tmp277, i64 1
+  %tmp279 = getelementptr inbounds float* %tmp278, i64 1
+  %tmp280 = getelementptr inbounds float* %tmp279, i64 1
+  %tmp281 = getelementptr inbounds float* %tmp280, i64 1
+  %tmp282 = getelementptr inbounds float* %tmp281, i64 1
+  %tmp283 = getelementptr inbounds float* %tmp282, i64 1
+  %tmp284 = getelementptr inbounds float* %tmp283, i64 1
+  %tmp285 = getelementptr inbounds float* %tmp284, i64 1
+  %tmp286 = getelementptr inbounds float* %tmp285, i64 1
+  %tmp287 = getelementptr inbounds float* %tmp286, i64 1
+  %tmp288 = getelementptr inbounds float* %tmp287, i64 1
+  %tmp289 = getelementptr inbounds float* %tmp288, i64 1
+  %tmp290 = getelementptr inbounds float* %tmp289, i64 1
+  %tmp291 = getelementptr inbounds float* %tmp290, i64 1
+  %tmp292 = getelementptr inbounds float* %tmp291, i64 1
+  %tmp293 = getelementptr inbounds float* %tmp292, i64 1
+  %tmp294 = getelementptr inbounds float* %tmp293, i64 1
+  %tmp295 = getelementptr inbounds float* %tmp294, i64 1
+  %tmp296 = getelementptr inbounds float* %tmp295, i64 1
+  %tmp297 = getelementptr inbounds float* %tmp296, i64 1
+  %tmp298 = getelementptr inbounds float* %tmp297, i64 1
+  %tmp299 = getelementptr inbounds float* %tmp298, i64 1
+  %tmp300 = getelementptr inbounds float* %tmp299, i64 1
+  %tmp301 = getelementptr inbounds float* %tmp300, i64 1
+  %tmp302 = getelementptr inbounds float* %tmp301, i64 1
+  %tmp303 = getelementptr inbounds float* %tmp302, i64 1
+  %tmp304 = getelementptr inbounds float* %tmp303, i64 1
+  %tmp305 = getelementptr inbounds float* %tmp304, i64 1
+  %tmp306 = getelementptr inbounds float* %tmp305, i64 1
+  %tmp307 = getelementptr inbounds float* %tmp306, i64 1
+  %tmp308 = getelementptr inbounds float* %tmp307, i64 1
+  %tmp309 = getelementptr inbounds float* %tmp308, i64 1
+  %tmp310 = getelementptr inbounds float* %tmp309, i64 1
+  %tmp311 = getelementptr inbounds float* %tmp310, i64 1
+  %tmp312 = getelementptr inbounds float* %tmp311, i64 1
+  %tmp313 = getelementptr inbounds float* %tmp312, i64 1
+  %tmp314 = getelementptr inbounds float* %tmp313, i64 1
+  %tmp315 = getelementptr inbounds float* %tmp314, i64 1
+  %tmp316 = getelementptr inbounds float* %tmp315, i64 1
+  %tmp317 = getelementptr inbounds float* %tmp316, i64 1
+  %tmp318 = getelementptr inbounds float* %tmp317, i64 1
+  %tmp319 = getelementptr inbounds float* %tmp318, i64 1
+  %tmp320 = getelementptr inbounds float* %tmp319, i64 1
+  %tmp321 = getelementptr inbounds float* %tmp320, i64 1
+  %tmp322 = getelementptr inbounds float* %tmp321, i64 1
+  %tmp323 = getelementptr inbounds float* %tmp322, i64 1
+  %tmp324 = getelementptr inbounds float* %tmp323, i64 1
+  %tmp325 = getelementptr inbounds float* %tmp324, i64 1
+  %tmp326 = getelementptr inbounds float* %tmp325, i64 1
+  %tmp327 = getelementptr inbounds float* %tmp326, i64 1
+  %tmp328 = getelementptr inbounds float* %tmp327, i64 1
+  %tmp329 = getelementptr inbounds float* %tmp328, i64 1
+  %tmp330 = getelementptr inbounds float* %tmp329, i64 1
+  %tmp331 = getelementptr inbounds float* %tmp330, i64 1
+  %tmp332 = getelementptr inbounds float* %tmp331, i64 1
+  %tmp333 = getelementptr inbounds float* %tmp332, i64 1
+  %tmp334 = getelementptr inbounds float* %tmp333, i64 1
+  %tmp335 = getelementptr inbounds float* %tmp334, i64 1
+  %tmp336 = getelementptr inbounds float* %tmp335, i64 1
+  %tmp337 = getelementptr inbounds float* %tmp336, i64 1
+  %tmp338 = getelementptr inbounds float* %tmp337, i64 1
+  %tmp339 = getelementptr inbounds float* %tmp338, i64 1
+  %tmp340 = getelementptr inbounds float* %tmp339, i64 1
+  %tmp341 = getelementptr inbounds float* %tmp340, i64 1
+  %tmp342 = getelementptr inbounds float* %tmp341, i64 1
+  %tmp343 = getelementptr inbounds float* %tmp342, i64 1
+  %tmp344 = getelementptr inbounds float* %tmp343, i64 1
+  %tmp345 = getelementptr inbounds float* %tmp344, i64 1
+  %tmp346 = getelementptr inbounds float* %tmp345, i64 1
+  %tmp347 = getelementptr inbounds float* %tmp346, i64 1
+  %tmp348 = getelementptr inbounds float* %tmp347, i64 1
+  %tmp349 = getelementptr inbounds float* %tmp348, i64 1
+  %tmp350 = getelementptr inbounds float* %tmp349, i64 1
+  %tmp351 = getelementptr inbounds float* %tmp350, i64 1
+  %tmp352 = getelementptr inbounds float* %tmp351, i64 1
+  %tmp353 = getelementptr inbounds float* %tmp352, i64 1
+  %tmp354 = getelementptr inbounds float* %tmp353, i64 1
+  %tmp355 = getelementptr inbounds float* %tmp354, i64 1
+  %tmp356 = getelementptr inbounds float* %tmp355, i64 1
+  %tmp357 = getelementptr inbounds float* %tmp356, i64 1
+  %tmp358 = getelementptr inbounds float* %tmp357, i64 1
+  %tmp359 = getelementptr inbounds float* %tmp358, i64 1
+  %tmp360 = getelementptr inbounds float* %tmp359, i64 1
+  %tmp361 = getelementptr inbounds float* %tmp360, i64 1
+  %tmp362 = getelementptr inbounds float* %tmp361, i64 1
+  %tmp363 = getelementptr inbounds float* %tmp362, i64 1
+  %tmp364 = getelementptr inbounds float* %tmp363, i64 1
+  %tmp365 = getelementptr inbounds float* %tmp364, i64 1
+  %tmp366 = getelementptr inbounds float* %tmp365, i64 1
+  %tmp367 = getelementptr inbounds float* %tmp366, i64 1
+  %tmp368 = getelementptr inbounds float* %tmp367, i64 1
+  %tmp369 = getelementptr inbounds float* %tmp368, i64 1
+  %tmp370 = getelementptr inbounds float* %tmp369, i64 1
+  %tmp371 = getelementptr inbounds float* %tmp370, i64 1
+  %tmp372 = getelementptr inbounds float* %tmp371, i64 1
+  %tmp373 = getelementptr inbounds float* %tmp372, i64 1
+  %tmp374 = getelementptr inbounds float* %tmp373, i64 1
+  %tmp375 = getelementptr inbounds float* %tmp374, i64 1
+  %tmp376 = getelementptr inbounds float* %tmp375, i64 1
+  %tmp377 = getelementptr inbounds float* %tmp376, i64 1
+  %tmp378 = getelementptr inbounds float* %tmp377, i64 1
+  %tmp379 = getelementptr inbounds float* %tmp378, i64 1
+  %tmp380 = getelementptr inbounds float* %tmp379, i64 1
+  %tmp381 = getelementptr inbounds float* %tmp380, i64 1
+  %tmp382 = getelementptr inbounds float* %tmp381, i64 1
+  %tmp383 = getelementptr inbounds float* %tmp382, i64 1
+  %tmp384 = getelementptr inbounds float* %tmp383, i64 1
+  %tmp385 = getelementptr inbounds float* %tmp384, i64 1
+  %tmp386 = getelementptr inbounds float* %tmp385, i64 1
+  %tmp387 = getelementptr inbounds float* %tmp386, i64 1
+  %tmp388 = getelementptr inbounds float* %tmp387, i64 1
+  %tmp389 = getelementptr inbounds float* %tmp388, i64 1
+  %tmp390 = getelementptr inbounds float* %tmp389, i64 1
+  %tmp391 = getelementptr inbounds float* %tmp390, i64 1
+  %tmp392 = getelementptr inbounds float* %tmp391, i64 1
+  %tmp393 = getelementptr inbounds float* %tmp392, i64 1
+  %tmp394 = getelementptr inbounds float* %tmp393, i64 1
+  %tmp395 = getelementptr inbounds float* %tmp394, i64 1
+  %tmp396 = getelementptr inbounds float* %tmp395, i64 1
+  %tmp397 = getelementptr inbounds float* %tmp396, i64 1
+  %tmp398 = getelementptr inbounds float* %tmp397, i64 1
+  %tmp399 = getelementptr inbounds float* %tmp398, i64 1
+  %tmp400 = getelementptr inbounds float* %tmp399, i64 1
+  %tmp401 = getelementptr inbounds float* %tmp400, i64 1
+  %tmp402 = getelementptr inbounds float* %tmp401, i64 1
+  %tmp403 = getelementptr inbounds float* %tmp402, i64 1
+  %tmp404 = getelementptr inbounds float* %tmp403, i64 1
+  %tmp405 = getelementptr inbounds float* %tmp404, i64 1
+  %tmp406 = getelementptr inbounds float* %tmp405, i64 1
+  %tmp407 = getelementptr inbounds float* %tmp406, i64 1
+  %tmp408 = getelementptr inbounds float* %tmp407, i64 1
+  %tmp409 = getelementptr inbounds float* %tmp408, i64 1
+  %tmp410 = getelementptr inbounds float* %tmp409, i64 1
+  %tmp411 = getelementptr inbounds float* %tmp410, i64 1
+  %tmp412 = getelementptr inbounds float* %tmp411, i64 1
+  %tmp413 = getelementptr inbounds float* %tmp412, i64 1
+  %tmp414 = getelementptr inbounds float* %tmp413, i64 1
+  %tmp415 = getelementptr inbounds float* %tmp414, i64 1
+  %tmp416 = getelementptr inbounds float* %tmp415, i64 1
+  %tmp417 = getelementptr inbounds float* %tmp416, i64 1
+  %tmp418 = getelementptr inbounds float* %tmp417, i64 1
+  %tmp419 = getelementptr inbounds float* %tmp418, i64 1
+  %tmp420 = getelementptr inbounds float* %tmp419, i64 1
+  %tmp421 = getelementptr inbounds float* %tmp420, i64 1
+  %tmp422 = getelementptr inbounds float* %tmp421, i64 1
+  %tmp423 = getelementptr inbounds float* %tmp422, i64 1
+  %tmp424 = getelementptr inbounds float* %tmp423, i64 1
+  %tmp425 = getelementptr inbounds float* %tmp424, i64 1
+  %tmp426 = getelementptr inbounds float* %tmp425, i64 1
+  %tmp427 = getelementptr inbounds float* %tmp426, i64 1
+  %tmp428 = getelementptr inbounds float* %tmp427, i64 1
+  %tmp429 = getelementptr inbounds float* %tmp428, i64 1
+  %tmp430 = getelementptr inbounds float* %tmp429, i64 1
+  %tmp431 = getelementptr inbounds float* %tmp430, i64 1
+  %tmp432 = getelementptr inbounds float* %tmp431, i64 1
+  %tmp433 = getelementptr inbounds float* %tmp432, i64 1
+  %tmp434 = getelementptr inbounds float* %tmp433, i64 1
+  %tmp435 = getelementptr inbounds float* %tmp434, i64 1
+  %tmp436 = getelementptr inbounds float* %tmp435, i64 1
+  %tmp437 = getelementptr inbounds float* %tmp436, i64 1
+  %tmp438 = getelementptr inbounds float* %tmp437, i64 1
+  %tmp439 = getelementptr inbounds float* %tmp438, i64 1
+  %tmp440 = getelementptr inbounds float* %tmp439, i64 1
+  %tmp441 = getelementptr inbounds float* %tmp440, i64 1
+  %tmp442 = getelementptr inbounds float* %tmp441, i64 1
+  %tmp443 = getelementptr inbounds float* %tmp442, i64 1
+  %tmp444 = getelementptr inbounds float* %tmp443, i64 1
+  %tmp445 = getelementptr inbounds float* %tmp444, i64 1
+  %tmp446 = getelementptr inbounds float* %tmp445, i64 1
+  %tmp447 = getelementptr inbounds float* %tmp446, i64 1
+  %tmp448 = getelementptr inbounds float* %tmp447, i64 1
+  %tmp449 = getelementptr inbounds float* %tmp448, i64 1
+  %tmp450 = getelementptr inbounds float* %tmp449, i64 1
+  %tmp451 = getelementptr inbounds float* %tmp450, i64 1
+  %tmp452 = getelementptr inbounds float* %tmp451, i64 1
+  %tmp453 = getelementptr inbounds float* %tmp452, i64 1
+  %tmp454 = getelementptr inbounds float* %tmp453, i64 1
+  %tmp455 = getelementptr inbounds float* %tmp454, i64 1
+  %tmp456 = getelementptr inbounds float* %tmp455, i64 1
+  %tmp457 = getelementptr inbounds float* %tmp456, i64 1
+  %tmp458 = getelementptr inbounds float* %tmp457, i64 1
+  %tmp459 = getelementptr inbounds float* %tmp458, i64 1
+  %tmp460 = getelementptr inbounds float* %tmp459, i64 1
+  %tmp461 = getelementptr inbounds float* %tmp460, i64 1
+  %tmp462 = getelementptr inbounds float* %tmp461, i64 1
+  %tmp463 = getelementptr inbounds float* %tmp462, i64 1
+  %tmp464 = getelementptr inbounds float* %tmp463, i64 1
+  %tmp465 = getelementptr inbounds float* %tmp464, i64 1
+  %tmp466 = getelementptr inbounds float* %tmp465, i64 1
+  %tmp467 = getelementptr inbounds float* %tmp466, i64 1
+  %tmp468 = getelementptr inbounds float* %tmp467, i64 1
+  %tmp469 = getelementptr inbounds float* %tmp468, i64 1
+  %tmp470 = getelementptr inbounds float* %tmp469, i64 1
+  %tmp471 = getelementptr inbounds float* %tmp470, i64 1
+  %tmp472 = getelementptr inbounds float* %tmp471, i64 1
+  %tmp473 = getelementptr inbounds float* %tmp472, i64 1
+  %tmp474 = getelementptr inbounds float* %tmp473, i64 1
+  %tmp475 = getelementptr inbounds float* %tmp474, i64 1
+  %tmp476 = getelementptr inbounds float* %tmp475, i64 1
+  %tmp477 = getelementptr inbounds float* %tmp476, i64 1
+  %tmp478 = getelementptr inbounds float* %tmp477, i64 1
+  %tmp479 = getelementptr inbounds float* %tmp478, i64 1
+  %tmp480 = getelementptr inbounds float* %tmp479, i64 1
+  %tmp481 = getelementptr inbounds float* %tmp480, i64 1
+  %tmp482 = getelementptr inbounds float* %tmp481, i64 1
+  %tmp483 = getelementptr inbounds float* %tmp482, i64 1
+  %tmp484 = getelementptr inbounds float* %tmp483, i64 1
+  %tmp485 = getelementptr inbounds float* %tmp484, i64 1
+  %tmp486 = getelementptr inbounds float* %tmp485, i64 1
+  %tmp487 = getelementptr inbounds float* %tmp486, i64 1
+  %tmp488 = getelementptr inbounds float* %tmp487, i64 1
+  %tmp489 = getelementptr inbounds float* %tmp488, i64 1
+  %tmp490 = getelementptr inbounds float* %tmp489, i64 1
+  %tmp491 = getelementptr inbounds float* %tmp490, i64 1
+  %tmp492 = getelementptr inbounds float* %tmp491, i64 1
+  %tmp493 = getelementptr inbounds float* %tmp492, i64 1
+  %tmp494 = getelementptr inbounds float* %tmp493, i64 1
+  %tmp495 = getelementptr inbounds float* %tmp494, i64 1
+  %tmp496 = getelementptr inbounds float* %tmp495, i64 1
+  %tmp497 = getelementptr inbounds float* %tmp496, i64 1
+  %tmp498 = getelementptr inbounds float* %tmp497, i64 1
+  %tmp499 = getelementptr inbounds float* %tmp498, i64 1
+  %tmp500 = getelementptr inbounds float* %tmp499, i64 1
+  %tmp501 = getelementptr inbounds float* %tmp500, i64 1
+  %tmp502 = getelementptr inbounds float* %tmp501, i64 1
+  %tmp503 = getelementptr inbounds float* %tmp502, i64 1
+  %tmp504 = getelementptr inbounds float* %tmp503, i64 1
+  %tmp505 = getelementptr inbounds float* %tmp504, i64 1
+  %tmp506 = getelementptr inbounds float* %tmp505, i64 1
+  %tmp507 = getelementptr inbounds float* %tmp506, i64 1
+  %tmp508 = getelementptr inbounds float* %tmp507, i64 1
+  %tmp509 = getelementptr inbounds float* %tmp508, i64 1
+  %tmp510 = getelementptr inbounds float* %tmp509, i64 1
+  %tmp511 = getelementptr inbounds float* %tmp510, i64 1
+  %tmp512 = getelementptr inbounds float* %tmp511, i64 1
+  %tmp513 = getelementptr inbounds float* %tmp512, i64 1
+  %tmp514 = getelementptr inbounds float* %tmp513, i64 1
+  %tmp515 = getelementptr inbounds float* %tmp514, i64 1
+  %tmp516 = getelementptr inbounds float* %tmp515, i64 1
+  %tmp517 = getelementptr inbounds float* %tmp516, i64 1
+  %tmp518 = getelementptr inbounds float* %tmp517, i64 1
+  %tmp519 = getelementptr inbounds float* %tmp518, i64 1
+  %tmp520 = getelementptr inbounds float* %tmp519, i64 1
+  %tmp521 = getelementptr inbounds float* %tmp520, i64 1
+  %tmp522 = getelementptr inbounds float* %tmp521, i64 1
+  %tmp523 = getelementptr inbounds float* %tmp522, i64 1
+  %tmp524 = getelementptr inbounds float* %tmp523, i64 1
+  %tmp525 = getelementptr inbounds float* %tmp524, i64 1
+  %tmp526 = getelementptr inbounds float* %tmp525, i64 1
+  %tmp527 = getelementptr inbounds float* %tmp526, i64 1
+  %tmp528 = getelementptr inbounds float* %tmp527, i64 1
+  %tmp529 = getelementptr inbounds float* %tmp528, i64 1
+  %tmp530 = getelementptr inbounds float* %tmp529, i64 1
+  %tmp531 = getelementptr inbounds float* %tmp530, i64 1
+  %tmp532 = getelementptr inbounds float* %tmp531, i64 1
+  %tmp533 = getelementptr inbounds float* %tmp532, i64 1
+  %tmp534 = getelementptr inbounds float* %tmp533, i64 1
+  %tmp535 = getelementptr inbounds float* %tmp534, i64 1
+  %tmp536 = getelementptr inbounds float* %tmp535, i64 1
+  %tmp537 = getelementptr inbounds float* %tmp536, i64 1
+  %tmp538 = getelementptr inbounds float* %tmp537, i64 1
+  %tmp539 = getelementptr inbounds float* %tmp538, i64 1
+  %tmp540 = getelementptr inbounds float* %tmp539, i64 1
+  %tmp541 = getelementptr inbounds float* %tmp540, i64 1
+  %tmp542 = getelementptr inbounds float* %tmp541, i64 1
+  %tmp543 = getelementptr inbounds float* %tmp542, i64 1
+  %tmp544 = getelementptr inbounds float* %tmp543, i64 1
+  %tmp545 = getelementptr inbounds float* %tmp544, i64 1
+  %tmp546 = getelementptr inbounds float* %tmp545, i64 1
+  %tmp547 = getelementptr inbounds float* %tmp546, i64 1
+  %tmp548 = getelementptr inbounds float* %tmp547, i64 1
+  %tmp549 = getelementptr inbounds float* %tmp548, i64 1
+  %tmp550 = getelementptr inbounds float* %tmp549, i64 1
+  %tmp551 = getelementptr inbounds float* %tmp550, i64 1
+  %tmp552 = getelementptr inbounds float* %tmp551, i64 1
+  %tmp553 = getelementptr inbounds float* %tmp552, i64 1
+  %tmp554 = getelementptr inbounds float* %tmp553, i64 1
+  %tmp555 = getelementptr inbounds float* %tmp554, i64 1
+  %tmp556 = getelementptr inbounds float* %tmp555, i64 1
+  %tmp557 = getelementptr inbounds float* %tmp556, i64 1
+  %tmp558 = getelementptr inbounds float* %tmp557, i64 1
+  %tmp559 = getelementptr inbounds float* %tmp558, i64 1
+  %tmp560 = getelementptr inbounds float* %tmp559, i64 1
+  %tmp561 = getelementptr inbounds float* %tmp560, i64 1
+  %tmp562 = getelementptr inbounds float* %tmp561, i64 1
+  %tmp563 = getelementptr inbounds float* %tmp562, i64 1
+  %tmp564 = getelementptr inbounds float* %tmp563, i64 1
+  %tmp565 = getelementptr inbounds float* %tmp564, i64 1
+  %tmp566 = getelementptr inbounds float* %tmp565, i64 1
+  %tmp567 = getelementptr inbounds float* %tmp566, i64 1
+  %tmp568 = getelementptr inbounds float* %tmp567, i64 1
+  %tmp569 = getelementptr inbounds float* %tmp568, i64 1
+  %tmp570 = getelementptr inbounds float* %tmp569, i64 1
+  %tmp571 = getelementptr inbounds float* %tmp570, i64 1
+  %tmp572 = getelementptr inbounds float* %tmp571, i64 1
+  %tmp573 = getelementptr inbounds float* %tmp572, i64 1
+  %tmp574 = getelementptr inbounds float* %tmp573, i64 1
+  %tmp575 = getelementptr inbounds float* %tmp574, i64 1
+  %tmp576 = getelementptr inbounds float* %tmp575, i64 1
+  %tmp577 = getelementptr inbounds float* %tmp576, i64 1
+  %tmp578 = getelementptr inbounds float* %tmp577, i64 1
+  %tmp579 = getelementptr inbounds float* %tmp578, i64 1
+  %tmp580 = getelementptr inbounds float* %tmp579, i64 1
+  %tmp581 = getelementptr inbounds float* %tmp580, i64 1
+  %tmp582 = getelementptr inbounds float* %tmp581, i64 1
+  %tmp583 = getelementptr inbounds float* %tmp582, i64 1
+  %tmp584 = getelementptr inbounds float* %tmp583, i64 1
+  %tmp585 = getelementptr inbounds float* %tmp584, i64 1
+  %tmp586 = getelementptr inbounds float* %tmp585, i64 1
+  %tmp587 = getelementptr inbounds float* %tmp586, i64 1
+  %tmp588 = getelementptr inbounds float* %tmp587, i64 1
+  %tmp589 = getelementptr inbounds float* %tmp588, i64 1
+  %tmp590 = getelementptr inbounds float* %tmp589, i64 1
+  %tmp591 = getelementptr inbounds float* %tmp590, i64 1
+  %tmp592 = getelementptr inbounds float* %tmp591, i64 1
+  %tmp593 = getelementptr inbounds float* %tmp592, i64 1
+  %tmp594 = getelementptr inbounds float* %tmp593, i64 1
+  %tmp595 = getelementptr inbounds float* %tmp594, i64 1
+  %tmp596 = getelementptr inbounds float* %tmp595, i64 1
+  %tmp597 = getelementptr inbounds float* %tmp596, i64 1
+  %tmp598 = getelementptr inbounds float* %tmp597, i64 1
+  %tmp599 = getelementptr inbounds float* %tmp598, i64 1
+  %tmp600 = getelementptr inbounds float* %tmp599, i64 1
+  %tmp601 = getelementptr inbounds float* %tmp600, i64 1
+  %tmp602 = getelementptr inbounds float* %tmp601, i64 1
+  %tmp603 = getelementptr inbounds float* %tmp602, i64 1
+  %tmp604 = getelementptr inbounds float* %tmp603, i64 1
+  %tmp605 = getelementptr inbounds float* %tmp604, i64 1
+  %tmp606 = getelementptr inbounds float* %tmp605, i64 1
+  %tmp607 = getelementptr inbounds float* %tmp606, i64 1
+  %tmp608 = getelementptr inbounds float* %tmp607, i64 1
+  %tmp609 = getelementptr inbounds float* %tmp608, i64 1
+  %tmp610 = getelementptr inbounds float* %tmp609, i64 1
+  %tmp611 = getelementptr inbounds float* %tmp610, i64 1
+  %tmp612 = getelementptr inbounds float* %tmp611, i64 1
+  %tmp613 = getelementptr inbounds float* %tmp612, i64 1
+  %tmp614 = getelementptr inbounds float* %tmp613, i64 1
+  %tmp615 = getelementptr inbounds float* %tmp614, i64 1
+  %tmp616 = getelementptr inbounds float* %tmp615, i64 1
+  %tmp617 = getelementptr inbounds float* %tmp616, i64 1
+  %tmp618 = getelementptr inbounds float* %tmp617, i64 1
+  %tmp619 = getelementptr inbounds float* %tmp618, i64 1
+  %tmp620 = getelementptr inbounds float* %tmp619, i64 1
+  %tmp621 = getelementptr inbounds float* %tmp620, i64 1
+  %tmp622 = getelementptr inbounds float* %tmp621, i64 1
+  %tmp623 = getelementptr inbounds float* %tmp622, i64 1
+  %tmp624 = getelementptr inbounds float* %tmp623, i64 1
+  %tmp625 = getelementptr inbounds float* %tmp624, i64 1
+  %tmp626 = getelementptr inbounds float* %tmp625, i64 1
+  %tmp627 = getelementptr inbounds float* %tmp626, i64 1
+  %tmp628 = getelementptr inbounds float* %tmp627, i64 1
+  %tmp629 = getelementptr inbounds float* %tmp628, i64 1
+  %tmp630 = getelementptr inbounds float* %tmp629, i64 1
+  %tmp631 = getelementptr inbounds float* %tmp630, i64 1
+  %tmp632 = getelementptr inbounds float* %tmp631, i64 1
+  %tmp633 = getelementptr inbounds float* %tmp632, i64 1
+  %tmp634 = getelementptr inbounds float* %tmp633, i64 1
+  %tmp635 = getelementptr inbounds float* %tmp634, i64 1
+  %tmp636 = getelementptr inbounds float* %tmp635, i64 1
+  %tmp637 = getelementptr inbounds float* %tmp636, i64 1
+  %tmp638 = getelementptr inbounds float* %tmp637, i64 1
+  %tmp639 = getelementptr inbounds float* %tmp638, i64 1
+  %tmp640 = getelementptr inbounds float* %tmp639, i64 1
+  %tmp641 = getelementptr inbounds float* %tmp640, i64 1
+  %tmp642 = getelementptr inbounds float* %tmp641, i64 1
+  %tmp643 = getelementptr inbounds float* %tmp642, i64 1
+  %tmp644 = getelementptr inbounds float* %tmp643, i64 1
+  %tmp645 = getelementptr inbounds float* %tmp644, i64 1
+  %tmp646 = getelementptr inbounds float* %tmp645, i64 1
+  %tmp647 = getelementptr inbounds float* %tmp646, i64 1
+  %tmp648 = getelementptr inbounds float* %tmp647, i64 1
+  %tmp649 = getelementptr inbounds float* %tmp648, i64 1
+  %tmp650 = getelementptr inbounds float* %tmp649, i64 1
+  %tmp651 = getelementptr inbounds float* %tmp650, i64 1
+  %tmp652 = getelementptr inbounds float* %tmp651, i64 1
+  %tmp653 = getelementptr inbounds float* %tmp652, i64 1
+  %tmp654 = getelementptr inbounds float* %tmp653, i64 1
+  %tmp655 = getelementptr inbounds float* %tmp654, i64 1
+  %tmp656 = getelementptr inbounds float* %tmp655, i64 1
+  %tmp657 = getelementptr inbounds float* %tmp656, i64 1
+  %tmp658 = getelementptr inbounds float* %tmp657, i64 1
+  %tmp659 = getelementptr inbounds float* %tmp658, i64 1
+  %tmp660 = getelementptr inbounds float* %tmp659, i64 1
+  %tmp661 = getelementptr inbounds float* %tmp660, i64 1
+  %tmp662 = getelementptr inbounds float* %tmp661, i64 1
+  %tmp663 = getelementptr inbounds float* %tmp662, i64 1
+  %tmp664 = getelementptr inbounds float* %tmp663, i64 1
+  %tmp665 = getelementptr inbounds float* %tmp664, i64 1
+  %tmp666 = getelementptr inbounds float* %tmp665, i64 1
+  %tmp667 = getelementptr inbounds float* %tmp666, i64 1
+  %tmp668 = getelementptr inbounds float* %tmp667, i64 1
+  %tmp669 = getelementptr inbounds float* %tmp668, i64 1
+  %tmp670 = getelementptr inbounds float* %tmp669, i64 1
+  %tmp671 = getelementptr inbounds float* %tmp670, i64 1
+  %tmp672 = getelementptr inbounds float* %tmp671, i64 1
+  %tmp673 = getelementptr inbounds float* %tmp672, i64 1
+  %tmp674 = getelementptr inbounds float* %tmp673, i64 1
+  %tmp675 = getelementptr inbounds float* %tmp674, i64 1
+  %tmp676 = getelementptr inbounds float* %tmp675, i64 1
+  %tmp677 = getelementptr inbounds float* %tmp676, i64 1
+  %tmp678 = getelementptr inbounds float* %tmp677, i64 1
+  %tmp679 = getelementptr inbounds float* %tmp678, i64 1
+  %tmp680 = getelementptr inbounds float* %tmp679, i64 1
+  %tmp681 = getelementptr inbounds float* %tmp680, i64 1
+  %tmp682 = getelementptr inbounds float* %tmp681, i64 1
+  %tmp683 = getelementptr inbounds float* %tmp682, i64 1
+  %tmp684 = getelementptr inbounds float* %tmp683, i64 1
+  %tmp685 = getelementptr inbounds float* %tmp684, i64 1
+  %tmp686 = getelementptr inbounds float* %tmp685, i64 1
+  %tmp687 = getelementptr inbounds float* %tmp686, i64 1
+  %tmp688 = getelementptr inbounds float* %tmp687, i64 1
+  %tmp689 = getelementptr inbounds float* %tmp688, i64 1
+  %tmp690 = getelementptr inbounds float* %tmp689, i64 1
+  %tmp691 = getelementptr inbounds float* %tmp690, i64 1
+  %tmp692 = getelementptr inbounds float* %tmp691, i64 1
+  %tmp693 = getelementptr inbounds float* %tmp692, i64 1
+  %tmp694 = getelementptr inbounds float* %tmp693, i64 1
+  %tmp695 = getelementptr inbounds float* %tmp694, i64 1
+  %tmp696 = getelementptr inbounds float* %tmp695, i64 1
+  %tmp697 = getelementptr inbounds float* %tmp696, i64 1
+  %tmp698 = getelementptr inbounds float* %tmp697, i64 1
+  %tmp699 = getelementptr inbounds float* %tmp698, i64 1
+  %tmp700 = getelementptr inbounds float* %tmp699, i64 1
+  %tmp701 = getelementptr inbounds float* %tmp700, i64 1
+  %tmp702 = getelementptr inbounds float* %tmp701, i64 1
+  %tmp703 = getelementptr inbounds float* %tmp702, i64 1
+  %tmp704 = getelementptr inbounds float* %tmp703, i64 1
+  %tmp705 = getelementptr inbounds float* %tmp704, i64 1
+  %tmp706 = getelementptr inbounds float* %tmp705, i64 1
+  %tmp707 = getelementptr inbounds float* %tmp706, i64 1
+  %tmp708 = getelementptr inbounds float* %tmp707, i64 1
+  %tmp709 = getelementptr inbounds float* %tmp708, i64 1
+  %tmp710 = getelementptr inbounds float* %tmp709, i64 1
+  %tmp711 = getelementptr inbounds float* %tmp710, i64 1
+  %tmp712 = getelementptr inbounds float* %tmp711, i64 1
+  %tmp713 = getelementptr inbounds float* %tmp712, i64 1
+  %tmp714 = getelementptr inbounds float* %tmp713, i64 1
+  %tmp715 = getelementptr inbounds float* %tmp714, i64 1
+  %tmp716 = getelementptr inbounds float* %tmp715, i64 1
+  %tmp717 = getelementptr inbounds float* %tmp716, i64 1
+  %tmp718 = getelementptr inbounds float* %tmp717, i64 1
+  %tmp719 = getelementptr inbounds float* %tmp718, i64 1
+  %tmp720 = getelementptr inbounds float* %tmp719, i64 1
+  %tmp721 = getelementptr inbounds float* %tmp720, i64 1
+  %tmp722 = getelementptr inbounds float* %tmp721, i64 1
+  %tmp723 = getelementptr inbounds float* %tmp722, i64 1
+  %tmp724 = getelementptr inbounds float* %tmp723, i64 1
+  %tmp725 = getelementptr inbounds float* %tmp724, i64 1
+  %tmp726 = getelementptr inbounds float* %tmp725, i64 1
+  %tmp727 = getelementptr inbounds float* %tmp726, i64 1
+  %tmp728 = getelementptr inbounds float* %tmp727, i64 1
+  %tmp729 = getelementptr inbounds float* %tmp728, i64 1
+  %tmp730 = getelementptr inbounds float* %tmp729, i64 1
+  %tmp731 = getelementptr inbounds float* %tmp730, i64 1
+  %tmp732 = getelementptr inbounds float* %tmp731, i64 1
+  %tmp733 = getelementptr inbounds float* %tmp732, i64 1
+  %tmp734 = getelementptr inbounds float* %tmp733, i64 1
+  %tmp735 = getelementptr inbounds float* %tmp734, i64 1
+  %tmp736 = getelementptr inbounds float* %tmp735, i64 1
+  %tmp737 = getelementptr inbounds float* %tmp736, i64 1
+  %tmp738 = getelementptr inbounds float* %tmp737, i64 1
+  %tmp739 = getelementptr inbounds float* %tmp738, i64 1
+  %tmp740 = getelementptr inbounds float* %tmp739, i64 1
+  %tmp741 = getelementptr inbounds float* %tmp740, i64 1
+  %tmp742 = getelementptr inbounds float* %tmp741, i64 1
+  %tmp743 = getelementptr inbounds float* %tmp742, i64 1
+  %tmp744 = getelementptr inbounds float* %tmp743, i64 1
+  %tmp745 = getelementptr inbounds float* %tmp744, i64 1
+  %tmp746 = getelementptr inbounds float* %tmp745, i64 1
+  %tmp747 = getelementptr inbounds float* %tmp746, i64 1
+  %tmp748 = getelementptr inbounds float* %tmp747, i64 1
+  %tmp749 = getelementptr inbounds float* %tmp748, i64 1
+  %tmp750 = getelementptr inbounds float* %tmp749, i64 1
+  %tmp751 = getelementptr inbounds float* %tmp750, i64 1
+  %tmp752 = getelementptr inbounds float* %tmp751, i64 1
+  %tmp753 = getelementptr inbounds float* %tmp752, i64 1
+  %tmp754 = getelementptr inbounds float* %tmp753, i64 1
+  %tmp755 = getelementptr inbounds float* %tmp754, i64 1
+  %tmp756 = getelementptr inbounds float* %tmp755, i64 1
+  %tmp757 = getelementptr inbounds float* %tmp756, i64 1
+  %tmp758 = getelementptr inbounds float* %tmp757, i64 1
+  %tmp759 = getelementptr inbounds float* %tmp758, i64 1
+  %tmp760 = getelementptr inbounds float* %tmp759, i64 1
+  %tmp761 = getelementptr inbounds float* %tmp760, i64 1
+  %tmp762 = getelementptr inbounds float* %tmp761, i64 1
+  %tmp763 = getelementptr inbounds float* %tmp762, i64 1
+  %tmp764 = getelementptr inbounds float* %tmp763, i64 1
+  %tmp765 = getelementptr inbounds float* %tmp764, i64 1
+  %tmp766 = getelementptr inbounds float* %tmp765, i64 1
+  %tmp767 = getelementptr inbounds float* %tmp766, i64 1
+  %tmp768 = getelementptr inbounds float* %tmp767, i64 1
+  %tmp769 = getelementptr inbounds float* %tmp768, i64 1
+  %tmp770 = getelementptr inbounds float* %tmp769, i64 1
+  %tmp771 = getelementptr inbounds float* %tmp770, i64 1
+  %tmp772 = getelementptr inbounds float* %tmp771, i64 1
+  %tmp773 = getelementptr inbounds float* %tmp772, i64 1
+  %tmp774 = getelementptr inbounds float* %tmp773, i64 1
+  %tmp775 = getelementptr inbounds float* %tmp774, i64 1
+  %tmp776 = getelementptr inbounds float* %tmp775, i64 1
+  %tmp777 = getelementptr inbounds float* %tmp776, i64 1
+  %tmp778 = getelementptr inbounds float* %tmp777, i64 1
+  %tmp779 = getelementptr inbounds float* %tmp778, i64 1
+  %tmp780 = getelementptr inbounds float* %tmp779, i64 1
+  %tmp781 = getelementptr inbounds float* %tmp780, i64 1
+  %tmp782 = getelementptr inbounds float* %tmp781, i64 1
+  %tmp783 = getelementptr inbounds float* %tmp782, i64 1
+  %tmp784 = getelementptr inbounds float* %tmp783, i64 1
+  %tmp785 = getelementptr inbounds float* %tmp784, i64 1
+  %tmp786 = getelementptr inbounds float* %tmp785, i64 1
+  %tmp787 = getelementptr inbounds float* %tmp786, i64 1
+  %tmp788 = getelementptr inbounds float* %tmp787, i64 1
+  %tmp789 = getelementptr inbounds float* %tmp788, i64 1
+  %tmp790 = getelementptr inbounds float* %tmp789, i64 1
+  %tmp791 = getelementptr inbounds float* %tmp790, i64 1
+  %tmp792 = getelementptr inbounds float* %tmp791, i64 1
+  %tmp793 = getelementptr inbounds float* %tmp792, i64 1
+  %tmp794 = getelementptr inbounds float* %tmp793, i64 1
+  %tmp795 = getelementptr inbounds float* %tmp794, i64 1
+  %tmp796 = getelementptr inbounds float* %tmp795, i64 1
+  %tmp797 = getelementptr inbounds float* %tmp796, i64 1
+  %tmp798 = getelementptr inbounds float* %tmp797, i64 1
+  %tmp799 = getelementptr inbounds float* %tmp798, i64 1
+  %tmp800 = getelementptr inbounds float* %tmp799, i64 1
+  %tmp801 = getelementptr inbounds float* %tmp800, i64 1
+  %tmp802 = getelementptr inbounds float* %tmp801, i64 1
+  %tmp803 = getelementptr inbounds float* %tmp802, i64 1
+  %tmp804 = getelementptr inbounds float* %tmp803, i64 1
+  %tmp805 = getelementptr inbounds float* %tmp804, i64 1
+  %tmp806 = getelementptr inbounds float* %tmp805, i64 1
+  %tmp807 = getelementptr inbounds float* %tmp806, i64 1
+  %tmp808 = getelementptr inbounds float* %tmp807, i64 1
+  %tmp809 = getelementptr inbounds float* %tmp808, i64 1
+  %tmp810 = getelementptr inbounds float* %tmp809, i64 1
+  %tmp811 = getelementptr inbounds float* %tmp810, i64 1
+  %tmp812 = getelementptr inbounds float* %tmp811, i64 1
+  %tmp813 = getelementptr inbounds float* %tmp812, i64 1
+  %tmp814 = getelementptr inbounds float* %tmp813, i64 1
+  %tmp815 = getelementptr inbounds float* %tmp814, i64 1
+  %tmp816 = getelementptr inbounds float* %tmp815, i64 1
+  %tmp817 = getelementptr inbounds float* %tmp816, i64 1
+  %tmp818 = getelementptr inbounds float* %tmp817, i64 1
+  %tmp819 = getelementptr inbounds float* %tmp818, i64 1
+  %tmp820 = getelementptr inbounds float* %tmp819, i64 1
+  %tmp821 = getelementptr inbounds float* %tmp820, i64 1
+  %tmp822 = getelementptr inbounds float* %tmp821, i64 1
+  %tmp823 = getelementptr inbounds float* %tmp822, i64 1
+  %tmp824 = getelementptr inbounds float* %tmp823, i64 1
+  %tmp825 = getelementptr inbounds float* %tmp824, i64 1
+  %tmp826 = getelementptr inbounds float* %tmp825, i64 1
+  %tmp827 = getelementptr inbounds float* %tmp826, i64 1
+  %tmp828 = getelementptr inbounds float* %tmp827, i64 1
+  %tmp829 = getelementptr inbounds float* %tmp828, i64 1
+  %tmp830 = getelementptr inbounds float* %tmp829, i64 1
+  %tmp831 = getelementptr inbounds float* %tmp830, i64 1
+  %tmp832 = getelementptr inbounds float* %tmp831, i64 1
+  %tmp833 = getelementptr inbounds float* %tmp832, i64 1
+  %tmp834 = getelementptr inbounds float* %tmp833, i64 1
+  %tmp835 = getelementptr inbounds float* %tmp834, i64 1
+  %tmp836 = getelementptr inbounds float* %tmp835, i64 1
+  %tmp837 = getelementptr inbounds float* %tmp836, i64 1
+  %tmp838 = getelementptr inbounds float* %tmp837, i64 1
+  %tmp839 = getelementptr inbounds float* %tmp838, i64 1
+  %tmp840 = getelementptr inbounds float* %tmp839, i64 1
+  %tmp841 = getelementptr inbounds float* %tmp840, i64 1
+  %tmp842 = getelementptr inbounds float* %tmp841, i64 1
+  %tmp843 = getelementptr inbounds float* %tmp842, i64 1
+  %tmp844 = getelementptr inbounds float* %tmp843, i64 1
+  %tmp845 = getelementptr inbounds float* %tmp844, i64 1
+  %tmp846 = getelementptr inbounds float* %tmp845, i64 1
+  %tmp847 = getelementptr inbounds float* %tmp846, i64 1
+  %tmp848 = getelementptr inbounds float* %tmp847, i64 1
+  %tmp849 = getelementptr inbounds float* %tmp848, i64 1
+  %tmp850 = getelementptr inbounds float* %tmp849, i64 1
+  %tmp851 = getelementptr inbounds float* %tmp850, i64 1
+  %tmp852 = getelementptr inbounds float* %tmp851, i64 1
+  %tmp853 = getelementptr inbounds float* %tmp852, i64 1
+  %tmp854 = getelementptr inbounds float* %tmp853, i64 1
+  %tmp855 = getelementptr inbounds float* %tmp854, i64 1
+  %tmp856 = getelementptr inbounds float* %tmp855, i64 1
+  %tmp857 = getelementptr inbounds float* %tmp856, i64 1
+  %tmp858 = getelementptr inbounds float* %tmp857, i64 1
+  %tmp859 = getelementptr inbounds float* %tmp858, i64 1
+  %tmp860 = getelementptr inbounds float* %tmp859, i64 1
+  %tmp861 = getelementptr inbounds float* %tmp860, i64 1
+  %tmp862 = getelementptr inbounds float* %tmp861, i64 1
+  %tmp863 = getelementptr inbounds float* %tmp862, i64 1
+  %tmp864 = getelementptr inbounds float* %tmp863, i64 1
+  %tmp865 = getelementptr inbounds float* %tmp864, i64 1
+  %tmp866 = getelementptr inbounds float* %tmp865, i64 1
+  %tmp867 = getelementptr inbounds float* %tmp866, i64 1
+  %tmp868 = getelementptr inbounds float* %tmp867, i64 1
+  %tmp869 = getelementptr inbounds float* %tmp868, i64 1
+  %tmp870 = getelementptr inbounds float* %tmp869, i64 1
+  %tmp871 = getelementptr inbounds float* %tmp870, i64 1
+  %tmp872 = getelementptr inbounds float* %tmp871, i64 1
+  %tmp873 = getelementptr inbounds float* %tmp872, i64 1
+  %tmp874 = getelementptr inbounds float* %tmp873, i64 1
+  %tmp875 = getelementptr inbounds float* %tmp874, i64 1
+  %tmp876 = getelementptr inbounds float* %tmp875, i64 1
+  %tmp877 = getelementptr inbounds float* %tmp876, i64 1
+  %tmp878 = getelementptr inbounds float* %tmp877, i64 1
+  %tmp879 = getelementptr inbounds float* %tmp878, i64 1
+  %tmp880 = getelementptr inbounds float* %tmp879, i64 1
+  %tmp881 = getelementptr inbounds float* %tmp880, i64 1
+  %tmp882 = getelementptr inbounds float* %tmp881, i64 1
+  %tmp883 = getelementptr inbounds float* %tmp882, i64 1
+  %tmp884 = getelementptr inbounds float* %tmp883, i64 1
+  %tmp885 = getelementptr inbounds float* %tmp884, i64 1
+  %tmp886 = getelementptr inbounds float* %tmp885, i64 1
+  %tmp887 = getelementptr inbounds float* %tmp886, i64 1
+  %tmp888 = getelementptr inbounds float* %tmp887, i64 1
+  %tmp889 = getelementptr inbounds float* %tmp888, i64 1
+  %tmp890 = getelementptr inbounds float* %tmp889, i64 1
+  %tmp891 = getelementptr inbounds float* %tmp890, i64 1
+  %tmp892 = getelementptr inbounds float* %tmp891, i64 1
+  %tmp893 = getelementptr inbounds float* %tmp892, i64 1
+  %tmp894 = getelementptr inbounds float* %tmp893, i64 1
+  %tmp895 = getelementptr inbounds float* %tmp894, i64 1
+  %tmp896 = getelementptr inbounds float* %tmp895, i64 1
+  %tmp897 = getelementptr inbounds float* %tmp896, i64 1
+  %tmp898 = getelementptr inbounds float* %tmp897, i64 1
+  %tmp899 = getelementptr inbounds float* %tmp898, i64 1
+  %tmp900 = getelementptr inbounds float* %tmp899, i64 1
+  %tmp901 = getelementptr inbounds float* %tmp900, i64 1
+  %tmp902 = getelementptr inbounds float* %tmp901, i64 1
+  %tmp903 = getelementptr inbounds float* %tmp902, i64 1
+  %tmp904 = getelementptr inbounds float* %tmp903, i64 1
+  %tmp905 = getelementptr inbounds float* %tmp904, i64 1
+  %tmp906 = getelementptr inbounds float* %tmp905, i64 1
+  %tmp907 = getelementptr inbounds float* %tmp906, i64 1
+  %tmp908 = getelementptr inbounds float* %tmp907, i64 1
+  %tmp909 = getelementptr inbounds float* %tmp908, i64 1
+  %tmp910 = getelementptr inbounds float* %tmp909, i64 1
+  %tmp911 = getelementptr inbounds float* %tmp910, i64 1
+  %tmp912 = getelementptr inbounds float* %tmp911, i64 1
+  %tmp913 = getelementptr inbounds float* %tmp912, i64 1
+  %tmp914 = getelementptr inbounds float* %tmp913, i64 1
+  %tmp915 = getelementptr inbounds float* %tmp914, i64 1
+  %tmp916 = getelementptr inbounds float* %tmp915, i64 1
+  %tmp917 = getelementptr inbounds float* %tmp916, i64 1
+  %tmp918 = getelementptr inbounds float* %tmp917, i64 1
+  %tmp919 = getelementptr inbounds float* %tmp918, i64 1
+  %tmp920 = getelementptr inbounds float* %tmp919, i64 1
+  %tmp921 = getelementptr inbounds float* %tmp920, i64 1
+  %tmp922 = getelementptr inbounds float* %tmp921, i64 1
+  %tmp923 = getelementptr inbounds float* %tmp922, i64 1
+  %tmp924 = getelementptr inbounds float* %tmp923, i64 1
+  %tmp925 = getelementptr inbounds float* %tmp924, i64 1
+  %tmp926 = getelementptr inbounds float* %tmp925, i64 1
+  %tmp927 = getelementptr inbounds float* %tmp926, i64 1
+  %tmp928 = getelementptr inbounds float* %tmp927, i64 1
+  %tmp929 = getelementptr inbounds float* %tmp928, i64 1
+  %tmp930 = getelementptr inbounds float* %tmp929, i64 1
+  %tmp931 = getelementptr inbounds float* %tmp930, i64 1
+  %tmp932 = getelementptr inbounds float* %tmp931, i64 1
+  %tmp933 = getelementptr inbounds float* %tmp932, i64 1
+  %tmp934 = getelementptr inbounds float* %tmp933, i64 1
+  %tmp935 = getelementptr inbounds float* %tmp934, i64 1
+  %tmp936 = getelementptr inbounds float* %tmp935, i64 1
+  %tmp937 = getelementptr inbounds float* %tmp936, i64 1
+  %tmp938 = getelementptr inbounds float* %tmp937, i64 1
+  %tmp939 = getelementptr inbounds float* %tmp938, i64 1
+  %tmp940 = getelementptr inbounds float* %tmp939, i64 1
+  %tmp941 = getelementptr inbounds float* %tmp940, i64 1
+  %tmp942 = getelementptr inbounds float* %tmp941, i64 1
+  %tmp943 = getelementptr inbounds float* %tmp942, i64 1
+  %tmp944 = getelementptr inbounds float* %tmp943, i64 1
+  %tmp945 = getelementptr inbounds float* %tmp944, i64 1
+  %tmp946 = getelementptr inbounds float* %tmp945, i64 1
+  %tmp947 = getelementptr inbounds float* %tmp946, i64 1
+  %tmp948 = getelementptr inbounds float* %tmp947, i64 1
+  %tmp949 = getelementptr inbounds float* %tmp948, i64 1
+  %tmp950 = getelementptr inbounds float* %tmp949, i64 1
+  %tmp951 = getelementptr inbounds float* %tmp950, i64 1
+  %tmp952 = getelementptr inbounds float* %tmp951, i64 1
+  %tmp953 = getelementptr inbounds float* %tmp952, i64 1
+  %tmp954 = getelementptr inbounds float* %tmp953, i64 1
+  %tmp955 = getelementptr inbounds float* %tmp954, i64 1
+  %tmp956 = getelementptr inbounds float* %tmp955, i64 1
+  %tmp957 = getelementptr inbounds float* %tmp956, i64 1
+  %tmp958 = getelementptr inbounds float* %tmp957, i64 1
+  %tmp959 = getelementptr inbounds float* %tmp958, i64 1
+  %tmp960 = getelementptr inbounds float* %tmp959, i64 1
+  %tmp961 = getelementptr inbounds float* %tmp960, i64 1
+  %tmp962 = getelementptr inbounds float* %tmp961, i64 1
+  %tmp963 = getelementptr inbounds float* %tmp962, i64 1
+  %tmp964 = getelementptr inbounds float* %tmp963, i64 1
+  %tmp965 = getelementptr inbounds float* %tmp964, i64 1
+  %tmp966 = getelementptr inbounds float* %tmp965, i64 1
+  %tmp967 = getelementptr inbounds float* %tmp966, i64 1
+  %tmp968 = getelementptr inbounds float* %tmp967, i64 1
+  %tmp969 = getelementptr inbounds float* %tmp968, i64 1
+  %tmp970 = getelementptr inbounds float* %tmp969, i64 1
+  %tmp971 = getelementptr inbounds float* %tmp970, i64 1
+  %tmp972 = getelementptr inbounds float* %tmp971, i64 1
+  %tmp973 = getelementptr inbounds float* %tmp972, i64 1
+  %tmp974 = getelementptr inbounds float* %tmp973, i64 1
+  %tmp975 = getelementptr inbounds float* %tmp974, i64 1
+  %tmp976 = getelementptr inbounds float* %tmp975, i64 1
+  %tmp977 = getelementptr inbounds float* %tmp976, i64 1
+  %tmp978 = getelementptr inbounds float* %tmp977, i64 1
+  %tmp979 = getelementptr inbounds float* %tmp978, i64 1
+  %tmp980 = getelementptr inbounds float* %tmp979, i64 1
+  %tmp981 = getelementptr inbounds float* %tmp980, i64 1
+  %tmp982 = getelementptr inbounds float* %tmp981, i64 1
+  %tmp983 = getelementptr inbounds float* %tmp982, i64 1
+  %tmp984 = getelementptr inbounds float* %tmp983, i64 1
+  %tmp985 = getelementptr inbounds float* %tmp984, i64 1
+  %tmp986 = getelementptr inbounds float* %tmp985, i64 1
+  %tmp987 = getelementptr inbounds float* %tmp986, i64 1
+  %tmp988 = getelementptr inbounds float* %tmp987, i64 1
+  %tmp989 = getelementptr inbounds float* %tmp988, i64 1
+  %tmp990 = getelementptr inbounds float* %tmp989, i64 1
+  %tmp991 = getelementptr inbounds float* %tmp990, i64 1
+  %tmp992 = getelementptr inbounds float* %tmp991, i64 1
+  %tmp993 = getelementptr inbounds float* %tmp992, i64 1
+  %tmp994 = getelementptr inbounds float* %tmp993, i64 1
+  %tmp995 = getelementptr inbounds float* %tmp994, i64 1
+  %tmp996 = getelementptr inbounds float* %tmp995, i64 1
+  %tmp997 = getelementptr inbounds float* %tmp996, i64 1
+  %tmp998 = getelementptr inbounds float* %tmp997, i64 1
+  %tmp999 = getelementptr inbounds float* %tmp998, i64 1
+  %tmp1000 = getelementptr inbounds float* %tmp999, i64 1
+  %tmp1001 = getelementptr inbounds float* %tmp1000, i64 1
+  %tmp1002 = getelementptr inbounds float* %tmp1001, i64 1
+  %tmp1003 = getelementptr inbounds float* %tmp1002, i64 1
+  %tmp1004 = getelementptr inbounds float* %tmp1003, i64 1
+  %tmp1005 = getelementptr inbounds float* %tmp1004, i64 1
+  %tmp1006 = getelementptr inbounds float* %tmp1005, i64 1
+  %tmp1007 = getelementptr inbounds float* %tmp1006, i64 1
+  %tmp1008 = getelementptr inbounds float* %tmp1007, i64 1
+  %tmp1009 = getelementptr inbounds float* %tmp1008, i64 1
+  %tmp1010 = getelementptr inbounds float* %tmp1009, i64 1
+  %tmp1011 = getelementptr inbounds float* %tmp1010, i64 1
+  %tmp1012 = getelementptr inbounds float* %tmp1011, i64 1
+  %tmp1013 = getelementptr inbounds float* %tmp1012, i64 1
+  %tmp1014 = getelementptr inbounds float* %tmp1013, i64 1
+  %tmp1015 = getelementptr inbounds float* %tmp1014, i64 1
+  %tmp1016 = getelementptr inbounds float* %tmp1015, i64 1
+  %tmp1017 = getelementptr inbounds float* %tmp1016, i64 1
+  %tmp1018 = getelementptr inbounds float* %tmp1017, i64 1
+  %tmp1019 = getelementptr inbounds float* %tmp1018, i64 1
+  %tmp1020 = getelementptr inbounds float* %tmp1019, i64 1
+  %tmp1021 = getelementptr inbounds float* %tmp1020, i64 1
+  %tmp1022 = getelementptr inbounds float* %tmp1021, i64 1
+  %tmp1023 = getelementptr inbounds float* %tmp1022, i64 1
+  %tmp1024 = getelementptr inbounds float* %tmp1023, i64 1
+  %tmp1025 = getelementptr inbounds float* %tmp1024, i64 1
+  %tmp1026 = getelementptr inbounds float* %tmp1025, i64 1
+  %tmp1027 = getelementptr inbounds float* %tmp1026, i64 1
+  %tmp1028 = getelementptr inbounds float* %tmp1027, i64 1
+  %tmp1029 = getelementptr inbounds float* %tmp1028, i64 1
+  %tmp1030 = getelementptr inbounds float* %tmp1029, i64 1
+  %tmp1031 = getelementptr inbounds float* %tmp1030, i64 1
+  %tmp1032 = getelementptr inbounds float* %tmp1031, i64 1
+  %tmp1033 = getelementptr inbounds float* %tmp1032, i64 1
+  %tmp1034 = getelementptr inbounds float* %tmp1033, i64 1
+  %tmp1035 = getelementptr inbounds float* %tmp1034, i64 1
+  %tmp1036 = getelementptr inbounds float* %tmp1035, i64 1
+  %tmp1037 = getelementptr inbounds float* %tmp1036, i64 1
+  %tmp1038 = getelementptr inbounds float* %tmp1037, i64 1
+  %tmp1039 = getelementptr inbounds float* %tmp1038, i64 1
+  %tmp1040 = getelementptr inbounds float* %tmp1039, i64 1
+  %tmp1041 = getelementptr inbounds float* %tmp1040, i64 1
+  %tmp1042 = getelementptr inbounds float* %tmp1041, i64 1
+  %tmp1043 = getelementptr inbounds float* %tmp1042, i64 1
+  %tmp1044 = getelementptr inbounds float* %tmp1043, i64 1
+  %tmp1045 = getelementptr inbounds float* %tmp1044, i64 1
+  %tmp1046 = getelementptr inbounds float* %tmp1045, i64 1
+  %tmp1047 = getelementptr inbounds float* %tmp1046, i64 1
+  %tmp1048 = getelementptr inbounds float* %tmp1047, i64 1
+  %tmp1049 = getelementptr inbounds float* %tmp1048, i64 1
+  %tmp1050 = getelementptr inbounds float* %tmp1049, i64 1
+  %tmp1051 = getelementptr inbounds float* %tmp1050, i64 1
+  %tmp1052 = getelementptr inbounds float* %tmp1051, i64 1
+  %tmp1053 = getelementptr inbounds float* %tmp1052, i64 1
+  %tmp1054 = getelementptr inbounds float* %tmp1053, i64 1
+  %tmp1055 = getelementptr inbounds float* %tmp1054, i64 1
+  %tmp1056 = getelementptr inbounds float* %tmp1055, i64 1
+  %tmp1057 = getelementptr inbounds float* %tmp1056, i64 1
+  %tmp1058 = getelementptr inbounds float* %tmp1057, i64 1
+  %tmp1059 = getelementptr inbounds float* %tmp1058, i64 1
+  %tmp1060 = getelementptr inbounds float* %tmp1059, i64 1
+  %tmp1061 = getelementptr inbounds float* %tmp1060, i64 1
+  %tmp1062 = getelementptr inbounds float* %tmp1061, i64 1
+  %tmp1063 = getelementptr inbounds float* %tmp1062, i64 1
+  %tmp1064 = getelementptr inbounds float* %tmp1063, i64 1
+  %tmp1065 = getelementptr inbounds float* %tmp1064, i64 1
+  %tmp1066 = getelementptr inbounds float* %tmp1065, i64 1
+  %tmp1067 = getelementptr inbounds float* %tmp1066, i64 1
+  %tmp1068 = getelementptr inbounds float* %tmp1067, i64 1
+  %tmp1069 = getelementptr inbounds float* %tmp1068, i64 1
+  %tmp1070 = getelementptr inbounds float* %tmp1069, i64 1
+  %tmp1071 = getelementptr inbounds float* %tmp1070, i64 1
+  %tmp1072 = getelementptr inbounds float* %tmp1071, i64 1
+  %tmp1073 = getelementptr inbounds float* %tmp1072, i64 1
+  %tmp1074 = getelementptr inbounds float* %tmp1073, i64 1
+  %tmp1075 = getelementptr inbounds float* %tmp1074, i64 1
+  %tmp1076 = getelementptr inbounds float* %tmp1075, i64 1
+  %tmp1077 = getelementptr inbounds float* %tmp1076, i64 1
+  %tmp1078 = getelementptr inbounds float* %tmp1077, i64 1
+  %tmp1079 = getelementptr inbounds float* %tmp1078, i64 1
+  %tmp1080 = getelementptr inbounds float* %tmp1079, i64 1
+  %tmp1081 = getelementptr inbounds float* %tmp1080, i64 1
+  %tmp1082 = getelementptr inbounds float* %tmp1081, i64 1
+  %tmp1083 = getelementptr inbounds float* %tmp1082, i64 1
+  %tmp1084 = getelementptr inbounds float* %tmp1083, i64 1
+  %tmp1085 = getelementptr inbounds float* %tmp1084, i64 1
+  %tmp1086 = getelementptr inbounds float* %tmp1085, i64 1
+  %tmp1087 = getelementptr inbounds float* %tmp1086, i64 1
+  %tmp1088 = getelementptr inbounds float* %tmp1087, i64 1
+  %tmp1089 = getelementptr inbounds float* %tmp1088, i64 1
+  %tmp1090 = getelementptr inbounds float* %tmp1089, i64 1
+  %tmp1091 = getelementptr inbounds float* %tmp1090, i64 1
+  %tmp1092 = getelementptr inbounds float* %tmp1091, i64 1
+  %tmp1093 = getelementptr inbounds float* %tmp1092, i64 1
+  %tmp1094 = getelementptr inbounds float* %tmp1093, i64 1
+  %tmp1095 = getelementptr inbounds float* %tmp1094, i64 1
+  %tmp1096 = getelementptr inbounds float* %tmp1095, i64 1
+  %tmp1097 = getelementptr inbounds float* %tmp1096, i64 1
+  %tmp1098 = getelementptr inbounds float* %tmp1097, i64 1
+  %tmp1099 = getelementptr inbounds float* %tmp1098, i64 1
+  %tmp1100 = getelementptr inbounds float* %tmp1099, i64 1
+  %tmp1101 = getelementptr inbounds float* %tmp1100, i64 1
+  %tmp1102 = getelementptr inbounds float* %tmp1101, i64 1
+  %tmp1103 = getelementptr inbounds float* %tmp1102, i64 1
+  %tmp1104 = getelementptr inbounds float* %tmp1103, i64 1
+  %tmp1105 = getelementptr inbounds float* %tmp1104, i64 1
+  %tmp1106 = getelementptr inbounds float* %tmp1105, i64 1
+  %tmp1107 = getelementptr inbounds float* %tmp1106, i64 1
+  %tmp1108 = getelementptr inbounds float* %tmp1107, i64 1
+  %tmp1109 = getelementptr inbounds float* %tmp1108, i64 1
+  %tmp1110 = getelementptr inbounds float* %tmp1109, i64 1
+  %tmp1111 = getelementptr inbounds float* %tmp1110, i64 1
+  %tmp1112 = getelementptr inbounds float* %tmp1111, i64 1
+  %tmp1113 = getelementptr inbounds float* %tmp1112, i64 1
+  %tmp1114 = getelementptr inbounds float* %tmp1113, i64 1
+  %tmp1115 = getelementptr inbounds float* %tmp1114, i64 1
+  %tmp1116 = getelementptr inbounds float* %tmp1115, i64 1
+  %tmp1117 = getelementptr inbounds float* %tmp1116, i64 1
+  %tmp1118 = getelementptr inbounds float* %tmp1117, i64 1
+  %tmp1119 = getelementptr inbounds float* %tmp1118, i64 1
+  %tmp1120 = getelementptr inbounds float* %tmp1119, i64 1
+  %tmp1121 = getelementptr inbounds float* %tmp1120, i64 1
+  %tmp1122 = getelementptr inbounds float* %tmp1121, i64 1
+  %tmp1123 = getelementptr inbounds float* %tmp1122, i64 1
+  %tmp1124 = getelementptr inbounds float* %tmp1123, i64 1
+  %tmp1125 = getelementptr inbounds float* %tmp1124, i64 1
+  %tmp1126 = getelementptr inbounds float* %tmp1125, i64 1
+  %tmp1127 = getelementptr inbounds float* %tmp1126, i64 1
+  %tmp1128 = getelementptr inbounds float* %tmp1127, i64 1
+  %tmp1129 = getelementptr inbounds float* %tmp1128, i64 1
+  %tmp1130 = getelementptr inbounds float* %tmp1129, i64 1
+  %tmp1131 = getelementptr inbounds float* %tmp1130, i64 1
+  %tmp1132 = getelementptr inbounds float* %tmp1131, i64 1
+  %tmp1133 = getelementptr inbounds float* %tmp1132, i64 1
+  %tmp1134 = getelementptr inbounds float* %tmp1133, i64 1
+  %tmp1135 = getelementptr inbounds float* %tmp1134, i64 1
+  %tmp1136 = getelementptr inbounds float* %tmp1135, i64 1
+  %tmp1137 = getelementptr inbounds float* %tmp1136, i64 1
+  %tmp1138 = getelementptr inbounds float* %tmp1137, i64 1
+  %tmp1139 = getelementptr inbounds float* %tmp1138, i64 1
+  %tmp1140 = getelementptr inbounds float* %tmp1139, i64 1
+  %tmp1141 = getelementptr inbounds float* %tmp1140, i64 1
+  %tmp1142 = getelementptr inbounds float* %tmp1141, i64 1
+  %tmp1143 = getelementptr inbounds float* %tmp1142, i64 1
+  %tmp1144 = getelementptr inbounds float* %tmp1143, i64 1
+  %tmp1145 = getelementptr inbounds float* %tmp1144, i64 1
+  %tmp1146 = getelementptr inbounds float* %tmp1145, i64 1
+  %tmp1147 = getelementptr inbounds float* %tmp1146, i64 1
+  %tmp1148 = getelementptr inbounds float* %tmp1147, i64 1
+  %tmp1149 = getelementptr inbounds float* %tmp1148, i64 1
+  %tmp1150 = getelementptr inbounds float* %tmp1149, i64 1
+  %tmp1151 = getelementptr inbounds float* %tmp1150, i64 1
+  %tmp1152 = getelementptr inbounds float* %tmp1151, i64 1
+  %tmp1153 = getelementptr inbounds float* %tmp1152, i64 1
+  %tmp1154 = getelementptr inbounds float* %tmp1153, i64 1
+  %tmp1155 = getelementptr inbounds float* %tmp1154, i64 1
+  %tmp1156 = getelementptr inbounds float* %tmp1155, i64 1
+  %tmp1157 = getelementptr inbounds float* %tmp1156, i64 1
+  %tmp1158 = getelementptr inbounds float* %tmp1157, i64 1
+  %tmp1159 = getelementptr inbounds float* %tmp1158, i64 1
+  %tmp1160 = getelementptr inbounds float* %tmp1159, i64 1
+  %tmp1161 = getelementptr inbounds float* %tmp1160, i64 1
+  %tmp1162 = getelementptr inbounds float* %tmp1161, i64 1
+  %tmp1163 = getelementptr inbounds float* %tmp1162, i64 1
+  %tmp1164 = getelementptr inbounds float* %tmp1163, i64 1
+  %tmp1165 = getelementptr inbounds float* %tmp1164, i64 1
+  %tmp1166 = getelementptr inbounds float* %tmp1165, i64 1
+  %tmp1167 = getelementptr inbounds float* %tmp1166, i64 1
+  %tmp1168 = getelementptr inbounds float* %tmp1167, i64 1
+  %tmp1169 = getelementptr inbounds float* %tmp1168, i64 1
+  %tmp1170 = getelementptr inbounds float* %tmp1169, i64 1
+  %tmp1171 = getelementptr inbounds float* %tmp1170, i64 1
+  %tmp1172 = getelementptr inbounds float* %tmp1171, i64 1
+  %tmp1173 = getelementptr inbounds float* %tmp1172, i64 1
+  %tmp1174 = getelementptr inbounds float* %tmp1173, i64 1
+  %tmp1175 = getelementptr inbounds float* %tmp1174, i64 1
+  %tmp1176 = getelementptr inbounds float* %tmp1175, i64 1
+  %tmp1177 = getelementptr inbounds float* %tmp1176, i64 1
+  %tmp1178 = getelementptr inbounds float* %tmp1177, i64 1
+  %tmp1179 = getelementptr inbounds float* %tmp1178, i64 1
+  %tmp1180 = getelementptr inbounds float* %tmp1179, i64 1
+  %tmp1181 = getelementptr inbounds float* %tmp1180, i64 1
+  %tmp1182 = getelementptr inbounds float* %tmp1181, i64 1
+  %tmp1183 = getelementptr inbounds float* %tmp1182, i64 1
+  %tmp1184 = getelementptr inbounds float* %tmp1183, i64 1
+  %tmp1185 = getelementptr inbounds float* %tmp1184, i64 1
+  %tmp1186 = getelementptr inbounds float* %tmp1185, i64 1
+  %tmp1187 = getelementptr inbounds float* %tmp1186, i64 1
+  %tmp1188 = getelementptr inbounds float* %tmp1187, i64 1
+  %tmp1189 = getelementptr inbounds float* %tmp1188, i64 1
+  %tmp1190 = getelementptr inbounds float* %tmp1189, i64 1
+  %tmp1191 = getelementptr inbounds float* %tmp1190, i64 1
+  %tmp1192 = getelementptr inbounds float* %tmp1191, i64 1
+  %tmp1193 = getelementptr inbounds float* %tmp1192, i64 1
+  %tmp1194 = getelementptr inbounds float* %tmp1193, i64 1
+  %tmp1195 = getelementptr inbounds float* %tmp1194, i64 1
+  %tmp1196 = getelementptr inbounds float* %tmp1195, i64 1
+  %tmp1197 = getelementptr inbounds float* %tmp1196, i64 1
+  %tmp1198 = getelementptr inbounds float* %tmp1197, i64 1
+  %tmp1199 = getelementptr inbounds float* %tmp1198, i64 1
+  %tmp1200 = getelementptr inbounds float* %tmp1199, i64 1
+  %tmp1201 = getelementptr inbounds float* %tmp1200, i64 1
+  %tmp1202 = getelementptr inbounds float* %tmp1201, i64 1
+  %tmp1203 = getelementptr inbounds float* %tmp1202, i64 1
+  %tmp1204 = getelementptr inbounds float* %tmp1203, i64 1
+  %tmp1205 = getelementptr inbounds float* %tmp1204, i64 1
+  %tmp1206 = getelementptr inbounds float* %tmp1205, i64 1
+  %tmp1207 = getelementptr inbounds float* %tmp1206, i64 1
+  %tmp1208 = getelementptr inbounds float* %tmp1207, i64 1
+  %tmp1209 = getelementptr inbounds float* %tmp1208, i64 1
+  %tmp1210 = getelementptr inbounds float* %tmp1209, i64 1
+  %tmp1211 = getelementptr inbounds float* %tmp1210, i64 1
+  %tmp1212 = getelementptr inbounds float* %tmp1211, i64 1
+  %tmp1213 = getelementptr inbounds float* %tmp1212, i64 1
+  %tmp1214 = getelementptr inbounds float* %tmp1213, i64 1
+  %tmp1215 = getelementptr inbounds float* %tmp1214, i64 1
+  %tmp1216 = getelementptr inbounds float* %tmp1215, i64 1
+  %tmp1217 = getelementptr inbounds float* %tmp1216, i64 1
+  %tmp1218 = getelementptr inbounds float* %tmp1217, i64 1
+  %tmp1219 = getelementptr inbounds float* %tmp1218, i64 1
+  %tmp1220 = getelementptr inbounds float* %tmp1219, i64 1
+  %tmp1221 = getelementptr inbounds float* %tmp1220, i64 1
+  %tmp1222 = getelementptr inbounds float* %tmp1221, i64 1
+  %tmp1223 = getelementptr inbounds float* %tmp1222, i64 1
+  %tmp1224 = getelementptr inbounds float* %tmp1223, i64 1
+  %tmp1225 = getelementptr inbounds float* %tmp1224, i64 1
+  %tmp1226 = getelementptr inbounds float* %tmp1225, i64 1
+  %tmp1227 = getelementptr inbounds float* %tmp1226, i64 1
+  %tmp1228 = getelementptr inbounds float* %tmp1227, i64 1
+  %tmp1229 = getelementptr inbounds float* %tmp1228, i64 1
+  %tmp1230 = getelementptr inbounds float* %tmp1229, i64 1
+  %tmp1231 = getelementptr inbounds float* %tmp1230, i64 1
+  %tmp1232 = getelementptr inbounds float* %tmp1231, i64 1
+  %tmp1233 = getelementptr inbounds float* %tmp1232, i64 1
+  %tmp1234 = getelementptr inbounds float* %tmp1233, i64 1
+  %tmp1235 = getelementptr inbounds float* %tmp1234, i64 1
+  %tmp1236 = getelementptr inbounds float* %tmp1235, i64 1
+  %tmp1237 = getelementptr inbounds float* %tmp1236, i64 1
+  %tmp1238 = getelementptr inbounds float* %tmp1237, i64 1
+  %tmp1239 = getelementptr inbounds float* %tmp1238, i64 1
+  %tmp1240 = getelementptr inbounds float* %tmp1239, i64 1
+  %tmp1241 = getelementptr inbounds float* %tmp1240, i64 1
+  %tmp1242 = getelementptr inbounds float* %tmp1241, i64 1
+  %tmp1243 = getelementptr inbounds float* %tmp1242, i64 1
+  %tmp1244 = getelementptr inbounds float* %tmp1243, i64 1
+  %tmp1245 = getelementptr inbounds float* %tmp1244, i64 1
+  %tmp1246 = getelementptr inbounds float* %tmp1245, i64 1
+  %tmp1247 = getelementptr inbounds float* %tmp1246, i64 1
+  %tmp1248 = getelementptr inbounds float* %tmp1247, i64 1
+  %tmp1249 = getelementptr inbounds float* %tmp1248, i64 1
+  %tmp1250 = getelementptr inbounds float* %tmp1249, i64 1
+  %tmp1251 = getelementptr inbounds float* %tmp1250, i64 1
+  %tmp1252 = getelementptr inbounds float* %tmp1251, i64 1
+  %tmp1253 = getelementptr inbounds float* %tmp1252, i64 1
+  %tmp1254 = getelementptr inbounds float* %tmp1253, i64 1
+  %tmp1255 = getelementptr inbounds float* %tmp1254, i64 1
+  %tmp1256 = getelementptr inbounds float* %tmp1255, i64 1
+  %tmp1257 = getelementptr inbounds float* %tmp1256, i64 1
+  %tmp1258 = getelementptr inbounds float* %tmp1257, i64 1
+  %tmp1259 = getelementptr inbounds float* %tmp1258, i64 1
+  %tmp1260 = getelementptr inbounds float* %tmp1259, i64 1
+  %tmp1261 = getelementptr inbounds float* %tmp1260, i64 1
+  %tmp1262 = getelementptr inbounds float* %tmp1261, i64 1
+  %tmp1263 = getelementptr inbounds float* %tmp1262, i64 1
+  %tmp1264 = getelementptr inbounds float* %tmp1263, i64 1
+  %tmp1265 = getelementptr inbounds float* %tmp1264, i64 1
+  %tmp1266 = getelementptr inbounds float* %tmp1265, i64 1
+  %tmp1267 = getelementptr inbounds float* %tmp1266, i64 1
+  %tmp1268 = getelementptr inbounds float* %tmp1267, i64 1
+  %tmp1269 = getelementptr inbounds float* %tmp1268, i64 1
+  %tmp1270 = getelementptr inbounds float* %tmp1269, i64 1
+  %tmp1271 = getelementptr inbounds float* %tmp1270, i64 1
+  %tmp1272 = getelementptr inbounds float* %tmp1271, i64 1
+  %tmp1273 = getelementptr inbounds float* %tmp1272, i64 1
+  %tmp1274 = getelementptr inbounds float* %tmp1273, i64 1
+  %tmp1275 = getelementptr inbounds float* %tmp1274, i64 1
+  %tmp1276 = getelementptr inbounds float* %tmp1275, i64 1
+  %tmp1277 = getelementptr inbounds float* %tmp1276, i64 1
+  %tmp1278 = getelementptr inbounds float* %tmp1277, i64 1
+  %tmp1279 = getelementptr inbounds float* %tmp1278, i64 1
+  %tmp1280 = getelementptr inbounds float* %tmp1279, i64 1
+  %tmp1281 = getelementptr inbounds float* %tmp1280, i64 1
+  %tmp1282 = getelementptr inbounds float* %tmp1281, i64 1
+  %tmp1283 = getelementptr inbounds float* %tmp1282, i64 1
+  %tmp1284 = getelementptr inbounds float* %tmp1283, i64 1
+  %tmp1285 = getelementptr inbounds float* %tmp1284, i64 1
+  %tmp1286 = getelementptr inbounds float* %tmp1285, i64 1
+  %tmp1287 = getelementptr inbounds float* %tmp1286, i64 1
+  %tmp1288 = getelementptr inbounds float* %tmp1287, i64 1
+  %tmp1289 = getelementptr inbounds float* %tmp1288, i64 1
+  %tmp1290 = getelementptr inbounds float* %tmp1289, i64 1
+  %tmp1291 = getelementptr inbounds float* %tmp1290, i64 1
+  %tmp1292 = getelementptr inbounds float* %tmp1291, i64 1
+  %tmp1293 = getelementptr inbounds float* %tmp1292, i64 1
+  %tmp1294 = getelementptr inbounds float* %tmp1293, i64 1
+  %tmp1295 = getelementptr inbounds float* %tmp1294, i64 1
+  %tmp1296 = getelementptr inbounds float* %tmp1295, i64 1
+  %tmp1297 = getelementptr inbounds float* %tmp1296, i64 1
+  %tmp1298 = getelementptr inbounds float* %tmp1297, i64 1
+  %tmp1299 = getelementptr inbounds float* %tmp1298, i64 1
+  %tmp1300 = getelementptr inbounds float* %tmp1299, i64 1
+  %tmp1301 = getelementptr inbounds float* %tmp1300, i64 1
+  %tmp1302 = getelementptr inbounds float* %tmp1301, i64 1
+  %tmp1303 = getelementptr inbounds float* %tmp1302, i64 1
+  %tmp1304 = getelementptr inbounds float* %tmp1303, i64 1
+  %tmp1305 = getelementptr inbounds float* %tmp1304, i64 1
+  %tmp1306 = getelementptr inbounds float* %tmp1305, i64 1
+  %tmp1307 = getelementptr inbounds float* %tmp1306, i64 1
+  %tmp1308 = getelementptr inbounds float* %tmp1307, i64 1
+  %tmp1309 = getelementptr inbounds float* %tmp1308, i64 1
+  %tmp1310 = getelementptr inbounds float* %tmp1309, i64 1
+  %tmp1311 = getelementptr inbounds float* %tmp1310, i64 1
+  %tmp1312 = getelementptr inbounds float* %tmp1311, i64 1
+  %tmp1313 = getelementptr inbounds float* %tmp1312, i64 1
+  %tmp1314 = getelementptr inbounds float* %tmp1313, i64 1
+  %tmp1315 = getelementptr inbounds float* %tmp1314, i64 1
+  %tmp1316 = getelementptr inbounds float* %tmp1315, i64 1
+  %tmp1317 = getelementptr inbounds float* %tmp1316, i64 1
+  %tmp1318 = getelementptr inbounds float* %tmp1317, i64 1
+  %tmp1319 = getelementptr inbounds float* %tmp1318, i64 1
+  %tmp1320 = getelementptr inbounds float* %tmp1319, i64 1
+  %tmp1321 = getelementptr inbounds float* %tmp1320, i64 1
+  %tmp1322 = getelementptr inbounds float* %tmp1321, i64 1
+  %tmp1323 = getelementptr inbounds float* %tmp1322, i64 1
+  %tmp1324 = getelementptr inbounds float* %tmp1323, i64 1
+  %tmp1325 = getelementptr inbounds float* %tmp1324, i64 1
+  %tmp1326 = getelementptr inbounds float* %tmp1325, i64 1
+  %tmp1327 = getelementptr inbounds float* %tmp1326, i64 1
+  %tmp1328 = getelementptr inbounds float* %tmp1327, i64 1
+  %tmp1329 = getelementptr inbounds float* %tmp1328, i64 1
+  %tmp1330 = getelementptr inbounds float* %tmp1329, i64 1
+  %tmp1331 = getelementptr inbounds float* %tmp1330, i64 1
+  %tmp1332 = getelementptr inbounds float* %tmp1331, i64 1
+  %tmp1333 = getelementptr inbounds float* %tmp1332, i64 1
+  %tmp1334 = getelementptr inbounds float* %tmp1333, i64 1
+  %tmp1335 = getelementptr inbounds float* %tmp1334, i64 1
+  %tmp1336 = getelementptr inbounds float* %tmp1335, i64 1
+  %tmp1337 = getelementptr inbounds float* %tmp1336, i64 1
+  %tmp1338 = getelementptr inbounds float* %tmp1337, i64 1
+  %tmp1339 = getelementptr inbounds float* %tmp1338, i64 1
+  %tmp1340 = getelementptr inbounds float* %tmp1339, i64 1
+  %tmp1341 = getelementptr inbounds float* %tmp1340, i64 1
+  %tmp1342 = getelementptr inbounds float* %tmp1341, i64 1
+  %tmp1343 = getelementptr inbounds float* %tmp1342, i64 1
+  %tmp1344 = getelementptr inbounds float* %tmp1343, i64 1
+  %tmp1345 = getelementptr inbounds float* %tmp1344, i64 1
+  %tmp1346 = getelementptr inbounds float* %tmp1345, i64 1
+  %tmp1347 = getelementptr inbounds float* %tmp1346, i64 1
+  %tmp1348 = getelementptr inbounds float* %tmp1347, i64 1
+  %tmp1349 = getelementptr inbounds float* %tmp1348, i64 1
+  %tmp1350 = getelementptr inbounds float* %tmp1349, i64 1
+  %tmp1351 = getelementptr inbounds float* %tmp1350, i64 1
+  %tmp1352 = getelementptr inbounds float* %tmp1351, i64 1
+  %tmp1353 = getelementptr inbounds float* %tmp1352, i64 1
+  %tmp1354 = getelementptr inbounds float* %tmp1353, i64 1
+  %tmp1355 = getelementptr inbounds float* %tmp1354, i64 1
+  %tmp1356 = getelementptr inbounds float* %tmp1355, i64 1
+  %tmp1357 = getelementptr inbounds float* %tmp1356, i64 1
+  %tmp1358 = getelementptr inbounds float* %tmp1357, i64 1
+  %tmp1359 = getelementptr inbounds float* %tmp1358, i64 1
+  %tmp1360 = getelementptr inbounds float* %tmp1359, i64 1
+  %tmp1361 = getelementptr inbounds float* %tmp1360, i64 1
+  %tmp1362 = getelementptr inbounds float* %tmp1361, i64 1
+  %tmp1363 = getelementptr inbounds float* %tmp1362, i64 1
+  %tmp1364 = getelementptr inbounds float* %tmp1363, i64 1
+  %tmp1365 = getelementptr inbounds float* %tmp1364, i64 1
+  %tmp1366 = getelementptr inbounds float* %tmp1365, i64 1
+  %tmp1367 = getelementptr inbounds float* %tmp1366, i64 1
+  %tmp1368 = getelementptr inbounds float* %tmp1367, i64 1
+  %tmp1369 = getelementptr inbounds float* %tmp1368, i64 1
+  %tmp1370 = getelementptr inbounds float* %tmp1369, i64 1
+  %tmp1371 = getelementptr inbounds float* %tmp1370, i64 1
+  %tmp1372 = getelementptr inbounds float* %tmp1371, i64 1
+  %tmp1373 = getelementptr inbounds float* %tmp1372, i64 1
+  %tmp1374 = getelementptr inbounds float* %tmp1373, i64 1
+  %tmp1375 = getelementptr inbounds float* %tmp1374, i64 1
+  %tmp1376 = getelementptr inbounds float* %tmp1375, i64 1
+  %tmp1377 = getelementptr inbounds float* %tmp1376, i64 1
+  %tmp1378 = getelementptr inbounds float* %tmp1377, i64 1
+  %tmp1379 = getelementptr inbounds float* %tmp1378, i64 1
+  %tmp1380 = getelementptr inbounds float* %tmp1379, i64 1
+  %tmp1381 = getelementptr inbounds float* %tmp1380, i64 1
+  %tmp1382 = getelementptr inbounds float* %tmp1381, i64 1
+  %tmp1383 = getelementptr inbounds float* %tmp1382, i64 1
+  %tmp1384 = getelementptr inbounds float* %tmp1383, i64 1
+  %tmp1385 = getelementptr inbounds float* %tmp1384, i64 1
+  %tmp1386 = getelementptr inbounds float* %tmp1385, i64 1
+  %tmp1387 = getelementptr inbounds float* %tmp1386, i64 1
+  %tmp1388 = getelementptr inbounds float* %tmp1387, i64 1
+  %tmp1389 = getelementptr inbounds float* %tmp1388, i64 1
+  %tmp1390 = getelementptr inbounds float* %tmp1389, i64 1
+  %tmp1391 = getelementptr inbounds float* %tmp1390, i64 1
+  %tmp1392 = getelementptr inbounds float* %tmp1391, i64 1
+  %tmp1393 = getelementptr inbounds float* %tmp1392, i64 1
+  %tmp1394 = getelementptr inbounds float* %tmp1393, i64 1
+  %tmp1395 = getelementptr inbounds float* %tmp1394, i64 1
+  %tmp1396 = getelementptr inbounds float* %tmp1395, i64 1
+  %tmp1397 = getelementptr inbounds float* %tmp1396, i64 1
+  %tmp1398 = getelementptr inbounds float* %tmp1397, i64 1
+  %tmp1399 = getelementptr inbounds float* %tmp1398, i64 1
+  %tmp1400 = getelementptr inbounds float* %tmp1399, i64 1
+  %tmp1401 = getelementptr inbounds float* %tmp1400, i64 1
+  %tmp1402 = getelementptr inbounds float* %tmp1401, i64 1
+  %tmp1403 = getelementptr inbounds float* %tmp1402, i64 1
+  %tmp1404 = getelementptr inbounds float* %tmp1403, i64 1
+  %tmp1405 = getelementptr inbounds float* %tmp1404, i64 1
+  %tmp1406 = getelementptr inbounds float* %tmp1405, i64 1
+  %tmp1407 = getelementptr inbounds float* %tmp1406, i64 1
+  %tmp1408 = getelementptr inbounds float* %tmp1407, i64 1
+  %tmp1409 = getelementptr inbounds float* %tmp1408, i64 1
+  %tmp1410 = getelementptr inbounds float* %tmp1409, i64 1
+  %tmp1411 = getelementptr inbounds float* %tmp1410, i64 1
+  %tmp1412 = getelementptr inbounds float* %tmp1411, i64 1
+  %tmp1413 = getelementptr inbounds float* %tmp1412, i64 1
+  %tmp1414 = getelementptr inbounds float* %tmp1413, i64 1
+  %tmp1415 = getelementptr inbounds float* %tmp1414, i64 1
+  %tmp1416 = getelementptr inbounds float* %tmp1415, i64 1
+  %tmp1417 = getelementptr inbounds float* %tmp1416, i64 1
+  %tmp1418 = getelementptr inbounds float* %tmp1417, i64 1
+  %tmp1419 = getelementptr inbounds float* %tmp1418, i64 1
+  %tmp1420 = getelementptr inbounds float* %tmp1419, i64 1
+  %tmp1421 = getelementptr inbounds float* %tmp1420, i64 1
+  %tmp1422 = getelementptr inbounds float* %tmp1421, i64 1
+  %tmp1423 = getelementptr inbounds float* %tmp1422, i64 1
+  %tmp1424 = getelementptr inbounds float* %tmp1423, i64 1
+  %tmp1425 = getelementptr inbounds float* %tmp1424, i64 1
+  %tmp1426 = getelementptr inbounds float* %tmp1425, i64 1
+  %tmp1427 = getelementptr inbounds float* %tmp1426, i64 1
+  %tmp1428 = getelementptr inbounds float* %tmp1427, i64 1
+  %tmp1429 = getelementptr inbounds float* %tmp1428, i64 1
+  %tmp1430 = getelementptr inbounds float* %tmp1429, i64 1
+  %tmp1431 = getelementptr inbounds float* %tmp1430, i64 1
+  %tmp1432 = getelementptr inbounds float* %tmp1431, i64 1
+  %tmp1433 = getelementptr inbounds float* %tmp1432, i64 1
+  %tmp1434 = getelementptr inbounds float* %tmp1433, i64 1
+  %tmp1435 = getelementptr inbounds float* %tmp1434, i64 1
+  %tmp1436 = getelementptr inbounds float* %tmp1435, i64 1
+  %tmp1437 = getelementptr inbounds float* %tmp1436, i64 1
+  %tmp1438 = getelementptr inbounds float* %tmp1437, i64 1
+  %tmp1439 = getelementptr inbounds float* %tmp1438, i64 1
+  %tmp1440 = getelementptr inbounds float* %tmp1439, i64 1
+  %tmp1441 = getelementptr inbounds float* %tmp1440, i64 1
+  %tmp1442 = getelementptr inbounds float* %tmp1441, i64 1
+  %tmp1443 = getelementptr inbounds float* %tmp1442, i64 1
+  %tmp1444 = getelementptr inbounds float* %tmp1443, i64 1
+  %tmp1445 = getelementptr inbounds float* %tmp1444, i64 1
+  %tmp1446 = getelementptr inbounds float* %tmp1445, i64 1
+  %tmp1447 = getelementptr inbounds float* %tmp1446, i64 1
+  %tmp1448 = getelementptr inbounds float* %tmp1447, i64 1
+  %tmp1449 = getelementptr inbounds float* %tmp1448, i64 1
+  %tmp1450 = getelementptr inbounds float* %tmp1449, i64 1
+  %tmp1451 = getelementptr inbounds float* %tmp1450, i64 1
+  %tmp1452 = getelementptr inbounds float* %tmp1451, i64 1
+  %tmp1453 = getelementptr inbounds float* %tmp1452, i64 1
+  %tmp1454 = getelementptr inbounds float* %tmp1453, i64 1
+  %tmp1455 = getelementptr inbounds float* %tmp1454, i64 1
+  %tmp1456 = getelementptr inbounds float* %tmp1455, i64 1
+  %tmp1457 = getelementptr inbounds float* %tmp1456, i64 1
+  %tmp1458 = getelementptr inbounds float* %tmp1457, i64 1
+  %tmp1459 = getelementptr inbounds float* %tmp1458, i64 1
+  %tmp1460 = getelementptr inbounds float* %tmp1459, i64 1
+  %tmp1461 = getelementptr inbounds float* %tmp1460, i64 1
+  %tmp1462 = getelementptr inbounds float* %tmp1461, i64 1
+  %tmp1463 = getelementptr inbounds float* %tmp1462, i64 1
+  %tmp1464 = getelementptr inbounds float* %tmp1463, i64 1
+  %tmp1465 = getelementptr inbounds float* %tmp1464, i64 1
+  %tmp1466 = getelementptr inbounds float* %tmp1465, i64 1
+  %tmp1467 = getelementptr inbounds float* %tmp1466, i64 1
+  %tmp1468 = getelementptr inbounds float* %tmp1467, i64 1
+  %tmp1469 = getelementptr inbounds float* %tmp1468, i64 1
+  %tmp1470 = getelementptr inbounds float* %tmp1469, i64 1
+  %tmp1471 = getelementptr inbounds float* %tmp1470, i64 1
+  %tmp1472 = getelementptr inbounds float* %tmp1471, i64 1
+  %tmp1473 = getelementptr inbounds float* %tmp1472, i64 1
+  %tmp1474 = getelementptr inbounds float* %tmp1473, i64 1
+  %tmp1475 = getelementptr inbounds float* %tmp1474, i64 1
+  %tmp1476 = getelementptr inbounds float* %tmp1475, i64 1
+  %tmp1477 = getelementptr inbounds float* %tmp1476, i64 1
+  %tmp1478 = getelementptr inbounds float* %tmp1477, i64 1
+  %tmp1479 = getelementptr inbounds float* %tmp1478, i64 1
+  %tmp1480 = getelementptr inbounds float* %tmp1479, i64 1
+  %tmp1481 = getelementptr inbounds float* %tmp1480, i64 1
+  %tmp1482 = getelementptr inbounds float* %tmp1481, i64 1
+  %tmp1483 = getelementptr inbounds float* %tmp1482, i64 1
+  %tmp1484 = getelementptr inbounds float* %tmp1483, i64 1
+  %tmp1485 = getelementptr inbounds float* %tmp1484, i64 1
+  %tmp1486 = getelementptr inbounds float* %tmp1485, i64 1
+  %tmp1487 = getelementptr inbounds float* %tmp1486, i64 1
+  %tmp1488 = getelementptr inbounds float* %tmp1487, i64 1
+  %tmp1489 = getelementptr inbounds float* %tmp1488, i64 1
+  %tmp1490 = getelementptr inbounds float* %tmp1489, i64 1
+  %tmp1491 = getelementptr inbounds float* %tmp1490, i64 1
+  %tmp1492 = getelementptr inbounds float* %tmp1491, i64 1
+  %tmp1493 = getelementptr inbounds float* %tmp1492, i64 1
+  %tmp1494 = getelementptr inbounds float* %tmp1493, i64 1
+  %tmp1495 = getelementptr inbounds float* %tmp1494, i64 1
+  %tmp1496 = getelementptr inbounds float* %tmp1495, i64 1
+  %tmp1497 = getelementptr inbounds float* %tmp1496, i64 1
+  %tmp1498 = getelementptr inbounds float* %tmp1497, i64 1
+  %tmp1499 = getelementptr inbounds float* %tmp1498, i64 1
+  %tmp1500 = getelementptr inbounds float* %tmp1499, i64 1
+  %tmp1501 = getelementptr inbounds float* %tmp1500, i64 1
+  %tmp1502 = getelementptr inbounds float* %tmp1501, i64 1
+  %tmp1503 = getelementptr inbounds float* %tmp1502, i64 1
+  %tmp1504 = getelementptr inbounds float* %tmp1503, i64 1
+  %tmp1505 = getelementptr inbounds float* %tmp1504, i64 1
+  %tmp1506 = getelementptr inbounds float* %tmp1505, i64 1
+  %tmp1507 = getelementptr inbounds float* %tmp1506, i64 1
+  %tmp1508 = getelementptr inbounds float* %tmp1507, i64 1
+  %tmp1509 = getelementptr inbounds float* %tmp1508, i64 1
+  %tmp1510 = getelementptr inbounds float* %tmp1509, i64 1
+  %tmp1511 = getelementptr inbounds float* %tmp1510, i64 1
+  %tmp1512 = getelementptr inbounds float* %tmp1511, i64 1
+  %tmp1513 = getelementptr inbounds float* %tmp1512, i64 1
+  %tmp1514 = getelementptr inbounds float* %tmp1513, i64 1
+  %tmp1515 = getelementptr inbounds float* %tmp1514, i64 1
+  %tmp1516 = getelementptr inbounds float* %tmp1515, i64 1
+  %tmp1517 = getelementptr inbounds float* %tmp1516, i64 1
+  %tmp1518 = getelementptr inbounds float* %tmp1517, i64 1
+  %tmp1519 = getelementptr inbounds float* %tmp1518, i64 1
+  %tmp1520 = getelementptr inbounds float* %tmp1519, i64 1
+  %tmp1521 = getelementptr inbounds float* %tmp1520, i64 1
+  %tmp1522 = getelementptr inbounds float* %tmp1521, i64 1
+  %tmp1523 = getelementptr inbounds float* %tmp1522, i64 1
+  %tmp1524 = getelementptr inbounds float* %tmp1523, i64 1
+  %tmp1525 = getelementptr inbounds float* %tmp1524, i64 1
+  %tmp1526 = getelementptr inbounds float* %tmp1525, i64 1
+  %tmp1527 = getelementptr inbounds float* %tmp1526, i64 1
+  %tmp1528 = getelementptr inbounds float* %tmp1527, i64 1
+  %tmp1529 = getelementptr inbounds float* %tmp1528, i64 1
+  %tmp1530 = getelementptr inbounds float* %tmp1529, i64 1
+  %tmp1531 = getelementptr inbounds float* %tmp1530, i64 1
+  %tmp1532 = getelementptr inbounds float* %tmp1531, i64 1
+  %tmp1533 = getelementptr inbounds float* %tmp1532, i64 1
+  %tmp1534 = getelementptr inbounds float* %tmp1533, i64 1
+  %tmp1535 = getelementptr inbounds float* %tmp1534, i64 1
+  %tmp1536 = getelementptr inbounds float* %tmp1535, i64 1
+  %tmp1537 = getelementptr inbounds float* %tmp1536, i64 1
+  %tmp1538 = getelementptr inbounds float* %tmp1537, i64 1
+  %tmp1539 = getelementptr inbounds float* %tmp1538, i64 1
+  %tmp1540 = getelementptr inbounds float* %tmp1539, i64 1
+  %tmp1541 = getelementptr inbounds float* %tmp1540, i64 1
+  %tmp1542 = getelementptr inbounds float* %tmp1541, i64 1
+  %tmp1543 = getelementptr inbounds float* %tmp1542, i64 1
+  %tmp1544 = getelementptr inbounds float* %tmp1543, i64 1
+  %tmp1545 = getelementptr inbounds float* %tmp1544, i64 1
+  %tmp1546 = getelementptr inbounds float* %tmp1545, i64 1
+  %tmp1547 = getelementptr inbounds float* %tmp1546, i64 1
+  %tmp1548 = getelementptr inbounds float* %tmp1547, i64 1
+  %tmp1549 = getelementptr inbounds float* %tmp1548, i64 1
+  %tmp1550 = getelementptr inbounds float* %tmp1549, i64 1
+  %tmp1551 = getelementptr inbounds float* %tmp1550, i64 1
+  %tmp1552 = getelementptr inbounds float* %tmp1551, i64 1
+  %tmp1553 = getelementptr inbounds float* %tmp1552, i64 1
+  %tmp1554 = getelementptr inbounds float* %tmp1553, i64 1
+  %tmp1555 = getelementptr inbounds float* %tmp1554, i64 1
+  %tmp1556 = getelementptr inbounds float* %tmp1555, i64 1
+  %tmp1557 = getelementptr inbounds float* %tmp1556, i64 1
+  %tmp1558 = getelementptr inbounds float* %tmp1557, i64 1
+  %tmp1559 = getelementptr inbounds float* %tmp1558, i64 1
+  %tmp1560 = getelementptr inbounds float* %tmp1559, i64 1
+  %tmp1561 = getelementptr inbounds float* %tmp1560, i64 1
+  %tmp1562 = getelementptr inbounds float* %tmp1561, i64 1
+  %tmp1563 = getelementptr inbounds float* %tmp1562, i64 1
+  %tmp1564 = getelementptr inbounds float* %tmp1563, i64 1
+  %tmp1565 = getelementptr inbounds float* %tmp1564, i64 1
+  %tmp1566 = getelementptr inbounds float* %tmp1565, i64 1
+  %tmp1567 = getelementptr inbounds float* %tmp1566, i64 1
+  %tmp1568 = getelementptr inbounds float* %tmp1567, i64 1
+  %tmp1569 = getelementptr inbounds float* %tmp1568, i64 1
+  %tmp1570 = getelementptr inbounds float* %tmp1569, i64 1
+  %tmp1571 = getelementptr inbounds float* %tmp1570, i64 1
+  %tmp1572 = getelementptr inbounds float* %tmp1571, i64 1
+  %tmp1573 = getelementptr inbounds float* %tmp1572, i64 1
+  %tmp1574 = getelementptr inbounds float* %tmp1573, i64 1
+  %tmp1575 = getelementptr inbounds float* %tmp1574, i64 1
+  %tmp1576 = getelementptr inbounds float* %tmp1575, i64 1
+  %tmp1577 = getelementptr inbounds float* %tmp1576, i64 1
+  %tmp1578 = getelementptr inbounds float* %tmp1577, i64 1
+  %tmp1579 = getelementptr inbounds float* %tmp1578, i64 1
+  %tmp1580 = getelementptr inbounds float* %tmp1579, i64 1
+  %tmp1581 = getelementptr inbounds float* %tmp1580, i64 1
+  %tmp1582 = getelementptr inbounds float* %tmp1581, i64 1
+  %tmp1583 = getelementptr inbounds float* %tmp1582, i64 1
+  %tmp1584 = getelementptr inbounds float* %tmp1583, i64 1
+  %tmp1585 = getelementptr inbounds float* %tmp1584, i64 1
+  %tmp1586 = getelementptr inbounds float* %tmp1585, i64 1
+  %tmp1587 = getelementptr inbounds float* %tmp1586, i64 1
+  %tmp1588 = getelementptr inbounds float* %tmp1587, i64 1
+  %tmp1589 = getelementptr inbounds float* %tmp1588, i64 1
+  %tmp1590 = getelementptr inbounds float* %tmp1589, i64 1
+  %tmp1591 = getelementptr inbounds float* %tmp1590, i64 1
+  %tmp1592 = getelementptr inbounds float* %tmp1591, i64 1
+  %tmp1593 = getelementptr inbounds float* %tmp1592, i64 1
+  %tmp1594 = getelementptr inbounds float* %tmp1593, i64 1
+  %tmp1595 = getelementptr inbounds float* %tmp1594, i64 1
+  %tmp1596 = getelementptr inbounds float* %tmp1595, i64 1
+  %tmp1597 = getelementptr inbounds float* %tmp1596, i64 1
+  %tmp1598 = getelementptr inbounds float* %tmp1597, i64 1
+  %tmp1599 = getelementptr inbounds float* %tmp1598, i64 1
+  %tmp1600 = getelementptr inbounds float* %tmp1599, i64 1
+  %tmp1601 = getelementptr inbounds float* %tmp1600, i64 1
+  %tmp1602 = getelementptr inbounds float* %tmp1601, i64 1
+  %tmp1603 = getelementptr inbounds float* %tmp1602, i64 1
+  %tmp1604 = getelementptr inbounds float* %tmp1603, i64 1
+  %tmp1605 = getelementptr inbounds float* %tmp1604, i64 1
+  %tmp1606 = getelementptr inbounds float* %tmp1605, i64 1
+  %tmp1607 = getelementptr inbounds float* %tmp1606, i64 1
+  %tmp1608 = getelementptr inbounds float* %tmp1607, i64 1
+  %tmp1609 = getelementptr inbounds float* %tmp1608, i64 1
+  %tmp1610 = getelementptr inbounds float* %tmp1609, i64 1
+  %tmp1611 = getelementptr inbounds float* %tmp1610, i64 1
+  %tmp1612 = getelementptr inbounds float* %tmp1611, i64 1
+  %tmp1613 = getelementptr inbounds float* %tmp1612, i64 1
+  %tmp1614 = getelementptr inbounds float* %tmp1613, i64 1
+  %tmp1615 = getelementptr inbounds float* %tmp1614, i64 1
+  %tmp1616 = getelementptr inbounds float* %tmp1615, i64 1
+  %tmp1617 = getelementptr inbounds float* %tmp1616, i64 1
+  %tmp1618 = getelementptr inbounds float* %tmp1617, i64 1
+  %tmp1619 = getelementptr inbounds float* %tmp1618, i64 1
+  %tmp1620 = getelementptr inbounds float* %tmp1619, i64 1
+  %tmp1621 = getelementptr inbounds float* %tmp1620, i64 1
+  %tmp1622 = getelementptr inbounds float* %tmp1621, i64 1
+  %tmp1623 = getelementptr inbounds float* %tmp1622, i64 1
+  %tmp1624 = getelementptr inbounds float* %tmp1623, i64 1
+  %tmp1625 = getelementptr inbounds float* %tmp1624, i64 1
+  %tmp1626 = getelementptr inbounds float* %tmp1625, i64 1
+  %tmp1627 = getelementptr inbounds float* %tmp1626, i64 1
+  %tmp1628 = getelementptr inbounds float* %tmp1627, i64 1
+  %tmp1629 = getelementptr inbounds float* %tmp1628, i64 1
+  %tmp1630 = getelementptr inbounds float* %tmp1629, i64 1
+  %tmp1631 = getelementptr inbounds float* %tmp1630, i64 1
+  %tmp1632 = getelementptr inbounds float* %tmp1631, i64 1
+  %tmp1633 = getelementptr inbounds float* %tmp1632, i64 1
+  %tmp1634 = getelementptr inbounds float* %tmp1633, i64 1
+  %tmp1635 = getelementptr inbounds float* %tmp1634, i64 1
+  %tmp1636 = getelementptr inbounds float* %tmp1635, i64 1
+  %tmp1637 = getelementptr inbounds float* %tmp1636, i64 1
+  %tmp1638 = getelementptr inbounds float* %tmp1637, i64 1
+  %tmp1639 = getelementptr inbounds float* %tmp1638, i64 1
+  %tmp1640 = getelementptr inbounds float* %tmp1639, i64 1
+  %tmp1641 = getelementptr inbounds float* %tmp1640, i64 1
+  %tmp1642 = getelementptr inbounds float* %tmp1641, i64 1
+  %tmp1643 = getelementptr inbounds float* %tmp1642, i64 1
+  %tmp1644 = getelementptr inbounds float* %tmp1643, i64 1
+  %tmp1645 = getelementptr inbounds float* %tmp1644, i64 1
+  %tmp1646 = getelementptr inbounds float* %tmp1645, i64 1
+  %tmp1647 = getelementptr inbounds float* %tmp1646, i64 1
+  %tmp1648 = getelementptr inbounds float* %tmp1647, i64 1
+  %tmp1649 = getelementptr inbounds float* %tmp1648, i64 1
+  %tmp1650 = getelementptr inbounds float* %tmp1649, i64 1
+  %tmp1651 = getelementptr inbounds float* %tmp1650, i64 1
+  %tmp1652 = getelementptr inbounds float* %tmp1651, i64 1
+  %tmp1653 = getelementptr inbounds float* %tmp1652, i64 1
+  %tmp1654 = getelementptr inbounds float* %tmp1653, i64 1
+  %tmp1655 = getelementptr inbounds float* %tmp1654, i64 1
+  %tmp1656 = getelementptr inbounds float* %tmp1655, i64 1
+  %tmp1657 = getelementptr inbounds float* %tmp1656, i64 1
+  %tmp1658 = getelementptr inbounds float* %tmp1657, i64 1
+  %tmp1659 = getelementptr inbounds float* %tmp1658, i64 1
+  %tmp1660 = getelementptr inbounds float* %tmp1659, i64 1
+  %tmp1661 = getelementptr inbounds float* %tmp1660, i64 1
+  %tmp1662 = getelementptr inbounds float* %tmp1661, i64 1
+  %tmp1663 = getelementptr inbounds float* %tmp1662, i64 1
+  %tmp1664 = getelementptr inbounds float* %tmp1663, i64 1
+  %tmp1665 = getelementptr inbounds float* %tmp1664, i64 1
+  %tmp1666 = getelementptr inbounds float* %tmp1665, i64 1
+  %tmp1667 = getelementptr inbounds float* %tmp1666, i64 1
+  %tmp1668 = getelementptr inbounds float* %tmp1667, i64 1
+  %tmp1669 = getelementptr inbounds float* %tmp1668, i64 1
+  %tmp1670 = getelementptr inbounds float* %tmp1669, i64 1
+  %tmp1671 = getelementptr inbounds float* %tmp1670, i64 1
+  %tmp1672 = getelementptr inbounds float* %tmp1671, i64 1
+  %tmp1673 = getelementptr inbounds float* %tmp1672, i64 1
+  %tmp1674 = getelementptr inbounds float* %tmp1673, i64 1
+  %tmp1675 = getelementptr inbounds float* %tmp1674, i64 1
+  %tmp1676 = getelementptr inbounds float* %tmp1675, i64 1
+  %tmp1677 = getelementptr inbounds float* %tmp1676, i64 1
+  %tmp1678 = getelementptr inbounds float* %tmp1677, i64 1
+  %tmp1679 = getelementptr inbounds float* %tmp1678, i64 1
+  %tmp1680 = getelementptr inbounds float* %tmp1679, i64 1
+  %tmp1681 = getelementptr inbounds float* %tmp1680, i64 1
+  %tmp1682 = getelementptr inbounds float* %tmp1681, i64 1
+  %tmp1683 = getelementptr inbounds float* %tmp1682, i64 1
+  %tmp1684 = getelementptr inbounds float* %tmp1683, i64 1
+  %tmp1685 = getelementptr inbounds float* %tmp1684, i64 1
+  %tmp1686 = getelementptr inbounds float* %tmp1685, i64 1
+  %tmp1687 = getelementptr inbounds float* %tmp1686, i64 1
+  %tmp1688 = getelementptr inbounds float* %tmp1687, i64 1
+  %tmp1689 = getelementptr inbounds float* %tmp1688, i64 1
+  %tmp1690 = getelementptr inbounds float* %tmp1689, i64 1
+  %tmp1691 = getelementptr inbounds float* %tmp1690, i64 1
+  %tmp1692 = getelementptr inbounds float* %tmp1691, i64 1
+  %tmp1693 = getelementptr inbounds float* %tmp1692, i64 1
+  %tmp1694 = getelementptr inbounds float* %tmp1693, i64 1
+  %tmp1695 = getelementptr inbounds float* %tmp1694, i64 1
+  %tmp1696 = getelementptr inbounds float* %tmp1695, i64 1
+  %tmp1697 = getelementptr inbounds float* %tmp1696, i64 1
+  %tmp1698 = getelementptr inbounds float* %tmp1697, i64 1
+  %tmp1699 = getelementptr inbounds float* %tmp1698, i64 1
+  %tmp1700 = getelementptr inbounds float* %tmp1699, i64 1
+  %tmp1701 = getelementptr inbounds float* %tmp1700, i64 1
+  %tmp1702 = getelementptr inbounds float* %tmp1701, i64 1
+  %tmp1703 = getelementptr inbounds float* %tmp1702, i64 1
+  %tmp1704 = getelementptr inbounds float* %tmp1703, i64 1
+  %tmp1705 = getelementptr inbounds float* %tmp1704, i64 1
+  %tmp1706 = getelementptr inbounds float* %tmp1705, i64 1
+  %tmp1707 = getelementptr inbounds float* %tmp1706, i64 1
+  %tmp1708 = getelementptr inbounds float* %tmp1707, i64 1
+  %tmp1709 = getelementptr inbounds float* %tmp1708, i64 1
+  %tmp1710 = getelementptr inbounds float* %tmp1709, i64 1
+  %tmp1711 = getelementptr inbounds float* %tmp1710, i64 1
+  %tmp1712 = getelementptr inbounds float* %tmp1711, i64 1
+  %tmp1713 = getelementptr inbounds float* %tmp1712, i64 1
+  %tmp1714 = getelementptr inbounds float* %tmp1713, i64 1
+  %tmp1715 = getelementptr inbounds float* %tmp1714, i64 1
+  %tmp1716 = getelementptr inbounds float* %tmp1715, i64 1
+  %tmp1717 = getelementptr inbounds float* %tmp1716, i64 1
+  %tmp1718 = getelementptr inbounds float* %tmp1717, i64 1
+  %tmp1719 = getelementptr inbounds float* %tmp1718, i64 1
+  %tmp1720 = getelementptr inbounds float* %tmp1719, i64 1
+  %tmp1721 = getelementptr inbounds float* %tmp1720, i64 1
+  %tmp1722 = getelementptr inbounds float* %tmp1721, i64 1
+  %tmp1723 = getelementptr inbounds float* %tmp1722, i64 1
+  %tmp1724 = getelementptr inbounds float* %tmp1723, i64 1
+  %tmp1725 = getelementptr inbounds float* %tmp1724, i64 1
+  %tmp1726 = getelementptr inbounds float* %tmp1725, i64 1
+  %tmp1727 = getelementptr inbounds float* %tmp1726, i64 1
+  %tmp1728 = getelementptr inbounds float* %tmp1727, i64 1
+  %tmp1729 = getelementptr inbounds float* %tmp1728, i64 1
+  %tmp1730 = getelementptr inbounds float* %tmp1729, i64 1
+  %tmp1731 = getelementptr inbounds float* %tmp1730, i64 1
+  %tmp1732 = getelementptr inbounds float* %tmp1731, i64 1
+  %tmp1733 = getelementptr inbounds float* %tmp1732, i64 1
+  %tmp1734 = getelementptr inbounds float* %tmp1733, i64 1
+  %tmp1735 = getelementptr inbounds float* %tmp1734, i64 1
+  %tmp1736 = getelementptr inbounds float* %tmp1735, i64 1
+  %tmp1737 = getelementptr inbounds float* %tmp1736, i64 1
+  %tmp1738 = getelementptr inbounds float* %tmp1737, i64 1
+  %tmp1739 = getelementptr inbounds float* %tmp1738, i64 1
+  %tmp1740 = getelementptr inbounds float* %tmp1739, i64 1
+  %tmp1741 = getelementptr inbounds float* %tmp1740, i64 1
+  %tmp1742 = getelementptr inbounds float* %tmp1741, i64 1
+  %tmp1743 = getelementptr inbounds float* %tmp1742, i64 1
+  %tmp1744 = getelementptr inbounds float* %tmp1743, i64 1
+  %tmp1745 = getelementptr inbounds float* %tmp1744, i64 1
+  %tmp1746 = getelementptr inbounds float* %tmp1745, i64 1
+  %tmp1747 = getelementptr inbounds float* %tmp1746, i64 1
+  %tmp1748 = getelementptr inbounds float* %tmp1747, i64 1
+  %tmp1749 = getelementptr inbounds float* %tmp1748, i64 1
+  %tmp1750 = getelementptr inbounds float* %tmp1749, i64 1
+  %tmp1751 = getelementptr inbounds float* %tmp1750, i64 1
+  %tmp1752 = getelementptr inbounds float* %tmp1751, i64 1
+  %tmp1753 = getelementptr inbounds float* %tmp1752, i64 1
+  %tmp1754 = getelementptr inbounds float* %tmp1753, i64 1
+  %tmp1755 = getelementptr inbounds float* %tmp1754, i64 1
+  %tmp1756 = getelementptr inbounds float* %tmp1755, i64 1
+  %tmp1757 = getelementptr inbounds float* %tmp1756, i64 1
+  %tmp1758 = getelementptr inbounds float* %tmp1757, i64 1
+  %tmp1759 = getelementptr inbounds float* %tmp1758, i64 1
+  %tmp1760 = getelementptr inbounds float* %tmp1759, i64 1
+  %tmp1761 = getelementptr inbounds float* %tmp1760, i64 1
+  %tmp1762 = getelementptr inbounds float* %tmp1761, i64 1
+  %tmp1763 = getelementptr inbounds float* %tmp1762, i64 1
+  %tmp1764 = getelementptr inbounds float* %tmp1763, i64 1
+  %tmp1765 = getelementptr inbounds float* %tmp1764, i64 1
+  %tmp1766 = getelementptr inbounds float* %tmp1765, i64 1
+  %tmp1767 = getelementptr inbounds float* %tmp1766, i64 1
+  %tmp1768 = getelementptr inbounds float* %tmp1767, i64 1
+  %tmp1769 = getelementptr inbounds float* %tmp1768, i64 1
+  %tmp1770 = getelementptr inbounds float* %tmp1769, i64 1
+  %tmp1771 = getelementptr inbounds float* %tmp1770, i64 1
+  %tmp1772 = getelementptr inbounds float* %tmp1771, i64 1
+  %tmp1773 = getelementptr inbounds float* %tmp1772, i64 1
+  %tmp1774 = getelementptr inbounds float* %tmp1773, i64 1
+  %tmp1775 = getelementptr inbounds float* %tmp1774, i64 1
+  %tmp1776 = getelementptr inbounds float* %tmp1775, i64 1
+  %tmp1777 = getelementptr inbounds float* %tmp1776, i64 1
+  %tmp1778 = getelementptr inbounds float* %tmp1777, i64 1
+  %tmp1779 = getelementptr inbounds float* %tmp1778, i64 1
+  %tmp1780 = getelementptr inbounds float* %tmp1779, i64 1
+  %tmp1781 = getelementptr inbounds float* %tmp1780, i64 1
+  %tmp1782 = getelementptr inbounds float* %tmp1781, i64 1
+  %tmp1783 = getelementptr inbounds float* %tmp1782, i64 1
+  %tmp1784 = getelementptr inbounds float* %tmp1783, i64 1
+  %tmp1785 = getelementptr inbounds float* %tmp1784, i64 1
+  %tmp1786 = getelementptr inbounds float* %tmp1785, i64 1
+  %tmp1787 = getelementptr inbounds float* %tmp1786, i64 1
+  %tmp1788 = getelementptr inbounds float* %tmp1787, i64 1
+  %tmp1789 = getelementptr inbounds float* %tmp1788, i64 1
+  %tmp1790 = getelementptr inbounds float* %tmp1789, i64 1
+  %tmp1791 = getelementptr inbounds float* %tmp1790, i64 1
+  %tmp1792 = getelementptr inbounds float* %tmp1791, i64 1
+  %tmp1793 = getelementptr inbounds float* %tmp1792, i64 1
+  %tmp1794 = getelementptr inbounds float* %tmp1793, i64 1
+  %tmp1795 = getelementptr inbounds float* %tmp1794, i64 1
+  %tmp1796 = getelementptr inbounds float* %tmp1795, i64 1
+  %tmp1797 = getelementptr inbounds float* %tmp1796, i64 1
+  %tmp1798 = getelementptr inbounds float* %tmp1797, i64 1
+  %tmp1799 = getelementptr inbounds float* %tmp1798, i64 1
+  %tmp1800 = getelementptr inbounds float* %tmp1799, i64 1
+  %tmp1801 = getelementptr inbounds float* %tmp1800, i64 1
+  %tmp1802 = getelementptr inbounds float* %tmp1801, i64 1
+  %tmp1803 = getelementptr inbounds float* %tmp1802, i64 1
+  %tmp1804 = getelementptr inbounds float* %tmp1803, i64 1
+  %tmp1805 = getelementptr inbounds float* %tmp1804, i64 1
+  %tmp1806 = getelementptr inbounds float* %tmp1805, i64 1
+  %tmp1807 = getelementptr inbounds float* %tmp1806, i64 1
+  %tmp1808 = getelementptr inbounds float* %tmp1807, i64 1
+  %tmp1809 = getelementptr inbounds float* %tmp1808, i64 1
+  %tmp1810 = getelementptr inbounds float* %tmp1809, i64 1
+  %tmp1811 = getelementptr inbounds float* %tmp1810, i64 1
+  %tmp1812 = getelementptr inbounds float* %tmp1811, i64 1
+  %tmp1813 = getelementptr inbounds float* %tmp1812, i64 1
+  %tmp1814 = getelementptr inbounds float* %tmp1813, i64 1
+  %tmp1815 = getelementptr inbounds float* %tmp1814, i64 1
+  %tmp1816 = getelementptr inbounds float* %tmp1815, i64 1
+  %tmp1817 = getelementptr inbounds float* %tmp1816, i64 1
+  %tmp1818 = getelementptr inbounds float* %tmp1817, i64 1
+  %tmp1819 = getelementptr inbounds float* %tmp1818, i64 1
+  %tmp1820 = getelementptr inbounds float* %tmp1819, i64 1
+  %tmp1821 = getelementptr inbounds float* %tmp1820, i64 1
+  %tmp1822 = getelementptr inbounds float* %tmp1821, i64 1
+  %tmp1823 = getelementptr inbounds float* %tmp1822, i64 1
+  %tmp1824 = getelementptr inbounds float* %tmp1823, i64 1
+  %tmp1825 = getelementptr inbounds float* %tmp1824, i64 1
+  %tmp1826 = getelementptr inbounds float* %tmp1825, i64 1
+  %tmp1827 = getelementptr inbounds float* %tmp1826, i64 1
+  %tmp1828 = getelementptr inbounds float* %tmp1827, i64 1
+  %tmp1829 = getelementptr inbounds float* %tmp1828, i64 1
+  %tmp1830 = getelementptr inbounds float* %tmp1829, i64 1
+  %tmp1831 = getelementptr inbounds float* %tmp1830, i64 1
+  %tmp1832 = getelementptr inbounds float* %tmp1831, i64 1
+  %tmp1833 = getelementptr inbounds float* %tmp1832, i64 1
+  %tmp1834 = getelementptr inbounds float* %tmp1833, i64 1
+  %tmp1835 = getelementptr inbounds float* %tmp1834, i64 1
+  %tmp1836 = getelementptr inbounds float* %tmp1835, i64 1
+  %tmp1837 = getelementptr inbounds float* %tmp1836, i64 1
+  %tmp1838 = getelementptr inbounds float* %tmp1837, i64 1
+  %tmp1839 = getelementptr inbounds float* %tmp1838, i64 1
+  %tmp1840 = getelementptr inbounds float* %tmp1839, i64 1
+  %tmp1841 = getelementptr inbounds float* %tmp1840, i64 1
+  %tmp1842 = getelementptr inbounds float* %tmp1841, i64 1
+  %tmp1843 = getelementptr inbounds float* %tmp1842, i64 1
+  %tmp1844 = getelementptr inbounds float* %tmp1843, i64 1
+  %tmp1845 = getelementptr inbounds float* %tmp1844, i64 1
+  %tmp1846 = getelementptr inbounds float* %tmp1845, i64 1
+  %tmp1847 = getelementptr inbounds float* %tmp1846, i64 1
+  %tmp1848 = getelementptr inbounds float* %tmp1847, i64 1
+  %tmp1849 = getelementptr inbounds float* %tmp1848, i64 1
+  %tmp1850 = getelementptr inbounds float* %tmp1849, i64 1
+  %tmp1851 = getelementptr inbounds float* %tmp1850, i64 1
+  %tmp1852 = getelementptr inbounds float* %tmp1851, i64 1
+  %tmp1853 = getelementptr inbounds float* %tmp1852, i64 1
+  %tmp1854 = getelementptr inbounds float* %tmp1853, i64 1
+  %tmp1855 = getelementptr inbounds float* %tmp1854, i64 1
+  %tmp1856 = getelementptr inbounds float* %tmp1855, i64 1
+  %tmp1857 = getelementptr inbounds float* %tmp1856, i64 1
+  %tmp1858 = getelementptr inbounds float* %tmp1857, i64 1
+  %tmp1859 = getelementptr inbounds float* %tmp1858, i64 1
+  %tmp1860 = getelementptr inbounds float* %tmp1859, i64 1
+  %tmp1861 = getelementptr inbounds float* %tmp1860, i64 1
+  %tmp1862 = getelementptr inbounds float* %tmp1861, i64 1
+  %tmp1863 = getelementptr inbounds float* %tmp1862, i64 1
+  %tmp1864 = getelementptr inbounds float* %tmp1863, i64 1
+  %tmp1865 = getelementptr inbounds float* %tmp1864, i64 1
+  %tmp1866 = getelementptr inbounds float* %tmp1865, i64 1
+  %tmp1867 = getelementptr inbounds float* %tmp1866, i64 1
+  %tmp1868 = getelementptr inbounds float* %tmp1867, i64 1
+  %tmp1869 = getelementptr inbounds float* %tmp1868, i64 1
+  %tmp1870 = getelementptr inbounds float* %tmp1869, i64 1
+  %tmp1871 = getelementptr inbounds float* %tmp1870, i64 1
+  %tmp1872 = getelementptr inbounds float* %tmp1871, i64 1
+  %tmp1873 = getelementptr inbounds float* %tmp1872, i64 1
+  %tmp1874 = getelementptr inbounds float* %tmp1873, i64 1
+  %tmp1875 = getelementptr inbounds float* %tmp1874, i64 1
+  %tmp1876 = getelementptr inbounds float* %tmp1875, i64 1
+  %tmp1877 = getelementptr inbounds float* %tmp1876, i64 1
+  %tmp1878 = getelementptr inbounds float* %tmp1877, i64 1
+  %tmp1879 = getelementptr inbounds float* %tmp1878, i64 1
+  %tmp1880 = getelementptr inbounds float* %tmp1879, i64 1
+  %tmp1881 = getelementptr inbounds float* %tmp1880, i64 1
+  %tmp1882 = getelementptr inbounds float* %tmp1881, i64 1
+  %tmp1883 = getelementptr inbounds float* %tmp1882, i64 1
+  %tmp1884 = getelementptr inbounds float* %tmp1883, i64 1
+  %tmp1885 = getelementptr inbounds float* %tmp1884, i64 1
+  %tmp1886 = getelementptr inbounds float* %tmp1885, i64 1
+  %tmp1887 = getelementptr inbounds float* %tmp1886, i64 1
+  %tmp1888 = getelementptr inbounds float* %tmp1887, i64 1
+  %tmp1889 = getelementptr inbounds float* %tmp1888, i64 1
+  %tmp1890 = getelementptr inbounds float* %tmp1889, i64 1
+  %tmp1891 = getelementptr inbounds float* %tmp1890, i64 1
+  %tmp1892 = getelementptr inbounds float* %tmp1891, i64 1
+  %tmp1893 = getelementptr inbounds float* %tmp1892, i64 1
+  %tmp1894 = getelementptr inbounds float* %tmp1893, i64 1
+  %tmp1895 = getelementptr inbounds float* %tmp1894, i64 1
+  %tmp1896 = getelementptr inbounds float* %tmp1895, i64 1
+  %tmp1897 = getelementptr inbounds float* %tmp1896, i64 1
+  %tmp1898 = getelementptr inbounds float* %tmp1897, i64 1
+  %tmp1899 = getelementptr inbounds float* %tmp1898, i64 1
+  %tmp1900 = getelementptr inbounds float* %tmp1899, i64 1
+  %tmp1901 = getelementptr inbounds float* %tmp1900, i64 1
+  %tmp1902 = getelementptr inbounds float* %tmp1901, i64 1
+  %tmp1903 = getelementptr inbounds float* %tmp1902, i64 1
+  %tmp1904 = getelementptr inbounds float* %tmp1903, i64 1
+  %tmp1905 = getelementptr inbounds float* %tmp1904, i64 1
+  %tmp1906 = getelementptr inbounds float* %tmp1905, i64 1
+  %tmp1907 = getelementptr inbounds float* %tmp1906, i64 1
+  %tmp1908 = getelementptr inbounds float* %tmp1907, i64 1
+  %tmp1909 = getelementptr inbounds float* %tmp1908, i64 1
+  %tmp1910 = getelementptr inbounds float* %tmp1909, i64 1
+  %tmp1911 = getelementptr inbounds float* %tmp1910, i64 1
+  %tmp1912 = getelementptr inbounds float* %tmp1911, i64 1
+  %tmp1913 = getelementptr inbounds float* %tmp1912, i64 1
+  %tmp1914 = getelementptr inbounds float* %tmp1913, i64 1
+  %tmp1915 = getelementptr inbounds float* %tmp1914, i64 1
+  %tmp1916 = getelementptr inbounds float* %tmp1915, i64 1
+  %tmp1917 = getelementptr inbounds float* %tmp1916, i64 1
+  %tmp1918 = getelementptr inbounds float* %tmp1917, i64 1
+  %tmp1919 = getelementptr inbounds float* %tmp1918, i64 1
+  %tmp1920 = getelementptr inbounds float* %tmp1919, i64 1
+  %tmp1921 = getelementptr inbounds float* %tmp1920, i64 1
+  %tmp1922 = getelementptr inbounds float* %tmp1921, i64 1
+  %tmp1923 = getelementptr inbounds float* %tmp1922, i64 1
+  %tmp1924 = getelementptr inbounds float* %tmp1923, i64 1
+  %tmp1925 = getelementptr inbounds float* %tmp1924, i64 1
+  %tmp1926 = getelementptr inbounds float* %tmp1925, i64 1
+  %tmp1927 = getelementptr inbounds float* %tmp1926, i64 1
+  %tmp1928 = getelementptr inbounds float* %tmp1927, i64 1
+  %tmp1929 = getelementptr inbounds float* %tmp1928, i64 1
+  %tmp1930 = getelementptr inbounds float* %tmp1929, i64 1
+  %tmp1931 = getelementptr inbounds float* %tmp1930, i64 1
+  %tmp1932 = getelementptr inbounds float* %tmp1931, i64 1
+  %tmp1933 = getelementptr inbounds float* %tmp1932, i64 1
+  %tmp1934 = getelementptr inbounds float* %tmp1933, i64 1
+  %tmp1935 = getelementptr inbounds float* %tmp1934, i64 1
+  %tmp1936 = getelementptr inbounds float* %tmp1935, i64 1
+  %tmp1937 = getelementptr inbounds float* %tmp1936, i64 1
+  %tmp1938 = getelementptr inbounds float* %tmp1937, i64 1
+  %tmp1939 = getelementptr inbounds float* %tmp1938, i64 1
+  %tmp1940 = getelementptr inbounds float* %tmp1939, i64 1
+  %tmp1941 = getelementptr inbounds float* %tmp1940, i64 1
+  %tmp1942 = getelementptr inbounds float* %tmp1941, i64 1
+  %tmp1943 = getelementptr inbounds float* %tmp1942, i64 1
+  %tmp1944 = getelementptr inbounds float* %tmp1943, i64 1
+  %tmp1945 = getelementptr inbounds float* %tmp1944, i64 1
+  %tmp1946 = getelementptr inbounds float* %tmp1945, i64 1
+  %tmp1947 = getelementptr inbounds float* %tmp1946, i64 1
+  %tmp1948 = getelementptr inbounds float* %tmp1947, i64 1
+  %tmp1949 = getelementptr inbounds float* %tmp1948, i64 1
+  %tmp1950 = getelementptr inbounds float* %tmp1949, i64 1
+  %tmp1951 = getelementptr inbounds float* %tmp1950, i64 1
+  %tmp1952 = getelementptr inbounds float* %tmp1951, i64 1
+  %tmp1953 = getelementptr inbounds float* %tmp1952, i64 1
+  %tmp1954 = getelementptr inbounds float* %tmp1953, i64 1
+  %tmp1955 = getelementptr inbounds float* %tmp1954, i64 1
+  %tmp1956 = getelementptr inbounds float* %tmp1955, i64 1
+  %tmp1957 = getelementptr inbounds float* %tmp1956, i64 1
+  %tmp1958 = getelementptr inbounds float* %tmp1957, i64 1
+  %tmp1959 = getelementptr inbounds float* %tmp1958, i64 1
+  %tmp1960 = getelementptr inbounds float* %tmp1959, i64 1
+  %tmp1961 = getelementptr inbounds float* %tmp1960, i64 1
+  %tmp1962 = getelementptr inbounds float* %tmp1961, i64 1
+  %tmp1963 = getelementptr inbounds float* %tmp1962, i64 1
+  %tmp1964 = getelementptr inbounds float* %tmp1963, i64 1
+  %tmp1965 = getelementptr inbounds float* %tmp1964, i64 1
+  %tmp1966 = getelementptr inbounds float* %tmp1965, i64 1
+  %tmp1967 = getelementptr inbounds float* %tmp1966, i64 1
+  %tmp1968 = getelementptr inbounds float* %tmp1967, i64 1
+  %tmp1969 = getelementptr inbounds float* %tmp1968, i64 1
+  %tmp1970 = getelementptr inbounds float* %tmp1969, i64 1
+  %tmp1971 = getelementptr inbounds float* %tmp1970, i64 1
+  %tmp1972 = getelementptr inbounds float* %tmp1971, i64 1
+  %tmp1973 = getelementptr inbounds float* %tmp1972, i64 1
+  %tmp1974 = getelementptr inbounds float* %tmp1973, i64 1
+  %tmp1975 = getelementptr inbounds float* %tmp1974, i64 1
+  %tmp1976 = getelementptr inbounds float* %tmp1975, i64 1
+  %tmp1977 = getelementptr inbounds float* %tmp1976, i64 1
+  %tmp1978 = getelementptr inbounds float* %tmp1977, i64 1
+  %tmp1979 = getelementptr inbounds float* %tmp1978, i64 1
+  %tmp1980 = getelementptr inbounds float* %tmp1979, i64 1
+  %tmp1981 = getelementptr inbounds float* %tmp1980, i64 1
+  %tmp1982 = getelementptr inbounds float* %tmp1981, i64 1
+  %tmp1983 = getelementptr inbounds float* %tmp1982, i64 1
+  %tmp1984 = getelementptr inbounds float* %tmp1983, i64 1
+  %tmp1985 = getelementptr inbounds float* %tmp1984, i64 1
+  %tmp1986 = getelementptr inbounds float* %tmp1985, i64 1
+  %tmp1987 = getelementptr inbounds float* %tmp1986, i64 1
+  %tmp1988 = getelementptr inbounds float* %tmp1987, i64 1
+  %tmp1989 = getelementptr inbounds float* %tmp1988, i64 1
+  %tmp1990 = getelementptr inbounds float* %tmp1989, i64 1
+  %tmp1991 = getelementptr inbounds float* %tmp1990, i64 1
+  %tmp1992 = getelementptr inbounds float* %tmp1991, i64 1
+  %tmp1993 = getelementptr inbounds float* %tmp1992, i64 1
+  %tmp1994 = getelementptr inbounds float* %tmp1993, i64 1
+  %tmp1995 = getelementptr inbounds float* %tmp1994, i64 1
+  %tmp1996 = getelementptr inbounds float* %tmp1995, i64 1
+  %tmp1997 = getelementptr inbounds float* %tmp1996, i64 1
+  %tmp1998 = getelementptr inbounds float* %tmp1997, i64 1
+  %tmp1999 = getelementptr inbounds float* %tmp1998, i64 1
+  %tmp2000 = getelementptr inbounds float* %tmp1999, i64 1
+  %tmp2001 = getelementptr inbounds float* %tmp2000, i64 1
+  %tmp2002 = getelementptr inbounds float* %tmp2001, i64 1
+  %tmp2003 = getelementptr inbounds float* %tmp2002, i64 1
+  %tmp2004 = getelementptr inbounds float* %tmp2003, i64 1
+  %tmp2005 = getelementptr inbounds float* %tmp2004, i64 1
+  %tmp2006 = getelementptr inbounds float* %tmp2005, i64 1
+  %tmp2007 = getelementptr inbounds float* %tmp2006, i64 1
+  %tmp2008 = getelementptr inbounds float* %tmp2007, i64 1
+  %tmp2009 = getelementptr inbounds float* %tmp2008, i64 1
+  %tmp2010 = getelementptr inbounds float* %tmp2009, i64 1
+  %tmp2011 = getelementptr inbounds float* %tmp2010, i64 1
+  %tmp2012 = getelementptr inbounds float* %tmp2011, i64 1
+  %tmp2013 = getelementptr inbounds float* %tmp2012, i64 1
+  %tmp2014 = getelementptr inbounds float* %tmp2013, i64 1
+  %tmp2015 = getelementptr inbounds float* %tmp2014, i64 1
+  %tmp2016 = getelementptr inbounds float* %tmp2015, i64 1
+  %tmp2017 = getelementptr inbounds float* %tmp2016, i64 1
+  %tmp2018 = getelementptr inbounds float* %tmp2017, i64 1
+  %tmp2019 = getelementptr inbounds float* %tmp2018, i64 1
+  %tmp2020 = getelementptr inbounds float* %tmp2019, i64 1
+  %tmp2021 = getelementptr inbounds float* %tmp2020, i64 1
+  %tmp2022 = getelementptr inbounds float* %tmp2021, i64 1
+  %tmp2023 = getelementptr inbounds float* %tmp2022, i64 1
+  %tmp2024 = getelementptr inbounds float* %tmp2023, i64 1
+  %tmp2025 = getelementptr inbounds float* %tmp2024, i64 1
+  %tmp2026 = getelementptr inbounds float* %tmp2025, i64 1
+  %tmp2027 = getelementptr inbounds float* %tmp2026, i64 1
+  %tmp2028 = getelementptr inbounds float* %tmp2027, i64 1
+  %tmp2029 = getelementptr inbounds float* %tmp2028, i64 1
+  %tmp2030 = getelementptr inbounds float* %tmp2029, i64 1
+  %tmp2031 = getelementptr inbounds float* %tmp2030, i64 1
+  %tmp2032 = getelementptr inbounds float* %tmp2031, i64 1
+  %tmp2033 = getelementptr inbounds float* %tmp2032, i64 1
+  %tmp2034 = getelementptr inbounds float* %tmp2033, i64 1
+  %tmp2035 = getelementptr inbounds float* %tmp2034, i64 1
+  %tmp2036 = getelementptr inbounds float* %tmp2035, i64 1
+  %tmp2037 = getelementptr inbounds float* %tmp2036, i64 1
+  %tmp2038 = getelementptr inbounds float* %tmp2037, i64 1
+  %tmp2039 = getelementptr inbounds float* %tmp2038, i64 1
+  %tmp2040 = getelementptr inbounds float* %tmp2039, i64 1
+  %tmp2041 = getelementptr inbounds float* %tmp2040, i64 1
+  %tmp2042 = getelementptr inbounds float* %tmp2041, i64 1
+  %tmp2043 = getelementptr inbounds float* %tmp2042, i64 1
+  %tmp2044 = getelementptr inbounds float* %tmp2043, i64 1
+  %tmp2045 = getelementptr inbounds float* %tmp2044, i64 1
+  %tmp2046 = getelementptr inbounds float* %tmp2045, i64 1
+  %tmp2047 = getelementptr inbounds float* %tmp2046, i64 1
+  %tmp2048 = getelementptr inbounds float* %tmp2047, i64 1
+  %tmp2049 = getelementptr inbounds float* %tmp2048, i64 1
+  %tmp2050 = getelementptr inbounds float* %tmp2049, i64 1
+  %tmp2051 = getelementptr inbounds float* %tmp2050, i64 1
+  %tmp2052 = getelementptr inbounds float* %tmp2051, i64 1
+  %tmp2053 = getelementptr inbounds float* %tmp2052, i64 1
+  %tmp2054 = getelementptr inbounds float* %tmp2053, i64 1
+  %tmp2055 = getelementptr inbounds float* %tmp2054, i64 1
+  %tmp2056 = getelementptr inbounds float* %tmp2055, i64 1
+  %tmp2057 = getelementptr inbounds float* %tmp2056, i64 1
+  %tmp2058 = getelementptr inbounds float* %tmp2057, i64 1
+  %tmp2059 = getelementptr inbounds float* %tmp2058, i64 1
+  %tmp2060 = getelementptr inbounds float* %tmp2059, i64 1
+  %tmp2061 = getelementptr inbounds float* %tmp2060, i64 1
+  %tmp2062 = getelementptr inbounds float* %tmp2061, i64 1
+  %tmp2063 = getelementptr inbounds float* %tmp2062, i64 1
+  %tmp2064 = getelementptr inbounds float* %tmp2063, i64 1
+  %tmp2065 = getelementptr inbounds float* %tmp2064, i64 1
+  %tmp2066 = getelementptr inbounds float* %tmp2065, i64 1
+  %tmp2067 = getelementptr inbounds float* %tmp2066, i64 1
+  %tmp2068 = getelementptr inbounds float* %tmp2067, i64 1
+  %tmp2069 = getelementptr inbounds float* %tmp2068, i64 1
+  %tmp2070 = getelementptr inbounds float* %tmp2069, i64 1
+  %tmp2071 = getelementptr inbounds float* %tmp2070, i64 1
+  %tmp2072 = getelementptr inbounds float* %tmp2071, i64 1
+  %tmp2073 = getelementptr inbounds float* %tmp2072, i64 1
+  %tmp2074 = getelementptr inbounds float* %tmp2073, i64 1
+  %tmp2075 = getelementptr inbounds float* %tmp2074, i64 1
+  %tmp2076 = getelementptr inbounds float* %tmp2075, i64 1
+  %tmp2077 = getelementptr inbounds float* %tmp2076, i64 1
+  %tmp2078 = getelementptr inbounds float* %tmp2077, i64 1
+  %tmp2079 = getelementptr inbounds float* %tmp2078, i64 1
+  %tmp2080 = getelementptr inbounds float* %tmp2079, i64 1
+  %tmp2081 = getelementptr inbounds float* %tmp2080, i64 1
+  %tmp2082 = getelementptr inbounds float* %tmp2081, i64 1
+  %tmp2083 = getelementptr inbounds float* %tmp2082, i64 1
+  %tmp2084 = getelementptr inbounds float* %tmp2083, i64 1
+  %tmp2085 = getelementptr inbounds float* %tmp2084, i64 1
+  %tmp2086 = getelementptr inbounds float* %tmp2085, i64 1
+  %tmp2087 = getelementptr inbounds float* %tmp2086, i64 1
+  %tmp2088 = getelementptr inbounds float* %tmp2087, i64 1
+  %tmp2089 = getelementptr inbounds float* %tmp2088, i64 1
+  %tmp2090 = getelementptr inbounds float* %tmp2089, i64 1
+  %tmp2091 = getelementptr inbounds float* %tmp2090, i64 1
+  %tmp2092 = getelementptr inbounds float* %tmp2091, i64 1
+  %tmp2093 = getelementptr inbounds float* %tmp2092, i64 1
+  %tmp2094 = getelementptr inbounds float* %tmp2093, i64 1
+  %tmp2095 = getelementptr inbounds float* %tmp2094, i64 1
+  %tmp2096 = getelementptr inbounds float* %tmp2095, i64 1
+  %tmp2097 = getelementptr inbounds float* %tmp2096, i64 1
+  %tmp2098 = getelementptr inbounds float* %tmp2097, i64 1
+  %tmp2099 = getelementptr inbounds float* %tmp2098, i64 1
+  %tmp2100 = getelementptr inbounds float* %tmp2099, i64 1
+  %tmp2101 = getelementptr inbounds float* %tmp2100, i64 1
+  %tmp2102 = getelementptr inbounds float* %tmp2101, i64 1
+  %tmp2103 = getelementptr inbounds float* %tmp2102, i64 1
+  %tmp2104 = getelementptr inbounds float* %tmp2103, i64 1
+  %tmp2105 = getelementptr inbounds float* %tmp2104, i64 1
+  %tmp2106 = getelementptr inbounds float* %tmp2105, i64 1
+  %tmp2107 = getelementptr inbounds float* %tmp2106, i64 1
+  %tmp2108 = getelementptr inbounds float* %tmp2107, i64 1
+  %tmp2109 = getelementptr inbounds float* %tmp2108, i64 1
+  %tmp2110 = getelementptr inbounds float* %tmp2109, i64 1
+  %tmp2111 = getelementptr inbounds float* %tmp2110, i64 1
+  %tmp2112 = getelementptr inbounds float* %tmp2111, i64 1
+  %tmp2113 = getelementptr inbounds float* %tmp2112, i64 1
+  %tmp2114 = getelementptr inbounds float* %tmp2113, i64 1
+  %tmp2115 = getelementptr inbounds float* %tmp2114, i64 1
+  %tmp2116 = getelementptr inbounds float* %tmp2115, i64 1
+  %tmp2117 = getelementptr inbounds float* %tmp2116, i64 1
+  %tmp2118 = getelementptr inbounds float* %tmp2117, i64 1
+  %tmp2119 = getelementptr inbounds float* %tmp2118, i64 1
+  %tmp2120 = getelementptr inbounds float* %tmp2119, i64 1
+  %tmp2121 = getelementptr inbounds float* %tmp2120, i64 1
+  %tmp2122 = getelementptr inbounds float* %tmp2121, i64 1
+  %tmp2123 = getelementptr inbounds float* %tmp2122, i64 1
+  %tmp2124 = getelementptr inbounds float* %tmp2123, i64 1
+  %tmp2125 = getelementptr inbounds float* %tmp2124, i64 1
+  %tmp2126 = getelementptr inbounds float* %tmp2125, i64 1
+  %tmp2127 = getelementptr inbounds float* %tmp2126, i64 1
+  %tmp2128 = getelementptr inbounds float* %tmp2127, i64 1
+  %tmp2129 = getelementptr inbounds float* %tmp2128, i64 1
+  %tmp2130 = getelementptr inbounds float* %tmp2129, i64 1
+  %tmp2131 = getelementptr inbounds float* %tmp2130, i64 1
+  %tmp2132 = getelementptr inbounds float* %tmp2131, i64 1
+  %tmp2133 = getelementptr inbounds float* %tmp2132, i64 1
+  %tmp2134 = getelementptr inbounds float* %tmp2133, i64 1
+  %tmp2135 = getelementptr inbounds float* %tmp2134, i64 1
+  %tmp2136 = getelementptr inbounds float* %tmp2135, i64 1
+  %tmp2137 = getelementptr inbounds float* %tmp2136, i64 1
+  %tmp2138 = getelementptr inbounds float* %tmp2137, i64 1
+  %tmp2139 = getelementptr inbounds float* %tmp2138, i64 1
+  %tmp2140 = getelementptr inbounds float* %tmp2139, i64 1
+  %tmp2141 = getelementptr inbounds float* %tmp2140, i64 1
+  %tmp2142 = getelementptr inbounds float* %tmp2141, i64 1
+  %tmp2143 = getelementptr inbounds float* %tmp2142, i64 1
+  %tmp2144 = getelementptr inbounds float* %tmp2143, i64 1
+  %tmp2145 = getelementptr inbounds float* %tmp2144, i64 1
+  %tmp2146 = getelementptr inbounds float* %tmp2145, i64 1
+  %tmp2147 = getelementptr inbounds float* %tmp2146, i64 1
+  %tmp2148 = getelementptr inbounds float* %tmp2147, i64 1
+  %tmp2149 = getelementptr inbounds float* %tmp2148, i64 1
+  %tmp2150 = getelementptr inbounds float* %tmp2149, i64 1
+  %tmp2151 = getelementptr inbounds float* %tmp2150, i64 1
+  %tmp2152 = getelementptr inbounds float* %tmp2151, i64 1
+  %tmp2153 = getelementptr inbounds float* %tmp2152, i64 1
+  %tmp2154 = getelementptr inbounds float* %tmp2153, i64 1
+  %tmp2155 = getelementptr inbounds float* %tmp2154, i64 1
+  %tmp2156 = getelementptr inbounds float* %tmp2155, i64 1
+  %tmp2157 = getelementptr inbounds float* %tmp2156, i64 1
+  %tmp2158 = getelementptr inbounds float* %tmp2157, i64 1
+  %tmp2159 = getelementptr inbounds float* %tmp2158, i64 1
+  %tmp2160 = getelementptr inbounds float* %tmp2159, i64 1
+  %tmp2161 = getelementptr inbounds float* %tmp2160, i64 1
+  %tmp2162 = getelementptr inbounds float* %tmp2161, i64 1
+  %tmp2163 = getelementptr inbounds float* %tmp2162, i64 1
+  %tmp2164 = getelementptr inbounds float* %tmp2163, i64 1
+  %tmp2165 = getelementptr inbounds float* %tmp2164, i64 1
+  %tmp2166 = getelementptr inbounds float* %tmp2165, i64 1
+  %tmp2167 = getelementptr inbounds float* %tmp2166, i64 1
+  %tmp2168 = getelementptr inbounds float* %tmp2167, i64 1
+  %tmp2169 = getelementptr inbounds float* %tmp2168, i64 1
+  %tmp2170 = getelementptr inbounds float* %tmp2169, i64 1
+  %tmp2171 = getelementptr inbounds float* %tmp2170, i64 1
+  %tmp2172 = getelementptr inbounds float* %tmp2171, i64 1
+  %tmp2173 = getelementptr inbounds float* %tmp2172, i64 1
+  %tmp2174 = getelementptr inbounds float* %tmp2173, i64 1
+  %tmp2175 = getelementptr inbounds float* %tmp2174, i64 1
+  %tmp2176 = getelementptr inbounds float* %tmp2175, i64 1
+  %tmp2177 = getelementptr inbounds float* %tmp2176, i64 1
+  %tmp2178 = getelementptr inbounds float* %tmp2177, i64 1
+  %tmp2179 = getelementptr inbounds float* %tmp2178, i64 1
+  %tmp2180 = getelementptr inbounds float* %tmp2179, i64 1
+  %tmp2181 = getelementptr inbounds float* %tmp2180, i64 1
+  %tmp2182 = getelementptr inbounds float* %tmp2181, i64 1
+  %tmp2183 = getelementptr inbounds float* %tmp2182, i64 1
+  %tmp2184 = getelementptr inbounds float* %tmp2183, i64 1
+  %tmp2185 = getelementptr inbounds float* %tmp2184, i64 1
+  %tmp2186 = getelementptr inbounds float* %tmp2185, i64 1
+  %tmp2187 = getelementptr inbounds float* %tmp2186, i64 1
+  %tmp2188 = getelementptr inbounds float* %tmp2187, i64 1
+  %tmp2189 = getelementptr inbounds float* %tmp2188, i64 1
+  %tmp2190 = getelementptr inbounds float* %tmp2189, i64 1
+  %tmp2191 = getelementptr inbounds float* %tmp2190, i64 1
+  %tmp2192 = getelementptr inbounds float* %tmp2191, i64 1
+  %tmp2193 = getelementptr inbounds float* %tmp2192, i64 1
+  %tmp2194 = getelementptr inbounds float* %tmp2193, i64 1
+  %tmp2195 = getelementptr inbounds float* %tmp2194, i64 1
+  %tmp2196 = getelementptr inbounds float* %tmp2195, i64 1
+  %tmp2197 = getelementptr inbounds float* %tmp2196, i64 1
+  %tmp2198 = getelementptr inbounds float* %tmp2197, i64 1
+  %tmp2199 = getelementptr inbounds float* %tmp2198, i64 1
+  %tmp2200 = getelementptr inbounds float* %tmp2199, i64 1
+  %tmp2201 = getelementptr inbounds float* %tmp2200, i64 1
+  %tmp2202 = getelementptr inbounds float* %tmp2201, i64 1
+  %tmp2203 = getelementptr inbounds float* %tmp2202, i64 1
+  %tmp2204 = getelementptr inbounds float* %tmp2203, i64 1
+  %tmp2205 = getelementptr inbounds float* %tmp2204, i64 1
+  %tmp2206 = getelementptr inbounds float* %tmp2205, i64 1
+  %tmp2207 = getelementptr inbounds float* %tmp2206, i64 1
+  %tmp2208 = getelementptr inbounds float* %tmp2207, i64 1
+  %tmp2209 = getelementptr inbounds float* %tmp2208, i64 1
+  %tmp2210 = getelementptr inbounds float* %tmp2209, i64 1
+  %tmp2211 = getelementptr inbounds float* %tmp2210, i64 1
+  %tmp2212 = getelementptr inbounds float* %tmp2211, i64 1
+  %tmp2213 = getelementptr inbounds float* %tmp2212, i64 1
+  %tmp2214 = getelementptr inbounds float* %tmp2213, i64 1
+  %tmp2215 = getelementptr inbounds float* %tmp2214, i64 1
+  %tmp2216 = getelementptr inbounds float* %tmp2215, i64 1
+  %tmp2217 = getelementptr inbounds float* %tmp2216, i64 1
+  %tmp2218 = getelementptr inbounds float* %tmp2217, i64 1
+  %tmp2219 = getelementptr inbounds float* %tmp2218, i64 1
+  %tmp2220 = getelementptr inbounds float* %tmp2219, i64 1
+  %tmp2221 = getelementptr inbounds float* %tmp2220, i64 1
+  %tmp2222 = getelementptr inbounds float* %tmp2221, i64 1
+  %tmp2223 = getelementptr inbounds float* %tmp2222, i64 1
+  %tmp2224 = getelementptr inbounds float* %tmp2223, i64 1
+  %tmp2225 = getelementptr inbounds float* %tmp2224, i64 1
+  %tmp2226 = getelementptr inbounds float* %tmp2225, i64 1
+  %tmp2227 = getelementptr inbounds float* %tmp2226, i64 1
+  %tmp2228 = getelementptr inbounds float* %tmp2227, i64 1
+  %tmp2229 = getelementptr inbounds float* %tmp2228, i64 1
+  %tmp2230 = getelementptr inbounds float* %tmp2229, i64 1
+  %tmp2231 = getelementptr inbounds float* %tmp2230, i64 1
+  %tmp2232 = getelementptr inbounds float* %tmp2231, i64 1
+  %tmp2233 = getelementptr inbounds float* %tmp2232, i64 1
+  %tmp2234 = getelementptr inbounds float* %tmp2233, i64 1
+  %tmp2235 = getelementptr inbounds float* %tmp2234, i64 1
+  %tmp2236 = getelementptr inbounds float* %tmp2235, i64 1
+  %tmp2237 = getelementptr inbounds float* %tmp2236, i64 1
+  %tmp2238 = getelementptr inbounds float* %tmp2237, i64 1
+  %tmp2239 = getelementptr inbounds float* %tmp2238, i64 1
+  %tmp2240 = getelementptr inbounds float* %tmp2239, i64 1
+  %tmp2241 = getelementptr inbounds float* %tmp2240, i64 1
+  %tmp2242 = getelementptr inbounds float* %tmp2241, i64 1
+  %tmp2243 = getelementptr inbounds float* %tmp2242, i64 1
+  %tmp2244 = getelementptr inbounds float* %tmp2243, i64 1
+  %tmp2245 = getelementptr inbounds float* %tmp2244, i64 1
+  %tmp2246 = getelementptr inbounds float* %tmp2245, i64 1
+  %tmp2247 = getelementptr inbounds float* %tmp2246, i64 1
+  %tmp2248 = getelementptr inbounds float* %tmp2247, i64 1
+  %tmp2249 = getelementptr inbounds float* %tmp2248, i64 1
+  %tmp2250 = getelementptr inbounds float* %tmp2249, i64 1
+  %tmp2251 = getelementptr inbounds float* %tmp2250, i64 1
+  %tmp2252 = getelementptr inbounds float* %tmp2251, i64 1
+  %tmp2253 = getelementptr inbounds float* %tmp2252, i64 1
+  %tmp2254 = getelementptr inbounds float* %tmp2253, i64 1
+  %tmp2255 = getelementptr inbounds float* %tmp2254, i64 1
+  %tmp2256 = getelementptr inbounds float* %tmp2255, i64 1
+  %tmp2257 = getelementptr inbounds float* %tmp2256, i64 1
+  %tmp2258 = getelementptr inbounds float* %tmp2257, i64 1
+  %tmp2259 = getelementptr inbounds float* %tmp2258, i64 1
+  %tmp2260 = getelementptr inbounds float* %tmp2259, i64 1
+  %tmp2261 = getelementptr inbounds float* %tmp2260, i64 1
+  %tmp2262 = getelementptr inbounds float* %tmp2261, i64 1
+  %tmp2263 = getelementptr inbounds float* %tmp2262, i64 1
+  %tmp2264 = getelementptr inbounds float* %tmp2263, i64 1
+  %tmp2265 = getelementptr inbounds float* %tmp2264, i64 1
+  %tmp2266 = getelementptr inbounds float* %tmp2265, i64 1
+  %tmp2267 = getelementptr inbounds float* %tmp2266, i64 1
+  %tmp2268 = getelementptr inbounds float* %tmp2267, i64 1
+  %tmp2269 = getelementptr inbounds float* %tmp2268, i64 1
+  %tmp2270 = getelementptr inbounds float* %tmp2269, i64 1
+  %tmp2271 = getelementptr inbounds float* %tmp2270, i64 1
+  %tmp2272 = getelementptr inbounds float* %tmp2271, i64 1
+  %tmp2273 = getelementptr inbounds float* %tmp2272, i64 1
+  %tmp2274 = getelementptr inbounds float* %tmp2273, i64 1
+  %tmp2275 = getelementptr inbounds float* %tmp2274, i64 1
+  %tmp2276 = getelementptr inbounds float* %tmp2275, i64 1
+  %tmp2277 = getelementptr inbounds float* %tmp2276, i64 1
+  %tmp2278 = getelementptr inbounds float* %tmp2277, i64 1
+  %tmp2279 = getelementptr inbounds float* %tmp2278, i64 1
+  %tmp2280 = getelementptr inbounds float* %tmp2279, i64 1
+  %tmp2281 = getelementptr inbounds float* %tmp2280, i64 1
+  %tmp2282 = getelementptr inbounds float* %tmp2281, i64 1
+  %tmp2283 = getelementptr inbounds float* %tmp2282, i64 1
+  %tmp2284 = getelementptr inbounds float* %tmp2283, i64 1
+  %tmp2285 = getelementptr inbounds float* %tmp2284, i64 1
+  %tmp2286 = getelementptr inbounds float* %tmp2285, i64 1
+  %tmp2287 = getelementptr inbounds float* %tmp2286, i64 1
+  %tmp2288 = getelementptr inbounds float* %tmp2287, i64 1
+  %tmp2289 = getelementptr inbounds float* %tmp2288, i64 1
+  %tmp2290 = getelementptr inbounds float* %tmp2289, i64 1
+  %tmp2291 = getelementptr inbounds float* %tmp2290, i64 1
+  %tmp2292 = getelementptr inbounds float* %tmp2291, i64 1
+  %tmp2293 = getelementptr inbounds float* %tmp2292, i64 1
+  %tmp2294 = getelementptr inbounds float* %tmp2293, i64 1
+  %tmp2295 = getelementptr inbounds float* %tmp2294, i64 1
+  %tmp2296 = getelementptr inbounds float* %tmp2295, i64 1
+  %tmp2297 = getelementptr inbounds float* %tmp2296, i64 1
+  %tmp2298 = getelementptr inbounds float* %tmp2297, i64 1
+  %tmp2299 = getelementptr inbounds float* %tmp2298, i64 1
+  %tmp2300 = getelementptr inbounds float* %tmp2299, i64 1
+  %tmp2301 = getelementptr inbounds float* %tmp2300, i64 1
+  %tmp2302 = getelementptr inbounds float* %tmp2301, i64 1
+  %tmp2303 = getelementptr inbounds float* %tmp2302, i64 1
+  %tmp2304 = getelementptr inbounds float* %tmp2303, i64 1
+  %tmp2305 = getelementptr inbounds float* %tmp2304, i64 1
+  %tmp2306 = getelementptr inbounds float* %tmp2305, i64 1
+  %tmp2307 = getelementptr inbounds float* %tmp2306, i64 1
+  %tmp2308 = getelementptr inbounds float* %tmp2307, i64 1
+  %tmp2309 = getelementptr inbounds float* %tmp2308, i64 1
+  %tmp2310 = getelementptr inbounds float* %tmp2309, i64 1
+  %tmp2311 = getelementptr inbounds float* %tmp2310, i64 1
+  %tmp2312 = getelementptr inbounds float* %tmp2311, i64 1
+  %tmp2313 = getelementptr inbounds float* %tmp2312, i64 1
+  %tmp2314 = getelementptr inbounds float* %tmp2313, i64 1
+  %tmp2315 = getelementptr inbounds float* %tmp2314, i64 1
+  %tmp2316 = getelementptr inbounds float* %tmp2315, i64 1
+  %tmp2317 = getelementptr inbounds float* %tmp2316, i64 1
+  %tmp2318 = getelementptr inbounds float* %tmp2317, i64 1
+  %tmp2319 = getelementptr inbounds float* %tmp2318, i64 1
+  %tmp2320 = getelementptr inbounds float* %tmp2319, i64 1
+  %tmp2321 = getelementptr inbounds float* %tmp2320, i64 1
+  %tmp2322 = getelementptr inbounds float* %tmp2321, i64 1
+  %tmp2323 = getelementptr inbounds float* %tmp2322, i64 1
+  %tmp2324 = getelementptr inbounds float* %tmp2323, i64 1
+  %tmp2325 = getelementptr inbounds float* %tmp2324, i64 1
+  %tmp2326 = getelementptr inbounds float* %tmp2325, i64 1
+  %tmp2327 = getelementptr inbounds float* %tmp2326, i64 1
+  %tmp2328 = getelementptr inbounds float* %tmp2327, i64 1
+  %tmp2329 = getelementptr inbounds float* %tmp2328, i64 1
+  %tmp2330 = getelementptr inbounds float* %tmp2329, i64 1
+  %tmp2331 = getelementptr inbounds float* %tmp2330, i64 1
+  %tmp2332 = getelementptr inbounds float* %tmp2331, i64 1
+  %tmp2333 = getelementptr inbounds float* %tmp2332, i64 1
+  %tmp2334 = getelementptr inbounds float* %tmp2333, i64 1
+  %tmp2335 = getelementptr inbounds float* %tmp2334, i64 1
+  %tmp2336 = getelementptr inbounds float* %tmp2335, i64 1
+  %tmp2337 = getelementptr inbounds float* %tmp2336, i64 1
+  %tmp2338 = getelementptr inbounds float* %tmp2337, i64 1
+  %tmp2339 = getelementptr inbounds float* %tmp2338, i64 1
+  %tmp2340 = getelementptr inbounds float* %tmp2339, i64 1
+  %tmp2341 = getelementptr inbounds float* %tmp2340, i64 1
+  %tmp2342 = getelementptr inbounds float* %tmp2341, i64 1
+  %tmp2343 = getelementptr inbounds float* %tmp2342, i64 1
+  %tmp2344 = getelementptr inbounds float* %tmp2343, i64 1
+  %tmp2345 = getelementptr inbounds float* %tmp2344, i64 1
+  %tmp2346 = getelementptr inbounds float* %tmp2345, i64 1
+  %tmp2347 = getelementptr inbounds float* %tmp2346, i64 1
+  %tmp2348 = getelementptr inbounds float* %tmp2347, i64 1
+  %tmp2349 = getelementptr inbounds float* %tmp2348, i64 1
+  %tmp2350 = getelementptr inbounds float* %tmp2349, i64 1
+  %tmp2351 = getelementptr inbounds float* %tmp2350, i64 1
+  %tmp2352 = getelementptr inbounds float* %tmp2351, i64 1
+  %tmp2353 = getelementptr inbounds float* %tmp2352, i64 1
+  %tmp2354 = getelementptr inbounds float* %tmp2353, i64 1
+  %tmp2355 = getelementptr inbounds float* %tmp2354, i64 1
+  %tmp2356 = getelementptr inbounds float* %tmp2355, i64 1
+  %tmp2357 = getelementptr inbounds float* %tmp2356, i64 1
+  %tmp2358 = getelementptr inbounds float* %tmp2357, i64 1
+  %tmp2359 = getelementptr inbounds float* %tmp2358, i64 1
+  %tmp2360 = getelementptr inbounds float* %tmp2359, i64 1
+  %tmp2361 = getelementptr inbounds float* %tmp2360, i64 1
+  %tmp2362 = getelementptr inbounds float* %tmp2361, i64 1
+  %tmp2363 = getelementptr inbounds float* %tmp2362, i64 1
+  %tmp2364 = getelementptr inbounds float* %tmp2363, i64 1
+  %tmp2365 = getelementptr inbounds float* %tmp2364, i64 1
+  %tmp2366 = getelementptr inbounds float* %tmp2365, i64 1
+  %tmp2367 = getelementptr inbounds float* %tmp2366, i64 1
+  %tmp2368 = getelementptr inbounds float* %tmp2367, i64 1
+  %tmp2369 = getelementptr inbounds float* %tmp2368, i64 1
+  %tmp2370 = getelementptr inbounds float* %tmp2369, i64 1
+  %tmp2371 = getelementptr inbounds float* %tmp2370, i64 1
+  %tmp2372 = getelementptr inbounds float* %tmp2371, i64 1
+  %tmp2373 = getelementptr inbounds float* %tmp2372, i64 1
+  %tmp2374 = getelementptr inbounds float* %tmp2373, i64 1
+  %tmp2375 = getelementptr inbounds float* %tmp2374, i64 1
+  %tmp2376 = getelementptr inbounds float* %tmp2375, i64 1
+  %tmp2377 = getelementptr inbounds float* %tmp2376, i64 1
+  %tmp2378 = getelementptr inbounds float* %tmp2377, i64 1
+  %tmp2379 = getelementptr inbounds float* %tmp2378, i64 1
+  %tmp2380 = getelementptr inbounds float* %tmp2379, i64 1
+  %tmp2381 = getelementptr inbounds float* %tmp2380, i64 1
+  %tmp2382 = getelementptr inbounds float* %tmp2381, i64 1
+  %tmp2383 = getelementptr inbounds float* %tmp2382, i64 1
+  %tmp2384 = getelementptr inbounds float* %tmp2383, i64 1
+  %tmp2385 = getelementptr inbounds float* %tmp2384, i64 1
+  %tmp2386 = getelementptr inbounds float* %tmp2385, i64 1
+  %tmp2387 = getelementptr inbounds float* %tmp2386, i64 1
+  %tmp2388 = getelementptr inbounds float* %tmp2387, i64 1
+  %tmp2389 = getelementptr inbounds float* %tmp2388, i64 1
+  %tmp2390 = getelementptr inbounds float* %tmp2389, i64 1
+  %tmp2391 = getelementptr inbounds float* %tmp2390, i64 1
+  %tmp2392 = getelementptr inbounds float* %tmp2391, i64 1
+  %tmp2393 = getelementptr inbounds float* %tmp2392, i64 1
+  %tmp2394 = getelementptr inbounds float* %tmp2393, i64 1
+  %tmp2395 = getelementptr inbounds float* %tmp2394, i64 1
+  %tmp2396 = getelementptr inbounds float* %tmp2395, i64 1
+  %tmp2397 = getelementptr inbounds float* %tmp2396, i64 1
+  %tmp2398 = getelementptr inbounds float* %tmp2397, i64 1
+  %tmp2399 = getelementptr inbounds float* %tmp2398, i64 1
+  %tmp2400 = getelementptr inbounds float* %tmp2399, i64 1
+  %tmp2401 = getelementptr inbounds float* %tmp2400, i64 1
+  %tmp2402 = getelementptr inbounds float* %tmp2401, i64 1
+  %tmp2403 = getelementptr inbounds float* %tmp2402, i64 1
+  %tmp2404 = getelementptr inbounds float* %tmp2403, i64 1
+  %tmp2405 = getelementptr inbounds float* %tmp2404, i64 1
+  %tmp2406 = getelementptr inbounds float* %tmp2405, i64 1
+  %tmp2407 = getelementptr inbounds float* %tmp2406, i64 1
+  %tmp2408 = getelementptr inbounds float* %tmp2407, i64 1
+  %tmp2409 = getelementptr inbounds float* %tmp2408, i64 1
+  %tmp2410 = getelementptr inbounds float* %tmp2409, i64 1
+  %tmp2411 = getelementptr inbounds float* %tmp2410, i64 1
+  %tmp2412 = getelementptr inbounds float* %tmp2411, i64 1
+  %tmp2413 = getelementptr inbounds float* %tmp2412, i64 1
+  %tmp2414 = getelementptr inbounds float* %tmp2413, i64 1
+  %tmp2415 = getelementptr inbounds float* %tmp2414, i64 1
+  %tmp2416 = getelementptr inbounds float* %tmp2415, i64 1
+  %tmp2417 = getelementptr inbounds float* %tmp2416, i64 1
+  %tmp2418 = getelementptr inbounds float* %tmp2417, i64 1
+  %tmp2419 = getelementptr inbounds float* %tmp2418, i64 1
+  %tmp2420 = getelementptr inbounds float* %tmp2419, i64 1
+  %tmp2421 = getelementptr inbounds float* %tmp2420, i64 1
+  %tmp2422 = getelementptr inbounds float* %tmp2421, i64 1
+  %tmp2423 = getelementptr inbounds float* %tmp2422, i64 1
+  %tmp2424 = getelementptr inbounds float* %tmp2423, i64 1
+  %tmp2425 = getelementptr inbounds float* %tmp2424, i64 1
+  %tmp2426 = getelementptr inbounds float* %tmp2425, i64 1
+  %tmp2427 = getelementptr inbounds float* %tmp2426, i64 1
+  %tmp2428 = getelementptr inbounds float* %tmp2427, i64 1
+  %tmp2429 = getelementptr inbounds float* %tmp2428, i64 1
+  %tmp2430 = getelementptr inbounds float* %tmp2429, i64 1
+  %tmp2431 = getelementptr inbounds float* %tmp2430, i64 1
+  %tmp2432 = getelementptr inbounds float* %tmp2431, i64 1
+  %tmp2433 = getelementptr inbounds float* %tmp2432, i64 1
+  %tmp2434 = getelementptr inbounds float* %tmp2433, i64 1
+  %tmp2435 = getelementptr inbounds float* %tmp2434, i64 1
+  %tmp2436 = getelementptr inbounds float* %tmp2435, i64 1
+  %tmp2437 = getelementptr inbounds float* %tmp2436, i64 1
+  %tmp2438 = getelementptr inbounds float* %tmp2437, i64 1
+  %tmp2439 = getelementptr inbounds float* %tmp2438, i64 1
+  %tmp2440 = getelementptr inbounds float* %tmp2439, i64 1
+  %tmp2441 = getelementptr inbounds float* %tmp2440, i64 1
+  %tmp2442 = getelementptr inbounds float* %tmp2441, i64 1
+  %tmp2443 = getelementptr inbounds float* %tmp2442, i64 1
+  %tmp2444 = getelementptr inbounds float* %tmp2443, i64 1
+  %tmp2445 = getelementptr inbounds float* %tmp2444, i64 1
+  %tmp2446 = getelementptr inbounds float* %tmp2445, i64 1
+  %tmp2447 = getelementptr inbounds float* %tmp2446, i64 1
+  %tmp2448 = getelementptr inbounds float* %tmp2447, i64 1
+  %tmp2449 = getelementptr inbounds float* %tmp2448, i64 1
+  %tmp2450 = getelementptr inbounds float* %tmp2449, i64 1
+  %tmp2451 = getelementptr inbounds float* %tmp2450, i64 1
+  %tmp2452 = getelementptr inbounds float* %tmp2451, i64 1
+  %tmp2453 = getelementptr inbounds float* %tmp2452, i64 1
+  %tmp2454 = getelementptr inbounds float* %tmp2453, i64 1
+  %tmp2455 = getelementptr inbounds float* %tmp2454, i64 1
+  %tmp2456 = getelementptr inbounds float* %tmp2455, i64 1
+  %tmp2457 = getelementptr inbounds float* %tmp2456, i64 1
+  %tmp2458 = getelementptr inbounds float* %tmp2457, i64 1
+  %tmp2459 = getelementptr inbounds float* %tmp2458, i64 1
+  %tmp2460 = getelementptr inbounds float* %tmp2459, i64 1
+  %tmp2461 = getelementptr inbounds float* %tmp2460, i64 1
+  %tmp2462 = getelementptr inbounds float* %tmp2461, i64 1
+  %tmp2463 = getelementptr inbounds float* %tmp2462, i64 1
+  %tmp2464 = getelementptr inbounds float* %tmp2463, i64 1
+  %tmp2465 = getelementptr inbounds float* %tmp2464, i64 1
+  %tmp2466 = getelementptr inbounds float* %tmp2465, i64 1
+  %tmp2467 = getelementptr inbounds float* %tmp2466, i64 1
+  %tmp2468 = getelementptr inbounds float* %tmp2467, i64 1
+  %tmp2469 = getelementptr inbounds float* %tmp2468, i64 1
+  %tmp2470 = getelementptr inbounds float* %tmp2469, i64 1
+  %tmp2471 = getelementptr inbounds float* %tmp2470, i64 1
+  %tmp2472 = getelementptr inbounds float* %tmp2471, i64 1
+  %tmp2473 = getelementptr inbounds float* %tmp2472, i64 1
+  %tmp2474 = getelementptr inbounds float* %tmp2473, i64 1
+  %tmp2475 = getelementptr inbounds float* %tmp2474, i64 1
+  %tmp2476 = getelementptr inbounds float* %tmp2475, i64 1
+  %tmp2477 = getelementptr inbounds float* %tmp2476, i64 1
+  %tmp2478 = getelementptr inbounds float* %tmp2477, i64 1
+  %tmp2479 = getelementptr inbounds float* %tmp2478, i64 1
+  %tmp2480 = getelementptr inbounds float* %tmp2479, i64 1
+  %tmp2481 = getelementptr inbounds float* %tmp2480, i64 1
+  %tmp2482 = getelementptr inbounds float* %tmp2481, i64 1
+  %tmp2483 = getelementptr inbounds float* %tmp2482, i64 1
+  %tmp2484 = getelementptr inbounds float* %tmp2483, i64 1
+  %tmp2485 = getelementptr inbounds float* %tmp2484, i64 1
+  %tmp2486 = getelementptr inbounds float* %tmp2485, i64 1
+  %tmp2487 = getelementptr inbounds float* %tmp2486, i64 1
+  %tmp2488 = getelementptr inbounds float* %tmp2487, i64 1
+  %tmp2489 = getelementptr inbounds float* %tmp2488, i64 1
+  %tmp2490 = getelementptr inbounds float* %tmp2489, i64 1
+  %tmp2491 = getelementptr inbounds float* %tmp2490, i64 1
+  %tmp2492 = getelementptr inbounds float* %tmp2491, i64 1
+  %tmp2493 = getelementptr inbounds float* %tmp2492, i64 1
+  %tmp2494 = getelementptr inbounds float* %tmp2493, i64 1
+  %tmp2495 = getelementptr inbounds float* %tmp2494, i64 1
+  %tmp2496 = getelementptr inbounds float* %tmp2495, i64 1
+  %tmp2497 = getelementptr inbounds float* %tmp2496, i64 1
+  %tmp2498 = getelementptr inbounds float* %tmp2497, i64 1
+  %tmp2499 = getelementptr inbounds float* %tmp2498, i64 1
+  %tmp2500 = getelementptr inbounds float* %tmp2499, i64 1
+  %tmp2501 = getelementptr inbounds float* %tmp2500, i64 1
+  %tmp2502 = getelementptr inbounds float* %tmp2501, i64 1
+  %tmp2503 = getelementptr inbounds float* %tmp2502, i64 1
+  %tmp2504 = getelementptr inbounds float* %tmp2503, i64 1
+  %tmp2505 = getelementptr inbounds float* %tmp2504, i64 1
+  %tmp2506 = getelementptr inbounds float* %tmp2505, i64 1
+  %tmp2507 = getelementptr inbounds float* %tmp2506, i64 1
+  %tmp2508 = getelementptr inbounds float* %tmp2507, i64 1
+  %tmp2509 = getelementptr inbounds float* %tmp2508, i64 1
+  %tmp2510 = getelementptr inbounds float* %tmp2509, i64 1
+  %tmp2511 = getelementptr inbounds float* %tmp2510, i64 1
+  %tmp2512 = getelementptr inbounds float* %tmp2511, i64 1
+  %tmp2513 = getelementptr inbounds float* %tmp2512, i64 1
+  %tmp2514 = getelementptr inbounds float* %tmp2513, i64 1
+  %tmp2515 = getelementptr inbounds float* %tmp2514, i64 1
+  %tmp2516 = getelementptr inbounds float* %tmp2515, i64 1
+  %tmp2517 = getelementptr inbounds float* %tmp2516, i64 1
+  %tmp2518 = getelementptr inbounds float* %tmp2517, i64 1
+  %tmp2519 = getelementptr inbounds float* %tmp2518, i64 1
+  %tmp2520 = getelementptr inbounds float* %tmp2519, i64 1
+  %tmp2521 = getelementptr inbounds float* %tmp2520, i64 1
+  %tmp2522 = getelementptr inbounds float* %tmp2521, i64 1
+  %tmp2523 = getelementptr inbounds float* %tmp2522, i64 1
+  %tmp2524 = getelementptr inbounds float* %tmp2523, i64 1
+  %tmp2525 = getelementptr inbounds float* %tmp2524, i64 1
+  %tmp2526 = getelementptr inbounds float* %tmp2525, i64 1
+  %tmp2527 = getelementptr inbounds float* %tmp2526, i64 1
+  %tmp2528 = getelementptr inbounds float* %tmp2527, i64 1
+  %tmp2529 = getelementptr inbounds float* %tmp2528, i64 1
+  %tmp2530 = getelementptr inbounds float* %tmp2529, i64 1
+  %tmp2531 = getelementptr inbounds float* %tmp2530, i64 1
+  %tmp2532 = getelementptr inbounds float* %tmp2531, i64 1
+  %tmp2533 = getelementptr inbounds float* %tmp2532, i64 1
+  %tmp2534 = getelementptr inbounds float* %tmp2533, i64 1
+  %tmp2535 = getelementptr inbounds float* %tmp2534, i64 1
+  %tmp2536 = getelementptr inbounds float* %tmp2535, i64 1
+  %tmp2537 = getelementptr inbounds float* %tmp2536, i64 1
+  %tmp2538 = getelementptr inbounds float* %tmp2537, i64 1
+  %tmp2539 = getelementptr inbounds float* %tmp2538, i64 1
+  %tmp2540 = getelementptr inbounds float* %tmp2539, i64 1
+  %tmp2541 = getelementptr inbounds float* %tmp2540, i64 1
+  %tmp2542 = getelementptr inbounds float* %tmp2541, i64 1
+  %tmp2543 = getelementptr inbounds float* %tmp2542, i64 1
+  %tmp2544 = getelementptr inbounds float* %tmp2543, i64 1
+  %tmp2545 = getelementptr inbounds float* %tmp2544, i64 1
+  %tmp2546 = getelementptr inbounds float* %tmp2545, i64 1
+  %tmp2547 = getelementptr inbounds float* %tmp2546, i64 1
+  %tmp2548 = getelementptr inbounds float* %tmp2547, i64 1
+  %tmp2549 = getelementptr inbounds float* %tmp2548, i64 1
+  %tmp2550 = getelementptr inbounds float* %tmp2549, i64 1
+  %tmp2551 = getelementptr inbounds float* %tmp2550, i64 1
+  %tmp2552 = getelementptr inbounds float* %tmp2551, i64 1
+  %tmp2553 = getelementptr inbounds float* %tmp2552, i64 1
+  %tmp2554 = getelementptr inbounds float* %tmp2553, i64 1
+  %tmp2555 = getelementptr inbounds float* %tmp2554, i64 1
+  %tmp2556 = getelementptr inbounds float* %tmp2555, i64 1
+  %tmp2557 = getelementptr inbounds float* %tmp2556, i64 1
+  %tmp2558 = getelementptr inbounds float* %tmp2557, i64 1
+  %tmp2559 = getelementptr inbounds float* %tmp2558, i64 1
+  %tmp2560 = getelementptr inbounds float* %tmp2559, i64 1
+  %tmp2561 = getelementptr inbounds float* %tmp2560, i64 1
+  %tmp2562 = getelementptr inbounds float* %tmp2561, i64 1
+  %tmp2563 = getelementptr inbounds float* %tmp2562, i64 1
+  %tmp2564 = getelementptr inbounds float* %tmp2563, i64 1
+  %tmp2565 = getelementptr inbounds float* %tmp2564, i64 1
+  %tmp2566 = getelementptr inbounds float* %tmp2565, i64 1
+  %tmp2567 = getelementptr inbounds float* %tmp2566, i64 1
+  %tmp2568 = getelementptr inbounds float* %tmp2567, i64 1
+  %tmp2569 = getelementptr inbounds float* %tmp2568, i64 1
+  %tmp2570 = getelementptr inbounds float* %tmp2569, i64 1
+  %tmp2571 = getelementptr inbounds float* %tmp2570, i64 1
+  %tmp2572 = getelementptr inbounds float* %tmp2571, i64 1
+  %tmp2573 = getelementptr inbounds float* %tmp2572, i64 1
+  %tmp2574 = getelementptr inbounds float* %tmp2573, i64 1
+  %tmp2575 = getelementptr inbounds float* %tmp2574, i64 1
+  %tmp2576 = getelementptr inbounds float* %tmp2575, i64 1
+  %tmp2577 = getelementptr inbounds float* %tmp2576, i64 1
+  %tmp2578 = getelementptr inbounds float* %tmp2577, i64 1
+  %tmp2579 = getelementptr inbounds float* %tmp2578, i64 1
+  %tmp2580 = getelementptr inbounds float* %tmp2579, i64 1
+  %tmp2581 = getelementptr inbounds float* %tmp2580, i64 1
+  %tmp2582 = getelementptr inbounds float* %tmp2581, i64 1
+  %tmp2583 = getelementptr inbounds float* %tmp2582, i64 1
+  %tmp2584 = getelementptr inbounds float* %tmp2583, i64 1
+  %tmp2585 = getelementptr inbounds float* %tmp2584, i64 1
+  %tmp2586 = getelementptr inbounds float* %tmp2585, i64 1
+  %tmp2587 = getelementptr inbounds float* %tmp2586, i64 1
+  %tmp2588 = getelementptr inbounds float* %tmp2587, i64 1
+  %tmp2589 = getelementptr inbounds float* %tmp2588, i64 1
+  %tmp2590 = getelementptr inbounds float* %tmp2589, i64 1
+  %tmp2591 = getelementptr inbounds float* %tmp2590, i64 1
+  %tmp2592 = getelementptr inbounds float* %tmp2591, i64 1
+  %tmp2593 = getelementptr inbounds float* %tmp2592, i64 1
+  %tmp2594 = getelementptr inbounds float* %tmp2593, i64 1
+  %tmp2595 = getelementptr inbounds float* %tmp2594, i64 1
+  %tmp2596 = getelementptr inbounds float* %tmp2595, i64 1
+  %tmp2597 = getelementptr inbounds float* %tmp2596, i64 1
+  %tmp2598 = getelementptr inbounds float* %tmp2597, i64 1
+  %tmp2599 = getelementptr inbounds float* %tmp2598, i64 1
+  %tmp2600 = getelementptr inbounds float* %tmp2599, i64 1
+  %tmp2601 = getelementptr inbounds float* %tmp2600, i64 1
+  %tmp2602 = getelementptr inbounds float* %tmp2601, i64 1
+  %tmp2603 = getelementptr inbounds float* %tmp2602, i64 1
+  %tmp2604 = getelementptr inbounds float* %tmp2603, i64 1
+  %tmp2605 = getelementptr inbounds float* %tmp2604, i64 1
+  %tmp2606 = getelementptr inbounds float* %tmp2605, i64 1
+  %tmp2607 = getelementptr inbounds float* %tmp2606, i64 1
+  %tmp2608 = getelementptr inbounds float* %tmp2607, i64 1
+  %tmp2609 = getelementptr inbounds float* %tmp2608, i64 1
+  %tmp2610 = getelementptr inbounds float* %tmp2609, i64 1
+  %tmp2611 = getelementptr inbounds float* %tmp2610, i64 1
+  %tmp2612 = getelementptr inbounds float* %tmp2611, i64 1
+  %tmp2613 = getelementptr inbounds float* %tmp2612, i64 1
+  %tmp2614 = getelementptr inbounds float* %tmp2613, i64 1
+  %tmp2615 = getelementptr inbounds float* %tmp2614, i64 1
+  %tmp2616 = getelementptr inbounds float* %tmp2615, i64 1
+  %tmp2617 = getelementptr inbounds float* %tmp2616, i64 1
+  %tmp2618 = getelementptr inbounds float* %tmp2617, i64 1
+  %tmp2619 = getelementptr inbounds float* %tmp2618, i64 1
+  %tmp2620 = getelementptr inbounds float* %tmp2619, i64 1
+  %tmp2621 = getelementptr inbounds float* %tmp2620, i64 1
+  %tmp2622 = getelementptr inbounds float* %tmp2621, i64 1
+  %tmp2623 = getelementptr inbounds float* %tmp2622, i64 1
+  %tmp2624 = getelementptr inbounds float* %tmp2623, i64 1
+  %tmp2625 = getelementptr inbounds float* %tmp2624, i64 1
+  %tmp2626 = getelementptr inbounds float* %tmp2625, i64 1
+  %tmp2627 = getelementptr inbounds float* %tmp2626, i64 1
+  %tmp2628 = getelementptr inbounds float* %tmp2627, i64 1
+  %tmp2629 = getelementptr inbounds float* %tmp2628, i64 1
+  %tmp2630 = getelementptr inbounds float* %tmp2629, i64 1
+  %tmp2631 = getelementptr inbounds float* %tmp2630, i64 1
+  %tmp2632 = getelementptr inbounds float* %tmp2631, i64 1
+  %tmp2633 = getelementptr inbounds float* %tmp2632, i64 1
+  %tmp2634 = getelementptr inbounds float* %tmp2633, i64 1
+  %tmp2635 = getelementptr inbounds float* %tmp2634, i64 1
+  %tmp2636 = getelementptr inbounds float* %tmp2635, i64 1
+  %tmp2637 = getelementptr inbounds float* %tmp2636, i64 1
+  %tmp2638 = getelementptr inbounds float* %tmp2637, i64 1
+  %tmp2639 = getelementptr inbounds float* %tmp2638, i64 1
+  %tmp2640 = getelementptr inbounds float* %tmp2639, i64 1
+  %tmp2641 = getelementptr inbounds float* %tmp2640, i64 1
+  %tmp2642 = getelementptr inbounds float* %tmp2641, i64 1
+  %tmp2643 = getelementptr inbounds float* %tmp2642, i64 1
+  %tmp2644 = getelementptr inbounds float* %tmp2643, i64 1
+  %tmp2645 = getelementptr inbounds float* %tmp2644, i64 1
+  %tmp2646 = getelementptr inbounds float* %tmp2645, i64 1
+  %tmp2647 = getelementptr inbounds float* %tmp2646, i64 1
+  %tmp2648 = getelementptr inbounds float* %tmp2647, i64 1
+  %tmp2649 = getelementptr inbounds float* %tmp2648, i64 1
+  %tmp2650 = getelementptr inbounds float* %tmp2649, i64 1
+  %tmp2651 = getelementptr inbounds float* %tmp2650, i64 1
+  %tmp2652 = getelementptr inbounds float* %tmp2651, i64 1
+  %tmp2653 = getelementptr inbounds float* %tmp2652, i64 1
+  %tmp2654 = getelementptr inbounds float* %tmp2653, i64 1
+  %tmp2655 = getelementptr inbounds float* %tmp2654, i64 1
+  %tmp2656 = getelementptr inbounds float* %tmp2655, i64 1
+  %tmp2657 = getelementptr inbounds float* %tmp2656, i64 1
+  %tmp2658 = getelementptr inbounds float* %tmp2657, i64 1
+  %tmp2659 = getelementptr inbounds float* %tmp2658, i64 1
+  %tmp2660 = getelementptr inbounds float* %tmp2659, i64 1
+  %tmp2661 = getelementptr inbounds float* %tmp2660, i64 1
+  %tmp2662 = getelementptr inbounds float* %tmp2661, i64 1
+  %tmp2663 = getelementptr inbounds float* %tmp2662, i64 1
+  %tmp2664 = getelementptr inbounds float* %tmp2663, i64 1
+  %tmp2665 = getelementptr inbounds float* %tmp2664, i64 1
+  %tmp2666 = getelementptr inbounds float* %tmp2665, i64 1
+  %tmp2667 = getelementptr inbounds float* %tmp2666, i64 1
+  %tmp2668 = getelementptr inbounds float* %tmp2667, i64 1
+  %tmp2669 = getelementptr inbounds float* %tmp2668, i64 1
+  %tmp2670 = getelementptr inbounds float* %tmp2669, i64 1
+  %tmp2671 = getelementptr inbounds float* %tmp2670, i64 1
+  %tmp2672 = getelementptr inbounds float* %tmp2671, i64 1
+  %tmp2673 = getelementptr inbounds float* %tmp2672, i64 1
+  %tmp2674 = getelementptr inbounds float* %tmp2673, i64 1
+  %tmp2675 = getelementptr inbounds float* %tmp2674, i64 1
+  %tmp2676 = getelementptr inbounds float* %tmp2675, i64 1
+  %tmp2677 = getelementptr inbounds float* %tmp2676, i64 1
+  %tmp2678 = getelementptr inbounds float* %tmp2677, i64 1
+  %tmp2679 = getelementptr inbounds float* %tmp2678, i64 1
+  %tmp2680 = getelementptr inbounds float* %tmp2679, i64 1
+  %tmp2681 = getelementptr inbounds float* %tmp2680, i64 1
+  %tmp2682 = getelementptr inbounds float* %tmp2681, i64 1
+  %tmp2683 = getelementptr inbounds float* %tmp2682, i64 1
+  %tmp2684 = getelementptr inbounds float* %tmp2683, i64 1
+  %tmp2685 = getelementptr inbounds float* %tmp2684, i64 1
+  %tmp2686 = getelementptr inbounds float* %tmp2685, i64 1
+  %tmp2687 = getelementptr inbounds float* %tmp2686, i64 1
+  %tmp2688 = getelementptr inbounds float* %tmp2687, i64 1
+  %tmp2689 = getelementptr inbounds float* %tmp2688, i64 1
+  %tmp2690 = getelementptr inbounds float* %tmp2689, i64 1
+  %tmp2691 = getelementptr inbounds float* %tmp2690, i64 1
+  %tmp2692 = getelementptr inbounds float* %tmp2691, i64 1
+  %tmp2693 = getelementptr inbounds float* %tmp2692, i64 1
+  %tmp2694 = getelementptr inbounds float* %tmp2693, i64 1
+  %tmp2695 = getelementptr inbounds float* %tmp2694, i64 1
+  %tmp2696 = getelementptr inbounds float* %tmp2695, i64 1
+  %tmp2697 = getelementptr inbounds float* %tmp2696, i64 1
+  %tmp2698 = getelementptr inbounds float* %tmp2697, i64 1
+  %tmp2699 = getelementptr inbounds float* %tmp2698, i64 1
+  %tmp2700 = getelementptr inbounds float* %tmp2699, i64 1
+  %tmp2701 = getelementptr inbounds float* %tmp2700, i64 1
+  %tmp2702 = getelementptr inbounds float* %tmp2701, i64 1
+  %tmp2703 = getelementptr inbounds float* %tmp2702, i64 1
+  %tmp2704 = getelementptr inbounds float* %tmp2703, i64 1
+  %tmp2705 = getelementptr inbounds float* %tmp2704, i64 1
+  %tmp2706 = getelementptr inbounds float* %tmp2705, i64 1
+  %tmp2707 = getelementptr inbounds float* %tmp2706, i64 1
+  %tmp2708 = getelementptr inbounds float* %tmp2707, i64 1
+  %tmp2709 = getelementptr inbounds float* %tmp2708, i64 1
+  %tmp2710 = getelementptr inbounds float* %tmp2709, i64 1
+  %tmp2711 = getelementptr inbounds float* %tmp2710, i64 1
+  %tmp2712 = getelementptr inbounds float* %tmp2711, i64 1
+  %tmp2713 = getelementptr inbounds float* %tmp2712, i64 1
+  %tmp2714 = getelementptr inbounds float* %tmp2713, i64 1
+  %tmp2715 = getelementptr inbounds float* %tmp2714, i64 1
+  %tmp2716 = getelementptr inbounds float* %tmp2715, i64 1
+  %tmp2717 = getelementptr inbounds float* %tmp2716, i64 1
+  %tmp2718 = getelementptr inbounds float* %tmp2717, i64 1
+  %tmp2719 = getelementptr inbounds float* %tmp2718, i64 1
+  %tmp2720 = getelementptr inbounds float* %tmp2719, i64 1
+  %tmp2721 = getelementptr inbounds float* %tmp2720, i64 1
+  %tmp2722 = getelementptr inbounds float* %tmp2721, i64 1
+  %tmp2723 = getelementptr inbounds float* %tmp2722, i64 1
+  %tmp2724 = getelementptr inbounds float* %tmp2723, i64 1
+  %tmp2725 = getelementptr inbounds float* %tmp2724, i64 1
+  %tmp2726 = getelementptr inbounds float* %tmp2725, i64 1
+  %tmp2727 = getelementptr inbounds float* %tmp2726, i64 1
+  %tmp2728 = getelementptr inbounds float* %tmp2727, i64 1
+  %tmp2729 = getelementptr inbounds float* %tmp2728, i64 1
+  %tmp2730 = getelementptr inbounds float* %tmp2729, i64 1
+  %tmp2731 = getelementptr inbounds float* %tmp2730, i64 1
+  %tmp2732 = getelementptr inbounds float* %tmp2731, i64 1
+  %tmp2733 = getelementptr inbounds float* %tmp2732, i64 1
+  %tmp2734 = getelementptr inbounds float* %tmp2733, i64 1
+  %tmp2735 = getelementptr inbounds float* %tmp2734, i64 1
+  %tmp2736 = getelementptr inbounds float* %tmp2735, i64 1
+  %tmp2737 = getelementptr inbounds float* %tmp2736, i64 1
+  %tmp2738 = getelementptr inbounds float* %tmp2737, i64 1
+  %tmp2739 = getelementptr inbounds float* %tmp2738, i64 1
+  %tmp2740 = getelementptr inbounds float* %tmp2739, i64 1
+  %tmp2741 = getelementptr inbounds float* %tmp2740, i64 1
+  %tmp2742 = getelementptr inbounds float* %tmp2741, i64 1
+  %tmp2743 = getelementptr inbounds float* %tmp2742, i64 1
+  %tmp2744 = getelementptr inbounds float* %tmp2743, i64 1
+  %tmp2745 = getelementptr inbounds float* %tmp2744, i64 1
+  %tmp2746 = getelementptr inbounds float* %tmp2745, i64 1
+  %tmp2747 = getelementptr inbounds float* %tmp2746, i64 1
+  %tmp2748 = getelementptr inbounds float* %tmp2747, i64 1
+  %tmp2749 = getelementptr inbounds float* %tmp2748, i64 1
+  %tmp2750 = getelementptr inbounds float* %tmp2749, i64 1
+  %tmp2751 = getelementptr inbounds float* %tmp2750, i64 1
+  %tmp2752 = getelementptr inbounds float* %tmp2751, i64 1
+  %tmp2753 = getelementptr inbounds float* %tmp2752, i64 1
+  %tmp2754 = getelementptr inbounds float* %tmp2753, i64 1
+  %tmp2755 = getelementptr inbounds float* %tmp2754, i64 1
+  %tmp2756 = getelementptr inbounds float* %tmp2755, i64 1
+  %tmp2757 = getelementptr inbounds float* %tmp2756, i64 1
+  %tmp2758 = getelementptr inbounds float* %tmp2757, i64 1
+  %tmp2759 = getelementptr inbounds float* %tmp2758, i64 1
+  %tmp2760 = getelementptr inbounds float* %tmp2759, i64 1
+  %tmp2761 = getelementptr inbounds float* %tmp2760, i64 1
+  %tmp2762 = getelementptr inbounds float* %tmp2761, i64 1
+  %tmp2763 = getelementptr inbounds float* %tmp2762, i64 1
+  %tmp2764 = getelementptr inbounds float* %tmp2763, i64 1
+  %tmp2765 = getelementptr inbounds float* %tmp2764, i64 1
+  %tmp2766 = getelementptr inbounds float* %tmp2765, i64 1
+  %tmp2767 = getelementptr inbounds float* %tmp2766, i64 1
+  %tmp2768 = getelementptr inbounds float* %tmp2767, i64 1
+  %tmp2769 = getelementptr inbounds float* %tmp2768, i64 1
+  %tmp2770 = getelementptr inbounds float* %tmp2769, i64 1
+  %tmp2771 = getelementptr inbounds float* %tmp2770, i64 1
+  %tmp2772 = getelementptr inbounds float* %tmp2771, i64 1
+  %tmp2773 = getelementptr inbounds float* %tmp2772, i64 1
+  %tmp2774 = getelementptr inbounds float* %tmp2773, i64 1
+  %tmp2775 = getelementptr inbounds float* %tmp2774, i64 1
+  %tmp2776 = getelementptr inbounds float* %tmp2775, i64 1
+  %tmp2777 = getelementptr inbounds float* %tmp2776, i64 1
+  %tmp2778 = getelementptr inbounds float* %tmp2777, i64 1
+  %tmp2779 = getelementptr inbounds float* %tmp2778, i64 1
+  %tmp2780 = getelementptr inbounds float* %tmp2779, i64 1
+  %tmp2781 = getelementptr inbounds float* %tmp2780, i64 1
+  %tmp2782 = getelementptr inbounds float* %tmp2781, i64 1
+  %tmp2783 = getelementptr inbounds float* %tmp2782, i64 1
+  %tmp2784 = getelementptr inbounds float* %tmp2783, i64 1
+  %tmp2785 = getelementptr inbounds float* %tmp2784, i64 1
+  %tmp2786 = getelementptr inbounds float* %tmp2785, i64 1
+  %tmp2787 = getelementptr inbounds float* %tmp2786, i64 1
+  %tmp2788 = getelementptr inbounds float* %tmp2787, i64 1
+  %tmp2789 = getelementptr inbounds float* %tmp2788, i64 1
+  %tmp2790 = getelementptr inbounds float* %tmp2789, i64 1
+  %tmp2791 = getelementptr inbounds float* %tmp2790, i64 1
+  %tmp2792 = getelementptr inbounds float* %tmp2791, i64 1
+  %tmp2793 = getelementptr inbounds float* %tmp2792, i64 1
+  %tmp2794 = getelementptr inbounds float* %tmp2793, i64 1
+  %tmp2795 = getelementptr inbounds float* %tmp2794, i64 1
+  %tmp2796 = getelementptr inbounds float* %tmp2795, i64 1
+  %tmp2797 = getelementptr inbounds float* %tmp2796, i64 1
+  %tmp2798 = getelementptr inbounds float* %tmp2797, i64 1
+  %tmp2799 = getelementptr inbounds float* %tmp2798, i64 1
+  %tmp2800 = getelementptr inbounds float* %tmp2799, i64 1
+  %tmp2801 = getelementptr inbounds float* %tmp2800, i64 1
+  %tmp2802 = getelementptr inbounds float* %tmp2801, i64 1
+  %tmp2803 = getelementptr inbounds float* %tmp2802, i64 1
+  %tmp2804 = getelementptr inbounds float* %tmp2803, i64 1
+  %tmp2805 = getelementptr inbounds float* %tmp2804, i64 1
+  %tmp2806 = getelementptr inbounds float* %tmp2805, i64 1
+  %tmp2807 = getelementptr inbounds float* %tmp2806, i64 1
+  %tmp2808 = getelementptr inbounds float* %tmp2807, i64 1
+  %tmp2809 = getelementptr inbounds float* %tmp2808, i64 1
+  %tmp2810 = getelementptr inbounds float* %tmp2809, i64 1
+  %tmp2811 = getelementptr inbounds float* %tmp2810, i64 1
+  %tmp2812 = getelementptr inbounds float* %tmp2811, i64 1
+  %tmp2813 = getelementptr inbounds float* %tmp2812, i64 1
+  %tmp2814 = getelementptr inbounds float* %tmp2813, i64 1
+  %tmp2815 = getelementptr inbounds float* %tmp2814, i64 1
+  %tmp2816 = getelementptr inbounds float* %tmp2815, i64 1
+  %tmp2817 = getelementptr inbounds float* %tmp2816, i64 1
+  %tmp2818 = getelementptr inbounds float* %tmp2817, i64 1
+  %tmp2819 = getelementptr inbounds float* %tmp2818, i64 1
+  %tmp2820 = getelementptr inbounds float* %tmp2819, i64 1
+  %tmp2821 = getelementptr inbounds float* %tmp2820, i64 1
+  %tmp2822 = getelementptr inbounds float* %tmp2821, i64 1
+  %tmp2823 = getelementptr inbounds float* %tmp2822, i64 1
+  %tmp2824 = getelementptr inbounds float* %tmp2823, i64 1
+  %tmp2825 = getelementptr inbounds float* %tmp2824, i64 1
+  %tmp2826 = getelementptr inbounds float* %tmp2825, i64 1
+  %tmp2827 = getelementptr inbounds float* %tmp2826, i64 1
+  %tmp2828 = getelementptr inbounds float* %tmp2827, i64 1
+  %tmp2829 = getelementptr inbounds float* %tmp2828, i64 1
+  %tmp2830 = getelementptr inbounds float* %tmp2829, i64 1
+  %tmp2831 = getelementptr inbounds float* %tmp2830, i64 1
+  %tmp2832 = getelementptr inbounds float* %tmp2831, i64 1
+  %tmp2833 = getelementptr inbounds float* %tmp2832, i64 1
+  %tmp2834 = getelementptr inbounds float* %tmp2833, i64 1
+  %tmp2835 = getelementptr inbounds float* %tmp2834, i64 1
+  %tmp2836 = getelementptr inbounds float* %tmp2835, i64 1
+  %tmp2837 = getelementptr inbounds float* %tmp2836, i64 1
+  %tmp2838 = getelementptr inbounds float* %tmp2837, i64 1
+  %tmp2839 = getelementptr inbounds float* %tmp2838, i64 1
+  %tmp2840 = getelementptr inbounds float* %tmp2839, i64 1
+  %tmp2841 = getelementptr inbounds float* %tmp2840, i64 1
+  %tmp2842 = getelementptr inbounds float* %tmp2841, i64 1
+  %tmp2843 = getelementptr inbounds float* %tmp2842, i64 1
+  %tmp2844 = getelementptr inbounds float* %tmp2843, i64 1
+  %tmp2845 = getelementptr inbounds float* %tmp2844, i64 1
+  %tmp2846 = getelementptr inbounds float* %tmp2845, i64 1
+  %tmp2847 = getelementptr inbounds float* %tmp2846, i64 1
+  %tmp2848 = getelementptr inbounds float* %tmp2847, i64 1
+  %tmp2849 = getelementptr inbounds float* %tmp2848, i64 1
+  %tmp2850 = getelementptr inbounds float* %tmp2849, i64 1
+  %tmp2851 = getelementptr inbounds float* %tmp2850, i64 1
+  %tmp2852 = getelementptr inbounds float* %tmp2851, i64 1
+  %tmp2853 = getelementptr inbounds float* %tmp2852, i64 1
+  %tmp2854 = getelementptr inbounds float* %tmp2853, i64 1
+  %tmp2855 = getelementptr inbounds float* %tmp2854, i64 1
+  %tmp2856 = getelementptr inbounds float* %tmp2855, i64 1
+  %tmp2857 = getelementptr inbounds float* %tmp2856, i64 1
+  %tmp2858 = getelementptr inbounds float* %tmp2857, i64 1
+  %tmp2859 = getelementptr inbounds float* %tmp2858, i64 1
+  %tmp2860 = getelementptr inbounds float* %tmp2859, i64 1
+  %tmp2861 = getelementptr inbounds float* %tmp2860, i64 1
+  %tmp2862 = getelementptr inbounds float* %tmp2861, i64 1
+  %tmp2863 = getelementptr inbounds float* %tmp2862, i64 1
+  %tmp2864 = getelementptr inbounds float* %tmp2863, i64 1
+  %tmp2865 = getelementptr inbounds float* %tmp2864, i64 1
+  %tmp2866 = getelementptr inbounds float* %tmp2865, i64 1
+  %tmp2867 = getelementptr inbounds float* %tmp2866, i64 1
+  %tmp2868 = getelementptr inbounds float* %tmp2867, i64 1
+  %tmp2869 = getelementptr inbounds float* %tmp2868, i64 1
+  %tmp2870 = getelementptr inbounds float* %tmp2869, i64 1
+  %tmp2871 = getelementptr inbounds float* %tmp2870, i64 1
+  %tmp2872 = getelementptr inbounds float* %tmp2871, i64 1
+  %tmp2873 = getelementptr inbounds float* %tmp2872, i64 1
+  %tmp2874 = getelementptr inbounds float* %tmp2873, i64 1
+  %tmp2875 = getelementptr inbounds float* %tmp2874, i64 1
+  %tmp2876 = getelementptr inbounds float* %tmp2875, i64 1
+  %tmp2877 = getelementptr inbounds float* %tmp2876, i64 1
+  %tmp2878 = getelementptr inbounds float* %tmp2877, i64 1
+  %tmp2879 = getelementptr inbounds float* %tmp2878, i64 1
+  %tmp2880 = getelementptr inbounds float* %tmp2879, i64 1
+  %tmp2881 = getelementptr inbounds float* %tmp2880, i64 1
+  %tmp2882 = getelementptr inbounds float* %tmp2881, i64 1
+  %tmp2883 = getelementptr inbounds float* %tmp2882, i64 1
+  %tmp2884 = getelementptr inbounds float* %tmp2883, i64 1
+  %tmp2885 = getelementptr inbounds float* %tmp2884, i64 1
+  %tmp2886 = getelementptr inbounds float* %tmp2885, i64 1
+  %tmp2887 = getelementptr inbounds float* %tmp2886, i64 1
+  %tmp2888 = getelementptr inbounds float* %tmp2887, i64 1
+  %tmp2889 = getelementptr inbounds float* %tmp2888, i64 1
+  %tmp2890 = getelementptr inbounds float* %tmp2889, i64 1
+  %tmp2891 = getelementptr inbounds float* %tmp2890, i64 1
+  %tmp2892 = getelementptr inbounds float* %tmp2891, i64 1
+  %tmp2893 = getelementptr inbounds float* %tmp2892, i64 1
+  %tmp2894 = getelementptr inbounds float* %tmp2893, i64 1
+  %tmp2895 = getelementptr inbounds float* %tmp2894, i64 1
+  %tmp2896 = getelementptr inbounds float* %tmp2895, i64 1
+  %tmp2897 = getelementptr inbounds float* %tmp2896, i64 1
+  %tmp2898 = getelementptr inbounds float* %tmp2897, i64 1
+  %tmp2899 = getelementptr inbounds float* %tmp2898, i64 1
+  %tmp2900 = getelementptr inbounds float* %tmp2899, i64 1
+  %tmp2901 = getelementptr inbounds float* %tmp2900, i64 1
+  %tmp2902 = getelementptr inbounds float* %tmp2901, i64 1
+  %tmp2903 = getelementptr inbounds float* %tmp2902, i64 1
+  %tmp2904 = getelementptr inbounds float* %tmp2903, i64 1
+  %tmp2905 = getelementptr inbounds float* %tmp2904, i64 1
+  %tmp2906 = getelementptr inbounds float* %tmp2905, i64 1
+  %tmp2907 = getelementptr inbounds float* %tmp2906, i64 1
+  %tmp2908 = getelementptr inbounds float* %tmp2907, i64 1
+  %tmp2909 = getelementptr inbounds float* %tmp2908, i64 1
+  %tmp2910 = getelementptr inbounds float* %tmp2909, i64 1
+  %tmp2911 = getelementptr inbounds float* %tmp2910, i64 1
+  %tmp2912 = getelementptr inbounds float* %tmp2911, i64 1
+  %tmp2913 = getelementptr inbounds float* %tmp2912, i64 1
+  %tmp2914 = getelementptr inbounds float* %tmp2913, i64 1
+  %tmp2915 = getelementptr inbounds float* %tmp2914, i64 1
+  %tmp2916 = getelementptr inbounds float* %tmp2915, i64 1
+  %tmp2917 = getelementptr inbounds float* %tmp2916, i64 1
+  %tmp2918 = getelementptr inbounds float* %tmp2917, i64 1
+  %tmp2919 = getelementptr inbounds float* %tmp2918, i64 1
+  %tmp2920 = getelementptr inbounds float* %tmp2919, i64 1
+  %tmp2921 = getelementptr inbounds float* %tmp2920, i64 1
+  %tmp2922 = getelementptr inbounds float* %tmp2921, i64 1
+  %tmp2923 = getelementptr inbounds float* %tmp2922, i64 1
+  %tmp2924 = getelementptr inbounds float* %tmp2923, i64 1
+  %tmp2925 = getelementptr inbounds float* %tmp2924, i64 1
+  %tmp2926 = getelementptr inbounds float* %tmp2925, i64 1
+  %tmp2927 = getelementptr inbounds float* %tmp2926, i64 1
+  %tmp2928 = getelementptr inbounds float* %tmp2927, i64 1
+  %tmp2929 = getelementptr inbounds float* %tmp2928, i64 1
+  %tmp2930 = getelementptr inbounds float* %tmp2929, i64 1
+  %tmp2931 = getelementptr inbounds float* %tmp2930, i64 1
+  %tmp2932 = getelementptr inbounds float* %tmp2931, i64 1
+  %tmp2933 = getelementptr inbounds float* %tmp2932, i64 1
+  %tmp2934 = getelementptr inbounds float* %tmp2933, i64 1
+  %tmp2935 = getelementptr inbounds float* %tmp2934, i64 1
+  %tmp2936 = getelementptr inbounds float* %tmp2935, i64 1
+  %tmp2937 = getelementptr inbounds float* %tmp2936, i64 1
+  %tmp2938 = getelementptr inbounds float* %tmp2937, i64 1
+  %tmp2939 = getelementptr inbounds float* %tmp2938, i64 1
+  %tmp2940 = getelementptr inbounds float* %tmp2939, i64 1
+  %tmp2941 = getelementptr inbounds float* %tmp2940, i64 1
+  %tmp2942 = getelementptr inbounds float* %tmp2941, i64 1
+  %tmp2943 = getelementptr inbounds float* %tmp2942, i64 1
+  %tmp2944 = getelementptr inbounds float* %tmp2943, i64 1
+  %tmp2945 = getelementptr inbounds float* %tmp2944, i64 1
+  %tmp2946 = getelementptr inbounds float* %tmp2945, i64 1
+  %tmp2947 = getelementptr inbounds float* %tmp2946, i64 1
+  %tmp2948 = getelementptr inbounds float* %tmp2947, i64 1
+  %tmp2949 = getelementptr inbounds float* %tmp2948, i64 1
+  %tmp2950 = getelementptr inbounds float* %tmp2949, i64 1
+  %tmp2951 = getelementptr inbounds float* %tmp2950, i64 1
+  %tmp2952 = getelementptr inbounds float* %tmp2951, i64 1
+  %tmp2953 = getelementptr inbounds float* %tmp2952, i64 1
+  %tmp2954 = getelementptr inbounds float* %tmp2953, i64 1
+  %tmp2955 = getelementptr inbounds float* %tmp2954, i64 1
+  %tmp2956 = getelementptr inbounds float* %tmp2955, i64 1
+  %tmp2957 = getelementptr inbounds float* %tmp2956, i64 1
+  %tmp2958 = getelementptr inbounds float* %tmp2957, i64 1
+  %tmp2959 = getelementptr inbounds float* %tmp2958, i64 1
+  %tmp2960 = getelementptr inbounds float* %tmp2959, i64 1
+  %tmp2961 = getelementptr inbounds float* %tmp2960, i64 1
+  %tmp2962 = getelementptr inbounds float* %tmp2961, i64 1
+  %tmp2963 = getelementptr inbounds float* %tmp2962, i64 1
+  %tmp2964 = getelementptr inbounds float* %tmp2963, i64 1
+  %tmp2965 = getelementptr inbounds float* %tmp2964, i64 1
+  %tmp2966 = getelementptr inbounds float* %tmp2965, i64 1
+  %tmp2967 = getelementptr inbounds float* %tmp2966, i64 1
+  %tmp2968 = getelementptr inbounds float* %tmp2967, i64 1
+  %tmp2969 = getelementptr inbounds float* %tmp2968, i64 1
+  %tmp2970 = getelementptr inbounds float* %tmp2969, i64 1
+  %tmp2971 = getelementptr inbounds float* %tmp2970, i64 1
+  %tmp2972 = getelementptr inbounds float* %tmp2971, i64 1
+  %tmp2973 = getelementptr inbounds float* %tmp2972, i64 1
+  %tmp2974 = getelementptr inbounds float* %tmp2973, i64 1
+  %tmp2975 = getelementptr inbounds float* %tmp2974, i64 1
+  %tmp2976 = getelementptr inbounds float* %tmp2975, i64 1
+  %tmp2977 = getelementptr inbounds float* %tmp2976, i64 1
+  %tmp2978 = getelementptr inbounds float* %tmp2977, i64 1
+  %tmp2979 = getelementptr inbounds float* %tmp2978, i64 1
+  %tmp2980 = getelementptr inbounds float* %tmp2979, i64 1
+  %tmp2981 = getelementptr inbounds float* %tmp2980, i64 1
+  %tmp2982 = getelementptr inbounds float* %tmp2981, i64 1
+  %tmp2983 = getelementptr inbounds float* %tmp2982, i64 1
+  %tmp2984 = getelementptr inbounds float* %tmp2983, i64 1
+  %tmp2985 = getelementptr inbounds float* %tmp2984, i64 1
+  %tmp2986 = getelementptr inbounds float* %tmp2985, i64 1
+  %tmp2987 = getelementptr inbounds float* %tmp2986, i64 1
+  %tmp2988 = getelementptr inbounds float* %tmp2987, i64 1
+  %tmp2989 = getelementptr inbounds float* %tmp2988, i64 1
+  %tmp2990 = getelementptr inbounds float* %tmp2989, i64 1
+  %tmp2991 = getelementptr inbounds float* %tmp2990, i64 1
+  %tmp2992 = getelementptr inbounds float* %tmp2991, i64 1
+  %tmp2993 = getelementptr inbounds float* %tmp2992, i64 1
+  %tmp2994 = getelementptr inbounds float* %tmp2993, i64 1
+  %tmp2995 = getelementptr inbounds float* %tmp2994, i64 1
+  %tmp2996 = getelementptr inbounds float* %tmp2995, i64 1
+  %tmp2997 = getelementptr inbounds float* %tmp2996, i64 1
+  %tmp2998 = getelementptr inbounds float* %tmp2997, i64 1
+  %tmp2999 = getelementptr inbounds float* %tmp2998, i64 1
+  %tmp3000 = getelementptr inbounds float* %tmp2999, i64 1
+  %tmp3001 = getelementptr inbounds float* %tmp3000, i64 1
+  %tmp3002 = getelementptr inbounds float* %tmp3001, i64 1
+  %tmp3003 = getelementptr inbounds float* %tmp3002, i64 1
+  %tmp3004 = getelementptr inbounds float* %tmp3003, i64 1
+  %tmp3005 = getelementptr inbounds float* %tmp3004, i64 1
+  %tmp3006 = getelementptr inbounds float* %tmp3005, i64 1
+  %tmp3007 = getelementptr inbounds float* %tmp3006, i64 1
+  %tmp3008 = getelementptr inbounds float* %tmp3007, i64 1
+  %tmp3009 = getelementptr inbounds float* %tmp3008, i64 1
+  %tmp3010 = getelementptr inbounds float* %tmp3009, i64 1
+  %tmp3011 = getelementptr inbounds float* %tmp3010, i64 1
+  %tmp3012 = getelementptr inbounds float* %tmp3011, i64 1
+  %tmp3013 = getelementptr inbounds float* %tmp3012, i64 1
+  %tmp3014 = getelementptr inbounds float* %tmp3013, i64 1
+  %tmp3015 = getelementptr inbounds float* %tmp3014, i64 1
+  %tmp3016 = getelementptr inbounds float* %tmp3015, i64 1
+  %tmp3017 = getelementptr inbounds float* %tmp3016, i64 1
+  %tmp3018 = getelementptr inbounds float* %tmp3017, i64 1
+  %tmp3019 = getelementptr inbounds float* %tmp3018, i64 1
+  %tmp3020 = getelementptr inbounds float* %tmp3019, i64 1
+  %tmp3021 = getelementptr inbounds float* %tmp3020, i64 1
+  %tmp3022 = getelementptr inbounds float* %tmp3021, i64 1
+  %tmp3023 = getelementptr inbounds float* %tmp3022, i64 1
+  %tmp3024 = getelementptr inbounds float* %tmp3023, i64 1
+  %tmp3025 = getelementptr inbounds float* %tmp3024, i64 1
+  %tmp3026 = getelementptr inbounds float* %tmp3025, i64 1
+  %tmp3027 = getelementptr inbounds float* %tmp3026, i64 1
+  %tmp3028 = getelementptr inbounds float* %tmp3027, i64 1
+  %tmp3029 = getelementptr inbounds float* %tmp3028, i64 1
+  %tmp3030 = getelementptr inbounds float* %tmp3029, i64 1
+  %tmp3031 = getelementptr inbounds float* %tmp3030, i64 1
+  %tmp3032 = getelementptr inbounds float* %tmp3031, i64 1
+  %tmp3033 = getelementptr inbounds float* %tmp3032, i64 1
+  %tmp3034 = getelementptr inbounds float* %tmp3033, i64 1
+  %tmp3035 = getelementptr inbounds float* %tmp3034, i64 1
+  %tmp3036 = getelementptr inbounds float* %tmp3035, i64 1
+  %tmp3037 = getelementptr inbounds float* %tmp3036, i64 1
+  %tmp3038 = getelementptr inbounds float* %tmp3037, i64 1
+  %tmp3039 = getelementptr inbounds float* %tmp3038, i64 1
+  %tmp3040 = getelementptr inbounds float* %tmp3039, i64 1
+  %tmp3041 = getelementptr inbounds float* %tmp3040, i64 1
+  %tmp3042 = getelementptr inbounds float* %tmp3041, i64 1
+  %tmp3043 = getelementptr inbounds float* %tmp3042, i64 1
+  %tmp3044 = getelementptr inbounds float* %tmp3043, i64 1
+  %tmp3045 = getelementptr inbounds float* %tmp3044, i64 1
+  %tmp3046 = getelementptr inbounds float* %tmp3045, i64 1
+  %tmp3047 = getelementptr inbounds float* %tmp3046, i64 1
+  %tmp3048 = getelementptr inbounds float* %tmp3047, i64 1
+  %tmp3049 = getelementptr inbounds float* %tmp3048, i64 1
+  %tmp3050 = getelementptr inbounds float* %tmp3049, i64 1
+  %tmp3051 = getelementptr inbounds float* %tmp3050, i64 1
+  %tmp3052 = getelementptr inbounds float* %tmp3051, i64 1
+  %tmp3053 = getelementptr inbounds float* %tmp3052, i64 1
+  %tmp3054 = getelementptr inbounds float* %tmp3053, i64 1
+  %tmp3055 = getelementptr inbounds float* %tmp3054, i64 1
+  %tmp3056 = getelementptr inbounds float* %tmp3055, i64 1
+  %tmp3057 = getelementptr inbounds float* %tmp3056, i64 1
+  %tmp3058 = getelementptr inbounds float* %tmp3057, i64 1
+  %tmp3059 = getelementptr inbounds float* %tmp3058, i64 1
+  %tmp3060 = getelementptr inbounds float* %tmp3059, i64 1
+  %tmp3061 = getelementptr inbounds float* %tmp3060, i64 1
+  %tmp3062 = getelementptr inbounds float* %tmp3061, i64 1
+  %tmp3063 = getelementptr inbounds float* %tmp3062, i64 1
+  %tmp3064 = getelementptr inbounds float* %tmp3063, i64 1
+  %tmp3065 = getelementptr inbounds float* %tmp3064, i64 1
+  %tmp3066 = getelementptr inbounds float* %tmp3065, i64 1
+  %tmp3067 = getelementptr inbounds float* %tmp3066, i64 1
+  %tmp3068 = getelementptr inbounds float* %tmp3067, i64 1
+  %tmp3069 = getelementptr inbounds float* %tmp3068, i64 1
+  %tmp3070 = getelementptr inbounds float* %tmp3069, i64 1
+  %tmp3071 = getelementptr inbounds float* %tmp3070, i64 1
+  %tmp3072 = getelementptr inbounds float* %tmp3071, i64 1
+  %tmp3073 = getelementptr inbounds float* %tmp3072, i64 1
+  %tmp3074 = getelementptr inbounds float* %tmp3073, i64 1
+  %tmp3075 = getelementptr inbounds float* %tmp3074, i64 1
+  %tmp3076 = getelementptr inbounds float* %tmp3075, i64 1
+  %tmp3077 = getelementptr inbounds float* %tmp3076, i64 1
+  %tmp3078 = getelementptr inbounds float* %tmp3077, i64 1
+  %tmp3079 = getelementptr inbounds float* %tmp3078, i64 1
+  %tmp3080 = getelementptr inbounds float* %tmp3079, i64 1
+  %tmp3081 = getelementptr inbounds float* %tmp3080, i64 1
+  %tmp3082 = getelementptr inbounds float* %tmp3081, i64 1
+  %tmp3083 = getelementptr inbounds float* %tmp3082, i64 1
+  %tmp3084 = getelementptr inbounds float* %tmp3083, i64 1
+  %tmp3085 = getelementptr inbounds float* %tmp3084, i64 1
+  %tmp3086 = getelementptr inbounds float* %tmp3085, i64 1
+  %tmp3087 = getelementptr inbounds float* %tmp3086, i64 1
+  %tmp3088 = getelementptr inbounds float* %tmp3087, i64 1
+  %tmp3089 = getelementptr inbounds float* %tmp3088, i64 1
+  %tmp3090 = getelementptr inbounds float* %tmp3089, i64 1
+  %tmp3091 = getelementptr inbounds float* %tmp3090, i64 1
+  %tmp3092 = getelementptr inbounds float* %tmp3091, i64 1
+  %tmp3093 = getelementptr inbounds float* %tmp3092, i64 1
+  %tmp3094 = getelementptr inbounds float* %tmp3093, i64 1
+  %tmp3095 = getelementptr inbounds float* %tmp3094, i64 1
+  %tmp3096 = getelementptr inbounds float* %tmp3095, i64 1
+  %tmp3097 = getelementptr inbounds float* %tmp3096, i64 1
+  %tmp3098 = getelementptr inbounds float* %tmp3097, i64 1
+  %tmp3099 = getelementptr inbounds float* %tmp3098, i64 1
+  %tmp3100 = getelementptr inbounds float* %tmp3099, i64 1
+  %tmp3101 = getelementptr inbounds float* %tmp3100, i64 1
+  %tmp3102 = getelementptr inbounds float* %tmp3101, i64 1
+  %tmp3103 = getelementptr inbounds float* %tmp3102, i64 1
+  %tmp3104 = getelementptr inbounds float* %tmp3103, i64 1
+  %tmp3105 = getelementptr inbounds float* %tmp3104, i64 1
+  %tmp3106 = getelementptr inbounds float* %tmp3105, i64 1
+  %tmp3107 = getelementptr inbounds float* %tmp3106, i64 1
+  %tmp3108 = getelementptr inbounds float* %tmp3107, i64 1
+  %tmp3109 = getelementptr inbounds float* %tmp3108, i64 1
+  %tmp3110 = getelementptr inbounds float* %tmp3109, i64 1
+  %tmp3111 = getelementptr inbounds float* %tmp3110, i64 1
+  %tmp3112 = getelementptr inbounds float* %tmp3111, i64 1
+  %tmp3113 = getelementptr inbounds float* %tmp3112, i64 1
+  %tmp3114 = getelementptr inbounds float* %tmp3113, i64 1
+  %tmp3115 = getelementptr inbounds float* %tmp3114, i64 1
+  %tmp3116 = getelementptr inbounds float* %tmp3115, i64 1
+  %tmp3117 = getelementptr inbounds float* %tmp3116, i64 1
+  %tmp3118 = getelementptr inbounds float* %tmp3117, i64 1
+  %tmp3119 = getelementptr inbounds float* %tmp3118, i64 1
+  %tmp3120 = getelementptr inbounds float* %tmp3119, i64 1
+  %tmp3121 = getelementptr inbounds float* %tmp3120, i64 1
+  %tmp3122 = getelementptr inbounds float* %tmp3121, i64 1
+  %tmp3123 = getelementptr inbounds float* %tmp3122, i64 1
+  %tmp3124 = getelementptr inbounds float* %tmp3123, i64 1
+  %tmp3125 = getelementptr inbounds float* %tmp3124, i64 1
+  %tmp3126 = getelementptr inbounds float* %tmp3125, i64 1
+  %tmp3127 = getelementptr inbounds float* %tmp3126, i64 1
+  %tmp3128 = getelementptr inbounds float* %tmp3127, i64 1
+  %tmp3129 = getelementptr inbounds float* %tmp3128, i64 1
+  %tmp3130 = getelementptr inbounds float* %tmp3129, i64 1
+  %tmp3131 = getelementptr inbounds float* %tmp3130, i64 1
+  %tmp3132 = getelementptr inbounds float* %tmp3131, i64 1
+  %tmp3133 = getelementptr inbounds float* %tmp3132, i64 1
+  %tmp3134 = getelementptr inbounds float* %tmp3133, i64 1
+  %tmp3135 = getelementptr inbounds float* %tmp3134, i64 1
+  %tmp3136 = getelementptr inbounds float* %tmp3135, i64 1
+  %tmp3137 = getelementptr inbounds float* %tmp3136, i64 1
+  %tmp3138 = getelementptr inbounds float* %tmp3137, i64 1
+  %tmp3139 = getelementptr inbounds float* %tmp3138, i64 1
+  %tmp3140 = getelementptr inbounds float* %tmp3139, i64 1
+  %tmp3141 = getelementptr inbounds float* %tmp3140, i64 1
+  %tmp3142 = getelementptr inbounds float* %tmp3141, i64 1
+  %tmp3143 = getelementptr inbounds float* %tmp3142, i64 1
+  %tmp3144 = getelementptr inbounds float* %tmp3143, i64 1
+  %tmp3145 = getelementptr inbounds float* %tmp3144, i64 1
+  %tmp3146 = getelementptr inbounds float* %tmp3145, i64 1
+  %tmp3147 = getelementptr inbounds float* %tmp3146, i64 1
+  %tmp3148 = getelementptr inbounds float* %tmp3147, i64 1
+  %tmp3149 = getelementptr inbounds float* %tmp3148, i64 1
+  %tmp3150 = getelementptr inbounds float* %tmp3149, i64 1
+  %tmp3151 = getelementptr inbounds float* %tmp3150, i64 1
+  %tmp3152 = getelementptr inbounds float* %tmp3151, i64 1
+  %tmp3153 = getelementptr inbounds float* %tmp3152, i64 1
+  %tmp3154 = getelementptr inbounds float* %tmp3153, i64 1
+  %tmp3155 = getelementptr inbounds float* %tmp3154, i64 1
+  %tmp3156 = getelementptr inbounds float* %tmp3155, i64 1
+  %tmp3157 = getelementptr inbounds float* %tmp3156, i64 1
+  %tmp3158 = getelementptr inbounds float* %tmp3157, i64 1
+  %tmp3159 = getelementptr inbounds float* %tmp3158, i64 1
+  %tmp3160 = getelementptr inbounds float* %tmp3159, i64 1
+  %tmp3161 = getelementptr inbounds float* %tmp3160, i64 1
+  %tmp3162 = getelementptr inbounds float* %tmp3161, i64 1
+  %tmp3163 = getelementptr inbounds float* %tmp3162, i64 1
+  %tmp3164 = getelementptr inbounds float* %tmp3163, i64 1
+  %tmp3165 = getelementptr inbounds float* %tmp3164, i64 1
+  %tmp3166 = getelementptr inbounds float* %tmp3165, i64 1
+  %tmp3167 = getelementptr inbounds float* %tmp3166, i64 1
+  %tmp3168 = getelementptr inbounds float* %tmp3167, i64 1
+  %tmp3169 = getelementptr inbounds float* %tmp3168, i64 1
+  %tmp3170 = getelementptr inbounds float* %tmp3169, i64 1
+  %tmp3171 = getelementptr inbounds float* %tmp3170, i64 1
+  %tmp3172 = getelementptr inbounds float* %tmp3171, i64 1
+  %tmp3173 = getelementptr inbounds float* %tmp3172, i64 1
+  %tmp3174 = getelementptr inbounds float* %tmp3173, i64 1
+  %tmp3175 = getelementptr inbounds float* %tmp3174, i64 1
+  %tmp3176 = getelementptr inbounds float* %tmp3175, i64 1
+  %tmp3177 = getelementptr inbounds float* %tmp3176, i64 1
+  %tmp3178 = getelementptr inbounds float* %tmp3177, i64 1
+  %tmp3179 = getelementptr inbounds float* %tmp3178, i64 1
+  %tmp3180 = getelementptr inbounds float* %tmp3179, i64 1
+  %tmp3181 = getelementptr inbounds float* %tmp3180, i64 1
+  %tmp3182 = getelementptr inbounds float* %tmp3181, i64 1
+  %tmp3183 = getelementptr inbounds float* %tmp3182, i64 1
+  %tmp3184 = getelementptr inbounds float* %tmp3183, i64 1
+  %tmp3185 = getelementptr inbounds float* %tmp3184, i64 1
+  %tmp3186 = getelementptr inbounds float* %tmp3185, i64 1
+  %tmp3187 = getelementptr inbounds float* %tmp3186, i64 1
+  %tmp3188 = getelementptr inbounds float* %tmp3187, i64 1
+  %tmp3189 = getelementptr inbounds float* %tmp3188, i64 1
+  %tmp3190 = getelementptr inbounds float* %tmp3189, i64 1
+  %tmp3191 = getelementptr inbounds float* %tmp3190, i64 1
+  %tmp3192 = getelementptr inbounds float* %tmp3191, i64 1
+  %tmp3193 = getelementptr inbounds float* %tmp3192, i64 1
+  %tmp3194 = getelementptr inbounds float* %tmp3193, i64 1
+  %tmp3195 = getelementptr inbounds float* %tmp3194, i64 1
+  %tmp3196 = getelementptr inbounds float* %tmp3195, i64 1
+  %tmp3197 = getelementptr inbounds float* %tmp3196, i64 1
+  %tmp3198 = getelementptr inbounds float* %tmp3197, i64 1
+  %tmp3199 = getelementptr inbounds float* %tmp3198, i64 1
+  %tmp3200 = getelementptr inbounds float* %tmp3199, i64 1
+  %tmp3201 = getelementptr inbounds float* %tmp3200, i64 1
+  %tmp3202 = getelementptr inbounds float* %tmp3201, i64 1
+  %tmp3203 = getelementptr inbounds float* %tmp3202, i64 1
+  %tmp3204 = getelementptr inbounds float* %tmp3203, i64 1
+  %tmp3205 = getelementptr inbounds float* %tmp3204, i64 1
+  %tmp3206 = getelementptr inbounds float* %tmp3205, i64 1
+  %tmp3207 = getelementptr inbounds float* %tmp3206, i64 1
+  %tmp3208 = getelementptr inbounds float* %tmp3207, i64 1
+  %tmp3209 = getelementptr inbounds float* %tmp3208, i64 1
+  %tmp3210 = getelementptr inbounds float* %tmp3209, i64 1
+  %tmp3211 = getelementptr inbounds float* %tmp3210, i64 1
+  %tmp3212 = getelementptr inbounds float* %tmp3211, i64 1
+  %tmp3213 = getelementptr inbounds float* %tmp3212, i64 1
+  %tmp3214 = getelementptr inbounds float* %tmp3213, i64 1
+  %tmp3215 = getelementptr inbounds float* %tmp3214, i64 1
+  %tmp3216 = getelementptr inbounds float* %tmp3215, i64 1
+  %tmp3217 = getelementptr inbounds float* %tmp3216, i64 1
+  %tmp3218 = getelementptr inbounds float* %tmp3217, i64 1
+  %tmp3219 = getelementptr inbounds float* %tmp3218, i64 1
+  %tmp3220 = getelementptr inbounds float* %tmp3219, i64 1
+  %tmp3221 = getelementptr inbounds float* %tmp3220, i64 1
+  %tmp3222 = getelementptr inbounds float* %tmp3221, i64 1
+  %tmp3223 = getelementptr inbounds float* %tmp3222, i64 1
+  %tmp3224 = getelementptr inbounds float* %tmp3223, i64 1
+  %tmp3225 = getelementptr inbounds float* %tmp3224, i64 1
+  %tmp3226 = getelementptr inbounds float* %tmp3225, i64 1
+  %tmp3227 = getelementptr inbounds float* %tmp3226, i64 1
+  %tmp3228 = getelementptr inbounds float* %tmp3227, i64 1
+  %tmp3229 = getelementptr inbounds float* %tmp3228, i64 1
+  %tmp3230 = getelementptr inbounds float* %tmp3229, i64 1
+  %tmp3231 = getelementptr inbounds float* %tmp3230, i64 1
+  %tmp3232 = getelementptr inbounds float* %tmp3231, i64 1
+  %tmp3233 = getelementptr inbounds float* %tmp3232, i64 1
+  %tmp3234 = getelementptr inbounds float* %tmp3233, i64 1
+  %tmp3235 = getelementptr inbounds float* %tmp3234, i64 1
+  %tmp3236 = getelementptr inbounds float* %tmp3235, i64 1
+  %tmp3237 = getelementptr inbounds float* %tmp3236, i64 1
+  %tmp3238 = getelementptr inbounds float* %tmp3237, i64 1
+  %tmp3239 = getelementptr inbounds float* %tmp3238, i64 1
+  %tmp3240 = getelementptr inbounds float* %tmp3239, i64 1
+  %tmp3241 = getelementptr inbounds float* %tmp3240, i64 1
+  %tmp3242 = getelementptr inbounds float* %tmp3241, i64 1
+  %tmp3243 = getelementptr inbounds float* %tmp3242, i64 1
+  %tmp3244 = getelementptr inbounds float* %tmp3243, i64 1
+  %tmp3245 = getelementptr inbounds float* %tmp3244, i64 1
+  %tmp3246 = getelementptr inbounds float* %tmp3245, i64 1
+  %tmp3247 = getelementptr inbounds float* %tmp3246, i64 1
+  %tmp3248 = getelementptr inbounds float* %tmp3247, i64 1
+  %tmp3249 = getelementptr inbounds float* %tmp3248, i64 1
+  %tmp3250 = getelementptr inbounds float* %tmp3249, i64 1
+  %tmp3251 = getelementptr inbounds float* %tmp3250, i64 1
+  %tmp3252 = getelementptr inbounds float* %tmp3251, i64 1
+  %tmp3253 = getelementptr inbounds float* %tmp3252, i64 1
+  %tmp3254 = getelementptr inbounds float* %tmp3253, i64 1
+  %tmp3255 = getelementptr inbounds float* %tmp3254, i64 1
+  %tmp3256 = getelementptr inbounds float* %tmp3255, i64 1
+  %tmp3257 = getelementptr inbounds float* %tmp3256, i64 1
+  %tmp3258 = getelementptr inbounds float* %tmp3257, i64 1
+  %tmp3259 = getelementptr inbounds float* %tmp3258, i64 1
+  %tmp3260 = getelementptr inbounds float* %tmp3259, i64 1
+  %tmp3261 = getelementptr inbounds float* %tmp3260, i64 1
+  %tmp3262 = getelementptr inbounds float* %tmp3261, i64 1
+  %tmp3263 = getelementptr inbounds float* %tmp3262, i64 1
+  %tmp3264 = getelementptr inbounds float* %tmp3263, i64 1
+  %tmp3265 = getelementptr inbounds float* %tmp3264, i64 1
+  %tmp3266 = getelementptr inbounds float* %tmp3265, i64 1
+  %tmp3267 = getelementptr inbounds float* %tmp3266, i64 1
+  %tmp3268 = getelementptr inbounds float* %tmp3267, i64 1
+  %tmp3269 = getelementptr inbounds float* %tmp3268, i64 1
+  %tmp3270 = getelementptr inbounds float* %tmp3269, i64 1
+  %tmp3271 = getelementptr inbounds float* %tmp3270, i64 1
+  %tmp3272 = getelementptr inbounds float* %tmp3271, i64 1
+  %tmp3273 = getelementptr inbounds float* %tmp3272, i64 1
+  %tmp3274 = getelementptr inbounds float* %tmp3273, i64 1
+  %tmp3275 = getelementptr inbounds float* %tmp3274, i64 1
+  %tmp3276 = getelementptr inbounds float* %tmp3275, i64 1
+  %tmp3277 = getelementptr inbounds float* %tmp3276, i64 1
+  %tmp3278 = getelementptr inbounds float* %tmp3277, i64 1
+  %tmp3279 = getelementptr inbounds float* %tmp3278, i64 1
+  %tmp3280 = getelementptr inbounds float* %tmp3279, i64 1
+  %tmp3281 = getelementptr inbounds float* %tmp3280, i64 1
+  %tmp3282 = getelementptr inbounds float* %tmp3281, i64 1
+  %tmp3283 = getelementptr inbounds float* %tmp3282, i64 1
+  %tmp3284 = getelementptr inbounds float* %tmp3283, i64 1
+  %tmp3285 = getelementptr inbounds float* %tmp3284, i64 1
+  %tmp3286 = getelementptr inbounds float* %tmp3285, i64 1
+  %tmp3287 = getelementptr inbounds float* %tmp3286, i64 1
+  %tmp3288 = getelementptr inbounds float* %tmp3287, i64 1
+  %tmp3289 = getelementptr inbounds float* %tmp3288, i64 1
+  %tmp3290 = getelementptr inbounds float* %tmp3289, i64 1
+  %tmp3291 = getelementptr inbounds float* %tmp3290, i64 1
+  %tmp3292 = getelementptr inbounds float* %tmp3291, i64 1
+  %tmp3293 = getelementptr inbounds float* %tmp3292, i64 1
+  %tmp3294 = getelementptr inbounds float* %tmp3293, i64 1
+  %tmp3295 = getelementptr inbounds float* %tmp3294, i64 1
+  %tmp3296 = getelementptr inbounds float* %tmp3295, i64 1
+  %tmp3297 = getelementptr inbounds float* %tmp3296, i64 1
+  %tmp3298 = getelementptr inbounds float* %tmp3297, i64 1
+  %tmp3299 = getelementptr inbounds float* %tmp3298, i64 1
+  %tmp3300 = getelementptr inbounds float* %tmp3299, i64 1
+  %tmp3301 = getelementptr inbounds float* %tmp3300, i64 1
+  %tmp3302 = getelementptr inbounds float* %tmp3301, i64 1
+  %tmp3303 = getelementptr inbounds float* %tmp3302, i64 1
+  %tmp3304 = getelementptr inbounds float* %tmp3303, i64 1
+  %tmp3305 = getelementptr inbounds float* %tmp3304, i64 1
+  %tmp3306 = getelementptr inbounds float* %tmp3305, i64 1
+  %tmp3307 = getelementptr inbounds float* %tmp3306, i64 1
+  %tmp3308 = getelementptr inbounds float* %tmp3307, i64 1
+  %tmp3309 = getelementptr inbounds float* %tmp3308, i64 1
+  %tmp3310 = getelementptr inbounds float* %tmp3309, i64 1
+  %tmp3311 = getelementptr inbounds float* %tmp3310, i64 1
+  %tmp3312 = getelementptr inbounds float* %tmp3311, i64 1
+  %tmp3313 = getelementptr inbounds float* %tmp3312, i64 1
+  %tmp3314 = getelementptr inbounds float* %tmp3313, i64 1
+  %tmp3315 = getelementptr inbounds float* %tmp3314, i64 1
+  %tmp3316 = getelementptr inbounds float* %tmp3315, i64 1
+  %tmp3317 = getelementptr inbounds float* %tmp3316, i64 1
+  %tmp3318 = getelementptr inbounds float* %tmp3317, i64 1
+  %tmp3319 = getelementptr inbounds float* %tmp3318, i64 1
+  %tmp3320 = getelementptr inbounds float* %tmp3319, i64 1
+  %tmp3321 = getelementptr inbounds float* %tmp3320, i64 1
+  %tmp3322 = getelementptr inbounds float* %tmp3321, i64 1
+  %tmp3323 = getelementptr inbounds float* %tmp3322, i64 1
+  %tmp3324 = getelementptr inbounds float* %tmp3323, i64 1
+  %tmp3325 = getelementptr inbounds float* %tmp3324, i64 1
+  %tmp3326 = getelementptr inbounds float* %tmp3325, i64 1
+  %tmp3327 = getelementptr inbounds float* %tmp3326, i64 1
+  %tmp3328 = getelementptr inbounds float* %tmp3327, i64 1
+  %tmp3329 = getelementptr inbounds float* %tmp3328, i64 1
+  %tmp3330 = getelementptr inbounds float* %tmp3329, i64 1
+  %tmp3331 = getelementptr inbounds float* %tmp3330, i64 1
+  %tmp3332 = getelementptr inbounds float* %tmp3331, i64 1
+  %tmp3333 = getelementptr inbounds float* %tmp3332, i64 1
+  %tmp3334 = getelementptr inbounds float* %tmp3333, i64 1
+  %tmp3335 = getelementptr inbounds float* %tmp3334, i64 1
+  %tmp3336 = getelementptr inbounds float* %tmp3335, i64 1
+  %tmp3337 = getelementptr inbounds float* %tmp3336, i64 1
+  %tmp3338 = getelementptr inbounds float* %tmp3337, i64 1
+  %tmp3339 = getelementptr inbounds float* %tmp3338, i64 1
+  %tmp3340 = getelementptr inbounds float* %tmp3339, i64 1
+  %tmp3341 = getelementptr inbounds float* %tmp3340, i64 1
+  %tmp3342 = getelementptr inbounds float* %tmp3341, i64 1
+  %tmp3343 = getelementptr inbounds float* %tmp3342, i64 1
+  %tmp3344 = getelementptr inbounds float* %tmp3343, i64 1
+  %tmp3345 = getelementptr inbounds float* %tmp3344, i64 1
+  %tmp3346 = getelementptr inbounds float* %tmp3345, i64 1
+  %tmp3347 = getelementptr inbounds float* %tmp3346, i64 1
+  %tmp3348 = getelementptr inbounds float* %tmp3347, i64 1
+  %tmp3349 = getelementptr inbounds float* %tmp3348, i64 1
+  %tmp3350 = getelementptr inbounds float* %tmp3349, i64 1
+  %tmp3351 = getelementptr inbounds float* %tmp3350, i64 1
+  %tmp3352 = getelementptr inbounds float* %tmp3351, i64 1
+  %tmp3353 = getelementptr inbounds float* %tmp3352, i64 1
+  %tmp3354 = getelementptr inbounds float* %tmp3353, i64 1
+  %tmp3355 = getelementptr inbounds float* %tmp3354, i64 1
+  %tmp3356 = getelementptr inbounds float* %tmp3355, i64 1
+  %tmp3357 = getelementptr inbounds float* %tmp3356, i64 1
+  %tmp3358 = getelementptr inbounds float* %tmp3357, i64 1
+  %tmp3359 = getelementptr inbounds float* %tmp3358, i64 1
+  %tmp3360 = getelementptr inbounds float* %tmp3359, i64 1
+  %tmp3361 = getelementptr inbounds float* %tmp3360, i64 1
+  %tmp3362 = getelementptr inbounds float* %tmp3361, i64 1
+  %tmp3363 = getelementptr inbounds float* %tmp3362, i64 1
+  %tmp3364 = getelementptr inbounds float* %tmp3363, i64 1
+  %tmp3365 = getelementptr inbounds float* %tmp3364, i64 1
+  %tmp3366 = getelementptr inbounds float* %tmp3365, i64 1
+  %tmp3367 = getelementptr inbounds float* %tmp3366, i64 1
+  %tmp3368 = getelementptr inbounds float* %tmp3367, i64 1
+  %tmp3369 = getelementptr inbounds float* %tmp3368, i64 1
+  %tmp3370 = getelementptr inbounds float* %tmp3369, i64 1
+  %tmp3371 = getelementptr inbounds float* %tmp3370, i64 1
+  %tmp3372 = getelementptr inbounds float* %tmp3371, i64 1
+  %tmp3373 = getelementptr inbounds float* %tmp3372, i64 1
+  %tmp3374 = getelementptr inbounds float* %tmp3373, i64 1
+  %tmp3375 = getelementptr inbounds float* %tmp3374, i64 1
+  %tmp3376 = getelementptr inbounds float* %tmp3375, i64 1
+  %tmp3377 = getelementptr inbounds float* %tmp3376, i64 1
+  %tmp3378 = getelementptr inbounds float* %tmp3377, i64 1
+  %tmp3379 = getelementptr inbounds float* %tmp3378, i64 1
+  %tmp3380 = getelementptr inbounds float* %tmp3379, i64 1
+  %tmp3381 = getelementptr inbounds float* %tmp3380, i64 1
+  %tmp3382 = getelementptr inbounds float* %tmp3381, i64 1
+  %tmp3383 = getelementptr inbounds float* %tmp3382, i64 1
+  %tmp3384 = getelementptr inbounds float* %tmp3383, i64 1
+  %tmp3385 = getelementptr inbounds float* %tmp3384, i64 1
+  %tmp3386 = getelementptr inbounds float* %tmp3385, i64 1
+  %tmp3387 = getelementptr inbounds float* %tmp3386, i64 1
+  %tmp3388 = getelementptr inbounds float* %tmp3387, i64 1
+  %tmp3389 = getelementptr inbounds float* %tmp3388, i64 1
+  %tmp3390 = getelementptr inbounds float* %tmp3389, i64 1
+  %tmp3391 = getelementptr inbounds float* %tmp3390, i64 1
+  %tmp3392 = getelementptr inbounds float* %tmp3391, i64 1
+  %tmp3393 = getelementptr inbounds float* %tmp3392, i64 1
+  %tmp3394 = getelementptr inbounds float* %tmp3393, i64 1
+  %tmp3395 = getelementptr inbounds float* %tmp3394, i64 1
+  %tmp3396 = getelementptr inbounds float* %tmp3395, i64 1
+  %tmp3397 = getelementptr inbounds float* %tmp3396, i64 1
+  %tmp3398 = getelementptr inbounds float* %tmp3397, i64 1
+  %tmp3399 = getelementptr inbounds float* %tmp3398, i64 1
+  %tmp3400 = getelementptr inbounds float* %tmp3399, i64 1
+  %tmp3401 = getelementptr inbounds float* %tmp3400, i64 1
+  %tmp3402 = getelementptr inbounds float* %tmp3401, i64 1
+  %tmp3403 = getelementptr inbounds float* %tmp3402, i64 1
+  %tmp3404 = getelementptr inbounds float* %tmp3403, i64 1
+  %tmp3405 = getelementptr inbounds float* %tmp3404, i64 1
+  %tmp3406 = getelementptr inbounds float* %tmp3405, i64 1
+  %tmp3407 = getelementptr inbounds float* %tmp3406, i64 1
+  %tmp3408 = getelementptr inbounds float* %tmp3407, i64 1
+  %tmp3409 = getelementptr inbounds float* %tmp3408, i64 1
+  %tmp3410 = getelementptr inbounds float* %tmp3409, i64 1
+  %tmp3411 = getelementptr inbounds float* %tmp3410, i64 1
+  %tmp3412 = getelementptr inbounds float* %tmp3411, i64 1
+  %tmp3413 = getelementptr inbounds float* %tmp3412, i64 1
+  %tmp3414 = getelementptr inbounds float* %tmp3413, i64 1
+  %tmp3415 = getelementptr inbounds float* %tmp3414, i64 1
+  %tmp3416 = getelementptr inbounds float* %tmp3415, i64 1
+  %tmp3417 = getelementptr inbounds float* %tmp3416, i64 1
+  %tmp3418 = getelementptr inbounds float* %tmp3417, i64 1
+  %tmp3419 = getelementptr inbounds float* %tmp3418, i64 1
+  %tmp3420 = getelementptr inbounds float* %tmp3419, i64 1
+  %tmp3421 = getelementptr inbounds float* %tmp3420, i64 1
+  %tmp3422 = getelementptr inbounds float* %tmp3421, i64 1
+  %tmp3423 = getelementptr inbounds float* %tmp3422, i64 1
+  %tmp3424 = getelementptr inbounds float* %tmp3423, i64 1
+  %tmp3425 = getelementptr inbounds float* %tmp3424, i64 1
+  %tmp3426 = getelementptr inbounds float* %tmp3425, i64 1
+  %tmp3427 = getelementptr inbounds float* %tmp3426, i64 1
+  %tmp3428 = getelementptr inbounds float* %tmp3427, i64 1
+  %tmp3429 = getelementptr inbounds float* %tmp3428, i64 1
+  %tmp3430 = getelementptr inbounds float* %tmp3429, i64 1
+  %tmp3431 = getelementptr inbounds float* %tmp3430, i64 1
+  %tmp3432 = getelementptr inbounds float* %tmp3431, i64 1
+  %tmp3433 = getelementptr inbounds float* %tmp3432, i64 1
+  %tmp3434 = getelementptr inbounds float* %tmp3433, i64 1
+  %tmp3435 = getelementptr inbounds float* %tmp3434, i64 1
+  %tmp3436 = getelementptr inbounds float* %tmp3435, i64 1
+  %tmp3437 = getelementptr inbounds float* %tmp3436, i64 1
+  %tmp3438 = getelementptr inbounds float* %tmp3437, i64 1
+  %tmp3439 = getelementptr inbounds float* %tmp3438, i64 1
+  %tmp3440 = getelementptr inbounds float* %tmp3439, i64 1
+  %tmp3441 = getelementptr inbounds float* %tmp3440, i64 1
+  %tmp3442 = getelementptr inbounds float* %tmp3441, i64 1
+  %tmp3443 = getelementptr inbounds float* %tmp3442, i64 1
+  %tmp3444 = getelementptr inbounds float* %tmp3443, i64 1
+  %tmp3445 = getelementptr inbounds float* %tmp3444, i64 1
+  %tmp3446 = getelementptr inbounds float* %tmp3445, i64 1
+  %tmp3447 = getelementptr inbounds float* %tmp3446, i64 1
+  %tmp3448 = getelementptr inbounds float* %tmp3447, i64 1
+  %tmp3449 = getelementptr inbounds float* %tmp3448, i64 1
+  %tmp3450 = getelementptr inbounds float* %tmp3449, i64 1
+  %tmp3451 = getelementptr inbounds float* %tmp3450, i64 1
+  %tmp3452 = getelementptr inbounds float* %tmp3451, i64 1
+  %tmp3453 = getelementptr inbounds float* %tmp3452, i64 1
+  %tmp3454 = getelementptr inbounds float* %tmp3453, i64 1
+  %tmp3455 = getelementptr inbounds float* %tmp3454, i64 1
+  %tmp3456 = getelementptr inbounds float* %tmp3455, i64 1
+  %tmp3457 = getelementptr inbounds float* %tmp3456, i64 1
+  %tmp3458 = getelementptr inbounds float* %tmp3457, i64 1
+  %tmp3459 = getelementptr inbounds float* %tmp3458, i64 1
+  %tmp3460 = getelementptr inbounds float* %tmp3459, i64 1
+  %tmp3461 = getelementptr inbounds float* %tmp3460, i64 1
+  %tmp3462 = getelementptr inbounds float* %tmp3461, i64 1
+  %tmp3463 = getelementptr inbounds float* %tmp3462, i64 1
+  %tmp3464 = getelementptr inbounds float* %tmp3463, i64 1
+  %tmp3465 = getelementptr inbounds float* %tmp3464, i64 1
+  %tmp3466 = getelementptr inbounds float* %tmp3465, i64 1
+  %tmp3467 = getelementptr inbounds float* %tmp3466, i64 1
+  %tmp3468 = getelementptr inbounds float* %tmp3467, i64 1
+  %tmp3469 = getelementptr inbounds float* %tmp3468, i64 1
+  %tmp3470 = getelementptr inbounds float* %tmp3469, i64 1
+  %tmp3471 = getelementptr inbounds float* %tmp3470, i64 1
+  %tmp3472 = getelementptr inbounds float* %tmp3471, i64 1
+  %tmp3473 = getelementptr inbounds float* %tmp3472, i64 1
+  %tmp3474 = getelementptr inbounds float* %tmp3473, i64 1
+  %tmp3475 = getelementptr inbounds float* %tmp3474, i64 1
+  %tmp3476 = getelementptr inbounds float* %tmp3475, i64 1
+  %tmp3477 = getelementptr inbounds float* %tmp3476, i64 1
+  %tmp3478 = getelementptr inbounds float* %tmp3477, i64 1
+  %tmp3479 = getelementptr inbounds float* %tmp3478, i64 1
+  %tmp3480 = getelementptr inbounds float* %tmp3479, i64 1
+  %tmp3481 = getelementptr inbounds float* %tmp3480, i64 1
+  %tmp3482 = getelementptr inbounds float* %tmp3481, i64 1
+  %tmp3483 = getelementptr inbounds float* %tmp3482, i64 1
+  %tmp3484 = getelementptr inbounds float* %tmp3483, i64 1
+  %tmp3485 = getelementptr inbounds float* %tmp3484, i64 1
+  %tmp3486 = getelementptr inbounds float* %tmp3485, i64 1
+  %tmp3487 = getelementptr inbounds float* %tmp3486, i64 1
+  %tmp3488 = getelementptr inbounds float* %tmp3487, i64 1
+  %tmp3489 = getelementptr inbounds float* %tmp3488, i64 1
+  %tmp3490 = getelementptr inbounds float* %tmp3489, i64 1
+  %tmp3491 = getelementptr inbounds float* %tmp3490, i64 1
+  %tmp3492 = getelementptr inbounds float* %tmp3491, i64 1
+  %tmp3493 = getelementptr inbounds float* %tmp3492, i64 1
+  %tmp3494 = getelementptr inbounds float* %tmp3493, i64 1
+  %tmp3495 = getelementptr inbounds float* %tmp3494, i64 1
+  %tmp3496 = getelementptr inbounds float* %tmp3495, i64 1
+  %tmp3497 = getelementptr inbounds float* %tmp3496, i64 1
+  %tmp3498 = getelementptr inbounds float* %tmp3497, i64 1
+  %tmp3499 = getelementptr inbounds float* %tmp3498, i64 1
+  %tmp3500 = getelementptr inbounds float* %tmp3499, i64 1
+  %tmp3501 = getelementptr inbounds float* %tmp3500, i64 1
+  %tmp3502 = getelementptr inbounds float* %tmp3501, i64 1
+  %tmp3503 = getelementptr inbounds float* %tmp3502, i64 1
+  %tmp3504 = getelementptr inbounds float* %tmp3503, i64 1
+  %tmp3505 = getelementptr inbounds float* %tmp3504, i64 1
+  %tmp3506 = getelementptr inbounds float* %tmp3505, i64 1
+  %tmp3507 = getelementptr inbounds float* %tmp3506, i64 1
+  %tmp3508 = getelementptr inbounds float* %tmp3507, i64 1
+  %tmp3509 = getelementptr inbounds float* %tmp3508, i64 1
+  %tmp3510 = getelementptr inbounds float* %tmp3509, i64 1
+  %tmp3511 = getelementptr inbounds float* %tmp3510, i64 1
+  %tmp3512 = getelementptr inbounds float* %tmp3511, i64 1
+  %tmp3513 = getelementptr inbounds float* %tmp3512, i64 1
+  %tmp3514 = getelementptr inbounds float* %tmp3513, i64 1
+  %tmp3515 = getelementptr inbounds float* %tmp3514, i64 1
+  %tmp3516 = getelementptr inbounds float* %tmp3515, i64 1
+  %tmp3517 = getelementptr inbounds float* %tmp3516, i64 1
+  %tmp3518 = getelementptr inbounds float* %tmp3517, i64 1
+  %tmp3519 = getelementptr inbounds float* %tmp3518, i64 1
+  %tmp3520 = getelementptr inbounds float* %tmp3519, i64 1
+  %tmp3521 = getelementptr inbounds float* %tmp3520, i64 1
+  %tmp3522 = getelementptr inbounds float* %tmp3521, i64 1
+  %tmp3523 = getelementptr inbounds float* %tmp3522, i64 1
+  %tmp3524 = getelementptr inbounds float* %tmp3523, i64 1
+  %tmp3525 = getelementptr inbounds float* %tmp3524, i64 1
+  %tmp3526 = getelementptr inbounds float* %tmp3525, i64 1
+  %tmp3527 = getelementptr inbounds float* %tmp3526, i64 1
+  %tmp3528 = getelementptr inbounds float* %tmp3527, i64 1
+  %tmp3529 = getelementptr inbounds float* %tmp3528, i64 1
+  %tmp3530 = getelementptr inbounds float* %tmp3529, i64 1
+  %tmp3531 = getelementptr inbounds float* %tmp3530, i64 1
+  %tmp3532 = getelementptr inbounds float* %tmp3531, i64 1
+  %tmp3533 = getelementptr inbounds float* %tmp3532, i64 1
+  %tmp3534 = getelementptr inbounds float* %tmp3533, i64 1
+  %tmp3535 = getelementptr inbounds float* %tmp3534, i64 1
+  %tmp3536 = getelementptr inbounds float* %tmp3535, i64 1
+  %tmp3537 = getelementptr inbounds float* %tmp3536, i64 1
+  %tmp3538 = getelementptr inbounds float* %tmp3537, i64 1
+  %tmp3539 = getelementptr inbounds float* %tmp3538, i64 1
+  %tmp3540 = getelementptr inbounds float* %tmp3539, i64 1
+  %tmp3541 = getelementptr inbounds float* %tmp3540, i64 1
+  %tmp3542 = getelementptr inbounds float* %tmp3541, i64 1
+  %tmp3543 = getelementptr inbounds float* %tmp3542, i64 1
+  %tmp3544 = getelementptr inbounds float* %tmp3543, i64 1
+  %tmp3545 = getelementptr inbounds float* %tmp3544, i64 1
+  %tmp3546 = getelementptr inbounds float* %tmp3545, i64 1
+  %tmp3547 = getelementptr inbounds float* %tmp3546, i64 1
+  %tmp3548 = getelementptr inbounds float* %tmp3547, i64 1
+  %tmp3549 = getelementptr inbounds float* %tmp3548, i64 1
+  %tmp3550 = getelementptr inbounds float* %tmp3549, i64 1
+  %tmp3551 = getelementptr inbounds float* %tmp3550, i64 1
+  %tmp3552 = getelementptr inbounds float* %tmp3551, i64 1
+  %tmp3553 = getelementptr inbounds float* %tmp3552, i64 1
+  %tmp3554 = getelementptr inbounds float* %tmp3553, i64 1
+  %tmp3555 = getelementptr inbounds float* %tmp3554, i64 1
+  %tmp3556 = getelementptr inbounds float* %tmp3555, i64 1
+  %tmp3557 = getelementptr inbounds float* %tmp3556, i64 1
+  %tmp3558 = getelementptr inbounds float* %tmp3557, i64 1
+  %tmp3559 = getelementptr inbounds float* %tmp3558, i64 1
+  %tmp3560 = getelementptr inbounds float* %tmp3559, i64 1
+  %tmp3561 = getelementptr inbounds float* %tmp3560, i64 1
+  %tmp3562 = getelementptr inbounds float* %tmp3561, i64 1
+  %tmp3563 = getelementptr inbounds float* %tmp3562, i64 1
+  %tmp3564 = getelementptr inbounds float* %tmp3563, i64 1
+  %tmp3565 = getelementptr inbounds float* %tmp3564, i64 1
+  %tmp3566 = getelementptr inbounds float* %tmp3565, i64 1
+  %tmp3567 = getelementptr inbounds float* %tmp3566, i64 1
+  %tmp3568 = getelementptr inbounds float* %tmp3567, i64 1
+  %tmp3569 = getelementptr inbounds float* %tmp3568, i64 1
+  %tmp3570 = getelementptr inbounds float* %tmp3569, i64 1
+  %tmp3571 = getelementptr inbounds float* %tmp3570, i64 1
+  %tmp3572 = getelementptr inbounds float* %tmp3571, i64 1
+  %tmp3573 = getelementptr inbounds float* %tmp3572, i64 1
+  %tmp3574 = getelementptr inbounds float* %tmp3573, i64 1
+  %tmp3575 = getelementptr inbounds float* %tmp3574, i64 1
+  %tmp3576 = getelementptr inbounds float* %tmp3575, i64 1
+  %tmp3577 = getelementptr inbounds float* %tmp3576, i64 1
+  %tmp3578 = getelementptr inbounds float* %tmp3577, i64 1
+  %tmp3579 = getelementptr inbounds float* %tmp3578, i64 1
+  %tmp3580 = getelementptr inbounds float* %tmp3579, i64 1
+  %tmp3581 = getelementptr inbounds float* %tmp3580, i64 1
+  %tmp3582 = getelementptr inbounds float* %tmp3581, i64 1
+  %tmp3583 = getelementptr inbounds float* %tmp3582, i64 1
+  %tmp3584 = getelementptr inbounds float* %tmp3583, i64 1
+  %tmp3585 = getelementptr inbounds float* %tmp3584, i64 1
+  %tmp3586 = getelementptr inbounds float* %tmp3585, i64 1
+  %tmp3587 = getelementptr inbounds float* %tmp3586, i64 1
+  %tmp3588 = getelementptr inbounds float* %tmp3587, i64 1
+  %tmp3589 = getelementptr inbounds float* %tmp3588, i64 1
+  %tmp3590 = getelementptr inbounds float* %tmp3589, i64 1
+  %tmp3591 = getelementptr inbounds float* %tmp3590, i64 1
+  %tmp3592 = getelementptr inbounds float* %tmp3591, i64 1
+  %tmp3593 = getelementptr inbounds float* %tmp3592, i64 1
+  %tmp3594 = getelementptr inbounds float* %tmp3593, i64 1
+  %tmp3595 = getelementptr inbounds float* %tmp3594, i64 1
+  %tmp3596 = getelementptr inbounds float* %tmp3595, i64 1
+  %tmp3597 = getelementptr inbounds float* %tmp3596, i64 1
+  %tmp3598 = getelementptr inbounds float* %tmp3597, i64 1
+  %tmp3599 = getelementptr inbounds float* %tmp3598, i64 1
+  %tmp3600 = getelementptr inbounds float* %tmp3599, i64 1
+  %tmp3601 = getelementptr inbounds float* %tmp3600, i64 1
+  %tmp3602 = getelementptr inbounds float* %tmp3601, i64 1
+  %tmp3603 = getelementptr inbounds float* %tmp3602, i64 1
+  %tmp3604 = getelementptr inbounds float* %tmp3603, i64 1
+  %tmp3605 = getelementptr inbounds float* %tmp3604, i64 1
+  %tmp3606 = getelementptr inbounds float* %tmp3605, i64 1
+  %tmp3607 = getelementptr inbounds float* %tmp3606, i64 1
+  %tmp3608 = getelementptr inbounds float* %tmp3607, i64 1
+  %tmp3609 = getelementptr inbounds float* %tmp3608, i64 1
+  %tmp3610 = getelementptr inbounds float* %tmp3609, i64 1
+  %tmp3611 = getelementptr inbounds float* %tmp3610, i64 1
+  %tmp3612 = getelementptr inbounds float* %tmp3611, i64 1
+  %tmp3613 = getelementptr inbounds float* %tmp3612, i64 1
+  %tmp3614 = getelementptr inbounds float* %tmp3613, i64 1
+  %tmp3615 = getelementptr inbounds float* %tmp3614, i64 1
+  %tmp3616 = getelementptr inbounds float* %tmp3615, i64 1
+  %tmp3617 = getelementptr inbounds float* %tmp3616, i64 1
+  %tmp3618 = getelementptr inbounds float* %tmp3617, i64 1
+  %tmp3619 = getelementptr inbounds float* %tmp3618, i64 1
+  %tmp3620 = getelementptr inbounds float* %tmp3619, i64 1
+  %tmp3621 = getelementptr inbounds float* %tmp3620, i64 1
+  %tmp3622 = getelementptr inbounds float* %tmp3621, i64 1
+  %tmp3623 = getelementptr inbounds float* %tmp3622, i64 1
+  %tmp3624 = getelementptr inbounds float* %tmp3623, i64 1
+  %tmp3625 = getelementptr inbounds float* %tmp3624, i64 1
+  %tmp3626 = getelementptr inbounds float* %tmp3625, i64 1
+  %tmp3627 = getelementptr inbounds float* %tmp3626, i64 1
+  %tmp3628 = getelementptr inbounds float* %tmp3627, i64 1
+  %tmp3629 = getelementptr inbounds float* %tmp3628, i64 1
+  %tmp3630 = getelementptr inbounds float* %tmp3629, i64 1
+  %tmp3631 = getelementptr inbounds float* %tmp3630, i64 1
+  %tmp3632 = getelementptr inbounds float* %tmp3631, i64 1
+  %tmp3633 = getelementptr inbounds float* %tmp3632, i64 1
+  %tmp3634 = getelementptr inbounds float* %tmp3633, i64 1
+  %tmp3635 = getelementptr inbounds float* %tmp3634, i64 1
+  %tmp3636 = getelementptr inbounds float* %tmp3635, i64 1
+  %tmp3637 = getelementptr inbounds float* %tmp3636, i64 1
+  %tmp3638 = getelementptr inbounds float* %tmp3637, i64 1
+  %tmp3639 = getelementptr inbounds float* %tmp3638, i64 1
+  %tmp3640 = getelementptr inbounds float* %tmp3639, i64 1
+  %tmp3641 = getelementptr inbounds float* %tmp3640, i64 1
+  %tmp3642 = getelementptr inbounds float* %tmp3641, i64 1
+  %tmp3643 = getelementptr inbounds float* %tmp3642, i64 1
+  %tmp3644 = getelementptr inbounds float* %tmp3643, i64 1
+  %tmp3645 = getelementptr inbounds float* %tmp3644, i64 1
+  %tmp3646 = getelementptr inbounds float* %tmp3645, i64 1
+  %tmp3647 = getelementptr inbounds float* %tmp3646, i64 1
+  %tmp3648 = getelementptr inbounds float* %tmp3647, i64 1
+  %tmp3649 = getelementptr inbounds float* %tmp3648, i64 1
+  %tmp3650 = getelementptr inbounds float* %tmp3649, i64 1
+  %tmp3651 = getelementptr inbounds float* %tmp3650, i64 1
+  %tmp3652 = getelementptr inbounds float* %tmp3651, i64 1
+  %tmp3653 = getelementptr inbounds float* %tmp3652, i64 1
+  %tmp3654 = getelementptr inbounds float* %tmp3653, i64 1
+  %tmp3655 = getelementptr inbounds float* %tmp3654, i64 1
+  %tmp3656 = getelementptr inbounds float* %tmp3655, i64 1
+  %tmp3657 = getelementptr inbounds float* %tmp3656, i64 1
+  %tmp3658 = getelementptr inbounds float* %tmp3657, i64 1
+  %tmp3659 = getelementptr inbounds float* %tmp3658, i64 1
+  %tmp3660 = getelementptr inbounds float* %tmp3659, i64 1
+  %tmp3661 = getelementptr inbounds float* %tmp3660, i64 1
+  %tmp3662 = getelementptr inbounds float* %tmp3661, i64 1
+  %tmp3663 = getelementptr inbounds float* %tmp3662, i64 1
+  %tmp3664 = getelementptr inbounds float* %tmp3663, i64 1
+  %tmp3665 = getelementptr inbounds float* %tmp3664, i64 1
+  %tmp3666 = getelementptr inbounds float* %tmp3665, i64 1
+  %tmp3667 = getelementptr inbounds float* %tmp3666, i64 1
+  %tmp3668 = getelementptr inbounds float* %tmp3667, i64 1
+  %tmp3669 = getelementptr inbounds float* %tmp3668, i64 1
+  %tmp3670 = getelementptr inbounds float* %tmp3669, i64 1
+  %tmp3671 = getelementptr inbounds float* %tmp3670, i64 1
+  %tmp3672 = getelementptr inbounds float* %tmp3671, i64 1
+  %tmp3673 = getelementptr inbounds float* %tmp3672, i64 1
+  %tmp3674 = getelementptr inbounds float* %tmp3673, i64 1
+  %tmp3675 = getelementptr inbounds float* %tmp3674, i64 1
+  %tmp3676 = getelementptr inbounds float* %tmp3675, i64 1
+  %tmp3677 = getelementptr inbounds float* %tmp3676, i64 1
+  %tmp3678 = getelementptr inbounds float* %tmp3677, i64 1
+  %tmp3679 = getelementptr inbounds float* %tmp3678, i64 1
+  %tmp3680 = getelementptr inbounds float* %tmp3679, i64 1
+  %tmp3681 = getelementptr inbounds float* %tmp3680, i64 1
+  %tmp3682 = getelementptr inbounds float* %tmp3681, i64 1
+  %tmp3683 = getelementptr inbounds float* %tmp3682, i64 1
+  %tmp3684 = getelementptr inbounds float* %tmp3683, i64 1
+  %tmp3685 = getelementptr inbounds float* %tmp3684, i64 1
+  %tmp3686 = getelementptr inbounds float* %tmp3685, i64 1
+  %tmp3687 = getelementptr inbounds float* %tmp3686, i64 1
+  %tmp3688 = getelementptr inbounds float* %tmp3687, i64 1
+  %tmp3689 = getelementptr inbounds float* %tmp3688, i64 1
+  %tmp3690 = getelementptr inbounds float* %tmp3689, i64 1
+  %tmp3691 = getelementptr inbounds float* %tmp3690, i64 1
+  %tmp3692 = getelementptr inbounds float* %tmp3691, i64 1
+  %tmp3693 = getelementptr inbounds float* %tmp3692, i64 1
+  %tmp3694 = getelementptr inbounds float* %tmp3693, i64 1
+  %tmp3695 = getelementptr inbounds float* %tmp3694, i64 1
+  %tmp3696 = getelementptr inbounds float* %tmp3695, i64 1
+  %tmp3697 = getelementptr inbounds float* %tmp3696, i64 1
+  %tmp3698 = getelementptr inbounds float* %tmp3697, i64 1
+  %tmp3699 = getelementptr inbounds float* %tmp3698, i64 1
+  %tmp3700 = getelementptr inbounds float* %tmp3699, i64 1
+  %tmp3701 = getelementptr inbounds float* %tmp3700, i64 1
+  %tmp3702 = getelementptr inbounds float* %tmp3701, i64 1
+  %tmp3703 = getelementptr inbounds float* %tmp3702, i64 1
+  %tmp3704 = getelementptr inbounds float* %tmp3703, i64 1
+  %tmp3705 = getelementptr inbounds float* %tmp3704, i64 1
+  %tmp3706 = getelementptr inbounds float* %tmp3705, i64 1
+  %tmp3707 = getelementptr inbounds float* %tmp3706, i64 1
+  %tmp3708 = getelementptr inbounds float* %tmp3707, i64 1
+  %tmp3709 = getelementptr inbounds float* %tmp3708, i64 1
+  %tmp3710 = getelementptr inbounds float* %tmp3709, i64 1
+  %tmp3711 = getelementptr inbounds float* %tmp3710, i64 1
+  %tmp3712 = getelementptr inbounds float* %tmp3711, i64 1
+  %tmp3713 = getelementptr inbounds float* %tmp3712, i64 1
+  %tmp3714 = getelementptr inbounds float* %tmp3713, i64 1
+  %tmp3715 = getelementptr inbounds float* %tmp3714, i64 1
+  %tmp3716 = getelementptr inbounds float* %tmp3715, i64 1
+  %tmp3717 = getelementptr inbounds float* %tmp3716, i64 1
+  %tmp3718 = getelementptr inbounds float* %tmp3717, i64 1
+  %tmp3719 = getelementptr inbounds float* %tmp3718, i64 1
+  %tmp3720 = getelementptr inbounds float* %tmp3719, i64 1
+  %tmp3721 = getelementptr inbounds float* %tmp3720, i64 1
+  %tmp3722 = getelementptr inbounds float* %tmp3721, i64 1
+  %tmp3723 = getelementptr inbounds float* %tmp3722, i64 1
+  %tmp3724 = getelementptr inbounds float* %tmp3723, i64 1
+  %tmp3725 = getelementptr inbounds float* %tmp3724, i64 1
+  %tmp3726 = getelementptr inbounds float* %tmp3725, i64 1
+  %tmp3727 = getelementptr inbounds float* %tmp3726, i64 1
+  %tmp3728 = getelementptr inbounds float* %tmp3727, i64 1
+  %tmp3729 = getelementptr inbounds float* %tmp3728, i64 1
+  %tmp3730 = getelementptr inbounds float* %tmp3729, i64 1
+  %tmp3731 = getelementptr inbounds float* %tmp3730, i64 1
+  %tmp3732 = getelementptr inbounds float* %tmp3731, i64 1
+  %tmp3733 = getelementptr inbounds float* %tmp3732, i64 1
+  %tmp3734 = getelementptr inbounds float* %tmp3733, i64 1
+  %tmp3735 = getelementptr inbounds float* %tmp3734, i64 1
+  %tmp3736 = getelementptr inbounds float* %tmp3735, i64 1
+  %tmp3737 = getelementptr inbounds float* %tmp3736, i64 1
+  %tmp3738 = getelementptr inbounds float* %tmp3737, i64 1
+  %tmp3739 = getelementptr inbounds float* %tmp3738, i64 1
+  %tmp3740 = getelementptr inbounds float* %tmp3739, i64 1
+  %tmp3741 = getelementptr inbounds float* %tmp3740, i64 1
+  %tmp3742 = getelementptr inbounds float* %tmp3741, i64 1
+  %tmp3743 = getelementptr inbounds float* %tmp3742, i64 1
+  %tmp3744 = getelementptr inbounds float* %tmp3743, i64 1
+  %tmp3745 = getelementptr inbounds float* %tmp3744, i64 1
+  %tmp3746 = getelementptr inbounds float* %tmp3745, i64 1
+  %tmp3747 = getelementptr inbounds float* %tmp3746, i64 1
+  %tmp3748 = getelementptr inbounds float* %tmp3747, i64 1
+  %tmp3749 = getelementptr inbounds float* %tmp3748, i64 1
+  %tmp3750 = getelementptr inbounds float* %tmp3749, i64 1
+  %tmp3751 = getelementptr inbounds float* %tmp3750, i64 1
+  %tmp3752 = getelementptr inbounds float* %tmp3751, i64 1
+  %tmp3753 = getelementptr inbounds float* %tmp3752, i64 1
+  %tmp3754 = getelementptr inbounds float* %tmp3753, i64 1
+  %tmp3755 = getelementptr inbounds float* %tmp3754, i64 1
+  %tmp3756 = getelementptr inbounds float* %tmp3755, i64 1
+  %tmp3757 = getelementptr inbounds float* %tmp3756, i64 1
+  %tmp3758 = getelementptr inbounds float* %tmp3757, i64 1
+  %tmp3759 = getelementptr inbounds float* %tmp3758, i64 1
+  %tmp3760 = getelementptr inbounds float* %tmp3759, i64 1
+  %tmp3761 = getelementptr inbounds float* %tmp3760, i64 1
+  %tmp3762 = getelementptr inbounds float* %tmp3761, i64 1
+  %tmp3763 = getelementptr inbounds float* %tmp3762, i64 1
+  %tmp3764 = getelementptr inbounds float* %tmp3763, i64 1
+  %tmp3765 = getelementptr inbounds float* %tmp3764, i64 1
+  %tmp3766 = getelementptr inbounds float* %tmp3765, i64 1
+  %tmp3767 = getelementptr inbounds float* %tmp3766, i64 1
+  %tmp3768 = getelementptr inbounds float* %tmp3767, i64 1
+  %tmp3769 = getelementptr inbounds float* %tmp3768, i64 1
+  %tmp3770 = getelementptr inbounds float* %tmp3769, i64 1
+  %tmp3771 = getelementptr inbounds float* %tmp3770, i64 1
+  %tmp3772 = getelementptr inbounds float* %tmp3771, i64 1
+  %tmp3773 = getelementptr inbounds float* %tmp3772, i64 1
+  %tmp3774 = getelementptr inbounds float* %tmp3773, i64 1
+  %tmp3775 = getelementptr inbounds float* %tmp3774, i64 1
+  %tmp3776 = getelementptr inbounds float* %tmp3775, i64 1
+  %tmp3777 = getelementptr inbounds float* %tmp3776, i64 1
+  %tmp3778 = getelementptr inbounds float* %tmp3777, i64 1
+  %tmp3779 = getelementptr inbounds float* %tmp3778, i64 1
+  %tmp3780 = getelementptr inbounds float* %tmp3779, i64 1
+  %tmp3781 = getelementptr inbounds float* %tmp3780, i64 1
+  %tmp3782 = getelementptr inbounds float* %tmp3781, i64 1
+  %tmp3783 = getelementptr inbounds float* %tmp3782, i64 1
+  %tmp3784 = getelementptr inbounds float* %tmp3783, i64 1
+  %tmp3785 = getelementptr inbounds float* %tmp3784, i64 1
+  %tmp3786 = getelementptr inbounds float* %tmp3785, i64 1
+  %tmp3787 = getelementptr inbounds float* %tmp3786, i64 1
+  %tmp3788 = getelementptr inbounds float* %tmp3787, i64 1
+  %tmp3789 = getelementptr inbounds float* %tmp3788, i64 1
+  %tmp3790 = getelementptr inbounds float* %tmp3789, i64 1
+  %tmp3791 = getelementptr inbounds float* %tmp3790, i64 1
+  %tmp3792 = getelementptr inbounds float* %tmp3791, i64 1
+  %tmp3793 = getelementptr inbounds float* %tmp3792, i64 1
+  %tmp3794 = getelementptr inbounds float* %tmp3793, i64 1
+  %tmp3795 = getelementptr inbounds float* %tmp3794, i64 1
+  %tmp3796 = getelementptr inbounds float* %tmp3795, i64 1
+  %tmp3797 = getelementptr inbounds float* %tmp3796, i64 1
+  %tmp3798 = getelementptr inbounds float* %tmp3797, i64 1
+  %tmp3799 = getelementptr inbounds float* %tmp3798, i64 1
+  %tmp3800 = getelementptr inbounds float* %tmp3799, i64 1
+  %tmp3801 = getelementptr inbounds float* %tmp3800, i64 1
+  %tmp3802 = getelementptr inbounds float* %tmp3801, i64 1
+  %tmp3803 = getelementptr inbounds float* %tmp3802, i64 1
+  %tmp3804 = getelementptr inbounds float* %tmp3803, i64 1
+  %tmp3805 = getelementptr inbounds float* %tmp3804, i64 1
+  %tmp3806 = getelementptr inbounds float* %tmp3805, i64 1
+  %tmp3807 = getelementptr inbounds float* %tmp3806, i64 1
+  %tmp3808 = getelementptr inbounds float* %tmp3807, i64 1
+  %tmp3809 = getelementptr inbounds float* %tmp3808, i64 1
+  %tmp3810 = getelementptr inbounds float* %tmp3809, i64 1
+  %tmp3811 = getelementptr inbounds float* %tmp3810, i64 1
+  %tmp3812 = getelementptr inbounds float* %tmp3811, i64 1
+  %tmp3813 = getelementptr inbounds float* %tmp3812, i64 1
+  %tmp3814 = getelementptr inbounds float* %tmp3813, i64 1
+  %tmp3815 = getelementptr inbounds float* %tmp3814, i64 1
+  %tmp3816 = getelementptr inbounds float* %tmp3815, i64 1
+  %tmp3817 = getelementptr inbounds float* %tmp3816, i64 1
+  %tmp3818 = getelementptr inbounds float* %tmp3817, i64 1
+  %tmp3819 = getelementptr inbounds float* %tmp3818, i64 1
+  %tmp3820 = getelementptr inbounds float* %tmp3819, i64 1
+  %tmp3821 = getelementptr inbounds float* %tmp3820, i64 1
+  %tmp3822 = getelementptr inbounds float* %tmp3821, i64 1
+  %tmp3823 = getelementptr inbounds float* %tmp3822, i64 1
+  %tmp3824 = getelementptr inbounds float* %tmp3823, i64 1
+  %tmp3825 = getelementptr inbounds float* %tmp3824, i64 1
+  %tmp3826 = getelementptr inbounds float* %tmp3825, i64 1
+  %tmp3827 = getelementptr inbounds float* %tmp3826, i64 1
+  %tmp3828 = getelementptr inbounds float* %tmp3827, i64 1
+  %tmp3829 = getelementptr inbounds float* %tmp3828, i64 1
+  %tmp3830 = getelementptr inbounds float* %tmp3829, i64 1
+  %tmp3831 = getelementptr inbounds float* %tmp3830, i64 1
+  %tmp3832 = getelementptr inbounds float* %tmp3831, i64 1
+  %tmp3833 = getelementptr inbounds float* %tmp3832, i64 1
+  %tmp3834 = getelementptr inbounds float* %tmp3833, i64 1
+  %tmp3835 = getelementptr inbounds float* %tmp3834, i64 1
+  %tmp3836 = getelementptr inbounds float* %tmp3835, i64 1
+  %tmp3837 = getelementptr inbounds float* %tmp3836, i64 1
+  %tmp3838 = getelementptr inbounds float* %tmp3837, i64 1
+  %tmp3839 = getelementptr inbounds float* %tmp3838, i64 1
+  %tmp3840 = getelementptr inbounds float* %tmp3839, i64 1
+  %tmp3841 = getelementptr inbounds float* %tmp3840, i64 1
+  %tmp3842 = getelementptr inbounds float* %tmp3841, i64 1
+  %tmp3843 = getelementptr inbounds float* %tmp3842, i64 1
+  %tmp3844 = getelementptr inbounds float* %tmp3843, i64 1
+  %tmp3845 = getelementptr inbounds float* %tmp3844, i64 1
+  %tmp3846 = getelementptr inbounds float* %tmp3845, i64 1
+  %tmp3847 = getelementptr inbounds float* %tmp3846, i64 1
+  %tmp3848 = getelementptr inbounds float* %tmp3847, i64 1
+  %tmp3849 = getelementptr inbounds float* %tmp3848, i64 1
+  %tmp3850 = getelementptr inbounds float* %tmp3849, i64 1
+  %tmp3851 = getelementptr inbounds float* %tmp3850, i64 1
+  %tmp3852 = getelementptr inbounds float* %tmp3851, i64 1
+  %tmp3853 = getelementptr inbounds float* %tmp3852, i64 1
+  %tmp3854 = getelementptr inbounds float* %tmp3853, i64 1
+  %tmp3855 = getelementptr inbounds float* %tmp3854, i64 1
+  %tmp3856 = getelementptr inbounds float* %tmp3855, i64 1
+  %tmp3857 = getelementptr inbounds float* %tmp3856, i64 1
+  %tmp3858 = getelementptr inbounds float* %tmp3857, i64 1
+  %tmp3859 = getelementptr inbounds float* %tmp3858, i64 1
+  %tmp3860 = getelementptr inbounds float* %tmp3859, i64 1
+  %tmp3861 = getelementptr inbounds float* %tmp3860, i64 1
+  %tmp3862 = getelementptr inbounds float* %tmp3861, i64 1
+  %tmp3863 = getelementptr inbounds float* %tmp3862, i64 1
+  %tmp3864 = getelementptr inbounds float* %tmp3863, i64 1
+  %tmp3865 = getelementptr inbounds float* %tmp3864, i64 1
+  %tmp3866 = getelementptr inbounds float* %tmp3865, i64 1
+  %tmp3867 = getelementptr inbounds float* %tmp3866, i64 1
+  %tmp3868 = getelementptr inbounds float* %tmp3867, i64 1
+  %tmp3869 = getelementptr inbounds float* %tmp3868, i64 1
+  %tmp3870 = getelementptr inbounds float* %tmp3869, i64 1
+  %tmp3871 = getelementptr inbounds float* %tmp3870, i64 1
+  %tmp3872 = getelementptr inbounds float* %tmp3871, i64 1
+  %tmp3873 = getelementptr inbounds float* %tmp3872, i64 1
+  %tmp3874 = getelementptr inbounds float* %tmp3873, i64 1
+  %tmp3875 = getelementptr inbounds float* %tmp3874, i64 1
+  %tmp3876 = getelementptr inbounds float* %tmp3875, i64 1
+  %tmp3877 = getelementptr inbounds float* %tmp3876, i64 1
+  %tmp3878 = getelementptr inbounds float* %tmp3877, i64 1
+  %tmp3879 = getelementptr inbounds float* %tmp3878, i64 1
+  %tmp3880 = getelementptr inbounds float* %tmp3879, i64 1
+  %tmp3881 = getelementptr inbounds float* %tmp3880, i64 1
+  %tmp3882 = getelementptr inbounds float* %tmp3881, i64 1
+  %tmp3883 = getelementptr inbounds float* %tmp3882, i64 1
+  %tmp3884 = getelementptr inbounds float* %tmp3883, i64 1
+  %tmp3885 = getelementptr inbounds float* %tmp3884, i64 1
+  %tmp3886 = getelementptr inbounds float* %tmp3885, i64 1
+  %tmp3887 = getelementptr inbounds float* %tmp3886, i64 1
+  %tmp3888 = getelementptr inbounds float* %tmp3887, i64 1
+  %tmp3889 = getelementptr inbounds float* %tmp3888, i64 1
+  %tmp3890 = getelementptr inbounds float* %tmp3889, i64 1
+  %tmp3891 = getelementptr inbounds float* %tmp3890, i64 1
+  %tmp3892 = getelementptr inbounds float* %tmp3891, i64 1
+  %tmp3893 = getelementptr inbounds float* %tmp3892, i64 1
+  %tmp3894 = getelementptr inbounds float* %tmp3893, i64 1
+  %tmp3895 = getelementptr inbounds float* %tmp3894, i64 1
+  %tmp3896 = getelementptr inbounds float* %tmp3895, i64 1
+  %tmp3897 = getelementptr inbounds float* %tmp3896, i64 1
+  %tmp3898 = getelementptr inbounds float* %tmp3897, i64 1
+  %tmp3899 = getelementptr inbounds float* %tmp3898, i64 1
+  %tmp3900 = getelementptr inbounds float* %tmp3899, i64 1
+  %tmp3901 = getelementptr inbounds float* %tmp3900, i64 1
+  %tmp3902 = getelementptr inbounds float* %tmp3901, i64 1
+  %tmp3903 = getelementptr inbounds float* %tmp3902, i64 1
+  %tmp3904 = getelementptr inbounds float* %tmp3903, i64 1
+  %tmp3905 = getelementptr inbounds float* %tmp3904, i64 1
+  %tmp3906 = getelementptr inbounds float* %tmp3905, i64 1
+  %tmp3907 = getelementptr inbounds float* %tmp3906, i64 1
+  %tmp3908 = getelementptr inbounds float* %tmp3907, i64 1
+  %tmp3909 = getelementptr inbounds float* %tmp3908, i64 1
+  %tmp3910 = getelementptr inbounds float* %tmp3909, i64 1
+  %tmp3911 = getelementptr inbounds float* %tmp3910, i64 1
+  %tmp3912 = getelementptr inbounds float* %tmp3911, i64 1
+  %tmp3913 = getelementptr inbounds float* %tmp3912, i64 1
+  %tmp3914 = getelementptr inbounds float* %tmp3913, i64 1
+  %tmp3915 = getelementptr inbounds float* %tmp3914, i64 1
+  %tmp3916 = getelementptr inbounds float* %tmp3915, i64 1
+  %tmp3917 = getelementptr inbounds float* %tmp3916, i64 1
+  %tmp3918 = getelementptr inbounds float* %tmp3917, i64 1
+  %tmp3919 = getelementptr inbounds float* %tmp3918, i64 1
+  %tmp3920 = getelementptr inbounds float* %tmp3919, i64 1
+  %tmp3921 = getelementptr inbounds float* %tmp3920, i64 1
+  %tmp3922 = getelementptr inbounds float* %tmp3921, i64 1
+  %tmp3923 = getelementptr inbounds float* %tmp3922, i64 1
+  %tmp3924 = getelementptr inbounds float* %tmp3923, i64 1
+  %tmp3925 = getelementptr inbounds float* %tmp3924, i64 1
+  %tmp3926 = getelementptr inbounds float* %tmp3925, i64 1
+  %tmp3927 = getelementptr inbounds float* %tmp3926, i64 1
+  %tmp3928 = getelementptr inbounds float* %tmp3927, i64 1
+  %tmp3929 = getelementptr inbounds float* %tmp3928, i64 1
+  %tmp3930 = getelementptr inbounds float* %tmp3929, i64 1
+  %tmp3931 = getelementptr inbounds float* %tmp3930, i64 1
+  %tmp3932 = getelementptr inbounds float* %tmp3931, i64 1
+  %tmp3933 = getelementptr inbounds float* %tmp3932, i64 1
+  %tmp3934 = getelementptr inbounds float* %tmp3933, i64 1
+  %tmp3935 = getelementptr inbounds float* %tmp3934, i64 1
+  %tmp3936 = getelementptr inbounds float* %tmp3935, i64 1
+  %tmp3937 = getelementptr inbounds float* %tmp3936, i64 1
+  %tmp3938 = getelementptr inbounds float* %tmp3937, i64 1
+  %tmp3939 = getelementptr inbounds float* %tmp3938, i64 1
+  %tmp3940 = getelementptr inbounds float* %tmp3939, i64 1
+  %tmp3941 = getelementptr inbounds float* %tmp3940, i64 1
+  %tmp3942 = getelementptr inbounds float* %tmp3941, i64 1
+  %tmp3943 = getelementptr inbounds float* %tmp3942, i64 1
+  %tmp3944 = getelementptr inbounds float* %tmp3943, i64 1
+  %tmp3945 = getelementptr inbounds float* %tmp3944, i64 1
+  %tmp3946 = getelementptr inbounds float* %tmp3945, i64 1
+  %tmp3947 = getelementptr inbounds float* %tmp3946, i64 1
+  %tmp3948 = getelementptr inbounds float* %tmp3947, i64 1
+  %tmp3949 = getelementptr inbounds float* %tmp3948, i64 1
+  %tmp3950 = getelementptr inbounds float* %tmp3949, i64 1
+  %tmp3951 = getelementptr inbounds float* %tmp3950, i64 1
+  %tmp3952 = getelementptr inbounds float* %tmp3951, i64 1
+  %tmp3953 = getelementptr inbounds float* %tmp3952, i64 1
+  %tmp3954 = getelementptr inbounds float* %tmp3953, i64 1
+  %tmp3955 = getelementptr inbounds float* %tmp3954, i64 1
+  %tmp3956 = getelementptr inbounds float* %tmp3955, i64 1
+  %tmp3957 = getelementptr inbounds float* %tmp3956, i64 1
+  %tmp3958 = getelementptr inbounds float* %tmp3957, i64 1
+  %tmp3959 = getelementptr inbounds float* %tmp3958, i64 1
+  %tmp3960 = getelementptr inbounds float* %tmp3959, i64 1
+  %tmp3961 = getelementptr inbounds float* %tmp3960, i64 1
+  %tmp3962 = getelementptr inbounds float* %tmp3961, i64 1
+  %tmp3963 = getelementptr inbounds float* %tmp3962, i64 1
+  %tmp3964 = getelementptr inbounds float* %tmp3963, i64 1
+  %tmp3965 = getelementptr inbounds float* %tmp3964, i64 1
+  %tmp3966 = getelementptr inbounds float* %tmp3965, i64 1
+  %tmp3967 = getelementptr inbounds float* %tmp3966, i64 1
+  %tmp3968 = getelementptr inbounds float* %tmp3967, i64 1
+  %tmp3969 = getelementptr inbounds float* %tmp3968, i64 1
+  %tmp3970 = getelementptr inbounds float* %tmp3969, i64 1
+  %tmp3971 = getelementptr inbounds float* %tmp3970, i64 1
+  %tmp3972 = getelementptr inbounds float* %tmp3971, i64 1
+  %tmp3973 = getelementptr inbounds float* %tmp3972, i64 1
+  %tmp3974 = getelementptr inbounds float* %tmp3973, i64 1
+  %tmp3975 = getelementptr inbounds float* %tmp3974, i64 1
+  %tmp3976 = getelementptr inbounds float* %tmp3975, i64 1
+  %tmp3977 = getelementptr inbounds float* %tmp3976, i64 1
+  %tmp3978 = getelementptr inbounds float* %tmp3977, i64 1
+  %tmp3979 = getelementptr inbounds float* %tmp3978, i64 1
+  %tmp3980 = getelementptr inbounds float* %tmp3979, i64 1
+  %tmp3981 = getelementptr inbounds float* %tmp3980, i64 1
+  %tmp3982 = getelementptr inbounds float* %tmp3981, i64 1
+  %tmp3983 = getelementptr inbounds float* %tmp3982, i64 1
+  %tmp3984 = getelementptr inbounds float* %tmp3983, i64 1
+  %tmp3985 = getelementptr inbounds float* %tmp3984, i64 1
+  %tmp3986 = getelementptr inbounds float* %tmp3985, i64 1
+  %tmp3987 = getelementptr inbounds float* %tmp3986, i64 1
+  %tmp3988 = getelementptr inbounds float* %tmp3987, i64 1
+  %tmp3989 = getelementptr inbounds float* %tmp3988, i64 1
+  %tmp3990 = getelementptr inbounds float* %tmp3989, i64 1
+  %tmp3991 = getelementptr inbounds float* %tmp3990, i64 1
+  %tmp3992 = getelementptr inbounds float* %tmp3991, i64 1
+  %tmp3993 = getelementptr inbounds float* %tmp3992, i64 1
+  %tmp3994 = getelementptr inbounds float* %tmp3993, i64 1
+  %tmp3995 = getelementptr inbounds float* %tmp3994, i64 1
+  %tmp3996 = getelementptr inbounds float* %tmp3995, i64 1
+  %tmp3997 = getelementptr inbounds float* %tmp3996, i64 1
+  %tmp3998 = getelementptr inbounds float* %tmp3997, i64 1
+  %tmp3999 = getelementptr inbounds float* %tmp3998, i64 1
+  %tmp4000 = getelementptr inbounds float* %tmp3999, i64 1
+  %tmp4001 = getelementptr inbounds float* %tmp4000, i64 1
+  %tmp4002 = getelementptr inbounds float* %tmp4001, i64 1
+  %tmp4003 = getelementptr inbounds float* %tmp4002, i64 1
+  %tmp4004 = getelementptr inbounds float* %tmp4003, i64 1
+  %tmp4005 = getelementptr inbounds float* %tmp4004, i64 1
+  %tmp4006 = getelementptr inbounds float* %tmp4005, i64 1
+  %tmp4007 = getelementptr inbounds float* %tmp4006, i64 1
+  %tmp4008 = getelementptr inbounds float* %tmp4007, i64 1
+  %tmp4009 = getelementptr inbounds float* %tmp4008, i64 1
+  %tmp4010 = getelementptr inbounds float* %tmp4009, i64 1
+  %tmp4011 = getelementptr inbounds float* %tmp4010, i64 1
+  %tmp4012 = getelementptr inbounds float* %tmp4011, i64 1
+  %tmp4013 = getelementptr inbounds float* %tmp4012, i64 1
+  %tmp4014 = getelementptr inbounds float* %tmp4013, i64 1
+  %tmp4015 = getelementptr inbounds float* %tmp4014, i64 1
+  %tmp4016 = getelementptr inbounds float* %tmp4015, i64 1
+  %tmp4017 = getelementptr inbounds float* %tmp4016, i64 1
+  %tmp4018 = getelementptr inbounds float* %tmp4017, i64 1
+  %tmp4019 = getelementptr inbounds float* %tmp4018, i64 1
+  %tmp4020 = getelementptr inbounds float* %tmp4019, i64 1
+  %tmp4021 = getelementptr inbounds float* %tmp4020, i64 1
+  %tmp4022 = getelementptr inbounds float* %tmp4021, i64 1
+  %tmp4023 = getelementptr inbounds float* %tmp4022, i64 1
+  %tmp4024 = getelementptr inbounds float* %tmp4023, i64 1
+  %tmp4025 = getelementptr inbounds float* %tmp4024, i64 1
+  %tmp4026 = getelementptr inbounds float* %tmp4025, i64 1
+  %tmp4027 = getelementptr inbounds float* %tmp4026, i64 1
+  %tmp4028 = getelementptr inbounds float* %tmp4027, i64 1
+  %tmp4029 = getelementptr inbounds float* %tmp4028, i64 1
+  %tmp4030 = getelementptr inbounds float* %tmp4029, i64 1
+  %tmp4031 = getelementptr inbounds float* %tmp4030, i64 1
+  %tmp4032 = getelementptr inbounds float* %tmp4031, i64 1
+  %tmp4033 = getelementptr inbounds float* %tmp4032, i64 1
+  %tmp4034 = getelementptr inbounds float* %tmp4033, i64 1
+  %tmp4035 = getelementptr inbounds float* %tmp4034, i64 1
+  %tmp4036 = getelementptr inbounds float* %tmp4035, i64 1
+  %tmp4037 = getelementptr inbounds float* %tmp4036, i64 1
+  %tmp4038 = getelementptr inbounds float* %tmp4037, i64 1
+  %tmp4039 = getelementptr inbounds float* %tmp4038, i64 1
+  %tmp4040 = getelementptr inbounds float* %tmp4039, i64 1
+  %tmp4041 = getelementptr inbounds float* %tmp4040, i64 1
+  %tmp4042 = getelementptr inbounds float* %tmp4041, i64 1
+  %tmp4043 = getelementptr inbounds float* %tmp4042, i64 1
+  %tmp4044 = getelementptr inbounds float* %tmp4043, i64 1
+  %tmp4045 = getelementptr inbounds float* %tmp4044, i64 1
+  %tmp4046 = getelementptr inbounds float* %tmp4045, i64 1
+  %tmp4047 = getelementptr inbounds float* %tmp4046, i64 1
+  %tmp4048 = getelementptr inbounds float* %tmp4047, i64 1
+  %tmp4049 = getelementptr inbounds float* %tmp4048, i64 1
+  %tmp4050 = getelementptr inbounds float* %tmp4049, i64 1
+  %tmp4051 = getelementptr inbounds float* %tmp4050, i64 1
+  %tmp4052 = getelementptr inbounds float* %tmp4051, i64 1
+  %tmp4053 = getelementptr inbounds float* %tmp4052, i64 1
+  %tmp4054 = getelementptr inbounds float* %tmp4053, i64 1
+  %tmp4055 = getelementptr inbounds float* %tmp4054, i64 1
+  %tmp4056 = getelementptr inbounds float* %tmp4055, i64 1
+  %tmp4057 = getelementptr inbounds float* %tmp4056, i64 1
+  %tmp4058 = getelementptr inbounds float* %tmp4057, i64 1
+  %tmp4059 = getelementptr inbounds float* %tmp4058, i64 1
+  %tmp4060 = getelementptr inbounds float* %tmp4059, i64 1
+  %tmp4061 = getelementptr inbounds float* %tmp4060, i64 1
+  %tmp4062 = getelementptr inbounds float* %tmp4061, i64 1
+  %tmp4063 = getelementptr inbounds float* %tmp4062, i64 1
+  %tmp4064 = getelementptr inbounds float* %tmp4063, i64 1
+  %tmp4065 = getelementptr inbounds float* %tmp4064, i64 1
+  %tmp4066 = getelementptr inbounds float* %tmp4065, i64 1
+  %tmp4067 = getelementptr inbounds float* %tmp4066, i64 1
+  %tmp4068 = getelementptr inbounds float* %tmp4067, i64 1
+  %tmp4069 = getelementptr inbounds float* %tmp4068, i64 1
+  %tmp4070 = getelementptr inbounds float* %tmp4069, i64 1
+  %tmp4071 = getelementptr inbounds float* %tmp4070, i64 1
+  %tmp4072 = getelementptr inbounds float* %tmp4071, i64 1
+  %tmp4073 = getelementptr inbounds float* %tmp4072, i64 1
+  %tmp4074 = getelementptr inbounds float* %tmp4073, i64 1
+  %tmp4075 = getelementptr inbounds float* %tmp4074, i64 1
+  %tmp4076 = getelementptr inbounds float* %tmp4075, i64 1
+  %tmp4077 = getelementptr inbounds float* %tmp4076, i64 1
+  %tmp4078 = getelementptr inbounds float* %tmp4077, i64 1
+  %tmp4079 = getelementptr inbounds float* %tmp4078, i64 1
+  %tmp4080 = getelementptr inbounds float* %tmp4079, i64 1
+  %tmp4081 = getelementptr inbounds float* %tmp4080, i64 1
+  %tmp4082 = getelementptr inbounds float* %tmp4081, i64 1
+  %tmp4083 = getelementptr inbounds float* %tmp4082, i64 1
+  %tmp4084 = getelementptr inbounds float* %tmp4083, i64 1
+  %tmp4085 = getelementptr inbounds float* %tmp4084, i64 1
+  %tmp4086 = getelementptr inbounds float* %tmp4085, i64 1
+  %tmp4087 = getelementptr inbounds float* %tmp4086, i64 1
+  %tmp4088 = getelementptr inbounds float* %tmp4087, i64 1
+  %tmp4089 = getelementptr inbounds float* %tmp4088, i64 1
+  %tmp4090 = getelementptr inbounds float* %tmp4089, i64 1
+  %tmp4091 = getelementptr inbounds float* %tmp4090, i64 1
+  %tmp4092 = getelementptr inbounds float* %tmp4091, i64 1
+  %tmp4093 = getelementptr inbounds float* %tmp4092, i64 1
+  %tmp4094 = getelementptr inbounds float* %tmp4093, i64 1
+  %tmp4095 = getelementptr inbounds float* %tmp4094, i64 1
+  %tmp4096 = getelementptr inbounds float* %tmp4095, i64 1
+  %tmp4097 = getelementptr inbounds float* %tmp4096, i64 1
+  %tmp4098 = getelementptr inbounds float* %tmp4097, i64 1
+  %tmp4099 = getelementptr inbounds float* %tmp4098, i64 1
+  %tmp4100 = getelementptr inbounds float* %tmp4099, i64 1
+  %tmp4101 = getelementptr inbounds float* %tmp4100, i64 1
+  %tmp4102 = getelementptr inbounds float* %tmp4101, i64 1
+  %tmp4103 = getelementptr inbounds float* %tmp4102, i64 1
+  %tmp4104 = getelementptr inbounds float* %tmp4103, i64 1
+  %tmp4105 = getelementptr inbounds float* %tmp4104, i64 1
+  %tmp4106 = getelementptr inbounds float* %tmp4105, i64 1
+  %tmp4107 = getelementptr inbounds float* %tmp4106, i64 1
+  %tmp4108 = getelementptr inbounds float* %tmp4107, i64 1
+  %tmp4109 = getelementptr inbounds float* %tmp4108, i64 1
+  %tmp4110 = getelementptr inbounds float* %tmp4109, i64 1
+  %tmp4111 = getelementptr inbounds float* %tmp4110, i64 1
+  %tmp4112 = getelementptr inbounds float* %tmp4111, i64 1
+  %tmp4113 = getelementptr inbounds float* %tmp4112, i64 1
+  %tmp4114 = getelementptr inbounds float* %tmp4113, i64 1
+  %tmp4115 = getelementptr inbounds float* %tmp4114, i64 1
+  %tmp4116 = getelementptr inbounds float* %tmp4115, i64 1
+  %tmp4117 = getelementptr inbounds float* %tmp4116, i64 1
+  %tmp4118 = getelementptr inbounds float* %tmp4117, i64 1
+  %tmp4119 = getelementptr inbounds float* %tmp4118, i64 1
+  %tmp4120 = getelementptr inbounds float* %tmp4119, i64 1
+  %tmp4121 = getelementptr inbounds float* %tmp4120, i64 1
+  %tmp4122 = getelementptr inbounds float* %tmp4121, i64 1
+  %tmp4123 = getelementptr inbounds float* %tmp4122, i64 1
+  %tmp4124 = getelementptr inbounds float* %tmp4123, i64 1
+  %tmp4125 = getelementptr inbounds float* %tmp4124, i64 1
+  %tmp4126 = getelementptr inbounds float* %tmp4125, i64 1
+  %tmp4127 = getelementptr inbounds float* %tmp4126, i64 1
+  %tmp4128 = getelementptr inbounds float* %tmp4127, i64 1
+  %tmp4129 = getelementptr inbounds float* %tmp4128, i64 1
+  %tmp4130 = getelementptr inbounds float* %tmp4129, i64 1
+  %tmp4131 = getelementptr inbounds float* %tmp4130, i64 1
+  %tmp4132 = getelementptr inbounds float* %tmp4131, i64 1
+  %tmp4133 = getelementptr inbounds float* %tmp4132, i64 1
+  %tmp4134 = getelementptr inbounds float* %tmp4133, i64 1
+  %tmp4135 = getelementptr inbounds float* %tmp4134, i64 1
+  %tmp4136 = getelementptr inbounds float* %tmp4135, i64 1
+  %tmp4137 = getelementptr inbounds float* %tmp4136, i64 1
+  %tmp4138 = getelementptr inbounds float* %tmp4137, i64 1
+  %tmp4139 = getelementptr inbounds float* %tmp4138, i64 1
+  %tmp4140 = getelementptr inbounds float* %tmp4139, i64 1
+  %tmp4141 = getelementptr inbounds float* %tmp4140, i64 1
+  %tmp4142 = getelementptr inbounds float* %tmp4141, i64 1
+  %tmp4143 = getelementptr inbounds float* %tmp4142, i64 1
+  %tmp4144 = getelementptr inbounds float* %tmp4143, i64 1
+  %tmp4145 = getelementptr inbounds float* %tmp4144, i64 1
+  %tmp4146 = getelementptr inbounds float* %tmp4145, i64 1
+  %tmp4147 = getelementptr inbounds float* %tmp4146, i64 1
+  %tmp4148 = getelementptr inbounds float* %tmp4147, i64 1
+  %tmp4149 = getelementptr inbounds float* %tmp4148, i64 1
+  %tmp4150 = getelementptr inbounds float* %tmp4149, i64 1
+  %tmp4151 = getelementptr inbounds float* %tmp4150, i64 1
+  %tmp4152 = getelementptr inbounds float* %tmp4151, i64 1
+  %tmp4153 = getelementptr inbounds float* %tmp4152, i64 1
+  %tmp4154 = getelementptr inbounds float* %tmp4153, i64 1
+  %tmp4155 = getelementptr inbounds float* %tmp4154, i64 1
+  %tmp4156 = getelementptr inbounds float* %tmp4155, i64 1
+  %tmp4157 = getelementptr inbounds float* %tmp4156, i64 1
+  %tmp4158 = getelementptr inbounds float* %tmp4157, i64 1
+  %tmp4159 = getelementptr inbounds float* %tmp4158, i64 1
+  %tmp4160 = getelementptr inbounds float* %tmp4159, i64 1
+  %tmp4161 = getelementptr inbounds float* %tmp4160, i64 1
+  %tmp4162 = getelementptr inbounds float* %tmp4161, i64 1
+  %tmp4163 = getelementptr inbounds float* %tmp4162, i64 1
+  %tmp4164 = getelementptr inbounds float* %tmp4163, i64 1
+  %tmp4165 = getelementptr inbounds float* %tmp4164, i64 1
+  %tmp4166 = getelementptr inbounds float* %tmp4165, i64 1
+  %tmp4167 = getelementptr inbounds float* %tmp4166, i64 1
+  %tmp4168 = getelementptr inbounds float* %tmp4167, i64 1
+  %tmp4169 = getelementptr inbounds float* %tmp4168, i64 1
+  %tmp4170 = getelementptr inbounds float* %tmp4169, i64 1
+  %tmp4171 = getelementptr inbounds float* %tmp4170, i64 1
+  %tmp4172 = getelementptr inbounds float* %tmp4171, i64 1
+  %tmp4173 = getelementptr inbounds float* %tmp4172, i64 1
+  %tmp4174 = getelementptr inbounds float* %tmp4173, i64 1
+  %tmp4175 = getelementptr inbounds float* %tmp4174, i64 1
+  %tmp4176 = getelementptr inbounds float* %tmp4175, i64 1
+  %tmp4177 = getelementptr inbounds float* %tmp4176, i64 1
+  %tmp4178 = getelementptr inbounds float* %tmp4177, i64 1
+  %tmp4179 = getelementptr inbounds float* %tmp4178, i64 1
+  %tmp4180 = getelementptr inbounds float* %tmp4179, i64 1
+  %tmp4181 = getelementptr inbounds float* %tmp4180, i64 1
+  %tmp4182 = getelementptr inbounds float* %tmp4181, i64 1
+  %tmp4183 = getelementptr inbounds float* %tmp4182, i64 1
+  %tmp4184 = getelementptr inbounds float* %tmp4183, i64 1
+  %tmp4185 = getelementptr inbounds float* %tmp4184, i64 1
+  %tmp4186 = getelementptr inbounds float* %tmp4185, i64 1
+  %tmp4187 = getelementptr inbounds float* %tmp4186, i64 1
+  %tmp4188 = getelementptr inbounds float* %tmp4187, i64 1
+  %tmp4189 = getelementptr inbounds float* %tmp4188, i64 1
+  %tmp4190 = getelementptr inbounds float* %tmp4189, i64 1
+  %tmp4191 = getelementptr inbounds float* %tmp4190, i64 1
+  %tmp4192 = getelementptr inbounds float* %tmp4191, i64 1
+  %tmp4193 = getelementptr inbounds float* %tmp4192, i64 1
+  %tmp4194 = getelementptr inbounds float* %tmp4193, i64 1
+  %tmp4195 = getelementptr inbounds float* %tmp4194, i64 1
+  %tmp4196 = getelementptr inbounds float* %tmp4195, i64 1
+  %tmp4197 = getelementptr inbounds float* %tmp4196, i64 1
+  %tmp4198 = getelementptr inbounds float* %tmp4197, i64 1
+  %tmp4199 = getelementptr inbounds float* %tmp4198, i64 1
+  %tmp4200 = getelementptr inbounds float* %tmp4199, i64 1
+  %tmp4201 = getelementptr inbounds float* %tmp4200, i64 1
+  %tmp4202 = getelementptr inbounds float* %tmp4201, i64 1
+  %tmp4203 = getelementptr inbounds float* %tmp4202, i64 1
+  %tmp4204 = getelementptr inbounds float* %tmp4203, i64 1
+  %tmp4205 = getelementptr inbounds float* %tmp4204, i64 1
+  %tmp4206 = getelementptr inbounds float* %tmp4205, i64 1
+  %tmp4207 = getelementptr inbounds float* %tmp4206, i64 1
+  %tmp4208 = getelementptr inbounds float* %tmp4207, i64 1
+  %tmp4209 = getelementptr inbounds float* %tmp4208, i64 1
+  %tmp4210 = getelementptr inbounds float* %tmp4209, i64 1
+  %tmp4211 = getelementptr inbounds float* %tmp4210, i64 1
+  %tmp4212 = getelementptr inbounds float* %tmp4211, i64 1
+  %tmp4213 = getelementptr inbounds float* %tmp4212, i64 1
+  %tmp4214 = getelementptr inbounds float* %tmp4213, i64 1
+  %tmp4215 = getelementptr inbounds float* %tmp4214, i64 1
+  %tmp4216 = getelementptr inbounds float* %tmp4215, i64 1
+  %tmp4217 = getelementptr inbounds float* %tmp4216, i64 1
+  %tmp4218 = getelementptr inbounds float* %tmp4217, i64 1
+  %tmp4219 = getelementptr inbounds float* %tmp4218, i64 1
+  %tmp4220 = getelementptr inbounds float* %tmp4219, i64 1
+  %tmp4221 = getelementptr inbounds float* %tmp4220, i64 1
+  %tmp4222 = getelementptr inbounds float* %tmp4221, i64 1
+  %tmp4223 = getelementptr inbounds float* %tmp4222, i64 1
+  %tmp4224 = getelementptr inbounds float* %tmp4223, i64 1
+  %tmp4225 = getelementptr inbounds float* %tmp4224, i64 1
+  %tmp4226 = getelementptr inbounds float* %tmp4225, i64 1
+  %tmp4227 = getelementptr inbounds float* %tmp4226, i64 1
+  %tmp4228 = getelementptr inbounds float* %tmp4227, i64 1
+  %tmp4229 = getelementptr inbounds float* %tmp4228, i64 1
+  %tmp4230 = getelementptr inbounds float* %tmp4229, i64 1
+  %tmp4231 = getelementptr inbounds float* %tmp4230, i64 1
+  %tmp4232 = getelementptr inbounds float* %tmp4231, i64 1
+  %tmp4233 = getelementptr inbounds float* %tmp4232, i64 1
+  %tmp4234 = getelementptr inbounds float* %tmp4233, i64 1
+  %tmp4235 = getelementptr inbounds float* %tmp4234, i64 1
+  %tmp4236 = getelementptr inbounds float* %tmp4235, i64 1
+  %tmp4237 = getelementptr inbounds float* %tmp4236, i64 1
+  %tmp4238 = getelementptr inbounds float* %tmp4237, i64 1
+  %tmp4239 = getelementptr inbounds float* %tmp4238, i64 1
+  %tmp4240 = getelementptr inbounds float* %tmp4239, i64 1
+  %tmp4241 = getelementptr inbounds float* %tmp4240, i64 1
+  %tmp4242 = getelementptr inbounds float* %tmp4241, i64 1
+  %tmp4243 = getelementptr inbounds float* %tmp4242, i64 1
+  %tmp4244 = getelementptr inbounds float* %tmp4243, i64 1
+  %tmp4245 = getelementptr inbounds float* %tmp4244, i64 1
+  %tmp4246 = getelementptr inbounds float* %tmp4245, i64 1
+  %tmp4247 = getelementptr inbounds float* %tmp4246, i64 1
+  %tmp4248 = getelementptr inbounds float* %tmp4247, i64 1
+  %tmp4249 = getelementptr inbounds float* %tmp4248, i64 1
+  %tmp4250 = getelementptr inbounds float* %tmp4249, i64 1
+  %tmp4251 = getelementptr inbounds float* %tmp4250, i64 1
+  %tmp4252 = getelementptr inbounds float* %tmp4251, i64 1
+  %tmp4253 = getelementptr inbounds float* %tmp4252, i64 1
+  %tmp4254 = getelementptr inbounds float* %tmp4253, i64 1
+  %tmp4255 = getelementptr inbounds float* %tmp4254, i64 1
+  %tmp4256 = getelementptr inbounds float* %tmp4255, i64 1
+  %tmp4257 = getelementptr inbounds float* %tmp4256, i64 1
+  %tmp4258 = getelementptr inbounds float* %tmp4257, i64 1
+  %tmp4259 = getelementptr inbounds float* %tmp4258, i64 1
+  %tmp4260 = getelementptr inbounds float* %tmp4259, i64 1
+  %tmp4261 = getelementptr inbounds float* %tmp4260, i64 1
+  %tmp4262 = getelementptr inbounds float* %tmp4261, i64 1
+  %tmp4263 = getelementptr inbounds float* %tmp4262, i64 1
+  %tmp4264 = getelementptr inbounds float* %tmp4263, i64 1
+  %tmp4265 = getelementptr inbounds float* %tmp4264, i64 1
+  %tmp4266 = getelementptr inbounds float* %tmp4265, i64 1
+  %tmp4267 = getelementptr inbounds float* %tmp4266, i64 1
+  %tmp4268 = getelementptr inbounds float* %tmp4267, i64 1
+  %tmp4269 = getelementptr inbounds float* %tmp4268, i64 1
+  %tmp4270 = getelementptr inbounds float* %tmp4269, i64 1
+  %tmp4271 = getelementptr inbounds float* %tmp4270, i64 1
+  %tmp4272 = getelementptr inbounds float* %tmp4271, i64 1
+  %tmp4273 = getelementptr inbounds float* %tmp4272, i64 1
+  %tmp4274 = getelementptr inbounds float* %tmp4273, i64 1
+  %tmp4275 = getelementptr inbounds float* %tmp4274, i64 1
+  %tmp4276 = getelementptr inbounds float* %tmp4275, i64 1
+  %tmp4277 = getelementptr inbounds float* %tmp4276, i64 1
+  %tmp4278 = getelementptr inbounds float* %tmp4277, i64 1
+  %tmp4279 = getelementptr inbounds float* %tmp4278, i64 1
+  %tmp4280 = getelementptr inbounds float* %tmp4279, i64 1
+  %tmp4281 = getelementptr inbounds float* %tmp4280, i64 1
+  %tmp4282 = getelementptr inbounds float* %tmp4281, i64 1
+  %tmp4283 = getelementptr inbounds float* %tmp4282, i64 1
+  %tmp4284 = getelementptr inbounds float* %tmp4283, i64 1
+  %tmp4285 = getelementptr inbounds float* %tmp4284, i64 1
+  %tmp4286 = getelementptr inbounds float* %tmp4285, i64 1
+  %tmp4287 = getelementptr inbounds float* %tmp4286, i64 1
+  %tmp4288 = getelementptr inbounds float* %tmp4287, i64 1
+  %tmp4289 = getelementptr inbounds float* %tmp4288, i64 1
+  %tmp4290 = getelementptr inbounds float* %tmp4289, i64 1
+  %tmp4291 = getelementptr inbounds float* %tmp4290, i64 1
+  %tmp4292 = getelementptr inbounds float* %tmp4291, i64 1
+  %tmp4293 = getelementptr inbounds float* %tmp4292, i64 1
+  %tmp4294 = getelementptr inbounds float* %tmp4293, i64 1
+  %tmp4295 = getelementptr inbounds float* %tmp4294, i64 1
+  %tmp4296 = getelementptr inbounds float* %tmp4295, i64 1
+  %tmp4297 = getelementptr inbounds float* %tmp4296, i64 1
+  %tmp4298 = getelementptr inbounds float* %tmp4297, i64 1
+  %tmp4299 = getelementptr inbounds float* %tmp4298, i64 1
+  %tmp4300 = getelementptr inbounds float* %tmp4299, i64 1
+  %tmp4301 = getelementptr inbounds float* %tmp4300, i64 1
+  %tmp4302 = getelementptr inbounds float* %tmp4301, i64 1
+  %tmp4303 = getelementptr inbounds float* %tmp4302, i64 1
+  %tmp4304 = getelementptr inbounds float* %tmp4303, i64 1
+  %tmp4305 = getelementptr inbounds float* %tmp4304, i64 1
+  %tmp4306 = getelementptr inbounds float* %tmp4305, i64 1
+  %tmp4307 = getelementptr inbounds float* %tmp4306, i64 1
+  %tmp4308 = getelementptr inbounds float* %tmp4307, i64 1
+  %tmp4309 = getelementptr inbounds float* %tmp4308, i64 1
+  %tmp4310 = getelementptr inbounds float* %tmp4309, i64 1
+  %tmp4311 = getelementptr inbounds float* %tmp4310, i64 1
+  %tmp4312 = getelementptr inbounds float* %tmp4311, i64 1
+  %tmp4313 = getelementptr inbounds float* %tmp4312, i64 1
+  %tmp4314 = getelementptr inbounds float* %tmp4313, i64 1
+  %tmp4315 = getelementptr inbounds float* %tmp4314, i64 1
+  %tmp4316 = getelementptr inbounds float* %tmp4315, i64 1
+  %tmp4317 = getelementptr inbounds float* %tmp4316, i64 1
+  %tmp4318 = getelementptr inbounds float* %tmp4317, i64 1
+  %tmp4319 = getelementptr inbounds float* %tmp4318, i64 1
+  %tmp4320 = getelementptr inbounds float* %tmp4319, i64 1
+  %tmp4321 = getelementptr inbounds float* %tmp4320, i64 1
+  %tmp4322 = getelementptr inbounds float* %tmp4321, i64 1
+  %tmp4323 = getelementptr inbounds float* %tmp4322, i64 1
+  %tmp4324 = getelementptr inbounds float* %tmp4323, i64 1
+  %tmp4325 = getelementptr inbounds float* %tmp4324, i64 1
+  %tmp4326 = getelementptr inbounds float* %tmp4325, i64 1
+  %tmp4327 = getelementptr inbounds float* %tmp4326, i64 1
+  %tmp4328 = getelementptr inbounds float* %tmp4327, i64 1
+  %tmp4329 = getelementptr inbounds float* %tmp4328, i64 1
+  %tmp4330 = getelementptr inbounds float* %tmp4329, i64 1
+  %tmp4331 = getelementptr inbounds float* %tmp4330, i64 1
+  %tmp4332 = getelementptr inbounds float* %tmp4331, i64 1
+  %tmp4333 = getelementptr inbounds float* %tmp4332, i64 1
+  %tmp4334 = getelementptr inbounds float* %tmp4333, i64 1
+  %tmp4335 = getelementptr inbounds float* %tmp4334, i64 1
+  %tmp4336 = getelementptr inbounds float* %tmp4335, i64 1
+  %tmp4337 = getelementptr inbounds float* %tmp4336, i64 1
+  %tmp4338 = getelementptr inbounds float* %tmp4337, i64 1
+  %tmp4339 = getelementptr inbounds float* %tmp4338, i64 1
+  %tmp4340 = getelementptr inbounds float* %tmp4339, i64 1
+  %tmp4341 = getelementptr inbounds float* %tmp4340, i64 1
+  %tmp4342 = getelementptr inbounds float* %tmp4341, i64 1
+  %tmp4343 = getelementptr inbounds float* %tmp4342, i64 1
+  %tmp4344 = getelementptr inbounds float* %tmp4343, i64 1
+  %tmp4345 = getelementptr inbounds float* %tmp4344, i64 1
+  %tmp4346 = getelementptr inbounds float* %tmp4345, i64 1
+  %tmp4347 = getelementptr inbounds float* %tmp4346, i64 1
+  %tmp4348 = getelementptr inbounds float* %tmp4347, i64 1
+  %tmp4349 = getelementptr inbounds float* %tmp4348, i64 1
+  %tmp4350 = getelementptr inbounds float* %tmp4349, i64 1
+  %tmp4351 = getelementptr inbounds float* %tmp4350, i64 1
+  %tmp4352 = getelementptr inbounds float* %tmp4351, i64 1
+  %tmp4353 = getelementptr inbounds float* %tmp4352, i64 1
+  %tmp4354 = getelementptr inbounds float* %tmp4353, i64 1
+  %tmp4355 = getelementptr inbounds float* %tmp4354, i64 1
+  %tmp4356 = getelementptr inbounds float* %tmp4355, i64 1
+  %tmp4357 = getelementptr inbounds float* %tmp4356, i64 1
+  %tmp4358 = getelementptr inbounds float* %tmp4357, i64 1
+  %tmp4359 = getelementptr inbounds float* %tmp4358, i64 1
+  %tmp4360 = getelementptr inbounds float* %tmp4359, i64 1
+  %tmp4361 = getelementptr inbounds float* %tmp4360, i64 1
+  %tmp4362 = getelementptr inbounds float* %tmp4361, i64 1
+  %tmp4363 = getelementptr inbounds float* %tmp4362, i64 1
+  %tmp4364 = getelementptr inbounds float* %tmp4363, i64 1
+  %tmp4365 = getelementptr inbounds float* %tmp4364, i64 1
+  %tmp4366 = getelementptr inbounds float* %tmp4365, i64 1
+  %tmp4367 = getelementptr inbounds float* %tmp4366, i64 1
+  %tmp4368 = getelementptr inbounds float* %tmp4367, i64 1
+  %tmp4369 = getelementptr inbounds float* %tmp4368, i64 1
+  %tmp4370 = getelementptr inbounds float* %tmp4369, i64 1
+  %tmp4371 = getelementptr inbounds float* %tmp4370, i64 1
+  %tmp4372 = getelementptr inbounds float* %tmp4371, i64 1
+  %tmp4373 = getelementptr inbounds float* %tmp4372, i64 1
+  %tmp4374 = getelementptr inbounds float* %tmp4373, i64 1
+  %tmp4375 = getelementptr inbounds float* %tmp4374, i64 1
+  %tmp4376 = getelementptr inbounds float* %tmp4375, i64 1
+  %tmp4377 = getelementptr inbounds float* %tmp4376, i64 1
+  %tmp4378 = getelementptr inbounds float* %tmp4377, i64 1
+  %tmp4379 = getelementptr inbounds float* %tmp4378, i64 1
+  %tmp4380 = getelementptr inbounds float* %tmp4379, i64 1
+  %tmp4381 = getelementptr inbounds float* %tmp4380, i64 1
+  %tmp4382 = getelementptr inbounds float* %tmp4381, i64 1
+  %tmp4383 = getelementptr inbounds float* %tmp4382, i64 1
+  %tmp4384 = getelementptr inbounds float* %tmp4383, i64 1
+  %tmp4385 = getelementptr inbounds float* %tmp4384, i64 1
+  %tmp4386 = getelementptr inbounds float* %tmp4385, i64 1
+  %tmp4387 = getelementptr inbounds float* %tmp4386, i64 1
+  %tmp4388 = getelementptr inbounds float* %tmp4387, i64 1
+  %tmp4389 = getelementptr inbounds float* %tmp4388, i64 1
+  %tmp4390 = getelementptr inbounds float* %tmp4389, i64 1
+  %tmp4391 = getelementptr inbounds float* %tmp4390, i64 1
+  %tmp4392 = getelementptr inbounds float* %tmp4391, i64 1
+  %tmp4393 = getelementptr inbounds float* %tmp4392, i64 1
+  %tmp4394 = getelementptr inbounds float* %tmp4393, i64 1
+  %tmp4395 = getelementptr inbounds float* %tmp4394, i64 1
+  %tmp4396 = getelementptr inbounds float* %tmp4395, i64 1
+  %tmp4397 = getelementptr inbounds float* %tmp4396, i64 1
+  %tmp4398 = getelementptr inbounds float* %tmp4397, i64 1
+  %tmp4399 = getelementptr inbounds float* %tmp4398, i64 1
+  %tmp4400 = getelementptr inbounds float* %tmp4399, i64 1
+  %tmp4401 = getelementptr inbounds float* %tmp4400, i64 1
+  %tmp4402 = getelementptr inbounds float* %tmp4401, i64 1
+  %tmp4403 = getelementptr inbounds float* %tmp4402, i64 1
+  %tmp4404 = getelementptr inbounds float* %tmp4403, i64 1
+  %tmp4405 = getelementptr inbounds float* %tmp4404, i64 1
+  %tmp4406 = getelementptr inbounds float* %tmp4405, i64 1
+  %tmp4407 = getelementptr inbounds float* %tmp4406, i64 1
+  %tmp4408 = getelementptr inbounds float* %tmp4407, i64 1
+  %tmp4409 = getelementptr inbounds float* %tmp4408, i64 1
+  %tmp4410 = getelementptr inbounds float* %tmp4409, i64 1
+  %tmp4411 = getelementptr inbounds float* %tmp4410, i64 1
+  %tmp4412 = getelementptr inbounds float* %tmp4411, i64 1
+  %tmp4413 = getelementptr inbounds float* %tmp4412, i64 1
+  %tmp4414 = getelementptr inbounds float* %tmp4413, i64 1
+  %tmp4415 = getelementptr inbounds float* %tmp4414, i64 1
+  %tmp4416 = getelementptr inbounds float* %tmp4415, i64 1
+  %tmp4417 = getelementptr inbounds float* %tmp4416, i64 1
+  %tmp4418 = getelementptr inbounds float* %tmp4417, i64 1
+  %tmp4419 = getelementptr inbounds float* %tmp4418, i64 1
+  %tmp4420 = getelementptr inbounds float* %tmp4419, i64 1
+  %tmp4421 = getelementptr inbounds float* %tmp4420, i64 1
+  %tmp4422 = getelementptr inbounds float* %tmp4421, i64 1
+  %tmp4423 = getelementptr inbounds float* %tmp4422, i64 1
+  %tmp4424 = getelementptr inbounds float* %tmp4423, i64 1
+  %tmp4425 = getelementptr inbounds float* %tmp4424, i64 1
+  %tmp4426 = getelementptr inbounds float* %tmp4425, i64 1
+  %tmp4427 = getelementptr inbounds float* %tmp4426, i64 1
+  %tmp4428 = getelementptr inbounds float* %tmp4427, i64 1
+  %tmp4429 = getelementptr inbounds float* %tmp4428, i64 1
+  %tmp4430 = getelementptr inbounds float* %tmp4429, i64 1
+  %tmp4431 = getelementptr inbounds float* %tmp4430, i64 1
+  %tmp4432 = getelementptr inbounds float* %tmp4431, i64 1
+  %tmp4433 = getelementptr inbounds float* %tmp4432, i64 1
+  %tmp4434 = getelementptr inbounds float* %tmp4433, i64 1
+  %tmp4435 = getelementptr inbounds float* %tmp4434, i64 1
+  %tmp4436 = getelementptr inbounds float* %tmp4435, i64 1
+  %tmp4437 = getelementptr inbounds float* %tmp4436, i64 1
+  %tmp4438 = getelementptr inbounds float* %tmp4437, i64 1
+  %tmp4439 = getelementptr inbounds float* %tmp4438, i64 1
+  %tmp4440 = getelementptr inbounds float* %tmp4439, i64 1
+  %tmp4441 = getelementptr inbounds float* %tmp4440, i64 1
+  %tmp4442 = getelementptr inbounds float* %tmp4441, i64 1
+  %tmp4443 = getelementptr inbounds float* %tmp4442, i64 1
+  %tmp4444 = getelementptr inbounds float* %tmp4443, i64 1
+  %tmp4445 = getelementptr inbounds float* %tmp4444, i64 1
+  %tmp4446 = getelementptr inbounds float* %tmp4445, i64 1
+  %tmp4447 = getelementptr inbounds float* %tmp4446, i64 1
+  %tmp4448 = getelementptr inbounds float* %tmp4447, i64 1
+  %tmp4449 = getelementptr inbounds float* %tmp4448, i64 1
+  %tmp4450 = getelementptr inbounds float* %tmp4449, i64 1
+  %tmp4451 = getelementptr inbounds float* %tmp4450, i64 1
+  %tmp4452 = getelementptr inbounds float* %tmp4451, i64 1
+  %tmp4453 = getelementptr inbounds float* %tmp4452, i64 1
+  %tmp4454 = getelementptr inbounds float* %tmp4453, i64 1
+  %tmp4455 = getelementptr inbounds float* %tmp4454, i64 1
+  %tmp4456 = getelementptr inbounds float* %tmp4455, i64 1
+  %tmp4457 = getelementptr inbounds float* %tmp4456, i64 1
+  %tmp4458 = getelementptr inbounds float* %tmp4457, i64 1
+  %tmp4459 = getelementptr inbounds float* %tmp4458, i64 1
+  %tmp4460 = getelementptr inbounds float* %tmp4459, i64 1
+  %tmp4461 = getelementptr inbounds float* %tmp4460, i64 1
+  %tmp4462 = getelementptr inbounds float* %tmp4461, i64 1
+  %tmp4463 = getelementptr inbounds float* %tmp4462, i64 1
+  %tmp4464 = getelementptr inbounds float* %tmp4463, i64 1
+  %tmp4465 = getelementptr inbounds float* %tmp4464, i64 1
+  %tmp4466 = getelementptr inbounds float* %tmp4465, i64 1
+  %tmp4467 = getelementptr inbounds float* %tmp4466, i64 1
+  %tmp4468 = getelementptr inbounds float* %tmp4467, i64 1
+  %tmp4469 = getelementptr inbounds float* %tmp4468, i64 1
+  %tmp4470 = getelementptr inbounds float* %tmp4469, i64 1
+  %tmp4471 = getelementptr inbounds float* %tmp4470, i64 1
+  %tmp4472 = getelementptr inbounds float* %tmp4471, i64 1
+  %tmp4473 = getelementptr inbounds float* %tmp4472, i64 1
+  %tmp4474 = getelementptr inbounds float* %tmp4473, i64 1
+  %tmp4475 = getelementptr inbounds float* %tmp4474, i64 1
+  %tmp4476 = getelementptr inbounds float* %tmp4475, i64 1
+  %tmp4477 = getelementptr inbounds float* %tmp4476, i64 1
+  %tmp4478 = getelementptr inbounds float* %tmp4477, i64 1
+  %tmp4479 = getelementptr inbounds float* %tmp4478, i64 1
+  %tmp4480 = getelementptr inbounds float* %tmp4479, i64 1
+  %tmp4481 = getelementptr inbounds float* %tmp4480, i64 1
+  %tmp4482 = getelementptr inbounds float* %tmp4481, i64 1
+  %tmp4483 = getelementptr inbounds float* %tmp4482, i64 1
+  %tmp4484 = getelementptr inbounds float* %tmp4483, i64 1
+  %tmp4485 = getelementptr inbounds float* %tmp4484, i64 1
+  %tmp4486 = getelementptr inbounds float* %tmp4485, i64 1
+  %tmp4487 = getelementptr inbounds float* %tmp4486, i64 1
+  %tmp4488 = getelementptr inbounds float* %tmp4487, i64 1
+  %tmp4489 = getelementptr inbounds float* %tmp4488, i64 1
+  %tmp4490 = getelementptr inbounds float* %tmp4489, i64 1
+  %tmp4491 = getelementptr inbounds float* %tmp4490, i64 1
+  %tmp4492 = getelementptr inbounds float* %tmp4491, i64 1
+  %tmp4493 = getelementptr inbounds float* %tmp4492, i64 1
+  %tmp4494 = getelementptr inbounds float* %tmp4493, i64 1
+  %tmp4495 = getelementptr inbounds float* %tmp4494, i64 1
+  %tmp4496 = getelementptr inbounds float* %tmp4495, i64 1
+  %tmp4497 = getelementptr inbounds float* %tmp4496, i64 1
+  %tmp4498 = getelementptr inbounds float* %tmp4497, i64 1
+  %tmp4499 = getelementptr inbounds float* %tmp4498, i64 1
+  %tmp4500 = getelementptr inbounds float* %tmp4499, i64 1
+  %tmp4501 = getelementptr inbounds float* %tmp4500, i64 1
+  %tmp4502 = getelementptr inbounds float* %tmp4501, i64 1
+  %tmp4503 = getelementptr inbounds float* %tmp4502, i64 1
+  %tmp4504 = getelementptr inbounds float* %tmp4503, i64 1
+  %tmp4505 = getelementptr inbounds float* %tmp4504, i64 1
+  %tmp4506 = getelementptr inbounds float* %tmp4505, i64 1
+  %tmp4507 = getelementptr inbounds float* %tmp4506, i64 1
+  %tmp4508 = getelementptr inbounds float* %tmp4507, i64 1
+  %tmp4509 = getelementptr inbounds float* %tmp4508, i64 1
+  %tmp4510 = getelementptr inbounds float* %tmp4509, i64 1
+  %tmp4511 = getelementptr inbounds float* %tmp4510, i64 1
+  %tmp4512 = getelementptr inbounds float* %tmp4511, i64 1
+  %tmp4513 = getelementptr inbounds float* %tmp4512, i64 1
+  %tmp4514 = getelementptr inbounds float* %tmp4513, i64 1
+  %tmp4515 = getelementptr inbounds float* %tmp4514, i64 1
+  %tmp4516 = getelementptr inbounds float* %tmp4515, i64 1
+  %tmp4517 = getelementptr inbounds float* %tmp4516, i64 1
+  %tmp4518 = getelementptr inbounds float* %tmp4517, i64 1
+  %tmp4519 = getelementptr inbounds float* %tmp4518, i64 1
+  %tmp4520 = getelementptr inbounds float* %tmp4519, i64 1
+  %tmp4521 = getelementptr inbounds float* %tmp4520, i64 1
+  %tmp4522 = getelementptr inbounds float* %tmp4521, i64 1
+  %tmp4523 = getelementptr inbounds float* %tmp4522, i64 1
+  %tmp4524 = getelementptr inbounds float* %tmp4523, i64 1
+  %tmp4525 = getelementptr inbounds float* %tmp4524, i64 1
+  %tmp4526 = getelementptr inbounds float* %tmp4525, i64 1
+  %tmp4527 = getelementptr inbounds float* %tmp4526, i64 1
+  %tmp4528 = getelementptr inbounds float* %tmp4527, i64 1
+  %tmp4529 = getelementptr inbounds float* %tmp4528, i64 1
+  %tmp4530 = getelementptr inbounds float* %tmp4529, i64 1
+  %tmp4531 = getelementptr inbounds float* %tmp4530, i64 1
+  %tmp4532 = getelementptr inbounds float* %tmp4531, i64 1
+  %tmp4533 = getelementptr inbounds float* %tmp4532, i64 1
+  %tmp4534 = getelementptr inbounds float* %tmp4533, i64 1
+  %tmp4535 = getelementptr inbounds float* %tmp4534, i64 1
+  %tmp4536 = getelementptr inbounds float* %tmp4535, i64 1
+  %tmp4537 = getelementptr inbounds float* %tmp4536, i64 1
+  %tmp4538 = getelementptr inbounds float* %tmp4537, i64 1
+  %tmp4539 = getelementptr inbounds float* %tmp4538, i64 1
+  %tmp4540 = getelementptr inbounds float* %tmp4539, i64 1
+  %tmp4541 = getelementptr inbounds float* %tmp4540, i64 1
+  %tmp4542 = getelementptr inbounds float* %tmp4541, i64 1
+  %tmp4543 = getelementptr inbounds float* %tmp4542, i64 1
+  %tmp4544 = getelementptr inbounds float* %tmp4543, i64 1
+  %tmp4545 = getelementptr inbounds float* %tmp4544, i64 1
+  %tmp4546 = getelementptr inbounds float* %tmp4545, i64 1
+  %tmp4547 = getelementptr inbounds float* %tmp4546, i64 1
+  %tmp4548 = getelementptr inbounds float* %tmp4547, i64 1
+  %tmp4549 = getelementptr inbounds float* %tmp4548, i64 1
+  %tmp4550 = getelementptr inbounds float* %tmp4549, i64 1
+  %tmp4551 = getelementptr inbounds float* %tmp4550, i64 1
+  %tmp4552 = getelementptr inbounds float* %tmp4551, i64 1
+  %tmp4553 = getelementptr inbounds float* %tmp4552, i64 1
+  %tmp4554 = getelementptr inbounds float* %tmp4553, i64 1
+  %tmp4555 = getelementptr inbounds float* %tmp4554, i64 1
+  %tmp4556 = getelementptr inbounds float* %tmp4555, i64 1
+  %tmp4557 = getelementptr inbounds float* %tmp4556, i64 1
+  %tmp4558 = getelementptr inbounds float* %tmp4557, i64 1
+  %tmp4559 = getelementptr inbounds float* %tmp4558, i64 1
+  %tmp4560 = getelementptr inbounds float* %tmp4559, i64 1
+  %tmp4561 = getelementptr inbounds float* %tmp4560, i64 1
+  %tmp4562 = getelementptr inbounds float* %tmp4561, i64 1
+  %tmp4563 = getelementptr inbounds float* %tmp4562, i64 1
+  %tmp4564 = getelementptr inbounds float* %tmp4563, i64 1
+  %tmp4565 = getelementptr inbounds float* %tmp4564, i64 1
+  %tmp4566 = getelementptr inbounds float* %tmp4565, i64 1
+  %tmp4567 = getelementptr inbounds float* %tmp4566, i64 1
+  %tmp4568 = getelementptr inbounds float* %tmp4567, i64 1
+  %tmp4569 = getelementptr inbounds float* %tmp4568, i64 1
+  %tmp4570 = getelementptr inbounds float* %tmp4569, i64 1
+  %tmp4571 = getelementptr inbounds float* %tmp4570, i64 1
+  %tmp4572 = getelementptr inbounds float* %tmp4571, i64 1
+  %tmp4573 = getelementptr inbounds float* %tmp4572, i64 1
+  %tmp4574 = getelementptr inbounds float* %tmp4573, i64 1
+  %tmp4575 = getelementptr inbounds float* %tmp4574, i64 1
+  %tmp4576 = getelementptr inbounds float* %tmp4575, i64 1
+  %tmp4577 = getelementptr inbounds float* %tmp4576, i64 1
+  %tmp4578 = getelementptr inbounds float* %tmp4577, i64 1
+  %tmp4579 = getelementptr inbounds float* %tmp4578, i64 1
+  %tmp4580 = getelementptr inbounds float* %tmp4579, i64 1
+  %tmp4581 = getelementptr inbounds float* %tmp4580, i64 1
+  %tmp4582 = getelementptr inbounds float* %tmp4581, i64 1
+  %tmp4583 = getelementptr inbounds float* %tmp4582, i64 1
+  %tmp4584 = getelementptr inbounds float* %tmp4583, i64 1
+  %tmp4585 = getelementptr inbounds float* %tmp4584, i64 1
+  %tmp4586 = getelementptr inbounds float* %tmp4585, i64 1
+  %tmp4587 = getelementptr inbounds float* %tmp4586, i64 1
+  %tmp4588 = getelementptr inbounds float* %tmp4587, i64 1
+  %tmp4589 = getelementptr inbounds float* %tmp4588, i64 1
+  %tmp4590 = getelementptr inbounds float* %tmp4589, i64 1
+  %tmp4591 = getelementptr inbounds float* %tmp4590, i64 1
+  %tmp4592 = getelementptr inbounds float* %tmp4591, i64 1
+  %tmp4593 = getelementptr inbounds float* %tmp4592, i64 1
+  %tmp4594 = getelementptr inbounds float* %tmp4593, i64 1
+  %tmp4595 = getelementptr inbounds float* %tmp4594, i64 1
+  %tmp4596 = getelementptr inbounds float* %tmp4595, i64 1
+  %tmp4597 = getelementptr inbounds float* %tmp4596, i64 1
+  %tmp4598 = getelementptr inbounds float* %tmp4597, i64 1
+  %tmp4599 = getelementptr inbounds float* %tmp4598, i64 1
+  %tmp4600 = getelementptr inbounds float* %tmp4599, i64 1
+  %tmp4601 = getelementptr inbounds float* %tmp4600, i64 1
+  %tmp4602 = getelementptr inbounds float* %tmp4601, i64 1
+  %tmp4603 = getelementptr inbounds float* %tmp4602, i64 1
+  %tmp4604 = getelementptr inbounds float* %tmp4603, i64 1
+  %tmp4605 = getelementptr inbounds float* %tmp4604, i64 1
+  %tmp4606 = getelementptr inbounds float* %tmp4605, i64 1
+  %tmp4607 = getelementptr inbounds float* %tmp4606, i64 1
+  %tmp4608 = getelementptr inbounds float* %tmp4607, i64 1
+  %tmp4609 = getelementptr inbounds float* %tmp4608, i64 1
+  %tmp4610 = getelementptr inbounds float* %tmp4609, i64 1
+  %tmp4611 = getelementptr inbounds float* %tmp4610, i64 1
+  %tmp4612 = getelementptr inbounds float* %tmp4611, i64 1
+  %tmp4613 = getelementptr inbounds float* %tmp4612, i64 1
+  %tmp4614 = getelementptr inbounds float* %tmp4613, i64 1
+  %tmp4615 = getelementptr inbounds float* %tmp4614, i64 1
+  %tmp4616 = getelementptr inbounds float* %tmp4615, i64 1
+  %tmp4617 = getelementptr inbounds float* %tmp4616, i64 1
+  %tmp4618 = getelementptr inbounds float* %tmp4617, i64 1
+  %tmp4619 = getelementptr inbounds float* %tmp4618, i64 1
+  %tmp4620 = getelementptr inbounds float* %tmp4619, i64 1
+  %tmp4621 = getelementptr inbounds float* %tmp4620, i64 1
+  %tmp4622 = getelementptr inbounds float* %tmp4621, i64 1
+  %tmp4623 = getelementptr inbounds float* %tmp4622, i64 1
+  %tmp4624 = getelementptr inbounds float* %tmp4623, i64 1
+  %tmp4625 = getelementptr inbounds float* %tmp4624, i64 1
+  %tmp4626 = getelementptr inbounds float* %tmp4625, i64 1
+  %tmp4627 = getelementptr inbounds float* %tmp4626, i64 1
+  %tmp4628 = getelementptr inbounds float* %tmp4627, i64 1
+  %tmp4629 = getelementptr inbounds float* %tmp4628, i64 1
+  %tmp4630 = getelementptr inbounds float* %tmp4629, i64 1
+  %tmp4631 = getelementptr inbounds float* %tmp4630, i64 1
+  %tmp4632 = getelementptr inbounds float* %tmp4631, i64 1
+  %tmp4633 = getelementptr inbounds float* %tmp4632, i64 1
+  %tmp4634 = getelementptr inbounds float* %tmp4633, i64 1
+  %tmp4635 = getelementptr inbounds float* %tmp4634, i64 1
+  %tmp4636 = getelementptr inbounds float* %tmp4635, i64 1
+  %tmp4637 = getelementptr inbounds float* %tmp4636, i64 1
+  %tmp4638 = getelementptr inbounds float* %tmp4637, i64 1
+  %tmp4639 = getelementptr inbounds float* %tmp4638, i64 1
+  %tmp4640 = getelementptr inbounds float* %tmp4639, i64 1
+  %tmp4641 = getelementptr inbounds float* %tmp4640, i64 1
+  %tmp4642 = getelementptr inbounds float* %tmp4641, i64 1
+  %tmp4643 = getelementptr inbounds float* %tmp4642, i64 1
+  %tmp4644 = getelementptr inbounds float* %tmp4643, i64 1
+  %tmp4645 = getelementptr inbounds float* %tmp4644, i64 1
+  %tmp4646 = getelementptr inbounds float* %tmp4645, i64 1
+  %tmp4647 = getelementptr inbounds float* %tmp4646, i64 1
+  %tmp4648 = getelementptr inbounds float* %tmp4647, i64 1
+  %tmp4649 = getelementptr inbounds float* %tmp4648, i64 1
+  %tmp4650 = getelementptr inbounds float* %tmp4649, i64 1
+  %tmp4651 = getelementptr inbounds float* %tmp4650, i64 1
+  %tmp4652 = getelementptr inbounds float* %tmp4651, i64 1
+  %tmp4653 = getelementptr inbounds float* %tmp4652, i64 1
+  %tmp4654 = getelementptr inbounds float* %tmp4653, i64 1
+  %tmp4655 = getelementptr inbounds float* %tmp4654, i64 1
+  %tmp4656 = getelementptr inbounds float* %tmp4655, i64 1
+  %tmp4657 = getelementptr inbounds float* %tmp4656, i64 1
+  %tmp4658 = getelementptr inbounds float* %tmp4657, i64 1
+  %tmp4659 = getelementptr inbounds float* %tmp4658, i64 1
+  %tmp4660 = getelementptr inbounds float* %tmp4659, i64 1
+  %tmp4661 = getelementptr inbounds float* %tmp4660, i64 1
+  %tmp4662 = getelementptr inbounds float* %tmp4661, i64 1
+  %tmp4663 = getelementptr inbounds float* %tmp4662, i64 1
+  %tmp4664 = getelementptr inbounds float* %tmp4663, i64 1
+  %tmp4665 = getelementptr inbounds float* %tmp4664, i64 1
+  %tmp4666 = getelementptr inbounds float* %tmp4665, i64 1
+  %tmp4667 = getelementptr inbounds float* %tmp4666, i64 1
+  %tmp4668 = getelementptr inbounds float* %tmp4667, i64 1
+  %tmp4669 = getelementptr inbounds float* %tmp4668, i64 1
+  %tmp4670 = getelementptr inbounds float* %tmp4669, i64 1
+  %tmp4671 = getelementptr inbounds float* %tmp4670, i64 1
+  %tmp4672 = getelementptr inbounds float* %tmp4671, i64 1
+  %tmp4673 = getelementptr inbounds float* %tmp4672, i64 1
+  %tmp4674 = getelementptr inbounds float* %tmp4673, i64 1
+  %tmp4675 = getelementptr inbounds float* %tmp4674, i64 1
+  %tmp4676 = getelementptr inbounds float* %tmp4675, i64 1
+  %tmp4677 = getelementptr inbounds float* %tmp4676, i64 1
+  %tmp4678 = getelementptr inbounds float* %tmp4677, i64 1
+  %tmp4679 = getelementptr inbounds float* %tmp4678, i64 1
+  %tmp4680 = getelementptr inbounds float* %tmp4679, i64 1
+  %tmp4681 = getelementptr inbounds float* %tmp4680, i64 1
+  %tmp4682 = getelementptr inbounds float* %tmp4681, i64 1
+  %tmp4683 = getelementptr inbounds float* %tmp4682, i64 1
+  %tmp4684 = getelementptr inbounds float* %tmp4683, i64 1
+  %tmp4685 = getelementptr inbounds float* %tmp4684, i64 1
+  %tmp4686 = getelementptr inbounds float* %tmp4685, i64 1
+  %tmp4687 = getelementptr inbounds float* %tmp4686, i64 1
+  %tmp4688 = getelementptr inbounds float* %tmp4687, i64 1
+  %tmp4689 = getelementptr inbounds float* %tmp4688, i64 1
+  %tmp4690 = getelementptr inbounds float* %tmp4689, i64 1
+  %tmp4691 = getelementptr inbounds float* %tmp4690, i64 1
+  %tmp4692 = getelementptr inbounds float* %tmp4691, i64 1
+  %tmp4693 = getelementptr inbounds float* %tmp4692, i64 1
+  %tmp4694 = getelementptr inbounds float* %tmp4693, i64 1
+  %tmp4695 = getelementptr inbounds float* %tmp4694, i64 1
+  %tmp4696 = getelementptr inbounds float* %tmp4695, i64 1
+  %tmp4697 = getelementptr inbounds float* %tmp4696, i64 1
+  %tmp4698 = getelementptr inbounds float* %tmp4697, i64 1
+  %tmp4699 = getelementptr inbounds float* %tmp4698, i64 1
+  %tmp4700 = getelementptr inbounds float* %tmp4699, i64 1
+  %tmp4701 = getelementptr inbounds float* %tmp4700, i64 1
+  %tmp4702 = getelementptr inbounds float* %tmp4701, i64 1
+  %tmp4703 = getelementptr inbounds float* %tmp4702, i64 1
+  %tmp4704 = getelementptr inbounds float* %tmp4703, i64 1
+  %tmp4705 = getelementptr inbounds float* %tmp4704, i64 1
+  %tmp4706 = getelementptr inbounds float* %tmp4705, i64 1
+  %tmp4707 = getelementptr inbounds float* %tmp4706, i64 1
+  %tmp4708 = getelementptr inbounds float* %tmp4707, i64 1
+  %tmp4709 = getelementptr inbounds float* %tmp4708, i64 1
+  %tmp4710 = getelementptr inbounds float* %tmp4709, i64 1
+  %tmp4711 = getelementptr inbounds float* %tmp4710, i64 1
+  %tmp4712 = getelementptr inbounds float* %tmp4711, i64 1
+  %tmp4713 = getelementptr inbounds float* %tmp4712, i64 1
+  %tmp4714 = getelementptr inbounds float* %tmp4713, i64 1
+  %tmp4715 = getelementptr inbounds float* %tmp4714, i64 1
+  %tmp4716 = getelementptr inbounds float* %tmp4715, i64 1
+  %tmp4717 = getelementptr inbounds float* %tmp4716, i64 1
+  %tmp4718 = getelementptr inbounds float* %tmp4717, i64 1
+  %tmp4719 = getelementptr inbounds float* %tmp4718, i64 1
+  %tmp4720 = getelementptr inbounds float* %tmp4719, i64 1
+  %tmp4721 = getelementptr inbounds float* %tmp4720, i64 1
+  %tmp4722 = getelementptr inbounds float* %tmp4721, i64 1
+  %tmp4723 = getelementptr inbounds float* %tmp4722, i64 1
+  %tmp4724 = getelementptr inbounds float* %tmp4723, i64 1
+  %tmp4725 = getelementptr inbounds float* %tmp4724, i64 1
+  %tmp4726 = getelementptr inbounds float* %tmp4725, i64 1
+  %tmp4727 = getelementptr inbounds float* %tmp4726, i64 1
+  %tmp4728 = getelementptr inbounds float* %tmp4727, i64 1
+  %tmp4729 = getelementptr inbounds float* %tmp4728, i64 1
+  %tmp4730 = getelementptr inbounds float* %tmp4729, i64 1
+  %tmp4731 = getelementptr inbounds float* %tmp4730, i64 1
+  %tmp4732 = getelementptr inbounds float* %tmp4731, i64 1
+  %tmp4733 = getelementptr inbounds float* %tmp4732, i64 1
+  %tmp4734 = getelementptr inbounds float* %tmp4733, i64 1
+  %tmp4735 = getelementptr inbounds float* %tmp4734, i64 1
+  %tmp4736 = getelementptr inbounds float* %tmp4735, i64 1
+  %tmp4737 = getelementptr inbounds float* %tmp4736, i64 1
+  %tmp4738 = getelementptr inbounds float* %tmp4737, i64 1
+  %tmp4739 = getelementptr inbounds float* %tmp4738, i64 1
+  %tmp4740 = getelementptr inbounds float* %tmp4739, i64 1
+  %tmp4741 = getelementptr inbounds float* %tmp4740, i64 1
+  %tmp4742 = getelementptr inbounds float* %tmp4741, i64 1
+  %tmp4743 = getelementptr inbounds float* %tmp4742, i64 1
+  %tmp4744 = getelementptr inbounds float* %tmp4743, i64 1
+  %tmp4745 = getelementptr inbounds float* %tmp4744, i64 1
+  %tmp4746 = getelementptr inbounds float* %tmp4745, i64 1
+  %tmp4747 = getelementptr inbounds float* %tmp4746, i64 1
+  %tmp4748 = getelementptr inbounds float* %tmp4747, i64 1
+  %tmp4749 = getelementptr inbounds float* %tmp4748, i64 1
+  %tmp4750 = getelementptr inbounds float* %tmp4749, i64 1
+  %tmp4751 = getelementptr inbounds float* %tmp4750, i64 1
+  %tmp4752 = getelementptr inbounds float* %tmp4751, i64 1
+  %tmp4753 = getelementptr inbounds float* %tmp4752, i64 1
+  %tmp4754 = getelementptr inbounds float* %tmp4753, i64 1
+  %tmp4755 = getelementptr inbounds float* %tmp4754, i64 1
+  %tmp4756 = getelementptr inbounds float* %tmp4755, i64 1
+  %tmp4757 = getelementptr inbounds float* %tmp4756, i64 1
+  %tmp4758 = getelementptr inbounds float* %tmp4757, i64 1
+  %tmp4759 = getelementptr inbounds float* %tmp4758, i64 1
+  %tmp4760 = getelementptr inbounds float* %tmp4759, i64 1
+  %tmp4761 = getelementptr inbounds float* %tmp4760, i64 1
+  %tmp4762 = getelementptr inbounds float* %tmp4761, i64 1
+  %tmp4763 = getelementptr inbounds float* %tmp4762, i64 1
+  %tmp4764 = getelementptr inbounds float* %tmp4763, i64 1
+  %tmp4765 = getelementptr inbounds float* %tmp4764, i64 1
+  %tmp4766 = getelementptr inbounds float* %tmp4765, i64 1
+  %tmp4767 = getelementptr inbounds float* %tmp4766, i64 1
+  %tmp4768 = getelementptr inbounds float* %tmp4767, i64 1
+  %tmp4769 = getelementptr inbounds float* %tmp4768, i64 1
+  %tmp4770 = getelementptr inbounds float* %tmp4769, i64 1
+  %tmp4771 = getelementptr inbounds float* %tmp4770, i64 1
+  %tmp4772 = getelementptr inbounds float* %tmp4771, i64 1
+  %tmp4773 = getelementptr inbounds float* %tmp4772, i64 1
+  %tmp4774 = getelementptr inbounds float* %tmp4773, i64 1
+  %tmp4775 = getelementptr inbounds float* %tmp4774, i64 1
+  %tmp4776 = getelementptr inbounds float* %tmp4775, i64 1
+  %tmp4777 = getelementptr inbounds float* %tmp4776, i64 1
+  %tmp4778 = getelementptr inbounds float* %tmp4777, i64 1
+  %tmp4779 = getelementptr inbounds float* %tmp4778, i64 1
+  %tmp4780 = getelementptr inbounds float* %tmp4779, i64 1
+  %tmp4781 = getelementptr inbounds float* %tmp4780, i64 1
+  %tmp4782 = getelementptr inbounds float* %tmp4781, i64 1
+  %tmp4783 = getelementptr inbounds float* %tmp4782, i64 1
+  %tmp4784 = getelementptr inbounds float* %tmp4783, i64 1
+  %tmp4785 = getelementptr inbounds float* %tmp4784, i64 1
+  %tmp4786 = getelementptr inbounds float* %tmp4785, i64 1
+  %tmp4787 = getelementptr inbounds float* %tmp4786, i64 1
+  %tmp4788 = getelementptr inbounds float* %tmp4787, i64 1
+  %tmp4789 = getelementptr inbounds float* %tmp4788, i64 1
+  %tmp4790 = getelementptr inbounds float* %tmp4789, i64 1
+  %tmp4791 = getelementptr inbounds float* %tmp4790, i64 1
+  %tmp4792 = getelementptr inbounds float* %tmp4791, i64 1
+  %tmp4793 = getelementptr inbounds float* %tmp4792, i64 1
+  %tmp4794 = getelementptr inbounds float* %tmp4793, i64 1
+  %tmp4795 = getelementptr inbounds float* %tmp4794, i64 1
+  %tmp4796 = getelementptr inbounds float* %tmp4795, i64 1
+  %tmp4797 = getelementptr inbounds float* %tmp4796, i64 1
+  %tmp4798 = getelementptr inbounds float* %tmp4797, i64 1
+  %tmp4799 = getelementptr inbounds float* %tmp4798, i64 1
+  %tmp4800 = getelementptr inbounds float* %tmp4799, i64 1
+  %tmp4801 = getelementptr inbounds float* %tmp4800, i64 1
+  %tmp4802 = getelementptr inbounds float* %tmp4801, i64 1
+  %tmp4803 = getelementptr inbounds float* %tmp4802, i64 1
+  %tmp4804 = getelementptr inbounds float* %tmp4803, i64 1
+  %tmp4805 = getelementptr inbounds float* %tmp4804, i64 1
+  %tmp4806 = getelementptr inbounds float* %tmp4805, i64 1
+  %tmp4807 = getelementptr inbounds float* %tmp4806, i64 1
+  %tmp4808 = getelementptr inbounds float* %tmp4807, i64 1
+  %tmp4809 = getelementptr inbounds float* %tmp4808, i64 1
+  %tmp4810 = getelementptr inbounds float* %tmp4809, i64 1
+  %tmp4811 = getelementptr inbounds float* %tmp4810, i64 1
+  %tmp4812 = getelementptr inbounds float* %tmp4811, i64 1
+  %tmp4813 = getelementptr inbounds float* %tmp4812, i64 1
+  %tmp4814 = getelementptr inbounds float* %tmp4813, i64 1
+  %tmp4815 = getelementptr inbounds float* %tmp4814, i64 1
+  %tmp4816 = getelementptr inbounds float* %tmp4815, i64 1
+  %tmp4817 = getelementptr inbounds float* %tmp4816, i64 1
+  %tmp4818 = getelementptr inbounds float* %tmp4817, i64 1
+  %tmp4819 = getelementptr inbounds float* %tmp4818, i64 1
+  %tmp4820 = getelementptr inbounds float* %tmp4819, i64 1
+  %tmp4821 = getelementptr inbounds float* %tmp4820, i64 1
+  %tmp4822 = getelementptr inbounds float* %tmp4821, i64 1
+  %tmp4823 = getelementptr inbounds float* %tmp4822, i64 1
+  %tmp4824 = getelementptr inbounds float* %tmp4823, i64 1
+  %tmp4825 = getelementptr inbounds float* %tmp4824, i64 1
+  %tmp4826 = getelementptr inbounds float* %tmp4825, i64 1
+  %tmp4827 = getelementptr inbounds float* %tmp4826, i64 1
+  %tmp4828 = getelementptr inbounds float* %tmp4827, i64 1
+  %tmp4829 = getelementptr inbounds float* %tmp4828, i64 1
+  %tmp4830 = getelementptr inbounds float* %tmp4829, i64 1
+  %tmp4831 = getelementptr inbounds float* %tmp4830, i64 1
+  %tmp4832 = getelementptr inbounds float* %tmp4831, i64 1
+  %tmp4833 = getelementptr inbounds float* %tmp4832, i64 1
+  %tmp4834 = getelementptr inbounds float* %tmp4833, i64 1
+  %tmp4835 = getelementptr inbounds float* %tmp4834, i64 1
+  %tmp4836 = getelementptr inbounds float* %tmp4835, i64 1
+  %tmp4837 = getelementptr inbounds float* %tmp4836, i64 1
+  %tmp4838 = getelementptr inbounds float* %tmp4837, i64 1
+  %tmp4839 = getelementptr inbounds float* %tmp4838, i64 1
+  %tmp4840 = getelementptr inbounds float* %tmp4839, i64 1
+  %tmp4841 = getelementptr inbounds float* %tmp4840, i64 1
+  %tmp4842 = getelementptr inbounds float* %tmp4841, i64 1
+  %tmp4843 = getelementptr inbounds float* %tmp4842, i64 1
+  %tmp4844 = getelementptr inbounds float* %tmp4843, i64 1
+  %tmp4845 = getelementptr inbounds float* %tmp4844, i64 1
+  %tmp4846 = getelementptr inbounds float* %tmp4845, i64 1
+  %tmp4847 = getelementptr inbounds float* %tmp4846, i64 1
+  %tmp4848 = getelementptr inbounds float* %tmp4847, i64 1
+  %tmp4849 = getelementptr inbounds float* %tmp4848, i64 1
+  %tmp4850 = getelementptr inbounds float* %tmp4849, i64 1
+  %tmp4851 = getelementptr inbounds float* %tmp4850, i64 1
+  %tmp4852 = getelementptr inbounds float* %tmp4851, i64 1
+  %tmp4853 = getelementptr inbounds float* %tmp4852, i64 1
+  %tmp4854 = getelementptr inbounds float* %tmp4853, i64 1
+  %tmp4855 = getelementptr inbounds float* %tmp4854, i64 1
+  %tmp4856 = getelementptr inbounds float* %tmp4855, i64 1
+  %tmp4857 = getelementptr inbounds float* %tmp4856, i64 1
+  %tmp4858 = getelementptr inbounds float* %tmp4857, i64 1
+  %tmp4859 = getelementptr inbounds float* %tmp4858, i64 1
+  %tmp4860 = getelementptr inbounds float* %tmp4859, i64 1
+  %tmp4861 = getelementptr inbounds float* %tmp4860, i64 1
+  %tmp4862 = getelementptr inbounds float* %tmp4861, i64 1
+  %tmp4863 = getelementptr inbounds float* %tmp4862, i64 1
+  %tmp4864 = getelementptr inbounds float* %tmp4863, i64 1
+  %tmp4865 = getelementptr inbounds float* %tmp4864, i64 1
+  %tmp4866 = getelementptr inbounds float* %tmp4865, i64 1
+  %tmp4867 = getelementptr inbounds float* %tmp4866, i64 1
+  %tmp4868 = getelementptr inbounds float* %tmp4867, i64 1
+  %tmp4869 = getelementptr inbounds float* %tmp4868, i64 1
+  %tmp4870 = getelementptr inbounds float* %tmp4869, i64 1
+  %tmp4871 = getelementptr inbounds float* %tmp4870, i64 1
+  %tmp4872 = getelementptr inbounds float* %tmp4871, i64 1
+  %tmp4873 = getelementptr inbounds float* %tmp4872, i64 1
+  %tmp4874 = getelementptr inbounds float* %tmp4873, i64 1
+  %tmp4875 = getelementptr inbounds float* %tmp4874, i64 1
+  %tmp4876 = getelementptr inbounds float* %tmp4875, i64 1
+  %tmp4877 = getelementptr inbounds float* %tmp4876, i64 1
+  %tmp4878 = getelementptr inbounds float* %tmp4877, i64 1
+  %tmp4879 = getelementptr inbounds float* %tmp4878, i64 1
+  %tmp4880 = getelementptr inbounds float* %tmp4879, i64 1
+  %tmp4881 = getelementptr inbounds float* %tmp4880, i64 1
+  %tmp4882 = getelementptr inbounds float* %tmp4881, i64 1
+  %tmp4883 = getelementptr inbounds float* %tmp4882, i64 1
+  %tmp4884 = getelementptr inbounds float* %tmp4883, i64 1
+  %tmp4885 = getelementptr inbounds float* %tmp4884, i64 1
+  %tmp4886 = getelementptr inbounds float* %tmp4885, i64 1
+  %tmp4887 = getelementptr inbounds float* %tmp4886, i64 1
+  %tmp4888 = getelementptr inbounds float* %tmp4887, i64 1
+  %tmp4889 = getelementptr inbounds float* %tmp4888, i64 1
+  %tmp4890 = getelementptr inbounds float* %tmp4889, i64 1
+  %tmp4891 = getelementptr inbounds float* %tmp4890, i64 1
+  %tmp4892 = getelementptr inbounds float* %tmp4891, i64 1
+  %tmp4893 = getelementptr inbounds float* %tmp4892, i64 1
+  %tmp4894 = getelementptr inbounds float* %tmp4893, i64 1
+  %tmp4895 = getelementptr inbounds float* %tmp4894, i64 1
+  %tmp4896 = getelementptr inbounds float* %tmp4895, i64 1
+  %tmp4897 = getelementptr inbounds float* %tmp4896, i64 1
+  %tmp4898 = getelementptr inbounds float* %tmp4897, i64 1
+  %tmp4899 = getelementptr inbounds float* %tmp4898, i64 1
+  %tmp4900 = getelementptr inbounds float* %tmp4899, i64 1
+  %tmp4901 = getelementptr inbounds float* %tmp4900, i64 1
+  %tmp4902 = getelementptr inbounds float* %tmp4901, i64 1
+  %tmp4903 = getelementptr inbounds float* %tmp4902, i64 1
+  %tmp4904 = getelementptr inbounds float* %tmp4903, i64 1
+  %tmp4905 = getelementptr inbounds float* %tmp4904, i64 1
+  %tmp4906 = getelementptr inbounds float* %tmp4905, i64 1
+  %tmp4907 = getelementptr inbounds float* %tmp4906, i64 1
+  %tmp4908 = getelementptr inbounds float* %tmp4907, i64 1
+  %tmp4909 = getelementptr inbounds float* %tmp4908, i64 1
+  %tmp4910 = getelementptr inbounds float* %tmp4909, i64 1
+  %tmp4911 = getelementptr inbounds float* %tmp4910, i64 1
+  %tmp4912 = getelementptr inbounds float* %tmp4911, i64 1
+  %tmp4913 = getelementptr inbounds float* %tmp4912, i64 1
+  %tmp4914 = getelementptr inbounds float* %tmp4913, i64 1
+  %tmp4915 = getelementptr inbounds float* %tmp4914, i64 1
+  %tmp4916 = getelementptr inbounds float* %tmp4915, i64 1
+  %tmp4917 = getelementptr inbounds float* %tmp4916, i64 1
+  %tmp4918 = getelementptr inbounds float* %tmp4917, i64 1
+  %tmp4919 = getelementptr inbounds float* %tmp4918, i64 1
+  %tmp4920 = getelementptr inbounds float* %tmp4919, i64 1
+  %tmp4921 = getelementptr inbounds float* %tmp4920, i64 1
+  %tmp4922 = getelementptr inbounds float* %tmp4921, i64 1
+  %tmp4923 = getelementptr inbounds float* %tmp4922, i64 1
+  %tmp4924 = getelementptr inbounds float* %tmp4923, i64 1
+  %tmp4925 = getelementptr inbounds float* %tmp4924, i64 1
+  %tmp4926 = getelementptr inbounds float* %tmp4925, i64 1
+  %tmp4927 = getelementptr inbounds float* %tmp4926, i64 1
+  %tmp4928 = getelementptr inbounds float* %tmp4927, i64 1
+  %tmp4929 = getelementptr inbounds float* %tmp4928, i64 1
+  %tmp4930 = getelementptr inbounds float* %tmp4929, i64 1
+  %tmp4931 = getelementptr inbounds float* %tmp4930, i64 1
+  %tmp4932 = getelementptr inbounds float* %tmp4931, i64 1
+  %tmp4933 = getelementptr inbounds float* %tmp4932, i64 1
+  %tmp4934 = getelementptr inbounds float* %tmp4933, i64 1
+  %tmp4935 = getelementptr inbounds float* %tmp4934, i64 1
+  %tmp4936 = getelementptr inbounds float* %tmp4935, i64 1
+  %tmp4937 = getelementptr inbounds float* %tmp4936, i64 1
+  %tmp4938 = getelementptr inbounds float* %tmp4937, i64 1
+  %tmp4939 = getelementptr inbounds float* %tmp4938, i64 1
+  %tmp4940 = getelementptr inbounds float* %tmp4939, i64 1
+  %tmp4941 = getelementptr inbounds float* %tmp4940, i64 1
+  %tmp4942 = getelementptr inbounds float* %tmp4941, i64 1
+  %tmp4943 = getelementptr inbounds float* %tmp4942, i64 1
+  %tmp4944 = getelementptr inbounds float* %tmp4943, i64 1
+  %tmp4945 = getelementptr inbounds float* %tmp4944, i64 1
+  %tmp4946 = getelementptr inbounds float* %tmp4945, i64 1
+  %tmp4947 = getelementptr inbounds float* %tmp4946, i64 1
+  %tmp4948 = getelementptr inbounds float* %tmp4947, i64 1
+  %tmp4949 = getelementptr inbounds float* %tmp4948, i64 1
+  %tmp4950 = getelementptr inbounds float* %tmp4949, i64 1
+  %tmp4951 = getelementptr inbounds float* %tmp4950, i64 1
+  %tmp4952 = getelementptr inbounds float* %tmp4951, i64 1
+  %tmp4953 = getelementptr inbounds float* %tmp4952, i64 1
+  %tmp4954 = getelementptr inbounds float* %tmp4953, i64 1
+  %tmp4955 = getelementptr inbounds float* %tmp4954, i64 1
+  %tmp4956 = getelementptr inbounds float* %tmp4955, i64 1
+  %tmp4957 = getelementptr inbounds float* %tmp4956, i64 1
+  %tmp4958 = getelementptr inbounds float* %tmp4957, i64 1
+  %tmp4959 = getelementptr inbounds float* %tmp4958, i64 1
+  %tmp4960 = getelementptr inbounds float* %tmp4959, i64 1
+  %tmp4961 = getelementptr inbounds float* %tmp4960, i64 1
+  %tmp4962 = getelementptr inbounds float* %tmp4961, i64 1
+  %tmp4963 = getelementptr inbounds float* %tmp4962, i64 1
+  %tmp4964 = getelementptr inbounds float* %tmp4963, i64 1
+  %tmp4965 = getelementptr inbounds float* %tmp4964, i64 1
+  %tmp4966 = getelementptr inbounds float* %tmp4965, i64 1
+  %tmp4967 = getelementptr inbounds float* %tmp4966, i64 1
+  %tmp4968 = getelementptr inbounds float* %tmp4967, i64 1
+  %tmp4969 = getelementptr inbounds float* %tmp4968, i64 1
+  %tmp4970 = getelementptr inbounds float* %tmp4969, i64 1
+  %tmp4971 = getelementptr inbounds float* %tmp4970, i64 1
+  %tmp4972 = getelementptr inbounds float* %tmp4971, i64 1
+  %tmp4973 = getelementptr inbounds float* %tmp4972, i64 1
+  %tmp4974 = getelementptr inbounds float* %tmp4973, i64 1
+  %tmp4975 = getelementptr inbounds float* %tmp4974, i64 1
+  %tmp4976 = getelementptr inbounds float* %tmp4975, i64 1
+  %tmp4977 = getelementptr inbounds float* %tmp4976, i64 1
+  %tmp4978 = getelementptr inbounds float* %tmp4977, i64 1
+  %tmp4979 = getelementptr inbounds float* %tmp4978, i64 1
+  %tmp4980 = getelementptr inbounds float* %tmp4979, i64 1
+  %tmp4981 = getelementptr inbounds float* %tmp4980, i64 1
+  %tmp4982 = getelementptr inbounds float* %tmp4981, i64 1
+  %tmp4983 = getelementptr inbounds float* %tmp4982, i64 1
+  %tmp4984 = getelementptr inbounds float* %tmp4983, i64 1
+  %tmp4985 = getelementptr inbounds float* %tmp4984, i64 1
+  %tmp4986 = getelementptr inbounds float* %tmp4985, i64 1
+  %tmp4987 = getelementptr inbounds float* %tmp4986, i64 1
+  %tmp4988 = getelementptr inbounds float* %tmp4987, i64 1
+  %tmp4989 = getelementptr inbounds float* %tmp4988, i64 1
+  %tmp4990 = getelementptr inbounds float* %tmp4989, i64 1
+  %tmp4991 = getelementptr inbounds float* %tmp4990, i64 1
+  %tmp4992 = getelementptr inbounds float* %tmp4991, i64 1
+  %tmp4993 = getelementptr inbounds float* %tmp4992, i64 1
+  %tmp4994 = getelementptr inbounds float* %tmp4993, i64 1
+  %tmp4995 = getelementptr inbounds float* %tmp4994, i64 1
+  %tmp4996 = getelementptr inbounds float* %tmp4995, i64 1
+  %tmp4997 = getelementptr inbounds float* %tmp4996, i64 1
+  %tmp4998 = getelementptr inbounds float* %tmp4997, i64 1
+  %tmp4999 = getelementptr inbounds float* %tmp4998, i64 1
+  %tmp5000 = getelementptr inbounds float* %tmp4999, i64 1
+  %tmp5001 = getelementptr inbounds float* %tmp5000, i64 1
+  %tmp5002 = getelementptr inbounds float* %tmp5001, i64 1
+  %tmp5003 = getelementptr inbounds float* %tmp5002, i64 1
+  %tmp5004 = getelementptr inbounds float* %tmp5003, i64 1
+  %tmp5005 = getelementptr inbounds float* %tmp5004, i64 1
+  %tmp5006 = getelementptr inbounds float* %tmp5005, i64 1
+  %tmp5007 = getelementptr inbounds float* %tmp5006, i64 1
+  %tmp5008 = getelementptr inbounds float* %tmp5007, i64 1
+  %tmp5009 = getelementptr inbounds float* %tmp5008, i64 1
+  %tmp5010 = getelementptr inbounds float* %tmp5009, i64 1
+  %tmp5011 = getelementptr inbounds float* %tmp5010, i64 1
+  %tmp5012 = getelementptr inbounds float* %tmp5011, i64 1
+  %tmp5013 = getelementptr inbounds float* %tmp5012, i64 1
+  %tmp5014 = getelementptr inbounds float* %tmp5013, i64 1
+  %tmp5015 = getelementptr inbounds float* %tmp5014, i64 1
+  %tmp5016 = getelementptr inbounds float* %tmp5015, i64 1
+  %tmp5017 = getelementptr inbounds float* %tmp5016, i64 1
+  %tmp5018 = getelementptr inbounds float* %tmp5017, i64 1
+  %tmp5019 = getelementptr inbounds float* %tmp5018, i64 1
+  %tmp5020 = getelementptr inbounds float* %tmp5019, i64 1
+  %tmp5021 = getelementptr inbounds float* %tmp5020, i64 1
+  %tmp5022 = getelementptr inbounds float* %tmp5021, i64 1
+  %tmp5023 = getelementptr inbounds float* %tmp5022, i64 1
+  %tmp5024 = getelementptr inbounds float* %tmp5023, i64 1
+  %tmp5025 = getelementptr inbounds float* %tmp5024, i64 1
+  %tmp5026 = getelementptr inbounds float* %tmp5025, i64 1
+  %tmp5027 = getelementptr inbounds float* %tmp5026, i64 1
+  %tmp5028 = getelementptr inbounds float* %tmp5027, i64 1
+  %tmp5029 = getelementptr inbounds float* %tmp5028, i64 1
+  %tmp5030 = getelementptr inbounds float* %tmp5029, i64 1
+  %tmp5031 = getelementptr inbounds float* %tmp5030, i64 1
+  %tmp5032 = getelementptr inbounds float* %tmp5031, i64 1
+  %tmp5033 = getelementptr inbounds float* %tmp5032, i64 1
+  %tmp5034 = getelementptr inbounds float* %tmp5033, i64 1
+  %tmp5035 = getelementptr inbounds float* %tmp5034, i64 1
+  %tmp5036 = getelementptr inbounds float* %tmp5035, i64 1
+  %tmp5037 = getelementptr inbounds float* %tmp5036, i64 1
+  %tmp5038 = getelementptr inbounds float* %tmp5037, i64 1
+  %tmp5039 = getelementptr inbounds float* %tmp5038, i64 1
+  %tmp5040 = getelementptr inbounds float* %tmp5039, i64 1
+  %tmp5041 = getelementptr inbounds float* %tmp5040, i64 1
+  %tmp5042 = getelementptr inbounds float* %tmp5041, i64 1
+  %tmp5043 = getelementptr inbounds float* %tmp5042, i64 1
+  %tmp5044 = getelementptr inbounds float* %tmp5043, i64 1
+  %tmp5045 = getelementptr inbounds float* %tmp5044, i64 1
+  %tmp5046 = getelementptr inbounds float* %tmp5045, i64 1
+  %tmp5047 = getelementptr inbounds float* %tmp5046, i64 1
+  %tmp5048 = getelementptr inbounds float* %tmp5047, i64 1
+  %tmp5049 = getelementptr inbounds float* %tmp5048, i64 1
+  %tmp5050 = getelementptr inbounds float* %tmp5049, i64 1
+  %tmp5051 = getelementptr inbounds float* %tmp5050, i64 1
+  %tmp5052 = getelementptr inbounds float* %tmp5051, i64 1
+  %tmp5053 = getelementptr inbounds float* %tmp5052, i64 1
+  %tmp5054 = getelementptr inbounds float* %tmp5053, i64 1
+  %tmp5055 = getelementptr inbounds float* %tmp5054, i64 1
+  %tmp5056 = getelementptr inbounds float* %tmp5055, i64 1
+  %tmp5057 = getelementptr inbounds float* %tmp5056, i64 1
+  %tmp5058 = getelementptr inbounds float* %tmp5057, i64 1
+  %tmp5059 = getelementptr inbounds float* %tmp5058, i64 1
+  %tmp5060 = getelementptr inbounds float* %tmp5059, i64 1
+  %tmp5061 = getelementptr inbounds float* %tmp5060, i64 1
+  %tmp5062 = getelementptr inbounds float* %tmp5061, i64 1
+  %tmp5063 = getelementptr inbounds float* %tmp5062, i64 1
+  %tmp5064 = getelementptr inbounds float* %tmp5063, i64 1
+  %tmp5065 = getelementptr inbounds float* %tmp5064, i64 1
+  %tmp5066 = getelementptr inbounds float* %tmp5065, i64 1
+  %tmp5067 = getelementptr inbounds float* %tmp5066, i64 1
+  %tmp5068 = getelementptr inbounds float* %tmp5067, i64 1
+  %tmp5069 = getelementptr inbounds float* %tmp5068, i64 1
+  %tmp5070 = getelementptr inbounds float* %tmp5069, i64 1
+  %tmp5071 = getelementptr inbounds float* %tmp5070, i64 1
+  %tmp5072 = getelementptr inbounds float* %tmp5071, i64 1
+  %tmp5073 = getelementptr inbounds float* %tmp5072, i64 1
+  %tmp5074 = getelementptr inbounds float* %tmp5073, i64 1
+  %tmp5075 = getelementptr inbounds float* %tmp5074, i64 1
+  %tmp5076 = getelementptr inbounds float* %tmp5075, i64 1
+  %tmp5077 = getelementptr inbounds float* %tmp5076, i64 1
+  %tmp5078 = getelementptr inbounds float* %tmp5077, i64 1
+  %tmp5079 = getelementptr inbounds float* %tmp5078, i64 1
+  %tmp5080 = getelementptr inbounds float* %tmp5079, i64 1
+  %tmp5081 = getelementptr inbounds float* %tmp5080, i64 1
+  %tmp5082 = getelementptr inbounds float* %tmp5081, i64 1
+  %tmp5083 = getelementptr inbounds float* %tmp5082, i64 1
+  %tmp5084 = getelementptr inbounds float* %tmp5083, i64 1
+  %tmp5085 = getelementptr inbounds float* %tmp5084, i64 1
+  %tmp5086 = getelementptr inbounds float* %tmp5085, i64 1
+  %tmp5087 = getelementptr inbounds float* %tmp5086, i64 1
+  %tmp5088 = getelementptr inbounds float* %tmp5087, i64 1
+  %tmp5089 = getelementptr inbounds float* %tmp5088, i64 1
+  %tmp5090 = getelementptr inbounds float* %tmp5089, i64 1
+  %tmp5091 = getelementptr inbounds float* %tmp5090, i64 1
+  %tmp5092 = getelementptr inbounds float* %tmp5091, i64 1
+  %tmp5093 = getelementptr inbounds float* %tmp5092, i64 1
+  %tmp5094 = getelementptr inbounds float* %tmp5093, i64 1
+  %tmp5095 = getelementptr inbounds float* %tmp5094, i64 1
+  %tmp5096 = getelementptr inbounds float* %tmp5095, i64 1
+  %tmp5097 = getelementptr inbounds float* %tmp5096, i64 1
+  %tmp5098 = getelementptr inbounds float* %tmp5097, i64 1
+  %tmp5099 = getelementptr inbounds float* %tmp5098, i64 1
+  %tmp5100 = getelementptr inbounds float* %tmp5099, i64 1
+  %tmp5101 = getelementptr inbounds float* %tmp5100, i64 1
+  %tmp5102 = getelementptr inbounds float* %tmp5101, i64 1
+  %tmp5103 = getelementptr inbounds float* %tmp5102, i64 1
+  %tmp5104 = getelementptr inbounds float* %tmp5103, i64 1
+  %tmp5105 = getelementptr inbounds float* %tmp5104, i64 1
+  %tmp5106 = getelementptr inbounds float* %tmp5105, i64 1
+  %tmp5107 = getelementptr inbounds float* %tmp5106, i64 1
+  %tmp5108 = getelementptr inbounds float* %tmp5107, i64 1
+  %tmp5109 = getelementptr inbounds float* %tmp5108, i64 1
+  %tmp5110 = getelementptr inbounds float* %tmp5109, i64 1
+  %tmp5111 = getelementptr inbounds float* %tmp5110, i64 1
+  %tmp5112 = getelementptr inbounds float* %tmp5111, i64 1
+  %tmp5113 = getelementptr inbounds float* %tmp5112, i64 1
+  %tmp5114 = getelementptr inbounds float* %tmp5113, i64 1
+  %tmp5115 = getelementptr inbounds float* %tmp5114, i64 1
+  %tmp5116 = getelementptr inbounds float* %tmp5115, i64 1
+  %tmp5117 = getelementptr inbounds float* %tmp5116, i64 1
+  %tmp5118 = getelementptr inbounds float* %tmp5117, i64 1
+  %tmp5119 = getelementptr inbounds float* %tmp5118, i64 1
+  %tmp5120 = getelementptr inbounds float* %tmp5119, i64 1
+  %tmp5121 = getelementptr inbounds float* %tmp5120, i64 1
+  %tmp5122 = getelementptr inbounds float* %tmp5121, i64 1
+  %tmp5123 = getelementptr inbounds float* %tmp5122, i64 1
+  %tmp5124 = getelementptr inbounds float* %tmp5123, i64 1
+  %tmp5125 = getelementptr inbounds float* %tmp5124, i64 1
+  %tmp5126 = getelementptr inbounds float* %tmp5125, i64 1
+  %tmp5127 = getelementptr inbounds float* %tmp5126, i64 1
+  %tmp5128 = getelementptr inbounds float* %tmp5127, i64 1
+  %tmp5129 = getelementptr inbounds float* %tmp5128, i64 1
+  %tmp5130 = getelementptr inbounds float* %tmp5129, i64 1
+  %tmp5131 = getelementptr inbounds float* %tmp5130, i64 1
+  %tmp5132 = getelementptr inbounds float* %tmp5131, i64 1
+  %tmp5133 = getelementptr inbounds float* %tmp5132, i64 1
+  %tmp5134 = getelementptr inbounds float* %tmp5133, i64 1
+  %tmp5135 = getelementptr inbounds float* %tmp5134, i64 1
+  %tmp5136 = getelementptr inbounds float* %tmp5135, i64 1
+  %tmp5137 = getelementptr inbounds float* %tmp5136, i64 1
+  %tmp5138 = getelementptr inbounds float* %tmp5137, i64 1
+  %tmp5139 = getelementptr inbounds float* %tmp5138, i64 1
+  %tmp5140 = getelementptr inbounds float* %tmp5139, i64 1
+  %tmp5141 = getelementptr inbounds float* %tmp5140, i64 1
+  %tmp5142 = getelementptr inbounds float* %tmp5141, i64 1
+  %tmp5143 = getelementptr inbounds float* %tmp5142, i64 1
+  %tmp5144 = getelementptr inbounds float* %tmp5143, i64 1
+  %tmp5145 = getelementptr inbounds float* %tmp5144, i64 1
+  %tmp5146 = getelementptr inbounds float* %tmp5145, i64 1
+  %tmp5147 = getelementptr inbounds float* %tmp5146, i64 1
+  %tmp5148 = getelementptr inbounds float* %tmp5147, i64 1
+  %tmp5149 = getelementptr inbounds float* %tmp5148, i64 1
+  %tmp5150 = getelementptr inbounds float* %tmp5149, i64 1
+  %tmp5151 = getelementptr inbounds float* %tmp5150, i64 1
+  %tmp5152 = getelementptr inbounds float* %tmp5151, i64 1
+  %tmp5153 = getelementptr inbounds float* %tmp5152, i64 1
+  %tmp5154 = getelementptr inbounds float* %tmp5153, i64 1
+  %tmp5155 = getelementptr inbounds float* %tmp5154, i64 1
+  %tmp5156 = getelementptr inbounds float* %tmp5155, i64 1
+  %tmp5157 = getelementptr inbounds float* %tmp5156, i64 1
+  %tmp5158 = getelementptr inbounds float* %tmp5157, i64 1
+  %tmp5159 = getelementptr inbounds float* %tmp5158, i64 1
+  %tmp5160 = getelementptr inbounds float* %tmp5159, i64 1
+  %tmp5161 = getelementptr inbounds float* %tmp5160, i64 1
+  %tmp5162 = getelementptr inbounds float* %tmp5161, i64 1
+  %tmp5163 = getelementptr inbounds float* %tmp5162, i64 1
+  %tmp5164 = getelementptr inbounds float* %tmp5163, i64 1
+  %tmp5165 = getelementptr inbounds float* %tmp5164, i64 1
+  %tmp5166 = getelementptr inbounds float* %tmp5165, i64 1
+  %tmp5167 = getelementptr inbounds float* %tmp5166, i64 1
+  %tmp5168 = getelementptr inbounds float* %tmp5167, i64 1
+  %tmp5169 = getelementptr inbounds float* %tmp5168, i64 1
+  %tmp5170 = getelementptr inbounds float* %tmp5169, i64 1
+  %tmp5171 = getelementptr inbounds float* %tmp5170, i64 1
+  %tmp5172 = getelementptr inbounds float* %tmp5171, i64 1
+  %tmp5173 = getelementptr inbounds float* %tmp5172, i64 1
+  %tmp5174 = getelementptr inbounds float* %tmp5173, i64 1
+  %tmp5175 = getelementptr inbounds float* %tmp5174, i64 1
+  %tmp5176 = getelementptr inbounds float* %tmp5175, i64 1
+  %tmp5177 = getelementptr inbounds float* %tmp5176, i64 1
+  %tmp5178 = getelementptr inbounds float* %tmp5177, i64 1
+  %tmp5179 = getelementptr inbounds float* %tmp5178, i64 1
+  %tmp5180 = getelementptr inbounds float* %tmp5179, i64 1
+  %tmp5181 = getelementptr inbounds float* %tmp5180, i64 1
+  %tmp5182 = getelementptr inbounds float* %tmp5181, i64 1
+  %tmp5183 = getelementptr inbounds float* %tmp5182, i64 1
+  %tmp5184 = getelementptr inbounds float* %tmp5183, i64 1
+  %tmp5185 = getelementptr inbounds float* %tmp5184, i64 1
+  %tmp5186 = getelementptr inbounds float* %tmp5185, i64 1
+  %tmp5187 = getelementptr inbounds float* %tmp5186, i64 1
+  %tmp5188 = getelementptr inbounds float* %tmp5187, i64 1
+  %tmp5189 = getelementptr inbounds float* %tmp5188, i64 1
+  %tmp5190 = getelementptr inbounds float* %tmp5189, i64 1
+  %tmp5191 = getelementptr inbounds float* %tmp5190, i64 1
+  %tmp5192 = getelementptr inbounds float* %tmp5191, i64 1
+  %tmp5193 = getelementptr inbounds float* %tmp5192, i64 1
+  %tmp5194 = getelementptr inbounds float* %tmp5193, i64 1
+  %tmp5195 = getelementptr inbounds float* %tmp5194, i64 1
+  %tmp5196 = getelementptr inbounds float* %tmp5195, i64 1
+  %tmp5197 = getelementptr inbounds float* %tmp5196, i64 1
+  %tmp5198 = getelementptr inbounds float* %tmp5197, i64 1
+  %tmp5199 = getelementptr inbounds float* %tmp5198, i64 1
+  %tmp5200 = getelementptr inbounds float* %tmp5199, i64 1
+  %tmp5201 = getelementptr inbounds float* %tmp5200, i64 1
+  %tmp5202 = getelementptr inbounds float* %tmp5201, i64 1
+  %tmp5203 = getelementptr inbounds float* %tmp5202, i64 1
+  %tmp5204 = getelementptr inbounds float* %tmp5203, i64 1
+  %tmp5205 = getelementptr inbounds float* %tmp5204, i64 1
+  %tmp5206 = getelementptr inbounds float* %tmp5205, i64 1
+  %tmp5207 = getelementptr inbounds float* %tmp5206, i64 1
+  %tmp5208 = getelementptr inbounds float* %tmp5207, i64 1
+  %tmp5209 = getelementptr inbounds float* %tmp5208, i64 1
+  %tmp5210 = getelementptr inbounds float* %tmp5209, i64 1
+  %tmp5211 = getelementptr inbounds float* %tmp5210, i64 1
+  %tmp5212 = getelementptr inbounds float* %tmp5211, i64 1
+  %tmp5213 = getelementptr inbounds float* %tmp5212, i64 1
+  %tmp5214 = getelementptr inbounds float* %tmp5213, i64 1
+  %tmp5215 = getelementptr inbounds float* %tmp5214, i64 1
+  %tmp5216 = getelementptr inbounds float* %tmp5215, i64 1
+  %tmp5217 = getelementptr inbounds float* %tmp5216, i64 1
+  %tmp5218 = getelementptr inbounds float* %tmp5217, i64 1
+  %tmp5219 = getelementptr inbounds float* %tmp5218, i64 1
+  %tmp5220 = getelementptr inbounds float* %tmp5219, i64 1
+  %tmp5221 = getelementptr inbounds float* %tmp5220, i64 1
+  %tmp5222 = getelementptr inbounds float* %tmp5221, i64 1
+  %tmp5223 = getelementptr inbounds float* %tmp5222, i64 1
+  %tmp5224 = getelementptr inbounds float* %tmp5223, i64 1
+  %tmp5225 = getelementptr inbounds float* %tmp5224, i64 1
+  %tmp5226 = getelementptr inbounds float* %tmp5225, i64 1
+  %tmp5227 = getelementptr inbounds float* %tmp5226, i64 1
+  %tmp5228 = getelementptr inbounds float* %tmp5227, i64 1
+  %tmp5229 = getelementptr inbounds float* %tmp5228, i64 1
+  %tmp5230 = getelementptr inbounds float* %tmp5229, i64 1
+  %tmp5231 = getelementptr inbounds float* %tmp5230, i64 1
+  %tmp5232 = getelementptr inbounds float* %tmp5231, i64 1
+  %tmp5233 = getelementptr inbounds float* %tmp5232, i64 1
+  %tmp5234 = getelementptr inbounds float* %tmp5233, i64 1
+  %tmp5235 = getelementptr inbounds float* %tmp5234, i64 1
+  %tmp5236 = getelementptr inbounds float* %tmp5235, i64 1
+  %tmp5237 = getelementptr inbounds float* %tmp5236, i64 1
+  %tmp5238 = getelementptr inbounds float* %tmp5237, i64 1
+  %tmp5239 = getelementptr inbounds float* %tmp5238, i64 1
+  %tmp5240 = getelementptr inbounds float* %tmp5239, i64 1
+  %tmp5241 = getelementptr inbounds float* %tmp5240, i64 1
+  %tmp5242 = getelementptr inbounds float* %tmp5241, i64 1
+  %tmp5243 = getelementptr inbounds float* %tmp5242, i64 1
+  %tmp5244 = getelementptr inbounds float* %tmp5243, i64 1
+  %tmp5245 = getelementptr inbounds float* %tmp5244, i64 1
+  %tmp5246 = getelementptr inbounds float* %tmp5245, i64 1
+  %tmp5247 = getelementptr inbounds float* %tmp5246, i64 1
+  %tmp5248 = getelementptr inbounds float* %tmp5247, i64 1
+  %tmp5249 = getelementptr inbounds float* %tmp5248, i64 1
+  %tmp5250 = getelementptr inbounds float* %tmp5249, i64 1
+  %tmp5251 = getelementptr inbounds float* %tmp5250, i64 1
+  %tmp5252 = getelementptr inbounds float* %tmp5251, i64 1
+  %tmp5253 = getelementptr inbounds float* %tmp5252, i64 1
+  %tmp5254 = getelementptr inbounds float* %tmp5253, i64 1
+  %tmp5255 = getelementptr inbounds float* %tmp5254, i64 1
+  %tmp5256 = getelementptr inbounds float* %tmp5255, i64 1
+  %tmp5257 = getelementptr inbounds float* %tmp5256, i64 1
+  %tmp5258 = getelementptr inbounds float* %tmp5257, i64 1
+  %tmp5259 = getelementptr inbounds float* %tmp5258, i64 1
+  %tmp5260 = getelementptr inbounds float* %tmp5259, i64 1
+  %tmp5261 = getelementptr inbounds float* %tmp5260, i64 1
+  %tmp5262 = getelementptr inbounds float* %tmp5261, i64 1
+  %tmp5263 = getelementptr inbounds float* %tmp5262, i64 1
+  %tmp5264 = getelementptr inbounds float* %tmp5263, i64 1
+  %tmp5265 = getelementptr inbounds float* %tmp5264, i64 1
+  %tmp5266 = getelementptr inbounds float* %tmp5265, i64 1
+  %tmp5267 = getelementptr inbounds float* %tmp5266, i64 1
+  %tmp5268 = getelementptr inbounds float* %tmp5267, i64 1
+  %tmp5269 = getelementptr inbounds float* %tmp5268, i64 1
+  %tmp5270 = getelementptr inbounds float* %tmp5269, i64 1
+  %tmp5271 = getelementptr inbounds float* %tmp5270, i64 1
+  %tmp5272 = getelementptr inbounds float* %tmp5271, i64 1
+  %tmp5273 = getelementptr inbounds float* %tmp5272, i64 1
+  %tmp5274 = getelementptr inbounds float* %tmp5273, i64 1
+  %tmp5275 = getelementptr inbounds float* %tmp5274, i64 1
+  %tmp5276 = getelementptr inbounds float* %tmp5275, i64 1
+  %tmp5277 = getelementptr inbounds float* %tmp5276, i64 1
+  %tmp5278 = getelementptr inbounds float* %tmp5277, i64 1
+  %tmp5279 = getelementptr inbounds float* %tmp5278, i64 1
+  %tmp5280 = getelementptr inbounds float* %tmp5279, i64 1
+  %tmp5281 = getelementptr inbounds float* %tmp5280, i64 1
+  %tmp5282 = getelementptr inbounds float* %tmp5281, i64 1
+  %tmp5283 = getelementptr inbounds float* %tmp5282, i64 1
+  %tmp5284 = getelementptr inbounds float* %tmp5283, i64 1
+  %tmp5285 = getelementptr inbounds float* %tmp5284, i64 1
+  %tmp5286 = getelementptr inbounds float* %tmp5285, i64 1
+  %tmp5287 = getelementptr inbounds float* %tmp5286, i64 1
+  %tmp5288 = getelementptr inbounds float* %tmp5287, i64 1
+  %tmp5289 = getelementptr inbounds float* %tmp5288, i64 1
+  %tmp5290 = getelementptr inbounds float* %tmp5289, i64 1
+  %tmp5291 = getelementptr inbounds float* %tmp5290, i64 1
+  %tmp5292 = getelementptr inbounds float* %tmp5291, i64 1
+  %tmp5293 = getelementptr inbounds float* %tmp5292, i64 1
+  %tmp5294 = getelementptr inbounds float* %tmp5293, i64 1
+  %tmp5295 = getelementptr inbounds float* %tmp5294, i64 1
+  %tmp5296 = getelementptr inbounds float* %tmp5295, i64 1
+  %tmp5297 = getelementptr inbounds float* %tmp5296, i64 1
+  %tmp5298 = getelementptr inbounds float* %tmp5297, i64 1
+  %tmp5299 = getelementptr inbounds float* %tmp5298, i64 1
+  %tmp5300 = getelementptr inbounds float* %tmp5299, i64 1
+  %tmp5301 = getelementptr inbounds float* %tmp5300, i64 1
+  %tmp5302 = getelementptr inbounds float* %tmp5301, i64 1
+  %tmp5303 = getelementptr inbounds float* %tmp5302, i64 1
+  %tmp5304 = getelementptr inbounds float* %tmp5303, i64 1
+  %tmp5305 = getelementptr inbounds float* %tmp5304, i64 1
+  %tmp5306 = getelementptr inbounds float* %tmp5305, i64 1
+  %tmp5307 = getelementptr inbounds float* %tmp5306, i64 1
+  %tmp5308 = getelementptr inbounds float* %tmp5307, i64 1
+  %tmp5309 = getelementptr inbounds float* %tmp5308, i64 1
+  %tmp5310 = getelementptr inbounds float* %tmp5309, i64 1
+  %tmp5311 = getelementptr inbounds float* %tmp5310, i64 1
+  %tmp5312 = getelementptr inbounds float* %tmp5311, i64 1
+  %tmp5313 = getelementptr inbounds float* %tmp5312, i64 1
+  %tmp5314 = getelementptr inbounds float* %tmp5313, i64 1
+  %tmp5315 = getelementptr inbounds float* %tmp5314, i64 1
+  %tmp5316 = getelementptr inbounds float* %tmp5315, i64 1
+  %tmp5317 = getelementptr inbounds float* %tmp5316, i64 1
+  %tmp5318 = getelementptr inbounds float* %tmp5317, i64 1
+  %tmp5319 = getelementptr inbounds float* %tmp5318, i64 1
+  %tmp5320 = getelementptr inbounds float* %tmp5319, i64 1
+  %tmp5321 = getelementptr inbounds float* %tmp5320, i64 1
+  %tmp5322 = getelementptr inbounds float* %tmp5321, i64 1
+  %tmp5323 = getelementptr inbounds float* %tmp5322, i64 1
+  %tmp5324 = getelementptr inbounds float* %tmp5323, i64 1
+  %tmp5325 = getelementptr inbounds float* %tmp5324, i64 1
+  %tmp5326 = getelementptr inbounds float* %tmp5325, i64 1
+  %tmp5327 = getelementptr inbounds float* %tmp5326, i64 1
+  %tmp5328 = getelementptr inbounds float* %tmp5327, i64 1
+  %tmp5329 = getelementptr inbounds float* %tmp5328, i64 1
+  %tmp5330 = getelementptr inbounds float* %tmp5329, i64 1
+  %tmp5331 = getelementptr inbounds float* %tmp5330, i64 1
+  %tmp5332 = getelementptr inbounds float* %tmp5331, i64 1
+  %tmp5333 = getelementptr inbounds float* %tmp5332, i64 1
+  %tmp5334 = getelementptr inbounds float* %tmp5333, i64 1
+  %tmp5335 = getelementptr inbounds float* %tmp5334, i64 1
+  %tmp5336 = getelementptr inbounds float* %tmp5335, i64 1
+  %tmp5337 = getelementptr inbounds float* %tmp5336, i64 1
+  %tmp5338 = getelementptr inbounds float* %tmp5337, i64 1
+  %tmp5339 = getelementptr inbounds float* %tmp5338, i64 1
+  %tmp5340 = getelementptr inbounds float* %tmp5339, i64 1
+  %tmp5341 = getelementptr inbounds float* %tmp5340, i64 1
+  %tmp5342 = getelementptr inbounds float* %tmp5341, i64 1
+  %tmp5343 = getelementptr inbounds float* %tmp5342, i64 1
+  %tmp5344 = getelementptr inbounds float* %tmp5343, i64 1
+  %tmp5345 = getelementptr inbounds float* %tmp5344, i64 1
+  %tmp5346 = getelementptr inbounds float* %tmp5345, i64 1
+  %tmp5347 = getelementptr inbounds float* %tmp5346, i64 1
+  %tmp5348 = getelementptr inbounds float* %tmp5347, i64 1
+  %tmp5349 = getelementptr inbounds float* %tmp5348, i64 1
+  %tmp5350 = getelementptr inbounds float* %tmp5349, i64 1
+  %tmp5351 = getelementptr inbounds float* %tmp5350, i64 1
+  %tmp5352 = getelementptr inbounds float* %tmp5351, i64 1
+  %tmp5353 = getelementptr inbounds float* %tmp5352, i64 1
+  %tmp5354 = getelementptr inbounds float* %tmp5353, i64 1
+  %tmp5355 = getelementptr inbounds float* %tmp5354, i64 1
+  %tmp5356 = getelementptr inbounds float* %tmp5355, i64 1
+  %tmp5357 = getelementptr inbounds float* %tmp5356, i64 1
+  %tmp5358 = getelementptr inbounds float* %tmp5357, i64 1
+  %tmp5359 = getelementptr inbounds float* %tmp5358, i64 1
+  %tmp5360 = getelementptr inbounds float* %tmp5359, i64 1
+  %tmp5361 = getelementptr inbounds float* %tmp5360, i64 1
+  %tmp5362 = getelementptr inbounds float* %tmp5361, i64 1
+  %tmp5363 = getelementptr inbounds float* %tmp5362, i64 1
+  %tmp5364 = getelementptr inbounds float* %tmp5363, i64 1
+  %tmp5365 = getelementptr inbounds float* %tmp5364, i64 1
+  %tmp5366 = getelementptr inbounds float* %tmp5365, i64 1
+  %tmp5367 = getelementptr inbounds float* %tmp5366, i64 1
+  %tmp5368 = getelementptr inbounds float* %tmp5367, i64 1
+  %tmp5369 = getelementptr inbounds float* %tmp5368, i64 1
+  %tmp5370 = getelementptr inbounds float* %tmp5369, i64 1
+  %tmp5371 = getelementptr inbounds float* %tmp5370, i64 1
+  %tmp5372 = getelementptr inbounds float* %tmp5371, i64 1
+  %tmp5373 = getelementptr inbounds float* %tmp5372, i64 1
+  %tmp5374 = getelementptr inbounds float* %tmp5373, i64 1
+  %tmp5375 = getelementptr inbounds float* %tmp5374, i64 1
+  %tmp5376 = getelementptr inbounds float* %tmp5375, i64 1
+  %tmp5377 = getelementptr inbounds float* %tmp5376, i64 1
+  %tmp5378 = getelementptr inbounds float* %tmp5377, i64 1
+  %tmp5379 = getelementptr inbounds float* %tmp5378, i64 1
+  %tmp5380 = getelementptr inbounds float* %tmp5379, i64 1
+  %tmp5381 = getelementptr inbounds float* %tmp5380, i64 1
+  %tmp5382 = getelementptr inbounds float* %tmp5381, i64 1
+  %tmp5383 = getelementptr inbounds float* %tmp5382, i64 1
+  %tmp5384 = getelementptr inbounds float* %tmp5383, i64 1
+  %tmp5385 = getelementptr inbounds float* %tmp5384, i64 1
+  %tmp5386 = getelementptr inbounds float* %tmp5385, i64 1
+  %tmp5387 = getelementptr inbounds float* %tmp5386, i64 1
+  %tmp5388 = getelementptr inbounds float* %tmp5387, i64 1
+  %tmp5389 = getelementptr inbounds float* %tmp5388, i64 1
+  %tmp5390 = getelementptr inbounds float* %tmp5389, i64 1
+  %tmp5391 = getelementptr inbounds float* %tmp5390, i64 1
+  %tmp5392 = getelementptr inbounds float* %tmp5391, i64 1
+  %tmp5393 = getelementptr inbounds float* %tmp5392, i64 1
+  %tmp5394 = getelementptr inbounds float* %tmp5393, i64 1
+  %tmp5395 = getelementptr inbounds float* %tmp5394, i64 1
+  %tmp5396 = getelementptr inbounds float* %tmp5395, i64 1
+  %tmp5397 = getelementptr inbounds float* %tmp5396, i64 1
+  %tmp5398 = getelementptr inbounds float* %tmp5397, i64 1
+  %tmp5399 = getelementptr inbounds float* %tmp5398, i64 1
+  %tmp5400 = getelementptr inbounds float* %tmp5399, i64 1
+  %tmp5401 = getelementptr inbounds float* %tmp5400, i64 1
+  %tmp5402 = getelementptr inbounds float* %tmp5401, i64 1
+  %tmp5403 = getelementptr inbounds float* %tmp5402, i64 1
+  %tmp5404 = getelementptr inbounds float* %tmp5403, i64 1
+  %tmp5405 = getelementptr inbounds float* %tmp5404, i64 1
+  %tmp5406 = getelementptr inbounds float* %tmp5405, i64 1
+  %tmp5407 = getelementptr inbounds float* %tmp5406, i64 1
+  %tmp5408 = getelementptr inbounds float* %tmp5407, i64 1
+  %tmp5409 = getelementptr inbounds float* %tmp5408, i64 1
+  %tmp5410 = getelementptr inbounds float* %tmp5409, i64 1
+  %tmp5411 = getelementptr inbounds float* %tmp5410, i64 1
+  %tmp5412 = getelementptr inbounds float* %tmp5411, i64 1
+  %tmp5413 = getelementptr inbounds float* %tmp5412, i64 1
+  %tmp5414 = getelementptr inbounds float* %tmp5413, i64 1
+  %tmp5415 = getelementptr inbounds float* %tmp5414, i64 1
+  %tmp5416 = getelementptr inbounds float* %tmp5415, i64 1
+  %tmp5417 = getelementptr inbounds float* %tmp5416, i64 1
+  %tmp5418 = getelementptr inbounds float* %tmp5417, i64 1
+  %tmp5419 = getelementptr inbounds float* %tmp5418, i64 1
+  %tmp5420 = getelementptr inbounds float* %tmp5419, i64 1
+  %tmp5421 = getelementptr inbounds float* %tmp5420, i64 1
+  %tmp5422 = getelementptr inbounds float* %tmp5421, i64 1
+  %tmp5423 = getelementptr inbounds float* %tmp5422, i64 1
+  %tmp5424 = getelementptr inbounds float* %tmp5423, i64 1
+  %tmp5425 = getelementptr inbounds float* %tmp5424, i64 1
+  %tmp5426 = getelementptr inbounds float* %tmp5425, i64 1
+  %tmp5427 = getelementptr inbounds float* %tmp5426, i64 1
+  %tmp5428 = getelementptr inbounds float* %tmp5427, i64 1
+  %tmp5429 = getelementptr inbounds float* %tmp5428, i64 1
+  %tmp5430 = getelementptr inbounds float* %tmp5429, i64 1
+  %tmp5431 = getelementptr inbounds float* %tmp5430, i64 1
+  %tmp5432 = getelementptr inbounds float* %tmp5431, i64 1
+  %tmp5433 = getelementptr inbounds float* %tmp5432, i64 1
+  %tmp5434 = getelementptr inbounds float* %tmp5433, i64 1
+  %tmp5435 = getelementptr inbounds float* %tmp5434, i64 1
+  %tmp5436 = getelementptr inbounds float* %tmp5435, i64 1
+  %tmp5437 = getelementptr inbounds float* %tmp5436, i64 1
+  %tmp5438 = getelementptr inbounds float* %tmp5437, i64 1
+  %tmp5439 = getelementptr inbounds float* %tmp5438, i64 1
+  %tmp5440 = getelementptr inbounds float* %tmp5439, i64 1
+  %tmp5441 = getelementptr inbounds float* %tmp5440, i64 1
+  %tmp5442 = getelementptr inbounds float* %tmp5441, i64 1
+  %tmp5443 = getelementptr inbounds float* %tmp5442, i64 1
+  %tmp5444 = getelementptr inbounds float* %tmp5443, i64 1
+  %tmp5445 = getelementptr inbounds float* %tmp5444, i64 1
+  %tmp5446 = getelementptr inbounds float* %tmp5445, i64 1
+  %tmp5447 = getelementptr inbounds float* %tmp5446, i64 1
+  %tmp5448 = getelementptr inbounds float* %tmp5447, i64 1
+  %tmp5449 = getelementptr inbounds float* %tmp5448, i64 1
+  %tmp5450 = getelementptr inbounds float* %tmp5449, i64 1
+  %tmp5451 = getelementptr inbounds float* %tmp5450, i64 1
+  %tmp5452 = getelementptr inbounds float* %tmp5451, i64 1
+  %tmp5453 = getelementptr inbounds float* %tmp5452, i64 1
+  %tmp5454 = getelementptr inbounds float* %tmp5453, i64 1
+  %tmp5455 = getelementptr inbounds float* %tmp5454, i64 1
+  %tmp5456 = getelementptr inbounds float* %tmp5455, i64 1
+  %tmp5457 = getelementptr inbounds float* %tmp5456, i64 1
+  %tmp5458 = getelementptr inbounds float* %tmp5457, i64 1
+  %tmp5459 = getelementptr inbounds float* %tmp5458, i64 1
+  %tmp5460 = getelementptr inbounds float* %tmp5459, i64 1
+  %tmp5461 = getelementptr inbounds float* %tmp5460, i64 1
+  %tmp5462 = getelementptr inbounds float* %tmp5461, i64 1
+  %tmp5463 = getelementptr inbounds float* %tmp5462, i64 1
+  %tmp5464 = getelementptr inbounds float* %tmp5463, i64 1
+  %tmp5465 = getelementptr inbounds float* %tmp5464, i64 1
+  %tmp5466 = getelementptr inbounds float* %tmp5465, i64 1
+  %tmp5467 = getelementptr inbounds float* %tmp5466, i64 1
+  %tmp5468 = getelementptr inbounds float* %tmp5467, i64 1
+  %tmp5469 = getelementptr inbounds float* %tmp5468, i64 1
+  %tmp5470 = getelementptr inbounds float* %tmp5469, i64 1
+  %tmp5471 = getelementptr inbounds float* %tmp5470, i64 1
+  %tmp5472 = getelementptr inbounds float* %tmp5471, i64 1
+  %tmp5473 = getelementptr inbounds float* %tmp5472, i64 1
+  %tmp5474 = getelementptr inbounds float* %tmp5473, i64 1
+  %tmp5475 = getelementptr inbounds float* %tmp5474, i64 1
+  %tmp5476 = getelementptr inbounds float* %tmp5475, i64 1
+  %tmp5477 = getelementptr inbounds float* %tmp5476, i64 1
+  %tmp5478 = getelementptr inbounds float* %tmp5477, i64 1
+  %tmp5479 = getelementptr inbounds float* %tmp5478, i64 1
+  %tmp5480 = getelementptr inbounds float* %tmp5479, i64 1
+  %tmp5481 = getelementptr inbounds float* %tmp5480, i64 1
+  %tmp5482 = getelementptr inbounds float* %tmp5481, i64 1
+  %tmp5483 = getelementptr inbounds float* %tmp5482, i64 1
+  %tmp5484 = getelementptr inbounds float* %tmp5483, i64 1
+  %tmp5485 = getelementptr inbounds float* %tmp5484, i64 1
+  %tmp5486 = getelementptr inbounds float* %tmp5485, i64 1
+  %tmp5487 = getelementptr inbounds float* %tmp5486, i64 1
+  %tmp5488 = getelementptr inbounds float* %tmp5487, i64 1
+  %tmp5489 = getelementptr inbounds float* %tmp5488, i64 1
+  %tmp5490 = getelementptr inbounds float* %tmp5489, i64 1
+  %tmp5491 = getelementptr inbounds float* %tmp5490, i64 1
+  %tmp5492 = getelementptr inbounds float* %tmp5491, i64 1
+  %tmp5493 = getelementptr inbounds float* %tmp5492, i64 1
+  %tmp5494 = getelementptr inbounds float* %tmp5493, i64 1
+  %tmp5495 = getelementptr inbounds float* %tmp5494, i64 1
+  %tmp5496 = getelementptr inbounds float* %tmp5495, i64 1
+  %tmp5497 = getelementptr inbounds float* %tmp5496, i64 1
+  %tmp5498 = getelementptr inbounds float* %tmp5497, i64 1
+  %tmp5499 = getelementptr inbounds float* %tmp5498, i64 1
+  %tmp5500 = getelementptr inbounds float* %tmp5499, i64 1
+  %tmp5501 = getelementptr inbounds float* %tmp5500, i64 1
+  %tmp5502 = getelementptr inbounds float* %tmp5501, i64 1
+  %tmp5503 = getelementptr inbounds float* %tmp5502, i64 1
+  %tmp5504 = getelementptr inbounds float* %tmp5503, i64 1
+  %tmp5505 = getelementptr inbounds float* %tmp5504, i64 1
+  %tmp5506 = getelementptr inbounds float* %tmp5505, i64 1
+  %tmp5507 = getelementptr inbounds float* %tmp5506, i64 1
+  %tmp5508 = getelementptr inbounds float* %tmp5507, i64 1
+  %tmp5509 = getelementptr inbounds float* %tmp5508, i64 1
+  %tmp5510 = getelementptr inbounds float* %tmp5509, i64 1
+  %tmp5511 = getelementptr inbounds float* %tmp5510, i64 1
+  %tmp5512 = getelementptr inbounds float* %tmp5511, i64 1
+  %tmp5513 = getelementptr inbounds float* %tmp5512, i64 1
+  %tmp5514 = getelementptr inbounds float* %tmp5513, i64 1
+  %tmp5515 = getelementptr inbounds float* %tmp5514, i64 1
+  %tmp5516 = getelementptr inbounds float* %tmp5515, i64 1
+  %tmp5517 = getelementptr inbounds float* %tmp5516, i64 1
+  %tmp5518 = getelementptr inbounds float* %tmp5517, i64 1
+  %tmp5519 = getelementptr inbounds float* %tmp5518, i64 1
+  %tmp5520 = getelementptr inbounds float* %tmp5519, i64 1
+  %tmp5521 = getelementptr inbounds float* %tmp5520, i64 1
+  %tmp5522 = getelementptr inbounds float* %tmp5521, i64 1
+  %tmp5523 = getelementptr inbounds float* %tmp5522, i64 1
+  %tmp5524 = getelementptr inbounds float* %tmp5523, i64 1
+  %tmp5525 = getelementptr inbounds float* %tmp5524, i64 1
+  %tmp5526 = getelementptr inbounds float* %tmp5525, i64 1
+  %tmp5527 = getelementptr inbounds float* %tmp5526, i64 1
+  %tmp5528 = getelementptr inbounds float* %tmp5527, i64 1
+  %tmp5529 = getelementptr inbounds float* %tmp5528, i64 1
+  %tmp5530 = getelementptr inbounds float* %tmp5529, i64 1
+  %tmp5531 = getelementptr inbounds float* %tmp5530, i64 1
+  %tmp5532 = getelementptr inbounds float* %tmp5531, i64 1
+  %tmp5533 = getelementptr inbounds float* %tmp5532, i64 1
+  %tmp5534 = getelementptr inbounds float* %tmp5533, i64 1
+  %tmp5535 = getelementptr inbounds float* %tmp5534, i64 1
+  %tmp5536 = getelementptr inbounds float* %tmp5535, i64 1
+  %tmp5537 = getelementptr inbounds float* %tmp5536, i64 1
+  %tmp5538 = getelementptr inbounds float* %tmp5537, i64 1
+  %tmp5539 = getelementptr inbounds float* %tmp5538, i64 1
+  %tmp5540 = getelementptr inbounds float* %tmp5539, i64 1
+  %tmp5541 = getelementptr inbounds float* %tmp5540, i64 1
+  %tmp5542 = getelementptr inbounds float* %tmp5541, i64 1
+  %tmp5543 = getelementptr inbounds float* %tmp5542, i64 1
+  %tmp5544 = getelementptr inbounds float* %tmp5543, i64 1
+  %tmp5545 = getelementptr inbounds float* %tmp5544, i64 1
+  %tmp5546 = getelementptr inbounds float* %tmp5545, i64 1
+  %tmp5547 = getelementptr inbounds float* %tmp5546, i64 1
+  %tmp5548 = getelementptr inbounds float* %tmp5547, i64 1
+  %tmp5549 = getelementptr inbounds float* %tmp5548, i64 1
+  %tmp5550 = getelementptr inbounds float* %tmp5549, i64 1
+  %tmp5551 = getelementptr inbounds float* %tmp5550, i64 1
+  %tmp5552 = getelementptr inbounds float* %tmp5551, i64 1
+  %tmp5553 = getelementptr inbounds float* %tmp5552, i64 1
+  %tmp5554 = getelementptr inbounds float* %tmp5553, i64 1
+  %tmp5555 = getelementptr inbounds float* %tmp5554, i64 1
+  %tmp5556 = getelementptr inbounds float* %tmp5555, i64 1
+  %tmp5557 = getelementptr inbounds float* %tmp5556, i64 1
+  %tmp5558 = getelementptr inbounds float* %tmp5557, i64 1
+  %tmp5559 = getelementptr inbounds float* %tmp5558, i64 1
+  %tmp5560 = getelementptr inbounds float* %tmp5559, i64 1
+  %tmp5561 = getelementptr inbounds float* %tmp5560, i64 1
+  %tmp5562 = getelementptr inbounds float* %tmp5561, i64 1
+  %tmp5563 = getelementptr inbounds float* %tmp5562, i64 1
+  %tmp5564 = getelementptr inbounds float* %tmp5563, i64 1
+  %tmp5565 = getelementptr inbounds float* %tmp5564, i64 1
+  %tmp5566 = getelementptr inbounds float* %tmp5565, i64 1
+  %tmp5567 = getelementptr inbounds float* %tmp5566, i64 1
+  %tmp5568 = getelementptr inbounds float* %tmp5567, i64 1
+  %tmp5569 = getelementptr inbounds float* %tmp5568, i64 1
+  %tmp5570 = getelementptr inbounds float* %tmp5569, i64 1
+  %tmp5571 = getelementptr inbounds float* %tmp5570, i64 1
+  %tmp5572 = getelementptr inbounds float* %tmp5571, i64 1
+  %tmp5573 = getelementptr inbounds float* %tmp5572, i64 1
+  %tmp5574 = getelementptr inbounds float* %tmp5573, i64 1
+  %tmp5575 = getelementptr inbounds float* %tmp5574, i64 1
+  %tmp5576 = getelementptr inbounds float* %tmp5575, i64 1
+  %tmp5577 = getelementptr inbounds float* %tmp5576, i64 1
+  %tmp5578 = getelementptr inbounds float* %tmp5577, i64 1
+  %tmp5579 = getelementptr inbounds float* %tmp5578, i64 1
+  %tmp5580 = getelementptr inbounds float* %tmp5579, i64 1
+  %tmp5581 = getelementptr inbounds float* %tmp5580, i64 1
+  %tmp5582 = getelementptr inbounds float* %tmp5581, i64 1
+  %tmp5583 = getelementptr inbounds float* %tmp5582, i64 1
+  %tmp5584 = getelementptr inbounds float* %tmp5583, i64 1
+  %tmp5585 = getelementptr inbounds float* %tmp5584, i64 1
+  %tmp5586 = getelementptr inbounds float* %tmp5585, i64 1
+  %tmp5587 = getelementptr inbounds float* %tmp5586, i64 1
+  %tmp5588 = getelementptr inbounds float* %tmp5587, i64 1
+  %tmp5589 = getelementptr inbounds float* %tmp5588, i64 1
+  %tmp5590 = getelementptr inbounds float* %tmp5589, i64 1
+  %tmp5591 = getelementptr inbounds float* %tmp5590, i64 1
+  %tmp5592 = getelementptr inbounds float* %tmp5591, i64 1
+  %tmp5593 = getelementptr inbounds float* %tmp5592, i64 1
+  %tmp5594 = getelementptr inbounds float* %tmp5593, i64 1
+  %tmp5595 = getelementptr inbounds float* %tmp5594, i64 1
+  %tmp5596 = getelementptr inbounds float* %tmp5595, i64 1
+  %tmp5597 = getelementptr inbounds float* %tmp5596, i64 1
+  %tmp5598 = getelementptr inbounds float* %tmp5597, i64 1
+  %tmp5599 = getelementptr inbounds float* %tmp5598, i64 1
+  %tmp5600 = getelementptr inbounds float* %tmp5599, i64 1
+  %tmp5601 = getelementptr inbounds float* %tmp5600, i64 1
+  %tmp5602 = getelementptr inbounds float* %tmp5601, i64 1
+  %tmp5603 = getelementptr inbounds float* %tmp5602, i64 1
+  %tmp5604 = getelementptr inbounds float* %tmp5603, i64 1
+  %tmp5605 = getelementptr inbounds float* %tmp5604, i64 1
+  %tmp5606 = getelementptr inbounds float* %tmp5605, i64 1
+  %tmp5607 = getelementptr inbounds float* %tmp5606, i64 1
+  %tmp5608 = getelementptr inbounds float* %tmp5607, i64 1
+  %tmp5609 = getelementptr inbounds float* %tmp5608, i64 1
+  %tmp5610 = getelementptr inbounds float* %tmp5609, i64 1
+  %tmp5611 = getelementptr inbounds float* %tmp5610, i64 1
+  %tmp5612 = getelementptr inbounds float* %tmp5611, i64 1
+  %tmp5613 = getelementptr inbounds float* %tmp5612, i64 1
+  %tmp5614 = getelementptr inbounds float* %tmp5613, i64 1
+  %tmp5615 = getelementptr inbounds float* %tmp5614, i64 1
+  %tmp5616 = getelementptr inbounds float* %tmp5615, i64 1
+  %tmp5617 = getelementptr inbounds float* %tmp5616, i64 1
+  %tmp5618 = getelementptr inbounds float* %tmp5617, i64 1
+  %tmp5619 = getelementptr inbounds float* %tmp5618, i64 1
+  %tmp5620 = getelementptr inbounds float* %tmp5619, i64 1
+  %tmp5621 = getelementptr inbounds float* %tmp5620, i64 1
+  %tmp5622 = getelementptr inbounds float* %tmp5621, i64 1
+  %tmp5623 = getelementptr inbounds float* %tmp5622, i64 1
+  %tmp5624 = getelementptr inbounds float* %tmp5623, i64 1
+  %tmp5625 = getelementptr inbounds float* %tmp5624, i64 1
+  %tmp5626 = getelementptr inbounds float* %tmp5625, i64 1
+  %tmp5627 = getelementptr inbounds float* %tmp5626, i64 1
+  %tmp5628 = getelementptr inbounds float* %tmp5627, i64 1
+  %tmp5629 = getelementptr inbounds float* %tmp5628, i64 1
+  %tmp5630 = getelementptr inbounds float* %tmp5629, i64 1
+  %tmp5631 = getelementptr inbounds float* %tmp5630, i64 1
+  %tmp5632 = getelementptr inbounds float* %tmp5631, i64 1
+  %tmp5633 = getelementptr inbounds float* %tmp5632, i64 1
+  %tmp5634 = getelementptr inbounds float* %tmp5633, i64 1
+  %tmp5635 = getelementptr inbounds float* %tmp5634, i64 1
+  %tmp5636 = getelementptr inbounds float* %tmp5635, i64 1
+  %tmp5637 = getelementptr inbounds float* %tmp5636, i64 1
+  %tmp5638 = getelementptr inbounds float* %tmp5637, i64 1
+  %tmp5639 = getelementptr inbounds float* %tmp5638, i64 1
+  %tmp5640 = getelementptr inbounds float* %tmp5639, i64 1
+  %tmp5641 = getelementptr inbounds float* %tmp5640, i64 1
+  %tmp5642 = getelementptr inbounds float* %tmp5641, i64 1
+  %tmp5643 = getelementptr inbounds float* %tmp5642, i64 1
+  %tmp5644 = getelementptr inbounds float* %tmp5643, i64 1
+  %tmp5645 = getelementptr inbounds float* %tmp5644, i64 1
+  %tmp5646 = getelementptr inbounds float* %tmp5645, i64 1
+  %tmp5647 = getelementptr inbounds float* %tmp5646, i64 1
+  %tmp5648 = getelementptr inbounds float* %tmp5647, i64 1
+  %tmp5649 = getelementptr inbounds float* %tmp5648, i64 1
+  %tmp5650 = getelementptr inbounds float* %tmp5649, i64 1
+  %tmp5651 = getelementptr inbounds float* %tmp5650, i64 1
+  %tmp5652 = getelementptr inbounds float* %tmp5651, i64 1
+  %tmp5653 = getelementptr inbounds float* %tmp5652, i64 1
+  %tmp5654 = getelementptr inbounds float* %tmp5653, i64 1
+  %tmp5655 = getelementptr inbounds float* %tmp5654, i64 1
+  %tmp5656 = getelementptr inbounds float* %tmp5655, i64 1
+  %tmp5657 = getelementptr inbounds float* %tmp5656, i64 1
+  %tmp5658 = getelementptr inbounds float* %tmp5657, i64 1
+  %tmp5659 = getelementptr inbounds float* %tmp5658, i64 1
+  %tmp5660 = getelementptr inbounds float* %tmp5659, i64 1
+  %tmp5661 = getelementptr inbounds float* %tmp5660, i64 1
+  %tmp5662 = getelementptr inbounds float* %tmp5661, i64 1
+  %tmp5663 = getelementptr inbounds float* %tmp5662, i64 1
+  %tmp5664 = getelementptr inbounds float* %tmp5663, i64 1
+  %tmp5665 = getelementptr inbounds float* %tmp5664, i64 1
+  %tmp5666 = getelementptr inbounds float* %tmp5665, i64 1
+  %tmp5667 = getelementptr inbounds float* %tmp5666, i64 1
+  %tmp5668 = getelementptr inbounds float* %tmp5667, i64 1
+  %tmp5669 = getelementptr inbounds float* %tmp5668, i64 1
+  %tmp5670 = getelementptr inbounds float* %tmp5669, i64 1
+  %tmp5671 = getelementptr inbounds float* %tmp5670, i64 1
+  %tmp5672 = getelementptr inbounds float* %tmp5671, i64 1
+  %tmp5673 = getelementptr inbounds float* %tmp5672, i64 1
+  %tmp5674 = getelementptr inbounds float* %tmp5673, i64 1
+  %tmp5675 = getelementptr inbounds float* %tmp5674, i64 1
+  %tmp5676 = getelementptr inbounds float* %tmp5675, i64 1
+  %tmp5677 = getelementptr inbounds float* %tmp5676, i64 1
+  %tmp5678 = getelementptr inbounds float* %tmp5677, i64 1
+  %tmp5679 = getelementptr inbounds float* %tmp5678, i64 1
+  %tmp5680 = getelementptr inbounds float* %tmp5679, i64 1
+  %tmp5681 = getelementptr inbounds float* %tmp5680, i64 1
+  %tmp5682 = getelementptr inbounds float* %tmp5681, i64 1
+  %tmp5683 = getelementptr inbounds float* %tmp5682, i64 1
+  %tmp5684 = getelementptr inbounds float* %tmp5683, i64 1
+  %tmp5685 = getelementptr inbounds float* %tmp5684, i64 1
+  %tmp5686 = getelementptr inbounds float* %tmp5685, i64 1
+  %tmp5687 = getelementptr inbounds float* %tmp5686, i64 1
+  %tmp5688 = getelementptr inbounds float* %tmp5687, i64 1
+  %tmp5689 = getelementptr inbounds float* %tmp5688, i64 1
+  %tmp5690 = getelementptr inbounds float* %tmp5689, i64 1
+  %tmp5691 = getelementptr inbounds float* %tmp5690, i64 1
+  %tmp5692 = getelementptr inbounds float* %tmp5691, i64 1
+  %tmp5693 = getelementptr inbounds float* %tmp5692, i64 1
+  %tmp5694 = getelementptr inbounds float* %tmp5693, i64 1
+  %tmp5695 = getelementptr inbounds float* %tmp5694, i64 1
+  %tmp5696 = getelementptr inbounds float* %tmp5695, i64 1
+  %tmp5697 = getelementptr inbounds float* %tmp5696, i64 1
+  %tmp5698 = getelementptr inbounds float* %tmp5697, i64 1
+  %tmp5699 = getelementptr inbounds float* %tmp5698, i64 1
+  %tmp5700 = getelementptr inbounds float* %tmp5699, i64 1
+  %tmp5701 = getelementptr inbounds float* %tmp5700, i64 1
+  %tmp5702 = getelementptr inbounds float* %tmp5701, i64 1
+  %tmp5703 = getelementptr inbounds float* %tmp5702, i64 1
+  %tmp5704 = getelementptr inbounds float* %tmp5703, i64 1
+  %tmp5705 = getelementptr inbounds float* %tmp5704, i64 1
+  %tmp5706 = getelementptr inbounds float* %tmp5705, i64 1
+  %tmp5707 = getelementptr inbounds float* %tmp5706, i64 1
+  %tmp5708 = getelementptr inbounds float* %tmp5707, i64 1
+  %tmp5709 = getelementptr inbounds float* %tmp5708, i64 1
+  %tmp5710 = getelementptr inbounds float* %tmp5709, i64 1
+  %tmp5711 = getelementptr inbounds float* %tmp5710, i64 1
+  %tmp5712 = getelementptr inbounds float* %tmp5711, i64 1
+  %tmp5713 = getelementptr inbounds float* %tmp5712, i64 1
+  %tmp5714 = getelementptr inbounds float* %tmp5713, i64 1
+  %tmp5715 = getelementptr inbounds float* %tmp5714, i64 1
+  %tmp5716 = getelementptr inbounds float* %tmp5715, i64 1
+  %tmp5717 = getelementptr inbounds float* %tmp5716, i64 1
+  %tmp5718 = getelementptr inbounds float* %tmp5717, i64 1
+  %tmp5719 = getelementptr inbounds float* %tmp5718, i64 1
+  %tmp5720 = getelementptr inbounds float* %tmp5719, i64 1
+  %tmp5721 = getelementptr inbounds float* %tmp5720, i64 1
+  %tmp5722 = getelementptr inbounds float* %tmp5721, i64 1
+  %tmp5723 = getelementptr inbounds float* %tmp5722, i64 1
+  %tmp5724 = getelementptr inbounds float* %tmp5723, i64 1
+  %tmp5725 = getelementptr inbounds float* %tmp5724, i64 1
+  %tmp5726 = getelementptr inbounds float* %tmp5725, i64 1
+  %tmp5727 = getelementptr inbounds float* %tmp5726, i64 1
+  %tmp5728 = getelementptr inbounds float* %tmp5727, i64 1
+  %tmp5729 = getelementptr inbounds float* %tmp5728, i64 1
+  %tmp5730 = getelementptr inbounds float* %tmp5729, i64 1
+  %tmp5731 = getelementptr inbounds float* %tmp5730, i64 1
+  %tmp5732 = getelementptr inbounds float* %tmp5731, i64 1
+  %tmp5733 = getelementptr inbounds float* %tmp5732, i64 1
+  %tmp5734 = getelementptr inbounds float* %tmp5733, i64 1
+  %tmp5735 = getelementptr inbounds float* %tmp5734, i64 1
+  %tmp5736 = getelementptr inbounds float* %tmp5735, i64 1
+  %tmp5737 = getelementptr inbounds float* %tmp5736, i64 1
+  %tmp5738 = getelementptr inbounds float* %tmp5737, i64 1
+  %tmp5739 = getelementptr inbounds float* %tmp5738, i64 1
+  %tmp5740 = getelementptr inbounds float* %tmp5739, i64 1
+  %tmp5741 = getelementptr inbounds float* %tmp5740, i64 1
+  %tmp5742 = getelementptr inbounds float* %tmp5741, i64 1
+  %tmp5743 = getelementptr inbounds float* %tmp5742, i64 1
+  %tmp5744 = getelementptr inbounds float* %tmp5743, i64 1
+  %tmp5745 = getelementptr inbounds float* %tmp5744, i64 1
+  %tmp5746 = getelementptr inbounds float* %tmp5745, i64 1
+  %tmp5747 = getelementptr inbounds float* %tmp5746, i64 1
+  %tmp5748 = getelementptr inbounds float* %tmp5747, i64 1
+  %tmp5749 = getelementptr inbounds float* %tmp5748, i64 1
+  %tmp5750 = getelementptr inbounds float* %tmp5749, i64 1
+  %tmp5751 = getelementptr inbounds float* %tmp5750, i64 1
+  %tmp5752 = getelementptr inbounds float* %tmp5751, i64 1
+  %tmp5753 = getelementptr inbounds float* %tmp5752, i64 1
+  %tmp5754 = getelementptr inbounds float* %tmp5753, i64 1
+  %tmp5755 = getelementptr inbounds float* %tmp5754, i64 1
+  %tmp5756 = getelementptr inbounds float* %tmp5755, i64 1
+  %tmp5757 = getelementptr inbounds float* %tmp5756, i64 1
+  %tmp5758 = getelementptr inbounds float* %tmp5757, i64 1
+  %tmp5759 = getelementptr inbounds float* %tmp5758, i64 1
+  %tmp5760 = getelementptr inbounds float* %tmp5759, i64 1
+  %tmp5761 = getelementptr inbounds float* %tmp5760, i64 1
+  %tmp5762 = getelementptr inbounds float* %tmp5761, i64 1
+  %tmp5763 = getelementptr inbounds float* %tmp5762, i64 1
+  %tmp5764 = getelementptr inbounds float* %tmp5763, i64 1
+  %tmp5765 = getelementptr inbounds float* %tmp5764, i64 1
+  %tmp5766 = getelementptr inbounds float* %tmp5765, i64 1
+  %tmp5767 = getelementptr inbounds float* %tmp5766, i64 1
+  %tmp5768 = getelementptr inbounds float* %tmp5767, i64 1
+  %tmp5769 = getelementptr inbounds float* %tmp5768, i64 1
+  %tmp5770 = getelementptr inbounds float* %tmp5769, i64 1
+  %tmp5771 = getelementptr inbounds float* %tmp5770, i64 1
+  %tmp5772 = getelementptr inbounds float* %tmp5771, i64 1
+  %tmp5773 = getelementptr inbounds float* %tmp5772, i64 1
+  %tmp5774 = getelementptr inbounds float* %tmp5773, i64 1
+  %tmp5775 = getelementptr inbounds float* %tmp5774, i64 1
+  %tmp5776 = getelementptr inbounds float* %tmp5775, i64 1
+  %tmp5777 = getelementptr inbounds float* %tmp5776, i64 1
+  %tmp5778 = getelementptr inbounds float* %tmp5777, i64 1
+  %tmp5779 = getelementptr inbounds float* %tmp5778, i64 1
+  %tmp5780 = getelementptr inbounds float* %tmp5779, i64 1
+  %tmp5781 = getelementptr inbounds float* %tmp5780, i64 1
+  %tmp5782 = getelementptr inbounds float* %tmp5781, i64 1
+  %tmp5783 = getelementptr inbounds float* %tmp5782, i64 1
+  %tmp5784 = getelementptr inbounds float* %tmp5783, i64 1
+  %tmp5785 = getelementptr inbounds float* %tmp5784, i64 1
+  %tmp5786 = getelementptr inbounds float* %tmp5785, i64 1
+  %tmp5787 = getelementptr inbounds float* %tmp5786, i64 1
+  %tmp5788 = getelementptr inbounds float* %tmp5787, i64 1
+  %tmp5789 = getelementptr inbounds float* %tmp5788, i64 1
+  %tmp5790 = getelementptr inbounds float* %tmp5789, i64 1
+  %tmp5791 = getelementptr inbounds float* %tmp5790, i64 1
+  %tmp5792 = getelementptr inbounds float* %tmp5791, i64 1
+  %tmp5793 = getelementptr inbounds float* %tmp5792, i64 1
+  %tmp5794 = getelementptr inbounds float* %tmp5793, i64 1
+  %tmp5795 = getelementptr inbounds float* %tmp5794, i64 1
+  %tmp5796 = getelementptr inbounds float* %tmp5795, i64 1
+  %tmp5797 = getelementptr inbounds float* %tmp5796, i64 1
+  %tmp5798 = getelementptr inbounds float* %tmp5797, i64 1
+  %tmp5799 = getelementptr inbounds float* %tmp5798, i64 1
+  %tmp5800 = getelementptr inbounds float* %tmp5799, i64 1
+  %tmp5801 = getelementptr inbounds float* %tmp5800, i64 1
+  %tmp5802 = getelementptr inbounds float* %tmp5801, i64 1
+  %tmp5803 = getelementptr inbounds float* %tmp5802, i64 1
+  %tmp5804 = getelementptr inbounds float* %tmp5803, i64 1
+  %tmp5805 = getelementptr inbounds float* %tmp5804, i64 1
+  %tmp5806 = getelementptr inbounds float* %tmp5805, i64 1
+  %tmp5807 = getelementptr inbounds float* %tmp5806, i64 1
+  %tmp5808 = getelementptr inbounds float* %tmp5807, i64 1
+  %tmp5809 = getelementptr inbounds float* %tmp5808, i64 1
+  %tmp5810 = getelementptr inbounds float* %tmp5809, i64 1
+  %tmp5811 = getelementptr inbounds float* %tmp5810, i64 1
+  %tmp5812 = getelementptr inbounds float* %tmp5811, i64 1
+  %tmp5813 = getelementptr inbounds float* %tmp5812, i64 1
+  %tmp5814 = getelementptr inbounds float* %tmp5813, i64 1
+  %tmp5815 = getelementptr inbounds float* %tmp5814, i64 1
+  %tmp5816 = getelementptr inbounds float* %tmp5815, i64 1
+  %tmp5817 = getelementptr inbounds float* %tmp5816, i64 1
+  %tmp5818 = getelementptr inbounds float* %tmp5817, i64 1
+  %tmp5819 = getelementptr inbounds float* %tmp5818, i64 1
+  %tmp5820 = getelementptr inbounds float* %tmp5819, i64 1
+  %tmp5821 = getelementptr inbounds float* %tmp5820, i64 1
+  %tmp5822 = getelementptr inbounds float* %tmp5821, i64 1
+  %tmp5823 = getelementptr inbounds float* %tmp5822, i64 1
+  %tmp5824 = getelementptr inbounds float* %tmp5823, i64 1
+  %tmp5825 = getelementptr inbounds float* %tmp5824, i64 1
+  %tmp5826 = getelementptr inbounds float* %tmp5825, i64 1
+  %tmp5827 = getelementptr inbounds float* %tmp5826, i64 1
+  %tmp5828 = getelementptr inbounds float* %tmp5827, i64 1
+  %tmp5829 = getelementptr inbounds float* %tmp5828, i64 1
+  %tmp5830 = getelementptr inbounds float* %tmp5829, i64 1
+  %tmp5831 = getelementptr inbounds float* %tmp5830, i64 1
+  %tmp5832 = getelementptr inbounds float* %tmp5831, i64 1
+  %tmp5833 = getelementptr inbounds float* %tmp5832, i64 1
+  %tmp5834 = getelementptr inbounds float* %tmp5833, i64 1
+  %tmp5835 = getelementptr inbounds float* %tmp5834, i64 1
+  %tmp5836 = getelementptr inbounds float* %tmp5835, i64 1
+  %tmp5837 = getelementptr inbounds float* %tmp5836, i64 1
+  %tmp5838 = getelementptr inbounds float* %tmp5837, i64 1
+  %tmp5839 = getelementptr inbounds float* %tmp5838, i64 1
+  %tmp5840 = getelementptr inbounds float* %tmp5839, i64 1
+  %tmp5841 = getelementptr inbounds float* %tmp5840, i64 1
+  %tmp5842 = getelementptr inbounds float* %tmp5841, i64 1
+  %tmp5843 = getelementptr inbounds float* %tmp5842, i64 1
+  %tmp5844 = getelementptr inbounds float* %tmp5843, i64 1
+  %tmp5845 = getelementptr inbounds float* %tmp5844, i64 1
+  %tmp5846 = getelementptr inbounds float* %tmp5845, i64 1
+  %tmp5847 = getelementptr inbounds float* %tmp5846, i64 1
+  %tmp5848 = getelementptr inbounds float* %tmp5847, i64 1
+  %tmp5849 = getelementptr inbounds float* %tmp5848, i64 1
+  %tmp5850 = getelementptr inbounds float* %tmp5849, i64 1
+  %tmp5851 = getelementptr inbounds float* %tmp5850, i64 1
+  %tmp5852 = getelementptr inbounds float* %tmp5851, i64 1
+  %tmp5853 = getelementptr inbounds float* %tmp5852, i64 1
+  %tmp5854 = getelementptr inbounds float* %tmp5853, i64 1
+  %tmp5855 = getelementptr inbounds float* %tmp5854, i64 1
+  %tmp5856 = getelementptr inbounds float* %tmp5855, i64 1
+  %tmp5857 = getelementptr inbounds float* %tmp5856, i64 1
+  %tmp5858 = getelementptr inbounds float* %tmp5857, i64 1
+  %tmp5859 = getelementptr inbounds float* %tmp5858, i64 1
+  %tmp5860 = getelementptr inbounds float* %tmp5859, i64 1
+  %tmp5861 = getelementptr inbounds float* %tmp5860, i64 1
+  %tmp5862 = getelementptr inbounds float* %tmp5861, i64 1
+  %tmp5863 = getelementptr inbounds float* %tmp5862, i64 1
+  %tmp5864 = getelementptr inbounds float* %tmp5863, i64 1
+  %tmp5865 = getelementptr inbounds float* %tmp5864, i64 1
+  %tmp5866 = getelementptr inbounds float* %tmp5865, i64 1
+  %tmp5867 = getelementptr inbounds float* %tmp5866, i64 1
+  %tmp5868 = getelementptr inbounds float* %tmp5867, i64 1
+  %tmp5869 = getelementptr inbounds float* %tmp5868, i64 1
+  %tmp5870 = getelementptr inbounds float* %tmp5869, i64 1
+  %tmp5871 = getelementptr inbounds float* %tmp5870, i64 1
+  %tmp5872 = getelementptr inbounds float* %tmp5871, i64 1
+  %tmp5873 = getelementptr inbounds float* %tmp5872, i64 1
+  %tmp5874 = getelementptr inbounds float* %tmp5873, i64 1
+  %tmp5875 = getelementptr inbounds float* %tmp5874, i64 1
+  %tmp5876 = getelementptr inbounds float* %tmp5875, i64 1
+  %tmp5877 = getelementptr inbounds float* %tmp5876, i64 1
+  %tmp5878 = getelementptr inbounds float* %tmp5877, i64 1
+  %tmp5879 = getelementptr inbounds float* %tmp5878, i64 1
+  %tmp5880 = getelementptr inbounds float* %tmp5879, i64 1
+  %tmp5881 = getelementptr inbounds float* %tmp5880, i64 1
+  %tmp5882 = getelementptr inbounds float* %tmp5881, i64 1
+  %tmp5883 = getelementptr inbounds float* %tmp5882, i64 1
+  %tmp5884 = getelementptr inbounds float* %tmp5883, i64 1
+  %tmp5885 = getelementptr inbounds float* %tmp5884, i64 1
+  %tmp5886 = getelementptr inbounds float* %tmp5885, i64 1
+  %tmp5887 = getelementptr inbounds float* %tmp5886, i64 1
+  %tmp5888 = getelementptr inbounds float* %tmp5887, i64 1
+  %tmp5889 = getelementptr inbounds float* %tmp5888, i64 1
+  %tmp5890 = getelementptr inbounds float* %tmp5889, i64 1
+  %tmp5891 = getelementptr inbounds float* %tmp5890, i64 1
+  %tmp5892 = getelementptr inbounds float* %tmp5891, i64 1
+  %tmp5893 = getelementptr inbounds float* %tmp5892, i64 1
+  %tmp5894 = getelementptr inbounds float* %tmp5893, i64 1
+  %tmp5895 = getelementptr inbounds float* %tmp5894, i64 1
+  %tmp5896 = getelementptr inbounds float* %tmp5895, i64 1
+  %tmp5897 = getelementptr inbounds float* %tmp5896, i64 1
+  %tmp5898 = getelementptr inbounds float* %tmp5897, i64 1
+  %tmp5899 = getelementptr inbounds float* %tmp5898, i64 1
+  %tmp5900 = getelementptr inbounds float* %tmp5899, i64 1
+  %tmp5901 = getelementptr inbounds float* %tmp5900, i64 1
+  %tmp5902 = getelementptr inbounds float* %tmp5901, i64 1
+  %tmp5903 = getelementptr inbounds float* %tmp5902, i64 1
+  %tmp5904 = getelementptr inbounds float* %tmp5903, i64 1
+  %tmp5905 = getelementptr inbounds float* %tmp5904, i64 1
+  %tmp5906 = getelementptr inbounds float* %tmp5905, i64 1
+  %tmp5907 = getelementptr inbounds float* %tmp5906, i64 1
+  %tmp5908 = getelementptr inbounds float* %tmp5907, i64 1
+  %tmp5909 = getelementptr inbounds float* %tmp5908, i64 1
+  %tmp5910 = getelementptr inbounds float* %tmp5909, i64 1
+  %tmp5911 = getelementptr inbounds float* %tmp5910, i64 1
+  %tmp5912 = getelementptr inbounds float* %tmp5911, i64 1
+  %tmp5913 = getelementptr inbounds float* %tmp5912, i64 1
+  %tmp5914 = getelementptr inbounds float* %tmp5913, i64 1
+  %tmp5915 = getelementptr inbounds float* %tmp5914, i64 1
+  %tmp5916 = getelementptr inbounds float* %tmp5915, i64 1
+  %tmp5917 = getelementptr inbounds float* %tmp5916, i64 1
+  %tmp5918 = getelementptr inbounds float* %tmp5917, i64 1
+  %tmp5919 = getelementptr inbounds float* %tmp5918, i64 1
+  %tmp5920 = getelementptr inbounds float* %tmp5919, i64 1
+  %tmp5921 = getelementptr inbounds float* %tmp5920, i64 1
+  %tmp5922 = getelementptr inbounds float* %tmp5921, i64 1
+  %tmp5923 = getelementptr inbounds float* %tmp5922, i64 1
+  %tmp5924 = getelementptr inbounds float* %tmp5923, i64 1
+  %tmp5925 = getelementptr inbounds float* %tmp5924, i64 1
+  %tmp5926 = getelementptr inbounds float* %tmp5925, i64 1
+  %tmp5927 = getelementptr inbounds float* %tmp5926, i64 1
+  %tmp5928 = getelementptr inbounds float* %tmp5927, i64 1
+  %tmp5929 = getelementptr inbounds float* %tmp5928, i64 1
+  %tmp5930 = getelementptr inbounds float* %tmp5929, i64 1
+  %tmp5931 = getelementptr inbounds float* %tmp5930, i64 1
+  %tmp5932 = getelementptr inbounds float* %tmp5931, i64 1
+  %tmp5933 = getelementptr inbounds float* %tmp5932, i64 1
+  %tmp5934 = getelementptr inbounds float* %tmp5933, i64 1
+  %tmp5935 = getelementptr inbounds float* %tmp5934, i64 1
+  %tmp5936 = getelementptr inbounds float* %tmp5935, i64 1
+  %tmp5937 = getelementptr inbounds float* %tmp5936, i64 1
+  %tmp5938 = getelementptr inbounds float* %tmp5937, i64 1
+  %tmp5939 = getelementptr inbounds float* %tmp5938, i64 1
+  %tmp5940 = getelementptr inbounds float* %tmp5939, i64 1
+  %tmp5941 = getelementptr inbounds float* %tmp5940, i64 1
+  %tmp5942 = getelementptr inbounds float* %tmp5941, i64 1
+  %tmp5943 = getelementptr inbounds float* %tmp5942, i64 1
+  %tmp5944 = getelementptr inbounds float* %tmp5943, i64 1
+  %tmp5945 = getelementptr inbounds float* %tmp5944, i64 1
+  %tmp5946 = getelementptr inbounds float* %tmp5945, i64 1
+  %tmp5947 = getelementptr inbounds float* %tmp5946, i64 1
+  %tmp5948 = getelementptr inbounds float* %tmp5947, i64 1
+  %tmp5949 = getelementptr inbounds float* %tmp5948, i64 1
+  %tmp5950 = getelementptr inbounds float* %tmp5949, i64 1
+  %tmp5951 = getelementptr inbounds float* %tmp5950, i64 1
+  %tmp5952 = getelementptr inbounds float* %tmp5951, i64 1
+  %tmp5953 = getelementptr inbounds float* %tmp5952, i64 1
+  %tmp5954 = getelementptr inbounds float* %tmp5953, i64 1
+  %tmp5955 = getelementptr inbounds float* %tmp5954, i64 1
+  %tmp5956 = getelementptr inbounds float* %tmp5955, i64 1
+  %tmp5957 = getelementptr inbounds float* %tmp5956, i64 1
+  %tmp5958 = getelementptr inbounds float* %tmp5957, i64 1
+  %tmp5959 = getelementptr inbounds float* %tmp5958, i64 1
+  %tmp5960 = getelementptr inbounds float* %tmp5959, i64 1
+  %tmp5961 = getelementptr inbounds float* %tmp5960, i64 1
+  %tmp5962 = getelementptr inbounds float* %tmp5961, i64 1
+  %tmp5963 = getelementptr inbounds float* %tmp5962, i64 1
+  %tmp5964 = getelementptr inbounds float* %tmp5963, i64 1
+  %tmp5965 = getelementptr inbounds float* %tmp5964, i64 1
+  %tmp5966 = getelementptr inbounds float* %tmp5965, i64 1
+  %tmp5967 = getelementptr inbounds float* %tmp5966, i64 1
+  %tmp5968 = getelementptr inbounds float* %tmp5967, i64 1
+  %tmp5969 = getelementptr inbounds float* %tmp5968, i64 1
+  %tmp5970 = getelementptr inbounds float* %tmp5969, i64 1
+  %tmp5971 = getelementptr inbounds float* %tmp5970, i64 1
+  %tmp5972 = getelementptr inbounds float* %tmp5971, i64 1
+  %tmp5973 = getelementptr inbounds float* %tmp5972, i64 1
+  %tmp5974 = getelementptr inbounds float* %tmp5973, i64 1
+  %tmp5975 = getelementptr inbounds float* %tmp5974, i64 1
+  %tmp5976 = getelementptr inbounds float* %tmp5975, i64 1
+  %tmp5977 = getelementptr inbounds float* %tmp5976, i64 1
+  %tmp5978 = getelementptr inbounds float* %tmp5977, i64 1
+  %tmp5979 = getelementptr inbounds float* %tmp5978, i64 1
+  %tmp5980 = getelementptr inbounds float* %tmp5979, i64 1
+  %tmp5981 = getelementptr inbounds float* %tmp5980, i64 1
+  %tmp5982 = getelementptr inbounds float* %tmp5981, i64 1
+  %tmp5983 = getelementptr inbounds float* %tmp5982, i64 1
+  %tmp5984 = getelementptr inbounds float* %tmp5983, i64 1
+  %tmp5985 = getelementptr inbounds float* %tmp5984, i64 1
+  %tmp5986 = getelementptr inbounds float* %tmp5985, i64 1
+  %tmp5987 = getelementptr inbounds float* %tmp5986, i64 1
+  %tmp5988 = getelementptr inbounds float* %tmp5987, i64 1
+  %tmp5989 = getelementptr inbounds float* %tmp5988, i64 1
+  %tmp5990 = getelementptr inbounds float* %tmp5989, i64 1
+  %tmp5991 = getelementptr inbounds float* %tmp5990, i64 1
+  %tmp5992 = getelementptr inbounds float* %tmp5991, i64 1
+  %tmp5993 = getelementptr inbounds float* %tmp5992, i64 1
+  %tmp5994 = getelementptr inbounds float* %tmp5993, i64 1
+  %tmp5995 = getelementptr inbounds float* %tmp5994, i64 1
+  %tmp5996 = getelementptr inbounds float* %tmp5995, i64 1
+  %tmp5997 = getelementptr inbounds float* %tmp5996, i64 1
+  %tmp5998 = getelementptr inbounds float* %tmp5997, i64 1
+  %tmp5999 = getelementptr inbounds float* %tmp5998, i64 1
+  %tmp6000 = getelementptr inbounds float* %tmp5999, i64 1
+  %tmp6001 = getelementptr inbounds float* %tmp6000, i64 1
+  %tmp6002 = getelementptr inbounds float* %tmp6001, i64 1
+  %tmp6003 = getelementptr inbounds float* %tmp6002, i64 1
+  %tmp6004 = getelementptr inbounds float* %tmp6003, i64 1
+  %tmp6005 = getelementptr inbounds float* %tmp6004, i64 1
+  %tmp6006 = getelementptr inbounds float* %tmp6005, i64 1
+  %tmp6007 = getelementptr inbounds float* %tmp6006, i64 1
+  %tmp6008 = getelementptr inbounds float* %tmp6007, i64 1
+  %tmp6009 = getelementptr inbounds float* %tmp6008, i64 1
+  %tmp6010 = getelementptr inbounds float* %tmp6009, i64 1
+  %tmp6011 = getelementptr inbounds float* %tmp6010, i64 1
+  %tmp6012 = getelementptr inbounds float* %tmp6011, i64 1
+  %tmp6013 = getelementptr inbounds float* %tmp6012, i64 1
+  %tmp6014 = getelementptr inbounds float* %tmp6013, i64 1
+  %tmp6015 = getelementptr inbounds float* %tmp6014, i64 1
+  %tmp6016 = getelementptr inbounds float* %tmp6015, i64 1
+  %tmp6017 = getelementptr inbounds float* %tmp6016, i64 1
+  %tmp6018 = getelementptr inbounds float* %tmp6017, i64 1
+  %tmp6019 = getelementptr inbounds float* %tmp6018, i64 1
+  %tmp6020 = getelementptr inbounds float* %tmp6019, i64 1
+  %tmp6021 = getelementptr inbounds float* %tmp6020, i64 1
+  %tmp6022 = getelementptr inbounds float* %tmp6021, i64 1
+  %tmp6023 = getelementptr inbounds float* %tmp6022, i64 1
+  %tmp6024 = getelementptr inbounds float* %tmp6023, i64 1
+  %tmp6025 = getelementptr inbounds float* %tmp6024, i64 1
+  %tmp6026 = getelementptr inbounds float* %tmp6025, i64 1
+  %tmp6027 = getelementptr inbounds float* %tmp6026, i64 1
+  %tmp6028 = getelementptr inbounds float* %tmp6027, i64 1
+  %tmp6029 = getelementptr inbounds float* %tmp6028, i64 1
+  %tmp6030 = getelementptr inbounds float* %tmp6029, i64 1
+  %tmp6031 = getelementptr inbounds float* %tmp6030, i64 1
+  %tmp6032 = getelementptr inbounds float* %tmp6031, i64 1
+  %tmp6033 = getelementptr inbounds float* %tmp6032, i64 1
+  %tmp6034 = getelementptr inbounds float* %tmp6033, i64 1
+  %tmp6035 = getelementptr inbounds float* %tmp6034, i64 1
+  %tmp6036 = getelementptr inbounds float* %tmp6035, i64 1
+  %tmp6037 = getelementptr inbounds float* %tmp6036, i64 1
+  %tmp6038 = getelementptr inbounds float* %tmp6037, i64 1
+  %tmp6039 = getelementptr inbounds float* %tmp6038, i64 1
+  %tmp6040 = getelementptr inbounds float* %tmp6039, i64 1
+  %tmp6041 = getelementptr inbounds float* %tmp6040, i64 1
+  %tmp6042 = getelementptr inbounds float* %tmp6041, i64 1
+  %tmp6043 = getelementptr inbounds float* %tmp6042, i64 1
+  %tmp6044 = getelementptr inbounds float* %tmp6043, i64 1
+  %tmp6045 = getelementptr inbounds float* %tmp6044, i64 1
+  %tmp6046 = getelementptr inbounds float* %tmp6045, i64 1
+  %tmp6047 = getelementptr inbounds float* %tmp6046, i64 1
+  %tmp6048 = getelementptr inbounds float* %tmp6047, i64 1
+  %tmp6049 = getelementptr inbounds float* %tmp6048, i64 1
+  %tmp6050 = getelementptr inbounds float* %tmp6049, i64 1
+  %tmp6051 = getelementptr inbounds float* %tmp6050, i64 1
+  %tmp6052 = getelementptr inbounds float* %tmp6051, i64 1
+  %tmp6053 = getelementptr inbounds float* %tmp6052, i64 1
+  %tmp6054 = getelementptr inbounds float* %tmp6053, i64 1
+  %tmp6055 = getelementptr inbounds float* %tmp6054, i64 1
+  %tmp6056 = getelementptr inbounds float* %tmp6055, i64 1
+  %tmp6057 = getelementptr inbounds float* %tmp6056, i64 1
+  %tmp6058 = getelementptr inbounds float* %tmp6057, i64 1
+  %tmp6059 = getelementptr inbounds float* %tmp6058, i64 1
+  %tmp6060 = getelementptr inbounds float* %tmp6059, i64 1
+  %tmp6061 = getelementptr inbounds float* %tmp6060, i64 1
+  %tmp6062 = getelementptr inbounds float* %tmp6061, i64 1
+  %tmp6063 = getelementptr inbounds float* %tmp6062, i64 1
+  %tmp6064 = getelementptr inbounds float* %tmp6063, i64 1
+  %tmp6065 = getelementptr inbounds float* %tmp6064, i64 1
+  %tmp6066 = getelementptr inbounds float* %tmp6065, i64 1
+  %tmp6067 = getelementptr inbounds float* %tmp6066, i64 1
+  %tmp6068 = getelementptr inbounds float* %tmp6067, i64 1
+  %tmp6069 = getelementptr inbounds float* %tmp6068, i64 1
+  %tmp6070 = getelementptr inbounds float* %tmp6069, i64 1
+  %tmp6071 = getelementptr inbounds float* %tmp6070, i64 1
+  %tmp6072 = getelementptr inbounds float* %tmp6071, i64 1
+  %tmp6073 = getelementptr inbounds float* %tmp6072, i64 1
+  %tmp6074 = getelementptr inbounds float* %tmp6073, i64 1
+  %tmp6075 = getelementptr inbounds float* %tmp6074, i64 1
+  %tmp6076 = getelementptr inbounds float* %tmp6075, i64 1
+  %tmp6077 = getelementptr inbounds float* %tmp6076, i64 1
+  %tmp6078 = getelementptr inbounds float* %tmp6077, i64 1
+  %tmp6079 = getelementptr inbounds float* %tmp6078, i64 1
+  %tmp6080 = getelementptr inbounds float* %tmp6079, i64 1
+  %tmp6081 = getelementptr inbounds float* %tmp6080, i64 1
+  %tmp6082 = getelementptr inbounds float* %tmp6081, i64 1
+  %tmp6083 = getelementptr inbounds float* %tmp6082, i64 1
+  %tmp6084 = getelementptr inbounds float* %tmp6083, i64 1
+  %tmp6085 = getelementptr inbounds float* %tmp6084, i64 1
+  %tmp6086 = getelementptr inbounds float* %tmp6085, i64 1
+  %tmp6087 = getelementptr inbounds float* %tmp6086, i64 1
+  %tmp6088 = getelementptr inbounds float* %tmp6087, i64 1
+  %tmp6089 = getelementptr inbounds float* %tmp6088, i64 1
+  %tmp6090 = getelementptr inbounds float* %tmp6089, i64 1
+  %tmp6091 = getelementptr inbounds float* %tmp6090, i64 1
+  %tmp6092 = getelementptr inbounds float* %tmp6091, i64 1
+  %tmp6093 = getelementptr inbounds float* %tmp6092, i64 1
+  %tmp6094 = getelementptr inbounds float* %tmp6093, i64 1
+  %tmp6095 = getelementptr inbounds float* %tmp6094, i64 1
+  %tmp6096 = getelementptr inbounds float* %tmp6095, i64 1
+  %tmp6097 = getelementptr inbounds float* %tmp6096, i64 1
+  %tmp6098 = getelementptr inbounds float* %tmp6097, i64 1
+  %tmp6099 = getelementptr inbounds float* %tmp6098, i64 1
+  %tmp6100 = getelementptr inbounds float* %tmp6099, i64 1
+  %tmp6101 = getelementptr inbounds float* %tmp6100, i64 1
+  %tmp6102 = getelementptr inbounds float* %tmp6101, i64 1
+  %tmp6103 = getelementptr inbounds float* %tmp6102, i64 1
+  %tmp6104 = getelementptr inbounds float* %tmp6103, i64 1
+  %tmp6105 = getelementptr inbounds float* %tmp6104, i64 1
+  %tmp6106 = getelementptr inbounds float* %tmp6105, i64 1
+  %tmp6107 = getelementptr inbounds float* %tmp6106, i64 1
+  %tmp6108 = getelementptr inbounds float* %tmp6107, i64 1
+  %tmp6109 = getelementptr inbounds float* %tmp6108, i64 1
+  %tmp6110 = getelementptr inbounds float* %tmp6109, i64 1
+  %tmp6111 = getelementptr inbounds float* %tmp6110, i64 1
+  %tmp6112 = getelementptr inbounds float* %tmp6111, i64 1
+  %tmp6113 = getelementptr inbounds float* %tmp6112, i64 1
+  %tmp6114 = getelementptr inbounds float* %tmp6113, i64 1
+  %tmp6115 = getelementptr inbounds float* %tmp6114, i64 1
+  %tmp6116 = getelementptr inbounds float* %tmp6115, i64 1
+  %tmp6117 = getelementptr inbounds float* %tmp6116, i64 1
+  %tmp6118 = getelementptr inbounds float* %tmp6117, i64 1
+  %tmp6119 = getelementptr inbounds float* %tmp6118, i64 1
+  %tmp6120 = getelementptr inbounds float* %tmp6119, i64 1
+  %tmp6121 = getelementptr inbounds float* %tmp6120, i64 1
+  %tmp6122 = getelementptr inbounds float* %tmp6121, i64 1
+  %tmp6123 = getelementptr inbounds float* %tmp6122, i64 1
+  %tmp6124 = getelementptr inbounds float* %tmp6123, i64 1
+  %tmp6125 = getelementptr inbounds float* %tmp6124, i64 1
+  %tmp6126 = getelementptr inbounds float* %tmp6125, i64 1
+  %tmp6127 = getelementptr inbounds float* %tmp6126, i64 1
+  %tmp6128 = getelementptr inbounds float* %tmp6127, i64 1
+  %tmp6129 = getelementptr inbounds float* %tmp6128, i64 1
+  %tmp6130 = getelementptr inbounds float* %tmp6129, i64 1
+  %tmp6131 = getelementptr inbounds float* %tmp6130, i64 1
+  %tmp6132 = getelementptr inbounds float* %tmp6131, i64 1
+  %tmp6133 = getelementptr inbounds float* %tmp6132, i64 1
+  %tmp6134 = getelementptr inbounds float* %tmp6133, i64 1
+  %tmp6135 = getelementptr inbounds float* %tmp6134, i64 1
+  %tmp6136 = getelementptr inbounds float* %tmp6135, i64 1
+  %tmp6137 = getelementptr inbounds float* %tmp6136, i64 1
+  %tmp6138 = getelementptr inbounds float* %tmp6137, i64 1
+  %tmp6139 = getelementptr inbounds float* %tmp6138, i64 1
+  %tmp6140 = getelementptr inbounds float* %tmp6139, i64 1
+  %tmp6141 = getelementptr inbounds float* %tmp6140, i64 1
+  %tmp6142 = getelementptr inbounds float* %tmp6141, i64 1
+  %tmp6143 = getelementptr inbounds float* %tmp6142, i64 1
+  %tmp6144 = getelementptr inbounds float* %tmp6143, i64 1
+  %tmp6145 = getelementptr inbounds float* %tmp6144, i64 1
+  %tmp6146 = getelementptr inbounds float* %tmp6145, i64 1
+  %tmp6147 = getelementptr inbounds float* %tmp6146, i64 1
+  %tmp6148 = getelementptr inbounds float* %tmp6147, i64 1
+  %tmp6149 = getelementptr inbounds float* %tmp6148, i64 1
+  %tmp6150 = getelementptr inbounds float* %tmp6149, i64 1
+  %tmp6151 = getelementptr inbounds float* %tmp6150, i64 1
+  %tmp6152 = getelementptr inbounds float* %tmp6151, i64 1
+  %tmp6153 = getelementptr inbounds float* %tmp6152, i64 1
+  %tmp6154 = getelementptr inbounds float* %tmp6153, i64 1
+  %tmp6155 = getelementptr inbounds float* %tmp6154, i64 1
+  %tmp6156 = getelementptr inbounds float* %tmp6155, i64 1
+  %tmp6157 = getelementptr inbounds float* %tmp6156, i64 1
+  %tmp6158 = getelementptr inbounds float* %tmp6157, i64 1
+  %tmp6159 = getelementptr inbounds float* %tmp6158, i64 1
+  %tmp6160 = getelementptr inbounds float* %tmp6159, i64 1
+  %tmp6161 = getelementptr inbounds float* %tmp6160, i64 1
+  %tmp6162 = getelementptr inbounds float* %tmp6161, i64 1
+  %tmp6163 = getelementptr inbounds float* %tmp6162, i64 1
+  %tmp6164 = getelementptr inbounds float* %tmp6163, i64 1
+  %tmp6165 = getelementptr inbounds float* %tmp6164, i64 1
+  %tmp6166 = getelementptr inbounds float* %tmp6165, i64 1
+  %tmp6167 = getelementptr inbounds float* %tmp6166, i64 1
+  %tmp6168 = getelementptr inbounds float* %tmp6167, i64 1
+  %tmp6169 = getelementptr inbounds float* %tmp6168, i64 1
+  %tmp6170 = getelementptr inbounds float* %tmp6169, i64 1
+  %tmp6171 = getelementptr inbounds float* %tmp6170, i64 1
+  %tmp6172 = getelementptr inbounds float* %tmp6171, i64 1
+  %tmp6173 = getelementptr inbounds float* %tmp6172, i64 1
+  %tmp6174 = getelementptr inbounds float* %tmp6173, i64 1
+  %tmp6175 = getelementptr inbounds float* %tmp6174, i64 1
+  %tmp6176 = getelementptr inbounds float* %tmp6175, i64 1
+  %tmp6177 = getelementptr inbounds float* %tmp6176, i64 1
+  %tmp6178 = getelementptr inbounds float* %tmp6177, i64 1
+  %tmp6179 = getelementptr inbounds float* %tmp6178, i64 1
+  %tmp6180 = getelementptr inbounds float* %tmp6179, i64 1
+  %tmp6181 = getelementptr inbounds float* %tmp6180, i64 1
+  %tmp6182 = getelementptr inbounds float* %tmp6181, i64 1
+  %tmp6183 = getelementptr inbounds float* %tmp6182, i64 1
+  %tmp6184 = getelementptr inbounds float* %tmp6183, i64 1
+  %tmp6185 = getelementptr inbounds float* %tmp6184, i64 1
+  %tmp6186 = getelementptr inbounds float* %tmp6185, i64 1
+  %tmp6187 = getelementptr inbounds float* %tmp6186, i64 1
+  %tmp6188 = getelementptr inbounds float* %tmp6187, i64 1
+  %tmp6189 = getelementptr inbounds float* %tmp6188, i64 1
+  %tmp6190 = getelementptr inbounds float* %tmp6189, i64 1
+  %tmp6191 = getelementptr inbounds float* %tmp6190, i64 1
+  %tmp6192 = getelementptr inbounds float* %tmp6191, i64 1
+  %tmp6193 = getelementptr inbounds float* %tmp6192, i64 1
+  %tmp6194 = getelementptr inbounds float* %tmp6193, i64 1
+  %tmp6195 = getelementptr inbounds float* %tmp6194, i64 1
+  %tmp6196 = getelementptr inbounds float* %tmp6195, i64 1
+  %tmp6197 = getelementptr inbounds float* %tmp6196, i64 1
+  %tmp6198 = getelementptr inbounds float* %tmp6197, i64 1
+  %tmp6199 = getelementptr inbounds float* %tmp6198, i64 1
+  %tmp6200 = getelementptr inbounds float* %tmp6199, i64 1
+  %tmp6201 = getelementptr inbounds float* %tmp6200, i64 1
+  %tmp6202 = getelementptr inbounds float* %tmp6201, i64 1
+  %tmp6203 = getelementptr inbounds float* %tmp6202, i64 1
+  %tmp6204 = getelementptr inbounds float* %tmp6203, i64 1
+  %tmp6205 = getelementptr inbounds float* %tmp6204, i64 1
+  %tmp6206 = getelementptr inbounds float* %tmp6205, i64 1
+  %tmp6207 = getelementptr inbounds float* %tmp6206, i64 1
+  %tmp6208 = getelementptr inbounds float* %tmp6207, i64 1
+  %tmp6209 = getelementptr inbounds float* %tmp6208, i64 1
+  %tmp6210 = getelementptr inbounds float* %tmp6209, i64 1
+  %tmp6211 = getelementptr inbounds float* %tmp6210, i64 1
+  %tmp6212 = getelementptr inbounds float* %tmp6211, i64 1
+  %tmp6213 = getelementptr inbounds float* %tmp6212, i64 1
+  %tmp6214 = getelementptr inbounds float* %tmp6213, i64 1
+  %tmp6215 = getelementptr inbounds float* %tmp6214, i64 1
+  %tmp6216 = getelementptr inbounds float* %tmp6215, i64 1
+  %tmp6217 = getelementptr inbounds float* %tmp6216, i64 1
+  %tmp6218 = getelementptr inbounds float* %tmp6217, i64 1
+  %tmp6219 = getelementptr inbounds float* %tmp6218, i64 1
+  %tmp6220 = getelementptr inbounds float* %tmp6219, i64 1
+  %tmp6221 = getelementptr inbounds float* %tmp6220, i64 1
+  %tmp6222 = getelementptr inbounds float* %tmp6221, i64 1
+  %tmp6223 = getelementptr inbounds float* %tmp6222, i64 1
+  %tmp6224 = getelementptr inbounds float* %tmp6223, i64 1
+  %tmp6225 = getelementptr inbounds float* %tmp6224, i64 1
+  %tmp6226 = getelementptr inbounds float* %tmp6225, i64 1
+  %tmp6227 = getelementptr inbounds float* %tmp6226, i64 1
+  %tmp6228 = getelementptr inbounds float* %tmp6227, i64 1
+  %tmp6229 = getelementptr inbounds float* %tmp6228, i64 1
+  %tmp6230 = getelementptr inbounds float* %tmp6229, i64 1
+  %tmp6231 = getelementptr inbounds float* %tmp6230, i64 1
+  %tmp6232 = getelementptr inbounds float* %tmp6231, i64 1
+  %tmp6233 = getelementptr inbounds float* %tmp6232, i64 1
+  %tmp6234 = getelementptr inbounds float* %tmp6233, i64 1
+  %tmp6235 = getelementptr inbounds float* %tmp6234, i64 1
+  %tmp6236 = getelementptr inbounds float* %tmp6235, i64 1
+  %tmp6237 = getelementptr inbounds float* %tmp6236, i64 1
+  %tmp6238 = getelementptr inbounds float* %tmp6237, i64 1
+  %tmp6239 = getelementptr inbounds float* %tmp6238, i64 1
+  %tmp6240 = getelementptr inbounds float* %tmp6239, i64 1
+  %tmp6241 = getelementptr inbounds float* %tmp6240, i64 1
+  %tmp6242 = getelementptr inbounds float* %tmp6241, i64 1
+  %tmp6243 = getelementptr inbounds float* %tmp6242, i64 1
+  %tmp6244 = getelementptr inbounds float* %tmp6243, i64 1
+  %tmp6245 = getelementptr inbounds float* %tmp6244, i64 1
+  %tmp6246 = getelementptr inbounds float* %tmp6245, i64 1
+  %tmp6247 = getelementptr inbounds float* %tmp6246, i64 1
+  %tmp6248 = getelementptr inbounds float* %tmp6247, i64 1
+  %tmp6249 = getelementptr inbounds float* %tmp6248, i64 1
+  %tmp6250 = getelementptr inbounds float* %tmp6249, i64 1
+  %tmp6251 = getelementptr inbounds float* %tmp6250, i64 1
+  %tmp6252 = getelementptr inbounds float* %tmp6251, i64 1
+  %tmp6253 = getelementptr inbounds float* %tmp6252, i64 1
+  %tmp6254 = getelementptr inbounds float* %tmp6253, i64 1
+  %tmp6255 = getelementptr inbounds float* %tmp6254, i64 1
+  %tmp6256 = getelementptr inbounds float* %tmp6255, i64 1
+  %tmp6257 = getelementptr inbounds float* %tmp6256, i64 1
+  %tmp6258 = getelementptr inbounds float* %tmp6257, i64 1
+  %tmp6259 = getelementptr inbounds float* %tmp6258, i64 1
+  %tmp6260 = getelementptr inbounds float* %tmp6259, i64 1
+  %tmp6261 = getelementptr inbounds float* %tmp6260, i64 1
+  %tmp6262 = getelementptr inbounds float* %tmp6261, i64 1
+  %tmp6263 = getelementptr inbounds float* %tmp6262, i64 1
+  %tmp6264 = getelementptr inbounds float* %tmp6263, i64 1
+  %tmp6265 = getelementptr inbounds float* %tmp6264, i64 1
+  %tmp6266 = getelementptr inbounds float* %tmp6265, i64 1
+  %tmp6267 = getelementptr inbounds float* %tmp6266, i64 1
+  %tmp6268 = getelementptr inbounds float* %tmp6267, i64 1
+  %tmp6269 = getelementptr inbounds float* %tmp6268, i64 1
+  %tmp6270 = getelementptr inbounds float* %tmp6269, i64 1
+  %tmp6271 = getelementptr inbounds float* %tmp6270, i64 1
+  %tmp6272 = getelementptr inbounds float* %tmp6271, i64 1
+  %tmp6273 = getelementptr inbounds float* %tmp6272, i64 1
+  %tmp6274 = getelementptr inbounds float* %tmp6273, i64 1
+  %tmp6275 = getelementptr inbounds float* %tmp6274, i64 1
+  %tmp6276 = getelementptr inbounds float* %tmp6275, i64 1
+  %tmp6277 = getelementptr inbounds float* %tmp6276, i64 1
+  %tmp6278 = getelementptr inbounds float* %tmp6277, i64 1
+  %tmp6279 = getelementptr inbounds float* %tmp6278, i64 1
+  %tmp6280 = getelementptr inbounds float* %tmp6279, i64 1
+  %tmp6281 = getelementptr inbounds float* %tmp6280, i64 1
+  %tmp6282 = getelementptr inbounds float* %tmp6281, i64 1
+  %tmp6283 = getelementptr inbounds float* %tmp6282, i64 1
+  %tmp6284 = getelementptr inbounds float* %tmp6283, i64 1
+  %tmp6285 = getelementptr inbounds float* %tmp6284, i64 1
+  %tmp6286 = getelementptr inbounds float* %tmp6285, i64 1
+  %tmp6287 = getelementptr inbounds float* %tmp6286, i64 1
+  %tmp6288 = getelementptr inbounds float* %tmp6287, i64 1
+  %tmp6289 = getelementptr inbounds float* %tmp6288, i64 1
+  %tmp6290 = getelementptr inbounds float* %tmp6289, i64 1
+  %tmp6291 = getelementptr inbounds float* %tmp6290, i64 1
+  %tmp6292 = getelementptr inbounds float* %tmp6291, i64 1
+  %tmp6293 = getelementptr inbounds float* %tmp6292, i64 1
+  %tmp6294 = getelementptr inbounds float* %tmp6293, i64 1
+  %tmp6295 = getelementptr inbounds float* %tmp6294, i64 1
+  %tmp6296 = getelementptr inbounds float* %tmp6295, i64 1
+  %tmp6297 = getelementptr inbounds float* %tmp6296, i64 1
+  %tmp6298 = getelementptr inbounds float* %tmp6297, i64 1
+  %tmp6299 = getelementptr inbounds float* %tmp6298, i64 1
+  %tmp6300 = getelementptr inbounds float* %tmp6299, i64 1
+  %tmp6301 = getelementptr inbounds float* %tmp6300, i64 1
+  %tmp6302 = getelementptr inbounds float* %tmp6301, i64 1
+  %tmp6303 = getelementptr inbounds float* %tmp6302, i64 1
+  %tmp6304 = getelementptr inbounds float* %tmp6303, i64 1
+  %tmp6305 = getelementptr inbounds float* %tmp6304, i64 1
+  %tmp6306 = getelementptr inbounds float* %tmp6305, i64 1
+  %tmp6307 = getelementptr inbounds float* %tmp6306, i64 1
+  %tmp6308 = getelementptr inbounds float* %tmp6307, i64 1
+  %tmp6309 = getelementptr inbounds float* %tmp6308, i64 1
+  %tmp6310 = getelementptr inbounds float* %tmp6309, i64 1
+  %tmp6311 = getelementptr inbounds float* %tmp6310, i64 1
+  %tmp6312 = getelementptr inbounds float* %tmp6311, i64 1
+  %tmp6313 = getelementptr inbounds float* %tmp6312, i64 1
+  %tmp6314 = getelementptr inbounds float* %tmp6313, i64 1
+  %tmp6315 = getelementptr inbounds float* %tmp6314, i64 1
+  %tmp6316 = getelementptr inbounds float* %tmp6315, i64 1
+  %tmp6317 = getelementptr inbounds float* %tmp6316, i64 1
+  %tmp6318 = getelementptr inbounds float* %tmp6317, i64 1
+  %tmp6319 = getelementptr inbounds float* %tmp6318, i64 1
+  %tmp6320 = getelementptr inbounds float* %tmp6319, i64 1
+  %tmp6321 = getelementptr inbounds float* %tmp6320, i64 1
+  %tmp6322 = getelementptr inbounds float* %tmp6321, i64 1
+  %tmp6323 = getelementptr inbounds float* %tmp6322, i64 1
+  %tmp6324 = getelementptr inbounds float* %tmp6323, i64 1
+  %tmp6325 = getelementptr inbounds float* %tmp6324, i64 1
+  %tmp6326 = getelementptr inbounds float* %tmp6325, i64 1
+  %tmp6327 = getelementptr inbounds float* %tmp6326, i64 1
+  %tmp6328 = getelementptr inbounds float* %tmp6327, i64 1
+  %tmp6329 = getelementptr inbounds float* %tmp6328, i64 1
+  %tmp6330 = getelementptr inbounds float* %tmp6329, i64 1
+  %tmp6331 = getelementptr inbounds float* %tmp6330, i64 1
+  %tmp6332 = getelementptr inbounds float* %tmp6331, i64 1
+  %tmp6333 = getelementptr inbounds float* %tmp6332, i64 1
+  %tmp6334 = getelementptr inbounds float* %tmp6333, i64 1
+  %tmp6335 = getelementptr inbounds float* %tmp6334, i64 1
+  %tmp6336 = getelementptr inbounds float* %tmp6335, i64 1
+  %tmp6337 = getelementptr inbounds float* %tmp6336, i64 1
+  %tmp6338 = getelementptr inbounds float* %tmp6337, i64 1
+  %tmp6339 = getelementptr inbounds float* %tmp6338, i64 1
+  %tmp6340 = getelementptr inbounds float* %tmp6339, i64 1
+  %tmp6341 = getelementptr inbounds float* %tmp6340, i64 1
+  %tmp6342 = getelementptr inbounds float* %tmp6341, i64 1
+  %tmp6343 = getelementptr inbounds float* %tmp6342, i64 1
+  %tmp6344 = getelementptr inbounds float* %tmp6343, i64 1
+  %tmp6345 = getelementptr inbounds float* %tmp6344, i64 1
+  %tmp6346 = getelementptr inbounds float* %tmp6345, i64 1
+  %tmp6347 = getelementptr inbounds float* %tmp6346, i64 1
+  %tmp6348 = getelementptr inbounds float* %tmp6347, i64 1
+  %tmp6349 = getelementptr inbounds float* %tmp6348, i64 1
+  %tmp6350 = getelementptr inbounds float* %tmp6349, i64 1
+  %tmp6351 = getelementptr inbounds float* %tmp6350, i64 1
+  %tmp6352 = getelementptr inbounds float* %tmp6351, i64 1
+  %tmp6353 = getelementptr inbounds float* %tmp6352, i64 1
+  %tmp6354 = getelementptr inbounds float* %tmp6353, i64 1
+  %tmp6355 = getelementptr inbounds float* %tmp6354, i64 1
+  %tmp6356 = getelementptr inbounds float* %tmp6355, i64 1
+  %tmp6357 = getelementptr inbounds float* %tmp6356, i64 1
+  %tmp6358 = getelementptr inbounds float* %tmp6357, i64 1
+  %tmp6359 = getelementptr inbounds float* %tmp6358, i64 1
+  %tmp6360 = getelementptr inbounds float* %tmp6359, i64 1
+  %tmp6361 = getelementptr inbounds float* %tmp6360, i64 1
+  %tmp6362 = getelementptr inbounds float* %tmp6361, i64 1
+  %tmp6363 = getelementptr inbounds float* %tmp6362, i64 1
+  %tmp6364 = getelementptr inbounds float* %tmp6363, i64 1
+  %tmp6365 = getelementptr inbounds float* %tmp6364, i64 1
+  %tmp6366 = getelementptr inbounds float* %tmp6365, i64 1
+  %tmp6367 = getelementptr inbounds float* %tmp6366, i64 1
+  %tmp6368 = getelementptr inbounds float* %tmp6367, i64 1
+  %tmp6369 = getelementptr inbounds float* %tmp6368, i64 1
+  %tmp6370 = getelementptr inbounds float* %tmp6369, i64 1
+  %tmp6371 = getelementptr inbounds float* %tmp6370, i64 1
+  %tmp6372 = getelementptr inbounds float* %tmp6371, i64 1
+  %tmp6373 = getelementptr inbounds float* %tmp6372, i64 1
+  %tmp6374 = getelementptr inbounds float* %tmp6373, i64 1
+  %tmp6375 = getelementptr inbounds float* %tmp6374, i64 1
+  %tmp6376 = getelementptr inbounds float* %tmp6375, i64 1
+  %tmp6377 = getelementptr inbounds float* %tmp6376, i64 1
+  %tmp6378 = getelementptr inbounds float* %tmp6377, i64 1
+  %tmp6379 = getelementptr inbounds float* %tmp6378, i64 1
+  %tmp6380 = getelementptr inbounds float* %tmp6379, i64 1
+  %tmp6381 = getelementptr inbounds float* %tmp6380, i64 1
+  %tmp6382 = getelementptr inbounds float* %tmp6381, i64 1
+  %tmp6383 = getelementptr inbounds float* %tmp6382, i64 1
+  %tmp6384 = getelementptr inbounds float* %tmp6383, i64 1
+  %tmp6385 = getelementptr inbounds float* %tmp6384, i64 1
+  %tmp6386 = getelementptr inbounds float* %tmp6385, i64 1
+  %tmp6387 = getelementptr inbounds float* %tmp6386, i64 1
+  %tmp6388 = getelementptr inbounds float* %tmp6387, i64 1
+  %tmp6389 = getelementptr inbounds float* %tmp6388, i64 1
+  %tmp6390 = getelementptr inbounds float* %tmp6389, i64 1
+  %tmp6391 = getelementptr inbounds float* %tmp6390, i64 1
+  %tmp6392 = getelementptr inbounds float* %tmp6391, i64 1
+  %tmp6393 = getelementptr inbounds float* %tmp6392, i64 1
+  %tmp6394 = getelementptr inbounds float* %tmp6393, i64 1
+  %tmp6395 = getelementptr inbounds float* %tmp6394, i64 1
+  %tmp6396 = getelementptr inbounds float* %tmp6395, i64 1
+  %tmp6397 = getelementptr inbounds float* %tmp6396, i64 1
+  %tmp6398 = getelementptr inbounds float* %tmp6397, i64 1
+  %tmp6399 = getelementptr inbounds float* %tmp6398, i64 1
+  %tmp6400 = getelementptr inbounds float* %tmp6399, i64 1
+  %tmp6401 = getelementptr inbounds float* %tmp6400, i64 1
+  %tmp6402 = getelementptr inbounds float* %tmp6401, i64 1
+  %tmp6403 = getelementptr inbounds float* %tmp6402, i64 1
+  %tmp6404 = getelementptr inbounds float* %tmp6403, i64 1
+  %tmp6405 = getelementptr inbounds float* %tmp6404, i64 1
+  %tmp6406 = getelementptr inbounds float* %tmp6405, i64 1
+  %tmp6407 = getelementptr inbounds float* %tmp6406, i64 1
+  %tmp6408 = getelementptr inbounds float* %tmp6407, i64 1
+  %tmp6409 = getelementptr inbounds float* %tmp6408, i64 1
+  %tmp6410 = getelementptr inbounds float* %tmp6409, i64 1
+  %tmp6411 = getelementptr inbounds float* %tmp6410, i64 1
+  %tmp6412 = getelementptr inbounds float* %tmp6411, i64 1
+  %tmp6413 = getelementptr inbounds float* %tmp6412, i64 1
+  %tmp6414 = getelementptr inbounds float* %tmp6413, i64 1
+  %tmp6415 = getelementptr inbounds float* %tmp6414, i64 1
+  %tmp6416 = getelementptr inbounds float* %tmp6415, i64 1
+  %tmp6417 = getelementptr inbounds float* %tmp6416, i64 1
+  %tmp6418 = getelementptr inbounds float* %tmp6417, i64 1
+  %tmp6419 = getelementptr inbounds float* %tmp6418, i64 1
+  %tmp6420 = getelementptr inbounds float* %tmp6419, i64 1
+  %tmp6421 = getelementptr inbounds float* %tmp6420, i64 1
+  %tmp6422 = getelementptr inbounds float* %tmp6421, i64 1
+  %tmp6423 = getelementptr inbounds float* %tmp6422, i64 1
+  %tmp6424 = getelementptr inbounds float* %tmp6423, i64 1
+  %tmp6425 = getelementptr inbounds float* %tmp6424, i64 1
+  %tmp6426 = getelementptr inbounds float* %tmp6425, i64 1
+  %tmp6427 = getelementptr inbounds float* %tmp6426, i64 1
+  %tmp6428 = getelementptr inbounds float* %tmp6427, i64 1
+  %tmp6429 = getelementptr inbounds float* %tmp6428, i64 1
+  %tmp6430 = getelementptr inbounds float* %tmp6429, i64 1
+  %tmp6431 = getelementptr inbounds float* %tmp6430, i64 1
+  %tmp6432 = getelementptr inbounds float* %tmp6431, i64 1
+  %tmp6433 = getelementptr inbounds float* %tmp6432, i64 1
+  %tmp6434 = getelementptr inbounds float* %tmp6433, i64 1
+  %tmp6435 = getelementptr inbounds float* %tmp6434, i64 1
+  %tmp6436 = getelementptr inbounds float* %tmp6435, i64 1
+  %tmp6437 = getelementptr inbounds float* %tmp6436, i64 1
+  %tmp6438 = getelementptr inbounds float* %tmp6437, i64 1
+  %tmp6439 = getelementptr inbounds float* %tmp6438, i64 1
+  %tmp6440 = getelementptr inbounds float* %tmp6439, i64 1
+  %tmp6441 = getelementptr inbounds float* %tmp6440, i64 1
+  %tmp6442 = getelementptr inbounds float* %tmp6441, i64 1
+  %tmp6443 = getelementptr inbounds float* %tmp6442, i64 1
+  %tmp6444 = getelementptr inbounds float* %tmp6443, i64 1
+  %tmp6445 = getelementptr inbounds float* %tmp6444, i64 1
+  %tmp6446 = getelementptr inbounds float* %tmp6445, i64 1
+  %tmp6447 = getelementptr inbounds float* %tmp6446, i64 1
+  %tmp6448 = getelementptr inbounds float* %tmp6447, i64 1
+  %tmp6449 = getelementptr inbounds float* %tmp6448, i64 1
+  %tmp6450 = getelementptr inbounds float* %tmp6449, i64 1
+  %tmp6451 = getelementptr inbounds float* %tmp6450, i64 1
+  %tmp6452 = getelementptr inbounds float* %tmp6451, i64 1
+  %tmp6453 = getelementptr inbounds float* %tmp6452, i64 1
+  %tmp6454 = getelementptr inbounds float* %tmp6453, i64 1
+  %tmp6455 = getelementptr inbounds float* %tmp6454, i64 1
+  %tmp6456 = getelementptr inbounds float* %tmp6455, i64 1
+  %tmp6457 = getelementptr inbounds float* %tmp6456, i64 1
+  %tmp6458 = getelementptr inbounds float* %tmp6457, i64 1
+  %tmp6459 = getelementptr inbounds float* %tmp6458, i64 1
+  %tmp6460 = getelementptr inbounds float* %tmp6459, i64 1
+  %tmp6461 = getelementptr inbounds float* %tmp6460, i64 1
+  %tmp6462 = getelementptr inbounds float* %tmp6461, i64 1
+  %tmp6463 = getelementptr inbounds float* %tmp6462, i64 1
+  %tmp6464 = getelementptr inbounds float* %tmp6463, i64 1
+  %tmp6465 = getelementptr inbounds float* %tmp6464, i64 1
+  %tmp6466 = getelementptr inbounds float* %tmp6465, i64 1
+  %tmp6467 = getelementptr inbounds float* %tmp6466, i64 1
+  %tmp6468 = getelementptr inbounds float* %tmp6467, i64 1
+  %tmp6469 = getelementptr inbounds float* %tmp6468, i64 1
+  %tmp6470 = getelementptr inbounds float* %tmp6469, i64 1
+  %tmp6471 = getelementptr inbounds float* %tmp6470, i64 1
+  %tmp6472 = getelementptr inbounds float* %tmp6471, i64 1
+  %tmp6473 = getelementptr inbounds float* %tmp6472, i64 1
+  %tmp6474 = getelementptr inbounds float* %tmp6473, i64 1
+  %tmp6475 = getelementptr inbounds float* %tmp6474, i64 1
+  %tmp6476 = getelementptr inbounds float* %tmp6475, i64 1
+  %tmp6477 = getelementptr inbounds float* %tmp6476, i64 1
+  %tmp6478 = getelementptr inbounds float* %tmp6477, i64 1
+  %tmp6479 = getelementptr inbounds float* %tmp6478, i64 1
+  %tmp6480 = getelementptr inbounds float* %tmp6479, i64 1
+  %tmp6481 = getelementptr inbounds float* %tmp6480, i64 1
+  %tmp6482 = getelementptr inbounds float* %tmp6481, i64 1
+  %tmp6483 = getelementptr inbounds float* %tmp6482, i64 1
+  %tmp6484 = getelementptr inbounds float* %tmp6483, i64 1
+  %tmp6485 = getelementptr inbounds float* %tmp6484, i64 1
+  %tmp6486 = getelementptr inbounds float* %tmp6485, i64 1
+  %tmp6487 = getelementptr inbounds float* %tmp6486, i64 1
+  %tmp6488 = getelementptr inbounds float* %tmp6487, i64 1
+  %tmp6489 = getelementptr inbounds float* %tmp6488, i64 1
+  %tmp6490 = getelementptr inbounds float* %tmp6489, i64 1
+  %tmp6491 = getelementptr inbounds float* %tmp6490, i64 1
+  %tmp6492 = getelementptr inbounds float* %tmp6491, i64 1
+  %tmp6493 = getelementptr inbounds float* %tmp6492, i64 1
+  %tmp6494 = getelementptr inbounds float* %tmp6493, i64 1
+  %tmp6495 = getelementptr inbounds float* %tmp6494, i64 1
+  %tmp6496 = getelementptr inbounds float* %tmp6495, i64 1
+  %tmp6497 = getelementptr inbounds float* %tmp6496, i64 1
+  %tmp6498 = getelementptr inbounds float* %tmp6497, i64 1
+  %tmp6499 = getelementptr inbounds float* %tmp6498, i64 1
+  %tmp6500 = getelementptr inbounds float* %tmp6499, i64 1
+  %tmp6501 = getelementptr inbounds float* %tmp6500, i64 1
+  %tmp6502 = getelementptr inbounds float* %tmp6501, i64 1
+  %tmp6503 = getelementptr inbounds float* %tmp6502, i64 1
+  %tmp6504 = getelementptr inbounds float* %tmp6503, i64 1
+  %tmp6505 = getelementptr inbounds float* %tmp6504, i64 1
+  %tmp6506 = getelementptr inbounds float* %tmp6505, i64 1
+  %tmp6507 = getelementptr inbounds float* %tmp6506, i64 1
+  %tmp6508 = getelementptr inbounds float* %tmp6507, i64 1
+  %tmp6509 = getelementptr inbounds float* %tmp6508, i64 1
+  %tmp6510 = getelementptr inbounds float* %tmp6509, i64 1
+  %tmp6511 = getelementptr inbounds float* %tmp6510, i64 1
+  %tmp6512 = getelementptr inbounds float* %tmp6511, i64 1
+  %tmp6513 = getelementptr inbounds float* %tmp6512, i64 1
+  %tmp6514 = getelementptr inbounds float* %tmp6513, i64 1
+  %tmp6515 = getelementptr inbounds float* %tmp6514, i64 1
+  %tmp6516 = getelementptr inbounds float* %tmp6515, i64 1
+  %tmp6517 = getelementptr inbounds float* %tmp6516, i64 1
+  %tmp6518 = getelementptr inbounds float* %tmp6517, i64 1
+  %tmp6519 = getelementptr inbounds float* %tmp6518, i64 1
+  %tmp6520 = getelementptr inbounds float* %tmp6519, i64 1
+  %tmp6521 = getelementptr inbounds float* %tmp6520, i64 1
+  %tmp6522 = getelementptr inbounds float* %tmp6521, i64 1
+  %tmp6523 = getelementptr inbounds float* %tmp6522, i64 1
+  %tmp6524 = getelementptr inbounds float* %tmp6523, i64 1
+  %tmp6525 = getelementptr inbounds float* %tmp6524, i64 1
+  %tmp6526 = getelementptr inbounds float* %tmp6525, i64 1
+  %tmp6527 = getelementptr inbounds float* %tmp6526, i64 1
+  %tmp6528 = getelementptr inbounds float* %tmp6527, i64 1
+  %tmp6529 = getelementptr inbounds float* %tmp6528, i64 1
+  %tmp6530 = getelementptr inbounds float* %tmp6529, i64 1
+  %tmp6531 = getelementptr inbounds float* %tmp6530, i64 1
+  %tmp6532 = getelementptr inbounds float* %tmp6531, i64 1
+  %tmp6533 = getelementptr inbounds float* %tmp6532, i64 1
+  %tmp6534 = getelementptr inbounds float* %tmp6533, i64 1
+  %tmp6535 = getelementptr inbounds float* %tmp6534, i64 1
+  %tmp6536 = getelementptr inbounds float* %tmp6535, i64 1
+  %tmp6537 = getelementptr inbounds float* %tmp6536, i64 1
+  %tmp6538 = getelementptr inbounds float* %tmp6537, i64 1
+  %tmp6539 = getelementptr inbounds float* %tmp6538, i64 1
+  %tmp6540 = getelementptr inbounds float* %tmp6539, i64 1
+  %tmp6541 = getelementptr inbounds float* %tmp6540, i64 1
+  %tmp6542 = getelementptr inbounds float* %tmp6541, i64 1
+  %tmp6543 = getelementptr inbounds float* %tmp6542, i64 1
+  %tmp6544 = getelementptr inbounds float* %tmp6543, i64 1
+  %tmp6545 = getelementptr inbounds float* %tmp6544, i64 1
+  %tmp6546 = getelementptr inbounds float* %tmp6545, i64 1
+  %tmp6547 = getelementptr inbounds float* %tmp6546, i64 1
+  %tmp6548 = getelementptr inbounds float* %tmp6547, i64 1
+  %tmp6549 = getelementptr inbounds float* %tmp6548, i64 1
+  %tmp6550 = getelementptr inbounds float* %tmp6549, i64 1
+  %tmp6551 = getelementptr inbounds float* %tmp6550, i64 1
+  %tmp6552 = getelementptr inbounds float* %tmp6551, i64 1
+  %tmp6553 = getelementptr inbounds float* %tmp6552, i64 1
+  %tmp6554 = getelementptr inbounds float* %tmp6553, i64 1
+  %tmp6555 = getelementptr inbounds float* %tmp6554, i64 1
+  %tmp6556 = getelementptr inbounds float* %tmp6555, i64 1
+  %tmp6557 = getelementptr inbounds float* %tmp6556, i64 1
+  %tmp6558 = getelementptr inbounds float* %tmp6557, i64 1
+  %tmp6559 = getelementptr inbounds float* %tmp6558, i64 1
+  %tmp6560 = getelementptr inbounds float* %tmp6559, i64 1
+  %tmp6561 = getelementptr inbounds float* %tmp6560, i64 1
+  %tmp6562 = getelementptr inbounds float* %tmp6561, i64 1
+  %tmp6563 = getelementptr inbounds float* %tmp6562, i64 1
+  %tmp6564 = getelementptr inbounds float* %tmp6563, i64 1
+  %tmp6565 = getelementptr inbounds float* %tmp6564, i64 1
+  %tmp6566 = getelementptr inbounds float* %tmp6565, i64 1
+  %tmp6567 = getelementptr inbounds float* %tmp6566, i64 1
+  %tmp6568 = getelementptr inbounds float* %tmp6567, i64 1
+  %tmp6569 = getelementptr inbounds float* %tmp6568, i64 1
+  %tmp6570 = getelementptr inbounds float* %tmp6569, i64 1
+  %tmp6571 = getelementptr inbounds float* %tmp6570, i64 1
+  %tmp6572 = getelementptr inbounds float* %tmp6571, i64 1
+  %tmp6573 = getelementptr inbounds float* %tmp6572, i64 1
+  %tmp6574 = getelementptr inbounds float* %tmp6573, i64 1
+  %tmp6575 = getelementptr inbounds float* %tmp6574, i64 1
+  %tmp6576 = getelementptr inbounds float* %tmp6575, i64 1
+  %tmp6577 = getelementptr inbounds float* %tmp6576, i64 1
+  %tmp6578 = getelementptr inbounds float* %tmp6577, i64 1
+  %tmp6579 = getelementptr inbounds float* %tmp6578, i64 1
+  %tmp6580 = getelementptr inbounds float* %tmp6579, i64 1
+  %tmp6581 = getelementptr inbounds float* %tmp6580, i64 1
+  %tmp6582 = getelementptr inbounds float* %tmp6581, i64 1
+  %tmp6583 = getelementptr inbounds float* %tmp6582, i64 1
+  %tmp6584 = getelementptr inbounds float* %tmp6583, i64 1
+  %tmp6585 = getelementptr inbounds float* %tmp6584, i64 1
+  %tmp6586 = getelementptr inbounds float* %tmp6585, i64 1
+  %tmp6587 = getelementptr inbounds float* %tmp6586, i64 1
+  %tmp6588 = getelementptr inbounds float* %tmp6587, i64 1
+  %tmp6589 = getelementptr inbounds float* %tmp6588, i64 1
+  %tmp6590 = getelementptr inbounds float* %tmp6589, i64 1
+  %tmp6591 = getelementptr inbounds float* %tmp6590, i64 1
+  %tmp6592 = getelementptr inbounds float* %tmp6591, i64 1
+  %tmp6593 = getelementptr inbounds float* %tmp6592, i64 1
+  %tmp6594 = getelementptr inbounds float* %tmp6593, i64 1
+  %tmp6595 = getelementptr inbounds float* %tmp6594, i64 1
+  %tmp6596 = getelementptr inbounds float* %tmp6595, i64 1
+  %tmp6597 = getelementptr inbounds float* %tmp6596, i64 1
+  %tmp6598 = getelementptr inbounds float* %tmp6597, i64 1
+  %tmp6599 = getelementptr inbounds float* %tmp6598, i64 1
+  %tmp6600 = getelementptr inbounds float* %tmp6599, i64 1
+  %tmp6601 = getelementptr inbounds float* %tmp6600, i64 1
+  %tmp6602 = getelementptr inbounds float* %tmp6601, i64 1
+  %tmp6603 = getelementptr inbounds float* %tmp6602, i64 1
+  %tmp6604 = getelementptr inbounds float* %tmp6603, i64 1
+  %tmp6605 = getelementptr inbounds float* %tmp6604, i64 1
+  %tmp6606 = getelementptr inbounds float* %tmp6605, i64 1
+  %tmp6607 = getelementptr inbounds float* %tmp6606, i64 1
+  %tmp6608 = getelementptr inbounds float* %tmp6607, i64 1
+  %tmp6609 = getelementptr inbounds float* %tmp6608, i64 1
+  %tmp6610 = getelementptr inbounds float* %tmp6609, i64 1
+  %tmp6611 = getelementptr inbounds float* %tmp6610, i64 1
+  %tmp6612 = getelementptr inbounds float* %tmp6611, i64 1
+  %tmp6613 = getelementptr inbounds float* %tmp6612, i64 1
+  %tmp6614 = getelementptr inbounds float* %tmp6613, i64 1
+  %tmp6615 = getelementptr inbounds float* %tmp6614, i64 1
+  %tmp6616 = getelementptr inbounds float* %tmp6615, i64 1
+  %tmp6617 = getelementptr inbounds float* %tmp6616, i64 1
+  %tmp6618 = getelementptr inbounds float* %tmp6617, i64 1
+  %tmp6619 = getelementptr inbounds float* %tmp6618, i64 1
+  %tmp6620 = getelementptr inbounds float* %tmp6619, i64 1
+  %tmp6621 = getelementptr inbounds float* %tmp6620, i64 1
+  %tmp6622 = getelementptr inbounds float* %tmp6621, i64 1
+  %tmp6623 = getelementptr inbounds float* %tmp6622, i64 1
+  %tmp6624 = getelementptr inbounds float* %tmp6623, i64 1
+  %tmp6625 = getelementptr inbounds float* %tmp6624, i64 1
+  %tmp6626 = getelementptr inbounds float* %tmp6625, i64 1
+  %tmp6627 = getelementptr inbounds float* %tmp6626, i64 1
+  %tmp6628 = getelementptr inbounds float* %tmp6627, i64 1
+  %tmp6629 = getelementptr inbounds float* %tmp6628, i64 1
+  %tmp6630 = getelementptr inbounds float* %tmp6629, i64 1
+  %tmp6631 = getelementptr inbounds float* %tmp6630, i64 1
+  %tmp6632 = getelementptr inbounds float* %tmp6631, i64 1
+  %tmp6633 = getelementptr inbounds float* %tmp6632, i64 1
+  %tmp6634 = getelementptr inbounds float* %tmp6633, i64 1
+  %tmp6635 = getelementptr inbounds float* %tmp6634, i64 1
+  %tmp6636 = getelementptr inbounds float* %tmp6635, i64 1
+  %tmp6637 = getelementptr inbounds float* %tmp6636, i64 1
+  %tmp6638 = getelementptr inbounds float* %tmp6637, i64 1
+  %tmp6639 = getelementptr inbounds float* %tmp6638, i64 1
+  %tmp6640 = getelementptr inbounds float* %tmp6639, i64 1
+  %tmp6641 = getelementptr inbounds float* %tmp6640, i64 1
+  %tmp6642 = getelementptr inbounds float* %tmp6641, i64 1
+  %tmp6643 = getelementptr inbounds float* %tmp6642, i64 1
+  %tmp6644 = getelementptr inbounds float* %tmp6643, i64 1
+  %tmp6645 = getelementptr inbounds float* %tmp6644, i64 1
+  %tmp6646 = getelementptr inbounds float* %tmp6645, i64 1
+  %tmp6647 = getelementptr inbounds float* %tmp6646, i64 1
+  %tmp6648 = getelementptr inbounds float* %tmp6647, i64 1
+  %tmp6649 = getelementptr inbounds float* %tmp6648, i64 1
+  %tmp6650 = getelementptr inbounds float* %tmp6649, i64 1
+  %tmp6651 = getelementptr inbounds float* %tmp6650, i64 1
+  %tmp6652 = getelementptr inbounds float* %tmp6651, i64 1
+  %tmp6653 = getelementptr inbounds float* %tmp6652, i64 1
+  %tmp6654 = getelementptr inbounds float* %tmp6653, i64 1
+  %tmp6655 = getelementptr inbounds float* %tmp6654, i64 1
+  %tmp6656 = getelementptr inbounds float* %tmp6655, i64 1
+  %tmp6657 = getelementptr inbounds float* %tmp6656, i64 1
+  %tmp6658 = getelementptr inbounds float* %tmp6657, i64 1
+  %tmp6659 = getelementptr inbounds float* %tmp6658, i64 1
+  %tmp6660 = getelementptr inbounds float* %tmp6659, i64 1
+  %tmp6661 = getelementptr inbounds float* %tmp6660, i64 1
+  %tmp6662 = getelementptr inbounds float* %tmp6661, i64 1
+  %tmp6663 = getelementptr inbounds float* %tmp6662, i64 1
+  %tmp6664 = getelementptr inbounds float* %tmp6663, i64 1
+  %tmp6665 = getelementptr inbounds float* %tmp6664, i64 1
+  %tmp6666 = getelementptr inbounds float* %tmp6665, i64 1
+  %tmp6667 = getelementptr inbounds float* %tmp6666, i64 1
+  %tmp6668 = getelementptr inbounds float* %tmp6667, i64 1
+  %tmp6669 = getelementptr inbounds float* %tmp6668, i64 1
+  %tmp6670 = getelementptr inbounds float* %tmp6669, i64 1
+  %tmp6671 = getelementptr inbounds float* %tmp6670, i64 1
+  %tmp6672 = getelementptr inbounds float* %tmp6671, i64 1
+  %tmp6673 = getelementptr inbounds float* %tmp6672, i64 1
+  %tmp6674 = getelementptr inbounds float* %tmp6673, i64 1
+  %tmp6675 = getelementptr inbounds float* %tmp6674, i64 1
+  %tmp6676 = getelementptr inbounds float* %tmp6675, i64 1
+  %tmp6677 = getelementptr inbounds float* %tmp6676, i64 1
+  %tmp6678 = getelementptr inbounds float* %tmp6677, i64 1
+  %tmp6679 = getelementptr inbounds float* %tmp6678, i64 1
+  %tmp6680 = getelementptr inbounds float* %tmp6679, i64 1
+  %tmp6681 = getelementptr inbounds float* %tmp6680, i64 1
+  %tmp6682 = getelementptr inbounds float* %tmp6681, i64 1
+  %tmp6683 = getelementptr inbounds float* %tmp6682, i64 1
+  %tmp6684 = getelementptr inbounds float* %tmp6683, i64 1
+  %tmp6685 = getelementptr inbounds float* %tmp6684, i64 1
+  %tmp6686 = getelementptr inbounds float* %tmp6685, i64 1
+  %tmp6687 = getelementptr inbounds float* %tmp6686, i64 1
+  %tmp6688 = getelementptr inbounds float* %tmp6687, i64 1
+  %tmp6689 = getelementptr inbounds float* %tmp6688, i64 1
+  %tmp6690 = getelementptr inbounds float* %tmp6689, i64 1
+  %tmp6691 = getelementptr inbounds float* %tmp6690, i64 1
+  %tmp6692 = getelementptr inbounds float* %tmp6691, i64 1
+  %tmp6693 = getelementptr inbounds float* %tmp6692, i64 1
+  %tmp6694 = getelementptr inbounds float* %tmp6693, i64 1
+  %tmp6695 = getelementptr inbounds float* %tmp6694, i64 1
+  %tmp6696 = getelementptr inbounds float* %tmp6695, i64 1
+  %tmp6697 = getelementptr inbounds float* %tmp6696, i64 1
+  %tmp6698 = getelementptr inbounds float* %tmp6697, i64 1
+  %tmp6699 = getelementptr inbounds float* %tmp6698, i64 1
+  %tmp6700 = getelementptr inbounds float* %tmp6699, i64 1
+  %tmp6701 = getelementptr inbounds float* %tmp6700, i64 1
+  %tmp6702 = getelementptr inbounds float* %tmp6701, i64 1
+  %tmp6703 = getelementptr inbounds float* %tmp6702, i64 1
+  %tmp6704 = getelementptr inbounds float* %tmp6703, i64 1
+  %tmp6705 = getelementptr inbounds float* %tmp6704, i64 1
+  %tmp6706 = getelementptr inbounds float* %tmp6705, i64 1
+  %tmp6707 = getelementptr inbounds float* %tmp6706, i64 1
+  %tmp6708 = getelementptr inbounds float* %tmp6707, i64 1
+  %tmp6709 = getelementptr inbounds float* %tmp6708, i64 1
+  %tmp6710 = getelementptr inbounds float* %tmp6709, i64 1
+  %tmp6711 = getelementptr inbounds float* %tmp6710, i64 1
+  %tmp6712 = getelementptr inbounds float* %tmp6711, i64 1
+  %tmp6713 = getelementptr inbounds float* %tmp6712, i64 1
+  %tmp6714 = getelementptr inbounds float* %tmp6713, i64 1
+  %tmp6715 = getelementptr inbounds float* %tmp6714, i64 1
+  %tmp6716 = getelementptr inbounds float* %tmp6715, i64 1
+  %tmp6717 = getelementptr inbounds float* %tmp6716, i64 1
+  %tmp6718 = getelementptr inbounds float* %tmp6717, i64 1
+  %tmp6719 = getelementptr inbounds float* %tmp6718, i64 1
+  %tmp6720 = getelementptr inbounds float* %tmp6719, i64 1
+  %tmp6721 = getelementptr inbounds float* %tmp6720, i64 1
+  %tmp6722 = getelementptr inbounds float* %tmp6721, i64 1
+  %tmp6723 = getelementptr inbounds float* %tmp6722, i64 1
+  %tmp6724 = getelementptr inbounds float* %tmp6723, i64 1
+  %tmp6725 = getelementptr inbounds float* %tmp6724, i64 1
+  %tmp6726 = getelementptr inbounds float* %tmp6725, i64 1
+  %tmp6727 = getelementptr inbounds float* %tmp6726, i64 1
+  %tmp6728 = getelementptr inbounds float* %tmp6727, i64 1
+  %tmp6729 = getelementptr inbounds float* %tmp6728, i64 1
+  %tmp6730 = getelementptr inbounds float* %tmp6729, i64 1
+  %tmp6731 = getelementptr inbounds float* %tmp6730, i64 1
+  %tmp6732 = getelementptr inbounds float* %tmp6731, i64 1
+  %tmp6733 = getelementptr inbounds float* %tmp6732, i64 1
+  %tmp6734 = getelementptr inbounds float* %tmp6733, i64 1
+  %tmp6735 = getelementptr inbounds float* %tmp6734, i64 1
+  %tmp6736 = getelementptr inbounds float* %tmp6735, i64 1
+  %tmp6737 = getelementptr inbounds float* %tmp6736, i64 1
+  %tmp6738 = getelementptr inbounds float* %tmp6737, i64 1
+  %tmp6739 = getelementptr inbounds float* %tmp6738, i64 1
+  %tmp6740 = getelementptr inbounds float* %tmp6739, i64 1
+  %tmp6741 = getelementptr inbounds float* %tmp6740, i64 1
+  %tmp6742 = getelementptr inbounds float* %tmp6741, i64 1
+  %tmp6743 = getelementptr inbounds float* %tmp6742, i64 1
+  %tmp6744 = getelementptr inbounds float* %tmp6743, i64 1
+  %tmp6745 = getelementptr inbounds float* %tmp6744, i64 1
+  %tmp6746 = getelementptr inbounds float* %tmp6745, i64 1
+  %tmp6747 = getelementptr inbounds float* %tmp6746, i64 1
+  %tmp6748 = getelementptr inbounds float* %tmp6747, i64 1
+  %tmp6749 = getelementptr inbounds float* %tmp6748, i64 1
+  %tmp6750 = getelementptr inbounds float* %tmp6749, i64 1
+  %tmp6751 = getelementptr inbounds float* %tmp6750, i64 1
+  %tmp6752 = getelementptr inbounds float* %tmp6751, i64 1
+  %tmp6753 = getelementptr inbounds float* %tmp6752, i64 1
+  %tmp6754 = getelementptr inbounds float* %tmp6753, i64 1
+  %tmp6755 = getelementptr inbounds float* %tmp6754, i64 1
+  %tmp6756 = getelementptr inbounds float* %tmp6755, i64 1
+  %tmp6757 = getelementptr inbounds float* %tmp6756, i64 1
+  %tmp6758 = getelementptr inbounds float* %tmp6757, i64 1
+  %tmp6759 = getelementptr inbounds float* %tmp6758, i64 1
+  %tmp6760 = getelementptr inbounds float* %tmp6759, i64 1
+  %tmp6761 = getelementptr inbounds float* %tmp6760, i64 1
+  %tmp6762 = getelementptr inbounds float* %tmp6761, i64 1
+  %tmp6763 = getelementptr inbounds float* %tmp6762, i64 1
+  %tmp6764 = getelementptr inbounds float* %tmp6763, i64 1
+  %tmp6765 = getelementptr inbounds float* %tmp6764, i64 1
+  %tmp6766 = getelementptr inbounds float* %tmp6765, i64 1
+  %tmp6767 = getelementptr inbounds float* %tmp6766, i64 1
+  %tmp6768 = getelementptr inbounds float* %tmp6767, i64 1
+  %tmp6769 = getelementptr inbounds float* %tmp6768, i64 1
+  %tmp6770 = getelementptr inbounds float* %tmp6769, i64 1
+  %tmp6771 = getelementptr inbounds float* %tmp6770, i64 1
+  %tmp6772 = getelementptr inbounds float* %tmp6771, i64 1
+  %tmp6773 = getelementptr inbounds float* %tmp6772, i64 1
+  %tmp6774 = getelementptr inbounds float* %tmp6773, i64 1
+  %tmp6775 = getelementptr inbounds float* %tmp6774, i64 1
+  %tmp6776 = getelementptr inbounds float* %tmp6775, i64 1
+  %tmp6777 = getelementptr inbounds float* %tmp6776, i64 1
+  %tmp6778 = getelementptr inbounds float* %tmp6777, i64 1
+  %tmp6779 = getelementptr inbounds float* %tmp6778, i64 1
+  %tmp6780 = getelementptr inbounds float* %tmp6779, i64 1
+  %tmp6781 = getelementptr inbounds float* %tmp6780, i64 1
+  %tmp6782 = getelementptr inbounds float* %tmp6781, i64 1
+  %tmp6783 = getelementptr inbounds float* %tmp6782, i64 1
+  %tmp6784 = getelementptr inbounds float* %tmp6783, i64 1
+  %tmp6785 = getelementptr inbounds float* %tmp6784, i64 1
+  %tmp6786 = getelementptr inbounds float* %tmp6785, i64 1
+  %tmp6787 = getelementptr inbounds float* %tmp6786, i64 1
+  %tmp6788 = getelementptr inbounds float* %tmp6787, i64 1
+  %tmp6789 = getelementptr inbounds float* %tmp6788, i64 1
+  %tmp6790 = getelementptr inbounds float* %tmp6789, i64 1
+  %tmp6791 = getelementptr inbounds float* %tmp6790, i64 1
+  %tmp6792 = getelementptr inbounds float* %tmp6791, i64 1
+  %tmp6793 = getelementptr inbounds float* %tmp6792, i64 1
+  %tmp6794 = getelementptr inbounds float* %tmp6793, i64 1
+  %tmp6795 = getelementptr inbounds float* %tmp6794, i64 1
+  %tmp6796 = getelementptr inbounds float* %tmp6795, i64 1
+  %tmp6797 = getelementptr inbounds float* %tmp6796, i64 1
+  %tmp6798 = getelementptr inbounds float* %tmp6797, i64 1
+  %tmp6799 = getelementptr inbounds float* %tmp6798, i64 1
+  %tmp6800 = getelementptr inbounds float* %tmp6799, i64 1
+  %tmp6801 = getelementptr inbounds float* %tmp6800, i64 1
+  %tmp6802 = getelementptr inbounds float* %tmp6801, i64 1
+  %tmp6803 = getelementptr inbounds float* %tmp6802, i64 1
+  %tmp6804 = getelementptr inbounds float* %tmp6803, i64 1
+  %tmp6805 = getelementptr inbounds float* %tmp6804, i64 1
+  %tmp6806 = getelementptr inbounds float* %tmp6805, i64 1
+  %tmp6807 = getelementptr inbounds float* %tmp6806, i64 1
+  %tmp6808 = getelementptr inbounds float* %tmp6807, i64 1
+  %tmp6809 = getelementptr inbounds float* %tmp6808, i64 1
+  %tmp6810 = getelementptr inbounds float* %tmp6809, i64 1
+  %tmp6811 = getelementptr inbounds float* %tmp6810, i64 1
+  %tmp6812 = getelementptr inbounds float* %tmp6811, i64 1
+  %tmp6813 = getelementptr inbounds float* %tmp6812, i64 1
+  %tmp6814 = getelementptr inbounds float* %tmp6813, i64 1
+  %tmp6815 = getelementptr inbounds float* %tmp6814, i64 1
+  %tmp6816 = getelementptr inbounds float* %tmp6815, i64 1
+  %tmp6817 = getelementptr inbounds float* %tmp6816, i64 1
+  %tmp6818 = getelementptr inbounds float* %tmp6817, i64 1
+  %tmp6819 = getelementptr inbounds float* %tmp6818, i64 1
+  %tmp6820 = getelementptr inbounds float* %tmp6819, i64 1
+  %tmp6821 = getelementptr inbounds float* %tmp6820, i64 1
+  %tmp6822 = getelementptr inbounds float* %tmp6821, i64 1
+  %tmp6823 = getelementptr inbounds float* %tmp6822, i64 1
+  %tmp6824 = getelementptr inbounds float* %tmp6823, i64 1
+  %tmp6825 = getelementptr inbounds float* %tmp6824, i64 1
+  %tmp6826 = getelementptr inbounds float* %tmp6825, i64 1
+  %tmp6827 = getelementptr inbounds float* %tmp6826, i64 1
+  %tmp6828 = getelementptr inbounds float* %tmp6827, i64 1
+  %tmp6829 = getelementptr inbounds float* %tmp6828, i64 1
+  %tmp6830 = getelementptr inbounds float* %tmp6829, i64 1
+  %tmp6831 = getelementptr inbounds float* %tmp6830, i64 1
+  %tmp6832 = getelementptr inbounds float* %tmp6831, i64 1
+  %tmp6833 = getelementptr inbounds float* %tmp6832, i64 1
+  %tmp6834 = getelementptr inbounds float* %tmp6833, i64 1
+  %tmp6835 = getelementptr inbounds float* %tmp6834, i64 1
+  %tmp6836 = getelementptr inbounds float* %tmp6835, i64 1
+  %tmp6837 = getelementptr inbounds float* %tmp6836, i64 1
+  %tmp6838 = getelementptr inbounds float* %tmp6837, i64 1
+  %tmp6839 = getelementptr inbounds float* %tmp6838, i64 1
+  %tmp6840 = getelementptr inbounds float* %tmp6839, i64 1
+  %tmp6841 = getelementptr inbounds float* %tmp6840, i64 1
+  %tmp6842 = getelementptr inbounds float* %tmp6841, i64 1
+  %tmp6843 = getelementptr inbounds float* %tmp6842, i64 1
+  %tmp6844 = getelementptr inbounds float* %tmp6843, i64 1
+  %tmp6845 = getelementptr inbounds float* %tmp6844, i64 1
+  %tmp6846 = getelementptr inbounds float* %tmp6845, i64 1
+  %tmp6847 = getelementptr inbounds float* %tmp6846, i64 1
+  %tmp6848 = getelementptr inbounds float* %tmp6847, i64 1
+  %tmp6849 = getelementptr inbounds float* %tmp6848, i64 1
+  %tmp6850 = getelementptr inbounds float* %tmp6849, i64 1
+  %tmp6851 = getelementptr inbounds float* %tmp6850, i64 1
+  %tmp6852 = getelementptr inbounds float* %tmp6851, i64 1
+  %tmp6853 = getelementptr inbounds float* %tmp6852, i64 1
+  %tmp6854 = getelementptr inbounds float* %tmp6853, i64 1
+  %tmp6855 = getelementptr inbounds float* %tmp6854, i64 1
+  %tmp6856 = getelementptr inbounds float* %tmp6855, i64 1
+  %tmp6857 = getelementptr inbounds float* %tmp6856, i64 1
+  %tmp6858 = getelementptr inbounds float* %tmp6857, i64 1
+  %tmp6859 = getelementptr inbounds float* %tmp6858, i64 1
+  %tmp6860 = getelementptr inbounds float* %tmp6859, i64 1
+  %tmp6861 = getelementptr inbounds float* %tmp6860, i64 1
+  %tmp6862 = getelementptr inbounds float* %tmp6861, i64 1
+  %tmp6863 = getelementptr inbounds float* %tmp6862, i64 1
+  %tmp6864 = getelementptr inbounds float* %tmp6863, i64 1
+  %tmp6865 = getelementptr inbounds float* %tmp6864, i64 1
+  %tmp6866 = getelementptr inbounds float* %tmp6865, i64 1
+  %tmp6867 = getelementptr inbounds float* %tmp6866, i64 1
+  %tmp6868 = getelementptr inbounds float* %tmp6867, i64 1
+  %tmp6869 = getelementptr inbounds float* %tmp6868, i64 1
+  %tmp6870 = getelementptr inbounds float* %tmp6869, i64 1
+  %tmp6871 = getelementptr inbounds float* %tmp6870, i64 1
+  %tmp6872 = getelementptr inbounds float* %tmp6871, i64 1
+  %tmp6873 = getelementptr inbounds float* %tmp6872, i64 1
+  %tmp6874 = getelementptr inbounds float* %tmp6873, i64 1
+  %tmp6875 = getelementptr inbounds float* %tmp6874, i64 1
+  %tmp6876 = getelementptr inbounds float* %tmp6875, i64 1
+  %tmp6877 = getelementptr inbounds float* %tmp6876, i64 1
+  %tmp6878 = getelementptr inbounds float* %tmp6877, i64 1
+  %tmp6879 = getelementptr inbounds float* %tmp6878, i64 1
+  %tmp6880 = getelementptr inbounds float* %tmp6879, i64 1
+  %tmp6881 = getelementptr inbounds float* %tmp6880, i64 1
+  %tmp6882 = getelementptr inbounds float* %tmp6881, i64 1
+  %tmp6883 = getelementptr inbounds float* %tmp6882, i64 1
+  %tmp6884 = getelementptr inbounds float* %tmp6883, i64 1
+  %tmp6885 = getelementptr inbounds float* %tmp6884, i64 1
+  %tmp6886 = getelementptr inbounds float* %tmp6885, i64 1
+  %tmp6887 = getelementptr inbounds float* %tmp6886, i64 1
+  %tmp6888 = getelementptr inbounds float* %tmp6887, i64 1
+  %tmp6889 = getelementptr inbounds float* %tmp6888, i64 1
+  %tmp6890 = getelementptr inbounds float* %tmp6889, i64 1
+  %tmp6891 = getelementptr inbounds float* %tmp6890, i64 1
+  %tmp6892 = getelementptr inbounds float* %tmp6891, i64 1
+  %tmp6893 = getelementptr inbounds float* %tmp6892, i64 1
+  %tmp6894 = getelementptr inbounds float* %tmp6893, i64 1
+  %tmp6895 = getelementptr inbounds float* %tmp6894, i64 1
+  %tmp6896 = getelementptr inbounds float* %tmp6895, i64 1
+  %tmp6897 = getelementptr inbounds float* %tmp6896, i64 1
+  %tmp6898 = getelementptr inbounds float* %tmp6897, i64 1
+  %tmp6899 = getelementptr inbounds float* %tmp6898, i64 1
+  %tmp6900 = getelementptr inbounds float* %tmp6899, i64 1
+  %tmp6901 = getelementptr inbounds float* %tmp6900, i64 1
+  %tmp6902 = getelementptr inbounds float* %tmp6901, i64 1
+  %tmp6903 = getelementptr inbounds float* %tmp6902, i64 1
+  %tmp6904 = getelementptr inbounds float* %tmp6903, i64 1
+  %tmp6905 = getelementptr inbounds float* %tmp6904, i64 1
+  %tmp6906 = getelementptr inbounds float* %tmp6905, i64 1
+  %tmp6907 = getelementptr inbounds float* %tmp6906, i64 1
+  %tmp6908 = getelementptr inbounds float* %tmp6907, i64 1
+  %tmp6909 = getelementptr inbounds float* %tmp6908, i64 1
+  %tmp6910 = getelementptr inbounds float* %tmp6909, i64 1
+  %tmp6911 = getelementptr inbounds float* %tmp6910, i64 1
+  %tmp6912 = getelementptr inbounds float* %tmp6911, i64 1
+  %tmp6913 = getelementptr inbounds float* %tmp6912, i64 1
+  %tmp6914 = getelementptr inbounds float* %tmp6913, i64 1
+  %tmp6915 = getelementptr inbounds float* %tmp6914, i64 1
+  %tmp6916 = getelementptr inbounds float* %tmp6915, i64 1
+  %tmp6917 = getelementptr inbounds float* %tmp6916, i64 1
+  %tmp6918 = getelementptr inbounds float* %tmp6917, i64 1
+  %tmp6919 = getelementptr inbounds float* %tmp6918, i64 1
+  %tmp6920 = getelementptr inbounds float* %tmp6919, i64 1
+  %tmp6921 = getelementptr inbounds float* %tmp6920, i64 1
+  %tmp6922 = getelementptr inbounds float* %tmp6921, i64 1
+  %tmp6923 = getelementptr inbounds float* %tmp6922, i64 1
+  %tmp6924 = getelementptr inbounds float* %tmp6923, i64 1
+  %tmp6925 = getelementptr inbounds float* %tmp6924, i64 1
+  %tmp6926 = getelementptr inbounds float* %tmp6925, i64 1
+  %tmp6927 = getelementptr inbounds float* %tmp6926, i64 1
+  %tmp6928 = getelementptr inbounds float* %tmp6927, i64 1
+  %tmp6929 = getelementptr inbounds float* %tmp6928, i64 1
+  %tmp6930 = getelementptr inbounds float* %tmp6929, i64 1
+  %tmp6931 = getelementptr inbounds float* %tmp6930, i64 1
+  %tmp6932 = getelementptr inbounds float* %tmp6931, i64 1
+  %tmp6933 = getelementptr inbounds float* %tmp6932, i64 1
+  %tmp6934 = getelementptr inbounds float* %tmp6933, i64 1
+  %tmp6935 = getelementptr inbounds float* %tmp6934, i64 1
+  %tmp6936 = getelementptr inbounds float* %tmp6935, i64 1
+  %tmp6937 = getelementptr inbounds float* %tmp6936, i64 1
+  %tmp6938 = getelementptr inbounds float* %tmp6937, i64 1
+  %tmp6939 = getelementptr inbounds float* %tmp6938, i64 1
+  %tmp6940 = getelementptr inbounds float* %tmp6939, i64 1
+  %tmp6941 = getelementptr inbounds float* %tmp6940, i64 1
+  %tmp6942 = getelementptr inbounds float* %tmp6941, i64 1
+  %tmp6943 = getelementptr inbounds float* %tmp6942, i64 1
+  %tmp6944 = getelementptr inbounds float* %tmp6943, i64 1
+  %tmp6945 = getelementptr inbounds float* %tmp6944, i64 1
+  %tmp6946 = getelementptr inbounds float* %tmp6945, i64 1
+  %tmp6947 = getelementptr inbounds float* %tmp6946, i64 1
+  %tmp6948 = getelementptr inbounds float* %tmp6947, i64 1
+  %tmp6949 = getelementptr inbounds float* %tmp6948, i64 1
+  %tmp6950 = getelementptr inbounds float* %tmp6949, i64 1
+  %tmp6951 = getelementptr inbounds float* %tmp6950, i64 1
+  %tmp6952 = getelementptr inbounds float* %tmp6951, i64 1
+  %tmp6953 = getelementptr inbounds float* %tmp6952, i64 1
+  %tmp6954 = getelementptr inbounds float* %tmp6953, i64 1
+  %tmp6955 = getelementptr inbounds float* %tmp6954, i64 1
+  %tmp6956 = getelementptr inbounds float* %tmp6955, i64 1
+  %tmp6957 = getelementptr inbounds float* %tmp6956, i64 1
+  %tmp6958 = getelementptr inbounds float* %tmp6957, i64 1
+  %tmp6959 = getelementptr inbounds float* %tmp6958, i64 1
+  %tmp6960 = getelementptr inbounds float* %tmp6959, i64 1
+  %tmp6961 = getelementptr inbounds float* %tmp6960, i64 1
+  %tmp6962 = getelementptr inbounds float* %tmp6961, i64 1
+  %tmp6963 = getelementptr inbounds float* %tmp6962, i64 1
+  %tmp6964 = getelementptr inbounds float* %tmp6963, i64 1
+  %tmp6965 = getelementptr inbounds float* %tmp6964, i64 1
+  %tmp6966 = getelementptr inbounds float* %tmp6965, i64 1
+  %tmp6967 = getelementptr inbounds float* %tmp6966, i64 1
+  %tmp6968 = getelementptr inbounds float* %tmp6967, i64 1
+  %tmp6969 = getelementptr inbounds float* %tmp6968, i64 1
+  %tmp6970 = getelementptr inbounds float* %tmp6969, i64 1
+  %tmp6971 = getelementptr inbounds float* %tmp6970, i64 1
+  %tmp6972 = getelementptr inbounds float* %tmp6971, i64 1
+  %tmp6973 = getelementptr inbounds float* %tmp6972, i64 1
+  %tmp6974 = getelementptr inbounds float* %tmp6973, i64 1
+  %tmp6975 = getelementptr inbounds float* %tmp6974, i64 1
+  %tmp6976 = getelementptr inbounds float* %tmp6975, i64 1
+  %tmp6977 = getelementptr inbounds float* %tmp6976, i64 1
+  %tmp6978 = getelementptr inbounds float* %tmp6977, i64 1
+  %tmp6979 = getelementptr inbounds float* %tmp6978, i64 1
+  %tmp6980 = getelementptr inbounds float* %tmp6979, i64 1
+  %tmp6981 = getelementptr inbounds float* %tmp6980, i64 1
+  %tmp6982 = getelementptr inbounds float* %tmp6981, i64 1
+  %tmp6983 = getelementptr inbounds float* %tmp6982, i64 1
+  %tmp6984 = getelementptr inbounds float* %tmp6983, i64 1
+  %tmp6985 = getelementptr inbounds float* %tmp6984, i64 1
+  %tmp6986 = getelementptr inbounds float* %tmp6985, i64 1
+  %tmp6987 = getelementptr inbounds float* %tmp6986, i64 1
+  %tmp6988 = getelementptr inbounds float* %tmp6987, i64 1
+  %tmp6989 = getelementptr inbounds float* %tmp6988, i64 1
+  %tmp6990 = getelementptr inbounds float* %tmp6989, i64 1
+  %tmp6991 = getelementptr inbounds float* %tmp6990, i64 1
+  %tmp6992 = getelementptr inbounds float* %tmp6991, i64 1
+  %tmp6993 = getelementptr inbounds float* %tmp6992, i64 1
+  %tmp6994 = getelementptr inbounds float* %tmp6993, i64 1
+  %tmp6995 = getelementptr inbounds float* %tmp6994, i64 1
+  %tmp6996 = getelementptr inbounds float* %tmp6995, i64 1
+  %tmp6997 = getelementptr inbounds float* %tmp6996, i64 1
+  %tmp6998 = getelementptr inbounds float* %tmp6997, i64 1
+  %tmp6999 = getelementptr inbounds float* %tmp6998, i64 1
+  %tmp7000 = getelementptr inbounds float* %tmp6999, i64 1
+  %tmp7001 = getelementptr inbounds float* %tmp7000, i64 1
+  %tmp7002 = getelementptr inbounds float* %tmp7001, i64 1
+  %tmp7003 = getelementptr inbounds float* %tmp7002, i64 1
+  %tmp7004 = getelementptr inbounds float* %tmp7003, i64 1
+  %tmp7005 = getelementptr inbounds float* %tmp7004, i64 1
+  %tmp7006 = getelementptr inbounds float* %tmp7005, i64 1
+  %tmp7007 = getelementptr inbounds float* %tmp7006, i64 1
+  %tmp7008 = getelementptr inbounds float* %tmp7007, i64 1
+  %tmp7009 = getelementptr inbounds float* %tmp7008, i64 1
+  %tmp7010 = getelementptr inbounds float* %tmp7009, i64 1
+  %tmp7011 = getelementptr inbounds float* %tmp7010, i64 1
+  %tmp7012 = getelementptr inbounds float* %tmp7011, i64 1
+  %tmp7013 = getelementptr inbounds float* %tmp7012, i64 1
+  %tmp7014 = getelementptr inbounds float* %tmp7013, i64 1
+  %tmp7015 = getelementptr inbounds float* %tmp7014, i64 1
+  %tmp7016 = getelementptr inbounds float* %tmp7015, i64 1
+  %tmp7017 = getelementptr inbounds float* %tmp7016, i64 1
+  %tmp7018 = getelementptr inbounds float* %tmp7017, i64 1
+  %tmp7019 = getelementptr inbounds float* %tmp7018, i64 1
+  %tmp7020 = getelementptr inbounds float* %tmp7019, i64 1
+  %tmp7021 = getelementptr inbounds float* %tmp7020, i64 1
+  %tmp7022 = getelementptr inbounds float* %tmp7021, i64 1
+  %tmp7023 = getelementptr inbounds float* %tmp7022, i64 1
+  %tmp7024 = getelementptr inbounds float* %tmp7023, i64 1
+  %tmp7025 = getelementptr inbounds float* %tmp7024, i64 1
+  %tmp7026 = getelementptr inbounds float* %tmp7025, i64 1
+  %tmp7027 = getelementptr inbounds float* %tmp7026, i64 1
+  %tmp7028 = getelementptr inbounds float* %tmp7027, i64 1
+  %tmp7029 = getelementptr inbounds float* %tmp7028, i64 1
+  %tmp7030 = getelementptr inbounds float* %tmp7029, i64 1
+  %tmp7031 = getelementptr inbounds float* %tmp7030, i64 1
+  %tmp7032 = getelementptr inbounds float* %tmp7031, i64 1
+  %tmp7033 = getelementptr inbounds float* %tmp7032, i64 1
+  %tmp7034 = getelementptr inbounds float* %tmp7033, i64 1
+  %tmp7035 = getelementptr inbounds float* %tmp7034, i64 1
+  %tmp7036 = getelementptr inbounds float* %tmp7035, i64 1
+  %tmp7037 = getelementptr inbounds float* %tmp7036, i64 1
+  %tmp7038 = getelementptr inbounds float* %tmp7037, i64 1
+  %tmp7039 = getelementptr inbounds float* %tmp7038, i64 1
+  %tmp7040 = getelementptr inbounds float* %tmp7039, i64 1
+  %tmp7041 = getelementptr inbounds float* %tmp7040, i64 1
+  %tmp7042 = getelementptr inbounds float* %tmp7041, i64 1
+  %tmp7043 = getelementptr inbounds float* %tmp7042, i64 1
+  %tmp7044 = getelementptr inbounds float* %tmp7043, i64 1
+  %tmp7045 = getelementptr inbounds float* %tmp7044, i64 1
+  %tmp7046 = getelementptr inbounds float* %tmp7045, i64 1
+  %tmp7047 = getelementptr inbounds float* %tmp7046, i64 1
+  %tmp7048 = getelementptr inbounds float* %tmp7047, i64 1
+  %tmp7049 = getelementptr inbounds float* %tmp7048, i64 1
+  %tmp7050 = getelementptr inbounds float* %tmp7049, i64 1
+  %tmp7051 = getelementptr inbounds float* %tmp7050, i64 1
+  %tmp7052 = getelementptr inbounds float* %tmp7051, i64 1
+  %tmp7053 = getelementptr inbounds float* %tmp7052, i64 1
+  %tmp7054 = getelementptr inbounds float* %tmp7053, i64 1
+  %tmp7055 = getelementptr inbounds float* %tmp7054, i64 1
+  %tmp7056 = getelementptr inbounds float* %tmp7055, i64 1
+  %tmp7057 = getelementptr inbounds float* %tmp7056, i64 1
+  %tmp7058 = getelementptr inbounds float* %tmp7057, i64 1
+  %tmp7059 = getelementptr inbounds float* %tmp7058, i64 1
+  %tmp7060 = getelementptr inbounds float* %tmp7059, i64 1
+  %tmp7061 = getelementptr inbounds float* %tmp7060, i64 1
+  %tmp7062 = getelementptr inbounds float* %tmp7061, i64 1
+  %tmp7063 = getelementptr inbounds float* %tmp7062, i64 1
+  %tmp7064 = getelementptr inbounds float* %tmp7063, i64 1
+  %tmp7065 = getelementptr inbounds float* %tmp7064, i64 1
+  %tmp7066 = getelementptr inbounds float* %tmp7065, i64 1
+  %tmp7067 = getelementptr inbounds float* %tmp7066, i64 1
+  %tmp7068 = getelementptr inbounds float* %tmp7067, i64 1
+  %tmp7069 = getelementptr inbounds float* %tmp7068, i64 1
+  %tmp7070 = getelementptr inbounds float* %tmp7069, i64 1
+  %tmp7071 = getelementptr inbounds float* %tmp7070, i64 1
+  %tmp7072 = getelementptr inbounds float* %tmp7071, i64 1
+  %tmp7073 = getelementptr inbounds float* %tmp7072, i64 1
+  %tmp7074 = getelementptr inbounds float* %tmp7073, i64 1
+  %tmp7075 = getelementptr inbounds float* %tmp7074, i64 1
+  %tmp7076 = getelementptr inbounds float* %tmp7075, i64 1
+  %tmp7077 = getelementptr inbounds float* %tmp7076, i64 1
+  %tmp7078 = getelementptr inbounds float* %tmp7077, i64 1
+  %tmp7079 = getelementptr inbounds float* %tmp7078, i64 1
+  %tmp7080 = getelementptr inbounds float* %tmp7079, i64 1
+  %tmp7081 = getelementptr inbounds float* %tmp7080, i64 1
+  %tmp7082 = getelementptr inbounds float* %tmp7081, i64 1
+  %tmp7083 = getelementptr inbounds float* %tmp7082, i64 1
+  %tmp7084 = getelementptr inbounds float* %tmp7083, i64 1
+  %tmp7085 = getelementptr inbounds float* %tmp7084, i64 1
+  %tmp7086 = getelementptr inbounds float* %tmp7085, i64 1
+  %tmp7087 = getelementptr inbounds float* %tmp7086, i64 1
+  %tmp7088 = getelementptr inbounds float* %tmp7087, i64 1
+  %tmp7089 = getelementptr inbounds float* %tmp7088, i64 1
+  %tmp7090 = getelementptr inbounds float* %tmp7089, i64 1
+  %tmp7091 = getelementptr inbounds float* %tmp7090, i64 1
+  %tmp7092 = getelementptr inbounds float* %tmp7091, i64 1
+  %tmp7093 = getelementptr inbounds float* %tmp7092, i64 1
+  %tmp7094 = getelementptr inbounds float* %tmp7093, i64 1
+  %tmp7095 = getelementptr inbounds float* %tmp7094, i64 1
+  %tmp7096 = getelementptr inbounds float* %tmp7095, i64 1
+  %tmp7097 = getelementptr inbounds float* %tmp7096, i64 1
+  %tmp7098 = getelementptr inbounds float* %tmp7097, i64 1
+  %tmp7099 = getelementptr inbounds float* %tmp7098, i64 1
+  %tmp7100 = getelementptr inbounds float* %tmp7099, i64 1
+  %tmp7101 = getelementptr inbounds float* %tmp7100, i64 1
+  %tmp7102 = getelementptr inbounds float* %tmp7101, i64 1
+  %tmp7103 = getelementptr inbounds float* %tmp7102, i64 1
+  %tmp7104 = getelementptr inbounds float* %tmp7103, i64 1
+  %tmp7105 = getelementptr inbounds float* %tmp7104, i64 1
+  %tmp7106 = getelementptr inbounds float* %tmp7105, i64 1
+  %tmp7107 = getelementptr inbounds float* %tmp7106, i64 1
+  %tmp7108 = getelementptr inbounds float* %tmp7107, i64 1
+  %tmp7109 = getelementptr inbounds float* %tmp7108, i64 1
+  %tmp7110 = getelementptr inbounds float* %tmp7109, i64 1
+  %tmp7111 = getelementptr inbounds float* %tmp7110, i64 1
+  %tmp7112 = getelementptr inbounds float* %tmp7111, i64 1
+  %tmp7113 = getelementptr inbounds float* %tmp7112, i64 1
+  %tmp7114 = getelementptr inbounds float* %tmp7113, i64 1
+  %tmp7115 = getelementptr inbounds float* %tmp7114, i64 1
+  %tmp7116 = getelementptr inbounds float* %tmp7115, i64 1
+  %tmp7117 = getelementptr inbounds float* %tmp7116, i64 1
+  %tmp7118 = getelementptr inbounds float* %tmp7117, i64 1
+  %tmp7119 = getelementptr inbounds float* %tmp7118, i64 1
+  %tmp7120 = getelementptr inbounds float* %tmp7119, i64 1
+  %tmp7121 = getelementptr inbounds float* %tmp7120, i64 1
+  %tmp7122 = getelementptr inbounds float* %tmp7121, i64 1
+  %tmp7123 = getelementptr inbounds float* %tmp7122, i64 1
+  %tmp7124 = getelementptr inbounds float* %tmp7123, i64 1
+  %tmp7125 = getelementptr inbounds float* %tmp7124, i64 1
+  %tmp7126 = getelementptr inbounds float* %tmp7125, i64 1
+  %tmp7127 = getelementptr inbounds float* %tmp7126, i64 1
+  %tmp7128 = getelementptr inbounds float* %tmp7127, i64 1
+  %tmp7129 = getelementptr inbounds float* %tmp7128, i64 1
+  %tmp7130 = getelementptr inbounds float* %tmp7129, i64 1
+  %tmp7131 = getelementptr inbounds float* %tmp7130, i64 1
+  %tmp7132 = getelementptr inbounds float* %tmp7131, i64 1
+  %tmp7133 = getelementptr inbounds float* %tmp7132, i64 1
+  %tmp7134 = getelementptr inbounds float* %tmp7133, i64 1
+  %tmp7135 = getelementptr inbounds float* %tmp7134, i64 1
+  %tmp7136 = getelementptr inbounds float* %tmp7135, i64 1
+  %tmp7137 = getelementptr inbounds float* %tmp7136, i64 1
+  %tmp7138 = getelementptr inbounds float* %tmp7137, i64 1
+  %tmp7139 = getelementptr inbounds float* %tmp7138, i64 1
+  %tmp7140 = getelementptr inbounds float* %tmp7139, i64 1
+  %tmp7141 = getelementptr inbounds float* %tmp7140, i64 1
+  %tmp7142 = getelementptr inbounds float* %tmp7141, i64 1
+  %tmp7143 = getelementptr inbounds float* %tmp7142, i64 1
+  %tmp7144 = getelementptr inbounds float* %tmp7143, i64 1
+  %tmp7145 = getelementptr inbounds float* %tmp7144, i64 1
+  %tmp7146 = getelementptr inbounds float* %tmp7145, i64 1
+  %tmp7147 = getelementptr inbounds float* %tmp7146, i64 1
+  %tmp7148 = getelementptr inbounds float* %tmp7147, i64 1
+  %tmp7149 = getelementptr inbounds float* %tmp7148, i64 1
+  %tmp7150 = getelementptr inbounds float* %tmp7149, i64 1
+  %tmp7151 = getelementptr inbounds float* %tmp7150, i64 1
+  %tmp7152 = getelementptr inbounds float* %tmp7151, i64 1
+  %tmp7153 = getelementptr inbounds float* %tmp7152, i64 1
+  %tmp7154 = getelementptr inbounds float* %tmp7153, i64 1
+  %tmp7155 = getelementptr inbounds float* %tmp7154, i64 1
+  %tmp7156 = getelementptr inbounds float* %tmp7155, i64 1
+  %tmp7157 = getelementptr inbounds float* %tmp7156, i64 1
+  %tmp7158 = getelementptr inbounds float* %tmp7157, i64 1
+  %tmp7159 = getelementptr inbounds float* %tmp7158, i64 1
+  %tmp7160 = getelementptr inbounds float* %tmp7159, i64 1
+  %tmp7161 = getelementptr inbounds float* %tmp7160, i64 1
+  %tmp7162 = getelementptr inbounds float* %tmp7161, i64 1
+  %tmp7163 = getelementptr inbounds float* %tmp7162, i64 1
+  %tmp7164 = getelementptr inbounds float* %tmp7163, i64 1
+  %tmp7165 = getelementptr inbounds float* %tmp7164, i64 1
+  %tmp7166 = getelementptr inbounds float* %tmp7165, i64 1
+  %tmp7167 = getelementptr inbounds float* %tmp7166, i64 1
+  %tmp7168 = getelementptr inbounds float* %tmp7167, i64 1
+  %tmp7169 = getelementptr inbounds float* %tmp7168, i64 1
+  %tmp7170 = getelementptr inbounds float* %tmp7169, i64 1
+  %tmp7171 = getelementptr inbounds float* %tmp7170, i64 1
+  %tmp7172 = getelementptr inbounds float* %tmp7171, i64 1
+  %tmp7173 = getelementptr inbounds float* %tmp7172, i64 1
+  %tmp7174 = getelementptr inbounds float* %tmp7173, i64 1
+  %tmp7175 = getelementptr inbounds float* %tmp7174, i64 1
+  %tmp7176 = getelementptr inbounds float* %tmp7175, i64 1
+  %tmp7177 = getelementptr inbounds float* %tmp7176, i64 1
+  %tmp7178 = getelementptr inbounds float* %tmp7177, i64 1
+  %tmp7179 = getelementptr inbounds float* %tmp7178, i64 1
+  %tmp7180 = getelementptr inbounds float* %tmp7179, i64 1
+  %tmp7181 = getelementptr inbounds float* %tmp7180, i64 1
+  %tmp7182 = getelementptr inbounds float* %tmp7181, i64 1
+  %tmp7183 = getelementptr inbounds float* %tmp7182, i64 1
+  %tmp7184 = getelementptr inbounds float* %tmp7183, i64 1
+  %tmp7185 = getelementptr inbounds float* %tmp7184, i64 1
+  %tmp7186 = getelementptr inbounds float* %tmp7185, i64 1
+  %tmp7187 = getelementptr inbounds float* %tmp7186, i64 1
+  %tmp7188 = getelementptr inbounds float* %tmp7187, i64 1
+  %tmp7189 = getelementptr inbounds float* %tmp7188, i64 1
+  %tmp7190 = getelementptr inbounds float* %tmp7189, i64 1
+  %tmp7191 = getelementptr inbounds float* %tmp7190, i64 1
+  %tmp7192 = getelementptr inbounds float* %tmp7191, i64 1
+  %tmp7193 = getelementptr inbounds float* %tmp7192, i64 1
+  %tmp7194 = getelementptr inbounds float* %tmp7193, i64 1
+  %tmp7195 = getelementptr inbounds float* %tmp7194, i64 1
+  %tmp7196 = getelementptr inbounds float* %tmp7195, i64 1
+  %tmp7197 = getelementptr inbounds float* %tmp7196, i64 1
+  %tmp7198 = getelementptr inbounds float* %tmp7197, i64 1
+  %tmp7199 = getelementptr inbounds float* %tmp7198, i64 1
+  %tmp7200 = getelementptr inbounds float* %tmp7199, i64 1
+  %tmp7201 = getelementptr inbounds float* %tmp7200, i64 1
+  %tmp7202 = getelementptr inbounds float* %tmp7201, i64 1
+  %tmp7203 = getelementptr inbounds float* %tmp7202, i64 1
+  %tmp7204 = getelementptr inbounds float* %tmp7203, i64 1
+  %tmp7205 = getelementptr inbounds float* %tmp7204, i64 1
+  %tmp7206 = getelementptr inbounds float* %tmp7205, i64 1
+  %tmp7207 = getelementptr inbounds float* %tmp7206, i64 1
+  %tmp7208 = getelementptr inbounds float* %tmp7207, i64 1
+  %tmp7209 = getelementptr inbounds float* %tmp7208, i64 1
+  %tmp7210 = getelementptr inbounds float* %tmp7209, i64 1
+  %tmp7211 = getelementptr inbounds float* %tmp7210, i64 1
+  %tmp7212 = getelementptr inbounds float* %tmp7211, i64 1
+  %tmp7213 = getelementptr inbounds float* %tmp7212, i64 1
+  %tmp7214 = getelementptr inbounds float* %tmp7213, i64 1
+  %tmp7215 = getelementptr inbounds float* %tmp7214, i64 1
+  %tmp7216 = getelementptr inbounds float* %tmp7215, i64 1
+  %tmp7217 = getelementptr inbounds float* %tmp7216, i64 1
+  %tmp7218 = getelementptr inbounds float* %tmp7217, i64 1
+  %tmp7219 = getelementptr inbounds float* %tmp7218, i64 1
+  %tmp7220 = getelementptr inbounds float* %tmp7219, i64 1
+  %tmp7221 = getelementptr inbounds float* %tmp7220, i64 1
+  %tmp7222 = getelementptr inbounds float* %tmp7221, i64 1
+  %tmp7223 = getelementptr inbounds float* %tmp7222, i64 1
+  %tmp7224 = getelementptr inbounds float* %tmp7223, i64 1
+  %tmp7225 = getelementptr inbounds float* %tmp7224, i64 1
+  %tmp7226 = getelementptr inbounds float* %tmp7225, i64 1
+  %tmp7227 = getelementptr inbounds float* %tmp7226, i64 1
+  %tmp7228 = getelementptr inbounds float* %tmp7227, i64 1
+  %tmp7229 = getelementptr inbounds float* %tmp7228, i64 1
+  %tmp7230 = getelementptr inbounds float* %tmp7229, i64 1
+  %tmp7231 = getelementptr inbounds float* %tmp7230, i64 1
+  %tmp7232 = getelementptr inbounds float* %tmp7231, i64 1
+  %tmp7233 = getelementptr inbounds float* %tmp7232, i64 1
+  %tmp7234 = getelementptr inbounds float* %tmp7233, i64 1
+  %tmp7235 = getelementptr inbounds float* %tmp7234, i64 1
+  %tmp7236 = getelementptr inbounds float* %tmp7235, i64 1
+  %tmp7237 = getelementptr inbounds float* %tmp7236, i64 1
+  %tmp7238 = getelementptr inbounds float* %tmp7237, i64 1
+  %tmp7239 = getelementptr inbounds float* %tmp7238, i64 1
+  %tmp7240 = getelementptr inbounds float* %tmp7239, i64 1
+  %tmp7241 = getelementptr inbounds float* %tmp7240, i64 1
+  %tmp7242 = getelementptr inbounds float* %tmp7241, i64 1
+  %tmp7243 = getelementptr inbounds float* %tmp7242, i64 1
+  %tmp7244 = getelementptr inbounds float* %tmp7243, i64 1
+  %tmp7245 = getelementptr inbounds float* %tmp7244, i64 1
+  %tmp7246 = getelementptr inbounds float* %tmp7245, i64 1
+  %tmp7247 = getelementptr inbounds float* %tmp7246, i64 1
+  %tmp7248 = getelementptr inbounds float* %tmp7247, i64 1
+  %tmp7249 = getelementptr inbounds float* %tmp7248, i64 1
+  %tmp7250 = getelementptr inbounds float* %tmp7249, i64 1
+  %tmp7251 = getelementptr inbounds float* %tmp7250, i64 1
+  %tmp7252 = getelementptr inbounds float* %tmp7251, i64 1
+  %tmp7253 = getelementptr inbounds float* %tmp7252, i64 1
+  %tmp7254 = getelementptr inbounds float* %tmp7253, i64 1
+  %tmp7255 = getelementptr inbounds float* %tmp7254, i64 1
+  %tmp7256 = getelementptr inbounds float* %tmp7255, i64 1
+  %tmp7257 = getelementptr inbounds float* %tmp7256, i64 1
+  %tmp7258 = getelementptr inbounds float* %tmp7257, i64 1
+  %tmp7259 = getelementptr inbounds float* %tmp7258, i64 1
+  %tmp7260 = getelementptr inbounds float* %tmp7259, i64 1
+  %tmp7261 = getelementptr inbounds float* %tmp7260, i64 1
+  %tmp7262 = getelementptr inbounds float* %tmp7261, i64 1
+  %tmp7263 = getelementptr inbounds float* %tmp7262, i64 1
+  %tmp7264 = getelementptr inbounds float* %tmp7263, i64 1
+  %tmp7265 = getelementptr inbounds float* %tmp7264, i64 1
+  %tmp7266 = getelementptr inbounds float* %tmp7265, i64 1
+  %tmp7267 = getelementptr inbounds float* %tmp7266, i64 1
+  %tmp7268 = getelementptr inbounds float* %tmp7267, i64 1
+  %tmp7269 = getelementptr inbounds float* %tmp7268, i64 1
+  %tmp7270 = getelementptr inbounds float* %tmp7269, i64 1
+  %tmp7271 = getelementptr inbounds float* %tmp7270, i64 1
+  %tmp7272 = getelementptr inbounds float* %tmp7271, i64 1
+  %tmp7273 = getelementptr inbounds float* %tmp7272, i64 1
+  %tmp7274 = getelementptr inbounds float* %tmp7273, i64 1
+  %tmp7275 = getelementptr inbounds float* %tmp7274, i64 1
+  %tmp7276 = getelementptr inbounds float* %tmp7275, i64 1
+  %tmp7277 = getelementptr inbounds float* %tmp7276, i64 1
+  %tmp7278 = getelementptr inbounds float* %tmp7277, i64 1
+  %tmp7279 = getelementptr inbounds float* %tmp7278, i64 1
+  %tmp7280 = getelementptr inbounds float* %tmp7279, i64 1
+  %tmp7281 = getelementptr inbounds float* %tmp7280, i64 1
+  %tmp7282 = getelementptr inbounds float* %tmp7281, i64 1
+  %tmp7283 = getelementptr inbounds float* %tmp7282, i64 1
+  %tmp7284 = getelementptr inbounds float* %tmp7283, i64 1
+  %tmp7285 = getelementptr inbounds float* %tmp7284, i64 1
+  %tmp7286 = getelementptr inbounds float* %tmp7285, i64 1
+  %tmp7287 = getelementptr inbounds float* %tmp7286, i64 1
+  %tmp7288 = getelementptr inbounds float* %tmp7287, i64 1
+  %tmp7289 = getelementptr inbounds float* %tmp7288, i64 1
+  %tmp7290 = getelementptr inbounds float* %tmp7289, i64 1
+  %tmp7291 = getelementptr inbounds float* %tmp7290, i64 1
+  %tmp7292 = getelementptr inbounds float* %tmp7291, i64 1
+  %tmp7293 = getelementptr inbounds float* %tmp7292, i64 1
+  %tmp7294 = getelementptr inbounds float* %tmp7293, i64 1
+  %tmp7295 = getelementptr inbounds float* %tmp7294, i64 1
+  %tmp7296 = getelementptr inbounds float* %tmp7295, i64 1
+  %tmp7297 = getelementptr inbounds float* %tmp7296, i64 1
+  %tmp7298 = getelementptr inbounds float* %tmp7297, i64 1
+  %tmp7299 = getelementptr inbounds float* %tmp7298, i64 1
+  %tmp7300 = getelementptr inbounds float* %tmp7299, i64 1
+  %tmp7301 = getelementptr inbounds float* %tmp7300, i64 1
+  %tmp7302 = getelementptr inbounds float* %tmp7301, i64 1
+  %tmp7303 = getelementptr inbounds float* %tmp7302, i64 1
+  %tmp7304 = getelementptr inbounds float* %tmp7303, i64 1
+  %tmp7305 = getelementptr inbounds float* %tmp7304, i64 1
+  %tmp7306 = getelementptr inbounds float* %tmp7305, i64 1
+  %tmp7307 = getelementptr inbounds float* %tmp7306, i64 1
+  %tmp7308 = getelementptr inbounds float* %tmp7307, i64 1
+  %tmp7309 = getelementptr inbounds float* %tmp7308, i64 1
+  %tmp7310 = getelementptr inbounds float* %tmp7309, i64 1
+  %tmp7311 = getelementptr inbounds float* %tmp7310, i64 1
+  %tmp7312 = getelementptr inbounds float* %tmp7311, i64 1
+  %tmp7313 = getelementptr inbounds float* %tmp7312, i64 1
+  %tmp7314 = getelementptr inbounds float* %tmp7313, i64 1
+  %tmp7315 = getelementptr inbounds float* %tmp7314, i64 1
+  %tmp7316 = getelementptr inbounds float* %tmp7315, i64 1
+  %tmp7317 = getelementptr inbounds float* %tmp7316, i64 1
+  %tmp7318 = getelementptr inbounds float* %tmp7317, i64 1
+  %tmp7319 = getelementptr inbounds float* %tmp7318, i64 1
+  %tmp7320 = getelementptr inbounds float* %tmp7319, i64 1
+  %tmp7321 = getelementptr inbounds float* %tmp7320, i64 1
+  %tmp7322 = getelementptr inbounds float* %tmp7321, i64 1
+  %tmp7323 = getelementptr inbounds float* %tmp7322, i64 1
+  %tmp7324 = getelementptr inbounds float* %tmp7323, i64 1
+  %tmp7325 = getelementptr inbounds float* %tmp7324, i64 1
+  %tmp7326 = getelementptr inbounds float* %tmp7325, i64 1
+  %tmp7327 = getelementptr inbounds float* %tmp7326, i64 1
+  %tmp7328 = getelementptr inbounds float* %tmp7327, i64 1
+  %tmp7329 = getelementptr inbounds float* %tmp7328, i64 1
+  %tmp7330 = getelementptr inbounds float* %tmp7329, i64 1
+  %tmp7331 = getelementptr inbounds float* %tmp7330, i64 1
+  %tmp7332 = getelementptr inbounds float* %tmp7331, i64 1
+  %tmp7333 = getelementptr inbounds float* %tmp7332, i64 1
+  %tmp7334 = getelementptr inbounds float* %tmp7333, i64 1
+  %tmp7335 = getelementptr inbounds float* %tmp7334, i64 1
+  %tmp7336 = getelementptr inbounds float* %tmp7335, i64 1
+  %tmp7337 = getelementptr inbounds float* %tmp7336, i64 1
+  %tmp7338 = getelementptr inbounds float* %tmp7337, i64 1
+  %tmp7339 = getelementptr inbounds float* %tmp7338, i64 1
+  %tmp7340 = getelementptr inbounds float* %tmp7339, i64 1
+  %tmp7341 = getelementptr inbounds float* %tmp7340, i64 1
+  %tmp7342 = getelementptr inbounds float* %tmp7341, i64 1
+  %tmp7343 = getelementptr inbounds float* %tmp7342, i64 1
+  %tmp7344 = getelementptr inbounds float* %tmp7343, i64 1
+  %tmp7345 = getelementptr inbounds float* %tmp7344, i64 1
+  %tmp7346 = getelementptr inbounds float* %tmp7345, i64 1
+  %tmp7347 = getelementptr inbounds float* %tmp7346, i64 1
+  %tmp7348 = getelementptr inbounds float* %tmp7347, i64 1
+  %tmp7349 = getelementptr inbounds float* %tmp7348, i64 1
+  %tmp7350 = getelementptr inbounds float* %tmp7349, i64 1
+  %tmp7351 = getelementptr inbounds float* %tmp7350, i64 1
+  %tmp7352 = getelementptr inbounds float* %tmp7351, i64 1
+  %tmp7353 = getelementptr inbounds float* %tmp7352, i64 1
+  %tmp7354 = getelementptr inbounds float* %tmp7353, i64 1
+  %tmp7355 = getelementptr inbounds float* %tmp7354, i64 1
+  %tmp7356 = getelementptr inbounds float* %tmp7355, i64 1
+  %tmp7357 = getelementptr inbounds float* %tmp7356, i64 1
+  %tmp7358 = getelementptr inbounds float* %tmp7357, i64 1
+  %tmp7359 = getelementptr inbounds float* %tmp7358, i64 1
+  %tmp7360 = getelementptr inbounds float* %tmp7359, i64 1
+  %tmp7361 = getelementptr inbounds float* %tmp7360, i64 1
+  %tmp7362 = getelementptr inbounds float* %tmp7361, i64 1
+  %tmp7363 = getelementptr inbounds float* %tmp7362, i64 1
+  %tmp7364 = getelementptr inbounds float* %tmp7363, i64 1
+  %tmp7365 = getelementptr inbounds float* %tmp7364, i64 1
+  %tmp7366 = getelementptr inbounds float* %tmp7365, i64 1
+  %tmp7367 = getelementptr inbounds float* %tmp7366, i64 1
+  %tmp7368 = getelementptr inbounds float* %tmp7367, i64 1
+  %tmp7369 = getelementptr inbounds float* %tmp7368, i64 1
+  %tmp7370 = getelementptr inbounds float* %tmp7369, i64 1
+  %tmp7371 = getelementptr inbounds float* %tmp7370, i64 1
+  %tmp7372 = getelementptr inbounds float* %tmp7371, i64 1
+  %tmp7373 = getelementptr inbounds float* %tmp7372, i64 1
+  %tmp7374 = getelementptr inbounds float* %tmp7373, i64 1
+  %tmp7375 = getelementptr inbounds float* %tmp7374, i64 1
+  %tmp7376 = getelementptr inbounds float* %tmp7375, i64 1
+  %tmp7377 = getelementptr inbounds float* %tmp7376, i64 1
+  %tmp7378 = getelementptr inbounds float* %tmp7377, i64 1
+  %tmp7379 = getelementptr inbounds float* %tmp7378, i64 1
+  %tmp7380 = getelementptr inbounds float* %tmp7379, i64 1
+  %tmp7381 = getelementptr inbounds float* %tmp7380, i64 1
+  %tmp7382 = getelementptr inbounds float* %tmp7381, i64 1
+  %tmp7383 = getelementptr inbounds float* %tmp7382, i64 1
+  %tmp7384 = getelementptr inbounds float* %tmp7383, i64 1
+  %tmp7385 = getelementptr inbounds float* %tmp7384, i64 1
+  %tmp7386 = getelementptr inbounds float* %tmp7385, i64 1
+  %tmp7387 = getelementptr inbounds float* %tmp7386, i64 1
+  %tmp7388 = getelementptr inbounds float* %tmp7387, i64 1
+  %tmp7389 = getelementptr inbounds float* %tmp7388, i64 1
+  %tmp7390 = getelementptr inbounds float* %tmp7389, i64 1
+  %tmp7391 = getelementptr inbounds float* %tmp7390, i64 1
+  %tmp7392 = getelementptr inbounds float* %tmp7391, i64 1
+  %tmp7393 = getelementptr inbounds float* %tmp7392, i64 1
+  %tmp7394 = getelementptr inbounds float* %tmp7393, i64 1
+  %tmp7395 = getelementptr inbounds float* %tmp7394, i64 1
+  %tmp7396 = getelementptr inbounds float* %tmp7395, i64 1
+  %tmp7397 = getelementptr inbounds float* %tmp7396, i64 1
+  %tmp7398 = getelementptr inbounds float* %tmp7397, i64 1
+  %tmp7399 = getelementptr inbounds float* %tmp7398, i64 1
+  %tmp7400 = getelementptr inbounds float* %tmp7399, i64 1
+  %tmp7401 = getelementptr inbounds float* %tmp7400, i64 1
+  %tmp7402 = getelementptr inbounds float* %tmp7401, i64 1
+  %tmp7403 = getelementptr inbounds float* %tmp7402, i64 1
+  %tmp7404 = getelementptr inbounds float* %tmp7403, i64 1
+  %tmp7405 = getelementptr inbounds float* %tmp7404, i64 1
+  %tmp7406 = getelementptr inbounds float* %tmp7405, i64 1
+  %tmp7407 = getelementptr inbounds float* %tmp7406, i64 1
+  %tmp7408 = getelementptr inbounds float* %tmp7407, i64 1
+  %tmp7409 = getelementptr inbounds float* %tmp7408, i64 1
+  %tmp7410 = getelementptr inbounds float* %tmp7409, i64 1
+  %tmp7411 = getelementptr inbounds float* %tmp7410, i64 1
+  %tmp7412 = getelementptr inbounds float* %tmp7411, i64 1
+  %tmp7413 = getelementptr inbounds float* %tmp7412, i64 1
+  %tmp7414 = getelementptr inbounds float* %tmp7413, i64 1
+  %tmp7415 = getelementptr inbounds float* %tmp7414, i64 1
+  %tmp7416 = getelementptr inbounds float* %tmp7415, i64 1
+  %tmp7417 = getelementptr inbounds float* %tmp7416, i64 1
+  %tmp7418 = getelementptr inbounds float* %tmp7417, i64 1
+  %tmp7419 = getelementptr inbounds float* %tmp7418, i64 1
+  %tmp7420 = getelementptr inbounds float* %tmp7419, i64 1
+  %tmp7421 = getelementptr inbounds float* %tmp7420, i64 1
+  %tmp7422 = getelementptr inbounds float* %tmp7421, i64 1
+  %tmp7423 = getelementptr inbounds float* %tmp7422, i64 1
+  %tmp7424 = getelementptr inbounds float* %tmp7423, i64 1
+  %tmp7425 = getelementptr inbounds float* %tmp7424, i64 1
+  %tmp7426 = getelementptr inbounds float* %tmp7425, i64 1
+  %tmp7427 = getelementptr inbounds float* %tmp7426, i64 1
+  %tmp7428 = getelementptr inbounds float* %tmp7427, i64 1
+  %tmp7429 = getelementptr inbounds float* %tmp7428, i64 1
+  %tmp7430 = getelementptr inbounds float* %tmp7429, i64 1
+  %tmp7431 = getelementptr inbounds float* %tmp7430, i64 1
+  %tmp7432 = getelementptr inbounds float* %tmp7431, i64 1
+  %tmp7433 = getelementptr inbounds float* %tmp7432, i64 1
+  %tmp7434 = getelementptr inbounds float* %tmp7433, i64 1
+  %tmp7435 = getelementptr inbounds float* %tmp7434, i64 1
+  %tmp7436 = getelementptr inbounds float* %tmp7435, i64 1
+  %tmp7437 = getelementptr inbounds float* %tmp7436, i64 1
+  %tmp7438 = getelementptr inbounds float* %tmp7437, i64 1
+  %tmp7439 = getelementptr inbounds float* %tmp7438, i64 1
+  %tmp7440 = getelementptr inbounds float* %tmp7439, i64 1
+  %tmp7441 = getelementptr inbounds float* %tmp7440, i64 1
+  %tmp7442 = getelementptr inbounds float* %tmp7441, i64 1
+  %tmp7443 = getelementptr inbounds float* %tmp7442, i64 1
+  %tmp7444 = getelementptr inbounds float* %tmp7443, i64 1
+  %tmp7445 = getelementptr inbounds float* %tmp7444, i64 1
+  %tmp7446 = getelementptr inbounds float* %tmp7445, i64 1
+  %tmp7447 = getelementptr inbounds float* %tmp7446, i64 1
+  %tmp7448 = getelementptr inbounds float* %tmp7447, i64 1
+  %tmp7449 = getelementptr inbounds float* %tmp7448, i64 1
+  %tmp7450 = getelementptr inbounds float* %tmp7449, i64 1
+  %tmp7451 = getelementptr inbounds float* %tmp7450, i64 1
+  %tmp7452 = getelementptr inbounds float* %tmp7451, i64 1
+  %tmp7453 = getelementptr inbounds float* %tmp7452, i64 1
+  %tmp7454 = getelementptr inbounds float* %tmp7453, i64 1
+  %tmp7455 = getelementptr inbounds float* %tmp7454, i64 1
+  %tmp7456 = getelementptr inbounds float* %tmp7455, i64 1
+  %tmp7457 = getelementptr inbounds float* %tmp7456, i64 1
+  %tmp7458 = getelementptr inbounds float* %tmp7457, i64 1
+  %tmp7459 = getelementptr inbounds float* %tmp7458, i64 1
+  %tmp7460 = getelementptr inbounds float* %tmp7459, i64 1
+  %tmp7461 = getelementptr inbounds float* %tmp7460, i64 1
+  %tmp7462 = getelementptr inbounds float* %tmp7461, i64 1
+  %tmp7463 = getelementptr inbounds float* %tmp7462, i64 1
+  %tmp7464 = getelementptr inbounds float* %tmp7463, i64 1
+  %tmp7465 = getelementptr inbounds float* %tmp7464, i64 1
+  %tmp7466 = getelementptr inbounds float* %tmp7465, i64 1
+  %tmp7467 = getelementptr inbounds float* %tmp7466, i64 1
+  %tmp7468 = getelementptr inbounds float* %tmp7467, i64 1
+  %tmp7469 = getelementptr inbounds float* %tmp7468, i64 1
+  %tmp7470 = getelementptr inbounds float* %tmp7469, i64 1
+  %tmp7471 = getelementptr inbounds float* %tmp7470, i64 1
+  %tmp7472 = getelementptr inbounds float* %tmp7471, i64 1
+  %tmp7473 = getelementptr inbounds float* %tmp7472, i64 1
+  %tmp7474 = getelementptr inbounds float* %tmp7473, i64 1
+  %tmp7475 = getelementptr inbounds float* %tmp7474, i64 1
+  %tmp7476 = getelementptr inbounds float* %tmp7475, i64 1
+  %tmp7477 = getelementptr inbounds float* %tmp7476, i64 1
+  %tmp7478 = getelementptr inbounds float* %tmp7477, i64 1
+  %tmp7479 = getelementptr inbounds float* %tmp7478, i64 1
+  %tmp7480 = getelementptr inbounds float* %tmp7479, i64 1
+  %tmp7481 = getelementptr inbounds float* %tmp7480, i64 1
+  %tmp7482 = getelementptr inbounds float* %tmp7481, i64 1
+  %tmp7483 = getelementptr inbounds float* %tmp7482, i64 1
+  %tmp7484 = getelementptr inbounds float* %tmp7483, i64 1
+  %tmp7485 = getelementptr inbounds float* %tmp7484, i64 1
+  %tmp7486 = getelementptr inbounds float* %tmp7485, i64 1
+  %tmp7487 = getelementptr inbounds float* %tmp7486, i64 1
+  %tmp7488 = getelementptr inbounds float* %tmp7487, i64 1
+  %tmp7489 = getelementptr inbounds float* %tmp7488, i64 1
+  %tmp7490 = getelementptr inbounds float* %tmp7489, i64 1
+  %tmp7491 = getelementptr inbounds float* %tmp7490, i64 1
+  %tmp7492 = getelementptr inbounds float* %tmp7491, i64 1
+  %tmp7493 = getelementptr inbounds float* %tmp7492, i64 1
+  %tmp7494 = getelementptr inbounds float* %tmp7493, i64 1
+  %tmp7495 = getelementptr inbounds float* %tmp7494, i64 1
+  %tmp7496 = getelementptr inbounds float* %tmp7495, i64 1
+  %tmp7497 = getelementptr inbounds float* %tmp7496, i64 1
+  %tmp7498 = getelementptr inbounds float* %tmp7497, i64 1
+  %tmp7499 = getelementptr inbounds float* %tmp7498, i64 1
+  %tmp7500 = getelementptr inbounds float* %tmp7499, i64 1
+  %tmp7501 = getelementptr inbounds float* %tmp7500, i64 1
+  %tmp7502 = getelementptr inbounds float* %tmp7501, i64 1
+  %tmp7503 = getelementptr inbounds float* %tmp7502, i64 1
+  %tmp7504 = getelementptr inbounds float* %tmp7503, i64 1
+  %tmp7505 = getelementptr inbounds float* %tmp7504, i64 1
+  %tmp7506 = getelementptr inbounds float* %tmp7505, i64 1
+  %tmp7507 = getelementptr inbounds float* %tmp7506, i64 1
+  %tmp7508 = getelementptr inbounds float* %tmp7507, i64 1
+  %tmp7509 = getelementptr inbounds float* %tmp7508, i64 1
+  %tmp7510 = getelementptr inbounds float* %tmp7509, i64 1
+  %tmp7511 = getelementptr inbounds float* %tmp7510, i64 1
+  %tmp7512 = getelementptr inbounds float* %tmp7511, i64 1
+  %tmp7513 = getelementptr inbounds float* %tmp7512, i64 1
+  %tmp7514 = getelementptr inbounds float* %tmp7513, i64 1
+  %tmp7515 = getelementptr inbounds float* %tmp7514, i64 1
+  %tmp7516 = getelementptr inbounds float* %tmp7515, i64 1
+  %tmp7517 = getelementptr inbounds float* %tmp7516, i64 1
+  %tmp7518 = getelementptr inbounds float* %tmp7517, i64 1
+  %tmp7519 = getelementptr inbounds float* %tmp7518, i64 1
+  %tmp7520 = getelementptr inbounds float* %tmp7519, i64 1
+  %tmp7521 = getelementptr inbounds float* %tmp7520, i64 1
+  %tmp7522 = getelementptr inbounds float* %tmp7521, i64 1
+  %tmp7523 = getelementptr inbounds float* %tmp7522, i64 1
+  %tmp7524 = getelementptr inbounds float* %tmp7523, i64 1
+  %tmp7525 = getelementptr inbounds float* %tmp7524, i64 1
+  %tmp7526 = getelementptr inbounds float* %tmp7525, i64 1
+  %tmp7527 = getelementptr inbounds float* %tmp7526, i64 1
+  %tmp7528 = getelementptr inbounds float* %tmp7527, i64 1
+  %tmp7529 = getelementptr inbounds float* %tmp7528, i64 1
+  %tmp7530 = getelementptr inbounds float* %tmp7529, i64 1
+  %tmp7531 = getelementptr inbounds float* %tmp7530, i64 1
+  %tmp7532 = getelementptr inbounds float* %tmp7531, i64 1
+  %tmp7533 = getelementptr inbounds float* %tmp7532, i64 1
+  %tmp7534 = getelementptr inbounds float* %tmp7533, i64 1
+  %tmp7535 = getelementptr inbounds float* %tmp7534, i64 1
+  %tmp7536 = getelementptr inbounds float* %tmp7535, i64 1
+  %tmp7537 = getelementptr inbounds float* %tmp7536, i64 1
+  %tmp7538 = getelementptr inbounds float* %tmp7537, i64 1
+  %tmp7539 = getelementptr inbounds float* %tmp7538, i64 1
+  %tmp7540 = getelementptr inbounds float* %tmp7539, i64 1
+  %tmp7541 = getelementptr inbounds float* %tmp7540, i64 1
+  %tmp7542 = getelementptr inbounds float* %tmp7541, i64 1
+  %tmp7543 = getelementptr inbounds float* %tmp7542, i64 1
+  %tmp7544 = getelementptr inbounds float* %tmp7543, i64 1
+  %tmp7545 = getelementptr inbounds float* %tmp7544, i64 1
+  %tmp7546 = getelementptr inbounds float* %tmp7545, i64 1
+  %tmp7547 = getelementptr inbounds float* %tmp7546, i64 1
+  %tmp7548 = getelementptr inbounds float* %tmp7547, i64 1
+  %tmp7549 = getelementptr inbounds float* %tmp7548, i64 1
+  %tmp7550 = getelementptr inbounds float* %tmp7549, i64 1
+  %tmp7551 = getelementptr inbounds float* %tmp7550, i64 1
+  %tmp7552 = getelementptr inbounds float* %tmp7551, i64 1
+  %tmp7553 = getelementptr inbounds float* %tmp7552, i64 1
+  %tmp7554 = getelementptr inbounds float* %tmp7553, i64 1
+  %tmp7555 = getelementptr inbounds float* %tmp7554, i64 1
+  %tmp7556 = getelementptr inbounds float* %tmp7555, i64 1
+  %tmp7557 = getelementptr inbounds float* %tmp7556, i64 1
+  %tmp7558 = getelementptr inbounds float* %tmp7557, i64 1
+  %tmp7559 = getelementptr inbounds float* %tmp7558, i64 1
+  %tmp7560 = getelementptr inbounds float* %tmp7559, i64 1
+  %tmp7561 = getelementptr inbounds float* %tmp7560, i64 1
+  %tmp7562 = getelementptr inbounds float* %tmp7561, i64 1
+  %tmp7563 = getelementptr inbounds float* %tmp7562, i64 1
+  %tmp7564 = getelementptr inbounds float* %tmp7563, i64 1
+  %tmp7565 = getelementptr inbounds float* %tmp7564, i64 1
+  %tmp7566 = getelementptr inbounds float* %tmp7565, i64 1
+  %tmp7567 = getelementptr inbounds float* %tmp7566, i64 1
+  %tmp7568 = getelementptr inbounds float* %tmp7567, i64 1
+  %tmp7569 = getelementptr inbounds float* %tmp7568, i64 1
+  %tmp7570 = getelementptr inbounds float* %tmp7569, i64 1
+  %tmp7571 = getelementptr inbounds float* %tmp7570, i64 1
+  %tmp7572 = getelementptr inbounds float* %tmp7571, i64 1
+  %tmp7573 = getelementptr inbounds float* %tmp7572, i64 1
+  %tmp7574 = getelementptr inbounds float* %tmp7573, i64 1
+  %tmp7575 = getelementptr inbounds float* %tmp7574, i64 1
+  %tmp7576 = getelementptr inbounds float* %tmp7575, i64 1
+  %tmp7577 = getelementptr inbounds float* %tmp7576, i64 1
+  %tmp7578 = getelementptr inbounds float* %tmp7577, i64 1
+  %tmp7579 = getelementptr inbounds float* %tmp7578, i64 1
+  %tmp7580 = getelementptr inbounds float* %tmp7579, i64 1
+  %tmp7581 = getelementptr inbounds float* %tmp7580, i64 1
+  %tmp7582 = getelementptr inbounds float* %tmp7581, i64 1
+  %tmp7583 = getelementptr inbounds float* %tmp7582, i64 1
+  %tmp7584 = getelementptr inbounds float* %tmp7583, i64 1
+  %tmp7585 = getelementptr inbounds float* %tmp7584, i64 1
+  %tmp7586 = getelementptr inbounds float* %tmp7585, i64 1
+  %tmp7587 = getelementptr inbounds float* %tmp7586, i64 1
+  %tmp7588 = getelementptr inbounds float* %tmp7587, i64 1
+  %tmp7589 = getelementptr inbounds float* %tmp7588, i64 1
+  %tmp7590 = getelementptr inbounds float* %tmp7589, i64 1
+  %tmp7591 = getelementptr inbounds float* %tmp7590, i64 1
+  %tmp7592 = getelementptr inbounds float* %tmp7591, i64 1
+  %tmp7593 = getelementptr inbounds float* %tmp7592, i64 1
+  %tmp7594 = getelementptr inbounds float* %tmp7593, i64 1
+  %tmp7595 = getelementptr inbounds float* %tmp7594, i64 1
+  %tmp7596 = getelementptr inbounds float* %tmp7595, i64 1
+  %tmp7597 = getelementptr inbounds float* %tmp7596, i64 1
+  %tmp7598 = getelementptr inbounds float* %tmp7597, i64 1
+  %tmp7599 = getelementptr inbounds float* %tmp7598, i64 1
+  %tmp7600 = getelementptr inbounds float* %tmp7599, i64 1
+  %tmp7601 = getelementptr inbounds float* %tmp7600, i64 1
+  %tmp7602 = getelementptr inbounds float* %tmp7601, i64 1
+  %tmp7603 = getelementptr inbounds float* %tmp7602, i64 1
+  %tmp7604 = getelementptr inbounds float* %tmp7603, i64 1
+  %tmp7605 = getelementptr inbounds float* %tmp7604, i64 1
+  %tmp7606 = getelementptr inbounds float* %tmp7605, i64 1
+  %tmp7607 = getelementptr inbounds float* %tmp7606, i64 1
+  %tmp7608 = getelementptr inbounds float* %tmp7607, i64 1
+  %tmp7609 = getelementptr inbounds float* %tmp7608, i64 1
+  %tmp7610 = getelementptr inbounds float* %tmp7609, i64 1
+  %tmp7611 = getelementptr inbounds float* %tmp7610, i64 1
+  %tmp7612 = getelementptr inbounds float* %tmp7611, i64 1
+  %tmp7613 = getelementptr inbounds float* %tmp7612, i64 1
+  %tmp7614 = getelementptr inbounds float* %tmp7613, i64 1
+  %tmp7615 = getelementptr inbounds float* %tmp7614, i64 1
+  %tmp7616 = getelementptr inbounds float* %tmp7615, i64 1
+  %tmp7617 = getelementptr inbounds float* %tmp7616, i64 1
+  %tmp7618 = getelementptr inbounds float* %tmp7617, i64 1
+  %tmp7619 = getelementptr inbounds float* %tmp7618, i64 1
+  %tmp7620 = getelementptr inbounds float* %tmp7619, i64 1
+  %tmp7621 = getelementptr inbounds float* %tmp7620, i64 1
+  %tmp7622 = getelementptr inbounds float* %tmp7621, i64 1
+  %tmp7623 = getelementptr inbounds float* %tmp7622, i64 1
+  %tmp7624 = getelementptr inbounds float* %tmp7623, i64 1
+  %tmp7625 = getelementptr inbounds float* %tmp7624, i64 1
+  %tmp7626 = getelementptr inbounds float* %tmp7625, i64 1
+  %tmp7627 = getelementptr inbounds float* %tmp7626, i64 1
+  %tmp7628 = getelementptr inbounds float* %tmp7627, i64 1
+  %tmp7629 = getelementptr inbounds float* %tmp7628, i64 1
+  %tmp7630 = getelementptr inbounds float* %tmp7629, i64 1
+  %tmp7631 = getelementptr inbounds float* %tmp7630, i64 1
+  %tmp7632 = getelementptr inbounds float* %tmp7631, i64 1
+  %tmp7633 = getelementptr inbounds float* %tmp7632, i64 1
+  %tmp7634 = getelementptr inbounds float* %tmp7633, i64 1
+  %tmp7635 = getelementptr inbounds float* %tmp7634, i64 1
+  %tmp7636 = getelementptr inbounds float* %tmp7635, i64 1
+  %tmp7637 = getelementptr inbounds float* %tmp7636, i64 1
+  %tmp7638 = getelementptr inbounds float* %tmp7637, i64 1
+  %tmp7639 = getelementptr inbounds float* %tmp7638, i64 1
+  %tmp7640 = getelementptr inbounds float* %tmp7639, i64 1
+  %tmp7641 = getelementptr inbounds float* %tmp7640, i64 1
+  %tmp7642 = getelementptr inbounds float* %tmp7641, i64 1
+  %tmp7643 = getelementptr inbounds float* %tmp7642, i64 1
+  %tmp7644 = getelementptr inbounds float* %tmp7643, i64 1
+  %tmp7645 = getelementptr inbounds float* %tmp7644, i64 1
+  %tmp7646 = getelementptr inbounds float* %tmp7645, i64 1
+  %tmp7647 = getelementptr inbounds float* %tmp7646, i64 1
+  %tmp7648 = getelementptr inbounds float* %tmp7647, i64 1
+  %tmp7649 = getelementptr inbounds float* %tmp7648, i64 1
+  %tmp7650 = getelementptr inbounds float* %tmp7649, i64 1
+  %tmp7651 = getelementptr inbounds float* %tmp7650, i64 1
+  %tmp7652 = getelementptr inbounds float* %tmp7651, i64 1
+  %tmp7653 = getelementptr inbounds float* %tmp7652, i64 1
+  %tmp7654 = getelementptr inbounds float* %tmp7653, i64 1
+  %tmp7655 = getelementptr inbounds float* %tmp7654, i64 1
+  %tmp7656 = getelementptr inbounds float* %tmp7655, i64 1
+  %tmp7657 = getelementptr inbounds float* %tmp7656, i64 1
+  %tmp7658 = getelementptr inbounds float* %tmp7657, i64 1
+  %tmp7659 = getelementptr inbounds float* %tmp7658, i64 1
+  %tmp7660 = getelementptr inbounds float* %tmp7659, i64 1
+  %tmp7661 = getelementptr inbounds float* %tmp7660, i64 1
+  %tmp7662 = getelementptr inbounds float* %tmp7661, i64 1
+  %tmp7663 = getelementptr inbounds float* %tmp7662, i64 1
+  %tmp7664 = getelementptr inbounds float* %tmp7663, i64 1
+  %tmp7665 = getelementptr inbounds float* %tmp7664, i64 1
+  %tmp7666 = getelementptr inbounds float* %tmp7665, i64 1
+  %tmp7667 = getelementptr inbounds float* %tmp7666, i64 1
+  %tmp7668 = getelementptr inbounds float* %tmp7667, i64 1
+  %tmp7669 = getelementptr inbounds float* %tmp7668, i64 1
+  %tmp7670 = getelementptr inbounds float* %tmp7669, i64 1
+  %tmp7671 = getelementptr inbounds float* %tmp7670, i64 1
+  %tmp7672 = getelementptr inbounds float* %tmp7671, i64 1
+  %tmp7673 = getelementptr inbounds float* %tmp7672, i64 1
+  %tmp7674 = getelementptr inbounds float* %tmp7673, i64 1
+  %tmp7675 = getelementptr inbounds float* %tmp7674, i64 1
+  %tmp7676 = getelementptr inbounds float* %tmp7675, i64 1
+  %tmp7677 = getelementptr inbounds float* %tmp7676, i64 1
+  %tmp7678 = getelementptr inbounds float* %tmp7677, i64 1
+  %tmp7679 = getelementptr inbounds float* %tmp7678, i64 1
+  %tmp7680 = getelementptr inbounds float* %tmp7679, i64 1
+  %tmp7681 = getelementptr inbounds float* %tmp7680, i64 1
+  %tmp7682 = getelementptr inbounds float* %tmp7681, i64 1
+  %tmp7683 = getelementptr inbounds float* %tmp7682, i64 1
+  %tmp7684 = getelementptr inbounds float* %tmp7683, i64 1
+  %tmp7685 = getelementptr inbounds float* %tmp7684, i64 1
+  %tmp7686 = getelementptr inbounds float* %tmp7685, i64 1
+  %tmp7687 = getelementptr inbounds float* %tmp7686, i64 1
+  %tmp7688 = getelementptr inbounds float* %tmp7687, i64 1
+  %tmp7689 = getelementptr inbounds float* %tmp7688, i64 1
+  %tmp7690 = getelementptr inbounds float* %tmp7689, i64 1
+  %tmp7691 = getelementptr inbounds float* %tmp7690, i64 1
+  %tmp7692 = getelementptr inbounds float* %tmp7691, i64 1
+  %tmp7693 = getelementptr inbounds float* %tmp7692, i64 1
+  %tmp7694 = getelementptr inbounds float* %tmp7693, i64 1
+  %tmp7695 = getelementptr inbounds float* %tmp7694, i64 1
+  %tmp7696 = getelementptr inbounds float* %tmp7695, i64 1
+  %tmp7697 = getelementptr inbounds float* %tmp7696, i64 1
+  %tmp7698 = getelementptr inbounds float* %tmp7697, i64 1
+  %tmp7699 = getelementptr inbounds float* %tmp7698, i64 1
+  %tmp7700 = getelementptr inbounds float* %tmp7699, i64 1
+  %tmp7701 = getelementptr inbounds float* %tmp7700, i64 1
+  %tmp7702 = getelementptr inbounds float* %tmp7701, i64 1
+  %tmp7703 = getelementptr inbounds float* %tmp7702, i64 1
+  %tmp7704 = getelementptr inbounds float* %tmp7703, i64 1
+  %tmp7705 = getelementptr inbounds float* %tmp7704, i64 1
+  %tmp7706 = getelementptr inbounds float* %tmp7705, i64 1
+  %tmp7707 = getelementptr inbounds float* %tmp7706, i64 1
+  %tmp7708 = getelementptr inbounds float* %tmp7707, i64 1
+  %tmp7709 = getelementptr inbounds float* %tmp7708, i64 1
+  %tmp7710 = getelementptr inbounds float* %tmp7709, i64 1
+  %tmp7711 = getelementptr inbounds float* %tmp7710, i64 1
+  %tmp7712 = getelementptr inbounds float* %tmp7711, i64 1
+  %tmp7713 = getelementptr inbounds float* %tmp7712, i64 1
+  %tmp7714 = getelementptr inbounds float* %tmp7713, i64 1
+  %tmp7715 = getelementptr inbounds float* %tmp7714, i64 1
+  %tmp7716 = getelementptr inbounds float* %tmp7715, i64 1
+  %tmp7717 = getelementptr inbounds float* %tmp7716, i64 1
+  %tmp7718 = getelementptr inbounds float* %tmp7717, i64 1
+  %tmp7719 = getelementptr inbounds float* %tmp7718, i64 1
+  %tmp7720 = getelementptr inbounds float* %tmp7719, i64 1
+  %tmp7721 = getelementptr inbounds float* %tmp7720, i64 1
+  %tmp7722 = getelementptr inbounds float* %tmp7721, i64 1
+  %tmp7723 = getelementptr inbounds float* %tmp7722, i64 1
+  %tmp7724 = getelementptr inbounds float* %tmp7723, i64 1
+  %tmp7725 = getelementptr inbounds float* %tmp7724, i64 1
+  %tmp7726 = getelementptr inbounds float* %tmp7725, i64 1
+  %tmp7727 = getelementptr inbounds float* %tmp7726, i64 1
+  %tmp7728 = getelementptr inbounds float* %tmp7727, i64 1
+  %tmp7729 = getelementptr inbounds float* %tmp7728, i64 1
+  %tmp7730 = getelementptr inbounds float* %tmp7729, i64 1
+  %tmp7731 = getelementptr inbounds float* %tmp7730, i64 1
+  %tmp7732 = getelementptr inbounds float* %tmp7731, i64 1
+  %tmp7733 = getelementptr inbounds float* %tmp7732, i64 1
+  %tmp7734 = getelementptr inbounds float* %tmp7733, i64 1
+  %tmp7735 = getelementptr inbounds float* %tmp7734, i64 1
+  %tmp7736 = getelementptr inbounds float* %tmp7735, i64 1
+  %tmp7737 = getelementptr inbounds float* %tmp7736, i64 1
+  %tmp7738 = getelementptr inbounds float* %tmp7737, i64 1
+  %tmp7739 = getelementptr inbounds float* %tmp7738, i64 1
+  %tmp7740 = getelementptr inbounds float* %tmp7739, i64 1
+  %tmp7741 = getelementptr inbounds float* %tmp7740, i64 1
+  %tmp7742 = getelementptr inbounds float* %tmp7741, i64 1
+  %tmp7743 = getelementptr inbounds float* %tmp7742, i64 1
+  %tmp7744 = getelementptr inbounds float* %tmp7743, i64 1
+  %tmp7745 = getelementptr inbounds float* %tmp7744, i64 1
+  %tmp7746 = getelementptr inbounds float* %tmp7745, i64 1
+  %tmp7747 = getelementptr inbounds float* %tmp7746, i64 1
+  %tmp7748 = getelementptr inbounds float* %tmp7747, i64 1
+  %tmp7749 = getelementptr inbounds float* %tmp7748, i64 1
+  %tmp7750 = getelementptr inbounds float* %tmp7749, i64 1
+  %tmp7751 = getelementptr inbounds float* %tmp7750, i64 1
+  %tmp7752 = getelementptr inbounds float* %tmp7751, i64 1
+  %tmp7753 = getelementptr inbounds float* %tmp7752, i64 1
+  %tmp7754 = getelementptr inbounds float* %tmp7753, i64 1
+  %tmp7755 = getelementptr inbounds float* %tmp7754, i64 1
+  %tmp7756 = getelementptr inbounds float* %tmp7755, i64 1
+  %tmp7757 = getelementptr inbounds float* %tmp7756, i64 1
+  %tmp7758 = getelementptr inbounds float* %tmp7757, i64 1
+  %tmp7759 = getelementptr inbounds float* %tmp7758, i64 1
+  %tmp7760 = getelementptr inbounds float* %tmp7759, i64 1
+  %tmp7761 = getelementptr inbounds float* %tmp7760, i64 1
+  %tmp7762 = getelementptr inbounds float* %tmp7761, i64 1
+  %tmp7763 = getelementptr inbounds float* %tmp7762, i64 1
+  %tmp7764 = getelementptr inbounds float* %tmp7763, i64 1
+  %tmp7765 = getelementptr inbounds float* %tmp7764, i64 1
+  %tmp7766 = getelementptr inbounds float* %tmp7765, i64 1
+  %tmp7767 = getelementptr inbounds float* %tmp7766, i64 1
+  %tmp7768 = getelementptr inbounds float* %tmp7767, i64 1
+  %tmp7769 = getelementptr inbounds float* %tmp7768, i64 1
+  %tmp7770 = getelementptr inbounds float* %tmp7769, i64 1
+  %tmp7771 = getelementptr inbounds float* %tmp7770, i64 1
+  %tmp7772 = getelementptr inbounds float* %tmp7771, i64 1
+  %tmp7773 = getelementptr inbounds float* %tmp7772, i64 1
+  %tmp7774 = getelementptr inbounds float* %tmp7773, i64 1
+  %tmp7775 = getelementptr inbounds float* %tmp7774, i64 1
+  %tmp7776 = getelementptr inbounds float* %tmp7775, i64 1
+  %tmp7777 = getelementptr inbounds float* %tmp7776, i64 1
+  %tmp7778 = getelementptr inbounds float* %tmp7777, i64 1
+  %tmp7779 = getelementptr inbounds float* %tmp7778, i64 1
+  %tmp7780 = getelementptr inbounds float* %tmp7779, i64 1
+  %tmp7781 = getelementptr inbounds float* %tmp7780, i64 1
+  %tmp7782 = getelementptr inbounds float* %tmp7781, i64 1
+  %tmp7783 = getelementptr inbounds float* %tmp7782, i64 1
+  %tmp7784 = getelementptr inbounds float* %tmp7783, i64 1
+  %tmp7785 = getelementptr inbounds float* %tmp7784, i64 1
+  %tmp7786 = getelementptr inbounds float* %tmp7785, i64 1
+  %tmp7787 = getelementptr inbounds float* %tmp7786, i64 1
+  %tmp7788 = getelementptr inbounds float* %tmp7787, i64 1
+  %tmp7789 = getelementptr inbounds float* %tmp7788, i64 1
+  %tmp7790 = getelementptr inbounds float* %tmp7789, i64 1
+  %tmp7791 = getelementptr inbounds float* %tmp7790, i64 1
+  %tmp7792 = getelementptr inbounds float* %tmp7791, i64 1
+  %tmp7793 = getelementptr inbounds float* %tmp7792, i64 1
+  %tmp7794 = getelementptr inbounds float* %tmp7793, i64 1
+  %tmp7795 = getelementptr inbounds float* %tmp7794, i64 1
+  %tmp7796 = getelementptr inbounds float* %tmp7795, i64 1
+  %tmp7797 = getelementptr inbounds float* %tmp7796, i64 1
+  %tmp7798 = getelementptr inbounds float* %tmp7797, i64 1
+  %tmp7799 = getelementptr inbounds float* %tmp7798, i64 1
+  %tmp7800 = getelementptr inbounds float* %tmp7799, i64 1
+  %tmp7801 = getelementptr inbounds float* %tmp7800, i64 1
+  %tmp7802 = getelementptr inbounds float* %tmp7801, i64 1
+  %tmp7803 = getelementptr inbounds float* %tmp7802, i64 1
+  %tmp7804 = getelementptr inbounds float* %tmp7803, i64 1
+  %tmp7805 = getelementptr inbounds float* %tmp7804, i64 1
+  %tmp7806 = getelementptr inbounds float* %tmp7805, i64 1
+  %tmp7807 = getelementptr inbounds float* %tmp7806, i64 1
+  %tmp7808 = getelementptr inbounds float* %tmp7807, i64 1
+  %tmp7809 = getelementptr inbounds float* %tmp7808, i64 1
+  %tmp7810 = getelementptr inbounds float* %tmp7809, i64 1
+  %tmp7811 = getelementptr inbounds float* %tmp7810, i64 1
+  %tmp7812 = getelementptr inbounds float* %tmp7811, i64 1
+  %tmp7813 = getelementptr inbounds float* %tmp7812, i64 1
+  %tmp7814 = getelementptr inbounds float* %tmp7813, i64 1
+  %tmp7815 = getelementptr inbounds float* %tmp7814, i64 1
+  %tmp7816 = getelementptr inbounds float* %tmp7815, i64 1
+  %tmp7817 = getelementptr inbounds float* %tmp7816, i64 1
+  %tmp7818 = getelementptr inbounds float* %tmp7817, i64 1
+  %tmp7819 = getelementptr inbounds float* %tmp7818, i64 1
+  %tmp7820 = getelementptr inbounds float* %tmp7819, i64 1
+  %tmp7821 = getelementptr inbounds float* %tmp7820, i64 1
+  %tmp7822 = getelementptr inbounds float* %tmp7821, i64 1
+  %tmp7823 = getelementptr inbounds float* %tmp7822, i64 1
+  %tmp7824 = getelementptr inbounds float* %tmp7823, i64 1
+  %tmp7825 = getelementptr inbounds float* %tmp7824, i64 1
+  %tmp7826 = getelementptr inbounds float* %tmp7825, i64 1
+  %tmp7827 = getelementptr inbounds float* %tmp7826, i64 1
+  %tmp7828 = getelementptr inbounds float* %tmp7827, i64 1
+  %tmp7829 = getelementptr inbounds float* %tmp7828, i64 1
+  %tmp7830 = getelementptr inbounds float* %tmp7829, i64 1
+  %tmp7831 = getelementptr inbounds float* %tmp7830, i64 1
+  %tmp7832 = getelementptr inbounds float* %tmp7831, i64 1
+  %tmp7833 = getelementptr inbounds float* %tmp7832, i64 1
+  %tmp7834 = getelementptr inbounds float* %tmp7833, i64 1
+  %tmp7835 = getelementptr inbounds float* %tmp7834, i64 1
+  %tmp7836 = getelementptr inbounds float* %tmp7835, i64 1
+  %tmp7837 = getelementptr inbounds float* %tmp7836, i64 1
+  %tmp7838 = getelementptr inbounds float* %tmp7837, i64 1
+  %tmp7839 = getelementptr inbounds float* %tmp7838, i64 1
+  %tmp7840 = getelementptr inbounds float* %tmp7839, i64 1
+  %tmp7841 = getelementptr inbounds float* %tmp7840, i64 1
+  %tmp7842 = getelementptr inbounds float* %tmp7841, i64 1
+  %tmp7843 = getelementptr inbounds float* %tmp7842, i64 1
+  %tmp7844 = getelementptr inbounds float* %tmp7843, i64 1
+  %tmp7845 = getelementptr inbounds float* %tmp7844, i64 1
+  %tmp7846 = getelementptr inbounds float* %tmp7845, i64 1
+  %tmp7847 = getelementptr inbounds float* %tmp7846, i64 1
+  %tmp7848 = getelementptr inbounds float* %tmp7847, i64 1
+  %tmp7849 = getelementptr inbounds float* %tmp7848, i64 1
+  %tmp7850 = getelementptr inbounds float* %tmp7849, i64 1
+  %tmp7851 = getelementptr inbounds float* %tmp7850, i64 1
+  %tmp7852 = getelementptr inbounds float* %tmp7851, i64 1
+  %tmp7853 = getelementptr inbounds float* %tmp7852, i64 1
+  %tmp7854 = getelementptr inbounds float* %tmp7853, i64 1
+  %tmp7855 = getelementptr inbounds float* %tmp7854, i64 1
+  %tmp7856 = getelementptr inbounds float* %tmp7855, i64 1
+  %tmp7857 = getelementptr inbounds float* %tmp7856, i64 1
+  %tmp7858 = getelementptr inbounds float* %tmp7857, i64 1
+  %tmp7859 = getelementptr inbounds float* %tmp7858, i64 1
+  %tmp7860 = getelementptr inbounds float* %tmp7859, i64 1
+  %tmp7861 = getelementptr inbounds float* %tmp7860, i64 1
+  %tmp7862 = getelementptr inbounds float* %tmp7861, i64 1
+  %tmp7863 = getelementptr inbounds float* %tmp7862, i64 1
+  %tmp7864 = getelementptr inbounds float* %tmp7863, i64 1
+  %tmp7865 = getelementptr inbounds float* %tmp7864, i64 1
+  %tmp7866 = getelementptr inbounds float* %tmp7865, i64 1
+  %tmp7867 = getelementptr inbounds float* %tmp7866, i64 1
+  %tmp7868 = getelementptr inbounds float* %tmp7867, i64 1
+  %tmp7869 = getelementptr inbounds float* %tmp7868, i64 1
+  %tmp7870 = getelementptr inbounds float* %tmp7869, i64 1
+  %tmp7871 = getelementptr inbounds float* %tmp7870, i64 1
+  %tmp7872 = getelementptr inbounds float* %tmp7871, i64 1
+  %tmp7873 = getelementptr inbounds float* %tmp7872, i64 1
+  %tmp7874 = getelementptr inbounds float* %tmp7873, i64 1
+  %tmp7875 = getelementptr inbounds float* %tmp7874, i64 1
+  %tmp7876 = getelementptr inbounds float* %tmp7875, i64 1
+  %tmp7877 = getelementptr inbounds float* %tmp7876, i64 1
+  %tmp7878 = getelementptr inbounds float* %tmp7877, i64 1
+  %tmp7879 = getelementptr inbounds float* %tmp7878, i64 1
+  %tmp7880 = getelementptr inbounds float* %tmp7879, i64 1
+  %tmp7881 = getelementptr inbounds float* %tmp7880, i64 1
+  %tmp7882 = getelementptr inbounds float* %tmp7881, i64 1
+  %tmp7883 = getelementptr inbounds float* %tmp7882, i64 1
+  %tmp7884 = getelementptr inbounds float* %tmp7883, i64 1
+  %tmp7885 = getelementptr inbounds float* %tmp7884, i64 1
+  %tmp7886 = getelementptr inbounds float* %tmp7885, i64 1
+  %tmp7887 = getelementptr inbounds float* %tmp7886, i64 1
+  %tmp7888 = getelementptr inbounds float* %tmp7887, i64 1
+  %tmp7889 = getelementptr inbounds float* %tmp7888, i64 1
+  %tmp7890 = getelementptr inbounds float* %tmp7889, i64 1
+  %tmp7891 = getelementptr inbounds float* %tmp7890, i64 1
+  %tmp7892 = getelementptr inbounds float* %tmp7891, i64 1
+  %tmp7893 = getelementptr inbounds float* %tmp7892, i64 1
+  %tmp7894 = getelementptr inbounds float* %tmp7893, i64 1
+  %tmp7895 = getelementptr inbounds float* %tmp7894, i64 1
+  %tmp7896 = getelementptr inbounds float* %tmp7895, i64 1
+  %tmp7897 = getelementptr inbounds float* %tmp7896, i64 1
+  %tmp7898 = getelementptr inbounds float* %tmp7897, i64 1
+  %tmp7899 = getelementptr inbounds float* %tmp7898, i64 1
+  %tmp7900 = getelementptr inbounds float* %tmp7899, i64 1
+  %tmp7901 = getelementptr inbounds float* %tmp7900, i64 1
+  %tmp7902 = getelementptr inbounds float* %tmp7901, i64 1
+  %tmp7903 = getelementptr inbounds float* %tmp7902, i64 1
+  %tmp7904 = getelementptr inbounds float* %tmp7903, i64 1
+  %tmp7905 = getelementptr inbounds float* %tmp7904, i64 1
+  %tmp7906 = getelementptr inbounds float* %tmp7905, i64 1
+  %tmp7907 = getelementptr inbounds float* %tmp7906, i64 1
+  %tmp7908 = getelementptr inbounds float* %tmp7907, i64 1
+  %tmp7909 = getelementptr inbounds float* %tmp7908, i64 1
+  %tmp7910 = getelementptr inbounds float* %tmp7909, i64 1
+  %tmp7911 = getelementptr inbounds float* %tmp7910, i64 1
+  %tmp7912 = getelementptr inbounds float* %tmp7911, i64 1
+  %tmp7913 = getelementptr inbounds float* %tmp7912, i64 1
+  %tmp7914 = getelementptr inbounds float* %tmp7913, i64 1
+  %tmp7915 = getelementptr inbounds float* %tmp7914, i64 1
+  %tmp7916 = getelementptr inbounds float* %tmp7915, i64 1
+  %tmp7917 = getelementptr inbounds float* %tmp7916, i64 1
+  %tmp7918 = getelementptr inbounds float* %tmp7917, i64 1
+  %tmp7919 = getelementptr inbounds float* %tmp7918, i64 1
+  %tmp7920 = getelementptr inbounds float* %tmp7919, i64 1
+  %tmp7921 = getelementptr inbounds float* %tmp7920, i64 1
+  %tmp7922 = getelementptr inbounds float* %tmp7921, i64 1
+  %tmp7923 = getelementptr inbounds float* %tmp7922, i64 1
+  %tmp7924 = getelementptr inbounds float* %tmp7923, i64 1
+  %tmp7925 = getelementptr inbounds float* %tmp7924, i64 1
+  %tmp7926 = getelementptr inbounds float* %tmp7925, i64 1
+  %tmp7927 = getelementptr inbounds float* %tmp7926, i64 1
+  %tmp7928 = getelementptr inbounds float* %tmp7927, i64 1
+  %tmp7929 = getelementptr inbounds float* %tmp7928, i64 1
+  %tmp7930 = getelementptr inbounds float* %tmp7929, i64 1
+  %tmp7931 = getelementptr inbounds float* %tmp7930, i64 1
+  %tmp7932 = getelementptr inbounds float* %tmp7931, i64 1
+  %tmp7933 = getelementptr inbounds float* %tmp7932, i64 1
+  %tmp7934 = getelementptr inbounds float* %tmp7933, i64 1
+  %tmp7935 = getelementptr inbounds float* %tmp7934, i64 1
+  %tmp7936 = getelementptr inbounds float* %tmp7935, i64 1
+  %tmp7937 = getelementptr inbounds float* %tmp7936, i64 1
+  %tmp7938 = getelementptr inbounds float* %tmp7937, i64 1
+  %tmp7939 = getelementptr inbounds float* %tmp7938, i64 1
+  %tmp7940 = getelementptr inbounds float* %tmp7939, i64 1
+  %tmp7941 = getelementptr inbounds float* %tmp7940, i64 1
+  %tmp7942 = getelementptr inbounds float* %tmp7941, i64 1
+  %tmp7943 = getelementptr inbounds float* %tmp7942, i64 1
+  %tmp7944 = getelementptr inbounds float* %tmp7943, i64 1
+  %tmp7945 = getelementptr inbounds float* %tmp7944, i64 1
+  %tmp7946 = getelementptr inbounds float* %tmp7945, i64 1
+  %tmp7947 = getelementptr inbounds float* %tmp7946, i64 1
+  %tmp7948 = getelementptr inbounds float* %tmp7947, i64 1
+  %tmp7949 = getelementptr inbounds float* %tmp7948, i64 1
+  %tmp7950 = getelementptr inbounds float* %tmp7949, i64 1
+  %tmp7951 = getelementptr inbounds float* %tmp7950, i64 1
+  %tmp7952 = getelementptr inbounds float* %tmp7951, i64 1
+  %tmp7953 = getelementptr inbounds float* %tmp7952, i64 1
+  %tmp7954 = getelementptr inbounds float* %tmp7953, i64 1
+  %tmp7955 = getelementptr inbounds float* %tmp7954, i64 1
+  %tmp7956 = getelementptr inbounds float* %tmp7955, i64 1
+  %tmp7957 = getelementptr inbounds float* %tmp7956, i64 1
+  %tmp7958 = getelementptr inbounds float* %tmp7957, i64 1
+  %tmp7959 = getelementptr inbounds float* %tmp7958, i64 1
+  %tmp7960 = getelementptr inbounds float* %tmp7959, i64 1
+  %tmp7961 = getelementptr inbounds float* %tmp7960, i64 1
+  %tmp7962 = getelementptr inbounds float* %tmp7961, i64 1
+  %tmp7963 = getelementptr inbounds float* %tmp7962, i64 1
+  %tmp7964 = getelementptr inbounds float* %tmp7963, i64 1
+  %tmp7965 = getelementptr inbounds float* %tmp7964, i64 1
+  %tmp7966 = getelementptr inbounds float* %tmp7965, i64 1
+  %tmp7967 = getelementptr inbounds float* %tmp7966, i64 1
+  %tmp7968 = getelementptr inbounds float* %tmp7967, i64 1
+  %tmp7969 = getelementptr inbounds float* %tmp7968, i64 1
+  %tmp7970 = getelementptr inbounds float* %tmp7969, i64 1
+  %tmp7971 = getelementptr inbounds float* %tmp7970, i64 1
+  %tmp7972 = getelementptr inbounds float* %tmp7971, i64 1
+  %tmp7973 = getelementptr inbounds float* %tmp7972, i64 1
+  %tmp7974 = getelementptr inbounds float* %tmp7973, i64 1
+  %tmp7975 = getelementptr inbounds float* %tmp7974, i64 1
+  %tmp7976 = getelementptr inbounds float* %tmp7975, i64 1
+  %tmp7977 = getelementptr inbounds float* %tmp7976, i64 1
+  %tmp7978 = getelementptr inbounds float* %tmp7977, i64 1
+  %tmp7979 = getelementptr inbounds float* %tmp7978, i64 1
+  %tmp7980 = getelementptr inbounds float* %tmp7979, i64 1
+  %tmp7981 = getelementptr inbounds float* %tmp7980, i64 1
+  %tmp7982 = getelementptr inbounds float* %tmp7981, i64 1
+  %tmp7983 = getelementptr inbounds float* %tmp7982, i64 1
+  %tmp7984 = getelementptr inbounds float* %tmp7983, i64 1
+  %tmp7985 = getelementptr inbounds float* %tmp7984, i64 1
+  %tmp7986 = getelementptr inbounds float* %tmp7985, i64 1
+  %tmp7987 = getelementptr inbounds float* %tmp7986, i64 1
+  %tmp7988 = getelementptr inbounds float* %tmp7987, i64 1
+  %tmp7989 = getelementptr inbounds float* %tmp7988, i64 1
+  %tmp7990 = getelementptr inbounds float* %tmp7989, i64 1
+  %tmp7991 = getelementptr inbounds float* %tmp7990, i64 1
+  %tmp7992 = getelementptr inbounds float* %tmp7991, i64 1
+  %tmp7993 = getelementptr inbounds float* %tmp7992, i64 1
+  %tmp7994 = getelementptr inbounds float* %tmp7993, i64 1
+  %tmp7995 = getelementptr inbounds float* %tmp7994, i64 1
+  %tmp7996 = getelementptr inbounds float* %tmp7995, i64 1
+  %tmp7997 = getelementptr inbounds float* %tmp7996, i64 1
+  %tmp7998 = getelementptr inbounds float* %tmp7997, i64 1
+  %tmp7999 = getelementptr inbounds float* %tmp7998, i64 1
+  %tmp8000 = getelementptr inbounds float* %tmp7999, i64 1
+  %tmp8001 = getelementptr inbounds float* %tmp8000, i64 1
+  %tmp8002 = getelementptr inbounds float* %tmp8001, i64 1
+  %tmp8003 = getelementptr inbounds float* %tmp8002, i64 1
+  %tmp8004 = getelementptr inbounds float* %tmp8003, i64 1
+  %tmp8005 = getelementptr inbounds float* %tmp8004, i64 1
+  %tmp8006 = getelementptr inbounds float* %tmp8005, i64 1
+  %tmp8007 = getelementptr inbounds float* %tmp8006, i64 1
+  %tmp8008 = getelementptr inbounds float* %tmp8007, i64 1
+  %tmp8009 = getelementptr inbounds float* %tmp8008, i64 1
+  %tmp8010 = getelementptr inbounds float* %tmp8009, i64 1
+  %tmp8011 = getelementptr inbounds float* %tmp8010, i64 1
+  %tmp8012 = getelementptr inbounds float* %tmp8011, i64 1
+  %tmp8013 = getelementptr inbounds float* %tmp8012, i64 1
+  %tmp8014 = getelementptr inbounds float* %tmp8013, i64 1
+  %tmp8015 = getelementptr inbounds float* %tmp8014, i64 1
+  %tmp8016 = getelementptr inbounds float* %tmp8015, i64 1
+  %tmp8017 = getelementptr inbounds float* %tmp8016, i64 1
+  %tmp8018 = getelementptr inbounds float* %tmp8017, i64 1
+  %tmp8019 = getelementptr inbounds float* %tmp8018, i64 1
+  %tmp8020 = getelementptr inbounds float* %tmp8019, i64 1
+  %tmp8021 = getelementptr inbounds float* %tmp8020, i64 1
+  %tmp8022 = getelementptr inbounds float* %tmp8021, i64 1
+  %tmp8023 = getelementptr inbounds float* %tmp8022, i64 1
+  %tmp8024 = getelementptr inbounds float* %tmp8023, i64 1
+  %tmp8025 = getelementptr inbounds float* %tmp8024, i64 1
+  %tmp8026 = getelementptr inbounds float* %tmp8025, i64 1
+  %tmp8027 = getelementptr inbounds float* %tmp8026, i64 1
+  %tmp8028 = getelementptr inbounds float* %tmp8027, i64 1
+  %tmp8029 = getelementptr inbounds float* %tmp8028, i64 1
+  %tmp8030 = getelementptr inbounds float* %tmp8029, i64 1
+  %tmp8031 = getelementptr inbounds float* %tmp8030, i64 1
+  %tmp8032 = getelementptr inbounds float* %tmp8031, i64 1
+  %tmp8033 = getelementptr inbounds float* %tmp8032, i64 1
+  %tmp8034 = getelementptr inbounds float* %tmp8033, i64 1
+  %tmp8035 = getelementptr inbounds float* %tmp8034, i64 1
+  %tmp8036 = getelementptr inbounds float* %tmp8035, i64 1
+  %tmp8037 = getelementptr inbounds float* %tmp8036, i64 1
+  %tmp8038 = getelementptr inbounds float* %tmp8037, i64 1
+  %tmp8039 = getelementptr inbounds float* %tmp8038, i64 1
+  %tmp8040 = getelementptr inbounds float* %tmp8039, i64 1
+  %tmp8041 = getelementptr inbounds float* %tmp8040, i64 1
+  %tmp8042 = getelementptr inbounds float* %tmp8041, i64 1
+  %tmp8043 = getelementptr inbounds float* %tmp8042, i64 1
+  %tmp8044 = getelementptr inbounds float* %tmp8043, i64 1
+  %tmp8045 = getelementptr inbounds float* %tmp8044, i64 1
+  %tmp8046 = getelementptr inbounds float* %tmp8045, i64 1
+  %tmp8047 = getelementptr inbounds float* %tmp8046, i64 1
+  %tmp8048 = getelementptr inbounds float* %tmp8047, i64 1
+  %tmp8049 = getelementptr inbounds float* %tmp8048, i64 1
+  %tmp8050 = getelementptr inbounds float* %tmp8049, i64 1
+  %tmp8051 = getelementptr inbounds float* %tmp8050, i64 1
+  %tmp8052 = getelementptr inbounds float* %tmp8051, i64 1
+  %tmp8053 = getelementptr inbounds float* %tmp8052, i64 1
+  %tmp8054 = getelementptr inbounds float* %tmp8053, i64 1
+  %tmp8055 = getelementptr inbounds float* %tmp8054, i64 1
+  %tmp8056 = getelementptr inbounds float* %tmp8055, i64 1
+  %tmp8057 = getelementptr inbounds float* %tmp8056, i64 1
+  %tmp8058 = getelementptr inbounds float* %tmp8057, i64 1
+  %tmp8059 = getelementptr inbounds float* %tmp8058, i64 1
+  %tmp8060 = getelementptr inbounds float* %tmp8059, i64 1
+  %tmp8061 = getelementptr inbounds float* %tmp8060, i64 1
+  %tmp8062 = getelementptr inbounds float* %tmp8061, i64 1
+  %tmp8063 = getelementptr inbounds float* %tmp8062, i64 1
+  %tmp8064 = getelementptr inbounds float* %tmp8063, i64 1
+  %tmp8065 = getelementptr inbounds float* %tmp8064, i64 1
+  %tmp8066 = getelementptr inbounds float* %tmp8065, i64 1
+  %tmp8067 = getelementptr inbounds float* %tmp8066, i64 1
+  %tmp8068 = getelementptr inbounds float* %tmp8067, i64 1
+  %tmp8069 = getelementptr inbounds float* %tmp8068, i64 1
+  %tmp8070 = getelementptr inbounds float* %tmp8069, i64 1
+  %tmp8071 = getelementptr inbounds float* %tmp8070, i64 1
+  %tmp8072 = getelementptr inbounds float* %tmp8071, i64 1
+  %tmp8073 = getelementptr inbounds float* %tmp8072, i64 1
+  %tmp8074 = getelementptr inbounds float* %tmp8073, i64 1
+  %tmp8075 = getelementptr inbounds float* %tmp8074, i64 1
+  %tmp8076 = getelementptr inbounds float* %tmp8075, i64 1
+  %tmp8077 = getelementptr inbounds float* %tmp8076, i64 1
+  %tmp8078 = getelementptr inbounds float* %tmp8077, i64 1
+  %tmp8079 = getelementptr inbounds float* %tmp8078, i64 1
+  %tmp8080 = getelementptr inbounds float* %tmp8079, i64 1
+  %tmp8081 = getelementptr inbounds float* %tmp8080, i64 1
+  %tmp8082 = getelementptr inbounds float* %tmp8081, i64 1
+  %tmp8083 = getelementptr inbounds float* %tmp8082, i64 1
+  %tmp8084 = getelementptr inbounds float* %tmp8083, i64 1
+  %tmp8085 = getelementptr inbounds float* %tmp8084, i64 1
+  %tmp8086 = getelementptr inbounds float* %tmp8085, i64 1
+  %tmp8087 = getelementptr inbounds float* %tmp8086, i64 1
+  %tmp8088 = getelementptr inbounds float* %tmp8087, i64 1
+  %tmp8089 = getelementptr inbounds float* %tmp8088, i64 1
+  %tmp8090 = getelementptr inbounds float* %tmp8089, i64 1
+  %tmp8091 = getelementptr inbounds float* %tmp8090, i64 1
+  %tmp8092 = getelementptr inbounds float* %tmp8091, i64 1
+  %tmp8093 = getelementptr inbounds float* %tmp8092, i64 1
+  %tmp8094 = getelementptr inbounds float* %tmp8093, i64 1
+  %tmp8095 = getelementptr inbounds float* %tmp8094, i64 1
+  %tmp8096 = getelementptr inbounds float* %tmp8095, i64 1
+  %tmp8097 = getelementptr inbounds float* %tmp8096, i64 1
+  %tmp8098 = getelementptr inbounds float* %tmp8097, i64 1
+  %tmp8099 = getelementptr inbounds float* %tmp8098, i64 1
+  %tmp8100 = getelementptr inbounds float* %tmp8099, i64 1
+  %tmp8101 = getelementptr inbounds float* %tmp8100, i64 1
+  %tmp8102 = getelementptr inbounds float* %tmp8101, i64 1
+  %tmp8103 = getelementptr inbounds float* %tmp8102, i64 1
+  %tmp8104 = getelementptr inbounds float* %tmp8103, i64 1
+  %tmp8105 = getelementptr inbounds float* %tmp8104, i64 1
+  %tmp8106 = getelementptr inbounds float* %tmp8105, i64 1
+  %tmp8107 = getelementptr inbounds float* %tmp8106, i64 1
+  %tmp8108 = getelementptr inbounds float* %tmp8107, i64 1
+  %tmp8109 = getelementptr inbounds float* %tmp8108, i64 1
+  %tmp8110 = getelementptr inbounds float* %tmp8109, i64 1
+  %tmp8111 = getelementptr inbounds float* %tmp8110, i64 1
+  %tmp8112 = getelementptr inbounds float* %tmp8111, i64 1
+  %tmp8113 = getelementptr inbounds float* %tmp8112, i64 1
+  %tmp8114 = getelementptr inbounds float* %tmp8113, i64 1
+  %tmp8115 = getelementptr inbounds float* %tmp8114, i64 1
+  %tmp8116 = getelementptr inbounds float* %tmp8115, i64 1
+  %tmp8117 = getelementptr inbounds float* %tmp8116, i64 1
+  %tmp8118 = getelementptr inbounds float* %tmp8117, i64 1
+  %tmp8119 = getelementptr inbounds float* %tmp8118, i64 1
+  %tmp8120 = getelementptr inbounds float* %tmp8119, i64 1
+  %tmp8121 = getelementptr inbounds float* %tmp8120, i64 1
+  %tmp8122 = getelementptr inbounds float* %tmp8121, i64 1
+  %tmp8123 = getelementptr inbounds float* %tmp8122, i64 1
+  %tmp8124 = getelementptr inbounds float* %tmp8123, i64 1
+  %tmp8125 = getelementptr inbounds float* %tmp8124, i64 1
+  %tmp8126 = getelementptr inbounds float* %tmp8125, i64 1
+  %tmp8127 = getelementptr inbounds float* %tmp8126, i64 1
+  %tmp8128 = getelementptr inbounds float* %tmp8127, i64 1
+  %tmp8129 = getelementptr inbounds float* %tmp8128, i64 1
+  %tmp8130 = getelementptr inbounds float* %tmp8129, i64 1
+  %tmp8131 = getelementptr inbounds float* %tmp8130, i64 1
+  %tmp8132 = getelementptr inbounds float* %tmp8131, i64 1
+  %tmp8133 = getelementptr inbounds float* %tmp8132, i64 1
+  %tmp8134 = getelementptr inbounds float* %tmp8133, i64 1
+  %tmp8135 = getelementptr inbounds float* %tmp8134, i64 1
+  %tmp8136 = getelementptr inbounds float* %tmp8135, i64 1
+  %tmp8137 = getelementptr inbounds float* %tmp8136, i64 1
+  %tmp8138 = getelementptr inbounds float* %tmp8137, i64 1
+  %tmp8139 = getelementptr inbounds float* %tmp8138, i64 1
+  %tmp8140 = getelementptr inbounds float* %tmp8139, i64 1
+  %tmp8141 = getelementptr inbounds float* %tmp8140, i64 1
+  %tmp8142 = getelementptr inbounds float* %tmp8141, i64 1
+  %tmp8143 = getelementptr inbounds float* %tmp8142, i64 1
+  %tmp8144 = getelementptr inbounds float* %tmp8143, i64 1
+  %tmp8145 = getelementptr inbounds float* %tmp8144, i64 1
+  %tmp8146 = getelementptr inbounds float* %tmp8145, i64 1
+  %tmp8147 = getelementptr inbounds float* %tmp8146, i64 1
+  %tmp8148 = getelementptr inbounds float* %tmp8147, i64 1
+  %tmp8149 = getelementptr inbounds float* %tmp8148, i64 1
+  %tmp8150 = getelementptr inbounds float* %tmp8149, i64 1
+  %tmp8151 = getelementptr inbounds float* %tmp8150, i64 1
+  %tmp8152 = getelementptr inbounds float* %tmp8151, i64 1
+  %tmp8153 = getelementptr inbounds float* %tmp8152, i64 1
+  %tmp8154 = getelementptr inbounds float* %tmp8153, i64 1
+  %tmp8155 = getelementptr inbounds float* %tmp8154, i64 1
+  %tmp8156 = getelementptr inbounds float* %tmp8155, i64 1
+  %tmp8157 = getelementptr inbounds float* %tmp8156, i64 1
+  %tmp8158 = getelementptr inbounds float* %tmp8157, i64 1
+  %tmp8159 = getelementptr inbounds float* %tmp8158, i64 1
+  %tmp8160 = getelementptr inbounds float* %tmp8159, i64 1
+  %tmp8161 = getelementptr inbounds float* %tmp8160, i64 1
+  %tmp8162 = getelementptr inbounds float* %tmp8161, i64 1
+  %tmp8163 = getelementptr inbounds float* %tmp8162, i64 1
+  %tmp8164 = getelementptr inbounds float* %tmp8163, i64 1
+  %tmp8165 = getelementptr inbounds float* %tmp8164, i64 1
+  %tmp8166 = getelementptr inbounds float* %tmp8165, i64 1
+  %tmp8167 = getelementptr inbounds float* %tmp8166, i64 1
+  %tmp8168 = getelementptr inbounds float* %tmp8167, i64 1
+  %tmp8169 = getelementptr inbounds float* %tmp8168, i64 1
+  %tmp8170 = getelementptr inbounds float* %tmp8169, i64 1
+  %tmp8171 = getelementptr inbounds float* %tmp8170, i64 1
+  %tmp8172 = getelementptr inbounds float* %tmp8171, i64 1
+  %tmp8173 = getelementptr inbounds float* %tmp8172, i64 1
+  %tmp8174 = getelementptr inbounds float* %tmp8173, i64 1
+  %tmp8175 = getelementptr inbounds float* %tmp8174, i64 1
+  %tmp8176 = getelementptr inbounds float* %tmp8175, i64 1
+  %tmp8177 = getelementptr inbounds float* %tmp8176, i64 1
+  %tmp8178 = getelementptr inbounds float* %tmp8177, i64 1
+  %tmp8179 = getelementptr inbounds float* %tmp8178, i64 1
+  %tmp8180 = getelementptr inbounds float* %tmp8179, i64 1
+  %tmp8181 = getelementptr inbounds float* %tmp8180, i64 1
+  %tmp8182 = getelementptr inbounds float* %tmp8181, i64 1
+  %tmp8183 = getelementptr inbounds float* %tmp8182, i64 1
+  %tmp8184 = getelementptr inbounds float* %tmp8183, i64 1
+  %tmp8185 = getelementptr inbounds float* %tmp8184, i64 1
+  %tmp8186 = getelementptr inbounds float* %tmp8185, i64 1
+  %tmp8187 = getelementptr inbounds float* %tmp8186, i64 1
+  %tmp8188 = getelementptr inbounds float* %tmp8187, i64 1
+  %tmp8189 = getelementptr inbounds float* %tmp8188, i64 1
+  %tmp8190 = getelementptr inbounds float* %tmp8189, i64 1
+  %tmp8191 = getelementptr inbounds float* %tmp8190, i64 1
+  %tmp8192 = getelementptr inbounds float* %tmp8191, i64 1
+  %tmp8193 = getelementptr inbounds float* %tmp8192, i64 1
+  %tmp8194 = getelementptr inbounds float* %tmp8193, i64 1
+  %tmp8195 = getelementptr inbounds float* %tmp8194, i64 1
+  %tmp8196 = getelementptr inbounds float* %tmp8195, i64 1
+  %tmp8197 = getelementptr inbounds float* %tmp8196, i64 1
+  %tmp8198 = getelementptr inbounds float* %tmp8197, i64 1
+  %tmp8199 = getelementptr inbounds float* %tmp8198, i64 1
+  %tmp8200 = getelementptr inbounds float* %tmp8199, i64 1
+  %tmp8201 = getelementptr inbounds float* %tmp8200, i64 1
+  %tmp8202 = getelementptr inbounds float* %tmp8201, i64 1
+  %tmp8203 = getelementptr inbounds float* %tmp8202, i64 1
+  %tmp8204 = getelementptr inbounds float* %tmp8203, i64 1
+  %tmp8205 = getelementptr inbounds float* %tmp8204, i64 1
+  %tmp8206 = getelementptr inbounds float* %tmp8205, i64 1
+  %tmp8207 = getelementptr inbounds float* %tmp8206, i64 1
+  %tmp8208 = getelementptr inbounds float* %tmp8207, i64 1
+  %tmp8209 = getelementptr inbounds float* %tmp8208, i64 1
+  %tmp8210 = getelementptr inbounds float* %tmp8209, i64 1
+  %tmp8211 = getelementptr inbounds float* %tmp8210, i64 1
+  %tmp8212 = getelementptr inbounds float* %tmp8211, i64 1
+  %tmp8213 = getelementptr inbounds float* %tmp8212, i64 1
+  %tmp8214 = getelementptr inbounds float* %tmp8213, i64 1
+  %tmp8215 = getelementptr inbounds float* %tmp8214, i64 1
+  %tmp8216 = getelementptr inbounds float* %tmp8215, i64 1
+  %tmp8217 = getelementptr inbounds float* %tmp8216, i64 1
+  %tmp8218 = getelementptr inbounds float* %tmp8217, i64 1
+  %tmp8219 = getelementptr inbounds float* %tmp8218, i64 1
+  %tmp8220 = getelementptr inbounds float* %tmp8219, i64 1
+  %tmp8221 = getelementptr inbounds float* %tmp8220, i64 1
+  %tmp8222 = getelementptr inbounds float* %tmp8221, i64 1
+  %tmp8223 = getelementptr inbounds float* %tmp8222, i64 1
+  %tmp8224 = getelementptr inbounds float* %tmp8223, i64 1
+  %tmp8225 = getelementptr inbounds float* %tmp8224, i64 1
+  %tmp8226 = getelementptr inbounds float* %tmp8225, i64 1
+  %tmp8227 = getelementptr inbounds float* %tmp8226, i64 1
+  %tmp8228 = getelementptr inbounds float* %tmp8227, i64 1
+  %tmp8229 = getelementptr inbounds float* %tmp8228, i64 1
+  %tmp8230 = getelementptr inbounds float* %tmp8229, i64 1
+  %tmp8231 = getelementptr inbounds float* %tmp8230, i64 1
+  %tmp8232 = getelementptr inbounds float* %tmp8231, i64 1
+  %tmp8233 = getelementptr inbounds float* %tmp8232, i64 1
+  %tmp8234 = getelementptr inbounds float* %tmp8233, i64 1
+  %tmp8235 = getelementptr inbounds float* %tmp8234, i64 1
+  %tmp8236 = getelementptr inbounds float* %tmp8235, i64 1
+  %tmp8237 = getelementptr inbounds float* %tmp8236, i64 1
+  %tmp8238 = getelementptr inbounds float* %tmp8237, i64 1
+  %tmp8239 = getelementptr inbounds float* %tmp8238, i64 1
+  %tmp8240 = getelementptr inbounds float* %tmp8239, i64 1
+  %tmp8241 = getelementptr inbounds float* %tmp8240, i64 1
+  %tmp8242 = getelementptr inbounds float* %tmp8241, i64 1
+  %tmp8243 = getelementptr inbounds float* %tmp8242, i64 1
+  %tmp8244 = getelementptr inbounds float* %tmp8243, i64 1
+  %tmp8245 = getelementptr inbounds float* %tmp8244, i64 1
+  %tmp8246 = getelementptr inbounds float* %tmp8245, i64 1
+  %tmp8247 = getelementptr inbounds float* %tmp8246, i64 1
+  %tmp8248 = getelementptr inbounds float* %tmp8247, i64 1
+  %tmp8249 = getelementptr inbounds float* %tmp8248, i64 1
+  %tmp8250 = getelementptr inbounds float* %tmp8249, i64 1
+  %tmp8251 = getelementptr inbounds float* %tmp8250, i64 1
+  %tmp8252 = getelementptr inbounds float* %tmp8251, i64 1
+  %tmp8253 = getelementptr inbounds float* %tmp8252, i64 1
+  %tmp8254 = getelementptr inbounds float* %tmp8253, i64 1
+  %tmp8255 = getelementptr inbounds float* %tmp8254, i64 1
+  %tmp8256 = getelementptr inbounds float* %tmp8255, i64 1
+  %tmp8257 = getelementptr inbounds float* %tmp8256, i64 1
+  %tmp8258 = getelementptr inbounds float* %tmp8257, i64 1
+  %tmp8259 = getelementptr inbounds float* %tmp8258, i64 1
+  %tmp8260 = getelementptr inbounds float* %tmp8259, i64 1
+  %tmp8261 = getelementptr inbounds float* %tmp8260, i64 1
+  %tmp8262 = getelementptr inbounds float* %tmp8261, i64 1
+  %tmp8263 = getelementptr inbounds float* %tmp8262, i64 1
+  %tmp8264 = getelementptr inbounds float* %tmp8263, i64 1
+  %tmp8265 = getelementptr inbounds float* %tmp8264, i64 1
+  %tmp8266 = getelementptr inbounds float* %tmp8265, i64 1
+  %tmp8267 = getelementptr inbounds float* %tmp8266, i64 1
+  %tmp8268 = getelementptr inbounds float* %tmp8267, i64 1
+  %tmp8269 = getelementptr inbounds float* %tmp8268, i64 1
+  %tmp8270 = getelementptr inbounds float* %tmp8269, i64 1
+  %tmp8271 = getelementptr inbounds float* %tmp8270, i64 1
+  %tmp8272 = getelementptr inbounds float* %tmp8271, i64 1
+  %tmp8273 = getelementptr inbounds float* %tmp8272, i64 1
+  %tmp8274 = getelementptr inbounds float* %tmp8273, i64 1
+  %tmp8275 = getelementptr inbounds float* %tmp8274, i64 1
+  %tmp8276 = getelementptr inbounds float* %tmp8275, i64 1
+  %tmp8277 = getelementptr inbounds float* %tmp8276, i64 1
+  %tmp8278 = getelementptr inbounds float* %tmp8277, i64 1
+  %tmp8279 = getelementptr inbounds float* %tmp8278, i64 1
+  %tmp8280 = getelementptr inbounds float* %tmp8279, i64 1
+  %tmp8281 = getelementptr inbounds float* %tmp8280, i64 1
+  %tmp8282 = getelementptr inbounds float* %tmp8281, i64 1
+  %tmp8283 = getelementptr inbounds float* %tmp8282, i64 1
+  %tmp8284 = getelementptr inbounds float* %tmp8283, i64 1
+  %tmp8285 = getelementptr inbounds float* %tmp8284, i64 1
+  %tmp8286 = getelementptr inbounds float* %tmp8285, i64 1
+  %tmp8287 = getelementptr inbounds float* %tmp8286, i64 1
+  %tmp8288 = getelementptr inbounds float* %tmp8287, i64 1
+  %tmp8289 = getelementptr inbounds float* %tmp8288, i64 1
+  %tmp8290 = getelementptr inbounds float* %tmp8289, i64 1
+  %tmp8291 = getelementptr inbounds float* %tmp8290, i64 1
+  %tmp8292 = getelementptr inbounds float* %tmp8291, i64 1
+  %tmp8293 = getelementptr inbounds float* %tmp8292, i64 1
+  %tmp8294 = getelementptr inbounds float* %tmp8293, i64 1
+  %tmp8295 = getelementptr inbounds float* %tmp8294, i64 1
+  %tmp8296 = getelementptr inbounds float* %tmp8295, i64 1
+  %tmp8297 = getelementptr inbounds float* %tmp8296, i64 1
+  %tmp8298 = getelementptr inbounds float* %tmp8297, i64 1
+  %tmp8299 = getelementptr inbounds float* %tmp8298, i64 1
+  %tmp8300 = getelementptr inbounds float* %tmp8299, i64 1
+  %tmp8301 = getelementptr inbounds float* %tmp8300, i64 1
+  %tmp8302 = getelementptr inbounds float* %tmp8301, i64 1
+  %tmp8303 = getelementptr inbounds float* %tmp8302, i64 1
+  %tmp8304 = getelementptr inbounds float* %tmp8303, i64 1
+  %tmp8305 = getelementptr inbounds float* %tmp8304, i64 1
+  %tmp8306 = getelementptr inbounds float* %tmp8305, i64 1
+  %tmp8307 = getelementptr inbounds float* %tmp8306, i64 1
+  %tmp8308 = getelementptr inbounds float* %tmp8307, i64 1
+  %tmp8309 = getelementptr inbounds float* %tmp8308, i64 1
+  %tmp8310 = getelementptr inbounds float* %tmp8309, i64 1
+  %tmp8311 = getelementptr inbounds float* %tmp8310, i64 1
+  %tmp8312 = getelementptr inbounds float* %tmp8311, i64 1
+  %tmp8313 = getelementptr inbounds float* %tmp8312, i64 1
+  %tmp8314 = getelementptr inbounds float* %tmp8313, i64 1
+  %tmp8315 = getelementptr inbounds float* %tmp8314, i64 1
+  %tmp8316 = getelementptr inbounds float* %tmp8315, i64 1
+  %tmp8317 = getelementptr inbounds float* %tmp8316, i64 1
+  %tmp8318 = getelementptr inbounds float* %tmp8317, i64 1
+  %tmp8319 = getelementptr inbounds float* %tmp8318, i64 1
+  %tmp8320 = getelementptr inbounds float* %tmp8319, i64 1
+  %tmp8321 = getelementptr inbounds float* %tmp8320, i64 1
+  %tmp8322 = getelementptr inbounds float* %tmp8321, i64 1
+  %tmp8323 = getelementptr inbounds float* %tmp8322, i64 1
+  %tmp8324 = getelementptr inbounds float* %tmp8323, i64 1
+  %tmp8325 = getelementptr inbounds float* %tmp8324, i64 1
+  %tmp8326 = getelementptr inbounds float* %tmp8325, i64 1
+  %tmp8327 = getelementptr inbounds float* %tmp8326, i64 1
+  %tmp8328 = getelementptr inbounds float* %tmp8327, i64 1
+  %tmp8329 = getelementptr inbounds float* %tmp8328, i64 1
+  %tmp8330 = getelementptr inbounds float* %tmp8329, i64 1
+  %tmp8331 = getelementptr inbounds float* %tmp8330, i64 1
+  %tmp8332 = getelementptr inbounds float* %tmp8331, i64 1
+  %tmp8333 = getelementptr inbounds float* %tmp8332, i64 1
+  %tmp8334 = getelementptr inbounds float* %tmp8333, i64 1
+  %tmp8335 = getelementptr inbounds float* %tmp8334, i64 1
+  %tmp8336 = getelementptr inbounds float* %tmp8335, i64 1
+  %tmp8337 = getelementptr inbounds float* %tmp8336, i64 1
+  %tmp8338 = getelementptr inbounds float* %tmp8337, i64 1
+  %tmp8339 = getelementptr inbounds float* %tmp8338, i64 1
+  %tmp8340 = getelementptr inbounds float* %tmp8339, i64 1
+  %tmp8341 = getelementptr inbounds float* %tmp8340, i64 1
+  %tmp8342 = getelementptr inbounds float* %tmp8341, i64 1
+  %tmp8343 = getelementptr inbounds float* %tmp8342, i64 1
+  %tmp8344 = getelementptr inbounds float* %tmp8343, i64 1
+  %tmp8345 = getelementptr inbounds float* %tmp8344, i64 1
+  %tmp8346 = getelementptr inbounds float* %tmp8345, i64 1
+  %tmp8347 = getelementptr inbounds float* %tmp8346, i64 1
+  %tmp8348 = getelementptr inbounds float* %tmp8347, i64 1
+  %tmp8349 = getelementptr inbounds float* %tmp8348, i64 1
+  %tmp8350 = getelementptr inbounds float* %tmp8349, i64 1
+  %tmp8351 = getelementptr inbounds float* %tmp8350, i64 1
+  %tmp8352 = getelementptr inbounds float* %tmp8351, i64 1
+  %tmp8353 = getelementptr inbounds float* %tmp8352, i64 1
+  %tmp8354 = getelementptr inbounds float* %tmp8353, i64 1
+  %tmp8355 = getelementptr inbounds float* %tmp8354, i64 1
+  %tmp8356 = getelementptr inbounds float* %tmp8355, i64 1
+  %tmp8357 = getelementptr inbounds float* %tmp8356, i64 1
+  %tmp8358 = getelementptr inbounds float* %tmp8357, i64 1
+  %tmp8359 = getelementptr inbounds float* %tmp8358, i64 1
+  %tmp8360 = getelementptr inbounds float* %tmp8359, i64 1
+  %tmp8361 = getelementptr inbounds float* %tmp8360, i64 1
+  %tmp8362 = getelementptr inbounds float* %tmp8361, i64 1
+  %tmp8363 = getelementptr inbounds float* %tmp8362, i64 1
+  %tmp8364 = getelementptr inbounds float* %tmp8363, i64 1
+  %tmp8365 = getelementptr inbounds float* %tmp8364, i64 1
+  %tmp8366 = getelementptr inbounds float* %tmp8365, i64 1
+  %tmp8367 = getelementptr inbounds float* %tmp8366, i64 1
+  %tmp8368 = getelementptr inbounds float* %tmp8367, i64 1
+  %tmp8369 = getelementptr inbounds float* %tmp8368, i64 1
+  %tmp8370 = getelementptr inbounds float* %tmp8369, i64 1
+  %tmp8371 = getelementptr inbounds float* %tmp8370, i64 1
+  %tmp8372 = getelementptr inbounds float* %tmp8371, i64 1
+  %tmp8373 = getelementptr inbounds float* %tmp8372, i64 1
+  %tmp8374 = getelementptr inbounds float* %tmp8373, i64 1
+  %tmp8375 = getelementptr inbounds float* %tmp8374, i64 1
+  %tmp8376 = getelementptr inbounds float* %tmp8375, i64 1
+  %tmp8377 = getelementptr inbounds float* %tmp8376, i64 1
+  %tmp8378 = getelementptr inbounds float* %tmp8377, i64 1
+  %tmp8379 = getelementptr inbounds float* %tmp8378, i64 1
+  %tmp8380 = getelementptr inbounds float* %tmp8379, i64 1
+  %tmp8381 = getelementptr inbounds float* %tmp8380, i64 1
+  %tmp8382 = getelementptr inbounds float* %tmp8381, i64 1
+  %tmp8383 = getelementptr inbounds float* %tmp8382, i64 1
+  %tmp8384 = getelementptr inbounds float* %tmp8383, i64 1
+  %tmp8385 = getelementptr inbounds float* %tmp8384, i64 1
+  %tmp8386 = getelementptr inbounds float* %tmp8385, i64 1
+  %tmp8387 = getelementptr inbounds float* %tmp8386, i64 1
+  %tmp8388 = getelementptr inbounds float* %tmp8387, i64 1
+  %tmp8389 = getelementptr inbounds float* %tmp8388, i64 1
+  %tmp8390 = getelementptr inbounds float* %tmp8389, i64 1
+  %tmp8391 = getelementptr inbounds float* %tmp8390, i64 1
+  %tmp8392 = getelementptr inbounds float* %tmp8391, i64 1
+  %tmp8393 = getelementptr inbounds float* %tmp8392, i64 1
+  %tmp8394 = getelementptr inbounds float* %tmp8393, i64 1
+  %tmp8395 = getelementptr inbounds float* %tmp8394, i64 1
+  %tmp8396 = getelementptr inbounds float* %tmp8395, i64 1
+  %tmp8397 = getelementptr inbounds float* %tmp8396, i64 1
+  %tmp8398 = getelementptr inbounds float* %tmp8397, i64 1
+  %tmp8399 = getelementptr inbounds float* %tmp8398, i64 1
+  %tmp8400 = getelementptr inbounds float* %tmp8399, i64 1
+  %tmp8401 = getelementptr inbounds float* %tmp8400, i64 1
+  %tmp8402 = getelementptr inbounds float* %tmp8401, i64 1
+  %tmp8403 = getelementptr inbounds float* %tmp8402, i64 1
+  %tmp8404 = getelementptr inbounds float* %tmp8403, i64 1
+  %tmp8405 = getelementptr inbounds float* %tmp8404, i64 1
+  %tmp8406 = getelementptr inbounds float* %tmp8405, i64 1
+  %tmp8407 = getelementptr inbounds float* %tmp8406, i64 1
+  %tmp8408 = getelementptr inbounds float* %tmp8407, i64 1
+  %tmp8409 = getelementptr inbounds float* %tmp8408, i64 1
+  %tmp8410 = getelementptr inbounds float* %tmp8409, i64 1
+  %tmp8411 = getelementptr inbounds float* %tmp8410, i64 1
+  %tmp8412 = getelementptr inbounds float* %tmp8411, i64 1
+  %tmp8413 = getelementptr inbounds float* %tmp8412, i64 1
+  %tmp8414 = getelementptr inbounds float* %tmp8413, i64 1
+  %tmp8415 = getelementptr inbounds float* %tmp8414, i64 1
+  %tmp8416 = getelementptr inbounds float* %tmp8415, i64 1
+  %tmp8417 = getelementptr inbounds float* %tmp8416, i64 1
+  %tmp8418 = getelementptr inbounds float* %tmp8417, i64 1
+  %tmp8419 = getelementptr inbounds float* %tmp8418, i64 1
+  %tmp8420 = getelementptr inbounds float* %tmp8419, i64 1
+  %tmp8421 = getelementptr inbounds float* %tmp8420, i64 1
+  %tmp8422 = getelementptr inbounds float* %tmp8421, i64 1
+  %tmp8423 = getelementptr inbounds float* %tmp8422, i64 1
+  %tmp8424 = getelementptr inbounds float* %tmp8423, i64 1
+  %tmp8425 = getelementptr inbounds float* %tmp8424, i64 1
+  %tmp8426 = getelementptr inbounds float* %tmp8425, i64 1
+  %tmp8427 = getelementptr inbounds float* %tmp8426, i64 1
+  %tmp8428 = getelementptr inbounds float* %tmp8427, i64 1
+  %tmp8429 = getelementptr inbounds float* %tmp8428, i64 1
+  %tmp8430 = getelementptr inbounds float* %tmp8429, i64 1
+  %tmp8431 = getelementptr inbounds float* %tmp8430, i64 1
+  %tmp8432 = getelementptr inbounds float* %tmp8431, i64 1
+  %tmp8433 = getelementptr inbounds float* %tmp8432, i64 1
+  %tmp8434 = getelementptr inbounds float* %tmp8433, i64 1
+  %tmp8435 = getelementptr inbounds float* %tmp8434, i64 1
+  %tmp8436 = getelementptr inbounds float* %tmp8435, i64 1
+  %tmp8437 = getelementptr inbounds float* %tmp8436, i64 1
+  %tmp8438 = getelementptr inbounds float* %tmp8437, i64 1
+  %tmp8439 = getelementptr inbounds float* %tmp8438, i64 1
+  %tmp8440 = getelementptr inbounds float* %tmp8439, i64 1
+  %tmp8441 = getelementptr inbounds float* %tmp8440, i64 1
+  %tmp8442 = getelementptr inbounds float* %tmp8441, i64 1
+  %tmp8443 = getelementptr inbounds float* %tmp8442, i64 1
+  %tmp8444 = getelementptr inbounds float* %tmp8443, i64 1
+  %tmp8445 = getelementptr inbounds float* %tmp8444, i64 1
+  %tmp8446 = getelementptr inbounds float* %tmp8445, i64 1
+  %tmp8447 = getelementptr inbounds float* %tmp8446, i64 1
+  %tmp8448 = getelementptr inbounds float* %tmp8447, i64 1
+  %tmp8449 = getelementptr inbounds float* %tmp8448, i64 1
+  %tmp8450 = getelementptr inbounds float* %tmp8449, i64 1
+  %tmp8451 = getelementptr inbounds float* %tmp8450, i64 1
+  %tmp8452 = getelementptr inbounds float* %tmp8451, i64 1
+  %tmp8453 = getelementptr inbounds float* %tmp8452, i64 1
+  %tmp8454 = getelementptr inbounds float* %tmp8453, i64 1
+  %tmp8455 = getelementptr inbounds float* %tmp8454, i64 1
+  %tmp8456 = getelementptr inbounds float* %tmp8455, i64 1
+  %tmp8457 = getelementptr inbounds float* %tmp8456, i64 1
+  %tmp8458 = getelementptr inbounds float* %tmp8457, i64 1
+  %tmp8459 = getelementptr inbounds float* %tmp8458, i64 1
+  %tmp8460 = getelementptr inbounds float* %tmp8459, i64 1
+  %tmp8461 = getelementptr inbounds float* %tmp8460, i64 1
+  %tmp8462 = getelementptr inbounds float* %tmp8461, i64 1
+  %tmp8463 = getelementptr inbounds float* %tmp8462, i64 1
+  %tmp8464 = getelementptr inbounds float* %tmp8463, i64 1
+  %tmp8465 = getelementptr inbounds float* %tmp8464, i64 1
+  %tmp8466 = getelementptr inbounds float* %tmp8465, i64 1
+  %tmp8467 = getelementptr inbounds float* %tmp8466, i64 1
+  %tmp8468 = getelementptr inbounds float* %tmp8467, i64 1
+  %tmp8469 = getelementptr inbounds float* %tmp8468, i64 1
+  %tmp8470 = getelementptr inbounds float* %tmp8469, i64 1
+  %tmp8471 = getelementptr inbounds float* %tmp8470, i64 1
+  %tmp8472 = getelementptr inbounds float* %tmp8471, i64 1
+  %tmp8473 = getelementptr inbounds float* %tmp8472, i64 1
+  %tmp8474 = getelementptr inbounds float* %tmp8473, i64 1
+  %tmp8475 = getelementptr inbounds float* %tmp8474, i64 1
+  %tmp8476 = getelementptr inbounds float* %tmp8475, i64 1
+  %tmp8477 = getelementptr inbounds float* %tmp8476, i64 1
+  %tmp8478 = getelementptr inbounds float* %tmp8477, i64 1
+  %tmp8479 = getelementptr inbounds float* %tmp8478, i64 1
+  %tmp8480 = getelementptr inbounds float* %tmp8479, i64 1
+  %tmp8481 = getelementptr inbounds float* %tmp8480, i64 1
+  %tmp8482 = getelementptr inbounds float* %tmp8481, i64 1
+  %tmp8483 = getelementptr inbounds float* %tmp8482, i64 1
+  %tmp8484 = getelementptr inbounds float* %tmp8483, i64 1
+  %tmp8485 = getelementptr inbounds float* %tmp8484, i64 1
+  %tmp8486 = getelementptr inbounds float* %tmp8485, i64 1
+  %tmp8487 = getelementptr inbounds float* %tmp8486, i64 1
+  %tmp8488 = getelementptr inbounds float* %tmp8487, i64 1
+  %tmp8489 = getelementptr inbounds float* %tmp8488, i64 1
+  %tmp8490 = getelementptr inbounds float* %tmp8489, i64 1
+  %tmp8491 = getelementptr inbounds float* %tmp8490, i64 1
+  %tmp8492 = getelementptr inbounds float* %tmp8491, i64 1
+  %tmp8493 = getelementptr inbounds float* %tmp8492, i64 1
+  %tmp8494 = getelementptr inbounds float* %tmp8493, i64 1
+  %tmp8495 = getelementptr inbounds float* %tmp8494, i64 1
+  %tmp8496 = getelementptr inbounds float* %tmp8495, i64 1
+  %tmp8497 = getelementptr inbounds float* %tmp8496, i64 1
+  %tmp8498 = getelementptr inbounds float* %tmp8497, i64 1
+  %tmp8499 = getelementptr inbounds float* %tmp8498, i64 1
+  %tmp8500 = getelementptr inbounds float* %tmp8499, i64 1
+  %tmp8501 = getelementptr inbounds float* %tmp8500, i64 1
+  %tmp8502 = getelementptr inbounds float* %tmp8501, i64 1
+  %tmp8503 = getelementptr inbounds float* %tmp8502, i64 1
+  %tmp8504 = getelementptr inbounds float* %tmp8503, i64 1
+  %tmp8505 = getelementptr inbounds float* %tmp8504, i64 1
+  %tmp8506 = getelementptr inbounds float* %tmp8505, i64 1
+  %tmp8507 = getelementptr inbounds float* %tmp8506, i64 1
+  %tmp8508 = getelementptr inbounds float* %tmp8507, i64 1
+  %tmp8509 = getelementptr inbounds float* %tmp8508, i64 1
+  %tmp8510 = getelementptr inbounds float* %tmp8509, i64 1
+  %tmp8511 = getelementptr inbounds float* %tmp8510, i64 1
+  %tmp8512 = getelementptr inbounds float* %tmp8511, i64 1
+  %tmp8513 = getelementptr inbounds float* %tmp8512, i64 1
+  %tmp8514 = getelementptr inbounds float* %tmp8513, i64 1
+  %tmp8515 = getelementptr inbounds float* %tmp8514, i64 1
+  %tmp8516 = getelementptr inbounds float* %tmp8515, i64 1
+  %tmp8517 = getelementptr inbounds float* %tmp8516, i64 1
+  %tmp8518 = getelementptr inbounds float* %tmp8517, i64 1
+  %tmp8519 = getelementptr inbounds float* %tmp8518, i64 1
+  %tmp8520 = getelementptr inbounds float* %tmp8519, i64 1
+  %tmp8521 = getelementptr inbounds float* %tmp8520, i64 1
+  %tmp8522 = getelementptr inbounds float* %tmp8521, i64 1
+  %tmp8523 = getelementptr inbounds float* %tmp8522, i64 1
+  %tmp8524 = getelementptr inbounds float* %tmp8523, i64 1
+  %tmp8525 = getelementptr inbounds float* %tmp8524, i64 1
+  %tmp8526 = getelementptr inbounds float* %tmp8525, i64 1
+  %tmp8527 = getelementptr inbounds float* %tmp8526, i64 1
+  %tmp8528 = getelementptr inbounds float* %tmp8527, i64 1
+  %tmp8529 = getelementptr inbounds float* %tmp8528, i64 1
+  %tmp8530 = getelementptr inbounds float* %tmp8529, i64 1
+  %tmp8531 = getelementptr inbounds float* %tmp8530, i64 1
+  %tmp8532 = getelementptr inbounds float* %tmp8531, i64 1
+  %tmp8533 = getelementptr inbounds float* %tmp8532, i64 1
+  %tmp8534 = getelementptr inbounds float* %tmp8533, i64 1
+  %tmp8535 = getelementptr inbounds float* %tmp8534, i64 1
+  %tmp8536 = getelementptr inbounds float* %tmp8535, i64 1
+  %tmp8537 = getelementptr inbounds float* %tmp8536, i64 1
+  %tmp8538 = getelementptr inbounds float* %tmp8537, i64 1
+  %tmp8539 = getelementptr inbounds float* %tmp8538, i64 1
+  %tmp8540 = getelementptr inbounds float* %tmp8539, i64 1
+  %tmp8541 = getelementptr inbounds float* %tmp8540, i64 1
+  %tmp8542 = getelementptr inbounds float* %tmp8541, i64 1
+  %tmp8543 = getelementptr inbounds float* %tmp8542, i64 1
+  %tmp8544 = getelementptr inbounds float* %tmp8543, i64 1
+  %tmp8545 = getelementptr inbounds float* %tmp8544, i64 1
+  %tmp8546 = getelementptr inbounds float* %tmp8545, i64 1
+  %tmp8547 = getelementptr inbounds float* %tmp8546, i64 1
+  %tmp8548 = getelementptr inbounds float* %tmp8547, i64 1
+  %tmp8549 = getelementptr inbounds float* %tmp8548, i64 1
+  %tmp8550 = getelementptr inbounds float* %tmp8549, i64 1
+  %tmp8551 = getelementptr inbounds float* %tmp8550, i64 1
+  %tmp8552 = getelementptr inbounds float* %tmp8551, i64 1
+  %tmp8553 = getelementptr inbounds float* %tmp8552, i64 1
+  %tmp8554 = getelementptr inbounds float* %tmp8553, i64 1
+  %tmp8555 = getelementptr inbounds float* %tmp8554, i64 1
+  %tmp8556 = getelementptr inbounds float* %tmp8555, i64 1
+  %tmp8557 = getelementptr inbounds float* %tmp8556, i64 1
+  %tmp8558 = getelementptr inbounds float* %tmp8557, i64 1
+  %tmp8559 = getelementptr inbounds float* %tmp8558, i64 1
+  %tmp8560 = getelementptr inbounds float* %tmp8559, i64 1
+  %tmp8561 = getelementptr inbounds float* %tmp8560, i64 1
+  %tmp8562 = getelementptr inbounds float* %tmp8561, i64 1
+  %tmp8563 = getelementptr inbounds float* %tmp8562, i64 1
+  %tmp8564 = getelementptr inbounds float* %tmp8563, i64 1
+  %tmp8565 = getelementptr inbounds float* %tmp8564, i64 1
+  %tmp8566 = getelementptr inbounds float* %tmp8565, i64 1
+  %tmp8567 = getelementptr inbounds float* %tmp8566, i64 1
+  %tmp8568 = getelementptr inbounds float* %tmp8567, i64 1
+  %tmp8569 = getelementptr inbounds float* %tmp8568, i64 1
+  %tmp8570 = getelementptr inbounds float* %tmp8569, i64 1
+  %tmp8571 = getelementptr inbounds float* %tmp8570, i64 1
+  %tmp8572 = getelementptr inbounds float* %tmp8571, i64 1
+  %tmp8573 = getelementptr inbounds float* %tmp8572, i64 1
+  %tmp8574 = getelementptr inbounds float* %tmp8573, i64 1
+  %tmp8575 = getelementptr inbounds float* %tmp8574, i64 1
+  %tmp8576 = getelementptr inbounds float* %tmp8575, i64 1
+  %tmp8577 = getelementptr inbounds float* %tmp8576, i64 1
+  %tmp8578 = getelementptr inbounds float* %tmp8577, i64 1
+  %tmp8579 = getelementptr inbounds float* %tmp8578, i64 1
+  %tmp8580 = getelementptr inbounds float* %tmp8579, i64 1
+  %tmp8581 = getelementptr inbounds float* %tmp8580, i64 1
+  %tmp8582 = getelementptr inbounds float* %tmp8581, i64 1
+  %tmp8583 = getelementptr inbounds float* %tmp8582, i64 1
+  %tmp8584 = getelementptr inbounds float* %tmp8583, i64 1
+  %tmp8585 = getelementptr inbounds float* %tmp8584, i64 1
+  %tmp8586 = getelementptr inbounds float* %tmp8585, i64 1
+  %tmp8587 = getelementptr inbounds float* %tmp8586, i64 1
+  %tmp8588 = getelementptr inbounds float* %tmp8587, i64 1
+  %tmp8589 = getelementptr inbounds float* %tmp8588, i64 1
+  %tmp8590 = getelementptr inbounds float* %tmp8589, i64 1
+  %tmp8591 = getelementptr inbounds float* %tmp8590, i64 1
+  %tmp8592 = getelementptr inbounds float* %tmp8591, i64 1
+  %tmp8593 = getelementptr inbounds float* %tmp8592, i64 1
+  %tmp8594 = getelementptr inbounds float* %tmp8593, i64 1
+  %tmp8595 = getelementptr inbounds float* %tmp8594, i64 1
+  %tmp8596 = getelementptr inbounds float* %tmp8595, i64 1
+  %tmp8597 = getelementptr inbounds float* %tmp8596, i64 1
+  %tmp8598 = getelementptr inbounds float* %tmp8597, i64 1
+  %tmp8599 = getelementptr inbounds float* %tmp8598, i64 1
+  %tmp8600 = getelementptr inbounds float* %tmp8599, i64 1
+  %tmp8601 = getelementptr inbounds float* %tmp8600, i64 1
+  %tmp8602 = getelementptr inbounds float* %tmp8601, i64 1
+  %tmp8603 = getelementptr inbounds float* %tmp8602, i64 1
+  %tmp8604 = getelementptr inbounds float* %tmp8603, i64 1
+  %tmp8605 = getelementptr inbounds float* %tmp8604, i64 1
+  %tmp8606 = getelementptr inbounds float* %tmp8605, i64 1
+  %tmp8607 = getelementptr inbounds float* %tmp8606, i64 1
+  %tmp8608 = getelementptr inbounds float* %tmp8607, i64 1
+  %tmp8609 = getelementptr inbounds float* %tmp8608, i64 1
+  %tmp8610 = getelementptr inbounds float* %tmp8609, i64 1
+  %tmp8611 = getelementptr inbounds float* %tmp8610, i64 1
+  %tmp8612 = getelementptr inbounds float* %tmp8611, i64 1
+  %tmp8613 = getelementptr inbounds float* %tmp8612, i64 1
+  %tmp8614 = getelementptr inbounds float* %tmp8613, i64 1
+  %tmp8615 = getelementptr inbounds float* %tmp8614, i64 1
+  %tmp8616 = getelementptr inbounds float* %tmp8615, i64 1
+  %tmp8617 = getelementptr inbounds float* %tmp8616, i64 1
+  %tmp8618 = getelementptr inbounds float* %tmp8617, i64 1
+  %tmp8619 = getelementptr inbounds float* %tmp8618, i64 1
+  %tmp8620 = getelementptr inbounds float* %tmp8619, i64 1
+  %tmp8621 = getelementptr inbounds float* %tmp8620, i64 1
+  %tmp8622 = getelementptr inbounds float* %tmp8621, i64 1
+  %tmp8623 = getelementptr inbounds float* %tmp8622, i64 1
+  %tmp8624 = getelementptr inbounds float* %tmp8623, i64 1
+  %tmp8625 = getelementptr inbounds float* %tmp8624, i64 1
+  %tmp8626 = getelementptr inbounds float* %tmp8625, i64 1
+  %tmp8627 = getelementptr inbounds float* %tmp8626, i64 1
+  %tmp8628 = getelementptr inbounds float* %tmp8627, i64 1
+  %tmp8629 = getelementptr inbounds float* %tmp8628, i64 1
+  %tmp8630 = getelementptr inbounds float* %tmp8629, i64 1
+  %tmp8631 = getelementptr inbounds float* %tmp8630, i64 1
+  %tmp8632 = getelementptr inbounds float* %tmp8631, i64 1
+  %tmp8633 = getelementptr inbounds float* %tmp8632, i64 1
+  %tmp8634 = getelementptr inbounds float* %tmp8633, i64 1
+  %tmp8635 = getelementptr inbounds float* %tmp8634, i64 1
+  %tmp8636 = getelementptr inbounds float* %tmp8635, i64 1
+  %tmp8637 = getelementptr inbounds float* %tmp8636, i64 1
+  %tmp8638 = getelementptr inbounds float* %tmp8637, i64 1
+  %tmp8639 = getelementptr inbounds float* %tmp8638, i64 1
+  %tmp8640 = getelementptr inbounds float* %tmp8639, i64 1
+  %tmp8641 = getelementptr inbounds float* %tmp8640, i64 1
+  %tmp8642 = getelementptr inbounds float* %tmp8641, i64 1
+  %tmp8643 = getelementptr inbounds float* %tmp8642, i64 1
+  %tmp8644 = getelementptr inbounds float* %tmp8643, i64 1
+  %tmp8645 = getelementptr inbounds float* %tmp8644, i64 1
+  %tmp8646 = getelementptr inbounds float* %tmp8645, i64 1
+  %tmp8647 = getelementptr inbounds float* %tmp8646, i64 1
+  %tmp8648 = getelementptr inbounds float* %tmp8647, i64 1
+  %tmp8649 = getelementptr inbounds float* %tmp8648, i64 1
+  %tmp8650 = getelementptr inbounds float* %tmp8649, i64 1
+  %tmp8651 = getelementptr inbounds float* %tmp8650, i64 1
+  %tmp8652 = getelementptr inbounds float* %tmp8651, i64 1
+  %tmp8653 = getelementptr inbounds float* %tmp8652, i64 1
+  %tmp8654 = getelementptr inbounds float* %tmp8653, i64 1
+  %tmp8655 = getelementptr inbounds float* %tmp8654, i64 1
+  %tmp8656 = getelementptr inbounds float* %tmp8655, i64 1
+  %tmp8657 = getelementptr inbounds float* %tmp8656, i64 1
+  %tmp8658 = getelementptr inbounds float* %tmp8657, i64 1
+  %tmp8659 = getelementptr inbounds float* %tmp8658, i64 1
+  %tmp8660 = getelementptr inbounds float* %tmp8659, i64 1
+  %tmp8661 = getelementptr inbounds float* %tmp8660, i64 1
+  %tmp8662 = getelementptr inbounds float* %tmp8661, i64 1
+  %tmp8663 = getelementptr inbounds float* %tmp8662, i64 1
+  %tmp8664 = getelementptr inbounds float* %tmp8663, i64 1
+  %tmp8665 = getelementptr inbounds float* %tmp8664, i64 1
+  %tmp8666 = getelementptr inbounds float* %tmp8665, i64 1
+  %tmp8667 = getelementptr inbounds float* %tmp8666, i64 1
+  %tmp8668 = getelementptr inbounds float* %tmp8667, i64 1
+  %tmp8669 = getelementptr inbounds float* %tmp8668, i64 1
+  %tmp8670 = getelementptr inbounds float* %tmp8669, i64 1
+  %tmp8671 = getelementptr inbounds float* %tmp8670, i64 1
+  %tmp8672 = getelementptr inbounds float* %tmp8671, i64 1
+  %tmp8673 = getelementptr inbounds float* %tmp8672, i64 1
+  %tmp8674 = getelementptr inbounds float* %tmp8673, i64 1
+  %tmp8675 = getelementptr inbounds float* %tmp8674, i64 1
+  %tmp8676 = getelementptr inbounds float* %tmp8675, i64 1
+  %tmp8677 = getelementptr inbounds float* %tmp8676, i64 1
+  %tmp8678 = getelementptr inbounds float* %tmp8677, i64 1
+  %tmp8679 = getelementptr inbounds float* %tmp8678, i64 1
+  %tmp8680 = getelementptr inbounds float* %tmp8679, i64 1
+  %tmp8681 = getelementptr inbounds float* %tmp8680, i64 1
+  %tmp8682 = getelementptr inbounds float* %tmp8681, i64 1
+  %tmp8683 = getelementptr inbounds float* %tmp8682, i64 1
+  %tmp8684 = getelementptr inbounds float* %tmp8683, i64 1
+  %tmp8685 = getelementptr inbounds float* %tmp8684, i64 1
+  %tmp8686 = getelementptr inbounds float* %tmp8685, i64 1
+  %tmp8687 = getelementptr inbounds float* %tmp8686, i64 1
+  %tmp8688 = getelementptr inbounds float* %tmp8687, i64 1
+  %tmp8689 = getelementptr inbounds float* %tmp8688, i64 1
+  %tmp8690 = getelementptr inbounds float* %tmp8689, i64 1
+  %tmp8691 = getelementptr inbounds float* %tmp8690, i64 1
+  %tmp8692 = getelementptr inbounds float* %tmp8691, i64 1
+  %tmp8693 = getelementptr inbounds float* %tmp8692, i64 1
+  %tmp8694 = getelementptr inbounds float* %tmp8693, i64 1
+  %tmp8695 = getelementptr inbounds float* %tmp8694, i64 1
+  %tmp8696 = getelementptr inbounds float* %tmp8695, i64 1
+  %tmp8697 = getelementptr inbounds float* %tmp8696, i64 1
+  %tmp8698 = getelementptr inbounds float* %tmp8697, i64 1
+  %tmp8699 = getelementptr inbounds float* %tmp8698, i64 1
+  %tmp8700 = getelementptr inbounds float* %tmp8699, i64 1
+  %tmp8701 = getelementptr inbounds float* %tmp8700, i64 1
+  %tmp8702 = getelementptr inbounds float* %tmp8701, i64 1
+  %tmp8703 = getelementptr inbounds float* %tmp8702, i64 1
+  %tmp8704 = getelementptr inbounds float* %tmp8703, i64 1
+  %tmp8705 = getelementptr inbounds float* %tmp8704, i64 1
+  %tmp8706 = getelementptr inbounds float* %tmp8705, i64 1
+  %tmp8707 = getelementptr inbounds float* %tmp8706, i64 1
+  %tmp8708 = getelementptr inbounds float* %tmp8707, i64 1
+  %tmp8709 = getelementptr inbounds float* %tmp8708, i64 1
+  %tmp8710 = getelementptr inbounds float* %tmp8709, i64 1
+  %tmp8711 = getelementptr inbounds float* %tmp8710, i64 1
+  %tmp8712 = getelementptr inbounds float* %tmp8711, i64 1
+  %tmp8713 = getelementptr inbounds float* %tmp8712, i64 1
+  %tmp8714 = getelementptr inbounds float* %tmp8713, i64 1
+  %tmp8715 = getelementptr inbounds float* %tmp8714, i64 1
+  %tmp8716 = getelementptr inbounds float* %tmp8715, i64 1
+  %tmp8717 = getelementptr inbounds float* %tmp8716, i64 1
+  %tmp8718 = getelementptr inbounds float* %tmp8717, i64 1
+  %tmp8719 = getelementptr inbounds float* %tmp8718, i64 1
+  %tmp8720 = getelementptr inbounds float* %tmp8719, i64 1
+  %tmp8721 = getelementptr inbounds float* %tmp8720, i64 1
+  %tmp8722 = getelementptr inbounds float* %tmp8721, i64 1
+  %tmp8723 = getelementptr inbounds float* %tmp8722, i64 1
+  %tmp8724 = getelementptr inbounds float* %tmp8723, i64 1
+  %tmp8725 = getelementptr inbounds float* %tmp8724, i64 1
+  %tmp8726 = getelementptr inbounds float* %tmp8725, i64 1
+  %tmp8727 = getelementptr inbounds float* %tmp8726, i64 1
+  %tmp8728 = getelementptr inbounds float* %tmp8727, i64 1
+  %tmp8729 = getelementptr inbounds float* %tmp8728, i64 1
+  %tmp8730 = getelementptr inbounds float* %tmp8729, i64 1
+  %tmp8731 = getelementptr inbounds float* %tmp8730, i64 1
+  %tmp8732 = getelementptr inbounds float* %tmp8731, i64 1
+  %tmp8733 = getelementptr inbounds float* %tmp8732, i64 1
+  %tmp8734 = getelementptr inbounds float* %tmp8733, i64 1
+  %tmp8735 = getelementptr inbounds float* %tmp8734, i64 1
+  %tmp8736 = getelementptr inbounds float* %tmp8735, i64 1
+  %tmp8737 = getelementptr inbounds float* %tmp8736, i64 1
+  %tmp8738 = getelementptr inbounds float* %tmp8737, i64 1
+  %tmp8739 = getelementptr inbounds float* %tmp8738, i64 1
+  %tmp8740 = getelementptr inbounds float* %tmp8739, i64 1
+  %tmp8741 = getelementptr inbounds float* %tmp8740, i64 1
+  %tmp8742 = getelementptr inbounds float* %tmp8741, i64 1
+  %tmp8743 = getelementptr inbounds float* %tmp8742, i64 1
+  %tmp8744 = getelementptr inbounds float* %tmp8743, i64 1
+  %tmp8745 = getelementptr inbounds float* %tmp8744, i64 1
+  %tmp8746 = getelementptr inbounds float* %tmp8745, i64 1
+  %tmp8747 = getelementptr inbounds float* %tmp8746, i64 1
+  %tmp8748 = getelementptr inbounds float* %tmp8747, i64 1
+  %tmp8749 = getelementptr inbounds float* %tmp8748, i64 1
+  %tmp8750 = getelementptr inbounds float* %tmp8749, i64 1
+  %tmp8751 = getelementptr inbounds float* %tmp8750, i64 1
+  %tmp8752 = getelementptr inbounds float* %tmp8751, i64 1
+  %tmp8753 = getelementptr inbounds float* %tmp8752, i64 1
+  %tmp8754 = getelementptr inbounds float* %tmp8753, i64 1
+  %tmp8755 = getelementptr inbounds float* %tmp8754, i64 1
+  %tmp8756 = getelementptr inbounds float* %tmp8755, i64 1
+  %tmp8757 = getelementptr inbounds float* %tmp8756, i64 1
+  %tmp8758 = getelementptr inbounds float* %tmp8757, i64 1
+  %tmp8759 = getelementptr inbounds float* %tmp8758, i64 1
+  %tmp8760 = getelementptr inbounds float* %tmp8759, i64 1
+  %tmp8761 = getelementptr inbounds float* %tmp8760, i64 1
+  %tmp8762 = getelementptr inbounds float* %tmp8761, i64 1
+  %tmp8763 = getelementptr inbounds float* %tmp8762, i64 1
+  %tmp8764 = getelementptr inbounds float* %tmp8763, i64 1
+  %tmp8765 = getelementptr inbounds float* %tmp8764, i64 1
+  %tmp8766 = getelementptr inbounds float* %tmp8765, i64 1
+  %tmp8767 = getelementptr inbounds float* %tmp8766, i64 1
+  %tmp8768 = getelementptr inbounds float* %tmp8767, i64 1
+  %tmp8769 = getelementptr inbounds float* %tmp8768, i64 1
+  %tmp8770 = getelementptr inbounds float* %tmp8769, i64 1
+  %tmp8771 = getelementptr inbounds float* %tmp8770, i64 1
+  %tmp8772 = getelementptr inbounds float* %tmp8771, i64 1
+  %tmp8773 = getelementptr inbounds float* %tmp8772, i64 1
+  %tmp8774 = getelementptr inbounds float* %tmp8773, i64 1
+  %tmp8775 = getelementptr inbounds float* %tmp8774, i64 1
+  %tmp8776 = getelementptr inbounds float* %tmp8775, i64 1
+  %tmp8777 = getelementptr inbounds float* %tmp8776, i64 1
+  %tmp8778 = getelementptr inbounds float* %tmp8777, i64 1
+  %tmp8779 = getelementptr inbounds float* %tmp8778, i64 1
+  %tmp8780 = getelementptr inbounds float* %tmp8779, i64 1
+  %tmp8781 = getelementptr inbounds float* %tmp8780, i64 1
+  %tmp8782 = getelementptr inbounds float* %tmp8781, i64 1
+  %tmp8783 = getelementptr inbounds float* %tmp8782, i64 1
+  %tmp8784 = getelementptr inbounds float* %tmp8783, i64 1
+  %tmp8785 = getelementptr inbounds float* %tmp8784, i64 1
+  %tmp8786 = getelementptr inbounds float* %tmp8785, i64 1
+  %tmp8787 = getelementptr inbounds float* %tmp8786, i64 1
+  %tmp8788 = getelementptr inbounds float* %tmp8787, i64 1
+  %tmp8789 = getelementptr inbounds float* %tmp8788, i64 1
+  %tmp8790 = getelementptr inbounds float* %tmp8789, i64 1
+  %tmp8791 = getelementptr inbounds float* %tmp8790, i64 1
+  %tmp8792 = getelementptr inbounds float* %tmp8791, i64 1
+  %tmp8793 = getelementptr inbounds float* %tmp8792, i64 1
+  %tmp8794 = getelementptr inbounds float* %tmp8793, i64 1
+  %tmp8795 = getelementptr inbounds float* %tmp8794, i64 1
+  %tmp8796 = getelementptr inbounds float* %tmp8795, i64 1
+  %tmp8797 = getelementptr inbounds float* %tmp8796, i64 1
+  %tmp8798 = getelementptr inbounds float* %tmp8797, i64 1
+  %tmp8799 = getelementptr inbounds float* %tmp8798, i64 1
+  %tmp8800 = getelementptr inbounds float* %tmp8799, i64 1
+  %tmp8801 = getelementptr inbounds float* %tmp8800, i64 1
+  %tmp8802 = getelementptr inbounds float* %tmp8801, i64 1
+  %tmp8803 = getelementptr inbounds float* %tmp8802, i64 1
+  %tmp8804 = getelementptr inbounds float* %tmp8803, i64 1
+  %tmp8805 = getelementptr inbounds float* %tmp8804, i64 1
+  %tmp8806 = getelementptr inbounds float* %tmp8805, i64 1
+  %tmp8807 = getelementptr inbounds float* %tmp8806, i64 1
+  %tmp8808 = getelementptr inbounds float* %tmp8807, i64 1
+  %tmp8809 = getelementptr inbounds float* %tmp8808, i64 1
+  %tmp8810 = getelementptr inbounds float* %tmp8809, i64 1
+  %tmp8811 = getelementptr inbounds float* %tmp8810, i64 1
+  %tmp8812 = getelementptr inbounds float* %tmp8811, i64 1
+  %tmp8813 = getelementptr inbounds float* %tmp8812, i64 1
+  %tmp8814 = getelementptr inbounds float* %tmp8813, i64 1
+  %tmp8815 = getelementptr inbounds float* %tmp8814, i64 1
+  %tmp8816 = getelementptr inbounds float* %tmp8815, i64 1
+  %tmp8817 = getelementptr inbounds float* %tmp8816, i64 1
+  %tmp8818 = getelementptr inbounds float* %tmp8817, i64 1
+  %tmp8819 = getelementptr inbounds float* %tmp8818, i64 1
+  %tmp8820 = getelementptr inbounds float* %tmp8819, i64 1
+  %tmp8821 = getelementptr inbounds float* %tmp8820, i64 1
+  %tmp8822 = getelementptr inbounds float* %tmp8821, i64 1
+  %tmp8823 = getelementptr inbounds float* %tmp8822, i64 1
+  %tmp8824 = getelementptr inbounds float* %tmp8823, i64 1
+  %tmp8825 = getelementptr inbounds float* %tmp8824, i64 1
+  %tmp8826 = getelementptr inbounds float* %tmp8825, i64 1
+  %tmp8827 = getelementptr inbounds float* %tmp8826, i64 1
+  %tmp8828 = getelementptr inbounds float* %tmp8827, i64 1
+  %tmp8829 = getelementptr inbounds float* %tmp8828, i64 1
+  %tmp8830 = getelementptr inbounds float* %tmp8829, i64 1
+  %tmp8831 = getelementptr inbounds float* %tmp8830, i64 1
+  %tmp8832 = getelementptr inbounds float* %tmp8831, i64 1
+  %tmp8833 = getelementptr inbounds float* %tmp8832, i64 1
+  %tmp8834 = getelementptr inbounds float* %tmp8833, i64 1
+  %tmp8835 = getelementptr inbounds float* %tmp8834, i64 1
+  %tmp8836 = getelementptr inbounds float* %tmp8835, i64 1
+  %tmp8837 = getelementptr inbounds float* %tmp8836, i64 1
+  %tmp8838 = getelementptr inbounds float* %tmp8837, i64 1
+  %tmp8839 = getelementptr inbounds float* %tmp8838, i64 1
+  %tmp8840 = getelementptr inbounds float* %tmp8839, i64 1
+  %tmp8841 = getelementptr inbounds float* %tmp8840, i64 1
+  %tmp8842 = getelementptr inbounds float* %tmp8841, i64 1
+  %tmp8843 = getelementptr inbounds float* %tmp8842, i64 1
+  %tmp8844 = getelementptr inbounds float* %tmp8843, i64 1
+  %tmp8845 = getelementptr inbounds float* %tmp8844, i64 1
+  %tmp8846 = getelementptr inbounds float* %tmp8845, i64 1
+  %tmp8847 = getelementptr inbounds float* %tmp8846, i64 1
+  %tmp8848 = getelementptr inbounds float* %tmp8847, i64 1
+  %tmp8849 = getelementptr inbounds float* %tmp8848, i64 1
+  %tmp8850 = getelementptr inbounds float* %tmp8849, i64 1
+  %tmp8851 = getelementptr inbounds float* %tmp8850, i64 1
+  %tmp8852 = getelementptr inbounds float* %tmp8851, i64 1
+  %tmp8853 = getelementptr inbounds float* %tmp8852, i64 1
+  %tmp8854 = getelementptr inbounds float* %tmp8853, i64 1
+  %tmp8855 = getelementptr inbounds float* %tmp8854, i64 1
+  %tmp8856 = getelementptr inbounds float* %tmp8855, i64 1
+  %tmp8857 = getelementptr inbounds float* %tmp8856, i64 1
+  %tmp8858 = getelementptr inbounds float* %tmp8857, i64 1
+  %tmp8859 = getelementptr inbounds float* %tmp8858, i64 1
+  %tmp8860 = getelementptr inbounds float* %tmp8859, i64 1
+  %tmp8861 = getelementptr inbounds float* %tmp8860, i64 1
+  %tmp8862 = getelementptr inbounds float* %tmp8861, i64 1
+  %tmp8863 = getelementptr inbounds float* %tmp8862, i64 1
+  %tmp8864 = getelementptr inbounds float* %tmp8863, i64 1
+  %tmp8865 = getelementptr inbounds float* %tmp8864, i64 1
+  %tmp8866 = getelementptr inbounds float* %tmp8865, i64 1
+  %tmp8867 = getelementptr inbounds float* %tmp8866, i64 1
+  %tmp8868 = getelementptr inbounds float* %tmp8867, i64 1
+  %tmp8869 = getelementptr inbounds float* %tmp8868, i64 1
+  %tmp8870 = getelementptr inbounds float* %tmp8869, i64 1
+  %tmp8871 = getelementptr inbounds float* %tmp8870, i64 1
+  %tmp8872 = getelementptr inbounds float* %tmp8871, i64 1
+  %tmp8873 = getelementptr inbounds float* %tmp8872, i64 1
+  %tmp8874 = getelementptr inbounds float* %tmp8873, i64 1
+  %tmp8875 = getelementptr inbounds float* %tmp8874, i64 1
+  %tmp8876 = getelementptr inbounds float* %tmp8875, i64 1
+  %tmp8877 = getelementptr inbounds float* %tmp8876, i64 1
+  %tmp8878 = getelementptr inbounds float* %tmp8877, i64 1
+  %tmp8879 = getelementptr inbounds float* %tmp8878, i64 1
+  %tmp8880 = getelementptr inbounds float* %tmp8879, i64 1
+  %tmp8881 = getelementptr inbounds float* %tmp8880, i64 1
+  %tmp8882 = getelementptr inbounds float* %tmp8881, i64 1
+  %tmp8883 = getelementptr inbounds float* %tmp8882, i64 1
+  %tmp8884 = getelementptr inbounds float* %tmp8883, i64 1
+  %tmp8885 = getelementptr inbounds float* %tmp8884, i64 1
+  %tmp8886 = getelementptr inbounds float* %tmp8885, i64 1
+  %tmp8887 = getelementptr inbounds float* %tmp8886, i64 1
+  %tmp8888 = getelementptr inbounds float* %tmp8887, i64 1
+  %tmp8889 = getelementptr inbounds float* %tmp8888, i64 1
+  %tmp8890 = getelementptr inbounds float* %tmp8889, i64 1
+  %tmp8891 = getelementptr inbounds float* %tmp8890, i64 1
+  %tmp8892 = getelementptr inbounds float* %tmp8891, i64 1
+  %tmp8893 = getelementptr inbounds float* %tmp8892, i64 1
+  %tmp8894 = getelementptr inbounds float* %tmp8893, i64 1
+  %tmp8895 = getelementptr inbounds float* %tmp8894, i64 1
+  %tmp8896 = getelementptr inbounds float* %tmp8895, i64 1
+  %tmp8897 = getelementptr inbounds float* %tmp8896, i64 1
+  %tmp8898 = getelementptr inbounds float* %tmp8897, i64 1
+  %tmp8899 = getelementptr inbounds float* %tmp8898, i64 1
+  %tmp8900 = getelementptr inbounds float* %tmp8899, i64 1
+  %tmp8901 = getelementptr inbounds float* %tmp8900, i64 1
+  %tmp8902 = getelementptr inbounds float* %tmp8901, i64 1
+  %tmp8903 = getelementptr inbounds float* %tmp8902, i64 1
+  %tmp8904 = getelementptr inbounds float* %tmp8903, i64 1
+  %tmp8905 = getelementptr inbounds float* %tmp8904, i64 1
+  %tmp8906 = getelementptr inbounds float* %tmp8905, i64 1
+  %tmp8907 = getelementptr inbounds float* %tmp8906, i64 1
+  %tmp8908 = getelementptr inbounds float* %tmp8907, i64 1
+  %tmp8909 = getelementptr inbounds float* %tmp8908, i64 1
+  %tmp8910 = getelementptr inbounds float* %tmp8909, i64 1
+  %tmp8911 = getelementptr inbounds float* %tmp8910, i64 1
+  %tmp8912 = getelementptr inbounds float* %tmp8911, i64 1
+  %tmp8913 = getelementptr inbounds float* %tmp8912, i64 1
+  %tmp8914 = getelementptr inbounds float* %tmp8913, i64 1
+  %tmp8915 = getelementptr inbounds float* %tmp8914, i64 1
+  %tmp8916 = getelementptr inbounds float* %tmp8915, i64 1
+  %tmp8917 = getelementptr inbounds float* %tmp8916, i64 1
+  %tmp8918 = getelementptr inbounds float* %tmp8917, i64 1
+  %tmp8919 = getelementptr inbounds float* %tmp8918, i64 1
+  %tmp8920 = getelementptr inbounds float* %tmp8919, i64 1
+  %tmp8921 = getelementptr inbounds float* %tmp8920, i64 1
+  %tmp8922 = getelementptr inbounds float* %tmp8921, i64 1
+  %tmp8923 = getelementptr inbounds float* %tmp8922, i64 1
+  %tmp8924 = getelementptr inbounds float* %tmp8923, i64 1
+  %tmp8925 = getelementptr inbounds float* %tmp8924, i64 1
+  %tmp8926 = getelementptr inbounds float* %tmp8925, i64 1
+  %tmp8927 = getelementptr inbounds float* %tmp8926, i64 1
+  %tmp8928 = getelementptr inbounds float* %tmp8927, i64 1
+  %tmp8929 = getelementptr inbounds float* %tmp8928, i64 1
+  %tmp8930 = getelementptr inbounds float* %tmp8929, i64 1
+  %tmp8931 = getelementptr inbounds float* %tmp8930, i64 1
+  %tmp8932 = getelementptr inbounds float* %tmp8931, i64 1
+  %tmp8933 = getelementptr inbounds float* %tmp8932, i64 1
+  %tmp8934 = getelementptr inbounds float* %tmp8933, i64 1
+  %tmp8935 = getelementptr inbounds float* %tmp8934, i64 1
+  %tmp8936 = getelementptr inbounds float* %tmp8935, i64 1
+  %tmp8937 = getelementptr inbounds float* %tmp8936, i64 1
+  %tmp8938 = getelementptr inbounds float* %tmp8937, i64 1
+  %tmp8939 = getelementptr inbounds float* %tmp8938, i64 1
+  %tmp8940 = getelementptr inbounds float* %tmp8939, i64 1
+  %tmp8941 = getelementptr inbounds float* %tmp8940, i64 1
+  %tmp8942 = getelementptr inbounds float* %tmp8941, i64 1
+  %tmp8943 = getelementptr inbounds float* %tmp8942, i64 1
+  %tmp8944 = getelementptr inbounds float* %tmp8943, i64 1
+  %tmp8945 = getelementptr inbounds float* %tmp8944, i64 1
+  %tmp8946 = getelementptr inbounds float* %tmp8945, i64 1
+  %tmp8947 = getelementptr inbounds float* %tmp8946, i64 1
+  %tmp8948 = getelementptr inbounds float* %tmp8947, i64 1
+  %tmp8949 = getelementptr inbounds float* %tmp8948, i64 1
+  %tmp8950 = getelementptr inbounds float* %tmp8949, i64 1
+  %tmp8951 = getelementptr inbounds float* %tmp8950, i64 1
+  %tmp8952 = getelementptr inbounds float* %tmp8951, i64 1
+  %tmp8953 = getelementptr inbounds float* %tmp8952, i64 1
+  %tmp8954 = getelementptr inbounds float* %tmp8953, i64 1
+  %tmp8955 = getelementptr inbounds float* %tmp8954, i64 1
+  %tmp8956 = getelementptr inbounds float* %tmp8955, i64 1
+  %tmp8957 = getelementptr inbounds float* %tmp8956, i64 1
+  %tmp8958 = getelementptr inbounds float* %tmp8957, i64 1
+  %tmp8959 = getelementptr inbounds float* %tmp8958, i64 1
+  %tmp8960 = getelementptr inbounds float* %tmp8959, i64 1
+  %tmp8961 = getelementptr inbounds float* %tmp8960, i64 1
+  %tmp8962 = getelementptr inbounds float* %tmp8961, i64 1
+  %tmp8963 = getelementptr inbounds float* %tmp8962, i64 1
+  %tmp8964 = getelementptr inbounds float* %tmp8963, i64 1
+  %tmp8965 = getelementptr inbounds float* %tmp8964, i64 1
+  %tmp8966 = getelementptr inbounds float* %tmp8965, i64 1
+  %tmp8967 = getelementptr inbounds float* %tmp8966, i64 1
+  %tmp8968 = getelementptr inbounds float* %tmp8967, i64 1
+  %tmp8969 = getelementptr inbounds float* %tmp8968, i64 1
+  %tmp8970 = getelementptr inbounds float* %tmp8969, i64 1
+  %tmp8971 = getelementptr inbounds float* %tmp8970, i64 1
+  %tmp8972 = getelementptr inbounds float* %tmp8971, i64 1
+  %tmp8973 = getelementptr inbounds float* %tmp8972, i64 1
+  %tmp8974 = getelementptr inbounds float* %tmp8973, i64 1
+  %tmp8975 = getelementptr inbounds float* %tmp8974, i64 1
+  %tmp8976 = getelementptr inbounds float* %tmp8975, i64 1
+  %tmp8977 = getelementptr inbounds float* %tmp8976, i64 1
+  %tmp8978 = getelementptr inbounds float* %tmp8977, i64 1
+  %tmp8979 = getelementptr inbounds float* %tmp8978, i64 1
+  %tmp8980 = getelementptr inbounds float* %tmp8979, i64 1
+  %tmp8981 = getelementptr inbounds float* %tmp8980, i64 1
+  %tmp8982 = getelementptr inbounds float* %tmp8981, i64 1
+  %tmp8983 = getelementptr inbounds float* %tmp8982, i64 1
+  %tmp8984 = getelementptr inbounds float* %tmp8983, i64 1
+  %tmp8985 = getelementptr inbounds float* %tmp8984, i64 1
+  %tmp8986 = getelementptr inbounds float* %tmp8985, i64 1
+  %tmp8987 = getelementptr inbounds float* %tmp8986, i64 1
+  %tmp8988 = getelementptr inbounds float* %tmp8987, i64 1
+  %tmp8989 = getelementptr inbounds float* %tmp8988, i64 1
+  %tmp8990 = getelementptr inbounds float* %tmp8989, i64 1
+  %tmp8991 = getelementptr inbounds float* %tmp8990, i64 1
+  %tmp8992 = getelementptr inbounds float* %tmp8991, i64 1
+  %tmp8993 = getelementptr inbounds float* %tmp8992, i64 1
+  %tmp8994 = getelementptr inbounds float* %tmp8993, i64 1
+  %tmp8995 = getelementptr inbounds float* %tmp8994, i64 1
+  %tmp8996 = getelementptr inbounds float* %tmp8995, i64 1
+  %tmp8997 = getelementptr inbounds float* %tmp8996, i64 1
+  %tmp8998 = getelementptr inbounds float* %tmp8997, i64 1
+  %tmp8999 = getelementptr inbounds float* %tmp8998, i64 1
+  %tmp9000 = getelementptr inbounds float* %tmp8999, i64 1
+  %tmp9001 = getelementptr inbounds float* %tmp9000, i64 1
+  %tmp9002 = getelementptr inbounds float* %tmp9001, i64 1
+  %tmp9003 = getelementptr inbounds float* %tmp9002, i64 1
+  %tmp9004 = getelementptr inbounds float* %tmp9003, i64 1
+  %tmp9005 = getelementptr inbounds float* %tmp9004, i64 1
+  %tmp9006 = getelementptr inbounds float* %tmp9005, i64 1
+  %tmp9007 = getelementptr inbounds float* %tmp9006, i64 1
+  %tmp9008 = getelementptr inbounds float* %tmp9007, i64 1
+  %tmp9009 = getelementptr inbounds float* %tmp9008, i64 1
+  %tmp9010 = getelementptr inbounds float* %tmp9009, i64 1
+  %tmp9011 = getelementptr inbounds float* %tmp9010, i64 1
+  %tmp9012 = getelementptr inbounds float* %tmp9011, i64 1
+  %tmp9013 = getelementptr inbounds float* %tmp9012, i64 1
+  %tmp9014 = getelementptr inbounds float* %tmp9013, i64 1
+  %tmp9015 = getelementptr inbounds float* %tmp9014, i64 1
+  %tmp9016 = getelementptr inbounds float* %tmp9015, i64 1
+  %tmp9017 = getelementptr inbounds float* %tmp9016, i64 1
+  %tmp9018 = getelementptr inbounds float* %tmp9017, i64 1
+  %tmp9019 = getelementptr inbounds float* %tmp9018, i64 1
+  %tmp9020 = getelementptr inbounds float* %tmp9019, i64 1
+  %tmp9021 = getelementptr inbounds float* %tmp9020, i64 1
+  %tmp9022 = getelementptr inbounds float* %tmp9021, i64 1
+  %tmp9023 = getelementptr inbounds float* %tmp9022, i64 1
+  %tmp9024 = getelementptr inbounds float* %tmp9023, i64 1
+  %tmp9025 = getelementptr inbounds float* %tmp9024, i64 1
+  %tmp9026 = getelementptr inbounds float* %tmp9025, i64 1
+  %tmp9027 = getelementptr inbounds float* %tmp9026, i64 1
+  %tmp9028 = getelementptr inbounds float* %tmp9027, i64 1
+  %tmp9029 = getelementptr inbounds float* %tmp9028, i64 1
+  %tmp9030 = getelementptr inbounds float* %tmp9029, i64 1
+  %tmp9031 = getelementptr inbounds float* %tmp9030, i64 1
+  %tmp9032 = getelementptr inbounds float* %tmp9031, i64 1
+  %tmp9033 = getelementptr inbounds float* %tmp9032, i64 1
+  %tmp9034 = getelementptr inbounds float* %tmp9033, i64 1
+  %tmp9035 = getelementptr inbounds float* %tmp9034, i64 1
+  %tmp9036 = getelementptr inbounds float* %tmp9035, i64 1
+  %tmp9037 = getelementptr inbounds float* %tmp9036, i64 1
+  %tmp9038 = getelementptr inbounds float* %tmp9037, i64 1
+  %tmp9039 = getelementptr inbounds float* %tmp9038, i64 1
+  %tmp9040 = getelementptr inbounds float* %tmp9039, i64 1
+  %tmp9041 = getelementptr inbounds float* %tmp9040, i64 1
+  %tmp9042 = getelementptr inbounds float* %tmp9041, i64 1
+  %tmp9043 = getelementptr inbounds float* %tmp9042, i64 1
+  %tmp9044 = getelementptr inbounds float* %tmp9043, i64 1
+  %tmp9045 = getelementptr inbounds float* %tmp9044, i64 1
+  %tmp9046 = getelementptr inbounds float* %tmp9045, i64 1
+  %tmp9047 = getelementptr inbounds float* %tmp9046, i64 1
+  %tmp9048 = getelementptr inbounds float* %tmp9047, i64 1
+  %tmp9049 = getelementptr inbounds float* %tmp9048, i64 1
+  %tmp9050 = getelementptr inbounds float* %tmp9049, i64 1
+  %tmp9051 = getelementptr inbounds float* %tmp9050, i64 1
+  %tmp9052 = getelementptr inbounds float* %tmp9051, i64 1
+  %tmp9053 = getelementptr inbounds float* %tmp9052, i64 1
+  %tmp9054 = getelementptr inbounds float* %tmp9053, i64 1
+  %tmp9055 = getelementptr inbounds float* %tmp9054, i64 1
+  %tmp9056 = getelementptr inbounds float* %tmp9055, i64 1
+  %tmp9057 = getelementptr inbounds float* %tmp9056, i64 1
+  %tmp9058 = getelementptr inbounds float* %tmp9057, i64 1
+  %tmp9059 = getelementptr inbounds float* %tmp9058, i64 1
+  %tmp9060 = getelementptr inbounds float* %tmp9059, i64 1
+  %tmp9061 = getelementptr inbounds float* %tmp9060, i64 1
+  %tmp9062 = getelementptr inbounds float* %tmp9061, i64 1
+  %tmp9063 = getelementptr inbounds float* %tmp9062, i64 1
+  %tmp9064 = getelementptr inbounds float* %tmp9063, i64 1
+  %tmp9065 = getelementptr inbounds float* %tmp9064, i64 1
+  %tmp9066 = getelementptr inbounds float* %tmp9065, i64 1
+  %tmp9067 = getelementptr inbounds float* %tmp9066, i64 1
+  %tmp9068 = getelementptr inbounds float* %tmp9067, i64 1
+  %tmp9069 = getelementptr inbounds float* %tmp9068, i64 1
+  %tmp9070 = getelementptr inbounds float* %tmp9069, i64 1
+  %tmp9071 = getelementptr inbounds float* %tmp9070, i64 1
+  %tmp9072 = getelementptr inbounds float* %tmp9071, i64 1
+  %tmp9073 = getelementptr inbounds float* %tmp9072, i64 1
+  %tmp9074 = getelementptr inbounds float* %tmp9073, i64 1
+  %tmp9075 = getelementptr inbounds float* %tmp9074, i64 1
+  %tmp9076 = getelementptr inbounds float* %tmp9075, i64 1
+  %tmp9077 = getelementptr inbounds float* %tmp9076, i64 1
+  %tmp9078 = getelementptr inbounds float* %tmp9077, i64 1
+  %tmp9079 = getelementptr inbounds float* %tmp9078, i64 1
+  %tmp9080 = getelementptr inbounds float* %tmp9079, i64 1
+  %tmp9081 = getelementptr inbounds float* %tmp9080, i64 1
+  %tmp9082 = getelementptr inbounds float* %tmp9081, i64 1
+  %tmp9083 = getelementptr inbounds float* %tmp9082, i64 1
+  %tmp9084 = getelementptr inbounds float* %tmp9083, i64 1
+  %tmp9085 = getelementptr inbounds float* %tmp9084, i64 1
+  %tmp9086 = getelementptr inbounds float* %tmp9085, i64 1
+  %tmp9087 = getelementptr inbounds float* %tmp9086, i64 1
+  %tmp9088 = getelementptr inbounds float* %tmp9087, i64 1
+  %tmp9089 = getelementptr inbounds float* %tmp9088, i64 1
+  %tmp9090 = getelementptr inbounds float* %tmp9089, i64 1
+  %tmp9091 = getelementptr inbounds float* %tmp9090, i64 1
+  %tmp9092 = getelementptr inbounds float* %tmp9091, i64 1
+  %tmp9093 = getelementptr inbounds float* %tmp9092, i64 1
+  %tmp9094 = getelementptr inbounds float* %tmp9093, i64 1
+  %tmp9095 = getelementptr inbounds float* %tmp9094, i64 1
+  %tmp9096 = getelementptr inbounds float* %tmp9095, i64 1
+  %tmp9097 = getelementptr inbounds float* %tmp9096, i64 1
+  %tmp9098 = getelementptr inbounds float* %tmp9097, i64 1
+  %tmp9099 = getelementptr inbounds float* %tmp9098, i64 1
+  %tmp9100 = getelementptr inbounds float* %tmp9099, i64 1
+  %tmp9101 = getelementptr inbounds float* %tmp9100, i64 1
+  %tmp9102 = getelementptr inbounds float* %tmp9101, i64 1
+  %tmp9103 = getelementptr inbounds float* %tmp9102, i64 1
+  %tmp9104 = getelementptr inbounds float* %tmp9103, i64 1
+  %tmp9105 = getelementptr inbounds float* %tmp9104, i64 1
+  %tmp9106 = getelementptr inbounds float* %tmp9105, i64 1
+  %tmp9107 = getelementptr inbounds float* %tmp9106, i64 1
+  %tmp9108 = getelementptr inbounds float* %tmp9107, i64 1
+  %tmp9109 = getelementptr inbounds float* %tmp9108, i64 1
+  %tmp9110 = getelementptr inbounds float* %tmp9109, i64 1
+  %tmp9111 = getelementptr inbounds float* %tmp9110, i64 1
+  %tmp9112 = getelementptr inbounds float* %tmp9111, i64 1
+  %tmp9113 = getelementptr inbounds float* %tmp9112, i64 1
+  %tmp9114 = getelementptr inbounds float* %tmp9113, i64 1
+  %tmp9115 = getelementptr inbounds float* %tmp9114, i64 1
+  %tmp9116 = getelementptr inbounds float* %tmp9115, i64 1
+  %tmp9117 = getelementptr inbounds float* %tmp9116, i64 1
+  %tmp9118 = getelementptr inbounds float* %tmp9117, i64 1
+  %tmp9119 = getelementptr inbounds float* %tmp9118, i64 1
+  %tmp9120 = getelementptr inbounds float* %tmp9119, i64 1
+  %tmp9121 = getelementptr inbounds float* %tmp9120, i64 1
+  %tmp9122 = getelementptr inbounds float* %tmp9121, i64 1
+  %tmp9123 = getelementptr inbounds float* %tmp9122, i64 1
+  %tmp9124 = getelementptr inbounds float* %tmp9123, i64 1
+  %tmp9125 = getelementptr inbounds float* %tmp9124, i64 1
+  %tmp9126 = getelementptr inbounds float* %tmp9125, i64 1
+  %tmp9127 = getelementptr inbounds float* %tmp9126, i64 1
+  %tmp9128 = getelementptr inbounds float* %tmp9127, i64 1
+  %tmp9129 = getelementptr inbounds float* %tmp9128, i64 1
+  %tmp9130 = getelementptr inbounds float* %tmp9129, i64 1
+  %tmp9131 = getelementptr inbounds float* %tmp9130, i64 1
+  %tmp9132 = getelementptr inbounds float* %tmp9131, i64 1
+  %tmp9133 = getelementptr inbounds float* %tmp9132, i64 1
+  %tmp9134 = getelementptr inbounds float* %tmp9133, i64 1
+  %tmp9135 = getelementptr inbounds float* %tmp9134, i64 1
+  %tmp9136 = getelementptr inbounds float* %tmp9135, i64 1
+  %tmp9137 = getelementptr inbounds float* %tmp9136, i64 1
+  %tmp9138 = getelementptr inbounds float* %tmp9137, i64 1
+  %tmp9139 = getelementptr inbounds float* %tmp9138, i64 1
+  %tmp9140 = getelementptr inbounds float* %tmp9139, i64 1
+  %tmp9141 = getelementptr inbounds float* %tmp9140, i64 1
+  %tmp9142 = getelementptr inbounds float* %tmp9141, i64 1
+  %tmp9143 = getelementptr inbounds float* %tmp9142, i64 1
+  %tmp9144 = getelementptr inbounds float* %tmp9143, i64 1
+  %tmp9145 = getelementptr inbounds float* %tmp9144, i64 1
+  %tmp9146 = getelementptr inbounds float* %tmp9145, i64 1
+  %tmp9147 = getelementptr inbounds float* %tmp9146, i64 1
+  %tmp9148 = getelementptr inbounds float* %tmp9147, i64 1
+  %tmp9149 = getelementptr inbounds float* %tmp9148, i64 1
+  %tmp9150 = getelementptr inbounds float* %tmp9149, i64 1
+  %tmp9151 = getelementptr inbounds float* %tmp9150, i64 1
+  %tmp9152 = getelementptr inbounds float* %tmp9151, i64 1
+  %tmp9153 = getelementptr inbounds float* %tmp9152, i64 1
+  %tmp9154 = getelementptr inbounds float* %tmp9153, i64 1
+  %tmp9155 = getelementptr inbounds float* %tmp9154, i64 1
+  %tmp9156 = getelementptr inbounds float* %tmp9155, i64 1
+  %tmp9157 = getelementptr inbounds float* %tmp9156, i64 1
+  %tmp9158 = getelementptr inbounds float* %tmp9157, i64 1
+  %tmp9159 = getelementptr inbounds float* %tmp9158, i64 1
+  %tmp9160 = getelementptr inbounds float* %tmp9159, i64 1
+  %tmp9161 = getelementptr inbounds float* %tmp9160, i64 1
+  %tmp9162 = getelementptr inbounds float* %tmp9161, i64 1
+  %tmp9163 = getelementptr inbounds float* %tmp9162, i64 1
+  %tmp9164 = getelementptr inbounds float* %tmp9163, i64 1
+  %tmp9165 = getelementptr inbounds float* %tmp9164, i64 1
+  %tmp9166 = getelementptr inbounds float* %tmp9165, i64 1
+  %tmp9167 = getelementptr inbounds float* %tmp9166, i64 1
+  %tmp9168 = getelementptr inbounds float* %tmp9167, i64 1
+  %tmp9169 = getelementptr inbounds float* %tmp9168, i64 1
+  %tmp9170 = getelementptr inbounds float* %tmp9169, i64 1
+  %tmp9171 = getelementptr inbounds float* %tmp9170, i64 1
+  %tmp9172 = getelementptr inbounds float* %tmp9171, i64 1
+  %tmp9173 = getelementptr inbounds float* %tmp9172, i64 1
+  %tmp9174 = getelementptr inbounds float* %tmp9173, i64 1
+  %tmp9175 = getelementptr inbounds float* %tmp9174, i64 1
+  %tmp9176 = getelementptr inbounds float* %tmp9175, i64 1
+  %tmp9177 = getelementptr inbounds float* %tmp9176, i64 1
+  %tmp9178 = getelementptr inbounds float* %tmp9177, i64 1
+  %tmp9179 = getelementptr inbounds float* %tmp9178, i64 1
+  %tmp9180 = getelementptr inbounds float* %tmp9179, i64 1
+  %tmp9181 = getelementptr inbounds float* %tmp9180, i64 1
+  %tmp9182 = getelementptr inbounds float* %tmp9181, i64 1
+  %tmp9183 = getelementptr inbounds float* %tmp9182, i64 1
+  %tmp9184 = getelementptr inbounds float* %tmp9183, i64 1
+  %tmp9185 = getelementptr inbounds float* %tmp9184, i64 1
+  %tmp9186 = getelementptr inbounds float* %tmp9185, i64 1
+  %tmp9187 = getelementptr inbounds float* %tmp9186, i64 1
+  %tmp9188 = getelementptr inbounds float* %tmp9187, i64 1
+  %tmp9189 = getelementptr inbounds float* %tmp9188, i64 1
+  %tmp9190 = getelementptr inbounds float* %tmp9189, i64 1
+  %tmp9191 = getelementptr inbounds float* %tmp9190, i64 1
+  %tmp9192 = getelementptr inbounds float* %tmp9191, i64 1
+  %tmp9193 = getelementptr inbounds float* %tmp9192, i64 1
+  %tmp9194 = getelementptr inbounds float* %tmp9193, i64 1
+  %tmp9195 = getelementptr inbounds float* %tmp9194, i64 1
+  %tmp9196 = getelementptr inbounds float* %tmp9195, i64 1
+  %tmp9197 = getelementptr inbounds float* %tmp9196, i64 1
+  %tmp9198 = getelementptr inbounds float* %tmp9197, i64 1
+  %tmp9199 = getelementptr inbounds float* %tmp9198, i64 1
+  %tmp9200 = getelementptr inbounds float* %tmp9199, i64 1
+  %tmp9201 = getelementptr inbounds float* %tmp9200, i64 1
+  %tmp9202 = getelementptr inbounds float* %tmp9201, i64 1
+  %tmp9203 = getelementptr inbounds float* %tmp9202, i64 1
+  %tmp9204 = getelementptr inbounds float* %tmp9203, i64 1
+  %tmp9205 = getelementptr inbounds float* %tmp9204, i64 1
+  %tmp9206 = getelementptr inbounds float* %tmp9205, i64 1
+  %tmp9207 = getelementptr inbounds float* %tmp9206, i64 1
+  %tmp9208 = getelementptr inbounds float* %tmp9207, i64 1
+  %tmp9209 = getelementptr inbounds float* %tmp9208, i64 1
+  %tmp9210 = getelementptr inbounds float* %tmp9209, i64 1
+  %tmp9211 = getelementptr inbounds float* %tmp9210, i64 1
+  %tmp9212 = getelementptr inbounds float* %tmp9211, i64 1
+  %tmp9213 = getelementptr inbounds float* %tmp9212, i64 1
+  %tmp9214 = getelementptr inbounds float* %tmp9213, i64 1
+  %tmp9215 = getelementptr inbounds float* %tmp9214, i64 1
+  %tmp9216 = getelementptr inbounds float* %tmp9215, i64 1
+  %tmp9217 = getelementptr inbounds float* %tmp9216, i64 1
+  %tmp9218 = getelementptr inbounds float* %tmp9217, i64 1
+  %tmp9219 = getelementptr inbounds float* %tmp9218, i64 1
+  %tmp9220 = getelementptr inbounds float* %tmp9219, i64 1
+  %tmp9221 = getelementptr inbounds float* %tmp9220, i64 1
+  %tmp9222 = getelementptr inbounds float* %tmp9221, i64 1
+  %tmp9223 = getelementptr inbounds float* %tmp9222, i64 1
+  %tmp9224 = getelementptr inbounds float* %tmp9223, i64 1
+  %tmp9225 = getelementptr inbounds float* %tmp9224, i64 1
+  %tmp9226 = getelementptr inbounds float* %tmp9225, i64 1
+  %tmp9227 = getelementptr inbounds float* %tmp9226, i64 1
+  %tmp9228 = getelementptr inbounds float* %tmp9227, i64 1
+  %tmp9229 = getelementptr inbounds float* %tmp9228, i64 1
+  %tmp9230 = getelementptr inbounds float* %tmp9229, i64 1
+  %tmp9231 = getelementptr inbounds float* %tmp9230, i64 1
+  %tmp9232 = getelementptr inbounds float* %tmp9231, i64 1
+  %tmp9233 = getelementptr inbounds float* %tmp9232, i64 1
+  %tmp9234 = getelementptr inbounds float* %tmp9233, i64 1
+  %tmp9235 = getelementptr inbounds float* %tmp9234, i64 1
+  %tmp9236 = getelementptr inbounds float* %tmp9235, i64 1
+  %tmp9237 = getelementptr inbounds float* %tmp9236, i64 1
+  %tmp9238 = getelementptr inbounds float* %tmp9237, i64 1
+  %tmp9239 = getelementptr inbounds float* %tmp9238, i64 1
+  %tmp9240 = getelementptr inbounds float* %tmp9239, i64 1
+  %tmp9241 = getelementptr inbounds float* %tmp9240, i64 1
+  %tmp9242 = getelementptr inbounds float* %tmp9241, i64 1
+  %tmp9243 = getelementptr inbounds float* %tmp9242, i64 1
+  %tmp9244 = getelementptr inbounds float* %tmp9243, i64 1
+  %tmp9245 = getelementptr inbounds float* %tmp9244, i64 1
+  %tmp9246 = getelementptr inbounds float* %tmp9245, i64 1
+  %tmp9247 = getelementptr inbounds float* %tmp9246, i64 1
+  %tmp9248 = getelementptr inbounds float* %tmp9247, i64 1
+  %tmp9249 = getelementptr inbounds float* %tmp9248, i64 1
+  %tmp9250 = getelementptr inbounds float* %tmp9249, i64 1
+  %tmp9251 = getelementptr inbounds float* %tmp9250, i64 1
+  %tmp9252 = getelementptr inbounds float* %tmp9251, i64 1
+  %tmp9253 = getelementptr inbounds float* %tmp9252, i64 1
+  %tmp9254 = getelementptr inbounds float* %tmp9253, i64 1
+  %tmp9255 = getelementptr inbounds float* %tmp9254, i64 1
+  %tmp9256 = getelementptr inbounds float* %tmp9255, i64 1
+  %tmp9257 = getelementptr inbounds float* %tmp9256, i64 1
+  %tmp9258 = getelementptr inbounds float* %tmp9257, i64 1
+  %tmp9259 = getelementptr inbounds float* %tmp9258, i64 1
+  %tmp9260 = getelementptr inbounds float* %tmp9259, i64 1
+  %tmp9261 = getelementptr inbounds float* %tmp9260, i64 1
+  %tmp9262 = getelementptr inbounds float* %tmp9261, i64 1
+  %tmp9263 = getelementptr inbounds float* %tmp9262, i64 1
+  %tmp9264 = getelementptr inbounds float* %tmp9263, i64 1
+  %tmp9265 = getelementptr inbounds float* %tmp9264, i64 1
+  %tmp9266 = getelementptr inbounds float* %tmp9265, i64 1
+  %tmp9267 = getelementptr inbounds float* %tmp9266, i64 1
+  %tmp9268 = getelementptr inbounds float* %tmp9267, i64 1
+  %tmp9269 = getelementptr inbounds float* %tmp9268, i64 1
+  %tmp9270 = getelementptr inbounds float* %tmp9269, i64 1
+  %tmp9271 = getelementptr inbounds float* %tmp9270, i64 1
+  %tmp9272 = getelementptr inbounds float* %tmp9271, i64 1
+  %tmp9273 = getelementptr inbounds float* %tmp9272, i64 1
+  %tmp9274 = getelementptr inbounds float* %tmp9273, i64 1
+  %tmp9275 = getelementptr inbounds float* %tmp9274, i64 1
+  %tmp9276 = getelementptr inbounds float* %tmp9275, i64 1
+  %tmp9277 = getelementptr inbounds float* %tmp9276, i64 1
+  %tmp9278 = getelementptr inbounds float* %tmp9277, i64 1
+  %tmp9279 = getelementptr inbounds float* %tmp9278, i64 1
+  %tmp9280 = getelementptr inbounds float* %tmp9279, i64 1
+  %tmp9281 = getelementptr inbounds float* %tmp9280, i64 1
+  %tmp9282 = getelementptr inbounds float* %tmp9281, i64 1
+  %tmp9283 = getelementptr inbounds float* %tmp9282, i64 1
+  %tmp9284 = getelementptr inbounds float* %tmp9283, i64 1
+  %tmp9285 = getelementptr inbounds float* %tmp9284, i64 1
+  %tmp9286 = getelementptr inbounds float* %tmp9285, i64 1
+  %tmp9287 = getelementptr inbounds float* %tmp9286, i64 1
+  %tmp9288 = getelementptr inbounds float* %tmp9287, i64 1
+  %tmp9289 = getelementptr inbounds float* %tmp9288, i64 1
+  %tmp9290 = getelementptr inbounds float* %tmp9289, i64 1
+  %tmp9291 = getelementptr inbounds float* %tmp9290, i64 1
+  %tmp9292 = getelementptr inbounds float* %tmp9291, i64 1
+  %tmp9293 = getelementptr inbounds float* %tmp9292, i64 1
+  %tmp9294 = getelementptr inbounds float* %tmp9293, i64 1
+  %tmp9295 = getelementptr inbounds float* %tmp9294, i64 1
+  %tmp9296 = getelementptr inbounds float* %tmp9295, i64 1
+  %tmp9297 = getelementptr inbounds float* %tmp9296, i64 1
+  %tmp9298 = getelementptr inbounds float* %tmp9297, i64 1
+  %tmp9299 = getelementptr inbounds float* %tmp9298, i64 1
+  %tmp9300 = getelementptr inbounds float* %tmp9299, i64 1
+  %tmp9301 = getelementptr inbounds float* %tmp9300, i64 1
+  %tmp9302 = getelementptr inbounds float* %tmp9301, i64 1
+  %tmp9303 = getelementptr inbounds float* %tmp9302, i64 1
+  %tmp9304 = getelementptr inbounds float* %tmp9303, i64 1
+  %tmp9305 = getelementptr inbounds float* %tmp9304, i64 1
+  %tmp9306 = getelementptr inbounds float* %tmp9305, i64 1
+  %tmp9307 = getelementptr inbounds float* %tmp9306, i64 1
+  %tmp9308 = getelementptr inbounds float* %tmp9307, i64 1
+  %tmp9309 = getelementptr inbounds float* %tmp9308, i64 1
+  %tmp9310 = getelementptr inbounds float* %tmp9309, i64 1
+  %tmp9311 = getelementptr inbounds float* %tmp9310, i64 1
+  %tmp9312 = getelementptr inbounds float* %tmp9311, i64 1
+  %tmp9313 = getelementptr inbounds float* %tmp9312, i64 1
+  %tmp9314 = getelementptr inbounds float* %tmp9313, i64 1
+  %tmp9315 = getelementptr inbounds float* %tmp9314, i64 1
+  %tmp9316 = getelementptr inbounds float* %tmp9315, i64 1
+  %tmp9317 = getelementptr inbounds float* %tmp9316, i64 1
+  %tmp9318 = getelementptr inbounds float* %tmp9317, i64 1
+  %tmp9319 = getelementptr inbounds float* %tmp9318, i64 1
+  %tmp9320 = getelementptr inbounds float* %tmp9319, i64 1
+  %tmp9321 = getelementptr inbounds float* %tmp9320, i64 1
+  %tmp9322 = getelementptr inbounds float* %tmp9321, i64 1
+  %tmp9323 = getelementptr inbounds float* %tmp9322, i64 1
+  %tmp9324 = getelementptr inbounds float* %tmp9323, i64 1
+  %tmp9325 = getelementptr inbounds float* %tmp9324, i64 1
+  %tmp9326 = getelementptr inbounds float* %tmp9325, i64 1
+  %tmp9327 = getelementptr inbounds float* %tmp9326, i64 1
+  %tmp9328 = getelementptr inbounds float* %tmp9327, i64 1
+  %tmp9329 = getelementptr inbounds float* %tmp9328, i64 1
+  %tmp9330 = getelementptr inbounds float* %tmp9329, i64 1
+  %tmp9331 = getelementptr inbounds float* %tmp9330, i64 1
+  %tmp9332 = getelementptr inbounds float* %tmp9331, i64 1
+  %tmp9333 = getelementptr inbounds float* %tmp9332, i64 1
+  %tmp9334 = getelementptr inbounds float* %tmp9333, i64 1
+  %tmp9335 = getelementptr inbounds float* %tmp9334, i64 1
+  %tmp9336 = getelementptr inbounds float* %tmp9335, i64 1
+  %tmp9337 = getelementptr inbounds float* %tmp9336, i64 1
+  %tmp9338 = getelementptr inbounds float* %tmp9337, i64 1
+  %tmp9339 = getelementptr inbounds float* %tmp9338, i64 1
+  %tmp9340 = getelementptr inbounds float* %tmp9339, i64 1
+  %tmp9341 = getelementptr inbounds float* %tmp9340, i64 1
+  %tmp9342 = getelementptr inbounds float* %tmp9341, i64 1
+  %tmp9343 = getelementptr inbounds float* %tmp9342, i64 1
+  %tmp9344 = getelementptr inbounds float* %tmp9343, i64 1
+  %tmp9345 = getelementptr inbounds float* %tmp9344, i64 1
+  %tmp9346 = getelementptr inbounds float* %tmp9345, i64 1
+  %tmp9347 = getelementptr inbounds float* %tmp9346, i64 1
+  %tmp9348 = getelementptr inbounds float* %tmp9347, i64 1
+  %tmp9349 = getelementptr inbounds float* %tmp9348, i64 1
+  %tmp9350 = getelementptr inbounds float* %tmp9349, i64 1
+  %tmp9351 = getelementptr inbounds float* %tmp9350, i64 1
+  %tmp9352 = getelementptr inbounds float* %tmp9351, i64 1
+  %tmp9353 = getelementptr inbounds float* %tmp9352, i64 1
+  %tmp9354 = getelementptr inbounds float* %tmp9353, i64 1
+  %tmp9355 = getelementptr inbounds float* %tmp9354, i64 1
+  %tmp9356 = getelementptr inbounds float* %tmp9355, i64 1
+  %tmp9357 = getelementptr inbounds float* %tmp9356, i64 1
+  %tmp9358 = getelementptr inbounds float* %tmp9357, i64 1
+  %tmp9359 = getelementptr inbounds float* %tmp9358, i64 1
+  %tmp9360 = getelementptr inbounds float* %tmp9359, i64 1
+  %tmp9361 = getelementptr inbounds float* %tmp9360, i64 1
+  %tmp9362 = getelementptr inbounds float* %tmp9361, i64 1
+  %tmp9363 = getelementptr inbounds float* %tmp9362, i64 1
+  %tmp9364 = getelementptr inbounds float* %tmp9363, i64 1
+  %tmp9365 = getelementptr inbounds float* %tmp9364, i64 1
+  %tmp9366 = getelementptr inbounds float* %tmp9365, i64 1
+  %tmp9367 = getelementptr inbounds float* %tmp9366, i64 1
+  %tmp9368 = getelementptr inbounds float* %tmp9367, i64 1
+  %tmp9369 = getelementptr inbounds float* %tmp9368, i64 1
+  %tmp9370 = getelementptr inbounds float* %tmp9369, i64 1
+  %tmp9371 = getelementptr inbounds float* %tmp9370, i64 1
+  %tmp9372 = getelementptr inbounds float* %tmp9371, i64 1
+  %tmp9373 = getelementptr inbounds float* %tmp9372, i64 1
+  %tmp9374 = getelementptr inbounds float* %tmp9373, i64 1
+  %tmp9375 = getelementptr inbounds float* %tmp9374, i64 1
+  %tmp9376 = getelementptr inbounds float* %tmp9375, i64 1
+  %tmp9377 = getelementptr inbounds float* %tmp9376, i64 1
+  %tmp9378 = getelementptr inbounds float* %tmp9377, i64 1
+  %tmp9379 = getelementptr inbounds float* %tmp9378, i64 1
+  %tmp9380 = getelementptr inbounds float* %tmp9379, i64 1
+  %tmp9381 = getelementptr inbounds float* %tmp9380, i64 1
+  %tmp9382 = getelementptr inbounds float* %tmp9381, i64 1
+  %tmp9383 = getelementptr inbounds float* %tmp9382, i64 1
+  %tmp9384 = getelementptr inbounds float* %tmp9383, i64 1
+  %tmp9385 = getelementptr inbounds float* %tmp9384, i64 1
+  %tmp9386 = getelementptr inbounds float* %tmp9385, i64 1
+  %tmp9387 = getelementptr inbounds float* %tmp9386, i64 1
+  %tmp9388 = getelementptr inbounds float* %tmp9387, i64 1
+  %tmp9389 = getelementptr inbounds float* %tmp9388, i64 1
+  %tmp9390 = getelementptr inbounds float* %tmp9389, i64 1
+  %tmp9391 = getelementptr inbounds float* %tmp9390, i64 1
+  %tmp9392 = getelementptr inbounds float* %tmp9391, i64 1
+  %tmp9393 = getelementptr inbounds float* %tmp9392, i64 1
+  %tmp9394 = getelementptr inbounds float* %tmp9393, i64 1
+  %tmp9395 = getelementptr inbounds float* %tmp9394, i64 1
+  %tmp9396 = getelementptr inbounds float* %tmp9395, i64 1
+  %tmp9397 = getelementptr inbounds float* %tmp9396, i64 1
+  %tmp9398 = getelementptr inbounds float* %tmp9397, i64 1
+  %tmp9399 = getelementptr inbounds float* %tmp9398, i64 1
+  %tmp9400 = getelementptr inbounds float* %tmp9399, i64 1
+  %tmp9401 = getelementptr inbounds float* %tmp9400, i64 1
+  %tmp9402 = getelementptr inbounds float* %tmp9401, i64 1
+  %tmp9403 = getelementptr inbounds float* %tmp9402, i64 1
+  %tmp9404 = getelementptr inbounds float* %tmp9403, i64 1
+  %tmp9405 = getelementptr inbounds float* %tmp9404, i64 1
+  %tmp9406 = getelementptr inbounds float* %tmp9405, i64 1
+  %tmp9407 = getelementptr inbounds float* %tmp9406, i64 1
+  %tmp9408 = getelementptr inbounds float* %tmp9407, i64 1
+  %tmp9409 = getelementptr inbounds float* %tmp9408, i64 1
+  %tmp9410 = getelementptr inbounds float* %tmp9409, i64 1
+  %tmp9411 = getelementptr inbounds float* %tmp9410, i64 1
+  %tmp9412 = getelementptr inbounds float* %tmp9411, i64 1
+  %tmp9413 = getelementptr inbounds float* %tmp9412, i64 1
+  %tmp9414 = getelementptr inbounds float* %tmp9413, i64 1
+  %tmp9415 = getelementptr inbounds float* %tmp9414, i64 1
+  %tmp9416 = getelementptr inbounds float* %tmp9415, i64 1
+  %tmp9417 = getelementptr inbounds float* %tmp9416, i64 1
+  %tmp9418 = getelementptr inbounds float* %tmp9417, i64 1
+  %tmp9419 = getelementptr inbounds float* %tmp9418, i64 1
+  %tmp9420 = getelementptr inbounds float* %tmp9419, i64 1
+  %tmp9421 = getelementptr inbounds float* %tmp9420, i64 1
+  %tmp9422 = getelementptr inbounds float* %tmp9421, i64 1
+  %tmp9423 = getelementptr inbounds float* %tmp9422, i64 1
+  %tmp9424 = getelementptr inbounds float* %tmp9423, i64 1
+  %tmp9425 = getelementptr inbounds float* %tmp9424, i64 1
+  %tmp9426 = getelementptr inbounds float* %tmp9425, i64 1
+  %tmp9427 = getelementptr inbounds float* %tmp9426, i64 1
+  %tmp9428 = getelementptr inbounds float* %tmp9427, i64 1
+  %tmp9429 = getelementptr inbounds float* %tmp9428, i64 1
+  %tmp9430 = getelementptr inbounds float* %tmp9429, i64 1
+  %tmp9431 = getelementptr inbounds float* %tmp9430, i64 1
+  %tmp9432 = getelementptr inbounds float* %tmp9431, i64 1
+  %tmp9433 = getelementptr inbounds float* %tmp9432, i64 1
+  %tmp9434 = getelementptr inbounds float* %tmp9433, i64 1
+  %tmp9435 = getelementptr inbounds float* %tmp9434, i64 1
+  %tmp9436 = getelementptr inbounds float* %tmp9435, i64 1
+  %tmp9437 = getelementptr inbounds float* %tmp9436, i64 1
+  %tmp9438 = getelementptr inbounds float* %tmp9437, i64 1
+  %tmp9439 = getelementptr inbounds float* %tmp9438, i64 1
+  %tmp9440 = getelementptr inbounds float* %tmp9439, i64 1
+  %tmp9441 = getelementptr inbounds float* %tmp9440, i64 1
+  %tmp9442 = getelementptr inbounds float* %tmp9441, i64 1
+  %tmp9443 = getelementptr inbounds float* %tmp9442, i64 1
+  %tmp9444 = getelementptr inbounds float* %tmp9443, i64 1
+  %tmp9445 = getelementptr inbounds float* %tmp9444, i64 1
+  %tmp9446 = getelementptr inbounds float* %tmp9445, i64 1
+  %tmp9447 = getelementptr inbounds float* %tmp9446, i64 1
+  %tmp9448 = getelementptr inbounds float* %tmp9447, i64 1
+  %tmp9449 = getelementptr inbounds float* %tmp9448, i64 1
+  %tmp9450 = getelementptr inbounds float* %tmp9449, i64 1
+  %tmp9451 = getelementptr inbounds float* %tmp9450, i64 1
+  %tmp9452 = getelementptr inbounds float* %tmp9451, i64 1
+  %tmp9453 = getelementptr inbounds float* %tmp9452, i64 1
+  %tmp9454 = getelementptr inbounds float* %tmp9453, i64 1
+  %tmp9455 = getelementptr inbounds float* %tmp9454, i64 1
+  %tmp9456 = getelementptr inbounds float* %tmp9455, i64 1
+  %tmp9457 = getelementptr inbounds float* %tmp9456, i64 1
+  %tmp9458 = getelementptr inbounds float* %tmp9457, i64 1
+  %tmp9459 = getelementptr inbounds float* %tmp9458, i64 1
+  %tmp9460 = getelementptr inbounds float* %tmp9459, i64 1
+  %tmp9461 = getelementptr inbounds float* %tmp9460, i64 1
+  %tmp9462 = getelementptr inbounds float* %tmp9461, i64 1
+  %tmp9463 = getelementptr inbounds float* %tmp9462, i64 1
+  %tmp9464 = getelementptr inbounds float* %tmp9463, i64 1
+  %tmp9465 = getelementptr inbounds float* %tmp9464, i64 1
+  %tmp9466 = getelementptr inbounds float* %tmp9465, i64 1
+  %tmp9467 = getelementptr inbounds float* %tmp9466, i64 1
+  %tmp9468 = getelementptr inbounds float* %tmp9467, i64 1
+  %tmp9469 = getelementptr inbounds float* %tmp9468, i64 1
+  %tmp9470 = getelementptr inbounds float* %tmp9469, i64 1
+  %tmp9471 = getelementptr inbounds float* %tmp9470, i64 1
+  %tmp9472 = getelementptr inbounds float* %tmp9471, i64 1
+  %tmp9473 = getelementptr inbounds float* %tmp9472, i64 1
+  %tmp9474 = getelementptr inbounds float* %tmp9473, i64 1
+  %tmp9475 = getelementptr inbounds float* %tmp9474, i64 1
+  %tmp9476 = getelementptr inbounds float* %tmp9475, i64 1
+  %tmp9477 = getelementptr inbounds float* %tmp9476, i64 1
+  %tmp9478 = getelementptr inbounds float* %tmp9477, i64 1
+  %tmp9479 = getelementptr inbounds float* %tmp9478, i64 1
+  %tmp9480 = getelementptr inbounds float* %tmp9479, i64 1
+  %tmp9481 = getelementptr inbounds float* %tmp9480, i64 1
+  %tmp9482 = getelementptr inbounds float* %tmp9481, i64 1
+  %tmp9483 = getelementptr inbounds float* %tmp9482, i64 1
+  %tmp9484 = getelementptr inbounds float* %tmp9483, i64 1
+  %tmp9485 = getelementptr inbounds float* %tmp9484, i64 1
+  %tmp9486 = getelementptr inbounds float* %tmp9485, i64 1
+  %tmp9487 = getelementptr inbounds float* %tmp9486, i64 1
+  %tmp9488 = getelementptr inbounds float* %tmp9487, i64 1
+  %tmp9489 = getelementptr inbounds float* %tmp9488, i64 1
+  %tmp9490 = getelementptr inbounds float* %tmp9489, i64 1
+  %tmp9491 = getelementptr inbounds float* %tmp9490, i64 1
+  %tmp9492 = getelementptr inbounds float* %tmp9491, i64 1
+  %tmp9493 = getelementptr inbounds float* %tmp9492, i64 1
+  %tmp9494 = getelementptr inbounds float* %tmp9493, i64 1
+  %tmp9495 = getelementptr inbounds float* %tmp9494, i64 1
+  %tmp9496 = getelementptr inbounds float* %tmp9495, i64 1
+  %tmp9497 = getelementptr inbounds float* %tmp9496, i64 1
+  %tmp9498 = getelementptr inbounds float* %tmp9497, i64 1
+  %tmp9499 = getelementptr inbounds float* %tmp9498, i64 1
+  %tmp9500 = getelementptr inbounds float* %tmp9499, i64 1
+  %tmp9501 = getelementptr inbounds float* %tmp9500, i64 1
+  %tmp9502 = getelementptr inbounds float* %tmp9501, i64 1
+  %tmp9503 = getelementptr inbounds float* %tmp9502, i64 1
+  %tmp9504 = getelementptr inbounds float* %tmp9503, i64 1
+  %tmp9505 = getelementptr inbounds float* %tmp9504, i64 1
+  %tmp9506 = getelementptr inbounds float* %tmp9505, i64 1
+  %tmp9507 = getelementptr inbounds float* %tmp9506, i64 1
+  %tmp9508 = getelementptr inbounds float* %tmp9507, i64 1
+  %tmp9509 = getelementptr inbounds float* %tmp9508, i64 1
+  %tmp9510 = getelementptr inbounds float* %tmp9509, i64 1
+  %tmp9511 = getelementptr inbounds float* %tmp9510, i64 1
+  %tmp9512 = getelementptr inbounds float* %tmp9511, i64 1
+  %tmp9513 = getelementptr inbounds float* %tmp9512, i64 1
+  %tmp9514 = getelementptr inbounds float* %tmp9513, i64 1
+  %tmp9515 = getelementptr inbounds float* %tmp9514, i64 1
+  %tmp9516 = getelementptr inbounds float* %tmp9515, i64 1
+  %tmp9517 = getelementptr inbounds float* %tmp9516, i64 1
+  %tmp9518 = getelementptr inbounds float* %tmp9517, i64 1
+  %tmp9519 = getelementptr inbounds float* %tmp9518, i64 1
+  %tmp9520 = getelementptr inbounds float* %tmp9519, i64 1
+  %tmp9521 = getelementptr inbounds float* %tmp9520, i64 1
+  %tmp9522 = getelementptr inbounds float* %tmp9521, i64 1
+  %tmp9523 = getelementptr inbounds float* %tmp9522, i64 1
+  %tmp9524 = getelementptr inbounds float* %tmp9523, i64 1
+  %tmp9525 = getelementptr inbounds float* %tmp9524, i64 1
+  %tmp9526 = getelementptr inbounds float* %tmp9525, i64 1
+  %tmp9527 = getelementptr inbounds float* %tmp9526, i64 1
+  %tmp9528 = getelementptr inbounds float* %tmp9527, i64 1
+  %tmp9529 = getelementptr inbounds float* %tmp9528, i64 1
+  %tmp9530 = getelementptr inbounds float* %tmp9529, i64 1
+  %tmp9531 = getelementptr inbounds float* %tmp9530, i64 1
+  %tmp9532 = getelementptr inbounds float* %tmp9531, i64 1
+  %tmp9533 = getelementptr inbounds float* %tmp9532, i64 1
+  %tmp9534 = getelementptr inbounds float* %tmp9533, i64 1
+  %tmp9535 = getelementptr inbounds float* %tmp9534, i64 1
+  %tmp9536 = getelementptr inbounds float* %tmp9535, i64 1
+  %tmp9537 = getelementptr inbounds float* %tmp9536, i64 1
+  %tmp9538 = getelementptr inbounds float* %tmp9537, i64 1
+  %tmp9539 = getelementptr inbounds float* %tmp9538, i64 1
+  %tmp9540 = getelementptr inbounds float* %tmp9539, i64 1
+  %tmp9541 = getelementptr inbounds float* %tmp9540, i64 1
+  %tmp9542 = getelementptr inbounds float* %tmp9541, i64 1
+  %tmp9543 = getelementptr inbounds float* %tmp9542, i64 1
+  %tmp9544 = getelementptr inbounds float* %tmp9543, i64 1
+  %tmp9545 = getelementptr inbounds float* %tmp9544, i64 1
+  %tmp9546 = getelementptr inbounds float* %tmp9545, i64 1
+  %tmp9547 = getelementptr inbounds float* %tmp9546, i64 1
+  %tmp9548 = getelementptr inbounds float* %tmp9547, i64 1
+  %tmp9549 = getelementptr inbounds float* %tmp9548, i64 1
+  %tmp9550 = getelementptr inbounds float* %tmp9549, i64 1
+  %tmp9551 = getelementptr inbounds float* %tmp9550, i64 1
+  %tmp9552 = getelementptr inbounds float* %tmp9551, i64 1
+  %tmp9553 = getelementptr inbounds float* %tmp9552, i64 1
+  %tmp9554 = getelementptr inbounds float* %tmp9553, i64 1
+  %tmp9555 = getelementptr inbounds float* %tmp9554, i64 1
+  %tmp9556 = getelementptr inbounds float* %tmp9555, i64 1
+  %tmp9557 = getelementptr inbounds float* %tmp9556, i64 1
+  %tmp9558 = getelementptr inbounds float* %tmp9557, i64 1
+  %tmp9559 = getelementptr inbounds float* %tmp9558, i64 1
+  %tmp9560 = getelementptr inbounds float* %tmp9559, i64 1
+  %tmp9561 = getelementptr inbounds float* %tmp9560, i64 1
+  %tmp9562 = getelementptr inbounds float* %tmp9561, i64 1
+  %tmp9563 = getelementptr inbounds float* %tmp9562, i64 1
+  %tmp9564 = getelementptr inbounds float* %tmp9563, i64 1
+  %tmp9565 = getelementptr inbounds float* %tmp9564, i64 1
+  %tmp9566 = getelementptr inbounds float* %tmp9565, i64 1
+  %tmp9567 = getelementptr inbounds float* %tmp9566, i64 1
+  %tmp9568 = getelementptr inbounds float* %tmp9567, i64 1
+  %tmp9569 = getelementptr inbounds float* %tmp9568, i64 1
+  %tmp9570 = getelementptr inbounds float* %tmp9569, i64 1
+  %tmp9571 = getelementptr inbounds float* %tmp9570, i64 1
+  %tmp9572 = getelementptr inbounds float* %tmp9571, i64 1
+  %tmp9573 = getelementptr inbounds float* %tmp9572, i64 1
+  %tmp9574 = getelementptr inbounds float* %tmp9573, i64 1
+  %tmp9575 = getelementptr inbounds float* %tmp9574, i64 1
+  %tmp9576 = getelementptr inbounds float* %tmp9575, i64 1
+  %tmp9577 = getelementptr inbounds float* %tmp9576, i64 1
+  %tmp9578 = getelementptr inbounds float* %tmp9577, i64 1
+  %tmp9579 = getelementptr inbounds float* %tmp9578, i64 1
+  %tmp9580 = getelementptr inbounds float* %tmp9579, i64 1
+  %tmp9581 = getelementptr inbounds float* %tmp9580, i64 1
+  %tmp9582 = getelementptr inbounds float* %tmp9581, i64 1
+  %tmp9583 = getelementptr inbounds float* %tmp9582, i64 1
+  %tmp9584 = getelementptr inbounds float* %tmp9583, i64 1
+  %tmp9585 = getelementptr inbounds float* %tmp9584, i64 1
+  %tmp9586 = getelementptr inbounds float* %tmp9585, i64 1
+  %tmp9587 = getelementptr inbounds float* %tmp9586, i64 1
+  %tmp9588 = getelementptr inbounds float* %tmp9587, i64 1
+  %tmp9589 = getelementptr inbounds float* %tmp9588, i64 1
+  %tmp9590 = getelementptr inbounds float* %tmp9589, i64 1
+  %tmp9591 = getelementptr inbounds float* %tmp9590, i64 1
+  %tmp9592 = getelementptr inbounds float* %tmp9591, i64 1
+  %tmp9593 = getelementptr inbounds float* %tmp9592, i64 1
+  %tmp9594 = getelementptr inbounds float* %tmp9593, i64 1
+  %tmp9595 = getelementptr inbounds float* %tmp9594, i64 1
+  %tmp9596 = getelementptr inbounds float* %tmp9595, i64 1
+  %tmp9597 = getelementptr inbounds float* %tmp9596, i64 1
+  %tmp9598 = getelementptr inbounds float* %tmp9597, i64 1
+  %tmp9599 = getelementptr inbounds float* %tmp9598, i64 1
+  %tmp9600 = getelementptr inbounds float* %tmp9599, i64 1
+  %tmp9601 = getelementptr inbounds float* %tmp9600, i64 1
+  %tmp9602 = getelementptr inbounds float* %tmp9601, i64 1
+  %tmp9603 = getelementptr inbounds float* %tmp9602, i64 1
+  %tmp9604 = getelementptr inbounds float* %tmp9603, i64 1
+  %tmp9605 = getelementptr inbounds float* %tmp9604, i64 1
+  %tmp9606 = getelementptr inbounds float* %tmp9605, i64 1
+  %tmp9607 = getelementptr inbounds float* %tmp9606, i64 1
+  %tmp9608 = getelementptr inbounds float* %tmp9607, i64 1
+  %tmp9609 = getelementptr inbounds float* %tmp9608, i64 1
+  %tmp9610 = getelementptr inbounds float* %tmp9609, i64 1
+  %tmp9611 = getelementptr inbounds float* %tmp9610, i64 1
+  %tmp9612 = getelementptr inbounds float* %tmp9611, i64 1
+  %tmp9613 = getelementptr inbounds float* %tmp9612, i64 1
+  %tmp9614 = getelementptr inbounds float* %tmp9613, i64 1
+  %tmp9615 = getelementptr inbounds float* %tmp9614, i64 1
+  %tmp9616 = getelementptr inbounds float* %tmp9615, i64 1
+  %tmp9617 = getelementptr inbounds float* %tmp9616, i64 1
+  %tmp9618 = getelementptr inbounds float* %tmp9617, i64 1
+  %tmp9619 = getelementptr inbounds float* %tmp9618, i64 1
+  %tmp9620 = getelementptr inbounds float* %tmp9619, i64 1
+  %tmp9621 = getelementptr inbounds float* %tmp9620, i64 1
+  %tmp9622 = getelementptr inbounds float* %tmp9621, i64 1
+  %tmp9623 = getelementptr inbounds float* %tmp9622, i64 1
+  %tmp9624 = getelementptr inbounds float* %tmp9623, i64 1
+  %tmp9625 = getelementptr inbounds float* %tmp9624, i64 1
+  %tmp9626 = getelementptr inbounds float* %tmp9625, i64 1
+  %tmp9627 = getelementptr inbounds float* %tmp9626, i64 1
+  %tmp9628 = getelementptr inbounds float* %tmp9627, i64 1
+  %tmp9629 = getelementptr inbounds float* %tmp9628, i64 1
+  %tmp9630 = getelementptr inbounds float* %tmp9629, i64 1
+  %tmp9631 = getelementptr inbounds float* %tmp9630, i64 1
+  %tmp9632 = getelementptr inbounds float* %tmp9631, i64 1
+  %tmp9633 = getelementptr inbounds float* %tmp9632, i64 1
+  %tmp9634 = getelementptr inbounds float* %tmp9633, i64 1
+  %tmp9635 = getelementptr inbounds float* %tmp9634, i64 1
+  %tmp9636 = getelementptr inbounds float* %tmp9635, i64 1
+  %tmp9637 = getelementptr inbounds float* %tmp9636, i64 1
+  %tmp9638 = getelementptr inbounds float* %tmp9637, i64 1
+  %tmp9639 = getelementptr inbounds float* %tmp9638, i64 1
+  %tmp9640 = getelementptr inbounds float* %tmp9639, i64 1
+  %tmp9641 = getelementptr inbounds float* %tmp9640, i64 1
+  %tmp9642 = getelementptr inbounds float* %tmp9641, i64 1
+  %tmp9643 = getelementptr inbounds float* %tmp9642, i64 1
+  %tmp9644 = getelementptr inbounds float* %tmp9643, i64 1
+  %tmp9645 = getelementptr inbounds float* %tmp9644, i64 1
+  %tmp9646 = getelementptr inbounds float* %tmp9645, i64 1
+  %tmp9647 = getelementptr inbounds float* %tmp9646, i64 1
+  %tmp9648 = getelementptr inbounds float* %tmp9647, i64 1
+  %tmp9649 = getelementptr inbounds float* %tmp9648, i64 1
+  %tmp9650 = getelementptr inbounds float* %tmp9649, i64 1
+  %tmp9651 = getelementptr inbounds float* %tmp9650, i64 1
+  %tmp9652 = getelementptr inbounds float* %tmp9651, i64 1
+  %tmp9653 = getelementptr inbounds float* %tmp9652, i64 1
+  %tmp9654 = getelementptr inbounds float* %tmp9653, i64 1
+  %tmp9655 = getelementptr inbounds float* %tmp9654, i64 1
+  %tmp9656 = getelementptr inbounds float* %tmp9655, i64 1
+  %tmp9657 = getelementptr inbounds float* %tmp9656, i64 1
+  %tmp9658 = getelementptr inbounds float* %tmp9657, i64 1
+  %tmp9659 = getelementptr inbounds float* %tmp9658, i64 1
+  %tmp9660 = getelementptr inbounds float* %tmp9659, i64 1
+  %tmp9661 = getelementptr inbounds float* %tmp9660, i64 1
+  %tmp9662 = getelementptr inbounds float* %tmp9661, i64 1
+  %tmp9663 = getelementptr inbounds float* %tmp9662, i64 1
+  %tmp9664 = getelementptr inbounds float* %tmp9663, i64 1
+  %tmp9665 = getelementptr inbounds float* %tmp9664, i64 1
+  %tmp9666 = getelementptr inbounds float* %tmp9665, i64 1
+  %tmp9667 = getelementptr inbounds float* %tmp9666, i64 1
+  %tmp9668 = getelementptr inbounds float* %tmp9667, i64 1
+  %tmp9669 = getelementptr inbounds float* %tmp9668, i64 1
+  %tmp9670 = getelementptr inbounds float* %tmp9669, i64 1
+  %tmp9671 = getelementptr inbounds float* %tmp9670, i64 1
+  %tmp9672 = getelementptr inbounds float* %tmp9671, i64 1
+  %tmp9673 = getelementptr inbounds float* %tmp9672, i64 1
+  %tmp9674 = getelementptr inbounds float* %tmp9673, i64 1
+  %tmp9675 = getelementptr inbounds float* %tmp9674, i64 1
+  %tmp9676 = getelementptr inbounds float* %tmp9675, i64 1
+  %tmp9677 = getelementptr inbounds float* %tmp9676, i64 1
+  %tmp9678 = getelementptr inbounds float* %tmp9677, i64 1
+  %tmp9679 = getelementptr inbounds float* %tmp9678, i64 1
+  %tmp9680 = getelementptr inbounds float* %tmp9679, i64 1
+  %tmp9681 = getelementptr inbounds float* %tmp9680, i64 1
+  %tmp9682 = getelementptr inbounds float* %tmp9681, i64 1
+  %tmp9683 = getelementptr inbounds float* %tmp9682, i64 1
+  %tmp9684 = getelementptr inbounds float* %tmp9683, i64 1
+  %tmp9685 = getelementptr inbounds float* %tmp9684, i64 1
+  %tmp9686 = getelementptr inbounds float* %tmp9685, i64 1
+  %tmp9687 = getelementptr inbounds float* %tmp9686, i64 1
+  %tmp9688 = getelementptr inbounds float* %tmp9687, i64 1
+  %tmp9689 = getelementptr inbounds float* %tmp9688, i64 1
+  %tmp9690 = getelementptr inbounds float* %tmp9689, i64 1
+  %tmp9691 = getelementptr inbounds float* %tmp9690, i64 1
+  %tmp9692 = getelementptr inbounds float* %tmp9691, i64 1
+  %tmp9693 = getelementptr inbounds float* %tmp9692, i64 1
+  %tmp9694 = getelementptr inbounds float* %tmp9693, i64 1
+  %tmp9695 = getelementptr inbounds float* %tmp9694, i64 1
+  %tmp9696 = getelementptr inbounds float* %tmp9695, i64 1
+  %tmp9697 = getelementptr inbounds float* %tmp9696, i64 1
+  %tmp9698 = getelementptr inbounds float* %tmp9697, i64 1
+  %tmp9699 = getelementptr inbounds float* %tmp9698, i64 1
+  %tmp9700 = getelementptr inbounds float* %tmp9699, i64 1
+  %tmp9701 = getelementptr inbounds float* %tmp9700, i64 1
+  %tmp9702 = getelementptr inbounds float* %tmp9701, i64 1
+  %tmp9703 = getelementptr inbounds float* %tmp9702, i64 1
+  %tmp9704 = getelementptr inbounds float* %tmp9703, i64 1
+  %tmp9705 = getelementptr inbounds float* %tmp9704, i64 1
+  %tmp9706 = getelementptr inbounds float* %tmp9705, i64 1
+  %tmp9707 = getelementptr inbounds float* %tmp9706, i64 1
+  %tmp9708 = getelementptr inbounds float* %tmp9707, i64 1
+  %tmp9709 = getelementptr inbounds float* %tmp9708, i64 1
+  %tmp9710 = getelementptr inbounds float* %tmp9709, i64 1
+  %tmp9711 = getelementptr inbounds float* %tmp9710, i64 1
+  %tmp9712 = getelementptr inbounds float* %tmp9711, i64 1
+  %tmp9713 = getelementptr inbounds float* %tmp9712, i64 1
+  %tmp9714 = getelementptr inbounds float* %tmp9713, i64 1
+  %tmp9715 = getelementptr inbounds float* %tmp9714, i64 1
+  %tmp9716 = getelementptr inbounds float* %tmp9715, i64 1
+  %tmp9717 = getelementptr inbounds float* %tmp9716, i64 1
+  %tmp9718 = getelementptr inbounds float* %tmp9717, i64 1
+  %tmp9719 = getelementptr inbounds float* %tmp9718, i64 1
+  %tmp9720 = getelementptr inbounds float* %tmp9719, i64 1
+  %tmp9721 = getelementptr inbounds float* %tmp9720, i64 1
+  %tmp9722 = getelementptr inbounds float* %tmp9721, i64 1
+  %tmp9723 = getelementptr inbounds float* %tmp9722, i64 1
+  %tmp9724 = getelementptr inbounds float* %tmp9723, i64 1
+  %tmp9725 = getelementptr inbounds float* %tmp9724, i64 1
+  %tmp9726 = getelementptr inbounds float* %tmp9725, i64 1
+  %tmp9727 = getelementptr inbounds float* %tmp9726, i64 1
+  %tmp9728 = getelementptr inbounds float* %tmp9727, i64 1
+  %tmp9729 = getelementptr inbounds float* %tmp9728, i64 1
+  %tmp9730 = getelementptr inbounds float* %tmp9729, i64 1
+  %tmp9731 = getelementptr inbounds float* %tmp9730, i64 1
+  %tmp9732 = getelementptr inbounds float* %tmp9731, i64 1
+  %tmp9733 = getelementptr inbounds float* %tmp9732, i64 1
+  %tmp9734 = getelementptr inbounds float* %tmp9733, i64 1
+  %tmp9735 = getelementptr inbounds float* %tmp9734, i64 1
+  %tmp9736 = getelementptr inbounds float* %tmp9735, i64 1
+  %tmp9737 = getelementptr inbounds float* %tmp9736, i64 1
+  %tmp9738 = getelementptr inbounds float* %tmp9737, i64 1
+  %tmp9739 = getelementptr inbounds float* %tmp9738, i64 1
+  %tmp9740 = getelementptr inbounds float* %tmp9739, i64 1
+  %tmp9741 = getelementptr inbounds float* %tmp9740, i64 1
+  %tmp9742 = getelementptr inbounds float* %tmp9741, i64 1
+  %tmp9743 = getelementptr inbounds float* %tmp9742, i64 1
+  %tmp9744 = getelementptr inbounds float* %tmp9743, i64 1
+  %tmp9745 = getelementptr inbounds float* %tmp9744, i64 1
+  %tmp9746 = getelementptr inbounds float* %tmp9745, i64 1
+  %tmp9747 = getelementptr inbounds float* %tmp9746, i64 1
+  %tmp9748 = getelementptr inbounds float* %tmp9747, i64 1
+  %tmp9749 = getelementptr inbounds float* %tmp9748, i64 1
+  %tmp9750 = getelementptr inbounds float* %tmp9749, i64 1
+  %tmp9751 = getelementptr inbounds float* %tmp9750, i64 1
+  %tmp9752 = getelementptr inbounds float* %tmp9751, i64 1
+  %tmp9753 = getelementptr inbounds float* %tmp9752, i64 1
+  %tmp9754 = getelementptr inbounds float* %tmp9753, i64 1
+  %tmp9755 = getelementptr inbounds float* %tmp9754, i64 1
+  %tmp9756 = getelementptr inbounds float* %tmp9755, i64 1
+  %tmp9757 = getelementptr inbounds float* %tmp9756, i64 1
+  %tmp9758 = getelementptr inbounds float* %tmp9757, i64 1
+  %tmp9759 = getelementptr inbounds float* %tmp9758, i64 1
+  %tmp9760 = getelementptr inbounds float* %tmp9759, i64 1
+  %tmp9761 = getelementptr inbounds float* %tmp9760, i64 1
+  %tmp9762 = getelementptr inbounds float* %tmp9761, i64 1
+  %tmp9763 = getelementptr inbounds float* %tmp9762, i64 1
+  %tmp9764 = getelementptr inbounds float* %tmp9763, i64 1
+  %tmp9765 = getelementptr inbounds float* %tmp9764, i64 1
+  %tmp9766 = getelementptr inbounds float* %tmp9765, i64 1
+  %tmp9767 = getelementptr inbounds float* %tmp9766, i64 1
+  %tmp9768 = getelementptr inbounds float* %tmp9767, i64 1
+  %tmp9769 = getelementptr inbounds float* %tmp9768, i64 1
+  %tmp9770 = getelementptr inbounds float* %tmp9769, i64 1
+  %tmp9771 = getelementptr inbounds float* %tmp9770, i64 1
+  %tmp9772 = getelementptr inbounds float* %tmp9771, i64 1
+  %tmp9773 = getelementptr inbounds float* %tmp9772, i64 1
+  %tmp9774 = getelementptr inbounds float* %tmp9773, i64 1
+  %tmp9775 = getelementptr inbounds float* %tmp9774, i64 1
+  %tmp9776 = getelementptr inbounds float* %tmp9775, i64 1
+  %tmp9777 = getelementptr inbounds float* %tmp9776, i64 1
+  %tmp9778 = getelementptr inbounds float* %tmp9777, i64 1
+  %tmp9779 = getelementptr inbounds float* %tmp9778, i64 1
+  %tmp9780 = getelementptr inbounds float* %tmp9779, i64 1
+  %tmp9781 = getelementptr inbounds float* %tmp9780, i64 1
+  %tmp9782 = getelementptr inbounds float* %tmp9781, i64 1
+  %tmp9783 = getelementptr inbounds float* %tmp9782, i64 1
+  %tmp9784 = getelementptr inbounds float* %tmp9783, i64 1
+  %tmp9785 = getelementptr inbounds float* %tmp9784, i64 1
+  %tmp9786 = getelementptr inbounds float* %tmp9785, i64 1
+  %tmp9787 = getelementptr inbounds float* %tmp9786, i64 1
+  %tmp9788 = getelementptr inbounds float* %tmp9787, i64 1
+  %tmp9789 = getelementptr inbounds float* %tmp9788, i64 1
+  %tmp9790 = getelementptr inbounds float* %tmp9789, i64 1
+  %tmp9791 = getelementptr inbounds float* %tmp9790, i64 1
+  %tmp9792 = getelementptr inbounds float* %tmp9791, i64 1
+  %tmp9793 = getelementptr inbounds float* %tmp9792, i64 1
+  %tmp9794 = getelementptr inbounds float* %tmp9793, i64 1
+  %tmp9795 = getelementptr inbounds float* %tmp9794, i64 1
+  %tmp9796 = getelementptr inbounds float* %tmp9795, i64 1
+  %tmp9797 = getelementptr inbounds float* %tmp9796, i64 1
+  %tmp9798 = getelementptr inbounds float* %tmp9797, i64 1
+  %tmp9799 = getelementptr inbounds float* %tmp9798, i64 1
+  %tmp9800 = getelementptr inbounds float* %tmp9799, i64 1
+  %tmp9801 = getelementptr inbounds float* %tmp9800, i64 1
+  %tmp9802 = getelementptr inbounds float* %tmp9801, i64 1
+  %tmp9803 = getelementptr inbounds float* %tmp9802, i64 1
+  %tmp9804 = getelementptr inbounds float* %tmp9803, i64 1
+  %tmp9805 = getelementptr inbounds float* %tmp9804, i64 1
+  %tmp9806 = getelementptr inbounds float* %tmp9805, i64 1
+  %tmp9807 = getelementptr inbounds float* %tmp9806, i64 1
+  %tmp9808 = getelementptr inbounds float* %tmp9807, i64 1
+  %tmp9809 = getelementptr inbounds float* %tmp9808, i64 1
+  %tmp9810 = getelementptr inbounds float* %tmp9809, i64 1
+  %tmp9811 = getelementptr inbounds float* %tmp9810, i64 1
+  %tmp9812 = getelementptr inbounds float* %tmp9811, i64 1
+  %tmp9813 = getelementptr inbounds float* %tmp9812, i64 1
+  %tmp9814 = getelementptr inbounds float* %tmp9813, i64 1
+  %tmp9815 = getelementptr inbounds float* %tmp9814, i64 1
+  %tmp9816 = getelementptr inbounds float* %tmp9815, i64 1
+  %tmp9817 = getelementptr inbounds float* %tmp9816, i64 1
+  %tmp9818 = getelementptr inbounds float* %tmp9817, i64 1
+  %tmp9819 = getelementptr inbounds float* %tmp9818, i64 1
+  %tmp9820 = getelementptr inbounds float* %tmp9819, i64 1
+  %tmp9821 = getelementptr inbounds float* %tmp9820, i64 1
+  %tmp9822 = getelementptr inbounds float* %tmp9821, i64 1
+  %tmp9823 = getelementptr inbounds float* %tmp9822, i64 1
+  %tmp9824 = getelementptr inbounds float* %tmp9823, i64 1
+  %tmp9825 = getelementptr inbounds float* %tmp9824, i64 1
+  %tmp9826 = getelementptr inbounds float* %tmp9825, i64 1
+  %tmp9827 = getelementptr inbounds float* %tmp9826, i64 1
+  %tmp9828 = getelementptr inbounds float* %tmp9827, i64 1
+  %tmp9829 = getelementptr inbounds float* %tmp9828, i64 1
+  %tmp9830 = getelementptr inbounds float* %tmp9829, i64 1
+  %tmp9831 = getelementptr inbounds float* %tmp9830, i64 1
+  %tmp9832 = getelementptr inbounds float* %tmp9831, i64 1
+  %tmp9833 = getelementptr inbounds float* %tmp9832, i64 1
+  %tmp9834 = getelementptr inbounds float* %tmp9833, i64 1
+  %tmp9835 = getelementptr inbounds float* %tmp9834, i64 1
+  %tmp9836 = getelementptr inbounds float* %tmp9835, i64 1
+  %tmp9837 = getelementptr inbounds float* %tmp9836, i64 1
+  %tmp9838 = getelementptr inbounds float* %tmp9837, i64 1
+  %tmp9839 = getelementptr inbounds float* %tmp9838, i64 1
+  %tmp9840 = getelementptr inbounds float* %tmp9839, i64 1
+  %tmp9841 = getelementptr inbounds float* %tmp9840, i64 1
+  %tmp9842 = getelementptr inbounds float* %tmp9841, i64 1
+  %tmp9843 = getelementptr inbounds float* %tmp9842, i64 1
+  %tmp9844 = getelementptr inbounds float* %tmp9843, i64 1
+  %tmp9845 = getelementptr inbounds float* %tmp9844, i64 1
+  %tmp9846 = getelementptr inbounds float* %tmp9845, i64 1
+  %tmp9847 = getelementptr inbounds float* %tmp9846, i64 1
+  %tmp9848 = getelementptr inbounds float* %tmp9847, i64 1
+  %tmp9849 = getelementptr inbounds float* %tmp9848, i64 1
+  %tmp9850 = getelementptr inbounds float* %tmp9849, i64 1
+  %tmp9851 = getelementptr inbounds float* %tmp9850, i64 1
+  %tmp9852 = getelementptr inbounds float* %tmp9851, i64 1
+  %tmp9853 = getelementptr inbounds float* %tmp9852, i64 1
+  %tmp9854 = getelementptr inbounds float* %tmp9853, i64 1
+  %tmp9855 = getelementptr inbounds float* %tmp9854, i64 1
+  %tmp9856 = getelementptr inbounds float* %tmp9855, i64 1
+  %tmp9857 = getelementptr inbounds float* %tmp9856, i64 1
+  %tmp9858 = getelementptr inbounds float* %tmp9857, i64 1
+  %tmp9859 = getelementptr inbounds float* %tmp9858, i64 1
+  %tmp9860 = getelementptr inbounds float* %tmp9859, i64 1
+  %tmp9861 = getelementptr inbounds float* %tmp9860, i64 1
+  %tmp9862 = getelementptr inbounds float* %tmp9861, i64 1
+  %tmp9863 = getelementptr inbounds float* %tmp9862, i64 1
+  %tmp9864 = getelementptr inbounds float* %tmp9863, i64 1
+  %tmp9865 = getelementptr inbounds float* %tmp9864, i64 1
+  %tmp9866 = getelementptr inbounds float* %tmp9865, i64 1
+  %tmp9867 = getelementptr inbounds float* %tmp9866, i64 1
+  %tmp9868 = getelementptr inbounds float* %tmp9867, i64 1
+  %tmp9869 = getelementptr inbounds float* %tmp9868, i64 1
+  %tmp9870 = getelementptr inbounds float* %tmp9869, i64 1
+  %tmp9871 = getelementptr inbounds float* %tmp9870, i64 1
+  %tmp9872 = getelementptr inbounds float* %tmp9871, i64 1
+  %tmp9873 = getelementptr inbounds float* %tmp9872, i64 1
+  %tmp9874 = getelementptr inbounds float* %tmp9873, i64 1
+  %tmp9875 = getelementptr inbounds float* %tmp9874, i64 1
+  %tmp9876 = getelementptr inbounds float* %tmp9875, i64 1
+  %tmp9877 = getelementptr inbounds float* %tmp9876, i64 1
+  %tmp9878 = getelementptr inbounds float* %tmp9877, i64 1
+  %tmp9879 = getelementptr inbounds float* %tmp9878, i64 1
+  %tmp9880 = getelementptr inbounds float* %tmp9879, i64 1
+  %tmp9881 = getelementptr inbounds float* %tmp9880, i64 1
+  %tmp9882 = getelementptr inbounds float* %tmp9881, i64 1
+  %tmp9883 = getelementptr inbounds float* %tmp9882, i64 1
+  %tmp9884 = getelementptr inbounds float* %tmp9883, i64 1
+  %tmp9885 = getelementptr inbounds float* %tmp9884, i64 1
+  %tmp9886 = getelementptr inbounds float* %tmp9885, i64 1
+  %tmp9887 = getelementptr inbounds float* %tmp9886, i64 1
+  %tmp9888 = getelementptr inbounds float* %tmp9887, i64 1
+  %tmp9889 = getelementptr inbounds float* %tmp9888, i64 1
+  %tmp9890 = getelementptr inbounds float* %tmp9889, i64 1
+  %tmp9891 = getelementptr inbounds float* %tmp9890, i64 1
+  %tmp9892 = getelementptr inbounds float* %tmp9891, i64 1
+  %tmp9893 = getelementptr inbounds float* %tmp9892, i64 1
+  %tmp9894 = getelementptr inbounds float* %tmp9893, i64 1
+  %tmp9895 = getelementptr inbounds float* %tmp9894, i64 1
+  %tmp9896 = getelementptr inbounds float* %tmp9895, i64 1
+  %tmp9897 = getelementptr inbounds float* %tmp9896, i64 1
+  %tmp9898 = getelementptr inbounds float* %tmp9897, i64 1
+  %tmp9899 = getelementptr inbounds float* %tmp9898, i64 1
+  %tmp9900 = getelementptr inbounds float* %tmp9899, i64 1
+  %tmp9901 = getelementptr inbounds float* %tmp9900, i64 1
+  %tmp9902 = getelementptr inbounds float* %tmp9901, i64 1
+  %tmp9903 = getelementptr inbounds float* %tmp9902, i64 1
+  %tmp9904 = getelementptr inbounds float* %tmp9903, i64 1
+  %tmp9905 = getelementptr inbounds float* %tmp9904, i64 1
+  %tmp9906 = getelementptr inbounds float* %tmp9905, i64 1
+  %tmp9907 = getelementptr inbounds float* %tmp9906, i64 1
+  %tmp9908 = getelementptr inbounds float* %tmp9907, i64 1
+  %tmp9909 = getelementptr inbounds float* %tmp9908, i64 1
+  %tmp9910 = getelementptr inbounds float* %tmp9909, i64 1
+  %tmp9911 = getelementptr inbounds float* %tmp9910, i64 1
+  %tmp9912 = getelementptr inbounds float* %tmp9911, i64 1
+  %tmp9913 = getelementptr inbounds float* %tmp9912, i64 1
+  %tmp9914 = getelementptr inbounds float* %tmp9913, i64 1
+  %tmp9915 = getelementptr inbounds float* %tmp9914, i64 1
+  %tmp9916 = getelementptr inbounds float* %tmp9915, i64 1
+  %tmp9917 = getelementptr inbounds float* %tmp9916, i64 1
+  %tmp9918 = getelementptr inbounds float* %tmp9917, i64 1
+  %tmp9919 = getelementptr inbounds float* %tmp9918, i64 1
+  %tmp9920 = getelementptr inbounds float* %tmp9919, i64 1
+  %tmp9921 = getelementptr inbounds float* %tmp9920, i64 1
+  %tmp9922 = getelementptr inbounds float* %tmp9921, i64 1
+  %tmp9923 = getelementptr inbounds float* %tmp9922, i64 1
+  %tmp9924 = getelementptr inbounds float* %tmp9923, i64 1
+  %tmp9925 = getelementptr inbounds float* %tmp9924, i64 1
+  %tmp9926 = getelementptr inbounds float* %tmp9925, i64 1
+  %tmp9927 = getelementptr inbounds float* %tmp9926, i64 1
+  %tmp9928 = getelementptr inbounds float* %tmp9927, i64 1
+  %tmp9929 = getelementptr inbounds float* %tmp9928, i64 1
+  %tmp9930 = getelementptr inbounds float* %tmp9929, i64 1
+  %tmp9931 = getelementptr inbounds float* %tmp9930, i64 1
+  %tmp9932 = getelementptr inbounds float* %tmp9931, i64 1
+  %tmp9933 = getelementptr inbounds float* %tmp9932, i64 1
+  %tmp9934 = getelementptr inbounds float* %tmp9933, i64 1
+  %tmp9935 = getelementptr inbounds float* %tmp9934, i64 1
+  %tmp9936 = getelementptr inbounds float* %tmp9935, i64 1
+  %tmp9937 = getelementptr inbounds float* %tmp9936, i64 1
+  %tmp9938 = getelementptr inbounds float* %tmp9937, i64 1
+  %tmp9939 = getelementptr inbounds float* %tmp9938, i64 1
+  %tmp9940 = getelementptr inbounds float* %tmp9939, i64 1
+  %tmp9941 = getelementptr inbounds float* %tmp9940, i64 1
+  %tmp9942 = getelementptr inbounds float* %tmp9941, i64 1
+  %tmp9943 = getelementptr inbounds float* %tmp9942, i64 1
+  %tmp9944 = getelementptr inbounds float* %tmp9943, i64 1
+  %tmp9945 = getelementptr inbounds float* %tmp9944, i64 1
+  %tmp9946 = getelementptr inbounds float* %tmp9945, i64 1
+  %tmp9947 = getelementptr inbounds float* %tmp9946, i64 1
+  %tmp9948 = getelementptr inbounds float* %tmp9947, i64 1
+  %tmp9949 = getelementptr inbounds float* %tmp9948, i64 1
+  %tmp9950 = getelementptr inbounds float* %tmp9949, i64 1
+  %tmp9951 = getelementptr inbounds float* %tmp9950, i64 1
+  %tmp9952 = getelementptr inbounds float* %tmp9951, i64 1
+  %tmp9953 = getelementptr inbounds float* %tmp9952, i64 1
+  %tmp9954 = getelementptr inbounds float* %tmp9953, i64 1
+  %tmp9955 = getelementptr inbounds float* %tmp9954, i64 1
+  %tmp9956 = getelementptr inbounds float* %tmp9955, i64 1
+  %tmp9957 = getelementptr inbounds float* %tmp9956, i64 1
+  %tmp9958 = getelementptr inbounds float* %tmp9957, i64 1
+  %tmp9959 = getelementptr inbounds float* %tmp9958, i64 1
+  %tmp9960 = getelementptr inbounds float* %tmp9959, i64 1
+  %tmp9961 = getelementptr inbounds float* %tmp9960, i64 1
+  %tmp9962 = getelementptr inbounds float* %tmp9961, i64 1
+  %tmp9963 = getelementptr inbounds float* %tmp9962, i64 1
+  %tmp9964 = getelementptr inbounds float* %tmp9963, i64 1
+  %tmp9965 = getelementptr inbounds float* %tmp9964, i64 1
+  %tmp9966 = getelementptr inbounds float* %tmp9965, i64 1
+  %tmp9967 = getelementptr inbounds float* %tmp9966, i64 1
+  %tmp9968 = getelementptr inbounds float* %tmp9967, i64 1
+  %tmp9969 = getelementptr inbounds float* %tmp9968, i64 1
+  %tmp9970 = getelementptr inbounds float* %tmp9969, i64 1
+  %tmp9971 = getelementptr inbounds float* %tmp9970, i64 1
+  %tmp9972 = getelementptr inbounds float* %tmp9971, i64 1
+  %tmp9973 = getelementptr inbounds float* %tmp9972, i64 1
+  %tmp9974 = getelementptr inbounds float* %tmp9973, i64 1
+  %tmp9975 = getelementptr inbounds float* %tmp9974, i64 1
+  %tmp9976 = getelementptr inbounds float* %tmp9975, i64 1
+  %tmp9977 = getelementptr inbounds float* %tmp9976, i64 1
+  %tmp9978 = getelementptr inbounds float* %tmp9977, i64 1
+  %tmp9979 = getelementptr inbounds float* %tmp9978, i64 1
+  %tmp9980 = getelementptr inbounds float* %tmp9979, i64 1
+  %tmp9981 = getelementptr inbounds float* %tmp9980, i64 1
+  %tmp9982 = getelementptr inbounds float* %tmp9981, i64 1
+  %tmp9983 = getelementptr inbounds float* %tmp9982, i64 1
+  %tmp9984 = getelementptr inbounds float* %tmp9983, i64 1
+  %tmp9985 = getelementptr inbounds float* %tmp9984, i64 1
+  %tmp9986 = getelementptr inbounds float* %tmp9985, i64 1
+  %tmp9987 = getelementptr inbounds float* %tmp9986, i64 1
+  %tmp9988 = getelementptr inbounds float* %tmp9987, i64 1
+  %tmp9989 = getelementptr inbounds float* %tmp9988, i64 1
+  %tmp9990 = getelementptr inbounds float* %tmp9989, i64 1
+  %tmp9991 = getelementptr inbounds float* %tmp9990, i64 1
+  %tmp9992 = getelementptr inbounds float* %tmp9991, i64 1
+  %tmp9993 = getelementptr inbounds float* %tmp9992, i64 1
+  %tmp9994 = getelementptr inbounds float* %tmp9993, i64 1
+  %tmp9995 = getelementptr inbounds float* %tmp9994, i64 1
+  %tmp9996 = getelementptr inbounds float* %tmp9995, i64 1
+  %tmp9997 = getelementptr inbounds float* %tmp9996, i64 1
+  %tmp9998 = getelementptr inbounds float* %tmp9997, i64 1
+  %tmp9999 = getelementptr inbounds float* %tmp9998, i64 1
+  %tmp10000 = getelementptr inbounds float* %tmp9999, i64 1
+  %tmp10001 = getelementptr inbounds float* %tmp10000, i64 1
+  %tmp10002 = getelementptr inbounds float* %tmp10001, i64 1
+  %tmp10003 = getelementptr inbounds float* %tmp10002, i64 1
+  %tmp10004 = getelementptr inbounds float* %tmp10003, i64 1
+  %tmp10005 = getelementptr inbounds float* %tmp10004, i64 1
+  %tmp10006 = getelementptr inbounds float* %tmp10005, i64 1
+  %tmp10007 = getelementptr inbounds float* %tmp10006, i64 1
+  %tmp10008 = getelementptr inbounds float* %tmp10007, i64 1
+  %tmp10009 = getelementptr inbounds float* %tmp10008, i64 1
+  %tmp10010 = getelementptr inbounds float* %tmp10009, i64 1
+  %tmp10011 = getelementptr inbounds float* %tmp10010, i64 1
+  %tmp10012 = getelementptr inbounds float* %tmp10011, i64 1
+  %tmp10013 = getelementptr inbounds float* %tmp10012, i64 1
+  %tmp10014 = getelementptr inbounds float* %tmp10013, i64 1
+  %tmp10015 = getelementptr inbounds float* %tmp10014, i64 1
+  %tmp10016 = getelementptr inbounds float* %tmp10015, i64 1
+  %tmp10017 = getelementptr inbounds float* %tmp10016, i64 1
+  %tmp10018 = getelementptr inbounds float* %tmp10017, i64 1
+  %tmp10019 = getelementptr inbounds float* %tmp10018, i64 1
+  %tmp10020 = getelementptr inbounds float* %tmp10019, i64 1
+  %tmp10021 = getelementptr inbounds float* %tmp10020, i64 1
+  %tmp10022 = getelementptr inbounds float* %tmp10021, i64 1
+  %tmp10023 = getelementptr inbounds float* %tmp10022, i64 1
+  %tmp10024 = getelementptr inbounds float* %tmp10023, i64 1
+  %tmp10025 = getelementptr inbounds float* %tmp10024, i64 1
+  %tmp10026 = getelementptr inbounds float* %tmp10025, i64 1
+  %tmp10027 = getelementptr inbounds float* %tmp10026, i64 1
+  %tmp10028 = getelementptr inbounds float* %tmp10027, i64 1
+  %tmp10029 = getelementptr inbounds float* %tmp10028, i64 1
+  %tmp10030 = getelementptr inbounds float* %tmp10029, i64 1
+  %tmp10031 = getelementptr inbounds float* %tmp10030, i64 1
+  %tmp10032 = getelementptr inbounds float* %tmp10031, i64 1
+  %tmp10033 = getelementptr inbounds float* %tmp10032, i64 1
+  %tmp10034 = getelementptr inbounds float* %tmp10033, i64 1
+  %tmp10035 = getelementptr inbounds float* %tmp10034, i64 1
+  %tmp10036 = getelementptr inbounds float* %tmp10035, i64 1
+  %tmp10037 = getelementptr inbounds float* %tmp10036, i64 1
+  %tmp10038 = getelementptr inbounds float* %tmp10037, i64 1
+  %tmp10039 = getelementptr inbounds float* %tmp10038, i64 1
+  %tmp10040 = getelementptr inbounds float* %tmp10039, i64 1
+  %tmp10041 = getelementptr inbounds float* %tmp10040, i64 1
+  %tmp10042 = getelementptr inbounds float* %tmp10041, i64 1
+  %tmp10043 = getelementptr inbounds float* %tmp10042, i64 1
+  %tmp10044 = getelementptr inbounds float* %tmp10043, i64 1
+  %tmp10045 = getelementptr inbounds float* %tmp10044, i64 1
+  %tmp10046 = getelementptr inbounds float* %tmp10045, i64 1
+  %tmp10047 = getelementptr inbounds float* %tmp10046, i64 1
+  %tmp10048 = getelementptr inbounds float* %tmp10047, i64 1
+  %tmp10049 = getelementptr inbounds float* %tmp10048, i64 1
+  %tmp10050 = getelementptr inbounds float* %tmp10049, i64 1
+  %tmp10051 = getelementptr inbounds float* %tmp10050, i64 1
+  %tmp10052 = getelementptr inbounds float* %tmp10051, i64 1
+  %tmp10053 = getelementptr inbounds float* %tmp10052, i64 1
+  %tmp10054 = getelementptr inbounds float* %tmp10053, i64 1
+  %tmp10055 = getelementptr inbounds float* %tmp10054, i64 1
+  %tmp10056 = getelementptr inbounds float* %tmp10055, i64 1
+  %tmp10057 = getelementptr inbounds float* %tmp10056, i64 1
+  %tmp10058 = getelementptr inbounds float* %tmp10057, i64 1
+  %tmp10059 = getelementptr inbounds float* %tmp10058, i64 1
+  %tmp10060 = getelementptr inbounds float* %tmp10059, i64 1
+  %tmp10061 = getelementptr inbounds float* %tmp10060, i64 1
+  %tmp10062 = getelementptr inbounds float* %tmp10061, i64 1
+  %tmp10063 = getelementptr inbounds float* %tmp10062, i64 1
+  %tmp10064 = getelementptr inbounds float* %tmp10063, i64 1
+  %tmp10065 = getelementptr inbounds float* %tmp10064, i64 1
+  %tmp10066 = getelementptr inbounds float* %tmp10065, i64 1
+  %tmp10067 = getelementptr inbounds float* %tmp10066, i64 1
+  %tmp10068 = getelementptr inbounds float* %tmp10067, i64 1
+  %tmp10069 = getelementptr inbounds float* %tmp10068, i64 1
+  %tmp10070 = getelementptr inbounds float* %tmp10069, i64 1
+  %tmp10071 = getelementptr inbounds float* %tmp10070, i64 1
+  %tmp10072 = getelementptr inbounds float* %tmp10071, i64 1
+  %tmp10073 = getelementptr inbounds float* %tmp10072, i64 1
+  %tmp10074 = getelementptr inbounds float* %tmp10073, i64 1
+  %tmp10075 = getelementptr inbounds float* %tmp10074, i64 1
+  %tmp10076 = getelementptr inbounds float* %tmp10075, i64 1
+  %tmp10077 = getelementptr inbounds float* %tmp10076, i64 1
+  %tmp10078 = getelementptr inbounds float* %tmp10077, i64 1
+  %tmp10079 = getelementptr inbounds float* %tmp10078, i64 1
+  %tmp10080 = getelementptr inbounds float* %tmp10079, i64 1
+  %tmp10081 = getelementptr inbounds float* %tmp10080, i64 1
+  %tmp10082 = getelementptr inbounds float* %tmp10081, i64 1
+  %tmp10083 = getelementptr inbounds float* %tmp10082, i64 1
+  %tmp10084 = getelementptr inbounds float* %tmp10083, i64 1
+  %tmp10085 = getelementptr inbounds float* %tmp10084, i64 1
+  %tmp10086 = getelementptr inbounds float* %tmp10085, i64 1
+  %tmp10087 = getelementptr inbounds float* %tmp10086, i64 1
+  %tmp10088 = getelementptr inbounds float* %tmp10087, i64 1
+  %tmp10089 = getelementptr inbounds float* %tmp10088, i64 1
+  %tmp10090 = getelementptr inbounds float* %tmp10089, i64 1
+  %tmp10091 = getelementptr inbounds float* %tmp10090, i64 1
+  %tmp10092 = getelementptr inbounds float* %tmp10091, i64 1
+  %tmp10093 = getelementptr inbounds float* %tmp10092, i64 1
+  %tmp10094 = getelementptr inbounds float* %tmp10093, i64 1
+  %tmp10095 = getelementptr inbounds float* %tmp10094, i64 1
+  %tmp10096 = getelementptr inbounds float* %tmp10095, i64 1
+  %tmp10097 = getelementptr inbounds float* %tmp10096, i64 1
+  %tmp10098 = getelementptr inbounds float* %tmp10097, i64 1
+  %tmp10099 = getelementptr inbounds float* %tmp10098, i64 1
+  %tmp10100 = getelementptr inbounds float* %tmp10099, i64 1
+  %tmp10101 = getelementptr inbounds float* %tmp10100, i64 1
+  %tmp10102 = getelementptr inbounds float* %tmp10101, i64 1
+  %tmp10103 = getelementptr inbounds float* %tmp10102, i64 1
+  %tmp10104 = getelementptr inbounds float* %tmp10103, i64 1
+  %tmp10105 = getelementptr inbounds float* %tmp10104, i64 1
+  %tmp10106 = getelementptr inbounds float* %tmp10105, i64 1
+  %tmp10107 = getelementptr inbounds float* %tmp10106, i64 1
+  %tmp10108 = getelementptr inbounds float* %tmp10107, i64 1
+  %tmp10109 = getelementptr inbounds float* %tmp10108, i64 1
+  %tmp10110 = getelementptr inbounds float* %tmp10109, i64 1
+  %tmp10111 = getelementptr inbounds float* %tmp10110, i64 1
+  %tmp10112 = getelementptr inbounds float* %tmp10111, i64 1
+  %tmp10113 = getelementptr inbounds float* %tmp10112, i64 1
+  %tmp10114 = getelementptr inbounds float* %tmp10113, i64 1
+  %tmp10115 = getelementptr inbounds float* %tmp10114, i64 1
+  %tmp10116 = getelementptr inbounds float* %tmp10115, i64 1
+  %tmp10117 = getelementptr inbounds float* %tmp10116, i64 1
+  %tmp10118 = getelementptr inbounds float* %tmp10117, i64 1
+  %tmp10119 = getelementptr inbounds float* %tmp10118, i64 1
+  %tmp10120 = getelementptr inbounds float* %tmp10119, i64 1
+  %tmp10121 = getelementptr inbounds float* %tmp10120, i64 1
+  %tmp10122 = getelementptr inbounds float* %tmp10121, i64 1
+  %tmp10123 = getelementptr inbounds float* %tmp10122, i64 1
+  %tmp10124 = getelementptr inbounds float* %tmp10123, i64 1
+  %tmp10125 = getelementptr inbounds float* %tmp10124, i64 1
+  %tmp10126 = getelementptr inbounds float* %tmp10125, i64 1
+  %tmp10127 = getelementptr inbounds float* %tmp10126, i64 1
+  %tmp10128 = getelementptr inbounds float* %tmp10127, i64 1
+  %tmp10129 = getelementptr inbounds float* %tmp10128, i64 1
+  %tmp10130 = getelementptr inbounds float* %tmp10129, i64 1
+  %tmp10131 = getelementptr inbounds float* %tmp10130, i64 1
+  %tmp10132 = getelementptr inbounds float* %tmp10131, i64 1
+  %tmp10133 = getelementptr inbounds float* %tmp10132, i64 1
+  %tmp10134 = getelementptr inbounds float* %tmp10133, i64 1
+  %tmp10135 = getelementptr inbounds float* %tmp10134, i64 1
+  %tmp10136 = getelementptr inbounds float* %tmp10135, i64 1
+  %tmp10137 = getelementptr inbounds float* %tmp10136, i64 1
+  %tmp10138 = getelementptr inbounds float* %tmp10137, i64 1
+  %tmp10139 = getelementptr inbounds float* %tmp10138, i64 1
+  %tmp10140 = getelementptr inbounds float* %tmp10139, i64 1
+  %tmp10141 = getelementptr inbounds float* %tmp10140, i64 1
+  %tmp10142 = getelementptr inbounds float* %tmp10141, i64 1
+  %tmp10143 = getelementptr inbounds float* %tmp10142, i64 1
+  %tmp10144 = getelementptr inbounds float* %tmp10143, i64 1
+  %tmp10145 = getelementptr inbounds float* %tmp10144, i64 1
+  %tmp10146 = getelementptr inbounds float* %tmp10145, i64 1
+  %tmp10147 = getelementptr inbounds float* %tmp10146, i64 1
+  %tmp10148 = getelementptr inbounds float* %tmp10147, i64 1
+  %tmp10149 = getelementptr inbounds float* %tmp10148, i64 1
+  %tmp10150 = getelementptr inbounds float* %tmp10149, i64 1
+  %tmp10151 = getelementptr inbounds float* %tmp10150, i64 1
+  %tmp10152 = getelementptr inbounds float* %tmp10151, i64 1
+  %tmp10153 = getelementptr inbounds float* %tmp10152, i64 1
+  %tmp10154 = getelementptr inbounds float* %tmp10153, i64 1
+  %tmp10155 = getelementptr inbounds float* %tmp10154, i64 1
+  %tmp10156 = getelementptr inbounds float* %tmp10155, i64 1
+  %tmp10157 = getelementptr inbounds float* %tmp10156, i64 1
+  %tmp10158 = getelementptr inbounds float* %tmp10157, i64 1
+  %tmp10159 = getelementptr inbounds float* %tmp10158, i64 1
+  %tmp10160 = getelementptr inbounds float* %tmp10159, i64 1
+  %tmp10161 = getelementptr inbounds float* %tmp10160, i64 1
+  %tmp10162 = getelementptr inbounds float* %tmp10161, i64 1
+  %tmp10163 = getelementptr inbounds float* %tmp10162, i64 1
+  %tmp10164 = getelementptr inbounds float* %tmp10163, i64 1
+  %tmp10165 = getelementptr inbounds float* %tmp10164, i64 1
+  %tmp10166 = getelementptr inbounds float* %tmp10165, i64 1
+  %tmp10167 = getelementptr inbounds float* %tmp10166, i64 1
+  %tmp10168 = getelementptr inbounds float* %tmp10167, i64 1
+  %tmp10169 = getelementptr inbounds float* %tmp10168, i64 1
+  %tmp10170 = getelementptr inbounds float* %tmp10169, i64 1
+  %tmp10171 = getelementptr inbounds float* %tmp10170, i64 1
+  %tmp10172 = getelementptr inbounds float* %tmp10171, i64 1
+  %tmp10173 = getelementptr inbounds float* %tmp10172, i64 1
+  %tmp10174 = getelementptr inbounds float* %tmp10173, i64 1
+  %tmp10175 = getelementptr inbounds float* %tmp10174, i64 1
+  %tmp10176 = getelementptr inbounds float* %tmp10175, i64 1
+  %tmp10177 = getelementptr inbounds float* %tmp10176, i64 1
+  %tmp10178 = getelementptr inbounds float* %tmp10177, i64 1
+  %tmp10179 = getelementptr inbounds float* %tmp10178, i64 1
+  %tmp10180 = getelementptr inbounds float* %tmp10179, i64 1
+  %tmp10181 = getelementptr inbounds float* %tmp10180, i64 1
+  %tmp10182 = getelementptr inbounds float* %tmp10181, i64 1
+  %tmp10183 = getelementptr inbounds float* %tmp10182, i64 1
+  %tmp10184 = getelementptr inbounds float* %tmp10183, i64 1
+  %tmp10185 = getelementptr inbounds float* %tmp10184, i64 1
+  %tmp10186 = getelementptr inbounds float* %tmp10185, i64 1
+  %tmp10187 = getelementptr inbounds float* %tmp10186, i64 1
+  %tmp10188 = getelementptr inbounds float* %tmp10187, i64 1
+  %tmp10189 = getelementptr inbounds float* %tmp10188, i64 1
+  %tmp10190 = getelementptr inbounds float* %tmp10189, i64 1
+  %tmp10191 = getelementptr inbounds float* %tmp10190, i64 1
+  %tmp10192 = getelementptr inbounds float* %tmp10191, i64 1
+  %tmp10193 = getelementptr inbounds float* %tmp10192, i64 1
+  %tmp10194 = getelementptr inbounds float* %tmp10193, i64 1
+  %tmp10195 = getelementptr inbounds float* %tmp10194, i64 1
+  %tmp10196 = getelementptr inbounds float* %tmp10195, i64 1
+  %tmp10197 = getelementptr inbounds float* %tmp10196, i64 1
+  %tmp10198 = getelementptr inbounds float* %tmp10197, i64 1
+  %tmp10199 = getelementptr inbounds float* %tmp10198, i64 1
+  %tmp10200 = getelementptr inbounds float* %tmp10199, i64 1
+  %tmp10201 = getelementptr inbounds float* %tmp10200, i64 1
+  %tmp10202 = getelementptr inbounds float* %tmp10201, i64 1
+  %tmp10203 = getelementptr inbounds float* %tmp10202, i64 1
+  %tmp10204 = getelementptr inbounds float* %tmp10203, i64 1
+  %tmp10205 = getelementptr inbounds float* %tmp10204, i64 1
+  %tmp10206 = getelementptr inbounds float* %tmp10205, i64 1
+  %tmp10207 = getelementptr inbounds float* %tmp10206, i64 1
+  %tmp10208 = getelementptr inbounds float* %tmp10207, i64 1
+  %tmp10209 = getelementptr inbounds float* %tmp10208, i64 1
+  %tmp10210 = getelementptr inbounds float* %tmp10209, i64 1
+  %tmp10211 = getelementptr inbounds float* %tmp10210, i64 1
+  %tmp10212 = getelementptr inbounds float* %tmp10211, i64 1
+  %tmp10213 = getelementptr inbounds float* %tmp10212, i64 1
+  %tmp10214 = getelementptr inbounds float* %tmp10213, i64 1
+  %tmp10215 = getelementptr inbounds float* %tmp10214, i64 1
+  %tmp10216 = getelementptr inbounds float* %tmp10215, i64 1
+  %tmp10217 = getelementptr inbounds float* %tmp10216, i64 1
+  %tmp10218 = getelementptr inbounds float* %tmp10217, i64 1
+  %tmp10219 = getelementptr inbounds float* %tmp10218, i64 1
+  %tmp10220 = getelementptr inbounds float* %tmp10219, i64 1
+  %tmp10221 = getelementptr inbounds float* %tmp10220, i64 1
+  %tmp10222 = getelementptr inbounds float* %tmp10221, i64 1
+  %tmp10223 = getelementptr inbounds float* %tmp10222, i64 1
+  %tmp10224 = getelementptr inbounds float* %tmp10223, i64 1
+  %tmp10225 = getelementptr inbounds float* %tmp10224, i64 1
+  %tmp10226 = getelementptr inbounds float* %tmp10225, i64 1
+  %tmp10227 = getelementptr inbounds float* %tmp10226, i64 1
+  %tmp10228 = getelementptr inbounds float* %tmp10227, i64 1
+  %tmp10229 = getelementptr inbounds float* %tmp10228, i64 1
+  %tmp10230 = getelementptr inbounds float* %tmp10229, i64 1
+  %tmp10231 = getelementptr inbounds float* %tmp10230, i64 1
+  %tmp10232 = getelementptr inbounds float* %tmp10231, i64 1
+  %tmp10233 = getelementptr inbounds float* %tmp10232, i64 1
+  %tmp10234 = getelementptr inbounds float* %tmp10233, i64 1
+  %tmp10235 = getelementptr inbounds float* %tmp10234, i64 1
+  %tmp10236 = getelementptr inbounds float* %tmp10235, i64 1
+  %tmp10237 = getelementptr inbounds float* %tmp10236, i64 1
+  %tmp10238 = getelementptr inbounds float* %tmp10237, i64 1
+  %tmp10239 = getelementptr inbounds float* %tmp10238, i64 1
+  %tmp10240 = getelementptr inbounds float* %tmp10239, i64 1
+  %tmp10241 = getelementptr inbounds float* %tmp10240, i64 1
+  %tmp10242 = getelementptr inbounds float* %tmp10241, i64 1
+  %tmp10243 = getelementptr inbounds float* %tmp10242, i64 1
+  %tmp10244 = getelementptr inbounds float* %tmp10243, i64 1
+  %tmp10245 = getelementptr inbounds float* %tmp10244, i64 1
+  %tmp10246 = getelementptr inbounds float* %tmp10245, i64 1
+  %tmp10247 = getelementptr inbounds float* %tmp10246, i64 1
+  %tmp10248 = getelementptr inbounds float* %tmp10247, i64 1
+  %tmp10249 = getelementptr inbounds float* %tmp10248, i64 1
+  %tmp10250 = getelementptr inbounds float* %tmp10249, i64 1
+  %tmp10251 = getelementptr inbounds float* %tmp10250, i64 1
+  %tmp10252 = getelementptr inbounds float* %tmp10251, i64 1
+  %tmp10253 = getelementptr inbounds float* %tmp10252, i64 1
+  %tmp10254 = getelementptr inbounds float* %tmp10253, i64 1
+  %tmp10255 = getelementptr inbounds float* %tmp10254, i64 1
+  %tmp10256 = getelementptr inbounds float* %tmp10255, i64 1
+  %tmp10257 = getelementptr inbounds float* %tmp10256, i64 1
+  %tmp10258 = getelementptr inbounds float* %tmp10257, i64 1
+  %tmp10259 = getelementptr inbounds float* %tmp10258, i64 1
+  %tmp10260 = getelementptr inbounds float* %tmp10259, i64 1
+  %tmp10261 = getelementptr inbounds float* %tmp10260, i64 1
+  %tmp10262 = getelementptr inbounds float* %tmp10261, i64 1
+  %tmp10263 = getelementptr inbounds float* %tmp10262, i64 1
+  %tmp10264 = getelementptr inbounds float* %tmp10263, i64 1
+  %tmp10265 = getelementptr inbounds float* %tmp10264, i64 1
+  %tmp10266 = getelementptr inbounds float* %tmp10265, i64 1
+  %tmp10267 = getelementptr inbounds float* %tmp10266, i64 1
+  %tmp10268 = getelementptr inbounds float* %tmp10267, i64 1
+  %tmp10269 = getelementptr inbounds float* %tmp10268, i64 1
+  %tmp10270 = getelementptr inbounds float* %tmp10269, i64 1
+  %tmp10271 = getelementptr inbounds float* %tmp10270, i64 1
+  %tmp10272 = getelementptr inbounds float* %tmp10271, i64 1
+  %tmp10273 = getelementptr inbounds float* %tmp10272, i64 1
+  %tmp10274 = getelementptr inbounds float* %tmp10273, i64 1
+  %tmp10275 = getelementptr inbounds float* %tmp10274, i64 1
+  %tmp10276 = getelementptr inbounds float* %tmp10275, i64 1
+  %tmp10277 = getelementptr inbounds float* %tmp10276, i64 1
+  %tmp10278 = getelementptr inbounds float* %tmp10277, i64 1
+  %tmp10279 = getelementptr inbounds float* %tmp10278, i64 1
+  %tmp10280 = getelementptr inbounds float* %tmp10279, i64 1
+  %tmp10281 = getelementptr inbounds float* %tmp10280, i64 1
+  %tmp10282 = getelementptr inbounds float* %tmp10281, i64 1
+  %tmp10283 = getelementptr inbounds float* %tmp10282, i64 1
+  %tmp10284 = getelementptr inbounds float* %tmp10283, i64 1
+  %tmp10285 = getelementptr inbounds float* %tmp10284, i64 1
+  %tmp10286 = getelementptr inbounds float* %tmp10285, i64 1
+  %tmp10287 = getelementptr inbounds float* %tmp10286, i64 1
+  %tmp10288 = getelementptr inbounds float* %tmp10287, i64 1
+  %tmp10289 = getelementptr inbounds float* %tmp10288, i64 1
+  %tmp10290 = getelementptr inbounds float* %tmp10289, i64 1
+  %tmp10291 = getelementptr inbounds float* %tmp10290, i64 1
+  %tmp10292 = getelementptr inbounds float* %tmp10291, i64 1
+  %tmp10293 = getelementptr inbounds float* %tmp10292, i64 1
+  %tmp10294 = getelementptr inbounds float* %tmp10293, i64 1
+  %tmp10295 = getelementptr inbounds float* %tmp10294, i64 1
+  %tmp10296 = getelementptr inbounds float* %tmp10295, i64 1
+  %tmp10297 = getelementptr inbounds float* %tmp10296, i64 1
+  %tmp10298 = getelementptr inbounds float* %tmp10297, i64 1
+  %tmp10299 = getelementptr inbounds float* %tmp10298, i64 1
+  %tmp10300 = getelementptr inbounds float* %tmp10299, i64 1
+  %tmp10301 = getelementptr inbounds float* %tmp10300, i64 1
+  %tmp10302 = getelementptr inbounds float* %tmp10301, i64 1
+  %tmp10303 = getelementptr inbounds float* %tmp10302, i64 1
+  %tmp10304 = getelementptr inbounds float* %tmp10303, i64 1
+  %tmp10305 = getelementptr inbounds float* %tmp10304, i64 1
+  %tmp10306 = getelementptr inbounds float* %tmp10305, i64 1
+  %tmp10307 = getelementptr inbounds float* %tmp10306, i64 1
+  %tmp10308 = getelementptr inbounds float* %tmp10307, i64 1
+  %tmp10309 = getelementptr inbounds float* %tmp10308, i64 1
+  %tmp10310 = getelementptr inbounds float* %tmp10309, i64 1
+  %tmp10311 = getelementptr inbounds float* %tmp10310, i64 1
+  %tmp10312 = getelementptr inbounds float* %tmp10311, i64 1
+  %tmp10313 = getelementptr inbounds float* %tmp10312, i64 1
+  %tmp10314 = getelementptr inbounds float* %tmp10313, i64 1
+  %tmp10315 = getelementptr inbounds float* %tmp10314, i64 1
+  %tmp10316 = getelementptr inbounds float* %tmp10315, i64 1
+  %tmp10317 = getelementptr inbounds float* %tmp10316, i64 1
+  %tmp10318 = getelementptr inbounds float* %tmp10317, i64 1
+  %tmp10319 = getelementptr inbounds float* %tmp10318, i64 1
+  %tmp10320 = getelementptr inbounds float* %tmp10319, i64 1
+  %tmp10321 = getelementptr inbounds float* %tmp10320, i64 1
+  %tmp10322 = getelementptr inbounds float* %tmp10321, i64 1
+  %tmp10323 = getelementptr inbounds float* %tmp10322, i64 1
+  %tmp10324 = getelementptr inbounds float* %tmp10323, i64 1
+  %tmp10325 = getelementptr inbounds float* %tmp10324, i64 1
+  %tmp10326 = getelementptr inbounds float* %tmp10325, i64 1
+  %tmp10327 = getelementptr inbounds float* %tmp10326, i64 1
+  %tmp10328 = getelementptr inbounds float* %tmp10327, i64 1
+  %tmp10329 = getelementptr inbounds float* %tmp10328, i64 1
+  %tmp10330 = getelementptr inbounds float* %tmp10329, i64 1
+  %tmp10331 = getelementptr inbounds float* %tmp10330, i64 1
+  %tmp10332 = getelementptr inbounds float* %tmp10331, i64 1
+  %tmp10333 = getelementptr inbounds float* %tmp10332, i64 1
+  %tmp10334 = getelementptr inbounds float* %tmp10333, i64 1
+  %tmp10335 = getelementptr inbounds float* %tmp10334, i64 1
+  %tmp10336 = getelementptr inbounds float* %tmp10335, i64 1
+  %tmp10337 = getelementptr inbounds float* %tmp10336, i64 1
+  %tmp10338 = getelementptr inbounds float* %tmp10337, i64 1
+  %tmp10339 = getelementptr inbounds float* %tmp10338, i64 1
+  %tmp10340 = getelementptr inbounds float* %tmp10339, i64 1
+  %tmp10341 = getelementptr inbounds float* %tmp10340, i64 1
+  %tmp10342 = getelementptr inbounds float* %tmp10341, i64 1
+  %tmp10343 = getelementptr inbounds float* %tmp10342, i64 1
+  %tmp10344 = getelementptr inbounds float* %tmp10343, i64 1
+  %tmp10345 = getelementptr inbounds float* %tmp10344, i64 1
+  %tmp10346 = getelementptr inbounds float* %tmp10345, i64 1
+  %tmp10347 = getelementptr inbounds float* %tmp10346, i64 1
+  %tmp10348 = getelementptr inbounds float* %tmp10347, i64 1
+  %tmp10349 = getelementptr inbounds float* %tmp10348, i64 1
+  %tmp10350 = getelementptr inbounds float* %tmp10349, i64 1
+  %tmp10351 = getelementptr inbounds float* %tmp10350, i64 1
+  %tmp10352 = getelementptr inbounds float* %tmp10351, i64 1
+  %tmp10353 = getelementptr inbounds float* %tmp10352, i64 1
+  %tmp10354 = getelementptr inbounds float* %tmp10353, i64 1
+  %tmp10355 = getelementptr inbounds float* %tmp10354, i64 1
+  %tmp10356 = getelementptr inbounds float* %tmp10355, i64 1
+  %tmp10357 = getelementptr inbounds float* %tmp10356, i64 1
+  %tmp10358 = getelementptr inbounds float* %tmp10357, i64 1
+  %tmp10359 = getelementptr inbounds float* %tmp10358, i64 1
+  %tmp10360 = getelementptr inbounds float* %tmp10359, i64 1
+  %tmp10361 = getelementptr inbounds float* %tmp10360, i64 1
+  %tmp10362 = getelementptr inbounds float* %tmp10361, i64 1
+  %tmp10363 = getelementptr inbounds float* %tmp10362, i64 1
+  %tmp10364 = getelementptr inbounds float* %tmp10363, i64 1
+  %tmp10365 = getelementptr inbounds float* %tmp10364, i64 1
+  %tmp10366 = getelementptr inbounds float* %tmp10365, i64 1
+  %tmp10367 = getelementptr inbounds float* %tmp10366, i64 1
+  %tmp10368 = getelementptr inbounds float* %tmp10367, i64 1
+  %tmp10369 = getelementptr inbounds float* %tmp10368, i64 1
+  %tmp10370 = getelementptr inbounds float* %tmp10369, i64 1
+  %tmp10371 = getelementptr inbounds float* %tmp10370, i64 1
+  %tmp10372 = getelementptr inbounds float* %tmp10371, i64 1
+  %tmp10373 = getelementptr inbounds float* %tmp10372, i64 1
+  %tmp10374 = getelementptr inbounds float* %tmp10373, i64 1
+  %tmp10375 = getelementptr inbounds float* %tmp10374, i64 1
+  %tmp10376 = getelementptr inbounds float* %tmp10375, i64 1
+  %tmp10377 = getelementptr inbounds float* %tmp10376, i64 1
+  %tmp10378 = getelementptr inbounds float* %tmp10377, i64 1
+  %tmp10379 = getelementptr inbounds float* %tmp10378, i64 1
+  %tmp10380 = getelementptr inbounds float* %tmp10379, i64 1
+  %tmp10381 = getelementptr inbounds float* %tmp10380, i64 1
+  %tmp10382 = getelementptr inbounds float* %tmp10381, i64 1
+  %tmp10383 = getelementptr inbounds float* %tmp10382, i64 1
+  %tmp10384 = getelementptr inbounds float* %tmp10383, i64 1
+  %tmp10385 = getelementptr inbounds float* %tmp10384, i64 1
+  %tmp10386 = getelementptr inbounds float* %tmp10385, i64 1
+  %tmp10387 = getelementptr inbounds float* %tmp10386, i64 1
+  %tmp10388 = getelementptr inbounds float* %tmp10387, i64 1
+  %tmp10389 = getelementptr inbounds float* %tmp10388, i64 1
+  %tmp10390 = getelementptr inbounds float* %tmp10389, i64 1
+  %tmp10391 = getelementptr inbounds float* %tmp10390, i64 1
+  %tmp10392 = getelementptr inbounds float* %tmp10391, i64 1
+  %tmp10393 = getelementptr inbounds float* %tmp10392, i64 1
+  %tmp10394 = getelementptr inbounds float* %tmp10393, i64 1
+  %tmp10395 = getelementptr inbounds float* %tmp10394, i64 1
+  %tmp10396 = getelementptr inbounds float* %tmp10395, i64 1
+  %tmp10397 = getelementptr inbounds float* %tmp10396, i64 1
+  %tmp10398 = getelementptr inbounds float* %tmp10397, i64 1
+  %tmp10399 = getelementptr inbounds float* %tmp10398, i64 1
+  %tmp10400 = getelementptr inbounds float* %tmp10399, i64 1
+  %tmp10401 = getelementptr inbounds float* %tmp10400, i64 1
+  %tmp10402 = getelementptr inbounds float* %tmp10401, i64 1
+  %tmp10403 = getelementptr inbounds float* %tmp10402, i64 1
+  %tmp10404 = getelementptr inbounds float* %tmp10403, i64 1
+  %tmp10405 = getelementptr inbounds float* %tmp10404, i64 1
+  %tmp10406 = getelementptr inbounds float* %tmp10405, i64 1
+  %tmp10407 = getelementptr inbounds float* %tmp10406, i64 1
+  %tmp10408 = getelementptr inbounds float* %tmp10407, i64 1
+  %tmp10409 = getelementptr inbounds float* %tmp10408, i64 1
+  %tmp10410 = getelementptr inbounds float* %tmp10409, i64 1
+  %tmp10411 = getelementptr inbounds float* %tmp10410, i64 1
+  %tmp10412 = getelementptr inbounds float* %tmp10411, i64 1
+  %tmp10413 = getelementptr inbounds float* %tmp10412, i64 1
+  %tmp10414 = getelementptr inbounds float* %tmp10413, i64 1
+  %tmp10415 = getelementptr inbounds float* %tmp10414, i64 1
+  %tmp10416 = getelementptr inbounds float* %tmp10415, i64 1
+  %tmp10417 = getelementptr inbounds float* %tmp10416, i64 1
+  %tmp10418 = getelementptr inbounds float* %tmp10417, i64 1
+  %tmp10419 = getelementptr inbounds float* %tmp10418, i64 1
+  %tmp10420 = getelementptr inbounds float* %tmp10419, i64 1
+  %tmp10421 = getelementptr inbounds float* %tmp10420, i64 1
+  %tmp10422 = getelementptr inbounds float* %tmp10421, i64 1
+  %tmp10423 = getelementptr inbounds float* %tmp10422, i64 1
+  %tmp10424 = getelementptr inbounds float* %tmp10423, i64 1
+  %tmp10425 = getelementptr inbounds float* %tmp10424, i64 1
+  %tmp10426 = getelementptr inbounds float* %tmp10425, i64 1
+  %tmp10427 = getelementptr inbounds float* %tmp10426, i64 1
+  %tmp10428 = getelementptr inbounds float* %tmp10427, i64 1
+  %tmp10429 = getelementptr inbounds float* %tmp10428, i64 1
+  %tmp10430 = getelementptr inbounds float* %tmp10429, i64 1
+  %tmp10431 = getelementptr inbounds float* %tmp10430, i64 1
+  %tmp10432 = getelementptr inbounds float* %tmp10431, i64 1
+  %tmp10433 = getelementptr inbounds float* %tmp10432, i64 1
+  %tmp10434 = getelementptr inbounds float* %tmp10433, i64 1
+  %tmp10435 = getelementptr inbounds float* %tmp10434, i64 1
+  %tmp10436 = getelementptr inbounds float* %tmp10435, i64 1
+  %tmp10437 = getelementptr inbounds float* %tmp10436, i64 1
+  %tmp10438 = getelementptr inbounds float* %tmp10437, i64 1
+  %tmp10439 = getelementptr inbounds float* %tmp10438, i64 1
+  %tmp10440 = getelementptr inbounds float* %tmp10439, i64 1
+  %tmp10441 = getelementptr inbounds float* %tmp10440, i64 1
+  %tmp10442 = getelementptr inbounds float* %tmp10441, i64 1
+  %tmp10443 = getelementptr inbounds float* %tmp10442, i64 1
+  %tmp10444 = getelementptr inbounds float* %tmp10443, i64 1
+  %tmp10445 = getelementptr inbounds float* %tmp10444, i64 1
+  %tmp10446 = getelementptr inbounds float* %tmp10445, i64 1
+  %tmp10447 = getelementptr inbounds float* %tmp10446, i64 1
+  %tmp10448 = getelementptr inbounds float* %tmp10447, i64 1
+  %tmp10449 = getelementptr inbounds float* %tmp10448, i64 1
+  %tmp10450 = getelementptr inbounds float* %tmp10449, i64 1
+  %tmp10451 = getelementptr inbounds float* %tmp10450, i64 1
+  %tmp10452 = getelementptr inbounds float* %tmp10451, i64 1
+  %tmp10453 = getelementptr inbounds float* %tmp10452, i64 1
+  %tmp10454 = getelementptr inbounds float* %tmp10453, i64 1
+  %tmp10455 = getelementptr inbounds float* %tmp10454, i64 1
+  %tmp10456 = getelementptr inbounds float* %tmp10455, i64 1
+  %tmp10457 = getelementptr inbounds float* %tmp10456, i64 1
+  %tmp10458 = getelementptr inbounds float* %tmp10457, i64 1
+  %tmp10459 = getelementptr inbounds float* %tmp10458, i64 1
+  %tmp10460 = getelementptr inbounds float* %tmp10459, i64 1
+  %tmp10461 = getelementptr inbounds float* %tmp10460, i64 1
+  %tmp10462 = getelementptr inbounds float* %tmp10461, i64 1
+  %tmp10463 = getelementptr inbounds float* %tmp10462, i64 1
+  %tmp10464 = getelementptr inbounds float* %tmp10463, i64 1
+  %tmp10465 = getelementptr inbounds float* %tmp10464, i64 1
+  %tmp10466 = getelementptr inbounds float* %tmp10465, i64 1
+  %tmp10467 = getelementptr inbounds float* %tmp10466, i64 1
+  %tmp10468 = getelementptr inbounds float* %tmp10467, i64 1
+  %tmp10469 = getelementptr inbounds float* %tmp10468, i64 1
+  %tmp10470 = getelementptr inbounds float* %tmp10469, i64 1
+  %tmp10471 = getelementptr inbounds float* %tmp10470, i64 1
+  %tmp10472 = getelementptr inbounds float* %tmp10471, i64 1
+  %tmp10473 = getelementptr inbounds float* %tmp10472, i64 1
+  %tmp10474 = getelementptr inbounds float* %tmp10473, i64 1
+  %tmp10475 = getelementptr inbounds float* %tmp10474, i64 1
+  %tmp10476 = getelementptr inbounds float* %tmp10475, i64 1
+  %tmp10477 = getelementptr inbounds float* %tmp10476, i64 1
+  %tmp10478 = getelementptr inbounds float* %tmp10477, i64 1
+  %tmp10479 = getelementptr inbounds float* %tmp10478, i64 1
+  %tmp10480 = getelementptr inbounds float* %tmp10479, i64 1
+  %tmp10481 = getelementptr inbounds float* %tmp10480, i64 1
+  %tmp10482 = getelementptr inbounds float* %tmp10481, i64 1
+  %tmp10483 = getelementptr inbounds float* %tmp10482, i64 1
+  %tmp10484 = getelementptr inbounds float* %tmp10483, i64 1
+  %tmp10485 = getelementptr inbounds float* %tmp10484, i64 1
+  %tmp10486 = getelementptr inbounds float* %tmp10485, i64 1
+  %tmp10487 = getelementptr inbounds float* %tmp10486, i64 1
+  %tmp10488 = getelementptr inbounds float* %tmp10487, i64 1
+  %tmp10489 = getelementptr inbounds float* %tmp10488, i64 1
+  %tmp10490 = getelementptr inbounds float* %tmp10489, i64 1
+  %tmp10491 = getelementptr inbounds float* %tmp10490, i64 1
+  %tmp10492 = getelementptr inbounds float* %tmp10491, i64 1
+  %tmp10493 = getelementptr inbounds float* %tmp10492, i64 1
+  %tmp10494 = getelementptr inbounds float* %tmp10493, i64 1
+  %tmp10495 = getelementptr inbounds float* %tmp10494, i64 1
+  %tmp10496 = getelementptr inbounds float* %tmp10495, i64 1
+  %tmp10497 = getelementptr inbounds float* %tmp10496, i64 1
+  %tmp10498 = getelementptr inbounds float* %tmp10497, i64 1
+  %tmp10499 = getelementptr inbounds float* %tmp10498, i64 1
+  %tmp10500 = getelementptr inbounds float* %tmp10499, i64 1
+  %tmp10501 = getelementptr inbounds float* %tmp10500, i64 1
+  %tmp10502 = getelementptr inbounds float* %tmp10501, i64 1
+  %tmp10503 = getelementptr inbounds float* %tmp10502, i64 1
+  %tmp10504 = getelementptr inbounds float* %tmp10503, i64 1
+  %tmp10505 = getelementptr inbounds float* %tmp10504, i64 1
+  %tmp10506 = getelementptr inbounds float* %tmp10505, i64 1
+  %tmp10507 = getelementptr inbounds float* %tmp10506, i64 1
+  %tmp10508 = getelementptr inbounds float* %tmp10507, i64 1
+  %tmp10509 = getelementptr inbounds float* %tmp10508, i64 1
+  %tmp10510 = getelementptr inbounds float* %tmp10509, i64 1
+  %tmp10511 = getelementptr inbounds float* %tmp10510, i64 1
+  %tmp10512 = getelementptr inbounds float* %tmp10511, i64 1
+  %tmp10513 = getelementptr inbounds float* %tmp10512, i64 1
+  %tmp10514 = getelementptr inbounds float* %tmp10513, i64 1
+  %tmp10515 = getelementptr inbounds float* %tmp10514, i64 1
+  %tmp10516 = getelementptr inbounds float* %tmp10515, i64 1
+  %tmp10517 = getelementptr inbounds float* %tmp10516, i64 1
+  %tmp10518 = getelementptr inbounds float* %tmp10517, i64 1
+  %tmp10519 = getelementptr inbounds float* %tmp10518, i64 1
+  %tmp10520 = getelementptr inbounds float* %tmp10519, i64 1
+  %tmp10521 = getelementptr inbounds float* %tmp10520, i64 1
+  %tmp10522 = getelementptr inbounds float* %tmp10521, i64 1
+  %tmp10523 = getelementptr inbounds float* %tmp10522, i64 1
+  %tmp10524 = getelementptr inbounds float* %tmp10523, i64 1
+  %tmp10525 = getelementptr inbounds float* %tmp10524, i64 1
+  %tmp10526 = getelementptr inbounds float* %tmp10525, i64 1
+  %tmp10527 = getelementptr inbounds float* %tmp10526, i64 1
+  %tmp10528 = getelementptr inbounds float* %tmp10527, i64 1
+  %tmp10529 = getelementptr inbounds float* %tmp10528, i64 1
+  %tmp10530 = getelementptr inbounds float* %tmp10529, i64 1
+  %tmp10531 = getelementptr inbounds float* %tmp10530, i64 1
+  %tmp10532 = getelementptr inbounds float* %tmp10531, i64 1
+  %tmp10533 = getelementptr inbounds float* %tmp10532, i64 1
+  %tmp10534 = getelementptr inbounds float* %tmp10533, i64 1
+  %tmp10535 = getelementptr inbounds float* %tmp10534, i64 1
+  %tmp10536 = getelementptr inbounds float* %tmp10535, i64 1
+  %tmp10537 = getelementptr inbounds float* %tmp10536, i64 1
+  %tmp10538 = getelementptr inbounds float* %tmp10537, i64 1
+  %tmp10539 = getelementptr inbounds float* %tmp10538, i64 1
+  %tmp10540 = getelementptr inbounds float* %tmp10539, i64 1
+  %tmp10541 = getelementptr inbounds float* %tmp10540, i64 1
+  %tmp10542 = getelementptr inbounds float* %tmp10541, i64 1
+  %tmp10543 = getelementptr inbounds float* %tmp10542, i64 1
+  %tmp10544 = getelementptr inbounds float* %tmp10543, i64 1
+  %tmp10545 = getelementptr inbounds float* %tmp10544, i64 1
+  %tmp10546 = getelementptr inbounds float* %tmp10545, i64 1
+  %tmp10547 = getelementptr inbounds float* %tmp10546, i64 1
+  %tmp10548 = getelementptr inbounds float* %tmp10547, i64 1
+  %tmp10549 = getelementptr inbounds float* %tmp10548, i64 1
+  %tmp10550 = getelementptr inbounds float* %tmp10549, i64 1
+  %tmp10551 = getelementptr inbounds float* %tmp10550, i64 1
+  %tmp10552 = getelementptr inbounds float* %tmp10551, i64 1
+  %tmp10553 = getelementptr inbounds float* %tmp10552, i64 1
+  %tmp10554 = getelementptr inbounds float* %tmp10553, i64 1
+  %tmp10555 = getelementptr inbounds float* %tmp10554, i64 1
+  %tmp10556 = getelementptr inbounds float* %tmp10555, i64 1
+  %tmp10557 = getelementptr inbounds float* %tmp10556, i64 1
+  %tmp10558 = getelementptr inbounds float* %tmp10557, i64 1
+  %tmp10559 = getelementptr inbounds float* %tmp10558, i64 1
+  %tmp10560 = getelementptr inbounds float* %tmp10559, i64 1
+  %tmp10561 = getelementptr inbounds float* %tmp10560, i64 1
+  %tmp10562 = getelementptr inbounds float* %tmp10561, i64 1
+  %tmp10563 = getelementptr inbounds float* %tmp10562, i64 1
+  %tmp10564 = getelementptr inbounds float* %tmp10563, i64 1
+  %tmp10565 = getelementptr inbounds float* %tmp10564, i64 1
+  %tmp10566 = getelementptr inbounds float* %tmp10565, i64 1
+  %tmp10567 = getelementptr inbounds float* %tmp10566, i64 1
+  %tmp10568 = getelementptr inbounds float* %tmp10567, i64 1
+  %tmp10569 = getelementptr inbounds float* %tmp10568, i64 1
+  %tmp10570 = getelementptr inbounds float* %tmp10569, i64 1
+  %tmp10571 = getelementptr inbounds float* %tmp10570, i64 1
+  %tmp10572 = getelementptr inbounds float* %tmp10571, i64 1
+  %tmp10573 = getelementptr inbounds float* %tmp10572, i64 1
+  %tmp10574 = getelementptr inbounds float* %tmp10573, i64 1
+  %tmp10575 = getelementptr inbounds float* %tmp10574, i64 1
+  %tmp10576 = getelementptr inbounds float* %tmp10575, i64 1
+  %tmp10577 = getelementptr inbounds float* %tmp10576, i64 1
+  %tmp10578 = getelementptr inbounds float* %tmp10577, i64 1
+  %tmp10579 = getelementptr inbounds float* %tmp10578, i64 1
+  %tmp10580 = getelementptr inbounds float* %tmp10579, i64 1
+  %tmp10581 = getelementptr inbounds float* %tmp10580, i64 1
+  %tmp10582 = getelementptr inbounds float* %tmp10581, i64 1
+  %tmp10583 = getelementptr inbounds float* %tmp10582, i64 1
+  %tmp10584 = getelementptr inbounds float* %tmp10583, i64 1
+  %tmp10585 = getelementptr inbounds float* %tmp10584, i64 1
+  %tmp10586 = getelementptr inbounds float* %tmp10585, i64 1
+  %tmp10587 = getelementptr inbounds float* %tmp10586, i64 1
+  %tmp10588 = getelementptr inbounds float* %tmp10587, i64 1
+  %tmp10589 = getelementptr inbounds float* %tmp10588, i64 1
+  %tmp10590 = getelementptr inbounds float* %tmp10589, i64 1
+  %tmp10591 = getelementptr inbounds float* %tmp10590, i64 1
+  %tmp10592 = getelementptr inbounds float* %tmp10591, i64 1
+  %tmp10593 = getelementptr inbounds float* %tmp10592, i64 1
+  %tmp10594 = getelementptr inbounds float* %tmp10593, i64 1
+  %tmp10595 = getelementptr inbounds float* %tmp10594, i64 1
+  %tmp10596 = getelementptr inbounds float* %tmp10595, i64 1
+  %tmp10597 = getelementptr inbounds float* %tmp10596, i64 1
+  %tmp10598 = getelementptr inbounds float* %tmp10597, i64 1
+  %tmp10599 = getelementptr inbounds float* %tmp10598, i64 1
+  %tmp10600 = getelementptr inbounds float* %tmp10599, i64 1
+  %tmp10601 = getelementptr inbounds float* %tmp10600, i64 1
+  %tmp10602 = getelementptr inbounds float* %tmp10601, i64 1
+  %tmp10603 = getelementptr inbounds float* %tmp10602, i64 1
+  %tmp10604 = getelementptr inbounds float* %tmp10603, i64 1
+  %tmp10605 = getelementptr inbounds float* %tmp10604, i64 1
+  %tmp10606 = getelementptr inbounds float* %tmp10605, i64 1
+  %tmp10607 = getelementptr inbounds float* %tmp10606, i64 1
+  %tmp10608 = getelementptr inbounds float* %tmp10607, i64 1
+  %tmp10609 = getelementptr inbounds float* %tmp10608, i64 1
+  %tmp10610 = getelementptr inbounds float* %tmp10609, i64 1
+  %tmp10611 = getelementptr inbounds float* %tmp10610, i64 1
+  %tmp10612 = getelementptr inbounds float* %tmp10611, i64 1
+  %tmp10613 = getelementptr inbounds float* %tmp10612, i64 1
+  %tmp10614 = getelementptr inbounds float* %tmp10613, i64 1
+  %tmp10615 = getelementptr inbounds float* %tmp10614, i64 1
+  %tmp10616 = getelementptr inbounds float* %tmp10615, i64 1
+  %tmp10617 = getelementptr inbounds float* %tmp10616, i64 1
+  %tmp10618 = getelementptr inbounds float* %tmp10617, i64 1
+  %tmp10619 = getelementptr inbounds float* %tmp10618, i64 1
+  %tmp10620 = getelementptr inbounds float* %tmp10619, i64 1
+  %tmp10621 = getelementptr inbounds float* %tmp10620, i64 1
+  %tmp10622 = getelementptr inbounds float* %tmp10621, i64 1
+  %tmp10623 = getelementptr inbounds float* %tmp10622, i64 1
+  %tmp10624 = getelementptr inbounds float* %tmp10623, i64 1
+  %tmp10625 = getelementptr inbounds float* %tmp10624, i64 1
+  %tmp10626 = getelementptr inbounds float* %tmp10625, i64 1
+  %tmp10627 = getelementptr inbounds float* %tmp10626, i64 1
+  %tmp10628 = getelementptr inbounds float* %tmp10627, i64 1
+  %tmp10629 = getelementptr inbounds float* %tmp10628, i64 1
+  %tmp10630 = getelementptr inbounds float* %tmp10629, i64 1
+  %tmp10631 = getelementptr inbounds float* %tmp10630, i64 1
+  %tmp10632 = getelementptr inbounds float* %tmp10631, i64 1
+  %tmp10633 = getelementptr inbounds float* %tmp10632, i64 1
+  %tmp10634 = getelementptr inbounds float* %tmp10633, i64 1
+  %tmp10635 = getelementptr inbounds float* %tmp10634, i64 1
+  %tmp10636 = getelementptr inbounds float* %tmp10635, i64 1
+  %tmp10637 = getelementptr inbounds float* %tmp10636, i64 1
+  %tmp10638 = getelementptr inbounds float* %tmp10637, i64 1
+  %tmp10639 = getelementptr inbounds float* %tmp10638, i64 1
+  %tmp10640 = getelementptr inbounds float* %tmp10639, i64 1
+  %tmp10641 = getelementptr inbounds float* %tmp10640, i64 1
+  %tmp10642 = getelementptr inbounds float* %tmp10641, i64 1
+  %tmp10643 = getelementptr inbounds float* %tmp10642, i64 1
+  %tmp10644 = getelementptr inbounds float* %tmp10643, i64 1
+  %tmp10645 = getelementptr inbounds float* %tmp10644, i64 1
+  %tmp10646 = getelementptr inbounds float* %tmp10645, i64 1
+  %tmp10647 = getelementptr inbounds float* %tmp10646, i64 1
+  %tmp10648 = getelementptr inbounds float* %tmp10647, i64 1
+  %tmp10649 = getelementptr inbounds float* %tmp10648, i64 1
+  %tmp10650 = getelementptr inbounds float* %tmp10649, i64 1
+  %tmp10651 = getelementptr inbounds float* %tmp10650, i64 1
+  %tmp10652 = getelementptr inbounds float* %tmp10651, i64 1
+  %tmp10653 = getelementptr inbounds float* %tmp10652, i64 1
+  %tmp10654 = getelementptr inbounds float* %tmp10653, i64 1
+  %tmp10655 = getelementptr inbounds float* %tmp10654, i64 1
+  %tmp10656 = getelementptr inbounds float* %tmp10655, i64 1
+  %tmp10657 = getelementptr inbounds float* %tmp10656, i64 1
+  %tmp10658 = getelementptr inbounds float* %tmp10657, i64 1
+  %tmp10659 = getelementptr inbounds float* %tmp10658, i64 1
+  %tmp10660 = getelementptr inbounds float* %tmp10659, i64 1
+  %tmp10661 = getelementptr inbounds float* %tmp10660, i64 1
+  %tmp10662 = getelementptr inbounds float* %tmp10661, i64 1
+  %tmp10663 = getelementptr inbounds float* %tmp10662, i64 1
+  %tmp10664 = getelementptr inbounds float* %tmp10663, i64 1
+  %tmp10665 = getelementptr inbounds float* %tmp10664, i64 1
+  %tmp10666 = getelementptr inbounds float* %tmp10665, i64 1
+  %tmp10667 = getelementptr inbounds float* %tmp10666, i64 1
+  %tmp10668 = getelementptr inbounds float* %tmp10667, i64 1
+  %tmp10669 = getelementptr inbounds float* %tmp10668, i64 1
+  %tmp10670 = getelementptr inbounds float* %tmp10669, i64 1
+  %tmp10671 = getelementptr inbounds float* %tmp10670, i64 1
+  %tmp10672 = getelementptr inbounds float* %tmp10671, i64 1
+  %tmp10673 = getelementptr inbounds float* %tmp10672, i64 1
+  %tmp10674 = getelementptr inbounds float* %tmp10673, i64 1
+  %tmp10675 = getelementptr inbounds float* %tmp10674, i64 1
+  %tmp10676 = getelementptr inbounds float* %tmp10675, i64 1
+  %tmp10677 = getelementptr inbounds float* %tmp10676, i64 1
+  %tmp10678 = getelementptr inbounds float* %tmp10677, i64 1
+  %tmp10679 = getelementptr inbounds float* %tmp10678, i64 1
+  %tmp10680 = getelementptr inbounds float* %tmp10679, i64 1
+  %tmp10681 = getelementptr inbounds float* %tmp10680, i64 1
+  %tmp10682 = getelementptr inbounds float* %tmp10681, i64 1
+  %tmp10683 = getelementptr inbounds float* %tmp10682, i64 1
+  %tmp10684 = getelementptr inbounds float* %tmp10683, i64 1
+  %tmp10685 = getelementptr inbounds float* %tmp10684, i64 1
+  %tmp10686 = getelementptr inbounds float* %tmp10685, i64 1
+  %tmp10687 = getelementptr inbounds float* %tmp10686, i64 1
+  %tmp10688 = getelementptr inbounds float* %tmp10687, i64 1
+  %tmp10689 = getelementptr inbounds float* %tmp10688, i64 1
+  %tmp10690 = getelementptr inbounds float* %tmp10689, i64 1
+  %tmp10691 = getelementptr inbounds float* %tmp10690, i64 1
+  %tmp10692 = getelementptr inbounds float* %tmp10691, i64 1
+  %tmp10693 = getelementptr inbounds float* %tmp10692, i64 1
+  %tmp10694 = getelementptr inbounds float* %tmp10693, i64 1
+  %tmp10695 = getelementptr inbounds float* %tmp10694, i64 1
+  %tmp10696 = getelementptr inbounds float* %tmp10695, i64 1
+  %tmp10697 = getelementptr inbounds float* %tmp10696, i64 1
+  %tmp10698 = getelementptr inbounds float* %tmp10697, i64 1
+  %tmp10699 = getelementptr inbounds float* %tmp10698, i64 1
+  %tmp10700 = getelementptr inbounds float* %tmp10699, i64 1
+  %tmp10701 = getelementptr inbounds float* %tmp10700, i64 1
+  %tmp10702 = getelementptr inbounds float* %tmp10701, i64 1
+  %tmp10703 = getelementptr inbounds float* %tmp10702, i64 1
+  %tmp10704 = getelementptr inbounds float* %tmp10703, i64 1
+  %tmp10705 = getelementptr inbounds float* %tmp10704, i64 1
+  %tmp10706 = getelementptr inbounds float* %tmp10705, i64 1
+  %tmp10707 = getelementptr inbounds float* %tmp10706, i64 1
+  %tmp10708 = getelementptr inbounds float* %tmp10707, i64 1
+  %tmp10709 = getelementptr inbounds float* %tmp10708, i64 1
+  %tmp10710 = getelementptr inbounds float* %tmp10709, i64 1
+  %tmp10711 = getelementptr inbounds float* %tmp10710, i64 1
+  %tmp10712 = getelementptr inbounds float* %tmp10711, i64 1
+  %tmp10713 = getelementptr inbounds float* %tmp10712, i64 1
+  %tmp10714 = getelementptr inbounds float* %tmp10713, i64 1
+  %tmp10715 = getelementptr inbounds float* %tmp10714, i64 1
+  %tmp10716 = getelementptr inbounds float* %tmp10715, i64 1
+  %tmp10717 = getelementptr inbounds float* %tmp10716, i64 1
+  %tmp10718 = getelementptr inbounds float* %tmp10717, i64 1
+  %tmp10719 = getelementptr inbounds float* %tmp10718, i64 1
+  %tmp10720 = getelementptr inbounds float* %tmp10719, i64 1
+  %tmp10721 = getelementptr inbounds float* %tmp10720, i64 1
+  %tmp10722 = getelementptr inbounds float* %tmp10721, i64 1
+  %tmp10723 = getelementptr inbounds float* %tmp10722, i64 1
+  %tmp10724 = getelementptr inbounds float* %tmp10723, i64 1
+  %tmp10725 = getelementptr inbounds float* %tmp10724, i64 1
+  %tmp10726 = getelementptr inbounds float* %tmp10725, i64 1
+  %tmp10727 = getelementptr inbounds float* %tmp10726, i64 1
+  %tmp10728 = getelementptr inbounds float* %tmp10727, i64 1
+  %tmp10729 = getelementptr inbounds float* %tmp10728, i64 1
+  %tmp10730 = getelementptr inbounds float* %tmp10729, i64 1
+  %tmp10731 = getelementptr inbounds float* %tmp10730, i64 1
+  %tmp10732 = getelementptr inbounds float* %tmp10731, i64 1
+  %tmp10733 = getelementptr inbounds float* %tmp10732, i64 1
+  %tmp10734 = getelementptr inbounds float* %tmp10733, i64 1
+  %tmp10735 = getelementptr inbounds float* %tmp10734, i64 1
+  %tmp10736 = getelementptr inbounds float* %tmp10735, i64 1
+  %tmp10737 = getelementptr inbounds float* %tmp10736, i64 1
+  %tmp10738 = getelementptr inbounds float* %tmp10737, i64 1
+  %tmp10739 = getelementptr inbounds float* %tmp10738, i64 1
+  %tmp10740 = getelementptr inbounds float* %tmp10739, i64 1
+  %tmp10741 = getelementptr inbounds float* %tmp10740, i64 1
+  %tmp10742 = getelementptr inbounds float* %tmp10741, i64 1
+  %tmp10743 = getelementptr inbounds float* %tmp10742, i64 1
+  %tmp10744 = getelementptr inbounds float* %tmp10743, i64 1
+  %tmp10745 = getelementptr inbounds float* %tmp10744, i64 1
+  %tmp10746 = getelementptr inbounds float* %tmp10745, i64 1
+  %tmp10747 = getelementptr inbounds float* %tmp10746, i64 1
+  %tmp10748 = getelementptr inbounds float* %tmp10747, i64 1
+  %tmp10749 = getelementptr inbounds float* %tmp10748, i64 1
+  %tmp10750 = getelementptr inbounds float* %tmp10749, i64 1
+  %tmp10751 = getelementptr inbounds float* %tmp10750, i64 1
+  %tmp10752 = getelementptr inbounds float* %tmp10751, i64 1
+  %tmp10753 = getelementptr inbounds float* %tmp10752, i64 1
+  %tmp10754 = getelementptr inbounds float* %tmp10753, i64 1
+  %tmp10755 = getelementptr inbounds float* %tmp10754, i64 1
+  %tmp10756 = getelementptr inbounds float* %tmp10755, i64 1
+  %tmp10757 = getelementptr inbounds float* %tmp10756, i64 1
+  %tmp10758 = getelementptr inbounds float* %tmp10757, i64 1
+  %tmp10759 = getelementptr inbounds float* %tmp10758, i64 1
+  %tmp10760 = getelementptr inbounds float* %tmp10759, i64 1
+  %tmp10761 = getelementptr inbounds float* %tmp10760, i64 1
+  %tmp10762 = getelementptr inbounds float* %tmp10761, i64 1
+  %tmp10763 = getelementptr inbounds float* %tmp10762, i64 1
+  %tmp10764 = getelementptr inbounds float* %tmp10763, i64 1
+  %tmp10765 = getelementptr inbounds float* %tmp10764, i64 1
+  %tmp10766 = getelementptr inbounds float* %tmp10765, i64 1
+  %tmp10767 = getelementptr inbounds float* %tmp10766, i64 1
+  %tmp10768 = getelementptr inbounds float* %tmp10767, i64 1
+  %tmp10769 = getelementptr inbounds float* %tmp10768, i64 1
+  %tmp10770 = getelementptr inbounds float* %tmp10769, i64 1
+  %tmp10771 = getelementptr inbounds float* %tmp10770, i64 1
+  %tmp10772 = getelementptr inbounds float* %tmp10771, i64 1
+  %tmp10773 = getelementptr inbounds float* %tmp10772, i64 1
+  %tmp10774 = getelementptr inbounds float* %tmp10773, i64 1
+  %tmp10775 = getelementptr inbounds float* %tmp10774, i64 1
+  %tmp10776 = getelementptr inbounds float* %tmp10775, i64 1
+  %tmp10777 = getelementptr inbounds float* %tmp10776, i64 1
+  %tmp10778 = getelementptr inbounds float* %tmp10777, i64 1
+  %tmp10779 = getelementptr inbounds float* %tmp10778, i64 1
+  %tmp10780 = getelementptr inbounds float* %tmp10779, i64 1
+  %tmp10781 = getelementptr inbounds float* %tmp10780, i64 1
+  %tmp10782 = getelementptr inbounds float* %tmp10781, i64 1
+  %tmp10783 = getelementptr inbounds float* %tmp10782, i64 1
+  %tmp10784 = getelementptr inbounds float* %tmp10783, i64 1
+  %tmp10785 = getelementptr inbounds float* %tmp10784, i64 1
+  %tmp10786 = getelementptr inbounds float* %tmp10785, i64 1
+  %tmp10787 = getelementptr inbounds float* %tmp10786, i64 1
+  %tmp10788 = getelementptr inbounds float* %tmp10787, i64 1
+  %tmp10789 = getelementptr inbounds float* %tmp10788, i64 1
+  %tmp10790 = getelementptr inbounds float* %tmp10789, i64 1
+  %tmp10791 = getelementptr inbounds float* %tmp10790, i64 1
+  %tmp10792 = getelementptr inbounds float* %tmp10791, i64 1
+  %tmp10793 = getelementptr inbounds float* %tmp10792, i64 1
+  %tmp10794 = getelementptr inbounds float* %tmp10793, i64 1
+  %tmp10795 = getelementptr inbounds float* %tmp10794, i64 1
+  %tmp10796 = getelementptr inbounds float* %tmp10795, i64 1
+  %tmp10797 = getelementptr inbounds float* %tmp10796, i64 1
+  %tmp10798 = getelementptr inbounds float* %tmp10797, i64 1
+  %tmp10799 = getelementptr inbounds float* %tmp10798, i64 1
+  %tmp10800 = getelementptr inbounds float* %tmp10799, i64 1
+  %tmp10801 = getelementptr inbounds float* %tmp10800, i64 1
+  %tmp10802 = getelementptr inbounds float* %tmp10801, i64 1
+  %tmp10803 = getelementptr inbounds float* %tmp10802, i64 1
+  %tmp10804 = getelementptr inbounds float* %tmp10803, i64 1
+  %tmp10805 = getelementptr inbounds float* %tmp10804, i64 1
+  %tmp10806 = getelementptr inbounds float* %tmp10805, i64 1
+  %tmp10807 = getelementptr inbounds float* %tmp10806, i64 1
+  %tmp10808 = getelementptr inbounds float* %tmp10807, i64 1
+  %tmp10809 = getelementptr inbounds float* %tmp10808, i64 1
+  %tmp10810 = getelementptr inbounds float* %tmp10809, i64 1
+  %tmp10811 = getelementptr inbounds float* %tmp10810, i64 1
+  %tmp10812 = getelementptr inbounds float* %tmp10811, i64 1
+  %tmp10813 = getelementptr inbounds float* %tmp10812, i64 1
+  %tmp10814 = getelementptr inbounds float* %tmp10813, i64 1
+  %tmp10815 = getelementptr inbounds float* %tmp10814, i64 1
+  %tmp10816 = getelementptr inbounds float* %tmp10815, i64 1
+  %tmp10817 = getelementptr inbounds float* %tmp10816, i64 1
+  %tmp10818 = getelementptr inbounds float* %tmp10817, i64 1
+  %tmp10819 = getelementptr inbounds float* %tmp10818, i64 1
+  %tmp10820 = getelementptr inbounds float* %tmp10819, i64 1
+  %tmp10821 = getelementptr inbounds float* %tmp10820, i64 1
+  %tmp10822 = getelementptr inbounds float* %tmp10821, i64 1
+  %tmp10823 = getelementptr inbounds float* %tmp10822, i64 1
+  %tmp10824 = getelementptr inbounds float* %tmp10823, i64 1
+  %tmp10825 = getelementptr inbounds float* %tmp10824, i64 1
+  %tmp10826 = getelementptr inbounds float* %tmp10825, i64 1
+  %tmp10827 = getelementptr inbounds float* %tmp10826, i64 1
+  %tmp10828 = getelementptr inbounds float* %tmp10827, i64 1
+  %tmp10829 = getelementptr inbounds float* %tmp10828, i64 1
+  %tmp10830 = getelementptr inbounds float* %tmp10829, i64 1
+  %tmp10831 = getelementptr inbounds float* %tmp10830, i64 1
+  %tmp10832 = getelementptr inbounds float* %tmp10831, i64 1
+  %tmp10833 = getelementptr inbounds float* %tmp10832, i64 1
+  %tmp10834 = getelementptr inbounds float* %tmp10833, i64 1
+  %tmp10835 = getelementptr inbounds float* %tmp10834, i64 1
+  %tmp10836 = getelementptr inbounds float* %tmp10835, i64 1
+  %tmp10837 = getelementptr inbounds float* %tmp10836, i64 1
+  %tmp10838 = getelementptr inbounds float* %tmp10837, i64 1
+  %tmp10839 = getelementptr inbounds float* %tmp10838, i64 1
+  %tmp10840 = getelementptr inbounds float* %tmp10839, i64 1
+  %tmp10841 = getelementptr inbounds float* %tmp10840, i64 1
+  %tmp10842 = getelementptr inbounds float* %tmp10841, i64 1
+  %tmp10843 = getelementptr inbounds float* %tmp10842, i64 1
+  %tmp10844 = getelementptr inbounds float* %tmp10843, i64 1
+  %tmp10845 = getelementptr inbounds float* %tmp10844, i64 1
+  %tmp10846 = getelementptr inbounds float* %tmp10845, i64 1
+  %tmp10847 = getelementptr inbounds float* %tmp10846, i64 1
+  %tmp10848 = getelementptr inbounds float* %tmp10847, i64 1
+  %tmp10849 = getelementptr inbounds float* %tmp10848, i64 1
+  %tmp10850 = getelementptr inbounds float* %tmp10849, i64 1
+  %tmp10851 = getelementptr inbounds float* %tmp10850, i64 1
+  %tmp10852 = getelementptr inbounds float* %tmp10851, i64 1
+  %tmp10853 = getelementptr inbounds float* %tmp10852, i64 1
+  %tmp10854 = getelementptr inbounds float* %tmp10853, i64 1
+  %tmp10855 = getelementptr inbounds float* %tmp10854, i64 1
+  %tmp10856 = getelementptr inbounds float* %tmp10855, i64 1
+  %tmp10857 = getelementptr inbounds float* %tmp10856, i64 1
+  %tmp10858 = getelementptr inbounds float* %tmp10857, i64 1
+  %tmp10859 = getelementptr inbounds float* %tmp10858, i64 1
+  %tmp10860 = getelementptr inbounds float* %tmp10859, i64 1
+  %tmp10861 = getelementptr inbounds float* %tmp10860, i64 1
+  %tmp10862 = getelementptr inbounds float* %tmp10861, i64 1
+  %tmp10863 = getelementptr inbounds float* %tmp10862, i64 1
+  %tmp10864 = getelementptr inbounds float* %tmp10863, i64 1
+  %tmp10865 = getelementptr inbounds float* %tmp10864, i64 1
+  %tmp10866 = getelementptr inbounds float* %tmp10865, i64 1
+  %tmp10867 = getelementptr inbounds float* %tmp10866, i64 1
+  %tmp10868 = getelementptr inbounds float* %tmp10867, i64 1
+  %tmp10869 = getelementptr inbounds float* %tmp10868, i64 1
+  %tmp10870 = getelementptr inbounds float* %tmp10869, i64 1
+  %tmp10871 = getelementptr inbounds float* %tmp10870, i64 1
+  %tmp10872 = getelementptr inbounds float* %tmp10871, i64 1
+  %tmp10873 = getelementptr inbounds float* %tmp10872, i64 1
+  %tmp10874 = getelementptr inbounds float* %tmp10873, i64 1
+  %tmp10875 = getelementptr inbounds float* %tmp10874, i64 1
+  %tmp10876 = getelementptr inbounds float* %tmp10875, i64 1
+  %tmp10877 = getelementptr inbounds float* %tmp10876, i64 1
+  %tmp10878 = getelementptr inbounds float* %tmp10877, i64 1
+  %tmp10879 = getelementptr inbounds float* %tmp10878, i64 1
+  %tmp10880 = getelementptr inbounds float* %tmp10879, i64 1
+  %tmp10881 = getelementptr inbounds float* %tmp10880, i64 1
+  %tmp10882 = getelementptr inbounds float* %tmp10881, i64 1
+  %tmp10883 = getelementptr inbounds float* %tmp10882, i64 1
+  %tmp10884 = getelementptr inbounds float* %tmp10883, i64 1
+  %tmp10885 = getelementptr inbounds float* %tmp10884, i64 1
+  %tmp10886 = getelementptr inbounds float* %tmp10885, i64 1
+  %tmp10887 = getelementptr inbounds float* %tmp10886, i64 1
+  %tmp10888 = getelementptr inbounds float* %tmp10887, i64 1
+  %tmp10889 = getelementptr inbounds float* %tmp10888, i64 1
+  %tmp10890 = getelementptr inbounds float* %tmp10889, i64 1
+  %tmp10891 = getelementptr inbounds float* %tmp10890, i64 1
+  %tmp10892 = getelementptr inbounds float* %tmp10891, i64 1
+  %tmp10893 = getelementptr inbounds float* %tmp10892, i64 1
+  %tmp10894 = getelementptr inbounds float* %tmp10893, i64 1
+  %tmp10895 = getelementptr inbounds float* %tmp10894, i64 1
+  %tmp10896 = getelementptr inbounds float* %tmp10895, i64 1
+  %tmp10897 = getelementptr inbounds float* %tmp10896, i64 1
+  %tmp10898 = getelementptr inbounds float* %tmp10897, i64 1
+  %tmp10899 = getelementptr inbounds float* %tmp10898, i64 1
+  %tmp10900 = getelementptr inbounds float* %tmp10899, i64 1
+  %tmp10901 = getelementptr inbounds float* %tmp10900, i64 1
+  %tmp10902 = getelementptr inbounds float* %tmp10901, i64 1
+  %tmp10903 = getelementptr inbounds float* %tmp10902, i64 1
+  %tmp10904 = getelementptr inbounds float* %tmp10903, i64 1
+  %tmp10905 = getelementptr inbounds float* %tmp10904, i64 1
+  %tmp10906 = getelementptr inbounds float* %tmp10905, i64 1
+  %tmp10907 = getelementptr inbounds float* %tmp10906, i64 1
+  %tmp10908 = getelementptr inbounds float* %tmp10907, i64 1
+  %tmp10909 = getelementptr inbounds float* %tmp10908, i64 1
+  %tmp10910 = getelementptr inbounds float* %tmp10909, i64 1
+  %tmp10911 = getelementptr inbounds float* %tmp10910, i64 1
+  %tmp10912 = getelementptr inbounds float* %tmp10911, i64 1
+  %tmp10913 = getelementptr inbounds float* %tmp10912, i64 1
+  %tmp10914 = getelementptr inbounds float* %tmp10913, i64 1
+  %tmp10915 = getelementptr inbounds float* %tmp10914, i64 1
+  %tmp10916 = getelementptr inbounds float* %tmp10915, i64 1
+  %tmp10917 = getelementptr inbounds float* %tmp10916, i64 1
+  %tmp10918 = getelementptr inbounds float* %tmp10917, i64 1
+  %tmp10919 = getelementptr inbounds float* %tmp10918, i64 1
+  %tmp10920 = getelementptr inbounds float* %tmp10919, i64 1
+  %tmp10921 = getelementptr inbounds float* %tmp10920, i64 1
+  %tmp10922 = getelementptr inbounds float* %tmp10921, i64 1
+  %tmp10923 = getelementptr inbounds float* %tmp10922, i64 1
+  %tmp10924 = getelementptr inbounds float* %tmp10923, i64 1
+  %tmp10925 = getelementptr inbounds float* %tmp10924, i64 1
+  %tmp10926 = getelementptr inbounds float* %tmp10925, i64 1
+  %tmp10927 = getelementptr inbounds float* %tmp10926, i64 1
+  %tmp10928 = getelementptr inbounds float* %tmp10927, i64 1
+  %tmp10929 = getelementptr inbounds float* %tmp10928, i64 1
+  %tmp10930 = getelementptr inbounds float* %tmp10929, i64 1
+  %tmp10931 = getelementptr inbounds float* %tmp10930, i64 1
+  %tmp10932 = getelementptr inbounds float* %tmp10931, i64 1
+  %tmp10933 = getelementptr inbounds float* %tmp10932, i64 1
+  %tmp10934 = getelementptr inbounds float* %tmp10933, i64 1
+  %tmp10935 = getelementptr inbounds float* %tmp10934, i64 1
+  %tmp10936 = getelementptr inbounds float* %tmp10935, i64 1
+  %tmp10937 = getelementptr inbounds float* %tmp10936, i64 1
+  %tmp10938 = getelementptr inbounds float* %tmp10937, i64 1
+  %tmp10939 = getelementptr inbounds float* %tmp10938, i64 1
+  %tmp10940 = getelementptr inbounds float* %tmp10939, i64 1
+  %tmp10941 = getelementptr inbounds float* %tmp10940, i64 1
+  %tmp10942 = getelementptr inbounds float* %tmp10941, i64 1
+  %tmp10943 = getelementptr inbounds float* %tmp10942, i64 1
+  %tmp10944 = getelementptr inbounds float* %tmp10943, i64 1
+  %tmp10945 = getelementptr inbounds float* %tmp10944, i64 1
+  %tmp10946 = getelementptr inbounds float* %tmp10945, i64 1
+  %tmp10947 = getelementptr inbounds float* %tmp10946, i64 1
+  %tmp10948 = getelementptr inbounds float* %tmp10947, i64 1
+  %tmp10949 = getelementptr inbounds float* %tmp10948, i64 1
+  %tmp10950 = getelementptr inbounds float* %tmp10949, i64 1
+  %tmp10951 = getelementptr inbounds float* %tmp10950, i64 1
+  %tmp10952 = getelementptr inbounds float* %tmp10951, i64 1
+  %tmp10953 = getelementptr inbounds float* %tmp10952, i64 1
+  %tmp10954 = getelementptr inbounds float* %tmp10953, i64 1
+  %tmp10955 = getelementptr inbounds float* %tmp10954, i64 1
+  %tmp10956 = getelementptr inbounds float* %tmp10955, i64 1
+  %tmp10957 = getelementptr inbounds float* %tmp10956, i64 1
+  %tmp10958 = getelementptr inbounds float* %tmp10957, i64 1
+  %tmp10959 = getelementptr inbounds float* %tmp10958, i64 1
+  %tmp10960 = getelementptr inbounds float* %tmp10959, i64 1
+  %tmp10961 = getelementptr inbounds float* %tmp10960, i64 1
+  %tmp10962 = getelementptr inbounds float* %tmp10961, i64 1
+  %tmp10963 = getelementptr inbounds float* %tmp10962, i64 1
+  %tmp10964 = getelementptr inbounds float* %tmp10963, i64 1
+  %tmp10965 = getelementptr inbounds float* %tmp10964, i64 1
+  %tmp10966 = getelementptr inbounds float* %tmp10965, i64 1
+  %tmp10967 = getelementptr inbounds float* %tmp10966, i64 1
+  %tmp10968 = getelementptr inbounds float* %tmp10967, i64 1
+  %tmp10969 = getelementptr inbounds float* %tmp10968, i64 1
+  %tmp10970 = getelementptr inbounds float* %tmp10969, i64 1
+  %tmp10971 = getelementptr inbounds float* %tmp10970, i64 1
+  %tmp10972 = getelementptr inbounds float* %tmp10971, i64 1
+  %tmp10973 = getelementptr inbounds float* %tmp10972, i64 1
+  %tmp10974 = getelementptr inbounds float* %tmp10973, i64 1
+  %tmp10975 = getelementptr inbounds float* %tmp10974, i64 1
+  %tmp10976 = getelementptr inbounds float* %tmp10975, i64 1
+  %tmp10977 = getelementptr inbounds float* %tmp10976, i64 1
+  %tmp10978 = getelementptr inbounds float* %tmp10977, i64 1
+  %tmp10979 = getelementptr inbounds float* %tmp10978, i64 1
+  %tmp10980 = getelementptr inbounds float* %tmp10979, i64 1
+  %tmp10981 = getelementptr inbounds float* %tmp10980, i64 1
+  %tmp10982 = getelementptr inbounds float* %tmp10981, i64 1
+  %tmp10983 = getelementptr inbounds float* %tmp10982, i64 1
+  %tmp10984 = getelementptr inbounds float* %tmp10983, i64 1
+  %tmp10985 = getelementptr inbounds float* %tmp10984, i64 1
+  %tmp10986 = getelementptr inbounds float* %tmp10985, i64 1
+  %tmp10987 = getelementptr inbounds float* %tmp10986, i64 1
+  %tmp10988 = getelementptr inbounds float* %tmp10987, i64 1
+  %tmp10989 = getelementptr inbounds float* %tmp10988, i64 1
+  %tmp10990 = getelementptr inbounds float* %tmp10989, i64 1
+  %tmp10991 = getelementptr inbounds float* %tmp10990, i64 1
+  %tmp10992 = getelementptr inbounds float* %tmp10991, i64 1
+  %tmp10993 = getelementptr inbounds float* %tmp10992, i64 1
+  %tmp10994 = getelementptr inbounds float* %tmp10993, i64 1
+  %tmp10995 = getelementptr inbounds float* %tmp10994, i64 1
+  %tmp10996 = getelementptr inbounds float* %tmp10995, i64 1
+  %tmp10997 = getelementptr inbounds float* %tmp10996, i64 1
+  %tmp10998 = getelementptr inbounds float* %tmp10997, i64 1
+  %tmp10999 = getelementptr inbounds float* %tmp10998, i64 1
+  %tmp11000 = getelementptr inbounds float* %tmp10999, i64 1
+  %tmp11001 = getelementptr inbounds float* %tmp11000, i64 1
+  %tmp11002 = getelementptr inbounds float* %tmp11001, i64 1
+  %tmp11003 = getelementptr inbounds float* %tmp11002, i64 1
+  %tmp11004 = getelementptr inbounds float* %tmp11003, i64 1
+  %tmp11005 = getelementptr inbounds float* %tmp11004, i64 1
+  %tmp11006 = getelementptr inbounds float* %tmp11005, i64 1
+  %tmp11007 = getelementptr inbounds float* %tmp11006, i64 1
+  %tmp11008 = getelementptr inbounds float* %tmp11007, i64 1
+  %tmp11009 = getelementptr inbounds float* %tmp11008, i64 1
+  %tmp11010 = getelementptr inbounds float* %tmp11009, i64 1
+  %tmp11011 = getelementptr inbounds float* %tmp11010, i64 1
+  %tmp11012 = getelementptr inbounds float* %tmp11011, i64 1
+  %tmp11013 = getelementptr inbounds float* %tmp11012, i64 1
+  %tmp11014 = getelementptr inbounds float* %tmp11013, i64 1
+  %tmp11015 = getelementptr inbounds float* %tmp11014, i64 1
+  %tmp11016 = getelementptr inbounds float* %tmp11015, i64 1
+  %tmp11017 = getelementptr inbounds float* %tmp11016, i64 1
+  %tmp11018 = getelementptr inbounds float* %tmp11017, i64 1
+  %tmp11019 = getelementptr inbounds float* %tmp11018, i64 1
+  %tmp11020 = getelementptr inbounds float* %tmp11019, i64 1
+  %tmp11021 = getelementptr inbounds float* %tmp11020, i64 1
+  %tmp11022 = getelementptr inbounds float* %tmp11021, i64 1
+  %tmp11023 = getelementptr inbounds float* %tmp11022, i64 1
+  %tmp11024 = getelementptr inbounds float* %tmp11023, i64 1
+  %tmp11025 = getelementptr inbounds float* %tmp11024, i64 1
+  %tmp11026 = getelementptr inbounds float* %tmp11025, i64 1
+  %tmp11027 = getelementptr inbounds float* %tmp11026, i64 1
+  %tmp11028 = getelementptr inbounds float* %tmp11027, i64 1
+  %tmp11029 = getelementptr inbounds float* %tmp11028, i64 1
+  %tmp11030 = getelementptr inbounds float* %tmp11029, i64 1
+  %tmp11031 = getelementptr inbounds float* %tmp11030, i64 1
+  %tmp11032 = getelementptr inbounds float* %tmp11031, i64 1
+  %tmp11033 = getelementptr inbounds float* %tmp11032, i64 1
+  %tmp11034 = getelementptr inbounds float* %tmp11033, i64 1
+  %tmp11035 = getelementptr inbounds float* %tmp11034, i64 1
+  %tmp11036 = getelementptr inbounds float* %tmp11035, i64 1
+  %tmp11037 = getelementptr inbounds float* %tmp11036, i64 1
+  %tmp11038 = getelementptr inbounds float* %tmp11037, i64 1
+  %tmp11039 = getelementptr inbounds float* %tmp11038, i64 1
+  %tmp11040 = getelementptr inbounds float* %tmp11039, i64 1
+  %tmp11041 = getelementptr inbounds float* %tmp11040, i64 1
+  %tmp11042 = getelementptr inbounds float* %tmp11041, i64 1
+  %tmp11043 = getelementptr inbounds float* %tmp11042, i64 1
+  %tmp11044 = getelementptr inbounds float* %tmp11043, i64 1
+  %tmp11045 = getelementptr inbounds float* %tmp11044, i64 1
+  %tmp11046 = getelementptr inbounds float* %tmp11045, i64 1
+  %tmp11047 = getelementptr inbounds float* %tmp11046, i64 1
+  %tmp11048 = getelementptr inbounds float* %tmp11047, i64 1
+  %tmp11049 = getelementptr inbounds float* %tmp11048, i64 1
+  %tmp11050 = getelementptr inbounds float* %tmp11049, i64 1
+  %tmp11051 = getelementptr inbounds float* %tmp11050, i64 1
+  %tmp11052 = getelementptr inbounds float* %tmp11051, i64 1
+  %tmp11053 = getelementptr inbounds float* %tmp11052, i64 1
+  %tmp11054 = getelementptr inbounds float* %tmp11053, i64 1
+  %tmp11055 = getelementptr inbounds float* %tmp11054, i64 1
+  %tmp11056 = getelementptr inbounds float* %tmp11055, i64 1
+  %tmp11057 = getelementptr inbounds float* %tmp11056, i64 1
+  %tmp11058 = getelementptr inbounds float* %tmp11057, i64 1
+  %tmp11059 = getelementptr inbounds float* %tmp11058, i64 1
+  %tmp11060 = getelementptr inbounds float* %tmp11059, i64 1
+  %tmp11061 = getelementptr inbounds float* %tmp11060, i64 1
+  %tmp11062 = getelementptr inbounds float* %tmp11061, i64 1
+  %tmp11063 = getelementptr inbounds float* %tmp11062, i64 1
+  %tmp11064 = getelementptr inbounds float* %tmp11063, i64 1
+  %tmp11065 = getelementptr inbounds float* %tmp11064, i64 1
+  %tmp11066 = getelementptr inbounds float* %tmp11065, i64 1
+  %tmp11067 = getelementptr inbounds float* %tmp11066, i64 1
+  %tmp11068 = getelementptr inbounds float* %tmp11067, i64 1
+  %tmp11069 = getelementptr inbounds float* %tmp11068, i64 1
+  %tmp11070 = getelementptr inbounds float* %tmp11069, i64 1
+  %tmp11071 = getelementptr inbounds float* %tmp11070, i64 1
+  %tmp11072 = getelementptr inbounds float* %tmp11071, i64 1
+  %tmp11073 = getelementptr inbounds float* %tmp11072, i64 1
+  %tmp11074 = getelementptr inbounds float* %tmp11073, i64 1
+  %tmp11075 = getelementptr inbounds float* %tmp11074, i64 1
+  %tmp11076 = getelementptr inbounds float* %tmp11075, i64 1
+  %tmp11077 = getelementptr inbounds float* %tmp11076, i64 1
+  %tmp11078 = getelementptr inbounds float* %tmp11077, i64 1
+  %tmp11079 = getelementptr inbounds float* %tmp11078, i64 1
+  %tmp11080 = getelementptr inbounds float* %tmp11079, i64 1
+  %tmp11081 = getelementptr inbounds float* %tmp11080, i64 1
+  %tmp11082 = getelementptr inbounds float* %tmp11081, i64 1
+  %tmp11083 = getelementptr inbounds float* %tmp11082, i64 1
+  %tmp11084 = getelementptr inbounds float* %tmp11083, i64 1
+  %tmp11085 = getelementptr inbounds float* %tmp11084, i64 1
+  %tmp11086 = getelementptr inbounds float* %tmp11085, i64 1
+  %tmp11087 = getelementptr inbounds float* %tmp11086, i64 1
+  %tmp11088 = getelementptr inbounds float* %tmp11087, i64 1
+  %tmp11089 = getelementptr inbounds float* %tmp11088, i64 1
+  %tmp11090 = getelementptr inbounds float* %tmp11089, i64 1
+  %tmp11091 = getelementptr inbounds float* %tmp11090, i64 1
+  %tmp11092 = getelementptr inbounds float* %tmp11091, i64 1
+  %tmp11093 = getelementptr inbounds float* %tmp11092, i64 1
+  %tmp11094 = getelementptr inbounds float* %tmp11093, i64 1
+  %tmp11095 = getelementptr inbounds float* %tmp11094, i64 1
+  %tmp11096 = getelementptr inbounds float* %tmp11095, i64 1
+  %tmp11097 = getelementptr inbounds float* %tmp11096, i64 1
+  %tmp11098 = getelementptr inbounds float* %tmp11097, i64 1
+  %tmp11099 = getelementptr inbounds float* %tmp11098, i64 1
+  %tmp11100 = getelementptr inbounds float* %tmp11099, i64 1
+  %tmp11101 = getelementptr inbounds float* %tmp11100, i64 1
+  %tmp11102 = getelementptr inbounds float* %tmp11101, i64 1
+  %tmp11103 = getelementptr inbounds float* %tmp11102, i64 1
+  %tmp11104 = getelementptr inbounds float* %tmp11103, i64 1
+  %tmp11105 = getelementptr inbounds float* %tmp11104, i64 1
+  %tmp11106 = getelementptr inbounds float* %tmp11105, i64 1
+  %tmp11107 = getelementptr inbounds float* %tmp11106, i64 1
+  %tmp11108 = getelementptr inbounds float* %tmp11107, i64 1
+  %tmp11109 = getelementptr inbounds float* %tmp11108, i64 1
+  %tmp11110 = getelementptr inbounds float* %tmp11109, i64 1
+  %tmp11111 = getelementptr inbounds float* %tmp11110, i64 1
+  %tmp11112 = getelementptr inbounds float* %tmp11111, i64 1
+  %tmp11113 = getelementptr inbounds float* %tmp11112, i64 1
+  %tmp11114 = getelementptr inbounds float* %tmp11113, i64 1
+  %tmp11115 = getelementptr inbounds float* %tmp11114, i64 1
+  %tmp11116 = getelementptr inbounds float* %tmp11115, i64 1
+  %tmp11117 = getelementptr inbounds float* %tmp11116, i64 1
+  %tmp11118 = getelementptr inbounds float* %tmp11117, i64 1
+  %tmp11119 = getelementptr inbounds float* %tmp11118, i64 1
+  %tmp11120 = getelementptr inbounds float* %tmp11119, i64 1
+  %tmp11121 = getelementptr inbounds float* %tmp11120, i64 1
+  %tmp11122 = getelementptr inbounds float* %tmp11121, i64 1
+  %tmp11123 = getelementptr inbounds float* %tmp11122, i64 1
+  %tmp11124 = getelementptr inbounds float* %tmp11123, i64 1
+  %tmp11125 = getelementptr inbounds float* %tmp11124, i64 1
+  %tmp11126 = getelementptr inbounds float* %tmp11125, i64 1
+  %tmp11127 = getelementptr inbounds float* %tmp11126, i64 1
+  %tmp11128 = getelementptr inbounds float* %tmp11127, i64 1
+  %tmp11129 = getelementptr inbounds float* %tmp11128, i64 1
+  %tmp11130 = getelementptr inbounds float* %tmp11129, i64 1
+  %tmp11131 = getelementptr inbounds float* %tmp11130, i64 1
+  %tmp11132 = getelementptr inbounds float* %tmp11131, i64 1
+  %tmp11133 = getelementptr inbounds float* %tmp11132, i64 1
+  %tmp11134 = getelementptr inbounds float* %tmp11133, i64 1
+  %tmp11135 = getelementptr inbounds float* %tmp11134, i64 1
+  %tmp11136 = getelementptr inbounds float* %tmp11135, i64 1
+  %tmp11137 = getelementptr inbounds float* %tmp11136, i64 1
+  %tmp11138 = getelementptr inbounds float* %tmp11137, i64 1
+  %tmp11139 = getelementptr inbounds float* %tmp11138, i64 1
+  %tmp11140 = getelementptr inbounds float* %tmp11139, i64 1
+  %tmp11141 = getelementptr inbounds float* %tmp11140, i64 1
+  %tmp11142 = getelementptr inbounds float* %tmp11141, i64 1
+  %tmp11143 = getelementptr inbounds float* %tmp11142, i64 1
+  %tmp11144 = getelementptr inbounds float* %tmp11143, i64 1
+  %tmp11145 = getelementptr inbounds float* %tmp11144, i64 1
+  %tmp11146 = getelementptr inbounds float* %tmp11145, i64 1
+  %tmp11147 = getelementptr inbounds float* %tmp11146, i64 1
+  %tmp11148 = getelementptr inbounds float* %tmp11147, i64 1
+  %tmp11149 = getelementptr inbounds float* %tmp11148, i64 1
+  %tmp11150 = getelementptr inbounds float* %tmp11149, i64 1
+  %tmp11151 = getelementptr inbounds float* %tmp11150, i64 1
+  %tmp11152 = getelementptr inbounds float* %tmp11151, i64 1
+  %tmp11153 = getelementptr inbounds float* %tmp11152, i64 1
+  %tmp11154 = getelementptr inbounds float* %tmp11153, i64 1
+  %tmp11155 = getelementptr inbounds float* %tmp11154, i64 1
+  %tmp11156 = getelementptr inbounds float* %tmp11155, i64 1
+  %tmp11157 = getelementptr inbounds float* %tmp11156, i64 1
+  %tmp11158 = getelementptr inbounds float* %tmp11157, i64 1
+  %tmp11159 = getelementptr inbounds float* %tmp11158, i64 1
+  %tmp11160 = getelementptr inbounds float* %tmp11159, i64 1
+  %tmp11161 = getelementptr inbounds float* %tmp11160, i64 1
+  %tmp11162 = getelementptr inbounds float* %tmp11161, i64 1
+  %tmp11163 = getelementptr inbounds float* %tmp11162, i64 1
+  %tmp11164 = getelementptr inbounds float* %tmp11163, i64 1
+  %tmp11165 = getelementptr inbounds float* %tmp11164, i64 1
+  %tmp11166 = getelementptr inbounds float* %tmp11165, i64 1
+  %tmp11167 = getelementptr inbounds float* %tmp11166, i64 1
+  %tmp11168 = getelementptr inbounds float* %tmp11167, i64 1
+  %tmp11169 = getelementptr inbounds float* %tmp11168, i64 1
+  %tmp11170 = getelementptr inbounds float* %tmp11169, i64 1
+  %tmp11171 = getelementptr inbounds float* %tmp11170, i64 1
+  %tmp11172 = getelementptr inbounds float* %tmp11171, i64 1
+  %tmp11173 = getelementptr inbounds float* %tmp11172, i64 1
+  %tmp11174 = getelementptr inbounds float* %tmp11173, i64 1
+  %tmp11175 = getelementptr inbounds float* %tmp11174, i64 1
+  %tmp11176 = getelementptr inbounds float* %tmp11175, i64 1
+  %tmp11177 = getelementptr inbounds float* %tmp11176, i64 1
+  %tmp11178 = getelementptr inbounds float* %tmp11177, i64 1
+  %tmp11179 = getelementptr inbounds float* %tmp11178, i64 1
+  %tmp11180 = getelementptr inbounds float* %tmp11179, i64 1
+  %tmp11181 = getelementptr inbounds float* %tmp11180, i64 1
+  %tmp11182 = getelementptr inbounds float* %tmp11181, i64 1
+  %tmp11183 = getelementptr inbounds float* %tmp11182, i64 1
+  %tmp11184 = getelementptr inbounds float* %tmp11183, i64 1
+  %tmp11185 = getelementptr inbounds float* %tmp11184, i64 1
+  %tmp11186 = getelementptr inbounds float* %tmp11185, i64 1
+  %tmp11187 = getelementptr inbounds float* %tmp11186, i64 1
+  %tmp11188 = getelementptr inbounds float* %tmp11187, i64 1
+  %tmp11189 = getelementptr inbounds float* %tmp11188, i64 1
+  %tmp11190 = getelementptr inbounds float* %tmp11189, i64 1
+  %tmp11191 = getelementptr inbounds float* %tmp11190, i64 1
+  %tmp11192 = getelementptr inbounds float* %tmp11191, i64 1
+  %tmp11193 = getelementptr inbounds float* %tmp11192, i64 1
+  %tmp11194 = getelementptr inbounds float* %tmp11193, i64 1
+  %tmp11195 = getelementptr inbounds float* %tmp11194, i64 1
+  %tmp11196 = getelementptr inbounds float* %tmp11195, i64 1
+  %tmp11197 = getelementptr inbounds float* %tmp11196, i64 1
+  %tmp11198 = getelementptr inbounds float* %tmp11197, i64 1
+  %tmp11199 = getelementptr inbounds float* %tmp11198, i64 1
+  %tmp11200 = getelementptr inbounds float* %tmp11199, i64 1
+  %tmp11201 = getelementptr inbounds float* %tmp11200, i64 1
+  %tmp11202 = getelementptr inbounds float* %tmp11201, i64 1
+  %tmp11203 = getelementptr inbounds float* %tmp11202, i64 1
+  %tmp11204 = getelementptr inbounds float* %tmp11203, i64 1
+  %tmp11205 = getelementptr inbounds float* %tmp11204, i64 1
+  %tmp11206 = getelementptr inbounds float* %tmp11205, i64 1
+  %tmp11207 = getelementptr inbounds float* %tmp11206, i64 1
+  %tmp11208 = getelementptr inbounds float* %tmp11207, i64 1
+  %tmp11209 = getelementptr inbounds float* %tmp11208, i64 1
+  %tmp11210 = getelementptr inbounds float* %tmp11209, i64 1
+  %tmp11211 = getelementptr inbounds float* %tmp11210, i64 1
+  %tmp11212 = getelementptr inbounds float* %tmp11211, i64 1
+  %tmp11213 = getelementptr inbounds float* %tmp11212, i64 1
+  %tmp11214 = getelementptr inbounds float* %tmp11213, i64 1
+  %tmp11215 = getelementptr inbounds float* %tmp11214, i64 1
+  %tmp11216 = getelementptr inbounds float* %tmp11215, i64 1
+  %tmp11217 = getelementptr inbounds float* %tmp11216, i64 1
+  %tmp11218 = getelementptr inbounds float* %tmp11217, i64 1
+  %tmp11219 = getelementptr inbounds float* %tmp11218, i64 1
+  %tmp11220 = getelementptr inbounds float* %tmp11219, i64 1
+  %tmp11221 = getelementptr inbounds float* %tmp11220, i64 1
+  %tmp11222 = getelementptr inbounds float* %tmp11221, i64 1
+  %tmp11223 = getelementptr inbounds float* %tmp11222, i64 1
+  %tmp11224 = getelementptr inbounds float* %tmp11223, i64 1
+  %tmp11225 = getelementptr inbounds float* %tmp11224, i64 1
+  %tmp11226 = getelementptr inbounds float* %tmp11225, i64 1
+  %tmp11227 = getelementptr inbounds float* %tmp11226, i64 1
+  %tmp11228 = getelementptr inbounds float* %tmp11227, i64 1
+  %tmp11229 = getelementptr inbounds float* %tmp11228, i64 1
+  %tmp11230 = getelementptr inbounds float* %tmp11229, i64 1
+  %tmp11231 = getelementptr inbounds float* %tmp11230, i64 1
+  %tmp11232 = getelementptr inbounds float* %tmp11231, i64 1
+  %tmp11233 = getelementptr inbounds float* %tmp11232, i64 1
+  %tmp11234 = getelementptr inbounds float* %tmp11233, i64 1
+  %tmp11235 = getelementptr inbounds float* %tmp11234, i64 1
+  %tmp11236 = getelementptr inbounds float* %tmp11235, i64 1
+  %tmp11237 = getelementptr inbounds float* %tmp11236, i64 1
+  %tmp11238 = getelementptr inbounds float* %tmp11237, i64 1
+  %tmp11239 = getelementptr inbounds float* %tmp11238, i64 1
+  %tmp11240 = getelementptr inbounds float* %tmp11239, i64 1
+  %tmp11241 = getelementptr inbounds float* %tmp11240, i64 1
+  %tmp11242 = getelementptr inbounds float* %tmp11241, i64 1
+  %tmp11243 = getelementptr inbounds float* %tmp11242, i64 1
+  %tmp11244 = getelementptr inbounds float* %tmp11243, i64 1
+  %tmp11245 = getelementptr inbounds float* %tmp11244, i64 1
+  %tmp11246 = getelementptr inbounds float* %tmp11245, i64 1
+  %tmp11247 = getelementptr inbounds float* %tmp11246, i64 1
+  %tmp11248 = getelementptr inbounds float* %tmp11247, i64 1
+  %tmp11249 = getelementptr inbounds float* %tmp11248, i64 1
+  %tmp11250 = getelementptr inbounds float* %tmp11249, i64 1
+  %tmp11251 = getelementptr inbounds float* %tmp11250, i64 1
+  %tmp11252 = getelementptr inbounds float* %tmp11251, i64 1
+  %tmp11253 = getelementptr inbounds float* %tmp11252, i64 1
+  %tmp11254 = getelementptr inbounds float* %tmp11253, i64 1
+  %tmp11255 = getelementptr inbounds float* %tmp11254, i64 1
+  %tmp11256 = getelementptr inbounds float* %tmp11255, i64 1
+  %tmp11257 = getelementptr inbounds float* %tmp11256, i64 1
+  %tmp11258 = getelementptr inbounds float* %tmp11257, i64 1
+  %tmp11259 = getelementptr inbounds float* %tmp11258, i64 1
+  %tmp11260 = getelementptr inbounds float* %tmp11259, i64 1
+  %tmp11261 = getelementptr inbounds float* %tmp11260, i64 1
+  %tmp11262 = getelementptr inbounds float* %tmp11261, i64 1
+  %tmp11263 = getelementptr inbounds float* %tmp11262, i64 1
+  %tmp11264 = getelementptr inbounds float* %tmp11263, i64 1
+  %tmp11265 = getelementptr inbounds float* %tmp11264, i64 1
+  %tmp11266 = getelementptr inbounds float* %tmp11265, i64 1
+  %tmp11267 = getelementptr inbounds float* %tmp11266, i64 1
+  %tmp11268 = getelementptr inbounds float* %tmp11267, i64 1
+  %tmp11269 = getelementptr inbounds float* %tmp11268, i64 1
+  %tmp11270 = getelementptr inbounds float* %tmp11269, i64 1
+  %tmp11271 = getelementptr inbounds float* %tmp11270, i64 1
+  %tmp11272 = getelementptr inbounds float* %tmp11271, i64 1
+  %tmp11273 = getelementptr inbounds float* %tmp11272, i64 1
+  %tmp11274 = getelementptr inbounds float* %tmp11273, i64 1
+  %tmp11275 = getelementptr inbounds float* %tmp11274, i64 1
+  %tmp11276 = getelementptr inbounds float* %tmp11275, i64 1
+  %tmp11277 = getelementptr inbounds float* %tmp11276, i64 1
+  %tmp11278 = getelementptr inbounds float* %tmp11277, i64 1
+  %tmp11279 = getelementptr inbounds float* %tmp11278, i64 1
+  %tmp11280 = getelementptr inbounds float* %tmp11279, i64 1
+  %tmp11281 = getelementptr inbounds float* %tmp11280, i64 1
+  %tmp11282 = getelementptr inbounds float* %tmp11281, i64 1
+  %tmp11283 = getelementptr inbounds float* %tmp11282, i64 1
+  %tmp11284 = getelementptr inbounds float* %tmp11283, i64 1
+  %tmp11285 = getelementptr inbounds float* %tmp11284, i64 1
+  %tmp11286 = getelementptr inbounds float* %tmp11285, i64 1
+  %tmp11287 = getelementptr inbounds float* %tmp11286, i64 1
+  %tmp11288 = getelementptr inbounds float* %tmp11287, i64 1
+  %tmp11289 = getelementptr inbounds float* %tmp11288, i64 1
+  %tmp11290 = getelementptr inbounds float* %tmp11289, i64 1
+  %tmp11291 = getelementptr inbounds float* %tmp11290, i64 1
+  %tmp11292 = getelementptr inbounds float* %tmp11291, i64 1
+  %tmp11293 = getelementptr inbounds float* %tmp11292, i64 1
+  %tmp11294 = getelementptr inbounds float* %tmp11293, i64 1
+  %tmp11295 = getelementptr inbounds float* %tmp11294, i64 1
+  %tmp11296 = getelementptr inbounds float* %tmp11295, i64 1
+  %tmp11297 = getelementptr inbounds float* %tmp11296, i64 1
+  %tmp11298 = getelementptr inbounds float* %tmp11297, i64 1
+  %tmp11299 = getelementptr inbounds float* %tmp11298, i64 1
+  %tmp11300 = getelementptr inbounds float* %tmp11299, i64 1
+  %tmp11301 = getelementptr inbounds float* %tmp11300, i64 1
+  %tmp11302 = getelementptr inbounds float* %tmp11301, i64 1
+  %tmp11303 = getelementptr inbounds float* %tmp11302, i64 1
+  %tmp11304 = getelementptr inbounds float* %tmp11303, i64 1
+  %tmp11305 = getelementptr inbounds float* %tmp11304, i64 1
+  %tmp11306 = getelementptr inbounds float* %tmp11305, i64 1
+  %tmp11307 = getelementptr inbounds float* %tmp11306, i64 1
+  %tmp11308 = getelementptr inbounds float* %tmp11307, i64 1
+  %tmp11309 = getelementptr inbounds float* %tmp11308, i64 1
+  %tmp11310 = getelementptr inbounds float* %tmp11309, i64 1
+  %tmp11311 = getelementptr inbounds float* %tmp11310, i64 1
+  %tmp11312 = getelementptr inbounds float* %tmp11311, i64 1
+  %tmp11313 = getelementptr inbounds float* %tmp11312, i64 1
+  %tmp11314 = getelementptr inbounds float* %tmp11313, i64 1
+  %tmp11315 = getelementptr inbounds float* %tmp11314, i64 1
+  %tmp11316 = getelementptr inbounds float* %tmp11315, i64 1
+  %tmp11317 = getelementptr inbounds float* %tmp11316, i64 1
+  %tmp11318 = getelementptr inbounds float* %tmp11317, i64 1
+  %tmp11319 = getelementptr inbounds float* %tmp11318, i64 1
+  %tmp11320 = getelementptr inbounds float* %tmp11319, i64 1
+  %tmp11321 = getelementptr inbounds float* %tmp11320, i64 1
+  %tmp11322 = getelementptr inbounds float* %tmp11321, i64 1
+  %tmp11323 = getelementptr inbounds float* %tmp11322, i64 1
+  %tmp11324 = getelementptr inbounds float* %tmp11323, i64 1
+  %tmp11325 = getelementptr inbounds float* %tmp11324, i64 1
+  %tmp11326 = getelementptr inbounds float* %tmp11325, i64 1
+  %tmp11327 = getelementptr inbounds float* %tmp11326, i64 1
+  %tmp11328 = getelementptr inbounds float* %tmp11327, i64 1
+  %tmp11329 = getelementptr inbounds float* %tmp11328, i64 1
+  %tmp11330 = getelementptr inbounds float* %tmp11329, i64 1
+  %tmp11331 = getelementptr inbounds float* %tmp11330, i64 1
+  %tmp11332 = getelementptr inbounds float* %tmp11331, i64 1
+  %tmp11333 = getelementptr inbounds float* %tmp11332, i64 1
+  %tmp11334 = getelementptr inbounds float* %tmp11333, i64 1
+  %tmp11335 = getelementptr inbounds float* %tmp11334, i64 1
+  %tmp11336 = getelementptr inbounds float* %tmp11335, i64 1
+  %tmp11337 = getelementptr inbounds float* %tmp11336, i64 1
+  %tmp11338 = getelementptr inbounds float* %tmp11337, i64 1
+  %tmp11339 = getelementptr inbounds float* %tmp11338, i64 1
+  %tmp11340 = getelementptr inbounds float* %tmp11339, i64 1
+  %tmp11341 = getelementptr inbounds float* %tmp11340, i64 1
+  %tmp11342 = getelementptr inbounds float* %tmp11341, i64 1
+  %tmp11343 = getelementptr inbounds float* %tmp11342, i64 1
+  %tmp11344 = getelementptr inbounds float* %tmp11343, i64 1
+  %tmp11345 = getelementptr inbounds float* %tmp11344, i64 1
+  %tmp11346 = getelementptr inbounds float* %tmp11345, i64 1
+  %tmp11347 = getelementptr inbounds float* %tmp11346, i64 1
+  %tmp11348 = getelementptr inbounds float* %tmp11347, i64 1
+  %tmp11349 = getelementptr inbounds float* %tmp11348, i64 1
+  %tmp11350 = getelementptr inbounds float* %tmp11349, i64 1
+  %tmp11351 = getelementptr inbounds float* %tmp11350, i64 1
+  %tmp11352 = getelementptr inbounds float* %tmp11351, i64 1
+  %tmp11353 = getelementptr inbounds float* %tmp11352, i64 1
+  %tmp11354 = getelementptr inbounds float* %tmp11353, i64 1
+  %tmp11355 = getelementptr inbounds float* %tmp11354, i64 1
+  %tmp11356 = getelementptr inbounds float* %tmp11355, i64 1
+  %tmp11357 = getelementptr inbounds float* %tmp11356, i64 1
+  %tmp11358 = getelementptr inbounds float* %tmp11357, i64 1
+  %tmp11359 = getelementptr inbounds float* %tmp11358, i64 1
+  %tmp11360 = getelementptr inbounds float* %tmp11359, i64 1
+  %tmp11361 = getelementptr inbounds float* %tmp11360, i64 1
+  %tmp11362 = getelementptr inbounds float* %tmp11361, i64 1
+  %tmp11363 = getelementptr inbounds float* %tmp11362, i64 1
+  %tmp11364 = getelementptr inbounds float* %tmp11363, i64 1
+  %tmp11365 = getelementptr inbounds float* %tmp11364, i64 1
+  %tmp11366 = getelementptr inbounds float* %tmp11365, i64 1
+  %tmp11367 = getelementptr inbounds float* %tmp11366, i64 1
+  %tmp11368 = getelementptr inbounds float* %tmp11367, i64 1
+  %tmp11369 = getelementptr inbounds float* %tmp11368, i64 1
+  %tmp11370 = getelementptr inbounds float* %tmp11369, i64 1
+  %tmp11371 = getelementptr inbounds float* %tmp11370, i64 1
+  %tmp11372 = getelementptr inbounds float* %tmp11371, i64 1
+  %tmp11373 = getelementptr inbounds float* %tmp11372, i64 1
+  %tmp11374 = getelementptr inbounds float* %tmp11373, i64 1
+  %tmp11375 = getelementptr inbounds float* %tmp11374, i64 1
+  %tmp11376 = getelementptr inbounds float* %tmp11375, i64 1
+  %tmp11377 = getelementptr inbounds float* %tmp11376, i64 1
+  %tmp11378 = getelementptr inbounds float* %tmp11377, i64 1
+  %tmp11379 = getelementptr inbounds float* %tmp11378, i64 1
+  %tmp11380 = getelementptr inbounds float* %tmp11379, i64 1
+  %tmp11381 = getelementptr inbounds float* %tmp11380, i64 1
+  %tmp11382 = getelementptr inbounds float* %tmp11381, i64 1
+  %tmp11383 = getelementptr inbounds float* %tmp11382, i64 1
+  %tmp11384 = getelementptr inbounds float* %tmp11383, i64 1
+  %tmp11385 = getelementptr inbounds float* %tmp11384, i64 1
+  %tmp11386 = getelementptr inbounds float* %tmp11385, i64 1
+  %tmp11387 = getelementptr inbounds float* %tmp11386, i64 1
+  %tmp11388 = getelementptr inbounds float* %tmp11387, i64 1
+  %tmp11389 = getelementptr inbounds float* %tmp11388, i64 1
+  %tmp11390 = getelementptr inbounds float* %tmp11389, i64 1
+  %tmp11391 = getelementptr inbounds float* %tmp11390, i64 1
+  %tmp11392 = getelementptr inbounds float* %tmp11391, i64 1
+  %tmp11393 = getelementptr inbounds float* %tmp11392, i64 1
+  %tmp11394 = getelementptr inbounds float* %tmp11393, i64 1
+  %tmp11395 = getelementptr inbounds float* %tmp11394, i64 1
+  %tmp11396 = getelementptr inbounds float* %tmp11395, i64 1
+  %tmp11397 = getelementptr inbounds float* %tmp11396, i64 1
+  %tmp11398 = getelementptr inbounds float* %tmp11397, i64 1
+  %tmp11399 = getelementptr inbounds float* %tmp11398, i64 1
+  %tmp11400 = getelementptr inbounds float* %tmp11399, i64 1
+  %tmp11401 = getelementptr inbounds float* %tmp11400, i64 1
+  %tmp11402 = getelementptr inbounds float* %tmp11401, i64 1
+  %tmp11403 = getelementptr inbounds float* %tmp11402, i64 1
+  %tmp11404 = getelementptr inbounds float* %tmp11403, i64 1
+  %tmp11405 = getelementptr inbounds float* %tmp11404, i64 1
+  %tmp11406 = getelementptr inbounds float* %tmp11405, i64 1
+  %tmp11407 = getelementptr inbounds float* %tmp11406, i64 1
+  %tmp11408 = getelementptr inbounds float* %tmp11407, i64 1
+  %tmp11409 = getelementptr inbounds float* %tmp11408, i64 1
+  %tmp11410 = getelementptr inbounds float* %tmp11409, i64 1
+  %tmp11411 = getelementptr inbounds float* %tmp11410, i64 1
+  %tmp11412 = getelementptr inbounds float* %tmp11411, i64 1
+  %tmp11413 = getelementptr inbounds float* %tmp11412, i64 1
+  %tmp11414 = getelementptr inbounds float* %tmp11413, i64 1
+  %tmp11415 = getelementptr inbounds float* %tmp11414, i64 1
+  %tmp11416 = getelementptr inbounds float* %tmp11415, i64 1
+  %tmp11417 = getelementptr inbounds float* %tmp11416, i64 1
+  %tmp11418 = getelementptr inbounds float* %tmp11417, i64 1
+  %tmp11419 = getelementptr inbounds float* %tmp11418, i64 1
+  %tmp11420 = getelementptr inbounds float* %tmp11419, i64 1
+  %tmp11421 = getelementptr inbounds float* %tmp11420, i64 1
+  %tmp11422 = getelementptr inbounds float* %tmp11421, i64 1
+  %tmp11423 = getelementptr inbounds float* %tmp11422, i64 1
+  %tmp11424 = getelementptr inbounds float* %tmp11423, i64 1
+  %tmp11425 = getelementptr inbounds float* %tmp11424, i64 1
+  %tmp11426 = getelementptr inbounds float* %tmp11425, i64 1
+  %tmp11427 = getelementptr inbounds float* %tmp11426, i64 1
+  %tmp11428 = getelementptr inbounds float* %tmp11427, i64 1
+  %tmp11429 = getelementptr inbounds float* %tmp11428, i64 1
+  %tmp11430 = getelementptr inbounds float* %tmp11429, i64 1
+  %tmp11431 = getelementptr inbounds float* %tmp11430, i64 1
+  %tmp11432 = getelementptr inbounds float* %tmp11431, i64 1
+  %tmp11433 = getelementptr inbounds float* %tmp11432, i64 1
+  %tmp11434 = getelementptr inbounds float* %tmp11433, i64 1
+  %tmp11435 = getelementptr inbounds float* %tmp11434, i64 1
+  %tmp11436 = getelementptr inbounds float* %tmp11435, i64 1
+  %tmp11437 = getelementptr inbounds float* %tmp11436, i64 1
+  %tmp11438 = getelementptr inbounds float* %tmp11437, i64 1
+  %tmp11439 = getelementptr inbounds float* %tmp11438, i64 1
+  %tmp11440 = getelementptr inbounds float* %tmp11439, i64 1
+  %tmp11441 = getelementptr inbounds float* %tmp11440, i64 1
+  %tmp11442 = getelementptr inbounds float* %tmp11441, i64 1
+  %tmp11443 = getelementptr inbounds float* %tmp11442, i64 1
+  %tmp11444 = getelementptr inbounds float* %tmp11443, i64 1
+  %tmp11445 = getelementptr inbounds float* %tmp11444, i64 1
+  %tmp11446 = getelementptr inbounds float* %tmp11445, i64 1
+  %tmp11447 = getelementptr inbounds float* %tmp11446, i64 1
+  %tmp11448 = getelementptr inbounds float* %tmp11447, i64 1
+  %tmp11449 = getelementptr inbounds float* %tmp11448, i64 1
+  %tmp11450 = getelementptr inbounds float* %tmp11449, i64 1
+  %tmp11451 = getelementptr inbounds float* %tmp11450, i64 1
+  %tmp11452 = getelementptr inbounds float* %tmp11451, i64 1
+  %tmp11453 = getelementptr inbounds float* %tmp11452, i64 1
+  %tmp11454 = getelementptr inbounds float* %tmp11453, i64 1
+  %tmp11455 = getelementptr inbounds float* %tmp11454, i64 1
+  %tmp11456 = getelementptr inbounds float* %tmp11455, i64 1
+  %tmp11457 = getelementptr inbounds float* %tmp11456, i64 1
+  %tmp11458 = getelementptr inbounds float* %tmp11457, i64 1
+  %tmp11459 = getelementptr inbounds float* %tmp11458, i64 1
+  %tmp11460 = getelementptr inbounds float* %tmp11459, i64 1
+  %tmp11461 = getelementptr inbounds float* %tmp11460, i64 1
+  %tmp11462 = getelementptr inbounds float* %tmp11461, i64 1
+  %tmp11463 = getelementptr inbounds float* %tmp11462, i64 1
+  %tmp11464 = getelementptr inbounds float* %tmp11463, i64 1
+  %tmp11465 = getelementptr inbounds float* %tmp11464, i64 1
+  %tmp11466 = getelementptr inbounds float* %tmp11465, i64 1
+  %tmp11467 = getelementptr inbounds float* %tmp11466, i64 1
+  %tmp11468 = getelementptr inbounds float* %tmp11467, i64 1
+  %tmp11469 = getelementptr inbounds float* %tmp11468, i64 1
+  %tmp11470 = getelementptr inbounds float* %tmp11469, i64 1
+  %tmp11471 = getelementptr inbounds float* %tmp11470, i64 1
+  %tmp11472 = getelementptr inbounds float* %tmp11471, i64 1
+  %tmp11473 = getelementptr inbounds float* %tmp11472, i64 1
+  %tmp11474 = getelementptr inbounds float* %tmp11473, i64 1
+  %tmp11475 = getelementptr inbounds float* %tmp11474, i64 1
+  %tmp11476 = getelementptr inbounds float* %tmp11475, i64 1
+  %tmp11477 = getelementptr inbounds float* %tmp11476, i64 1
+  %tmp11478 = getelementptr inbounds float* %tmp11477, i64 1
+  %tmp11479 = getelementptr inbounds float* %tmp11478, i64 1
+  %tmp11480 = getelementptr inbounds float* %tmp11479, i64 1
+  %tmp11481 = getelementptr inbounds float* %tmp11480, i64 1
+  %tmp11482 = getelementptr inbounds float* %tmp11481, i64 1
+  %tmp11483 = getelementptr inbounds float* %tmp11482, i64 1
+  %tmp11484 = getelementptr inbounds float* %tmp11483, i64 1
+  %tmp11485 = getelementptr inbounds float* %tmp11484, i64 1
+  %tmp11486 = getelementptr inbounds float* %tmp11485, i64 1
+  %tmp11487 = getelementptr inbounds float* %tmp11486, i64 1
+  %tmp11488 = getelementptr inbounds float* %tmp11487, i64 1
+  %tmp11489 = getelementptr inbounds float* %tmp11488, i64 1
+  %tmp11490 = getelementptr inbounds float* %tmp11489, i64 1
+  %tmp11491 = getelementptr inbounds float* %tmp11490, i64 1
+  %tmp11492 = getelementptr inbounds float* %tmp11491, i64 1
+  %tmp11493 = getelementptr inbounds float* %tmp11492, i64 1
+  %tmp11494 = getelementptr inbounds float* %tmp11493, i64 1
+  %tmp11495 = getelementptr inbounds float* %tmp11494, i64 1
+  %tmp11496 = getelementptr inbounds float* %tmp11495, i64 1
+  %tmp11497 = getelementptr inbounds float* %tmp11496, i64 1
+  %tmp11498 = getelementptr inbounds float* %tmp11497, i64 1
+  %tmp11499 = getelementptr inbounds float* %tmp11498, i64 1
+  %tmp11500 = getelementptr inbounds float* %tmp11499, i64 1
+  %tmp11501 = getelementptr inbounds float* %tmp11500, i64 1
+  %tmp11502 = getelementptr inbounds float* %tmp11501, i64 1
+  %tmp11503 = getelementptr inbounds float* %tmp11502, i64 1
+  %tmp11504 = getelementptr inbounds float* %tmp11503, i64 1
+  %tmp11505 = getelementptr inbounds float* %tmp11504, i64 1
+  %tmp11506 = getelementptr inbounds float* %tmp11505, i64 1
+  %tmp11507 = getelementptr inbounds float* %tmp11506, i64 1
+  %tmp11508 = getelementptr inbounds float* %tmp11507, i64 1
+  %tmp11509 = getelementptr inbounds float* %tmp11508, i64 1
+  %tmp11510 = getelementptr inbounds float* %tmp11509, i64 1
+  %tmp11511 = getelementptr inbounds float* %tmp11510, i64 1
+  %tmp11512 = getelementptr inbounds float* %tmp11511, i64 1
+  %tmp11513 = getelementptr inbounds float* %tmp11512, i64 1
+  %tmp11514 = getelementptr inbounds float* %tmp11513, i64 1
+  %tmp11515 = getelementptr inbounds float* %tmp11514, i64 1
+  %tmp11516 = getelementptr inbounds float* %tmp11515, i64 1
+  %tmp11517 = getelementptr inbounds float* %tmp11516, i64 1
+  %tmp11518 = getelementptr inbounds float* %tmp11517, i64 1
+  %tmp11519 = getelementptr inbounds float* %tmp11518, i64 1
+  %tmp11520 = getelementptr inbounds float* %tmp11519, i64 1
+  %tmp11521 = getelementptr inbounds float* %tmp11520, i64 1
+  %tmp11522 = getelementptr inbounds float* %tmp11521, i64 1
+  %tmp11523 = getelementptr inbounds float* %tmp11522, i64 1
+  %tmp11524 = getelementptr inbounds float* %tmp11523, i64 1
+  %tmp11525 = getelementptr inbounds float* %tmp11524, i64 1
+  %tmp11526 = getelementptr inbounds float* %tmp11525, i64 1
+  %tmp11527 = getelementptr inbounds float* %tmp11526, i64 1
+  %tmp11528 = getelementptr inbounds float* %tmp11527, i64 1
+  %tmp11529 = getelementptr inbounds float* %tmp11528, i64 1
+  %tmp11530 = getelementptr inbounds float* %tmp11529, i64 1
+  %tmp11531 = getelementptr inbounds float* %tmp11530, i64 1
+  %tmp11532 = getelementptr inbounds float* %tmp11531, i64 1
+  %tmp11533 = getelementptr inbounds float* %tmp11532, i64 1
+  %tmp11534 = getelementptr inbounds float* %tmp11533, i64 1
+  %tmp11535 = getelementptr inbounds float* %tmp11534, i64 1
+  %tmp11536 = getelementptr inbounds float* %tmp11535, i64 1
+  %tmp11537 = getelementptr inbounds float* %tmp11536, i64 1
+  %tmp11538 = getelementptr inbounds float* %tmp11537, i64 1
+  %tmp11539 = getelementptr inbounds float* %tmp11538, i64 1
+  %tmp11540 = getelementptr inbounds float* %tmp11539, i64 1
+  %tmp11541 = getelementptr inbounds float* %tmp11540, i64 1
+  %tmp11542 = getelementptr inbounds float* %tmp11541, i64 1
+  %tmp11543 = getelementptr inbounds float* %tmp11542, i64 1
+  %tmp11544 = getelementptr inbounds float* %tmp11543, i64 1
+  %tmp11545 = getelementptr inbounds float* %tmp11544, i64 1
+  %tmp11546 = getelementptr inbounds float* %tmp11545, i64 1
+  %tmp11547 = getelementptr inbounds float* %tmp11546, i64 1
+  %tmp11548 = getelementptr inbounds float* %tmp11547, i64 1
+  %tmp11549 = getelementptr inbounds float* %tmp11548, i64 1
+  %tmp11550 = getelementptr inbounds float* %tmp11549, i64 1
+  %tmp11551 = getelementptr inbounds float* %tmp11550, i64 1
+  %tmp11552 = getelementptr inbounds float* %tmp11551, i64 1
+  %tmp11553 = getelementptr inbounds float* %tmp11552, i64 1
+  %tmp11554 = getelementptr inbounds float* %tmp11553, i64 1
+  %tmp11555 = getelementptr inbounds float* %tmp11554, i64 1
+  %tmp11556 = getelementptr inbounds float* %tmp11555, i64 1
+  %tmp11557 = getelementptr inbounds float* %tmp11556, i64 1
+  %tmp11558 = getelementptr inbounds float* %tmp11557, i64 1
+  %tmp11559 = getelementptr inbounds float* %tmp11558, i64 1
+  %tmp11560 = getelementptr inbounds float* %tmp11559, i64 1
+  %tmp11561 = getelementptr inbounds float* %tmp11560, i64 1
+  %tmp11562 = getelementptr inbounds float* %tmp11561, i64 1
+  %tmp11563 = getelementptr inbounds float* %tmp11562, i64 1
+  %tmp11564 = getelementptr inbounds float* %tmp11563, i64 1
+  %tmp11565 = getelementptr inbounds float* %tmp11564, i64 1
+  %tmp11566 = getelementptr inbounds float* %tmp11565, i64 1
+  %tmp11567 = getelementptr inbounds float* %tmp11566, i64 1
+  %tmp11568 = getelementptr inbounds float* %tmp11567, i64 1
+  %tmp11569 = getelementptr inbounds float* %tmp11568, i64 1
+  %tmp11570 = getelementptr inbounds float* %tmp11569, i64 1
+  %tmp11571 = getelementptr inbounds float* %tmp11570, i64 1
+  %tmp11572 = getelementptr inbounds float* %tmp11571, i64 1
+  %tmp11573 = getelementptr inbounds float* %tmp11572, i64 1
+  %tmp11574 = getelementptr inbounds float* %tmp11573, i64 1
+  %tmp11575 = getelementptr inbounds float* %tmp11574, i64 1
+  %tmp11576 = getelementptr inbounds float* %tmp11575, i64 1
+  %tmp11577 = getelementptr inbounds float* %tmp11576, i64 1
+  %tmp11578 = getelementptr inbounds float* %tmp11577, i64 1
+  %tmp11579 = getelementptr inbounds float* %tmp11578, i64 1
+  %tmp11580 = getelementptr inbounds float* %tmp11579, i64 1
+  %tmp11581 = getelementptr inbounds float* %tmp11580, i64 1
+  %tmp11582 = getelementptr inbounds float* %tmp11581, i64 1
+  %tmp11583 = getelementptr inbounds float* %tmp11582, i64 1
+  %tmp11584 = getelementptr inbounds float* %tmp11583, i64 1
+  %tmp11585 = getelementptr inbounds float* %tmp11584, i64 1
+  %tmp11586 = getelementptr inbounds float* %tmp11585, i64 1
+  %tmp11587 = getelementptr inbounds float* %tmp11586, i64 1
+  %tmp11588 = getelementptr inbounds float* %tmp11587, i64 1
+  %tmp11589 = getelementptr inbounds float* %tmp11588, i64 1
+  %tmp11590 = getelementptr inbounds float* %tmp11589, i64 1
+  %tmp11591 = getelementptr inbounds float* %tmp11590, i64 1
+  %tmp11592 = getelementptr inbounds float* %tmp11591, i64 1
+  %tmp11593 = getelementptr inbounds float* %tmp11592, i64 1
+  %tmp11594 = getelementptr inbounds float* %tmp11593, i64 1
+  %tmp11595 = getelementptr inbounds float* %tmp11594, i64 1
+  %tmp11596 = getelementptr inbounds float* %tmp11595, i64 1
+  %tmp11597 = getelementptr inbounds float* %tmp11596, i64 1
+  %tmp11598 = getelementptr inbounds float* %tmp11597, i64 1
+  %tmp11599 = getelementptr inbounds float* %tmp11598, i64 1
+  %tmp11600 = getelementptr inbounds float* %tmp11599, i64 1
+  %tmp11601 = getelementptr inbounds float* %tmp11600, i64 1
+  %tmp11602 = getelementptr inbounds float* %tmp11601, i64 1
+  %tmp11603 = getelementptr inbounds float* %tmp11602, i64 1
+  %tmp11604 = getelementptr inbounds float* %tmp11603, i64 1
+  %tmp11605 = getelementptr inbounds float* %tmp11604, i64 1
+  %tmp11606 = getelementptr inbounds float* %tmp11605, i64 1
+  %tmp11607 = getelementptr inbounds float* %tmp11606, i64 1
+  %tmp11608 = getelementptr inbounds float* %tmp11607, i64 1
+  %tmp11609 = getelementptr inbounds float* %tmp11608, i64 1
+  %tmp11610 = getelementptr inbounds float* %tmp11609, i64 1
+  %tmp11611 = getelementptr inbounds float* %tmp11610, i64 1
+  %tmp11612 = getelementptr inbounds float* %tmp11611, i64 1
+  %tmp11613 = getelementptr inbounds float* %tmp11612, i64 1
+  %tmp11614 = getelementptr inbounds float* %tmp11613, i64 1
+  %tmp11615 = getelementptr inbounds float* %tmp11614, i64 1
+  %tmp11616 = getelementptr inbounds float* %tmp11615, i64 1
+  %tmp11617 = getelementptr inbounds float* %tmp11616, i64 1
+  %tmp11618 = getelementptr inbounds float* %tmp11617, i64 1
+  %tmp11619 = getelementptr inbounds float* %tmp11618, i64 1
+  %tmp11620 = getelementptr inbounds float* %tmp11619, i64 1
+  %tmp11621 = getelementptr inbounds float* %tmp11620, i64 1
+  %tmp11622 = getelementptr inbounds float* %tmp11621, i64 1
+  %tmp11623 = getelementptr inbounds float* %tmp11622, i64 1
+  %tmp11624 = getelementptr inbounds float* %tmp11623, i64 1
+  %tmp11625 = getelementptr inbounds float* %tmp11624, i64 1
+  %tmp11626 = getelementptr inbounds float* %tmp11625, i64 1
+  %tmp11627 = getelementptr inbounds float* %tmp11626, i64 1
+  %tmp11628 = getelementptr inbounds float* %tmp11627, i64 1
+  %tmp11629 = getelementptr inbounds float* %tmp11628, i64 1
+  %tmp11630 = getelementptr inbounds float* %tmp11629, i64 1
+  %tmp11631 = getelementptr inbounds float* %tmp11630, i64 1
+  %tmp11632 = getelementptr inbounds float* %tmp11631, i64 1
+  %tmp11633 = getelementptr inbounds float* %tmp11632, i64 1
+  %tmp11634 = getelementptr inbounds float* %tmp11633, i64 1
+  %tmp11635 = getelementptr inbounds float* %tmp11634, i64 1
+  %tmp11636 = getelementptr inbounds float* %tmp11635, i64 1
+  %tmp11637 = getelementptr inbounds float* %tmp11636, i64 1
+  %tmp11638 = getelementptr inbounds float* %tmp11637, i64 1
+  %tmp11639 = getelementptr inbounds float* %tmp11638, i64 1
+  %tmp11640 = getelementptr inbounds float* %tmp11639, i64 1
+  %tmp11641 = getelementptr inbounds float* %tmp11640, i64 1
+  %tmp11642 = getelementptr inbounds float* %tmp11641, i64 1
+  %tmp11643 = getelementptr inbounds float* %tmp11642, i64 1
+  %tmp11644 = getelementptr inbounds float* %tmp11643, i64 1
+  %tmp11645 = getelementptr inbounds float* %tmp11644, i64 1
+  %tmp11646 = getelementptr inbounds float* %tmp11645, i64 1
+  %tmp11647 = getelementptr inbounds float* %tmp11646, i64 1
+  %tmp11648 = getelementptr inbounds float* %tmp11647, i64 1
+  %tmp11649 = getelementptr inbounds float* %tmp11648, i64 1
+  %tmp11650 = getelementptr inbounds float* %tmp11649, i64 1
+  %tmp11651 = getelementptr inbounds float* %tmp11650, i64 1
+  %tmp11652 = getelementptr inbounds float* %tmp11651, i64 1
+  %tmp11653 = getelementptr inbounds float* %tmp11652, i64 1
+  %tmp11654 = getelementptr inbounds float* %tmp11653, i64 1
+  %tmp11655 = getelementptr inbounds float* %tmp11654, i64 1
+  %tmp11656 = getelementptr inbounds float* %tmp11655, i64 1
+  %tmp11657 = getelementptr inbounds float* %tmp11656, i64 1
+  %tmp11658 = getelementptr inbounds float* %tmp11657, i64 1
+  %tmp11659 = getelementptr inbounds float* %tmp11658, i64 1
+  %tmp11660 = getelementptr inbounds float* %tmp11659, i64 1
+  %tmp11661 = getelementptr inbounds float* %tmp11660, i64 1
+  %tmp11662 = getelementptr inbounds float* %tmp11661, i64 1
+  %tmp11663 = getelementptr inbounds float* %tmp11662, i64 1
+  %tmp11664 = getelementptr inbounds float* %tmp11663, i64 1
+  %tmp11665 = getelementptr inbounds float* %tmp11664, i64 1
+  %tmp11666 = getelementptr inbounds float* %tmp11665, i64 1
+  %tmp11667 = getelementptr inbounds float* %tmp11666, i64 1
+  %tmp11668 = getelementptr inbounds float* %tmp11667, i64 1
+  %tmp11669 = getelementptr inbounds float* %tmp11668, i64 1
+  %tmp11670 = getelementptr inbounds float* %tmp11669, i64 1
+  %tmp11671 = getelementptr inbounds float* %tmp11670, i64 1
+  %tmp11672 = getelementptr inbounds float* %tmp11671, i64 1
+  %tmp11673 = getelementptr inbounds float* %tmp11672, i64 1
+  %tmp11674 = getelementptr inbounds float* %tmp11673, i64 1
+  %tmp11675 = getelementptr inbounds float* %tmp11674, i64 1
+  %tmp11676 = getelementptr inbounds float* %tmp11675, i64 1
+  %tmp11677 = getelementptr inbounds float* %tmp11676, i64 1
+  %tmp11678 = getelementptr inbounds float* %tmp11677, i64 1
+  %tmp11679 = getelementptr inbounds float* %tmp11678, i64 1
+  %tmp11680 = getelementptr inbounds float* %tmp11679, i64 1
+  %tmp11681 = getelementptr inbounds float* %tmp11680, i64 1
+  %tmp11682 = getelementptr inbounds float* %tmp11681, i64 1
+  %tmp11683 = getelementptr inbounds float* %tmp11682, i64 1
+  %tmp11684 = getelementptr inbounds float* %tmp11683, i64 1
+  %tmp11685 = getelementptr inbounds float* %tmp11684, i64 1
+  %tmp11686 = getelementptr inbounds float* %tmp11685, i64 1
+  %tmp11687 = getelementptr inbounds float* %tmp11686, i64 1
+  %tmp11688 = getelementptr inbounds float* %tmp11687, i64 1
+  %tmp11689 = getelementptr inbounds float* %tmp11688, i64 1
+  %tmp11690 = getelementptr inbounds float* %tmp11689, i64 1
+  %tmp11691 = getelementptr inbounds float* %tmp11690, i64 1
+  %tmp11692 = getelementptr inbounds float* %tmp11691, i64 1
+  %tmp11693 = getelementptr inbounds float* %tmp11692, i64 1
+  %tmp11694 = getelementptr inbounds float* %tmp11693, i64 1
+  %tmp11695 = getelementptr inbounds float* %tmp11694, i64 1
+  %tmp11696 = getelementptr inbounds float* %tmp11695, i64 1
+  %tmp11697 = getelementptr inbounds float* %tmp11696, i64 1
+  %tmp11698 = getelementptr inbounds float* %tmp11697, i64 1
+  %tmp11699 = getelementptr inbounds float* %tmp11698, i64 1
+  %tmp11700 = getelementptr inbounds float* %tmp11699, i64 1
+  %tmp11701 = getelementptr inbounds float* %tmp11700, i64 1
+  %tmp11702 = getelementptr inbounds float* %tmp11701, i64 1
+  %tmp11703 = getelementptr inbounds float* %tmp11702, i64 1
+  %tmp11704 = getelementptr inbounds float* %tmp11703, i64 1
+  %tmp11705 = getelementptr inbounds float* %tmp11704, i64 1
+  %tmp11706 = getelementptr inbounds float* %tmp11705, i64 1
+  %tmp11707 = getelementptr inbounds float* %tmp11706, i64 1
+  %tmp11708 = getelementptr inbounds float* %tmp11707, i64 1
+  %tmp11709 = getelementptr inbounds float* %tmp11708, i64 1
+  %tmp11710 = getelementptr inbounds float* %tmp11709, i64 1
+  %tmp11711 = getelementptr inbounds float* %tmp11710, i64 1
+  %tmp11712 = getelementptr inbounds float* %tmp11711, i64 1
+  %tmp11713 = getelementptr inbounds float* %tmp11712, i64 1
+  %tmp11714 = getelementptr inbounds float* %tmp11713, i64 1
+  %tmp11715 = getelementptr inbounds float* %tmp11714, i64 1
+  %tmp11716 = getelementptr inbounds float* %tmp11715, i64 1
+  %tmp11717 = getelementptr inbounds float* %tmp11716, i64 1
+  %tmp11718 = getelementptr inbounds float* %tmp11717, i64 1
+  %tmp11719 = getelementptr inbounds float* %tmp11718, i64 1
+  %tmp11720 = getelementptr inbounds float* %tmp11719, i64 1
+  %tmp11721 = getelementptr inbounds float* %tmp11720, i64 1
+  %tmp11722 = getelementptr inbounds float* %tmp11721, i64 1
+  %tmp11723 = getelementptr inbounds float* %tmp11722, i64 1
+  %tmp11724 = getelementptr inbounds float* %tmp11723, i64 1
+  %tmp11725 = getelementptr inbounds float* %tmp11724, i64 1
+  %tmp11726 = getelementptr inbounds float* %tmp11725, i64 1
+  %tmp11727 = getelementptr inbounds float* %tmp11726, i64 1
+  %tmp11728 = getelementptr inbounds float* %tmp11727, i64 1
+  %tmp11729 = getelementptr inbounds float* %tmp11728, i64 1
+  %tmp11730 = getelementptr inbounds float* %tmp11729, i64 1
+  %tmp11731 = getelementptr inbounds float* %tmp11730, i64 1
+  %tmp11732 = getelementptr inbounds float* %tmp11731, i64 1
+  %tmp11733 = getelementptr inbounds float* %tmp11732, i64 1
+  %tmp11734 = getelementptr inbounds float* %tmp11733, i64 1
+  %tmp11735 = getelementptr inbounds float* %tmp11734, i64 1
+  %tmp11736 = getelementptr inbounds float* %tmp11735, i64 1
+  %tmp11737 = getelementptr inbounds float* %tmp11736, i64 1
+  %tmp11738 = getelementptr inbounds float* %tmp11737, i64 1
+  %tmp11739 = getelementptr inbounds float* %tmp11738, i64 1
+  %tmp11740 = getelementptr inbounds float* %tmp11739, i64 1
+  %tmp11741 = getelementptr inbounds float* %tmp11740, i64 1
+  %tmp11742 = getelementptr inbounds float* %tmp11741, i64 1
+  %tmp11743 = getelementptr inbounds float* %tmp11742, i64 1
+  %tmp11744 = getelementptr inbounds float* %tmp11743, i64 1
+  %tmp11745 = getelementptr inbounds float* %tmp11744, i64 1
+  %tmp11746 = getelementptr inbounds float* %tmp11745, i64 1
+  %tmp11747 = getelementptr inbounds float* %tmp11746, i64 1
+  %tmp11748 = getelementptr inbounds float* %tmp11747, i64 1
+  %tmp11749 = getelementptr inbounds float* %tmp11748, i64 1
+  %tmp11750 = getelementptr inbounds float* %tmp11749, i64 1
+  %tmp11751 = getelementptr inbounds float* %tmp11750, i64 1
+  %tmp11752 = getelementptr inbounds float* %tmp11751, i64 1
+  %tmp11753 = getelementptr inbounds float* %tmp11752, i64 1
+  %tmp11754 = getelementptr inbounds float* %tmp11753, i64 1
+  %tmp11755 = getelementptr inbounds float* %tmp11754, i64 1
+  %tmp11756 = getelementptr inbounds float* %tmp11755, i64 1
+  %tmp11757 = getelementptr inbounds float* %tmp11756, i64 1
+  %tmp11758 = getelementptr inbounds float* %tmp11757, i64 1
+  %tmp11759 = getelementptr inbounds float* %tmp11758, i64 1
+  %tmp11760 = getelementptr inbounds float* %tmp11759, i64 1
+  %tmp11761 = getelementptr inbounds float* %tmp11760, i64 1
+  %tmp11762 = getelementptr inbounds float* %tmp11761, i64 1
+  %tmp11763 = getelementptr inbounds float* %tmp11762, i64 1
+  %tmp11764 = getelementptr inbounds float* %tmp11763, i64 1
+  %tmp11765 = getelementptr inbounds float* %tmp11764, i64 1
+  %tmp11766 = getelementptr inbounds float* %tmp11765, i64 1
+  %tmp11767 = getelementptr inbounds float* %tmp11766, i64 1
+  %tmp11768 = getelementptr inbounds float* %tmp11767, i64 1
+  %tmp11769 = getelementptr inbounds float* %tmp11768, i64 1
+  %tmp11770 = getelementptr inbounds float* %tmp11769, i64 1
+  %tmp11771 = getelementptr inbounds float* %tmp11770, i64 1
+  %tmp11772 = getelementptr inbounds float* %tmp11771, i64 1
+  %tmp11773 = getelementptr inbounds float* %tmp11772, i64 1
+  %tmp11774 = getelementptr inbounds float* %tmp11773, i64 1
+  %tmp11775 = getelementptr inbounds float* %tmp11774, i64 1
+  %tmp11776 = getelementptr inbounds float* %tmp11775, i64 1
+  %tmp11777 = getelementptr inbounds float* %tmp11776, i64 1
+  %tmp11778 = getelementptr inbounds float* %tmp11777, i64 1
+  %tmp11779 = getelementptr inbounds float* %tmp11778, i64 1
+  %tmp11780 = getelementptr inbounds float* %tmp11779, i64 1
+  %tmp11781 = getelementptr inbounds float* %tmp11780, i64 1
+  %tmp11782 = getelementptr inbounds float* %tmp11781, i64 1
+  %tmp11783 = getelementptr inbounds float* %tmp11782, i64 1
+  %tmp11784 = getelementptr inbounds float* %tmp11783, i64 1
+  %tmp11785 = getelementptr inbounds float* %tmp11784, i64 1
+  %tmp11786 = getelementptr inbounds float* %tmp11785, i64 1
+  %tmp11787 = getelementptr inbounds float* %tmp11786, i64 1
+  %tmp11788 = getelementptr inbounds float* %tmp11787, i64 1
+  %tmp11789 = getelementptr inbounds float* %tmp11788, i64 1
+  %tmp11790 = getelementptr inbounds float* %tmp11789, i64 1
+  %tmp11791 = getelementptr inbounds float* %tmp11790, i64 1
+  %tmp11792 = getelementptr inbounds float* %tmp11791, i64 1
+  %tmp11793 = getelementptr inbounds float* %tmp11792, i64 1
+  %tmp11794 = getelementptr inbounds float* %tmp11793, i64 1
+  %tmp11795 = getelementptr inbounds float* %tmp11794, i64 1
+  %tmp11796 = getelementptr inbounds float* %tmp11795, i64 1
+  %tmp11797 = getelementptr inbounds float* %tmp11796, i64 1
+  %tmp11798 = getelementptr inbounds float* %tmp11797, i64 1
+  %tmp11799 = getelementptr inbounds float* %tmp11798, i64 1
+  %tmp11800 = getelementptr inbounds float* %tmp11799, i64 1
+  %tmp11801 = getelementptr inbounds float* %tmp11800, i64 1
+  %tmp11802 = getelementptr inbounds float* %tmp11801, i64 1
+  %tmp11803 = getelementptr inbounds float* %tmp11802, i64 1
+  %tmp11804 = getelementptr inbounds float* %tmp11803, i64 1
+  %tmp11805 = getelementptr inbounds float* %tmp11804, i64 1
+  %tmp11806 = getelementptr inbounds float* %tmp11805, i64 1
+  %tmp11807 = getelementptr inbounds float* %tmp11806, i64 1
+  %tmp11808 = getelementptr inbounds float* %tmp11807, i64 1
+  %tmp11809 = getelementptr inbounds float* %tmp11808, i64 1
+  %tmp11810 = getelementptr inbounds float* %tmp11809, i64 1
+  %tmp11811 = getelementptr inbounds float* %tmp11810, i64 1
+  %tmp11812 = getelementptr inbounds float* %tmp11811, i64 1
+  %tmp11813 = getelementptr inbounds float* %tmp11812, i64 1
+  %tmp11814 = getelementptr inbounds float* %tmp11813, i64 1
+  %tmp11815 = getelementptr inbounds float* %tmp11814, i64 1
+  %tmp11816 = getelementptr inbounds float* %tmp11815, i64 1
+  %tmp11817 = getelementptr inbounds float* %tmp11816, i64 1
+  %tmp11818 = getelementptr inbounds float* %tmp11817, i64 1
+  %tmp11819 = getelementptr inbounds float* %tmp11818, i64 1
+  %tmp11820 = getelementptr inbounds float* %tmp11819, i64 1
+  %tmp11821 = getelementptr inbounds float* %tmp11820, i64 1
+  %tmp11822 = getelementptr inbounds float* %tmp11821, i64 1
+  %tmp11823 = getelementptr inbounds float* %tmp11822, i64 1
+  %tmp11824 = getelementptr inbounds float* %tmp11823, i64 1
+  %tmp11825 = getelementptr inbounds float* %tmp11824, i64 1
+  %tmp11826 = getelementptr inbounds float* %tmp11825, i64 1
+  %tmp11827 = getelementptr inbounds float* %tmp11826, i64 1
+  %tmp11828 = getelementptr inbounds float* %tmp11827, i64 1
+  %tmp11829 = getelementptr inbounds float* %tmp11828, i64 1
+  %tmp11830 = getelementptr inbounds float* %tmp11829, i64 1
+  %tmp11831 = getelementptr inbounds float* %tmp11830, i64 1
+  %tmp11832 = getelementptr inbounds float* %tmp11831, i64 1
+  %tmp11833 = getelementptr inbounds float* %tmp11832, i64 1
+  %tmp11834 = getelementptr inbounds float* %tmp11833, i64 1
+  %tmp11835 = getelementptr inbounds float* %tmp11834, i64 1
+  %tmp11836 = getelementptr inbounds float* %tmp11835, i64 1
+  %tmp11837 = getelementptr inbounds float* %tmp11836, i64 1
+  %tmp11838 = getelementptr inbounds float* %tmp11837, i64 1
+  %tmp11839 = getelementptr inbounds float* %tmp11838, i64 1
+  %tmp11840 = getelementptr inbounds float* %tmp11839, i64 1
+  %tmp11841 = getelementptr inbounds float* %tmp11840, i64 1
+  %tmp11842 = getelementptr inbounds float* %tmp11841, i64 1
+  %tmp11843 = getelementptr inbounds float* %tmp11842, i64 1
+  %tmp11844 = getelementptr inbounds float* %tmp11843, i64 1
+  %tmp11845 = getelementptr inbounds float* %tmp11844, i64 1
+  %tmp11846 = getelementptr inbounds float* %tmp11845, i64 1
+  %tmp11847 = getelementptr inbounds float* %tmp11846, i64 1
+  %tmp11848 = getelementptr inbounds float* %tmp11847, i64 1
+  %tmp11849 = getelementptr inbounds float* %tmp11848, i64 1
+  %tmp11850 = getelementptr inbounds float* %tmp11849, i64 1
+  %tmp11851 = getelementptr inbounds float* %tmp11850, i64 1
+  %tmp11852 = getelementptr inbounds float* %tmp11851, i64 1
+  %tmp11853 = getelementptr inbounds float* %tmp11852, i64 1
+  %tmp11854 = getelementptr inbounds float* %tmp11853, i64 1
+  %tmp11855 = getelementptr inbounds float* %tmp11854, i64 1
+  %tmp11856 = getelementptr inbounds float* %tmp11855, i64 1
+  %tmp11857 = getelementptr inbounds float* %tmp11856, i64 1
+  %tmp11858 = getelementptr inbounds float* %tmp11857, i64 1
+  %tmp11859 = getelementptr inbounds float* %tmp11858, i64 1
+  %tmp11860 = getelementptr inbounds float* %tmp11859, i64 1
+  %tmp11861 = getelementptr inbounds float* %tmp11860, i64 1
+  %tmp11862 = getelementptr inbounds float* %tmp11861, i64 1
+  %tmp11863 = getelementptr inbounds float* %tmp11862, i64 1
+  %tmp11864 = getelementptr inbounds float* %tmp11863, i64 1
+  %tmp11865 = getelementptr inbounds float* %tmp11864, i64 1
+  %tmp11866 = getelementptr inbounds float* %tmp11865, i64 1
+  %tmp11867 = getelementptr inbounds float* %tmp11866, i64 1
+  %tmp11868 = getelementptr inbounds float* %tmp11867, i64 1
+  %tmp11869 = getelementptr inbounds float* %tmp11868, i64 1
+  %tmp11870 = getelementptr inbounds float* %tmp11869, i64 1
+  %tmp11871 = getelementptr inbounds float* %tmp11870, i64 1
+  %tmp11872 = getelementptr inbounds float* %tmp11871, i64 1
+  %tmp11873 = getelementptr inbounds float* %tmp11872, i64 1
+  %tmp11874 = getelementptr inbounds float* %tmp11873, i64 1
+  %tmp11875 = getelementptr inbounds float* %tmp11874, i64 1
+  %tmp11876 = getelementptr inbounds float* %tmp11875, i64 1
+  %tmp11877 = getelementptr inbounds float* %tmp11876, i64 1
+  %tmp11878 = getelementptr inbounds float* %tmp11877, i64 1
+  %tmp11879 = getelementptr inbounds float* %tmp11878, i64 1
+  %tmp11880 = getelementptr inbounds float* %tmp11879, i64 1
+  %tmp11881 = getelementptr inbounds float* %tmp11880, i64 1
+  %tmp11882 = getelementptr inbounds float* %tmp11881, i64 1
+  %tmp11883 = getelementptr inbounds float* %tmp11882, i64 1
+  %tmp11884 = getelementptr inbounds float* %tmp11883, i64 1
+  %tmp11885 = getelementptr inbounds float* %tmp11884, i64 1
+  %tmp11886 = getelementptr inbounds float* %tmp11885, i64 1
+  %tmp11887 = getelementptr inbounds float* %tmp11886, i64 1
+  %tmp11888 = getelementptr inbounds float* %tmp11887, i64 1
+  %tmp11889 = getelementptr inbounds float* %tmp11888, i64 1
+  %tmp11890 = getelementptr inbounds float* %tmp11889, i64 1
+  %tmp11891 = getelementptr inbounds float* %tmp11890, i64 1
+  %tmp11892 = getelementptr inbounds float* %tmp11891, i64 1
+  %tmp11893 = getelementptr inbounds float* %tmp11892, i64 1
+  %tmp11894 = getelementptr inbounds float* %tmp11893, i64 1
+  %tmp11895 = getelementptr inbounds float* %tmp11894, i64 1
+  %tmp11896 = getelementptr inbounds float* %tmp11895, i64 1
+  %tmp11897 = getelementptr inbounds float* %tmp11896, i64 1
+  %tmp11898 = getelementptr inbounds float* %tmp11897, i64 1
+  %tmp11899 = getelementptr inbounds float* %tmp11898, i64 1
+  %tmp11900 = getelementptr inbounds float* %tmp11899, i64 1
+  %tmp11901 = getelementptr inbounds float* %tmp11900, i64 1
+  %tmp11902 = getelementptr inbounds float* %tmp11901, i64 1
+  %tmp11903 = getelementptr inbounds float* %tmp11902, i64 1
+  %tmp11904 = getelementptr inbounds float* %tmp11903, i64 1
+  %tmp11905 = getelementptr inbounds float* %tmp11904, i64 1
+  %tmp11906 = getelementptr inbounds float* %tmp11905, i64 1
+  %tmp11907 = getelementptr inbounds float* %tmp11906, i64 1
+  %tmp11908 = getelementptr inbounds float* %tmp11907, i64 1
+  %tmp11909 = getelementptr inbounds float* %tmp11908, i64 1
+  %tmp11910 = getelementptr inbounds float* %tmp11909, i64 1
+  %tmp11911 = getelementptr inbounds float* %tmp11910, i64 1
+  %tmp11912 = getelementptr inbounds float* %tmp11911, i64 1
+  %tmp11913 = getelementptr inbounds float* %tmp11912, i64 1
+  %tmp11914 = getelementptr inbounds float* %tmp11913, i64 1
+  %tmp11915 = getelementptr inbounds float* %tmp11914, i64 1
+  %tmp11916 = getelementptr inbounds float* %tmp11915, i64 1
+  %tmp11917 = getelementptr inbounds float* %tmp11916, i64 1
+  %tmp11918 = getelementptr inbounds float* %tmp11917, i64 1
+  %tmp11919 = getelementptr inbounds float* %tmp11918, i64 1
+  %tmp11920 = getelementptr inbounds float* %tmp11919, i64 1
+  %tmp11921 = getelementptr inbounds float* %tmp11920, i64 1
+  %tmp11922 = getelementptr inbounds float* %tmp11921, i64 1
+  %tmp11923 = getelementptr inbounds float* %tmp11922, i64 1
+  %tmp11924 = getelementptr inbounds float* %tmp11923, i64 1
+  %tmp11925 = getelementptr inbounds float* %tmp11924, i64 1
+  %tmp11926 = getelementptr inbounds float* %tmp11925, i64 1
+  %tmp11927 = getelementptr inbounds float* %tmp11926, i64 1
+  %tmp11928 = getelementptr inbounds float* %tmp11927, i64 1
+  %tmp11929 = getelementptr inbounds float* %tmp11928, i64 1
+  %tmp11930 = getelementptr inbounds float* %tmp11929, i64 1
+  %tmp11931 = getelementptr inbounds float* %tmp11930, i64 1
+  %tmp11932 = getelementptr inbounds float* %tmp11931, i64 1
+  %tmp11933 = getelementptr inbounds float* %tmp11932, i64 1
+  %tmp11934 = getelementptr inbounds float* %tmp11933, i64 1
+  %tmp11935 = getelementptr inbounds float* %tmp11934, i64 1
+  %tmp11936 = getelementptr inbounds float* %tmp11935, i64 1
+  %tmp11937 = getelementptr inbounds float* %tmp11936, i64 1
+  %tmp11938 = getelementptr inbounds float* %tmp11937, i64 1
+  %tmp11939 = getelementptr inbounds float* %tmp11938, i64 1
+  %tmp11940 = getelementptr inbounds float* %tmp11939, i64 1
+  %tmp11941 = getelementptr inbounds float* %tmp11940, i64 1
+  %tmp11942 = getelementptr inbounds float* %tmp11941, i64 1
+  %tmp11943 = getelementptr inbounds float* %tmp11942, i64 1
+  %tmp11944 = getelementptr inbounds float* %tmp11943, i64 1
+  %tmp11945 = getelementptr inbounds float* %tmp11944, i64 1
+  %tmp11946 = getelementptr inbounds float* %tmp11945, i64 1
+  %tmp11947 = getelementptr inbounds float* %tmp11946, i64 1
+  %tmp11948 = getelementptr inbounds float* %tmp11947, i64 1
+  %tmp11949 = getelementptr inbounds float* %tmp11948, i64 1
+  %tmp11950 = getelementptr inbounds float* %tmp11949, i64 1
+  %tmp11951 = getelementptr inbounds float* %tmp11950, i64 1
+  %tmp11952 = getelementptr inbounds float* %tmp11951, i64 1
+  %tmp11953 = getelementptr inbounds float* %tmp11952, i64 1
+  %tmp11954 = getelementptr inbounds float* %tmp11953, i64 1
+  %tmp11955 = getelementptr inbounds float* %tmp11954, i64 1
+  %tmp11956 = getelementptr inbounds float* %tmp11955, i64 1
+  %tmp11957 = getelementptr inbounds float* %tmp11956, i64 1
+  %tmp11958 = getelementptr inbounds float* %tmp11957, i64 1
+  %tmp11959 = getelementptr inbounds float* %tmp11958, i64 1
+  %tmp11960 = getelementptr inbounds float* %tmp11959, i64 1
+  %tmp11961 = getelementptr inbounds float* %tmp11960, i64 1
+  %tmp11962 = getelementptr inbounds float* %tmp11961, i64 1
+  %tmp11963 = getelementptr inbounds float* %tmp11962, i64 1
+  %tmp11964 = getelementptr inbounds float* %tmp11963, i64 1
+  %tmp11965 = getelementptr inbounds float* %tmp11964, i64 1
+  %tmp11966 = getelementptr inbounds float* %tmp11965, i64 1
+  %tmp11967 = getelementptr inbounds float* %tmp11966, i64 1
+  %tmp11968 = getelementptr inbounds float* %tmp11967, i64 1
+  %tmp11969 = getelementptr inbounds float* %tmp11968, i64 1
+  %tmp11970 = getelementptr inbounds float* %tmp11969, i64 1
+  %tmp11971 = getelementptr inbounds float* %tmp11970, i64 1
+  %tmp11972 = getelementptr inbounds float* %tmp11971, i64 1
+  %tmp11973 = getelementptr inbounds float* %tmp11972, i64 1
+  %tmp11974 = getelementptr inbounds float* %tmp11973, i64 1
+  %tmp11975 = getelementptr inbounds float* %tmp11974, i64 1
+  %tmp11976 = getelementptr inbounds float* %tmp11975, i64 1
+  %tmp11977 = getelementptr inbounds float* %tmp11976, i64 1
+  %tmp11978 = getelementptr inbounds float* %tmp11977, i64 1
+  %tmp11979 = getelementptr inbounds float* %tmp11978, i64 1
+  %tmp11980 = getelementptr inbounds float* %tmp11979, i64 1
+  %tmp11981 = getelementptr inbounds float* %tmp11980, i64 1
+  %tmp11982 = getelementptr inbounds float* %tmp11981, i64 1
+  %tmp11983 = getelementptr inbounds float* %tmp11982, i64 1
+  %tmp11984 = getelementptr inbounds float* %tmp11983, i64 1
+  %tmp11985 = getelementptr inbounds float* %tmp11984, i64 1
+  %tmp11986 = getelementptr inbounds float* %tmp11985, i64 1
+  %tmp11987 = getelementptr inbounds float* %tmp11986, i64 1
+  %tmp11988 = getelementptr inbounds float* %tmp11987, i64 1
+  %tmp11989 = getelementptr inbounds float* %tmp11988, i64 1
+  %tmp11990 = getelementptr inbounds float* %tmp11989, i64 1
+  %tmp11991 = getelementptr inbounds float* %tmp11990, i64 1
+  %tmp11992 = getelementptr inbounds float* %tmp11991, i64 1
+  %tmp11993 = getelementptr inbounds float* %tmp11992, i64 1
+  %tmp11994 = getelementptr inbounds float* %tmp11993, i64 1
+  %tmp11995 = getelementptr inbounds float* %tmp11994, i64 1
+  %tmp11996 = getelementptr inbounds float* %tmp11995, i64 1
+  %tmp11997 = getelementptr inbounds float* %tmp11996, i64 1
+  %tmp11998 = getelementptr inbounds float* %tmp11997, i64 1
+  %tmp11999 = getelementptr inbounds float* %tmp11998, i64 1
+  %tmp12000 = getelementptr inbounds float* %tmp11999, i64 1
+  %tmp12001 = getelementptr inbounds float* %tmp12000, i64 1
+  %tmp12002 = getelementptr inbounds float* %tmp12001, i64 1
+  %tmp12003 = getelementptr inbounds float* %tmp12002, i64 1
+  %tmp12004 = getelementptr inbounds float* %tmp12003, i64 1
+  %tmp12005 = getelementptr inbounds float* %tmp12004, i64 1
+  %tmp12006 = getelementptr inbounds float* %tmp12005, i64 1
+  %tmp12007 = getelementptr inbounds float* %tmp12006, i64 1
+  %tmp12008 = getelementptr inbounds float* %tmp12007, i64 1
+  %tmp12009 = getelementptr inbounds float* %tmp12008, i64 1
+  %tmp12010 = getelementptr inbounds float* %tmp12009, i64 1
+  %tmp12011 = getelementptr inbounds float* %tmp12010, i64 1
+  %tmp12012 = getelementptr inbounds float* %tmp12011, i64 1
+  %tmp12013 = getelementptr inbounds float* %tmp12012, i64 1
+  %tmp12014 = getelementptr inbounds float* %tmp12013, i64 1
+  %tmp12015 = getelementptr inbounds float* %tmp12014, i64 1
+  %tmp12016 = getelementptr inbounds float* %tmp12015, i64 1
+  %tmp12017 = getelementptr inbounds float* %tmp12016, i64 1
+  %tmp12018 = getelementptr inbounds float* %tmp12017, i64 1
+  %tmp12019 = getelementptr inbounds float* %tmp12018, i64 1
+  %tmp12020 = getelementptr inbounds float* %tmp12019, i64 1
+  %tmp12021 = getelementptr inbounds float* %tmp12020, i64 1
+  %tmp12022 = getelementptr inbounds float* %tmp12021, i64 1
+  %tmp12023 = getelementptr inbounds float* %tmp12022, i64 1
+  %tmp12024 = getelementptr inbounds float* %tmp12023, i64 1
+  %tmp12025 = getelementptr inbounds float* %tmp12024, i64 1
+  %tmp12026 = getelementptr inbounds float* %tmp12025, i64 1
+  %tmp12027 = getelementptr inbounds float* %tmp12026, i64 1
+  %tmp12028 = getelementptr inbounds float* %tmp12027, i64 1
+  %tmp12029 = getelementptr inbounds float* %tmp12028, i64 1
+  %tmp12030 = getelementptr inbounds float* %tmp12029, i64 1
+  %tmp12031 = getelementptr inbounds float* %tmp12030, i64 1
+  %tmp12032 = getelementptr inbounds float* %tmp12031, i64 1
+  %tmp12033 = getelementptr inbounds float* %tmp12032, i64 1
+  %tmp12034 = getelementptr inbounds float* %tmp12033, i64 1
+  %tmp12035 = getelementptr inbounds float* %tmp12034, i64 1
+  %tmp12036 = getelementptr inbounds float* %tmp12035, i64 1
+  %tmp12037 = getelementptr inbounds float* %tmp12036, i64 1
+  %tmp12038 = getelementptr inbounds float* %tmp12037, i64 1
+  %tmp12039 = getelementptr inbounds float* %tmp12038, i64 1
+  %tmp12040 = getelementptr inbounds float* %tmp12039, i64 1
+  %tmp12041 = getelementptr inbounds float* %tmp12040, i64 1
+  %tmp12042 = getelementptr inbounds float* %tmp12041, i64 1
+  %tmp12043 = getelementptr inbounds float* %tmp12042, i64 1
+  %tmp12044 = getelementptr inbounds float* %tmp12043, i64 1
+  %tmp12045 = getelementptr inbounds float* %tmp12044, i64 1
+  %tmp12046 = getelementptr inbounds float* %tmp12045, i64 1
+  %tmp12047 = getelementptr inbounds float* %tmp12046, i64 1
+  %tmp12048 = getelementptr inbounds float* %tmp12047, i64 1
+  %tmp12049 = getelementptr inbounds float* %tmp12048, i64 1
+  %tmp12050 = getelementptr inbounds float* %tmp12049, i64 1
+  %tmp12051 = getelementptr inbounds float* %tmp12050, i64 1
+  %tmp12052 = getelementptr inbounds float* %tmp12051, i64 1
+  %tmp12053 = getelementptr inbounds float* %tmp12052, i64 1
+  %tmp12054 = getelementptr inbounds float* %tmp12053, i64 1
+  %tmp12055 = getelementptr inbounds float* %tmp12054, i64 1
+  %tmp12056 = getelementptr inbounds float* %tmp12055, i64 1
+  %tmp12057 = getelementptr inbounds float* %tmp12056, i64 1
+  %tmp12058 = getelementptr inbounds float* %tmp12057, i64 1
+  %tmp12059 = getelementptr inbounds float* %tmp12058, i64 1
+  %tmp12060 = getelementptr inbounds float* %tmp12059, i64 1
+  %tmp12061 = getelementptr inbounds float* %tmp12060, i64 1
+  %tmp12062 = getelementptr inbounds float* %tmp12061, i64 1
+  %tmp12063 = getelementptr inbounds float* %tmp12062, i64 1
+  %tmp12064 = getelementptr inbounds float* %tmp12063, i64 1
+  %tmp12065 = getelementptr inbounds float* %tmp12064, i64 1
+  %tmp12066 = getelementptr inbounds float* %tmp12065, i64 1
+  %tmp12067 = getelementptr inbounds float* %tmp12066, i64 1
+  %tmp12068 = getelementptr inbounds float* %tmp12067, i64 1
+  %tmp12069 = getelementptr inbounds float* %tmp12068, i64 1
+  %tmp12070 = getelementptr inbounds float* %tmp12069, i64 1
+  %tmp12071 = getelementptr inbounds float* %tmp12070, i64 1
+  %tmp12072 = getelementptr inbounds float* %tmp12071, i64 1
+  %tmp12073 = getelementptr inbounds float* %tmp12072, i64 1
+  %tmp12074 = getelementptr inbounds float* %tmp12073, i64 1
+  %tmp12075 = getelementptr inbounds float* %tmp12074, i64 1
+  %tmp12076 = getelementptr inbounds float* %tmp12075, i64 1
+  %tmp12077 = getelementptr inbounds float* %tmp12076, i64 1
+  %tmp12078 = getelementptr inbounds float* %tmp12077, i64 1
+  %tmp12079 = getelementptr inbounds float* %tmp12078, i64 1
+  %tmp12080 = getelementptr inbounds float* %tmp12079, i64 1
+  %tmp12081 = getelementptr inbounds float* %tmp12080, i64 1
+  %tmp12082 = getelementptr inbounds float* %tmp12081, i64 1
+  %tmp12083 = getelementptr inbounds float* %tmp12082, i64 1
+  %tmp12084 = getelementptr inbounds float* %tmp12083, i64 1
+  %tmp12085 = getelementptr inbounds float* %tmp12084, i64 1
+  %tmp12086 = getelementptr inbounds float* %tmp12085, i64 1
+  %tmp12087 = getelementptr inbounds float* %tmp12086, i64 1
+  %tmp12088 = getelementptr inbounds float* %tmp12087, i64 1
+  %tmp12089 = getelementptr inbounds float* %tmp12088, i64 1
+  %tmp12090 = getelementptr inbounds float* %tmp12089, i64 1
+  %tmp12091 = getelementptr inbounds float* %tmp12090, i64 1
+  %tmp12092 = getelementptr inbounds float* %tmp12091, i64 1
+  %tmp12093 = getelementptr inbounds float* %tmp12092, i64 1
+  %tmp12094 = getelementptr inbounds float* %tmp12093, i64 1
+  %tmp12095 = getelementptr inbounds float* %tmp12094, i64 1
+  %tmp12096 = getelementptr inbounds float* %tmp12095, i64 1
+  %tmp12097 = getelementptr inbounds float* %tmp12096, i64 1
+  %tmp12098 = getelementptr inbounds float* %tmp12097, i64 1
+  %tmp12099 = getelementptr inbounds float* %tmp12098, i64 1
+  %tmp12100 = getelementptr inbounds float* %tmp12099, i64 1
+  %tmp12101 = getelementptr inbounds float* %tmp12100, i64 1
+  %tmp12102 = getelementptr inbounds float* %tmp12101, i64 1
+  %tmp12103 = getelementptr inbounds float* %tmp12102, i64 1
+  %tmp12104 = getelementptr inbounds float* %tmp12103, i64 1
+  %tmp12105 = getelementptr inbounds float* %tmp12104, i64 1
+  %tmp12106 = getelementptr inbounds float* %tmp12105, i64 1
+  %tmp12107 = getelementptr inbounds float* %tmp12106, i64 1
+  %tmp12108 = getelementptr inbounds float* %tmp12107, i64 1
+  %tmp12109 = getelementptr inbounds float* %tmp12108, i64 1
+  %tmp12110 = getelementptr inbounds float* %tmp12109, i64 1
+  %tmp12111 = getelementptr inbounds float* %tmp12110, i64 1
+  %tmp12112 = getelementptr inbounds float* %tmp12111, i64 1
+  %tmp12113 = getelementptr inbounds float* %tmp12112, i64 1
+  %tmp12114 = getelementptr inbounds float* %tmp12113, i64 1
+  %tmp12115 = getelementptr inbounds float* %tmp12114, i64 1
+  %tmp12116 = getelementptr inbounds float* %tmp12115, i64 1
+  %tmp12117 = getelementptr inbounds float* %tmp12116, i64 1
+  %tmp12118 = getelementptr inbounds float* %tmp12117, i64 1
+  %tmp12119 = getelementptr inbounds float* %tmp12118, i64 1
+  %tmp12120 = getelementptr inbounds float* %tmp12119, i64 1
+  %tmp12121 = getelementptr inbounds float* %tmp12120, i64 1
+  %tmp12122 = getelementptr inbounds float* %tmp12121, i64 1
+  %tmp12123 = getelementptr inbounds float* %tmp12122, i64 1
+  %tmp12124 = getelementptr inbounds float* %tmp12123, i64 1
+  %tmp12125 = getelementptr inbounds float* %tmp12124, i64 1
+  %tmp12126 = getelementptr inbounds float* %tmp12125, i64 1
+  %tmp12127 = getelementptr inbounds float* %tmp12126, i64 1
+  %tmp12128 = getelementptr inbounds float* %tmp12127, i64 1
+  %tmp12129 = getelementptr inbounds float* %tmp12128, i64 1
+  %tmp12130 = getelementptr inbounds float* %tmp12129, i64 1
+  %tmp12131 = getelementptr inbounds float* %tmp12130, i64 1
+  %tmp12132 = getelementptr inbounds float* %tmp12131, i64 1
+  %tmp12133 = getelementptr inbounds float* %tmp12132, i64 1
+  %tmp12134 = getelementptr inbounds float* %tmp12133, i64 1
+  %tmp12135 = getelementptr inbounds float* %tmp12134, i64 1
+  %tmp12136 = getelementptr inbounds float* %tmp12135, i64 1
+  %tmp12137 = getelementptr inbounds float* %tmp12136, i64 1
+  %tmp12138 = getelementptr inbounds float* %tmp12137, i64 1
+  %tmp12139 = getelementptr inbounds float* %tmp12138, i64 1
+  %tmp12140 = getelementptr inbounds float* %tmp12139, i64 1
+  %tmp12141 = getelementptr inbounds float* %tmp12140, i64 1
+  %tmp12142 = getelementptr inbounds float* %tmp12141, i64 1
+  %tmp12143 = getelementptr inbounds float* %tmp12142, i64 1
+  %tmp12144 = getelementptr inbounds float* %tmp12143, i64 1
+  %tmp12145 = getelementptr inbounds float* %tmp12144, i64 1
+  %tmp12146 = getelementptr inbounds float* %tmp12145, i64 1
+  %tmp12147 = getelementptr inbounds float* %tmp12146, i64 1
+  %tmp12148 = getelementptr inbounds float* %tmp12147, i64 1
+  %tmp12149 = getelementptr inbounds float* %tmp12148, i64 1
+  %tmp12150 = getelementptr inbounds float* %tmp12149, i64 1
+  %tmp12151 = getelementptr inbounds float* %tmp12150, i64 1
+  %tmp12152 = getelementptr inbounds float* %tmp12151, i64 1
+  %tmp12153 = getelementptr inbounds float* %tmp12152, i64 1
+  %tmp12154 = getelementptr inbounds float* %tmp12153, i64 1
+  %tmp12155 = getelementptr inbounds float* %tmp12154, i64 1
+  %tmp12156 = getelementptr inbounds float* %tmp12155, i64 1
+  %tmp12157 = getelementptr inbounds float* %tmp12156, i64 1
+  %tmp12158 = getelementptr inbounds float* %tmp12157, i64 1
+  %tmp12159 = getelementptr inbounds float* %tmp12158, i64 1
+  %tmp12160 = getelementptr inbounds float* %tmp12159, i64 1
+  %tmp12161 = getelementptr inbounds float* %tmp12160, i64 1
+  %tmp12162 = getelementptr inbounds float* %tmp12161, i64 1
+  %tmp12163 = getelementptr inbounds float* %tmp12162, i64 1
+  %tmp12164 = getelementptr inbounds float* %tmp12163, i64 1
+  %tmp12165 = getelementptr inbounds float* %tmp12164, i64 1
+  %tmp12166 = getelementptr inbounds float* %tmp12165, i64 1
+  %tmp12167 = getelementptr inbounds float* %tmp12166, i64 1
+  %tmp12168 = getelementptr inbounds float* %tmp12167, i64 1
+  %tmp12169 = getelementptr inbounds float* %tmp12168, i64 1
+  %tmp12170 = getelementptr inbounds float* %tmp12169, i64 1
+  %tmp12171 = getelementptr inbounds float* %tmp12170, i64 1
+  %tmp12172 = getelementptr inbounds float* %tmp12171, i64 1
+  %tmp12173 = getelementptr inbounds float* %tmp12172, i64 1
+  %tmp12174 = getelementptr inbounds float* %tmp12173, i64 1
+  %tmp12175 = getelementptr inbounds float* %tmp12174, i64 1
+  %tmp12176 = getelementptr inbounds float* %tmp12175, i64 1
+  %tmp12177 = getelementptr inbounds float* %tmp12176, i64 1
+  %tmp12178 = getelementptr inbounds float* %tmp12177, i64 1
+  %tmp12179 = getelementptr inbounds float* %tmp12178, i64 1
+  %tmp12180 = getelementptr inbounds float* %tmp12179, i64 1
+  %tmp12181 = getelementptr inbounds float* %tmp12180, i64 1
+  %tmp12182 = getelementptr inbounds float* %tmp12181, i64 1
+  %tmp12183 = getelementptr inbounds float* %tmp12182, i64 1
+  %tmp12184 = getelementptr inbounds float* %tmp12183, i64 1
+  %tmp12185 = getelementptr inbounds float* %tmp12184, i64 1
+  %tmp12186 = getelementptr inbounds float* %tmp12185, i64 1
+  %tmp12187 = getelementptr inbounds float* %tmp12186, i64 1
+  %tmp12188 = getelementptr inbounds float* %tmp12187, i64 1
+  %tmp12189 = getelementptr inbounds float* %tmp12188, i64 1
+  %tmp12190 = getelementptr inbounds float* %tmp12189, i64 1
+  %tmp12191 = getelementptr inbounds float* %tmp12190, i64 1
+  %tmp12192 = getelementptr inbounds float* %tmp12191, i64 1
+  %tmp12193 = getelementptr inbounds float* %tmp12192, i64 1
+  %tmp12194 = getelementptr inbounds float* %tmp12193, i64 1
+  %tmp12195 = getelementptr inbounds float* %tmp12194, i64 1
+  %tmp12196 = getelementptr inbounds float* %tmp12195, i64 1
+  %tmp12197 = getelementptr inbounds float* %tmp12196, i64 1
+  %tmp12198 = getelementptr inbounds float* %tmp12197, i64 1
+  %tmp12199 = getelementptr inbounds float* %tmp12198, i64 1
+  %tmp12200 = getelementptr inbounds float* %tmp12199, i64 1
+  %tmp12201 = getelementptr inbounds float* %tmp12200, i64 1
+  %tmp12202 = getelementptr inbounds float* %tmp12201, i64 1
+  %tmp12203 = getelementptr inbounds float* %tmp12202, i64 1
+  %tmp12204 = getelementptr inbounds float* %tmp12203, i64 1
+  %tmp12205 = getelementptr inbounds float* %tmp12204, i64 1
+  %tmp12206 = getelementptr inbounds float* %tmp12205, i64 1
+  %tmp12207 = getelementptr inbounds float* %tmp12206, i64 1
+  %tmp12208 = getelementptr inbounds float* %tmp12207, i64 1
+  %tmp12209 = getelementptr inbounds float* %tmp12208, i64 1
+  %tmp12210 = getelementptr inbounds float* %tmp12209, i64 1
+  %tmp12211 = getelementptr inbounds float* %tmp12210, i64 1
+  %tmp12212 = getelementptr inbounds float* %tmp12211, i64 1
+  %tmp12213 = getelementptr inbounds float* %tmp12212, i64 1
+  %tmp12214 = getelementptr inbounds float* %tmp12213, i64 1
+  %tmp12215 = getelementptr inbounds float* %tmp12214, i64 1
+  %tmp12216 = getelementptr inbounds float* %tmp12215, i64 1
+  %tmp12217 = getelementptr inbounds float* %tmp12216, i64 1
+  %tmp12218 = getelementptr inbounds float* %tmp12217, i64 1
+  %tmp12219 = getelementptr inbounds float* %tmp12218, i64 1
+  %tmp12220 = getelementptr inbounds float* %tmp12219, i64 1
+  %tmp12221 = getelementptr inbounds float* %tmp12220, i64 1
+  %tmp12222 = getelementptr inbounds float* %tmp12221, i64 1
+  %tmp12223 = getelementptr inbounds float* %tmp12222, i64 1
+  %tmp12224 = getelementptr inbounds float* %tmp12223, i64 1
+  %tmp12225 = getelementptr inbounds float* %tmp12224, i64 1
+  %tmp12226 = getelementptr inbounds float* %tmp12225, i64 1
+  %tmp12227 = getelementptr inbounds float* %tmp12226, i64 1
+  %tmp12228 = getelementptr inbounds float* %tmp12227, i64 1
+  %tmp12229 = getelementptr inbounds float* %tmp12228, i64 1
+  %tmp12230 = getelementptr inbounds float* %tmp12229, i64 1
+  %tmp12231 = getelementptr inbounds float* %tmp12230, i64 1
+  %tmp12232 = getelementptr inbounds float* %tmp12231, i64 1
+  %tmp12233 = getelementptr inbounds float* %tmp12232, i64 1
+  %tmp12234 = getelementptr inbounds float* %tmp12233, i64 1
+  %tmp12235 = getelementptr inbounds float* %tmp12234, i64 1
+  %tmp12236 = getelementptr inbounds float* %tmp12235, i64 1
+  %tmp12237 = getelementptr inbounds float* %tmp12236, i64 1
+  %tmp12238 = getelementptr inbounds float* %tmp12237, i64 1
+  %tmp12239 = getelementptr inbounds float* %tmp12238, i64 1
+  %tmp12240 = getelementptr inbounds float* %tmp12239, i64 1
+  %tmp12241 = getelementptr inbounds float* %tmp12240, i64 1
+  %tmp12242 = getelementptr inbounds float* %tmp12241, i64 1
+  %tmp12243 = getelementptr inbounds float* %tmp12242, i64 1
+  %tmp12244 = getelementptr inbounds float* %tmp12243, i64 1
+  %tmp12245 = getelementptr inbounds float* %tmp12244, i64 1
+  %tmp12246 = getelementptr inbounds float* %tmp12245, i64 1
+  %tmp12247 = getelementptr inbounds float* %tmp12246, i64 1
+  %tmp12248 = getelementptr inbounds float* %tmp12247, i64 1
+  %tmp12249 = getelementptr inbounds float* %tmp12248, i64 1
+  %tmp12250 = getelementptr inbounds float* %tmp12249, i64 1
+  %tmp12251 = getelementptr inbounds float* %tmp12250, i64 1
+  %tmp12252 = getelementptr inbounds float* %tmp12251, i64 1
+  %tmp12253 = getelementptr inbounds float* %tmp12252, i64 1
+  %tmp12254 = getelementptr inbounds float* %tmp12253, i64 1
+  %tmp12255 = getelementptr inbounds float* %tmp12254, i64 1
+  %tmp12256 = getelementptr inbounds float* %tmp12255, i64 1
+  %tmp12257 = getelementptr inbounds float* %tmp12256, i64 1
+  %tmp12258 = getelementptr inbounds float* %tmp12257, i64 1
+  %tmp12259 = getelementptr inbounds float* %tmp12258, i64 1
+  %tmp12260 = getelementptr inbounds float* %tmp12259, i64 1
+  %tmp12261 = getelementptr inbounds float* %tmp12260, i64 1
+  %tmp12262 = getelementptr inbounds float* %tmp12261, i64 1
+  %tmp12263 = getelementptr inbounds float* %tmp12262, i64 1
+  %tmp12264 = getelementptr inbounds float* %tmp12263, i64 1
+  %tmp12265 = getelementptr inbounds float* %tmp12264, i64 1
+  %tmp12266 = getelementptr inbounds float* %tmp12265, i64 1
+  %tmp12267 = getelementptr inbounds float* %tmp12266, i64 1
+  %tmp12268 = getelementptr inbounds float* %tmp12267, i64 1
+  %tmp12269 = getelementptr inbounds float* %tmp12268, i64 1
+  %tmp12270 = getelementptr inbounds float* %tmp12269, i64 1
+  %tmp12271 = getelementptr inbounds float* %tmp12270, i64 1
+  %tmp12272 = getelementptr inbounds float* %tmp12271, i64 1
+  %tmp12273 = getelementptr inbounds float* %tmp12272, i64 1
+  %tmp12274 = getelementptr inbounds float* %tmp12273, i64 1
+  %tmp12275 = getelementptr inbounds float* %tmp12274, i64 1
+  %tmp12276 = getelementptr inbounds float* %tmp12275, i64 1
+  %tmp12277 = getelementptr inbounds float* %tmp12276, i64 1
+  %tmp12278 = getelementptr inbounds float* %tmp12277, i64 1
+  %tmp12279 = getelementptr inbounds float* %tmp12278, i64 1
+  %tmp12280 = getelementptr inbounds float* %tmp12279, i64 1
+  %tmp12281 = getelementptr inbounds float* %tmp12280, i64 1
+  %tmp12282 = getelementptr inbounds float* %tmp12281, i64 1
+  %tmp12283 = getelementptr inbounds float* %tmp12282, i64 1
+  %tmp12284 = getelementptr inbounds float* %tmp12283, i64 1
+  %tmp12285 = getelementptr inbounds float* %tmp12284, i64 1
+  %tmp12286 = getelementptr inbounds float* %tmp12285, i64 1
+  %tmp12287 = getelementptr inbounds float* %tmp12286, i64 1
+  %tmp12288 = getelementptr inbounds float* %tmp12287, i64 1
+  %tmp12289 = getelementptr inbounds float* %tmp12288, i64 1
+  %tmp12290 = getelementptr inbounds float* %tmp12289, i64 1
+  %tmp12291 = getelementptr inbounds float* %tmp12290, i64 1
+  %tmp12292 = getelementptr inbounds float* %tmp12291, i64 1
+  %tmp12293 = getelementptr inbounds float* %tmp12292, i64 1
+  %tmp12294 = getelementptr inbounds float* %tmp12293, i64 1
+  %tmp12295 = getelementptr inbounds float* %tmp12294, i64 1
+  %tmp12296 = getelementptr inbounds float* %tmp12295, i64 1
+  %tmp12297 = getelementptr inbounds float* %tmp12296, i64 1
+  %tmp12298 = getelementptr inbounds float* %tmp12297, i64 1
+  %tmp12299 = getelementptr inbounds float* %tmp12298, i64 1
+  %tmp12300 = getelementptr inbounds float* %tmp12299, i64 1
+  %tmp12301 = getelementptr inbounds float* %tmp12300, i64 1
+  %tmp12302 = getelementptr inbounds float* %tmp12301, i64 1
+  %tmp12303 = getelementptr inbounds float* %tmp12302, i64 1
+  %tmp12304 = getelementptr inbounds float* %tmp12303, i64 1
+  %tmp12305 = getelementptr inbounds float* %tmp12304, i64 1
+  %tmp12306 = getelementptr inbounds float* %tmp12305, i64 1
+  %tmp12307 = getelementptr inbounds float* %tmp12306, i64 1
+  %tmp12308 = getelementptr inbounds float* %tmp12307, i64 1
+  %tmp12309 = getelementptr inbounds float* %tmp12308, i64 1
+  %tmp12310 = getelementptr inbounds float* %tmp12309, i64 1
+  %tmp12311 = getelementptr inbounds float* %tmp12310, i64 1
+  %tmp12312 = getelementptr inbounds float* %tmp12311, i64 1
+  %tmp12313 = getelementptr inbounds float* %tmp12312, i64 1
+  %tmp12314 = getelementptr inbounds float* %tmp12313, i64 1
+  %tmp12315 = getelementptr inbounds float* %tmp12314, i64 1
+  %tmp12316 = getelementptr inbounds float* %tmp12315, i64 1
+  %tmp12317 = getelementptr inbounds float* %tmp12316, i64 1
+  %tmp12318 = getelementptr inbounds float* %tmp12317, i64 1
+  %tmp12319 = getelementptr inbounds float* %tmp12318, i64 1
+  %tmp12320 = getelementptr inbounds float* %tmp12319, i64 1
+  %tmp12321 = getelementptr inbounds float* %tmp12320, i64 1
+  %tmp12322 = getelementptr inbounds float* %tmp12321, i64 1
+  %tmp12323 = getelementptr inbounds float* %tmp12322, i64 1
+  %tmp12324 = getelementptr inbounds float* %tmp12323, i64 1
+  %tmp12325 = getelementptr inbounds float* %tmp12324, i64 1
+  %tmp12326 = getelementptr inbounds float* %tmp12325, i64 1
+  %tmp12327 = getelementptr inbounds float* %tmp12326, i64 1
+  %tmp12328 = getelementptr inbounds float* %tmp12327, i64 1
+  %tmp12329 = getelementptr inbounds float* %tmp12328, i64 1
+  %tmp12330 = getelementptr inbounds float* %tmp12329, i64 1
+  %tmp12331 = getelementptr inbounds float* %tmp12330, i64 1
+  %tmp12332 = getelementptr inbounds float* %tmp12331, i64 1
+  %tmp12333 = getelementptr inbounds float* %tmp12332, i64 1
+  %tmp12334 = getelementptr inbounds float* %tmp12333, i64 1
+  %tmp12335 = getelementptr inbounds float* %tmp12334, i64 1
+  %tmp12336 = getelementptr inbounds float* %tmp12335, i64 1
+  %tmp12337 = getelementptr inbounds float* %tmp12336, i64 1
+  %tmp12338 = getelementptr inbounds float* %tmp12337, i64 1
+  %tmp12339 = getelementptr inbounds float* %tmp12338, i64 1
+  %tmp12340 = getelementptr inbounds float* %tmp12339, i64 1
+  %tmp12341 = getelementptr inbounds float* %tmp12340, i64 1
+  %tmp12342 = getelementptr inbounds float* %tmp12341, i64 1
+  %tmp12343 = getelementptr inbounds float* %tmp12342, i64 1
+  %tmp12344 = getelementptr inbounds float* %tmp12343, i64 1
+  %tmp12345 = getelementptr inbounds float* %tmp12344, i64 1
+  %tmp12346 = getelementptr inbounds float* %tmp12345, i64 1
+  %tmp12347 = getelementptr inbounds float* %tmp12346, i64 1
+  %tmp12348 = getelementptr inbounds float* %tmp12347, i64 1
+  %tmp12349 = getelementptr inbounds float* %tmp12348, i64 1
+  %tmp12350 = getelementptr inbounds float* %tmp12349, i64 1
+  %tmp12351 = getelementptr inbounds float* %tmp12350, i64 1
+  %tmp12352 = getelementptr inbounds float* %tmp12351, i64 1
+  %tmp12353 = getelementptr inbounds float* %tmp12352, i64 1
+  %tmp12354 = getelementptr inbounds float* %tmp12353, i64 1
+  %tmp12355 = getelementptr inbounds float* %tmp12354, i64 1
+  %tmp12356 = getelementptr inbounds float* %tmp12355, i64 1
+  %tmp12357 = getelementptr inbounds float* %tmp12356, i64 1
+  %tmp12358 = getelementptr inbounds float* %tmp12357, i64 1
+  %tmp12359 = getelementptr inbounds float* %tmp12358, i64 1
+  %tmp12360 = getelementptr inbounds float* %tmp12359, i64 1
+  %tmp12361 = getelementptr inbounds float* %tmp12360, i64 1
+  %tmp12362 = getelementptr inbounds float* %tmp12361, i64 1
+  %tmp12363 = getelementptr inbounds float* %tmp12362, i64 1
+  %tmp12364 = getelementptr inbounds float* %tmp12363, i64 1
+  %tmp12365 = getelementptr inbounds float* %tmp12364, i64 1
+  %tmp12366 = getelementptr inbounds float* %tmp12365, i64 1
+  %tmp12367 = getelementptr inbounds float* %tmp12366, i64 1
+  %tmp12368 = getelementptr inbounds float* %tmp12367, i64 1
+  %tmp12369 = getelementptr inbounds float* %tmp12368, i64 1
+  %tmp12370 = getelementptr inbounds float* %tmp12369, i64 1
+  %tmp12371 = getelementptr inbounds float* %tmp12370, i64 1
+  %tmp12372 = getelementptr inbounds float* %tmp12371, i64 1
+  %tmp12373 = getelementptr inbounds float* %tmp12372, i64 1
+  %tmp12374 = getelementptr inbounds float* %tmp12373, i64 1
+  %tmp12375 = getelementptr inbounds float* %tmp12374, i64 1
+  %tmp12376 = getelementptr inbounds float* %tmp12375, i64 1
+  %tmp12377 = getelementptr inbounds float* %tmp12376, i64 1
+  %tmp12378 = getelementptr inbounds float* %tmp12377, i64 1
+  %tmp12379 = getelementptr inbounds float* %tmp12378, i64 1
+  %tmp12380 = getelementptr inbounds float* %tmp12379, i64 1
+  %tmp12381 = getelementptr inbounds float* %tmp12380, i64 1
+  %tmp12382 = getelementptr inbounds float* %tmp12381, i64 1
+  %tmp12383 = getelementptr inbounds float* %tmp12382, i64 1
+  %tmp12384 = getelementptr inbounds float* %tmp12383, i64 1
+  %tmp12385 = getelementptr inbounds float* %tmp12384, i64 1
+  %tmp12386 = getelementptr inbounds float* %tmp12385, i64 1
+  %tmp12387 = getelementptr inbounds float* %tmp12386, i64 1
+  %tmp12388 = getelementptr inbounds float* %tmp12387, i64 1
+  %tmp12389 = getelementptr inbounds float* %tmp12388, i64 1
+  %tmp12390 = getelementptr inbounds float* %tmp12389, i64 1
+  %tmp12391 = getelementptr inbounds float* %tmp12390, i64 1
+  %tmp12392 = getelementptr inbounds float* %tmp12391, i64 1
+  %tmp12393 = getelementptr inbounds float* %tmp12392, i64 1
+  %tmp12394 = getelementptr inbounds float* %tmp12393, i64 1
+  %tmp12395 = getelementptr inbounds float* %tmp12394, i64 1
+  %tmp12396 = getelementptr inbounds float* %tmp12395, i64 1
+  %tmp12397 = getelementptr inbounds float* %tmp12396, i64 1
+  %tmp12398 = getelementptr inbounds float* %tmp12397, i64 1
+  %tmp12399 = getelementptr inbounds float* %tmp12398, i64 1
+  %tmp12400 = getelementptr inbounds float* %tmp12399, i64 1
+  %tmp12401 = getelementptr inbounds float* %tmp12400, i64 1
+  %tmp12402 = getelementptr inbounds float* %tmp12401, i64 1
+  %tmp12403 = getelementptr inbounds float* %tmp12402, i64 1
+  %tmp12404 = getelementptr inbounds float* %tmp12403, i64 1
+  %tmp12405 = getelementptr inbounds float* %tmp12404, i64 1
+  %tmp12406 = getelementptr inbounds float* %tmp12405, i64 1
+  %tmp12407 = getelementptr inbounds float* %tmp12406, i64 1
+  %tmp12408 = getelementptr inbounds float* %tmp12407, i64 1
+  %tmp12409 = getelementptr inbounds float* %tmp12408, i64 1
+  %tmp12410 = getelementptr inbounds float* %tmp12409, i64 1
+  %tmp12411 = getelementptr inbounds float* %tmp12410, i64 1
+  %tmp12412 = getelementptr inbounds float* %tmp12411, i64 1
+  %tmp12413 = getelementptr inbounds float* %tmp12412, i64 1
+  %tmp12414 = getelementptr inbounds float* %tmp12413, i64 1
+  %tmp12415 = getelementptr inbounds float* %tmp12414, i64 1
+  %tmp12416 = getelementptr inbounds float* %tmp12415, i64 1
+  %tmp12417 = getelementptr inbounds float* %tmp12416, i64 1
+  %tmp12418 = getelementptr inbounds float* %tmp12417, i64 1
+  %tmp12419 = getelementptr inbounds float* %tmp12418, i64 1
+  %tmp12420 = getelementptr inbounds float* %tmp12419, i64 1
+  %tmp12421 = getelementptr inbounds float* %tmp12420, i64 1
+  %tmp12422 = getelementptr inbounds float* %tmp12421, i64 1
+  %tmp12423 = getelementptr inbounds float* %tmp12422, i64 1
+  %tmp12424 = getelementptr inbounds float* %tmp12423, i64 1
+  %tmp12425 = getelementptr inbounds float* %tmp12424, i64 1
+  %tmp12426 = getelementptr inbounds float* %tmp12425, i64 1
+  %tmp12427 = getelementptr inbounds float* %tmp12426, i64 1
+  %tmp12428 = getelementptr inbounds float* %tmp12427, i64 1
+  %tmp12429 = getelementptr inbounds float* %tmp12428, i64 1
+  %tmp12430 = getelementptr inbounds float* %tmp12429, i64 1
+  %tmp12431 = getelementptr inbounds float* %tmp12430, i64 1
+  %tmp12432 = getelementptr inbounds float* %tmp12431, i64 1
+  %tmp12433 = getelementptr inbounds float* %tmp12432, i64 1
+  %tmp12434 = getelementptr inbounds float* %tmp12433, i64 1
+  %tmp12435 = getelementptr inbounds float* %tmp12434, i64 1
+  %tmp12436 = getelementptr inbounds float* %tmp12435, i64 1
+  %tmp12437 = getelementptr inbounds float* %tmp12436, i64 1
+  %tmp12438 = getelementptr inbounds float* %tmp12437, i64 1
+  %tmp12439 = getelementptr inbounds float* %tmp12438, i64 1
+  %tmp12440 = getelementptr inbounds float* %tmp12439, i64 1
+  %tmp12441 = getelementptr inbounds float* %tmp12440, i64 1
+  %tmp12442 = getelementptr inbounds float* %tmp12441, i64 1
+  %tmp12443 = getelementptr inbounds float* %tmp12442, i64 1
+  %tmp12444 = getelementptr inbounds float* %tmp12443, i64 1
+  %tmp12445 = getelementptr inbounds float* %tmp12444, i64 1
+  %tmp12446 = getelementptr inbounds float* %tmp12445, i64 1
+  %tmp12447 = getelementptr inbounds float* %tmp12446, i64 1
+  %tmp12448 = getelementptr inbounds float* %tmp12447, i64 1
+  %tmp12449 = getelementptr inbounds float* %tmp12448, i64 1
+  %tmp12450 = getelementptr inbounds float* %tmp12449, i64 1
+  %tmp12451 = getelementptr inbounds float* %tmp12450, i64 1
+  %tmp12452 = getelementptr inbounds float* %tmp12451, i64 1
+  %tmp12453 = getelementptr inbounds float* %tmp12452, i64 1
+  %tmp12454 = getelementptr inbounds float* %tmp12453, i64 1
+  %tmp12455 = getelementptr inbounds float* %tmp12454, i64 1
+  %tmp12456 = getelementptr inbounds float* %tmp12455, i64 1
+  %tmp12457 = getelementptr inbounds float* %tmp12456, i64 1
+  %tmp12458 = getelementptr inbounds float* %tmp12457, i64 1
+  %tmp12459 = getelementptr inbounds float* %tmp12458, i64 1
+  %tmp12460 = getelementptr inbounds float* %tmp12459, i64 1
+  %tmp12461 = getelementptr inbounds float* %tmp12460, i64 1
+  %tmp12462 = getelementptr inbounds float* %tmp12461, i64 1
+  %tmp12463 = getelementptr inbounds float* %tmp12462, i64 1
+  %tmp12464 = getelementptr inbounds float* %tmp12463, i64 1
+  %tmp12465 = getelementptr inbounds float* %tmp12464, i64 1
+  %tmp12466 = getelementptr inbounds float* %tmp12465, i64 1
+  %tmp12467 = getelementptr inbounds float* %tmp12466, i64 1
+  %tmp12468 = getelementptr inbounds float* %tmp12467, i64 1
+  %tmp12469 = getelementptr inbounds float* %tmp12468, i64 1
+  %tmp12470 = getelementptr inbounds float* %tmp12469, i64 1
+  %tmp12471 = getelementptr inbounds float* %tmp12470, i64 1
+  %tmp12472 = getelementptr inbounds float* %tmp12471, i64 1
+  %tmp12473 = getelementptr inbounds float* %tmp12472, i64 1
+  %tmp12474 = getelementptr inbounds float* %tmp12473, i64 1
+  %tmp12475 = getelementptr inbounds float* %tmp12474, i64 1
+  %tmp12476 = getelementptr inbounds float* %tmp12475, i64 1
+  %tmp12477 = getelementptr inbounds float* %tmp12476, i64 1
+  %tmp12478 = getelementptr inbounds float* %tmp12477, i64 1
+  %tmp12479 = getelementptr inbounds float* %tmp12478, i64 1
+  %tmp12480 = getelementptr inbounds float* %tmp12479, i64 1
+  %tmp12481 = getelementptr inbounds float* %tmp12480, i64 1
+  %tmp12482 = getelementptr inbounds float* %tmp12481, i64 1
+  %tmp12483 = getelementptr inbounds float* %tmp12482, i64 1
+  %tmp12484 = getelementptr inbounds float* %tmp12483, i64 1
+  %tmp12485 = getelementptr inbounds float* %tmp12484, i64 1
+  %tmp12486 = getelementptr inbounds float* %tmp12485, i64 1
+  %tmp12487 = getelementptr inbounds float* %tmp12486, i64 1
+  %tmp12488 = getelementptr inbounds float* %tmp12487, i64 1
+  %tmp12489 = getelementptr inbounds float* %tmp12488, i64 1
+  %tmp12490 = getelementptr inbounds float* %tmp12489, i64 1
+  %tmp12491 = getelementptr inbounds float* %tmp12490, i64 1
+  %tmp12492 = getelementptr inbounds float* %tmp12491, i64 1
+  %tmp12493 = getelementptr inbounds float* %tmp12492, i64 1
+  %tmp12494 = getelementptr inbounds float* %tmp12493, i64 1
+  %tmp12495 = getelementptr inbounds float* %tmp12494, i64 1
+  %tmp12496 = getelementptr inbounds float* %tmp12495, i64 1
+  %tmp12497 = getelementptr inbounds float* %tmp12496, i64 1
+  %tmp12498 = getelementptr inbounds float* %tmp12497, i64 1
+  %tmp12499 = getelementptr inbounds float* %tmp12498, i64 1
+  %tmp12500 = getelementptr inbounds float* %tmp12499, i64 1
+  %tmp12501 = getelementptr inbounds float* %tmp12500, i64 1
+  %tmp12502 = getelementptr inbounds float* %tmp12501, i64 1
+  %tmp12503 = getelementptr inbounds float* %tmp12502, i64 1
+  %tmp12504 = getelementptr inbounds float* %tmp12503, i64 1
+  %tmp12505 = getelementptr inbounds float* %tmp12504, i64 1
+  %tmp12506 = getelementptr inbounds float* %tmp12505, i64 1
+  %tmp12507 = getelementptr inbounds float* %tmp12506, i64 1
+  %tmp12508 = getelementptr inbounds float* %tmp12507, i64 1
+  %tmp12509 = getelementptr inbounds float* %tmp12508, i64 1
+  %tmp12510 = getelementptr inbounds float* %tmp12509, i64 1
+  %tmp12511 = getelementptr inbounds float* %tmp12510, i64 1
+  %tmp12512 = getelementptr inbounds float* %tmp12511, i64 1
+  %tmp12513 = getelementptr inbounds float* %tmp12512, i64 1
+  %tmp12514 = getelementptr inbounds float* %tmp12513, i64 1
+  %tmp12515 = getelementptr inbounds float* %tmp12514, i64 1
+  %tmp12516 = getelementptr inbounds float* %tmp12515, i64 1
+  %tmp12517 = getelementptr inbounds float* %tmp12516, i64 1
+  %tmp12518 = getelementptr inbounds float* %tmp12517, i64 1
+  %tmp12519 = getelementptr inbounds float* %tmp12518, i64 1
+  %tmp12520 = getelementptr inbounds float* %tmp12519, i64 1
+  %tmp12521 = getelementptr inbounds float* %tmp12520, i64 1
+  %tmp12522 = getelementptr inbounds float* %tmp12521, i64 1
+  %tmp12523 = getelementptr inbounds float* %tmp12522, i64 1
+  %tmp12524 = getelementptr inbounds float* %tmp12523, i64 1
+  %tmp12525 = getelementptr inbounds float* %tmp12524, i64 1
+  %tmp12526 = getelementptr inbounds float* %tmp12525, i64 1
+  %tmp12527 = getelementptr inbounds float* %tmp12526, i64 1
+  %tmp12528 = getelementptr inbounds float* %tmp12527, i64 1
+  %tmp12529 = getelementptr inbounds float* %tmp12528, i64 1
+  %tmp12530 = getelementptr inbounds float* %tmp12529, i64 1
+  %tmp12531 = getelementptr inbounds float* %tmp12530, i64 1
+  %tmp12532 = getelementptr inbounds float* %tmp12531, i64 1
+  %tmp12533 = getelementptr inbounds float* %tmp12532, i64 1
+  %tmp12534 = getelementptr inbounds float* %tmp12533, i64 1
+  %tmp12535 = getelementptr inbounds float* %tmp12534, i64 1
+  %tmp12536 = getelementptr inbounds float* %tmp12535, i64 1
+  %tmp12537 = getelementptr inbounds float* %tmp12536, i64 1
+  %tmp12538 = getelementptr inbounds float* %tmp12537, i64 1
+  %tmp12539 = getelementptr inbounds float* %tmp12538, i64 1
+  %tmp12540 = getelementptr inbounds float* %tmp12539, i64 1
+  %tmp12541 = getelementptr inbounds float* %tmp12540, i64 1
+  %tmp12542 = getelementptr inbounds float* %tmp12541, i64 1
+  %tmp12543 = getelementptr inbounds float* %tmp12542, i64 1
+  %tmp12544 = getelementptr inbounds float* %tmp12543, i64 1
+  %tmp12545 = getelementptr inbounds float* %tmp12544, i64 1
+  %tmp12546 = getelementptr inbounds float* %tmp12545, i64 1
+  %tmp12547 = getelementptr inbounds float* %tmp12546, i64 1
+  %tmp12548 = getelementptr inbounds float* %tmp12547, i64 1
+  %tmp12549 = getelementptr inbounds float* %tmp12548, i64 1
+  %tmp12550 = getelementptr inbounds float* %tmp12549, i64 1
+  %tmp12551 = getelementptr inbounds float* %tmp12550, i64 1
+  %tmp12552 = getelementptr inbounds float* %tmp12551, i64 1
+  %tmp12553 = getelementptr inbounds float* %tmp12552, i64 1
+  %tmp12554 = getelementptr inbounds float* %tmp12553, i64 1
+  %tmp12555 = getelementptr inbounds float* %tmp12554, i64 1
+  %tmp12556 = getelementptr inbounds float* %tmp12555, i64 1
+  %tmp12557 = getelementptr inbounds float* %tmp12556, i64 1
+  %tmp12558 = getelementptr inbounds float* %tmp12557, i64 1
+  %tmp12559 = getelementptr inbounds float* %tmp12558, i64 1
+  %tmp12560 = getelementptr inbounds float* %tmp12559, i64 1
+  %tmp12561 = getelementptr inbounds float* %tmp12560, i64 1
+  %tmp12562 = getelementptr inbounds float* %tmp12561, i64 1
+  %tmp12563 = getelementptr inbounds float* %tmp12562, i64 1
+  %tmp12564 = getelementptr inbounds float* %tmp12563, i64 1
+  %tmp12565 = getelementptr inbounds float* %tmp12564, i64 1
+  %tmp12566 = getelementptr inbounds float* %tmp12565, i64 1
+  %tmp12567 = getelementptr inbounds float* %tmp12566, i64 1
+  %tmp12568 = getelementptr inbounds float* %tmp12567, i64 1
+  %tmp12569 = getelementptr inbounds float* %tmp12568, i64 1
+  %tmp12570 = getelementptr inbounds float* %tmp12569, i64 1
+  %tmp12571 = getelementptr inbounds float* %tmp12570, i64 1
+  %tmp12572 = getelementptr inbounds float* %tmp12571, i64 1
+  %tmp12573 = getelementptr inbounds float* %tmp12572, i64 1
+  %tmp12574 = getelementptr inbounds float* %tmp12573, i64 1
+  %tmp12575 = getelementptr inbounds float* %tmp12574, i64 1
+  %tmp12576 = getelementptr inbounds float* %tmp12575, i64 1
+  %tmp12577 = getelementptr inbounds float* %tmp12576, i64 1
+  %tmp12578 = getelementptr inbounds float* %tmp12577, i64 1
+  %tmp12579 = getelementptr inbounds float* %tmp12578, i64 1
+  %tmp12580 = getelementptr inbounds float* %tmp12579, i64 1
+  %tmp12581 = getelementptr inbounds float* %tmp12580, i64 1
+  %tmp12582 = getelementptr inbounds float* %tmp12581, i64 1
+  %tmp12583 = getelementptr inbounds float* %tmp12582, i64 1
+  %tmp12584 = getelementptr inbounds float* %tmp12583, i64 1
+  %tmp12585 = getelementptr inbounds float* %tmp12584, i64 1
+  %tmp12586 = getelementptr inbounds float* %tmp12585, i64 1
+  %tmp12587 = getelementptr inbounds float* %tmp12586, i64 1
+  %tmp12588 = getelementptr inbounds float* %tmp12587, i64 1
+  %tmp12589 = getelementptr inbounds float* %tmp12588, i64 1
+  %tmp12590 = getelementptr inbounds float* %tmp12589, i64 1
+  %tmp12591 = getelementptr inbounds float* %tmp12590, i64 1
+  %tmp12592 = getelementptr inbounds float* %tmp12591, i64 1
+  %tmp12593 = getelementptr inbounds float* %tmp12592, i64 1
+  %tmp12594 = getelementptr inbounds float* %tmp12593, i64 1
+  %tmp12595 = getelementptr inbounds float* %tmp12594, i64 1
+  %tmp12596 = getelementptr inbounds float* %tmp12595, i64 1
+  %tmp12597 = getelementptr inbounds float* %tmp12596, i64 1
+  %tmp12598 = getelementptr inbounds float* %tmp12597, i64 1
+  %tmp12599 = getelementptr inbounds float* %tmp12598, i64 1
+  %tmp12600 = getelementptr inbounds float* %tmp12599, i64 1
+  %tmp12601 = getelementptr inbounds float* %tmp12600, i64 1
+  %tmp12602 = getelementptr inbounds float* %tmp12601, i64 1
+  %tmp12603 = getelementptr inbounds float* %tmp12602, i64 1
+  %tmp12604 = getelementptr inbounds float* %tmp12603, i64 1
+  %tmp12605 = getelementptr inbounds float* %tmp12604, i64 1
+  %tmp12606 = getelementptr inbounds float* %tmp12605, i64 1
+  %tmp12607 = getelementptr inbounds float* %tmp12606, i64 1
+  %tmp12608 = getelementptr inbounds float* %tmp12607, i64 1
+  %tmp12609 = getelementptr inbounds float* %tmp12608, i64 1
+  %tmp12610 = getelementptr inbounds float* %tmp12609, i64 1
+  %tmp12611 = getelementptr inbounds float* %tmp12610, i64 1
+  %tmp12612 = getelementptr inbounds float* %tmp12611, i64 1
+  %tmp12613 = getelementptr inbounds float* %tmp12612, i64 1
+  %tmp12614 = getelementptr inbounds float* %tmp12613, i64 1
+  %tmp12615 = getelementptr inbounds float* %tmp12614, i64 1
+  %tmp12616 = getelementptr inbounds float* %tmp12615, i64 1
+  %tmp12617 = getelementptr inbounds float* %tmp12616, i64 1
+  %tmp12618 = getelementptr inbounds float* %tmp12617, i64 1
+  %tmp12619 = getelementptr inbounds float* %tmp12618, i64 1
+  %tmp12620 = getelementptr inbounds float* %tmp12619, i64 1
+  %tmp12621 = getelementptr inbounds float* %tmp12620, i64 1
+  %tmp12622 = getelementptr inbounds float* %tmp12621, i64 1
+  %tmp12623 = getelementptr inbounds float* %tmp12622, i64 1
+  %tmp12624 = getelementptr inbounds float* %tmp12623, i64 1
+  %tmp12625 = getelementptr inbounds float* %tmp12624, i64 1
+  %tmp12626 = getelementptr inbounds float* %tmp12625, i64 1
+  %tmp12627 = getelementptr inbounds float* %tmp12626, i64 1
+  %tmp12628 = getelementptr inbounds float* %tmp12627, i64 1
+  %tmp12629 = getelementptr inbounds float* %tmp12628, i64 1
+  %tmp12630 = getelementptr inbounds float* %tmp12629, i64 1
+  %tmp12631 = getelementptr inbounds float* %tmp12630, i64 1
+  %tmp12632 = getelementptr inbounds float* %tmp12631, i64 1
+  %tmp12633 = getelementptr inbounds float* %tmp12632, i64 1
+  %tmp12634 = getelementptr inbounds float* %tmp12633, i64 1
+  %tmp12635 = getelementptr inbounds float* %tmp12634, i64 1
+  %tmp12636 = getelementptr inbounds float* %tmp12635, i64 1
+  %tmp12637 = getelementptr inbounds float* %tmp12636, i64 1
+  %tmp12638 = getelementptr inbounds float* %tmp12637, i64 1
+  %tmp12639 = getelementptr inbounds float* %tmp12638, i64 1
+  %tmp12640 = getelementptr inbounds float* %tmp12639, i64 1
+  %tmp12641 = getelementptr inbounds float* %tmp12640, i64 1
+  %tmp12642 = getelementptr inbounds float* %tmp12641, i64 1
+  %tmp12643 = getelementptr inbounds float* %tmp12642, i64 1
+  %tmp12644 = getelementptr inbounds float* %tmp12643, i64 1
+  %tmp12645 = getelementptr inbounds float* %tmp12644, i64 1
+  %tmp12646 = getelementptr inbounds float* %tmp12645, i64 1
+  %tmp12647 = getelementptr inbounds float* %tmp12646, i64 1
+  %tmp12648 = getelementptr inbounds float* %tmp12647, i64 1
+  %tmp12649 = getelementptr inbounds float* %tmp12648, i64 1
+  %tmp12650 = getelementptr inbounds float* %tmp12649, i64 1
+  %tmp12651 = getelementptr inbounds float* %tmp12650, i64 1
+  %tmp12652 = getelementptr inbounds float* %tmp12651, i64 1
+  %tmp12653 = getelementptr inbounds float* %tmp12652, i64 1
+  %tmp12654 = getelementptr inbounds float* %tmp12653, i64 1
+  %tmp12655 = getelementptr inbounds float* %tmp12654, i64 1
+  %tmp12656 = getelementptr inbounds float* %tmp12655, i64 1
+  %tmp12657 = getelementptr inbounds float* %tmp12656, i64 1
+  %tmp12658 = getelementptr inbounds float* %tmp12657, i64 1
+  %tmp12659 = getelementptr inbounds float* %tmp12658, i64 1
+  %tmp12660 = getelementptr inbounds float* %tmp12659, i64 1
+  %tmp12661 = getelementptr inbounds float* %tmp12660, i64 1
+  %tmp12662 = getelementptr inbounds float* %tmp12661, i64 1
+  %tmp12663 = getelementptr inbounds float* %tmp12662, i64 1
+  %tmp12664 = getelementptr inbounds float* %tmp12663, i64 1
+  %tmp12665 = getelementptr inbounds float* %tmp12664, i64 1
+  %tmp12666 = getelementptr inbounds float* %tmp12665, i64 1
+  %tmp12667 = getelementptr inbounds float* %tmp12666, i64 1
+  %tmp12668 = getelementptr inbounds float* %tmp12667, i64 1
+  %tmp12669 = getelementptr inbounds float* %tmp12668, i64 1
+  %tmp12670 = getelementptr inbounds float* %tmp12669, i64 1
+  %tmp12671 = getelementptr inbounds float* %tmp12670, i64 1
+  %tmp12672 = getelementptr inbounds float* %tmp12671, i64 1
+  %tmp12673 = getelementptr inbounds float* %tmp12672, i64 1
+  %tmp12674 = getelementptr inbounds float* %tmp12673, i64 1
+  %tmp12675 = getelementptr inbounds float* %tmp12674, i64 1
+  %tmp12676 = getelementptr inbounds float* %tmp12675, i64 1
+  %tmp12677 = getelementptr inbounds float* %tmp12676, i64 1
+  %tmp12678 = getelementptr inbounds float* %tmp12677, i64 1
+  %tmp12679 = getelementptr inbounds float* %tmp12678, i64 1
+  %tmp12680 = getelementptr inbounds float* %tmp12679, i64 1
+  %tmp12681 = getelementptr inbounds float* %tmp12680, i64 1
+  %tmp12682 = getelementptr inbounds float* %tmp12681, i64 1
+  %tmp12683 = getelementptr inbounds float* %tmp12682, i64 1
+  %tmp12684 = getelementptr inbounds float* %tmp12683, i64 1
+  %tmp12685 = getelementptr inbounds float* %tmp12684, i64 1
+  %tmp12686 = getelementptr inbounds float* %tmp12685, i64 1
+  %tmp12687 = getelementptr inbounds float* %tmp12686, i64 1
+  %tmp12688 = getelementptr inbounds float* %tmp12687, i64 1
+  %tmp12689 = getelementptr inbounds float* %tmp12688, i64 1
+  %tmp12690 = getelementptr inbounds float* %tmp12689, i64 1
+  %tmp12691 = getelementptr inbounds float* %tmp12690, i64 1
+  %tmp12692 = getelementptr inbounds float* %tmp12691, i64 1
+  %tmp12693 = getelementptr inbounds float* %tmp12692, i64 1
+  %tmp12694 = getelementptr inbounds float* %tmp12693, i64 1
+  %tmp12695 = getelementptr inbounds float* %tmp12694, i64 1
+  %tmp12696 = getelementptr inbounds float* %tmp12695, i64 1
+  %tmp12697 = getelementptr inbounds float* %tmp12696, i64 1
+  %tmp12698 = getelementptr inbounds float* %tmp12697, i64 1
+  %tmp12699 = getelementptr inbounds float* %tmp12698, i64 1
+  %tmp12700 = getelementptr inbounds float* %tmp12699, i64 1
+  %tmp12701 = getelementptr inbounds float* %tmp12700, i64 1
+  %tmp12702 = getelementptr inbounds float* %tmp12701, i64 1
+  %tmp12703 = getelementptr inbounds float* %tmp12702, i64 1
+  %tmp12704 = getelementptr inbounds float* %tmp12703, i64 1
+  %tmp12705 = getelementptr inbounds float* %tmp12704, i64 1
+  %tmp12706 = getelementptr inbounds float* %tmp12705, i64 1
+  %tmp12707 = getelementptr inbounds float* %tmp12706, i64 1
+  %tmp12708 = getelementptr inbounds float* %tmp12707, i64 1
+  %tmp12709 = getelementptr inbounds float* %tmp12708, i64 1
+  %tmp12710 = getelementptr inbounds float* %tmp12709, i64 1
+  %tmp12711 = getelementptr inbounds float* %tmp12710, i64 1
+  %tmp12712 = getelementptr inbounds float* %tmp12711, i64 1
+  %tmp12713 = getelementptr inbounds float* %tmp12712, i64 1
+  %tmp12714 = getelementptr inbounds float* %tmp12713, i64 1
+  %tmp12715 = getelementptr inbounds float* %tmp12714, i64 1
+  %tmp12716 = getelementptr inbounds float* %tmp12715, i64 1
+  %tmp12717 = getelementptr inbounds float* %tmp12716, i64 1
+  %tmp12718 = getelementptr inbounds float* %tmp12717, i64 1
+  %tmp12719 = getelementptr inbounds float* %tmp12718, i64 1
+  %tmp12720 = getelementptr inbounds float* %tmp12719, i64 1
+  %tmp12721 = getelementptr inbounds float* %tmp12720, i64 1
+  %tmp12722 = getelementptr inbounds float* %tmp12721, i64 1
+  %tmp12723 = getelementptr inbounds float* %tmp12722, i64 1
+  %tmp12724 = getelementptr inbounds float* %tmp12723, i64 1
+  %tmp12725 = getelementptr inbounds float* %tmp12724, i64 1
+  %tmp12726 = getelementptr inbounds float* %tmp12725, i64 1
+  %tmp12727 = getelementptr inbounds float* %tmp12726, i64 1
+  %tmp12728 = getelementptr inbounds float* %tmp12727, i64 1
+  %tmp12729 = getelementptr inbounds float* %tmp12728, i64 1
+  %tmp12730 = getelementptr inbounds float* %tmp12729, i64 1
+  %tmp12731 = getelementptr inbounds float* %tmp12730, i64 1
+  %tmp12732 = getelementptr inbounds float* %tmp12731, i64 1
+  %tmp12733 = getelementptr inbounds float* %tmp12732, i64 1
+  %tmp12734 = getelementptr inbounds float* %tmp12733, i64 1
+  %tmp12735 = getelementptr inbounds float* %tmp12734, i64 1
+  %tmp12736 = getelementptr inbounds float* %tmp12735, i64 1
+  %tmp12737 = getelementptr inbounds float* %tmp12736, i64 1
+  %tmp12738 = getelementptr inbounds float* %tmp12737, i64 1
+  %tmp12739 = getelementptr inbounds float* %tmp12738, i64 1
+  %tmp12740 = getelementptr inbounds float* %tmp12739, i64 1
+  %tmp12741 = getelementptr inbounds float* %tmp12740, i64 1
+  %tmp12742 = getelementptr inbounds float* %tmp12741, i64 1
+  %tmp12743 = getelementptr inbounds float* %tmp12742, i64 1
+  %tmp12744 = getelementptr inbounds float* %tmp12743, i64 1
+  %tmp12745 = getelementptr inbounds float* %tmp12744, i64 1
+  %tmp12746 = getelementptr inbounds float* %tmp12745, i64 1
+  %tmp12747 = getelementptr inbounds float* %tmp12746, i64 1
+  %tmp12748 = getelementptr inbounds float* %tmp12747, i64 1
+  %tmp12749 = getelementptr inbounds float* %tmp12748, i64 1
+  %tmp12750 = getelementptr inbounds float* %tmp12749, i64 1
+  %tmp12751 = getelementptr inbounds float* %tmp12750, i64 1
+  %tmp12752 = getelementptr inbounds float* %tmp12751, i64 1
+  %tmp12753 = getelementptr inbounds float* %tmp12752, i64 1
+  %tmp12754 = getelementptr inbounds float* %tmp12753, i64 1
+  %tmp12755 = getelementptr inbounds float* %tmp12754, i64 1
+  %tmp12756 = getelementptr inbounds float* %tmp12755, i64 1
+  %tmp12757 = getelementptr inbounds float* %tmp12756, i64 1
+  %tmp12758 = getelementptr inbounds float* %tmp12757, i64 1
+  %tmp12759 = getelementptr inbounds float* %tmp12758, i64 1
+  %tmp12760 = getelementptr inbounds float* %tmp12759, i64 1
+  %tmp12761 = getelementptr inbounds float* %tmp12760, i64 1
+  %tmp12762 = getelementptr inbounds float* %tmp12761, i64 1
+  %tmp12763 = getelementptr inbounds float* %tmp12762, i64 1
+  %tmp12764 = getelementptr inbounds float* %tmp12763, i64 1
+  %tmp12765 = getelementptr inbounds float* %tmp12764, i64 1
+  %tmp12766 = getelementptr inbounds float* %tmp12765, i64 1
+  %tmp12767 = getelementptr inbounds float* %tmp12766, i64 1
+  %tmp12768 = getelementptr inbounds float* %tmp12767, i64 1
+  %tmp12769 = getelementptr inbounds float* %tmp12768, i64 1
+  %tmp12770 = getelementptr inbounds float* %tmp12769, i64 1
+  %tmp12771 = getelementptr inbounds float* %tmp12770, i64 1
+  %tmp12772 = getelementptr inbounds float* %tmp12771, i64 1
+  %tmp12773 = getelementptr inbounds float* %tmp12772, i64 1
+  %tmp12774 = getelementptr inbounds float* %tmp12773, i64 1
+  %tmp12775 = getelementptr inbounds float* %tmp12774, i64 1
+  %tmp12776 = getelementptr inbounds float* %tmp12775, i64 1
+  %tmp12777 = getelementptr inbounds float* %tmp12776, i64 1
+  %tmp12778 = getelementptr inbounds float* %tmp12777, i64 1
+  %tmp12779 = getelementptr inbounds float* %tmp12778, i64 1
+  %tmp12780 = getelementptr inbounds float* %tmp12779, i64 1
+  %tmp12781 = getelementptr inbounds float* %tmp12780, i64 1
+  %tmp12782 = getelementptr inbounds float* %tmp12781, i64 1
+  %tmp12783 = getelementptr inbounds float* %tmp12782, i64 1
+  %tmp12784 = getelementptr inbounds float* %tmp12783, i64 1
+  %tmp12785 = getelementptr inbounds float* %tmp12784, i64 1
+  %tmp12786 = getelementptr inbounds float* %tmp12785, i64 1
+  %tmp12787 = getelementptr inbounds float* %tmp12786, i64 1
+  %tmp12788 = getelementptr inbounds float* %tmp12787, i64 1
+  %tmp12789 = getelementptr inbounds float* %tmp12788, i64 1
+  %tmp12790 = getelementptr inbounds float* %tmp12789, i64 1
+  %tmp12791 = getelementptr inbounds float* %tmp12790, i64 1
+  %tmp12792 = getelementptr inbounds float* %tmp12791, i64 1
+  %tmp12793 = getelementptr inbounds float* %tmp12792, i64 1
+  %tmp12794 = getelementptr inbounds float* %tmp12793, i64 1
+  %tmp12795 = getelementptr inbounds float* %tmp12794, i64 1
+  %tmp12796 = getelementptr inbounds float* %tmp12795, i64 1
+  %tmp12797 = getelementptr inbounds float* %tmp12796, i64 1
+  %tmp12798 = getelementptr inbounds float* %tmp12797, i64 1
+  %tmp12799 = getelementptr inbounds float* %tmp12798, i64 1
+  %tmp12800 = getelementptr inbounds float* %tmp12799, i64 1
+  %tmp12801 = getelementptr inbounds float* %tmp12800, i64 1
+  %tmp12802 = getelementptr inbounds float* %tmp12801, i64 1
+  %tmp12803 = getelementptr inbounds float* %tmp12802, i64 1
+  %tmp12804 = getelementptr inbounds float* %tmp12803, i64 1
+  %tmp12805 = getelementptr inbounds float* %tmp12804, i64 1
+  %tmp12806 = getelementptr inbounds float* %tmp12805, i64 1
+  %tmp12807 = getelementptr inbounds float* %tmp12806, i64 1
+  %tmp12808 = getelementptr inbounds float* %tmp12807, i64 1
+  %tmp12809 = getelementptr inbounds float* %tmp12808, i64 1
+  %tmp12810 = getelementptr inbounds float* %tmp12809, i64 1
+  %tmp12811 = getelementptr inbounds float* %tmp12810, i64 1
+  %tmp12812 = getelementptr inbounds float* %tmp12811, i64 1
+  %tmp12813 = getelementptr inbounds float* %tmp12812, i64 1
+  %tmp12814 = getelementptr inbounds float* %tmp12813, i64 1
+  %tmp12815 = getelementptr inbounds float* %tmp12814, i64 1
+  %tmp12816 = getelementptr inbounds float* %tmp12815, i64 1
+  %tmp12817 = getelementptr inbounds float* %tmp12816, i64 1
+  %tmp12818 = getelementptr inbounds float* %tmp12817, i64 1
+  %tmp12819 = getelementptr inbounds float* %tmp12818, i64 1
+  %tmp12820 = getelementptr inbounds float* %tmp12819, i64 1
+  %tmp12821 = getelementptr inbounds float* %tmp12820, i64 1
+  %tmp12822 = getelementptr inbounds float* %tmp12821, i64 1
+  %tmp12823 = getelementptr inbounds float* %tmp12822, i64 1
+  %tmp12824 = getelementptr inbounds float* %tmp12823, i64 1
+  %tmp12825 = getelementptr inbounds float* %tmp12824, i64 1
+  %tmp12826 = getelementptr inbounds float* %tmp12825, i64 1
+  %tmp12827 = getelementptr inbounds float* %tmp12826, i64 1
+  %tmp12828 = getelementptr inbounds float* %tmp12827, i64 1
+  %tmp12829 = getelementptr inbounds float* %tmp12828, i64 1
+  %tmp12830 = getelementptr inbounds float* %tmp12829, i64 1
+  %tmp12831 = getelementptr inbounds float* %tmp12830, i64 1
+  %tmp12832 = getelementptr inbounds float* %tmp12831, i64 1
+  %tmp12833 = getelementptr inbounds float* %tmp12832, i64 1
+  %tmp12834 = getelementptr inbounds float* %tmp12833, i64 1
+  %tmp12835 = getelementptr inbounds float* %tmp12834, i64 1
+  %tmp12836 = getelementptr inbounds float* %tmp12835, i64 1
+  %tmp12837 = getelementptr inbounds float* %tmp12836, i64 1
+  %tmp12838 = getelementptr inbounds float* %tmp12837, i64 1
+  %tmp12839 = getelementptr inbounds float* %tmp12838, i64 1
+  %tmp12840 = getelementptr inbounds float* %tmp12839, i64 1
+  %tmp12841 = getelementptr inbounds float* %tmp12840, i64 1
+  %tmp12842 = getelementptr inbounds float* %tmp12841, i64 1
+  %tmp12843 = getelementptr inbounds float* %tmp12842, i64 1
+  %tmp12844 = getelementptr inbounds float* %tmp12843, i64 1
+  %tmp12845 = getelementptr inbounds float* %tmp12844, i64 1
+  %tmp12846 = getelementptr inbounds float* %tmp12845, i64 1
+  %tmp12847 = getelementptr inbounds float* %tmp12846, i64 1
+  %tmp12848 = getelementptr inbounds float* %tmp12847, i64 1
+  %tmp12849 = getelementptr inbounds float* %tmp12848, i64 1
+  %tmp12850 = getelementptr inbounds float* %tmp12849, i64 1
+  %tmp12851 = getelementptr inbounds float* %tmp12850, i64 1
+  %tmp12852 = getelementptr inbounds float* %tmp12851, i64 1
+  %tmp12853 = getelementptr inbounds float* %tmp12852, i64 1
+  %tmp12854 = getelementptr inbounds float* %tmp12853, i64 1
+  %tmp12855 = getelementptr inbounds float* %tmp12854, i64 1
+  %tmp12856 = getelementptr inbounds float* %tmp12855, i64 1
+  %tmp12857 = getelementptr inbounds float* %tmp12856, i64 1
+  %tmp12858 = getelementptr inbounds float* %tmp12857, i64 1
+  %tmp12859 = getelementptr inbounds float* %tmp12858, i64 1
+  %tmp12860 = getelementptr inbounds float* %tmp12859, i64 1
+  %tmp12861 = getelementptr inbounds float* %tmp12860, i64 1
+  %tmp12862 = getelementptr inbounds float* %tmp12861, i64 1
+  %tmp12863 = getelementptr inbounds float* %tmp12862, i64 1
+  %tmp12864 = getelementptr inbounds float* %tmp12863, i64 1
+  %tmp12865 = getelementptr inbounds float* %tmp12864, i64 1
+  %tmp12866 = getelementptr inbounds float* %tmp12865, i64 1
+  %tmp12867 = getelementptr inbounds float* %tmp12866, i64 1
+  %tmp12868 = getelementptr inbounds float* %tmp12867, i64 1
+  %tmp12869 = getelementptr inbounds float* %tmp12868, i64 1
+  %tmp12870 = getelementptr inbounds float* %tmp12869, i64 1
+  %tmp12871 = getelementptr inbounds float* %tmp12870, i64 1
+  %tmp12872 = getelementptr inbounds float* %tmp12871, i64 1
+  %tmp12873 = getelementptr inbounds float* %tmp12872, i64 1
+  %tmp12874 = getelementptr inbounds float* %tmp12873, i64 1
+  %tmp12875 = getelementptr inbounds float* %tmp12874, i64 1
+  %tmp12876 = getelementptr inbounds float* %tmp12875, i64 1
+  %tmp12877 = getelementptr inbounds float* %tmp12876, i64 1
+  %tmp12878 = getelementptr inbounds float* %tmp12877, i64 1
+  %tmp12879 = getelementptr inbounds float* %tmp12878, i64 1
+  %tmp12880 = getelementptr inbounds float* %tmp12879, i64 1
+  %tmp12881 = getelementptr inbounds float* %tmp12880, i64 1
+  %tmp12882 = getelementptr inbounds float* %tmp12881, i64 1
+  %tmp12883 = getelementptr inbounds float* %tmp12882, i64 1
+  %tmp12884 = getelementptr inbounds float* %tmp12883, i64 1
+  %tmp12885 = getelementptr inbounds float* %tmp12884, i64 1
+  %tmp12886 = getelementptr inbounds float* %tmp12885, i64 1
+  %tmp12887 = getelementptr inbounds float* %tmp12886, i64 1
+  %tmp12888 = getelementptr inbounds float* %tmp12887, i64 1
+  %tmp12889 = getelementptr inbounds float* %tmp12888, i64 1
+  %tmp12890 = getelementptr inbounds float* %tmp12889, i64 1
+  %tmp12891 = getelementptr inbounds float* %tmp12890, i64 1
+  %tmp12892 = getelementptr inbounds float* %tmp12891, i64 1
+  %tmp12893 = getelementptr inbounds float* %tmp12892, i64 1
+  %tmp12894 = getelementptr inbounds float* %tmp12893, i64 1
+  %tmp12895 = getelementptr inbounds float* %tmp12894, i64 1
+  %tmp12896 = getelementptr inbounds float* %tmp12895, i64 1
+  %tmp12897 = getelementptr inbounds float* %tmp12896, i64 1
+  %tmp12898 = getelementptr inbounds float* %tmp12897, i64 1
+  %tmp12899 = getelementptr inbounds float* %tmp12898, i64 1
+  %tmp12900 = getelementptr inbounds float* %tmp12899, i64 1
+  %tmp12901 = getelementptr inbounds float* %tmp12900, i64 1
+  %tmp12902 = getelementptr inbounds float* %tmp12901, i64 1
+  %tmp12903 = getelementptr inbounds float* %tmp12902, i64 1
+  %tmp12904 = getelementptr inbounds float* %tmp12903, i64 1
+  %tmp12905 = getelementptr inbounds float* %tmp12904, i64 1
+  %tmp12906 = getelementptr inbounds float* %tmp12905, i64 1
+  %tmp12907 = getelementptr inbounds float* %tmp12906, i64 1
+  %tmp12908 = getelementptr inbounds float* %tmp12907, i64 1
+  %tmp12909 = getelementptr inbounds float* %tmp12908, i64 1
+  %tmp12910 = getelementptr inbounds float* %tmp12909, i64 1
+  %tmp12911 = getelementptr inbounds float* %tmp12910, i64 1
+  %tmp12912 = getelementptr inbounds float* %tmp12911, i64 1
+  %tmp12913 = getelementptr inbounds float* %tmp12912, i64 1
+  %tmp12914 = getelementptr inbounds float* %tmp12913, i64 1
+  %tmp12915 = getelementptr inbounds float* %tmp12914, i64 1
+  %tmp12916 = getelementptr inbounds float* %tmp12915, i64 1
+  %tmp12917 = getelementptr inbounds float* %tmp12916, i64 1
+  %tmp12918 = getelementptr inbounds float* %tmp12917, i64 1
+  %tmp12919 = getelementptr inbounds float* %tmp12918, i64 1
+  %tmp12920 = getelementptr inbounds float* %tmp12919, i64 1
+  %tmp12921 = getelementptr inbounds float* %tmp12920, i64 1
+  %tmp12922 = getelementptr inbounds float* %tmp12921, i64 1
+  %tmp12923 = getelementptr inbounds float* %tmp12922, i64 1
+  %tmp12924 = getelementptr inbounds float* %tmp12923, i64 1
+  %tmp12925 = getelementptr inbounds float* %tmp12924, i64 1
+  %tmp12926 = getelementptr inbounds float* %tmp12925, i64 1
+  %tmp12927 = getelementptr inbounds float* %tmp12926, i64 1
+  %tmp12928 = getelementptr inbounds float* %tmp12927, i64 1
+  %tmp12929 = getelementptr inbounds float* %tmp12928, i64 1
+  %tmp12930 = getelementptr inbounds float* %tmp12929, i64 1
+  %tmp12931 = getelementptr inbounds float* %tmp12930, i64 1
+  %tmp12932 = getelementptr inbounds float* %tmp12931, i64 1
+  %tmp12933 = getelementptr inbounds float* %tmp12932, i64 1
+  %tmp12934 = getelementptr inbounds float* %tmp12933, i64 1
+  %tmp12935 = getelementptr inbounds float* %tmp12934, i64 1
+  %tmp12936 = getelementptr inbounds float* %tmp12935, i64 1
+  %tmp12937 = getelementptr inbounds float* %tmp12936, i64 1
+  %tmp12938 = getelementptr inbounds float* %tmp12937, i64 1
+  %tmp12939 = getelementptr inbounds float* %tmp12938, i64 1
+  %tmp12940 = getelementptr inbounds float* %tmp12939, i64 1
+  %tmp12941 = getelementptr inbounds float* %tmp12940, i64 1
+  %tmp12942 = getelementptr inbounds float* %tmp12941, i64 1
+  %tmp12943 = getelementptr inbounds float* %tmp12942, i64 1
+  %tmp12944 = getelementptr inbounds float* %tmp12943, i64 1
+  %tmp12945 = getelementptr inbounds float* %tmp12944, i64 1
+  %tmp12946 = getelementptr inbounds float* %tmp12945, i64 1
+  %tmp12947 = getelementptr inbounds float* %tmp12946, i64 1
+  %tmp12948 = getelementptr inbounds float* %tmp12947, i64 1
+  %tmp12949 = getelementptr inbounds float* %tmp12948, i64 1
+  %tmp12950 = getelementptr inbounds float* %tmp12949, i64 1
+  %tmp12951 = getelementptr inbounds float* %tmp12950, i64 1
+  %tmp12952 = getelementptr inbounds float* %tmp12951, i64 1
+  %tmp12953 = getelementptr inbounds float* %tmp12952, i64 1
+  %tmp12954 = getelementptr inbounds float* %tmp12953, i64 1
+  %tmp12955 = getelementptr inbounds float* %tmp12954, i64 1
+  %tmp12956 = getelementptr inbounds float* %tmp12955, i64 1
+  %tmp12957 = getelementptr inbounds float* %tmp12956, i64 1
+  %tmp12958 = getelementptr inbounds float* %tmp12957, i64 1
+  %tmp12959 = getelementptr inbounds float* %tmp12958, i64 1
+  %tmp12960 = getelementptr inbounds float* %tmp12959, i64 1
+  %tmp12961 = getelementptr inbounds float* %tmp12960, i64 1
+  %tmp12962 = getelementptr inbounds float* %tmp12961, i64 1
+  %tmp12963 = getelementptr inbounds float* %tmp12962, i64 1
+  %tmp12964 = getelementptr inbounds float* %tmp12963, i64 1
+  %tmp12965 = getelementptr inbounds float* %tmp12964, i64 1
+  %tmp12966 = getelementptr inbounds float* %tmp12965, i64 1
+  %tmp12967 = getelementptr inbounds float* %tmp12966, i64 1
+  %tmp12968 = getelementptr inbounds float* %tmp12967, i64 1
+  %tmp12969 = getelementptr inbounds float* %tmp12968, i64 1
+  %tmp12970 = getelementptr inbounds float* %tmp12969, i64 1
+  %tmp12971 = getelementptr inbounds float* %tmp12970, i64 1
+  %tmp12972 = getelementptr inbounds float* %tmp12971, i64 1
+  %tmp12973 = getelementptr inbounds float* %tmp12972, i64 1
+  %tmp12974 = getelementptr inbounds float* %tmp12973, i64 1
+  %tmp12975 = getelementptr inbounds float* %tmp12974, i64 1
+  %tmp12976 = getelementptr inbounds float* %tmp12975, i64 1
+  %tmp12977 = getelementptr inbounds float* %tmp12976, i64 1
+  %tmp12978 = getelementptr inbounds float* %tmp12977, i64 1
+  %tmp12979 = getelementptr inbounds float* %tmp12978, i64 1
+  %tmp12980 = getelementptr inbounds float* %tmp12979, i64 1
+  %tmp12981 = getelementptr inbounds float* %tmp12980, i64 1
+  %tmp12982 = getelementptr inbounds float* %tmp12981, i64 1
+  %tmp12983 = getelementptr inbounds float* %tmp12982, i64 1
+  %tmp12984 = getelementptr inbounds float* %tmp12983, i64 1
+  %tmp12985 = getelementptr inbounds float* %tmp12984, i64 1
+  %tmp12986 = getelementptr inbounds float* %tmp12985, i64 1
+  %tmp12987 = getelementptr inbounds float* %tmp12986, i64 1
+  %tmp12988 = getelementptr inbounds float* %tmp12987, i64 1
+  %tmp12989 = getelementptr inbounds float* %tmp12988, i64 1
+  %tmp12990 = getelementptr inbounds float* %tmp12989, i64 1
+  %tmp12991 = getelementptr inbounds float* %tmp12990, i64 1
+  %tmp12992 = getelementptr inbounds float* %tmp12991, i64 1
+  %tmp12993 = getelementptr inbounds float* %tmp12992, i64 1
+  %tmp12994 = getelementptr inbounds float* %tmp12993, i64 1
+  %tmp12995 = getelementptr inbounds float* %tmp12994, i64 1
+  %tmp12996 = getelementptr inbounds float* %tmp12995, i64 1
+  %tmp12997 = getelementptr inbounds float* %tmp12996, i64 1
+  %tmp12998 = getelementptr inbounds float* %tmp12997, i64 1
+  %tmp12999 = getelementptr inbounds float* %tmp12998, i64 1
+  %tmp13000 = getelementptr inbounds float* %tmp12999, i64 1
+  %tmp13001 = getelementptr inbounds float* %tmp13000, i64 1
+  %tmp13002 = getelementptr inbounds float* %tmp13001, i64 1
+  %tmp13003 = getelementptr inbounds float* %tmp13002, i64 1
+  %tmp13004 = getelementptr inbounds float* %tmp13003, i64 1
+  %tmp13005 = getelementptr inbounds float* %tmp13004, i64 1
+  %tmp13006 = getelementptr inbounds float* %tmp13005, i64 1
+  %tmp13007 = getelementptr inbounds float* %tmp13006, i64 1
+  %tmp13008 = getelementptr inbounds float* %tmp13007, i64 1
+  %tmp13009 = getelementptr inbounds float* %tmp13008, i64 1
+  %tmp13010 = getelementptr inbounds float* %tmp13009, i64 1
+  %tmp13011 = getelementptr inbounds float* %tmp13010, i64 1
+  %tmp13012 = getelementptr inbounds float* %tmp13011, i64 1
+  %tmp13013 = getelementptr inbounds float* %tmp13012, i64 1
+  %tmp13014 = getelementptr inbounds float* %tmp13013, i64 1
+  %tmp13015 = getelementptr inbounds float* %tmp13014, i64 1
+  %tmp13016 = getelementptr inbounds float* %tmp13015, i64 1
+  %tmp13017 = getelementptr inbounds float* %tmp13016, i64 1
+  %tmp13018 = getelementptr inbounds float* %tmp13017, i64 1
+  %tmp13019 = getelementptr inbounds float* %tmp13018, i64 1
+  %tmp13020 = getelementptr inbounds float* %tmp13019, i64 1
+  %tmp13021 = getelementptr inbounds float* %tmp13020, i64 1
+  %tmp13022 = getelementptr inbounds float* %tmp13021, i64 1
+  %tmp13023 = getelementptr inbounds float* %tmp13022, i64 1
+  %tmp13024 = getelementptr inbounds float* %tmp13023, i64 1
+  %tmp13025 = getelementptr inbounds float* %tmp13024, i64 1
+  %tmp13026 = getelementptr inbounds float* %tmp13025, i64 1
+  %tmp13027 = getelementptr inbounds float* %tmp13026, i64 1
+  %tmp13028 = getelementptr inbounds float* %tmp13027, i64 1
+  %tmp13029 = getelementptr inbounds float* %tmp13028, i64 1
+  %tmp13030 = getelementptr inbounds float* %tmp13029, i64 1
+  %tmp13031 = getelementptr inbounds float* %tmp13030, i64 1
+  %tmp13032 = getelementptr inbounds float* %tmp13031, i64 1
+  %tmp13033 = getelementptr inbounds float* %tmp13032, i64 1
+  %tmp13034 = getelementptr inbounds float* %tmp13033, i64 1
+  %tmp13035 = getelementptr inbounds float* %tmp13034, i64 1
+  %tmp13036 = getelementptr inbounds float* %tmp13035, i64 1
+  %tmp13037 = getelementptr inbounds float* %tmp13036, i64 1
+  %tmp13038 = getelementptr inbounds float* %tmp13037, i64 1
+  %tmp13039 = getelementptr inbounds float* %tmp13038, i64 1
+  %tmp13040 = getelementptr inbounds float* %tmp13039, i64 1
+  %tmp13041 = getelementptr inbounds float* %tmp13040, i64 1
+  %tmp13042 = getelementptr inbounds float* %tmp13041, i64 1
+  %tmp13043 = getelementptr inbounds float* %tmp13042, i64 1
+  %tmp13044 = getelementptr inbounds float* %tmp13043, i64 1
+  %tmp13045 = getelementptr inbounds float* %tmp13044, i64 1
+  %tmp13046 = getelementptr inbounds float* %tmp13045, i64 1
+  %tmp13047 = getelementptr inbounds float* %tmp13046, i64 1
+  %tmp13048 = getelementptr inbounds float* %tmp13047, i64 1
+  %tmp13049 = getelementptr inbounds float* %tmp13048, i64 1
+  %tmp13050 = getelementptr inbounds float* %tmp13049, i64 1
+  %tmp13051 = getelementptr inbounds float* %tmp13050, i64 1
+  %tmp13052 = getelementptr inbounds float* %tmp13051, i64 1
+  %tmp13053 = getelementptr inbounds float* %tmp13052, i64 1
+  %tmp13054 = getelementptr inbounds float* %tmp13053, i64 1
+  %tmp13055 = getelementptr inbounds float* %tmp13054, i64 1
+  %tmp13056 = getelementptr inbounds float* %tmp13055, i64 1
+  %tmp13057 = getelementptr inbounds float* %tmp13056, i64 1
+  %tmp13058 = getelementptr inbounds float* %tmp13057, i64 1
+  %tmp13059 = getelementptr inbounds float* %tmp13058, i64 1
+  %tmp13060 = getelementptr inbounds float* %tmp13059, i64 1
+  %tmp13061 = getelementptr inbounds float* %tmp13060, i64 1
+  %tmp13062 = getelementptr inbounds float* %tmp13061, i64 1
+  %tmp13063 = getelementptr inbounds float* %tmp13062, i64 1
+  %tmp13064 = getelementptr inbounds float* %tmp13063, i64 1
+  %tmp13065 = getelementptr inbounds float* %tmp13064, i64 1
+  %tmp13066 = getelementptr inbounds float* %tmp13065, i64 1
+  %tmp13067 = getelementptr inbounds float* %tmp13066, i64 1
+  %tmp13068 = getelementptr inbounds float* %tmp13067, i64 1
+  %tmp13069 = getelementptr inbounds float* %tmp13068, i64 1
+  %tmp13070 = getelementptr inbounds float* %tmp13069, i64 1
+  %tmp13071 = getelementptr inbounds float* %tmp13070, i64 1
+  %tmp13072 = getelementptr inbounds float* %tmp13071, i64 1
+  %tmp13073 = getelementptr inbounds float* %tmp13072, i64 1
+  %tmp13074 = getelementptr inbounds float* %tmp13073, i64 1
+  %tmp13075 = getelementptr inbounds float* %tmp13074, i64 1
+  %tmp13076 = getelementptr inbounds float* %tmp13075, i64 1
+  %tmp13077 = getelementptr inbounds float* %tmp13076, i64 1
+  %tmp13078 = getelementptr inbounds float* %tmp13077, i64 1
+  %tmp13079 = getelementptr inbounds float* %tmp13078, i64 1
+  %tmp13080 = getelementptr inbounds float* %tmp13079, i64 1
+  %tmp13081 = getelementptr inbounds float* %tmp13080, i64 1
+  %tmp13082 = getelementptr inbounds float* %tmp13081, i64 1
+  %tmp13083 = getelementptr inbounds float* %tmp13082, i64 1
+  %tmp13084 = getelementptr inbounds float* %tmp13083, i64 1
+  %tmp13085 = getelementptr inbounds float* %tmp13084, i64 1
+  %tmp13086 = getelementptr inbounds float* %tmp13085, i64 1
+  %tmp13087 = getelementptr inbounds float* %tmp13086, i64 1
+  %tmp13088 = getelementptr inbounds float* %tmp13087, i64 1
+  %tmp13089 = getelementptr inbounds float* %tmp13088, i64 1
+  %tmp13090 = getelementptr inbounds float* %tmp13089, i64 1
+  %tmp13091 = getelementptr inbounds float* %tmp13090, i64 1
+  %tmp13092 = getelementptr inbounds float* %tmp13091, i64 1
+  %tmp13093 = getelementptr inbounds float* %tmp13092, i64 1
+  %tmp13094 = getelementptr inbounds float* %tmp13093, i64 1
+  %tmp13095 = getelementptr inbounds float* %tmp13094, i64 1
+  %tmp13096 = getelementptr inbounds float* %tmp13095, i64 1
+  %tmp13097 = getelementptr inbounds float* %tmp13096, i64 1
+  %tmp13098 = getelementptr inbounds float* %tmp13097, i64 1
+  %tmp13099 = getelementptr inbounds float* %tmp13098, i64 1
+  %tmp13100 = getelementptr inbounds float* %tmp13099, i64 1
+  %tmp13101 = getelementptr inbounds float* %tmp13100, i64 1
+  %tmp13102 = getelementptr inbounds float* %tmp13101, i64 1
+  %tmp13103 = getelementptr inbounds float* %tmp13102, i64 1
+  %tmp13104 = getelementptr inbounds float* %tmp13103, i64 1
+  %tmp13105 = getelementptr inbounds float* %tmp13104, i64 1
+  %tmp13106 = getelementptr inbounds float* %tmp13105, i64 1
+  %tmp13107 = getelementptr inbounds float* %tmp13106, i64 1
+  %tmp13108 = getelementptr inbounds float* %tmp13107, i64 1
+  %tmp13109 = getelementptr inbounds float* %tmp13108, i64 1
+  %tmp13110 = getelementptr inbounds float* %tmp13109, i64 1
+  %tmp13111 = getelementptr inbounds float* %tmp13110, i64 1
+  %tmp13112 = getelementptr inbounds float* %tmp13111, i64 1
+  %tmp13113 = getelementptr inbounds float* %tmp13112, i64 1
+  %tmp13114 = getelementptr inbounds float* %tmp13113, i64 1
+  %tmp13115 = getelementptr inbounds float* %tmp13114, i64 1
+  %tmp13116 = getelementptr inbounds float* %tmp13115, i64 1
+  %tmp13117 = getelementptr inbounds float* %tmp13116, i64 1
+  %tmp13118 = getelementptr inbounds float* %tmp13117, i64 1
+  %tmp13119 = getelementptr inbounds float* %tmp13118, i64 1
+  %tmp13120 = getelementptr inbounds float* %tmp13119, i64 1
+  %tmp13121 = getelementptr inbounds float* %tmp13120, i64 1
+  %tmp13122 = getelementptr inbounds float* %tmp13121, i64 1
+  %tmp13123 = getelementptr inbounds float* %tmp13122, i64 1
+  %tmp13124 = getelementptr inbounds float* %tmp13123, i64 1
+  %tmp13125 = getelementptr inbounds float* %tmp13124, i64 1
+  %tmp13126 = getelementptr inbounds float* %tmp13125, i64 1
+  %tmp13127 = getelementptr inbounds float* %tmp13126, i64 1
+  %tmp13128 = getelementptr inbounds float* %tmp13127, i64 1
+  %tmp13129 = getelementptr inbounds float* %tmp13128, i64 1
+  %tmp13130 = getelementptr inbounds float* %tmp13129, i64 1
+  %tmp13131 = getelementptr inbounds float* %tmp13130, i64 1
+  %tmp13132 = getelementptr inbounds float* %tmp13131, i64 1
+  %tmp13133 = getelementptr inbounds float* %tmp13132, i64 1
+  %tmp13134 = getelementptr inbounds float* %tmp13133, i64 1
+  %tmp13135 = getelementptr inbounds float* %tmp13134, i64 1
+  %tmp13136 = getelementptr inbounds float* %tmp13135, i64 1
+  %tmp13137 = getelementptr inbounds float* %tmp13136, i64 1
+  %tmp13138 = getelementptr inbounds float* %tmp13137, i64 1
+  %tmp13139 = getelementptr inbounds float* %tmp13138, i64 1
+  %tmp13140 = getelementptr inbounds float* %tmp13139, i64 1
+  %tmp13141 = getelementptr inbounds float* %tmp13140, i64 1
+  %tmp13142 = getelementptr inbounds float* %tmp13141, i64 1
+  %tmp13143 = getelementptr inbounds float* %tmp13142, i64 1
+  %tmp13144 = getelementptr inbounds float* %tmp13143, i64 1
+  %tmp13145 = getelementptr inbounds float* %tmp13144, i64 1
+  %tmp13146 = getelementptr inbounds float* %tmp13145, i64 1
+  %tmp13147 = getelementptr inbounds float* %tmp13146, i64 1
+  %tmp13148 = getelementptr inbounds float* %tmp13147, i64 1
+  %tmp13149 = getelementptr inbounds float* %tmp13148, i64 1
+  %tmp13150 = getelementptr inbounds float* %tmp13149, i64 1
+  %tmp13151 = getelementptr inbounds float* %tmp13150, i64 1
+  %tmp13152 = getelementptr inbounds float* %tmp13151, i64 1
+  %tmp13153 = getelementptr inbounds float* %tmp13152, i64 1
+  %tmp13154 = getelementptr inbounds float* %tmp13153, i64 1
+  %tmp13155 = getelementptr inbounds float* %tmp13154, i64 1
+  %tmp13156 = getelementptr inbounds float* %tmp13155, i64 1
+  %tmp13157 = getelementptr inbounds float* %tmp13156, i64 1
+  %tmp13158 = getelementptr inbounds float* %tmp13157, i64 1
+  %tmp13159 = getelementptr inbounds float* %tmp13158, i64 1
+  %tmp13160 = getelementptr inbounds float* %tmp13159, i64 1
+  %tmp13161 = getelementptr inbounds float* %tmp13160, i64 1
+  %tmp13162 = getelementptr inbounds float* %tmp13161, i64 1
+  %tmp13163 = getelementptr inbounds float* %tmp13162, i64 1
+  %tmp13164 = getelementptr inbounds float* %tmp13163, i64 1
+  %tmp13165 = getelementptr inbounds float* %tmp13164, i64 1
+  %tmp13166 = getelementptr inbounds float* %tmp13165, i64 1
+  %tmp13167 = getelementptr inbounds float* %tmp13166, i64 1
+  %tmp13168 = getelementptr inbounds float* %tmp13167, i64 1
+  %tmp13169 = getelementptr inbounds float* %tmp13168, i64 1
+  %tmp13170 = getelementptr inbounds float* %tmp13169, i64 1
+  %tmp13171 = getelementptr inbounds float* %tmp13170, i64 1
+  %tmp13172 = getelementptr inbounds float* %tmp13171, i64 1
+  %tmp13173 = getelementptr inbounds float* %tmp13172, i64 1
+  %tmp13174 = getelementptr inbounds float* %tmp13173, i64 1
+  %tmp13175 = getelementptr inbounds float* %tmp13174, i64 1
+  %tmp13176 = getelementptr inbounds float* %tmp13175, i64 1
+  %tmp13177 = getelementptr inbounds float* %tmp13176, i64 1
+  %tmp13178 = getelementptr inbounds float* %tmp13177, i64 1
+  %tmp13179 = getelementptr inbounds float* %tmp13178, i64 1
+  %tmp13180 = getelementptr inbounds float* %tmp13179, i64 1
+  %tmp13181 = getelementptr inbounds float* %tmp13180, i64 1
+  %tmp13182 = getelementptr inbounds float* %tmp13181, i64 1
+  %tmp13183 = getelementptr inbounds float* %tmp13182, i64 1
+  %tmp13184 = getelementptr inbounds float* %tmp13183, i64 1
+  %tmp13185 = getelementptr inbounds float* %tmp13184, i64 1
+  %tmp13186 = getelementptr inbounds float* %tmp13185, i64 1
+  %tmp13187 = getelementptr inbounds float* %tmp13186, i64 1
+  %tmp13188 = getelementptr inbounds float* %tmp13187, i64 1
+  %tmp13189 = getelementptr inbounds float* %tmp13188, i64 1
+  %tmp13190 = getelementptr inbounds float* %tmp13189, i64 1
+  %tmp13191 = getelementptr inbounds float* %tmp13190, i64 1
+  %tmp13192 = getelementptr inbounds float* %tmp13191, i64 1
+  %tmp13193 = getelementptr inbounds float* %tmp13192, i64 1
+  %tmp13194 = getelementptr inbounds float* %tmp13193, i64 1
+  %tmp13195 = getelementptr inbounds float* %tmp13194, i64 1
+  %tmp13196 = getelementptr inbounds float* %tmp13195, i64 1
+  %tmp13197 = getelementptr inbounds float* %tmp13196, i64 1
+  %tmp13198 = getelementptr inbounds float* %tmp13197, i64 1
+  %tmp13199 = getelementptr inbounds float* %tmp13198, i64 1
+  %tmp13200 = getelementptr inbounds float* %tmp13199, i64 1
+  %tmp13201 = getelementptr inbounds float* %tmp13200, i64 1
+  %tmp13202 = getelementptr inbounds float* %tmp13201, i64 1
+  %tmp13203 = getelementptr inbounds float* %tmp13202, i64 1
+  %tmp13204 = getelementptr inbounds float* %tmp13203, i64 1
+  %tmp13205 = getelementptr inbounds float* %tmp13204, i64 1
+  %tmp13206 = getelementptr inbounds float* %tmp13205, i64 1
+  %tmp13207 = getelementptr inbounds float* %tmp13206, i64 1
+  %tmp13208 = getelementptr inbounds float* %tmp13207, i64 1
+  %tmp13209 = getelementptr inbounds float* %tmp13208, i64 1
+  %tmp13210 = getelementptr inbounds float* %tmp13209, i64 1
+  %tmp13211 = getelementptr inbounds float* %tmp13210, i64 1
+  %tmp13212 = getelementptr inbounds float* %tmp13211, i64 1
+  %tmp13213 = getelementptr inbounds float* %tmp13212, i64 1
+  %tmp13214 = getelementptr inbounds float* %tmp13213, i64 1
+  %tmp13215 = getelementptr inbounds float* %tmp13214, i64 1
+  %tmp13216 = getelementptr inbounds float* %tmp13215, i64 1
+  %tmp13217 = getelementptr inbounds float* %tmp13216, i64 1
+  %tmp13218 = getelementptr inbounds float* %tmp13217, i64 1
+  %tmp13219 = getelementptr inbounds float* %tmp13218, i64 1
+  %tmp13220 = getelementptr inbounds float* %tmp13219, i64 1
+  %tmp13221 = getelementptr inbounds float* %tmp13220, i64 1
+  %tmp13222 = getelementptr inbounds float* %tmp13221, i64 1
+  %tmp13223 = getelementptr inbounds float* %tmp13222, i64 1
+  %tmp13224 = getelementptr inbounds float* %tmp13223, i64 1
+  %tmp13225 = getelementptr inbounds float* %tmp13224, i64 1
+  %tmp13226 = getelementptr inbounds float* %tmp13225, i64 1
+  %tmp13227 = getelementptr inbounds float* %tmp13226, i64 1
+  %tmp13228 = getelementptr inbounds float* %tmp13227, i64 1
+  %tmp13229 = getelementptr inbounds float* %tmp13228, i64 1
+  %tmp13230 = getelementptr inbounds float* %tmp13229, i64 1
+  %tmp13231 = getelementptr inbounds float* %tmp13230, i64 1
+  %tmp13232 = getelementptr inbounds float* %tmp13231, i64 1
+  %tmp13233 = getelementptr inbounds float* %tmp13232, i64 1
+  %tmp13234 = getelementptr inbounds float* %tmp13233, i64 1
+  %tmp13235 = getelementptr inbounds float* %tmp13234, i64 1
+  %tmp13236 = getelementptr inbounds float* %tmp13235, i64 1
+  %tmp13237 = getelementptr inbounds float* %tmp13236, i64 1
+  %tmp13238 = getelementptr inbounds float* %tmp13237, i64 1
+  %tmp13239 = getelementptr inbounds float* %tmp13238, i64 1
+  %tmp13240 = getelementptr inbounds float* %tmp13239, i64 1
+  %tmp13241 = getelementptr inbounds float* %tmp13240, i64 1
+  %tmp13242 = getelementptr inbounds float* %tmp13241, i64 1
+  %tmp13243 = getelementptr inbounds float* %tmp13242, i64 1
+  %tmp13244 = getelementptr inbounds float* %tmp13243, i64 1
+  %tmp13245 = getelementptr inbounds float* %tmp13244, i64 1
+  %tmp13246 = getelementptr inbounds float* %tmp13245, i64 1
+  %tmp13247 = getelementptr inbounds float* %tmp13246, i64 1
+  %tmp13248 = getelementptr inbounds float* %tmp13247, i64 1
+  %tmp13249 = getelementptr inbounds float* %tmp13248, i64 1
+  %tmp13250 = getelementptr inbounds float* %tmp13249, i64 1
+  %tmp13251 = getelementptr inbounds float* %tmp13250, i64 1
+  %tmp13252 = getelementptr inbounds float* %tmp13251, i64 1
+  %tmp13253 = getelementptr inbounds float* %tmp13252, i64 1
+  %tmp13254 = getelementptr inbounds float* %tmp13253, i64 1
+  %tmp13255 = getelementptr inbounds float* %tmp13254, i64 1
+  %tmp13256 = getelementptr inbounds float* %tmp13255, i64 1
+  %tmp13257 = getelementptr inbounds float* %tmp13256, i64 1
+  %tmp13258 = getelementptr inbounds float* %tmp13257, i64 1
+  %tmp13259 = getelementptr inbounds float* %tmp13258, i64 1
+  %tmp13260 = getelementptr inbounds float* %tmp13259, i64 1
+  %tmp13261 = getelementptr inbounds float* %tmp13260, i64 1
+  %tmp13262 = getelementptr inbounds float* %tmp13261, i64 1
+  %tmp13263 = getelementptr inbounds float* %tmp13262, i64 1
+  %tmp13264 = getelementptr inbounds float* %tmp13263, i64 1
+  %tmp13265 = getelementptr inbounds float* %tmp13264, i64 1
+  %tmp13266 = getelementptr inbounds float* %tmp13265, i64 1
+  %tmp13267 = getelementptr inbounds float* %tmp13266, i64 1
+  %tmp13268 = getelementptr inbounds float* %tmp13267, i64 1
+  %tmp13269 = getelementptr inbounds float* %tmp13268, i64 1
+  %tmp13270 = getelementptr inbounds float* %tmp13269, i64 1
+  %tmp13271 = getelementptr inbounds float* %tmp13270, i64 1
+  %tmp13272 = getelementptr inbounds float* %tmp13271, i64 1
+  %tmp13273 = getelementptr inbounds float* %tmp13272, i64 1
+  %tmp13274 = getelementptr inbounds float* %tmp13273, i64 1
+  %tmp13275 = getelementptr inbounds float* %tmp13274, i64 1
+  %tmp13276 = getelementptr inbounds float* %tmp13275, i64 1
+  %tmp13277 = getelementptr inbounds float* %tmp13276, i64 1
+  %tmp13278 = getelementptr inbounds float* %tmp13277, i64 1
+  %tmp13279 = getelementptr inbounds float* %tmp13278, i64 1
+  %tmp13280 = getelementptr inbounds float* %tmp13279, i64 1
+  %tmp13281 = getelementptr inbounds float* %tmp13280, i64 1
+  %tmp13282 = getelementptr inbounds float* %tmp13281, i64 1
+  %tmp13283 = getelementptr inbounds float* %tmp13282, i64 1
+  %tmp13284 = getelementptr inbounds float* %tmp13283, i64 1
+  %tmp13285 = getelementptr inbounds float* %tmp13284, i64 1
+  %tmp13286 = getelementptr inbounds float* %tmp13285, i64 1
+  %tmp13287 = getelementptr inbounds float* %tmp13286, i64 1
+  %tmp13288 = getelementptr inbounds float* %tmp13287, i64 1
+  %tmp13289 = getelementptr inbounds float* %tmp13288, i64 1
+  %tmp13290 = getelementptr inbounds float* %tmp13289, i64 1
+  %tmp13291 = getelementptr inbounds float* %tmp13290, i64 1
+  %tmp13292 = getelementptr inbounds float* %tmp13291, i64 1
+  %tmp13293 = getelementptr inbounds float* %tmp13292, i64 1
+  %tmp13294 = getelementptr inbounds float* %tmp13293, i64 1
+  %tmp13295 = getelementptr inbounds float* %tmp13294, i64 1
+  %tmp13296 = getelementptr inbounds float* %tmp13295, i64 1
+  %tmp13297 = getelementptr inbounds float* %tmp13296, i64 1
+  %tmp13298 = getelementptr inbounds float* %tmp13297, i64 1
+  %tmp13299 = getelementptr inbounds float* %tmp13298, i64 1
+  %tmp13300 = getelementptr inbounds float* %tmp13299, i64 1
+  %tmp13301 = getelementptr inbounds float* %tmp13300, i64 1
+  %tmp13302 = getelementptr inbounds float* %tmp13301, i64 1
+  %tmp13303 = getelementptr inbounds float* %tmp13302, i64 1
+  %tmp13304 = getelementptr inbounds float* %tmp13303, i64 1
+  %tmp13305 = getelementptr inbounds float* %tmp13304, i64 1
+  %tmp13306 = getelementptr inbounds float* %tmp13305, i64 1
+  %tmp13307 = getelementptr inbounds float* %tmp13306, i64 1
+  %tmp13308 = getelementptr inbounds float* %tmp13307, i64 1
+  %tmp13309 = getelementptr inbounds float* %tmp13308, i64 1
+  %tmp13310 = getelementptr inbounds float* %tmp13309, i64 1
+  %tmp13311 = getelementptr inbounds float* %tmp13310, i64 1
+  %tmp13312 = getelementptr inbounds float* %tmp13311, i64 1
+  %tmp13313 = getelementptr inbounds float* %tmp13312, i64 1
+  %tmp13314 = getelementptr inbounds float* %tmp13313, i64 1
+  %tmp13315 = getelementptr inbounds float* %tmp13314, i64 1
+  %tmp13316 = getelementptr inbounds float* %tmp13315, i64 1
+  %tmp13317 = getelementptr inbounds float* %tmp13316, i64 1
+  %tmp13318 = getelementptr inbounds float* %tmp13317, i64 1
+  %tmp13319 = getelementptr inbounds float* %tmp13318, i64 1
+  %tmp13320 = getelementptr inbounds float* %tmp13319, i64 1
+  %tmp13321 = getelementptr inbounds float* %tmp13320, i64 1
+  %tmp13322 = getelementptr inbounds float* %tmp13321, i64 1
+  %tmp13323 = getelementptr inbounds float* %tmp13322, i64 1
+  %tmp13324 = getelementptr inbounds float* %tmp13323, i64 1
+  %tmp13325 = getelementptr inbounds float* %tmp13324, i64 1
+  %tmp13326 = getelementptr inbounds float* %tmp13325, i64 1
+  %tmp13327 = getelementptr inbounds float* %tmp13326, i64 1
+  %tmp13328 = getelementptr inbounds float* %tmp13327, i64 1
+  %tmp13329 = getelementptr inbounds float* %tmp13328, i64 1
+  %tmp13330 = getelementptr inbounds float* %tmp13329, i64 1
+  %tmp13331 = getelementptr inbounds float* %tmp13330, i64 1
+  %tmp13332 = getelementptr inbounds float* %tmp13331, i64 1
+  %tmp13333 = getelementptr inbounds float* %tmp13332, i64 1
+  %tmp13334 = getelementptr inbounds float* %tmp13333, i64 1
+  %tmp13335 = getelementptr inbounds float* %tmp13334, i64 1
+  %tmp13336 = getelementptr inbounds float* %tmp13335, i64 1
+  %tmp13337 = getelementptr inbounds float* %tmp13336, i64 1
+  %tmp13338 = getelementptr inbounds float* %tmp13337, i64 1
+  %tmp13339 = getelementptr inbounds float* %tmp13338, i64 1
+  %tmp13340 = getelementptr inbounds float* %tmp13339, i64 1
+  %tmp13341 = getelementptr inbounds float* %tmp13340, i64 1
+  %tmp13342 = getelementptr inbounds float* %tmp13341, i64 1
+  %tmp13343 = getelementptr inbounds float* %tmp13342, i64 1
+  %tmp13344 = getelementptr inbounds float* %tmp13343, i64 1
+  %tmp13345 = getelementptr inbounds float* %tmp13344, i64 1
+  %tmp13346 = getelementptr inbounds float* %tmp13345, i64 1
+  %tmp13347 = getelementptr inbounds float* %tmp13346, i64 1
+  %tmp13348 = getelementptr inbounds float* %tmp13347, i64 1
+  %tmp13349 = getelementptr inbounds float* %tmp13348, i64 1
+  %tmp13350 = getelementptr inbounds float* %tmp13349, i64 1
+  %tmp13351 = getelementptr inbounds float* %tmp13350, i64 1
+  %tmp13352 = getelementptr inbounds float* %tmp13351, i64 1
+  %tmp13353 = getelementptr inbounds float* %tmp13352, i64 1
+  %tmp13354 = getelementptr inbounds float* %tmp13353, i64 1
+  %tmp13355 = getelementptr inbounds float* %tmp13354, i64 1
+  %tmp13356 = getelementptr inbounds float* %tmp13355, i64 1
+  %tmp13357 = getelementptr inbounds float* %tmp13356, i64 1
+  %tmp13358 = getelementptr inbounds float* %tmp13357, i64 1
+  %tmp13359 = getelementptr inbounds float* %tmp13358, i64 1
+  %tmp13360 = getelementptr inbounds float* %tmp13359, i64 1
+  %tmp13361 = getelementptr inbounds float* %tmp13360, i64 1
+  %tmp13362 = getelementptr inbounds float* %tmp13361, i64 1
+  %tmp13363 = getelementptr inbounds float* %tmp13362, i64 1
+  %tmp13364 = getelementptr inbounds float* %tmp13363, i64 1
+  %tmp13365 = getelementptr inbounds float* %tmp13364, i64 1
+  %tmp13366 = getelementptr inbounds float* %tmp13365, i64 1
+  %tmp13367 = getelementptr inbounds float* %tmp13366, i64 1
+  %tmp13368 = getelementptr inbounds float* %tmp13367, i64 1
+  %tmp13369 = getelementptr inbounds float* %tmp13368, i64 1
+  %tmp13370 = getelementptr inbounds float* %tmp13369, i64 1
+  %tmp13371 = getelementptr inbounds float* %tmp13370, i64 1
+  %tmp13372 = getelementptr inbounds float* %tmp13371, i64 1
+  %tmp13373 = getelementptr inbounds float* %tmp13372, i64 1
+  %tmp13374 = getelementptr inbounds float* %tmp13373, i64 1
+  %tmp13375 = getelementptr inbounds float* %tmp13374, i64 1
+  %tmp13376 = getelementptr inbounds float* %tmp13375, i64 1
+  %tmp13377 = getelementptr inbounds float* %tmp13376, i64 1
+  %tmp13378 = getelementptr inbounds float* %tmp13377, i64 1
+  %tmp13379 = getelementptr inbounds float* %tmp13378, i64 1
+  %tmp13380 = getelementptr inbounds float* %tmp13379, i64 1
+  %tmp13381 = getelementptr inbounds float* %tmp13380, i64 1
+  %tmp13382 = getelementptr inbounds float* %tmp13381, i64 1
+  %tmp13383 = getelementptr inbounds float* %tmp13382, i64 1
+  %tmp13384 = getelementptr inbounds float* %tmp13383, i64 1
+  %tmp13385 = getelementptr inbounds float* %tmp13384, i64 1
+  %tmp13386 = getelementptr inbounds float* %tmp13385, i64 1
+  %tmp13387 = getelementptr inbounds float* %tmp13386, i64 1
+  %tmp13388 = getelementptr inbounds float* %tmp13387, i64 1
+  %tmp13389 = getelementptr inbounds float* %tmp13388, i64 1
+  %tmp13390 = getelementptr inbounds float* %tmp13389, i64 1
+  %tmp13391 = getelementptr inbounds float* %tmp13390, i64 1
+  %tmp13392 = getelementptr inbounds float* %tmp13391, i64 1
+  %tmp13393 = getelementptr inbounds float* %tmp13392, i64 1
+  %tmp13394 = getelementptr inbounds float* %tmp13393, i64 1
+  %tmp13395 = getelementptr inbounds float* %tmp13394, i64 1
+  %tmp13396 = getelementptr inbounds float* %tmp13395, i64 1
+  %tmp13397 = getelementptr inbounds float* %tmp13396, i64 1
+  %tmp13398 = getelementptr inbounds float* %tmp13397, i64 1
+  %tmp13399 = getelementptr inbounds float* %tmp13398, i64 1
+  %tmp13400 = getelementptr inbounds float* %tmp13399, i64 1
+  %tmp13401 = getelementptr inbounds float* %tmp13400, i64 1
+  %tmp13402 = getelementptr inbounds float* %tmp13401, i64 1
+  %tmp13403 = getelementptr inbounds float* %tmp13402, i64 1
+  %tmp13404 = getelementptr inbounds float* %tmp13403, i64 1
+  %tmp13405 = getelementptr inbounds float* %tmp13404, i64 1
+  %tmp13406 = getelementptr inbounds float* %tmp13405, i64 1
+  %tmp13407 = getelementptr inbounds float* %tmp13406, i64 1
+  %tmp13408 = getelementptr inbounds float* %tmp13407, i64 1
+  %tmp13409 = getelementptr inbounds float* %tmp13408, i64 1
+  %tmp13410 = getelementptr inbounds float* %tmp13409, i64 1
+  %tmp13411 = getelementptr inbounds float* %tmp13410, i64 1
+  %tmp13412 = getelementptr inbounds float* %tmp13411, i64 1
+  %tmp13413 = getelementptr inbounds float* %tmp13412, i64 1
+  %tmp13414 = getelementptr inbounds float* %tmp13413, i64 1
+  %tmp13415 = getelementptr inbounds float* %tmp13414, i64 1
+  %tmp13416 = getelementptr inbounds float* %tmp13415, i64 1
+  %tmp13417 = getelementptr inbounds float* %tmp13416, i64 1
+  %tmp13418 = getelementptr inbounds float* %tmp13417, i64 1
+  %tmp13419 = getelementptr inbounds float* %tmp13418, i64 1
+  %tmp13420 = getelementptr inbounds float* %tmp13419, i64 1
+  %tmp13421 = getelementptr inbounds float* %tmp13420, i64 1
+  %tmp13422 = getelementptr inbounds float* %tmp13421, i64 1
+  %tmp13423 = getelementptr inbounds float* %tmp13422, i64 1
+  %tmp13424 = getelementptr inbounds float* %tmp13423, i64 1
+  %tmp13425 = getelementptr inbounds float* %tmp13424, i64 1
+  %tmp13426 = getelementptr inbounds float* %tmp13425, i64 1
+  %tmp13427 = getelementptr inbounds float* %tmp13426, i64 1
+  %tmp13428 = getelementptr inbounds float* %tmp13427, i64 1
+  %tmp13429 = getelementptr inbounds float* %tmp13428, i64 1
+  %tmp13430 = getelementptr inbounds float* %tmp13429, i64 1
+  %tmp13431 = getelementptr inbounds float* %tmp13430, i64 1
+  %tmp13432 = getelementptr inbounds float* %tmp13431, i64 1
+  %tmp13433 = getelementptr inbounds float* %tmp13432, i64 1
+  %tmp13434 = getelementptr inbounds float* %tmp13433, i64 1
+  %tmp13435 = getelementptr inbounds float* %tmp13434, i64 1
+  %tmp13436 = getelementptr inbounds float* %tmp13435, i64 1
+  %tmp13437 = getelementptr inbounds float* %tmp13436, i64 1
+  %tmp13438 = getelementptr inbounds float* %tmp13437, i64 1
+  %tmp13439 = getelementptr inbounds float* %tmp13438, i64 1
+  %tmp13440 = getelementptr inbounds float* %tmp13439, i64 1
+  %tmp13441 = getelementptr inbounds float* %tmp13440, i64 1
+  %tmp13442 = getelementptr inbounds float* %tmp13441, i64 1
+  %tmp13443 = getelementptr inbounds float* %tmp13442, i64 1
+  %tmp13444 = getelementptr inbounds float* %tmp13443, i64 1
+  %tmp13445 = getelementptr inbounds float* %tmp13444, i64 1
+  %tmp13446 = getelementptr inbounds float* %tmp13445, i64 1
+  %tmp13447 = getelementptr inbounds float* %tmp13446, i64 1
+  %tmp13448 = getelementptr inbounds float* %tmp13447, i64 1
+  %tmp13449 = getelementptr inbounds float* %tmp13448, i64 1
+  %tmp13450 = getelementptr inbounds float* %tmp13449, i64 1
+  %tmp13451 = getelementptr inbounds float* %tmp13450, i64 1
+  %tmp13452 = getelementptr inbounds float* %tmp13451, i64 1
+  %tmp13453 = getelementptr inbounds float* %tmp13452, i64 1
+  %tmp13454 = getelementptr inbounds float* %tmp13453, i64 1
+  %tmp13455 = getelementptr inbounds float* %tmp13454, i64 1
+  %tmp13456 = getelementptr inbounds float* %tmp13455, i64 1
+  %tmp13457 = getelementptr inbounds float* %tmp13456, i64 1
+  %tmp13458 = getelementptr inbounds float* %tmp13457, i64 1
+  %tmp13459 = getelementptr inbounds float* %tmp13458, i64 1
+  %tmp13460 = getelementptr inbounds float* %tmp13459, i64 1
+  %tmp13461 = getelementptr inbounds float* %tmp13460, i64 1
+  %tmp13462 = getelementptr inbounds float* %tmp13461, i64 1
+  %tmp13463 = getelementptr inbounds float* %tmp13462, i64 1
+  %tmp13464 = getelementptr inbounds float* %tmp13463, i64 1
+  %tmp13465 = getelementptr inbounds float* %tmp13464, i64 1
+  %tmp13466 = getelementptr inbounds float* %tmp13465, i64 1
+  %tmp13467 = getelementptr inbounds float* %tmp13466, i64 1
+  %tmp13468 = getelementptr inbounds float* %tmp13467, i64 1
+  %tmp13469 = getelementptr inbounds float* %tmp13468, i64 1
+  %tmp13470 = getelementptr inbounds float* %tmp13469, i64 1
+  %tmp13471 = getelementptr inbounds float* %tmp13470, i64 1
+  %tmp13472 = getelementptr inbounds float* %tmp13471, i64 1
+  %tmp13473 = getelementptr inbounds float* %tmp13472, i64 1
+  %tmp13474 = getelementptr inbounds float* %tmp13473, i64 1
+  %tmp13475 = getelementptr inbounds float* %tmp13474, i64 1
+  %tmp13476 = getelementptr inbounds float* %tmp13475, i64 1
+  %tmp13477 = getelementptr inbounds float* %tmp13476, i64 1
+  %tmp13478 = getelementptr inbounds float* %tmp13477, i64 1
+  %tmp13479 = getelementptr inbounds float* %tmp13478, i64 1
+  %tmp13480 = getelementptr inbounds float* %tmp13479, i64 1
+  %tmp13481 = getelementptr inbounds float* %tmp13480, i64 1
+  %tmp13482 = getelementptr inbounds float* %tmp13481, i64 1
+  %tmp13483 = getelementptr inbounds float* %tmp13482, i64 1
+  %tmp13484 = getelementptr inbounds float* %tmp13483, i64 1
+  %tmp13485 = getelementptr inbounds float* %tmp13484, i64 1
+  %tmp13486 = getelementptr inbounds float* %tmp13485, i64 1
+  %tmp13487 = getelementptr inbounds float* %tmp13486, i64 1
+  %tmp13488 = getelementptr inbounds float* %tmp13487, i64 1
+  %tmp13489 = getelementptr inbounds float* %tmp13488, i64 1
+  %tmp13490 = getelementptr inbounds float* %tmp13489, i64 1
+  %tmp13491 = getelementptr inbounds float* %tmp13490, i64 1
+  %tmp13492 = getelementptr inbounds float* %tmp13491, i64 1
+  %tmp13493 = getelementptr inbounds float* %tmp13492, i64 1
+  %tmp13494 = getelementptr inbounds float* %tmp13493, i64 1
+  %tmp13495 = getelementptr inbounds float* %tmp13494, i64 1
+  %tmp13496 = getelementptr inbounds float* %tmp13495, i64 1
+  %tmp13497 = getelementptr inbounds float* %tmp13496, i64 1
+  %tmp13498 = getelementptr inbounds float* %tmp13497, i64 1
+  %tmp13499 = getelementptr inbounds float* %tmp13498, i64 1
+  %tmp13500 = getelementptr inbounds float* %tmp13499, i64 1
+  %tmp13501 = getelementptr inbounds float* %tmp13500, i64 1
+  %tmp13502 = getelementptr inbounds float* %tmp13501, i64 1
+  %tmp13503 = getelementptr inbounds float* %tmp13502, i64 1
+  %tmp13504 = getelementptr inbounds float* %tmp13503, i64 1
+  %tmp13505 = getelementptr inbounds float* %tmp13504, i64 1
+  %tmp13506 = getelementptr inbounds float* %tmp13505, i64 1
+  %tmp13507 = getelementptr inbounds float* %tmp13506, i64 1
+  %tmp13508 = getelementptr inbounds float* %tmp13507, i64 1
+  %tmp13509 = getelementptr inbounds float* %tmp13508, i64 1
+  %tmp13510 = getelementptr inbounds float* %tmp13509, i64 1
+  %tmp13511 = getelementptr inbounds float* %tmp13510, i64 1
+  %tmp13512 = getelementptr inbounds float* %tmp13511, i64 1
+  %tmp13513 = getelementptr inbounds float* %tmp13512, i64 1
+  %tmp13514 = getelementptr inbounds float* %tmp13513, i64 1
+  %tmp13515 = getelementptr inbounds float* %tmp13514, i64 1
+  %tmp13516 = getelementptr inbounds float* %tmp13515, i64 1
+  %tmp13517 = getelementptr inbounds float* %tmp13516, i64 1
+  %tmp13518 = getelementptr inbounds float* %tmp13517, i64 1
+  %tmp13519 = getelementptr inbounds float* %tmp13518, i64 1
+  %tmp13520 = getelementptr inbounds float* %tmp13519, i64 1
+  %tmp13521 = getelementptr inbounds float* %tmp13520, i64 1
+  %tmp13522 = getelementptr inbounds float* %tmp13521, i64 1
+  %tmp13523 = getelementptr inbounds float* %tmp13522, i64 1
+  %tmp13524 = getelementptr inbounds float* %tmp13523, i64 1
+  %tmp13525 = getelementptr inbounds float* %tmp13524, i64 1
+  %tmp13526 = getelementptr inbounds float* %tmp13525, i64 1
+  %tmp13527 = getelementptr inbounds float* %tmp13526, i64 1
+  %tmp13528 = getelementptr inbounds float* %tmp13527, i64 1
+  %tmp13529 = getelementptr inbounds float* %tmp13528, i64 1
+  %tmp13530 = getelementptr inbounds float* %tmp13529, i64 1
+  %tmp13531 = getelementptr inbounds float* %tmp13530, i64 1
+  %tmp13532 = getelementptr inbounds float* %tmp13531, i64 1
+  %tmp13533 = getelementptr inbounds float* %tmp13532, i64 1
+  %tmp13534 = getelementptr inbounds float* %tmp13533, i64 1
+  %tmp13535 = getelementptr inbounds float* %tmp13534, i64 1
+  %tmp13536 = getelementptr inbounds float* %tmp13535, i64 1
+  %tmp13537 = getelementptr inbounds float* %tmp13536, i64 1
+  %tmp13538 = getelementptr inbounds float* %tmp13537, i64 1
+  %tmp13539 = getelementptr inbounds float* %tmp13538, i64 1
+  %tmp13540 = getelementptr inbounds float* %tmp13539, i64 1
+  %tmp13541 = getelementptr inbounds float* %tmp13540, i64 1
+  %tmp13542 = getelementptr inbounds float* %tmp13541, i64 1
+  %tmp13543 = getelementptr inbounds float* %tmp13542, i64 1
+  %tmp13544 = getelementptr inbounds float* %tmp13543, i64 1
+  %tmp13545 = getelementptr inbounds float* %tmp13544, i64 1
+  %tmp13546 = getelementptr inbounds float* %tmp13545, i64 1
+  %tmp13547 = getelementptr inbounds float* %tmp13546, i64 1
+  %tmp13548 = getelementptr inbounds float* %tmp13547, i64 1
+  %tmp13549 = getelementptr inbounds float* %tmp13548, i64 1
+  %tmp13550 = getelementptr inbounds float* %tmp13549, i64 1
+  %tmp13551 = getelementptr inbounds float* %tmp13550, i64 1
+  %tmp13552 = getelementptr inbounds float* %tmp13551, i64 1
+  %tmp13553 = getelementptr inbounds float* %tmp13552, i64 1
+  %tmp13554 = getelementptr inbounds float* %tmp13553, i64 1
+  %tmp13555 = getelementptr inbounds float* %tmp13554, i64 1
+  %tmp13556 = getelementptr inbounds float* %tmp13555, i64 1
+  %tmp13557 = getelementptr inbounds float* %tmp13556, i64 1
+  %tmp13558 = getelementptr inbounds float* %tmp13557, i64 1
+  %tmp13559 = getelementptr inbounds float* %tmp13558, i64 1
+  %tmp13560 = getelementptr inbounds float* %tmp13559, i64 1
+  %tmp13561 = getelementptr inbounds float* %tmp13560, i64 1
+  %tmp13562 = getelementptr inbounds float* %tmp13561, i64 1
+  %tmp13563 = getelementptr inbounds float* %tmp13562, i64 1
+  %tmp13564 = getelementptr inbounds float* %tmp13563, i64 1
+  %tmp13565 = getelementptr inbounds float* %tmp13564, i64 1
+  %tmp13566 = getelementptr inbounds float* %tmp13565, i64 1
+  %tmp13567 = getelementptr inbounds float* %tmp13566, i64 1
+  %tmp13568 = getelementptr inbounds float* %tmp13567, i64 1
+  %tmp13569 = getelementptr inbounds float* %tmp13568, i64 1
+  %tmp13570 = getelementptr inbounds float* %tmp13569, i64 1
+  %tmp13571 = getelementptr inbounds float* %tmp13570, i64 1
+  %tmp13572 = getelementptr inbounds float* %tmp13571, i64 1
+  %tmp13573 = getelementptr inbounds float* %tmp13572, i64 1
+  %tmp13574 = getelementptr inbounds float* %tmp13573, i64 1
+  %tmp13575 = getelementptr inbounds float* %tmp13574, i64 1
+  %tmp13576 = getelementptr inbounds float* %tmp13575, i64 1
+  %tmp13577 = getelementptr inbounds float* %tmp13576, i64 1
+  %tmp13578 = getelementptr inbounds float* %tmp13577, i64 1
+  %tmp13579 = getelementptr inbounds float* %tmp13578, i64 1
+  %tmp13580 = getelementptr inbounds float* %tmp13579, i64 1
+  %tmp13581 = getelementptr inbounds float* %tmp13580, i64 1
+  %tmp13582 = getelementptr inbounds float* %tmp13581, i64 1
+  %tmp13583 = getelementptr inbounds float* %tmp13582, i64 1
+  %tmp13584 = getelementptr inbounds float* %tmp13583, i64 1
+  %tmp13585 = getelementptr inbounds float* %tmp13584, i64 1
+  %tmp13586 = getelementptr inbounds float* %tmp13585, i64 1
+  %tmp13587 = getelementptr inbounds float* %tmp13586, i64 1
+  %tmp13588 = getelementptr inbounds float* %tmp13587, i64 1
+  %tmp13589 = getelementptr inbounds float* %tmp13588, i64 1
+  %tmp13590 = getelementptr inbounds float* %tmp13589, i64 1
+  %tmp13591 = getelementptr inbounds float* %tmp13590, i64 1
+  %tmp13592 = getelementptr inbounds float* %tmp13591, i64 1
+  %tmp13593 = getelementptr inbounds float* %tmp13592, i64 1
+  %tmp13594 = getelementptr inbounds float* %tmp13593, i64 1
+  %tmp13595 = getelementptr inbounds float* %tmp13594, i64 1
+  %tmp13596 = getelementptr inbounds float* %tmp13595, i64 1
+  %tmp13597 = getelementptr inbounds float* %tmp13596, i64 1
+  %tmp13598 = getelementptr inbounds float* %tmp13597, i64 1
+  %tmp13599 = getelementptr inbounds float* %tmp13598, i64 1
+  %tmp13600 = getelementptr inbounds float* %tmp13599, i64 1
+  %tmp13601 = getelementptr inbounds float* %tmp13600, i64 1
+  %tmp13602 = getelementptr inbounds float* %tmp13601, i64 1
+  %tmp13603 = getelementptr inbounds float* %tmp13602, i64 1
+  %tmp13604 = getelementptr inbounds float* %tmp13603, i64 1
+  %tmp13605 = getelementptr inbounds float* %tmp13604, i64 1
+  %tmp13606 = getelementptr inbounds float* %tmp13605, i64 1
+  %tmp13607 = getelementptr inbounds float* %tmp13606, i64 1
+  %tmp13608 = getelementptr inbounds float* %tmp13607, i64 1
+  %tmp13609 = getelementptr inbounds float* %tmp13608, i64 1
+  %tmp13610 = getelementptr inbounds float* %tmp13609, i64 1
+  %tmp13611 = getelementptr inbounds float* %tmp13610, i64 1
+  %tmp13612 = getelementptr inbounds float* %tmp13611, i64 1
+  %tmp13613 = getelementptr inbounds float* %tmp13612, i64 1
+  %tmp13614 = getelementptr inbounds float* %tmp13613, i64 1
+  %tmp13615 = getelementptr inbounds float* %tmp13614, i64 1
+  %tmp13616 = getelementptr inbounds float* %tmp13615, i64 1
+  %tmp13617 = getelementptr inbounds float* %tmp13616, i64 1
+  %tmp13618 = getelementptr inbounds float* %tmp13617, i64 1
+  %tmp13619 = getelementptr inbounds float* %tmp13618, i64 1
+  %tmp13620 = getelementptr inbounds float* %tmp13619, i64 1
+  %tmp13621 = getelementptr inbounds float* %tmp13620, i64 1
+  %tmp13622 = getelementptr inbounds float* %tmp13621, i64 1
+  %tmp13623 = getelementptr inbounds float* %tmp13622, i64 1
+  %tmp13624 = getelementptr inbounds float* %tmp13623, i64 1
+  %tmp13625 = getelementptr inbounds float* %tmp13624, i64 1
+  %tmp13626 = getelementptr inbounds float* %tmp13625, i64 1
+  %tmp13627 = getelementptr inbounds float* %tmp13626, i64 1
+  %tmp13628 = getelementptr inbounds float* %tmp13627, i64 1
+  %tmp13629 = getelementptr inbounds float* %tmp13628, i64 1
+  %tmp13630 = getelementptr inbounds float* %tmp13629, i64 1
+  %tmp13631 = getelementptr inbounds float* %tmp13630, i64 1
+  %tmp13632 = getelementptr inbounds float* %tmp13631, i64 1
+  %tmp13633 = getelementptr inbounds float* %tmp13632, i64 1
+  %tmp13634 = getelementptr inbounds float* %tmp13633, i64 1
+  %tmp13635 = getelementptr inbounds float* %tmp13634, i64 1
+  %tmp13636 = getelementptr inbounds float* %tmp13635, i64 1
+  %tmp13637 = getelementptr inbounds float* %tmp13636, i64 1
+  %tmp13638 = getelementptr inbounds float* %tmp13637, i64 1
+  %tmp13639 = getelementptr inbounds float* %tmp13638, i64 1
+  %tmp13640 = getelementptr inbounds float* %tmp13639, i64 1
+  %tmp13641 = getelementptr inbounds float* %tmp13640, i64 1
+  %tmp13642 = getelementptr inbounds float* %tmp13641, i64 1
+  %tmp13643 = getelementptr inbounds float* %tmp13642, i64 1
+  %tmp13644 = getelementptr inbounds float* %tmp13643, i64 1
+  %tmp13645 = getelementptr inbounds float* %tmp13644, i64 1
+  %tmp13646 = getelementptr inbounds float* %tmp13645, i64 1
+  %tmp13647 = getelementptr inbounds float* %tmp13646, i64 1
+  %tmp13648 = getelementptr inbounds float* %tmp13647, i64 1
+  %tmp13649 = getelementptr inbounds float* %tmp13648, i64 1
+  %tmp13650 = getelementptr inbounds float* %tmp13649, i64 1
+  %tmp13651 = getelementptr inbounds float* %tmp13650, i64 1
+  %tmp13652 = getelementptr inbounds float* %tmp13651, i64 1
+  %tmp13653 = getelementptr inbounds float* %tmp13652, i64 1
+  %tmp13654 = getelementptr inbounds float* %tmp13653, i64 1
+  %tmp13655 = getelementptr inbounds float* %tmp13654, i64 1
+  %tmp13656 = getelementptr inbounds float* %tmp13655, i64 1
+  %tmp13657 = getelementptr inbounds float* %tmp13656, i64 1
+  %tmp13658 = getelementptr inbounds float* %tmp13657, i64 1
+  %tmp13659 = getelementptr inbounds float* %tmp13658, i64 1
+  %tmp13660 = getelementptr inbounds float* %tmp13659, i64 1
+  %tmp13661 = getelementptr inbounds float* %tmp13660, i64 1
+  %tmp13662 = getelementptr inbounds float* %tmp13661, i64 1
+  %tmp13663 = getelementptr inbounds float* %tmp13662, i64 1
+  %tmp13664 = getelementptr inbounds float* %tmp13663, i64 1
+  %tmp13665 = getelementptr inbounds float* %tmp13664, i64 1
+  %tmp13666 = getelementptr inbounds float* %tmp13665, i64 1
+  %tmp13667 = getelementptr inbounds float* %tmp13666, i64 1
+  %tmp13668 = getelementptr inbounds float* %tmp13667, i64 1
+  %tmp13669 = getelementptr inbounds float* %tmp13668, i64 1
+  %tmp13670 = getelementptr inbounds float* %tmp13669, i64 1
+  %tmp13671 = getelementptr inbounds float* %tmp13670, i64 1
+  %tmp13672 = getelementptr inbounds float* %tmp13671, i64 1
+  %tmp13673 = getelementptr inbounds float* %tmp13672, i64 1
+  %tmp13674 = getelementptr inbounds float* %tmp13673, i64 1
+  %tmp13675 = getelementptr inbounds float* %tmp13674, i64 1
+  %tmp13676 = getelementptr inbounds float* %tmp13675, i64 1
+  %tmp13677 = getelementptr inbounds float* %tmp13676, i64 1
+  %tmp13678 = getelementptr inbounds float* %tmp13677, i64 1
+  %tmp13679 = getelementptr inbounds float* %tmp13678, i64 1
+  %tmp13680 = getelementptr inbounds float* %tmp13679, i64 1
+  %tmp13681 = getelementptr inbounds float* %tmp13680, i64 1
+  %tmp13682 = getelementptr inbounds float* %tmp13681, i64 1
+  %tmp13683 = getelementptr inbounds float* %tmp13682, i64 1
+  %tmp13684 = getelementptr inbounds float* %tmp13683, i64 1
+  %tmp13685 = getelementptr inbounds float* %tmp13684, i64 1
+  %tmp13686 = getelementptr inbounds float* %tmp13685, i64 1
+  %tmp13687 = getelementptr inbounds float* %tmp13686, i64 1
+  %tmp13688 = getelementptr inbounds float* %tmp13687, i64 1
+  %tmp13689 = getelementptr inbounds float* %tmp13688, i64 1
+  %tmp13690 = getelementptr inbounds float* %tmp13689, i64 1
+  %tmp13691 = getelementptr inbounds float* %tmp13690, i64 1
+  %tmp13692 = getelementptr inbounds float* %tmp13691, i64 1
+  %tmp13693 = getelementptr inbounds float* %tmp13692, i64 1
+  %tmp13694 = getelementptr inbounds float* %tmp13693, i64 1
+  %tmp13695 = getelementptr inbounds float* %tmp13694, i64 1
+  %tmp13696 = getelementptr inbounds float* %tmp13695, i64 1
+  %tmp13697 = getelementptr inbounds float* %tmp13696, i64 1
+  %tmp13698 = getelementptr inbounds float* %tmp13697, i64 1
+  %tmp13699 = getelementptr inbounds float* %tmp13698, i64 1
+  %tmp13700 = getelementptr inbounds float* %tmp13699, i64 1
+  %tmp13701 = getelementptr inbounds float* %tmp13700, i64 1
+  %tmp13702 = getelementptr inbounds float* %tmp13701, i64 1
+  %tmp13703 = getelementptr inbounds float* %tmp13702, i64 1
+  %tmp13704 = getelementptr inbounds float* %tmp13703, i64 1
+  %tmp13705 = getelementptr inbounds float* %tmp13704, i64 1
+  %tmp13706 = getelementptr inbounds float* %tmp13705, i64 1
+  %tmp13707 = getelementptr inbounds float* %tmp13706, i64 1
+  %tmp13708 = getelementptr inbounds float* %tmp13707, i64 1
+  %tmp13709 = getelementptr inbounds float* %tmp13708, i64 1
+  %tmp13710 = getelementptr inbounds float* %tmp13709, i64 1
+  %tmp13711 = getelementptr inbounds float* %tmp13710, i64 1
+  %tmp13712 = getelementptr inbounds float* %tmp13711, i64 1
+  %tmp13713 = getelementptr inbounds float* %tmp13712, i64 1
+  %tmp13714 = getelementptr inbounds float* %tmp13713, i64 1
+  %tmp13715 = getelementptr inbounds float* %tmp13714, i64 1
+  %tmp13716 = getelementptr inbounds float* %tmp13715, i64 1
+  %tmp13717 = getelementptr inbounds float* %tmp13716, i64 1
+  %tmp13718 = getelementptr inbounds float* %tmp13717, i64 1
+  %tmp13719 = getelementptr inbounds float* %tmp13718, i64 1
+  %tmp13720 = getelementptr inbounds float* %tmp13719, i64 1
+  %tmp13721 = getelementptr inbounds float* %tmp13720, i64 1
+  %tmp13722 = getelementptr inbounds float* %tmp13721, i64 1
+  %tmp13723 = getelementptr inbounds float* %tmp13722, i64 1
+  %tmp13724 = getelementptr inbounds float* %tmp13723, i64 1
+  %tmp13725 = getelementptr inbounds float* %tmp13724, i64 1
+  %tmp13726 = getelementptr inbounds float* %tmp13725, i64 1
+  %tmp13727 = getelementptr inbounds float* %tmp13726, i64 1
+  %tmp13728 = getelementptr inbounds float* %tmp13727, i64 1
+  %tmp13729 = getelementptr inbounds float* %tmp13728, i64 1
+  %tmp13730 = getelementptr inbounds float* %tmp13729, i64 1
+  %tmp13731 = getelementptr inbounds float* %tmp13730, i64 1
+  %tmp13732 = getelementptr inbounds float* %tmp13731, i64 1
+  %tmp13733 = getelementptr inbounds float* %tmp13732, i64 1
+  %tmp13734 = getelementptr inbounds float* %tmp13733, i64 1
+  %tmp13735 = getelementptr inbounds float* %tmp13734, i64 1
+  %tmp13736 = getelementptr inbounds float* %tmp13735, i64 1
+  %tmp13737 = getelementptr inbounds float* %tmp13736, i64 1
+  %tmp13738 = getelementptr inbounds float* %tmp13737, i64 1
+  %tmp13739 = getelementptr inbounds float* %tmp13738, i64 1
+  %tmp13740 = getelementptr inbounds float* %tmp13739, i64 1
+  %tmp13741 = getelementptr inbounds float* %tmp13740, i64 1
+  %tmp13742 = getelementptr inbounds float* %tmp13741, i64 1
+  %tmp13743 = getelementptr inbounds float* %tmp13742, i64 1
+  %tmp13744 = getelementptr inbounds float* %tmp13743, i64 1
+  %tmp13745 = getelementptr inbounds float* %tmp13744, i64 1
+  %tmp13746 = getelementptr inbounds float* %tmp13745, i64 1
+  %tmp13747 = getelementptr inbounds float* %tmp13746, i64 1
+  %tmp13748 = getelementptr inbounds float* %tmp13747, i64 1
+  %tmp13749 = getelementptr inbounds float* %tmp13748, i64 1
+  %tmp13750 = getelementptr inbounds float* %tmp13749, i64 1
+  %tmp13751 = getelementptr inbounds float* %tmp13750, i64 1
+  %tmp13752 = getelementptr inbounds float* %tmp13751, i64 1
+  %tmp13753 = getelementptr inbounds float* %tmp13752, i64 1
+  %tmp13754 = getelementptr inbounds float* %tmp13753, i64 1
+  %tmp13755 = getelementptr inbounds float* %tmp13754, i64 1
+  %tmp13756 = getelementptr inbounds float* %tmp13755, i64 1
+  %tmp13757 = getelementptr inbounds float* %tmp13756, i64 1
+  %tmp13758 = getelementptr inbounds float* %tmp13757, i64 1
+  %tmp13759 = getelementptr inbounds float* %tmp13758, i64 1
+  %tmp13760 = getelementptr inbounds float* %tmp13759, i64 1
+  %tmp13761 = getelementptr inbounds float* %tmp13760, i64 1
+  %tmp13762 = getelementptr inbounds float* %tmp13761, i64 1
+  %tmp13763 = getelementptr inbounds float* %tmp13762, i64 1
+  %tmp13764 = getelementptr inbounds float* %tmp13763, i64 1
+  %tmp13765 = getelementptr inbounds float* %tmp13764, i64 1
+  %tmp13766 = getelementptr inbounds float* %tmp13765, i64 1
+  %tmp13767 = getelementptr inbounds float* %tmp13766, i64 1
+  %tmp13768 = getelementptr inbounds float* %tmp13767, i64 1
+  %tmp13769 = getelementptr inbounds float* %tmp13768, i64 1
+  %tmp13770 = getelementptr inbounds float* %tmp13769, i64 1
+  %tmp13771 = getelementptr inbounds float* %tmp13770, i64 1
+  %tmp13772 = getelementptr inbounds float* %tmp13771, i64 1
+  %tmp13773 = getelementptr inbounds float* %tmp13772, i64 1
+  %tmp13774 = getelementptr inbounds float* %tmp13773, i64 1
+  %tmp13775 = getelementptr inbounds float* %tmp13774, i64 1
+  %tmp13776 = getelementptr inbounds float* %tmp13775, i64 1
+  %tmp13777 = getelementptr inbounds float* %tmp13776, i64 1
+  %tmp13778 = getelementptr inbounds float* %tmp13777, i64 1
+  %tmp13779 = getelementptr inbounds float* %tmp13778, i64 1
+  %tmp13780 = getelementptr inbounds float* %tmp13779, i64 1
+  %tmp13781 = getelementptr inbounds float* %tmp13780, i64 1
+  %tmp13782 = getelementptr inbounds float* %tmp13781, i64 1
+  %tmp13783 = getelementptr inbounds float* %tmp13782, i64 1
+  %tmp13784 = getelementptr inbounds float* %tmp13783, i64 1
+  %tmp13785 = getelementptr inbounds float* %tmp13784, i64 1
+  %tmp13786 = getelementptr inbounds float* %tmp13785, i64 1
+  %tmp13787 = getelementptr inbounds float* %tmp13786, i64 1
+  %tmp13788 = getelementptr inbounds float* %tmp13787, i64 1
+  %tmp13789 = getelementptr inbounds float* %tmp13788, i64 1
+  %tmp13790 = getelementptr inbounds float* %tmp13789, i64 1
+  %tmp13791 = getelementptr inbounds float* %tmp13790, i64 1
+  %tmp13792 = getelementptr inbounds float* %tmp13791, i64 1
+  %tmp13793 = getelementptr inbounds float* %tmp13792, i64 1
+  %tmp13794 = getelementptr inbounds float* %tmp13793, i64 1
+  %tmp13795 = getelementptr inbounds float* %tmp13794, i64 1
+  %tmp13796 = getelementptr inbounds float* %tmp13795, i64 1
+  %tmp13797 = getelementptr inbounds float* %tmp13796, i64 1
+  %tmp13798 = getelementptr inbounds float* %tmp13797, i64 1
+  %tmp13799 = getelementptr inbounds float* %tmp13798, i64 1
+  %tmp13800 = getelementptr inbounds float* %tmp13799, i64 1
+  %tmp13801 = getelementptr inbounds float* %tmp13800, i64 1
+  %tmp13802 = getelementptr inbounds float* %tmp13801, i64 1
+  %tmp13803 = getelementptr inbounds float* %tmp13802, i64 1
+  %tmp13804 = getelementptr inbounds float* %tmp13803, i64 1
+  %tmp13805 = getelementptr inbounds float* %tmp13804, i64 1
+  %tmp13806 = getelementptr inbounds float* %tmp13805, i64 1
+  %tmp13807 = getelementptr inbounds float* %tmp13806, i64 1
+  %tmp13808 = getelementptr inbounds float* %tmp13807, i64 1
+  %tmp13809 = getelementptr inbounds float* %tmp13808, i64 1
+  %tmp13810 = getelementptr inbounds float* %tmp13809, i64 1
+  %tmp13811 = getelementptr inbounds float* %tmp13810, i64 1
+  %tmp13812 = getelementptr inbounds float* %tmp13811, i64 1
+  %tmp13813 = getelementptr inbounds float* %tmp13812, i64 1
+  %tmp13814 = getelementptr inbounds float* %tmp13813, i64 1
+  %tmp13815 = getelementptr inbounds float* %tmp13814, i64 1
+  %tmp13816 = getelementptr inbounds float* %tmp13815, i64 1
+  %tmp13817 = getelementptr inbounds float* %tmp13816, i64 1
+  %tmp13818 = getelementptr inbounds float* %tmp13817, i64 1
+  %tmp13819 = getelementptr inbounds float* %tmp13818, i64 1
+  %tmp13820 = getelementptr inbounds float* %tmp13819, i64 1
+  %tmp13821 = getelementptr inbounds float* %tmp13820, i64 1
+  %tmp13822 = getelementptr inbounds float* %tmp13821, i64 1
+  %tmp13823 = getelementptr inbounds float* %tmp13822, i64 1
+  %tmp13824 = getelementptr inbounds float* %tmp13823, i64 1
+  %tmp13825 = getelementptr inbounds float* %tmp13824, i64 1
+  %tmp13826 = getelementptr inbounds float* %tmp13825, i64 1
+  %tmp13827 = getelementptr inbounds float* %tmp13826, i64 1
+  %tmp13828 = getelementptr inbounds float* %tmp13827, i64 1
+  %tmp13829 = getelementptr inbounds float* %tmp13828, i64 1
+  %tmp13830 = getelementptr inbounds float* %tmp13829, i64 1
+  %tmp13831 = getelementptr inbounds float* %tmp13830, i64 1
+  %tmp13832 = getelementptr inbounds float* %tmp13831, i64 1
+  %tmp13833 = getelementptr inbounds float* %tmp13832, i64 1
+  %tmp13834 = getelementptr inbounds float* %tmp13833, i64 1
+  %tmp13835 = getelementptr inbounds float* %tmp13834, i64 1
+  %tmp13836 = getelementptr inbounds float* %tmp13835, i64 1
+  %tmp13837 = getelementptr inbounds float* %tmp13836, i64 1
+  %tmp13838 = getelementptr inbounds float* %tmp13837, i64 1
+  %tmp13839 = getelementptr inbounds float* %tmp13838, i64 1
+  %tmp13840 = getelementptr inbounds float* %tmp13839, i64 1
+  %tmp13841 = getelementptr inbounds float* %tmp13840, i64 1
+  %tmp13842 = getelementptr inbounds float* %tmp13841, i64 1
+  %tmp13843 = getelementptr inbounds float* %tmp13842, i64 1
+  %tmp13844 = getelementptr inbounds float* %tmp13843, i64 1
+  %tmp13845 = getelementptr inbounds float* %tmp13844, i64 1
+  %tmp13846 = getelementptr inbounds float* %tmp13845, i64 1
+  %tmp13847 = getelementptr inbounds float* %tmp13846, i64 1
+  %tmp13848 = getelementptr inbounds float* %tmp13847, i64 1
+  %tmp13849 = getelementptr inbounds float* %tmp13848, i64 1
+  %tmp13850 = getelementptr inbounds float* %tmp13849, i64 1
+  %tmp13851 = getelementptr inbounds float* %tmp13850, i64 1
+  %tmp13852 = getelementptr inbounds float* %tmp13851, i64 1
+  %tmp13853 = getelementptr inbounds float* %tmp13852, i64 1
+  %tmp13854 = getelementptr inbounds float* %tmp13853, i64 1
+  %tmp13855 = getelementptr inbounds float* %tmp13854, i64 1
+  %tmp13856 = getelementptr inbounds float* %tmp13855, i64 1
+  %tmp13857 = getelementptr inbounds float* %tmp13856, i64 1
+  %tmp13858 = getelementptr inbounds float* %tmp13857, i64 1
+  %tmp13859 = getelementptr inbounds float* %tmp13858, i64 1
+  %tmp13860 = getelementptr inbounds float* %tmp13859, i64 1
+  %tmp13861 = getelementptr inbounds float* %tmp13860, i64 1
+  %tmp13862 = getelementptr inbounds float* %tmp13861, i64 1
+  %tmp13863 = getelementptr inbounds float* %tmp13862, i64 1
+  %tmp13864 = getelementptr inbounds float* %tmp13863, i64 1
+  %tmp13865 = getelementptr inbounds float* %tmp13864, i64 1
+  %tmp13866 = getelementptr inbounds float* %tmp13865, i64 1
+  %tmp13867 = getelementptr inbounds float* %tmp13866, i64 1
+  %tmp13868 = getelementptr inbounds float* %tmp13867, i64 1
+  %tmp13869 = getelementptr inbounds float* %tmp13868, i64 1
+  %tmp13870 = getelementptr inbounds float* %tmp13869, i64 1
+  %tmp13871 = getelementptr inbounds float* %tmp13870, i64 1
+  %tmp13872 = getelementptr inbounds float* %tmp13871, i64 1
+  %tmp13873 = getelementptr inbounds float* %tmp13872, i64 1
+  %tmp13874 = getelementptr inbounds float* %tmp13873, i64 1
+  %tmp13875 = getelementptr inbounds float* %tmp13874, i64 1
+  %tmp13876 = getelementptr inbounds float* %tmp13875, i64 1
+  %tmp13877 = getelementptr inbounds float* %tmp13876, i64 1
+  %tmp13878 = getelementptr inbounds float* %tmp13877, i64 1
+  %tmp13879 = getelementptr inbounds float* %tmp13878, i64 1
+  %tmp13880 = getelementptr inbounds float* %tmp13879, i64 1
+  %tmp13881 = getelementptr inbounds float* %tmp13880, i64 1
+  %tmp13882 = getelementptr inbounds float* %tmp13881, i64 1
+  %tmp13883 = getelementptr inbounds float* %tmp13882, i64 1
+  %tmp13884 = getelementptr inbounds float* %tmp13883, i64 1
+  %tmp13885 = getelementptr inbounds float* %tmp13884, i64 1
+  %tmp13886 = getelementptr inbounds float* %tmp13885, i64 1
+  %tmp13887 = getelementptr inbounds float* %tmp13886, i64 1
+  %tmp13888 = getelementptr inbounds float* %tmp13887, i64 1
+  %tmp13889 = getelementptr inbounds float* %tmp13888, i64 1
+  %tmp13890 = getelementptr inbounds float* %tmp13889, i64 1
+  %tmp13891 = getelementptr inbounds float* %tmp13890, i64 1
+  %tmp13892 = getelementptr inbounds float* %tmp13891, i64 1
+  %tmp13893 = getelementptr inbounds float* %tmp13892, i64 1
+  %tmp13894 = getelementptr inbounds float* %tmp13893, i64 1
+  %tmp13895 = getelementptr inbounds float* %tmp13894, i64 1
+  %tmp13896 = getelementptr inbounds float* %tmp13895, i64 1
+  %tmp13897 = getelementptr inbounds float* %tmp13896, i64 1
+  %tmp13898 = getelementptr inbounds float* %tmp13897, i64 1
+  %tmp13899 = getelementptr inbounds float* %tmp13898, i64 1
+  %tmp13900 = getelementptr inbounds float* %tmp13899, i64 1
+  %tmp13901 = getelementptr inbounds float* %tmp13900, i64 1
+  %tmp13902 = getelementptr inbounds float* %tmp13901, i64 1
+  %tmp13903 = getelementptr inbounds float* %tmp13902, i64 1
+  %tmp13904 = getelementptr inbounds float* %tmp13903, i64 1
+  %tmp13905 = getelementptr inbounds float* %tmp13904, i64 1
+  %tmp13906 = getelementptr inbounds float* %tmp13905, i64 1
+  %tmp13907 = getelementptr inbounds float* %tmp13906, i64 1
+  %tmp13908 = getelementptr inbounds float* %tmp13907, i64 1
+  %tmp13909 = getelementptr inbounds float* %tmp13908, i64 1
+  %tmp13910 = getelementptr inbounds float* %tmp13909, i64 1
+  %tmp13911 = getelementptr inbounds float* %tmp13910, i64 1
+  %tmp13912 = getelementptr inbounds float* %tmp13911, i64 1
+  %tmp13913 = getelementptr inbounds float* %tmp13912, i64 1
+  %tmp13914 = getelementptr inbounds float* %tmp13913, i64 1
+  %tmp13915 = getelementptr inbounds float* %tmp13914, i64 1
+  %tmp13916 = getelementptr inbounds float* %tmp13915, i64 1
+  %tmp13917 = getelementptr inbounds float* %tmp13916, i64 1
+  %tmp13918 = getelementptr inbounds float* %tmp13917, i64 1
+  %tmp13919 = getelementptr inbounds float* %tmp13918, i64 1
+  %tmp13920 = getelementptr inbounds float* %tmp13919, i64 1
+  %tmp13921 = getelementptr inbounds float* %tmp13920, i64 1
+  %tmp13922 = getelementptr inbounds float* %tmp13921, i64 1
+  %tmp13923 = getelementptr inbounds float* %tmp13922, i64 1
+  %tmp13924 = getelementptr inbounds float* %tmp13923, i64 1
+  %tmp13925 = getelementptr inbounds float* %tmp13924, i64 1
+  %tmp13926 = getelementptr inbounds float* %tmp13925, i64 1
+  %tmp13927 = getelementptr inbounds float* %tmp13926, i64 1
+  %tmp13928 = getelementptr inbounds float* %tmp13927, i64 1
+  %tmp13929 = getelementptr inbounds float* %tmp13928, i64 1
+  %tmp13930 = getelementptr inbounds float* %tmp13929, i64 1
+  %tmp13931 = getelementptr inbounds float* %tmp13930, i64 1
+  %tmp13932 = getelementptr inbounds float* %tmp13931, i64 1
+  %tmp13933 = getelementptr inbounds float* %tmp13932, i64 1
+  %tmp13934 = getelementptr inbounds float* %tmp13933, i64 1
+  %tmp13935 = getelementptr inbounds float* %tmp13934, i64 1
+  %tmp13936 = getelementptr inbounds float* %tmp13935, i64 1
+  %tmp13937 = getelementptr inbounds float* %tmp13936, i64 1
+  %tmp13938 = getelementptr inbounds float* %tmp13937, i64 1
+  %tmp13939 = getelementptr inbounds float* %tmp13938, i64 1
+  %tmp13940 = getelementptr inbounds float* %tmp13939, i64 1
+  %tmp13941 = getelementptr inbounds float* %tmp13940, i64 1
+  %tmp13942 = getelementptr inbounds float* %tmp13941, i64 1
+  %tmp13943 = getelementptr inbounds float* %tmp13942, i64 1
+  %tmp13944 = getelementptr inbounds float* %tmp13943, i64 1
+  %tmp13945 = getelementptr inbounds float* %tmp13944, i64 1
+  %tmp13946 = getelementptr inbounds float* %tmp13945, i64 1
+  %tmp13947 = getelementptr inbounds float* %tmp13946, i64 1
+  %tmp13948 = getelementptr inbounds float* %tmp13947, i64 1
+  %tmp13949 = getelementptr inbounds float* %tmp13948, i64 1
+  %tmp13950 = getelementptr inbounds float* %tmp13949, i64 1
+  %tmp13951 = getelementptr inbounds float* %tmp13950, i64 1
+  %tmp13952 = getelementptr inbounds float* %tmp13951, i64 1
+  %tmp13953 = getelementptr inbounds float* %tmp13952, i64 1
+  %tmp13954 = getelementptr inbounds float* %tmp13953, i64 1
+  %tmp13955 = getelementptr inbounds float* %tmp13954, i64 1
+  %tmp13956 = getelementptr inbounds float* %tmp13955, i64 1
+  %tmp13957 = getelementptr inbounds float* %tmp13956, i64 1
+  %tmp13958 = getelementptr inbounds float* %tmp13957, i64 1
+  %tmp13959 = getelementptr inbounds float* %tmp13958, i64 1
+  %tmp13960 = getelementptr inbounds float* %tmp13959, i64 1
+  %tmp13961 = getelementptr inbounds float* %tmp13960, i64 1
+  %tmp13962 = getelementptr inbounds float* %tmp13961, i64 1
+  %tmp13963 = getelementptr inbounds float* %tmp13962, i64 1
+  %tmp13964 = getelementptr inbounds float* %tmp13963, i64 1
+  %tmp13965 = getelementptr inbounds float* %tmp13964, i64 1
+  %tmp13966 = getelementptr inbounds float* %tmp13965, i64 1
+  %tmp13967 = getelementptr inbounds float* %tmp13966, i64 1
+  %tmp13968 = getelementptr inbounds float* %tmp13967, i64 1
+  %tmp13969 = getelementptr inbounds float* %tmp13968, i64 1
+  %tmp13970 = getelementptr inbounds float* %tmp13969, i64 1
+  %tmp13971 = getelementptr inbounds float* %tmp13970, i64 1
+  %tmp13972 = getelementptr inbounds float* %tmp13971, i64 1
+  %tmp13973 = getelementptr inbounds float* %tmp13972, i64 1
+  %tmp13974 = getelementptr inbounds float* %tmp13973, i64 1
+  %tmp13975 = getelementptr inbounds float* %tmp13974, i64 1
+  %tmp13976 = getelementptr inbounds float* %tmp13975, i64 1
+  %tmp13977 = getelementptr inbounds float* %tmp13976, i64 1
+  %tmp13978 = getelementptr inbounds float* %tmp13977, i64 1
+  %tmp13979 = getelementptr inbounds float* %tmp13978, i64 1
+  %tmp13980 = getelementptr inbounds float* %tmp13979, i64 1
+  %tmp13981 = getelementptr inbounds float* %tmp13980, i64 1
+  %tmp13982 = getelementptr inbounds float* %tmp13981, i64 1
+  %tmp13983 = getelementptr inbounds float* %tmp13982, i64 1
+  %tmp13984 = getelementptr inbounds float* %tmp13983, i64 1
+  %tmp13985 = getelementptr inbounds float* %tmp13984, i64 1
+  %tmp13986 = getelementptr inbounds float* %tmp13985, i64 1
+  %tmp13987 = getelementptr inbounds float* %tmp13986, i64 1
+  %tmp13988 = getelementptr inbounds float* %tmp13987, i64 1
+  %tmp13989 = getelementptr inbounds float* %tmp13988, i64 1
+  %tmp13990 = getelementptr inbounds float* %tmp13989, i64 1
+  %tmp13991 = getelementptr inbounds float* %tmp13990, i64 1
+  %tmp13992 = getelementptr inbounds float* %tmp13991, i64 1
+  %tmp13993 = getelementptr inbounds float* %tmp13992, i64 1
+  %tmp13994 = getelementptr inbounds float* %tmp13993, i64 1
+  %tmp13995 = getelementptr inbounds float* %tmp13994, i64 1
+  %tmp13996 = getelementptr inbounds float* %tmp13995, i64 1
+  %tmp13997 = getelementptr inbounds float* %tmp13996, i64 1
+  %tmp13998 = getelementptr inbounds float* %tmp13997, i64 1
+  %tmp13999 = getelementptr inbounds float* %tmp13998, i64 1
+  %tmp14000 = getelementptr inbounds float* %tmp13999, i64 1
+  %tmp14001 = getelementptr inbounds float* %tmp14000, i64 1
+  %tmp14002 = getelementptr inbounds float* %tmp14001, i64 1
+  %tmp14003 = getelementptr inbounds float* %tmp14002, i64 1
+  %tmp14004 = getelementptr inbounds float* %tmp14003, i64 1
+  %tmp14005 = getelementptr inbounds float* %tmp14004, i64 1
+  %tmp14006 = getelementptr inbounds float* %tmp14005, i64 1
+  %tmp14007 = getelementptr inbounds float* %tmp14006, i64 1
+  %tmp14008 = getelementptr inbounds float* %tmp14007, i64 1
+  %tmp14009 = getelementptr inbounds float* %tmp14008, i64 1
+  %tmp14010 = getelementptr inbounds float* %tmp14009, i64 1
+  %tmp14011 = getelementptr inbounds float* %tmp14010, i64 1
+  %tmp14012 = getelementptr inbounds float* %tmp14011, i64 1
+  %tmp14013 = getelementptr inbounds float* %tmp14012, i64 1
+  %tmp14014 = getelementptr inbounds float* %tmp14013, i64 1
+  %tmp14015 = getelementptr inbounds float* %tmp14014, i64 1
+  %tmp14016 = getelementptr inbounds float* %tmp14015, i64 1
+  %tmp14017 = getelementptr inbounds float* %tmp14016, i64 1
+  %tmp14018 = getelementptr inbounds float* %tmp14017, i64 1
+  %tmp14019 = getelementptr inbounds float* %tmp14018, i64 1
+  %tmp14020 = getelementptr inbounds float* %tmp14019, i64 1
+  %tmp14021 = getelementptr inbounds float* %tmp14020, i64 1
+  %tmp14022 = getelementptr inbounds float* %tmp14021, i64 1
+  %tmp14023 = getelementptr inbounds float* %tmp14022, i64 1
+  %tmp14024 = getelementptr inbounds float* %tmp14023, i64 1
+  %tmp14025 = getelementptr inbounds float* %tmp14024, i64 1
+  %tmp14026 = getelementptr inbounds float* %tmp14025, i64 1
+  %tmp14027 = getelementptr inbounds float* %tmp14026, i64 1
+  %tmp14028 = getelementptr inbounds float* %tmp14027, i64 1
+  %tmp14029 = getelementptr inbounds float* %tmp14028, i64 1
+  %tmp14030 = getelementptr inbounds float* %tmp14029, i64 1
+  %tmp14031 = getelementptr inbounds float* %tmp14030, i64 1
+  %tmp14032 = getelementptr inbounds float* %tmp14031, i64 1
+  %tmp14033 = getelementptr inbounds float* %tmp14032, i64 1
+  %tmp14034 = getelementptr inbounds float* %tmp14033, i64 1
+  %tmp14035 = getelementptr inbounds float* %tmp14034, i64 1
+  %tmp14036 = getelementptr inbounds float* %tmp14035, i64 1
+  %tmp14037 = getelementptr inbounds float* %tmp14036, i64 1
+  %tmp14038 = getelementptr inbounds float* %tmp14037, i64 1
+  %tmp14039 = getelementptr inbounds float* %tmp14038, i64 1
+  %tmp14040 = getelementptr inbounds float* %tmp14039, i64 1
+  %tmp14041 = getelementptr inbounds float* %tmp14040, i64 1
+  %tmp14042 = getelementptr inbounds float* %tmp14041, i64 1
+  %tmp14043 = getelementptr inbounds float* %tmp14042, i64 1
+  %tmp14044 = getelementptr inbounds float* %tmp14043, i64 1
+  %tmp14045 = getelementptr inbounds float* %tmp14044, i64 1
+  %tmp14046 = getelementptr inbounds float* %tmp14045, i64 1
+  %tmp14047 = getelementptr inbounds float* %tmp14046, i64 1
+  %tmp14048 = getelementptr inbounds float* %tmp14047, i64 1
+  %tmp14049 = getelementptr inbounds float* %tmp14048, i64 1
+  %tmp14050 = getelementptr inbounds float* %tmp14049, i64 1
+  %tmp14051 = getelementptr inbounds float* %tmp14050, i64 1
+  %tmp14052 = getelementptr inbounds float* %tmp14051, i64 1
+  %tmp14053 = getelementptr inbounds float* %tmp14052, i64 1
+  %tmp14054 = getelementptr inbounds float* %tmp14053, i64 1
+  %tmp14055 = getelementptr inbounds float* %tmp14054, i64 1
+  %tmp14056 = getelementptr inbounds float* %tmp14055, i64 1
+  %tmp14057 = getelementptr inbounds float* %tmp14056, i64 1
+  %tmp14058 = getelementptr inbounds float* %tmp14057, i64 1
+  %tmp14059 = getelementptr inbounds float* %tmp14058, i64 1
+  %tmp14060 = getelementptr inbounds float* %tmp14059, i64 1
+  %tmp14061 = getelementptr inbounds float* %tmp14060, i64 1
+  %tmp14062 = getelementptr inbounds float* %tmp14061, i64 1
+  %tmp14063 = getelementptr inbounds float* %tmp14062, i64 1
+  %tmp14064 = getelementptr inbounds float* %tmp14063, i64 1
+  %tmp14065 = getelementptr inbounds float* %tmp14064, i64 1
+  %tmp14066 = getelementptr inbounds float* %tmp14065, i64 1
+  %tmp14067 = getelementptr inbounds float* %tmp14066, i64 1
+  %tmp14068 = getelementptr inbounds float* %tmp14067, i64 1
+  %tmp14069 = getelementptr inbounds float* %tmp14068, i64 1
+  %tmp14070 = getelementptr inbounds float* %tmp14069, i64 1
+  %tmp14071 = getelementptr inbounds float* %tmp14070, i64 1
+  %tmp14072 = getelementptr inbounds float* %tmp14071, i64 1
+  %tmp14073 = getelementptr inbounds float* %tmp14072, i64 1
+  %tmp14074 = getelementptr inbounds float* %tmp14073, i64 1
+  %tmp14075 = getelementptr inbounds float* %tmp14074, i64 1
+  %tmp14076 = getelementptr inbounds float* %tmp14075, i64 1
+  %tmp14077 = getelementptr inbounds float* %tmp14076, i64 1
+  %tmp14078 = getelementptr inbounds float* %tmp14077, i64 1
+  %tmp14079 = getelementptr inbounds float* %tmp14078, i64 1
+  %tmp14080 = getelementptr inbounds float* %tmp14079, i64 1
+  %tmp14081 = getelementptr inbounds float* %tmp14080, i64 1
+  %tmp14082 = getelementptr inbounds float* %tmp14081, i64 1
+  %tmp14083 = getelementptr inbounds float* %tmp14082, i64 1
+  %tmp14084 = getelementptr inbounds float* %tmp14083, i64 1
+  %tmp14085 = getelementptr inbounds float* %tmp14084, i64 1
+  %tmp14086 = getelementptr inbounds float* %tmp14085, i64 1
+  %tmp14087 = getelementptr inbounds float* %tmp14086, i64 1
+  %tmp14088 = getelementptr inbounds float* %tmp14087, i64 1
+  %tmp14089 = getelementptr inbounds float* %tmp14088, i64 1
+  %tmp14090 = getelementptr inbounds float* %tmp14089, i64 1
+  %tmp14091 = getelementptr inbounds float* %tmp14090, i64 1
+  %tmp14092 = getelementptr inbounds float* %tmp14091, i64 1
+  %tmp14093 = getelementptr inbounds float* %tmp14092, i64 1
+  %tmp14094 = getelementptr inbounds float* %tmp14093, i64 1
+  %tmp14095 = getelementptr inbounds float* %tmp14094, i64 1
+  %tmp14096 = getelementptr inbounds float* %tmp14095, i64 1
+  %tmp14097 = getelementptr inbounds float* %tmp14096, i64 1
+  %tmp14098 = getelementptr inbounds float* %tmp14097, i64 1
+  %tmp14099 = getelementptr inbounds float* %tmp14098, i64 1
+  %tmp14100 = getelementptr inbounds float* %tmp14099, i64 1
+  %tmp14101 = getelementptr inbounds float* %tmp14100, i64 1
+  %tmp14102 = getelementptr inbounds float* %tmp14101, i64 1
+  %tmp14103 = getelementptr inbounds float* %tmp14102, i64 1
+  %tmp14104 = getelementptr inbounds float* %tmp14103, i64 1
+  %tmp14105 = getelementptr inbounds float* %tmp14104, i64 1
+  %tmp14106 = getelementptr inbounds float* %tmp14105, i64 1
+  %tmp14107 = getelementptr inbounds float* %tmp14106, i64 1
+  %tmp14108 = getelementptr inbounds float* %tmp14107, i64 1
+  %tmp14109 = getelementptr inbounds float* %tmp14108, i64 1
+  %tmp14110 = getelementptr inbounds float* %tmp14109, i64 1
+  %tmp14111 = getelementptr inbounds float* %tmp14110, i64 1
+  %tmp14112 = getelementptr inbounds float* %tmp14111, i64 1
+  %tmp14113 = getelementptr inbounds float* %tmp14112, i64 1
+  %tmp14114 = getelementptr inbounds float* %tmp14113, i64 1
+  %tmp14115 = getelementptr inbounds float* %tmp14114, i64 1
+  %tmp14116 = getelementptr inbounds float* %tmp14115, i64 1
+  %tmp14117 = getelementptr inbounds float* %tmp14116, i64 1
+  %tmp14118 = getelementptr inbounds float* %tmp14117, i64 1
+  %tmp14119 = getelementptr inbounds float* %tmp14118, i64 1
+  %tmp14120 = getelementptr inbounds float* %tmp14119, i64 1
+  %tmp14121 = getelementptr inbounds float* %tmp14120, i64 1
+  %tmp14122 = getelementptr inbounds float* %tmp14121, i64 1
+  %tmp14123 = getelementptr inbounds float* %tmp14122, i64 1
+  %tmp14124 = getelementptr inbounds float* %tmp14123, i64 1
+  %tmp14125 = getelementptr inbounds float* %tmp14124, i64 1
+  %tmp14126 = getelementptr inbounds float* %tmp14125, i64 1
+  %tmp14127 = getelementptr inbounds float* %tmp14126, i64 1
+  %tmp14128 = getelementptr inbounds float* %tmp14127, i64 1
+  %tmp14129 = getelementptr inbounds float* %tmp14128, i64 1
+  %tmp14130 = getelementptr inbounds float* %tmp14129, i64 1
+  %tmp14131 = getelementptr inbounds float* %tmp14130, i64 1
+  %tmp14132 = getelementptr inbounds float* %tmp14131, i64 1
+  %tmp14133 = getelementptr inbounds float* %tmp14132, i64 1
+  %tmp14134 = getelementptr inbounds float* %tmp14133, i64 1
+  %tmp14135 = getelementptr inbounds float* %tmp14134, i64 1
+  %tmp14136 = getelementptr inbounds float* %tmp14135, i64 1
+  %tmp14137 = getelementptr inbounds float* %tmp14136, i64 1
+  %tmp14138 = getelementptr inbounds float* %tmp14137, i64 1
+  %tmp14139 = getelementptr inbounds float* %tmp14138, i64 1
+  %tmp14140 = getelementptr inbounds float* %tmp14139, i64 1
+  %tmp14141 = getelementptr inbounds float* %tmp14140, i64 1
+  %tmp14142 = getelementptr inbounds float* %tmp14141, i64 1
+  %tmp14143 = getelementptr inbounds float* %tmp14142, i64 1
+  %tmp14144 = getelementptr inbounds float* %tmp14143, i64 1
+  %tmp14145 = getelementptr inbounds float* %tmp14144, i64 1
+  %tmp14146 = getelementptr inbounds float* %tmp14145, i64 1
+  %tmp14147 = getelementptr inbounds float* %tmp14146, i64 1
+  %tmp14148 = getelementptr inbounds float* %tmp14147, i64 1
+  %tmp14149 = getelementptr inbounds float* %tmp14148, i64 1
+  %tmp14150 = getelementptr inbounds float* %tmp14149, i64 1
+  %tmp14151 = getelementptr inbounds float* %tmp14150, i64 1
+  %tmp14152 = getelementptr inbounds float* %tmp14151, i64 1
+  %tmp14153 = getelementptr inbounds float* %tmp14152, i64 1
+  %tmp14154 = getelementptr inbounds float* %tmp14153, i64 1
+  %tmp14155 = getelementptr inbounds float* %tmp14154, i64 1
+  %tmp14156 = getelementptr inbounds float* %tmp14155, i64 1
+  %tmp14157 = getelementptr inbounds float* %tmp14156, i64 1
+  %tmp14158 = getelementptr inbounds float* %tmp14157, i64 1
+  %tmp14159 = getelementptr inbounds float* %tmp14158, i64 1
+  %tmp14160 = getelementptr inbounds float* %tmp14159, i64 1
+  %tmp14161 = getelementptr inbounds float* %tmp14160, i64 1
+  %tmp14162 = getelementptr inbounds float* %tmp14161, i64 1
+  %tmp14163 = getelementptr inbounds float* %tmp14162, i64 1
+  %tmp14164 = getelementptr inbounds float* %tmp14163, i64 1
+  %tmp14165 = getelementptr inbounds float* %tmp14164, i64 1
+  %tmp14166 = getelementptr inbounds float* %tmp14165, i64 1
+  %tmp14167 = getelementptr inbounds float* %tmp14166, i64 1
+  %tmp14168 = getelementptr inbounds float* %tmp14167, i64 1
+  %tmp14169 = getelementptr inbounds float* %tmp14168, i64 1
+  %tmp14170 = getelementptr inbounds float* %tmp14169, i64 1
+  %tmp14171 = getelementptr inbounds float* %tmp14170, i64 1
+  %tmp14172 = getelementptr inbounds float* %tmp14171, i64 1
+  %tmp14173 = getelementptr inbounds float* %tmp14172, i64 1
+  %tmp14174 = getelementptr inbounds float* %tmp14173, i64 1
+  %tmp14175 = getelementptr inbounds float* %tmp14174, i64 1
+  %tmp14176 = getelementptr inbounds float* %tmp14175, i64 1
+  %tmp14177 = getelementptr inbounds float* %tmp14176, i64 1
+  %tmp14178 = getelementptr inbounds float* %tmp14177, i64 1
+  %tmp14179 = getelementptr inbounds float* %tmp14178, i64 1
+  %tmp14180 = getelementptr inbounds float* %tmp14179, i64 1
+  %tmp14181 = getelementptr inbounds float* %tmp14180, i64 1
+  %tmp14182 = getelementptr inbounds float* %tmp14181, i64 1
+  %tmp14183 = getelementptr inbounds float* %tmp14182, i64 1
+  %tmp14184 = getelementptr inbounds float* %tmp14183, i64 1
+  %tmp14185 = getelementptr inbounds float* %tmp14184, i64 1
+  %tmp14186 = getelementptr inbounds float* %tmp14185, i64 1
+  %tmp14187 = getelementptr inbounds float* %tmp14186, i64 1
+  %tmp14188 = getelementptr inbounds float* %tmp14187, i64 1
+  %tmp14189 = getelementptr inbounds float* %tmp14188, i64 1
+  %tmp14190 = getelementptr inbounds float* %tmp14189, i64 1
+  %tmp14191 = getelementptr inbounds float* %tmp14190, i64 1
+  %tmp14192 = getelementptr inbounds float* %tmp14191, i64 1
+  %tmp14193 = getelementptr inbounds float* %tmp14192, i64 1
+  %tmp14194 = getelementptr inbounds float* %tmp14193, i64 1
+  %tmp14195 = getelementptr inbounds float* %tmp14194, i64 1
+  %tmp14196 = getelementptr inbounds float* %tmp14195, i64 1
+  %tmp14197 = getelementptr inbounds float* %tmp14196, i64 1
+  %tmp14198 = getelementptr inbounds float* %tmp14197, i64 1
+  %tmp14199 = getelementptr inbounds float* %tmp14198, i64 1
+  %tmp14200 = getelementptr inbounds float* %tmp14199, i64 1
+  %tmp14201 = getelementptr inbounds float* %tmp14200, i64 1
+  %tmp14202 = getelementptr inbounds float* %tmp14201, i64 1
+  %tmp14203 = getelementptr inbounds float* %tmp14202, i64 1
+  %tmp14204 = getelementptr inbounds float* %tmp14203, i64 1
+  %tmp14205 = getelementptr inbounds float* %tmp14204, i64 1
+  %tmp14206 = getelementptr inbounds float* %tmp14205, i64 1
+  %tmp14207 = getelementptr inbounds float* %tmp14206, i64 1
+  %tmp14208 = getelementptr inbounds float* %tmp14207, i64 1
+  %tmp14209 = getelementptr inbounds float* %tmp14208, i64 1
+  %tmp14210 = getelementptr inbounds float* %tmp14209, i64 1
+  %tmp14211 = getelementptr inbounds float* %tmp14210, i64 1
+  %tmp14212 = getelementptr inbounds float* %tmp14211, i64 1
+  %tmp14213 = getelementptr inbounds float* %tmp14212, i64 1
+  %tmp14214 = getelementptr inbounds float* %tmp14213, i64 1
+  %tmp14215 = getelementptr inbounds float* %tmp14214, i64 1
+  %tmp14216 = getelementptr inbounds float* %tmp14215, i64 1
+  %tmp14217 = getelementptr inbounds float* %tmp14216, i64 1
+  %tmp14218 = getelementptr inbounds float* %tmp14217, i64 1
+  %tmp14219 = getelementptr inbounds float* %tmp14218, i64 1
+  %tmp14220 = getelementptr inbounds float* %tmp14219, i64 1
+  %tmp14221 = getelementptr inbounds float* %tmp14220, i64 1
+  %tmp14222 = getelementptr inbounds float* %tmp14221, i64 1
+  %tmp14223 = getelementptr inbounds float* %tmp14222, i64 1
+  %tmp14224 = getelementptr inbounds float* %tmp14223, i64 1
+  %tmp14225 = getelementptr inbounds float* %tmp14224, i64 1
+  %tmp14226 = getelementptr inbounds float* %tmp14225, i64 1
+  %tmp14227 = getelementptr inbounds float* %tmp14226, i64 1
+  %tmp14228 = getelementptr inbounds float* %tmp14227, i64 1
+  %tmp14229 = getelementptr inbounds float* %tmp14228, i64 1
+  %tmp14230 = getelementptr inbounds float* %tmp14229, i64 1
+  %tmp14231 = getelementptr inbounds float* %tmp14230, i64 1
+  %tmp14232 = getelementptr inbounds float* %tmp14231, i64 1
+  %tmp14233 = getelementptr inbounds float* %tmp14232, i64 1
+  %tmp14234 = getelementptr inbounds float* %tmp14233, i64 1
+  %tmp14235 = getelementptr inbounds float* %tmp14234, i64 1
+  %tmp14236 = getelementptr inbounds float* %tmp14235, i64 1
+  %tmp14237 = getelementptr inbounds float* %tmp14236, i64 1
+  %tmp14238 = getelementptr inbounds float* %tmp14237, i64 1
+  %tmp14239 = getelementptr inbounds float* %tmp14238, i64 1
+  %tmp14240 = getelementptr inbounds float* %tmp14239, i64 1
+  %tmp14241 = getelementptr inbounds float* %tmp14240, i64 1
+  %tmp14242 = getelementptr inbounds float* %tmp14241, i64 1
+  %tmp14243 = getelementptr inbounds float* %tmp14242, i64 1
+  %tmp14244 = getelementptr inbounds float* %tmp14243, i64 1
+  %tmp14245 = getelementptr inbounds float* %tmp14244, i64 1
+  %tmp14246 = getelementptr inbounds float* %tmp14245, i64 1
+  %tmp14247 = getelementptr inbounds float* %tmp14246, i64 1
+  %tmp14248 = getelementptr inbounds float* %tmp14247, i64 1
+  %tmp14249 = getelementptr inbounds float* %tmp14248, i64 1
+  %tmp14250 = getelementptr inbounds float* %tmp14249, i64 1
+  %tmp14251 = getelementptr inbounds float* %tmp14250, i64 1
+  %tmp14252 = getelementptr inbounds float* %tmp14251, i64 1
+  %tmp14253 = getelementptr inbounds float* %tmp14252, i64 1
+  %tmp14254 = getelementptr inbounds float* %tmp14253, i64 1
+  %tmp14255 = getelementptr inbounds float* %tmp14254, i64 1
+  %tmp14256 = getelementptr inbounds float* %tmp14255, i64 1
+  %tmp14257 = getelementptr inbounds float* %tmp14256, i64 1
+  %tmp14258 = getelementptr inbounds float* %tmp14257, i64 1
+  %tmp14259 = getelementptr inbounds float* %tmp14258, i64 1
+  %tmp14260 = getelementptr inbounds float* %tmp14259, i64 1
+  %tmp14261 = getelementptr inbounds float* %tmp14260, i64 1
+  %tmp14262 = getelementptr inbounds float* %tmp14261, i64 1
+  %tmp14263 = getelementptr inbounds float* %tmp14262, i64 1
+  %tmp14264 = getelementptr inbounds float* %tmp14263, i64 1
+  %tmp14265 = getelementptr inbounds float* %tmp14264, i64 1
+  %tmp14266 = getelementptr inbounds float* %tmp14265, i64 1
+  %tmp14267 = getelementptr inbounds float* %tmp14266, i64 1
+  %tmp14268 = getelementptr inbounds float* %tmp14267, i64 1
+  %tmp14269 = getelementptr inbounds float* %tmp14268, i64 1
+  %tmp14270 = getelementptr inbounds float* %tmp14269, i64 1
+  %tmp14271 = getelementptr inbounds float* %tmp14270, i64 1
+  %tmp14272 = getelementptr inbounds float* %tmp14271, i64 1
+  %tmp14273 = getelementptr inbounds float* %tmp14272, i64 1
+  %tmp14274 = getelementptr inbounds float* %tmp14273, i64 1
+  %tmp14275 = getelementptr inbounds float* %tmp14274, i64 1
+  %tmp14276 = getelementptr inbounds float* %tmp14275, i64 1
+  %tmp14277 = getelementptr inbounds float* %tmp14276, i64 1
+  %tmp14278 = getelementptr inbounds float* %tmp14277, i64 1
+  %tmp14279 = getelementptr inbounds float* %tmp14278, i64 1
+  %tmp14280 = getelementptr inbounds float* %tmp14279, i64 1
+  %tmp14281 = getelementptr inbounds float* %tmp14280, i64 1
+  %tmp14282 = getelementptr inbounds float* %tmp14281, i64 1
+  %tmp14283 = getelementptr inbounds float* %tmp14282, i64 1
+  %tmp14284 = getelementptr inbounds float* %tmp14283, i64 1
+  %tmp14285 = getelementptr inbounds float* %tmp14284, i64 1
+  %tmp14286 = getelementptr inbounds float* %tmp14285, i64 1
+  %tmp14287 = getelementptr inbounds float* %tmp14286, i64 1
+  %tmp14288 = getelementptr inbounds float* %tmp14287, i64 1
+  %tmp14289 = getelementptr inbounds float* %tmp14288, i64 1
+  %tmp14290 = getelementptr inbounds float* %tmp14289, i64 1
+  %tmp14291 = getelementptr inbounds float* %tmp14290, i64 1
+  %tmp14292 = getelementptr inbounds float* %tmp14291, i64 1
+  %tmp14293 = getelementptr inbounds float* %tmp14292, i64 1
+  %tmp14294 = getelementptr inbounds float* %tmp14293, i64 1
+  %tmp14295 = getelementptr inbounds float* %tmp14294, i64 1
+  %tmp14296 = getelementptr inbounds float* %tmp14295, i64 1
+  %tmp14297 = getelementptr inbounds float* %tmp14296, i64 1
+  %tmp14298 = getelementptr inbounds float* %tmp14297, i64 1
+  %tmp14299 = getelementptr inbounds float* %tmp14298, i64 1
+  %tmp14300 = getelementptr inbounds float* %tmp14299, i64 1
+  %tmp14301 = getelementptr inbounds float* %tmp14300, i64 1
+  %tmp14302 = getelementptr inbounds float* %tmp14301, i64 1
+  %tmp14303 = getelementptr inbounds float* %tmp14302, i64 1
+  %tmp14304 = getelementptr inbounds float* %tmp14303, i64 1
+  %tmp14305 = getelementptr inbounds float* %tmp14304, i64 1
+  %tmp14306 = getelementptr inbounds float* %tmp14305, i64 1
+  %tmp14307 = getelementptr inbounds float* %tmp14306, i64 1
+  %tmp14308 = getelementptr inbounds float* %tmp14307, i64 1
+  %tmp14309 = getelementptr inbounds float* %tmp14308, i64 1
+  %tmp14310 = getelementptr inbounds float* %tmp14309, i64 1
+  %tmp14311 = getelementptr inbounds float* %tmp14310, i64 1
+  %tmp14312 = getelementptr inbounds float* %tmp14311, i64 1
+  %tmp14313 = getelementptr inbounds float* %tmp14312, i64 1
+  %tmp14314 = getelementptr inbounds float* %tmp14313, i64 1
+  %tmp14315 = getelementptr inbounds float* %tmp14314, i64 1
+  %tmp14316 = getelementptr inbounds float* %tmp14315, i64 1
+  %tmp14317 = getelementptr inbounds float* %tmp14316, i64 1
+  %tmp14318 = getelementptr inbounds float* %tmp14317, i64 1
+  %tmp14319 = getelementptr inbounds float* %tmp14318, i64 1
+  %tmp14320 = getelementptr inbounds float* %tmp14319, i64 1
+  %tmp14321 = getelementptr inbounds float* %tmp14320, i64 1
+  %tmp14322 = getelementptr inbounds float* %tmp14321, i64 1
+  %tmp14323 = getelementptr inbounds float* %tmp14322, i64 1
+  %tmp14324 = getelementptr inbounds float* %tmp14323, i64 1
+  %tmp14325 = getelementptr inbounds float* %tmp14324, i64 1
+  %tmp14326 = getelementptr inbounds float* %tmp14325, i64 1
+  %tmp14327 = getelementptr inbounds float* %tmp14326, i64 1
+  %tmp14328 = getelementptr inbounds float* %tmp14327, i64 1
+  %tmp14329 = getelementptr inbounds float* %tmp14328, i64 1
+  %tmp14330 = getelementptr inbounds float* %tmp14329, i64 1
+  %tmp14331 = getelementptr inbounds float* %tmp14330, i64 1
+  %tmp14332 = getelementptr inbounds float* %tmp14331, i64 1
+  %tmp14333 = getelementptr inbounds float* %tmp14332, i64 1
+  %tmp14334 = getelementptr inbounds float* %tmp14333, i64 1
+  %tmp14335 = getelementptr inbounds float* %tmp14334, i64 1
+  %tmp14336 = getelementptr inbounds float* %tmp14335, i64 1
+  %tmp14337 = getelementptr inbounds float* %tmp14336, i64 1
+  %tmp14338 = getelementptr inbounds float* %tmp14337, i64 1
+  %tmp14339 = getelementptr inbounds float* %tmp14338, i64 1
+  %tmp14340 = getelementptr inbounds float* %tmp14339, i64 1
+  %tmp14341 = getelementptr inbounds float* %tmp14340, i64 1
+  %tmp14342 = getelementptr inbounds float* %tmp14341, i64 1
+  %tmp14343 = getelementptr inbounds float* %tmp14342, i64 1
+  %tmp14344 = getelementptr inbounds float* %tmp14343, i64 1
+  %tmp14345 = getelementptr inbounds float* %tmp14344, i64 1
+  %tmp14346 = getelementptr inbounds float* %tmp14345, i64 1
+  %tmp14347 = getelementptr inbounds float* %tmp14346, i64 1
+  %tmp14348 = getelementptr inbounds float* %tmp14347, i64 1
+  %tmp14349 = getelementptr inbounds float* %tmp14348, i64 1
+  %tmp14350 = getelementptr inbounds float* %tmp14349, i64 1
+  %tmp14351 = getelementptr inbounds float* %tmp14350, i64 1
+  %tmp14352 = getelementptr inbounds float* %tmp14351, i64 1
+  %tmp14353 = getelementptr inbounds float* %tmp14352, i64 1
+  %tmp14354 = getelementptr inbounds float* %tmp14353, i64 1
+  %tmp14355 = getelementptr inbounds float* %tmp14354, i64 1
+  %tmp14356 = getelementptr inbounds float* %tmp14355, i64 1
+  %tmp14357 = getelementptr inbounds float* %tmp14356, i64 1
+  %tmp14358 = getelementptr inbounds float* %tmp14357, i64 1
+  %tmp14359 = getelementptr inbounds float* %tmp14358, i64 1
+  %tmp14360 = getelementptr inbounds float* %tmp14359, i64 1
+  %tmp14361 = getelementptr inbounds float* %tmp14360, i64 1
+  %tmp14362 = getelementptr inbounds float* %tmp14361, i64 1
+  %tmp14363 = getelementptr inbounds float* %tmp14362, i64 1
+  %tmp14364 = getelementptr inbounds float* %tmp14363, i64 1
+  %tmp14365 = getelementptr inbounds float* %tmp14364, i64 1
+  %tmp14366 = getelementptr inbounds float* %tmp14365, i64 1
+  %tmp14367 = getelementptr inbounds float* %tmp14366, i64 1
+  %tmp14368 = getelementptr inbounds float* %tmp14367, i64 1
+  %tmp14369 = getelementptr inbounds float* %tmp14368, i64 1
+  %tmp14370 = getelementptr inbounds float* %tmp14369, i64 1
+  %tmp14371 = getelementptr inbounds float* %tmp14370, i64 1
+  %tmp14372 = getelementptr inbounds float* %tmp14371, i64 1
+  %tmp14373 = getelementptr inbounds float* %tmp14372, i64 1
+  %tmp14374 = getelementptr inbounds float* %tmp14373, i64 1
+  %tmp14375 = getelementptr inbounds float* %tmp14374, i64 1
+  %tmp14376 = getelementptr inbounds float* %tmp14375, i64 1
+  %tmp14377 = getelementptr inbounds float* %tmp14376, i64 1
+  %tmp14378 = getelementptr inbounds float* %tmp14377, i64 1
+  %tmp14379 = getelementptr inbounds float* %tmp14378, i64 1
+  %tmp14380 = getelementptr inbounds float* %tmp14379, i64 1
+  %tmp14381 = getelementptr inbounds float* %tmp14380, i64 1
+  %tmp14382 = getelementptr inbounds float* %tmp14381, i64 1
+  %tmp14383 = getelementptr inbounds float* %tmp14382, i64 1
+  %tmp14384 = getelementptr inbounds float* %tmp14383, i64 1
+  %tmp14385 = getelementptr inbounds float* %tmp14384, i64 1
+  %tmp14386 = getelementptr inbounds float* %tmp14385, i64 1
+  %tmp14387 = getelementptr inbounds float* %tmp14386, i64 1
+  %tmp14388 = getelementptr inbounds float* %tmp14387, i64 1
+  %tmp14389 = getelementptr inbounds float* %tmp14388, i64 1
+  %tmp14390 = getelementptr inbounds float* %tmp14389, i64 1
+  %tmp14391 = getelementptr inbounds float* %tmp14390, i64 1
+  %tmp14392 = getelementptr inbounds float* %tmp14391, i64 1
+  %tmp14393 = getelementptr inbounds float* %tmp14392, i64 1
+  %tmp14394 = getelementptr inbounds float* %tmp14393, i64 1
+  %tmp14395 = getelementptr inbounds float* %tmp14394, i64 1
+  %tmp14396 = getelementptr inbounds float* %tmp14395, i64 1
+  %tmp14397 = getelementptr inbounds float* %tmp14396, i64 1
+  %tmp14398 = getelementptr inbounds float* %tmp14397, i64 1
+  %tmp14399 = getelementptr inbounds float* %tmp14398, i64 1
+  %tmp14400 = getelementptr inbounds float* %tmp14399, i64 1
+  %tmp14401 = getelementptr inbounds float* %tmp14400, i64 1
+  %tmp14402 = getelementptr inbounds float* %tmp14401, i64 1
+  %tmp14403 = getelementptr inbounds float* %tmp14402, i64 1
+  %tmp14404 = getelementptr inbounds float* %tmp14403, i64 1
+  %tmp14405 = getelementptr inbounds float* %tmp14404, i64 1
+  %tmp14406 = getelementptr inbounds float* %tmp14405, i64 1
+  %tmp14407 = getelementptr inbounds float* %tmp14406, i64 1
+  %tmp14408 = getelementptr inbounds float* %tmp14407, i64 1
+  %tmp14409 = getelementptr inbounds float* %tmp14408, i64 1
+  %tmp14410 = getelementptr inbounds float* %tmp14409, i64 1
+  %tmp14411 = getelementptr inbounds float* %tmp14410, i64 1
+  %tmp14412 = getelementptr inbounds float* %tmp14411, i64 1
+  %tmp14413 = getelementptr inbounds float* %tmp14412, i64 1
+  %tmp14414 = getelementptr inbounds float* %tmp14413, i64 1
+  %tmp14415 = getelementptr inbounds float* %tmp14414, i64 1
+  %tmp14416 = getelementptr inbounds float* %tmp14415, i64 1
+  %tmp14417 = getelementptr inbounds float* %tmp14416, i64 1
+  %tmp14418 = getelementptr inbounds float* %tmp14417, i64 1
+  %tmp14419 = getelementptr inbounds float* %tmp14418, i64 1
+  %tmp14420 = getelementptr inbounds float* %tmp14419, i64 1
+  %tmp14421 = getelementptr inbounds float* %tmp14420, i64 1
+  %tmp14422 = getelementptr inbounds float* %tmp14421, i64 1
+  %tmp14423 = getelementptr inbounds float* %tmp14422, i64 1
+  %tmp14424 = getelementptr inbounds float* %tmp14423, i64 1
+  %tmp14425 = getelementptr inbounds float* %tmp14424, i64 1
+  %tmp14426 = getelementptr inbounds float* %tmp14425, i64 1
+  %tmp14427 = getelementptr inbounds float* %tmp14426, i64 1
+  %tmp14428 = getelementptr inbounds float* %tmp14427, i64 1
+  %tmp14429 = getelementptr inbounds float* %tmp14428, i64 1
+  %tmp14430 = getelementptr inbounds float* %tmp14429, i64 1
+  %tmp14431 = getelementptr inbounds float* %tmp14430, i64 1
+  %tmp14432 = getelementptr inbounds float* %tmp14431, i64 1
+  %tmp14433 = getelementptr inbounds float* %tmp14432, i64 1
+  %tmp14434 = getelementptr inbounds float* %tmp14433, i64 1
+  %tmp14435 = getelementptr inbounds float* %tmp14434, i64 1
+  %tmp14436 = getelementptr inbounds float* %tmp14435, i64 1
+  %tmp14437 = getelementptr inbounds float* %tmp14436, i64 1
+  %tmp14438 = getelementptr inbounds float* %tmp14437, i64 1
+  %tmp14439 = getelementptr inbounds float* %tmp14438, i64 1
+  %tmp14440 = getelementptr inbounds float* %tmp14439, i64 1
+  %tmp14441 = getelementptr inbounds float* %tmp14440, i64 1
+  %tmp14442 = getelementptr inbounds float* %tmp14441, i64 1
+  %tmp14443 = getelementptr inbounds float* %tmp14442, i64 1
+  %tmp14444 = getelementptr inbounds float* %tmp14443, i64 1
+  %tmp14445 = getelementptr inbounds float* %tmp14444, i64 1
+  %tmp14446 = getelementptr inbounds float* %tmp14445, i64 1
+  %tmp14447 = getelementptr inbounds float* %tmp14446, i64 1
+  %tmp14448 = getelementptr inbounds float* %tmp14447, i64 1
+  %tmp14449 = getelementptr inbounds float* %tmp14448, i64 1
+  %tmp14450 = getelementptr inbounds float* %tmp14449, i64 1
+  %tmp14451 = getelementptr inbounds float* %tmp14450, i64 1
+  %tmp14452 = getelementptr inbounds float* %tmp14451, i64 1
+  %tmp14453 = getelementptr inbounds float* %tmp14452, i64 1
+  %tmp14454 = getelementptr inbounds float* %tmp14453, i64 1
+  %tmp14455 = getelementptr inbounds float* %tmp14454, i64 1
+  %tmp14456 = getelementptr inbounds float* %tmp14455, i64 1
+  %tmp14457 = getelementptr inbounds float* %tmp14456, i64 1
+  %tmp14458 = getelementptr inbounds float* %tmp14457, i64 1
+  %tmp14459 = getelementptr inbounds float* %tmp14458, i64 1
+  %tmp14460 = getelementptr inbounds float* %tmp14459, i64 1
+  %tmp14461 = getelementptr inbounds float* %tmp14460, i64 1
+  %tmp14462 = getelementptr inbounds float* %tmp14461, i64 1
+  %tmp14463 = getelementptr inbounds float* %tmp14462, i64 1
+  %tmp14464 = getelementptr inbounds float* %tmp14463, i64 1
+  %tmp14465 = getelementptr inbounds float* %tmp14464, i64 1
+  %tmp14466 = getelementptr inbounds float* %tmp14465, i64 1
+  %tmp14467 = getelementptr inbounds float* %tmp14466, i64 1
+  %tmp14468 = getelementptr inbounds float* %tmp14467, i64 1
+  %tmp14469 = getelementptr inbounds float* %tmp14468, i64 1
+  %tmp14470 = getelementptr inbounds float* %tmp14469, i64 1
+  %tmp14471 = getelementptr inbounds float* %tmp14470, i64 1
+  %tmp14472 = getelementptr inbounds float* %tmp14471, i64 1
+  %tmp14473 = getelementptr inbounds float* %tmp14472, i64 1
+  %tmp14474 = getelementptr inbounds float* %tmp14473, i64 1
+  %tmp14475 = getelementptr inbounds float* %tmp14474, i64 1
+  %tmp14476 = getelementptr inbounds float* %tmp14475, i64 1
+  %tmp14477 = getelementptr inbounds float* %tmp14476, i64 1
+  %tmp14478 = getelementptr inbounds float* %tmp14477, i64 1
+  %tmp14479 = getelementptr inbounds float* %tmp14478, i64 1
+  %tmp14480 = getelementptr inbounds float* %tmp14479, i64 1
+  %tmp14481 = getelementptr inbounds float* %tmp14480, i64 1
+  %tmp14482 = getelementptr inbounds float* %tmp14481, i64 1
+  %tmp14483 = getelementptr inbounds float* %tmp14482, i64 1
+  %tmp14484 = getelementptr inbounds float* %tmp14483, i64 1
+  %tmp14485 = getelementptr inbounds float* %tmp14484, i64 1
+  %tmp14486 = getelementptr inbounds float* %tmp14485, i64 1
+  %tmp14487 = getelementptr inbounds float* %tmp14486, i64 1
+  %tmp14488 = getelementptr inbounds float* %tmp14487, i64 1
+  %tmp14489 = getelementptr inbounds float* %tmp14488, i64 1
+  %tmp14490 = getelementptr inbounds float* %tmp14489, i64 1
+  %tmp14491 = getelementptr inbounds float* %tmp14490, i64 1
+  %tmp14492 = getelementptr inbounds float* %tmp14491, i64 1
+  %tmp14493 = getelementptr inbounds float* %tmp14492, i64 1
+  %tmp14494 = getelementptr inbounds float* %tmp14493, i64 1
+  %tmp14495 = getelementptr inbounds float* %tmp14494, i64 1
+  %tmp14496 = getelementptr inbounds float* %tmp14495, i64 1
+  %tmp14497 = getelementptr inbounds float* %tmp14496, i64 1
+  %tmp14498 = getelementptr inbounds float* %tmp14497, i64 1
+  %tmp14499 = getelementptr inbounds float* %tmp14498, i64 1
+  %tmp14500 = getelementptr inbounds float* %tmp14499, i64 1
+  %tmp14501 = getelementptr inbounds float* %tmp14500, i64 1
+  %tmp14502 = getelementptr inbounds float* %tmp14501, i64 1
+  %tmp14503 = getelementptr inbounds float* %tmp14502, i64 1
+  %tmp14504 = getelementptr inbounds float* %tmp14503, i64 1
+  %tmp14505 = getelementptr inbounds float* %tmp14504, i64 1
+  %tmp14506 = getelementptr inbounds float* %tmp14505, i64 1
+  %tmp14507 = getelementptr inbounds float* %tmp14506, i64 1
+  %tmp14508 = getelementptr inbounds float* %tmp14507, i64 1
+  %tmp14509 = getelementptr inbounds float* %tmp14508, i64 1
+  %tmp14510 = getelementptr inbounds float* %tmp14509, i64 1
+  %tmp14511 = getelementptr inbounds float* %tmp14510, i64 1
+  %tmp14512 = getelementptr inbounds float* %tmp14511, i64 1
+  %tmp14513 = getelementptr inbounds float* %tmp14512, i64 1
+  %tmp14514 = getelementptr inbounds float* %tmp14513, i64 1
+  %tmp14515 = getelementptr inbounds float* %tmp14514, i64 1
+  %tmp14516 = getelementptr inbounds float* %tmp14515, i64 1
+  %tmp14517 = getelementptr inbounds float* %tmp14516, i64 1
+  %tmp14518 = getelementptr inbounds float* %tmp14517, i64 1
+  %tmp14519 = getelementptr inbounds float* %tmp14518, i64 1
+  %tmp14520 = getelementptr inbounds float* %tmp14519, i64 1
+  %tmp14521 = getelementptr inbounds float* %tmp14520, i64 1
+  %tmp14522 = getelementptr inbounds float* %tmp14521, i64 1
+  %tmp14523 = getelementptr inbounds float* %tmp14522, i64 1
+  %tmp14524 = getelementptr inbounds float* %tmp14523, i64 1
+  %tmp14525 = getelementptr inbounds float* %tmp14524, i64 1
+  %tmp14526 = getelementptr inbounds float* %tmp14525, i64 1
+  %tmp14527 = getelementptr inbounds float* %tmp14526, i64 1
+  %tmp14528 = getelementptr inbounds float* %tmp14527, i64 1
+  %tmp14529 = getelementptr inbounds float* %tmp14528, i64 1
+  %tmp14530 = getelementptr inbounds float* %tmp14529, i64 1
+  %tmp14531 = getelementptr inbounds float* %tmp14530, i64 1
+  %tmp14532 = getelementptr inbounds float* %tmp14531, i64 1
+  %tmp14533 = getelementptr inbounds float* %tmp14532, i64 1
+  %tmp14534 = getelementptr inbounds float* %tmp14533, i64 1
+  %tmp14535 = getelementptr inbounds float* %tmp14534, i64 1
+  %tmp14536 = getelementptr inbounds float* %tmp14535, i64 1
+  %tmp14537 = getelementptr inbounds float* %tmp14536, i64 1
+  %tmp14538 = getelementptr inbounds float* %tmp14537, i64 1
+  %tmp14539 = getelementptr inbounds float* %tmp14538, i64 1
+  %tmp14540 = getelementptr inbounds float* %tmp14539, i64 1
+  %tmp14541 = getelementptr inbounds float* %tmp14540, i64 1
+  %tmp14542 = getelementptr inbounds float* %tmp14541, i64 1
+  %tmp14543 = getelementptr inbounds float* %tmp14542, i64 1
+  %tmp14544 = getelementptr inbounds float* %tmp14543, i64 1
+  %tmp14545 = getelementptr inbounds float* %tmp14544, i64 1
+  %tmp14546 = getelementptr inbounds float* %tmp14545, i64 1
+  %tmp14547 = getelementptr inbounds float* %tmp14546, i64 1
+  %tmp14548 = getelementptr inbounds float* %tmp14547, i64 1
+  %tmp14549 = getelementptr inbounds float* %tmp14548, i64 1
+  %tmp14550 = getelementptr inbounds float* %tmp14549, i64 1
+  %tmp14551 = getelementptr inbounds float* %tmp14550, i64 1
+  %tmp14552 = getelementptr inbounds float* %tmp14551, i64 1
+  %tmp14553 = getelementptr inbounds float* %tmp14552, i64 1
+  %tmp14554 = getelementptr inbounds float* %tmp14553, i64 1
+  %tmp14555 = getelementptr inbounds float* %tmp14554, i64 1
+  %tmp14556 = getelementptr inbounds float* %tmp14555, i64 1
+  %tmp14557 = getelementptr inbounds float* %tmp14556, i64 1
+  %tmp14558 = getelementptr inbounds float* %tmp14557, i64 1
+  %tmp14559 = getelementptr inbounds float* %tmp14558, i64 1
+  %tmp14560 = getelementptr inbounds float* %tmp14559, i64 1
+  %tmp14561 = getelementptr inbounds float* %tmp14560, i64 1
+  %tmp14562 = getelementptr inbounds float* %tmp14561, i64 1
+  %tmp14563 = getelementptr inbounds float* %tmp14562, i64 1
+  %tmp14564 = getelementptr inbounds float* %tmp14563, i64 1
+  %tmp14565 = getelementptr inbounds float* %tmp14564, i64 1
+  %tmp14566 = getelementptr inbounds float* %tmp14565, i64 1
+  %tmp14567 = getelementptr inbounds float* %tmp14566, i64 1
+  %tmp14568 = getelementptr inbounds float* %tmp14567, i64 1
+  %tmp14569 = getelementptr inbounds float* %tmp14568, i64 1
+  %tmp14570 = getelementptr inbounds float* %tmp14569, i64 1
+  %tmp14571 = getelementptr inbounds float* %tmp14570, i64 1
+  %tmp14572 = getelementptr inbounds float* %tmp14571, i64 1
+  %tmp14573 = getelementptr inbounds float* %tmp14572, i64 1
+  %tmp14574 = getelementptr inbounds float* %tmp14573, i64 1
+  %tmp14575 = getelementptr inbounds float* %tmp14574, i64 1
+  %tmp14576 = getelementptr inbounds float* %tmp14575, i64 1
+  %tmp14577 = getelementptr inbounds float* %tmp14576, i64 1
+  %tmp14578 = getelementptr inbounds float* %tmp14577, i64 1
+  %tmp14579 = getelementptr inbounds float* %tmp14578, i64 1
+  %tmp14580 = getelementptr inbounds float* %tmp14579, i64 1
+  %tmp14581 = getelementptr inbounds float* %tmp14580, i64 1
+  %tmp14582 = getelementptr inbounds float* %tmp14581, i64 1
+  %tmp14583 = getelementptr inbounds float* %tmp14582, i64 1
+  %tmp14584 = getelementptr inbounds float* %tmp14583, i64 1
+  %tmp14585 = getelementptr inbounds float* %tmp14584, i64 1
+  %tmp14586 = getelementptr inbounds float* %tmp14585, i64 1
+  %tmp14587 = getelementptr inbounds float* %tmp14586, i64 1
+  %tmp14588 = getelementptr inbounds float* %tmp14587, i64 1
+  %tmp14589 = getelementptr inbounds float* %tmp14588, i64 1
+  %tmp14590 = getelementptr inbounds float* %tmp14589, i64 1
+  %tmp14591 = getelementptr inbounds float* %tmp14590, i64 1
+  %tmp14592 = getelementptr inbounds float* %tmp14591, i64 1
+  %tmp14593 = getelementptr inbounds float* %tmp14592, i64 1
+  %tmp14594 = getelementptr inbounds float* %tmp14593, i64 1
+  %tmp14595 = getelementptr inbounds float* %tmp14594, i64 1
+  %tmp14596 = getelementptr inbounds float* %tmp14595, i64 1
+  %tmp14597 = getelementptr inbounds float* %tmp14596, i64 1
+  %tmp14598 = getelementptr inbounds float* %tmp14597, i64 1
+  %tmp14599 = getelementptr inbounds float* %tmp14598, i64 1
+  %tmp14600 = getelementptr inbounds float* %tmp14599, i64 1
+  %tmp14601 = getelementptr inbounds float* %tmp14600, i64 1
+  %tmp14602 = getelementptr inbounds float* %tmp14601, i64 1
+  %tmp14603 = getelementptr inbounds float* %tmp14602, i64 1
+  %tmp14604 = getelementptr inbounds float* %tmp14603, i64 1
+  %tmp14605 = getelementptr inbounds float* %tmp14604, i64 1
+  %tmp14606 = getelementptr inbounds float* %tmp14605, i64 1
+  %tmp14607 = getelementptr inbounds float* %tmp14606, i64 1
+  %tmp14608 = getelementptr inbounds float* %tmp14607, i64 1
+  %tmp14609 = getelementptr inbounds float* %tmp14608, i64 1
+  %tmp14610 = getelementptr inbounds float* %tmp14609, i64 1
+  %tmp14611 = getelementptr inbounds float* %tmp14610, i64 1
+  %tmp14612 = getelementptr inbounds float* %tmp14611, i64 1
+  %tmp14613 = getelementptr inbounds float* %tmp14612, i64 1
+  %tmp14614 = getelementptr inbounds float* %tmp14613, i64 1
+  %tmp14615 = getelementptr inbounds float* %tmp14614, i64 1
+  %tmp14616 = getelementptr inbounds float* %tmp14615, i64 1
+  %tmp14617 = getelementptr inbounds float* %tmp14616, i64 1
+  %tmp14618 = getelementptr inbounds float* %tmp14617, i64 1
+  %tmp14619 = getelementptr inbounds float* %tmp14618, i64 1
+  %tmp14620 = getelementptr inbounds float* %tmp14619, i64 1
+  %tmp14621 = getelementptr inbounds float* %tmp14620, i64 1
+  %tmp14622 = getelementptr inbounds float* %tmp14621, i64 1
+  %tmp14623 = getelementptr inbounds float* %tmp14622, i64 1
+  %tmp14624 = getelementptr inbounds float* %tmp14623, i64 1
+  %tmp14625 = getelementptr inbounds float* %tmp14624, i64 1
+  %tmp14626 = getelementptr inbounds float* %tmp14625, i64 1
+  %tmp14627 = getelementptr inbounds float* %tmp14626, i64 1
+  %tmp14628 = getelementptr inbounds float* %tmp14627, i64 1
+  %tmp14629 = getelementptr inbounds float* %tmp14628, i64 1
+  %tmp14630 = getelementptr inbounds float* %tmp14629, i64 1
+  %tmp14631 = getelementptr inbounds float* %tmp14630, i64 1
+  %tmp14632 = getelementptr inbounds float* %tmp14631, i64 1
+  %tmp14633 = getelementptr inbounds float* %tmp14632, i64 1
+  %tmp14634 = getelementptr inbounds float* %tmp14633, i64 1
+  %tmp14635 = getelementptr inbounds float* %tmp14634, i64 1
+  %tmp14636 = getelementptr inbounds float* %tmp14635, i64 1
+  %tmp14637 = getelementptr inbounds float* %tmp14636, i64 1
+  %tmp14638 = getelementptr inbounds float* %tmp14637, i64 1
+  %tmp14639 = getelementptr inbounds float* %tmp14638, i64 1
+  %tmp14640 = getelementptr inbounds float* %tmp14639, i64 1
+  %tmp14641 = getelementptr inbounds float* %tmp14640, i64 1
+  %tmp14642 = getelementptr inbounds float* %tmp14641, i64 1
+  %tmp14643 = getelementptr inbounds float* %tmp14642, i64 1
+  %tmp14644 = getelementptr inbounds float* %tmp14643, i64 1
+  %tmp14645 = getelementptr inbounds float* %tmp14644, i64 1
+  %tmp14646 = getelementptr inbounds float* %tmp14645, i64 1
+  %tmp14647 = getelementptr inbounds float* %tmp14646, i64 1
+  %tmp14648 = getelementptr inbounds float* %tmp14647, i64 1
+  %tmp14649 = getelementptr inbounds float* %tmp14648, i64 1
+  %tmp14650 = getelementptr inbounds float* %tmp14649, i64 1
+  %tmp14651 = getelementptr inbounds float* %tmp14650, i64 1
+  %tmp14652 = getelementptr inbounds float* %tmp14651, i64 1
+  %tmp14653 = getelementptr inbounds float* %tmp14652, i64 1
+  %tmp14654 = getelementptr inbounds float* %tmp14653, i64 1
+  %tmp14655 = getelementptr inbounds float* %tmp14654, i64 1
+  %tmp14656 = getelementptr inbounds float* %tmp14655, i64 1
+  %tmp14657 = getelementptr inbounds float* %tmp14656, i64 1
+  %tmp14658 = getelementptr inbounds float* %tmp14657, i64 1
+  %tmp14659 = getelementptr inbounds float* %tmp14658, i64 1
+  %tmp14660 = getelementptr inbounds float* %tmp14659, i64 1
+  %tmp14661 = getelementptr inbounds float* %tmp14660, i64 1
+  %tmp14662 = getelementptr inbounds float* %tmp14661, i64 1
+  %tmp14663 = getelementptr inbounds float* %tmp14662, i64 1
+  %tmp14664 = getelementptr inbounds float* %tmp14663, i64 1
+  %tmp14665 = getelementptr inbounds float* %tmp14664, i64 1
+  %tmp14666 = getelementptr inbounds float* %tmp14665, i64 1
+  %tmp14667 = getelementptr inbounds float* %tmp14666, i64 1
+  %tmp14668 = getelementptr inbounds float* %tmp14667, i64 1
+  %tmp14669 = getelementptr inbounds float* %tmp14668, i64 1
+  %tmp14670 = getelementptr inbounds float* %tmp14669, i64 1
+  %tmp14671 = getelementptr inbounds float* %tmp14670, i64 1
+  %tmp14672 = getelementptr inbounds float* %tmp14671, i64 1
+  %tmp14673 = getelementptr inbounds float* %tmp14672, i64 1
+  %tmp14674 = getelementptr inbounds float* %tmp14673, i64 1
+  %tmp14675 = getelementptr inbounds float* %tmp14674, i64 1
+  %tmp14676 = getelementptr inbounds float* %tmp14675, i64 1
+  %tmp14677 = getelementptr inbounds float* %tmp14676, i64 1
+  %tmp14678 = getelementptr inbounds float* %tmp14677, i64 1
+  %tmp14679 = getelementptr inbounds float* %tmp14678, i64 1
+  %tmp14680 = getelementptr inbounds float* %tmp14679, i64 1
+  %tmp14681 = getelementptr inbounds float* %tmp14680, i64 1
+  %tmp14682 = getelementptr inbounds float* %tmp14681, i64 1
+  %tmp14683 = getelementptr inbounds float* %tmp14682, i64 1
+  %tmp14684 = getelementptr inbounds float* %tmp14683, i64 1
+  %tmp14685 = getelementptr inbounds float* %tmp14684, i64 1
+  %tmp14686 = getelementptr inbounds float* %tmp14685, i64 1
+  %tmp14687 = getelementptr inbounds float* %tmp14686, i64 1
+  %tmp14688 = getelementptr inbounds float* %tmp14687, i64 1
+  %tmp14689 = getelementptr inbounds float* %tmp14688, i64 1
+  %tmp14690 = getelementptr inbounds float* %tmp14689, i64 1
+  %tmp14691 = getelementptr inbounds float* %tmp14690, i64 1
+  %tmp14692 = getelementptr inbounds float* %tmp14691, i64 1
+  %tmp14693 = getelementptr inbounds float* %tmp14692, i64 1
+  %tmp14694 = getelementptr inbounds float* %tmp14693, i64 1
+  %tmp14695 = getelementptr inbounds float* %tmp14694, i64 1
+  %tmp14696 = getelementptr inbounds float* %tmp14695, i64 1
+  %tmp14697 = getelementptr inbounds float* %tmp14696, i64 1
+  %tmp14698 = getelementptr inbounds float* %tmp14697, i64 1
+  %tmp14699 = getelementptr inbounds float* %tmp14698, i64 1
+  %tmp14700 = getelementptr inbounds float* %tmp14699, i64 1
+  %tmp14701 = getelementptr inbounds float* %tmp14700, i64 1
+  %tmp14702 = getelementptr inbounds float* %tmp14701, i64 1
+  %tmp14703 = getelementptr inbounds float* %tmp14702, i64 1
+  %tmp14704 = getelementptr inbounds float* %tmp14703, i64 1
+  %tmp14705 = getelementptr inbounds float* %tmp14704, i64 1
+  %tmp14706 = getelementptr inbounds float* %tmp14705, i64 1
+  %tmp14707 = getelementptr inbounds float* %tmp14706, i64 1
+  %tmp14708 = getelementptr inbounds float* %tmp14707, i64 1
+  %tmp14709 = getelementptr inbounds float* %tmp14708, i64 1
+  %tmp14710 = getelementptr inbounds float* %tmp14709, i64 1
+  %tmp14711 = getelementptr inbounds float* %tmp14710, i64 1
+  %tmp14712 = getelementptr inbounds float* %tmp14711, i64 1
+  %tmp14713 = getelementptr inbounds float* %tmp14712, i64 1
+  %tmp14714 = getelementptr inbounds float* %tmp14713, i64 1
+  %tmp14715 = getelementptr inbounds float* %tmp14714, i64 1
+  %tmp14716 = getelementptr inbounds float* %tmp14715, i64 1
+  %tmp14717 = getelementptr inbounds float* %tmp14716, i64 1
+  %tmp14718 = getelementptr inbounds float* %tmp14717, i64 1
+  %tmp14719 = getelementptr inbounds float* %tmp14718, i64 1
+  %tmp14720 = getelementptr inbounds float* %tmp14719, i64 1
+  %tmp14721 = getelementptr inbounds float* %tmp14720, i64 1
+  %tmp14722 = getelementptr inbounds float* %tmp14721, i64 1
+  %tmp14723 = getelementptr inbounds float* %tmp14722, i64 1
+  %tmp14724 = getelementptr inbounds float* %tmp14723, i64 1
+  %tmp14725 = getelementptr inbounds float* %tmp14724, i64 1
+  %tmp14726 = getelementptr inbounds float* %tmp14725, i64 1
+  %tmp14727 = getelementptr inbounds float* %tmp14726, i64 1
+  %tmp14728 = getelementptr inbounds float* %tmp14727, i64 1
+  %tmp14729 = getelementptr inbounds float* %tmp14728, i64 1
+  %tmp14730 = getelementptr inbounds float* %tmp14729, i64 1
+  %tmp14731 = getelementptr inbounds float* %tmp14730, i64 1
+  %tmp14732 = getelementptr inbounds float* %tmp14731, i64 1
+  %tmp14733 = getelementptr inbounds float* %tmp14732, i64 1
+  %tmp14734 = getelementptr inbounds float* %tmp14733, i64 1
+  %tmp14735 = getelementptr inbounds float* %tmp14734, i64 1
+  %tmp14736 = getelementptr inbounds float* %tmp14735, i64 1
+  %tmp14737 = getelementptr inbounds float* %tmp14736, i64 1
+  %tmp14738 = getelementptr inbounds float* %tmp14737, i64 1
+  %tmp14739 = getelementptr inbounds float* %tmp14738, i64 1
+  %tmp14740 = getelementptr inbounds float* %tmp14739, i64 1
+  %tmp14741 = getelementptr inbounds float* %tmp14740, i64 1
+  %tmp14742 = getelementptr inbounds float* %tmp14741, i64 1
+  %tmp14743 = getelementptr inbounds float* %tmp14742, i64 1
+  %tmp14744 = getelementptr inbounds float* %tmp14743, i64 1
+  %tmp14745 = getelementptr inbounds float* %tmp14744, i64 1
+  %tmp14746 = getelementptr inbounds float* %tmp14745, i64 1
+  %tmp14747 = getelementptr inbounds float* %tmp14746, i64 1
+  %tmp14748 = getelementptr inbounds float* %tmp14747, i64 1
+  %tmp14749 = getelementptr inbounds float* %tmp14748, i64 1
+  %tmp14750 = getelementptr inbounds float* %tmp14749, i64 1
+  %tmp14751 = getelementptr inbounds float* %tmp14750, i64 1
+  %tmp14752 = getelementptr inbounds float* %tmp14751, i64 1
+  %tmp14753 = getelementptr inbounds float* %tmp14752, i64 1
+  %tmp14754 = getelementptr inbounds float* %tmp14753, i64 1
+  %tmp14755 = getelementptr inbounds float* %tmp14754, i64 1
+  %tmp14756 = getelementptr inbounds float* %tmp14755, i64 1
+  %tmp14757 = getelementptr inbounds float* %tmp14756, i64 1
+  %tmp14758 = getelementptr inbounds float* %tmp14757, i64 1
+  %tmp14759 = getelementptr inbounds float* %tmp14758, i64 1
+  %tmp14760 = getelementptr inbounds float* %tmp14759, i64 1
+  %tmp14761 = getelementptr inbounds float* %tmp14760, i64 1
+  %tmp14762 = getelementptr inbounds float* %tmp14761, i64 1
+  %tmp14763 = getelementptr inbounds float* %tmp14762, i64 1
+  %tmp14764 = getelementptr inbounds float* %tmp14763, i64 1
+  %tmp14765 = getelementptr inbounds float* %tmp14764, i64 1
+  %tmp14766 = getelementptr inbounds float* %tmp14765, i64 1
+  %tmp14767 = getelementptr inbounds float* %tmp14766, i64 1
+  %tmp14768 = getelementptr inbounds float* %tmp14767, i64 1
+  %tmp14769 = getelementptr inbounds float* %tmp14768, i64 1
+  %tmp14770 = getelementptr inbounds float* %tmp14769, i64 1
+  %tmp14771 = getelementptr inbounds float* %tmp14770, i64 1
+  %tmp14772 = getelementptr inbounds float* %tmp14771, i64 1
+  %tmp14773 = getelementptr inbounds float* %tmp14772, i64 1
+  %tmp14774 = getelementptr inbounds float* %tmp14773, i64 1
+  %tmp14775 = getelementptr inbounds float* %tmp14774, i64 1
+  %tmp14776 = getelementptr inbounds float* %tmp14775, i64 1
+  %tmp14777 = getelementptr inbounds float* %tmp14776, i64 1
+  %tmp14778 = getelementptr inbounds float* %tmp14777, i64 1
+  %tmp14779 = getelementptr inbounds float* %tmp14778, i64 1
+  %tmp14780 = getelementptr inbounds float* %tmp14779, i64 1
+  %tmp14781 = getelementptr inbounds float* %tmp14780, i64 1
+  %tmp14782 = getelementptr inbounds float* %tmp14781, i64 1
+  %tmp14783 = getelementptr inbounds float* %tmp14782, i64 1
+  %tmp14784 = getelementptr inbounds float* %tmp14783, i64 1
+  %tmp14785 = getelementptr inbounds float* %tmp14784, i64 1
+  %tmp14786 = getelementptr inbounds float* %tmp14785, i64 1
+  %tmp14787 = getelementptr inbounds float* %tmp14786, i64 1
+  %tmp14788 = getelementptr inbounds float* %tmp14787, i64 1
+  %tmp14789 = getelementptr inbounds float* %tmp14788, i64 1
+  %tmp14790 = getelementptr inbounds float* %tmp14789, i64 1
+  %tmp14791 = getelementptr inbounds float* %tmp14790, i64 1
+  %tmp14792 = getelementptr inbounds float* %tmp14791, i64 1
+  %tmp14793 = getelementptr inbounds float* %tmp14792, i64 1
+  %tmp14794 = getelementptr inbounds float* %tmp14793, i64 1
+  %tmp14795 = getelementptr inbounds float* %tmp14794, i64 1
+  %tmp14796 = getelementptr inbounds float* %tmp14795, i64 1
+  %tmp14797 = getelementptr inbounds float* %tmp14796, i64 1
+  %tmp14798 = getelementptr inbounds float* %tmp14797, i64 1
+  %tmp14799 = getelementptr inbounds float* %tmp14798, i64 1
+  %tmp14800 = getelementptr inbounds float* %tmp14799, i64 1
+  %tmp14801 = getelementptr inbounds float* %tmp14800, i64 1
+  %tmp14802 = getelementptr inbounds float* %tmp14801, i64 1
+  %tmp14803 = getelementptr inbounds float* %tmp14802, i64 1
+  %tmp14804 = getelementptr inbounds float* %tmp14803, i64 1
+  %tmp14805 = getelementptr inbounds float* %tmp14804, i64 1
+  %tmp14806 = getelementptr inbounds float* %tmp14805, i64 1
+  %tmp14807 = getelementptr inbounds float* %tmp14806, i64 1
+  %tmp14808 = getelementptr inbounds float* %tmp14807, i64 1
+  %tmp14809 = getelementptr inbounds float* %tmp14808, i64 1
+  %tmp14810 = getelementptr inbounds float* %tmp14809, i64 1
+  %tmp14811 = getelementptr inbounds float* %tmp14810, i64 1
+  %tmp14812 = getelementptr inbounds float* %tmp14811, i64 1
+  %tmp14813 = getelementptr inbounds float* %tmp14812, i64 1
+  %tmp14814 = getelementptr inbounds float* %tmp14813, i64 1
+  %tmp14815 = getelementptr inbounds float* %tmp14814, i64 1
+  %tmp14816 = getelementptr inbounds float* %tmp14815, i64 1
+  %tmp14817 = getelementptr inbounds float* %tmp14816, i64 1
+  %tmp14818 = getelementptr inbounds float* %tmp14817, i64 1
+  %tmp14819 = getelementptr inbounds float* %tmp14818, i64 1
+  %tmp14820 = getelementptr inbounds float* %tmp14819, i64 1
+  %tmp14821 = getelementptr inbounds float* %tmp14820, i64 1
+  %tmp14822 = getelementptr inbounds float* %tmp14821, i64 1
+  %tmp14823 = getelementptr inbounds float* %tmp14822, i64 1
+  %tmp14824 = getelementptr inbounds float* %tmp14823, i64 1
+  %tmp14825 = getelementptr inbounds float* %tmp14824, i64 1
+  %tmp14826 = getelementptr inbounds float* %tmp14825, i64 1
+  %tmp14827 = getelementptr inbounds float* %tmp14826, i64 1
+  %tmp14828 = getelementptr inbounds float* %tmp14827, i64 1
+  %tmp14829 = getelementptr inbounds float* %tmp14828, i64 1
+  %tmp14830 = getelementptr inbounds float* %tmp14829, i64 1
+  %tmp14831 = getelementptr inbounds float* %tmp14830, i64 1
+  %tmp14832 = getelementptr inbounds float* %tmp14831, i64 1
+  %tmp14833 = getelementptr inbounds float* %tmp14832, i64 1
+  %tmp14834 = getelementptr inbounds float* %tmp14833, i64 1
+  %tmp14835 = getelementptr inbounds float* %tmp14834, i64 1
+  %tmp14836 = getelementptr inbounds float* %tmp14835, i64 1
+  %tmp14837 = getelementptr inbounds float* %tmp14836, i64 1
+  %tmp14838 = getelementptr inbounds float* %tmp14837, i64 1
+  %tmp14839 = getelementptr inbounds float* %tmp14838, i64 1
+  %tmp14840 = getelementptr inbounds float* %tmp14839, i64 1
+  %tmp14841 = getelementptr inbounds float* %tmp14840, i64 1
+  %tmp14842 = getelementptr inbounds float* %tmp14841, i64 1
+  %tmp14843 = getelementptr inbounds float* %tmp14842, i64 1
+  %tmp14844 = getelementptr inbounds float* %tmp14843, i64 1
+  %tmp14845 = getelementptr inbounds float* %tmp14844, i64 1
+  %tmp14846 = getelementptr inbounds float* %tmp14845, i64 1
+  %tmp14847 = getelementptr inbounds float* %tmp14846, i64 1
+  %tmp14848 = getelementptr inbounds float* %tmp14847, i64 1
+  %tmp14849 = getelementptr inbounds float* %tmp14848, i64 1
+  %tmp14850 = getelementptr inbounds float* %tmp14849, i64 1
+  %tmp14851 = getelementptr inbounds float* %tmp14850, i64 1
+  %tmp14852 = getelementptr inbounds float* %tmp14851, i64 1
+  %tmp14853 = getelementptr inbounds float* %tmp14852, i64 1
+  %tmp14854 = getelementptr inbounds float* %tmp14853, i64 1
+  %tmp14855 = getelementptr inbounds float* %tmp14854, i64 1
+  %tmp14856 = getelementptr inbounds float* %tmp14855, i64 1
+  %tmp14857 = getelementptr inbounds float* %tmp14856, i64 1
+  %tmp14858 = getelementptr inbounds float* %tmp14857, i64 1
+  %tmp14859 = getelementptr inbounds float* %tmp14858, i64 1
+  %tmp14860 = getelementptr inbounds float* %tmp14859, i64 1
+  %tmp14861 = getelementptr inbounds float* %tmp14860, i64 1
+  %tmp14862 = getelementptr inbounds float* %tmp14861, i64 1
+  %tmp14863 = getelementptr inbounds float* %tmp14862, i64 1
+  %tmp14864 = getelementptr inbounds float* %tmp14863, i64 1
+  %tmp14865 = getelementptr inbounds float* %tmp14864, i64 1
+  %tmp14866 = getelementptr inbounds float* %tmp14865, i64 1
+  %tmp14867 = getelementptr inbounds float* %tmp14866, i64 1
+  %tmp14868 = getelementptr inbounds float* %tmp14867, i64 1
+  %tmp14869 = getelementptr inbounds float* %tmp14868, i64 1
+  %tmp14870 = getelementptr inbounds float* %tmp14869, i64 1
+  %tmp14871 = getelementptr inbounds float* %tmp14870, i64 1
+  %tmp14872 = getelementptr inbounds float* %tmp14871, i64 1
+  %tmp14873 = getelementptr inbounds float* %tmp14872, i64 1
+  %tmp14874 = getelementptr inbounds float* %tmp14873, i64 1
+  %tmp14875 = getelementptr inbounds float* %tmp14874, i64 1
+  %tmp14876 = getelementptr inbounds float* %tmp14875, i64 1
+  %tmp14877 = getelementptr inbounds float* %tmp14876, i64 1
+  %tmp14878 = getelementptr inbounds float* %tmp14877, i64 1
+  %tmp14879 = getelementptr inbounds float* %tmp14878, i64 1
+  %tmp14880 = getelementptr inbounds float* %tmp14879, i64 1
+  %tmp14881 = getelementptr inbounds float* %tmp14880, i64 1
+  %tmp14882 = getelementptr inbounds float* %tmp14881, i64 1
+  %tmp14883 = getelementptr inbounds float* %tmp14882, i64 1
+  %tmp14884 = getelementptr inbounds float* %tmp14883, i64 1
+  %tmp14885 = getelementptr inbounds float* %tmp14884, i64 1
+  %tmp14886 = getelementptr inbounds float* %tmp14885, i64 1
+  %tmp14887 = getelementptr inbounds float* %tmp14886, i64 1
+  %tmp14888 = getelementptr inbounds float* %tmp14887, i64 1
+  %tmp14889 = getelementptr inbounds float* %tmp14888, i64 1
+  %tmp14890 = getelementptr inbounds float* %tmp14889, i64 1
+  %tmp14891 = getelementptr inbounds float* %tmp14890, i64 1
+  %tmp14892 = getelementptr inbounds float* %tmp14891, i64 1
+  %tmp14893 = getelementptr inbounds float* %tmp14892, i64 1
+  %tmp14894 = getelementptr inbounds float* %tmp14893, i64 1
+  %tmp14895 = getelementptr inbounds float* %tmp14894, i64 1
+  %tmp14896 = getelementptr inbounds float* %tmp14895, i64 1
+  %tmp14897 = getelementptr inbounds float* %tmp14896, i64 1
+  %tmp14898 = getelementptr inbounds float* %tmp14897, i64 1
+  %tmp14899 = getelementptr inbounds float* %tmp14898, i64 1
+  %tmp14900 = getelementptr inbounds float* %tmp14899, i64 1
+  %tmp14901 = getelementptr inbounds float* %tmp14900, i64 1
+  %tmp14902 = getelementptr inbounds float* %tmp14901, i64 1
+  %tmp14903 = getelementptr inbounds float* %tmp14902, i64 1
+  %tmp14904 = getelementptr inbounds float* %tmp14903, i64 1
+  %tmp14905 = getelementptr inbounds float* %tmp14904, i64 1
+  %tmp14906 = getelementptr inbounds float* %tmp14905, i64 1
+  %tmp14907 = getelementptr inbounds float* %tmp14906, i64 1
+  %tmp14908 = getelementptr inbounds float* %tmp14907, i64 1
+  %tmp14909 = getelementptr inbounds float* %tmp14908, i64 1
+  %tmp14910 = getelementptr inbounds float* %tmp14909, i64 1
+  %tmp14911 = getelementptr inbounds float* %tmp14910, i64 1
+  %tmp14912 = getelementptr inbounds float* %tmp14911, i64 1
+  %tmp14913 = getelementptr inbounds float* %tmp14912, i64 1
+  %tmp14914 = getelementptr inbounds float* %tmp14913, i64 1
+  %tmp14915 = getelementptr inbounds float* %tmp14914, i64 1
+  %tmp14916 = getelementptr inbounds float* %tmp14915, i64 1
+  %tmp14917 = getelementptr inbounds float* %tmp14916, i64 1
+  %tmp14918 = getelementptr inbounds float* %tmp14917, i64 1
+  %tmp14919 = getelementptr inbounds float* %tmp14918, i64 1
+  %tmp14920 = getelementptr inbounds float* %tmp14919, i64 1
+  %tmp14921 = getelementptr inbounds float* %tmp14920, i64 1
+  %tmp14922 = getelementptr inbounds float* %tmp14921, i64 1
+  %tmp14923 = getelementptr inbounds float* %tmp14922, i64 1
+  %tmp14924 = getelementptr inbounds float* %tmp14923, i64 1
+  %tmp14925 = getelementptr inbounds float* %tmp14924, i64 1
+  %tmp14926 = getelementptr inbounds float* %tmp14925, i64 1
+  %tmp14927 = getelementptr inbounds float* %tmp14926, i64 1
+  %tmp14928 = getelementptr inbounds float* %tmp14927, i64 1
+  %tmp14929 = getelementptr inbounds float* %tmp14928, i64 1
+  %tmp14930 = getelementptr inbounds float* %tmp14929, i64 1
+  %tmp14931 = getelementptr inbounds float* %tmp14930, i64 1
+  %tmp14932 = getelementptr inbounds float* %tmp14931, i64 1
+  %tmp14933 = getelementptr inbounds float* %tmp14932, i64 1
+  %tmp14934 = getelementptr inbounds float* %tmp14933, i64 1
+  %tmp14935 = getelementptr inbounds float* %tmp14934, i64 1
+  %tmp14936 = getelementptr inbounds float* %tmp14935, i64 1
+  %tmp14937 = getelementptr inbounds float* %tmp14936, i64 1
+  %tmp14938 = getelementptr inbounds float* %tmp14937, i64 1
+  %tmp14939 = getelementptr inbounds float* %tmp14938, i64 1
+  %tmp14940 = getelementptr inbounds float* %tmp14939, i64 1
+  %tmp14941 = getelementptr inbounds float* %tmp14940, i64 1
+  %tmp14942 = getelementptr inbounds float* %tmp14941, i64 1
+  %tmp14943 = getelementptr inbounds float* %tmp14942, i64 1
+  %tmp14944 = getelementptr inbounds float* %tmp14943, i64 1
+  %tmp14945 = getelementptr inbounds float* %tmp14944, i64 1
+  %tmp14946 = getelementptr inbounds float* %tmp14945, i64 1
+  %tmp14947 = getelementptr inbounds float* %tmp14946, i64 1
+  %tmp14948 = getelementptr inbounds float* %tmp14947, i64 1
+  %tmp14949 = getelementptr inbounds float* %tmp14948, i64 1
+  %tmp14950 = getelementptr inbounds float* %tmp14949, i64 1
+  %tmp14951 = getelementptr inbounds float* %tmp14950, i64 1
+  %tmp14952 = getelementptr inbounds float* %tmp14951, i64 1
+  %tmp14953 = getelementptr inbounds float* %tmp14952, i64 1
+  %tmp14954 = getelementptr inbounds float* %tmp14953, i64 1
+  %tmp14955 = getelementptr inbounds float* %tmp14954, i64 1
+  %tmp14956 = getelementptr inbounds float* %tmp14955, i64 1
+  %tmp14957 = getelementptr inbounds float* %tmp14956, i64 1
+  %tmp14958 = getelementptr inbounds float* %tmp14957, i64 1
+  %tmp14959 = getelementptr inbounds float* %tmp14958, i64 1
+  %tmp14960 = getelementptr inbounds float* %tmp14959, i64 1
+  %tmp14961 = getelementptr inbounds float* %tmp14960, i64 1
+  %tmp14962 = getelementptr inbounds float* %tmp14961, i64 1
+  %tmp14963 = getelementptr inbounds float* %tmp14962, i64 1
+  %tmp14964 = getelementptr inbounds float* %tmp14963, i64 1
+  %tmp14965 = getelementptr inbounds float* %tmp14964, i64 1
+  %tmp14966 = getelementptr inbounds float* %tmp14965, i64 1
+  %tmp14967 = getelementptr inbounds float* %tmp14966, i64 1
+  %tmp14968 = getelementptr inbounds float* %tmp14967, i64 1
+  %tmp14969 = getelementptr inbounds float* %tmp14968, i64 1
+  %tmp14970 = getelementptr inbounds float* %tmp14969, i64 1
+  %tmp14971 = getelementptr inbounds float* %tmp14970, i64 1
+  %tmp14972 = getelementptr inbounds float* %tmp14971, i64 1
+  %tmp14973 = getelementptr inbounds float* %tmp14972, i64 1
+  %tmp14974 = getelementptr inbounds float* %tmp14973, i64 1
+  %tmp14975 = getelementptr inbounds float* %tmp14974, i64 1
+  %tmp14976 = getelementptr inbounds float* %tmp14975, i64 1
+  %tmp14977 = getelementptr inbounds float* %tmp14976, i64 1
+  %tmp14978 = getelementptr inbounds float* %tmp14977, i64 1
+  %tmp14979 = getelementptr inbounds float* %tmp14978, i64 1
+  %tmp14980 = getelementptr inbounds float* %tmp14979, i64 1
+  %tmp14981 = getelementptr inbounds float* %tmp14980, i64 1
+  %tmp14982 = getelementptr inbounds float* %tmp14981, i64 1
+  %tmp14983 = getelementptr inbounds float* %tmp14982, i64 1
+  %tmp14984 = getelementptr inbounds float* %tmp14983, i64 1
+  %tmp14985 = getelementptr inbounds float* %tmp14984, i64 1
+  %tmp14986 = getelementptr inbounds float* %tmp14985, i64 1
+  %tmp14987 = getelementptr inbounds float* %tmp14986, i64 1
+  %tmp14988 = getelementptr inbounds float* %tmp14987, i64 1
+  %tmp14989 = getelementptr inbounds float* %tmp14988, i64 1
+  %tmp14990 = getelementptr inbounds float* %tmp14989, i64 1
+  %tmp14991 = getelementptr inbounds float* %tmp14990, i64 1
+  %tmp14992 = getelementptr inbounds float* %tmp14991, i64 1
+  %tmp14993 = getelementptr inbounds float* %tmp14992, i64 1
+  %tmp14994 = getelementptr inbounds float* %tmp14993, i64 1
+  %tmp14995 = getelementptr inbounds float* %tmp14994, i64 1
+  %tmp14996 = getelementptr inbounds float* %tmp14995, i64 1
+  %tmp14997 = getelementptr inbounds float* %tmp14996, i64 1
+  %tmp14998 = getelementptr inbounds float* %tmp14997, i64 1
+  %tmp14999 = getelementptr inbounds float* %tmp14998, i64 1
+  %tmp15000 = getelementptr inbounds float* %tmp14999, i64 1
+  %tmp15001 = getelementptr inbounds float* %tmp15000, i64 1
+  %tmp15002 = getelementptr inbounds float* %tmp15001, i64 1
+  %tmp15003 = getelementptr inbounds float* %tmp15002, i64 1
+  %tmp15004 = getelementptr inbounds float* %tmp15003, i64 1
+  %tmp15005 = getelementptr inbounds float* %tmp15004, i64 1
+  %tmp15006 = getelementptr inbounds float* %tmp15005, i64 1
+  %tmp15007 = getelementptr inbounds float* %tmp15006, i64 1
+  %tmp15008 = getelementptr inbounds float* %tmp15007, i64 1
+  %tmp15009 = getelementptr inbounds float* %tmp15008, i64 1
+  %tmp15010 = getelementptr inbounds float* %tmp15009, i64 1
+  %tmp15011 = getelementptr inbounds float* %tmp15010, i64 1
+  %tmp15012 = getelementptr inbounds float* %tmp15011, i64 1
+  %tmp15013 = getelementptr inbounds float* %tmp15012, i64 1
+  %tmp15014 = getelementptr inbounds float* %tmp15013, i64 1
+  %tmp15015 = getelementptr inbounds float* %tmp15014, i64 1
+  %tmp15016 = getelementptr inbounds float* %tmp15015, i64 1
+  %tmp15017 = getelementptr inbounds float* %tmp15016, i64 1
+  %tmp15018 = getelementptr inbounds float* %tmp15017, i64 1
+  %tmp15019 = getelementptr inbounds float* %tmp15018, i64 1
+  %tmp15020 = getelementptr inbounds float* %tmp15019, i64 1
+  %tmp15021 = getelementptr inbounds float* %tmp15020, i64 1
+  %tmp15022 = getelementptr inbounds float* %tmp15021, i64 1
+  %tmp15023 = getelementptr inbounds float* %tmp15022, i64 1
+  %tmp15024 = getelementptr inbounds float* %tmp15023, i64 1
+  %tmp15025 = getelementptr inbounds float* %tmp15024, i64 1
+  %tmp15026 = getelementptr inbounds float* %tmp15025, i64 1
+  %tmp15027 = getelementptr inbounds float* %tmp15026, i64 1
+  %tmp15028 = getelementptr inbounds float* %tmp15027, i64 1
+  %tmp15029 = getelementptr inbounds float* %tmp15028, i64 1
+  %tmp15030 = getelementptr inbounds float* %tmp15029, i64 1
+  %tmp15031 = getelementptr inbounds float* %tmp15030, i64 1
+  %tmp15032 = getelementptr inbounds float* %tmp15031, i64 1
+  %tmp15033 = getelementptr inbounds float* %tmp15032, i64 1
+  %tmp15034 = getelementptr inbounds float* %tmp15033, i64 1
+  %tmp15035 = getelementptr inbounds float* %tmp15034, i64 1
+  %tmp15036 = getelementptr inbounds float* %tmp15035, i64 1
+  %tmp15037 = getelementptr inbounds float* %tmp15036, i64 1
+  %tmp15038 = getelementptr inbounds float* %tmp15037, i64 1
+  %tmp15039 = getelementptr inbounds float* %tmp15038, i64 1
+  %tmp15040 = getelementptr inbounds float* %tmp15039, i64 1
+  %tmp15041 = getelementptr inbounds float* %tmp15040, i64 1
+  %tmp15042 = getelementptr inbounds float* %tmp15041, i64 1
+  %tmp15043 = getelementptr inbounds float* %tmp15042, i64 1
+  %tmp15044 = getelementptr inbounds float* %tmp15043, i64 1
+  %tmp15045 = getelementptr inbounds float* %tmp15044, i64 1
+  %tmp15046 = getelementptr inbounds float* %tmp15045, i64 1
+  %tmp15047 = getelementptr inbounds float* %tmp15046, i64 1
+  %tmp15048 = getelementptr inbounds float* %tmp15047, i64 1
+  %tmp15049 = getelementptr inbounds float* %tmp15048, i64 1
+  %tmp15050 = getelementptr inbounds float* %tmp15049, i64 1
+  %tmp15051 = getelementptr inbounds float* %tmp15050, i64 1
+  %tmp15052 = getelementptr inbounds float* %tmp15051, i64 1
+  %tmp15053 = getelementptr inbounds float* %tmp15052, i64 1
+  %tmp15054 = getelementptr inbounds float* %tmp15053, i64 1
+  %tmp15055 = getelementptr inbounds float* %tmp15054, i64 1
+  %tmp15056 = getelementptr inbounds float* %tmp15055, i64 1
+  %tmp15057 = getelementptr inbounds float* %tmp15056, i64 1
+  %tmp15058 = getelementptr inbounds float* %tmp15057, i64 1
+  %tmp15059 = getelementptr inbounds float* %tmp15058, i64 1
+  %tmp15060 = getelementptr inbounds float* %tmp15059, i64 1
+  %tmp15061 = getelementptr inbounds float* %tmp15060, i64 1
+  %tmp15062 = getelementptr inbounds float* %tmp15061, i64 1
+  %tmp15063 = getelementptr inbounds float* %tmp15062, i64 1
+  %tmp15064 = getelementptr inbounds float* %tmp15063, i64 1
+  %tmp15065 = getelementptr inbounds float* %tmp15064, i64 1
+  %tmp15066 = getelementptr inbounds float* %tmp15065, i64 1
+  %tmp15067 = getelementptr inbounds float* %tmp15066, i64 1
+  %tmp15068 = getelementptr inbounds float* %tmp15067, i64 1
+  %tmp15069 = getelementptr inbounds float* %tmp15068, i64 1
+  %tmp15070 = getelementptr inbounds float* %tmp15069, i64 1
+  %tmp15071 = getelementptr inbounds float* %tmp15070, i64 1
+  %tmp15072 = getelementptr inbounds float* %tmp15071, i64 1
+  %tmp15073 = getelementptr inbounds float* %tmp15072, i64 1
+  %tmp15074 = getelementptr inbounds float* %tmp15073, i64 1
+  %tmp15075 = getelementptr inbounds float* %tmp15074, i64 1
+  %tmp15076 = getelementptr inbounds float* %tmp15075, i64 1
+  %tmp15077 = getelementptr inbounds float* %tmp15076, i64 1
+  %tmp15078 = getelementptr inbounds float* %tmp15077, i64 1
+  %tmp15079 = getelementptr inbounds float* %tmp15078, i64 1
+  %tmp15080 = getelementptr inbounds float* %tmp15079, i64 1
+  %tmp15081 = getelementptr inbounds float* %tmp15080, i64 1
+  %tmp15082 = getelementptr inbounds float* %tmp15081, i64 1
+  %tmp15083 = getelementptr inbounds float* %tmp15082, i64 1
+  %tmp15084 = getelementptr inbounds float* %tmp15083, i64 1
+  %tmp15085 = getelementptr inbounds float* %tmp15084, i64 1
+  %tmp15086 = getelementptr inbounds float* %tmp15085, i64 1
+  %tmp15087 = getelementptr inbounds float* %tmp15086, i64 1
+  %tmp15088 = getelementptr inbounds float* %tmp15087, i64 1
+  %tmp15089 = getelementptr inbounds float* %tmp15088, i64 1
+  %tmp15090 = getelementptr inbounds float* %tmp15089, i64 1
+  %tmp15091 = getelementptr inbounds float* %tmp15090, i64 1
+  %tmp15092 = getelementptr inbounds float* %tmp15091, i64 1
+  %tmp15093 = getelementptr inbounds float* %tmp15092, i64 1
+  %tmp15094 = getelementptr inbounds float* %tmp15093, i64 1
+  %tmp15095 = getelementptr inbounds float* %tmp15094, i64 1
+  %tmp15096 = getelementptr inbounds float* %tmp15095, i64 1
+  %tmp15097 = getelementptr inbounds float* %tmp15096, i64 1
+  %tmp15098 = getelementptr inbounds float* %tmp15097, i64 1
+  %tmp15099 = getelementptr inbounds float* %tmp15098, i64 1
+  %tmp15100 = getelementptr inbounds float* %tmp15099, i64 1
+  %tmp15101 = getelementptr inbounds float* %tmp15100, i64 1
+  %tmp15102 = getelementptr inbounds float* %tmp15101, i64 1
+  %tmp15103 = getelementptr inbounds float* %tmp15102, i64 1
+  %tmp15104 = getelementptr inbounds float* %tmp15103, i64 1
+  %tmp15105 = getelementptr inbounds float* %tmp15104, i64 1
+  %tmp15106 = getelementptr inbounds float* %tmp15105, i64 1
+  %tmp15107 = getelementptr inbounds float* %tmp15106, i64 1
+  %tmp15108 = getelementptr inbounds float* %tmp15107, i64 1
+  %tmp15109 = getelementptr inbounds float* %tmp15108, i64 1
+  %tmp15110 = getelementptr inbounds float* %tmp15109, i64 1
+  %tmp15111 = getelementptr inbounds float* %tmp15110, i64 1
+  %tmp15112 = getelementptr inbounds float* %tmp15111, i64 1
+  %tmp15113 = getelementptr inbounds float* %tmp15112, i64 1
+  %tmp15114 = getelementptr inbounds float* %tmp15113, i64 1
+  %tmp15115 = getelementptr inbounds float* %tmp15114, i64 1
+  %tmp15116 = getelementptr inbounds float* %tmp15115, i64 1
+  %tmp15117 = getelementptr inbounds float* %tmp15116, i64 1
+  %tmp15118 = getelementptr inbounds float* %tmp15117, i64 1
+  %tmp15119 = getelementptr inbounds float* %tmp15118, i64 1
+  %tmp15120 = getelementptr inbounds float* %tmp15119, i64 1
+  %tmp15121 = getelementptr inbounds float* %tmp15120, i64 1
+  %tmp15122 = getelementptr inbounds float* %tmp15121, i64 1
+  %tmp15123 = getelementptr inbounds float* %tmp15122, i64 1
+  %tmp15124 = getelementptr inbounds float* %tmp15123, i64 1
+  %tmp15125 = getelementptr inbounds float* %tmp15124, i64 1
+  %tmp15126 = getelementptr inbounds float* %tmp15125, i64 1
+  %tmp15127 = getelementptr inbounds float* %tmp15126, i64 1
+  %tmp15128 = getelementptr inbounds float* %tmp15127, i64 1
+  %tmp15129 = getelementptr inbounds float* %tmp15128, i64 1
+  %tmp15130 = getelementptr inbounds float* %tmp15129, i64 1
+  %tmp15131 = getelementptr inbounds float* %tmp15130, i64 1
+  %tmp15132 = getelementptr inbounds float* %tmp15131, i64 1
+  %tmp15133 = getelementptr inbounds float* %tmp15132, i64 1
+  %tmp15134 = getelementptr inbounds float* %tmp15133, i64 1
+  %tmp15135 = getelementptr inbounds float* %tmp15134, i64 1
+  %tmp15136 = getelementptr inbounds float* %tmp15135, i64 1
+  %tmp15137 = getelementptr inbounds float* %tmp15136, i64 1
+  %tmp15138 = getelementptr inbounds float* %tmp15137, i64 1
+  %tmp15139 = getelementptr inbounds float* %tmp15138, i64 1
+  %tmp15140 = getelementptr inbounds float* %tmp15139, i64 1
+  %tmp15141 = getelementptr inbounds float* %tmp15140, i64 1
+  %tmp15142 = getelementptr inbounds float* %tmp15141, i64 1
+  %tmp15143 = getelementptr inbounds float* %tmp15142, i64 1
+  %tmp15144 = getelementptr inbounds float* %tmp15143, i64 1
+  %tmp15145 = getelementptr inbounds float* %tmp15144, i64 1
+  %tmp15146 = getelementptr inbounds float* %tmp15145, i64 1
+  %tmp15147 = getelementptr inbounds float* %tmp15146, i64 1
+  %tmp15148 = getelementptr inbounds float* %tmp15147, i64 1
+  %tmp15149 = getelementptr inbounds float* %tmp15148, i64 1
+  %tmp15150 = getelementptr inbounds float* %tmp15149, i64 1
+  %tmp15151 = getelementptr inbounds float* %tmp15150, i64 1
+  %tmp15152 = getelementptr inbounds float* %tmp15151, i64 1
+  %tmp15153 = getelementptr inbounds float* %tmp15152, i64 1
+  %tmp15154 = getelementptr inbounds float* %tmp15153, i64 1
+  %tmp15155 = getelementptr inbounds float* %tmp15154, i64 1
+  %tmp15156 = getelementptr inbounds float* %tmp15155, i64 1
+  %tmp15157 = getelementptr inbounds float* %tmp15156, i64 1
+  %tmp15158 = getelementptr inbounds float* %tmp15157, i64 1
+  %tmp15159 = getelementptr inbounds float* %tmp15158, i64 1
+  %tmp15160 = getelementptr inbounds float* %tmp15159, i64 1
+  %tmp15161 = getelementptr inbounds float* %tmp15160, i64 1
+  %tmp15162 = getelementptr inbounds float* %tmp15161, i64 1
+  %tmp15163 = getelementptr inbounds float* %tmp15162, i64 1
+  %tmp15164 = getelementptr inbounds float* %tmp15163, i64 1
+  %tmp15165 = getelementptr inbounds float* %tmp15164, i64 1
+  %tmp15166 = getelementptr inbounds float* %tmp15165, i64 1
+  %tmp15167 = getelementptr inbounds float* %tmp15166, i64 1
+  %tmp15168 = getelementptr inbounds float* %tmp15167, i64 1
+  %tmp15169 = getelementptr inbounds float* %tmp15168, i64 1
+  %tmp15170 = getelementptr inbounds float* %tmp15169, i64 1
+  %tmp15171 = getelementptr inbounds float* %tmp15170, i64 1
+  %tmp15172 = getelementptr inbounds float* %tmp15171, i64 1
+  %tmp15173 = getelementptr inbounds float* %tmp15172, i64 1
+  %tmp15174 = getelementptr inbounds float* %tmp15173, i64 1
+  %tmp15175 = getelementptr inbounds float* %tmp15174, i64 1
+  %tmp15176 = getelementptr inbounds float* %tmp15175, i64 1
+  %tmp15177 = getelementptr inbounds float* %tmp15176, i64 1
+  %tmp15178 = getelementptr inbounds float* %tmp15177, i64 1
+  %tmp15179 = getelementptr inbounds float* %tmp15178, i64 1
+  %tmp15180 = getelementptr inbounds float* %tmp15179, i64 1
+  %tmp15181 = getelementptr inbounds float* %tmp15180, i64 1
+  %tmp15182 = getelementptr inbounds float* %tmp15181, i64 1
+  %tmp15183 = getelementptr inbounds float* %tmp15182, i64 1
+  %tmp15184 = getelementptr inbounds float* %tmp15183, i64 1
+  %tmp15185 = getelementptr inbounds float* %tmp15184, i64 1
+  %tmp15186 = getelementptr inbounds float* %tmp15185, i64 1
+  %tmp15187 = getelementptr inbounds float* %tmp15186, i64 1
+  %tmp15188 = getelementptr inbounds float* %tmp15187, i64 1
+  %tmp15189 = getelementptr inbounds float* %tmp15188, i64 1
+  %tmp15190 = getelementptr inbounds float* %tmp15189, i64 1
+  %tmp15191 = getelementptr inbounds float* %tmp15190, i64 1
+  %tmp15192 = getelementptr inbounds float* %tmp15191, i64 1
+  %tmp15193 = getelementptr inbounds float* %tmp15192, i64 1
+  %tmp15194 = getelementptr inbounds float* %tmp15193, i64 1
+  %tmp15195 = getelementptr inbounds float* %tmp15194, i64 1
+  %tmp15196 = getelementptr inbounds float* %tmp15195, i64 1
+  %tmp15197 = getelementptr inbounds float* %tmp15196, i64 1
+  %tmp15198 = getelementptr inbounds float* %tmp15197, i64 1
+  %tmp15199 = getelementptr inbounds float* %tmp15198, i64 1
+  %tmp15200 = getelementptr inbounds float* %tmp15199, i64 1
+  %tmp15201 = getelementptr inbounds float* %tmp15200, i64 1
+  %tmp15202 = getelementptr inbounds float* %tmp15201, i64 1
+  %tmp15203 = getelementptr inbounds float* %tmp15202, i64 1
+  %tmp15204 = getelementptr inbounds float* %tmp15203, i64 1
+  %tmp15205 = getelementptr inbounds float* %tmp15204, i64 1
+  %tmp15206 = getelementptr inbounds float* %tmp15205, i64 1
+  %tmp15207 = getelementptr inbounds float* %tmp15206, i64 1
+  %tmp15208 = getelementptr inbounds float* %tmp15207, i64 1
+  %tmp15209 = getelementptr inbounds float* %tmp15208, i64 1
+  %tmp15210 = getelementptr inbounds float* %tmp15209, i64 1
+  %tmp15211 = getelementptr inbounds float* %tmp15210, i64 1
+  %tmp15212 = getelementptr inbounds float* %tmp15211, i64 1
+  %tmp15213 = getelementptr inbounds float* %tmp15212, i64 1
+  %tmp15214 = getelementptr inbounds float* %tmp15213, i64 1
+  %tmp15215 = getelementptr inbounds float* %tmp15214, i64 1
+  %tmp15216 = getelementptr inbounds float* %tmp15215, i64 1
+  %tmp15217 = getelementptr inbounds float* %tmp15216, i64 1
+  %tmp15218 = getelementptr inbounds float* %tmp15217, i64 1
+  %tmp15219 = getelementptr inbounds float* %tmp15218, i64 1
+  %tmp15220 = getelementptr inbounds float* %tmp15219, i64 1
+  %tmp15221 = getelementptr inbounds float* %tmp15220, i64 1
+  %tmp15222 = getelementptr inbounds float* %tmp15221, i64 1
+  %tmp15223 = getelementptr inbounds float* %tmp15222, i64 1
+  %tmp15224 = getelementptr inbounds float* %tmp15223, i64 1
+  %tmp15225 = getelementptr inbounds float* %tmp15224, i64 1
+  %tmp15226 = getelementptr inbounds float* %tmp15225, i64 1
+  %tmp15227 = getelementptr inbounds float* %tmp15226, i64 1
+  %tmp15228 = getelementptr inbounds float* %tmp15227, i64 1
+  %tmp15229 = getelementptr inbounds float* %tmp15228, i64 1
+  %tmp15230 = getelementptr inbounds float* %tmp15229, i64 1
+  %tmp15231 = getelementptr inbounds float* %tmp15230, i64 1
+  %tmp15232 = getelementptr inbounds float* %tmp15231, i64 1
+  %tmp15233 = getelementptr inbounds float* %tmp15232, i64 1
+  %tmp15234 = getelementptr inbounds float* %tmp15233, i64 1
+  %tmp15235 = getelementptr inbounds float* %tmp15234, i64 1
+  %tmp15236 = getelementptr inbounds float* %tmp15235, i64 1
+  %tmp15237 = getelementptr inbounds float* %tmp15236, i64 1
+  %tmp15238 = getelementptr inbounds float* %tmp15237, i64 1
+  %tmp15239 = getelementptr inbounds float* %tmp15238, i64 1
+  %tmp15240 = getelementptr inbounds float* %tmp15239, i64 1
+  %tmp15241 = getelementptr inbounds float* %tmp15240, i64 1
+  %tmp15242 = getelementptr inbounds float* %tmp15241, i64 1
+  %tmp15243 = getelementptr inbounds float* %tmp15242, i64 1
+  %tmp15244 = getelementptr inbounds float* %tmp15243, i64 1
+  %tmp15245 = getelementptr inbounds float* %tmp15244, i64 1
+  %tmp15246 = getelementptr inbounds float* %tmp15245, i64 1
+  %tmp15247 = getelementptr inbounds float* %tmp15246, i64 1
+  %tmp15248 = getelementptr inbounds float* %tmp15247, i64 1
+  %tmp15249 = getelementptr inbounds float* %tmp15248, i64 1
+  %tmp15250 = getelementptr inbounds float* %tmp15249, i64 1
+  %tmp15251 = getelementptr inbounds float* %tmp15250, i64 1
+  %tmp15252 = getelementptr inbounds float* %tmp15251, i64 1
+  %tmp15253 = getelementptr inbounds float* %tmp15252, i64 1
+  %tmp15254 = getelementptr inbounds float* %tmp15253, i64 1
+  %tmp15255 = getelementptr inbounds float* %tmp15254, i64 1
+  %tmp15256 = getelementptr inbounds float* %tmp15255, i64 1
+  %tmp15257 = getelementptr inbounds float* %tmp15256, i64 1
+  %tmp15258 = getelementptr inbounds float* %tmp15257, i64 1
+  %tmp15259 = getelementptr inbounds float* %tmp15258, i64 1
+  %tmp15260 = getelementptr inbounds float* %tmp15259, i64 1
+  %tmp15261 = getelementptr inbounds float* %tmp15260, i64 1
+  %tmp15262 = getelementptr inbounds float* %tmp15261, i64 1
+  %tmp15263 = getelementptr inbounds float* %tmp15262, i64 1
+  %tmp15264 = getelementptr inbounds float* %tmp15263, i64 1
+  %tmp15265 = getelementptr inbounds float* %tmp15264, i64 1
+  %tmp15266 = getelementptr inbounds float* %tmp15265, i64 1
+  %tmp15267 = getelementptr inbounds float* %tmp15266, i64 1
+  %tmp15268 = getelementptr inbounds float* %tmp15267, i64 1
+  %tmp15269 = getelementptr inbounds float* %tmp15268, i64 1
+  %tmp15270 = getelementptr inbounds float* %tmp15269, i64 1
+  %tmp15271 = getelementptr inbounds float* %tmp15270, i64 1
+  %tmp15272 = getelementptr inbounds float* %tmp15271, i64 1
+  %tmp15273 = getelementptr inbounds float* %tmp15272, i64 1
+  %tmp15274 = getelementptr inbounds float* %tmp15273, i64 1
+  %tmp15275 = getelementptr inbounds float* %tmp15274, i64 1
+  %tmp15276 = getelementptr inbounds float* %tmp15275, i64 1
+  %tmp15277 = getelementptr inbounds float* %tmp15276, i64 1
+  %tmp15278 = getelementptr inbounds float* %tmp15277, i64 1
+  %tmp15279 = getelementptr inbounds float* %tmp15278, i64 1
+  %tmp15280 = getelementptr inbounds float* %tmp15279, i64 1
+  %tmp15281 = getelementptr inbounds float* %tmp15280, i64 1
+  %tmp15282 = getelementptr inbounds float* %tmp15281, i64 1
+  %tmp15283 = getelementptr inbounds float* %tmp15282, i64 1
+  %tmp15284 = getelementptr inbounds float* %tmp15283, i64 1
+  %tmp15285 = getelementptr inbounds float* %tmp15284, i64 1
+  %tmp15286 = getelementptr inbounds float* %tmp15285, i64 1
+  %tmp15287 = getelementptr inbounds float* %tmp15286, i64 1
+  %tmp15288 = getelementptr inbounds float* %tmp15287, i64 1
+  %tmp15289 = getelementptr inbounds float* %tmp15288, i64 1
+  %tmp15290 = getelementptr inbounds float* %tmp15289, i64 1
+  %tmp15291 = getelementptr inbounds float* %tmp15290, i64 1
+  %tmp15292 = getelementptr inbounds float* %tmp15291, i64 1
+  %tmp15293 = getelementptr inbounds float* %tmp15292, i64 1
+  %tmp15294 = getelementptr inbounds float* %tmp15293, i64 1
+  %tmp15295 = getelementptr inbounds float* %tmp15294, i64 1
+  %tmp15296 = getelementptr inbounds float* %tmp15295, i64 1
+  %tmp15297 = getelementptr inbounds float* %tmp15296, i64 1
+  %tmp15298 = getelementptr inbounds float* %tmp15297, i64 1
+  %tmp15299 = getelementptr inbounds float* %tmp15298, i64 1
+  %tmp15300 = getelementptr inbounds float* %tmp15299, i64 1
+  %tmp15301 = getelementptr inbounds float* %tmp15300, i64 1
+  %tmp15302 = getelementptr inbounds float* %tmp15301, i64 1
+  %tmp15303 = getelementptr inbounds float* %tmp15302, i64 1
+  %tmp15304 = getelementptr inbounds float* %tmp15303, i64 1
+  %tmp15305 = getelementptr inbounds float* %tmp15304, i64 1
+  %tmp15306 = getelementptr inbounds float* %tmp15305, i64 1
+  %tmp15307 = getelementptr inbounds float* %tmp15306, i64 1
+  %tmp15308 = getelementptr inbounds float* %tmp15307, i64 1
+  %tmp15309 = getelementptr inbounds float* %tmp15308, i64 1
+  %tmp15310 = getelementptr inbounds float* %tmp15309, i64 1
+  %tmp15311 = getelementptr inbounds float* %tmp15310, i64 1
+  %tmp15312 = getelementptr inbounds float* %tmp15311, i64 1
+  %tmp15313 = getelementptr inbounds float* %tmp15312, i64 1
+  %tmp15314 = getelementptr inbounds float* %tmp15313, i64 1
+  %tmp15315 = getelementptr inbounds float* %tmp15314, i64 1
+  %tmp15316 = getelementptr inbounds float* %tmp15315, i64 1
+  %tmp15317 = getelementptr inbounds float* %tmp15316, i64 1
+  %tmp15318 = getelementptr inbounds float* %tmp15317, i64 1
+  %tmp15319 = getelementptr inbounds float* %tmp15318, i64 1
+  %tmp15320 = getelementptr inbounds float* %tmp15319, i64 1
+  %tmp15321 = getelementptr inbounds float* %tmp15320, i64 1
+  %tmp15322 = getelementptr inbounds float* %tmp15321, i64 1
+  %tmp15323 = getelementptr inbounds float* %tmp15322, i64 1
+  %tmp15324 = getelementptr inbounds float* %tmp15323, i64 1
+  %tmp15325 = getelementptr inbounds float* %tmp15324, i64 1
+  %tmp15326 = getelementptr inbounds float* %tmp15325, i64 1
+  %tmp15327 = getelementptr inbounds float* %tmp15326, i64 1
+  %tmp15328 = getelementptr inbounds float* %tmp15327, i64 1
+  %tmp15329 = getelementptr inbounds float* %tmp15328, i64 1
+  %tmp15330 = getelementptr inbounds float* %tmp15329, i64 1
+  %tmp15331 = getelementptr inbounds float* %tmp15330, i64 1
+  %tmp15332 = getelementptr inbounds float* %tmp15331, i64 1
+  %tmp15333 = getelementptr inbounds float* %tmp15332, i64 1
+  %tmp15334 = getelementptr inbounds float* %tmp15333, i64 1
+  %tmp15335 = getelementptr inbounds float* %tmp15334, i64 1
+  %tmp15336 = getelementptr inbounds float* %tmp15335, i64 1
+  %tmp15337 = getelementptr inbounds float* %tmp15336, i64 1
+  %tmp15338 = getelementptr inbounds float* %tmp15337, i64 1
+  %tmp15339 = getelementptr inbounds float* %tmp15338, i64 1
+  %tmp15340 = getelementptr inbounds float* %tmp15339, i64 1
+  %tmp15341 = getelementptr inbounds float* %tmp15340, i64 1
+  %tmp15342 = getelementptr inbounds float* %tmp15341, i64 1
+  %tmp15343 = getelementptr inbounds float* %tmp15342, i64 1
+  %tmp15344 = getelementptr inbounds float* %tmp15343, i64 1
+  %tmp15345 = getelementptr inbounds float* %tmp15344, i64 1
+  %tmp15346 = getelementptr inbounds float* %tmp15345, i64 1
+  %tmp15347 = getelementptr inbounds float* %tmp15346, i64 1
+  %tmp15348 = getelementptr inbounds float* %tmp15347, i64 1
+  %tmp15349 = getelementptr inbounds float* %tmp15348, i64 1
+  %tmp15350 = getelementptr inbounds float* %tmp15349, i64 1
+  %tmp15351 = getelementptr inbounds float* %tmp15350, i64 1
+  %tmp15352 = getelementptr inbounds float* %tmp15351, i64 1
+  %tmp15353 = getelementptr inbounds float* %tmp15352, i64 1
+  %tmp15354 = getelementptr inbounds float* %tmp15353, i64 1
+  %tmp15355 = getelementptr inbounds float* %tmp15354, i64 1
+  %tmp15356 = getelementptr inbounds float* %tmp15355, i64 1
+  %tmp15357 = getelementptr inbounds float* %tmp15356, i64 1
+  %tmp15358 = getelementptr inbounds float* %tmp15357, i64 1
+  %tmp15359 = getelementptr inbounds float* %tmp15358, i64 1
+  %tmp15360 = getelementptr inbounds float* %tmp15359, i64 1
+  %tmp15361 = getelementptr inbounds float* %tmp15360, i64 1
+  %tmp15362 = getelementptr inbounds float* %tmp15361, i64 1
+  %tmp15363 = getelementptr inbounds float* %tmp15362, i64 1
+  %tmp15364 = getelementptr inbounds float* %tmp15363, i64 1
+  %tmp15365 = getelementptr inbounds float* %tmp15364, i64 1
+  %tmp15366 = getelementptr inbounds float* %tmp15365, i64 1
+  %tmp15367 = getelementptr inbounds float* %tmp15366, i64 1
+  %tmp15368 = getelementptr inbounds float* %tmp15367, i64 1
+  %tmp15369 = getelementptr inbounds float* %tmp15368, i64 1
+  %tmp15370 = getelementptr inbounds float* %tmp15369, i64 1
+  %tmp15371 = getelementptr inbounds float* %tmp15370, i64 1
+  %tmp15372 = getelementptr inbounds float* %tmp15371, i64 1
+  %tmp15373 = getelementptr inbounds float* %tmp15372, i64 1
+  %tmp15374 = getelementptr inbounds float* %tmp15373, i64 1
+  %tmp15375 = getelementptr inbounds float* %tmp15374, i64 1
+  %tmp15376 = getelementptr inbounds float* %tmp15375, i64 1
+  %tmp15377 = getelementptr inbounds float* %tmp15376, i64 1
+  %tmp15378 = getelementptr inbounds float* %tmp15377, i64 1
+  %tmp15379 = getelementptr inbounds float* %tmp15378, i64 1
+  %tmp15380 = getelementptr inbounds float* %tmp15379, i64 1
+  %tmp15381 = getelementptr inbounds float* %tmp15380, i64 1
+  %tmp15382 = getelementptr inbounds float* %tmp15381, i64 1
+  %tmp15383 = getelementptr inbounds float* %tmp15382, i64 1
+  %tmp15384 = getelementptr inbounds float* %tmp15383, i64 1
+  %tmp15385 = getelementptr inbounds float* %tmp15384, i64 1
+  %tmp15386 = getelementptr inbounds float* %tmp15385, i64 1
+  %tmp15387 = getelementptr inbounds float* %tmp15386, i64 1
+  %tmp15388 = getelementptr inbounds float* %tmp15387, i64 1
+  %tmp15389 = getelementptr inbounds float* %tmp15388, i64 1
+  %tmp15390 = getelementptr inbounds float* %tmp15389, i64 1
+  %tmp15391 = getelementptr inbounds float* %tmp15390, i64 1
+  %tmp15392 = getelementptr inbounds float* %tmp15391, i64 1
+  %tmp15393 = getelementptr inbounds float* %tmp15392, i64 1
+  %tmp15394 = getelementptr inbounds float* %tmp15393, i64 1
+  %tmp15395 = getelementptr inbounds float* %tmp15394, i64 1
+  %tmp15396 = getelementptr inbounds float* %tmp15395, i64 1
+  %tmp15397 = getelementptr inbounds float* %tmp15396, i64 1
+  %tmp15398 = getelementptr inbounds float* %tmp15397, i64 1
+  %tmp15399 = getelementptr inbounds float* %tmp15398, i64 1
+  %tmp15400 = getelementptr inbounds float* %tmp15399, i64 1
+  %tmp15401 = getelementptr inbounds float* %tmp15400, i64 1
+  %tmp15402 = getelementptr inbounds float* %tmp15401, i64 1
+  %tmp15403 = getelementptr inbounds float* %tmp15402, i64 1
+  %tmp15404 = getelementptr inbounds float* %tmp15403, i64 1
+  %tmp15405 = getelementptr inbounds float* %tmp15404, i64 1
+  %tmp15406 = getelementptr inbounds float* %tmp15405, i64 1
+  %tmp15407 = getelementptr inbounds float* %tmp15406, i64 1
+  %tmp15408 = getelementptr inbounds float* %tmp15407, i64 1
+  %tmp15409 = getelementptr inbounds float* %tmp15408, i64 1
+  %tmp15410 = getelementptr inbounds float* %tmp15409, i64 1
+  %tmp15411 = getelementptr inbounds float* %tmp15410, i64 1
+  %tmp15412 = getelementptr inbounds float* %tmp15411, i64 1
+  %tmp15413 = getelementptr inbounds float* %tmp15412, i64 1
+  %tmp15414 = getelementptr inbounds float* %tmp15413, i64 1
+  %tmp15415 = getelementptr inbounds float* %tmp15414, i64 1
+  %tmp15416 = getelementptr inbounds float* %tmp15415, i64 1
+  %tmp15417 = getelementptr inbounds float* %tmp15416, i64 1
+  %tmp15418 = getelementptr inbounds float* %tmp15417, i64 1
+  %tmp15419 = getelementptr inbounds float* %tmp15418, i64 1
+  %tmp15420 = getelementptr inbounds float* %tmp15419, i64 1
+  %tmp15421 = getelementptr inbounds float* %tmp15420, i64 1
+  %tmp15422 = getelementptr inbounds float* %tmp15421, i64 1
+  %tmp15423 = getelementptr inbounds float* %tmp15422, i64 1
+  %tmp15424 = getelementptr inbounds float* %tmp15423, i64 1
+  %tmp15425 = getelementptr inbounds float* %tmp15424, i64 1
+  %tmp15426 = getelementptr inbounds float* %tmp15425, i64 1
+  %tmp15427 = getelementptr inbounds float* %tmp15426, i64 1
+  %tmp15428 = getelementptr inbounds float* %tmp15427, i64 1
+  %tmp15429 = getelementptr inbounds float* %tmp15428, i64 1
+  %tmp15430 = getelementptr inbounds float* %tmp15429, i64 1
+  %tmp15431 = getelementptr inbounds float* %tmp15430, i64 1
+  %tmp15432 = getelementptr inbounds float* %tmp15431, i64 1
+  %tmp15433 = getelementptr inbounds float* %tmp15432, i64 1
+  %tmp15434 = getelementptr inbounds float* %tmp15433, i64 1
+  %tmp15435 = getelementptr inbounds float* %tmp15434, i64 1
+  %tmp15436 = getelementptr inbounds float* %tmp15435, i64 1
+  %tmp15437 = getelementptr inbounds float* %tmp15436, i64 1
+  %tmp15438 = getelementptr inbounds float* %tmp15437, i64 1
+  %tmp15439 = getelementptr inbounds float* %tmp15438, i64 1
+  %tmp15440 = getelementptr inbounds float* %tmp15439, i64 1
+  %tmp15441 = getelementptr inbounds float* %tmp15440, i64 1
+  %tmp15442 = getelementptr inbounds float* %tmp15441, i64 1
+  %tmp15443 = getelementptr inbounds float* %tmp15442, i64 1
+  %tmp15444 = getelementptr inbounds float* %tmp15443, i64 1
+  %tmp15445 = getelementptr inbounds float* %tmp15444, i64 1
+  %tmp15446 = getelementptr inbounds float* %tmp15445, i64 1
+  %tmp15447 = getelementptr inbounds float* %tmp15446, i64 1
+  %tmp15448 = getelementptr inbounds float* %tmp15447, i64 1
+  %tmp15449 = getelementptr inbounds float* %tmp15448, i64 1
+  %tmp15450 = getelementptr inbounds float* %tmp15449, i64 1
+  %tmp15451 = getelementptr inbounds float* %tmp15450, i64 1
+  %tmp15452 = getelementptr inbounds float* %tmp15451, i64 1
+  %tmp15453 = getelementptr inbounds float* %tmp15452, i64 1
+  %tmp15454 = getelementptr inbounds float* %tmp15453, i64 1
+  %tmp15455 = getelementptr inbounds float* %tmp15454, i64 1
+  %tmp15456 = getelementptr inbounds float* %tmp15455, i64 1
+  %tmp15457 = getelementptr inbounds float* %tmp15456, i64 1
+  %tmp15458 = getelementptr inbounds float* %tmp15457, i64 1
+  %tmp15459 = getelementptr inbounds float* %tmp15458, i64 1
+  %tmp15460 = getelementptr inbounds float* %tmp15459, i64 1
+  %tmp15461 = getelementptr inbounds float* %tmp15460, i64 1
+  %tmp15462 = getelementptr inbounds float* %tmp15461, i64 1
+  %tmp15463 = getelementptr inbounds float* %tmp15462, i64 1
+  %tmp15464 = getelementptr inbounds float* %tmp15463, i64 1
+  %tmp15465 = getelementptr inbounds float* %tmp15464, i64 1
+  %tmp15466 = getelementptr inbounds float* %tmp15465, i64 1
+  %tmp15467 = getelementptr inbounds float* %tmp15466, i64 1
+  %tmp15468 = getelementptr inbounds float* %tmp15467, i64 1
+  %tmp15469 = getelementptr inbounds float* %tmp15468, i64 1
+  %tmp15470 = getelementptr inbounds float* %tmp15469, i64 1
+  %tmp15471 = getelementptr inbounds float* %tmp15470, i64 1
+  %tmp15472 = getelementptr inbounds float* %tmp15471, i64 1
+  %tmp15473 = getelementptr inbounds float* %tmp15472, i64 1
+  %tmp15474 = getelementptr inbounds float* %tmp15473, i64 1
+  %tmp15475 = getelementptr inbounds float* %tmp15474, i64 1
+  %tmp15476 = getelementptr inbounds float* %tmp15475, i64 1
+  %tmp15477 = getelementptr inbounds float* %tmp15476, i64 1
+  %tmp15478 = getelementptr inbounds float* %tmp15477, i64 1
+  %tmp15479 = getelementptr inbounds float* %tmp15478, i64 1
+  %tmp15480 = getelementptr inbounds float* %tmp15479, i64 1
+  %tmp15481 = getelementptr inbounds float* %tmp15480, i64 1
+  %tmp15482 = getelementptr inbounds float* %tmp15481, i64 1
+  %tmp15483 = getelementptr inbounds float* %tmp15482, i64 1
+  %tmp15484 = getelementptr inbounds float* %tmp15483, i64 1
+  %tmp15485 = getelementptr inbounds float* %tmp15484, i64 1
+  %tmp15486 = getelementptr inbounds float* %tmp15485, i64 1
+  %tmp15487 = getelementptr inbounds float* %tmp15486, i64 1
+  %tmp15488 = getelementptr inbounds float* %tmp15487, i64 1
+  %tmp15489 = getelementptr inbounds float* %tmp15488, i64 1
+  %tmp15490 = getelementptr inbounds float* %tmp15489, i64 1
+  %tmp15491 = getelementptr inbounds float* %tmp15490, i64 1
+  %tmp15492 = getelementptr inbounds float* %tmp15491, i64 1
+  %tmp15493 = getelementptr inbounds float* %tmp15492, i64 1
+  %tmp15494 = getelementptr inbounds float* %tmp15493, i64 1
+  %tmp15495 = getelementptr inbounds float* %tmp15494, i64 1
+  %tmp15496 = getelementptr inbounds float* %tmp15495, i64 1
+  %tmp15497 = getelementptr inbounds float* %tmp15496, i64 1
+  %tmp15498 = getelementptr inbounds float* %tmp15497, i64 1
+  %tmp15499 = getelementptr inbounds float* %tmp15498, i64 1
+  %tmp15500 = getelementptr inbounds float* %tmp15499, i64 1
+  %tmp15501 = getelementptr inbounds float* %tmp15500, i64 1
+  %tmp15502 = getelementptr inbounds float* %tmp15501, i64 1
+  %tmp15503 = getelementptr inbounds float* %tmp15502, i64 1
+  %tmp15504 = getelementptr inbounds float* %tmp15503, i64 1
+  %tmp15505 = getelementptr inbounds float* %tmp15504, i64 1
+  %tmp15506 = getelementptr inbounds float* %tmp15505, i64 1
+  %tmp15507 = getelementptr inbounds float* %tmp15506, i64 1
+  %tmp15508 = getelementptr inbounds float* %tmp15507, i64 1
+  %tmp15509 = getelementptr inbounds float* %tmp15508, i64 1
+  %tmp15510 = getelementptr inbounds float* %tmp15509, i64 1
+  %tmp15511 = getelementptr inbounds float* %tmp15510, i64 1
+  %tmp15512 = getelementptr inbounds float* %tmp15511, i64 1
+  %tmp15513 = getelementptr inbounds float* %tmp15512, i64 1
+  %tmp15514 = getelementptr inbounds float* %tmp15513, i64 1
+  %tmp15515 = getelementptr inbounds float* %tmp15514, i64 1
+  %tmp15516 = getelementptr inbounds float* %tmp15515, i64 1
+  %tmp15517 = getelementptr inbounds float* %tmp15516, i64 1
+  %tmp15518 = getelementptr inbounds float* %tmp15517, i64 1
+  %tmp15519 = getelementptr inbounds float* %tmp15518, i64 1
+  %tmp15520 = getelementptr inbounds float* %tmp15519, i64 1
+  %tmp15521 = getelementptr inbounds float* %tmp15520, i64 1
+  %tmp15522 = getelementptr inbounds float* %tmp15521, i64 1
+  %tmp15523 = getelementptr inbounds float* %tmp15522, i64 1
+  %tmp15524 = getelementptr inbounds float* %tmp15523, i64 1
+  %tmp15525 = getelementptr inbounds float* %tmp15524, i64 1
+  %tmp15526 = getelementptr inbounds float* %tmp15525, i64 1
+  %tmp15527 = getelementptr inbounds float* %tmp15526, i64 1
+  %tmp15528 = getelementptr inbounds float* %tmp15527, i64 1
+  %tmp15529 = getelementptr inbounds float* %tmp15528, i64 1
+  %tmp15530 = getelementptr inbounds float* %tmp15529, i64 1
+  %tmp15531 = getelementptr inbounds float* %tmp15530, i64 1
+  %tmp15532 = getelementptr inbounds float* %tmp15531, i64 1
+  %tmp15533 = getelementptr inbounds float* %tmp15532, i64 1
+  %tmp15534 = getelementptr inbounds float* %tmp15533, i64 1
+  %tmp15535 = getelementptr inbounds float* %tmp15534, i64 1
+  %tmp15536 = getelementptr inbounds float* %tmp15535, i64 1
+  %tmp15537 = getelementptr inbounds float* %tmp15536, i64 1
+  %tmp15538 = getelementptr inbounds float* %tmp15537, i64 1
+  %tmp15539 = getelementptr inbounds float* %tmp15538, i64 1
+  %tmp15540 = getelementptr inbounds float* %tmp15539, i64 1
+  %tmp15541 = getelementptr inbounds float* %tmp15540, i64 1
+  %tmp15542 = getelementptr inbounds float* %tmp15541, i64 1
+  %tmp15543 = getelementptr inbounds float* %tmp15542, i64 1
+  %tmp15544 = getelementptr inbounds float* %tmp15543, i64 1
+  %tmp15545 = getelementptr inbounds float* %tmp15544, i64 1
+  %tmp15546 = getelementptr inbounds float* %tmp15545, i64 1
+  %tmp15547 = getelementptr inbounds float* %tmp15546, i64 1
+  %tmp15548 = getelementptr inbounds float* %tmp15547, i64 1
+  %tmp15549 = getelementptr inbounds float* %tmp15548, i64 1
+  %tmp15550 = getelementptr inbounds float* %tmp15549, i64 1
+  %tmp15551 = getelementptr inbounds float* %tmp15550, i64 1
+  %tmp15552 = getelementptr inbounds float* %tmp15551, i64 1
+  %tmp15553 = getelementptr inbounds float* %tmp15552, i64 1
+  %tmp15554 = getelementptr inbounds float* %tmp15553, i64 1
+  %tmp15555 = getelementptr inbounds float* %tmp15554, i64 1
+  %tmp15556 = getelementptr inbounds float* %tmp15555, i64 1
+  %tmp15557 = getelementptr inbounds float* %tmp15556, i64 1
+  %tmp15558 = getelementptr inbounds float* %tmp15557, i64 1
+  %tmp15559 = getelementptr inbounds float* %tmp15558, i64 1
+  %tmp15560 = getelementptr inbounds float* %tmp15559, i64 1
+  %tmp15561 = getelementptr inbounds float* %tmp15560, i64 1
+  %tmp15562 = getelementptr inbounds float* %tmp15561, i64 1
+  %tmp15563 = getelementptr inbounds float* %tmp15562, i64 1
+  %tmp15564 = getelementptr inbounds float* %tmp15563, i64 1
+  %tmp15565 = getelementptr inbounds float* %tmp15564, i64 1
+  %tmp15566 = getelementptr inbounds float* %tmp15565, i64 1
+  %tmp15567 = getelementptr inbounds float* %tmp15566, i64 1
+  %tmp15568 = getelementptr inbounds float* %tmp15567, i64 1
+  %tmp15569 = getelementptr inbounds float* %tmp15568, i64 1
+  %tmp15570 = getelementptr inbounds float* %tmp15569, i64 1
+  %tmp15571 = getelementptr inbounds float* %tmp15570, i64 1
+  %tmp15572 = getelementptr inbounds float* %tmp15571, i64 1
+  %tmp15573 = getelementptr inbounds float* %tmp15572, i64 1
+  %tmp15574 = getelementptr inbounds float* %tmp15573, i64 1
+  %tmp15575 = getelementptr inbounds float* %tmp15574, i64 1
+  %tmp15576 = getelementptr inbounds float* %tmp15575, i64 1
+  %tmp15577 = getelementptr inbounds float* %tmp15576, i64 1
+  %tmp15578 = getelementptr inbounds float* %tmp15577, i64 1
+  %tmp15579 = getelementptr inbounds float* %tmp15578, i64 1
+  %tmp15580 = getelementptr inbounds float* %tmp15579, i64 1
+  %tmp15581 = getelementptr inbounds float* %tmp15580, i64 1
+  %tmp15582 = getelementptr inbounds float* %tmp15581, i64 1
+  %tmp15583 = getelementptr inbounds float* %tmp15582, i64 1
+  %tmp15584 = getelementptr inbounds float* %tmp15583, i64 1
+  %tmp15585 = getelementptr inbounds float* %tmp15584, i64 1
+  %tmp15586 = getelementptr inbounds float* %tmp15585, i64 1
+  %tmp15587 = getelementptr inbounds float* %tmp15586, i64 1
+  %tmp15588 = getelementptr inbounds float* %tmp15587, i64 1
+  %tmp15589 = getelementptr inbounds float* %tmp15588, i64 1
+  %tmp15590 = getelementptr inbounds float* %tmp15589, i64 1
+  %tmp15591 = getelementptr inbounds float* %tmp15590, i64 1
+  %tmp15592 = getelementptr inbounds float* %tmp15591, i64 1
+  %tmp15593 = getelementptr inbounds float* %tmp15592, i64 1
+  %tmp15594 = getelementptr inbounds float* %tmp15593, i64 1
+  %tmp15595 = getelementptr inbounds float* %tmp15594, i64 1
+  %tmp15596 = getelementptr inbounds float* %tmp15595, i64 1
+  %tmp15597 = getelementptr inbounds float* %tmp15596, i64 1
+  %tmp15598 = getelementptr inbounds float* %tmp15597, i64 1
+  %tmp15599 = getelementptr inbounds float* %tmp15598, i64 1
+  %tmp15600 = getelementptr inbounds float* %tmp15599, i64 1
+  %tmp15601 = getelementptr inbounds float* %tmp15600, i64 1
+  %tmp15602 = getelementptr inbounds float* %tmp15601, i64 1
+  %tmp15603 = getelementptr inbounds float* %tmp15602, i64 1
+  %tmp15604 = getelementptr inbounds float* %tmp15603, i64 1
+  %tmp15605 = getelementptr inbounds float* %tmp15604, i64 1
+  %tmp15606 = getelementptr inbounds float* %tmp15605, i64 1
+  %tmp15607 = getelementptr inbounds float* %tmp15606, i64 1
+  %tmp15608 = getelementptr inbounds float* %tmp15607, i64 1
+  %tmp15609 = getelementptr inbounds float* %tmp15608, i64 1
+  %tmp15610 = getelementptr inbounds float* %tmp15609, i64 1
+  %tmp15611 = getelementptr inbounds float* %tmp15610, i64 1
+  %tmp15612 = getelementptr inbounds float* %tmp15611, i64 1
+  %tmp15613 = getelementptr inbounds float* %tmp15612, i64 1
+  %tmp15614 = getelementptr inbounds float* %tmp15613, i64 1
+  %tmp15615 = getelementptr inbounds float* %tmp15614, i64 1
+  %tmp15616 = getelementptr inbounds float* %tmp15615, i64 1
+  %tmp15617 = getelementptr inbounds float* %tmp15616, i64 1
+  %tmp15618 = getelementptr inbounds float* %tmp15617, i64 1
+  %tmp15619 = getelementptr inbounds float* %tmp15618, i64 1
+  %tmp15620 = getelementptr inbounds float* %tmp15619, i64 1
+  %tmp15621 = getelementptr inbounds float* %tmp15620, i64 1
+  %tmp15622 = getelementptr inbounds float* %tmp15621, i64 1
+  %tmp15623 = getelementptr inbounds float* %tmp15622, i64 1
+  %tmp15624 = getelementptr inbounds float* %tmp15623, i64 1
+  %tmp15625 = getelementptr inbounds float* %tmp15624, i64 1
+  %tmp15626 = getelementptr inbounds float* %tmp15625, i64 1
+  %tmp15627 = getelementptr inbounds float* %tmp15626, i64 1
+  %tmp15628 = getelementptr inbounds float* %tmp15627, i64 1
+  %tmp15629 = getelementptr inbounds float* %tmp15628, i64 1
+  %tmp15630 = getelementptr inbounds float* %tmp15629, i64 1
+  %tmp15631 = getelementptr inbounds float* %tmp15630, i64 1
+  %tmp15632 = getelementptr inbounds float* %tmp15631, i64 1
+  %tmp15633 = getelementptr inbounds float* %tmp15632, i64 1
+  %tmp15634 = getelementptr inbounds float* %tmp15633, i64 1
+  %tmp15635 = getelementptr inbounds float* %tmp15634, i64 1
+  %tmp15636 = getelementptr inbounds float* %tmp15635, i64 1
+  %tmp15637 = getelementptr inbounds float* %tmp15636, i64 1
+  %tmp15638 = getelementptr inbounds float* %tmp15637, i64 1
+  %tmp15639 = getelementptr inbounds float* %tmp15638, i64 1
+  %tmp15640 = getelementptr inbounds float* %tmp15639, i64 1
+  %tmp15641 = getelementptr inbounds float* %tmp15640, i64 1
+  %tmp15642 = getelementptr inbounds float* %tmp15641, i64 1
+  %tmp15643 = getelementptr inbounds float* %tmp15642, i64 1
+  %tmp15644 = getelementptr inbounds float* %tmp15643, i64 1
+  %tmp15645 = getelementptr inbounds float* %tmp15644, i64 1
+  %tmp15646 = getelementptr inbounds float* %tmp15645, i64 1
+  %tmp15647 = getelementptr inbounds float* %tmp15646, i64 1
+  %tmp15648 = getelementptr inbounds float* %tmp15647, i64 1
+  %tmp15649 = getelementptr inbounds float* %tmp15648, i64 1
+  %tmp15650 = getelementptr inbounds float* %tmp15649, i64 1
+  %tmp15651 = getelementptr inbounds float* %tmp15650, i64 1
+  %tmp15652 = getelementptr inbounds float* %tmp15651, i64 1
+  %tmp15653 = getelementptr inbounds float* %tmp15652, i64 1
+  %tmp15654 = getelementptr inbounds float* %tmp15653, i64 1
+  %tmp15655 = getelementptr inbounds float* %tmp15654, i64 1
+  %tmp15656 = getelementptr inbounds float* %tmp15655, i64 1
+  %tmp15657 = getelementptr inbounds float* %tmp15656, i64 1
+  %tmp15658 = getelementptr inbounds float* %tmp15657, i64 1
+  %tmp15659 = getelementptr inbounds float* %tmp15658, i64 1
+  %tmp15660 = getelementptr inbounds float* %tmp15659, i64 1
+  %tmp15661 = getelementptr inbounds float* %tmp15660, i64 1
+  %tmp15662 = getelementptr inbounds float* %tmp15661, i64 1
+  %tmp15663 = getelementptr inbounds float* %tmp15662, i64 1
+  %tmp15664 = getelementptr inbounds float* %tmp15663, i64 1
+  %tmp15665 = getelementptr inbounds float* %tmp15664, i64 1
+  %tmp15666 = getelementptr inbounds float* %tmp15665, i64 1
+  %tmp15667 = getelementptr inbounds float* %tmp15666, i64 1
+  %tmp15668 = getelementptr inbounds float* %tmp15667, i64 1
+  %tmp15669 = getelementptr inbounds float* %tmp15668, i64 1
+  %tmp15670 = getelementptr inbounds float* %tmp15669, i64 1
+  %tmp15671 = getelementptr inbounds float* %tmp15670, i64 1
+  %tmp15672 = getelementptr inbounds float* %tmp15671, i64 1
+  %tmp15673 = getelementptr inbounds float* %tmp15672, i64 1
+  %tmp15674 = getelementptr inbounds float* %tmp15673, i64 1
+  %tmp15675 = getelementptr inbounds float* %tmp15674, i64 1
+  %tmp15676 = getelementptr inbounds float* %tmp15675, i64 1
+  %tmp15677 = getelementptr inbounds float* %tmp15676, i64 1
+  %tmp15678 = getelementptr inbounds float* %tmp15677, i64 1
+  %tmp15679 = getelementptr inbounds float* %tmp15678, i64 1
+  %tmp15680 = getelementptr inbounds float* %tmp15679, i64 1
+  %tmp15681 = getelementptr inbounds float* %tmp15680, i64 1
+  %tmp15682 = getelementptr inbounds float* %tmp15681, i64 1
+  %tmp15683 = getelementptr inbounds float* %tmp15682, i64 1
+  %tmp15684 = getelementptr inbounds float* %tmp15683, i64 1
+  %tmp15685 = getelementptr inbounds float* %tmp15684, i64 1
+  %tmp15686 = getelementptr inbounds float* %tmp15685, i64 1
+  %tmp15687 = getelementptr inbounds float* %tmp15686, i64 1
+  %tmp15688 = getelementptr inbounds float* %tmp15687, i64 1
+  %tmp15689 = getelementptr inbounds float* %tmp15688, i64 1
+  %tmp15690 = getelementptr inbounds float* %tmp15689, i64 1
+  %tmp15691 = getelementptr inbounds float* %tmp15690, i64 1
+  %tmp15692 = getelementptr inbounds float* %tmp15691, i64 1
+  %tmp15693 = getelementptr inbounds float* %tmp15692, i64 1
+  %tmp15694 = getelementptr inbounds float* %tmp15693, i64 1
+  %tmp15695 = getelementptr inbounds float* %tmp15694, i64 1
+  %tmp15696 = getelementptr inbounds float* %tmp15695, i64 1
+  %tmp15697 = getelementptr inbounds float* %tmp15696, i64 1
+  %tmp15698 = getelementptr inbounds float* %tmp15697, i64 1
+  %tmp15699 = getelementptr inbounds float* %tmp15698, i64 1
+  %tmp15700 = getelementptr inbounds float* %tmp15699, i64 1
+  %tmp15701 = getelementptr inbounds float* %tmp15700, i64 1
+  %tmp15702 = getelementptr inbounds float* %tmp15701, i64 1
+  %tmp15703 = getelementptr inbounds float* %tmp15702, i64 1
+  %tmp15704 = getelementptr inbounds float* %tmp15703, i64 1
+  %tmp15705 = getelementptr inbounds float* %tmp15704, i64 1
+  %tmp15706 = getelementptr inbounds float* %tmp15705, i64 1
+  %tmp15707 = getelementptr inbounds float* %tmp15706, i64 1
+  %tmp15708 = getelementptr inbounds float* %tmp15707, i64 1
+  %tmp15709 = getelementptr inbounds float* %tmp15708, i64 1
+  %tmp15710 = getelementptr inbounds float* %tmp15709, i64 1
+  %tmp15711 = getelementptr inbounds float* %tmp15710, i64 1
+  %tmp15712 = getelementptr inbounds float* %tmp15711, i64 1
+  %tmp15713 = getelementptr inbounds float* %tmp15712, i64 1
+  %tmp15714 = getelementptr inbounds float* %tmp15713, i64 1
+  %tmp15715 = getelementptr inbounds float* %tmp15714, i64 1
+  %tmp15716 = getelementptr inbounds float* %tmp15715, i64 1
+  %tmp15717 = getelementptr inbounds float* %tmp15716, i64 1
+  %tmp15718 = getelementptr inbounds float* %tmp15717, i64 1
+  %tmp15719 = getelementptr inbounds float* %tmp15718, i64 1
+  %tmp15720 = getelementptr inbounds float* %tmp15719, i64 1
+  %tmp15721 = getelementptr inbounds float* %tmp15720, i64 1
+  %tmp15722 = getelementptr inbounds float* %tmp15721, i64 1
+  %tmp15723 = getelementptr inbounds float* %tmp15722, i64 1
+  %tmp15724 = getelementptr inbounds float* %tmp15723, i64 1
+  %tmp15725 = getelementptr inbounds float* %tmp15724, i64 1
+  %tmp15726 = getelementptr inbounds float* %tmp15725, i64 1
+  %tmp15727 = getelementptr inbounds float* %tmp15726, i64 1
+  %tmp15728 = getelementptr inbounds float* %tmp15727, i64 1
+  %tmp15729 = getelementptr inbounds float* %tmp15728, i64 1
+  %tmp15730 = getelementptr inbounds float* %tmp15729, i64 1
+  %tmp15731 = getelementptr inbounds float* %tmp15730, i64 1
+  %tmp15732 = getelementptr inbounds float* %tmp15731, i64 1
+  %tmp15733 = getelementptr inbounds float* %tmp15732, i64 1
+  %tmp15734 = getelementptr inbounds float* %tmp15733, i64 1
+  %tmp15735 = getelementptr inbounds float* %tmp15734, i64 1
+  %tmp15736 = getelementptr inbounds float* %tmp15735, i64 1
+  %tmp15737 = getelementptr inbounds float* %tmp15736, i64 1
+  %tmp15738 = getelementptr inbounds float* %tmp15737, i64 1
+  %tmp15739 = getelementptr inbounds float* %tmp15738, i64 1
+  %tmp15740 = getelementptr inbounds float* %tmp15739, i64 1
+  %tmp15741 = getelementptr inbounds float* %tmp15740, i64 1
+  %tmp15742 = getelementptr inbounds float* %tmp15741, i64 1
+  %tmp15743 = getelementptr inbounds float* %tmp15742, i64 1
+  %tmp15744 = getelementptr inbounds float* %tmp15743, i64 1
+  %tmp15745 = getelementptr inbounds float* %tmp15744, i64 1
+  %tmp15746 = getelementptr inbounds float* %tmp15745, i64 1
+  %tmp15747 = getelementptr inbounds float* %tmp15746, i64 1
+  %tmp15748 = getelementptr inbounds float* %tmp15747, i64 1
+  %tmp15749 = getelementptr inbounds float* %tmp15748, i64 1
+  %tmp15750 = getelementptr inbounds float* %tmp15749, i64 1
+  %tmp15751 = getelementptr inbounds float* %tmp15750, i64 1
+  %tmp15752 = getelementptr inbounds float* %tmp15751, i64 1
+  %tmp15753 = getelementptr inbounds float* %tmp15752, i64 1
+  %tmp15754 = getelementptr inbounds float* %tmp15753, i64 1
+  %tmp15755 = getelementptr inbounds float* %tmp15754, i64 1
+  %tmp15756 = getelementptr inbounds float* %tmp15755, i64 1
+  %tmp15757 = getelementptr inbounds float* %tmp15756, i64 1
+  %tmp15758 = getelementptr inbounds float* %tmp15757, i64 1
+  %tmp15759 = getelementptr inbounds float* %tmp15758, i64 1
+  %tmp15760 = getelementptr inbounds float* %tmp15759, i64 1
+  %tmp15761 = getelementptr inbounds float* %tmp15760, i64 1
+  %tmp15762 = getelementptr inbounds float* %tmp15761, i64 1
+  %tmp15763 = getelementptr inbounds float* %tmp15762, i64 1
+  %tmp15764 = getelementptr inbounds float* %tmp15763, i64 1
+  %tmp15765 = getelementptr inbounds float* %tmp15764, i64 1
+  %tmp15766 = getelementptr inbounds float* %tmp15765, i64 1
+  %tmp15767 = getelementptr inbounds float* %tmp15766, i64 1
+  %tmp15768 = getelementptr inbounds float* %tmp15767, i64 1
+  %tmp15769 = getelementptr inbounds float* %tmp15768, i64 1
+  %tmp15770 = getelementptr inbounds float* %tmp15769, i64 1
+  %tmp15771 = getelementptr inbounds float* %tmp15770, i64 1
+  %tmp15772 = getelementptr inbounds float* %tmp15771, i64 1
+  %tmp15773 = getelementptr inbounds float* %tmp15772, i64 1
+  %tmp15774 = getelementptr inbounds float* %tmp15773, i64 1
+  %tmp15775 = getelementptr inbounds float* %tmp15774, i64 1
+  %tmp15776 = getelementptr inbounds float* %tmp15775, i64 1
+  %tmp15777 = getelementptr inbounds float* %tmp15776, i64 1
+  %tmp15778 = getelementptr inbounds float* %tmp15777, i64 1
+  %tmp15779 = getelementptr inbounds float* %tmp15778, i64 1
+  %tmp15780 = getelementptr inbounds float* %tmp15779, i64 1
+  %tmp15781 = getelementptr inbounds float* %tmp15780, i64 1
+  %tmp15782 = getelementptr inbounds float* %tmp15781, i64 1
+  %tmp15783 = getelementptr inbounds float* %tmp15782, i64 1
+  %tmp15784 = getelementptr inbounds float* %tmp15783, i64 1
+  %tmp15785 = getelementptr inbounds float* %tmp15784, i64 1
+  %tmp15786 = getelementptr inbounds float* %tmp15785, i64 1
+  %tmp15787 = getelementptr inbounds float* %tmp15786, i64 1
+  %tmp15788 = getelementptr inbounds float* %tmp15787, i64 1
+  %tmp15789 = getelementptr inbounds float* %tmp15788, i64 1
+  %tmp15790 = getelementptr inbounds float* %tmp15789, i64 1
+  %tmp15791 = getelementptr inbounds float* %tmp15790, i64 1
+  %tmp15792 = getelementptr inbounds float* %tmp15791, i64 1
+  %tmp15793 = getelementptr inbounds float* %tmp15792, i64 1
+  %tmp15794 = getelementptr inbounds float* %tmp15793, i64 1
+  %tmp15795 = getelementptr inbounds float* %tmp15794, i64 1
+  %tmp15796 = getelementptr inbounds float* %tmp15795, i64 1
+  %tmp15797 = getelementptr inbounds float* %tmp15796, i64 1
+  %tmp15798 = getelementptr inbounds float* %tmp15797, i64 1
+  %tmp15799 = getelementptr inbounds float* %tmp15798, i64 1
+  %tmp15800 = getelementptr inbounds float* %tmp15799, i64 1
+  %tmp15801 = getelementptr inbounds float* %tmp15800, i64 1
+  %tmp15802 = getelementptr inbounds float* %tmp15801, i64 1
+  %tmp15803 = getelementptr inbounds float* %tmp15802, i64 1
+  %tmp15804 = getelementptr inbounds float* %tmp15803, i64 1
+  %tmp15805 = getelementptr inbounds float* %tmp15804, i64 1
+  %tmp15806 = getelementptr inbounds float* %tmp15805, i64 1
+  %tmp15807 = getelementptr inbounds float* %tmp15806, i64 1
+  %tmp15808 = getelementptr inbounds float* %tmp15807, i64 1
+  %tmp15809 = getelementptr inbounds float* %tmp15808, i64 1
+  %tmp15810 = getelementptr inbounds float* %tmp15809, i64 1
+  %tmp15811 = getelementptr inbounds float* %tmp15810, i64 1
+  %tmp15812 = getelementptr inbounds float* %tmp15811, i64 1
+  %tmp15813 = getelementptr inbounds float* %tmp15812, i64 1
+  %tmp15814 = getelementptr inbounds float* %tmp15813, i64 1
+  %tmp15815 = getelementptr inbounds float* %tmp15814, i64 1
+  %tmp15816 = getelementptr inbounds float* %tmp15815, i64 1
+  %tmp15817 = getelementptr inbounds float* %tmp15816, i64 1
+  %tmp15818 = getelementptr inbounds float* %tmp15817, i64 1
+  %tmp15819 = getelementptr inbounds float* %tmp15818, i64 1
+  %tmp15820 = getelementptr inbounds float* %tmp15819, i64 1
+  %tmp15821 = getelementptr inbounds float* %tmp15820, i64 1
+  %tmp15822 = getelementptr inbounds float* %tmp15821, i64 1
+  %tmp15823 = getelementptr inbounds float* %tmp15822, i64 1
+  %tmp15824 = getelementptr inbounds float* %tmp15823, i64 1
+  %tmp15825 = getelementptr inbounds float* %tmp15824, i64 1
+  %tmp15826 = getelementptr inbounds float* %tmp15825, i64 1
+  %tmp15827 = getelementptr inbounds float* %tmp15826, i64 1
+  %tmp15828 = getelementptr inbounds float* %tmp15827, i64 1
+  %tmp15829 = getelementptr inbounds float* %tmp15828, i64 1
+  %tmp15830 = getelementptr inbounds float* %tmp15829, i64 1
+  %tmp15831 = getelementptr inbounds float* %tmp15830, i64 1
+  %tmp15832 = getelementptr inbounds float* %tmp15831, i64 1
+  %tmp15833 = getelementptr inbounds float* %tmp15832, i64 1
+  %tmp15834 = getelementptr inbounds float* %tmp15833, i64 1
+  %tmp15835 = getelementptr inbounds float* %tmp15834, i64 1
+  %tmp15836 = getelementptr inbounds float* %tmp15835, i64 1
+  %tmp15837 = getelementptr inbounds float* %tmp15836, i64 1
+  %tmp15838 = getelementptr inbounds float* %tmp15837, i64 1
+  %tmp15839 = getelementptr inbounds float* %tmp15838, i64 1
+  %tmp15840 = getelementptr inbounds float* %tmp15839, i64 1
+  %tmp15841 = getelementptr inbounds float* %tmp15840, i64 1
+  %tmp15842 = getelementptr inbounds float* %tmp15841, i64 1
+  %tmp15843 = getelementptr inbounds float* %tmp15842, i64 1
+  %tmp15844 = getelementptr inbounds float* %tmp15843, i64 1
+  %tmp15845 = getelementptr inbounds float* %tmp15844, i64 1
+  %tmp15846 = getelementptr inbounds float* %tmp15845, i64 1
+  %tmp15847 = getelementptr inbounds float* %tmp15846, i64 1
+  %tmp15848 = getelementptr inbounds float* %tmp15847, i64 1
+  %tmp15849 = getelementptr inbounds float* %tmp15848, i64 1
+  %tmp15850 = getelementptr inbounds float* %tmp15849, i64 1
+  %tmp15851 = getelementptr inbounds float* %tmp15850, i64 1
+  %tmp15852 = getelementptr inbounds float* %tmp15851, i64 1
+  %tmp15853 = getelementptr inbounds float* %tmp15852, i64 1
+  %tmp15854 = getelementptr inbounds float* %tmp15853, i64 1
+  %tmp15855 = getelementptr inbounds float* %tmp15854, i64 1
+  %tmp15856 = getelementptr inbounds float* %tmp15855, i64 1
+  %tmp15857 = getelementptr inbounds float* %tmp15856, i64 1
+  %tmp15858 = getelementptr inbounds float* %tmp15857, i64 1
+  %tmp15859 = getelementptr inbounds float* %tmp15858, i64 1
+  %tmp15860 = getelementptr inbounds float* %tmp15859, i64 1
+  %tmp15861 = getelementptr inbounds float* %tmp15860, i64 1
+  %tmp15862 = getelementptr inbounds float* %tmp15861, i64 1
+  %tmp15863 = getelementptr inbounds float* %tmp15862, i64 1
+  %tmp15864 = getelementptr inbounds float* %tmp15863, i64 1
+  %tmp15865 = getelementptr inbounds float* %tmp15864, i64 1
+  %tmp15866 = getelementptr inbounds float* %tmp15865, i64 1
+  %tmp15867 = getelementptr inbounds float* %tmp15866, i64 1
+  %tmp15868 = getelementptr inbounds float* %tmp15867, i64 1
+  %tmp15869 = getelementptr inbounds float* %tmp15868, i64 1
+  %tmp15870 = getelementptr inbounds float* %tmp15869, i64 1
+  %tmp15871 = getelementptr inbounds float* %tmp15870, i64 1
+  %tmp15872 = getelementptr inbounds float* %tmp15871, i64 1
+  %tmp15873 = getelementptr inbounds float* %tmp15872, i64 1
+  %tmp15874 = getelementptr inbounds float* %tmp15873, i64 1
+  %tmp15875 = getelementptr inbounds float* %tmp15874, i64 1
+  %tmp15876 = getelementptr inbounds float* %tmp15875, i64 1
+  %tmp15877 = getelementptr inbounds float* %tmp15876, i64 1
+  %tmp15878 = getelementptr inbounds float* %tmp15877, i64 1
+  %tmp15879 = getelementptr inbounds float* %tmp15878, i64 1
+  %tmp15880 = getelementptr inbounds float* %tmp15879, i64 1
+  %tmp15881 = getelementptr inbounds float* %tmp15880, i64 1
+  %tmp15882 = getelementptr inbounds float* %tmp15881, i64 1
+  %tmp15883 = getelementptr inbounds float* %tmp15882, i64 1
+  %tmp15884 = getelementptr inbounds float* %tmp15883, i64 1
+  %tmp15885 = getelementptr inbounds float* %tmp15884, i64 1
+  %tmp15886 = getelementptr inbounds float* %tmp15885, i64 1
+  %tmp15887 = getelementptr inbounds float* %tmp15886, i64 1
+  %tmp15888 = getelementptr inbounds float* %tmp15887, i64 1
+  %tmp15889 = getelementptr inbounds float* %tmp15888, i64 1
+  %tmp15890 = getelementptr inbounds float* %tmp15889, i64 1
+  %tmp15891 = getelementptr inbounds float* %tmp15890, i64 1
+  %tmp15892 = getelementptr inbounds float* %tmp15891, i64 1
+  %tmp15893 = getelementptr inbounds float* %tmp15892, i64 1
+  %tmp15894 = getelementptr inbounds float* %tmp15893, i64 1
+  %tmp15895 = getelementptr inbounds float* %tmp15894, i64 1
+  %tmp15896 = getelementptr inbounds float* %tmp15895, i64 1
+  %tmp15897 = getelementptr inbounds float* %tmp15896, i64 1
+  %tmp15898 = getelementptr inbounds float* %tmp15897, i64 1
+  %tmp15899 = getelementptr inbounds float* %tmp15898, i64 1
+  %tmp15900 = getelementptr inbounds float* %tmp15899, i64 1
+  %tmp15901 = getelementptr inbounds float* %tmp15900, i64 1
+  %tmp15902 = getelementptr inbounds float* %tmp15901, i64 1
+  %tmp15903 = getelementptr inbounds float* %tmp15902, i64 1
+  %tmp15904 = getelementptr inbounds float* %tmp15903, i64 1
+  %tmp15905 = getelementptr inbounds float* %tmp15904, i64 1
+  %tmp15906 = getelementptr inbounds float* %tmp15905, i64 1
+  %tmp15907 = getelementptr inbounds float* %tmp15906, i64 1
+  %tmp15908 = getelementptr inbounds float* %tmp15907, i64 1
+  %tmp15909 = getelementptr inbounds float* %tmp15908, i64 1
+  %tmp15910 = getelementptr inbounds float* %tmp15909, i64 1
+  %tmp15911 = getelementptr inbounds float* %tmp15910, i64 1
+  %tmp15912 = getelementptr inbounds float* %tmp15911, i64 1
+  %tmp15913 = getelementptr inbounds float* %tmp15912, i64 1
+  %tmp15914 = getelementptr inbounds float* %tmp15913, i64 1
+  %tmp15915 = getelementptr inbounds float* %tmp15914, i64 1
+  %tmp15916 = getelementptr inbounds float* %tmp15915, i64 1
+  %tmp15917 = getelementptr inbounds float* %tmp15916, i64 1
+  %tmp15918 = getelementptr inbounds float* %tmp15917, i64 1
+  %tmp15919 = getelementptr inbounds float* %tmp15918, i64 1
+  %tmp15920 = getelementptr inbounds float* %tmp15919, i64 1
+  %tmp15921 = getelementptr inbounds float* %tmp15920, i64 1
+  %tmp15922 = getelementptr inbounds float* %tmp15921, i64 1
+  %tmp15923 = getelementptr inbounds float* %tmp15922, i64 1
+  %tmp15924 = getelementptr inbounds float* %tmp15923, i64 1
+  %tmp15925 = getelementptr inbounds float* %tmp15924, i64 1
+  %tmp15926 = getelementptr inbounds float* %tmp15925, i64 1
+  %tmp15927 = getelementptr inbounds float* %tmp15926, i64 1
+  %tmp15928 = getelementptr inbounds float* %tmp15927, i64 1
+  %tmp15929 = getelementptr inbounds float* %tmp15928, i64 1
+  %tmp15930 = getelementptr inbounds float* %tmp15929, i64 1
+  %tmp15931 = getelementptr inbounds float* %tmp15930, i64 1
+  %tmp15932 = getelementptr inbounds float* %tmp15931, i64 1
+  %tmp15933 = getelementptr inbounds float* %tmp15932, i64 1
+  %tmp15934 = getelementptr inbounds float* %tmp15933, i64 1
+  %tmp15935 = getelementptr inbounds float* %tmp15934, i64 1
+  %tmp15936 = getelementptr inbounds float* %tmp15935, i64 1
+  %tmp15937 = getelementptr inbounds float* %tmp15936, i64 1
+  %tmp15938 = getelementptr inbounds float* %tmp15937, i64 1
+  %tmp15939 = getelementptr inbounds float* %tmp15938, i64 1
+  %tmp15940 = getelementptr inbounds float* %tmp15939, i64 1
+  %tmp15941 = getelementptr inbounds float* %tmp15940, i64 1
+  %tmp15942 = getelementptr inbounds float* %tmp15941, i64 1
+  %tmp15943 = getelementptr inbounds float* %tmp15942, i64 1
+  %tmp15944 = getelementptr inbounds float* %tmp15943, i64 1
+  %tmp15945 = getelementptr inbounds float* %tmp15944, i64 1
+  %tmp15946 = getelementptr inbounds float* %tmp15945, i64 1
+  %tmp15947 = getelementptr inbounds float* %tmp15946, i64 1
+  %tmp15948 = getelementptr inbounds float* %tmp15947, i64 1
+  %tmp15949 = getelementptr inbounds float* %tmp15948, i64 1
+  %tmp15950 = getelementptr inbounds float* %tmp15949, i64 1
+  %tmp15951 = getelementptr inbounds float* %tmp15950, i64 1
+  %tmp15952 = getelementptr inbounds float* %tmp15951, i64 1
+  %tmp15953 = getelementptr inbounds float* %tmp15952, i64 1
+  %tmp15954 = getelementptr inbounds float* %tmp15953, i64 1
+  %tmp15955 = getelementptr inbounds float* %tmp15954, i64 1
+  %tmp15956 = getelementptr inbounds float* %tmp15955, i64 1
+  %tmp15957 = getelementptr inbounds float* %tmp15956, i64 1
+  %tmp15958 = getelementptr inbounds float* %tmp15957, i64 1
+  %tmp15959 = getelementptr inbounds float* %tmp15958, i64 1
+  %tmp15960 = getelementptr inbounds float* %tmp15959, i64 1
+  %tmp15961 = getelementptr inbounds float* %tmp15960, i64 1
+  %tmp15962 = getelementptr inbounds float* %tmp15961, i64 1
+  %tmp15963 = getelementptr inbounds float* %tmp15962, i64 1
+  %tmp15964 = getelementptr inbounds float* %tmp15963, i64 1
+  %tmp15965 = getelementptr inbounds float* %tmp15964, i64 1
+  %tmp15966 = getelementptr inbounds float* %tmp15965, i64 1
+  %tmp15967 = getelementptr inbounds float* %tmp15966, i64 1
+  %tmp15968 = getelementptr inbounds float* %tmp15967, i64 1
+  %tmp15969 = getelementptr inbounds float* %tmp15968, i64 1
+  %tmp15970 = getelementptr inbounds float* %tmp15969, i64 1
+  %tmp15971 = getelementptr inbounds float* %tmp15970, i64 1
+  %tmp15972 = getelementptr inbounds float* %tmp15971, i64 1
+  %tmp15973 = getelementptr inbounds float* %tmp15972, i64 1
+  %tmp15974 = getelementptr inbounds float* %tmp15973, i64 1
+  %tmp15975 = getelementptr inbounds float* %tmp15974, i64 1
+  %tmp15976 = getelementptr inbounds float* %tmp15975, i64 1
+  %tmp15977 = getelementptr inbounds float* %tmp15976, i64 1
+  %tmp15978 = getelementptr inbounds float* %tmp15977, i64 1
+  %tmp15979 = getelementptr inbounds float* %tmp15978, i64 1
+  %tmp15980 = getelementptr inbounds float* %tmp15979, i64 1
+  %tmp15981 = getelementptr inbounds float* %tmp15980, i64 1
+  %tmp15982 = getelementptr inbounds float* %tmp15981, i64 1
+  %tmp15983 = getelementptr inbounds float* %tmp15982, i64 1
+  %tmp15984 = getelementptr inbounds float* %tmp15983, i64 1
+  %tmp15985 = getelementptr inbounds float* %tmp15984, i64 1
+  %tmp15986 = getelementptr inbounds float* %tmp15985, i64 1
+  %tmp15987 = getelementptr inbounds float* %tmp15986, i64 1
+  %tmp15988 = getelementptr inbounds float* %tmp15987, i64 1
+  %tmp15989 = getelementptr inbounds float* %tmp15988, i64 1
+  %tmp15990 = getelementptr inbounds float* %tmp15989, i64 1
+  %tmp15991 = getelementptr inbounds float* %tmp15990, i64 1
+  %tmp15992 = getelementptr inbounds float* %tmp15991, i64 1
+  %tmp15993 = getelementptr inbounds float* %tmp15992, i64 1
+  %tmp15994 = getelementptr inbounds float* %tmp15993, i64 1
+  %tmp15995 = getelementptr inbounds float* %tmp15994, i64 1
+  %tmp15996 = getelementptr inbounds float* %tmp15995, i64 1
+  %tmp15997 = getelementptr inbounds float* %tmp15996, i64 1
+  %tmp15998 = getelementptr inbounds float* %tmp15997, i64 1
+  %tmp15999 = getelementptr inbounds float* %tmp15998, i64 1
+  %tmp16000 = getelementptr inbounds float* %tmp15999, i64 1
+  %tmp16001 = getelementptr inbounds float* %tmp16000, i64 1
+  %tmp16002 = getelementptr inbounds float* %tmp16001, i64 1
+  %tmp16003 = getelementptr inbounds float* %tmp16002, i64 1
+  %tmp16004 = getelementptr inbounds float* %tmp16003, i64 1
+  %tmp16005 = getelementptr inbounds float* %tmp16004, i64 1
+  %tmp16006 = getelementptr inbounds float* %tmp16005, i64 1
+  %tmp16007 = getelementptr inbounds float* %tmp16006, i64 1
+  %tmp16008 = getelementptr inbounds float* %tmp16007, i64 1
+  %tmp16009 = getelementptr inbounds float* %tmp16008, i64 1
+  %tmp16010 = getelementptr inbounds float* %tmp16009, i64 1
+  %tmp16011 = getelementptr inbounds float* %tmp16010, i64 1
+  %tmp16012 = getelementptr inbounds float* %tmp16011, i64 1
+  %tmp16013 = getelementptr inbounds float* %tmp16012, i64 1
+  %tmp16014 = getelementptr inbounds float* %tmp16013, i64 1
+  %tmp16015 = getelementptr inbounds float* %tmp16014, i64 1
+  %tmp16016 = getelementptr inbounds float* %tmp16015, i64 1
+  %tmp16017 = getelementptr inbounds float* %tmp16016, i64 1
+  %tmp16018 = getelementptr inbounds float* %tmp16017, i64 1
+  %tmp16019 = getelementptr inbounds float* %tmp16018, i64 1
+  %tmp16020 = getelementptr inbounds float* %tmp16019, i64 1
+  %tmp16021 = getelementptr inbounds float* %tmp16020, i64 1
+  %tmp16022 = getelementptr inbounds float* %tmp16021, i64 1
+  %tmp16023 = getelementptr inbounds float* %tmp16022, i64 1
+  %tmp16024 = getelementptr inbounds float* %tmp16023, i64 1
+  %tmp16025 = getelementptr inbounds float* %tmp16024, i64 1
+  %tmp16026 = getelementptr inbounds float* %tmp16025, i64 1
+  %tmp16027 = getelementptr inbounds float* %tmp16026, i64 1
+  %tmp16028 = getelementptr inbounds float* %tmp16027, i64 1
+  %tmp16029 = getelementptr inbounds float* %tmp16028, i64 1
+  %tmp16030 = getelementptr inbounds float* %tmp16029, i64 1
+  %tmp16031 = getelementptr inbounds float* %tmp16030, i64 1
+  %tmp16032 = getelementptr inbounds float* %tmp16031, i64 1
+  %tmp16033 = getelementptr inbounds float* %tmp16032, i64 1
+  %tmp16034 = getelementptr inbounds float* %tmp16033, i64 1
+  %tmp16035 = getelementptr inbounds float* %tmp16034, i64 1
+  %tmp16036 = getelementptr inbounds float* %tmp16035, i64 1
+  %tmp16037 = getelementptr inbounds float* %tmp16036, i64 1
+  %tmp16038 = getelementptr inbounds float* %tmp16037, i64 1
+  %tmp16039 = getelementptr inbounds float* %tmp16038, i64 1
+  %tmp16040 = getelementptr inbounds float* %tmp16039, i64 1
+  %tmp16041 = getelementptr inbounds float* %tmp16040, i64 1
+  %tmp16042 = getelementptr inbounds float* %tmp16041, i64 1
+  %tmp16043 = getelementptr inbounds float* %tmp16042, i64 1
+  %tmp16044 = getelementptr inbounds float* %tmp16043, i64 1
+  %tmp16045 = getelementptr inbounds float* %tmp16044, i64 1
+  %tmp16046 = getelementptr inbounds float* %tmp16045, i64 1
+  %tmp16047 = getelementptr inbounds float* %tmp16046, i64 1
+  %tmp16048 = getelementptr inbounds float* %tmp16047, i64 1
+  %tmp16049 = getelementptr inbounds float* %tmp16048, i64 1
+  %tmp16050 = getelementptr inbounds float* %tmp16049, i64 1
+  %tmp16051 = getelementptr inbounds float* %tmp16050, i64 1
+  %tmp16052 = getelementptr inbounds float* %tmp16051, i64 1
+  %tmp16053 = getelementptr inbounds float* %tmp16052, i64 1
+  %tmp16054 = getelementptr inbounds float* %tmp16053, i64 1
+  %tmp16055 = getelementptr inbounds float* %tmp16054, i64 1
+  %tmp16056 = getelementptr inbounds float* %tmp16055, i64 1
+  %tmp16057 = getelementptr inbounds float* %tmp16056, i64 1
+  %tmp16058 = getelementptr inbounds float* %tmp16057, i64 1
+  %tmp16059 = getelementptr inbounds float* %tmp16058, i64 1
+  %tmp16060 = getelementptr inbounds float* %tmp16059, i64 1
+  %tmp16061 = getelementptr inbounds float* %tmp16060, i64 1
+  %tmp16062 = getelementptr inbounds float* %tmp16061, i64 1
+  %tmp16063 = getelementptr inbounds float* %tmp16062, i64 1
+  %tmp16064 = getelementptr inbounds float* %tmp16063, i64 1
+  %tmp16065 = getelementptr inbounds float* %tmp16064, i64 1
+  %tmp16066 = getelementptr inbounds float* %tmp16065, i64 1
+  %tmp16067 = getelementptr inbounds float* %tmp16066, i64 1
+  %tmp16068 = getelementptr inbounds float* %tmp16067, i64 1
+  %tmp16069 = getelementptr inbounds float* %tmp16068, i64 1
+  %tmp16070 = getelementptr inbounds float* %tmp16069, i64 1
+  %tmp16071 = getelementptr inbounds float* %tmp16070, i64 1
+  %tmp16072 = getelementptr inbounds float* %tmp16071, i64 1
+  %tmp16073 = getelementptr inbounds float* %tmp16072, i64 1
+  %tmp16074 = getelementptr inbounds float* %tmp16073, i64 1
+  %tmp16075 = getelementptr inbounds float* %tmp16074, i64 1
+  %tmp16076 = getelementptr inbounds float* %tmp16075, i64 1
+  %tmp16077 = getelementptr inbounds float* %tmp16076, i64 1
+  %tmp16078 = getelementptr inbounds float* %tmp16077, i64 1
+  %tmp16079 = getelementptr inbounds float* %tmp16078, i64 1
+  %tmp16080 = getelementptr inbounds float* %tmp16079, i64 1
+  %tmp16081 = getelementptr inbounds float* %tmp16080, i64 1
+  %tmp16082 = getelementptr inbounds float* %tmp16081, i64 1
+  %tmp16083 = getelementptr inbounds float* %tmp16082, i64 1
+  %tmp16084 = getelementptr inbounds float* %tmp16083, i64 1
+  %tmp16085 = getelementptr inbounds float* %tmp16084, i64 1
+  %tmp16086 = getelementptr inbounds float* %tmp16085, i64 1
+  %tmp16087 = getelementptr inbounds float* %tmp16086, i64 1
+  %tmp16088 = getelementptr inbounds float* %tmp16087, i64 1
+  %tmp16089 = getelementptr inbounds float* %tmp16088, i64 1
+  %tmp16090 = getelementptr inbounds float* %tmp16089, i64 1
+  %tmp16091 = getelementptr inbounds float* %tmp16090, i64 1
+  %tmp16092 = getelementptr inbounds float* %tmp16091, i64 1
+  %tmp16093 = getelementptr inbounds float* %tmp16092, i64 1
+  %tmp16094 = getelementptr inbounds float* %tmp16093, i64 1
+  %tmp16095 = getelementptr inbounds float* %tmp16094, i64 1
+  %tmp16096 = getelementptr inbounds float* %tmp16095, i64 1
+  %tmp16097 = getelementptr inbounds float* %tmp16096, i64 1
+  %tmp16098 = getelementptr inbounds float* %tmp16097, i64 1
+  %tmp16099 = getelementptr inbounds float* %tmp16098, i64 1
+  %tmp16100 = getelementptr inbounds float* %tmp16099, i64 1
+  %tmp16101 = getelementptr inbounds float* %tmp16100, i64 1
+  %tmp16102 = getelementptr inbounds float* %tmp16101, i64 1
+  %tmp16103 = getelementptr inbounds float* %tmp16102, i64 1
+  %tmp16104 = getelementptr inbounds float* %tmp16103, i64 1
+  %tmp16105 = getelementptr inbounds float* %tmp16104, i64 1
+  %tmp16106 = getelementptr inbounds float* %tmp16105, i64 1
+  %tmp16107 = getelementptr inbounds float* %tmp16106, i64 1
+  %tmp16108 = getelementptr inbounds float* %tmp16107, i64 1
+  %tmp16109 = getelementptr inbounds float* %tmp16108, i64 1
+  %tmp16110 = getelementptr inbounds float* %tmp16109, i64 1
+  %tmp16111 = getelementptr inbounds float* %tmp16110, i64 1
+  %tmp16112 = getelementptr inbounds float* %tmp16111, i64 1
+  %tmp16113 = getelementptr inbounds float* %tmp16112, i64 1
+  %tmp16114 = getelementptr inbounds float* %tmp16113, i64 1
+  %tmp16115 = getelementptr inbounds float* %tmp16114, i64 1
+  %tmp16116 = getelementptr inbounds float* %tmp16115, i64 1
+  %tmp16117 = getelementptr inbounds float* %tmp16116, i64 1
+  %tmp16118 = getelementptr inbounds float* %tmp16117, i64 1
+  %tmp16119 = getelementptr inbounds float* %tmp16118, i64 1
+  %tmp16120 = getelementptr inbounds float* %tmp16119, i64 1
+  %tmp16121 = getelementptr inbounds float* %tmp16120, i64 1
+  %tmp16122 = getelementptr inbounds float* %tmp16121, i64 1
+  %tmp16123 = getelementptr inbounds float* %tmp16122, i64 1
+  %tmp16124 = getelementptr inbounds float* %tmp16123, i64 1
+  %tmp16125 = getelementptr inbounds float* %tmp16124, i64 1
+  %tmp16126 = getelementptr inbounds float* %tmp16125, i64 1
+  %tmp16127 = getelementptr inbounds float* %tmp16126, i64 1
+  %tmp16128 = getelementptr inbounds float* %tmp16127, i64 1
+  %tmp16129 = getelementptr inbounds float* %tmp16128, i64 1
+  %tmp16130 = getelementptr inbounds float* %tmp16129, i64 1
+  %tmp16131 = getelementptr inbounds float* %tmp16130, i64 1
+  %tmp16132 = getelementptr inbounds float* %tmp16131, i64 1
+  %tmp16133 = getelementptr inbounds float* %tmp16132, i64 1
+  %tmp16134 = getelementptr inbounds float* %tmp16133, i64 1
+  %tmp16135 = getelementptr inbounds float* %tmp16134, i64 1
+  %tmp16136 = getelementptr inbounds float* %tmp16135, i64 1
+  %tmp16137 = getelementptr inbounds float* %tmp16136, i64 1
+  %tmp16138 = getelementptr inbounds float* %tmp16137, i64 1
+  %tmp16139 = getelementptr inbounds float* %tmp16138, i64 1
+  %tmp16140 = getelementptr inbounds float* %tmp16139, i64 1
+  %tmp16141 = getelementptr inbounds float* %tmp16140, i64 1
+  %tmp16142 = getelementptr inbounds float* %tmp16141, i64 1
+  %tmp16143 = getelementptr inbounds float* %tmp16142, i64 1
+  %tmp16144 = getelementptr inbounds float* %tmp16143, i64 1
+  %tmp16145 = getelementptr inbounds float* %tmp16144, i64 1
+  %tmp16146 = getelementptr inbounds float* %tmp16145, i64 1
+  %tmp16147 = getelementptr inbounds float* %tmp16146, i64 1
+  %tmp16148 = getelementptr inbounds float* %tmp16147, i64 1
+  %tmp16149 = getelementptr inbounds float* %tmp16148, i64 1
+  %tmp16150 = getelementptr inbounds float* %tmp16149, i64 1
+  %tmp16151 = getelementptr inbounds float* %tmp16150, i64 1
+  %tmp16152 = getelementptr inbounds float* %tmp16151, i64 1
+  %tmp16153 = getelementptr inbounds float* %tmp16152, i64 1
+  %tmp16154 = getelementptr inbounds float* %tmp16153, i64 1
+  %tmp16155 = getelementptr inbounds float* %tmp16154, i64 1
+  %tmp16156 = getelementptr inbounds float* %tmp16155, i64 1
+  %tmp16157 = getelementptr inbounds float* %tmp16156, i64 1
+  %tmp16158 = getelementptr inbounds float* %tmp16157, i64 1
+  %tmp16159 = getelementptr inbounds float* %tmp16158, i64 1
+  %tmp16160 = getelementptr inbounds float* %tmp16159, i64 1
+  %tmp16161 = getelementptr inbounds float* %tmp16160, i64 1
+  %tmp16162 = getelementptr inbounds float* %tmp16161, i64 1
+  %tmp16163 = getelementptr inbounds float* %tmp16162, i64 1
+  %tmp16164 = getelementptr inbounds float* %tmp16163, i64 1
+  %tmp16165 = getelementptr inbounds float* %tmp16164, i64 1
+  %tmp16166 = getelementptr inbounds float* %tmp16165, i64 1
+  %tmp16167 = getelementptr inbounds float* %tmp16166, i64 1
+  %tmp16168 = getelementptr inbounds float* %tmp16167, i64 1
+  %tmp16169 = getelementptr inbounds float* %tmp16168, i64 1
+  %tmp16170 = getelementptr inbounds float* %tmp16169, i64 1
+  %tmp16171 = getelementptr inbounds float* %tmp16170, i64 1
+  %tmp16172 = getelementptr inbounds float* %tmp16171, i64 1
+  %tmp16173 = getelementptr inbounds float* %tmp16172, i64 1
+  %tmp16174 = getelementptr inbounds float* %tmp16173, i64 1
+  %tmp16175 = getelementptr inbounds float* %tmp16174, i64 1
+  %tmp16176 = getelementptr inbounds float* %tmp16175, i64 1
+  %tmp16177 = getelementptr inbounds float* %tmp16176, i64 1
+  %tmp16178 = getelementptr inbounds float* %tmp16177, i64 1
+  %tmp16179 = getelementptr inbounds float* %tmp16178, i64 1
+  %tmp16180 = getelementptr inbounds float* %tmp16179, i64 1
+  %tmp16181 = getelementptr inbounds float* %tmp16180, i64 1
+  %tmp16182 = getelementptr inbounds float* %tmp16181, i64 1
+  %tmp16183 = getelementptr inbounds float* %tmp16182, i64 1
+  %tmp16184 = getelementptr inbounds float* %tmp16183, i64 1
+  %tmp16185 = getelementptr inbounds float* %tmp16184, i64 1
+  %tmp16186 = getelementptr inbounds float* %tmp16185, i64 1
+  %tmp16187 = getelementptr inbounds float* %tmp16186, i64 1
+  %tmp16188 = getelementptr inbounds float* %tmp16187, i64 1
+  %tmp16189 = getelementptr inbounds float* %tmp16188, i64 1
+  %tmp16190 = getelementptr inbounds float* %tmp16189, i64 1
+  %tmp16191 = getelementptr inbounds float* %tmp16190, i64 1
+  %tmp16192 = getelementptr inbounds float* %tmp16191, i64 1
+  %tmp16193 = getelementptr inbounds float* %tmp16192, i64 1
+  %tmp16194 = getelementptr inbounds float* %tmp16193, i64 1
+  %tmp16195 = getelementptr inbounds float* %tmp16194, i64 1
+  %tmp16196 = getelementptr inbounds float* %tmp16195, i64 1
+  %tmp16197 = getelementptr inbounds float* %tmp16196, i64 1
+  %tmp16198 = getelementptr inbounds float* %tmp16197, i64 1
+  %tmp16199 = getelementptr inbounds float* %tmp16198, i64 1
+  %tmp16200 = getelementptr inbounds float* %tmp16199, i64 1
+  %tmp16201 = getelementptr inbounds float* %tmp16200, i64 1
+  %tmp16202 = getelementptr inbounds float* %tmp16201, i64 1
+  %tmp16203 = getelementptr inbounds float* %tmp16202, i64 1
+  %tmp16204 = getelementptr inbounds float* %tmp16203, i64 1
+  %tmp16205 = getelementptr inbounds float* %tmp16204, i64 1
+  %tmp16206 = getelementptr inbounds float* %tmp16205, i64 1
+  %tmp16207 = getelementptr inbounds float* %tmp16206, i64 1
+  %tmp16208 = getelementptr inbounds float* %tmp16207, i64 1
+  %tmp16209 = getelementptr inbounds float* %tmp16208, i64 1
+  %tmp16210 = getelementptr inbounds float* %tmp16209, i64 1
+  %tmp16211 = getelementptr inbounds float* %tmp16210, i64 1
+  %tmp16212 = getelementptr inbounds float* %tmp16211, i64 1
+  %tmp16213 = getelementptr inbounds float* %tmp16212, i64 1
+  %tmp16214 = getelementptr inbounds float* %tmp16213, i64 1
+  %tmp16215 = getelementptr inbounds float* %tmp16214, i64 1
+  %tmp16216 = getelementptr inbounds float* %tmp16215, i64 1
+  %tmp16217 = getelementptr inbounds float* %tmp16216, i64 1
+  %tmp16218 = getelementptr inbounds float* %tmp16217, i64 1
+  %tmp16219 = getelementptr inbounds float* %tmp16218, i64 1
+  %tmp16220 = getelementptr inbounds float* %tmp16219, i64 1
+  %tmp16221 = getelementptr inbounds float* %tmp16220, i64 1
+  %tmp16222 = getelementptr inbounds float* %tmp16221, i64 1
+  %tmp16223 = getelementptr inbounds float* %tmp16222, i64 1
+  %tmp16224 = getelementptr inbounds float* %tmp16223, i64 1
+  %tmp16225 = getelementptr inbounds float* %tmp16224, i64 1
+  %tmp16226 = getelementptr inbounds float* %tmp16225, i64 1
+  %tmp16227 = getelementptr inbounds float* %tmp16226, i64 1
+  %tmp16228 = getelementptr inbounds float* %tmp16227, i64 1
+  %tmp16229 = getelementptr inbounds float* %tmp16228, i64 1
+  %tmp16230 = getelementptr inbounds float* %tmp16229, i64 1
+  %tmp16231 = getelementptr inbounds float* %tmp16230, i64 1
+  %tmp16232 = getelementptr inbounds float* %tmp16231, i64 1
+  %tmp16233 = getelementptr inbounds float* %tmp16232, i64 1
+  %tmp16234 = getelementptr inbounds float* %tmp16233, i64 1
+  %tmp16235 = getelementptr inbounds float* %tmp16234, i64 1
+  %tmp16236 = getelementptr inbounds float* %tmp16235, i64 1
+  %tmp16237 = getelementptr inbounds float* %tmp16236, i64 1
+  %tmp16238 = getelementptr inbounds float* %tmp16237, i64 1
+  %tmp16239 = getelementptr inbounds float* %tmp16238, i64 1
+  %tmp16240 = getelementptr inbounds float* %tmp16239, i64 1
+  %tmp16241 = getelementptr inbounds float* %tmp16240, i64 1
+  %tmp16242 = getelementptr inbounds float* %tmp16241, i64 1
+  %tmp16243 = getelementptr inbounds float* %tmp16242, i64 1
+  %tmp16244 = getelementptr inbounds float* %tmp16243, i64 1
+  %tmp16245 = getelementptr inbounds float* %tmp16244, i64 1
+  %tmp16246 = getelementptr inbounds float* %tmp16245, i64 1
+  %tmp16247 = getelementptr inbounds float* %tmp16246, i64 1
+  %tmp16248 = getelementptr inbounds float* %tmp16247, i64 1
+  %tmp16249 = getelementptr inbounds float* %tmp16248, i64 1
+  %tmp16250 = getelementptr inbounds float* %tmp16249, i64 1
+  %tmp16251 = getelementptr inbounds float* %tmp16250, i64 1
+  %tmp16252 = getelementptr inbounds float* %tmp16251, i64 1
+  %tmp16253 = getelementptr inbounds float* %tmp16252, i64 1
+  %tmp16254 = getelementptr inbounds float* %tmp16253, i64 1
+  %tmp16255 = getelementptr inbounds float* %tmp16254, i64 1
+  %tmp16256 = getelementptr inbounds float* %tmp16255, i64 1
+  %tmp16257 = getelementptr inbounds float* %tmp16256, i64 1
+  %tmp16258 = getelementptr inbounds float* %tmp16257, i64 1
+  %tmp16259 = getelementptr inbounds float* %tmp16258, i64 1
+  %tmp16260 = getelementptr inbounds float* %tmp16259, i64 1
+  %tmp16261 = getelementptr inbounds float* %tmp16260, i64 1
+  %tmp16262 = getelementptr inbounds float* %tmp16261, i64 1
+  %tmp16263 = getelementptr inbounds float* %tmp16262, i64 1
+  %tmp16264 = getelementptr inbounds float* %tmp16263, i64 1
+  %tmp16265 = getelementptr inbounds float* %tmp16264, i64 1
+  %tmp16266 = getelementptr inbounds float* %tmp16265, i64 1
+  %tmp16267 = getelementptr inbounds float* %tmp16266, i64 1
+  %tmp16268 = getelementptr inbounds float* %tmp16267, i64 1
+  %tmp16269 = getelementptr inbounds float* %tmp16268, i64 1
+  %tmp16270 = getelementptr inbounds float* %tmp16269, i64 1
+  %tmp16271 = getelementptr inbounds float* %tmp16270, i64 1
+  %tmp16272 = getelementptr inbounds float* %tmp16271, i64 1
+  %tmp16273 = getelementptr inbounds float* %tmp16272, i64 1
+  %tmp16274 = getelementptr inbounds float* %tmp16273, i64 1
+  %tmp16275 = getelementptr inbounds float* %tmp16274, i64 1
+  %tmp16276 = getelementptr inbounds float* %tmp16275, i64 1
+  %tmp16277 = getelementptr inbounds float* %tmp16276, i64 1
+  %tmp16278 = getelementptr inbounds float* %tmp16277, i64 1
+  %tmp16279 = getelementptr inbounds float* %tmp16278, i64 1
+  %tmp16280 = getelementptr inbounds float* %tmp16279, i64 1
+  %tmp16281 = getelementptr inbounds float* %tmp16280, i64 1
+  %tmp16282 = getelementptr inbounds float* %tmp16281, i64 1
+  %tmp16283 = getelementptr inbounds float* %tmp16282, i64 1
+  %tmp16284 = getelementptr inbounds float* %tmp16283, i64 1
+  %tmp16285 = getelementptr inbounds float* %tmp16284, i64 1
+  %tmp16286 = getelementptr inbounds float* %tmp16285, i64 1
+  %tmp16287 = getelementptr inbounds float* %tmp16286, i64 1
+  %tmp16288 = getelementptr inbounds float* %tmp16287, i64 1
+  %tmp16289 = getelementptr inbounds float* %tmp16288, i64 1
+  %tmp16290 = getelementptr inbounds float* %tmp16289, i64 1
+  %tmp16291 = getelementptr inbounds float* %tmp16290, i64 1
+  %tmp16292 = getelementptr inbounds float* %tmp16291, i64 1
+  %tmp16293 = getelementptr inbounds float* %tmp16292, i64 1
+  %tmp16294 = getelementptr inbounds float* %tmp16293, i64 1
+  %tmp16295 = getelementptr inbounds float* %tmp16294, i64 1
+  %tmp16296 = getelementptr inbounds float* %tmp16295, i64 1
+  %tmp16297 = getelementptr inbounds float* %tmp16296, i64 1
+  %tmp16298 = getelementptr inbounds float* %tmp16297, i64 1
+  %tmp16299 = getelementptr inbounds float* %tmp16298, i64 1
+  %tmp16300 = getelementptr inbounds float* %tmp16299, i64 1
+  %tmp16301 = getelementptr inbounds float* %tmp16300, i64 1
+  %tmp16302 = getelementptr inbounds float* %tmp16301, i64 1
+  %tmp16303 = getelementptr inbounds float* %tmp16302, i64 1
+  %tmp16304 = getelementptr inbounds float* %tmp16303, i64 1
+  %tmp16305 = getelementptr inbounds float* %tmp16304, i64 1
+  %tmp16306 = getelementptr inbounds float* %tmp16305, i64 1
+  %tmp16307 = getelementptr inbounds float* %tmp16306, i64 1
+  %tmp16308 = getelementptr inbounds float* %tmp16307, i64 1
+  %tmp16309 = getelementptr inbounds float* %tmp16308, i64 1
+  %tmp16310 = getelementptr inbounds float* %tmp16309, i64 1
+  %tmp16311 = getelementptr inbounds float* %tmp16310, i64 1
+  %tmp16312 = getelementptr inbounds float* %tmp16311, i64 1
+  %tmp16313 = getelementptr inbounds float* %tmp16312, i64 1
+  %tmp16314 = getelementptr inbounds float* %tmp16313, i64 1
+  %tmp16315 = getelementptr inbounds float* %tmp16314, i64 1
+  %tmp16316 = getelementptr inbounds float* %tmp16315, i64 1
+  %tmp16317 = getelementptr inbounds float* %tmp16316, i64 1
+  %tmp16318 = getelementptr inbounds float* %tmp16317, i64 1
+  %tmp16319 = getelementptr inbounds float* %tmp16318, i64 1
+  %tmp16320 = getelementptr inbounds float* %tmp16319, i64 1
+  %tmp16321 = getelementptr inbounds float* %tmp16320, i64 1
+  %tmp16322 = getelementptr inbounds float* %tmp16321, i64 1
+  %tmp16323 = getelementptr inbounds float* %tmp16322, i64 1
+  %tmp16324 = getelementptr inbounds float* %tmp16323, i64 1
+  %tmp16325 = getelementptr inbounds float* %tmp16324, i64 1
+  %tmp16326 = getelementptr inbounds float* %tmp16325, i64 1
+  %tmp16327 = getelementptr inbounds float* %tmp16326, i64 1
+  %tmp16328 = getelementptr inbounds float* %tmp16327, i64 1
+  %tmp16329 = getelementptr inbounds float* %tmp16328, i64 1
+  %tmp16330 = getelementptr inbounds float* %tmp16329, i64 1
+  %tmp16331 = getelementptr inbounds float* %tmp16330, i64 1
+  %tmp16332 = getelementptr inbounds float* %tmp16331, i64 1
+  %tmp16333 = getelementptr inbounds float* %tmp16332, i64 1
+  %tmp16334 = getelementptr inbounds float* %tmp16333, i64 1
+  %tmp16335 = getelementptr inbounds float* %tmp16334, i64 1
+  %tmp16336 = getelementptr inbounds float* %tmp16335, i64 1
+  %tmp16337 = getelementptr inbounds float* %tmp16336, i64 1
+  %tmp16338 = getelementptr inbounds float* %tmp16337, i64 1
+  %tmp16339 = getelementptr inbounds float* %tmp16338, i64 1
+  %tmp16340 = getelementptr inbounds float* %tmp16339, i64 1
+  %tmp16341 = getelementptr inbounds float* %tmp16340, i64 1
+  %tmp16342 = getelementptr inbounds float* %tmp16341, i64 1
+  %tmp16343 = getelementptr inbounds float* %tmp16342, i64 1
+  %tmp16344 = getelementptr inbounds float* %tmp16343, i64 1
+  %tmp16345 = getelementptr inbounds float* %tmp16344, i64 1
+  %tmp16346 = getelementptr inbounds float* %tmp16345, i64 1
+  %tmp16347 = getelementptr inbounds float* %tmp16346, i64 1
+  %tmp16348 = getelementptr inbounds float* %tmp16347, i64 1
+  %tmp16349 = getelementptr inbounds float* %tmp16348, i64 1
+  %tmp16350 = getelementptr inbounds float* %tmp16349, i64 1
+  %tmp16351 = getelementptr inbounds float* %tmp16350, i64 1
+  %tmp16352 = getelementptr inbounds float* %tmp16351, i64 1
+  %tmp16353 = getelementptr inbounds float* %tmp16352, i64 1
+  %tmp16354 = getelementptr inbounds float* %tmp16353, i64 1
+  %tmp16355 = getelementptr inbounds float* %tmp16354, i64 1
+  %tmp16356 = getelementptr inbounds float* %tmp16355, i64 1
+  %tmp16357 = getelementptr inbounds float* %tmp16356, i64 1
+  %tmp16358 = getelementptr inbounds float* %tmp16357, i64 1
+  %tmp16359 = getelementptr inbounds float* %tmp16358, i64 1
+  %tmp16360 = getelementptr inbounds float* %tmp16359, i64 1
+  %tmp16361 = getelementptr inbounds float* %tmp16360, i64 1
+  %tmp16362 = getelementptr inbounds float* %tmp16361, i64 1
+  %tmp16363 = getelementptr inbounds float* %tmp16362, i64 1
+  %tmp16364 = getelementptr inbounds float* %tmp16363, i64 1
+  %tmp16365 = getelementptr inbounds float* %tmp16364, i64 1
+  %tmp16366 = getelementptr inbounds float* %tmp16365, i64 1
+  %tmp16367 = getelementptr inbounds float* %tmp16366, i64 1
+  %tmp16368 = getelementptr inbounds float* %tmp16367, i64 1
+  %tmp16369 = getelementptr inbounds float* %tmp16368, i64 1
+  %tmp16370 = getelementptr inbounds float* %tmp16369, i64 1
+  %tmp16371 = getelementptr inbounds float* %tmp16370, i64 1
+  %tmp16372 = getelementptr inbounds float* %tmp16371, i64 1
+  %tmp16373 = getelementptr inbounds float* %tmp16372, i64 1
+  %tmp16374 = getelementptr inbounds float* %tmp16373, i64 1
+  %tmp16375 = getelementptr inbounds float* %tmp16374, i64 1
+  %tmp16376 = getelementptr inbounds float* %tmp16375, i64 1
+  %tmp16377 = getelementptr inbounds float* %tmp16376, i64 1
+  %tmp16378 = getelementptr inbounds float* %tmp16377, i64 1
+  %tmp16379 = getelementptr inbounds float* %tmp16378, i64 1
+  %tmp16380 = getelementptr inbounds float* %tmp16379, i64 1
+  %tmp16381 = getelementptr inbounds float* %tmp16380, i64 1
+  %tmp16382 = getelementptr inbounds float* %tmp16381, i64 1
+  %tmp16383 = getelementptr inbounds float* %tmp16382, i64 1
+  %tmp16384 = getelementptr inbounds float* %tmp16383, i64 1
+  %tmp16385 = getelementptr inbounds float* %tmp16384, i64 1
+  %tmp16386 = getelementptr inbounds float* %tmp16385, i64 1
+  %tmp16387 = getelementptr inbounds float* %tmp16386, i64 1
+  %tmp16388 = getelementptr inbounds float* %tmp16387, i64 1
+  %tmp16389 = getelementptr inbounds float* %tmp16388, i64 1
+  %tmp16390 = getelementptr inbounds float* %tmp16389, i64 1
+  %tmp16391 = getelementptr inbounds float* %tmp16390, i64 1
+  %tmp16392 = getelementptr inbounds float* %tmp16391, i64 1
+  %tmp16393 = getelementptr inbounds float* %tmp16392, i64 1
+  %tmp16394 = getelementptr inbounds float* %tmp16393, i64 1
+  %tmp16395 = getelementptr inbounds float* %tmp16394, i64 1
+  %tmp16396 = getelementptr inbounds float* %tmp16395, i64 1
+  %tmp16397 = getelementptr inbounds float* %tmp16396, i64 1
+  %tmp16398 = getelementptr inbounds float* %tmp16397, i64 1
+  %tmp16399 = getelementptr inbounds float* %tmp16398, i64 1
+  %tmp16400 = getelementptr inbounds float* %tmp16399, i64 1
+  %tmp16401 = getelementptr inbounds float* %tmp16400, i64 1
+  %tmp16402 = getelementptr inbounds float* %tmp16401, i64 1
+  %tmp16403 = getelementptr inbounds float* %tmp16402, i64 1
+  %tmp16404 = getelementptr inbounds float* %tmp16403, i64 1
+  %tmp16405 = getelementptr inbounds float* %tmp16404, i64 1
+  %tmp16406 = getelementptr inbounds float* %tmp16405, i64 1
+  %tmp16407 = getelementptr inbounds float* %tmp16406, i64 1
+  %tmp16408 = getelementptr inbounds float* %tmp16407, i64 1
+  %tmp16409 = getelementptr inbounds float* %tmp16408, i64 1
+  %tmp16410 = getelementptr inbounds float* %tmp16409, i64 1
+  %tmp16411 = getelementptr inbounds float* %tmp16410, i64 1
+  %tmp16412 = getelementptr inbounds float* %tmp16411, i64 1
+  %tmp16413 = getelementptr inbounds float* %tmp16412, i64 1
+  %tmp16414 = getelementptr inbounds float* %tmp16413, i64 1
+  %tmp16415 = getelementptr inbounds float* %tmp16414, i64 1
+  %tmp16416 = getelementptr inbounds float* %tmp16415, i64 1
+  %tmp16417 = getelementptr inbounds float* %tmp16416, i64 1
+  %tmp16418 = getelementptr inbounds float* %tmp16417, i64 1
+  %tmp16419 = getelementptr inbounds float* %tmp16418, i64 1
+  %tmp16420 = getelementptr inbounds float* %tmp16419, i64 1
+  %tmp16421 = getelementptr inbounds float* %tmp16420, i64 1
+  %tmp16422 = getelementptr inbounds float* %tmp16421, i64 1
+  %tmp16423 = getelementptr inbounds float* %tmp16422, i64 1
+  %tmp16424 = getelementptr inbounds float* %tmp16423, i64 1
+  %tmp16425 = getelementptr inbounds float* %tmp16424, i64 1
+  %tmp16426 = getelementptr inbounds float* %tmp16425, i64 1
+  %tmp16427 = getelementptr inbounds float* %tmp16426, i64 1
+  %tmp16428 = getelementptr inbounds float* %tmp16427, i64 1
+  %tmp16429 = getelementptr inbounds float* %tmp16428, i64 1
+  %tmp16430 = getelementptr inbounds float* %tmp16429, i64 1
+  %tmp16431 = getelementptr inbounds float* %tmp16430, i64 1
+  %tmp16432 = getelementptr inbounds float* %tmp16431, i64 1
+  %tmp16433 = getelementptr inbounds float* %tmp16432, i64 1
+  %tmp16434 = getelementptr inbounds float* %tmp16433, i64 1
+  %tmp16435 = getelementptr inbounds float* %tmp16434, i64 1
+  %tmp16436 = getelementptr inbounds float* %tmp16435, i64 1
+  %tmp16437 = getelementptr inbounds float* %tmp16436, i64 1
+  %tmp16438 = getelementptr inbounds float* %tmp16437, i64 1
+  %tmp16439 = getelementptr inbounds float* %tmp16438, i64 1
+  %tmp16440 = getelementptr inbounds float* %tmp16439, i64 1
+  %tmp16441 = getelementptr inbounds float* %tmp16440, i64 1
+  %tmp16442 = getelementptr inbounds float* %tmp16441, i64 1
+  %tmp16443 = getelementptr inbounds float* %tmp16442, i64 1
+  %tmp16444 = getelementptr inbounds float* %tmp16443, i64 1
+  %tmp16445 = getelementptr inbounds float* %tmp16444, i64 1
+  %tmp16446 = getelementptr inbounds float* %tmp16445, i64 1
+  %tmp16447 = getelementptr inbounds float* %tmp16446, i64 1
+  %tmp16448 = getelementptr inbounds float* %tmp16447, i64 1
+  %tmp16449 = getelementptr inbounds float* %tmp16448, i64 1
+  %tmp16450 = getelementptr inbounds float* %tmp16449, i64 1
+  %tmp16451 = getelementptr inbounds float* %tmp16450, i64 1
+  %tmp16452 = getelementptr inbounds float* %tmp16451, i64 1
+  %tmp16453 = getelementptr inbounds float* %tmp16452, i64 1
+  %tmp16454 = getelementptr inbounds float* %tmp16453, i64 1
+  %tmp16455 = getelementptr inbounds float* %tmp16454, i64 1
+  %tmp16456 = getelementptr inbounds float* %tmp16455, i64 1
+  %tmp16457 = getelementptr inbounds float* %tmp16456, i64 1
+  %tmp16458 = getelementptr inbounds float* %tmp16457, i64 1
+  %tmp16459 = getelementptr inbounds float* %tmp16458, i64 1
+  %tmp16460 = getelementptr inbounds float* %tmp16459, i64 1
+  %tmp16461 = getelementptr inbounds float* %tmp16460, i64 1
+  %tmp16462 = getelementptr inbounds float* %tmp16461, i64 1
+  %tmp16463 = getelementptr inbounds float* %tmp16462, i64 1
+  %tmp16464 = getelementptr inbounds float* %tmp16463, i64 1
+  %tmp16465 = getelementptr inbounds float* %tmp16464, i64 1
+  %tmp16466 = getelementptr inbounds float* %tmp16465, i64 1
+  %tmp16467 = getelementptr inbounds float* %tmp16466, i64 1
+  %tmp16468 = getelementptr inbounds float* %tmp16467, i64 1
+  %tmp16469 = getelementptr inbounds float* %tmp16468, i64 1
+  %tmp16470 = getelementptr inbounds float* %tmp16469, i64 1
+  %tmp16471 = getelementptr inbounds float* %tmp16470, i64 1
+  %tmp16472 = getelementptr inbounds float* %tmp16471, i64 1
+  %tmp16473 = getelementptr inbounds float* %tmp16472, i64 1
+  %tmp16474 = getelementptr inbounds float* %tmp16473, i64 1
+  %tmp16475 = getelementptr inbounds float* %tmp16474, i64 1
+  %tmp16476 = getelementptr inbounds float* %tmp16475, i64 1
+  %tmp16477 = getelementptr inbounds float* %tmp16476, i64 1
+  %tmp16478 = getelementptr inbounds float* %tmp16477, i64 1
+  %tmp16479 = getelementptr inbounds float* %tmp16478, i64 1
+  %tmp16480 = getelementptr inbounds float* %tmp16479, i64 1
+  %tmp16481 = getelementptr inbounds float* %tmp16480, i64 1
+  %tmp16482 = getelementptr inbounds float* %tmp16481, i64 1
+  %tmp16483 = getelementptr inbounds float* %tmp16482, i64 1
+  %tmp16484 = getelementptr inbounds float* %tmp16483, i64 1
+  %tmp16485 = getelementptr inbounds float* %tmp16484, i64 1
+  %tmp16486 = getelementptr inbounds float* %tmp16485, i64 1
+  %tmp16487 = getelementptr inbounds float* %tmp16486, i64 1
+  %tmp16488 = getelementptr inbounds float* %tmp16487, i64 1
+  %tmp16489 = getelementptr inbounds float* %tmp16488, i64 1
+  %tmp16490 = getelementptr inbounds float* %tmp16489, i64 1
+  %tmp16491 = getelementptr inbounds float* %tmp16490, i64 1
+  %tmp16492 = getelementptr inbounds float* %tmp16491, i64 1
+  %tmp16493 = getelementptr inbounds float* %tmp16492, i64 1
+  %tmp16494 = getelementptr inbounds float* %tmp16493, i64 1
+  %tmp16495 = getelementptr inbounds float* %tmp16494, i64 1
+  %tmp16496 = getelementptr inbounds float* %tmp16495, i64 1
+  %tmp16497 = getelementptr inbounds float* %tmp16496, i64 1
+  %tmp16498 = getelementptr inbounds float* %tmp16497, i64 1
+  %tmp16499 = getelementptr inbounds float* %tmp16498, i64 1
+  %tmp16500 = getelementptr inbounds float* %tmp16499, i64 1
+  %tmp16501 = getelementptr inbounds float* %tmp16500, i64 1
+  %tmp16502 = getelementptr inbounds float* %tmp16501, i64 1
+  %tmp16503 = getelementptr inbounds float* %tmp16502, i64 1
+  %tmp16504 = getelementptr inbounds float* %tmp16503, i64 1
+  %tmp16505 = getelementptr inbounds float* %tmp16504, i64 1
+  %tmp16506 = getelementptr inbounds float* %tmp16505, i64 1
+  %tmp16507 = getelementptr inbounds float* %tmp16506, i64 1
+  %tmp16508 = getelementptr inbounds float* %tmp16507, i64 1
+  %tmp16509 = getelementptr inbounds float* %tmp16508, i64 1
+  %tmp16510 = getelementptr inbounds float* %tmp16509, i64 1
+  %tmp16511 = getelementptr inbounds float* %tmp16510, i64 1
+  %tmp16512 = getelementptr inbounds float* %tmp16511, i64 1
+  %tmp16513 = getelementptr inbounds float* %tmp16512, i64 1
+  %tmp16514 = getelementptr inbounds float* %tmp16513, i64 1
+  %tmp16515 = getelementptr inbounds float* %tmp16514, i64 1
+  %tmp16516 = getelementptr inbounds float* %tmp16515, i64 1
+  %tmp16517 = getelementptr inbounds float* %tmp16516, i64 1
+  %tmp16518 = getelementptr inbounds float* %tmp16517, i64 1
+  %tmp16519 = getelementptr inbounds float* %tmp16518, i64 1
+  %tmp16520 = getelementptr inbounds float* %tmp16519, i64 1
+  %tmp16521 = getelementptr inbounds float* %tmp16520, i64 1
+  %tmp16522 = getelementptr inbounds float* %tmp16521, i64 1
+  %tmp16523 = getelementptr inbounds float* %tmp16522, i64 1
+  %tmp16524 = getelementptr inbounds float* %tmp16523, i64 1
+  %tmp16525 = getelementptr inbounds float* %tmp16524, i64 1
+  %tmp16526 = getelementptr inbounds float* %tmp16525, i64 1
+  %tmp16527 = getelementptr inbounds float* %tmp16526, i64 1
+  %tmp16528 = getelementptr inbounds float* %tmp16527, i64 1
+  %tmp16529 = getelementptr inbounds float* %tmp16528, i64 1
+  %tmp16530 = getelementptr inbounds float* %tmp16529, i64 1
+  %tmp16531 = getelementptr inbounds float* %tmp16530, i64 1
+  %tmp16532 = getelementptr inbounds float* %tmp16531, i64 1
+  %tmp16533 = getelementptr inbounds float* %tmp16532, i64 1
+  %tmp16534 = getelementptr inbounds float* %tmp16533, i64 1
+  %tmp16535 = getelementptr inbounds float* %tmp16534, i64 1
+  %tmp16536 = getelementptr inbounds float* %tmp16535, i64 1
+  %tmp16537 = getelementptr inbounds float* %tmp16536, i64 1
+  %tmp16538 = getelementptr inbounds float* %tmp16537, i64 1
+  %tmp16539 = getelementptr inbounds float* %tmp16538, i64 1
+  %tmp16540 = getelementptr inbounds float* %tmp16539, i64 1
+  %tmp16541 = getelementptr inbounds float* %tmp16540, i64 1
+  %tmp16542 = getelementptr inbounds float* %tmp16541, i64 1
+  %tmp16543 = getelementptr inbounds float* %tmp16542, i64 1
+  %tmp16544 = getelementptr inbounds float* %tmp16543, i64 1
+  %tmp16545 = getelementptr inbounds float* %tmp16544, i64 1
+  %tmp16546 = getelementptr inbounds float* %tmp16545, i64 1
+  %tmp16547 = getelementptr inbounds float* %tmp16546, i64 1
+  %tmp16548 = getelementptr inbounds float* %tmp16547, i64 1
+  %tmp16549 = getelementptr inbounds float* %tmp16548, i64 1
+  %tmp16550 = getelementptr inbounds float* %tmp16549, i64 1
+  %tmp16551 = getelementptr inbounds float* %tmp16550, i64 1
+  %tmp16552 = getelementptr inbounds float* %tmp16551, i64 1
+  %tmp16553 = getelementptr inbounds float* %tmp16552, i64 1
+  %tmp16554 = getelementptr inbounds float* %tmp16553, i64 1
+  %tmp16555 = getelementptr inbounds float* %tmp16554, i64 1
+  %tmp16556 = getelementptr inbounds float* %tmp16555, i64 1
+  %tmp16557 = getelementptr inbounds float* %tmp16556, i64 1
+  %tmp16558 = getelementptr inbounds float* %tmp16557, i64 1
+  %tmp16559 = getelementptr inbounds float* %tmp16558, i64 1
+  %tmp16560 = getelementptr inbounds float* %tmp16559, i64 1
+  %tmp16561 = getelementptr inbounds float* %tmp16560, i64 1
+  %tmp16562 = getelementptr inbounds float* %tmp16561, i64 1
+  %tmp16563 = getelementptr inbounds float* %tmp16562, i64 1
+  %tmp16564 = getelementptr inbounds float* %tmp16563, i64 1
+  %tmp16565 = getelementptr inbounds float* %tmp16564, i64 1
+  %tmp16566 = getelementptr inbounds float* %tmp16565, i64 1
+  %tmp16567 = getelementptr inbounds float* %tmp16566, i64 1
+  %tmp16568 = getelementptr inbounds float* %tmp16567, i64 1
+  %tmp16569 = getelementptr inbounds float* %tmp16568, i64 1
+  %tmp16570 = getelementptr inbounds float* %tmp16569, i64 1
+  %tmp16571 = getelementptr inbounds float* %tmp16570, i64 1
+  %tmp16572 = getelementptr inbounds float* %tmp16571, i64 1
+  %tmp16573 = getelementptr inbounds float* %tmp16572, i64 1
+  %tmp16574 = getelementptr inbounds float* %tmp16573, i64 1
+  %tmp16575 = getelementptr inbounds float* %tmp16574, i64 1
+  %tmp16576 = getelementptr inbounds float* %tmp16575, i64 1
+  %tmp16577 = getelementptr inbounds float* %tmp16576, i64 1
+  %tmp16578 = getelementptr inbounds float* %tmp16577, i64 1
+  %tmp16579 = getelementptr inbounds float* %tmp16578, i64 1
+  %tmp16580 = getelementptr inbounds float* %tmp16579, i64 1
+  %tmp16581 = getelementptr inbounds float* %tmp16580, i64 1
+  %tmp16582 = getelementptr inbounds float* %tmp16581, i64 1
+  %tmp16583 = getelementptr inbounds float* %tmp16582, i64 1
+  %tmp16584 = getelementptr inbounds float* %tmp16583, i64 1
+  %tmp16585 = getelementptr inbounds float* %tmp16584, i64 1
+  %tmp16586 = getelementptr inbounds float* %tmp16585, i64 1
+  %tmp16587 = getelementptr inbounds float* %tmp16586, i64 1
+  %tmp16588 = getelementptr inbounds float* %tmp16587, i64 1
+  %tmp16589 = getelementptr inbounds float* %tmp16588, i64 1
+  %tmp16590 = getelementptr inbounds float* %tmp16589, i64 1
+  %tmp16591 = getelementptr inbounds float* %tmp16590, i64 1
+  %tmp16592 = getelementptr inbounds float* %tmp16591, i64 1
+  %tmp16593 = getelementptr inbounds float* %tmp16592, i64 1
+  %tmp16594 = getelementptr inbounds float* %tmp16593, i64 1
+  %tmp16595 = getelementptr inbounds float* %tmp16594, i64 1
+  %tmp16596 = getelementptr inbounds float* %tmp16595, i64 1
+  %tmp16597 = getelementptr inbounds float* %tmp16596, i64 1
+  %tmp16598 = getelementptr inbounds float* %tmp16597, i64 1
+  %tmp16599 = getelementptr inbounds float* %tmp16598, i64 1
+  %tmp16600 = getelementptr inbounds float* %tmp16599, i64 1
+  %tmp16601 = getelementptr inbounds float* %tmp16600, i64 1
+  %tmp16602 = getelementptr inbounds float* %tmp16601, i64 1
+  %tmp16603 = getelementptr inbounds float* %tmp16602, i64 1
+  %tmp16604 = getelementptr inbounds float* %tmp16603, i64 1
+  %tmp16605 = getelementptr inbounds float* %tmp16604, i64 1
+  %tmp16606 = getelementptr inbounds float* %tmp16605, i64 1
+  %tmp16607 = getelementptr inbounds float* %tmp16606, i64 1
+  %tmp16608 = getelementptr inbounds float* %tmp16607, i64 1
+  %tmp16609 = getelementptr inbounds float* %tmp16608, i64 1
+  %tmp16610 = getelementptr inbounds float* %tmp16609, i64 1
+  %tmp16611 = getelementptr inbounds float* %tmp16610, i64 1
+  %tmp16612 = getelementptr inbounds float* %tmp16611, i64 1
+  %tmp16613 = getelementptr inbounds float* %tmp16612, i64 1
+  %tmp16614 = getelementptr inbounds float* %tmp16613, i64 1
+  %tmp16615 = getelementptr inbounds float* %tmp16614, i64 1
+  %tmp16616 = getelementptr inbounds float* %tmp16615, i64 1
+  %tmp16617 = getelementptr inbounds float* %tmp16616, i64 1
+  %tmp16618 = getelementptr inbounds float* %tmp16617, i64 1
+  %tmp16619 = getelementptr inbounds float* %tmp16618, i64 1
+  %tmp16620 = getelementptr inbounds float* %tmp16619, i64 1
+  %tmp16621 = getelementptr inbounds float* %tmp16620, i64 1
+  %tmp16622 = getelementptr inbounds float* %tmp16621, i64 1
+  %tmp16623 = getelementptr inbounds float* %tmp16622, i64 1
+  %tmp16624 = getelementptr inbounds float* %tmp16623, i64 1
+  %tmp16625 = getelementptr inbounds float* %tmp16624, i64 1
+  %tmp16626 = getelementptr inbounds float* %tmp16625, i64 1
+  %tmp16627 = getelementptr inbounds float* %tmp16626, i64 1
+  %tmp16628 = getelementptr inbounds float* %tmp16627, i64 1
+  %tmp16629 = getelementptr inbounds float* %tmp16628, i64 1
+  %tmp16630 = getelementptr inbounds float* %tmp16629, i64 1
+  %tmp16631 = getelementptr inbounds float* %tmp16630, i64 1
+  %tmp16632 = getelementptr inbounds float* %tmp16631, i64 1
+  %tmp16633 = getelementptr inbounds float* %tmp16632, i64 1
+  %tmp16634 = getelementptr inbounds float* %tmp16633, i64 1
+  %tmp16635 = getelementptr inbounds float* %tmp16634, i64 1
+  %tmp16636 = getelementptr inbounds float* %tmp16635, i64 1
+  %tmp16637 = getelementptr inbounds float* %tmp16636, i64 1
+  %tmp16638 = getelementptr inbounds float* %tmp16637, i64 1
+  %tmp16639 = getelementptr inbounds float* %tmp16638, i64 1
+  %tmp16640 = getelementptr inbounds float* %tmp16639, i64 1
+  %tmp16641 = getelementptr inbounds float* %tmp16640, i64 1
+  %tmp16642 = getelementptr inbounds float* %tmp16641, i64 1
+  %tmp16643 = getelementptr inbounds float* %tmp16642, i64 1
+  %tmp16644 = getelementptr inbounds float* %tmp16643, i64 1
+  %tmp16645 = getelementptr inbounds float* %tmp16644, i64 1
+  %tmp16646 = getelementptr inbounds float* %tmp16645, i64 1
+  %tmp16647 = getelementptr inbounds float* %tmp16646, i64 1
+  %tmp16648 = getelementptr inbounds float* %tmp16647, i64 1
+  %tmp16649 = getelementptr inbounds float* %tmp16648, i64 1
+  %tmp16650 = getelementptr inbounds float* %tmp16649, i64 1
+  %tmp16651 = getelementptr inbounds float* %tmp16650, i64 1
+  %tmp16652 = getelementptr inbounds float* %tmp16651, i64 1
+  %tmp16653 = getelementptr inbounds float* %tmp16652, i64 1
+  %tmp16654 = getelementptr inbounds float* %tmp16653, i64 1
+  %tmp16655 = getelementptr inbounds float* %tmp16654, i64 1
+  %tmp16656 = getelementptr inbounds float* %tmp16655, i64 1
+  %tmp16657 = getelementptr inbounds float* %tmp16656, i64 1
+  %tmp16658 = getelementptr inbounds float* %tmp16657, i64 1
+  %tmp16659 = getelementptr inbounds float* %tmp16658, i64 1
+  %tmp16660 = getelementptr inbounds float* %tmp16659, i64 1
+  %tmp16661 = getelementptr inbounds float* %tmp16660, i64 1
+  %tmp16662 = getelementptr inbounds float* %tmp16661, i64 1
+  %tmp16663 = getelementptr inbounds float* %tmp16662, i64 1
+  %tmp16664 = getelementptr inbounds float* %tmp16663, i64 1
+  %tmp16665 = getelementptr inbounds float* %tmp16664, i64 1
+  %tmp16666 = getelementptr inbounds float* %tmp16665, i64 1
+  %tmp16667 = getelementptr inbounds float* %tmp16666, i64 1
+  %tmp16668 = getelementptr inbounds float* %tmp16667, i64 1
+  %tmp16669 = getelementptr inbounds float* %tmp16668, i64 1
+  %tmp16670 = getelementptr inbounds float* %tmp16669, i64 1
+  %tmp16671 = getelementptr inbounds float* %tmp16670, i64 1
+  %tmp16672 = getelementptr inbounds float* %tmp16671, i64 1
+  %tmp16673 = getelementptr inbounds float* %tmp16672, i64 1
+  %tmp16674 = getelementptr inbounds float* %tmp16673, i64 1
+  %tmp16675 = getelementptr inbounds float* %tmp16674, i64 1
+  %tmp16676 = getelementptr inbounds float* %tmp16675, i64 1
+  %tmp16677 = getelementptr inbounds float* %tmp16676, i64 1
+  %tmp16678 = getelementptr inbounds float* %tmp16677, i64 1
+  %tmp16679 = getelementptr inbounds float* %tmp16678, i64 1
+  %tmp16680 = getelementptr inbounds float* %tmp16679, i64 1
+  %tmp16681 = getelementptr inbounds float* %tmp16680, i64 1
+  %tmp16682 = getelementptr inbounds float* %tmp16681, i64 1
+  %tmp16683 = getelementptr inbounds float* %tmp16682, i64 1
+  %tmp16684 = getelementptr inbounds float* %tmp16683, i64 1
+  %tmp16685 = getelementptr inbounds float* %tmp16684, i64 1
+  %tmp16686 = getelementptr inbounds float* %tmp16685, i64 1
+  %tmp16687 = getelementptr inbounds float* %tmp16686, i64 1
+  %tmp16688 = getelementptr inbounds float* %tmp16687, i64 1
+  %tmp16689 = getelementptr inbounds float* %tmp16688, i64 1
+  %tmp16690 = getelementptr inbounds float* %tmp16689, i64 1
+  %tmp16691 = getelementptr inbounds float* %tmp16690, i64 1
+  %tmp16692 = getelementptr inbounds float* %tmp16691, i64 1
+  %tmp16693 = getelementptr inbounds float* %tmp16692, i64 1
+  %tmp16694 = getelementptr inbounds float* %tmp16693, i64 1
+  %tmp16695 = getelementptr inbounds float* %tmp16694, i64 1
+  %tmp16696 = getelementptr inbounds float* %tmp16695, i64 1
+  %tmp16697 = getelementptr inbounds float* %tmp16696, i64 1
+  %tmp16698 = getelementptr inbounds float* %tmp16697, i64 1
+  %tmp16699 = getelementptr inbounds float* %tmp16698, i64 1
+  %tmp16700 = getelementptr inbounds float* %tmp16699, i64 1
+  %tmp16701 = getelementptr inbounds float* %tmp16700, i64 1
+  %tmp16702 = getelementptr inbounds float* %tmp16701, i64 1
+  %tmp16703 = getelementptr inbounds float* %tmp16702, i64 1
+  %tmp16704 = getelementptr inbounds float* %tmp16703, i64 1
+  %tmp16705 = getelementptr inbounds float* %tmp16704, i64 1
+  %tmp16706 = getelementptr inbounds float* %tmp16705, i64 1
+  %tmp16707 = getelementptr inbounds float* %tmp16706, i64 1
+  %tmp16708 = getelementptr inbounds float* %tmp16707, i64 1
+  %tmp16709 = getelementptr inbounds float* %tmp16708, i64 1
+  %tmp16710 = getelementptr inbounds float* %tmp16709, i64 1
+  %tmp16711 = getelementptr inbounds float* %tmp16710, i64 1
+  %tmp16712 = getelementptr inbounds float* %tmp16711, i64 1
+  %tmp16713 = getelementptr inbounds float* %tmp16712, i64 1
+  %tmp16714 = getelementptr inbounds float* %tmp16713, i64 1
+  %tmp16715 = getelementptr inbounds float* %tmp16714, i64 1
+  %tmp16716 = getelementptr inbounds float* %tmp16715, i64 1
+  %tmp16717 = getelementptr inbounds float* %tmp16716, i64 1
+  %tmp16718 = getelementptr inbounds float* %tmp16717, i64 1
+  %tmp16719 = getelementptr inbounds float* %tmp16718, i64 1
+  %tmp16720 = getelementptr inbounds float* %tmp16719, i64 1
+  %tmp16721 = getelementptr inbounds float* %tmp16720, i64 1
+  %tmp16722 = getelementptr inbounds float* %tmp16721, i64 1
+  %tmp16723 = getelementptr inbounds float* %tmp16722, i64 1
+  %tmp16724 = getelementptr inbounds float* %tmp16723, i64 1
+  %tmp16725 = getelementptr inbounds float* %tmp16724, i64 1
+  %tmp16726 = getelementptr inbounds float* %tmp16725, i64 1
+  %tmp16727 = getelementptr inbounds float* %tmp16726, i64 1
+  %tmp16728 = getelementptr inbounds float* %tmp16727, i64 1
+  %tmp16729 = getelementptr inbounds float* %tmp16728, i64 1
+  %tmp16730 = getelementptr inbounds float* %tmp16729, i64 1
+  %tmp16731 = getelementptr inbounds float* %tmp16730, i64 1
+  %tmp16732 = getelementptr inbounds float* %tmp16731, i64 1
+  %tmp16733 = getelementptr inbounds float* %tmp16732, i64 1
+  %tmp16734 = getelementptr inbounds float* %tmp16733, i64 1
+  %tmp16735 = getelementptr inbounds float* %tmp16734, i64 1
+  %tmp16736 = getelementptr inbounds float* %tmp16735, i64 1
+  %tmp16737 = getelementptr inbounds float* %tmp16736, i64 1
+  %tmp16738 = getelementptr inbounds float* %tmp16737, i64 1
+  %tmp16739 = getelementptr inbounds float* %tmp16738, i64 1
+  %tmp16740 = getelementptr inbounds float* %tmp16739, i64 1
+  %tmp16741 = getelementptr inbounds float* %tmp16740, i64 1
+  %tmp16742 = getelementptr inbounds float* %tmp16741, i64 1
+  %tmp16743 = getelementptr inbounds float* %tmp16742, i64 1
+  %tmp16744 = getelementptr inbounds float* %tmp16743, i64 1
+  %tmp16745 = getelementptr inbounds float* %tmp16744, i64 1
+  %tmp16746 = getelementptr inbounds float* %tmp16745, i64 1
+  %tmp16747 = getelementptr inbounds float* %tmp16746, i64 1
+  %tmp16748 = getelementptr inbounds float* %tmp16747, i64 1
+  %tmp16749 = getelementptr inbounds float* %tmp16748, i64 1
+  %tmp16750 = getelementptr inbounds float* %tmp16749, i64 1
+  %tmp16751 = getelementptr inbounds float* %tmp16750, i64 1
+  %tmp16752 = getelementptr inbounds float* %tmp16751, i64 1
+  %tmp16753 = getelementptr inbounds float* %tmp16752, i64 1
+  %tmp16754 = getelementptr inbounds float* %tmp16753, i64 1
+  %tmp16755 = getelementptr inbounds float* %tmp16754, i64 1
+  %tmp16756 = getelementptr inbounds float* %tmp16755, i64 1
+  %tmp16757 = getelementptr inbounds float* %tmp16756, i64 1
+  %tmp16758 = getelementptr inbounds float* %tmp16757, i64 1
+  %tmp16759 = getelementptr inbounds float* %tmp16758, i64 1
+  %tmp16760 = getelementptr inbounds float* %tmp16759, i64 1
+  %tmp16761 = getelementptr inbounds float* %tmp16760, i64 1
+  %tmp16762 = getelementptr inbounds float* %tmp16761, i64 1
+  %tmp16763 = getelementptr inbounds float* %tmp16762, i64 1
+  %tmp16764 = getelementptr inbounds float* %tmp16763, i64 1
+  %tmp16765 = getelementptr inbounds float* %tmp16764, i64 1
+  %tmp16766 = getelementptr inbounds float* %tmp16765, i64 1
+  %tmp16767 = getelementptr inbounds float* %tmp16766, i64 1
+  %tmp16768 = getelementptr inbounds float* %tmp16767, i64 1
+  %tmp16769 = getelementptr inbounds float* %tmp16768, i64 1
+  %tmp16770 = getelementptr inbounds float* %tmp16769, i64 1
+  %tmp16771 = getelementptr inbounds float* %tmp16770, i64 1
+  %tmp16772 = getelementptr inbounds float* %tmp16771, i64 1
+  %tmp16773 = getelementptr inbounds float* %tmp16772, i64 1
+  %tmp16774 = getelementptr inbounds float* %tmp16773, i64 1
+  %tmp16775 = getelementptr inbounds float* %tmp16774, i64 1
+  %tmp16776 = getelementptr inbounds float* %tmp16775, i64 1
+  %tmp16777 = getelementptr inbounds float* %tmp16776, i64 1
+  %tmp16778 = getelementptr inbounds float* %tmp16777, i64 1
+  %tmp16779 = getelementptr inbounds float* %tmp16778, i64 1
+  %tmp16780 = getelementptr inbounds float* %tmp16779, i64 1
+  %tmp16781 = getelementptr inbounds float* %tmp16780, i64 1
+  %tmp16782 = getelementptr inbounds float* %tmp16781, i64 1
+  %tmp16783 = getelementptr inbounds float* %tmp16782, i64 1
+  %tmp16784 = getelementptr inbounds float* %tmp16783, i64 1
+  %tmp16785 = getelementptr inbounds float* %tmp16784, i64 1
+  %tmp16786 = getelementptr inbounds float* %tmp16785, i64 1
+  %tmp16787 = getelementptr inbounds float* %tmp16786, i64 1
+  %tmp16788 = getelementptr inbounds float* %tmp16787, i64 1
+  %tmp16789 = getelementptr inbounds float* %tmp16788, i64 1
+  %tmp16790 = getelementptr inbounds float* %tmp16789, i64 1
+  %tmp16791 = getelementptr inbounds float* %tmp16790, i64 1
+  %tmp16792 = getelementptr inbounds float* %tmp16791, i64 1
+  %tmp16793 = getelementptr inbounds float* %tmp16792, i64 1
+  %tmp16794 = getelementptr inbounds float* %tmp16793, i64 1
+  %tmp16795 = getelementptr inbounds float* %tmp16794, i64 1
+  %tmp16796 = getelementptr inbounds float* %tmp16795, i64 1
+  %tmp16797 = getelementptr inbounds float* %tmp16796, i64 1
+  %tmp16798 = getelementptr inbounds float* %tmp16797, i64 1
+  %tmp16799 = getelementptr inbounds float* %tmp16798, i64 1
+  %tmp16800 = getelementptr inbounds float* %tmp16799, i64 1
+  %tmp16801 = getelementptr inbounds float* %tmp16800, i64 1
+  %tmp16802 = getelementptr inbounds float* %tmp16801, i64 1
+  %tmp16803 = getelementptr inbounds float* %tmp16802, i64 1
+  %tmp16804 = getelementptr inbounds float* %tmp16803, i64 1
+  %tmp16805 = getelementptr inbounds float* %tmp16804, i64 1
+  %tmp16806 = getelementptr inbounds float* %tmp16805, i64 1
+  %tmp16807 = getelementptr inbounds float* %tmp16806, i64 1
+  %tmp16808 = getelementptr inbounds float* %tmp16807, i64 1
+  %tmp16809 = getelementptr inbounds float* %tmp16808, i64 1
+  %tmp16810 = getelementptr inbounds float* %tmp16809, i64 1
+  %tmp16811 = getelementptr inbounds float* %tmp16810, i64 1
+  %tmp16812 = getelementptr inbounds float* %tmp16811, i64 1
+  %tmp16813 = getelementptr inbounds float* %tmp16812, i64 1
+  %tmp16814 = getelementptr inbounds float* %tmp16813, i64 1
+  %tmp16815 = getelementptr inbounds float* %tmp16814, i64 1
+  %tmp16816 = getelementptr inbounds float* %tmp16815, i64 1
+  %tmp16817 = getelementptr inbounds float* %tmp16816, i64 1
+  %tmp16818 = getelementptr inbounds float* %tmp16817, i64 1
+  %tmp16819 = getelementptr inbounds float* %tmp16818, i64 1
+  %tmp16820 = getelementptr inbounds float* %tmp16819, i64 1
+  %tmp16821 = getelementptr inbounds float* %tmp16820, i64 1
+  %tmp16822 = getelementptr inbounds float* %tmp16821, i64 1
+  %tmp16823 = getelementptr inbounds float* %tmp16822, i64 1
+  %tmp16824 = getelementptr inbounds float* %tmp16823, i64 1
+  %tmp16825 = getelementptr inbounds float* %tmp16824, i64 1
+  %tmp16826 = getelementptr inbounds float* %tmp16825, i64 1
+  %tmp16827 = getelementptr inbounds float* %tmp16826, i64 1
+  %tmp16828 = getelementptr inbounds float* %tmp16827, i64 1
+  %tmp16829 = getelementptr inbounds float* %tmp16828, i64 1
+  %tmp16830 = getelementptr inbounds float* %tmp16829, i64 1
+  %tmp16831 = getelementptr inbounds float* %tmp16830, i64 1
+  %tmp16832 = getelementptr inbounds float* %tmp16831, i64 1
+  %tmp16833 = getelementptr inbounds float* %tmp16832, i64 1
+  %tmp16834 = getelementptr inbounds float* %tmp16833, i64 1
+  %tmp16835 = getelementptr inbounds float* %tmp16834, i64 1
+  %tmp16836 = getelementptr inbounds float* %tmp16835, i64 1
+  %tmp16837 = getelementptr inbounds float* %tmp16836, i64 1
+  %tmp16838 = getelementptr inbounds float* %tmp16837, i64 1
+  %tmp16839 = getelementptr inbounds float* %tmp16838, i64 1
+  %tmp16840 = getelementptr inbounds float* %tmp16839, i64 1
+  %tmp16841 = getelementptr inbounds float* %tmp16840, i64 1
+  %tmp16842 = getelementptr inbounds float* %tmp16841, i64 1
+  %tmp16843 = getelementptr inbounds float* %tmp16842, i64 1
+  %tmp16844 = getelementptr inbounds float* %tmp16843, i64 1
+  %tmp16845 = getelementptr inbounds float* %tmp16844, i64 1
+  %tmp16846 = getelementptr inbounds float* %tmp16845, i64 1
+  %tmp16847 = getelementptr inbounds float* %tmp16846, i64 1
+  %tmp16848 = getelementptr inbounds float* %tmp16847, i64 1
+  %tmp16849 = getelementptr inbounds float* %tmp16848, i64 1
+  %tmp16850 = getelementptr inbounds float* %tmp16849, i64 1
+  %tmp16851 = getelementptr inbounds float* %tmp16850, i64 1
+  %tmp16852 = getelementptr inbounds float* %tmp16851, i64 1
+  %tmp16853 = getelementptr inbounds float* %tmp16852, i64 1
+  %tmp16854 = getelementptr inbounds float* %tmp16853, i64 1
+  %tmp16855 = getelementptr inbounds float* %tmp16854, i64 1
+  %tmp16856 = getelementptr inbounds float* %tmp16855, i64 1
+  %tmp16857 = getelementptr inbounds float* %tmp16856, i64 1
+  %tmp16858 = getelementptr inbounds float* %tmp16857, i64 1
+  %tmp16859 = getelementptr inbounds float* %tmp16858, i64 1
+  %tmp16860 = getelementptr inbounds float* %tmp16859, i64 1
+  %tmp16861 = getelementptr inbounds float* %tmp16860, i64 1
+  %tmp16862 = getelementptr inbounds float* %tmp16861, i64 1
+  %tmp16863 = getelementptr inbounds float* %tmp16862, i64 1
+  %tmp16864 = getelementptr inbounds float* %tmp16863, i64 1
+  %tmp16865 = getelementptr inbounds float* %tmp16864, i64 1
+  %tmp16866 = getelementptr inbounds float* %tmp16865, i64 1
+  %tmp16867 = getelementptr inbounds float* %tmp16866, i64 1
+  %tmp16868 = getelementptr inbounds float* %tmp16867, i64 1
+  %tmp16869 = getelementptr inbounds float* %tmp16868, i64 1
+  %tmp16870 = getelementptr inbounds float* %tmp16869, i64 1
+  %tmp16871 = getelementptr inbounds float* %tmp16870, i64 1
+  %tmp16872 = getelementptr inbounds float* %tmp16871, i64 1
+  %tmp16873 = getelementptr inbounds float* %tmp16872, i64 1
+  %tmp16874 = getelementptr inbounds float* %tmp16873, i64 1
+  %tmp16875 = getelementptr inbounds float* %tmp16874, i64 1
+  %tmp16876 = getelementptr inbounds float* %tmp16875, i64 1
+  %tmp16877 = getelementptr inbounds float* %tmp16876, i64 1
+  %tmp16878 = getelementptr inbounds float* %tmp16877, i64 1
+  %tmp16879 = getelementptr inbounds float* %tmp16878, i64 1
+  %tmp16880 = getelementptr inbounds float* %tmp16879, i64 1
+  %tmp16881 = getelementptr inbounds float* %tmp16880, i64 1
+  %tmp16882 = getelementptr inbounds float* %tmp16881, i64 1
+  %tmp16883 = getelementptr inbounds float* %tmp16882, i64 1
+  %tmp16884 = getelementptr inbounds float* %tmp16883, i64 1
+  %tmp16885 = getelementptr inbounds float* %tmp16884, i64 1
+  %tmp16886 = getelementptr inbounds float* %tmp16885, i64 1
+  %tmp16887 = getelementptr inbounds float* %tmp16886, i64 1
+  %tmp16888 = getelementptr inbounds float* %tmp16887, i64 1
+  %tmp16889 = getelementptr inbounds float* %tmp16888, i64 1
+  %tmp16890 = getelementptr inbounds float* %tmp16889, i64 1
+  %tmp16891 = getelementptr inbounds float* %tmp16890, i64 1
+  %tmp16892 = getelementptr inbounds float* %tmp16891, i64 1
+  %tmp16893 = getelementptr inbounds float* %tmp16892, i64 1
+  %tmp16894 = getelementptr inbounds float* %tmp16893, i64 1
+  %tmp16895 = getelementptr inbounds float* %tmp16894, i64 1
+  %tmp16896 = getelementptr inbounds float* %tmp16895, i64 1
+  %tmp16897 = getelementptr inbounds float* %tmp16896, i64 1
+  %tmp16898 = getelementptr inbounds float* %tmp16897, i64 1
+  %tmp16899 = getelementptr inbounds float* %tmp16898, i64 1
+  %tmp16900 = getelementptr inbounds float* %tmp16899, i64 1
+  %tmp16901 = getelementptr inbounds float* %tmp16900, i64 1
+  %tmp16902 = getelementptr inbounds float* %tmp16901, i64 1
+  %tmp16903 = getelementptr inbounds float* %tmp16902, i64 1
+  %tmp16904 = getelementptr inbounds float* %tmp16903, i64 1
+  %tmp16905 = getelementptr inbounds float* %tmp16904, i64 1
+  %tmp16906 = getelementptr inbounds float* %tmp16905, i64 1
+  %tmp16907 = getelementptr inbounds float* %tmp16906, i64 1
+  %tmp16908 = getelementptr inbounds float* %tmp16907, i64 1
+  %tmp16909 = getelementptr inbounds float* %tmp16908, i64 1
+  %tmp16910 = getelementptr inbounds float* %tmp16909, i64 1
+  %tmp16911 = getelementptr inbounds float* %tmp16910, i64 1
+  %tmp16912 = getelementptr inbounds float* %tmp16911, i64 1
+  %tmp16913 = getelementptr inbounds float* %tmp16912, i64 1
+  %tmp16914 = getelementptr inbounds float* %tmp16913, i64 1
+  %tmp16915 = getelementptr inbounds float* %tmp16914, i64 1
+  %tmp16916 = getelementptr inbounds float* %tmp16915, i64 1
+  %tmp16917 = getelementptr inbounds float* %tmp16916, i64 1
+  %tmp16918 = getelementptr inbounds float* %tmp16917, i64 1
+  %tmp16919 = getelementptr inbounds float* %tmp16918, i64 1
+  %tmp16920 = getelementptr inbounds float* %tmp16919, i64 1
+  %tmp16921 = getelementptr inbounds float* %tmp16920, i64 1
+  %tmp16922 = getelementptr inbounds float* %tmp16921, i64 1
+  %tmp16923 = getelementptr inbounds float* %tmp16922, i64 1
+  %tmp16924 = getelementptr inbounds float* %tmp16923, i64 1
+  %tmp16925 = getelementptr inbounds float* %tmp16924, i64 1
+  %tmp16926 = getelementptr inbounds float* %tmp16925, i64 1
+  %tmp16927 = getelementptr inbounds float* %tmp16926, i64 1
+  %tmp16928 = getelementptr inbounds float* %tmp16927, i64 1
+  %tmp16929 = getelementptr inbounds float* %tmp16928, i64 1
+  %tmp16930 = getelementptr inbounds float* %tmp16929, i64 1
+  %tmp16931 = getelementptr inbounds float* %tmp16930, i64 1
+  %tmp16932 = getelementptr inbounds float* %tmp16931, i64 1
+  %tmp16933 = getelementptr inbounds float* %tmp16932, i64 1
+  %tmp16934 = getelementptr inbounds float* %tmp16933, i64 1
+  %tmp16935 = getelementptr inbounds float* %tmp16934, i64 1
+  %tmp16936 = getelementptr inbounds float* %tmp16935, i64 1
+  %tmp16937 = getelementptr inbounds float* %tmp16936, i64 1
+  %tmp16938 = getelementptr inbounds float* %tmp16937, i64 1
+  %tmp16939 = getelementptr inbounds float* %tmp16938, i64 1
+  %tmp16940 = getelementptr inbounds float* %tmp16939, i64 1
+  %tmp16941 = getelementptr inbounds float* %tmp16940, i64 1
+  %tmp16942 = getelementptr inbounds float* %tmp16941, i64 1
+  %tmp16943 = getelementptr inbounds float* %tmp16942, i64 1
+  %tmp16944 = getelementptr inbounds float* %tmp16943, i64 1
+  %tmp16945 = getelementptr inbounds float* %tmp16944, i64 1
+  %tmp16946 = getelementptr inbounds float* %tmp16945, i64 1
+  %tmp16947 = getelementptr inbounds float* %tmp16946, i64 1
+  %tmp16948 = getelementptr inbounds float* %tmp16947, i64 1
+  %tmp16949 = getelementptr inbounds float* %tmp16948, i64 1
+  %tmp16950 = getelementptr inbounds float* %tmp16949, i64 1
+  %tmp16951 = getelementptr inbounds float* %tmp16950, i64 1
+  %tmp16952 = getelementptr inbounds float* %tmp16951, i64 1
+  %tmp16953 = getelementptr inbounds float* %tmp16952, i64 1
+  %tmp16954 = getelementptr inbounds float* %tmp16953, i64 1
+  %tmp16955 = getelementptr inbounds float* %tmp16954, i64 1
+  %tmp16956 = getelementptr inbounds float* %tmp16955, i64 1
+  %tmp16957 = getelementptr inbounds float* %tmp16956, i64 1
+  %tmp16958 = getelementptr inbounds float* %tmp16957, i64 1
+  %tmp16959 = getelementptr inbounds float* %tmp16958, i64 1
+  %tmp16960 = getelementptr inbounds float* %tmp16959, i64 1
+  %tmp16961 = getelementptr inbounds float* %tmp16960, i64 1
+  %tmp16962 = getelementptr inbounds float* %tmp16961, i64 1
+  %tmp16963 = getelementptr inbounds float* %tmp16962, i64 1
+  %tmp16964 = getelementptr inbounds float* %tmp16963, i64 1
+  %tmp16965 = getelementptr inbounds float* %tmp16964, i64 1
+  %tmp16966 = getelementptr inbounds float* %tmp16965, i64 1
+  %tmp16967 = getelementptr inbounds float* %tmp16966, i64 1
+  %tmp16968 = getelementptr inbounds float* %tmp16967, i64 1
+  %tmp16969 = getelementptr inbounds float* %tmp16968, i64 1
+  %tmp16970 = getelementptr inbounds float* %tmp16969, i64 1
+  %tmp16971 = getelementptr inbounds float* %tmp16970, i64 1
+  %tmp16972 = getelementptr inbounds float* %tmp16971, i64 1
+  %tmp16973 = getelementptr inbounds float* %tmp16972, i64 1
+  %tmp16974 = getelementptr inbounds float* %tmp16973, i64 1
+  %tmp16975 = getelementptr inbounds float* %tmp16974, i64 1
+  %tmp16976 = getelementptr inbounds float* %tmp16975, i64 1
+  %tmp16977 = getelementptr inbounds float* %tmp16976, i64 1
+  %tmp16978 = getelementptr inbounds float* %tmp16977, i64 1
+  %tmp16979 = getelementptr inbounds float* %tmp16978, i64 1
+  %tmp16980 = getelementptr inbounds float* %tmp16979, i64 1
+  %tmp16981 = getelementptr inbounds float* %tmp16980, i64 1
+  %tmp16982 = getelementptr inbounds float* %tmp16981, i64 1
+  %tmp16983 = getelementptr inbounds float* %tmp16982, i64 1
+  %tmp16984 = getelementptr inbounds float* %tmp16983, i64 1
+  %tmp16985 = getelementptr inbounds float* %tmp16984, i64 1
+  %tmp16986 = getelementptr inbounds float* %tmp16985, i64 1
+  %tmp16987 = getelementptr inbounds float* %tmp16986, i64 1
+  %tmp16988 = getelementptr inbounds float* %tmp16987, i64 1
+  %tmp16989 = getelementptr inbounds float* %tmp16988, i64 1
+  %tmp16990 = getelementptr inbounds float* %tmp16989, i64 1
+  %tmp16991 = getelementptr inbounds float* %tmp16990, i64 1
+  %tmp16992 = getelementptr inbounds float* %tmp16991, i64 1
+  %tmp16993 = getelementptr inbounds float* %tmp16992, i64 1
+  %tmp16994 = getelementptr inbounds float* %tmp16993, i64 1
+  %tmp16995 = getelementptr inbounds float* %tmp16994, i64 1
+  %tmp16996 = getelementptr inbounds float* %tmp16995, i64 1
+  %tmp16997 = getelementptr inbounds float* %tmp16996, i64 1
+  %tmp16998 = getelementptr inbounds float* %tmp16997, i64 1
+  %tmp16999 = getelementptr inbounds float* %tmp16998, i64 1
+  %tmp17000 = getelementptr inbounds float* %tmp16999, i64 1
+  %tmp17001 = getelementptr inbounds float* %tmp17000, i64 1
+  %tmp17002 = getelementptr inbounds float* %tmp17001, i64 1
+  %tmp17003 = getelementptr inbounds float* %tmp17002, i64 1
+  %tmp17004 = getelementptr inbounds float* %tmp17003, i64 1
+  %tmp17005 = getelementptr inbounds float* %tmp17004, i64 1
+  %tmp17006 = getelementptr inbounds float* %tmp17005, i64 1
+  %tmp17007 = getelementptr inbounds float* %tmp17006, i64 1
+  %tmp17008 = getelementptr inbounds float* %tmp17007, i64 1
+  %tmp17009 = getelementptr inbounds float* %tmp17008, i64 1
+  %tmp17010 = getelementptr inbounds float* %tmp17009, i64 1
+  %tmp17011 = getelementptr inbounds float* %tmp17010, i64 1
+  %tmp17012 = getelementptr inbounds float* %tmp17011, i64 1
+  %tmp17013 = getelementptr inbounds float* %tmp17012, i64 1
+  %tmp17014 = getelementptr inbounds float* %tmp17013, i64 1
+  %tmp17015 = getelementptr inbounds float* %tmp17014, i64 1
+  %tmp17016 = getelementptr inbounds float* %tmp17015, i64 1
+  %tmp17017 = getelementptr inbounds float* %tmp17016, i64 1
+  %tmp17018 = getelementptr inbounds float* %tmp17017, i64 1
+  %tmp17019 = getelementptr inbounds float* %tmp17018, i64 1
+  %tmp17020 = getelementptr inbounds float* %tmp17019, i64 1
+  %tmp17021 = getelementptr inbounds float* %tmp17020, i64 1
+  %tmp17022 = getelementptr inbounds float* %tmp17021, i64 1
+  %tmp17023 = getelementptr inbounds float* %tmp17022, i64 1
+  %tmp17024 = getelementptr inbounds float* %tmp17023, i64 1
+  %tmp17025 = getelementptr inbounds float* %tmp17024, i64 1
+  %tmp17026 = getelementptr inbounds float* %tmp17025, i64 1
+  %tmp17027 = getelementptr inbounds float* %tmp17026, i64 1
+  %tmp17028 = getelementptr inbounds float* %tmp17027, i64 1
+  %tmp17029 = getelementptr inbounds float* %tmp17028, i64 1
+  %tmp17030 = getelementptr inbounds float* %tmp17029, i64 1
+  %tmp17031 = getelementptr inbounds float* %tmp17030, i64 1
+  %tmp17032 = getelementptr inbounds float* %tmp17031, i64 1
+  %tmp17033 = getelementptr inbounds float* %tmp17032, i64 1
+  %tmp17034 = getelementptr inbounds float* %tmp17033, i64 1
+  %tmp17035 = getelementptr inbounds float* %tmp17034, i64 1
+  %tmp17036 = getelementptr inbounds float* %tmp17035, i64 1
+  %tmp17037 = getelementptr inbounds float* %tmp17036, i64 1
+  %tmp17038 = getelementptr inbounds float* %tmp17037, i64 1
+  %tmp17039 = getelementptr inbounds float* %tmp17038, i64 1
+  %tmp17040 = getelementptr inbounds float* %tmp17039, i64 1
+  %tmp17041 = getelementptr inbounds float* %tmp17040, i64 1
+  %tmp17042 = getelementptr inbounds float* %tmp17041, i64 1
+  %tmp17043 = getelementptr inbounds float* %tmp17042, i64 1
+  %tmp17044 = getelementptr inbounds float* %tmp17043, i64 1
+  %tmp17045 = getelementptr inbounds float* %tmp17044, i64 1
+  %tmp17046 = getelementptr inbounds float* %tmp17045, i64 1
+  %tmp17047 = getelementptr inbounds float* %tmp17046, i64 1
+  %tmp17048 = getelementptr inbounds float* %tmp17047, i64 1
+  %tmp17049 = getelementptr inbounds float* %tmp17048, i64 1
+  %tmp17050 = getelementptr inbounds float* %tmp17049, i64 1
+  %tmp17051 = getelementptr inbounds float* %tmp17050, i64 1
+  %tmp17052 = getelementptr inbounds float* %tmp17051, i64 1
+  %tmp17053 = getelementptr inbounds float* %tmp17052, i64 1
+  %tmp17054 = getelementptr inbounds float* %tmp17053, i64 1
+  %tmp17055 = getelementptr inbounds float* %tmp17054, i64 1
+  %tmp17056 = getelementptr inbounds float* %tmp17055, i64 1
+  %tmp17057 = getelementptr inbounds float* %tmp17056, i64 1
+  %tmp17058 = getelementptr inbounds float* %tmp17057, i64 1
+  %tmp17059 = getelementptr inbounds float* %tmp17058, i64 1
+  %tmp17060 = getelementptr inbounds float* %tmp17059, i64 1
+  %tmp17061 = getelementptr inbounds float* %tmp17060, i64 1
+  %tmp17062 = getelementptr inbounds float* %tmp17061, i64 1
+  %tmp17063 = getelementptr inbounds float* %tmp17062, i64 1
+  %tmp17064 = getelementptr inbounds float* %tmp17063, i64 1
+  %tmp17065 = getelementptr inbounds float* %tmp17064, i64 1
+  %tmp17066 = getelementptr inbounds float* %tmp17065, i64 1
+  %tmp17067 = getelementptr inbounds float* %tmp17066, i64 1
+  %tmp17068 = getelementptr inbounds float* %tmp17067, i64 1
+  %tmp17069 = getelementptr inbounds float* %tmp17068, i64 1
+  %tmp17070 = getelementptr inbounds float* %tmp17069, i64 1
+  %tmp17071 = getelementptr inbounds float* %tmp17070, i64 1
+  %tmp17072 = getelementptr inbounds float* %tmp17071, i64 1
+  %tmp17073 = getelementptr inbounds float* %tmp17072, i64 1
+  %tmp17074 = getelementptr inbounds float* %tmp17073, i64 1
+  %tmp17075 = getelementptr inbounds float* %tmp17074, i64 1
+  %tmp17076 = getelementptr inbounds float* %tmp17075, i64 1
+  %tmp17077 = getelementptr inbounds float* %tmp17076, i64 1
+  %tmp17078 = getelementptr inbounds float* %tmp17077, i64 1
+  %tmp17079 = getelementptr inbounds float* %tmp17078, i64 1
+  %tmp17080 = getelementptr inbounds float* %tmp17079, i64 1
+  %tmp17081 = getelementptr inbounds float* %tmp17080, i64 1
+  %tmp17082 = getelementptr inbounds float* %tmp17081, i64 1
+  %tmp17083 = getelementptr inbounds float* %tmp17082, i64 1
+  %tmp17084 = getelementptr inbounds float* %tmp17083, i64 1
+  %tmp17085 = getelementptr inbounds float* %tmp17084, i64 1
+  %tmp17086 = getelementptr inbounds float* %tmp17085, i64 1
+  %tmp17087 = getelementptr inbounds float* %tmp17086, i64 1
+  %tmp17088 = getelementptr inbounds float* %tmp17087, i64 1
+  %tmp17089 = getelementptr inbounds float* %tmp17088, i64 1
+  %tmp17090 = getelementptr inbounds float* %tmp17089, i64 1
+  %tmp17091 = getelementptr inbounds float* %tmp17090, i64 1
+  %tmp17092 = getelementptr inbounds float* %tmp17091, i64 1
+  %tmp17093 = getelementptr inbounds float* %tmp17092, i64 1
+  %tmp17094 = getelementptr inbounds float* %tmp17093, i64 1
+  %tmp17095 = getelementptr inbounds float* %tmp17094, i64 1
+  %tmp17096 = getelementptr inbounds float* %tmp17095, i64 1
+  %tmp17097 = getelementptr inbounds float* %tmp17096, i64 1
+  %tmp17098 = getelementptr inbounds float* %tmp17097, i64 1
+  %tmp17099 = getelementptr inbounds float* %tmp17098, i64 1
+  %tmp17100 = getelementptr inbounds float* %tmp17099, i64 1
+  %tmp17101 = getelementptr inbounds float* %tmp17100, i64 1
+  %tmp17102 = getelementptr inbounds float* %tmp17101, i64 1
+  %tmp17103 = getelementptr inbounds float* %tmp17102, i64 1
+  %tmp17104 = getelementptr inbounds float* %tmp17103, i64 1
+  %tmp17105 = getelementptr inbounds float* %tmp17104, i64 1
+  %tmp17106 = getelementptr inbounds float* %tmp17105, i64 1
+  %tmp17107 = getelementptr inbounds float* %tmp17106, i64 1
+  %tmp17108 = getelementptr inbounds float* %tmp17107, i64 1
+  %tmp17109 = getelementptr inbounds float* %tmp17108, i64 1
+  %tmp17110 = getelementptr inbounds float* %tmp17109, i64 1
+  %tmp17111 = getelementptr inbounds float* %tmp17110, i64 1
+  %tmp17112 = getelementptr inbounds float* %tmp17111, i64 1
+  %tmp17113 = getelementptr inbounds float* %tmp17112, i64 1
+  %tmp17114 = getelementptr inbounds float* %tmp17113, i64 1
+  %tmp17115 = getelementptr inbounds float* %tmp17114, i64 1
+  %tmp17116 = getelementptr inbounds float* %tmp17115, i64 1
+  %tmp17117 = getelementptr inbounds float* %tmp17116, i64 1
+  %tmp17118 = getelementptr inbounds float* %tmp17117, i64 1
+  %tmp17119 = getelementptr inbounds float* %tmp17118, i64 1
+  %tmp17120 = getelementptr inbounds float* %tmp17119, i64 1
+  %tmp17121 = getelementptr inbounds float* %tmp17120, i64 1
+  %tmp17122 = getelementptr inbounds float* %tmp17121, i64 1
+  %tmp17123 = getelementptr inbounds float* %tmp17122, i64 1
+  %tmp17124 = getelementptr inbounds float* %tmp17123, i64 1
+  %tmp17125 = getelementptr inbounds float* %tmp17124, i64 1
+  %tmp17126 = getelementptr inbounds float* %tmp17125, i64 1
+  %tmp17127 = getelementptr inbounds float* %tmp17126, i64 1
+  %tmp17128 = getelementptr inbounds float* %tmp17127, i64 1
+  %tmp17129 = getelementptr inbounds float* %tmp17128, i64 1
+  %tmp17130 = getelementptr inbounds float* %tmp17129, i64 1
+  %tmp17131 = getelementptr inbounds float* %tmp17130, i64 1
+  %tmp17132 = getelementptr inbounds float* %tmp17131, i64 1
+  %tmp17133 = getelementptr inbounds float* %tmp17132, i64 1
+  %tmp17134 = getelementptr inbounds float* %tmp17133, i64 1
+  %tmp17135 = getelementptr inbounds float* %tmp17134, i64 1
+  %tmp17136 = getelementptr inbounds float* %tmp17135, i64 1
+  %tmp17137 = getelementptr inbounds float* %tmp17136, i64 1
+  %tmp17138 = getelementptr inbounds float* %tmp17137, i64 1
+  %tmp17139 = getelementptr inbounds float* %tmp17138, i64 1
+  %tmp17140 = getelementptr inbounds float* %tmp17139, i64 1
+  %tmp17141 = getelementptr inbounds float* %tmp17140, i64 1
+  %tmp17142 = getelementptr inbounds float* %tmp17141, i64 1
+  %tmp17143 = getelementptr inbounds float* %tmp17142, i64 1
+  %tmp17144 = getelementptr inbounds float* %tmp17143, i64 1
+  %tmp17145 = getelementptr inbounds float* %tmp17144, i64 1
+  %tmp17146 = getelementptr inbounds float* %tmp17145, i64 1
+  %tmp17147 = getelementptr inbounds float* %tmp17146, i64 1
+  %tmp17148 = getelementptr inbounds float* %tmp17147, i64 1
+  %tmp17149 = getelementptr inbounds float* %tmp17148, i64 1
+  %tmp17150 = getelementptr inbounds float* %tmp17149, i64 1
+  %tmp17151 = getelementptr inbounds float* %tmp17150, i64 1
+  %tmp17152 = getelementptr inbounds float* %tmp17151, i64 1
+  %tmp17153 = getelementptr inbounds float* %tmp17152, i64 1
+  %tmp17154 = getelementptr inbounds float* %tmp17153, i64 1
+  %tmp17155 = getelementptr inbounds float* %tmp17154, i64 1
+  %tmp17156 = getelementptr inbounds float* %tmp17155, i64 1
+  %tmp17157 = getelementptr inbounds float* %tmp17156, i64 1
+  %tmp17158 = getelementptr inbounds float* %tmp17157, i64 1
+  %tmp17159 = getelementptr inbounds float* %tmp17158, i64 1
+  %tmp17160 = getelementptr inbounds float* %tmp17159, i64 1
+  %tmp17161 = getelementptr inbounds float* %tmp17160, i64 1
+  %tmp17162 = getelementptr inbounds float* %tmp17161, i64 1
+  %tmp17163 = getelementptr inbounds float* %tmp17162, i64 1
+  %tmp17164 = getelementptr inbounds float* %tmp17163, i64 1
+  %tmp17165 = getelementptr inbounds float* %tmp17164, i64 1
+  %tmp17166 = getelementptr inbounds float* %tmp17165, i64 1
+  %tmp17167 = getelementptr inbounds float* %tmp17166, i64 1
+  %tmp17168 = getelementptr inbounds float* %tmp17167, i64 1
+  %tmp17169 = getelementptr inbounds float* %tmp17168, i64 1
+  %tmp17170 = getelementptr inbounds float* %tmp17169, i64 1
+  %tmp17171 = getelementptr inbounds float* %tmp17170, i64 1
+  %tmp17172 = getelementptr inbounds float* %tmp17171, i64 1
+  %tmp17173 = getelementptr inbounds float* %tmp17172, i64 1
+  %tmp17174 = getelementptr inbounds float* %tmp17173, i64 1
+  %tmp17175 = getelementptr inbounds float* %tmp17174, i64 1
+  %tmp17176 = getelementptr inbounds float* %tmp17175, i64 1
+  %tmp17177 = getelementptr inbounds float* %tmp17176, i64 1
+  %tmp17178 = getelementptr inbounds float* %tmp17177, i64 1
+  %tmp17179 = getelementptr inbounds float* %tmp17178, i64 1
+  %tmp17180 = getelementptr inbounds float* %tmp17179, i64 1
+  %tmp17181 = getelementptr inbounds float* %tmp17180, i64 1
+  %tmp17182 = getelementptr inbounds float* %tmp17181, i64 1
+  %tmp17183 = getelementptr inbounds float* %tmp17182, i64 1
+  %tmp17184 = getelementptr inbounds float* %tmp17183, i64 1
+  %tmp17185 = getelementptr inbounds float* %tmp17184, i64 1
+  %tmp17186 = getelementptr inbounds float* %tmp17185, i64 1
+  %tmp17187 = getelementptr inbounds float* %tmp17186, i64 1
+  %tmp17188 = getelementptr inbounds float* %tmp17187, i64 1
+  %tmp17189 = getelementptr inbounds float* %tmp17188, i64 1
+  %tmp17190 = getelementptr inbounds float* %tmp17189, i64 1
+  %tmp17191 = getelementptr inbounds float* %tmp17190, i64 1
+  %tmp17192 = getelementptr inbounds float* %tmp17191, i64 1
+  %tmp17193 = getelementptr inbounds float* %tmp17192, i64 1
+  %tmp17194 = getelementptr inbounds float* %tmp17193, i64 1
+  %tmp17195 = getelementptr inbounds float* %tmp17194, i64 1
+  %tmp17196 = getelementptr inbounds float* %tmp17195, i64 1
+  %tmp17197 = getelementptr inbounds float* %tmp17196, i64 1
+  %tmp17198 = getelementptr inbounds float* %tmp17197, i64 1
+  %tmp17199 = getelementptr inbounds float* %tmp17198, i64 1
+  %tmp17200 = getelementptr inbounds float* %tmp17199, i64 1
+  %tmp17201 = getelementptr inbounds float* %tmp17200, i64 1
+  %tmp17202 = getelementptr inbounds float* %tmp17201, i64 1
+  %tmp17203 = getelementptr inbounds float* %tmp17202, i64 1
+  %tmp17204 = getelementptr inbounds float* %tmp17203, i64 1
+  %tmp17205 = getelementptr inbounds float* %tmp17204, i64 1
+  %tmp17206 = getelementptr inbounds float* %tmp17205, i64 1
+  %tmp17207 = getelementptr inbounds float* %tmp17206, i64 1
+  %tmp17208 = getelementptr inbounds float* %tmp17207, i64 1
+  %tmp17209 = getelementptr inbounds float* %tmp17208, i64 1
+  %tmp17210 = getelementptr inbounds float* %tmp17209, i64 1
+  %tmp17211 = getelementptr inbounds float* %tmp17210, i64 1
+  %tmp17212 = getelementptr inbounds float* %tmp17211, i64 1
+  %tmp17213 = getelementptr inbounds float* %tmp17212, i64 1
+  %tmp17214 = getelementptr inbounds float* %tmp17213, i64 1
+  %tmp17215 = getelementptr inbounds float* %tmp17214, i64 1
+  %tmp17216 = getelementptr inbounds float* %tmp17215, i64 1
+  %tmp17217 = getelementptr inbounds float* %tmp17216, i64 1
+  %tmp17218 = getelementptr inbounds float* %tmp17217, i64 1
+  %tmp17219 = getelementptr inbounds float* %tmp17218, i64 1
+  %tmp17220 = getelementptr inbounds float* %tmp17219, i64 1
+  %tmp17221 = getelementptr inbounds float* %tmp17220, i64 1
+  %tmp17222 = getelementptr inbounds float* %tmp17221, i64 1
+  %tmp17223 = getelementptr inbounds float* %tmp17222, i64 1
+  %tmp17224 = getelementptr inbounds float* %tmp17223, i64 1
+  %tmp17225 = getelementptr inbounds float* %tmp17224, i64 1
+  %tmp17226 = getelementptr inbounds float* %tmp17225, i64 1
+  %tmp17227 = getelementptr inbounds float* %tmp17226, i64 1
+  %tmp17228 = getelementptr inbounds float* %tmp17227, i64 1
+  %tmp17229 = getelementptr inbounds float* %tmp17228, i64 1
+  %tmp17230 = getelementptr inbounds float* %tmp17229, i64 1
+  %tmp17231 = getelementptr inbounds float* %tmp17230, i64 1
+  %tmp17232 = getelementptr inbounds float* %tmp17231, i64 1
+  %tmp17233 = getelementptr inbounds float* %tmp17232, i64 1
+  %tmp17234 = getelementptr inbounds float* %tmp17233, i64 1
+  %tmp17235 = getelementptr inbounds float* %tmp17234, i64 1
+  %tmp17236 = getelementptr inbounds float* %tmp17235, i64 1
+  %tmp17237 = getelementptr inbounds float* %tmp17236, i64 1
+  %tmp17238 = getelementptr inbounds float* %tmp17237, i64 1
+  %tmp17239 = getelementptr inbounds float* %tmp17238, i64 1
+  %tmp17240 = getelementptr inbounds float* %tmp17239, i64 1
+  %tmp17241 = getelementptr inbounds float* %tmp17240, i64 1
+  %tmp17242 = getelementptr inbounds float* %tmp17241, i64 1
+  %tmp17243 = getelementptr inbounds float* %tmp17242, i64 1
+  %tmp17244 = getelementptr inbounds float* %tmp17243, i64 1
+  %tmp17245 = getelementptr inbounds float* %tmp17244, i64 1
+  %tmp17246 = getelementptr inbounds float* %tmp17245, i64 1
+  %tmp17247 = getelementptr inbounds float* %tmp17246, i64 1
+  %tmp17248 = getelementptr inbounds float* %tmp17247, i64 1
+  %tmp17249 = getelementptr inbounds float* %tmp17248, i64 1
+  %tmp17250 = getelementptr inbounds float* %tmp17249, i64 1
+  %tmp17251 = getelementptr inbounds float* %tmp17250, i64 1
+  %tmp17252 = getelementptr inbounds float* %tmp17251, i64 1
+  %tmp17253 = getelementptr inbounds float* %tmp17252, i64 1
+  %tmp17254 = getelementptr inbounds float* %tmp17253, i64 1
+  %tmp17255 = getelementptr inbounds float* %tmp17254, i64 1
+  %tmp17256 = getelementptr inbounds float* %tmp17255, i64 1
+  %tmp17257 = getelementptr inbounds float* %tmp17256, i64 1
+  %tmp17258 = getelementptr inbounds float* %tmp17257, i64 1
+  %tmp17259 = getelementptr inbounds float* %tmp17258, i64 1
+  %tmp17260 = getelementptr inbounds float* %tmp17259, i64 1
+  %tmp17261 = getelementptr inbounds float* %tmp17260, i64 1
+  %tmp17262 = getelementptr inbounds float* %tmp17261, i64 1
+  %tmp17263 = getelementptr inbounds float* %tmp17262, i64 1
+  %tmp17264 = getelementptr inbounds float* %tmp17263, i64 1
+  %tmp17265 = getelementptr inbounds float* %tmp17264, i64 1
+  %tmp17266 = getelementptr inbounds float* %tmp17265, i64 1
+  %tmp17267 = getelementptr inbounds float* %tmp17266, i64 1
+  %tmp17268 = getelementptr inbounds float* %tmp17267, i64 1
+  %tmp17269 = getelementptr inbounds float* %tmp17268, i64 1
+  %tmp17270 = getelementptr inbounds float* %tmp17269, i64 1
+  %tmp17271 = getelementptr inbounds float* %tmp17270, i64 1
+  %tmp17272 = getelementptr inbounds float* %tmp17271, i64 1
+  %tmp17273 = getelementptr inbounds float* %tmp17272, i64 1
+  %tmp17274 = getelementptr inbounds float* %tmp17273, i64 1
+  %tmp17275 = getelementptr inbounds float* %tmp17274, i64 1
+  %tmp17276 = getelementptr inbounds float* %tmp17275, i64 1
+  %tmp17277 = getelementptr inbounds float* %tmp17276, i64 1
+  %tmp17278 = getelementptr inbounds float* %tmp17277, i64 1
+  %tmp17279 = getelementptr inbounds float* %tmp17278, i64 1
+  %tmp17280 = getelementptr inbounds float* %tmp17279, i64 1
+  %tmp17281 = getelementptr inbounds float* %tmp17280, i64 1
+  %tmp17282 = getelementptr inbounds float* %tmp17281, i64 1
+  %tmp17283 = getelementptr inbounds float* %tmp17282, i64 1
+  %tmp17284 = getelementptr inbounds float* %tmp17283, i64 1
+  %tmp17285 = getelementptr inbounds float* %tmp17284, i64 1
+  %tmp17286 = getelementptr inbounds float* %tmp17285, i64 1
+  %tmp17287 = getelementptr inbounds float* %tmp17286, i64 1
+  %tmp17288 = getelementptr inbounds float* %tmp17287, i64 1
+  %tmp17289 = getelementptr inbounds float* %tmp17288, i64 1
+  %tmp17290 = getelementptr inbounds float* %tmp17289, i64 1
+  %tmp17291 = getelementptr inbounds float* %tmp17290, i64 1
+  %tmp17292 = getelementptr inbounds float* %tmp17291, i64 1
+  %tmp17293 = getelementptr inbounds float* %tmp17292, i64 1
+  %tmp17294 = getelementptr inbounds float* %tmp17293, i64 1
+  %tmp17295 = getelementptr inbounds float* %tmp17294, i64 1
+  %tmp17296 = getelementptr inbounds float* %tmp17295, i64 1
+  %tmp17297 = getelementptr inbounds float* %tmp17296, i64 1
+  %tmp17298 = getelementptr inbounds float* %tmp17297, i64 1
+  %tmp17299 = getelementptr inbounds float* %tmp17298, i64 1
+  %tmp17300 = getelementptr inbounds float* %tmp17299, i64 1
+  %tmp17301 = getelementptr inbounds float* %tmp17300, i64 1
+  %tmp17302 = getelementptr inbounds float* %tmp17301, i64 1
+  %tmp17303 = getelementptr inbounds float* %tmp17302, i64 1
+  %tmp17304 = getelementptr inbounds float* %tmp17303, i64 1
+  %tmp17305 = getelementptr inbounds float* %tmp17304, i64 1
+  %tmp17306 = getelementptr inbounds float* %tmp17305, i64 1
+  %tmp17307 = getelementptr inbounds float* %tmp17306, i64 1
+  %tmp17308 = getelementptr inbounds float* %tmp17307, i64 1
+  %tmp17309 = getelementptr inbounds float* %tmp17308, i64 1
+  %tmp17310 = getelementptr inbounds float* %tmp17309, i64 1
+  %tmp17311 = getelementptr inbounds float* %tmp17310, i64 1
+  %tmp17312 = getelementptr inbounds float* %tmp17311, i64 1
+  %tmp17313 = getelementptr inbounds float* %tmp17312, i64 1
+  %tmp17314 = getelementptr inbounds float* %tmp17313, i64 1
+  %tmp17315 = getelementptr inbounds float* %tmp17314, i64 1
+  %tmp17316 = getelementptr inbounds float* %tmp17315, i64 1
+  %tmp17317 = getelementptr inbounds float* %tmp17316, i64 1
+  %tmp17318 = getelementptr inbounds float* %tmp17317, i64 1
+  %tmp17319 = getelementptr inbounds float* %tmp17318, i64 1
+  %tmp17320 = getelementptr inbounds float* %tmp17319, i64 1
+  %tmp17321 = getelementptr inbounds float* %tmp17320, i64 1
+  %tmp17322 = getelementptr inbounds float* %tmp17321, i64 1
+  %tmp17323 = getelementptr inbounds float* %tmp17322, i64 1
+  %tmp17324 = getelementptr inbounds float* %tmp17323, i64 1
+  %tmp17325 = getelementptr inbounds float* %tmp17324, i64 1
+  %tmp17326 = getelementptr inbounds float* %tmp17325, i64 1
+  %tmp17327 = getelementptr inbounds float* %tmp17326, i64 1
+  %tmp17328 = getelementptr inbounds float* %tmp17327, i64 1
+  %tmp17329 = getelementptr inbounds float* %tmp17328, i64 1
+  %tmp17330 = getelementptr inbounds float* %tmp17329, i64 1
+  %tmp17331 = getelementptr inbounds float* %tmp17330, i64 1
+  %tmp17332 = getelementptr inbounds float* %tmp17331, i64 1
+  %tmp17333 = getelementptr inbounds float* %tmp17332, i64 1
+  %tmp17334 = getelementptr inbounds float* %tmp17333, i64 1
+  %tmp17335 = getelementptr inbounds float* %tmp17334, i64 1
+  %tmp17336 = getelementptr inbounds float* %tmp17335, i64 1
+  %tmp17337 = getelementptr inbounds float* %tmp17336, i64 1
+  %tmp17338 = getelementptr inbounds float* %tmp17337, i64 1
+  %tmp17339 = getelementptr inbounds float* %tmp17338, i64 1
+  %tmp17340 = getelementptr inbounds float* %tmp17339, i64 1
+  %tmp17341 = getelementptr inbounds float* %tmp17340, i64 1
+  %tmp17342 = getelementptr inbounds float* %tmp17341, i64 1
+  %tmp17343 = getelementptr inbounds float* %tmp17342, i64 1
+  %tmp17344 = getelementptr inbounds float* %tmp17343, i64 1
+  %tmp17345 = getelementptr inbounds float* %tmp17344, i64 1
+  %tmp17346 = getelementptr inbounds float* %tmp17345, i64 1
+  %tmp17347 = getelementptr inbounds float* %tmp17346, i64 1
+  %tmp17348 = getelementptr inbounds float* %tmp17347, i64 1
+  %tmp17349 = getelementptr inbounds float* %tmp17348, i64 1
+  %tmp17350 = getelementptr inbounds float* %tmp17349, i64 1
+  %tmp17351 = getelementptr inbounds float* %tmp17350, i64 1
+  %tmp17352 = getelementptr inbounds float* %tmp17351, i64 1
+  %tmp17353 = getelementptr inbounds float* %tmp17352, i64 1
+  %tmp17354 = getelementptr inbounds float* %tmp17353, i64 1
+  %tmp17355 = getelementptr inbounds float* %tmp17354, i64 1
+  %tmp17356 = getelementptr inbounds float* %tmp17355, i64 1
+  %tmp17357 = getelementptr inbounds float* %tmp17356, i64 1
+  %tmp17358 = getelementptr inbounds float* %tmp17357, i64 1
+  %tmp17359 = getelementptr inbounds float* %tmp17358, i64 1
+  %tmp17360 = getelementptr inbounds float* %tmp17359, i64 1
+  %tmp17361 = getelementptr inbounds float* %tmp17360, i64 1
+  %tmp17362 = getelementptr inbounds float* %tmp17361, i64 1
+  %tmp17363 = getelementptr inbounds float* %tmp17362, i64 1
+  %tmp17364 = getelementptr inbounds float* %tmp17363, i64 1
+  %tmp17365 = getelementptr inbounds float* %tmp17364, i64 1
+  %tmp17366 = getelementptr inbounds float* %tmp17365, i64 1
+  %tmp17367 = getelementptr inbounds float* %tmp17366, i64 1
+  %tmp17368 = getelementptr inbounds float* %tmp17367, i64 1
+  %tmp17369 = getelementptr inbounds float* %tmp17368, i64 1
+  %tmp17370 = getelementptr inbounds float* %tmp17369, i64 1
+  %tmp17371 = getelementptr inbounds float* %tmp17370, i64 1
+  %tmp17372 = getelementptr inbounds float* %tmp17371, i64 1
+  %tmp17373 = getelementptr inbounds float* %tmp17372, i64 1
+  %tmp17374 = getelementptr inbounds float* %tmp17373, i64 1
+  %tmp17375 = getelementptr inbounds float* %tmp17374, i64 1
+  %tmp17376 = getelementptr inbounds float* %tmp17375, i64 1
+  %tmp17377 = getelementptr inbounds float* %tmp17376, i64 1
+  %tmp17378 = getelementptr inbounds float* %tmp17377, i64 1
+  %tmp17379 = getelementptr inbounds float* %tmp17378, i64 1
+  %tmp17380 = getelementptr inbounds float* %tmp17379, i64 1
+  %tmp17381 = getelementptr inbounds float* %tmp17380, i64 1
+  %tmp17382 = getelementptr inbounds float* %tmp17381, i64 1
+  %tmp17383 = getelementptr inbounds float* %tmp17382, i64 1
+  %tmp17384 = getelementptr inbounds float* %tmp17383, i64 1
+  %tmp17385 = getelementptr inbounds float* %tmp17384, i64 1
+  %tmp17386 = getelementptr inbounds float* %tmp17385, i64 1
+  %tmp17387 = getelementptr inbounds float* %tmp17386, i64 1
+  %tmp17388 = getelementptr inbounds float* %tmp17387, i64 1
+  %tmp17389 = getelementptr inbounds float* %tmp17388, i64 1
+  %tmp17390 = getelementptr inbounds float* %tmp17389, i64 1
+  %tmp17391 = getelementptr inbounds float* %tmp17390, i64 1
+  %tmp17392 = getelementptr inbounds float* %tmp17391, i64 1
+  %tmp17393 = getelementptr inbounds float* %tmp17392, i64 1
+  %tmp17394 = getelementptr inbounds float* %tmp17393, i64 1
+  %tmp17395 = getelementptr inbounds float* %tmp17394, i64 1
+  %tmp17396 = getelementptr inbounds float* %tmp17395, i64 1
+  %tmp17397 = getelementptr inbounds float* %tmp17396, i64 1
+  %tmp17398 = getelementptr inbounds float* %tmp17397, i64 1
+  %tmp17399 = getelementptr inbounds float* %tmp17398, i64 1
+  %tmp17400 = getelementptr inbounds float* %tmp17399, i64 1
+  %tmp17401 = getelementptr inbounds float* %tmp17400, i64 1
+  %tmp17402 = getelementptr inbounds float* %tmp17401, i64 1
+  %tmp17403 = getelementptr inbounds float* %tmp17402, i64 1
+  %tmp17404 = getelementptr inbounds float* %tmp17403, i64 1
+  %tmp17405 = getelementptr inbounds float* %tmp17404, i64 1
+  %tmp17406 = getelementptr inbounds float* %tmp17405, i64 1
+  %tmp17407 = getelementptr inbounds float* %tmp17406, i64 1
+  %tmp17408 = getelementptr inbounds float* %tmp17407, i64 1
+  %tmp17409 = getelementptr inbounds float* %tmp17408, i64 1
+  %tmp17410 = getelementptr inbounds float* %tmp17409, i64 1
+  %tmp17411 = getelementptr inbounds float* %tmp17410, i64 1
+  %tmp17412 = getelementptr inbounds float* %tmp17411, i64 1
+  %tmp17413 = getelementptr inbounds float* %tmp17412, i64 1
+  %tmp17414 = getelementptr inbounds float* %tmp17413, i64 1
+  %tmp17415 = getelementptr inbounds float* %tmp17414, i64 1
+  %tmp17416 = getelementptr inbounds float* %tmp17415, i64 1
+  %tmp17417 = getelementptr inbounds float* %tmp17416, i64 1
+  %tmp17418 = getelementptr inbounds float* %tmp17417, i64 1
+  %tmp17419 = getelementptr inbounds float* %tmp17418, i64 1
+  %tmp17420 = getelementptr inbounds float* %tmp17419, i64 1
+  %tmp17421 = getelementptr inbounds float* %tmp17420, i64 1
+  %tmp17422 = getelementptr inbounds float* %tmp17421, i64 1
+  %tmp17423 = getelementptr inbounds float* %tmp17422, i64 1
+  %tmp17424 = getelementptr inbounds float* %tmp17423, i64 1
+  %tmp17425 = getelementptr inbounds float* %tmp17424, i64 1
+  %tmp17426 = getelementptr inbounds float* %tmp17425, i64 1
+  %tmp17427 = getelementptr inbounds float* %tmp17426, i64 1
+  %tmp17428 = getelementptr inbounds float* %tmp17427, i64 1
+  %tmp17429 = getelementptr inbounds float* %tmp17428, i64 1
+  %tmp17430 = getelementptr inbounds float* %tmp17429, i64 1
+  %tmp17431 = getelementptr inbounds float* %tmp17430, i64 1
+  %tmp17432 = getelementptr inbounds float* %tmp17431, i64 1
+  %tmp17433 = getelementptr inbounds float* %tmp17432, i64 1
+  %tmp17434 = getelementptr inbounds float* %tmp17433, i64 1
+  %tmp17435 = getelementptr inbounds float* %tmp17434, i64 1
+  %tmp17436 = getelementptr inbounds float* %tmp17435, i64 1
+  %tmp17437 = getelementptr inbounds float* %tmp17436, i64 1
+  %tmp17438 = getelementptr inbounds float* %tmp17437, i64 1
+  %tmp17439 = getelementptr inbounds float* %tmp17438, i64 1
+  %tmp17440 = getelementptr inbounds float* %tmp17439, i64 1
+  %tmp17441 = getelementptr inbounds float* %tmp17440, i64 1
+  %tmp17442 = getelementptr inbounds float* %tmp17441, i64 1
+  %tmp17443 = getelementptr inbounds float* %tmp17442, i64 1
+  %tmp17444 = getelementptr inbounds float* %tmp17443, i64 1
+  %tmp17445 = getelementptr inbounds float* %tmp17444, i64 1
+  %tmp17446 = getelementptr inbounds float* %tmp17445, i64 1
+  %tmp17447 = getelementptr inbounds float* %tmp17446, i64 1
+  %tmp17448 = getelementptr inbounds float* %tmp17447, i64 1
+  %tmp17449 = getelementptr inbounds float* %tmp17448, i64 1
+  %tmp17450 = getelementptr inbounds float* %tmp17449, i64 1
+  %tmp17451 = getelementptr inbounds float* %tmp17450, i64 1
+  %tmp17452 = getelementptr inbounds float* %tmp17451, i64 1
+  %tmp17453 = getelementptr inbounds float* %tmp17452, i64 1
+  %tmp17454 = getelementptr inbounds float* %tmp17453, i64 1
+  %tmp17455 = getelementptr inbounds float* %tmp17454, i64 1
+  %tmp17456 = getelementptr inbounds float* %tmp17455, i64 1
+  %tmp17457 = getelementptr inbounds float* %tmp17456, i64 1
+  %tmp17458 = getelementptr inbounds float* %tmp17457, i64 1
+  %tmp17459 = getelementptr inbounds float* %tmp17458, i64 1
+  %tmp17460 = getelementptr inbounds float* %tmp17459, i64 1
+  %tmp17461 = getelementptr inbounds float* %tmp17460, i64 1
+  %tmp17462 = getelementptr inbounds float* %tmp17461, i64 1
+  %tmp17463 = getelementptr inbounds float* %tmp17462, i64 1
+  %tmp17464 = getelementptr inbounds float* %tmp17463, i64 1
+  %tmp17465 = getelementptr inbounds float* %tmp17464, i64 1
+  %tmp17466 = getelementptr inbounds float* %tmp17465, i64 1
+  %tmp17467 = getelementptr inbounds float* %tmp17466, i64 1
+  %tmp17468 = getelementptr inbounds float* %tmp17467, i64 1
+  %tmp17469 = getelementptr inbounds float* %tmp17468, i64 1
+  %tmp17470 = getelementptr inbounds float* %tmp17469, i64 1
+  %tmp17471 = getelementptr inbounds float* %tmp17470, i64 1
+  %tmp17472 = getelementptr inbounds float* %tmp17471, i64 1
+  %tmp17473 = getelementptr inbounds float* %tmp17472, i64 1
+  %tmp17474 = getelementptr inbounds float* %tmp17473, i64 1
+  %tmp17475 = getelementptr inbounds float* %tmp17474, i64 1
+  %tmp17476 = getelementptr inbounds float* %tmp17475, i64 1
+  %tmp17477 = getelementptr inbounds float* %tmp17476, i64 1
+  %tmp17478 = getelementptr inbounds float* %tmp17477, i64 1
+  %tmp17479 = getelementptr inbounds float* %tmp17478, i64 1
+  %tmp17480 = getelementptr inbounds float* %tmp17479, i64 1
+  %tmp17481 = getelementptr inbounds float* %tmp17480, i64 1
+  %tmp17482 = getelementptr inbounds float* %tmp17481, i64 1
+  %tmp17483 = getelementptr inbounds float* %tmp17482, i64 1
+  %tmp17484 = getelementptr inbounds float* %tmp17483, i64 1
+  %tmp17485 = getelementptr inbounds float* %tmp17484, i64 1
+  %tmp17486 = getelementptr inbounds float* %tmp17485, i64 1
+  %tmp17487 = getelementptr inbounds float* %tmp17486, i64 1
+  %tmp17488 = getelementptr inbounds float* %tmp17487, i64 1
+  %tmp17489 = getelementptr inbounds float* %tmp17488, i64 1
+  %tmp17490 = getelementptr inbounds float* %tmp17489, i64 1
+  %tmp17491 = getelementptr inbounds float* %tmp17490, i64 1
+  %tmp17492 = getelementptr inbounds float* %tmp17491, i64 1
+  %tmp17493 = getelementptr inbounds float* %tmp17492, i64 1
+  %tmp17494 = getelementptr inbounds float* %tmp17493, i64 1
+  %tmp17495 = getelementptr inbounds float* %tmp17494, i64 1
+  %tmp17496 = getelementptr inbounds float* %tmp17495, i64 1
+  %tmp17497 = getelementptr inbounds float* %tmp17496, i64 1
+  %tmp17498 = getelementptr inbounds float* %tmp17497, i64 1
+  %tmp17499 = getelementptr inbounds float* %tmp17498, i64 1
+  %tmp17500 = getelementptr inbounds float* %tmp17499, i64 1
+  %tmp17501 = getelementptr inbounds float* %tmp17500, i64 1
+  %tmp17502 = getelementptr inbounds float* %tmp17501, i64 1
+  %tmp17503 = getelementptr inbounds float* %tmp17502, i64 1
+  %tmp17504 = getelementptr inbounds float* %tmp17503, i64 1
+  %tmp17505 = getelementptr inbounds float* %tmp17504, i64 1
+  %tmp17506 = getelementptr inbounds float* %tmp17505, i64 1
+  %tmp17507 = getelementptr inbounds float* %tmp17506, i64 1
+  %tmp17508 = getelementptr inbounds float* %tmp17507, i64 1
+  %tmp17509 = getelementptr inbounds float* %tmp17508, i64 1
+  %tmp17510 = getelementptr inbounds float* %tmp17509, i64 1
+  %tmp17511 = getelementptr inbounds float* %tmp17510, i64 1
+  %tmp17512 = getelementptr inbounds float* %tmp17511, i64 1
+  %tmp17513 = getelementptr inbounds float* %tmp17512, i64 1
+  %tmp17514 = getelementptr inbounds float* %tmp17513, i64 1
+  %tmp17515 = getelementptr inbounds float* %tmp17514, i64 1
+  %tmp17516 = getelementptr inbounds float* %tmp17515, i64 1
+  %tmp17517 = getelementptr inbounds float* %tmp17516, i64 1
+  %tmp17518 = getelementptr inbounds float* %tmp17517, i64 1
+  %tmp17519 = getelementptr inbounds float* %tmp17518, i64 1
+  %tmp17520 = getelementptr inbounds float* %tmp17519, i64 1
+  %tmp17521 = getelementptr inbounds float* %tmp17520, i64 1
+  %tmp17522 = getelementptr inbounds float* %tmp17521, i64 1
+  %tmp17523 = getelementptr inbounds float* %tmp17522, i64 1
+  %tmp17524 = getelementptr inbounds float* %tmp17523, i64 1
+  %tmp17525 = getelementptr inbounds float* %tmp17524, i64 1
+  %tmp17526 = getelementptr inbounds float* %tmp17525, i64 1
+  %tmp17527 = getelementptr inbounds float* %tmp17526, i64 1
+  %tmp17528 = getelementptr inbounds float* %tmp17527, i64 1
+  %tmp17529 = getelementptr inbounds float* %tmp17528, i64 1
+  %tmp17530 = getelementptr inbounds float* %tmp17529, i64 1
+  %tmp17531 = getelementptr inbounds float* %tmp17530, i64 1
+  %tmp17532 = getelementptr inbounds float* %tmp17531, i64 1
+  %tmp17533 = getelementptr inbounds float* %tmp17532, i64 1
+  %tmp17534 = getelementptr inbounds float* %tmp17533, i64 1
+  %tmp17535 = getelementptr inbounds float* %tmp17534, i64 1
+  %tmp17536 = getelementptr inbounds float* %tmp17535, i64 1
+  %tmp17537 = getelementptr inbounds float* %tmp17536, i64 1
+  %tmp17538 = getelementptr inbounds float* %tmp17537, i64 1
+  %tmp17539 = getelementptr inbounds float* %tmp17538, i64 1
+  %tmp17540 = getelementptr inbounds float* %tmp17539, i64 1
+  %tmp17541 = getelementptr inbounds float* %tmp17540, i64 1
+  %tmp17542 = getelementptr inbounds float* %tmp17541, i64 1
+  %tmp17543 = getelementptr inbounds float* %tmp17542, i64 1
+  %tmp17544 = getelementptr inbounds float* %tmp17543, i64 1
+  %tmp17545 = getelementptr inbounds float* %tmp17544, i64 1
+  %tmp17546 = getelementptr inbounds float* %tmp17545, i64 1
+  %tmp17547 = getelementptr inbounds float* %tmp17546, i64 1
+  %tmp17548 = getelementptr inbounds float* %tmp17547, i64 1
+  %tmp17549 = getelementptr inbounds float* %tmp17548, i64 1
+  %tmp17550 = getelementptr inbounds float* %tmp17549, i64 1
+  %tmp17551 = getelementptr inbounds float* %tmp17550, i64 1
+  %tmp17552 = getelementptr inbounds float* %tmp17551, i64 1
+  %tmp17553 = getelementptr inbounds float* %tmp17552, i64 1
+  %tmp17554 = getelementptr inbounds float* %tmp17553, i64 1
+  %tmp17555 = getelementptr inbounds float* %tmp17554, i64 1
+  %tmp17556 = getelementptr inbounds float* %tmp17555, i64 1
+  %tmp17557 = getelementptr inbounds float* %tmp17556, i64 1
+  %tmp17558 = getelementptr inbounds float* %tmp17557, i64 1
+  %tmp17559 = getelementptr inbounds float* %tmp17558, i64 1
+  %tmp17560 = getelementptr inbounds float* %tmp17559, i64 1
+  %tmp17561 = getelementptr inbounds float* %tmp17560, i64 1
+  %tmp17562 = getelementptr inbounds float* %tmp17561, i64 1
+  %tmp17563 = getelementptr inbounds float* %tmp17562, i64 1
+  %tmp17564 = getelementptr inbounds float* %tmp17563, i64 1
+  %tmp17565 = getelementptr inbounds float* %tmp17564, i64 1
+  %tmp17566 = getelementptr inbounds float* %tmp17565, i64 1
+  %tmp17567 = getelementptr inbounds float* %tmp17566, i64 1
+  %tmp17568 = getelementptr inbounds float* %tmp17567, i64 1
+  %tmp17569 = getelementptr inbounds float* %tmp17568, i64 1
+  %tmp17570 = getelementptr inbounds float* %tmp17569, i64 1
+  %tmp17571 = getelementptr inbounds float* %tmp17570, i64 1
+  %tmp17572 = getelementptr inbounds float* %tmp17571, i64 1
+  %tmp17573 = getelementptr inbounds float* %tmp17572, i64 1
+  %tmp17574 = getelementptr inbounds float* %tmp17573, i64 1
+  %tmp17575 = getelementptr inbounds float* %tmp17574, i64 1
+  %tmp17576 = getelementptr inbounds float* %tmp17575, i64 1
+  %tmp17577 = getelementptr inbounds float* %tmp17576, i64 1
+  %tmp17578 = getelementptr inbounds float* %tmp17577, i64 1
+  %tmp17579 = getelementptr inbounds float* %tmp17578, i64 1
+  %tmp17580 = getelementptr inbounds float* %tmp17579, i64 1
+  %tmp17581 = getelementptr inbounds float* %tmp17580, i64 1
+  %tmp17582 = getelementptr inbounds float* %tmp17581, i64 1
+  %tmp17583 = getelementptr inbounds float* %tmp17582, i64 1
+  %tmp17584 = getelementptr inbounds float* %tmp17583, i64 1
+  %tmp17585 = getelementptr inbounds float* %tmp17584, i64 1
+  %tmp17586 = getelementptr inbounds float* %tmp17585, i64 1
+  %tmp17587 = getelementptr inbounds float* %tmp17586, i64 1
+  %tmp17588 = getelementptr inbounds float* %tmp17587, i64 1
+  %tmp17589 = getelementptr inbounds float* %tmp17588, i64 1
+  %tmp17590 = getelementptr inbounds float* %tmp17589, i64 1
+  %tmp17591 = getelementptr inbounds float* %tmp17590, i64 1
+  %tmp17592 = getelementptr inbounds float* %tmp17591, i64 1
+  %tmp17593 = getelementptr inbounds float* %tmp17592, i64 1
+  %tmp17594 = getelementptr inbounds float* %tmp17593, i64 1
+  %tmp17595 = getelementptr inbounds float* %tmp17594, i64 1
+  %tmp17596 = getelementptr inbounds float* %tmp17595, i64 1
+  %tmp17597 = getelementptr inbounds float* %tmp17596, i64 1
+  %tmp17598 = getelementptr inbounds float* %tmp17597, i64 1
+  %tmp17599 = getelementptr inbounds float* %tmp17598, i64 1
+  %tmp17600 = getelementptr inbounds float* %tmp17599, i64 1
+  %tmp17601 = getelementptr inbounds float* %tmp17600, i64 1
+  %tmp17602 = getelementptr inbounds float* %tmp17601, i64 1
+  %tmp17603 = getelementptr inbounds float* %tmp17602, i64 1
+  %tmp17604 = getelementptr inbounds float* %tmp17603, i64 1
+  %tmp17605 = getelementptr inbounds float* %tmp17604, i64 1
+  %tmp17606 = getelementptr inbounds float* %tmp17605, i64 1
+  %tmp17607 = getelementptr inbounds float* %tmp17606, i64 1
+  %tmp17608 = getelementptr inbounds float* %tmp17607, i64 1
+  %tmp17609 = getelementptr inbounds float* %tmp17608, i64 1
+  %tmp17610 = getelementptr inbounds float* %tmp17609, i64 1
+  %tmp17611 = getelementptr inbounds float* %tmp17610, i64 1
+  %tmp17612 = getelementptr inbounds float* %tmp17611, i64 1
+  %tmp17613 = getelementptr inbounds float* %tmp17612, i64 1
+  %tmp17614 = getelementptr inbounds float* %tmp17613, i64 1
+  %tmp17615 = getelementptr inbounds float* %tmp17614, i64 1
+  %tmp17616 = getelementptr inbounds float* %tmp17615, i64 1
+  %tmp17617 = getelementptr inbounds float* %tmp17616, i64 1
+  %tmp17618 = getelementptr inbounds float* %tmp17617, i64 1
+  %tmp17619 = getelementptr inbounds float* %tmp17618, i64 1
+  %tmp17620 = getelementptr inbounds float* %tmp17619, i64 1
+  %tmp17621 = getelementptr inbounds float* %tmp17620, i64 1
+  %tmp17622 = getelementptr inbounds float* %tmp17621, i64 1
+  %tmp17623 = getelementptr inbounds float* %tmp17622, i64 1
+  %tmp17624 = getelementptr inbounds float* %tmp17623, i64 1
+  %tmp17625 = getelementptr inbounds float* %tmp17624, i64 1
+  %tmp17626 = getelementptr inbounds float* %tmp17625, i64 1
+  %tmp17627 = getelementptr inbounds float* %tmp17626, i64 1
+  %tmp17628 = getelementptr inbounds float* %tmp17627, i64 1
+  %tmp17629 = getelementptr inbounds float* %tmp17628, i64 1
+  %tmp17630 = getelementptr inbounds float* %tmp17629, i64 1
+  %tmp17631 = getelementptr inbounds float* %tmp17630, i64 1
+  %tmp17632 = getelementptr inbounds float* %tmp17631, i64 1
+  %tmp17633 = getelementptr inbounds float* %tmp17632, i64 1
+  %tmp17634 = getelementptr inbounds float* %tmp17633, i64 1
+  %tmp17635 = getelementptr inbounds float* %tmp17634, i64 1
+  %tmp17636 = getelementptr inbounds float* %tmp17635, i64 1
+  %tmp17637 = getelementptr inbounds float* %tmp17636, i64 1
+  %tmp17638 = getelementptr inbounds float* %tmp17637, i64 1
+  %tmp17639 = getelementptr inbounds float* %tmp17638, i64 1
+  %tmp17640 = getelementptr inbounds float* %tmp17639, i64 1
+  %tmp17641 = getelementptr inbounds float* %tmp17640, i64 1
+  %tmp17642 = getelementptr inbounds float* %tmp17641, i64 1
+  %tmp17643 = getelementptr inbounds float* %tmp17642, i64 1
+  %tmp17644 = getelementptr inbounds float* %tmp17643, i64 1
+  %tmp17645 = getelementptr inbounds float* %tmp17644, i64 1
+  %tmp17646 = getelementptr inbounds float* %tmp17645, i64 1
+  %tmp17647 = getelementptr inbounds float* %tmp17646, i64 1
+  %tmp17648 = getelementptr inbounds float* %tmp17647, i64 1
+  %tmp17649 = getelementptr inbounds float* %tmp17648, i64 1
+  %tmp17650 = getelementptr inbounds float* %tmp17649, i64 1
+  %tmp17651 = getelementptr inbounds float* %tmp17650, i64 1
+  %tmp17652 = getelementptr inbounds float* %tmp17651, i64 1
+  %tmp17653 = getelementptr inbounds float* %tmp17652, i64 1
+  %tmp17654 = getelementptr inbounds float* %tmp17653, i64 1
+  %tmp17655 = getelementptr inbounds float* %tmp17654, i64 1
+  %tmp17656 = getelementptr inbounds float* %tmp17655, i64 1
+  %tmp17657 = getelementptr inbounds float* %tmp17656, i64 1
+  %tmp17658 = getelementptr inbounds float* %tmp17657, i64 1
+  %tmp17659 = getelementptr inbounds float* %tmp17658, i64 1
+  %tmp17660 = getelementptr inbounds float* %tmp17659, i64 1
+  %tmp17661 = getelementptr inbounds float* %tmp17660, i64 1
+  %tmp17662 = getelementptr inbounds float* %tmp17661, i64 1
+  %tmp17663 = getelementptr inbounds float* %tmp17662, i64 1
+  %tmp17664 = getelementptr inbounds float* %tmp17663, i64 1
+  %tmp17665 = getelementptr inbounds float* %tmp17664, i64 1
+  %tmp17666 = getelementptr inbounds float* %tmp17665, i64 1
+  %tmp17667 = getelementptr inbounds float* %tmp17666, i64 1
+  %tmp17668 = getelementptr inbounds float* %tmp17667, i64 1
+  %tmp17669 = getelementptr inbounds float* %tmp17668, i64 1
+  %tmp17670 = getelementptr inbounds float* %tmp17669, i64 1
+  %tmp17671 = getelementptr inbounds float* %tmp17670, i64 1
+  %tmp17672 = getelementptr inbounds float* %tmp17671, i64 1
+  %tmp17673 = getelementptr inbounds float* %tmp17672, i64 1
+  %tmp17674 = getelementptr inbounds float* %tmp17673, i64 1
+  %tmp17675 = getelementptr inbounds float* %tmp17674, i64 1
+  %tmp17676 = getelementptr inbounds float* %tmp17675, i64 1
+  %tmp17677 = getelementptr inbounds float* %tmp17676, i64 1
+  %tmp17678 = getelementptr inbounds float* %tmp17677, i64 1
+  %tmp17679 = getelementptr inbounds float* %tmp17678, i64 1
+  %tmp17680 = getelementptr inbounds float* %tmp17679, i64 1
+  %tmp17681 = getelementptr inbounds float* %tmp17680, i64 1
+  %tmp17682 = getelementptr inbounds float* %tmp17681, i64 1
+  %tmp17683 = getelementptr inbounds float* %tmp17682, i64 1
+  %tmp17684 = getelementptr inbounds float* %tmp17683, i64 1
+  %tmp17685 = getelementptr inbounds float* %tmp17684, i64 1
+  %tmp17686 = getelementptr inbounds float* %tmp17685, i64 1
+  %tmp17687 = getelementptr inbounds float* %tmp17686, i64 1
+  %tmp17688 = getelementptr inbounds float* %tmp17687, i64 1
+  %tmp17689 = getelementptr inbounds float* %tmp17688, i64 1
+  %tmp17690 = getelementptr inbounds float* %tmp17689, i64 1
+  %tmp17691 = getelementptr inbounds float* %tmp17690, i64 1
+  %tmp17692 = getelementptr inbounds float* %tmp17691, i64 1
+  %tmp17693 = getelementptr inbounds float* %tmp17692, i64 1
+  %tmp17694 = getelementptr inbounds float* %tmp17693, i64 1
+  %tmp17695 = getelementptr inbounds float* %tmp17694, i64 1
+  %tmp17696 = getelementptr inbounds float* %tmp17695, i64 1
+  %tmp17697 = getelementptr inbounds float* %tmp17696, i64 1
+  %tmp17698 = getelementptr inbounds float* %tmp17697, i64 1
+  %tmp17699 = getelementptr inbounds float* %tmp17698, i64 1
+  %tmp17700 = getelementptr inbounds float* %tmp17699, i64 1
+  %tmp17701 = getelementptr inbounds float* %tmp17700, i64 1
+  %tmp17702 = getelementptr inbounds float* %tmp17701, i64 1
+  %tmp17703 = getelementptr inbounds float* %tmp17702, i64 1
+  %tmp17704 = getelementptr inbounds float* %tmp17703, i64 1
+  %tmp17705 = getelementptr inbounds float* %tmp17704, i64 1
+  %tmp17706 = getelementptr inbounds float* %tmp17705, i64 1
+  %tmp17707 = getelementptr inbounds float* %tmp17706, i64 1
+  %tmp17708 = getelementptr inbounds float* %tmp17707, i64 1
+  %tmp17709 = getelementptr inbounds float* %tmp17708, i64 1
+  %tmp17710 = getelementptr inbounds float* %tmp17709, i64 1
+  %tmp17711 = getelementptr inbounds float* %tmp17710, i64 1
+  %tmp17712 = getelementptr inbounds float* %tmp17711, i64 1
+  %tmp17713 = getelementptr inbounds float* %tmp17712, i64 1
+  %tmp17714 = getelementptr inbounds float* %tmp17713, i64 1
+  %tmp17715 = getelementptr inbounds float* %tmp17714, i64 1
+  %tmp17716 = getelementptr inbounds float* %tmp17715, i64 1
+  %tmp17717 = getelementptr inbounds float* %tmp17716, i64 1
+  %tmp17718 = getelementptr inbounds float* %tmp17717, i64 1
+  %tmp17719 = getelementptr inbounds float* %tmp17718, i64 1
+  %tmp17720 = getelementptr inbounds float* %tmp17719, i64 1
+  %tmp17721 = getelementptr inbounds float* %tmp17720, i64 1
+  %tmp17722 = getelementptr inbounds float* %tmp17721, i64 1
+  %tmp17723 = getelementptr inbounds float* %tmp17722, i64 1
+  %tmp17724 = getelementptr inbounds float* %tmp17723, i64 1
+  %tmp17725 = getelementptr inbounds float* %tmp17724, i64 1
+  %tmp17726 = getelementptr inbounds float* %tmp17725, i64 1
+  %tmp17727 = getelementptr inbounds float* %tmp17726, i64 1
+  %tmp17728 = getelementptr inbounds float* %tmp17727, i64 1
+  %tmp17729 = getelementptr inbounds float* %tmp17728, i64 1
+  %tmp17730 = getelementptr inbounds float* %tmp17729, i64 1
+  %tmp17731 = getelementptr inbounds float* %tmp17730, i64 1
+  %tmp17732 = getelementptr inbounds float* %tmp17731, i64 1
+  %tmp17733 = getelementptr inbounds float* %tmp17732, i64 1
+  %tmp17734 = getelementptr inbounds float* %tmp17733, i64 1
+  %tmp17735 = getelementptr inbounds float* %tmp17734, i64 1
+  %tmp17736 = getelementptr inbounds float* %tmp17735, i64 1
+  %tmp17737 = getelementptr inbounds float* %tmp17736, i64 1
+  %tmp17738 = getelementptr inbounds float* %tmp17737, i64 1
+  %tmp17739 = getelementptr inbounds float* %tmp17738, i64 1
+  %tmp17740 = getelementptr inbounds float* %tmp17739, i64 1
+  %tmp17741 = getelementptr inbounds float* %tmp17740, i64 1
+  %tmp17742 = getelementptr inbounds float* %tmp17741, i64 1
+  %tmp17743 = getelementptr inbounds float* %tmp17742, i64 1
+  %tmp17744 = getelementptr inbounds float* %tmp17743, i64 1
+  %tmp17745 = getelementptr inbounds float* %tmp17744, i64 1
+  %tmp17746 = getelementptr inbounds float* %tmp17745, i64 1
+  %tmp17747 = getelementptr inbounds float* %tmp17746, i64 1
+  %tmp17748 = getelementptr inbounds float* %tmp17747, i64 1
+  %tmp17749 = getelementptr inbounds float* %tmp17748, i64 1
+  %tmp17750 = getelementptr inbounds float* %tmp17749, i64 1
+  %tmp17751 = getelementptr inbounds float* %tmp17750, i64 1
+  %tmp17752 = getelementptr inbounds float* %tmp17751, i64 1
+  %tmp17753 = getelementptr inbounds float* %tmp17752, i64 1
+  %tmp17754 = getelementptr inbounds float* %tmp17753, i64 1
+  %tmp17755 = getelementptr inbounds float* %tmp17754, i64 1
+  %tmp17756 = getelementptr inbounds float* %tmp17755, i64 1
+  %tmp17757 = getelementptr inbounds float* %tmp17756, i64 1
+  %tmp17758 = getelementptr inbounds float* %tmp17757, i64 1
+  %tmp17759 = getelementptr inbounds float* %tmp17758, i64 1
+  %tmp17760 = getelementptr inbounds float* %tmp17759, i64 1
+  %tmp17761 = getelementptr inbounds float* %tmp17760, i64 1
+  %tmp17762 = getelementptr inbounds float* %tmp17761, i64 1
+  %tmp17763 = getelementptr inbounds float* %tmp17762, i64 1
+  %tmp17764 = getelementptr inbounds float* %tmp17763, i64 1
+  %tmp17765 = getelementptr inbounds float* %tmp17764, i64 1
+  %tmp17766 = getelementptr inbounds float* %tmp17765, i64 1
+  %tmp17767 = getelementptr inbounds float* %tmp17766, i64 1
+  %tmp17768 = getelementptr inbounds float* %tmp17767, i64 1
+  %tmp17769 = getelementptr inbounds float* %tmp17768, i64 1
+  %tmp17770 = getelementptr inbounds float* %tmp17769, i64 1
+  %tmp17771 = getelementptr inbounds float* %tmp17770, i64 1
+  %tmp17772 = getelementptr inbounds float* %tmp17771, i64 1
+  %tmp17773 = getelementptr inbounds float* %tmp17772, i64 1
+  %tmp17774 = getelementptr inbounds float* %tmp17773, i64 1
+  %tmp17775 = getelementptr inbounds float* %tmp17774, i64 1
+  %tmp17776 = getelementptr inbounds float* %tmp17775, i64 1
+  %tmp17777 = getelementptr inbounds float* %tmp17776, i64 1
+  %tmp17778 = getelementptr inbounds float* %tmp17777, i64 1
+  %tmp17779 = getelementptr inbounds float* %tmp17778, i64 1
+  %tmp17780 = getelementptr inbounds float* %tmp17779, i64 1
+  %tmp17781 = getelementptr inbounds float* %tmp17780, i64 1
+  %tmp17782 = getelementptr inbounds float* %tmp17781, i64 1
+  %tmp17783 = getelementptr inbounds float* %tmp17782, i64 1
+  %tmp17784 = getelementptr inbounds float* %tmp17783, i64 1
+  %tmp17785 = getelementptr inbounds float* %tmp17784, i64 1
+  %tmp17786 = getelementptr inbounds float* %tmp17785, i64 1
+  %tmp17787 = getelementptr inbounds float* %tmp17786, i64 1
+  %tmp17788 = getelementptr inbounds float* %tmp17787, i64 1
+  %tmp17789 = getelementptr inbounds float* %tmp17788, i64 1
+  %tmp17790 = getelementptr inbounds float* %tmp17789, i64 1
+  %tmp17791 = getelementptr inbounds float* %tmp17790, i64 1
+  %tmp17792 = getelementptr inbounds float* %tmp17791, i64 1
+  %tmp17793 = getelementptr inbounds float* %tmp17792, i64 1
+  %tmp17794 = getelementptr inbounds float* %tmp17793, i64 1
+  %tmp17795 = getelementptr inbounds float* %tmp17794, i64 1
+  %tmp17796 = getelementptr inbounds float* %tmp17795, i64 1
+  %tmp17797 = getelementptr inbounds float* %tmp17796, i64 1
+  %tmp17798 = getelementptr inbounds float* %tmp17797, i64 1
+  %tmp17799 = getelementptr inbounds float* %tmp17798, i64 1
+  %tmp17800 = getelementptr inbounds float* %tmp17799, i64 1
+  %tmp17801 = getelementptr inbounds float* %tmp17800, i64 1
+  %tmp17802 = getelementptr inbounds float* %tmp17801, i64 1
+  %tmp17803 = getelementptr inbounds float* %tmp17802, i64 1
+  %tmp17804 = getelementptr inbounds float* %tmp17803, i64 1
+  %tmp17805 = getelementptr inbounds float* %tmp17804, i64 1
+  %tmp17806 = getelementptr inbounds float* %tmp17805, i64 1
+  %tmp17807 = getelementptr inbounds float* %tmp17806, i64 1
+  %tmp17808 = getelementptr inbounds float* %tmp17807, i64 1
+  %tmp17809 = getelementptr inbounds float* %tmp17808, i64 1
+  %tmp17810 = getelementptr inbounds float* %tmp17809, i64 1
+  %tmp17811 = getelementptr inbounds float* %tmp17810, i64 1
+  %tmp17812 = getelementptr inbounds float* %tmp17811, i64 1
+  %tmp17813 = getelementptr inbounds float* %tmp17812, i64 1
+  %tmp17814 = getelementptr inbounds float* %tmp17813, i64 1
+  %tmp17815 = getelementptr inbounds float* %tmp17814, i64 1
+  %tmp17816 = getelementptr inbounds float* %tmp17815, i64 1
+  %tmp17817 = getelementptr inbounds float* %tmp17816, i64 1
+  %tmp17818 = getelementptr inbounds float* %tmp17817, i64 1
+  %tmp17819 = getelementptr inbounds float* %tmp17818, i64 1
+  %tmp17820 = getelementptr inbounds float* %tmp17819, i64 1
+  %tmp17821 = getelementptr inbounds float* %tmp17820, i64 1
+  %tmp17822 = getelementptr inbounds float* %tmp17821, i64 1
+  %tmp17823 = getelementptr inbounds float* %tmp17822, i64 1
+  %tmp17824 = getelementptr inbounds float* %tmp17823, i64 1
+  %tmp17825 = getelementptr inbounds float* %tmp17824, i64 1
+  %tmp17826 = getelementptr inbounds float* %tmp17825, i64 1
+  %tmp17827 = getelementptr inbounds float* %tmp17826, i64 1
+  %tmp17828 = getelementptr inbounds float* %tmp17827, i64 1
+  %tmp17829 = getelementptr inbounds float* %tmp17828, i64 1
+  %tmp17830 = getelementptr inbounds float* %tmp17829, i64 1
+  %tmp17831 = getelementptr inbounds float* %tmp17830, i64 1
+  %tmp17832 = getelementptr inbounds float* %tmp17831, i64 1
+  %tmp17833 = getelementptr inbounds float* %tmp17832, i64 1
+  %tmp17834 = getelementptr inbounds float* %tmp17833, i64 1
+  %tmp17835 = getelementptr inbounds float* %tmp17834, i64 1
+  %tmp17836 = getelementptr inbounds float* %tmp17835, i64 1
+  %tmp17837 = getelementptr inbounds float* %tmp17836, i64 1
+  %tmp17838 = getelementptr inbounds float* %tmp17837, i64 1
+  %tmp17839 = getelementptr inbounds float* %tmp17838, i64 1
+  %tmp17840 = getelementptr inbounds float* %tmp17839, i64 1
+  %tmp17841 = getelementptr inbounds float* %tmp17840, i64 1
+  %tmp17842 = getelementptr inbounds float* %tmp17841, i64 1
+  %tmp17843 = getelementptr inbounds float* %tmp17842, i64 1
+  %tmp17844 = getelementptr inbounds float* %tmp17843, i64 1
+  %tmp17845 = getelementptr inbounds float* %tmp17844, i64 1
+  %tmp17846 = getelementptr inbounds float* %tmp17845, i64 1
+  %tmp17847 = getelementptr inbounds float* %tmp17846, i64 1
+  %tmp17848 = getelementptr inbounds float* %tmp17847, i64 1
+  %tmp17849 = getelementptr inbounds float* %tmp17848, i64 1
+  %tmp17850 = getelementptr inbounds float* %tmp17849, i64 1
+  %tmp17851 = getelementptr inbounds float* %tmp17850, i64 1
+  %tmp17852 = getelementptr inbounds float* %tmp17851, i64 1
+  %tmp17853 = getelementptr inbounds float* %tmp17852, i64 1
+  %tmp17854 = getelementptr inbounds float* %tmp17853, i64 1
+  %tmp17855 = getelementptr inbounds float* %tmp17854, i64 1
+  %tmp17856 = getelementptr inbounds float* %tmp17855, i64 1
+  %tmp17857 = getelementptr inbounds float* %tmp17856, i64 1
+  %tmp17858 = getelementptr inbounds float* %tmp17857, i64 1
+  %tmp17859 = getelementptr inbounds float* %tmp17858, i64 1
+  %tmp17860 = getelementptr inbounds float* %tmp17859, i64 1
+  %tmp17861 = getelementptr inbounds float* %tmp17860, i64 1
+  %tmp17862 = getelementptr inbounds float* %tmp17861, i64 1
+  %tmp17863 = getelementptr inbounds float* %tmp17862, i64 1
+  %tmp17864 = getelementptr inbounds float* %tmp17863, i64 1
+  %tmp17865 = getelementptr inbounds float* %tmp17864, i64 1
+  %tmp17866 = getelementptr inbounds float* %tmp17865, i64 1
+  %tmp17867 = getelementptr inbounds float* %tmp17866, i64 1
+  %tmp17868 = getelementptr inbounds float* %tmp17867, i64 1
+  %tmp17869 = getelementptr inbounds float* %tmp17868, i64 1
+  %tmp17870 = getelementptr inbounds float* %tmp17869, i64 1
+  %tmp17871 = getelementptr inbounds float* %tmp17870, i64 1
+  %tmp17872 = getelementptr inbounds float* %tmp17871, i64 1
+  %tmp17873 = getelementptr inbounds float* %tmp17872, i64 1
+  %tmp17874 = getelementptr inbounds float* %tmp17873, i64 1
+  %tmp17875 = getelementptr inbounds float* %tmp17874, i64 1
+  %tmp17876 = getelementptr inbounds float* %tmp17875, i64 1
+  %tmp17877 = getelementptr inbounds float* %tmp17876, i64 1
+  %tmp17878 = getelementptr inbounds float* %tmp17877, i64 1
+  %tmp17879 = getelementptr inbounds float* %tmp17878, i64 1
+  %tmp17880 = getelementptr inbounds float* %tmp17879, i64 1
+  %tmp17881 = getelementptr inbounds float* %tmp17880, i64 1
+  %tmp17882 = getelementptr inbounds float* %tmp17881, i64 1
+  %tmp17883 = getelementptr inbounds float* %tmp17882, i64 1
+  %tmp17884 = getelementptr inbounds float* %tmp17883, i64 1
+  %tmp17885 = getelementptr inbounds float* %tmp17884, i64 1
+  %tmp17886 = getelementptr inbounds float* %tmp17885, i64 1
+  %tmp17887 = getelementptr inbounds float* %tmp17886, i64 1
+  %tmp17888 = getelementptr inbounds float* %tmp17887, i64 1
+  %tmp17889 = getelementptr inbounds float* %tmp17888, i64 1
+  %tmp17890 = getelementptr inbounds float* %tmp17889, i64 1
+  %tmp17891 = getelementptr inbounds float* %tmp17890, i64 1
+  %tmp17892 = getelementptr inbounds float* %tmp17891, i64 1
+  %tmp17893 = getelementptr inbounds float* %tmp17892, i64 1
+  %tmp17894 = getelementptr inbounds float* %tmp17893, i64 1
+  %tmp17895 = getelementptr inbounds float* %tmp17894, i64 1
+  %tmp17896 = getelementptr inbounds float* %tmp17895, i64 1
+  %tmp17897 = getelementptr inbounds float* %tmp17896, i64 1
+  %tmp17898 = getelementptr inbounds float* %tmp17897, i64 1
+  %tmp17899 = getelementptr inbounds float* %tmp17898, i64 1
+  %tmp17900 = getelementptr inbounds float* %tmp17899, i64 1
+  %tmp17901 = getelementptr inbounds float* %tmp17900, i64 1
+  %tmp17902 = getelementptr inbounds float* %tmp17901, i64 1
+  %tmp17903 = getelementptr inbounds float* %tmp17902, i64 1
+  %tmp17904 = getelementptr inbounds float* %tmp17903, i64 1
+  %tmp17905 = getelementptr inbounds float* %tmp17904, i64 1
+  %tmp17906 = getelementptr inbounds float* %tmp17905, i64 1
+  %tmp17907 = getelementptr inbounds float* %tmp17906, i64 1
+  %tmp17908 = getelementptr inbounds float* %tmp17907, i64 1
+  %tmp17909 = getelementptr inbounds float* %tmp17908, i64 1
+  %tmp17910 = getelementptr inbounds float* %tmp17909, i64 1
+  %tmp17911 = getelementptr inbounds float* %tmp17910, i64 1
+  %tmp17912 = getelementptr inbounds float* %tmp17911, i64 1
+  %tmp17913 = getelementptr inbounds float* %tmp17912, i64 1
+  %tmp17914 = getelementptr inbounds float* %tmp17913, i64 1
+  %tmp17915 = getelementptr inbounds float* %tmp17914, i64 1
+  %tmp17916 = getelementptr inbounds float* %tmp17915, i64 1
+  %tmp17917 = getelementptr inbounds float* %tmp17916, i64 1
+  %tmp17918 = getelementptr inbounds float* %tmp17917, i64 1
+  %tmp17919 = getelementptr inbounds float* %tmp17918, i64 1
+  %tmp17920 = getelementptr inbounds float* %tmp17919, i64 1
+  %tmp17921 = getelementptr inbounds float* %tmp17920, i64 1
+  %tmp17922 = getelementptr inbounds float* %tmp17921, i64 1
+  %tmp17923 = getelementptr inbounds float* %tmp17922, i64 1
+  %tmp17924 = getelementptr inbounds float* %tmp17923, i64 1
+  %tmp17925 = getelementptr inbounds float* %tmp17924, i64 1
+  %tmp17926 = getelementptr inbounds float* %tmp17925, i64 1
+  %tmp17927 = getelementptr inbounds float* %tmp17926, i64 1
+  %tmp17928 = getelementptr inbounds float* %tmp17927, i64 1
+  %tmp17929 = getelementptr inbounds float* %tmp17928, i64 1
+  %tmp17930 = getelementptr inbounds float* %tmp17929, i64 1
+  %tmp17931 = getelementptr inbounds float* %tmp17930, i64 1
+  %tmp17932 = getelementptr inbounds float* %tmp17931, i64 1
+  %tmp17933 = getelementptr inbounds float* %tmp17932, i64 1
+  %tmp17934 = getelementptr inbounds float* %tmp17933, i64 1
+  %tmp17935 = getelementptr inbounds float* %tmp17934, i64 1
+  %tmp17936 = getelementptr inbounds float* %tmp17935, i64 1
+  %tmp17937 = getelementptr inbounds float* %tmp17936, i64 1
+  %tmp17938 = getelementptr inbounds float* %tmp17937, i64 1
+  %tmp17939 = getelementptr inbounds float* %tmp17938, i64 1
+  %tmp17940 = getelementptr inbounds float* %tmp17939, i64 1
+  %tmp17941 = getelementptr inbounds float* %tmp17940, i64 1
+  %tmp17942 = getelementptr inbounds float* %tmp17941, i64 1
+  %tmp17943 = getelementptr inbounds float* %tmp17942, i64 1
+  %tmp17944 = getelementptr inbounds float* %tmp17943, i64 1
+  %tmp17945 = getelementptr inbounds float* %tmp17944, i64 1
+  %tmp17946 = getelementptr inbounds float* %tmp17945, i64 1
+  %tmp17947 = getelementptr inbounds float* %tmp17946, i64 1
+  %tmp17948 = getelementptr inbounds float* %tmp17947, i64 1
+  %tmp17949 = getelementptr inbounds float* %tmp17948, i64 1
+  %tmp17950 = getelementptr inbounds float* %tmp17949, i64 1
+  %tmp17951 = getelementptr inbounds float* %tmp17950, i64 1
+  %tmp17952 = getelementptr inbounds float* %tmp17951, i64 1
+  %tmp17953 = getelementptr inbounds float* %tmp17952, i64 1
+  %tmp17954 = getelementptr inbounds float* %tmp17953, i64 1
+  %tmp17955 = getelementptr inbounds float* %tmp17954, i64 1
+  %tmp17956 = getelementptr inbounds float* %tmp17955, i64 1
+  %tmp17957 = getelementptr inbounds float* %tmp17956, i64 1
+  %tmp17958 = getelementptr inbounds float* %tmp17957, i64 1
+  %tmp17959 = getelementptr inbounds float* %tmp17958, i64 1
+  %tmp17960 = getelementptr inbounds float* %tmp17959, i64 1
+  %tmp17961 = getelementptr inbounds float* %tmp17960, i64 1
+  %tmp17962 = getelementptr inbounds float* %tmp17961, i64 1
+  %tmp17963 = getelementptr inbounds float* %tmp17962, i64 1
+  %tmp17964 = getelementptr inbounds float* %tmp17963, i64 1
+  %tmp17965 = getelementptr inbounds float* %tmp17964, i64 1
+  %tmp17966 = getelementptr inbounds float* %tmp17965, i64 1
+  %tmp17967 = getelementptr inbounds float* %tmp17966, i64 1
+  %tmp17968 = getelementptr inbounds float* %tmp17967, i64 1
+  %tmp17969 = getelementptr inbounds float* %tmp17968, i64 1
+  %tmp17970 = getelementptr inbounds float* %tmp17969, i64 1
+  %tmp17971 = getelementptr inbounds float* %tmp17970, i64 1
+  %tmp17972 = getelementptr inbounds float* %tmp17971, i64 1
+  %tmp17973 = getelementptr inbounds float* %tmp17972, i64 1
+  %tmp17974 = getelementptr inbounds float* %tmp17973, i64 1
+  %tmp17975 = getelementptr inbounds float* %tmp17974, i64 1
+  %tmp17976 = getelementptr inbounds float* %tmp17975, i64 1
+  %tmp17977 = getelementptr inbounds float* %tmp17976, i64 1
+  %tmp17978 = getelementptr inbounds float* %tmp17977, i64 1
+  %tmp17979 = getelementptr inbounds float* %tmp17978, i64 1
+  %tmp17980 = getelementptr inbounds float* %tmp17979, i64 1
+  %tmp17981 = getelementptr inbounds float* %tmp17980, i64 1
+  %tmp17982 = getelementptr inbounds float* %tmp17981, i64 1
+  %tmp17983 = getelementptr inbounds float* %tmp17982, i64 1
+  %tmp17984 = getelementptr inbounds float* %tmp17983, i64 1
+  %tmp17985 = getelementptr inbounds float* %tmp17984, i64 1
+  %tmp17986 = getelementptr inbounds float* %tmp17985, i64 1
+  %tmp17987 = getelementptr inbounds float* %tmp17986, i64 1
+  %tmp17988 = getelementptr inbounds float* %tmp17987, i64 1
+  %tmp17989 = getelementptr inbounds float* %tmp17988, i64 1
+  %tmp17990 = getelementptr inbounds float* %tmp17989, i64 1
+  %tmp17991 = getelementptr inbounds float* %tmp17990, i64 1
+  %tmp17992 = getelementptr inbounds float* %tmp17991, i64 1
+  %tmp17993 = getelementptr inbounds float* %tmp17992, i64 1
+  %tmp17994 = getelementptr inbounds float* %tmp17993, i64 1
+  %tmp17995 = getelementptr inbounds float* %tmp17994, i64 1
+  %tmp17996 = getelementptr inbounds float* %tmp17995, i64 1
+  %tmp17997 = getelementptr inbounds float* %tmp17996, i64 1
+  %tmp17998 = getelementptr inbounds float* %tmp17997, i64 1
+  %tmp17999 = getelementptr inbounds float* %tmp17998, i64 1
+  %tmp18000 = getelementptr inbounds float* %tmp17999, i64 1
+  %tmp18001 = getelementptr inbounds float* %tmp18000, i64 1
+  %tmp18002 = getelementptr inbounds float* %tmp18001, i64 1
+  %tmp18003 = getelementptr inbounds float* %tmp18002, i64 1
+  %tmp18004 = getelementptr inbounds float* %tmp18003, i64 1
+  %tmp18005 = getelementptr inbounds float* %tmp18004, i64 1
+  %tmp18006 = getelementptr inbounds float* %tmp18005, i64 1
+  %tmp18007 = getelementptr inbounds float* %tmp18006, i64 1
+  %tmp18008 = getelementptr inbounds float* %tmp18007, i64 1
+  %tmp18009 = getelementptr inbounds float* %tmp18008, i64 1
+  %tmp18010 = getelementptr inbounds float* %tmp18009, i64 1
+  %tmp18011 = getelementptr inbounds float* %tmp18010, i64 1
+  %tmp18012 = getelementptr inbounds float* %tmp18011, i64 1
+  %tmp18013 = getelementptr inbounds float* %tmp18012, i64 1
+  %tmp18014 = getelementptr inbounds float* %tmp18013, i64 1
+  %tmp18015 = getelementptr inbounds float* %tmp18014, i64 1
+  %tmp18016 = getelementptr inbounds float* %tmp18015, i64 1
+  %tmp18017 = getelementptr inbounds float* %tmp18016, i64 1
+  %tmp18018 = getelementptr inbounds float* %tmp18017, i64 1
+  %tmp18019 = getelementptr inbounds float* %tmp18018, i64 1
+  %tmp18020 = getelementptr inbounds float* %tmp18019, i64 1
+  %tmp18021 = getelementptr inbounds float* %tmp18020, i64 1
+  %tmp18022 = getelementptr inbounds float* %tmp18021, i64 1
+  %tmp18023 = getelementptr inbounds float* %tmp18022, i64 1
+  %tmp18024 = getelementptr inbounds float* %tmp18023, i64 1
+  %tmp18025 = getelementptr inbounds float* %tmp18024, i64 1
+  %tmp18026 = getelementptr inbounds float* %tmp18025, i64 1
+  %tmp18027 = getelementptr inbounds float* %tmp18026, i64 1
+  %tmp18028 = getelementptr inbounds float* %tmp18027, i64 1
+  %tmp18029 = getelementptr inbounds float* %tmp18028, i64 1
+  %tmp18030 = getelementptr inbounds float* %tmp18029, i64 1
+  %tmp18031 = getelementptr inbounds float* %tmp18030, i64 1
+  %tmp18032 = getelementptr inbounds float* %tmp18031, i64 1
+  %tmp18033 = getelementptr inbounds float* %tmp18032, i64 1
+  %tmp18034 = getelementptr inbounds float* %tmp18033, i64 1
+  %tmp18035 = getelementptr inbounds float* %tmp18034, i64 1
+  %tmp18036 = getelementptr inbounds float* %tmp18035, i64 1
+  %tmp18037 = getelementptr inbounds float* %tmp18036, i64 1
+  %tmp18038 = getelementptr inbounds float* %tmp18037, i64 1
+  %tmp18039 = getelementptr inbounds float* %tmp18038, i64 1
+  %tmp18040 = getelementptr inbounds float* %tmp18039, i64 1
+  %tmp18041 = getelementptr inbounds float* %tmp18040, i64 1
+  %tmp18042 = getelementptr inbounds float* %tmp18041, i64 1
+  %tmp18043 = getelementptr inbounds float* %tmp18042, i64 1
+  %tmp18044 = getelementptr inbounds float* %tmp18043, i64 1
+  %tmp18045 = getelementptr inbounds float* %tmp18044, i64 1
+  %tmp18046 = getelementptr inbounds float* %tmp18045, i64 1
+  %tmp18047 = getelementptr inbounds float* %tmp18046, i64 1
+  %tmp18048 = getelementptr inbounds float* %tmp18047, i64 1
+  %tmp18049 = getelementptr inbounds float* %tmp18048, i64 1
+  %tmp18050 = getelementptr inbounds float* %tmp18049, i64 1
+  %tmp18051 = getelementptr inbounds float* %tmp18050, i64 1
+  %tmp18052 = getelementptr inbounds float* %tmp18051, i64 1
+  %tmp18053 = getelementptr inbounds float* %tmp18052, i64 1
+  %tmp18054 = getelementptr inbounds float* %tmp18053, i64 1
+  %tmp18055 = getelementptr inbounds float* %tmp18054, i64 1
+  %tmp18056 = getelementptr inbounds float* %tmp18055, i64 1
+  %tmp18057 = getelementptr inbounds float* %tmp18056, i64 1
+  %tmp18058 = getelementptr inbounds float* %tmp18057, i64 1
+  %tmp18059 = getelementptr inbounds float* %tmp18058, i64 1
+  %tmp18060 = getelementptr inbounds float* %tmp18059, i64 1
+  %tmp18061 = getelementptr inbounds float* %tmp18060, i64 1
+  %tmp18062 = getelementptr inbounds float* %tmp18061, i64 1
+  %tmp18063 = getelementptr inbounds float* %tmp18062, i64 1
+  %tmp18064 = getelementptr inbounds float* %tmp18063, i64 1
+  %tmp18065 = getelementptr inbounds float* %tmp18064, i64 1
+  %tmp18066 = getelementptr inbounds float* %tmp18065, i64 1
+  %tmp18067 = getelementptr inbounds float* %tmp18066, i64 1
+  %tmp18068 = getelementptr inbounds float* %tmp18067, i64 1
+  %tmp18069 = getelementptr inbounds float* %tmp18068, i64 1
+  %tmp18070 = getelementptr inbounds float* %tmp18069, i64 1
+  %tmp18071 = getelementptr inbounds float* %tmp18070, i64 1
+  %tmp18072 = getelementptr inbounds float* %tmp18071, i64 1
+  %tmp18073 = getelementptr inbounds float* %tmp18072, i64 1
+  %tmp18074 = getelementptr inbounds float* %tmp18073, i64 1
+  %tmp18075 = getelementptr inbounds float* %tmp18074, i64 1
+  %tmp18076 = getelementptr inbounds float* %tmp18075, i64 1
+  %tmp18077 = getelementptr inbounds float* %tmp18076, i64 1
+  %tmp18078 = getelementptr inbounds float* %tmp18077, i64 1
+  %tmp18079 = getelementptr inbounds float* %tmp18078, i64 1
+  %tmp18080 = getelementptr inbounds float* %tmp18079, i64 1
+  %tmp18081 = getelementptr inbounds float* %tmp18080, i64 1
+  %tmp18082 = getelementptr inbounds float* %tmp18081, i64 1
+  %tmp18083 = getelementptr inbounds float* %tmp18082, i64 1
+  %tmp18084 = getelementptr inbounds float* %tmp18083, i64 1
+  %tmp18085 = getelementptr inbounds float* %tmp18084, i64 1
+  %tmp18086 = getelementptr inbounds float* %tmp18085, i64 1
+  %tmp18087 = getelementptr inbounds float* %tmp18086, i64 1
+  %tmp18088 = getelementptr inbounds float* %tmp18087, i64 1
+  %tmp18089 = getelementptr inbounds float* %tmp18088, i64 1
+  %tmp18090 = getelementptr inbounds float* %tmp18089, i64 1
+  %tmp18091 = getelementptr inbounds float* %tmp18090, i64 1
+  %tmp18092 = getelementptr inbounds float* %tmp18091, i64 1
+  %tmp18093 = getelementptr inbounds float* %tmp18092, i64 1
+  %tmp18094 = getelementptr inbounds float* %tmp18093, i64 1
+  %tmp18095 = getelementptr inbounds float* %tmp18094, i64 1
+  %tmp18096 = getelementptr inbounds float* %tmp18095, i64 1
+  %tmp18097 = getelementptr inbounds float* %tmp18096, i64 1
+  %tmp18098 = getelementptr inbounds float* %tmp18097, i64 1
+  %tmp18099 = getelementptr inbounds float* %tmp18098, i64 1
+  %tmp18100 = getelementptr inbounds float* %tmp18099, i64 1
+  %tmp18101 = getelementptr inbounds float* %tmp18100, i64 1
+  %tmp18102 = getelementptr inbounds float* %tmp18101, i64 1
+  %tmp18103 = getelementptr inbounds float* %tmp18102, i64 1
+  %tmp18104 = getelementptr inbounds float* %tmp18103, i64 1
+  %tmp18105 = getelementptr inbounds float* %tmp18104, i64 1
+  %tmp18106 = getelementptr inbounds float* %tmp18105, i64 1
+  %tmp18107 = getelementptr inbounds float* %tmp18106, i64 1
+  %tmp18108 = getelementptr inbounds float* %tmp18107, i64 1
+  %tmp18109 = getelementptr inbounds float* %tmp18108, i64 1
+  %tmp18110 = getelementptr inbounds float* %tmp18109, i64 1
+  %tmp18111 = getelementptr inbounds float* %tmp18110, i64 1
+  %tmp18112 = getelementptr inbounds float* %tmp18111, i64 1
+  %tmp18113 = getelementptr inbounds float* %tmp18112, i64 1
+  %tmp18114 = getelementptr inbounds float* %tmp18113, i64 1
+  %tmp18115 = getelementptr inbounds float* %tmp18114, i64 1
+  %tmp18116 = getelementptr inbounds float* %tmp18115, i64 1
+  %tmp18117 = getelementptr inbounds float* %tmp18116, i64 1
+  %tmp18118 = getelementptr inbounds float* %tmp18117, i64 1
+  %tmp18119 = getelementptr inbounds float* %tmp18118, i64 1
+  %tmp18120 = getelementptr inbounds float* %tmp18119, i64 1
+  %tmp18121 = getelementptr inbounds float* %tmp18120, i64 1
+  %tmp18122 = getelementptr inbounds float* %tmp18121, i64 1
+  %tmp18123 = getelementptr inbounds float* %tmp18122, i64 1
+  %tmp18124 = getelementptr inbounds float* %tmp18123, i64 1
+  %tmp18125 = getelementptr inbounds float* %tmp18124, i64 1
+  %tmp18126 = getelementptr inbounds float* %tmp18125, i64 1
+  %tmp18127 = getelementptr inbounds float* %tmp18126, i64 1
+  %tmp18128 = getelementptr inbounds float* %tmp18127, i64 1
+  %tmp18129 = getelementptr inbounds float* %tmp18128, i64 1
+  %tmp18130 = getelementptr inbounds float* %tmp18129, i64 1
+  %tmp18131 = getelementptr inbounds float* %tmp18130, i64 1
+  %tmp18132 = getelementptr inbounds float* %tmp18131, i64 1
+  %tmp18133 = getelementptr inbounds float* %tmp18132, i64 1
+  %tmp18134 = getelementptr inbounds float* %tmp18133, i64 1
+  %tmp18135 = getelementptr inbounds float* %tmp18134, i64 1
+  %tmp18136 = getelementptr inbounds float* %tmp18135, i64 1
+  %tmp18137 = getelementptr inbounds float* %tmp18136, i64 1
+  %tmp18138 = getelementptr inbounds float* %tmp18137, i64 1
+  %tmp18139 = getelementptr inbounds float* %tmp18138, i64 1
+  %tmp18140 = getelementptr inbounds float* %tmp18139, i64 1
+  %tmp18141 = getelementptr inbounds float* %tmp18140, i64 1
+  %tmp18142 = getelementptr inbounds float* %tmp18141, i64 1
+  %tmp18143 = getelementptr inbounds float* %tmp18142, i64 1
+  %tmp18144 = getelementptr inbounds float* %tmp18143, i64 1
+  %tmp18145 = getelementptr inbounds float* %tmp18144, i64 1
+  %tmp18146 = getelementptr inbounds float* %tmp18145, i64 1
+  %tmp18147 = getelementptr inbounds float* %tmp18146, i64 1
+  %tmp18148 = getelementptr inbounds float* %tmp18147, i64 1
+  %tmp18149 = getelementptr inbounds float* %tmp18148, i64 1
+  %tmp18150 = getelementptr inbounds float* %tmp18149, i64 1
+  %tmp18151 = getelementptr inbounds float* %tmp18150, i64 1
+  %tmp18152 = getelementptr inbounds float* %tmp18151, i64 1
+  %tmp18153 = getelementptr inbounds float* %tmp18152, i64 1
+  %tmp18154 = getelementptr inbounds float* %tmp18153, i64 1
+  %tmp18155 = getelementptr inbounds float* %tmp18154, i64 1
+  %tmp18156 = getelementptr inbounds float* %tmp18155, i64 1
+  %tmp18157 = getelementptr inbounds float* %tmp18156, i64 1
+  %tmp18158 = getelementptr inbounds float* %tmp18157, i64 1
+  %tmp18159 = getelementptr inbounds float* %tmp18158, i64 1
+  %tmp18160 = getelementptr inbounds float* %tmp18159, i64 1
+  %tmp18161 = getelementptr inbounds float* %tmp18160, i64 1
+  %tmp18162 = getelementptr inbounds float* %tmp18161, i64 1
+  %tmp18163 = getelementptr inbounds float* %tmp18162, i64 1
+  %tmp18164 = getelementptr inbounds float* %tmp18163, i64 1
+  %tmp18165 = getelementptr inbounds float* %tmp18164, i64 1
+  %tmp18166 = getelementptr inbounds float* %tmp18165, i64 1
+  %tmp18167 = getelementptr inbounds float* %tmp18166, i64 1
+  %tmp18168 = getelementptr inbounds float* %tmp18167, i64 1
+  %tmp18169 = getelementptr inbounds float* %tmp18168, i64 1
+  %tmp18170 = getelementptr inbounds float* %tmp18169, i64 1
+  %tmp18171 = getelementptr inbounds float* %tmp18170, i64 1
+  %tmp18172 = getelementptr inbounds float* %tmp18171, i64 1
+  %tmp18173 = getelementptr inbounds float* %tmp18172, i64 1
+  %tmp18174 = getelementptr inbounds float* %tmp18173, i64 1
+  %tmp18175 = getelementptr inbounds float* %tmp18174, i64 1
+  %tmp18176 = getelementptr inbounds float* %tmp18175, i64 1
+  %tmp18177 = getelementptr inbounds float* %tmp18176, i64 1
+  %tmp18178 = getelementptr inbounds float* %tmp18177, i64 1
+  %tmp18179 = getelementptr inbounds float* %tmp18178, i64 1
+  %tmp18180 = getelementptr inbounds float* %tmp18179, i64 1
+  %tmp18181 = getelementptr inbounds float* %tmp18180, i64 1
+  %tmp18182 = getelementptr inbounds float* %tmp18181, i64 1
+  %tmp18183 = getelementptr inbounds float* %tmp18182, i64 1
+  %tmp18184 = getelementptr inbounds float* %tmp18183, i64 1
+  %tmp18185 = getelementptr inbounds float* %tmp18184, i64 1
+  %tmp18186 = getelementptr inbounds float* %tmp18185, i64 1
+  %tmp18187 = getelementptr inbounds float* %tmp18186, i64 1
+  %tmp18188 = getelementptr inbounds float* %tmp18187, i64 1
+  %tmp18189 = getelementptr inbounds float* %tmp18188, i64 1
+  %tmp18190 = getelementptr inbounds float* %tmp18189, i64 1
+  %tmp18191 = getelementptr inbounds float* %tmp18190, i64 1
+  %tmp18192 = getelementptr inbounds float* %tmp18191, i64 1
+  %tmp18193 = getelementptr inbounds float* %tmp18192, i64 1
+  %tmp18194 = getelementptr inbounds float* %tmp18193, i64 1
+  %tmp18195 = getelementptr inbounds float* %tmp18194, i64 1
+  %tmp18196 = getelementptr inbounds float* %tmp18195, i64 1
+  %tmp18197 = getelementptr inbounds float* %tmp18196, i64 1
+  %tmp18198 = getelementptr inbounds float* %tmp18197, i64 1
+  %tmp18199 = getelementptr inbounds float* %tmp18198, i64 1
+  %tmp18200 = getelementptr inbounds float* %tmp18199, i64 1
+  %tmp18201 = getelementptr inbounds float* %tmp18200, i64 1
+  %tmp18202 = getelementptr inbounds float* %tmp18201, i64 1
+  %tmp18203 = getelementptr inbounds float* %tmp18202, i64 1
+  %tmp18204 = getelementptr inbounds float* %tmp18203, i64 1
+  %tmp18205 = getelementptr inbounds float* %tmp18204, i64 1
+  %tmp18206 = getelementptr inbounds float* %tmp18205, i64 1
+  %tmp18207 = getelementptr inbounds float* %tmp18206, i64 1
+  %tmp18208 = getelementptr inbounds float* %tmp18207, i64 1
+  %tmp18209 = getelementptr inbounds float* %tmp18208, i64 1
+  %tmp18210 = getelementptr inbounds float* %tmp18209, i64 1
+  %tmp18211 = getelementptr inbounds float* %tmp18210, i64 1
+  %tmp18212 = getelementptr inbounds float* %tmp18211, i64 1
+  %tmp18213 = getelementptr inbounds float* %tmp18212, i64 1
+  %tmp18214 = getelementptr inbounds float* %tmp18213, i64 1
+  %tmp18215 = getelementptr inbounds float* %tmp18214, i64 1
+  %tmp18216 = getelementptr inbounds float* %tmp18215, i64 1
+  %tmp18217 = getelementptr inbounds float* %tmp18216, i64 1
+  %tmp18218 = getelementptr inbounds float* %tmp18217, i64 1
+  %tmp18219 = getelementptr inbounds float* %tmp18218, i64 1
+  %tmp18220 = getelementptr inbounds float* %tmp18219, i64 1
+  %tmp18221 = getelementptr inbounds float* %tmp18220, i64 1
+  %tmp18222 = getelementptr inbounds float* %tmp18221, i64 1
+  %tmp18223 = getelementptr inbounds float* %tmp18222, i64 1
+  %tmp18224 = getelementptr inbounds float* %tmp18223, i64 1
+  %tmp18225 = getelementptr inbounds float* %tmp18224, i64 1
+  %tmp18226 = getelementptr inbounds float* %tmp18225, i64 1
+  %tmp18227 = getelementptr inbounds float* %tmp18226, i64 1
+  %tmp18228 = getelementptr inbounds float* %tmp18227, i64 1
+  %tmp18229 = getelementptr inbounds float* %tmp18228, i64 1
+  %tmp18230 = getelementptr inbounds float* %tmp18229, i64 1
+  %tmp18231 = getelementptr inbounds float* %tmp18230, i64 1
+  %tmp18232 = getelementptr inbounds float* %tmp18231, i64 1
+  %tmp18233 = getelementptr inbounds float* %tmp18232, i64 1
+  %tmp18234 = getelementptr inbounds float* %tmp18233, i64 1
+  %tmp18235 = getelementptr inbounds float* %tmp18234, i64 1
+  %tmp18236 = getelementptr inbounds float* %tmp18235, i64 1
+  %tmp18237 = getelementptr inbounds float* %tmp18236, i64 1
+  %tmp18238 = getelementptr inbounds float* %tmp18237, i64 1
+  %tmp18239 = getelementptr inbounds float* %tmp18238, i64 1
+  %tmp18240 = getelementptr inbounds float* %tmp18239, i64 1
+  %tmp18241 = getelementptr inbounds float* %tmp18240, i64 1
+  %tmp18242 = getelementptr inbounds float* %tmp18241, i64 1
+  %tmp18243 = getelementptr inbounds float* %tmp18242, i64 1
+  %tmp18244 = getelementptr inbounds float* %tmp18243, i64 1
+  %tmp18245 = getelementptr inbounds float* %tmp18244, i64 1
+  %tmp18246 = getelementptr inbounds float* %tmp18245, i64 1
+  %tmp18247 = getelementptr inbounds float* %tmp18246, i64 1
+  %tmp18248 = getelementptr inbounds float* %tmp18247, i64 1
+  %tmp18249 = getelementptr inbounds float* %tmp18248, i64 1
+  %tmp18250 = getelementptr inbounds float* %tmp18249, i64 1
+  %tmp18251 = getelementptr inbounds float* %tmp18250, i64 1
+  %tmp18252 = getelementptr inbounds float* %tmp18251, i64 1
+  %tmp18253 = getelementptr inbounds float* %tmp18252, i64 1
+  %tmp18254 = getelementptr inbounds float* %tmp18253, i64 1
+  %tmp18255 = getelementptr inbounds float* %tmp18254, i64 1
+  %tmp18256 = getelementptr inbounds float* %tmp18255, i64 1
+  %tmp18257 = getelementptr inbounds float* %tmp18256, i64 1
+  %tmp18258 = getelementptr inbounds float* %tmp18257, i64 1
+  %tmp18259 = getelementptr inbounds float* %tmp18258, i64 1
+  %tmp18260 = getelementptr inbounds float* %tmp18259, i64 1
+  %tmp18261 = getelementptr inbounds float* %tmp18260, i64 1
+  %tmp18262 = getelementptr inbounds float* %tmp18261, i64 1
+  %tmp18263 = getelementptr inbounds float* %tmp18262, i64 1
+  %tmp18264 = getelementptr inbounds float* %tmp18263, i64 1
+  %tmp18265 = getelementptr inbounds float* %tmp18264, i64 1
+  %tmp18266 = getelementptr inbounds float* %tmp18265, i64 1
+  %tmp18267 = getelementptr inbounds float* %tmp18266, i64 1
+  %tmp18268 = getelementptr inbounds float* %tmp18267, i64 1
+  %tmp18269 = getelementptr inbounds float* %tmp18268, i64 1
+  %tmp18270 = getelementptr inbounds float* %tmp18269, i64 1
+  %tmp18271 = getelementptr inbounds float* %tmp18270, i64 1
+  %tmp18272 = getelementptr inbounds float* %tmp18271, i64 1
+  %tmp18273 = getelementptr inbounds float* %tmp18272, i64 1
+  %tmp18274 = getelementptr inbounds float* %tmp18273, i64 1
+  %tmp18275 = getelementptr inbounds float* %tmp18274, i64 1
+  %tmp18276 = getelementptr inbounds float* %tmp18275, i64 1
+  %tmp18277 = getelementptr inbounds float* %tmp18276, i64 1
+  %tmp18278 = getelementptr inbounds float* %tmp18277, i64 1
+  %tmp18279 = getelementptr inbounds float* %tmp18278, i64 1
+  %tmp18280 = getelementptr inbounds float* %tmp18279, i64 1
+  %tmp18281 = getelementptr inbounds float* %tmp18280, i64 1
+  %tmp18282 = getelementptr inbounds float* %tmp18281, i64 1
+  %tmp18283 = getelementptr inbounds float* %tmp18282, i64 1
+  %tmp18284 = getelementptr inbounds float* %tmp18283, i64 1
+  %tmp18285 = getelementptr inbounds float* %tmp18284, i64 1
+  %tmp18286 = getelementptr inbounds float* %tmp18285, i64 1
+  %tmp18287 = getelementptr inbounds float* %tmp18286, i64 1
+  %tmp18288 = getelementptr inbounds float* %tmp18287, i64 1
+  %tmp18289 = getelementptr inbounds float* %tmp18288, i64 1
+  %tmp18290 = getelementptr inbounds float* %tmp18289, i64 1
+  %tmp18291 = getelementptr inbounds float* %tmp18290, i64 1
+  %tmp18292 = getelementptr inbounds float* %tmp18291, i64 1
+  %tmp18293 = getelementptr inbounds float* %tmp18292, i64 1
+  %tmp18294 = getelementptr inbounds float* %tmp18293, i64 1
+  %tmp18295 = getelementptr inbounds float* %tmp18294, i64 1
+  %tmp18296 = getelementptr inbounds float* %tmp18295, i64 1
+  %tmp18297 = getelementptr inbounds float* %tmp18296, i64 1
+  %tmp18298 = getelementptr inbounds float* %tmp18297, i64 1
+  %tmp18299 = getelementptr inbounds float* %tmp18298, i64 1
+  %tmp18300 = getelementptr inbounds float* %tmp18299, i64 1
+  %tmp18301 = getelementptr inbounds float* %tmp18300, i64 1
+  %tmp18302 = getelementptr inbounds float* %tmp18301, i64 1
+  %tmp18303 = getelementptr inbounds float* %tmp18302, i64 1
+  %tmp18304 = getelementptr inbounds float* %tmp18303, i64 1
+  %tmp18305 = getelementptr inbounds float* %tmp18304, i64 1
+  %tmp18306 = getelementptr inbounds float* %tmp18305, i64 1
+  %tmp18307 = getelementptr inbounds float* %tmp18306, i64 1
+  %tmp18308 = getelementptr inbounds float* %tmp18307, i64 1
+  %tmp18309 = getelementptr inbounds float* %tmp18308, i64 1
+  %tmp18310 = getelementptr inbounds float* %tmp18309, i64 1
+  %tmp18311 = getelementptr inbounds float* %tmp18310, i64 1
+  %tmp18312 = getelementptr inbounds float* %tmp18311, i64 1
+  %tmp18313 = getelementptr inbounds float* %tmp18312, i64 1
+  %tmp18314 = getelementptr inbounds float* %tmp18313, i64 1
+  %tmp18315 = getelementptr inbounds float* %tmp18314, i64 1
+  %tmp18316 = getelementptr inbounds float* %tmp18315, i64 1
+  %tmp18317 = getelementptr inbounds float* %tmp18316, i64 1
+  %tmp18318 = getelementptr inbounds float* %tmp18317, i64 1
+  %tmp18319 = getelementptr inbounds float* %tmp18318, i64 1
+  %tmp18320 = getelementptr inbounds float* %tmp18319, i64 1
+  %tmp18321 = getelementptr inbounds float* %tmp18320, i64 1
+  %tmp18322 = getelementptr inbounds float* %tmp18321, i64 1
+  %tmp18323 = getelementptr inbounds float* %tmp18322, i64 1
+  %tmp18324 = getelementptr inbounds float* %tmp18323, i64 1
+  %tmp18325 = getelementptr inbounds float* %tmp18324, i64 1
+  %tmp18326 = getelementptr inbounds float* %tmp18325, i64 1
+  %tmp18327 = getelementptr inbounds float* %tmp18326, i64 1
+  %tmp18328 = getelementptr inbounds float* %tmp18327, i64 1
+  %tmp18329 = getelementptr inbounds float* %tmp18328, i64 1
+  %tmp18330 = getelementptr inbounds float* %tmp18329, i64 1
+  %tmp18331 = getelementptr inbounds float* %tmp18330, i64 1
+  %tmp18332 = getelementptr inbounds float* %tmp18331, i64 1
+  %tmp18333 = getelementptr inbounds float* %tmp18332, i64 1
+  %tmp18334 = getelementptr inbounds float* %tmp18333, i64 1
+  %tmp18335 = getelementptr inbounds float* %tmp18334, i64 1
+  %tmp18336 = getelementptr inbounds float* %tmp18335, i64 1
+  %tmp18337 = getelementptr inbounds float* %tmp18336, i64 1
+  %tmp18338 = getelementptr inbounds float* %tmp18337, i64 1
+  %tmp18339 = getelementptr inbounds float* %tmp18338, i64 1
+  %tmp18340 = getelementptr inbounds float* %tmp18339, i64 1
+  %tmp18341 = getelementptr inbounds float* %tmp18340, i64 1
+  %tmp18342 = getelementptr inbounds float* %tmp18341, i64 1
+  %tmp18343 = getelementptr inbounds float* %tmp18342, i64 1
+  %tmp18344 = getelementptr inbounds float* %tmp18343, i64 1
+  %tmp18345 = getelementptr inbounds float* %tmp18344, i64 1
+  %tmp18346 = getelementptr inbounds float* %tmp18345, i64 1
+  %tmp18347 = getelementptr inbounds float* %tmp18346, i64 1
+  %tmp18348 = getelementptr inbounds float* %tmp18347, i64 1
+  %tmp18349 = getelementptr inbounds float* %tmp18348, i64 1
+  %tmp18350 = getelementptr inbounds float* %tmp18349, i64 1
+  %tmp18351 = getelementptr inbounds float* %tmp18350, i64 1
+  %tmp18352 = getelementptr inbounds float* %tmp18351, i64 1
+  %tmp18353 = getelementptr inbounds float* %tmp18352, i64 1
+  %tmp18354 = getelementptr inbounds float* %tmp18353, i64 1
+  %tmp18355 = getelementptr inbounds float* %tmp18354, i64 1
+  %tmp18356 = getelementptr inbounds float* %tmp18355, i64 1
+  %tmp18357 = getelementptr inbounds float* %tmp18356, i64 1
+  %tmp18358 = getelementptr inbounds float* %tmp18357, i64 1
+  %tmp18359 = getelementptr inbounds float* %tmp18358, i64 1
+  %tmp18360 = getelementptr inbounds float* %tmp18359, i64 1
+  %tmp18361 = getelementptr inbounds float* %tmp18360, i64 1
+  %tmp18362 = getelementptr inbounds float* %tmp18361, i64 1
+  %tmp18363 = getelementptr inbounds float* %tmp18362, i64 1
+  %tmp18364 = getelementptr inbounds float* %tmp18363, i64 1
+  %tmp18365 = getelementptr inbounds float* %tmp18364, i64 1
+  %tmp18366 = getelementptr inbounds float* %tmp18365, i64 1
+  %tmp18367 = getelementptr inbounds float* %tmp18366, i64 1
+  %tmp18368 = getelementptr inbounds float* %tmp18367, i64 1
+  %tmp18369 = getelementptr inbounds float* %tmp18368, i64 1
+  %tmp18370 = getelementptr inbounds float* %tmp18369, i64 1
+  %tmp18371 = getelementptr inbounds float* %tmp18370, i64 1
+  %tmp18372 = getelementptr inbounds float* %tmp18371, i64 1
+  %tmp18373 = getelementptr inbounds float* %tmp18372, i64 1
+  %tmp18374 = getelementptr inbounds float* %tmp18373, i64 1
+  %tmp18375 = getelementptr inbounds float* %tmp18374, i64 1
+  %tmp18376 = getelementptr inbounds float* %tmp18375, i64 1
+  %tmp18377 = getelementptr inbounds float* %tmp18376, i64 1
+  %tmp18378 = getelementptr inbounds float* %tmp18377, i64 1
+  %tmp18379 = getelementptr inbounds float* %tmp18378, i64 1
+  %tmp18380 = getelementptr inbounds float* %tmp18379, i64 1
+  %tmp18381 = getelementptr inbounds float* %tmp18380, i64 1
+  %tmp18382 = getelementptr inbounds float* %tmp18381, i64 1
+  %tmp18383 = getelementptr inbounds float* %tmp18382, i64 1
+  %tmp18384 = getelementptr inbounds float* %tmp18383, i64 1
+  %tmp18385 = getelementptr inbounds float* %tmp18384, i64 1
+  %tmp18386 = getelementptr inbounds float* %tmp18385, i64 1
+  %tmp18387 = getelementptr inbounds float* %tmp18386, i64 1
+  %tmp18388 = getelementptr inbounds float* %tmp18387, i64 1
+  %tmp18389 = getelementptr inbounds float* %tmp18388, i64 1
+  %tmp18390 = getelementptr inbounds float* %tmp18389, i64 1
+  %tmp18391 = getelementptr inbounds float* %tmp18390, i64 1
+  %tmp18392 = getelementptr inbounds float* %tmp18391, i64 1
+  %tmp18393 = getelementptr inbounds float* %tmp18392, i64 1
+  %tmp18394 = getelementptr inbounds float* %tmp18393, i64 1
+  %tmp18395 = getelementptr inbounds float* %tmp18394, i64 1
+  %tmp18396 = getelementptr inbounds float* %tmp18395, i64 1
+  %tmp18397 = getelementptr inbounds float* %tmp18396, i64 1
+  %tmp18398 = getelementptr inbounds float* %tmp18397, i64 1
+  %tmp18399 = getelementptr inbounds float* %tmp18398, i64 1
+  %tmp18400 = getelementptr inbounds float* %tmp18399, i64 1
+  %tmp18401 = getelementptr inbounds float* %tmp18400, i64 1
+  %tmp18402 = getelementptr inbounds float* %tmp18401, i64 1
+  %tmp18403 = getelementptr inbounds float* %tmp18402, i64 1
+  %tmp18404 = getelementptr inbounds float* %tmp18403, i64 1
+  %tmp18405 = getelementptr inbounds float* %tmp18404, i64 1
+  %tmp18406 = getelementptr inbounds float* %tmp18405, i64 1
+  %tmp18407 = getelementptr inbounds float* %tmp18406, i64 1
+  %tmp18408 = getelementptr inbounds float* %tmp18407, i64 1
+  %tmp18409 = getelementptr inbounds float* %tmp18408, i64 1
+  %tmp18410 = getelementptr inbounds float* %tmp18409, i64 1
+  %tmp18411 = getelementptr inbounds float* %tmp18410, i64 1
+  %tmp18412 = getelementptr inbounds float* %tmp18411, i64 1
+  %tmp18413 = getelementptr inbounds float* %tmp18412, i64 1
+  %tmp18414 = getelementptr inbounds float* %tmp18413, i64 1
+  %tmp18415 = getelementptr inbounds float* %tmp18414, i64 1
+  %tmp18416 = getelementptr inbounds float* %tmp18415, i64 1
+  %tmp18417 = getelementptr inbounds float* %tmp18416, i64 1
+  %tmp18418 = getelementptr inbounds float* %tmp18417, i64 1
+  %tmp18419 = getelementptr inbounds float* %tmp18418, i64 1
+  %tmp18420 = getelementptr inbounds float* %tmp18419, i64 1
+  %tmp18421 = getelementptr inbounds float* %tmp18420, i64 1
+  %tmp18422 = getelementptr inbounds float* %tmp18421, i64 1
+  %tmp18423 = getelementptr inbounds float* %tmp18422, i64 1
+  %tmp18424 = getelementptr inbounds float* %tmp18423, i64 1
+  %tmp18425 = getelementptr inbounds float* %tmp18424, i64 1
+  %tmp18426 = getelementptr inbounds float* %tmp18425, i64 1
+  %tmp18427 = getelementptr inbounds float* %tmp18426, i64 1
+  %tmp18428 = getelementptr inbounds float* %tmp18427, i64 1
+  %tmp18429 = getelementptr inbounds float* %tmp18428, i64 1
+  %tmp18430 = getelementptr inbounds float* %tmp18429, i64 1
+  %tmp18431 = getelementptr inbounds float* %tmp18430, i64 1
+  %tmp18432 = getelementptr inbounds float* %tmp18431, i64 1
+  %tmp18433 = getelementptr inbounds float* %tmp18432, i64 1
+  %tmp18434 = getelementptr inbounds float* %tmp18433, i64 1
+  %tmp18435 = getelementptr inbounds float* %tmp18434, i64 1
+  %tmp18436 = getelementptr inbounds float* %tmp18435, i64 1
+  %tmp18437 = getelementptr inbounds float* %tmp18436, i64 1
+  %tmp18438 = getelementptr inbounds float* %tmp18437, i64 1
+  %tmp18439 = getelementptr inbounds float* %tmp18438, i64 1
+  %tmp18440 = getelementptr inbounds float* %tmp18439, i64 1
+  %tmp18441 = getelementptr inbounds float* %tmp18440, i64 1
+  %tmp18442 = getelementptr inbounds float* %tmp18441, i64 1
+  %tmp18443 = getelementptr inbounds float* %tmp18442, i64 1
+  %tmp18444 = getelementptr inbounds float* %tmp18443, i64 1
+  %tmp18445 = getelementptr inbounds float* %tmp18444, i64 1
+  %tmp18446 = getelementptr inbounds float* %tmp18445, i64 1
+  %tmp18447 = getelementptr inbounds float* %tmp18446, i64 1
+  %tmp18448 = getelementptr inbounds float* %tmp18447, i64 1
+  %tmp18449 = getelementptr inbounds float* %tmp18448, i64 1
+  %tmp18450 = getelementptr inbounds float* %tmp18449, i64 1
+  %tmp18451 = getelementptr inbounds float* %tmp18450, i64 1
+  %tmp18452 = getelementptr inbounds float* %tmp18451, i64 1
+  %tmp18453 = getelementptr inbounds float* %tmp18452, i64 1
+  %tmp18454 = getelementptr inbounds float* %tmp18453, i64 1
+  %tmp18455 = getelementptr inbounds float* %tmp18454, i64 1
+  %tmp18456 = getelementptr inbounds float* %tmp18455, i64 1
+  %tmp18457 = getelementptr inbounds float* %tmp18456, i64 1
+  %tmp18458 = getelementptr inbounds float* %tmp18457, i64 1
+  %tmp18459 = getelementptr inbounds float* %tmp18458, i64 1
+  %tmp18460 = getelementptr inbounds float* %tmp18459, i64 1
+  %tmp18461 = getelementptr inbounds float* %tmp18460, i64 1
+  %tmp18462 = getelementptr inbounds float* %tmp18461, i64 1
+  %tmp18463 = getelementptr inbounds float* %tmp18462, i64 1
+  %tmp18464 = getelementptr inbounds float* %tmp18463, i64 1
+  %tmp18465 = getelementptr inbounds float* %tmp18464, i64 1
+  %tmp18466 = getelementptr inbounds float* %tmp18465, i64 1
+  %tmp18467 = getelementptr inbounds float* %tmp18466, i64 1
+  %tmp18468 = getelementptr inbounds float* %tmp18467, i64 1
+  %tmp18469 = getelementptr inbounds float* %tmp18468, i64 1
+  %tmp18470 = getelementptr inbounds float* %tmp18469, i64 1
+  %tmp18471 = getelementptr inbounds float* %tmp18470, i64 1
+  %tmp18472 = getelementptr inbounds float* %tmp18471, i64 1
+  %tmp18473 = getelementptr inbounds float* %tmp18472, i64 1
+  %tmp18474 = getelementptr inbounds float* %tmp18473, i64 1
+  %tmp18475 = getelementptr inbounds float* %tmp18474, i64 1
+  %tmp18476 = getelementptr inbounds float* %tmp18475, i64 1
+  %tmp18477 = getelementptr inbounds float* %tmp18476, i64 1
+  %tmp18478 = getelementptr inbounds float* %tmp18477, i64 1
+  %tmp18479 = getelementptr inbounds float* %tmp18478, i64 1
+  %tmp18480 = getelementptr inbounds float* %tmp18479, i64 1
+  %tmp18481 = getelementptr inbounds float* %tmp18480, i64 1
+  %tmp18482 = getelementptr inbounds float* %tmp18481, i64 1
+  %tmp18483 = getelementptr inbounds float* %tmp18482, i64 1
+  %tmp18484 = getelementptr inbounds float* %tmp18483, i64 1
+  %tmp18485 = getelementptr inbounds float* %tmp18484, i64 1
+  %tmp18486 = getelementptr inbounds float* %tmp18485, i64 1
+  %tmp18487 = getelementptr inbounds float* %tmp18486, i64 1
+  %tmp18488 = getelementptr inbounds float* %tmp18487, i64 1
+  %tmp18489 = getelementptr inbounds float* %tmp18488, i64 1
+  %tmp18490 = getelementptr inbounds float* %tmp18489, i64 1
+  %tmp18491 = getelementptr inbounds float* %tmp18490, i64 1
+  %tmp18492 = getelementptr inbounds float* %tmp18491, i64 1
+  %tmp18493 = getelementptr inbounds float* %tmp18492, i64 1
+  %tmp18494 = getelementptr inbounds float* %tmp18493, i64 1
+  %tmp18495 = getelementptr inbounds float* %tmp18494, i64 1
+  %tmp18496 = getelementptr inbounds float* %tmp18495, i64 1
+  %tmp18497 = getelementptr inbounds float* %tmp18496, i64 1
+  %tmp18498 = getelementptr inbounds float* %tmp18497, i64 1
+  %tmp18499 = getelementptr inbounds float* %tmp18498, i64 1
+  %tmp18500 = getelementptr inbounds float* %tmp18499, i64 1
+  %tmp18501 = getelementptr inbounds float* %tmp18500, i64 1
+  %tmp18502 = getelementptr inbounds float* %tmp18501, i64 1
+  %tmp18503 = getelementptr inbounds float* %tmp18502, i64 1
+  %tmp18504 = getelementptr inbounds float* %tmp18503, i64 1
+  %tmp18505 = getelementptr inbounds float* %tmp18504, i64 1
+  %tmp18506 = getelementptr inbounds float* %tmp18505, i64 1
+  %tmp18507 = getelementptr inbounds float* %tmp18506, i64 1
+  %tmp18508 = getelementptr inbounds float* %tmp18507, i64 1
+  %tmp18509 = getelementptr inbounds float* %tmp18508, i64 1
+  %tmp18510 = getelementptr inbounds float* %tmp18509, i64 1
+  %tmp18511 = getelementptr inbounds float* %tmp18510, i64 1
+  %tmp18512 = getelementptr inbounds float* %tmp18511, i64 1
+  %tmp18513 = getelementptr inbounds float* %tmp18512, i64 1
+  %tmp18514 = getelementptr inbounds float* %tmp18513, i64 1
+  %tmp18515 = getelementptr inbounds float* %tmp18514, i64 1
+  %tmp18516 = getelementptr inbounds float* %tmp18515, i64 1
+  %tmp18517 = getelementptr inbounds float* %tmp18516, i64 1
+  %tmp18518 = getelementptr inbounds float* %tmp18517, i64 1
+  %tmp18519 = getelementptr inbounds float* %tmp18518, i64 1
+  %tmp18520 = getelementptr inbounds float* %tmp18519, i64 1
+  %tmp18521 = getelementptr inbounds float* %tmp18520, i64 1
+  %tmp18522 = getelementptr inbounds float* %tmp18521, i64 1
+  %tmp18523 = getelementptr inbounds float* %tmp18522, i64 1
+  %tmp18524 = getelementptr inbounds float* %tmp18523, i64 1
+  %tmp18525 = getelementptr inbounds float* %tmp18524, i64 1
+  %tmp18526 = getelementptr inbounds float* %tmp18525, i64 1
+  %tmp18527 = getelementptr inbounds float* %tmp18526, i64 1
+  %tmp18528 = getelementptr inbounds float* %tmp18527, i64 1
+  %tmp18529 = getelementptr inbounds float* %tmp18528, i64 1
+  %tmp18530 = getelementptr inbounds float* %tmp18529, i64 1
+  %tmp18531 = getelementptr inbounds float* %tmp18530, i64 1
+  %tmp18532 = getelementptr inbounds float* %tmp18531, i64 1
+  %tmp18533 = getelementptr inbounds float* %tmp18532, i64 1
+  %tmp18534 = getelementptr inbounds float* %tmp18533, i64 1
+  %tmp18535 = getelementptr inbounds float* %tmp18534, i64 1
+  %tmp18536 = getelementptr inbounds float* %tmp18535, i64 1
+  %tmp18537 = getelementptr inbounds float* %tmp18536, i64 1
+  %tmp18538 = getelementptr inbounds float* %tmp18537, i64 1
+  %tmp18539 = getelementptr inbounds float* %tmp18538, i64 1
+  %tmp18540 = getelementptr inbounds float* %tmp18539, i64 1
+  %tmp18541 = getelementptr inbounds float* %tmp18540, i64 1
+  %tmp18542 = getelementptr inbounds float* %tmp18541, i64 1
+  %tmp18543 = getelementptr inbounds float* %tmp18542, i64 1
+  %tmp18544 = getelementptr inbounds float* %tmp18543, i64 1
+  %tmp18545 = getelementptr inbounds float* %tmp18544, i64 1
+  %tmp18546 = getelementptr inbounds float* %tmp18545, i64 1
+  %tmp18547 = getelementptr inbounds float* %tmp18546, i64 1
+  %tmp18548 = getelementptr inbounds float* %tmp18547, i64 1
+  %tmp18549 = getelementptr inbounds float* %tmp18548, i64 1
+  %tmp18550 = getelementptr inbounds float* %tmp18549, i64 1
+  %tmp18551 = getelementptr inbounds float* %tmp18550, i64 1
+  %tmp18552 = getelementptr inbounds float* %tmp18551, i64 1
+  %tmp18553 = getelementptr inbounds float* %tmp18552, i64 1
+  %tmp18554 = getelementptr inbounds float* %tmp18553, i64 1
+  %tmp18555 = getelementptr inbounds float* %tmp18554, i64 1
+  %tmp18556 = getelementptr inbounds float* %tmp18555, i64 1
+  %tmp18557 = getelementptr inbounds float* %tmp18556, i64 1
+  %tmp18558 = getelementptr inbounds float* %tmp18557, i64 1
+  %tmp18559 = getelementptr inbounds float* %tmp18558, i64 1
+  %tmp18560 = getelementptr inbounds float* %tmp18559, i64 1
+  %tmp18561 = getelementptr inbounds float* %tmp18560, i64 1
+  %tmp18562 = getelementptr inbounds float* %tmp18561, i64 1
+  %tmp18563 = getelementptr inbounds float* %tmp18562, i64 1
+  %tmp18564 = getelementptr inbounds float* %tmp18563, i64 1
+  %tmp18565 = getelementptr inbounds float* %tmp18564, i64 1
+  %tmp18566 = getelementptr inbounds float* %tmp18565, i64 1
+  %tmp18567 = getelementptr inbounds float* %tmp18566, i64 1
+  %tmp18568 = getelementptr inbounds float* %tmp18567, i64 1
+  %tmp18569 = getelementptr inbounds float* %tmp18568, i64 1
+  %tmp18570 = getelementptr inbounds float* %tmp18569, i64 1
+  %tmp18571 = getelementptr inbounds float* %tmp18570, i64 1
+  %tmp18572 = getelementptr inbounds float* %tmp18571, i64 1
+  %tmp18573 = getelementptr inbounds float* %tmp18572, i64 1
+  %tmp18574 = getelementptr inbounds float* %tmp18573, i64 1
+  %tmp18575 = getelementptr inbounds float* %tmp18574, i64 1
+  %tmp18576 = getelementptr inbounds float* %tmp18575, i64 1
+  %tmp18577 = getelementptr inbounds float* %tmp18576, i64 1
+  %tmp18578 = getelementptr inbounds float* %tmp18577, i64 1
+  %tmp18579 = getelementptr inbounds float* %tmp18578, i64 1
+  %tmp18580 = getelementptr inbounds float* %tmp18579, i64 1
+  %tmp18581 = getelementptr inbounds float* %tmp18580, i64 1
+  %tmp18582 = getelementptr inbounds float* %tmp18581, i64 1
+  %tmp18583 = getelementptr inbounds float* %tmp18582, i64 1
+  %tmp18584 = getelementptr inbounds float* %tmp18583, i64 1
+  %tmp18585 = getelementptr inbounds float* %tmp18584, i64 1
+  %tmp18586 = getelementptr inbounds float* %tmp18585, i64 1
+  %tmp18587 = getelementptr inbounds float* %tmp18586, i64 1
+  %tmp18588 = getelementptr inbounds float* %tmp18587, i64 1
+  %tmp18589 = getelementptr inbounds float* %tmp18588, i64 1
+  %tmp18590 = getelementptr inbounds float* %tmp18589, i64 1
+  %tmp18591 = getelementptr inbounds float* %tmp18590, i64 1
+  %tmp18592 = getelementptr inbounds float* %tmp18591, i64 1
+  %tmp18593 = getelementptr inbounds float* %tmp18592, i64 1
+  %tmp18594 = getelementptr inbounds float* %tmp18593, i64 1
+  %tmp18595 = getelementptr inbounds float* %tmp18594, i64 1
+  %tmp18596 = getelementptr inbounds float* %tmp18595, i64 1
+  %tmp18597 = getelementptr inbounds float* %tmp18596, i64 1
+  %tmp18598 = getelementptr inbounds float* %tmp18597, i64 1
+  %tmp18599 = getelementptr inbounds float* %tmp18598, i64 1
+  %tmp18600 = getelementptr inbounds float* %tmp18599, i64 1
+  %tmp18601 = getelementptr inbounds float* %tmp18600, i64 1
+  %tmp18602 = getelementptr inbounds float* %tmp18601, i64 1
+  %tmp18603 = getelementptr inbounds float* %tmp18602, i64 1
+  %tmp18604 = getelementptr inbounds float* %tmp18603, i64 1
+  %tmp18605 = getelementptr inbounds float* %tmp18604, i64 1
+  %tmp18606 = getelementptr inbounds float* %tmp18605, i64 1
+  %tmp18607 = getelementptr inbounds float* %tmp18606, i64 1
+  %tmp18608 = getelementptr inbounds float* %tmp18607, i64 1
+  %tmp18609 = getelementptr inbounds float* %tmp18608, i64 1
+  %tmp18610 = getelementptr inbounds float* %tmp18609, i64 1
+  %tmp18611 = getelementptr inbounds float* %tmp18610, i64 1
+  %tmp18612 = getelementptr inbounds float* %tmp18611, i64 1
+  %tmp18613 = getelementptr inbounds float* %tmp18612, i64 1
+  %tmp18614 = getelementptr inbounds float* %tmp18613, i64 1
+  %tmp18615 = getelementptr inbounds float* %tmp18614, i64 1
+  %tmp18616 = getelementptr inbounds float* %tmp18615, i64 1
+  %tmp18617 = getelementptr inbounds float* %tmp18616, i64 1
+  %tmp18618 = getelementptr inbounds float* %tmp18617, i64 1
+  %tmp18619 = getelementptr inbounds float* %tmp18618, i64 1
+  %tmp18620 = getelementptr inbounds float* %tmp18619, i64 1
+  %tmp18621 = getelementptr inbounds float* %tmp18620, i64 1
+  %tmp18622 = getelementptr inbounds float* %tmp18621, i64 1
+  %tmp18623 = getelementptr inbounds float* %tmp18622, i64 1
+  %tmp18624 = getelementptr inbounds float* %tmp18623, i64 1
+  %tmp18625 = getelementptr inbounds float* %tmp18624, i64 1
+  %tmp18626 = getelementptr inbounds float* %tmp18625, i64 1
+  %tmp18627 = getelementptr inbounds float* %tmp18626, i64 1
+  %tmp18628 = getelementptr inbounds float* %tmp18627, i64 1
+  %tmp18629 = getelementptr inbounds float* %tmp18628, i64 1
+  %tmp18630 = getelementptr inbounds float* %tmp18629, i64 1
+  %tmp18631 = getelementptr inbounds float* %tmp18630, i64 1
+  %tmp18632 = getelementptr inbounds float* %tmp18631, i64 1
+  %tmp18633 = getelementptr inbounds float* %tmp18632, i64 1
+  %tmp18634 = getelementptr inbounds float* %tmp18633, i64 1
+  %tmp18635 = getelementptr inbounds float* %tmp18634, i64 1
+  %tmp18636 = getelementptr inbounds float* %tmp18635, i64 1
+  %tmp18637 = getelementptr inbounds float* %tmp18636, i64 1
+  %tmp18638 = getelementptr inbounds float* %tmp18637, i64 1
+  %tmp18639 = getelementptr inbounds float* %tmp18638, i64 1
+  %tmp18640 = getelementptr inbounds float* %tmp18639, i64 1
+  %tmp18641 = getelementptr inbounds float* %tmp18640, i64 1
+  %tmp18642 = getelementptr inbounds float* %tmp18641, i64 1
+  %tmp18643 = getelementptr inbounds float* %tmp18642, i64 1
+  %tmp18644 = getelementptr inbounds float* %tmp18643, i64 1
+  %tmp18645 = getelementptr inbounds float* %tmp18644, i64 1
+  %tmp18646 = getelementptr inbounds float* %tmp18645, i64 1
+  %tmp18647 = getelementptr inbounds float* %tmp18646, i64 1
+  %tmp18648 = getelementptr inbounds float* %tmp18647, i64 1
+  %tmp18649 = getelementptr inbounds float* %tmp18648, i64 1
+  %tmp18650 = getelementptr inbounds float* %tmp18649, i64 1
+  %tmp18651 = getelementptr inbounds float* %tmp18650, i64 1
+  %tmp18652 = getelementptr inbounds float* %tmp18651, i64 1
+  %tmp18653 = getelementptr inbounds float* %tmp18652, i64 1
+  %tmp18654 = getelementptr inbounds float* %tmp18653, i64 1
+  %tmp18655 = getelementptr inbounds float* %tmp18654, i64 1
+  %tmp18656 = getelementptr inbounds float* %tmp18655, i64 1
+  %tmp18657 = getelementptr inbounds float* %tmp18656, i64 1
+  %tmp18658 = getelementptr inbounds float* %tmp18657, i64 1
+  %tmp18659 = getelementptr inbounds float* %tmp18658, i64 1
+  %tmp18660 = getelementptr inbounds float* %tmp18659, i64 1
+  %tmp18661 = getelementptr inbounds float* %tmp18660, i64 1
+  %tmp18662 = getelementptr inbounds float* %tmp18661, i64 1
+  %tmp18663 = getelementptr inbounds float* %tmp18662, i64 1
+  %tmp18664 = getelementptr inbounds float* %tmp18663, i64 1
+  %tmp18665 = getelementptr inbounds float* %tmp18664, i64 1
+  %tmp18666 = getelementptr inbounds float* %tmp18665, i64 1
+  %tmp18667 = getelementptr inbounds float* %tmp18666, i64 1
+  %tmp18668 = getelementptr inbounds float* %tmp18667, i64 1
+  %tmp18669 = getelementptr inbounds float* %tmp18668, i64 1
+  %tmp18670 = getelementptr inbounds float* %tmp18669, i64 1
+  %tmp18671 = getelementptr inbounds float* %tmp18670, i64 1
+  %tmp18672 = getelementptr inbounds float* %tmp18671, i64 1
+  %tmp18673 = getelementptr inbounds float* %tmp18672, i64 1
+  %tmp18674 = getelementptr inbounds float* %tmp18673, i64 1
+  %tmp18675 = getelementptr inbounds float* %tmp18674, i64 1
+  %tmp18676 = getelementptr inbounds float* %tmp18675, i64 1
+  %tmp18677 = getelementptr inbounds float* %tmp18676, i64 1
+  %tmp18678 = getelementptr inbounds float* %tmp18677, i64 1
+  %tmp18679 = getelementptr inbounds float* %tmp18678, i64 1
+  %tmp18680 = getelementptr inbounds float* %tmp18679, i64 1
+  %tmp18681 = getelementptr inbounds float* %tmp18680, i64 1
+  %tmp18682 = getelementptr inbounds float* %tmp18681, i64 1
+  %tmp18683 = getelementptr inbounds float* %tmp18682, i64 1
+  %tmp18684 = getelementptr inbounds float* %tmp18683, i64 1
+  %tmp18685 = getelementptr inbounds float* %tmp18684, i64 1
+  %tmp18686 = getelementptr inbounds float* %tmp18685, i64 1
+  %tmp18687 = getelementptr inbounds float* %tmp18686, i64 1
+  %tmp18688 = getelementptr inbounds float* %tmp18687, i64 1
+  %tmp18689 = getelementptr inbounds float* %tmp18688, i64 1
+  %tmp18690 = getelementptr inbounds float* %tmp18689, i64 1
+  %tmp18691 = getelementptr inbounds float* %tmp18690, i64 1
+  %tmp18692 = getelementptr inbounds float* %tmp18691, i64 1
+  %tmp18693 = getelementptr inbounds float* %tmp18692, i64 1
+  %tmp18694 = getelementptr inbounds float* %tmp18693, i64 1
+  %tmp18695 = getelementptr inbounds float* %tmp18694, i64 1
+  %tmp18696 = getelementptr inbounds float* %tmp18695, i64 1
+  %tmp18697 = getelementptr inbounds float* %tmp18696, i64 1
+  %tmp18698 = getelementptr inbounds float* %tmp18697, i64 1
+  %tmp18699 = getelementptr inbounds float* %tmp18698, i64 1
+  %tmp18700 = getelementptr inbounds float* %tmp18699, i64 1
+  %tmp18701 = getelementptr inbounds float* %tmp18700, i64 1
+  %tmp18702 = getelementptr inbounds float* %tmp18701, i64 1
+  %tmp18703 = getelementptr inbounds float* %tmp18702, i64 1
+  %tmp18704 = getelementptr inbounds float* %tmp18703, i64 1
+  %tmp18705 = getelementptr inbounds float* %tmp18704, i64 1
+  %tmp18706 = getelementptr inbounds float* %tmp18705, i64 1
+  %tmp18707 = getelementptr inbounds float* %tmp18706, i64 1
+  %tmp18708 = getelementptr inbounds float* %tmp18707, i64 1
+  %tmp18709 = getelementptr inbounds float* %tmp18708, i64 1
+  %tmp18710 = getelementptr inbounds float* %tmp18709, i64 1
+  %tmp18711 = getelementptr inbounds float* %tmp18710, i64 1
+  %tmp18712 = getelementptr inbounds float* %tmp18711, i64 1
+  %tmp18713 = getelementptr inbounds float* %tmp18712, i64 1
+  %tmp18714 = getelementptr inbounds float* %tmp18713, i64 1
+  %tmp18715 = getelementptr inbounds float* %tmp18714, i64 1
+  %tmp18716 = getelementptr inbounds float* %tmp18715, i64 1
+  %tmp18717 = getelementptr inbounds float* %tmp18716, i64 1
+  %tmp18718 = getelementptr inbounds float* %tmp18717, i64 1
+  %tmp18719 = getelementptr inbounds float* %tmp18718, i64 1
+  %tmp18720 = getelementptr inbounds float* %tmp18719, i64 1
+  %tmp18721 = getelementptr inbounds float* %tmp18720, i64 1
+  %tmp18722 = getelementptr inbounds float* %tmp18721, i64 1
+  %tmp18723 = getelementptr inbounds float* %tmp18722, i64 1
+  %tmp18724 = getelementptr inbounds float* %tmp18723, i64 1
+  %tmp18725 = getelementptr inbounds float* %tmp18724, i64 1
+  %tmp18726 = getelementptr inbounds float* %tmp18725, i64 1
+  %tmp18727 = getelementptr inbounds float* %tmp18726, i64 1
+  %tmp18728 = getelementptr inbounds float* %tmp18727, i64 1
+  %tmp18729 = getelementptr inbounds float* %tmp18728, i64 1
+  %tmp18730 = getelementptr inbounds float* %tmp18729, i64 1
+  %tmp18731 = getelementptr inbounds float* %tmp18730, i64 1
+  %tmp18732 = getelementptr inbounds float* %tmp18731, i64 1
+  %tmp18733 = getelementptr inbounds float* %tmp18732, i64 1
+  %tmp18734 = getelementptr inbounds float* %tmp18733, i64 1
+  %tmp18735 = getelementptr inbounds float* %tmp18734, i64 1
+  %tmp18736 = getelementptr inbounds float* %tmp18735, i64 1
+  %tmp18737 = getelementptr inbounds float* %tmp18736, i64 1
+  %tmp18738 = getelementptr inbounds float* %tmp18737, i64 1
+  %tmp18739 = getelementptr inbounds float* %tmp18738, i64 1
+  %tmp18740 = getelementptr inbounds float* %tmp18739, i64 1
+  %tmp18741 = getelementptr inbounds float* %tmp18740, i64 1
+  %tmp18742 = getelementptr inbounds float* %tmp18741, i64 1
+  %tmp18743 = getelementptr inbounds float* %tmp18742, i64 1
+  %tmp18744 = getelementptr inbounds float* %tmp18743, i64 1
+  %tmp18745 = getelementptr inbounds float* %tmp18744, i64 1
+  %tmp18746 = getelementptr inbounds float* %tmp18745, i64 1
+  %tmp18747 = getelementptr inbounds float* %tmp18746, i64 1
+  %tmp18748 = getelementptr inbounds float* %tmp18747, i64 1
+  %tmp18749 = getelementptr inbounds float* %tmp18748, i64 1
+  %tmp18750 = getelementptr inbounds float* %tmp18749, i64 1
+  %tmp18751 = getelementptr inbounds float* %tmp18750, i64 1
+  %tmp18752 = getelementptr inbounds float* %tmp18751, i64 1
+  %tmp18753 = getelementptr inbounds float* %tmp18752, i64 1
+  %tmp18754 = getelementptr inbounds float* %tmp18753, i64 1
+  %tmp18755 = getelementptr inbounds float* %tmp18754, i64 1
+  %tmp18756 = getelementptr inbounds float* %tmp18755, i64 1
+  %tmp18757 = getelementptr inbounds float* %tmp18756, i64 1
+  %tmp18758 = getelementptr inbounds float* %tmp18757, i64 1
+  %tmp18759 = getelementptr inbounds float* %tmp18758, i64 1
+  %tmp18760 = getelementptr inbounds float* %tmp18759, i64 1
+  %tmp18761 = getelementptr inbounds float* %tmp18760, i64 1
+  %tmp18762 = getelementptr inbounds float* %tmp18761, i64 1
+  %tmp18763 = getelementptr inbounds float* %tmp18762, i64 1
+  %tmp18764 = getelementptr inbounds float* %tmp18763, i64 1
+  %tmp18765 = getelementptr inbounds float* %tmp18764, i64 1
+  %tmp18766 = getelementptr inbounds float* %tmp18765, i64 1
+  %tmp18767 = getelementptr inbounds float* %tmp18766, i64 1
+  %tmp18768 = getelementptr inbounds float* %tmp18767, i64 1
+  %tmp18769 = getelementptr inbounds float* %tmp18768, i64 1
+  %tmp18770 = getelementptr inbounds float* %tmp18769, i64 1
+  %tmp18771 = getelementptr inbounds float* %tmp18770, i64 1
+  %tmp18772 = getelementptr inbounds float* %tmp18771, i64 1
+  %tmp18773 = getelementptr inbounds float* %tmp18772, i64 1
+  %tmp18774 = getelementptr inbounds float* %tmp18773, i64 1
+  %tmp18775 = getelementptr inbounds float* %tmp18774, i64 1
+  %tmp18776 = getelementptr inbounds float* %tmp18775, i64 1
+  %tmp18777 = getelementptr inbounds float* %tmp18776, i64 1
+  %tmp18778 = getelementptr inbounds float* %tmp18777, i64 1
+  %tmp18779 = getelementptr inbounds float* %tmp18778, i64 1
+  %tmp18780 = getelementptr inbounds float* %tmp18779, i64 1
+  %tmp18781 = getelementptr inbounds float* %tmp18780, i64 1
+  %tmp18782 = getelementptr inbounds float* %tmp18781, i64 1
+  %tmp18783 = getelementptr inbounds float* %tmp18782, i64 1
+  %tmp18784 = getelementptr inbounds float* %tmp18783, i64 1
+  %tmp18785 = getelementptr inbounds float* %tmp18784, i64 1
+  %tmp18786 = getelementptr inbounds float* %tmp18785, i64 1
+  %tmp18787 = getelementptr inbounds float* %tmp18786, i64 1
+  %tmp18788 = getelementptr inbounds float* %tmp18787, i64 1
+  %tmp18789 = getelementptr inbounds float* %tmp18788, i64 1
+  %tmp18790 = getelementptr inbounds float* %tmp18789, i64 1
+  %tmp18791 = getelementptr inbounds float* %tmp18790, i64 1
+  %tmp18792 = getelementptr inbounds float* %tmp18791, i64 1
+  %tmp18793 = getelementptr inbounds float* %tmp18792, i64 1
+  %tmp18794 = getelementptr inbounds float* %tmp18793, i64 1
+  %tmp18795 = getelementptr inbounds float* %tmp18794, i64 1
+  %tmp18796 = getelementptr inbounds float* %tmp18795, i64 1
+  %tmp18797 = getelementptr inbounds float* %tmp18796, i64 1
+  %tmp18798 = getelementptr inbounds float* %tmp18797, i64 1
+  %tmp18799 = getelementptr inbounds float* %tmp18798, i64 1
+  %tmp18800 = getelementptr inbounds float* %tmp18799, i64 1
+  %tmp18801 = getelementptr inbounds float* %tmp18800, i64 1
+  %tmp18802 = getelementptr inbounds float* %tmp18801, i64 1
+  %tmp18803 = getelementptr inbounds float* %tmp18802, i64 1
+  %tmp18804 = getelementptr inbounds float* %tmp18803, i64 1
+  %tmp18805 = getelementptr inbounds float* %tmp18804, i64 1
+  %tmp18806 = getelementptr inbounds float* %tmp18805, i64 1
+  %tmp18807 = getelementptr inbounds float* %tmp18806, i64 1
+  %tmp18808 = getelementptr inbounds float* %tmp18807, i64 1
+  %tmp18809 = getelementptr inbounds float* %tmp18808, i64 1
+  %tmp18810 = getelementptr inbounds float* %tmp18809, i64 1
+  %tmp18811 = getelementptr inbounds float* %tmp18810, i64 1
+  %tmp18812 = getelementptr inbounds float* %tmp18811, i64 1
+  %tmp18813 = getelementptr inbounds float* %tmp18812, i64 1
+  %tmp18814 = getelementptr inbounds float* %tmp18813, i64 1
+  %tmp18815 = getelementptr inbounds float* %tmp18814, i64 1
+  %tmp18816 = getelementptr inbounds float* %tmp18815, i64 1
+  %tmp18817 = getelementptr inbounds float* %tmp18816, i64 1
+  %tmp18818 = getelementptr inbounds float* %tmp18817, i64 1
+  %tmp18819 = getelementptr inbounds float* %tmp18818, i64 1
+  %tmp18820 = getelementptr inbounds float* %tmp18819, i64 1
+  %tmp18821 = getelementptr inbounds float* %tmp18820, i64 1
+  %tmp18822 = getelementptr inbounds float* %tmp18821, i64 1
+  %tmp18823 = getelementptr inbounds float* %tmp18822, i64 1
+  %tmp18824 = getelementptr inbounds float* %tmp18823, i64 1
+  %tmp18825 = getelementptr inbounds float* %tmp18824, i64 1
+  %tmp18826 = getelementptr inbounds float* %tmp18825, i64 1
+  %tmp18827 = getelementptr inbounds float* %tmp18826, i64 1
+  %tmp18828 = getelementptr inbounds float* %tmp18827, i64 1
+  %tmp18829 = getelementptr inbounds float* %tmp18828, i64 1
+  %tmp18830 = getelementptr inbounds float* %tmp18829, i64 1
+  %tmp18831 = getelementptr inbounds float* %tmp18830, i64 1
+  %tmp18832 = getelementptr inbounds float* %tmp18831, i64 1
+  %tmp18833 = getelementptr inbounds float* %tmp18832, i64 1
+  %tmp18834 = getelementptr inbounds float* %tmp18833, i64 1
+  %tmp18835 = getelementptr inbounds float* %tmp18834, i64 1
+  %tmp18836 = getelementptr inbounds float* %tmp18835, i64 1
+  %tmp18837 = getelementptr inbounds float* %tmp18836, i64 1
+  %tmp18838 = getelementptr inbounds float* %tmp18837, i64 1
+  %tmp18839 = getelementptr inbounds float* %tmp18838, i64 1
+  %tmp18840 = getelementptr inbounds float* %tmp18839, i64 1
+  %tmp18841 = getelementptr inbounds float* %tmp18840, i64 1
+  %tmp18842 = getelementptr inbounds float* %tmp18841, i64 1
+  %tmp18843 = getelementptr inbounds float* %tmp18842, i64 1
+  %tmp18844 = getelementptr inbounds float* %tmp18843, i64 1
+  %tmp18845 = getelementptr inbounds float* %tmp18844, i64 1
+  %tmp18846 = getelementptr inbounds float* %tmp18845, i64 1
+  %tmp18847 = getelementptr inbounds float* %tmp18846, i64 1
+  %tmp18848 = getelementptr inbounds float* %tmp18847, i64 1
+  %tmp18849 = getelementptr inbounds float* %tmp18848, i64 1
+  %tmp18850 = getelementptr inbounds float* %tmp18849, i64 1
+  %tmp18851 = getelementptr inbounds float* %tmp18850, i64 1
+  %tmp18852 = getelementptr inbounds float* %tmp18851, i64 1
+  %tmp18853 = getelementptr inbounds float* %tmp18852, i64 1
+  %tmp18854 = getelementptr inbounds float* %tmp18853, i64 1
+  %tmp18855 = getelementptr inbounds float* %tmp18854, i64 1
+  %tmp18856 = getelementptr inbounds float* %tmp18855, i64 1
+  %tmp18857 = getelementptr inbounds float* %tmp18856, i64 1
+  %tmp18858 = getelementptr inbounds float* %tmp18857, i64 1
+  %tmp18859 = getelementptr inbounds float* %tmp18858, i64 1
+  %tmp18860 = getelementptr inbounds float* %tmp18859, i64 1
+  %tmp18861 = getelementptr inbounds float* %tmp18860, i64 1
+  %tmp18862 = getelementptr inbounds float* %tmp18861, i64 1
+  %tmp18863 = getelementptr inbounds float* %tmp18862, i64 1
+  %tmp18864 = getelementptr inbounds float* %tmp18863, i64 1
+  %tmp18865 = getelementptr inbounds float* %tmp18864, i64 1
+  %tmp18866 = getelementptr inbounds float* %tmp18865, i64 1
+  %tmp18867 = getelementptr inbounds float* %tmp18866, i64 1
+  %tmp18868 = getelementptr inbounds float* %tmp18867, i64 1
+  %tmp18869 = getelementptr inbounds float* %tmp18868, i64 1
+  %tmp18870 = getelementptr inbounds float* %tmp18869, i64 1
+  %tmp18871 = getelementptr inbounds float* %tmp18870, i64 1
+  %tmp18872 = getelementptr inbounds float* %tmp18871, i64 1
+  %tmp18873 = getelementptr inbounds float* %tmp18872, i64 1
+  %tmp18874 = getelementptr inbounds float* %tmp18873, i64 1
+  %tmp18875 = getelementptr inbounds float* %tmp18874, i64 1
+  %tmp18876 = getelementptr inbounds float* %tmp18875, i64 1
+  %tmp18877 = getelementptr inbounds float* %tmp18876, i64 1
+  %tmp18878 = getelementptr inbounds float* %tmp18877, i64 1
+  %tmp18879 = getelementptr inbounds float* %tmp18878, i64 1
+  %tmp18880 = getelementptr inbounds float* %tmp18879, i64 1
+  %tmp18881 = getelementptr inbounds float* %tmp18880, i64 1
+  %tmp18882 = getelementptr inbounds float* %tmp18881, i64 1
+  %tmp18883 = getelementptr inbounds float* %tmp18882, i64 1
+  %tmp18884 = getelementptr inbounds float* %tmp18883, i64 1
+  %tmp18885 = getelementptr inbounds float* %tmp18884, i64 1
+  %tmp18886 = getelementptr inbounds float* %tmp18885, i64 1
+  %tmp18887 = getelementptr inbounds float* %tmp18886, i64 1
+  %tmp18888 = getelementptr inbounds float* %tmp18887, i64 1
+  %tmp18889 = getelementptr inbounds float* %tmp18888, i64 1
+  %tmp18890 = getelementptr inbounds float* %tmp18889, i64 1
+  %tmp18891 = getelementptr inbounds float* %tmp18890, i64 1
+  %tmp18892 = getelementptr inbounds float* %tmp18891, i64 1
+  %tmp18893 = getelementptr inbounds float* %tmp18892, i64 1
+  %tmp18894 = getelementptr inbounds float* %tmp18893, i64 1
+  %tmp18895 = getelementptr inbounds float* %tmp18894, i64 1
+  %tmp18896 = getelementptr inbounds float* %tmp18895, i64 1
+  %tmp18897 = getelementptr inbounds float* %tmp18896, i64 1
+  %tmp18898 = getelementptr inbounds float* %tmp18897, i64 1
+  %tmp18899 = getelementptr inbounds float* %tmp18898, i64 1
+  %tmp18900 = getelementptr inbounds float* %tmp18899, i64 1
+  %tmp18901 = getelementptr inbounds float* %tmp18900, i64 1
+  %tmp18902 = getelementptr inbounds float* %tmp18901, i64 1
+  %tmp18903 = getelementptr inbounds float* %tmp18902, i64 1
+  %tmp18904 = getelementptr inbounds float* %tmp18903, i64 1
+  %tmp18905 = getelementptr inbounds float* %tmp18904, i64 1
+  %tmp18906 = getelementptr inbounds float* %tmp18905, i64 1
+  %tmp18907 = getelementptr inbounds float* %tmp18906, i64 1
+  %tmp18908 = getelementptr inbounds float* %tmp18907, i64 1
+  %tmp18909 = getelementptr inbounds float* %tmp18908, i64 1
+  %tmp18910 = getelementptr inbounds float* %tmp18909, i64 1
+  %tmp18911 = getelementptr inbounds float* %tmp18910, i64 1
+  %tmp18912 = getelementptr inbounds float* %tmp18911, i64 1
+  %tmp18913 = getelementptr inbounds float* %tmp18912, i64 1
+  %tmp18914 = getelementptr inbounds float* %tmp18913, i64 1
+  %tmp18915 = getelementptr inbounds float* %tmp18914, i64 1
+  %tmp18916 = getelementptr inbounds float* %tmp18915, i64 1
+  %tmp18917 = getelementptr inbounds float* %tmp18916, i64 1
+  %tmp18918 = getelementptr inbounds float* %tmp18917, i64 1
+  %tmp18919 = getelementptr inbounds float* %tmp18918, i64 1
+  %tmp18920 = getelementptr inbounds float* %tmp18919, i64 1
+  %tmp18921 = getelementptr inbounds float* %tmp18920, i64 1
+  %tmp18922 = getelementptr inbounds float* %tmp18921, i64 1
+  %tmp18923 = getelementptr inbounds float* %tmp18922, i64 1
+  %tmp18924 = getelementptr inbounds float* %tmp18923, i64 1
+  %tmp18925 = getelementptr inbounds float* %tmp18924, i64 1
+  %tmp18926 = getelementptr inbounds float* %tmp18925, i64 1
+  %tmp18927 = getelementptr inbounds float* %tmp18926, i64 1
+  %tmp18928 = getelementptr inbounds float* %tmp18927, i64 1
+  %tmp18929 = getelementptr inbounds float* %tmp18928, i64 1
+  %tmp18930 = getelementptr inbounds float* %tmp18929, i64 1
+  %tmp18931 = getelementptr inbounds float* %tmp18930, i64 1
+  %tmp18932 = getelementptr inbounds float* %tmp18931, i64 1
+  %tmp18933 = getelementptr inbounds float* %tmp18932, i64 1
+  %tmp18934 = getelementptr inbounds float* %tmp18933, i64 1
+  %tmp18935 = getelementptr inbounds float* %tmp18934, i64 1
+  %tmp18936 = getelementptr inbounds float* %tmp18935, i64 1
+  %tmp18937 = getelementptr inbounds float* %tmp18936, i64 1
+  %tmp18938 = getelementptr inbounds float* %tmp18937, i64 1
+  %tmp18939 = getelementptr inbounds float* %tmp18938, i64 1
+  %tmp18940 = getelementptr inbounds float* %tmp18939, i64 1
+  %tmp18941 = getelementptr inbounds float* %tmp18940, i64 1
+  %tmp18942 = getelementptr inbounds float* %tmp18941, i64 1
+  %tmp18943 = getelementptr inbounds float* %tmp18942, i64 1
+  %tmp18944 = getelementptr inbounds float* %tmp18943, i64 1
+  %tmp18945 = getelementptr inbounds float* %tmp18944, i64 1
+  %tmp18946 = getelementptr inbounds float* %tmp18945, i64 1
+  %tmp18947 = getelementptr inbounds float* %tmp18946, i64 1
+  %tmp18948 = getelementptr inbounds float* %tmp18947, i64 1
+  %tmp18949 = getelementptr inbounds float* %tmp18948, i64 1
+  %tmp18950 = getelementptr inbounds float* %tmp18949, i64 1
+  %tmp18951 = getelementptr inbounds float* %tmp18950, i64 1
+  %tmp18952 = getelementptr inbounds float* %tmp18951, i64 1
+  %tmp18953 = getelementptr inbounds float* %tmp18952, i64 1
+  %tmp18954 = getelementptr inbounds float* %tmp18953, i64 1
+  %tmp18955 = getelementptr inbounds float* %tmp18954, i64 1
+  %tmp18956 = getelementptr inbounds float* %tmp18955, i64 1
+  %tmp18957 = getelementptr inbounds float* %tmp18956, i64 1
+  %tmp18958 = getelementptr inbounds float* %tmp18957, i64 1
+  %tmp18959 = getelementptr inbounds float* %tmp18958, i64 1
+  %tmp18960 = getelementptr inbounds float* %tmp18959, i64 1
+  %tmp18961 = getelementptr inbounds float* %tmp18960, i64 1
+  %tmp18962 = getelementptr inbounds float* %tmp18961, i64 1
+  %tmp18963 = getelementptr inbounds float* %tmp18962, i64 1
+  %tmp18964 = getelementptr inbounds float* %tmp18963, i64 1
+  %tmp18965 = getelementptr inbounds float* %tmp18964, i64 1
+  %tmp18966 = getelementptr inbounds float* %tmp18965, i64 1
+  %tmp18967 = getelementptr inbounds float* %tmp18966, i64 1
+  %tmp18968 = getelementptr inbounds float* %tmp18967, i64 1
+  %tmp18969 = getelementptr inbounds float* %tmp18968, i64 1
+  %tmp18970 = getelementptr inbounds float* %tmp18969, i64 1
+  %tmp18971 = getelementptr inbounds float* %tmp18970, i64 1
+  %tmp18972 = getelementptr inbounds float* %tmp18971, i64 1
+  %tmp18973 = getelementptr inbounds float* %tmp18972, i64 1
+  %tmp18974 = getelementptr inbounds float* %tmp18973, i64 1
+  %tmp18975 = getelementptr inbounds float* %tmp18974, i64 1
+  %tmp18976 = getelementptr inbounds float* %tmp18975, i64 1
+  %tmp18977 = getelementptr inbounds float* %tmp18976, i64 1
+  %tmp18978 = getelementptr inbounds float* %tmp18977, i64 1
+  %tmp18979 = getelementptr inbounds float* %tmp18978, i64 1
+  %tmp18980 = getelementptr inbounds float* %tmp18979, i64 1
+  %tmp18981 = getelementptr inbounds float* %tmp18980, i64 1
+  %tmp18982 = getelementptr inbounds float* %tmp18981, i64 1
+  %tmp18983 = getelementptr inbounds float* %tmp18982, i64 1
+  %tmp18984 = getelementptr inbounds float* %tmp18983, i64 1
+  %tmp18985 = getelementptr inbounds float* %tmp18984, i64 1
+  %tmp18986 = getelementptr inbounds float* %tmp18985, i64 1
+  %tmp18987 = getelementptr inbounds float* %tmp18986, i64 1
+  %tmp18988 = getelementptr inbounds float* %tmp18987, i64 1
+  %tmp18989 = getelementptr inbounds float* %tmp18988, i64 1
+  %tmp18990 = getelementptr inbounds float* %tmp18989, i64 1
+  %tmp18991 = getelementptr inbounds float* %tmp18990, i64 1
+  %tmp18992 = getelementptr inbounds float* %tmp18991, i64 1
+  %tmp18993 = getelementptr inbounds float* %tmp18992, i64 1
+  %tmp18994 = getelementptr inbounds float* %tmp18993, i64 1
+  %tmp18995 = getelementptr inbounds float* %tmp18994, i64 1
+  %tmp18996 = getelementptr inbounds float* %tmp18995, i64 1
+  %tmp18997 = getelementptr inbounds float* %tmp18996, i64 1
+  %tmp18998 = getelementptr inbounds float* %tmp18997, i64 1
+  %tmp18999 = getelementptr inbounds float* %tmp18998, i64 1
+  %tmp19000 = getelementptr inbounds float* %tmp18999, i64 1
+  %tmp19001 = getelementptr inbounds float* %tmp19000, i64 1
+  %tmp19002 = getelementptr inbounds float* %tmp19001, i64 1
+  %tmp19003 = getelementptr inbounds float* %tmp19002, i64 1
+  %tmp19004 = getelementptr inbounds float* %tmp19003, i64 1
+  %tmp19005 = getelementptr inbounds float* %tmp19004, i64 1
+  %tmp19006 = getelementptr inbounds float* %tmp19005, i64 1
+  %tmp19007 = getelementptr inbounds float* %tmp19006, i64 1
+  %tmp19008 = getelementptr inbounds float* %tmp19007, i64 1
+  %tmp19009 = getelementptr inbounds float* %tmp19008, i64 1
+  %tmp19010 = getelementptr inbounds float* %tmp19009, i64 1
+  %tmp19011 = getelementptr inbounds float* %tmp19010, i64 1
+  %tmp19012 = getelementptr inbounds float* %tmp19011, i64 1
+  %tmp19013 = getelementptr inbounds float* %tmp19012, i64 1
+  %tmp19014 = getelementptr inbounds float* %tmp19013, i64 1
+  %tmp19015 = getelementptr inbounds float* %tmp19014, i64 1
+  %tmp19016 = getelementptr inbounds float* %tmp19015, i64 1
+  %tmp19017 = getelementptr inbounds float* %tmp19016, i64 1
+  %tmp19018 = getelementptr inbounds float* %tmp19017, i64 1
+  %tmp19019 = getelementptr inbounds float* %tmp19018, i64 1
+  %tmp19020 = getelementptr inbounds float* %tmp19019, i64 1
+  %tmp19021 = getelementptr inbounds float* %tmp19020, i64 1
+  %tmp19022 = getelementptr inbounds float* %tmp19021, i64 1
+  %tmp19023 = getelementptr inbounds float* %tmp19022, i64 1
+  %tmp19024 = getelementptr inbounds float* %tmp19023, i64 1
+  %tmp19025 = getelementptr inbounds float* %tmp19024, i64 1
+  %tmp19026 = getelementptr inbounds float* %tmp19025, i64 1
+  %tmp19027 = getelementptr inbounds float* %tmp19026, i64 1
+  %tmp19028 = getelementptr inbounds float* %tmp19027, i64 1
+  %tmp19029 = getelementptr inbounds float* %tmp19028, i64 1
+  %tmp19030 = getelementptr inbounds float* %tmp19029, i64 1
+  %tmp19031 = getelementptr inbounds float* %tmp19030, i64 1
+  %tmp19032 = getelementptr inbounds float* %tmp19031, i64 1
+  %tmp19033 = getelementptr inbounds float* %tmp19032, i64 1
+  %tmp19034 = getelementptr inbounds float* %tmp19033, i64 1
+  %tmp19035 = getelementptr inbounds float* %tmp19034, i64 1
+  %tmp19036 = getelementptr inbounds float* %tmp19035, i64 1
+  %tmp19037 = getelementptr inbounds float* %tmp19036, i64 1
+  %tmp19038 = getelementptr inbounds float* %tmp19037, i64 1
+  %tmp19039 = getelementptr inbounds float* %tmp19038, i64 1
+  %tmp19040 = getelementptr inbounds float* %tmp19039, i64 1
+  %tmp19041 = getelementptr inbounds float* %tmp19040, i64 1
+  %tmp19042 = getelementptr inbounds float* %tmp19041, i64 1
+  %tmp19043 = getelementptr inbounds float* %tmp19042, i64 1
+  %tmp19044 = getelementptr inbounds float* %tmp19043, i64 1
+  %tmp19045 = getelementptr inbounds float* %tmp19044, i64 1
+  %tmp19046 = getelementptr inbounds float* %tmp19045, i64 1
+  %tmp19047 = getelementptr inbounds float* %tmp19046, i64 1
+  %tmp19048 = getelementptr inbounds float* %tmp19047, i64 1
+  %tmp19049 = getelementptr inbounds float* %tmp19048, i64 1
+  %tmp19050 = getelementptr inbounds float* %tmp19049, i64 1
+  %tmp19051 = getelementptr inbounds float* %tmp19050, i64 1
+  %tmp19052 = getelementptr inbounds float* %tmp19051, i64 1
+  %tmp19053 = getelementptr inbounds float* %tmp19052, i64 1
+  %tmp19054 = getelementptr inbounds float* %tmp19053, i64 1
+  %tmp19055 = getelementptr inbounds float* %tmp19054, i64 1
+  %tmp19056 = getelementptr inbounds float* %tmp19055, i64 1
+  %tmp19057 = getelementptr inbounds float* %tmp19056, i64 1
+  %tmp19058 = getelementptr inbounds float* %tmp19057, i64 1
+  %tmp19059 = getelementptr inbounds float* %tmp19058, i64 1
+  %tmp19060 = getelementptr inbounds float* %tmp19059, i64 1
+  %tmp19061 = getelementptr inbounds float* %tmp19060, i64 1
+  %tmp19062 = getelementptr inbounds float* %tmp19061, i64 1
+  %tmp19063 = getelementptr inbounds float* %tmp19062, i64 1
+  %tmp19064 = getelementptr inbounds float* %tmp19063, i64 1
+  %tmp19065 = getelementptr inbounds float* %tmp19064, i64 1
+  %tmp19066 = getelementptr inbounds float* %tmp19065, i64 1
+  %tmp19067 = getelementptr inbounds float* %tmp19066, i64 1
+  %tmp19068 = getelementptr inbounds float* %tmp19067, i64 1
+  %tmp19069 = getelementptr inbounds float* %tmp19068, i64 1
+  %tmp19070 = getelementptr inbounds float* %tmp19069, i64 1
+  %tmp19071 = getelementptr inbounds float* %tmp19070, i64 1
+  %tmp19072 = getelementptr inbounds float* %tmp19071, i64 1
+  %tmp19073 = getelementptr inbounds float* %tmp19072, i64 1
+  %tmp19074 = getelementptr inbounds float* %tmp19073, i64 1
+  %tmp19075 = getelementptr inbounds float* %tmp19074, i64 1
+  %tmp19076 = getelementptr inbounds float* %tmp19075, i64 1
+  %tmp19077 = getelementptr inbounds float* %tmp19076, i64 1
+  %tmp19078 = getelementptr inbounds float* %tmp19077, i64 1
+  %tmp19079 = getelementptr inbounds float* %tmp19078, i64 1
+  %tmp19080 = getelementptr inbounds float* %tmp19079, i64 1
+  %tmp19081 = getelementptr inbounds float* %tmp19080, i64 1
+  %tmp19082 = getelementptr inbounds float* %tmp19081, i64 1
+  %tmp19083 = getelementptr inbounds float* %tmp19082, i64 1
+  %tmp19084 = getelementptr inbounds float* %tmp19083, i64 1
+  %tmp19085 = getelementptr inbounds float* %tmp19084, i64 1
+  %tmp19086 = getelementptr inbounds float* %tmp19085, i64 1
+  %tmp19087 = getelementptr inbounds float* %tmp19086, i64 1
+  %tmp19088 = getelementptr inbounds float* %tmp19087, i64 1
+  %tmp19089 = getelementptr inbounds float* %tmp19088, i64 1
+  %tmp19090 = getelementptr inbounds float* %tmp19089, i64 1
+  %tmp19091 = getelementptr inbounds float* %tmp19090, i64 1
+  %tmp19092 = getelementptr inbounds float* %tmp19091, i64 1
+  %tmp19093 = getelementptr inbounds float* %tmp19092, i64 1
+  %tmp19094 = getelementptr inbounds float* %tmp19093, i64 1
+  %tmp19095 = getelementptr inbounds float* %tmp19094, i64 1
+  %tmp19096 = getelementptr inbounds float* %tmp19095, i64 1
+  %tmp19097 = getelementptr inbounds float* %tmp19096, i64 1
+  %tmp19098 = getelementptr inbounds float* %tmp19097, i64 1
+  %tmp19099 = getelementptr inbounds float* %tmp19098, i64 1
+  %tmp19100 = getelementptr inbounds float* %tmp19099, i64 1
+  %tmp19101 = getelementptr inbounds float* %tmp19100, i64 1
+  %tmp19102 = getelementptr inbounds float* %tmp19101, i64 1
+  %tmp19103 = getelementptr inbounds float* %tmp19102, i64 1
+  %tmp19104 = getelementptr inbounds float* %tmp19103, i64 1
+  %tmp19105 = getelementptr inbounds float* %tmp19104, i64 1
+  %tmp19106 = getelementptr inbounds float* %tmp19105, i64 1
+  %tmp19107 = getelementptr inbounds float* %tmp19106, i64 1
+  %tmp19108 = getelementptr inbounds float* %tmp19107, i64 1
+  %tmp19109 = getelementptr inbounds float* %tmp19108, i64 1
+  %tmp19110 = getelementptr inbounds float* %tmp19109, i64 1
+  %tmp19111 = getelementptr inbounds float* %tmp19110, i64 1
+  %tmp19112 = getelementptr inbounds float* %tmp19111, i64 1
+  %tmp19113 = getelementptr inbounds float* %tmp19112, i64 1
+  %tmp19114 = getelementptr inbounds float* %tmp19113, i64 1
+  %tmp19115 = getelementptr inbounds float* %tmp19114, i64 1
+  %tmp19116 = getelementptr inbounds float* %tmp19115, i64 1
+  %tmp19117 = getelementptr inbounds float* %tmp19116, i64 1
+  %tmp19118 = getelementptr inbounds float* %tmp19117, i64 1
+  %tmp19119 = getelementptr inbounds float* %tmp19118, i64 1
+  %tmp19120 = getelementptr inbounds float* %tmp19119, i64 1
+  %tmp19121 = getelementptr inbounds float* %tmp19120, i64 1
+  %tmp19122 = getelementptr inbounds float* %tmp19121, i64 1
+  %tmp19123 = getelementptr inbounds float* %tmp19122, i64 1
+  %tmp19124 = getelementptr inbounds float* %tmp19123, i64 1
+  %tmp19125 = getelementptr inbounds float* %tmp19124, i64 1
+  %tmp19126 = getelementptr inbounds float* %tmp19125, i64 1
+  %tmp19127 = getelementptr inbounds float* %tmp19126, i64 1
+  %tmp19128 = getelementptr inbounds float* %tmp19127, i64 1
+  %tmp19129 = getelementptr inbounds float* %tmp19128, i64 1
+  %tmp19130 = getelementptr inbounds float* %tmp19129, i64 1
+  %tmp19131 = getelementptr inbounds float* %tmp19130, i64 1
+  %tmp19132 = getelementptr inbounds float* %tmp19131, i64 1
+  %tmp19133 = getelementptr inbounds float* %tmp19132, i64 1
+  %tmp19134 = getelementptr inbounds float* %tmp19133, i64 1
+  %tmp19135 = getelementptr inbounds float* %tmp19134, i64 1
+  %tmp19136 = getelementptr inbounds float* %tmp19135, i64 1
+  %tmp19137 = getelementptr inbounds float* %tmp19136, i64 1
+  %tmp19138 = getelementptr inbounds float* %tmp19137, i64 1
+  %tmp19139 = getelementptr inbounds float* %tmp19138, i64 1
+  %tmp19140 = getelementptr inbounds float* %tmp19139, i64 1
+  %tmp19141 = getelementptr inbounds float* %tmp19140, i64 1
+  %tmp19142 = getelementptr inbounds float* %tmp19141, i64 1
+  %tmp19143 = getelementptr inbounds float* %tmp19142, i64 1
+  %tmp19144 = getelementptr inbounds float* %tmp19143, i64 1
+  %tmp19145 = getelementptr inbounds float* %tmp19144, i64 1
+  %tmp19146 = getelementptr inbounds float* %tmp19145, i64 1
+  %tmp19147 = getelementptr inbounds float* %tmp19146, i64 1
+  %tmp19148 = getelementptr inbounds float* %tmp19147, i64 1
+  %tmp19149 = getelementptr inbounds float* %tmp19148, i64 1
+  %tmp19150 = getelementptr inbounds float* %tmp19149, i64 1
+  %tmp19151 = getelementptr inbounds float* %tmp19150, i64 1
+  %tmp19152 = getelementptr inbounds float* %tmp19151, i64 1
+  %tmp19153 = getelementptr inbounds float* %tmp19152, i64 1
+  %tmp19154 = getelementptr inbounds float* %tmp19153, i64 1
+  %tmp19155 = getelementptr inbounds float* %tmp19154, i64 1
+  %tmp19156 = getelementptr inbounds float* %tmp19155, i64 1
+  %tmp19157 = getelementptr inbounds float* %tmp19156, i64 1
+  %tmp19158 = getelementptr inbounds float* %tmp19157, i64 1
+  %tmp19159 = getelementptr inbounds float* %tmp19158, i64 1
+  %tmp19160 = getelementptr inbounds float* %tmp19159, i64 1
+  %tmp19161 = getelementptr inbounds float* %tmp19160, i64 1
+  %tmp19162 = getelementptr inbounds float* %tmp19161, i64 1
+  %tmp19163 = getelementptr inbounds float* %tmp19162, i64 1
+  %tmp19164 = getelementptr inbounds float* %tmp19163, i64 1
+  %tmp19165 = getelementptr inbounds float* %tmp19164, i64 1
+  %tmp19166 = getelementptr inbounds float* %tmp19165, i64 1
+  %tmp19167 = getelementptr inbounds float* %tmp19166, i64 1
+  %tmp19168 = getelementptr inbounds float* %tmp19167, i64 1
+  %tmp19169 = getelementptr inbounds float* %tmp19168, i64 1
+  %tmp19170 = getelementptr inbounds float* %tmp19169, i64 1
+  %tmp19171 = getelementptr inbounds float* %tmp19170, i64 1
+  %tmp19172 = getelementptr inbounds float* %tmp19171, i64 1
+  %tmp19173 = getelementptr inbounds float* %tmp19172, i64 1
+  %tmp19174 = getelementptr inbounds float* %tmp19173, i64 1
+  %tmp19175 = getelementptr inbounds float* %tmp19174, i64 1
+  %tmp19176 = getelementptr inbounds float* %tmp19175, i64 1
+  %tmp19177 = getelementptr inbounds float* %tmp19176, i64 1
+  %tmp19178 = getelementptr inbounds float* %tmp19177, i64 1
+  %tmp19179 = getelementptr inbounds float* %tmp19178, i64 1
+  %tmp19180 = getelementptr inbounds float* %tmp19179, i64 1
+  %tmp19181 = getelementptr inbounds float* %tmp19180, i64 1
+  %tmp19182 = getelementptr inbounds float* %tmp19181, i64 1
+  %tmp19183 = getelementptr inbounds float* %tmp19182, i64 1
+  %tmp19184 = getelementptr inbounds float* %tmp19183, i64 1
+  %tmp19185 = getelementptr inbounds float* %tmp19184, i64 1
+  %tmp19186 = getelementptr inbounds float* %tmp19185, i64 1
+  %tmp19187 = getelementptr inbounds float* %tmp19186, i64 1
+  %tmp19188 = getelementptr inbounds float* %tmp19187, i64 1
+  %tmp19189 = getelementptr inbounds float* %tmp19188, i64 1
+  %tmp19190 = getelementptr inbounds float* %tmp19189, i64 1
+  %tmp19191 = getelementptr inbounds float* %tmp19190, i64 1
+  %tmp19192 = getelementptr inbounds float* %tmp19191, i64 1
+  %tmp19193 = getelementptr inbounds float* %tmp19192, i64 1
+  %tmp19194 = getelementptr inbounds float* %tmp19193, i64 1
+  %tmp19195 = getelementptr inbounds float* %tmp19194, i64 1
+  %tmp19196 = getelementptr inbounds float* %tmp19195, i64 1
+  %tmp19197 = getelementptr inbounds float* %tmp19196, i64 1
+  %tmp19198 = getelementptr inbounds float* %tmp19197, i64 1
+  %tmp19199 = getelementptr inbounds float* %tmp19198, i64 1
+  %tmp19200 = getelementptr inbounds float* %tmp19199, i64 1
+  %tmp19201 = getelementptr inbounds float* %tmp19200, i64 1
+  %tmp19202 = getelementptr inbounds float* %tmp19201, i64 1
+  %tmp19203 = getelementptr inbounds float* %tmp19202, i64 1
+  %tmp19204 = getelementptr inbounds float* %tmp19203, i64 1
+  %tmp19205 = getelementptr inbounds float* %tmp19204, i64 1
+  %tmp19206 = getelementptr inbounds float* %tmp19205, i64 1
+  %tmp19207 = getelementptr inbounds float* %tmp19206, i64 1
+  %tmp19208 = getelementptr inbounds float* %tmp19207, i64 1
+  %tmp19209 = getelementptr inbounds float* %tmp19208, i64 1
+  %tmp19210 = getelementptr inbounds float* %tmp19209, i64 1
+  %tmp19211 = getelementptr inbounds float* %tmp19210, i64 1
+  %tmp19212 = getelementptr inbounds float* %tmp19211, i64 1
+  %tmp19213 = getelementptr inbounds float* %tmp19212, i64 1
+  %tmp19214 = getelementptr inbounds float* %tmp19213, i64 1
+  %tmp19215 = getelementptr inbounds float* %tmp19214, i64 1
+  %tmp19216 = getelementptr inbounds float* %tmp19215, i64 1
+  %tmp19217 = getelementptr inbounds float* %tmp19216, i64 1
+  %tmp19218 = getelementptr inbounds float* %tmp19217, i64 1
+  %tmp19219 = getelementptr inbounds float* %tmp19218, i64 1
+  %tmp19220 = getelementptr inbounds float* %tmp19219, i64 1
+  %tmp19221 = getelementptr inbounds float* %tmp19220, i64 1
+  %tmp19222 = getelementptr inbounds float* %tmp19221, i64 1
+  %tmp19223 = getelementptr inbounds float* %tmp19222, i64 1
+  %tmp19224 = getelementptr inbounds float* %tmp19223, i64 1
+  %tmp19225 = getelementptr inbounds float* %tmp19224, i64 1
+  %tmp19226 = getelementptr inbounds float* %tmp19225, i64 1
+  %tmp19227 = getelementptr inbounds float* %tmp19226, i64 1
+  %tmp19228 = getelementptr inbounds float* %tmp19227, i64 1
+  %tmp19229 = getelementptr inbounds float* %tmp19228, i64 1
+  %tmp19230 = getelementptr inbounds float* %tmp19229, i64 1
+  %tmp19231 = getelementptr inbounds float* %tmp19230, i64 1
+  %tmp19232 = getelementptr inbounds float* %tmp19231, i64 1
+  %tmp19233 = getelementptr inbounds float* %tmp19232, i64 1
+  %tmp19234 = getelementptr inbounds float* %tmp19233, i64 1
+  %tmp19235 = getelementptr inbounds float* %tmp19234, i64 1
+  %tmp19236 = getelementptr inbounds float* %tmp19235, i64 1
+  %tmp19237 = getelementptr inbounds float* %tmp19236, i64 1
+  %tmp19238 = getelementptr inbounds float* %tmp19237, i64 1
+  %tmp19239 = getelementptr inbounds float* %tmp19238, i64 1
+  %tmp19240 = getelementptr inbounds float* %tmp19239, i64 1
+  %tmp19241 = getelementptr inbounds float* %tmp19240, i64 1
+  %tmp19242 = getelementptr inbounds float* %tmp19241, i64 1
+  %tmp19243 = getelementptr inbounds float* %tmp19242, i64 1
+  %tmp19244 = getelementptr inbounds float* %tmp19243, i64 1
+  %tmp19245 = getelementptr inbounds float* %tmp19244, i64 1
+  %tmp19246 = getelementptr inbounds float* %tmp19245, i64 1
+  %tmp19247 = getelementptr inbounds float* %tmp19246, i64 1
+  %tmp19248 = getelementptr inbounds float* %tmp19247, i64 1
+  %tmp19249 = getelementptr inbounds float* %tmp19248, i64 1
+  %tmp19250 = getelementptr inbounds float* %tmp19249, i64 1
+  %tmp19251 = getelementptr inbounds float* %tmp19250, i64 1
+  %tmp19252 = getelementptr inbounds float* %tmp19251, i64 1
+  %tmp19253 = getelementptr inbounds float* %tmp19252, i64 1
+  %tmp19254 = getelementptr inbounds float* %tmp19253, i64 1
+  %tmp19255 = getelementptr inbounds float* %tmp19254, i64 1
+  %tmp19256 = getelementptr inbounds float* %tmp19255, i64 1
+  %tmp19257 = getelementptr inbounds float* %tmp19256, i64 1
+  %tmp19258 = getelementptr inbounds float* %tmp19257, i64 1
+  %tmp19259 = getelementptr inbounds float* %tmp19258, i64 1
+  %tmp19260 = getelementptr inbounds float* %tmp19259, i64 1
+  %tmp19261 = getelementptr inbounds float* %tmp19260, i64 1
+  %tmp19262 = getelementptr inbounds float* %tmp19261, i64 1
+  %tmp19263 = getelementptr inbounds float* %tmp19262, i64 1
+  %tmp19264 = getelementptr inbounds float* %tmp19263, i64 1
+  %tmp19265 = getelementptr inbounds float* %tmp19264, i64 1
+  %tmp19266 = getelementptr inbounds float* %tmp19265, i64 1
+  %tmp19267 = getelementptr inbounds float* %tmp19266, i64 1
+  %tmp19268 = getelementptr inbounds float* %tmp19267, i64 1
+  %tmp19269 = getelementptr inbounds float* %tmp19268, i64 1
+  %tmp19270 = getelementptr inbounds float* %tmp19269, i64 1
+  %tmp19271 = getelementptr inbounds float* %tmp19270, i64 1
+  %tmp19272 = getelementptr inbounds float* %tmp19271, i64 1
+  %tmp19273 = getelementptr inbounds float* %tmp19272, i64 1
+  %tmp19274 = getelementptr inbounds float* %tmp19273, i64 1
+  %tmp19275 = getelementptr inbounds float* %tmp19274, i64 1
+  %tmp19276 = getelementptr inbounds float* %tmp19275, i64 1
+  %tmp19277 = getelementptr inbounds float* %tmp19276, i64 1
+  %tmp19278 = getelementptr inbounds float* %tmp19277, i64 1
+  %tmp19279 = getelementptr inbounds float* %tmp19278, i64 1
+  %tmp19280 = getelementptr inbounds float* %tmp19279, i64 1
+  %tmp19281 = getelementptr inbounds float* %tmp19280, i64 1
+  %tmp19282 = getelementptr inbounds float* %tmp19281, i64 1
+  %tmp19283 = getelementptr inbounds float* %tmp19282, i64 1
+  %tmp19284 = getelementptr inbounds float* %tmp19283, i64 1
+  %tmp19285 = getelementptr inbounds float* %tmp19284, i64 1
+  %tmp19286 = getelementptr inbounds float* %tmp19285, i64 1
+  %tmp19287 = getelementptr inbounds float* %tmp19286, i64 1
+  %tmp19288 = getelementptr inbounds float* %tmp19287, i64 1
+  %tmp19289 = getelementptr inbounds float* %tmp19288, i64 1
+  %tmp19290 = getelementptr inbounds float* %tmp19289, i64 1
+  %tmp19291 = getelementptr inbounds float* %tmp19290, i64 1
+  %tmp19292 = getelementptr inbounds float* %tmp19291, i64 1
+  %tmp19293 = getelementptr inbounds float* %tmp19292, i64 1
+  %tmp19294 = getelementptr inbounds float* %tmp19293, i64 1
+  %tmp19295 = getelementptr inbounds float* %tmp19294, i64 1
+  %tmp19296 = getelementptr inbounds float* %tmp19295, i64 1
+  %tmp19297 = getelementptr inbounds float* %tmp19296, i64 1
+  %tmp19298 = getelementptr inbounds float* %tmp19297, i64 1
+  %tmp19299 = getelementptr inbounds float* %tmp19298, i64 1
+  %tmp19300 = getelementptr inbounds float* %tmp19299, i64 1
+  %tmp19301 = getelementptr inbounds float* %tmp19300, i64 1
+  %tmp19302 = getelementptr inbounds float* %tmp19301, i64 1
+  %tmp19303 = getelementptr inbounds float* %tmp19302, i64 1
+  %tmp19304 = getelementptr inbounds float* %tmp19303, i64 1
+  %tmp19305 = getelementptr inbounds float* %tmp19304, i64 1
+  %tmp19306 = getelementptr inbounds float* %tmp19305, i64 1
+  %tmp19307 = getelementptr inbounds float* %tmp19306, i64 1
+  %tmp19308 = getelementptr inbounds float* %tmp19307, i64 1
+  %tmp19309 = getelementptr inbounds float* %tmp19308, i64 1
+  %tmp19310 = getelementptr inbounds float* %tmp19309, i64 1
+  %tmp19311 = getelementptr inbounds float* %tmp19310, i64 1
+  %tmp19312 = getelementptr inbounds float* %tmp19311, i64 1
+  %tmp19313 = getelementptr inbounds float* %tmp19312, i64 1
+  %tmp19314 = getelementptr inbounds float* %tmp19313, i64 1
+  %tmp19315 = getelementptr inbounds float* %tmp19314, i64 1
+  %tmp19316 = getelementptr inbounds float* %tmp19315, i64 1
+  %tmp19317 = getelementptr inbounds float* %tmp19316, i64 1
+  %tmp19318 = getelementptr inbounds float* %tmp19317, i64 1
+  %tmp19319 = getelementptr inbounds float* %tmp19318, i64 1
+  %tmp19320 = getelementptr inbounds float* %tmp19319, i64 1
+  %tmp19321 = getelementptr inbounds float* %tmp19320, i64 1
+  %tmp19322 = getelementptr inbounds float* %tmp19321, i64 1
+  %tmp19323 = getelementptr inbounds float* %tmp19322, i64 1
+  %tmp19324 = getelementptr inbounds float* %tmp19323, i64 1
+  %tmp19325 = getelementptr inbounds float* %tmp19324, i64 1
+  %tmp19326 = getelementptr inbounds float* %tmp19325, i64 1
+  %tmp19327 = getelementptr inbounds float* %tmp19326, i64 1
+  %tmp19328 = getelementptr inbounds float* %tmp19327, i64 1
+  %tmp19329 = getelementptr inbounds float* %tmp19328, i64 1
+  %tmp19330 = getelementptr inbounds float* %tmp19329, i64 1
+  %tmp19331 = getelementptr inbounds float* %tmp19330, i64 1
+  %tmp19332 = getelementptr inbounds float* %tmp19331, i64 1
+  %tmp19333 = getelementptr inbounds float* %tmp19332, i64 1
+  %tmp19334 = getelementptr inbounds float* %tmp19333, i64 1
+  %tmp19335 = getelementptr inbounds float* %tmp19334, i64 1
+  %tmp19336 = getelementptr inbounds float* %tmp19335, i64 1
+  %tmp19337 = getelementptr inbounds float* %tmp19336, i64 1
+  %tmp19338 = getelementptr inbounds float* %tmp19337, i64 1
+  %tmp19339 = getelementptr inbounds float* %tmp19338, i64 1
+  %tmp19340 = getelementptr inbounds float* %tmp19339, i64 1
+  %tmp19341 = getelementptr inbounds float* %tmp19340, i64 1
+  %tmp19342 = getelementptr inbounds float* %tmp19341, i64 1
+  %tmp19343 = getelementptr inbounds float* %tmp19342, i64 1
+  %tmp19344 = getelementptr inbounds float* %tmp19343, i64 1
+  %tmp19345 = getelementptr inbounds float* %tmp19344, i64 1
+  %tmp19346 = getelementptr inbounds float* %tmp19345, i64 1
+  %tmp19347 = getelementptr inbounds float* %tmp19346, i64 1
+  %tmp19348 = getelementptr inbounds float* %tmp19347, i64 1
+  %tmp19349 = getelementptr inbounds float* %tmp19348, i64 1
+  %tmp19350 = getelementptr inbounds float* %tmp19349, i64 1
+  %tmp19351 = getelementptr inbounds float* %tmp19350, i64 1
+  %tmp19352 = getelementptr inbounds float* %tmp19351, i64 1
+  %tmp19353 = getelementptr inbounds float* %tmp19352, i64 1
+  %tmp19354 = getelementptr inbounds float* %tmp19353, i64 1
+  %tmp19355 = getelementptr inbounds float* %tmp19354, i64 1
+  %tmp19356 = getelementptr inbounds float* %tmp19355, i64 1
+  %tmp19357 = getelementptr inbounds float* %tmp19356, i64 1
+  %tmp19358 = getelementptr inbounds float* %tmp19357, i64 1
+  %tmp19359 = getelementptr inbounds float* %tmp19358, i64 1
+  %tmp19360 = getelementptr inbounds float* %tmp19359, i64 1
+  %tmp19361 = getelementptr inbounds float* %tmp19360, i64 1
+  %tmp19362 = getelementptr inbounds float* %tmp19361, i64 1
+  %tmp19363 = getelementptr inbounds float* %tmp19362, i64 1
+  %tmp19364 = getelementptr inbounds float* %tmp19363, i64 1
+  %tmp19365 = getelementptr inbounds float* %tmp19364, i64 1
+  %tmp19366 = getelementptr inbounds float* %tmp19365, i64 1
+  %tmp19367 = getelementptr inbounds float* %tmp19366, i64 1
+  %tmp19368 = getelementptr inbounds float* %tmp19367, i64 1
+  %tmp19369 = getelementptr inbounds float* %tmp19368, i64 1
+  %tmp19370 = getelementptr inbounds float* %tmp19369, i64 1
+  %tmp19371 = getelementptr inbounds float* %tmp19370, i64 1
+  %tmp19372 = getelementptr inbounds float* %tmp19371, i64 1
+  %tmp19373 = getelementptr inbounds float* %tmp19372, i64 1
+  %tmp19374 = getelementptr inbounds float* %tmp19373, i64 1
+  %tmp19375 = getelementptr inbounds float* %tmp19374, i64 1
+  %tmp19376 = getelementptr inbounds float* %tmp19375, i64 1
+  %tmp19377 = getelementptr inbounds float* %tmp19376, i64 1
+  %tmp19378 = getelementptr inbounds float* %tmp19377, i64 1
+  %tmp19379 = getelementptr inbounds float* %tmp19378, i64 1
+  %tmp19380 = getelementptr inbounds float* %tmp19379, i64 1
+  %tmp19381 = getelementptr inbounds float* %tmp19380, i64 1
+  %tmp19382 = getelementptr inbounds float* %tmp19381, i64 1
+  %tmp19383 = getelementptr inbounds float* %tmp19382, i64 1
+  %tmp19384 = getelementptr inbounds float* %tmp19383, i64 1
+  %tmp19385 = getelementptr inbounds float* %tmp19384, i64 1
+  %tmp19386 = getelementptr inbounds float* %tmp19385, i64 1
+  %tmp19387 = getelementptr inbounds float* %tmp19386, i64 1
+  %tmp19388 = getelementptr inbounds float* %tmp19387, i64 1
+  %tmp19389 = getelementptr inbounds float* %tmp19388, i64 1
+  %tmp19390 = getelementptr inbounds float* %tmp19389, i64 1
+  %tmp19391 = getelementptr inbounds float* %tmp19390, i64 1
+  %tmp19392 = getelementptr inbounds float* %tmp19391, i64 1
+  %tmp19393 = getelementptr inbounds float* %tmp19392, i64 1
+  %tmp19394 = getelementptr inbounds float* %tmp19393, i64 1
+  %tmp19395 = getelementptr inbounds float* %tmp19394, i64 1
+  %tmp19396 = getelementptr inbounds float* %tmp19395, i64 1
+  %tmp19397 = getelementptr inbounds float* %tmp19396, i64 1
+  %tmp19398 = getelementptr inbounds float* %tmp19397, i64 1
+  %tmp19399 = getelementptr inbounds float* %tmp19398, i64 1
+  %tmp19400 = getelementptr inbounds float* %tmp19399, i64 1
+  %tmp19401 = getelementptr inbounds float* %tmp19400, i64 1
+  %tmp19402 = getelementptr inbounds float* %tmp19401, i64 1
+  %tmp19403 = getelementptr inbounds float* %tmp19402, i64 1
+  %tmp19404 = getelementptr inbounds float* %tmp19403, i64 1
+  %tmp19405 = getelementptr inbounds float* %tmp19404, i64 1
+  %tmp19406 = getelementptr inbounds float* %tmp19405, i64 1
+  %tmp19407 = getelementptr inbounds float* %tmp19406, i64 1
+  %tmp19408 = getelementptr inbounds float* %tmp19407, i64 1
+  %tmp19409 = getelementptr inbounds float* %tmp19408, i64 1
+  %tmp19410 = getelementptr inbounds float* %tmp19409, i64 1
+  %tmp19411 = getelementptr inbounds float* %tmp19410, i64 1
+  %tmp19412 = getelementptr inbounds float* %tmp19411, i64 1
+  %tmp19413 = getelementptr inbounds float* %tmp19412, i64 1
+  %tmp19414 = getelementptr inbounds float* %tmp19413, i64 1
+  %tmp19415 = getelementptr inbounds float* %tmp19414, i64 1
+  %tmp19416 = getelementptr inbounds float* %tmp19415, i64 1
+  %tmp19417 = getelementptr inbounds float* %tmp19416, i64 1
+  %tmp19418 = getelementptr inbounds float* %tmp19417, i64 1
+  %tmp19419 = getelementptr inbounds float* %tmp19418, i64 1
+  %tmp19420 = getelementptr inbounds float* %tmp19419, i64 1
+  %tmp19421 = getelementptr inbounds float* %tmp19420, i64 1
+  %tmp19422 = getelementptr inbounds float* %tmp19421, i64 1
+  %tmp19423 = getelementptr inbounds float* %tmp19422, i64 1
+  %tmp19424 = getelementptr inbounds float* %tmp19423, i64 1
+  %tmp19425 = getelementptr inbounds float* %tmp19424, i64 1
+  %tmp19426 = getelementptr inbounds float* %tmp19425, i64 1
+  %tmp19427 = getelementptr inbounds float* %tmp19426, i64 1
+  %tmp19428 = getelementptr inbounds float* %tmp19427, i64 1
+  %tmp19429 = getelementptr inbounds float* %tmp19428, i64 1
+  %tmp19430 = getelementptr inbounds float* %tmp19429, i64 1
+  %tmp19431 = getelementptr inbounds float* %tmp19430, i64 1
+  %tmp19432 = getelementptr inbounds float* %tmp19431, i64 1
+  %tmp19433 = getelementptr inbounds float* %tmp19432, i64 1
+  %tmp19434 = getelementptr inbounds float* %tmp19433, i64 1
+  %tmp19435 = getelementptr inbounds float* %tmp19434, i64 1
+  %tmp19436 = getelementptr inbounds float* %tmp19435, i64 1
+  %tmp19437 = getelementptr inbounds float* %tmp19436, i64 1
+  %tmp19438 = getelementptr inbounds float* %tmp19437, i64 1
+  %tmp19439 = getelementptr inbounds float* %tmp19438, i64 1
+  %tmp19440 = getelementptr inbounds float* %tmp19439, i64 1
+  %tmp19441 = getelementptr inbounds float* %tmp19440, i64 1
+  %tmp19442 = getelementptr inbounds float* %tmp19441, i64 1
+  %tmp19443 = getelementptr inbounds float* %tmp19442, i64 1
+  %tmp19444 = getelementptr inbounds float* %tmp19443, i64 1
+  %tmp19445 = getelementptr inbounds float* %tmp19444, i64 1
+  %tmp19446 = getelementptr inbounds float* %tmp19445, i64 1
+  %tmp19447 = getelementptr inbounds float* %tmp19446, i64 1
+  %tmp19448 = getelementptr inbounds float* %tmp19447, i64 1
+  %tmp19449 = getelementptr inbounds float* %tmp19448, i64 1
+  %tmp19450 = getelementptr inbounds float* %tmp19449, i64 1
+  %tmp19451 = getelementptr inbounds float* %tmp19450, i64 1
+  %tmp19452 = getelementptr inbounds float* %tmp19451, i64 1
+  %tmp19453 = getelementptr inbounds float* %tmp19452, i64 1
+  %tmp19454 = getelementptr inbounds float* %tmp19453, i64 1
+  %tmp19455 = getelementptr inbounds float* %tmp19454, i64 1
+  %tmp19456 = getelementptr inbounds float* %tmp19455, i64 1
+  %tmp19457 = getelementptr inbounds float* %tmp19456, i64 1
+  %tmp19458 = getelementptr inbounds float* %tmp19457, i64 1
+  %tmp19459 = getelementptr inbounds float* %tmp19458, i64 1
+  %tmp19460 = getelementptr inbounds float* %tmp19459, i64 1
+  %tmp19461 = getelementptr inbounds float* %tmp19460, i64 1
+  %tmp19462 = getelementptr inbounds float* %tmp19461, i64 1
+  %tmp19463 = getelementptr inbounds float* %tmp19462, i64 1
+  %tmp19464 = getelementptr inbounds float* %tmp19463, i64 1
+  %tmp19465 = getelementptr inbounds float* %tmp19464, i64 1
+  %tmp19466 = getelementptr inbounds float* %tmp19465, i64 1
+  %tmp19467 = getelementptr inbounds float* %tmp19466, i64 1
+  %tmp19468 = getelementptr inbounds float* %tmp19467, i64 1
+  %tmp19469 = getelementptr inbounds float* %tmp19468, i64 1
+  %tmp19470 = getelementptr inbounds float* %tmp19469, i64 1
+  %tmp19471 = getelementptr inbounds float* %tmp19470, i64 1
+  %tmp19472 = getelementptr inbounds float* %tmp19471, i64 1
+  %tmp19473 = getelementptr inbounds float* %tmp19472, i64 1
+  %tmp19474 = getelementptr inbounds float* %tmp19473, i64 1
+  %tmp19475 = getelementptr inbounds float* %tmp19474, i64 1
+  %tmp19476 = getelementptr inbounds float* %tmp19475, i64 1
+  %tmp19477 = getelementptr inbounds float* %tmp19476, i64 1
+  %tmp19478 = getelementptr inbounds float* %tmp19477, i64 1
+  %tmp19479 = getelementptr inbounds float* %tmp19478, i64 1
+  %tmp19480 = getelementptr inbounds float* %tmp19479, i64 1
+  %tmp19481 = getelementptr inbounds float* %tmp19480, i64 1
+  %tmp19482 = getelementptr inbounds float* %tmp19481, i64 1
+  %tmp19483 = getelementptr inbounds float* %tmp19482, i64 1
+  %tmp19484 = getelementptr inbounds float* %tmp19483, i64 1
+  %tmp19485 = getelementptr inbounds float* %tmp19484, i64 1
+  %tmp19486 = getelementptr inbounds float* %tmp19485, i64 1
+  %tmp19487 = getelementptr inbounds float* %tmp19486, i64 1
+  %tmp19488 = getelementptr inbounds float* %tmp19487, i64 1
+  %tmp19489 = getelementptr inbounds float* %tmp19488, i64 1
+  %tmp19490 = getelementptr inbounds float* %tmp19489, i64 1
+  %tmp19491 = getelementptr inbounds float* %tmp19490, i64 1
+  %tmp19492 = getelementptr inbounds float* %tmp19491, i64 1
+  %tmp19493 = getelementptr inbounds float* %tmp19492, i64 1
+  %tmp19494 = getelementptr inbounds float* %tmp19493, i64 1
+  %tmp19495 = getelementptr inbounds float* %tmp19494, i64 1
+  %tmp19496 = getelementptr inbounds float* %tmp19495, i64 1
+  %tmp19497 = getelementptr inbounds float* %tmp19496, i64 1
+  %tmp19498 = getelementptr inbounds float* %tmp19497, i64 1
+  %tmp19499 = getelementptr inbounds float* %tmp19498, i64 1
+  %tmp19500 = getelementptr inbounds float* %tmp19499, i64 1
+  %tmp19501 = getelementptr inbounds float* %tmp19500, i64 1
+  %tmp19502 = getelementptr inbounds float* %tmp19501, i64 1
+  %tmp19503 = getelementptr inbounds float* %tmp19502, i64 1
+  %tmp19504 = getelementptr inbounds float* %tmp19503, i64 1
+  %tmp19505 = getelementptr inbounds float* %tmp19504, i64 1
+  %tmp19506 = getelementptr inbounds float* %tmp19505, i64 1
+  %tmp19507 = getelementptr inbounds float* %tmp19506, i64 1
+  %tmp19508 = getelementptr inbounds float* %tmp19507, i64 1
+  %tmp19509 = getelementptr inbounds float* %tmp19508, i64 1
+  %tmp19510 = getelementptr inbounds float* %tmp19509, i64 1
+  %tmp19511 = getelementptr inbounds float* %tmp19510, i64 1
+  %tmp19512 = getelementptr inbounds float* %tmp19511, i64 1
+  %tmp19513 = getelementptr inbounds float* %tmp19512, i64 1
+  %tmp19514 = getelementptr inbounds float* %tmp19513, i64 1
+  %tmp19515 = getelementptr inbounds float* %tmp19514, i64 1
+  %tmp19516 = getelementptr inbounds float* %tmp19515, i64 1
+  %tmp19517 = getelementptr inbounds float* %tmp19516, i64 1
+  %tmp19518 = getelementptr inbounds float* %tmp19517, i64 1
+  %tmp19519 = getelementptr inbounds float* %tmp19518, i64 1
+  %tmp19520 = getelementptr inbounds float* %tmp19519, i64 1
+  %tmp19521 = getelementptr inbounds float* %tmp19520, i64 1
+  %tmp19522 = getelementptr inbounds float* %tmp19521, i64 1
+  %tmp19523 = getelementptr inbounds float* %tmp19522, i64 1
+  %tmp19524 = getelementptr inbounds float* %tmp19523, i64 1
+  %tmp19525 = getelementptr inbounds float* %tmp19524, i64 1
+  %tmp19526 = getelementptr inbounds float* %tmp19525, i64 1
+  %tmp19527 = getelementptr inbounds float* %tmp19526, i64 1
+  %tmp19528 = getelementptr inbounds float* %tmp19527, i64 1
+  %tmp19529 = getelementptr inbounds float* %tmp19528, i64 1
+  %tmp19530 = getelementptr inbounds float* %tmp19529, i64 1
+  %tmp19531 = getelementptr inbounds float* %tmp19530, i64 1
+  %tmp19532 = getelementptr inbounds float* %tmp19531, i64 1
+  %tmp19533 = getelementptr inbounds float* %tmp19532, i64 1
+  %tmp19534 = getelementptr inbounds float* %tmp19533, i64 1
+  %tmp19535 = getelementptr inbounds float* %tmp19534, i64 1
+  %tmp19536 = getelementptr inbounds float* %tmp19535, i64 1
+  %tmp19537 = getelementptr inbounds float* %tmp19536, i64 1
+  %tmp19538 = getelementptr inbounds float* %tmp19537, i64 1
+  %tmp19539 = getelementptr inbounds float* %tmp19538, i64 1
+  %tmp19540 = getelementptr inbounds float* %tmp19539, i64 1
+  %tmp19541 = getelementptr inbounds float* %tmp19540, i64 1
+  %tmp19542 = getelementptr inbounds float* %tmp19541, i64 1
+  %tmp19543 = getelementptr inbounds float* %tmp19542, i64 1
+  %tmp19544 = getelementptr inbounds float* %tmp19543, i64 1
+  %tmp19545 = getelementptr inbounds float* %tmp19544, i64 1
+  %tmp19546 = getelementptr inbounds float* %tmp19545, i64 1
+  %tmp19547 = getelementptr inbounds float* %tmp19546, i64 1
+  %tmp19548 = getelementptr inbounds float* %tmp19547, i64 1
+  %tmp19549 = getelementptr inbounds float* %tmp19548, i64 1
+  %tmp19550 = getelementptr inbounds float* %tmp19549, i64 1
+  %tmp19551 = getelementptr inbounds float* %tmp19550, i64 1
+  %tmp19552 = getelementptr inbounds float* %tmp19551, i64 1
+  %tmp19553 = getelementptr inbounds float* %tmp19552, i64 1
+  %tmp19554 = getelementptr inbounds float* %tmp19553, i64 1
+  %tmp19555 = getelementptr inbounds float* %tmp19554, i64 1
+  %tmp19556 = getelementptr inbounds float* %tmp19555, i64 1
+  %tmp19557 = getelementptr inbounds float* %tmp19556, i64 1
+  %tmp19558 = getelementptr inbounds float* %tmp19557, i64 1
+  %tmp19559 = getelementptr inbounds float* %tmp19558, i64 1
+  %tmp19560 = getelementptr inbounds float* %tmp19559, i64 1
+  %tmp19561 = getelementptr inbounds float* %tmp19560, i64 1
+  %tmp19562 = getelementptr inbounds float* %tmp19561, i64 1
+  %tmp19563 = getelementptr inbounds float* %tmp19562, i64 1
+  %tmp19564 = getelementptr inbounds float* %tmp19563, i64 1
+  %tmp19565 = getelementptr inbounds float* %tmp19564, i64 1
+  %tmp19566 = getelementptr inbounds float* %tmp19565, i64 1
+  %tmp19567 = getelementptr inbounds float* %tmp19566, i64 1
+  %tmp19568 = getelementptr inbounds float* %tmp19567, i64 1
+  %tmp19569 = getelementptr inbounds float* %tmp19568, i64 1
+  %tmp19570 = getelementptr inbounds float* %tmp19569, i64 1
+  %tmp19571 = getelementptr inbounds float* %tmp19570, i64 1
+  %tmp19572 = getelementptr inbounds float* %tmp19571, i64 1
+  %tmp19573 = getelementptr inbounds float* %tmp19572, i64 1
+  %tmp19574 = getelementptr inbounds float* %tmp19573, i64 1
+  %tmp19575 = getelementptr inbounds float* %tmp19574, i64 1
+  %tmp19576 = getelementptr inbounds float* %tmp19575, i64 1
+  %tmp19577 = getelementptr inbounds float* %tmp19576, i64 1
+  %tmp19578 = getelementptr inbounds float* %tmp19577, i64 1
+  %tmp19579 = getelementptr inbounds float* %tmp19578, i64 1
+  %tmp19580 = getelementptr inbounds float* %tmp19579, i64 1
+  %tmp19581 = getelementptr inbounds float* %tmp19580, i64 1
+  %tmp19582 = getelementptr inbounds float* %tmp19581, i64 1
+  %tmp19583 = getelementptr inbounds float* %tmp19582, i64 1
+  %tmp19584 = getelementptr inbounds float* %tmp19583, i64 1
+  %tmp19585 = getelementptr inbounds float* %tmp19584, i64 1
+  %tmp19586 = getelementptr inbounds float* %tmp19585, i64 1
+  %tmp19587 = getelementptr inbounds float* %tmp19586, i64 1
+  %tmp19588 = getelementptr inbounds float* %tmp19587, i64 1
+  %tmp19589 = getelementptr inbounds float* %tmp19588, i64 1
+  %tmp19590 = getelementptr inbounds float* %tmp19589, i64 1
+  %tmp19591 = getelementptr inbounds float* %tmp19590, i64 1
+  %tmp19592 = getelementptr inbounds float* %tmp19591, i64 1
+  %tmp19593 = getelementptr inbounds float* %tmp19592, i64 1
+  %tmp19594 = getelementptr inbounds float* %tmp19593, i64 1
+  %tmp19595 = getelementptr inbounds float* %tmp19594, i64 1
+  %tmp19596 = getelementptr inbounds float* %tmp19595, i64 1
+  %tmp19597 = getelementptr inbounds float* %tmp19596, i64 1
+  %tmp19598 = getelementptr inbounds float* %tmp19597, i64 1
+  %tmp19599 = getelementptr inbounds float* %tmp19598, i64 1
+  %tmp19600 = getelementptr inbounds float* %tmp19599, i64 1
+  %tmp19601 = getelementptr inbounds float* %tmp19600, i64 1
+  %tmp19602 = getelementptr inbounds float* %tmp19601, i64 1
+  %tmp19603 = getelementptr inbounds float* %tmp19602, i64 1
+  %tmp19604 = getelementptr inbounds float* %tmp19603, i64 1
+  %tmp19605 = getelementptr inbounds float* %tmp19604, i64 1
+  %tmp19606 = getelementptr inbounds float* %tmp19605, i64 1
+  %tmp19607 = getelementptr inbounds float* %tmp19606, i64 1
+  %tmp19608 = getelementptr inbounds float* %tmp19607, i64 1
+  %tmp19609 = getelementptr inbounds float* %tmp19608, i64 1
+  %tmp19610 = getelementptr inbounds float* %tmp19609, i64 1
+  %tmp19611 = getelementptr inbounds float* %tmp19610, i64 1
+  %tmp19612 = getelementptr inbounds float* %tmp19611, i64 1
+  %tmp19613 = getelementptr inbounds float* %tmp19612, i64 1
+  %tmp19614 = getelementptr inbounds float* %tmp19613, i64 1
+  %tmp19615 = getelementptr inbounds float* %tmp19614, i64 1
+  %tmp19616 = getelementptr inbounds float* %tmp19615, i64 1
+  %tmp19617 = getelementptr inbounds float* %tmp19616, i64 1
+  %tmp19618 = getelementptr inbounds float* %tmp19617, i64 1
+  %tmp19619 = getelementptr inbounds float* %tmp19618, i64 1
+  %tmp19620 = getelementptr inbounds float* %tmp19619, i64 1
+  %tmp19621 = getelementptr inbounds float* %tmp19620, i64 1
+  %tmp19622 = getelementptr inbounds float* %tmp19621, i64 1
+  %tmp19623 = getelementptr inbounds float* %tmp19622, i64 1
+  %tmp19624 = getelementptr inbounds float* %tmp19623, i64 1
+  %tmp19625 = getelementptr inbounds float* %tmp19624, i64 1
+  %tmp19626 = getelementptr inbounds float* %tmp19625, i64 1
+  %tmp19627 = getelementptr inbounds float* %tmp19626, i64 1
+  %tmp19628 = getelementptr inbounds float* %tmp19627, i64 1
+  %tmp19629 = getelementptr inbounds float* %tmp19628, i64 1
+  %tmp19630 = getelementptr inbounds float* %tmp19629, i64 1
+  %tmp19631 = getelementptr inbounds float* %tmp19630, i64 1
+  %tmp19632 = getelementptr inbounds float* %tmp19631, i64 1
+  %tmp19633 = getelementptr inbounds float* %tmp19632, i64 1
+  %tmp19634 = getelementptr inbounds float* %tmp19633, i64 1
+  %tmp19635 = getelementptr inbounds float* %tmp19634, i64 1
+  %tmp19636 = getelementptr inbounds float* %tmp19635, i64 1
+  %tmp19637 = getelementptr inbounds float* %tmp19636, i64 1
+  %tmp19638 = getelementptr inbounds float* %tmp19637, i64 1
+  %tmp19639 = getelementptr inbounds float* %tmp19638, i64 1
+  %tmp19640 = getelementptr inbounds float* %tmp19639, i64 1
+  %tmp19641 = getelementptr inbounds float* %tmp19640, i64 1
+  %tmp19642 = getelementptr inbounds float* %tmp19641, i64 1
+  %tmp19643 = getelementptr inbounds float* %tmp19642, i64 1
+  %tmp19644 = getelementptr inbounds float* %tmp19643, i64 1
+  %tmp19645 = getelementptr inbounds float* %tmp19644, i64 1
+  %tmp19646 = getelementptr inbounds float* %tmp19645, i64 1
+  %tmp19647 = getelementptr inbounds float* %tmp19646, i64 1
+  %tmp19648 = getelementptr inbounds float* %tmp19647, i64 1
+  %tmp19649 = getelementptr inbounds float* %tmp19648, i64 1
+  %tmp19650 = getelementptr inbounds float* %tmp19649, i64 1
+  %tmp19651 = getelementptr inbounds float* %tmp19650, i64 1
+  %tmp19652 = getelementptr inbounds float* %tmp19651, i64 1
+  %tmp19653 = getelementptr inbounds float* %tmp19652, i64 1
+  %tmp19654 = getelementptr inbounds float* %tmp19653, i64 1
+  %tmp19655 = getelementptr inbounds float* %tmp19654, i64 1
+  %tmp19656 = getelementptr inbounds float* %tmp19655, i64 1
+  %tmp19657 = getelementptr inbounds float* %tmp19656, i64 1
+  %tmp19658 = getelementptr inbounds float* %tmp19657, i64 1
+  %tmp19659 = getelementptr inbounds float* %tmp19658, i64 1
+  %tmp19660 = getelementptr inbounds float* %tmp19659, i64 1
+  %tmp19661 = getelementptr inbounds float* %tmp19660, i64 1
+  %tmp19662 = getelementptr inbounds float* %tmp19661, i64 1
+  %tmp19663 = getelementptr inbounds float* %tmp19662, i64 1
+  %tmp19664 = getelementptr inbounds float* %tmp19663, i64 1
+  %tmp19665 = getelementptr inbounds float* %tmp19664, i64 1
+  %tmp19666 = getelementptr inbounds float* %tmp19665, i64 1
+  %tmp19667 = getelementptr inbounds float* %tmp19666, i64 1
+  %tmp19668 = getelementptr inbounds float* %tmp19667, i64 1
+  %tmp19669 = getelementptr inbounds float* %tmp19668, i64 1
+  %tmp19670 = getelementptr inbounds float* %tmp19669, i64 1
+  %tmp19671 = getelementptr inbounds float* %tmp19670, i64 1
+  %tmp19672 = getelementptr inbounds float* %tmp19671, i64 1
+  %tmp19673 = getelementptr inbounds float* %tmp19672, i64 1
+  %tmp19674 = getelementptr inbounds float* %tmp19673, i64 1
+  %tmp19675 = getelementptr inbounds float* %tmp19674, i64 1
+  %tmp19676 = getelementptr inbounds float* %tmp19675, i64 1
+  %tmp19677 = getelementptr inbounds float* %tmp19676, i64 1
+  %tmp19678 = getelementptr inbounds float* %tmp19677, i64 1
+  %tmp19679 = getelementptr inbounds float* %tmp19678, i64 1
+  %tmp19680 = getelementptr inbounds float* %tmp19679, i64 1
+  %tmp19681 = getelementptr inbounds float* %tmp19680, i64 1
+  %tmp19682 = getelementptr inbounds float* %tmp19681, i64 1
+  %tmp19683 = getelementptr inbounds float* %tmp19682, i64 1
+  %tmp19684 = getelementptr inbounds float* %tmp19683, i64 1
+  %tmp19685 = getelementptr inbounds float* %tmp19684, i64 1
+  %tmp19686 = getelementptr inbounds float* %tmp19685, i64 1
+  %tmp19687 = getelementptr inbounds float* %tmp19686, i64 1
+  %tmp19688 = getelementptr inbounds float* %tmp19687, i64 1
+  %tmp19689 = getelementptr inbounds float* %tmp19688, i64 1
+  %tmp19690 = getelementptr inbounds float* %tmp19689, i64 1
+  %tmp19691 = getelementptr inbounds float* %tmp19690, i64 1
+  %tmp19692 = getelementptr inbounds float* %tmp19691, i64 1
+  %tmp19693 = getelementptr inbounds float* %tmp19692, i64 1
+  %tmp19694 = getelementptr inbounds float* %tmp19693, i64 1
+  %tmp19695 = getelementptr inbounds float* %tmp19694, i64 1
+  %tmp19696 = getelementptr inbounds float* %tmp19695, i64 1
+  %tmp19697 = getelementptr inbounds float* %tmp19696, i64 1
+  %tmp19698 = getelementptr inbounds float* %tmp19697, i64 1
+  %tmp19699 = getelementptr inbounds float* %tmp19698, i64 1
+  %tmp19700 = getelementptr inbounds float* %tmp19699, i64 1
+  %tmp19701 = getelementptr inbounds float* %tmp19700, i64 1
+  %tmp19702 = getelementptr inbounds float* %tmp19701, i64 1
+  %tmp19703 = getelementptr inbounds float* %tmp19702, i64 1
+  %tmp19704 = getelementptr inbounds float* %tmp19703, i64 1
+  %tmp19705 = getelementptr inbounds float* %tmp19704, i64 1
+  %tmp19706 = getelementptr inbounds float* %tmp19705, i64 1
+  %tmp19707 = getelementptr inbounds float* %tmp19706, i64 1
+  %tmp19708 = getelementptr inbounds float* %tmp19707, i64 1
+  %tmp19709 = getelementptr inbounds float* %tmp19708, i64 1
+  %tmp19710 = getelementptr inbounds float* %tmp19709, i64 1
+  %tmp19711 = getelementptr inbounds float* %tmp19710, i64 1
+  %tmp19712 = getelementptr inbounds float* %tmp19711, i64 1
+  %tmp19713 = getelementptr inbounds float* %tmp19712, i64 1
+  %tmp19714 = getelementptr inbounds float* %tmp19713, i64 1
+  %tmp19715 = getelementptr inbounds float* %tmp19714, i64 1
+  %tmp19716 = getelementptr inbounds float* %tmp19715, i64 1
+  %tmp19717 = getelementptr inbounds float* %tmp19716, i64 1
+  %tmp19718 = getelementptr inbounds float* %tmp19717, i64 1
+  %tmp19719 = getelementptr inbounds float* %tmp19718, i64 1
+  %tmp19720 = getelementptr inbounds float* %tmp19719, i64 1
+  %tmp19721 = getelementptr inbounds float* %tmp19720, i64 1
+  %tmp19722 = getelementptr inbounds float* %tmp19721, i64 1
+  %tmp19723 = getelementptr inbounds float* %tmp19722, i64 1
+  %tmp19724 = getelementptr inbounds float* %tmp19723, i64 1
+  %tmp19725 = getelementptr inbounds float* %tmp19724, i64 1
+  %tmp19726 = getelementptr inbounds float* %tmp19725, i64 1
+  %tmp19727 = getelementptr inbounds float* %tmp19726, i64 1
+  %tmp19728 = getelementptr inbounds float* %tmp19727, i64 1
+  %tmp19729 = getelementptr inbounds float* %tmp19728, i64 1
+  %tmp19730 = getelementptr inbounds float* %tmp19729, i64 1
+  %tmp19731 = getelementptr inbounds float* %tmp19730, i64 1
+  %tmp19732 = getelementptr inbounds float* %tmp19731, i64 1
+  %tmp19733 = getelementptr inbounds float* %tmp19732, i64 1
+  %tmp19734 = getelementptr inbounds float* %tmp19733, i64 1
+  %tmp19735 = getelementptr inbounds float* %tmp19734, i64 1
+  %tmp19736 = getelementptr inbounds float* %tmp19735, i64 1
+  %tmp19737 = getelementptr inbounds float* %tmp19736, i64 1
+  %tmp19738 = getelementptr inbounds float* %tmp19737, i64 1
+  %tmp19739 = getelementptr inbounds float* %tmp19738, i64 1
+  %tmp19740 = getelementptr inbounds float* %tmp19739, i64 1
+  %tmp19741 = getelementptr inbounds float* %tmp19740, i64 1
+  %tmp19742 = getelementptr inbounds float* %tmp19741, i64 1
+  %tmp19743 = getelementptr inbounds float* %tmp19742, i64 1
+  %tmp19744 = getelementptr inbounds float* %tmp19743, i64 1
+  %tmp19745 = getelementptr inbounds float* %tmp19744, i64 1
+  %tmp19746 = getelementptr inbounds float* %tmp19745, i64 1
+  %tmp19747 = getelementptr inbounds float* %tmp19746, i64 1
+  %tmp19748 = getelementptr inbounds float* %tmp19747, i64 1
+  %tmp19749 = getelementptr inbounds float* %tmp19748, i64 1
+  %tmp19750 = getelementptr inbounds float* %tmp19749, i64 1
+  %tmp19751 = getelementptr inbounds float* %tmp19750, i64 1
+  %tmp19752 = getelementptr inbounds float* %tmp19751, i64 1
+  %tmp19753 = getelementptr inbounds float* %tmp19752, i64 1
+  %tmp19754 = getelementptr inbounds float* %tmp19753, i64 1
+  %tmp19755 = getelementptr inbounds float* %tmp19754, i64 1
+  %tmp19756 = getelementptr inbounds float* %tmp19755, i64 1
+  %tmp19757 = getelementptr inbounds float* %tmp19756, i64 1
+  %tmp19758 = getelementptr inbounds float* %tmp19757, i64 1
+  %tmp19759 = getelementptr inbounds float* %tmp19758, i64 1
+  %tmp19760 = getelementptr inbounds float* %tmp19759, i64 1
+  %tmp19761 = getelementptr inbounds float* %tmp19760, i64 1
+  %tmp19762 = getelementptr inbounds float* %tmp19761, i64 1
+  %tmp19763 = getelementptr inbounds float* %tmp19762, i64 1
+  %tmp19764 = getelementptr inbounds float* %tmp19763, i64 1
+  %tmp19765 = getelementptr inbounds float* %tmp19764, i64 1
+  %tmp19766 = getelementptr inbounds float* %tmp19765, i64 1
+  %tmp19767 = getelementptr inbounds float* %tmp19766, i64 1
+  %tmp19768 = getelementptr inbounds float* %tmp19767, i64 1
+  %tmp19769 = getelementptr inbounds float* %tmp19768, i64 1
+  %tmp19770 = getelementptr inbounds float* %tmp19769, i64 1
+  %tmp19771 = getelementptr inbounds float* %tmp19770, i64 1
+  %tmp19772 = getelementptr inbounds float* %tmp19771, i64 1
+  %tmp19773 = getelementptr inbounds float* %tmp19772, i64 1
+  %tmp19774 = getelementptr inbounds float* %tmp19773, i64 1
+  %tmp19775 = getelementptr inbounds float* %tmp19774, i64 1
+  %tmp19776 = getelementptr inbounds float* %tmp19775, i64 1
+  %tmp19777 = getelementptr inbounds float* %tmp19776, i64 1
+  %tmp19778 = getelementptr inbounds float* %tmp19777, i64 1
+  %tmp19779 = getelementptr inbounds float* %tmp19778, i64 1
+  %tmp19780 = getelementptr inbounds float* %tmp19779, i64 1
+  %tmp19781 = getelementptr inbounds float* %tmp19780, i64 1
+  %tmp19782 = getelementptr inbounds float* %tmp19781, i64 1
+  %tmp19783 = getelementptr inbounds float* %tmp19782, i64 1
+  %tmp19784 = getelementptr inbounds float* %tmp19783, i64 1
+  %tmp19785 = getelementptr inbounds float* %tmp19784, i64 1
+  %tmp19786 = getelementptr inbounds float* %tmp19785, i64 1
+  %tmp19787 = getelementptr inbounds float* %tmp19786, i64 1
+  %tmp19788 = getelementptr inbounds float* %tmp19787, i64 1
+  %tmp19789 = getelementptr inbounds float* %tmp19788, i64 1
+  %tmp19790 = getelementptr inbounds float* %tmp19789, i64 1
+  %tmp19791 = getelementptr inbounds float* %tmp19790, i64 1
+  %tmp19792 = getelementptr inbounds float* %tmp19791, i64 1
+  %tmp19793 = getelementptr inbounds float* %tmp19792, i64 1
+  %tmp19794 = getelementptr inbounds float* %tmp19793, i64 1
+  %tmp19795 = getelementptr inbounds float* %tmp19794, i64 1
+  %tmp19796 = getelementptr inbounds float* %tmp19795, i64 1
+  %tmp19797 = getelementptr inbounds float* %tmp19796, i64 1
+  %tmp19798 = getelementptr inbounds float* %tmp19797, i64 1
+  %tmp19799 = getelementptr inbounds float* %tmp19798, i64 1
+  %tmp19800 = getelementptr inbounds float* %tmp19799, i64 1
+  %tmp19801 = getelementptr inbounds float* %tmp19800, i64 1
+  %tmp19802 = getelementptr inbounds float* %tmp19801, i64 1
+  %tmp19803 = getelementptr inbounds float* %tmp19802, i64 1
+  %tmp19804 = getelementptr inbounds float* %tmp19803, i64 1
+  %tmp19805 = getelementptr inbounds float* %tmp19804, i64 1
+  %tmp19806 = getelementptr inbounds float* %tmp19805, i64 1
+  %tmp19807 = getelementptr inbounds float* %tmp19806, i64 1
+  %tmp19808 = getelementptr inbounds float* %tmp19807, i64 1
+  %tmp19809 = getelementptr inbounds float* %tmp19808, i64 1
+  %tmp19810 = getelementptr inbounds float* %tmp19809, i64 1
+  %tmp19811 = getelementptr inbounds float* %tmp19810, i64 1
+  %tmp19812 = getelementptr inbounds float* %tmp19811, i64 1
+  %tmp19813 = getelementptr inbounds float* %tmp19812, i64 1
+  %tmp19814 = getelementptr inbounds float* %tmp19813, i64 1
+  %tmp19815 = getelementptr inbounds float* %tmp19814, i64 1
+  %tmp19816 = getelementptr inbounds float* %tmp19815, i64 1
+  %tmp19817 = getelementptr inbounds float* %tmp19816, i64 1
+  %tmp19818 = getelementptr inbounds float* %tmp19817, i64 1
+  %tmp19819 = getelementptr inbounds float* %tmp19818, i64 1
+  %tmp19820 = getelementptr inbounds float* %tmp19819, i64 1
+  %tmp19821 = getelementptr inbounds float* %tmp19820, i64 1
+  %tmp19822 = getelementptr inbounds float* %tmp19821, i64 1
+  %tmp19823 = getelementptr inbounds float* %tmp19822, i64 1
+  %tmp19824 = getelementptr inbounds float* %tmp19823, i64 1
+  %tmp19825 = getelementptr inbounds float* %tmp19824, i64 1
+  %tmp19826 = getelementptr inbounds float* %tmp19825, i64 1
+  %tmp19827 = getelementptr inbounds float* %tmp19826, i64 1
+  %tmp19828 = getelementptr inbounds float* %tmp19827, i64 1
+  %tmp19829 = getelementptr inbounds float* %tmp19828, i64 1
+  %tmp19830 = getelementptr inbounds float* %tmp19829, i64 1
+  %tmp19831 = getelementptr inbounds float* %tmp19830, i64 1
+  %tmp19832 = getelementptr inbounds float* %tmp19831, i64 1
+  %tmp19833 = getelementptr inbounds float* %tmp19832, i64 1
+  %tmp19834 = getelementptr inbounds float* %tmp19833, i64 1
+  %tmp19835 = getelementptr inbounds float* %tmp19834, i64 1
+  %tmp19836 = getelementptr inbounds float* %tmp19835, i64 1
+  %tmp19837 = getelementptr inbounds float* %tmp19836, i64 1
+  %tmp19838 = getelementptr inbounds float* %tmp19837, i64 1
+  %tmp19839 = getelementptr inbounds float* %tmp19838, i64 1
+  %tmp19840 = getelementptr inbounds float* %tmp19839, i64 1
+  %tmp19841 = getelementptr inbounds float* %tmp19840, i64 1
+  %tmp19842 = getelementptr inbounds float* %tmp19841, i64 1
+  %tmp19843 = getelementptr inbounds float* %tmp19842, i64 1
+  %tmp19844 = getelementptr inbounds float* %tmp19843, i64 1
+  %tmp19845 = getelementptr inbounds float* %tmp19844, i64 1
+  %tmp19846 = getelementptr inbounds float* %tmp19845, i64 1
+  %tmp19847 = getelementptr inbounds float* %tmp19846, i64 1
+  %tmp19848 = getelementptr inbounds float* %tmp19847, i64 1
+  %tmp19849 = getelementptr inbounds float* %tmp19848, i64 1
+  %tmp19850 = getelementptr inbounds float* %tmp19849, i64 1
+  %tmp19851 = getelementptr inbounds float* %tmp19850, i64 1
+  %tmp19852 = getelementptr inbounds float* %tmp19851, i64 1
+  %tmp19853 = getelementptr inbounds float* %tmp19852, i64 1
+  %tmp19854 = getelementptr inbounds float* %tmp19853, i64 1
+  %tmp19855 = getelementptr inbounds float* %tmp19854, i64 1
+  %tmp19856 = getelementptr inbounds float* %tmp19855, i64 1
+  %tmp19857 = getelementptr inbounds float* %tmp19856, i64 1
+  %tmp19858 = getelementptr inbounds float* %tmp19857, i64 1
+  %tmp19859 = getelementptr inbounds float* %tmp19858, i64 1
+  %tmp19860 = getelementptr inbounds float* %tmp19859, i64 1
+  %tmp19861 = getelementptr inbounds float* %tmp19860, i64 1
+  %tmp19862 = getelementptr inbounds float* %tmp19861, i64 1
+  %tmp19863 = getelementptr inbounds float* %tmp19862, i64 1
+  %tmp19864 = getelementptr inbounds float* %tmp19863, i64 1
+  %tmp19865 = getelementptr inbounds float* %tmp19864, i64 1
+  %tmp19866 = getelementptr inbounds float* %tmp19865, i64 1
+  %tmp19867 = getelementptr inbounds float* %tmp19866, i64 1
+  %tmp19868 = getelementptr inbounds float* %tmp19867, i64 1
+  %tmp19869 = getelementptr inbounds float* %tmp19868, i64 1
+  %tmp19870 = getelementptr inbounds float* %tmp19869, i64 1
+  %tmp19871 = getelementptr inbounds float* %tmp19870, i64 1
+  %tmp19872 = getelementptr inbounds float* %tmp19871, i64 1
+  %tmp19873 = getelementptr inbounds float* %tmp19872, i64 1
+  %tmp19874 = getelementptr inbounds float* %tmp19873, i64 1
+  %tmp19875 = getelementptr inbounds float* %tmp19874, i64 1
+  %tmp19876 = getelementptr inbounds float* %tmp19875, i64 1
+  %tmp19877 = getelementptr inbounds float* %tmp19876, i64 1
+  %tmp19878 = getelementptr inbounds float* %tmp19877, i64 1
+  %tmp19879 = getelementptr inbounds float* %tmp19878, i64 1
+  %tmp19880 = getelementptr inbounds float* %tmp19879, i64 1
+  %tmp19881 = getelementptr inbounds float* %tmp19880, i64 1
+  %tmp19882 = getelementptr inbounds float* %tmp19881, i64 1
+  %tmp19883 = getelementptr inbounds float* %tmp19882, i64 1
+  %tmp19884 = getelementptr inbounds float* %tmp19883, i64 1
+  %tmp19885 = getelementptr inbounds float* %tmp19884, i64 1
+  %tmp19886 = getelementptr inbounds float* %tmp19885, i64 1
+  %tmp19887 = getelementptr inbounds float* %tmp19886, i64 1
+  %tmp19888 = getelementptr inbounds float* %tmp19887, i64 1
+  %tmp19889 = getelementptr inbounds float* %tmp19888, i64 1
+  %tmp19890 = getelementptr inbounds float* %tmp19889, i64 1
+  %tmp19891 = getelementptr inbounds float* %tmp19890, i64 1
+  %tmp19892 = getelementptr inbounds float* %tmp19891, i64 1
+  %tmp19893 = getelementptr inbounds float* %tmp19892, i64 1
+  %tmp19894 = getelementptr inbounds float* %tmp19893, i64 1
+  %tmp19895 = getelementptr inbounds float* %tmp19894, i64 1
+  %tmp19896 = getelementptr inbounds float* %tmp19895, i64 1
+  %tmp19897 = getelementptr inbounds float* %tmp19896, i64 1
+  %tmp19898 = getelementptr inbounds float* %tmp19897, i64 1
+  %tmp19899 = getelementptr inbounds float* %tmp19898, i64 1
+  %tmp19900 = getelementptr inbounds float* %tmp19899, i64 1
+  %tmp19901 = getelementptr inbounds float* %tmp19900, i64 1
+  %tmp19902 = getelementptr inbounds float* %tmp19901, i64 1
+  %tmp19903 = getelementptr inbounds float* %tmp19902, i64 1
+  %tmp19904 = getelementptr inbounds float* %tmp19903, i64 1
+  %tmp19905 = getelementptr inbounds float* %tmp19904, i64 1
+  %tmp19906 = getelementptr inbounds float* %tmp19905, i64 1
+  %tmp19907 = getelementptr inbounds float* %tmp19906, i64 1
+  %tmp19908 = getelementptr inbounds float* %tmp19907, i64 1
+  %tmp19909 = getelementptr inbounds float* %tmp19908, i64 1
+  %tmp19910 = getelementptr inbounds float* %tmp19909, i64 1
+  %tmp19911 = getelementptr inbounds float* %tmp19910, i64 1
+  %tmp19912 = getelementptr inbounds float* %tmp19911, i64 1
+  %tmp19913 = getelementptr inbounds float* %tmp19912, i64 1
+  %tmp19914 = getelementptr inbounds float* %tmp19913, i64 1
+  %tmp19915 = getelementptr inbounds float* %tmp19914, i64 1
+  %tmp19916 = getelementptr inbounds float* %tmp19915, i64 1
+  %tmp19917 = getelementptr inbounds float* %tmp19916, i64 1
+  %tmp19918 = getelementptr inbounds float* %tmp19917, i64 1
+  %tmp19919 = getelementptr inbounds float* %tmp19918, i64 1
+  %tmp19920 = getelementptr inbounds float* %tmp19919, i64 1
+  %tmp19921 = getelementptr inbounds float* %tmp19920, i64 1
+  %tmp19922 = getelementptr inbounds float* %tmp19921, i64 1
+  %tmp19923 = getelementptr inbounds float* %tmp19922, i64 1
+  %tmp19924 = getelementptr inbounds float* %tmp19923, i64 1
+  %tmp19925 = getelementptr inbounds float* %tmp19924, i64 1
+  %tmp19926 = getelementptr inbounds float* %tmp19925, i64 1
+  %tmp19927 = getelementptr inbounds float* %tmp19926, i64 1
+  %tmp19928 = getelementptr inbounds float* %tmp19927, i64 1
+  %tmp19929 = getelementptr inbounds float* %tmp19928, i64 1
+  %tmp19930 = getelementptr inbounds float* %tmp19929, i64 1
+  %tmp19931 = getelementptr inbounds float* %tmp19930, i64 1
+  %tmp19932 = getelementptr inbounds float* %tmp19931, i64 1
+  %tmp19933 = getelementptr inbounds float* %tmp19932, i64 1
+  %tmp19934 = getelementptr inbounds float* %tmp19933, i64 1
+  %tmp19935 = getelementptr inbounds float* %tmp19934, i64 1
+  %tmp19936 = getelementptr inbounds float* %tmp19935, i64 1
+  %tmp19937 = getelementptr inbounds float* %tmp19936, i64 1
+  %tmp19938 = getelementptr inbounds float* %tmp19937, i64 1
+  %tmp19939 = getelementptr inbounds float* %tmp19938, i64 1
+  %tmp19940 = getelementptr inbounds float* %tmp19939, i64 1
+  %tmp19941 = getelementptr inbounds float* %tmp19940, i64 1
+  %tmp19942 = getelementptr inbounds float* %tmp19941, i64 1
+  %tmp19943 = getelementptr inbounds float* %tmp19942, i64 1
+  %tmp19944 = getelementptr inbounds float* %tmp19943, i64 1
+  %tmp19945 = getelementptr inbounds float* %tmp19944, i64 1
+  %tmp19946 = getelementptr inbounds float* %tmp19945, i64 1
+  %tmp19947 = getelementptr inbounds float* %tmp19946, i64 1
+  %tmp19948 = getelementptr inbounds float* %tmp19947, i64 1
+  %tmp19949 = getelementptr inbounds float* %tmp19948, i64 1
+  %tmp19950 = getelementptr inbounds float* %tmp19949, i64 1
+  %tmp19951 = getelementptr inbounds float* %tmp19950, i64 1
+  %tmp19952 = getelementptr inbounds float* %tmp19951, i64 1
+  %tmp19953 = getelementptr inbounds float* %tmp19952, i64 1
+  %tmp19954 = getelementptr inbounds float* %tmp19953, i64 1
+  %tmp19955 = getelementptr inbounds float* %tmp19954, i64 1
+  %tmp19956 = getelementptr inbounds float* %tmp19955, i64 1
+  %tmp19957 = getelementptr inbounds float* %tmp19956, i64 1
+  %tmp19958 = getelementptr inbounds float* %tmp19957, i64 1
+  %tmp19959 = getelementptr inbounds float* %tmp19958, i64 1
+  %tmp19960 = getelementptr inbounds float* %tmp19959, i64 1
+  %tmp19961 = getelementptr inbounds float* %tmp19960, i64 1
+  %tmp19962 = getelementptr inbounds float* %tmp19961, i64 1
+  %tmp19963 = getelementptr inbounds float* %tmp19962, i64 1
+  %tmp19964 = getelementptr inbounds float* %tmp19963, i64 1
+  %tmp19965 = getelementptr inbounds float* %tmp19964, i64 1
+  %tmp19966 = getelementptr inbounds float* %tmp19965, i64 1
+  %tmp19967 = getelementptr inbounds float* %tmp19966, i64 1
+  %tmp19968 = getelementptr inbounds float* %tmp19967, i64 1
+  %tmp19969 = getelementptr inbounds float* %tmp19968, i64 1
+  %tmp19970 = getelementptr inbounds float* %tmp19969, i64 1
+  %tmp19971 = getelementptr inbounds float* %tmp19970, i64 1
+  %tmp19972 = getelementptr inbounds float* %tmp19971, i64 1
+  %tmp19973 = getelementptr inbounds float* %tmp19972, i64 1
+  %tmp19974 = getelementptr inbounds float* %tmp19973, i64 1
+  %tmp19975 = getelementptr inbounds float* %tmp19974, i64 1
+  %tmp19976 = getelementptr inbounds float* %tmp19975, i64 1
+  %tmp19977 = getelementptr inbounds float* %tmp19976, i64 1
+  %tmp19978 = getelementptr inbounds float* %tmp19977, i64 1
+  %tmp19979 = getelementptr inbounds float* %tmp19978, i64 1
+  %tmp19980 = getelementptr inbounds float* %tmp19979, i64 1
+  %tmp19981 = getelementptr inbounds float* %tmp19980, i64 1
+  %tmp19982 = getelementptr inbounds float* %tmp19981, i64 1
+  %tmp19983 = getelementptr inbounds float* %tmp19982, i64 1
+  %tmp19984 = getelementptr inbounds float* %tmp19983, i64 1
+  %tmp19985 = getelementptr inbounds float* %tmp19984, i64 1
+  %tmp19986 = getelementptr inbounds float* %tmp19985, i64 1
+  %tmp19987 = getelementptr inbounds float* %tmp19986, i64 1
+  %tmp19988 = getelementptr inbounds float* %tmp19987, i64 1
+  %tmp19989 = getelementptr inbounds float* %tmp19988, i64 1
+  %tmp19990 = getelementptr inbounds float* %tmp19989, i64 1
+  %tmp19991 = getelementptr inbounds float* %tmp19990, i64 1
+  %tmp19992 = getelementptr inbounds float* %tmp19991, i64 1
+  %tmp19993 = getelementptr inbounds float* %tmp19992, i64 1
+  %tmp19994 = getelementptr inbounds float* %tmp19993, i64 1
+  %tmp19995 = getelementptr inbounds float* %tmp19994, i64 1
+  %tmp19996 = getelementptr inbounds float* %tmp19995, i64 1
+  %tmp19997 = getelementptr inbounds float* %tmp19996, i64 1
+  %tmp19998 = getelementptr inbounds float* %tmp19997, i64 1
+  %tmp19999 = getelementptr inbounds float* %tmp19998, i64 1
+  %tmp20000 = getelementptr inbounds float* %tmp19999, i64 1
+  %tmp20001 = getelementptr inbounds float* %tmp20000, i64 1
+  %tmp20002 = getelementptr inbounds float* %tmp20001, i64 1
+  %tmp20003 = getelementptr inbounds float* %tmp20002, i64 1
+  %tmp20004 = getelementptr inbounds float* %tmp20003, i64 1
+  %tmp20005 = getelementptr inbounds float* %tmp20004, i64 1
+  %tmp20006 = getelementptr inbounds float* %tmp20005, i64 1
+  %tmp20007 = getelementptr inbounds float* %tmp20006, i64 1
+  %tmp20008 = getelementptr inbounds float* %tmp20007, i64 1
+  %tmp20009 = getelementptr inbounds float* %tmp20008, i64 1
+  %tmp20010 = getelementptr inbounds float* %tmp20009, i64 1
+  %tmp20011 = getelementptr inbounds float* %tmp20010, i64 1
+  %tmp20012 = getelementptr inbounds float* %tmp20011, i64 1
+  %tmp20013 = getelementptr inbounds float* %tmp20012, i64 1
+  %tmp20014 = getelementptr inbounds float* %tmp20013, i64 1
+  %tmp20015 = getelementptr inbounds float* %tmp20014, i64 1
+  %tmp20016 = getelementptr inbounds float* %tmp20015, i64 1
+  %tmp20017 = getelementptr inbounds float* %tmp20016, i64 1
+  %tmp20018 = getelementptr inbounds float* %tmp20017, i64 1
+  %tmp20019 = getelementptr inbounds float* %tmp20018, i64 1
+  %tmp20020 = getelementptr inbounds float* %tmp20019, i64 1
+  %tmp20021 = getelementptr inbounds float* %tmp20020, i64 1
+  %tmp20022 = getelementptr inbounds float* %tmp20021, i64 1
+  %tmp20023 = getelementptr inbounds float* %tmp20022, i64 1
+  %tmp20024 = getelementptr inbounds float* %tmp20023, i64 1
+  %tmp20025 = getelementptr inbounds float* %tmp20024, i64 1
+  %tmp20026 = getelementptr inbounds float* %tmp20025, i64 1
+  %tmp20027 = getelementptr inbounds float* %tmp20026, i64 1
+  %tmp20028 = getelementptr inbounds float* %tmp20027, i64 1
+  %tmp20029 = getelementptr inbounds float* %tmp20028, i64 1
+  %tmp20030 = getelementptr inbounds float* %tmp20029, i64 1
+  %tmp20031 = getelementptr inbounds float* %tmp20030, i64 1
+  %tmp20032 = getelementptr inbounds float* %tmp20031, i64 1
+  %tmp20033 = getelementptr inbounds float* %tmp20032, i64 1
+  %tmp20034 = getelementptr inbounds float* %tmp20033, i64 1
+  %tmp20035 = getelementptr inbounds float* %tmp20034, i64 1
+  %tmp20036 = getelementptr inbounds float* %tmp20035, i64 1
+  %tmp20037 = getelementptr inbounds float* %tmp20036, i64 1
+  %tmp20038 = getelementptr inbounds float* %tmp20037, i64 1
+  %tmp20039 = getelementptr inbounds float* %tmp20038, i64 1
+  %tmp20040 = getelementptr inbounds float* %tmp20039, i64 1
+  %tmp20041 = getelementptr inbounds float* %tmp20040, i64 1
+  %tmp20042 = getelementptr inbounds float* %tmp20041, i64 1
+  %tmp20043 = getelementptr inbounds float* %tmp20042, i64 1
+  %tmp20044 = getelementptr inbounds float* %tmp20043, i64 1
+  %tmp20045 = getelementptr inbounds float* %tmp20044, i64 1
+  %tmp20046 = getelementptr inbounds float* %tmp20045, i64 1
+  %tmp20047 = getelementptr inbounds float* %tmp20046, i64 1
+  %tmp20048 = getelementptr inbounds float* %tmp20047, i64 1
+  %tmp20049 = getelementptr inbounds float* %tmp20048, i64 1
+  %tmp20050 = getelementptr inbounds float* %tmp20049, i64 1
+  %tmp20051 = getelementptr inbounds float* %tmp20050, i64 1
+  %tmp20052 = getelementptr inbounds float* %tmp20051, i64 1
+  %tmp20053 = getelementptr inbounds float* %tmp20052, i64 1
+  %tmp20054 = getelementptr inbounds float* %tmp20053, i64 1
+  %tmp20055 = getelementptr inbounds float* %tmp20054, i64 1
+  %tmp20056 = getelementptr inbounds float* %tmp20055, i64 1
+  %tmp20057 = getelementptr inbounds float* %tmp20056, i64 1
+  %tmp20058 = getelementptr inbounds float* %tmp20057, i64 1
+  %tmp20059 = getelementptr inbounds float* %tmp20058, i64 1
+  %tmp20060 = getelementptr inbounds float* %tmp20059, i64 1
+  %tmp20061 = getelementptr inbounds float* %tmp20060, i64 1
+  %tmp20062 = getelementptr inbounds float* %tmp20061, i64 1
+  %tmp20063 = getelementptr inbounds float* %tmp20062, i64 1
+  %tmp20064 = getelementptr inbounds float* %tmp20063, i64 1
+  %tmp20065 = getelementptr inbounds float* %tmp20064, i64 1
+  %tmp20066 = getelementptr inbounds float* %tmp20065, i64 1
+  %tmp20067 = getelementptr inbounds float* %tmp20066, i64 1
+  %tmp20068 = getelementptr inbounds float* %tmp20067, i64 1
+  %tmp20069 = getelementptr inbounds float* %tmp20068, i64 1
+  %tmp20070 = getelementptr inbounds float* %tmp20069, i64 1
+  %tmp20071 = getelementptr inbounds float* %tmp20070, i64 1
+  %tmp20072 = getelementptr inbounds float* %tmp20071, i64 1
+  %tmp20073 = getelementptr inbounds float* %tmp20072, i64 1
+  %tmp20074 = getelementptr inbounds float* %tmp20073, i64 1
+  %tmp20075 = getelementptr inbounds float* %tmp20074, i64 1
+  %tmp20076 = getelementptr inbounds float* %tmp20075, i64 1
+  %tmp20077 = getelementptr inbounds float* %tmp20076, i64 1
+  %tmp20078 = getelementptr inbounds float* %tmp20077, i64 1
+  %tmp20079 = getelementptr inbounds float* %tmp20078, i64 1
+  %tmp20080 = getelementptr inbounds float* %tmp20079, i64 1
+  %tmp20081 = getelementptr inbounds float* %tmp20080, i64 1
+  %tmp20082 = getelementptr inbounds float* %tmp20081, i64 1
+  %tmp20083 = getelementptr inbounds float* %tmp20082, i64 1
+  %tmp20084 = getelementptr inbounds float* %tmp20083, i64 1
+  %tmp20085 = getelementptr inbounds float* %tmp20084, i64 1
+  %tmp20086 = getelementptr inbounds float* %tmp20085, i64 1
+  %tmp20087 = getelementptr inbounds float* %tmp20086, i64 1
+  %tmp20088 = getelementptr inbounds float* %tmp20087, i64 1
+  %tmp20089 = getelementptr inbounds float* %tmp20088, i64 1
+  %tmp20090 = getelementptr inbounds float* %tmp20089, i64 1
+  %tmp20091 = getelementptr inbounds float* %tmp20090, i64 1
+  %tmp20092 = getelementptr inbounds float* %tmp20091, i64 1
+  %tmp20093 = getelementptr inbounds float* %tmp20092, i64 1
+  %tmp20094 = getelementptr inbounds float* %tmp20093, i64 1
+  %tmp20095 = getelementptr inbounds float* %tmp20094, i64 1
+  %tmp20096 = getelementptr inbounds float* %tmp20095, i64 1
+  %tmp20097 = getelementptr inbounds float* %tmp20096, i64 1
+  %tmp20098 = getelementptr inbounds float* %tmp20097, i64 1
+  %tmp20099 = getelementptr inbounds float* %tmp20098, i64 1
+  %tmp20100 = getelementptr inbounds float* %tmp20099, i64 1
+  %tmp20101 = getelementptr inbounds float* %tmp20100, i64 1
+  %tmp20102 = getelementptr inbounds float* %tmp20101, i64 1
+  %tmp20103 = getelementptr inbounds float* %tmp20102, i64 1
+  %tmp20104 = getelementptr inbounds float* %tmp20103, i64 1
+  %tmp20105 = getelementptr inbounds float* %tmp20104, i64 1
+  %tmp20106 = getelementptr inbounds float* %tmp20105, i64 1
+  %tmp20107 = getelementptr inbounds float* %tmp20106, i64 1
+  %tmp20108 = getelementptr inbounds float* %tmp20107, i64 1
+  %tmp20109 = getelementptr inbounds float* %tmp20108, i64 1
+  %tmp20110 = getelementptr inbounds float* %tmp20109, i64 1
+  %tmp20111 = getelementptr inbounds float* %tmp20110, i64 1
+  %tmp20112 = getelementptr inbounds float* %tmp20111, i64 1
+  %tmp20113 = getelementptr inbounds float* %tmp20112, i64 1
+  %tmp20114 = getelementptr inbounds float* %tmp20113, i64 1
+  %tmp20115 = getelementptr inbounds float* %tmp20114, i64 1
+  %tmp20116 = getelementptr inbounds float* %tmp20115, i64 1
+  %tmp20117 = getelementptr inbounds float* %tmp20116, i64 1
+  %tmp20118 = getelementptr inbounds float* %tmp20117, i64 1
+  %tmp20119 = getelementptr inbounds float* %tmp20118, i64 1
+  %tmp20120 = getelementptr inbounds float* %tmp20119, i64 1
+  %tmp20121 = getelementptr inbounds float* %tmp20120, i64 1
+  %tmp20122 = getelementptr inbounds float* %tmp20121, i64 1
+  %tmp20123 = getelementptr inbounds float* %tmp20122, i64 1
+  %tmp20124 = getelementptr inbounds float* %tmp20123, i64 1
+  %tmp20125 = getelementptr inbounds float* %tmp20124, i64 1
+  %tmp20126 = getelementptr inbounds float* %tmp20125, i64 1
+  %tmp20127 = getelementptr inbounds float* %tmp20126, i64 1
+  %tmp20128 = getelementptr inbounds float* %tmp20127, i64 1
+  %tmp20129 = getelementptr inbounds float* %tmp20128, i64 1
+  %tmp20130 = getelementptr inbounds float* %tmp20129, i64 1
+  %tmp20131 = getelementptr inbounds float* %tmp20130, i64 1
+  %tmp20132 = getelementptr inbounds float* %tmp20131, i64 1
+  %tmp20133 = getelementptr inbounds float* %tmp20132, i64 1
+  %tmp20134 = getelementptr inbounds float* %tmp20133, i64 1
+  %tmp20135 = getelementptr inbounds float* %tmp20134, i64 1
+  %tmp20136 = getelementptr inbounds float* %tmp20135, i64 1
+  %tmp20137 = getelementptr inbounds float* %tmp20136, i64 1
+  %tmp20138 = getelementptr inbounds float* %tmp20137, i64 1
+  %tmp20139 = getelementptr inbounds float* %tmp20138, i64 1
+  %tmp20140 = getelementptr inbounds float* %tmp20139, i64 1
+  %tmp20141 = getelementptr inbounds float* %tmp20140, i64 1
+  %tmp20142 = getelementptr inbounds float* %tmp20141, i64 1
+  %tmp20143 = getelementptr inbounds float* %tmp20142, i64 1
+  %tmp20144 = getelementptr inbounds float* %tmp20143, i64 1
+  %tmp20145 = getelementptr inbounds float* %tmp20144, i64 1
+  %tmp20146 = getelementptr inbounds float* %tmp20145, i64 1
+  %tmp20147 = getelementptr inbounds float* %tmp20146, i64 1
+  %tmp20148 = getelementptr inbounds float* %tmp20147, i64 1
+  %tmp20149 = getelementptr inbounds float* %tmp20148, i64 1
+  %tmp20150 = getelementptr inbounds float* %tmp20149, i64 1
+  %tmp20151 = getelementptr inbounds float* %tmp20150, i64 1
+  %tmp20152 = getelementptr inbounds float* %tmp20151, i64 1
+  %tmp20153 = getelementptr inbounds float* %tmp20152, i64 1
+  %tmp20154 = getelementptr inbounds float* %tmp20153, i64 1
+  %tmp20155 = getelementptr inbounds float* %tmp20154, i64 1
+  %tmp20156 = getelementptr inbounds float* %tmp20155, i64 1
+  %tmp20157 = getelementptr inbounds float* %tmp20156, i64 1
+  %tmp20158 = getelementptr inbounds float* %tmp20157, i64 1
+  %tmp20159 = getelementptr inbounds float* %tmp20158, i64 1
+  %tmp20160 = getelementptr inbounds float* %tmp20159, i64 1
+  %tmp20161 = getelementptr inbounds float* %tmp20160, i64 1
+  %tmp20162 = getelementptr inbounds float* %tmp20161, i64 1
+  %tmp20163 = getelementptr inbounds float* %tmp20162, i64 1
+  %tmp20164 = getelementptr inbounds float* %tmp20163, i64 1
+  %tmp20165 = getelementptr inbounds float* %tmp20164, i64 1
+  %tmp20166 = getelementptr inbounds float* %tmp20165, i64 1
+  %tmp20167 = getelementptr inbounds float* %tmp20166, i64 1
+  %tmp20168 = getelementptr inbounds float* %tmp20167, i64 1
+  %tmp20169 = getelementptr inbounds float* %tmp20168, i64 1
+  %tmp20170 = getelementptr inbounds float* %tmp20169, i64 1
+  %tmp20171 = getelementptr inbounds float* %tmp20170, i64 1
+  %tmp20172 = getelementptr inbounds float* %tmp20171, i64 1
+  %tmp20173 = getelementptr inbounds float* %tmp20172, i64 1
+  %tmp20174 = getelementptr inbounds float* %tmp20173, i64 1
+  %tmp20175 = getelementptr inbounds float* %tmp20174, i64 1
+  %tmp20176 = getelementptr inbounds float* %tmp20175, i64 1
+  %tmp20177 = getelementptr inbounds float* %tmp20176, i64 1
+  %tmp20178 = getelementptr inbounds float* %tmp20177, i64 1
+  %tmp20179 = getelementptr inbounds float* %tmp20178, i64 1
+  %tmp20180 = getelementptr inbounds float* %tmp20179, i64 1
+  %tmp20181 = getelementptr inbounds float* %tmp20180, i64 1
+  %tmp20182 = getelementptr inbounds float* %tmp20181, i64 1
+  %tmp20183 = getelementptr inbounds float* %tmp20182, i64 1
+  %tmp20184 = getelementptr inbounds float* %tmp20183, i64 1
+  %tmp20185 = getelementptr inbounds float* %tmp20184, i64 1
+  %tmp20186 = getelementptr inbounds float* %tmp20185, i64 1
+  %tmp20187 = getelementptr inbounds float* %tmp20186, i64 1
+  %tmp20188 = getelementptr inbounds float* %tmp20187, i64 1
+  %tmp20189 = getelementptr inbounds float* %tmp20188, i64 1
+  %tmp20190 = getelementptr inbounds float* %tmp20189, i64 1
+  %tmp20191 = getelementptr inbounds float* %tmp20190, i64 1
+  %tmp20192 = getelementptr inbounds float* %tmp20191, i64 1
+  %tmp20193 = getelementptr inbounds float* %tmp20192, i64 1
+  %tmp20194 = getelementptr inbounds float* %tmp20193, i64 1
+  %tmp20195 = getelementptr inbounds float* %tmp20194, i64 1
+  %tmp20196 = getelementptr inbounds float* %tmp20195, i64 1
+  %tmp20197 = getelementptr inbounds float* %tmp20196, i64 1
+  %tmp20198 = getelementptr inbounds float* %tmp20197, i64 1
+  %tmp20199 = getelementptr inbounds float* %tmp20198, i64 1
+  %tmp20200 = getelementptr inbounds float* %tmp20199, i64 1
+  %tmp20201 = getelementptr inbounds float* %tmp20200, i64 1
+  %tmp20202 = getelementptr inbounds float* %tmp20201, i64 1
+  %tmp20203 = getelementptr inbounds float* %tmp20202, i64 1
+  %tmp20204 = getelementptr inbounds float* %tmp20203, i64 1
+  %tmp20205 = getelementptr inbounds float* %tmp20204, i64 1
+  %tmp20206 = getelementptr inbounds float* %tmp20205, i64 1
+  %tmp20207 = getelementptr inbounds float* %tmp20206, i64 1
+  %tmp20208 = getelementptr inbounds float* %tmp20207, i64 1
+  %tmp20209 = getelementptr inbounds float* %tmp20208, i64 1
+  %tmp20210 = getelementptr inbounds float* %tmp20209, i64 1
+  %tmp20211 = getelementptr inbounds float* %tmp20210, i64 1
+  %tmp20212 = getelementptr inbounds float* %tmp20211, i64 1
+  %tmp20213 = getelementptr inbounds float* %tmp20212, i64 1
+  %tmp20214 = getelementptr inbounds float* %tmp20213, i64 1
+  %tmp20215 = getelementptr inbounds float* %tmp20214, i64 1
+  %tmp20216 = getelementptr inbounds float* %tmp20215, i64 1
+  %tmp20217 = getelementptr inbounds float* %tmp20216, i64 1
+  %tmp20218 = getelementptr inbounds float* %tmp20217, i64 1
+  %tmp20219 = getelementptr inbounds float* %tmp20218, i64 1
+  %tmp20220 = getelementptr inbounds float* %tmp20219, i64 1
+  %tmp20221 = getelementptr inbounds float* %tmp20220, i64 1
+  %tmp20222 = getelementptr inbounds float* %tmp20221, i64 1
+  %tmp20223 = getelementptr inbounds float* %tmp20222, i64 1
+  %tmp20224 = getelementptr inbounds float* %tmp20223, i64 1
+  %tmp20225 = getelementptr inbounds float* %tmp20224, i64 1
+  %tmp20226 = getelementptr inbounds float* %tmp20225, i64 1
+  %tmp20227 = getelementptr inbounds float* %tmp20226, i64 1
+  %tmp20228 = getelementptr inbounds float* %tmp20227, i64 1
+  %tmp20229 = getelementptr inbounds float* %tmp20228, i64 1
+  %tmp20230 = getelementptr inbounds float* %tmp20229, i64 1
+  %tmp20231 = getelementptr inbounds float* %tmp20230, i64 1
+  %tmp20232 = getelementptr inbounds float* %tmp20231, i64 1
+  %tmp20233 = getelementptr inbounds float* %tmp20232, i64 1
+  %tmp20234 = getelementptr inbounds float* %tmp20233, i64 1
+  %tmp20235 = getelementptr inbounds float* %tmp20234, i64 1
+  %tmp20236 = getelementptr inbounds float* %tmp20235, i64 1
+  %tmp20237 = getelementptr inbounds float* %tmp20236, i64 1
+  %tmp20238 = getelementptr inbounds float* %tmp20237, i64 1
+  %tmp20239 = getelementptr inbounds float* %tmp20238, i64 1
+  %tmp20240 = getelementptr inbounds float* %tmp20239, i64 1
+  %tmp20241 = getelementptr inbounds float* %tmp20240, i64 1
+  %tmp20242 = getelementptr inbounds float* %tmp20241, i64 1
+  %tmp20243 = getelementptr inbounds float* %tmp20242, i64 1
+  %tmp20244 = getelementptr inbounds float* %tmp20243, i64 1
+  %tmp20245 = getelementptr inbounds float* %tmp20244, i64 1
+  %tmp20246 = getelementptr inbounds float* %tmp20245, i64 1
+  %tmp20247 = getelementptr inbounds float* %tmp20246, i64 1
+  %tmp20248 = getelementptr inbounds float* %tmp20247, i64 1
+  %tmp20249 = getelementptr inbounds float* %tmp20248, i64 1
+  %tmp20250 = getelementptr inbounds float* %tmp20249, i64 1
+  %tmp20251 = getelementptr inbounds float* %tmp20250, i64 1
+  %tmp20252 = getelementptr inbounds float* %tmp20251, i64 1
+  %tmp20253 = getelementptr inbounds float* %tmp20252, i64 1
+  %tmp20254 = getelementptr inbounds float* %tmp20253, i64 1
+  %tmp20255 = getelementptr inbounds float* %tmp20254, i64 1
+  %tmp20256 = getelementptr inbounds float* %tmp20255, i64 1
+  %tmp20257 = getelementptr inbounds float* %tmp20256, i64 1
+  %tmp20258 = getelementptr inbounds float* %tmp20257, i64 1
+  %tmp20259 = getelementptr inbounds float* %tmp20258, i64 1
+  %tmp20260 = getelementptr inbounds float* %tmp20259, i64 1
+  %tmp20261 = getelementptr inbounds float* %tmp20260, i64 1
+  %tmp20262 = getelementptr inbounds float* %tmp20261, i64 1
+  %tmp20263 = getelementptr inbounds float* %tmp20262, i64 1
+  %tmp20264 = getelementptr inbounds float* %tmp20263, i64 1
+  %tmp20265 = getelementptr inbounds float* %tmp20264, i64 1
+  %tmp20266 = getelementptr inbounds float* %tmp20265, i64 1
+  %tmp20267 = getelementptr inbounds float* %tmp20266, i64 1
+  %tmp20268 = getelementptr inbounds float* %tmp20267, i64 1
+  %tmp20269 = getelementptr inbounds float* %tmp20268, i64 1
+  %tmp20270 = getelementptr inbounds float* %tmp20269, i64 1
+  %tmp20271 = getelementptr inbounds float* %tmp20270, i64 1
+  %tmp20272 = getelementptr inbounds float* %tmp20271, i64 1
+  %tmp20273 = getelementptr inbounds float* %tmp20272, i64 1
+  %tmp20274 = getelementptr inbounds float* %tmp20273, i64 1
+  %tmp20275 = getelementptr inbounds float* %tmp20274, i64 1
+  %tmp20276 = getelementptr inbounds float* %tmp20275, i64 1
+  %tmp20277 = getelementptr inbounds float* %tmp20276, i64 1
+  %tmp20278 = getelementptr inbounds float* %tmp20277, i64 1
+  %tmp20279 = getelementptr inbounds float* %tmp20278, i64 1
+  %tmp20280 = getelementptr inbounds float* %tmp20279, i64 1
+  %tmp20281 = getelementptr inbounds float* %tmp20280, i64 1
+  %tmp20282 = getelementptr inbounds float* %tmp20281, i64 1
+  %tmp20283 = getelementptr inbounds float* %tmp20282, i64 1
+  %tmp20284 = getelementptr inbounds float* %tmp20283, i64 1
+  %tmp20285 = getelementptr inbounds float* %tmp20284, i64 1
+  %tmp20286 = getelementptr inbounds float* %tmp20285, i64 1
+  %tmp20287 = getelementptr inbounds float* %tmp20286, i64 1
+  %tmp20288 = getelementptr inbounds float* %tmp20287, i64 1
+  %tmp20289 = getelementptr inbounds float* %tmp20288, i64 1
+  %tmp20290 = getelementptr inbounds float* %tmp20289, i64 1
+  %tmp20291 = getelementptr inbounds float* %tmp20290, i64 1
+  %tmp20292 = getelementptr inbounds float* %tmp20291, i64 1
+  %tmp20293 = getelementptr inbounds float* %tmp20292, i64 1
+  %tmp20294 = getelementptr inbounds float* %tmp20293, i64 1
+  %tmp20295 = getelementptr inbounds float* %tmp20294, i64 1
+  %tmp20296 = getelementptr inbounds float* %tmp20295, i64 1
+  %tmp20297 = getelementptr inbounds float* %tmp20296, i64 1
+  %tmp20298 = getelementptr inbounds float* %tmp20297, i64 1
+  %tmp20299 = getelementptr inbounds float* %tmp20298, i64 1
+  %tmp20300 = getelementptr inbounds float* %tmp20299, i64 1
+  %tmp20301 = getelementptr inbounds float* %tmp20300, i64 1
+  %tmp20302 = getelementptr inbounds float* %tmp20301, i64 1
+  %tmp20303 = getelementptr inbounds float* %tmp20302, i64 1
+  %tmp20304 = getelementptr inbounds float* %tmp20303, i64 1
+  %tmp20305 = getelementptr inbounds float* %tmp20304, i64 1
+  %tmp20306 = getelementptr inbounds float* %tmp20305, i64 1
+  %tmp20307 = getelementptr inbounds float* %tmp20306, i64 1
+  %tmp20308 = getelementptr inbounds float* %tmp20307, i64 1
+  %tmp20309 = getelementptr inbounds float* %tmp20308, i64 1
+  %tmp20310 = getelementptr inbounds float* %tmp20309, i64 1
+  %tmp20311 = getelementptr inbounds float* %tmp20310, i64 1
+  %tmp20312 = getelementptr inbounds float* %tmp20311, i64 1
+  %tmp20313 = getelementptr inbounds float* %tmp20312, i64 1
+  %tmp20314 = getelementptr inbounds float* %tmp20313, i64 1
+  %tmp20315 = getelementptr inbounds float* %tmp20314, i64 1
+  %tmp20316 = getelementptr inbounds float* %tmp20315, i64 1
+  %tmp20317 = getelementptr inbounds float* %tmp20316, i64 1
+  %tmp20318 = getelementptr inbounds float* %tmp20317, i64 1
+  %tmp20319 = getelementptr inbounds float* %tmp20318, i64 1
+  %tmp20320 = getelementptr inbounds float* %tmp20319, i64 1
+  %tmp20321 = getelementptr inbounds float* %tmp20320, i64 1
+  %tmp20322 = getelementptr inbounds float* %tmp20321, i64 1
+  %tmp20323 = getelementptr inbounds float* %tmp20322, i64 1
+  %tmp20324 = getelementptr inbounds float* %tmp20323, i64 1
+  %tmp20325 = getelementptr inbounds float* %tmp20324, i64 1
+  %tmp20326 = getelementptr inbounds float* %tmp20325, i64 1
+  %tmp20327 = getelementptr inbounds float* %tmp20326, i64 1
+  %tmp20328 = getelementptr inbounds float* %tmp20327, i64 1
+  %tmp20329 = getelementptr inbounds float* %tmp20328, i64 1
+  %tmp20330 = getelementptr inbounds float* %tmp20329, i64 1
+  %tmp20331 = getelementptr inbounds float* %tmp20330, i64 1
+  %tmp20332 = getelementptr inbounds float* %tmp20331, i64 1
+  %tmp20333 = getelementptr inbounds float* %tmp20332, i64 1
+  %tmp20334 = getelementptr inbounds float* %tmp20333, i64 1
+  %tmp20335 = getelementptr inbounds float* %tmp20334, i64 1
+  %tmp20336 = getelementptr inbounds float* %tmp20335, i64 1
+  %tmp20337 = getelementptr inbounds float* %tmp20336, i64 1
+  %tmp20338 = getelementptr inbounds float* %tmp20337, i64 1
+  %tmp20339 = getelementptr inbounds float* %tmp20338, i64 1
+  %tmp20340 = getelementptr inbounds float* %tmp20339, i64 1
+  %tmp20341 = getelementptr inbounds float* %tmp20340, i64 1
+  %tmp20342 = getelementptr inbounds float* %tmp20341, i64 1
+  %tmp20343 = getelementptr inbounds float* %tmp20342, i64 1
+  %tmp20344 = getelementptr inbounds float* %tmp20343, i64 1
+  %tmp20345 = getelementptr inbounds float* %tmp20344, i64 1
+  %tmp20346 = getelementptr inbounds float* %tmp20345, i64 1
+  %tmp20347 = getelementptr inbounds float* %tmp20346, i64 1
+  %tmp20348 = getelementptr inbounds float* %tmp20347, i64 1
+  %tmp20349 = getelementptr inbounds float* %tmp20348, i64 1
+  %tmp20350 = getelementptr inbounds float* %tmp20349, i64 1
+  %tmp20351 = getelementptr inbounds float* %tmp20350, i64 1
+  %tmp20352 = getelementptr inbounds float* %tmp20351, i64 1
+  %tmp20353 = getelementptr inbounds float* %tmp20352, i64 1
+  %tmp20354 = getelementptr inbounds float* %tmp20353, i64 1
+  %tmp20355 = getelementptr inbounds float* %tmp20354, i64 1
+  %tmp20356 = getelementptr inbounds float* %tmp20355, i64 1
+  %tmp20357 = getelementptr inbounds float* %tmp20356, i64 1
+  %tmp20358 = getelementptr inbounds float* %tmp20357, i64 1
+  %tmp20359 = getelementptr inbounds float* %tmp20358, i64 1
+  %tmp20360 = getelementptr inbounds float* %tmp20359, i64 1
+  %tmp20361 = getelementptr inbounds float* %tmp20360, i64 1
+  %tmp20362 = getelementptr inbounds float* %tmp20361, i64 1
+  %tmp20363 = getelementptr inbounds float* %tmp20362, i64 1
+  %tmp20364 = getelementptr inbounds float* %tmp20363, i64 1
+  %tmp20365 = getelementptr inbounds float* %tmp20364, i64 1
+  %tmp20366 = getelementptr inbounds float* %tmp20365, i64 1
+  %tmp20367 = getelementptr inbounds float* %tmp20366, i64 1
+  %tmp20368 = getelementptr inbounds float* %tmp20367, i64 1
+  %tmp20369 = getelementptr inbounds float* %tmp20368, i64 1
+  %tmp20370 = getelementptr inbounds float* %tmp20369, i64 1
+  %tmp20371 = getelementptr inbounds float* %tmp20370, i64 1
+  %tmp20372 = getelementptr inbounds float* %tmp20371, i64 1
+  %tmp20373 = getelementptr inbounds float* %tmp20372, i64 1
+  %tmp20374 = getelementptr inbounds float* %tmp20373, i64 1
+  %tmp20375 = getelementptr inbounds float* %tmp20374, i64 1
+  %tmp20376 = getelementptr inbounds float* %tmp20375, i64 1
+  %tmp20377 = getelementptr inbounds float* %tmp20376, i64 1
+  %tmp20378 = getelementptr inbounds float* %tmp20377, i64 1
+  %tmp20379 = getelementptr inbounds float* %tmp20378, i64 1
+  %tmp20380 = getelementptr inbounds float* %tmp20379, i64 1
+  %tmp20381 = getelementptr inbounds float* %tmp20380, i64 1
+  %tmp20382 = getelementptr inbounds float* %tmp20381, i64 1
+  %tmp20383 = getelementptr inbounds float* %tmp20382, i64 1
+  %tmp20384 = getelementptr inbounds float* %tmp20383, i64 1
+  %tmp20385 = getelementptr inbounds float* %tmp20384, i64 1
+  %tmp20386 = getelementptr inbounds float* %tmp20385, i64 1
+  %tmp20387 = getelementptr inbounds float* %tmp20386, i64 1
+  %tmp20388 = getelementptr inbounds float* %tmp20387, i64 1
+  %tmp20389 = getelementptr inbounds float* %tmp20388, i64 1
+  %tmp20390 = getelementptr inbounds float* %tmp20389, i64 1
+  %tmp20391 = getelementptr inbounds float* %tmp20390, i64 1
+  %tmp20392 = getelementptr inbounds float* %tmp20391, i64 1
+  %tmp20393 = getelementptr inbounds float* %tmp20392, i64 1
+  %tmp20394 = getelementptr inbounds float* %tmp20393, i64 1
+  %tmp20395 = getelementptr inbounds float* %tmp20394, i64 1
+  %tmp20396 = getelementptr inbounds float* %tmp20395, i64 1
+  %tmp20397 = getelementptr inbounds float* %tmp20396, i64 1
+  %tmp20398 = getelementptr inbounds float* %tmp20397, i64 1
+  %tmp20399 = getelementptr inbounds float* %tmp20398, i64 1
+  %tmp20400 = getelementptr inbounds float* %tmp20399, i64 1
+  %tmp20401 = getelementptr inbounds float* %tmp20400, i64 1
+  %tmp20402 = getelementptr inbounds float* %tmp20401, i64 1
+  %tmp20403 = getelementptr inbounds float* %tmp20402, i64 1
+  %tmp20404 = getelementptr inbounds float* %tmp20403, i64 1
+  %tmp20405 = getelementptr inbounds float* %tmp20404, i64 1
+  %tmp20406 = getelementptr inbounds float* %tmp20405, i64 1
+  %tmp20407 = getelementptr inbounds float* %tmp20406, i64 1
+  %tmp20408 = getelementptr inbounds float* %tmp20407, i64 1
+  %tmp20409 = getelementptr inbounds float* %tmp20408, i64 1
+  %tmp20410 = getelementptr inbounds float* %tmp20409, i64 1
+  %tmp20411 = getelementptr inbounds float* %tmp20410, i64 1
+  %tmp20412 = getelementptr inbounds float* %tmp20411, i64 1
+  %tmp20413 = getelementptr inbounds float* %tmp20412, i64 1
+  %tmp20414 = getelementptr inbounds float* %tmp20413, i64 1
+  %tmp20415 = getelementptr inbounds float* %tmp20414, i64 1
+  %tmp20416 = getelementptr inbounds float* %tmp20415, i64 1
+  %tmp20417 = getelementptr inbounds float* %tmp20416, i64 1
+  %tmp20418 = getelementptr inbounds float* %tmp20417, i64 1
+  %tmp20419 = getelementptr inbounds float* %tmp20418, i64 1
+  %tmp20420 = getelementptr inbounds float* %tmp20419, i64 1
+  %tmp20421 = getelementptr inbounds float* %tmp20420, i64 1
+  %tmp20422 = getelementptr inbounds float* %tmp20421, i64 1
+  %tmp20423 = getelementptr inbounds float* %tmp20422, i64 1
+  %tmp20424 = getelementptr inbounds float* %tmp20423, i64 1
+  %tmp20425 = getelementptr inbounds float* %tmp20424, i64 1
+  %tmp20426 = getelementptr inbounds float* %tmp20425, i64 1
+  %tmp20427 = getelementptr inbounds float* %tmp20426, i64 1
+  %tmp20428 = getelementptr inbounds float* %tmp20427, i64 1
+  %tmp20429 = getelementptr inbounds float* %tmp20428, i64 1
+  %tmp20430 = getelementptr inbounds float* %tmp20429, i64 1
+  %tmp20431 = getelementptr inbounds float* %tmp20430, i64 1
+  %tmp20432 = getelementptr inbounds float* %tmp20431, i64 1
+  %tmp20433 = getelementptr inbounds float* %tmp20432, i64 1
+  %tmp20434 = getelementptr inbounds float* %tmp20433, i64 1
+  %tmp20435 = getelementptr inbounds float* %tmp20434, i64 1
+  %tmp20436 = getelementptr inbounds float* %tmp20435, i64 1
+  %tmp20437 = getelementptr inbounds float* %tmp20436, i64 1
+  %tmp20438 = getelementptr inbounds float* %tmp20437, i64 1
+  %tmp20439 = getelementptr inbounds float* %tmp20438, i64 1
+  %tmp20440 = getelementptr inbounds float* %tmp20439, i64 1
+  %tmp20441 = getelementptr inbounds float* %tmp20440, i64 1
+  %tmp20442 = getelementptr inbounds float* %tmp20441, i64 1
+  %tmp20443 = getelementptr inbounds float* %tmp20442, i64 1
+  %tmp20444 = getelementptr inbounds float* %tmp20443, i64 1
+  %tmp20445 = getelementptr inbounds float* %tmp20444, i64 1
+  %tmp20446 = getelementptr inbounds float* %tmp20445, i64 1
+  %tmp20447 = getelementptr inbounds float* %tmp20446, i64 1
+  %tmp20448 = getelementptr inbounds float* %tmp20447, i64 1
+  %tmp20449 = getelementptr inbounds float* %tmp20448, i64 1
+  %tmp20450 = getelementptr inbounds float* %tmp20449, i64 1
+  %tmp20451 = getelementptr inbounds float* %tmp20450, i64 1
+  %tmp20452 = getelementptr inbounds float* %tmp20451, i64 1
+  %tmp20453 = getelementptr inbounds float* %tmp20452, i64 1
+  %tmp20454 = getelementptr inbounds float* %tmp20453, i64 1
+  %tmp20455 = getelementptr inbounds float* %tmp20454, i64 1
+  %tmp20456 = getelementptr inbounds float* %tmp20455, i64 1
+  %tmp20457 = getelementptr inbounds float* %tmp20456, i64 1
+  %tmp20458 = getelementptr inbounds float* %tmp20457, i64 1
+  %tmp20459 = getelementptr inbounds float* %tmp20458, i64 1
+  %tmp20460 = getelementptr inbounds float* %tmp20459, i64 1
+  %tmp20461 = getelementptr inbounds float* %tmp20460, i64 1
+  %tmp20462 = getelementptr inbounds float* %tmp20461, i64 1
+  %tmp20463 = getelementptr inbounds float* %tmp20462, i64 1
+  %tmp20464 = getelementptr inbounds float* %tmp20463, i64 1
+  %tmp20465 = getelementptr inbounds float* %tmp20464, i64 1
+  %tmp20466 = getelementptr inbounds float* %tmp20465, i64 1
+  %tmp20467 = getelementptr inbounds float* %tmp20466, i64 1
+  %tmp20468 = getelementptr inbounds float* %tmp20467, i64 1
+  %tmp20469 = getelementptr inbounds float* %tmp20468, i64 1
+  %tmp20470 = getelementptr inbounds float* %tmp20469, i64 1
+  %tmp20471 = getelementptr inbounds float* %tmp20470, i64 1
+  %tmp20472 = getelementptr inbounds float* %tmp20471, i64 1
+  %tmp20473 = getelementptr inbounds float* %tmp20472, i64 1
+  %tmp20474 = getelementptr inbounds float* %tmp20473, i64 1
+  %tmp20475 = getelementptr inbounds float* %tmp20474, i64 1
+  %tmp20476 = getelementptr inbounds float* %tmp20475, i64 1
+  %tmp20477 = getelementptr inbounds float* %tmp20476, i64 1
+  %tmp20478 = getelementptr inbounds float* %tmp20477, i64 1
+  %tmp20479 = getelementptr inbounds float* %tmp20478, i64 1
+  %tmp20480 = getelementptr inbounds float* %tmp20479, i64 1
+  %tmp20481 = getelementptr inbounds float* %tmp20480, i64 1
+  %tmp20482 = getelementptr inbounds float* %tmp20481, i64 1
+  %tmp20483 = getelementptr inbounds float* %tmp20482, i64 1
+  %tmp20484 = getelementptr inbounds float* %tmp20483, i64 1
+  %tmp20485 = getelementptr inbounds float* %tmp20484, i64 1
+  %tmp20486 = getelementptr inbounds float* %tmp20485, i64 1
+  %tmp20487 = getelementptr inbounds float* %tmp20486, i64 1
+  %tmp20488 = getelementptr inbounds float* %tmp20487, i64 1
+  %tmp20489 = getelementptr inbounds float* %tmp20488, i64 1
+  %tmp20490 = getelementptr inbounds float* %tmp20489, i64 1
+  %tmp20491 = getelementptr inbounds float* %tmp20490, i64 1
+  %tmp20492 = getelementptr inbounds float* %tmp20491, i64 1
+  %tmp20493 = getelementptr inbounds float* %tmp20492, i64 1
+  %tmp20494 = getelementptr inbounds float* %tmp20493, i64 1
+  %tmp20495 = getelementptr inbounds float* %tmp20494, i64 1
+  %tmp20496 = getelementptr inbounds float* %tmp20495, i64 1
+  %tmp20497 = getelementptr inbounds float* %tmp20496, i64 1
+  %tmp20498 = getelementptr inbounds float* %tmp20497, i64 1
+  %tmp20499 = getelementptr inbounds float* %tmp20498, i64 1
+  %tmp20500 = getelementptr inbounds float* %tmp20499, i64 1
+  %tmp20501 = getelementptr inbounds float* %tmp20500, i64 1
+  %tmp20502 = getelementptr inbounds float* %tmp20501, i64 1
+  %tmp20503 = getelementptr inbounds float* %tmp20502, i64 1
+  %tmp20504 = getelementptr inbounds float* %tmp20503, i64 1
+  %tmp20505 = getelementptr inbounds float* %tmp20504, i64 1
+  %tmp20506 = getelementptr inbounds float* %tmp20505, i64 1
+  %tmp20507 = getelementptr inbounds float* %tmp20506, i64 1
+  %tmp20508 = getelementptr inbounds float* %tmp20507, i64 1
+  %tmp20509 = getelementptr inbounds float* %tmp20508, i64 1
+  %tmp20510 = getelementptr inbounds float* %tmp20509, i64 1
+  %tmp20511 = getelementptr inbounds float* %tmp20510, i64 1
+  %tmp20512 = getelementptr inbounds float* %tmp20511, i64 1
+  %tmp20513 = getelementptr inbounds float* %tmp20512, i64 1
+  %tmp20514 = getelementptr inbounds float* %tmp20513, i64 1
+  %tmp20515 = getelementptr inbounds float* %tmp20514, i64 1
+  %tmp20516 = getelementptr inbounds float* %tmp20515, i64 1
+  %tmp20517 = getelementptr inbounds float* %tmp20516, i64 1
+  %tmp20518 = getelementptr inbounds float* %tmp20517, i64 1
+  %tmp20519 = getelementptr inbounds float* %tmp20518, i64 1
+  %tmp20520 = getelementptr inbounds float* %tmp20519, i64 1
+  %tmp20521 = getelementptr inbounds float* %tmp20520, i64 1
+  %tmp20522 = getelementptr inbounds float* %tmp20521, i64 1
+  %tmp20523 = getelementptr inbounds float* %tmp20522, i64 1
+  %tmp20524 = getelementptr inbounds float* %tmp20523, i64 1
+  %tmp20525 = getelementptr inbounds float* %tmp20524, i64 1
+  %tmp20526 = getelementptr inbounds float* %tmp20525, i64 1
+  %tmp20527 = getelementptr inbounds float* %tmp20526, i64 1
+  %tmp20528 = getelementptr inbounds float* %tmp20527, i64 1
+  %tmp20529 = getelementptr inbounds float* %tmp20528, i64 1
+  %tmp20530 = getelementptr inbounds float* %tmp20529, i64 1
+  %tmp20531 = getelementptr inbounds float* %tmp20530, i64 1
+  %tmp20532 = getelementptr inbounds float* %tmp20531, i64 1
+  %tmp20533 = getelementptr inbounds float* %tmp20532, i64 1
+  %tmp20534 = getelementptr inbounds float* %tmp20533, i64 1
+  %tmp20535 = getelementptr inbounds float* %tmp20534, i64 1
+  %tmp20536 = getelementptr inbounds float* %tmp20535, i64 1
+  %tmp20537 = getelementptr inbounds float* %tmp20536, i64 1
+  %tmp20538 = getelementptr inbounds float* %tmp20537, i64 1
+  %tmp20539 = getelementptr inbounds float* %tmp20538, i64 1
+  %tmp20540 = getelementptr inbounds float* %tmp20539, i64 1
+  %tmp20541 = getelementptr inbounds float* %tmp20540, i64 1
+  %tmp20542 = getelementptr inbounds float* %tmp20541, i64 1
+  %tmp20543 = getelementptr inbounds float* %tmp20542, i64 1
+  %tmp20544 = getelementptr inbounds float* %tmp20543, i64 1
+  %tmp20545 = getelementptr inbounds float* %tmp20544, i64 1
+  %tmp20546 = getelementptr inbounds float* %tmp20545, i64 1
+  %tmp20547 = getelementptr inbounds float* %tmp20546, i64 1
+  %tmp20548 = getelementptr inbounds float* %tmp20547, i64 1
+  %tmp20549 = getelementptr inbounds float* %tmp20548, i64 1
+  %tmp20550 = getelementptr inbounds float* %tmp20549, i64 1
+  %tmp20551 = getelementptr inbounds float* %tmp20550, i64 1
+  %tmp20552 = getelementptr inbounds float* %tmp20551, i64 1
+  %tmp20553 = getelementptr inbounds float* %tmp20552, i64 1
+  %tmp20554 = getelementptr inbounds float* %tmp20553, i64 1
+  %tmp20555 = getelementptr inbounds float* %tmp20554, i64 1
+  %tmp20556 = getelementptr inbounds float* %tmp20555, i64 1
+  %tmp20557 = getelementptr inbounds float* %tmp20556, i64 1
+  %tmp20558 = getelementptr inbounds float* %tmp20557, i64 1
+  %tmp20559 = getelementptr inbounds float* %tmp20558, i64 1
+  %tmp20560 = getelementptr inbounds float* %tmp20559, i64 1
+  %tmp20561 = getelementptr inbounds float* %tmp20560, i64 1
+  %tmp20562 = getelementptr inbounds float* %tmp20561, i64 1
+  %tmp20563 = getelementptr inbounds float* %tmp20562, i64 1
+  %tmp20564 = getelementptr inbounds float* %tmp20563, i64 1
+  %tmp20565 = getelementptr inbounds float* %tmp20564, i64 1
+  %tmp20566 = getelementptr inbounds float* %tmp20565, i64 1
+  %tmp20567 = getelementptr inbounds float* %tmp20566, i64 1
+  %tmp20568 = getelementptr inbounds float* %tmp20567, i64 1
+  %tmp20569 = getelementptr inbounds float* %tmp20568, i64 1
+  %tmp20570 = getelementptr inbounds float* %tmp20569, i64 1
+  %tmp20571 = getelementptr inbounds float* %tmp20570, i64 1
+  %tmp20572 = getelementptr inbounds float* %tmp20571, i64 1
+  %tmp20573 = getelementptr inbounds float* %tmp20572, i64 1
+  %tmp20574 = getelementptr inbounds float* %tmp20573, i64 1
+  %tmp20575 = getelementptr inbounds float* %tmp20574, i64 1
+  %tmp20576 = getelementptr inbounds float* %tmp20575, i64 1
+  %tmp20577 = getelementptr inbounds float* %tmp20576, i64 1
+  %tmp20578 = getelementptr inbounds float* %tmp20577, i64 1
+  %tmp20579 = getelementptr inbounds float* %tmp20578, i64 1
+  %tmp20580 = getelementptr inbounds float* %tmp20579, i64 1
+  %tmp20581 = getelementptr inbounds float* %tmp20580, i64 1
+  %tmp20582 = getelementptr inbounds float* %tmp20581, i64 1
+  %tmp20583 = getelementptr inbounds float* %tmp20582, i64 1
+  %tmp20584 = getelementptr inbounds float* %tmp20583, i64 1
+  %tmp20585 = getelementptr inbounds float* %tmp20584, i64 1
+  %tmp20586 = getelementptr inbounds float* %tmp20585, i64 1
+  %tmp20587 = getelementptr inbounds float* %tmp20586, i64 1
+  %tmp20588 = getelementptr inbounds float* %tmp20587, i64 1
+  %tmp20589 = getelementptr inbounds float* %tmp20588, i64 1
+  %tmp20590 = getelementptr inbounds float* %tmp20589, i64 1
+  %tmp20591 = getelementptr inbounds float* %tmp20590, i64 1
+  %tmp20592 = getelementptr inbounds float* %tmp20591, i64 1
+  %tmp20593 = getelementptr inbounds float* %tmp20592, i64 1
+  %tmp20594 = getelementptr inbounds float* %tmp20593, i64 1
+  %tmp20595 = getelementptr inbounds float* %tmp20594, i64 1
+  %tmp20596 = getelementptr inbounds float* %tmp20595, i64 1
+  %tmp20597 = getelementptr inbounds float* %tmp20596, i64 1
+  %tmp20598 = getelementptr inbounds float* %tmp20597, i64 1
+  %tmp20599 = getelementptr inbounds float* %tmp20598, i64 1
+  %tmp20600 = getelementptr inbounds float* %tmp20599, i64 1
+  %tmp20601 = getelementptr inbounds float* %tmp20600, i64 1
+  %tmp20602 = getelementptr inbounds float* %tmp20601, i64 1
+  %tmp20603 = getelementptr inbounds float* %tmp20602, i64 1
+  %tmp20604 = getelementptr inbounds float* %tmp20603, i64 1
+  %tmp20605 = getelementptr inbounds float* %tmp20604, i64 1
+  %tmp20606 = getelementptr inbounds float* %tmp20605, i64 1
+  %tmp20607 = getelementptr inbounds float* %tmp20606, i64 1
+  %tmp20608 = getelementptr inbounds float* %tmp20607, i64 1
+  %tmp20609 = getelementptr inbounds float* %tmp20608, i64 1
+  %tmp20610 = getelementptr inbounds float* %tmp20609, i64 1
+  %tmp20611 = getelementptr inbounds float* %tmp20610, i64 1
+  %tmp20612 = getelementptr inbounds float* %tmp20611, i64 1
+  %tmp20613 = getelementptr inbounds float* %tmp20612, i64 1
+  %tmp20614 = getelementptr inbounds float* %tmp20613, i64 1
+  %tmp20615 = getelementptr inbounds float* %tmp20614, i64 1
+  %tmp20616 = getelementptr inbounds float* %tmp20615, i64 1
+  %tmp20617 = getelementptr inbounds float* %tmp20616, i64 1
+  %tmp20618 = getelementptr inbounds float* %tmp20617, i64 1
+  %tmp20619 = getelementptr inbounds float* %tmp20618, i64 1
+  %tmp20620 = getelementptr inbounds float* %tmp20619, i64 1
+  %tmp20621 = getelementptr inbounds float* %tmp20620, i64 1
+  %tmp20622 = getelementptr inbounds float* %tmp20621, i64 1
+  %tmp20623 = getelementptr inbounds float* %tmp20622, i64 1
+  %tmp20624 = getelementptr inbounds float* %tmp20623, i64 1
+  %tmp20625 = getelementptr inbounds float* %tmp20624, i64 1
+  %tmp20626 = getelementptr inbounds float* %tmp20625, i64 1
+  %tmp20627 = getelementptr inbounds float* %tmp20626, i64 1
+  %tmp20628 = getelementptr inbounds float* %tmp20627, i64 1
+  %tmp20629 = getelementptr inbounds float* %tmp20628, i64 1
+  %tmp20630 = getelementptr inbounds float* %tmp20629, i64 1
+  %tmp20631 = getelementptr inbounds float* %tmp20630, i64 1
+  %tmp20632 = getelementptr inbounds float* %tmp20631, i64 1
+  %tmp20633 = getelementptr inbounds float* %tmp20632, i64 1
+  %tmp20634 = getelementptr inbounds float* %tmp20633, i64 1
+  %tmp20635 = getelementptr inbounds float* %tmp20634, i64 1
+  %tmp20636 = getelementptr inbounds float* %tmp20635, i64 1
+  %tmp20637 = getelementptr inbounds float* %tmp20636, i64 1
+  %tmp20638 = getelementptr inbounds float* %tmp20637, i64 1
+  %tmp20639 = getelementptr inbounds float* %tmp20638, i64 1
+  %tmp20640 = getelementptr inbounds float* %tmp20639, i64 1
+  %tmp20641 = getelementptr inbounds float* %tmp20640, i64 1
+  %tmp20642 = getelementptr inbounds float* %tmp20641, i64 1
+  %tmp20643 = getelementptr inbounds float* %tmp20642, i64 1
+  %tmp20644 = getelementptr inbounds float* %tmp20643, i64 1
+  %tmp20645 = getelementptr inbounds float* %tmp20644, i64 1
+  %tmp20646 = getelementptr inbounds float* %tmp20645, i64 1
+  %tmp20647 = getelementptr inbounds float* %tmp20646, i64 1
+  %tmp20648 = getelementptr inbounds float* %tmp20647, i64 1
+  %tmp20649 = getelementptr inbounds float* %tmp20648, i64 1
+  %tmp20650 = getelementptr inbounds float* %tmp20649, i64 1
+  %tmp20651 = getelementptr inbounds float* %tmp20650, i64 1
+  %tmp20652 = getelementptr inbounds float* %tmp20651, i64 1
+  %tmp20653 = getelementptr inbounds float* %tmp20652, i64 1
+  %tmp20654 = getelementptr inbounds float* %tmp20653, i64 1
+  %tmp20655 = getelementptr inbounds float* %tmp20654, i64 1
+  %tmp20656 = getelementptr inbounds float* %tmp20655, i64 1
+  %tmp20657 = getelementptr inbounds float* %tmp20656, i64 1
+  %tmp20658 = getelementptr inbounds float* %tmp20657, i64 1
+  %tmp20659 = getelementptr inbounds float* %tmp20658, i64 1
+  %tmp20660 = getelementptr inbounds float* %tmp20659, i64 1
+  %tmp20661 = getelementptr inbounds float* %tmp20660, i64 1
+  %tmp20662 = getelementptr inbounds float* %tmp20661, i64 1
+  %tmp20663 = getelementptr inbounds float* %tmp20662, i64 1
+  %tmp20664 = getelementptr inbounds float* %tmp20663, i64 1
+  %tmp20665 = getelementptr inbounds float* %tmp20664, i64 1
+  %tmp20666 = getelementptr inbounds float* %tmp20665, i64 1
+  %tmp20667 = getelementptr inbounds float* %tmp20666, i64 1
+  %tmp20668 = getelementptr inbounds float* %tmp20667, i64 1
+  %tmp20669 = getelementptr inbounds float* %tmp20668, i64 1
+  %tmp20670 = getelementptr inbounds float* %tmp20669, i64 1
+  %tmp20671 = getelementptr inbounds float* %tmp20670, i64 1
+  %tmp20672 = getelementptr inbounds float* %tmp20671, i64 1
+  %tmp20673 = getelementptr inbounds float* %tmp20672, i64 1
+  %tmp20674 = getelementptr inbounds float* %tmp20673, i64 1
+  %tmp20675 = getelementptr inbounds float* %tmp20674, i64 1
+  %tmp20676 = getelementptr inbounds float* %tmp20675, i64 1
+  %tmp20677 = getelementptr inbounds float* %tmp20676, i64 1
+  %tmp20678 = getelementptr inbounds float* %tmp20677, i64 1
+  %tmp20679 = getelementptr inbounds float* %tmp20678, i64 1
+  %tmp20680 = getelementptr inbounds float* %tmp20679, i64 1
+  %tmp20681 = getelementptr inbounds float* %tmp20680, i64 1
+  %tmp20682 = getelementptr inbounds float* %tmp20681, i64 1
+  %tmp20683 = getelementptr inbounds float* %tmp20682, i64 1
+  %tmp20684 = getelementptr inbounds float* %tmp20683, i64 1
+  %tmp20685 = getelementptr inbounds float* %tmp20684, i64 1
+  %tmp20686 = getelementptr inbounds float* %tmp20685, i64 1
+  %tmp20687 = getelementptr inbounds float* %tmp20686, i64 1
+  %tmp20688 = getelementptr inbounds float* %tmp20687, i64 1
+  %tmp20689 = getelementptr inbounds float* %tmp20688, i64 1
+  %tmp20690 = getelementptr inbounds float* %tmp20689, i64 1
+  %tmp20691 = getelementptr inbounds float* %tmp20690, i64 1
+  %tmp20692 = getelementptr inbounds float* %tmp20691, i64 1
+  %tmp20693 = getelementptr inbounds float* %tmp20692, i64 1
+  %tmp20694 = getelementptr inbounds float* %tmp20693, i64 1
+  %tmp20695 = getelementptr inbounds float* %tmp20694, i64 1
+  %tmp20696 = getelementptr inbounds float* %tmp20695, i64 1
+  %tmp20697 = getelementptr inbounds float* %tmp20696, i64 1
+  %tmp20698 = getelementptr inbounds float* %tmp20697, i64 1
+  %tmp20699 = getelementptr inbounds float* %tmp20698, i64 1
+  %tmp20700 = getelementptr inbounds float* %tmp20699, i64 1
+  %tmp20701 = getelementptr inbounds float* %tmp20700, i64 1
+  %tmp20702 = getelementptr inbounds float* %tmp20701, i64 1
+  %tmp20703 = getelementptr inbounds float* %tmp20702, i64 1
+  %tmp20704 = getelementptr inbounds float* %tmp20703, i64 1
+  %tmp20705 = getelementptr inbounds float* %tmp20704, i64 1
+  %tmp20706 = getelementptr inbounds float* %tmp20705, i64 1
+  %tmp20707 = getelementptr inbounds float* %tmp20706, i64 1
+  %tmp20708 = getelementptr inbounds float* %tmp20707, i64 1
+  %tmp20709 = getelementptr inbounds float* %tmp20708, i64 1
+  %tmp20710 = getelementptr inbounds float* %tmp20709, i64 1
+  %tmp20711 = getelementptr inbounds float* %tmp20710, i64 1
+  %tmp20712 = getelementptr inbounds float* %tmp20711, i64 1
+  %tmp20713 = getelementptr inbounds float* %tmp20712, i64 1
+  %tmp20714 = getelementptr inbounds float* %tmp20713, i64 1
+  %tmp20715 = getelementptr inbounds float* %tmp20714, i64 1
+  %tmp20716 = getelementptr inbounds float* %tmp20715, i64 1
+  %tmp20717 = getelementptr inbounds float* %tmp20716, i64 1
+  %tmp20718 = getelementptr inbounds float* %tmp20717, i64 1
+  %tmp20719 = getelementptr inbounds float* %tmp20718, i64 1
+  %tmp20720 = getelementptr inbounds float* %tmp20719, i64 1
+  %tmp20721 = getelementptr inbounds float* %tmp20720, i64 1
+  %tmp20722 = getelementptr inbounds float* %tmp20721, i64 1
+  %tmp20723 = getelementptr inbounds float* %tmp20722, i64 1
+  %tmp20724 = getelementptr inbounds float* %tmp20723, i64 1
+  %tmp20725 = getelementptr inbounds float* %tmp20724, i64 1
+  %tmp20726 = getelementptr inbounds float* %tmp20725, i64 1
+  %tmp20727 = getelementptr inbounds float* %tmp20726, i64 1
+  %tmp20728 = getelementptr inbounds float* %tmp20727, i64 1
+  %tmp20729 = getelementptr inbounds float* %tmp20728, i64 1
+  %tmp20730 = getelementptr inbounds float* %tmp20729, i64 1
+  %tmp20731 = getelementptr inbounds float* %tmp20730, i64 1
+  %tmp20732 = getelementptr inbounds float* %tmp20731, i64 1
+  %tmp20733 = getelementptr inbounds float* %tmp20732, i64 1
+  %tmp20734 = getelementptr inbounds float* %tmp20733, i64 1
+  %tmp20735 = getelementptr inbounds float* %tmp20734, i64 1
+  %tmp20736 = getelementptr inbounds float* %tmp20735, i64 1
+  %tmp20737 = getelementptr inbounds float* %tmp20736, i64 1
+  %tmp20738 = getelementptr inbounds float* %tmp20737, i64 1
+  %tmp20739 = getelementptr inbounds float* %tmp20738, i64 1
+  %tmp20740 = getelementptr inbounds float* %tmp20739, i64 1
+  %tmp20741 = getelementptr inbounds float* %tmp20740, i64 1
+  %tmp20742 = getelementptr inbounds float* %tmp20741, i64 1
+  %tmp20743 = getelementptr inbounds float* %tmp20742, i64 1
+  %tmp20744 = getelementptr inbounds float* %tmp20743, i64 1
+  %tmp20745 = getelementptr inbounds float* %tmp20744, i64 1
+  %tmp20746 = getelementptr inbounds float* %tmp20745, i64 1
+  %tmp20747 = getelementptr inbounds float* %tmp20746, i64 1
+  %tmp20748 = getelementptr inbounds float* %tmp20747, i64 1
+  %tmp20749 = getelementptr inbounds float* %tmp20748, i64 1
+  %tmp20750 = getelementptr inbounds float* %tmp20749, i64 1
+  %tmp20751 = getelementptr inbounds float* %tmp20750, i64 1
+  %tmp20752 = getelementptr inbounds float* %tmp20751, i64 1
+  %tmp20753 = getelementptr inbounds float* %tmp20752, i64 1
+  %tmp20754 = getelementptr inbounds float* %tmp20753, i64 1
+  %tmp20755 = getelementptr inbounds float* %tmp20754, i64 1
+  %tmp20756 = getelementptr inbounds float* %tmp20755, i64 1
+  %tmp20757 = getelementptr inbounds float* %tmp20756, i64 1
+  %tmp20758 = getelementptr inbounds float* %tmp20757, i64 1
+  %tmp20759 = getelementptr inbounds float* %tmp20758, i64 1
+  %tmp20760 = getelementptr inbounds float* %tmp20759, i64 1
+  %tmp20761 = getelementptr inbounds float* %tmp20760, i64 1
+  %tmp20762 = getelementptr inbounds float* %tmp20761, i64 1
+  %tmp20763 = getelementptr inbounds float* %tmp20762, i64 1
+  %tmp20764 = getelementptr inbounds float* %tmp20763, i64 1
+  %tmp20765 = getelementptr inbounds float* %tmp20764, i64 1
+  %tmp20766 = getelementptr inbounds float* %tmp20765, i64 1
+  %tmp20767 = getelementptr inbounds float* %tmp20766, i64 1
+  %tmp20768 = getelementptr inbounds float* %tmp20767, i64 1
+  %tmp20769 = getelementptr inbounds float* %tmp20768, i64 1
+  %tmp20770 = getelementptr inbounds float* %tmp20769, i64 1
+  %tmp20771 = getelementptr inbounds float* %tmp20770, i64 1
+  %tmp20772 = getelementptr inbounds float* %tmp20771, i64 1
+  %tmp20773 = getelementptr inbounds float* %tmp20772, i64 1
+  %tmp20774 = getelementptr inbounds float* %tmp20773, i64 1
+  %tmp20775 = getelementptr inbounds float* %tmp20774, i64 1
+  %tmp20776 = getelementptr inbounds float* %tmp20775, i64 1
+  %tmp20777 = getelementptr inbounds float* %tmp20776, i64 1
+  %tmp20778 = getelementptr inbounds float* %tmp20777, i64 1
+  %tmp20779 = getelementptr inbounds float* %tmp20778, i64 1
+  %tmp20780 = getelementptr inbounds float* %tmp20779, i64 1
+  %tmp20781 = getelementptr inbounds float* %tmp20780, i64 1
+  %tmp20782 = getelementptr inbounds float* %tmp20781, i64 1
+  %tmp20783 = getelementptr inbounds float* %tmp20782, i64 1
+  %tmp20784 = getelementptr inbounds float* %tmp20783, i64 1
+  %tmp20785 = getelementptr inbounds float* %tmp20784, i64 1
+  %tmp20786 = getelementptr inbounds float* %tmp20785, i64 1
+  %tmp20787 = getelementptr inbounds float* %tmp20786, i64 1
+  %tmp20788 = getelementptr inbounds float* %tmp20787, i64 1
+  %tmp20789 = getelementptr inbounds float* %tmp20788, i64 1
+  %tmp20790 = getelementptr inbounds float* %tmp20789, i64 1
+  %tmp20791 = getelementptr inbounds float* %tmp20790, i64 1
+  %tmp20792 = getelementptr inbounds float* %tmp20791, i64 1
+  %tmp20793 = getelementptr inbounds float* %tmp20792, i64 1
+  %tmp20794 = getelementptr inbounds float* %tmp20793, i64 1
+  %tmp20795 = getelementptr inbounds float* %tmp20794, i64 1
+  %tmp20796 = getelementptr inbounds float* %tmp20795, i64 1
+  %tmp20797 = getelementptr inbounds float* %tmp20796, i64 1
+  %tmp20798 = getelementptr inbounds float* %tmp20797, i64 1
+  %tmp20799 = getelementptr inbounds float* %tmp20798, i64 1
+  %tmp20800 = getelementptr inbounds float* %tmp20799, i64 1
+  %tmp20801 = getelementptr inbounds float* %tmp20800, i64 1
+  %tmp20802 = getelementptr inbounds float* %tmp20801, i64 1
+  %tmp20803 = getelementptr inbounds float* %tmp20802, i64 1
+  %tmp20804 = getelementptr inbounds float* %tmp20803, i64 1
+  %tmp20805 = getelementptr inbounds float* %tmp20804, i64 1
+  %tmp20806 = getelementptr inbounds float* %tmp20805, i64 1
+  %tmp20807 = getelementptr inbounds float* %tmp20806, i64 1
+  %tmp20808 = getelementptr inbounds float* %tmp20807, i64 1
+  %tmp20809 = getelementptr inbounds float* %tmp20808, i64 1
+  %tmp20810 = getelementptr inbounds float* %tmp20809, i64 1
+  %tmp20811 = getelementptr inbounds float* %tmp20810, i64 1
+  %tmp20812 = getelementptr inbounds float* %tmp20811, i64 1
+  %tmp20813 = getelementptr inbounds float* %tmp20812, i64 1
+  %tmp20814 = getelementptr inbounds float* %tmp20813, i64 1
+  %tmp20815 = getelementptr inbounds float* %tmp20814, i64 1
+  %tmp20816 = getelementptr inbounds float* %tmp20815, i64 1
+  %tmp20817 = getelementptr inbounds float* %tmp20816, i64 1
+  %tmp20818 = getelementptr inbounds float* %tmp20817, i64 1
+  %tmp20819 = getelementptr inbounds float* %tmp20818, i64 1
+  %tmp20820 = getelementptr inbounds float* %tmp20819, i64 1
+  %tmp20821 = getelementptr inbounds float* %tmp20820, i64 1
+  %tmp20822 = getelementptr inbounds float* %tmp20821, i64 1
+  %tmp20823 = getelementptr inbounds float* %tmp20822, i64 1
+  %tmp20824 = getelementptr inbounds float* %tmp20823, i64 1
+  %tmp20825 = getelementptr inbounds float* %tmp20824, i64 1
+  %tmp20826 = getelementptr inbounds float* %tmp20825, i64 1
+  %tmp20827 = getelementptr inbounds float* %tmp20826, i64 1
+  %tmp20828 = getelementptr inbounds float* %tmp20827, i64 1
+  %tmp20829 = getelementptr inbounds float* %tmp20828, i64 1
+  %tmp20830 = getelementptr inbounds float* %tmp20829, i64 1
+  %tmp20831 = getelementptr inbounds float* %tmp20830, i64 1
+  %tmp20832 = getelementptr inbounds float* %tmp20831, i64 1
+  %tmp20833 = getelementptr inbounds float* %tmp20832, i64 1
+  %tmp20834 = getelementptr inbounds float* %tmp20833, i64 1
+  %tmp20835 = getelementptr inbounds float* %tmp20834, i64 1
+  %tmp20836 = getelementptr inbounds float* %tmp20835, i64 1
+  %tmp20837 = getelementptr inbounds float* %tmp20836, i64 1
+  %tmp20838 = getelementptr inbounds float* %tmp20837, i64 1
+  %tmp20839 = getelementptr inbounds float* %tmp20838, i64 1
+  %tmp20840 = getelementptr inbounds float* %tmp20839, i64 1
+  %tmp20841 = getelementptr inbounds float* %tmp20840, i64 1
+  %tmp20842 = getelementptr inbounds float* %tmp20841, i64 1
+  %tmp20843 = getelementptr inbounds float* %tmp20842, i64 1
+  %tmp20844 = getelementptr inbounds float* %tmp20843, i64 1
+  %tmp20845 = getelementptr inbounds float* %tmp20844, i64 1
+  %tmp20846 = getelementptr inbounds float* %tmp20845, i64 1
+  %tmp20847 = getelementptr inbounds float* %tmp20846, i64 1
+  %tmp20848 = getelementptr inbounds float* %tmp20847, i64 1
+  %tmp20849 = getelementptr inbounds float* %tmp20848, i64 1
+  %tmp20850 = getelementptr inbounds float* %tmp20849, i64 1
+  %tmp20851 = getelementptr inbounds float* %tmp20850, i64 1
+  %tmp20852 = getelementptr inbounds float* %tmp20851, i64 1
+  %tmp20853 = getelementptr inbounds float* %tmp20852, i64 1
+  %tmp20854 = getelementptr inbounds float* %tmp20853, i64 1
+  %tmp20855 = getelementptr inbounds float* %tmp20854, i64 1
+  %tmp20856 = getelementptr inbounds float* %tmp20855, i64 1
+  %tmp20857 = getelementptr inbounds float* %tmp20856, i64 1
+  %tmp20858 = getelementptr inbounds float* %tmp20857, i64 1
+  %tmp20859 = getelementptr inbounds float* %tmp20858, i64 1
+  %tmp20860 = getelementptr inbounds float* %tmp20859, i64 1
+  %tmp20861 = getelementptr inbounds float* %tmp20860, i64 1
+  %tmp20862 = getelementptr inbounds float* %tmp20861, i64 1
+  %tmp20863 = getelementptr inbounds float* %tmp20862, i64 1
+  %tmp20864 = getelementptr inbounds float* %tmp20863, i64 1
+  %tmp20865 = getelementptr inbounds float* %tmp20864, i64 1
+  %tmp20866 = getelementptr inbounds float* %tmp20865, i64 1
+  %tmp20867 = getelementptr inbounds float* %tmp20866, i64 1
+  %tmp20868 = getelementptr inbounds float* %tmp20867, i64 1
+  %tmp20869 = getelementptr inbounds float* %tmp20868, i64 1
+  %tmp20870 = getelementptr inbounds float* %tmp20869, i64 1
+  %tmp20871 = getelementptr inbounds float* %tmp20870, i64 1
+  %tmp20872 = getelementptr inbounds float* %tmp20871, i64 1
+  %tmp20873 = getelementptr inbounds float* %tmp20872, i64 1
+  %tmp20874 = getelementptr inbounds float* %tmp20873, i64 1
+  %tmp20875 = getelementptr inbounds float* %tmp20874, i64 1
+  %tmp20876 = getelementptr inbounds float* %tmp20875, i64 1
+  %tmp20877 = getelementptr inbounds float* %tmp20876, i64 1
+  %tmp20878 = getelementptr inbounds float* %tmp20877, i64 1
+  %tmp20879 = getelementptr inbounds float* %tmp20878, i64 1
+  %tmp20880 = getelementptr inbounds float* %tmp20879, i64 1
+  %tmp20881 = getelementptr inbounds float* %tmp20880, i64 1
+  %tmp20882 = getelementptr inbounds float* %tmp20881, i64 1
+  %tmp20883 = getelementptr inbounds float* %tmp20882, i64 1
+  %tmp20884 = getelementptr inbounds float* %tmp20883, i64 1
+  %tmp20885 = getelementptr inbounds float* %tmp20884, i64 1
+  %tmp20886 = getelementptr inbounds float* %tmp20885, i64 1
+  %tmp20887 = getelementptr inbounds float* %tmp20886, i64 1
+  %tmp20888 = getelementptr inbounds float* %tmp20887, i64 1
+  %tmp20889 = getelementptr inbounds float* %tmp20888, i64 1
+  %tmp20890 = getelementptr inbounds float* %tmp20889, i64 1
+  %tmp20891 = getelementptr inbounds float* %tmp20890, i64 1
+  %tmp20892 = getelementptr inbounds float* %tmp20891, i64 1
+  %tmp20893 = getelementptr inbounds float* %tmp20892, i64 1
+  %tmp20894 = getelementptr inbounds float* %tmp20893, i64 1
+  %tmp20895 = getelementptr inbounds float* %tmp20894, i64 1
+  %tmp20896 = getelementptr inbounds float* %tmp20895, i64 1
+  %tmp20897 = getelementptr inbounds float* %tmp20896, i64 1
+  %tmp20898 = getelementptr inbounds float* %tmp20897, i64 1
+  %tmp20899 = getelementptr inbounds float* %tmp20898, i64 1
+  %tmp20900 = getelementptr inbounds float* %tmp20899, i64 1
+  %tmp20901 = getelementptr inbounds float* %tmp20900, i64 1
+  %tmp20902 = getelementptr inbounds float* %tmp20901, i64 1
+  %tmp20903 = getelementptr inbounds float* %tmp20902, i64 1
+  %tmp20904 = getelementptr inbounds float* %tmp20903, i64 1
+  %tmp20905 = getelementptr inbounds float* %tmp20904, i64 1
+  %tmp20906 = getelementptr inbounds float* %tmp20905, i64 1
+  %tmp20907 = getelementptr inbounds float* %tmp20906, i64 1
+  %tmp20908 = getelementptr inbounds float* %tmp20907, i64 1
+  %tmp20909 = getelementptr inbounds float* %tmp20908, i64 1
+  %tmp20910 = getelementptr inbounds float* %tmp20909, i64 1
+  %tmp20911 = getelementptr inbounds float* %tmp20910, i64 1
+  %tmp20912 = getelementptr inbounds float* %tmp20911, i64 1
+  %tmp20913 = getelementptr inbounds float* %tmp20912, i64 1
+  %tmp20914 = getelementptr inbounds float* %tmp20913, i64 1
+  %tmp20915 = getelementptr inbounds float* %tmp20914, i64 1
+  %tmp20916 = getelementptr inbounds float* %tmp20915, i64 1
+  %tmp20917 = getelementptr inbounds float* %tmp20916, i64 1
+  %tmp20918 = getelementptr inbounds float* %tmp20917, i64 1
+  %tmp20919 = getelementptr inbounds float* %tmp20918, i64 1
+  %tmp20920 = getelementptr inbounds float* %tmp20919, i64 1
+  %tmp20921 = getelementptr inbounds float* %tmp20920, i64 1
+  %tmp20922 = getelementptr inbounds float* %tmp20921, i64 1
+  %tmp20923 = getelementptr inbounds float* %tmp20922, i64 1
+  %tmp20924 = getelementptr inbounds float* %tmp20923, i64 1
+  %tmp20925 = getelementptr inbounds float* %tmp20924, i64 1
+  %tmp20926 = getelementptr inbounds float* %tmp20925, i64 1
+  %tmp20927 = getelementptr inbounds float* %tmp20926, i64 1
+  %tmp20928 = getelementptr inbounds float* %tmp20927, i64 1
+  %tmp20929 = getelementptr inbounds float* %tmp20928, i64 1
+  %tmp20930 = getelementptr inbounds float* %tmp20929, i64 1
+  %tmp20931 = getelementptr inbounds float* %tmp20930, i64 1
+  %tmp20932 = getelementptr inbounds float* %tmp20931, i64 1
+  %tmp20933 = getelementptr inbounds float* %tmp20932, i64 1
+  %tmp20934 = getelementptr inbounds float* %tmp20933, i64 1
+  %tmp20935 = getelementptr inbounds float* %tmp20934, i64 1
+  %tmp20936 = getelementptr inbounds float* %tmp20935, i64 1
+  %tmp20937 = getelementptr inbounds float* %tmp20936, i64 1
+  %tmp20938 = getelementptr inbounds float* %tmp20937, i64 1
+  %tmp20939 = getelementptr inbounds float* %tmp20938, i64 1
+  %tmp20940 = getelementptr inbounds float* %tmp20939, i64 1
+  %tmp20941 = getelementptr inbounds float* %tmp20940, i64 1
+  %tmp20942 = getelementptr inbounds float* %tmp20941, i64 1
+  %tmp20943 = getelementptr inbounds float* %tmp20942, i64 1
+  %tmp20944 = getelementptr inbounds float* %tmp20943, i64 1
+  %tmp20945 = getelementptr inbounds float* %tmp20944, i64 1
+  %tmp20946 = getelementptr inbounds float* %tmp20945, i64 1
+  %tmp20947 = getelementptr inbounds float* %tmp20946, i64 1
+  %tmp20948 = getelementptr inbounds float* %tmp20947, i64 1
+  %tmp20949 = getelementptr inbounds float* %tmp20948, i64 1
+  %tmp20950 = getelementptr inbounds float* %tmp20949, i64 1
+  %tmp20951 = getelementptr inbounds float* %tmp20950, i64 1
+  %tmp20952 = getelementptr inbounds float* %tmp20951, i64 1
+  %tmp20953 = getelementptr inbounds float* %tmp20952, i64 1
+  %tmp20954 = getelementptr inbounds float* %tmp20953, i64 1
+  %tmp20955 = getelementptr inbounds float* %tmp20954, i64 1
+  %tmp20956 = getelementptr inbounds float* %tmp20955, i64 1
+  %tmp20957 = getelementptr inbounds float* %tmp20956, i64 1
+  %tmp20958 = getelementptr inbounds float* %tmp20957, i64 1
+  %tmp20959 = getelementptr inbounds float* %tmp20958, i64 1
+  %tmp20960 = getelementptr inbounds float* %tmp20959, i64 1
+  %tmp20961 = getelementptr inbounds float* %tmp20960, i64 1
+  %tmp20962 = getelementptr inbounds float* %tmp20961, i64 1
+  %tmp20963 = getelementptr inbounds float* %tmp20962, i64 1
+  %tmp20964 = getelementptr inbounds float* %tmp20963, i64 1
+  %tmp20965 = getelementptr inbounds float* %tmp20964, i64 1
+  %tmp20966 = getelementptr inbounds float* %tmp20965, i64 1
+  %tmp20967 = getelementptr inbounds float* %tmp20966, i64 1
+  %tmp20968 = getelementptr inbounds float* %tmp20967, i64 1
+  %tmp20969 = getelementptr inbounds float* %tmp20968, i64 1
+  %tmp20970 = getelementptr inbounds float* %tmp20969, i64 1
+  %tmp20971 = getelementptr inbounds float* %tmp20970, i64 1
+  %tmp20972 = getelementptr inbounds float* %tmp20971, i64 1
+  %tmp20973 = getelementptr inbounds float* %tmp20972, i64 1
+  %tmp20974 = getelementptr inbounds float* %tmp20973, i64 1
+  %tmp20975 = getelementptr inbounds float* %tmp20974, i64 1
+  %tmp20976 = getelementptr inbounds float* %tmp20975, i64 1
+  %tmp20977 = getelementptr inbounds float* %tmp20976, i64 1
+  %tmp20978 = getelementptr inbounds float* %tmp20977, i64 1
+  %tmp20979 = getelementptr inbounds float* %tmp20978, i64 1
+  %tmp20980 = getelementptr inbounds float* %tmp20979, i64 1
+  %tmp20981 = getelementptr inbounds float* %tmp20980, i64 1
+  %tmp20982 = getelementptr inbounds float* %tmp20981, i64 1
+  %tmp20983 = getelementptr inbounds float* %tmp20982, i64 1
+  %tmp20984 = getelementptr inbounds float* %tmp20983, i64 1
+  %tmp20985 = getelementptr inbounds float* %tmp20984, i64 1
+  %tmp20986 = getelementptr inbounds float* %tmp20985, i64 1
+  %tmp20987 = getelementptr inbounds float* %tmp20986, i64 1
+  %tmp20988 = getelementptr inbounds float* %tmp20987, i64 1
+  %tmp20989 = getelementptr inbounds float* %tmp20988, i64 1
+  %tmp20990 = getelementptr inbounds float* %tmp20989, i64 1
+  %tmp20991 = getelementptr inbounds float* %tmp20990, i64 1
+  %tmp20992 = getelementptr inbounds float* %tmp20991, i64 1
+  %tmp20993 = getelementptr inbounds float* %tmp20992, i64 1
+  %tmp20994 = getelementptr inbounds float* %tmp20993, i64 1
+  %tmp20995 = getelementptr inbounds float* %tmp20994, i64 1
+  %tmp20996 = getelementptr inbounds float* %tmp20995, i64 1
+  %tmp20997 = getelementptr inbounds float* %tmp20996, i64 1
+  %tmp20998 = getelementptr inbounds float* %tmp20997, i64 1
+  %tmp20999 = getelementptr inbounds float* %tmp20998, i64 1
+  %tmp21000 = getelementptr inbounds float* %tmp20999, i64 1
+  %tmp21001 = getelementptr inbounds float* %tmp21000, i64 1
+  %tmp21002 = getelementptr inbounds float* %tmp21001, i64 1
+  %tmp21003 = getelementptr inbounds float* %tmp21002, i64 1
+  %tmp21004 = getelementptr inbounds float* %tmp21003, i64 1
+  %tmp21005 = getelementptr inbounds float* %tmp21004, i64 1
+  %tmp21006 = getelementptr inbounds float* %tmp21005, i64 1
+  %tmp21007 = getelementptr inbounds float* %tmp21006, i64 1
+  %tmp21008 = getelementptr inbounds float* %tmp21007, i64 1
+  %tmp21009 = getelementptr inbounds float* %tmp21008, i64 1
+  %tmp21010 = getelementptr inbounds float* %tmp21009, i64 1
+  %tmp21011 = getelementptr inbounds float* %tmp21010, i64 1
+  %tmp21012 = getelementptr inbounds float* %tmp21011, i64 1
+  %tmp21013 = getelementptr inbounds float* %tmp21012, i64 1
+  %tmp21014 = getelementptr inbounds float* %tmp21013, i64 1
+  %tmp21015 = getelementptr inbounds float* %tmp21014, i64 1
+  %tmp21016 = getelementptr inbounds float* %tmp21015, i64 1
+  %tmp21017 = getelementptr inbounds float* %tmp21016, i64 1
+  %tmp21018 = getelementptr inbounds float* %tmp21017, i64 1
+  %tmp21019 = getelementptr inbounds float* %tmp21018, i64 1
+  %tmp21020 = getelementptr inbounds float* %tmp21019, i64 1
+  %tmp21021 = getelementptr inbounds float* %tmp21020, i64 1
+  %tmp21022 = getelementptr inbounds float* %tmp21021, i64 1
+  %tmp21023 = getelementptr inbounds float* %tmp21022, i64 1
+  %tmp21024 = getelementptr inbounds float* %tmp21023, i64 1
+  %tmp21025 = getelementptr inbounds float* %tmp21024, i64 1
+  %tmp21026 = getelementptr inbounds float* %tmp21025, i64 1
+  %tmp21027 = getelementptr inbounds float* %tmp21026, i64 1
+  %tmp21028 = getelementptr inbounds float* %tmp21027, i64 1
+  %tmp21029 = getelementptr inbounds float* %tmp21028, i64 1
+  %tmp21030 = getelementptr inbounds float* %tmp21029, i64 1
+  %tmp21031 = getelementptr inbounds float* %tmp21030, i64 1
+  %tmp21032 = getelementptr inbounds float* %tmp21031, i64 1
+  %tmp21033 = getelementptr inbounds float* %tmp21032, i64 1
+  %tmp21034 = getelementptr inbounds float* %tmp21033, i64 1
+  %tmp21035 = getelementptr inbounds float* %tmp21034, i64 1
+  %tmp21036 = getelementptr inbounds float* %tmp21035, i64 1
+  %tmp21037 = getelementptr inbounds float* %tmp21036, i64 1
+  %tmp21038 = getelementptr inbounds float* %tmp21037, i64 1
+  %tmp21039 = getelementptr inbounds float* %tmp21038, i64 1
+  %tmp21040 = getelementptr inbounds float* %tmp21039, i64 1
+  %tmp21041 = getelementptr inbounds float* %tmp21040, i64 1
+  %tmp21042 = getelementptr inbounds float* %tmp21041, i64 1
+  %tmp21043 = getelementptr inbounds float* %tmp21042, i64 1
+  %tmp21044 = getelementptr inbounds float* %tmp21043, i64 1
+  %tmp21045 = getelementptr inbounds float* %tmp21044, i64 1
+  %tmp21046 = getelementptr inbounds float* %tmp21045, i64 1
+  %tmp21047 = getelementptr inbounds float* %tmp21046, i64 1
+  %tmp21048 = getelementptr inbounds float* %tmp21047, i64 1
+  %tmp21049 = getelementptr inbounds float* %tmp21048, i64 1
+  %tmp21050 = getelementptr inbounds float* %tmp21049, i64 1
+  %tmp21051 = getelementptr inbounds float* %tmp21050, i64 1
+  %tmp21052 = getelementptr inbounds float* %tmp21051, i64 1
+  %tmp21053 = getelementptr inbounds float* %tmp21052, i64 1
+  %tmp21054 = getelementptr inbounds float* %tmp21053, i64 1
+  %tmp21055 = getelementptr inbounds float* %tmp21054, i64 1
+  %tmp21056 = getelementptr inbounds float* %tmp21055, i64 1
+  %tmp21057 = getelementptr inbounds float* %tmp21056, i64 1
+  %tmp21058 = getelementptr inbounds float* %tmp21057, i64 1
+  %tmp21059 = getelementptr inbounds float* %tmp21058, i64 1
+  %tmp21060 = getelementptr inbounds float* %tmp21059, i64 1
+  %tmp21061 = getelementptr inbounds float* %tmp21060, i64 1
+  %tmp21062 = getelementptr inbounds float* %tmp21061, i64 1
+  %tmp21063 = getelementptr inbounds float* %tmp21062, i64 1
+  %tmp21064 = getelementptr inbounds float* %tmp21063, i64 1
+  %tmp21065 = getelementptr inbounds float* %tmp21064, i64 1
+  %tmp21066 = getelementptr inbounds float* %tmp21065, i64 1
+  %tmp21067 = getelementptr inbounds float* %tmp21066, i64 1
+  %tmp21068 = getelementptr inbounds float* %tmp21067, i64 1
+  %tmp21069 = getelementptr inbounds float* %tmp21068, i64 1
+  %tmp21070 = getelementptr inbounds float* %tmp21069, i64 1
+  %tmp21071 = getelementptr inbounds float* %tmp21070, i64 1
+  %tmp21072 = getelementptr inbounds float* %tmp21071, i64 1
+  %tmp21073 = getelementptr inbounds float* %tmp21072, i64 1
+  %tmp21074 = getelementptr inbounds float* %tmp21073, i64 1
+  %tmp21075 = getelementptr inbounds float* %tmp21074, i64 1
+  %tmp21076 = getelementptr inbounds float* %tmp21075, i64 1
+  %tmp21077 = getelementptr inbounds float* %tmp21076, i64 1
+  %tmp21078 = getelementptr inbounds float* %tmp21077, i64 1
+  %tmp21079 = getelementptr inbounds float* %tmp21078, i64 1
+  %tmp21080 = getelementptr inbounds float* %tmp21079, i64 1
+  %tmp21081 = getelementptr inbounds float* %tmp21080, i64 1
+  %tmp21082 = getelementptr inbounds float* %tmp21081, i64 1
+  %tmp21083 = getelementptr inbounds float* %tmp21082, i64 1
+  %tmp21084 = getelementptr inbounds float* %tmp21083, i64 1
+  %tmp21085 = getelementptr inbounds float* %tmp21084, i64 1
+  %tmp21086 = getelementptr inbounds float* %tmp21085, i64 1
+  %tmp21087 = getelementptr inbounds float* %tmp21086, i64 1
+  %tmp21088 = getelementptr inbounds float* %tmp21087, i64 1
+  %tmp21089 = getelementptr inbounds float* %tmp21088, i64 1
+  %tmp21090 = getelementptr inbounds float* %tmp21089, i64 1
+  %tmp21091 = getelementptr inbounds float* %tmp21090, i64 1
+  %tmp21092 = getelementptr inbounds float* %tmp21091, i64 1
+  %tmp21093 = getelementptr inbounds float* %tmp21092, i64 1
+  %tmp21094 = getelementptr inbounds float* %tmp21093, i64 1
+  %tmp21095 = getelementptr inbounds float* %tmp21094, i64 1
+  %tmp21096 = getelementptr inbounds float* %tmp21095, i64 1
+  %tmp21097 = getelementptr inbounds float* %tmp21096, i64 1
+  %tmp21098 = getelementptr inbounds float* %tmp21097, i64 1
+  %tmp21099 = getelementptr inbounds float* %tmp21098, i64 1
+  %tmp21100 = getelementptr inbounds float* %tmp21099, i64 1
+  %tmp21101 = getelementptr inbounds float* %tmp21100, i64 1
+  %tmp21102 = getelementptr inbounds float* %tmp21101, i64 1
+  %tmp21103 = getelementptr inbounds float* %tmp21102, i64 1
+  %tmp21104 = getelementptr inbounds float* %tmp21103, i64 1
+  %tmp21105 = getelementptr inbounds float* %tmp21104, i64 1
+  %tmp21106 = getelementptr inbounds float* %tmp21105, i64 1
+  %tmp21107 = getelementptr inbounds float* %tmp21106, i64 1
+  %tmp21108 = getelementptr inbounds float* %tmp21107, i64 1
+  %tmp21109 = getelementptr inbounds float* %tmp21108, i64 1
+  %tmp21110 = getelementptr inbounds float* %tmp21109, i64 1
+  %tmp21111 = getelementptr inbounds float* %tmp21110, i64 1
+  %tmp21112 = getelementptr inbounds float* %tmp21111, i64 1
+  %tmp21113 = getelementptr inbounds float* %tmp21112, i64 1
+  %tmp21114 = getelementptr inbounds float* %tmp21113, i64 1
+  %tmp21115 = getelementptr inbounds float* %tmp21114, i64 1
+  %tmp21116 = getelementptr inbounds float* %tmp21115, i64 1
+  %tmp21117 = getelementptr inbounds float* %tmp21116, i64 1
+  %tmp21118 = getelementptr inbounds float* %tmp21117, i64 1
+  %tmp21119 = getelementptr inbounds float* %tmp21118, i64 1
+  %tmp21120 = getelementptr inbounds float* %tmp21119, i64 1
+  %tmp21121 = getelementptr inbounds float* %tmp21120, i64 1
+  %tmp21122 = getelementptr inbounds float* %tmp21121, i64 1
+  %tmp21123 = getelementptr inbounds float* %tmp21122, i64 1
+  %tmp21124 = getelementptr inbounds float* %tmp21123, i64 1
+  %tmp21125 = getelementptr inbounds float* %tmp21124, i64 1
+  %tmp21126 = getelementptr inbounds float* %tmp21125, i64 1
+  %tmp21127 = getelementptr inbounds float* %tmp21126, i64 1
+  %tmp21128 = getelementptr inbounds float* %tmp21127, i64 1
+  %tmp21129 = getelementptr inbounds float* %tmp21128, i64 1
+  %tmp21130 = getelementptr inbounds float* %tmp21129, i64 1
+  %tmp21131 = getelementptr inbounds float* %tmp21130, i64 1
+  %tmp21132 = getelementptr inbounds float* %tmp21131, i64 1
+  %tmp21133 = getelementptr inbounds float* %tmp21132, i64 1
+  %tmp21134 = getelementptr inbounds float* %tmp21133, i64 1
+  %tmp21135 = getelementptr inbounds float* %tmp21134, i64 1
+  %tmp21136 = getelementptr inbounds float* %tmp21135, i64 1
+  %tmp21137 = getelementptr inbounds float* %tmp21136, i64 1
+  %tmp21138 = getelementptr inbounds float* %tmp21137, i64 1
+  %tmp21139 = getelementptr inbounds float* %tmp21138, i64 1
+  %tmp21140 = getelementptr inbounds float* %tmp21139, i64 1
+  %tmp21141 = getelementptr inbounds float* %tmp21140, i64 1
+  %tmp21142 = getelementptr inbounds float* %tmp21141, i64 1
+  %tmp21143 = getelementptr inbounds float* %tmp21142, i64 1
+  %tmp21144 = getelementptr inbounds float* %tmp21143, i64 1
+  %tmp21145 = getelementptr inbounds float* %tmp21144, i64 1
+  %tmp21146 = getelementptr inbounds float* %tmp21145, i64 1
+  %tmp21147 = getelementptr inbounds float* %tmp21146, i64 1
+  %tmp21148 = getelementptr inbounds float* %tmp21147, i64 1
+  %tmp21149 = getelementptr inbounds float* %tmp21148, i64 1
+  %tmp21150 = getelementptr inbounds float* %tmp21149, i64 1
+  %tmp21151 = getelementptr inbounds float* %tmp21150, i64 1
+  %tmp21152 = getelementptr inbounds float* %tmp21151, i64 1
+  %tmp21153 = getelementptr inbounds float* %tmp21152, i64 1
+  %tmp21154 = getelementptr inbounds float* %tmp21153, i64 1
+  %tmp21155 = getelementptr inbounds float* %tmp21154, i64 1
+  %tmp21156 = getelementptr inbounds float* %tmp21155, i64 1
+  %tmp21157 = getelementptr inbounds float* %tmp21156, i64 1
+  %tmp21158 = getelementptr inbounds float* %tmp21157, i64 1
+  %tmp21159 = getelementptr inbounds float* %tmp21158, i64 1
+  %tmp21160 = getelementptr inbounds float* %tmp21159, i64 1
+  %tmp21161 = getelementptr inbounds float* %tmp21160, i64 1
+  %tmp21162 = getelementptr inbounds float* %tmp21161, i64 1
+  %tmp21163 = getelementptr inbounds float* %tmp21162, i64 1
+  %tmp21164 = getelementptr inbounds float* %tmp21163, i64 1
+  %tmp21165 = getelementptr inbounds float* %tmp21164, i64 1
+  %tmp21166 = getelementptr inbounds float* %tmp21165, i64 1
+  %tmp21167 = getelementptr inbounds float* %tmp21166, i64 1
+  %tmp21168 = getelementptr inbounds float* %tmp21167, i64 1
+  %tmp21169 = getelementptr inbounds float* %tmp21168, i64 1
+  %tmp21170 = getelementptr inbounds float* %tmp21169, i64 1
+  %tmp21171 = getelementptr inbounds float* %tmp21170, i64 1
+  %tmp21172 = getelementptr inbounds float* %tmp21171, i64 1
+  %tmp21173 = getelementptr inbounds float* %tmp21172, i64 1
+  %tmp21174 = getelementptr inbounds float* %tmp21173, i64 1
+  %tmp21175 = getelementptr inbounds float* %tmp21174, i64 1
+  %tmp21176 = getelementptr inbounds float* %tmp21175, i64 1
+  %tmp21177 = getelementptr inbounds float* %tmp21176, i64 1
+  %tmp21178 = getelementptr inbounds float* %tmp21177, i64 1
+  %tmp21179 = getelementptr inbounds float* %tmp21178, i64 1
+  %tmp21180 = getelementptr inbounds float* %tmp21179, i64 1
+  %tmp21181 = getelementptr inbounds float* %tmp21180, i64 1
+  %tmp21182 = getelementptr inbounds float* %tmp21181, i64 1
+  %tmp21183 = getelementptr inbounds float* %tmp21182, i64 1
+  %tmp21184 = getelementptr inbounds float* %tmp21183, i64 1
+  %tmp21185 = getelementptr inbounds float* %tmp21184, i64 1
+  %tmp21186 = getelementptr inbounds float* %tmp21185, i64 1
+  %tmp21187 = getelementptr inbounds float* %tmp21186, i64 1
+  %tmp21188 = getelementptr inbounds float* %tmp21187, i64 1
+  %tmp21189 = getelementptr inbounds float* %tmp21188, i64 1
+  %tmp21190 = getelementptr inbounds float* %tmp21189, i64 1
+  %tmp21191 = getelementptr inbounds float* %tmp21190, i64 1
+  %tmp21192 = getelementptr inbounds float* %tmp21191, i64 1
+  %tmp21193 = getelementptr inbounds float* %tmp21192, i64 1
+  %tmp21194 = getelementptr inbounds float* %tmp21193, i64 1
+  %tmp21195 = getelementptr inbounds float* %tmp21194, i64 1
+  %tmp21196 = getelementptr inbounds float* %tmp21195, i64 1
+  %tmp21197 = getelementptr inbounds float* %tmp21196, i64 1
+  %tmp21198 = getelementptr inbounds float* %tmp21197, i64 1
+  %tmp21199 = getelementptr inbounds float* %tmp21198, i64 1
+  %tmp21200 = getelementptr inbounds float* %tmp21199, i64 1
+  %tmp21201 = getelementptr inbounds float* %tmp21200, i64 1
+  %tmp21202 = getelementptr inbounds float* %tmp21201, i64 1
+  %tmp21203 = getelementptr inbounds float* %tmp21202, i64 1
+  %tmp21204 = getelementptr inbounds float* %tmp21203, i64 1
+  %tmp21205 = getelementptr inbounds float* %tmp21204, i64 1
+  %tmp21206 = getelementptr inbounds float* %tmp21205, i64 1
+  %tmp21207 = getelementptr inbounds float* %tmp21206, i64 1
+  %tmp21208 = getelementptr inbounds float* %tmp21207, i64 1
+  %tmp21209 = getelementptr inbounds float* %tmp21208, i64 1
+  %tmp21210 = getelementptr inbounds float* %tmp21209, i64 1
+  %tmp21211 = getelementptr inbounds float* %tmp21210, i64 1
+  %tmp21212 = getelementptr inbounds float* %tmp21211, i64 1
+  %tmp21213 = getelementptr inbounds float* %tmp21212, i64 1
+  %tmp21214 = getelementptr inbounds float* %tmp21213, i64 1
+  %tmp21215 = getelementptr inbounds float* %tmp21214, i64 1
+  %tmp21216 = getelementptr inbounds float* %tmp21215, i64 1
+  %tmp21217 = getelementptr inbounds float* %tmp21216, i64 1
+  %tmp21218 = getelementptr inbounds float* %tmp21217, i64 1
+  %tmp21219 = getelementptr inbounds float* %tmp21218, i64 1
+  %tmp21220 = getelementptr inbounds float* %tmp21219, i64 1
+  %tmp21221 = getelementptr inbounds float* %tmp21220, i64 1
+  %tmp21222 = getelementptr inbounds float* %tmp21221, i64 1
+  %tmp21223 = getelementptr inbounds float* %tmp21222, i64 1
+  %tmp21224 = getelementptr inbounds float* %tmp21223, i64 1
+  %tmp21225 = getelementptr inbounds float* %tmp21224, i64 1
+  %tmp21226 = getelementptr inbounds float* %tmp21225, i64 1
+  %tmp21227 = getelementptr inbounds float* %tmp21226, i64 1
+  %tmp21228 = getelementptr inbounds float* %tmp21227, i64 1
+  %tmp21229 = getelementptr inbounds float* %tmp21228, i64 1
+  %tmp21230 = getelementptr inbounds float* %tmp21229, i64 1
+  %tmp21231 = getelementptr inbounds float* %tmp21230, i64 1
+  %tmp21232 = getelementptr inbounds float* %tmp21231, i64 1
+  %tmp21233 = getelementptr inbounds float* %tmp21232, i64 1
+  %tmp21234 = getelementptr inbounds float* %tmp21233, i64 1
+  %tmp21235 = getelementptr inbounds float* %tmp21234, i64 1
+  %tmp21236 = getelementptr inbounds float* %tmp21235, i64 1
+  %tmp21237 = getelementptr inbounds float* %tmp21236, i64 1
+  %tmp21238 = getelementptr inbounds float* %tmp21237, i64 1
+  %tmp21239 = getelementptr inbounds float* %tmp21238, i64 1
+  %tmp21240 = getelementptr inbounds float* %tmp21239, i64 1
+  %tmp21241 = getelementptr inbounds float* %tmp21240, i64 1
+  %tmp21242 = getelementptr inbounds float* %tmp21241, i64 1
+  %tmp21243 = getelementptr inbounds float* %tmp21242, i64 1
+  %tmp21244 = getelementptr inbounds float* %tmp21243, i64 1
+  %tmp21245 = getelementptr inbounds float* %tmp21244, i64 1
+  %tmp21246 = getelementptr inbounds float* %tmp21245, i64 1
+  %tmp21247 = getelementptr inbounds float* %tmp21246, i64 1
+  %tmp21248 = getelementptr inbounds float* %tmp21247, i64 1
+  %tmp21249 = getelementptr inbounds float* %tmp21248, i64 1
+  %tmp21250 = getelementptr inbounds float* %tmp21249, i64 1
+  %tmp21251 = getelementptr inbounds float* %tmp21250, i64 1
+  %tmp21252 = getelementptr inbounds float* %tmp21251, i64 1
+  %tmp21253 = getelementptr inbounds float* %tmp21252, i64 1
+  %tmp21254 = getelementptr inbounds float* %tmp21253, i64 1
+  %tmp21255 = getelementptr inbounds float* %tmp21254, i64 1
+  %tmp21256 = getelementptr inbounds float* %tmp21255, i64 1
+  %tmp21257 = getelementptr inbounds float* %tmp21256, i64 1
+  %tmp21258 = getelementptr inbounds float* %tmp21257, i64 1
+  %tmp21259 = getelementptr inbounds float* %tmp21258, i64 1
+  %tmp21260 = getelementptr inbounds float* %tmp21259, i64 1
+  %tmp21261 = getelementptr inbounds float* %tmp21260, i64 1
+  %tmp21262 = getelementptr inbounds float* %tmp21261, i64 1
+  %tmp21263 = getelementptr inbounds float* %tmp21262, i64 1
+  %tmp21264 = getelementptr inbounds float* %tmp21263, i64 1
+  %tmp21265 = getelementptr inbounds float* %tmp21264, i64 1
+  %tmp21266 = getelementptr inbounds float* %tmp21265, i64 1
+  %tmp21267 = getelementptr inbounds float* %tmp21266, i64 1
+  %tmp21268 = getelementptr inbounds float* %tmp21267, i64 1
+  %tmp21269 = getelementptr inbounds float* %tmp21268, i64 1
+  %tmp21270 = getelementptr inbounds float* %tmp21269, i64 1
+  %tmp21271 = getelementptr inbounds float* %tmp21270, i64 1
+  %tmp21272 = getelementptr inbounds float* %tmp21271, i64 1
+  %tmp21273 = getelementptr inbounds float* %tmp21272, i64 1
+  %tmp21274 = getelementptr inbounds float* %tmp21273, i64 1
+  %tmp21275 = getelementptr inbounds float* %tmp21274, i64 1
+  %tmp21276 = getelementptr inbounds float* %tmp21275, i64 1
+  %tmp21277 = getelementptr inbounds float* %tmp21276, i64 1
+  %tmp21278 = getelementptr inbounds float* %tmp21277, i64 1
+  %tmp21279 = getelementptr inbounds float* %tmp21278, i64 1
+  %tmp21280 = getelementptr inbounds float* %tmp21279, i64 1
+  %tmp21281 = getelementptr inbounds float* %tmp21280, i64 1
+  %tmp21282 = getelementptr inbounds float* %tmp21281, i64 1
+  %tmp21283 = getelementptr inbounds float* %tmp21282, i64 1
+  %tmp21284 = getelementptr inbounds float* %tmp21283, i64 1
+  %tmp21285 = getelementptr inbounds float* %tmp21284, i64 1
+  %tmp21286 = getelementptr inbounds float* %tmp21285, i64 1
+  %tmp21287 = getelementptr inbounds float* %tmp21286, i64 1
+  %tmp21288 = getelementptr inbounds float* %tmp21287, i64 1
+  %tmp21289 = getelementptr inbounds float* %tmp21288, i64 1
+  %tmp21290 = getelementptr inbounds float* %tmp21289, i64 1
+  %tmp21291 = getelementptr inbounds float* %tmp21290, i64 1
+  %tmp21292 = getelementptr inbounds float* %tmp21291, i64 1
+  %tmp21293 = getelementptr inbounds float* %tmp21292, i64 1
+  %tmp21294 = getelementptr inbounds float* %tmp21293, i64 1
+  %tmp21295 = getelementptr inbounds float* %tmp21294, i64 1
+  %tmp21296 = getelementptr inbounds float* %tmp21295, i64 1
+  %tmp21297 = getelementptr inbounds float* %tmp21296, i64 1
+  %tmp21298 = getelementptr inbounds float* %tmp21297, i64 1
+  %tmp21299 = getelementptr inbounds float* %tmp21298, i64 1
+  %tmp21300 = getelementptr inbounds float* %tmp21299, i64 1
+  %tmp21301 = getelementptr inbounds float* %tmp21300, i64 1
+  %tmp21302 = getelementptr inbounds float* %tmp21301, i64 1
+  %tmp21303 = getelementptr inbounds float* %tmp21302, i64 1
+  %tmp21304 = getelementptr inbounds float* %tmp21303, i64 1
+  %tmp21305 = getelementptr inbounds float* %tmp21304, i64 1
+  %tmp21306 = getelementptr inbounds float* %tmp21305, i64 1
+  %tmp21307 = getelementptr inbounds float* %tmp21306, i64 1
+  %tmp21308 = getelementptr inbounds float* %tmp21307, i64 1
+  %tmp21309 = getelementptr inbounds float* %tmp21308, i64 1
+  %tmp21310 = getelementptr inbounds float* %tmp21309, i64 1
+  %tmp21311 = getelementptr inbounds float* %tmp21310, i64 1
+  %tmp21312 = getelementptr inbounds float* %tmp21311, i64 1
+  %tmp21313 = getelementptr inbounds float* %tmp21312, i64 1
+  %tmp21314 = getelementptr inbounds float* %tmp21313, i64 1
+  %tmp21315 = getelementptr inbounds float* %tmp21314, i64 1
+  %tmp21316 = getelementptr inbounds float* %tmp21315, i64 1
+  %tmp21317 = getelementptr inbounds float* %tmp21316, i64 1
+  %tmp21318 = getelementptr inbounds float* %tmp21317, i64 1
+  %tmp21319 = getelementptr inbounds float* %tmp21318, i64 1
+  %tmp21320 = getelementptr inbounds float* %tmp21319, i64 1
+  %tmp21321 = getelementptr inbounds float* %tmp21320, i64 1
+  %tmp21322 = getelementptr inbounds float* %tmp21321, i64 1
+  %tmp21323 = getelementptr inbounds float* %tmp21322, i64 1
+  %tmp21324 = getelementptr inbounds float* %tmp21323, i64 1
+  %tmp21325 = getelementptr inbounds float* %tmp21324, i64 1
+  %tmp21326 = getelementptr inbounds float* %tmp21325, i64 1
+  %tmp21327 = getelementptr inbounds float* %tmp21326, i64 1
+  %tmp21328 = getelementptr inbounds float* %tmp21327, i64 1
+  %tmp21329 = getelementptr inbounds float* %tmp21328, i64 1
+  %tmp21330 = getelementptr inbounds float* %tmp21329, i64 1
+  %tmp21331 = getelementptr inbounds float* %tmp21330, i64 1
+  %tmp21332 = getelementptr inbounds float* %tmp21331, i64 1
+  %tmp21333 = getelementptr inbounds float* %tmp21332, i64 1
+  %tmp21334 = getelementptr inbounds float* %tmp21333, i64 1
+  %tmp21335 = getelementptr inbounds float* %tmp21334, i64 1
+  %tmp21336 = getelementptr inbounds float* %tmp21335, i64 1
+  %tmp21337 = getelementptr inbounds float* %tmp21336, i64 1
+  %tmp21338 = getelementptr inbounds float* %tmp21337, i64 1
+  %tmp21339 = getelementptr inbounds float* %tmp21338, i64 1
+  %tmp21340 = getelementptr inbounds float* %tmp21339, i64 1
+  %tmp21341 = getelementptr inbounds float* %tmp21340, i64 1
+  %tmp21342 = getelementptr inbounds float* %tmp21341, i64 1
+  %tmp21343 = getelementptr inbounds float* %tmp21342, i64 1
+  %tmp21344 = getelementptr inbounds float* %tmp21343, i64 1
+  %tmp21345 = getelementptr inbounds float* %tmp21344, i64 1
+  %tmp21346 = getelementptr inbounds float* %tmp21345, i64 1
+  %tmp21347 = getelementptr inbounds float* %tmp21346, i64 1
+  %tmp21348 = getelementptr inbounds float* %tmp21347, i64 1
+  %tmp21349 = getelementptr inbounds float* %tmp21348, i64 1
+  %tmp21350 = getelementptr inbounds float* %tmp21349, i64 1
+  %tmp21351 = getelementptr inbounds float* %tmp21350, i64 1
+  %tmp21352 = getelementptr inbounds float* %tmp21351, i64 1
+  %tmp21353 = getelementptr inbounds float* %tmp21352, i64 1
+  %tmp21354 = getelementptr inbounds float* %tmp21353, i64 1
+  %tmp21355 = getelementptr inbounds float* %tmp21354, i64 1
+  %tmp21356 = getelementptr inbounds float* %tmp21355, i64 1
+  %tmp21357 = getelementptr inbounds float* %tmp21356, i64 1
+  %tmp21358 = getelementptr inbounds float* %tmp21357, i64 1
+  %tmp21359 = getelementptr inbounds float* %tmp21358, i64 1
+  %tmp21360 = getelementptr inbounds float* %tmp21359, i64 1
+  %tmp21361 = getelementptr inbounds float* %tmp21360, i64 1
+  %tmp21362 = getelementptr inbounds float* %tmp21361, i64 1
+  %tmp21363 = getelementptr inbounds float* %tmp21362, i64 1
+  %tmp21364 = getelementptr inbounds float* %tmp21363, i64 1
+  %tmp21365 = getelementptr inbounds float* %tmp21364, i64 1
+  %tmp21366 = getelementptr inbounds float* %tmp21365, i64 1
+  %tmp21367 = getelementptr inbounds float* %tmp21366, i64 1
+  %tmp21368 = getelementptr inbounds float* %tmp21367, i64 1
+  %tmp21369 = getelementptr inbounds float* %tmp21368, i64 1
+  %tmp21370 = getelementptr inbounds float* %tmp21369, i64 1
+  %tmp21371 = getelementptr inbounds float* %tmp21370, i64 1
+  %tmp21372 = getelementptr inbounds float* %tmp21371, i64 1
+  %tmp21373 = getelementptr inbounds float* %tmp21372, i64 1
+  %tmp21374 = getelementptr inbounds float* %tmp21373, i64 1
+  %tmp21375 = getelementptr inbounds float* %tmp21374, i64 1
+  %tmp21376 = getelementptr inbounds float* %tmp21375, i64 1
+  %tmp21377 = getelementptr inbounds float* %tmp21376, i64 1
+  %tmp21378 = getelementptr inbounds float* %tmp21377, i64 1
+  %tmp21379 = getelementptr inbounds float* %tmp21378, i64 1
+  %tmp21380 = getelementptr inbounds float* %tmp21379, i64 1
+  %tmp21381 = getelementptr inbounds float* %tmp21380, i64 1
+  %tmp21382 = getelementptr inbounds float* %tmp21381, i64 1
+  %tmp21383 = getelementptr inbounds float* %tmp21382, i64 1
+  %tmp21384 = getelementptr inbounds float* %tmp21383, i64 1
+  %tmp21385 = getelementptr inbounds float* %tmp21384, i64 1
+  %tmp21386 = getelementptr inbounds float* %tmp21385, i64 1
+  %tmp21387 = getelementptr inbounds float* %tmp21386, i64 1
+  %tmp21388 = getelementptr inbounds float* %tmp21387, i64 1
+  %tmp21389 = getelementptr inbounds float* %tmp21388, i64 1
+  %tmp21390 = getelementptr inbounds float* %tmp21389, i64 1
+  %tmp21391 = getelementptr inbounds float* %tmp21390, i64 1
+  %tmp21392 = getelementptr inbounds float* %tmp21391, i64 1
+  %tmp21393 = getelementptr inbounds float* %tmp21392, i64 1
+  %tmp21394 = getelementptr inbounds float* %tmp21393, i64 1
+  %tmp21395 = getelementptr inbounds float* %tmp21394, i64 1
+  %tmp21396 = getelementptr inbounds float* %tmp21395, i64 1
+  %tmp21397 = getelementptr inbounds float* %tmp21396, i64 1
+  %tmp21398 = getelementptr inbounds float* %tmp21397, i64 1
+  %tmp21399 = getelementptr inbounds float* %tmp21398, i64 1
+  %tmp21400 = getelementptr inbounds float* %tmp21399, i64 1
+  %tmp21401 = getelementptr inbounds float* %tmp21400, i64 1
+  %tmp21402 = getelementptr inbounds float* %tmp21401, i64 1
+  %tmp21403 = getelementptr inbounds float* %tmp21402, i64 1
+  %tmp21404 = getelementptr inbounds float* %tmp21403, i64 1
+  %tmp21405 = getelementptr inbounds float* %tmp21404, i64 1
+  %tmp21406 = getelementptr inbounds float* %tmp21405, i64 1
+  %tmp21407 = getelementptr inbounds float* %tmp21406, i64 1
+  %tmp21408 = getelementptr inbounds float* %tmp21407, i64 1
+  %tmp21409 = getelementptr inbounds float* %tmp21408, i64 1
+  %tmp21410 = getelementptr inbounds float* %tmp21409, i64 1
+  %tmp21411 = getelementptr inbounds float* %tmp21410, i64 1
+  %tmp21412 = getelementptr inbounds float* %tmp21411, i64 1
+  %tmp21413 = getelementptr inbounds float* %tmp21412, i64 1
+  %tmp21414 = getelementptr inbounds float* %tmp21413, i64 1
+  %tmp21415 = getelementptr inbounds float* %tmp21414, i64 1
+  %tmp21416 = getelementptr inbounds float* %tmp21415, i64 1
+  %tmp21417 = getelementptr inbounds float* %tmp21416, i64 1
+  %tmp21418 = getelementptr inbounds float* %tmp21417, i64 1
+  %tmp21419 = getelementptr inbounds float* %tmp21418, i64 1
+  %tmp21420 = getelementptr inbounds float* %tmp21419, i64 1
+  %tmp21421 = getelementptr inbounds float* %tmp21420, i64 1
+  %tmp21422 = getelementptr inbounds float* %tmp21421, i64 1
+  %tmp21423 = getelementptr inbounds float* %tmp21422, i64 1
+  %tmp21424 = getelementptr inbounds float* %tmp21423, i64 1
+  %tmp21425 = getelementptr inbounds float* %tmp21424, i64 1
+  %tmp21426 = getelementptr inbounds float* %tmp21425, i64 1
+  %tmp21427 = getelementptr inbounds float* %tmp21426, i64 1
+  %tmp21428 = getelementptr inbounds float* %tmp21427, i64 1
+  %tmp21429 = getelementptr inbounds float* %tmp21428, i64 1
+  %tmp21430 = getelementptr inbounds float* %tmp21429, i64 1
+  %tmp21431 = getelementptr inbounds float* %tmp21430, i64 1
+  %tmp21432 = getelementptr inbounds float* %tmp21431, i64 1
+  %tmp21433 = getelementptr inbounds float* %tmp21432, i64 1
+  %tmp21434 = getelementptr inbounds float* %tmp21433, i64 1
+  %tmp21435 = getelementptr inbounds float* %tmp21434, i64 1
+  %tmp21436 = getelementptr inbounds float* %tmp21435, i64 1
+  %tmp21437 = getelementptr inbounds float* %tmp21436, i64 1
+  %tmp21438 = getelementptr inbounds float* %tmp21437, i64 1
+  %tmp21439 = getelementptr inbounds float* %tmp21438, i64 1
+  %tmp21440 = getelementptr inbounds float* %tmp21439, i64 1
+  %tmp21441 = getelementptr inbounds float* %tmp21440, i64 1
+  %tmp21442 = getelementptr inbounds float* %tmp21441, i64 1
+  %tmp21443 = getelementptr inbounds float* %tmp21442, i64 1
+  %tmp21444 = getelementptr inbounds float* %tmp21443, i64 1
+  %tmp21445 = getelementptr inbounds float* %tmp21444, i64 1
+  %tmp21446 = getelementptr inbounds float* %tmp21445, i64 1
+  %tmp21447 = getelementptr inbounds float* %tmp21446, i64 1
+  %tmp21448 = getelementptr inbounds float* %tmp21447, i64 1
+  %tmp21449 = getelementptr inbounds float* %tmp21448, i64 1
+  %tmp21450 = getelementptr inbounds float* %tmp21449, i64 1
+  %tmp21451 = getelementptr inbounds float* %tmp21450, i64 1
+  %tmp21452 = getelementptr inbounds float* %tmp21451, i64 1
+  %tmp21453 = getelementptr inbounds float* %tmp21452, i64 1
+  %tmp21454 = getelementptr inbounds float* %tmp21453, i64 1
+  %tmp21455 = getelementptr inbounds float* %tmp21454, i64 1
+  %tmp21456 = getelementptr inbounds float* %tmp21455, i64 1
+  %tmp21457 = getelementptr inbounds float* %tmp21456, i64 1
+  %tmp21458 = getelementptr inbounds float* %tmp21457, i64 1
+  %tmp21459 = getelementptr inbounds float* %tmp21458, i64 1
+  %tmp21460 = getelementptr inbounds float* %tmp21459, i64 1
+  %tmp21461 = getelementptr inbounds float* %tmp21460, i64 1
+  %tmp21462 = getelementptr inbounds float* %tmp21461, i64 1
+  %tmp21463 = getelementptr inbounds float* %tmp21462, i64 1
+  %tmp21464 = getelementptr inbounds float* %tmp21463, i64 1
+  %tmp21465 = getelementptr inbounds float* %tmp21464, i64 1
+  %tmp21466 = getelementptr inbounds float* %tmp21465, i64 1
+  %tmp21467 = getelementptr inbounds float* %tmp21466, i64 1
+  %tmp21468 = getelementptr inbounds float* %tmp21467, i64 1
+  %tmp21469 = getelementptr inbounds float* %tmp21468, i64 1
+  %tmp21470 = getelementptr inbounds float* %tmp21469, i64 1
+  %tmp21471 = getelementptr inbounds float* %tmp21470, i64 1
+  %tmp21472 = getelementptr inbounds float* %tmp21471, i64 1
+  %tmp21473 = getelementptr inbounds float* %tmp21472, i64 1
+  %tmp21474 = getelementptr inbounds float* %tmp21473, i64 1
+  %tmp21475 = getelementptr inbounds float* %tmp21474, i64 1
+  %tmp21476 = getelementptr inbounds float* %tmp21475, i64 1
+  %tmp21477 = getelementptr inbounds float* %tmp21476, i64 1
+  %tmp21478 = getelementptr inbounds float* %tmp21477, i64 1
+  %tmp21479 = getelementptr inbounds float* %tmp21478, i64 1
+  %tmp21480 = getelementptr inbounds float* %tmp21479, i64 1
+  %tmp21481 = getelementptr inbounds float* %tmp21480, i64 1
+  %tmp21482 = getelementptr inbounds float* %tmp21481, i64 1
+  %tmp21483 = getelementptr inbounds float* %tmp21482, i64 1
+  %tmp21484 = getelementptr inbounds float* %tmp21483, i64 1
+  %tmp21485 = getelementptr inbounds float* %tmp21484, i64 1
+  %tmp21486 = getelementptr inbounds float* %tmp21485, i64 1
+  %tmp21487 = getelementptr inbounds float* %tmp21486, i64 1
+  %tmp21488 = getelementptr inbounds float* %tmp21487, i64 1
+  %tmp21489 = getelementptr inbounds float* %tmp21488, i64 1
+  %tmp21490 = getelementptr inbounds float* %tmp21489, i64 1
+  %tmp21491 = getelementptr inbounds float* %tmp21490, i64 1
+  %tmp21492 = getelementptr inbounds float* %tmp21491, i64 1
+  %tmp21493 = getelementptr inbounds float* %tmp21492, i64 1
+  %tmp21494 = getelementptr inbounds float* %tmp21493, i64 1
+  %tmp21495 = getelementptr inbounds float* %tmp21494, i64 1
+  %tmp21496 = getelementptr inbounds float* %tmp21495, i64 1
+  %tmp21497 = getelementptr inbounds float* %tmp21496, i64 1
+  %tmp21498 = getelementptr inbounds float* %tmp21497, i64 1
+  %tmp21499 = getelementptr inbounds float* %tmp21498, i64 1
+  %tmp21500 = getelementptr inbounds float* %tmp21499, i64 1
+  %tmp21501 = getelementptr inbounds float* %tmp21500, i64 1
+  %tmp21502 = getelementptr inbounds float* %tmp21501, i64 1
+  %tmp21503 = getelementptr inbounds float* %tmp21502, i64 1
+  %tmp21504 = getelementptr inbounds float* %tmp21503, i64 1
+  %tmp21505 = getelementptr inbounds float* %tmp21504, i64 1
+  %tmp21506 = getelementptr inbounds float* %tmp21505, i64 1
+  %tmp21507 = getelementptr inbounds float* %tmp21506, i64 1
+  %tmp21508 = getelementptr inbounds float* %tmp21507, i64 1
+  %tmp21509 = getelementptr inbounds float* %tmp21508, i64 1
+  %tmp21510 = getelementptr inbounds float* %tmp21509, i64 1
+  %tmp21511 = getelementptr inbounds float* %tmp21510, i64 1
+  %tmp21512 = getelementptr inbounds float* %tmp21511, i64 1
+  %tmp21513 = getelementptr inbounds float* %tmp21512, i64 1
+  %tmp21514 = getelementptr inbounds float* %tmp21513, i64 1
+  %tmp21515 = getelementptr inbounds float* %tmp21514, i64 1
+  %tmp21516 = getelementptr inbounds float* %tmp21515, i64 1
+  %tmp21517 = getelementptr inbounds float* %tmp21516, i64 1
+  %tmp21518 = getelementptr inbounds float* %tmp21517, i64 1
+  %tmp21519 = getelementptr inbounds float* %tmp21518, i64 1
+  %tmp21520 = getelementptr inbounds float* %tmp21519, i64 1
+  %tmp21521 = getelementptr inbounds float* %tmp21520, i64 1
+  %tmp21522 = getelementptr inbounds float* %tmp21521, i64 1
+  %tmp21523 = getelementptr inbounds float* %tmp21522, i64 1
+  %tmp21524 = getelementptr inbounds float* %tmp21523, i64 1
+  %tmp21525 = getelementptr inbounds float* %tmp21524, i64 1
+  %tmp21526 = getelementptr inbounds float* %tmp21525, i64 1
+  %tmp21527 = getelementptr inbounds float* %tmp21526, i64 1
+  %tmp21528 = getelementptr inbounds float* %tmp21527, i64 1
+  %tmp21529 = getelementptr inbounds float* %tmp21528, i64 1
+  %tmp21530 = getelementptr inbounds float* %tmp21529, i64 1
+  %tmp21531 = getelementptr inbounds float* %tmp21530, i64 1
+  %tmp21532 = getelementptr inbounds float* %tmp21531, i64 1
+  %tmp21533 = getelementptr inbounds float* %tmp21532, i64 1
+  %tmp21534 = getelementptr inbounds float* %tmp21533, i64 1
+  %tmp21535 = getelementptr inbounds float* %tmp21534, i64 1
+  %tmp21536 = getelementptr inbounds float* %tmp21535, i64 1
+  %tmp21537 = getelementptr inbounds float* %tmp21536, i64 1
+  %tmp21538 = getelementptr inbounds float* %tmp21537, i64 1
+  %tmp21539 = getelementptr inbounds float* %tmp21538, i64 1
+  %tmp21540 = getelementptr inbounds float* %tmp21539, i64 1
+  %tmp21541 = getelementptr inbounds float* %tmp21540, i64 1
+  %tmp21542 = getelementptr inbounds float* %tmp21541, i64 1
+  %tmp21543 = getelementptr inbounds float* %tmp21542, i64 1
+  %tmp21544 = getelementptr inbounds float* %tmp21543, i64 1
+  %tmp21545 = getelementptr inbounds float* %tmp21544, i64 1
+  %tmp21546 = getelementptr inbounds float* %tmp21545, i64 1
+  %tmp21547 = getelementptr inbounds float* %tmp21546, i64 1
+  %tmp21548 = getelementptr inbounds float* %tmp21547, i64 1
+  %tmp21549 = getelementptr inbounds float* %tmp21548, i64 1
+  %tmp21550 = getelementptr inbounds float* %tmp21549, i64 1
+  %tmp21551 = getelementptr inbounds float* %tmp21550, i64 1
+  %tmp21552 = getelementptr inbounds float* %tmp21551, i64 1
+  %tmp21553 = getelementptr inbounds float* %tmp21552, i64 1
+  %tmp21554 = getelementptr inbounds float* %tmp21553, i64 1
+  %tmp21555 = getelementptr inbounds float* %tmp21554, i64 1
+  %tmp21556 = getelementptr inbounds float* %tmp21555, i64 1
+  %tmp21557 = getelementptr inbounds float* %tmp21556, i64 1
+  %tmp21558 = getelementptr inbounds float* %tmp21557, i64 1
+  %tmp21559 = getelementptr inbounds float* %tmp21558, i64 1
+  %tmp21560 = getelementptr inbounds float* %tmp21559, i64 1
+  %tmp21561 = getelementptr inbounds float* %tmp21560, i64 1
+  %tmp21562 = getelementptr inbounds float* %tmp21561, i64 1
+  %tmp21563 = getelementptr inbounds float* %tmp21562, i64 1
+  %tmp21564 = getelementptr inbounds float* %tmp21563, i64 1
+  %tmp21565 = getelementptr inbounds float* %tmp21564, i64 1
+  %tmp21566 = getelementptr inbounds float* %tmp21565, i64 1
+  %tmp21567 = getelementptr inbounds float* %tmp21566, i64 1
+  %tmp21568 = getelementptr inbounds float* %tmp21567, i64 1
+  %tmp21569 = getelementptr inbounds float* %tmp21568, i64 1
+  %tmp21570 = getelementptr inbounds float* %tmp21569, i64 1
+  %tmp21571 = getelementptr inbounds float* %tmp21570, i64 1
+  %tmp21572 = getelementptr inbounds float* %tmp21571, i64 1
+  %tmp21573 = getelementptr inbounds float* %tmp21572, i64 1
+  %tmp21574 = getelementptr inbounds float* %tmp21573, i64 1
+  %tmp21575 = getelementptr inbounds float* %tmp21574, i64 1
+  %tmp21576 = getelementptr inbounds float* %tmp21575, i64 1
+  %tmp21577 = getelementptr inbounds float* %tmp21576, i64 1
+  %tmp21578 = getelementptr inbounds float* %tmp21577, i64 1
+  %tmp21579 = getelementptr inbounds float* %tmp21578, i64 1
+  %tmp21580 = getelementptr inbounds float* %tmp21579, i64 1
+  %tmp21581 = getelementptr inbounds float* %tmp21580, i64 1
+  %tmp21582 = getelementptr inbounds float* %tmp21581, i64 1
+  %tmp21583 = getelementptr inbounds float* %tmp21582, i64 1
+  %tmp21584 = getelementptr inbounds float* %tmp21583, i64 1
+  %tmp21585 = getelementptr inbounds float* %tmp21584, i64 1
+  %tmp21586 = getelementptr inbounds float* %tmp21585, i64 1
+  %tmp21587 = getelementptr inbounds float* %tmp21586, i64 1
+  %tmp21588 = getelementptr inbounds float* %tmp21587, i64 1
+  %tmp21589 = getelementptr inbounds float* %tmp21588, i64 1
+  %tmp21590 = getelementptr inbounds float* %tmp21589, i64 1
+  %tmp21591 = getelementptr inbounds float* %tmp21590, i64 1
+  %tmp21592 = getelementptr inbounds float* %tmp21591, i64 1
+  %tmp21593 = getelementptr inbounds float* %tmp21592, i64 1
+  %tmp21594 = getelementptr inbounds float* %tmp21593, i64 1
+  %tmp21595 = getelementptr inbounds float* %tmp21594, i64 1
+  %tmp21596 = getelementptr inbounds float* %tmp21595, i64 1
+  %tmp21597 = getelementptr inbounds float* %tmp21596, i64 1
+  %tmp21598 = getelementptr inbounds float* %tmp21597, i64 1
+  %tmp21599 = getelementptr inbounds float* %tmp21598, i64 1
+  %tmp21600 = getelementptr inbounds float* %tmp21599, i64 1
+  %tmp21601 = getelementptr inbounds float* %tmp21600, i64 1
+  %tmp21602 = getelementptr inbounds float* %tmp21601, i64 1
+  %tmp21603 = getelementptr inbounds float* %tmp21602, i64 1
+  %tmp21604 = getelementptr inbounds float* %tmp21603, i64 1
+  %tmp21605 = getelementptr inbounds float* %tmp21604, i64 1
+  %tmp21606 = getelementptr inbounds float* %tmp21605, i64 1
+  %tmp21607 = getelementptr inbounds float* %tmp21606, i64 1
+  %tmp21608 = getelementptr inbounds float* %tmp21607, i64 1
+  %tmp21609 = getelementptr inbounds float* %tmp21608, i64 1
+  %tmp21610 = getelementptr inbounds float* %tmp21609, i64 1
+  %tmp21611 = getelementptr inbounds float* %tmp21610, i64 1
+  %tmp21612 = getelementptr inbounds float* %tmp21611, i64 1
+  %tmp21613 = getelementptr inbounds float* %tmp21612, i64 1
+  %tmp21614 = getelementptr inbounds float* %tmp21613, i64 1
+  %tmp21615 = getelementptr inbounds float* %tmp21614, i64 1
+  %tmp21616 = getelementptr inbounds float* %tmp21615, i64 1
+  %tmp21617 = getelementptr inbounds float* %tmp21616, i64 1
+  %tmp21618 = getelementptr inbounds float* %tmp21617, i64 1
+  %tmp21619 = getelementptr inbounds float* %tmp21618, i64 1
+  %tmp21620 = getelementptr inbounds float* %tmp21619, i64 1
+  %tmp21621 = getelementptr inbounds float* %tmp21620, i64 1
+  %tmp21622 = getelementptr inbounds float* %tmp21621, i64 1
+  %tmp21623 = getelementptr inbounds float* %tmp21622, i64 1
+  %tmp21624 = getelementptr inbounds float* %tmp21623, i64 1
+  %tmp21625 = getelementptr inbounds float* %tmp21624, i64 1
+  %tmp21626 = getelementptr inbounds float* %tmp21625, i64 1
+  %tmp21627 = getelementptr inbounds float* %tmp21626, i64 1
+  %tmp21628 = getelementptr inbounds float* %tmp21627, i64 1
+  %tmp21629 = getelementptr inbounds float* %tmp21628, i64 1
+  %tmp21630 = getelementptr inbounds float* %tmp21629, i64 1
+  %tmp21631 = getelementptr inbounds float* %tmp21630, i64 1
+  %tmp21632 = getelementptr inbounds float* %tmp21631, i64 1
+  %tmp21633 = getelementptr inbounds float* %tmp21632, i64 1
+  %tmp21634 = getelementptr inbounds float* %tmp21633, i64 1
+  %tmp21635 = getelementptr inbounds float* %tmp21634, i64 1
+  %tmp21636 = getelementptr inbounds float* %tmp21635, i64 1
+  %tmp21637 = getelementptr inbounds float* %tmp21636, i64 1
+  %tmp21638 = getelementptr inbounds float* %tmp21637, i64 1
+  %tmp21639 = getelementptr inbounds float* %tmp21638, i64 1
+  %tmp21640 = getelementptr inbounds float* %tmp21639, i64 1
+  %tmp21641 = getelementptr inbounds float* %tmp21640, i64 1
+  %tmp21642 = getelementptr inbounds float* %tmp21641, i64 1
+  %tmp21643 = getelementptr inbounds float* %tmp21642, i64 1
+  %tmp21644 = getelementptr inbounds float* %tmp21643, i64 1
+  %tmp21645 = getelementptr inbounds float* %tmp21644, i64 1
+  %tmp21646 = getelementptr inbounds float* %tmp21645, i64 1
+  %tmp21647 = getelementptr inbounds float* %tmp21646, i64 1
+  %tmp21648 = getelementptr inbounds float* %tmp21647, i64 1
+  %tmp21649 = getelementptr inbounds float* %tmp21648, i64 1
+  %tmp21650 = getelementptr inbounds float* %tmp21649, i64 1
+  %tmp21651 = getelementptr inbounds float* %tmp21650, i64 1
+  %tmp21652 = getelementptr inbounds float* %tmp21651, i64 1
+  %tmp21653 = getelementptr inbounds float* %tmp21652, i64 1
+  %tmp21654 = getelementptr inbounds float* %tmp21653, i64 1
+  %tmp21655 = getelementptr inbounds float* %tmp21654, i64 1
+  %tmp21656 = getelementptr inbounds float* %tmp21655, i64 1
+  %tmp21657 = getelementptr inbounds float* %tmp21656, i64 1
+  %tmp21658 = getelementptr inbounds float* %tmp21657, i64 1
+  %tmp21659 = getelementptr inbounds float* %tmp21658, i64 1
+  %tmp21660 = getelementptr inbounds float* %tmp21659, i64 1
+  %tmp21661 = getelementptr inbounds float* %tmp21660, i64 1
+  %tmp21662 = getelementptr inbounds float* %tmp21661, i64 1
+  %tmp21663 = getelementptr inbounds float* %tmp21662, i64 1
+  %tmp21664 = getelementptr inbounds float* %tmp21663, i64 1
+  %tmp21665 = getelementptr inbounds float* %tmp21664, i64 1
+  %tmp21666 = getelementptr inbounds float* %tmp21665, i64 1
+  %tmp21667 = getelementptr inbounds float* %tmp21666, i64 1
+  %tmp21668 = getelementptr inbounds float* %tmp21667, i64 1
+  %tmp21669 = getelementptr inbounds float* %tmp21668, i64 1
+  %tmp21670 = getelementptr inbounds float* %tmp21669, i64 1
+  %tmp21671 = getelementptr inbounds float* %tmp21670, i64 1
+  %tmp21672 = getelementptr inbounds float* %tmp21671, i64 1
+  %tmp21673 = getelementptr inbounds float* %tmp21672, i64 1
+  %tmp21674 = getelementptr inbounds float* %tmp21673, i64 1
+  %tmp21675 = getelementptr inbounds float* %tmp21674, i64 1
+  %tmp21676 = getelementptr inbounds float* %tmp21675, i64 1
+  %tmp21677 = getelementptr inbounds float* %tmp21676, i64 1
+  %tmp21678 = getelementptr inbounds float* %tmp21677, i64 1
+  %tmp21679 = getelementptr inbounds float* %tmp21678, i64 1
+  %tmp21680 = getelementptr inbounds float* %tmp21679, i64 1
+  %tmp21681 = getelementptr inbounds float* %tmp21680, i64 1
+  %tmp21682 = getelementptr inbounds float* %tmp21681, i64 1
+  %tmp21683 = getelementptr inbounds float* %tmp21682, i64 1
+  %tmp21684 = getelementptr inbounds float* %tmp21683, i64 1
+  %tmp21685 = getelementptr inbounds float* %tmp21684, i64 1
+  %tmp21686 = getelementptr inbounds float* %tmp21685, i64 1
+  %tmp21687 = getelementptr inbounds float* %tmp21686, i64 1
+  %tmp21688 = getelementptr inbounds float* %tmp21687, i64 1
+  %tmp21689 = getelementptr inbounds float* %tmp21688, i64 1
+  %tmp21690 = getelementptr inbounds float* %tmp21689, i64 1
+  %tmp21691 = getelementptr inbounds float* %tmp21690, i64 1
+  %tmp21692 = getelementptr inbounds float* %tmp21691, i64 1
+  %tmp21693 = getelementptr inbounds float* %tmp21692, i64 1
+  %tmp21694 = getelementptr inbounds float* %tmp21693, i64 1
+  %tmp21695 = getelementptr inbounds float* %tmp21694, i64 1
+  %tmp21696 = getelementptr inbounds float* %tmp21695, i64 1
+  %tmp21697 = getelementptr inbounds float* %tmp21696, i64 1
+  %tmp21698 = getelementptr inbounds float* %tmp21697, i64 1
+  %tmp21699 = getelementptr inbounds float* %tmp21698, i64 1
+  %tmp21700 = getelementptr inbounds float* %tmp21699, i64 1
+  %tmp21701 = getelementptr inbounds float* %tmp21700, i64 1
+  %tmp21702 = getelementptr inbounds float* %tmp21701, i64 1
+  %tmp21703 = getelementptr inbounds float* %tmp21702, i64 1
+  %tmp21704 = getelementptr inbounds float* %tmp21703, i64 1
+  %tmp21705 = getelementptr inbounds float* %tmp21704, i64 1
+  %tmp21706 = getelementptr inbounds float* %tmp21705, i64 1
+  %tmp21707 = getelementptr inbounds float* %tmp21706, i64 1
+  %tmp21708 = getelementptr inbounds float* %tmp21707, i64 1
+  %tmp21709 = getelementptr inbounds float* %tmp21708, i64 1
+  %tmp21710 = getelementptr inbounds float* %tmp21709, i64 1
+  %tmp21711 = getelementptr inbounds float* %tmp21710, i64 1
+  %tmp21712 = getelementptr inbounds float* %tmp21711, i64 1
+  %tmp21713 = getelementptr inbounds float* %tmp21712, i64 1
+  %tmp21714 = getelementptr inbounds float* %tmp21713, i64 1
+  %tmp21715 = getelementptr inbounds float* %tmp21714, i64 1
+  %tmp21716 = getelementptr inbounds float* %tmp21715, i64 1
+  %tmp21717 = getelementptr inbounds float* %tmp21716, i64 1
+  %tmp21718 = getelementptr inbounds float* %tmp21717, i64 1
+  %tmp21719 = getelementptr inbounds float* %tmp21718, i64 1
+  %tmp21720 = getelementptr inbounds float* %tmp21719, i64 1
+  %tmp21721 = getelementptr inbounds float* %tmp21720, i64 1
+  %tmp21722 = getelementptr inbounds float* %tmp21721, i64 1
+  %tmp21723 = getelementptr inbounds float* %tmp21722, i64 1
+  %tmp21724 = getelementptr inbounds float* %tmp21723, i64 1
+  %tmp21725 = getelementptr inbounds float* %tmp21724, i64 1
+  %tmp21726 = getelementptr inbounds float* %tmp21725, i64 1
+  %tmp21727 = getelementptr inbounds float* %tmp21726, i64 1
+  %tmp21728 = getelementptr inbounds float* %tmp21727, i64 1
+  %tmp21729 = getelementptr inbounds float* %tmp21728, i64 1
+  %tmp21730 = getelementptr inbounds float* %tmp21729, i64 1
+  %tmp21731 = getelementptr inbounds float* %tmp21730, i64 1
+  %tmp21732 = getelementptr inbounds float* %tmp21731, i64 1
+  %tmp21733 = getelementptr inbounds float* %tmp21732, i64 1
+  %tmp21734 = getelementptr inbounds float* %tmp21733, i64 1
+  %tmp21735 = getelementptr inbounds float* %tmp21734, i64 1
+  %tmp21736 = getelementptr inbounds float* %tmp21735, i64 1
+  %tmp21737 = getelementptr inbounds float* %tmp21736, i64 1
+  %tmp21738 = getelementptr inbounds float* %tmp21737, i64 1
+  %tmp21739 = getelementptr inbounds float* %tmp21738, i64 1
+  %tmp21740 = getelementptr inbounds float* %tmp21739, i64 1
+  %tmp21741 = getelementptr inbounds float* %tmp21740, i64 1
+  %tmp21742 = getelementptr inbounds float* %tmp21741, i64 1
+  %tmp21743 = getelementptr inbounds float* %tmp21742, i64 1
+  %tmp21744 = getelementptr inbounds float* %tmp21743, i64 1
+  %tmp21745 = getelementptr inbounds float* %tmp21744, i64 1
+  %tmp21746 = getelementptr inbounds float* %tmp21745, i64 1
+  %tmp21747 = getelementptr inbounds float* %tmp21746, i64 1
+  %tmp21748 = getelementptr inbounds float* %tmp21747, i64 1
+  %tmp21749 = getelementptr inbounds float* %tmp21748, i64 1
+  %tmp21750 = getelementptr inbounds float* %tmp21749, i64 1
+  %tmp21751 = getelementptr inbounds float* %tmp21750, i64 1
+  %tmp21752 = getelementptr inbounds float* %tmp21751, i64 1
+  %tmp21753 = getelementptr inbounds float* %tmp21752, i64 1
+  %tmp21754 = getelementptr inbounds float* %tmp21753, i64 1
+  %tmp21755 = getelementptr inbounds float* %tmp21754, i64 1
+  %tmp21756 = getelementptr inbounds float* %tmp21755, i64 1
+  %tmp21757 = getelementptr inbounds float* %tmp21756, i64 1
+  %tmp21758 = getelementptr inbounds float* %tmp21757, i64 1
+  %tmp21759 = getelementptr inbounds float* %tmp21758, i64 1
+  %tmp21760 = getelementptr inbounds float* %tmp21759, i64 1
+  %tmp21761 = getelementptr inbounds float* %tmp21760, i64 1
+  %tmp21762 = getelementptr inbounds float* %tmp21761, i64 1
+  %tmp21763 = getelementptr inbounds float* %tmp21762, i64 1
+  %tmp21764 = getelementptr inbounds float* %tmp21763, i64 1
+  %tmp21765 = getelementptr inbounds float* %tmp21764, i64 1
+  %tmp21766 = getelementptr inbounds float* %tmp21765, i64 1
+  %tmp21767 = getelementptr inbounds float* %tmp21766, i64 1
+  %tmp21768 = getelementptr inbounds float* %tmp21767, i64 1
+  %tmp21769 = getelementptr inbounds float* %tmp21768, i64 1
+  %tmp21770 = getelementptr inbounds float* %tmp21769, i64 1
+  %tmp21771 = getelementptr inbounds float* %tmp21770, i64 1
+  %tmp21772 = getelementptr inbounds float* %tmp21771, i64 1
+  %tmp21773 = getelementptr inbounds float* %tmp21772, i64 1
+  %tmp21774 = getelementptr inbounds float* %tmp21773, i64 1
+  %tmp21775 = getelementptr inbounds float* %tmp21774, i64 1
+  %tmp21776 = getelementptr inbounds float* %tmp21775, i64 1
+  %tmp21777 = getelementptr inbounds float* %tmp21776, i64 1
+  %tmp21778 = getelementptr inbounds float* %tmp21777, i64 1
+  %tmp21779 = getelementptr inbounds float* %tmp21778, i64 1
+  %tmp21780 = getelementptr inbounds float* %tmp21779, i64 1
+  %tmp21781 = getelementptr inbounds float* %tmp21780, i64 1
+  %tmp21782 = getelementptr inbounds float* %tmp21781, i64 1
+  %tmp21783 = getelementptr inbounds float* %tmp21782, i64 1
+  %tmp21784 = getelementptr inbounds float* %tmp21783, i64 1
+  %tmp21785 = getelementptr inbounds float* %tmp21784, i64 1
+  %tmp21786 = getelementptr inbounds float* %tmp21785, i64 1
+  %tmp21787 = getelementptr inbounds float* %tmp21786, i64 1
+  %tmp21788 = getelementptr inbounds float* %tmp21787, i64 1
+  %tmp21789 = getelementptr inbounds float* %tmp21788, i64 1
+  %tmp21790 = getelementptr inbounds float* %tmp21789, i64 1
+  %tmp21791 = getelementptr inbounds float* %tmp21790, i64 1
+  %tmp21792 = getelementptr inbounds float* %tmp21791, i64 1
+  %tmp21793 = getelementptr inbounds float* %tmp21792, i64 1
+  %tmp21794 = getelementptr inbounds float* %tmp21793, i64 1
+  %tmp21795 = getelementptr inbounds float* %tmp21794, i64 1
+  %tmp21796 = getelementptr inbounds float* %tmp21795, i64 1
+  %tmp21797 = getelementptr inbounds float* %tmp21796, i64 1
+  %tmp21798 = getelementptr inbounds float* %tmp21797, i64 1
+  %tmp21799 = getelementptr inbounds float* %tmp21798, i64 1
+  %tmp21800 = getelementptr inbounds float* %tmp21799, i64 1
+  %tmp21801 = getelementptr inbounds float* %tmp21800, i64 1
+  %tmp21802 = getelementptr inbounds float* %tmp21801, i64 1
+  %tmp21803 = getelementptr inbounds float* %tmp21802, i64 1
+  %tmp21804 = getelementptr inbounds float* %tmp21803, i64 1
+  %tmp21805 = getelementptr inbounds float* %tmp21804, i64 1
+  %tmp21806 = getelementptr inbounds float* %tmp21805, i64 1
+  %tmp21807 = getelementptr inbounds float* %tmp21806, i64 1
+  %tmp21808 = getelementptr inbounds float* %tmp21807, i64 1
+  %tmp21809 = getelementptr inbounds float* %tmp21808, i64 1
+  %tmp21810 = getelementptr inbounds float* %tmp21809, i64 1
+  %tmp21811 = getelementptr inbounds float* %tmp21810, i64 1
+  %tmp21812 = getelementptr inbounds float* %tmp21811, i64 1
+  %tmp21813 = getelementptr inbounds float* %tmp21812, i64 1
+  %tmp21814 = getelementptr inbounds float* %tmp21813, i64 1
+  %tmp21815 = getelementptr inbounds float* %tmp21814, i64 1
+  %tmp21816 = getelementptr inbounds float* %tmp21815, i64 1
+  %tmp21817 = getelementptr inbounds float* %tmp21816, i64 1
+  %tmp21818 = getelementptr inbounds float* %tmp21817, i64 1
+  %tmp21819 = getelementptr inbounds float* %tmp21818, i64 1
+  %tmp21820 = getelementptr inbounds float* %tmp21819, i64 1
+  %tmp21821 = getelementptr inbounds float* %tmp21820, i64 1
+  %tmp21822 = getelementptr inbounds float* %tmp21821, i64 1
+  %tmp21823 = getelementptr inbounds float* %tmp21822, i64 1
+  %tmp21824 = getelementptr inbounds float* %tmp21823, i64 1
+  %tmp21825 = getelementptr inbounds float* %tmp21824, i64 1
+  %tmp21826 = getelementptr inbounds float* %tmp21825, i64 1
+  %tmp21827 = getelementptr inbounds float* %tmp21826, i64 1
+  %tmp21828 = getelementptr inbounds float* %tmp21827, i64 1
+  %tmp21829 = getelementptr inbounds float* %tmp21828, i64 1
+  %tmp21830 = getelementptr inbounds float* %tmp21829, i64 1
+  %tmp21831 = getelementptr inbounds float* %tmp21830, i64 1
+  %tmp21832 = getelementptr inbounds float* %tmp21831, i64 1
+  %tmp21833 = getelementptr inbounds float* %tmp21832, i64 1
+  %tmp21834 = getelementptr inbounds float* %tmp21833, i64 1
+  %tmp21835 = getelementptr inbounds float* %tmp21834, i64 1
+  %tmp21836 = getelementptr inbounds float* %tmp21835, i64 1
+  %tmp21837 = getelementptr inbounds float* %tmp21836, i64 1
+  %tmp21838 = getelementptr inbounds float* %tmp21837, i64 1
+  %tmp21839 = getelementptr inbounds float* %tmp21838, i64 1
+  %tmp21840 = getelementptr inbounds float* %tmp21839, i64 1
+  %tmp21841 = getelementptr inbounds float* %tmp21840, i64 1
+  %tmp21842 = getelementptr inbounds float* %tmp21841, i64 1
+  %tmp21843 = getelementptr inbounds float* %tmp21842, i64 1
+  %tmp21844 = getelementptr inbounds float* %tmp21843, i64 1
+  %tmp21845 = getelementptr inbounds float* %tmp21844, i64 1
+  %tmp21846 = getelementptr inbounds float* %tmp21845, i64 1
+  %tmp21847 = getelementptr inbounds float* %tmp21846, i64 1
+  %tmp21848 = getelementptr inbounds float* %tmp21847, i64 1
+  %tmp21849 = getelementptr inbounds float* %tmp21848, i64 1
+  %tmp21850 = getelementptr inbounds float* %tmp21849, i64 1
+  %tmp21851 = getelementptr inbounds float* %tmp21850, i64 1
+  %tmp21852 = getelementptr inbounds float* %tmp21851, i64 1
+  %tmp21853 = getelementptr inbounds float* %tmp21852, i64 1
+  %tmp21854 = getelementptr inbounds float* %tmp21853, i64 1
+  %tmp21855 = getelementptr inbounds float* %tmp21854, i64 1
+  %tmp21856 = getelementptr inbounds float* %tmp21855, i64 1
+  %tmp21857 = getelementptr inbounds float* %tmp21856, i64 1
+  %tmp21858 = getelementptr inbounds float* %tmp21857, i64 1
+  %tmp21859 = getelementptr inbounds float* %tmp21858, i64 1
+  %tmp21860 = getelementptr inbounds float* %tmp21859, i64 1
+  %tmp21861 = getelementptr inbounds float* %tmp21860, i64 1
+  %tmp21862 = getelementptr inbounds float* %tmp21861, i64 1
+  %tmp21863 = getelementptr inbounds float* %tmp21862, i64 1
+  %tmp21864 = getelementptr inbounds float* %tmp21863, i64 1
+  %tmp21865 = getelementptr inbounds float* %tmp21864, i64 1
+  %tmp21866 = getelementptr inbounds float* %tmp21865, i64 1
+  %tmp21867 = getelementptr inbounds float* %tmp21866, i64 1
+  %tmp21868 = getelementptr inbounds float* %tmp21867, i64 1
+  %tmp21869 = getelementptr inbounds float* %tmp21868, i64 1
+  %tmp21870 = getelementptr inbounds float* %tmp21869, i64 1
+  %tmp21871 = getelementptr inbounds float* %tmp21870, i64 1
+  %tmp21872 = getelementptr inbounds float* %tmp21871, i64 1
+  %tmp21873 = getelementptr inbounds float* %tmp21872, i64 1
+  %tmp21874 = getelementptr inbounds float* %tmp21873, i64 1
+  %tmp21875 = getelementptr inbounds float* %tmp21874, i64 1
+  %tmp21876 = getelementptr inbounds float* %tmp21875, i64 1
+  %tmp21877 = getelementptr inbounds float* %tmp21876, i64 1
+  %tmp21878 = getelementptr inbounds float* %tmp21877, i64 1
+  %tmp21879 = getelementptr inbounds float* %tmp21878, i64 1
+  %tmp21880 = getelementptr inbounds float* %tmp21879, i64 1
+  %tmp21881 = getelementptr inbounds float* %tmp21880, i64 1
+  %tmp21882 = getelementptr inbounds float* %tmp21881, i64 1
+  %tmp21883 = getelementptr inbounds float* %tmp21882, i64 1
+  %tmp21884 = getelementptr inbounds float* %tmp21883, i64 1
+  %tmp21885 = getelementptr inbounds float* %tmp21884, i64 1
+  %tmp21886 = getelementptr inbounds float* %tmp21885, i64 1
+  %tmp21887 = getelementptr inbounds float* %tmp21886, i64 1
+  %tmp21888 = getelementptr inbounds float* %tmp21887, i64 1
+  %tmp21889 = getelementptr inbounds float* %tmp21888, i64 1
+  %tmp21890 = getelementptr inbounds float* %tmp21889, i64 1
+  %tmp21891 = getelementptr inbounds float* %tmp21890, i64 1
+  %tmp21892 = getelementptr inbounds float* %tmp21891, i64 1
+  %tmp21893 = getelementptr inbounds float* %tmp21892, i64 1
+  %tmp21894 = getelementptr inbounds float* %tmp21893, i64 1
+  %tmp21895 = getelementptr inbounds float* %tmp21894, i64 1
+  %tmp21896 = getelementptr inbounds float* %tmp21895, i64 1
+  %tmp21897 = getelementptr inbounds float* %tmp21896, i64 1
+  %tmp21898 = getelementptr inbounds float* %tmp21897, i64 1
+  %tmp21899 = getelementptr inbounds float* %tmp21898, i64 1
+  %tmp21900 = getelementptr inbounds float* %tmp21899, i64 1
+  %tmp21901 = getelementptr inbounds float* %tmp21900, i64 1
+  %tmp21902 = getelementptr inbounds float* %tmp21901, i64 1
+  %tmp21903 = getelementptr inbounds float* %tmp21902, i64 1
+  %tmp21904 = getelementptr inbounds float* %tmp21903, i64 1
+  %tmp21905 = getelementptr inbounds float* %tmp21904, i64 1
+  %tmp21906 = getelementptr inbounds float* %tmp21905, i64 1
+  %tmp21907 = getelementptr inbounds float* %tmp21906, i64 1
+  %tmp21908 = getelementptr inbounds float* %tmp21907, i64 1
+  %tmp21909 = getelementptr inbounds float* %tmp21908, i64 1
+  %tmp21910 = getelementptr inbounds float* %tmp21909, i64 1
+  %tmp21911 = getelementptr inbounds float* %tmp21910, i64 1
+  %tmp21912 = getelementptr inbounds float* %tmp21911, i64 1
+  %tmp21913 = getelementptr inbounds float* %tmp21912, i64 1
+  %tmp21914 = getelementptr inbounds float* %tmp21913, i64 1
+  %tmp21915 = getelementptr inbounds float* %tmp21914, i64 1
+  %tmp21916 = getelementptr inbounds float* %tmp21915, i64 1
+  %tmp21917 = getelementptr inbounds float* %tmp21916, i64 1
+  %tmp21918 = getelementptr inbounds float* %tmp21917, i64 1
+  %tmp21919 = getelementptr inbounds float* %tmp21918, i64 1
+  %tmp21920 = getelementptr inbounds float* %tmp21919, i64 1
+  %tmp21921 = getelementptr inbounds float* %tmp21920, i64 1
+  %tmp21922 = getelementptr inbounds float* %tmp21921, i64 1
+  %tmp21923 = getelementptr inbounds float* %tmp21922, i64 1
+  %tmp21924 = getelementptr inbounds float* %tmp21923, i64 1
+  %tmp21925 = getelementptr inbounds float* %tmp21924, i64 1
+  %tmp21926 = getelementptr inbounds float* %tmp21925, i64 1
+  %tmp21927 = getelementptr inbounds float* %tmp21926, i64 1
+  %tmp21928 = getelementptr inbounds float* %tmp21927, i64 1
+  %tmp21929 = getelementptr inbounds float* %tmp21928, i64 1
+  %tmp21930 = getelementptr inbounds float* %tmp21929, i64 1
+  %tmp21931 = getelementptr inbounds float* %tmp21930, i64 1
+  %tmp21932 = getelementptr inbounds float* %tmp21931, i64 1
+  %tmp21933 = getelementptr inbounds float* %tmp21932, i64 1
+  %tmp21934 = getelementptr inbounds float* %tmp21933, i64 1
+  %tmp21935 = getelementptr inbounds float* %tmp21934, i64 1
+  %tmp21936 = getelementptr inbounds float* %tmp21935, i64 1
+  %tmp21937 = getelementptr inbounds float* %tmp21936, i64 1
+  %tmp21938 = getelementptr inbounds float* %tmp21937, i64 1
+  %tmp21939 = getelementptr inbounds float* %tmp21938, i64 1
+  %tmp21940 = getelementptr inbounds float* %tmp21939, i64 1
+  %tmp21941 = getelementptr inbounds float* %tmp21940, i64 1
+  %tmp21942 = getelementptr inbounds float* %tmp21941, i64 1
+  %tmp21943 = getelementptr inbounds float* %tmp21942, i64 1
+  %tmp21944 = getelementptr inbounds float* %tmp21943, i64 1
+  %tmp21945 = getelementptr inbounds float* %tmp21944, i64 1
+  %tmp21946 = getelementptr inbounds float* %tmp21945, i64 1
+  %tmp21947 = getelementptr inbounds float* %tmp21946, i64 1
+  %tmp21948 = getelementptr inbounds float* %tmp21947, i64 1
+  %tmp21949 = getelementptr inbounds float* %tmp21948, i64 1
+  %tmp21950 = getelementptr inbounds float* %tmp21949, i64 1
+  %tmp21951 = getelementptr inbounds float* %tmp21950, i64 1
+  %tmp21952 = getelementptr inbounds float* %tmp21951, i64 1
+  %tmp21953 = getelementptr inbounds float* %tmp21952, i64 1
+  %tmp21954 = getelementptr inbounds float* %tmp21953, i64 1
+  %tmp21955 = getelementptr inbounds float* %tmp21954, i64 1
+  %tmp21956 = getelementptr inbounds float* %tmp21955, i64 1
+  %tmp21957 = getelementptr inbounds float* %tmp21956, i64 1
+  %tmp21958 = getelementptr inbounds float* %tmp21957, i64 1
+  %tmp21959 = getelementptr inbounds float* %tmp21958, i64 1
+  %tmp21960 = getelementptr inbounds float* %tmp21959, i64 1
+  %tmp21961 = getelementptr inbounds float* %tmp21960, i64 1
+  %tmp21962 = getelementptr inbounds float* %tmp21961, i64 1
+  %tmp21963 = getelementptr inbounds float* %tmp21962, i64 1
+  %tmp21964 = getelementptr inbounds float* %tmp21963, i64 1
+  %tmp21965 = getelementptr inbounds float* %tmp21964, i64 1
+  %tmp21966 = getelementptr inbounds float* %tmp21965, i64 1
+  %tmp21967 = getelementptr inbounds float* %tmp21966, i64 1
+  %tmp21968 = getelementptr inbounds float* %tmp21967, i64 1
+  %tmp21969 = getelementptr inbounds float* %tmp21968, i64 1
+  %tmp21970 = getelementptr inbounds float* %tmp21969, i64 1
+  %tmp21971 = getelementptr inbounds float* %tmp21970, i64 1
+  %tmp21972 = getelementptr inbounds float* %tmp21971, i64 1
+  %tmp21973 = getelementptr inbounds float* %tmp21972, i64 1
+  %tmp21974 = getelementptr inbounds float* %tmp21973, i64 1
+  %tmp21975 = getelementptr inbounds float* %tmp21974, i64 1
+  %tmp21976 = getelementptr inbounds float* %tmp21975, i64 1
+  %tmp21977 = getelementptr inbounds float* %tmp21976, i64 1
+  %tmp21978 = getelementptr inbounds float* %tmp21977, i64 1
+  %tmp21979 = getelementptr inbounds float* %tmp21978, i64 1
+  %tmp21980 = getelementptr inbounds float* %tmp21979, i64 1
+  %tmp21981 = getelementptr inbounds float* %tmp21980, i64 1
+  %tmp21982 = getelementptr inbounds float* %tmp21981, i64 1
+  %tmp21983 = getelementptr inbounds float* %tmp21982, i64 1
+  %tmp21984 = getelementptr inbounds float* %tmp21983, i64 1
+  %tmp21985 = getelementptr inbounds float* %tmp21984, i64 1
+  %tmp21986 = getelementptr inbounds float* %tmp21985, i64 1
+  %tmp21987 = getelementptr inbounds float* %tmp21986, i64 1
+  %tmp21988 = getelementptr inbounds float* %tmp21987, i64 1
+  %tmp21989 = getelementptr inbounds float* %tmp21988, i64 1
+  %tmp21990 = getelementptr inbounds float* %tmp21989, i64 1
+  %tmp21991 = getelementptr inbounds float* %tmp21990, i64 1
+  %tmp21992 = getelementptr inbounds float* %tmp21991, i64 1
+  %tmp21993 = getelementptr inbounds float* %tmp21992, i64 1
+  %tmp21994 = getelementptr inbounds float* %tmp21993, i64 1
+  %tmp21995 = getelementptr inbounds float* %tmp21994, i64 1
+  %tmp21996 = getelementptr inbounds float* %tmp21995, i64 1
+  %tmp21997 = getelementptr inbounds float* %tmp21996, i64 1
+  %tmp21998 = getelementptr inbounds float* %tmp21997, i64 1
+  %tmp21999 = getelementptr inbounds float* %tmp21998, i64 1
+  %tmp22000 = getelementptr inbounds float* %tmp21999, i64 1
+  %tmp22001 = getelementptr inbounds float* %tmp22000, i64 1
+  %tmp22002 = getelementptr inbounds float* %tmp22001, i64 1
+  %tmp22003 = getelementptr inbounds float* %tmp22002, i64 1
+  %tmp22004 = getelementptr inbounds float* %tmp22003, i64 1
+  %tmp22005 = getelementptr inbounds float* %tmp22004, i64 1
+  %tmp22006 = getelementptr inbounds float* %tmp22005, i64 1
+  %tmp22007 = getelementptr inbounds float* %tmp22006, i64 1
+  %tmp22008 = getelementptr inbounds float* %tmp22007, i64 1
+  %tmp22009 = getelementptr inbounds float* %tmp22008, i64 1
+  %tmp22010 = getelementptr inbounds float* %tmp22009, i64 1
+  %tmp22011 = getelementptr inbounds float* %tmp22010, i64 1
+  %tmp22012 = getelementptr inbounds float* %tmp22011, i64 1
+  %tmp22013 = getelementptr inbounds float* %tmp22012, i64 1
+  %tmp22014 = getelementptr inbounds float* %tmp22013, i64 1
+  %tmp22015 = getelementptr inbounds float* %tmp22014, i64 1
+  %tmp22016 = getelementptr inbounds float* %tmp22015, i64 1
+  %tmp22017 = getelementptr inbounds float* %tmp22016, i64 1
+  %tmp22018 = getelementptr inbounds float* %tmp22017, i64 1
+  %tmp22019 = getelementptr inbounds float* %tmp22018, i64 1
+  %tmp22020 = getelementptr inbounds float* %tmp22019, i64 1
+  %tmp22021 = getelementptr inbounds float* %tmp22020, i64 1
+  %tmp22022 = getelementptr inbounds float* %tmp22021, i64 1
+  %tmp22023 = getelementptr inbounds float* %tmp22022, i64 1
+  %tmp22024 = getelementptr inbounds float* %tmp22023, i64 1
+  %tmp22025 = getelementptr inbounds float* %tmp22024, i64 1
+  %tmp22026 = getelementptr inbounds float* %tmp22025, i64 1
+  %tmp22027 = getelementptr inbounds float* %tmp22026, i64 1
+  %tmp22028 = getelementptr inbounds float* %tmp22027, i64 1
+  %tmp22029 = getelementptr inbounds float* %tmp22028, i64 1
+  %tmp22030 = getelementptr inbounds float* %tmp22029, i64 1
+  %tmp22031 = getelementptr inbounds float* %tmp22030, i64 1
+  %tmp22032 = getelementptr inbounds float* %tmp22031, i64 1
+  %tmp22033 = getelementptr inbounds float* %tmp22032, i64 1
+  %tmp22034 = getelementptr inbounds float* %tmp22033, i64 1
+  %tmp22035 = getelementptr inbounds float* %tmp22034, i64 1
+  %tmp22036 = getelementptr inbounds float* %tmp22035, i64 1
+  %tmp22037 = getelementptr inbounds float* %tmp22036, i64 1
+  %tmp22038 = getelementptr inbounds float* %tmp22037, i64 1
+  %tmp22039 = getelementptr inbounds float* %tmp22038, i64 1
+  %tmp22040 = getelementptr inbounds float* %tmp22039, i64 1
+  %tmp22041 = getelementptr inbounds float* %tmp22040, i64 1
+  %tmp22042 = getelementptr inbounds float* %tmp22041, i64 1
+  %tmp22043 = getelementptr inbounds float* %tmp22042, i64 1
+  %tmp22044 = getelementptr inbounds float* %tmp22043, i64 1
+  %tmp22045 = getelementptr inbounds float* %tmp22044, i64 1
+  %tmp22046 = getelementptr inbounds float* %tmp22045, i64 1
+  %tmp22047 = getelementptr inbounds float* %tmp22046, i64 1
+  %tmp22048 = getelementptr inbounds float* %tmp22047, i64 1
+  %tmp22049 = getelementptr inbounds float* %tmp22048, i64 1
+  %tmp22050 = getelementptr inbounds float* %tmp22049, i64 1
+  %tmp22051 = getelementptr inbounds float* %tmp22050, i64 1
+  %tmp22052 = getelementptr inbounds float* %tmp22051, i64 1
+  %tmp22053 = getelementptr inbounds float* %tmp22052, i64 1
+  %tmp22054 = getelementptr inbounds float* %tmp22053, i64 1
+  %tmp22055 = getelementptr inbounds float* %tmp22054, i64 1
+  %tmp22056 = getelementptr inbounds float* %tmp22055, i64 1
+  %tmp22057 = getelementptr inbounds float* %tmp22056, i64 1
+  %tmp22058 = getelementptr inbounds float* %tmp22057, i64 1
+  %tmp22059 = getelementptr inbounds float* %tmp22058, i64 1
+  %tmp22060 = getelementptr inbounds float* %tmp22059, i64 1
+  %tmp22061 = getelementptr inbounds float* %tmp22060, i64 1
+  %tmp22062 = getelementptr inbounds float* %tmp22061, i64 1
+  %tmp22063 = getelementptr inbounds float* %tmp22062, i64 1
+  %tmp22064 = getelementptr inbounds float* %tmp22063, i64 1
+  %tmp22065 = getelementptr inbounds float* %tmp22064, i64 1
+  %tmp22066 = getelementptr inbounds float* %tmp22065, i64 1
+  %tmp22067 = getelementptr inbounds float* %tmp22066, i64 1
+  %tmp22068 = getelementptr inbounds float* %tmp22067, i64 1
+  %tmp22069 = getelementptr inbounds float* %tmp22068, i64 1
+  %tmp22070 = getelementptr inbounds float* %tmp22069, i64 1
+  %tmp22071 = getelementptr inbounds float* %tmp22070, i64 1
+  %tmp22072 = getelementptr inbounds float* %tmp22071, i64 1
+  %tmp22073 = getelementptr inbounds float* %tmp22072, i64 1
+  %tmp22074 = getelementptr inbounds float* %tmp22073, i64 1
+  %tmp22075 = getelementptr inbounds float* %tmp22074, i64 1
+  %tmp22076 = getelementptr inbounds float* %tmp22075, i64 1
+  %tmp22077 = getelementptr inbounds float* %tmp22076, i64 1
+  %tmp22078 = getelementptr inbounds float* %tmp22077, i64 1
+  %tmp22079 = getelementptr inbounds float* %tmp22078, i64 1
+  %tmp22080 = getelementptr inbounds float* %tmp22079, i64 1
+  %tmp22081 = getelementptr inbounds float* %tmp22080, i64 1
+  %tmp22082 = getelementptr inbounds float* %tmp22081, i64 1
+  %tmp22083 = getelementptr inbounds float* %tmp22082, i64 1
+  %tmp22084 = getelementptr inbounds float* %tmp22083, i64 1
+  %tmp22085 = getelementptr inbounds float* %tmp22084, i64 1
+  %tmp22086 = getelementptr inbounds float* %tmp22085, i64 1
+  %tmp22087 = getelementptr inbounds float* %tmp22086, i64 1
+  %tmp22088 = getelementptr inbounds float* %tmp22087, i64 1
+  %tmp22089 = getelementptr inbounds float* %tmp22088, i64 1
+  %tmp22090 = getelementptr inbounds float* %tmp22089, i64 1
+  %tmp22091 = getelementptr inbounds float* %tmp22090, i64 1
+  %tmp22092 = getelementptr inbounds float* %tmp22091, i64 1
+  %tmp22093 = getelementptr inbounds float* %tmp22092, i64 1
+  %tmp22094 = getelementptr inbounds float* %tmp22093, i64 1
+  %tmp22095 = getelementptr inbounds float* %tmp22094, i64 1
+  %tmp22096 = getelementptr inbounds float* %tmp22095, i64 1
+  %tmp22097 = getelementptr inbounds float* %tmp22096, i64 1
+  %tmp22098 = getelementptr inbounds float* %tmp22097, i64 1
+  %tmp22099 = getelementptr inbounds float* %tmp22098, i64 1
+  %tmp22100 = getelementptr inbounds float* %tmp22099, i64 1
+  %tmp22101 = getelementptr inbounds float* %tmp22100, i64 1
+  %tmp22102 = getelementptr inbounds float* %tmp22101, i64 1
+  %tmp22103 = getelementptr inbounds float* %tmp22102, i64 1
+  %tmp22104 = getelementptr inbounds float* %tmp22103, i64 1
+  %tmp22105 = getelementptr inbounds float* %tmp22104, i64 1
+  %tmp22106 = getelementptr inbounds float* %tmp22105, i64 1
+  %tmp22107 = getelementptr inbounds float* %tmp22106, i64 1
+  %tmp22108 = getelementptr inbounds float* %tmp22107, i64 1
+  %tmp22109 = getelementptr inbounds float* %tmp22108, i64 1
+  %tmp22110 = getelementptr inbounds float* %tmp22109, i64 1
+  %tmp22111 = getelementptr inbounds float* %tmp22110, i64 1
+  %tmp22112 = getelementptr inbounds float* %tmp22111, i64 1
+  %tmp22113 = getelementptr inbounds float* %tmp22112, i64 1
+  %tmp22114 = getelementptr inbounds float* %tmp22113, i64 1
+  %tmp22115 = getelementptr inbounds float* %tmp22114, i64 1
+  %tmp22116 = getelementptr inbounds float* %tmp22115, i64 1
+  %tmp22117 = getelementptr inbounds float* %tmp22116, i64 1
+  %tmp22118 = getelementptr inbounds float* %tmp22117, i64 1
+  %tmp22119 = getelementptr inbounds float* %tmp22118, i64 1
+  %tmp22120 = getelementptr inbounds float* %tmp22119, i64 1
+  %tmp22121 = getelementptr inbounds float* %tmp22120, i64 1
+  %tmp22122 = getelementptr inbounds float* %tmp22121, i64 1
+  %tmp22123 = getelementptr inbounds float* %tmp22122, i64 1
+  %tmp22124 = getelementptr inbounds float* %tmp22123, i64 1
+  %tmp22125 = getelementptr inbounds float* %tmp22124, i64 1
+  %tmp22126 = getelementptr inbounds float* %tmp22125, i64 1
+  %tmp22127 = getelementptr inbounds float* %tmp22126, i64 1
+  %tmp22128 = getelementptr inbounds float* %tmp22127, i64 1
+  %tmp22129 = getelementptr inbounds float* %tmp22128, i64 1
+  %tmp22130 = getelementptr inbounds float* %tmp22129, i64 1
+  %tmp22131 = getelementptr inbounds float* %tmp22130, i64 1
+  %tmp22132 = getelementptr inbounds float* %tmp22131, i64 1
+  %tmp22133 = getelementptr inbounds float* %tmp22132, i64 1
+  %tmp22134 = getelementptr inbounds float* %tmp22133, i64 1
+  %tmp22135 = getelementptr inbounds float* %tmp22134, i64 1
+  %tmp22136 = getelementptr inbounds float* %tmp22135, i64 1
+  %tmp22137 = getelementptr inbounds float* %tmp22136, i64 1
+  %tmp22138 = getelementptr inbounds float* %tmp22137, i64 1
+  %tmp22139 = getelementptr inbounds float* %tmp22138, i64 1
+  %tmp22140 = getelementptr inbounds float* %tmp22139, i64 1
+  %tmp22141 = getelementptr inbounds float* %tmp22140, i64 1
+  %tmp22142 = getelementptr inbounds float* %tmp22141, i64 1
+  %tmp22143 = getelementptr inbounds float* %tmp22142, i64 1
+  %tmp22144 = getelementptr inbounds float* %tmp22143, i64 1
+  %tmp22145 = getelementptr inbounds float* %tmp22144, i64 1
+  %tmp22146 = getelementptr inbounds float* %tmp22145, i64 1
+  %tmp22147 = getelementptr inbounds float* %tmp22146, i64 1
+  %tmp22148 = getelementptr inbounds float* %tmp22147, i64 1
+  %tmp22149 = getelementptr inbounds float* %tmp22148, i64 1
+  %tmp22150 = getelementptr inbounds float* %tmp22149, i64 1
+  %tmp22151 = getelementptr inbounds float* %tmp22150, i64 1
+  %tmp22152 = getelementptr inbounds float* %tmp22151, i64 1
+  %tmp22153 = getelementptr inbounds float* %tmp22152, i64 1
+  %tmp22154 = getelementptr inbounds float* %tmp22153, i64 1
+  %tmp22155 = getelementptr inbounds float* %tmp22154, i64 1
+  %tmp22156 = getelementptr inbounds float* %tmp22155, i64 1
+  %tmp22157 = getelementptr inbounds float* %tmp22156, i64 1
+  %tmp22158 = getelementptr inbounds float* %tmp22157, i64 1
+  %tmp22159 = getelementptr inbounds float* %tmp22158, i64 1
+  %tmp22160 = getelementptr inbounds float* %tmp22159, i64 1
+  %tmp22161 = getelementptr inbounds float* %tmp22160, i64 1
+  %tmp22162 = getelementptr inbounds float* %tmp22161, i64 1
+  %tmp22163 = getelementptr inbounds float* %tmp22162, i64 1
+  %tmp22164 = getelementptr inbounds float* %tmp22163, i64 1
+  %tmp22165 = getelementptr inbounds float* %tmp22164, i64 1
+  %tmp22166 = getelementptr inbounds float* %tmp22165, i64 1
+  %tmp22167 = getelementptr inbounds float* %tmp22166, i64 1
+  %tmp22168 = getelementptr inbounds float* %tmp22167, i64 1
+  %tmp22169 = getelementptr inbounds float* %tmp22168, i64 1
+  %tmp22170 = getelementptr inbounds float* %tmp22169, i64 1
+  %tmp22171 = getelementptr inbounds float* %tmp22170, i64 1
+  %tmp22172 = getelementptr inbounds float* %tmp22171, i64 1
+  %tmp22173 = getelementptr inbounds float* %tmp22172, i64 1
+  %tmp22174 = getelementptr inbounds float* %tmp22173, i64 1
+  %tmp22175 = getelementptr inbounds float* %tmp22174, i64 1
+  %tmp22176 = getelementptr inbounds float* %tmp22175, i64 1
+  %tmp22177 = getelementptr inbounds float* %tmp22176, i64 1
+  %tmp22178 = getelementptr inbounds float* %tmp22177, i64 1
+  %tmp22179 = getelementptr inbounds float* %tmp22178, i64 1
+  %tmp22180 = getelementptr inbounds float* %tmp22179, i64 1
+  %tmp22181 = getelementptr inbounds float* %tmp22180, i64 1
+  %tmp22182 = getelementptr inbounds float* %tmp22181, i64 1
+  %tmp22183 = getelementptr inbounds float* %tmp22182, i64 1
+  %tmp22184 = getelementptr inbounds float* %tmp22183, i64 1
+  %tmp22185 = getelementptr inbounds float* %tmp22184, i64 1
+  %tmp22186 = getelementptr inbounds float* %tmp22185, i64 1
+  %tmp22187 = getelementptr inbounds float* %tmp22186, i64 1
+  %tmp22188 = getelementptr inbounds float* %tmp22187, i64 1
+  %tmp22189 = getelementptr inbounds float* %tmp22188, i64 1
+  %tmp22190 = getelementptr inbounds float* %tmp22189, i64 1
+  %tmp22191 = getelementptr inbounds float* %tmp22190, i64 1
+  %tmp22192 = getelementptr inbounds float* %tmp22191, i64 1
+  %tmp22193 = getelementptr inbounds float* %tmp22192, i64 1
+  %tmp22194 = getelementptr inbounds float* %tmp22193, i64 1
+  %tmp22195 = getelementptr inbounds float* %tmp22194, i64 1
+  %tmp22196 = getelementptr inbounds float* %tmp22195, i64 1
+  %tmp22197 = getelementptr inbounds float* %tmp22196, i64 1
+  %tmp22198 = getelementptr inbounds float* %tmp22197, i64 1
+  %tmp22199 = getelementptr inbounds float* %tmp22198, i64 1
+  %tmp22200 = getelementptr inbounds float* %tmp22199, i64 1
+  %tmp22201 = getelementptr inbounds float* %tmp22200, i64 1
+  %tmp22202 = getelementptr inbounds float* %tmp22201, i64 1
+  %tmp22203 = getelementptr inbounds float* %tmp22202, i64 1
+  %tmp22204 = getelementptr inbounds float* %tmp22203, i64 1
+  %tmp22205 = getelementptr inbounds float* %tmp22204, i64 1
+  %tmp22206 = getelementptr inbounds float* %tmp22205, i64 1
+  %tmp22207 = getelementptr inbounds float* %tmp22206, i64 1
+  %tmp22208 = getelementptr inbounds float* %tmp22207, i64 1
+  %tmp22209 = getelementptr inbounds float* %tmp22208, i64 1
+  %tmp22210 = getelementptr inbounds float* %tmp22209, i64 1
+  %tmp22211 = getelementptr inbounds float* %tmp22210, i64 1
+  %tmp22212 = getelementptr inbounds float* %tmp22211, i64 1
+  %tmp22213 = getelementptr inbounds float* %tmp22212, i64 1
+  %tmp22214 = getelementptr inbounds float* %tmp22213, i64 1
+  %tmp22215 = getelementptr inbounds float* %tmp22214, i64 1
+  %tmp22216 = getelementptr inbounds float* %tmp22215, i64 1
+  %tmp22217 = getelementptr inbounds float* %tmp22216, i64 1
+  %tmp22218 = getelementptr inbounds float* %tmp22217, i64 1
+  %tmp22219 = getelementptr inbounds float* %tmp22218, i64 1
+  %tmp22220 = getelementptr inbounds float* %tmp22219, i64 1
+  %tmp22221 = getelementptr inbounds float* %tmp22220, i64 1
+  %tmp22222 = getelementptr inbounds float* %tmp22221, i64 1
+  %tmp22223 = getelementptr inbounds float* %tmp22222, i64 1
+  %tmp22224 = getelementptr inbounds float* %tmp22223, i64 1
+  %tmp22225 = getelementptr inbounds float* %tmp22224, i64 1
+  %tmp22226 = getelementptr inbounds float* %tmp22225, i64 1
+  %tmp22227 = getelementptr inbounds float* %tmp22226, i64 1
+  %tmp22228 = getelementptr inbounds float* %tmp22227, i64 1
+  %tmp22229 = getelementptr inbounds float* %tmp22228, i64 1
+  %tmp22230 = getelementptr inbounds float* %tmp22229, i64 1
+  %tmp22231 = getelementptr inbounds float* %tmp22230, i64 1
+  %tmp22232 = getelementptr inbounds float* %tmp22231, i64 1
+  %tmp22233 = getelementptr inbounds float* %tmp22232, i64 1
+  %tmp22234 = getelementptr inbounds float* %tmp22233, i64 1
+  %tmp22235 = getelementptr inbounds float* %tmp22234, i64 1
+  %tmp22236 = getelementptr inbounds float* %tmp22235, i64 1
+  %tmp22237 = getelementptr inbounds float* %tmp22236, i64 1
+  %tmp22238 = getelementptr inbounds float* %tmp22237, i64 1
+  %tmp22239 = getelementptr inbounds float* %tmp22238, i64 1
+  %tmp22240 = getelementptr inbounds float* %tmp22239, i64 1
+  %tmp22241 = getelementptr inbounds float* %tmp22240, i64 1
+  %tmp22242 = getelementptr inbounds float* %tmp22241, i64 1
+  %tmp22243 = getelementptr inbounds float* %tmp22242, i64 1
+  %tmp22244 = getelementptr inbounds float* %tmp22243, i64 1
+  %tmp22245 = getelementptr inbounds float* %tmp22244, i64 1
+  %tmp22246 = getelementptr inbounds float* %tmp22245, i64 1
+  %tmp22247 = getelementptr inbounds float* %tmp22246, i64 1
+  %tmp22248 = getelementptr inbounds float* %tmp22247, i64 1
+  %tmp22249 = getelementptr inbounds float* %tmp22248, i64 1
+  %tmp22250 = getelementptr inbounds float* %tmp22249, i64 1
+  %tmp22251 = getelementptr inbounds float* %tmp22250, i64 1
+  %tmp22252 = getelementptr inbounds float* %tmp22251, i64 1
+  %tmp22253 = getelementptr inbounds float* %tmp22252, i64 1
+  %tmp22254 = getelementptr inbounds float* %tmp22253, i64 1
+  %tmp22255 = getelementptr inbounds float* %tmp22254, i64 1
+  %tmp22256 = getelementptr inbounds float* %tmp22255, i64 1
+  %tmp22257 = getelementptr inbounds float* %tmp22256, i64 1
+  %tmp22258 = getelementptr inbounds float* %tmp22257, i64 1
+  %tmp22259 = getelementptr inbounds float* %tmp22258, i64 1
+  %tmp22260 = getelementptr inbounds float* %tmp22259, i64 1
+  %tmp22261 = getelementptr inbounds float* %tmp22260, i64 1
+  %tmp22262 = getelementptr inbounds float* %tmp22261, i64 1
+  %tmp22263 = getelementptr inbounds float* %tmp22262, i64 1
+  %tmp22264 = getelementptr inbounds float* %tmp22263, i64 1
+  %tmp22265 = getelementptr inbounds float* %tmp22264, i64 1
+  %tmp22266 = getelementptr inbounds float* %tmp22265, i64 1
+  %tmp22267 = getelementptr inbounds float* %tmp22266, i64 1
+  %tmp22268 = getelementptr inbounds float* %tmp22267, i64 1
+  %tmp22269 = getelementptr inbounds float* %tmp22268, i64 1
+  %tmp22270 = getelementptr inbounds float* %tmp22269, i64 1
+  %tmp22271 = getelementptr inbounds float* %tmp22270, i64 1
+  %tmp22272 = getelementptr inbounds float* %tmp22271, i64 1
+  %tmp22273 = getelementptr inbounds float* %tmp22272, i64 1
+  %tmp22274 = getelementptr inbounds float* %tmp22273, i64 1
+  %tmp22275 = getelementptr inbounds float* %tmp22274, i64 1
+  %tmp22276 = getelementptr inbounds float* %tmp22275, i64 1
+  %tmp22277 = getelementptr inbounds float* %tmp22276, i64 1
+  %tmp22278 = getelementptr inbounds float* %tmp22277, i64 1
+  %tmp22279 = getelementptr inbounds float* %tmp22278, i64 1
+  %tmp22280 = getelementptr inbounds float* %tmp22279, i64 1
+  %tmp22281 = getelementptr inbounds float* %tmp22280, i64 1
+  %tmp22282 = getelementptr inbounds float* %tmp22281, i64 1
+  %tmp22283 = getelementptr inbounds float* %tmp22282, i64 1
+  %tmp22284 = getelementptr inbounds float* %tmp22283, i64 1
+  %tmp22285 = getelementptr inbounds float* %tmp22284, i64 1
+  %tmp22286 = getelementptr inbounds float* %tmp22285, i64 1
+  %tmp22287 = getelementptr inbounds float* %tmp22286, i64 1
+  %tmp22288 = getelementptr inbounds float* %tmp22287, i64 1
+  %tmp22289 = getelementptr inbounds float* %tmp22288, i64 1
+  %tmp22290 = getelementptr inbounds float* %tmp22289, i64 1
+  %tmp22291 = getelementptr inbounds float* %tmp22290, i64 1
+  %tmp22292 = getelementptr inbounds float* %tmp22291, i64 1
+  %tmp22293 = getelementptr inbounds float* %tmp22292, i64 1
+  %tmp22294 = getelementptr inbounds float* %tmp22293, i64 1
+  %tmp22295 = getelementptr inbounds float* %tmp22294, i64 1
+  %tmp22296 = getelementptr inbounds float* %tmp22295, i64 1
+  %tmp22297 = getelementptr inbounds float* %tmp22296, i64 1
+  %tmp22298 = getelementptr inbounds float* %tmp22297, i64 1
+  %tmp22299 = getelementptr inbounds float* %tmp22298, i64 1
+  %tmp22300 = getelementptr inbounds float* %tmp22299, i64 1
+  %tmp22301 = getelementptr inbounds float* %tmp22300, i64 1
+  %tmp22302 = getelementptr inbounds float* %tmp22301, i64 1
+  %tmp22303 = getelementptr inbounds float* %tmp22302, i64 1
+  %tmp22304 = getelementptr inbounds float* %tmp22303, i64 1
+  %tmp22305 = getelementptr inbounds float* %tmp22304, i64 1
+  %tmp22306 = getelementptr inbounds float* %tmp22305, i64 1
+  %tmp22307 = getelementptr inbounds float* %tmp22306, i64 1
+  %tmp22308 = getelementptr inbounds float* %tmp22307, i64 1
+  %tmp22309 = getelementptr inbounds float* %tmp22308, i64 1
+  %tmp22310 = getelementptr inbounds float* %tmp22309, i64 1
+  %tmp22311 = getelementptr inbounds float* %tmp22310, i64 1
+  %tmp22312 = getelementptr inbounds float* %tmp22311, i64 1
+  %tmp22313 = getelementptr inbounds float* %tmp22312, i64 1
+  %tmp22314 = getelementptr inbounds float* %tmp22313, i64 1
+  %tmp22315 = getelementptr inbounds float* %tmp22314, i64 1
+  %tmp22316 = getelementptr inbounds float* %tmp22315, i64 1
+  %tmp22317 = getelementptr inbounds float* %tmp22316, i64 1
+  %tmp22318 = getelementptr inbounds float* %tmp22317, i64 1
+  %tmp22319 = getelementptr inbounds float* %tmp22318, i64 1
+  %tmp22320 = getelementptr inbounds float* %tmp22319, i64 1
+  %tmp22321 = getelementptr inbounds float* %tmp22320, i64 1
+  %tmp22322 = getelementptr inbounds float* %tmp22321, i64 1
+  %tmp22323 = getelementptr inbounds float* %tmp22322, i64 1
+  %tmp22324 = getelementptr inbounds float* %tmp22323, i64 1
+  %tmp22325 = getelementptr inbounds float* %tmp22324, i64 1
+  %tmp22326 = getelementptr inbounds float* %tmp22325, i64 1
+  %tmp22327 = getelementptr inbounds float* %tmp22326, i64 1
+  %tmp22328 = getelementptr inbounds float* %tmp22327, i64 1
+  %tmp22329 = getelementptr inbounds float* %tmp22328, i64 1
+  %tmp22330 = getelementptr inbounds float* %tmp22329, i64 1
+  %tmp22331 = getelementptr inbounds float* %tmp22330, i64 1
+  %tmp22332 = getelementptr inbounds float* %tmp22331, i64 1
+  %tmp22333 = getelementptr inbounds float* %tmp22332, i64 1
+  %tmp22334 = getelementptr inbounds float* %tmp22333, i64 1
+  %tmp22335 = getelementptr inbounds float* %tmp22334, i64 1
+  %tmp22336 = getelementptr inbounds float* %tmp22335, i64 1
+  %tmp22337 = getelementptr inbounds float* %tmp22336, i64 1
+  %tmp22338 = getelementptr inbounds float* %tmp22337, i64 1
+  %tmp22339 = getelementptr inbounds float* %tmp22338, i64 1
+  %tmp22340 = getelementptr inbounds float* %tmp22339, i64 1
+  %tmp22341 = getelementptr inbounds float* %tmp22340, i64 1
+  %tmp22342 = getelementptr inbounds float* %tmp22341, i64 1
+  %tmp22343 = getelementptr inbounds float* %tmp22342, i64 1
+  %tmp22344 = getelementptr inbounds float* %tmp22343, i64 1
+  %tmp22345 = getelementptr inbounds float* %tmp22344, i64 1
+  %tmp22346 = getelementptr inbounds float* %tmp22345, i64 1
+  %tmp22347 = getelementptr inbounds float* %tmp22346, i64 1
+  %tmp22348 = getelementptr inbounds float* %tmp22347, i64 1
+  %tmp22349 = getelementptr inbounds float* %tmp22348, i64 1
+  %tmp22350 = getelementptr inbounds float* %tmp22349, i64 1
+  %tmp22351 = getelementptr inbounds float* %tmp22350, i64 1
+  %tmp22352 = getelementptr inbounds float* %tmp22351, i64 1
+  %tmp22353 = getelementptr inbounds float* %tmp22352, i64 1
+  %tmp22354 = getelementptr inbounds float* %tmp22353, i64 1
+  %tmp22355 = getelementptr inbounds float* %tmp22354, i64 1
+  %tmp22356 = getelementptr inbounds float* %tmp22355, i64 1
+  %tmp22357 = getelementptr inbounds float* %tmp22356, i64 1
+  %tmp22358 = getelementptr inbounds float* %tmp22357, i64 1
+  %tmp22359 = getelementptr inbounds float* %tmp22358, i64 1
+  %tmp22360 = getelementptr inbounds float* %tmp22359, i64 1
+  %tmp22361 = getelementptr inbounds float* %tmp22360, i64 1
+  %tmp22362 = getelementptr inbounds float* %tmp22361, i64 1
+  %tmp22363 = getelementptr inbounds float* %tmp22362, i64 1
+  %tmp22364 = getelementptr inbounds float* %tmp22363, i64 1
+  %tmp22365 = getelementptr inbounds float* %tmp22364, i64 1
+  %tmp22366 = getelementptr inbounds float* %tmp22365, i64 1
+  %tmp22367 = getelementptr inbounds float* %tmp22366, i64 1
+  %tmp22368 = getelementptr inbounds float* %tmp22367, i64 1
+  %tmp22369 = getelementptr inbounds float* %tmp22368, i64 1
+  %tmp22370 = getelementptr inbounds float* %tmp22369, i64 1
+  %tmp22371 = getelementptr inbounds float* %tmp22370, i64 1
+  %tmp22372 = getelementptr inbounds float* %tmp22371, i64 1
+  %tmp22373 = getelementptr inbounds float* %tmp22372, i64 1
+  %tmp22374 = getelementptr inbounds float* %tmp22373, i64 1
+  %tmp22375 = getelementptr inbounds float* %tmp22374, i64 1
+  %tmp22376 = getelementptr inbounds float* %tmp22375, i64 1
+  %tmp22377 = getelementptr inbounds float* %tmp22376, i64 1
+  %tmp22378 = getelementptr inbounds float* %tmp22377, i64 1
+  %tmp22379 = getelementptr inbounds float* %tmp22378, i64 1
+  %tmp22380 = getelementptr inbounds float* %tmp22379, i64 1
+  %tmp22381 = getelementptr inbounds float* %tmp22380, i64 1
+  %tmp22382 = getelementptr inbounds float* %tmp22381, i64 1
+  %tmp22383 = getelementptr inbounds float* %tmp22382, i64 1
+  %tmp22384 = getelementptr inbounds float* %tmp22383, i64 1
+  %tmp22385 = getelementptr inbounds float* %tmp22384, i64 1
+  %tmp22386 = getelementptr inbounds float* %tmp22385, i64 1
+  %tmp22387 = getelementptr inbounds float* %tmp22386, i64 1
+  %tmp22388 = getelementptr inbounds float* %tmp22387, i64 1
+  %tmp22389 = getelementptr inbounds float* %tmp22388, i64 1
+  %tmp22390 = getelementptr inbounds float* %tmp22389, i64 1
+  %tmp22391 = getelementptr inbounds float* %tmp22390, i64 1
+  %tmp22392 = getelementptr inbounds float* %tmp22391, i64 1
+  %tmp22393 = getelementptr inbounds float* %tmp22392, i64 1
+  %tmp22394 = getelementptr inbounds float* %tmp22393, i64 1
+  %tmp22395 = getelementptr inbounds float* %tmp22394, i64 1
+  %tmp22396 = getelementptr inbounds float* %tmp22395, i64 1
+  %tmp22397 = getelementptr inbounds float* %tmp22396, i64 1
+  %tmp22398 = getelementptr inbounds float* %tmp22397, i64 1
+  %tmp22399 = getelementptr inbounds float* %tmp22398, i64 1
+  %tmp22400 = getelementptr inbounds float* %tmp22399, i64 1
+  %tmp22401 = getelementptr inbounds float* %tmp22400, i64 1
+  %tmp22402 = getelementptr inbounds float* %tmp22401, i64 1
+  %tmp22403 = getelementptr inbounds float* %tmp22402, i64 1
+  %tmp22404 = getelementptr inbounds float* %tmp22403, i64 1
+  %tmp22405 = getelementptr inbounds float* %tmp22404, i64 1
+  %tmp22406 = getelementptr inbounds float* %tmp22405, i64 1
+  %tmp22407 = getelementptr inbounds float* %tmp22406, i64 1
+  %tmp22408 = getelementptr inbounds float* %tmp22407, i64 1
+  %tmp22409 = getelementptr inbounds float* %tmp22408, i64 1
+  %tmp22410 = getelementptr inbounds float* %tmp22409, i64 1
+  %tmp22411 = getelementptr inbounds float* %tmp22410, i64 1
+  %tmp22412 = getelementptr inbounds float* %tmp22411, i64 1
+  %tmp22413 = getelementptr inbounds float* %tmp22412, i64 1
+  %tmp22414 = getelementptr inbounds float* %tmp22413, i64 1
+  %tmp22415 = getelementptr inbounds float* %tmp22414, i64 1
+  %tmp22416 = getelementptr inbounds float* %tmp22415, i64 1
+  %tmp22417 = getelementptr inbounds float* %tmp22416, i64 1
+  %tmp22418 = getelementptr inbounds float* %tmp22417, i64 1
+  %tmp22419 = getelementptr inbounds float* %tmp22418, i64 1
+  %tmp22420 = getelementptr inbounds float* %tmp22419, i64 1
+  %tmp22421 = getelementptr inbounds float* %tmp22420, i64 1
+  %tmp22422 = getelementptr inbounds float* %tmp22421, i64 1
+  %tmp22423 = getelementptr inbounds float* %tmp22422, i64 1
+  %tmp22424 = getelementptr inbounds float* %tmp22423, i64 1
+  %tmp22425 = getelementptr inbounds float* %tmp22424, i64 1
+  %tmp22426 = getelementptr inbounds float* %tmp22425, i64 1
+  %tmp22427 = getelementptr inbounds float* %tmp22426, i64 1
+  %tmp22428 = getelementptr inbounds float* %tmp22427, i64 1
+  %tmp22429 = getelementptr inbounds float* %tmp22428, i64 1
+  %tmp22430 = getelementptr inbounds float* %tmp22429, i64 1
+  %tmp22431 = getelementptr inbounds float* %tmp22430, i64 1
+  %tmp22432 = getelementptr inbounds float* %tmp22431, i64 1
+  %tmp22433 = getelementptr inbounds float* %tmp22432, i64 1
+  %tmp22434 = getelementptr inbounds float* %tmp22433, i64 1
+  %tmp22435 = getelementptr inbounds float* %tmp22434, i64 1
+  %tmp22436 = getelementptr inbounds float* %tmp22435, i64 1
+  %tmp22437 = getelementptr inbounds float* %tmp22436, i64 1
+  %tmp22438 = getelementptr inbounds float* %tmp22437, i64 1
+  %tmp22439 = getelementptr inbounds float* %tmp22438, i64 1
+  %tmp22440 = getelementptr inbounds float* %tmp22439, i64 1
+  %tmp22441 = getelementptr inbounds float* %tmp22440, i64 1
+  %tmp22442 = getelementptr inbounds float* %tmp22441, i64 1
+  %tmp22443 = getelementptr inbounds float* %tmp22442, i64 1
+  %tmp22444 = getelementptr inbounds float* %tmp22443, i64 1
+  %tmp22445 = getelementptr inbounds float* %tmp22444, i64 1
+  %tmp22446 = getelementptr inbounds float* %tmp22445, i64 1
+  %tmp22447 = getelementptr inbounds float* %tmp22446, i64 1
+  %tmp22448 = getelementptr inbounds float* %tmp22447, i64 1
+  %tmp22449 = getelementptr inbounds float* %tmp22448, i64 1
+  %tmp22450 = getelementptr inbounds float* %tmp22449, i64 1
+  %tmp22451 = getelementptr inbounds float* %tmp22450, i64 1
+  %tmp22452 = getelementptr inbounds float* %tmp22451, i64 1
+  %tmp22453 = getelementptr inbounds float* %tmp22452, i64 1
+  %tmp22454 = getelementptr inbounds float* %tmp22453, i64 1
+  %tmp22455 = getelementptr inbounds float* %tmp22454, i64 1
+  %tmp22456 = getelementptr inbounds float* %tmp22455, i64 1
+  %tmp22457 = getelementptr inbounds float* %tmp22456, i64 1
+  %tmp22458 = getelementptr inbounds float* %tmp22457, i64 1
+  %tmp22459 = getelementptr inbounds float* %tmp22458, i64 1
+  %tmp22460 = getelementptr inbounds float* %tmp22459, i64 1
+  %tmp22461 = getelementptr inbounds float* %tmp22460, i64 1
+  %tmp22462 = getelementptr inbounds float* %tmp22461, i64 1
+  %tmp22463 = getelementptr inbounds float* %tmp22462, i64 1
+  %tmp22464 = getelementptr inbounds float* %tmp22463, i64 1
+  %tmp22465 = getelementptr inbounds float* %tmp22464, i64 1
+  %tmp22466 = getelementptr inbounds float* %tmp22465, i64 1
+  %tmp22467 = getelementptr inbounds float* %tmp22466, i64 1
+  %tmp22468 = getelementptr inbounds float* %tmp22467, i64 1
+  %tmp22469 = getelementptr inbounds float* %tmp22468, i64 1
+  %tmp22470 = getelementptr inbounds float* %tmp22469, i64 1
+  %tmp22471 = getelementptr inbounds float* %tmp22470, i64 1
+  %tmp22472 = getelementptr inbounds float* %tmp22471, i64 1
+  %tmp22473 = getelementptr inbounds float* %tmp22472, i64 1
+  %tmp22474 = getelementptr inbounds float* %tmp22473, i64 1
+  %tmp22475 = getelementptr inbounds float* %tmp22474, i64 1
+  %tmp22476 = getelementptr inbounds float* %tmp22475, i64 1
+  %tmp22477 = getelementptr inbounds float* %tmp22476, i64 1
+  %tmp22478 = getelementptr inbounds float* %tmp22477, i64 1
+  %tmp22479 = getelementptr inbounds float* %tmp22478, i64 1
+  %tmp22480 = getelementptr inbounds float* %tmp22479, i64 1
+  %tmp22481 = getelementptr inbounds float* %tmp22480, i64 1
+  %tmp22482 = getelementptr inbounds float* %tmp22481, i64 1
+  %tmp22483 = getelementptr inbounds float* %tmp22482, i64 1
+  %tmp22484 = getelementptr inbounds float* %tmp22483, i64 1
+  %tmp22485 = getelementptr inbounds float* %tmp22484, i64 1
+  %tmp22486 = getelementptr inbounds float* %tmp22485, i64 1
+  %tmp22487 = getelementptr inbounds float* %tmp22486, i64 1
+  %tmp22488 = getelementptr inbounds float* %tmp22487, i64 1
+  %tmp22489 = getelementptr inbounds float* %tmp22488, i64 1
+  %tmp22490 = getelementptr inbounds float* %tmp22489, i64 1
+  %tmp22491 = getelementptr inbounds float* %tmp22490, i64 1
+  %tmp22492 = getelementptr inbounds float* %tmp22491, i64 1
+  %tmp22493 = getelementptr inbounds float* %tmp22492, i64 1
+  %tmp22494 = getelementptr inbounds float* %tmp22493, i64 1
+  %tmp22495 = getelementptr inbounds float* %tmp22494, i64 1
+  %tmp22496 = getelementptr inbounds float* %tmp22495, i64 1
+  %tmp22497 = getelementptr inbounds float* %tmp22496, i64 1
+  %tmp22498 = getelementptr inbounds float* %tmp22497, i64 1
+  %tmp22499 = getelementptr inbounds float* %tmp22498, i64 1
+  %tmp22500 = getelementptr inbounds float* %tmp22499, i64 1
+  %tmp22501 = getelementptr inbounds float* %tmp22500, i64 1
+  %tmp22502 = getelementptr inbounds float* %tmp22501, i64 1
+  %tmp22503 = getelementptr inbounds float* %tmp22502, i64 1
+  %tmp22504 = getelementptr inbounds float* %tmp22503, i64 1
+  %tmp22505 = getelementptr inbounds float* %tmp22504, i64 1
+  %tmp22506 = getelementptr inbounds float* %tmp22505, i64 1
+  %tmp22507 = getelementptr inbounds float* %tmp22506, i64 1
+  %tmp22508 = getelementptr inbounds float* %tmp22507, i64 1
+  %tmp22509 = getelementptr inbounds float* %tmp22508, i64 1
+  %tmp22510 = getelementptr inbounds float* %tmp22509, i64 1
+  %tmp22511 = getelementptr inbounds float* %tmp22510, i64 1
+  %tmp22512 = getelementptr inbounds float* %tmp22511, i64 1
+  %tmp22513 = getelementptr inbounds float* %tmp22512, i64 1
+  %tmp22514 = getelementptr inbounds float* %tmp22513, i64 1
+  %tmp22515 = getelementptr inbounds float* %tmp22514, i64 1
+  %tmp22516 = getelementptr inbounds float* %tmp22515, i64 1
+  %tmp22517 = getelementptr inbounds float* %tmp22516, i64 1
+  %tmp22518 = getelementptr inbounds float* %tmp22517, i64 1
+  %tmp22519 = getelementptr inbounds float* %tmp22518, i64 1
+  %tmp22520 = getelementptr inbounds float* %tmp22519, i64 1
+  %tmp22521 = getelementptr inbounds float* %tmp22520, i64 1
+  %tmp22522 = getelementptr inbounds float* %tmp22521, i64 1
+  %tmp22523 = getelementptr inbounds float* %tmp22522, i64 1
+  %tmp22524 = getelementptr inbounds float* %tmp22523, i64 1
+  %tmp22525 = getelementptr inbounds float* %tmp22524, i64 1
+  %tmp22526 = getelementptr inbounds float* %tmp22525, i64 1
+  %tmp22527 = getelementptr inbounds float* %tmp22526, i64 1
+  %tmp22528 = getelementptr inbounds float* %tmp22527, i64 1
+  %tmp22529 = getelementptr inbounds float* %tmp22528, i64 1
+  %tmp22530 = getelementptr inbounds float* %tmp22529, i64 1
+  %tmp22531 = getelementptr inbounds float* %tmp22530, i64 1
+  %tmp22532 = getelementptr inbounds float* %tmp22531, i64 1
+  %tmp22533 = getelementptr inbounds float* %tmp22532, i64 1
+  %tmp22534 = getelementptr inbounds float* %tmp22533, i64 1
+  %tmp22535 = getelementptr inbounds float* %tmp22534, i64 1
+  %tmp22536 = getelementptr inbounds float* %tmp22535, i64 1
+  %tmp22537 = getelementptr inbounds float* %tmp22536, i64 1
+  %tmp22538 = getelementptr inbounds float* %tmp22537, i64 1
+  %tmp22539 = getelementptr inbounds float* %tmp22538, i64 1
+  %tmp22540 = getelementptr inbounds float* %tmp22539, i64 1
+  %tmp22541 = getelementptr inbounds float* %tmp22540, i64 1
+  %tmp22542 = getelementptr inbounds float* %tmp22541, i64 1
+  %tmp22543 = getelementptr inbounds float* %tmp22542, i64 1
+  %tmp22544 = getelementptr inbounds float* %tmp22543, i64 1
+  %tmp22545 = getelementptr inbounds float* %tmp22544, i64 1
+  %tmp22546 = getelementptr inbounds float* %tmp22545, i64 1
+  %tmp22547 = getelementptr inbounds float* %tmp22546, i64 1
+  %tmp22548 = getelementptr inbounds float* %tmp22547, i64 1
+  %tmp22549 = getelementptr inbounds float* %tmp22548, i64 1
+  %tmp22550 = getelementptr inbounds float* %tmp22549, i64 1
+  %tmp22551 = getelementptr inbounds float* %tmp22550, i64 1
+  %tmp22552 = getelementptr inbounds float* %tmp22551, i64 1
+  %tmp22553 = getelementptr inbounds float* %tmp22552, i64 1
+  %tmp22554 = getelementptr inbounds float* %tmp22553, i64 1
+  %tmp22555 = getelementptr inbounds float* %tmp22554, i64 1
+  %tmp22556 = getelementptr inbounds float* %tmp22555, i64 1
+  %tmp22557 = getelementptr inbounds float* %tmp22556, i64 1
+  %tmp22558 = getelementptr inbounds float* %tmp22557, i64 1
+  %tmp22559 = getelementptr inbounds float* %tmp22558, i64 1
+  %tmp22560 = getelementptr inbounds float* %tmp22559, i64 1
+  %tmp22561 = getelementptr inbounds float* %tmp22560, i64 1
+  %tmp22562 = getelementptr inbounds float* %tmp22561, i64 1
+  %tmp22563 = getelementptr inbounds float* %tmp22562, i64 1
+  %tmp22564 = getelementptr inbounds float* %tmp22563, i64 1
+  %tmp22565 = getelementptr inbounds float* %tmp22564, i64 1
+  %tmp22566 = getelementptr inbounds float* %tmp22565, i64 1
+  %tmp22567 = getelementptr inbounds float* %tmp22566, i64 1
+  %tmp22568 = getelementptr inbounds float* %tmp22567, i64 1
+  %tmp22569 = getelementptr inbounds float* %tmp22568, i64 1
+  %tmp22570 = getelementptr inbounds float* %tmp22569, i64 1
+  %tmp22571 = getelementptr inbounds float* %tmp22570, i64 1
+  %tmp22572 = getelementptr inbounds float* %tmp22571, i64 1
+  %tmp22573 = getelementptr inbounds float* %tmp22572, i64 1
+  %tmp22574 = getelementptr inbounds float* %tmp22573, i64 1
+  %tmp22575 = getelementptr inbounds float* %tmp22574, i64 1
+  %tmp22576 = getelementptr inbounds float* %tmp22575, i64 1
+  %tmp22577 = getelementptr inbounds float* %tmp22576, i64 1
+  %tmp22578 = getelementptr inbounds float* %tmp22577, i64 1
+  %tmp22579 = getelementptr inbounds float* %tmp22578, i64 1
+  %tmp22580 = getelementptr inbounds float* %tmp22579, i64 1
+  %tmp22581 = getelementptr inbounds float* %tmp22580, i64 1
+  %tmp22582 = getelementptr inbounds float* %tmp22581, i64 1
+  %tmp22583 = getelementptr inbounds float* %tmp22582, i64 1
+  %tmp22584 = getelementptr inbounds float* %tmp22583, i64 1
+  %tmp22585 = getelementptr inbounds float* %tmp22584, i64 1
+  %tmp22586 = getelementptr inbounds float* %tmp22585, i64 1
+  %tmp22587 = getelementptr inbounds float* %tmp22586, i64 1
+  %tmp22588 = getelementptr inbounds float* %tmp22587, i64 1
+  %tmp22589 = getelementptr inbounds float* %tmp22588, i64 1
+  %tmp22590 = getelementptr inbounds float* %tmp22589, i64 1
+  %tmp22591 = getelementptr inbounds float* %tmp22590, i64 1
+  %tmp22592 = getelementptr inbounds float* %tmp22591, i64 1
+  %tmp22593 = getelementptr inbounds float* %tmp22592, i64 1
+  %tmp22594 = getelementptr inbounds float* %tmp22593, i64 1
+  %tmp22595 = getelementptr inbounds float* %tmp22594, i64 1
+  %tmp22596 = getelementptr inbounds float* %tmp22595, i64 1
+  %tmp22597 = getelementptr inbounds float* %tmp22596, i64 1
+  %tmp22598 = getelementptr inbounds float* %tmp22597, i64 1
+  %tmp22599 = getelementptr inbounds float* %tmp22598, i64 1
+  %tmp22600 = getelementptr inbounds float* %tmp22599, i64 1
+  %tmp22601 = getelementptr inbounds float* %tmp22600, i64 1
+  %tmp22602 = getelementptr inbounds float* %tmp22601, i64 1
+  %tmp22603 = getelementptr inbounds float* %tmp22602, i64 1
+  %tmp22604 = getelementptr inbounds float* %tmp22603, i64 1
+  %tmp22605 = getelementptr inbounds float* %tmp22604, i64 1
+  %tmp22606 = getelementptr inbounds float* %tmp22605, i64 1
+  %tmp22607 = getelementptr inbounds float* %tmp22606, i64 1
+  %tmp22608 = getelementptr inbounds float* %tmp22607, i64 1
+  %tmp22609 = getelementptr inbounds float* %tmp22608, i64 1
+  %tmp22610 = getelementptr inbounds float* %tmp22609, i64 1
+  %tmp22611 = getelementptr inbounds float* %tmp22610, i64 1
+  %tmp22612 = getelementptr inbounds float* %tmp22611, i64 1
+  %tmp22613 = getelementptr inbounds float* %tmp22612, i64 1
+  %tmp22614 = getelementptr inbounds float* %tmp22613, i64 1
+  %tmp22615 = getelementptr inbounds float* %tmp22614, i64 1
+  %tmp22616 = getelementptr inbounds float* %tmp22615, i64 1
+  %tmp22617 = getelementptr inbounds float* %tmp22616, i64 1
+  %tmp22618 = getelementptr inbounds float* %tmp22617, i64 1
+  %tmp22619 = getelementptr inbounds float* %tmp22618, i64 1
+  %tmp22620 = getelementptr inbounds float* %tmp22619, i64 1
+  %tmp22621 = getelementptr inbounds float* %tmp22620, i64 1
+  %tmp22622 = getelementptr inbounds float* %tmp22621, i64 1
+  %tmp22623 = getelementptr inbounds float* %tmp22622, i64 1
+  %tmp22624 = getelementptr inbounds float* %tmp22623, i64 1
+  %tmp22625 = getelementptr inbounds float* %tmp22624, i64 1
+  %tmp22626 = getelementptr inbounds float* %tmp22625, i64 1
+  %tmp22627 = getelementptr inbounds float* %tmp22626, i64 1
+  %tmp22628 = getelementptr inbounds float* %tmp22627, i64 1
+  %tmp22629 = getelementptr inbounds float* %tmp22628, i64 1
+  %tmp22630 = getelementptr inbounds float* %tmp22629, i64 1
+  %tmp22631 = getelementptr inbounds float* %tmp22630, i64 1
+  %tmp22632 = getelementptr inbounds float* %tmp22631, i64 1
+  %tmp22633 = getelementptr inbounds float* %tmp22632, i64 1
+  %tmp22634 = getelementptr inbounds float* %tmp22633, i64 1
+  %tmp22635 = getelementptr inbounds float* %tmp22634, i64 1
+  %tmp22636 = getelementptr inbounds float* %tmp22635, i64 1
+  %tmp22637 = getelementptr inbounds float* %tmp22636, i64 1
+  %tmp22638 = getelementptr inbounds float* %tmp22637, i64 1
+  %tmp22639 = getelementptr inbounds float* %tmp22638, i64 1
+  %tmp22640 = getelementptr inbounds float* %tmp22639, i64 1
+  %tmp22641 = getelementptr inbounds float* %tmp22640, i64 1
+  %tmp22642 = getelementptr inbounds float* %tmp22641, i64 1
+  %tmp22643 = getelementptr inbounds float* %tmp22642, i64 1
+  %tmp22644 = getelementptr inbounds float* %tmp22643, i64 1
+  %tmp22645 = getelementptr inbounds float* %tmp22644, i64 1
+  %tmp22646 = getelementptr inbounds float* %tmp22645, i64 1
+  %tmp22647 = getelementptr inbounds float* %tmp22646, i64 1
+  %tmp22648 = getelementptr inbounds float* %tmp22647, i64 1
+  %tmp22649 = getelementptr inbounds float* %tmp22648, i64 1
+  %tmp22650 = getelementptr inbounds float* %tmp22649, i64 1
+  %tmp22651 = getelementptr inbounds float* %tmp22650, i64 1
+  %tmp22652 = getelementptr inbounds float* %tmp22651, i64 1
+  %tmp22653 = getelementptr inbounds float* %tmp22652, i64 1
+  %tmp22654 = getelementptr inbounds float* %tmp22653, i64 1
+  %tmp22655 = getelementptr inbounds float* %tmp22654, i64 1
+  %tmp22656 = getelementptr inbounds float* %tmp22655, i64 1
+  %tmp22657 = getelementptr inbounds float* %tmp22656, i64 1
+  %tmp22658 = getelementptr inbounds float* %tmp22657, i64 1
+  %tmp22659 = getelementptr inbounds float* %tmp22658, i64 1
+  %tmp22660 = getelementptr inbounds float* %tmp22659, i64 1
+  %tmp22661 = getelementptr inbounds float* %tmp22660, i64 1
+  %tmp22662 = getelementptr inbounds float* %tmp22661, i64 1
+  %tmp22663 = getelementptr inbounds float* %tmp22662, i64 1
+  %tmp22664 = getelementptr inbounds float* %tmp22663, i64 1
+  %tmp22665 = getelementptr inbounds float* %tmp22664, i64 1
+  %tmp22666 = getelementptr inbounds float* %tmp22665, i64 1
+  %tmp22667 = getelementptr inbounds float* %tmp22666, i64 1
+  %tmp22668 = getelementptr inbounds float* %tmp22667, i64 1
+  %tmp22669 = getelementptr inbounds float* %tmp22668, i64 1
+  %tmp22670 = getelementptr inbounds float* %tmp22669, i64 1
+  %tmp22671 = getelementptr inbounds float* %tmp22670, i64 1
+  %tmp22672 = getelementptr inbounds float* %tmp22671, i64 1
+  %tmp22673 = getelementptr inbounds float* %tmp22672, i64 1
+  %tmp22674 = getelementptr inbounds float* %tmp22673, i64 1
+  %tmp22675 = getelementptr inbounds float* %tmp22674, i64 1
+  %tmp22676 = getelementptr inbounds float* %tmp22675, i64 1
+  %tmp22677 = getelementptr inbounds float* %tmp22676, i64 1
+  %tmp22678 = getelementptr inbounds float* %tmp22677, i64 1
+  %tmp22679 = getelementptr inbounds float* %tmp22678, i64 1
+  %tmp22680 = getelementptr inbounds float* %tmp22679, i64 1
+  %tmp22681 = getelementptr inbounds float* %tmp22680, i64 1
+  %tmp22682 = getelementptr inbounds float* %tmp22681, i64 1
+  %tmp22683 = getelementptr inbounds float* %tmp22682, i64 1
+  %tmp22684 = getelementptr inbounds float* %tmp22683, i64 1
+  %tmp22685 = getelementptr inbounds float* %tmp22684, i64 1
+  %tmp22686 = getelementptr inbounds float* %tmp22685, i64 1
+  %tmp22687 = getelementptr inbounds float* %tmp22686, i64 1
+  %tmp22688 = getelementptr inbounds float* %tmp22687, i64 1
+  %tmp22689 = getelementptr inbounds float* %tmp22688, i64 1
+  %tmp22690 = getelementptr inbounds float* %tmp22689, i64 1
+  %tmp22691 = getelementptr inbounds float* %tmp22690, i64 1
+  %tmp22692 = getelementptr inbounds float* %tmp22691, i64 1
+  %tmp22693 = getelementptr inbounds float* %tmp22692, i64 1
+  %tmp22694 = getelementptr inbounds float* %tmp22693, i64 1
+  %tmp22695 = getelementptr inbounds float* %tmp22694, i64 1
+  %tmp22696 = getelementptr inbounds float* %tmp22695, i64 1
+  %tmp22697 = getelementptr inbounds float* %tmp22696, i64 1
+  %tmp22698 = getelementptr inbounds float* %tmp22697, i64 1
+  %tmp22699 = getelementptr inbounds float* %tmp22698, i64 1
+  %tmp22700 = getelementptr inbounds float* %tmp22699, i64 1
+  %tmp22701 = getelementptr inbounds float* %tmp22700, i64 1
+  %tmp22702 = getelementptr inbounds float* %tmp22701, i64 1
+  %tmp22703 = getelementptr inbounds float* %tmp22702, i64 1
+  %tmp22704 = getelementptr inbounds float* %tmp22703, i64 1
+  %tmp22705 = getelementptr inbounds float* %tmp22704, i64 1
+  %tmp22706 = getelementptr inbounds float* %tmp22705, i64 1
+  %tmp22707 = getelementptr inbounds float* %tmp22706, i64 1
+  %tmp22708 = getelementptr inbounds float* %tmp22707, i64 1
+  %tmp22709 = getelementptr inbounds float* %tmp22708, i64 1
+  %tmp22710 = getelementptr inbounds float* %tmp22709, i64 1
+  %tmp22711 = getelementptr inbounds float* %tmp22710, i64 1
+  %tmp22712 = getelementptr inbounds float* %tmp22711, i64 1
+  %tmp22713 = getelementptr inbounds float* %tmp22712, i64 1
+  %tmp22714 = getelementptr inbounds float* %tmp22713, i64 1
+  %tmp22715 = getelementptr inbounds float* %tmp22714, i64 1
+  %tmp22716 = getelementptr inbounds float* %tmp22715, i64 1
+  %tmp22717 = getelementptr inbounds float* %tmp22716, i64 1
+  %tmp22718 = getelementptr inbounds float* %tmp22717, i64 1
+  %tmp22719 = getelementptr inbounds float* %tmp22718, i64 1
+  %tmp22720 = getelementptr inbounds float* %tmp22719, i64 1
+  %tmp22721 = getelementptr inbounds float* %tmp22720, i64 1
+  %tmp22722 = getelementptr inbounds float* %tmp22721, i64 1
+  %tmp22723 = getelementptr inbounds float* %tmp22722, i64 1
+  %tmp22724 = getelementptr inbounds float* %tmp22723, i64 1
+  %tmp22725 = getelementptr inbounds float* %tmp22724, i64 1
+  %tmp22726 = getelementptr inbounds float* %tmp22725, i64 1
+  %tmp22727 = getelementptr inbounds float* %tmp22726, i64 1
+  %tmp22728 = getelementptr inbounds float* %tmp22727, i64 1
+  %tmp22729 = getelementptr inbounds float* %tmp22728, i64 1
+  %tmp22730 = getelementptr inbounds float* %tmp22729, i64 1
+  %tmp22731 = getelementptr inbounds float* %tmp22730, i64 1
+  %tmp22732 = getelementptr inbounds float* %tmp22731, i64 1
+  %tmp22733 = getelementptr inbounds float* %tmp22732, i64 1
+  %tmp22734 = getelementptr inbounds float* %tmp22733, i64 1
+  %tmp22735 = getelementptr inbounds float* %tmp22734, i64 1
+  %tmp22736 = getelementptr inbounds float* %tmp22735, i64 1
+  %tmp22737 = getelementptr inbounds float* %tmp22736, i64 1
+  %tmp22738 = getelementptr inbounds float* %tmp22737, i64 1
+  %tmp22739 = getelementptr inbounds float* %tmp22738, i64 1
+  %tmp22740 = getelementptr inbounds float* %tmp22739, i64 1
+  %tmp22741 = getelementptr inbounds float* %tmp22740, i64 1
+  %tmp22742 = getelementptr inbounds float* %tmp22741, i64 1
+  %tmp22743 = getelementptr inbounds float* %tmp22742, i64 1
+  %tmp22744 = getelementptr inbounds float* %tmp22743, i64 1
+  %tmp22745 = getelementptr inbounds float* %tmp22744, i64 1
+  %tmp22746 = getelementptr inbounds float* %tmp22745, i64 1
+  %tmp22747 = getelementptr inbounds float* %tmp22746, i64 1
+  %tmp22748 = getelementptr inbounds float* %tmp22747, i64 1
+  %tmp22749 = getelementptr inbounds float* %tmp22748, i64 1
+  %tmp22750 = getelementptr inbounds float* %tmp22749, i64 1
+  %tmp22751 = getelementptr inbounds float* %tmp22750, i64 1
+  %tmp22752 = getelementptr inbounds float* %tmp22751, i64 1
+  %tmp22753 = getelementptr inbounds float* %tmp22752, i64 1
+  %tmp22754 = getelementptr inbounds float* %tmp22753, i64 1
+  %tmp22755 = getelementptr inbounds float* %tmp22754, i64 1
+  %tmp22756 = getelementptr inbounds float* %tmp22755, i64 1
+  %tmp22757 = getelementptr inbounds float* %tmp22756, i64 1
+  %tmp22758 = getelementptr inbounds float* %tmp22757, i64 1
+  %tmp22759 = getelementptr inbounds float* %tmp22758, i64 1
+  %tmp22760 = getelementptr inbounds float* %tmp22759, i64 1
+  %tmp22761 = getelementptr inbounds float* %tmp22760, i64 1
+  %tmp22762 = getelementptr inbounds float* %tmp22761, i64 1
+  %tmp22763 = getelementptr inbounds float* %tmp22762, i64 1
+  %tmp22764 = getelementptr inbounds float* %tmp22763, i64 1
+  %tmp22765 = getelementptr inbounds float* %tmp22764, i64 1
+  %tmp22766 = getelementptr inbounds float* %tmp22765, i64 1
+  %tmp22767 = getelementptr inbounds float* %tmp22766, i64 1
+  %tmp22768 = getelementptr inbounds float* %tmp22767, i64 1
+  %tmp22769 = getelementptr inbounds float* %tmp22768, i64 1
+  %tmp22770 = getelementptr inbounds float* %tmp22769, i64 1
+  %tmp22771 = getelementptr inbounds float* %tmp22770, i64 1
+  %tmp22772 = getelementptr inbounds float* %tmp22771, i64 1
+  %tmp22773 = getelementptr inbounds float* %tmp22772, i64 1
+  %tmp22774 = getelementptr inbounds float* %tmp22773, i64 1
+  %tmp22775 = getelementptr inbounds float* %tmp22774, i64 1
+  %tmp22776 = getelementptr inbounds float* %tmp22775, i64 1
+  %tmp22777 = getelementptr inbounds float* %tmp22776, i64 1
+  %tmp22778 = getelementptr inbounds float* %tmp22777, i64 1
+  %tmp22779 = getelementptr inbounds float* %tmp22778, i64 1
+  %tmp22780 = getelementptr inbounds float* %tmp22779, i64 1
+  %tmp22781 = getelementptr inbounds float* %tmp22780, i64 1
+  %tmp22782 = getelementptr inbounds float* %tmp22781, i64 1
+  %tmp22783 = getelementptr inbounds float* %tmp22782, i64 1
+  %tmp22784 = getelementptr inbounds float* %tmp22783, i64 1
+  %tmp22785 = getelementptr inbounds float* %tmp22784, i64 1
+  %tmp22786 = getelementptr inbounds float* %tmp22785, i64 1
+  %tmp22787 = getelementptr inbounds float* %tmp22786, i64 1
+  %tmp22788 = getelementptr inbounds float* %tmp22787, i64 1
+  %tmp22789 = getelementptr inbounds float* %tmp22788, i64 1
+  %tmp22790 = getelementptr inbounds float* %tmp22789, i64 1
+  %tmp22791 = getelementptr inbounds float* %tmp22790, i64 1
+  %tmp22792 = getelementptr inbounds float* %tmp22791, i64 1
+  %tmp22793 = getelementptr inbounds float* %tmp22792, i64 1
+  %tmp22794 = getelementptr inbounds float* %tmp22793, i64 1
+  %tmp22795 = getelementptr inbounds float* %tmp22794, i64 1
+  %tmp22796 = getelementptr inbounds float* %tmp22795, i64 1
+  %tmp22797 = getelementptr inbounds float* %tmp22796, i64 1
+  %tmp22798 = getelementptr inbounds float* %tmp22797, i64 1
+  %tmp22799 = getelementptr inbounds float* %tmp22798, i64 1
+  %tmp22800 = getelementptr inbounds float* %tmp22799, i64 1
+  %tmp22801 = getelementptr inbounds float* %tmp22800, i64 1
+  %tmp22802 = getelementptr inbounds float* %tmp22801, i64 1
+  %tmp22803 = getelementptr inbounds float* %tmp22802, i64 1
+  %tmp22804 = getelementptr inbounds float* %tmp22803, i64 1
+  %tmp22805 = getelementptr inbounds float* %tmp22804, i64 1
+  %tmp22806 = getelementptr inbounds float* %tmp22805, i64 1
+  %tmp22807 = getelementptr inbounds float* %tmp22806, i64 1
+  %tmp22808 = getelementptr inbounds float* %tmp22807, i64 1
+  %tmp22809 = getelementptr inbounds float* %tmp22808, i64 1
+  %tmp22810 = getelementptr inbounds float* %tmp22809, i64 1
+  %tmp22811 = getelementptr inbounds float* %tmp22810, i64 1
+  %tmp22812 = getelementptr inbounds float* %tmp22811, i64 1
+  %tmp22813 = getelementptr inbounds float* %tmp22812, i64 1
+  %tmp22814 = getelementptr inbounds float* %tmp22813, i64 1
+  %tmp22815 = getelementptr inbounds float* %tmp22814, i64 1
+  %tmp22816 = getelementptr inbounds float* %tmp22815, i64 1
+  %tmp22817 = getelementptr inbounds float* %tmp22816, i64 1
+  %tmp22818 = getelementptr inbounds float* %tmp22817, i64 1
+  %tmp22819 = getelementptr inbounds float* %tmp22818, i64 1
+  %tmp22820 = getelementptr inbounds float* %tmp22819, i64 1
+  %tmp22821 = getelementptr inbounds float* %tmp22820, i64 1
+  %tmp22822 = getelementptr inbounds float* %tmp22821, i64 1
+  %tmp22823 = getelementptr inbounds float* %tmp22822, i64 1
+  %tmp22824 = getelementptr inbounds float* %tmp22823, i64 1
+  %tmp22825 = getelementptr inbounds float* %tmp22824, i64 1
+  %tmp22826 = getelementptr inbounds float* %tmp22825, i64 1
+  %tmp22827 = getelementptr inbounds float* %tmp22826, i64 1
+  %tmp22828 = getelementptr inbounds float* %tmp22827, i64 1
+  %tmp22829 = getelementptr inbounds float* %tmp22828, i64 1
+  %tmp22830 = getelementptr inbounds float* %tmp22829, i64 1
+  %tmp22831 = getelementptr inbounds float* %tmp22830, i64 1
+  %tmp22832 = getelementptr inbounds float* %tmp22831, i64 1
+  %tmp22833 = getelementptr inbounds float* %tmp22832, i64 1
+  %tmp22834 = getelementptr inbounds float* %tmp22833, i64 1
+  %tmp22835 = getelementptr inbounds float* %tmp22834, i64 1
+  %tmp22836 = getelementptr inbounds float* %tmp22835, i64 1
+  %tmp22837 = getelementptr inbounds float* %tmp22836, i64 1
+  %tmp22838 = getelementptr inbounds float* %tmp22837, i64 1
+  %tmp22839 = getelementptr inbounds float* %tmp22838, i64 1
+  %tmp22840 = getelementptr inbounds float* %tmp22839, i64 1
+  %tmp22841 = getelementptr inbounds float* %tmp22840, i64 1
+  %tmp22842 = getelementptr inbounds float* %tmp22841, i64 1
+  %tmp22843 = getelementptr inbounds float* %tmp22842, i64 1
+  %tmp22844 = getelementptr inbounds float* %tmp22843, i64 1
+  %tmp22845 = getelementptr inbounds float* %tmp22844, i64 1
+  %tmp22846 = getelementptr inbounds float* %tmp22845, i64 1
+  %tmp22847 = getelementptr inbounds float* %tmp22846, i64 1
+  %tmp22848 = getelementptr inbounds float* %tmp22847, i64 1
+  %tmp22849 = getelementptr inbounds float* %tmp22848, i64 1
+  %tmp22850 = getelementptr inbounds float* %tmp22849, i64 1
+  %tmp22851 = getelementptr inbounds float* %tmp22850, i64 1
+  %tmp22852 = getelementptr inbounds float* %tmp22851, i64 1
+  %tmp22853 = getelementptr inbounds float* %tmp22852, i64 1
+  %tmp22854 = getelementptr inbounds float* %tmp22853, i64 1
+  %tmp22855 = getelementptr inbounds float* %tmp22854, i64 1
+  %tmp22856 = getelementptr inbounds float* %tmp22855, i64 1
+  %tmp22857 = getelementptr inbounds float* %tmp22856, i64 1
+  %tmp22858 = getelementptr inbounds float* %tmp22857, i64 1
+  %tmp22859 = getelementptr inbounds float* %tmp22858, i64 1
+  %tmp22860 = getelementptr inbounds float* %tmp22859, i64 1
+  %tmp22861 = getelementptr inbounds float* %tmp22860, i64 1
+  %tmp22862 = getelementptr inbounds float* %tmp22861, i64 1
+  %tmp22863 = getelementptr inbounds float* %tmp22862, i64 1
+  %tmp22864 = getelementptr inbounds float* %tmp22863, i64 1
+  %tmp22865 = getelementptr inbounds float* %tmp22864, i64 1
+  %tmp22866 = getelementptr inbounds float* %tmp22865, i64 1
+  %tmp22867 = getelementptr inbounds float* %tmp22866, i64 1
+  %tmp22868 = getelementptr inbounds float* %tmp22867, i64 1
+  %tmp22869 = getelementptr inbounds float* %tmp22868, i64 1
+  %tmp22870 = getelementptr inbounds float* %tmp22869, i64 1
+  %tmp22871 = getelementptr inbounds float* %tmp22870, i64 1
+  %tmp22872 = getelementptr inbounds float* %tmp22871, i64 1
+  %tmp22873 = getelementptr inbounds float* %tmp22872, i64 1
+  %tmp22874 = getelementptr inbounds float* %tmp22873, i64 1
+  %tmp22875 = getelementptr inbounds float* %tmp22874, i64 1
+  %tmp22876 = getelementptr inbounds float* %tmp22875, i64 1
+  %tmp22877 = getelementptr inbounds float* %tmp22876, i64 1
+  %tmp22878 = getelementptr inbounds float* %tmp22877, i64 1
+  %tmp22879 = getelementptr inbounds float* %tmp22878, i64 1
+  %tmp22880 = getelementptr inbounds float* %tmp22879, i64 1
+  %tmp22881 = getelementptr inbounds float* %tmp22880, i64 1
+  %tmp22882 = getelementptr inbounds float* %tmp22881, i64 1
+  %tmp22883 = getelementptr inbounds float* %tmp22882, i64 1
+  %tmp22884 = getelementptr inbounds float* %tmp22883, i64 1
+  %tmp22885 = getelementptr inbounds float* %tmp22884, i64 1
+  %tmp22886 = getelementptr inbounds float* %tmp22885, i64 1
+  %tmp22887 = getelementptr inbounds float* %tmp22886, i64 1
+  %tmp22888 = getelementptr inbounds float* %tmp22887, i64 1
+  %tmp22889 = getelementptr inbounds float* %tmp22888, i64 1
+  %tmp22890 = getelementptr inbounds float* %tmp22889, i64 1
+  %tmp22891 = getelementptr inbounds float* %tmp22890, i64 1
+  %tmp22892 = getelementptr inbounds float* %tmp22891, i64 1
+  %tmp22893 = getelementptr inbounds float* %tmp22892, i64 1
+  %tmp22894 = getelementptr inbounds float* %tmp22893, i64 1
+  %tmp22895 = getelementptr inbounds float* %tmp22894, i64 1
+  %tmp22896 = getelementptr inbounds float* %tmp22895, i64 1
+  %tmp22897 = getelementptr inbounds float* %tmp22896, i64 1
+  %tmp22898 = getelementptr inbounds float* %tmp22897, i64 1
+  %tmp22899 = getelementptr inbounds float* %tmp22898, i64 1
+  %tmp22900 = getelementptr inbounds float* %tmp22899, i64 1
+  %tmp22901 = getelementptr inbounds float* %tmp22900, i64 1
+  %tmp22902 = getelementptr inbounds float* %tmp22901, i64 1
+  %tmp22903 = getelementptr inbounds float* %tmp22902, i64 1
+  %tmp22904 = getelementptr inbounds float* %tmp22903, i64 1
+  %tmp22905 = getelementptr inbounds float* %tmp22904, i64 1
+  %tmp22906 = getelementptr inbounds float* %tmp22905, i64 1
+  %tmp22907 = getelementptr inbounds float* %tmp22906, i64 1
+  %tmp22908 = getelementptr inbounds float* %tmp22907, i64 1
+  %tmp22909 = getelementptr inbounds float* %tmp22908, i64 1
+  %tmp22910 = getelementptr inbounds float* %tmp22909, i64 1
+  %tmp22911 = getelementptr inbounds float* %tmp22910, i64 1
+  %tmp22912 = getelementptr inbounds float* %tmp22911, i64 1
+  %tmp22913 = getelementptr inbounds float* %tmp22912, i64 1
+  %tmp22914 = getelementptr inbounds float* %tmp22913, i64 1
+  %tmp22915 = getelementptr inbounds float* %tmp22914, i64 1
+  %tmp22916 = getelementptr inbounds float* %tmp22915, i64 1
+  %tmp22917 = getelementptr inbounds float* %tmp22916, i64 1
+  %tmp22918 = getelementptr inbounds float* %tmp22917, i64 1
+  %tmp22919 = getelementptr inbounds float* %tmp22918, i64 1
+  %tmp22920 = getelementptr inbounds float* %tmp22919, i64 1
+  %tmp22921 = getelementptr inbounds float* %tmp22920, i64 1
+  %tmp22922 = getelementptr inbounds float* %tmp22921, i64 1
+  %tmp22923 = getelementptr inbounds float* %tmp22922, i64 1
+  %tmp22924 = getelementptr inbounds float* %tmp22923, i64 1
+  %tmp22925 = getelementptr inbounds float* %tmp22924, i64 1
+  %tmp22926 = getelementptr inbounds float* %tmp22925, i64 1
+  %tmp22927 = getelementptr inbounds float* %tmp22926, i64 1
+  %tmp22928 = getelementptr inbounds float* %tmp22927, i64 1
+  %tmp22929 = getelementptr inbounds float* %tmp22928, i64 1
+  %tmp22930 = getelementptr inbounds float* %tmp22929, i64 1
+  %tmp22931 = getelementptr inbounds float* %tmp22930, i64 1
+  %tmp22932 = getelementptr inbounds float* %tmp22931, i64 1
+  %tmp22933 = getelementptr inbounds float* %tmp22932, i64 1
+  %tmp22934 = getelementptr inbounds float* %tmp22933, i64 1
+  %tmp22935 = getelementptr inbounds float* %tmp22934, i64 1
+  %tmp22936 = getelementptr inbounds float* %tmp22935, i64 1
+  %tmp22937 = getelementptr inbounds float* %tmp22936, i64 1
+  %tmp22938 = getelementptr inbounds float* %tmp22937, i64 1
+  %tmp22939 = getelementptr inbounds float* %tmp22938, i64 1
+  %tmp22940 = getelementptr inbounds float* %tmp22939, i64 1
+  %tmp22941 = getelementptr inbounds float* %tmp22940, i64 1
+  %tmp22942 = getelementptr inbounds float* %tmp22941, i64 1
+  %tmp22943 = getelementptr inbounds float* %tmp22942, i64 1
+  %tmp22944 = getelementptr inbounds float* %tmp22943, i64 1
+  %tmp22945 = getelementptr inbounds float* %tmp22944, i64 1
+  %tmp22946 = getelementptr inbounds float* %tmp22945, i64 1
+  %tmp22947 = getelementptr inbounds float* %tmp22946, i64 1
+  %tmp22948 = getelementptr inbounds float* %tmp22947, i64 1
+  %tmp22949 = getelementptr inbounds float* %tmp22948, i64 1
+  %tmp22950 = getelementptr inbounds float* %tmp22949, i64 1
+  %tmp22951 = getelementptr inbounds float* %tmp22950, i64 1
+  %tmp22952 = getelementptr inbounds float* %tmp22951, i64 1
+  %tmp22953 = getelementptr inbounds float* %tmp22952, i64 1
+  %tmp22954 = getelementptr inbounds float* %tmp22953, i64 1
+  %tmp22955 = getelementptr inbounds float* %tmp22954, i64 1
+  %tmp22956 = getelementptr inbounds float* %tmp22955, i64 1
+  %tmp22957 = getelementptr inbounds float* %tmp22956, i64 1
+  %tmp22958 = getelementptr inbounds float* %tmp22957, i64 1
+  %tmp22959 = getelementptr inbounds float* %tmp22958, i64 1
+  %tmp22960 = getelementptr inbounds float* %tmp22959, i64 1
+  %tmp22961 = getelementptr inbounds float* %tmp22960, i64 1
+  %tmp22962 = getelementptr inbounds float* %tmp22961, i64 1
+  %tmp22963 = getelementptr inbounds float* %tmp22962, i64 1
+  %tmp22964 = getelementptr inbounds float* %tmp22963, i64 1
+  %tmp22965 = getelementptr inbounds float* %tmp22964, i64 1
+  %tmp22966 = getelementptr inbounds float* %tmp22965, i64 1
+  %tmp22967 = getelementptr inbounds float* %tmp22966, i64 1
+  %tmp22968 = getelementptr inbounds float* %tmp22967, i64 1
+  %tmp22969 = getelementptr inbounds float* %tmp22968, i64 1
+  %tmp22970 = getelementptr inbounds float* %tmp22969, i64 1
+  %tmp22971 = getelementptr inbounds float* %tmp22970, i64 1
+  %tmp22972 = getelementptr inbounds float* %tmp22971, i64 1
+  %tmp22973 = getelementptr inbounds float* %tmp22972, i64 1
+  %tmp22974 = getelementptr inbounds float* %tmp22973, i64 1
+  %tmp22975 = getelementptr inbounds float* %tmp22974, i64 1
+  %tmp22976 = getelementptr inbounds float* %tmp22975, i64 1
+  %tmp22977 = getelementptr inbounds float* %tmp22976, i64 1
+  %tmp22978 = getelementptr inbounds float* %tmp22977, i64 1
+  %tmp22979 = getelementptr inbounds float* %tmp22978, i64 1
+  %tmp22980 = getelementptr inbounds float* %tmp22979, i64 1
+  %tmp22981 = getelementptr inbounds float* %tmp22980, i64 1
+  %tmp22982 = getelementptr inbounds float* %tmp22981, i64 1
+  %tmp22983 = getelementptr inbounds float* %tmp22982, i64 1
+  %tmp22984 = getelementptr inbounds float* %tmp22983, i64 1
+  %tmp22985 = getelementptr inbounds float* %tmp22984, i64 1
+  %tmp22986 = getelementptr inbounds float* %tmp22985, i64 1
+  %tmp22987 = getelementptr inbounds float* %tmp22986, i64 1
+  %tmp22988 = getelementptr inbounds float* %tmp22987, i64 1
+  %tmp22989 = getelementptr inbounds float* %tmp22988, i64 1
+  %tmp22990 = getelementptr inbounds float* %tmp22989, i64 1
+  %tmp22991 = getelementptr inbounds float* %tmp22990, i64 1
+  %tmp22992 = getelementptr inbounds float* %tmp22991, i64 1
+  %tmp22993 = getelementptr inbounds float* %tmp22992, i64 1
+  %tmp22994 = getelementptr inbounds float* %tmp22993, i64 1
+  %tmp22995 = getelementptr inbounds float* %tmp22994, i64 1
+  %tmp22996 = getelementptr inbounds float* %tmp22995, i64 1
+  %tmp22997 = getelementptr inbounds float* %tmp22996, i64 1
+  %tmp22998 = getelementptr inbounds float* %tmp22997, i64 1
+  %tmp22999 = getelementptr inbounds float* %tmp22998, i64 1
+  %tmp23000 = getelementptr inbounds float* %tmp22999, i64 1
+  %tmp23001 = getelementptr inbounds float* %tmp23000, i64 1
+  %tmp23002 = getelementptr inbounds float* %tmp23001, i64 1
+  %tmp23003 = getelementptr inbounds float* %tmp23002, i64 1
+  %tmp23004 = getelementptr inbounds float* %tmp23003, i64 1
+  %tmp23005 = getelementptr inbounds float* %tmp23004, i64 1
+  %tmp23006 = getelementptr inbounds float* %tmp23005, i64 1
+  %tmp23007 = getelementptr inbounds float* %tmp23006, i64 1
+  %tmp23008 = getelementptr inbounds float* %tmp23007, i64 1
+  %tmp23009 = getelementptr inbounds float* %tmp23008, i64 1
+  %tmp23010 = getelementptr inbounds float* %tmp23009, i64 1
+  %tmp23011 = getelementptr inbounds float* %tmp23010, i64 1
+  %tmp23012 = getelementptr inbounds float* %tmp23011, i64 1
+  %tmp23013 = getelementptr inbounds float* %tmp23012, i64 1
+  %tmp23014 = getelementptr inbounds float* %tmp23013, i64 1
+  %tmp23015 = getelementptr inbounds float* %tmp23014, i64 1
+  %tmp23016 = getelementptr inbounds float* %tmp23015, i64 1
+  %tmp23017 = getelementptr inbounds float* %tmp23016, i64 1
+  %tmp23018 = getelementptr inbounds float* %tmp23017, i64 1
+  %tmp23019 = getelementptr inbounds float* %tmp23018, i64 1
+  %tmp23020 = getelementptr inbounds float* %tmp23019, i64 1
+  %tmp23021 = getelementptr inbounds float* %tmp23020, i64 1
+  %tmp23022 = getelementptr inbounds float* %tmp23021, i64 1
+  %tmp23023 = getelementptr inbounds float* %tmp23022, i64 1
+  %tmp23024 = getelementptr inbounds float* %tmp23023, i64 1
+  %tmp23025 = getelementptr inbounds float* %tmp23024, i64 1
+  %tmp23026 = getelementptr inbounds float* %tmp23025, i64 1
+  %tmp23027 = getelementptr inbounds float* %tmp23026, i64 1
+  %tmp23028 = getelementptr inbounds float* %tmp23027, i64 1
+  %tmp23029 = getelementptr inbounds float* %tmp23028, i64 1
+  %tmp23030 = getelementptr inbounds float* %tmp23029, i64 1
+  %tmp23031 = getelementptr inbounds float* %tmp23030, i64 1
+  %tmp23032 = getelementptr inbounds float* %tmp23031, i64 1
+  %tmp23033 = getelementptr inbounds float* %tmp23032, i64 1
+  %tmp23034 = getelementptr inbounds float* %tmp23033, i64 1
+  %tmp23035 = getelementptr inbounds float* %tmp23034, i64 1
+  %tmp23036 = getelementptr inbounds float* %tmp23035, i64 1
+  %tmp23037 = getelementptr inbounds float* %tmp23036, i64 1
+  %tmp23038 = getelementptr inbounds float* %tmp23037, i64 1
+  %tmp23039 = getelementptr inbounds float* %tmp23038, i64 1
+  %tmp23040 = getelementptr inbounds float* %tmp23039, i64 1
+  %tmp23041 = getelementptr inbounds float* %tmp23040, i64 1
+  %tmp23042 = getelementptr inbounds float* %tmp23041, i64 1
+  %tmp23043 = getelementptr inbounds float* %tmp23042, i64 1
+  %tmp23044 = getelementptr inbounds float* %tmp23043, i64 1
+  %tmp23045 = getelementptr inbounds float* %tmp23044, i64 1
+  %tmp23046 = getelementptr inbounds float* %tmp23045, i64 1
+  %tmp23047 = getelementptr inbounds float* %tmp23046, i64 1
+  %tmp23048 = getelementptr inbounds float* %tmp23047, i64 1
+  %tmp23049 = getelementptr inbounds float* %tmp23048, i64 1
+  %tmp23050 = getelementptr inbounds float* %tmp23049, i64 1
+  %tmp23051 = getelementptr inbounds float* %tmp23050, i64 1
+  %tmp23052 = getelementptr inbounds float* %tmp23051, i64 1
+  %tmp23053 = getelementptr inbounds float* %tmp23052, i64 1
+  %tmp23054 = getelementptr inbounds float* %tmp23053, i64 1
+  %tmp23055 = getelementptr inbounds float* %tmp23054, i64 1
+  %tmp23056 = getelementptr inbounds float* %tmp23055, i64 1
+  %tmp23057 = getelementptr inbounds float* %tmp23056, i64 1
+  %tmp23058 = getelementptr inbounds float* %tmp23057, i64 1
+  %tmp23059 = getelementptr inbounds float* %tmp23058, i64 1
+  %tmp23060 = getelementptr inbounds float* %tmp23059, i64 1
+  %tmp23061 = getelementptr inbounds float* %tmp23060, i64 1
+  %tmp23062 = getelementptr inbounds float* %tmp23061, i64 1
+  %tmp23063 = getelementptr inbounds float* %tmp23062, i64 1
+  %tmp23064 = getelementptr inbounds float* %tmp23063, i64 1
+  %tmp23065 = getelementptr inbounds float* %tmp23064, i64 1
+  %tmp23066 = getelementptr inbounds float* %tmp23065, i64 1
+  %tmp23067 = getelementptr inbounds float* %tmp23066, i64 1
+  %tmp23068 = getelementptr inbounds float* %tmp23067, i64 1
+  %tmp23069 = getelementptr inbounds float* %tmp23068, i64 1
+  %tmp23070 = getelementptr inbounds float* %tmp23069, i64 1
+  %tmp23071 = getelementptr inbounds float* %tmp23070, i64 1
+  %tmp23072 = getelementptr inbounds float* %tmp23071, i64 1
+  %tmp23073 = getelementptr inbounds float* %tmp23072, i64 1
+  %tmp23074 = getelementptr inbounds float* %tmp23073, i64 1
+  %tmp23075 = getelementptr inbounds float* %tmp23074, i64 1
+  %tmp23076 = getelementptr inbounds float* %tmp23075, i64 1
+  %tmp23077 = getelementptr inbounds float* %tmp23076, i64 1
+  %tmp23078 = getelementptr inbounds float* %tmp23077, i64 1
+  %tmp23079 = getelementptr inbounds float* %tmp23078, i64 1
+  %tmp23080 = getelementptr inbounds float* %tmp23079, i64 1
+  %tmp23081 = getelementptr inbounds float* %tmp23080, i64 1
+  %tmp23082 = getelementptr inbounds float* %tmp23081, i64 1
+  %tmp23083 = getelementptr inbounds float* %tmp23082, i64 1
+  %tmp23084 = getelementptr inbounds float* %tmp23083, i64 1
+  %tmp23085 = getelementptr inbounds float* %tmp23084, i64 1
+  %tmp23086 = getelementptr inbounds float* %tmp23085, i64 1
+  %tmp23087 = getelementptr inbounds float* %tmp23086, i64 1
+  %tmp23088 = getelementptr inbounds float* %tmp23087, i64 1
+  %tmp23089 = getelementptr inbounds float* %tmp23088, i64 1
+  %tmp23090 = getelementptr inbounds float* %tmp23089, i64 1
+  %tmp23091 = getelementptr inbounds float* %tmp23090, i64 1
+  %tmp23092 = getelementptr inbounds float* %tmp23091, i64 1
+  %tmp23093 = getelementptr inbounds float* %tmp23092, i64 1
+  %tmp23094 = getelementptr inbounds float* %tmp23093, i64 1
+  %tmp23095 = getelementptr inbounds float* %tmp23094, i64 1
+  %tmp23096 = getelementptr inbounds float* %tmp23095, i64 1
+  %tmp23097 = getelementptr inbounds float* %tmp23096, i64 1
+  %tmp23098 = getelementptr inbounds float* %tmp23097, i64 1
+  %tmp23099 = getelementptr inbounds float* %tmp23098, i64 1
+  %tmp23100 = getelementptr inbounds float* %tmp23099, i64 1
+  %tmp23101 = getelementptr inbounds float* %tmp23100, i64 1
+  %tmp23102 = getelementptr inbounds float* %tmp23101, i64 1
+  %tmp23103 = getelementptr inbounds float* %tmp23102, i64 1
+  %tmp23104 = getelementptr inbounds float* %tmp23103, i64 1
+  %tmp23105 = getelementptr inbounds float* %tmp23104, i64 1
+  %tmp23106 = getelementptr inbounds float* %tmp23105, i64 1
+  %tmp23107 = getelementptr inbounds float* %tmp23106, i64 1
+  %tmp23108 = getelementptr inbounds float* %tmp23107, i64 1
+  %tmp23109 = getelementptr inbounds float* %tmp23108, i64 1
+  %tmp23110 = getelementptr inbounds float* %tmp23109, i64 1
+  %tmp23111 = getelementptr inbounds float* %tmp23110, i64 1
+  %tmp23112 = getelementptr inbounds float* %tmp23111, i64 1
+  %tmp23113 = getelementptr inbounds float* %tmp23112, i64 1
+  %tmp23114 = getelementptr inbounds float* %tmp23113, i64 1
+  %tmp23115 = getelementptr inbounds float* %tmp23114, i64 1
+  %tmp23116 = getelementptr inbounds float* %tmp23115, i64 1
+  %tmp23117 = getelementptr inbounds float* %tmp23116, i64 1
+  %tmp23118 = getelementptr inbounds float* %tmp23117, i64 1
+  %tmp23119 = getelementptr inbounds float* %tmp23118, i64 1
+  %tmp23120 = getelementptr inbounds float* %tmp23119, i64 1
+  %tmp23121 = getelementptr inbounds float* %tmp23120, i64 1
+  %tmp23122 = getelementptr inbounds float* %tmp23121, i64 1
+  %tmp23123 = getelementptr inbounds float* %tmp23122, i64 1
+  %tmp23124 = getelementptr inbounds float* %tmp23123, i64 1
+  %tmp23125 = getelementptr inbounds float* %tmp23124, i64 1
+  %tmp23126 = getelementptr inbounds float* %tmp23125, i64 1
+  %tmp23127 = getelementptr inbounds float* %tmp23126, i64 1
+  %tmp23128 = getelementptr inbounds float* %tmp23127, i64 1
+  %tmp23129 = getelementptr inbounds float* %tmp23128, i64 1
+  %tmp23130 = getelementptr inbounds float* %tmp23129, i64 1
+  %tmp23131 = getelementptr inbounds float* %tmp23130, i64 1
+  %tmp23132 = getelementptr inbounds float* %tmp23131, i64 1
+  %tmp23133 = getelementptr inbounds float* %tmp23132, i64 1
+  %tmp23134 = getelementptr inbounds float* %tmp23133, i64 1
+  %tmp23135 = getelementptr inbounds float* %tmp23134, i64 1
+  %tmp23136 = getelementptr inbounds float* %tmp23135, i64 1
+  %tmp23137 = getelementptr inbounds float* %tmp23136, i64 1
+  %tmp23138 = getelementptr inbounds float* %tmp23137, i64 1
+  %tmp23139 = getelementptr inbounds float* %tmp23138, i64 1
+  %tmp23140 = getelementptr inbounds float* %tmp23139, i64 1
+  %tmp23141 = getelementptr inbounds float* %tmp23140, i64 1
+  %tmp23142 = getelementptr inbounds float* %tmp23141, i64 1
+  %tmp23143 = getelementptr inbounds float* %tmp23142, i64 1
+  %tmp23144 = getelementptr inbounds float* %tmp23143, i64 1
+  %tmp23145 = getelementptr inbounds float* %tmp23144, i64 1
+  %tmp23146 = getelementptr inbounds float* %tmp23145, i64 1
+  %tmp23147 = getelementptr inbounds float* %tmp23146, i64 1
+  %tmp23148 = getelementptr inbounds float* %tmp23147, i64 1
+  %tmp23149 = getelementptr inbounds float* %tmp23148, i64 1
+  %tmp23150 = getelementptr inbounds float* %tmp23149, i64 1
+  %tmp23151 = getelementptr inbounds float* %tmp23150, i64 1
+  %tmp23152 = getelementptr inbounds float* %tmp23151, i64 1
+  %tmp23153 = getelementptr inbounds float* %tmp23152, i64 1
+  %tmp23154 = getelementptr inbounds float* %tmp23153, i64 1
+  %tmp23155 = getelementptr inbounds float* %tmp23154, i64 1
+  %tmp23156 = getelementptr inbounds float* %tmp23155, i64 1
+  %tmp23157 = getelementptr inbounds float* %tmp23156, i64 1
+  %tmp23158 = getelementptr inbounds float* %tmp23157, i64 1
+  %tmp23159 = getelementptr inbounds float* %tmp23158, i64 1
+  %tmp23160 = getelementptr inbounds float* %tmp23159, i64 1
+  %tmp23161 = getelementptr inbounds float* %tmp23160, i64 1
+  %tmp23162 = getelementptr inbounds float* %tmp23161, i64 1
+  %tmp23163 = getelementptr inbounds float* %tmp23162, i64 1
+  %tmp23164 = getelementptr inbounds float* %tmp23163, i64 1
+  %tmp23165 = getelementptr inbounds float* %tmp23164, i64 1
+  %tmp23166 = getelementptr inbounds float* %tmp23165, i64 1
+  %tmp23167 = getelementptr inbounds float* %tmp23166, i64 1
+  %tmp23168 = getelementptr inbounds float* %tmp23167, i64 1
+  %tmp23169 = getelementptr inbounds float* %tmp23168, i64 1
+  %tmp23170 = getelementptr inbounds float* %tmp23169, i64 1
+  %tmp23171 = getelementptr inbounds float* %tmp23170, i64 1
+  %tmp23172 = getelementptr inbounds float* %tmp23171, i64 1
+  %tmp23173 = getelementptr inbounds float* %tmp23172, i64 1
+  %tmp23174 = getelementptr inbounds float* %tmp23173, i64 1
+  %tmp23175 = getelementptr inbounds float* %tmp23174, i64 1
+  %tmp23176 = getelementptr inbounds float* %tmp23175, i64 1
+  %tmp23177 = getelementptr inbounds float* %tmp23176, i64 1
+  %tmp23178 = getelementptr inbounds float* %tmp23177, i64 1
+  %tmp23179 = getelementptr inbounds float* %tmp23178, i64 1
+  %tmp23180 = getelementptr inbounds float* %tmp23179, i64 1
+  %tmp23181 = getelementptr inbounds float* %tmp23180, i64 1
+  %tmp23182 = getelementptr inbounds float* %tmp23181, i64 1
+  %tmp23183 = getelementptr inbounds float* %tmp23182, i64 1
+  %tmp23184 = getelementptr inbounds float* %tmp23183, i64 1
+  %tmp23185 = getelementptr inbounds float* %tmp23184, i64 1
+  %tmp23186 = getelementptr inbounds float* %tmp23185, i64 1
+  %tmp23187 = getelementptr inbounds float* %tmp23186, i64 1
+  %tmp23188 = getelementptr inbounds float* %tmp23187, i64 1
+  %tmp23189 = getelementptr inbounds float* %tmp23188, i64 1
+  %tmp23190 = getelementptr inbounds float* %tmp23189, i64 1
+  %tmp23191 = getelementptr inbounds float* %tmp23190, i64 1
+  %tmp23192 = getelementptr inbounds float* %tmp23191, i64 1
+  %tmp23193 = getelementptr inbounds float* %tmp23192, i64 1
+  %tmp23194 = getelementptr inbounds float* %tmp23193, i64 1
+  %tmp23195 = getelementptr inbounds float* %tmp23194, i64 1
+  %tmp23196 = getelementptr inbounds float* %tmp23195, i64 1
+  %tmp23197 = getelementptr inbounds float* %tmp23196, i64 1
+  %tmp23198 = getelementptr inbounds float* %tmp23197, i64 1
+  %tmp23199 = getelementptr inbounds float* %tmp23198, i64 1
+  %tmp23200 = getelementptr inbounds float* %tmp23199, i64 1
+  %tmp23201 = getelementptr inbounds float* %tmp23200, i64 1
+  %tmp23202 = getelementptr inbounds float* %tmp23201, i64 1
+  %tmp23203 = getelementptr inbounds float* %tmp23202, i64 1
+  %tmp23204 = getelementptr inbounds float* %tmp23203, i64 1
+  %tmp23205 = getelementptr inbounds float* %tmp23204, i64 1
+  %tmp23206 = getelementptr inbounds float* %tmp23205, i64 1
+  %tmp23207 = getelementptr inbounds float* %tmp23206, i64 1
+  %tmp23208 = getelementptr inbounds float* %tmp23207, i64 1
+  %tmp23209 = getelementptr inbounds float* %tmp23208, i64 1
+  %tmp23210 = getelementptr inbounds float* %tmp23209, i64 1
+  %tmp23211 = getelementptr inbounds float* %tmp23210, i64 1
+  %tmp23212 = getelementptr inbounds float* %tmp23211, i64 1
+  %tmp23213 = getelementptr inbounds float* %tmp23212, i64 1
+  %tmp23214 = getelementptr inbounds float* %tmp23213, i64 1
+  %tmp23215 = getelementptr inbounds float* %tmp23214, i64 1
+  %tmp23216 = getelementptr inbounds float* %tmp23215, i64 1
+  %tmp23217 = getelementptr inbounds float* %tmp23216, i64 1
+  %tmp23218 = getelementptr inbounds float* %tmp23217, i64 1
+  %tmp23219 = getelementptr inbounds float* %tmp23218, i64 1
+  %tmp23220 = getelementptr inbounds float* %tmp23219, i64 1
+  %tmp23221 = getelementptr inbounds float* %tmp23220, i64 1
+  %tmp23222 = getelementptr inbounds float* %tmp23221, i64 1
+  %tmp23223 = getelementptr inbounds float* %tmp23222, i64 1
+  %tmp23224 = getelementptr inbounds float* %tmp23223, i64 1
+  %tmp23225 = getelementptr inbounds float* %tmp23224, i64 1
+  %tmp23226 = getelementptr inbounds float* %tmp23225, i64 1
+  %tmp23227 = getelementptr inbounds float* %tmp23226, i64 1
+  %tmp23228 = getelementptr inbounds float* %tmp23227, i64 1
+  %tmp23229 = getelementptr inbounds float* %tmp23228, i64 1
+  %tmp23230 = getelementptr inbounds float* %tmp23229, i64 1
+  %tmp23231 = getelementptr inbounds float* %tmp23230, i64 1
+  %tmp23232 = getelementptr inbounds float* %tmp23231, i64 1
+  %tmp23233 = getelementptr inbounds float* %tmp23232, i64 1
+  %tmp23234 = getelementptr inbounds float* %tmp23233, i64 1
+  %tmp23235 = getelementptr inbounds float* %tmp23234, i64 1
+  %tmp23236 = getelementptr inbounds float* %tmp23235, i64 1
+  %tmp23237 = getelementptr inbounds float* %tmp23236, i64 1
+  %tmp23238 = getelementptr inbounds float* %tmp23237, i64 1
+  %tmp23239 = getelementptr inbounds float* %tmp23238, i64 1
+  %tmp23240 = getelementptr inbounds float* %tmp23239, i64 1
+  %tmp23241 = getelementptr inbounds float* %tmp23240, i64 1
+  %tmp23242 = getelementptr inbounds float* %tmp23241, i64 1
+  %tmp23243 = getelementptr inbounds float* %tmp23242, i64 1
+  %tmp23244 = getelementptr inbounds float* %tmp23243, i64 1
+  %tmp23245 = getelementptr inbounds float* %tmp23244, i64 1
+  %tmp23246 = getelementptr inbounds float* %tmp23245, i64 1
+  %tmp23247 = getelementptr inbounds float* %tmp23246, i64 1
+  %tmp23248 = getelementptr inbounds float* %tmp23247, i64 1
+  %tmp23249 = getelementptr inbounds float* %tmp23248, i64 1
+  %tmp23250 = getelementptr inbounds float* %tmp23249, i64 1
+  %tmp23251 = getelementptr inbounds float* %tmp23250, i64 1
+  %tmp23252 = getelementptr inbounds float* %tmp23251, i64 1
+  %tmp23253 = getelementptr inbounds float* %tmp23252, i64 1
+  %tmp23254 = getelementptr inbounds float* %tmp23253, i64 1
+  %tmp23255 = getelementptr inbounds float* %tmp23254, i64 1
+  %tmp23256 = getelementptr inbounds float* %tmp23255, i64 1
+  %tmp23257 = getelementptr inbounds float* %tmp23256, i64 1
+  %tmp23258 = getelementptr inbounds float* %tmp23257, i64 1
+  %tmp23259 = getelementptr inbounds float* %tmp23258, i64 1
+  %tmp23260 = getelementptr inbounds float* %tmp23259, i64 1
+  %tmp23261 = getelementptr inbounds float* %tmp23260, i64 1
+  %tmp23262 = getelementptr inbounds float* %tmp23261, i64 1
+  %tmp23263 = getelementptr inbounds float* %tmp23262, i64 1
+  %tmp23264 = getelementptr inbounds float* %tmp23263, i64 1
+  %tmp23265 = getelementptr inbounds float* %tmp23264, i64 1
+  %tmp23266 = getelementptr inbounds float* %tmp23265, i64 1
+  %tmp23267 = getelementptr inbounds float* %tmp23266, i64 1
+  %tmp23268 = getelementptr inbounds float* %tmp23267, i64 1
+  %tmp23269 = getelementptr inbounds float* %tmp23268, i64 1
+  %tmp23270 = getelementptr inbounds float* %tmp23269, i64 1
+  %tmp23271 = getelementptr inbounds float* %tmp23270, i64 1
+  %tmp23272 = getelementptr inbounds float* %tmp23271, i64 1
+  %tmp23273 = getelementptr inbounds float* %tmp23272, i64 1
+  %tmp23274 = getelementptr inbounds float* %tmp23273, i64 1
+  %tmp23275 = getelementptr inbounds float* %tmp23274, i64 1
+  %tmp23276 = getelementptr inbounds float* %tmp23275, i64 1
+  %tmp23277 = getelementptr inbounds float* %tmp23276, i64 1
+  %tmp23278 = getelementptr inbounds float* %tmp23277, i64 1
+  %tmp23279 = getelementptr inbounds float* %tmp23278, i64 1
+  %tmp23280 = getelementptr inbounds float* %tmp23279, i64 1
+  %tmp23281 = getelementptr inbounds float* %tmp23280, i64 1
+  %tmp23282 = getelementptr inbounds float* %tmp23281, i64 1
+  %tmp23283 = getelementptr inbounds float* %tmp23282, i64 1
+  %tmp23284 = getelementptr inbounds float* %tmp23283, i64 1
+  %tmp23285 = getelementptr inbounds float* %tmp23284, i64 1
+  %tmp23286 = getelementptr inbounds float* %tmp23285, i64 1
+  %tmp23287 = getelementptr inbounds float* %tmp23286, i64 1
+  %tmp23288 = getelementptr inbounds float* %tmp23287, i64 1
+  %tmp23289 = getelementptr inbounds float* %tmp23288, i64 1
+  %tmp23290 = getelementptr inbounds float* %tmp23289, i64 1
+  %tmp23291 = getelementptr inbounds float* %tmp23290, i64 1
+  %tmp23292 = getelementptr inbounds float* %tmp23291, i64 1
+  %tmp23293 = getelementptr inbounds float* %tmp23292, i64 1
+  %tmp23294 = getelementptr inbounds float* %tmp23293, i64 1
+  %tmp23295 = getelementptr inbounds float* %tmp23294, i64 1
+  %tmp23296 = getelementptr inbounds float* %tmp23295, i64 1
+  %tmp23297 = getelementptr inbounds float* %tmp23296, i64 1
+  %tmp23298 = getelementptr inbounds float* %tmp23297, i64 1
+  %tmp23299 = getelementptr inbounds float* %tmp23298, i64 1
+  %tmp23300 = getelementptr inbounds float* %tmp23299, i64 1
+  %tmp23301 = getelementptr inbounds float* %tmp23300, i64 1
+  %tmp23302 = getelementptr inbounds float* %tmp23301, i64 1
+  %tmp23303 = getelementptr inbounds float* %tmp23302, i64 1
+  %tmp23304 = getelementptr inbounds float* %tmp23303, i64 1
+  %tmp23305 = getelementptr inbounds float* %tmp23304, i64 1
+  %tmp23306 = getelementptr inbounds float* %tmp23305, i64 1
+  %tmp23307 = getelementptr inbounds float* %tmp23306, i64 1
+  %tmp23308 = getelementptr inbounds float* %tmp23307, i64 1
+  %tmp23309 = getelementptr inbounds float* %tmp23308, i64 1
+  %tmp23310 = getelementptr inbounds float* %tmp23309, i64 1
+  %tmp23311 = getelementptr inbounds float* %tmp23310, i64 1
+  %tmp23312 = getelementptr inbounds float* %tmp23311, i64 1
+  %tmp23313 = getelementptr inbounds float* %tmp23312, i64 1
+  %tmp23314 = getelementptr inbounds float* %tmp23313, i64 1
+  %tmp23315 = getelementptr inbounds float* %tmp23314, i64 1
+  %tmp23316 = getelementptr inbounds float* %tmp23315, i64 1
+  %tmp23317 = getelementptr inbounds float* %tmp23316, i64 1
+  %tmp23318 = getelementptr inbounds float* %tmp23317, i64 1
+  %tmp23319 = getelementptr inbounds float* %tmp23318, i64 1
+  %tmp23320 = getelementptr inbounds float* %tmp23319, i64 1
+  %tmp23321 = getelementptr inbounds float* %tmp23320, i64 1
+  %tmp23322 = getelementptr inbounds float* %tmp23321, i64 1
+  %tmp23323 = getelementptr inbounds float* %tmp23322, i64 1
+  %tmp23324 = getelementptr inbounds float* %tmp23323, i64 1
+  %tmp23325 = getelementptr inbounds float* %tmp23324, i64 1
+  %tmp23326 = getelementptr inbounds float* %tmp23325, i64 1
+  %tmp23327 = getelementptr inbounds float* %tmp23326, i64 1
+  %tmp23328 = getelementptr inbounds float* %tmp23327, i64 1
+  %tmp23329 = getelementptr inbounds float* %tmp23328, i64 1
+  %tmp23330 = getelementptr inbounds float* %tmp23329, i64 1
+  %tmp23331 = getelementptr inbounds float* %tmp23330, i64 1
+  %tmp23332 = getelementptr inbounds float* %tmp23331, i64 1
+  %tmp23333 = getelementptr inbounds float* %tmp23332, i64 1
+  %tmp23334 = getelementptr inbounds float* %tmp23333, i64 1
+  %tmp23335 = getelementptr inbounds float* %tmp23334, i64 1
+  %tmp23336 = getelementptr inbounds float* %tmp23335, i64 1
+  %tmp23337 = getelementptr inbounds float* %tmp23336, i64 1
+  %tmp23338 = getelementptr inbounds float* %tmp23337, i64 1
+  %tmp23339 = getelementptr inbounds float* %tmp23338, i64 1
+  %tmp23340 = getelementptr inbounds float* %tmp23339, i64 1
+  %tmp23341 = getelementptr inbounds float* %tmp23340, i64 1
+  %tmp23342 = getelementptr inbounds float* %tmp23341, i64 1
+  %tmp23343 = getelementptr inbounds float* %tmp23342, i64 1
+  %tmp23344 = getelementptr inbounds float* %tmp23343, i64 1
+  %tmp23345 = getelementptr inbounds float* %tmp23344, i64 1
+  %tmp23346 = getelementptr inbounds float* %tmp23345, i64 1
+  %tmp23347 = getelementptr inbounds float* %tmp23346, i64 1
+  %tmp23348 = getelementptr inbounds float* %tmp23347, i64 1
+  %tmp23349 = getelementptr inbounds float* %tmp23348, i64 1
+  %tmp23350 = getelementptr inbounds float* %tmp23349, i64 1
+  %tmp23351 = getelementptr inbounds float* %tmp23350, i64 1
+  %tmp23352 = getelementptr inbounds float* %tmp23351, i64 1
+  %tmp23353 = getelementptr inbounds float* %tmp23352, i64 1
+  %tmp23354 = getelementptr inbounds float* %tmp23353, i64 1
+  %tmp23355 = getelementptr inbounds float* %tmp23354, i64 1
+  %tmp23356 = getelementptr inbounds float* %tmp23355, i64 1
+  %tmp23357 = getelementptr inbounds float* %tmp23356, i64 1
+  %tmp23358 = getelementptr inbounds float* %tmp23357, i64 1
+  %tmp23359 = getelementptr inbounds float* %tmp23358, i64 1
+  %tmp23360 = getelementptr inbounds float* %tmp23359, i64 1
+  %tmp23361 = getelementptr inbounds float* %tmp23360, i64 1
+  %tmp23362 = getelementptr inbounds float* %tmp23361, i64 1
+  %tmp23363 = getelementptr inbounds float* %tmp23362, i64 1
+  %tmp23364 = getelementptr inbounds float* %tmp23363, i64 1
+  %tmp23365 = getelementptr inbounds float* %tmp23364, i64 1
+  %tmp23366 = getelementptr inbounds float* %tmp23365, i64 1
+  %tmp23367 = getelementptr inbounds float* %tmp23366, i64 1
+  %tmp23368 = getelementptr inbounds float* %tmp23367, i64 1
+  %tmp23369 = getelementptr inbounds float* %tmp23368, i64 1
+  %tmp23370 = getelementptr inbounds float* %tmp23369, i64 1
+  %tmp23371 = getelementptr inbounds float* %tmp23370, i64 1
+  %tmp23372 = getelementptr inbounds float* %tmp23371, i64 1
+  %tmp23373 = getelementptr inbounds float* %tmp23372, i64 1
+  %tmp23374 = getelementptr inbounds float* %tmp23373, i64 1
+  %tmp23375 = getelementptr inbounds float* %tmp23374, i64 1
+  %tmp23376 = getelementptr inbounds float* %tmp23375, i64 1
+  %tmp23377 = getelementptr inbounds float* %tmp23376, i64 1
+  %tmp23378 = getelementptr inbounds float* %tmp23377, i64 1
+  %tmp23379 = getelementptr inbounds float* %tmp23378, i64 1
+  %tmp23380 = getelementptr inbounds float* %tmp23379, i64 1
+  %tmp23381 = getelementptr inbounds float* %tmp23380, i64 1
+  %tmp23382 = getelementptr inbounds float* %tmp23381, i64 1
+  %tmp23383 = getelementptr inbounds float* %tmp23382, i64 1
+  %tmp23384 = getelementptr inbounds float* %tmp23383, i64 1
+  %tmp23385 = getelementptr inbounds float* %tmp23384, i64 1
+  %tmp23386 = getelementptr inbounds float* %tmp23385, i64 1
+  %tmp23387 = getelementptr inbounds float* %tmp23386, i64 1
+  %tmp23388 = getelementptr inbounds float* %tmp23387, i64 1
+  %tmp23389 = getelementptr inbounds float* %tmp23388, i64 1
+  %tmp23390 = getelementptr inbounds float* %tmp23389, i64 1
+  %tmp23391 = getelementptr inbounds float* %tmp23390, i64 1
+  %tmp23392 = getelementptr inbounds float* %tmp23391, i64 1
+  %tmp23393 = getelementptr inbounds float* %tmp23392, i64 1
+  %tmp23394 = getelementptr inbounds float* %tmp23393, i64 1
+  %tmp23395 = getelementptr inbounds float* %tmp23394, i64 1
+  %tmp23396 = getelementptr inbounds float* %tmp23395, i64 1
+  %tmp23397 = getelementptr inbounds float* %tmp23396, i64 1
+  %tmp23398 = getelementptr inbounds float* %tmp23397, i64 1
+  %tmp23399 = getelementptr inbounds float* %tmp23398, i64 1
+  %tmp23400 = getelementptr inbounds float* %tmp23399, i64 1
+  %tmp23401 = getelementptr inbounds float* %tmp23400, i64 1
+  %tmp23402 = getelementptr inbounds float* %tmp23401, i64 1
+  %tmp23403 = getelementptr inbounds float* %tmp23402, i64 1
+  %tmp23404 = getelementptr inbounds float* %tmp23403, i64 1
+  %tmp23405 = getelementptr inbounds float* %tmp23404, i64 1
+  %tmp23406 = getelementptr inbounds float* %tmp23405, i64 1
+  %tmp23407 = getelementptr inbounds float* %tmp23406, i64 1
+  %tmp23408 = getelementptr inbounds float* %tmp23407, i64 1
+  %tmp23409 = getelementptr inbounds float* %tmp23408, i64 1
+  %tmp23410 = getelementptr inbounds float* %tmp23409, i64 1
+  %tmp23411 = getelementptr inbounds float* %tmp23410, i64 1
+  %tmp23412 = getelementptr inbounds float* %tmp23411, i64 1
+  %tmp23413 = getelementptr inbounds float* %tmp23412, i64 1
+  %tmp23414 = getelementptr inbounds float* %tmp23413, i64 1
+  %tmp23415 = getelementptr inbounds float* %tmp23414, i64 1
+  %tmp23416 = getelementptr inbounds float* %tmp23415, i64 1
+  %tmp23417 = getelementptr inbounds float* %tmp23416, i64 1
+  %tmp23418 = getelementptr inbounds float* %tmp23417, i64 1
+  %tmp23419 = getelementptr inbounds float* %tmp23418, i64 1
+  %tmp23420 = getelementptr inbounds float* %tmp23419, i64 1
+  %tmp23421 = getelementptr inbounds float* %tmp23420, i64 1
+  %tmp23422 = getelementptr inbounds float* %tmp23421, i64 1
+  %tmp23423 = getelementptr inbounds float* %tmp23422, i64 1
+  %tmp23424 = getelementptr inbounds float* %tmp23423, i64 1
+  %tmp23425 = getelementptr inbounds float* %tmp23424, i64 1
+  %tmp23426 = getelementptr inbounds float* %tmp23425, i64 1
+  %tmp23427 = getelementptr inbounds float* %tmp23426, i64 1
+  %tmp23428 = getelementptr inbounds float* %tmp23427, i64 1
+  %tmp23429 = getelementptr inbounds float* %tmp23428, i64 1
+  %tmp23430 = getelementptr inbounds float* %tmp23429, i64 1
+  %tmp23431 = getelementptr inbounds float* %tmp23430, i64 1
+  %tmp23432 = getelementptr inbounds float* %tmp23431, i64 1
+  %tmp23433 = getelementptr inbounds float* %tmp23432, i64 1
+  %tmp23434 = getelementptr inbounds float* %tmp23433, i64 1
+  %tmp23435 = getelementptr inbounds float* %tmp23434, i64 1
+  %tmp23436 = getelementptr inbounds float* %tmp23435, i64 1
+  %tmp23437 = getelementptr inbounds float* %tmp23436, i64 1
+  %tmp23438 = getelementptr inbounds float* %tmp23437, i64 1
+  %tmp23439 = getelementptr inbounds float* %tmp23438, i64 1
+  %tmp23440 = getelementptr inbounds float* %tmp23439, i64 1
+  %tmp23441 = getelementptr inbounds float* %tmp23440, i64 1
+  %tmp23442 = getelementptr inbounds float* %tmp23441, i64 1
+  %tmp23443 = getelementptr inbounds float* %tmp23442, i64 1
+  %tmp23444 = getelementptr inbounds float* %tmp23443, i64 1
+  %tmp23445 = getelementptr inbounds float* %tmp23444, i64 1
+  %tmp23446 = getelementptr inbounds float* %tmp23445, i64 1
+  %tmp23447 = getelementptr inbounds float* %tmp23446, i64 1
+  %tmp23448 = getelementptr inbounds float* %tmp23447, i64 1
+  %tmp23449 = getelementptr inbounds float* %tmp23448, i64 1
+  %tmp23450 = getelementptr inbounds float* %tmp23449, i64 1
+  %tmp23451 = getelementptr inbounds float* %tmp23450, i64 1
+  %tmp23452 = getelementptr inbounds float* %tmp23451, i64 1
+  %tmp23453 = getelementptr inbounds float* %tmp23452, i64 1
+  %tmp23454 = getelementptr inbounds float* %tmp23453, i64 1
+  %tmp23455 = getelementptr inbounds float* %tmp23454, i64 1
+  %tmp23456 = getelementptr inbounds float* %tmp23455, i64 1
+  %tmp23457 = getelementptr inbounds float* %tmp23456, i64 1
+  %tmp23458 = getelementptr inbounds float* %tmp23457, i64 1
+  %tmp23459 = getelementptr inbounds float* %tmp23458, i64 1
+  %tmp23460 = getelementptr inbounds float* %tmp23459, i64 1
+  %tmp23461 = getelementptr inbounds float* %tmp23460, i64 1
+  %tmp23462 = getelementptr inbounds float* %tmp23461, i64 1
+  %tmp23463 = getelementptr inbounds float* %tmp23462, i64 1
+  %tmp23464 = getelementptr inbounds float* %tmp23463, i64 1
+  %tmp23465 = getelementptr inbounds float* %tmp23464, i64 1
+  %tmp23466 = getelementptr inbounds float* %tmp23465, i64 1
+  %tmp23467 = getelementptr inbounds float* %tmp23466, i64 1
+  %tmp23468 = getelementptr inbounds float* %tmp23467, i64 1
+  %tmp23469 = getelementptr inbounds float* %tmp23468, i64 1
+  %tmp23470 = getelementptr inbounds float* %tmp23469, i64 1
+  %tmp23471 = getelementptr inbounds float* %tmp23470, i64 1
+  %tmp23472 = getelementptr inbounds float* %tmp23471, i64 1
+  %tmp23473 = getelementptr inbounds float* %tmp23472, i64 1
+  %tmp23474 = getelementptr inbounds float* %tmp23473, i64 1
+  %tmp23475 = getelementptr inbounds float* %tmp23474, i64 1
+  %tmp23476 = getelementptr inbounds float* %tmp23475, i64 1
+  %tmp23477 = getelementptr inbounds float* %tmp23476, i64 1
+  %tmp23478 = getelementptr inbounds float* %tmp23477, i64 1
+  %tmp23479 = getelementptr inbounds float* %tmp23478, i64 1
+  %tmp23480 = getelementptr inbounds float* %tmp23479, i64 1
+  %tmp23481 = getelementptr inbounds float* %tmp23480, i64 1
+  %tmp23482 = getelementptr inbounds float* %tmp23481, i64 1
+  %tmp23483 = getelementptr inbounds float* %tmp23482, i64 1
+  %tmp23484 = getelementptr inbounds float* %tmp23483, i64 1
+  %tmp23485 = getelementptr inbounds float* %tmp23484, i64 1
+  %tmp23486 = getelementptr inbounds float* %tmp23485, i64 1
+  %tmp23487 = getelementptr inbounds float* %tmp23486, i64 1
+  %tmp23488 = getelementptr inbounds float* %tmp23487, i64 1
+  %tmp23489 = getelementptr inbounds float* %tmp23488, i64 1
+  %tmp23490 = getelementptr inbounds float* %tmp23489, i64 1
+  %tmp23491 = getelementptr inbounds float* %tmp23490, i64 1
+  %tmp23492 = getelementptr inbounds float* %tmp23491, i64 1
+  %tmp23493 = getelementptr inbounds float* %tmp23492, i64 1
+  %tmp23494 = getelementptr inbounds float* %tmp23493, i64 1
+  %tmp23495 = getelementptr inbounds float* %tmp23494, i64 1
+  %tmp23496 = getelementptr inbounds float* %tmp23495, i64 1
+  %tmp23497 = getelementptr inbounds float* %tmp23496, i64 1
+  %tmp23498 = getelementptr inbounds float* %tmp23497, i64 1
+  %tmp23499 = getelementptr inbounds float* %tmp23498, i64 1
+  %tmp23500 = getelementptr inbounds float* %tmp23499, i64 1
+  %tmp23501 = getelementptr inbounds float* %tmp23500, i64 1
+  %tmp23502 = getelementptr inbounds float* %tmp23501, i64 1
+  %tmp23503 = getelementptr inbounds float* %tmp23502, i64 1
+  %tmp23504 = getelementptr inbounds float* %tmp23503, i64 1
+  %tmp23505 = getelementptr inbounds float* %tmp23504, i64 1
+  %tmp23506 = getelementptr inbounds float* %tmp23505, i64 1
+  %tmp23507 = getelementptr inbounds float* %tmp23506, i64 1
+  %tmp23508 = getelementptr inbounds float* %tmp23507, i64 1
+  %tmp23509 = getelementptr inbounds float* %tmp23508, i64 1
+  %tmp23510 = getelementptr inbounds float* %tmp23509, i64 1
+  %tmp23511 = getelementptr inbounds float* %tmp23510, i64 1
+  %tmp23512 = getelementptr inbounds float* %tmp23511, i64 1
+  %tmp23513 = getelementptr inbounds float* %tmp23512, i64 1
+  %tmp23514 = getelementptr inbounds float* %tmp23513, i64 1
+  %tmp23515 = getelementptr inbounds float* %tmp23514, i64 1
+  %tmp23516 = getelementptr inbounds float* %tmp23515, i64 1
+  %tmp23517 = getelementptr inbounds float* %tmp23516, i64 1
+  %tmp23518 = getelementptr inbounds float* %tmp23517, i64 1
+  %tmp23519 = getelementptr inbounds float* %tmp23518, i64 1
+  %tmp23520 = getelementptr inbounds float* %tmp23519, i64 1
+  %tmp23521 = getelementptr inbounds float* %tmp23520, i64 1
+  %tmp23522 = getelementptr inbounds float* %tmp23521, i64 1
+  %tmp23523 = getelementptr inbounds float* %tmp23522, i64 1
+  %tmp23524 = getelementptr inbounds float* %tmp23523, i64 1
+  %tmp23525 = getelementptr inbounds float* %tmp23524, i64 1
+  %tmp23526 = getelementptr inbounds float* %tmp23525, i64 1
+  %tmp23527 = getelementptr inbounds float* %tmp23526, i64 1
+  %tmp23528 = getelementptr inbounds float* %tmp23527, i64 1
+  %tmp23529 = getelementptr inbounds float* %tmp23528, i64 1
+  %tmp23530 = getelementptr inbounds float* %tmp23529, i64 1
+  %tmp23531 = getelementptr inbounds float* %tmp23530, i64 1
+  %tmp23532 = getelementptr inbounds float* %tmp23531, i64 1
+  %tmp23533 = getelementptr inbounds float* %tmp23532, i64 1
+  %tmp23534 = getelementptr inbounds float* %tmp23533, i64 1
+  %tmp23535 = getelementptr inbounds float* %tmp23534, i64 1
+  %tmp23536 = getelementptr inbounds float* %tmp23535, i64 1
+  %tmp23537 = getelementptr inbounds float* %tmp23536, i64 1
+  %tmp23538 = getelementptr inbounds float* %tmp23537, i64 1
+  %tmp23539 = getelementptr inbounds float* %tmp23538, i64 1
+  %tmp23540 = getelementptr inbounds float* %tmp23539, i64 1
+  %tmp23541 = getelementptr inbounds float* %tmp23540, i64 1
+  %tmp23542 = getelementptr inbounds float* %tmp23541, i64 1
+  %tmp23543 = getelementptr inbounds float* %tmp23542, i64 1
+  %tmp23544 = getelementptr inbounds float* %tmp23543, i64 1
+  %tmp23545 = getelementptr inbounds float* %tmp23544, i64 1
+  %tmp23546 = getelementptr inbounds float* %tmp23545, i64 1
+  %tmp23547 = getelementptr inbounds float* %tmp23546, i64 1
+  %tmp23548 = getelementptr inbounds float* %tmp23547, i64 1
+  %tmp23549 = getelementptr inbounds float* %tmp23548, i64 1
+  %tmp23550 = getelementptr inbounds float* %tmp23549, i64 1
+  %tmp23551 = getelementptr inbounds float* %tmp23550, i64 1
+  %tmp23552 = getelementptr inbounds float* %tmp23551, i64 1
+  %tmp23553 = getelementptr inbounds float* %tmp23552, i64 1
+  %tmp23554 = getelementptr inbounds float* %tmp23553, i64 1
+  %tmp23555 = getelementptr inbounds float* %tmp23554, i64 1
+  %tmp23556 = getelementptr inbounds float* %tmp23555, i64 1
+  %tmp23557 = getelementptr inbounds float* %tmp23556, i64 1
+  %tmp23558 = getelementptr inbounds float* %tmp23557, i64 1
+  %tmp23559 = getelementptr inbounds float* %tmp23558, i64 1
+  %tmp23560 = getelementptr inbounds float* %tmp23559, i64 1
+  %tmp23561 = getelementptr inbounds float* %tmp23560, i64 1
+  %tmp23562 = getelementptr inbounds float* %tmp23561, i64 1
+  %tmp23563 = getelementptr inbounds float* %tmp23562, i64 1
+  %tmp23564 = getelementptr inbounds float* %tmp23563, i64 1
+  %tmp23565 = getelementptr inbounds float* %tmp23564, i64 1
+  %tmp23566 = getelementptr inbounds float* %tmp23565, i64 1
+  %tmp23567 = getelementptr inbounds float* %tmp23566, i64 1
+  %tmp23568 = getelementptr inbounds float* %tmp23567, i64 1
+  %tmp23569 = getelementptr inbounds float* %tmp23568, i64 1
+  %tmp23570 = getelementptr inbounds float* %tmp23569, i64 1
+  %tmp23571 = getelementptr inbounds float* %tmp23570, i64 1
+  %tmp23572 = getelementptr inbounds float* %tmp23571, i64 1
+  %tmp23573 = getelementptr inbounds float* %tmp23572, i64 1
+  %tmp23574 = getelementptr inbounds float* %tmp23573, i64 1
+  %tmp23575 = getelementptr inbounds float* %tmp23574, i64 1
+  %tmp23576 = getelementptr inbounds float* %tmp23575, i64 1
+  %tmp23577 = getelementptr inbounds float* %tmp23576, i64 1
+  %tmp23578 = getelementptr inbounds float* %tmp23577, i64 1
+  %tmp23579 = getelementptr inbounds float* %tmp23578, i64 1
+  %tmp23580 = getelementptr inbounds float* %tmp23579, i64 1
+  %tmp23581 = getelementptr inbounds float* %tmp23580, i64 1
+  %tmp23582 = getelementptr inbounds float* %tmp23581, i64 1
+  %tmp23583 = getelementptr inbounds float* %tmp23582, i64 1
+  %tmp23584 = getelementptr inbounds float* %tmp23583, i64 1
+  %tmp23585 = getelementptr inbounds float* %tmp23584, i64 1
+  %tmp23586 = getelementptr inbounds float* %tmp23585, i64 1
+  %tmp23587 = getelementptr inbounds float* %tmp23586, i64 1
+  %tmp23588 = getelementptr inbounds float* %tmp23587, i64 1
+  %tmp23589 = getelementptr inbounds float* %tmp23588, i64 1
+  %tmp23590 = getelementptr inbounds float* %tmp23589, i64 1
+  %tmp23591 = getelementptr inbounds float* %tmp23590, i64 1
+  %tmp23592 = getelementptr inbounds float* %tmp23591, i64 1
+  %tmp23593 = getelementptr inbounds float* %tmp23592, i64 1
+  %tmp23594 = getelementptr inbounds float* %tmp23593, i64 1
+  %tmp23595 = getelementptr inbounds float* %tmp23594, i64 1
+  %tmp23596 = getelementptr inbounds float* %tmp23595, i64 1
+  %tmp23597 = getelementptr inbounds float* %tmp23596, i64 1
+  %tmp23598 = getelementptr inbounds float* %tmp23597, i64 1
+  %tmp23599 = getelementptr inbounds float* %tmp23598, i64 1
+  %tmp23600 = getelementptr inbounds float* %tmp23599, i64 1
+  %tmp23601 = getelementptr inbounds float* %tmp23600, i64 1
+  %tmp23602 = getelementptr inbounds float* %tmp23601, i64 1
+  %tmp23603 = getelementptr inbounds float* %tmp23602, i64 1
+  %tmp23604 = getelementptr inbounds float* %tmp23603, i64 1
+  %tmp23605 = getelementptr inbounds float* %tmp23604, i64 1
+  %tmp23606 = getelementptr inbounds float* %tmp23605, i64 1
+  %tmp23607 = getelementptr inbounds float* %tmp23606, i64 1
+  %tmp23608 = getelementptr inbounds float* %tmp23607, i64 1
+  %tmp23609 = getelementptr inbounds float* %tmp23608, i64 1
+  %tmp23610 = getelementptr inbounds float* %tmp23609, i64 1
+  %tmp23611 = getelementptr inbounds float* %tmp23610, i64 1
+  %tmp23612 = getelementptr inbounds float* %tmp23611, i64 1
+  %tmp23613 = getelementptr inbounds float* %tmp23612, i64 1
+  %tmp23614 = getelementptr inbounds float* %tmp23613, i64 1
+  %tmp23615 = getelementptr inbounds float* %tmp23614, i64 1
+  %tmp23616 = getelementptr inbounds float* %tmp23615, i64 1
+  %tmp23617 = getelementptr inbounds float* %tmp23616, i64 1
+  %tmp23618 = getelementptr inbounds float* %tmp23617, i64 1
+  %tmp23619 = getelementptr inbounds float* %tmp23618, i64 1
+  %tmp23620 = getelementptr inbounds float* %tmp23619, i64 1
+  %tmp23621 = getelementptr inbounds float* %tmp23620, i64 1
+  %tmp23622 = getelementptr inbounds float* %tmp23621, i64 1
+  %tmp23623 = getelementptr inbounds float* %tmp23622, i64 1
+  %tmp23624 = getelementptr inbounds float* %tmp23623, i64 1
+  %tmp23625 = getelementptr inbounds float* %tmp23624, i64 1
+  %tmp23626 = getelementptr inbounds float* %tmp23625, i64 1
+  %tmp23627 = getelementptr inbounds float* %tmp23626, i64 1
+  %tmp23628 = getelementptr inbounds float* %tmp23627, i64 1
+  %tmp23629 = getelementptr inbounds float* %tmp23628, i64 1
+  %tmp23630 = getelementptr inbounds float* %tmp23629, i64 1
+  %tmp23631 = getelementptr inbounds float* %tmp23630, i64 1
+  %tmp23632 = getelementptr inbounds float* %tmp23631, i64 1
+  %tmp23633 = getelementptr inbounds float* %tmp23632, i64 1
+  %tmp23634 = getelementptr inbounds float* %tmp23633, i64 1
+  %tmp23635 = getelementptr inbounds float* %tmp23634, i64 1
+  %tmp23636 = getelementptr inbounds float* %tmp23635, i64 1
+  %tmp23637 = getelementptr inbounds float* %tmp23636, i64 1
+  %tmp23638 = getelementptr inbounds float* %tmp23637, i64 1
+  %tmp23639 = getelementptr inbounds float* %tmp23638, i64 1
+  %tmp23640 = getelementptr inbounds float* %tmp23639, i64 1
+  %tmp23641 = getelementptr inbounds float* %tmp23640, i64 1
+  %tmp23642 = getelementptr inbounds float* %tmp23641, i64 1
+  %tmp23643 = getelementptr inbounds float* %tmp23642, i64 1
+  %tmp23644 = getelementptr inbounds float* %tmp23643, i64 1
+  %tmp23645 = getelementptr inbounds float* %tmp23644, i64 1
+  %tmp23646 = getelementptr inbounds float* %tmp23645, i64 1
+  %tmp23647 = getelementptr inbounds float* %tmp23646, i64 1
+  %tmp23648 = getelementptr inbounds float* %tmp23647, i64 1
+  %tmp23649 = getelementptr inbounds float* %tmp23648, i64 1
+  %tmp23650 = getelementptr inbounds float* %tmp23649, i64 1
+  %tmp23651 = getelementptr inbounds float* %tmp23650, i64 1
+  %tmp23652 = getelementptr inbounds float* %tmp23651, i64 1
+  %tmp23653 = getelementptr inbounds float* %tmp23652, i64 1
+  %tmp23654 = getelementptr inbounds float* %tmp23653, i64 1
+  %tmp23655 = getelementptr inbounds float* %tmp23654, i64 1
+  %tmp23656 = getelementptr inbounds float* %tmp23655, i64 1
+  %tmp23657 = getelementptr inbounds float* %tmp23656, i64 1
+  %tmp23658 = getelementptr inbounds float* %tmp23657, i64 1
+  %tmp23659 = getelementptr inbounds float* %tmp23658, i64 1
+  %tmp23660 = getelementptr inbounds float* %tmp23659, i64 1
+  %tmp23661 = getelementptr inbounds float* %tmp23660, i64 1
+  %tmp23662 = getelementptr inbounds float* %tmp23661, i64 1
+  %tmp23663 = getelementptr inbounds float* %tmp23662, i64 1
+  %tmp23664 = getelementptr inbounds float* %tmp23663, i64 1
+  %tmp23665 = getelementptr inbounds float* %tmp23664, i64 1
+  %tmp23666 = getelementptr inbounds float* %tmp23665, i64 1
+  %tmp23667 = getelementptr inbounds float* %tmp23666, i64 1
+  %tmp23668 = getelementptr inbounds float* %tmp23667, i64 1
+  %tmp23669 = getelementptr inbounds float* %tmp23668, i64 1
+  %tmp23670 = getelementptr inbounds float* %tmp23669, i64 1
+  %tmp23671 = getelementptr inbounds float* %tmp23670, i64 1
+  %tmp23672 = getelementptr inbounds float* %tmp23671, i64 1
+  %tmp23673 = getelementptr inbounds float* %tmp23672, i64 1
+  %tmp23674 = getelementptr inbounds float* %tmp23673, i64 1
+  %tmp23675 = getelementptr inbounds float* %tmp23674, i64 1
+  %tmp23676 = getelementptr inbounds float* %tmp23675, i64 1
+  %tmp23677 = getelementptr inbounds float* %tmp23676, i64 1
+  %tmp23678 = getelementptr inbounds float* %tmp23677, i64 1
+  %tmp23679 = getelementptr inbounds float* %tmp23678, i64 1
+  %tmp23680 = getelementptr inbounds float* %tmp23679, i64 1
+  %tmp23681 = getelementptr inbounds float* %tmp23680, i64 1
+  %tmp23682 = getelementptr inbounds float* %tmp23681, i64 1
+  %tmp23683 = getelementptr inbounds float* %tmp23682, i64 1
+  %tmp23684 = getelementptr inbounds float* %tmp23683, i64 1
+  %tmp23685 = getelementptr inbounds float* %tmp23684, i64 1
+  %tmp23686 = getelementptr inbounds float* %tmp23685, i64 1
+  %tmp23687 = getelementptr inbounds float* %tmp23686, i64 1
+  %tmp23688 = getelementptr inbounds float* %tmp23687, i64 1
+  %tmp23689 = getelementptr inbounds float* %tmp23688, i64 1
+  %tmp23690 = getelementptr inbounds float* %tmp23689, i64 1
+  %tmp23691 = getelementptr inbounds float* %tmp23690, i64 1
+  %tmp23692 = getelementptr inbounds float* %tmp23691, i64 1
+  %tmp23693 = getelementptr inbounds float* %tmp23692, i64 1
+  %tmp23694 = getelementptr inbounds float* %tmp23693, i64 1
+  %tmp23695 = getelementptr inbounds float* %tmp23694, i64 1
+  %tmp23696 = getelementptr inbounds float* %tmp23695, i64 1
+  %tmp23697 = getelementptr inbounds float* %tmp23696, i64 1
+  %tmp23698 = getelementptr inbounds float* %tmp23697, i64 1
+  %tmp23699 = getelementptr inbounds float* %tmp23698, i64 1
+  %tmp23700 = getelementptr inbounds float* %tmp23699, i64 1
+  %tmp23701 = getelementptr inbounds float* %tmp23700, i64 1
+  %tmp23702 = getelementptr inbounds float* %tmp23701, i64 1
+  %tmp23703 = getelementptr inbounds float* %tmp23702, i64 1
+  %tmp23704 = getelementptr inbounds float* %tmp23703, i64 1
+  %tmp23705 = getelementptr inbounds float* %tmp23704, i64 1
+  %tmp23706 = getelementptr inbounds float* %tmp23705, i64 1
+  %tmp23707 = getelementptr inbounds float* %tmp23706, i64 1
+  %tmp23708 = getelementptr inbounds float* %tmp23707, i64 1
+  %tmp23709 = getelementptr inbounds float* %tmp23708, i64 1
+  %tmp23710 = getelementptr inbounds float* %tmp23709, i64 1
+  %tmp23711 = getelementptr inbounds float* %tmp23710, i64 1
+  %tmp23712 = getelementptr inbounds float* %tmp23711, i64 1
+  %tmp23713 = getelementptr inbounds float* %tmp23712, i64 1
+  %tmp23714 = getelementptr inbounds float* %tmp23713, i64 1
+  %tmp23715 = getelementptr inbounds float* %tmp23714, i64 1
+  %tmp23716 = getelementptr inbounds float* %tmp23715, i64 1
+  %tmp23717 = getelementptr inbounds float* %tmp23716, i64 1
+  %tmp23718 = getelementptr inbounds float* %tmp23717, i64 1
+  %tmp23719 = getelementptr inbounds float* %tmp23718, i64 1
+  %tmp23720 = getelementptr inbounds float* %tmp23719, i64 1
+  %tmp23721 = getelementptr inbounds float* %tmp23720, i64 1
+  %tmp23722 = getelementptr inbounds float* %tmp23721, i64 1
+  %tmp23723 = getelementptr inbounds float* %tmp23722, i64 1
+  %tmp23724 = getelementptr inbounds float* %tmp23723, i64 1
+  %tmp23725 = getelementptr inbounds float* %tmp23724, i64 1
+  %tmp23726 = getelementptr inbounds float* %tmp23725, i64 1
+  %tmp23727 = getelementptr inbounds float* %tmp23726, i64 1
+  %tmp23728 = getelementptr inbounds float* %tmp23727, i64 1
+  %tmp23729 = getelementptr inbounds float* %tmp23728, i64 1
+  %tmp23730 = getelementptr inbounds float* %tmp23729, i64 1
+  %tmp23731 = getelementptr inbounds float* %tmp23730, i64 1
+  %tmp23732 = getelementptr inbounds float* %tmp23731, i64 1
+  %tmp23733 = getelementptr inbounds float* %tmp23732, i64 1
+  %tmp23734 = getelementptr inbounds float* %tmp23733, i64 1
+  %tmp23735 = getelementptr inbounds float* %tmp23734, i64 1
+  %tmp23736 = getelementptr inbounds float* %tmp23735, i64 1
+  %tmp23737 = getelementptr inbounds float* %tmp23736, i64 1
+  %tmp23738 = getelementptr inbounds float* %tmp23737, i64 1
+  %tmp23739 = getelementptr inbounds float* %tmp23738, i64 1
+  %tmp23740 = getelementptr inbounds float* %tmp23739, i64 1
+  %tmp23741 = getelementptr inbounds float* %tmp23740, i64 1
+  %tmp23742 = getelementptr inbounds float* %tmp23741, i64 1
+  %tmp23743 = getelementptr inbounds float* %tmp23742, i64 1
+  %tmp23744 = getelementptr inbounds float* %tmp23743, i64 1
+  %tmp23745 = getelementptr inbounds float* %tmp23744, i64 1
+  %tmp23746 = getelementptr inbounds float* %tmp23745, i64 1
+  %tmp23747 = getelementptr inbounds float* %tmp23746, i64 1
+  %tmp23748 = getelementptr inbounds float* %tmp23747, i64 1
+  %tmp23749 = getelementptr inbounds float* %tmp23748, i64 1
+  %tmp23750 = getelementptr inbounds float* %tmp23749, i64 1
+  %tmp23751 = getelementptr inbounds float* %tmp23750, i64 1
+  %tmp23752 = getelementptr inbounds float* %tmp23751, i64 1
+  %tmp23753 = getelementptr inbounds float* %tmp23752, i64 1
+  %tmp23754 = getelementptr inbounds float* %tmp23753, i64 1
+  %tmp23755 = getelementptr inbounds float* %tmp23754, i64 1
+  %tmp23756 = getelementptr inbounds float* %tmp23755, i64 1
+  %tmp23757 = getelementptr inbounds float* %tmp23756, i64 1
+  %tmp23758 = getelementptr inbounds float* %tmp23757, i64 1
+  %tmp23759 = getelementptr inbounds float* %tmp23758, i64 1
+  %tmp23760 = getelementptr inbounds float* %tmp23759, i64 1
+  %tmp23761 = getelementptr inbounds float* %tmp23760, i64 1
+  %tmp23762 = getelementptr inbounds float* %tmp23761, i64 1
+  %tmp23763 = getelementptr inbounds float* %tmp23762, i64 1
+  %tmp23764 = getelementptr inbounds float* %tmp23763, i64 1
+  %tmp23765 = getelementptr inbounds float* %tmp23764, i64 1
+  %tmp23766 = getelementptr inbounds float* %tmp23765, i64 1
+  %tmp23767 = getelementptr inbounds float* %tmp23766, i64 1
+  %tmp23768 = getelementptr inbounds float* %tmp23767, i64 1
+  %tmp23769 = getelementptr inbounds float* %tmp23768, i64 1
+  %tmp23770 = getelementptr inbounds float* %tmp23769, i64 1
+  %tmp23771 = getelementptr inbounds float* %tmp23770, i64 1
+  %tmp23772 = getelementptr inbounds float* %tmp23771, i64 1
+  %tmp23773 = getelementptr inbounds float* %tmp23772, i64 1
+  %tmp23774 = getelementptr inbounds float* %tmp23773, i64 1
+  %tmp23775 = getelementptr inbounds float* %tmp23774, i64 1
+  %tmp23776 = getelementptr inbounds float* %tmp23775, i64 1
+  %tmp23777 = getelementptr inbounds float* %tmp23776, i64 1
+  %tmp23778 = getelementptr inbounds float* %tmp23777, i64 1
+  %tmp23779 = getelementptr inbounds float* %tmp23778, i64 1
+  %tmp23780 = getelementptr inbounds float* %tmp23779, i64 1
+  %tmp23781 = getelementptr inbounds float* %tmp23780, i64 1
+  %tmp23782 = getelementptr inbounds float* %tmp23781, i64 1
+  %tmp23783 = getelementptr inbounds float* %tmp23782, i64 1
+  %tmp23784 = getelementptr inbounds float* %tmp23783, i64 1
+  %tmp23785 = getelementptr inbounds float* %tmp23784, i64 1
+  %tmp23786 = getelementptr inbounds float* %tmp23785, i64 1
+  %tmp23787 = getelementptr inbounds float* %tmp23786, i64 1
+  %tmp23788 = getelementptr inbounds float* %tmp23787, i64 1
+  %tmp23789 = getelementptr inbounds float* %tmp23788, i64 1
+  %tmp23790 = getelementptr inbounds float* %tmp23789, i64 1
+  %tmp23791 = getelementptr inbounds float* %tmp23790, i64 1
+  %tmp23792 = getelementptr inbounds float* %tmp23791, i64 1
+  %tmp23793 = getelementptr inbounds float* %tmp23792, i64 1
+  %tmp23794 = getelementptr inbounds float* %tmp23793, i64 1
+  %tmp23795 = getelementptr inbounds float* %tmp23794, i64 1
+  %tmp23796 = getelementptr inbounds float* %tmp23795, i64 1
+  %tmp23797 = getelementptr inbounds float* %tmp23796, i64 1
+  %tmp23798 = getelementptr inbounds float* %tmp23797, i64 1
+  %tmp23799 = getelementptr inbounds float* %tmp23798, i64 1
+  %tmp23800 = getelementptr inbounds float* %tmp23799, i64 1
+  %tmp23801 = getelementptr inbounds float* %tmp23800, i64 1
+  %tmp23802 = getelementptr inbounds float* %tmp23801, i64 1
+  %tmp23803 = getelementptr inbounds float* %tmp23802, i64 1
+  %tmp23804 = getelementptr inbounds float* %tmp23803, i64 1
+  %tmp23805 = getelementptr inbounds float* %tmp23804, i64 1
+  %tmp23806 = getelementptr inbounds float* %tmp23805, i64 1
+  %tmp23807 = getelementptr inbounds float* %tmp23806, i64 1
+  %tmp23808 = getelementptr inbounds float* %tmp23807, i64 1
+  %tmp23809 = getelementptr inbounds float* %tmp23808, i64 1
+  %tmp23810 = getelementptr inbounds float* %tmp23809, i64 1
+  %tmp23811 = getelementptr inbounds float* %tmp23810, i64 1
+  %tmp23812 = getelementptr inbounds float* %tmp23811, i64 1
+  %tmp23813 = getelementptr inbounds float* %tmp23812, i64 1
+  %tmp23814 = getelementptr inbounds float* %tmp23813, i64 1
+  %tmp23815 = getelementptr inbounds float* %tmp23814, i64 1
+  %tmp23816 = getelementptr inbounds float* %tmp23815, i64 1
+  %tmp23817 = getelementptr inbounds float* %tmp23816, i64 1
+  %tmp23818 = getelementptr inbounds float* %tmp23817, i64 1
+  %tmp23819 = getelementptr inbounds float* %tmp23818, i64 1
+  %tmp23820 = getelementptr inbounds float* %tmp23819, i64 1
+  %tmp23821 = getelementptr inbounds float* %tmp23820, i64 1
+  %tmp23822 = getelementptr inbounds float* %tmp23821, i64 1
+  %tmp23823 = getelementptr inbounds float* %tmp23822, i64 1
+  %tmp23824 = getelementptr inbounds float* %tmp23823, i64 1
+  %tmp23825 = getelementptr inbounds float* %tmp23824, i64 1
+  %tmp23826 = getelementptr inbounds float* %tmp23825, i64 1
+  %tmp23827 = getelementptr inbounds float* %tmp23826, i64 1
+  %tmp23828 = getelementptr inbounds float* %tmp23827, i64 1
+  %tmp23829 = getelementptr inbounds float* %tmp23828, i64 1
+  %tmp23830 = getelementptr inbounds float* %tmp23829, i64 1
+  %tmp23831 = getelementptr inbounds float* %tmp23830, i64 1
+  %tmp23832 = getelementptr inbounds float* %tmp23831, i64 1
+  %tmp23833 = getelementptr inbounds float* %tmp23832, i64 1
+  %tmp23834 = getelementptr inbounds float* %tmp23833, i64 1
+  %tmp23835 = getelementptr inbounds float* %tmp23834, i64 1
+  %tmp23836 = getelementptr inbounds float* %tmp23835, i64 1
+  %tmp23837 = getelementptr inbounds float* %tmp23836, i64 1
+  %tmp23838 = getelementptr inbounds float* %tmp23837, i64 1
+  %tmp23839 = getelementptr inbounds float* %tmp23838, i64 1
+  %tmp23840 = getelementptr inbounds float* %tmp23839, i64 1
+  %tmp23841 = getelementptr inbounds float* %tmp23840, i64 1
+  %tmp23842 = getelementptr inbounds float* %tmp23841, i64 1
+  %tmp23843 = getelementptr inbounds float* %tmp23842, i64 1
+  %tmp23844 = getelementptr inbounds float* %tmp23843, i64 1
+  %tmp23845 = getelementptr inbounds float* %tmp23844, i64 1
+  %tmp23846 = getelementptr inbounds float* %tmp23845, i64 1
+  %tmp23847 = getelementptr inbounds float* %tmp23846, i64 1
+  %tmp23848 = getelementptr inbounds float* %tmp23847, i64 1
+  %tmp23849 = getelementptr inbounds float* %tmp23848, i64 1
+  %tmp23850 = getelementptr inbounds float* %tmp23849, i64 1
+  %tmp23851 = getelementptr inbounds float* %tmp23850, i64 1
+  %tmp23852 = getelementptr inbounds float* %tmp23851, i64 1
+  %tmp23853 = getelementptr inbounds float* %tmp23852, i64 1
+  %tmp23854 = getelementptr inbounds float* %tmp23853, i64 1
+  %tmp23855 = getelementptr inbounds float* %tmp23854, i64 1
+  %tmp23856 = getelementptr inbounds float* %tmp23855, i64 1
+  %tmp23857 = getelementptr inbounds float* %tmp23856, i64 1
+  %tmp23858 = getelementptr inbounds float* %tmp23857, i64 1
+  %tmp23859 = getelementptr inbounds float* %tmp23858, i64 1
+  %tmp23860 = getelementptr inbounds float* %tmp23859, i64 1
+  %tmp23861 = getelementptr inbounds float* %tmp23860, i64 1
+  %tmp23862 = getelementptr inbounds float* %tmp23861, i64 1
+  %tmp23863 = getelementptr inbounds float* %tmp23862, i64 1
+  %tmp23864 = getelementptr inbounds float* %tmp23863, i64 1
+  %tmp23865 = getelementptr inbounds float* %tmp23864, i64 1
+  %tmp23866 = getelementptr inbounds float* %tmp23865, i64 1
+  %tmp23867 = getelementptr inbounds float* %tmp23866, i64 1
+  %tmp23868 = getelementptr inbounds float* %tmp23867, i64 1
+  %tmp23869 = getelementptr inbounds float* %tmp23868, i64 1
+  %tmp23870 = getelementptr inbounds float* %tmp23869, i64 1
+  %tmp23871 = getelementptr inbounds float* %tmp23870, i64 1
+  %tmp23872 = getelementptr inbounds float* %tmp23871, i64 1
+  %tmp23873 = getelementptr inbounds float* %tmp23872, i64 1
+  %tmp23874 = getelementptr inbounds float* %tmp23873, i64 1
+  %tmp23875 = getelementptr inbounds float* %tmp23874, i64 1
+  %tmp23876 = getelementptr inbounds float* %tmp23875, i64 1
+  %tmp23877 = getelementptr inbounds float* %tmp23876, i64 1
+  %tmp23878 = getelementptr inbounds float* %tmp23877, i64 1
+  %tmp23879 = getelementptr inbounds float* %tmp23878, i64 1
+  %tmp23880 = getelementptr inbounds float* %tmp23879, i64 1
+  %tmp23881 = getelementptr inbounds float* %tmp23880, i64 1
+  %tmp23882 = getelementptr inbounds float* %tmp23881, i64 1
+  %tmp23883 = getelementptr inbounds float* %tmp23882, i64 1
+  %tmp23884 = getelementptr inbounds float* %tmp23883, i64 1
+  %tmp23885 = getelementptr inbounds float* %tmp23884, i64 1
+  %tmp23886 = getelementptr inbounds float* %tmp23885, i64 1
+  %tmp23887 = getelementptr inbounds float* %tmp23886, i64 1
+  %tmp23888 = getelementptr inbounds float* %tmp23887, i64 1
+  %tmp23889 = getelementptr inbounds float* %tmp23888, i64 1
+  %tmp23890 = getelementptr inbounds float* %tmp23889, i64 1
+  %tmp23891 = getelementptr inbounds float* %tmp23890, i64 1
+  %tmp23892 = getelementptr inbounds float* %tmp23891, i64 1
+  %tmp23893 = getelementptr inbounds float* %tmp23892, i64 1
+  %tmp23894 = getelementptr inbounds float* %tmp23893, i64 1
+  %tmp23895 = getelementptr inbounds float* %tmp23894, i64 1
+  %tmp23896 = getelementptr inbounds float* %tmp23895, i64 1
+  %tmp23897 = getelementptr inbounds float* %tmp23896, i64 1
+  %tmp23898 = getelementptr inbounds float* %tmp23897, i64 1
+  %tmp23899 = getelementptr inbounds float* %tmp23898, i64 1
+  %tmp23900 = getelementptr inbounds float* %tmp23899, i64 1
+  %tmp23901 = getelementptr inbounds float* %tmp23900, i64 1
+  %tmp23902 = getelementptr inbounds float* %tmp23901, i64 1
+  %tmp23903 = getelementptr inbounds float* %tmp23902, i64 1
+  %tmp23904 = getelementptr inbounds float* %tmp23903, i64 1
+  %tmp23905 = getelementptr inbounds float* %tmp23904, i64 1
+  %tmp23906 = getelementptr inbounds float* %tmp23905, i64 1
+  %tmp23907 = getelementptr inbounds float* %tmp23906, i64 1
+  %tmp23908 = getelementptr inbounds float* %tmp23907, i64 1
+  %tmp23909 = getelementptr inbounds float* %tmp23908, i64 1
+  %tmp23910 = getelementptr inbounds float* %tmp23909, i64 1
+  %tmp23911 = getelementptr inbounds float* %tmp23910, i64 1
+  %tmp23912 = getelementptr inbounds float* %tmp23911, i64 1
+  %tmp23913 = getelementptr inbounds float* %tmp23912, i64 1
+  %tmp23914 = getelementptr inbounds float* %tmp23913, i64 1
+  %tmp23915 = getelementptr inbounds float* %tmp23914, i64 1
+  %tmp23916 = getelementptr inbounds float* %tmp23915, i64 1
+  %tmp23917 = getelementptr inbounds float* %tmp23916, i64 1
+  %tmp23918 = getelementptr inbounds float* %tmp23917, i64 1
+  %tmp23919 = getelementptr inbounds float* %tmp23918, i64 1
+  %tmp23920 = getelementptr inbounds float* %tmp23919, i64 1
+  %tmp23921 = getelementptr inbounds float* %tmp23920, i64 1
+  %tmp23922 = getelementptr inbounds float* %tmp23921, i64 1
+  %tmp23923 = getelementptr inbounds float* %tmp23922, i64 1
+  %tmp23924 = getelementptr inbounds float* %tmp23923, i64 1
+  %tmp23925 = getelementptr inbounds float* %tmp23924, i64 1
+  %tmp23926 = getelementptr inbounds float* %tmp23925, i64 1
+  %tmp23927 = getelementptr inbounds float* %tmp23926, i64 1
+  %tmp23928 = getelementptr inbounds float* %tmp23927, i64 1
+  %tmp23929 = getelementptr inbounds float* %tmp23928, i64 1
+  %tmp23930 = getelementptr inbounds float* %tmp23929, i64 1
+  %tmp23931 = getelementptr inbounds float* %tmp23930, i64 1
+  %tmp23932 = getelementptr inbounds float* %tmp23931, i64 1
+  %tmp23933 = getelementptr inbounds float* %tmp23932, i64 1
+  %tmp23934 = getelementptr inbounds float* %tmp23933, i64 1
+  %tmp23935 = getelementptr inbounds float* %tmp23934, i64 1
+  %tmp23936 = getelementptr inbounds float* %tmp23935, i64 1
+  %tmp23937 = getelementptr inbounds float* %tmp23936, i64 1
+  %tmp23938 = getelementptr inbounds float* %tmp23937, i64 1
+  %tmp23939 = getelementptr inbounds float* %tmp23938, i64 1
+  %tmp23940 = getelementptr inbounds float* %tmp23939, i64 1
+  %tmp23941 = getelementptr inbounds float* %tmp23940, i64 1
+  %tmp23942 = getelementptr inbounds float* %tmp23941, i64 1
+  %tmp23943 = getelementptr inbounds float* %tmp23942, i64 1
+  %tmp23944 = getelementptr inbounds float* %tmp23943, i64 1
+  %tmp23945 = getelementptr inbounds float* %tmp23944, i64 1
+  %tmp23946 = getelementptr inbounds float* %tmp23945, i64 1
+  %tmp23947 = getelementptr inbounds float* %tmp23946, i64 1
+  %tmp23948 = getelementptr inbounds float* %tmp23947, i64 1
+  %tmp23949 = getelementptr inbounds float* %tmp23948, i64 1
+  %tmp23950 = getelementptr inbounds float* %tmp23949, i64 1
+  %tmp23951 = getelementptr inbounds float* %tmp23950, i64 1
+  %tmp23952 = getelementptr inbounds float* %tmp23951, i64 1
+  %tmp23953 = getelementptr inbounds float* %tmp23952, i64 1
+  %tmp23954 = getelementptr inbounds float* %tmp23953, i64 1
+  %tmp23955 = getelementptr inbounds float* %tmp23954, i64 1
+  %tmp23956 = getelementptr inbounds float* %tmp23955, i64 1
+  %tmp23957 = getelementptr inbounds float* %tmp23956, i64 1
+  %tmp23958 = getelementptr inbounds float* %tmp23957, i64 1
+  %tmp23959 = getelementptr inbounds float* %tmp23958, i64 1
+  %tmp23960 = getelementptr inbounds float* %tmp23959, i64 1
+  %tmp23961 = getelementptr inbounds float* %tmp23960, i64 1
+  %tmp23962 = getelementptr inbounds float* %tmp23961, i64 1
+  %tmp23963 = getelementptr inbounds float* %tmp23962, i64 1
+  %tmp23964 = getelementptr inbounds float* %tmp23963, i64 1
+  %tmp23965 = getelementptr inbounds float* %tmp23964, i64 1
+  %tmp23966 = getelementptr inbounds float* %tmp23965, i64 1
+  %tmp23967 = getelementptr inbounds float* %tmp23966, i64 1
+  %tmp23968 = getelementptr inbounds float* %tmp23967, i64 1
+  %tmp23969 = getelementptr inbounds float* %tmp23968, i64 1
+  %tmp23970 = getelementptr inbounds float* %tmp23969, i64 1
+  %tmp23971 = getelementptr inbounds float* %tmp23970, i64 1
+  %tmp23972 = getelementptr inbounds float* %tmp23971, i64 1
+  %tmp23973 = getelementptr inbounds float* %tmp23972, i64 1
+  %tmp23974 = getelementptr inbounds float* %tmp23973, i64 1
+  %tmp23975 = getelementptr inbounds float* %tmp23974, i64 1
+  %tmp23976 = getelementptr inbounds float* %tmp23975, i64 1
+  %tmp23977 = getelementptr inbounds float* %tmp23976, i64 1
+  %tmp23978 = getelementptr inbounds float* %tmp23977, i64 1
+  %tmp23979 = getelementptr inbounds float* %tmp23978, i64 1
+  %tmp23980 = getelementptr inbounds float* %tmp23979, i64 1
+  %tmp23981 = getelementptr inbounds float* %tmp23980, i64 1
+  %tmp23982 = getelementptr inbounds float* %tmp23981, i64 1
+  %tmp23983 = getelementptr inbounds float* %tmp23982, i64 1
+  %tmp23984 = getelementptr inbounds float* %tmp23983, i64 1
+  %tmp23985 = getelementptr inbounds float* %tmp23984, i64 1
+  %tmp23986 = getelementptr inbounds float* %tmp23985, i64 1
+  %tmp23987 = getelementptr inbounds float* %tmp23986, i64 1
+  %tmp23988 = getelementptr inbounds float* %tmp23987, i64 1
+  %tmp23989 = getelementptr inbounds float* %tmp23988, i64 1
+  %tmp23990 = getelementptr inbounds float* %tmp23989, i64 1
+  %tmp23991 = getelementptr inbounds float* %tmp23990, i64 1
+  %tmp23992 = getelementptr inbounds float* %tmp23991, i64 1
+  %tmp23993 = getelementptr inbounds float* %tmp23992, i64 1
+  %tmp23994 = getelementptr inbounds float* %tmp23993, i64 1
+  %tmp23995 = getelementptr inbounds float* %tmp23994, i64 1
+  %tmp23996 = getelementptr inbounds float* %tmp23995, i64 1
+  %tmp23997 = getelementptr inbounds float* %tmp23996, i64 1
+  %tmp23998 = getelementptr inbounds float* %tmp23997, i64 1
+  %tmp23999 = getelementptr inbounds float* %tmp23998, i64 1
+  %tmp24000 = getelementptr inbounds float* %tmp23999, i64 1
+  %tmp24001 = getelementptr inbounds float* %tmp24000, i64 1
+  %tmp24002 = getelementptr inbounds float* %tmp24001, i64 1
+  %tmp24003 = getelementptr inbounds float* %tmp24002, i64 1
+  %tmp24004 = getelementptr inbounds float* %tmp24003, i64 1
+  %tmp24005 = getelementptr inbounds float* %tmp24004, i64 1
+  %tmp24006 = getelementptr inbounds float* %tmp24005, i64 1
+  %tmp24007 = getelementptr inbounds float* %tmp24006, i64 1
+  %tmp24008 = getelementptr inbounds float* %tmp24007, i64 1
+  %tmp24009 = getelementptr inbounds float* %tmp24008, i64 1
+  %tmp24010 = getelementptr inbounds float* %tmp24009, i64 1
+  %tmp24011 = getelementptr inbounds float* %tmp24010, i64 1
+  %tmp24012 = getelementptr inbounds float* %tmp24011, i64 1
+  %tmp24013 = getelementptr inbounds float* %tmp24012, i64 1
+  %tmp24014 = getelementptr inbounds float* %tmp24013, i64 1
+  %tmp24015 = getelementptr inbounds float* %tmp24014, i64 1
+  %tmp24016 = getelementptr inbounds float* %tmp24015, i64 1
+  %tmp24017 = getelementptr inbounds float* %tmp24016, i64 1
+  %tmp24018 = getelementptr inbounds float* %tmp24017, i64 1
+  %tmp24019 = getelementptr inbounds float* %tmp24018, i64 1
+  %tmp24020 = getelementptr inbounds float* %tmp24019, i64 1
+  %tmp24021 = getelementptr inbounds float* %tmp24020, i64 1
+  %tmp24022 = getelementptr inbounds float* %tmp24021, i64 1
+  %tmp24023 = getelementptr inbounds float* %tmp24022, i64 1
+  %tmp24024 = getelementptr inbounds float* %tmp24023, i64 1
+  %tmp24025 = getelementptr inbounds float* %tmp24024, i64 1
+  %tmp24026 = getelementptr inbounds float* %tmp24025, i64 1
+  %tmp24027 = getelementptr inbounds float* %tmp24026, i64 1
+  %tmp24028 = getelementptr inbounds float* %tmp24027, i64 1
+  %tmp24029 = getelementptr inbounds float* %tmp24028, i64 1
+  %tmp24030 = getelementptr inbounds float* %tmp24029, i64 1
+  %tmp24031 = getelementptr inbounds float* %tmp24030, i64 1
+  %tmp24032 = getelementptr inbounds float* %tmp24031, i64 1
+  %tmp24033 = getelementptr inbounds float* %tmp24032, i64 1
+  %tmp24034 = getelementptr inbounds float* %tmp24033, i64 1
+  %tmp24035 = getelementptr inbounds float* %tmp24034, i64 1
+  %tmp24036 = getelementptr inbounds float* %tmp24035, i64 1
+  %tmp24037 = getelementptr inbounds float* %tmp24036, i64 1
+  %tmp24038 = getelementptr inbounds float* %tmp24037, i64 1
+  %tmp24039 = getelementptr inbounds float* %tmp24038, i64 1
+  %tmp24040 = getelementptr inbounds float* %tmp24039, i64 1
+  %tmp24041 = getelementptr inbounds float* %tmp24040, i64 1
+  %tmp24042 = getelementptr inbounds float* %tmp24041, i64 1
+  %tmp24043 = getelementptr inbounds float* %tmp24042, i64 1
+  %tmp24044 = getelementptr inbounds float* %tmp24043, i64 1
+  %tmp24045 = getelementptr inbounds float* %tmp24044, i64 1
+  %tmp24046 = getelementptr inbounds float* %tmp24045, i64 1
+  %tmp24047 = getelementptr inbounds float* %tmp24046, i64 1
+  %tmp24048 = getelementptr inbounds float* %tmp24047, i64 1
+  %tmp24049 = getelementptr inbounds float* %tmp24048, i64 1
+  %tmp24050 = getelementptr inbounds float* %tmp24049, i64 1
+  %tmp24051 = getelementptr inbounds float* %tmp24050, i64 1
+  %tmp24052 = getelementptr inbounds float* %tmp24051, i64 1
+  %tmp24053 = getelementptr inbounds float* %tmp24052, i64 1
+  %tmp24054 = getelementptr inbounds float* %tmp24053, i64 1
+  %tmp24055 = getelementptr inbounds float* %tmp24054, i64 1
+  %tmp24056 = getelementptr inbounds float* %tmp24055, i64 1
+  %tmp24057 = getelementptr inbounds float* %tmp24056, i64 1
+  %tmp24058 = getelementptr inbounds float* %tmp24057, i64 1
+  %tmp24059 = getelementptr inbounds float* %tmp24058, i64 1
+  %tmp24060 = getelementptr inbounds float* %tmp24059, i64 1
+  %tmp24061 = getelementptr inbounds float* %tmp24060, i64 1
+  %tmp24062 = getelementptr inbounds float* %tmp24061, i64 1
+  %tmp24063 = getelementptr inbounds float* %tmp24062, i64 1
+  %tmp24064 = getelementptr inbounds float* %tmp24063, i64 1
+  %tmp24065 = getelementptr inbounds float* %tmp24064, i64 1
+  %tmp24066 = getelementptr inbounds float* %tmp24065, i64 1
+  %tmp24067 = getelementptr inbounds float* %tmp24066, i64 1
+  %tmp24068 = getelementptr inbounds float* %tmp24067, i64 1
+  %tmp24069 = getelementptr inbounds float* %tmp24068, i64 1
+  %tmp24070 = getelementptr inbounds float* %tmp24069, i64 1
+  %tmp24071 = getelementptr inbounds float* %tmp24070, i64 1
+  %tmp24072 = getelementptr inbounds float* %tmp24071, i64 1
+  %tmp24073 = getelementptr inbounds float* %tmp24072, i64 1
+  %tmp24074 = getelementptr inbounds float* %tmp24073, i64 1
+  %tmp24075 = getelementptr inbounds float* %tmp24074, i64 1
+  %tmp24076 = getelementptr inbounds float* %tmp24075, i64 1
+  %tmp24077 = getelementptr inbounds float* %tmp24076, i64 1
+  %tmp24078 = getelementptr inbounds float* %tmp24077, i64 1
+  %tmp24079 = getelementptr inbounds float* %tmp24078, i64 1
+  %tmp24080 = getelementptr inbounds float* %tmp24079, i64 1
+  %tmp24081 = getelementptr inbounds float* %tmp24080, i64 1
+  %tmp24082 = getelementptr inbounds float* %tmp24081, i64 1
+  %tmp24083 = getelementptr inbounds float* %tmp24082, i64 1
+  %tmp24084 = getelementptr inbounds float* %tmp24083, i64 1
+  %tmp24085 = getelementptr inbounds float* %tmp24084, i64 1
+  %tmp24086 = getelementptr inbounds float* %tmp24085, i64 1
+  %tmp24087 = getelementptr inbounds float* %tmp24086, i64 1
+  %tmp24088 = getelementptr inbounds float* %tmp24087, i64 1
+  %tmp24089 = getelementptr inbounds float* %tmp24088, i64 1
+  %tmp24090 = getelementptr inbounds float* %tmp24089, i64 1
+  %tmp24091 = getelementptr inbounds float* %tmp24090, i64 1
+  %tmp24092 = getelementptr inbounds float* %tmp24091, i64 1
+  %tmp24093 = getelementptr inbounds float* %tmp24092, i64 1
+  %tmp24094 = getelementptr inbounds float* %tmp24093, i64 1
+  %tmp24095 = getelementptr inbounds float* %tmp24094, i64 1
+  %tmp24096 = getelementptr inbounds float* %tmp24095, i64 1
+  %tmp24097 = getelementptr inbounds float* %tmp24096, i64 1
+  %tmp24098 = getelementptr inbounds float* %tmp24097, i64 1
+  %tmp24099 = getelementptr inbounds float* %tmp24098, i64 1
+  %tmp24100 = getelementptr inbounds float* %tmp24099, i64 1
+  %tmp24101 = getelementptr inbounds float* %tmp24100, i64 1
+  %tmp24102 = getelementptr inbounds float* %tmp24101, i64 1
+  %tmp24103 = getelementptr inbounds float* %tmp24102, i64 1
+  %tmp24104 = getelementptr inbounds float* %tmp24103, i64 1
+  %tmp24105 = getelementptr inbounds float* %tmp24104, i64 1
+  %tmp24106 = getelementptr inbounds float* %tmp24105, i64 1
+  %tmp24107 = getelementptr inbounds float* %tmp24106, i64 1
+  %tmp24108 = getelementptr inbounds float* %tmp24107, i64 1
+  %tmp24109 = getelementptr inbounds float* %tmp24108, i64 1
+  %tmp24110 = getelementptr inbounds float* %tmp24109, i64 1
+  %tmp24111 = getelementptr inbounds float* %tmp24110, i64 1
+  %tmp24112 = getelementptr inbounds float* %tmp24111, i64 1
+  %tmp24113 = getelementptr inbounds float* %tmp24112, i64 1
+  %tmp24114 = getelementptr inbounds float* %tmp24113, i64 1
+  %tmp24115 = getelementptr inbounds float* %tmp24114, i64 1
+  %tmp24116 = getelementptr inbounds float* %tmp24115, i64 1
+  %tmp24117 = getelementptr inbounds float* %tmp24116, i64 1
+  %tmp24118 = getelementptr inbounds float* %tmp24117, i64 1
+  %tmp24119 = getelementptr inbounds float* %tmp24118, i64 1
+  %tmp24120 = getelementptr inbounds float* %tmp24119, i64 1
+  %tmp24121 = getelementptr inbounds float* %tmp24120, i64 1
+  %tmp24122 = getelementptr inbounds float* %tmp24121, i64 1
+  %tmp24123 = getelementptr inbounds float* %tmp24122, i64 1
+  %tmp24124 = getelementptr inbounds float* %tmp24123, i64 1
+  %tmp24125 = getelementptr inbounds float* %tmp24124, i64 1
+  %tmp24126 = getelementptr inbounds float* %tmp24125, i64 1
+  %tmp24127 = getelementptr inbounds float* %tmp24126, i64 1
+  %tmp24128 = getelementptr inbounds float* %tmp24127, i64 1
+  %tmp24129 = getelementptr inbounds float* %tmp24128, i64 1
+  %tmp24130 = getelementptr inbounds float* %tmp24129, i64 1
+  %tmp24131 = getelementptr inbounds float* %tmp24130, i64 1
+  %tmp24132 = getelementptr inbounds float* %tmp24131, i64 1
+  %tmp24133 = getelementptr inbounds float* %tmp24132, i64 1
+  %tmp24134 = getelementptr inbounds float* %tmp24133, i64 1
+  %tmp24135 = getelementptr inbounds float* %tmp24134, i64 1
+  %tmp24136 = getelementptr inbounds float* %tmp24135, i64 1
+  %tmp24137 = getelementptr inbounds float* %tmp24136, i64 1
+  %tmp24138 = getelementptr inbounds float* %tmp24137, i64 1
+  %tmp24139 = getelementptr inbounds float* %tmp24138, i64 1
+  %tmp24140 = getelementptr inbounds float* %tmp24139, i64 1
+  %tmp24141 = getelementptr inbounds float* %tmp24140, i64 1
+  %tmp24142 = getelementptr inbounds float* %tmp24141, i64 1
+  %tmp24143 = getelementptr inbounds float* %tmp24142, i64 1
+  %tmp24144 = getelementptr inbounds float* %tmp24143, i64 1
+  %tmp24145 = getelementptr inbounds float* %tmp24144, i64 1
+  %tmp24146 = getelementptr inbounds float* %tmp24145, i64 1
+  %tmp24147 = getelementptr inbounds float* %tmp24146, i64 1
+  %tmp24148 = getelementptr inbounds float* %tmp24147, i64 1
+  %tmp24149 = getelementptr inbounds float* %tmp24148, i64 1
+  %tmp24150 = getelementptr inbounds float* %tmp24149, i64 1
+  %tmp24151 = getelementptr inbounds float* %tmp24150, i64 1
+  %tmp24152 = getelementptr inbounds float* %tmp24151, i64 1
+  %tmp24153 = getelementptr inbounds float* %tmp24152, i64 1
+  %tmp24154 = getelementptr inbounds float* %tmp24153, i64 1
+  %tmp24155 = getelementptr inbounds float* %tmp24154, i64 1
+  %tmp24156 = getelementptr inbounds float* %tmp24155, i64 1
+  %tmp24157 = getelementptr inbounds float* %tmp24156, i64 1
+  %tmp24158 = getelementptr inbounds float* %tmp24157, i64 1
+  %tmp24159 = getelementptr inbounds float* %tmp24158, i64 1
+  %tmp24160 = getelementptr inbounds float* %tmp24159, i64 1
+  %tmp24161 = getelementptr inbounds float* %tmp24160, i64 1
+  %tmp24162 = getelementptr inbounds float* %tmp24161, i64 1
+  %tmp24163 = getelementptr inbounds float* %tmp24162, i64 1
+  %tmp24164 = getelementptr inbounds float* %tmp24163, i64 1
+  %tmp24165 = getelementptr inbounds float* %tmp24164, i64 1
+  %tmp24166 = getelementptr inbounds float* %tmp24165, i64 1
+  %tmp24167 = getelementptr inbounds float* %tmp24166, i64 1
+  %tmp24168 = getelementptr inbounds float* %tmp24167, i64 1
+  %tmp24169 = getelementptr inbounds float* %tmp24168, i64 1
+  %tmp24170 = getelementptr inbounds float* %tmp24169, i64 1
+  %tmp24171 = getelementptr inbounds float* %tmp24170, i64 1
+  %tmp24172 = getelementptr inbounds float* %tmp24171, i64 1
+  %tmp24173 = getelementptr inbounds float* %tmp24172, i64 1
+  %tmp24174 = getelementptr inbounds float* %tmp24173, i64 1
+  %tmp24175 = getelementptr inbounds float* %tmp24174, i64 1
+  %tmp24176 = getelementptr inbounds float* %tmp24175, i64 1
+  %tmp24177 = getelementptr inbounds float* %tmp24176, i64 1
+  %tmp24178 = getelementptr inbounds float* %tmp24177, i64 1
+  %tmp24179 = getelementptr inbounds float* %tmp24178, i64 1
+  %tmp24180 = getelementptr inbounds float* %tmp24179, i64 1
+  %tmp24181 = getelementptr inbounds float* %tmp24180, i64 1
+  %tmp24182 = getelementptr inbounds float* %tmp24181, i64 1
+  %tmp24183 = getelementptr inbounds float* %tmp24182, i64 1
+  %tmp24184 = getelementptr inbounds float* %tmp24183, i64 1
+  %tmp24185 = getelementptr inbounds float* %tmp24184, i64 1
+  %tmp24186 = getelementptr inbounds float* %tmp24185, i64 1
+  %tmp24187 = getelementptr inbounds float* %tmp24186, i64 1
+  %tmp24188 = getelementptr inbounds float* %tmp24187, i64 1
+  %tmp24189 = getelementptr inbounds float* %tmp24188, i64 1
+  %tmp24190 = getelementptr inbounds float* %tmp24189, i64 1
+  %tmp24191 = getelementptr inbounds float* %tmp24190, i64 1
+  %tmp24192 = getelementptr inbounds float* %tmp24191, i64 1
+  %tmp24193 = getelementptr inbounds float* %tmp24192, i64 1
+  %tmp24194 = getelementptr inbounds float* %tmp24193, i64 1
+  %tmp24195 = getelementptr inbounds float* %tmp24194, i64 1
+  %tmp24196 = getelementptr inbounds float* %tmp24195, i64 1
+  %tmp24197 = getelementptr inbounds float* %tmp24196, i64 1
+  %tmp24198 = getelementptr inbounds float* %tmp24197, i64 1
+  %tmp24199 = getelementptr inbounds float* %tmp24198, i64 1
+  %tmp24200 = getelementptr inbounds float* %tmp24199, i64 1
+  %tmp24201 = getelementptr inbounds float* %tmp24200, i64 1
+  %tmp24202 = getelementptr inbounds float* %tmp24201, i64 1
+  %tmp24203 = getelementptr inbounds float* %tmp24202, i64 1
+  %tmp24204 = getelementptr inbounds float* %tmp24203, i64 1
+  %tmp24205 = getelementptr inbounds float* %tmp24204, i64 1
+  %tmp24206 = getelementptr inbounds float* %tmp24205, i64 1
+  %tmp24207 = getelementptr inbounds float* %tmp24206, i64 1
+  %tmp24208 = getelementptr inbounds float* %tmp24207, i64 1
+  %tmp24209 = getelementptr inbounds float* %tmp24208, i64 1
+  %tmp24210 = getelementptr inbounds float* %tmp24209, i64 1
+  %tmp24211 = getelementptr inbounds float* %tmp24210, i64 1
+  %tmp24212 = getelementptr inbounds float* %tmp24211, i64 1
+  %tmp24213 = getelementptr inbounds float* %tmp24212, i64 1
+  %tmp24214 = getelementptr inbounds float* %tmp24213, i64 1
+  %tmp24215 = getelementptr inbounds float* %tmp24214, i64 1
+  %tmp24216 = getelementptr inbounds float* %tmp24215, i64 1
+  %tmp24217 = getelementptr inbounds float* %tmp24216, i64 1
+  %tmp24218 = getelementptr inbounds float* %tmp24217, i64 1
+  %tmp24219 = getelementptr inbounds float* %tmp24218, i64 1
+  %tmp24220 = getelementptr inbounds float* %tmp24219, i64 1
+  %tmp24221 = getelementptr inbounds float* %tmp24220, i64 1
+  %tmp24222 = getelementptr inbounds float* %tmp24221, i64 1
+  %tmp24223 = getelementptr inbounds float* %tmp24222, i64 1
+  %tmp24224 = getelementptr inbounds float* %tmp24223, i64 1
+  %tmp24225 = getelementptr inbounds float* %tmp24224, i64 1
+  %tmp24226 = getelementptr inbounds float* %tmp24225, i64 1
+  %tmp24227 = getelementptr inbounds float* %tmp24226, i64 1
+  %tmp24228 = getelementptr inbounds float* %tmp24227, i64 1
+  %tmp24229 = getelementptr inbounds float* %tmp24228, i64 1
+  %tmp24230 = getelementptr inbounds float* %tmp24229, i64 1
+  %tmp24231 = getelementptr inbounds float* %tmp24230, i64 1
+  %tmp24232 = getelementptr inbounds float* %tmp24231, i64 1
+  %tmp24233 = getelementptr inbounds float* %tmp24232, i64 1
+  %tmp24234 = getelementptr inbounds float* %tmp24233, i64 1
+  %tmp24235 = getelementptr inbounds float* %tmp24234, i64 1
+  %tmp24236 = getelementptr inbounds float* %tmp24235, i64 1
+  %tmp24237 = getelementptr inbounds float* %tmp24236, i64 1
+  %tmp24238 = getelementptr inbounds float* %tmp24237, i64 1
+  %tmp24239 = getelementptr inbounds float* %tmp24238, i64 1
+  %tmp24240 = getelementptr inbounds float* %tmp24239, i64 1
+  %tmp24241 = getelementptr inbounds float* %tmp24240, i64 1
+  %tmp24242 = getelementptr inbounds float* %tmp24241, i64 1
+  %tmp24243 = getelementptr inbounds float* %tmp24242, i64 1
+  %tmp24244 = getelementptr inbounds float* %tmp24243, i64 1
+  %tmp24245 = getelementptr inbounds float* %tmp24244, i64 1
+  %tmp24246 = getelementptr inbounds float* %tmp24245, i64 1
+  %tmp24247 = getelementptr inbounds float* %tmp24246, i64 1
+  %tmp24248 = getelementptr inbounds float* %tmp24247, i64 1
+  %tmp24249 = getelementptr inbounds float* %tmp24248, i64 1
+  %tmp24250 = getelementptr inbounds float* %tmp24249, i64 1
+  %tmp24251 = getelementptr inbounds float* %tmp24250, i64 1
+  %tmp24252 = getelementptr inbounds float* %tmp24251, i64 1
+  %tmp24253 = getelementptr inbounds float* %tmp24252, i64 1
+  %tmp24254 = getelementptr inbounds float* %tmp24253, i64 1
+  %tmp24255 = getelementptr inbounds float* %tmp24254, i64 1
+  %tmp24256 = getelementptr inbounds float* %tmp24255, i64 1
+  %tmp24257 = getelementptr inbounds float* %tmp24256, i64 1
+  %tmp24258 = getelementptr inbounds float* %tmp24257, i64 1
+  %tmp24259 = getelementptr inbounds float* %tmp24258, i64 1
+  %tmp24260 = getelementptr inbounds float* %tmp24259, i64 1
+  %tmp24261 = getelementptr inbounds float* %tmp24260, i64 1
+  %tmp24262 = getelementptr inbounds float* %tmp24261, i64 1
+  %tmp24263 = getelementptr inbounds float* %tmp24262, i64 1
+  %tmp24264 = getelementptr inbounds float* %tmp24263, i64 1
+  %tmp24265 = getelementptr inbounds float* %tmp24264, i64 1
+  %tmp24266 = getelementptr inbounds float* %tmp24265, i64 1
+  %tmp24267 = getelementptr inbounds float* %tmp24266, i64 1
+  %tmp24268 = getelementptr inbounds float* %tmp24267, i64 1
+  %tmp24269 = getelementptr inbounds float* %tmp24268, i64 1
+  %tmp24270 = getelementptr inbounds float* %tmp24269, i64 1
+  %tmp24271 = getelementptr inbounds float* %tmp24270, i64 1
+  %tmp24272 = getelementptr inbounds float* %tmp24271, i64 1
+  %tmp24273 = getelementptr inbounds float* %tmp24272, i64 1
+  %tmp24274 = getelementptr inbounds float* %tmp24273, i64 1
+  %tmp24275 = getelementptr inbounds float* %tmp24274, i64 1
+  %tmp24276 = getelementptr inbounds float* %tmp24275, i64 1
+  %tmp24277 = getelementptr inbounds float* %tmp24276, i64 1
+  %tmp24278 = getelementptr inbounds float* %tmp24277, i64 1
+  %tmp24279 = getelementptr inbounds float* %tmp24278, i64 1
+  %tmp24280 = getelementptr inbounds float* %tmp24279, i64 1
+  %tmp24281 = getelementptr inbounds float* %tmp24280, i64 1
+  %tmp24282 = getelementptr inbounds float* %tmp24281, i64 1
+  %tmp24283 = getelementptr inbounds float* %tmp24282, i64 1
+  %tmp24284 = getelementptr inbounds float* %tmp24283, i64 1
+  %tmp24285 = getelementptr inbounds float* %tmp24284, i64 1
+  %tmp24286 = getelementptr inbounds float* %tmp24285, i64 1
+  %tmp24287 = getelementptr inbounds float* %tmp24286, i64 1
+  %tmp24288 = getelementptr inbounds float* %tmp24287, i64 1
+  %tmp24289 = getelementptr inbounds float* %tmp24288, i64 1
+  %tmp24290 = getelementptr inbounds float* %tmp24289, i64 1
+  %tmp24291 = getelementptr inbounds float* %tmp24290, i64 1
+  %tmp24292 = getelementptr inbounds float* %tmp24291, i64 1
+  %tmp24293 = getelementptr inbounds float* %tmp24292, i64 1
+  %tmp24294 = getelementptr inbounds float* %tmp24293, i64 1
+  %tmp24295 = getelementptr inbounds float* %tmp24294, i64 1
+  %tmp24296 = getelementptr inbounds float* %tmp24295, i64 1
+  %tmp24297 = getelementptr inbounds float* %tmp24296, i64 1
+  %tmp24298 = getelementptr inbounds float* %tmp24297, i64 1
+  %tmp24299 = getelementptr inbounds float* %tmp24298, i64 1
+  %tmp24300 = getelementptr inbounds float* %tmp24299, i64 1
+  %tmp24301 = getelementptr inbounds float* %tmp24300, i64 1
+  %tmp24302 = getelementptr inbounds float* %tmp24301, i64 1
+  %tmp24303 = getelementptr inbounds float* %tmp24302, i64 1
+  %tmp24304 = getelementptr inbounds float* %tmp24303, i64 1
+  %tmp24305 = getelementptr inbounds float* %tmp24304, i64 1
+  %tmp24306 = getelementptr inbounds float* %tmp24305, i64 1
+  %tmp24307 = getelementptr inbounds float* %tmp24306, i64 1
+  %tmp24308 = getelementptr inbounds float* %tmp24307, i64 1
+  %tmp24309 = getelementptr inbounds float* %tmp24308, i64 1
+  %tmp24310 = getelementptr inbounds float* %tmp24309, i64 1
+  %tmp24311 = getelementptr inbounds float* %tmp24310, i64 1
+  %tmp24312 = getelementptr inbounds float* %tmp24311, i64 1
+  %tmp24313 = getelementptr inbounds float* %tmp24312, i64 1
+  %tmp24314 = getelementptr inbounds float* %tmp24313, i64 1
+  %tmp24315 = getelementptr inbounds float* %tmp24314, i64 1
+  %tmp24316 = getelementptr inbounds float* %tmp24315, i64 1
+  %tmp24317 = getelementptr inbounds float* %tmp24316, i64 1
+  %tmp24318 = getelementptr inbounds float* %tmp24317, i64 1
+  %tmp24319 = getelementptr inbounds float* %tmp24318, i64 1
+  %tmp24320 = getelementptr inbounds float* %tmp24319, i64 1
+  %tmp24321 = getelementptr inbounds float* %tmp24320, i64 1
+  %tmp24322 = getelementptr inbounds float* %tmp24321, i64 1
+  %tmp24323 = getelementptr inbounds float* %tmp24322, i64 1
+  %tmp24324 = getelementptr inbounds float* %tmp24323, i64 1
+  %tmp24325 = getelementptr inbounds float* %tmp24324, i64 1
+  %tmp24326 = getelementptr inbounds float* %tmp24325, i64 1
+  %tmp24327 = getelementptr inbounds float* %tmp24326, i64 1
+  %tmp24328 = getelementptr inbounds float* %tmp24327, i64 1
+  %tmp24329 = getelementptr inbounds float* %tmp24328, i64 1
+  %tmp24330 = getelementptr inbounds float* %tmp24329, i64 1
+  %tmp24331 = getelementptr inbounds float* %tmp24330, i64 1
+  %tmp24332 = getelementptr inbounds float* %tmp24331, i64 1
+  %tmp24333 = getelementptr inbounds float* %tmp24332, i64 1
+  %tmp24334 = getelementptr inbounds float* %tmp24333, i64 1
+  %tmp24335 = getelementptr inbounds float* %tmp24334, i64 1
+  %tmp24336 = getelementptr inbounds float* %tmp24335, i64 1
+  %tmp24337 = getelementptr inbounds float* %tmp24336, i64 1
+  %tmp24338 = getelementptr inbounds float* %tmp24337, i64 1
+  %tmp24339 = getelementptr inbounds float* %tmp24338, i64 1
+  %tmp24340 = getelementptr inbounds float* %tmp24339, i64 1
+  %tmp24341 = getelementptr inbounds float* %tmp24340, i64 1
+  %tmp24342 = getelementptr inbounds float* %tmp24341, i64 1
+  %tmp24343 = getelementptr inbounds float* %tmp24342, i64 1
+  %tmp24344 = getelementptr inbounds float* %tmp24343, i64 1
+  %tmp24345 = getelementptr inbounds float* %tmp24344, i64 1
+  %tmp24346 = getelementptr inbounds float* %tmp24345, i64 1
+  %tmp24347 = getelementptr inbounds float* %tmp24346, i64 1
+  %tmp24348 = getelementptr inbounds float* %tmp24347, i64 1
+  %tmp24349 = getelementptr inbounds float* %tmp24348, i64 1
+  %tmp24350 = getelementptr inbounds float* %tmp24349, i64 1
+  %tmp24351 = getelementptr inbounds float* %tmp24350, i64 1
+  %tmp24352 = getelementptr inbounds float* %tmp24351, i64 1
+  %tmp24353 = getelementptr inbounds float* %tmp24352, i64 1
+  %tmp24354 = getelementptr inbounds float* %tmp24353, i64 1
+  %tmp24355 = getelementptr inbounds float* %tmp24354, i64 1
+  %tmp24356 = getelementptr inbounds float* %tmp24355, i64 1
+  %tmp24357 = getelementptr inbounds float* %tmp24356, i64 1
+  %tmp24358 = getelementptr inbounds float* %tmp24357, i64 1
+  %tmp24359 = getelementptr inbounds float* %tmp24358, i64 1
+  %tmp24360 = getelementptr inbounds float* %tmp24359, i64 1
+  %tmp24361 = getelementptr inbounds float* %tmp24360, i64 1
+  %tmp24362 = getelementptr inbounds float* %tmp24361, i64 1
+  %tmp24363 = getelementptr inbounds float* %tmp24362, i64 1
+  %tmp24364 = getelementptr inbounds float* %tmp24363, i64 1
+  %tmp24365 = getelementptr inbounds float* %tmp24364, i64 1
+  %tmp24366 = getelementptr inbounds float* %tmp24365, i64 1
+  %tmp24367 = getelementptr inbounds float* %tmp24366, i64 1
+  %tmp24368 = getelementptr inbounds float* %tmp24367, i64 1
+  %tmp24369 = getelementptr inbounds float* %tmp24368, i64 1
+  %tmp24370 = getelementptr inbounds float* %tmp24369, i64 1
+  %tmp24371 = getelementptr inbounds float* %tmp24370, i64 1
+  %tmp24372 = getelementptr inbounds float* %tmp24371, i64 1
+  %tmp24373 = getelementptr inbounds float* %tmp24372, i64 1
+  %tmp24374 = getelementptr inbounds float* %tmp24373, i64 1
+  %tmp24375 = getelementptr inbounds float* %tmp24374, i64 1
+  %tmp24376 = getelementptr inbounds float* %tmp24375, i64 1
+  %tmp24377 = getelementptr inbounds float* %tmp24376, i64 1
+  %tmp24378 = getelementptr inbounds float* %tmp24377, i64 1
+  %tmp24379 = getelementptr inbounds float* %tmp24378, i64 1
+  %tmp24380 = getelementptr inbounds float* %tmp24379, i64 1
+  %tmp24381 = getelementptr inbounds float* %tmp24380, i64 1
+  %tmp24382 = getelementptr inbounds float* %tmp24381, i64 1
+  %tmp24383 = getelementptr inbounds float* %tmp24382, i64 1
+  %tmp24384 = getelementptr inbounds float* %tmp24383, i64 1
+  %tmp24385 = getelementptr inbounds float* %tmp24384, i64 1
+  %tmp24386 = getelementptr inbounds float* %tmp24385, i64 1
+  %tmp24387 = getelementptr inbounds float* %tmp24386, i64 1
+  %tmp24388 = getelementptr inbounds float* %tmp24387, i64 1
+  %tmp24389 = getelementptr inbounds float* %tmp24388, i64 1
+  %tmp24390 = getelementptr inbounds float* %tmp24389, i64 1
+  %tmp24391 = getelementptr inbounds float* %tmp24390, i64 1
+  %tmp24392 = getelementptr inbounds float* %tmp24391, i64 1
+  %tmp24393 = getelementptr inbounds float* %tmp24392, i64 1
+  %tmp24394 = getelementptr inbounds float* %tmp24393, i64 1
+  %tmp24395 = getelementptr inbounds float* %tmp24394, i64 1
+  %tmp24396 = getelementptr inbounds float* %tmp24395, i64 1
+  %tmp24397 = getelementptr inbounds float* %tmp24396, i64 1
+  %tmp24398 = getelementptr inbounds float* %tmp24397, i64 1
+  %tmp24399 = getelementptr inbounds float* %tmp24398, i64 1
+  %tmp24400 = getelementptr inbounds float* %tmp24399, i64 1
+  %tmp24401 = getelementptr inbounds float* %tmp24400, i64 1
+  %tmp24402 = getelementptr inbounds float* %tmp24401, i64 1
+  %tmp24403 = getelementptr inbounds float* %tmp24402, i64 1
+  %tmp24404 = getelementptr inbounds float* %tmp24403, i64 1
+  %tmp24405 = getelementptr inbounds float* %tmp24404, i64 1
+  %tmp24406 = getelementptr inbounds float* %tmp24405, i64 1
+  %tmp24407 = getelementptr inbounds float* %tmp24406, i64 1
+  %tmp24408 = getelementptr inbounds float* %tmp24407, i64 1
+  %tmp24409 = getelementptr inbounds float* %tmp24408, i64 1
+  %tmp24410 = getelementptr inbounds float* %tmp24409, i64 1
+  %tmp24411 = getelementptr inbounds float* %tmp24410, i64 1
+  %tmp24412 = getelementptr inbounds float* %tmp24411, i64 1
+  %tmp24413 = getelementptr inbounds float* %tmp24412, i64 1
+  %tmp24414 = getelementptr inbounds float* %tmp24413, i64 1
+  %tmp24415 = getelementptr inbounds float* %tmp24414, i64 1
+  %tmp24416 = getelementptr inbounds float* %tmp24415, i64 1
+  %tmp24417 = getelementptr inbounds float* %tmp24416, i64 1
+  %tmp24418 = getelementptr inbounds float* %tmp24417, i64 1
+  %tmp24419 = getelementptr inbounds float* %tmp24418, i64 1
+  %tmp24420 = getelementptr inbounds float* %tmp24419, i64 1
+  %tmp24421 = getelementptr inbounds float* %tmp24420, i64 1
+  %tmp24422 = getelementptr inbounds float* %tmp24421, i64 1
+  %tmp24423 = getelementptr inbounds float* %tmp24422, i64 1
+  %tmp24424 = getelementptr inbounds float* %tmp24423, i64 1
+  %tmp24425 = getelementptr inbounds float* %tmp24424, i64 1
+  %tmp24426 = getelementptr inbounds float* %tmp24425, i64 1
+  %tmp24427 = getelementptr inbounds float* %tmp24426, i64 1
+  %tmp24428 = getelementptr inbounds float* %tmp24427, i64 1
+  %tmp24429 = getelementptr inbounds float* %tmp24428, i64 1
+  %tmp24430 = getelementptr inbounds float* %tmp24429, i64 1
+  %tmp24431 = getelementptr inbounds float* %tmp24430, i64 1
+  %tmp24432 = getelementptr inbounds float* %tmp24431, i64 1
+  %tmp24433 = getelementptr inbounds float* %tmp24432, i64 1
+  %tmp24434 = getelementptr inbounds float* %tmp24433, i64 1
+  %tmp24435 = getelementptr inbounds float* %tmp24434, i64 1
+  %tmp24436 = getelementptr inbounds float* %tmp24435, i64 1
+  %tmp24437 = getelementptr inbounds float* %tmp24436, i64 1
+  %tmp24438 = getelementptr inbounds float* %tmp24437, i64 1
+  %tmp24439 = getelementptr inbounds float* %tmp24438, i64 1
+  %tmp24440 = getelementptr inbounds float* %tmp24439, i64 1
+  %tmp24441 = getelementptr inbounds float* %tmp24440, i64 1
+  %tmp24442 = getelementptr inbounds float* %tmp24441, i64 1
+  %tmp24443 = getelementptr inbounds float* %tmp24442, i64 1
+  %tmp24444 = getelementptr inbounds float* %tmp24443, i64 1
+  %tmp24445 = getelementptr inbounds float* %tmp24444, i64 1
+  %tmp24446 = getelementptr inbounds float* %tmp24445, i64 1
+  %tmp24447 = getelementptr inbounds float* %tmp24446, i64 1
+  %tmp24448 = getelementptr inbounds float* %tmp24447, i64 1
+  %tmp24449 = getelementptr inbounds float* %tmp24448, i64 1
+  %tmp24450 = getelementptr inbounds float* %tmp24449, i64 1
+  %tmp24451 = getelementptr inbounds float* %tmp24450, i64 1
+  %tmp24452 = getelementptr inbounds float* %tmp24451, i64 1
+  %tmp24453 = getelementptr inbounds float* %tmp24452, i64 1
+  %tmp24454 = getelementptr inbounds float* %tmp24453, i64 1
+  %tmp24455 = getelementptr inbounds float* %tmp24454, i64 1
+  %tmp24456 = getelementptr inbounds float* %tmp24455, i64 1
+  %tmp24457 = getelementptr inbounds float* %tmp24456, i64 1
+  %tmp24458 = getelementptr inbounds float* %tmp24457, i64 1
+  %tmp24459 = getelementptr inbounds float* %tmp24458, i64 1
+  %tmp24460 = getelementptr inbounds float* %tmp24459, i64 1
+  %tmp24461 = getelementptr inbounds float* %tmp24460, i64 1
+  %tmp24462 = getelementptr inbounds float* %tmp24461, i64 1
+  %tmp24463 = getelementptr inbounds float* %tmp24462, i64 1
+  %tmp24464 = getelementptr inbounds float* %tmp24463, i64 1
+  %tmp24465 = getelementptr inbounds float* %tmp24464, i64 1
+  %tmp24466 = getelementptr inbounds float* %tmp24465, i64 1
+  %tmp24467 = getelementptr inbounds float* %tmp24466, i64 1
+  %tmp24468 = getelementptr inbounds float* %tmp24467, i64 1
+  %tmp24469 = getelementptr inbounds float* %tmp24468, i64 1
+  %tmp24470 = getelementptr inbounds float* %tmp24469, i64 1
+  %tmp24471 = getelementptr inbounds float* %tmp24470, i64 1
+  %tmp24472 = getelementptr inbounds float* %tmp24471, i64 1
+  %tmp24473 = getelementptr inbounds float* %tmp24472, i64 1
+  %tmp24474 = getelementptr inbounds float* %tmp24473, i64 1
+  %tmp24475 = getelementptr inbounds float* %tmp24474, i64 1
+  %tmp24476 = getelementptr inbounds float* %tmp24475, i64 1
+  %tmp24477 = getelementptr inbounds float* %tmp24476, i64 1
+  %tmp24478 = getelementptr inbounds float* %tmp24477, i64 1
+  %tmp24479 = getelementptr inbounds float* %tmp24478, i64 1
+  %tmp24480 = getelementptr inbounds float* %tmp24479, i64 1
+  %tmp24481 = getelementptr inbounds float* %tmp24480, i64 1
+  %tmp24482 = getelementptr inbounds float* %tmp24481, i64 1
+  %tmp24483 = getelementptr inbounds float* %tmp24482, i64 1
+  %tmp24484 = getelementptr inbounds float* %tmp24483, i64 1
+  %tmp24485 = getelementptr inbounds float* %tmp24484, i64 1
+  %tmp24486 = getelementptr inbounds float* %tmp24485, i64 1
+  %tmp24487 = getelementptr inbounds float* %tmp24486, i64 1
+  %tmp24488 = getelementptr inbounds float* %tmp24487, i64 1
+  %tmp24489 = getelementptr inbounds float* %tmp24488, i64 1
+  %tmp24490 = getelementptr inbounds float* %tmp24489, i64 1
+  %tmp24491 = getelementptr inbounds float* %tmp24490, i64 1
+  %tmp24492 = getelementptr inbounds float* %tmp24491, i64 1
+  %tmp24493 = getelementptr inbounds float* %tmp24492, i64 1
+  %tmp24494 = getelementptr inbounds float* %tmp24493, i64 1
+  %tmp24495 = getelementptr inbounds float* %tmp24494, i64 1
+  %tmp24496 = getelementptr inbounds float* %tmp24495, i64 1
+  %tmp24497 = getelementptr inbounds float* %tmp24496, i64 1
+  %tmp24498 = getelementptr inbounds float* %tmp24497, i64 1
+  %tmp24499 = getelementptr inbounds float* %tmp24498, i64 1
+  %tmp24500 = getelementptr inbounds float* %tmp24499, i64 1
+  %tmp24501 = getelementptr inbounds float* %tmp24500, i64 1
+  %tmp24502 = getelementptr inbounds float* %tmp24501, i64 1
+  %tmp24503 = getelementptr inbounds float* %tmp24502, i64 1
+  %tmp24504 = getelementptr inbounds float* %tmp24503, i64 1
+  %tmp24505 = getelementptr inbounds float* %tmp24504, i64 1
+  %tmp24506 = getelementptr inbounds float* %tmp24505, i64 1
+  %tmp24507 = getelementptr inbounds float* %tmp24506, i64 1
+  %tmp24508 = getelementptr inbounds float* %tmp24507, i64 1
+  %tmp24509 = getelementptr inbounds float* %tmp24508, i64 1
+  %tmp24510 = getelementptr inbounds float* %tmp24509, i64 1
+  %tmp24511 = getelementptr inbounds float* %tmp24510, i64 1
+  %tmp24512 = getelementptr inbounds float* %tmp24511, i64 1
+  %tmp24513 = getelementptr inbounds float* %tmp24512, i64 1
+  %tmp24514 = getelementptr inbounds float* %tmp24513, i64 1
+  %tmp24515 = getelementptr inbounds float* %tmp24514, i64 1
+  %tmp24516 = getelementptr inbounds float* %tmp24515, i64 1
+  %tmp24517 = getelementptr inbounds float* %tmp24516, i64 1
+  %tmp24518 = getelementptr inbounds float* %tmp24517, i64 1
+  %tmp24519 = getelementptr inbounds float* %tmp24518, i64 1
+  %tmp24520 = getelementptr inbounds float* %tmp24519, i64 1
+  %tmp24521 = getelementptr inbounds float* %tmp24520, i64 1
+  %tmp24522 = getelementptr inbounds float* %tmp24521, i64 1
+  %tmp24523 = getelementptr inbounds float* %tmp24522, i64 1
+  %tmp24524 = getelementptr inbounds float* %tmp24523, i64 1
+  %tmp24525 = getelementptr inbounds float* %tmp24524, i64 1
+  %tmp24526 = getelementptr inbounds float* %tmp24525, i64 1
+  %tmp24527 = getelementptr inbounds float* %tmp24526, i64 1
+  %tmp24528 = getelementptr inbounds float* %tmp24527, i64 1
+  %tmp24529 = getelementptr inbounds float* %tmp24528, i64 1
+  %tmp24530 = getelementptr inbounds float* %tmp24529, i64 1
+  %tmp24531 = getelementptr inbounds float* %tmp24530, i64 1
+  %tmp24532 = getelementptr inbounds float* %tmp24531, i64 1
+  %tmp24533 = getelementptr inbounds float* %tmp24532, i64 1
+  %tmp24534 = getelementptr inbounds float* %tmp24533, i64 1
+  %tmp24535 = getelementptr inbounds float* %tmp24534, i64 1
+  %tmp24536 = getelementptr inbounds float* %tmp24535, i64 1
+  %tmp24537 = getelementptr inbounds float* %tmp24536, i64 1
+  %tmp24538 = getelementptr inbounds float* %tmp24537, i64 1
+  %tmp24539 = getelementptr inbounds float* %tmp24538, i64 1
+  %tmp24540 = getelementptr inbounds float* %tmp24539, i64 1
+  %tmp24541 = getelementptr inbounds float* %tmp24540, i64 1
+  %tmp24542 = getelementptr inbounds float* %tmp24541, i64 1
+  %tmp24543 = getelementptr inbounds float* %tmp24542, i64 1
+  %tmp24544 = getelementptr inbounds float* %tmp24543, i64 1
+  %tmp24545 = getelementptr inbounds float* %tmp24544, i64 1
+  %tmp24546 = getelementptr inbounds float* %tmp24545, i64 1
+  %tmp24547 = getelementptr inbounds float* %tmp24546, i64 1
+  %tmp24548 = getelementptr inbounds float* %tmp24547, i64 1
+  %tmp24549 = getelementptr inbounds float* %tmp24548, i64 1
+  %tmp24550 = getelementptr inbounds float* %tmp24549, i64 1
+  %tmp24551 = getelementptr inbounds float* %tmp24550, i64 1
+  %tmp24552 = getelementptr inbounds float* %tmp24551, i64 1
+  %tmp24553 = getelementptr inbounds float* %tmp24552, i64 1
+  %tmp24554 = getelementptr inbounds float* %tmp24553, i64 1
+  %tmp24555 = getelementptr inbounds float* %tmp24554, i64 1
+  %tmp24556 = getelementptr inbounds float* %tmp24555, i64 1
+  %tmp24557 = getelementptr inbounds float* %tmp24556, i64 1
+  %tmp24558 = getelementptr inbounds float* %tmp24557, i64 1
+  %tmp24559 = getelementptr inbounds float* %tmp24558, i64 1
+  %tmp24560 = getelementptr inbounds float* %tmp24559, i64 1
+  %tmp24561 = getelementptr inbounds float* %tmp24560, i64 1
+  %tmp24562 = getelementptr inbounds float* %tmp24561, i64 1
+  %tmp24563 = getelementptr inbounds float* %tmp24562, i64 1
+  %tmp24564 = getelementptr inbounds float* %tmp24563, i64 1
+  %tmp24565 = getelementptr inbounds float* %tmp24564, i64 1
+  %tmp24566 = getelementptr inbounds float* %tmp24565, i64 1
+  %tmp24567 = getelementptr inbounds float* %tmp24566, i64 1
+  %tmp24568 = getelementptr inbounds float* %tmp24567, i64 1
+  %tmp24569 = getelementptr inbounds float* %tmp24568, i64 1
+  %tmp24570 = getelementptr inbounds float* %tmp24569, i64 1
+  %tmp24571 = getelementptr inbounds float* %tmp24570, i64 1
+  %tmp24572 = getelementptr inbounds float* %tmp24571, i64 1
+  %tmp24573 = getelementptr inbounds float* %tmp24572, i64 1
+  %tmp24574 = getelementptr inbounds float* %tmp24573, i64 1
+  %tmp24575 = getelementptr inbounds float* %tmp24574, i64 1
+  %tmp24576 = getelementptr inbounds float* %tmp24575, i64 1
+  %tmp24577 = getelementptr inbounds float* %tmp24576, i64 1
+  %tmp24578 = getelementptr inbounds float* %tmp24577, i64 1
+  %tmp24579 = getelementptr inbounds float* %tmp24578, i64 1
+  %tmp24580 = getelementptr inbounds float* %tmp24579, i64 1
+  %tmp24581 = getelementptr inbounds float* %tmp24580, i64 1
+  %tmp24582 = getelementptr inbounds float* %tmp24581, i64 1
+  %tmp24583 = getelementptr inbounds float* %tmp24582, i64 1
+  %tmp24584 = getelementptr inbounds float* %tmp24583, i64 1
+  %tmp24585 = getelementptr inbounds float* %tmp24584, i64 1
+  %tmp24586 = getelementptr inbounds float* %tmp24585, i64 1
+  %tmp24587 = getelementptr inbounds float* %tmp24586, i64 1
+  %tmp24588 = getelementptr inbounds float* %tmp24587, i64 1
+  %tmp24589 = getelementptr inbounds float* %tmp24588, i64 1
+  %tmp24590 = getelementptr inbounds float* %tmp24589, i64 1
+  %tmp24591 = getelementptr inbounds float* %tmp24590, i64 1
+  %tmp24592 = getelementptr inbounds float* %tmp24591, i64 1
+  %tmp24593 = getelementptr inbounds float* %tmp24592, i64 1
+  %tmp24594 = getelementptr inbounds float* %tmp24593, i64 1
+  %tmp24595 = getelementptr inbounds float* %tmp24594, i64 1
+  %tmp24596 = getelementptr inbounds float* %tmp24595, i64 1
+  %tmp24597 = getelementptr inbounds float* %tmp24596, i64 1
+  %tmp24598 = getelementptr inbounds float* %tmp24597, i64 1
+  %tmp24599 = getelementptr inbounds float* %tmp24598, i64 1
+  %tmp24600 = getelementptr inbounds float* %tmp24599, i64 1
+  %tmp24601 = getelementptr inbounds float* %tmp24600, i64 1
+  %tmp24602 = getelementptr inbounds float* %tmp24601, i64 1
+  %tmp24603 = getelementptr inbounds float* %tmp24602, i64 1
+  %tmp24604 = getelementptr inbounds float* %tmp24603, i64 1
+  %tmp24605 = getelementptr inbounds float* %tmp24604, i64 1
+  %tmp24606 = getelementptr inbounds float* %tmp24605, i64 1
+  %tmp24607 = getelementptr inbounds float* %tmp24606, i64 1
+  %tmp24608 = getelementptr inbounds float* %tmp24607, i64 1
+  %tmp24609 = getelementptr inbounds float* %tmp24608, i64 1
+  %tmp24610 = getelementptr inbounds float* %tmp24609, i64 1
+  %tmp24611 = getelementptr inbounds float* %tmp24610, i64 1
+  %tmp24612 = getelementptr inbounds float* %tmp24611, i64 1
+  %tmp24613 = getelementptr inbounds float* %tmp24612, i64 1
+  %tmp24614 = getelementptr inbounds float* %tmp24613, i64 1
+  %tmp24615 = getelementptr inbounds float* %tmp24614, i64 1
+  %tmp24616 = getelementptr inbounds float* %tmp24615, i64 1
+  %tmp24617 = getelementptr inbounds float* %tmp24616, i64 1
+  %tmp24618 = getelementptr inbounds float* %tmp24617, i64 1
+  %tmp24619 = getelementptr inbounds float* %tmp24618, i64 1
+  %tmp24620 = getelementptr inbounds float* %tmp24619, i64 1
+  %tmp24621 = getelementptr inbounds float* %tmp24620, i64 1
+  %tmp24622 = getelementptr inbounds float* %tmp24621, i64 1
+  %tmp24623 = getelementptr inbounds float* %tmp24622, i64 1
+  %tmp24624 = getelementptr inbounds float* %tmp24623, i64 1
+  %tmp24625 = getelementptr inbounds float* %tmp24624, i64 1
+  %tmp24626 = getelementptr inbounds float* %tmp24625, i64 1
+  %tmp24627 = getelementptr inbounds float* %tmp24626, i64 1
+  %tmp24628 = getelementptr inbounds float* %tmp24627, i64 1
+  %tmp24629 = getelementptr inbounds float* %tmp24628, i64 1
+  %tmp24630 = getelementptr inbounds float* %tmp24629, i64 1
+  %tmp24631 = getelementptr inbounds float* %tmp24630, i64 1
+  %tmp24632 = getelementptr inbounds float* %tmp24631, i64 1
+  %tmp24633 = getelementptr inbounds float* %tmp24632, i64 1
+  %tmp24634 = getelementptr inbounds float* %tmp24633, i64 1
+  %tmp24635 = getelementptr inbounds float* %tmp24634, i64 1
+  %tmp24636 = getelementptr inbounds float* %tmp24635, i64 1
+  %tmp24637 = getelementptr inbounds float* %tmp24636, i64 1
+  %tmp24638 = getelementptr inbounds float* %tmp24637, i64 1
+  %tmp24639 = getelementptr inbounds float* %tmp24638, i64 1
+  %tmp24640 = getelementptr inbounds float* %tmp24639, i64 1
+  %tmp24641 = getelementptr inbounds float* %tmp24640, i64 1
+  %tmp24642 = getelementptr inbounds float* %tmp24641, i64 1
+  %tmp24643 = getelementptr inbounds float* %tmp24642, i64 1
+  %tmp24644 = getelementptr inbounds float* %tmp24643, i64 1
+  %tmp24645 = getelementptr inbounds float* %tmp24644, i64 1
+  %tmp24646 = getelementptr inbounds float* %tmp24645, i64 1
+  %tmp24647 = getelementptr inbounds float* %tmp24646, i64 1
+  %tmp24648 = getelementptr inbounds float* %tmp24647, i64 1
+  %tmp24649 = getelementptr inbounds float* %tmp24648, i64 1
+  %tmp24650 = getelementptr inbounds float* %tmp24649, i64 1
+  %tmp24651 = getelementptr inbounds float* %tmp24650, i64 1
+  %tmp24652 = getelementptr inbounds float* %tmp24651, i64 1
+  %tmp24653 = getelementptr inbounds float* %tmp24652, i64 1
+  %tmp24654 = getelementptr inbounds float* %tmp24653, i64 1
+  %tmp24655 = getelementptr inbounds float* %tmp24654, i64 1
+  %tmp24656 = getelementptr inbounds float* %tmp24655, i64 1
+  %tmp24657 = getelementptr inbounds float* %tmp24656, i64 1
+  %tmp24658 = getelementptr inbounds float* %tmp24657, i64 1
+  %tmp24659 = getelementptr inbounds float* %tmp24658, i64 1
+  %tmp24660 = getelementptr inbounds float* %tmp24659, i64 1
+  %tmp24661 = getelementptr inbounds float* %tmp24660, i64 1
+  %tmp24662 = getelementptr inbounds float* %tmp24661, i64 1
+  %tmp24663 = getelementptr inbounds float* %tmp24662, i64 1
+  %tmp24664 = getelementptr inbounds float* %tmp24663, i64 1
+  %tmp24665 = getelementptr inbounds float* %tmp24664, i64 1
+  %tmp24666 = getelementptr inbounds float* %tmp24665, i64 1
+  %tmp24667 = getelementptr inbounds float* %tmp24666, i64 1
+  %tmp24668 = getelementptr inbounds float* %tmp24667, i64 1
+  %tmp24669 = getelementptr inbounds float* %tmp24668, i64 1
+  %tmp24670 = getelementptr inbounds float* %tmp24669, i64 1
+  %tmp24671 = getelementptr inbounds float* %tmp24670, i64 1
+  %tmp24672 = getelementptr inbounds float* %tmp24671, i64 1
+  %tmp24673 = getelementptr inbounds float* %tmp24672, i64 1
+  %tmp24674 = getelementptr inbounds float* %tmp24673, i64 1
+  %tmp24675 = getelementptr inbounds float* %tmp24674, i64 1
+  %tmp24676 = getelementptr inbounds float* %tmp24675, i64 1
+  %tmp24677 = getelementptr inbounds float* %tmp24676, i64 1
+  %tmp24678 = getelementptr inbounds float* %tmp24677, i64 1
+  %tmp24679 = getelementptr inbounds float* %tmp24678, i64 1
+  %tmp24680 = getelementptr inbounds float* %tmp24679, i64 1
+  %tmp24681 = getelementptr inbounds float* %tmp24680, i64 1
+  %tmp24682 = getelementptr inbounds float* %tmp24681, i64 1
+  %tmp24683 = getelementptr inbounds float* %tmp24682, i64 1
+  %tmp24684 = getelementptr inbounds float* %tmp24683, i64 1
+  %tmp24685 = getelementptr inbounds float* %tmp24684, i64 1
+  %tmp24686 = getelementptr inbounds float* %tmp24685, i64 1
+  %tmp24687 = getelementptr inbounds float* %tmp24686, i64 1
+  %tmp24688 = getelementptr inbounds float* %tmp24687, i64 1
+  %tmp24689 = getelementptr inbounds float* %tmp24688, i64 1
+  %tmp24690 = getelementptr inbounds float* %tmp24689, i64 1
+  %tmp24691 = getelementptr inbounds float* %tmp24690, i64 1
+  %tmp24692 = getelementptr inbounds float* %tmp24691, i64 1
+  %tmp24693 = getelementptr inbounds float* %tmp24692, i64 1
+  %tmp24694 = getelementptr inbounds float* %tmp24693, i64 1
+  %tmp24695 = getelementptr inbounds float* %tmp24694, i64 1
+  %tmp24696 = getelementptr inbounds float* %tmp24695, i64 1
+  %tmp24697 = getelementptr inbounds float* %tmp24696, i64 1
+  %tmp24698 = getelementptr inbounds float* %tmp24697, i64 1
+  %tmp24699 = getelementptr inbounds float* %tmp24698, i64 1
+  %tmp24700 = getelementptr inbounds float* %tmp24699, i64 1
+  %tmp24701 = getelementptr inbounds float* %tmp24700, i64 1
+  %tmp24702 = getelementptr inbounds float* %tmp24701, i64 1
+  %tmp24703 = getelementptr inbounds float* %tmp24702, i64 1
+  %tmp24704 = getelementptr inbounds float* %tmp24703, i64 1
+  %tmp24705 = getelementptr inbounds float* %tmp24704, i64 1
+  %tmp24706 = getelementptr inbounds float* %tmp24705, i64 1
+  %tmp24707 = getelementptr inbounds float* %tmp24706, i64 1
+  %tmp24708 = getelementptr inbounds float* %tmp24707, i64 1
+  %tmp24709 = getelementptr inbounds float* %tmp24708, i64 1
+  %tmp24710 = getelementptr inbounds float* %tmp24709, i64 1
+  %tmp24711 = getelementptr inbounds float* %tmp24710, i64 1
+  %tmp24712 = getelementptr inbounds float* %tmp24711, i64 1
+  %tmp24713 = getelementptr inbounds float* %tmp24712, i64 1
+  %tmp24714 = getelementptr inbounds float* %tmp24713, i64 1
+  %tmp24715 = getelementptr inbounds float* %tmp24714, i64 1
+  %tmp24716 = getelementptr inbounds float* %tmp24715, i64 1
+  %tmp24717 = getelementptr inbounds float* %tmp24716, i64 1
+  %tmp24718 = getelementptr inbounds float* %tmp24717, i64 1
+  %tmp24719 = getelementptr inbounds float* %tmp24718, i64 1
+  %tmp24720 = getelementptr inbounds float* %tmp24719, i64 1
+  %tmp24721 = getelementptr inbounds float* %tmp24720, i64 1
+  %tmp24722 = getelementptr inbounds float* %tmp24721, i64 1
+  %tmp24723 = getelementptr inbounds float* %tmp24722, i64 1
+  %tmp24724 = getelementptr inbounds float* %tmp24723, i64 1
+  %tmp24725 = getelementptr inbounds float* %tmp24724, i64 1
+  %tmp24726 = getelementptr inbounds float* %tmp24725, i64 1
+  %tmp24727 = getelementptr inbounds float* %tmp24726, i64 1
+  %tmp24728 = getelementptr inbounds float* %tmp24727, i64 1
+  %tmp24729 = getelementptr inbounds float* %tmp24728, i64 1
+  %tmp24730 = getelementptr inbounds float* %tmp24729, i64 1
+  %tmp24731 = getelementptr inbounds float* %tmp24730, i64 1
+  %tmp24732 = getelementptr inbounds float* %tmp24731, i64 1
+  %tmp24733 = getelementptr inbounds float* %tmp24732, i64 1
+  %tmp24734 = getelementptr inbounds float* %tmp24733, i64 1
+  %tmp24735 = getelementptr inbounds float* %tmp24734, i64 1
+  %tmp24736 = getelementptr inbounds float* %tmp24735, i64 1
+  %tmp24737 = getelementptr inbounds float* %tmp24736, i64 1
+  %tmp24738 = getelementptr inbounds float* %tmp24737, i64 1
+  %tmp24739 = getelementptr inbounds float* %tmp24738, i64 1
+  %tmp24740 = getelementptr inbounds float* %tmp24739, i64 1
+  %tmp24741 = getelementptr inbounds float* %tmp24740, i64 1
+  %tmp24742 = getelementptr inbounds float* %tmp24741, i64 1
+  %tmp24743 = getelementptr inbounds float* %tmp24742, i64 1
+  %tmp24744 = getelementptr inbounds float* %tmp24743, i64 1
+  %tmp24745 = getelementptr inbounds float* %tmp24744, i64 1
+  %tmp24746 = getelementptr inbounds float* %tmp24745, i64 1
+  %tmp24747 = getelementptr inbounds float* %tmp24746, i64 1
+  %tmp24748 = getelementptr inbounds float* %tmp24747, i64 1
+  %tmp24749 = getelementptr inbounds float* %tmp24748, i64 1
+  %tmp24750 = getelementptr inbounds float* %tmp24749, i64 1
+  %tmp24751 = getelementptr inbounds float* %tmp24750, i64 1
+  %tmp24752 = getelementptr inbounds float* %tmp24751, i64 1
+  %tmp24753 = getelementptr inbounds float* %tmp24752, i64 1
+  %tmp24754 = getelementptr inbounds float* %tmp24753, i64 1
+  %tmp24755 = getelementptr inbounds float* %tmp24754, i64 1
+  %tmp24756 = getelementptr inbounds float* %tmp24755, i64 1
+  %tmp24757 = getelementptr inbounds float* %tmp24756, i64 1
+  %tmp24758 = getelementptr inbounds float* %tmp24757, i64 1
+  %tmp24759 = getelementptr inbounds float* %tmp24758, i64 1
+  %tmp24760 = getelementptr inbounds float* %tmp24759, i64 1
+  %tmp24761 = getelementptr inbounds float* %tmp24760, i64 1
+  %tmp24762 = getelementptr inbounds float* %tmp24761, i64 1
+  %tmp24763 = getelementptr inbounds float* %tmp24762, i64 1
+  %tmp24764 = getelementptr inbounds float* %tmp24763, i64 1
+  %tmp24765 = getelementptr inbounds float* %tmp24764, i64 1
+  %tmp24766 = getelementptr inbounds float* %tmp24765, i64 1
+  %tmp24767 = getelementptr inbounds float* %tmp24766, i64 1
+  %tmp24768 = getelementptr inbounds float* %tmp24767, i64 1
+  %tmp24769 = getelementptr inbounds float* %tmp24768, i64 1
+  %tmp24770 = getelementptr inbounds float* %tmp24769, i64 1
+  %tmp24771 = getelementptr inbounds float* %tmp24770, i64 1
+  %tmp24772 = getelementptr inbounds float* %tmp24771, i64 1
+  %tmp24773 = getelementptr inbounds float* %tmp24772, i64 1
+  %tmp24774 = getelementptr inbounds float* %tmp24773, i64 1
+  %tmp24775 = getelementptr inbounds float* %tmp24774, i64 1
+  %tmp24776 = getelementptr inbounds float* %tmp24775, i64 1
+  %tmp24777 = getelementptr inbounds float* %tmp24776, i64 1
+  %tmp24778 = getelementptr inbounds float* %tmp24777, i64 1
+  %tmp24779 = getelementptr inbounds float* %tmp24778, i64 1
+  %tmp24780 = getelementptr inbounds float* %tmp24779, i64 1
+  %tmp24781 = getelementptr inbounds float* %tmp24780, i64 1
+  %tmp24782 = getelementptr inbounds float* %tmp24781, i64 1
+  %tmp24783 = getelementptr inbounds float* %tmp24782, i64 1
+  %tmp24784 = getelementptr inbounds float* %tmp24783, i64 1
+  %tmp24785 = getelementptr inbounds float* %tmp24784, i64 1
+  %tmp24786 = getelementptr inbounds float* %tmp24785, i64 1
+  %tmp24787 = getelementptr inbounds float* %tmp24786, i64 1
+  %tmp24788 = getelementptr inbounds float* %tmp24787, i64 1
+  %tmp24789 = getelementptr inbounds float* %tmp24788, i64 1
+  %tmp24790 = getelementptr inbounds float* %tmp24789, i64 1
+  %tmp24791 = getelementptr inbounds float* %tmp24790, i64 1
+  %tmp24792 = getelementptr inbounds float* %tmp24791, i64 1
+  %tmp24793 = getelementptr inbounds float* %tmp24792, i64 1
+  %tmp24794 = getelementptr inbounds float* %tmp24793, i64 1
+  %tmp24795 = getelementptr inbounds float* %tmp24794, i64 1
+  %tmp24796 = getelementptr inbounds float* %tmp24795, i64 1
+  %tmp24797 = getelementptr inbounds float* %tmp24796, i64 1
+  %tmp24798 = getelementptr inbounds float* %tmp24797, i64 1
+  %tmp24799 = getelementptr inbounds float* %tmp24798, i64 1
+  %tmp24800 = getelementptr inbounds float* %tmp24799, i64 1
+  %tmp24801 = getelementptr inbounds float* %tmp24800, i64 1
+  %tmp24802 = getelementptr inbounds float* %tmp24801, i64 1
+  %tmp24803 = getelementptr inbounds float* %tmp24802, i64 1
+  %tmp24804 = getelementptr inbounds float* %tmp24803, i64 1
+  %tmp24805 = getelementptr inbounds float* %tmp24804, i64 1
+  %tmp24806 = getelementptr inbounds float* %tmp24805, i64 1
+  %tmp24807 = getelementptr inbounds float* %tmp24806, i64 1
+  %tmp24808 = getelementptr inbounds float* %tmp24807, i64 1
+  %tmp24809 = getelementptr inbounds float* %tmp24808, i64 1
+  %tmp24810 = getelementptr inbounds float* %tmp24809, i64 1
+  %tmp24811 = getelementptr inbounds float* %tmp24810, i64 1
+  %tmp24812 = getelementptr inbounds float* %tmp24811, i64 1
+  %tmp24813 = getelementptr inbounds float* %tmp24812, i64 1
+  %tmp24814 = getelementptr inbounds float* %tmp24813, i64 1
+  %tmp24815 = getelementptr inbounds float* %tmp24814, i64 1
+  %tmp24816 = getelementptr inbounds float* %tmp24815, i64 1
+  %tmp24817 = getelementptr inbounds float* %tmp24816, i64 1
+  %tmp24818 = getelementptr inbounds float* %tmp24817, i64 1
+  %tmp24819 = getelementptr inbounds float* %tmp24818, i64 1
+  %tmp24820 = getelementptr inbounds float* %tmp24819, i64 1
+  %tmp24821 = getelementptr inbounds float* %tmp24820, i64 1
+  %tmp24822 = getelementptr inbounds float* %tmp24821, i64 1
+  %tmp24823 = getelementptr inbounds float* %tmp24822, i64 1
+  %tmp24824 = getelementptr inbounds float* %tmp24823, i64 1
+  %tmp24825 = getelementptr inbounds float* %tmp24824, i64 1
+  %tmp24826 = getelementptr inbounds float* %tmp24825, i64 1
+  %tmp24827 = getelementptr inbounds float* %tmp24826, i64 1
+  %tmp24828 = getelementptr inbounds float* %tmp24827, i64 1
+  %tmp24829 = getelementptr inbounds float* %tmp24828, i64 1
+  %tmp24830 = getelementptr inbounds float* %tmp24829, i64 1
+  %tmp24831 = getelementptr inbounds float* %tmp24830, i64 1
+  %tmp24832 = getelementptr inbounds float* %tmp24831, i64 1
+  %tmp24833 = getelementptr inbounds float* %tmp24832, i64 1
+  %tmp24834 = getelementptr inbounds float* %tmp24833, i64 1
+  %tmp24835 = getelementptr inbounds float* %tmp24834, i64 1
+  %tmp24836 = getelementptr inbounds float* %tmp24835, i64 1
+  %tmp24837 = getelementptr inbounds float* %tmp24836, i64 1
+  %tmp24838 = getelementptr inbounds float* %tmp24837, i64 1
+  %tmp24839 = getelementptr inbounds float* %tmp24838, i64 1
+  %tmp24840 = getelementptr inbounds float* %tmp24839, i64 1
+  %tmp24841 = getelementptr inbounds float* %tmp24840, i64 1
+  %tmp24842 = getelementptr inbounds float* %tmp24841, i64 1
+  %tmp24843 = getelementptr inbounds float* %tmp24842, i64 1
+  %tmp24844 = getelementptr inbounds float* %tmp24843, i64 1
+  %tmp24845 = getelementptr inbounds float* %tmp24844, i64 1
+  %tmp24846 = getelementptr inbounds float* %tmp24845, i64 1
+  %tmp24847 = getelementptr inbounds float* %tmp24846, i64 1
+  %tmp24848 = getelementptr inbounds float* %tmp24847, i64 1
+  %tmp24849 = getelementptr inbounds float* %tmp24848, i64 1
+  %tmp24850 = getelementptr inbounds float* %tmp24849, i64 1
+  %tmp24851 = getelementptr inbounds float* %tmp24850, i64 1
+  %tmp24852 = getelementptr inbounds float* %tmp24851, i64 1
+  %tmp24853 = getelementptr inbounds float* %tmp24852, i64 1
+  %tmp24854 = getelementptr inbounds float* %tmp24853, i64 1
+  %tmp24855 = getelementptr inbounds float* %tmp24854, i64 1
+  %tmp24856 = getelementptr inbounds float* %tmp24855, i64 1
+  %tmp24857 = getelementptr inbounds float* %tmp24856, i64 1
+  %tmp24858 = getelementptr inbounds float* %tmp24857, i64 1
+  %tmp24859 = getelementptr inbounds float* %tmp24858, i64 1
+  %tmp24860 = getelementptr inbounds float* %tmp24859, i64 1
+  %tmp24861 = getelementptr inbounds float* %tmp24860, i64 1
+  %tmp24862 = getelementptr inbounds float* %tmp24861, i64 1
+  %tmp24863 = getelementptr inbounds float* %tmp24862, i64 1
+  %tmp24864 = getelementptr inbounds float* %tmp24863, i64 1
+  %tmp24865 = getelementptr inbounds float* %tmp24864, i64 1
+  %tmp24866 = getelementptr inbounds float* %tmp24865, i64 1
+  %tmp24867 = getelementptr inbounds float* %tmp24866, i64 1
+  %tmp24868 = getelementptr inbounds float* %tmp24867, i64 1
+  %tmp24869 = getelementptr inbounds float* %tmp24868, i64 1
+  %tmp24870 = getelementptr inbounds float* %tmp24869, i64 1
+  %tmp24871 = getelementptr inbounds float* %tmp24870, i64 1
+  %tmp24872 = getelementptr inbounds float* %tmp24871, i64 1
+  %tmp24873 = getelementptr inbounds float* %tmp24872, i64 1
+  %tmp24874 = getelementptr inbounds float* %tmp24873, i64 1
+  %tmp24875 = getelementptr inbounds float* %tmp24874, i64 1
+  %tmp24876 = getelementptr inbounds float* %tmp24875, i64 1
+  %tmp24877 = getelementptr inbounds float* %tmp24876, i64 1
+  %tmp24878 = getelementptr inbounds float* %tmp24877, i64 1
+  %tmp24879 = getelementptr inbounds float* %tmp24878, i64 1
+  %tmp24880 = getelementptr inbounds float* %tmp24879, i64 1
+  %tmp24881 = getelementptr inbounds float* %tmp24880, i64 1
+  %tmp24882 = getelementptr inbounds float* %tmp24881, i64 1
+  %tmp24883 = getelementptr inbounds float* %tmp24882, i64 1
+  %tmp24884 = getelementptr inbounds float* %tmp24883, i64 1
+  %tmp24885 = getelementptr inbounds float* %tmp24884, i64 1
+  %tmp24886 = getelementptr inbounds float* %tmp24885, i64 1
+  %tmp24887 = getelementptr inbounds float* %tmp24886, i64 1
+  %tmp24888 = getelementptr inbounds float* %tmp24887, i64 1
+  %tmp24889 = getelementptr inbounds float* %tmp24888, i64 1
+  %tmp24890 = getelementptr inbounds float* %tmp24889, i64 1
+  %tmp24891 = getelementptr inbounds float* %tmp24890, i64 1
+  %tmp24892 = getelementptr inbounds float* %tmp24891, i64 1
+  %tmp24893 = getelementptr inbounds float* %tmp24892, i64 1
+  %tmp24894 = getelementptr inbounds float* %tmp24893, i64 1
+  %tmp24895 = getelementptr inbounds float* %tmp24894, i64 1
+  %tmp24896 = getelementptr inbounds float* %tmp24895, i64 1
+  %tmp24897 = getelementptr inbounds float* %tmp24896, i64 1
+  %tmp24898 = getelementptr inbounds float* %tmp24897, i64 1
+  %tmp24899 = getelementptr inbounds float* %tmp24898, i64 1
+  %tmp24900 = getelementptr inbounds float* %tmp24899, i64 1
+  %tmp24901 = getelementptr inbounds float* %tmp24900, i64 1
+  %tmp24902 = getelementptr inbounds float* %tmp24901, i64 1
+  %tmp24903 = getelementptr inbounds float* %tmp24902, i64 1
+  %tmp24904 = getelementptr inbounds float* %tmp24903, i64 1
+  %tmp24905 = getelementptr inbounds float* %tmp24904, i64 1
+  %tmp24906 = getelementptr inbounds float* %tmp24905, i64 1
+  %tmp24907 = getelementptr inbounds float* %tmp24906, i64 1
+  %tmp24908 = getelementptr inbounds float* %tmp24907, i64 1
+  %tmp24909 = getelementptr inbounds float* %tmp24908, i64 1
+  %tmp24910 = getelementptr inbounds float* %tmp24909, i64 1
+  %tmp24911 = getelementptr inbounds float* %tmp24910, i64 1
+  %tmp24912 = getelementptr inbounds float* %tmp24911, i64 1
+  %tmp24913 = getelementptr inbounds float* %tmp24912, i64 1
+  %tmp24914 = getelementptr inbounds float* %tmp24913, i64 1
+  %tmp24915 = getelementptr inbounds float* %tmp24914, i64 1
+  %tmp24916 = getelementptr inbounds float* %tmp24915, i64 1
+  %tmp24917 = getelementptr inbounds float* %tmp24916, i64 1
+  %tmp24918 = getelementptr inbounds float* %tmp24917, i64 1
+  %tmp24919 = getelementptr inbounds float* %tmp24918, i64 1
+  %tmp24920 = getelementptr inbounds float* %tmp24919, i64 1
+  %tmp24921 = getelementptr inbounds float* %tmp24920, i64 1
+  %tmp24922 = getelementptr inbounds float* %tmp24921, i64 1
+  %tmp24923 = getelementptr inbounds float* %tmp24922, i64 1
+  %tmp24924 = getelementptr inbounds float* %tmp24923, i64 1
+  %tmp24925 = getelementptr inbounds float* %tmp24924, i64 1
+  %tmp24926 = getelementptr inbounds float* %tmp24925, i64 1
+  %tmp24927 = getelementptr inbounds float* %tmp24926, i64 1
+  %tmp24928 = getelementptr inbounds float* %tmp24927, i64 1
+  %tmp24929 = getelementptr inbounds float* %tmp24928, i64 1
+  %tmp24930 = getelementptr inbounds float* %tmp24929, i64 1
+  %tmp24931 = getelementptr inbounds float* %tmp24930, i64 1
+  %tmp24932 = getelementptr inbounds float* %tmp24931, i64 1
+  %tmp24933 = getelementptr inbounds float* %tmp24932, i64 1
+  %tmp24934 = getelementptr inbounds float* %tmp24933, i64 1
+  %tmp24935 = getelementptr inbounds float* %tmp24934, i64 1
+  %tmp24936 = getelementptr inbounds float* %tmp24935, i64 1
+  %tmp24937 = getelementptr inbounds float* %tmp24936, i64 1
+  %tmp24938 = getelementptr inbounds float* %tmp24937, i64 1
+  %tmp24939 = getelementptr inbounds float* %tmp24938, i64 1
+  %tmp24940 = getelementptr inbounds float* %tmp24939, i64 1
+  %tmp24941 = getelementptr inbounds float* %tmp24940, i64 1
+  %tmp24942 = getelementptr inbounds float* %tmp24941, i64 1
+  %tmp24943 = getelementptr inbounds float* %tmp24942, i64 1
+  %tmp24944 = getelementptr inbounds float* %tmp24943, i64 1
+  %tmp24945 = getelementptr inbounds float* %tmp24944, i64 1
+  %tmp24946 = getelementptr inbounds float* %tmp24945, i64 1
+  store float 0x3F43FD0D00000000, float* %tmp24946
+  %tmp24947 = getelementptr inbounds float* undef, i64 1
+  %tmp24948 = getelementptr inbounds float* undef, i64 1
+  %tmp24949 = getelementptr inbounds float* undef, i64 1
+  %tmp24950 = getelementptr inbounds float* undef, i64 1
+  %tmp24951 = getelementptr inbounds float* %tmp24950, i64 1
+  %tmp24952 = getelementptr inbounds float* undef, i64 1
+  %tmp24953 = getelementptr inbounds float* undef, i64 1
+  %tmp24954 = getelementptr inbounds float* undef, i64 1
+  %tmp24955 = getelementptr inbounds float* undef, i64 1
+  %tmp24956 = getelementptr inbounds float* undef, i64 1
+  %tmp24957 = getelementptr inbounds float* undef, i64 1
+  %tmp24958 = getelementptr inbounds float* %tmp24957, i64 1
+  %tmp24959 = getelementptr inbounds float* undef, i64 1
+  %tmp24960 = getelementptr inbounds float* undef, i64 1
+  %tmp24961 = getelementptr inbounds float* undef, i64 1
+  %tmp24962 = getelementptr inbounds float* undef, i64 1
+  %tmp24963 = getelementptr inbounds float* undef, i64 1
+  %tmp24964 = getelementptr inbounds float* undef, i64 1
+  %tmp24965 = getelementptr inbounds float* undef, i64 1
+  %tmp24966 = getelementptr inbounds float* %tmp24965, i64 1
+  %tmp24967 = getelementptr inbounds float* undef, i64 1
+  %tmp24968 = getelementptr inbounds float* undef, i64 1
+  %tmp24969 = getelementptr inbounds float* undef, i64 1
+  %tmp24970 = getelementptr inbounds float* undef, i64 1
+  %tmp24971 = getelementptr inbounds float* %tmp24970, i64 1
+  %tmp24972 = getelementptr inbounds float* %tmp24971, i64 1
+  %tmp24973 = getelementptr inbounds float* %tmp24972, i64 1
+  %tmp24974 = getelementptr inbounds float* undef, i64 1
+  %tmp24975 = getelementptr inbounds float* undef, i64 1
+  %tmp24976 = getelementptr inbounds float* %tmp24975, i64 1
+  %tmp24977 = getelementptr inbounds float* undef, i64 1
+  %tmp24978 = getelementptr inbounds float* undef, i64 1
+  %tmp24979 = getelementptr inbounds float* undef, i64 1
+  %tmp24980 = getelementptr inbounds float* undef, i64 1
+  %tmp24981 = getelementptr inbounds float* undef, i64 1
+  %tmp24982 = getelementptr inbounds float* undef, i64 1
+  %tmp24983 = getelementptr inbounds float* %tmp24982, i64 1
+  %tmp24984 = getelementptr inbounds float* undef, i64 1
+  %tmp24985 = getelementptr inbounds float* %tmp24984, i64 1
+  %tmp24986 = getelementptr inbounds float* undef, i64 1
+  %tmp24987 = getelementptr inbounds float* %tmp24986, i64 1
+  %tmp24988 = getelementptr inbounds float* %tmp24987, i64 1
+  %tmp24989 = getelementptr inbounds float* undef, i64 1
+  %tmp24990 = getelementptr inbounds float* undef, i64 1
+  %tmp24991 = getelementptr inbounds float* %tmp24990, i64 1
+  %tmp24992 = getelementptr inbounds float* undef, i64 1
+  %tmp24993 = getelementptr inbounds float* %tmp24992, i64 1
+  %tmp24994 = getelementptr inbounds float* %tmp24993, i64 1
+  %tmp24995 = getelementptr inbounds float* undef, i64 1
+  %tmp24996 = getelementptr inbounds float* undef, i64 1
+  %tmp24997 = getelementptr inbounds float* undef, i64 1
+  %tmp24998 = getelementptr inbounds float* undef, i64 1
+  %tmp24999 = getelementptr inbounds float* undef, i64 1
+  %tmp25000 = getelementptr inbounds float* undef, i64 1
+  %tmp25001 = getelementptr inbounds float* undef, i64 1
+  %tmp25002 = getelementptr inbounds float* undef, i64 1
+  %tmp25003 = getelementptr inbounds float* undef, i64 1
+  %tmp25004 = getelementptr inbounds float* undef, i64 1
+  %tmp25005 = getelementptr inbounds float* undef, i64 1
+  %tmp25006 = getelementptr inbounds float* undef, i64 1
+  %tmp25007 = getelementptr inbounds float* undef, i64 1
+  %tmp25008 = getelementptr inbounds float* undef, i64 1
+  %tmp25009 = getelementptr inbounds float* undef, i64 1
+  %tmp25010 = getelementptr inbounds float* undef, i64 1
+  %tmp25011 = getelementptr inbounds float* undef, i64 1
+  %tmp25012 = getelementptr inbounds float* %tmp25011, i64 1
+  %tmp25013 = getelementptr inbounds float* undef, i64 1
+  %tmp25014 = getelementptr inbounds float* undef, i64 1
+  %tmp25015 = getelementptr inbounds float* undef, i64 1
+  %tmp25016 = getelementptr inbounds float* undef, i64 1
+  %tmp25017 = getelementptr inbounds float* %tmp25016, i64 1
+  %tmp25018 = getelementptr inbounds float* undef, i64 1
+  %tmp25019 = getelementptr inbounds float* undef, i64 1
+  %tmp25020 = getelementptr inbounds float* undef, i64 1
+  %tmp25021 = getelementptr inbounds float* undef, i64 1
+  %tmp25022 = getelementptr inbounds float* undef, i64 1
+  %tmp25023 = getelementptr inbounds float* %tmp25022, i64 1
+  %tmp25024 = getelementptr inbounds float* %tmp25023, i64 1
+  %tmp25025 = getelementptr inbounds float* undef, i64 1
+  %tmp25026 = getelementptr inbounds float* undef, i64 1
+  %tmp25027 = getelementptr inbounds float* undef, i64 1
+  %tmp25028 = getelementptr inbounds float* undef, i64 1
+  %tmp25029 = getelementptr inbounds float* undef, i64 1
+  %tmp25030 = getelementptr inbounds float* undef, i64 1
+  %tmp25031 = getelementptr inbounds float* undef, i64 1
+  %tmp25032 = getelementptr inbounds float* undef, i64 1
+  %tmp25033 = getelementptr inbounds float* undef, i64 1
+  %tmp25034 = getelementptr inbounds float* undef, i64 1
+  %tmp25035 = getelementptr inbounds float* %tmp25034, i64 1
+  %tmp25036 = getelementptr inbounds float* undef, i64 1
+  %tmp25037 = getelementptr inbounds float* undef, i64 1
+  %tmp25038 = getelementptr inbounds float* %tmp25037, i64 1
+  %tmp25039 = getelementptr inbounds float* undef, i64 1
+  %tmp25040 = getelementptr inbounds float* undef, i64 1
+  %tmp25041 = getelementptr inbounds float* undef, i64 1
+  %tmp25042 = getelementptr inbounds float* undef, i64 1
+  %tmp25043 = getelementptr inbounds float* undef, i64 1
+  %tmp25044 = getelementptr inbounds float* undef, i64 1
+  %tmp25045 = getelementptr inbounds float* %tmp25044, i64 1
+  %tmp25046 = getelementptr inbounds float* undef, i64 1
+  %tmp25047 = getelementptr inbounds float* %tmp25046, i64 1
+  %tmp25048 = getelementptr inbounds float* undef, i64 1
+  %tmp25049 = getelementptr inbounds float* %tmp25048, i64 1
+  %tmp25050 = getelementptr inbounds float* %tmp25049, i64 1
+  %tmp25051 = getelementptr inbounds float* undef, i64 1
+  %tmp25052 = getelementptr inbounds float* undef, i64 1
+  %tmp25053 = getelementptr inbounds float* undef, i64 1
+  %tmp25054 = getelementptr inbounds float* undef, i64 1
+  %tmp25055 = getelementptr inbounds float* undef, i64 1
+  %tmp25056 = getelementptr inbounds float* undef, i64 1
+  %tmp25057 = getelementptr inbounds float* undef, i64 1
+  %tmp25058 = getelementptr inbounds float* undef, i64 1
+  %tmp25059 = getelementptr inbounds float* undef, i64 1
+  %tmp25060 = getelementptr inbounds float* undef, i64 1
+  %tmp25061 = getelementptr inbounds float* undef, i64 1
+  %tmp25062 = getelementptr inbounds float* undef, i64 1
+  %tmp25063 = getelementptr inbounds float* undef, i64 1
+  %tmp25064 = getelementptr inbounds float* undef, i64 1
+  %tmp25065 = getelementptr inbounds float* undef, i64 1
+  %tmp25066 = getelementptr inbounds float* undef, i64 1
+  %tmp25067 = getelementptr inbounds float* %tmp25066, i64 1
+  %tmp25068 = getelementptr inbounds float* undef, i64 1
+  %tmp25069 = getelementptr inbounds float* %tmp25068, i64 1
+  %tmp25070 = getelementptr inbounds float* undef, i64 1
+  %tmp25071 = getelementptr inbounds float* undef, i64 1
+  %tmp25072 = getelementptr inbounds float* undef, i64 1
+  %tmp25073 = getelementptr inbounds float* undef, i64 1
+  %tmp25074 = getelementptr inbounds float* undef, i64 1
+  %tmp25075 = getelementptr inbounds float* %tmp25074, i64 1
+  %tmp25076 = getelementptr inbounds float* undef, i64 1
+  %tmp25077 = getelementptr inbounds float* undef, i64 1
+  %tmp25078 = getelementptr inbounds float* undef, i64 1
+  %tmp25079 = getelementptr inbounds float* undef, i64 1
+  %tmp25080 = getelementptr inbounds float* undef, i64 1
+  %tmp25081 = getelementptr inbounds float* undef, i64 1
+  %tmp25082 = getelementptr inbounds float* undef, i64 1
+  %tmp25083 = getelementptr inbounds float* undef, i64 1
+  %tmp25084 = getelementptr inbounds float* undef, i64 1
+  %tmp25085 = getelementptr inbounds float* undef, i64 1
+  %tmp25086 = getelementptr inbounds float* undef, i64 1
+  %tmp25087 = getelementptr inbounds float* undef, i64 1
+  %tmp25088 = getelementptr inbounds float* undef, i64 1
+  %tmp25089 = getelementptr inbounds float* undef, i64 1
+  %tmp25090 = getelementptr inbounds float* undef, i64 1
+  %tmp25091 = getelementptr inbounds float* undef, i64 1
+  %tmp25092 = getelementptr inbounds float* undef, i64 1
+  %tmp25093 = getelementptr inbounds float* undef, i64 1
+  %tmp25094 = getelementptr inbounds float* undef, i64 1
+  %tmp25095 = getelementptr inbounds float* %tmp25094, i64 1
+  %tmp25096 = getelementptr inbounds float* undef, i64 1
+  %tmp25097 = getelementptr inbounds float* %tmp25096, i64 1
+  %tmp25098 = getelementptr inbounds float* %tmp25097, i64 1
+  %tmp25099 = getelementptr inbounds float* undef, i64 1
+  %tmp25100 = getelementptr inbounds float* undef, i64 1
+  %tmp25101 = getelementptr inbounds float* undef, i64 1
+  %tmp25102 = getelementptr inbounds float* undef, i64 1
+  %tmp25103 = getelementptr inbounds float* undef, i64 1
+  %tmp25104 = getelementptr inbounds float* undef, i64 1
+  %tmp25105 = getelementptr inbounds float* undef, i64 1
+  %tmp25106 = getelementptr inbounds float* undef, i64 1
+  %tmp25107 = getelementptr inbounds float* %tmp25106, i64 1
+  %tmp25108 = getelementptr inbounds float* undef, i64 1
+  %tmp25109 = getelementptr inbounds float* undef, i64 1
+  %tmp25110 = getelementptr inbounds float* undef, i64 1
+  %tmp25111 = getelementptr inbounds float* undef, i64 1
+  %tmp25112 = getelementptr inbounds float* undef, i64 1
+  %tmp25113 = getelementptr inbounds float* undef, i64 1
+  %tmp25114 = getelementptr inbounds float* undef, i64 1
+  %tmp25115 = getelementptr inbounds float* undef, i64 1
+  %tmp25116 = getelementptr inbounds float* undef, i64 1
+  %tmp25117 = getelementptr inbounds float* undef, i64 1
+  %tmp25118 = getelementptr inbounds float* undef, i64 1
+  %tmp25119 = getelementptr inbounds float* undef, i64 1
+  %tmp25120 = getelementptr inbounds float* undef, i64 1
+  %tmp25121 = getelementptr inbounds float* undef, i64 1
+  %tmp25122 = getelementptr inbounds float* %tmp25121, i64 1
+  %tmp25123 = getelementptr inbounds float* undef, i64 1
+  %tmp25124 = getelementptr inbounds float* undef, i64 1
+  %tmp25125 = getelementptr inbounds float* undef, i64 1
+  %tmp25126 = getelementptr inbounds float* undef, i64 1
+  %tmp25127 = getelementptr inbounds float* undef, i64 1
+  %tmp25128 = getelementptr inbounds float* undef, i64 1
+  %tmp25129 = getelementptr inbounds float* undef, i64 1
+  %tmp25130 = getelementptr inbounds float* undef, i64 1
+  %tmp25131 = getelementptr inbounds float* undef, i64 1
+  %tmp25132 = getelementptr inbounds float* undef, i64 1
+  %tmp25133 = getelementptr inbounds float* undef, i64 1
+  %tmp25134 = getelementptr inbounds float* undef, i64 1
+  %tmp25135 = getelementptr inbounds float* undef, i64 1
+  %tmp25136 = getelementptr inbounds float* undef, i64 1
+  %tmp25137 = getelementptr inbounds float* undef, i64 1
+  %tmp25138 = getelementptr inbounds float* undef, i64 1
+  %tmp25139 = getelementptr inbounds float* undef, i64 1
+  %tmp25140 = getelementptr inbounds float* undef, i64 1
+  %tmp25141 = getelementptr inbounds float* undef, i64 1
+  %tmp25142 = getelementptr inbounds float* undef, i64 1
+  %tmp25143 = getelementptr inbounds float* undef, i64 1
+  %tmp25144 = getelementptr inbounds float* undef, i64 1
+  %tmp25145 = getelementptr inbounds float* undef, i64 1
+  %tmp25146 = getelementptr inbounds float* %tmp25145, i64 1
+  %tmp25147 = getelementptr inbounds float* undef, i64 1
+  %tmp25148 = getelementptr inbounds float* %tmp25147, i64 1
+  %tmp25149 = getelementptr inbounds float* undef, i64 1
+  %tmp25150 = getelementptr inbounds float* undef, i64 1
+  %tmp25151 = getelementptr inbounds float* undef, i64 1
+  %tmp25152 = getelementptr inbounds float* undef, i64 1
+  %tmp25153 = getelementptr inbounds float* %tmp25152, i64 1
+  %tmp25154 = getelementptr inbounds float* undef, i64 1
+  %tmp25155 = getelementptr inbounds float* undef, i64 1
+  %tmp25156 = getelementptr inbounds float* undef, i64 1
+  %tmp25157 = getelementptr inbounds float* undef, i64 1
+  %tmp25158 = getelementptr inbounds float* undef, i64 1
+  %tmp25159 = getelementptr inbounds float* undef, i64 1
+  %tmp25160 = getelementptr inbounds float* undef, i64 1
+  %tmp25161 = getelementptr inbounds float* undef, i64 1
+  %tmp25162 = getelementptr inbounds float* %tmp25161, i64 1
+  %tmp25163 = getelementptr inbounds float* undef, i64 1
+  %tmp25164 = getelementptr inbounds float* undef, i64 1
+  %tmp25165 = getelementptr inbounds float* undef, i64 1
+  %tmp25166 = getelementptr inbounds float* undef, i64 1
+  %tmp25167 = getelementptr inbounds float* undef, i64 1
+  %tmp25168 = getelementptr inbounds float* undef, i64 1
+  %tmp25169 = getelementptr inbounds float* undef, i64 1
+  %tmp25170 = getelementptr inbounds float* %tmp25169, i64 1
+  %tmp25171 = getelementptr inbounds float* undef, i64 1
+  %tmp25172 = getelementptr inbounds float* undef, i64 1
+  %tmp25173 = getelementptr inbounds float* undef, i64 1
+  %tmp25174 = getelementptr inbounds float* undef, i64 1
+  %tmp25175 = getelementptr inbounds float* %tmp25174, i64 1
+  %tmp25176 = getelementptr inbounds float* undef, i64 1
+  %tmp25177 = getelementptr inbounds float* undef, i64 1
+  %tmp25178 = getelementptr inbounds float* %tmp25177, i64 1
+  %tmp25179 = getelementptr inbounds float* undef, i64 1
+  %tmp25180 = getelementptr inbounds float* undef, i64 1
+  %tmp25181 = getelementptr inbounds float* undef, i64 1
+  %tmp25182 = getelementptr inbounds float* undef, i64 1
+  %tmp25183 = getelementptr inbounds float* undef, i64 1
+  %tmp25184 = getelementptr inbounds float* undef, i64 1
+  %tmp25185 = getelementptr inbounds float* undef, i64 1
+  %tmp25186 = getelementptr inbounds float* undef, i64 1
+  %tmp25187 = getelementptr inbounds float* %tmp25186, i64 1
+  %tmp25188 = getelementptr inbounds float* %tmp25187, i64 1
+  %tmp25189 = getelementptr inbounds float* undef, i64 1
+  %tmp25190 = getelementptr inbounds float* undef, i64 1
+  %tmp25191 = getelementptr inbounds float* undef, i64 1
+  %tmp25192 = getelementptr inbounds float* %tmp25191, i64 1
+  %tmp25193 = getelementptr inbounds float* undef, i64 1
+  %tmp25194 = getelementptr inbounds float* undef, i64 1
+  %tmp25195 = getelementptr inbounds float* undef, i64 1
+  %tmp25196 = getelementptr inbounds float* undef, i64 1
+  %tmp25197 = getelementptr inbounds float* undef, i64 1
+  %tmp25198 = getelementptr inbounds float* undef, i64 1
+  %tmp25199 = getelementptr inbounds float* undef, i64 1
+  %tmp25200 = getelementptr inbounds float* undef, i64 1
+  %tmp25201 = getelementptr inbounds float* %tmp25200, i64 1
+  %tmp25202 = getelementptr inbounds float* undef, i64 1
+  %tmp25203 = getelementptr inbounds float* undef, i64 1
+  %tmp25204 = getelementptr inbounds float* undef, i64 1
+  %tmp25205 = getelementptr inbounds float* undef, i64 1
+  %tmp25206 = getelementptr inbounds float* undef, i64 1
+  %tmp25207 = getelementptr inbounds float* undef, i64 1
+  %tmp25208 = getelementptr inbounds float* undef, i64 1
+  %tmp25209 = getelementptr inbounds float* undef, i64 1
+  %tmp25210 = getelementptr inbounds float* undef, i64 1
+  %tmp25211 = getelementptr inbounds float* undef, i64 1
+  %tmp25212 = getelementptr inbounds float* undef, i64 1
+  %tmp25213 = getelementptr inbounds float* undef, i64 1
+  %tmp25214 = getelementptr inbounds float* undef, i64 1
+  %tmp25215 = getelementptr inbounds float* undef, i64 1
+  %tmp25216 = getelementptr inbounds float* undef, i64 1
+  %tmp25217 = getelementptr inbounds float* undef, i64 1
+  %tmp25218 = getelementptr inbounds float* undef, i64 1
+  %tmp25219 = getelementptr inbounds float* undef, i64 1
+  %tmp25220 = getelementptr inbounds float* undef, i64 1
+  %tmp25221 = getelementptr inbounds float* undef, i64 1
+  %tmp25222 = getelementptr inbounds float* undef, i64 1
+  %tmp25223 = getelementptr inbounds float* undef, i64 1
+  %tmp25224 = getelementptr inbounds float* undef, i64 1
+  %tmp25225 = getelementptr inbounds float* undef, i64 1
+  %tmp25226 = getelementptr inbounds float* undef, i64 1
+  %tmp25227 = getelementptr inbounds float* undef, i64 1
+  %tmp25228 = getelementptr inbounds float* undef, i64 1
+  %tmp25229 = getelementptr inbounds float* undef, i64 1
+  %tmp25230 = getelementptr inbounds float* %tmp25229, i64 1
+  %tmp25231 = getelementptr inbounds float* undef, i64 1
+  %tmp25232 = getelementptr inbounds float* undef, i64 1
+  %tmp25233 = getelementptr inbounds float* undef, i64 1
+  %tmp25234 = getelementptr inbounds float* undef, i64 1
+  %tmp25235 = getelementptr inbounds float* %tmp25234, i64 1
+  %tmp25236 = getelementptr inbounds float* undef, i64 1
+  %tmp25237 = getelementptr inbounds float* %tmp25236, i64 1
+  %tmp25238 = getelementptr inbounds float* undef, i64 1
+  %tmp25239 = getelementptr inbounds float* undef, i64 1
+  %tmp25240 = getelementptr inbounds float* undef, i64 1
+  %tmp25241 = getelementptr inbounds float* undef, i64 1
+  %tmp25242 = getelementptr inbounds float* undef, i64 1
+  %tmp25243 = getelementptr inbounds float* undef, i64 1
+  %tmp25244 = getelementptr inbounds float* undef, i64 1
+  %tmp25245 = getelementptr inbounds float* undef, i64 1
+  %tmp25246 = getelementptr inbounds float* undef, i64 1
+  %tmp25247 = getelementptr inbounds float* undef, i64 1
+  %tmp25248 = getelementptr inbounds float* %tmp25247, i64 1
+  %tmp25249 = getelementptr inbounds float* undef, i64 1
+  %tmp25250 = getelementptr inbounds float* undef, i64 1
+  %tmp25251 = getelementptr inbounds float* undef, i64 1
+  %tmp25252 = getelementptr inbounds float* undef, i64 1
+  %tmp25253 = getelementptr inbounds float* undef, i64 1
+  %tmp25254 = getelementptr inbounds float* undef, i64 1
+  %tmp25255 = getelementptr inbounds float* undef, i64 1
+  %tmp25256 = getelementptr inbounds float* undef, i64 1
+  %tmp25257 = getelementptr inbounds float* undef, i64 1
+  %tmp25258 = getelementptr inbounds float* undef, i64 1
+  %tmp25259 = getelementptr inbounds float* undef, i64 1
+  %tmp25260 = getelementptr inbounds float* undef, i64 1
+  %tmp25261 = getelementptr inbounds float* undef, i64 1
+  %tmp25262 = getelementptr inbounds float* undef, i64 1
+  %tmp25263 = getelementptr inbounds float* undef, i64 1
+  %tmp25264 = getelementptr inbounds float* undef, i64 1
+  %tmp25265 = getelementptr inbounds float* undef, i64 1
+  %tmp25266 = getelementptr inbounds float* undef, i64 1
+  %tmp25267 = getelementptr inbounds float* undef, i64 1
+  %tmp25268 = getelementptr inbounds float* undef, i64 1
+  %tmp25269 = getelementptr inbounds float* undef, i64 1
+  br i1 undef, label %bb25270, label %bb25271
+
+bb25270:                                          ; preds = %bb2
+  br label %bb25362
+
+bb25271:                                          ; preds = %bb2
+  br label %bb25272
+
+bb25272:                                          ; preds = %bb25275, %bb25271
+  br i1 false, label %bb25273, label %bb25278
+
+bb25273:                                          ; preds = %bb25272
+  invoke void @foo()
+          to label %bb25274 unwind label %bb25276
+
+bb25274:                                          ; preds = %bb25273
+  invoke void @bar()
+          to label %bb25275 unwind label %bb25276
+
+bb25275:                                          ; preds = %bb25274
+  br label %bb25272
+
+bb25276:                                          ; preds = %bb25283, %bb25274, %bb25273
+  %tmp25277 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %bb25361
+
+bb25278:                                          ; preds = %bb25272
+  br label %bb25279
+
+bb25279:                                          ; preds = %bb25284, %bb25278
+  br i1 undef, label %bb25280, label %bb25285
+
+bb25280:                                          ; preds = %bb25279
+  br label %bb25281
+
+bb25281:                                          ; preds = %bb25282, %bb25280
+  br i1 undef, label %bb25282, label %bb25283
+
+bb25282:                                          ; preds = %bb25281
+  br label %bb25281
+
+bb25283:                                          ; preds = %bb25281
+  invoke void @bar()
+          to label %bb25284 unwind label %bb25276
+
+bb25284:                                          ; preds = %bb25283
+  br label %bb25279
+
+bb25285:                                          ; preds = %bb25279
+  br label %bb25286
+
+bb25286:                                          ; preds = %bb25303, %bb25285
+  br i1 undef, label %bb25287, label %bb25304
+
+bb25287:                                          ; preds = %bb25286
+  invoke void @bar()
+          to label %bb25288 unwind label %bb25298
+
+bb25288:                                          ; preds = %bb25287
+  br i1 undef, label %bb25289, label %bb25300
+
+bb25289:                                          ; preds = %bb25288
+  br i1 undef, label %bb25290, label %bb25300
+
+bb25290:                                          ; preds = %bb25289
+  invoke void @bar()
+          to label %bb25291 unwind label %bb25298
+
+bb25291:                                          ; preds = %bb25290
+  br i1 undef, label %bb25292, label %bb25295
+
+bb25292:                                          ; preds = %bb25291
+  br i1 undef, label %bb25294, label %bb25293
+
+bb25293:                                          ; preds = %bb25292
+  br label %bb25294
+
+bb25294:                                          ; preds = %bb25293, %bb25292
+  br label %bb25296
+
+bb25295:                                          ; preds = %bb25291
+  invoke void @quuuux()
+          to label %bb25296 unwind label %bb25298
+
+bb25296:                                          ; preds = %bb25295, %bb25294
+  invoke void @baz()
+          to label %bb25297 unwind label %bb25298
+
+bb25297:                                          ; preds = %bb25296
+  br label %bb25300
+
+bb25298:                                          ; preds = %bb25296, %bb25295, %bb25290, %bb25287
+  %tmp25299 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %bb25360
+
+bb25300:                                          ; preds = %bb25297, %bb25289, %bb25288
+  br i1 undef, label %bb25301, label %bb25302
+
+bb25301:                                          ; preds = %bb25300
+  br label %bb25303
+
+bb25302:                                          ; preds = %bb25300
+  br label %bb25303
+
+bb25303:                                          ; preds = %bb25302, %bb25301
+  br label %bb25286
+
+bb25304:                                          ; preds = %bb25286
+  br label %bb25305
+
+bb25305:                                          ; preds = %bb25331, %bb25304
+  br i1 undef, label %bb25306, label %bb25332
+
+bb25306:                                          ; preds = %bb25305
+  invoke void @quuux()
+          to label %bb25307 unwind label %bb25324
+
+bb25307:                                          ; preds = %bb25306
+  invoke void @quux()
+          to label %bb25308 unwind label %bb25324
+
+bb25308:                                          ; preds = %bb25307
+  br i1 undef, label %bb25309, label %bb25330
+
+bb25309:                                          ; preds = %bb25308
+  br i1 undef, label %bb25310, label %bb25330
+
+bb25310:                                          ; preds = %bb25309
+  br i1 undef, label %bb25311, label %bb25317
+
+bb25311:                                          ; preds = %bb25310
+  br label %bb25312
+
+bb25312:                                          ; preds = %bb25316, %bb25315, %bb25311
+  br i1 undef, label %bb25313, label %bb25317
+
+bb25313:                                          ; preds = %bb25312
+  %tmp25314 = invoke zeroext i1 undef(%0* undef, %0* undef)
+          to label %bb25315 unwind label %bb25324
+
+bb25315:                                          ; preds = %bb25313
+  br i1 %tmp25314, label %bb25316, label %bb25312
+
+bb25316:                                          ; preds = %bb25315
+  br label %bb25312
+
+bb25317:                                          ; preds = %bb25312, %bb25310
+  br i1 undef, label %bb25318, label %bb25326
+
+bb25318:                                          ; preds = %bb25317
+  br i1 undef, label %bb25319, label %bb25326
+
+bb25319:                                          ; preds = %bb25318
+  br i1 undef, label %bb25320, label %bb25323
+
+bb25320:                                          ; preds = %bb25319
+  br i1 undef, label %bb25322, label %bb25321
+
+bb25321:                                          ; preds = %bb25320
+  br label %bb25322
+
+bb25322:                                          ; preds = %bb25321, %bb25320
+  br label %bb25326
+
+bb25323:                                          ; preds = %bb25319
+  invoke void @qux()
+          to label %bb25326 unwind label %bb25324
+
+bb25324:                                          ; preds = %bb25357, %bb25344, %bb25343, %bb25342, %bb25337, %bb25334, %bb25333, %bb25323, %bb25313, %bb25307, %bb25306
+  %tmp25325 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %bb25359
+
+bb25326:                                          ; preds = %bb25323, %bb25322, %bb25318, %bb25317
+  br label %bb25327
+
+bb25327:                                          ; preds = %bb25328, %bb25326
+  br i1 undef, label %bb25328, label %bb25329
+
+bb25328:                                          ; preds = %bb25327
+  br label %bb25327
+
+bb25329:                                          ; preds = %bb25327
+  br label %bb25330
+
+bb25330:                                          ; preds = %bb25329, %bb25309, %bb25308
+  br i1 undef, label %bb25332, label %bb25331
+
+bb25331:                                          ; preds = %bb25330
+  br label %bb25305
+
+bb25332:                                          ; preds = %bb25330, %bb25305
+  br i1 undef, label %bb25333, label %bb25357
+
+bb25333:                                          ; preds = %bb25332
+  invoke void (...)* @printf()
+          to label %bb25334 unwind label %bb25324
+
+bb25334:                                          ; preds = %bb25333
+  invoke void (...)* @printf(i32 undef)
+          to label %bb25335 unwind label %bb25324
+
+bb25335:                                          ; preds = %bb25334
+  br label %bb25336
+
+bb25336:                                          ; preds = %bb25338, %bb25335
+  br i1 undef, label %bb25337, label %bb25339
+
+bb25337:                                          ; preds = %bb25336
+  invoke void (...)* @printf(i32 undef, double undef)
+          to label %bb25338 unwind label %bb25324
+
+bb25338:                                          ; preds = %bb25337
+  br label %bb25336
+
+bb25339:                                          ; preds = %bb25336
+  br label %bb25340
+
+bb25340:                                          ; preds = %bb25341, %bb25339
+  br i1 undef, label %bb25341, label %bb25342
+
+bb25341:                                          ; preds = %bb25340
+  br label %bb25340
+
+bb25342:                                          ; preds = %bb25340
+  invoke void (...)* @printf()
+          to label %bb25343 unwind label %bb25324
+
+bb25343:                                          ; preds = %bb25342
+  invoke void (...)* @printf(double undef, double undef)
+          to label %bb25344 unwind label %bb25324
+
+bb25344:                                          ; preds = %bb25343
+  invoke void @mux()
+          to label %bb25345 unwind label %bb25324
+
+bb25345:                                          ; preds = %bb25344
+  br label %bb25346
+
+bb25346:                                          ; preds = %bb25347, %bb25345
+  br i1 undef, label %bb25347, label %bb25348
+
+bb25347:                                          ; preds = %bb25346
+  br label %bb25346
+
+bb25348:                                          ; preds = %bb25346
+  br label %bb25349
+
+bb25349:                                          ; preds = %bb25350, %bb25348
+  br i1 undef, label %bb25350, label %bb25351
+
+bb25350:                                          ; preds = %bb25349
+  br label %bb25349
+
+bb25351:                                          ; preds = %bb25349
+  invoke void (...)* @printf()
+          to label %bb25352 unwind label %bb25355
+
+bb25352:                                          ; preds = %bb25351
+  invoke void (...)* @printf(double undef)
+          to label %bb25353 unwind label %bb25355
+
+bb25353:                                          ; preds = %bb25352
+  invoke void (...)* @printf()
+          to label %bb25354 unwind label %bb25355
+
+bb25354:                                          ; preds = %bb25353
+  br label %bb25358
+
+bb25355:                                          ; preds = %bb25353, %bb25352, %bb25351
+  %tmp25356 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  br label %bb25359
+
+bb25357:                                          ; preds = %bb25332
+  invoke void (...)* @printf()
+          to label %bb25358 unwind label %bb25324
+
+bb25358:                                          ; preds = %bb25357, %bb25354
+  br label %bb25362
+
+bb25359:                                          ; preds = %bb25355, %bb25324
+  br label %bb25360
+
+bb25360:                                          ; preds = %bb25359, %bb25298
+  br label %bb25361
+
+bb25361:                                          ; preds = %bb25360, %bb25276
+  resume { i8*, i32 } undef
+
+bb25362:                                          ; preds = %bb25358, %bb25270, %bb1
+  ret void
+}
+
+declare void @foo()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @bar() uwtable ssp align 2
+
+declare hidden void @baz() uwtable ssp align 2
+
+declare void @printf(...)
+
+declare void @mux() unnamed_addr uwtable ssp align 2
+
+declare hidden void @qux() uwtable ssp align 2
+
+declare void @quux() uwtable ssp
+
+declare void @quuux() uwtable ssp
+
+declare hidden void @quuuux() uwtable ssp align 2
diff --git a/test/CodeGen/X86/lea-recursion.ll b/test/CodeGen/X86/lea-recursion.ll
index 3f32fd2..9480600 100644
--- a/test/CodeGen/X86/lea-recursion.ll
+++ b/test/CodeGen/X86/lea-recursion.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep lea | count 12
+; RUN: llc < %s -march=x86-64 | grep lea | count 13
 
 ; This testcase was written to demonstrate an instruction-selection problem,
 ; however it also happens to expose a limitation in the DAGCombiner's
@@ -44,4 +44,3 @@ entry:
 	store i32 %tmp10.6, i32* getelementptr ([1000 x i32]* @g0, i32 0, i32 7)
 	ret void
 }
-
diff --git a/test/CodeGen/X86/lea.ll b/test/CodeGen/X86/lea.ll
index affd6bf..93cfe46 100644
--- a/test/CodeGen/X86/lea.ll
+++ b/test/CodeGen/X86/lea.ll
@@ -28,8 +28,7 @@ bb.nph:
 bb2:
 	ret i32 %x_offs
 ; CHECK-LABEL: test2:
-; CHECK: movl %e[[A0]], %eax
-; CHECK: addl $-5, %eax
+; CHECK:        leal    -5(%r[[A0:..]]), %eax
 ; CHECK:	andl	$-4, %eax
 ; CHECK:	negl	%eax
 ; CHECK:	leal	-4(%r[[A0]],%rax), %eax
diff --git a/test/CodeGen/X86/leaf-fp-elim.ll b/test/CodeGen/X86/leaf-fp-elim.ll
index 7eebf8d..1bb3c75 100644
--- a/test/CodeGen/X86/leaf-fp-elim.ll
+++ b/test/CodeGen/X86/leaf-fp-elim.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-apple-darwin11.0"
 @msg = internal global i8* null                   ; <i8**> [#uses=1]
 @.str = private constant [2 x i8] c"x\00", align 1 ; <[2 x i8]*> [#uses=1]
 
-define void @test(i8* %p) "no-frame-pointer-elim-non-leaf"="true" nounwind optsize ssp {
+define void @test(i8* %p) "no-frame-pointer-elim-non-leaf" nounwind optsize ssp {
 
 ; No stack frame, please.
 ; CHECK:     _test
diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll
index 7736468..64460bb 100644
--- a/test/CodeGen/X86/legalize-shift-64.ll
+++ b/test/CodeGen/X86/legalize-shift-64.ll
@@ -64,3 +64,31 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
 ; CHECK: shl
 ; CHECK: shldl
 }
+
+; PR16108
+define i32 @test6() {
+  %x = alloca i32, align 4
+  %t = alloca i64, align 8
+  store i32 1, i32* %x, align 4
+  store i64 1, i64* %t, align 8  ;; DEAD
+  %load = load i32* %x, align 4
+  %shl = shl i32 %load, 8
+  %add = add i32 %shl, -224
+  %sh_prom = zext i32 %add to i64
+  %shl1 = shl i64 1, %sh_prom
+  %cmp = icmp ne i64 %shl1, 4294967296
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  ret i32 1
+
+if.end:                                           ; preds = %entry
+  ret i32 0
+
+; CHECK-LABEL: test6:
+; CHECK-NOT: andb $31
+; CHECK: sete
+; CHECK: movzbl
+; CHECK: xorl $1
+; CHECK: orl
+}
diff --git a/test/CodeGen/X86/lit.local.cfg b/test/CodeGen/X86/lit.local.cfg
index 9d285bf..1637fa4 100644
--- a/test/CodeGen/X86/lit.local.cfg
+++ b/test/CodeGen/X86/lit.local.cfg
@@ -1,4 +1,10 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
+# FIXME: For now, override suffixes to exclude any .s tests, because some of the
+# buildbots have a stray misched-copy.s output file lying around that causes
+# failures. See misched-copy.s where we try and clean up that file.
+#
+# It should be possible to remove this override once all the bots have cycled
+# cleanly.
+config.suffixes = ['.ll', '.c', '.cpp', '.test', '.txt']
 
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
diff --git a/test/CodeGen/X86/load-slice.ll b/test/CodeGen/X86/load-slice.ll
new file mode 100644
index 0000000..85fd7f0
--- /dev/null
+++ b/test/CodeGen/X86/load-slice.ll
@@ -0,0 +1,139 @@
+; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
+; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
+;
+; <rdar://problem/14477220>
+
+%class.Complex = type { float, float }
+
+
+; Check that independant slices leads to independant loads then the slices leads to
+; different register file.
+;
+; The layout is:
+; LSB 0 1 2 3 | 4 5 6 7 MSB
+;       Low      High
+; The base address points to 0 and is 8-bytes aligned.
+; Low slice starts at 0 (base) and is 8-bytes aligned.
+; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned.
+;
+; STRESS-LABEL: t1:
+; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
+; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]]
+; Add low slice: out[out_start].real, this is base + 0.
+; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
+; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
+; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
+; Add high slice: out[out_start].imm, this is base + 4.
+; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
+; Swap Imm and Real.
+; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
+; Put the results back into out[out_start].
+; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]])
+;
+; Same for REGULAR, we eliminate register bank copy with each slices.
+; REGULAR-LABEL: t1:
+; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
+; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
+; Add low slice: out[out_start].real, this is base + 0.
+; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
+; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
+; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
+; Add high slice: out[out_start].imm, this is base + 4.
+; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
+; Swap Imm and Real.
+; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
+; Put the results back into out[out_start].
+; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]])
+define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
+entry:
+  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+  %tmp = bitcast %class.Complex* %arrayidx to i64*
+  %tmp1 = load i64* %tmp, align 8
+  %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
+  %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
+  %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
+  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
+  %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
+  %add = add i64 %out_start, 8
+  %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
+  %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
+  %tmp4 = load float* %i.i, align 4
+  %add.i = fadd float %tmp4, %tmp2
+  %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
+  %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
+  %tmp5 = load float* %r.i, align 4
+  %add5.i = fadd float %tmp5, %tmp3
+  %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
+  %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
+  store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+; Check that we do not read outside of the chunk of bits of the original loads.
+;
+; The 64-bits should have been split in one 32-bits and one 16-bits slices.
+; The 16-bits should be zero extended to match the final type.
+;
+; The memory layout is:
+; LSB 0 1 2 3 | 4 5 | 6 7 MSB
+;      Low            High
+; The base address points to 0 and is 8-bytes aligned.
+; Low slice starts at 0 (base) and is 8-bytes aligned.
+; High slice starts at 6 (base + 6-bytes) and is 2-bytes aligned.
+;
+; STRESS-LABEL: t2:
+; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
+; STRESS-NEXT: addl ([[BASE]]), %eax
+; STRESS-NEXT: ret
+;
+; For the REGULAR heuristic, this is not profitable to slice things that are not
+; next to each other in memory. Here we have a hole with bytes #4-5.
+; REGULAR-LABEL: t2:
+; REGULAR: shrq $48
+define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
+  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+  %bitcast = bitcast %class.Complex* %arrayidx to i64*
+  %chunk64 = load i64* %bitcast, align 8
+  %slice32_low = trunc i64 %chunk64 to i32
+  %shift48 = lshr i64 %chunk64, 48
+  %slice32_high = trunc i64 %shift48 to i32
+  %res = add i32 %slice32_high, %slice32_low
+  ret i32 %res
+}
+
+; Check that we do not optimize overlapping slices.
+;
+; The 64-bits should NOT have been split in as slices are overlapping.
+; First slice uses bytes numbered 0 to 3.
+; Second slice uses bytes numbered 6 and 7.
+; Third slice uses bytes numbered 4 to 7.
+;
+; STRESS-LABEL: t3:
+; STRESS: shrq $48
+; STRESS: shrq $32
+;
+; REGULAR-LABEL: t3:
+; REGULAR: shrq $48
+; REGULAR: shrq $32
+define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
+  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+  %bitcast = bitcast %class.Complex* %arrayidx to i64*
+  %chunk64 = load i64* %bitcast, align 8
+  %slice32_low = trunc i64 %chunk64 to i32
+  %shift48 = lshr i64 %chunk64, 48
+  %slice32_high = trunc i64 %shift48 to i32
+  %shift32 = lshr i64 %chunk64, 32
+  %slice32_lowhigh = trunc i64 %shift32 to i32
+  %tmpres = add i32 %slice32_high, %slice32_low
+  %res = add i32 %slice32_lowhigh, %tmpres
+  ret i32 %res
+}
diff --git a/test/CodeGen/X86/long-extend.ll b/test/CodeGen/X86/long-extend.ll
new file mode 100644
index 0000000..5bbd41d
--- /dev/null
+++ b/test/CodeGen/X86/long-extend.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=x86_64-linux -asm-verbose=0| FileCheck %s
+define void @test_long_extend(<16 x i8> %a, <16 x i32>* %p) nounwind {
+; CHECK-LABEL: test_long_extend
+; CHECK: vpunpcklbw	%xmm1, %xmm0, [[REG1:%xmm[0-9]+]]
+; CHECK: vpunpckhwd	%xmm1, [[REG1]], [[REG2:%xmm[0-9]+]]
+; CHECK: vpunpcklwd	%xmm1, [[REG1]], %x[[REG3:mm[0-9]+]]
+; CHECK: vinsertf128	$1, [[REG2]], %y[[REG3]], [[REG_result0:%ymm[0-9]+]]
+; CHECK: vpunpckhbw	%xmm1, %xmm0, [[REG4:%xmm[0-9]+]]
+; CHECK: vpunpckhwd	%xmm1, [[REG4]], [[REG5:%xmm[0-9]+]]
+; CHECK: vpunpcklwd	%xmm1, [[REG4]], %x[[REG6:mm[0-9]+]]
+; CHECK: vinsertf128	$1, [[REG5]], %y[[REG6]], [[REG_result1:%ymm[0-9]+]]
+; CHECK: vmovaps	[[REG_result1]], 32(%rdi)
+; CHECK: vmovaps	[[REG_result0]], (%rdi)
+
+  %tmp = zext <16 x i8> %a to <16 x i32>
+  store <16 x i32> %tmp, <16 x i32>*%p
+  ret void
+}
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index c7a3186..e7d74a9 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK-LABEL: t:
-; CHECK: decq
-; CHECK-NEXT: movl (%r9,%rax,4), %eax
+; CHECK: movl (%r9,%rax,4), %e{{..}}
+; CHECK-NEXT: decq
 ; CHECK-NEXT: jne
 
 ; ATOM-LABEL: t:
-; ATOM: movl (%r9,%r{{.+}},4), %eax
+; ATOM: movl (%r9,%r{{.+}},4), %e{{..}}
 ; ATOM-NEXT: decq
 ; ATOM-NEXT: jne
 
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index c33cac2..4a4d178 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -1,15 +1,13 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 > %t
-; RUN: not grep and %t
-; RUN: not grep movz %t
-; RUN: not grep sar %t
-; RUN: not grep shl %t
-; RUN: grep add %t | count 5
-; RUN: grep inc %t | count 2
-; RUN: grep lea %t | count 3
+; RUN: llc < %s -mcpu=generic -march=x86-64 | FileCheck %s
 
 ; Optimize away zext-inreg and sext-inreg on the loop induction
 ; variable using trip-count information.
 
+; CHECK-LABEL: count_up
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: inc
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @count_up(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@@ -38,6 +36,11 @@ return:
 	ret void
 }
 
+; CHECK-LABEL: count_down
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @count_down(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@@ -66,6 +69,11 @@ return:
 	ret void
 }
 
+; CHECK-LABEL: count_up_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: inc
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @count_up_signed(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@@ -96,6 +104,11 @@ return:
 	ret void
 }
 
+; CHECK-LABEL: count_down_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @count_down_signed(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@@ -126,6 +139,11 @@ return:
 	ret void
 }
 
+; CHECK-LABEL: another_count_up
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @another_count_up(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@@ -154,6 +172,11 @@ return:
 	ret void
 }
 
+; CHECK-LABEL: another_count_down
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: decq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @another_count_down(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@@ -182,6 +205,11 @@ return:
 	ret void
 }
 
+; CHECK-LABEL: another_count_up_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @another_count_up_signed(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
@@ -212,6 +240,11 @@ return:
 	ret void
 }
 
+; CHECK-LABEL: another_count_down_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: decq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
 define void @another_count_down_signed(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
diff --git a/test/CodeGen/X86/maskmovdqu.ll b/test/CodeGen/X86/maskmovdqu.ll
index 7796f0e..0b3334d 100644
--- a/test/CodeGen/X86/maskmovdqu.ll
+++ b/test/CodeGen/X86/maskmovdqu.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=x86    -mattr=+sse2 | grep -i EDI
-; RUN: llc < %s -march=x86-64 -mattr=+sse2 | grep -i RDI
+; RUN: llc < %s -march=x86    -mattr=+sse2,-avx | grep -i EDI
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | grep -i RDI
+; RUN: llc < %s -march=x86    -mattr=+avx | grep -i EDI
+; RUN: llc < %s -march=x86-64 -mattr=+avx | grep -i RDI
 ; rdar://6573467
 
 define void @test(<16 x i8> %a, <16 x i8> %b, i32 %dummy, i8* %c) nounwind {
diff --git a/test/CodeGen/X86/mcinst-avx-lowering.ll b/test/CodeGen/X86/mcinst-avx-lowering.ll
index 41f96e8..db72e08 100644
--- a/test/CodeGen/X86/mcinst-avx-lowering.ll
+++ b/test/CodeGen/X86/mcinst-avx-lowering.ll
@@ -4,7 +4,7 @@ define i64 @t1(double %d_ivar) nounwind uwtable ssp {
 entry:
 ; CHECK: t1
   %0 = bitcast double %d_ivar to i64
-; CHECK: vmovd
+; CHECK: vmovq
 ; CHECK: encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
   ret i64 %0
 }
@@ -13,7 +13,7 @@ define double @t2(i64 %d_ivar) nounwind uwtable ssp {
 entry:
 ; CHECK: t2
   %0 = bitcast i64 %d_ivar to double
-; CHECK: vmovd
+; CHECK: vmovq
 ; CHECK: encoding: [0xc4,0xe1,0xf9,0x6e,0xc7]
   ret double %0
 }
diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll
index c17cc7f..6ae7807 100644
--- a/test/CodeGen/X86/memcpy-2.ll
+++ b/test/CodeGen/X86/memcpy-2.ll
@@ -56,15 +56,15 @@ entry:
 define void @t2(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
 entry:
 ; SSE2-Darwin-LABEL: t2:
-; SSE2-Darwin: movaps (%eax), %xmm0
+; SSE2-Darwin: movaps (%ecx), %xmm0
 ; SSE2-Darwin: movaps %xmm0, (%eax)
 
 ; SSE2-Mingw32-LABEL: t2:
-; SSE2-Mingw32: movaps (%eax), %xmm0
+; SSE2-Mingw32: movaps (%ecx), %xmm0
 ; SSE2-Mingw32: movaps %xmm0, (%eax)
 
 ; SSE1-LABEL: t2:
-; SSE1: movaps (%eax), %xmm0
+; SSE1: movaps (%ecx), %xmm0
 ; SSE1: movaps %xmm0, (%eax)
 
 ; NOSSE-LABEL: t2:
@@ -91,14 +91,14 @@ entry:
 define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
 entry:
 ; SSE2-Darwin-LABEL: t3:
-; SSE2-Darwin: movsd (%eax), %xmm0
-; SSE2-Darwin: movsd 8(%eax), %xmm1
+; SSE2-Darwin: movsd (%ecx), %xmm0
+; SSE2-Darwin: movsd 8(%ecx), %xmm1
 ; SSE2-Darwin: movsd %xmm1, 8(%eax)
 ; SSE2-Darwin: movsd %xmm0, (%eax)
 
 ; SSE2-Mingw32-LABEL: t3:
-; SSE2-Mingw32: movsd (%eax), %xmm0
-; SSE2-Mingw32: movsd 8(%eax), %xmm1
+; SSE2-Mingw32: movsd (%ecx), %xmm0
+; SSE2-Mingw32: movsd 8(%ecx), %xmm1
 ; SSE2-Mingw32: movsd %xmm1, 8(%eax)
 ; SSE2-Mingw32: movsd %xmm0, (%eax)
 
diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll
new file mode 100644
index 0000000..940688c
--- /dev/null
+++ b/test/CodeGen/X86/merge_store.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
+
+define void @merge_store(i32* nocapture %a) {
+; CHECK-LABEL: merge_store:
+; CHECK: movq
+; CHECK: movq
+entry:
+  br label %for.body
+
+  for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  store i32 1, i32* %arrayidx, align 4
+  %0 = or i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %0
+  store i32 1, i32* %arrayidx2, align 4
+  %1 = or i64 %indvars.iv, 2
+  %arrayidx5 = getelementptr inbounds i32* %a, i64 %1
+  store i32 1, i32* %arrayidx5, align 4
+  %2 = or i64 %indvars.iv, 3
+  %arrayidx8 = getelementptr inbounds i32* %a, i64 %2
+  store i32 1, i32* %arrayidx8, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %3 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %3, 1000
+  br i1 %cmp, label %for.body, label %for.end
+
+  for.end:
+  ret void
+}
diff --git a/test/CodeGen/X86/mingw-alloca.ll b/test/CodeGen/X86/mingw-alloca.ll
index ded4b73..72b6940 100644
--- a/test/CodeGen/X86/mingw-alloca.ll
+++ b/test/CodeGen/X86/mingw-alloca.ll
@@ -1,12 +1,14 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -mtriple=i386-pc-mingw32      | FileCheck %s -check-prefix=COFF
+; RUN: llc < %s -mtriple=i386-pc-mingw32-elf  | FileCheck %s -check-prefix=ELF
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "i386-pc-mingw32"
 
 define void @foo1(i32 %N) nounwind {
 entry:
-; CHECK: _foo1:
-; CHECK: calll __alloca
+; COFF: _foo1:
+; COFF: calll __alloca
+; ELF: foo1:
+; ELF: calll _alloca
 	%tmp14 = alloca i32, i32 %N		; <i32*> [#uses=1]
 	call void @bar1( i32* %tmp14 )
 	ret void
@@ -16,11 +18,16 @@ declare void @bar1(i32*)
 
 define void @foo2(i32 inreg  %N) nounwind {
 entry:
-; CHECK: _foo2:
-; CHECK: andl $-16, %esp
-; CHECK: pushl %eax
-; CHECK: calll __alloca
-; CHECK: movl	8028(%esp), %eax
+; COFF: _foo2:
+; COFF: andl $-16, %esp
+; COFF: pushl %eax
+; COFF: calll __alloca
+; COFF: movl	8028(%esp), %eax
+; ELF: foo2:
+; ELF: andl $-16, %esp
+; ELF: pushl %eax
+; ELF: calll _alloca
+; ELF: movl	8028(%esp), %eax
 	%A2 = alloca [2000 x i32], align 16		; <[2000 x i32]*> [#uses=1]
 	%A2.sub = getelementptr [2000 x i32]* %A2, i32 0, i32 0		; <i32*> [#uses=1]
 	call void @bar2( i32* %A2.sub, i32 %N )
diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll
index 5f6c501..1900802 100644
--- a/test/CodeGen/X86/misched-balance.ll
+++ b/test/CodeGen/X86/misched-balance.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-machineinstrs | FileCheck %s
 ;
 ; Verify that misched resource/latency balancy heuristics are sane.
 
@@ -15,7 +15,7 @@ entry:
 ; Since mmult1 IR is already in good order, this effectively ensure
 ; the scheduler maintains source order.
 ;
-; CHECK: %for.body
+; CHECK-LABEL: %for.body
 ; CHECK-NOT: %rsp
 ; CHECK: imull 4
 ; CHECK-NOT: {{imull|rsp}}
@@ -45,7 +45,7 @@ entry:
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
 ; CHECK-NOT: {{imull|rsp}}
-; CHECK: %end
+; CHECK-LABEL: %end
 for.body:
   %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
   %tmp57 = load i32* %tmp56, align 4
@@ -120,7 +120,7 @@ end:
 ; Unlike the above loop, this IR starts out bad and must be
 ; rescheduled.
 ;
-; CHECK: %for.body
+; CHECK-LABEL: %for.body
 ; CHECK-NOT: %rsp
 ; CHECK: imull 4
 ; CHECK-NOT: {{imull|rsp}}
@@ -150,7 +150,7 @@ end:
 ; CHECK-NOT: {{imull|rsp}}
 ; CHECK: addl
 ; CHECK-NOT: {{imull|rsp}}
-; CHECK: %end
+; CHECK-LABEL: %end
 define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
   i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
   i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
@@ -232,8 +232,8 @@ end:
 ; balanced heuristics are interesting here because we have resource,
 ; latency, and register limits all at once. For now, simply check that
 ; we don't use any callee-saves.
-; CHECK: @encpc1
-; CHECK: %entry
+; CHECK-LABEL: @encpc1
+; CHECK-LABEL: %entry
 ; CHECK-NOT: push
 ; CHECK-NOT: pop
 ; CHECK: ret
diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll
index 0450cfb..4485b8a 100644
--- a/test/CodeGen/X86/misched-copy.ll
+++ b/test/CodeGen/X86/misched-copy.ll
@@ -8,11 +8,11 @@
 ; MUL_HiLo PhysReg use copies should be just above the mul.
 ; MUL_HiLo PhysReg def copies should be just below the mul.
 ;
-; CHECK:      *** Final schedule for BB#1 ***
-; CHECK-NEXT: %EAX<def> = COPY
-; CHECK:      MUL32r %vreg{{[0-9]+}}, %EAX<imp-def>, %EDX<imp-def>, %EFLAGS<imp-def,dead>, %EAX<imp-use>;
-; CHECK-NEXT: COPY %E{{[AD]}}X;
-; CHECK-NEXT: COPY %E{{[AD]}}X;
+; CHECK: *** Final schedule for BB#1 ***
+; CHECK:      %EAX<def> = COPY
+; CHECK-NEXT: MUL32r %vreg{{[0-9]+}}, %EAX<imp-def>, %EDX<imp-def>, %EFLAGS<imp-def,dead>, %EAX<imp-use>;
+; CHECK-NEXT: COPY %E{{[AD]}}X
+; CHECK-NEXT: COPY %E{{[AD]}}X
 ; CHECK:      DIVSSrm
 define i64 @mulhoist(i32 %a, i32 %b) #0 {
 entry:
@@ -42,7 +42,7 @@ end:
   ret i64 %add
 }
 
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !0 = metadata !{metadata !"float", metadata !1}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll
index 6b67607..5454b7c 100644
--- a/test/CodeGen/X86/misched-matmul.ll
+++ b/test/CodeGen/X86/misched-matmul.ll
@@ -3,11 +3,14 @@
 ;
 ; Verify that register pressure heuristics are working in MachineScheduler.
 ;
-; When we enable subtree scheduling heuristics on X86, we may need a
-; flag to disable it for this test case.
+; We can further reduce spills in this case with a global register
+; pressure heuristic, like sethi-ullman numbers or biasing toward
+; scheduled subtrees. However, these heuristics are marginally
+; beneficial on x86_64 and exacerbate register pressure in other
+; more complex cases.
 ;
 ; CHECK: @wrap_mul4
-; CHECK: 22 regalloc - Number of spills inserted
+; CHECK: 23 regalloc - Number of spills inserted
 
 define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 {
 entry:
@@ -221,4 +224,4 @@ entry:
   ret void
 }
 
-attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll
index 4dc95c5..23b561f 100644
--- a/test/CodeGen/X86/misched-matrix.ll
+++ b/test/CodeGen/X86/misched-matrix.ll
@@ -15,19 +15,19 @@
 ; been reordered with the stores. This tests the scheduler's cheap
 ; alias analysis ability (that doesn't require any AliasAnalysis pass).
 ;
-; TOPDOWN: %for.body
+; TOPDOWN-LABEL: %for.body
 ; TOPDOWN: movl %{{.*}}, (
 ; TOPDOWN: imull {{[0-9]*}}(
 ; TOPDOWN: movl %{{.*}}, 4(
 ; TOPDOWN: imull {{[0-9]*}}(
 ; TOPDOWN: movl %{{.*}}, 8(
 ; TOPDOWN: movl %{{.*}}, 12(
-; TOPDOWN: %for.end
+; TOPDOWN-LABEL: %for.end
 ;
 ; For -misched=ilpmin, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are interleaved.
 ;
-; ILPMIN: %for.body
+; ILPMIN-LABEL: %for.body
 ; ILPMIN: movl %{{.*}}, (
 ; ILPMIN: imull
 ; ILPMIN: imull
@@ -53,12 +53,12 @@
 ; ILPMIN: imull
 ; ILPMIN: addl
 ; ILPMIN: movl %{{.*}}, 12(
-; ILPMIN: %for.end
+; ILPMIN-LABEL: %for.end
 ;
 ; For -misched=ilpmax, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are clustered.
 ;
-; ILPMAX: %for.body
+; ILPMAX-LABEL: %for.body
 ; ILPMAX: movl %{{.*}}, (
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -84,7 +84,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: movl %{{.*}}, 12(
-; ILPMAX: %for.end
+; ILPMAX-LABEL: %for.end
 
 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
 [4 x i32]* noalias nocapture %m3) nounwind uwtable ssp {
diff --git a/test/CodeGen/X86/mmx-builtins.ll b/test/CodeGen/X86/mmx-builtins.ll
index f5b3f76..aabdd53 100644
--- a/test/CodeGen/X86/mmx-builtins.ll
+++ b/test/CodeGen/X86/mmx-builtins.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+ssse3 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+mmx,+ssse3,-avx | FileCheck %s
 ; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+ssse3,-avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
 
 declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
 
diff --git a/test/CodeGen/X86/mmx-punpckhdq.ll b/test/CodeGen/X86/mmx-punpckhdq.ll
index 206cb33..9e8f5bf 100644
--- a/test/CodeGen/X86/mmx-punpckhdq.ll
+++ b/test/CodeGen/X86/mmx-punpckhdq.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse42 -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse4.2 -mtriple=x86_64-apple-darwin10 | FileCheck %s
 ; There are no MMX operations in bork; promoted to XMM.
 
 define void @bork(<1 x i64>* %x) {
diff --git a/test/CodeGen/X86/movbe.ll b/test/CodeGen/X86/movbe.ll
index aa58c10..3f459be 100644
--- a/test/CodeGen/X86/movbe.ll
+++ b/test/CodeGen/X86/movbe.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=x86_64-linux -mcpu=atom < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux -mcpu=slm < %s | FileCheck %s -check-prefix=SLM
 
 declare i32 @llvm.bswap.i32(i32) nounwind readnone
 declare i64 @llvm.bswap.i64(i64) nounwind readnone
@@ -9,6 +10,8 @@ define void @test1(i32* nocapture %x, i32 %y) nounwind {
   ret void
 ; CHECK-LABEL: test1:
 ; CHECK: movbel	%esi, (%rdi)
+; SLM-LABEL: test1:
+; SLM: movbel	%esi, (%rdi)
 }
 
 define i32 @test2(i32* %x) nounwind {
@@ -17,6 +20,8 @@ define i32 @test2(i32* %x) nounwind {
   ret i32 %bswap
 ; CHECK-LABEL: test2:
 ; CHECK: movbel	(%rdi), %eax
+; SLM-LABEL: test2:
+; SLM: movbel	(%rdi), %eax
 }
 
 define void @test3(i64* %x, i64 %y) nounwind {
@@ -25,6 +30,8 @@ define void @test3(i64* %x, i64 %y) nounwind {
   ret void
 ; CHECK-LABEL: test3:
 ; CHECK: movbeq	%rsi, (%rdi)
+; SLM-LABEL: test3:
+; SLM: movbeq	%rsi, (%rdi)
 }
 
 define i64 @test4(i64* %x) nounwind {
@@ -33,4 +40,6 @@ define i64 @test4(i64* %x) nounwind {
   ret i64 %bswap
 ; CHECK-LABEL: test4:
 ; CHECK: movbeq	(%rdi), %rax
+; SLM-LABEL: test4:
+; SLM: movbeq	(%rdi), %rax
 }
diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index d3930fa..71b0723 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=sse41 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -mattr=sse41 | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -mattr=sse41 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64
 
 define i32 @test1() nounwind readonly {
 entry:
diff --git a/test/CodeGen/X86/neg_fp.ll b/test/CodeGen/X86/neg_fp.ll
index 57164f2..efb02f8 100644
--- a/test/CodeGen/X86/neg_fp.ll
+++ b/test/CodeGen/X86/neg_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 -o %t
+; RUN: llc < %s -march=x86 -mattr=+sse4.1 -o %t
 ; RUN: grep xorps %t | count 1
 
 ; Test that when we don't -enable-unsafe-fp-math, we don't do the optimization
diff --git a/test/CodeGen/X86/newline-and-quote.ll b/test/CodeGen/X86/newline-and-quote.ll
new file mode 100644
index 0000000..9206e9f
--- /dev/null
+++ b/test/CodeGen/X86/newline-and-quote.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s
+@"foo\22bar" = global i32 42
+; CHECK: .globl "foo\"bar"
+
+@"foo\0abar" = global i32 42
+; CHECK: .globl "foo\nbar"
diff --git a/test/CodeGen/X86/no-compact-unwind.ll b/test/CodeGen/X86/no-compact-unwind.ll
index 627f7da..991cd4e 100644
--- a/test/CodeGen/X86/no-compact-unwind.ll
+++ b/test/CodeGen/X86/no-compact-unwind.ll
@@ -1,4 +1,10 @@
-; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -disable-cfi | FileCheck %s
+; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -mcpu corei7 -filetype=obj -o - \
+; RUN:  | llvm-objdump -triple x86_64-apple-macosx10.8.0 -s - \
+; RUN:  | FileCheck -check-prefix=CU %s
+; RUN: llc < %s -mtriple x86_64-apple-darwin11 -mcpu corei7 \
+; RUN:  | llvm-mc -triple x86_64-apple-darwin11 -filetype=obj -o - \
+; RUN:  | llvm-objdump -triple x86_64-apple-darwin11 -s - \
+; RUN:  | FileCheck -check-prefix=FROM-ASM %s
 
 %"struct.dyld::MappedRanges" = type { [400 x %struct.anon], %"struct.dyld::MappedRanges"* }
 %struct.anon = type { %class.ImageLoader*, i64, i64 }
@@ -12,13 +18,15 @@ declare void @OSMemoryBarrier() optsize
 ; This compact unwind encoding indicates that we could not generate correct
 ; compact unwind encodings for this function. This then defaults to using the
 ; DWARF EH frame.
-;
-; CHECK: .section __LD,__compact_unwind,regular,debug
-; CHECK: .quad _func
-; CHECK: .long 67108864                ## Compact Unwind Encoding: 0x4000000
-; CHECK: .quad 0                       ## Personality Function
-; CHECK: .quad 0                       ## LSDA
-;
+
+; CU:      Contents of section __compact_unwind:
+; CU-NEXT: 0048 00000000 00000000 42000000 00000004
+; CU-NEXT: 0058 00000000 00000000 00000000 00000000
+
+; FROM-ASM:      Contents of section __compact_unwind:
+; FROM-ASM-NEXT: 0048 00000000 00000000 42000000 00000004
+; FROM-ASM-NEXT: 0058 00000000 00000000 00000000 00000000
+
 define void @func(%class.ImageLoader* %image) optsize ssp uwtable {
 entry:
   br label %for.cond1.preheader
diff --git a/test/CodeGen/X86/no-elf-compact-unwind.ll b/test/CodeGen/X86/no-elf-compact-unwind.ll
new file mode 100644
index 0000000..8a15817
--- /dev/null
+++ b/test/CodeGen/X86/no-elf-compact-unwind.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -disable-cfi | FileCheck -check-prefix=MACHO %s
+; RUN: llc < %s -mtriple x86_64-unknown-linux -disable-cfi | FileCheck -check-prefix=ELF %s
+
+; Make sure we don't generate a compact unwind for ELF.
+
+; MACHO-LABEL: _Z3barv:
+; MACHO:       __compact_unwind
+
+; ELF-LABEL:   _Z3barv:
+; ELF-NOT:     __compact_unwind
+
+@_ZTIi = external constant i8*
+
+define void @_Z3barv() uwtable {
+entry:
+  invoke void @_Z3foov()
+          to label %try.cont unwind label %lpad
+
+lpad:                                             ; preds = %entry
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %1 = extractvalue { i8*, i32 } %0, 1
+  %2 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %1, %2
+  br i1 %matches, label %catch, label %eh.resume
+
+catch:                                            ; preds = %lpad
+  %3 = extractvalue { i8*, i32 } %0, 0
+  %4 = tail call i8* @__cxa_begin_catch(i8* %3)
+  tail call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch
+  ret void
+
+eh.resume:                                        ; preds = %lpad
+  resume { i8*, i32 } %0
+}
+
+declare void @_Z3foov()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i32 @llvm.eh.typeid.for(i8*)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
diff --git a/test/CodeGen/X86/nocx16.ll b/test/CodeGen/X86/nocx16.ll
new file mode 100644
index 0000000..cceaac4
--- /dev/null
+++ b/test/CodeGen/X86/nocx16.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=-cx16 | FileCheck %s
+define void @test(i128* %a) nounwind {
+entry:
+; CHECK: __sync_val_compare_and_swap_16
+  %0 = cmpxchg i128* %a, i128 1, i128 1 seq_cst
+; CHECK: __sync_lock_test_and_set_16
+  %1 = atomicrmw xchg i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_add_16
+  %2 = atomicrmw add i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_sub_16
+  %3 = atomicrmw sub i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_and_16
+  %4 = atomicrmw and i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_nand_16
+  %5 = atomicrmw nand i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_or_16
+  %6 = atomicrmw or i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_xor_16
+  %7 = atomicrmw xor i128* %a, i128 1 seq_cst
+  ret void
+}
diff --git a/test/CodeGen/X86/object-size.ll b/test/CodeGen/X86/object-size.ll
index 8f1eabd..ec35d29 100644
--- a/test/CodeGen/X86/object-size.ll
+++ b/test/CodeGen/X86/object-size.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-apple-darwin10.0"
 define void @bar() nounwind ssp {
 entry:
   %tmp = load i8** @p                             ; <i8*> [#uses=1]
-  %0 = call i64 @llvm.objectsize.i64(i8* %tmp, i1 0) ; <i64> [#uses=1]
+  %0 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp, i1 0) ; <i64> [#uses=1]
   %cmp = icmp ne i64 %0, -1                       ; <i1> [#uses=1]
 ; X64: movabsq $-1, [[RAX:%r..]]
 ; X64: cmpq    $-1, [[RAX]]
@@ -19,7 +19,7 @@ entry:
 cond.true:                                        ; preds = %entry
   %tmp1 = load i8** @p                            ; <i8*> [#uses=1]
   %tmp2 = load i8** @p                            ; <i8*> [#uses=1]
-  %1 = call i64 @llvm.objectsize.i64(i8* %tmp2, i1 1) ; <i64> [#uses=1]
+  %1 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp2, i1 1) ; <i64> [#uses=1]
   %call = call i8* @__strcpy_chk(i8* %tmp1, i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i64 %1) ssp ; <i8*> [#uses=1]
   br label %cond.end
 
@@ -33,7 +33,7 @@ cond.end:                                         ; preds = %cond.false, %cond.t
   ret void
 }
 
-declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readonly
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) nounwind readonly
 
 declare i8* @__strcpy_chk(i8*, i8*, i64) ssp
 
@@ -47,7 +47,7 @@ entry:
   %tmp = load i8** %__dest.addr                   ; <i8*> [#uses=1]
   %tmp1 = load i8** %__src.addr                   ; <i8*> [#uses=1]
   %tmp2 = load i8** %__dest.addr                  ; <i8*> [#uses=1]
-  %0 = call i64 @llvm.objectsize.i64(i8* %tmp2, i1 1) ; <i64> [#uses=1]
+  %0 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp2, i1 1) ; <i64> [#uses=1]
   %call = call i8* @__strcpy_chk(i8* %tmp, i8* %tmp1, i64 %0) ssp ; <i8*> [#uses=1]
   store i8* %call, i8** %retval
   %1 = load i8** %retval                          ; <i8*> [#uses=1]
diff --git a/test/CodeGen/X86/opt-shuff-tstore.ll b/test/CodeGen/X86/opt-shuff-tstore.ll
index 3e72084..fc43e81 100644
--- a/test/CodeGen/X86/opt-shuff-tstore.ll
+++ b/test/CodeGen/X86/opt-shuff-tstore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s  -mattr=+sse2,+sse41 | FileCheck %s
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s  -mattr=+sse2,+sse4.1 | FileCheck %s
 
 ; CHECK: func_4_8
 ; A single memory write
diff --git a/test/CodeGen/X86/palignr.ll b/test/CodeGen/X86/palignr.ll
index c76cbbe..ec6564d 100644
--- a/test/CodeGen/X86/palignr.ll
+++ b/test/CodeGen/X86/palignr.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mcpu=core2 -mattr=+ssse3 | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck --check-prefix=YONAH %s
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck --check-prefix=CHECK-YONAH %s
 
 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
 ; CHECK-LABEL: test1:
diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll
new file mode 100644
index 0000000..d534639
--- /dev/null
+++ b/test/CodeGen/X86/patchpoint.ll
@@ -0,0 +1,100 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+
+; Trivial patchpoint codegen
+;
+define i64 @trivial_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: trivial_patchpoint_codegen:
+; CHECK:      movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; CHECK-NEXT: nop
+; CHECK:      movq %rax, %[[REG:r.+]]
+; CHECK:      callq *%r11
+; CHECK-NEXT: nop
+; CHECK:      movq %[[REG]], %rax
+; CHECK:      ret
+  %resolveCall2 = inttoptr i64 -559038736 to i8*
+  %result = tail call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+  %resolveCall3 = inttoptr i64 -559038737 to i8*
+  tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 3, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+  ret i64 %result
+}
+
+; Caller frame metadata with stackmaps. This should not be optimized
+; as a leaf function.
+;
+; CHECK-LABEL: caller_meta_leaf
+; CHECK: subq $32, %rsp
+; CHECK: Ltmp
+; CHECK: addq $32, %rsp
+; CHECK: ret
+define void @caller_meta_leaf() {
+entry:
+  %metadata = alloca i64, i32 3, align 8
+  store i64 11, i64* %metadata
+  store i64 12, i64* %metadata
+  store i64 13, i64* %metadata
+  call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 4, i32 0, i64* %metadata)
+  ret void
+}
+
+; Test the webkit_jscc calling convention.
+; Two arguments will be pushed on the stack.
+; Return value in $rax.
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK:      Ltmp
+; CHECK:      movq %r{{.+}}, 8(%rsp)
+; CHECK:      movq %r{{.+}}, (%rsp)
+; CHECK:      Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; CHECK:      movq %rax, 8(%rsp)
+; CHECK:      callq
+  %resolveCall2 = inttoptr i64 -559038736 to i8*
+  %result = tail call webkit_jscc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %resolveCall2, i32 2, i64 %p1, i64 %p2)
+  %resolveCall3 = inttoptr i64 -559038737 to i8*
+  tail call webkit_jscc void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 6, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+  ret void
+}
+
+; Test patchpoints reusing the same TargetConstant.
+; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
+; There is no way to verify this, since it depends on memory allocation.
+; But I think it's useful to include as a working example.
+define i64 @testLowerConstant(i64 %arg, i64 %tmp2, i64 %tmp10, i64* %tmp33, i64 %tmp79) {
+entry:
+  %tmp80 = add i64 %tmp79, -16
+  %tmp81 = inttoptr i64 %tmp80 to i64*
+  %tmp82 = load i64* %tmp81, align 8
+  tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 5, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+  tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 15, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+  %tmp83 = load i64* %tmp33, align 8
+  %tmp84 = add i64 %tmp83, -24
+  %tmp85 = inttoptr i64 %tmp84 to i64*
+  %tmp86 = load i64* %tmp85, align 8
+  tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 17, i32 5, i64 %arg, i64 %tmp10, i64 %tmp86)
+  tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 18, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+  ret i64 10
+}
+
+; Test small patchpoints that don't emit calls.
+define void @small_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: small_patchpoint_codegen:
+; CHECK:      Ltmp
+; CHECK:      nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: popq
+; CHECK-NEXT: ret
+  %result = tail call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i32, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/peep-vector-extract-concat.ll b/test/CodeGen/X86/peep-vector-extract-concat.ll
index 606a9be..f73ebb9 100644
--- a/test/CodeGen/X86/peep-vector-extract-concat.ll
+++ b/test/CodeGen/X86/peep-vector-extract-concat.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2,-sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2,-sse4.1 | FileCheck %s
 ; CHECK: pshufd $3, %xmm0, %xmm0
 
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2,-sse41 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2,-sse4.1 | FileCheck %s -check-prefix=WIN64
 ; %a is passed indirectly on Win64.
 ; WIN64: movss   12(%rcx), %xmm0
 
diff --git a/test/CodeGen/X86/pmovext.ll b/test/CodeGen/X86/pmovext.ll
index b85b4c3..f0e468f 100644
--- a/test/CodeGen/X86/pmovext.ll
+++ b/test/CodeGen/X86/pmovext.ll
@@ -18,5 +18,28 @@ define void @intrin_pmov(i16* noalias %dest, i8* noalias %src) nounwind uwtable
 }
 
 declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
-
 declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+; rdar://15245794
+
+define <4 x i32> @foo0(double %v.coerce) nounwind ssp {
+; CHECK-LABEL: foo0
+; CHECK: pmovzxwd %xmm0, %xmm0
+; CHECK-NEXT: ret
+  %tmp = bitcast double %v.coerce to <4 x i16>
+  %tmp1 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp2 = tail call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp1) nounwind
+  ret <4 x i32> %tmp2
+}
+
+define <8 x i16> @foo1(double %v.coerce) nounwind ssp {
+; CHECK-LABEL: foo1
+; CHECK: pmovzxbw %xmm0, %xmm0
+; CHECK-NEXT: ret
+  %tmp = bitcast double %v.coerce to <8 x i8>
+  %tmp1 = shufflevector <8 x i8> %tmp, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %tmp2 = tail call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %tmp1)
+  ret <8 x i16> %tmp2
+}
+
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/pmovsx-inreg.ll b/test/CodeGen/X86/pmovsx-inreg.ll
index d30d7d0..07979f6 100644
--- a/test/CodeGen/X86/pmovsx-inreg.ll
+++ b/test/CodeGen/X86/pmovsx-inreg.ll
@@ -86,8 +86,7 @@ define void @test6(<16 x i8>* %in, <16 x i16>* %out) nounwind {
   ret void
 
 ; AVX2-LABEL: test6:
-; FIXME: v16i8 -> v16i16 is scalarized.
-; AVX2-NOT: pmovsx
+; AVX2: vpmovsxbw
 }
 
 define void @test7(<2 x i16>* %in, <2 x i64>* %out) nounwind {
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index da4af81..7bf8a61 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -mcpu=nehalem -stack-alignment=16 > %t
+; RUN: llc < %s -march=x86 -mattr=sse4.1 -mcpu=nehalem -stack-alignment=16 > %t
 ; RUN: grep pmul %t | count 12
-; RUN: grep mov %t | count 11
+; RUN: grep mov %t | count 14
 
 define <4 x i32> @a(<4 x i32> %i) nounwind  {
         %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
diff --git a/test/CodeGen/X86/pmulld.ll b/test/CodeGen/X86/pmulld.ll
index 4103eab..3db0f73 100644
--- a/test/CodeGen/X86/pmulld.ll
+++ b/test/CodeGen/X86/pmulld.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse41 -asm-verbose=0 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse41 -asm-verbose=0 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse4.1 -asm-verbose=0 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse4.1 -asm-verbose=0 | FileCheck %s -check-prefix=WIN64
 
 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
 ; CHECK-LABEL: test1:
diff --git a/test/CodeGen/X86/pr10523.ll b/test/CodeGen/X86/pr10523.ll
index 7191d69..0ec22a0 100644
--- a/test/CodeGen/X86/pr10523.ll
+++ b/test/CodeGen/X86/pr10523.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse41
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
 
 ; No check in a crash test
 
diff --git a/test/CodeGen/X86/pr10524.ll b/test/CodeGen/X86/pr10524.ll
index ed3e7c5..12bdba9 100644
--- a/test/CodeGen/X86/pr10524.ll
+++ b/test/CodeGen/X86/pr10524.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse41
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
 
 ; No check in a crash test
 
diff --git a/test/CodeGen/X86/pr10525.ll b/test/CodeGen/X86/pr10525.ll
index 342c1d6..30ce297 100644
--- a/test/CodeGen/X86/pr10525.ll
+++ b/test/CodeGen/X86/pr10525.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse41
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
 
 ; No check in a crash test
 
diff --git a/test/CodeGen/X86/pr10526.ll b/test/CodeGen/X86/pr10526.ll
index 6963fe5..9fa83ce 100644
--- a/test/CodeGen/X86/pr10526.ll
+++ b/test/CodeGen/X86/pr10526.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse41
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
 
 ; No check in a crash test
 
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
index 087b8d7..81aaf91 100644
--- a/test/CodeGen/X86/pr12312.ll
+++ b/test/CodeGen/X86/pr12312.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx < %s | FileCheck %s --check-prefix SSE41
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX
 
 define i32 @veccond128(<4 x i32> %input) {
diff --git a/test/CodeGen/X86/pr14088.ll b/test/CodeGen/X86/pr14088.ll
index 505e3b5..16f20d0 100644
--- a/test/CodeGen/X86/pr14088.ll
+++ b/test/CodeGen/X86/pr14088.ll
@@ -19,7 +19,14 @@ return:
   ret i32 %retval.0
 }
 
-; We were miscompiling this and using %ax instead of %cx in the movw.
-; CHECK: movswl	%cx, %ecx
-; CHECK: movw	%cx, (%rsi)
-; CHECK: movslq	%ecx, %rcx
+; We were miscompiling this and using %ax instead of %cx in the movw
+; in the following sequence:
+;	movswl	%cx, %ecx
+;	movw	%cx, (%rsi)
+;	movslq	%ecx, %rcx
+;
+; We can't produce the above sequence without special SD-level
+; heuristics. Now we produce this:
+; CHECK: movw	%ax, (%rsi)
+; CHECK: cwtl
+; CHECK: cltq
diff --git a/test/CodeGen/X86/pr14090.ll b/test/CodeGen/X86/pr14090.ll
index d76b912..2f7c720 100644
--- a/test/CodeGen/X86/pr14090.ll
+++ b/test/CodeGen/X86/pr14090.ll
@@ -48,11 +48,11 @@ entry:
   %fifteen = bitcast i64* %retval.i.i to i32**
   %sixteen = bitcast i64* %retval.i.i to i8*
   call void @llvm.lifetime.start(i64 8, i8* %sixteen)
-  store i32* %.ph.i80, i32** %fifteen, align 8, !tbaa !0
+  store i32* %.ph.i80, i32** %fifteen, align 8
   %sunkaddr = ptrtoint i64* %retval.i.i to i32
   %sunkaddr86 = add i32 %sunkaddr, 4
   %sunkaddr87 = inttoptr i32 %sunkaddr86 to i32*
-  store i32 %fourteen, i32* %sunkaddr87, align 4, !tbaa !3
+  store i32 %fourteen, i32* %sunkaddr87, align 4
   %seventeen = load i64* %retval.i.i, align 8
   call void @llvm.lifetime.end(i64 8, i8* %sixteen)
   %eighteen = lshr i64 %seventeen, 32
@@ -68,9 +68,3 @@ entry:
 declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
 
 declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
-!4 = metadata !{metadata !"vtable pointer", metadata !2}
diff --git a/test/CodeGen/X86/pr1505b.ll b/test/CodeGen/X86/pr1505b.ll
index 9b0ef83..c348fec 100644
--- a/test/CodeGen/X86/pr1505b.ll
+++ b/test/CodeGen/X86/pr1505b.ll
@@ -57,11 +57,10 @@ entry:
 	%tmp22 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZNSolsEd( %"struct.std::basic_ostream<char,std::char_traits<char> >"* %tmp16, double %tmp1920 )		; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=1]
 	%tmp30 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_( %"struct.std::basic_ostream<char,std::char_traits<char> >"* %tmp22 )		; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=0]
 ; reload:
-; CHECK: fld
-; CHECK: fstps
 ; CHECK: ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
 	%tmp34 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc( %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZSt4cout, i8* getelementptr ([13 x i8]* @.str1, i32 0, i32 0) )		; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=1]
 	%tmp3940 = fpext float %tmp1314 to double		; <double> [#uses=1]
+; CHECK: fld
 ; CHECK: fstpl
 ; CHECK: ZNSolsEd
 	%tmp42 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZNSolsEd( %"struct.std::basic_ostream<char,std::char_traits<char> >"* %tmp34, double %tmp3940 )		; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=1]
diff --git a/test/CodeGen/X86/pr16031.ll b/test/CodeGen/X86/pr16031.ll
index ab0b5ef..ecf6218 100644
--- a/test/CodeGen/X86/pr16031.ll
+++ b/test/CodeGen/X86/pr16031.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=corei7-avx -enable-misched=false | FileCheck %s
 
 ; CHECK-LABEL: main:
 ; CHECK: pushl %esi
diff --git a/test/CodeGen/X86/pr16807.ll b/test/CodeGen/X86/pr16807.ll
new file mode 100644
index 0000000..6d55d99
--- /dev/null
+++ b/test/CodeGen/X86/pr16807.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core-avx-i | FileCheck %s
+
+define <16 x i16> @f_fu(<16 x i16> %bf) {
+allocas:
+  %avg.i.i = sdiv <16 x i16> %bf, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+  ret <16 x i16> %avg.i.i
+}
+
+; CHECK: f_fu
+; CHECK: psraw
+; CHECK: psrlw
+; CHECK: paddw
+; CHECK: psraw
+; CHECK: psraw
+; CHECK: psrlw
+; CHECK: paddw
+; CHECK: psraw
+; CHECK: ret
diff --git a/test/CodeGen/X86/pr17546.ll b/test/CodeGen/X86/pr17546.ll
new file mode 100644
index 0000000..174fa5c
--- /dev/null
+++ b/test/CodeGen/X86/pr17546.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core-avx2 | FileCheck %s
+
+define i32 @f_f___un_3C_unf_3E_un_3C_unf_3E_(<8 x i32> %__mask, i64 %BBBB) {
+  %QQQ = trunc i64 %BBBB to i32
+  %1 = extractelement <8 x i32> %__mask, i32 %QQQ
+  ret i32 %1
+}
+
+; CHECK: f_f___un_3C_unf_3E_un_3C_unf_3E_
+; CHECK: ret
diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
new file mode 100644
index 0000000..98f951f
--- /dev/null
+++ b/test/CodeGen/X86/pr17631.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
+
+%struct_type = type { [64 x <8 x float>], <8 x float> }
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
+
+; Function Attrs: nounwind
+define i32 @equal(<8 x i32> %A) {
+allocas:
+  %first_alloc  = alloca [64 x <8 x i32>]
+  %second_alloc = alloca %struct_type
+
+  %A1 = bitcast <8 x i32> %A to <8 x float>
+  %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
+  ret i32 %A2
+}
+
+; CHECK: equal
+; CHECK-NOT: vzeroupper
+; CHECK: _chkstk
+; CHECK: ret
+
+define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
+  %i = fptoui double %x to i64
+  store i64 %i, i64* %p
+  %ret = fadd <8 x float> %y, %y
+  ret <8 x float> %ret
+}
+
+; CHECK: foo
+; CHECK-NOT: vzeroupper
+; CHECK: _ftol2
+; CHECK: ret
diff --git a/test/CodeGen/X86/pr17764.ll b/test/CodeGen/X86/pr17764.ll
new file mode 100644
index 0000000..7a3fd6d
--- /dev/null
+++ b/test/CodeGen/X86/pr17764.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s
+
+define <16 x i16> @foo(<16 x i1> %mask, <16 x i16> %x, <16 x i16> %y) {
+  %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y
+  ret <16 x i16> %ret
+}
+
+; CHECK: foo
+; CHECK: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; CHECK: ret
diff --git a/test/CodeGen/X86/pr18014.ll b/test/CodeGen/X86/pr18014.ll
new file mode 100644
index 0000000..e3860b8
--- /dev/null
+++ b/test/CodeGen/X86/pr18014.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=penryn | FileCheck %s
+
+; Ensure PSRAD is generated as the condition is consumed by both PADD and
+; BLENDVPS. PAND requires all bits setting properly.
+
+define <4 x i32> @foo(<4 x i32>* %p, <4 x i1> %cond, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+  %sext_cond = sext <4 x i1> %cond to <4 x i32>
+  %t1 = add <4 x i32> %v1, %sext_cond
+  %t2 = select <4 x i1> %cond, <4 x i32> %v1, <4 x i32> %v2
+  store <4 x i32> %t2, <4 x i32>* %p
+  ret <4 x i32> %t1
+; CHECK: foo
+; CHECK: pslld
+; CHECK: psrad
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr18023.ll b/test/CodeGen/X86/pr18023.ll
new file mode 100644
index 0000000..4c6f8cf
--- /dev/null
+++ b/test/CodeGen/X86/pr18023.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple x86_64-apple-macosx10.9.0 | FileCheck %s
+; PR18023
+
+; CHECK: movabsq $4294967296, %rcx
+; CHECK: movq  %rcx, (%rax)
+; CHECK: movl  $1, 4(%rax)
+; CHECK: movl  $0, 4(%rax)
+; CHECK: movq  $1, 4(%rax)
+
+@c = common global i32 0, align 4
+@a = common global [3 x i32] zeroinitializer, align 4
+@b = common global i32 0, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+define void @func() {
+  store i32 1, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 1), align 4
+  store i32 0, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 0), align 4
+  %1 = load volatile i32* @b, align 4
+  store i32 1, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 1), align 4
+  store i32 0, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 1), align 4
+  %2 = load volatile i32* @b, align 4
+  store i32 1, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 1), align 4
+  store i32 0, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 2), align 4
+  %3 = load volatile i32* @b, align 4
+  store i32 3, i32* @c, align 4
+  %4 = load i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 1), align 4
+  %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %4)
+  ret void
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/X86/pr18054.ll b/test/CodeGen/X86/pr18054.ll
new file mode 100644
index 0000000..b7af516
--- /dev/null
+++ b/test/CodeGen/X86/pr18054.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=penryn | FileCheck %s
+
+define void @foo(<16 x i32>* %p, <16 x i1> %x) {
+  %ret = sext <16 x i1> %x to <16 x i32>
+  store <16 x i32> %ret, <16 x i32>* %p
+  ret void
+; CHECK: foo
+; CHECK-NOT: pmovsxbd
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr18162.ll b/test/CodeGen/X86/pr18162.ll
new file mode 100644
index 0000000..523e47d
--- /dev/null
+++ b/test/CodeGen/X86/pr18162.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s
+
+; Make sure we are not crashing on this one.
+
+target triple = "x86_64-unknown-linux-gnu"
+
+%"Iterator" = type { i32* }
+
+declare { i64, <2 x float> } @Call() 
+declare { i64, <2 x float> }* @CallPtr() 
+
+define { i64, <2 x float> } @Foo(%"Iterator"* %this) {
+entry:
+  %retval = alloca i32
+  %this.addr = alloca %"Iterator"*
+  %this1 = load %"Iterator"** %this.addr
+  %bundle_ = getelementptr inbounds %"Iterator"* %this1, i32 0, i32 0
+  %0 = load i32** %bundle_
+  %1 = call { i64, <2 x float> } @Call()
+  %2 = call { i64, <2 x float> }* @CallPtr()
+  %3 = getelementptr { i64, <2 x float> }* %2, i32 0, i32 1
+  %4 = extractvalue { i64, <2 x float> } %1, 1
+  store <2 x float> %4, <2 x float>* %3
+  %5 = load { i64, <2 x float> }* %2
+  ret { i64, <2 x float> } %5
+}
+
diff --git a/test/CodeGen/X86/pre-ra-sched.ll b/test/CodeGen/X86/pre-ra-sched.ll
index b792ffa..70135d4 100644
--- a/test/CodeGen/X86/pre-ra-sched.ll
+++ b/test/CodeGen/X86/pre-ra-sched.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -debug-only=pre-RA-sched \
-; RUN:     2>&1 | FileCheck %s
+; RUN-disabled: llc < %s -mtriple=x86_64-apple-macosx -pre-RA-sched=ilp -debug-only=pre-RA-sched \
+; RUN-disabled:     2>&1 | FileCheck %s
+; RUN: true
 ; REQUIRES: asserts
 ;
 ; rdar:13279013: pre-RA-sched should not check all interferences and
diff --git a/test/CodeGen/X86/prefetch.ll b/test/CodeGen/X86/prefetch.ll
index efb5191..d6571ac 100644
--- a/test/CodeGen/X86/prefetch.ll
+++ b/test/CodeGen/X86/prefetch.ll
@@ -1,6 +1,9 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse | FileCheck %s
 ; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
 ; RUN: llc < %s -march=x86 -mattr=+sse -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHW
+; RUN: llc < %s -march=x86 -mcpu=slm | FileCheck %s -check-prefix=SLM
+; RUN: llc < %s -march=x86 -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHW
+; RUN: llc < %s -march=x86 -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=NOPRFCHW
 
 ; rdar://10538297
 
@@ -11,6 +14,8 @@ entry:
 ; CHECK: prefetcht0
 ; CHECK: prefetchnta
 ; PRFCHW: prefetchw
+; NOPRFCHW-NOT: prefetchw
+; SLM: prefetchw
 	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 1, i32 1 )
 	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 2, i32 1 )
 	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
diff --git a/test/CodeGen/X86/prefixdata.ll b/test/CodeGen/X86/prefixdata.ll
new file mode 100644
index 0000000..2ec1892
--- /dev/null
+++ b/test/CodeGen/X86/prefixdata.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+@i = linkonce_odr global i32 1
+
+; CHECK: f:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .long	1
+define void @f() prefix i32 1 {
+  ret void
+}
+
+; CHECK: g:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .quad	i
+define void @g() prefix i32* @i {
+  ret void
+}
diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll
index 1b16a2d..48182d0 100644
--- a/test/CodeGen/X86/rdrand.ll
+++ b/test/CodeGen/X86/rdrand.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrand | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s
 declare {i16, i32} @llvm.x86.rdrand.16()
 declare {i32, i32} @llvm.x86.rdrand.32()
 declare {i64, i32} @llvm.x86.rdrand.64()
@@ -11,10 +11,10 @@ define i32 @_rdrand16_step(i16* %random_val) {
   ret i32 %isvalid
 ; CHECK-LABEL: _rdrand16_step:
 ; CHECK: rdrandw	%ax
-; CHECK: movw	%ax, (%r[[A0:di|cx]])
 ; CHECK: movzwl	%ax, %ecx
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%ecx, %eax
+; CHECK: movw	%cx, (%r[[A0:di|cx]])
 ; CHECK: ret
 }
 
@@ -26,9 +26,9 @@ define i32 @_rdrand32_step(i32* %random_val) {
   ret i32 %isvalid
 ; CHECK-LABEL: _rdrand32_step:
 ; CHECK: rdrandl	%e[[T0:[a-z]+]]
-; CHECK: movl	%e[[T0]], (%r[[A0]])
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%e[[T0]], %eax
+; CHECK: movl	%e[[T0]], (%r[[A0]])
 ; CHECK: ret
 }
 
@@ -40,9 +40,9 @@ define i32 @_rdrand64_step(i64* %random_val) {
   ret i32 %isvalid
 ; CHECK-LABEL: _rdrand64_step:
 ; CHECK: rdrandq	%r[[T1:[a-z]+]]
-; CHECK: movq	%r[[T1]], (%r[[A0]])
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%e[[T1]], %eax
+; CHECK: movq	%r[[T1]], (%r[[A0]])
 ; CHECK: ret
 }
 
diff --git a/test/CodeGen/X86/rdseed.ll b/test/CodeGen/X86/rdseed.ll
index edc5069..c219b4a 100644
--- a/test/CodeGen/X86/rdseed.ll
+++ b/test/CodeGen/X86/rdseed.ll
@@ -12,10 +12,10 @@ define i32 @_rdseed16_step(i16* %random_val) {
   ret i32 %isvalid
 ; CHECK-LABEL: _rdseed16_step:
 ; CHECK: rdseedw	%ax
-; CHECK: movw	%ax, (%r[[A0:di|cx]])
 ; CHECK: movzwl	%ax, %ecx
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%ecx, %eax
+; CHECK: movw	%cx, (%r[[A0:di|cx]])
 ; CHECK: ret
 }
 
@@ -27,9 +27,9 @@ define i32 @_rdseed32_step(i32* %random_val) {
   ret i32 %isvalid
 ; CHECK-LABEL: _rdseed32_step:
 ; CHECK: rdseedl	%e[[T0:[a-z]+]]
-; CHECK: movl	%e[[T0]], (%r[[A0]])
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%e[[T0]], %eax
+; CHECK: movl	%e[[T0]], (%r[[A0]])
 ; CHECK: ret
 }
 
@@ -41,8 +41,8 @@ define i32 @_rdseed64_step(i64* %random_val) {
   ret i32 %isvalid
 ; CHECK-LABEL: _rdseed64_step:
 ; CHECK: rdseedq	%r[[T1:[a-z]+]]
-; CHECK: movq	%r[[T1]], (%r[[A0]])
 ; CHECK: movl	$1, %eax
 ; CHECK: cmovael	%e[[T1]], %eax
+; CHECK: movq	%r[[T1]], (%r[[A0]])
 ; CHECK: ret
 }
diff --git a/test/CodeGen/X86/rem-2.ll b/test/CodeGen/X86/rem-2.ll
deleted file mode 100644
index 1b2af4b..0000000
--- a/test/CodeGen/X86/rem-2.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc < %s -march=x86 | not grep cltd
-
-define i32 @test(i32 %X) nounwind readnone {
-entry:
-	%0 = srem i32 41, %X
-	ret i32 %0
-}
diff --git a/test/CodeGen/X86/rem.ll b/test/CodeGen/X86/rem.ll
index 394070e..733b794 100644
--- a/test/CodeGen/X86/rem.ll
+++ b/test/CodeGen/X86/rem.ll
@@ -1,22 +1,37 @@
-; RUN: llc < %s -march=x86 | not grep div
+; RUN: llc < %s -march=x86 | FileCheck %s
 
+; CHECK-LABEL: test1:
+; CHECK-NOT: div
 define i32 @test1(i32 %X) {
         %tmp1 = srem i32 %X, 255                ; <i32> [#uses=1]
         ret i32 %tmp1
 }
 
+; CHECK-LABEL: test2:
+; CHECK-NOT: div
 define i32 @test2(i32 %X) {
         %tmp1 = srem i32 %X, 256                ; <i32> [#uses=1]
         ret i32 %tmp1
 }
 
+; CHECK-LABEL: test3:
+; CHECK-NOT: div
 define i32 @test3(i32 %X) {
         %tmp1 = urem i32 %X, 255                ; <i32> [#uses=1]
         ret i32 %tmp1
 }
 
+; CHECK-LABEL: test4:
+; CHECK-NOT: div
 define i32 @test4(i32 %X) {
         %tmp1 = urem i32 %X, 256                ; <i32> [#uses=1]
         ret i32 %tmp1
 }
 
+; CHECK-LABEL: test5:
+; CHECK-NOT: cltd
+define i32 @test5(i32 %X) nounwind readnone {
+entry:
+	%0 = srem i32 41, %X
+	ret i32 %0
+}
diff --git a/test/CodeGen/X86/rounding-ops.ll b/test/CodeGen/X86/rounding-ops.ll
index ace31cf..69f4bfb 100644
--- a/test/CodeGen/X86/rounding-ops.ll
+++ b/test/CodeGen/X86/rounding-ops.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse41 | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck -check-prefix=CHECK-SSE %s
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck -check-prefix=CHECK-AVX %s
 
 define float @test1(float %x) nounwind  {
diff --git a/test/CodeGen/X86/scalar_widen_div.ll b/test/CodeGen/X86/scalar_widen_div.ll
index e99ea93..5807d5b 100644
--- a/test/CodeGen/X86/scalar_widen_div.ll
+++ b/test/CodeGen/X86/scalar_widen_div.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42 |  FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.2 |  FileCheck %s
 
 ; Verify when widening a divide/remainder operation, we only generate a
 ; divide/rem per element since divide/remainder can trap.
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index c2aa617..e170762 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -31,7 +31,7 @@ false:
 ; X32-NEXT: ret
 
 ; X32:      movl %esp, %eax
-; X32-NEXT: subl %ecx, %eax
+; X32:      subl %ecx, %eax
 ; X32-NEXT: cmpl %eax, %gs:48
 
 ; X32:      movl %eax, %esp
@@ -52,7 +52,7 @@ false:
 ; X64-NEXT: ret
 
 ; X64:      movq %rsp, %[[RDI:rdi|rax]]
-; X64-NEXT: subq %{{.*}}, %[[RDI]]
+; X64:      subq %{{.*}}, %[[RDI]]
 ; X64-NEXT: cmpq %[[RDI]], %fs:112
 
 ; X64:      movq %[[RDI]], %rsp
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 5fe2b70..cdd258d 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -34,12 +34,12 @@ bb90:		; preds = %bb84, %bb72
 bb91:		; preds = %bb84
 	ret i32 0
 ; CHECK-LABEL: test2:
-; CHECK: movnew
-; CHECK: movswl
+; CHECK: cmovnew
+; CHECK: cwtl
 
 ; ATOM-LABEL: test2:
-; ATOM: movnew
-; ATOM: movswl
+; ATOM: cmovnew
+; ATOM: cwtl
 }
 
 declare i1 @return_false()
@@ -256,8 +256,8 @@ entry:
   %call = tail call noalias i8* @_Znam(i64 %D) nounwind noredzone
   ret i8* %call
 ; CHECK-LABEL: test12:
-; CHECK: movq $-1, %[[R:r..]]
 ; CHECK: mulq
+; CHECK: movq $-1, %[[R:r..]]
 ; CHECK: cmovnoq	%rax, %[[R]]
 ; CHECK: jmp	__Znam
 
diff --git a/test/CodeGen/X86/setcc-narrowing.ll b/test/CodeGen/X86/setcc-narrowing.ll
new file mode 100644
index 0000000..25cb2c8
--- /dev/null
+++ b/test/CodeGen/X86/setcc-narrowing.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s
+; PR17338
+
+@t1.global = internal global i64 -1, align 8
+
+define i32 @t1() nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: cmpl	$0, _t1.global
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: ret
+  %0 = load i64* @t1.global, align 8
+  %and = and i64 4294967295, %0
+  %cmp = icmp sgt i64 %and, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
diff --git a/test/CodeGen/X86/setcc-sentinals.ll b/test/CodeGen/X86/setcc-sentinals.ll
new file mode 100644
index 0000000..d36e678
--- /dev/null
+++ b/test/CodeGen/X86/setcc-sentinals.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mcpu=generic -march=x86-64 -asm-verbose=false | FileCheck %s
+
+define zeroext i1 @test0(i64 %x) nounwind {
+; CHECK-LABEL: test0:
+; CHECK-NEXT: incq %[[X:rdi|rcx]]
+; CHECK-NEXT: cmpq $1, %[[X]]
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: ret
+  %cmp1 = icmp ne i64 %x, -1
+  %not.cmp = icmp ne i64 %x, 0
+  %.cmp1 = and i1 %cmp1, %not.cmp
+  ret i1 %.cmp1
+}
diff --git a/test/CodeGen/X86/sha.ll b/test/CodeGen/X86/sha.ll
new file mode 100644
index 0000000..bf81e99
--- /dev/null
+++ b/test/CodeGen/X86/sha.ll
@@ -0,0 +1,139 @@
+; RUN: llc < %s -mattr=+sha -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: not llc < %s -mtriple=x86_64-unknown-unknown
+
+declare <4 x i32> @llvm.x86.sha1rnds4(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <4 x i32> @test_sha1rnds4rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha1rnds4(<4 x i32> %a, <4 x i32> %b, i8 3)
+  ret <4 x i32> %0
+  ; CHECK: test_sha1rnds4rr
+  ; CHECK: sha1rnds4 $3, %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha1rnds4rm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+  %0 = load <4 x i32>* %b
+  %1 = tail call <4 x i32> @llvm.x86.sha1rnds4(<4 x i32> %a, <4 x i32> %0, i8 3)
+  ret <4 x i32> %1
+  ; CHECK: test_sha1rnds4rm
+  ; CHECK: sha1rnds4 $3, (%rdi), %xmm0
+}
+
+declare <4 x i32> @llvm.x86.sha1nexte(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha1nexterr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha1nexte(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+  ; CHECK: test_sha1nexterr
+  ; CHECK: sha1nexte %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha1nexterm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+  %0 = load <4 x i32>* %b
+  %1 = tail call <4 x i32> @llvm.x86.sha1nexte(<4 x i32> %a, <4 x i32> %0)
+  ret <4 x i32> %1
+  ; CHECK: test_sha1nexterm
+  ; CHECK: sha1nexte (%rdi), %xmm0
+}
+
+declare <4 x i32> @llvm.x86.sha1msg1(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha1msg1rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha1msg1(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+  ; CHECK: test_sha1msg1rr
+  ; CHECK: sha1msg1 %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha1msg1rm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+  %0 = load <4 x i32>* %b
+  %1 = tail call <4 x i32> @llvm.x86.sha1msg1(<4 x i32> %a, <4 x i32> %0)
+  ret <4 x i32> %1
+  ; CHECK: test_sha1msg1rm
+  ; CHECK: sha1msg1 (%rdi), %xmm0
+}
+
+declare <4 x i32> @llvm.x86.sha1msg2(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha1msg2rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha1msg2(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+  ; CHECK: test_sha1msg2rr
+  ; CHECK: sha1msg2 %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha1msg2rm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+  %0 = load <4 x i32>* %b
+  %1 = tail call <4 x i32> @llvm.x86.sha1msg2(<4 x i32> %a, <4 x i32> %0)
+  ret <4 x i32> %1
+  ; CHECK: test_sha1msg2rm
+  ; CHECK: sha1msg2 (%rdi), %xmm0
+}
+
+declare <4 x i32> @llvm.x86.sha256rnds2(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha256rnds2rr(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind uwtable {
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+  ret <4 x i32> %0
+  ; CHECK: test_sha256rnds2rr
+  ; CHECK: movaps %xmm0, [[XMM_TMP1:%xmm[1-9][0-9]?]]
+  ; CHECK: movaps %xmm2, %xmm0
+  ; CHECK: sha256rnds2 %xmm1, [[XMM_TMP1]]
+}
+
+define <4 x i32> @test_sha256rnds2rm(<4 x i32> %a, <4 x i32>* %b, <4 x i32> %c) nounwind uwtable {
+entry:
+  %0 = load <4 x i32>* %b
+  %1 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a, <4 x i32> %0, <4 x i32> %c)
+  ret <4 x i32> %1
+  ; CHECK: test_sha256rnds2rm
+  ; CHECK: movaps %xmm0, [[XMM_TMP2:%xmm[1-9][0-9]?]]
+  ; CHECK: movaps %xmm1, %xmm0
+  ; CHECK: sha256rnds2 (%rdi), [[XMM_TMP2]]
+}
+
+declare <4 x i32> @llvm.x86.sha256msg1(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha256msg1rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha256msg1(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+  ; CHECK: test_sha256msg1rr
+  ; CHECK: sha256msg1 %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha256msg1rm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+  %0 = load <4 x i32>* %b
+  %1 = tail call <4 x i32> @llvm.x86.sha256msg1(<4 x i32> %a, <4 x i32> %0)
+  ret <4 x i32> %1
+  ; CHECK: test_sha256msg1rm
+  ; CHECK: sha256msg1 (%rdi), %xmm0
+}
+
+declare <4 x i32> @llvm.x86.sha256msg2(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha256msg2rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+  %0 = tail call <4 x i32> @llvm.x86.sha256msg2(<4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %0
+  ; CHECK: test_sha256msg2rr
+  ; CHECK: sha256msg2 %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha256msg2rm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+  %0 = load <4 x i32>* %b
+  %1 = tail call <4 x i32> @llvm.x86.sha256msg2(<4 x i32> %a, <4 x i32> %0)
+  ret <4 x i32> %1
+  ; CHECK: test_sha256msg2rm
+  ; CHECK: sha256msg2 (%rdi), %xmm0
+}
+\ No newline at end of file
diff --git a/test/CodeGen/X86/shift-bmi2.ll b/test/CodeGen/X86/shift-bmi2.ll
index 0116789..7615754 100644
--- a/test/CodeGen/X86/shift-bmi2.ll
+++ b/test/CodeGen/X86/shift-bmi2.ll
@@ -30,10 +30,11 @@ entry:
   %x = load i32* %p
   %shl = shl i32 %x, %shamt
 ; BMI2: shl32p
-; BMI2: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; Source order scheduling prevents folding, rdar:14208996.
+; BMI2: shlxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI2: ret
 ; BMI264: shl32p
-; BMI264: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shlxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
   ret i32 %shl
 }
@@ -74,7 +75,7 @@ entry:
   %x = load i64* %p
   %shl = shl i64 %x, %shamt
 ; BMI264: shl64p
-; BMI264: shlxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shlxq %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
   ret i64 %shl
 }
@@ -106,10 +107,11 @@ entry:
   %x = load i32* %p
   %shl = lshr i32 %x, %shamt
 ; BMI2: lshr32p
-; BMI2: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; Source order scheduling prevents folding, rdar:14208996.
+; BMI2: shrxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI2: ret
 ; BMI264: lshr32p
-; BMI264: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shrxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
   ret i32 %shl
 }
@@ -128,7 +130,7 @@ entry:
   %x = load i64* %p
   %shl = lshr i64 %x, %shamt
 ; BMI264: lshr64p
-; BMI264: shrxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shrxq %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
   ret i64 %shl
 }
@@ -150,10 +152,11 @@ entry:
   %x = load i32* %p
   %shl = ashr i32 %x, %shamt
 ; BMI2: ashr32p
-; BMI2: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; Source order scheduling prevents folding, rdar:14208996.
+; BMI2: sarxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI2: ret
 ; BMI264: ashr32p
-; BMI264: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: sarxl %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
   ret i32 %shl
 }
@@ -172,7 +175,7 @@ entry:
   %x = load i64* %p
   %shl = ashr i64 %x, %shamt
 ; BMI264: ashr64p
-; BMI264: sarxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: sarxq %{{.+}}, %{{.+}}, %{{.+}}
 ; BMI264: ret
   ret i64 %shl
 }
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index 7b774f6..589e9ec 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -106,10 +106,10 @@ declare i32 @bar2(i32, i32, i32)
 define signext i16 @t8() nounwind ssp {
 entry:
 ; 32-LABEL: t8:
-; 32: calll {{_?}}bar3
+; 32: jmp {{_?}}bar3
 
 ; 64-LABEL: t8:
-; 64: callq {{_?}}bar3
+; 64: jmp {{_?}}bar3
   %0 = tail call signext i16 @bar3() nounwind      ; <i16> [#uses=1]
   ret i16 %0
 }
@@ -122,7 +122,7 @@ entry:
 ; 32: calll *
 
 ; 64-LABEL: t9:
-; 64: callq *
+; 64: jmpq *
   %0 = bitcast i32 (i32)* %x to i16 (i32)*
   %1 = tail call signext i16 %0(i32 0) nounwind
   ret i16 %1
diff --git a/test/CodeGen/X86/sink-hoist.ll b/test/CodeGen/X86/sink-hoist.ll
index 0741635..64f5311 100644
--- a/test/CodeGen/X86/sink-hoist.ll
+++ b/test/CodeGen/X86/sink-hoist.ll
@@ -26,11 +26,10 @@ define double @foo(double %x, double %y, i1 %c) nounwind {
 
 ; CHECK-LABEL: split:
 ; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne
-; CHECK-NEXT: movaps
-; CHECK-NEXT: ret
+; CHECK-NEXT: je
 ; CHECK:      divsd
-; CHECK-NEXT: ret
+; CHECK:      movaps
+; CHECK:      ret
 define double @split(double %x, double %y, i1 %c) nounwind {
   %a = fdiv double %x, 3.2
   %z = select i1 %c, double %a, double %y
@@ -65,7 +64,7 @@ return:
 ; Sink instructions with dead EFLAGS defs.
 
 ; FIXME: Unfail the zzz test if we can correctly mark pregs with the kill flag.
-; 
+;
 ; See <rdar://problem/8030636>. This test isn't valid after we made machine
 ; sinking more conservative about sinking instructions that define a preg into a
 ; block when we don't know if the preg is killed within the current block.
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 9b5179e..fc79e31 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -55,6 +55,6 @@ entry:
 ; Function Attrs: nounwind readnone
 declare x86_fp80 @__sqrtl_finite(x86_fp80) #1
 
-attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll
new file mode 100644
index 0000000..65d44bf
--- /dev/null
+++ b/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -0,0 +1,308 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse | FileCheck %s
+
+define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: addss
+  %res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: cmpordps
+  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: cmpordss
+  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: comiss
+  ; CHECK: sete
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: comiss
+  ; CHECK: setae
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: comiss
+  ; CHECK: seta
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: comiss
+  ; CHECK: setbe
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: comiss
+  ; CHECK: sbb
+  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: comiss
+  ; CHECK: setne
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_cvtsi2ss(<4 x float> %a0) {
+  ; CHECK: movl
+  ; CHECK: cvtsi2ss
+  %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
+
+
+define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) {
+  ; CHECK: cvtss2si
+  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) {
+  ; CHECK: cvttss2si
+  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: divss
+  %res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_ldmxcsr(i8* %a0) {
+  ; CHECK: movl
+  ; CHECK: ldmxcsr
+  call void @llvm.x86.sse.ldmxcsr(i8* %a0)
+  ret void
+}
+declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind
+
+
+
+define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: maxps
+  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: maxss
+  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: minps
+  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: minss
+  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) {
+  ; CHECK: movmskps
+  %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+
+
+define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: mulss
+  %res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
+  ; CHECK: rcpps
+  %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) {
+  ; CHECK: rcpss
+  %res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
+  ; CHECK: rsqrtps
+  %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) {
+  ; CHECK: rsqrtss
+  %res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
+  ; CHECK: sqrtps
+  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
+  ; CHECK: sqrtss
+  %res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_stmxcsr(i8* %a0) {
+  ; CHECK: movl
+  ; CHECK: stmxcsr
+  call void @llvm.x86.sse.stmxcsr(i8* %a0)
+  ret void
+}
+declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
+
+
+define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
+  ; CHECK: movl
+  ; CHECK: movups
+  call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
+  ret void
+}
+declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
+
+
+define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: subss
+  %res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: ucomiss
+  ; CHECK: sete
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: ucomiss
+  ; CHECK: setae
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: ucomiss
+  ; CHECK: seta
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: ucomiss
+  ; CHECK: setbe
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: ucomiss
+  ; CHECK: sbbl
+  %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: ucomiss
+  ; CHECK: setne
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll
index 30a0fbe..1ac9832 100644
--- a/test/CodeGen/X86/sse2-blend.ll
+++ b/test/CodeGen/X86/sse2-blend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
 
 ; CHECK: vsel_float
 ; CHECK: pandn
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
new file mode 100644
index 0000000..ff6c10b
--- /dev/null
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -0,0 +1,712 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s
+
+define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: addsd
+  %res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: cmpordpd
+  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: cmpordsd
+  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: comisd
+  ; CHECK: sete
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: comisd
+  ; CHECK: setae
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: comisd
+  ; CHECK: seta
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: comisd
+  ; CHECK: setbe
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: comisd
+  ; CHECK: sbbl    %eax, %eax
+  ; CHECK: andl    $1, %eax
+  %res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: comisd
+  ; CHECK: setne
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
+  ; CHECK: cvtdq2pd
+  %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse2_cvtdq2ps(<4 x i32> %a0) {
+  ; CHECK: cvtdq2ps
+  %res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_cvtpd2dq(<2 x double> %a0) {
+  ; CHECK: cvtpd2dq
+  %res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse2_cvtpd2ps(<2 x double> %a0) {
+  ; CHECK: cvtpd2ps
+  %res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_cvtps2dq(<4 x float> %a0) {
+  ; CHECK: cvtps2dq
+  %res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
+  ; CHECK: cvtps2pd
+  %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse2_cvtsd2si(<2 x double> %a0) {
+  ; CHECK: cvtsd2si
+  %res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) {
+  ; CHECK: cvtsd2ss 
+  ; CHECK-NOT: cvtsd2ss %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}} 
+  %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0) {
+  ; CHECK: movl
+  ; CHECK: cvtsi2sd
+  %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) {
+  ; CHECK: cvtss2sd
+  %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
+  ; CHECK: cvttpd2dq
+  %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
+  ; CHECK: cvttps2dq
+  %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
+  ; CHECK: cvttsd2si
+  %res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: divsd
+  %res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+
+define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: maxpd
+  %res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: maxsd
+  %res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: minpd
+  %res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: minsd
+  %res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_movmsk_pd(<2 x double> %a0) {
+  ; CHECK: movmskpd
+  %res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
+
+
+
+
+define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: test_x86_sse2_mul_sd
+  ; CHECK: mulsd
+  %res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: packssdw
+  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: packsswb
+  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: packuswb
+  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: paddsb
+  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: paddsw
+  %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: paddusb
+  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: paddusw
+  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pavgb
+  %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pavgw
+  %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pmaddwd
+  %res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pmaxs_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pmaxsw
+  %res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_pmaxu_b(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pmaxub
+  %res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pmins_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pminsw
+  %res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_pminu_b(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pminub
+  %res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define i32 @test_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
+  ; CHECK: pmovmskb
+  %res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pmulhw
+  %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pmulhuw
+  %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: pmuludq
+  %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: psadbw
+  %res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: pslld
+  %res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+  ; CHECK: pslldq
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+  ; CHECK: pslldq
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: psllq
+  %res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: psllw
+  %res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_pslli_d(<4 x i32> %a0) {
+  ; CHECK: pslld
+  %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_pslli_q(<2 x i64> %a0) {
+  ; CHECK: psllq
+  %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pslli_w(<8 x i16> %a0) {
+  ; CHECK: psllw
+  %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: psrad
+  %res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: psraw
+  %res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_psrai_d(<4 x i32> %a0) {
+  ; CHECK: psrad
+  %res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psrai_w(<8 x i16> %a0) {
+  ; CHECK: psraw
+  %res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: psrld
+  %res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+  ; CHECK: psrldq
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+  ; CHECK: psrldq
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: psrlq
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: psrlw
+  %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) {
+  ; CHECK: psrld
+  %res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrli_q(<2 x i64> %a0) {
+  ; CHECK: psrlq
+  %res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psrli_w(<8 x i16> %a0) {
+  ; CHECK: psrlw
+  %res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: psubsb
+  %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: psubsw
+  %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: psubusb
+  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: psubusw
+  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
+  ; CHECK: sqrtpd
+  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
+  ; CHECK: sqrtsd
+  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+
+define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
+  ; CHECK: test_x86_sse2_storel_dq
+  ; CHECK: movl
+  ; CHECK: movq
+  call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
+  ret void
+}
+declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
+
+
+define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
+  ; CHECK: test_x86_sse2_storeu_dq
+  ; CHECK: movl
+  ; CHECK: movdqu
+  ; add operation forces the execution domain.
+  %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
+  ret void
+}
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+
+define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
+  ; CHECK: test_x86_sse2_storeu_pd
+  ; CHECK: movl
+  ; CHECK: movupd
+  ; fadd operation forces the execution domain.
+  %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
+  call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
+  ret void
+}
+declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
+
+
+define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: test_x86_sse2_sub_sd
+  ; CHECK: subsd
+  %res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: ucomisd
+  ; CHECK: sete
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: ucomisd
+  ; CHECK: setae
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: ucomisd
+  ; CHECK: seta
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: ucomisd
+  ; CHECK: setbe
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: ucomisd
+  ; CHECK: sbbl
+  %res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: ucomisd
+  ; CHECK: setne
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/sse2-vector-shifts.ll b/test/CodeGen/X86/sse2-vector-shifts.ll
index e2d6125..462def9 100644
--- a/test/CodeGen/X86/sse2-vector-shifts.ll
+++ b/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -121,7 +121,7 @@ entry:
 }
 
 ; CHECK-LABEL: test_sraw_3:
-; CHECK: psraw   $16, %xmm0
+; CHECK: psraw   $15, %xmm0
 ; CHECK-NEXT: ret
 
 define <4 x i32> @test_srad_1(<4 x i32> %InVec) {
@@ -151,7 +151,7 @@ entry:
 }
 
 ; CHECK-LABEL: test_srad_3:
-; CHECK: psrad   $32, %xmm0
+; CHECK: psrad   $31, %xmm0
 ; CHECK-NEXT: ret
 
 ; SSE Logical Shift Right
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index 217139a..9147c22 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -7,7 +7,7 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
 	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
 	store <2 x double> %tmp9, <2 x double>* %r, align 16
 	ret void
-        
+
 ; CHECK-LABEL: test1:
 ; CHECK: 	movl	8(%esp), %eax
 ; CHECK-NEXT: 	movapd	(%eax), %xmm0
@@ -23,12 +23,12 @@ define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
 	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
 	store <2 x double> %tmp9, <2 x double>* %r, align 16
 	ret void
-        
+
 ; CHECK-LABEL: test2:
-; CHECK: 	movl	8(%esp), %eax
-; CHECK-NEXT: 	movapd	(%eax), %xmm0
+; CHECK: 	movl	4(%esp), %eax
+; CHECK: 	movl	8(%esp), %ecx
+; CHECK-NEXT: 	movapd	(%ecx), %xmm0
 ; CHECK-NEXT: 	movhpd	12(%esp), %xmm0
-; CHECK-NEXT: 	movl	4(%esp), %eax
 ; CHECK-NEXT: 	movapd	%xmm0, (%eax)
 ; CHECK-NEXT: 	ret
 }
@@ -48,7 +48,7 @@ define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind
 	store <4 x float> %tmp13, <4 x float>* %res
 	ret void
 ; CHECK: @test3
-; CHECK: 	unpcklps	
+; CHECK: 	unpcklps
 }
 
 define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
@@ -85,9 +85,9 @@ define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
         %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
         store <4 x float> %tmp2, <4 x float>* %res
         ret void
-        
+
 ; CHECK-LABEL: test6:
-; CHECK: 	movaps	(%eax), %xmm0
+; CHECK: 	movaps	(%ecx), %xmm0
 ; CHECK:	movaps	%xmm0, (%eax)
 }
 
@@ -96,7 +96,7 @@ define void @test7() nounwind {
         shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
         store <4 x float> %2, <4 x float>* null
         ret void
-        
+
 ; CHECK-LABEL: test7:
 ; CHECK:	xorps	%xmm0, %xmm0
 ; CHECK:	movaps	%xmm0, 0
@@ -166,7 +166,7 @@ define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x fl
         store <4 x float> %tmp11, <4 x float>* %res
         ret void
 ; CHECK: test13
-; CHECK: shufps	$69, (%eax), %xmm0
+; CHECK: shufps	$69, (%ecx), %xmm0
 ; CHECK: pshufd	$-40, %xmm0, %xmm0
 }
 
@@ -178,8 +178,8 @@ define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
         %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
         ret <4 x float> %tmp27
 ; CHECK-LABEL: test14:
-; CHECK: 	subps	[[X1:%xmm[0-9]+]], [[X2:%xmm[0-9]+]]
-; CHECK: 	addps	[[X1]], [[X0:%xmm[0-9]+]]
+; CHECK: 	addps	[[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]]
+; CHECK: 	subps	[[X1]], [[X2:%xmm[0-9]+]]
 ; CHECK: 	movlhps	[[X2]], [[X0]]
 }
 
@@ -221,4 +221,3 @@ entry:
  %double2float.i = fptrunc <4 x double> %0 to <4 x float>
  ret <4 x float> %double2float.i
 }
-
diff --git a/test/CodeGen/X86/sse3-intrinsics-x86.ll b/test/CodeGen/X86/sse3-intrinsics-x86.ll
new file mode 100644
index 0000000..dbd14b8
--- /dev/null
+++ b/test/CodeGen/X86/sse3-intrinsics-x86.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse3 | FileCheck %s
+
+define <2 x double> @test_x86_sse3_addsub_pd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: addsubpd
+  %res = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse3_addsub_ps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: addsubps
+  %res = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse3_hadd_pd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: haddpd
+  %res = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse3_hadd_ps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: haddps
+  %res = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse3_hsub_pd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: hsubpd
+  %res = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse3_hsub_ps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: hsubps
+  %res = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse3_ldu_dq(i8* %a0) {
+  ; CHECK: movl
+  ; CHECK: lddqu
+  %res = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index bd92d22..a32f5de 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
 
 ;CHECK-LABEL: vsel_float:
 ;CHECK: blendvps
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll
new file mode 100644
index 0000000..37eff43
--- /dev/null
+++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -0,0 +1,326 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s
+
+define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: blendpd
+  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: blendps
+  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+  ; CHECK: blendvpd
+  %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+  ; CHECK: blendvps
+  %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: dppd
+  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: dpps
+  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: insertps
+  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+
+define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: mpsadbw
+  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: packusdw
+  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+  ; CHECK: pblendvb
+  %res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pblendw
+  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
+  ; CHECK: phminposuw
+  %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse41_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pmaxsb
+  %res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: pmaxsd
+  %res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: pmaxud
+  %res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pmaxuw
+  %res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse41_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pminsb
+  %res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: pminsd
+  %res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pminud(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: pminud
+  %res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pminuw
+  %res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
+  ; CHECK: pmovsxbd
+  %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
+  ; CHECK: pmovsxbq
+  %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
+  ; CHECK: pmovsxbw
+  %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
+  ; CHECK: pmovsxdq
+  %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
+  ; CHECK: pmovsxwd
+  %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
+  ; CHECK: pmovsxwq
+  %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
+  ; CHECK: pmovzxbd
+  %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
+  ; CHECK: pmovzxbq
+  %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
+  ; CHECK: pmovzxbw
+  %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
+  ; CHECK: pmovzxdq
+  %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
+  ; CHECK: pmovzxwd
+  %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
+  ; CHECK: pmovzxwq
+  %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: pmuldq
+  %res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: ptest 
+  ; CHECK: sbbl
+  %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: ptest 
+  ; CHECK: seta
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK: ptest 
+  ; CHECK: sete
+  ; CHECK: movzbl
+  %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) {
+  ; CHECK: roundpd
+  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) {
+  ; CHECK: roundps
+  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) {
+  ; CHECK: roundsd
+  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_round_ss(<4 x float> %a0, <4 x float> %a1) {
+  ; CHECK: roundss
+  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 87b64e5..c15e24c 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse41 -mcpu=penryn | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse41 -mcpu=penryn | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
 
 @g16 = external global i16
 
diff --git a/test/CodeGen/X86/sse42-intrinsics-x86.ll b/test/CodeGen/X86/sse42-intrinsics-x86.ll
new file mode 100644
index 0000000..5ca8009
--- /dev/null
+++ b/test/CodeGen/X86/sse42-intrinsics-x86.ll
@@ -0,0 +1,182 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.2 | FileCheck %s
+
+define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
+  ; CHECK: movl $7
+  ; CHECK: movl $7
+  ; CHECK: pcmpestri $7
+  ; CHECK: movl
+  %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
+  ; CHECK: movl $7
+  ; CHECK: movl $7
+  ; CHECK: pcmpestri $7, (
+  ; CHECK: movl
+  %1 = load <16 x i8>* %a0
+  %2 = load <16 x i8>* %a2
+  %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+
+
+define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
+  ; CHECK: movl
+  ; CHECK: movl
+  ; CHECK: pcmpestri
+  ; CHECK: seta
+  %res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
+  ; CHECK: movl
+  ; CHECK: movl
+  ; CHECK: pcmpestri
+  ; CHECK: sbbl
+  %res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
+  ; CHECK: movl
+  ; CHECK: movl
+  ; CHECK: pcmpestri
+  ; CHECK: seto
+  %res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
+  ; CHECK: movl
+  ; CHECK: movl
+  ; CHECK: pcmpestri
+  ; CHECK: sets
+  %res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
+  ; CHECK: movl
+  ; CHECK: movl
+  ; CHECK: pcmpestri
+  ; CHECK: sete
+  %res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
+  ; CHECK: movl
+  ; CHECK: movl
+  ; CHECK: pcmpestrm
+  ; CHECK-NOT: vmov
+  %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
+  ; CHECK: movl $7
+  ; CHECK: movl $7
+  ; CHECK: pcmpestrm $7,
+  ; CHECK-NOT: vmov
+  %1 = load <16 x i8>* %a2
+  %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+
+
+define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pcmpistri $7
+  ; CHECK: movl
+  %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
+  ; CHECK: pcmpistri $7, (
+  ; CHECK: movl
+  %1 = load <16 x i8>* %a0
+  %2 = load <16 x i8>* %a1
+  %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+
+
+define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pcmpistri
+  ; CHECK: seta
+  %res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pcmpistri
+  ; CHECK: sbbl
+  %res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pcmpistri
+  ; CHECK: seto
+  %res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pcmpistri
+  ; CHECK: sets
+  %res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pcmpistri
+  ; CHECK: sete
+  %res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pcmpistrm $7
+  ; CHECK-NOT: vmov
+  %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
+  ; CHECK: pcmpistrm $7, (
+  ; CHECK-NOT: vmov
+  %1 = load <16 x i8>* %a1
+  %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/sse42.ll b/test/CodeGen/X86/sse42.ll
index c787523..db51d99 100644
--- a/test/CodeGen/X86/sse42.ll
+++ b/test/CodeGen/X86/sse42.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X64
 
 declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
 declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
diff --git a/test/CodeGen/X86/sse42_64.ll b/test/CodeGen/X86/sse42_64.ll
index 8b3a69b..b39e76c 100644
--- a/test/CodeGen/X86/sse42_64.ll
+++ b/test/CodeGen/X86/sse42_64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X64
 
 declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
 declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
diff --git a/test/CodeGen/X86/ssse3-intrinsics-x86.ll b/test/CodeGen/X86/ssse3-intrinsics-x86.ll
new file mode 100644
index 0000000..728cbc9
--- /dev/null
+++ b/test/CodeGen/X86/ssse3-intrinsics-x86.ll
@@ -0,0 +1,120 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+ssse3 | FileCheck %s
+
+define <16 x i8> @test_x86_ssse3_pabs_b_128(<16 x i8> %a0) {
+  ; CHECK: pabsb
+  %res = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
+
+
+define <4 x i32> @test_x86_ssse3_pabs_d_128(<4 x i32> %a0) {
+  ; CHECK: pabsd
+  %res = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_pabs_w_128(<8 x i16> %a0) {
+  ; CHECK: pabsw
+  %res = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_ssse3_phadd_d_128(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: phaddd
+  %res = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_phadd_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: phaddsw
+  %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_phadd_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: phaddw
+  %res = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_ssse3_phsub_d_128(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: phsubd
+  %res = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_phsub_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: phsubsw
+  %res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_phsub_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: phsubw
+  %res = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pmaddubsw
+  %res = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_pmul_hr_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: pmulhrsw
+  %res = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_ssse3_pshuf_b_128(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: pshufb
+  %res = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <16 x i8> @test_x86_ssse3_psign_b_128(<16 x i8> %a0, <16 x i8> %a1) {
+  ; CHECK: psignb
+  %res = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <4 x i32> @test_x86_ssse3_psign_d_128(<4 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK: psignd
+  %res = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_psign_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+  ; CHECK: psignw
+  %res = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
new file mode 100644
index 0000000..bd27ac3
--- /dev/null
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -0,0 +1,97 @@
+; RUN: llc -mtriple=x86_64-apple-darwin < %s -o -
+
+; PR16954
+;
+; Make sure that when we splice off the end of a machine basic block, we include
+; DBG_VALUE MI in the terminator sequence.
+
+@a = external global { i64, [56 x i8] }, align 32
+
+; Function Attrs: nounwind sspreq
+define i32 @_Z18read_response_sizev() #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23), !dbg !39
+  %0 = load i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0), align 8, !dbg !40
+  tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64), !dbg !71
+  %1 = trunc i64 %0 to i32
+  ret i32 %1
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata)
+
+attributes #0 = { sspreq }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21, !72}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<unknown>", metadata !"/Users/matt/ryan_bug"}
+!2 = metadata !{metadata !3}
+!3 = metadata !{i32 786436, metadata !1, metadata !4, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 19, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{i32 0}
+!6 = metadata !{metadata !7}
+!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"read_response_size", metadata !"read_response_size", metadata !"_Z18read_response_sizev", i32 27, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z18read_response_sizev, null, null, metadata !14, i32 27} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
+!10 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
+!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !15, metadata !19}
+!15 = metadata !{i32 786688, metadata !9, metadata !"b", metadata !10, i32 28, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 28]
+!16 = metadata !{i32 786451, metadata !1, null, metadata !"B", i32 16, i64 32, i64 32, i32 0, i32 0, null, metadata !17, i32 0, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ]
+!17 = metadata !{metadata !18}
+!18 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"end_of_file", i32 17, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int]
+!19 = metadata !{i32 786688, metadata !9, metadata !"c", metadata !10, i32 29, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [c] [line 29]
+!20 = metadata !{}
+!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!22 = metadata !{i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0)}
+!23 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long>", metadata !"min<unsigned long long>", metadata !"_ZN3__13minIyEERKT_S3_RS1_", i32 12, metadata !27, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !33, null, metadata !35, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>]
+!25 = metadata !{i32 786489, metadata !26, null, metadata !"__1", i32 1} ; [ DW_TAG_namespace ] [__1] [line 1]
+!26 = metadata !{metadata !"main.cpp", metadata !"/Users/matt/ryan_bug"}
+!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{metadata !29, metadata !29, metadata !32}
+!29 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!31 = metadata !{i32 786468, null, null, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!32 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!33 = metadata !{metadata !34}
+!34 = metadata !{i32 786479, null, metadata !"_Tp", metadata !31, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!35 = metadata !{metadata !36, metadata !37}
+!36 = metadata !{i32 786689, metadata !24, metadata !"p1", metadata !10, i32 16777228, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 12]
+!37 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!38 = metadata !{i32 33, i32 0, metadata !9, null}
+!39 = metadata !{i32 12, i32 0, metadata !24, metadata !38}
+!40 = metadata !{i32 9, i32 0, metadata !41, metadata !59}
+!41 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long, __1::A>", metadata !"min<unsigned long long, __1::A>", metadata !"_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", i32 7, metadata !42, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !53, null, metadata !55, i32 8} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>]
+!42 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!43 = metadata !{metadata !29, metadata !29, metadata !32, metadata !44}
+!44 = metadata !{i32 786451, metadata !1, metadata !25, metadata !"A", i32 0, i64 8, i64 8, i32 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ]
+!45 = metadata !{metadata !46}
+!46 = metadata !{i32 786478, metadata !1, metadata !44, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !52, i32 1} ; [ DW_TAG_subprogram ] [line 1] [operator()]
+!47 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !48, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!48 = metadata !{metadata !13, metadata !49, metadata !50, metadata !50}
+!49 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!50 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!51 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!52 = metadata !{i32 786468}
+!53 = metadata !{metadata !34, metadata !54}
+!54 = metadata !{i32 786479, null, metadata !"_Compare", metadata !44, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!55 = metadata !{metadata !56, metadata !57, metadata !58}
+!56 = metadata !{i32 786689, metadata !41, metadata !"p1", metadata !10, i32 16777223, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 7]
+!57 = metadata !{i32 786689, metadata !41, metadata !"p2", metadata !10, i32 33554439, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 7]
+!58 = metadata !{i32 786689, metadata !41, metadata !"p3", metadata !10, i32 50331656, metadata !44, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p3] [line 8]
+!59 = metadata !{i32 13, i32 0, metadata !24, metadata !38}
+!63 = metadata !{i32 undef}
+!64 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!65 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !46, metadata !66, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()]
+!66 = metadata !{metadata !67, metadata !69, metadata !70}
+!67 = metadata !{i32 786689, metadata !65, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!68 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!69 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!70 = metadata !{i32 786689, metadata !65, metadata !"", metadata !10, i32 50331650, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 2]
+!71 = metadata !{i32 1, i32 0, metadata !65, metadata !40}
+!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll b/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
new file mode 100644
index 0000000..7d499f9
--- /dev/null
+++ b/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple i386-unknown-freebsd10.0 -march=x86 --relocation-model=pic %s -o -
+
+; PR16979
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-unknown-freebsd10.0"
+
+@state = internal unnamed_addr global i32 0, align 4
+
+; Function Attrs: nounwind sspreq
+define void @set_state(i32 %s) #0 {
+entry:
+  store i32 %s, i32* @state, align 4
+  ret void
+}
+
+; Function Attrs: nounwind sspreq
+define void @zero_char(i8* nocapture %p) #0 {
+entry:
+  store i8 0, i8* %p, align 1
+  tail call void @g(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) #2
+  ret void
+}
+
+declare void @g(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind sspreq
+define void @do_something(i32 %i) #0 {
+entry:
+  %data = alloca [8 x i8], align 1
+  %0 = load i32* @state, align 4
+  %cmp = icmp eq i32 %0, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  tail call fastcc void @send_int(i32 0)
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  tail call fastcc void @send_int(i32 %i)
+  %arrayidx = getelementptr inbounds [8 x i8]* %data, i32 0, i32 0
+  call void @zero_char(i8* %arrayidx)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+; Function Attrs: nounwind sspreq
+define internal fastcc void @send_int(i32 %p) #0 {
+entry:
+  tail call void @f(i32 %p) #2
+  tail call void @g(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) #2
+  ret void
+}
+
+declare void @f(i32) #1
+
+attributes #0 = { nounwind sspreq "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/X86/stack-protector.ll b/test/CodeGen/X86/stack-protector.ll
index a4dbbb9..265ec80 100644
--- a/test/CodeGen/X86/stack-protector.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -2313,18 +2313,22 @@ entry:
 ; LINUX-I386-LABEL: test19d:
 ; LINUX-I386: mov{{l|q}} %gs:
 ; LINUX-I386: calll __stack_chk_fail
+; LINUX-I386-NOT: calll __stack_chk_fail
 
 ; LINUX-X64-LABEL: test19d:
 ; LINUX-X64: mov{{l|q}} %fs:
 ; LINUX-X64: callq __stack_chk_fail
+; LINUX-X64-NOT: callq __stack_chk_fail
 
 ; LINUX-KERNEL-X64-LABEL: test19d:
 ; LINUX-KERNEL-X64: mov{{l|q}} %gs:
 ; LINUX-KERNEL-X64: callq __stack_chk_fail
+; LINUX-KERNEL-X64-NOT: callq ___stack_chk_fail
 
 ; DARWIN-X64-LABEL: test19d:
 ; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
 ; DARWIN-X64: callq ___stack_chk_fail
+; DARWIN-X64-NOT: callq ___stack_chk_fail
   %c = alloca %struct.pair, align 4
   %exn.slot = alloca i8*
   %ehselector.slot = alloca i32
diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll
new file mode 100644
index 0000000..ed95583
--- /dev/null
+++ b/test/CodeGen/X86/stackmap.ll
@@ -0,0 +1,292 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck %s
+;
+; Note: Print verbose stackmaps using -debug-only=stackmaps.
+
+; CHECK-LABEL:  .section  __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT:  __LLVM_StackMaps:
+; CHECK-NEXT:   .long   0
+; Num LargeConstants
+; CHECK-NEXT:   .long   1
+; CHECK-NEXT:   .quad   4294967296
+; Num Callsites
+; CHECK-NEXT:   .long   11
+
+; Constant arguments
+;
+; CHECK-NEXT:   .long   1
+; CHECK-NEXT:   .long   L{{.*}}-_constantargs
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  4
+; SmallConstant
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   65535
+; SmallConstant
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   65536
+; SmallConstant
+; CHECK-NEXT:   .byte   4
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   -1
+; LargeConstant at index 0
+; CHECK-NEXT:   .byte   5
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .long   0
+
+define void @constantargs() {
+entry:
+  %0 = inttoptr i64 12345 to i8*
+  tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 1, i32 15, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+  ret void
+}
+
+; Inline OSR Exit
+;
+; CHECK-NEXT:   .long   3
+; CHECK-NEXT:   .long   L{{.*}}-_osrinline
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long  0
+define void @osrinline(i64 %a, i64 %b) {
+entry:
+  ; Runtime void->void call.
+  call void inttoptr (i64 -559038737 to void ()*)()
+  ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
+  call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 3, i32 12, i64 %a, i64 %b)
+  ret void
+}
+
+; Cold OSR Exit
+;
+; 2 live variables in register.
+;
+; CHECK-NEXT:   .long  4
+; CHECK-NEXT:   .long   L{{.*}}-_osrcold
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long  0
+define void @osrcold(i64 %a, i64 %b) {
+entry:
+  %test = icmp slt i64 %a, %b
+  br i1 %test, label %ret, label %cold
+cold:
+  ; OSR patchpoint with 12-byte nop-slide and 2 live vars.
+  %thunk = inttoptr i64 -559038737 to i8*
+  call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 4, i32 15, i8* %thunk, i32 0, i64 %a, i64 %b)
+  unreachable
+ret:
+  ret void
+}
+
+; Property Read
+; CHECK-NEXT:  .long  5
+; CHECK-NEXT:   .long   L{{.*}}-_propertyRead
+; CHECK-NEXT:  .short  0
+; CHECK-NEXT:  .short  0
+;
+; FIXME: There are currently no stackmap entries. After moving to
+; AnyRegCC, we will have entries for the object and return value.
+define i64 @propertyRead(i64* %obj) {
+entry:
+  %resolveRead = inttoptr i64 -559038737 to i8*
+  %result = call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %resolveRead, i32 1, i64* %obj)
+  %add = add i64 %result, 3
+  ret i64 %add
+}
+
+; Property Write
+; CHECK-NEXT:  .long  6
+; CHECK-NEXT:   .long   L{{.*}}-_propertyWrite
+; CHECK-NEXT:  .short  0
+; CHECK-NEXT:  .short  0
+;
+; FIXME: There are currently no stackmap entries. After moving to
+; AnyRegCC, we will have entries for the object and return value.
+define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
+entry:
+  %resolveWrite = inttoptr i64 -559038737 to i8*
+  call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 6, i32 15, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+  ret void
+}
+
+; Void JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK-NEXT:   .long  7
+; CHECK-NEXT:   .long   L{{.*}}-_jsVoidCall
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+  %resolveCall = inttoptr i64 -559038737 to i8*
+  call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 7, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  ret void
+}
+
+; i64 JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK:        .long  8
+; CHECK-NEXT:   .long   L{{.*}}-_jsIntCall
+; CHECK-NEXT:   .short  0
+; CHECK-NEXT:   .short  2
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+; CHECK-NEXT:   .byte   1
+; CHECK-NEXT:   .byte   8
+; CHECK-NEXT:   .short  {{[0-9]+}}
+; CHECK-NEXT:   .long   0
+define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+  %resolveCall = inttoptr i64 -559038737 to i8*
+  %result = call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 8, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+  %add = add i64 %result, 3
+  ret i64 %add
+}
+
+; Spilled stack map values.
+;
+; Verify 17 stack map entries.
+;
+; CHECK:      .long 11
+; CHECK-NEXT: .long L{{.*}}-_spilledValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 17
+;
+; Check that at least one is a spilled entry from RBP.
+; Location: Indirect RBP + ...
+; CHECK:      .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
+entry:
+  call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 11, i32 15, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+  ret void
+}
+
+; Spilled stack map values.
+;
+; Verify 17 stack map entries.
+;
+; CHECK:       .long 12
+; CHECK-LABEL: .long L{{.*}}-_spilledStackMapValue
+; CHECK-NEXT:  .short 0
+; CHECK-NEXT:  .short 17
+;
+; Check that at least one is a spilled entry from RBP.
+; Location: Indirect RBP + ...
+; CHECK:      .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
+entry:
+  call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 12, i32 15, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+  ret void
+}
+
+; Spill a subregister stackmap operand.
+;
+; CHECK:       .long 13
+; CHECK-LABEL: .long L{{.*}}-_spillSubReg
+; CHECK-NEXT:  .short 0
+; 4 locations
+; CHECK-NEXT:  .short 1
+;
+; Check that the subregister operand is a 4-byte spill.
+; Location: Indirect, 4-byte, RBP + ...
+; CHECK:      .byte 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short 6
+define void @spillSubReg(i64 %arg) #0 {
+bb:
+  br i1 undef, label %bb1, label %bb2
+
+bb1:
+  unreachable
+
+bb2:
+  %tmp = load i64* inttoptr (i64 140685446136880 to i64*)
+  br i1 undef, label %bb16, label %bb17
+
+bb16:
+  unreachable
+
+bb17:
+  %tmp32 = trunc i64 %tmp to i32
+  br i1 undef, label %bb60, label %bb61
+
+bb60:
+  tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
+  tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 13, i32 5, i32 %tmp32)
+  unreachable
+
+bb61:
+  unreachable
+}
+
+; Map a single byte subregister. There is no DWARF register number, so
+; we expect the register to be encoded with the proper size and spill offset. We don't know which
+;
+; CHECK:       .long 14
+; CHECK-LABEL: .long L{{.*}}-_subRegOffset
+; CHECK-NEXT:  .short 0
+; 2 locations
+; CHECK-NEXT:  .short 2
+;
+; Check that the subregister operands are 1-byte spills.
+; Location 0: Register, 4-byte, AL
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
+;
+; Location 1: Register, 4-byte, BL
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .short 3
+; CHECK-NEXT: .long 0
+define void @subRegOffset(i16 %arg) {
+  %v = mul i16 %arg, 5
+  %a0 = trunc i16 %v to i8
+  tail call void asm sideeffect "nop", "~{bx}"() nounwind
+  %arghi = lshr i16 %v, 8
+  %a1 = trunc i16 %arghi to i8
+  tail call void asm sideeffect "nop", "~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
+  tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 5, i8 %a0, i8 %a1)
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i32, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index fab266f..7557f25 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -12,7 +12,7 @@ entry:
   %D = or i32 %C, %B
   store i32 %D, i32* %a0, align 4
   ret void
-  
+
 ; X64-LABEL: test1:
 ; X64: movb	%sil, (%rdi)
 
@@ -34,8 +34,8 @@ entry:
 ; X64: movb	%sil, 1(%rdi)
 
 ; X32-LABEL: test2:
-; X32: movb	8(%esp), %al
-; X32: movb	%al, 1(%{{.*}})
+; X32: movb	8(%esp), %[[REG:[abcd]l]]
+; X32: movb	%[[REG]], 1(%{{.*}})
 }
 
 define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -67,8 +67,8 @@ entry:
 ; X64: movw	%si, 2(%rdi)
 
 ; X32-LABEL: test4:
-; X32: movl	8(%esp), %eax
-; X32: movw	%ax, 2(%{{.*}})
+; X32: movl	8(%esp), %e[[REG:[abcd]x]]
+; X32: movw	%[[REG]], 2(%{{.*}})
 }
 
 define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -84,8 +84,8 @@ entry:
 ; X64: movw	%si, 2(%rdi)
 
 ; X32-LABEL: test5:
-; X32: movzwl	8(%esp), %eax
-; X32: movw	%ax, 2(%{{.*}})
+; X32: movzwl	8(%esp), %e[[REG:[abcd]x]]
+; X32: movw	%[[REG]], 2(%{{.*}})
 }
 
 define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
@@ -102,8 +102,8 @@ entry:
 
 
 ; X32-LABEL: test6:
-; X32: movb	8(%esp), %al
-; X32: movb	%al, 5(%{{.*}})
+; X32: movb	8(%esp), %[[REG:[abcd]l]]
+; X32: movb	%[[REG]], 5(%{{.*}})
 }
 
 define i32 @test7(i64* nocapture %a0, i8 zeroext %a1, i32* %P2) nounwind {
@@ -121,8 +121,8 @@ entry:
 
 
 ; X32-LABEL: test7:
-; X32: movb	8(%esp), %cl
-; X32: movb	%cl, 5(%{{.*}})
+; X32: movb	8(%esp), %[[REG:[abcd]l]]
+; X32: movb	%[[REG]], 5(%{{.*}})
 }
 
 ; PR7833
diff --git a/test/CodeGen/X86/tail-call-attrs.ll b/test/CodeGen/X86/tail-call-attrs.ll
new file mode 100644
index 0000000..17ebe99
--- /dev/null
+++ b/test/CodeGen/X86/tail-call-attrs.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -o - %s | FileCheck %s
+
+; Simple case: completely identical returns, even with extensions, shouldn't be
+; a barrier to tail calls.
+declare zeroext i1 @give_bool()
+define zeroext i1 @test_bool() {
+; CHECK-LABEL: test_bool:
+; CHECK: jmp
+  %call = tail call zeroext i1 @give_bool()
+  ret i1 %call
+}
+
+; Here, there's more zero extension to be done between the call and the return,
+; so a tail call is impossible (well, according to current Clang practice
+; anyway. The AMD64 ABI isn't crystal clear on the matter).
+declare zeroext i32 @give_i32()
+define zeroext i8 @test_i32() {
+; CHECK-LABEL: test_i32:
+; CHECK: callq _give_i32
+; CHECK: movzbl %al, %eax
+; CHECK: ret
+
+  %call = tail call zeroext i32 @give_i32()
+  %val = trunc i32 %call to i8
+  ret i8 %val
+}
+
+; Here, one function is zeroext and the other is signext. To the extent that
+; these both mean something they are incompatible so no tail call is possible.
+declare zeroext i16 @give_unsigned_i16()
+define signext i16 @test_incompatible_i16() {
+; CHECK-LABEL: test_incompatible_i16:
+; CHECK: callq _give_unsigned_i16
+; CHECK: cwtl
+; CHECK: ret
+
+  %call = tail call zeroext i16 @give_unsigned_i16()
+  ret i16 %call
+}
+
+declare inreg i32 @give_i32_inreg()
+define i32 @test_inreg_to_normal() {
+; CHECK-LABEL: test_inreg_to_normal:
+; CHECK: callq _give_i32_inreg
+; CHECK: ret
+  %val = tail call inreg i32 @give_i32_inreg()
+  ret i32 %val
+}
+
+define inreg i32 @test_normal_to_inreg() {
+; CHECK-LABEL: test_normal_to_inreg:
+; CHECK: callq _give_i32
+; CHECK: ret
+  %val = tail call i32 @give_i32()
+  ret i32 %val
+}
diff --git a/test/CodeGen/X86/tailcall-largecode.ll b/test/CodeGen/X86/tailcall-largecode.ll
index e9b8721..f5662d9 100644
--- a/test/CodeGen/X86/tailcall-largecode.ll
+++ b/test/CodeGen/X86/tailcall-largecode.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -tailcallopt -code-model=large | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -tailcallopt -code-model=large -enable-misched=false | FileCheck %s
 
 declare fastcc i32 @callee(i32 %arg)
 define fastcc i32 @directcall(i32 %arg) {
diff --git a/test/CodeGen/X86/tbm-intrinsics-x86_64.ll b/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
new file mode 100644
index 0000000..1bc6175
--- /dev/null
+++ b/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+tbm < %s | FileCheck %s
+
+define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_bextri_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: bextr $
+  %0 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a, i32 2814)
+  ret i32 %0
+}
+
+declare i32 @llvm.x86.tbm.bextri.u32(i32, i32) nounwind readnone
+
+define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind readonly {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
+  ; CHECK-NOT: mov
+  ; CHECK: bextr $
+  %tmp1 = load i32* %a, align 4
+  %0 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %tmp1, i32 2814)
+  ret i32 %0
+}
+
+define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_bextri_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: bextr $
+  %0 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a, i64 2814)
+  ret i64 %0
+}
+
+declare i64 @llvm.x86.tbm.bextri.u64(i64, i64) nounwind readnone
+
+define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind readonly {
+entry:
+  ; CHECK-LABEl: test_x86_tbm_bextri_u64_m:
+  ; CHECK-NOT: mov
+  ; CHECK: bextr $
+  %tmp1 = load i64* %a, align 8
+  %0 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %tmp1, i64 2814)
+  ret i64 %0
+}
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
new file mode 100644
index 0000000..79eea10
--- /dev/null
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -0,0 +1,253 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+tbm < %s | FileCheck %s
+
+define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_bextri_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: bextr $
+  %0 = lshr i32 %a, 4
+  %1 = and i32 %0, 4095
+  ret i32 %1
+}
+
+define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind readonly {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
+  ; CHECK-NOT: mov
+  ; CHECK: bextr $
+  %0 = load i32* %a
+  %1 = lshr i32 %0, 4
+  %2 = and i32 %1, 4095
+  ret i32 %2
+}
+
+define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_bextri_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: bextr $
+  %0 = lshr i64 %a, 4
+  %1 = and i64 %0, 4095
+  ret i64 %1
+}
+
+define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind readonly {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
+  ; CHECK-NOT: mov
+  ; CHECK: bextr $
+  %0 = load i64* %a
+  %1 = lshr i64 %0, 4
+  %2 = and i64 %1, 4095
+  ret i64 %2
+}
+
+define i32 @test_x86_tbm_blcfill_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blcfill_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: blcfill %
+  %0 = add i32 %a, 1
+  %1 = and i32 %0, %a
+  ret i32 %1
+}
+
+define i64 @test_x86_tbm_blcfill_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blcfill_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: blcfill %
+  %0 = add i64 %a, 1
+  %1 = and i64 %0, %a
+  ret i64 %1
+}
+
+define i32 @test_x86_tbm_blci_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blci_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: blci %
+  %0 = add i32 1, %a
+  %1 = xor i32 %0, -1
+  %2 = or i32 %1, %a
+  ret i32 %2
+}
+
+define i64 @test_x86_tbm_blci_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blci_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: blci %
+  %0 = add i64 1, %a
+  %1 = xor i64 %0, -1
+  %2 = or i64 %1, %a
+  ret i64 %2
+}
+
+define i32 @test_x86_tbm_blci_u32_b(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blci_u32_b:
+  ; CHECK-NOT: mov
+  ; CHECK: blci %
+  %0 = sub i32 -2, %a
+  %1 = or i32 %0, %a
+  ret i32 %1
+}
+
+define i64 @test_x86_tbm_blci_u64_b(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blci_u64_b:
+  ; CHECK-NOT: mov
+  ; CHECK: blci %
+  %0 = sub i64 -2, %a
+  %1 = or i64 %0, %a
+  ret i64 %1
+}
+
+define i32 @test_x86_tbm_blcic_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blcic_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: blcic %
+  %0 = xor i32 %a, -1
+  %1 = add i32 %a, 1
+  %2 = and i32 %1, %0
+  ret i32 %2
+}
+
+define i64 @test_x86_tbm_blcic_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blcic_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: blcic %
+  %0 = xor i64 %a, -1
+  %1 = add i64 %a, 1
+  %2 = and i64 %1, %0
+  ret i64 %2
+}
+
+define i32 @test_x86_tbm_blcmsk_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blcmsk_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: blcmsk %
+  %0 = add i32 %a, 1
+  %1 = xor i32 %0, %a
+  ret i32 %1
+}
+
+define i64 @test_x86_tbm_blcmsk_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blcmsk_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: blcmsk %
+  %0 = add i64 %a, 1
+  %1 = xor i64 %0, %a
+  ret i64 %1
+}
+
+define i32 @test_x86_tbm_blcs_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blcs_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: blcs %
+  %0 = add i32 %a, 1
+  %1 = or i32 %0, %a
+  ret i32 %1
+}
+
+define i64 @test_x86_tbm_blcs_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blcs_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: blcs %
+  %0 = add i64 %a, 1
+  %1 = or i64 %0, %a
+  ret i64 %1
+}
+
+define i32 @test_x86_tbm_blsfill_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blsfill_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: blsfill %
+  %0 = add i32 %a, -1
+  %1 = or i32 %0, %a
+  ret i32 %1
+}
+
+define i64 @test_x86_tbm_blsfill_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blsfill_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: blsfill %
+  %0 = add i64 %a, -1
+  %1 = or i64 %0, %a
+  ret i64 %1
+}
+
+define i32 @test_x86_tbm_blsic_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blsic_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: blsic %
+  %0 = xor i32 %a, -1
+  %1 = add i32 %a, -1
+  %2 = or i32 %0, %1
+  ret i32 %2
+}
+
+define i64 @test_x86_tbm_blsic_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_blsic_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: blsic %
+  %0 = xor i64 %a, -1
+  %1 = add i64 %a, -1
+  %2 = or i64 %0, %1
+  ret i64 %2
+}
+
+define i32 @test_x86_tbm_t1mskc_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_t1mskc_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: t1mskc %
+  %0 = xor i32 %a, -1
+  %1 = add i32 %a, 1
+  %2 = or i32 %0, %1
+  ret i32 %2
+}
+
+define i64 @Ttest_x86_tbm_t1mskc_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_t1mskc_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: t1mskc %
+  %0 = xor i64 %a, -1
+  %1 = add i64 %a, 1
+  %2 = or i64 %0, %1
+  ret i64 %2
+}
+
+define i32 @test_x86_tbm_tzmsk_u32(i32 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_tzmsk_u32:
+  ; CHECK-NOT: mov
+  ; CHECK: tzmsk %
+  %0 = xor i32 %a, -1
+  %1 = add i32 %a, -1
+  %2 = and i32 %0, %1
+  ret i32 %2
+}
+
+define i64 @test_x86_tbm_tzmsk_u64(i64 %a) nounwind readnone {
+entry:
+  ; CHECK-LABEL: test_x86_tbm_tzmsk_u64:
+  ; CHECK-NOT: mov
+  ; CHECK: tzmsk %
+  %0 = xor i64 %a, -1
+  %1 = add i64 %a, -1
+  %2 = and i64 %0, %1
+  ret i64 %2
+}
diff --git a/test/CodeGen/X86/test-nofold.ll b/test/CodeGen/X86/test-nofold.ll
index 97db1b3..19fbaaf 100644
--- a/test/CodeGen/X86/test-nofold.ll
+++ b/test/CodeGen/X86/test-nofold.ll
@@ -2,10 +2,10 @@
 ; rdar://5752025
 
 ; We want:
-;      CHECK: movl	$42, %ecx
-; CHECK-NEXT: movl	4(%esp), %eax
-; CHECK-NEXT: andl	$15, %eax
-; CHECK-NEXT: cmovnel	%ecx, %eax
+;      CHECK: movl	4(%esp), %ecx
+; CHECK-NEXT: andl	$15, %ecx
+; CHECK-NEXT: movl	$42, %eax
+; CHECK-NEXT: cmovel	%ecx, %eax
 ; CHECK-NEXT: ret
 ;
 ; We don't want:
@@ -39,4 +39,3 @@ entry:
 	%retval = select i1 %tmp4, i32 %tmp2, i32 42		; <i32> [#uses=1]
 	ret i32 %retval
 }
-
diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll
index 24284e5..76a8402 100644
--- a/test/CodeGen/X86/tls.ll
+++ b/test/CodeGen/X86/tls.ll
@@ -223,27 +223,22 @@ entry:
 define i16 @f11() {
 ; X32_LINUX-LABEL: f11:
 ; X32_LINUX:      movzwl %gs:s1@NTPOFF, %eax
-; Why is this kill line here, but no where else?
-; X32_LINUX-NEXT: # kill
-; X32_LINUX-NEXT: ret
+; X32_LINUX:      ret
 ; X64_LINUX-LABEL: f11:
 ; X64_LINUX:      movzwl %fs:s1@TPOFF, %eax
-; X64_LINUX-NEXT: # kill
-; X64_LINUX-NEXT: ret
+; X64_LINUX:      ret
 ; X32_WIN-LABEL: f11:
 ; X32_WIN:      movl __tls_index, %eax
 ; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
 ; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
 ; X32_WIN-NEXT: movzwl _s1@SECREL32(%eax), %eax
-; X32_WIN-NEXT: # kill
-; X32_WIN-NEXT: ret
+; X32_WIN:      ret
 ; X64_WIN-LABEL: f11:
 ; X64_WIN:      movl _tls_index(%rip), %eax
 ; X64_WIN-NEXT: movq %gs:88, %rcx
 ; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
 ; X64_WIN-NEXT: movzwl s1@SECREL32(%rax), %eax
-; X64_WIN-NEXT: # kill
-; X64_WIN-NEXT: ret
+; X64_WIN:      ret
 
 entry:
 	%tmp1 = load i16* @s1
diff --git a/test/CodeGen/X86/tlv-3.ll b/test/CodeGen/X86/tlv-3.ll
new file mode 100644
index 0000000..4f79305
--- /dev/null
+++ b/test/CodeGen/X86/tlv-3.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple x86_64-apple-darwin | FileCheck %s
+; PR17964
+
+; CHECK: __DATA,__thread_data,thread_local_regular
+; CHECK: _foo$tlv$init
+@foo = weak_odr thread_local global i8 1, align 4
+
+define i32 @main() {
+    ret i32 0
+}
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index 408bdc8..d230f1f 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
 
 ;CHECK-LABEL: load_2_i8:
 ; A single 16-bit load
diff --git a/test/CodeGen/X86/trunc-to-bool.ll b/test/CodeGen/X86/trunc-to-bool.ll
index 3711cf1..0ed6347 100644
--- a/test/CodeGen/X86/trunc-to-bool.ll
+++ b/test/CodeGen/X86/trunc-to-bool.ll
@@ -22,7 +22,7 @@ ret_false:
     ret i1 false
 }
 ; CHECK-LABEL: test2:
-; CHECK: btl %eax
+; CHECK: btl
 
 define i32 @test3(i8* %ptr) nounwind {
     %val = load i8* %ptr
diff --git a/test/CodeGen/X86/unaligned-spill-folding.ll b/test/CodeGen/X86/unaligned-spill-folding.ll
new file mode 100644
index 0000000..154ce9e
--- /dev/null
+++ b/test/CodeGen/X86/unaligned-spill-folding.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=i386-unknown-freebsd -mcpu=core2 -stack-alignment=4 -relocation-model=pic < %s | FileCheck %s -check-prefix=UNALIGNED
+; RUN: llc -mtriple=i386-unknown-freebsd -mcpu=core2 -stack-alignment=16 -relocation-model=pic < %s | FileCheck %s -check-prefix=ALIGNED
+; RUN: llc -mtriple=i386-unknown-freebsd -mcpu=core2 -stack-alignment=4 -force-align-stack -relocation-model=pic < %s | FileCheck %s -check-prefix=FORCEALIGNED
+
+@arr = internal unnamed_addr global [32 x i32] zeroinitializer, align 16
+
+; PR12250
+define i32 @test1() {
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds [32 x i32]* @arr, i32 0, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>* %1, align 16
+  %2 = add nsw <4 x i32> %wide.load, <i32 10, i32 10, i32 10, i32 10>
+  %3 = xor <4 x i32> %2, <i32 123345, i32 123345, i32 123345, i32 123345>
+  %4 = add nsw <4 x i32> %3, <i32 112, i32 112, i32 112, i32 112>
+  %5 = xor <4 x i32> %4, <i32 543345, i32 543345, i32 543345, i32 543345>
+  %6 = add nsw <4 x i32> %5, <i32 73, i32 73, i32 73, i32 73>
+  %7 = xor <4 x i32> %6, <i32 345987, i32 345987, i32 345987, i32 345987>
+  %8 = add nsw <4 x i32> %7, <i32 48, i32 48, i32 48, i32 48>
+  %9 = xor <4 x i32> %8, <i32 123987, i32 123987, i32 123987, i32 123987>
+  store <4 x i32> %9, <4 x i32>* %1, align 16
+  %index.next = add i32 %index, 4
+  %10 = icmp eq i32 %index.next, 32
+  br i1 %10, label %middle.block, label %vector.body
+
+middle.block:
+  ret i32 0
+
+; We can't fold the spill into a padd unless the stack is aligned. Just spilling
+; doesn't force stack realignment though
+; UNALIGNED-LABEL: @test1
+; UNALIGNED-NOT: andl $-{{..}}, %esp
+; UNALIGNED: movdqu {{.*}} # 16-byte Folded Spill
+; UNALIGNED-NOT: paddd {{.*}} # 16-byte Folded Reload
+
+; ALIGNED-LABEL: @test1
+; ALIGNED-NOT: andl $-{{..}}, %esp
+; ALIGNED: movdqa {{.*}} # 16-byte Spill
+; ALIGNED: paddd {{.*}} # 16-byte Folded Reload
+
+; FORCEALIGNED-LABEL: @test1
+; FORCEALIGNED: andl $-{{..}}, %esp
+; FORCEALIGNED: movdqa {{.*}} # 16-byte Spill
+; FORCEALIGNED: paddd {{.*}} # 16-byte Folded Reload
+}
diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll
index 2422de9..d7ae469 100644
--- a/test/CodeGen/X86/unknown-location.ll
+++ b/test/CodeGen/X86/unknown-location.ll
@@ -19,12 +19,13 @@ entry:
 }
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!12}
 
 !0 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 1, metadata !6} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 786478, metadata !10, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 1, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, i32, i32, i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !10, i32 12, metadata !"producer", i1 false, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6}
 !6 = metadata !{i32 786468, metadata !10, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 786443, metadata !2, metadata !1, i32 1, i32 30, i32 0} ; [ DW_TAG_lexical_block ]
@@ -32,3 +33,4 @@ entry:
 !9 = metadata !{metadata !1}
 !10 = metadata !{metadata !"test.c", metadata !"/dir"}
 !11 = metadata !{i32 0}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/v-binop-widen.ll b/test/CodeGen/X86/v-binop-widen.ll
index 8655c6c..fca4da6 100644
--- a/test/CodeGen/X86/v-binop-widen.ll
+++ b/test/CodeGen/X86/v-binop-widen.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mcpu=generic -march=x86 -mattr=+sse < %s | FileCheck %s
-; CHECK: divss
 ; CHECK: divps
 ; CHECK: divps
+; CHECK: divss
 
 %vec = type <9 x float>
 define %vec @vecdiv( %vec %p1, %vec %p2)
@@ -9,4 +9,3 @@ define %vec @vecdiv( %vec %p1, %vec %p2)
   %result = fdiv %vec %p1, %p2
   ret %vec %result
 }
-
diff --git a/test/CodeGen/X86/v-binop-widen2.ll b/test/CodeGen/X86/v-binop-widen2.ll
index 569586a..3342111 100644
--- a/test/CodeGen/X86/v-binop-widen2.ll
+++ b/test/CodeGen/X86/v-binop-widen2.ll
@@ -2,9 +2,9 @@
 ; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s
 
 %vec = type <6 x float>
+; CHECK: divps
 ; CHECK: divss
 ; CHECK: divss
-; CHECK: divps
 
 ; Scheduler causes a different instruction order to be produced on Intel Atom
 ; ATOM: divps
diff --git a/test/CodeGen/X86/v4i32load-crash.ll b/test/CodeGen/X86/v4i32load-crash.ll
new file mode 100644
index 0000000..052c4c3
--- /dev/null
+++ b/test/CodeGen/X86/v4i32load-crash.ll
@@ -0,0 +1,27 @@
+; RUN: llc --mcpu=x86-64 --mattr=ssse3 < %s
+
+;PR18045:
+;Issue of selection for 'v4i32 load'.
+;This instruction is not legal for X86 CPUs with sse < 'sse4.1'.
+;This node was generated by X86ISelLowering.cpp, EltsFromConsecutiveLoads
+;static function after legilize stage.
+
+@e = external global [4 x i32], align 4
+@f = external global [4 x i32], align 4
+
+; Function Attrs: nounwind
+define void @fn3(i32 %el) {
+entry:
+  %0 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 0)
+  %1 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 1)
+  %2 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 2)
+  %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 3)
+  %4 = insertelement <4 x i32> undef, i32 %0, i32 0
+  %5 = insertelement <4 x i32> %4, i32 %1, i32 1
+  %6 = insertelement <4 x i32> %5, i32 %2, i32 2
+  %7 = insertelement <4 x i32> %6, i32 %3, i32 3
+  %8 = add <4 x i32> %6, %7
+  store <4 x i32> %8, <4 x i32>* bitcast ([4 x i32]* @f to <4 x i32>*)
+  ret void
+}
+
diff --git a/test/CodeGen/X86/vec_compare-sse4.ll b/test/CodeGen/X86/vec_compare-sse4.ll
index a08d9f5..084d611 100644
--- a/test/CodeGen/X86/vec_compare-sse4.ll
+++ b/test/CodeGen/X86/vec_compare-sse4.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 -mattr=-sse3,+sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -march=x86 -mattr=-sse42,+sse41 | FileCheck %s -check-prefix=SSE41
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s -check-prefix=SSE42
+; RUN: llc < %s -march=x86 -mattr=-sse4.2,+sse4.1 | FileCheck %s -check-prefix=SSE41
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s -check-prefix=SSE42
 
 define <2 x i64> @test1(<2 x i64> %A, <2 x i64> %B) nounwind {
 ; SSE42-LABEL: test1:
diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index 42d7f27..3cb519a 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse41 -o %t
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 -o %t
 ; RUN: not grep extractps   %t
 ; RUN: not grep pextrd      %t
 ; RUN: not grep pshufd  %t
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index 2c8796b..88f5a58 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse41 -o %t
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 -o %t
 ; RUN: grep movss    %t | count 4
 ; RUN: grep movhlps  %t | count 1
 ; RUN: not grep pshufd   %t 
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 863712f..7ec07ae 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.1,-avx | FileCheck %s
 ; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck --check-prefix=AVX %s
 
 ; PR11674
diff --git a/test/CodeGen/X86/vec_insert-2.ll b/test/CodeGen/X86/vec_insert-2.ll
index bfac1ba..fe20a47 100644
--- a/test/CodeGen/X86/vec_insert-2.ll
+++ b/test/CodeGen/X86/vec_insert-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | FileCheck --check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse41 | FileCheck --check-prefix=X64 %s
+; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | FileCheck --check-prefix=X32 %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse4.1 | FileCheck --check-prefix=X64 %s
 
 define <4 x float> @t1(float %s, <4 x float> %tmp) nounwind {
 ; X32-LABEL: t1:
diff --git a/test/CodeGen/X86/vec_insert-3.ll b/test/CodeGen/X86/vec_insert-3.ll
index a18cd864..a871339 100644
--- a/test/CodeGen/X86/vec_insert-3.ll
+++ b/test/CodeGen/X86/vec_insert-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse41 | grep punpcklqdq | count 1
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse4.1 | grep punpcklqdq | count 1
 
 define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
         %tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1
diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll
index 268b5c4..6d4f828 100644
--- a/test/CodeGen/X86/vec_insert-7.ll
+++ b/test/CodeGen/X86/vec_insert-7.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse42 -mtriple=i686-apple-darwin9 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse4.2 -mtriple=i686-apple-darwin9 | FileCheck %s
 ; MMX insertelement is not available; these are promoted to XMM.
 ; (Without SSE they are split to two ints, and the code is much better.)
 
diff --git a/test/CodeGen/X86/vec_insert-8.ll b/test/CodeGen/X86/vec_insert-8.ll
index 650951c..917832c 100644
--- a/test/CodeGen/X86/vec_insert-8.ll
+++ b/test/CodeGen/X86/vec_insert-8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 -o %t
+; RUN: llc < %s -march=x86 -mattr=+sse4.1 -o %t
 
 ; tests variable insert and extract of a 4 x i32
 
diff --git a/test/CodeGen/X86/vec_insert-9.ll b/test/CodeGen/X86/vec_insert-9.ll
index e5a7ccc..5f2e676 100644
--- a/test/CodeGen/X86/vec_insert-9.ll
+++ b/test/CodeGen/X86/vec_insert-9.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 > %t
+; RUN: llc < %s -march=x86 -mattr=+sse4.1 > %t
 ; RUN: grep pinsrd %t | count 1
 
 define <4 x i32> @var_insert2(<4 x i32> %x, i32 %val, i32 %idx) nounwind  {
diff --git a/test/CodeGen/X86/vec_insert.ll b/test/CodeGen/X86/vec_insert.ll
index 4e5d445..0ed8f10 100644
--- a/test/CodeGen/X86/vec_insert.ll
+++ b/test/CodeGen/X86/vec_insert.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | grep movss | count 1
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | not grep pinsrw
+; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep movss | count 1
+; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | not grep pinsrw
 
 define void @test(<4 x float>* %F, i32 %I) nounwind {
 	%tmp = load <4 x float>* %F		; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_round.ll b/test/CodeGen/X86/vec_round.ll
new file mode 100644
index 0000000..baa2f58
--- /dev/null
+++ b/test/CodeGen/X86/vec_round.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=nehalem -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @use(<2 x double>)
+
+; CHECK-LABEL: @test
+; CHECK callq round
+
+; Function Attrs: nounwind uwtable
+define void @test() {
+entry:
+  %tmp = call <2 x double> @llvm.round.v2f64(<2 x double> undef)
+  call void @use(<2 x double> %tmp)
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <2 x double> @llvm.round.v2f64(<2 x double>) #0
+
+attributes #0 = { nounwind readonly }
+
diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll
index 66056d0..41061ae 100644
--- a/test/CodeGen/X86/vec_set-8.ll
+++ b/test/CodeGen/X86/vec_set-8.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s
 ; CHECK-NOT: movsd
 ; CHECK: movd {{%rdi|%rcx}}, %xmm0
 ; CHECK-NOT: movsd
diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll
index 6979f6b..a739090 100644
--- a/test/CodeGen/X86/vec_set-9.ll
+++ b/test/CodeGen/X86/vec_set-9.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=-avx,-pad-short-functions | FileCheck %s
 
 ; CHECK: test3
 ; CHECK: movd
diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll
index 133f23b..052da30 100644
--- a/test/CodeGen/X86/vec_set-C.ll
+++ b/test/CodeGen/X86/vec_set-C.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2 | grep movq
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2 | grep mov | count 1
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux -mattr=+sse2 | grep movd
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2,-avx | grep movq
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2,-avx | grep mov | count 1
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux -mattr=+sse2,-avx | grep movd
 
 define <2 x i64> @t1(i64 %x) nounwind  {
 	%tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll
index 7f5f8dd..53d880b 100644
--- a/test/CodeGen/X86/vec_set.ll
+++ b/test/CodeGen/X86/vec_set.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | grep punpckl | count 7
+; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep punpckl | count 7
 
 define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
         %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0          ; <<8 x i16>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll
index bcfd4d3..fc8a56d 100644
--- a/test/CodeGen/X86/vec_setcc.ll
+++ b/test/CodeGen/X86/vec_setcc.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse41 | FileCheck %s -check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse4.1 | FileCheck %s -check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx | FileCheck %s -check-prefix=AVX
 
 define <16 x i8> @v16i8_icmp_uge(<16 x i8> %a, <16 x i8> %b) nounwind readnone ssp uwtable {
@@ -124,3 +124,64 @@ define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone s
 ; AVX: pcmpeqd %xmm1, %xmm0, %xmm0
 }
 
+; At one point we were incorrectly constant-folding a setcc to 0x1 instead of
+; 0xff, leading to a constpool load. The instruction doesn't matter here, but it
+; should set all bits to 1.
+define <16 x i8> @test_setcc_constfold_vi8(<16 x i8> %l, <16 x i8> %r) {
+  %test1 = icmp eq <16 x i8> %l, %r
+  %mask1 = sext <16 x i1> %test1 to <16 x i8>
+
+  %test2 = icmp ne <16 x i8> %l, %r
+  %mask2 = sext <16 x i1> %test2 to <16 x i8>
+
+  %res = or <16 x i8> %mask1, %mask2
+  ret <16 x i8> %res
+; SSE2-LABEL: test_setcc_constfold_vi8:
+; SSE2: pcmpeqd %xmm0, %xmm0
+
+; SSE41-LABEL: test_setcc_constfold_vi8:
+; SSE41: pcmpeqd %xmm0, %xmm0
+
+; AVX-LABEL: test_setcc_constfold_vi8:
+; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
+}
+
+; Make sure sensible results come from doing extension afterwards
+define <16 x i8> @test_setcc_constfold_vi1(<16 x i8> %l, <16 x i8> %r) {
+  %test1 = icmp eq <16 x i8> %l, %r
+  %test2 = icmp ne <16 x i8> %l, %r
+
+  %res = or <16 x i1> %test1, %test2
+  %mask = sext <16 x i1> %res to <16 x i8>
+  ret <16 x i8> %mask
+; SSE2-LABEL: test_setcc_constfold_vi1:
+; SSE2: pcmpeqd %xmm0, %xmm0
+
+; SSE41-LABEL: test_setcc_constfold_vi1:
+; SSE41: pcmpeqd %xmm0, %xmm0
+
+; AVX-LABEL: test_setcc_constfold_vi1:
+; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
+}
+
+
+; 64-bit case is also particularly important, as the constant "-1" is probably
+; just 32-bits wide.
+define <2 x i64> @test_setcc_constfold_vi64(<2 x i64> %l, <2 x i64> %r) {
+  %test1 = icmp eq <2 x i64> %l, %r
+  %mask1 = sext <2 x i1> %test1 to <2 x i64>
+
+  %test2 = icmp ne <2 x i64> %l, %r
+  %mask2 = sext <2 x i1> %test2 to <2 x i64>
+
+  %res = or <2 x i64> %mask1, %mask2
+  ret <2 x i64> %res
+; SSE2-LABEL: test_setcc_constfold_vi64:
+; SSE2: pcmpeqd %xmm0, %xmm0
+
+; SSE41-LABEL: test_setcc_constfold_vi64:
+; SSE41: pcmpeqd %xmm0, %xmm0
+
+; AVX-LABEL: test_setcc_constfold_vi64:
+; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
+}
diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll
index 9ef7fbd..e2fe45c 100644
--- a/test/CodeGen/X86/vec_shift4.ll
+++ b/test/CodeGen/X86/vec_shift4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.1 | FileCheck %s
 
 define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/vec_shuffle-14.ll b/test/CodeGen/X86/vec_shuffle-14.ll
index 95e9a18..8f25197 100644
--- a/test/CodeGen/X86/vec_shuffle-14.ll
+++ b/test/CodeGen/X86/vec_shuffle-14.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s -check-prefix=X86-32
-; RUN: llc < %s -march=x86-64 -mattr=+sse2 | FileCheck %s -check-prefix=X86-64
+; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-64
 
 define <4 x i32> @t1(i32 %a) nounwind  {
 entry:
diff --git a/test/CodeGen/X86/vec_shuffle-17.ll b/test/CodeGen/X86/vec_shuffle-17.ll
index ebc8c5b..f2f96ba 100644
--- a/test/CodeGen/X86/vec_shuffle-17.ll
+++ b/test/CodeGen/X86/vec_shuffle-17.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s
 ; CHECK-NOT: xor
 ; CHECK: movd {{%rdi|%rcx}}, %xmm0
 ; CHECK-NOT: xor
diff --git a/test/CodeGen/X86/vec_shuffle-25.ll b/test/CodeGen/X86/vec_shuffle-25.ll
index d9b2388..3f42a13 100644
--- a/test/CodeGen/X86/vec_shuffle-25.ll
+++ b/test/CodeGen/X86/vec_shuffle-25.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
+; RUN: llc < %s -march=x86 -mattr=sse4.1 -o %t
 ; RUN: grep unpcklps %t | count 3
 ; RUN: grep unpckhps %t | count 1
  
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
index 4c56f84..00e8e73 100644
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse4.1 | FileCheck %s
 ; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
 
 ; Transpose example using the more generic vector shuffle. Return float8
diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll
index 0aff822..c9b2fb5 100644
--- a/test/CodeGen/X86/vec_shuffle-27.ll
+++ b/test/CodeGen/X86/vec_shuffle-27.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
 
 ; ModuleID = 'vec_shuffle-27.bc'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
@@ -7,10 +7,10 @@ target triple = "i686-apple-cl.1.0"
 define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone {
 entry:
 ; CHECK: subps
-; CHECK: mulps
-; CHECK: addps
 ; CHECK: subps
 ; CHECK: mulps
+; CHECK: mulps
+; CHECK: addps
 ; CHECK: addps
 	%tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 >		; <<8 x float>> [#uses=1]
 	%sub = fsub <8 x float> %T1, %T0		; <<8 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll
index 9a06015..f1d0f93 100644
--- a/test/CodeGen/X86/vec_shuffle-36.ll
+++ b/test/CodeGen/X86/vec_shuffle-36.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
 
 define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
 ; CHECK: pshufb
diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll
index 1560454..8fd9a5c 100644
--- a/test/CodeGen/X86/vec_shuffle-39.ll
+++ b/test/CodeGen/X86/vec_shuffle-39.ll
@@ -54,8 +54,8 @@ entry:
 define <2 x double> @t3() nounwind readonly {
 bb:
 ; CHECK-LABEL: t3:
-; CHECK: punpcklqdq %xmm1, %xmm0
 ; CHECK: movq (%rax), %xmm1
+; CHECK: punpcklqdq %xmm2, %xmm0
 ; CHECK: movsd %xmm1, %xmm0
   %tmp0 = load i128* null, align 1
   %tmp1 = load <2 x i32>* undef, align 8
@@ -72,9 +72,9 @@ bb:
 define <2 x i64> @t4() nounwind readonly {
 bb:
 ; CHECK-LABEL: t4:
-; CHECK: punpcklqdq %xmm0, %xmm1
 ; CHECK: movq (%rax), %xmm0
-; CHECK: movsd %xmm1, %xmm0
+; CHECK: punpcklqdq %{{xmm.}}, %[[XMM:xmm[0-9]]]
+; CHECK: movsd %[[XMM]], %xmm0
   %tmp0 = load i128* null, align 1
   %tmp1 = load <2 x i32>* undef, align 8
   %tmp2 = bitcast i128 %tmp0 to <16 x i8>
diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll
index 60e3005..754cbf4 100644
--- a/test/CodeGen/X86/vec_splat-3.ll
+++ b/test/CodeGen/X86/vec_splat-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s
+; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
 
 ; Splat test for v8i16
 define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
diff --git a/test/CodeGen/X86/vec_split.ll b/test/CodeGen/X86/vec_split.ll
new file mode 100644
index 0000000..f9e7c20
--- /dev/null
+++ b/test/CodeGen/X86/vec_split.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s -check-prefix=SSE4
+; RUN: llc -march=x86-64 -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
+
+define <16 x i16> @split16(<16 x i16> %a, <16 x i16> %b, <16 x i8> %__mask) {
+; SSE4-LABEL: split16:
+; SSE4: pminuw
+; SSE4: pminuw
+; SSE4: ret
+; AVX1-LABEL: split16:
+; AVX1: vpminuw
+; AVX1: vpminuw
+; AVX1: ret
+; AVX2-LABEL: split16:
+; AVX2: vpminuw
+; AVX2: ret
+  %1 = icmp ult <16 x i16> %a, %b
+  %2 = select <16 x i1> %1, <16 x i16> %a, <16 x i16> %b
+  ret <16 x i16> %2
+}
+
+define <32 x i16> @split32(<32 x i16> %a, <32 x i16> %b, <32 x i8> %__mask) {
+; SSE4-LABEL: split32:
+; SSE4: pminuw
+; SSE4: pminuw
+; SSE4: pminuw
+; SSE4: pminuw
+; SSE4: ret
+; AVX1-LABEL: split32:
+; AVX1: vpminuw
+; AVX1: vpminuw
+; AVX1: vpminuw
+; AVX1: vpminuw
+; AVX1: ret
+; AVX2-LABEL: split32:
+; AVX2: vpminuw
+; AVX2: vpminuw
+; AVX2: ret
+  %1 = icmp ult <32 x i16> %a, %b
+  %2 = select <32 x i1> %1, <32 x i16> %a, <32 x i16> %b
+  ret <32 x i16> %2
+}
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index 2eb911f..80f12a2 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,+sse2,+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s
 
 target datalayout = "e-p:32:32"
 target triple = "i686-apple-darwin8.7.2"
diff --git a/test/CodeGen/X86/vector-variable-idx2.ll b/test/CodeGen/X86/vector-variable-idx2.ll
index d47df90..6e8ae2e 100644
--- a/test/CodeGen/X86/vector-variable-idx2.ll
+++ b/test/CodeGen/X86/vector-variable-idx2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.1
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin11.0.0"
diff --git a/test/CodeGen/X86/vsplit-and.ll b/test/CodeGen/X86/vsplit-and.ll
index 3b7fdff..c16b294 100644
--- a/test/CodeGen/X86/vsplit-and.ll
+++ b/test/CodeGen/X86/vsplit-and.ll
@@ -14,7 +14,7 @@ define void @t0(<2 x i64>* %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind read
 
 define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly {
 ; CHECK: t2
-; CHECK-NOT: pand
+; CHECK: pand
 ; CHECK: ret
   %cmp1 = icmp ne <3 x i64> %src1, zeroinitializer
   %cmp2 = icmp ne <3 x i64> %src2, zeroinitializer
diff --git a/test/CodeGen/X86/weak_def_can_be_hidden.ll b/test/CodeGen/X86/weak_def_can_be_hidden.ll
new file mode 100644
index 0000000..f78f357
--- /dev/null
+++ b/test/CodeGen/X86/weak_def_can_be_hidden.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=x86_64-apple-darwin  -O0 < %s | FileCheck %s
+
+@v1 = linkonce_odr global i32 32
+; CHECK: .globl  _v1
+; CHECK: .weak_def_can_be_hidden _v1
+
+define i32 @f1() {
+  %x = load i32 * @v1
+  ret i32 %x
+}
+
+@v2 = linkonce_odr global i32 32
+; CHECK: .globl  _v2
+; CHECK: .weak_definition _v2
+
+@v3 = linkonce_odr unnamed_addr global i32 32
+; CHECK: .globl  _v3
+; CHECK: .weak_def_can_be_hidden _v3
+
+define i32* @f2() {
+  ret i32* @v2
+}
+
+define i32* @f3() {
+  ret i32* @v3
+}
diff --git a/test/CodeGen/X86/widen_arith-1.ll b/test/CodeGen/X86/widen_arith-1.ll
index 661cde8..6041356 100644
--- a/test/CodeGen/X86/widen_arith-1.ll
+++ b/test/CodeGen/X86/widen_arith-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse42 |  FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse4.2 |  FileCheck %s
 
 define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
 entry:
diff --git a/test/CodeGen/X86/widen_arith-2.ll b/test/CodeGen/X86/widen_arith-2.ll
index d35abc3..1b81e9f 100644
--- a/test/CodeGen/X86/widen_arith-2.ll
+++ b/test/CodeGen/X86/widen_arith-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: padd
 ; CHECK: pand
 
diff --git a/test/CodeGen/X86/widen_arith-3.ll b/test/CodeGen/X86/widen_arith-3.ll
index d86042a..d2b8e6e 100644
--- a/test/CodeGen/X86/widen_arith-3.ll
+++ b/test/CodeGen/X86/widen_arith-3.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse42 -post-RA-scheduler=true | FileCheck %s
-; CHECK: incl
-; CHECK: incl
-; CHECK: incl
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse4.2 -post-RA-scheduler=true | FileCheck %s
+; CHECK: paddd
 
 ; Widen a v3i16 to v8i16 to do a vector add
 
diff --git a/test/CodeGen/X86/widen_arith-4.ll b/test/CodeGen/X86/widen_arith-4.ll
index 63c8d0e..5207e1f 100644
--- a/test/CodeGen/X86/widen_arith-4.ll
+++ b/test/CodeGen/X86/widen_arith-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: psubw
 ; CHECK-NEXT: pmullw
 
diff --git a/test/CodeGen/X86/widen_arith-5.ll b/test/CodeGen/X86/widen_arith-5.ll
index 41df0e4..70b6a8a 100644
--- a/test/CodeGen/X86/widen_arith-5.ll
+++ b/test/CodeGen/X86/widen_arith-5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42  | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.2  | FileCheck %s
 ; CHECK: movdqa
 ; CHECK: pslld $2
 ; CHECK: psubd
diff --git a/test/CodeGen/X86/widen_arith-6.ll b/test/CodeGen/X86/widen_arith-6.ll
index b983d14..329048a 100644
--- a/test/CodeGen/X86/widen_arith-6.ll
+++ b/test/CodeGen/X86/widen_arith-6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: mulps
 ; CHECK: addps
 
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index 56c6364..d115929 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=x86 -mcpu=generic -mattr=+sse42 < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=generic -mattr=+sse4.2 < %s | FileCheck %s
 ; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
-; CHECK: paddd
 ; CHECK: movl
+; CHECK: paddd
 ; CHECK: movlpd
 
 ; Scheduler causes produce a different instruction order
diff --git a/test/CodeGen/X86/widen_cast-2.ll b/test/CodeGen/X86/widen_cast-2.ll
index 3979ce4..40b42fb 100644
--- a/test/CodeGen/X86/widen_cast-2.ll
+++ b/test/CodeGen/X86/widen_cast-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse4.2 | FileCheck %s
 ; CHECK: pextrd
 ; CHECK: pextrd
 ; CHECK: movd
diff --git a/test/CodeGen/X86/widen_cast-3.ll b/test/CodeGen/X86/widen_cast-3.ll
index 87486d9..40a8dc5 100644
--- a/test/CodeGen/X86/widen_cast-3.ll
+++ b/test/CodeGen/X86/widen_cast-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: paddd
 ; CHECK: pextrd
 ; CHECK: pextrd
diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll
index 5ea5426..1bc06a7 100644
--- a/test/CodeGen/X86/widen_cast-4.ll
+++ b/test/CodeGen/X86/widen_cast-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: psraw
 ; CHECK: psraw
 
diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll
index 9086d3a..ccf0bd1 100644
--- a/test/CodeGen/X86/widen_cast-5.ll
+++ b/test/CodeGen/X86/widen_cast-5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: movl
 ; CHECK: movlpd
 
diff --git a/test/CodeGen/X86/widen_cast-6.ll b/test/CodeGen/X86/widen_cast-6.ll
index 3903234..7c06ad8 100644
--- a/test/CodeGen/X86/widen_cast-6.ll
+++ b/test/CodeGen/X86/widen_cast-6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.1 | FileCheck %s
 ; CHECK: movd
 
 ; Test bit convert that requires widening in the operand.
diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index 51f1c88..9f6778c 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: paddq
 
 ; truncate v2i64 to v2i32
diff --git a/test/CodeGen/X86/widen_conv-2.ll b/test/CodeGen/X86/widen_conv-2.ll
index db8fa93..906f7cd 100644
--- a/test/CodeGen/X86/widen_conv-2.ll
+++ b/test/CodeGen/X86/widen_conv-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
-; CHECK: cwtl
-; CHECK: cwtl
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
+; CHECK: {{cwtl|movswl}}
+; CHECK: {{cwtl|movswl}}
 
 ; sign extension v2i32 to v2i16
 
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index a25fae9..a2f3d7b 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 ; CHECK: cvtsi2ss
 
 ; sign to float v2i16 to v2f32
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index 1158e04..f633592 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse4.2 | FileCheck %s
 ; CHECK-NOT: cvtsi2ss
 
 ; unsigned to float v7i16 to v7f32
diff --git a/test/CodeGen/X86/widen_extract-1.ll b/test/CodeGen/X86/widen_extract-1.ll
index c4fe43a..6832de1 100644
--- a/test/CodeGen/X86/widen_extract-1.ll
+++ b/test/CodeGen/X86/widen_extract-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=nehalem -mattr=+sse4.2 | FileCheck %s
 ; widen extract subvector
 
 define void @convert(<2 x double>* %dst.addr, <3 x double> %src)  {
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index f0f94e4..26815a4 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse4.2 | FileCheck %s
 
 ; Test based on pr5626 to load/store
 ;
@@ -73,9 +73,7 @@ define void @add12i32(%i32vec12*  sret %ret, %i32vec12* %ap, %i32vec12* %bp)  {
 ; CHECK: add3i16
 %i16vec3 = type <3 x i16>
 define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
-; CHECK: addl
-; CHECK: addl
-; CHECK: addl
+; CHECK: paddd
 ; CHECK: ret
 	%a = load %i16vec3* %ap, align 16
 	%b = load %i16vec3* %bp, align 16
@@ -135,9 +133,7 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18*
 ; CHECK: add3i8
 %i8vec3 = type <3 x i8>
 define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
-; CHECK: addb
-; CHECK: addb
-; CHECK: addb
+; CHECK: paddd
 ; CHECK: ret
 	%a = load %i8vec3* %ap, align 16
 	%b = load %i8vec3* %bp, align 16
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index c7d2044..803402b 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
 
 ; widening shuffle v3float and then a add
 define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
diff --git a/test/CodeGen/X86/win64_alloca_dynalloca.ll b/test/CodeGen/X86/win64_alloca_dynalloca.ll
index 275ebf9..aff5305 100644
--- a/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -1,10 +1,13 @@
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
+; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-mingw32     | FileCheck %s -check-prefix=M64
+; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32       | FileCheck %s -check-prefix=W64
+; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
 ; PR8777
 ; PR8778
 
-define i64 @foo(i64 %n, i64 %x) nounwind {
+define i64 @unaligned(i64 %n, i64 %x) nounwind {
+; M64-LABEL: unaligned:
+; W64-LABEL: unaligned:
+; EFI-LABEL: unaligned:
 entry:
 
   %buf0 = alloca i8, i64 4096, align 1
@@ -49,18 +52,18 @@ entry:
   %r = call i64 @bar(i64 %n, i64 %x, i64 %n, i8* %buf0, i8* %buf1) nounwind
 
 ; M64: subq  $48, %rsp
-; M64: leaq  -4096(%rbp), %r9
 ; M64: movq  %rax, 32(%rsp)
+; M64: leaq  -4096(%rbp), %r9
 ; M64: callq bar
 
 ; W64: subq  $48, %rsp
-; W64: leaq  -4096(%rbp), %r9
 ; W64: movq  %rax, 32(%rsp)
+; W64: leaq  -4096(%rbp), %r9
 ; W64: callq bar
 
 ; EFI: subq  $48, %rsp
-; EFI: leaq  -[[B0OFS]](%rbp), %r9
 ; EFI: movq  [[R64]], 32(%rsp)
+; EFI: leaq  -[[B0OFS]](%rbp), %r9
 ; EFI: callq _bar
 
   ret i64 %r
@@ -71,4 +74,51 @@ entry:
 
 }
 
+define i64 @aligned(i64 %n, i64 %x) nounwind {
+; M64-LABEL: aligned:
+; W64-LABEL: aligned:
+; EFI-LABEL: aligned:
+entry:
+
+  %buf1 = alloca i8, i64 %n, align 128
+
+; M64: leaq  15(%{{.*}}), %rax
+; M64: andq  $-16, %rax
+; M64: callq ___chkstk
+; M64: movq  %rsp, [[R2:%r.*]]
+; M64: andq  $-128, [[R2]]
+; M64: movq  [[R2]], %rsp
+
+; W64: leaq  15(%{{.*}}), %rax
+; W64: andq  $-16, %rax
+; W64: callq __chkstk
+; W64: subq  %rax, %rsp
+; W64: movq  %rsp, [[R2:%r.*]]
+; W64: andq  $-128, [[R2]]
+; W64: movq  [[R2]], %rsp
+
+; EFI: leaq  15(%{{.*}}), [[R1:%r.*]]
+; EFI: andq  $-16, [[R1]]
+; EFI: movq  %rsp, [[R64:%r.*]]
+; EFI: subq  [[R1]], [[R64]]
+; EFI: andq  $-128, [[R64]]
+; EFI: movq  [[R64]], %rsp
+
+  %r = call i64 @bar(i64 %n, i64 %x, i64 %n, i8* undef, i8* %buf1) nounwind
+
+; M64: subq  $48, %rsp
+; M64: movq  [[R2]], 32(%rsp)
+; M64: callq bar
+
+; W64: subq  $48, %rsp
+; W64: movq  [[R2]], 32(%rsp)
+; W64: callq bar
+
+; EFI: subq  $48, %rsp
+; EFI: movq  [[R64]], 32(%rsp)
+; EFI: callq _bar
+
+  ret i64 %r
+}
+
 declare i64 @bar(i64, i64, i64, i8* nocapture, i8* nocapture) nounwind
diff --git a/test/CodeGen/X86/x86-64-pic-10.ll b/test/CodeGen/X86/x86-64-pic-10.ll
index 3ec172b..da8082b 100644
--- a/test/CodeGen/X86/x86-64-pic-10.ll
+++ b/test/CodeGen/X86/x86-64-pic-10.ll
@@ -9,4 +9,6 @@ entry:
         ret void
 }
 
-declare extern_weak i32 @f()
+define weak i32 @f() {
+  ret i32 42
+}
diff --git a/test/CodeGen/X86/x86-64-psub.ll b/test/CodeGen/X86/x86-64-psub.ll
index be09a4f..183ddf4 100644
--- a/test/CodeGen/X86/x86-64-psub.ll
+++ b/test/CodeGen/X86/x86-64-psub.ll
@@ -4,8 +4,8 @@
 ; This test checks that the operands of packed sub instructions are
 ; never interchanged by the "Two-Address instruction pass".
 
-declare { i64, double } @getFirstParam() 
-declare { i64, double } @getSecondParam() 
+declare { i64, double } @getFirstParam()
+declare { i64, double } @getSecondParam()
 
 define i64 @test_psubb() {
 entry:
@@ -28,9 +28,10 @@ entry:
 
 ; CHECK-LABEL: test_psubb:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubb [[PARAM2]], [[PARAM1]]
 ; CHECK: ret
 
@@ -55,9 +56,10 @@ entry:
 
 ; CHECK-LABEL: test_psubw:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubw [[PARAM2]], [[PARAM1]]
 ; CHECK: ret
 
@@ -83,9 +85,10 @@ entry:
 
 ; CHECK-LABEL: test_psubd:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubd [[PARAM2]], [[PARAM1]]
 ; CHECK: ret
 
@@ -110,9 +113,10 @@ entry:
 
 ; CHECK-LABEL: test_psubsb:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubsb [[PARAM2]], [[PARAM1]]
 ; CHECK: ret
 
@@ -137,9 +141,10 @@ entry:
 
 ; CHECK-LABEL: test_psubswv:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubsw [[PARAM2]], [[PARAM1]]
 ; CHECK: ret
 
@@ -164,9 +169,10 @@ entry:
 
 ; CHECK-LABEL: test_psubusbv:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubusb [[PARAM2]], [[PARAM1]]
 ; CHECK: ret
 
@@ -191,9 +197,10 @@ entry:
 
 ; CHECK-LABEL: test_psubuswv:
 ; CHECK:   callq getFirstParam
+; CHECK:   movq %rax, [[TEMP:%[a-z0-9]+]]
 ; CHECK:   callq getSecondParam
+; CHECK:   movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK:   movq (%rsp), [[PARAM1:%[a-z0-9]+]]
 ; CHECK:   psubusw [[PARAM2]], [[PARAM1]]
 ; CHECK: ret
 
diff --git a/test/CodeGen/X86/x86-64-tls-1.ll b/test/CodeGen/X86/x86-64-tls-1.ll
index 8d3b300..641786f 100644
--- a/test/CodeGen/X86/x86-64-tls-1.ll
+++ b/test/CodeGen/X86/x86-64-tls-1.ll
@@ -1,6 +1,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
 @tm_nest_level = internal thread_local global i32 0
 define i64 @z() nounwind {
-; CHECK: movabsq    $tm_nest_level@TPOFF, %rcx
+; FIXME: The codegen here is primitive at best and could be much better.
+; The add and the moves can be folded together.
+; CHECK-DAG: movq    $tm_nest_level@TPOFF, %rcx
+; CHECK-DAG: movq    %fs:0, %rax
+; CHECK: addl    %ecx, %eax
   ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100)
 }
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
index af57e5c..2f3adb8 100644
--- a/test/CodeGen/X86/x86-shifts.ll
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -6,8 +6,8 @@
 define <4 x i32> @shl4(<4 x i32> %A) nounwind {
 entry:
 ; CHECK:      shl4
-; CHECK:      padd
 ; CHECK:      pslld
+; CHECK:      padd
 ; CHECK:      ret
   %B = shl <4 x i32> %A,  < i32 2, i32 2, i32 2, i32 2>
   %C = shl <4 x i32> %A,  < i32 1, i32 1, i32 1, i32 1>
@@ -67,8 +67,8 @@ entry:
 define <8 x i16> @shl8(<8 x i16> %A) nounwind {
 entry:
 ; CHECK:      shl8
-; CHECK:      padd
 ; CHECK:      psllw
+; CHECK:      padd
 ; CHECK:      ret
   %B = shl <8 x i16> %A,  < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
   %C = shl <8 x i16> %A,  < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll
index b56ce0f..fd8e1b4 100644
--- a/test/CodeGen/X86/xor.ll
+++ b/test/CodeGen/X86/xor.ll
@@ -165,3 +165,19 @@ define <4 x i32> @test10(<4 x i32> %a) nounwind {
 ; X32-LABEL: test10:
 ; X32:    andnps
 }
+
+define i32 @PR17487(i1 %tobool) {
+  %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1
+  %tmp1 = zext <2 x i1> %tmp to <2 x i64>
+  %tmp2 = xor <2 x i64> %tmp1, <i64 1, i64 1>
+  %tmp3 = extractelement <2 x i64> %tmp2, i32 1
+  %add = add nsw i64 0, %tmp3
+  %cmp6 = icmp ne i64 %add, 1
+  %conv7 = zext i1 %cmp6 to i32
+  ret i32 %conv7
+
+; X64-LABEL: PR17487:
+; X64: andn
+; X32-LABEL: PR17487:
+; X32: andn
+}
diff --git a/test/CodeGen/X86/zext-fold.ll b/test/CodeGen/X86/zext-fold.ll
index ff93c68..a10923f 100644
--- a/test/CodeGen/X86/zext-fold.ll
+++ b/test/CodeGen/X86/zext-fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -enable-misched=false | FileCheck %s
 
 ;; Simple case
 define i32 @test1(i8 %x) nounwind readnone {
@@ -10,7 +10,7 @@ define i32 @test1(i8 %x) nounwind readnone {
 ; CHECK: movzbl
 ; CHECK-NEXT: andl {{.*}}224
 
-;; Multiple uses of %x but easily extensible. 
+;; Multiple uses of %x but easily extensible.
 define i32 @test2(i8 %x) nounwind readnone {
   %A = and i8 %x, -32
   %B = zext i8 %A to i32
@@ -21,8 +21,8 @@ define i32 @test2(i8 %x) nounwind readnone {
 }
 ; CHECK: test2
 ; CHECK: movzbl
-; CHECK: orl $63
 ; CHECK: andl $224
+; CHECK: orl $63
 
 declare void @use(i32, i8)
 
diff --git a/test/CodeGen/X86/zext-sext.ll b/test/CodeGen/X86/zext-sext.ll
index 25dabbe..5b2713d 100644
--- a/test/CodeGen/X86/zext-sext.ll
+++ b/test/CodeGen/X86/zext-sext.ll
@@ -34,10 +34,10 @@ entry:
   %tmp12 = add i64 %tmp11, 5089792279245435153
 
 ; CHECK:      addl	$2138875574, %e[[REGISTER_zext:[a-z0-9]+]]
-; CHECK-NEXT: cmpl	$-8608074, %e[[REGISTER_zext]]
-; CHECK:      movslq	%e[[REGISTER_zext]], [[REGISTER_tmp:%r[a-z0-9]+]]
-; CHECK:      movq	[[REGISTER_tmp]], [[REGISTER_sext:%r[a-z0-9]+]]
+; CHECK:      movslq	%e[[REGISTER_zext]], [[REGISTER_sext:%r[a-z0-9]+]]
+; CHECK:      cmpl	$-8608074, %e[[REGISTER_zext]]
 ; CHECK-NOT:  [[REGISTER_zext]]
+; CHECK-DAG:  testl     %e[[REGISTER_zext]]
 ; CHECK:      subq	%r[[REGISTER_zext]], [[REGISTER_sext]]
 
   %tmp13 = sub i64 %tmp12, 2138875574
diff --git a/test/CodeGen/XCore/aliases.ll b/test/CodeGen/XCore/aliases.ll
index d4da63c..b7ad416 100644
--- a/test/CodeGen/XCore/aliases.ll
+++ b/test/CodeGen/XCore/aliases.ll
@@ -1,7 +1,9 @@
 ; RUN: llc < %s -march=xcore | FileCheck %s
-declare void @a_val() nounwind
-@b_val = external constant i32, section ".cp.rodata"
-@c_val = external global i32
+define void @a_val() nounwind {
+  ret void
+}
+@b_val = constant i32 42, section ".cp.rodata"
+@c_val = global i32 42
 
 @a = alias void ()* @a_val
 @b = alias i32* @b_val
diff --git a/test/CodeGen/XCore/alignment.ll b/test/CodeGen/XCore/alignment.ll
new file mode 100644
index 0000000..28bdf3b
--- /dev/null
+++ b/test/CodeGen/XCore/alignment.ll
@@ -0,0 +1,9 @@
+; RUN: not llc < %s -march=xcore 2>&1 | FileCheck %s
+
+; CHECK: emitPrologue unsupported alignment: 8
+define void @f() nounwind {
+entry:
+  %BadAlignment = alloca i64, align 8
+  ret void
+}
+
diff --git a/test/CodeGen/XCore/ashr.ll b/test/CodeGen/XCore/ashr.ll
index 2752f52..78cb144 100644
--- a/test/CodeGen/XCore/ashr.ll
+++ b/test/CodeGen/XCore/ashr.ll
@@ -1,26 +1,26 @@
 ; RUN: llc < %s -march=xcore -asm-verbose=0 | FileCheck %s
-define i32 @ashr(i32 %a, i32 %b) {
+define i32 @ashr(i32 %a, i32 %b) nounwind {
 	%1 = ashr i32 %a, %b
 	ret i32 %1
 }
 ; CHECK-LABEL: ashr:
 ; CHECK-NEXT: ashr r0, r0, r1
 
-define i32 @ashri1(i32 %a) {
+define i32 @ashri1(i32 %a) nounwind {
 	%1 = ashr i32 %a, 24
 	ret i32 %1
 }
 ; CHECK-LABEL: ashri1:
 ; CHECK-NEXT: ashr r0, r0, 24
 
-define i32 @ashri2(i32 %a) {
+define i32 @ashri2(i32 %a) nounwind {
 	%1 = ashr i32 %a, 31
 	ret i32 %1
 }
 ; CHECK-LABEL: ashri2:
 ; CHECK-NEXT: ashr r0, r0, 32
 
-define i32 @f1(i32 %a) {
+define i32 @f1(i32 %a) nounwind nounwind {
         %1 = icmp slt i32 %a, 0
 	br i1 %1, label %less, label %not_less
 less:
@@ -32,7 +32,7 @@ not_less:
 ; CHECK-NEXT: ashr r0, r0, 32
 ; CHECK-NEXT: bt r0
 
-define i32 @f2(i32 %a) {
+define i32 @f2(i32 %a) nounwind {
         %1 = icmp sge i32 %a, 0
 	br i1 %1, label %greater, label %not_greater
 greater:
@@ -44,7 +44,7 @@ not_greater:
 ; CHECK-NEXT: ashr r0, r0, 32
 ; CHECK-NEXT: bt r0
 
-define i32 @f3(i32 %a) {
+define i32 @f3(i32 %a) nounwind {
         %1 = icmp slt i32 %a, 0
 	%2 = select i1 %1, i32 10, i32 17
 	ret i32 %2
@@ -55,7 +55,7 @@ define i32 @f3(i32 %a) {
 ; CHECK-NEXT: ldc r0, 17
 ; CHECK: ldc r0, 10
 
-define i32 @f4(i32 %a) {
+define i32 @f4(i32 %a) nounwind {
         %1 = icmp sge i32 %a, 0
 	%2 = select i1 %1, i32 10, i32 17
 	ret i32 %2
@@ -66,7 +66,7 @@ define i32 @f4(i32 %a) {
 ; CHECK-NEXT: ldc r0, 10
 ; CHECK: ldc r0, 17
 
-define i32 @f5(i32 %a) {
+define i32 @f5(i32 %a) nounwind {
         %1 = icmp sge i32 %a, 0
 	%2 = zext i1 %1 to i32
 	ret i32 %2
diff --git a/test/CodeGen/XCore/atomic.ll b/test/CodeGen/XCore/atomic.ll
new file mode 100644
index 0000000..95fca9a
--- /dev/null
+++ b/test/CodeGen/XCore/atomic.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+; CHECK-LABEL: atomic_fence
+; CHECK: #MEMBARRIER
+; CHECK: #MEMBARRIER
+; CHECK: #MEMBARRIER
+; CHECK: #MEMBARRIER
+; CHECK: retsp 0
+define void @atomic_fence() nounwind {
+entry:
+  fence acquire
+  fence release
+  fence acq_rel
+  fence seq_cst
+  ret void
+}
diff --git a/test/CodeGen/XCore/byVal.ll b/test/CodeGen/XCore/byVal.ll
index a5d25d2..e9612fd 100644
--- a/test/CodeGen/XCore/byVal.ll
+++ b/test/CodeGen/XCore/byVal.ll
@@ -56,3 +56,18 @@ entry:
   call void @f2(i32 %i, %struct.st2* %s2)
   ret void
 }
+
+; CHECK-LABEL: f3Test
+; CHECK: entsp 2
+; CHECK: ldc r1, 0
+; CHECK: ld8u r2, r0[r1]
+; CHECK: ldaw r0, sp[1]
+; CHECK: st8 r2, r0[r1]
+; CHECK: bl f
+; CHECK: retsp 2
+declare void @f3(i8*) nounwind
+define void @f3Test(i8* byval %v) nounwind {
+entry:
+  call void @f3(i8* %v) nounwind
+  ret void
+}
diff --git a/test/CodeGen/XCore/exception.ll b/test/CodeGen/XCore/exception.ll
new file mode 100644
index 0000000..8018cdc
--- /dev/null
+++ b/test/CodeGen/XCore/exception.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+declare void @g()
+declare i32 @__gxx_personality_v0(...)
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+declare i8* @__cxa_begin_catch(i8*)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_allocate_exception(i32)
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+@_ZTIi = external constant i8*
+@_ZTId = external constant i8*
+
+; CHECK-LABEL: fn_typeid:
+; CHECK: .cfi_startproc
+; CHECK: mkmsk r0, 1
+; CHECK: retsp 0
+; CHECK: .cfi_endproc
+define i32 @fn_typeid() {
+entry:
+  %0 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+  ret i32 %0
+}
+
+; CHECK-LABEL: fn_throw
+; CHECK: .cfi_startproc
+; CHECK: entsp 1
+; CHECK: .cfi_def_cfa_offset 4
+; CHECK: .cfi_offset 15, 0
+; CHECK: ldc r0, 4
+; CHECK: bl __cxa_allocate_exception
+; CHECK: ldaw r11, cp[_ZTIi]
+; CHECK: ldc r2, 0
+; CHECK: mov r1, r11
+; CHECK: bl __cxa_throw
+define void @fn_throw() {
+entry:
+  %0 = call i8* @__cxa_allocate_exception(i32 4) nounwind
+  call void @__cxa_throw(i8* %0, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn
+  unreachable
+}
+
+; CHECK-LABEL: fn_catch
+; CHECK: .cfi_startproc
+; CHECK: .cfi_personality 0, __gxx_personality_v0
+; CHECK: [[START:.L[a-zA-Z0-9_]+]]
+; CHECK: .cfi_lsda 0, [[LSDA:.L[a-zA-Z0-9_]+]]
+; CHECK: entsp 4
+; CHECK: .cfi_def_cfa_offset 16
+; CHECK: .cfi_offset 15, 0
+define void @fn_catch() {
+entry:
+
+; N.B. we alloc no variables, hence force compiler to spill
+; CHECK: stw r4, sp[3]
+; CHECK: .cfi_offset 4, -4
+; CHECK: stw r5, sp[2]
+; CHECK: .cfi_offset 5, -8
+; CHECK: stw r6, sp[1]
+; CHECK: .cfi_offset 6, -12
+; CHECK: [[PRE_G:.L[a-zA-Z0-9_]+]]
+; CHECK: bl g
+; CHECK: [[POST_G:.L[a-zA-Z0-9_]+]]
+; CHECK: [[RETURN:.L[a-zA-Z0-9_]+]]
+; CHECK: ldw r6, sp[1]
+; CHECK: ldw r5, sp[2]
+; CHECK: ldw r4, sp[3]
+; CHECK: retsp 4
+  invoke void @g() to label %cont unwind label %lpad
+cont:
+  ret void
+
+; CHECK: {{.L[a-zA-Z0-9_]+}}
+; CHECK: [[LANDING:.L[a-zA-Z0-9_]+]]
+; CHECK: mov r5, r1
+; CHECK: mov r4, r0
+; CHECK: bl __cxa_begin_catch
+; CHECK: ldw r6, r0[0]
+; CHECK: bl __cxa_end_catch
+lpad:
+  %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+          catch i8* bitcast (i8** @_ZTId to i8*)
+  %1 = extractvalue { i8*, i32 } %0, 0
+  %2 = extractvalue { i8*, i32 } %0, 1
+  %3 = call i8* @__cxa_begin_catch(i8* %1) nounwind
+  %4 = bitcast i8* %3 to i32*
+  %5 = load i32* %4
+  call void @__cxa_end_catch() nounwind
+
+; CHECK: eq r0, r6, r5
+; CHECK: bf r0, [[RETURN]]
+; CHECK: mov r0, r4
+; CHECK: bl _Unwind_Resume
+; CHECK: .cfi_endproc
+; CHECK: [[END:.L[a-zA-Z0-9_]+]]
+  %6 = icmp eq i32 %5, %2
+  br i1 %6, label %Resume, label %Exit
+Resume:
+  resume { i8*, i32 } %0
+Exit:
+  ret void
+}
+
+; CHECK: [[LSDA]]:
+; CHECK: .byte  255
+; CHECK: .byte  0
+; CHECK: .asciiz
+; CHECK: .byte  3
+; CHECK: .byte  26
+; CHECK: [[SET0:.L[a-zA-Z0-9_]+]] = [[PRE_G]]-[[START]]
+; CHECK: .long [[SET0]]
+; CHECK: [[SET1:.L[a-zA-Z0-9_]+]] = [[POST_G]]-[[PRE_G]]
+; CHECK: .long [[SET1]]
+; CHECK: [[SET2:.L[a-zA-Z0-9_]+]] = [[LANDING]]-[[START]]
+; CHECK: .long [[SET2]]
+; CHECK: .byte 3
+; CHECK: [[SET3:.L[a-zA-Z0-9_]+]] = [[POST_G]]-[[START]]
+; CHECK: .long [[SET3]]
+; CHECK: [[SET4:.L[a-zA-Z0-9_]+]] = [[END]]-[[POST_G]]
+; CHECK: .long [[SET4]]
+; CHECK: .long 0
+; CHECK: .byte 0
+; CHECK: .byte 1
+; CHECK: .byte 0
+; CHECK: .byte 2
+; CHECK: .byte 125
+; CHECK: .long _ZTIi
+; CHECK: .long _ZTId
diff --git a/test/CodeGen/XCore/globals.ll b/test/CodeGen/XCore/globals.ll
index b140587..b3a872b 100644
--- a/test/CodeGen/XCore/globals.ll
+++ b/test/CodeGen/XCore/globals.ll
@@ -93,4 +93,4 @@ entry:
 
 @array = global [10 x i16] zeroinitializer, align 2
 ; CHECK: .globl  array.globound
-; CHECK: .set  array.globound,10
+; CHECK:  array.globound = 10
diff --git a/test/CodeGen/XCore/linkage.ll b/test/CodeGen/XCore/linkage.ll
new file mode 100644
index 0000000..7a1179b
--- /dev/null
+++ b/test/CodeGen/XCore/linkage.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+; CHECK: .weak fd
+define weak void @fd() {
+  call void @fr(i32* @gd, i32* @gr)
+  ret void
+}
+
+; CHECK-NOT: .hidden test_hidden
+declare hidden void @test_hidden_declaration()
+define hidden void @test_hidden() {
+  call void @test_hidden_declaration()
+  unreachable
+}
+
+; CHECK-NOT: .protected
+define protected void @test_protected() {
+  unreachable
+}
+
+; CHECK: .globl array.globound
+; CHECK: array.globound = 2
+; CHECK: .weak array.globound
+; CHECK: .globl array
+; CHECK: .weak array
+@array = weak global [2 x i32] zeroinitializer
+
+; CHECK: .weak gd
+@gd = weak global i32 0
+
+; CHECK-NOT: .hidden test_hidden_declaration
+
+; CHECK: .weak gr
+@gr = extern_weak global i32
+
+; CHECK: .weak fr
+declare extern_weak void @fr(i32*, i32*)
+
diff --git a/test/CodeGen/XCore/lit.local.cfg b/test/CodeGen/XCore/lit.local.cfg
index 8756f37..3e84c1b 100644
--- a/test/CodeGen/XCore/lit.local.cfg
+++ b/test/CodeGen/XCore/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
 targets = set(config.root.targets_to_build.split())
 if not 'XCore' in targets:
     config.unsupported = True
diff --git a/test/CodeGen/XCore/shedulingPreference.ll b/test/CodeGen/XCore/shedulingPreference.ll
new file mode 100644
index 0000000..6c2ac6d
--- /dev/null
+++ b/test/CodeGen/XCore/shedulingPreference.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=xcore
+
+define void @f( ) {
+entry:
+
+  switch i32 undef, label %default [
+    i32 0, label %start
+  ]
+
+start:
+  br label %end
+
+default:
+  %arg = fadd double undef, undef
+  %res = call double @f2(i32 undef, double %arg, double undef)
+  br label %end
+
+end:
+  %unused = phi double [ %res, %default ], [ undef, %start ]
+
+  unreachable
+}
+
+declare double @f2(i32, double, double)
+
diff --git a/test/CodeGen/XCore/threads.ll b/test/CodeGen/XCore/threads.ll
index 5840e77..c50da1d 100644
--- a/test/CodeGen/XCore/threads.ll
+++ b/test/CodeGen/XCore/threads.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=xcore < %s | FileCheck %s
+; RUN: llc -march=xcore -O=0 < %s | FileCheck %s -check-prefix=PHINODE
 
 declare i8 addrspace(1)* @llvm.xcore.getst.p1i8.p1i8(i8 addrspace(1)* %r)
 declare void @llvm.xcore.msync.p1i8(i8 addrspace(1)* %r)
@@ -13,55 +14,132 @@ declare void @llvm.xcore.initdp.p1i8(i8 addrspace(1)* %r, i8* %value)
 define i8 addrspace(1)* @test_getst(i8 addrspace(1)* %r) {
 ; CHECK-LABEL: test_getst:
 ; CHECK: getst r0, res[r0]
-        %result = call i8 addrspace(1)* @llvm.xcore.getst.p1i8.p1i8(i8 addrspace(1)* %r)
-        ret i8 addrspace(1)* %result
+  %result = call i8 addrspace(1)* @llvm.xcore.getst.p1i8.p1i8(i8 addrspace(1)* %r)
+  ret i8 addrspace(1)* %result
 }
 
 define void @test_ssync() {
 ; CHECK-LABEL: test_ssync:
 ; CHECK: ssync
-	call void @llvm.xcore.ssync()
-	ret void
+  call void @llvm.xcore.ssync()
+  ret void
 }
 
 define void @test_mjoin(i8 addrspace(1)* %r) {
 ; CHECK-LABEL: test_mjoin:
 ; CHECK: mjoin res[r0]
-	call void @llvm.xcore.mjoin.p1i8(i8 addrspace(1)* %r)
-	ret void
+  call void @llvm.xcore.mjoin.p1i8(i8 addrspace(1)* %r)
+  ret void
 }
 
 define void @test_initsp(i8 addrspace(1)* %t, i8* %src) {
 ; CHECK-LABEL: test_initsp:
 ; CHECK: init t[r0]:sp, r1
-        call void @llvm.xcore.initsp.p1i8(i8 addrspace(1)* %t, i8* %src)
-        ret void
+  call void @llvm.xcore.initsp.p1i8(i8 addrspace(1)* %t, i8* %src)
+  ret void
 }
 
 define void @test_initpc(i8 addrspace(1)* %t, i8* %src) {
 ; CHECK-LABEL: test_initpc:
 ; CHECK: init t[r0]:pc, r1
-        call void @llvm.xcore.initpc.p1i8(i8 addrspace(1)* %t, i8* %src)
-        ret void
+  call void @llvm.xcore.initpc.p1i8(i8 addrspace(1)* %t, i8* %src)
+  ret void
 }
 
 define void @test_initlr(i8 addrspace(1)* %t, i8* %src) {
 ; CHECK-LABEL: test_initlr:
 ; CHECK: init t[r0]:lr, r1
-        call void @llvm.xcore.initlr.p1i8(i8 addrspace(1)* %t, i8* %src)
-        ret void
+  call void @llvm.xcore.initlr.p1i8(i8 addrspace(1)* %t, i8* %src)
+  ret void
 }
 
 define void @test_initcp(i8 addrspace(1)* %t, i8* %src) {
 ; CHECK-LABEL: test_initcp:
 ; CHECK: init t[r0]:cp, r1
-        call void @llvm.xcore.initcp.p1i8(i8 addrspace(1)* %t, i8* %src)
-        ret void
+  call void @llvm.xcore.initcp.p1i8(i8 addrspace(1)* %t, i8* %src)
+  ret void
 }
 
 define void @test_initdp(i8 addrspace(1)* %t, i8* %src) {
 ; CHECK-LABEL: test_initdp:
 ; CHECK: init t[r0]:dp, r1
-        call void @llvm.xcore.initdp.p1i8(i8 addrspace(1)* %t, i8* %src)
-        ret void
+  call void @llvm.xcore.initdp.p1i8(i8 addrspace(1)* %t, i8* %src)
+  ret void
 }
+
+@tl = thread_local global [3 x i32] zeroinitializer
+@tle = external thread_local global [2 x i32]
+
+define i32* @f_tl() {
+; CHECK-LABEL: f_tl:
+; CHECK: get r11, id
+; CHECK: ldaw [[R0:r[0-9]]], dp[tl]
+; CHECK: ldc [[R1:r[0-9]]], 8
+; CHECK: ldc [[R2:r[0-9]]], 12
+; r0 = id*12 + 8 + &tl
+; CHECK: lmul {{r[0-9]}}, r0, r11, [[R2]], [[R0]], [[R1]]
+  ret i32* getelementptr inbounds ([3 x i32]* @tl, i32 0, i32 2)
+}
+
+define i32* @f_tle() {
+; CHECK-LABEL: f_tle:
+; CHECK: get r11, id
+; CHECK: shl [[R0:r[0-9]]], r11, 3
+; CHECK: ldaw [[R1:r[0-9]]], dp[tle]
+; r0 = &tl + id*8
+; CHECK: add r0, [[R1]], [[R0]]
+  ret i32* getelementptr inbounds ([2 x i32]* @tle, i32 0, i32 0)
+}
+
+define i32 @f_tlExpr () {
+; CHECK-LABEL: f_tlExpr:
+; CHECK: get r11, id
+; CHECK: shl [[R0:r[0-9]]], r11, 3
+; CHECK: ldaw [[R1:r[0-9]]], dp[tle]
+; CHECK: add [[R2:r[0-9]]], [[R1]], [[R0]]
+; CHECK: add r0, [[R2]], [[R2]]
+  ret i32 add(
+      i32 ptrtoint( i32* getelementptr inbounds ([2 x i32]* @tle, i32 0, i32 0) to i32),
+      i32 ptrtoint( i32* getelementptr inbounds ([2 x i32]* @tle, i32 0, i32 0) to i32))
+}
+
+define void @phiNode1() {
+; N.B. lowering of duplicate constexpr in a PHI node requires -O=0
+; PHINODE-LABEL: phiNode1:
+; PHINODE: get r11, id
+; PHINODE-LABEL: .LBB11_1:
+; PHINODE: get r11, id
+; PHINODE: bu .LBB11_1
+entry:
+  br label %ConstantExpPhiNode
+ConstantExpPhiNode:
+  %ptr = phi i32* [ getelementptr inbounds ([3 x i32]* @tl, i32 0, i32 0), %entry ],
+                  [ getelementptr inbounds ([3 x i32]* @tl, i32 0, i32 0), %ConstantExpPhiNode ]
+  br label %ConstantExpPhiNode
+exit:
+  ret void
+}
+
+define void @phiNode2( i1 %bool) {
+; N.B. check an extra 'Node_crit_edge' (LBB12_1) is inserted
+; PHINODE-LABEL: phiNode2:
+; PHINODE: bf {{r[0-9]}}, .LBB12_3
+; PHINODE: bu .LBB12_1
+; PHINODE-LABEL: .LBB12_1:
+; PHINODE: get r11, id
+; PHINODE-LABEL: .LBB12_2:
+; PHINODE: get r11, id
+; PHINODE: bu .LBB12_2
+; PHINODE-LABEL: .LBB12_3:
+entry:
+  br i1 %bool, label %ConstantExpPhiNode, label %exit
+ConstantExpPhiNode:
+  %ptr = phi i32* [ getelementptr inbounds ([3 x i32]* @tl, i32 0, i32 0), %entry ],
+                  [ getelementptr inbounds ([3 x i32]* @tl, i32 0, i32 0), %ConstantExpPhiNode ]
+  br label %ConstantExpPhiNode
+exit:
+  ret void
+}
+
+; CHECK-LABEL: tl:
+; CHECK: .space  96
diff --git a/test/CodeGen/XCore/zextfree.ll b/test/CodeGen/XCore/zextfree.ll
new file mode 100644
index 0000000..48dce88
--- /dev/null
+++ b/test/CodeGen/XCore/zextfree.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=xcore < %s | FileCheck %s
+
+; CHECK-LABEL: test:
+; CHECK-NOT: zext
+define void @test(i8* %s1) {
+entry:
+  %u8 = load i8* %s1, align 1
+  %bool = icmp eq i8 %u8, 0
+  br label %BB1
+BB1:
+  br i1 %bool, label %BB1, label %BB2
+BB2:
+  br i1 %bool, label %BB1, label %BB2
+}
+
diff --git a/test/DebugInfo/2009-11-03-InsertExtractValue.ll b/test/DebugInfo/2009-11-03-InsertExtractValue.ll
index 9a3e622..21a60b8 100644
--- a/test/DebugInfo/2009-11-03-InsertExtractValue.ll
+++ b/test/DebugInfo/2009-11-03-InsertExtractValue.ll
@@ -2,6 +2,7 @@
 
 !llvm.dbg.sp = !{!0}
 !llvm.dbg.cu = !{!5}
+!llvm.module.flags = !{!6}
 
 !0 = metadata !{i32 786478, metadata !4, metadata !1, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 3, metadata !2, i1 false, i1 false, i32 0, i32 0, null, i32 258, i1 false, null, null, i32 0, metadata !1, i32 3} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 41, metadata !4} ; [ DW_TAG_file_type ]
@@ -19,3 +20,4 @@ define <{i32, i32}> @f1() {
 }
 
 ; CHECK: [protected]
+!6 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll b/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
index 1ca88ae..6fd7887 100644
--- a/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
+++ b/test/DebugInfo/2009-11-05-DeadGlobalVariable.ll
@@ -8,13 +8,14 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!18}
 
 !0 = metadata !{i32 720913, metadata !17, i32 12, metadata !"clang version 3.0 (trunk 139632)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !12, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !17, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 true, i32 ()* @foo, null, null, metadata !10, i32 0} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 720942, metadata !17, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @foo, null, null, metadata !10, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
 !6 = metadata !{i32 720937, metadata !17} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{metadata !11}
@@ -24,3 +25,4 @@ entry:
 !15 = metadata !{i32 3, i32 3, metadata !16, null}
 !16 = metadata !{i32 720907, metadata !17, metadata !5, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
 !17 = metadata !{metadata !"fb.c", metadata !"/private/tmp"}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll b/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll
index aec299b..5a10459 100644
--- a/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll
+++ b/test/DebugInfo/2009-11-06-NamelessGlobalVariable.ll
@@ -2,6 +2,7 @@
 @0 = internal constant i32 1
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9}
 
 !0 = metadata !{i32 720913, metadata !8, i32 12, metadata !"clang version 3.0 (trunk 139632)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !2 = metadata !{i32 0}
@@ -10,3 +11,4 @@
 !6 = metadata !{i32 720937, metadata !8} ; [ DW_TAG_file_type ]
 !7 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !"g.c", metadata !"/private/tmp"}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2009-11-10-CurrentFn.ll b/test/DebugInfo/2009-11-10-CurrentFn.ll
index 10f2e88..d154c43 100644
--- a/test/DebugInfo/2009-11-10-CurrentFn.ll
+++ b/test/DebugInfo/2009-11-10-CurrentFn.ll
@@ -11,13 +11,14 @@ declare void @foo(...)
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!18}
 
 !0 = metadata !{i32 720913, metadata !17, i32 12, metadata !"clang version 3.0 (trunk 139632)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !17, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void (i32)* @bar, null, null, metadata !9, metadata !""} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 720942, metadata !17, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32)* @bar, null, null, metadata !9, metadata !""} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [bar]
 !6 = metadata !{i32 720937, metadata !17} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{metadata !11}
 !11 = metadata !{i32 721153, metadata !17, metadata !5, metadata !"i", i32 16777219, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
@@ -27,3 +28,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !15 = metadata !{i32 720907, metadata !17, metadata !5, i32 3, i32 17, i32 0} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{i32 5, i32 1, metadata !15, null}
 !17 = metadata !{metadata !"cf.c", metadata !"/private/tmp"}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-01-05-DbgScope.ll b/test/DebugInfo/2010-01-05-DbgScope.ll
index e421c93..809cebf 100644
--- a/test/DebugInfo/2010-01-05-DbgScope.ll
+++ b/test/DebugInfo/2010-01-05-DbgScope.ll
@@ -9,15 +9,17 @@ entry:
 }
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!14}
 
 !0 = metadata !{i32 571, i32 3, metadata !1, null}
 !1 = metadata !{i32 458763, metadata !11, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
 !2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"foo", metadata !"foo", metadata !"foo", i32 561, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0}; [DW_TAG_subprogram ]
 !3 = metadata !{i32 458769, metadata !11, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, metadata !12, metadata !12, metadata !13, null, null, metadata !""}; [DW_TAG_compile_unit ]
-!4 = metadata !{i32 458773, null, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0}; [DW_TAG_subroutine_type ]
+!4 = metadata !{i32 458773, null, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6}
 !6 = metadata !{i32 458788, null, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 588, i32 1, metadata !2, null}
 !11 = metadata !{metadata !"hashtab.c", metadata !"/usr/src/gnu/usr.bin/cc/cc_tools/../../../../contrib/gcclibs/libiberty"}
 !12 = metadata !{i32 0}
 !13 = metadata !{metadata !2}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-01-19-DbgScope.ll b/test/DebugInfo/2010-01-19-DbgScope.ll
index 6aedfc8..1a7e378 100644
--- a/test/DebugInfo/2010-01-19-DbgScope.ll
+++ b/test/DebugInfo/2010-01-19-DbgScope.ll
@@ -15,12 +15,13 @@ bb11:                                             ; preds = %entry
 }
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!15}
 
 !0 = metadata !{i32 8647, i32 0, metadata !1, null}
 !1 = metadata !{i32 458763, metadata !12, metadata !2, i32 0, i32 0, i32 0}          ; [ DW_TAG_lexical_block ]
 !2 = metadata !{i32 458798, null, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 8639, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !3 = metadata !{i32 458769, metadata !12, i32 1, metadata !"LLVM build 00", i1 true, metadata !"", i32 0, metadata !13, metadata !13, metadata !14, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 458773, null, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 458773, null, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6}
 !6 = metadata !{i32 458788, null, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 8648, i32 0, metadata !8, null}
@@ -31,3 +32,4 @@ bb11:                                             ; preds = %entry
 !12 = metadata !{metadata !"c-parser.c", metadata !"llvmgcc"}
 !13 = metadata !{i32 0}
 !14 = metadata !{metadata !2}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-03-12-llc-crash.ll b/test/DebugInfo/2010-03-12-llc-crash.ll
index 114c870..241bb37 100644
--- a/test/DebugInfo/2010-03-12-llc-crash.ll
+++ b/test/DebugInfo/2010-03-12-llc-crash.ll
@@ -13,7 +13,7 @@ entry:
 !1 = metadata !{i32 524334, metadata !8, metadata !3, metadata !"foo", metadata !"foo", metadata !"foo", i32 892, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 524329, metadata !8} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 524305, metadata !9, i32 4, metadata !"clang 1.1", i1 true, metadata !"", i32 0, metadata !10, metadata !10, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !9, metadata !5, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 524309, metadata !9, metadata !5, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{i32 524329, metadata !9} ; [ DW_TAG_file_type ]
 !6 = metadata !{null}
 !7 = metadata !{i32 524324, metadata !9, metadata !5, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
diff --git a/test/DebugInfo/2010-03-19-DbgDeclare.ll b/test/DebugInfo/2010-03-19-DbgDeclare.ll
index 9f52d11..d1afade 100644
--- a/test/DebugInfo/2010-03-19-DbgDeclare.ll
+++ b/test/DebugInfo/2010-03-19-DbgDeclare.ll
@@ -8,6 +8,7 @@ entry:
   ret void
 }
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!5}
 !2 = metadata !{i32 786449, metadata !4, i32 32769, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !3, metadata !3, metadata !3, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp] [lang 0x8001]
 !3 = metadata !{}
 !0 = metadata !{i32 662302, i32 26, metadata !1, null}
@@ -15,3 +16,4 @@ entry:
 !4 = metadata !{metadata !"scratch.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
 
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-03-24-MemberFn.ll b/test/DebugInfo/2010-03-24-MemberFn.ll
index dfdf87f..1689fe6 100644
--- a/test/DebugInfo/2010-03-24-MemberFn.ll
+++ b/test/DebugInfo/2010-03-24-MemberFn.ll
@@ -37,6 +37,7 @@ return:                                           ; preds = %entry
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!5}
+!llvm.module.flags = !{!28}
 
 !0 = metadata !{i32 786688, metadata !1, metadata !"s1", metadata !4, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
 !1 = metadata !{i32 786443, metadata !25, metadata !2, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
@@ -44,14 +45,14 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !3 = metadata !{i32 786478, metadata !25, metadata !4, metadata !"bar", metadata !"bar", metadata !"_Z3barv", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @_Z3barv, null, null, null, i32 3} ; [ DW_TAG_subprogram ]
 !4 = metadata !{i32 786473, metadata !25} ; [ DW_TAG_file_type ]
 !5 = metadata !{i32 786449, metadata !25, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !27, metadata !27, metadata !24, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 786453, metadata !25, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{i32 786453, metadata !25, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
 !8 = metadata !{i32 786468, metadata !25, metadata !4, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786451, metadata !26, metadata !4, metadata !"S", i32 2, i64 8, i64 8, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_structure_type ]
+!9 = metadata !{i32 786451, metadata !26, metadata !4, metadata !"S", i32 2, i64 8, i64 8, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 2, size 8, align 8, offset 0] [def] [from ]
 !10 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
 !11 = metadata !{metadata !12}
 !12 = metadata !{i32 786478, metadata !26, metadata !9, metadata !"foo", metadata !"foo", metadata !"_ZN1S3fooEv", i32 3, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 (%struct.S*)* @_ZN1S3fooEv, null, null, null, i32 3} ; [ DW_TAG_subprogram ]
-!13 = metadata !{i32 786453, metadata !25, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!13 = metadata !{i32 786453, metadata !25, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{metadata !8, metadata !15}
 !15 = metadata !{i32 786447, metadata !25, metadata !4, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !9} ; [ DW_TAG_pointer_type ]
 !16 = metadata !{i32 3, i32 0, metadata !1, null}
@@ -66,3 +67,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !25 = metadata !{metadata !"one.cc", metadata !"/tmp/"}
 !26 = metadata !{metadata !"one.h", metadata !"/tmp/"}
 !27 = metadata !{i32 0}
+!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll b/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll
index 7958f49..81285a9 100644
--- a/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll
+++ b/test/DebugInfo/2010-03-30-InvalidDbgInfoCrash.ll
@@ -9,6 +9,7 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!5}
+!llvm.module.flags = !{!22}
 
 !0 = metadata !{{ [0 x i8] }** undef}
 !1 = metadata !{i32 524544, metadata !2, metadata !"x", metadata !4, i32 11, metadata !9} ; [ DW_TAG_auto_variable ]
@@ -16,15 +17,15 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !3 = metadata !{i32 524334, metadata !20, null, metadata !"baz", metadata !"baz", metadata !"baz", i32 8, metadata !6, i1 true, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !4 = metadata !{i32 524329, metadata !20} ; [ DW_TAG_file_type ]
 !5 = metadata !{i32 524305, metadata !20, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 524309, metadata !20, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{i32 524309, metadata !20, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
 !8 = metadata !{i32 524324, metadata !20, metadata !4, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !9 = metadata !{i32 524303, metadata !20, metadata !4, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 524307, metadata !20, metadata !3, metadata !"", i32 11, i64 8, i64 8, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_structure_type ]
+!10 = metadata !{i32 524307, metadata !20, metadata !3, metadata !"", i32 11, i64 8, i64 8, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 11, size 8, align 8, offset 0] [def] [from ]
 !11 = metadata !{metadata !12}
 !12 = metadata !{i32 524301, metadata !20, metadata !10, metadata !"b", i32 11, i64 8, i64 8, i64 0, i32 0, metadata !13} ; [ DW_TAG_member ]
 !13 = metadata !{i32 524310, metadata !20, metadata !3, metadata !"A", i32 11, i64 0, i64 0, i64 0, i32 0, metadata !14} ; [ DW_TAG_typedef ]
-!14 = metadata !{i32 524289, metadata !20, metadata !4, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !15, metadata !16, i32 0, null} ; [ DW_TAG_array_type ]
+!14 = metadata !{i32 524289, metadata !20, metadata !4, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !15, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
 !15 = metadata !{i32 524324, metadata !20, metadata !4, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !16 = metadata !{metadata !17}
 !17 = metadata !{i32 524321, i64 0, i64 1}        ; [ DW_TAG_subrange_type ]
@@ -32,3 +33,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !19 = metadata !{metadata !"llvm.mdnode.fwdref.23"}
 !20 = metadata !{metadata !"2007-12-VarArrayDebug.c", metadata !"/Users/sabre/llvm/test/FrontendC/"}
 !21 = metadata !{i32 0}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll b/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll
index aea98fd..4d4d616 100644
--- a/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll
+++ b/test/DebugInfo/2010-04-06-NestedFnDbgInfo.ll
@@ -51,6 +51,7 @@ entry:
 }
 
 !llvm.dbg.cu = !{!4}
+!llvm.module.flags = !{!40}
 !37 = metadata !{metadata !2, metadata !10, metadata !23}
 
 !0 = metadata !{i32 786688, metadata !1, metadata !"b", metadata !3, i32 16, metadata !8, i32 0, null} ; [ DW_TAG_auto_variable ]
@@ -58,13 +59,13 @@ entry:
 !2 = metadata !{i32 786478, metadata !38, metadata !3, metadata !"main", metadata !"main", metadata !"main", i32 15, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, i32 ()* @main, null, null, null, i32 15} ; [ DW_TAG_subprogram ]
 !3 = metadata !{i32 786473, metadata !38} ; [ DW_TAG_file_type ]
 !4 = metadata !{i32 786449, metadata !38, i32 4, metadata !"clang 1.5", i1 false, metadata !"", i32 0, metadata !39, metadata !39, metadata !37, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 786453, metadata !38, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{i32 786453, metadata !38, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7}
 !7 = metadata !{i32 786468, metadata !38, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786434, metadata !38, metadata !3, metadata !"B", i32 2, i64 8, i64 8, i64 0, i32 0, null, metadata !9, i32 0, null} ; [ DW_TAG_class_type ]
+!8 = metadata !{i32 786434, metadata !38, metadata !3, metadata !"B", i32 2, i64 8, i64 8, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_class_type ] [B] [line 2, size 8, align 8, offset 0] [def] [from ]
 !9 = metadata !{metadata !10}
 !10 = metadata !{i32 786478, metadata !38, metadata !8, metadata !"fn", metadata !"fn", metadata !"_ZN1B2fnEv", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, i32 (%class.A*)* @_ZN1B2fnEv, null, null, null, i32 4} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !38, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!11 = metadata !{i32 786453, metadata !38, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !7, metadata !13}
 !13 = metadata !{i32 786447, metadata !38, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !8} ; [ DW_TAG_pointer_type ]
 !14 = metadata !{i32 16, i32 5, metadata !1, null}
@@ -74,10 +75,10 @@ entry:
 !18 = metadata !{i32 4, i32 7, metadata !10, null}
 !19 = metadata !{i32 786688, metadata !20, metadata !"a", metadata !3, i32 9, metadata !21, i32 0, null} ; [ DW_TAG_auto_variable ]
 !20 = metadata !{i32 786443, metadata !38, metadata !10, i32 4, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
-!21 = metadata !{i32 786434, metadata !38, metadata !10, metadata !"A", i32 5, i64 8, i64 8, i64 0, i32 0, null, metadata !22, i32 0, null} ; [ DW_TAG_class_type ]
+!21 = metadata !{i32 786434, metadata !38, metadata !10, metadata !"A", i32 5, i64 8, i64 8, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 5, size 8, align 8, offset 0] [def] [from ]
 !22 = metadata !{metadata !23}
 !23 = metadata !{i32 786478, metadata !38, metadata !21, metadata !"foo", metadata !"foo", metadata !"_ZZN1B2fnEvEN1A3fooEv", i32 7, metadata !24, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, i32 (%class.A*)* @_ZZN1B2fnEvEN1A3fooEv, null, null, null, i32 7} ; [ DW_TAG_subprogram ]
-!24 = metadata !{i32 786453, metadata !38, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!24 = metadata !{i32 786453, metadata !38, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{metadata !7, metadata !26}
 !26 = metadata !{i32 786447, metadata !38, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !21} ; [ DW_TAG_pointer_type ]
 !27 = metadata !{i32 9, i32 7, metadata !20, null}
@@ -92,3 +93,4 @@ entry:
 !36 = metadata !{i32 786443, metadata !38, metadata !23, i32 7, i32 17, i32 0} ; [ DW_TAG_lexical_block ]
 !38 = metadata !{metadata !"one.cc", metadata !"/tmp" }
 !39 = metadata !{i32 0}
+!40 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-04-19-FramePtr.ll b/test/DebugInfo/2010-04-19-FramePtr.ll
index 30aad38..4af2fdc 100644
--- a/test/DebugInfo/2010-04-19-FramePtr.ll
+++ b/test/DebugInfo/2010-04-19-FramePtr.ll
@@ -20,16 +20,18 @@ return:                                           ; preds = %entry
 }
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!12}
 !9 = metadata !{metadata !1}
 
 !0 = metadata !{i32 2, i32 0, metadata !1, null}
 !1 = metadata !{i32 786478, metadata !10, null, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @foo, null, null, null, i32 2} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !10, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6}
 !6 = metadata !{i32 786468, metadata !10, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 2, i32 0, metadata !8, null}
 !8 = metadata !{i32 786443, metadata !10, metadata !1, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !10 = metadata !{metadata !"a.c", metadata !"/tmp"}
 !11 = metadata !{i32 0}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-05-03-DisableFramePtr.ll b/test/DebugInfo/2010-05-03-DisableFramePtr.ll
index 1aa2240..ba8d0e5 100644
--- a/test/DebugInfo/2010-05-03-DisableFramePtr.ll
+++ b/test/DebugInfo/2010-05-03-DisableFramePtr.ll
@@ -2,7 +2,7 @@
 ; Radar 7937664
 %struct.AppleEvent = type opaque
 
-define void @DisposeDMNotificationUPP(void (%struct.AppleEvent*)* %userUPP) "no-frame-pointer-elim-non-leaf"="true" nounwind ssp {
+define void @DisposeDMNotificationUPP(void (%struct.AppleEvent*)* %userUPP) "no-frame-pointer-elim-non-leaf" nounwind ssp {
 entry:
   %userUPP_addr = alloca void (%struct.AppleEvent*)* ; <void (%struct.AppleEvent*)**> [#uses=1]
   %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
@@ -17,22 +17,24 @@ return:                                           ; preds = %entry
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!19}
 !0 = metadata !{i32 524545, metadata !1, metadata !"userUPP", metadata !2, i32 7, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 524334, metadata !16, null, metadata !"DisposeDMNotificationUPP", metadata !"DisposeDMNotificationUPP", metadata !"DisposeDMNotificationUPP", i32 7, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 524329, metadata !16} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 524305, metadata !16, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !17, metadata !17, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !16, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 524309, metadata !16, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{null, metadata !6}
 !6 = metadata !{i32 524310, metadata !16, metadata !2, metadata !"DMNotificationUPP", i32 6, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
 !7 = metadata !{i32 524303, metadata !16, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 524309, metadata !16, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!8 = metadata !{i32 524309, metadata !16, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !9 = metadata !{null, metadata !10}
 !10 = metadata !{i32 524303, metadata !16, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
 !11 = metadata !{i32 524310, metadata !16, metadata !2, metadata !"AppleEvent", i32 4, i64 0, i64 0, i64 0, i32 0, metadata !12} ; [ DW_TAG_typedef ]
-!12 = metadata !{i32 524307, metadata !16, metadata !2, metadata !"AEDesc", i32 1, i64 0, i64 0, i64 0, i32 4, null, null, i32 0, null} ; [ DW_TAG_structure_type ]
+!12 = metadata !{i32 524307, metadata !16, metadata !2, metadata !"AEDesc", i32 1, i64 0, i64 0, i64 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [AEDesc] [line 1, size 0, align 0, offset 0] [decl] [from ]
 !13 = metadata !{i32 7, i32 0, metadata !1, null}
 !14 = metadata !{i32 8, i32 0, metadata !15, null}
 !15 = metadata !{i32 524299, metadata !16, metadata !1, i32 7, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{metadata !"t.c", metadata !"/Users/echeng/LLVM/radars/r7937664/"}
 !17 = metadata !{i32 0}
 !18 = metadata !{metadata !1}
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-05-03-OriginDIE.ll b/test/DebugInfo/2010-05-03-OriginDIE.ll
index b4c2bc2b..0c5d876 100644
--- a/test/DebugInfo/2010-05-03-OriginDIE.ll
+++ b/test/DebugInfo/2010-05-03-OriginDIE.ll
@@ -49,12 +49,13 @@ declare i64 @llvm.bswap.i64(i64) nounwind readnone
 declare void @uuid_LtoB(i8*, i8*)
 
 !llvm.dbg.cu = !{!4}
+!llvm.module.flags = !{!41}
 !0 = metadata !{i32 808, i32 0, metadata !1, null}
 !1 = metadata !{i32 524299, metadata !39, metadata !2, i32 807, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !2 = metadata !{i32 524334, metadata !39, null, metadata !"gpt2gpm", metadata !"gpt2gpm", metadata !"gpt2gpm", i32 807, metadata !5, i1 true, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !3 = metadata !{i32 524329, metadata !39} ; [ DW_TAG_file_type ]
 !4 = metadata !{i32 524305, metadata !39, i32 1, metadata !"llvm-gcc", i1 true, metadata !"", i32 0, metadata !18, metadata !18, metadata !40, null, null, i32 0} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 524309, metadata !39, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{i32 524309, metadata !39, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{null}
 !7 = metadata !{i32 810, i32 0, metadata !1, null}
 !8 = metadata !{i32 524545, metadata !9, metadata !"data", metadata !10, i32 201, metadata !11} ; [ DW_TAG_arg_variable ]
@@ -74,11 +75,11 @@ declare void @uuid_LtoB(i8*, i8*)
 !22 = metadata !{i32 524324, metadata !39, metadata !3, metadata !"long unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
 !23 = metadata !{i32 524544, metadata !24, metadata !"u", metadata !10, i32 100, metadata !25} ; [ DW_TAG_auto_variable ]
 !24 = metadata !{i32 524299, metadata !38, metadata !16, i32 95, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!25 = metadata !{i32 524311, metadata !38, metadata !16, metadata !"", i32 97, i64 64, i64 64, i64 0, i32 0, null, metadata !26, i32 0, null} ; [ DW_TAG_union_type ]
+!25 = metadata !{i32 524311, metadata !38, metadata !16, metadata !"", i32 97, i64 64, i64 64, i64 0, i32 0, null, metadata !26, i32 0, null, null, null} ; [ DW_TAG_union_type ] [line 97, size 64, align 64, offset 0] [def] [from ]
 !26 = metadata !{metadata !27, metadata !28}
 !27 = metadata !{i32 524301, metadata !38, metadata !25, metadata !"u64", i32 98, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_member ]
 !28 = metadata !{i32 524301, metadata !38, metadata !25, metadata !"u32", i32 99, i64 64, i64 32, i64 0, i32 0, metadata !29} ; [ DW_TAG_member ]
-!29 = metadata !{i32 524289, metadata !39, metadata !3, metadata !"", i32 0, i64 64, i64 32, i64 0, i32 0, metadata !30, metadata !32, i32 0, null} ; [ DW_TAG_array_type ]
+!29 = metadata !{i32 524289, metadata !39, metadata !3, metadata !"", i32 0, i64 64, i64 32, i64 0, i32 0, metadata !30, metadata !32, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from uint32_t]
 !30 = metadata !{i32 524310, metadata !36, metadata !3, metadata !"uint32_t", i32 55, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_typedef ]
 !31 = metadata !{i32 524324, metadata !39, metadata !3, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
 !32 = metadata !{metadata !33}
@@ -90,3 +91,4 @@ declare void @uuid_LtoB(i8*, i8*)
 !38 = metadata !{metadata !"OSByteOrder.h", metadata !"/usr/include/libkern/ppc"}
 !39 = metadata !{metadata !"G.c", metadata !"/tmp"}
 !40 = metadata !{metadata !2, metadata !9, metadata !16}
+!41 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-05-10-MultipleCU.ll b/test/DebugInfo/2010-05-10-MultipleCU.ll
index da0b2e8..ad7c7d1 100644
--- a/test/DebugInfo/2010-05-10-MultipleCU.ll
+++ b/test/DebugInfo/2010-05-10-MultipleCU.ll
@@ -27,6 +27,7 @@ return:
 }
 
 !llvm.dbg.cu = !{!4, !12}
+!llvm.module.flags = !{!21}
 !16 = metadata !{metadata !2}
 !17 = metadata !{metadata !10}
 
@@ -35,7 +36,7 @@ return:
 !2 = metadata !{i32 786478, metadata !18, metadata !3, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !3 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
 !4 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !19, metadata !19, metadata !16, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 786453, metadata !18, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{i32 786453, metadata !18, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7}
 !7 = metadata !{i32 786468, metadata !18, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !8 = metadata !{i32 3, i32 0, metadata !9, null}
@@ -43,9 +44,10 @@ return:
 !10 = metadata !{i32 786478, metadata !20, metadata !11, metadata !"bar", metadata !"bar", metadata !"bar", i32 2, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !11 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
 !12 = metadata !{i32 786449, metadata !20, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!13 = metadata !{i32 786453, metadata !20, metadata !11, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!13 = metadata !{i32 786453, metadata !20, metadata !11, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{metadata !15}
 !15 = metadata !{i32 786468, metadata !20, metadata !11, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !18 = metadata !{metadata !"a.c", metadata !"/tmp/"}
 !19 = metadata !{i32 0}
 !20 = metadata !{metadata !"b.c", metadata !"/tmp/"}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll b/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll
index 295648f..50a3422 100644
--- a/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll
+++ b/test/DebugInfo/2010-06-29-InlinedFnLocalVar.ll
@@ -22,20 +22,21 @@ entry:
 }
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!28}
 
 !0 = metadata !{i32 786478, metadata !27, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 9, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, metadata !24, i32 9} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 786473, metadata !27} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !27, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !20, metadata !20, metadata !25, metadata !26,  metadata !26, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !27, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !27, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5, metadata !5}
 !5 = metadata !{i32 786468, metadata !27, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 786478, metadata !27, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 14, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786453, metadata !27, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, metadata !27, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !5}
 !9 = metadata !{i32 786689, metadata !0, metadata !"j", metadata !1, i32 9, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
 !10 = metadata !{i32 786688, metadata !11, metadata !"xyz", metadata !1, i32 10, metadata !12, i32 0, null} ; [ DW_TAG_auto_variable ]
 !11 = metadata !{i32 786443, metadata !1, metadata !0, i32 9, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 786451, metadata !27, metadata !0, metadata !"X", i32 10, i64 64, i64 32, i64 0, i32 0, null, metadata !13, i32 0, null} ; [ DW_TAG_structure_type ]
+!12 = metadata !{i32 786451, metadata !27, metadata !0, metadata !"X", i32 10, i64 64, i64 32, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 10, size 64, align 32, offset 0] [def] [from ]
 !13 = metadata !{metadata !14, metadata !15}
 !14 = metadata !{i32 786445, metadata !27, metadata !12, metadata !"a", i32 10, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
 !15 = metadata !{i32 786445, metadata !27, metadata !12, metadata !"b", i32 10, i64 32, i64 32, i64 32, i32 0, metadata !5} ; [ DW_TAG_member ]
@@ -51,3 +52,4 @@ entry:
 !25 = metadata !{metadata !0, metadata !6}
 !26 = metadata !{metadata !16}
 !27 = metadata !{metadata !"bar.c", metadata !"/tmp/"}
+!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-07-19-Crash.ll b/test/DebugInfo/2010-07-19-Crash.ll
index a395efe..6b6e61d 100644
--- a/test/DebugInfo/2010-07-19-Crash.ll
+++ b/test/DebugInfo/2010-07-19-Crash.ll
@@ -8,13 +8,14 @@ entry:
 }
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!15}
 !llvm.dbg.sp = !{!0, !6, !11}
 !llvm.dbg.lv.foo = !{!7}
 
 !0 = metadata !{i32 524334, metadata !12, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 524329, metadata !12} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 524305, metadata !12, i32 12, metadata !"clang 2.8", i1 true, metadata !"", i32 0, metadata !14, metadata !14, metadata !13, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 524309, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 524309, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 524324, metadata !12, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 524334, metadata !12, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 7, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
@@ -26,3 +27,4 @@ entry:
 !12 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
 !13 = metadata !{metadata !0, metadata !6, metadata !11}
 !14 = metadata !{i32 0}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/2010-10-01-crash.ll b/test/DebugInfo/2010-10-01-crash.ll
index ddb9acc..f8dbb6e 100644
--- a/test/DebugInfo/2010-10-01-crash.ll
+++ b/test/DebugInfo/2010-10-01-crash.ll
@@ -14,7 +14,8 @@ declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32,
 
 
 !llvm.dbg.cu = !{!2}
-!0 = metadata !{i32 589870, metadata !1, i32 0, metadata !"CGRectStandardize", metadata !"CGRectStandardize", metadata !"CGRectStandardize", i32 54, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void (i32*, i32*)* @CGRectStandardize, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!llvm.module.flags = !{!27}
+!0 = metadata !{i32 589870, metadata !1, null, metadata !"CGRectStandardize", metadata !"CGRectStandardize", metadata !"CGRectStandardize", i32 54, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void (i32*, i32*)* @CGRectStandardize, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 54] [def] [scope 0] [CGRectStandardize]
 !1 = metadata !{i32 589865, metadata !25}
 !2 = metadata !{i32 589841, metadata !25, i32 16, metadata !"clang version 2.9 (trunk 115292)", i1 true, metadata !"", i32 1, metadata !26, metadata !26, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !5 = metadata !{i32 589846, metadata !25, null, metadata !"CGRect", i32 49, i64 0, i64 0, i64 0, i32 0, null}
@@ -22,3 +23,4 @@ declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32,
 !24 = metadata !{i32 53, i32 33, metadata !0, null}
 !25 = metadata !{metadata !"GSFusedSilica.m", metadata !"/Volumes/Data/Users/sabre/Desktop"}
 !26 = metadata !{i32 0}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/AArch64/dwarfdump.ll b/test/DebugInfo/AArch64/dwarfdump.ll
index 2598d5c..4c20507 100644
--- a/test/DebugInfo/AArch64/dwarfdump.ll
+++ b/test/DebugInfo/AArch64/dwarfdump.ll
@@ -21,14 +21,16 @@ define i32 @main() nounwind {
 attributes #0 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10}
 
 !0 = metadata !{i32 786449, metadata !9, i32 12, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/timnor01/llvm/build/tmp.c] [DW_LANG_C99]
 !1 = metadata !{i32 0}
 !2 = metadata !{metadata !3}
 !3 = metadata !{i32 786478, metadata !9, metadata !4, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
 !4 = metadata !{i32 786473, metadata !9} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 786453, null, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7}
 !7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !8 = metadata !{i32 2, i32 0, metadata !3, null}
 !9 = metadata !{metadata !"tmp.c", metadata !"/home/tim/llvm/build"}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/AArch64/lit.local.cfg b/test/DebugInfo/AArch64/lit.local.cfg
index c5ce241..9a66a00 100644
--- a/test/DebugInfo/AArch64/lit.local.cfg
+++ b/test/DebugInfo/AArch64/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'AArch64' in targets:
     config.unsupported = True
diff --git a/test/DebugInfo/AArch64/variable-loc.ll b/test/DebugInfo/AArch64/variable-loc.ll
index 30eabc8..f42cb74 100644
--- a/test/DebugInfo/AArch64/variable-loc.ll
+++ b/test/DebugInfo/AArch64/variable-loc.ll
@@ -23,7 +23,10 @@
 ; CHECK: add x29, sp, #416
 ; CHECK: add {{x[0-9]+}}, sp, #4
 
-  ; Now check the debugging information reflects this:
+; CHECK: .Linfo_string7:
+; CHECK-NEXT: main_arr
+
+; Now check the debugging information reflects this:
 ; CHECK: DW_TAG_variable
 ; CHECK-NEXT: .word .Linfo_string7
 
@@ -32,8 +35,6 @@
 ; CHECK-NEXT: .byte 145
 ; CHECK-NEXT: .ascii "\344|"
 
-; CHECK: .Linfo_string7:
-; CHECK-NEXT: main_arr
 
 
 target datalayout = "e-p:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-n32:64-S128"
@@ -68,25 +69,26 @@ entry:
 declare i32 @printf(i8*, ...)
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!30}
 
 !0 = metadata !{i32 786449, metadata !29, i32 12, metadata !"clang version 3.2 ", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/timnor01/a64-trunk/build/simple.c] [DW_LANG_C99]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !11, metadata !14}
 !5 = metadata !{i32 786478, metadata !29, metadata !6, metadata !"populate_array", metadata !"populate_array", metadata !"", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*, i32)* @populate_array, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [populate_array]
 !6 = metadata !{i32 786473, metadata !29} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9, metadata !10}
 !9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
 !10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !11 = metadata !{i32 786478, metadata !29, metadata !6, metadata !"sum_array", metadata !"sum_array", metadata !"", i32 9, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32*, i32)* @sum_array, null, null, metadata !1, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [sum_array]
-!12 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{metadata !10, metadata !9, metadata !10}
 !14 = metadata !{i32 786478, metadata !29, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
-!15 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{metadata !10}
 !17 = metadata !{i32 786688, metadata !18, metadata !"main_arr", metadata !6, i32 19, metadata !19, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [main_arr] [line 19]
 !18 = metadata !{i32 786443, metadata !29, metadata !14, i32 18, i32 16, i32 4} ; [ DW_TAG_lexical_block ] [/home/timnor01/a64-trunk/build/simple.c]
-!19 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 3200, i64 32, i32 0, i32 0, metadata !10, metadata !20, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 3200, align 32, offset 0] [from int]
+!19 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 3200, i64 32, i32 0, i32 0, metadata !10, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 3200, align 32, offset 0] [from int]
 !20 = metadata !{i32 786465, i64 0, i64 99}       ; [ DW_TAG_subrange_type ] [0, 99]
 !22 = metadata !{i32 19, i32 7, metadata !18, null}
 !23 = metadata !{i32 786688, metadata !18, metadata !"val", metadata !6, i32 20, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [val] [line 20]
@@ -96,3 +98,4 @@ declare i32 @printf(i8*, ...)
 !27 = metadata !{i32 24, i32 3, metadata !18, null}
 !28 = metadata !{i32 26, i32 3, metadata !18, null}
 !29 = metadata !{metadata !"simple.c", metadata !"/home/timnor01/a64-trunk/build"}
+!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/ARM/PR16736.ll b/test/DebugInfo/ARM/PR16736.ll
new file mode 100644
index 0000000..d01fa22
--- /dev/null
+++ b/test/DebugInfo/ARM/PR16736.ll
@@ -0,0 +1,65 @@
+; RUN: llc -filetype=asm < %s | FileCheck %s
+; CHECK: @DEBUG_VALUE: h:x <- [R{{.*}}+{{.*}}]
+; generated from:
+; clang -cc1 -triple  thumbv7 -S -O1 arm.cpp  -g
+;
+; int f();
+; void g(float);
+; void h(int, int, int, int, float x) {
+;    g(x = f());
+; }
+;
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32-S64"
+target triple = "thumbv7-apple-ios"
+
+; Function Attrs: nounwind
+define arm_aapcscc void @_Z1hiiiif(i32, i32, i32, i32, float %x) #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i32 %0}, i64 0, metadata !12), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !13), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i32 %2}, i64 0, metadata !14), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{i32 %3}, i64 0, metadata !15), !dbg !18
+  tail call void @llvm.dbg.value(metadata !{float %x}, i64 0, metadata !16), !dbg !18
+  %call = tail call arm_aapcscc i32 @_Z1fv() #3, !dbg !19
+  %conv = sitofp i32 %call to float, !dbg !19
+  tail call void @llvm.dbg.value(metadata !{float %conv}, i64 0, metadata !16), !dbg !19
+  tail call arm_aapcscc void @_Z1gf(float %conv) #3, !dbg !19
+  ret void, !dbg !20
+}
+
+declare arm_aapcscc void @_Z1gf(float)
+
+declare arm_aapcscc i32 @_Z1fv()
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #2
+
+attributes #0 = { nounwind  }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!17, !21}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 190804) (llvm/trunk 190797)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [//<unknown>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"/<unknown>", metadata !""}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"h", metadata !"h", metadata !"_Z1hiiiif", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32, i32, i32, i32, float)* @_Z1hiiiif, null, null, metadata !11, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [h]
+!5 = metadata !{metadata !"/arm.cpp", metadata !""}
+!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [//arm.cpp]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9, metadata !9, metadata !9, metadata !9, metadata !10}
+!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!11 = metadata !{metadata !12, metadata !13, metadata !14, metadata !15, metadata !16}
+!12 = metadata !{i32 786689, metadata !4, metadata !"", metadata !6, i32 16777219, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 3]
+!13 = metadata !{i32 786689, metadata !4, metadata !"", metadata !6, i32 33554435, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 3]
+!14 = metadata !{i32 786689, metadata !4, metadata !"", metadata !6, i32 50331651, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 3]
+!15 = metadata !{i32 786689, metadata !4, metadata !"", metadata !6, i32 67108867, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 3]
+!16 = metadata !{i32 786689, metadata !4, metadata !"x", metadata !6, i32 83886083, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 3]
+!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!18 = metadata !{i32 3, i32 0, metadata !4, null}
+!19 = metadata !{i32 4, i32 0, metadata !4, null}
+!20 = metadata !{i32 5, i32 0, metadata !4, null}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/ARM/lit.local.cfg b/test/DebugInfo/ARM/lit.local.cfg
new file mode 100644
index 0000000..8a3ba96
--- /dev/null
+++ b/test/DebugInfo/ARM/lit.local.cfg
@@ -0,0 +1,4 @@
+targets = set(config.root.targets_to_build.split())
+if not 'ARM' in targets:
+    config.unsupported = True
+
diff --git a/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll b/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll
new file mode 100644
index 0000000..0378c75
--- /dev/null
+++ b/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll
@@ -0,0 +1,103 @@
+; RUN: opt  -instcombine %s -S | FileCheck %s
+;
+; Generate me from:
+; clang -cc1 -triple thumbv7-apple-ios7.0.0 -S -target-abi apcs-gnu -gdwarf-2 -Os test.c -o test.ll -emit-llvm
+; void run(float r)
+; {
+;   int count = r;
+;   float vla[count];
+;   vla[0] = r;
+;   for (int i = 0; i < count; i++)
+;     vla[i] /= r;
+; }
+; rdar://problem/15464571
+;
+; ModuleID = 'test.c'
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios8.0.0"
+
+; Function Attrs: nounwind optsize readnone
+define void @run(float %r) #0 {
+entry:
+  tail call void @llvm.dbg.declare(metadata !{float %r}, metadata !11), !dbg !22
+  %conv = fptosi float %r to i32, !dbg !23
+  tail call void @llvm.dbg.declare(metadata !{i32 %conv}, metadata !12), !dbg !23
+  %vla = alloca float, i32 %conv, align 4, !dbg !24
+  tail call void @llvm.dbg.declare(metadata !{float* %vla}, metadata !14), !dbg !24
+; The VLA alloca should be described by a dbg.declare:
+; CHECK: call void @llvm.dbg.declare(metadata !{float* %vla}, metadata ![[VLA:.*]])
+; The VLA alloca and following store into the array should not be lowered to like this:
+; CHECK-NOT:  call void @llvm.dbg.value(metadata !{float %r}, i64 0, metadata ![[VLA]])
+; the backend interprets this as "vla has the location of %r".
+  store float %r, float* %vla, align 4, !dbg !25, !tbaa !26
+  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !18), !dbg !30
+  %cmp8 = icmp sgt i32 %conv, 0, !dbg !30
+  br i1 %cmp8, label %for.body, label %for.end, !dbg !30
+
+for.body:                                         ; preds = %entry, %for.body.for.body_crit_edge
+  %0 = phi float [ %.pre, %for.body.for.body_crit_edge ], [ %r, %entry ]
+  %i.09 = phi i32 [ %inc, %for.body.for.body_crit_edge ], [ 0, %entry ]
+  %arrayidx2 = getelementptr inbounds float* %vla, i32 %i.09, !dbg !31
+  %div = fdiv float %0, %r, !dbg !31
+  store float %div, float* %arrayidx2, align 4, !dbg !31, !tbaa !26
+  %inc = add nsw i32 %i.09, 1, !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !18), !dbg !30
+  %exitcond = icmp eq i32 %inc, %conv, !dbg !30
+  br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !dbg !30
+
+for.body.for.body_crit_edge:                      ; preds = %for.body
+  %arrayidx2.phi.trans.insert = getelementptr inbounds float* %vla, i32 %inc
+  %.pre = load float* %arrayidx2.phi.trans.insert, align 4, !dbg !31, !tbaa !26
+  br label %for.body, !dbg !30
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void, !dbg !32
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #1
+
+attributes #0 = { nounwind optsize readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!20, !33}
+!llvm.ident = !{!21}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Data/radar/15464571/<unknown>] [DW_LANG_C99]
+!1 = metadata !{metadata !"<unknown>", metadata !"/Volumes/Data/radar/15464571"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"run", metadata !"run", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (float)* @run, null, null, metadata !10, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [run]
+!5 = metadata !{metadata !"test.c", metadata !"/Volumes/Data/radar/15464571"}
+!6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/Volumes/Data/radar/15464571/test.c]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9}
+!9 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
+!10 = metadata !{metadata !11, metadata !12, metadata !14, metadata !18}
+!11 = metadata !{i32 786689, metadata !4, metadata !"r", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 1]
+!12 = metadata !{i32 786688, metadata !4, metadata !"count", metadata !6, i32 3, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [count] [line 3]
+!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{i32 786688, metadata !4, metadata !"vla", metadata !6, i32 4, metadata !15, i32 8192, i32 0} ; [ DW_TAG_auto_variable ] [vla] [line 4]
+!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from float]
+!16 = metadata !{metadata !17}
+!17 = metadata !{i32 786465, i64 0, i64 -1}       ; [ DW_TAG_subrange_type ] [unbounded]
+!18 = metadata !{i32 786688, metadata !19, metadata !"i", metadata !6, i32 6, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 6]
+!19 = metadata !{i32 786443, metadata !5, metadata !4, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/Volumes/Data/radar/15464571/test.c]
+!20 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!21 = metadata !{metadata !"clang version 3.4 "}
+!22 = metadata !{i32 1, i32 0, metadata !4, null}
+!23 = metadata !{i32 3, i32 0, metadata !4, null}
+!24 = metadata !{i32 4, i32 0, metadata !4, null}
+!25 = metadata !{i32 5, i32 0, metadata !4, null}
+!26 = metadata !{metadata !27, metadata !27, i64 0}
+!27 = metadata !{metadata !"float", metadata !28, i64 0}
+!28 = metadata !{metadata !"omnipotent char", metadata !29, i64 0}
+!29 = metadata !{metadata !"Simple C/C++ TBAA"}
+!30 = metadata !{i32 6, i32 0, metadata !19, null}
+!31 = metadata !{i32 7, i32 0, metadata !19, null}
+!32 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/ARM/selectiondag-deadcode.ll b/test/DebugInfo/ARM/selectiondag-deadcode.ll
new file mode 100644
index 0000000..cc151e0
--- /dev/null
+++ b/test/DebugInfo/ARM/selectiondag-deadcode.ll
@@ -0,0 +1,27 @@
+; RUN: llc -filetype=asm < %s | FileCheck %s
+target triple = "thumbv7-apple-ios7.0.0"
+%class.Matrix3.0.6.10 = type { [9 x float] }
+define arm_aapcscc void @_Z9GetMatrixv(%class.Matrix3.0.6.10* noalias nocapture sret %agg.result) #0 {
+  br i1 fcmp oeq (float fadd (float fadd (float fmul (float undef, float undef), float fmul (float undef, float undef)), float fmul (float undef, float undef)), float 0.000000e+00), label %_ZN7Vector39NormalizeEv.exit, label %1
+  tail call arm_aapcscc void @_ZL4Sqrtd() #3
+  br label %_ZN7Vector39NormalizeEv.exit
+_ZN7Vector39NormalizeEv.exit:                     ; preds = %1, %0
+  ; rdar://problem/15094721.
+  ;
+  ; When this (partially) dead use gets eliminated (and thus the def
+  ; of the vreg holding %agg.result) the dbg_value becomes dangling
+  ; and SelectionDAGISel crashes.  It should definitely not
+  ; crash. Drop the dbg_value instead.
+  ; CHECK-NOT: "matrix"
+  tail call void @llvm.dbg.declare(metadata !{%class.Matrix3.0.6.10* %agg.result}, metadata !45)
+  %2 = getelementptr inbounds %class.Matrix3.0.6.10* %agg.result, i32 0, i32 0, i32 8
+  ret void
+}
+declare void @llvm.dbg.declare(metadata, metadata) #1
+declare arm_aapcscc void @_ZL4Sqrtd() #2
+!4 = metadata !{i32 786434, metadata !5, null, metadata !"Matrix3", i32 20, i64 288, i64 32, i32 0, i32 0, null, null, i32 0, null, null, metadata !"_ZTS7Matrix3"} ; [ DW_TAG_class_type ] [Matrix3] [line 20, size 288, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"test.ii", metadata !"/Volumes/Data/radar/15094721"}
+!39 = metadata !{i32 786478, metadata !5, metadata !40, metadata !"GetMatrix", metadata !"GetMatrix", metadata !"_Z9GetMatrixv", i32 32, metadata !41, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.Matrix3.0.6.10*)* @_Z9GetMatrixv, null, null, null, i32 32} ; [ DW_TAG_subprogram ] [line 32] [def] [GetMatrix]
+!40 = metadata !{i32 786473, metadata !5}         ; [ DW_TAG_file_type ] [/Volumes/Data/radar/15094721/test.ii]
+!41 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, null, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!45 = metadata !{i32 786688, metadata !39, metadata !"matrix", metadata !40, i32 35, metadata !4, i32 8192, i32 0} ; [ DW_TAG_auto_variable ] [matrix] [line 35]
diff --git a/test/DebugInfo/Inputs/dwarfdump-inl-test.cc b/test/DebugInfo/Inputs/dwarfdump-inl-test.cc
index 8ffbb52..edf956d 100644
--- a/test/DebugInfo/Inputs/dwarfdump-inl-test.cc
+++ b/test/DebugInfo/Inputs/dwarfdump-inl-test.cc
@@ -13,3 +13,6 @@ int main() {
 // $ cp dwarfdump-inl-test.* /tmp/dbginfo
 // $ cd /tmp/dbginfo
 // $ clang++ -O2 -gline-tables-only -fsanitize=address -fPIC -shared dwarfdump-inl-test.cc -o <output>
+//
+// And similarly with with gcc 4.8.2:
+// $ gcc dwarfdump-inl-test.cc -o dwarfdump-inl-test.high_pc.elf-x86-64 -g -O2 -fPIC -shared
diff --git a/test/DebugInfo/Inputs/dwarfdump-inl-test.high_pc.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-inl-test.high_pc.elf-x86-64
new file mode 100755
index 0000000..f108861
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-inl-test.high_pc.elf-x86-64
diff --git a/test/DebugInfo/Inputs/dwarfdump-test.elf-x86-64.debuglink b/test/DebugInfo/Inputs/dwarfdump-test.elf-x86-64.debuglink
new file mode 100755
index 0000000..8c08037
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-test.elf-x86-64.debuglink
diff --git a/test/DebugInfo/Inputs/dwarfdump-type-units.cc b/test/DebugInfo/Inputs/dwarfdump-type-units.cc
new file mode 100644
index 0000000..06bc9a2
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-type-units.cc
@@ -0,0 +1,15 @@
+struct foo {};
+struct bar {};
+void sink(void*);
+int main() {
+  foo f;
+  sink(&f);
+  bar b;
+  sink(&b);
+}
+
+// Built with GCC 4.8.1
+// $ mkdir -p /tmp/dbginfo
+// $ cp dwarfdump-type-units.cc /tmp/dbginfo
+// $ cd /tmp/dbginfo
+// $ g++-4.8.1 -g -fdebug-types-section -c dwarfdump-type-units.cc -o dwarfdump-type-units.elf-x86-64
diff --git a/test/DebugInfo/Inputs/dwarfdump-type-units.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-type-units.elf-x86-64
new file mode 100644
index 0000000..064b4f0
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-type-units.elf-x86-64
diff --git a/test/DebugInfo/Inputs/lit.local.cfg b/test/DebugInfo/Inputs/lit.local.cfg
deleted file mode 100644
index e6f55ee..0000000
--- a/test/DebugInfo/Inputs/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = []
diff --git a/test/DebugInfo/PowerPC/lit.local.cfg b/test/DebugInfo/PowerPC/lit.local.cfg
index 112a1c3..193ebeb 100644
--- a/test/DebugInfo/PowerPC/lit.local.cfg
+++ b/test/DebugInfo/PowerPC/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'PowerPC' in targets:
     config.unsupported = True
diff --git a/test/DebugInfo/PowerPC/tls-fission.ll b/test/DebugInfo/PowerPC/tls-fission.ll
index 83a2cf3..4a744c7 100644
--- a/test/DebugInfo/PowerPC/tls-fission.ll
+++ b/test/DebugInfo/PowerPC/tls-fission.ll
@@ -19,7 +19,7 @@
 @tls = thread_local global i32 0, align 4
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!7}
+!llvm.module.flags = !{!7, !8}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !"tls.dwo"} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tls.cpp", metadata !"/tmp"}
@@ -29,3 +29,4 @@
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
 !6 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/PowerPC/tls.ll b/test/DebugInfo/PowerPC/tls.ll
index ae32a90..6557f5e 100644
--- a/test/DebugInfo/PowerPC/tls.ll
+++ b/test/DebugInfo/PowerPC/tls.ll
@@ -15,7 +15,7 @@
 @tls = thread_local global i32 7, align 4
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!7}
+!llvm.module.flags = !{!7, !8}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tls.cpp", metadata !"/tmp"}
@@ -26,3 +26,4 @@
 !6 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/SystemZ/lit.local.cfg b/test/DebugInfo/SystemZ/lit.local.cfg
index a70a685..b12af09 100644
--- a/test/DebugInfo/SystemZ/lit.local.cfg
+++ b/test/DebugInfo/SystemZ/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'SystemZ' in targets:
     config.unsupported = True
diff --git a/test/DebugInfo/SystemZ/variable-loc.ll b/test/DebugInfo/SystemZ/variable-loc.ll
index 139fae8..560b477 100644
--- a/test/DebugInfo/SystemZ/variable-loc.ll
+++ b/test/DebugInfo/SystemZ/variable-loc.ll
@@ -11,6 +11,9 @@
 ; CHECK: la      %r2, 164(%r11)
 ; CHECK: brasl   %r14, populate_array@PLT
 ;
+; CHECK: .Linfo_string7:
+; CHECK-NEXT: main_arr
+;
 ; Now check that the debugging information reflects this:
 ; CHECK: DW_TAG_variable
 ; CHECK-NEXT: .long .Linfo_string7
@@ -21,8 +24,6 @@
 ; CHECK-NEXT: .byte 145
 ; CHECK-NEXT: .ascii "\244\001"
 ;
-; CHECK: .Linfo_string7:
-; CHECK-NEXT: main_arr
 
 
 @.str = private unnamed_addr constant [13 x i8] c"Total is %d\0A\00", align 2
@@ -54,25 +55,26 @@ entry:
 declare i32 @printf(i8*, ...)
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!30}
 
 !0 = metadata !{i32 786449, metadata !29, i32 12, metadata !"clang version 3.2 ", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/timnor01/a64-trunk/build/simple.c] [DW_LANG_C99]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !11, metadata !14}
 !5 = metadata !{i32 786478, metadata !29, metadata !6, metadata !"populate_array", metadata !"populate_array", metadata !"", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*, i32)* @populate_array, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [populate_array]
 !6 = metadata !{i32 786473, metadata !29} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9, metadata !10}
 !9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
 !10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !11 = metadata !{i32 786478, metadata !29, metadata !6, metadata !"sum_array", metadata !"sum_array", metadata !"", i32 9, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32*, i32)* @sum_array, null, null, metadata !1, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [sum_array]
-!12 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{metadata !10, metadata !9, metadata !10}
 !14 = metadata !{i32 786478, metadata !29, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
-!15 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{metadata !10}
 !17 = metadata !{i32 786688, metadata !18, metadata !"main_arr", metadata !6, i32 19, metadata !19, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [main_arr] [line 19]
 !18 = metadata !{i32 786443, metadata !29, metadata !14, i32 18, i32 16, i32 4} ; [ DW_TAG_lexical_block ] [/home/timnor01/a64-trunk/build/simple.c]
-!19 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 3200, i64 32, i32 0, i32 0, metadata !10, metadata !20, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 3200, align 32, offset 0] [from int]
+!19 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 3200, i64 32, i32 0, i32 0, metadata !10, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 3200, align 32, offset 0] [from int]
 !20 = metadata !{i32 786465, i64 0, i64 99}       ; [ DW_TAG_subrange_type ] [0, 99]
 !22 = metadata !{i32 19, i32 7, metadata !18, null}
 !23 = metadata !{i32 786688, metadata !18, metadata !"val", metadata !6, i32 20, metadata !10, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [val] [line 20]
@@ -82,3 +84,4 @@ declare i32 @printf(i8*, ...)
 !27 = metadata !{i32 24, i32 3, metadata !18, null}
 !28 = metadata !{i32 26, i32 3, metadata !18, null}
 !29 = metadata !{metadata !"simple.c", metadata !"/home/timnor01/a64-trunk/build"}
+!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/2010-04-13-PubType.ll b/test/DebugInfo/X86/2010-04-13-PubType.ll
index 0ec7f59..0440afc 100644
--- a/test/DebugInfo/X86/2010-04-13-PubType.ll
+++ b/test/DebugInfo/X86/2010-04-13-PubType.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -asm-verbose -mtriple=x86_64-macosx < %s | FileCheck %s
+; RUN: llc -O0 -asm-verbose -mtriple=x86_64-macosx -generate-dwarf-pub-sections=Enable < %s | FileCheck %s
 ; CHECK-NOT: .asciz "X" ## External Name
 ; CHECK: .asciz "Y" ## External Name
 ; Test to check type with no definition is listed in pubtypes section.
@@ -29,18 +29,19 @@ return:                                           ; preds = %entry
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!20}
 
 !0 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 7, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 7, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 (%struct.X*, %struct.Y*)* @foo, null, null, null, i32 7} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !7, metadata !9}
 !6 = metadata !{i32 786468, metadata !18, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 786447, metadata !18, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !18, metadata !2, metadata !"X", i32 3, i64 0, i64 0, i64 0, i32 4, null, null, i32 0, null} ; [ DW_TAG_structure_type ]
+!8 = metadata !{i32 786451, metadata !18, metadata !2, metadata !"X", i32 3, i64 0, i64 0, i64 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [X] [line 3, size 0, align 0, offset 0] [decl] [from ]
 !9 = metadata !{i32 786447, metadata !18, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 786451, metadata !18, metadata !2, metadata !"Y", i32 4, i64 32, i64 32, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_structure_type ]
+!10 = metadata !{i32 786451, metadata !18, metadata !2, metadata !"Y", i32 4, i64 32, i64 32, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Y] [line 4, size 32, align 32, offset 0] [def] [from ]
 !11 = metadata !{metadata !12}
 !12 = metadata !{i32 786445, metadata !18, metadata !10, metadata !"x", i32 5, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
 !13 = metadata !{i32 7, i32 0, metadata !1, null}
@@ -50,3 +51,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !17 = metadata !{metadata !1}
 !18 = metadata !{metadata !"a.c", metadata !"/tmp/"}
 !19 = metadata !{i32 0}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/2010-08-10-DbgConstant.ll b/test/DebugInfo/X86/2010-08-10-DbgConstant.ll
index 51a375a..d0a2dfa 100644
--- a/test/DebugInfo/X86/2010-08-10-DbgConstant.ll
+++ b/test/DebugInfo/X86/2010-08-10-DbgConstant.ll
@@ -1,7 +1,7 @@
 ; RUN: llc  -mtriple=i686-linux -O0 -filetype=obj -o %t %s
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
-; CHECK: DW_TAG_constant [4]
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000002c] = "ro")
+; CHECK: DW_TAG_constant
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x{{[0-9a-f]*}}] = "ro")
 
 define void @foo() nounwind ssp {
 entry:
@@ -12,11 +12,12 @@ entry:
 declare void @bar(i32)
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!13}
 
 !0 = metadata !{i32 786478, metadata !12, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void ()* @foo, null, null, null, i32 3} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !12, i32 12, metadata !"clang 2.8", i1 false, metadata !"", i32 0, metadata !4, metadata !4, metadata !10, metadata !11,  metadata !11, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 786471, i32 0, metadata !1, metadata !"ro", metadata !"ro", metadata !"ro", metadata !1, i32 1, metadata !6, i1 true, i1 true, i32 201, null} ; [ DW_TAG_constant ]
 !6 = metadata !{i32 786470, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_const_type ]
@@ -26,3 +27,4 @@ declare void @bar(i32)
 !10 = metadata !{metadata !0}
 !11 = metadata !{metadata !5}
 !12 = metadata !{metadata !"/tmp/l.c", metadata !"/Volumes/Lalgate/clean/D"}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
index 1c6778c..cdfd952 100644
--- a/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
+++ b/test/DebugInfo/X86/2011-09-26-GlobalVarContext.ll
@@ -17,13 +17,14 @@ define i32 @f() nounwind {
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21}
 
 !0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"clang version 3.0 (trunk)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !12,  metadata !12, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !6, metadata !6, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @f, null, null, metadata !10, i32 0} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 720942, metadata !6, metadata !6, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @f, null, null, metadata !10, i32 0} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [f]
 !6 = metadata !{i32 720937, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{metadata !11}
@@ -37,13 +38,14 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !19 = metadata !{i32 5, i32 5, metadata !16, null}
 !20 = metadata !{metadata !"test.c", metadata !"/work/llvm/vanilla/test/DebugInfo"}
 
-; CHECK: DW_TAG_variable [3]
-; CHECK: DW_AT_name [DW_FORM_strp]       ( .debug_str[0x00000043] = "GLB")
+; CHECK: DW_TAG_variable
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]       ( .debug_str[0x{{[0-9a-f]*}}] = "GLB")
 ; CHECK: DW_AT_decl_file [DW_FORM_data1] (0x01)
 ; CHECK: DW_AT_decl_line [DW_FORM_data1] (0x01)
 
-; CHECK: DW_TAG_variable [6]
-; CHECK: DW_AT_name [DW_FORM_strp]   ( .debug_str[0x0000004d] = "LOC")
+; CHECK: DW_TAG_variable
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]   ( .debug_str[0x{{[0-9a-f]*}}] = "LOC")
 ; CHECK: DW_AT_decl_file [DW_FORM_data1]     (0x01)
 ; CHECK: DW_AT_decl_line [DW_FORM_data1]     (0x04)
 
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/2011-12-16-BadStructRef.ll b/test/DebugInfo/X86/2011-12-16-BadStructRef.ll
index 405d9f5..5e6a601 100644
--- a/test/DebugInfo/X86/2011-12-16-BadStructRef.ll
+++ b/test/DebugInfo/X86/2011-12-16-BadStructRef.ll
@@ -87,51 +87,52 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!83}
 
 !0 = metadata !{i32 720913, metadata !82, i32 4, metadata !"clang version 3.1 (trunk 146596)", i1 false, metadata !"", i32 0, metadata !1, metadata !3, metadata !27, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !9}
-!5 = metadata !{i32 720898, metadata !82, null, metadata !"bar", i32 9, i64 128, i64 64, i32 0, i32 0, null, metadata !7, i32 0, null, null} ; [ DW_TAG_class_type ]
+!5 = metadata !{i32 720898, metadata !82, null, metadata !"bar", i32 9, i64 128, i64 64, i32 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_class_type ] [bar] [line 9, size 128, align 64, offset 0] [def] [from ]
 !6 = metadata !{i32 720937, metadata !82} ; [ DW_TAG_file_type ]
 !7 = metadata !{metadata !8, metadata !19, metadata !21}
 !8 = metadata !{i32 720909, metadata !82, metadata !5, metadata !"b", i32 11, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ]
-!9 = metadata !{i32 720898, metadata !82, null, metadata !"baz", i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !10, i32 0, null, null} ; [ DW_TAG_class_type ]
+!9 = metadata !{i32 720898, metadata !82, null, metadata !"baz", i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_class_type ] [baz] [line 3, size 32, align 32, offset 0] [def] [from ]
 !10 = metadata !{metadata !11, metadata !13}
 !11 = metadata !{i32 720909, metadata !82, metadata !9, metadata !"h", i32 5, i64 32, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ]
 !12 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !13 = metadata !{i32 720942, metadata !82, metadata !9, metadata !"baz", metadata !"baz", metadata !"", i32 6, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !17, i32 0} ; [ DW_TAG_subprogram ]
-!14 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !15, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!14 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{null, metadata !16, metadata !12}
-!16 = metadata !{i32 720911, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !9} ; [ DW_TAG_pointer_type ]
+!16 = metadata !{i32 720911, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !9} ; [ DW_TAG_pointer_type ]
 !17 = metadata !{metadata !18}
 !18 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
 !19 = metadata !{i32 720909, metadata !82, metadata !5, metadata !"b_ref", i32 12, i64 64, i64 64, i64 64, i32 0, metadata !20} ; [ DW_TAG_member ]
 !20 = metadata !{i32 720912, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_reference_type ]
 !21 = metadata !{i32 720942, metadata !82, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 13, metadata !22, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !25, i32 0} ; [ DW_TAG_subprogram ]
-!22 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !23, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!22 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !23 = metadata !{null, metadata !24, metadata !12}
-!24 = metadata !{i32 720911, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !5} ; [ DW_TAG_pointer_type ]
+!24 = metadata !{i32 720911, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !5} ; [ DW_TAG_pointer_type ]
 !25 = metadata !{metadata !26}
 !26 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
 !27 = metadata !{metadata !29, metadata !37, metadata !40, metadata !43, metadata !46}
-!29 = metadata !{i32 720942, metadata !82, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 17, metadata !30, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !35, i32 0} ; [ DW_TAG_subprogram ]
-!30 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !31, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!29 = metadata !{i32 720942, metadata !82, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 17, metadata !30, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !47, i32 0} ; [ DW_TAG_subprogram ] [line 17] [def] [scope 0] [main]
+!30 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !31 = metadata !{metadata !12, metadata !12, metadata !32}
 !32 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !33} ; [ DW_TAG_pointer_type ]
 !33 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !34} ; [ DW_TAG_pointer_type ]
 !34 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !35 = metadata !{metadata !36}
 !36 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!37 = metadata !{i32 720942, metadata !82, null, metadata !"bar", metadata !"bar", metadata !"_ZN3barC1Ei", i32 13, metadata !22, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%struct.bar*, i32)* @_ZN3barC1Ei, null, metadata !21, metadata !38, i32 0} ; [ DW_TAG_subprogram ]
+!37 = metadata !{i32 720942, metadata !82, null, metadata !"bar", metadata !"bar", metadata !"_ZN3barC1Ei", i32 13, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.bar*, i32)* @_ZN3barC1Ei, null, metadata !21, metadata !47, i32 0} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 0] [bar]
 !38 = metadata !{metadata !39}
 !39 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!40 = metadata !{i32 720942, metadata !82, null, metadata !"bar", metadata !"bar", metadata !"_ZN3barC2Ei", i32 13, metadata !22, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%struct.bar*, i32)* @_ZN3barC2Ei, null, metadata !21, metadata !41, i32 0} ; [ DW_TAG_subprogram ]
+!40 = metadata !{i32 720942, metadata !82, null, metadata !"bar", metadata !"bar", metadata !"_ZN3barC2Ei", i32 13, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.bar*, i32)* @_ZN3barC2Ei, null, metadata !21, metadata !47, i32 0} ; [ DW_TAG_subprogram ] [line 13] [def] [scope 0] [bar]
 !41 = metadata !{metadata !42}
 !42 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!43 = metadata !{i32 720942, metadata !82, null, metadata !"baz", metadata !"baz", metadata !"_ZN3bazC1Ei", i32 6, metadata !14, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%struct.baz*, i32)* @_ZN3bazC1Ei, null, metadata !13, metadata !44, i32 0} ; [ DW_TAG_subprogram ]
+!43 = metadata !{i32 720942, metadata !82, null, metadata !"baz", metadata !"baz", metadata !"_ZN3bazC1Ei", i32 6, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.baz*, i32)* @_ZN3bazC1Ei, null, metadata !13, metadata !47, i32 0} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [baz]
 !44 = metadata !{metadata !45}
 !45 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!46 = metadata !{i32 720942, metadata !82, null, metadata !"baz", metadata !"baz", metadata !"_ZN3bazC2Ei", i32 6, metadata !14, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%struct.baz*, i32)* @_ZN3bazC2Ei, null, metadata !13, metadata !47, i32 0} ; [ DW_TAG_subprogram ]
+!46 = metadata !{i32 720942, metadata !82, null, metadata !"baz", metadata !"baz", metadata !"_ZN3bazC2Ei", i32 6, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.baz*, i32)* @_ZN3bazC2Ei, null, metadata !13, metadata !47, i32 0} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [baz]
 !47 = metadata !{metadata !48}
 !48 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
 !49 = metadata !{i32 721153, metadata !29, metadata !"argc", metadata !6, i32 16777232, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
@@ -168,3 +169,4 @@ entry:
 !80 = metadata !{i32 6, i32 24, metadata !81, null}
 !81 = metadata !{i32 720907, metadata !82, metadata !46, i32 6, i32 23, i32 2} ; [ DW_TAG_lexical_block ]
 !82 = metadata !{metadata !"main.cpp", metadata !"/Users/echristo/tmp/bad-struct-ref"}
+!83 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/DW_AT_byte_size.ll b/test/DebugInfo/X86/DW_AT_byte_size.ll
index ef55839..87e242a 100644
--- a/test/DebugInfo/X86/DW_AT_byte_size.ll
+++ b/test/DebugInfo/X86/DW_AT_byte_size.ll
@@ -5,6 +5,7 @@
 ; CHECK: DW_TAG_pointer_type
 ; CHECK-NEXT: DW_AT_type
 ; CHECK-NOT: DW_AT_byte_size
+; CHECK: DW_TAG
 ; CHECK: .debug_info contents
 
 %struct.A = type { i32 }
@@ -23,17 +24,18 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21}
 
 !0 = metadata !{i32 786449, metadata !20, i32 4, metadata !"clang version 3.1 (trunk 150996)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !20, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooP1A", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%struct.A*)* @_Z3fooP1A, null, null, metadata !14, i32 3} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786434, metadata !20, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !12, i32 0, null, null} ; [ DW_TAG_class_type ]
+!11 = metadata !{i32 786434, metadata !20, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
 !12 = metadata !{metadata !13}
 !13 = metadata !{i32 786445, metadata !20, metadata !11, metadata !"b", i32 1, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ]
 !14 = metadata !{metadata !15}
@@ -43,3 +45,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !18 = metadata !{i32 4, i32 3, metadata !19, null}
 !19 = metadata !{i32 786443, metadata !20, metadata !5, i32 3, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
 !20 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo"}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/DW_AT_location-reference.ll b/test/DebugInfo/X86/DW_AT_location-reference.ll
index f0f4f48..bdd0e04 100644
--- a/test/DebugInfo/X86/DW_AT_location-reference.ll
+++ b/test/DebugInfo/X86/DW_AT_location-reference.ll
@@ -86,11 +86,12 @@ declare i32 @g(i32, i32)
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!24}
 
-!0 = metadata !{i32 786478, metadata !23, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @f, null, null, metadata !22, i32 4} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !23, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @f, null, null, metadata !22, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [f]
 !1 = metadata !{i32 786473, metadata !23} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !23, i32 12, metadata !"clang version 3.0 (trunk)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !21, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !23, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !23, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 786688, metadata !6, metadata !"x", metadata !1, i32 5, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
 !6 = metadata !{i32 786443, metadata !23, metadata !0, i32 4, i32 14, i32 0} ; [ DW_TAG_lexical_block ]
@@ -108,3 +109,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !21 = metadata !{metadata !0}
 !22 = metadata !{metadata !5}
 !23 = metadata !{metadata !"simple.c", metadata !"/home/rengol01/temp/tests/dwarf/relocation"}
+!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/DW_AT_object_pointer.ll b/test/DebugInfo/X86/DW_AT_object_pointer.ll
index 789f556..6e6c3a1 100644
--- a/test/DebugInfo/X86/DW_AT_object_pointer.ll
+++ b/test/DebugInfo/X86/DW_AT_object_pointer.ll
@@ -2,10 +2,12 @@
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
 
 ; CHECK: DW_TAG_formal_parameter [
+; CHECK-NOT: ""
+; CHECK: DW_TAG
 ; CHECK: DW_TAG_class_type
-; CHECK: DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x00fd => {0x000000fd})
-; CHECK: 0x000000fd:     DW_TAG_formal_parameter [13]
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp]     ( .debug_str[0x00000086] = "this")
+; CHECK: DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]})
+; CHECK: [[PARAM]]:     DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]     ( .debug_str[0x{{[0-9a-f]*}}] = "this")
 
 %class.A = type { i32 }
 
@@ -46,20 +48,21 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!38}
 
 !0 = metadata !{i32 786449, metadata !37, i32 4, metadata !"clang version 3.2 (trunk 163586) (llvm/trunk 163570)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/echristo/debug-tests/bar.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !10, metadata !20}
 !5 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3fooi, null, null, metadata !1, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [foo]
 !6 = metadata !{i32 786473, metadata !37} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 786478, metadata !6, null, metadata !"A", metadata !"A", metadata !"_ZN1AC1Ev", i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC1Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
-!11 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !13}
-!13 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
-!14 = metadata !{i32 786434, metadata !37, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [from ]
+!13 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!14 = metadata !{i32 786434, metadata !37, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
 !15 = metadata !{metadata !16, metadata !17}
 !16 = metadata !{i32 786445, metadata !37, metadata !14, metadata !"m_a", i32 4, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [m_a] [line 4, size 32, align 32, offset 0] [from int]
 !17 = metadata !{i32 786478, metadata !6, metadata !14, metadata !"A", metadata !"A", metadata !"", i32 3, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [A]
@@ -83,3 +86,4 @@ entry:
 !35 = metadata !{i32 7, i32 0, metadata !5, null}
 !36 = metadata !{i32 786689, metadata !5, metadata !"", metadata !6, i32 16777223, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 7]
 !37 = metadata !{metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests"}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/DW_AT_specification.ll b/test/DebugInfo/X86/DW_AT_specification.ll
index 93e1ecf..4d7ef4f 100644
--- a/test/DebugInfo/X86/DW_AT_specification.ll
+++ b/test/DebugInfo/X86/DW_AT_specification.ll
@@ -3,8 +3,10 @@
 
 ; test that the DW_AT_specification is a back edge in the file.
 
-; CHECK: 0x0000003a: DW_TAG_subprogram [5] *
-; CHECK: 0x00000060: DW_AT_specification [DW_FORM_ref4]      (cu + 0x003a => {0x0000003a})
+; CHECK: DW_TAG_subprogram [{{[0-9]+}}] *
+; CHECK: DW_AT_specification [DW_FORM_ref4]      (cu + 0x[[OFFSET:[0-9a-f]*]] => {0x0000[[OFFSET]]})
+; CHECK: 0x0000[[OFFSET]]: DW_TAG_subprogram [{{[0-9]+}}] *
+; CHECK: DW_AT_name [DW_FORM_strp]	( .debug_str[0x{{[0-9a-f]*}}] = "bar")
 
 
 @_ZZN3foo3barEvE1x = constant i32 0, align 4
@@ -15,16 +17,17 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!28}
 
 !0 = metadata !{i32 786449, metadata !27, i32 4, metadata !"clang version 3.0 ()", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !18,  metadata !18, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !6, null, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void ()* @_ZN3foo3barEv, null, metadata !11, metadata !16, i32 4} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 720942, metadata !6, null, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN3foo3barEv, null, metadata !11, metadata !16, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [bar]
 !6 = metadata !{i32 720937, metadata !27} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 786451, metadata !27, null, metadata !"foo", i32 1, i64 0, i64 0, i32 0, i32 4, i32 0, null, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!9 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{i32 786451, metadata !27, null, metadata !"foo", i32 1, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 0, align 0, offset 0] [decl] [from ]
 !11 = metadata !{i32 720942, metadata !6, metadata !12, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !14, i32 2} ; [ DW_TAG_subprogram ]
 !12 = metadata !{i32 720898, metadata !27, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !13, i32 0, null, null} ; [ DW_TAG_class_type ]
 !13 = metadata !{metadata !11}
@@ -39,3 +42,4 @@ entry:
 !25 = metadata !{i32 6, i32 1, metadata !26, null}
 !26 = metadata !{i32 786443, metadata !5, i32 4, i32 17, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
 !27 = metadata !{metadata !"nsNativeAppSupportBase.ii", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/toolkit/library"}
+!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll b/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll
new file mode 100644
index 0000000..0c08f23
--- /dev/null
+++ b/test/DebugInfo/X86/DW_AT_stmt_list_sec_offset.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=i686-w64-mingw32 -o %t -filetype=obj %s
+; RUN: llvm-dwarfdump -debug-dump=all %t | FileCheck %s
+
+; CHECK:  	 DW_AT_stmt_list [DW_FORM_sec_offset]
+;
+; generated from:
+; clang -g -S -emit-llvm test.c -o test.ll
+; int main()
+; {
+; 	return 0;
+; }
+
+; ModuleID = 'test.c'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
+target triple = "i686-pc-win32"
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  ret i32 0, !dbg !10
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !11}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"test.c", metadata !"C:\5CProjects"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
+!10 = metadata !{i32 3, i32 0, metadata !4, null}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/DW_TAG_friend.ll b/test/DebugInfo/X86/DW_TAG_friend.ll
index 2e23222..2da9627 100644
--- a/test/DebugInfo/X86/DW_TAG_friend.ll
+++ b/test/DebugInfo/X86/DW_TAG_friend.ll
@@ -3,10 +3,10 @@
 
 ; Check that the friend tag is there and is followed by a DW_AT_friend that has a reference back.
 
-; CHECK: 0x00000032:   DW_TAG_class_type [4]
-; CHECK: 0x00000077:   DW_TAG_class_type [4]
-; CHECK: 0x000000a0:     DW_TAG_friend [9]  
-; CHECK: 0x000000a1:       DW_AT_friend [DW_FORM_ref4]   (cu + 0x0032 => {0x00000032})
+; CHECK: [[BACK:0x[0-9a-f]*]]:   DW_TAG_class_type
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]       ( .debug_str[{{.*}}] = "A")
+; CHECK: DW_TAG_friend
+; CHECK-NEXT: DW_AT_friend [DW_FORM_ref4]   (cu + 0x{{[0-9a-f]*}} => {[[BACK]]})
 
 
 %class.A = type { i32 }
@@ -16,31 +16,33 @@
 @b = global %class.B zeroinitializer, align 4
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!29}
 
 !0 = metadata !{i32 786449, metadata !28, i32 4, metadata !"clang version 3.1 (trunk 153413) (llvm/trunk 153428)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !17}
 !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 10, metadata !7, i32 0, i32 1, %class.A* @a, null} ; [ DW_TAG_variable ]
 !6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786434, metadata !28, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null} ; [ DW_TAG_class_type ]
+!7 = metadata !{i32 786434, metadata !28, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !11}
 !9 = metadata !{i32 786445, metadata !28, metadata !7, metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 1, metadata !10} ; [ DW_TAG_member ]
 !10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !11 = metadata !{i32 786478, metadata !6, metadata !7, metadata !"A", metadata !"A", metadata !"", i32 1, metadata !12, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !15, i32 1} ; [ DW_TAG_subprogram ]
-!12 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!12 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{null, metadata !14}
-!14 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !7} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !7} ; [ DW_TAG_pointer_type ]
 !15 = metadata !{metadata !16}
 !16 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !17 = metadata !{i32 786484, i32 0, null, metadata !"b", metadata !"b", metadata !"", metadata !6, i32 11, metadata !18, i32 0, i32 1, %class.B* @b, null} ; [ DW_TAG_variable ]
-!18 = metadata !{i32 786434, metadata !28, null, metadata !"B", i32 5, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null} ; [ DW_TAG_class_type ]
+!18 = metadata !{i32 786434, metadata !28, null, metadata !"B", i32 5, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_class_type ] [B] [line 5, size 32, align 32, offset 0] [def] [from ]
 !19 = metadata !{metadata !20, metadata !21, metadata !27}
 !20 = metadata !{i32 786445, metadata !28, metadata !18, metadata !"b", i32 7, i64 32, i64 32, i64 0, i32 1, metadata !10} ; [ DW_TAG_member ]
 !21 = metadata !{i32 786478, metadata !6, metadata !18, metadata !"B", metadata !"B", metadata !"", i32 5, metadata !22, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !25, i32 5} ; [ DW_TAG_subprogram ]
-!22 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!22 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !23 = metadata !{null, metadata !24}
-!24 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !18} ; [ DW_TAG_pointer_type ]
+!24 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !18} ; [ DW_TAG_pointer_type ]
 !25 = metadata !{metadata !26}
 !26 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !27 = metadata !{i32 786474, metadata !18, null, metadata !6, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_friend ]
 !28 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo/tmp"}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/aligned_stack_var.ll b/test/DebugInfo/X86/aligned_stack_var.ll
index 5b23f64..d733dfd 100644
--- a/test/DebugInfo/X86/aligned_stack_var.ll
+++ b/test/DebugInfo/X86/aligned_stack_var.ll
@@ -25,13 +25,14 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!15}
 
 !0 = metadata !{i32 786449, metadata !14, i32 4, metadata !"clang version 3.2 (trunk 155696:155697) (llvm/trunk 155696)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !14, metadata !6, metadata !"run", metadata !"run", metadata !"_Z3runv", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3runv, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{i32 786688, metadata !10, metadata !"x", metadata !6, i32 2, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
 !10 = metadata !{i32 786443, metadata !14, metadata !5, i32 1, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
@@ -39,3 +40,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !12 = metadata !{i32 2, i32 7, metadata !10, null}
 !13 = metadata !{i32 3, i32 1, metadata !10, null}
 !14 = metadata !{metadata !"test.cc", metadata !"/home/samsonov/debuginfo"}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/arguments.ll b/test/DebugInfo/X86/arguments.ll
index 6f99f87..1d51049 100644
--- a/test/DebugInfo/X86/arguments.ll
+++ b/test/DebugInfo/X86/arguments.ll
@@ -44,6 +44,7 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!24}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"scratch.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
@@ -51,16 +52,16 @@ attributes #1 = { nounwind readnone }
 !3 = metadata !{metadata !4}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"_Z4func3fooS_", i32 6, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.foo*, %struct.foo*)* @_Z4func3fooS_, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/scratch.cpp]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8, metadata !8}
-!8 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !9, i32 0, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [from ]
+!8 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !12}
 !10 = metadata !{i32 786445, metadata !1, metadata !8, metadata !"i", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !11} ; [ DW_TAG_member ] [i] [line 3, size 32, align 32, offset 0] [from int]
 !11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{i32 786478, metadata !1, metadata !8, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !13, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !18, i32 2} ; [ DW_TAG_subprogram ] [line 2] [foo]
-!13 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{null, metadata !15, metadata !16}
-!15 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
+!15 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
 !16 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo]
 !18 = metadata !{i32 786468}
@@ -69,3 +70,4 @@ attributes #1 = { nounwind readnone }
 !21 = metadata !{i32 786689, metadata !4, metadata !"g", metadata !5, i32 33554438, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [g] [line 6]
 !22 = metadata !{i32 7, i32 0, metadata !4, null}
 !23 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/block-capture.ll b/test/DebugInfo/X86/block-capture.ll
index 0046730..2f966a7 100644
--- a/test/DebugInfo/X86/block-capture.ll
+++ b/test/DebugInfo/X86/block-capture.ll
@@ -2,10 +2,10 @@
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
 
 ; Checks that we emit debug info for the block variable declare.
-; CHECK: 0x00000030:   DW_TAG_subprogram [3]
-; CHECK: 0x0000005b:     DW_TAG_variable [5]
-; CHECK: 0x0000005c:       DW_AT_name [DW_FORM_strp]     ( .debug_str[0x000000e6] = "block")
-; CHECK: 0x00000066:       DW_AT_location [DW_FORM_data4]        (0x00000023)
+; CHECK: DW_TAG_subprogram [3]
+; CHECK: DW_TAG_variable [5]
+; CHECK: DW_AT_name [DW_FORM_strp]     ( .debug_str[{{.*}}] = "block")
+; CHECK: DW_AT_location [DW_FORM_sec_offset]        ({{.*}})
 
 %struct.__block_descriptor = type { i64, i64 }
 %struct.__block_literal_generic = type { i8*, i32, i32, i8*, %struct.__block_descriptor* }
@@ -60,18 +60,18 @@ declare void @objc_end_catch()
 declare i32 @__objc_personality_v0(...)
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!35, !36, !37, !38}
+!llvm.module.flags = !{!35, !36, !37, !38, !64}
 
 !0 = metadata !{i32 786449, metadata !63, i32 16, metadata !"clang version 3.1 (trunk 151227)", i1 false, metadata !"", i32 2, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !28, metadata !31, metadata !34}
 !5 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, metadata !26, i32 5} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !63} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
 !9 = metadata !{i32 786454, metadata !63, null, metadata !"dispatch_block_t", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ]
 !10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786451, metadata !63, metadata !6, metadata !"__block_literal_generic", i32 5, i64 256, i64 0, i32 0, i32 8, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!11 = metadata !{i32 786451, metadata !63, metadata !6, metadata !"__block_literal_generic", i32 5, i64 256, i64 0, i32 0, i32 8, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 5, size 256, align 0, offset 0] [def] [from ]
 !12 = metadata !{metadata !13, metadata !15, metadata !17, metadata !18, metadata !19}
 !13 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__isa", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_member ]
 !14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
@@ -81,7 +81,7 @@ declare i32 @__objc_personality_v0(...)
 !18 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__FuncPtr", i32 0, i64 64, i64 64, i64 128, i32 0, metadata !14} ; [ DW_TAG_member ]
 !19 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__descriptor", i32 5, i64 64, i64 64, i64 192, i32 0, metadata !20} ; [ DW_TAG_member ]
 !20 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !21} ; [ DW_TAG_pointer_type ]
-!21 = metadata !{i32 786451, metadata !63, metadata !6, metadata !"__block_descriptor", i32 5, i64 128, i64 0, i32 0, i32 8, null, metadata !22, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!21 = metadata !{i32 786451, metadata !63, metadata !6, metadata !"__block_descriptor", i32 5, i64 128, i64 0, i32 0, i32 8, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 5, size 128, align 0, offset 0] [def] [from ]
 !22 = metadata !{metadata !23, metadata !25}
 !23 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"reserved", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_member ]
 !24 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
@@ -89,10 +89,10 @@ declare i32 @__objc_personality_v0(...)
 !26 = metadata !{metadata !27}
 !27 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !28 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"__foo_block_invoke_0", metadata !"__foo_block_invoke_0", metadata !"", i32 7, metadata !29, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*)* @__foo_block_invoke_0, null, null, metadata !26, i32 7} ; [ DW_TAG_subprogram ]
-!29 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !30, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!29 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !30 = metadata !{null, metadata !14}
 !31 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"__copy_helper_block_", metadata !"__copy_helper_block_", metadata !"", i32 10, metadata !32, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, metadata !26, i32 10} ; [ DW_TAG_subprogram ]
-!32 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !33, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!32 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !33, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !33 = metadata !{null, metadata !14, metadata !14}
 !34 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"__destroy_helper_block_", metadata !"__destroy_helper_block_", metadata !"", i32 10, metadata !29, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, metadata !26, i32 10} ; [ DW_TAG_subprogram ]
 !35 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
@@ -101,7 +101,7 @@ declare i32 @__objc_personality_v0(...)
 !38 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
 !39 = metadata !{i32 786689, metadata !28, metadata !".block_descriptor", metadata !6, i32 16777223, metadata !40, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
 !40 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !41} ; [ DW_TAG_pointer_type ]
-!41 = metadata !{i32 786451, metadata !63, metadata !6, metadata !"__block_literal_1", i32 7, i64 320, i64 64, i32 0, i32 0, null, metadata !42, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!41 = metadata !{i32 786451, metadata !63, metadata !6, metadata !"__block_literal_1", i32 7, i64 320, i64 64, i32 0, i32 0, null, metadata !42, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 7, size 320, align 64, offset 0] [def] [from ]
 !42 = metadata !{metadata !43, metadata !44, metadata !45, metadata !46, metadata !47, metadata !50}
 !43 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__isa", i32 7, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_member ]
 !44 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__flags", i32 7, i64 32, i64 32, i64 64, i32 0, metadata !16} ; [ DW_TAG_member ]
@@ -109,7 +109,7 @@ declare i32 @__objc_personality_v0(...)
 !46 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__FuncPtr", i32 7, i64 64, i64 64, i64 128, i32 0, metadata !14} ; [ DW_TAG_member ]
 !47 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"__descriptor", i32 7, i64 64, i64 64, i64 192, i32 0, metadata !48} ; [ DW_TAG_member ]
 !48 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !49} ; [ DW_TAG_pointer_type ]
-!49 = metadata !{i32 786451, metadata !63, null, metadata !"__block_descriptor_withcopydispose", i32 7, i32 0, i32 0, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_structure_type ]
+!49 = metadata !{i32 786451, metadata !63, null, metadata !"__block_descriptor_withcopydispose", i32 7, i32 0, i32 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 7, size 0, align 0, offset 0] [decl] [from ]
 !50 = metadata !{i32 786445, metadata !63, metadata !6, metadata !"block", i32 7, i64 64, i64 64, i64 256, i32 0, metadata !9} ; [ DW_TAG_member ]
 !51 = metadata !{i32 7, i32 18, metadata !28, null}
 !52 = metadata !{i32 7, i32 19, metadata !28, null}
@@ -124,3 +124,4 @@ declare i32 @__objc_personality_v0(...)
 !61 = metadata !{i32 10, i32 21, metadata !28, null}
 !62 = metadata !{i32 9, i32 20, metadata !56, null}
 !63 = metadata !{metadata !"foo.m", metadata !"/Users/echristo"}
+!64 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/byvalstruct.ll b/test/DebugInfo/X86/byvalstruct.ll
index 7d42c9f..3dea863 100644
--- a/test/DebugInfo/X86/byvalstruct.ll
+++ b/test/DebugInfo/X86/byvalstruct.ll
@@ -84,25 +84,25 @@ attributes #0 = { ssp uwtable }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!24, !25, !26, !27}
+!llvm.module.flags = !{!24, !25, !26, !27, !38}
 
 !0 = metadata !{i32 786449, metadata !1, i32 17, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 2, metadata !2, metadata !3, metadata !6, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/t.mm] [DW_LANG_ObjC_plus_plus]
 !1 = metadata !{metadata !"t.mm", metadata !""}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !1, metadata !5, metadata !"Bitmap", i32 8, i64 8, i64 8, i32 0, i32 512, null, metadata !2, i32 17, null, null} ; [ DW_TAG_structure_type ] [Bitmap] [line 8, size 8, align 8, offset 0] [from ]
+!4 = metadata !{i32 786451, metadata !1, metadata !5, metadata !"Bitmap", i32 8, i64 8, i64 8, i32 0, i32 512, null, metadata !2, i32 17, null, null, null} ; [ DW_TAG_structure_type ] [Bitmap] [line 8, size 8, align 8, offset 0] [def] [from ]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/t.mm]
 !6 = metadata !{metadata !7}
 !7 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"-[Bitmap initWithCopy:andInfo:andLength:]", metadata !"-[Bitmap initWithCopy:andInfo:andLength:]", metadata !"", i32 9, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, i8* (%0*, i8*, %0*, %struct.ImageInfo*, i64)* @"\01-[Bitmap initWithCopy:andInfo:andLength:]", null, null, metadata !2, i32 9} ; [ DW_TAG_subprogram ] [line 9] [local] [def] [-[Bitmap initWithCopy:andInfo:andLength:]]
-!8 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !9 = metadata !{metadata !4, metadata !10, metadata !11, metadata !14, metadata !15, metadata !19}
-!10 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Bitmap]
-!11 = metadata !{i32 786454, metadata !1, i32 0, metadata !"SEL", i32 9, i64 0, i64 0, i64 0, i32 64, metadata !12} ; [ DW_TAG_typedef ] [SEL] [line 9, size 0, align 0, offset 0] [artificial] [from ]
+!10 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Bitmap]
+!11 = metadata !{i32 786454, metadata !1, null, metadata !"SEL", i32 9, i64 0, i64 0, i64 0, i32 64, metadata !12} ; [ DW_TAG_typedef ] [SEL] [line 9, size 0, align 0, offset 0] [artificial] [from ]
 !12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
-!13 = metadata !{i32 786451, metadata !1, null, metadata !"objc_selector", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [fwd] [from ]
+!13 = metadata !{i32 786451, metadata !1, null, metadata !"objc_selector", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
 !14 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Bitmap]
 !15 = metadata !{i32 786454, metadata !1, null, metadata !"ImageInfo", i32 7, i64 0, i64 0, i64 0, i32 0, metadata !16} ; [ DW_TAG_typedef ] [ImageInfo] [line 7, size 0, align 0, offset 0] [from ]
-!16 = metadata !{i32 786451, metadata !1, null, metadata !"", i32 2, i64 192, i64 64, i32 0, i32 0, null, metadata !17, i32 0, null, null} ; [ DW_TAG_structure_type ] [line 2, size 192, align 64, offset 0] [from ]
+!16 = metadata !{i32 786451, metadata !1, null, metadata !"", i32 2, i64 192, i64 64, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 2, size 192, align 64, offset 0] [def] [from ]
 !17 = metadata !{metadata !18, metadata !21, metadata !22}
 !18 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"width", i32 4, i64 64, i64 64, i64 0, i32 0, metadata !19} ; [ DW_TAG_member ] [width] [line 4, size 64, align 64, offset 0] [from NSUInteger]
 !19 = metadata !{i32 786454, metadata !1, null, metadata !"NSUInteger", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_typedef ] [NSUInteger] [line 1, size 0, align 0, offset 0] [from long unsigned int]
@@ -124,3 +124,4 @@ attributes #1 = { nounwind readnone }
 !35 = metadata !{i32 786689, metadata !7, metadata !"length", metadata !5, i32 83886091, metadata !19, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [length] [line 11]
 !36 = metadata !{i32 11, i32 0, metadata !7, null}
 !37 = metadata !{i32 13, i32 0, metadata !7, null}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/coff_relative_names.ll b/test/DebugInfo/X86/coff_relative_names.ll
index 8e46e0b..4cc38a6 100644
--- a/test/DebugInfo/X86/coff_relative_names.ll
+++ b/test/DebugInfo/X86/coff_relative_names.ll
@@ -1,40 +1,41 @@
-; RUN: llc -mtriple=i686-w64-mingw32 -filetype=asm -O0 < %s | FileCheck %s
-
-; CHECK:  	.secrel32 Linfo_string0
-; CHECK:  	.secrel32 Linfo_string1
-;
-; generated from:
-; clang -g -S -emit-llvm test.c -o test.ll
-; int main()
-; {
-; 	return 0;
-; }
-
-; ModuleID = 'test.c'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
-target triple = "i686-pc-win32"
-
-; Function Attrs: nounwind
-define i32 @main() #0 {
-entry:
-  %retval = alloca i32, align 4
-  store i32 0, i32* %retval
-  ret i32 0, !dbg !10
-}
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!9}
-
-!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
-!1 = metadata !{metadata !"test.c", metadata !"C:\5CProjects"}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!10 = metadata !{i32 3, i32 0, metadata !4, null}
+; RUN: llc -mtriple=i686-w64-mingw32 -filetype=asm -O0 < %s | FileCheck %s
+
+; CHECK:  	.secrel32 Linfo_string0
+; CHECK:  	.secrel32 Linfo_string1
+;
+; generated from:
+; clang -g -S -emit-llvm test.c -o test.ll
+; int main()
+; {
+; 	return 0;
+; }
+
+; ModuleID = 'test.c'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
+target triple = "i686-pc-win32"
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  ret i32 0, !dbg !10
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !11}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [C:\Projects/test.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"test.c", metadata !"C:\5CProjects"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [main]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [C:\Projects/test.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
+!10 = metadata !{i32 3, i32 0, metadata !4, null}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/concrete_out_of_line.ll b/test/DebugInfo/X86/concrete_out_of_line.ll
index 3b9aefc..4a15296 100644
--- a/test/DebugInfo/X86/concrete_out_of_line.ll
+++ b/test/DebugInfo/X86/concrete_out_of_line.ll
@@ -7,15 +7,15 @@
 ; first check that we have a TAG_subprogram at a given offset and it has
 ; AT_inline.
 
-; CHECK: 0x0000011e:   DW_TAG_subprogram [18]
+; CHECK: 0x0000011c:   DW_TAG_subprogram [17]
 ; CHECK-NEXT:     DW_AT_specification
 ; CHECK-NEXT:     DW_AT_inline
 
 
 ; and then that a TAG_subprogram refers to it with AT_abstract_origin.
 
-; CHECK: 0x0000015f:   DW_TAG_subprogram [20]
-; CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4]    (cu + 0x011e => {0x0000011e})
+; CHECK: 0x0000015d:   DW_TAG_subprogram [19]
+; CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4]    (cu + 0x011c => {0x0000011c})
 
 define i32 @_ZN17nsAutoRefCnt7ReleaseEv() {
 entry:
@@ -33,43 +33,44 @@ entry:
 declare void @_Z8moz_freePv(i8*)
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!60}
 
 !0 = metadata !{i32 786449, metadata !59, i32 4, metadata !"clang version 3.1 ()", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !47,  metadata !47, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !23, metadata !27, metadata !31}
-!5 = metadata !{i32 720942, metadata !6, null, metadata !"Release", metadata !"Release", metadata !"_ZN17nsAutoRefCnt7ReleaseEv", i32 14, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32* null, null, metadata !12, metadata !20, i32 14} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 720942, metadata !6, null, metadata !"Release", metadata !"Release", metadata !"_ZN17nsAutoRefCnt7ReleaseEv", i32 14, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32* null, null, metadata !12, metadata !20, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [Release]
 !6 = metadata !{i32 720937, metadata !59} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10}
 !9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786451, metadata !59, null, metadata !"nsAutoRefCnt", i32 10, i64 0, i64 0, i32 0, i32 4, i32 0, null, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!10 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{i32 786451, metadata !59, null, metadata !"nsAutoRefCnt", i32 10, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [nsAutoRefCnt] [line 10, size 0, align 0, offset 0] [decl] [from ]
 !12 = metadata !{i32 720942, metadata !6, metadata !13, metadata !"Release", metadata !"Release", metadata !"_ZN17nsAutoRefCnt7ReleaseEv", i32 11, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 11} ; [ DW_TAG_subprogram ]
-!13 = metadata !{i32 720898, metadata !59, null, metadata !"nsAutoRefCnt", i32 10, i64 8, i64 8, i32 0, i32 0, null, metadata !14, i32 0, null, null} ; [ DW_TAG_class_type ]
+!13 = metadata !{i32 720898, metadata !59, null, metadata !"nsAutoRefCnt", i32 10, i64 8, i64 8, i32 0, i32 0, null, metadata !14, null, null, null} ; [ DW_TAG_class_type ]
 !14 = metadata !{metadata !12, metadata !15}
 !15 = metadata !{i32 720942, metadata !6, metadata !13, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"", i32 12, metadata !16, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 12} ; [ DW_TAG_subprogram ]
-!16 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!16 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null, metadata !10}
 !18 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
 !20 = metadata !{metadata !22}
 !22 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777230, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!23 = metadata !{i32 720942, metadata !6, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD1Ev", i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32* null, null, metadata !15, metadata !24, i32 18} ; [ DW_TAG_subprogram ]
+!23 = metadata !{i32 720942, metadata !6, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD1Ev", i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32* null, null, metadata !15, metadata !24, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
 !24 = metadata !{metadata !26}
 !26 = metadata !{i32 786689, metadata !23, metadata !"this", metadata !6, i32 16777234, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!27 = metadata !{i32 720942, metadata !6, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD2Ev", i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32* null, null, metadata !15, metadata !28, i32 18} ; [ DW_TAG_subprogram ]
+!27 = metadata !{i32 720942, metadata !6, null, metadata !"~nsAutoRefCnt", metadata !"~nsAutoRefCnt", metadata !"_ZN17nsAutoRefCntD2Ev", i32 18, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32* null, null, metadata !15, metadata !28, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [~nsAutoRefCnt]
 !28 = metadata !{metadata !30}
 !30 = metadata !{i32 786689, metadata !27, metadata !"this", metadata !6, i32 16777234, metadata !10, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!31 = metadata !{i32 720942, metadata !6, null, metadata !"operator=", metadata !"operator=", metadata !"_ZN12nsAutoRefCntaSEi", i32 4, metadata !32, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null, metadata !36, metadata !43, i32 4} ; [ DW_TAG_subprogram ]
-!32 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !33, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!31 = metadata !{i32 720942, metadata !6, null, metadata !"operator=", metadata !"operator=", metadata !"_ZN12nsAutoRefCntaSEi", i32 4, metadata !32, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !36, metadata !43, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [operator=]
+!32 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !33, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !33 = metadata !{metadata !9, metadata !34, metadata !9}
-!34 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !35} ; [ DW_TAG_pointer_type ]
-!35 = metadata !{i32 786451, metadata !59, null, metadata !"nsAutoRefCnt", i32 2, i64 0, i64 0, i32 0, i32 4, i32 0, null, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!34 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !35} ; [ DW_TAG_pointer_type ]
+!35 = metadata !{i32 786451, metadata !59, null, metadata !"nsAutoRefCnt", i32 2, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [nsAutoRefCnt] [line 2, size 0, align 0, offset 0] [decl] [from ]
 !36 = metadata !{i32 720942, metadata !6, metadata !37, metadata !"operator=", metadata !"operator=", metadata !"_ZN12nsAutoRefCntaSEi", i32 4, metadata !32, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 4} ; [ DW_TAG_subprogram ]
-!37 = metadata !{i32 720898, metadata !59, null, metadata !"nsAutoRefCnt", i32 2, i64 32, i64 32, i32 0, i32 0, null, metadata !38, i32 0, null, null} ; [ DW_TAG_class_type ]
+!37 = metadata !{i32 720898, metadata !59, null, metadata !"nsAutoRefCnt", i32 2, i64 32, i64 32, i32 0, i32 0, null, metadata !38, i32 0, null, null, null} ; [ DW_TAG_class_type ] [nsAutoRefCnt] [line 2, size 32, align 32, offset 0] [def] [from ]
 !38 = metadata !{metadata !39, metadata !40, metadata !36}
 !39 = metadata !{i32 786445, metadata !59, metadata !37, metadata !"mValue", i32 7, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ]
 !40 = metadata !{i32 720942, metadata !6, metadata !37, metadata !"nsAutoRefCnt", metadata !"nsAutoRefCnt", metadata !"", i32 3, metadata !41, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ]
-!41 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !42, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!41 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !42, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !42 = metadata !{null, metadata !34}
 !43 = metadata !{metadata !45, metadata !46}
 !45 = metadata !{i32 786689, metadata !31, metadata !"this", metadata !6, i32 16777220, metadata !34, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
@@ -86,3 +87,4 @@ declare void @_Z8moz_freePv(i8*)
 !57 = metadata !{i32 19, i32 3, metadata !55, metadata !58}
 !58 = metadata !{i32 18, i32 41, metadata !23, null}
 !59 = metadata !{metadata !"nsAutoRefCnt.ii", metadata !"/Users/espindola/mozilla-central/obj-x86_64-apple-darwin11.2.0/netwerk/base/src"}
+!60 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/data_member_location.ll b/test/DebugInfo/X86/data_member_location.ll
new file mode 100644
index 0000000..1adddb9
--- /dev/null
+++ b/test/DebugInfo/X86/data_member_location.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=x86_64-linux -O0 -o - -filetype=obj < %s | llvm-dwarfdump -debug-dump=info -| FileCheck %s
+
+; Generated from Clang with the following source:
+;
+; struct foo {
+;   char c;
+;   int i;
+; };
+; 
+; foo f;
+
+; CHECK: DW_AT_name {{.*}} "c"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_data_member_location {{.*}} (0x00)
+
+; CHECK: DW_AT_name {{.*}} "i"
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_data_member_location {{.*}} (0x04)
+
+%struct.foo = type { i8, i32 }
+
+@f = global %struct.foo zeroinitializer, align 4
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!13, !15}
+!llvm.ident = !{!14}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !2, metadata !10, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/data_member_location.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"data_member_location.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 1, i64 64, i64 32, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !6, metadata !8}
+!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS3foo", metadata !"c", i32 2, i64 8, i64 8, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [c] [line 2, size 8, align 8, offset 0] [from char]
+!7 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!8 = metadata !{i32 786445, metadata !1, metadata !"_ZTS3foo", metadata !"i", i32 3, i64 32, i64 32, i64 32, i32 0, metadata !9} ; [ DW_TAG_member ] [i] [line 3, size 32, align 32, offset 32] [from int]
+!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{metadata !11}
+!11 = metadata !{i32 786484, i32 0, null, metadata !"f", metadata !"f", metadata !"", metadata !12, i32 6, metadata !4, i32 0, i32 1, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
+!12 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/data_member_location.cpp]
+!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!14 = metadata !{metadata !"clang version 3.4 "}
+
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-at-specficiation.ll b/test/DebugInfo/X86/dbg-at-specficiation.ll
new file mode 100644
index 0000000..8003a0f
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-at-specficiation.ll
@@ -0,0 +1,21 @@
+; RUN: llc  < %s | FileCheck %s
+; Radar 10147769
+; Do not unnecessarily use AT_specification DIE.
+; CHECK-NOT: AT_specification
+
+@a = common global [10 x i32] zeroinitializer, align 16
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12}
+
+!0 = metadata !{i32 720913, metadata !11, i32 12, metadata !"clang version 3.0 (trunk 140253)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, null, i32 0} ; [ DW_TAG_compile_unit ]
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !5}
+!5 = metadata !{i32 720948, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, [10 x i32]* @a, null} ; [ DW_TAG_variable ]
+!6 = metadata !{i32 720937, metadata !11} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 720897, null, null, null, i32 0, i64 320, i64 32, i32 0, i32 0, metadata !8, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
+!8 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!9 = metadata !{metadata !10}
+!10 = metadata !{i32 720929, i64 0, i64 10}        ; [ DW_TAG_subrange_type ]
+!11 = metadata !{metadata !"x.c", metadata !"/private/tmp"}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-byval-parameter.ll b/test/DebugInfo/X86/dbg-byval-parameter.ll
new file mode 100644
index 0000000..d66486d
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-byval-parameter.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=x86 -asm-verbose < %s | grep DW_TAG_formal_parameter
+
+
+%struct.Pt = type { double, double }
+%struct.Rect = type { %struct.Pt, %struct.Pt }
+
+define double @foo(%struct.Rect* byval %my_r0) nounwind ssp {
+entry:
+  %retval = alloca double                         ; <double*> [#uses=2]
+  %0 = alloca double                              ; <double*> [#uses=2]
+  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
+  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0), !dbg !15
+  %1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1]
+  %2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1]
+  %3 = load double* %2, align 8, !dbg !16         ; <double> [#uses=1]
+  store double %3, double* %0, align 8, !dbg !16
+  %4 = load double* %0, align 8, !dbg !16         ; <double> [#uses=1]
+  store double %4, double* %retval, align 8, !dbg !16
+  br label %return, !dbg !16
+
+return:                                           ; preds = %entry
+  %retval1 = load double* %retval, !dbg !16       ; <double> [#uses=1]
+  ret double %retval1, !dbg !16
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!21}
+
+!0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
+!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{metadata !6, metadata !7}
+!6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
+!8 = metadata !{metadata !9, metadata !14}
+!9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
+!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
+!11 = metadata !{metadata !12, metadata !13}
+!12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
+!13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
+!14 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P2", i32 8, i64 128, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ]
+!15 = metadata !{i32 11, i32 0, metadata !1, null}
+!16 = metadata !{i32 12, i32 0, metadata !17, null}
+!17 = metadata !{i32 786443, metadata !19, metadata !1, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{metadata !1}
+!19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
+!20 = metadata !{i32 0}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-const-int.ll b/test/DebugInfo/X86/dbg-const-int.ll
new file mode 100644
index 0000000..f2f51c9
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-const-int.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=x86_64-apple-darwin12 -filetype=obj %s -o %t
+; RUN: llvm-dwarfdump %t | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.7"
+; Radar 9511391
+
+; CHECK: DW_TAG_variable
+; CHECK: "i"
+; CHECK: DW_AT_const_value [DW_FORM_sdata]   (42)
+
+define i32 @foo() nounwind uwtable readnone optsize ssp {
+entry:
+  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
+  ret i32 42, !dbg !10
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!15}
+
+!0 = metadata !{i32 786449, metadata !13, i32 12, metadata !"clang version 3.0 (trunk 132191)", i1 true, metadata !"", i32 0, metadata !14, metadata !14, metadata !11, null,  null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 786478, metadata !13, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @foo, null, null, metadata !12, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
+!2 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 786453, metadata !13, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 786688, metadata !7, metadata !"i", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
+!7 = metadata !{i32 786443, metadata !13, metadata !1, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
+!8 = metadata !{i32 42}
+!9 = metadata !{i32 2, i32 12, metadata !7, null}
+!10 = metadata !{i32 3, i32 2, metadata !7, null}
+!11 = metadata !{metadata !1}
+!12 = metadata !{metadata !6}
+!13 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
+!14 = metadata !{i32 0}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-const.ll b/test/DebugInfo/X86/dbg-const.ll
new file mode 100644
index 0000000..12dc154
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-const.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s - | FileCheck %s
+;
+; FIXME: A potentially more interesting test case would be:
+; %call = @bar()
+; dbg.value j=0
+; %call2 = @bar()
+; dbg.value j=%call
+;
+; We cannot current handle the above sequence because codegenprepare
+; hoists the second dbg.value above %call2, which then appears to
+; conflict with j=0. It does this because SelectionDAG cannot handle
+; global debug values.
+
+target triple = "x86_64-apple-darwin10.0.0"
+
+;CHECK:        ## DW_OP_constu
+;CHECK-NEXT:  .byte	42
+define i32 @foobar() nounwind readonly noinline ssp {
+entry:
+  tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
+  %call = tail call i32 @bar(), !dbg !11
+  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !6), !dbg !11
+  %call2 = tail call i32 @bar(), !dbg !11
+  %add = add nsw i32 %call2, %call, !dbg !12
+  ret i32 %add, !dbg !10
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare i32 @bar() nounwind readnone
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!17}
+
+!0 = metadata !{i32 786478, metadata !15, metadata !1, metadata !"foobar", metadata !"foobar", metadata !"foobar", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @foobar, null, null, metadata !14, i32 0} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114183)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 786453, metadata !15, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786468, metadata !15, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!6 = metadata !{i32 786688, metadata !7, metadata !"j", metadata !1, i32 15, metadata !5, i32 0, null}
+!7 = metadata !{i32 786443, metadata !15, metadata !0, i32 12, i32 52, i32 0} ; [ DW_TAG_lexical_block ]
+!8 = metadata !{i32 42}
+!9 = metadata !{i32 15, i32 12, metadata !7, null}
+!10 = metadata !{i32 23, i32 3, metadata !7, null}
+!11 = metadata !{i32 17, i32 3, metadata !7, null}
+!12 = metadata !{i32 18, i32 3, metadata !7, null}
+!13 = metadata !{metadata !0}
+!14 = metadata !{metadata !6}
+!15 = metadata !{metadata !"mu.c", metadata !"/private/tmp"}
+!16 = metadata !{i32 0}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-declare-arg.ll b/test/DebugInfo/X86/dbg-declare-arg.ll
new file mode 100644
index 0000000..7bf6f4f
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-declare-arg.ll
@@ -0,0 +1,127 @@
+; RUN: llc -O0 -fast-isel=false < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.7"
+;Radar 9321650
+
+;CHECK: ##DEBUG_VALUE: my_a 
+
+%class.A = type { i32, i32, i32, i32 }
+
+define void @_Z3fooi(%class.A* sret %agg.result, i32 %i) ssp {
+entry:
+  %i.addr = alloca i32, align 4
+  %j = alloca i32, align 4
+  %nrvo = alloca i1
+  %cleanup.dest.slot = alloca i32
+  store i32 %i, i32* %i.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !26), !dbg !27
+  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !28), !dbg !30
+  store i32 0, i32* %j, align 4, !dbg !31
+  %tmp = load i32* %i.addr, align 4, !dbg !32
+  %cmp = icmp eq i32 %tmp, 42, !dbg !32
+  br i1 %cmp, label %if.then, label %if.end, !dbg !32
+
+if.then:                                          ; preds = %entry
+  %tmp1 = load i32* %i.addr, align 4, !dbg !33
+  %add = add nsw i32 %tmp1, 1, !dbg !33
+  store i32 %add, i32* %j, align 4, !dbg !33
+  br label %if.end, !dbg !35
+
+if.end:                                           ; preds = %if.then, %entry
+  store i1 false, i1* %nrvo, !dbg !36
+  call void @llvm.dbg.declare(metadata !{%class.A* %agg.result}, metadata !37), !dbg !39
+  %tmp2 = load i32* %j, align 4, !dbg !40
+  %x = getelementptr inbounds %class.A* %agg.result, i32 0, i32 0, !dbg !40
+  store i32 %tmp2, i32* %x, align 4, !dbg !40
+  store i1 true, i1* %nrvo, !dbg !41
+  store i32 1, i32* %cleanup.dest.slot
+  %nrvo.val = load i1* %nrvo, !dbg !42
+  br i1 %nrvo.val, label %nrvo.skipdtor, label %nrvo.unused, !dbg !42
+
+nrvo.unused:                                      ; preds = %if.end
+  call void @_ZN1AD1Ev(%class.A* %agg.result), !dbg !42
+  br label %nrvo.skipdtor, !dbg !42
+
+nrvo.skipdtor:                                    ; preds = %nrvo.unused, %if.end
+  ret void, !dbg !42
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define linkonce_odr void @_ZN1AD1Ev(%class.A* %this) unnamed_addr ssp align 2 {
+entry:
+  %this.addr = alloca %class.A*, align 8
+  store %class.A* %this, %class.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !43), !dbg !44
+  %this1 = load %class.A** %this.addr
+  call void @_ZN1AD2Ev(%class.A* %this1)
+  ret void, !dbg !45
+}
+
+define linkonce_odr void @_ZN1AD2Ev(%class.A* %this) unnamed_addr nounwind ssp align 2 {
+entry:
+  %this.addr = alloca %class.A*, align 8
+  store %class.A* %this, %class.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !46), !dbg !47
+  %this1 = load %class.A** %this.addr
+  %x = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !48
+  store i32 1, i32* %x, align 4, !dbg !48
+  ret void, !dbg !48
+}
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!52}
+
+!0 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"~A", metadata !"~A", metadata !"", i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589826, metadata !51, metadata !2, metadata !"A", i32 2, i64 128, i64 32, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 2, size 128, align 32, offset 0] [def] [from ]
+!2 = metadata !{i32 786449, metadata !51, i32 4, metadata !"clang version 3.0 (trunk 130127)", i1 false, metadata !"", i32 0, metadata !24, metadata !24, metadata !50, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 786473, metadata !51} ; [ DW_TAG_file_type ]
+!4 = metadata !{metadata !5, metadata !7, metadata !8, metadata !9, metadata !0, metadata !10, metadata !14}
+!5 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
+!6 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !6} ; [ DW_TAG_member ]
+!8 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"z", i32 2, i64 32, i64 32, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
+!9 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"o", i32 2, i64 32, i64 32, i64 96, i32 0, metadata !6} ; [ DW_TAG_member ]
+!10 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"A", metadata !"A", metadata !"", i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!11 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{null, metadata !13}
+!13 = metadata !{i32 786447, metadata !2, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"A", metadata !"A", metadata !"", i32 2, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!15 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{null, metadata !13, metadata !17}
+!17 = metadata !{i32 589840, null, metadata !2, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_reference_type ]
+!18 = metadata !{i32 786470, metadata !2, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !1} ; [ DW_TAG_const_type ]
+!19 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", i32 4, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*, i32)* @_Z3fooi, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [foo]
+!20 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = metadata !{metadata !1}
+!22 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"~A", metadata !"~A", metadata !"_ZN1AD1Ev", i32 2, metadata !23, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AD1Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [~A]
+!23 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !24, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{null}
+!25 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"~A", metadata !"~A", metadata !"_ZN1AD2Ev", i32 2, metadata !23, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AD2Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [~A]
+!26 = metadata !{i32 786689, metadata !19, metadata !"i", metadata !3, i32 16777220, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
+!27 = metadata !{i32 4, i32 11, metadata !19, null}
+!28 = metadata !{i32 786688, metadata !29, metadata !"j", metadata !3, i32 5, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!29 = metadata !{i32 786443, metadata !51, metadata !19, i32 4, i32 14, i32 0} ; [ DW_TAG_lexical_block ]
+!30 = metadata !{i32 5, i32 7, metadata !29, null}
+!31 = metadata !{i32 5, i32 12, metadata !29, null}
+!32 = metadata !{i32 6, i32 3, metadata !29, null}
+!33 = metadata !{i32 7, i32 5, metadata !34, null}
+!34 = metadata !{i32 786443, metadata !51, metadata !29, i32 6, i32 16, i32 1} ; [ DW_TAG_lexical_block ]
+!35 = metadata !{i32 8, i32 3, metadata !34, null}
+!36 = metadata !{i32 9, i32 9, metadata !29, null}
+!37 = metadata !{i32 786688, metadata !29, metadata !"my_a", metadata !3, i32 9, metadata !38, i32 0, null} ; [ DW_TAG_auto_variable ]
+!38 = metadata !{i32 589840, metadata !2, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ]
+!39 = metadata !{i32 9, i32 5, metadata !29, null}
+!40 = metadata !{i32 10, i32 3, metadata !29, null}
+!41 = metadata !{i32 11, i32 3, metadata !29, null}
+!42 = metadata !{i32 12, i32 1, metadata !29, null}
+!43 = metadata !{i32 786689, metadata !22, metadata !"this", metadata !3, i32 16777218, metadata !13, i32 64, null} ; [ DW_TAG_arg_variable ]
+!44 = metadata !{i32 2, i32 47, metadata !22, null}
+!45 = metadata !{i32 2, i32 61, metadata !22, null}
+!46 = metadata !{i32 786689, metadata !25, metadata !"this", metadata !3, i32 16777218, metadata !13, i32 64, null} ; [ DW_TAG_arg_variable ]
+!47 = metadata !{i32 2, i32 47, metadata !25, null}
+!48 = metadata !{i32 2, i32 54, metadata !49, null}
+!49 = metadata !{i32 786443, metadata !51, metadata !25, i32 2, i32 52, i32 2} ; [ DW_TAG_lexical_block ]
+!50 = metadata !{metadata !0, metadata !10, metadata !14, metadata !19, metadata !22, metadata !25}
+!51 = metadata !{metadata !"a.cc", metadata !"/private/tmp"}
+!52 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-declare.ll b/test/DebugInfo/X86/dbg-declare.ll
new file mode 100644
index 0000000..988d0bc
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-declare.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -O0 -mtriple x86_64-apple-darwin
+; <rdar://problem/11134152>
+
+define i32 @foo(i32* %x) nounwind uwtable ssp {
+entry:
+  %x.addr = alloca i32*, align 8
+  %saved_stack = alloca i8*
+  %cleanup.dest.slot = alloca i32
+  store i32* %x, i32** %x.addr, align 8
+  call void @llvm.dbg.declare(metadata !{i32** %x.addr}, metadata !14), !dbg !15
+  %0 = load i32** %x.addr, align 8, !dbg !16
+  %1 = load i32* %0, align 4, !dbg !16
+  %2 = zext i32 %1 to i64, !dbg !16
+  %3 = call i8* @llvm.stacksave(), !dbg !16
+  store i8* %3, i8** %saved_stack, !dbg !16
+  %vla = alloca i8, i64 %2, align 16, !dbg !16
+  call void @llvm.dbg.declare(metadata !{i8* %vla}, metadata !18), !dbg !23
+  store i32 1, i32* %cleanup.dest.slot
+  %4 = load i8** %saved_stack, !dbg !24
+  call void @llvm.stackrestore(i8* %4), !dbg !24
+  ret i32 0, !dbg !25
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+declare i8* @llvm.stacksave() nounwind
+
+declare void @llvm.stackrestore(i8*) nounwind
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!27}
+
+!0 = metadata !{i32 786449, metadata !26, i32 12, metadata !"clang version 3.1 (trunk 153698)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 0}
+!3 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, metadata !26, metadata !0, metadata !"foo", metadata !"foo", metadata !"", i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32*)* @foo, null, null, metadata !12, i32 0} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9, metadata !10}
+!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!10 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{i32 786470, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_const_type ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
+!14 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !6, i32 16777221, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{i32 5, i32 21, metadata !5, null}
+!16 = metadata !{i32 7, i32 13, metadata !17, null}
+!17 = metadata !{i32 786443, metadata !26, metadata !5, i32 6, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{i32 786688, metadata !17, metadata !"a", metadata !6, i32 7, metadata !19, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
+!19 = metadata !{i32 786433, null, null, null, i32 0, i64 0, i64 8, i32 0, i32 0, metadata !20, metadata !21, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 8, offset 0] [from char]
+!20 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!21 = metadata !{metadata !22}
+!22 = metadata !{i32 786465, i64 0, i64 -1}        ; [ DW_TAG_subrange_type ]
+!23 = metadata !{i32 7, i32 8, metadata !17, null}
+!24 = metadata !{i32 9, i32 1, metadata !17, null}
+!25 = metadata !{i32 8, i32 3, metadata !17, null}
+!26 = metadata !{metadata !"20020104-2.c", metadata !"/Volumes/Sandbox/llvm"}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-file-name.ll b/test/DebugInfo/X86/dbg-file-name.ll
new file mode 100644
index 0000000..e9c61c1
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-file-name.ll
@@ -0,0 +1,24 @@
+; RUN: llc -enable-dwarf-directory -mtriple x86_64-apple-darwin10.0.0  < %s | FileCheck %s
+
+; Radar 8884898
+; CHECK: file	1 "simple.c"
+
+declare i32 @printf(i8*, ...) nounwind
+
+define i32 @main() nounwind {
+  ret i32 0
+}
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!12}
+
+!1 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 786449, metadata !10, i32 1, metadata !"LLVM build 00", i1 true, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!5 = metadata !{i32 786468, metadata !10, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 786478, metadata !10, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 9, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!7 = metadata !{i32 786453, metadata !10, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !5}
+!9 = metadata !{metadata !6}
+!10 = metadata !{metadata !"simple.c", metadata !"/Users/manav/one/two"}
+!11 = metadata !{i32 0}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-i128-const.ll b/test/DebugInfo/X86/dbg-i128-const.ll
new file mode 100644
index 0000000..01b105f
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-i128-const.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+
+; CHECK: DW_AT_const_value
+; CHECK-NEXT: 42
+
+define i128 @__foo(i128 %a, i128 %b) nounwind {
+entry:
+  tail call void @llvm.dbg.value(metadata !0, i64 0, metadata !1), !dbg !11
+  %add = add i128 %a, %b, !dbg !11
+  ret i128 %add, !dbg !11
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!5}
+!llvm.module.flags = !{!16}
+
+!0 = metadata !{i128 42 }
+!1 = metadata !{i32 786688, metadata !2, metadata !"MAX", metadata !4, i32 29, metadata !8, i32 0, null} ; [ DW_TAG_auto_variable ]
+!2 = metadata !{i32 786443, metadata !13, metadata !3, i32 26, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!3 = metadata !{i32 786478, metadata !13, metadata !4, metadata !"__foo", metadata !"__foo", metadata !"__foo", i32 26, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, i128 (i128, i128)* @__foo, null, null, null, i32 26} ; [ DW_TAG_subprogram ]
+!4 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
+!5 = metadata !{i32 786449, metadata !13, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !15, metadata !15, metadata !12, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
+!6 = metadata !{i32 786453, metadata !13, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !8, metadata !8}
+!8 = metadata !{i32 786454, metadata !14, metadata !4, metadata !"ti_int", i32 78, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ]
+!9 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
+!10 = metadata !{i32 786468, metadata !13, metadata !4, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!11 = metadata !{i32 29, i32 0, metadata !2, null}
+!12 = metadata !{metadata !3}
+!13 = metadata !{metadata !"foo.c", metadata !"/tmp"}
+!14 = metadata !{metadata !"myint.h", metadata !"/tmp"}
+!15 = metadata !{i32 0}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-large-unsigned-const.ll b/test/DebugInfo/X86/dbg-large-unsigned-const.ll
new file mode 100644
index 0000000..a037f3c
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-large-unsigned-const.ll
@@ -0,0 +1,62 @@
+; RUN: llc -filetype=obj %s -o /dev/null
+; Hanle large unsigned constant values.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-macosx10.7.0"
+
+define zeroext i1 @_Z3iseRKxS0_(i64* nocapture %LHS, i64* nocapture %RHS) nounwind readonly optsize ssp {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i64* %LHS}, i64 0, metadata !7), !dbg !13
+  tail call void @llvm.dbg.value(metadata !{i64* %RHS}, i64 0, metadata !11), !dbg !14
+  %tmp1 = load i64* %LHS, align 4, !dbg !15
+  %tmp3 = load i64* %RHS, align 4, !dbg !15
+  %cmp = icmp eq i64 %tmp1, %tmp3, !dbg !15
+  ret i1 %cmp, !dbg !15
+}
+
+define zeroext i1 @_Z2fnx(i64 %a) nounwind readnone optsize ssp {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !12), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !12), !dbg !20
+  tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !21), !dbg !24
+  tail call void @llvm.dbg.value(metadata !25, i64 0, metadata !26), !dbg !27
+  %cmp.i = icmp eq i64 %a, 9223372036854775807, !dbg !28
+  ret i1 %cmp.i, !dbg !22
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!34}
+!29 = metadata !{metadata !1, metadata !6}
+!30 = metadata !{metadata !7, metadata !11}
+!31 = metadata !{metadata !12}
+
+!0 = metadata !{i32 786449, metadata !32, i32 4, metadata !"clang version 3.0 (trunk 135593)", i1 true, metadata !"", i32 0, metadata !33, metadata !33, metadata !29, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 786478, metadata !32, null, metadata !"ise", metadata !"ise", metadata !"_Z3iseRKxS0_", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i1 (i64*, i64*)* @_Z3iseRKxS0_, null, null, metadata !30, i32 2} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 786453, metadata !32, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786468, null, metadata !0, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 786478, metadata !32, null, metadata !"fn", metadata !"fn", metadata !"_Z2fnx", i32 6, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i1 (i64)* @_Z2fnx, null, null, metadata !31, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [fn]
+!7 = metadata !{i32 786689, metadata !1, metadata !"LHS", metadata !2, i32 16777218, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{i32 786448, metadata !0, null, null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_reference_type ]
+!9 = metadata !{i32 786470, metadata !0, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_const_type ]
+!10 = metadata !{i32 786468, null, metadata !0, metadata !"long long int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!11 = metadata !{i32 786689, metadata !1, metadata !"RHS", metadata !2, i32 33554434, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!12 = metadata !{i32 786689, metadata !6, metadata !"a", metadata !2, i32 16777222, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!13 = metadata !{i32 2, i32 27, metadata !1, null}
+!14 = metadata !{i32 2, i32 49, metadata !1, null}
+!15 = metadata !{i32 3, i32 3, metadata !16, null}
+!16 = metadata !{i32 786443, metadata !32, metadata !1, i32 2, i32 54, i32 0} ; [ DW_TAG_lexical_block ]
+!20 = metadata !{i32 6, i32 19, metadata !6, null}
+!21 = metadata !{i32 786689, metadata !1, metadata !"LHS", metadata !2, i32 16777218, metadata !8, i32 0, metadata !22} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{i32 7, i32 10, metadata !23, null}
+!23 = metadata !{i32 786443, metadata !32, metadata !6, i32 6, i32 22, i32 1} ; [ DW_TAG_lexical_block ]
+!24 = metadata !{i32 2, i32 27, metadata !1, metadata !22}
+!25 = metadata !{i64 9223372036854775807}         
+!26 = metadata !{i32 786689, metadata !1, metadata !"RHS", metadata !2, i32 33554434, metadata !8, i32 0, metadata !22} ; [ DW_TAG_arg_variable ]
+!27 = metadata !{i32 2, i32 49, metadata !1, metadata !22}
+!28 = metadata !{i32 3, i32 3, metadata !16, metadata !22}
+!32 = metadata !{metadata !"lli.cc", metadata !"/private/tmp"}
+!33 = metadata !{i32 0}
+!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-merge-loc-entry.ll b/test/DebugInfo/X86/dbg-merge-loc-entry.ll
new file mode 100644
index 0000000..8b619ea
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-merge-loc-entry.ll
@@ -0,0 +1,75 @@
+; RUN: llc < %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
+; RUN: llc < %s -o %t -filetype=obj -regalloc=basic
+; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin8"
+
+;CHECK: DW_AT_location{{.*}}(<0x01> 55 )
+
+%0 = type { i64, i1 }
+
+@__clz_tab = external constant [256 x i8]
+
+define hidden i128 @__divti3(i128 %u, i128 %v) nounwind readnone {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i128 %u}, i64 0, metadata !14), !dbg !15
+  tail call void @llvm.dbg.value(metadata !16, i64 0, metadata !17), !dbg !21
+  br i1 undef, label %bb2, label %bb4, !dbg !22
+
+bb2:                                              ; preds = %entry
+  br label %bb4, !dbg !23
+
+bb4:                                              ; preds = %bb2, %entry
+  br i1 undef, label %__udivmodti4.exit, label %bb82.i, !dbg !24
+
+bb82.i:                                           ; preds = %bb4
+  unreachable
+
+__udivmodti4.exit:                                ; preds = %bb4
+  ret i128 undef, !dbg !27
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+declare %0 @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!32}
+
+!0 = metadata !{i32 786478, metadata !29, metadata !1, metadata !"__udivmodti4", metadata !"__udivmodti4", metadata !"", i32 879, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, null, i32 879} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786473, metadata !29} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 786449, metadata !29, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !31, metadata !31, metadata !28, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 786453, metadata !29, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !5, metadata !5, metadata !5, metadata !8}
+!5 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"UTItype", i32 166, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
+!6 = metadata !{i32 786473, metadata !30} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!8 = metadata !{i32 786447, metadata !29, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
+!9 = metadata !{i32 786478, metadata !29, metadata !1, metadata !"__divti3", metadata !"__divti3", metadata !"__divti3", i32 1094, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i128 (i128, i128)* @__divti3, null, null, null, i32 1094} ; [ DW_TAG_subprogram ]
+!10 = metadata !{i32 786453, metadata !29, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !12, metadata !12, metadata !12}
+!12 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"TItype", i32 160, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_typedef ]
+!13 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!14 = metadata !{i32 786689, metadata !9, metadata !"u", metadata !1, i32 1093, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
+!15 = metadata !{i32 1093, i32 0, metadata !9, null}
+!16 = metadata !{i64 0}
+!17 = metadata !{i32 786688, metadata !18, metadata !"c", metadata !1, i32 1095, metadata !19, i32 0, null} ; [ DW_TAG_auto_variable ]
+!18 = metadata !{i32 786443, metadata !29, metadata !9, i32 1094, i32 0, i32 13} ; [ DW_TAG_lexical_block ]
+!19 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"word_type", i32 424, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_typedef ]
+!20 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!21 = metadata !{i32 1095, i32 0, metadata !18, null}
+!22 = metadata !{i32 1103, i32 0, metadata !18, null}
+!23 = metadata !{i32 1104, i32 0, metadata !18, null}
+!24 = metadata !{i32 1003, i32 0, metadata !25, metadata !26}
+!25 = metadata !{i32 786443, metadata !29, metadata !0, i32 879, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
+!26 = metadata !{i32 1107, i32 0, metadata !18, null}
+!27 = metadata !{i32 1111, i32 0, metadata !18, null}
+!28 = metadata !{metadata !0, metadata !9}
+!29 = metadata !{metadata !"foobar.c", metadata !"/tmp"}
+!30 = metadata !{metadata !"foobar.h", metadata !"/tmp"}
+!31 = metadata !{i32 0}
+!32 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-prolog-end.ll b/test/DebugInfo/X86/dbg-prolog-end.ll
new file mode 100644
index 0000000..a7c6cb5
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-prolog-end.ll
@@ -0,0 +1,59 @@
+; RUN: llc -O0 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-macosx10.6.7"
+
+;CHECK: .loc	1 2 11 prologue_end
+define i32 @foo(i32 %i) nounwind ssp {
+entry:
+  %i.addr = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !7), !dbg !8
+  call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !9), !dbg !11
+  store i32 2, i32* %j, align 4, !dbg !12
+  %tmp = load i32* %j, align 4, !dbg !13
+  %inc = add nsw i32 %tmp, 1, !dbg !13
+  store i32 %inc, i32* %j, align 4, !dbg !13
+  %tmp1 = load i32* %j, align 4, !dbg !14
+  %tmp2 = load i32* %i.addr, align 4, !dbg !14
+  %add = add nsw i32 %tmp1, %tmp2, !dbg !14
+  store i32 %add, i32* %j, align 4, !dbg !14
+  %tmp3 = load i32* %j, align 4, !dbg !15
+  ret i32 %tmp3, !dbg !15
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define i32 @main() nounwind ssp {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  %call = call i32 @foo(i32 21), !dbg !16
+  ret i32 %call, !dbg !16
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21}
+!18 = metadata !{metadata !1, metadata !6}
+
+!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.0 (trunk 131100)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
+!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, null, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!7 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777217, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{i32 1, i32 13, metadata !1, null}
+!9 = metadata !{i32 786688, metadata !10, metadata !"j", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{i32 786443, metadata !19, metadata !1, i32 1, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 2, i32 6, metadata !10, null}
+!12 = metadata !{i32 2, i32 11, metadata !10, null}
+!13 = metadata !{i32 3, i32 2, metadata !10, null}
+!14 = metadata !{i32 4, i32 2, metadata !10, null}
+!15 = metadata !{i32 5, i32 2, metadata !10, null}
+!16 = metadata !{i32 8, i32 2, metadata !17, null}
+!17 = metadata !{i32 786443, metadata !19, metadata !6, i32 7, i32 12, i32 1} ; [ DW_TAG_lexical_block ]
+!19 = metadata !{metadata !"/tmp/a.c", metadata !"/private/tmp"}
+!20 = metadata !{i32 0}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-subrange.ll b/test/DebugInfo/X86/dbg-subrange.ll
new file mode 100644
index 0000000..5bf330c
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-subrange.ll
@@ -0,0 +1,37 @@
+; RUN: llc -O0 < %s | FileCheck %s
+; Radar 10464995
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.2"
+
+@s = common global [4294967296 x i8] zeroinitializer, align 16
+;CHECK: .long	4294967295
+
+define void @bar() nounwind uwtable ssp {
+entry:
+  store i8 97, i8* getelementptr inbounds ([4294967296 x i8]* @s, i32 0, i64 0), align 1, !dbg !18
+  ret void, !dbg !20
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22}
+
+!0 = metadata !{i32 786449, metadata !21, i32 12, metadata !"clang version 3.1 (trunk 144833)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !11,  metadata !11, metadata !""} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 0}
+!3 = metadata !{metadata !5}
+!5 = metadata !{i32 720942, metadata !21, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @bar, null, null, metadata !9, i32 0} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [bar]
+!6 = metadata !{i32 720937, metadata !21} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null}
+!9 = metadata !{metadata !10}
+!10 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
+!11 = metadata !{metadata !13}
+!13 = metadata !{i32 720948, i32 0, null, metadata !"s", metadata !"s", metadata !"", metadata !6, i32 2, metadata !14, i32 0, i32 1, [4294967296 x i8]* @s, null} ; [ DW_TAG_variable ]
+!14 = metadata !{i32 720897, null, null, null, i32 0, i64 34359738368, i64 8, i32 0, i32 0, metadata !15, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 34359738368, align 8, offset 0] [from char]
+!15 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!16 = metadata !{metadata !17}
+!17 = metadata !{i32 720929, i64 0, i64 4294967296} ; [ DW_TAG_subrange_type ]
+!18 = metadata !{i32 5, i32 3, metadata !19, null}
+!19 = metadata !{i32 786443, metadata !21, metadata !5, i32 4, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!20 = metadata !{i32 6, i32 1, metadata !19, null}
+!21 = metadata !{metadata !"small.c", metadata !"/private/tmp"}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-value-dag-combine.ll b/test/DebugInfo/X86/dbg-value-dag-combine.ll
new file mode 100644
index 0000000..12aa61b
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-value-dag-combine.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+; PR 9817
+
+
+declare  <4 x i32> @__amdil_get_global_id_int()
+declare  void @llvm.dbg.value(metadata , i64 , metadata )
+define void @__OpenCL_test_kernel(i32 addrspace(1)* %ip) nounwind {
+entry:
+  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata
+!7), !dbg !8
+  %0 = call <4 x i32> @__amdil_get_global_id_int() nounwind
+  %1 = extractelement <4 x i32> %0, i32 0
+  call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !9), !dbg !11
+  call void @llvm.dbg.value(metadata !12, i64 0, metadata !13), !dbg !14
+  %tmp2 = load i32 addrspace(1)* %ip, align 4, !dbg !15
+  %tmp3 = add i32 0, %tmp2, !dbg !15
+; CHECK:  ##DEBUG_VALUE: idx <- E{{..$}}
+  call void @llvm.dbg.value(metadata !{i32 %tmp3}, i64 0, metadata !13), !dbg
+!15
+  %arrayidx = getelementptr i32 addrspace(1)* %ip, i32 %1, !dbg !16
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx, align 4, !dbg !16
+  ret void, !dbg !17
+}
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!20}
+
+!0 = metadata !{i32 786478, metadata !19, metadata !1, metadata !"__OpenCL_test_kernel", metadata !"__OpenCL_test_kernel", metadata !"__OpenCL_test_kernel", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [__OpenCL_test_kernel]
+!1 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 786449, metadata !19, i32 1, metadata !"clc", i1 false, metadata !"", i32 0, metadata !12, metadata !12, metadata !18, null,  null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 786453, metadata !19, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{null, metadata !5}
+!5 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
+!6 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 786689, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{i32 1, i32 42, metadata !0, null}
+!9 = metadata !{i32 786688, metadata !10, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!10 = metadata !{i32 786443, metadata !19, metadata !0, i32 2, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!11 = metadata !{i32 3, i32 41, metadata !10, null}
+!12 = metadata !{i32 0}
+!13 = metadata !{i32 786688, metadata !10, metadata !"idx", metadata !1, i32 4, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!14 = metadata !{i32 4, i32 20, metadata !10, null}
+!15 = metadata !{i32 5, i32 15, metadata !10, null}
+!16 = metadata !{i32 6, i32 18, metadata !10, null}
+!17 = metadata !{i32 7, i32 1, metadata !0, null}
+!18 = metadata !{metadata !0}
+!19 = metadata !{metadata !"OCL6368.tmp.cl", metadata !"E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp"}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
index de9f672..1a78772 100644
--- a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
+++ b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll
@@ -47,20 +47,21 @@ entry:
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!43}
 
-!0 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 8, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (%struct.S1*, i32)* @foo, null, null, metadata !41, i32 8} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 8, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.S1*, i32)* @foo, null, null, metadata !41, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [foo]
 !1 = metadata !{i32 786473, metadata !42} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !42, i32 12, metadata !"clang version 2.9 (trunk 125693)", i1 true, metadata !"", i32 0, metadata !8, metadata !8, metadata !39, metadata !40,  metadata !40, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !42, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !42, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"foobar", metadata !"foobar", metadata !"", i32 15, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 true, void ()* @foobar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786453, metadata !42, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"foobar", metadata !"foobar", metadata !"", i32 15, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, void ()* @foobar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 15] [def] [scope 0] [foobar]
+!7 = metadata !{i32 786453, metadata !42, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{i32 786689, metadata !0, metadata !"sp", metadata !1, i32 7, metadata !10, i32 0, metadata !32} ; [ DW_TAG_arg_variable ]
 !10 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
 !11 = metadata !{i32 786454, metadata !42, metadata !2, metadata !"S1", i32 4, i64 0, i64 0, i64 0, i32 0, metadata !12} ; [ DW_TAG_typedef ]
-!12 = metadata !{i32 786451, metadata !42, metadata !2, metadata !"S1", i32 1, i64 128, i64 64, i32 0, i32 0, i32 0, metadata !13, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!12 = metadata !{i32 786451, metadata !42, metadata !2, metadata !"S1", i32 1, i64 128, i64 64, i32 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [S1] [line 1, size 128, align 64, offset 0] [def] [from ]
 !13 = metadata !{metadata !14, metadata !17}
 !14 = metadata !{i32 786445, metadata !42, metadata !1, metadata !"m", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !15} ; [ DW_TAG_member ]
 !15 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ]
@@ -87,3 +88,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !40 = metadata !{metadata !19}
 !41 = metadata !{metadata !9, metadata !18}
 !42 = metadata !{metadata !"nm2.c", metadata !"/private/tmp"}
+!43 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-value-isel.ll b/test/DebugInfo/X86/dbg-value-isel.ll
new file mode 100644
index 0000000..f899f48
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-value-isel.ll
@@ -0,0 +1,106 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+; PR 9879
+
+; CHECK: ##DEBUG_VALUE: tid <-
+%0 = type { i8*, i8*, i8*, i8*, i32 }
+
+@sgv = internal addrspace(2) constant [1 x i8] zeroinitializer
+@fgv = internal addrspace(2) constant [1 x i8] zeroinitializer
+@lvgv = internal constant [0 x i8*] zeroinitializer
+@llvm.global.annotations = appending global [1 x %0] [%0 { i8* bitcast (void (i32 addrspace(1)*)* @__OpenCL_nbt02_kernel to i8*), i8* addrspacecast ([1 x i8] addrspace(2)* @sgv to i8*), i8* addrspacecast ([1 x i8] addrspace(2)* @fgv to i8*), i8* bitcast ([0 x i8*]* @lvgv to i8*), i32 0 }], section "llvm.metadata"
+
+define void @__OpenCL_nbt02_kernel(i32 addrspace(1)* %ip) nounwind {
+entry:
+  call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata !8), !dbg !9
+  %0 = call <4 x i32> @__amdil_get_local_id_int() nounwind
+  %1 = extractelement <4 x i32> %0, i32 0
+  br label %2
+
+; <label>:2                                       ; preds = %entry
+  %3 = phi i32 [ %1, %entry ]
+  br label %4
+
+; <label>:4                                       ; preds = %2
+  %5 = phi i32 [ %3, %2 ]
+  br label %get_local_id.exit
+
+get_local_id.exit:                                ; preds = %4
+  %6 = phi i32 [ %5, %4 ]
+  call void @llvm.dbg.value(metadata !{i32 %6}, i64 0, metadata !10), !dbg !12
+  %7 = call <4 x i32> @__amdil_get_global_id_int() nounwind, !dbg !12
+  %8 = extractelement <4 x i32> %7, i32 0, !dbg !12
+  br label %9
+
+; <label>:9                                       ; preds = %get_local_id.exit
+  %10 = phi i32 [ %8, %get_local_id.exit ]
+  br label %11
+
+; <label>:11                                      ; preds = %9
+  %12 = phi i32 [ %10, %9 ]
+  br label %get_global_id.exit
+
+get_global_id.exit:                               ; preds = %11
+  %13 = phi i32 [ %12, %11 ]
+  call void @llvm.dbg.value(metadata !{i32 %13}, i64 0, metadata !13), !dbg !14
+  %14 = call <4 x i32> @__amdil_get_local_size_int() nounwind
+  %15 = extractelement <4 x i32> %14, i32 0
+  br label %16
+
+; <label>:16                                      ; preds = %get_global_id.exit
+  %17 = phi i32 [ %15, %get_global_id.exit ]
+  br label %18
+
+; <label>:18                                      ; preds = %16
+  %19 = phi i32 [ %17, %16 ]
+  br label %get_local_size.exit
+
+get_local_size.exit:                              ; preds = %18
+  %20 = phi i32 [ %19, %18 ]
+  call void @llvm.dbg.value(metadata !{i32 %20}, i64 0, metadata !15), !dbg !16
+  %tmp5 = add i32 %6, %13, !dbg !17
+  %tmp7 = add i32 %tmp5, %20, !dbg !17
+  store i32 %tmp7, i32 addrspace(1)* %ip, align 4, !dbg !17
+  br label %return, !dbg !17
+
+return:                                           ; preds = %get_local_size.exit
+  ret void, !dbg !18
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+declare <4 x i32> @__amdil_get_local_size_int() nounwind
+
+declare <4 x i32> @__amdil_get_local_id_int() nounwind
+
+declare <4 x i32> @__amdil_get_global_id_int() nounwind
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!22}
+
+!0 = metadata !{i32 786478, metadata !20, metadata !1, metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [__OpenCL_nbt02_kernel]
+!1 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 786449, metadata !20, i32 1, metadata !"clc", i1 false, metadata !"", i32 0, metadata !21, metadata !21, metadata !19, null,  null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 786453, metadata !20, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{null, metadata !5}
+!5 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
+!6 = metadata !{i32 589846, metadata !20, metadata !2, metadata !"uint", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
+!7 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
+!8 = metadata !{i32 786689, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!9 = metadata !{i32 1, i32 32, metadata !0, null}
+!10 = metadata !{i32 786688, metadata !11, metadata !"tid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{i32 786443, metadata !0, i32 2, i32 1, metadata !1, i32 1} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{i32 5, i32 24, metadata !11, null}
+!13 = metadata !{i32 786688, metadata !11, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!14 = metadata !{i32 6, i32 25, metadata !11, null}
+!15 = metadata !{i32 786688, metadata !11, metadata !"lsz", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!16 = metadata !{i32 7, i32 26, metadata !11, null}
+!17 = metadata !{i32 9, i32 24, metadata !11, null}
+!18 = metadata !{i32 10, i32 1, metadata !0, null}
+!19 = metadata !{metadata !0}
+!20 = metadata !{metadata !"OCLlLwTXZ.cl", metadata !"/tmp"}
+!21 = metadata !{i32 0}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-value-location.ll b/test/DebugInfo/X86/dbg-value-location.ll
new file mode 100644
index 0000000..1e21c6a
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-value-location.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -regalloc=basic | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+;Radar 8950491
+
+;CHECK: .long Lset5
+;CHECK-NEXT:        ## DW_AT_decl_file
+;CHECK-NEXT:        ## DW_AT_decl_line
+;CHECK-NEXT:        ## DW_AT_type
+;CHECK-NEXT:        ## DW_AT_location
+
+@dfm = external global i32, align 4
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define i32 @foo(i32 %dev, i64 %cmd, i8* %data, i32 %data2) nounwind optsize ssp {
+entry:
+  call void @llvm.dbg.value(metadata !{i32 %dev}, i64 0, metadata !12), !dbg !13
+  %tmp.i = load i32* @dfm, align 4, !dbg !14
+  %cmp.i = icmp eq i32 %tmp.i, 0, !dbg !14
+  br i1 %cmp.i, label %if.else, label %if.end.i, !dbg !14
+
+if.end.i:                                         ; preds = %entry
+  switch i64 %cmd, label %if.then [
+    i64 2147772420, label %bb.i
+    i64 536897538, label %bb116.i
+  ], !dbg !22
+
+bb.i:                                             ; preds = %if.end.i
+  unreachable
+
+bb116.i:                                          ; preds = %if.end.i
+  unreachable
+
+if.then:                                          ; preds = %if.end.i
+  ret i32 undef, !dbg !23
+
+if.else:                                          ; preds = %entry
+  ret i32 0
+}
+
+declare hidden fastcc i32 @bar(i32, i32* nocapture) nounwind optsize ssp
+declare hidden fastcc i32 @bar2(i32) nounwind optsize ssp
+declare hidden fastcc i32 @bar3(i32) nounwind optsize ssp
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!29}
+
+!0 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 19510, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i64, i8*, i32)* @foo, null, null, null, i32 19510} ; [ DW_TAG_subprogram ] [line 19510] [def] [foo]
+!1 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
+!2 = metadata !{i32 786449, metadata !27, i32 12, metadata !"clang version 2.9 (trunk 124753)", i1 true, metadata !"", i32 0, metadata !28, metadata !28, metadata !24, null,  null, null} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 786453, metadata !26, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar3", metadata !"bar3", metadata !"", i32 14827, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @bar3, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 14827] [local] [def] [scope 0] [bar3]
+!7 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar2", metadata !"bar2", metadata !"", i32 15397, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @bar2, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 15397] [local] [def] [scope 0] [bar2]
+!8 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar", metadata !"bar", metadata !"", i32 12382, metadata !9, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32*)* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 12382] [local] [def] [scope 0] [bar]
+!9 = metadata !{i32 786453, metadata !26, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{metadata !11}
+!11 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
+!12 = metadata !{i32 786689, metadata !0, metadata !"var", metadata !1, i32 19509, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!13 = metadata !{i32 19509, i32 20, metadata !0, null}
+!14 = metadata !{i32 18091, i32 2, metadata !15, metadata !17}
+!15 = metadata !{i32 786443, metadata !26, metadata !16, i32 18086, i32 1, i32 748} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"foo_bar", metadata !"foo_bar", metadata !"", i32 18086, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 18086] [local] [def] [scope 0] [foo_bar]
+!17 = metadata !{i32 19514, i32 2, metadata !18, null}
+!18 = metadata !{i32 786443, metadata !26, metadata !0, i32 19510, i32 1, i32 99} ; [ DW_TAG_lexical_block ]
+!22 = metadata !{i32 18094, i32 2, metadata !15, metadata !17}
+!23 = metadata !{i32 19524, i32 1, metadata !18, null}
+!24 = metadata !{metadata !0, metadata !6, metadata !7, metadata !8}
+!25 = metadata !{i32 786473, metadata !27} ; [ DW_TAG_file_type ]
+!26 = metadata !{metadata !"/tmp/f.c", metadata !"/tmp"}
+!27 = metadata !{metadata !"f.i", metadata !"/tmp"}
+!28 = metadata !{i32 0}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-value-range.ll b/test/DebugInfo/X86/dbg-value-range.ll
index a784cc1..d9e7a63 100644
--- a/test/DebugInfo/X86/dbg-value-range.ll
+++ b/test/DebugInfo/X86/dbg-value-range.ll
@@ -18,16 +18,17 @@ declare i32 @foo(...)
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!24}
 
-!0 = metadata !{i32 786478, metadata !22, metadata !1, metadata !"bar", metadata !"bar", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (%struct.a*)* @bar, null, null, metadata !21, i32 0} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !22, metadata !1, metadata !"bar", metadata !"bar", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.a*)* @bar, null, null, metadata !21, i32 0} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [bar]
 !1 = metadata !{i32 786473, metadata !22} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !22, i32 12, metadata !"clang version 2.9 (trunk 122997)", i1 true, metadata !"", i32 0, metadata !23, metadata !23, metadata !20, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !22, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !22, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 786689, metadata !0, metadata !"b", metadata !1, i32 5, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
 !7 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !22, metadata !2, metadata !"a", i32 1, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !9, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!8 = metadata !{i32 786451, metadata !22, metadata !2, metadata !"a", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 1, size 32, align 32, offset 0] [def] [from ]
 !9 = metadata !{metadata !10}
 !10 = metadata !{i32 786445, metadata !22, metadata !1, metadata !"c", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
 !11 = metadata !{i32 786688, metadata !12, metadata !"x", metadata !1, i32 6, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
@@ -59,3 +60,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 ;CHECK-NEXT: Ltmp
 ;CHECK-NEXT:	.quad	0
 ;CHECK-NEXT:	.quad	0
+!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg-value-terminator.ll b/test/DebugInfo/X86/dbg-value-terminator.ll
new file mode 100644
index 0000000..f08f281
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-value-terminator.ll
@@ -0,0 +1,133 @@
+; RUN: llc -mtriple=x86_64-apple-macosx < %s -verify-machineinstrs | FileCheck %s
+;
+; PR16143: MachineOperand::setIsKill(bool): Assertion
+;
+; verify-machineinstrs should ensure that DEBUG_VALUEs go before the
+; terminator.
+;
+; CHECK-LABEL: test:
+; CHECK: ##DEBUG_VALUE: i
+%a = type { i32, i32 }
+
+define hidden fastcc %a* @test() #1 {
+entry:
+  %0 = icmp eq %a* undef, null, !dbg !1
+  br i1 %0, label %"14", label %return, !dbg !1
+
+"14":                                             ; preds = %"8"
+  br i1 undef, label %"25", label %"21", !dbg !1
+
+"21":                                             ; preds = %"14"
+  br i1 undef, label %may_unswitch_on.exit, label %"6.i", !dbg !1
+
+"6.i":                                            ; preds = %"21"
+  br i1 undef, label %"10.i", label %may_unswitch_on.exit, !dbg !1
+
+"10.i":                                           ; preds = %"6.i"
+  br i1 undef, label %may_unswitch_on.exit, label %"12.i", !dbg !1
+
+"12.i":                                           ; preds = %"10.i"
+  br i1 undef, label %"4.i.i", label %"3.i.i", !dbg !1
+
+"3.i.i":                                          ; preds = %"12.i"
+  br i1 undef, label %"4.i.i", label %VEC_edge_base_index.exit.i, !dbg !1
+
+"4.i.i":                                          ; preds = %"3.i.i", %"12.i"
+  unreachable, !dbg !1
+
+VEC_edge_base_index.exit.i:                       ; preds = %"3.i.i"
+  br i1 undef, label %may_unswitch_on.exit, label %"16.i", !dbg !1
+
+"16.i":                                           ; preds = %VEC_edge_base_index.exit.i
+  br i1 undef, label %"4.i6.i", label %"3.i5.i", !dbg !1
+
+"3.i5.i":                                         ; preds = %"16.i"
+  br i1 undef, label %VEC_edge_base_index.exit7.i, label %"4.i6.i", !dbg !1
+
+"4.i6.i":                                         ; preds = %"3.i5.i", %"16.i"
+  unreachable, !dbg !1
+
+VEC_edge_base_index.exit7.i:                      ; preds = %"3.i5.i"
+  br i1 undef, label %may_unswitch_on.exit, label %"21.i", !dbg !1
+
+"21.i":                                           ; preds = %VEC_edge_base_index.exit7.i
+  br i1 undef, label %may_unswitch_on.exit, label %"23.i", !dbg !1
+
+"23.i":                                           ; preds = %"21.i"
+  br i1 undef, label %may_unswitch_on.exit, label %"26.i", !dbg !1
+
+"26.i":                                           ; preds = %"34.i", %"23.i"
+  %1 = icmp eq i32 undef, 9, !dbg !1
+  br i1 %1, label %"34.i", label %"28.i", !dbg !1
+
+"28.i":                                           ; preds = %"26.i"
+  unreachable
+
+"34.i":                                           ; preds = %"26.i"
+  br i1 undef, label %"26.i", label %"36.i", !dbg !1
+
+"36.i":                                           ; preds = %"34.i"
+  br i1 undef, label %"37.i", label %"38.i", !dbg !1
+
+"37.i":                                           ; preds = %"36.i"
+  br label %"38.i", !dbg !1
+
+"38.i":                                           ; preds = %"37.i", %"36.i"
+  br i1 undef, label %"39.i", label %"45.i", !dbg !1
+
+"39.i":                                           ; preds = %"38.i"
+  br i1 undef, label %"41.i", label %may_unswitch_on.exit, !dbg !1
+
+"41.i":                                           ; preds = %"39.i"
+  br i1 undef, label %may_unswitch_on.exit, label %"42.i", !dbg !1
+
+"42.i":                                           ; preds = %"41.i"
+  br i1 undef, label %may_unswitch_on.exit, label %"44.i", !dbg !1
+
+"44.i":                                           ; preds = %"42.i"
+  %2 = load %a** undef, align 8, !dbg !1
+  %3 = bitcast %a* %2 to %a*, !dbg !1
+  call void @llvm.dbg.value(metadata !{%a* %3}, i64 0, metadata !6), !dbg !12
+  br label %may_unswitch_on.exit, !dbg !1
+
+"45.i":                                           ; preds = %"38.i"
+  unreachable
+
+may_unswitch_on.exit:                             ; preds = %"44.i", %"42.i", %"41.i", %"39.i", %"23.i", %"21.i", %VEC_edge_base_index.exit7.i, %VEC_edge_base_index.exit.i, %"10.i", %"6.i", %"21"
+  %4 = phi %a* [ %3, %"44.i" ], [ null, %"6.i" ], [ null, %"10.i" ], [ null, %VEC_edge_base_index.exit7.i ], [ null, %VEC_edge_base_index.exit.i ], [ null, %"21.i" ], [ null, %"23.i" ], [ null, %"39.i" ], [ null, %"42.i" ], [ null, %"41.i" ], [ null, %"21" ]
+  br label %return
+
+"25":                                             ; preds = %"14"
+  unreachable
+
+"return":
+  %result = phi %a* [ null, %entry ], [ %4, %may_unswitch_on.exit ]
+  ret %a* %result, !dbg !1
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind uwtable }
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22}
+
+!0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"Apple clang version", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null,  null, null} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, %a* ()* @test, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
+!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
+!7 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554434, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
+!8 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ]
+!9 = metadata !{i32 786468, null, metadata !0, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
+!10 = metadata !{i32 786688, metadata !11, metadata !"a", metadata !2, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
+!11 = metadata !{i32 786443, metadata !20, metadata !1, i32 2, i32 25, i32 0} ; [ DW_TAG_lexical_block ]
+!12 = metadata !{i32 2, i32 13, metadata !1, null}
+!18 = metadata !{metadata !1}
+!19 = metadata !{metadata !6, metadata !7, metadata !10}
+!20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
+!21 = metadata !{i32 0}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dbg_value_direct.ll b/test/DebugInfo/X86/dbg_value_direct.ll
index 9a40d59..8a22cd7 100644
--- a/test/DebugInfo/X86/dbg_value_direct.ll
+++ b/test/DebugInfo/X86/dbg_value_direct.ll
@@ -145,7 +145,7 @@ attributes #1 = { nounwind readnone }
 attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "ssp-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!22}
+!llvm.module.flags = !{!22, !27}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/crash.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"crash.cpp", metadata !"/tmp"}
@@ -153,17 +153,17 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !3 = metadata !{metadata !4}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"_Z4funci", i32 6, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.A*, i32)* @_Z4funci, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/crash.cpp]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !21}
-!8 = metadata !{i32 786451, metadata !1, null, metadata !"A", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
+!8 = metadata !{i32 786451, metadata !1, null, metadata !"A", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !15}
 !10 = metadata !{i32 786478, metadata !1, metadata !8, metadata !"A", metadata !"A", metadata !"", i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !14, i32 2} ; [ DW_TAG_subprogram ] [line 2] [A]
-!11 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !13}
-!13 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!13 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
 !14 = metadata !{i32 786468}
 !15 = metadata !{i32 786478, metadata !1, metadata !8, metadata !"A", metadata !"A", metadata !"", i32 3, metadata !16, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !20, i32 3} ; [ DW_TAG_subprogram ] [line 3] [A]
-!16 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{null, metadata !13, metadata !18}
 !18 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !19} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from A]
@@ -174,3 +174,4 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !24 = metadata !{i32 786688, metadata !4, metadata !"a", metadata !5, i32 7, metadata !8, i32 8192, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 7]
 !25 = metadata !{i32 7, i32 0, metadata !4, null}
 !26 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/debug-info-block-captured-self.ll b/test/DebugInfo/X86/debug-info-block-captured-self.ll
index 3d36350..6e4d200 100644
--- a/test/DebugInfo/X86/debug-info-block-captured-self.ll
+++ b/test/DebugInfo/X86/debug-info-block-captured-self.ll
@@ -77,23 +77,24 @@ define internal void @"__24-[Main initWithContext:]_block_invoke_2"(i8* %.block_
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!108}
 !0 = metadata !{i32 786449, metadata !107, i32 16, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 2, metadata !2, metadata !4, metadata !23, metadata !15,  metadata !15, metadata !""} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m] [DW_LANG_ObjC]
 !1 = metadata !{i32 786473, metadata !107} ; [ DW_TAG_file_type ]
 !2 = metadata !{metadata !3}
-!3 = metadata !{i32 786436, metadata !107, null, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !4, i32 0, i32 0} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [from ]
+!3 = metadata !{i32 786436, metadata !107, null, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
 !4 = metadata !{}
 !15 = metadata !{i32 0}
 !23 = metadata !{metadata !38, metadata !42}
 !27 = metadata !{i32 786454, metadata !107, null, metadata !"id", i32 31, i64 0, i64 0, i64 0, i32 0, metadata !28} ; [ DW_TAG_typedef ] [id] [line 31, size 0, align 0, offset 0] [from ]
 !28 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
-!29 = metadata !{i32 786451, metadata !107, null, metadata !"objc_object", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !30, i32 0, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [from ]
+!29 = metadata !{i32 786451, metadata !107, null, metadata !"objc_object", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
 !30 = metadata !{metadata !31}
 !31 = metadata !{i32 786445, metadata !107, metadata !29, metadata !"isa", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !32} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
 !32 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
-!33 = metadata !{i32 786451, metadata !107, null, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [fwd] [from ]
-!34 = metadata !{i32 786451, metadata !107, null, metadata !"Main", i32 23, i64 0, i64 0, i32 0, i32 1092, i32 0, i32 0, i32 16} ; [ DW_TAG_structure_type ] [Main] [line 23, size 0, align 0, offset 0] [artificial] [fwd] [from ]
+!33 = metadata !{i32 786451, metadata !107, null, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!34 = metadata !{i32 786451, metadata !107, null, metadata !"Main", i32 23, i64 0, i64 0, i32 0, i32 1092, null, i32 0, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [Main] [line 23, size 0, align 0, offset 0] [artificial] [decl] [from ]
 !38 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"__24-[Main initWithContext:]_block_invoke", metadata !"__24-[Main initWithContext:]_block_invoke", metadata !"", i32 33, metadata !39, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke", null, null, metadata !15, i32 33} ; [ DW_TAG_subprogram ] [line 33] [local] [def] [__24-[Main initWithContext:]_block_invoke]
-!39 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !40, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!39 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !40, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !40 = metadata !{null, metadata !41, metadata !27}
 !41 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
 !42 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"__24-[Main initWithContext:]_block_invoke_2", metadata !"__24-[Main initWithContext:]_block_invoke_2", metadata !"", i32 35, metadata !39, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*, i8*)* @"__24-[Main initWithContext:]_block_invoke_2", null, null, metadata !15, i32 35} ; [ DW_TAG_subprogram ] [line 35] [local] [def] [__24-[Main initWithContext:]_block_invoke_2]
@@ -104,3 +105,4 @@ define internal void @"__24-[Main initWithContext:]_block_invoke_2"(i8* %.block_
 !105 = metadata !{i32 786688, metadata !42, metadata !"self", metadata !1, i32 40, metadata !34, i32 0, i32 0, i64 1, i64 32} ; [ DW_TAG_auto_variable ] [self] [line 40]
 !106 = metadata !{i32 40, i32 0, metadata !42, null}
 !107 = metadata !{metadata !"llvm/tools/clang/test/CodeGenObjC/debug-info-block-captured-self.m", metadata !""}
+!108 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/debug-info-blocks.ll b/test/DebugInfo/X86/debug-info-blocks.ll
index ae95033..c3bedf2 100644
--- a/test/DebugInfo/X86/debug-info-blocks.ll
+++ b/test/DebugInfo/X86/debug-info-blocks.ll
@@ -5,13 +5,15 @@
 ; rdar://problem/9279956
 ; test that the DW_AT_location of self is at ( fbreg +{{[0-9]+}}, deref, +{{[0-9]+}} )
 
+; CHECK: DW_TAG_subprogram
 ; CHECK: DW_AT_name{{.*}}_block_invoke
+
 ; CHECK-NOT: DW_TAG_subprogram
 ; CHECK: DW_TAG_formal_parameter
-; CHECK-NOT: DW_TAG
-; CHECK: .block_descriptor
+; CHECK-NEXT: DW_AT_name{{.*}}.block_descriptor
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_location
+
 ; CHECK-NOT: DW_TAG_subprogram
 ; CHECK: DW_TAG_variable
 ; CHECK-NEXT: DW_AT_name{{.*}}"self"
@@ -24,7 +26,7 @@
 ; 0x23 = DW_OP_uconst
 ; 0x91 = DW_OP_fbreg
 ; CHECK: DW_AT_location{{.*}}91 {{[0-9]+}} 06 23 {{[0-9]+}} )
-; CHECK: DW_TAG_structure_type
+
 ; CHECK: [[A:.*]]:   DW_TAG_structure_type
 ; CHECK-NEXT: DW_AT_APPLE_objc_complete_type
 ; CHECK-NEXT: DW_AT_name{{.*}}"A"
@@ -258,52 +260,52 @@ attributes #2 = { nonlazybind }
 attributes #3 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!56, !57, !58, !59}
+!llvm.module.flags = !{!56, !57, !58, !59, !110}
 
 !0 = metadata !{i32 786449, metadata !1, i32 16, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 2, metadata !2, metadata !3, metadata !12, metadata !2,  metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [llvm/tools/clang/test/CodeGenObjC/<unknown>] [DW_LANG_ObjC]
 !1 = metadata !{metadata !"llvm/tools/clang/test/CodeGenObjC/<unknown>", metadata !"llvm/_build.ninja.Debug"}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !4}
-!4 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"A", i32 33, i64 32, i64 32, i32 0, i32 512, null, metadata !7, i32 16, null, null} ; [ DW_TAG_structure_type ] [A] [line 33, size 32, align 32, offset 0] [from ]
+!4 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"A", i32 33, i64 32, i64 32, i32 0, i32 512, null, metadata !7, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 33, size 32, align 32, offset 0] [def] [from ]
 !5 = metadata !{metadata !"llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m", metadata !"llvm/_build.ninja.Debug"}
 !6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
 !7 = metadata !{metadata !8, metadata !10}
 !8 = metadata !{i32 786460, null, metadata !4, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSObject]
-!9 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"NSObject", i32 21, i64 0, i64 8, i32 0, i32 0, null, metadata !2, i32 16, null, null} ; [ DW_TAG_structure_type ] [NSObject] [line 21, size 0, align 8, offset 0] [from ]
+!9 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"NSObject", i32 21, i64 0, i64 8, i32 0, i32 0, null, metadata !2, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [NSObject] [line 21, size 0, align 8, offset 0] [def] [from ]
 !10 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"ivar", i32 35, i64 32, i64 32, i64 0, i32 0, metadata !11, null} ; [ DW_TAG_member ] [ivar] [line 35, size 32, align 32, offset 0] [from int]
 !11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{metadata !13, metadata !27, metadata !31, metadata !35, metadata !36, metadata !39}
 !13 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"-[A init]", metadata !"-[A init]", metadata !"", i32 46, metadata !14, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, i8* (%0*, i8*)* @"\01-[A init]", null, null, metadata !2, i32 46} ; [ DW_TAG_subprogram ] [line 46] [local] [def] [-[A init]]
-!14 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{metadata !16, metadata !23, metadata !24}
 !16 = metadata !{i32 786454, metadata !5, null, metadata !"id", i32 46, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_typedef ] [id] [line 46, size 0, align 0, offset 0] [from ]
 !17 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
-!18 = metadata !{i32 786451, metadata !1, null, metadata !"objc_object", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !19, i32 0, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{i32 786451, metadata !1, null, metadata !"objc_object", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
 !19 = metadata !{metadata !20}
 !20 = metadata !{i32 786445, metadata !1, metadata !18, metadata !"isa", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !21} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
 !21 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
-!22 = metadata !{i32 786451, metadata !1, null, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [fwd] [from ]
-!23 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!24 = metadata !{i32 786454, metadata !5, i32 0, metadata !"SEL", i32 46, i64 0, i64 0, i64 0, i32 64, metadata !25} ; [ DW_TAG_typedef ] [SEL] [line 46, size 0, align 0, offset 0] [artificial] [from ]
+!22 = metadata !{i32 786451, metadata !1, null, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
+!23 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !4} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!24 = metadata !{i32 786454, metadata !5, null, metadata !"SEL", i32 46, i64 0, i64 0, i64 0, i32 64, metadata !25} ; [ DW_TAG_typedef ] [SEL] [line 46, size 0, align 0, offset 0] [artificial] [from ]
 !25 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !26} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_selector]
-!26 = metadata !{i32 786451, metadata !1, null, metadata !"objc_selector", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [fwd] [from ]
+!26 = metadata !{i32 786451, metadata !1, null, metadata !"objc_selector", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_selector] [line 0, size 0, align 0, offset 0] [decl] [from ]
 !27 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"__9-[A init]_block_invoke", metadata !"__9-[A init]_block_invoke", metadata !"", i32 49, metadata !28, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*)* @"__9-[A init]_block_invoke", null, null, metadata !2, i32 49} ; [ DW_TAG_subprogram ] [line 49] [local] [def] [__9-[A init]_block_invoke]
-!28 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !29 = metadata !{null, metadata !30}
 !30 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
 !31 = metadata !{i32 786478, metadata !1, metadata !32, metadata !"__copy_helper_block_", metadata !"__copy_helper_block_", metadata !"", i32 52, metadata !33, i1 true, i1 true, i32 0, i32 0, null, i32 0, i1 false, void (i8*, i8*)* @__copy_helper_block_, null, null, metadata !2, i32 52} ; [ DW_TAG_subprogram ] [line 52] [local] [def] [__copy_helper_block_]
 !32 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [llvm/tools/clang/test/CodeGenObjC/<unknown>]
-!33 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !34, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !34, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !34 = metadata !{null, metadata !30, metadata !30}
 !35 = metadata !{i32 786478, metadata !1, metadata !32, metadata !"__destroy_helper_block_", metadata !"__destroy_helper_block_", metadata !"", i32 52, metadata !28, i1 true, i1 true, i32 0, i32 0, null, i32 0, i1 false, void (i8*)* @__destroy_helper_block_, null, null, metadata !2, i32 52} ; [ DW_TAG_subprogram ] [line 52] [local] [def] [__destroy_helper_block_]
 !36 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !37, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 60} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 60] [main]
-!37 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !38, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!37 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !38, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !38 = metadata !{metadata !11}
 !39 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"run", metadata !"run", metadata !"", i32 39, metadata !40, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (void ()*)* @run, null, null, metadata !2, i32 40} ; [ DW_TAG_subprogram ] [line 39] [local] [def] [scope 40] [run]
-!40 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !41, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!40 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !41, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !41 = metadata !{null, metadata !42}
 !42 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !43} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_generic]
-!43 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"__block_literal_generic", i32 40, i64 256, i64 0, i32 0, i32 8, null, metadata !44, i32 0, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 40, size 256, align 0, offset 0] [from ]
+!43 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"__block_literal_generic", i32 40, i64 256, i64 0, i32 0, i32 8, null, metadata !44, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_generic] [line 40, size 256, align 0, offset 0] [def] [from ]
 !44 = metadata !{metadata !45, metadata !46, metadata !47, metadata !48, metadata !49}
 !45 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__isa", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !30} ; [ DW_TAG_member ] [__isa] [line 0, size 64, align 64, offset 0] [from ]
 !46 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__flags", i32 0, i64 32, i64 32, i64 64, i32 0, metadata !11} ; [ DW_TAG_member ] [__flags] [line 0, size 32, align 32, offset 64] [from int]
@@ -311,7 +313,7 @@ attributes #3 = { nounwind }
 !48 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__FuncPtr", i32 0, i64 64, i64 64, i64 128, i32 0, metadata !30} ; [ DW_TAG_member ] [__FuncPtr] [line 0, size 64, align 64, offset 128] [from ]
 !49 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__descriptor", i32 40, i64 64, i64 64, i64 192, i32 0, metadata !50} ; [ DW_TAG_member ] [__descriptor] [line 40, size 64, align 64, offset 192] [from ]
 !50 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !51} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_descriptor]
-!51 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"__block_descriptor", i32 40, i64 128, i64 0, i32 0, i32 8, null, metadata !52, i32 0, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 40, size 128, align 0, offset 0] [from ]
+!51 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"__block_descriptor", i32 40, i64 128, i64 0, i32 0, i32 8, null, metadata !52, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor] [line 40, size 128, align 0, offset 0] [def] [from ]
 !52 = metadata !{metadata !53, metadata !55}
 !53 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"reserved", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !54} ; [ DW_TAG_member ] [reserved] [line 0, size 64, align 64, offset 0] [from long unsigned int]
 !54 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
@@ -338,7 +340,7 @@ attributes #3 = { nounwind }
 !75 = metadata !{i32 42, i32 0, metadata !39, null}
 !76 = metadata !{i32 786689, metadata !27, metadata !".block_descriptor", metadata !6, i32 16777265, metadata !77, i32 64, i32 0} ; [ DW_TAG_arg_variable ] [.block_descriptor] [line 49]
 !77 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !78} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __block_literal_1]
-!78 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"__block_literal_1", i32 49, i64 320, i64 64, i32 0, i32 0, null, metadata !79, i32 0, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 49, size 320, align 64, offset 0] [from ]
+!78 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"__block_literal_1", i32 49, i64 320, i64 64, i32 0, i32 0, null, metadata !79, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_1] [line 49, size 320, align 64, offset 0] [def] [from ]
 !79 = metadata !{metadata !80, metadata !81, metadata !82, metadata !83, metadata !84, metadata !87}
 !80 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__isa", i32 49, i64 64, i64 64, i64 0, i32 0, metadata !30} ; [ DW_TAG_member ] [__isa] [line 49, size 64, align 64, offset 0] [from ]
 !81 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__flags", i32 49, i64 32, i64 32, i64 64, i32 0, metadata !11} ; [ DW_TAG_member ] [__flags] [line 49, size 32, align 32, offset 64] [from int]
@@ -346,7 +348,7 @@ attributes #3 = { nounwind }
 !83 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__FuncPtr", i32 49, i64 64, i64 64, i64 128, i32 0, metadata !30} ; [ DW_TAG_member ] [__FuncPtr] [line 49, size 64, align 64, offset 128] [from ]
 !84 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"__descriptor", i32 49, i64 64, i64 64, i64 192, i32 0, metadata !85} ; [ DW_TAG_member ] [__descriptor] [line 49, size 64, align 64, offset 192] [from ]
 !85 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !86} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from __block_descriptor_withcopydispose]
-!86 = metadata !{i32 786451, metadata !1, null, metadata !"__block_descriptor_withcopydispose", i32 49, i64 0, i64 0, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 49, size 0, align 0, offset 0] [fwd] [from ]
+!86 = metadata !{i32 786451, metadata !1, null, metadata !"__block_descriptor_withcopydispose", i32 49, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 49, size 0, align 0, offset 0] [decl] [from ]
 !87 = metadata !{i32 786445, metadata !5, metadata !6, metadata !"self", i32 49, i64 64, i64 64, i64 256, i32 0, metadata !61} ; [ DW_TAG_member ] [self] [line 49, size 64, align 64, offset 256] [from ]
 !88 = metadata !{i32 49, i32 0, metadata !27, null}
 !89 = metadata !{i32 786688, metadata !27, metadata !"self", metadata !32, i32 52, metadata !23, i32 0, i32 0, i64 2, i64 1, i64 32} ; [ DW_TAG_auto_variable ] [self] [line 52]
@@ -354,10 +356,10 @@ attributes #3 = { nounwind }
 !91 = metadata !{i32 786688, metadata !92, metadata !"d", metadata !6, i32 50, metadata !93, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 50]
 !92 = metadata !{i32 786443, metadata !5, metadata !27, i32 49, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [llvm/tools/clang/test/CodeGenObjC/debug-info-blocks.m]
 !93 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !94} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from NSMutableDictionary]
-!94 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"NSMutableDictionary", i32 30, i64 0, i64 8, i32 0, i32 0, null, metadata !95, i32 16, null, null} ; [ DW_TAG_structure_type ] [NSMutableDictionary] [line 30, size 0, align 8, offset 0] [from ]
+!94 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"NSMutableDictionary", i32 30, i64 0, i64 8, i32 0, i32 0, null, metadata !95, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [NSMutableDictionary] [line 30, size 0, align 8, offset 0] [def] [from ]
 !95 = metadata !{metadata !96}
 !96 = metadata !{i32 786460, null, metadata !94, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !97} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSDictionary]
-!97 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"NSDictionary", i32 26, i64 0, i64 8, i32 0, i32 0, null, metadata !98, i32 16, null, null} ; [ DW_TAG_structure_type ] [NSDictionary] [line 26, size 0, align 8, offset 0] [from ]
+!97 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"NSDictionary", i32 26, i64 0, i64 8, i32 0, i32 0, null, metadata !98, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [NSDictionary] [line 26, size 0, align 8, offset 0] [def] [from ]
 !98 = metadata !{metadata !99}
 !99 = metadata !{i32 786460, null, metadata !97, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from NSObject]
 !100 = metadata !{i32 50, i32 0, metadata !92, null}
@@ -370,3 +372,4 @@ attributes #3 = { nounwind }
 !107 = metadata !{i32 786688, metadata !36, metadata !"a", metadata !6, i32 61, metadata !61, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 61]
 !108 = metadata !{i32 61, i32 0, metadata !36, null}
 !109 = metadata !{i32 62, i32 0, metadata !36, null}
+!110 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/debug-info-static-member.ll b/test/DebugInfo/X86/debug-info-static-member.ll
index 02b8ae0..1792bb4 100644
--- a/test/DebugInfo/X86/debug-info-static-member.ll
+++ b/test/DebugInfo/X86/debug-info-static-member.ll
@@ -57,18 +57,19 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!34}
 
 !0 = metadata !{i32 786449, metadata !33, i32 4, metadata !"clang version 3.3 (trunk 171914)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !10,  metadata !10, metadata !""} ; [ DW_TAG_compile_unit ] [/home/probinson/projects/upstream/static-member/test/debug-info-static-member.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !33, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 23} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 23] [main]
 !6 = metadata !{i32 786473, metadata !33} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{metadata !12, metadata !27, metadata !28}
 !12 = metadata !{i32 786484, i32 0, metadata !13, metadata !"a", metadata !"a", metadata !"_ZN1C1aE", metadata !6, i32 14, metadata !9, i32 0, i32 1, i32* @_ZN1C1aE, metadata !15} ; [ DW_TAG_variable ] [a] [line 14] [def]
-!13 = metadata !{i32 786434, metadata !33, null, metadata !"C", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !14, i32 0, null, null} ; [ DW_TAG_class_type ] [C] [line 1, size 32, align 32, offset 0] [from ]
+!13 = metadata !{i32 786434, metadata !33, null, metadata !"C", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_class_type ] [C] [line 1, size 32, align 32, offset 0] [def] [from ]
 !14 = metadata !{metadata !15, metadata !16, metadata !19, metadata !20, metadata !23, metadata !24, metadata !26}
 !15 = metadata !{i32 786445, metadata !33, metadata !13, metadata !"a", i32 3, i64 0, i64 0, i64 0, i32 4097, metadata !9, null} ; [ DW_TAG_member ] [a] [line 3, size 0, align 0, offset 0] [private] [static] [from int]
 !16 = metadata !{i32 786445, metadata !33, metadata !13, metadata !"const_a", i32 4, i64 0, i64 0, i64 0, i32 4097, metadata !17, i1 true} ; [ DW_TAG_member ] [const_a] [line 4, size 0, align 0, offset 0] [private] [static] [from ]
@@ -120,7 +121,7 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 ; PRESENT:      DW_TAG_member
 ; PRESENT-NEXT: DW_AT_name {{.*}} "const_c"
 ; PRESENT:      DW_AT_accessibility [DW_FORM_data1]   (0x01)
-; PRESENT:      DW_AT_const_value {{.*}} (0x00000012)
+; PRESENT:      DW_AT_const_value {{.*}} (18)
 ; While we're here, a normal member has data_member_location and
 ; accessibility attributes.
 ; PRESENT:      DW_TAG_member
@@ -151,7 +152,6 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 ; DARWINP:      DW_AT_external
 ; DARWINP:      DW_AT_declaration
 ; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x03)
-; DARWINP:      DW_AT_MIPS_linkage_name {{.*}} "_ZN1C1aE"
 ; DARWINP:      DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "const_a"
 ; DARWINP:      DW_AT_external
@@ -161,7 +161,6 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 ; DARWINP:      0x[[DECL_B:[0-9a-f]+]]: DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "b"
 ; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x02)
-; DARWINP:      DW_AT_MIPS_linkage_name {{.*}} "_ZN1C1bE"
 ; DARWINP:      DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "const_b"
 ; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x02)
@@ -169,11 +168,10 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 ; DARWINP:      0x[[DECL_C:[0-9a-f]+]]: DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "c"
 ; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x01)
-; DARWINP:      DW_AT_MIPS_linkage_name {{.*}} "_ZN1C1cE"
 ; DARWINP:      DW_TAG_member
 ; DARWINP-NEXT: DW_AT_name {{.*}} "const_c"
 ; DARWINP:      DW_AT_accessibility [DW_FORM_data1]   (0x01)
-; DARWINP:      DW_AT_const_value {{.*}} (0x00000012)
+; DARWINP:      DW_AT_const_value {{.*}} (18)
 ; While we're here, a normal member has data_member_location and
 ; accessibility attributes.
 ; DARWINP:      DW_TAG_member
@@ -255,3 +253,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 ; DARWINA-NOT:  DW_AT_const_value
 ; DARWINA-NOT:  DW_AT_location
 ; DARWINA:      NULL
+!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/debug_frame.ll b/test/DebugInfo/X86/debug_frame.ll
index 622e5d3..67f2e5d 100644
--- a/test/DebugInfo/X86/debug_frame.ll
+++ b/test/DebugInfo/X86/debug_frame.ll
@@ -10,11 +10,13 @@ entry:
 }
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7}
 !5 = metadata !{metadata !0}
 
-!0 = metadata !{i32 786478, metadata !6, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !6, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
 !1 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !6, i32 12, metadata !"clang version 3.0 ()", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !5, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !6, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !6, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !6 = metadata !{metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build"}
+!7 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll b/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll
new file mode 100644
index 0000000..42a57bf
--- /dev/null
+++ b/test/DebugInfo/X86/dwarf-aranges-no-dwarf-labels.ll
@@ -0,0 +1,89 @@
+; RUN: llc < %s | FileCheck %s
+
+; CHECK: .short  2 # DWARF Arange version number
+; CHECK: # Segment Size
+; CHECK-NOT: debug_loc
+; CHECK: .quad global
+; CHECK-NOT: debug_loc
+; CHECK: # ARange terminator
+
+; --- Source code ---
+; Generated with "clang -g -O1 -S -emit-llvm"
+
+; int global = 2;
+; int foo(int bar) { return bar; }
+; int foo2(int bar2) { return bar2; }
+
+; int main() {
+;   return foo(2) + foo2(1) + global;
+; }
+
+
+; ModuleID = 'tmp/debug_ranges/a.cc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@global = global i32 2, align 4
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @_Z3fooi(i32 %bar) #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i32 %bar}, i64 0, metadata !10), !dbg !20
+  ret i32 %bar, !dbg !20
+}
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @_Z4foo2i(i32 %bar2) #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i32 %bar2}, i64 0, metadata !13), !dbg !21
+  ret i32 %bar2, !dbg !21
+}
+
+; Function Attrs: nounwind readonly uwtable
+define i32 @main() #1 {
+entry:
+  %call = tail call i32 @_Z3fooi(i32 2), !dbg !22
+  %call1 = tail call i32 @_Z4foo2i(i32 1), !dbg !22
+  %add = add nsw i32 %call1, %call, !dbg !22
+  %0 = load i32* @global, align 4, !dbg !22, !tbaa !23
+  %add2 = add nsw i32 %add, %0, !dbg !22
+  ret i32 %add2, !dbg !22
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #2
+
+attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!19, !26}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (191881)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !17, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/debug_ranges/a.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"tmp/debug_ranges/a.cc", metadata !"/"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4, metadata !11, metadata !14}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @_Z3fooi, null, null, metadata !9, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [foo]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/debug_ranges/a.cc]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !10}
+!10 = metadata !{i32 786689, metadata !4, metadata !"bar", metadata !5, i32 16777218, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bar] [line 2]
+!11 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo2", metadata !"foo2", metadata !"_Z4foo2i", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @_Z4foo2i, null, null, metadata !12, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [foo2]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 786689, metadata !11, metadata !"bar2", metadata !5, i32 16777219, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bar2] [line 3]
+!14 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 5, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [main]
+!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !8}
+!17 = metadata !{metadata !18}
+!18 = metadata !{i32 786484, i32 0, null, metadata !"global", metadata !"global", metadata !"", metadata !5, i32 1, metadata !8, i32 0, i32 1, i32* @global, null} ; [ DW_TAG_variable ] [global] [line 1] [def]
+!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!20 = metadata !{i32 2, i32 0, metadata !4, null}
+!21 = metadata !{i32 3, i32 0, metadata !11, null}
+!22 = metadata !{i32 6, i32 0, metadata !14, null}
+!23 = metadata !{metadata !"int", metadata !24}
+!24 = metadata !{metadata !"omnipotent char", metadata !25}
+!25 = metadata !{metadata !"Simple C/C++ TBAA"}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dwarf-aranges.ll b/test/DebugInfo/X86/dwarf-aranges.ll
new file mode 100644
index 0000000..203afc7
--- /dev/null
+++ b/test/DebugInfo/X86/dwarf-aranges.ll
@@ -0,0 +1,87 @@
+; RUN: llc < %s | FileCheck %s
+
+
+; -- header --
+; CHECK: .short 2 # DWARF Arange version number
+; CHECK-NEXT: .long .L.debug_info_begin0
+; CHECK-NEXT: .byte 8 # Address Size (in bytes)
+; CHECK-NEXT: .byte 0 # Segment Size (in bytes)
+; -- alignment --
+; CHECK-NEXT: .byte
+; CHECK-NEXT: .byte
+; CHECK-NEXT: .byte
+; CHECK-NEXT: .byte
+
+; <common symbols> - it should have made one span for each symbol.
+; CHECK-NEXT: .quad some_bss
+; CHECK-NEXT: .quad 4
+
+; <data section> - it should have made one span covering all vars in this CU.
+; CHECK-NEXT: .quad some_data
+; CHECK-NEXT: .Lset0 = .Ldebug_end1-some_data
+; CHECK-NEXT: .quad .Lset0
+
+; <text section> - it should have made one span covering all functions in this CU.
+; CHECK-NEXT: .quad .Lfunc_begin0
+; CHECK-NEXT: .Lset1 = .Ldebug_end2-.Lfunc_begin0
+; CHECK-NEXT: .quad .Lset1
+
+; <other sections> - it should have made one span covering all vars in this CU.
+; CHECK-NEXT: .quad some_other
+; CHECK-NEXT: .Lset2 = .Ldebug_end3-some_other
+; CHECK-NEXT: .quad .Lset2
+
+; -- finish --
+; CHECK-NEXT: # ARange terminator
+
+
+
+; -- source code --
+; Generated from: "clang -c -g -emit-llvm"
+;
+; int some_data = 4;
+; int some_bss;
+; int some_other __attribute__ ((section ("strange+section"))) = 5;
+; 
+; void some_code()
+; {
+;    some_bss += some_data + some_other;
+; }
+
+target triple = "x86_64-unknown-linux-gnu"
+
+@some_data = global i32 4, align 4
+@some_other = global i32 5, section "strange+section", align 4
+@some_bss = common global i32 0, align 4
+
+define void @some_code() {
+entry:
+  %0 = load i32* @some_data, align 4, !dbg !14
+  %1 = load i32* @some_other, align 4, !dbg !14
+  %add = add nsw i32 %0, %1, !dbg !14
+  %2 = load i32* @some_bss, align 4, !dbg !14
+  %add1 = add nsw i32 %2, %add, !dbg !14
+  store i32 %add1, i32* @some_bss, align 4, !dbg !14
+  ret void, !dbg !15
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!13, !16}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !8, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/kayamon/test.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"test.c", metadata !"/home/kayamon"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"some_code", metadata !"some_code", metadata !"", i32 5, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @some_code, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 6] [some_code]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/kayamon/test.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null}
+!8 = metadata !{metadata !9, metadata !11, metadata !12}
+!9 = metadata !{i32 786484, i32 0, null, metadata !"some_data", metadata !"some_data", metadata !"", metadata !5, i32 1, metadata !10, i32 0, i32 1, i32* @some_data, null} ; [ DW_TAG_variable ] [some_data] [line 1] [def]
+!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!11 = metadata !{i32 786484, i32 0, null, metadata !"some_other", metadata !"some_other", metadata !"", metadata !5, i32 3, metadata !10, i32 0, i32 1, i32* @some_other, null} ; [ DW_TAG_variable ] [some_other] [line 3] [def]
+!12 = metadata !{i32 786484, i32 0, null, metadata !"some_bss", metadata !"some_bss", metadata !"", metadata !5, i32 2, metadata !10, i32 0, i32 1, i32* @some_bss, null} ; [ DW_TAG_variable ] [some_bss] [line 2] [def]
+!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!14 = metadata !{i32 7, i32 0, metadata !4, null}
+!15 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dwarf-public-names.ll b/test/DebugInfo/X86/dwarf-public-names.ll
new file mode 100644
index 0000000..d66e5a0
--- /dev/null
+++ b/test/DebugInfo/X86/dwarf-public-names.ll
@@ -0,0 +1,132 @@
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -filetype=obj -o %t.o < %s
+; RUN: llvm-dwarfdump -debug-dump=pubnames %t.o | FileCheck --check-prefix=LINUX %s
+; RUN: llc -mtriple=x86_64-apple-darwin12 -filetype=obj -o %t.o < %s
+; RUN: llvm-dwarfdump -debug-dump=pubnames %t.o | FileCheck --check-prefix=DARWIN %s
+; ModuleID = 'dwarf-public-names.cpp'
+;
+; Generated from:
+;
+; struct C {
+;   void member_function();
+;   static int static_member_function();
+;   static int static_member_variable;
+; };
+;
+; int C::static_member_variable = 0;
+;
+; void C::member_function() {
+;   static_member_variable = 0;
+; }
+;
+; int C::static_member_function() {
+;   return static_member_variable;
+; }
+;
+; C global_variable;
+;
+; int global_function() {
+;   return -1;
+; }
+;
+; namespace ns {
+;   void global_namespace_function() {
+;     global_variable.member_function();
+;   }
+;   int global_namespace_variable = 1;
+; }
+
+; Darwin shouldn't be generating the section by default
+; DARWIN: debug_pubnames
+; DARWIN: unit_size = 0x00000000
+
+; Skip the output to the header of the pubnames section.
+; LINUX: debug_pubnames
+
+; Check for each name in the output.
+; LINUX: global_namespace_variable
+; LINUX: global_namespace_function
+; LINUX: static_member_function
+; LINUX: global_variable
+; LINUX: global_function
+; LINUX: member_function
+
+%struct.C = type { i8 }
+
+@_ZN1C22static_member_variableE = global i32 0, align 4
+@global_variable = global %struct.C zeroinitializer, align 1
+@_ZN2ns25global_namespace_variableE = global i32 1, align 4
+
+define void @_ZN1C15member_functionEv(%struct.C* %this) nounwind uwtable align 2 {
+entry:
+  %this.addr = alloca %struct.C*, align 8
+  store %struct.C* %this, %struct.C** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !28), !dbg !30
+  %this1 = load %struct.C** %this.addr
+  store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !31
+  ret void, !dbg !32
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define i32 @_ZN1C22static_member_functionEv() nounwind uwtable align 2 {
+entry:
+  %0 = load i32* @_ZN1C22static_member_variableE, align 4, !dbg !33
+  ret i32 %0, !dbg !33
+}
+
+define i32 @_Z15global_functionv() nounwind uwtable {
+entry:
+  ret i32 -1, !dbg !34
+}
+
+define void @_ZN2ns25global_namespace_functionEv() nounwind uwtable {
+entry:
+  call void @_ZN1C15member_functionEv(%struct.C* @global_variable), !dbg !35
+  ret void, !dbg !36
+}
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!38}
+
+!0 = metadata !{i32 786449, metadata !37, i32 4, metadata !"clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !24,  metadata !24, metadata !""} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{i32 0}
+!2 = metadata !{metadata !3, metadata !18, metadata !19, metadata !20}
+!3 = metadata !{i32 786478, metadata !4, null, metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 9, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !12, metadata !1, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
+!4 = metadata !{i32 786473, metadata !37} ; [ DW_TAG_file_type ]
+!5 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{null, metadata !7}
+!7 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
+!8 = metadata !{i32 786451, metadata !37, null, metadata !"C", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!9 = metadata !{metadata !10, metadata !12, metadata !14}
+!10 = metadata !{i32 786445, metadata !37, metadata !8, metadata !"static_member_variable", i32 4, i64 0, i64 0, i64 0, i32 4096, metadata !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
+!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = metadata !{i32 786478, metadata !4, metadata !8, metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 2, metadata !5, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !13, i32 2} ; [ DW_TAG_subprogram ] [line 2] [member_function]
+!13 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!14 = metadata !{i32 786478, metadata !4, metadata !8, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 3, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !17, i32 3} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
+!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !11}
+!17 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!18 = metadata !{i32 786478, metadata !4, null, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 13, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
+!19 = metadata !{i32 786478, metadata !4, metadata !4, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", i32 19, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !1, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
+!20 = metadata !{i32 786478, metadata !4, metadata !21, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", i32 24, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1, i32 24} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
+!21 = metadata !{i32 786489, null, metadata !"ns", metadata !4, i32 23} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
+!22 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{null}
+!24 = metadata !{metadata !25, metadata !26, metadata !27}
+!25 = metadata !{i32 786484, i32 0, metadata !8, metadata !"static_member_variable", metadata !"static_member_variable", metadata !"_ZN1C22static_member_variableE", metadata !4, i32 7, metadata !11, i32 0, i32 1, i32* @_ZN1C22static_member_variableE, metadata !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
+!26 = metadata !{i32 786484, i32 0, null, metadata !"global_variable", metadata !"global_variable", metadata !"", metadata !4, i32 17, metadata !8, i32 0, i32 1, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 17] [def]
+!27 = metadata !{i32 786484, i32 0, metadata !21, metadata !"global_namespace_variable", metadata !"global_namespace_variable", metadata !"_ZN2ns25global_namespace_variableE", metadata !4, i32 27, metadata !11, i32 0, i32 1, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 27] [def]
+!28 = metadata !{i32 786689, metadata !3, metadata !"this", metadata !4, i32 16777225, metadata !29, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 9]
+!29 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from C]
+!30 = metadata !{i32 9, i32 0, metadata !3, null}
+!31 = metadata !{i32 10, i32 0, metadata !3, null}
+!32 = metadata !{i32 11, i32 0, metadata !3, null}
+!33 = metadata !{i32 14, i32 0, metadata !18, null}
+!34 = metadata !{i32 20, i32 0, metadata !19, null}
+!35 = metadata !{i32 25, i32 0, metadata !20, null}
+!36 = metadata !{i32 26, i32 0, metadata !20, null}
+!37 = metadata !{metadata !"dwarf-public-names.cpp", metadata !"/usr2/kparzysz/s.hex/t"}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/dwarf-pubnames-split.ll b/test/DebugInfo/X86/dwarf-pubnames-split.ll
new file mode 100644
index 0000000..131e5aa
--- /dev/null
+++ b/test/DebugInfo/X86/dwarf-pubnames-split.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -split-dwarf=Enable %s -o - | FileCheck %s
+; Derived from:
+
+; int main (void) {
+;    return 0;
+; }
+
+; Check that we get a symbol off of the debug_info section when using split dwarf and pubnames.
+
+; CHECK: .Lpubtypes_begin0:
+; CHECK-NEXT: .short    2                       # DWARF Version
+; CHECK-NEXT: .long     .L.debug_info_begin0    # Offset of Compilation Unit Info
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  ret i32 0, !dbg !10
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !11}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 189287) (llvm/trunk 189296)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
+!10 = metadata !{i32 2, i32 0, metadata !4, null}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/earlydup-crash.ll b/test/DebugInfo/X86/earlydup-crash.ll
index 1e66264..b5dc01e 100644
--- a/test/DebugInfo/X86/earlydup-crash.ll
+++ b/test/DebugInfo/X86/earlydup-crash.ll
@@ -43,12 +43,13 @@ bb33:                                             ; preds = %bb31, %bb22, %bb18,
 declare void @foobar(i32)
 
 !llvm.dbg.cu = !{!4}
+!llvm.module.flags = !{!47}
 !0 = metadata !{i32 590080, metadata !1, metadata !"frname_len", metadata !3, i32 517, metadata !38, i32 0} ; [ DW_TAG_auto_variable ]
 !1 = metadata !{i32 589835, metadata !44, metadata !2, i32 515, i32 0, i32 19} ; [ DW_TAG_lexical_block ]
 !2 = metadata !{i32 589870, metadata !44, null, metadata !"framework_construct_pathname", metadata !"framework_construct_pathname", metadata !"", i32 515, metadata !5, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8* (i8*, %struct.cpp_dir*)* @framework_construct_pathname, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !3 = metadata !{i32 589865, metadata !44}  ; [ DW_TAG_file_type ]
 !4 = metadata !{i32 589841, metadata !44, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !46, metadata !46, metadata !45, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 589845, metadata !44, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{i32 589845, metadata !44, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7, metadata !9, metadata !11}
 !7 = metadata !{i32 589839, metadata !44, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
 !8 = metadata !{i32 589860, metadata !44, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
@@ -57,7 +58,7 @@ declare void @foobar(i32)
 !11 = metadata !{i32 589839, metadata !44, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
 !12 = metadata !{i32 589846, metadata !41, metadata !13, metadata !"cpp_dir", i32 45, i64 0, i64 0, i64 0, i32 0, metadata !14} ; [ DW_TAG_typedef ]
 !13 = metadata !{i32 589865, metadata !41} ; [ DW_TAG_file_type ]
-!14 = metadata !{i32 589843, metadata !41, metadata !3, metadata !"cpp_dir", i32 43, i64 352, i64 32, i64 0, i32 0, null, metadata !15, i32 0, null} ; [ DW_TAG_structure_type ]
+!14 = metadata !{i32 589843, metadata !41, metadata !3, metadata !"cpp_dir", i32 43, i64 352, i64 32, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [cpp_dir] [line 43, size 352, align 32, offset 0] [def] [from ]
 !15 = metadata !{metadata !16, metadata !18, metadata !19, metadata !21, metadata !23, metadata !25, metadata !27, metadata !29, metadata !33, metadata !36}
 !16 = metadata !{i32 589837, metadata !41, metadata !14, metadata !"next", i32 572, i64 32, i64 32, i64 0, i32 0, metadata !17} ; [ DW_TAG_member ]
 !17 = metadata !{i32 589839, metadata !44, metadata !3, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
@@ -90,3 +91,4 @@ declare void @foobar(i32)
 !44 = metadata !{metadata !"darwin-c.c", metadata !"/Users/espindola/llvm/build-llvm-gcc/gcc/../../llvm-gcc-4.2/gcc/config"}
 !45 = metadata !{metadata !2}
 !46 = metadata !{i32 0}
+!47 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/eh_symbol.ll b/test/DebugInfo/X86/eh_symbol.ll
index 9ab95b1..172ca92 100644
--- a/test/DebugInfo/X86/eh_symbol.ll
+++ b/test/DebugInfo/X86/eh_symbol.ll
@@ -9,14 +9,16 @@ entry:
 }
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!9}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !6, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 ()* @f, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 589870, metadata !6, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @f, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [f]
 !1 = metadata !{i32 589865, metadata !6} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 589841, metadata !6, i32 12, metadata !"clang version 3.0 ()", i1 true, metadata !"", i32 0, metadata !7, metadata !7, metadata !8, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !6, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 589845, metadata !6, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 589860, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build"}
 !7 = metadata !{i32 0}
 !8 = metadata !{metadata !0}
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll
index 7bc532e..7b38fde 100644
--- a/test/DebugInfo/X86/elf-names.ll
+++ b/test/DebugInfo/X86/elf-names.ll
@@ -3,12 +3,15 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck --check-prefix=CHECK-DIS %s
 
 ; CHECK: 0x0000000b: DW_TAG_compile_unit
-; CHECK: 0x00000012:   DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000035] = "foo.cpp")
-; CHECK: 0x0000003c:   DW_TAG_class_type
-; CHECK: 0x0000003d:     DW_AT_name [DW_FORM_strp]       ( .debug_str[0x0000006d] = "D")
-; CHECK: 0x00000044:     DW_TAG_member
-; CHECK: 0x00000045:       DW_AT_name [DW_FORM_strp]     ( .debug_str[0x0000005d] = "c1")
-; CHECK: 0x0000008d:       DW_AT_artificial [DW_FORM_flag_present]       (true)
+; CHECK:               DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000035] = "foo.cpp")
+; CHECK: 0x{{[0-9a-f]+}}:   DW_TAG_class_type
+; CHECK:                 DW_AT_name [DW_FORM_strp]       ( .debug_str[0x{{[0-9a-f]+}}] = "D")
+; CHECK: 0x{{[0-9a-f]+}}:     DW_TAG_member
+; CHECK:                   DW_AT_name [DW_FORM_strp]     ( .debug_str[0x{{[0-9a-f]+}}] = "c1")
+; CHECK: DW_TAG_subprogram
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]     ( .debug_str[0x{{[0-9a-f]+}}] = "D")
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_AT_artificial [DW_FORM_flag_present]       (true)
 
 ; CHECK-DIS: [artificial]
 
@@ -57,16 +60,17 @@ entry:
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!54}
 
 !0 = metadata !{i32 786449, metadata !53, i32 4, metadata !"clang version 3.2 (trunk 167506) (llvm/trunk 167505)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !31}
 !5 = metadata !{i32 786478, metadata !6, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2Ev", i32 12, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*)* @_ZN1DC2Ev, null, metadata !17, metadata !27, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [D]
 !6 = metadata !{i32 786473, metadata !53} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
-!10 = metadata !{i32 786434, metadata !53, null, metadata !"D", i32 1, i64 128, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null} ; [ DW_TAG_class_type ] [D] [line 1, size 128, align 32, offset 0] [from ]
+!9 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!10 = metadata !{i32 786434, metadata !53, null, metadata !"D", i32 1, i64 128, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_class_type ] [D] [line 1, size 128, align 32, offset 0] [def] [from ]
 !11 = metadata !{metadata !12, metadata !14, metadata !15, metadata !16, metadata !17, metadata !20}
 !12 = metadata !{i32 786445, metadata !53, metadata !10, metadata !"c1", i32 6, i64 32, i64 32, i64 0, i32 1, metadata !13} ; [ DW_TAG_member ] [c1] [line 6, size 32, align 32, offset 0] [private] [from int]
 !13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
@@ -77,7 +81,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !18 = metadata !{metadata !19}
 !19 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
 !20 = metadata !{i32 786478, metadata !6, metadata !10, metadata !"D", metadata !"D", metadata !"", i32 4, metadata !21, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !25, i32 4} ; [ DW_TAG_subprogram ] [line 4] [D]
-!21 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !22 = metadata !{null, metadata !9, metadata !23}
 !23 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !24} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
 !24 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from D]
@@ -107,3 +111,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !51 = metadata !{i32 23, i32 0, metadata !48, null}
 !52 = metadata !{i32 24, i32 0, metadata !48, null}
 !53 = metadata !{metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo"}
+!54 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/empty-and-one-elem-array.ll b/test/DebugInfo/X86/empty-and-one-elem-array.ll
index ce3035e..a3a08f0 100644
--- a/test/DebugInfo/X86/empty-and-one-elem-array.ll
+++ b/test/DebugInfo/X86/empty-and-one-elem-array.ll
@@ -28,61 +28,68 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 ; An empty array should not have an AT_upper_bound attribute. But an array of 1
 ; should.
 
-; CHECK:      0x00000074:   DW_TAG_base_type [5]  
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[0x00000043] = "int")
+; CHECK:      DW_TAG_base_type
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[{{.*}}] = "int")
 ; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1]   (0x05)
 ; CHECK-NEXT: DW_AT_byte_size [DW_FORM_data1]  (0x04)
 
+; int foo::b[1]:
+; CHECK: DW_TAG_structure_type
+; CHECK: DW_AT_name{{.*}}"foo"
+; CHECK:      DW_TAG_member
+; CHECK:      DW_TAG_member
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[{{.*}}] = "b")
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
+
 ; int[1]:
-; CHECK:      0x00000082:   DW_TAG_array_type [7] *
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]    (cu + 0x0074 => {0x00000074})
-; CHECK:      0x00000087:     DW_TAG_subrange_type [8]
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x007b => {0x0000007b})
+; CHECK:      DW_TAG_array_type [{{.*}}] *
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
+; CHECK:      DW_TAG_subrange_type [{{.*}}]
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
 ; CHECK-NEXT: DW_AT_upper_bound [DW_FORM_data1]  (0x00)
 
-; int foo::b[1]:
-; CHECK:      0x000000a5:     DW_TAG_member [10]
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[0x00000050] = "b")
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x0082 => {0x00000082})
+; int bar::b[0]:
+; CHECK: DW_TAG_structure_type
+; CHECK: DW_AT_name{{.*}}"bar"
+; CHECK:      DW_TAG_member
+; CHECK:      DW_TAG_member
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[{{.*}}] = "b")
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
 
 ; int[0]:
-; CHECK:      0x000000b5:   DW_TAG_array_type [7] *
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]    (cu + 0x0074 => {0x00000074})
-; CHECK:      0x000000ba:     DW_TAG_subrange_type [11]
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x007b => {0x0000007b})
+; CHECK:      DW_TAG_array_type [{{.*}}] *
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
+; CHECK:      DW_TAG_subrange_type [11]
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]
 ; CHECK-NOT:  DW_AT_upper_bound
 
-; int bar::b[0]:
-; CHECK:      0x000000d7:     DW_TAG_member [10]
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[0x00000050] = "b")
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x00b5 => {0x000000b5})
-
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!33}
 
 !0 = metadata !{i32 786449, metadata !32, i32 12, metadata !"clang version 3.3 (trunk 169136)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/test.c] [DW_LANG_C99]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"func", metadata !"func", metadata !"", i32 11, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @func, null, null, metadata !1, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [func]
 !6 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 786688, metadata !11, metadata !"my_foo", metadata !6, i32 12, metadata !12, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [my_foo] [line 12]
 !11 = metadata !{i32 786443, metadata !6, metadata !5, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/Volumes/Sandbox/llvm/test.c]
-!12 = metadata !{i32 786451, metadata !32, null, metadata !"foo", i32 1, i64 64, i64 32, i32 0, i32 0, null, metadata !13, i32 0, i32 0, i32 0} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [from ]
+!12 = metadata !{i32 786451, metadata !32, null, metadata !"foo", i32 1, i64 64, i64 32, i32 0, i32 0, null, metadata !13, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 64, align 32, offset 0] [def] [from ]
 !13 = metadata !{metadata !14, metadata !15}
 !14 = metadata !{i32 786445, metadata !32, metadata !12, metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
 !15 = metadata !{i32 786445, metadata !32, metadata !12, metadata !"b", i32 3, i64 32, i64 32, i64 32, i32 0, metadata !16} ; [ DW_TAG_member ] [b] [line 3, size 32, align 32, offset 32] [from ]
-!16 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 32, i32 0, i32 0, metadata !9, metadata !17, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 32, align 32, offset 0] [from int]
+!16 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 32, i32 0, i32 0, metadata !9, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 32, offset 0] [from int]
 !17 = metadata !{metadata !18}
 !18 = metadata !{i32 786465, i64 0, i64 1} ; [ DW_TAG_subrange_type ] [0, 1]
 !19 = metadata !{i32 12, i32 0, metadata !11, null}
 !20 = metadata !{i32 786688, metadata !11, metadata !"my_bar", metadata !6, i32 13, metadata !21, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [my_bar] [line 13]
-!21 = metadata !{i32 786451, metadata !32, null, metadata !"bar", i32 6, i64 32, i64 32, i32 0, i32 0, null, metadata !22, i32 0, i32 0, i32 0} ; [ DW_TAG_structure_type ] [bar] [line 6, size 32, align 32, offset 0] [from ]
+!21 = metadata !{i32 786451, metadata !32, null, metadata !"bar", i32 6, i64 32, i64 32, i32 0, i32 0, null, metadata !22, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [bar] [line 6, size 32, align 32, offset 0] [def] [from ]
 !22 = metadata !{metadata !23, metadata !24}
 !23 = metadata !{i32 786445, metadata !32, metadata !21, metadata !"a", i32 7, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [a] [line 7, size 32, align 32, offset 0] [from int]
 !24 = metadata !{i32 786445, metadata !32, metadata !21, metadata !"b", i32 8, i64 0, i64 32, i64 32, i32 0, metadata !25} ; [ DW_TAG_member ] [b] [line 8, size 0, align 32, offset 32] [from ]
-!25 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !9, metadata !26, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!25 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !9, metadata !26, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
 !26 = metadata !{metadata !27}
 !27 = metadata !{i32 786465, i64 0, i64 0} ; [ DW_TAG_subrange_type ] [0, 0]
 !28 = metadata !{i32 13, i32 0, metadata !11, null}
@@ -90,3 +97,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !30 = metadata !{i32 16, i32 0, metadata !11, null}
 !31 = metadata !{i32 17, i32 0, metadata !11, null}
 !32 = metadata !{metadata !"test.c", metadata !"/Volumes/Sandbox/llvm"}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/empty-array.ll b/test/DebugInfo/X86/empty-array.ll
index 1f46281..2436467 100644
--- a/test/DebugInfo/X86/empty-array.ll
+++ b/test/DebugInfo/X86/empty-array.ll
@@ -6,40 +6,44 @@
 
 @a = global %class.A zeroinitializer, align 4
 
-; CHECK:      0x0000002d:   DW_TAG_base_type [3]  
-; CHECK-NEXT: DW_AT_name
-; CHECK-NEXT: DW_AT_byte_size [DW_FORM_data1]  (0x04)
-; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1]   (0x05)
+; CHECK: DW_TAG_class_type
+; CHECK:      DW_TAG_member
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[0x{{[0-9a-f]*}}] = "x")
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x{{[0-9a-f]*}} => {[[ARRAY:0x[0-9a-f]*]]})
 
-; CHECK:      0x00000034:   DW_TAG_array_type [4] *
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]    (cu + 0x0026 => {0x00000026})
+; CHECK:      [[ARRAY]]: DW_TAG_array_type [{{.*}}] *
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]    (cu + 0x{{[0-9a-f]*}} => {[[BASETYPE:0x[0-9a-f]*]]})
 
-; CHECK:      0x00000039:     DW_TAG_subrange_type [5]
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x002d => {0x0000002d})
+; CHECK:      DW_TAG_subrange_type
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x{{[0-9a-f]*}} => {[[BASE2:0x[0-9a-f]*]]})
 ; CHECK-NOT:  DW_AT_upper_bound
 
-; CHECK:      DW_TAG_member [8]
-; CHECK-NEXT: DW_AT_name [DW_FORM_strp]  ( .debug_str[0x0000003f] = "x")
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]  (cu + 0x0034 => {0x00000034})
+; CHECK: [[BASETYPE]]: DW_TAG_base_type
+; CHECK: [[BASE2]]: DW_TAG_base_type
+; CHECK-NEXT: DW_AT_name
+; CHECK-NEXT: DW_AT_byte_size [DW_FORM_data1]  (0x04)
+; CHECK-NEXT: DW_AT_encoding [DW_FORM_data1]   (0x05)
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21}
 
 !0 = metadata !{i32 786449, metadata !20, i32 4, metadata !"clang version 3.3 (trunk 169136)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
 !6 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786434, metadata !20, null, metadata !"A", i32 1, i64 0, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [from ]
+!7 = metadata !{i32 786434, metadata !20, null, metadata !"A", i32 1, i64 0, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !14}
 !9 = metadata !{i32 786445, metadata !20, metadata !7, metadata !"x", i32 1, i64 0, i64 0, i64 0, i32 1, metadata !10} ; [ DW_TAG_member ] [x] [line 1, size 0, align 0, offset 0] [private] [from ]
-!10 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !11, metadata !12, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!10 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
 !11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{metadata !13}
 !13 = metadata !{i32 786465, i64 0, i64 -1} ; [ DW_TAG_subrange_type ] [unbound]
 !14 = metadata !{i32 786478, metadata !6, metadata !7, metadata !"A", metadata !"A", metadata !"", i32 1, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !18, i32 1} ; [ DW_TAG_subprogram ] [line 1] [A]
-!15 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17}
-!17 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!17 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
 !18 = metadata !{metadata !19}
 !19 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
 !20 = metadata !{metadata !"t.cpp", metadata !"/Volumes/Sandbox/llvm"}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/ending-run.ll b/test/DebugInfo/X86/ending-run.ll
index b55ccc4..ae17fd0 100644
--- a/test/DebugInfo/X86/ending-run.ll
+++ b/test/DebugInfo/X86/ending-run.ll
@@ -27,13 +27,14 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!20}
 
 !0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.1 (trunk 153921) (llvm/trunk 153916)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !19, metadata !6, metadata !"callee", metadata !"callee", metadata !"", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 (i32)* @callee, null, null, metadata !10, i32 7} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{metadata !11}
@@ -46,3 +47,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !17 = metadata !{i32 8, i32 18, metadata !15, null}
 !18 = metadata !{i32 9, i32 5, metadata !15, null}
 !19 = metadata !{metadata !"ending-run.c", metadata !"/Users/echristo/tmp"}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/enum-class.ll b/test/DebugInfo/X86/enum-class.ll
index af6129c..a31e254 100644
--- a/test/DebugInfo/X86/enum-class.ll
+++ b/test/DebugInfo/X86/enum-class.ll
@@ -6,19 +6,20 @@
 @c = global i32 0, align 4
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!23}
 
 !0 = metadata !{i32 786449, metadata !22, i32 4, metadata !"clang version 3.2 (trunk 157269) (llvm/trunk 157264)", i1 false, metadata !"", i32 0, metadata !1, metadata !15, metadata !15, metadata !17,  metadata !17, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !3, metadata !8, metadata !12}
-!3 = metadata !{i32 786436, metadata !4, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, metadata !5, metadata !6, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!3 = metadata !{i32 786436, metadata !4, null, metadata !"A", i32 1, i64 32, i64 32, i32 0, i32 0, metadata !5, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [A] [line 1, size 32, align 32, offset 0] [def] [from int]
 !4 = metadata !{i32 786473, metadata !22} ; [ DW_TAG_file_type ]
 !5 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{metadata !7}
 !7 = metadata !{i32 786472, metadata !"A1", i64 1} ; [ DW_TAG_enumerator ]
-!8 = metadata !{i32 786436, metadata !4, null, metadata !"B", i32 2, i64 64, i64 64, i32 0, i32 0, metadata !9, metadata !10, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!8 = metadata !{i32 786436, metadata !4, null, metadata !"B", i32 2, i64 64, i64 64, i32 0, i32 0, metadata !9, metadata !10, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [B] [line 2, size 64, align 64, offset 0] [def] [from long unsigned int]
 !9 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
 !10 = metadata !{metadata !11}
 !11 = metadata !{i32 786472, metadata !"B1", i64 1} ; [ DW_TAG_enumerator ]
-!12 = metadata !{i32 786436, metadata !4, null, metadata !"C", i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!12 = metadata !{i32 786436, metadata !4, null, metadata !"C", i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [C] [line 3, size 32, align 32, offset 0] [def] [from ]
 !13 = metadata !{metadata !14}
 !14 = metadata !{i32 786472, metadata !"C1", i64 1} ; [ DW_TAG_enumerator ]
 !15 = metadata !{i32 0}
@@ -28,16 +29,17 @@
 !21 = metadata !{i32 786484, i32 0, null, metadata !"c", metadata !"c", metadata !"", metadata !4, i32 6, metadata !12, i32 0, i32 1, i32* @c, null} ; [ DW_TAG_variable ]
 !22 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo/tmp"}
 
-; CHECK: DW_TAG_enumeration_type [3]
-; CHECK: DW_AT_type [DW_FORM_ref4]      (cu + 0x0026 => {0x00000026})
-; CHECK: DW_AT_enum_class [DW_FORM_flag]    (0x01)
+; CHECK: DW_TAG_enumeration_type [{{.*}}]
+; CHECK: DW_AT_type [DW_FORM_ref4]
+; CHECK: DW_AT_enum_class [DW_FORM_flag_present] (true)
 ; CHECK: DW_AT_name [DW_FORM_strp]      ( .debug_str[{{.*}}] = "A")
 
-; CHECK: DW_TAG_enumeration_type [3] *
-; CHECK: DW_AT_type [DW_FORM_ref4]      (cu + 0x0057 => {0x00000057})
-; CHECK: DW_AT_enum_class [DW_FORM_flag]    (0x01)
+; CHECK: DW_TAG_enumeration_type [{{.*}}] *
+; CHECK: DW_AT_type [DW_FORM_ref4]
+; CHECK: DW_AT_enum_class [DW_FORM_flag_present] (true)
 ; CHECK: DW_AT_name [DW_FORM_strp]          ( .debug_str[{{.*}}] = "B")
 
 ; CHECK: DW_TAG_enumeration_type [6]
 ; CHECK-NOT: DW_AT_enum_class
 ; CHECK: DW_AT_name [DW_FORM_strp]      ( .debug_str[{{.*}}] = "C")
+!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/enum-fwd-decl.ll b/test/DebugInfo/X86/enum-fwd-decl.ll
index 03f590c..6bfb930 100644
--- a/test/DebugInfo/X86/enum-fwd-decl.ll
+++ b/test/DebugInfo/X86/enum-fwd-decl.ll
@@ -4,16 +4,18 @@
 @e = global i16 0, align 2
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9}
 
 !0 = metadata !{i32 786449, metadata !8, i32 4, metadata !"clang version 3.2 (trunk 165274) (llvm/trunk 165272)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/foo.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786484, i32 0, null, metadata !"e", metadata !"e", metadata !"", metadata !6, i32 2, metadata !7, i32 0, i32 1, i16* @e, null} ; [ DW_TAG_variable ] [e] [line 2] [def]
 !6 = metadata !{i32 786473, metadata !8} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786436, metadata !8, null, metadata !"E", i32 1, i64 16, i64 16, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_enumeration_type ] [E] [line 1, size 16, align 16, offset 0] [fwd] [from ]
+!7 = metadata !{i32 786436, metadata !8, null, metadata !"E", i32 1, i64 16, i64 16, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [E] [line 1, size 16, align 16, offset 0] [decl] [from ]
 !8 = metadata !{metadata !"foo.cpp", metadata !"/tmp"}
 
 ; CHECK: DW_TAG_enumeration_type
 ; CHECK-NEXT: DW_AT_name
 ; CHECK-NEXT: DW_AT_byte_size
 ; CHECK-NEXT: DW_AT_declaration
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/fission-cu.ll b/test/DebugInfo/X86/fission-cu.ll
index 8ad3c2d..06408d7 100644
--- a/test/DebugInfo/X86/fission-cu.ll
+++ b/test/DebugInfo/X86/fission-cu.ll
@@ -5,6 +5,7 @@
 @a = common global i32 0, align 4
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9}
 
 !0 = metadata !{i32 786449, metadata !8, i32 12, metadata !"clang version 3.3 (trunk 169021) (llvm/trunk 169020)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !"baz.dwo"} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/baz.c] [DW_LANG_C99]
 !1 = metadata !{i32 0}
@@ -23,20 +24,20 @@
 ; CHECK: Abbrev table for offset: 0x00000000
 ; CHECK: [1] DW_TAG_compile_unit DW_CHILDREN_no
 ; CHECK: DW_AT_GNU_dwo_name      DW_FORM_strp
-; CHECK: DW_AT_GNU_dwo_id        DW_FORM_data8
 ; CHECK: DW_AT_GNU_addr_base     DW_FORM_sec_offset
 ; CHECK: DW_AT_low_pc    DW_FORM_addr
 ; CHECK: DW_AT_stmt_list DW_FORM_sec_offset
 ; CHECK: DW_AT_comp_dir  DW_FORM_strp
+; CHECK: DW_AT_GNU_dwo_id        DW_FORM_data8
 
 ; CHECK: .debug_info contents:
 ; CHECK: DW_TAG_compile_unit
 ; CHECK: DW_AT_GNU_dwo_name [DW_FORM_strp] ( .debug_str[0x00000000] = "baz.dwo")
-; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8]  (0x0000000000000000)
 ; CHECK: DW_AT_GNU_addr_base [DW_FORM_sec_offset]                   (0x00000000)
 ; CHECK: DW_AT_low_pc [DW_FORM_addr]       (0x0000000000000000)
 ; CHECK: DW_AT_stmt_list [DW_FORM_sec_offset]   (0x00000000)
 ; CHECK: DW_AT_comp_dir [DW_FORM_strp]     ( .debug_str[0x00000008] = "/usr/local/google/home/echristo/tmp")
+; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8]  (0x0000000000000000)
 
 ; CHECK: .debug_str contents:
 ; CHECK: 0x00000000: "baz.dwo"
@@ -54,12 +55,7 @@
 ; CHECK-NOT: DW_AT_comp_dir
 ; CHECK: DW_AT_GNU_dwo_id        DW_FORM_data8
 
-; CHECK: [2] DW_TAG_base_type    DW_CHILDREN_no
-; CHECK: DW_AT_name      DW_FORM_GNU_str_index
-; CHECK: DW_AT_encoding  DW_FORM_data1
-; CHECK: DW_AT_byte_size DW_FORM_data1
-
-; CHECK: [3] DW_TAG_variable     DW_CHILDREN_no
+; CHECK: [2] DW_TAG_variable     DW_CHILDREN_no
 ; CHECK: DW_AT_name      DW_FORM_GNU_str_index
 ; CHECK: DW_AT_type      DW_FORM_ref4
 ; CHECK: DW_AT_external  DW_FORM_flag_present
@@ -67,6 +63,11 @@
 ; CHECK: DW_AT_decl_line DW_FORM_data1
 ; CHECK: DW_AT_location  DW_FORM_block1
 
+; CHECK: [3] DW_TAG_base_type    DW_CHILDREN_no
+; CHECK: DW_AT_name      DW_FORM_GNU_str_index
+; CHECK: DW_AT_encoding  DW_FORM_data1
+; CHECK: DW_AT_byte_size DW_FORM_data1
+
 ; Check that the rest of the compile units have information.
 ; CHECK: .debug_info.dwo contents:
 ; CHECK: DW_TAG_compile_unit
@@ -77,15 +78,15 @@
 ; CHECK-NOT: DW_AT_stmt_list
 ; CHECK-NOT: DW_AT_comp_dir
 ; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8]  (0x0000000000000000)
-; CHECK: DW_TAG_base_type
-; CHECK: DW_AT_name [DW_FORM_GNU_str_index]     ( indexed (00000003) string = "int")
 ; CHECK: DW_TAG_variable
 ; CHECK: DW_AT_name [DW_FORM_GNU_str_index]     ( indexed (00000002) string = "a")
-; CHECK: DW_AT_type [DW_FORM_ref4]       (cu + 0x0018 => {0x00000018})
+; CHECK: DW_AT_type [DW_FORM_ref4]       (cu + 0x{{[0-9a-f]*}} => {[[TYPE:0x[0-9a-f]*]]})
 ; CHECK: DW_AT_external [DW_FORM_flag_present]   (true)
 ; CHECK: DW_AT_decl_file [DW_FORM_data1] (0x01)
 ; CHECK: DW_AT_decl_line [DW_FORM_data1] (0x01)
 ; CHECK: DW_AT_location [DW_FORM_block1] (<0x02> fb 00 )
+; CHECK: [[TYPE]]: DW_TAG_base_type
+; CHECK: DW_AT_name [DW_FORM_GNU_str_index]     ( indexed (00000003) string = "int")
 
 
 ; CHECK: .debug_str.dwo contents:
@@ -110,3 +111,4 @@
 ; OBJ-NEXT: R_X86_64_32 .debug_line
 ; OBJ-NEXT: R_X86_64_32 .debug_str
 ; OBJ-NEXT: }
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/fission-hash.ll b/test/DebugInfo/X86/fission-hash.ll
new file mode 100644
index 0000000..d3e46a9
--- /dev/null
+++ b/test/DebugInfo/X86/fission-hash.ll
@@ -0,0 +1,16 @@
+; RUN: llc -split-dwarf=Enable -generate-cu-hash -O0 %s -mtriple=x86_64-unknown-linux-gnu -filetype=obj -o %t
+; RUN: llvm-dwarfdump -debug-dump=all %t | FileCheck %s
+
+; The source is an empty file.
+
+; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x0c1e629c9e5ada4f)
+; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x0c1e629c9e5ada4f)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 188230) (llvm/trunk 188234)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !"foo.dwo"} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
+!2 = metadata !{i32 0}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
+!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/fission-ranges.ll b/test/DebugInfo/X86/fission-ranges.ll
new file mode 100644
index 0000000..0a10079
--- /dev/null
+++ b/test/DebugInfo/X86/fission-ranges.ll
@@ -0,0 +1,151 @@
+; RUN: llc -split-dwarf=Enable -O0 %s -mtriple=x86_64-unknown-linux-gnu -filetype=obj -o %t
+; RUN: llvm-dwarfdump -debug-dump=all %t | FileCheck %s
+
+; From the code:
+
+; extern int c;
+; static void foo (int p)
+; {
+;   int a, b; 
+;   unsigned int d, e;
+
+;   for (a = 0; a < 30; a++)
+;     for (d = 0; d < 30; d++)
+;       for (b = 0; b < 30; b++)
+;         for (e = 0; e < 30; e++)
+;           {
+;             int *w = &c; 
+;             *w &= p; 
+;           }
+; }
+
+; void 
+; bar ()
+; {
+;   foo (1);
+; }
+
+; compiled with:
+
+; clang -g -S -gsplit-dwarf -O1 small.c
+
+; CHECK: DW_AT_GNU_ranges_base
+
+@c = external global i32
+
+; Function Attrs: nounwind uwtable
+define void @bar() #0 {
+entry:
+  tail call fastcc void @foo(), !dbg !27
+  ret void, !dbg !28
+}
+
+; Function Attrs: nounwind uwtable
+define internal fastcc void @foo() #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata !29, i64 0, metadata !13), !dbg !30
+  tail call void @llvm.dbg.value(metadata !2, i64 0, metadata !14), !dbg !31
+  %c.promoted9 = load i32* @c, align 4, !dbg !32, !tbaa !33
+  br label %for.cond1.preheader, !dbg !31
+
+for.cond1.preheader:                              ; preds = %for.inc16, %entry
+  %and.lcssa.lcssa.lcssa10 = phi i32 [ %c.promoted9, %entry ], [ %and, %for.inc16 ]
+  %a.08 = phi i32 [ 0, %entry ], [ %inc17, %for.inc16 ]
+  br label %for.cond4.preheader, !dbg !37
+
+for.cond4.preheader:                              ; preds = %for.inc13, %for.cond1.preheader
+  %and.lcssa.lcssa7 = phi i32 [ %and.lcssa.lcssa.lcssa10, %for.cond1.preheader ], [ %and, %for.inc13 ]
+  %d.06 = phi i32 [ 0, %for.cond1.preheader ], [ %inc14, %for.inc13 ]
+  br label %for.cond7.preheader, !dbg !38
+
+for.cond7.preheader:                              ; preds = %for.inc10, %for.cond4.preheader
+  %and.lcssa5 = phi i32 [ %and.lcssa.lcssa7, %for.cond4.preheader ], [ %and, %for.inc10 ]
+  %b.03 = phi i32 [ 0, %for.cond4.preheader ], [ %inc11, %for.inc10 ]
+  br label %for.body9, !dbg !39
+
+for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
+  %and2 = phi i32 [ %and.lcssa5, %for.cond7.preheader ], [ %and, %for.body9 ], !dbg !40
+  %e.01 = phi i32 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]
+  tail call void @llvm.dbg.value(metadata !41, i64 0, metadata !19), !dbg !40
+  %and = and i32 %and2, 1, !dbg !32
+  %inc = add i32 %e.01, 1, !dbg !39
+  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !18), !dbg !39
+  %exitcond = icmp eq i32 %inc, 30, !dbg !39
+  br i1 %exitcond, label %for.inc10, label %for.body9, !dbg !39
+
+for.inc10:                                        ; preds = %for.body9
+  %inc11 = add nsw i32 %b.03, 1, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i32 %inc11}, i64 0, metadata !15), !dbg !38
+  %exitcond11 = icmp eq i32 %inc11, 30, !dbg !38
+  br i1 %exitcond11, label %for.inc13, label %for.cond7.preheader, !dbg !38
+
+for.inc13:                                        ; preds = %for.inc10
+  %inc14 = add i32 %d.06, 1, !dbg !37
+  tail call void @llvm.dbg.value(metadata !{i32 %inc14}, i64 0, metadata !16), !dbg !37
+  %exitcond12 = icmp eq i32 %inc14, 30, !dbg !37
+  br i1 %exitcond12, label %for.inc16, label %for.cond4.preheader, !dbg !37
+
+for.inc16:                                        ; preds = %for.inc13
+  %inc17 = add nsw i32 %a.08, 1, !dbg !31
+  tail call void @llvm.dbg.value(metadata !{i32 %inc17}, i64 0, metadata !14), !dbg !31
+  %exitcond13 = icmp eq i32 %inc17, 30, !dbg !31
+  br i1 %exitcond13, label %for.end18, label %for.cond1.preheader, !dbg !31
+
+for.end18:                                        ; preds = %for.inc16
+  store i32 %and, i32* @c, align 4, !dbg !32, !tbaa !33
+  ret void, !dbg !42
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #1
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!26, !43}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 191700) (llvm/trunk 191710)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !"small.dwo"} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/small.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"small.c", metadata !"/usr/local/google/home/echristo/tmp"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4, metadata !8}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"", i32 18, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, void ()* @bar, null, null, metadata !2, i32 19} ; [ DW_TAG_subprogram ] [line 18] [def] [scope 19] [bar]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/small.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null}
+!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !9, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @foo, null, null, metadata !12, i32 3} ; [ DW_TAG_subprogram ] [line 2] [local] [def] [scope 3] [foo]
+!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{null, metadata !11}
+!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16, metadata !18, metadata !19}
+!13 = metadata !{i32 786689, metadata !8, metadata !"p", metadata !5, i32 16777218, metadata !11, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 2]
+!14 = metadata !{i32 786688, metadata !8, metadata !"a", metadata !5, i32 4, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 4]
+!15 = metadata !{i32 786688, metadata !8, metadata !"b", metadata !5, i32 4, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 4]
+!16 = metadata !{i32 786688, metadata !8, metadata !"d", metadata !5, i32 5, metadata !17, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 5]
+!17 = metadata !{i32 786468, null, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
+!18 = metadata !{i32 786688, metadata !8, metadata !"e", metadata !5, i32 5, metadata !17, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [e] [line 5]
+!19 = metadata !{i32 786688, metadata !20, metadata !"w", metadata !5, i32 12, metadata !25, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [w] [line 12]
+!20 = metadata !{i32 786443, metadata !1, metadata !21, i32 11, i32 0, i32 4} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!21 = metadata !{i32 786443, metadata !1, metadata !22, i32 10, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!22 = metadata !{i32 786443, metadata !1, metadata !23, i32 9, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!23 = metadata !{i32 786443, metadata !1, metadata !24, i32 8, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!24 = metadata !{i32 786443, metadata !1, metadata !8, i32 7, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/small.c]
+!25 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!26 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!27 = metadata !{i32 20, i32 0, metadata !4, null}
+!28 = metadata !{i32 21, i32 0, metadata !4, null}
+!29 = metadata !{i32 1}
+!30 = metadata !{i32 2, i32 0, metadata !8, null}
+!31 = metadata !{i32 7, i32 0, metadata !24, null}
+!32 = metadata !{i32 13, i32 0, metadata !20, null}
+!33 = metadata !{metadata !34, metadata !34, i64 0}
+!34 = metadata !{metadata !"int", metadata !35, i64 0}
+!35 = metadata !{metadata !"omnipotent char", metadata !36, i64 0}
+!36 = metadata !{metadata !"Simple C/C++ TBAA"}
+!37 = metadata !{i32 8, i32 0, metadata !23, null} ; [ DW_TAG_imported_declaration ]
+!38 = metadata !{i32 9, i32 0, metadata !22, null}
+!39 = metadata !{i32 10, i32 0, metadata !21, null}
+!40 = metadata !{i32 12, i32 0, metadata !20, null}
+!41 = metadata !{i32* @c}
+!42 = metadata !{i32 15, i32 0, metadata !8, null}
+!43 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/generate-odr-hash.ll b/test/DebugInfo/X86/generate-odr-hash.ll
index 956d3f6..4f9cc78 100644
--- a/test/DebugInfo/X86/generate-odr-hash.ll
+++ b/test/DebugInfo/X86/generate-odr-hash.ll
@@ -103,8 +103,8 @@
 define void @_Z3foov() #0 {
 entry:
   %b = alloca %struct.baz, align 1
-  call void @llvm.dbg.declare(metadata !{%struct.baz* %b}, metadata !63), !dbg !71
-  ret void, !dbg !72
+  call void @llvm.dbg.declare(metadata !{%struct.baz* %b}, metadata !44), !dbg !46
+  ret void, !dbg !47
 }
 
 ; Function Attrs: nounwind readnone
@@ -112,8 +112,8 @@ declare void @llvm.dbg.declare(metadata, metadata) #1
 
 define internal void @__cxx_global_var_init() section ".text.startup" {
 entry:
-  call void @_ZN12_GLOBAL__N_16walrusC1Ev(%"struct.<anonymous namespace>::walrus"* @w), !dbg !73
-  ret void, !dbg !73
+  call void @_ZN12_GLOBAL__N_16walrusC1Ev(%"struct.<anonymous namespace>::walrus"* @w), !dbg !48
+  ret void, !dbg !48
 }
 
 ; Function Attrs: nounwind uwtable
@@ -121,98 +121,76 @@ define internal void @_ZN12_GLOBAL__N_16walrusC2Ev(%"struct.<anonymous namespace
 entry:
   %this.addr = alloca %"struct.<anonymous namespace>::walrus"*, align 8
   store %"struct.<anonymous namespace>::walrus"* %this, %"struct.<anonymous namespace>::walrus"** %this.addr, align 8
-  call void @llvm.dbg.declare(metadata !{%"struct.<anonymous namespace>::walrus"** %this.addr}, metadata !74), !dbg !76
+  call void @llvm.dbg.declare(metadata !{%"struct.<anonymous namespace>::walrus"** %this.addr}, metadata !49), !dbg !51
   %this1 = load %"struct.<anonymous namespace>::walrus"** %this.addr
-  ret void, !dbg !76
+  ret void, !dbg !52
 }
 
 define internal void @_GLOBAL__I_a() section ".text.startup" {
 entry:
-  call void @__cxx_global_var_init(), !dbg !77
-  ret void, !dbg !77
+  call void @__cxx_global_var_init(), !dbg !53
+  ret void, !dbg !53
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!62}
+!llvm.module.flags = !{!42, !54}
+!llvm.ident = !{!43}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 187387) (llvm/trunk 187385)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !20, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"bar.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !20, metadata !37, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"bar.cpp", metadata !"/tmp/dbginfo"}
 !2 = metadata !{i32 0}
-!3 = metadata !{metadata !4, metadata !8, metadata !9, metadata !18}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 6, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3foov, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [foo]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/bar.cpp]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"__cxx_global_var_init", metadata !"__cxx_global_var_init", metadata !"", i32 31, metadata !6, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @__cxx_global_var_init, null, null, metadata !2, i32 31} ; [ DW_TAG_subprogram ] [line 31] [local] [def] [__cxx_global_var_init]
-!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"walrus", metadata !"walrus", metadata !"_ZN12_GLOBAL__N_16walrusC2Ev", i32 27, metadata !11, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%"struct.<anonymous namespace>::walrus"*)* @_ZN12_GLOBAL__N_16walrusC2Ev, null, metadata !16, metadata !2, i32 27} ; [ DW_TAG_subprogram ] [line 27] [local] [def] [walrus]
-!10 = metadata !{i32 786489, metadata !1, null, metadata !"", i32 25} ; [ DW_TAG_namespace ] [line 25]
-!11 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{null, metadata !13}
-!13 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from walrus]
-!14 = metadata !{i32 786451, metadata !1, metadata !10, metadata !"walrus", i32 26, i64 8, i64 8, i32 0, i32 0, null, metadata !15, i32 0, null, null} ; [ DW_TAG_structure_type ] [walrus] [line 26, size 8, align 8, offset 0] [def] [from ]
-!15 = metadata !{metadata !16}
-!16 = metadata !{i32 786478, metadata !1, metadata !14, metadata !"walrus", metadata !"walrus", metadata !"", i32 27, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !17, i32 27} ; [ DW_TAG_subprogram ] [line 27] [walrus]
-!17 = metadata !{i32 786468}
-!18 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"_GLOBAL__I_a", metadata !"_GLOBAL__I_a", metadata !"", i32 27, metadata !19, i1 true, i1 true, i32 0, i32 0, null, i32 64, i1 false, void ()* @_GLOBAL__I_a, null, null, metadata !2, i32 27} ; [ DW_TAG_subprogram ] [line 27] [local] [def] [_GLOBAL__I_a]
-!19 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!20 = metadata !{metadata !21, metadata !29, metadata !43, metadata !44}
-!21 = metadata !{i32 786484, i32 0, null, metadata !"b", metadata !"b", metadata !"", metadata !5, i32 4, metadata !22, i32 0, i32 1, %struct.bar* @b, null} ; [ DW_TAG_variable ] [b] [line 4] [def]
-!22 = metadata !{i32 786451, metadata !1, null, metadata !"bar", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !23, i32 0, null, null} ; [ DW_TAG_structure_type ] [bar] [line 1, size 8, align 8, offset 0] [def] [from ]
-!23 = metadata !{metadata !24}
-!24 = metadata !{i32 786478, metadata !1, metadata !22, metadata !"bar", metadata !"bar", metadata !"", i32 1, metadata !25, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !28, i32 1} ; [ DW_TAG_subprogram ] [line 1] [bar]
-!25 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !26, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!26 = metadata !{null, metadata !27}
-!27 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from bar]
-!28 = metadata !{i32 786468}
-!29 = metadata !{i32 786484, i32 0, metadata !30, metadata !"animal", metadata !"animal", metadata !"_ZN7echidna8capybara8mongoose6animalE", metadata !5, i32 20, metadata !33, i32 0, i32 1, %"class.echidna::capybara::mongoose::fluffy"* @_ZN7echidna8capybara8mongoose6animalE, null} ; [ DW_TAG_variable ] [animal] [line 20] [def]
-!30 = metadata !{i32 786489, metadata !1, metadata !31, metadata !"mongoose", i32 14} ; [ DW_TAG_namespace ] [mongoose] [line 14]
-!31 = metadata !{i32 786489, metadata !1, metadata !32, metadata !"capybara", i32 13} ; [ DW_TAG_namespace ] [capybara] [line 13]
-!32 = metadata !{i32 786489, metadata !1, null, metadata !"echidna", i32 12} ; [ DW_TAG_namespace ] [echidna] [line 12]
-!33 = metadata !{i32 786434, metadata !1, metadata !30, metadata !"fluffy", i32 15, i64 64, i64 32, i32 0, i32 0, null, metadata !34, i32 0, null, null} ; [ DW_TAG_class_type ] [fluffy] [line 15, size 64, align 32, offset 0] [def] [from ]
-!34 = metadata !{metadata !35, metadata !37, metadata !38}
-!35 = metadata !{i32 786445, metadata !1, metadata !33, metadata !"a", i32 16, i64 32, i64 32, i64 0, i32 1, metadata !36} ; [ DW_TAG_member ] [a] [line 16, size 32, align 32, offset 0] [private] [from int]
-!36 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!37 = metadata !{i32 786445, metadata !1, metadata !33, metadata !"b", i32 17, i64 32, i64 32, i64 32, i32 1, metadata !36} ; [ DW_TAG_member ] [b] [line 17, size 32, align 32, offset 32] [private] [from int]
-!38 = metadata !{i32 786478, metadata !1, metadata !33, metadata !"fluffy", metadata !"fluffy", metadata !"", i32 15, metadata !39, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !42, i32 15} ; [ DW_TAG_subprogram ] [line 15] [fluffy]
-!39 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !40, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!40 = metadata !{null, metadata !41}
-!41 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !33} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from fluffy]
-!42 = metadata !{i32 786468}
-!43 = metadata !{i32 786484, i32 0, null, metadata !"w", metadata !"w", metadata !"", metadata !5, i32 31, metadata !14, i32 1, i32 1, %"struct.<anonymous namespace>::walrus"* @w, null} ; [ DW_TAG_variable ] [w] [line 31] [local] [def]
-!44 = metadata !{i32 786484, i32 0, null, metadata !"wom", metadata !"wom", metadata !"", metadata !5, i32 40, metadata !45, i32 0, i32 1, %struct.wombat* @wom, null} ; [ DW_TAG_variable ] [wom] [line 40] [def]
-!45 = metadata !{i32 786451, metadata !1, null, metadata !"wombat", i32 33, i64 64, i64 32, i32 0, i32 0, null, metadata !46, i32 0, null, null} ; [ DW_TAG_structure_type ] [wombat] [line 33, size 64, align 32, offset 0] [def] [from ]
-!46 = metadata !{metadata !47, metadata !57}
-!47 = metadata !{i32 786445, metadata !1, metadata !45, metadata !"a_b", i32 37, i64 64, i64 32, i64 0, i32 0, metadata !48} ; [ DW_TAG_member ] [a_b] [line 37, size 64, align 32, offset 0] [from ]
-!48 = metadata !{i32 786451, metadata !1, metadata !45, metadata !"", i32 34, i64 64, i64 32, i32 0, i32 0, null, metadata !49, i32 0, null, null} ; [ DW_TAG_structure_type ] [line 34, size 64, align 32, offset 0] [def] [from ]
-!49 = metadata !{metadata !50, metadata !51, metadata !52}
-!50 = metadata !{i32 786445, metadata !1, metadata !48, metadata !"a", i32 35, i64 32, i64 32, i64 0, i32 0, metadata !36} ; [ DW_TAG_member ] [a] [line 35, size 32, align 32, offset 0] [from int]
-!51 = metadata !{i32 786445, metadata !1, metadata !48, metadata !"b", i32 36, i64 32, i64 32, i64 32, i32 0, metadata !36} ; [ DW_TAG_member ] [b] [line 36, size 32, align 32, offset 32] [from int]
-!52 = metadata !{i32 786478, metadata !1, metadata !48, metadata !"", metadata !"", metadata !"", i32 34, metadata !53, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !56, i32 34} ; [ DW_TAG_subprogram ] [line 34]
-!53 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !54, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!54 = metadata !{null, metadata !55}
-!55 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !48} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ]
-!56 = metadata !{i32 786468}
-!57 = metadata !{i32 786478, metadata !1, metadata !45, metadata !"wombat", metadata !"wombat", metadata !"", i32 33, metadata !58, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !61, i32 33} ; [ DW_TAG_subprogram ] [line 33] [wombat]
-!58 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !59, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!59 = metadata !{null, metadata !60}
-!60 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !45} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from wombat]
-!61 = metadata !{i32 786468}
-!62 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!63 = metadata !{i32 786688, metadata !4, metadata !"b", metadata !5, i32 9, metadata !64, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 9]
-!64 = metadata !{i32 786451, metadata !1, metadata !4, metadata !"baz", i32 7, i64 8, i64 8, i32 0, i32 0, null, metadata !65, i32 0, null, null} ; [ DW_TAG_structure_type ] [baz] [line 7, size 8, align 8, offset 0] [def] [from ]
-!65 = metadata !{metadata !66}
-!66 = metadata !{i32 786478, metadata !1, metadata !64, metadata !"baz", metadata !"baz", metadata !"", i32 7, metadata !67, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !70, i32 7} ; [ DW_TAG_subprogram ] [line 7] [baz]
-!67 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !68, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!68 = metadata !{null, metadata !69}
-!69 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !64} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from baz]
-!70 = metadata !{i32 786468}
-!71 = metadata !{i32 9, i32 0, metadata !4, null}
-!72 = metadata !{i32 10, i32 0, metadata !4, null}
-!73 = metadata !{i32 31, i32 0, metadata !8, null}
-!74 = metadata !{i32 786689, metadata !9, metadata !"this", metadata !5, i32 16777243, metadata !75, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 27]
-!75 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from walrus]
-!76 = metadata !{i32 27, i32 0, metadata !9, null}
-!77 = metadata !{i32 27, i32 0, metadata !18, null}
+!3 = metadata !{metadata !4, metadata !5, metadata !13, metadata !16}
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"bar", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3bar"} ; [ DW_TAG_structure_type ] [bar] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{i32 786434, metadata !1, metadata !6, metadata !"fluffy", i32 13, i64 64, i64 32, i32 0, i32 0, null, metadata !9, i32 0, null, null, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE"} ; [ DW_TAG_class_type ] [fluffy] [line 13, size 64, align 32, offset 0] [def] [from ]
+!6 = metadata !{i32 786489, metadata !1, metadata !7, metadata !"mongoose", i32 12} ; [ DW_TAG_namespace ] [mongoose] [line 12]
+!7 = metadata !{i32 786489, metadata !1, metadata !8, metadata !"capybara", i32 11} ; [ DW_TAG_namespace ] [capybara] [line 11]
+!8 = metadata !{i32 786489, metadata !1, null, metadata !"echidna", i32 10} ; [ DW_TAG_namespace ] [echidna] [line 10]
+!9 = metadata !{metadata !10, metadata !12}
+!10 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE", metadata !"a", i32 14, i64 32, i64 32, i64 0, i32 1, metadata !11} ; [ DW_TAG_member ] [a] [line 14, size 32, align 32, offset 0] [private] [from int]
+!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!12 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN7echidna8capybara8mongoose6fluffyE", metadata !"b", i32 15, i64 32, i64 32, i64 32, i32 1, metadata !11} ; [ DW_TAG_member ] [b] [line 15, size 32, align 32, offset 32] [private] [from int]
+!13 = metadata !{i32 786451, metadata !1, null, metadata !"wombat", i32 31, i64 64, i64 32, i32 0, i32 0, null, metadata !14, i32 0, null, null, metadata !"_ZTS6wombat"} ; [ DW_TAG_structure_type ] [wombat] [line 31, size 64, align 32, offset 0] [def] [from ]
+!14 = metadata !{metadata !15}
+!15 = metadata !{i32 786445, metadata !1, metadata !"_ZTS6wombat", metadata !"a_b", i32 35, i64 64, i64 32, i64 0, i32 0, metadata !"_ZTSN6wombatUt_E"} ; [ DW_TAG_member ] [a_b] [line 35, size 64, align 32, offset 0] [from _ZTSN6wombatUt_E]
+!16 = metadata !{i32 786451, metadata !1, metadata !"_ZTS6wombat", metadata !"", i32 32, i64 64, i64 32, i32 0, i32 0, null, metadata !17, i32 0, null, null, metadata !"_ZTSN6wombatUt_E"} ; [ DW_TAG_structure_type ] [line 32, size 64, align 32, offset 0] [def] [from ]
+!17 = metadata !{metadata !18, metadata !19}
+!18 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN6wombatUt_E", metadata !"a", i32 33, i64 32, i64 32, i64 0, i32 0, metadata !11} ; [ DW_TAG_member ] [a] [line 33, size 32, align 32, offset 0] [from int]
+!19 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN6wombatUt_E", metadata !"b", i32 34, i64 32, i64 32, i64 32, i32 0, metadata !11} ; [ DW_TAG_member ] [b] [line 34, size 32, align 32, offset 32] [from int]
+!20 = metadata !{metadata !21, metadata !25, metadata !26, metadata !35}
+!21 = metadata !{i32 786478, metadata !1, metadata !22, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 5, metadata !23, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3foov, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
+!22 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/dbginfo/bar.cpp]
+!23 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !24, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{null}
+!25 = metadata !{i32 786478, metadata !1, metadata !22, metadata !"__cxx_global_var_init", metadata !"__cxx_global_var_init", metadata !"", i32 29, metadata !23, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @__cxx_global_var_init, null, null, metadata !2, i32 29} ; [ DW_TAG_subprogram ] [line 29] [local] [def] [__cxx_global_var_init]
+!26 = metadata !{i32 786478, metadata !1, metadata !27, metadata !"walrus", metadata !"walrus", metadata !"_ZN12_GLOBAL__N_16walrusC2Ev", i32 25, metadata !31, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%"struct.<anonymous namespace>::walrus"*)* @_ZN12_GLOBAL__N_16walrusC2Ev, null, metadata !30, metadata !2, i32 25} ; [ DW_TAG_subprogram ] [line 25] [local] [def] [walrus]
+!27 = metadata !{i32 786451, metadata !1, metadata !28, metadata !"walrus", i32 24, i64 8, i64 8, i32 0, i32 0, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [walrus] [line 24, size 8, align 8, offset 0] [def] [from ]
+!28 = metadata !{i32 786489, metadata !1, null, metadata !"", i32 23} ; [ DW_TAG_namespace ] [line 23]
+!29 = metadata !{metadata !30}
+!30 = metadata !{i32 786478, metadata !1, metadata !27, metadata !"walrus", metadata !"walrus", metadata !"", i32 25, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !34, i32 25} ; [ DW_TAG_subprogram ] [line 25] [walrus]
+!31 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !32, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!32 = metadata !{null, metadata !33}
+!33 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !27} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from walrus]
+!34 = metadata !{i32 786468}
+!35 = metadata !{i32 786478, metadata !1, metadata !22, metadata !"", metadata !"", metadata !"_GLOBAL__I_a", i32 25, metadata !36, i1 true, i1 true, i32 0, i32 0, null, i32 64, i1 false, void ()* @_GLOBAL__I_a, null, null, metadata !2, i32 25} ; [ DW_TAG_subprogram ] [line 25] [local] [def]
+!36 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!37 = metadata !{metadata !38, metadata !39, metadata !40, metadata !41}
+!38 = metadata !{i32 786484, i32 0, null, metadata !"b", metadata !"b", metadata !"", metadata !22, i32 3, metadata !4, i32 0, i32 1, %struct.bar* @b, null} ; [ DW_TAG_variable ] [b] [line 3] [def]
+!39 = metadata !{i32 786484, i32 0, metadata !6, metadata !"animal", metadata !"animal", metadata !"_ZN7echidna8capybara8mongoose6animalE", metadata !22, i32 18, metadata !5, i32 0, i32 1, %"class.echidna::capybara::mongoose::fluffy"* @_ZN7echidna8capybara8mongoose6animalE, null} ; [ DW_TAG_variable ] [animal] [line 18] [def]
+!40 = metadata !{i32 786484, i32 0, null, metadata !"w", metadata !"w", metadata !"", metadata !22, i32 29, metadata !27, i32 1, i32 1, %"struct.<anonymous namespace>::walrus"* @w, null} ; [ DW_TAG_variable ] [w] [line 29] [local] [def]
+!41 = metadata !{i32 786484, i32 0, null, metadata !"wom", metadata !"wom", metadata !"", metadata !22, i32 38, metadata !13, i32 0, i32 1, %struct.wombat* @wom, null} ; [ DW_TAG_variable ] [wom] [line 38] [def]
+!42 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!43 = metadata !{metadata !"clang version 3.4 "}
+!44 = metadata !{i32 786688, metadata !21, metadata !"b", metadata !22, i32 7, metadata !45, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 7]
+!45 = metadata !{i32 786451, metadata !1, metadata !21, metadata !"baz", i32 6, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [baz] [line 6, size 8, align 8, offset 0] [def] [from ]
+!46 = metadata !{i32 7, i32 0, metadata !21, null}
+!47 = metadata !{i32 8, i32 0, metadata !21, null} ; [ DW_TAG_imported_declaration ]
+!48 = metadata !{i32 29, i32 0, metadata !25, null}
+!49 = metadata !{i32 786689, metadata !26, metadata !"this", null, i32 16777216, metadata !50, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!50 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !27} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from walrus]
+!51 = metadata !{i32 0, i32 0, metadata !26, null}
+!52 = metadata !{i32 25, i32 0, metadata !26, null}
+!53 = metadata !{i32 25, i32 0, metadata !35, null}
+!54 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/gnu-public-names-empty.ll b/test/DebugInfo/X86/gnu-public-names-empty.ll
new file mode 100644
index 0000000..8b0309c
--- /dev/null
+++ b/test/DebugInfo/X86/gnu-public-names-empty.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -generate-gnu-dwarf-pub-sections -filetype=obj < %s | llvm-dwarfdump - | FileCheck %s
+
+; Generated from:
+
+; static int a __attribute__((section("a")));
+
+; Check that the attributes in the compile unit both point to a correct
+; location, even when nothing is exported.
+; CHECK: DW_AT_GNU_pubnames [DW_FORM_sec_offset]   (0x00000000)
+; CHECK: DW_AT_GNU_pubtypes [DW_FORM_sec_offset]   (0x00000000)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 191846) (llvm/trunk 191866)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
+!2 = metadata !{i32 0}
+!3 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!4 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/gnu-public-names.ll b/test/DebugInfo/X86/gnu-public-names.ll
new file mode 100644
index 0000000..7ad5032
--- /dev/null
+++ b/test/DebugInfo/X86/gnu-public-names.ll
@@ -0,0 +1,219 @@
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -generate-gnu-dwarf-pub-sections < %s | FileCheck -check-prefix=ASM %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -generate-gnu-dwarf-pub-sections -filetype=obj < %s | llvm-dwarfdump - | FileCheck %s
+; ModuleID = 'dwarf-public-names.cpp'
+;
+; Generated from:
+;
+; struct C {
+;   void member_function();
+;   static int static_member_function();
+;   static int static_member_variable;
+; };
+;
+; int C::static_member_variable = 0;
+;
+; void C::member_function() {
+;   static_member_variable = 0;
+; }
+;
+; int C::static_member_function() {
+;   return static_member_variable;
+; }
+;
+; C global_variable;
+;
+; int global_function() {
+;   return -1;
+; }
+;
+; namespace ns {
+;   void global_namespace_function() {
+;     global_variable.member_function();
+;   }
+;   int global_namespace_variable = 1;
+;   struct D {
+;     int A;
+;   } d;
+; }
+
+; ASM: .section        .debug_gnu_pubnames
+; ASM: .byte   32                      # Kind: VARIABLE, EXTERNAL
+; ASM-NEXT: .asciz  "global_variable"       # External Name
+
+; ASM: .section        .debug_gnu_pubtypes
+; ASM: .byte   16                      # Kind: TYPE, EXTERNAL
+; ASM-NEXT: .asciz  "C"                     # External Name
+
+; CHECK: .debug_info contents:
+; CHECK: DW_AT_GNU_pubnames [DW_FORM_sec_offset]   (0x00000000)
+; CHECK: DW_AT_GNU_pubtypes [DW_FORM_sec_offset]   (0x00000000)
+
+; CHECK: [[C:[0-9a-f]+]]: DW_TAG_structure_type
+; CHECK-NEXT: DW_AT_name {{.*}} "C"
+
+; CHECK: [[STATIC_MEM_DECL:[0-9a-f]+]]: DW_TAG_member
+; CHECK-NEXT: DW_AT_name {{.*}} "static_member_variable"
+
+; CHECK: [[MEM_FUNC_DECL:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK-NEXT: DW_AT_MIPS_linkage_name
+; CHECK-NEXT: DW_AT_name {{.*}} "member_function"
+
+; CHECK: [[STATIC_MEM_FUNC_DECL:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK-NEXT: DW_AT_MIPS_linkage_name
+; CHECK-NEXT: DW_AT_name {{.*}} "static_member_function"
+
+; CHECK: [[INT:[0-9a-f]+]]: DW_TAG_base_type
+; CHECK-NEXT: DW_AT_name {{.*}} "int"
+
+; CHECK: [[STATIC_MEM_VAR:[0-9a-f]+]]: DW_TAG_variable
+; CHECK-NEXT: DW_AT_specification {{.*}}[[STATIC_MEM_DECL]]
+
+; CHECK: [[GLOB_VAR:[0-9a-f]+]]: DW_TAG_variable
+; CHECK-NEXT: DW_AT_name {{.*}} "global_variable"
+
+; CHECK: [[NS:[0-9a-f]+]]: DW_TAG_namespace
+; CHECK-NEXT: DW_AT_name {{.*}} "ns"
+
+; CHECK: [[GLOB_NS_VAR_DECL:[0-9a-f]+]]: DW_TAG_variable
+; CHECK-NEXT: DW_AT_name {{.*}} "global_namespace_variable"
+
+; CHECK: [[D_VAR_DECL:[0-9a-f]+]]: DW_TAG_variable
+; CHECK-NEXT: DW_AT_name {{.*}} "d"
+
+; CHECK: [[D:[0-9a-f]+]]: DW_TAG_structure_type
+; CHECK-NEXT: DW_AT_name {{.*}} "D"
+
+; CHECK: [[GLOB_NS_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK-NEXT: DW_AT_MIPS_linkage_name
+; CHECK-NEXT: DW_AT_name {{.*}} "global_namespace_function"
+
+; CHECK: [[GLOB_NS_VAR:[0-9a-f]+]]: DW_TAG_variable
+; CHECK-NEXT: DW_AT_specification {{.*}}[[GLOB_NS_VAR_DECL]]
+
+; CHECK: [[D_VAR:[0-9a-f]+]]: DW_TAG_variable
+; CHECK-NEXT: DW_AT_specification {{.*}}[[D_VAR_DECL]]
+
+; CHECK: [[MEM_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK-NEXT: DW_AT_specification {{.*}}[[MEM_FUNC_DECL]]
+
+; CHECK: [[STATIC_MEM_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK-NEXT: DW_AT_specification {{.*}}[[STATIC_MEM_FUNC_DECL]]
+
+; CHECK: [[GLOBAL_FUNC:[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK-NEXT: DW_AT_MIPS_linkage_name
+; CHECK-NEXT: DW_AT_name {{.*}} "global_function"
+
+; CHECK-LABEL: .debug_gnu_pubnames contents:
+; CHECK-NEXT: length = 0x000000e7 version = 0x0002 unit_offset = 0x00000000 unit_size = 0x0000017b
+; CHECK-NEXT: Offset     Linkage  Kind     Name
+; CHECK-DAG:  [[GLOBAL_FUNC]] EXTERNAL FUNCTION "global_function"
+; CHECK-DAG:  [[NS]] EXTERNAL TYPE     "ns"
+; CHECK-DAG:  [[MEM_FUNC]] EXTERNAL FUNCTION "C::member_function"
+; CHECK-DAG:  [[GLOB_VAR]] EXTERNAL VARIABLE "global_variable"
+; CHECK-DAG:  [[GLOB_NS_VAR]] EXTERNAL VARIABLE "ns::global_namespace_variable"
+; CHECK-DAG:  [[GLOB_NS_FUNC]] EXTERNAL FUNCTION "ns::global_namespace_function"
+; CHECK-DAG:  [[D_VAR]] EXTERNAL VARIABLE "ns::d"
+; CHECK-DAG:  [[STATIC_MEM_VAR]] EXTERNAL VARIABLE "C::static_member_variable"
+; CHECK-DAG:  [[STATIC_MEM_FUNC]] EXTERNAL FUNCTION "C::static_member_function"
+
+
+; CHECK-LABEL: debug_gnu_pubtypes contents:
+; CHECK: Offset     Linkage  Kind     Name
+; CHECK-DAG:  [[C]] EXTERNAL TYPE     "C"
+; CHECK-DAG:  [[D]] EXTERNAL TYPE     "ns::D"
+; CHECK-DAG:  [[INT]] STATIC   TYPE     "int"
+
+%struct.C = type { i8 }
+%"struct.ns::D" = type { i32 }
+
+@_ZN1C22static_member_variableE = global i32 0, align 4
+@global_variable = global %struct.C zeroinitializer, align 1
+@_ZN2ns25global_namespace_variableE = global i32 1, align 4
+@_ZN2ns1dE = global %"struct.ns::D" zeroinitializer, align 4
+
+; Function Attrs: nounwind uwtable
+define void @_ZN1C15member_functionEv(%struct.C* %this) #0 align 2 {
+entry:
+  %this.addr = alloca %struct.C*, align 8
+  store %struct.C* %this, %struct.C** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !36), !dbg !38
+  %this1 = load %struct.C** %this.addr
+  store i32 0, i32* @_ZN1C22static_member_variableE, align 4, !dbg !39
+  ret void, !dbg !39
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+; Function Attrs: nounwind uwtable
+define i32 @_ZN1C22static_member_functionEv() #0 align 2 {
+entry:
+  %0 = load i32* @_ZN1C22static_member_variableE, align 4, !dbg !40
+  ret i32 %0, !dbg !40
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @_Z15global_functionv() #0 {
+entry:
+  ret i32 -1, !dbg !41
+}
+
+; Function Attrs: nounwind uwtable
+define void @_ZN2ns25global_namespace_functionEv() #0 {
+entry:
+  call void @_ZN1C15member_functionEv(%struct.C* @global_variable), !dbg !42
+  ret void, !dbg !42
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!34, !43}
+!llvm.ident = !{!35}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 192862) (llvm/trunk 192861)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !21, metadata !29, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/pubnames.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"pubnames.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4, metadata !17}
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !6, metadata !8, metadata !13}
+!6 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1C", metadata !"static_member_variable", i32 4, i64 0, i64 0, i64 0, i32 4096, metadata !7, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
+!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 2, metadata !9, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !12, i32 2} ; [ DW_TAG_subprogram ] [line 2] [member_function]
+!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{null, metadata !11}
+!11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!12 = metadata !{i32 786468}
+!13 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 3, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !16, i32 3} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
+!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{metadata !7}
+!16 = metadata !{i32 786468}
+!17 = metadata !{i32 786451, metadata !1, metadata !18, metadata !"D", i32 21, i64 32, i64 32, i32 0, i32 0, null, metadata !19, i32 0, null, null, metadata !"_ZTSN2ns1DE"} ; [ DW_TAG_structure_type ] [D] [line 21, size 32, align 32, offset 0] [def] [from ]
+!18 = metadata !{i32 786489, metadata !1, null, metadata !"ns", i32 17} ; [ DW_TAG_namespace ] [ns] [line 17]
+!19 = metadata !{metadata !20}
+!20 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN2ns1DE", metadata !"A", i32 22, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ] [A] [line 22, size 32, align 32, offset 0] [from int]
+!21 = metadata !{metadata !22, metadata !23, metadata !24, metadata !26}
+!22 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 9, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !8, metadata !2, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
+!23 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !13, metadata !2, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [static_member_function]
+!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", i32 15, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !2, i32 15} ; [ DW_TAG_subprogram ] [line 15] [def] [global_function]
+!25 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/pubnames.cpp]
+!26 = metadata !{i32 786478, metadata !1, metadata !18, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", i32 18, metadata !27, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !2, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [global_namespace_function]
+!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{null}
+!29 = metadata !{metadata !30, metadata !31, metadata !32, metadata !33}
+!30 = metadata !{i32 786484, i32 0, metadata !4, metadata !"static_member_variable", metadata !"static_member_variable", metadata !"_ZN1C22static_member_variableE", metadata !25, i32 7, metadata !7, i32 0, i32 1, i32* @_ZN1C22static_member_variableE, metadata !6} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
+!31 = metadata !{i32 786484, i32 0, null, metadata !"global_variable", metadata !"global_variable", metadata !"", metadata !25, i32 13, metadata !4, i32 0, i32 1, %struct.C* @global_variable, null} ; [ DW_TAG_variable ] [global_variable] [line 13] [def]
+!32 = metadata !{i32 786484, i32 0, metadata !18, metadata !"global_namespace_variable", metadata !"global_namespace_variable", metadata !"_ZN2ns25global_namespace_variableE", metadata !25, i32 19, metadata !7, i32 0, i32 1, i32* @_ZN2ns25global_namespace_variableE, null} ; [ DW_TAG_variable ] [global_namespace_variable] [line 19] [def]
+!33 = metadata !{i32 786484, i32 0, metadata !18, metadata !"d", metadata !"d", metadata !"_ZN2ns1dE", metadata !25, i32 23, metadata !17, i32 0, i32 1, %"struct.ns::D"* @_ZN2ns1dE, null} ; [ DW_TAG_variable ] [d] [line 23] [def]
+!34 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!35 = metadata !{metadata !"clang version 3.4 (trunk 192862) (llvm/trunk 192861)"}
+!36 = metadata !{i32 786689, metadata !22, metadata !"this", null, i32 16777216, metadata !37, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!37 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!38 = metadata !{i32 0, i32 0, metadata !22, null}
+!39 = metadata !{i32 9, i32 0, metadata !22, null}
+!40 = metadata !{i32 11, i32 0, metadata !23, null}
+!41 = metadata !{i32 15, i32 0, metadata !24, null}
+!42 = metadata !{i32 18, i32 0, metadata !26, null}
+
+!43 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/instcombine-instrinsics.ll b/test/DebugInfo/X86/instcombine-instrinsics.ll
index 886b0eb..41dd09f 100644
--- a/test/DebugInfo/X86/instcombine-instrinsics.ll
+++ b/test/DebugInfo/X86/instcombine-instrinsics.ll
@@ -60,6 +60,7 @@ declare i32 @put(i64, i64*, i64, %struct.i24*) nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!73}
 
 !0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.3 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !48, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"i1", metadata !""}
@@ -67,10 +68,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !3 = metadata !{metadata !4, metadata !21, metadata !33, metadata !47}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"i2", metadata !"i2", metadata !"", i32 31, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, %struct.i3* (i64)* @barz, null, null, metadata !16, i32 32} ; [ DW_TAG_subprogram ] [line 31]  [scope 32]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !13}
 !8 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from i3]
-!9 = metadata !{i32 786451, metadata !1, null, metadata !"i3", i32 25, i64 32, i64 32, i32 0, i32 0, null, metadata !10, i32 0, null, null} ; [ DW_TAG_structure_type ]  [line 25, size 32, align 32, offset 0] [from ]
+!9 = metadata !{i32 786451, metadata !1, null, metadata !"i3", i32 25, i64 32, i64 32, i32 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [i3] [line 25, size 32, align 32, offset 0] [def] [from ]
 !10 = metadata !{metadata !11}
 !11 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"i4", i32 26, i64 32, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ]  [line 26, size 32, align 32, offset 0] [from i5]
 !12 = metadata !{i32 786468, null, null, metadata !"i5", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]  [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
@@ -79,12 +80,12 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !15 = metadata !{i32 786468, null, null, metadata !"i8", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]  [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
 !16 = metadata !{}
 !21 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"i13", metadata !"i13", metadata !"", i32 42, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @init, null, null, metadata !24, i32 43} ; [ DW_TAG_subprogram ] [line 42]  [scope 43]
-!22 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !34, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !23 = metadata !{null}
 !24 = metadata !{metadata !25}
 !25 = metadata !{i32 786688, metadata !21, metadata !"i14", metadata !5, i32 45, metadata !27, i32 0, i32 0} ; [ DW_TAG_auto_variable ]  [line 45]
 !27 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from i14]
-!28 = metadata !{i32 786451, metadata !1, null, metadata !"i14", i32 16, i64 32, i64 32, i32 0, i32 0, null, metadata !29, i32 0, null, null} ; [ DW_TAG_structure_type ]  [line 16, size 32, align 32, offset 0] [from ]
+!28 = metadata !{i32 786451, metadata !1, null, metadata !"i14", i32 16, i64 32, i64 32, i32 0, i32 0, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [i14] [line 16, size 32, align 32, offset 0] [def] [from ]
 !29 = metadata !{metadata !30}
 !30 = metadata !{i32 786445, metadata !1, metadata !28, metadata !"i16", i32 17, i64 32, i64 32, i64 0, i32 0, metadata !31} ; [ DW_TAG_member ]  [line 17, size 32, align 32, offset 0] [from i17]
 !31 = metadata !{i32 786454, metadata !1, null, metadata !"i17", i32 7, i64 0, i64 0, i64 0, i32 0, metadata !32} ; [ DW_TAG_typedef ]  [line 7, size 0, align 0, offset 0] [from int]
@@ -98,3 +99,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !52 = metadata !{i64 0}
 !55 = metadata !{%struct.i3* null}
 !72 = metadata !{%struct.i24* null}
+!73 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/line-info.ll b/test/DebugInfo/X86/line-info.ll
index fd813b3..46daccf 100644
--- a/test/DebugInfo/X86/line-info.ll
+++ b/test/DebugInfo/X86/line-info.ll
@@ -36,6 +36,7 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!19}
 
 !0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2,  metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/list0.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"list0.c", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
@@ -44,15 +45,16 @@ attributes #1 = { nounwind readnone }
 !4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
 !5 = metadata !{metadata !"./list0.h", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
 !6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/./list0.h]
-!7 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
 !11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/list0.c]
-!12 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{metadata !9}
 !14 = metadata !{i32 786689, metadata !4, metadata !"x", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [x] [line 1]
 !15 = metadata !{i32 1, i32 0, metadata !4, null}
 !16 = metadata !{i32 2, i32 0, metadata !4, null}
 !17 = metadata !{i32 3, i32 0, metadata !18, null}
 !18 = metadata !{i32 786443, metadata !11, metadata !10} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/scratch/list0.c]
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/linkage-name.ll b/test/DebugInfo/X86/linkage-name.ll
index 1ed7b18..3d11667 100644
--- a/test/DebugInfo/X86/linkage-name.ll
+++ b/test/DebugInfo/X86/linkage-name.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-macosx -darwin-gdb-compat=Disable %s -o %t -filetype=obj
+; RUN: llc -mtriple=x86_64-macosx %s -o %t -filetype=obj
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
 
 ; CHECK: DW_TAG_subprogram [9] *
@@ -25,17 +25,18 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!29}
 
 !0 = metadata !{i32 786449, metadata !28, i32 4, metadata !"clang version 3.1 (trunk 152691) (llvm/trunk 152692)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !18,  metadata !18, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !6, null, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%class.A*, i32)* @_ZN1A1aEi, null, metadata !13, metadata !16, i32 5} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10, metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786434, metadata !28, null, metadata !"A", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !12, i32 0, null, null} ; [ DW_TAG_class_type ]
+!10 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{i32 786434, metadata !28, null, metadata !"A", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 8, align 8, offset 0] [def] [from ]
 !12 = metadata !{metadata !13}
 !13 = metadata !{i32 786478, metadata !6, metadata !11, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 257, i1 false, null, null, i32 0, metadata !14, i32 0} ; [ DW_TAG_subprogram ]
 !14 = metadata !{metadata !15}
@@ -52,3 +53,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !26 = metadata !{i32 6, i32 4, metadata !27, null}
 !27 = metadata !{i32 786443, metadata !6, metadata !5, i32 5, i32 17, i32 0} ; [ DW_TAG_lexical_block ]
 !28 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo"}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/lit.local.cfg b/test/DebugInfo/X86/lit.local.cfg
index 60d66ea..19840aa 100644
--- a/test/DebugInfo/X86/lit.local.cfg
+++ b/test/DebugInfo/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/DebugInfo/X86/low-pc-cu.ll b/test/DebugInfo/X86/low-pc-cu.ll
index f7e1aae..922ae8d 100644
--- a/test/DebugInfo/X86/low-pc-cu.ll
+++ b/test/DebugInfo/X86/low-pc-cu.ll
@@ -13,13 +13,14 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!16}
 
 !0 = metadata !{i32 786449, metadata !15, i32 4, metadata !"clang version 3.1 (trunk 153454) (llvm/trunk 153471)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !12}
-!5 = metadata !{i32 786478, metadata !6, i32 0, metadata !"q", metadata !"q", metadata !"_Z1qv", i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z1qv, null, null, metadata !10, i32 0} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 786478, metadata !6, null, metadata !"q", metadata !"q", metadata !"_Z1qv", i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z1qv, null, null, metadata !10, i32 0} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [q]
 !6 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, metadata !15, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{metadata !11}
@@ -28,3 +29,4 @@ entry:
 !13 = metadata !{i32 7, i32 1, metadata !14, null}
 !14 = metadata !{i32 786443, metadata !5, i32 5, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo/tmp"}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/misched-dbg-value.ll b/test/DebugInfo/X86/misched-dbg-value.ll
index 9aa362f..cfb0667 100644
--- a/test/DebugInfo/X86/misched-dbg-value.ll
+++ b/test/DebugInfo/X86/misched-dbg-value.ll
@@ -88,10 +88,11 @@ attributes #0 = { nounwind optsize ssp uwtable }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!83}
 
 !0 = metadata !{i32 786449, metadata !82, i32 12, metadata !"clang version 3.3 (trunk 175015)", i1 true, metadata !"", i32 0, metadata !1, metadata !10, metadata !11, metadata !29,  metadata !29, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/rdar_13183203/test2/dry.c] [DW_LANG_C99]
 !1 = metadata !{metadata !2}
-!2 = metadata !{i32 786436, metadata !82, null, metadata !"", i32 128, i64 32, i64 32, i32 0, i32 0, null, metadata !4, i32 0, i32 0} ; [ DW_TAG_enumeration_type ] [line 128, size 32, align 32, offset 0] [from ]
+!2 = metadata !{i32 786436, metadata !82, null, metadata !"", i32 128, i64 32, i64 32, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 128, size 32, align 32, offset 0] [def] [from ]
 !3 = metadata !{i32 786473, metadata !82} ; [ DW_TAG_file_type ]
 !4 = metadata !{metadata !5, metadata !6, metadata !7, metadata !8, metadata !9}
 !5 = metadata !{i32 786472, metadata !"Ident1", i64 0} ; [ DW_TAG_enumerator ] [Ident1 :: 0]
@@ -102,12 +103,12 @@ attributes #1 = { nounwind readnone }
 !10 = metadata !{i32 0}
 !11 = metadata !{metadata !12}
 !12 = metadata !{i32 786478, metadata !82, metadata !3, metadata !"Proc8", metadata !"Proc8", metadata !"", i32 180, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, void (i32*, [51 x i32]*, i32, i32)* @Proc8, null, null, metadata !22, i32 185} ; [ DW_TAG_subprogram ] [line 180] [def] [scope 185] [Proc8]
-!13 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{null, metadata !15, metadata !17, metadata !21, metadata !21}
 !15 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
 !16 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !17 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !18} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!18 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1632, i64 32, i32 0, i32 0, metadata !16, metadata !19, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 1632, align 32, offset 0] [from int]
+!18 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1632, i64 32, i32 0, i32 0, metadata !16, metadata !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1632, align 32, offset 0] [from int]
 !19 = metadata !{metadata !20}
 !20 = metadata !{i32 786465, i64 0, i64 51}       ; [ DW_TAG_subrange_type ] [0, 50]
 !21 = metadata !{i32 786454, metadata !82, null, metadata !"OneToFifty", i32 132, i64 0, i64 0, i64 0, i32 0, metadata !16} ; [ DW_TAG_typedef ] [OneToFifty] [line 132, size 0, align 0, offset 0] [from int]
@@ -120,7 +121,7 @@ attributes #1 = { nounwind readnone }
 !28 = metadata !{i32 786688, metadata !12, metadata !"IntIndex", metadata !3, i32 187, metadata !21, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [IntIndex] [line 187]
 !29 = metadata !{metadata !30, metadata !35, metadata !36, metadata !38, metadata !39, metadata !40, metadata !42, metadata !46, metadata !63}
 !30 = metadata !{i32 786484, i32 0, null, metadata !"Version", metadata !"Version", metadata !"", metadata !3, i32 111, metadata !31, i32 0, i32 1, [4 x i8]* @Version, null} ; [ DW_TAG_variable ] [Version] [line 111] [def]
-!31 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 8, i32 0, i32 0, metadata !32, metadata !33, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
+!31 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 32, i64 8, i32 0, i32 0, metadata !32, metadata !33, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char]
 !32 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !33 = metadata !{metadata !34}
 !34 = metadata !{i32 786465, i64 0, i64 4}        ; [ DW_TAG_subrange_type ] [0, 3]
@@ -133,13 +134,13 @@ attributes #1 = { nounwind readnone }
 !41 = metadata !{i32 786454, metadata !82, null, metadata !"Array1Dim", i32 135, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_typedef ] [Array1Dim] [line 135, size 0, align 0, offset 0] [from ]
 !42 = metadata !{i32 786484, i32 0, null, metadata !"Array2Glob", metadata !"Array2Glob", metadata !"", metadata !3, i32 176, metadata !43, i32 0, i32 1, [51 x [51 x i32]]* @Array2Glob, null} ; [ DW_TAG_variable ] [Array2Glob] [line 176] [def]
 !43 = metadata !{i32 786454, metadata !82, null, metadata !"Array2Dim", i32 136, i64 0, i64 0, i64 0, i32 0, metadata !44} ; [ DW_TAG_typedef ] [Array2Dim] [line 136, size 0, align 0, offset 0] [from ]
-!44 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 83232, i64 32, i32 0, i32 0, metadata !16, metadata !45, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 83232, align 32, offset 0] [from int]
+!44 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 83232, i64 32, i32 0, i32 0, metadata !16, metadata !45, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 83232, align 32, offset 0] [from int]
 !45 = metadata !{metadata !20, metadata !20}
 !46 = metadata !{i32 786484, i32 0, null, metadata !"PtrGlb", metadata !"PtrGlb", metadata !"", metadata !3, i32 177, metadata !47, i32 0, i32 1, %struct.Record** @PtrGlb, null} ; [ DW_TAG_variable ] [PtrGlb] [line 177] [def]
 !47 = metadata !{i32 786454, metadata !82, null, metadata !"RecordPtr", i32 148, i64 0, i64 0, i64 0, i32 0, metadata !48} ; [ DW_TAG_typedef ] [RecordPtr] [line 148, size 0, align 0, offset 0] [from ]
 !48 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !49} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from RecordType]
 !49 = metadata !{i32 786454, metadata !82, null, metadata !"RecordType", i32 147, i64 0, i64 0, i64 0, i32 0, metadata !50} ; [ DW_TAG_typedef ] [RecordType] [line 147, size 0, align 0, offset 0] [from Record]
-!50 = metadata !{i32 786451, metadata !82, null, metadata !"Record", i32 138, i64 448, i64 64, i32 0, i32 0, null, metadata !51, i32 0, i32 0, i32 0} ; [ DW_TAG_structure_type ] [Record] [line 138, size 448, align 64, offset 0] [from ]
+!50 = metadata !{i32 786451, metadata !82, null, metadata !"Record", i32 138, i64 448, i64 64, i32 0, i32 0, null, metadata !51, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [Record] [line 138, size 448, align 64, offset 0] [def] [from ]
 !51 = metadata !{metadata !52, metadata !54, metadata !56, metadata !57, metadata !58}
 !52 = metadata !{i32 786445, metadata !82, metadata !50, metadata !"PtrComp", i32 140, i64 64, i64 64, i64 0, i32 0, metadata !53} ; [ DW_TAG_member ] [PtrComp] [line 140, size 64, align 64, offset 0] [from ]
 !53 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !50} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Record]
@@ -149,7 +150,7 @@ attributes #1 = { nounwind readnone }
 !57 = metadata !{i32 786445, metadata !82, metadata !50, metadata !"IntComp", i32 143, i64 32, i64 32, i64 128, i32 0, metadata !21} ; [ DW_TAG_member ] [IntComp] [line 143, size 32, align 32, offset 128] [from OneToFifty]
 !58 = metadata !{i32 786445, metadata !82, metadata !50, metadata !"StringComp", i32 144, i64 248, i64 8, i64 160, i32 0, metadata !59} ; [ DW_TAG_member ] [StringComp] [line 144, size 248, align 8, offset 160] [from String30]
 !59 = metadata !{i32 786454, metadata !82, null, metadata !"String30", i32 134, i64 0, i64 0, i64 0, i32 0, metadata !60} ; [ DW_TAG_typedef ] [String30] [line 134, size 0, align 0, offset 0] [from ]
-!60 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 248, i64 8, i32 0, i32 0, metadata !32, metadata !61, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 248, align 8, offset 0] [from char]
+!60 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 248, i64 8, i32 0, i32 0, metadata !32, metadata !61, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 248, align 8, offset 0] [from char]
 !61 = metadata !{metadata !62}
 !62 = metadata !{i32 786465, i64 0, i64 31}       ; [ DW_TAG_subrange_type ] [0, 30]
 !63 = metadata !{i32 786484, i32 0, null, metadata !"PtrGlbNext", metadata !"PtrGlbNext", metadata !"", metadata !3, i32 178, metadata !47, i32 0, i32 1, %struct.Record** @PtrGlbNext, null} ; [ DW_TAG_variable ] [PtrGlbNext] [line 178] [def]
@@ -169,3 +170,4 @@ attributes #1 = { nounwind readnone }
 !80 = metadata !{i32 197, i32 0, metadata !12, null}
 !81 = metadata !{i32 198, i32 0, metadata !12, null}
 !82 = metadata !{metadata !"dry.c", metadata !"/Users/manmanren/test-Nov/rdar_13183203/test2"}
+!83 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/multiple-aranges.ll b/test/DebugInfo/X86/multiple-aranges.ll
new file mode 100644
index 0000000..4c205d8
--- /dev/null
+++ b/test/DebugInfo/X86/multiple-aranges.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s | FileCheck %s
+
+; First CU
+; CHECK:      .long   44                      # Length of ARange Set
+; CHECK-NEXT: .short  2                       # DWARF Arange version number
+; CHECK-NEXT: .long   .L.debug_info_begin0    # Offset Into Debug Info Section
+; CHECK-NEXT: .byte   8                       # Address Size (in bytes)
+; CHECK-NEXT: .byte   0                       # Segment Size (in bytes)
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .quad   kittens
+; CHECK-NEXT: .Lset0 = rainbows-kittens
+; CHECK-NEXT: .quad   .Lset0
+; CHECK-NEXT: .quad   0                       # ARange terminator
+; CHECK-NEXT: .quad   0
+
+; Second CU
+; CHECK-NEXT: .long   44                      # Length of ARange Set
+; CHECK-NEXT: .short  2                       # DWARF Arange version number
+; CHECK-NEXT: .long   .L.debug_info_begin1    # Offset Into Debug Info Section
+; CHECK-NEXT: .byte   8                       # Address Size (in bytes)
+; CHECK-NEXT: .byte   0                       # Segment Size (in bytes)
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .byte   255
+; CHECK-NEXT: .quad   rainbows
+; CHECK-NEXT: .Lset1 = .Ldebug_end0-rainbows
+; CHECK-NEXT: .quad   .Lset1
+; CHECK-NEXT: .quad   0                       # ARange terminator
+; CHECK-NEXT: .quad   0
+
+
+; Generated from: clang -c -g -emit-llvm
+;                 llvm-link test1.bc test2.bc -o test.bc
+; test1.c: int kittens = 4;
+; test2.c: int rainbows = 5;
+
+
+
+
+; ModuleID = 'test.bc'
+target triple = "x86_64-unknown-linux-gnu"
+
+@kittens = global i32 4, align 4
+@rainbows = global i32 5, align 4
+
+!llvm.dbg.cu = !{!0, !7}
+!llvm.module.flags = !{!12, !13}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/kayamon/test1.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"test1.c", metadata !"/home/kayamon"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786484, i32 0, null, metadata !"kittens", metadata !"kittens", metadata !"", metadata !5, i32 1, metadata !6, i32 0, i32 1, i32* @kittens, null} ; [ DW_TAG_variable ] [kittens] [line 1] [def]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/kayamon/test1.c]
+!6 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!7 = metadata !{i32 786449, metadata !8, i32 12, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !9, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/kayamon/test2.c] [DW_LANG_C99]
+!8 = metadata !{metadata !"test2.c", metadata !"/home/kayamon"}
+!9 = metadata !{metadata !10}
+!10 = metadata !{i32 786484, i32 0, null, metadata !"rainbows", metadata !"rainbows", metadata !"", metadata !11, i32 1, metadata !6, i32 0, i32 1, i32* @rainbows, null} ; [ DW_TAG_variable ] [rainbows] [line 1] [def]
+!11 = metadata !{i32 786473, metadata !8}         ; [ DW_TAG_file_type ] [/home/kayamon/test2.c]
+!12 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/multiple-at-const-val.ll b/test/DebugInfo/X86/multiple-at-const-val.ll
index 2e02cbd..9a66061 100644
--- a/test/DebugInfo/X86/multiple-at-const-val.ll
+++ b/test/DebugInfo/X86/multiple-at-const-val.ll
@@ -8,7 +8,7 @@
 ; CHECK: DW_TAG_class_type
 ; CHECK: DW_TAG_member
 ; CHECK: badbit
-; CHECK: DW_AT_const_value [DW_FORM_data4]	(0x00000001)
+; CHECK: DW_AT_const_value [DW_FORM_sdata]      (1)
 ; CHECK-NOT: DW_AT_const_value
 ; CHECK: NULL
 
@@ -30,22 +30,23 @@ declare %"class.std::basic_ostream"* @test(%"class.std::basic_ostream"*, i8*, i6
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!1803}
 
 !0 = metadata !{i32 786449, metadata !1802, i32 4, metadata !"clang version 3.3 (trunk 174207)", i1 true, metadata !"", i32 0, metadata !1, metadata !955, metadata !956, metadata !1786,  metadata !1786, metadata !""} ; [ DW_TAG_compile_unit ] [/privite/tmp/student2.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !26}
 !4 = metadata !{i32 786489, null, metadata !"std", metadata !5, i32 48} ; [ DW_TAG_namespace ]
 !5 = metadata !{i32 786473, metadata !1801} ; [ DW_TAG_file_type ]
 !25 = metadata !{i32 786472, metadata !"_S_os_fmtflags_end", i64 65536} ; [ DW_TAG_enumerator ]
-!26 = metadata !{i32 786436, metadata !1801, metadata !4, metadata !"_Ios_Iostate", i32 146, i64 32, i64 32, i32 0, i32 0, null, metadata !27, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!26 = metadata !{i32 786436, metadata !1801, metadata !4, metadata !"_Ios_Iostate", i32 146, i64 32, i64 32, i32 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [_Ios_Iostate] [line 146, size 32, align 32, offset 0] [def] [from ]
 !27 = metadata !{metadata !28, metadata !29, metadata !30, metadata !31, metadata !32}
 !28 = metadata !{i32 786472, metadata !"_S_goodbit", i64 0} ; [ DW_TAG_enumerator ] [_S_goodbit :: 0]
 !29 = metadata !{i32 786472, metadata !"_S_badbit", i64 1} ; [ DW_TAG_enumerator ] [_S_badbit :: 1]
 !30 = metadata !{i32 786472, metadata !"_S_eofbit", i64 2} ; [ DW_TAG_enumerator ] [_S_eofbit :: 2]
 !31 = metadata !{i32 786472, metadata !"_S_failbit", i64 4} ; [ DW_TAG_enumerator ] [_S_failbit :: 4]
 !32 = metadata !{i32 786472, metadata !"_S_os_ostate_end", i64 65536} ; [ DW_TAG_enumerator ] [_S_os_ostate_end :: 65536]
-!49 = metadata !{i32 786434, metadata !1801, metadata !4, metadata !"os_base", i32 200, i64 1728, i64 64, i32 0, i32 0, null, metadata !50, i32 0, metadata !49, null} ; [ DW_TAG_class_type ]
+!49 = metadata !{i32 786434, metadata !1801, metadata !4, metadata !"os_base", i32 200, i64 1728, i64 64, i32 0, i32 0, null, metadata !50, i32 0, metadata !49, null, null} ; [ DW_TAG_class_type ] [os_base] [line 200, size 1728, align 64, offset 0] [def] [from ]
 !50 = metadata !{metadata !77}
-!54 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !55, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!54 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !55, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !55 = metadata !{metadata !56}
 !56 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !77 = metadata !{i32 786445, metadata !1801, metadata !49, metadata !"badbit", i32 331, i64 0, i64 0, i64 0, i32 4096, metadata !78, i32 1} ; [ DW_TAG_member ]
@@ -59,3 +60,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !1800 = metadata !{i32 786484, i32 0, metadata !5, metadata !"badbit", metadata !"badbit", metadata !"badbit", metadata !5, i32 331, metadata !78, i32 1, i32 1, i32 1, metadata !77} ; [ DW_TAG_variable ]
 !1801 = metadata !{metadata !"os_base.h", metadata !"/privite/tmp"}
 !1802 = metadata !{metadata !"student2.cpp", metadata !"/privite/tmp"}
+!1803 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/nondefault-subrange-array.ll b/test/DebugInfo/X86/nondefault-subrange-array.ll
index a5f786c..91065a3 100644
--- a/test/DebugInfo/X86/nondefault-subrange-array.ll
+++ b/test/DebugInfo/X86/nondefault-subrange-array.ll
@@ -8,41 +8,45 @@
 ; Check that we can handle non-default array bounds. In this case, the array
 ; goes from [-3, 38].
 
-; CHECK:      0x0000002d:   DW_TAG_base_type [3]
-; CHECK-NEXT: 0x0000002e:     DW_AT_name [DW_FORM_strp]       ( .debug_str[0x00000041] = "int")
-; CHECK-NEXT: 0x00000032:     DW_AT_byte_size [DW_FORM_data1] (0x04)
-; CHECK-NEXT: 0x00000033:     DW_AT_encoding [DW_FORM_data1]  (0x05)
+; CHECK: DW_TAG_class_type
+; CHECK: DW_TAG_member
+; CHECK-NEXT:                   DW_AT_name [DW_FORM_strp]       ( .debug_str[0x{{[0-9a-f]*}}] = "x")
+; CHECK-NEXT:                   DW_AT_type [DW_FORM_ref4]       (cu + 0x{{[0-9a-f]*}} => {[[ARRAY:0x[0-9a-f]*]]})
 
-; CHECK:      0x00000034:   DW_TAG_array_type [4] *
-; CHECK-NEXT: 0x00000035:     DW_AT_type [DW_FORM_ref4]    (cu + 0x0026 => {0x00000026})
+; CHECK: [[ARRAY]]: DW_TAG_array_type [{{.*}}] *
+; CHECK-NEXT:                 DW_AT_type [DW_FORM_ref4]    (cu + 0x{{[0-9a-f]*}} => {[[BASE:0x[0-9a-f]*]]})
 
-; CHECK:      0x00000039:     DW_TAG_subrange_type [5]
-; CHECK-NEXT: 0x0000003a:       DW_AT_type [DW_FORM_ref4]  (cu + 0x002d => {0x0000002d})
-; CHECK-NEXT: 0x0000003e:       DW_AT_lower_bound [DW_FORM_data8]       (0xfffffffffffffffd)
-; CHECK-NEXT: 0x00000046:       DW_AT_upper_bound [DW_FORM_data1]       (0x26)
+; CHECK: DW_TAG_subrange_type
+; CHECK-NEXT:                   DW_AT_type [DW_FORM_ref4]  (cu + 0x{{[0-9a-f]*}} => {[[BASE2:0x[0-9a-f]*]]})
+; CHECK-NEXT:                   DW_AT_lower_bound [DW_FORM_data8]       (0xfffffffffffffffd)
+; CHECK-NEXT:                   DW_AT_upper_bound [DW_FORM_data1]       (0x26)
 
-; CHECK:      0x00000055:     DW_TAG_member [8]
-; CHECK-NEXT: 0x00000056:       DW_AT_name [DW_FORM_strp]       ( .debug_str[0x0000003f] = "x")
-; CHECK-NEXT: 0x0000005a:       DW_AT_type [DW_FORM_ref4]       (cu + 0x0034 => {0x00000034})
+; CHECK: [[BASE]]: DW_TAG_base_type
+; CHECK: [[BASE2]]: DW_TAG_base_type
+; CHECK-NEXT:                 DW_AT_name [DW_FORM_strp]       ( .debug_str[0x{{[0-9a-f]*}}] = "int")
+; CHECK-NEXT:                 DW_AT_byte_size [DW_FORM_data1] (0x04)
+; CHECK-NEXT:                 DW_AT_encoding [DW_FORM_data1]  (0x05)
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21}
 
 !0 = metadata !{i32 786449, metadata !20, i32 4, metadata !"clang version 3.3 (trunk 169136)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Sandbox/llvm/t.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, %class.A* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
 !6 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786434, metadata !20, null, metadata !"A", i32 1, i64 0, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [from ]
+!7 = metadata !{i32 786434, metadata !20, null, metadata !"A", i32 1, i64 0, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 0, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9, metadata !14}
 !9 = metadata !{i32 786445, metadata !20, metadata !7, metadata !"x", i32 1, i64 0, i64 0, i64 0, i32 1, metadata !10} ; [ DW_TAG_member ] [x] [line 1, size 0, align 0, offset 0] [private] [from ]
-!10 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !11, metadata !12, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!10 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !11, metadata !12, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
 !11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{metadata !13}
 !13 = metadata !{i32 786465, i64 -3, i64 42} ; [ DW_TAG_subrange_type ] [-3, 39]
 !14 = metadata !{i32 786478, metadata !6, metadata !7, metadata !"A", metadata !"A", metadata !"", i32 1, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !18, i32 1} ; [ DW_TAG_subprogram ] [line 1] [A]
-!15 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17}
-!17 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!17 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
 !18 = metadata !{metadata !19}
 !19 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
 !20 = metadata !{metadata !"t.cpp", metadata !"/Volumes/Sandbox/llvm"}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/objc-fwd-decl.ll b/test/DebugInfo/X86/objc-fwd-decl.ll
index 3070ff8..a5e9632 100644
--- a/test/DebugInfo/X86/objc-fwd-decl.ll
+++ b/test/DebugInfo/X86/objc-fwd-decl.ll
@@ -1,16 +1,16 @@
 ; RUN: llc -mtriple=x86_64-macosx %s -o %t -filetype=obj
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
 
-; CHECK: 0x00000027:   DW_TAG_structure_type
-; CHECK: 0x0000002c:     DW_AT_declaration
-; CHECK: 0x0000002d:     DW_AT_APPLE_runtime_class
+; CHECK: DW_TAG_structure_type
+; CHECK:                 DW_AT_declaration
+; CHECK:                 DW_AT_APPLE_runtime_class
 
 %0 = type opaque
 
 @a = common global %0* null, align 8
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!9, !10, !11, !12}
+!llvm.module.flags = !{!9, !10, !11, !12, !14}
 
 !0 = metadata !{i32 786449, metadata !13, i32 16, metadata !"clang version 3.1 (trunk 152054 trunk 152094)", i1 false, metadata !"", i32 2, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
@@ -18,9 +18,10 @@
 !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 3, metadata !7, i32 0, i32 1, %0** @a, null} ; [ DW_TAG_variable ]
 !6 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
 !7 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !13, null, metadata !"FooBarBaz", i32 1, i32 0, i32 0, i32 0, i32 4, null, null, i32 16} ; [ DW_TAG_structure_type ]
+!8 = metadata !{i32 786451, metadata !13, null, metadata !"FooBarBaz", i32 1, i32 0, i32 0, i32 0, i32 4, null, null, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [FooBarBaz] [line 1, size 0, align 0, offset 0] [decl] [from ]
 !9 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
 !10 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
 !11 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
 !12 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
 !13 = metadata !{metadata !"foo.m", metadata !"/Users/echristo"}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/op_deref.ll b/test/DebugInfo/X86/op_deref.ll
index 864cbef..300f13d 100644
--- a/test/DebugInfo/X86/op_deref.ll
+++ b/test/DebugInfo/X86/op_deref.ll
@@ -4,13 +4,14 @@
 ; DW-CHECK: DW_AT_name [DW_FORM_strp]  ( .debug_str[0x00000067] = "vla")
 ; FIXME: The location here needs to be fixed, but llvm-dwarfdump doesn't handle
 ; DW_AT_location lists yet.
-; DW-CHECK: DW_AT_location [DW_FORM_data4]                      (0x00000000)
+; DW-CHECK: DW_AT_location [DW_FORM_sec_offset]                      (0x00000000)
 
 ; Unfortunately llvm-dwarfdump can't unparse a list of DW_AT_locations
 ; right now, so we check the asm output:
 ; RUN: llc -O0 -mtriple=x86_64-apple-darwin %s -o - -filetype=asm | FileCheck %s -check-prefix=ASM-CHECK
 ; vla should have a register-indirect address at one point.
-; ASM-CHECK: DEBUG_VALUE: vla <- [RCX+0]
+; ASM-CHECK: DEBUG_VALUE: vla <- RCX
+; ASM-CHECK: DW_OP_breg2
 
 define void @testVLAwithSize(i32 %s) nounwind uwtable ssp {
 entry:
@@ -64,21 +65,22 @@ declare i8* @llvm.stacksave() nounwind
 declare void @llvm.stackrestore(i8*) nounwind
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!29}
 
 !0 = metadata !{i32 786449, metadata !28, i32 12, metadata !"clang version 3.2 (trunk 156005) (llvm/trunk 156000)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !28, metadata !6, metadata !"testVLAwithSize", metadata !"testVLAwithSize", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @testVLAwithSize, null, null, metadata !1, i32 2} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 786689, metadata !5, metadata !"s", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
 !11 = metadata !{i32 1, i32 26, metadata !5, null}
 !12 = metadata !{i32 3, i32 13, metadata !13, null}
 !13 = metadata !{i32 786443, metadata !28, metadata !5, i32 2, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{i32 786688, metadata !13, metadata !"vla", metadata !6, i32 3, metadata !15, i32 0, i32 0, i64 2} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !9, metadata !16, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!14 = metadata !{i32 786688, metadata !13, metadata !"vla", metadata !6, i32 3, metadata !15, i32 8192, i32 0, i64 2} ; [ DW_TAG_auto_variable ]
+!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !9, metadata !16, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
 !16 = metadata !{metadata !17}
 !17 = metadata !{i32 786465, i64 0, i64 -1}        ; [ DW_TAG_subrange_type ]
 !18 = metadata !{i32 3, i32 7, metadata !13, null}
@@ -92,3 +94,4 @@ declare void @llvm.stackrestore(i8*) nounwind
 !26 = metadata !{i32 5, i32 22, metadata !22, null}
 !27 = metadata !{i32 8, i32 1, metadata !13, null}
 !28 = metadata !{metadata !"bar.c", metadata !"/Users/echristo/tmp"}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/parameters.ll b/test/DebugInfo/X86/parameters.ll
index 7846924..fa91bd2 100644
--- a/test/DebugInfo/X86/parameters.ll
+++ b/test/DebugInfo/X86/parameters.ll
@@ -78,7 +78,7 @@ attributes #1 = { nounwind readnone }
 attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!21}
+!llvm.module.flags = !{!21, !33}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/pass.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"pass.cpp", metadata !"/tmp"}
@@ -86,19 +86,19 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !3 = metadata !{metadata !4, metadata !17}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func", metadata !"func", metadata !"_ZN7pr147634funcENS_3fooE", i32 6, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%"struct.pr14763::foo"*, %"struct.pr14763::foo"*)* @_ZN7pr147634funcENS_3fooE, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [func]
 !5 = metadata !{i32 786489, metadata !1, null, metadata !"pr14763", i32 1} ; [ DW_TAG_namespace ] [pr14763] [line 1]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
-!8 = metadata !{i32 786451, metadata !1, metadata !5, metadata !"foo", i32 2, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null} ; [ DW_TAG_structure_type ] [foo] [line 2, size 8, align 8, offset 0] [from ]
+!8 = metadata !{i32 786451, metadata !1, metadata !5, metadata !"foo", i32 2, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 2, size 8, align 8, offset 0] [def] [from ]
 !9 = metadata !{metadata !10}
 !10 = metadata !{i32 786478, metadata !1, metadata !8, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !16, i32 3} ; [ DW_TAG_subprogram ] [line 3] [foo]
-!11 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !13, metadata !14}
-!13 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
+!13 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo]
 !14 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !15} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
 !15 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo]
 !16 = metadata !{i32 786468}
 !17 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func2", metadata !"func2", metadata !"_ZN7pr147635func2EbNS_3fooE", i32 12, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i1, %"struct.pr14763::foo"*)* @_ZN7pr147635func2EbNS_3fooE, null, null, metadata !2, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [func2]
-!18 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !19 = metadata !{null, metadata !20, metadata !8}
 !20 = metadata !{i32 786468, null, null, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
 !21 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
@@ -113,3 +113,4 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !30 = metadata !{i32 786443, metadata !1, metadata !17, i32 13, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp/pass.cpp]
 !31 = metadata !{i32 14, i32 0, metadata !30, null}
 !32 = metadata !{i32 15, i32 0, metadata !17, null}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/pointer-type-size.ll b/test/DebugInfo/X86/pointer-type-size.ll
index a640069..cf789b2 100644
--- a/test/DebugInfo/X86/pointer-type-size.ll
+++ b/test/DebugInfo/X86/pointer-type-size.ll
@@ -9,16 +9,18 @@
 @crass = common global %struct.crass zeroinitializer, align 8
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!14}
 
 !0 = metadata !{i32 786449, metadata !13, i32 12, metadata !"clang version 3.1 (trunk 147882)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 720948, i32 0, null, metadata !"crass", metadata !"crass", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, %struct.crass* @crass, null} ; [ DW_TAG_variable ]
 !6 = metadata !{i32 720937, metadata !13} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786451, metadata !13, null, metadata !"crass", i32 1, i64 64, i64 64, i32 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!7 = metadata !{i32 786451, metadata !13, null, metadata !"crass", i32 1, i64 64, i64 64, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [crass] [line 1, size 64, align 64, offset 0] [def] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786445, metadata !13, metadata !7, metadata !"ptr", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
 !10 = metadata !{i32 720934, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !11} ; [ DW_TAG_const_type ]
 !11 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
 !12 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !13 = metadata !{metadata !"foo.c", metadata !"/Users/echristo/tmp"}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/pr11300.ll b/test/DebugInfo/X86/pr11300.ll
index 54e0c8b..caa24ee 100644
--- a/test/DebugInfo/X86/pr11300.ll
+++ b/test/DebugInfo/X86/pr11300.ll
@@ -3,8 +3,11 @@
 
 ; test that the DW_AT_specification is a back edge in the file.
 
-; CHECK: 0x0000005c:     DW_TAG_subprogram [5]
-; CHECK: 0x0000007c:     DW_AT_specification [DW_FORM_ref4]      (cu + 0x005c => {0x0000005c})
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_AT_name [DW_FORM_strp]	( .debug_str[0x{{[0-9a-f]*}}] = "zed")
+; CHECK: DW_TAG_subprogram
+; CHECK-NEXT: DW_AT_specification [DW_FORM_ref4]      (cu + {{.*}} => {[[BACK:0x[0-9a-f]*]]})
+; CHECK: [[BACK]]:     DW_TAG_subprogram
 
 %struct.foo = type { i8 }
 
@@ -30,26 +33,27 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!33}
 
 !0 = metadata !{i32 786449, metadata !32, i32 4, metadata !"clang version 3.0 ()", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !20}
-!5 = metadata !{i32 720942, metadata !6, metadata !6, metadata !"zed", metadata !"zed", metadata !"_Z3zedP3foo", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%struct.foo*)* @_Z3zedP3foo, null, null, metadata !18, i32 4} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 720942, metadata !6, metadata !6, metadata !"zed", metadata !"zed", metadata !"_Z3zedP3foo", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.foo*)* @_Z3zedP3foo, null, null, metadata !21, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [zed]
 !6 = metadata !{i32 720937, metadata !32} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
 !9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 720898, metadata !32, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !11, i32 0, null, null} ; [ DW_TAG_class_type ]
+!10 = metadata !{i32 720898, metadata !32, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_class_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
 !11 = metadata !{metadata !12}
 !12 = metadata !{i32 720942, metadata !6, metadata !10, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 2, metadata !13, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !16, i32 2} ; [ DW_TAG_subprogram ]
-!13 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !14, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!13 = metadata !{i32 720917, i32 0, null, i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{null, metadata !15}
-!15 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
+!15 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
 !16 = metadata !{metadata !17}
 !17 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
 !18 = metadata !{metadata !19}
 !19 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
-!20 = metadata !{i32 720942, metadata !6, null, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 2, metadata !13, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%struct.foo*)* @_ZN3foo3barEv, null, metadata !12, metadata !21, i32 2} ; [ DW_TAG_subprogram ]
+!20 = metadata !{i32 720942, metadata !6, null, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEv", i32 2, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.foo*)* @_ZN3foo3barEv, null, metadata !12, metadata !21, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [bar]
 !21 = metadata !{metadata !22}
 !22 = metadata !{i32 720932}                      ; [ DW_TAG_base_type ]
 !23 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
@@ -62,3 +66,4 @@ entry:
 !30 = metadata !{i32 2, i32 15, metadata !31, null}
 !31 = metadata !{i32 786443, metadata !6, metadata !20, i32 2, i32 14, i32 1} ; [ DW_TAG_lexical_block ]
 !32 = metadata !{metadata !"/home/espindola/llvm/test.cc", metadata !"/home/espindola/tmpfs/build"}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/pr12831.ll b/test/DebugInfo/X86/pr12831.ll
index 0244d1e..6dea4a0 100644
--- a/test/DebugInfo/X86/pr12831.ll
+++ b/test/DebugInfo/X86/pr12831.ll
@@ -76,48 +76,49 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!162}
 
 !0 = metadata !{i32 786449, metadata !161, i32 4, metadata !"clang version 3.2 ", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !128, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !106, metadata !107, metadata !126, metadata !127}
 !5 = metadata !{i32 786478, metadata !6, null, metadata !"writeExpr", metadata !"writeExpr", metadata !"_ZN17BPLFunctionWriter9writeExprEv", i32 19, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.BPLFunctionWriter*)* @_ZN17BPLFunctionWriter9writeExprEv, null, metadata !103, metadata !1, i32 19} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !160} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
-!9 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 786434, metadata !160, null, metadata !"BPLFunctionWriter", i32 15, i64 64, i64 64, i32 0, i32 0, null, metadata !11, i32 0, null, null} ; [ DW_TAG_class_type ]
+!9 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
+!10 = metadata !{i32 786434, metadata !160, null, metadata !"BPLFunctionWriter", i32 15, i64 64, i64 64, i32 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_class_type ] [BPLFunctionWriter] [line 15, size 64, align 64, offset 0] [def] [from ]
 !11 = metadata !{metadata !12, metadata !103}
 !12 = metadata !{i32 786445, metadata !160, metadata !10, metadata !"MW", i32 16, i64 64, i64 64, i64 0, i32 1, metadata !13} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786434, metadata !160, null, metadata !"BPLModuleWriter", i32 12, i64 8, i64 8, i32 0, i32 0, null, metadata !15, i32 0, null, null} ; [ DW_TAG_class_type ]
+!13 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
+!14 = metadata !{i32 786434, metadata !160, null, metadata !"BPLModuleWriter", i32 12, i64 8, i64 8, i32 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_class_type ] [BPLModuleWriter] [line 12, size 8, align 8, offset 0] [def] [from ]
 !15 = metadata !{metadata !16}
 !16 = metadata !{i32 786478, metadata !6, metadata !14, metadata !"writeIntrinsic", metadata !"writeIntrinsic", metadata !"_ZN15BPLModuleWriter14writeIntrinsicE8functionIFvvEE", i32 13, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !101, i32 13} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!17 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null, metadata !19, metadata !20}
-!19 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !14} ; [ DW_TAG_pointer_type ]
-!20 = metadata !{i32 786434, metadata !160, null, metadata !"function<void ()>", i32 6, i64 8, i64 8, i32 0, i32 0, null, metadata !21, i32 0, null, metadata !97} ; [ DW_TAG_class_type ]
+!19 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !14} ; [ DW_TAG_pointer_type ]
+!20 = metadata !{i32 786434, metadata !160, null, metadata !"function<void ()>", i32 6, i64 8, i64 8, i32 0, i32 0, null, metadata !21, i32 0, null, metadata !97, null} ; [ DW_TAG_class_type ] [function<void ()>] [line 6, size 8, align 8, offset 0] [def] [from ]
 !21 = metadata !{metadata !22, metadata !51, metadata !58, metadata !86, metadata !92}
 !22 = metadata !{i32 786478, metadata !6, metadata !20, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"", i32 8, metadata !23, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !47, i32 0, metadata !49, i32 8} ; [ DW_TAG_subprogram ]
-!23 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !24, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!23 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !24, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !24 = metadata !{null, metadata !25, metadata !26}
-!25 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !20} ; [ DW_TAG_pointer_type ]
-!26 = metadata !{i32 786434, metadata !160, metadata !5, metadata !"", i32 20, i64 8, i64 8, i32 0, i32 0, null, metadata !27, i32 0, null, null} ; [ DW_TAG_class_type ]
+!25 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !20} ; [ DW_TAG_pointer_type ]
+!26 = metadata !{i32 786434, metadata !160, metadata !5, metadata !"", i32 20, i64 8, i64 8, i32 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_class_type ] [line 20, size 8, align 8, offset 0] [def] [from ]
 !27 = metadata !{metadata !28, metadata !35, metadata !41}
 !28 = metadata !{i32 786478, metadata !6, metadata !26, metadata !"operator()", metadata !"operator()", metadata !"", i32 20, metadata !29, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !33, i32 20} ; [ DW_TAG_subprogram ]
-!29 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !30, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!29 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !30 = metadata !{null, metadata !31}
-!31 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !32} ; [ DW_TAG_pointer_type ]
-!32 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_const_type ]
+!31 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !32} ; [ DW_TAG_pointer_type ]
+!32 = metadata !{i32 786470, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_const_type ]
 !33 = metadata !{metadata !34}
 !34 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !35 = metadata !{i32 786478, metadata !6, metadata !26, metadata !"~", metadata !"~", metadata !"", i32 20, metadata !36, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !39, i32 20} ; [ DW_TAG_subprogram ]
-!36 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!36 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !37 = metadata !{null, metadata !38}
-!38 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !26} ; [ DW_TAG_pointer_type ]
+!38 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !26} ; [ DW_TAG_pointer_type ]
 !39 = metadata !{metadata !40}
 !40 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !41 = metadata !{i32 786478, metadata !6, metadata !26, metadata !"", metadata !"", metadata !"", i32 20, metadata !42, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !45, i32 20} ; [ DW_TAG_subprogram ]
-!42 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!42 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !43 = metadata !{null, metadata !38, metadata !44}
 !44 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_rvalue_reference_type ]
 !45 = metadata !{metadata !46}
@@ -127,32 +128,32 @@ entry:
 !49 = metadata !{metadata !50}
 !50 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !51 = metadata !{i32 786478, metadata !6, metadata !20, metadata !"function<function<void ()> >", metadata !"function<function<void ()> >", metadata !"", i32 8, metadata !52, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !54, i32 0, metadata !56, i32 8} ; [ DW_TAG_subprogram ]
-!52 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !53, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!52 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !53, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !53 = metadata !{null, metadata !25, metadata !20}
 !54 = metadata !{metadata !55}
 !55 = metadata !{i32 786479, null, metadata !"_Functor", metadata !20, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
 !56 = metadata !{metadata !57}
 !57 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !58 = metadata !{i32 786478, metadata !6, metadata !20, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"", i32 8, metadata !59, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !82, i32 0, metadata !84, i32 8} ; [ DW_TAG_subprogram ]
-!59 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !60, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!59 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !60, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !60 = metadata !{null, metadata !25, metadata !61}
-!61 = metadata !{i32 786434, metadata !160, metadata !5, metadata !"", i32 23, i64 8, i64 8, i32 0, i32 0, null, metadata !62, i32 0, null, null} ; [ DW_TAG_class_type ]
+!61 = metadata !{i32 786434, metadata !160, metadata !5, metadata !"", i32 23, i64 8, i64 8, i32 0, i32 0, null, metadata !62, i32 0, null, null, null} ; [ DW_TAG_class_type ] [line 23, size 8, align 8, offset 0] [def] [from ]
 !62 = metadata !{metadata !63, metadata !70, metadata !76}
 !63 = metadata !{i32 786478, metadata !6, metadata !61, metadata !"operator()", metadata !"operator()", metadata !"", i32 23, metadata !64, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !68, i32 23} ; [ DW_TAG_subprogram ]
-!64 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !65, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!64 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !65, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !65 = metadata !{null, metadata !66}
-!66 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !67} ; [ DW_TAG_pointer_type ]
-!67 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !61} ; [ DW_TAG_const_type ]
+!66 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !67} ; [ DW_TAG_pointer_type ]
+!67 = metadata !{i32 786470, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !61} ; [ DW_TAG_const_type ]
 !68 = metadata !{metadata !69}
 !69 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !70 = metadata !{i32 786478, metadata !6, metadata !61, metadata !"~", metadata !"~", metadata !"", i32 23, metadata !71, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !74, i32 23} ; [ DW_TAG_subprogram ]
-!71 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !72, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!71 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !72, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !72 = metadata !{null, metadata !73}
-!73 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !61} ; [ DW_TAG_pointer_type ]
+!73 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !61} ; [ DW_TAG_pointer_type ]
 !74 = metadata !{metadata !75}
 !75 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !76 = metadata !{i32 786478, metadata !6, metadata !61, metadata !"", metadata !"", metadata !"", i32 23, metadata !77, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !80, i32 23} ; [ DW_TAG_subprogram ]
-!77 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !78, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!77 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !78, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !78 = metadata !{null, metadata !73, metadata !79}
 !79 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !61} ; [ DW_TAG_rvalue_reference_type ]
 !80 = metadata !{metadata !81}
@@ -162,19 +163,19 @@ entry:
 !84 = metadata !{metadata !85}
 !85 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !86 = metadata !{i32 786478, metadata !6, metadata !20, metadata !"function", metadata !"function", metadata !"", i32 6, metadata !87, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !90, i32 6} ; [ DW_TAG_subprogram ]
-!87 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !88, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!87 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !88, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !88 = metadata !{null, metadata !25, metadata !89}
 !89 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_rvalue_reference_type ]
 !90 = metadata !{metadata !91}
 !91 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !92 = metadata !{i32 786478, metadata !6, metadata !20, metadata !"~function", metadata !"~function", metadata !"", i32 6, metadata !93, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !95, i32 6} ; [ DW_TAG_subprogram ]
-!93 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !94, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!93 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !94, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !94 = metadata !{null, metadata !25}
 !95 = metadata !{metadata !96}
 !96 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ]
 !97 = metadata !{metadata !98}
 !98 = metadata !{i32 786479, null, metadata !"T", metadata !99, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
-!99 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !100, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!99 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !100, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
 !100 = metadata !{null}
 !101 = metadata !{metadata !102}
 !102 = metadata !{i32 786468}                     ; [ DW_TAG_base_type ]
@@ -183,16 +184,16 @@ entry:
 !105 = metadata !{i32 786468}                     ; [ DW_TAG_base_type ]
 !106 = metadata !{i32 786478, metadata !6, null, metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_", i32 8, metadata !59, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.function*)* @"_ZN8functionIFvvEEC2IZN17BPLFunctionWriter9writeExprEvE3$_1_0EET_", metadata !82, metadata !58, metadata !1, i32 8} ; [ DW_TAG_subprogram ]
 !107 = metadata !{i32 786478, metadata !6, null, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", i32 3, metadata !108, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.anon.0*)* @"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", metadata !111, metadata !113, metadata !1, i32 3} ; [ DW_TAG_subprogram ]
-!108 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !109, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!108 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !109, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !109 = metadata !{null, metadata !110}
 !110 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !61} ; [ DW_TAG_reference_type ]
 !111 = metadata !{metadata !112}
 !112 = metadata !{i32 786479, null, metadata !"_Tp", metadata !61, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
 !113 = metadata !{i32 786478, metadata !6, metadata !114, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:23:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_1_0EEvRKT_", i32 3, metadata !108, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !111, i32 0, metadata !124, i32 3} ; [ DW_TAG_subprogram ]
-!114 = metadata !{i32 786434, metadata !160, null, metadata !"_Base_manager", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !115, i32 0, null, null} ; [ DW_TAG_class_type ]
+!114 = metadata !{i32 786434, metadata !160, null, metadata !"_Base_manager", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !115, i32 0, null, null, null} ; [ DW_TAG_class_type ] [_Base_manager] [line 1, size 8, align 8, offset 0] [def] [from ]
 !115 = metadata !{metadata !116, metadata !113}
 !116 = metadata !{i32 786478, metadata !6, metadata !114, metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_M_not_empty_function<BPLFunctionWriter::<lambda at BPLFunctionWriter2.ii:20:36> >", metadata !"_ZN13_Base_manager21_M_not_empty_functionIZN17BPLFunctionWriter9writeExprEvE3$_0EEvRKT_", i32 3, metadata !117, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, metadata !120, i32 0, metadata !122, i32 3} ; [ DW_TAG_subprogram ]
-!117 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !118, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!117 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !118, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !118 = metadata !{null, metadata !119}
 !119 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !26} ; [ DW_TAG_reference_type ]
 !120 = metadata !{metadata !121}
@@ -208,14 +209,14 @@ entry:
 !131 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !132} ; [ DW_TAG_const_type ]
 !132 = metadata !{i32 786468, null, null, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
 !133 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777235, metadata !134, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!134 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
+!134 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ]
 !135 = metadata !{i32 19, i32 39, metadata !5, null}
 !136 = metadata !{i32 20, i32 17, metadata !137, null}
 !137 = metadata !{i32 786443, metadata !5, i32 19, i32 51, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
 !138 = metadata !{i32 23, i32 17, metadata !137, null}
 !139 = metadata !{i32 26, i32 15, metadata !137, null}
 !140 = metadata !{i32 786689, metadata !106, metadata !"this", metadata !6, i32 16777224, metadata !141, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
-!141 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ]
+!141 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ]
 !142 = metadata !{i32 8, i32 45, metadata !106, null}
 !143 = metadata !{i32 786689, metadata !106, metadata !"__f", metadata !6, i32 33554440, metadata !61, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
 !144 = metadata !{i32 8, i32 63, metadata !106, null}
@@ -236,3 +237,4 @@ entry:
 !159 = metadata !{i32 786473, metadata !161} ; [ DW_TAG_file_type ]
 !160 = metadata !{metadata !"BPLFunctionWriter2.ii", metadata !"/home/peter/crashdelta"}
 !161 = metadata !{metadata !"BPLFunctionWriter.cpp", metadata !"/home/peter/crashdelta"}
+!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/pr13303.ll b/test/DebugInfo/X86/pr13303.ll
index 338c576..4737862 100644
--- a/test/DebugInfo/X86/pr13303.ll
+++ b/test/DebugInfo/X86/pr13303.ll
@@ -13,15 +13,17 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!13}
 
 !0 = metadata !{i32 786449, metadata !12, i32 12, metadata !"clang version 3.2 (trunk 160143)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/probinson/PR13303.c] [DW_LANG_C99]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !12, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
 !6 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 1, i32 14, metadata !11, null}
 !11 = metadata !{i32 786443, metadata !12, metadata !5, i32 1, i32 12, i32 0} ; [ DW_TAG_lexical_block ] [/home/probinson/PR13303.c]
 !12 = metadata !{metadata !"PR13303.c", metadata !"/home/probinson"}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/pr9951.ll b/test/DebugInfo/X86/pr9951.ll
index db60fb1..d933beb 100644
--- a/test/DebugInfo/X86/pr9951.ll
+++ b/test/DebugInfo/X86/pr9951.ll
@@ -6,12 +6,13 @@ entry:
 }
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!9}
 !6 = metadata !{metadata !0}
 
-!0 = metadata !{i32 786478, metadata !7, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 ()* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !7, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
 !1 = metadata !{i32 786473, metadata !7} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !7, i32 12, metadata !"clang version 3.0 ()", i1 true, metadata !"", i32 0, metadata !8, metadata !8, metadata !6, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !7, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !7, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !7 = metadata !{metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/llvm/build-rust2"}
@@ -23,3 +24,4 @@ entry:
 ; CHECK:      Ltmp9 = (Ltmp3-Ltmp2)-0
 ; CHECK-NEXT:	.long	Ltmp9
 ; CHECK-NEXT:	.quad	Ltmp0
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/prologue-stack.ll b/test/DebugInfo/X86/prologue-stack.ll
index 57d164a..b37e41a 100644
--- a/test/DebugInfo/X86/prologue-stack.ll
+++ b/test/DebugInfo/X86/prologue-stack.ll
@@ -19,16 +19,18 @@ entry:
 declare i32 @callme(i32)
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!14}
 
 !0 = metadata !{i32 786449, metadata !13, i32 12, metadata !"clang version 3.2 (trunk 164980) (llvm/trunk 164979)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.c] [DW_LANG_C99]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !13, metadata !6, metadata !"isel_line_test2", metadata !"isel_line_test2", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @isel_line_test2, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [isel_line_test2]
 !6 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 5, i32 3, metadata !11, null}
 !11 = metadata !{i32 786443, metadata !13, metadata !5, i32 4, i32 1, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/bar.c]
 !12 = metadata !{i32 6, i32 3, metadata !11, null}
 !13 = metadata !{metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp"}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/ref_addr_relocation.ll b/test/DebugInfo/X86/ref_addr_relocation.ll
new file mode 100644
index 0000000..fc5197d
--- /dev/null
+++ b/test/DebugInfo/X86/ref_addr_relocation.ll
@@ -0,0 +1,71 @@
+; RUN: llc -filetype=asm -O0 -mtriple=x86_64-linux-gnu < %s | FileCheck %s
+; RUN: llc -filetype=obj -O0 %s -mtriple=x86_64-linux-gnu -o %t
+; RUN: llvm-dwarfdump %t | FileCheck %s -check-prefix=CHECK-DWARF
+
+; RUN: llc -filetype=obj %s -mtriple=x86_64-apple-darwin -o %t2
+; RUN: llvm-dwarfdump %t2 | FileCheck %s -check-prefix=DARWIN-DWARF
+
+; Testing case generated from:
+; clang++ tu1.cpp tu2.cpp -g -emit-llvm -c
+; llvm-link tu1.bc tu2.bc -o tu12.ll -S
+; cat hdr.h
+; struct foo {
+; };
+; cat tu1.cpp
+; #include "hdr.h"
+; foo f;
+; cat tu2.cpp
+; #include "hdr.h"
+; foo g;
+
+; Make sure we use relocation for ref_addr on non-darwin platforms.
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_variable
+; CHECK: .long [[TYPE:.*]] # DW_AT_type
+; CHECK: DW_TAG_structure_type
+; CHECK: debug_info_end0
+; CHECK: DW_TAG_compile_unit
+; CHECK-NOT: DW_TAG_structure_type
+; This variable's type is in the 1st CU.
+; CHECK: DW_TAG_variable
+; Make sure this is relocatable.
+; CHECK: .quad .Lsection_info+[[TYPE]] # DW_AT_type
+; CHECK-NOT: DW_TAG_structure_type
+; CHECK: debug_info_end1
+
+; CHECK-DWARF: DW_TAG_compile_unit
+; CHECK-DWARF: 0x[[ADDR:.*]]: DW_TAG_structure_type
+; CHECK-DWARF: DW_TAG_compile_unit
+; CHECK-DWARF: DW_TAG_variable
+; CHECK-DWARF: DW_AT_type [DW_FORM_ref_addr] {{.*}}[[ADDR]])
+
+; DARWIN-DWARF: DW_TAG_compile_unit
+; DARWIN-DWARF: 0x[[ADDR:.*]]: DW_TAG_structure_type
+; DARWIN-DWARF: DW_TAG_compile_unit
+; DARWIN-DWARF: DW_TAG_variable
+; DARWIN-DWARF: DW_AT_type [DW_FORM_ref_addr] {{.*}}[[ADDR]])
+
+%struct.foo = type { i8 }
+
+@f = global %struct.foo zeroinitializer, align 1
+@g = global %struct.foo zeroinitializer, align 1
+
+!llvm.dbg.cu = !{!0, !9}
+!llvm.module.flags = !{!14, !15}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 191799)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !2, metadata !6, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu1.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"tu1.cpp", metadata !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !5, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !"./hdr.h", metadata !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
+!6 = metadata !{metadata !7}
+!7 = metadata !{i32 786484, i32 0, null, metadata !"f", metadata !"f", metadata !"", metadata !8, i32 2, metadata !4, i32 0, i32 1, %struct.foo* @f, null} ; [ DW_TAG_variable ] [f] [line 2] [def]
+!8 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu1.cpp]
+!9 = metadata !{i32 786449, metadata !10, i32 4, metadata !"clang version 3.4 (trunk 191799)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !2, metadata !11, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu2.cpp] [DW_LANG_C_plus_plus]
+!10 = metadata !{metadata !"tu2.cpp", metadata !"/Users/manmanren/test-Nov/type_unique_air/ref_addr"}
+!11 = metadata !{metadata !12}
+!12 = metadata !{i32 786484, i32 0, null, metadata !"g", metadata !"g", metadata !"", metadata !13, i32 2, metadata !4, i32 0, i32 1, %struct.foo* @g, null} ; [ DW_TAG_variable ] [g] [line 2] [def]
+!13 = metadata !{i32 786473, metadata !10}        ; [ DW_TAG_file_type ] [/Users/manmanren/test-Nov/type_unique_air/ref_addr/tu2.cpp]
+!14 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/reference-argument.ll b/test/DebugInfo/X86/reference-argument.ll
index 00846b3..be54386 100644
--- a/test/DebugInfo/X86/reference-argument.ll
+++ b/test/DebugInfo/X86/reference-argument.ll
@@ -30,7 +30,7 @@ declare void @_ZN4SValD1Ev(%class.SVal* %this)
 declare void @_ZN4SValD2Ev(%class.SVal* %this)
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!47}
+!llvm.module.flags = !{!47, !68}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [aggregate-indirect-arg.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"aggregate-indirect-arg.cpp", metadata !""}
@@ -38,10 +38,10 @@ declare void @_ZN4SValD2Ev(%class.SVal* %this)
 !3 = metadata !{metadata !4, metadata !29, metadata !33, metadata !34, metadata !35}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"bar", metadata !"bar", metadata !"_Z3barR4SVal", i32 19, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.SVal*)* @_Z3barR4SVal, null, null, metadata !2, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [bar]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [aggregate-indirect-arg.cpp]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
 !8 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from SVal]
-!9 = metadata !{i32 786434, metadata !1, null, metadata !"SVal", i32 12, i64 128, i64 64, i32 0, i32 0, null, metadata !10, i32 0, null, null} ; [ DW_TAG_class_type ] [SVal] [line 12, size 128, align 64, offset 0] [def] [from ]
+!9 = metadata !{i32 786434, metadata !1, null, metadata !"SVal", i32 12, i64 128, i64 64, i32 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_class_type ] [SVal] [line 12, size 128, align 64, offset 0] [def] [from ]
 !10 = metadata !{metadata !11, metadata !14, metadata !16, metadata !21, metadata !23}
 !11 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"Data", i32 15, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ] [Data] [line 15, size 64, align 64, offset 0] [from ]
 !12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
@@ -49,34 +49,34 @@ declare void @_ZN4SValD2Ev(%class.SVal* %this)
 !14 = metadata !{i32 786445, metadata !1, metadata !9, metadata !"Kind", i32 16, i64 32, i64 32, i64 64, i32 0, metadata !15} ; [ DW_TAG_member ] [Kind] [line 16, size 32, align 32, offset 64] [from unsigned int]
 !15 = metadata !{i32 786468, null, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned]
 !16 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 14, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !20, i32 14} ; [ DW_TAG_subprogram ] [line 14] [~SVal]
-!17 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null, metadata !19}
-!19 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from SVal]
+!19 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from SVal]
 !20 = metadata !{i32 786468}
 !21 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"SVal", metadata !"SVal", metadata !"", i32 12, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !22, i32 12} ; [ DW_TAG_subprogram ] [line 12] [SVal]
 !22 = metadata !{i32 786468}
 !23 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"SVal", metadata !"SVal", metadata !"", i32 12, metadata !24, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !28, i32 12} ; [ DW_TAG_subprogram ] [line 12] [SVal]
-!24 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{null, metadata !19, metadata !26}
 !26 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !27} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
 !27 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from SVal]
 !28 = metadata !{i32 786468}
 !29 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !30, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 25} ; [ DW_TAG_subprogram ] [line 25] [def] [main]
-!30 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !31, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !31 = metadata !{metadata !32}
 !32 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !33 = metadata !{i32 786478, metadata !1, null, metadata !"~SVal", metadata !"~SVal", metadata !"_ZN4SValD1Ev", i32 14, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.SVal*)* @_ZN4SValD1Ev, null, metadata !16, metadata !2, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [~SVal]
 !34 = metadata !{i32 786478, metadata !1, null, metadata !"~SVal", metadata !"~SVal", metadata !"_ZN4SValD2Ev", i32 14, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.SVal*)* @_ZN4SValD2Ev, null, metadata !16, metadata !2, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [~SVal]
 !35 = metadata !{i32 786478, metadata !1, null, metadata !"foo", metadata !"foo", metadata !"_ZN1A3fooE4SVal", i32 22, metadata !36, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*, %class.SVal*)* @_ZN1A3fooE4SVal, null, metadata !41, metadata !2, i32 22} ; [ DW_TAG_subprogram ] [line 22] [def] [foo]
-!36 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!36 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !37 = metadata !{null, metadata !38, metadata !9}
-!38 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
-!39 = metadata !{i32 786434, metadata !1, null, metadata !"A", i32 20, i64 8, i64 8, i32 0, i32 0, null, metadata !40, i32 0, null, null} ; [ DW_TAG_class_type ] [A] [line 20, size 8, align 8, offset 0] [def] [from ]
+!38 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!39 = metadata !{i32 786434, metadata !1, null, metadata !"A", i32 20, i64 8, i64 8, i32 0, i32 0, null, metadata !40, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 20, size 8, align 8, offset 0] [def] [from ]
 !40 = metadata !{metadata !41, metadata !43}
 !41 = metadata !{i32 786478, metadata !1, metadata !39, metadata !"foo", metadata !"foo", metadata !"_ZN1A3fooE4SVal", i32 22, metadata !36, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !42, i32 22} ; [ DW_TAG_subprogram ] [line 22] [foo]
 !42 = metadata !{i32 786468}
 !43 = metadata !{i32 786478, metadata !1, metadata !39, metadata !"A", metadata !"A", metadata !"", i32 20, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !46, i32 20} ; [ DW_TAG_subprogram ] [line 20] [A]
-!44 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !45, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!44 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !45 = metadata !{null, metadata !38}
 !46 = metadata !{i32 786468}
 !47 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
@@ -100,3 +100,4 @@ declare void @_ZN4SValD2Ev(%class.SVal* %this)
 !65 = metadata !{i32 14, i32 0, metadata !33, null}
 !66 = metadata !{i32 786689, metadata !34, metadata !"this", metadata !5, i32 16777230, metadata !64, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 14]
 !67 = metadata !{i32 14, i32 0, metadata !34, null}
+!68 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/rvalue-ref.ll b/test/DebugInfo/X86/rvalue-ref.ll
index d4f69fe..e9ea427 100644
--- a/test/DebugInfo/X86/rvalue-ref.ll
+++ b/test/DebugInfo/X86/rvalue-ref.ll
@@ -21,13 +21,14 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 declare i32 @printf(i8*, ...)
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!17}
 
 !0 = metadata !{i32 786449, metadata !16, i32 4, metadata !"clang version 3.2 (trunk 157054) (llvm/trunk 157060)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !16, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooOi", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32*)* @_Z3fooOi, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9}
 !9 = metadata !{i32 786498, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_rvalue_reference_type ]
 !10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
@@ -37,3 +38,4 @@ declare i32 @printf(i8*, ...)
 !14 = metadata !{i32 786443, metadata !16, metadata !5, i32 5, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{i32 7, i32 1, metadata !14, null}
 !16 = metadata !{metadata !"foo.cpp", metadata !"/Users/echristo/tmp"}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
index a0bed16..72eb62f 100644
--- a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
+++ b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
@@ -48,12 +48,13 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0, !10}
+!llvm.module.flags = !{!25}
 !0 = metadata !{i32 786449, metadata !23, i32 12, metadata !"clang version 3.3", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !23, metadata !6, metadata !"test", metadata !"test", metadata !"", i32 2, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @test, null, null, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [test]
 !6 = metadata !{i32 786473, metadata !23} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 786449, metadata !24, i32 12, metadata !"clang version 3.3 (trunk 172862)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !11, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
@@ -70,3 +71,4 @@ entry:
 !22 = metadata !{i32 786443, metadata !24, metadata !13, i32 1, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !23 = metadata !{metadata !"simple.c", metadata !"/private/tmp"}
 !24 = metadata !{metadata !"simple2.c", metadata !"/private/tmp"}
+!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/stmt-list.ll b/test/DebugInfo/X86/stmt-list.ll
index c3a456e..6f846c1 100644
--- a/test/DebugInfo/X86/stmt-list.ll
+++ b/test/DebugInfo/X86/stmt-list.ll
@@ -11,11 +11,13 @@ entry:
 }
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7}
 !5 = metadata !{metadata !0}
 
-!0 = metadata !{i32 786478, metadata !6, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !6, metadata !1, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
 !1 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !6, i32 12, metadata !"clang version 3.0 ()", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !5, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !6, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !6, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !6 = metadata !{metadata !"test2.c", metadata !"/home/espindola/llvm"}
+!7 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/stringpool.ll b/test/DebugInfo/X86/stringpool.ll
index d9604de..fccac26 100644
--- a/test/DebugInfo/X86/stringpool.ll
+++ b/test/DebugInfo/X86/stringpool.ll
@@ -4,6 +4,7 @@
 @yyyy = common global i32 0, align 4
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9}
 
 !0 = metadata !{i32 786449, metadata !8, i32 12, metadata !"clang version 3.1 (trunk 143009)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
@@ -13,9 +14,15 @@
 !7 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !8 = metadata !{metadata !"z.c", metadata !"/home/nicholas"}
 
+; Verify that "yyyy" ended up in the stringpool.
+; LINUX: .section .debug_str,"MS",@progbits,1
+; LINUX: yyyy
+; DARWIN: .section __DWARF,__debug_str,regular,debug
+; DARWIN: yyyy
+
 ; Verify that we refer to 'yyyy' with a relocation.
 ; LINUX:      .long   .Linfo_string3          # DW_AT_name
-; LINUX-NEXT: .long   38                      # DW_AT_type
+; LINUX-NEXT: .long   {{[0-9]+}}              # DW_AT_type
 ; LINUX-NEXT:                                 # DW_AT_external
 ; LINUX-NEXT: .byte   1                       # DW_AT_decl_file
 ; LINUX-NEXT: .byte   1                       # DW_AT_decl_line
@@ -24,20 +31,13 @@
 ; LINUX-NEXT: .quad   yyyy
 
 ; Verify that we refer to 'yyyy' without a relocation.
-; DARWIN: Lset5 = Linfo_string3-Linfo_string          ## DW_AT_name
-; DARWIN-NEXT:        .long   Lset5
-; DARWIN-NEXT:        .long   39                      ## DW_AT_type
-; DARWIN-NEXT:        .byte   1                       ## DW_AT_external
+; DARWIN: Lset[[ID:[0-9]+]] = Linfo_string3-Linfo_string ## DW_AT_name
+; DARWIN-NEXT:        .long   Lset[[ID]]
+; DARWIN-NEXT:        .long   {{[0-9]+}}              ## DW_AT_type
+; DARWIN-NEXT:                                        ## DW_AT_external
 ; DARWIN-NEXT:        .byte   1                       ## DW_AT_decl_file
 ; DARWIN-NEXT:        .byte   1                       ## DW_AT_decl_line
 ; DARWIN-NEXT:        .byte   9                       ## DW_AT_location
 ; DARWIN-NEXT:        .byte   3
 ; DARWIN-NEXT:        .quad   _yyyy
-
-; Verify that "yyyy" ended up in the stringpool.
-; LINUX: .section .debug_str,"MS",@progbits,1
-; LINUX-NOT: .section
-; LINUX: yyyy
-; DARWIN: .section __DWARF,__debug_str,regular,debug
-; DARWIN-NOT: .section
-; DARWIN: yyyy
+!9 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/struct-loc.ll b/test/DebugInfo/X86/struct-loc.ll
index fb990b2..95bdd41 100644
--- a/test/DebugInfo/X86/struct-loc.ll
+++ b/test/DebugInfo/X86/struct-loc.ll
@@ -2,7 +2,7 @@
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
 
 ; Make sure that structures have a decl file and decl line attached.
-; CHECK: DW_TAG_structure_type [3]
+; CHECK: DW_TAG_structure_type
 ; CHECK: DW_AT_decl_file
 ; CHECK: DW_AT_decl_line
 ; CHECK: DW_TAG_member
@@ -12,14 +12,16 @@
 @f = common global %struct.foo zeroinitializer, align 4
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12}
 
 !0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.1 (trunk 152837) (llvm/trunk 152845)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786484, i32 0, null, metadata !"f", metadata !"f", metadata !"", metadata !6, i32 5, metadata !7, i32 0, i32 1, %struct.foo* @f, null} ; [ DW_TAG_variable ]
 !6 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786451, metadata !11, null, metadata !"foo", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!7 = metadata !{i32 786451, metadata !11, null, metadata !"foo", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 1, size 32, align 32, offset 0] [def] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786445, metadata !11, metadata !7, metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
 !10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !11 = metadata !{metadata !"struct_bug.c", metadata !"/Users/echristo/tmp"}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/subrange-type.ll b/test/DebugInfo/X86/subrange-type.ll
index da95893..05b1477 100644
--- a/test/DebugInfo/X86/subrange-type.ll
+++ b/test/DebugInfo/X86/subrange-type.ll
@@ -2,10 +2,10 @@
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
 
 ; Make sure that the base type from the subrange type has a name.
-; CHECK: 0x0000006b:   DW_TAG_base_type [6]
+; CHECK: DW_TAG_subrange_type
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]     (cu + 0x{{[0-9a-f]+}} => {[[SUBTYPE:0x[0-9a-f]*]]})
+; CHECK: [[SUBTYPE]]: DW_TAG_base_type
 ; CHECK-NEXT: DW_AT_name
-; CHECK: DW_TAG_subrange_type [8]
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]     (cu + 0x006b => {0x0000006b})
 
 define i32 @main() nounwind uwtable {
 entry:
@@ -19,20 +19,22 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!18}
 
 !0 = metadata !{i32 786449, metadata !17, i32 12, metadata !"clang version 3.3 (trunk 171472) (llvm/trunk 171487)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.c] [DW_LANG_C99]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !6, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 3] [main]
 !6 = metadata !{i32 786473, metadata !17} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 786688, metadata !11, metadata !"i", metadata !6, i32 4, metadata !12, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 4]
 !11 = metadata !{i32 786443, metadata !6, metadata !5, i32 3, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/foo.c]
-!12 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 64, i64 32, i32 0, i32 0, metadata !9, metadata !13, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from int]
+!12 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 64, i64 32, i32 0, i32 0, metadata !9, metadata !13, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 64, align 32, offset 0] [from int]
 !13 = metadata !{metadata !14}
 !14 = metadata !{i32 786465, i64 0, i64 2}        ; [ DW_TAG_subrange_type ] [0, 1]
 !15 = metadata !{i32 4, i32 0, metadata !11, null}
 !16 = metadata !{i32 6, i32 0, metadata !11, null}
 !17 = metadata !{metadata !"foo.c", metadata !"/usr/local/google/home/echristo/tmp"}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/subreg.ll b/test/DebugInfo/X86/subreg.ll
index 9aa6e54..162c2d1 100644
--- a/test/DebugInfo/X86/subreg.ll
+++ b/test/DebugInfo/X86/subreg.ll
@@ -17,15 +17,17 @@ entry:
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!11}
 !9 = metadata !{metadata !1}
 
 !0 = metadata !{i32 786689, metadata !1, metadata !"zzz", metadata !2, i32 16777219, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !10, metadata !2, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !4, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i16 (i16)* @f, null, null, null, i32 3} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !10, metadata !2, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i16 (i16)* @f, null, null, null, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
 !2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !10, i32 12, metadata !"clang version 3.0 ()", i1 false, metadata !"", i32 0, metadata !5, metadata !5, metadata !9, null,  null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{null}
 !6 = metadata !{i32 786468, null, metadata !3, metadata !"short", i32 0, i64 16, i64 16, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 4, i32 22, metadata !8, null}
 !8 = metadata !{i32 786443, metadata !10, metadata !1, i32 3, i32 19, i32 0} ; [ DW_TAG_lexical_block ]
 !10 = metadata !{metadata !"/home/espindola/llvm/test.c", metadata !"/home/espindola/tmpfs/build"}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/template.ll b/test/DebugInfo/X86/template.ll
index f80dd5c..64a8f7a 100644
--- a/test/DebugInfo/X86/template.ll
+++ b/test/DebugInfo/X86/template.ll
@@ -27,7 +27,7 @@
 ; even as data1. DWARF strongly urges implementations to prefer 
 ; _sdata/_udata rather than dataN
 
-; CHECK-NEXT: DW_AT_const_value [DW_FORM_data4]{{.*}}(0x00000003)
+; CHECK-NEXT: DW_AT_const_value [DW_FORM_sdata]{{.*}}(3)
 
 ; CHECK: DW_TAG_template_value_parameter
 ; CHECK-NEXT: DW_AT_type{{.*}}=> {[[INTPTR:0x[0-9a-f]*]]}
@@ -47,11 +47,11 @@
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_template_value_parameter
 ; CHECK-NEXT: DW_AT_type{{.*}}=> {[[INT]]}
-; CHECK-NEXT: DW_AT_const_value  [DW_FORM_data4]{{.*}}(0x00000001)
+; CHECK-NEXT: DW_AT_const_value  [DW_FORM_sdata]{{.*}}(1)
 ; CHECK-NOT: NULL
 ; CHECK: DW_TAG_template_value_parameter
 ; CHECK-NEXT: DW_AT_type{{.*}}=> {[[INT]]}
-; CHECK-NEXT: DW_AT_const_value  [DW_FORM_data4]{{.*}}(0x00000002)
+; CHECK-NEXT: DW_AT_const_value  [DW_FORM_sdata]{{.*}}(2)
 
 ; CHECK: [[INTPTR]]:{{ *}}DW_TAG_pointer_type
 ; CHECK-NEXT: DW_AT_type{{.*}} => {[[INT]]}
@@ -64,65 +64,63 @@
 
 define internal void @__cxx_global_var_init() section ".text.startup" {
 entry:
-  %call = call i32 @_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv(), !dbg !37
-  store i32 %call, i32* @glbl, align 4, !dbg !37
-  ret void, !dbg !37
+  %call = call i32 @_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv(), !dbg !33
+  store i32 %call, i32* @glbl, align 4, !dbg !33
+  ret void, !dbg !33
 }
 
 ; Function Attrs: nounwind uwtable
 define linkonce_odr i32 @_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv() #0 {
 entry:
-  ret i32 3, !dbg !38
+  ret i32 3, !dbg !34
 }
 
 define internal void @_GLOBAL__I_a() section ".text.startup" {
 entry:
-  call void @__cxx_global_var_init(), !dbg !39
-  ret void, !dbg !39
+  call void @__cxx_global_var_init(), !dbg !35
+  ret void, !dbg !35
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!36}
+!llvm.module.flags = !{!31, !36}
+!llvm.ident = !{!32}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !23, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/templ.cpp] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"templ.cpp", metadata !"/tmp"}
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 192849) (llvm/trunk 192850)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !9, metadata !28, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"bar.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
 !2 = metadata !{i32 0}
-!3 = metadata !{metadata !4, metadata !8, metadata !21}
-!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"__cxx_global_var_init", metadata !"__cxx_global_var_init", metadata !"", i32 3, metadata !6, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @__cxx_global_var_init, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [local] [def] [__cxx_global_var_init]
-!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/templ.cpp]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null}
-!8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"func<3, &glbl, y_impl, 1, 2>", metadata !"func<3, &glbl, y_impl, 1, 2>", metadata !"_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv", i32 1, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv, metadata !12, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [func<3, &glbl, y_impl, 1, 2>]
-!9 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !11}
-!11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!12 = metadata !{metadata !13, metadata !14, metadata !16, metadata !17}
-!13 = metadata !{i32 786480, null, metadata !"x", metadata !11, i32 3, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!14 = metadata !{i32 786480, null, metadata !"", metadata !15, i32* @glbl, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!15 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
-!16 = metadata !{i32 803078, null, metadata !"y", null, metadata !"y_impl", null, i32 0, i32 0} ; [ DW_TAG_GNU_template_template_param ]
-!17 = metadata !{i32 803079, null, metadata !"z", null, metadata !18, null, i32 0, i32 0} ; [ DW_TAG_GNU_template_parameter_pack ]
-!18 = metadata !{metadata !19, metadata !20}
-!19 = metadata !{i32 786480, null, metadata !"", metadata !11, i32 1, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!20 = metadata !{i32 786480, null, metadata !"", metadata !11, i32 2, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
-!21 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"_GLOBAL__I_a", metadata !"_GLOBAL__I_a", metadata !"", i32 1, metadata !22, i1 true, i1 true, i32 0, i32 0, null, i32 64, i1 false, void ()* @_GLOBAL__I_a, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [local] [def] [_GLOBAL__I_a]
-!22 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!3 = metadata !{metadata !4, metadata !8}
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"y_impl<int>", i32 2, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, metadata !5, metadata !"_ZTS6y_implIiE"} ; [ DW_TAG_structure_type ] [y_impl<int>] [line 2, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !6}
+!6 = metadata !{i32 786479, null, metadata !"", metadata !7, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!8 = metadata !{i32 786451, metadata !1, metadata !"_ZTS6y_implIiE", metadata !"nested", i32 2, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTSN6y_implIiE6nestedE"} ; [ DW_TAG_structure_type ] [nested] [line 2, size 8, align 8, offset 0] [def] [from ]
+!9 = metadata !{metadata !10, metadata !14, metadata !26}
+!10 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"__cxx_global_var_init", metadata !"__cxx_global_var_init", metadata !"", i32 3, metadata !12, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @__cxx_global_var_init, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [local] [def] [__cxx_global_var_init]
+!11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/bar.cpp]
+!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{null}
+!14 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"func<3, &glbl, y_impl, 1, 2>", metadata !"func<3, &glbl, y_impl, 1, 2>", metadata !"_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv", i32 1, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z4funcILi3EXadL_Z4glblEE6y_implJLi1ELi2EEEiv, metadata !17, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [func<3, &glbl, y_impl, 1, 2>]
+!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !7}
+!17 = metadata !{metadata !18, metadata !19, metadata !21, metadata !22}
+!18 = metadata !{i32 786480, null, metadata !"x", metadata !7, i32 3, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
+!19 = metadata !{i32 786480, null, metadata !"", metadata !20, i32* @glbl, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
+!20 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
+!21 = metadata !{i32 803078, null, metadata !"y", null, metadata !"y_impl", null, i32 0, i32 0} ; [ DW_TAG_GNU_template_template_param ]
+!22 = metadata !{i32 803079, null, metadata !"z", null, metadata !23, null, i32 0, i32 0} ; [ DW_TAG_GNU_template_parameter_pack ]
 !23 = metadata !{metadata !24, metadata !25}
-!24 = metadata !{i32 786484, i32 0, null, metadata !"glbl", metadata !"glbl", metadata !"", metadata !5, i32 3, metadata !11, i32 0, i32 1, i32* @glbl, null} ; [ DW_TAG_variable ] [glbl] [line 3] [def]
-!25 = metadata !{i32 786484, i32 0, null, metadata !"n", metadata !"n", metadata !"", metadata !5, i32 4, metadata !26, i32 0, i32 1, %"struct.y_impl<int>::nested"* @n, null} ; [ DW_TAG_variable ] [n] [line 4] [def]
-!26 = metadata !{i32 786451, metadata !1, metadata !27, metadata !"nested", i32 2, i64 8, i64 8, i32 0, i32 0, null, metadata !30, i32 0, null, null} ; [ DW_TAG_structure_type ] [nested] [line 2, size 8, align 8, offset 0] [def] [from ]
-!27 = metadata !{i32 786451, metadata !1, null, metadata !"y_impl<int>", i32 2, i64 8, i64 8, i32 0, i32 0, null, null, i32 0, null, metadata !28} ; [ DW_TAG_structure_type ] [y_impl<int>] [line 2, size 8, align 8, offset 0] [def] [from ]
-!28 = metadata !{metadata !29}
-!29 = metadata !{i32 786479, null, metadata !"", metadata !11, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
-!30 = metadata !{metadata !31}
-!31 = metadata !{i32 786478, metadata !1, metadata !26, metadata !"nested", metadata !"nested", metadata !"", i32 2, metadata !32, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !35, i32 2} ; [ DW_TAG_subprogram ] [line 2] [nested]
-!32 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !33, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!33 = metadata !{null, metadata !34}
-!34 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !26} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from nested]
-!35 = metadata !{i32 786468}
-!36 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
-!37 = metadata !{i32 3, i32 0, metadata !4, null}
-!38 = metadata !{i32 1, i32 0, metadata !8, null}
-!39 = metadata !{i32 1, i32 0, metadata !21, null}
+!24 = metadata !{i32 786480, null, metadata !"", metadata !7, i32 1, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
+!25 = metadata !{i32 786480, null, metadata !"", metadata !7, i32 2, null, i32 0, i32 0} ; [ DW_TAG_template_value_parameter ]
+!26 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"", metadata !"", metadata !"_GLOBAL__I_a", i32 1, metadata !27, i1 true, i1 true, i32 0, i32 0, null, i32 64, i1 false, void ()* @_GLOBAL__I_a, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [local] [def]
+!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{metadata !29, metadata !30}
+!29 = metadata !{i32 786484, i32 0, null, metadata !"glbl", metadata !"glbl", metadata !"", metadata !11, i32 3, metadata !7, i32 0, i32 1, i32* @glbl, null} ; [ DW_TAG_variable ] [glbl] [line 3] [def]
+!30 = metadata !{i32 786484, i32 0, null, metadata !"n", metadata !"n", metadata !"", metadata !11, i32 4, metadata !8, i32 0, i32 1, %"struct.y_impl<int>::nested"* @n, null} ; [ DW_TAG_variable ] [n] [line 4] [def]
+!31 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!32 = metadata !{metadata !"clang version 3.4 (trunk 192849) (llvm/trunk 192850)"}
+!33 = metadata !{i32 3, i32 0, metadata !10, null}
+!34 = metadata !{i32 1, i32 0, metadata !14, null}
+!35 = metadata !{i32 1, i32 0, metadata !26, null}
+!36 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/tls-fission.ll b/test/DebugInfo/X86/tls-fission.ll
index b95ff40..8a25ace 100644
--- a/test/DebugInfo/X86/tls-fission.ll
+++ b/test/DebugInfo/X86/tls-fission.ll
@@ -19,7 +19,7 @@
 @tls = thread_local global i32 0, align 4
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!7}
+!llvm.module.flags = !{!7, !8}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !"tls.dwo"} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tls.cpp", metadata !"/tmp"}
@@ -29,3 +29,4 @@
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
 !6 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/tls.ll b/test/DebugInfo/X86/tls.ll
index e2a9af9..745c2f4 100644
--- a/test/DebugInfo/X86/tls.ll
+++ b/test/DebugInfo/X86/tls.ll
@@ -22,7 +22,7 @@
 @tls = thread_local global i32 7, align 4
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!7}
+!llvm.module.flags = !{!7, !8}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/tls.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"tls.cpp", metadata !"/tmp"}
@@ -32,3 +32,4 @@
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/tls.cpp]
 !6 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !7 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/union-template.ll b/test/DebugInfo/X86/union-template.ll
index 8d23cae..c70ae07 100644
--- a/test/DebugInfo/X86/union-template.ll
+++ b/test/DebugInfo/X86/union-template.ll
@@ -27,6 +27,7 @@ attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!28}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.3 (trunk 178499) (llvm/trunk 178472)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9,  metadata !9, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/foo.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"foo.cc", metadata !"/usr/local/google/home/echristo/tmp"}
@@ -34,20 +35,20 @@ attributes #1 = { nounwind readnone }
 !3 = metadata !{metadata !4}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"g", metadata !"g", metadata !"_ZN7PR156371gEf", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (float)* @_ZN7PR156371gEf, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [g]
 !5 = metadata !{i32 786489, metadata !1, null, metadata !"PR15637", i32 1} ; [ DW_TAG_namespace ] [PR15637] [line 1]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null, metadata !8}
 !8 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
 !9 = metadata !{metadata !10}
 !10 = metadata !{i32 786484, i32 0, metadata !5, metadata !"f", metadata !"f", metadata !"_ZN7PR156371fE", metadata !11, i32 6, metadata !12, i32 0, i32 1, %"union.PR15637::Value"* @_ZN7PR156371fE, null} ; [ DW_TAG_variable ] [f] [line 6] [def]
 !11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/foo.cc]
-!12 = metadata !{i32 786455, metadata !1, metadata !5, metadata !"Value<float>", i32 2, i64 32, i64 32, i64 0, i32 0, null, metadata !13, i32 0, null, metadata !21} ; [ DW_TAG_union_type ] [Value<float>] [line 2, size 32, align 32, offset 0] [from ]
+!12 = metadata !{i32 786455, metadata !1, metadata !5, metadata !"Value<float>", i32 2, i64 32, i64 32, i64 0, i32 0, null, metadata !13, i32 0, null, metadata !21, null} ; [ DW_TAG_union_type ] [Value<float>] [line 2, size 32, align 32, offset 0] [def] [from ]
 !13 = metadata !{metadata !14, metadata !16}
 !14 = metadata !{i32 786445, metadata !1, metadata !12, metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !15} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
 !15 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !16 = metadata !{i32 786478, metadata !1, metadata !12, metadata !"Value", metadata !"Value", metadata !"", i32 2, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !20, i32 2} ; [ DW_TAG_subprogram ] [line 2] [Value]
-!17 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{null, metadata !19}
-!19 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Value<float>]
+!19 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from Value<float>]
 !20 = metadata !{i32 786468}
 !21 = metadata !{metadata !22}
 !22 = metadata !{i32 786479, null, metadata !"T", metadata !8, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
@@ -56,3 +57,4 @@ attributes #1 = { nounwind readnone }
 !25 = metadata !{i32 786688, metadata !4, metadata !"tempValue", metadata !11, i32 4, metadata !12, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [tempValue] [line 4]
 !26 = metadata !{i32 4, i32 0, metadata !4, null}
 !27 = metadata !{i32 5, i32 0, metadata !4, null}
+!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/vector.ll b/test/DebugInfo/X86/vector.ll
index 0f33032..6e14ed6 100644
--- a/test/DebugInfo/X86/vector.ll
+++ b/test/DebugInfo/X86/vector.ll
@@ -10,6 +10,7 @@
 @a = common global <4 x i32> zeroinitializer, align 16
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!13}
 
 !0 = metadata !{i32 786449, metadata !12, i32 12, metadata !"clang version 3.3 (trunk 171825) (llvm/trunk 171822)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/echristo/foo.c] [DW_LANG_C99]
 !1 = metadata !{i32 0}
@@ -17,7 +18,7 @@
 !5 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 3, metadata !7, i32 0, i32 1, <4 x i32>* @a, null} ; [ DW_TAG_variable ] [a] [line 3] [def]
 !6 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
 !7 = metadata !{i32 786454, metadata !12, null, metadata !"v4si", i32 1, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ] [v4si] [line 1, size 0, align 0, offset 0] [from ]
-!8 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 2048, metadata !9, metadata !10, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
+!8 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 2048, metadata !9, metadata !10, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [vector] [from int]
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{metadata !11}
 !11 = metadata !{i32 786465, i64 0, i64 4}        ; [ DW_TAG_subrange_type ] [0, 3]
@@ -26,3 +27,4 @@
 ; Check that we get an array type with a vector attribute.
 ; CHECK: DW_TAG_array_type
 ; CHECK-NEXT: DW_AT_GNU_vector
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/X86/vla.ll b/test/DebugInfo/X86/vla.ll
index 81faec7..512b223 100644
--- a/test/DebugInfo/X86/vla.ll
+++ b/test/DebugInfo/X86/vla.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -O0 -mtriple=x86_64-apple-darwin -filetype=asm %s -o - | FileCheck %s
-; Ensure that we generate a breg+0 location for the variable length array a.
-; CHECK: ##DEBUG_VALUE: vla:a <- [RDX+0]
+; Ensure that we generate an indirect location for the variable length array a.
+; CHECK: ##DEBUG_VALUE: vla:a <- RDX
+; CHECK: DW_OP_breg1
 ; rdar://problem/13658587
 ;
 ; generated from:
@@ -72,6 +73,7 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!29}
 
 !0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/vla.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"vla.c", metadata !""}
@@ -79,11 +81,11 @@ entry:
 !3 = metadata !{metadata !4, metadata !9}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"vla", metadata !"vla", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @vla, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [vla]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/vla.c]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !8}
 !8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
-!10 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !11 = metadata !{metadata !8, metadata !8, metadata !12}
 !12 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
 !13 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
@@ -91,8 +93,8 @@ entry:
 !15 = metadata !{i32 786689, metadata !4, metadata !"n", metadata !5, i32 16777217, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [n] [line 1]
 !16 = metadata !{i32 1, i32 0, metadata !4, null}
 !17 = metadata !{i32 2, i32 0, metadata !4, null}
-!18 = metadata !{i32 786688, metadata !4, metadata !"a", metadata !5, i32 2, metadata !19, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 2]
-!19 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !8, metadata !20, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
+!18 = metadata !{i32 786688, metadata !4, metadata !"a", metadata !5, i32 2, metadata !19, i32 8192, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 2]
+!19 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !8, metadata !20, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
 !20 = metadata !{metadata !21}
 !21 = metadata !{i32 786465, i64 0, i64 -1}       ; [ DW_TAG_subrange_type ] [unbounded]
 !22 = metadata !{i32 3, i32 0, metadata !4, null}
@@ -102,3 +104,4 @@ entry:
 !26 = metadata !{i32 7, i32 0, metadata !9, null}
 !27 = metadata !{i32 786689, metadata !9, metadata !"argv", metadata !5, i32 33554439, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 7]
 !28 = metadata !{i32 8, i32 0, metadata !9, null}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/array.ll b/test/DebugInfo/array.ll
index f6556fc..e5e07ff 100644
--- a/test/DebugInfo/array.ll
+++ b/test/DebugInfo/array.ll
@@ -13,16 +13,17 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!16}
 
-!0 = metadata !{i32 786478, metadata !14, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @main, null, null, null, i32 3} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !14, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, null, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [main]
 !1 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 786449, metadata !14, i32 12, metadata !"clang version 3.0 (trunk 129138)", i1 false, metadata !"", i32 0, metadata !15, metadata !15, metadata !13, null,  null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 786688, metadata !7, metadata !"a", metadata !1, i32 4, metadata !8, i32 0, null} ; [ DW_TAG_auto_variable ]
 !7 = metadata !{i32 786443, metadata !14, metadata !0, i32 3, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 786433, metadata !14, metadata !2, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !5, metadata !9, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!8 = metadata !{i32 786433, metadata !14, metadata !2, metadata !"", i32 0, i64 0, i64 32, i32 0, i32 0, metadata !5, metadata !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 0, align 32, offset 0] [from int]
 !9 = metadata !{metadata !10}
 ;CHECK: DW_TAG_subrange_type
 ;CHECK-NEXT: DW_AT_type
@@ -35,3 +36,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !13 = metadata !{metadata !0}
 !14 = metadata !{metadata !"array.c", metadata !"/private/tmp"}
 !15 = metadata !{i32 0}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/bug_null_debuginfo.ll b/test/DebugInfo/bug_null_debuginfo.ll
index 06436f9..458fb58 100644
--- a/test/DebugInfo/bug_null_debuginfo.ll
+++ b/test/DebugInfo/bug_null_debuginfo.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
 
 !0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"", i1 false, metadata !"", i32 0, null, null, null,  null, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"t", metadata !""}
+!2 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/debuginfofinder-multiple-cu.ll b/test/DebugInfo/debuginfofinder-multiple-cu.ll
new file mode 100644
index 0000000..74965df
--- /dev/null
+++ b/test/DebugInfo/debuginfofinder-multiple-cu.ll
@@ -0,0 +1,41 @@
+; RUN: opt -analyze -module-debuginfo < %s | FileCheck %s
+
+; Produced from linking:
+;   /tmp/test1.c containing f()
+;   /tmp/test2.c containing g()
+
+; Verify that both compile units and both their contained functions are
+; listed by DebugInfoFinder:
+;CHECK: Compile Unit: [ DW_TAG_compile_unit ] [/tmp/test1.c] [DW_LANG_C99]
+;CHECK: Compile Unit: [ DW_TAG_compile_unit ] [/tmp/test2.c] [DW_LANG_C99]
+;CHECK: Subprogram: [ DW_TAG_subprogram ] [line 1] [def] [f]
+;CHECK: Subprogram: [ DW_TAG_subprogram ] [line 1] [def] [g]
+
+define void @f() {
+  ret void, !dbg !14
+}
+
+define void @g() {
+  ret void, !dbg !15
+}
+
+!llvm.dbg.cu = !{!0, !8}
+!llvm.module.flags = !{!13, !16}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (192092)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/test1.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"test1.c", metadata !"/tmp"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @f, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/test1.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{null}
+!8 = metadata !{i32 786449, metadata !9, i32 12, metadata !"clang version 3.4 (192092)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !10, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/test2.c] [DW_LANG_C99]
+!9 = metadata !{metadata !"test2.c", metadata !"/tmp"}
+!10 = metadata !{metadata !11}
+!11 = metadata !{i32 786478, metadata !9, metadata !12, metadata !"g", metadata !"g", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @g, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [g]
+!12 = metadata !{i32 786473, metadata !9}         ; [ DW_TAG_file_type ] [/tmp/test2.c]
+!13 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!14 = metadata !{i32 1, i32 0, metadata !4, null}
+!15 = metadata !{i32 1, i32 0, metadata !11, null}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/dwarf-public-names.ll b/test/DebugInfo/dwarf-public-names.ll
index 0733c1b..fc33631 100644
--- a/test/DebugInfo/dwarf-public-names.ll
+++ b/test/DebugInfo/dwarf-public-names.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: object-emission
 
-; RUN: llc -generate-dwarf-pubnames -filetype=obj -o %t.o < %s
+; RUN: llc -generate-dwarf-pub-sections=Enable -filetype=obj -o %t.o < %s
 ; RUN: llvm-dwarfdump -debug-dump=pubnames %t.o | FileCheck %s
 ; ModuleID = 'dwarf-public-names.cpp'
 ;
@@ -37,6 +37,7 @@
 
 ; Skip the output to the header of the pubnames section.
 ; CHECK: debug_pubnames
+; CHECK: version = 0x0002
 
 ; Check for each name in the output.
 ; CHECK: global_namespace_variable
@@ -85,30 +86,31 @@ attributes #0 = { nounwind uwtable }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!38}
 
 !0 = metadata !{i32 786449, metadata !37, i32 4, metadata !"clang version 3.3 (http://llvm.org/git/clang.git a09cd8103a6a719cb2628cdf0c91682250a17bd2) (http://llvm.org/git/llvm.git 47d03cec0afca0c01ae42b82916d1d731716cd20)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !24,  metadata !24, metadata !""} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !2 = metadata !{metadata !3, metadata !18, metadata !19, metadata !20}
 !3 = metadata !{i32 786478, metadata !4, null, metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 9, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C15member_functionEv, null, metadata !12, metadata !1, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [member_function]
 !4 = metadata !{i32 786473, metadata !37} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{null, metadata !7}
-!7 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
-!8 = metadata !{i32 786451, metadata !37, null, metadata !"C", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [from ]
+!7 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !8} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from C]
+!8 = metadata !{i32 786451, metadata !37, null, metadata !"C", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 8, align 8, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !12, metadata !14}
 !10 = metadata !{i32 786445, metadata !37, metadata !8, metadata !"static_member_variable", i32 4, i64 0, i64 0, i64 0, i32 4096, metadata !11, null} ; [ DW_TAG_member ] [static_member_variable] [line 4, size 0, align 0, offset 0] [static] [from int]
 !11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{i32 786478, metadata !4, metadata !8, metadata !"member_function", metadata !"member_function", metadata !"_ZN1C15member_functionEv", i32 2, metadata !5, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !13, i32 2} ; [ DW_TAG_subprogram ] [line 2] [member_function]
 !13 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
 !14 = metadata !{i32 786478, metadata !4, metadata !8, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 3, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !17, i32 3} ; [ DW_TAG_subprogram ] [line 3] [static_member_function]
-!15 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{metadata !11}
 !17 = metadata !{i32 786468}                      ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
 !18 = metadata !{i32 786478, metadata !4, null, metadata !"static_member_function", metadata !"static_member_function", metadata !"_ZN1C22static_member_functionEv", i32 13, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_ZN1C22static_member_functionEv, null, metadata !14, metadata !1, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [static_member_function]
 !19 = metadata !{i32 786478, metadata !4, metadata !4, metadata !"global_function", metadata !"global_function", metadata !"_Z15global_functionv", i32 19, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z15global_functionv, null, null, metadata !1, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [global_function]
 !20 = metadata !{i32 786478, metadata !4, metadata !21, metadata !"global_namespace_function", metadata !"global_namespace_function", metadata !"_ZN2ns25global_namespace_functionEv", i32 24, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_ZN2ns25global_namespace_functionEv, null, null, metadata !1, i32 24} ; [ DW_TAG_subprogram ] [line 24] [def] [global_namespace_function]
 !21 = metadata !{i32 786489, null, metadata !"ns", metadata !4, i32 23} ; [ DW_TAG_namespace ] [/usr2/kparzysz/s.hex/t/dwarf-public-names.cpp]
-!22 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !23 = metadata !{null}
 !24 = metadata !{metadata !25, metadata !26, metadata !27}
 !25 = metadata !{i32 786484, i32 0, metadata !8, metadata !"static_member_variable", metadata !"static_member_variable", metadata !"_ZN1C22static_member_variableE", metadata !4, i32 7, metadata !11, i32 0, i32 1, i32* @_ZN1C22static_member_variableE, metadata !10} ; [ DW_TAG_variable ] [static_member_variable] [line 7] [def]
@@ -124,3 +126,4 @@ attributes #1 = { nounwind readnone }
 !35 = metadata !{i32 25, i32 0, metadata !20, null}
 !36 = metadata !{i32 26, i32 0, metadata !20, null}
 !37 = metadata !{metadata !"dwarf-public-names.cpp", metadata !"/usr2/kparzysz/s.hex/t"}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/dwarfdump-pubnames.test b/test/DebugInfo/dwarfdump-pubnames.test
index e1b16c2..215953a 100644
--- a/test/DebugInfo/dwarfdump-pubnames.test
+++ b/test/DebugInfo/dwarfdump-pubnames.test
@@ -2,15 +2,13 @@ RUN: llvm-dwarfdump %p/Inputs/dwarfdump-pubnames.elf-x86-64 \
 RUN:   -debug-dump=pubnames | FileCheck %s
 
 CHECK: .debug_pubnames contents:
-CHECK: Length:                161
-CHECK: Version:               2
-CHECK: Offset in .debug_info: 0
-CHECK: Size:                  321
+CHECK: length = 0x000000a1 version = 0x0002 unit_offset = 0x00000000 unit_size = 0x00000141
+
+CHECK: Offset        Name
+CHECK: 0x00000098    "global_namespace_variable"
+CHECK: 0x000000a7    "global_namespace_function"
+CHECK: 0x000000ec    "static_member_function"
+CHECK: 0x0000007c    "global_variable"
+CHECK: 0x00000103    "global_function"
+CHECK: 0x000000c2    "member_function"
 
-CHECK:  Offset    Name
-CHECK:      98    global_namespace_variable
-CHECK:      a7    global_namespace_function
-CHECK:      ec    static_member_function
-CHECK:      7c    global_variable
-CHECK:     103    global_function
-CHECK:      c2    member_function
diff --git a/test/DebugInfo/dwarfdump-type-units.test b/test/DebugInfo/dwarfdump-type-units.test
new file mode 100644
index 0000000..5fca81d
--- /dev/null
+++ b/test/DebugInfo/dwarfdump-type-units.test
@@ -0,0 +1,32 @@
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-type-units.elf-x86-64 > %t
+RUN: cat %t | FileCheck -check-prefix=FOO %s
+RUN: cat %t | FileCheck -check-prefix=BAR %s
+RUN: llvm-dwarfdump -debug-dump=types %p/Inputs/dwarfdump-type-units.elf-x86-64 | FileCheck -check-prefix=TYPES %s
+
+FOO: debug_info contents:
+FOO: DW_TAG_variable
+FOO-NEXT: DW_AT_name {{.*}}"f"
+FOO: DW_AT_type [DW_FORM_ref_sig8] ([[FOO_SIG:0x[0-9a-f]*]])
+
+FOO: debug_types contents:
+FOO: 0x00000000: Type Unit: {{.*}} type_signature = [[FOO_SIG]] type_offset = 0x[[FOO_OFF:[0-9a-f]*]] (next unit at
+FOO: DW_TAG_type_unit
+FOO-NOT: NULL
+FOO: 0x0000[[FOO_OFF]]: DW_TAG_structure_type
+FOO-NEXT: DW_AT_name {{.*}}"foo"
+
+BAR: debug_info contents:
+BAR: DW_TAG_variable
+BAR: DW_TAG_variable
+BAR-NEXT: DW_AT_name {{.*}}"b"
+BAR: DW_AT_type [DW_FORM_ref_sig8] ([[BAR_SIG:0x[0-9a-f]*]])
+
+BAR: debug_types contents:
+BAR: 0x00000000: Type Unit: {{.*}} type_signature = [[BAR_SIG]] type_offset = 0x[[BAR_OFF:[0-9a-f]*]] (next unit at
+BAR: DW_TAG_type_unit
+BAR-NOT: NULL
+BAR: 0x0000[[BAR_OFF]]: DW_TAG_structure_type
+BAR-NEXT: DW_AT_name {{.*}}"bar"
+
+TYPES-NOT: debug_info contents:
+TYPES: debug_types contents:
diff --git a/test/DebugInfo/enum.ll b/test/DebugInfo/enum.ll
index 59a303e..bc09846 100644
--- a/test/DebugInfo/enum.ll
+++ b/test/DebugInfo/enum.ll
@@ -47,28 +47,28 @@ entry:
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata) #1
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!19}
+!llvm.module.flags = !{!19, !24}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !11, metadata !12, metadata !17, metadata !11, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/enum.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"enum.cpp", metadata !"/tmp"}
 !2 = metadata !{metadata !3, metadata !8}
-!3 = metadata !{i32 786436, metadata !1, null, metadata !"e1", i32 1, i64 64, i64 64, i32 0, i32 0, null, metadata !4, i32 0, i32 0} ; [ DW_TAG_enumeration_type ] [e1] [line 1, size 64, align 64, offset 0] [def] [from ]
+!3 = metadata !{i32 786436, metadata !1, null, metadata !"e1", i32 1, i64 64, i64 64, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [e1] [line 1, size 64, align 64, offset 0] [def] [from ]
 !4 = metadata !{metadata !5, metadata !6, metadata !7}
 !5 = metadata !{i32 786472, metadata !"I", i64 0} ; [ DW_TAG_enumerator ] [I :: 0]
 !6 = metadata !{i32 786472, metadata !"J", i64 4294967295} ; [ DW_TAG_enumerator ] [J :: 4294967295]
 !7 = metadata !{i32 786472, metadata !"K", i64 -1152921504606846976} ; [ DW_TAG_enumerator ] [K :: 17293822569102704640]
-!8 = metadata !{i32 786436, metadata !1, null, metadata !"e2", i32 2, i64 32, i64 32, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_enumeration_type ] [e2] [line 2, size 32, align 32, offset 0] [def] [from ]
+!8 = metadata !{i32 786436, metadata !1, null, metadata !"e2", i32 2, i64 32, i64 32, i32 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [e2] [line 2, size 32, align 32, offset 0] [def] [from ]
 !9 = metadata !{metadata !10}
 !10 = metadata !{i32 786472, metadata !"X", i64 0} ; [ DW_TAG_enumerator ] [X :: 0]
 !11 = metadata !{i32 0}
 !12 = metadata !{metadata !13}
 !13 = metadata !{i32 786478, metadata !1, metadata !14, metadata !"func", metadata !"func", metadata !"_Z4funcv", i32 3, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z4funcv, null, null, metadata !11, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [func]
 !14 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/tmp/enum.cpp]
-!15 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null}
 !17 = metadata !{metadata !18}
 !18 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !14, i32 1, metadata !3, i32 0, i32 1, i64* @a, null} ; [ DW_TAG_variable ] [a] [line 1] [def]
@@ -77,3 +77,4 @@ attributes #1 = { nounwind readnone }
 !21 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !22 = metadata !{i32 4, i32 0, metadata !13, null}
 !23 = metadata !{i32 5, i32 0, metadata !13, null}
+!24 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/global.ll b/test/DebugInfo/global.ll
index b438305..9a0c32a 100644
--- a/test/DebugInfo/global.ll
+++ b/test/DebugInfo/global.ll
@@ -18,10 +18,10 @@ entry:
   ret i32 0, !dbg !12
 }
 
-attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!11}
+!llvm.module.flags = !{!11, !13}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/global.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"global.cpp", metadata !"/tmp"}
@@ -29,10 +29,11 @@ attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-fra
 !3 = metadata !{metadata !4}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !2, i32 2} ; [ DW_TAG_subprogram ] [line 2] [def] [main]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/tmp/global.cpp]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
 !8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{metadata !10}
 !10 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"_ZL1i", metadata !5, i32 1, metadata !8, i32 1, i32 1, null, null}
 !11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !12 = metadata !{i32 4, i32 0, metadata !4, null}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/inheritance.ll b/test/DebugInfo/inheritance.ll
index b0665b2..6b3ae09 100644
--- a/test/DebugInfo/inheritance.ll
+++ b/test/DebugInfo/inheritance.ll
@@ -110,29 +110,29 @@ declare void @_ZdlPv(i8*) nounwind
 !2 = metadata !{i32 458763, metadata !44, metadata !3, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !3 = metadata !{i32 458798, i32 0, metadata !4, metadata !"main", metadata !"main", metadata !"main", i32 11, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !4 = metadata !{i32 458769, metadata !44, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !45, metadata !45, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 458773, metadata !4, metadata !"", metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{i32 458773, metadata !4, null, metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7}
 !7 = metadata !{i32 458788, null, metadata !4, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 458771, metadata !44, metadata !4, metadata !"test1", i32 1, i64 64, i64 64, i64 0, i32 0, null, metadata !9, i32 0, metadata !8} ; [ DW_TAG_structure_type ]
+!8 = metadata !{i32 458771, metadata !44, metadata !4, metadata !"test1", i32 1, i64 64, i64 64, i64 0, i32 0, null, metadata !9, i32 0, metadata !8, null, null} ; [ DW_TAG_structure_type ] [test1] [line 1, size 64, align 64, offset 0] [def] [from ]
 !9 = metadata !{metadata !10, metadata !14, metadata !18}
 !10 = metadata !{i32 458765, metadata !44, metadata !8, metadata !"_vptr$test1", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_member ]
-!11 = metadata !{i32 458767, metadata !4, metadata !"", metadata !4, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{i32 458767, metadata !4, null, metadata !4, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
 !12 = metadata !{i32 458767, null, metadata !4, metadata !"__vtbl_ptr_type", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
 !13 = metadata !{i32 458769, metadata !46, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !45, metadata !45, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !14 = metadata !{i32 458798, i32 0, metadata !8, metadata !"test1", metadata !"test1", metadata !"", i32 1, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i1 true, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!15 = metadata !{i32 458773, metadata !4, metadata !"", metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!15 = metadata !{i32 458773, metadata !4, null, metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !16 = metadata !{null, metadata !17}
-!17 = metadata !{i32 458767, metadata !4, metadata !"", metadata !4, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !8} ; [ DW_TAG_pointer_type ]
+!17 = metadata !{i32 458767, metadata !4, null, metadata !4, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !8} ; [ DW_TAG_pointer_type ]
 !18 = metadata !{i32 458798, i32 0, metadata !8, metadata !"~test1", metadata !"~test1", metadata !"", i32 4, metadata !19, i1 false, i1 false, i32 1, i32 0, metadata !8, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!19 = metadata !{i32 458773, metadata !4, metadata !"", metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!19 = metadata !{i32 458773, metadata !4, null, metadata !4, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !20 = metadata !{null, metadata !17, metadata !7}
 !21 = metadata !{i32 11, i32 0, metadata !1, null}
 !22 = metadata !{i32 13, i32 0, metadata !1, null}
 !23 = metadata !{i32 14, i32 0, metadata !1, null}
 !24 = metadata !{i32 459009, metadata !25, metadata !"this", metadata !4, i32 13, metadata !26} ; [ DW_TAG_arg_variable ]
 !25 = metadata !{i32 458798, i32 0, metadata !4, metadata !"test1", metadata !"test1", metadata !"_ZN5test1C1Ev", i32 1, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!26 = metadata !{i32 458790, metadata !4, metadata !"", metadata !4, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !27} ; [ DW_TAG_const_type ]
-!27 = metadata !{i32 458767, metadata !4, metadata !"", metadata !4, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
+!26 = metadata !{i32 458790, metadata !4, null, metadata !4, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !27} ; [ DW_TAG_const_type ]
+!27 = metadata !{i32 458767, metadata !4, null, metadata !4, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
 !28 = metadata !{i32 1, i32 0, metadata !25, null}
 !29 = metadata !{i32 1, i32 0, metadata !30, null}
 !30 = metadata !{i32 458763, metadata !44, metadata !31, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
diff --git a/test/DebugInfo/inline-debug-info-multiret.ll b/test/DebugInfo/inline-debug-info-multiret.ll
index 108f212..594512f 100644
--- a/test/DebugInfo/inline-debug-info-multiret.ll
+++ b/test/DebugInfo/inline-debug-info-multiret.ll
@@ -120,6 +120,7 @@ attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!31}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<unknown>", metadata !""}
@@ -128,11 +129,11 @@ attributes #2 = { nounwind }
 !4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"test", metadata !"test", metadata !"_Z4testi", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z4testi, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [test]
 !5 = metadata !{metadata !"test.cpp", metadata !""}
 !6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [test.cpp]
-!7 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"test2", metadata !"test2", metadata !"_Z5test2v", i32 11, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z5test2v, null, null, metadata !2, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [test2]
-!11 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !9}
 !13 = metadata !{i32 786689, metadata !4, metadata !"k", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [k] [line 4]
 !14 = metadata !{i32 4, i32 0, metadata !4, null}
@@ -152,3 +153,4 @@ attributes #2 = { nounwind }
 !28 = metadata !{i32 18, i32 0, metadata !27, null}
 !29 = metadata !{i32 19, i32 0, metadata !10, null}
 !30 = metadata !{i32 20, i32 0, metadata !10, null}
+!31 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/inline-debug-info.ll b/test/DebugInfo/inline-debug-info.ll
index 7c3267a..b56ca95 100644
--- a/test/DebugInfo/inline-debug-info.ll
+++ b/test/DebugInfo/inline-debug-info.ll
@@ -138,6 +138,7 @@ attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!31}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [<unknown>] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"<unknown>", metadata !""}
@@ -146,11 +147,11 @@ attributes #2 = { nounwind }
 !4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"test", metadata !"test", metadata !"_Z4testi", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z4testi, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [test]
 !5 = metadata !{metadata !"test.cpp", metadata !""}
 !6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [test.cpp]
-!7 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"test2", metadata !"test2", metadata !"_Z5test2v", i32 11, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z5test2v, null, null, metadata !2, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [test2]
-!11 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !9}
 !13 = metadata !{i32 786689, metadata !4, metadata !"k", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [k] [line 4]
 !14 = metadata !{i32 4, i32 0, metadata !4, null}
@@ -170,3 +171,4 @@ attributes #2 = { nounwind }
 !28 = metadata !{i32 18, i32 0, metadata !27, null}
 !29 = metadata !{i32 19, i32 0, metadata !10, null}
 !30 = metadata !{i32 20, i32 0, metadata !10, null}
+!31 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/inlined-arguments.ll b/test/DebugInfo/inlined-arguments.ll
index 50a9068..1dd5b2c 100644
--- a/test/DebugInfo/inlined-arguments.ll
+++ b/test/DebugInfo/inlined-arguments.ll
@@ -41,11 +41,12 @@ declare void @_Z2f3i(i32) #1
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata) #2
 
-attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!26}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/scratch/exp.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"exp.cpp", metadata !"/usr/local/google/home/blaikie/dev/scratch"}
@@ -53,10 +54,10 @@ attributes #2 = { nounwind readnone }
 !3 = metadata !{metadata !4, metadata !8}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f2", metadata !"f2", metadata !"_Z2f2v", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_Z2f2v, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f2]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/scratch/exp.cpp]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"f1", metadata !"f1", metadata !"_Z2f1ii", i32 6, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32, i32)* @_Z2f1ii, null, null, metadata !12, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
-!9 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!9 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !10 = metadata !{null, metadata !11, metadata !11}
 !11 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !12 = metadata !{metadata !13, metadata !14}
@@ -73,3 +74,4 @@ attributes #2 = { nounwind readnone }
 !23 = metadata !{i32 6, i32 0, metadata !8, null}
 !24 = metadata !{i32 7, i32 0, metadata !8, null}
 !25 = metadata !{i32 8, i32 0, metadata !8, null} ; [ DW_TAG_imported_declaration ]
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/inlined-vars.ll b/test/DebugInfo/inlined-vars.ll
index cd98e1d..34c5101 100644
--- a/test/DebugInfo/inlined-vars.ll
+++ b/test/DebugInfo/inlined-vars.ll
@@ -16,17 +16,18 @@ declare void @smth(i32)
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!27}
 
 !0 = metadata !{i32 786449, metadata !26, i32 4, metadata !"clang version 3.2 (trunk 159419)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !10}
 !5 = metadata !{i32 786478, metadata !26, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 10, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @main, null, null, metadata !1, i32 10} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 786478, metadata !26, metadata !6, metadata !"f", metadata !"f", metadata !"_ZL1fi", i32 3, metadata !11, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !13, i32 3} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!11 = metadata !{i32 786453, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !9, metadata !9}
 !13 = metadata !{metadata !14}
 !14 = metadata !{metadata !15, metadata !16}
@@ -52,3 +53,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !24 = metadata !{i32 5, i32 3, metadata !10, metadata !19}
 !25 = metadata !{i32 6, i32 3, metadata !10, metadata !19}
 !26 = metadata !{metadata !"inline-bug.cc", metadata !"/tmp/dbginfo/pr13202"}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/lit.local.cfg b/test/DebugInfo/lit.local.cfg
deleted file mode 100644
index 00bd9b8..0000000
--- a/test/DebugInfo/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
diff --git a/test/DebugInfo/llvm-symbolizer.test b/test/DebugInfo/llvm-symbolizer.test
index 4dc3699..a8799cf 100644
--- a/test/DebugInfo/llvm-symbolizer.test
+++ b/test/DebugInfo/llvm-symbolizer.test
@@ -1,7 +1,9 @@
 RUN: echo "%p/Inputs/dwarfdump-test.elf-x86-64 0x400559" > %t.input
+RUN: echo "%p/Inputs/dwarfdump-test.elf-x86-64.debuglink 0x400559" >> %t.input
 RUN: echo "%p/Inputs/dwarfdump-test.elf-x86-64 0x400436" >> %t.input
 RUN: echo "%p/Inputs/dwarfdump-test4.elf-x86-64 0x62c" >> %t.input
 RUN: echo "%p/Inputs/dwarfdump-inl-test.elf-x86-64 0x710" >> %t.input
+RUN: echo "%p/Inputs/dwarfdump-inl-test.high_pc.elf-x86-64 0x568" >> %t.input
 RUN: echo "\"%p/Inputs/dwarfdump-test3.elf-x86-64 space\" 0x633" >> %t.input
 RUN: echo "%p/Inputs/macho-universal 0x1f84" >> %t.input
 RUN: echo "%p/Inputs/macho-universal:i386 0x1f67" >> %t.input
@@ -13,6 +15,9 @@ RUN:    --default-arch=i386 < %t.input | FileCheck %s
 CHECK:       main
 CHECK-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:16
 
+CHECK:       main
+CHECK-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:16
+
 CHECK:      _start
 
 CHECK:      _Z1cv
@@ -27,6 +32,15 @@ CHECK-NEXT: dwarfdump-inl-test.cc:3
 CHECK-NEXT: main
 CHECK-NEXT: dwarfdump-inl-test.cc:
 
+CHECK:      inlined_h
+CHECK-NEXT: dwarfdump-inl-test.h:3
+CHECK-NEXT: inlined_g
+CHECK-NEXT: dwarfdump-inl-test.h:7
+CHECK-NEXT: inlined_f
+CHECK-NEXT: dwarfdump-inl-test.cc:3
+CHECK-NEXT: main
+CHECK-NEXT: dwarfdump-inl-test.cc:
+
 CHECK:       _Z3do1v
 CHECK-NEXT: dwarfdump-test3-decl.h:7
 
diff --git a/test/DebugInfo/member-order.ll b/test/DebugInfo/member-order.ll
new file mode 100644
index 0000000..a0c283d
--- /dev/null
+++ b/test/DebugInfo/member-order.ll
@@ -0,0 +1,66 @@
+; REQUIRES: object-emission
+
+; RUN: llc -filetype=obj -O0 < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; generated by clang from:
+; struct foo {
+;   void f1();
+;   void f2();
+; };
+;
+; void foo::f1() {
+; }
+
+; CHECK: DW_TAG_structure_type
+; CHECK-NEXT: DW_AT_name {{.*}} "foo"
+; CHECK-NOT: NULL
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: NULL
+; CHECK: DW_AT_name {{.*}} "f1"
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: NULL
+; CHECK: DW_AT_name {{.*}} "f2"
+
+
+%struct.foo = type { i8 }
+
+; Function Attrs: nounwind uwtable
+define void @_ZN3foo2f1Ev(%struct.foo* %this) #0 align 2 {
+entry:
+  %this.addr = alloca %struct.foo*, align 8
+  store %struct.foo* %this, %struct.foo** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%struct.foo** %this.addr}, metadata !16), !dbg !18
+  %this1 = load %struct.foo** %this.addr
+  ret void, !dbg !19
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!15, !20}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !13, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/member-order.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"member-order.cpp", metadata !"/tmp/dbginfo"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{metadata !6, metadata !11}
+!6 = metadata !{i32 786478, metadata !1, metadata !4, metadata !"f1", metadata !"f1", metadata !"_ZN3foo2f1Ev", i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !10, i32 2} ; [ DW_TAG_subprogram ] [line 2] [f1]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9}
+!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS3foo]
+!10 = metadata !{i32 786468}
+!11 = metadata !{i32 786478, metadata !1, metadata !4, metadata !"f2", metadata !"f2", metadata !"_ZN3foo2f2Ev", i32 3, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !12, i32 3} ; [ DW_TAG_subprogram ] [line 3] [f2]
+!12 = metadata !{i32 786468}
+!13 = metadata !{metadata !14}
+!14 = metadata !{i32 786478, metadata !1, null, metadata !"f1", metadata !"f1", metadata !"_ZN3foo2f1Ev", i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.foo*)* @_ZN3foo2f1Ev, null, metadata !6, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [f1]
+!15 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!16 = metadata !{i32 786689, metadata !14, metadata !"this", null, i32 16777216, metadata !17, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!17 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS3foo"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3foo]
+!18 = metadata !{i32 0, i32 0, metadata !14, null}
+!19 = metadata !{i32 7, i32 0, metadata !14, null}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/member-pointers.ll b/test/DebugInfo/member-pointers.ll
index 7d999f1..0bc4ee6 100644
--- a/test/DebugInfo/member-pointers.ll
+++ b/test/DebugInfo/member-pointers.ll
@@ -4,12 +4,12 @@
 ; RUN: llc -filetype=obj -O0 < %s > %t
 ; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
 ; CHECK: DW_TAG_ptr_to_member_type
-; CHECK: [[TYPE:.*]]:   DW_TAG_subroutine_type
+; CHECK: DW_TAG_ptr_to_member_type
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]       (cu + {{.*}} => {[[TYPE:0x[0-9a-f]+]]})
+; CHECK: [[TYPE]]:   DW_TAG_subroutine_type
 ; CHECK: DW_TAG_formal_parameter
 ; CHECK-NEXT: DW_AT_type
 ; CHECK-NEXT: DW_AT_artificial [DW_FORM_flag
-; CHECK: DW_TAG_ptr_to_member_type
-; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]       (cu + {{.*}} => {[[TYPE]]})
 ; IR generated from clang -g with the following source:
 ; struct S {
 ; };
@@ -21,6 +21,7 @@
 @y = global { i64, i64 } zeroinitializer, align 8
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!16}
 
 !0 = metadata !{i32 786449, metadata !15, i32 4, metadata !"clang version 3.3 ", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ] [/home/blaikie/Development/scratch/simple.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
@@ -29,10 +30,11 @@
 !6 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ]
 !7 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !8, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
 !8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!9 = metadata !{i32 786451, metadata !15, null, metadata !"S", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !1, i32 0, null, null} ; [ DW_TAG_structure_type ] [S] [line 1, size 8, align 8, offset 0] [from ]
+!9 = metadata !{i32 786451, metadata !15, null, metadata !"S", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !1, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [S] [line 1, size 8, align 8, offset 0] [def] [from ]
 !10 = metadata !{i32 786484, i32 0, null, metadata !"y", metadata !"y", metadata !"", metadata !6, i32 5, metadata !11, i32 0, i32 1, { i64, i64 }* @y, null} ; [ DW_TAG_variable ] [y] [line 5] [def]
 !11 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !12, metadata !9} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !13 = metadata !{null, metadata !14, metadata !8}
-!14 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from S]
+!14 = metadata !{i32 786447, i32 0, null, i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from S]
 !15 = metadata !{metadata !"simple.cpp", metadata !"/home/blaikie/Development/scratch"}
+!16 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/namespace.ll b/test/DebugInfo/namespace.ll
index 81b8a87..9d0b25c 100644
--- a/test/DebugInfo/namespace.ll
+++ b/test/DebugInfo/namespace.ll
@@ -191,10 +191,11 @@ return:                                           ; preds = %if.end, %if.then
   ret i32 %5, !dbg !51
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!52}
 
 !0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !19, metadata !21, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/blaikie/dev/llvm/build/clang/debug//usr/local/google/home/blaikie/dev/llvm/src/tools/clang/test/CodeGenCXX/debug-info-namespace.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{metadata !"/usr/local/google/home/blaikie/dev/llvm/src/tools/clang/test/CodeGenCXX/debug-info-namespace.cpp", metadata !"/usr/local/google/home/blaikie/dev/llvm/build/clang/debug"}
@@ -204,15 +205,15 @@ attributes #1 = { nounwind readnone }
 !5 = metadata !{metadata !"foo.cpp", metadata !"/usr/local/google/home/blaikie/dev/llvm/build/clang/debug"}
 !6 = metadata !{i32 786489, metadata !5, metadata !7, metadata !"B", i32 1} ; [ DW_TAG_namespace ] [B] [line 1]
 !7 = metadata !{i32 786489, metadata !1, null, metadata !"A", i32 3} ; [ DW_TAG_namespace ] [A] [line 3]
-!8 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !9 = metadata !{null}
 !10 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"f1", metadata !"f1", metadata !"_ZN1A1B2f1Ei", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_ZN1A1B2f1Ei, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [f1]
-!11 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !13}
 !13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !14 = metadata !{i32 786478, metadata !5, metadata !15, metadata !"func", metadata !"func", metadata !"_Z4funcb", i32 13, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i1)* @_Z4funcb, null, null, metadata !2, i32 13} ; [ DW_TAG_subprogram ] [line 13] [def] [func]
 !15 = metadata !{i32 786473, metadata !5}         ; [ DW_TAG_file_type ] [/usr/local/google/home/blaikie/dev/llvm/build/clang/debug/foo.cpp]
-!16 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{metadata !13, metadata !18}
 !18 = metadata !{i32 786468, null, null, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ] [bool] [line 0, size 8, align 8, offset 0, enc DW_ATE_boolean]
 !19 = metadata !{metadata !20}
@@ -224,14 +225,14 @@ attributes #1 = { nounwind readnone }
 !25 = metadata !{i32 786443, metadata !5, metadata !14, i32 14, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/blaikie/dev/llvm/build/clang/debug/foo.cpp]
 !26 = metadata !{i32 786490, metadata !14, metadata !7, i32 18} ; [ DW_TAG_imported_module ]
 !27 = metadata !{i32 786440, metadata !14, metadata !28, i32 19} ; [ DW_TAG_imported_declaration ]
-!28 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"foo", i32 5, i64 0, i64 0, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_structure_type ] [foo] [line 5, size 0, align 0, offset 0] [fwd] [from ]
+!28 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"foo", i32 5, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 5, size 0, align 0, offset 0] [decl] [from ]
 !29 = metadata !{i32 786440, metadata !14, metadata !30, i32 20} ; [ DW_TAG_imported_declaration ]
-!30 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"bar", i32 6, i64 8, i64 8, i32 0, i32 0, null, metadata !31, i32 0, null, null} ; [ DW_TAG_structure_type ] [bar] [line 6, size 8, align 8, offset 0] [from ]
+!30 = metadata !{i32 786451, metadata !5, metadata !6, metadata !"bar", i32 6, i64 8, i64 8, i32 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 6, size 8, align 8, offset 0] [def] [from ]
 !31 = metadata !{metadata !32}
 !32 = metadata !{i32 786478, metadata !5, metadata !30, metadata !"bar", metadata !"bar", metadata !"", i32 6, metadata !33, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !36, i32 6} ; [ DW_TAG_subprogram ] [line 6] [bar]
-!33 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !34, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !34, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !34 = metadata !{null, metadata !35}
-!35 = metadata !{i32 786447, i32 0, i32 0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !30} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from bar]
+!35 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !30} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from bar]
 !36 = metadata !{i32 786468}
 !37 = metadata !{i32 786440, metadata !14, metadata !10, i32 21} ; [ DW_TAG_imported_declaration ]
 !38 = metadata !{i32 786440, metadata !14, metadata !20, i32 22} ; [ DW_TAG_imported_declaration ]
@@ -248,3 +249,4 @@ attributes #1 = { nounwind readnone }
 !49 = metadata !{i32 23, i32 0, metadata !14, null}
 !50 = metadata !{i32 26, i32 0, metadata !14, null}
 !51 = metadata !{i32 27, i32 0, metadata !14, null}
+!52 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/template-recursive-void.ll b/test/DebugInfo/template-recursive-void.ll
new file mode 100644
index 0000000..2ed57a6
--- /dev/null
+++ b/test/DebugInfo/template-recursive-void.ll
@@ -0,0 +1,65 @@
+; REQUIRES: object-emission
+
+; RUN: llc -O0 -filetype=obj < %s > %t
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; This was pulled from clang's debug-info-template-recursive.cpp test.
+; class base { };
+
+; template <class T> class foo : public base  {
+;   void operator=(const foo r) { }
+; };
+
+; class bar : public foo<void> { };
+; bar filters;
+
+; CHECK: DW_TAG_template_type_parameter [{{.*}}]
+; CHECK-NEXT: DW_AT_name{{.*}}"T"
+; CHECK-NOT: DW_AT_type
+; CHECK: NULL
+
+%class.bar = type { i8 }
+
+@filters = global %class.bar zeroinitializer, align 1
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!36, !37}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 187958) (llvm/trunk 187964)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/debug-info-template-recursive.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"debug-info-template-recursive.cpp", metadata !"/usr/local/google/home/echristo/tmp"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786484, i32 0, null, metadata !"filters", metadata !"filters", metadata !"", metadata !5, i32 10, metadata !6, i32 0, i32 1, %class.bar* @filters, null} ; [ DW_TAG_variable ] [filters] [line 10] [def]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/usr/local/google/home/echristo/tmp/debug-info-template-recursive.cpp]
+!6 = metadata !{i32 786434, metadata !1, null, metadata !"bar", i32 9, i64 8, i64 8, i32 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_class_type ] [bar] [line 9, size 8, align 8, offset 0] [def] [from ]
+!7 = metadata !{metadata !8, metadata !31}
+!8 = metadata !{i32 786460, null, metadata !6, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from foo<void>]
+!9 = metadata !{i32 786434, metadata !1, null, metadata !"foo<void>", i32 5, i64 8, i64 8, i32 0, i32 0, null, metadata !10, i32 0, null, metadata !29, null} ; [ DW_TAG_class_type ] [foo<void>] [line 5, size 8, align 8, offset 0] [def] [from ]
+!10 = metadata !{metadata !11, metadata !19, metadata !25}
+!11 = metadata !{i32 786460, null, metadata !9, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !12} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [from base]
+!12 = metadata !{i32 786434, metadata !1, null, metadata !"base", i32 3, i64 8, i64 8, i32 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_class_type ] [base] [line 3, size 8, align 8, offset 0] [def] [from ]
+!13 = metadata !{metadata !14}
+!14 = metadata !{i32 786478, metadata !1, metadata !12, metadata !"base", metadata !"base", metadata !"", i32 3, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [base]
+!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{null, metadata !17}
+!17 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !12} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from base]
+!18 = metadata !{i32 786468}
+!19 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"operator=", metadata !"operator=", metadata !"_ZN3fooIvEaSES0_", i32 6, metadata !20, i1 false, i1 false, i32 0, i32 0, null, i32 257, i1 false, null, null, i32 0, metadata !24, i32 6} ; [ DW_TAG_subprogram ] [line 6] [private] [operator=]
+!20 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = metadata !{null, metadata !22, metadata !23}
+!22 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from foo<void>]
+!23 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from foo<void>]
+!24 = metadata !{i32 786468}
+!25 = metadata !{i32 786478, metadata !1, metadata !9, metadata !"foo", metadata !"foo", metadata !"", i32 5, metadata !26, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !28, i32 5} ; [ DW_TAG_subprogram ] [line 5] [foo]
+!26 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !27, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!27 = metadata !{null, metadata !22}
+!28 = metadata !{i32 786468}
+!29 = metadata !{metadata !30}
+!30 = metadata !{i32 786479, null, metadata !"T", null, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!31 = metadata !{i32 786478, metadata !1, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 9, metadata !32, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !35, i32 9} ; [ DW_TAG_subprogram ] [line 9] [bar]
+!32 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !33, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!33 = metadata !{null, metadata !34}
+!34 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !6} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from bar]
+!35 = metadata !{i32 786468}
+!36 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
+!37 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/tu-composite.ll b/test/DebugInfo/tu-composite.ll
new file mode 100644
index 0000000..f838eca
--- /dev/null
+++ b/test/DebugInfo/tu-composite.ll
@@ -0,0 +1,185 @@
+; REQUIRES: object-emission
+
+; RUN: llc -filetype=obj -O0 < %s > %t
+; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
+; CHECK: [[TYPE:.*]]: DW_TAG_structure_type
+; Make sure we correctly handle containing type of a struct being a type identifier.
+; CHECK-NEXT: DW_AT_containing_type [DW_FORM_ref4]       (cu + {{.*}} => {[[TYPE]]})
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] {{.*}}= "C")
+
+; Make sure we correctly handle context of a subprogram being a type identifier.
+; CHECK: [[SP:.*]]: DW_TAG_subprogram
+; CHECK: DW_AT_name [DW_FORM_strp] {{.*}}= "foo")
+; Make sure we correctly handle containing type of a subprogram being a type identifier.
+; CHECK: DW_AT_containing_type [DW_FORM_ref4]       (cu + {{.*}} => {[[TYPE]]})
+; CHECK: DW_TAG_formal_parameter
+; CHECK: NULL
+; CHECK: NULL
+
+; CHECK: [[TYPE2:.*]]: DW_TAG_structure_type
+; CHECK: DW_AT_name [DW_FORM_strp] {{.*}}= "bar")
+; CHECK: DW_TAG_structure_type
+; CHECK: DW_AT_name [DW_FORM_strp] {{.*}}= "D")
+; CHECK: DW_TAG_member
+; CHECK: DW_AT_name [DW_FORM_strp] {{.*}}= "a") 
+; Make sure we correctly handle context of a struct being a type identifier.
+; CHECK: DW_TAG_structure_type
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] {{.*}}= "Nested")
+; CHECK: DW_TAG_structure_type
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] {{.*}}= "Nested2")
+; CHECK-NEXT: DW_AT_declaration [DW_FORM_flag]      (0x01)
+; CHECK: DW_TAG_structure_type
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] {{.*}}= "virt<bar>")
+; Make sure we correctly handle type of a template_type being a type identifier.
+; CHECK: DW_TAG_template_type_parameter
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + {{.*}} => {[[TYPE2]]})
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] {{.*}}= "T")
+; Make sure we correctly handle derived-from of a typedef being a type identifier.
+; CHECK: DW_TAG_typedef
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + {{.*}} => {[[TYPE2]]})
+; CHECK: DW_AT_name [DW_FORM_strp] {{.*}}= "baz2")
+; Make sure we correctly handle derived-from of a pointer type being a type identifier.
+; CHECK: DW_TAG_pointer_type
+; CHECK: DW_AT_type [DW_FORM_ref4] (cu + {{.*}} => {[[TYPE]]})
+; CHECK: DW_TAG_typedef
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + {{.*}} => {[[TYPE2]]})
+; CHECK: DW_AT_name [DW_FORM_strp] {{.*}}= "baz")
+; Make sure we correctly handle derived-from of an array type being a type identifier.
+; CHECK: DW_TAG_array_type
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + {{.*}} => {[[TYPE2]]})
+; IR generated from clang -g with the following source:
+; struct C {
+;   virtual void foo();
+; };
+; void C::foo() {
+; }
+;
+; struct bar { };
+; typedef bar baz;
+; struct D {
+;   typedef bar baz2;
+;   static int a;
+;   struct Nested { };
+;   struct Nested2 { };
+;   template <typename T>
+;   struct virt {
+;     T* values;
+;   };
+; };
+; void test() {
+;   baz B;
+;   bar A[3];
+;   D::baz2 B2;
+;   D::Nested e;
+;   D::Nested2 *p;
+;   D::virt<bar> t;
+; }
+
+%struct.C = type { i32 (...)** }
+%struct.bar = type { i8 }
+%"struct.D::Nested" = type { i8 }
+%"struct.D::Nested2" = type { i8 }
+%"struct.D::virt" = type { %struct.bar* }
+
+@_ZTV1C = unnamed_addr constant [3 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI1C to i8*), i8* bitcast (void (%struct.C*)* @_ZN1C3fooEv to i8*)]
+@_ZTVN10__cxxabiv117__class_type_infoE = external global i8*
+@_ZTS1C = constant [3 x i8] c"1C\00"
+@_ZTI1C = unnamed_addr constant { i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8** @_ZTVN10__cxxabiv117__class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([3 x i8]* @_ZTS1C, i32 0, i32 0) }
+
+; Function Attrs: nounwind ssp uwtable
+define void @_ZN1C3fooEv(%struct.C* %this) unnamed_addr #0 align 2 {
+entry:
+  %this.addr = alloca %struct.C*, align 8
+  store %struct.C* %this, %struct.C** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata !{%struct.C** %this.addr}, metadata !36), !dbg !38
+  %this1 = load %struct.C** %this.addr
+  ret void, !dbg !39
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z4testv() #0 {
+entry:
+  %B = alloca %struct.bar, align 1
+  %A = alloca [3 x %struct.bar], align 1
+  %B2 = alloca %struct.bar, align 1
+  %e = alloca %"struct.D::Nested", align 1
+  %p = alloca %"struct.D::Nested2"*, align 8
+  %t = alloca %"struct.D::virt", align 8
+  call void @llvm.dbg.declare(metadata !{%struct.bar* %B}, metadata !40), !dbg !42
+  call void @llvm.dbg.declare(metadata !{[3 x %struct.bar]* %A}, metadata !43), !dbg !47
+  call void @llvm.dbg.declare(metadata !{%struct.bar* %B2}, metadata !48), !dbg !50
+  call void @llvm.dbg.declare(metadata !{%"struct.D::Nested"* %e}, metadata !51), !dbg !52
+  call void @llvm.dbg.declare(metadata !{%"struct.D::Nested2"** %p}, metadata !53), !dbg !55
+  call void @llvm.dbg.declare(metadata !{%"struct.D::virt"* %t}, metadata !56), !dbg !57
+  ret void, !dbg !58
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!35, !59}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !30, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [tmp.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"tmp.cpp", metadata !"."}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4, metadata !18, metadata !19, metadata !22, metadata !23, metadata !24}
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 1, i64 64, i64 64, i32 0, i32 0, null, metadata !5, i32 0, metadata !"_ZTS1C", null, metadata !"_ZTS1C"} ; [ DW_TAG_structure_type ] [C] [line 1, size 64, align 64, offset 0] [def] [from ]
+!5 = metadata !{metadata !6, metadata !13}
+!6 = metadata !{i32 786445, metadata !1, metadata !7, metadata !"_vptr$C", i32 0, i64 64, i64 0, i64 0, i32 64, metadata !8} ; [ DW_TAG_member ] [_vptr$C] [line 0, size 64, align 0, offset 0] [artificial] [from ]
+!7 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [tmp.cpp]
+!8 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from __vtbl_ptr_type]
+!9 = metadata !{i32 786447, null, null, metadata !"__vtbl_ptr_type", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [__vtbl_ptr_type] [line 0, size 64, align 0, offset 0] [from ]
+!10 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{metadata !12}
+!12 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{i32 786478, metadata !1, metadata !"_ZTS1C", metadata !"foo", metadata !"foo", metadata !"_ZN1C3fooEv", i32 2, metadata !14, i1 false, i1 false, i32 1, i32 0, metadata !"_ZTS1C", i32 256, i1 false, null, null, i32 0, metadata !17, i32 2} ; [ DW_TAG_subprogram ] [line 2] [foo]
+!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{null, metadata !16}
+!16 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C]
+!17 = metadata !{i32 786468}
+!18 = metadata !{i32 786451, metadata !1, null, metadata !"bar", i32 7, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3bar"} ; [ DW_TAG_structure_type ] [bar] [line 7, size 8, align 8, offset 0] [def] [from ]
+!19 = metadata !{i32 786451, metadata !1, null, metadata !"D", i32 9, i64 8, i64 8, i32 0, i32 0, null, metadata !20, i32 0, null, null, metadata !"_ZTS1D"} ; [ DW_TAG_structure_type ] [D] [line 9, size 8, align 8, offset 0] [def] [from ]
+!20 = metadata !{metadata !21}
+!21 = metadata !{i32 786445, metadata !1, metadata !"_ZTS1D", metadata !"a", i32 11, i64 0, i64 0, i64 0, i32 4096, metadata !12, null} ; [ DW_TAG_member ] [a] [line 11, size 0, align 0, offset 0] [static] [from int]
+!22 = metadata !{i32 786451, metadata !1, metadata !"_ZTS1D", metadata !"Nested", i32 12, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTSN1D6NestedE"} ; [ DW_TAG_structure_type ] [Nested] [line 12, size 8, align 8, offset 0] [def] [from ]
+!23 = metadata !{i32 786451, metadata !1, metadata !"_ZTS1D", metadata !"Nested2", i32 13, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTSN1D7Nested2E"} ; [ DW_TAG_structure_type ] [Nested2] [line 13, size 0, align 0, offset 0] [decl] [from ]
+!24 = metadata !{i32 786451, metadata !1, metadata !"_ZTS1D", metadata !"virt<bar>", i32 15, i64 64, i64 64, i32 0, i32 0, null, metadata !25, i32 0, null, metadata !28, metadata !"_ZTSN1D4virtI3barEE"} ; [ DW_TAG_structure_type ] [virt<bar>] [line 15, size 64, align 64, offset 0] [def] [from ]
+!25 = metadata !{metadata !26}
+!26 = metadata !{i32 786445, metadata !1, metadata !"_ZTSN1D4virtI3barEE", metadata !"values", i32 16, i64 64, i64 64, i64 0, i32 0, metadata !27} ; [ DW_TAG_member ] [values] [line 16, size 64, align 64, offset 0] [from ]
+!27 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS3bar"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS3bar]
+!28 = metadata !{metadata !29}
+!29 = metadata !{i32 786479, null, metadata !"T", metadata !"_ZTS3bar", null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!30 = metadata !{metadata !31, metadata !32}
+!31 = metadata !{i32 786478, metadata !1, null, metadata !"foo", metadata !"foo", metadata !"_ZN1C3fooEv", i32 4, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%struct.C*)* @_ZN1C3fooEv, null, metadata !13, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [foo]
+!32 = metadata !{i32 786478, metadata !1, metadata !7, metadata !"test", metadata !"test", metadata !"_Z4testv", i32 20, metadata !33, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z4testv, null, null, metadata !2, i32 20} ; [ DW_TAG_subprogram ] [line 20] [def] [test]
+!33 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !34, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!34 = metadata !{null}
+!35 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!36 = metadata !{i32 786689, metadata !31, metadata !"this", null, i32 16777216, metadata !37, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!37 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS1C]
+!38 = metadata !{i32 0, i32 0, metadata !31, null}
+!39 = metadata !{i32 5, i32 0, metadata !31, null}
+!40 = metadata !{i32 786688, metadata !32, metadata !"B", metadata !7, i32 21, metadata !41, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [B] [line 21]
+!41 = metadata !{i32 786454, metadata !1, null, metadata !"baz", i32 8, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS3bar"} ; [ DW_TAG_typedef ] [baz] [line 8, size 0, align 0, offset 0] [from _ZTS3bar]
+!42 = metadata !{i32 21, i32 0, metadata !32, null}
+!43 = metadata !{i32 786688, metadata !32, metadata !"A", metadata !7, i32 22, metadata !44, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [A] [line 22]
+!44 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 24, i64 8, i32 0, i32 0, metadata !"_ZTS3bar", metadata !45, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 24, align 8, offset 0] [from _ZTS3bar]
+!45 = metadata !{metadata !46}
+!46 = metadata !{i32 786465, i64 0, i64 3}        ; [ DW_TAG_subrange_type ] [0, 2]
+!47 = metadata !{i32 22, i32 0, metadata !32, null}
+!48 = metadata !{i32 786688, metadata !32, metadata !"B2", metadata !7, i32 23, metadata !49, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [B2] [line 23]
+!49 = metadata !{i32 786454, metadata !1, metadata !"_ZTS1D", metadata !"baz2", i32 10, i64 0, i64 0, i64 0, i32 0, metadata !"_ZTS3bar"} ; [ DW_TAG_typedef ] [baz2] [line 10, size 0, align 0, offset 0] [from _ZTS3bar]
+!50 = metadata !{i32 23, i32 0, metadata !32, null}
+!51 = metadata !{i32 786688, metadata !32, metadata !"e", metadata !7, i32 24, metadata !22, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [e] [line 24]
+!52 = metadata !{i32 24, i32 0, metadata !32, null}
+!53 = metadata !{i32 786688, metadata !32, metadata !"p", metadata !7, i32 25, metadata !54, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [p] [line 25]
+!54 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTSN1D7Nested2E"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTSN1D7Nested2E]
+!55 = metadata !{i32 25, i32 0, metadata !32, null}
+!56 = metadata !{i32 786688, metadata !32, metadata !"t", metadata !7, i32 26, metadata !24, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 26]
+!57 = metadata !{i32 26, i32 0, metadata !32, null}
+!58 = metadata !{i32 27, i32 0, metadata !32, null}
+!59 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/tu-member-pointer.ll b/test/DebugInfo/tu-member-pointer.ll
new file mode 100644
index 0000000..b746d3b
--- /dev/null
+++ b/test/DebugInfo/tu-member-pointer.ll
@@ -0,0 +1,30 @@
+; REQUIRES: object-emission
+
+; RUN: llc -filetype=obj -O0 < %s > %t
+; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
+; CHECK: DW_TAG_ptr_to_member_type
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4]       (cu + {{.*}} => {[[TYPE:0x[0-9a-f]+]]})
+; CHECK: [[TYPE]]:   DW_TAG_base_type
+; IR generated from clang -g with the following source:
+; struct Foo {
+;   int e;
+; };
+; int Foo:*x = 0;
+
+@x = global i64 -1, align 8
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10, !11}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !2, metadata !5, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [foo.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"foo.cpp", metadata !"."}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"Foo", i32 1, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, metadata !"_ZTS3Foo"} ; [ DW_TAG_structure_type ] [Foo] [line 1, size 0, align 0, offset 0] [decl] [from ]
+!5 = metadata !{metadata !6}
+!6 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !7, i32 4, metadata !8, i32 0, i32 1, i64* @x, null} ; [ DW_TAG_variable ] [x] [line 4] [def]
+!7 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [foo.cpp]
+!8 = metadata !{i32 786463, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9, metadata !"_ZTS3Foo"} ; [ DW_TAG_ptr_to_member_type ] [line 0, size 0, align 0, offset 0] [from int]
+!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/two-cus-from-same-file.ll b/test/DebugInfo/two-cus-from-same-file.ll
index 6d8c484..8589840 100644
--- a/test/DebugInfo/two-cus-from-same-file.ll
+++ b/test/DebugInfo/two-cus-from-same-file.ll
@@ -33,18 +33,19 @@ entry:
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0, !9}
+!llvm.module.flags = !{!33}
 
 !0 = metadata !{i32 786449, metadata !32, i32 12, metadata !"clang version 3.2 (trunk 156513)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !32, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @foo, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{i32 786449, metadata !32, i32 12, metadata !"clang version 3.2 (trunk 156513)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !10, metadata !1, metadata !1, metadata !""} ; [ DW_TAG_compile_unit ]
 !10 = metadata !{metadata !12}
 !12 = metadata !{i32 786478, metadata !32, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 11, metadata !13, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !19, i32 11} ; [ DW_TAG_subprogram ]
-!13 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!13 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !14, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !14 = metadata !{metadata !15, metadata !15, metadata !16}
 !15 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !16 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !17} ; [ DW_TAG_pointer_type ]
@@ -70,3 +71,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 ; CHECK: {{DW_TAG_compile_unit}}
 ; CHECK: {{foo\.c}}
 
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/DebugInfo/version.ll b/test/DebugInfo/version.ll
index b36e38e..f4dde0a 100644
--- a/test/DebugInfo/version.ll
+++ b/test/DebugInfo/version.ll
@@ -13,10 +13,10 @@ entry:
   ret i32 0, !dbg !10
 }
 
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!9}
+!llvm.module.flags = !{!9, !11}
 
 !0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 185475)", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !"CodeGen/dwarf-version.c", metadata !"test"}
@@ -24,8 +24,9 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !3 = metadata !{metadata !4}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 6, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 6} ; [ DW_TAG_subprogram ] [line 6] [def] [main]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8}
 !8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
 !10 = metadata !{i32 7, i32 0, metadata !4, null}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/ExecutionEngine/MCJIT/Inputs/cross-module-b.ll b/test/ExecutionEngine/MCJIT/Inputs/cross-module-b.ll
new file mode 100644
index 0000000..6870117
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/Inputs/cross-module-b.ll
@@ -0,0 +1,7 @@
+declare i32 @FA()
+
+define i32 @FB() {
+  %r = call i32 @FA( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/Inputs/multi-module-b.ll b/test/ExecutionEngine/MCJIT/Inputs/multi-module-b.ll
new file mode 100644
index 0000000..103b601
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/Inputs/multi-module-b.ll
@@ -0,0 +1,7 @@
+declare i32 @FC()
+
+define i32 @FB() {
+  %r = call i32 @FC( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/Inputs/multi-module-c.ll b/test/ExecutionEngine/MCJIT/Inputs/multi-module-c.ll
new file mode 100644
index 0000000..b39306b
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/Inputs/multi-module-c.ll
@@ -0,0 +1,4 @@
+define i32 @FC() {
+  ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/Inputs/multi-module-eh-b.ll b/test/ExecutionEngine/MCJIT/Inputs/multi-module-eh-b.ll
new file mode 100644
index 0000000..d7dbb03
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/Inputs/multi-module-eh-b.ll
@@ -0,0 +1,30 @@
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_begin_catch(i8*)
+
+@_ZTIi = external constant i8*
+
+define void @throwException_B() {
+  %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+}
+
+define i32 @FB() {
+entry:
+  invoke void @throwException_B()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  %p = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %e = extractvalue { i8*, i32 } %p, 0
+  call i8* @__cxa_begin_catch(i8* %e)
+  call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/MCJIT/cross-module-a.ll b/test/ExecutionEngine/MCJIT/cross-module-a.ll
new file mode 100644
index 0000000..fe8d386
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/cross-module-a.ll
@@ -0,0 +1,13 @@
+; RUN: %lli_mcjit -extra-module=%p/Inputs/cross-module-b.ll %s > /dev/null
+
+declare i32 @FB()
+
+define i32 @FA() {
+  ret i32 0
+}
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/cross-module-sm-pic-a.ll b/test/ExecutionEngine/MCJIT/cross-module-sm-pic-a.ll
new file mode 100644
index 0000000..ee26702
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/cross-module-sm-pic-a.ll
@@ -0,0 +1,14 @@
+; RUN: %lli_mcjit -extra-module=%p/Inputs/cross-module-b.ll -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386, arm
+
+declare i32 @FB()
+
+define i32 @FA() {
+  ret i32 0
+}
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/eh-lg-pic.ll b/test/ExecutionEngine/MCJIT/eh-lg-pic.ll
new file mode 100644
index 0000000..7c0227d
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/eh-lg-pic.ll
@@ -0,0 +1,32 @@
+; RUN: %lli_mcjit -relocation-model=pic -code-model=large %s
+; XFAIL: cygwin, win32, mingw, mips, powerpc64, i686, i386, aarch64, arm
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_begin_catch(i8*)
+
+@_ZTIi = external constant i8*
+
+define void @throwException() {
+  %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+}
+
+define i32 @main() {
+entry:
+  invoke void @throwException()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  %p = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %e = extractvalue { i8*, i32 } %p, 0
+  call i8* @__cxa_begin_catch(i8* %e)
+  call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/MCJIT/eh-sm-pic.ll b/test/ExecutionEngine/MCJIT/eh-sm-pic.ll
new file mode 100644
index 0000000..00c2bb0
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/eh-sm-pic.ll
@@ -0,0 +1,32 @@
+; RUN: %lli_mcjit -relocation-model=pic -code-model=small %s
+; XFAIL: cygwin, win32, mingw, mips, i686, i386, darwin, aarch64, arm
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_begin_catch(i8*)
+
+@_ZTIi = external constant i8*
+
+define void @throwException() {
+  %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+}
+
+define i32 @main() {
+entry:
+  invoke void @throwException()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  %p = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %e = extractvalue { i8*, i32 } %p, 0
+  call i8* @__cxa_begin_catch(i8* %e)
+  call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/MCJIT/hello-sm-pic.ll b/test/ExecutionEngine/MCJIT/hello-sm-pic.ll
new file mode 100644
index 0000000..115846c
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/hello-sm-pic.ll
@@ -0,0 +1,12 @@
+; RUN: %lli_mcjit -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386, darwin, aarch64, arm
+
+@.LC0 = internal global [12 x i8] c"Hello World\00"		; <[12 x i8]*> [#uses=1]
+
+declare i32 @puts(i8*)
+
+define i32 @main() {
+	%reg210 = call i32 @puts( i8* getelementptr ([12 x i8]* @.LC0, i64 0, i64 0) )		; <i32> [#uses=0]
+	ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/lit.local.cfg b/test/ExecutionEngine/MCJIT/lit.local.cfg
index f21468f..fdb36ee 100644
--- a/test/ExecutionEngine/MCJIT/lit.local.cfg
+++ b/test/ExecutionEngine/MCJIT/lit.local.cfg
@@ -1,12 +1,4 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
-def getRoot(config):
-    if not config.parent:
-        return config
-    return getRoot(config.parent)
-
-root = getRoot(config)
-
+root = config.root
 targets = set(root.targets_to_build.split())
 if ('X86' in targets) | ('AArch64' in targets) | ('ARM' in targets) | \
    ('Mips' in targets) | ('PowerPC' in targets) | ('SystemZ' in targets):
@@ -23,8 +15,12 @@ if root.host_arch not in ['i386', 'x86', 'x86_64',
 if 'armv7' in root.host_arch:
     config.unsupported = False
 
-if 'i386-apple-darwin'  in root.target_triple:
+if 'i386-apple-darwin' in root.target_triple:
     config.unsupported = True
 
 if 'powerpc' in root.target_triple and not 'powerpc64' in root.target_triple:
     config.unsupported = True
+
+# ExecutionEngine tests are not expected to pass in a cross-compilation setup.
+if 'native' not in config.available_features:
+    config.unsupported = True
diff --git a/test/ExecutionEngine/MCJIT/multi-module-a.ll b/test/ExecutionEngine/MCJIT/multi-module-a.ll
new file mode 100644
index 0000000..8848ca6
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/multi-module-a.ll
@@ -0,0 +1,9 @@
+; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll %s > /dev/null
+
+declare i32 @FB()
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll b/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll
new file mode 100644
index 0000000..66fafc9
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/multi-module-eh-a.ll
@@ -0,0 +1,35 @@
+; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-eh-b.ll %s
+; XFAIL: arm, cygwin, win32, mingw
+declare i8* @__cxa_allocate_exception(i64)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_begin_catch(i8*)
+
+@_ZTIi = external constant i8*
+
+declare i32 @FB()
+
+define void @throwException() {
+  %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+  call void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+  unreachable
+}
+
+define i32 @main() {
+entry:
+  invoke void @throwException()
+          to label %try.cont unwind label %lpad
+
+lpad:
+  %p = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* bitcast (i8** @_ZTIi to i8*)
+  %e = extractvalue { i8*, i32 } %p, 0
+  call i8* @__cxa_begin_catch(i8* %e)
+  call void @__cxa_end_catch()
+  br label %try.cont
+
+try.cont:
+  %r = call i32 @FB( )
+  ret i32 %r
+}
diff --git a/test/ExecutionEngine/MCJIT/multi-module-sm-pic-a.ll b/test/ExecutionEngine/MCJIT/multi-module-sm-pic-a.ll
new file mode 100644
index 0000000..f2fa59f
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/multi-module-sm-pic-a.ll
@@ -0,0 +1,10 @@
+; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386, arm
+
+declare i32 @FB()
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/remote/Inputs/cross-module-b.ll b/test/ExecutionEngine/MCJIT/remote/Inputs/cross-module-b.ll
new file mode 100644
index 0000000..6870117
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/remote/Inputs/cross-module-b.ll
@@ -0,0 +1,7 @@
+declare i32 @FA()
+
+define i32 @FB() {
+  %r = call i32 @FA( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/remote/Inputs/multi-module-b.ll b/test/ExecutionEngine/MCJIT/remote/Inputs/multi-module-b.ll
new file mode 100644
index 0000000..103b601
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/remote/Inputs/multi-module-b.ll
@@ -0,0 +1,7 @@
+declare i32 @FC()
+
+define i32 @FB() {
+  %r = call i32 @FC( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/remote/Inputs/multi-module-c.ll b/test/ExecutionEngine/MCJIT/remote/Inputs/multi-module-c.ll
new file mode 100644
index 0000000..b39306b
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/remote/Inputs/multi-module-c.ll
@@ -0,0 +1,4 @@
+define i32 @FC() {
+  ret i32 0
+}
+
diff --git a/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll b/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll
new file mode 100644
index 0000000..094d362
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/remote/cross-module-a.ll
@@ -0,0 +1,13 @@
+; RUN: %lli_mcjit -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target %s > /dev/null
+
+declare i32 @FB()
+
+define i32 @FA() {
+  ret i32 0
+}
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/remote/cross-module-sm-pic-a.ll b/test/ExecutionEngine/MCJIT/remote/cross-module-sm-pic-a.ll
new file mode 100644
index 0000000..bdaa9a0
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/remote/cross-module-sm-pic-a.ll
@@ -0,0 +1,14 @@
+; RUN: %lli_mcjit -extra-module=%p/Inputs/cross-module-b.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386, arm
+
+declare i32 @FB()
+
+define i32 @FA() {
+  ret i32 0
+}
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/remote/lit.local.cfg b/test/ExecutionEngine/MCJIT/remote/lit.local.cfg
index 39b2a95..6b192ae 100644
--- a/test/ExecutionEngine/MCJIT/remote/lit.local.cfg
+++ b/test/ExecutionEngine/MCJIT/remote/lit.local.cfg
@@ -1,11 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
-def getRoot(config):
-    if not config.parent:
-        return config
-    return getRoot(config.parent)
-
-root = getRoot(config)
-
-if 'armv4' in root.target_triple or 'armv5' in root.target_triple:
+if 'armv4' in config.root.target_triple or \
+        'armv5' in config.root.target_triple:
     config.unsupported = True
diff --git a/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll b/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll
new file mode 100644
index 0000000..91d0387
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/remote/multi-module-a.ll
@@ -0,0 +1,9 @@
+; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target %s > /dev/null
+
+declare i32 @FB()
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/remote/multi-module-sm-pic-a.ll b/test/ExecutionEngine/MCJIT/remote/multi-module-sm-pic-a.ll
new file mode 100644
index 0000000..73228e4
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/remote/multi-module-sm-pic-a.ll
@@ -0,0 +1,10 @@
+; RUN: %lli_mcjit -extra-module=%p/Inputs/multi-module-b.ll -extra-module=%p/Inputs/multi-module-c.ll -disable-lazy-compilation=true -remote-mcjit -mcjit-remote-process=lli-child-target -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, i686, i386, arm
+
+declare i32 @FB()
+
+define i32 @main() {
+  %r = call i32 @FB( )   ; <i32> [#uses=1]
+  ret i32 %r
+}
+
diff --git a/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll b/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
index f1a69d8..d10a411 100644
--- a/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/simpletest-remote.ll
@@ -1,5 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit %s > /dev/null
-; XFAIL:  mips
+; RUN: %lli_mcjit -remote-mcjit -mcjit-remote-process=lli-child-target %s > /dev/null
 
 define i32 @bar() {
 	ret i32 0
diff --git a/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll b/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
index 47a710d..97932bc 100644
--- a/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/stubs-remote.ll
@@ -1,5 +1,6 @@
-; RUN: %lli_mcjit -remote-mcjit -disable-lazy-compilation=false %s
-; XFAIL:  mips
+; RUN: %lli_mcjit -remote-mcjit -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target %s
+; XFAIL: *
+; This test should fail until remote symbol resolution is supported.
 
 define i32 @main() nounwind {
 entry:
diff --git a/test/ExecutionEngine/MCJIT/remote/stubs-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/stubs-sm-pic.ll
new file mode 100644
index 0000000..88faf21
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/remote/stubs-sm-pic.ll
@@ -0,0 +1,37 @@
+; RUN: %lli_mcjit -remote-mcjit -disable-lazy-compilation=false -relocation-model=pic -code-model=small %s
+; XFAIL: *
+; This function should fail until remote symbol resolution is supported.
+
+define i32 @main() nounwind {
+entry:
+	call void @lazily_compiled_address_is_consistent()
+	ret i32 0
+}
+
+; Test PR3043: @test should have the same address before and after
+; it's JIT-compiled.
+@funcPtr = common global i1 ()* null, align 4
+@lcaic_failure = internal constant [46 x i8] c"@lazily_compiled_address_is_consistent failed\00"
+
+define void @lazily_compiled_address_is_consistent() nounwind {
+entry:
+	store i1 ()* @test, i1 ()** @funcPtr
+	%pass = tail call i1 @test()		; <i32> [#uses=1]
+	br i1 %pass, label %pass_block, label %fail_block
+pass_block:
+	ret void
+fail_block:
+	call i32 @puts(i8* getelementptr([46 x i8]* @lcaic_failure, i32 0, i32 0))
+	call void @exit(i32 1)
+	unreachable
+}
+
+define i1 @test() nounwind {
+entry:
+	%tmp = load i1 ()** @funcPtr
+	%eq = icmp eq i1 ()* %tmp, @test
+	ret i1 %eq
+}
+
+declare i32 @puts(i8*) noreturn
+declare void @exit(i32) noreturn
diff --git a/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
index eb2e686..6328089 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-common-symbols-remote.ll
@@ -1,5 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -O0 -disable-lazy-compilation=false %s
-; XFAIL: mips
+; RUN: %lli_mcjit -remote-mcjit -O0 -disable-lazy-compilation=false -mcjit-remote-process=lli-child-target %s
 
 ; The intention of this test is to verify that symbols mapped to COMMON in ELF
 ; work as expected.
diff --git a/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll
index 874eeae..6b2b97b 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-data-align-remote.ll
@@ -1,5 +1,4 @@
-; RUN:  %lli_mcjit -remote-mcjit -O0 %s
-; XFAIL: mips
+; RUN:  %lli_mcjit -remote-mcjit -O0 -mcjit-remote-process=lli-child-target %s
 
 ; Check that a variable is always aligned as specified.
 
diff --git a/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll
index d7e8c35..a8a93a8 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-fp-no-external-funcs-remote.ll
@@ -1,5 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit %s > /dev/null
-; XFAIL:  mips
+; RUN: %lli_mcjit -remote-mcjit -mcjit-remote-process=lli-child-target %s > /dev/null
 
 define double @test(double* %DP, double %Arg) {
 	%D = load double* %DP		; <double> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll
index 5337c5d..4181fb0 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-remote.ll
@@ -1,5 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit %s > /dev/null
-; XFAIL: mips
+; RUN: %lli_mcjit -remote-mcjit -mcjit-remote-process=lli-child-target %s > /dev/null
 
 @count = global i32 1, align 4
 
diff --git a/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll
new file mode 100644
index 0000000..29ab24b
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/remote/test-global-init-nonzero-sm-pic.ll
@@ -0,0 +1,35 @@
+; RUN: %lli_mcjit -remote-mcjit -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, aarch64, arm, i686, i386
+
+@count = global i32 1, align 4
+
+define i32 @main() nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 49
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* @count, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* @count, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32* %i, align 4
+  %inc1 = add nsw i32 %2, 1
+  store i32 %inc1, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %3 = load i32* @count, align 4
+  %sub = sub nsw i32 %3, 50
+  ret i32 %sub
+}
diff --git a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll
index bd044b6..8b56297 100644
--- a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll
+++ b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-remote.ll
@@ -1,4 +1,4 @@
-; RUN: %lli_mcjit -remote-mcjit -O0 %s
+; RUN: %lli_mcjit -remote-mcjit -O0 -mcjit-remote-process=lli-child-target %s
 
 @.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
 @ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4
diff --git a/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll
new file mode 100644
index 0000000..bad026f
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/remote/test-ptr-reloc-sm-pic.ll
@@ -0,0 +1,17 @@
+; RUN: %lli_mcjit -remote-mcjit -O0 -relocation-model=pic -code-model=small %s
+; XFAIL: mips, aarch64, arm, i686, i386
+
+@.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
+@ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4
+@.str1 = private unnamed_addr constant [6 x i8] c"data2\00", align 1
+@ptr2 = global i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), align 4
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readonly {
+entry:
+  %0 = load i8** @ptr, align 4
+  %1 = load i8** @ptr2, align 4
+  %cmp = icmp eq i8* %0, %1
+  %. = zext i1 %cmp to i32
+  ret i32 %.
+}
+
diff --git a/test/ExecutionEngine/MCJIT/stubs-sm-pic.ll b/test/ExecutionEngine/MCJIT/stubs-sm-pic.ll
new file mode 100644
index 0000000..9e214f5
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/stubs-sm-pic.ll
@@ -0,0 +1,36 @@
+; RUN: %lli_mcjit -disable-lazy-compilation=false -relocation-model=pic -code-model=small %s
+; XFAIL: mips, i686, i386, aarch64, arm
+
+define i32 @main() nounwind {
+entry:
+	call void @lazily_compiled_address_is_consistent()
+	ret i32 0
+}
+
+; Test PR3043: @test should have the same address before and after
+; it's JIT-compiled.
+@funcPtr = common global i1 ()* null, align 4
+@lcaic_failure = internal constant [46 x i8] c"@lazily_compiled_address_is_consistent failed\00"
+
+define void @lazily_compiled_address_is_consistent() nounwind {
+entry:
+	store i1 ()* @test, i1 ()** @funcPtr
+	%pass = tail call i1 @test()		; <i32> [#uses=1]
+	br i1 %pass, label %pass_block, label %fail_block
+pass_block:
+	ret void
+fail_block:
+	call i32 @puts(i8* getelementptr([46 x i8]* @lcaic_failure, i32 0, i32 0))
+	call void @exit(i32 1)
+	unreachable
+}
+
+define i1 @test() nounwind {
+entry:
+	%tmp = load i1 ()** @funcPtr
+	%eq = icmp eq i1 ()* %tmp, @test
+	ret i1 %eq
+}
+
+declare i32 @puts(i8*) noreturn
+declare void @exit(i32) noreturn
diff --git a/test/ExecutionEngine/MCJIT/test-global-init-nonzero-sm-pic.ll b/test/ExecutionEngine/MCJIT/test-global-init-nonzero-sm-pic.ll
new file mode 100644
index 0000000..eb031f2
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-global-init-nonzero-sm-pic.ll
@@ -0,0 +1,35 @@
+; RUN: %lli_mcjit -relocation-model=pic -code-model=small %s > /dev/null
+; XFAIL: mips, aarch64, arm, i686, i386
+
+@count = global i32 1, align 4
+
+define i32 @main() nounwind uwtable {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 49
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32* @count, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* @count, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %2 = load i32* %i, align 4
+  %inc1 = add nsw i32 %2, 1
+  store i32 %inc1, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %3 = load i32* @count, align 4
+  %sub = sub nsw i32 %3, 50
+  ret i32 %sub
+}
diff --git a/test/ExecutionEngine/MCJIT/test-ptr-reloc-sm-pic.ll b/test/ExecutionEngine/MCJIT/test-ptr-reloc-sm-pic.ll
new file mode 100644
index 0000000..9e06742
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-ptr-reloc-sm-pic.ll
@@ -0,0 +1,17 @@
+; RUN: %lli_mcjit -O0 -relocation-model=pic -code-model=small %s
+; XFAIL: mips, aarch64, arm, i686, i386
+
+@.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
+@ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4
+@.str1 = private unnamed_addr constant [6 x i8] c"data2\00", align 1
+@ptr2 = global i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), align 4
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readonly {
+entry:
+  %0 = load i8** @ptr, align 4
+  %1 = load i8** @ptr2, align 4
+  %cmp = icmp eq i8* %0, %1
+  %. = zext i1 %cmp to i32
+  ret i32 %.
+}
+
diff --git a/test/ExecutionEngine/RuntimeDyld/Inputs/arm_secdiff_reloc.o b/test/ExecutionEngine/RuntimeDyld/Inputs/arm_secdiff_reloc.o
new file mode 100644
index 0000000..5392266
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/Inputs/arm_secdiff_reloc.o
diff --git a/test/ExecutionEngine/RuntimeDyld/arm_secdiff_reloc.test b/test/ExecutionEngine/RuntimeDyld/arm_secdiff_reloc.test
new file mode 100644
index 0000000..92e4dd7
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/arm_secdiff_reloc.test
@@ -0,0 +1 @@
+RUN: llvm-rtdyld -printline %p/Inputs/arm_secdiff_reloc.o
diff --git a/test/ExecutionEngine/fma3-jit.ll b/test/ExecutionEngine/fma3-jit.ll
new file mode 100644
index 0000000..25eaa65
--- /dev/null
+++ b/test/ExecutionEngine/fma3-jit.ll
@@ -0,0 +1,18 @@
+; RUN: %lli %s | FileCheck %s
+; REQUIRES: fma3
+; CHECK: 12.000000
+
+@msg_double = internal global [4 x i8] c"%f\0A\00"
+
+declare i32 @printf(i8*, ...)
+
+define i32 @main() {
+  %fma = tail call double @llvm.fma.f64(double 3.0, double 3.0, double 3.0) nounwind readnone
+
+  %ptr1 = getelementptr [4 x i8]* @msg_double, i32 0, i32 0
+  call i32 (i8*,...)* @printf(i8* %ptr1, double %fma)
+
+  ret i32 0
+}
+
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
diff --git a/test/ExecutionEngine/lit.local.cfg b/test/ExecutionEngine/lit.local.cfg
index b6945ad..28c56ad 100644
--- a/test/ExecutionEngine/lit.local.cfg
+++ b/test/ExecutionEngine/lit.local.cfg
@@ -1,14 +1,9 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
-def getRoot(config):
-    if not config.parent:
-        return config
-    return getRoot(config.parent)
-
-root = getRoot(config)
+if config.root.host_arch in ['PowerPC', 'AArch64', 'SystemZ']:
+    config.unsupported = True
 
-if root.host_arch in ['PowerPC', 'AArch64', 'SystemZ']:
+if 'hexagon' in config.root.target_triple:
     config.unsupported = True
 
-if 'hexagon' in root.target_triple:
+# ExecutionEngine tests are not expected to pass in a cross-compilation setup.
+if 'native' not in config.available_features:
     config.unsupported = True
diff --git a/test/ExecutionEngine/test-interp-vec-insertelement.ll b/test/ExecutionEngine/test-interp-vec-insertelement.ll
new file mode 100644
index 0000000..814b905
--- /dev/null
+++ b/test/ExecutionEngine/test-interp-vec-insertelement.ll
@@ -0,0 +1,41 @@
+ ; RUN: %lli -force-interpreter=true %s > /dev/null
+
+define i32 @main() {
+ %v0 = insertelement <2 x i8> zeroinitializer, i8 1, i32 1
+ %v1 = insertelement <3 x i8> zeroinitializer, i8 2, i32 2
+ %v2 = insertelement <4 x i8> zeroinitializer, i8 3, i32 3
+ %v3 = insertelement <8 x i8> zeroinitializer, i8 4, i32 4
+ %v4 = insertelement <16 x i8> zeroinitializer, i8 5, i32 7
+
+ %v5 = insertelement <2 x i16> zeroinitializer, i16 1, i32 1
+ %v6 = insertelement <3 x i16> zeroinitializer, i16 2, i32 2
+ %v7 = insertelement <4 x i16> zeroinitializer, i16 3, i32 3
+ %v8 = insertelement <8 x i16> zeroinitializer, i16 4, i32 4
+ %v9 = insertelement <16 x i16> zeroinitializer, i16 5, i32 7
+
+ %v10 = insertelement <2 x i32> zeroinitializer, i32 1, i32 1
+ %v11 = insertelement <3 x i32> zeroinitializer, i32 2, i32 2
+ %v12 = insertelement <4 x i32> zeroinitializer, i32 3, i32 3
+ %v13 = insertelement <8 x i32> zeroinitializer, i32 4, i32 4
+ %v14 = insertelement <16 x i32> zeroinitializer, i32 5, i32 7
+
+ %v15 = insertelement <2 x i64> zeroinitializer, i64 1, i32 1
+ %v16 = insertelement <3 x i64> zeroinitializer, i64 2, i32 2
+ %v17 = insertelement <4 x i64> zeroinitializer, i64 3, i32 3
+ %v18 = insertelement <8 x i64> zeroinitializer, i64 4, i32 4
+ %v19 = insertelement <16 x i64> zeroinitializer, i64 5, i32 7
+
+ %v20 = insertelement <2 x float> zeroinitializer, float 1.0, i32 1
+ %v21 = insertelement <3 x float> zeroinitializer, float 2.0, i32 2
+ %v22 = insertelement <4 x float> zeroinitializer, float 3.0, i32 3
+ %v23 = insertelement <8 x float> zeroinitializer, float 4.0, i32 4
+ %v24 = insertelement <16 x float> zeroinitializer, float 5.0, i32 7
+
+ %v25 = insertelement <2 x double> zeroinitializer, double 1.0, i32 1
+ %v26 = insertelement <3 x double> zeroinitializer, double 2.0, i32 2
+ %v27 = insertelement <4 x double> zeroinitializer, double 3.0, i32 3
+ %v28 = insertelement <8 x double> zeroinitializer, double 4.0, i32 4
+ %v29 = insertelement <16 x double> zeroinitializer, double 5.0, i32 7
+
+ ret i32 0
+}
diff --git a/test/ExecutionEngine/test-interp-vec-insertextractvalue.ll b/test/ExecutionEngine/test-interp-vec-insertextractvalue.ll
new file mode 100644
index 0000000..09fbf6a
--- /dev/null
+++ b/test/ExecutionEngine/test-interp-vec-insertextractvalue.ll
@@ -0,0 +1,21 @@
+ ; RUN: %lli -force-interpreter=true %s > /dev/null
+
+define i32 @main() {
+
+    %s1 = insertvalue { i32, { float, double} } undef, i32 9, 0
+    %s2 = insertvalue { i32, { float, double} } %s1, float 3.0, 1, 0
+    %s3 = insertvalue { i32, { float, double} } %s2, double 5.0, 1, 1
+
+    %s4 = extractvalue { i32, { float, double} } %s3, 1
+
+    %a1 = extractvalue { i32, { float, double} } %s3, 0
+
+    %a2 = extractvalue { i32, { float, double} } %s3, 1, 0
+    %a3 = extractvalue { i32, { float, double} } %s3, 1, 1
+    %a4 = extractvalue { float, double} %s4, 0
+    %a5 = extractvalue { float, double} %s4, 1
+
+    %aa = fpext float %a4 to double
+
+ ret i32 0
+}
diff --git a/test/ExecutionEngine/test-interp-vec-select.ll b/test/ExecutionEngine/test-interp-vec-select.ll
new file mode 100644
index 0000000..ce086e4
--- /dev/null
+++ b/test/ExecutionEngine/test-interp-vec-select.ll
@@ -0,0 +1,118 @@
+; RUN: %lli -force-interpreter=true %s > /dev/null
+
+define i32 @main() {
+
+  ; Vector values
+  %a2_i8 = add <2 x i8> zeroinitializer, <i8 0, i8 1>
+  %a3_i8 = add <3 x i8> zeroinitializer, <i8 0, i8 1, i8 2>
+  %a4_i8 = add <4 x i8> zeroinitializer, <i8 0, i8 1, i8 2, i8 3>
+  %a8_i8 = add <8 x i8> zeroinitializer, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+  %a16_i8 = add <16 x i8> zeroinitializer, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>
+
+  %a2_i16 = add <2 x i16> zeroinitializer, <i16 0, i16 1>
+  %a3_i16 = add <3 x i16> zeroinitializer, <i16 0, i16 1, i16 2>
+  %a4_i16 = add <4 x i16> zeroinitializer, <i16 0, i16 1, i16 2, i16 3>
+  %a8_i16 = add <8 x i16> zeroinitializer, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+  %a16_i16 = add <16 x i16> zeroinitializer, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+
+  %a2_i32 = add <2 x i32> zeroinitializer, <i32 0, i32 1>
+  %a3_i32 = add <3 x i32> zeroinitializer, <i32 0, i32 1, i32 2>
+  %a4_i32 = add <4 x i32> zeroinitializer, <i32 0, i32 1, i32 2, i32 3>
+  %a8_i32 = add <8 x i32> zeroinitializer, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %a16_i32 = add <16 x i32> zeroinitializer, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %a2_i64 = add <2 x i64> zeroinitializer, <i64 0, i64 1>
+  %a3_i64 = add <3 x i64> zeroinitializer, <i64 0, i64 1, i64 2>
+  %a4_i64 = add <4 x i64> zeroinitializer, <i64 0, i64 1, i64 2, i64 3>
+  %a8_i64 = add <8 x i64> zeroinitializer, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+  %a16_i64 = add <16 x i64> zeroinitializer, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+
+  %a2_float = fadd <2 x float> zeroinitializer, <float 0.0, float 1.0>
+  %a3_float = fadd <3 x float> zeroinitializer, <float 0.0, float 1.0, float 2.0>
+  %a4_float = fadd <4 x float> zeroinitializer, <float 0.0, float 1.0, float 2.0, float 3.0>
+  %a8_float = fadd <8 x float> zeroinitializer, <float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>
+  %a16_float = fadd <16 x float> zeroinitializer, <float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>
+
+  %a2_double = fadd <2 x double> zeroinitializer, <double 0.0, double 1.0>
+  %a3_double = fadd <3 x double> zeroinitializer, <double 0.0, double 1.0, double 2.0>
+  %a4_double = fadd <4 x double> zeroinitializer, <double 0.0, double 1.0, double 2.0, double 3.0>
+  %a8_double = fadd <8 x double> zeroinitializer, <double 0.0, double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>
+  %a16_double = fadd <16 x double> zeroinitializer, <double 0.0, double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0>
+
+  %b2_i8  = sub <2 x i8> zeroinitializer, %a2_i8
+  %b3_i8  = sub <3 x i8> zeroinitializer, %a3_i8
+  %b4_i8  = sub <4 x i8> zeroinitializer, %a4_i8
+  %b8_i8  = sub <8 x i8> zeroinitializer, %a8_i8
+  %b16_i8 = sub <16 x i8> zeroinitializer, %a16_i8
+
+  %b2_i16  = sub <2 x i16> zeroinitializer, %a2_i16
+  %b3_i16  = sub <3 x i16> zeroinitializer, %a3_i16
+  %b4_i16  = sub <4 x i16> zeroinitializer, %a4_i16
+  %b8_i16  = sub <8 x i16> zeroinitializer, %a8_i16
+  %b16_i16 = sub <16 x i16> zeroinitializer, %a16_i16
+
+  %b2_i32  = sub <2 x i32> zeroinitializer, %a2_i32
+  %b3_i32  = sub <3 x i32> zeroinitializer, %a3_i32
+  %b4_i32  = sub <4 x i32> zeroinitializer, %a4_i32
+  %b8_i32  = sub <8 x i32> zeroinitializer, %a8_i32
+  %b16_i32 = sub <16 x i32> zeroinitializer, %a16_i32
+
+  %b2_i64  = sub <2 x i64> zeroinitializer, %a2_i64
+  %b3_i64  = sub <3 x i64> zeroinitializer, %a3_i64
+  %b4_i64  = sub <4 x i64> zeroinitializer, %a4_i64
+  %b8_i64  = sub <8 x i64> zeroinitializer, %a8_i64
+  %b16_i64 = sub <16 x i64> zeroinitializer, %a16_i64
+
+  %b2_float  = fsub <2 x float> zeroinitializer, %a2_float
+  %b3_float  = fsub <3 x float> zeroinitializer, %a3_float
+  %b4_float  = fsub <4 x float> zeroinitializer, %a4_float
+  %b8_float  = fsub <8 x float> zeroinitializer, %a8_float
+  %b16_float = fsub <16 x float> zeroinitializer, %a16_float
+
+  %b2_double  = fsub <2 x double> zeroinitializer, %a2_double
+  %b3_double  = fsub <3 x double> zeroinitializer, %a3_double
+  %b4_double  = fsub <4 x double> zeroinitializer, %a4_double
+  %b8_double  = fsub <8 x double> zeroinitializer, %a8_double
+  %b16_double = fsub <16 x double> zeroinitializer, %a16_double
+
+
+
+  %v0 = select <2 x i1> <i1 true, i1 false>, <2 x i8> %a2_i8, <2 x i8> %b2_i8
+  %v1 = select <3 x i1> <i1 true, i1 false, i1 true>, <3 x i8> %a3_i8, <3 x i8> %b3_i8
+  %v2 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i8> %a4_i8, <4 x i8> %b4_i8
+  %v3 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i8> %a8_i8, <8 x i8> %b8_i8
+  %v4 = select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> %a16_i8, <16 x i8> %b16_i8
+
+  %v5 = select <2 x i1> <i1 true, i1 false>, <2 x i16> %a2_i16, <2 x i16> %b2_i16
+  %v6 = select <3 x i1> <i1 true, i1 false, i1 true>, <3 x i16> %a3_i16, <3 x i16> %b3_i16
+  %v7 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i16> %a4_i16, <4 x i16> %b4_i16
+  %v8 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a8_i16, <8 x i16> %b8_i16
+  %v9 = select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i16> %a16_i16, <16 x i16> %b16_i16
+
+  %v10 = select <2 x i1> <i1 true, i1 false>, <2 x i32> %a2_i32, <2 x i32> %b2_i32
+  %v11 = select <3 x i1> <i1 true, i1 false, i1 true>, <3 x i32> %a3_i32, <3 x i32> %b3_i32
+  %v12 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %a4_i32, <4 x i32> %b4_i32
+  %v13 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i32> %a8_i32, <8 x i32> %b8_i32
+  %v14 = select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i32> %a16_i32, <16 x i32> %b16_i32
+
+  %v15 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a2_i64, <2 x i64> %b2_i64
+  %v16 = select <3 x i1> <i1 true, i1 false, i1 true>, <3 x i64> %a3_i64, <3 x i64> %b3_i64
+  %v17 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i64> %a4_i64, <4 x i64> %b4_i64
+  %v18 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i64> %a8_i64, <8 x i64> %b8_i64
+  %v19 = select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i64> %a16_i64, <16 x i64> %b16_i64
+
+  %v20 = select <2 x i1> <i1 true, i1 false>, <2 x float> %a2_float, <2 x float> %b2_float
+  %v21 = select <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> %a3_float, <3 x float> %b3_float
+  %v22 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a4_float, <4 x float> %b4_float
+  %v23 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %a8_float, <8 x float> %b8_float
+  %v24 = select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x float> %a16_float, <16 x float> %b16_float
+
+  %v25 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a2_double, <2 x double> %b2_double
+  %v26 = select <3 x i1> <i1 true, i1 false, i1 true>, <3 x double> %a3_double, <3 x double> %b3_double
+  %v27 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %a4_double, <4 x double> %b4_double
+  %v28 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x double> %a8_double, <8 x double> %b8_double
+  %v29 = select <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x double> %a16_double, <16 x double> %b16_double
+
+
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/test-interp-vec-shuffle.ll b/test/ExecutionEngine/test-interp-vec-shuffle.ll
new file mode 100644
index 0000000..e55fa99
--- /dev/null
+++ b/test/ExecutionEngine/test-interp-vec-shuffle.ll
@@ -0,0 +1,81 @@
+; RUN: %lli -force-interpreter=true %s > /dev/null
+
+define i32 @main() {
+
+  ; Vector values
+  %a2_i8 = add <2 x i8> zeroinitializer, <i8 0, i8 1>
+  %a3_i8 = add <3 x i8> zeroinitializer, <i8 0, i8 1, i8 2>
+  %a4_i8 = add <4 x i8> zeroinitializer, <i8 0, i8 1, i8 2, i8 3>
+  %a8_i8 = add <8 x i8> zeroinitializer, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+  %a16_i8 = add <16 x i8> zeroinitializer, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>
+
+  %a2_i16 = add <2 x i16> zeroinitializer, <i16 0, i16 1>
+  %a3_i16 = add <3 x i16> zeroinitializer, <i16 0, i16 1, i16 2>
+  %a4_i16 = add <4 x i16> zeroinitializer, <i16 0, i16 1, i16 2, i16 3>
+  %a8_i16 = add <8 x i16> zeroinitializer, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+  %a16_i16 = add <16 x i16> zeroinitializer, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+
+  %a2_i32 = add <2 x i32> zeroinitializer, <i32 0, i32 1>
+  %a3_i32 = add <3 x i32> zeroinitializer, <i32 0, i32 1, i32 2>
+  %a4_i32 = add <4 x i32> zeroinitializer, <i32 0, i32 1, i32 2, i32 3>
+  %a8_i32 = add <8 x i32> zeroinitializer, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %a16_i32 = add <16 x i32> zeroinitializer, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %a2_i64 = add <2 x i64> zeroinitializer, <i64 0, i64 1>
+  %a3_i64 = add <3 x i64> zeroinitializer, <i64 0, i64 1, i64 2>
+  %a4_i64 = add <4 x i64> zeroinitializer, <i64 0, i64 1, i64 2, i64 3>
+  %a8_i64 = add <8 x i64> zeroinitializer, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+  %a16_i64 = add <16 x i64> zeroinitializer, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+
+  %a2_float = fadd <2 x float> zeroinitializer, <float 0.0, float 1.0>
+  %a3_float = fadd <3 x float> zeroinitializer, <float 0.0, float 1.0, float 2.0>
+  %a4_float = fadd <4 x float> zeroinitializer, <float 0.0, float 1.0, float 2.0, float 3.0>
+  %a8_float = fadd <8 x float> zeroinitializer, <float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>
+  %a16_float = fadd <16 x float> zeroinitializer, <float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>
+
+  %a2_double = fadd <2 x double> zeroinitializer, <double 0.0, double 1.0>
+  %a3_double = fadd <3 x double> zeroinitializer, <double 0.0, double 1.0, double 2.0>
+  %a4_double = fadd <4 x double> zeroinitializer, <double 0.0, double 1.0, double 2.0, double 3.0>
+  %a8_double = fadd <8 x double> zeroinitializer, <double 0.0, double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>
+  %a16_double = fadd <16 x double> zeroinitializer, <double 0.0, double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0>
+
+
+  %v0 = shufflevector <2 x i8> %a2_i8, <2 x i8>undef, <2 x i32> <i32 1, i32 0>
+  %v1 = shufflevector <3 x i8> %a3_i8, <3 x i8>undef, <3 x i32> <i32 2, i32 1, i32 0>
+  %v2 = shufflevector <4 x i8> %a4_i8, <4 x i8>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %v3 = shufflevector <8 x i8> %a8_i8, <8 x i8>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v4 = shufflevector <16 x i8> %a16_i8, <16 x i8>undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  %v5 = shufflevector <2 x i16> %a2_i16, <2 x i16>undef, <2 x i32> <i32 1, i32 0>
+  %v6 = shufflevector <3 x i16> %a3_i16, <3 x i16>undef, <3 x i32> <i32 2, i32 1, i32 0>
+  %v7 = shufflevector <4 x i16> %a4_i16, <4 x i16>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %v8 = shufflevector <8 x i16> %a8_i16, <8 x i16>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v9 = shufflevector <16 x i16> %a16_i16, <16 x i16>undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  %v10 = shufflevector <2 x i32> %a2_i32, <2 x i32>undef, <2 x i32> <i32 1, i32 0>
+  %v11 = shufflevector <3 x i32> %a3_i32, <3 x i32>undef, <3 x i32> <i32 2, i32 1, i32 0>
+  %v12 = shufflevector <4 x i32> %a4_i32, <4 x i32>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %v13 = shufflevector <8 x i32> %a8_i32, <8 x i32>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v14 = shufflevector <16 x i32> %a16_i32, <16 x i32>undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  %v15 = shufflevector <2 x i64> %a2_i64, <2 x i64>undef, <2 x i32> <i32 1, i32 0>
+  %v16 = shufflevector <3 x i64> %a3_i64, <3 x i64>undef, <3 x i32> <i32 2, i32 1, i32 0>
+  %v17 = shufflevector <4 x i64> %a4_i64, <4 x i64>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %v18 = shufflevector <8 x i64> %a8_i64, <8 x i64>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v19 = shufflevector <16 x i64> %a16_i64, <16 x i64>undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  %v20 = shufflevector <2 x float> %a2_float, <2 x float>undef, <2 x i32> <i32 1, i32 0>
+  %v21 = shufflevector <3 x float> %a3_float, <3 x float>undef, <3 x i32> <i32 2, i32 1, i32 0>
+  %v22 = shufflevector <4 x float> %a4_float, <4 x float>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %v23 = shufflevector <8 x float> %a8_float, <8 x float>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v24 = shufflevector <16 x float> %a16_float, <16 x float>undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  %v25 = shufflevector <2 x double> %a2_double, <2 x double>undef, <2 x i32> <i32 1, i32 0>
+  %v26 = shufflevector <3 x double> %a3_double, <3 x double>undef, <3 x i32> <i32 2, i32 1, i32 0>
+  %v27 = shufflevector <4 x double> %a4_double, <4 x double>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %v28 = shufflevector <8 x double> %a8_double, <8 x double>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v29 = shufflevector <16 x double> %a16_double, <16 x double>undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+
+  ret i32 0
+}
+
diff --git a/test/Feature/lit.local.cfg b/test/Feature/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Feature/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Feature/md_on_instruction.ll b/test/Feature/md_on_instruction.ll
index 8599601..955ace3 100644
--- a/test/Feature/md_on_instruction.ll
+++ b/test/Feature/md_on_instruction.ll
@@ -16,9 +16,12 @@ declare void @llvm.dbg.func.start(metadata) nounwind readnone
 
 declare void @llvm.dbg.region.end(metadata) nounwind readnone
 
+!llvm.module.flags = !{!6}
+
 !0 = metadata !{i32 458798, i32 0, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", metadata !1, i32 1, metadata !2, i1 false, i1 true}
 !1 = metadata !{i32 458769, metadata !4, i32 12, metadata !"clang 1.0", i1 true, metadata !"", i32 0, metadata !5, metadata !5, metadata !4, null, null, metadata !""}
 !2 = metadata !{i32 458788, null, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
 !3 = metadata !{i32 1, i32 13, metadata !1, metadata !1}
 !4 = metadata !{metadata !"foo.c", metadata !"/tmp"}
 !5 = metadata !{i32 0}
+!6 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Feature/newcasts.ll b/test/Feature/newcasts.ll
index 4cfc8bc..dd47484 100644
--- a/test/Feature/newcasts.ll
+++ b/test/Feature/newcasts.ll
@@ -20,6 +20,9 @@ define void @"NewCasts" (i16 %x) {
   %p = uitofp <4 x i32> %n to <4 x float>
   %q = fptosi <4 x float> %p to <4 x i32>
   %r = fptoui <4 x float> %p to <4 x i32>
+  %s = inttoptr <4 x i32> %n to <4 x i32*>
+  %t = addrspacecast <4 x i32*> %s to <4 x i32 addrspace(1)*>
+  %z = addrspacecast <4 x i32*> %s to <4 x float addrspace(2)*>
   ret void
 }
 
diff --git a/test/Feature/optnone.ll b/test/Feature/optnone.ll
new file mode 100644
index 0000000..7d8afd4
--- /dev/null
+++ b/test/Feature/optnone.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+; Check for the presence of attribute optnone in the disassembly.
+
+; CHECK: @foo() #0
+define void @foo() #0 {
+  ret void
+}
+
+; CHECK: attributes #0 = { noinline optnone }
+attributes #0 = { optnone noinline }
+
diff --git a/test/Feature/prefixdata.ll b/test/Feature/prefixdata.ll
new file mode 100644
index 0000000..b53945c
--- /dev/null
+++ b/test/Feature/prefixdata.ll
@@ -0,0 +1,18 @@
+; RUN: llvm-as < %s | llvm-dis > %t1.ll
+; RUN: FileCheck %s < %t1.ll
+; RUN: llvm-as < %t1.ll | llvm-dis > %t2.ll
+; RUN: diff %t1.ll %t2.ll
+; RUN: opt -O3 -S < %t1.ll | FileCheck %s
+
+; CHECK: @i
+@i = linkonce_odr global i32 1
+
+; CHECK: f(){{.*}}prefix i32 1
+define void @f() prefix i32 1 {
+  ret void
+}
+
+; CHECK: g(){{.*}}prefix i32* @i
+define void @g() prefix i32* @i {
+  ret void
+}
diff --git a/test/FileCheck/check-a-b-has-b.txt b/test/FileCheck/check-a-b-has-b.txt
new file mode 100644
index 0000000..4d64d09
--- /dev/null
+++ b/test/FileCheck/check-a-b-has-b.txt
@@ -0,0 +1,5 @@
+; RUN: FileCheck -check-prefix=A -check-prefix=B -input-file %s %s
+
+this is the string to be matched
+
+; B-DAG: this is the string to be {{matched}}
diff --git a/test/FileCheck/check-b-a-has-b.txt b/test/FileCheck/check-b-a-has-b.txt
new file mode 100644
index 0000000..ac14990
--- /dev/null
+++ b/test/FileCheck/check-b-a-has-b.txt
@@ -0,0 +1,5 @@
+; RUN: FileCheck -check-prefix=B -check-prefix=A -input-file %s %s
+
+this is the string to be matched
+
+; B-DAG: this is the string to be {{matched}}
diff --git a/test/FileCheck/check-dag-multi-prefix-2.txt b/test/FileCheck/check-dag-multi-prefix-2.txt
new file mode 100644
index 0000000..4add70d
--- /dev/null
+++ b/test/FileCheck/check-dag-multi-prefix-2.txt
@@ -0,0 +1,7 @@
+; RUN: FileCheck -check-prefix=A -input-file %s %s
+
+this should be matched
+
+; B-DAG: foo
+
+; A-DAG: {{this}} should be matched
diff --git a/test/FileCheck/check-dag-multi-prefix.txt b/test/FileCheck/check-dag-multi-prefix.txt
new file mode 100644
index 0000000..95dfe5a
--- /dev/null
+++ b/test/FileCheck/check-dag-multi-prefix.txt
@@ -0,0 +1,27 @@
+; RUN: FileCheck -check-prefix=A -check-prefix=B -input-file %s %s
+
+add r10, r1, r2
+add r11, r3, r4
+mul r5, r10, r11
+
+mul r11, r3, r4
+mul r10, r1, r2
+add r5, r10, r11
+
+add r11, r3, r4
+add r10, r1, r2
+mul r5, r10, r11
+
+; B-DAG: add [[REG1:r[0-9]+]], r1, r2
+; B-DAG: add [[REG2:r[0-9]+]], r3, r4
+; B: mul r5, [[REG1]], [[REG2]]
+
+; A-DAG: mul [[REG1:r[0-9]+]], r1, r2
+; A-DAG: mul [[REG2:r[0-9]+]], r3, r4
+; A: add r5, [[REG1]], [[REG2]]
+
+; B-DAG: add [[REG1:r[0-9]+]], r1, r2
+; B-DAG: add [[REG2:r[0-9]+]], r3, r4
+; B-NOT: xor
+; B-DAG: mul r5, [[REG1]], [[REG2]]
+
diff --git a/test/FileCheck/check-dag-substring-prefix.txt b/test/FileCheck/check-dag-substring-prefix.txt
new file mode 100644
index 0000000..49d4b2b
--- /dev/null
+++ b/test/FileCheck/check-dag-substring-prefix.txt
@@ -0,0 +1,7 @@
+; RUN: not FileCheck -check-prefix=A -check-prefix=AA -input-file %s %s
+
+this is the string to be matched
+this should also be matched
+
+; BAA-DAG: this is the string to be {{matched}}
+; BAA-DAG: this should also be {{matched}}
diff --git a/test/FileCheck/check-label-dag-capture.txt b/test/FileCheck/check-label-dag-capture.txt
new file mode 100644
index 0000000..d8f90f4
--- /dev/null
+++ b/test/FileCheck/check-label-dag-capture.txt
@@ -0,0 +1,11 @@
+; RUN: FileCheck -input-file %s %s
+
+bar
+foo
+foo
+zed
+
+CHECK-LABEL: {{^}}bar
+CHECK: {{^}}[[FOO:foo]]
+CHECK-DAG: {{^}}[[FOO]]
+CHECK-LABEL: {{^}}zed
diff --git a/test/FileCheck/check-label-dag.txt b/test/FileCheck/check-label-dag.txt
new file mode 100644
index 0000000..2f54c3e
--- /dev/null
+++ b/test/FileCheck/check-label-dag.txt
@@ -0,0 +1,11 @@
+; RUN: not FileCheck -input-file %s %s 2>&1 | FileCheck --check-prefix=ERROR %s
+
+bar
+zed
+
+CHECK-LABEL: {{^}}bar
+CHECK-DAG: {{^}}foo
+CHECK-LABEL: {{^}}zed
+
+ERROR: error: expected string not found in input
+ERROR-NEXT: CHECK-DAG: {{.....}}foo
diff --git a/test/FileCheck/check-multi-prefix-label.txt b/test/FileCheck/check-multi-prefix-label.txt
new file mode 100644
index 0000000..41fe641
--- /dev/null
+++ b/test/FileCheck/check-multi-prefix-label.txt
@@ -0,0 +1,6 @@
+// RUN: FileCheck -check-prefix=ONE -check-prefix=TWO -input-file %s %s
+
+foo
+bar
+; ONE-LABEL: {{f}}oo
+; TWO-NEXT: {{b}}ar
diff --git a/test/FileCheck/check-multiple-prefixes-mixed.txt b/test/FileCheck/check-multiple-prefixes-mixed.txt
new file mode 100644
index 0000000..cd3b70a
--- /dev/null
+++ b/test/FileCheck/check-multiple-prefixes-mixed.txt
@@ -0,0 +1,10 @@
+// RUN: FileCheck -check-prefix=B -check-prefix=BOTH -input-file %s %s
+// RUN: FileCheck -check-prefix=A -check-prefix=BOTH -input-file %s %s
+
+; A: {{a}}aaaaa
+; B: {{b}}bbbb
+; BOTH: {{q}}qqqqq
+aaaaaa
+bbbbb
+qqqqqq
+ccccc
diff --git a/test/FileCheck/check-multiple-prefixes-nomatch.txt b/test/FileCheck/check-multiple-prefixes-nomatch.txt
new file mode 100644
index 0000000..9d38359
--- /dev/null
+++ b/test/FileCheck/check-multiple-prefixes-nomatch.txt
@@ -0,0 +1,10 @@
+; RUN: not FileCheck -input-file %s %s -check-prefix=FOO -check-prefix=BAR 2>&1 | FileCheck %s
+
+BAR
+bar
+foo
+; BAR: ba{{z}}
+; FOO: fo{{o}}
+
+; CHECK: {{error: expected string not found in input}}
+; CHECK-NEXT: {{B}}AR: ba{{[{][{]z[}][}]}}
diff --git a/test/FileCheck/check-multiple-prefixes-substr.txt b/test/FileCheck/check-multiple-prefixes-substr.txt
new file mode 100644
index 0000000..76a2ca8
--- /dev/null
+++ b/test/FileCheck/check-multiple-prefixes-substr.txt
@@ -0,0 +1,5 @@
+// RUN: FileCheck -check-prefix=CHECKER -check-prefix=CHECK -input-file %s %s
+// RUN: FileCheck -check-prefix=CHECK -check-prefix=CHECKER -input-file %s %s
+
+foo
+; CHECKER: fo{{o}}
diff --git a/test/FileCheck/check-prefixes.txt b/test/FileCheck/check-prefixes.txt
new file mode 100644
index 0000000..fc7a043
--- /dev/null
+++ b/test/FileCheck/check-prefixes.txt
@@ -0,0 +1,9 @@
+// RUN: FileCheck -check-prefix=ANOTHER-PREFIX -input-file %s %s
+// RUN: not FileCheck -check-prefix=PREFIX -input-file %s %s 2>&1 | FileCheck -check-prefix=CHECK-NONEXISTENT-PREFIX %s
+
+foobar
+; ANOTHER-PREFIX: foobar
+
+; We use regex to match the colon so that FileCheck won't think it is a check
+; prefix.
+; CHECK-NONEXISTENT-PREFIX: error: no check strings found with prefix 'PREFIX{{:}}'
diff --git a/test/FileCheck/check-substring-multi-prefix-2.txt b/test/FileCheck/check-substring-multi-prefix-2.txt
new file mode 100644
index 0000000..618a288
--- /dev/null
+++ b/test/FileCheck/check-substring-multi-prefix-2.txt
@@ -0,0 +1,11 @@
+; RUN: FileCheck -check-prefix=FOO -check-prefix=FOOBAR -check-prefix=BARFOO -input-file %s %s
+; RUN: FileCheck -check-prefix=FOOBAR -check-prefix=FOO -check-prefix=BARFOO -input-file %s %s
+; RUN: FileCheck -check-prefix=FOOBAR -check-prefix=BARFOO -check-prefix=FOO -input-file %s %s
+
+this is the match
+this is another
+
+FOO
+FOOBAR
+FOOBAR: this is the {{match}}
+BARFOO: this is {{another}}
diff --git a/test/FileCheck/check-substring-multi-prefix.txt b/test/FileCheck/check-substring-multi-prefix.txt
new file mode 100644
index 0000000..b7edb8b
--- /dev/null
+++ b/test/FileCheck/check-substring-multi-prefix.txt
@@ -0,0 +1,9 @@
+// RUN: FileCheck -check-prefix=AAAOVERLAP -check-prefix=OVERLAP -input-file %s %s
+
+foo
+bar
+buzz
+
+OVERLAP: foo
+AAAOVERLAP: bar
+OVERLAP: buzz
diff --git a/test/FileCheck/first-character-match.txt b/test/FileCheck/first-character-match.txt
new file mode 100644
index 0000000..4b09c21
--- /dev/null
+++ b/test/FileCheck/first-character-match.txt
@@ -0,0 +1,2 @@
+RUN: FileCheck -check-prefix=RUN -input-file %s %s
+// Prefix is at the first character in the file. The run line then matches itself.
diff --git a/test/FileCheck/line-count-2.txt b/test/FileCheck/line-count-2.txt
new file mode 100644
index 0000000..a56ab6d
--- /dev/null
+++ b/test/FileCheck/line-count-2.txt
@@ -0,0 +1,11 @@
+// RUN: FileCheck -input-file %s %s
+
+something else
+CHECK: {{some}}thing else
+
+foobar
+
+CHECK: {{foo}}bar
+ALMOSTCHECK
+10 wowomg
+CHECK: [[@LINE-1]] {{wow}}omg
diff --git a/test/FileCheck/line-count.txt b/test/FileCheck/line-count.txt
new file mode 100644
index 0000000..6f91c20
--- /dev/null
+++ b/test/FileCheck/line-count.txt
@@ -0,0 +1,15 @@
+; RUN: FileCheck  -input-file %s %s
+2
+3 aaa
+4 bbb
+5 ccc
+6 CHECK: [[@LINE-3]] {{a}}aa
+7 CHECK: [[@LINE-3]] {{b}}bb
+8 CHECK: [[@LINE-3]] {{c}}cc
+9 foobar
+10 CHECK: [[@LINE-1]] {{foo}}bar
+11
+12 arst CHECK: [[@LINE]] {{a}}rst
+13
+14
+
diff --git a/test/FileCheck/lit.local.cfg b/test/FileCheck/lit.local.cfg
deleted file mode 100644
index ee25f56..0000000
--- a/test/FileCheck/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.txt']
diff --git a/test/FileCheck/multiple-missing-prefixes.txt b/test/FileCheck/multiple-missing-prefixes.txt
new file mode 100644
index 0000000..cb557d9
--- /dev/null
+++ b/test/FileCheck/multiple-missing-prefixes.txt
@@ -0,0 +1,9 @@
+// RUN: FileCheck -check-prefix=ANOTHER-PREFIX -input-file %s %s
+// RUN: not FileCheck -check-prefix=PREFIX1 -check-prefix=PREFIX2 -input-file %s %s 2>&1 | FileCheck -strict-whitespace -check-prefix=CHECK-NONEXISTENT-PREFIX -check-prefix=ALSO-NONEXISTENT %s
+
+foobar
+; ANOTHER-PREFIX: foobar
+
+; We use regex to match the colon so that FileCheck won't think it is a check
+; prefix.
+; CHECK-NONEXISTENT-PREFIX: error: no check strings found with prefixes 'PREFIX1{{:}}', 'PREFIX2{{:}}'
diff --git a/test/FileCheck/separate-multi-prefix.txt b/test/FileCheck/separate-multi-prefix.txt
new file mode 100644
index 0000000..5578d7f
--- /dev/null
+++ b/test/FileCheck/separate-multi-prefix.txt
@@ -0,0 +1,7 @@
+// RUN: not FileCheck -check-prefix=SOMEPREFIX -input-file %s %s
+// RUN: FileCheck -check-prefix=ANOTHER -input-file %s %s
+
+asdf
+; SOMEPREFIX: {{t}}his_is_not_asdf
+; ANOTHER: {{a}}sdf
+
diff --git a/test/FileCheck/validate-check-prefix.txt b/test/FileCheck/validate-check-prefix.txt
new file mode 100644
index 0000000..db3392d
--- /dev/null
+++ b/test/FileCheck/validate-check-prefix.txt
@@ -0,0 +1,9 @@
+// RUN: not FileCheck -check-prefix=A! -input-file %s %s 2>&1 | FileCheck -check-prefix=BAD_PREFIX %s
+// RUN: FileCheck -check-prefix=A1a-B_c -input-file %s %s
+// RUN: not FileCheck -check-prefix=REPEAT -check-prefix=REPEAT -input-file %s %s 2>&1 | FileCheck -check-prefix=BAD_PREFIX %s
+// RUN: not FileCheck -check-prefix=VALID -check-prefix=A! -input-file %s %s 2>&1 | FileCheck -check-prefix=BAD_PREFIX %s
+foobar
+; A1a-B_c: foobar
+
+; BAD_PREFIX: Supplied check-prefix is invalid! Prefixes must be
+  unique and start with a letter and contain only alphanumeric characters, hyphens and underscores
diff --git a/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll b/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll
index 38168fc..2c4d82e 100644
--- a/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll
+++ b/test/Instrumentation/AddressSanitizer/X86/bug_11395.ll
@@ -64,8 +64,10 @@ entry:
   ret void
 }
 
-!0 = metadata !{metadata !"any pointer", metadata !1}
+!0 = metadata !{metadata !5, metadata !5, i64 0}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"int", metadata !1}
+!3 = metadata !{metadata !6, metadata !6, i64 0}
 !4 = metadata !{i32 156132, i32 156164, i32 156205, i32 156238, i32 156282, i32 156332, i32 156370, i32 156408, i32 156447, i32 156486, i32 156536, i32 156574, i32 156612, i32 156651, i32 156690, i32 156740, i32 156778, i32 156816, i32 156855, i32 156894, i32 156944, i32 156982, i32 157020, i32 157059, i32 157098, i32 157148, i32 157186, i32 157224, i32 157263, i32 157302, i32 157352, i32 157390, i32 157428, i32 157467, i32 157506, i32 157556, i32 157594, i32 157632, i32 157671, i32 157710, i32 157760, i32 157798, i32 157836, i32 157875, i32 157914, i32 157952, i32 157996, i32 158046, i32 158099, i32 158140, i32 158179, i32 158218, i32 158268, i32 158321, i32 158362, i32 158401, i32 158440, i32 158490, i32 158543, i32 158584, i32 158623, i32 158662, i32 158712, i32 158765, i32 158806, i32 158845, i32 158884, i32 158922, i32 158963, i32 158996, i32 159029, i32 159062, i32 159109, i32 159154, i32 159199, i32 159243, i32 159286, i32 159329, i32 159375, i32 159422, i32 159478, i32 159522, i32 159566}
+!5 = metadata !{metadata !"any pointer", metadata !1}
+!6 = metadata !{metadata !"int", metadata !1}
diff --git a/test/Instrumentation/AddressSanitizer/X86/lit.local.cfg b/test/Instrumentation/AddressSanitizer/X86/lit.local.cfg
index b05ed3c..ba763cf 100644
--- a/test/Instrumentation/AddressSanitizer/X86/lit.local.cfg
+++ b/test/Instrumentation/AddressSanitizer/X86/lit.local.cfg
@@ -1,13 +1,4 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
-def getRoot(config):
-    if not config.parent:
-        return config
-    return getRoot(config.parent)
-
-root = getRoot(config)
-
-targets = set(root.targets_to_build.split())
+targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
 
diff --git a/test/Instrumentation/AddressSanitizer/asan-vs-gvn.ll b/test/Instrumentation/AddressSanitizer/asan-vs-gvn.ll
index da8f541..1087c9a 100644
--- a/test/Instrumentation/AddressSanitizer/asan-vs-gvn.ll
+++ b/test/Instrumentation/AddressSanitizer/asan-vs-gvn.ll
@@ -9,7 +9,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 
 %struct_of_7_bytes_4_aligned = type { i32, i8, i8, i8}
 
-@f = global %struct_of_7_bytes_4_aligned zeroinitializer, align 4
+@f = external global %struct_of_7_bytes_4_aligned , align 4
 
 ; Accessing bytes 4 and 6, not ok to widen to i32 if sanitize_address is set.
 
diff --git a/test/Instrumentation/AddressSanitizer/coverage.ll b/test/Instrumentation/AddressSanitizer/coverage.ll
new file mode 100644
index 0000000..47a54c0
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/coverage.ll
@@ -0,0 +1,13 @@
+; RUN: opt < %s -asan -asan-coverage=1 -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+define i32 @foo(i32* %a) sanitize_address {
+entry:
+  ret i32 0
+}
+; CHECK: define i32 @foo(i32* %a) #0 {
+; CHECK: %0 = load atomic i8* @__asan_gen_cov_foo monotonic, align 1
+; CHECK: %1 = icmp eq i8 0, %0
+; CHECK: br i1 %1, label %2, label %3
+; CHECK: call void @__sanitizer_cov(i64 ptrtoint (i32 (i32*)* @foo to i64))
+; CHECK: store atomic i8 1, i8* @__asan_gen_cov_foo monotonic, align 1
diff --git a/test/Instrumentation/AddressSanitizer/debug_info.ll b/test/Instrumentation/AddressSanitizer/debug_info.ll
index ec89d26..daf2957 100644
--- a/test/Instrumentation/AddressSanitizer/debug_info.ll
+++ b/test/Instrumentation/AddressSanitizer/debug_info.ll
@@ -31,13 +31,14 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!17}
 
 !0 = metadata !{i32 786449, metadata !16, i32 4, metadata !"clang version 3.3 (trunk 169314)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, null, metadata !""} ; [ DW_TAG_compile_unit ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !16, metadata !6, metadata !"zzz", metadata !"zzz", metadata !"_Z3zzzi", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3zzzi, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [zzz]
 !6 = metadata !{i32 786473, metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 786689, metadata !5, metadata !"p", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 1]
@@ -57,3 +58,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !14 = metadata !{i32 2, i32 0, metadata !13, null}
 !15 = metadata !{i32 3, i32 0, metadata !13, null}
 !16 = metadata !{metadata !"a.cc", metadata !"/usr/local/google/llvm_cmake_clang/tmp/debuginfo"}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Instrumentation/AddressSanitizer/instrument_global.ll b/test/Instrumentation/AddressSanitizer/instrument_global.ll
index 2c183f5..4717277 100644
--- a/test/Instrumentation/AddressSanitizer/instrument_global.ll
+++ b/test/Instrumentation/AddressSanitizer/instrument_global.ll
@@ -9,12 +9,73 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: llvm.global_ctors
 ; CHECK: llvm.global_dtors
 
-; CHECK: define internal void @asan.module_ctor
+; Test that we don't instrument global arrays with static initializer
+; indexed with constants in-bounds. But instrument all other cases.
+
+@GlobSt = global [10 x i32] zeroinitializer, align 16  ; static initializer
+@GlobDy = global [10 x i32] zeroinitializer, align 16  ; dynamic initializer
+@GlobEx = external global [10 x i32] , align 16        ; extern initializer
+
+; GlobSt is declared here, and has static initializer -- ok to optimize.
+define i32 @AccessGlobSt_0_2() sanitize_address {
+entry:
+    %0 = load i32* getelementptr inbounds ([10 x i32]* @GlobSt, i64 0, i64 2), align 8
+    ret i32 %0
+; CHECK-LABEL: define i32 @AccessGlobSt_0_2
+; CHECK-NOT: __asan_report
+; CHECK: ret i32 %0
+}
+
+; GlobSt is accessed out of bounds -- can't optimize
+define i32 @AccessGlobSt_0_12() sanitize_address {
+entry:
+    %0 = load i32* getelementptr inbounds ([10 x i32]* @GlobSt, i64 0, i64 12), align 8
+    ret i32 %0
+; CHECK-LABEL: define i32 @AccessGlobSt_0_12
+; CHECK: __asan_report
+; CHECK: ret i32
+}
+
+; GlobSt is accessed with Gep that has non-0 first index -- can't optimize.
+define i32 @AccessGlobSt_1_2() sanitize_address {
+entry:
+    %0 = load i32* getelementptr inbounds ([10 x i32]* @GlobSt, i64 1, i64 2), align 8
+    ret i32 %0
+; CHECK-LABEL: define i32 @AccessGlobSt_1_2
+; CHECK: __asan_report
+; CHECK: ret i32
+}
+
+; GlobDy is declared with dynamic initializer -- can't optimize.
+define i32 @AccessGlobDy_0_2() sanitize_address {
+entry:
+    %0 = load i32* getelementptr inbounds ([10 x i32]* @GlobDy, i64 0, i64 2), align 8
+    ret i32 %0
+; CHECK-LABEL: define i32 @AccessGlobDy_0_2
+; CHECK: __asan_report
+; CHECK: ret i32
+}
+
+; GlobEx is an external global -- can't optimize.
+define i32 @AccessGlobEx_0_2() sanitize_address {
+entry:
+    %0 = load i32* getelementptr inbounds ([10 x i32]* @GlobEx, i64 0, i64 2), align 8
+    ret i32 %0
+; CHECK-LABEL: define i32 @AccessGlobEx_0_2
+; CHECK: __asan_report
+; CHECK: ret i32
+}
+
+
+!llvm.asan.dynamically_initialized_globals = !{!0}
+!0 = metadata !{[10 x i32]* @GlobDy}
+
+; CHECK-LABEL: define internal void @asan.module_ctor
 ; CHECK-NOT: ret
 ; CHECK: call void @__asan_register_globals
 ; CHECK: ret
 
-; CHECK: define internal void @asan.module_dtor
+; CHECK-LABEL: define internal void @asan.module_dtor
 ; CHECK-NOT: ret
 ; CHECK: call void @__asan_unregister_globals
 ; CHECK: ret
diff --git a/test/Instrumentation/AddressSanitizer/lifetime-uar.ll b/test/Instrumentation/AddressSanitizer/lifetime-uar.ll
new file mode 100644
index 0000000..21eaf7f
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/lifetime-uar.ll
@@ -0,0 +1,33 @@
+; Test handling of llvm.lifetime intrinsics in UAR mode.
+; RUN: opt < %s -asan -asan-use-after-return -asan-check-lifetime -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+define i32 @basic_test() sanitize_address {
+  ; CHECK-LABEL: define i32 @basic_test()
+
+entry:
+  %retval = alloca i32, align 4
+  %c = alloca i8, align 1
+
+  call void @llvm.lifetime.start(i64 1, i8* %c)
+  ; Memory is unpoisoned at llvm.lifetime.start
+  ; CHECK: call void @__asan_unpoison_stack_memory(i64 %{{[^ ]+}}, i64 1)
+
+  store i32 0, i32* %retval
+  store i8 0, i8* %c, align 1
+
+  call void @llvm.lifetime.end(i64 1, i8* %c)
+  ; Memory is poisoned at llvm.lifetime.end
+  ; CHECK: call void @__asan_poison_stack_memory(i64 %{{[^ ]+}}, i64 1)
+
+  ; No need to unpoison memory at function exit in UAR mode.
+  ; CHECK-NOT: @__asan_unpoison_stack_memory
+  ; CHECK: ret void
+
+  ret i32 0
+}
+
diff --git a/test/Instrumentation/AddressSanitizer/lifetime.ll b/test/Instrumentation/AddressSanitizer/lifetime.ll
index 3348728..d80331e 100644
--- a/test/Instrumentation/AddressSanitizer/lifetime.ll
+++ b/test/Instrumentation/AddressSanitizer/lifetime.ll
@@ -15,7 +15,7 @@ entry:
   call void @llvm.lifetime.end(i64 -1, i8* %i.ptr)
 
 ; Check that lifetime with no size are ignored.
-; CHECK: @lifetime_no_size
+; CHECK-LABEL: define void @lifetime_no_size()
 ; CHECK-NOT: @__asan_poison_stack_memory
 ; CHECK-NOT: @__asan_unpoison_stack_memory
 ; CHECK: ret void
@@ -24,7 +24,7 @@ entry:
 
 ; Generic case of lifetime analysis.
 define void @lifetime() sanitize_address {
-  ; CHECK: @lifetime
+  ; CHECK-LABEL: define void @lifetime()
 
   ; Regular variable lifetime intrinsics.
   %i = alloca i32, align 4
@@ -62,7 +62,7 @@ define void @lifetime() sanitize_address {
 
 ; Check that arguments of lifetime may come from phi nodes.
 define void @phi_args(i1 %x) sanitize_address {
-  ; CHECK: @phi_args
+  ; CHECK-LABEL: define void @phi_args(i1 %x)
 
 entry:
   %i = alloca i64, align 4
diff --git a/test/Instrumentation/AddressSanitizer/lit.local.cfg b/test/Instrumentation/AddressSanitizer/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Instrumentation/AddressSanitizer/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Instrumentation/BoundsChecking/lit.local.cfg b/test/Instrumentation/BoundsChecking/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Instrumentation/BoundsChecking/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Instrumentation/BoundsChecking/simple-32.ll b/test/Instrumentation/BoundsChecking/simple-32.ll
new file mode 100644
index 0000000..38b210f
--- /dev/null
+++ b/test/Instrumentation/BoundsChecking/simple-32.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -bounds-checking -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+
+%struct.s2_packed = type <{ i64, i32, i32, i32, i16, i8 }>
+
+; CHECK-LABEL: @f
+; CHECK-NOT: trap
+define i16 @f() {
+entry:
+  %packed1 = alloca %struct.s2_packed, align 8
+  %gep = getelementptr inbounds %struct.s2_packed* %packed1, i32 0, i32 4
+  %ptr = bitcast i16* %gep to i32*
+  %val = load i32* %ptr, align 4
+  %valt = trunc i32 %val to i16
+  ret i16 %valt
+}
+
+; CHECK-LABEL: @f
+; CHECK: call void @llvm.trap()
+define i16 @f2() {
+entry:
+  %packed1 = alloca %struct.s2_packed, align 8
+  %gep = getelementptr inbounds %struct.s2_packed* %packed1, i32 0, i32 4
+  %ptr = bitcast i16* %gep to i48*
+  %val = load i48* %ptr, align 4
+  %valt = trunc i48 %val to i16
+  ret i16 %valt
+}
diff --git a/test/Instrumentation/BoundsChecking/simple.ll b/test/Instrumentation/BoundsChecking/simple.ll
index 16870c7..72b58f4 100644
--- a/test/Instrumentation/BoundsChecking/simple.ll
+++ b/test/Instrumentation/BoundsChecking/simple.ll
@@ -126,3 +126,20 @@ define i64 @f12(i64 %x, i64 %y) nounwind {
   %4 = load i64* %3, align 8
   ret i64 %4
 }
+
+; PR17402
+; CHECK-LABEL: @f13
+define void @f13() nounwind {
+entry:
+  br label %alive
+
+dead:
+  ; Self-refential GEPs can occur in dead code.
+  %incdec.ptr = getelementptr inbounds i32* %incdec.ptr, i64 1
+  ; CHECK: %incdec.ptr = getelementptr inbounds i32* %incdec.ptr
+  %l = load i32* %incdec.ptr
+  br label %alive
+
+alive:
+  ret void
+}
diff --git a/test/Instrumentation/DataFlowSanitizer/Inputs/abilist.txt b/test/Instrumentation/DataFlowSanitizer/Inputs/abilist.txt
new file mode 100644
index 0000000..97ce5e6
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/Inputs/abilist.txt
@@ -0,0 +1,8 @@
+fun:discard*=uninstrumented
+fun:discard*=discard
+
+fun:functional=uninstrumented
+fun:functional=functional
+
+fun:custom*=uninstrumented
+fun:custom*=custom
diff --git a/test/Instrumentation/DataFlowSanitizer/abilist.ll b/test/Instrumentation/DataFlowSanitizer/abilist.ll
new file mode 100644
index 0000000..66ddc14
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/abilist.ll
@@ -0,0 +1,75 @@
+; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK: i32 @discard(i32 %a, i32 %b)
+define i32 @discard(i32 %a, i32 %b) {
+  ret i32 0
+}
+
+; CHECK: i32 @functional(i32 %a, i32 %b)
+define i32 @functional(i32 %a, i32 %b) {
+  %c = add i32 %a, %b
+  ret i32 %c
+}
+
+declare void @custom1(i32 %a, i32 %b)
+
+declare i32 @custom2(i32 %a, i32 %b)
+
+declare void @customcb(i32 (i32)* %cb)
+
+declare i32 @cb(i32)
+
+; CHECK: @"dfs$f"
+define void @f() {
+  ; CHECK: %[[LABELRETURN:.*]] = alloca i16
+
+  ; CHECK: call void @__dfsw_custom1(i32 1, i32 2, i16 0, i16 0)
+  call void @custom1(i32 1, i32 2)
+
+  ; CHECK: call i32 @__dfsw_custom2(i32 1, i32 2, i16 0, i16 0, i16* %[[LABELRETURN]])
+  call i32 @custom2(i32 1, i32 2)
+
+  ; CHECK: call void @__dfsw_customcb({{.*}} @"dfst0$customcb", i8* bitcast ({{.*}} @"dfs$cb" to i8*), i16 0)
+  call void @customcb(i32 (i32)* @cb)
+
+  ret void
+}
+
+; CHECK: define i32 (i32, i32)* @discardg(i32)
+; CHECK: %[[CALL:.*]] = call { i32 (i32, i32)*, i16 } @"dfs$g"(i32 %0, i16 0)
+; CHECK: %[[XVAL:.*]] = extractvalue { i32 (i32, i32)*, i16 } %[[CALL]], 0
+; CHECK: ret {{.*}} %[[XVAL]]
+@discardg = alias i32 (i32, i32)* (i32)* @g
+
+; CHECK: define linkonce_odr { i32, i16 } @"dfsw$custom2"(i32, i32, i16, i16)
+; CHECK: %[[LABELRETURN2:.*]] = alloca i16
+; CHECK: %[[RV:.*]] = call i32 @__dfsw_custom2
+; CHECK: %[[RVSHADOW:.*]] = load i16* %[[LABELRETURN2]]
+; CHECK: insertvalue {{.*}}[[RV]], 0
+; CHECK: insertvalue {{.*}}[[RVSHADOW]], 1
+; CHECK: ret { i32, i16 }
+
+; CHECK: @"dfs$g"
+define i32 (i32, i32)* @g(i32) {
+  ; CHECK: ret {{.*}} @"dfsw$custom2"
+  ret i32 (i32, i32)* @custom2
+}
+
+; CHECK: define { i32, i16 } @"dfs$adiscard"(i32, i32, i16, i16)
+; CHECK: %[[CALL:.*]] = call i32 @discard(i32 %0, i32 %1)
+; CHECK: %[[IVAL0:.*]] = insertvalue { i32, i16 } undef, i32 %[[CALL]], 0
+; CHECK: %[[IVAL1:.*]] = insertvalue { i32, i16 } %[[IVAL0]], i16 0, 1
+; CHECK: ret { i32, i16 } %[[IVAL1]]
+@adiscard = alias i32 (i32, i32)* @discard
+
+; CHECK: declare void @__dfsw_custom1(i32, i32, i16, i16)
+; CHECK: declare i32 @__dfsw_custom2(i32, i32, i16, i16, i16*)
+
+; CHECK-LABEL: define linkonce_odr i32 @"dfst0$customcb"(i32 (i32)*, i32, i16, i16*)
+; CHECK: %[[BC:.*]] = bitcast i32 (i32)* %0 to { i32, i16 } (i32, i16)*
+; CHECK: %[[CALL:.*]] = call { i32, i16 } %[[BC]](i32 %1, i16 %2)
+; CHECK: %[[XVAL0:.*]] = extractvalue { i32, i16 } %[[CALL]], 0
+; CHECK: %[[XVAL1:.*]] = extractvalue { i32, i16 } %[[CALL]], 1
+; CHECK: store i16 %[[XVAL1]], i16* %3
+; CHECK: ret i32 %[[XVAL0]]
diff --git a/test/Instrumentation/DataFlowSanitizer/args-unreachable-bb.ll b/test/Instrumentation/DataFlowSanitizer/args-unreachable-bb.ll
new file mode 100644
index 0000000..a699f75
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/args-unreachable-bb.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -dfsan -verify -dfsan-args-abi -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @"dfs$unreachable_bb1"
+define i8 @unreachable_bb1() {
+  ; CHECK: ret { i8, i16 } { i8 1, i16 0 }
+  ; CHECK-NOT: bb2:
+  ; CHECK-NOT: bb3:
+  ; CHECK-NOT: bb4:
+  ret i8 1
+
+bb2:
+  ret i8 2
+
+bb3:
+  br label %bb4
+
+bb4:
+  br label %bb3
+}
+
+declare void @abort() noreturn
+
+; CHECK-LABEL: @"dfs$unreachable_bb2"
+define i8 @unreachable_bb2() {
+  call void @abort() noreturn
+  ; CHECK-NOT: i8 12
+  ; CHECK: unreachable
+  ret i8 12
+}
diff --git a/test/Instrumentation/DataFlowSanitizer/arith.ll b/test/Instrumentation/DataFlowSanitizer/arith.ll
new file mode 100644
index 0000000..dc61896
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/arith.ll
@@ -0,0 +1,63 @@
+; RUN: opt < %s -dfsan -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define i8 @add(i8 %a, i8 %b) {
+  ; CHECK: @"dfs$add"
+  ; CHECK-DAG: %[[ALABEL:.*]] = load{{.*}}__dfsan_arg_tls, i64 0, i64 0
+  ; CHECK-DAG: %[[BLABEL:.*]] = load{{.*}}__dfsan_arg_tls, i64 0, i64 1
+  ; CHECK: %[[UNION:.*]] = call{{.*}}__dfsan_union(i16 zeroext %[[ALABEL]], i16 zeroext %[[BLABEL]])
+  ; CHECK: %[[ADDLABEL:.*]] = phi i16 [ %[[UNION]], {{.*}} ], [ %[[ALABEL]], {{.*}} ]
+  ; CHECK: add i8
+  ; CHECK: store i16 %[[ADDLABEL]], i16* @__dfsan_retval_tls
+  ; CHECK: ret i8
+  %c = add i8 %a, %b
+  ret i8 %c
+}
+
+define i8 @sub(i8 %a, i8 %b) {
+  ; CHECK: @"dfs$sub"
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: call{{.*}}__dfsan_union
+  ; CHECK: sub i8
+  ; CHECK: store{{.*}}__dfsan_retval_tls
+  ; CHECK: ret i8
+  %c = sub i8 %a, %b
+  ret i8 %c
+}
+
+define i8 @mul(i8 %a, i8 %b) {
+  ; CHECK: @"dfs$mul"
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: call{{.*}}__dfsan_union
+  ; CHECK: mul i8
+  ; CHECK: store{{.*}}__dfsan_retval_tls
+  ; CHECK: ret i8
+  %c = mul i8 %a, %b
+  ret i8 %c
+}
+
+define i8 @sdiv(i8 %a, i8 %b) {
+  ; CHECK: @"dfs$sdiv"
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: call{{.*}}__dfsan_union
+  ; CHECK: sdiv i8
+  ; CHECK: store{{.*}}__dfsan_retval_tls
+  ; CHECK: ret i8
+  %c = sdiv i8 %a, %b
+  ret i8 %c
+}
+
+define i8 @udiv(i8 %a, i8 %b) {
+  ; CHECK: @"dfs$udiv"
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: call{{.*}}__dfsan_union
+  ; CHECK: udiv i8
+  ; CHECK: store{{.*}}__dfsan_retval_tls
+  ; CHECK: ret i8
+  %c = udiv i8 %a, %b
+  ret i8 %c
+}
diff --git a/test/Instrumentation/DataFlowSanitizer/call.ll b/test/Instrumentation/DataFlowSanitizer/call.ll
new file mode 100644
index 0000000..813f4c1
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/call.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -dfsan -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK: @__dfsan_arg_tls = external thread_local(initialexec) global [64 x i16]
+; CHECK: @__dfsan_retval_tls = external thread_local(initialexec) global i16
+
+declare i32 @f(i32)
+declare float @llvm.sqrt.f32(float)
+
+; CHECK: @"dfs$call"
+define i32 @call() {
+  ; CHECK: store{{.*}}__dfsan_arg_tls
+  ; CHECK: call{{.*}}@"dfs$f"
+  ; CHECK: load{{.*}}__dfsan_retval_tls
+  %r = call i32 @f(i32 0)
+
+  ; CHECK-NOT: store{{.*}}__dfsan_arg_tls
+  %i = call float @llvm.sqrt.f32(float -1.0)
+
+  ; CHECK: store{{.*}}__dfsan_retval_tls
+  ; CHECK: ret i32
+  ret i32 %r
+}
diff --git a/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll b/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll
new file mode 100644
index 0000000..6bcd5c5
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/debug-nonzero-labels.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-debug-nonzero-labels -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare i32 @g()
+
+; CHECK: define { i32, i16 } @"dfs$f"(i32, i16)
+define i32 @f(i32) {
+  ; CHECK: [[LOCALLABELALLOCA:%.*]] = alloca i16
+  ; CHECK: [[ARGCMP:%.*]] = icmp ne i16 %1, 0
+  ; CHECK: br i1 [[ARGCMP]]
+  %i = alloca i32
+  store i32 %0, i32* %i
+  ; CHECK: [[CALL:%.*]] = call { i32, i16 } @"dfs$g"()
+  ; CHECK: [[CALLLABEL:%.*]] = extractvalue { i32, i16 } [[CALL]], 1
+  ; CHECK: [[CALLCMP:%.*]] = icmp ne i16 [[CALLLABEL]], 0
+  ; CHECK: br i1 [[CALLCMP]]
+  %call = call i32 @g()
+  ; CHECK: [[LOCALLABEL:%.*]] = load i16* [[LOCALLABELALLOCA]]
+  ; CHECK: [[LOCALCMP:%.*]] = icmp ne i16 [[LOCALLABEL]], 0
+  ; CHECK: br i1 [[LOCALCMP]]
+  %load = load i32* %i
+  ret i32 %load
+}
diff --git a/test/Instrumentation/DataFlowSanitizer/load.ll b/test/Instrumentation/DataFlowSanitizer/load.ll
new file mode 100644
index 0000000..6431213
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/load.ll
@@ -0,0 +1,81 @@
+; RUN: opt < %s -dfsan -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define i8 @load8(i8* %p) {
+  ; CHECK: @"dfs$load8"
+  ; CHECK: ptrtoint
+  ; CHECK: and
+  ; CHECK: mul
+  ; CHECK: inttoptr
+  ; CHECK: load
+  ; CHECK: store{{.*}}__dfsan_retval_tls
+  ; CHECK: ret i8
+  %a = load i8* %p
+  ret i8 %a
+}
+
+define i16 @load16(i16* %p) {
+  ; CHECK: @"dfs$load16"
+  ; CHECK: ptrtoint
+  ; CHECK: and
+  ; CHECK: mul
+  ; CHECK: inttoptr
+  ; CHECK: load
+  ; CHECK: load
+  ; CHECK: icmp ne
+  ; CHECK: call{{.*}}__dfsan_union
+  ; CHECK: store{{.*}}__dfsan_retval_tls
+  ; CHECK: ret i16
+  %a = load i16* %p
+  ret i16 %a
+}
+
+define i32 @load32(i32* %p) {
+  ; CHECK: @"dfs$load32"
+  ; CHECK: ptrtoint
+  ; CHECK: and
+  ; CHECK: mul
+  ; CHECK: inttoptr
+  ; CHECK: bitcast
+  ; CHECK: load
+  ; CHECK: trunc
+  ; CHECK: shl
+  ; CHECK: lshr
+  ; CHECK: or
+  ; CHECK: icmp eq
+
+  ; CHECK: store{{.*}}__dfsan_retval_tls
+  ; CHECK: ret i32
+
+  ; CHECK: call{{.*}}__dfsan_union_load
+
+  %a = load i32* %p
+  ret i32 %a
+}
+
+define i64 @load64(i64* %p) {
+  ; CHECK: @"dfs$load64"
+  ; CHECK: ptrtoint
+  ; CHECK: and
+  ; CHECK: mul
+  ; CHECK: inttoptr
+  ; CHECK: bitcast
+  ; CHECK: load
+  ; CHECK: trunc
+  ; CHECK: shl
+  ; CHECK: lshr
+  ; CHECK: or
+  ; CHECK: icmp eq
+
+  ; CHECK: store{{.*}}__dfsan_retval_tls
+  ; CHECK: ret i64
+
+  ; CHECK: call{{.*}}__dfsan_union_load
+
+  ; CHECK: getelementptr
+  ; CHECK: load
+  ; CHECK: icmp eq
+
+  %a = load i64* %p
+  ret i64 %a
+}
diff --git a/test/Instrumentation/DataFlowSanitizer/memset.ll b/test/Instrumentation/DataFlowSanitizer/memset.ll
new file mode 100644
index 0000000..062ef1a
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/memset.ll
@@ -0,0 +1,11 @@
+; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+define void @ms(i8* %p, i8 %v) {
+  ; CHECK-LABEL: @"dfs$ms"(i8*, i8, i16, i16)
+  ; CHECK: call void @__dfsan_set_label(i16 %3, i8* %0, i64 1)
+  call void @llvm.memset.p0i8.i64(i8* %p, i8 %v, i64 1, i32 1, i1 1)
+  ret void
+}
diff --git a/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll b/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll
new file mode 100644
index 0000000..1a56460
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/prefix-rename.ll
@@ -0,0 +1,14 @@
+; RUN: opt < %s -dfsan -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK: module asm ".symver dfs$f1,dfs$f@@version1"
+module asm ".symver f1,f@@version1"
+
+; CHECK: @"dfs$f2" = alias {{.*}} @"dfs$f1"
+@f2 = alias void ()* @f1
+
+; CHECK: define void @"dfs$f1"
+define void @f1() {
+  ret void
+}
diff --git a/test/Instrumentation/DataFlowSanitizer/store.ll b/test/Instrumentation/DataFlowSanitizer/store.ll
new file mode 100644
index 0000000..9509177
--- /dev/null
+++ b/test/Instrumentation/DataFlowSanitizer/store.ll
@@ -0,0 +1,75 @@
+; RUN: opt < %s -dfsan -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define void @store8(i8 %v, i8* %p) {
+  ; CHECK: @"dfs$store8"
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: ptrtoint
+  ; CHECK: and
+  ; CHECK: mul
+  ; CHECK: inttoptr
+  ; CHECK: getelementptr
+  ; CHECK: store
+  ; CHECK: store
+  store i8 %v, i8* %p
+  ret void
+}
+
+define void @store16(i16 %v, i16* %p) {
+  ; CHECK: @"dfs$store16"
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: ptrtoint
+  ; CHECK: and
+  ; CHECK: mul
+  ; CHECK: inttoptr
+  ; CHECK: getelementptr
+  ; CHECK: store
+  ; CHECK: getelementptr
+  ; CHECK: store
+  ; CHECK: store
+  store i16 %v, i16* %p
+  ret void
+}
+
+define void @store32(i32 %v, i32* %p) {
+  ; CHECK: @"dfs$store32"
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: ptrtoint
+  ; CHECK: and
+  ; CHECK: mul
+  ; CHECK: inttoptr
+  ; CHECK: getelementptr
+  ; CHECK: store
+  ; CHECK: getelementptr
+  ; CHECK: store
+  ; CHECK: getelementptr
+  ; CHECK: store
+  ; CHECK: getelementptr
+  ; CHECK: store
+  ; CHECK: store
+  store i32 %v, i32* %p
+  ret void
+}
+
+define void @store64(i64 %v, i64* %p) {
+  ; CHECK: @"dfs$store64"
+  ; CHECK: load{{.*}}__dfsan_arg_tls
+  ; CHECK: ptrtoint
+  ; CHECK: and
+  ; CHECK: mul
+  ; CHECK: inttoptr
+  ; CHECK: insertelement
+  ; CHECK: insertelement
+  ; CHECK: insertelement
+  ; CHECK: insertelement
+  ; CHECK: insertelement
+  ; CHECK: insertelement
+  ; CHECK: insertelement
+  ; CHECK: insertelement
+  ; CHECK: bitcast
+  ; CHECK: getelementptr
+  ; CHECK: store
+  ; CHECK: store
+  store i64 %v, i64* %p
+  ret void
+}
diff --git a/test/Instrumentation/MemorySanitizer/atomics.ll b/test/Instrumentation/MemorySanitizer/atomics.ll
new file mode 100644
index 0000000..ff02452
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/atomics.ll
@@ -0,0 +1,189 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; atomicrmw xchg: store clean shadow, return clean shadow
+
+define i32 @AtomicRmwXchg(i32* %p, i32 %x) sanitize_memory {
+entry:
+  %0 = atomicrmw xchg i32* %p, i32 %x seq_cst
+  ret i32 %0
+}
+
+; CHECK: @AtomicRmwXchg
+; CHECK: store i32 0,
+; CHECK: atomicrmw xchg {{.*}} seq_cst
+; CHECK: store i32 0, {{.*}} @__msan_retval_tls
+; CHECK: ret i32
+
+
+; atomicrmw max: exactly the same as above
+
+define i32 @AtomicRmwMax(i32* %p, i32 %x) sanitize_memory {
+entry:
+  %0 = atomicrmw max i32* %p, i32 %x seq_cst
+  ret i32 %0
+}
+
+; CHECK: @AtomicRmwMax
+; CHECK: store i32 0,
+; CHECK: atomicrmw max {{.*}} seq_cst
+; CHECK: store i32 0, {{.*}} @__msan_retval_tls
+; CHECK: ret i32
+
+
+; cmpxchg: the same as above, but also check %a shadow
+
+define i32 @Cmpxchg(i32* %p, i32 %a, i32 %b) sanitize_memory {
+entry:
+  %0 = cmpxchg i32* %p, i32 %a, i32 %b seq_cst
+  ret i32 %0
+}
+
+; CHECK: @Cmpxchg
+; CHECK: store i32 0,
+; CHECK: icmp
+; CHECK: br
+; CHECK: @__msan_warning
+; CHECK: cmpxchg {{.*}} seq_cst
+; CHECK: store i32 0, {{.*}} @__msan_retval_tls
+; CHECK: ret i32
+
+
+; relaxed cmpxchg: bump up to "release"
+
+define i32 @CmpxchgMonotonic(i32* %p, i32 %a, i32 %b) sanitize_memory {
+entry:
+  %0 = cmpxchg i32* %p, i32 %a, i32 %b monotonic
+  ret i32 %0
+}
+
+; CHECK: @CmpxchgMonotonic
+; CHECK: store i32 0,
+; CHECK: icmp
+; CHECK: br
+; CHECK: @__msan_warning
+; CHECK: cmpxchg {{.*}} release
+; CHECK: store i32 0, {{.*}} @__msan_retval_tls
+; CHECK: ret i32
+
+
+; atomic load: preserve alignment, load shadow value after app value
+
+define i32 @AtomicLoad(i32* %p) sanitize_memory {
+entry:
+  %0 = load atomic i32* %p seq_cst, align 16
+  ret i32 %0
+}
+
+; CHECK: @AtomicLoad
+; CHECK: load atomic i32* {{.*}} seq_cst, align 16
+; CHECK: [[SHADOW:%[01-9a-z_]+]] = load i32* {{.*}}, align 16
+; CHECK: store i32 {{.*}}[[SHADOW]], {{.*}} @__msan_retval_tls
+; CHECK: ret i32
+
+
+; atomic load: preserve alignment, load shadow value after app value
+
+define i32 @AtomicLoadAcquire(i32* %p) sanitize_memory {
+entry:
+  %0 = load atomic i32* %p acquire, align 16
+  ret i32 %0
+}
+
+; CHECK: @AtomicLoadAcquire
+; CHECK: load atomic i32* {{.*}} acquire, align 16
+; CHECK: [[SHADOW:%[01-9a-z_]+]] = load i32* {{.*}}, align 16
+; CHECK: store i32 {{.*}}[[SHADOW]], {{.*}} @__msan_retval_tls
+; CHECK: ret i32
+
+
+; atomic load monotonic: bump up to load acquire
+
+define i32 @AtomicLoadMonotonic(i32* %p) sanitize_memory {
+entry:
+  %0 = load atomic i32* %p monotonic, align 16
+  ret i32 %0
+}
+
+; CHECK: @AtomicLoadMonotonic
+; CHECK: load atomic i32* {{.*}} acquire, align 16
+; CHECK: [[SHADOW:%[01-9a-z_]+]] = load i32* {{.*}}, align 16
+; CHECK: store i32 {{.*}}[[SHADOW]], {{.*}} @__msan_retval_tls
+; CHECK: ret i32
+
+
+; atomic load unordered: bump up to load acquire
+
+define i32 @AtomicLoadUnordered(i32* %p) sanitize_memory {
+entry:
+  %0 = load atomic i32* %p unordered, align 16
+  ret i32 %0
+}
+
+; CHECK: @AtomicLoadUnordered
+; CHECK: load atomic i32* {{.*}} acquire, align 16
+; CHECK: [[SHADOW:%[01-9a-z_]+]] = load i32* {{.*}}, align 16
+; CHECK: store i32 {{.*}}[[SHADOW]], {{.*}} @__msan_retval_tls
+; CHECK: ret i32
+
+
+; atomic store: preserve alignment, store clean shadow value before app value
+
+define void @AtomicStore(i32* %p, i32 %x) sanitize_memory {
+entry:
+  store atomic i32 %x, i32* %p seq_cst, align 16
+  ret void
+}
+
+; CHECK: @AtomicStore
+; CHECK-NOT: @__msan_param_tls
+; CHECK: store i32 0, i32* {{.*}}, align 16
+; CHECK: store atomic i32 %x, i32* %p seq_cst, align 16
+; CHECK: ret void
+
+
+; atomic store: preserve alignment, store clean shadow value before app value
+
+define void @AtomicStoreRelease(i32* %p, i32 %x) sanitize_memory {
+entry:
+  store atomic i32 %x, i32* %p release, align 16
+  ret void
+}
+
+; CHECK: @AtomicStoreRelease
+; CHECK-NOT: @__msan_param_tls
+; CHECK: store i32 0, i32* {{.*}}, align 16
+; CHECK: store atomic i32 %x, i32* %p release, align 16
+; CHECK: ret void
+
+
+; atomic store monotonic: bumped up to store release
+
+define void @AtomicStoreMonotonic(i32* %p, i32 %x) sanitize_memory {
+entry:
+  store atomic i32 %x, i32* %p monotonic, align 16
+  ret void
+}
+
+; CHECK: @AtomicStoreMonotonic
+; CHECK-NOT: @__msan_param_tls
+; CHECK: store i32 0, i32* {{.*}}, align 16
+; CHECK: store atomic i32 %x, i32* %p release, align 16
+; CHECK: ret void
+
+
+; atomic store unordered: bumped up to store release
+
+define void @AtomicStoreUnordered(i32* %p, i32 %x) sanitize_memory {
+entry:
+  store atomic i32 %x, i32* %p unordered, align 16
+  ret void
+}
+
+; CHECK: @AtomicStoreUnordered
+; CHECK-NOT: @__msan_param_tls
+; CHECK: store i32 0, i32* {{.*}}, align 16
+; CHECK: store atomic i32 %x, i32* %p release, align 16
+; CHECK: ret void
diff --git a/test/Instrumentation/MemorySanitizer/lit.local.cfg b/test/Instrumentation/MemorySanitizer/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Instrumentation/MemorySanitizer/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Instrumentation/MemorySanitizer/msan_basic.ll b/test/Instrumentation/MemorySanitizer/msan_basic.ll
index 4fa0319..72a992d 100644
--- a/test/Instrumentation/MemorySanitizer/msan_basic.ll
+++ b/test/Instrumentation/MemorySanitizer/msan_basic.ll
@@ -260,6 +260,8 @@ entry:
 
 ; CHECK: @Select
 ; CHECK: select
+; CHECK-NEXT: sext i1 {{.*}} to i32
+; CHECK-NEXT: or i32
 ; CHECK-NEXT: select
 ; CHECK: ret i32
 
@@ -274,6 +276,13 @@ entry:
   ret <8 x i16> %cond
 }
 
+; CHECK: @SelectVector
+; CHECK: select <8 x i1>
+; CHECK-NEXT: sext <8 x i1> {{.*}} to <8 x i16>
+; CHECK-NEXT: or <8 x i16>
+; CHECK-NEXT: select <8 x i1>
+; CHECK: ret <8 x i16>
+
 ; CHECK-ORIGINS: @SelectVector
 ; CHECK-ORIGINS: bitcast <8 x i1> {{.*}} to i8
 ; CHECK-ORIGINS: icmp ne i8
@@ -281,6 +290,38 @@ entry:
 ; CHECK-ORIGINS: ret <8 x i16>
 
 
+; Check that we propagate origin for "select" with scalar condition and vector
+; arguments. Select condition shadow is sign-extended to the vector type and
+; mixed into the result shadow.
+
+define <8 x i16> @SelectVector2(<8 x i16> %a, <8 x i16> %b, i1 %c) nounwind uwtable readnone sanitize_memory {
+entry:
+  %cond = select i1 %c, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %cond
+}
+
+; CHECK: @SelectVector2
+; CHECK: select i1
+; CHECK: sext i1 {{.*}} to i128
+; CHECK: bitcast i128 {{.*}} to <8 x i16>
+; CHECK: or <8 x i16>
+; CHECK: select i1
+; CHECK: ret <8 x i16>
+
+
+define { i64, i64 } @SelectStruct(i1 zeroext %x, { i64, i64 } %a, { i64, i64 } %b) readnone sanitize_memory {
+entry:
+  %c = select i1 %x, { i64, i64 } %a, { i64, i64 } %b
+  ret { i64, i64 } %c
+}
+
+; CHECK: @SelectStruct
+; CHECK: select i1 {{.*}}, { i64, i64 }
+; CHECK-NEXT: select i1 {{.*}}, { i64, i64 } { i64 -1, i64 -1 }, { i64, i64 }
+; CHECK-NEXT: select i1 {{.*}}, { i64, i64 }
+; CHECK: ret { i64, i64 }
+
+
 define i8* @IntToPtr(i64 %x) nounwind uwtable readnone sanitize_memory {
 entry:
   %0 = inttoptr i64 %x to i8*
@@ -420,8 +461,8 @@ define i32 @ShadowLoadAlignmentLarge() nounwind uwtable sanitize_memory {
 }
 
 ; CHECK: @ShadowLoadAlignmentLarge
-; CHECK: load i32* {{.*}} align 64
 ; CHECK: load volatile i32* {{.*}} align 64
+; CHECK: load i32* {{.*}} align 64
 ; CHECK: ret i32
 
 define i32 @ShadowLoadAlignmentSmall() nounwind uwtable sanitize_memory {
@@ -431,14 +472,14 @@ define i32 @ShadowLoadAlignmentSmall() nounwind uwtable sanitize_memory {
 }
 
 ; CHECK: @ShadowLoadAlignmentSmall
-; CHECK: load i32* {{.*}} align 2
 ; CHECK: load volatile i32* {{.*}} align 2
+; CHECK: load i32* {{.*}} align 2
 ; CHECK: ret i32
 
 ; CHECK-ORIGINS: @ShadowLoadAlignmentSmall
+; CHECK-ORIGINS: load volatile i32* {{.*}} align 2
 ; CHECK-ORIGINS: load i32* {{.*}} align 2
 ; CHECK-ORIGINS: load i32* {{.*}} align 4
-; CHECK-ORIGINS: load volatile i32* {{.*}} align 2
 ; CHECK-ORIGINS: ret i32
 
 
@@ -578,8 +619,8 @@ define <8 x i8*> @VectorOfPointers(<8 x i8*>* %p) nounwind uwtable sanitize_memo
 }
 
 ; CHECK: @VectorOfPointers
-; CHECK: load <8 x i64>*
 ; CHECK: load <8 x i8*>*
+; CHECK: load <8 x i64>*
 ; CHECK: store <8 x i64> {{.*}} @__msan_retval_tls
 ; CHECK: ret <8 x i8*>
 
@@ -597,6 +638,31 @@ define void @VACopy(i8* %p1, i8* %p2) nounwind uwtable sanitize_memory {
 ; CHECK: ret void
 
 
+; Test that va_start instrumentation does not use va_arg_tls*.
+; It should work with a local stack copy instead.
+
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+declare void @llvm.va_start(i8*) nounwind
+
+; Function Attrs: nounwind uwtable
+define void @VAStart(i32 %x, ...) {
+entry:
+  %x.addr = alloca i32, align 4
+  %va = alloca [1 x %struct.__va_list_tag], align 16
+  store i32 %x, i32* %x.addr, align 4
+  %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i32 0, i32 0
+  %arraydecay1 = bitcast %struct.__va_list_tag* %arraydecay to i8*
+  call void @llvm.va_start(i8* %arraydecay1)
+  ret void
+}
+
+; CHECK: @VAStart
+; CHECK: call void @llvm.va_start
+; CHECK-NOT: @__msan_va_arg_tls
+; CHECK-NOT: @__msan_va_arg_overflow_size_tls
+; CHECK: ret void
+
+
 ; Test handling of volatile stores.
 ; Check that MemorySanitizer does not add a check of the value being stored.
 
@@ -709,3 +775,29 @@ entry:
 ; CHECK-AA: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 2, i32 2, i1 false)
 ; CHECK-AA: ret i16
 
+
+; Test origin propagation for insertvalue
+
+define { i64, i32 } @make_pair_64_32(i64 %x, i32 %y) sanitize_memory {
+entry:
+  %a = insertvalue { i64, i32 } undef, i64 %x, 0
+  %b = insertvalue { i64, i32 } %a, i32 %y, 1
+  ret { i64, i32 } %b
+}
+
+; CHECK-ORIGINS: @make_pair_64_32
+; First element shadow
+; CHECK-ORIGINS: insertvalue { i64, i32 } { i64 -1, i32 -1 }, i64 {{.*}}, 0
+; First element origin
+; CHECK-ORIGINS: icmp ne i64
+; CHECK-ORIGINS: select i1
+; First element app value
+; CHECK-ORIGINS: insertvalue { i64, i32 } undef, i64 {{.*}}, 0
+; Second element shadow
+; CHECK-ORIGINS: insertvalue { i64, i32 } {{.*}}, i32 {{.*}}, 1
+; Second element origin
+; CHECK-ORIGINS: icmp ne i32
+; CHECK-ORIGINS: select i1
+; Second element app value
+; CHECK-ORIGINS: insertvalue { i64, i32 } {{.*}}, i32 {{.*}}, 1
+; CHECK-ORIGINS: ret { i64, i32 }
diff --git a/test/Instrumentation/MemorySanitizer/return_from_main.ll b/test/Instrumentation/MemorySanitizer/return_from_main.ll
new file mode 100644
index 0000000..81dc888
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/return_from_main.ll
@@ -0,0 +1,18 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @main() sanitize_memory {
+entry:
+  %call = tail call i32 @f()
+  ret i32 %call
+}
+
+declare i32 @f() sanitize_memory
+
+; CHECK: @main
+; CHECK: call i32 @f()
+; CHECK: store i32 0, {{.*}} @__msan_retval_tls
+; CHECK: br i1
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: ret i32
diff --git a/test/Instrumentation/MemorySanitizer/vector_cvt.ll b/test/Instrumentation/MemorySanitizer/vector_cvt.ll
new file mode 100644
index 0000000..9425e25
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/vector_cvt.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
+declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
+
+; Single argument vector conversion.
+
+define i32 @test_cvtsd2si(<2 x double> %value) sanitize_memory {
+entry:
+  %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %value)
+  ret i32 %0
+}
+
+; CHECK: @test_cvtsd2si
+; CHECK: [[S:%[_01-9a-z]+]] = extractelement <2 x i64> {{.*}}, i32 0
+; CHECK: icmp ne {{.*}}[[S]], 0
+; CHECK: br
+; CHECK: call void @__msan_warning_noreturn
+; CHECK: call i32 @llvm.x86.sse2.cvtsd2si
+; CHECK: store i32 0, {{.*}} @__msan_retval_tls
+; CHECK: ret i32
+
+; Two-argument vector conversion.
+
+define <2 x double> @test_cvtsi2sd(i32 %a, double %b) sanitize_memory {
+entry:
+  %vec = insertelement <2 x double> undef, double %b, i32 1
+  %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %vec, i32 %a)
+  ret <2 x double> %0
+}
+
+; CHECK: @test_cvtsi2sd
+; CHECK: [[Sa:%[_01-9a-z]+]] = load i32* {{.*}} @__msan_param_tls
+; CHECK: [[Sout0:%[_01-9a-z]+]] = insertelement <2 x i64> <i64 -1, i64 -1>, i64 {{.*}}, i32 1
+; Clear low half of result shadow
+; CHECK: [[Sout:%[_01-9a-z]+]] = insertelement <2 x i64> {{.*}}[[Sout0]], i64 0, i32 0
+; Trap on %a shadow.
+; CHECK: icmp ne {{.*}}[[Sa]], 0
+; CHECK: br
+; CHECK: call void @__msan_warning_noreturn
+; CHECK: call <2 x double> @llvm.x86.sse2.cvtsi2sd
+; CHECK: store <2 x i64> {{.*}}[[Sout]], {{.*}} @__msan_retval_tls
+; CHECK: ret <2 x double>
+
+; x86_mmx packed vector conversion.
+
+define x86_mmx @test_cvtps2pi(<4 x float> %value) sanitize_memory {
+entry:
+  %0 = tail call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %value)
+  ret x86_mmx %0
+}
+
+; CHECK: @test_cvtps2pi
+; CHECK: extractelement <4 x i32> {{.*}}, i32 0
+; CHECK: extractelement <4 x i32> {{.*}}, i32 1
+; CHECK: [[S:%[_01-9a-z]+]] = or i32
+; CHECK: icmp ne {{.*}}[[S]], 0
+; CHECK: br
+; CHECK: call void @__msan_warning_noreturn
+; CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+; CHECK: store i64 0, {{.*}} @__msan_retval_tls
+; CHECK: ret x86_mmx
diff --git a/test/Instrumentation/MemorySanitizer/wrap_indirect_calls.ll b/test/Instrumentation/MemorySanitizer/wrap_indirect_calls.ll
new file mode 100644
index 0000000..555695d
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/wrap_indirect_calls.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-wrap-indirect-calls=zzz -msan-wrap-indirect-calls-fast=0 -S | FileCheck %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-wrap-indirect-calls=zzz -msan-wrap-indirect-calls-fast=1 -S | FileCheck -check-prefix=CHECK-FAST %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test for -msan-wrap-indirect-calls functionality.
+; Replaces indirect call to %f with a call to whatever is returned from the
+; wrapper function.
+
+; This does not depend on the sanitize_memory attribute.
+define i32 @func(i32 (i32, i32)* nocapture %f, i32 %x, i32 %y) {
+entry:
+  %call = tail call i32 %f(i32 %x, i32 %y)
+  ret i32 %call
+}
+
+; CHECK: @func
+; CHECK: bitcast i32 (i32, i32)* %f to void ()*
+; CHECK: call void ()* (void ()*)* @zzz(void ()*
+; CHECK: [[A:%[01-9a-z_.]+]] = bitcast void ()* {{.*}} to i32 (i32, i32)*
+; CHECK: call i32 {{.*}}[[A]](i32 {{.*}}, i32 {{.*}})
+; CHECK: ret i32
+
+; CHECK-FAST: @func
+; CHECK-FAST: bitcast i32 (i32, i32)* %f to void ()*
+; CHECK-FAST-DAG: icmp ult void ()* {{.*}}, bitcast (i32* @__executable_start to void ()*)
+; CHECK-FAST-DAG: icmp uge void ()* {{.*}}, bitcast (i32* @_end to void ()*)
+; CHECK-FAST: or i1
+; CHECK-FAST: br i1
+; CHECK-FAST: call void ()* (void ()*)* @zzz(void ()*
+; CHECK-FAST: br label
+; CHECK-FAST: [[A:%[01-9a-z_.]+]] = phi i32 (i32, i32)* [ %f, %entry ], [ {{.*}} ]
+; CHECK-FAST: call i32 {{.*}}[[A]](i32 {{.*}}, i32 {{.*}})
+; CHECK-FAST: ret i32
diff --git a/test/Instrumentation/ThreadSanitizer/lit.local.cfg b/test/Instrumentation/ThreadSanitizer/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Instrumentation/ThreadSanitizer/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Instrumentation/ThreadSanitizer/no_sanitize_thread.ll b/test/Instrumentation/ThreadSanitizer/no_sanitize_thread.ll
new file mode 100644
index 0000000..3949fd5
--- /dev/null
+++ b/test/Instrumentation/ThreadSanitizer/no_sanitize_thread.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -tsan -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; no sanitize_thread attribute here
+define i32 @read_4_bytes(i32* %a) {
+entry:
+  %tmp1 = load i32* %a, align 4
+  ret i32 %tmp1
+}
+
+; CHECK: define i32 @read_4_bytes(i32* %a) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %tmp1 = load i32* %a, align 4
+; CHECK: ret i32 %tmp1
+
+; no sanitize_thread attribute here
+define i32 @read_4_bytes_and_call(i32* %a) {
+entry:
+  call void @foo()
+  %tmp1 = load i32* %a, align 4
+  ret i32 %tmp1
+}
+
+; CHECK: define i32 @read_4_bytes_and_call(i32* %a) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %0 = call i8* @llvm.returnaddress(i32 0)
+; CHECK-NEXT:   call void @__tsan_func_entry(i8* %0)
+; CHECK-NEXT:   call void @foo()
+; CHECK-NEXT:   %tmp1 = load i32* %a, align 4
+; CHECK-NEXT:   call void @__tsan_func_exit()
+; CHECK-NEXT:   ret i32 %tmp1
+
+declare void @foo()
+
diff --git a/test/Instrumentation/ThreadSanitizer/read_before_write.ll b/test/Instrumentation/ThreadSanitizer/read_before_write.ll
index 482362a..cb6603b 100644
--- a/test/Instrumentation/ThreadSanitizer/read_before_write.ll
+++ b/test/Instrumentation/ThreadSanitizer/read_before_write.ll
@@ -2,7 +2,7 @@
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-define void @IncrementMe(i32* nocapture %ptr) nounwind uwtable {
+define void @IncrementMe(i32* nocapture %ptr) nounwind uwtable sanitize_thread {
 entry:
   %0 = load i32* %ptr, align 4
   %inc = add nsw i32 %0, 1
@@ -14,7 +14,7 @@ entry:
 ; CHECK: __tsan_write
 ; CHECK: ret void
 
-define void @IncrementMeWithCallInBetween(i32* nocapture %ptr) nounwind uwtable {
+define void @IncrementMeWithCallInBetween(i32* nocapture %ptr) nounwind uwtable sanitize_thread {
 entry:
   %0 = load i32* %ptr, align 4
   %inc = add nsw i32 %0, 1
diff --git a/test/Instrumentation/ThreadSanitizer/read_from_global.ll b/test/Instrumentation/ThreadSanitizer/read_from_global.ll
index 7b6b94e..33614a3 100644
--- a/test/Instrumentation/ThreadSanitizer/read_from_global.ll
+++ b/test/Instrumentation/ThreadSanitizer/read_from_global.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 @const_global = external constant i32
-define i32 @read_from_const_global() nounwind uwtable readnone {
+define i32 @read_from_const_global() nounwind uwtable sanitize_thread readnone {
 entry:
   %0 = load i32* @const_global, align 4
   ret i32 %0
@@ -14,7 +14,7 @@ entry:
 ; CHECK: ret i32
 
 @non_const_global = global i32 0, align 4
-define i32 @read_from_non_const_global() nounwind uwtable readonly {
+define i32 @read_from_non_const_global() nounwind uwtable sanitize_thread readonly {
 entry:
   %0 = load i32* @non_const_global, align 4
   ret i32 %0
@@ -25,7 +25,7 @@ entry:
 ; CHECK: ret i32
 
 @const_global_array = external constant [10 x i32]
-define i32 @read_from_const_global_array(i32 %idx) nounwind uwtable readnone {
+define i32 @read_from_const_global_array(i32 %idx) nounwind uwtable sanitize_thread readnone {
 entry:
   %idxprom = sext i32 %idx to i64
   %arrayidx = getelementptr inbounds [10 x i32]* @const_global_array, i64 0, i64 %idxprom
@@ -38,10 +38,10 @@ entry:
 ; CHECK: ret i32
 
 %struct.Foo = type { i32 (...)** }
-define void @call_virtual_func(%struct.Foo* %f) uwtable {
+define void @call_virtual_func(%struct.Foo* %f) uwtable sanitize_thread {
 entry:
   %0 = bitcast %struct.Foo* %f to void (%struct.Foo*)***
-  %vtable = load void (%struct.Foo*)*** %0, align 8, !tbaa !3
+  %vtable = load void (%struct.Foo*)*** %0, align 8, !tbaa !2
   %1 = load void (%struct.Foo*)** %vtable, align 8
   call void %1(%struct.Foo* %f)
   ret void
@@ -54,8 +54,6 @@ entry:
 ; CHECK: = load
 ; CHECK: ret void
 
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"vtable pointer", metadata !2}
-
+!0 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!1 = metadata !{metadata !"vtable pointer", metadata !0}
+!2 = metadata !{metadata !1, metadata !1, i64 0}
diff --git a/test/Instrumentation/ThreadSanitizer/tsan_basic.ll b/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
index 19dd45b..d449a97 100644
--- a/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
+++ b/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-define i32 @read_4_bytes(i32* %a) {
+define i32 @read_4_bytes(i32* %a) sanitize_thread {
 entry:
   %tmp1 = load i32* %a, align 4
   ret i32 %tmp1
@@ -11,7 +11,7 @@ entry:
 
 ; CHECK: @llvm.global_ctors = {{.*}}@__tsan_init
 
-; CHECK: define i32 @read_4_bytes(i32* %a) {
+; CHECK: define i32 @read_4_bytes(i32* %a)
 ; CHECK:        call void @__tsan_func_entry(i8* %0)
 ; CHECK-NEXT:   %1 = bitcast i32* %a to i8*
 ; CHECK-NEXT:   call void @__tsan_read4(i8* %1)
diff --git a/test/Instrumentation/ThreadSanitizer/vptr_read.ll b/test/Instrumentation/ThreadSanitizer/vptr_read.ll
index 404ca3f..811ad8d 100644
--- a/test/Instrumentation/ThreadSanitizer/vptr_read.ll
+++ b/test/Instrumentation/ThreadSanitizer/vptr_read.ll
@@ -2,12 +2,12 @@
 ; Check that vptr reads are treated in a special way.
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-define i8 @Foo(i8* %a) nounwind uwtable {
+define i8 @Foo(i8* %a) nounwind uwtable sanitize_thread {
 entry:
 ; CHECK: call void @__tsan_vptr_read
   %0 = load i8* %a, align 8, !tbaa !0
   ret i8 %0
 }
-!0 = metadata !{metadata !"vtable pointer", metadata !1}
+!0 = metadata !{metadata !2, metadata !2, i64 0}
 !1 = metadata !{metadata !"Simple C/C++ TBAA", null}
-
+!2 = metadata !{metadata !"vtable pointer", metadata !1}
diff --git a/test/Instrumentation/ThreadSanitizer/vptr_update.ll b/test/Instrumentation/ThreadSanitizer/vptr_update.ll
index f318659..95c7bb0 100644
--- a/test/Instrumentation/ThreadSanitizer/vptr_update.ll
+++ b/test/Instrumentation/ThreadSanitizer/vptr_update.ll
@@ -2,12 +2,12 @@
 ; Check that vtable pointer updates are treated in a special way.
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-define void @Foo(i8** nocapture %a, i8* %b) nounwind uwtable {
+define void @Foo(i8** nocapture %a, i8* %b) nounwind uwtable sanitize_thread {
 entry:
 ; CHECK: call void @__tsan_vptr_update
   store i8* %b, i8** %a, align 8, !tbaa !0
   ret void
 }
-!0 = metadata !{metadata !"vtable pointer", metadata !1}
+!0 = metadata !{metadata !2, metadata !2, i64 0}
 !1 = metadata !{metadata !"Simple C/C++ TBAA", null}
-
+!2 = metadata !{metadata !"vtable pointer", metadata !1}
diff --git a/test/Integer/lit.local.cfg b/test/Integer/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Integer/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/JitListener/lit.local.cfg b/test/JitListener/lit.local.cfg
index a5aa6de..d995820 100644
--- a/test/JitListener/lit.local.cfg
+++ b/test/JitListener/lit.local.cfg
@@ -1,11 +1,3 @@
-config.suffixes = ['.ll']
-
-def getRoot(config):
-    if not config.parent:
-        return config
-    return getRoot(config.parent)
-
-root = getRoot(config)
-if not root.llvm_use_intel_jitevents == "ON":
+if not config.root.llvm_use_intel_jitevents == "ON":
     config.unsupported = True
 
diff --git a/test/JitListener/test-common-symbols.ll b/test/JitListener/test-common-symbols.ll
index 91891d8..a389bf7 100644
--- a/test/JitListener/test-common-symbols.ll
+++ b/test/JitListener/test-common-symbols.ll
@@ -76,13 +76,14 @@ for.end:                                          ; preds = %for.cond
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!35}
 
 !0 = metadata !{i32 720913, metadata !34, i32 12, metadata !"clang version 3.1 ()", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !12, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 720942, metadata !34, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !10, i32 0} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 720937, metadata !34} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{metadata !11}
@@ -92,7 +93,7 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !15 = metadata !{i32 720948, i32 0, null, metadata !"zero_double", metadata !"zero_double", metadata !"", metadata !6, i32 2, metadata !16, i32 0, i32 1, double* @zero_double, null} ; [ DW_TAG_variable ]
 !16 = metadata !{i32 720932, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
 !17 = metadata !{i32 720948, i32 0, null, metadata !"zero_arr", metadata !"zero_arr", metadata !"", metadata !6, i32 3, metadata !18, i32 0, i32 1, [10 x i32]* @zero_arr, null} ; [ DW_TAG_variable ]
-!18 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 320, i64 32, i32 0, i32 0, metadata !9, metadata !19, i32 0, i32 0} ; [ DW_TAG_array_type ]
+!18 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 320, i64 32, i32 0, i32 0, metadata !9, metadata !19, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 32, offset 0] [from int]
 !19 = metadata !{metadata !20}
 !20 = metadata !{i32 720929, i64 0, i64 10}        ; [ DW_TAG_subrange_type ]
 !21 = metadata !{i32 7, i32 5, metadata !22, null}
@@ -109,3 +110,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !32 = metadata !{i32 12, i32 29, metadata !26, null}
 !33 = metadata !{i32 15, i32 5, metadata !22, null}
 !34 = metadata !{metadata !"test-common-symbols.c", metadata !"/store/store/llvm/build"}
+!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/JitListener/test-inline.ll b/test/JitListener/test-inline.ll
index 5c16c94..0d365b1 100644
--- a/test/JitListener/test-inline.ll
+++ b/test/JitListener/test-inline.ll
@@ -132,30 +132,31 @@ entry:
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!78}
 
 !0 = metadata !{i32 786449, metadata !77, i32 4, metadata !"clang version 3.3 (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-clang2 gitosis@miro.kw.intel.com:clang.git 39450d0469e0d5589ad39fd0b20b5742750619a0) (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-llvm gitosis@miro.kw.intel.com:llvm.git 376642ed620ecae05b68c7bc81f79aeb2065abe0)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !43, null, metadata !""} ; [ DW_TAG_compile_unit ] [/home/akaylor/dev/test-inline.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !35, metadata !40}
 !5 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"test_parameters", metadata !"test_parameters", metadata !"_Z15test_parametersPfPA2_dR11char_structPPitm", i32 32, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !1, i32 33} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 33] [test_parameters]
 !6 = metadata !{i32 786473, metadata !77} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10, metadata !12, metadata !16, metadata !29, metadata !32, metadata !33}
 !9 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
 !10 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from float]
 !11 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
 !12 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !13} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!13 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 128, i64 64, i32 0, i32 0, metadata !9, metadata !14, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
+!13 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 128, i64 64, i32 0, i32 0, metadata !9, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
 !14 = metadata !{metadata !15}
 !15 = metadata !{i32 786465, i64 0, i64 2}        ; [ DW_TAG_subrange_type ] [0, 1]
 !16 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !17} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from char_struct]
-!17 = metadata !{i32 786451, metadata !77, null, metadata !"char_struct", i32 22, i64 24, i64 8, i32 0, i32 0, null, metadata !18, i32 0, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [from ]
+!17 = metadata !{i32 786451, metadata !77, null, metadata !"char_struct", i32 22, i64 24, i64 8, i32 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [def] [from ]
 !18 = metadata !{metadata !19, metadata !21, metadata !23}
 !19 = metadata !{i32 786445, metadata !77, metadata !17, metadata !"c", i32 23, i64 8, i64 8, i64 0, i32 0, metadata !20} ; [ DW_TAG_member ] [c] [line 23, size 8, align 8, offset 0] [from char]
 !20 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !21 = metadata !{i32 786445, metadata !77, metadata !17, metadata !"c2", i32 24, i64 16, i64 8, i64 8, i32 0, metadata !22} ; [ DW_TAG_member ] [c2] [line 24, size 16, align 8, offset 8] [from ]
-!22 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 16, i64 8, i32 0, i32 0, metadata !20, metadata !14, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char]
+!22 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 16, i64 8, i32 0, i32 0, metadata !20, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char]
 !23 = metadata !{i32 786478, metadata !77, metadata !17, metadata !"char_struct", metadata !"char_struct", metadata !"", i32 22, metadata !24, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !27, i32 22} ; [ DW_TAG_subprogram ] [line 22] [char_struct]
-!24 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !25 = metadata !{null, metadata !26}
 !26 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char_struct]
 !27 = metadata !{metadata !28}
@@ -167,12 +168,12 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
 !33 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !34} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long unsigned int]
 !34 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
 !35 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 38, metadata !36, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !1, i32 39} ; [ DW_TAG_subprogram ] [line 38] [def] [scope 39] [main]
-!36 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!36 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !37, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !37 = metadata !{metadata !31, metadata !31, metadata !38}
 !38 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !39} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
 !39 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
 !40 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 27, metadata !41, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 28} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [foo]
-!41 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !42, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!41 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !42, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !42 = metadata !{metadata !31}
 !43 = metadata !{metadata !45}
 !45 = metadata !{i32 786484, i32 0, null, metadata !"compound_char", metadata !"compound_char", metadata !"", metadata !6, i32 25, metadata !17, i32 0, i32 1, %struct.char_struct* @compound_char, null} ; [ DW_TAG_variable ] [compound_char] [line 25] [def]
@@ -198,7 +199,7 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
 !65 = metadata !{i32 786688, metadata !63, metadata !"f", metadata !6, i32 41, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [f] [line 41]
 !66 = metadata !{i32 41, i32 0, metadata !63, null}
 !67 = metadata !{i32 786688, metadata !63, metadata !"d", metadata !6, i32 42, metadata !68, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 42]
-!68 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 256, i64 64, i32 0, i32 0, metadata !9, metadata !69, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double]
+!68 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 256, i64 64, i32 0, i32 0, metadata !9, metadata !69, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double]
 !69 = metadata !{metadata !15, metadata !15}
 !70 = metadata !{i32 42, i32 0, metadata !63, null}
 !71 = metadata !{i32 44, i32 0, metadata !63, null}
@@ -208,3 +209,4 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
 !75 = metadata !{i32 48, i32 0, metadata !63, null}
 !76 = metadata !{i32 49, i32 0, metadata !63, null}
 !77 = metadata !{metadata !"test-inline.cpp", metadata !"/home/akaylor/dev"}
+!78 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/JitListener/test-parameters.ll b/test/JitListener/test-parameters.ll
index 96af18e..7feb6bb 100644
--- a/test/JitListener/test-parameters.ll
+++ b/test/JitListener/test-parameters.ll
@@ -131,34 +131,35 @@ entry:
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!78}
 
 !0 = metadata !{i32 786449, metadata !77, i32 4, metadata !"clang version 3.3 (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-clang2 gitosis@miro.kw.intel.com:clang.git 39450d0469e0d5589ad39fd0b20b5742750619a0) (ssh://akaylor@git-amr-1.devtools.intel.com:29418/ssg_llvm-llvm gitosis@miro.kw.intel.com:llvm.git 376642ed620ecae05b68c7bc81f79aeb2065abe0)", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !43, null, metadata !""} ; [ DW_TAG_compile_unit ] [/home/akaylor/dev/test-parameters.cpp] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !10, metadata !38}
 !5 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 27, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 28} ; [ DW_TAG_subprogram ] [line 27] [def] [scope 28] [foo]
 !6 = metadata !{i32 786473, metadata !77} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"test_parameters", metadata !"test_parameters", metadata !"_Z15test_parametersPfPA2_dR11char_structPPitm", i32 32, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, double (float*, [2 x double]*, %struct.char_struct*, i32**, i16, i64)* @_Z15test_parametersPfPA2_dR11char_structPPitm, null, null, metadata !1, i32 33} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 33] [test_parameters]
-!11 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{metadata !13, metadata !14, metadata !16, metadata !20, metadata !33, metadata !35, metadata !36}
 !13 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
 !14 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !15} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from float]
 !15 = metadata !{i32 786468, null, null, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [float] [line 0, size 32, align 32, offset 0, enc DW_ATE_float]
 !16 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !17} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
-!17 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 128, i64 64, i32 0, i32 0, metadata !13, metadata !18, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
+!17 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 128, i64 64, i32 0, i32 0, metadata !13, metadata !18, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 64, offset 0] [from double]
 !18 = metadata !{metadata !19}
 !19 = metadata !{i32 786465, i64 0, i64 2}        ; [ DW_TAG_subrange_type ] [0, 1]
 !20 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !21} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from char_struct]
-!21 = metadata !{i32 786451, metadata !77, null, metadata !"char_struct", i32 22, i64 24, i64 8, i32 0, i32 0, null, metadata !22, i32 0, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [from ]
+!21 = metadata !{i32 786451, metadata !77, null, metadata !"char_struct", i32 22, i64 24, i64 8, i32 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [char_struct] [line 22, size 24, align 8, offset 0] [def] [from ]
 !22 = metadata !{metadata !23, metadata !25, metadata !27}
 !23 = metadata !{i32 786445, metadata !77, metadata !21, metadata !"c", i32 23, i64 8, i64 8, i64 0, i32 0, metadata !24} ; [ DW_TAG_member ] [c] [line 23, size 8, align 8, offset 0] [from char]
 !24 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
 !25 = metadata !{i32 786445, metadata !77, metadata !21, metadata !"c2", i32 24, i64 16, i64 8, i64 8, i32 0, metadata !26} ; [ DW_TAG_member ] [c2] [line 24, size 16, align 8, offset 8] [from ]
-!26 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 16, i64 8, i32 0, i32 0, metadata !24, metadata !18, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char]
+!26 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 16, i64 8, i32 0, i32 0, metadata !24, metadata !18, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 16, align 8, offset 0] [from char]
 !27 = metadata !{i32 786478, metadata !77, metadata !21, metadata !"char_struct", metadata !"char_struct", metadata !"", i32 22, metadata !28, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, i32 0, metadata !31, i32 22} ; [ DW_TAG_subprogram ] [line 22] [char_struct]
-!28 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !29 = metadata !{null, metadata !30}
 !30 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !21} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char_struct]
 !31 = metadata !{metadata !32}
@@ -169,7 +170,7 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
 !36 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !37} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long unsigned int]
 !37 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
 !38 = metadata !{i32 786478, metadata !77, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 38, metadata !39, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !1, i32 39} ; [ DW_TAG_subprogram ] [line 38] [def] [scope 39] [main]
-!39 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !40, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!39 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !40, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !40 = metadata !{metadata !9, metadata !9, metadata !41}
 !41 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !42} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
 !42 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
@@ -197,7 +198,7 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
 !65 = metadata !{i32 786688, metadata !63, metadata !"f", metadata !6, i32 41, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [f] [line 41]
 !66 = metadata !{i32 41, i32 0, metadata !63, null}
 !67 = metadata !{i32 786688, metadata !63, metadata !"d", metadata !6, i32 42, metadata !68, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 42]
-!68 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 256, i64 64, i32 0, i32 0, metadata !13, metadata !69, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double]
+!68 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 256, i64 64, i32 0, i32 0, metadata !13, metadata !69, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 256, align 64, offset 0] [from double]
 !69 = metadata !{metadata !19, metadata !19}
 !70 = metadata !{i32 42, i32 0, metadata !63, null}
 !71 = metadata !{i32 44, i32 0, metadata !63, null}
@@ -207,3 +208,4 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32,
 !75 = metadata !{i32 48, i32 0, metadata !63, null}
 !76 = metadata !{i32 49, i32 0, metadata !63, null}
 !77 = metadata !{metadata !"test-parameters.cpp", metadata !"/home/akaylor/dev"}
+!78 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/LTO/cfi_endproc.ll b/test/LTO/cfi_endproc.ll
new file mode 100644
index 0000000..a5cc649
--- /dev/null
+++ b/test/LTO/cfi_endproc.ll
@@ -0,0 +1,37 @@
+; RUN: llvm-as < %s >%t1
+; RUN: llvm-lto -o %t2 %t1
+; RUN: llvm-nm %t2 | FileCheck %s -check-prefix=NOEXPORT
+; RUN: llvm-lto -o %t3 -exported-symbol=main %t1
+; RUN: llvm-nm %t3 | FileCheck %s -check-prefix=EXPORT
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+module asm ".text"
+module asm ".align 16, 0x90"
+module asm ".type PR14512, @function"
+module asm "PR14512:.cfi_startproc"
+module asm "ret"
+module asm ".cfi_endproc"
+
+declare void @PR14512()
+
+; Without -exported-symbol, main should be eliminated by LTO.
+; With -exported-symbol=main, main should be preserved by LTO.
+define i32 @main(i32 %argc, i8** %argv) {
+; NOEXPORT-NOT: main
+; EXPORT: main
+  call void @PR14512()
+  ret i32 0
+}
+
+; RUN: llvm-lto -o %t -dso-symbol=zed1 -dso-symbol=zed2 %t1 -disable-opt
+; RUN: llvm-nm %t | FileCheck %s -check-prefix=ZED1_AND_ZED2
+; ZED1_AND_ZED2: V zed1
+@zed1 = linkonce_odr global i32 42
+define i32* @get_zed1() {
+  ret i32* @zed1
+}
+
+; ZED1_AND_ZED2: d zed2
+@zed2 = linkonce_odr unnamed_addr global i32 42
diff --git a/test/LTO/linkonce_odr_func.ll b/test/LTO/linkonce_odr_func.ll
new file mode 100644
index 0000000..8a49326
--- /dev/null
+++ b/test/LTO/linkonce_odr_func.ll
@@ -0,0 +1,45 @@
+; RUN: llvm-as < %s >%t1
+; RUN: llvm-lto -o %t2 -dso-symbol=foo1 -dso-symbol=foo2 -dso-symbol=foo3 \
+; RUN:     -dso-symbol=foo4  %t1 -disable-opt
+; RUN: llvm-nm %t2 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: t foo1
+define linkonce_odr void @foo1() noinline {
+  ret void
+}
+
+; CHECK: W foo2
+define linkonce_odr void @foo2() noinline {
+  ret void
+}
+
+; CHECK: t foo3
+define linkonce_odr void @foo3() noinline {
+  ret void
+}
+
+; CHECK: W foo4
+define linkonce_odr void @foo4() noinline {
+  ret void
+}
+
+declare void @f(void()*)
+
+declare void @p()
+
+define void @bar() {
+bb0:
+  call void @foo1()
+  call void @f(void()* @foo2)
+  invoke void @foo3() to label %bb1 unwind label %clean
+bb1:
+  invoke void @f(void()* @foo4) to label %bb2 unwind label %clean
+bb2:
+  ret void
+clean:
+  landingpad {i32, i32} personality void()* @p cleanup
+  ret void
+}
diff --git a/test/LTO/lit.local.cfg b/test/LTO/lit.local.cfg
new file mode 100644
index 0000000..6df0e03
--- /dev/null
+++ b/test/LTO/lit.local.cfg
@@ -0,0 +1,3 @@
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+  config.unsupported = True
diff --git a/test/LTO/runtime-library.ll b/test/LTO/runtime-library.ll
new file mode 100644
index 0000000..76fc6f0
--- /dev/null
+++ b/test/LTO/runtime-library.ll
@@ -0,0 +1,27 @@
+; runtime library implementations should be added to llvm.compiler.used
+; RUN: llvm-as <%s >%t1
+; RUN: llvm-lto -o %t2 %t1
+; RUN: llvm-nm -no-sort %t2 | FileCheck %s -check-prefix=KEEP -check-prefix=LOSE
+
+target triple = "x86_64-apple-darwin9"
+
+; KEEP-LABEL: _puts
+define void @puts() {
+  ret void
+}
+
+; KEEP-LABEL: ___divti3
+define void @__divti3() {
+  ret void
+}
+
+; KEEP-LABEL: _memset
+define void @memset() {
+  ret void
+}
+
+; LOSE-NOT: _myprintf
+define void @myprintf() {
+  ret void
+}
+
diff --git a/test/Linker/2011-08-04-DebugLoc.ll b/test/Linker/2011-08-04-DebugLoc.ll
index 5daf33b..d26e8cd 100644
--- a/test/Linker/2011-08-04-DebugLoc.ll
+++ b/test/Linker/2011-08-04-DebugLoc.ll
@@ -14,12 +14,13 @@ define i32 @foo() nounwind ssp {
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11}
 !llvm.dbg.sp = !{!1}
 
 !0 = metadata !{i32 589841, metadata !8, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-209.11) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !9, metadata !9, metadata !10, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 589870, metadata !8, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589870, metadata !8, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
 !2 = metadata !{i32 589865, metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 589845, metadata !8, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 589845, metadata !8, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 589860, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 2, i32 13, metadata !7, null}
@@ -27,3 +28,4 @@ define i32 @foo() nounwind ssp {
 !8 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
 !9 = metadata !{i32 0}
 !10 = metadata !{metadata !1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/2011-08-04-DebugLoc2.ll b/test/Linker/2011-08-04-DebugLoc2.ll
index 3f8504f..c20941d 100644
--- a/test/Linker/2011-08-04-DebugLoc2.ll
+++ b/test/Linker/2011-08-04-DebugLoc2.ll
@@ -11,12 +11,13 @@ define i32 @bar() nounwind ssp {
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11}
 !llvm.dbg.sp = !{!1}
 
 !0 = metadata !{i32 589841, metadata !8, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-209.11) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !9, metadata !9, metadata !10, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 589870, metadata !8, metadata !2, metadata !"bar", metadata !"bar", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 589870, metadata !8, metadata !2, metadata !"bar", metadata !"bar", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [bar]
 !2 = metadata !{i32 589865, metadata !8} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 589845, metadata !8, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 589845, metadata !8, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 589860, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 1, i32 13, metadata !7, null}
@@ -24,3 +25,4 @@ define i32 @bar() nounwind ssp {
 !8 = metadata !{metadata !"b.c", metadata !"/private/tmp"}
 !9 = metadata !{i32 0}
 !10 = metadata !{metadata !1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/2011-08-04-Metadata.ll b/test/Linker/2011-08-04-Metadata.ll
index b800e5d..cdf4f6f 100644
--- a/test/Linker/2011-08-04-Metadata.ll
+++ b/test/Linker/2011-08-04-Metadata.ll
@@ -15,13 +15,14 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11}
 !llvm.dbg.sp = !{!1}
 !llvm.dbg.gv = !{!5}
 
 !0 = metadata !{i32 589841, metadata !9, i32 12, metadata !"clang version 3.0 ()", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !10, null, null, metadata !""}
-!1 = metadata !{i32 589870, metadata !9, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, void ()* @foo, null, null, null, i32 0}
+!1 = metadata !{i32 589870, metadata !9, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [foo]
 !2 = metadata !{i32 589865, metadata !9}
-!3 = metadata !{i32 589845, metadata !9, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0}
+!3 = metadata !{i32 589845, metadata !9, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 589876, i32 0, metadata !0, metadata !"x", metadata !"x", metadata !"", metadata !2, i32 2, metadata !6, i32 1, i32 1, i32* @x}
 !6 = metadata !{i32 589860, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
@@ -29,3 +30,4 @@ entry:
 !8 = metadata !{i32 589835, metadata !9, metadata !1, i32 3, i32 12, i32 0}
 !9 = metadata !{metadata !"/tmp/one.c", metadata !"/Volumes/Lalgate/Slate/D"}
 !10 = metadata !{metadata !1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/2011-08-04-Metadata2.ll b/test/Linker/2011-08-04-Metadata2.ll
index 311a7c6..80884cc 100644
--- a/test/Linker/2011-08-04-Metadata2.ll
+++ b/test/Linker/2011-08-04-Metadata2.ll
@@ -15,13 +15,14 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11}
 !llvm.dbg.sp = !{!1}
 !llvm.dbg.gv = !{!5}
 
 !0 = metadata !{i32 589841, metadata !9, i32 12, metadata !"clang version 3.0 ()", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !10, null, null, metadata !""}
-!1 = metadata !{i32 589870, metadata !9, metadata !2, metadata !"bar", metadata !"bar", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, void ()* @bar, null, null, null, i32 0}
+!1 = metadata !{i32 589870, metadata !9, metadata !2, metadata !"bar", metadata !"bar", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [bar]
 !2 = metadata !{i32 589865, metadata !9}
-!3 = metadata !{i32 589845, metadata !9, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0}
+!3 = metadata !{i32 589845, metadata !9, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 589876, i32 0, metadata !0, metadata !"x", metadata !"x", metadata !"", metadata !2, i32 1, metadata !6, i32 1, i32 1, i32* @x}
 !6 = metadata !{i32 589860, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
@@ -29,3 +30,4 @@ entry:
 !8 = metadata !{i32 589835, metadata !9, metadata !1, i32 2, i32 12, i32 0}
 !9 = metadata !{metadata !"/tmp/two.c", metadata !"/Volumes/Lalgate/Slate/D"}
 !10 = metadata !{metadata !1}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/2011-08-18-unique-class-type.ll b/test/Linker/2011-08-18-unique-class-type.ll
index b460ac3..b077f23 100644
--- a/test/Linker/2011-08-18-unique-class-type.ll
+++ b/test/Linker/2011-08-18-unique-class-type.ll
@@ -18,17 +18,18 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!18}
 
 !0 = metadata !{i32 720913, metadata !16, i32 4, metadata !"clang version 3.0 (trunk 137954)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !16, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooN2N11AE", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void ()* @_Z3fooN2N11AE, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 720942, metadata !16, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3fooN2N11AE", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3fooN2N11AE, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [foo]
 !6 = metadata !{i32 720937, metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, metadata !16, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, metadata !16, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{i32 721153, metadata !5, metadata !"mya", metadata !6, i32 16777220, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{i32 720898, metadata !17, metadata !11, metadata !"A", i32 3, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null} ; [ DW_TAG_class_type ]
+!10 = metadata !{i32 720898, metadata !17, metadata !11, metadata !"A", i32 3, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
 !11 = metadata !{i32 720953, metadata !17, null, metadata !"N1", i32 2} ; [ DW_TAG_namespace ]
 !12 = metadata !{i32 720937, metadata !17} ; [ DW_TAG_file_type ]
 !13 = metadata !{i32 4, i32 12, metadata !5, null}
@@ -36,3 +37,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !15 = metadata !{i32 720907, metadata !16, metadata !5, i32 4, i32 17, i32 0} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{metadata !"n1.c", metadata !"/private/tmp"}
 !17 = metadata !{metadata !"./n.h", metadata !"/private/tmp"}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/2011-08-18-unique-class-type2.ll b/test/Linker/2011-08-18-unique-class-type2.ll
index 8bd3841..7bfcd91 100644
--- a/test/Linker/2011-08-18-unique-class-type2.ll
+++ b/test/Linker/2011-08-18-unique-class-type2.ll
@@ -16,17 +16,18 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!18}
 
 !0 = metadata !{i32 720913, metadata !16, i32 4, metadata !"clang version 3.0 (trunk 137954)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"bar", metadata !"bar", metadata !"_Z3barN2N11AE", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void ()* @_Z3barN2N11AE, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"bar", metadata !"bar", metadata !"_Z3barN2N11AE", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3barN2N11AE, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 4] [def] [scope 0] [bar]
 !6 = metadata !{i32 720937, metadata !16} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, metadata !16, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, metadata !16, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{i32 721153, metadata !5, metadata !"youra", metadata !6, i32 16777220, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!10 = metadata !{i32 720898, metadata !17, metadata !11, metadata !"A", i32 3, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null} ; [ DW_TAG_class_type ]
+!10 = metadata !{i32 720898, metadata !17, metadata !11, metadata !"A", i32 3, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_class_type ] [A] [line 3, size 8, align 8, offset 0] [def] [from ]
 !11 = metadata !{i32 720953, metadata !17, null, metadata !"N1", i32 2} ; [ DW_TAG_namespace ]
 !12 = metadata !{i32 720937, metadata !17} ; [ DW_TAG_file_type ]
 !13 = metadata !{i32 4, i32 12, metadata !5, null}
@@ -34,3 +35,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !15 = metadata !{i32 720907, metadata !16, metadata !5, i32 4, i32 19, i32 0} ; [ DW_TAG_lexical_block ]
 !16 = metadata !{metadata !"n2.c", metadata !"/private/tmp"}
 !17 = metadata !{metadata !"./n.h", metadata !"/private/tmp"}
+!18 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/2011-08-18-unique-debug-type.ll b/test/Linker/2011-08-18-unique-debug-type.ll
index d56968d..0e14f46 100644
--- a/test/Linker/2011-08-18-unique-debug-type.ll
+++ b/test/Linker/2011-08-18-unique-debug-type.ll
@@ -10,16 +10,18 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!13}
 
 !0 = metadata !{i32 720913, metadata !12, i32 12, metadata !"clang version 3.0 (trunk 137954)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !12, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 720942, metadata !12, metadata !6, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [foo]
 !6 = metadata !{i32 720937, metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, metadata !12, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, metadata !12, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 1, i32 13, metadata !11, null}
 !11 = metadata !{i32 720907, metadata !12, metadata !5, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/2011-08-18-unique-debug-type2.ll b/test/Linker/2011-08-18-unique-debug-type2.ll
index e724a67..1185100 100644
--- a/test/Linker/2011-08-18-unique-debug-type2.ll
+++ b/test/Linker/2011-08-18-unique-debug-type2.ll
@@ -10,16 +10,18 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!13}
 
 !0 = metadata !{i32 720913, metadata !12, i32 12, metadata !"clang version 3.0 (trunk 137954)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
 !2 = metadata !{i32 0}
 !3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !12, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!5 = metadata !{i32 720942, metadata !12, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 0] [bar]
 !6 = metadata !{i32 720937, metadata !12} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, metadata !12, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, metadata !12, metadata !6, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !10 = metadata !{i32 1, i32 13, metadata !11, null}
 !11 = metadata !{i32 720907, metadata !12, metadata !5, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{metadata !"two.c", metadata !"/private/tmp"}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/DbgDeclare.ll b/test/Linker/DbgDeclare.ll
index 7f3fbde..4cca9d5 100644
--- a/test/Linker/DbgDeclare.ll
+++ b/test/Linker/DbgDeclare.ll
@@ -35,6 +35,7 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 declare void @test(i32, i8**)
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21}
 
 !0 = metadata !{i32 786449, metadata !20, i32 4, metadata !"clang version 3.3 (trunk 173515)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
@@ -42,12 +43,12 @@ declare void @test(i32, i8**)
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !20, null, metadata !"main", metadata !"main", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i8**)* @main, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !9, metadata !10}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ]
+!10 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{i32 786470, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ]
 !13 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !14 = metadata !{i32 786689, metadata !5, metadata !"argc", metadata !6, i32 16777219, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
 !15 = metadata !{i32 3, i32 0, metadata !5, null}
@@ -56,3 +57,4 @@ declare void @test(i32, i8**)
 !18 = metadata !{i32 786443, metadata !20, metadata !5, i32 4, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !19 = metadata !{i32 6, i32 0, metadata !18, null}
 !20 = metadata !{metadata !"main.cpp", metadata !"/private/tmp"}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/DbgDeclare2.ll b/test/Linker/DbgDeclare2.ll
index fbcae30..2649fcc 100644
--- a/test/Linker/DbgDeclare2.ll
+++ b/test/Linker/DbgDeclare2.ll
@@ -48,6 +48,7 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 declare i32 @puts(i8*)
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!27}
 
 !0 = metadata !{i32 786449, metadata !25, i32 4, metadata !"clang version 3.3 (trunk 173515)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
 !1 = metadata !{metadata !2}
@@ -55,12 +56,12 @@ declare i32 @puts(i8*)
 !3 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !26, null, metadata !"print_args", metadata !"print_args", metadata !"test", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32, i8**)* @test, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ]
 !6 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null, metadata !9, metadata !10}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ]
+!10 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{i32 786447, null, null, null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !12} ; [ DW_TAG_pointer_type ]
+!12 = metadata !{i32 786470, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ]
 !13 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
 !14 = metadata !{i32 786689, metadata !5, metadata !"argc", metadata !6, i32 16777220, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
 !15 = metadata !{i32 4, i32 0, metadata !5, null}
@@ -75,3 +76,4 @@ declare i32 @puts(i8*)
 !24 = metadata !{i32 10, i32 0, metadata !19, null}
 !25 = metadata !{metadata !"main.cpp", metadata !"/private/tmp"}
 !26 = metadata !{metadata !"test.cpp", metadata !"/private/tmp"}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/Inputs/type-unique-inheritance-a.ll b/test/Linker/Inputs/type-unique-inheritance-a.ll
new file mode 100644
index 0000000..381210c
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-inheritance-a.ll
@@ -0,0 +1,94 @@
+; CHECK: [ DW_TAG_class_type ] [A]
+; CHECK: [ DW_TAG_class_type ] [Base]
+; CHECK: [ DW_TAG_class_type ] [B]
+; CHECK-NOT: DW_TAG_class_type
+; Content of header files:
+; 
+; class Base;
+; class A : Base {
+;   int x;
+; };
+; 
+; class A;
+; class Base {
+;   int b;
+; };
+; 
+; class B {
+;   int bb;
+;   A *a;
+; };
+; Content of foo.cpp:
+; 
+; #include "b.hpp"
+; #include "a.hpp"
+; 
+; void f(int a) {
+;   A t;
+; }
+; Content of bar.cpp:
+; 
+; #include "b.hpp"
+; #include "a.hpp"
+; void g(int a) {
+;   B t;
+; }
+; 
+; void f(int);
+; int main() {
+;   A a;
+;   f(0);
+;   g(1);
+;   return 0;
+; }
+; ModuleID = 'foo.cpp'
+
+%class.A = type { %class.Base, i32 }
+%class.Base = type { i32 }
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z1fi(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %t = alloca %class.A, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !20), !dbg !21
+  call void @llvm.dbg.declare(metadata !{%class.A* %t}, metadata !22), !dbg !23
+  ret void, !dbg !24
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!19, !25}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git f54e02f969d02d640103db73efc30c45439fceab) (http://llvm.org/git/llvm.git 284353b55896cb1babfaa7add7c0a363245342d2)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !14, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/inher/foo.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"foo.cpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4, metadata !8}
+!4 = metadata !{i32 786434, metadata !5, null, metadata !"A", i32 3, i64 64, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 3, size 64, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
+!6 = metadata !{metadata !7, metadata !13}
+!7 = metadata !{i32 786460, null, metadata !"_ZTS1A", null, i32 0, i64 0, i64 0, i64 0, i32 1, metadata !8} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [private] [from Base]
+!8 = metadata !{i32 786434, metadata !9, null, metadata !"Base", i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !10, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_class_type ] [Base] [line 3, size 32, align 32, offset 0] [def] [from ]
+!9 = metadata !{metadata !"./b.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
+!10 = metadata !{metadata !11}
+!11 = metadata !{i32 786445, metadata !9, metadata !"_ZTS4Base", metadata !"b", i32 4, i64 32, i64 32, i64 0, i32 1, metadata !12} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 0] [private] [from int]
+!12 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!13 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1A", metadata !"x", i32 4, i64 32, i64 32, i64 32, i32 1, metadata !12} ; [ DW_TAG_member ] [x] [line 4, size 32, align 32, offset 32] [private] [from int]
+!14 = metadata !{metadata !15}
+!15 = metadata !{i32 786478, metadata !1, metadata !16, metadata !"f", metadata !"f", metadata !"_Z1fi", i32 5, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1fi, null, null, metadata !2, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [f]
+!16 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/inher/foo.cpp]
+!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{null, metadata !12}
+!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!20 = metadata !{i32 786689, metadata !15, metadata !"a", metadata !16, i32 16777221, metadata !12, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 5]
+!21 = metadata !{i32 5, i32 0, metadata !15, null}
+!22 = metadata !{i32 786688, metadata !15, metadata !"t", metadata !16, i32 6, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 6]
+!23 = metadata !{i32 6, i32 0, metadata !15, null}
+!24 = metadata !{i32 7, i32 0, metadata !15, null}
+!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/Inputs/type-unique-inheritance-b.ll b/test/Linker/Inputs/type-unique-inheritance-b.ll
new file mode 100644
index 0000000..0cd43f6
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-inheritance-b.ll
@@ -0,0 +1,81 @@
+; ModuleID = 'bar.cpp'
+
+%class.B = type { i32, %class.A* }
+%class.A = type { %class.Base, i32 }
+%class.Base = type { i32 }
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z1gi(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %t = alloca %class.B, align 8
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !28), !dbg !29
+  call void @llvm.dbg.declare(metadata !{%class.B* %t}, metadata !30), !dbg !31
+  ret void, !dbg !32
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+; Function Attrs: ssp uwtable
+define i32 @main() #2 {
+entry:
+  %retval = alloca i32, align 4
+  %a = alloca %class.A, align 4
+  store i32 0, i32* %retval
+  call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !33), !dbg !34
+  call void @_Z1fi(i32 0), !dbg !35
+  call void @_Z1gi(i32 1), !dbg !36
+  ret i32 0, !dbg !37
+}
+
+declare void @_Z1fi(i32) #3
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!27, !38}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git f54e02f969d02d640103db73efc30c45439fceab) (http://llvm.org/git/llvm.git 284353b55896cb1babfaa7add7c0a363245342d2)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !19, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/inher/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"bar.cpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4, metadata !11, metadata !15}
+!4 = metadata !{i32 786434, metadata !5, null, metadata !"B", i32 7, i64 128, i64 64, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS1B"} ; [ DW_TAG_class_type ] [B] [line 7, size 128, align 64, offset 0] [def] [from ]
+!5 = metadata !{metadata !"./b.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
+!6 = metadata !{metadata !7, metadata !9}
+!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1B", metadata !"bb", i32 8, i64 32, i64 32, i64 0, i32 1, metadata !8} ; [ DW_TAG_member ] [bb] [line 8, size 32, align 32, offset 0] [private] [from int]
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 786445, metadata !5, metadata !"_ZTS1B", metadata !"a", i32 9, i64 64, i64 64, i64 64, i32 1, metadata !10} ; [ DW_TAG_member ] [a] [line 9, size 64, align 64, offset 64] [private] [from ]
+!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!11 = metadata !{i32 786434, metadata !12, null, metadata !"A", i32 3, i64 64, i64 32, i32 0, i32 0, null, metadata !13, i32 0, null, null, metadata !"_ZTS1A"} ; [ DW_TAG_class_type ] [A] [line 3, size 64, align 32, offset 0] [def] [from ]
+!12 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/inher"}
+!13 = metadata !{metadata !14, metadata !18}
+!14 = metadata !{i32 786460, null, metadata !"_ZTS1A", null, i32 0, i64 0, i64 0, i64 0, i32 1, metadata !15} ; [ DW_TAG_inheritance ] [line 0, size 0, align 0, offset 0] [private] [from Base]
+!15 = metadata !{i32 786434, metadata !5, null, metadata !"Base", i32 3, i64 32, i64 32, i32 0, i32 0, null, metadata !16, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_class_type ] [Base] [line 3, size 32, align 32, offset 0] [def] [from ]
+!16 = metadata !{metadata !17}
+!17 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"b", i32 4, i64 32, i64 32, i64 0, i32 1, metadata !8} ; [ DW_TAG_member ] [b] [line 4, size 32, align 32, offset 0] [private] [from int]
+!18 = metadata !{i32 786445, metadata !12, metadata !"_ZTS1A", metadata !"x", i32 4, i64 32, i64 32, i64 32, i32 1, metadata !8} ; [ DW_TAG_member ] [x] [line 4, size 32, align 32, offset 32] [private] [from int]
+!19 = metadata !{metadata !20, metadata !24}
+!20 = metadata !{i32 786478, metadata !1, metadata !21, metadata !"g", metadata !"g", metadata !"_Z1gi", i32 4, metadata !22, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1gi, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
+!21 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/inher/bar.cpp]
+!22 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !23, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!23 = metadata !{null, metadata !8}
+!24 = metadata !{i32 786478, metadata !1, metadata !21, metadata !"main", metadata !"main", metadata !"", i32 9, metadata !25, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 9} ; [ DW_TAG_subprogram ] [line 9] [def] [main]
+!25 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !26, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!26 = metadata !{metadata !8}
+!27 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!28 = metadata !{i32 786689, metadata !20, metadata !"a", metadata !21, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!29 = metadata !{i32 4, i32 0, metadata !20, null}
+!30 = metadata !{i32 786688, metadata !20, metadata !"t", metadata !21, i32 5, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 5]
+!31 = metadata !{i32 5, i32 0, metadata !20, null}
+!32 = metadata !{i32 6, i32 0, metadata !20, null}
+!33 = metadata !{i32 786688, metadata !24, metadata !"a", metadata !21, i32 10, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 10]
+!34 = metadata !{i32 10, i32 0, metadata !24, null}
+!35 = metadata !{i32 11, i32 0, metadata !24, null}
+!36 = metadata !{i32 12, i32 0, metadata !24, null}
+!37 = metadata !{i32 13, i32 0, metadata !24, null}
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/Inputs/type-unique-simple2-a.ll b/test/Linker/Inputs/type-unique-simple2-a.ll
new file mode 100644
index 0000000..63470f3
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-simple2-a.ll
@@ -0,0 +1,88 @@
+; Make sure the backend generates a single DIE and uses ref_addr.
+; CHECK: 0x[[BASE:.*]]: DW_TAG_structure_type
+; CHECK-NEXT: DW_AT_name {{.*}} = "Base"
+; CHECK-NOT: DW_TAG_structure_type
+; CHECK: 0x[[INT:.*]]: DW_TAG_base_type
+; CHECK-NEXT: DW_AT_name {{.*}} = "int"
+; CHECK-NOT: DW_TAG_base_type
+
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_AT_type [DW_FORM_ref_addr] {{.*}}[[INT]])
+; CHECK: DW_TAG_variable
+; CHECK: DW_AT_type [DW_FORM_ref_addr] {{.*}}[[BASE]])
+
+; Make sure llvm-link only generates a single copy of the struct.
+; LINK: DW_TAG_structure_type
+; LINK-NOT: DW_TAG_structure_type
+
+; Content of header files:
+; struct Base {
+;   int a;
+;   Base *b;
+; };
+; Content of foo.cpp:
+; 
+; #include "a.hpp"
+; void f(int a) {
+;   Base t;
+; }
+; Content of bar.cpp:
+; 
+; #include "a.hpp"
+; void f(int);
+; void g(int a) {
+;   Base t;
+; }
+; int main() {
+;   f(0);
+;   g(1);
+;   return 0;
+; }
+; ModuleID = 'foo.cpp'
+
+%struct.Base = type { i32, %struct.Base* }
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z1fi(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %t = alloca %struct.Base, align 8
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !17), !dbg !18
+  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !19), !dbg !20
+  ret void, !dbg !21
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!16, !22}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git 8a3f9e46cb988d2c664395b21910091e3730ae82) (http://llvm.org/git/llvm.git 4699e9549358bc77824a59114548eecc3f7c523c)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [foo.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"foo.cpp", metadata !"."}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !5, null, metadata !"Base", i32 1, i64 128, i64 64, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 128, align 64, offset 0] [def] [from ]
+!5 = metadata !{metadata !"./a.hpp", metadata !"."}
+!6 = metadata !{metadata !7, metadata !9}
+!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"b", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !10} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from ]
+!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4Base"}
+!11 = metadata !{metadata !12}
+!12 = metadata !{i32 786478, metadata !1, metadata !13, metadata !"f", metadata !"f", metadata !"_Z1fi", i32 3, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1fi, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!13 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [foo.cpp]
+!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{null, metadata !8}
+!16 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!17 = metadata !{i32 786689, metadata !12, metadata !"a", metadata !13, i32 16777219, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!18 = metadata !{i32 3, i32 0, metadata !12, null}
+!19 = metadata !{i32 786688, metadata !12, metadata !"t", metadata !13, i32 4, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 4]
+!20 = metadata !{i32 4, i32 0, metadata !12, null}
+!21 = metadata !{i32 5, i32 0, metadata !12, null}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/Inputs/type-unique-simple2-b.ll b/test/Linker/Inputs/type-unique-simple2-b.ll
new file mode 100644
index 0000000..f564d81
--- /dev/null
+++ b/test/Linker/Inputs/type-unique-simple2-b.ll
@@ -0,0 +1,67 @@
+; ModuleID = 'bar.cpp'
+
+%struct.Base = type { i32, %struct.Base* }
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z1gi(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %t = alloca %struct.Base, align 8
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !20), !dbg !21
+  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !22), !dbg !23
+  ret void, !dbg !24
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+; Function Attrs: ssp uwtable
+define i32 @main() #2 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  call void @_Z1fi(i32 0), !dbg !25
+  call void @_Z1gi(i32 1), !dbg !26
+  ret i32 0, !dbg !27
+}
+
+declare void @_Z1fi(i32) #3
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!19, !28}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git 8a3f9e46cb988d2c664395b21910091e3730ae82) (http://llvm.org/git/llvm.git 4699e9549358bc77824a59114548eecc3f7c523c)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !11, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [bar.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"bar.cpp", metadata !"."}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !5, null, metadata !"Base", i32 1, i64 128, i64 64, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 128, align 64, offset 0] [def] [from ]
+!5 = metadata !{metadata !"./a.hpp", metadata !"."}
+!6 = metadata !{metadata !7, metadata !9}
+!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"b", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !10} ; [ DW_TAG_member ] [b] [line 3, size 64, align 64, offset 64] [from ]
+!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !"_ZTS4Base"}
+!11 = metadata !{metadata !12, metadata !16}
+!12 = metadata !{i32 786478, metadata !1, metadata !13, metadata !"g", metadata !"g", metadata !"_Z1gi", i32 4, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1gi, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
+!13 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [bar.cpp]
+!14 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!15 = metadata !{null, metadata !8}
+!16 = metadata !{i32 786478, metadata !1, metadata !13, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !17, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!17 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!18 = metadata !{metadata !8}
+!19 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!20 = metadata !{i32 786689, metadata !12, metadata !"a", metadata !13, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!21 = metadata !{i32 4, i32 0, metadata !12, null}
+!22 = metadata !{i32 786688, metadata !12, metadata !"t", metadata !13, i32 5, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 5]
+!23 = metadata !{i32 5, i32 0, metadata !12, null}
+!24 = metadata !{i32 6, i32 0, metadata !12, null}
+!25 = metadata !{i32 8, i32 0, metadata !16, null} ; [ DW_TAG_imported_declaration ]
+!26 = metadata !{i32 9, i32 0, metadata !16, null}
+!27 = metadata !{i32 10, i32 0, metadata !16, null}
+!28 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/lit.local.cfg b/test/Linker/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Linker/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Linker/prefixdata.ll b/test/Linker/prefixdata.ll
new file mode 100644
index 0000000..1f11dc7
--- /dev/null
+++ b/test/Linker/prefixdata.ll
@@ -0,0 +1,9 @@
+; RUN: echo > %t.ll
+; RUN: llvm-link %t.ll %s -S -o - | FileCheck %s
+
+@i = linkonce_odr global i32 1
+
+; CHECK: define void @f() prefix i32* @i
+define void @f() prefix i32* @i {
+  ret void
+}
diff --git a/test/Linker/transitive-lazy-link.ll b/test/Linker/transitive-lazy-link.ll
index 32b9d21..c1cacae 100644
--- a/test/Linker/transitive-lazy-link.ll
+++ b/test/Linker/transitive-lazy-link.ll
@@ -1,6 +1,11 @@
 ; @f and @g are lazily linked. @f requires @g - ensure @g is correctly linked.
 
-; RUN: echo -e "declare i32 @f(i32)\ndefine i32 @h(i32 %x) {\n%1 = call i32 @f(i32 %x)\nret i32 %1\n}" | llvm-as >%t.1.bc
+; RUN: echo "declare i32 @f(i32)" > %t.1.ll
+; RUN: echo "define i32 @h(i32 %x) {" >> %t.1.ll
+; RUN: echo "  %1 = call i32 @f(i32 %x)" >> %t.1.ll
+; RUN: echo "  ret i32 %1" >> %t.1.ll
+; RUN: echo "}" >> %t.1.ll
+; RUN: llvm-as < %t.1.ll > %t.1.bc
 ; RUN: llvm-as < %s > %t.2.bc
 ; RUN: llvm-link %t.1.bc %t.2.bc
 
diff --git a/test/Linker/type-unique-inheritance.ll b/test/Linker/type-unique-inheritance.ll
new file mode 100644
index 0000000..1ba1b08
--- /dev/null
+++ b/test/Linker/type-unique-inheritance.ll
@@ -0,0 +1 @@
+; RUN: llvm-link %S/Inputs/type-unique-inheritance-a.ll %S/Inputs/type-unique-inheritance-b.ll -S -o - | FileCheck %S/Inputs/type-unique-inheritance-a.ll
diff --git a/test/Linker/type-unique-simple-a.ll b/test/Linker/type-unique-simple-a.ll
new file mode 100644
index 0000000..4bfdff9
--- /dev/null
+++ b/test/Linker/type-unique-simple-a.ll
@@ -0,0 +1,91 @@
+; REQUIRES: object-emission
+
+; RUN: llvm-link %s %p/type-unique-simple-b.ll -S -o %t
+; RUN: cat %t | FileCheck %s -check-prefix=LINK
+; RUN: llc -filetype=obj -O0 < %t > %t2
+; RUN: llvm-dwarfdump -debug-dump=info %t2 | FileCheck %s
+
+; Make sure the backend generates a single DIE and uses ref_addr.
+; CHECK: 0x[[BASE:.*]]: DW_TAG_structure_type
+; CHECK-NEXT: DW_AT_name {{.*}} = "Base"
+; CHECK-NOT: DW_TAG_structure_type
+; CHECK: 0x[[INT:.*]]: DW_TAG_base_type
+; CHECK-NEXT: DW_AT_name {{.*}} = "int"
+; CHECK-NOT: DW_TAG_base_type
+
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_AT_type [DW_FORM_ref_addr] {{.*}}[[INT]])
+; CHECK: DW_TAG_variable
+; CHECK: DW_AT_type [DW_FORM_ref_addr] {{.*}}[[BASE]])
+
+; Make sure llvm-link only generates a single copy of the struct.
+; LINK: DW_TAG_structure_type
+; LINK-NOT: DW_TAG_structure_type
+; Content of header files:
+; struct Base {
+;   int a;
+; };
+; Content of foo.cpp:
+; 
+; #include "a.hpp"
+; void f(int a) {
+;   Base t;
+; }
+; Content of bar.cpp:
+; 
+; #include "a.hpp"
+; void f(int);
+; void g(int a) {
+;   Base t;
+; }
+; int main() {
+;   f(0);
+;   g(1);
+;   return 0;
+; }
+; ModuleID = 'foo.cpp'
+
+%struct.Base = type { i32 }
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z1fi(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %t = alloca %struct.Base, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !15), !dbg !16
+  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !17), !dbg !18
+  ret void, !dbg !19
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!14, !20}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git c23b1db6268c8e7ce64026d57d1510c1aac200a0) (http://llvm.org/git/llvm.git 09b98fe3978eddefc2145adc1056cf21580ce945)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/simple/foo.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"foo.cpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !5, null, metadata !"Base", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
+!6 = metadata !{metadata !7}
+!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !10}
+!10 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"f", metadata !"f", metadata !"_Z1fi", i32 3, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1fi, null, null, metadata !2, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
+!11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/simple/foo.cpp]
+!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{null, metadata !8}
+!14 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!15 = metadata !{i32 786689, metadata !10, metadata !"a", metadata !11, i32 16777219, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 3]
+!16 = metadata !{i32 3, i32 0, metadata !10, null}
+!17 = metadata !{i32 786688, metadata !10, metadata !"t", metadata !11, i32 4, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 4]
+!18 = metadata !{i32 4, i32 0, metadata !10, null}
+!19 = metadata !{i32 5, i32 0, metadata !10, null}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/type-unique-simple-b.ll b/test/Linker/type-unique-simple-b.ll
new file mode 100644
index 0000000..c46e67f
--- /dev/null
+++ b/test/Linker/type-unique-simple-b.ll
@@ -0,0 +1,67 @@
+; RUN: true
+
+; ModuleID = 'bar.cpp'
+
+%struct.Base = type { i32 }
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z1gi(i32 %a) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %t = alloca %struct.Base, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void @llvm.dbg.declare(metadata !{i32* %a.addr}, metadata !18), !dbg !19
+  call void @llvm.dbg.declare(metadata !{%struct.Base* %t}, metadata !20), !dbg !21
+  ret void, !dbg !22
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+; Function Attrs: ssp uwtable
+define i32 @main() #2 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  call void @_Z1fi(i32 0), !dbg !23
+  call void @_Z1gi(i32 1), !dbg !24
+  ret i32 0, !dbg !25
+}
+
+declare void @_Z1fi(i32) #3
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!17, !26}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (http://llvm.org/git/clang.git c23b1db6268c8e7ce64026d57d1510c1aac200a0) (http://llvm.org/git/llvm.git 09b98fe3978eddefc2145adc1056cf21580ce945)", i1 false, metadata !"", i32 0, metadata !2, metadata !3, metadata !9, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/mren/c_testing/type_unique_air/simple/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"bar.cpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786451, metadata !5, null, metadata !"Base", i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, metadata !"_ZTS4Base"} ; [ DW_TAG_structure_type ] [Base] [line 1, size 32, align 32, offset 0] [def] [from ]
+!5 = metadata !{metadata !"./a.hpp", metadata !"/Users/mren/c_testing/type_unique_air/simple"}
+!6 = metadata !{metadata !7}
+!7 = metadata !{i32 786445, metadata !5, metadata !"_ZTS4Base", metadata !"a", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [a] [line 2, size 32, align 32, offset 0] [from int]
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !10, metadata !14}
+!10 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"g", metadata !"g", metadata !"_Z1gi", i32 4, metadata !12, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @_Z1gi, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [g]
+!11 = metadata !{i32 786473, metadata !1}         ; [ DW_TAG_file_type ] [/Users/mren/c_testing/type_unique_air/simple/bar.cpp]
+!12 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !13, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!13 = metadata !{null, metadata !8}
+!14 = metadata !{i32 786478, metadata !1, metadata !11, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !15, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, metadata !2, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [main]
+!15 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{metadata !8}
+!17 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!18 = metadata !{i32 786689, metadata !10, metadata !"a", metadata !11, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [a] [line 4]
+!19 = metadata !{i32 4, i32 0, metadata !10, null}
+!20 = metadata !{i32 786688, metadata !10, metadata !"t", metadata !11, i32 5, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [t] [line 5]
+!21 = metadata !{i32 5, i32 0, metadata !10, null}
+!22 = metadata !{i32 6, i32 0, metadata !10, null}
+!23 = metadata !{i32 8, i32 0, metadata !14, null} ; [ DW_TAG_imported_declaration ]
+!24 = metadata !{i32 9, i32 0, metadata !14, null}
+!25 = metadata !{i32 10, i32 0, metadata !14, null}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Linker/type-unique-simple2.ll b/test/Linker/type-unique-simple2.ll
new file mode 100644
index 0000000..ead91df
--- /dev/null
+++ b/test/Linker/type-unique-simple2.ll
@@ -0,0 +1,6 @@
+; REQUIRES: object-emission
+
+; RUN: llvm-link %S/Inputs/type-unique-simple2-a.ll %S/Inputs/type-unique-simple2-b.ll -S -o %t
+; RUN: cat %t | FileCheck %S/Inputs/type-unique-simple2-a.ll -check-prefix=LINK
+; RUN: llc -filetype=obj -O0 < %t > %t2
+; RUN: llvm-dwarfdump -debug-dump=info %t2 | FileCheck %S/Inputs/type-unique-simple2-a.ll
diff --git a/test/Linker/unnamed-addr-err-a.ll b/test/Linker/unnamed-addr-err-a.ll
new file mode 100644
index 0000000..4872098
--- /dev/null
+++ b/test/Linker/unnamed-addr-err-a.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-link %s %p/unnamed-addr-err-b.ll -S -o - 2>&1 | FileCheck %s
+
+@foo = appending unnamed_addr global [1 x i32] [i32 42]
+; CHECK: Appending variables with different unnamed_addr need to be linked
diff --git a/test/Linker/unnamed-addr-err-b.ll b/test/Linker/unnamed-addr-err-b.ll
new file mode 100644
index 0000000..5e5fed9
--- /dev/null
+++ b/test/Linker/unnamed-addr-err-b.ll
@@ -0,0 +1,4 @@
+; This file is for use with unnamed-addr-err-a.ll
+; RUN: true
+
+@foo = appending global [1 x i32] [i32 42]
diff --git a/test/Linker/unnamed-addr1-a.ll b/test/Linker/unnamed-addr1-a.ll
index e9c03ee..adaa400 100644
--- a/test/Linker/unnamed-addr1-a.ll
+++ b/test/Linker/unnamed-addr1-a.ll
@@ -1,27 +1,46 @@
-; RUN: llvm-link %s %p/unnamed-addr1-b.ll -S -o - | sort | FileCheck %s
+; RUN: llvm-link %s %p/unnamed-addr1-b.ll -S -o - | FileCheck %s
 
 ; Only in this file
-@a = common global i32 0
-; CHECK: @a = common global i32 0
-@b = common unnamed_addr global i32 0
-; CHECK: @b = common unnamed_addr global i32 0
+@global-a = common global i32 0
+; CHECK-DAG: @global-a = common global i32 0
+@global-b = common unnamed_addr global i32 0
+; CHECK-DAG: @global-b = common unnamed_addr global i32 0
+
+define weak void @func-a() { ret void }
+; CHECK-DAG: define weak void @func-a() {
+define weak void @func-b() unnamed_addr { ret void }
+; CHECK-DAG: define weak void @func-b() unnamed_addr {
 
 ; Other file has unnamed_addr definition
-@c = common unnamed_addr global i32 0
-; CHECK: @c = common unnamed_addr global i32 0
-@d = external global i32
-; CHECK: @d = unnamed_addr global i32 42
-@e = external unnamed_addr global i32
-; CHECK: @e = unnamed_addr global i32 42
-@f = weak global i32 42
-; CHECK: @f = unnamed_addr global i32 42
+@global-c = common unnamed_addr global i32 0
+; CHECK-DAG: @global-c = common unnamed_addr global i32 0
+@global-d = external global i32
+; CHECK-DAG: @global-d = global i32 42
+@global-e = external unnamed_addr global i32
+; CHECK-DAG: @global-e = unnamed_addr global i32 42
+@global-f = weak global i32 42
+; CHECK-DAG: @global-f = global i32 42
+
+declare void @func-c()
+; CHECK-DAG: define weak void @func-c() {
+define weak void @func-d() { ret void }
+; CHECK-DAG: define weak void @func-d() {
+define weak void @func-e() unnamed_addr { ret void }
+; CHECK-DAG: define weak void @func-e() unnamed_addr {
 
 ; Other file has non-unnamed_addr definition
-@g = common unnamed_addr global i32 0
-; CHECK: @g = common unnamed_addr global i32 0
-@h = external global i32
-; CHECK: @h = global i32 42
-@i = external unnamed_addr global i32
-; CHECK: @i = global i32 42
-@j = weak global i32 42
-; CHECK: @j = global i32 42
+@global-g = common unnamed_addr global i32 0
+; CHECK-DAG: @global-g = common global i32 0
+@global-h = external global i32
+; CHECK-DAG: @global-h = global i32 42
+@global-i = external unnamed_addr global i32
+; CHECK-DAG: @global-i = global i32 42
+@global-j = weak global i32 42
+; CHECK-DAG: @global-j = global i32 42
+
+declare void @func-g()
+; CHECK-DAG: define weak void @func-g() {
+define weak void @func-h() { ret void }
+; CHECK-DAG: define weak void @func-h() {
+define weak void @func-i() unnamed_addr { ret void }
+; CHECK-DAG: define weak void @func-i() {
diff --git a/test/Linker/unnamed-addr1-b.ll b/test/Linker/unnamed-addr1-b.ll
index 7d94dc1..aa1507b 100644
--- a/test/Linker/unnamed-addr1-b.ll
+++ b/test/Linker/unnamed-addr1-b.ll
@@ -1,12 +1,20 @@
 ; This file is for use with unnamed-addr1-a.ll
 ; RUN: true
 
-@c = common unnamed_addr global i32 42
-@d = unnamed_addr global i32 42
-@e = unnamed_addr global i32 42
-@f = unnamed_addr global i32 42
-
-@g = common global i32 42
-@h = global i32 42
-@i = global i32 42
-@j = global i32 42
+@global-c = common unnamed_addr global i32 42
+@global-d = unnamed_addr global i32 42
+@global-e = unnamed_addr global i32 42
+@global-f = unnamed_addr global i32 42
+
+define weak void @func-c() unnamed_addr { ret void }
+define weak void @func-d() unnamed_addr { ret void }
+define weak void @func-e() unnamed_addr { ret void }
+
+@global-g = common global i32 42
+@global-h = global i32 42
+@global-i = global i32 42
+@global-j = global i32 42
+
+define weak void @func-g() { ret void }
+define weak void @func-h() { ret void }
+define weak void @func-i() { ret void }
diff --git a/test/MC/AArch64/adrp-relocation.s b/test/MC/AArch64/adrp-relocation.s
new file mode 100644
index 0000000..3bcef34
--- /dev/null
+++ b/test/MC/AArch64/adrp-relocation.s
@@ -0,0 +1,18 @@
+// RUN: llvm-mc -triple=aarch64-linux-gnu -filetype=obj -o - %s| llvm-readobj -r - | FileCheck %s
+        .text
+// These should produce an ADRP/ADD pair to calculate the address of
+// testfn. The important point is that LLVM shouldn't think it can deal with the
+// relocation on the ADRP itself (even though it knows everything about the
+// relative offsets of testfn and foo) because its value depends on where this
+// object file's .text section gets relocated in memory.
+        adrp x0, sym
+        adrp x0, :got:sym
+        adrp x0, :gottprel:sym
+        adrp x0, :tlsdesc:sym
+
+        .global sym
+sym:
+// CHECK: R_AARCH64_ADR_PREL_PG_HI21 sym
+// CHECK: R_AARCH64_ADR_GOT_PAGE sym
+// CHECK: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 sym
+// CHECK: R_AARCH64_TLSDESC_ADR_PAGE sym
diff --git a/test/MC/AArch64/basic-a64-instructions.s b/test/MC/AArch64/basic-a64-instructions.s
index e4f6b21..a50efb3 100644
--- a/test/MC/AArch64/basic-a64-instructions.s
+++ b/test/MC/AArch64/basic-a64-instructions.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+fp-armv8 < %s | FileCheck %s
   .globl _func
 
 // Check that the assembler can handle the documented syntax from the ARM ARM.
diff --git a/test/MC/AArch64/basic-pic.s b/test/MC/AArch64/basic-pic.s
new file mode 100644
index 0000000..a10874d
--- /dev/null
+++ b/test/MC/AArch64/basic-pic.s
@@ -0,0 +1,98 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -filetype=obj %s -o -| llvm-objdump -r - | FileCheck %s
+
+// CHECK: RELOCATION RECORDS FOR [.rela.text]
+
+	.file	"/home/espindola/llvm/llvm/test/CodeGen/AArch64/basic-pic.ll"
+	.text
+	.globl	get_globalvar
+	.type	get_globalvar,@function
+get_globalvar:                          // @get_globalvar
+	.cfi_startproc
+// BB#0:
+	adrp	x0, :got:var
+	ldr	x0, [x0, #:got_lo12:var]
+	ldr	 w0, [x0]
+	ret
+.Ltmp0:
+	.size	get_globalvar, .Ltmp0-get_globalvar
+	.cfi_endproc
+
+// CHECK: R_AARCH64_ADR_GOT_PAGE var
+// CHECK: R_AARCH64_LD64_GOT_LO12_NC var
+
+	.globl	get_globalvaraddr
+	.type	get_globalvaraddr,@function
+get_globalvaraddr:                      // @get_globalvaraddr
+	.cfi_startproc
+// BB#0:
+	adrp	x0, :got:var
+	ldr	x0, [x0, #:got_lo12:var]
+	ret
+.Ltmp1:
+	.size	get_globalvaraddr, .Ltmp1-get_globalvaraddr
+	.cfi_endproc
+// CHECK: R_AARCH64_ADR_GOT_PAGE var
+// CHECK: R_AARCH64_LD64_GOT_LO12_NC var
+
+	.globl	get_hiddenvar
+	.type	get_hiddenvar,@function
+get_hiddenvar:                          // @get_hiddenvar
+	.cfi_startproc
+// BB#0:
+	adrp	x0, hiddenvar
+	ldr	w0, [x0, #:lo12:hiddenvar]
+	ret
+.Ltmp2:
+	.size	get_hiddenvar, .Ltmp2-get_hiddenvar
+	.cfi_endproc
+// CHECK: R_AARCH64_ADR_PREL_PG_HI21 hiddenvar
+// CHECK: R_AARCH64_LDST32_ABS_LO12_NC hiddenvar
+
+	.globl	get_hiddenvaraddr
+	.type	get_hiddenvaraddr,@function
+get_hiddenvaraddr:                      // @get_hiddenvaraddr
+	.cfi_startproc
+// BB#0:
+	adrp	x0, hiddenvar
+	add	x0, x0, #:lo12:hiddenvar
+	ret
+.Ltmp3:
+	.size	get_hiddenvaraddr, .Ltmp3-get_hiddenvaraddr
+	.cfi_endproc
+// CHECK: R_AARCH64_ADR_PREL_PG_HI21 hiddenvar
+// CHECK: R_AARCH64_ADD_ABS_LO12_NC hiddenvar
+
+	.globl	get_func
+	.type	get_func,@function
+get_func:                               // @get_func
+	.cfi_startproc
+// BB#0:
+	adrp	x0, :got:get_func
+	ldr	x0, [x0, #:got_lo12:get_func]
+	ret
+.Ltmp4:
+	.size	get_func, .Ltmp4-get_func
+	.cfi_endproc
+
+// Particularly important that the ADRP gets a relocation, LLVM tends to think
+// it can relax it because it knows where get_func is. It can't!
+// CHECK: R_AARCH64_ADR_GOT_PAGE get_func
+// CHECK: R_AARCH64_LD64_GOT_LO12_NC get_func
+
+	.type	var,@object             // @var
+	.bss
+	.globl	var
+	.align	2
+var:
+	.word	0                       // 0x0
+	.size	var, 4
+
+	.hidden	hiddenvar               // @hiddenvar
+	.type	hiddenvar,@object
+	.globl	hiddenvar
+	.align	2
+hiddenvar:
+	.word	0                       // 0x0
+	.size	hiddenvar, 4
+
+
diff --git a/test/MC/AArch64/elf-extern.s b/test/MC/AArch64/elf-extern.s
new file mode 100644
index 0000000..dfa3fb0
--- /dev/null
+++ b/test/MC/AArch64/elf-extern.s
@@ -0,0 +1,33 @@
+// RUN: llvm-mc < %s -triple=aarch64-none-linux-gnu -filetype=obj | llvm-readobj -r | FileCheck %s
+
+// External symbols are a different concept to global variables but should still
+// get relocations and so on when used.
+
+	.file	"<stdin>"
+	.text
+	.globl	check_extern
+	.type	check_extern,@function
+check_extern:                           // @check_extern
+	.cfi_startproc
+// BB#0:
+	sub	sp, sp, #16
+.Ltmp2:
+	.cfi_def_cfa sp, 16
+	str	x30, [sp, #8]           // 8-byte Folded Spill
+.Ltmp3:
+	.cfi_offset x30, -8
+	bl	memcpy
+	mov	 x0, xzr
+	ldr	x30, [sp, #8]           // 8-byte Folded Reload
+	add	sp, sp, #16
+	ret
+.Ltmp4:
+	.size	check_extern, .Ltmp4-check_extern
+	.cfi_endproc
+
+
+// CHECK: Relocations [
+// CHECK:   Section (2) .rela.text {
+// CHECK:     0x{{[0-9,A-F]+}} R_AARCH64_CALL26 memcpy
+// CHECK:   }
+// CHECK: ]
diff --git a/test/MC/AArch64/elf-reloc-ldstunsimm.s b/test/MC/AArch64/elf-reloc-ldstunsimm.s
index 64bf63a..371e7e5 100644
--- a/test/MC/AArch64/elf-reloc-ldstunsimm.s
+++ b/test/MC/AArch64/elf-reloc-ldstunsimm.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=aarch64-none-linux-gnu -filetype=obj %s -o - | \
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+fp-armv8 -filetype=obj %s -o - | \
 // RUN:   llvm-readobj -r | FileCheck -check-prefix=OBJ %s
 
         ldrb w0, [sp, #:lo12:some_label]
diff --git a/test/MC/AArch64/inline-asm-modifiers.s b/test/MC/AArch64/inline-asm-modifiers.s
new file mode 100644
index 0000000..cf34a95
--- /dev/null
+++ b/test/MC/AArch64/inline-asm-modifiers.s
@@ -0,0 +1,209 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -filetype=obj -mattr=+fp-armv8 < %s | llvm-objdump -r - | FileCheck %s
+
+	.file	"<stdin>"
+	.text
+	.globl	test_inline_modifier_L
+	.type	test_inline_modifier_L,@function
+test_inline_modifier_L:                 // @test_inline_modifier_L
+// BB#0:
+	//APP
+	add x0, x0, #:lo12:var_simple
+	//NO_APP
+	//APP
+	ldr x0, [x0, #:got_lo12:var_got]
+	//NO_APP
+	//APP
+	add x0, x0, #:tlsdesc_lo12:var_tlsgd
+	//NO_APP
+	//APP
+	add x0, x0, #:dtprel_lo12:var_tlsld
+	//NO_APP
+	//APP
+	ldr x0, [x0, #:gottprel_lo12:var_tlsie]
+	//NO_APP
+	//APP
+	add x0, x0, #:tprel_lo12:var_tlsle
+	//NO_APP
+	ret
+.Ltmp0:
+	.size	test_inline_modifier_L, .Ltmp0-test_inline_modifier_L
+
+// CHECK: R_AARCH64_ADD_ABS_LO12_NC var_simple
+// CHECK: R_AARCH64_LD64_GOT_LO12_NC var_got
+// CHECK: R_AARCH64_TLSDESC_ADD_LO12_NC var_tlsgd
+// CHECK: R_AARCH64_TLSLD_ADD_DTPREL_LO12 var_tlsld
+// CHECK: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC var_tlsie
+// CHECK: R_AARCH64_TLSLE_ADD_TPREL_LO12 var_tlsle
+
+	.globl	test_inline_modifier_G
+	.type	test_inline_modifier_G,@function
+test_inline_modifier_G:                 // @test_inline_modifier_G
+// BB#0:
+	//APP
+	add x0, x0, #:dtprel_hi12:var_tlsld, lsl #12
+	//NO_APP
+	//APP
+	add x0, x0, #:tprel_hi12:var_tlsle, lsl #12
+	//NO_APP
+	ret
+.Ltmp1:
+	.size	test_inline_modifier_G, .Ltmp1-test_inline_modifier_G
+
+// CHECK: R_AARCH64_TLSLD_ADD_DTPREL_HI12 var_tlsld
+// CHECK: R_AARCH64_TLSLE_ADD_TPREL_HI12 var_tlsle
+
+	.globl	test_inline_modifier_A
+	.type	test_inline_modifier_A,@function
+test_inline_modifier_A:                 // @test_inline_modifier_A
+// BB#0:
+	//APP
+	adrp x0, var_simple
+	//NO_APP
+	//APP
+	adrp x0, :got:var_got
+	//NO_APP
+	//APP
+	adrp x0, :tlsdesc:var_tlsgd
+	//NO_APP
+	//APP
+	adrp x0, :gottprel:var_tlsie
+	//NO_APP
+	ret
+.Ltmp2:
+	.size	test_inline_modifier_A, .Ltmp2-test_inline_modifier_A
+// CHECK: R_AARCH64_ADR_PREL_PG_HI21 var_simple
+// CHECK: R_AARCH64_ADR_GOT_PAGE var_got
+// CHECK: R_AARCH64_TLSDESC_ADR_PAGE var_tlsgd
+// CHECK: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 var_tlsie
+
+	.globl	test_inline_modifier_wx
+	.type	test_inline_modifier_wx,@function
+test_inline_modifier_wx:                // @test_inline_modifier_wx
+// BB#0:
+	mov	 w2, w0
+	//APP
+	add w2, w2, w2
+	//NO_APP
+	mov	 w2, w0
+	//APP
+	add w2, w2, w2
+	//NO_APP
+	//APP
+	add x0, x0, x0
+	//NO_APP
+	mov	 x0, x1
+	//APP
+	add x0, x0, x0
+	//NO_APP
+	mov	 x0, x1
+	//APP
+	add w0, w0, w0
+	//NO_APP
+	//APP
+	add x1, x1, x1
+	//NO_APP
+	//APP
+	add w0, wzr, wzr
+	//NO_APP
+	//APP
+	add x0, xzr, xzr
+	//NO_APP
+	ret
+.Ltmp3:
+	.size	test_inline_modifier_wx, .Ltmp3-test_inline_modifier_wx
+
+	.globl	test_inline_modifier_bhsdq
+	.type	test_inline_modifier_bhsdq,@function
+test_inline_modifier_bhsdq:             // @test_inline_modifier_bhsdq
+// BB#0:
+	//APP
+	ldr b0, [sp]
+	//NO_APP
+	//APP
+	ldr h0, [sp]
+	//NO_APP
+	//APP
+	ldr s0, [sp]
+	//NO_APP
+	//APP
+	ldr d0, [sp]
+	//NO_APP
+	//APP
+	ldr q0, [sp]
+	//NO_APP
+	//APP
+	ldr b0, [sp]
+	//NO_APP
+	//APP
+	ldr h0, [sp]
+	//NO_APP
+	//APP
+	ldr s0, [sp]
+	//NO_APP
+	//APP
+	ldr d0, [sp]
+	//NO_APP
+	//APP
+	ldr q0, [sp]
+	//NO_APP
+	ret
+.Ltmp4:
+	.size	test_inline_modifier_bhsdq, .Ltmp4-test_inline_modifier_bhsdq
+
+	.globl	test_inline_modifier_c
+	.type	test_inline_modifier_c,@function
+test_inline_modifier_c:                 // @test_inline_modifier_c
+// BB#0:
+	//APP
+	adr x0, 3
+	//NO_APP
+	ret
+.Ltmp5:
+	.size	test_inline_modifier_c, .Ltmp5-test_inline_modifier_c
+
+	.hidden	var_simple              // @var_simple
+	.type	var_simple,@object
+	.bss
+	.globl	var_simple
+	.align	2
+var_simple:
+	.word	0                       // 0x0
+	.size	var_simple, 4
+
+	.type	var_got,@object         // @var_got
+	.globl	var_got
+	.align	2
+var_got:
+	.word	0                       // 0x0
+	.size	var_got, 4
+
+	.type	var_tlsgd,@object       // @var_tlsgd
+	.section	.tbss,"awT",@nobits
+	.globl	var_tlsgd
+	.align	2
+var_tlsgd:
+	.word	0                       // 0x0
+	.size	var_tlsgd, 4
+
+	.type	var_tlsld,@object       // @var_tlsld
+	.globl	var_tlsld
+	.align	2
+var_tlsld:
+	.word	0                       // 0x0
+	.size	var_tlsld, 4
+
+	.type	var_tlsie,@object       // @var_tlsie
+	.globl	var_tlsie
+	.align	2
+var_tlsie:
+	.word	0                       // 0x0
+	.size	var_tlsie, 4
+
+	.type	var_tlsle,@object       // @var_tlsle
+	.globl	var_tlsle
+	.align	2
+var_tlsle:
+	.word	0                       // 0x0
+	.size	var_tlsle, 4
+
+
diff --git a/test/MC/AArch64/jump-table.s b/test/MC/AArch64/jump-table.s
new file mode 100644
index 0000000..578ebf4
--- /dev/null
+++ b/test/MC/AArch64/jump-table.s
@@ -0,0 +1,59 @@
+// RUN: llvm-mc < %s -triple=aarch64-none-linux-gnu -filetype=obj | llvm-readobj -r | FileCheck %s
+
+	.file	"<stdin>"
+	.text
+	.globl	test_jumptable
+	.type	test_jumptable,@function
+test_jumptable:                         // @test_jumptable
+	.cfi_startproc
+// BB#0:
+	ubfx	w1, w0, #0, #32
+	cmp w0, #4
+	b.hi .LBB0_3
+// BB#1:
+	adrp	x0, .LJTI0_0
+	add	x0, x0, #:lo12:.LJTI0_0
+	ldr	x0, [x0, x1, lsl #3]
+	br	x0
+.LBB0_2:                                // %lbl1
+	movz	x0, #1
+	ret
+.LBB0_3:                                // %def
+	mov	 x0, xzr
+	ret
+.LBB0_4:                                // %lbl2
+	movz	x0, #2
+	ret
+.LBB0_5:                                // %lbl3
+	movz	x0, #4
+	ret
+.LBB0_6:                                // %lbl4
+	movz	x0, #8
+	ret
+.Ltmp0:
+	.size	test_jumptable, .Ltmp0-test_jumptable
+	.cfi_endproc
+	.section	.rodata,"a",@progbits
+	.align	3
+.LJTI0_0:
+	.xword	.LBB0_2
+	.xword	.LBB0_4
+	.xword	.LBB0_5
+	.xword	.LBB0_3
+	.xword	.LBB0_6
+
+
+
+// First make sure we get a page/lo12 pair in .text to pick up the jump-table
+
+// CHECK:      Relocations [
+// CHECK:        Section ({{[0-9]+}}) .rela.text {
+// CHECK-NEXT:     0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 .rodata
+// CHECK-NEXT:     0x{{[0-9,A-F]+}} R_AARCH64_ADD_ABS_LO12_NC .rodata
+// CHECK:        }
+
+// Also check the targets in .rodata are relocated
+// CHECK:        Section ({{[0-9]+}}) .rela.rodata {
+// CHECK-NEXT:     0x{{[0-9,A-F]+}} R_AARCH64_ABS64 .text
+// CHECK:        }
+// CHECK:      ]
diff --git a/test/MC/AArch64/lit.local.cfg b/test/MC/AArch64/lit.local.cfg
index cc02173..75dba81 100644
--- a/test/MC/AArch64/lit.local.cfg
+++ b/test/MC/AArch64/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'AArch64' in targets:
     config.unsupported = True
 \ No newline at end of file
diff --git a/test/MC/AArch64/neon-2velem.s b/test/MC/AArch64/neon-2velem.s
new file mode 100644
index 0000000..cde792a
--- /dev/null
+++ b/test/MC/AArch64/neon-2velem.s
@@ -0,0 +1,271 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Instructions with 2 vectors and an element
+//------------------------------------------------------------------------------
+
+        mla v0.2s, v1.2s, v2.s[2]
+        mla v0.2s, v1.2s, v22.s[2]
+        mla v3.4s, v8.4s, v2.s[1]
+        mla v3.4s, v8.4s, v22.s[3]
+
+// CHECK: mla	v0.2s, v1.2s, v2.s[2]   // encoding: [0x20,0x08,0x82,0x2f]
+// CHECK: mla	v0.2s, v1.2s, v22.s[2]  // encoding: [0x20,0x08,0x96,0x2f]
+// CHECK: mla	v3.4s, v8.4s, v2.s[1]   // encoding: [0x03,0x01,0xa2,0x6f]
+// CHECK: mla	v3.4s, v8.4s, v22.s[3]  // encoding: [0x03,0x09,0xb6,0x6f]
+
+        mla v0.4h, v1.4h, v2.h[2]
+        mla v0.4h, v1.4h, v15.h[2]
+        mla v0.8h, v1.8h, v2.h[7]
+        mla v0.8h, v1.8h, v14.h[6]
+
+// CHECK: mla	v0.4h, v1.4h, v2.h[2]   // encoding: [0x20,0x00,0x62,0x2f]
+// CHECK: mla	v0.4h, v1.4h, v15.h[2]  // encoding: [0x20,0x00,0x6f,0x2f]
+// CHECK: mla	v0.8h, v1.8h, v2.h[7]   // encoding: [0x20,0x08,0x72,0x6f]
+// CHECK: mla	v0.8h, v1.8h, v14.h[6]  // encoding: [0x20,0x08,0x6e,0x6f]
+
+        mls v0.2s, v1.2s, v2.s[2]
+        mls v0.2s, v1.2s, v22.s[2]
+        mls v3.4s, v8.4s, v2.s[1]
+        mls v3.4s, v8.4s, v22.s[3]
+
+// CHECK: mls	v0.2s, v1.2s, v2.s[2]   // encoding: [0x20,0x48,0x82,0x2f]
+// CHECK: mls	v0.2s, v1.2s, v22.s[2]  // encoding: [0x20,0x48,0x96,0x2f]
+// CHECK: mls	v3.4s, v8.4s, v2.s[1]   // encoding: [0x03,0x41,0xa2,0x6f]
+// CHECK: mls	v3.4s, v8.4s, v22.s[3]  // encoding: [0x03,0x49,0xb6,0x6f]
+
+        mls v0.4h, v1.4h, v2.h[2]
+        mls v0.4h, v1.4h, v15.h[2]
+        mls v0.8h, v1.8h, v2.h[7]
+        mls v0.8h, v1.8h, v14.h[6]
+
+// CHECK: mls	v0.4h, v1.4h, v2.h[2]   // encoding: [0x20,0x40,0x62,0x2f]
+// CHECK: mls	v0.4h, v1.4h, v15.h[2]  // encoding: [0x20,0x40,0x6f,0x2f]
+// CHECK: mls	v0.8h, v1.8h, v2.h[7]   // encoding: [0x20,0x48,0x72,0x6f]
+// CHECK: mls	v0.8h, v1.8h, v14.h[6]  // encoding: [0x20,0x48,0x6e,0x6f]
+
+        fmla v0.2s, v1.2s, v2.s[2]
+        fmla v0.2s, v1.2s, v22.s[2]
+        fmla v3.4s, v8.4s, v2.s[1]
+        fmla v3.4s, v8.4s, v22.s[3]
+        fmla v0.2d, v1.2d, v2.d[1]
+        fmla v0.2d, v1.2d, v22.d[1]
+
+// CHECK: fmla	v0.2s, v1.2s, v2.s[2]   // encoding: [0x20,0x18,0x82,0x0f]
+// CHECK: fmla	v0.2s, v1.2s, v22.s[2]  // encoding: [0x20,0x18,0x96,0x0f]
+// CHECK: fmla	v3.4s, v8.4s, v2.s[1]   // encoding: [0x03,0x11,0xa2,0x4f]
+// CHECK: fmla	v3.4s, v8.4s, v22.s[3]  // encoding: [0x03,0x19,0xb6,0x4f]
+// CHECK: fmla	v0.2d, v1.2d, v2.d[1]   // encoding: [0x20,0x18,0xc2,0x4f]
+// CHECK: fmla	v0.2d, v1.2d, v22.d[1]  // encoding: [0x20,0x18,0xd6,0x4f]
+
+        fmls v0.2s, v1.2s, v2.s[2]
+        fmls v0.2s, v1.2s, v22.s[2]
+        fmls v3.4s, v8.4s, v2.s[1]
+        fmls v3.4s, v8.4s, v22.s[3]
+        fmls v0.2d, v1.2d, v2.d[1]
+        fmls v0.2d, v1.2d, v22.d[1]
+
+// CHECK: fmls	v0.2s, v1.2s, v2.s[2]   // encoding: [0x20,0x58,0x82,0x0f]
+// CHECK: fmls	v0.2s, v1.2s, v22.s[2]  // encoding: [0x20,0x58,0x96,0x0f]
+// CHECK: fmls	v3.4s, v8.4s, v2.s[1]   // encoding: [0x03,0x51,0xa2,0x4f]
+// CHECK: fmls	v3.4s, v8.4s, v22.s[3]  // encoding: [0x03,0x59,0xb6,0x4f]
+// CHECK: fmls	v0.2d, v1.2d, v2.d[1]   // encoding: [0x20,0x58,0xc2,0x4f]
+// CHECK: fmls	v0.2d, v1.2d, v22.d[1]  // encoding: [0x20,0x58,0xd6,0x4f]
+
+        smlal v0.4s, v1.4h, v2.h[2]
+        smlal v0.2d, v1.2s, v2.s[2]
+        smlal v0.2d, v1.2s, v22.s[2]
+        smlal2 v0.4s, v1.8h, v1.h[2]
+        smlal2 v0.2d, v1.4s, v1.s[2]
+        smlal2 v0.2d, v1.4s, v22.s[2]
+
+// CHECK: smlal	v0.4s, v1.4h, v2.h[2]   // encoding: [0x20,0x20,0x62,0x0f]
+// CHECK: smlal	v0.2d, v1.2s, v2.s[2]   // encoding: [0x20,0x28,0x82,0x0f]
+// CHECK: smlal	v0.2d, v1.2s, v22.s[2]  // encoding: [0x20,0x28,0x96,0x0f]
+// CHECK: smlal2	v0.4s, v1.8h, v1.h[2]   // encoding: [0x20,0x20,0x61,0x4f]
+// CHECK: smlal2	v0.2d, v1.4s, v1.s[2]   // encoding: [0x20,0x28,0x81,0x4f]
+// CHECK: smlal2	v0.2d, v1.4s, v22.s[2]  // encoding: [0x20,0x28,0x96,0x4f]
+
+        smlsl v0.4s, v1.4h, v2.h[2]
+        smlsl v0.2d, v1.2s, v2.s[2]
+        smlsl v0.2d, v1.2s, v22.s[2]
+        smlsl2 v0.4s, v1.8h, v1.h[2]
+        smlsl2 v0.2d, v1.4s, v1.s[2]
+        smlsl2 v0.2d, v1.4s, v22.s[2]
+
+// CHECK: smlsl	v0.4s, v1.4h, v2.h[2]   // encoding: [0x20,0x60,0x62,0x0f]
+// CHECK: smlsl	v0.2d, v1.2s, v2.s[2]   // encoding: [0x20,0x68,0x82,0x0f]
+// CHECK: smlsl	v0.2d, v1.2s, v22.s[2]  // encoding: [0x20,0x68,0x96,0x0f]
+// CHECK: smlsl2	v0.4s, v1.8h, v1.h[2]   // encoding: [0x20,0x60,0x61,0x4f]
+// CHECK: smlsl2	v0.2d, v1.4s, v1.s[2]   // encoding: [0x20,0x68,0x81,0x4f]
+// CHECK: smlsl2	v0.2d, v1.4s, v22.s[2]  // encoding: [0x20,0x68,0x96,0x4f]
+
+        sqdmlal v0.4s, v1.4h, v2.h[2]
+        sqdmlal v0.2d, v1.2s, v2.s[2]
+        sqdmlal v0.2d, v1.2s, v22.s[2]
+        sqdmlal2 v0.4s, v1.8h, v1.h[2]
+        sqdmlal2 v0.2d, v1.4s, v1.s[2]
+        sqdmlal2 v0.2d, v1.4s, v22.s[2]
+
+// CHECK: sqdmlal	v0.4s, v1.4h, v2.h[2]   // encoding: [0x20,0x30,0x62,0x0f]
+// CHECK: sqdmlal	v0.2d, v1.2s, v2.s[2]   // encoding: [0x20,0x38,0x82,0x0f]
+// CHECK: sqdmlal	v0.2d, v1.2s, v22.s[2]  // encoding: [0x20,0x38,0x96,0x0f]
+// CHECK: sqdmlal2	v0.4s, v1.8h, v1.h[2] // encoding: [0x20,0x30,0x61,0x4f]
+// CHECK: sqdmlal2	v0.2d, v1.4s, v1.s[2] // encoding: [0x20,0x38,0x81,0x4f]
+// CHECK: sqdmlal2	v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0x38,0x96,0x4f]
+
+        umlal v0.4s, v1.4h, v2.h[2]
+        umlal v0.2d, v1.2s, v2.s[2]
+        umlal v0.2d, v1.2s, v22.s[2]
+        umlal2 v0.4s, v1.8h, v1.h[2]
+        umlal2 v0.2d, v1.4s, v1.s[2]
+        umlal2 v0.2d, v1.4s, v22.s[2]
+
+// CHECK: umlal	v0.4s, v1.4h, v2.h[2]   // encoding: [0x20,0x20,0x62,0x2f]
+// CHECK: umlal	v0.2d, v1.2s, v2.s[2]   // encoding: [0x20,0x28,0x82,0x2f]
+// CHECK: umlal	v0.2d, v1.2s, v22.s[2]  // encoding: [0x20,0x28,0x96,0x2f]
+// CHECK: umlal2	v0.4s, v1.8h, v1.h[2]   // encoding: [0x20,0x20,0x61,0x6f]
+// CHECK: umlal2	v0.2d, v1.4s, v1.s[2]   // encoding: [0x20,0x28,0x81,0x6f]
+// CHECK: umlal2	v0.2d, v1.4s, v22.s[2]  // encoding: [0x20,0x28,0x96,0x6f]
+
+        umlsl v0.4s, v1.4h, v2.h[2]
+        umlsl v0.2d, v1.2s, v2.s[2]
+        umlsl v0.2d, v1.2s, v22.s[2]
+        umlsl2 v0.4s, v1.8h, v1.h[2]
+        umlsl2 v0.2d, v1.4s, v1.s[2]
+        umlsl2 v0.2d, v1.4s, v22.s[2]
+
+// CHECK: umlsl	v0.4s, v1.4h, v2.h[2]   // encoding: [0x20,0x60,0x62,0x2f]
+// CHECK: umlsl	v0.2d, v1.2s, v2.s[2]   // encoding: [0x20,0x68,0x82,0x2f]
+// CHECK: umlsl	v0.2d, v1.2s, v22.s[2]  // encoding: [0x20,0x68,0x96,0x2f]
+// CHECK: umlsl2	v0.4s, v1.8h, v1.h[2]   // encoding: [0x20,0x60,0x61,0x6f]
+// CHECK: umlsl2	v0.2d, v1.4s, v1.s[2]   // encoding: [0x20,0x68,0x81,0x6f]
+// CHECK: umlsl2	v0.2d, v1.4s, v22.s[2]  // encoding: [0x20,0x68,0x96,0x6f]
+
+        sqdmlsl v0.4s, v1.4h, v2.h[2]
+        sqdmlsl v0.2d, v1.2s, v2.s[2]
+        sqdmlsl v0.2d, v1.2s, v22.s[2]
+        sqdmlsl2 v0.4s, v1.8h, v1.h[2]
+        sqdmlsl2 v0.2d, v1.4s, v1.s[2]
+        sqdmlsl2 v0.2d, v1.4s, v22.s[2]
+
+// CHECK: sqdmlsl	v0.4s, v1.4h, v2.h[2]   // encoding: [0x20,0x70,0x62,0x0f]
+// CHECK: sqdmlsl	v0.2d, v1.2s, v2.s[2]   // encoding: [0x20,0x78,0x82,0x0f]
+// CHECK: sqdmlsl	v0.2d, v1.2s, v22.s[2]  // encoding: [0x20,0x78,0x96,0x0f]
+// CHECK: sqdmlsl2	v0.4s, v1.8h, v1.h[2] // encoding: [0x20,0x70,0x61,0x4f]
+// CHECK: sqdmlsl2	v0.2d, v1.4s, v1.s[2] // encoding: [0x20,0x78,0x81,0x4f]
+// CHECK: sqdmlsl2	v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0x78,0x96,0x4f]
+
+        mul v0.4h, v1.4h, v2.h[2]
+        mul v0.8h, v1.8h, v2.h[2]
+        mul v0.2s, v1.2s, v2.s[2]
+        mul v0.2s, v1.2s, v22.s[2]
+        mul v0.4s, v1.4s, v2.s[2]
+        mul v0.4s, v1.4s, v22.s[2]
+
+// CHECK: mul	v0.4h, v1.4h, v2.h[2]   // encoding: [0x20,0x80,0x62,0x0f]
+// CHECK: mul	v0.8h, v1.8h, v2.h[2]   // encoding: [0x20,0x80,0x62,0x4f]
+// CHECK: mul	v0.2s, v1.2s, v2.s[2]   // encoding: [0x20,0x88,0x82,0x0f]
+// CHECK: mul	v0.2s, v1.2s, v22.s[2]  // encoding: [0x20,0x88,0x96,0x0f]
+// CHECK: mul	v0.4s, v1.4s, v2.s[2]   // encoding: [0x20,0x88,0x82,0x4f]
+// CHECK: mul	v0.4s, v1.4s, v22.s[2]  // encoding: [0x20,0x88,0x96,0x4f]
+
+        fmul v0.2s, v1.2s, v2.s[2]
+        fmul v0.2s, v1.2s, v22.s[2]
+        fmul v0.4s, v1.4s, v2.s[2]
+        fmul v0.4s, v1.4s, v22.s[2]
+        fmul v0.2d, v1.2d, v2.d[1]
+        fmul v0.2d, v1.2d, v22.d[1]
+
+// CHECK: fmul	v0.2s, v1.2s, v2.s[2]   // encoding: [0x20,0x98,0x82,0x0f]
+// CHECK: fmul	v0.2s, v1.2s, v22.s[2]  // encoding: [0x20,0x98,0x96,0x0f]
+// CHECK: fmul	v0.4s, v1.4s, v2.s[2]   // encoding: [0x20,0x98,0x82,0x4f]
+// CHECK: fmul	v0.4s, v1.4s, v22.s[2]  // encoding: [0x20,0x98,0x96,0x4f]
+// CHECK: fmul	v0.2d, v1.2d, v2.d[1]   // encoding: [0x20,0x98,0xc2,0x4f]
+// CHECK: fmul	v0.2d, v1.2d, v22.d[1]  // encoding: [0x20,0x98,0xd6,0x4f]
+
+        fmulx v0.2s, v1.2s, v2.s[2]
+        fmulx v0.2s, v1.2s, v22.s[2]
+        fmulx v0.4s, v1.4s, v2.s[2]
+        fmulx v0.4s, v1.4s, v22.s[2]
+        fmulx v0.2d, v1.2d, v2.d[1]
+        fmulx v0.2d, v1.2d, v22.d[1]
+
+// CHECK: fmulx	v0.2s, v1.2s, v2.s[2]   // encoding: [0x20,0x98,0x82,0x2f]
+// CHECK: fmulx	v0.2s, v1.2s, v22.s[2]  // encoding: [0x20,0x98,0x96,0x2f]
+// CHECK: fmulx	v0.4s, v1.4s, v2.s[2]   // encoding: [0x20,0x98,0x82,0x6f]
+// CHECK: fmulx	v0.4s, v1.4s, v22.s[2]  // encoding: [0x20,0x98,0x96,0x6f]
+// CHECK: fmulx	v0.2d, v1.2d, v2.d[1]   // encoding: [0x20,0x98,0xc2,0x6f]
+// CHECK: fmulx	v0.2d, v1.2d, v22.d[1]  // encoding: [0x20,0x98,0xd6,0x6f]
+
+        smull v0.4s, v1.4h, v2.h[2]
+        smull v0.2d, v1.2s, v2.s[2]
+        smull v0.2d, v1.2s, v22.s[2]
+        smull2 v0.4s, v1.8h, v2.h[2]
+        smull2 v0.2d, v1.4s, v2.s[2]
+        smull2 v0.2d, v1.4s, v22.s[2]
+
+// CHECK: smull	v0.4s, v1.4h, v2.h[2]   // encoding: [0x20,0xa0,0x62,0x0f]
+// CHECK: smull	v0.2d, v1.2s, v2.s[2]   // encoding: [0x20,0xa8,0x82,0x0f]
+// CHECK: smull	v0.2d, v1.2s, v22.s[2]  // encoding: [0x20,0xa8,0x96,0x0f]
+// CHECK: smull2	v0.4s, v1.8h, v2.h[2]   // encoding: [0x20,0xa0,0x62,0x4f]
+// CHECK: smull2	v0.2d, v1.4s, v2.s[2]   // encoding: [0x20,0xa8,0x82,0x4f]
+// CHECK: smull2	v0.2d, v1.4s, v22.s[2]  // encoding: [0x20,0xa8,0x96,0x4f]
+
+        umull v0.4s, v1.4h, v2.h[2]
+        umull v0.2d, v1.2s, v2.s[2]
+        umull v0.2d, v1.2s, v22.s[2]
+        umull2 v0.4s, v1.8h, v2.h[2]
+        umull2 v0.2d, v1.4s, v2.s[2]
+        umull2 v0.2d, v1.4s, v22.s[2]
+
+// CHECK: umull	v0.4s, v1.4h, v2.h[2]   // encoding: [0x20,0xa0,0x62,0x2f]
+// CHECK: umull	v0.2d, v1.2s, v2.s[2]   // encoding: [0x20,0xa8,0x82,0x2f]
+// CHECK: umull	v0.2d, v1.2s, v22.s[2]  // encoding: [0x20,0xa8,0x96,0x2f]
+// CHECK: umull2	v0.4s, v1.8h, v2.h[2]   // encoding: [0x20,0xa0,0x62,0x6f]
+// CHECK: umull2	v0.2d, v1.4s, v2.s[2]   // encoding: [0x20,0xa8,0x82,0x6f]
+// CHECK: umull2	v0.2d, v1.4s, v22.s[2]  // encoding: [0x20,0xa8,0x96,0x6f]
+
+        sqdmull v0.4s, v1.4h, v2.h[2]
+        sqdmull v0.2d, v1.2s, v2.s[2]
+        sqdmull v0.2d, v1.2s, v22.s[2]
+        sqdmull2 v0.4s, v1.8h, v2.h[2]
+        sqdmull2 v0.2d, v1.4s, v2.s[2]
+        sqdmull2 v0.2d, v1.4s, v22.s[2]
+
+// CHECK: sqdmull	v0.4s, v1.4h, v2.h[2]   // encoding: [0x20,0xb0,0x62,0x0f]
+// CHECK: sqdmull	v0.2d, v1.2s, v2.s[2]   // encoding: [0x20,0xb8,0x82,0x0f]
+// CHECK: sqdmull	v0.2d, v1.2s, v22.s[2]  // encoding: [0x20,0xb8,0x96,0x0f]
+// CHECK: sqdmull2	v0.4s, v1.8h, v2.h[2] // encoding: [0x20,0xb0,0x62,0x4f]
+// CHECK: sqdmull2	v0.2d, v1.4s, v2.s[2] // encoding: [0x20,0xb8,0x82,0x4f]
+// CHECK: sqdmull2	v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0xb8,0x96,0x4f]
+
+        sqdmulh v0.4h, v1.4h, v2.h[2]
+        sqdmulh v0.8h, v1.8h, v2.h[2]
+        sqdmulh v0.2s, v1.2s, v2.s[2]
+        sqdmulh v0.2s, v1.2s, v22.s[2]
+        sqdmulh v0.4s, v1.4s, v2.s[2]
+        sqdmulh v0.4s, v1.4s, v22.s[2]
+
+// CHECK: sqdmulh	v0.4h, v1.4h, v2.h[2]   // encoding: [0x20,0xc0,0x62,0x0f]
+// CHECK: sqdmulh	v0.8h, v1.8h, v2.h[2]   // encoding: [0x20,0xc0,0x62,0x4f]
+// CHECK: sqdmulh	v0.2s, v1.2s, v2.s[2]   // encoding: [0x20,0xc8,0x82,0x0f]
+// CHECK: sqdmulh	v0.2s, v1.2s, v22.s[2]  // encoding: [0x20,0xc8,0x96,0x0f]
+// CHECK: sqdmulh	v0.4s, v1.4s, v2.s[2]   // encoding: [0x20,0xc8,0x82,0x4f]
+// CHECK: sqdmulh	v0.4s, v1.4s, v22.s[2]  // encoding: [0x20,0xc8,0x96,0x4f]
+
+        sqrdmulh v0.4h, v1.4h, v2.h[2]
+        sqrdmulh v0.8h, v1.8h, v2.h[2]
+        sqrdmulh v0.2s, v1.2s, v2.s[2]
+        sqrdmulh v0.2s, v1.2s, v22.s[2]
+        sqrdmulh v0.4s, v1.4s, v2.s[2]
+        sqrdmulh v0.4s, v1.4s, v22.s[2]
+
+// CHECK: sqrdmulh	v0.4h, v1.4h, v2.h[2] // encoding: [0x20,0xd0,0x62,0x0f]
+// CHECK: sqrdmulh	v0.8h, v1.8h, v2.h[2] // encoding: [0x20,0xd0,0x62,0x4f]
+// CHECK: sqrdmulh	v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0xd8,0x82,0x0f]
+// CHECK: sqrdmulh	v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0xd8,0x96,0x0f]
+// CHECK: sqrdmulh	v0.4s, v1.4s, v2.s[2] // encoding: [0x20,0xd8,0x82,0x4f]
+// CHECK: sqrdmulh	v0.4s, v1.4s, v22.s[2] // encoding: [0x20,0xd8,0x96,0x4f]
diff --git a/test/MC/AArch64/neon-3vdiff.s b/test/MC/AArch64/neon-3vdiff.s
new file mode 100644
index 0000000..3ff86bf
--- /dev/null
+++ b/test/MC/AArch64/neon-3vdiff.s
@@ -0,0 +1,415 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Instructions with 3 different vector data types
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Long
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Long - Variant 1
+//------------------------------------------------------------------------------
+
+        saddl v0.8h, v1.8b, v2.8b
+        saddl v0.4s, v1.4h, v2.4h
+        saddl v0.2d, v1.2s, v2.2s
+
+// CHECK: saddl	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0x00,0x22,0x0e]
+// CHECK: saddl	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x00,0x62,0x0e]
+// CHECK: saddl	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x00,0xa2,0x0e]
+
+        saddl2 v0.4s, v1.8h, v2.8h
+        saddl2 v0.8h, v1.16b, v2.16b
+        saddl2 v0.2d, v1.4s, v2.4s
+
+// CHECK: saddl2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0x00,0x62,0x4e]
+// CHECK: saddl2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0x00,0x22,0x4e]
+// CHECK: saddl2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0x00,0xa2,0x4e]
+
+        uaddl v0.8h, v1.8b, v2.8b
+        uaddl v0.4s, v1.4h, v2.4h
+        uaddl v0.2d, v1.2s, v2.2s
+
+// CHECK: uaddl	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0x00,0x22,0x2e]
+// CHECK: uaddl	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x00,0x62,0x2e]
+// CHECK: uaddl	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x00,0xa2,0x2e]
+
+        uaddl2 v0.8h, v1.16b, v2.16b
+        uaddl2 v0.4s, v1.8h, v2.8h
+        uaddl2 v0.2d, v1.4s, v2.4s
+
+// CHECK: uaddl2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0x00,0x22,0x6e]
+// CHECK: uaddl2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0x00,0x62,0x6e]
+// CHECK: uaddl2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0x00,0xa2,0x6e]
+
+        ssubl v0.8h, v1.8b, v2.8b
+        ssubl v0.4s, v1.4h, v2.4h
+        ssubl v0.2d, v1.2s, v2.2s
+
+// CHECK: ssubl	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0x20,0x22,0x0e]
+// CHECK: ssubl	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x20,0x62,0x0e]
+// CHECK: ssubl	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x20,0xa2,0x0e]
+
+        ssubl2 v0.8h, v1.16b, v2.16b
+        ssubl2 v0.4s, v1.8h, v2.8h
+        ssubl2 v0.2d, v1.4s, v2.4s
+
+// CHECK: ssubl2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0x20,0x22,0x4e]
+// CHECK: ssubl2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0x20,0x62,0x4e]
+// CHECK: ssubl2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0x20,0xa2,0x4e]
+
+        usubl v0.8h, v1.8b, v2.8b
+        usubl v0.4s, v1.4h, v2.4h
+        usubl v0.2d, v1.2s, v2.2s
+
+// CHECK: usubl	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0x20,0x22,0x2e]
+// CHECK: usubl	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x20,0x62,0x2e]
+// CHECK: usubl	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x20,0xa2,0x2e]
+
+        usubl2 v0.8h, v1.16b, v2.16b
+        usubl2 v0.4s, v1.8h, v2.8h
+        usubl2 v0.2d, v1.4s, v2.4s
+
+// CHECK: usubl2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0x20,0x22,0x6e]
+// CHECK: usubl2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0x20,0x62,0x6e]
+// CHECK: usubl2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0x20,0xa2,0x6e]
+
+        sabal v0.8h, v1.8b, v2.8b
+        sabal v0.4s, v1.4h, v2.4h
+        sabal v0.2d, v1.2s, v2.2s
+
+// CHECK: sabal	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0x50,0x22,0x0e]
+// CHECK: sabal	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x50,0x62,0x0e]
+// CHECK: sabal	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x50,0xa2,0x0e]
+
+        sabal2 v0.8h, v1.16b, v2.16b
+        sabal2 v0.4s, v1.8h, v2.8h
+        sabal2 v0.2d, v1.4s, v2.4s
+
+// CHECK: sabal2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0x50,0x22,0x4e]
+// CHECK: sabal2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0x50,0x62,0x4e]
+// CHECK: sabal2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0x50,0xa2,0x4e]
+
+        uabal v0.8h, v1.8b, v2.8b
+        uabal v0.4s, v1.4h, v2.4h
+        uabal v0.2d, v1.2s, v2.2s
+
+// CHECK: uabal	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0x50,0x22,0x2e]
+// CHECK: uabal	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x50,0x62,0x2e]
+// CHECK: uabal	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x50,0xa2,0x2e]
+
+        uabal2 v0.8h, v1.16b, v2.16b
+        uabal2 v0.4s, v1.8h, v2.8h
+        uabal2 v0.2d, v1.4s, v2.4s
+
+// CHECK: uabal2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0x50,0x22,0x6e]
+// CHECK: uabal2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0x50,0x62,0x6e]
+// CHECK: uabal2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0x50,0xa2,0x6e]
+
+        sabdl v0.8h, v1.8b, v2.8b
+        sabdl v0.4s, v1.4h, v2.4h
+        sabdl v0.2d, v1.2s, v2.2s
+
+// CHECK: sabdl	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0x70,0x22,0x0e]
+// CHECK: sabdl	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x70,0x62,0x0e]
+// CHECK: sabdl	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x70,0xa2,0x0e]
+
+        sabdl2 v0.8h, v1.16b, v2.16b
+        sabdl2 v0.4s, v1.8h, v2.8h
+        sabdl2 v0.2d, v1.4s, v2.4s
+
+// CHECK: sabdl2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0x70,0x22,0x4e]
+// CHECK: sabdl2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0x70,0x62,0x4e]
+// CHECK: sabdl2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0x70,0xa2,0x4e]
+
+        uabdl v0.8h, v1.8b, v2.8b
+        uabdl v0.4s, v1.4h, v2.4h
+        uabdl v0.2d, v1.2s, v2.2s
+
+// CHECK: uabdl	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0x70,0x22,0x2e]
+// CHECK: uabdl	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x70,0x62,0x2e]
+// CHECK: uabdl	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x70,0xa2,0x2e]
+
+        uabdl2 v0.8h, v1.16b, v2.16b
+        uabdl2 v0.4s, v1.8h, v2.8h
+        uabdl2 v0.2d, v1.4s, v2.4s
+
+// CHECK: uabdl2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0x70,0x22,0x6e]
+// CHECK: uabdl2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0x70,0x62,0x6e]
+// CHECK: uabdl2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0x70,0xa2,0x6e]
+
+        smlal v0.8h, v1.8b, v2.8b
+        smlal v0.4s, v1.4h, v2.4h
+        smlal v0.2d, v1.2s, v2.2s
+
+// CHECK: smlal	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0x80,0x22,0x0e]
+// CHECK: smlal	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x80,0x62,0x0e]
+// CHECK: smlal	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x80,0xa2,0x0e]
+
+        smlal2 v0.8h, v1.16b, v2.16b
+        smlal2 v0.4s, v1.8h, v2.8h
+        smlal2 v0.2d, v1.4s, v2.4s
+
+// CHECK: smlal2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0x80,0x22,0x4e]
+// CHECK: smlal2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0x80,0x62,0x4e]
+// CHECK: smlal2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0x80,0xa2,0x4e]
+
+        umlal v0.8h, v1.8b, v2.8b
+        umlal v0.4s, v1.4h, v2.4h
+        umlal v0.2d, v1.2s, v2.2s
+
+// CHECK: umlal	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0x80,0x22,0x2e]
+// CHECK: umlal	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x80,0x62,0x2e]
+// CHECK: umlal	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x80,0xa2,0x2e]
+
+        umlal2 v0.8h, v1.16b, v2.16b
+        umlal2 v0.4s, v1.8h, v2.8h
+        umlal2 v0.2d, v1.4s, v2.4s
+
+// CHECK: umlal2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0x80,0x22,0x6e]
+// CHECK: umlal2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0x80,0x62,0x6e]
+// CHECK: umlal2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0x80,0xa2,0x6e]
+
+        smlsl v0.8h, v1.8b, v2.8b
+        smlsl v0.4s, v1.4h, v2.4h
+        smlsl v0.2d, v1.2s, v2.2s
+
+// CHECK: smlsl	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0xa0,0x22,0x0e]
+// CHECK: smlsl	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0xa0,0x62,0x0e]
+// CHECK: smlsl	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0xa0,0xa2,0x0e]
+
+        smlsl2 v0.8h, v1.16b, v2.16b
+        smlsl2 v0.4s, v1.8h, v2.8h
+        smlsl2 v0.2d, v1.4s, v2.4s
+
+// CHECK: smlsl2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0xa0,0x22,0x4e]
+// CHECK: smlsl2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0xa0,0x62,0x4e]
+// CHECK: smlsl2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0xa0,0xa2,0x4e]
+
+        umlsl v0.8h, v1.8b, v2.8b
+        umlsl v0.4s, v1.4h, v2.4h
+        umlsl v0.2d, v1.2s, v2.2s
+
+// CHECK: umlsl	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0xa0,0x22,0x2e]
+// CHECK: umlsl	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0xa0,0x62,0x2e]
+// CHECK: umlsl	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0xa0,0xa2,0x2e]
+
+        umlsl2 v0.8h, v1.16b, v2.16b
+        umlsl2 v0.4s, v1.8h, v2.8h
+        umlsl2 v0.2d, v1.4s, v2.4s
+
+// CHECK: umlsl2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0xa0,0x22,0x6e]
+// CHECK: umlsl2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0xa0,0x62,0x6e]
+// CHECK: umlsl2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0xa0,0xa2,0x6e]
+
+        smull v0.8h, v1.8b, v2.8b
+        smull v0.4s, v1.4h, v2.4h
+        smull v0.2d, v1.2s, v2.2s
+
+// CHECK: smull	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0xc0,0x22,0x0e]
+// CHECK: smull	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0xc0,0x62,0x0e]
+// CHECK: smull	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0xc0,0xa2,0x0e]
+
+        smull2 v0.8h, v1.16b, v2.16b
+        smull2 v0.4s, v1.8h, v2.8h
+        smull2 v0.2d, v1.4s, v2.4s
+
+// CHECK: smull2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0xc0,0x22,0x4e]
+// CHECK: smull2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0xc0,0x62,0x4e]
+// CHECK: smull2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0xc0,0xa2,0x4e]
+
+        umull v0.8h, v1.8b, v2.8b
+        umull v0.4s, v1.4h, v2.4h
+        umull v0.2d, v1.2s, v2.2s
+
+// CHECK: umull	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0xc0,0x22,0x2e]
+// CHECK: umull	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0xc0,0x62,0x2e]
+// CHECK: umull	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0xc0,0xa2,0x2e]
+
+        umull2 v0.8h, v1.16b, v2.16b
+        umull2 v0.4s, v1.8h, v2.8h
+        umull2 v0.2d, v1.4s, v2.4s
+
+// CHECK: umull2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0xc0,0x22,0x6e]
+// CHECK: umull2	v0.4s, v1.8h, v2.8h     // encoding: [0x20,0xc0,0x62,0x6e]
+// CHECK: umull2	v0.2d, v1.4s, v2.4s     // encoding: [0x20,0xc0,0xa2,0x6e]
+
+//------------------------------------------------------------------------------
+// Long - Variant 2
+//------------------------------------------------------------------------------
+
+        sqdmlal v0.4s, v1.4h, v2.4h
+        sqdmlal v0.2d, v1.2s, v2.2s
+
+// CHECK: sqdmlal	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0x90,0x62,0x0e]
+// CHECK: sqdmlal	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0x90,0xa2,0x0e]
+
+        sqdmlal2 v0.4s, v1.8h, v2.8h
+        sqdmlal2 v0.2d, v1.4s, v2.4s
+
+// CHECK: sqdmlal2	v0.4s, v1.8h, v2.8h // encoding: [0x20,0x90,0x62,0x4e]
+// CHECK: sqdmlal2	v0.2d, v1.4s, v2.4s // encoding: [0x20,0x90,0xa2,0x4e]
+
+        sqdmlsl v0.4s, v1.4h, v2.4h
+        sqdmlsl v0.2d, v1.2s, v2.2s
+
+// CHECK: sqdmlsl	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0xb0,0x62,0x0e]
+// CHECK: sqdmlsl	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0xb0,0xa2,0x0e]
+
+        sqdmlsl2 v0.4s, v1.8h, v2.8h
+        sqdmlsl2 v0.2d, v1.4s, v2.4s
+
+// CHECK: sqdmlsl2	v0.4s, v1.8h, v2.8h // encoding: [0x20,0xb0,0x62,0x4e]
+// CHECK: sqdmlsl2	v0.2d, v1.4s, v2.4s // encoding: [0x20,0xb0,0xa2,0x4e]
+
+        sqdmull v0.4s, v1.4h, v2.4h
+        sqdmull v0.2d, v1.2s, v2.2s
+
+// CHECK: sqdmull	v0.4s, v1.4h, v2.4h     // encoding: [0x20,0xd0,0x62,0x0e]
+// CHECK: sqdmull	v0.2d, v1.2s, v2.2s     // encoding: [0x20,0xd0,0xa2,0x0e]
+
+        sqdmull2 v0.4s, v1.8h, v2.8h
+        sqdmull2 v0.2d, v1.4s, v2.4s
+
+// CHECK: sqdmull2	v0.4s, v1.8h, v2.8h // encoding: [0x20,0xd0,0x62,0x4e]
+// CHECK: sqdmull2	v0.2d, v1.4s, v2.4s // encoding: [0x20,0xd0,0xa2,0x4e]
+
+//------------------------------------------------------------------------------
+// Long - Variant 3
+//------------------------------------------------------------------------------
+
+        pmull v0.8h, v1.8b, v2.8b
+        pmull v0.1q, v1.1d, v2.1d
+
+// CHECK: pmull	v0.8h, v1.8b, v2.8b     // encoding: [0x20,0xe0,0x22,0x0e]
+// CHECK: pmull	v0.1q, v1.1d, v2.1d     // encoding: [0x20,0xe0,0xe2,0x0e]
+
+        pmull2 v0.8h, v1.16b, v2.16b
+        pmull2 v0.1q, v1.2d, v2.2d
+
+// CHECK: pmull2	v0.8h, v1.16b, v2.16b   // encoding: [0x20,0xe0,0x22,0x4e]
+// CHECK: pmull2	v0.1q, v1.2d, v2.2d     // encoding: [0x20,0xe0,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Widen
+//------------------------------------------------------------------------------
+
+        saddw v0.8h, v1.8h, v2.8b
+        saddw v0.4s, v1.4s, v2.4h
+        saddw v0.2d, v1.2d, v2.2s
+
+// CHECK: saddw	v0.8h, v1.8h, v2.8b     // encoding: [0x20,0x10,0x22,0x0e]
+// CHECK: saddw	v0.4s, v1.4s, v2.4h     // encoding: [0x20,0x10,0x62,0x0e]
+// CHECK: saddw	v0.2d, v1.2d, v2.2s     // encoding: [0x20,0x10,0xa2,0x0e]
+
+        saddw2 v0.8h, v1.8h, v2.16b
+        saddw2 v0.4s, v1.4s, v2.8h
+        saddw2 v0.2d, v1.2d, v2.4s
+
+// CHECK: saddw2	v0.8h, v1.8h, v2.16b    // encoding: [0x20,0x10,0x22,0x4e]
+// CHECK: saddw2	v0.4s, v1.4s, v2.8h     // encoding: [0x20,0x10,0x62,0x4e]
+// CHECK: saddw2	v0.2d, v1.2d, v2.4s     // encoding: [0x20,0x10,0xa2,0x4e]
+
+        uaddw v0.8h, v1.8h, v2.8b
+        uaddw v0.4s, v1.4s, v2.4h
+        uaddw v0.2d, v1.2d, v2.2s
+
+// CHECK: uaddw	v0.8h, v1.8h, v2.8b     // encoding: [0x20,0x10,0x22,0x2e]
+// CHECK: uaddw	v0.4s, v1.4s, v2.4h     // encoding: [0x20,0x10,0x62,0x2e]
+// CHECK: uaddw	v0.2d, v1.2d, v2.2s     // encoding: [0x20,0x10,0xa2,0x2e]
+
+        uaddw2 v0.8h, v1.8h, v2.16b
+        uaddw2 v0.4s, v1.4s, v2.8h
+        uaddw2 v0.2d, v1.2d, v2.4s
+
+// CHECK: uaddw2	v0.8h, v1.8h, v2.16b    // encoding: [0x20,0x10,0x22,0x6e]
+// CHECK: uaddw2	v0.4s, v1.4s, v2.8h     // encoding: [0x20,0x10,0x62,0x6e]
+// CHECK: uaddw2	v0.2d, v1.2d, v2.4s     // encoding: [0x20,0x10,0xa2,0x6e]
+
+        ssubw v0.8h, v1.8h, v2.8b
+        ssubw v0.4s, v1.4s, v2.4h
+        ssubw v0.2d, v1.2d, v2.2s
+
+// CHECK: ssubw	v0.8h, v1.8h, v2.8b     // encoding: [0x20,0x30,0x22,0x0e]
+// CHECK: ssubw	v0.4s, v1.4s, v2.4h     // encoding: [0x20,0x30,0x62,0x0e]
+// CHECK: ssubw	v0.2d, v1.2d, v2.2s     // encoding: [0x20,0x30,0xa2,0x0e]
+
+        ssubw2 v0.8h, v1.8h, v2.16b
+        ssubw2 v0.4s, v1.4s, v2.8h
+        ssubw2 v0.2d, v1.2d, v2.4s
+
+// CHECK: ssubw2	v0.8h, v1.8h, v2.16b    // encoding: [0x20,0x30,0x22,0x4e]
+// CHECK: ssubw2	v0.4s, v1.4s, v2.8h     // encoding: [0x20,0x30,0x62,0x4e]
+// CHECK: ssubw2	v0.2d, v1.2d, v2.4s     // encoding: [0x20,0x30,0xa2,0x4e]
+
+        usubw v0.8h, v1.8h, v2.8b
+        usubw v0.4s, v1.4s, v2.4h
+        usubw v0.2d, v1.2d, v2.2s
+
+// CHECK: usubw	v0.8h, v1.8h, v2.8b     // encoding: [0x20,0x30,0x22,0x2e]
+// CHECK: usubw	v0.4s, v1.4s, v2.4h     // encoding: [0x20,0x30,0x62,0x2e]
+// CHECK: usubw	v0.2d, v1.2d, v2.2s     // encoding: [0x20,0x30,0xa2,0x2e]
+
+        usubw2 v0.8h, v1.8h, v2.16b
+        usubw2 v0.4s, v1.4s, v2.8h
+        usubw2 v0.2d, v1.2d, v2.4s
+
+// CHECK: usubw2	v0.8h, v1.8h, v2.16b    // encoding: [0x20,0x30,0x22,0x6e]
+// CHECK: usubw2	v0.4s, v1.4s, v2.8h     // encoding: [0x20,0x30,0x62,0x6e]
+// CHECK: usubw2	v0.2d, v1.2d, v2.4s     // encoding: [0x20,0x30,0xa2,0x6e]
+
+//------------------------------------------------------------------------------
+// Narrow
+//------------------------------------------------------------------------------
+
+        addhn v0.8b, v1.8h, v2.8h
+        addhn v0.4h, v1.4s, v2.4s
+        addhn v0.2s, v1.2d, v2.2d
+
+// CHECK: addhn	v0.8b, v1.8h, v2.8h     // encoding: [0x20,0x40,0x22,0x0e]
+// CHECK: addhn	v0.4h, v1.4s, v2.4s     // encoding: [0x20,0x40,0x62,0x0e]
+// CHECK: addhn	v0.2s, v1.2d, v2.2d     // encoding: [0x20,0x40,0xa2,0x0e]
+
+        addhn2 v0.16b, v1.8h, v2.8h
+        addhn2 v0.8h, v1.4s, v2.4s
+        addhn2 v0.4s, v1.2d, v2.2d
+
+// CHECK: addhn2	v0.16b, v1.8h, v2.8h    // encoding: [0x20,0x40,0x22,0x4e]
+// CHECK: addhn2	v0.8h, v1.4s, v2.4s     // encoding: [0x20,0x40,0x62,0x4e]
+// CHECK: addhn2	v0.4s, v1.2d, v2.2d     // encoding: [0x20,0x40,0xa2,0x4e]
+
+        raddhn v0.8b, v1.8h, v2.8h
+        raddhn v0.4h, v1.4s, v2.4s
+        raddhn v0.2s, v1.2d, v2.2d
+
+// CHECK: raddhn	v0.8b, v1.8h, v2.8h     // encoding: [0x20,0x40,0x22,0x2e]
+// CHECK: raddhn	v0.4h, v1.4s, v2.4s     // encoding: [0x20,0x40,0x62,0x2e]
+// CHECK: raddhn	v0.2s, v1.2d, v2.2d     // encoding: [0x20,0x40,0xa2,0x2e]
+
+        raddhn2 v0.16b, v1.8h, v2.8h
+        raddhn2 v0.8h, v1.4s, v2.4s
+        raddhn2 v0.4s, v1.2d, v2.2d
+
+// CHECK: raddhn2	v0.16b, v1.8h, v2.8h    // encoding: [0x20,0x40,0x22,0x6e]
+// CHECK: raddhn2	v0.8h, v1.4s, v2.4s     // encoding: [0x20,0x40,0x62,0x6e]
+// CHECK: raddhn2	v0.4s, v1.2d, v2.2d     // encoding: [0x20,0x40,0xa2,0x6e]
+
+        rsubhn v0.8b, v1.8h, v2.8h
+        rsubhn v0.4h, v1.4s, v2.4s
+        rsubhn v0.2s, v1.2d, v2.2d
+
+// CHECK: rsubhn	v0.8b, v1.8h, v2.8h     // encoding: [0x20,0x60,0x22,0x2e]
+// CHECK: rsubhn	v0.4h, v1.4s, v2.4s     // encoding: [0x20,0x60,0x62,0x2e]
+// CHECK: rsubhn	v0.2s, v1.2d, v2.2d     // encoding: [0x20,0x60,0xa2,0x2e]
+
+        rsubhn2 v0.16b, v1.8h, v2.8h
+        rsubhn2 v0.8h, v1.4s, v2.4s
+        rsubhn2 v0.4s, v1.2d, v2.2d
+
+// CHECK: rsubhn2	v0.16b, v1.8h, v2.8h    // encoding: [0x20,0x60,0x22,0x6e]
+// CHECK: rsubhn2	v0.8h, v1.4s, v2.4s     // encoding: [0x20,0x60,0x62,0x6e]
+// CHECK: rsubhn2	v0.4s, v1.2d, v2.2d     // encoding: [0x20,0x60,0xa2,0x6e]
diff --git a/test/MC/AArch64/neon-across.s b/test/MC/AArch64/neon-across.s
new file mode 100644
index 0000000..8b1c2d4
--- /dev/null
+++ b/test/MC/AArch64/neon-across.s
@@ -0,0 +1,101 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Instructions across vector registers
+//------------------------------------------------------------------------------
+
+        saddlv h0, v1.8b
+        saddlv h0, v1.16b
+        saddlv s0, v1.4h
+        saddlv s0, v1.8h
+        saddlv d0, v1.4s
+
+// CHECK: saddlv	h0, v1.8b               // encoding: [0x20,0x38,0x30,0x0e]
+// CHECK: saddlv	h0, v1.16b              // encoding: [0x20,0x38,0x30,0x4e]
+// CHECK: saddlv	s0, v1.4h               // encoding: [0x20,0x38,0x70,0x0e]
+// CHECK: saddlv	s0, v1.8h               // encoding: [0x20,0x38,0x70,0x4e]
+// CHECK: saddlv	d0, v1.4s               // encoding: [0x20,0x38,0xb0,0x4e]
+
+        uaddlv h0, v1.8b
+        uaddlv h0, v1.16b
+        uaddlv s0, v1.4h
+        uaddlv s0, v1.8h
+        uaddlv d0, v1.4s
+
+// CHECK: uaddlv	h0, v1.8b               // encoding: [0x20,0x38,0x30,0x2e]
+// CHECK: uaddlv	h0, v1.16b              // encoding: [0x20,0x38,0x30,0x6e]
+// CHECK: uaddlv	s0, v1.4h               // encoding: [0x20,0x38,0x70,0x2e]
+// CHECK: uaddlv	s0, v1.8h               // encoding: [0x20,0x38,0x70,0x6e]
+// CHECK: uaddlv	d0, v1.4s               // encoding: [0x20,0x38,0xb0,0x6e]
+
+        smaxv b0, v1.8b
+        smaxv b0, v1.16b
+        smaxv h0, v1.4h
+        smaxv h0, v1.8h
+        smaxv s0, v1.4s
+
+// CHECK: smaxv	b0, v1.8b               // encoding: [0x20,0xa8,0x30,0x0e]
+// CHECK: smaxv	b0, v1.16b              // encoding: [0x20,0xa8,0x30,0x4e]
+// CHECK: smaxv	h0, v1.4h               // encoding: [0x20,0xa8,0x70,0x0e]
+// CHECK: smaxv	h0, v1.8h               // encoding: [0x20,0xa8,0x70,0x4e]
+// CHECK: smaxv	s0, v1.4s               // encoding: [0x20,0xa8,0xb0,0x4e]
+
+        sminv b0, v1.8b
+        sminv b0, v1.16b
+        sminv h0, v1.4h
+        sminv h0, v1.8h
+        sminv s0, v1.4s
+
+// CHECK: sminv	b0, v1.8b               // encoding: [0x20,0xa8,0x31,0x0e]
+// CHECK: sminv	b0, v1.16b              // encoding: [0x20,0xa8,0x31,0x4e]
+// CHECK: sminv	h0, v1.4h               // encoding: [0x20,0xa8,0x71,0x0e]
+// CHECK: sminv	h0, v1.8h               // encoding: [0x20,0xa8,0x71,0x4e]
+// CHECK: sminv	s0, v1.4s               // encoding: [0x20,0xa8,0xb1,0x4e]
+
+        umaxv b0, v1.8b
+        umaxv b0, v1.16b
+        umaxv h0, v1.4h
+        umaxv h0, v1.8h
+        umaxv s0, v1.4s
+
+// CHECK: umaxv	b0, v1.8b               // encoding: [0x20,0xa8,0x30,0x2e]
+// CHECK: umaxv	b0, v1.16b              // encoding: [0x20,0xa8,0x30,0x6e]
+// CHECK: umaxv	h0, v1.4h               // encoding: [0x20,0xa8,0x70,0x2e]
+// CHECK: umaxv	h0, v1.8h               // encoding: [0x20,0xa8,0x70,0x6e]
+// CHECK: umaxv	s0, v1.4s               // encoding: [0x20,0xa8,0xb0,0x6e]
+
+        uminv b0, v1.8b
+        uminv b0, v1.16b
+        uminv h0, v1.4h
+        uminv h0, v1.8h
+        uminv s0, v1.4s
+
+// CHECK: uminv	b0, v1.8b               // encoding: [0x20,0xa8,0x31,0x2e]
+// CHECK: uminv	b0, v1.16b              // encoding: [0x20,0xa8,0x31,0x6e]
+// CHECK: uminv	h0, v1.4h               // encoding: [0x20,0xa8,0x71,0x2e]
+// CHECK: uminv	h0, v1.8h               // encoding: [0x20,0xa8,0x71,0x6e]
+// CHECK: uminv	s0, v1.4s               // encoding: [0x20,0xa8,0xb1,0x6e]
+
+        addv b0, v1.8b
+        addv b0, v1.16b
+        addv h0, v1.4h
+        addv h0, v1.8h
+        addv s0, v1.4s
+
+// CHECK: addv	b0, v1.8b               // encoding: [0x20,0xb8,0x31,0x0e]
+// CHECK: addv	b0, v1.16b              // encoding: [0x20,0xb8,0x31,0x4e]
+// CHECK: addv	h0, v1.4h               // encoding: [0x20,0xb8,0x71,0x0e]
+// CHECK: addv	h0, v1.8h               // encoding: [0x20,0xb8,0x71,0x4e]
+// CHECK: addv	s0, v1.4s               // encoding: [0x20,0xb8,0xb1,0x4e]
+
+        fmaxnmv s0, v1.4s
+        fminnmv s0, v1.4s
+        fmaxv s0, v1.4s
+        fminv s0, v1.4s
+
+// CHECK: fmaxnmv	s0, v1.4s               // encoding: [0x20,0xc8,0x30,0x6e]
+// CHECK: fminnmv	s0, v1.4s               // encoding: [0x20,0xc8,0xb0,0x6e]
+// CHECK: fmaxv	s0, v1.4s               // encoding: [0x20,0xf8,0x30,0x6e]
+// CHECK: fminv	s0, v1.4s               // encoding: [0x20,0xf8,0xb0,0x6e]
diff --git a/test/MC/AArch64/neon-add-pairwise.s b/test/MC/AArch64/neon-add-pairwise.s
index b586c22..df9938b 100644
--- a/test/MC/AArch64/neon-add-pairwise.s
+++ b/test/MC/AArch64/neon-add-pairwise.s
@@ -32,4 +32,3 @@
 // CHECK: faddp v0.2s, v1.2s, v2.2s       // encoding: [0x20,0xd4,0x22,0x2e]
 // CHECK: faddp v0.4s, v1.4s, v2.4s       // encoding: [0x20,0xd4,0x22,0x6e]
 // CHECK: faddp v0.2d, v1.2d, v2.2d       // encoding: [0x20,0xd4,0x62,0x6e]
-
diff --git a/test/MC/AArch64/neon-add-sub-instructions.s b/test/MC/AArch64/neon-add-sub-instructions.s
index 863798e..68f169b 100644
--- a/test/MC/AArch64/neon-add-sub-instructions.s
+++ b/test/MC/AArch64/neon-add-sub-instructions.s
@@ -64,19 +64,5 @@
 // CHECK: fsub v0.4s, v1.4s, v2.4s       // encoding: [0x20,0xd4,0xa2,0x4e]
 // CHECK: fsub v0.2d, v1.2d, v2.2d       // encoding: [0x20,0xd4,0xe2,0x4e]
 
-//------------------------------------------------------------------------------
-// Scalar Integer Add
-//------------------------------------------------------------------------------
-         add d31, d0, d16
-
-// CHECK: add d31, d0, d16       // encoding: [0x1f,0x84,0xf0,0x5e]
-
-//------------------------------------------------------------------------------
-// Scalar Integer Sub
-//------------------------------------------------------------------------------
-         sub d1, d7, d8
-
-// CHECK: sub d1, d7, d8       // encoding: [0xe1,0x84,0xe8,0x7e]
-
 
 
diff --git a/test/MC/AArch64/neon-crypto.s b/test/MC/AArch64/neon-crypto.s
new file mode 100644
index 0000000..2952dd5
--- /dev/null
+++ b/test/MC/AArch64/neon-crypto.s
@@ -0,0 +1,44 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -mattr=+crypto -show-encoding < %s | FileCheck %s
+// RUN: not llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s 2>&1 | FileCheck -check-prefix=CHECK-NO-CRYPTO %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Instructions for crypto
+//------------------------------------------------------------------------------
+
+        aese v0.16b, v1.16b
+        aesd v0.16b, v1.16b
+        aesmc v0.16b, v1.16b
+        aesimc v0.16b, v1.16b
+
+// CHECK-NO-CRYPTO: error: instruction requires a CPU feature not currently enabled
+// CHECK: aese	v0.16b, v1.16b          // encoding: [0x20,0x48,0x28,0x4e]
+// CHECK: aesd	v0.16b, v1.16b          // encoding: [0x20,0x58,0x28,0x4e]
+// CHECK: aesmc	v0.16b, v1.16b          // encoding: [0x20,0x68,0x28,0x4e]
+// CHECK: aesimc	v0.16b, v1.16b          // encoding: [0x20,0x78,0x28,0x4e]
+
+        sha1h s0, s1
+        sha1su1 v0.4s, v1.4s
+        sha256su0 v0.4s, v1.4s
+
+// CHECK: sha1h	s0, s1                  // encoding: [0x20,0x08,0x28,0x5e]
+// CHECK: sha1su1	v0.4s, v1.4s            // encoding: [0x20,0x18,0x28,0x5e]
+// CHECK: sha256su0	v0.4s, v1.4s    // encoding: [0x20,0x28,0x28,0x5e]
+
+        sha1c q0, s1, v2.4s
+        sha1p q0, s1, v2.4s
+        sha1m q0, s1, v2.4s
+        sha1su0 v0.4s, v1.4s, v2.4s
+        sha256h q0, q1, v2.4s
+        sha256h2 q0, q1, v2.4s
+        sha256su1 v0.4s, v1.4s, v2.4s
+
+// CHECK: sha1c	q0, s1, v2.4s           // encoding: [0x20,0x00,0x02,0x5e]
+// CHECK: sha1p	q0, s1, v2.4s           // encoding: [0x20,0x10,0x02,0x5e]
+// CHECK: sha1m	q0, s1, v2.4s           // encoding: [0x20,0x20,0x02,0x5e]
+// CHECK: sha1su0	v0.4s, v1.4s, v2.4s     // encoding: [0x20,0x30,0x02,0x5e]
+// CHECK: sha256h	q0, q1, v2.4s           // encoding: [0x20,0x40,0x02,0x5e]
+// CHECK: sha256h2	q0, q1, v2.4s   // encoding: [0x20,0x50,0x02,0x5e]
+// CHECK: sha256su1	v0.4s, v1.4s, v2.4s // encoding: [0x20,0x60,0x02,0x5e]
+
diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s
index 5373889..be6c163 100644
--- a/test/MC/AArch64/neon-diagnostics.s
+++ b/test/MC/AArch64/neon-diagnostics.s
@@ -213,6 +213,47 @@
 // CHECK-ERROR:         movi v1.16b, #256
 // CHECK-ERROR:                      ^
 
+//----------------------------------------------------------------------
+// Scalar Floating-point Reciprocal Estimate
+//----------------------------------------------------------------------
+
+    frecpe s19, h14
+    frecpe d13, s13
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        frecpe s19, h14
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        frecpe d13, s13
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Reciprocal Exponent
+//----------------------------------------------------------------------
+
+    frecpx s18, h10
+    frecpx d16, s19
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        frecpx s18, h10
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        frecpx d16, s19
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Reciprocal Square Root Estimate
+//----------------------------------------------------------------------
+
+    frsqrte s22, h13
+    frsqrte d21, s12
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        frsqrte s22, h13
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        frsqrte d21, s12
+// CHECK-ERROR:                     ^
 
 //----------------------------------------------------------------------
 // Vector Move Immediate - bytemask, per doubleword
@@ -826,6 +867,33 @@
 // CHECK-ERROR:        uqsub h1, h2, d2
 // CHECK-ERROR:                      ^
 
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Doubling Multiply Half High (Signed)
+//----------------------------------------------------------------------
+
+    sqdmulh h10, s11, h12
+    sqdmulh s20, h21, s2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmulh h10, s11, h12
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmulh s20, h21, s2
+// CHECK-ERROR:                     ^
+
+//------------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Doubling Multiply Half High (Signed)
+//------------------------------------------------------------------------
+
+    sqrdmulh h10, s11, h12
+    sqrdmulh s20, h21, s2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqrdmulh h10, s11, h12
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqrdmulh s20, h21, s2
+// CHECK-ERROR:                      ^
 
 //----------------------------------------------------------------------
 // Vector Shift Left (Signed and Unsigned Integer)
@@ -845,12 +913,12 @@
 // Vector Saturating Shift Left (Signed and Unsigned Integer)
 //----------------------------------------------------------------------
         // Mismatched vector types
-        sqshl v0.2s, v15.2s, v16.2d
+        sqshl v0.2s, v15.4s, v16.2d
         uqshl v1.8b, v25.4h, v6.8h
 
 // CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR:        sqshl v0.2s, v15.2s, v16.2d
-// CHECK-ERROR:                                 ^
+// CHECK-ERROR:        sqshl v0.2s, v15.4s, v16.2d 
+// CHECK-ERROR:                         ^
 // CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR:        uqshl v1.8b, v25.4h, v6.8h
 // CHECK-ERROR:                         ^
@@ -902,23 +970,23 @@
 //----------------------------------------------------------------------
 
         // Mismatched vector types
-        sqshl b0, b1, s0
-        uqshl h0, h1, b0
-        sqshl s0, s1, h0
-        uqshl d0, d1, b0
+        sqshl b0, s1, b0
+        uqshl h0, b1, h0
+        sqshl s0, h1, s0
+        uqshl d0, b1, d0
 
 // CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR:        sqshl b0, b1, s0
-// CHECK-ERROR:                      ^
+// CHECK-ERROR:        sqshl b0, s1, b0
+// CHECK-ERROR:                  ^
 // CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR:        uqshl h0, h1, b0
-// CHECK-ERROR:                      ^
+// CHECK-ERROR:        uqshl h0, b1, h0
+// CHECK-ERROR:                  ^
 // CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR:        sqshl s0, s1, h0
-// CHECK-ERROR:                      ^
+// CHECK-ERROR:        sqshl s0, h1, s0
+// CHECK-ERROR:                  ^
 // CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR:        uqshl d0, d1, b0
-// CHECK-ERROR:                      ^
+// CHECK-ERROR:        uqshl d0, b1, d0
+// CHECK-ERROR:                  ^
 
 //----------------------------------------------------------------------
 // Scalar Integer Rouding Shift Left (Signed, Unsigned)
@@ -1205,3 +1273,6046 @@
 // CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR:         fmulx v1.4h, v25.4h, v3.4h
 // CHECK-ERROR:                  ^
+
+//------------------------------------------------------------------------------
+// Vector Shift Left by Immediate
+//------------------------------------------------------------------------------
+         // Mismatched vector types and out of range
+         shl v0.4s, v15,2s, #3
+         shl v0.2d, v17.4s, #3
+         shl v0.8b, v31.8b, #-1
+         shl v0.8b, v31.8b, #8
+         shl v0.4s, v21.4s, #32
+         shl v0.2d, v1.2d, #64
+
+// CHECK-ERROR: error: expected comma before next operand
+// CHECK-ERROR:         shl v0.4s, v15,2s, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shl v0.2d, v17.4s, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:         shl v0.8b, v31.8b, #-1
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:         shl v0.8b, v31.8b, #8
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR:         shl v0.4s, v21.4s, #32
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [0, 63]
+// CHECK-ERROR:         shl v0.2d, v1.2d, #64
+// CHECK-ERROR:                           ^
+
+//----------------------------------------------------------------------
+// Vector Shift Left Long by Immediate
+//----------------------------------------------------------------------
+        // Mismatched vector types
+        sshll v0.4s, v15.2s, #3
+        ushll v1.16b, v25.16b, #6
+        sshll2 v0.2d, v3.8s, #15
+        ushll2 v1.4s, v25.4s, #7
+
+        // Out of range 
+        sshll v0.8h, v1.8b, #-1
+        sshll v0.8h, v1.8b, #9
+        ushll v0.4s, v1.4h, #17
+        ushll v0.2d, v1.2s, #33
+        sshll2 v0.8h, v1.16b, #9
+        sshll2 v0.4s, v1.8h, #17
+        ushll2 v0.2d, v1.4s, #33
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sshll v0.4s, v15.2s, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ushll v1.16b, v25.16b, #6
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sshll2 v0.2d, v3.8s, #15
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ushll2 v1.4s, v25.4s, #7
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:        sshll v0.8h, v1.8b, #-1
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:        sshll v0.8h, v1.8b, #9
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [0, 15]
+// CHECK-ERROR:        ushll v0.4s, v1.4h, #17
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR:        ushll v0.2d, v1.2s, #33
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:        sshll2 v0.8h, v1.16b, #9
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [0, 15]
+// CHECK-ERROR:        sshll2 v0.4s, v1.8h, #17
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR:        ushll2 v0.2d, v1.4s, #33
+// CHECK-ERROR:                             ^
+
+
+//------------------------------------------------------------------------------
+// Vector shift right by immediate
+//------------------------------------------------------------------------------
+         sshr v0.8b, v1.8h, #3
+         sshr v0.4h, v1.4s, #3
+         sshr v0.2s, v1.2d, #3
+         sshr v0.16b, v1.16b, #9
+         sshr v0.8h, v1.8h, #17
+         sshr v0.4s, v1.4s, #33
+         sshr v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sshr v0.8b, v1.8h, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sshr v0.4h, v1.4s, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sshr v0.2s, v1.2d, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         sshr v0.16b, v1.16b, #9
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         sshr v0.8h, v1.8h, #17
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         sshr v0.4s, v1.4s, #33
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         sshr v0.2d, v1.2d, #65
+// CHECK-ERROR:                            ^
+
+//------------------------------------------------------------------------------
+// Vector  shift right by immediate
+//------------------------------------------------------------------------------
+         ushr v0.8b, v1.8h, #3
+         ushr v0.4h, v1.4s, #3
+         ushr v0.2s, v1.2d, #3
+         ushr v0.16b, v1.16b, #9
+         ushr v0.8h, v1.8h, #17
+         ushr v0.4s, v1.4s, #33
+         ushr v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ushr v0.8b, v1.8h, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ushr v0.4h, v1.4s, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ushr v0.2s, v1.2d, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         ushr v0.16b, v1.16b, #9
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         ushr v0.8h, v1.8h, #17
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         ushr v0.4s, v1.4s, #33
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         ushr v0.2d, v1.2d, #65
+// CHECK-ERROR:                            ^
+
+//------------------------------------------------------------------------------
+// Vector shift right and accumulate by immediate
+//------------------------------------------------------------------------------
+         ssra v0.8b, v1.8h, #3
+         ssra v0.4h, v1.4s, #3
+         ssra v0.2s, v1.2d, #3
+         ssra v0.16b, v1.16b, #9
+         ssra v0.8h, v1.8h, #17
+         ssra v0.4s, v1.4s, #33
+         ssra v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ssra v0.8b, v1.8h, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ssra v0.4h, v1.4s, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ssra v0.2s, v1.2d, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         ssra v0.16b, v1.16b, #9
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         ssra v0.8h, v1.8h, #17
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         ssra v0.4s, v1.4s, #33
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         ssra v0.2d, v1.2d, #65
+// CHECK-ERROR:                            ^
+
+//------------------------------------------------------------------------------
+// Vector  shift right and accumulate by immediate
+//------------------------------------------------------------------------------
+         usra v0.8b, v1.8h, #3
+         usra v0.4h, v1.4s, #3
+         usra v0.2s, v1.2d, #3
+         usra v0.16b, v1.16b, #9
+         usra v0.8h, v1.8h, #17
+         usra v0.4s, v1.4s, #33
+         usra v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         usra v0.8b, v1.8h, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         usra v0.4h, v1.4s, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         usra v0.2s, v1.2d, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         usra v0.16b, v1.16b, #9
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         usra v0.8h, v1.8h, #17
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         usra v0.4s, v1.4s, #33
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         usra v0.2d, v1.2d, #65
+// CHECK-ERROR:                            ^
+
+//------------------------------------------------------------------------------
+// Vector rounding shift right by immediate
+//------------------------------------------------------------------------------
+         srshr v0.8b, v1.8h, #3
+         srshr v0.4h, v1.4s, #3
+         srshr v0.2s, v1.2d, #3
+         srshr v0.16b, v1.16b, #9
+         srshr v0.8h, v1.8h, #17
+         srshr v0.4s, v1.4s, #33
+         srshr v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         srshr v0.8b, v1.8h, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         srshr v0.4h, v1.4s, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         srshr v0.2s, v1.2d, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         srshr v0.16b, v1.16b, #9
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         srshr v0.8h, v1.8h, #17
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         srshr v0.4s, v1.4s, #33
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         srshr v0.2d, v1.2d, #65
+// CHECK-ERROR:                             ^
+
+//------------------------------------------------------------------------------
+// Vecotr rounding shift right by immediate
+//------------------------------------------------------------------------------
+         urshr v0.8b, v1.8h, #3
+         urshr v0.4h, v1.4s, #3
+         urshr v0.2s, v1.2d, #3
+         urshr v0.16b, v1.16b, #9
+         urshr v0.8h, v1.8h, #17
+         urshr v0.4s, v1.4s, #33
+         urshr v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         urshr v0.8b, v1.8h, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         urshr v0.4h, v1.4s, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         urshr v0.2s, v1.2d, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         urshr v0.16b, v1.16b, #9
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         urshr v0.8h, v1.8h, #17
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         urshr v0.4s, v1.4s, #33
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         urshr v0.2d, v1.2d, #65
+// CHECK-ERROR:                             ^
+
+//------------------------------------------------------------------------------
+// Vector rounding shift right and accumulate by immediate
+//------------------------------------------------------------------------------
+         srsra v0.8b, v1.8h, #3
+         srsra v0.4h, v1.4s, #3
+         srsra v0.2s, v1.2d, #3
+         srsra v0.16b, v1.16b, #9
+         srsra v0.8h, v1.8h, #17
+         srsra v0.4s, v1.4s, #33
+         srsra v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         srsra v0.8b, v1.8h, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         srsra v0.4h, v1.4s, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         srsra v0.2s, v1.2d, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         srsra v0.16b, v1.16b, #9
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         srsra v0.8h, v1.8h, #17
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         srsra v0.4s, v1.4s, #33
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         srsra v0.2d, v1.2d, #65
+// CHECK-ERROR:                             ^
+
+//------------------------------------------------------------------------------
+// Vector rounding shift right and accumulate by immediate
+//------------------------------------------------------------------------------
+         ursra v0.8b, v1.8h, #3
+         ursra v0.4h, v1.4s, #3
+         ursra v0.2s, v1.2d, #3
+         ursra v0.16b, v1.16b, #9
+         ursra v0.8h, v1.8h, #17
+         ursra v0.4s, v1.4s, #33
+         ursra v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ursra v0.8b, v1.8h, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ursra v0.4h, v1.4s, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ursra v0.2s, v1.2d, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         ursra v0.16b, v1.16b, #9
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         ursra v0.8h, v1.8h, #17
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         ursra v0.4s, v1.4s, #33
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         ursra v0.2d, v1.2d, #65
+// CHECK-ERROR:                             ^
+
+//------------------------------------------------------------------------------
+// Vector shift right and insert by immediate
+//------------------------------------------------------------------------------
+         sri v0.8b, v1.8h, #3
+         sri v0.4h, v1.4s, #3
+         sri v0.2s, v1.2d, #3
+         sri v0.16b, v1.16b, #9
+         sri v0.8h, v1.8h, #17
+         sri v0.4s, v1.4s, #33
+         sri v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sri v0.8b, v1.8h, #3
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sri v0.4h, v1.4s, #3
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sri v0.2s, v1.2d, #3
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         sri v0.16b, v1.16b, #9
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         sri v0.8h, v1.8h, #17
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         sri v0.4s, v1.4s, #33
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         sri v0.2d, v1.2d, #65
+// CHECK-ERROR:                           ^
+
+//------------------------------------------------------------------------------
+// Vector shift left and insert by immediate
+//------------------------------------------------------------------------------
+         sli v0.8b, v1.8h, #3
+         sli v0.4h, v1.4s, #3
+         sli v0.2s, v1.2d, #3
+         sli v0.16b, v1.16b, #8
+         sli v0.8h, v1.8h, #16
+         sli v0.4s, v1.4s, #32
+         sli v0.2d, v1.2d, #64
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sli v0.8b, v1.8h, #3
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sli v0.4h, v1.4s, #3
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sli v0.2s, v1.2d, #3
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:         sli v0.16b, v1.16b, #8
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [0, 15]
+// CHECK-ERROR:         sli v0.8h, v1.8h, #16
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR:         sli v0.4s, v1.4s, #32
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: expected integer in range [0, 63]
+// CHECK-ERROR:         sli v0.2d, v1.2d, #64
+// CHECK-ERROR:                           ^
+
+//------------------------------------------------------------------------------
+// Vector saturating shift left unsigned by immediate
+//------------------------------------------------------------------------------
+         sqshlu v0.8b, v1.8h, #3
+         sqshlu v0.4h, v1.4s, #3
+         sqshlu v0.2s, v1.2d, #3
+         sqshlu v0.16b, v1.16b, #8
+         sqshlu v0.8h, v1.8h, #16
+         sqshlu v0.4s, v1.4s, #32
+         sqshlu v0.2d, v1.2d, #64
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshlu v0.8b, v1.8h, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshlu v0.4h, v1.4s, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshlu v0.2s, v1.2d, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:         sqshlu v0.16b, v1.16b, #8
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: expected integer in range [0, 15]
+// CHECK-ERROR:         sqshlu v0.8h, v1.8h, #16
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR:         sqshlu v0.4s, v1.4s, #32
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [0, 63]
+// CHECK-ERROR:         sqshlu v0.2d, v1.2d, #64
+// CHECK-ERROR:                              ^
+
+//------------------------------------------------------------------------------
+// Vector saturating shift left by immediate
+//------------------------------------------------------------------------------
+         sqshl v0.8b, v1.8h, #3
+         sqshl v0.4h, v1.4s, #3
+         sqshl v0.2s, v1.2d, #3
+         sqshl v0.16b, v1.16b, #8
+         sqshl v0.8h, v1.8h, #16
+         sqshl v0.4s, v1.4s, #32
+         sqshl v0.2d, v1.2d, #64
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshl v0.8b, v1.8h, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshl v0.4h, v1.4s, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshl v0.2s, v1.2d, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:         sqshl v0.16b, v1.16b, #8
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: expected integer in range [0, 15]
+// CHECK-ERROR:         sqshl v0.8h, v1.8h, #16
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR:         sqshl v0.4s, v1.4s, #32
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [0, 63]
+// CHECK-ERROR:         sqshl v0.2d, v1.2d, #64
+// CHECK-ERROR:                             ^
+
+//------------------------------------------------------------------------------
+// Vector saturating shift left by immediate
+//------------------------------------------------------------------------------
+         uqshl v0.8b, v1.8h, #3
+         uqshl v0.4h, v1.4s, #3
+         uqshl v0.2s, v1.2d, #3
+         uqshl v0.16b, v1.16b, #8
+         uqshl v0.8h, v1.8h, #16
+         uqshl v0.4s, v1.4s, #32
+         uqshl v0.2d, v1.2d, #64
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqshl v0.8b, v1.8h, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqshl v0.4h, v1.4s, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqshl v0.2s, v1.2d, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:         uqshl v0.16b, v1.16b, #8
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: expected integer in range [0, 15]
+// CHECK-ERROR:         uqshl v0.8h, v1.8h, #16
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR:         uqshl v0.4s, v1.4s, #32
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [0, 63]
+// CHECK-ERROR:         uqshl v0.2d, v1.2d, #64
+// CHECK-ERROR:                             ^
+
+//------------------------------------------------------------------------------
+// Vector shift right narrow by immediate
+//------------------------------------------------------------------------------
+         shrn v0.8b, v1.8b, #3
+         shrn v0.4h, v1.4h, #3
+         shrn v0.2s, v1.2s, #3
+         shrn2 v0.16b, v1.8h, #17
+         shrn2 v0.8h, v1.4s, #33
+         shrn2 v0.4s, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shrn v0.8b, v1.8b, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shrn v0.4h, v1.4h, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shrn v0.2s, v1.2s, #3
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         shrn2 v0.16b, v1.8h, #17
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         shrn2 v0.8h, v1.4s, #33
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         shrn2 v0.4s, v1.2d, #65
+// CHECK-ERROR:                             ^
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right unsigned narrow by immediate
+//------------------------------------------------------------------------------
+         sqshrun v0.8b, v1.8b, #3
+         sqshrun v0.4h, v1.4h, #3
+         sqshrun v0.2s, v1.2s, #3
+         sqshrun2 v0.16b, v1.8h, #17
+         sqshrun2 v0.8h, v1.4s, #33
+         sqshrun2 v0.4s, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshrun v0.8b, v1.8b, #3
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshrun v0.4h, v1.4h, #3
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshrun v0.2s, v1.2s, #3
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         sqshrun2 v0.16b, v1.8h, #17
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         sqshrun2 v0.8h, v1.4s, #33
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         sqshrun2 v0.4s, v1.2d, #65
+// CHECK-ERROR:                                ^
+
+//------------------------------------------------------------------------------
+// Vector rounding shift right narrow by immediate
+//------------------------------------------------------------------------------
+         rshrn v0.8b, v1.8b, #3
+         rshrn v0.4h, v1.4h, #3
+         rshrn v0.2s, v1.2s, #3
+         rshrn2 v0.16b, v1.8h, #17
+         rshrn2 v0.8h, v1.4s, #33
+         rshrn2 v0.4s, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rshrn v0.8b, v1.8b, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rshrn v0.4h, v1.4h, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rshrn v0.2s, v1.2s, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         rshrn2 v0.16b, v1.8h, #17
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         rshrn2 v0.8h, v1.4s, #33
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         rshrn2 v0.4s, v1.2d, #65
+// CHECK-ERROR:                              ^
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right rounded unsigned narrow by immediate
+//------------------------------------------------------------------------------
+         sqrshrun v0.8b, v1.8b, #3
+         sqrshrun v0.4h, v1.4h, #3
+         sqrshrun v0.2s, v1.2s, #3
+         sqrshrun2 v0.16b, v1.8h, #17
+         sqrshrun2 v0.8h, v1.4s, #33
+         sqrshrun2 v0.4s, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqrshrun v0.8b, v1.8b, #3
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqrshrun v0.4h, v1.4h, #3
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqrshrun v0.2s, v1.2s, #3
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         sqrshrun2 v0.16b, v1.8h, #17
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         sqrshrun2 v0.8h, v1.4s, #33
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         sqrshrun2 v0.4s, v1.2d, #65
+// CHECK-ERROR:                                 ^
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right narrow by immediate
+//------------------------------------------------------------------------------
+         sqshrn v0.8b, v1.8b, #3
+         sqshrn v0.4h, v1.4h, #3
+         sqshrn v0.2s, v1.2s, #3
+         sqshrn2 v0.16b, v1.8h, #17
+         sqshrn2 v0.8h, v1.4s, #33
+         sqshrn2 v0.4s, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshrn v0.8b, v1.8b, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshrn v0.4h, v1.4h, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqshrn v0.2s, v1.2s, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         sqshrn2 v0.16b, v1.8h, #17
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         sqshrn2 v0.8h, v1.4s, #33
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         sqshrn2 v0.4s, v1.2d, #65
+// CHECK-ERROR:                               ^
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right narrow by immediate
+//------------------------------------------------------------------------------
+         uqshrn v0.8b, v1.8b, #3
+         uqshrn v0.4h, v1.4h, #3
+         uqshrn v0.2s, v1.2s, #3
+         uqshrn2 v0.16b, v1.8h, #17
+         uqshrn2 v0.8h, v1.4s, #33
+         uqshrn2 v0.4s, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqshrn v0.8b, v1.8b, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqshrn v0.4h, v1.4h, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqshrn v0.2s, v1.2s, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         uqshrn2 v0.16b, v1.8h, #17
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         uqshrn2 v0.8h, v1.4s, #33
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         uqshrn2 v0.4s, v1.2d, #65
+// CHECK-ERROR:                               ^
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right rounded narrow by immediate
+//------------------------------------------------------------------------------
+         sqrshrn v0.8b, v1.8b, #3
+         sqrshrn v0.4h, v1.4h, #3
+         sqrshrn v0.2s, v1.2s, #3
+         sqrshrn2 v0.16b, v1.8h, #17
+         sqrshrn2 v0.8h, v1.4s, #33
+         sqrshrn2 v0.4s, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqrshrn v0.8b, v1.8b, #3
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqrshrn v0.4h, v1.4h, #3
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqrshrn v0.2s, v1.2s, #3
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         sqrshrn2 v0.16b, v1.8h, #17
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         sqrshrn2 v0.8h, v1.4s, #33
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         sqrshrn2 v0.4s, v1.2d, #65
+// CHECK-ERROR:                                ^
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right rounded narrow by immediate
+//------------------------------------------------------------------------------
+         uqrshrn v0.8b, v1.8b, #3
+         uqrshrn v0.4h, v1.4h, #3
+         uqrshrn v0.2s, v1.2s, #3
+         uqrshrn2 v0.16b, v1.8h, #17
+         uqrshrn2 v0.8h, v1.4s, #33
+         uqrshrn2 v0.4s, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqrshrn v0.8b, v1.8b, #3
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqrshrn v0.4h, v1.4h, #3
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqrshrn v0.2s, v1.2s, #3
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:         uqrshrn2 v0.16b, v1.8h, #17
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:         uqrshrn2 v0.8h, v1.4s, #33
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         uqrshrn2 v0.4s, v1.2d, #65
+// CHECK-ERROR:                                ^
+
+//------------------------------------------------------------------------------
+// Fixed-point convert to floating-point
+//------------------------------------------------------------------------------
+         scvtf v0.2s, v1.2d, #3
+         scvtf v0.4s, v1.4h, #3
+         scvtf v0.2d, v1.2s, #3
+         ucvtf v0.2s, v1.2s, #33
+         ucvtf v0.4s, v1.4s, #33
+         ucvtf v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         scvtf v0.2s, v1.2d, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         scvtf v0.4s, v1.4h, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         scvtf v0.2d, v1.2s, #3
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         ucvtf v0.2s, v1.2s, #33
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         ucvtf v0.4s, v1.4s, #33
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         ucvtf v0.2d, v1.2d, #65
+// CHECK-ERROR:                             ^
+
+//------------------------------------------------------------------------------
+// Floating-point convert to fixed-point
+//------------------------------------------------------------------------------
+         fcvtzs v0.2s, v1.2d, #3
+         fcvtzs v0.4s, v1.4h, #3
+         fcvtzs v0.2d, v1.2s, #3
+         fcvtzu v0.2s, v1.2s, #33
+         fcvtzu v0.4s, v1.4s, #33
+         fcvtzu v0.2d, v1.2d, #65
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzs v0.2s, v1.2d, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzs v0.4s, v1.4h, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzs v0.2d, v1.2s, #3
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         fcvtzu v0.2s, v1.2s, #33
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:         fcvtzu v0.4s, v1.4s, #33
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:         fcvtzu v0.2d, v1.2d, #65
+// CHECK-ERROR:                              ^
+
+//----------------------------------------------------------------------
+// Vector operation on 3 operands with different types
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+        saddl v0.8h, v1.8h, v2.8b
+        saddl v0.4s, v1.4s, v2.4h
+        saddl v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddl v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddl v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddl v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        saddl2 v0.4s, v1.8s, v2.8h
+        saddl2 v0.8h, v1.16h, v2.16b
+        saddl2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddl2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddl2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddl2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        uaddl v0.8h, v1.8h, v2.8b
+        uaddl v0.4s, v1.4s, v2.4h
+        uaddl v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddl v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddl v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddl v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        uaddl2 v0.8h, v1.16h, v2.16b
+        uaddl2 v0.4s, v1.8s, v2.8h
+        uaddl2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddl2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddl2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddl2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        ssubl v0.8h, v1.8h, v2.8b
+        ssubl v0.4s, v1.4s, v2.4h
+        ssubl v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubl v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubl v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubl v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        ssubl2 v0.8h, v1.16h, v2.16b
+        ssubl2 v0.4s, v1.8s, v2.8h
+        ssubl2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubl2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubl2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubl2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        usubl v0.8h, v1.8h, v2.8b
+        usubl v0.4s, v1.4s, v2.4h
+        usubl v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubl v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubl v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubl v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        usubl2 v0.8h, v1.16h, v2.16b
+        usubl2 v0.4s, v1.8s, v2.8h
+        usubl2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubl2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubl2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubl2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        sabal v0.8h, v1.8h, v2.8b
+        sabal v0.4s, v1.4s, v2.4h
+        sabal v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabal v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabal v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabal v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        sabal2 v0.8h, v1.16h, v2.16b
+        sabal2 v0.4s, v1.8s, v2.8h
+        sabal2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabal2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabal2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabal2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        uabal v0.8h, v1.8h, v2.8b
+        uabal v0.4s, v1.4s, v2.4h
+        uabal v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabal v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabal v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabal v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        uabal2 v0.8h, v1.16h, v2.16b
+        uabal2 v0.4s, v1.8s, v2.8h
+        uabal2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabal2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabal2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabal2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        sabdl v0.8h, v1.8h, v2.8b
+        sabdl v0.4s, v1.4s, v2.4h
+        sabdl v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabdl v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabdl v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabdl v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        sabdl2 v0.8h, v1.16h, v2.16b
+        sabdl2 v0.4s, v1.8s, v2.8h
+        sabdl2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabdl2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabdl2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sabdl2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        uabdl v0.8h, v1.8h, v2.8b
+        uabdl v0.4s, v1.4s, v2.4h
+        uabdl v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabdl v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabdl v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabdl v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        uabdl2 v0.8h, v1.16h, v2.16b
+        uabdl2 v0.4s, v1.8s, v2.8h
+        uabdl2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabdl2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabdl2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uabdl2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        smlal v0.8h, v1.8h, v2.8b
+        smlal v0.4s, v1.4s, v2.4h
+        smlal v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        smlal2 v0.8h, v1.16h, v2.16b
+        smlal2 v0.4s, v1.8s, v2.8h
+        smlal2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        umlal v0.8h, v1.8h, v2.8b
+        umlal v0.4s, v1.4s, v2.4h
+        umlal v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        umlal2 v0.8h, v1.16h, v2.16b
+        umlal2 v0.4s, v1.8s, v2.8h
+        umlal2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        smlsl v0.8h, v1.8h, v2.8b
+        smlsl v0.4s, v1.4s, v2.4h
+        smlsl v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        smlsl2 v0.8h, v1.16h, v2.16b
+        smlsl2 v0.4s, v1.8s, v2.8h
+        smlsl2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        umlsl v0.8h, v1.8h, v2.8b
+        umlsl v0.4s, v1.4s, v2.4h
+        umlsl v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        umlsl2 v0.8h, v1.16h, v2.16b
+        umlsl2 v0.4s, v1.8s, v2.8h
+        umlsl2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        smull v0.8h, v1.8h, v2.8b
+        smull v0.4s, v1.4s, v2.4h
+        smull v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        smull2 v0.8h, v1.16h, v2.16b
+        smull2 v0.4s, v1.8s, v2.8h
+        smull2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+        umull v0.8h, v1.8h, v2.8b
+        umull v0.4s, v1.4s, v2.4h
+        umull v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                        ^
+
+        umull2 v0.8h, v1.16h, v2.16b
+        umull2 v0.4s, v1.8s, v2.8h
+        umull2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                      ^
+
+//------------------------------------------------------------------------------
+// Long - Variant 2
+//------------------------------------------------------------------------------
+
+        sqdmlal v0.4s, v1.4s, v2.4h
+        sqdmlal v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                          ^
+
+        sqdmlal2 v0.4s, v1.8s, v2.8h
+        sqdmlal2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                        ^
+
+        // Mismatched vector types
+        sqdmlal v0.8h, v1.8b, v2.8b
+        sqdmlal2 v0.8h, v1.16b, v2.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal v0.8h, v1.8b, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal2 v0.8h, v1.16b, v2.16b
+// CHECK-ERROR:                    ^
+
+        sqdmlsl v0.4s, v1.4s, v2.4h
+        sqdmlsl v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                          ^
+
+        sqdmlsl2 v0.4s, v1.8s, v2.8h
+        sqdmlsl2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                        ^
+
+        // Mismatched vector types
+        sqdmlsl v0.8h, v1.8b, v2.8b
+        sqdmlsl2 v0.8h, v1.16b, v2.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl v0.8h, v1.8b, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl2 v0.8h, v1.16b, v2.16b
+// CHECK-ERROR:                    ^
+
+
+        sqdmull v0.4s, v1.4s, v2.4h
+        sqdmull v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull v0.4s, v1.4s, v2.4h
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                          ^
+
+        sqdmull2 v0.4s, v1.8s, v2.8h
+        sqdmull2 v0.2d, v1.4d, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull2 v0.4s, v1.8s, v2.8h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull2 v0.2d, v1.4d, v2.4s
+// CHECK-ERROR:                        ^
+
+        // Mismatched vector types
+        sqdmull v0.8h, v1.8b, v2.8b
+        sqdmull2 v0.8h, v1.16b, v2.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull v0.8h, v1.8b, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull2 v0.8h, v1.16b, v2.16b
+// CHECK-ERROR:                    ^
+
+
+//------------------------------------------------------------------------------
+// Long - Variant 3
+//------------------------------------------------------------------------------
+
+        pmull v0.8h, v1.8h, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        pmull v0.8h, v1.8h, v2.8b
+// CHECK-ERROR:                        ^
+
+        pmull v0.1q, v1.2d, v2.2d
+        
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        pmull v0.1q, v1.2d, v2.2d
+// CHECK-ERROR:                     ^
+
+        // Mismatched vector types
+        pmull v0.4s, v1.4h, v2.4h
+        pmull v0.2d, v1.2s, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        pmull v0.4s, v1.4h, v2.4h
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        pmull v0.2d, v1.2s, v2.2s
+// CHECK-ERROR:                 ^
+
+
+        pmull2 v0.8h, v1.16h, v2.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        pmull2 v0.8h, v1.16h, v2.16b
+// CHECK-ERROR:                      ^
+
+        pmull2 v0.q, v1.2d, v2.2d
+        
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        pmull2 v0.q, v1.2d, v2.2d
+// CHECK-ERROR:                  ^
+
+        // Mismatched vector types
+        pmull2 v0.4s, v1.8h v2.8h
+        pmull2 v0.2d, v1.4s, v2.4s
+
+// CHECK-ERROR: error: expected comma before next operand
+// CHECK-ERROR:        pmull2 v0.4s, v1.8h v2.8h
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        pmull2 v0.2d, v1.4s, v2.4s
+// CHECK-ERROR:                  ^
+
+//------------------------------------------------------------------------------
+// Widen
+//------------------------------------------------------------------------------
+
+        saddw v0.8h, v1.8h, v2.8h
+        saddw v0.4s, v1.4s, v2.4s
+        saddw v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddw v0.8h, v1.8h, v2.8h
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddw v0.4s, v1.4s, v2.4s
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddw v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                               ^
+
+        saddw2 v0.8h, v1.8h, v2.16h
+        saddw2 v0.4s, v1.4s, v2.8s
+        saddw2 v0.2d, v1.2d, v2.4d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddw2 v0.8h, v1.8h, v2.16h
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddw2 v0.4s, v1.4s, v2.8s
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddw2 v0.2d, v1.2d, v2.4d
+// CHECK-ERROR:                             ^
+
+        uaddw v0.8h, v1.8h, v2.8h
+        uaddw v0.4s, v1.4s, v2.4s
+        uaddw v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddw v0.8h, v1.8h, v2.8h
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddw v0.4s, v1.4s, v2.4s
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddw v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                               ^
+
+        uaddw2 v0.8h, v1.8h, v2.16h
+        uaddw2 v0.4s, v1.4s, v2.8s
+        uaddw2 v0.2d, v1.2d, v2.4d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddw2 v0.8h, v1.8h, v2.16h
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddw2 v0.4s, v1.4s, v2.8s
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddw2 v0.2d, v1.2d, v2.4d
+// CHECK-ERROR:                             ^
+
+        ssubw v0.8h, v1.8h, v2.8h
+        ssubw v0.4s, v1.4s, v2.4s
+        ssubw v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubw v0.8h, v1.8h, v2.8h
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubw v0.4s, v1.4s, v2.4s
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubw v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                               ^
+
+        ssubw2 v0.8h, v1.8h, v2.16h
+        ssubw2 v0.4s, v1.4s, v2.8s
+        ssubw2 v0.2d, v1.2d, v2.4d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubw2 v0.8h, v1.8h, v2.16h
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubw2 v0.4s, v1.4s, v2.8s
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ssubw2 v0.2d, v1.2d, v2.4d
+// CHECK-ERROR:                             ^
+
+        usubw v0.8h, v1.8h, v2.8h
+        usubw v0.4s, v1.4s, v2.4s
+        usubw v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubw v0.8h, v1.8h, v2.8h
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubw v0.4s, v1.4s, v2.4s
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubw v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                               ^
+
+        usubw2 v0.8h, v1.8h, v2.16h
+        usubw2 v0.4s, v1.4s, v2.8s
+        usubw2 v0.2d, v1.2d, v2.4d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubw2 v0.8h, v1.8h, v2.16h
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubw2 v0.4s, v1.4s, v2.8s
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usubw2 v0.2d, v1.2d, v2.4d
+// CHECK-ERROR:                             ^
+
+//------------------------------------------------------------------------------
+// Narrow
+//------------------------------------------------------------------------------
+
+        addhn v0.8b, v1.8h, v2.8d
+        addhn v0.4h, v1.4s, v2.4h
+        addhn v0.2s, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        addhn v0.8b, v1.8h, v2.8d
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        addhn v0.4h, v1.4s, v2.4h
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        addhn v0.2s, v1.2d, v2.2s
+// CHECK-ERROR:                               ^
+
+        addhn2 v0.16b, v1.8h, v2.8b
+        addhn2 v0.8h, v1.4s, v2.4h
+        addhn2 v0.4s, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        addhn2 v0.16b, v1.8h, v2.8b
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        addhn2 v0.8h, v1.4s, v2.4h
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        addhn2 v0.4s, v1.2d, v2.2s
+// CHECK-ERROR:                                ^
+
+        raddhn v0.8b, v1.8h, v2.8b
+        raddhn v0.4h, v1.4s, v2.4h
+        raddhn v0.2s, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        raddhn v0.8b, v1.8h, v2.8b
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        raddhn v0.4h, v1.4s, v2.4h
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        raddhn v0.2s, v1.2d, v2.2s
+// CHECK-ERROR:                                ^
+
+        raddhn2 v0.16b, v1.8h, v2.8b
+        raddhn2 v0.8h, v1.4s, v2.4h
+        raddhn2 v0.4s, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        raddhn2 v0.16b, v1.8h, v2.8b
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        raddhn2 v0.8h, v1.4s, v2.4h
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        raddhn2 v0.4s, v1.2d, v2.2s
+// CHECK-ERROR:                                 ^
+
+        rsubhn v0.8b, v1.8h, v2.8b
+        rsubhn v0.4h, v1.4s, v2.4h
+        rsubhn v0.2s, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        rsubhn v0.8b, v1.8h, v2.8b
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        rsubhn v0.4h, v1.4s, v2.4h
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        rsubhn v0.2s, v1.2d, v2.2s
+// CHECK-ERROR:                                ^
+
+        rsubhn2 v0.16b, v1.8h, v2.8b
+        rsubhn2 v0.8h, v1.4s, v2.4h
+        rsubhn2 v0.4s, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        rsubhn2 v0.16b, v1.8h, v2.8b
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        rsubhn2 v0.8h, v1.4s, v2.4h
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        rsubhn2 v0.4s, v1.2d, v2.2s
+// CHECK-ERROR:                                 ^
+
+//----------------------------------------------------------------------
+// Scalar Reduce Add Pairwise (Integer)
+//----------------------------------------------------------------------
+         // invalid vector types
+      addp s0, d1.2d
+      addp d0, d1.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          addp s0, d1.2d
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          addp d0, d1.2s
+// CHECK-ERROR:                      ^
+
+//----------------------------------------------------------------------
+// Scalar Reduce Add Pairwise (Floating Point)
+//----------------------------------------------------------------------
+         // invalid vector types
+      faddp s0, d1.2d
+      faddp d0, d1.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          faddp s0, d1.2d
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          faddp d0, d1.2s
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Reduce Maximum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+         // mismatched and invalid vector types
+      fmaxp s0, v1.2d
+      fmaxp d31, v2.2s
+      fmaxp h3, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmaxp s0, v1.2d
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmaxp d31, v2.2s
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmaxp h3, v2.2s
+// CHECK-ERROR:                ^
+
+
+//----------------------------------------------------------------------
+// Scalar Reduce Minimum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+         // mismatched and invalid vector types
+      fminp s0, v1.4h
+      fminp d31, v2.8h
+      fminp b3, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fminp s0, v1.4h
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fminp d31, v2.8h
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fminp b3, v2.2s
+// CHECK-ERROR:                ^
+
+
+//----------------------------------------------------------------------
+// Scalar Reduce maxNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+         // mismatched and invalid vector types
+      fmaxnmp s0, v1.8b
+      fmaxnmp d31, v2.16b
+      fmaxnmp v1.2s, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmaxnmp s0, v1.8b
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmaxnmp d31, v2.16b
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: too few operands for instruction
+// CHECK-ERROR:          fmaxnmp v1.2s, v2.2s
+// CHECK-ERROR:          ^
+
+//----------------------------------------------------------------------
+// Scalar Reduce minNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+         // mismatched and invalid vector types
+      fminnmp s0, v1.2d
+      fminnmp d31, v2.4s
+      fminnmp v1.4s, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fminnmp s0, v1.2d
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fminnmp d31, v2.4s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fminnmp v1.4s, v2.2d
+// CHECK-ERROR:          ^
+
+      mla v0.2d, v1.2d, v16.d[1]
+      mla v0.2s, v1.2s, v2.s[4]
+      mla v0.4s, v1.4s, v2.s[4]
+      mla v0.2h, v1.2h, v2.h[1]
+      mla v0.4h, v1.4h, v2.h[8]
+      mla v0.8h, v1.8h, v2.h[8]
+      mla v0.4h, v1.4h, v16.h[2]
+      mla v0.8h, v1.8h, v16.h[2]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        mla v0.2d, v1.2d, v16.d[1]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mla v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mla v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        mla v0.2h, v1.2h, v2.h[1]
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mla v0.4h, v1.4h, v2.h[8]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mla v0.8h, v1.8h, v2.h[8]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        mla v0.4h, v1.4h, v16.h[2]
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        mla v0.8h, v1.8h, v16.h[2]
+// CHECK-ERROR:                              ^
+
+      mls v0.2d, v1.2d, v16.d[1]
+      mls v0.2s, v1.2s, v2.s[4]
+      mls v0.4s, v1.4s, v2.s[4]
+      mls v0.2h, v1.2h, v2.h[1]
+      mls v0.4h, v1.4h, v2.h[8]
+      mls v0.8h, v1.8h, v2.h[8]
+      mls v0.4h, v1.4h, v16.h[2]
+      mls v0.8h, v1.8h, v16.h[2]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        mls v0.2d, v1.2d, v16.d[1]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mls v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mls v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        mls v0.2h, v1.2h, v2.h[1]
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mls v0.4h, v1.4h, v2.h[8]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mls v0.8h, v1.8h, v2.h[8]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        mls v0.4h, v1.4h, v16.h[2]
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        mls v0.8h, v1.8h, v16.h[2]
+// CHECK-ERROR:                              ^
+
+      fmla v0.4h, v1.4h, v2.h[2]
+      fmla v0.8h, v1.8h, v2.h[2]
+      fmla v0.2s, v1.2s, v2.s[4]
+      fmla v0.2s, v1.2s, v22.s[4]
+      fmla v3.4s, v8.4s, v2.s[4]
+      fmla v3.4s, v8.4s, v22.s[4]
+      fmla v0.2d, v1.2d, v2.d[2]
+      fmla v0.2d, v1.2d, v22.d[2]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmla v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmla v0.8h, v1.8h, v2.h[2]
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmla v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmla v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmla v3.4s, v8.4s, v2.s[4]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmla v3.4s, v8.4s, v22.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmla v0.2d, v1.2d, v2.d[2]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmla v0.2d, v1.2d, v22.d[2]
+// CHECK-ERROR:                                 ^
+
+      fmls v0.4h, v1.4h, v2.h[2]
+      fmls v0.8h, v1.8h, v2.h[2]
+      fmls v0.2s, v1.2s, v2.s[4]
+      fmls v0.2s, v1.2s, v22.s[4]
+      fmls v3.4s, v8.4s, v2.s[4]
+      fmls v3.4s, v8.4s, v22.s[4]
+      fmls v0.2d, v1.2d, v2.d[2]
+      fmls v0.2d, v1.2d, v22.d[2]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmls v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmls v0.8h, v1.8h, v2.h[2]
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmls v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmls v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmls v3.4s, v8.4s, v2.s[4]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmls v3.4s, v8.4s, v22.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmls v0.2d, v1.2d, v2.d[2]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmls v0.2d, v1.2d, v22.d[2]
+// CHECK-ERROR:                                 ^
+
+      smlal v0.4h, v1.4h, v2.h[2]
+      smlal v0.4s, v1.4h, v2.h[8]
+      smlal v0.4s, v1.4h, v16.h[2]
+      smlal v0.2s, v1.2s, v2.s[4]
+      smlal v0.2d, v1.2s, v2.s[4]
+      smlal v0.2d, v1.2s, v22.s[4]
+      smlal2 v0.4h, v1.8h, v1.h[2]
+      smlal2 v0.4s, v1.8h, v1.h[8]
+      smlal2 v0.4s, v1.8h, v16.h[2]
+      smlal2 v0.2s, v1.4s, v1.s[2]
+      smlal2 v0.2d, v1.4s, v1.s[4]
+      smlal2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlal v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlal v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlal v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlal v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlal2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlal2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlal2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlal2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR:                                   ^
+
+      smlsl v0.4h, v1.4h, v2.h[2]
+      smlsl v0.4s, v1.4h, v2.h[8]
+      smlsl v0.4s, v1.4h, v16.h[2]
+      smlsl v0.2s, v1.2s, v2.s[4]
+      smlsl v0.2d, v1.2s, v2.s[4]
+      smlsl v0.2d, v1.2s, v22.s[4]
+      smlsl2 v0.4h, v1.8h, v1.h[2]
+      smlsl2 v0.4s, v1.8h, v1.h[8]
+      smlsl2 v0.4s, v1.8h, v16.h[2]
+      smlsl2 v0.2s, v1.4s, v1.s[2]
+      smlsl2 v0.2d, v1.4s, v1.s[4]
+      smlsl2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlsl v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlsl v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlsl v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlsl v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlsl2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smlsl2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlsl2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smlsl2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR:                                   ^
+
+      umlal v0.4h, v1.4h, v2.h[2]
+      umlal v0.4s, v1.4h, v2.h[8]
+      umlal v0.4s, v1.4h, v16.h[2]
+      umlal v0.2s, v1.2s, v2.s[4]
+      umlal v0.2d, v1.2s, v2.s[4]
+      umlal v0.2d, v1.2s, v22.s[4]
+      umlal2 v0.4h, v1.8h, v1.h[2]
+      umlal2 v0.4s, v1.8h, v1.h[8]
+      umlal2 v0.4s, v1.8h, v16.h[2]
+      umlal2 v0.2s, v1.4s, v1.s[2]
+      umlal2 v0.2d, v1.4s, v1.s[4]
+      umlal2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlal v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlal v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlal v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlal v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlal2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlal2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlal2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlal2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR:                                   ^
+
+      umlsl v0.4h, v1.4h, v2.h[2]
+      umlsl v0.4s, v1.4h, v2.h[8]
+      umlsl v0.4s, v1.4h, v16.h[2]
+      umlsl v0.2s, v1.2s, v2.s[4]
+      umlsl v0.2d, v1.2s, v2.s[4]
+      umlsl v0.2d, v1.2s, v22.s[4]
+      umlsl2 v0.4h, v1.8h, v1.h[2]
+      umlsl2 v0.4s, v1.8h, v1.h[8]
+      umlsl2 v0.4s, v1.8h, v16.h[2]
+      umlsl2 v0.2s, v1.4s, v1.s[2]
+      umlsl2 v0.2d, v1.4s, v1.s[4]
+      umlsl2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlsl v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlsl v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlsl v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlsl v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlsl2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umlsl2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlsl2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umlsl2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR:                                   ^
+
+      sqdmlal v0.4h, v1.4h, v2.h[2]
+      sqdmlal v0.4s, v1.4h, v2.h[8]
+      sqdmlal v0.4s, v1.4h, v16.h[2]
+      sqdmlal v0.2s, v1.2s, v2.s[4]
+      sqdmlal v0.2d, v1.2s, v2.s[4]
+      sqdmlal v0.2d, v1.2s, v22.s[4]
+      sqdmlal2 v0.4h, v1.8h, v1.h[2]
+      sqdmlal2 v0.4s, v1.8h, v1.h[8]
+      sqdmlal2 v0.4s, v1.8h, v16.h[2]
+      sqdmlal2 v0.2s, v1.4s, v1.s[2]
+      sqdmlal2 v0.2d, v1.4s, v1.s[4]
+      sqdmlal2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlal v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlal v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlal v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlal v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlal2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlal2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlal2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR:                                     ^
+
+      sqdmlsl v0.4h, v1.4h, v2.h[2]
+      sqdmlsl v0.4s, v1.4h, v2.h[8]
+      sqdmlsl v0.4s, v1.4h, v16.h[2]
+      sqdmlsl v0.2s, v1.2s, v2.s[4]
+      sqdmlsl v0.2d, v1.2s, v2.s[4]
+      sqdmlsl v0.2d, v1.2s, v22.s[4]
+      sqdmlsl2 v0.4h, v1.8h, v1.h[2]
+      sqdmlsl2 v0.4s, v1.8h, v1.h[8]
+      sqdmlsl2 v0.4s, v1.8h, v16.h[2]
+      sqdmlsl2 v0.2s, v1.4s, v1.s[2]
+      sqdmlsl2 v0.2d, v1.4s, v1.s[4]
+      sqdmlsl2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlsl v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlsl v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlsl v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlsl v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlsl2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlsl2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmlsl2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR:                                     ^
+
+      mul v0.4h, v1.4h, v2.h[8]
+      mul v0.4h, v1.4h, v16.h[8]
+      mul v0.8h, v1.8h, v2.h[8]
+      mul v0.8h, v1.8h, v16.h[8]
+      mul v0.2s, v1.2s, v2.s[4]
+      mul v0.2s, v1.2s, v22.s[4]
+      mul v0.4s, v1.4s, v2.s[4]
+      mul v0.4s, v1.4s, v22.s[4]
+      mul v0.2d, v1.2d, v2.d[1]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mul v0.4h, v1.4h, v2.h[8]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mul v0.4h, v1.4h, v16.h[8]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mul v0.8h, v1.8h, v2.h[8]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mul v0.8h, v1.8h, v16.h[8]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mul v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mul v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mul v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        mul v0.4s, v1.4s, v22.s[4]
+// CHECK-ERROR:                                ^
+
+      fmul v0.4h, v1.4h, v2.h[4]
+      fmul v0.2s, v1.2s, v2.s[4]
+      fmul v0.2s, v1.2s, v22.s[4]
+      fmul v0.4s, v1.4s, v2.s[4]
+      fmul v0.4s, v1.4s, v22.s[4]
+      fmul v0.2d, v1.2d, v2.d[2]
+      fmul v0.2d, v1.2d, v22.d[2]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        mul v0.2d, v1.2d, v2.d[1]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmul v0.4h, v1.4h, v2.h[4]
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmul v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmul v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmul v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmul v0.4s, v1.4s, v22.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmul v0.2d, v1.2d, v2.d[2]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmul v0.2d, v1.2d, v22.d[2]
+// CHECK-ERROR:                                 ^
+
+      fmulx v0.4h, v1.4h, v2.h[4]
+      fmulx v0.2s, v1.2s, v2.s[4]
+      fmulx v0.2s, v1.2s, v22.s[4]
+      fmulx v0.4s, v1.4s, v2.s[4]
+      fmulx v0.4s, v1.4s, v22.s[4]
+      fmulx v0.2d, v1.2d, v2.d[2]
+      fmulx v0.2d, v1.2d, v22.d[2]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmulx v0.4h, v1.4h, v2.h[4]
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmulx v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmulx v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmulx v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmulx v0.4s, v1.4s, v22.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmulx v0.2d, v1.2d, v2.d[2]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        fmulx v0.2d, v1.2d, v22.d[2]
+// CHECK-ERROR:                                  ^
+
+      smull v0.4h, v1.4h, v2.h[2]
+      smull v0.4s, v1.4h, v2.h[8]
+      smull v0.4s, v1.4h, v16.h[4]
+      smull v0.2s, v1.2s, v2.s[2]
+      smull v0.2d, v1.2s, v2.s[4]
+      smull v0.2d, v1.2s, v22.s[4]
+      smull2 v0.4h, v1.8h, v2.h[2]
+      smull2 v0.4s, v1.8h, v2.h[8]
+      smull2 v0.4s, v1.8h, v16.h[4]
+      smull2 v0.2s, v1.4s, v2.s[2]
+      smull2 v0.2d, v1.4s, v2.s[4]
+      smull2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smull v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull v0.4s, v1.4h, v16.h[4]
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull v0.2s, v1.2s, v2.s[2]
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smull v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smull v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull2 v0.4h, v1.8h, v2.h[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smull2 v0.4s, v1.8h, v2.h[8]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull2 v0.4s, v1.8h, v16.h[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smull2 v0.2s, v1.4s, v2.s[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smull2 v0.2d, v1.4s, v2.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smull2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR:                                   ^
+
+      umull v0.4h, v1.4h, v2.h[2]
+      umull v0.4s, v1.4h, v2.h[8]
+      umull v0.4s, v1.4h, v16.h[4]
+      umull v0.2s, v1.2s, v2.s[2]
+      umull v0.2d, v1.2s, v2.s[4]
+      umull v0.2d, v1.2s, v22.s[4]
+      umull2 v0.4h, v1.8h, v2.h[2]
+      umull2 v0.4s, v1.8h, v2.h[8]
+      umull2 v0.4s, v1.8h, v16.h[4]
+      umull2 v0.2s, v1.4s, v2.s[2]
+      umull2 v0.2d, v1.4s, v2.s[4]
+      umull2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umull v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull v0.4s, v1.4h, v16.h[4]
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull v0.2s, v1.2s, v2.s[2]
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umull v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umull v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull2 v0.4h, v1.8h, v2.h[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umull2 v0.4s, v1.8h, v2.h[8]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull2 v0.4s, v1.8h, v16.h[4]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umull2 v0.2s, v1.4s, v2.s[2]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umull2 v0.2d, v1.4s, v2.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umull2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR:                                   ^
+
+      sqdmull v0.4h, v1.4h, v2.h[2]
+      sqdmull v0.4s, v1.4h, v2.h[8]
+      sqdmull v0.4s, v1.4h, v16.h[4]
+      sqdmull v0.2s, v1.2s, v2.s[2]
+      sqdmull v0.2d, v1.2s, v2.s[4]
+      sqdmull v0.2d, v1.2s, v22.s[4]
+      sqdmull2 v0.4h, v1.8h, v2.h[2]
+      sqdmull2 v0.4s, v1.8h, v2.h[8]
+      sqdmull2 v0.4s, v1.8h, v16.h[4]
+      sqdmull2 v0.2s, v1.4s, v2.s[2]
+      sqdmull2 v0.2d, v1.4s, v2.s[4]
+      sqdmull2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmull v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull v0.4s, v1.4h, v16.h[4]
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull v0.2s, v1.2s, v2.s[2]
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmull v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmull v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull2 v0.4h, v1.8h, v2.h[2]
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmull2 v0.4s, v1.8h, v2.h[8]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull2 v0.4s, v1.8h, v16.h[4]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull2 v0.2s, v1.4s, v2.s[2]
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmull2 v0.2d, v1.4s, v2.s[4]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmull2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR:                                     ^
+
+      sqdmulh v0.4h, v1.4h, v2.h[8]
+      sqdmulh v0.4h, v1.4h, v16.h[2]
+      sqdmulh v0.8h, v1.8h, v2.h[8]
+      sqdmulh v0.8h, v1.8h, v16.h[2]
+      sqdmulh v0.2s, v1.2s, v2.s[4]
+      sqdmulh v0.2s, v1.2s, v22.s[4]
+      sqdmulh v0.4s, v1.4s, v2.s[4]
+      sqdmulh v0.4s, v1.4s, v22.s[4]
+      sqdmulh v0.2d, v1.2d, v22.d[1]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmulh v0.4h, v1.4h, v2.h[8]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmulh v0.4h, v1.4h, v16.h[2]
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmulh v0.8h, v1.8h, v2.h[8]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmulh v0.8h, v1.8h, v16.h[2]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmulh v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmulh v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmulh v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqdmulh v0.4s, v1.4s, v22.s[4]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmulh v0.2d, v1.2d, v22.d[1]
+// CHECK-ERROR:                   ^
+
+      sqrdmulh v0.4h, v1.4h, v2.h[8]
+      sqrdmulh v0.4h, v1.4h, v16.h[2]
+      sqrdmulh v0.8h, v1.8h, v2.h[8]
+      sqrdmulh v0.8h, v1.8h, v16.h[2]
+      sqrdmulh v0.2s, v1.2s, v2.s[4]
+      sqrdmulh v0.2s, v1.2s, v22.s[4]
+      sqrdmulh v0.4s, v1.4s, v2.s[4]
+      sqrdmulh v0.4s, v1.4s, v22.s[4]
+      sqrdmulh v0.2d, v1.2d, v22.d[1]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqrdmulh v0.4h, v1.4h, v2.h[8]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqrdmulh v0.4h, v1.4h, v16.h[2]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqrdmulh v0.8h, v1.8h, v2.h[8]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqrdmulh v0.8h, v1.8h, v16.h[2]
+// CHECK-ERROR:                                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqrdmulh v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqrdmulh v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR:                                     ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqrdmulh v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR:                                    ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        sqrdmulh v0.4s, v1.4s, v22.s[4]
+// CHECK-ERROR:                                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqrdmulh v0.2d, v1.2d, v22.d[1]
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Across vectors
+//----------------------------------------------------------------------
+
+        saddlv b0, v1.8b
+        saddlv b0, v1.16b
+        saddlv h0, v1.4h
+        saddlv h0, v1.8h
+        saddlv s0, v1.2s
+        saddlv s0, v1.4s
+        saddlv d0, v1.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddlv b0, v1.8b
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddlv b0, v1.16b
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddlv h0, v1.4h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddlv h0, v1.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddlv s0, v1.2s
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddlv s0, v1.4s
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        saddlv d0, v1.2s
+// CHECK-ERROR:                   ^
+
+        uaddlv b0, v1.8b
+        uaddlv b0, v1.16b
+        uaddlv h0, v1.4h
+        uaddlv h0, v1.8h
+        uaddlv s0, v1.2s
+        uaddlv s0, v1.4s
+        uaddlv d0, v1.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddlv b0, v1.8b
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddlv b0, v1.16b
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddlv h0, v1.4h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddlv h0, v1.8h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddlv s0, v1.2s
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddlv s0, v1.4s
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uaddlv d0, v1.2s
+// CHECK-ERROR:                   ^
+
+        smaxv s0, v1.2s
+        sminv s0, v1.2s
+        umaxv s0, v1.2s
+        uminv s0, v1.2s
+        addv s0, v1.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smaxv s0, v1.2s
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sminv s0, v1.2s
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umaxv s0, v1.2s
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uminv s0, v1.2s
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        addv s0, v1.2s
+// CHECK-ERROR:                 ^
+
+        smaxv d0, v1.2d
+        sminv d0, v1.2d
+        umaxv d0, v1.2d
+        uminv d0, v1.2d
+        addv d0, v1.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smaxv d0, v1.2d
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sminv d0, v1.2d
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umaxv d0, v1.2d
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uminv d0, v1.2d
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        addv d0, v1.2d
+// CHECK-ERROR:             ^
+
+        fmaxnmv b0, v1.16b
+        fminnmv b0, v1.16b
+        fmaxv b0, v1.16b
+        fminv b0, v1.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxnmv b0, v1.16b
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminnmv b0, v1.16b
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxv b0, v1.16b
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminv b0, v1.16b
+// CHECK-ERROR:              ^
+
+        fmaxnmv h0, v1.8h
+        fminnmv h0, v1.8h
+        fmaxv h0, v1.8h
+        fminv h0, v1.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxnmv h0, v1.8h
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminnmv h0, v1.8h
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxv h0, v1.8h
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminv h0, v1.8h
+// CHECK-ERROR:              ^
+
+        fmaxnmv d0, v1.2d
+        fminnmv d0, v1.2d
+        fmaxv d0, v1.2d
+        fminv d0, v1.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxnmv d0, v1.2d
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminnmv d0, v1.2d
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxv d0, v1.2d
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminv d0, v1.2d
+// CHECK-ERROR:              ^
+
+//----------------------------------------------------------------------
+// Floating-point Multiply Extended
+//----------------------------------------------------------------------
+
+    fmulx s20, h22, s15
+    fmulx d23, d11, s1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmulx s20, h22, s15
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmulx d23, d11, s1
+// CHECK-ERROR:                          ^
+
+//----------------------------------------------------------------------
+// Floating-point Reciprocal Step
+//----------------------------------------------------------------------
+
+    frecps s21, s16, h13
+    frecps d22, s30, d21
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          frecps s21, s16, h13
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          frecps d22, s30, d21
+// CHECK-ERROR:                      ^
+
+//----------------------------------------------------------------------
+// Floating-point Reciprocal Square Root Step
+//----------------------------------------------------------------------
+
+    frsqrts s21, h5, s12
+    frsqrts d8, s22, d18
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          frsqrts s21, h5, s12
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          frsqrts d8, s22, d18
+// CHECK-ERROR:                      ^
+
+//----------------------------------------------------------------------
+// Vector load/store multiple N-element structure (class SIMD lselem)
+//----------------------------------------------------------------------
+         ld1 {x3}, [x2]
+         ld1 {v4}, [x0]
+         ld1 {v32.16b}, [x0]
+         ld1 {v15.8h}, [x32]
+// CHECK-ERROR: error: expected vector type register
+// CHECK-ERROR:        ld1 {x3}, [x2]
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: expected vector type register
+// CHECK-ERROR:        ld1 {v4}, [x0]
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: expected vector type register
+// CHECK-ERROR:        ld1 {v32.16b}, [x0]
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ld1 {v15.8h}, [x32]
+// CHECK-ERROR:                       ^
+
+         ld1 {v0.16b, v2.16b}, [x0]
+         ld1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0]
+         ld1 v0.8b, v1.8b}, [x0]
+         ld1 {v0.8h-v4.8h}, [x0]
+         ld1 {v1.8h-v1.8h}, [x0]
+         ld1 {v15.8h-v17.4h}, [x15]
+         ld1 {v0.8b-v2.8b, [x0]
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        ld1 {v0.16b, v2.16b}, [x0]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR:        ld1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0]
+// CHECK-ERROR:                                         ^
+// CHECK-ERROR: error: '{' expected
+// CHECK-ERROR:        ld1 v0.8b, v1.8b}, [x0]
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR:        ld1 {v0.8h-v4.8h}, [x0]
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR:        ld1 {v1.8h-v1.8h}, [x0]
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: expected the same vector layout
+// CHECK-ERROR:        ld1 {v15.8h-v17.4h}, [x15]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: '}' expected
+// CHECK-ERROR:        ld1 {v0.8b-v2.8b, [x0]
+// CHECK-ERROR:                        ^
+
+         ld2 {v15.8h, v16.4h}, [x15]
+         ld2 {v0.8b, v2.8b}, [x0]
+         ld2 {v15.4h, v16.4h, v17.4h}, [x32]
+         ld2 {v15.8h-v16.4h}, [x15]
+         ld2 {v0.2d-v2.2d}, [x0]
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        ld2 {v15.8h, v16.4h}, [x15]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        ld2 {v0.8b, v2.8b}, [x0]
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ld2 {v15.4h, v16.4h, v17.4h}, [x32]
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: expected the same vector layout
+// CHECK-ERROR:        ld2 {v15.8h-v16.4h}, [x15]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ld2 {v0.2d-v2.2d}, [x0]
+// CHECK-ERROR:            ^
+
+         ld3 {v15.8h, v16.8h, v17.4h}, [x15]
+         ld3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0]
+         ld3 {v0.8b, v2.8b, v3.8b}, [x0]
+         ld3 {v15.8h-v17.4h}, [x15]
+         ld3 {v31.4s-v2.4s}, [sp]
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        ld3 {v15.8h, v16.8h, v17.4h}, [x15]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected vector type register
+// CHECK-ERROR:        ld3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0]
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        ld3 {v0.8b, v2.8b, v3.8b}, [x0]
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: expected the same vector layout
+// CHECK-ERROR:        ld3 {v15.8h-v17.4h}, [x15]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ld3 {v31.4s-v2.4s}, [sp]
+// CHECK-ERROR:            ^
+
+         ld4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15]
+         ld4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0]
+         ld4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31]
+         ld4 {v15.8h-v18.4h}, [x15]
+         ld4 {v31.2s-v1.2s}, [x31]
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        ld4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        ld4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0]
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR:        ld4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31]
+// CHECK-ERROR:                                             ^
+// CHECK-ERROR: error: expected the same vector layout
+// CHECK-ERROR:        ld4 {v15.8h-v18.4h}, [x15]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ld4 {v31.2s-v1.2s}, [x31]
+// CHECK-ERROR:            ^
+
+         st1 {x3}, [x2]
+         st1 {v4}, [x0]
+         st1 {v32.16b}, [x0]
+         st1 {v15.8h}, [x32]
+// CHECK-ERROR: error: expected vector type register
+// CHECK-ERROR:        st1 {x3}, [x2]
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: expected vector type register
+// CHECK-ERROR:        st1 {v4}, [x0]
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: expected vector type register
+// CHECK-ERROR:        st1 {v32.16b}, [x0]
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        st1 {v15.8h}, [x32]
+// CHECK-ERROR:                       ^
+
+         st1 {v0.16b, v2.16b}, [x0]
+         st1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0]
+         st1 v0.8b, v1.8b}, [x0]
+         st1 {v0.8h-v4.8h}, [x0]
+         st1 {v1.8h-v1.8h}, [x0]
+         st1 {v15.8h-v17.4h}, [x15]
+         st1 {v0.8b-v2.8b, [x0]
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        st1 {v0.16b, v2.16b}, [x0]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR:        st1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0]
+// CHECK-ERROR:                                         ^
+// CHECK-ERROR: error: '{' expected
+// CHECK-ERROR:        st1 v0.8b, v1.8b}, [x0]
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR:        st1 {v0.8h-v4.8h}, [x0]
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR:        st1 {v1.8h-v1.8h}, [x0]
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: expected the same vector layout
+// CHECK-ERROR:        st1 {v15.8h-v17.4h}, [x15]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: '}' expected
+// CHECK-ERROR:        st1 {v0.8b-v2.8b, [x0]
+// CHECK-ERROR:                        ^
+
+         st2 {v15.8h, v16.4h}, [x15]
+         st2 {v0.8b, v2.8b}, [x0]
+         st2 {v15.4h, v16.4h, v17.4h}, [x30]
+         st2 {v15.8h-v16.4h}, [x15]
+         st2 {v0.2d-v2.2d}, [x0]
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        st2 {v15.8h, v16.4h}, [x15]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        st2 {v0.8b, v2.8b}, [x0]
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        st2 {v15.4h, v16.4h, v17.4h}, [x30]
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: expected the same vector layout
+// CHECK-ERROR:        st2 {v15.8h-v16.4h}, [x15]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        st2 {v0.2d-v2.2d}, [x0]
+// CHECK-ERROR:            ^
+
+         st3 {v15.8h, v16.8h, v17.4h}, [x15]
+         st3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0]
+         st3 {v0.8b, v2.8b, v3.8b}, [x0]
+         st3 {v15.8h-v17.4h}, [x15]
+         st3 {v31.4s-v2.4s}, [sp]
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        st3 {v15.8h, v16.8h, v17.4h}, [x15]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: expected vector type register
+// CHECK-ERROR:        st3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0]
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        st3 {v0.8b, v2.8b, v3.8b}, [x0]
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: expected the same vector layout
+// CHECK-ERROR:        st3 {v15.8h-v17.4h}, [x15]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        st3 {v31.4s-v2.4s}, [sp]
+// CHECK-ERROR:            ^
+
+         st4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15]
+         st4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0]
+         st4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31]
+         st4 {v15.8h-v18.4h}, [x15]
+         st4 {v31.2s-v1.2s}, [x31]
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        st4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR:        st4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0]
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR:        st4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31]
+// CHECK-ERROR:                                             ^
+// CHECK-ERROR: error: expected the same vector layout
+// CHECK-ERROR:        st4 {v15.8h-v18.4h}, [x15]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        st4 {v31.2s-v1.2s}, [x31]
+// CHECK-ERROR:            ^
+
+//----------------------------------------------------------------------
+// Vector post-index load/store multiple N-element structure
+// (class SIMD lselem-post)
+//----------------------------------------------------------------------
+         ld1 {v0.16b}, [x0], #8
+         ld1 {v0.8h, v1.16h}, [x0], x1
+         ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #24
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          ld1 {v0.16b}, [x0], #8
+// CHECK-ERROR:                              ^
+// CHECK-ERROR:  error: expected vector type register
+// CHECK-ERROR:          ld1 {v0.8h, v1.16h}, [x0], x1
+// CHECK-ERROR:                      ^
+// CHECK-ERROR:  error: invalid operand for instruction
+// CHECK-ERROR:          ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #24
+// CHECK-ERROR:                                                  ^
+
+         ld2 {v0.16b, v1.16b}, [x0], #16
+         ld3 {v5.2s, v6.2s, v7.2s}, [x1], #48
+         ld4 {v31.2d, v0.2d, v1.2d, v2.1d}, [x3], x1
+// CHECK-ERROR:  error: invalid operand for instruction
+// CHECK-ERROR:          ld2 {v0.16b, v1.16b}, [x0], #16
+// CHECK-ERROR:                                      ^
+// CHECK-ERROR:  error: invalid operand for instruction
+// CHECK-ERROR:          ld3 {v5.2s, v6.2s, v7.2s}, [x1], #48
+// CHECK-ERROR:                                           ^
+// CHECK-ERROR:  error: invalid space between two vectors
+// CHECK-ERROR:          ld4 {v31.2d, v0.2d, v1.2d, v2.1d}, [x3], x1
+// CHECK-ERROR:                                     ^
+
+         st1 {v0.16b}, [x0], #8
+         st1 {v0.8h, v1.16h}, [x0], x1
+         st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #24
+// CHECK-ERROR:  error: invalid operand for instruction
+// CHECK-ERROR:          st1 {v0.16b}, [x0], #8
+// CHECK-ERROR:                              ^
+// CHECK-ERROR:  error: expected vector type register
+// CHECK-ERROR:          st1 {v0.8h, v1.16h}, [x0], x1
+// CHECK-ERROR:                      ^
+// CHECK-ERROR:  error: invalid operand for instruction
+// CHECK-ERROR:          st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #24
+                                                 ^
+
+         st2 {v0.16b, v1.16b}, [x0], #16
+         st3 {v5.2s, v6.2s, v7.2s}, [x1], #48
+         st4 {v31.2d, v0.2d, v1.2d, v2.1d}, [x3], x1
+// CHECK-ERROR:  error: invalid operand for instruction
+// CHECK-ERROR:          st2 {v0.16b, v1.16b}, [x0], #16
+// CHECK-ERROR:                                      ^
+// CHECK-ERROR:  error: invalid operand for instruction
+// CHECK-ERROR:          st3 {v5.2s, v6.2s, v7.2s}, [x1], #48
+// CHECK-ERROR:                                           ^
+// CHECK-ERROR:  error: invalid space between two vectors
+// CHECK-ERROR:          st4 {v31.2d, v0.2d, v1.2d, v2.1d}, [x3], x1
+// CHECK-ERROR:                                     ^
+
+//------------------------------------------------------------------------------
+// Load single N-element structure to all lanes of N consecutive
+// registers (N = 1,2,3,4)
+//------------------------------------------------------------------------------
+         ld1r {x1}, [x0]
+         ld2r {v31.4s, v0.2s}, [sp]
+         ld3r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+         ld4r {v31.2s, v0.2s, v1.2d, v2.2s}, [sp]
+// CHECK-ERROR: error: expected vector type register
+// CHECK-ERROR: ld1r {x1}, [x0]
+// CHECK-ERROR:       ^
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR: ld2r {v31.4s, v0.2s}, [sp]
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld3r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+// CHECK-ERROR:      ^
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR: ld4r {v31.2s, v0.2s, v1.2d, v2.2s}, [sp]
+// CHECK-ERROR:                      ^
+
+//------------------------------------------------------------------------------
+// Load/Store single N-element structure to/from one lane of N consecutive
+// registers (N = 1, 2,3,4)
+//------------------------------------------------------------------------------
+         ld1 {v0.b}[16], [x0]
+         ld2 {v15.h, v16.h}[8], [x15]
+         ld3 {v31.s, v0.s, v1.s}[-1], [sp]
+         ld4 {v0.d, v1.d, v2.d, v3.d}[2], [x0]
+// CHECK-ERROR:: error: lane number incompatible with layout
+// CHECK-ERROR: ld1 {v0.b}[16], [x0]
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: ld2 {v15.h, v16.h}[8], [x15]
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: expected lane number
+// CHECK-ERROR: ld3 {v31.s, v0.s, v1.s}[-1], [sp]
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: ld4 {v0.d, v1.d, v2.d, v3.d}[2], [x0]
+// CHECK-ERROR:                              ^
+
+         st1 {v0.d}[16], [x0]
+         st2 {v31.s, v0.s}[3], [8]
+         st3 {v15.h, v16.h, v17.h}[-1], [x15]
+         st4 {v0.d, v1.d, v2.d, v3.d}[2], [x0]
+// CHECK-ERROR:: error: lane number incompatible with layout
+// CHECK-ERROR: st1 {v0.d}[16], [x0]
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: st2 {v31.s, v0.s}[3], [8]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected lane number
+// CHECK-ERROR: st3 {v15.h, v16.h, v17.h}[-1], [x15]
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: lane number incompatible with layout
+// CHECK-ERROR: st4 {v0.d, v1.d, v2.d, v3.d}[2], [x0]
+// CHECK-ERROR:                              ^
+
+//------------------------------------------------------------------------------
+// Post-index of load single N-element structure to all lanes of N consecutive
+// registers (N = 1,2,3,4)
+//------------------------------------------------------------------------------
+         ld1r {v15.8h}, [x15], #5
+         ld2r {v0.2d, v1.2d}, [x0], #7
+         ld3r {v15.4h, v16.4h, v17.4h}, [x15], #1
+         ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], sp
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld1r {v15.8h}, [x15], #5
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld2r {v0.2d, v1.2d}, [x0], #7
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #1
+// CHECK-ERROR:                                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], sp
+// CHECK-ERROR:                                           ^
+
+//------------------------------------------------------------------------------
+// Post-index of Load/Store single N-element structure to/from one lane of N
+// consecutive registers (N = 1, 2,3,4)
+//------------------------------------------------------------------------------
+         ld1 {v0.b}[0], [x0], #2
+         ld2 {v15.h, v16.h}[0], [x15], #3
+         ld3 {v31.s, v0.s, v1.d}[0], [sp], x9
+         ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #24
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld1 {v0.b}[0], [x0], #2
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld2 {v15.h, v16.h}[0], [x15], #3
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: expected the same vector layout
+// CHECK-ERROR: ld3 {v31.s, v0.s, v1.d}[0], [sp], x9
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #24
+// CHECK-ERROR:                                        ^
+
+         st1 {v0.d}[0], [x0], #7
+         st2 {v31.s, v0.s}[0], [sp], #6
+         st3 {v15.h, v16.h, v17.h}[0], [x15], #8
+         st4 {v0.b, v1.b, v2.b, v3.b}[1], [x0], #1
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: st1 {v0.d}[0], [x0], #7
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: st2 {v31.s, v0.s}[0], [sp], #6
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: st3 {v15.h, v16.h, v17.h}[0], [x15], #8
+// CHECK-ERROR:                                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: st4 {v0.b, v1.b, v2.b, v3.b}[1], [x0], #1
+// CHECK-ERROR:                                        ^
+
+
+         ins v2.b[16], w1
+         ins v7.h[8], w14
+         ins v20.s[5], w30
+         ins v1.d[2], x7
+         ins v2.b[3], b1
+         ins v7.h[2], h14
+         ins v20.s[1], s30
+         ins v1.d[0], d7
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v2.b[16], w1
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v7.h[8], w14
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v20.s[5], w30
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v1.d[2], x7
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v2.b[3], b1
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v7.h[2], h14
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v20.s[1], s30
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v1.d[0], d7
+// CHECK-ERROR:                      ^
+
+         smov w1, v0.b[16]
+         smov w14, v6.h[8]
+         smov x1, v0.b[16]
+         smov x14, v6.h[8]
+         smov x20, v9.s[5]
+         smov w1, v0.d[0]
+         smov w14, v6.d[1]
+         smov x1, v0.d[0]
+         smov x14, v6.d[1]
+         smov x20, v9.d[0]
+
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         smov w1, v0.b[16]
+// CHECK-ERROR                       ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         smov w14, v6.h[8]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         smov x1, v0.b[16]
+// CHECK-ERROR                       ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         smov x14, v6.h[8]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         smov x20, v9.s[5]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         smov w1, v0.d[0]
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         smov w14, v6.d[1]
+// CHECK-ERROR                      ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         smov x1, v0.d[0]
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         smov x14, v6.d[1]
+// CHECK-ERROR                      ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         smov x20, v9.d[0]
+// CHECK-ERROR                      ^
+
+         umov w1, v0.b[16]
+         umov w14, v6.h[8]
+         umov w20, v9.s[5]
+         umov x7, v18.d[3]
+         umov w1, v0.d[0]
+         umov s20, v9.s[2]
+         umov d7, v18.d[1]
+
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         umov w1, v0.b[16]
+// CHECK-ERROR                       ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         umov w14, v6.h[8]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         umov w20, v9.s[5]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: lane number incompatible with layout
+// CHECK-ERROR         umov x7, v18.d[3]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         umov w1, v0.d[0]
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         umov s20, v9.s[2]
+// CHECK-ERROR              ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         umov d7, v18.d[1]
+// CHECK-ERROR              ^
+
+         Ins v1.h[2], v3.b[6]
+         Ins v6.h[7], v7.s[2]
+         Ins v15.d[0], v22.s[2]
+         Ins v0.d[0], v4.b[1]
+
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         Ins v1.h[2], v3.b[6]
+// CHECK-ERROR                         ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         Ins v6.h[7], v7.s[2]
+// CHECK-ERROR                         ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         Ins v15.d[0], v22.s[2]
+// CHECK-ERROR                           ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         Ins v0.d[0], v4.b[1]
+// CHECK-ERROR                         ^
+
+         dup v1.8h, v2.b[2]
+         dup v11.4s, v7.h[7]
+         dup v17.2d, v20.s[0]
+         dup v1.16b, v2.h[2]
+         dup v11.8h, v7.s[3]
+         dup v17.4s, v20.d[0]
+         dup v5.2d, v1.b[1]
+
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v1.8h, v2.b[2]
+// CHECK-ERROR                       ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v11.4s, v7.h[7]
+// CHECK-ERROR                        ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v17.2d, v20.s[0]
+// CHECK-ERROR                         ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v1.16b, v2.h[2]
+// CHECK-ERROR                        ^
+// CHECK-ERROR invalid operand for instruction
+// CHECK-ERROR         dup v11.8h, v7.s[3]
+// CHECK-ERROR                        ^
+// CHECK-ERROR invalid operand for instruction
+// CHECK-ERROR         dup v17.4s, v20.d[0]
+// CHECK-ERROR                         ^
+// CHECK-ERROR invalid operand for instruction
+// CHECK-ERROR         dup v5.2d, v1.b[1]
+// CHECK-ERROR                       ^
+
+         dup v1.8b, b1
+         dup v11.4h, h14
+         dup v17.2s, s30
+         dup v1.16b, d2
+         dup v11.8s, w16
+         dup v17.4d, w28
+         dup v5.2d, w0
+
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v1.8b, b1
+// CHECK-ERROR                    ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v11.4h, h14
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v17.2s, s30
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v1.16b, d2
+// CHECK-ERROR                     ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v11.8s, w16
+// CHECK-ERROR             ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v17.4d, w28
+// CHECK-ERROR             ^
+// CHECK-ERROR error: invalid operand for instruction
+// CHECK-ERROR         dup v5.2d, w0
+// CHECK-ERROR                    ^
+
+//----------------------------------------------------------------------
+// Scalar Compare Bitwise Equal
+//----------------------------------------------------------------------
+
+         cmeq b20, d21, d22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmeq b20, d21, d22
+// CHECK-ERROR:               ^
+
+//----------------------------------------------------------------------
+// Scalar Compare Bitwise Equal To Zero
+//----------------------------------------------------------------------
+
+         cmeq d20, b21, #0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmeq d20, b21, #0
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Compare Unsigned Higher Or Same
+//----------------------------------------------------------------------
+
+         cmhs b20, d21, d22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmhs b20, d21, d22
+// CHECK-ERROR:               ^
+
+        
+//----------------------------------------------------------------------
+// Scalar Compare Signed Greather Than Or Equal
+//----------------------------------------------------------------------
+
+         cmge b20, d21, d22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmge b20, d21, d22
+// CHECK-ERROR:               ^
+
+//----------------------------------------------------------------------
+// Scalar Compare Signed Greather Than Or Equal To Zero
+//----------------------------------------------------------------------
+
+         cmge d20, b21, #0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmge d20, b21, #0
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Compare Unsigned Higher
+//----------------------------------------------------------------------
+
+         cmhi b20, d21, d22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmhi b20, d21, d22
+// CHECK-ERROR:               ^
+
+//----------------------------------------------------------------------
+// Scalar Compare Signed Greater Than
+//----------------------------------------------------------------------
+
+         cmgt b20, d21, d22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmgt b20, d21, d22
+// CHECK-ERROR:               ^
+
+//----------------------------------------------------------------------
+// Scalar Compare Signed Greater Than Zero
+//----------------------------------------------------------------------
+
+         cmgt d20, b21, #0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmgt d20, b21, #0
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Compare Signed Less Than Or Equal To Zero
+//----------------------------------------------------------------------
+
+         cmle d20, b21, #0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmle d20, b21, #0
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Compare Less Than Zero
+//----------------------------------------------------------------------
+
+         cmlt d20, b21, #0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmlt d20, b21, #0
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Compare Bitwise Test Bits
+//----------------------------------------------------------------------
+
+         cmtst b20, d21, d22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          cmtst b20, d21, d22
+// CHECK-ERROR:                ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Equal
+//----------------------------------------------------------------------
+
+         fcmeq s10, h11, s12
+         fcmeq d20, s21, d22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmeq s10, h11, s12
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmeq d20, s21, d22
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Equal To Zero
+//----------------------------------------------------------------------
+
+         fcmeq h10, s11, #0.0
+         fcmeq d20, s21, #0.0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmeq h10, s11, #0.0
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmeq d20, s21, #0.0
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Greater Than Or Equal
+//----------------------------------------------------------------------
+
+         fcmge s10, h11, s12
+         fcmge d20, s21, d22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmge s10, h11, s12
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmge d20, s21, d22
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
+//----------------------------------------------------------------------
+
+         fcmge h10, s11, #0.0
+         fcmge d20, s21, #0.0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmge h10, s11, #0.0
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmge d20, s21, #0.0
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Greather Than
+//----------------------------------------------------------------------
+
+         fcmgt s10, h11, s12
+         fcmgt d20, s21, d22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmgt s10, h11, s12
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmgt d20, s21, d22
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Greather Than Zero
+//----------------------------------------------------------------------
+
+         fcmgt h10, s11, #0.0
+         fcmgt d20, s21, #0.0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmgt h10, s11, #0.0
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmgt d20, s21, #0.0
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
+//----------------------------------------------------------------------
+
+         fcmle h10, s11, #0.0
+         fcmle d20, s21, #0.0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmle h10, s11, #0.0
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmle d20, s21, #0.0
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Less Than
+//----------------------------------------------------------------------
+
+         fcmlt h10, s11, #0.0
+         fcmlt d20, s21, #0.0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmlt h10, s11, #0.0
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fcmlt d20, s21, #0.0
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
+//----------------------------------------------------------------------
+
+         facge s10, h11, s12
+         facge d20, s21, d22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          facge s10, h11, s12
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          facge d20, s21, d22
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Absolute Compare Mask Greater Than
+//----------------------------------------------------------------------
+
+         facgt s10, h11, s12
+         facgt d20, d21, s22
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          facgt s10, h11, s12
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          facgt d20, d21, s22
+// CHECK-ERROR:                          ^
+        
+//----------------------------------------------------------------------
+// Scalar Signed Saturating Accumulated of Unsigned Value
+//----------------------------------------------------------------------
+
+        suqadd b0, h1
+        suqadd h0, s1
+        suqadd s0, d1
+        suqadd d0, b0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        suqadd b0, h1
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        suqadd h0, s1
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        suqadd s0, d1
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        suqadd d0, b0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Saturating Accumulated of Signed Value
+//----------------------------------------------------------------------
+
+        usqadd b0, h1
+        usqadd h0, s1
+        usqadd s0, d1
+        usqadd d0, b1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usqadd b0, h1
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usqadd h0, s1
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usqadd s0, d1
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        usqadd d0, b1
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Absolute Value
+//----------------------------------------------------------------------
+
+    abs d29, s24
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        abs d29, s24
+// CHECK-ERROR:                 ^
+
+//----------------------------------------------------------------------
+// Scalar Negate
+//----------------------------------------------------------------------
+
+    neg d29, s24
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        neg d29, s24
+// CHECK-ERROR:                 ^
+
+//----------------------------------------------------------------------
+// Signed Saturating Doubling Multiply-Add Long
+//----------------------------------------------------------------------
+
+    sqdmlal s17, h27, s12
+    sqdmlal d19, s24, d12
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlal s17, h27, s12
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: too few operands for instruction
+// CHECK-ERROR:        sqdmlal d19, s24, d12
+// CHECK-ERROR:                          ^
+
+//----------------------------------------------------------------------
+// Signed Saturating Doubling Multiply-Subtract Long
+//----------------------------------------------------------------------
+
+    sqdmlsl s14, h12, s25
+    sqdmlsl d12, s23, d13
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmlsl s14, h12, s25
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: too few operands for instruction
+// CHECK-ERROR:        sqdmlsl d12, s23, d13
+// CHECK-ERROR:                          ^
+
+//----------------------------------------------------------------------
+// Signed Saturating Doubling Multiply Long
+//----------------------------------------------------------------------
+
+    sqdmull s12, h22, s12
+    sqdmull d15, s22, d12
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqdmull s12, h22, s12
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: too few operands for instruction
+// CHECK-ERROR:        sqdmull d15, s22, d12
+// CHECK-ERROR:                          ^
+
+//----------------------------------------------------------------------
+// Scalar Signed Saturating Extract Unsigned Narrow
+//----------------------------------------------------------------------
+
+    sqxtun b19, b14
+    sqxtun h21, h15
+    sqxtun s20, s12
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqxtun b19, b14
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqxtun h21, h15
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqxtun s20, s12
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Signed Saturating Extract Signed Narrow
+//----------------------------------------------------------------------
+
+    sqxtn b18, b18
+    sqxtn h20, h17
+    sqxtn s19, s14
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqxtn b18, b18
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqxtn h20, h17
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqxtn s19, s14
+// CHECK-ERROR:                   ^
+
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Saturating Extract Narrow
+//----------------------------------------------------------------------
+
+    uqxtn b18, b18
+    uqxtn h20, h17
+    uqxtn s19, s14
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqxtn b18, b18
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqxtn h20, h17
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqxtn s19, s14
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Signed Shift Right (Immediate)
+//----------------------------------------------------------------------
+        sshr d15, d16, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        sshr d15, d16, #99
+// CHECK-ERROR:                       ^
+
+        sshr d15, s16, #31
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sshr d15, s16, #31
+// CHECK-ERROR:                  ^
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Shift Right (Immediate)
+//----------------------------------------------------------------------
+
+        ushr d10, d17, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        ushr d10, d17, #99
+// CHECK-ERROR:                       ^
+
+//----------------------------------------------------------------------
+// Scalar Signed Rounding Shift Right (Immediate)
+//----------------------------------------------------------------------
+
+        srshr d19, d18, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        srshr d19, d18, #99
+// CHECK-ERROR:                        ^
+
+//----------------------------------------------------------------------
+// Scalar Unigned Rounding Shift Right (Immediate)
+//----------------------------------------------------------------------
+
+        urshr d20, d23, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        urshr d20, d23, #99
+// CHECK-ERROR:                        ^
+
+//----------------------------------------------------------------------
+// Scalar Signed Shift Right and Accumulate (Immediate)
+//----------------------------------------------------------------------
+
+        ssra d18, d12, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        ssra d18, d12, #99
+// CHECK-ERROR:                       ^
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Shift Right and Accumulate (Immediate)
+//----------------------------------------------------------------------
+
+        usra d20, d13, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        usra d20, d13, #99
+// CHECK-ERROR:                       ^
+
+//----------------------------------------------------------------------
+// Scalar Signed Rounding Shift Right and Accumulate (Immediate)
+//----------------------------------------------------------------------
+
+        srsra d15, d11, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        srsra d15, d11, #99
+// CHECK-ERROR:                        ^
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
+//----------------------------------------------------------------------
+
+        ursra d18, d10, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        ursra d18, d10, #99
+// CHECK-ERROR:                        ^
+
+//----------------------------------------------------------------------
+// Scalar Shift Left (Immediate)
+//----------------------------------------------------------------------
+
+        shl d7, d10, #99
+
+// CHECK-ERROR: error: expected integer in range [0, 63]
+// CHECK-ERROR:        shl d7, d10, #99
+// CHECK-ERROR:                     ^
+
+        shl d7, s16, #31
+        
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        shl d7, s16, #31
+// CHECK-ERROR:                ^
+
+//----------------------------------------------------------------------
+// Signed Saturating Shift Left (Immediate)
+//----------------------------------------------------------------------
+
+        sqshl b11, b19, #99
+        sqshl h13, h18, #99
+        sqshl s14, s17, #99
+        sqshl d15, d16, #99
+
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:        sqshl b11, b19, #99
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [0, 15]
+// CHECK-ERROR:        sqshl h13, h18, #99
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR:        sqshl s14, s17, #99
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [0, 63]
+// CHECK-ERROR:        sqshl d15, d16, #99
+// CHECK-ERROR:                        ^
+
+//----------------------------------------------------------------------
+// Unsigned Saturating Shift Left (Immediate)
+//----------------------------------------------------------------------
+
+        uqshl b18, b15, #99
+        uqshl h11, h18, #99
+        uqshl s14, s19, #99
+        uqshl d15, d12, #99
+
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:        uqshl b18, b15, #99
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [0, 15]
+// CHECK-ERROR:        uqshl h11, h18, #99
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR:        uqshl s14, s19, #99
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [0, 63]
+// CHECK-ERROR:        uqshl d15, d12, #99
+// CHECK-ERROR:                        ^
+
+//----------------------------------------------------------------------
+// Signed Saturating Shift Left Unsigned (Immediate)
+//----------------------------------------------------------------------
+
+        sqshlu b15, b18, #99
+        sqshlu h19, h17, #99
+        sqshlu s16, s14, #99
+        sqshlu d11, d13, #99
+
+// CHECK-ERROR: error: expected integer in range [0, 7]
+// CHECK-ERROR:        sqshlu  b15, b18, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [0, 15]
+// CHECK-ERROR:        sqshlu  h19, h17, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [0, 31]
+// CHECK-ERROR:        sqshlu  s16, s14, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [0, 63]
+// CHECK-ERROR:        sqshlu  d11, d13, #99
+// CHECK-ERROR:                          ^
+
+//----------------------------------------------------------------------
+// Shift Right And Insert (Immediate)
+//----------------------------------------------------------------------
+
+        sri d10, d12, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        sri d10, d12, #99
+// CHECK-ERROR:                      ^
+
+//----------------------------------------------------------------------
+// Shift Left And Insert (Immediate)
+//----------------------------------------------------------------------
+
+        sli d10, d14, #99
+
+// CHECK-ERROR: error: expected integer in range [0, 63]
+// CHECK-ERROR:        sli d10, d14, #99
+// CHECK-ERROR:                      ^
+
+//----------------------------------------------------------------------
+// Signed Saturating Shift Right Narrow (Immediate)
+//----------------------------------------------------------------------
+
+        sqshrn b10, h15, #99
+        sqshrn h17, s10, #99
+        sqshrn s18, d10, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:        sqshrn  b10, h15, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:        sqshrn  h17, s10, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        sqshrn  s18, d10, #99
+// CHECK-ERROR:                          ^
+        
+//----------------------------------------------------------------------
+// Unsigned Saturating Shift Right Narrow (Immediate)
+//----------------------------------------------------------------------
+
+        uqshrn b12, h10, #99
+        uqshrn h10, s14, #99
+        uqshrn s10, d12, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:        uqshrn  b12, h10, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:        uqshrn  h10, s14, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        uqshrn  s10, d12, #99
+// CHECK-ERROR:                          ^
+        
+//----------------------------------------------------------------------
+// Signed Saturating Rounded Shift Right Narrow (Immediate)
+//----------------------------------------------------------------------
+
+        sqrshrn b10, h13, #99
+        sqrshrn h15, s10, #99
+        sqrshrn s15, d12, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:        sqrshrn b10, h13, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:        sqrshrn h15, s10, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        sqrshrn s15, d12, #99
+// CHECK-ERROR:                          ^
+        
+//----------------------------------------------------------------------
+// Unsigned Saturating Rounded Shift Right Narrow (Immediate)
+//----------------------------------------------------------------------
+
+        uqrshrn b10, h12, #99
+        uqrshrn h12, s10, #99
+        uqrshrn s10, d10, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:        uqrshrn b10, h12, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:        uqrshrn h12, s10, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        uqrshrn s10, d10, #99
+// CHECK-ERROR:                          ^
+
+//----------------------------------------------------------------------
+// Signed Saturating Shift Right Unsigned Narrow (Immediate)
+//----------------------------------------------------------------------
+
+        sqshrun b15, h10, #99
+        sqshrun h20, s14, #99
+        sqshrun s10, d15, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:        sqshrun b15, h10, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:        sqshrun h20, s14, #99
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        sqshrun s10, d15, #99
+// CHECK-ERROR:                          ^
+
+//----------------------------------------------------------------------
+// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate)
+//----------------------------------------------------------------------
+
+        sqrshrun b17, h10, #99
+        sqrshrun h10, s13, #99
+        sqrshrun s22, d16, #99
+
+// CHECK-ERROR: error: expected integer in range [1, 8]
+// CHECK-ERROR:        sqrshrun b17, h10, #99
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: expected integer in range [1, 16]
+// CHECK-ERROR:        sqrshrun h10, s13, #99
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        sqrshrun s22, d16, #99
+// CHECK-ERROR:                           ^
+
+//----------------------------------------------------------------------
+// Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
+//----------------------------------------------------------------------
+
+    scvtf s22, s13, #0
+    scvtf s22, s13, #33
+    scvtf d21, d12, #65
+    scvtf d21, s12, #31
+        
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        scvtf s22, s13, #0
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        scvtf s22, s13, #33
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        scvtf d21, d12, #65
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        scvtf d21, s12, #31
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
+//----------------------------------------------------------------------
+
+    ucvtf s22, s13, #34
+    ucvtf d21, d14, #65
+    ucvtf d21, s14, #64
+        
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        ucvtf s22, s13, #34
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        ucvtf d21, d14, #65
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ucvtf d21, s14, #64
+// CHECK-ERROR:                   ^
+
+//------------------------------------------------------------------------------
+// Element reverse
+//------------------------------------------------------------------------------
+         rev64 v6.2d, v8.2d
+         rev32 v30.2s, v31.2s
+         rev32 v30.4s, v31.4s
+         rev32 v30.2d, v31.2d
+         rev16 v21.4h, v1.4h
+         rev16 v21.8h, v1.8h
+         rev16 v21.2s, v1.2s
+         rev16 v21.4s, v1.4s
+         rev16 v21.2d, v1.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rev64 v6.2d, v8.2d
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rev32 v30.2s, v31.2s
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rev32 v30.4s, v31.4s
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rev32 v30.2d, v31.2d
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rev16 v21.4h, v1.4h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rev16 v21.8h, v1.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rev16 v21.2s, v1.2s
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rev16 v21.4s, v1.4s
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rev16 v21.2d, v1.2d
+// CHECK-ERROR:                   ^
+
+//------------------------------------------------------------------------------
+// Signed integer pairwise add long
+//------------------------------------------------------------------------------
+
+         saddlp v3.8h, v21.8h
+         saddlp v8.8b, v5.8b
+         saddlp v9.8h, v1.4s
+         saddlp v0.4s, v1.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         saddlp v3.8h, v21.8h
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         saddlp v8.8b, v5.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         saddlp v9.8h, v1.4s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         saddlp v0.4s, v1.2d
+// CHECK-ERROR:                          ^
+
+//------------------------------------------------------------------------------
+// Unsigned integer pairwise add long
+//------------------------------------------------------------------------------
+
+         uaddlp v3.8h, v21.8h
+         uaddlp v8.8b, v5.8b
+         uaddlp v9.8h, v1.4s
+         uaddlp v0.4s, v1.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uaddlp v3.8h, v21.8h
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uaddlp v8.8b, v5.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uaddlp v9.8h, v1.4s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uaddlp v0.4s, v1.2d
+// CHECK-ERROR:                          ^
+
+//------------------------------------------------------------------------------
+// Signed integer pairwise add and accumulate long
+//------------------------------------------------------------------------------
+
+         sadalp v3.16b, v21.16b
+         sadalp v8.4h, v5.4h
+         sadalp v9.4s, v1.4s
+         sadalp v0.4h, v1.2s
+         sadalp v12.2d, v4.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sadalp v3.16b, v21.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sadalp v8.4h, v5.4h
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sadalp v9.4s, v1.4s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sadalp v0.4h, v1.2s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sadalp v12.2d, v4.8h
+// CHECK-ERROR:                           ^
+
+//------------------------------------------------------------------------------
+// Unsigned integer pairwise add and accumulate long
+//------------------------------------------------------------------------------
+
+         uadalp v3.16b, v21.16b
+         uadalp v8.4h, v5.4h
+         uadalp v9.4s, v1.4s
+         uadalp v0.4h, v1.2s
+         uadalp v12.2d, v4.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uadalp v3.16b, v21.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uadalp v8.4h, v5.4h
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uadalp v9.4s, v1.4s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uadalp v0.4h, v1.2s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uadalp v12.2d, v4.8h
+// CHECK-ERROR:                           ^
+
+//------------------------------------------------------------------------------
+// Signed integer saturating accumulate of unsigned value
+//------------------------------------------------------------------------------
+
+         suqadd v0.16b, v31.8b
+         suqadd v1.8b, v9.8h
+         suqadd v13.4h, v21.4s
+         suqadd v4.2s, v0.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         suqadd v0.16b, v31.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         suqadd v1.8b, v9.8h
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         suqadd v13.4h, v21.4s
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         suqadd v4.2s, v0.2d
+// CHECK-ERROR:                       ^
+
+//------------------------------------------------------------------------------
+// Unsigned integer saturating accumulate of signed value
+//------------------------------------------------------------------------------
+
+         usqadd v0.16b, v31.8b
+         usqadd v2.8h, v4.4h
+         usqadd v13.4h, v21.4s
+         usqadd v4.2s, v0.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         usqadd v0.16b, v31.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         usqadd v2.8h, v4.4h
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         usqadd v13.4h, v21.4s
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         usqadd v4.2s, v0.2d
+// CHECK-ERROR:                       ^
+
+//------------------------------------------------------------------------------
+// Integer saturating absolute
+//------------------------------------------------------------------------------
+
+         sqabs v0.16b, v31.8b
+         sqabs v2.8h, v4.4h
+         sqabs v6.4s, v8.2s
+         sqabs v6.2d, v8.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqabs v0.16b, v31.8b
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqabs v2.8h, v4.4h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqabs v6.4s, v8.2s
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqabs v6.2d, v8.2s
+// CHECK-ERROR:                      ^
+
+//------------------------------------------------------------------------------
+// Signed integer saturating negate
+//------------------------------------------------------------------------------
+
+         sqneg v0.16b, v31.8b
+         sqneg v2.8h, v4.4h
+         sqneg v6.4s, v8.2s
+         sqneg v6.2d, v8.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqneg v0.16b, v31.8b
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqneg v2.8h, v4.4h
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqneg v6.4s, v8.2s
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqneg v6.2d, v8.2s
+// CHECK-ERROR:                      ^
+
+//------------------------------------------------------------------------------
+// Integer absolute
+//------------------------------------------------------------------------------
+
+         abs v0.16b, v31.8b
+         abs v2.8h, v4.4h
+         abs v6.4s, v8.2s
+         abs v6.2d, v8.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         abs v0.16b, v31.8b
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         abs v2.8h, v4.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         abs v6.4s, v8.2s
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         abs v6.2d, v8.2s
+// CHECK-ERROR:                    ^
+
+//------------------------------------------------------------------------------
+// Integer count leading sign bits
+//------------------------------------------------------------------------------
+
+         cls v0.2d, v31.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cls v0.2d, v31.2d
+// CHECK-ERROR:                ^
+
+//------------------------------------------------------------------------------
+// Integer count leading zeros
+//------------------------------------------------------------------------------
+
+         clz v0.2d, v31.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         clz v0.2d, v31.2d
+// CHECK-ERROR:                ^
+
+//------------------------------------------------------------------------------
+// Population count
+//------------------------------------------------------------------------------
+
+         cnt v2.8h, v4.8h
+         cnt v6.4s, v8.4s
+         cnt v6.2d, v8.2d
+         cnt v13.4h, v21.4h
+         cnt v4.2s, v0.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cnt v2.8h, v4.8h
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cnt v6.4s, v8.4s
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cnt v6.2d, v8.2d
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cnt v13.4h, v21.4h
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cnt v4.2s, v0.2s
+// CHECK-ERROR:                ^
+
+
+//------------------------------------------------------------------------------
+// Bitwise NOT
+//------------------------------------------------------------------------------
+
+         not v2.8h, v4.8h
+         not v6.4s, v8.4s
+         not v6.2d, v8.2d
+         not v13.4h, v21.4h
+         not v4.2s, v0.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         not v2.8h, v4.8h
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         not v6.4s, v8.4s
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         not v6.2d, v8.2d
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         not v13.4h, v21.4h
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         not v4.2s, v0.2s
+// CHECK-ERROR:                ^
+
+//------------------------------------------------------------------------------
+// Bitwise reverse
+//------------------------------------------------------------------------------
+
+         rbit v2.8h, v4.8h
+         rbit v6.4s, v8.4s
+         rbit v6.2d, v8.2d
+         rbit v13.4h, v21.4h
+         rbit v4.2s, v0.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rbit v2.8h, v4.8h
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rbit v6.4s, v8.4s
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rbit v6.2d, v8.2d
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rbit v13.4h, v21.4h
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         rbit v4.2s, v0.2s
+// CHECK-ERROR:                 ^
+
+//------------------------------------------------------------------------------
+// Floating-point absolute
+//------------------------------------------------------------------------------
+
+         fabs v0.16b, v31.16b
+         fabs v2.8h, v4.8h
+         fabs v1.8b, v9.8b
+         fabs v13.4h, v21.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fabs v0.16b, v31.16b
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fabs v2.8h, v4.8h
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fabs v1.8b, v9.8b
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fabs v13.4h, v21.4h
+// CHECK-ERROR:                  ^
+
+//------------------------------------------------------------------------------
+// Floating-point negate
+//------------------------------------------------------------------------------
+
+         fneg v0.16b, v31.16b
+         fneg v2.8h, v4.8h
+         fneg v1.8b, v9.8b
+         fneg v13.4h, v21.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fneg v0.16b, v31.16b
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fneg v2.8h, v4.8h
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fneg v1.8b, v9.8b
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fneg v13.4h, v21.4h
+// CHECK-ERROR:                  ^
+
+//------------------------------------------------------------------------------
+// Integer extract and narrow
+//------------------------------------------------------------------------------
+
+         xtn v0.16b, v31.8h
+         xtn v2.8h, v4.4s
+         xtn v6.4s, v8.2d
+         xtn2 v1.8b, v9.8h
+         xtn2 v13.4h, v21.4s
+         xtn2 v4.2s, v0.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         xtn v0.16b, v31.8h
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         xtn v2.8h, v4.4s
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         xtn v6.4s, v8.2d
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         xtn2 v1.8b, v9.8h
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         xtn2 v13.4h, v21.4s
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         xtn2 v4.2s, v0.2d
+// CHECK-ERROR:              ^
+
+//------------------------------------------------------------------------------
+// Signed integer saturating extract and unsigned narrow
+//------------------------------------------------------------------------------
+
+         sqxtun v0.16b, v31.8h
+         sqxtun v2.8h, v4.4s
+         sqxtun v6.4s, v8.2d
+         sqxtun2 v1.8b, v9.8h
+         sqxtun2 v13.4h, v21.4s
+         sqxtun2 v4.2s, v0.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtun v0.16b, v31.8h
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtun v2.8h, v4.4s
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtun v6.4s, v8.2d
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtun2 v1.8b, v9.8h
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtun2 v13.4h, v21.4s
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtun2 v4.2s, v0.2d
+// CHECK-ERROR:                 ^
+
+//------------------------------------------------------------------------------
+// Signed integer saturating extract and narrow
+//------------------------------------------------------------------------------
+
+         sqxtn v0.16b, v31.8h
+         sqxtn v2.8h, v4.4s
+         sqxtn v6.4s, v8.2d
+         sqxtn2 v1.8b, v9.8h
+         sqxtn2 v13.4h, v21.4s
+         sqxtn2 v4.2s, v0.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtn v0.16b, v31.8h
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtn v2.8h, v4.4s
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtn v6.4s, v8.2d
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtn2 v1.8b, v9.8h
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtn2 v13.4h, v21.4s
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqxtn2 v4.2s, v0.2d
+// CHECK-ERROR:                ^
+
+//------------------------------------------------------------------------------
+// Unsigned integer saturating extract and narrow
+//------------------------------------------------------------------------------
+
+         uqxtn v0.16b, v31.8h
+         uqxtn v2.8h, v4.4s
+         uqxtn v6.4s, v8.2d
+         uqxtn2 v1.8b, v9.8h
+         uqxtn2 v13.4h, v21.4s
+         uqxtn2 v4.2s, v0.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqxtn v0.16b, v31.8h
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqxtn v2.8h, v4.4s
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqxtn v6.4s, v8.2d
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqxtn2 v1.8b, v9.8h
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqxtn2 v13.4h, v21.4s
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uqxtn2 v4.2s, v0.2d
+// CHECK-ERROR:                ^
+
+//------------------------------------------------------------------------------
+// Integer shift left long
+//------------------------------------------------------------------------------
+
+         shll2 v2.8h, v4.16b, #7
+         shll2 v6.4s, v8.8h, #15
+         shll2 v6.2d, v8.4s, #31
+         shll v2.8h, v4.16b, #8
+         shll v6.4s, v8.8h, #16
+         shll v6.2d, v8.4s, #32
+         shll v2.8h, v4.8b, #8
+         shll v6.4s, v8.4h, #16
+         shll v6.2d, v8.2s, #32
+         shll2 v2.8h, v4.8b, #5
+         shll2 v6.4s, v8.4h, #14
+         shll2 v6.2d, v8.2s, #1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shll2 v2.8h, v4.16b, #7
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shll2 v6.4s, v8.8h, #15
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shll2 v6.2d, v8.4s, #31
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shll v2.8h, v4.16b, #8
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shll v6.4s, v8.8h, #16
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shll v6.2d, v8.4s, #32
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shll2 v2.8h, v4.8b, #5
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shll2 v6.4s, v8.4h, #14
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         shll2 v6.2d, v8.2s, #1
+// CHECK-ERROR:                      ^
+
+//------------------------------------------------------------------------------
+// Floating-point convert downsize
+//------------------------------------------------------------------------------
+
+         fcvtn v2.8h, v4.4s
+         fcvtn v6.4s, v8.2d
+         fcvtn2 v13.4h, v21.4s
+         fcvtn2 v4.2s, v0.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtn v2.8h, v4.4s
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtn v6.4s, v8.2d
+// CHECK-ERROR:               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtn2 v13.4h, v21.4s
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtn2 v4.2s, v0.2d
+// CHECK-ERROR:                ^
+
+//------------------------------------------------------------------------------
+// Floating-point convert downsize with inexact
+//------------------------------------------------------------------------------
+
+         fcvtxn v6.4s, v8.2d
+         fcvtxn2 v4.2s, v0.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtxn v6.4s, v8.2d
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtxn2 v4.2s, v0.2d
+// CHECK-ERROR:                 ^
+
+//------------------------------------------------------------------------------
+// Floating-point convert upsize
+//------------------------------------------------------------------------------
+
+         fcvtl2 v9.4s, v1.4h
+         fcvtl2 v0.2d, v1.2s
+         fcvtl v12.4s, v4.8h
+         fcvtl v17.2d, v28.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtl2 v9.4s, v1.4h
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtl2 v0.2d, v1.2s
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtl v12.4s, v4.8h
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtl v17.2d, v28.4s
+// CHECK-ERROR:                       ^
+
+//------------------------------------------------------------------------------
+// Floating-point round to integral
+//------------------------------------------------------------------------------
+
+         frintn v0.16b, v31.16b
+         frintn v2.8h, v4.8h
+         frintn v1.8b, v9.8b
+         frintn v13.4h, v21.4h
+
+         frinta v0.16b, v31.16b
+         frinta v2.8h, v4.8h
+         frinta v1.8b, v9.8b
+         frinta v13.4h, v21.4h
+
+         frintp v0.16b, v31.16b
+         frintp v2.8h, v4.8h
+         frintp v1.8b, v9.8b
+         frintp v13.4h, v21.4h
+
+         frintm v0.16b, v31.16b
+         frintm v2.8h, v4.8h
+         frintm v1.8b, v9.8b
+         frintm v13.4h, v21.4h
+
+         frintx v0.16b, v31.16b
+         frintx v2.8h, v4.8h
+         frintx v1.8b, v9.8b
+         frintx v13.4h, v21.4h
+
+         frintz v0.16b, v31.16b
+         frintz v2.8h, v4.8h
+         frintz v1.8b, v9.8b
+         frintz v13.4h, v21.4h
+
+         frinti v0.16b, v31.16b
+         frinti v2.8h, v4.8h
+         frinti v1.8b, v9.8b
+         frinti v13.4h, v21.4h
+
+         fcvtns v0.16b, v31.16b
+         fcvtns v2.8h, v4.8h
+         fcvtns v1.8b, v9.8b
+         fcvtns v13.4h, v21.4h
+
+         fcvtnu v0.16b, v31.16b
+         fcvtnu v2.8h, v4.8h
+         fcvtnu v1.8b, v9.8b
+         fcvtnu v13.4h, v21.4h
+
+         fcvtps v0.16b, v31.16b
+         fcvtps v2.8h, v4.8h
+         fcvtps v1.8b, v9.8b
+         fcvtps v13.4h, v21.4h
+
+         fcvtpu v0.16b, v31.16b
+         fcvtpu v2.8h, v4.8h
+         fcvtpu v1.8b, v9.8b
+         fcvtpu v13.4h, v21.4h
+
+         fcvtms v0.16b, v31.16b
+         fcvtms v2.8h, v4.8h
+         fcvtms v1.8b, v9.8b
+         fcvtms v13.4h, v21.4h
+
+         fcvtmu v0.16b, v31.16b
+         fcvtmu v2.8h, v4.8h
+         fcvtmu v1.8b, v9.8b
+         fcvtmu v13.4h, v21.4h
+
+         fcvtzs v0.16b, v31.16b
+         fcvtzs v2.8h, v4.8h
+         fcvtzs v1.8b, v9.8b
+         fcvtzs v13.4h, v21.4h
+
+         fcvtzu v0.16b, v31.16b
+         fcvtzu v2.8h, v4.8h
+         fcvtzu v1.8b, v9.8b
+         fcvtzu v13.4h, v21.4h
+
+         fcvtas v0.16b, v31.16b
+         fcvtas v2.8h, v4.8h
+         fcvtas v1.8b, v9.8b
+         fcvtas v13.4h, v21.4h
+
+         fcvtau v0.16b, v31.16b
+         fcvtau v2.8h, v4.8h
+         fcvtau v1.8b, v9.8b
+         fcvtau v13.4h, v21.4h
+
+         urecpe v0.16b, v31.16b
+         urecpe v2.8h, v4.8h
+         urecpe v1.8b, v9.8b
+         urecpe v13.4h, v21.4h
+         urecpe v1.2d, v9.2d
+
+         ursqrte v0.16b, v31.16b
+         ursqrte v2.8h, v4.8h
+         ursqrte v1.8b, v9.8b
+         ursqrte v13.4h, v21.4h
+         ursqrte v1.2d, v9.2d
+
+         scvtf v0.16b, v31.16b
+         scvtf v2.8h, v4.8h
+         scvtf v1.8b, v9.8b
+         scvtf v13.4h, v21.4h
+
+         ucvtf v0.16b, v31.16b
+         ucvtf v2.8h, v4.8h
+         ucvtf v1.8b, v9.8b
+         ucvtf v13.4h, v21.4h
+
+         frecpe v0.16b, v31.16b
+         frecpe v2.8h, v4.8h
+         frecpe v1.8b, v9.8b
+         frecpe v13.4h, v21.4h
+
+         frsqrte v0.16b, v31.16b
+         frsqrte v2.8h, v4.8h
+         frsqrte v1.8b, v9.8b
+         frsqrte v13.4h, v21.4h
+
+         fsqrt v0.16b, v31.16b
+         fsqrt v2.8h, v4.8h
+         fsqrt v1.8b, v9.8b
+         fsqrt v13.4h, v21.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintn v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintn v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintn v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintn v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frinta v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frinta v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frinta v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frinta v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintp v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintp v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintp v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintp v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintm v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintm v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintm v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintm v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintx v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintx v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintx v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintx v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintz v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintz v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintz v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frintz v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frinti v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frinti v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frinti v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frinti v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtns v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtns v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtns v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtns v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtnu v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtnu v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtnu v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtnu v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtps v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtps v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtps v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtps v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtpu v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtpu v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtpu v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtpu v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtms v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtms v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtms v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtms v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtmu v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtmu v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtmu v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtmu v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzs v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzs v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzs v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzs v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzu v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzu v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzu v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtzu v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtas v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtas v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtas v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtas v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtau v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtau v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtau v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fcvtau v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         urecpe v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         urecpe v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         urecpe v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         urecpe v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         urecpe v1.2d, v9.2d
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ursqrte v0.16b, v31.16b
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ursqrte v2.8h, v4.8h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ursqrte v1.8b, v9.8b
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ursqrte v13.4h, v21.4h
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ursqrte v1.2d, v9.2d
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         scvtf v0.16b, v31.16b
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         scvtf v2.8h, v4.8h
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         scvtf v1.8b, v9.8b
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         scvtf v13.4h, v21.4h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ucvtf v0.16b, v31.16b
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ucvtf v2.8h, v4.8h
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ucvtf v1.8b, v9.8b
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ucvtf v13.4h, v21.4h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frecpe v0.16b, v31.16b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frecpe v2.8h, v4.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frecpe v1.8b, v9.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frecpe v13.4h, v21.4h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frsqrte v0.16b, v31.16b
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frsqrte v2.8h, v4.8h
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frsqrte v1.8b, v9.8b
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         frsqrte v13.4h, v21.4h
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fsqrt v0.16b, v31.16b
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fsqrt v2.8h, v4.8h
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fsqrt v1.8b, v9.8b
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fsqrt v13.4h, v21.4h
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Fixed-point (Immediate)
+//----------------------------------------------------------------------
+
+    fcvtzs s21, s12, #0
+    fcvtzs d21, d12, #65
+    fcvtzs s21, d12, #1
+
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        fcvtzs s21, s12, #0
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        fcvtzs d21, d12, #65
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtzs s21, d12, #1
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
+//----------------------------------------------------------------------
+
+    fcvtzu s21, s12, #33
+    fcvtzu d21, d12, #0
+    fcvtzu s21, d12, #1
+
+// CHECK-ERROR: error: expected integer in range [1, 32]
+// CHECK-ERROR:        fcvtzu s21, s12, #33
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected integer in range [1, 64]
+// CHECK-ERROR:        fcvtzu d21, d12, #0
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtzu s21, d12, #1
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Saturating Extract Narrow
+//----------------------------------------------------------------------
+
+        aese v0.8h, v1.8h
+        aese v0.4s, v1.4s
+        aese v0.2d, v1.2d
+        aesd v0.8h, v1.8h
+        aesmc v0.8h, v1.8h
+        aesimc v0.8h, v1.8h
+
+// CHECK:  error: invalid operand for instruction
+// CHECK:         aese v0.8h, v1.8h
+// CHECK:                 ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         aese v0.4s, v1.4s
+// CHECK:                 ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         aese v0.2d, v1.2d
+// CHECK:                 ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         aesd v0.8h, v1.8h
+// CHECK:                 ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         aesmc v0.8h, v1.8h
+// CHECK:                  ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         aesimc v0.8h, v1.8h
+// CHECK:                   ^
+
+        sha1h b0, b1
+        sha1h h0, h1
+        sha1h d0, d1
+        sha1h q0, q1
+        sha1su1 v0.16b, v1.16b
+        sha1su1 v0.8h, v1.8h
+        sha1su1 v0.2d, v1.2d
+        sha256su0 v0.16b, v1.16b
+
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1h b0, b1
+// CHECK:               ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1h h0, h1
+// CHECK:               ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1h d0, d1
+// CHECK:               ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1h q0, q1
+// CHECK:               ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1su1 v0.16b, v1.16b
+// CHECK:                    ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1su1 v0.8h, v1.8h
+// CHECK:                    ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1su1 v0.2d, v1.2d
+// CHECK:                    ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha256su0 v0.16b, v1.16b
+// CHECK:                      ^
+
+        sha1c q0, q1, v2.4s
+        sha1p q0, q1, v2.4s
+        sha1m q0, q1, v2.4s
+        sha1su0 v0.16b, v1.16b, v2.16b
+        sha1su0 v0.8h, v1.8h, v2.8h
+        sha1su0 v0.2d, v1.2d, v2.2d
+        sha256h q0, q1, q2
+        sha256h v0.4s, v1.4s, v2.4s
+        sha256h2 q0, q1, q2
+        sha256su1 v0.16b, v1.16b, v2.16b
+
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1c q0, q1, v2.4s
+// CHECK:                   ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1p q0, q1, v2.4s
+// CHECK:                   ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1m q0, q1, v2.4s
+// CHECK:                   ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1su0 v0.16b, v1.16b, v2.16b
+// CHECK:                    ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1su0 v0.8h, v1.8h, v2.8h
+// CHECK:                    ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha1su0 v0.2d, v1.2d, v2.2d
+// CHECK:                    ^
+// CHECK:  error: too few operands for instruction
+// CHECK:         sha256h q0, q1, q2
+// CHECK:         ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha256h v0.4s, v1.4s, v2.4s
+// CHECK:                    ^
+// CHECK:  error: too few operands for instruction
+// CHECK:         sha256h2 q0, q1, q2
+// CHECK:         ^
+// CHECK:  error: invalid operand for instruction
+// CHECK:         sha256su1 v0.16b, v1.16b, v2.16b
+// CHECK:                      ^
+
+//----------------------------------------------------------------------
+// Bitwise extract
+//----------------------------------------------------------------------
+
+        ext v0.8b, v1.8b, v2.4h, #0x3
+        ext v0.4h, v1.4h, v2.4h, #0x3
+        ext v0.2s, v1.2s, v2.2s, #0x1
+        ext v0.1d, v1.1d, v2.1d, #0x0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ext v0.8b, v1.8b, v2.4h, #0x3
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ext v0.4h, v1.4h, v2.4h, #0x3
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ext v0.2s, v1.2s, v2.2s, #0x1
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ext v0.1d, v1.1d, v2.1d, #0x0
+// CHECK-ERROR:                ^
+
+        ext v0.16b, v1.16b, v2.8h, #0x3
+        ext v0.8h, v1.8h, v2.8h, #0x3
+        ext v0.4s, v1.4s, v2.4s, #0x1
+        ext v0.2d, v1.2d, v2.2d, #0x0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ext v0.16b, v1.16b, v2.8h, #0x3
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ext v0.8h, v1.8h, v2.8h, #0x3
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ext v0.4s, v1.4s, v2.4s, #0x1
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ext v0.2d, v1.2d, v2.2d, #0x0
+// CHECK-ERROR:                ^
+
+
+//----------------------------------------------------------------------
+// Permutation with 3 vectors
+//----------------------------------------------------------------------
+
+        uzp1 v0.16b, v1.8b, v2.8b
+        uzp1 v0.8b, v1.4b, v2.4b
+        uzp1 v0.8h, v1.4h, v2.4h
+        uzp1 v0.4h, v1.2h, v2.2h
+        uzp1 v0.4s, v1.2s, v2.2s
+        uzp1 v0.2s, v1.1s, v2.1s
+        uzp1 v0.2d, v1.1d, v2.1d
+        uzp1 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4289:22: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4290:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4291:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4292:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4293:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4294:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4295:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4296:17: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+        uzp2 v0.16b, v1.8b, v2.8b
+        uzp2 v0.8b, v1.4b, v2.4b
+        uzp2 v0.8h, v1.4h, v2.4h
+        uzp2 v0.4h, v1.2h, v2.2h
+        uzp2 v0.4s, v1.2s, v2.2s
+        uzp2 v0.2s, v1.1s, v2.1s
+        uzp2 v0.2d, v1.1d, v2.1d
+        uzp2 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4298:22: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4299:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4300:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4301:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4302:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4303:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4304:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4305:17: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+        zip1 v0.16b, v1.8b, v2.8b
+        zip1 v0.8b, v1.4b, v2.4b
+        zip1 v0.8h, v1.4h, v2.4h
+        zip1 v0.4h, v1.2h, v2.2h
+        zip1 v0.4s, v1.2s, v2.2s
+        zip1 v0.2s, v1.1s, v2.1s
+        zip1 v0.2d, v1.1d, v2.1d
+        zip1 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4307:22: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4308:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4309:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4310:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4311:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4312:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4313:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4314:17: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+        zip2 v0.16b, v1.8b, v2.8b
+        zip2 v0.8b, v1.4b, v2.4b
+        zip2 v0.8h, v1.4h, v2.4h
+        zip2 v0.4h, v1.2h, v2.2h
+        zip2 v0.4s, v1.2s, v2.2s
+        zip2 v0.2s, v1.1s, v2.1s
+        zip2 v0.2d, v1.1d, v2.1d
+        zip2 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4316:22: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4317:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4318:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4319:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4320:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4321:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4322:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4323:17: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+        trn1 v0.16b, v1.8b, v2.8b
+        trn1 v0.8b, v1.4b, v2.4b
+        trn1 v0.8h, v1.4h, v2.4h
+        trn1 v0.4h, v1.2h, v2.2h
+        trn1 v0.4s, v1.2s, v2.2s
+        trn1 v0.2s, v1.1s, v2.1s
+        trn1 v0.2d, v1.1d, v2.1d
+        trn1 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4325:22: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4326:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4327:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4328:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4329:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4330:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4331:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4332:17: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+        trn2 v0.16b, v1.8b, v2.8b
+        trn2 v0.8b, v1.4b, v2.4b
+        trn2 v0.8h, v1.4h, v2.4h
+        trn2 v0.4h, v1.2h, v2.2h
+        trn2 v0.4s, v1.2s, v2.2s
+        trn2 v0.2s, v1.1s, v2.1s
+        trn2 v0.2d, v1.1d, v2.1d
+        trn2 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4334:22: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4335:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4336:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4337:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4338:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4339:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4340:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4341:17: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+//----------------------------------------------------------------------
+// Permutation with 3 vectors
+//----------------------------------------------------------------------
+
+        uzp1 v0.16b, v1.8b, v2.8b
+        uzp1 v0.8b, v1.4b, v2.4b
+        uzp1 v0.8h, v1.4h, v2.4h
+        uzp1 v0.4h, v1.2h, v2.2h
+        uzp1 v0.4s, v1.2s, v2.2s
+        uzp1 v0.2s, v1.1s, v2.1s
+        uzp1 v0.2d, v1.1d, v2.1d
+        uzp1 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4289:22: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4290:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4291:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4292:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4293:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4294:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4295:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4296:17: error: invalid operand for instruction
+// CHECK-ERROR         uzp1 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+        uzp2 v0.16b, v1.8b, v2.8b
+        uzp2 v0.8b, v1.4b, v2.4b
+        uzp2 v0.8h, v1.4h, v2.4h
+        uzp2 v0.4h, v1.2h, v2.2h
+        uzp2 v0.4s, v1.2s, v2.2s
+        uzp2 v0.2s, v1.1s, v2.1s
+        uzp2 v0.2d, v1.1d, v2.1d
+        uzp2 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4298:22: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4299:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4300:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4301:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4302:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4303:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4304:21: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4305:17: error: invalid operand for instruction
+// CHECK-ERROR         uzp2 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+        zip1 v0.16b, v1.8b, v2.8b
+        zip1 v0.8b, v1.4b, v2.4b
+        zip1 v0.8h, v1.4h, v2.4h
+        zip1 v0.4h, v1.2h, v2.2h
+        zip1 v0.4s, v1.2s, v2.2s
+        zip1 v0.2s, v1.1s, v2.1s
+        zip1 v0.2d, v1.1d, v2.1d
+        zip1 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4307:22: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4308:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4309:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4310:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4311:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4312:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4313:21: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4314:17: error: invalid operand for instruction
+// CHECK-ERROR         zip1 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+        zip2 v0.16b, v1.8b, v2.8b
+        zip2 v0.8b, v1.4b, v2.4b
+        zip2 v0.8h, v1.4h, v2.4h
+        zip2 v0.4h, v1.2h, v2.2h
+        zip2 v0.4s, v1.2s, v2.2s
+        zip2 v0.2s, v1.1s, v2.1s
+        zip2 v0.2d, v1.1d, v2.1d
+        zip2 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4316:22: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4317:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4318:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4319:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4320:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4321:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4322:21: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4323:17: error: invalid operand for instruction
+// CHECK-ERROR         zip2 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+        trn1 v0.16b, v1.8b, v2.8b
+        trn1 v0.8b, v1.4b, v2.4b
+        trn1 v0.8h, v1.4h, v2.4h
+        trn1 v0.4h, v1.2h, v2.2h
+        trn1 v0.4s, v1.2s, v2.2s
+        trn1 v0.2s, v1.1s, v2.1s
+        trn1 v0.2d, v1.1d, v2.1d
+        trn1 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4325:22: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4326:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4327:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4328:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4329:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4330:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4331:21: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4332:17: error: invalid operand for instruction
+// CHECK-ERROR         trn1 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+        trn2 v0.16b, v1.8b, v2.8b
+        trn2 v0.8b, v1.4b, v2.4b
+        trn2 v0.8h, v1.4h, v2.4h
+        trn2 v0.4h, v1.2h, v2.2h
+        trn2 v0.4s, v1.2s, v2.2s
+        trn2 v0.2s, v1.1s, v2.1s
+        trn2 v0.2d, v1.1d, v2.1d
+        trn2 v0.1d, v1.1d, v2.1d
+
+// CHECK-ERROR <stdin>:4334:22: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.16b, v1.8b, v2.8b
+// CHECK-ERROR                      ^
+// CHECK-ERROR <stdin>:4335:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.8b, v1.4b, v2.4b
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4336:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.8h, v1.4h, v2.4h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4337:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.4h, v1.2h, v2.2h
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4338:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.4s, v1.2s, v2.2s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4339:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.2s, v1.1s, v2.1s
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4340:21: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.2d, v1.1d, v2.1d
+// CHECK-ERROR                     ^
+// CHECK-ERROR <stdin>:4341:17: error: invalid operand for instruction
+// CHECK-ERROR         trn2 v0.1d, v1.1d, v2.1d
+// CHECK-ERROR                 ^
+
+//----------------------------------------------------------------------
+// Floating Point  multiply (scalar, by element)
+//----------------------------------------------------------------------
+      // mismatched and invalid vector types
+      fmul    s0, s1, v1.h[0]
+      fmul    h0, h1, v1.s[0]
+      // invalid lane
+      fmul    s2, s29, v10.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmul    s0, s1, v1.h[0]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmul    h0, h1, v1.s[0]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error:  lane number incompatible with layout
+// CHECK-ERROR:          fmul    s2, s29, v10.s[4]
+// CHECK-ERROR:                                 ^
+
+//----------------------------------------------------------------------
+// Floating Point  multiply extended (scalar, by element)
+//----------------------------------------------------------------------
+      // mismatched and invalid vector types
+      fmulx    d0, d1, v1.b[0]
+      fmulx    h0, h1, v1.d[0]
+      // invalid lane
+      fmulx    d2, d29, v10.d[3]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmulx    d0, d1, v1.b[0]
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmulx    h0, h1, v1.d[0]
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error:  lane number incompatible with layout
+// CHECK-ERROR:          fmulx    d2, d29, v10.d[3]
+// CHECK-ERROR:                                  ^
+
+//----------------------------------------------------------------------
+// Floating Point fused multiply-add (scalar, by element)
+//----------------------------------------------------------------------
+      // mismatched and invalid vector types
+      fmla    b0, b1, v1.b[0]
+      fmla    d30, s11, v1.d[1]
+      // invalid lane
+      fmla    s16, s22, v16.s[5]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmla    b0, b1, v1.b[0]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmla    d30, s11, v1.d[1]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error:  lane number incompatible with layout
+// CHECK-ERROR:          fmla    s16, s22, v16.s[5]
+// CHECK-ERROR:                                  ^
+
+//----------------------------------------------------------------------
+// Floating Point fused multiply-subtract (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    fmls    s29, h10, v28.s[1]
+    fmls    h7, h17, v26.s[2]
+    // invalid lane
+    fmls    d16, d22, v16.d[-1]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmls    s29, h10, v28.s[1]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          fmls    h7, h17, v26.s[2]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error:  expected lane number
+// CHECK-ERROR:          fmls    d16, d22, v16.d[-1]
+// CHECK-ERROR:                                  ^
+
+//----------------------------------------------------------------------
+// Scalar Signed saturating doubling multiply-add long
+// (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    sqdmlal s0, h0, v0.s[0]
+    sqdmlal s8, s9, v14.s[1]
+    // invalid lane
+    sqdmlal s4, s5, v1.s[5]
+    // invalid vector index
+    sqdmlal s0, h0, v17.h[0]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlal s0, h0, v0.s[0]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlal s8, s9, v14.s[1]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          sqdmlal s4, s5, v1.s[5]
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlal s0, h0, v17.h[0]
+// CHECK-ERROR:                           ^
+
+//----------------------------------------------------------------------
+// Scalar Signed saturating doubling multiply-subtract long
+// (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    sqdmlsl s1, h1, v1.d[0]
+    sqdmlsl d1, h1, v13.s[0]
+    // invalid lane
+    sqdmlsl d1, s1, v13.s[4]
+    // invalid vector index
+    sqdmlsl s1, h1, v20.h[7]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlsl s1, h1, v1.d[0]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlsl d1, h1, v13.s[0]
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          sqdmlsl d1, s1, v13.s[4]
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmlsl s1, h1, v20.h[7]
+// CHECK-ERROR:                           ^
+
+//----------------------------------------------------------------------
+// Scalar Signed saturating doubling multiply long (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    // invalid lane
+    // invalid vector index
+    // mismatched and invalid vector types
+    sqdmull s1, h1, v1.s[1]
+    sqdmull s1, s1, v4.s[0]
+    // invalid lane
+    sqdmull s12, h17, v9.h[9]
+    // invalid vector index
+    sqdmull s1, h1, v16.h[5]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmull s1, h1, v1.s[1]
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmull s1, s1, v4.s[0]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          sqdmull s12, h17, v9.h[9]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmull s1, h1, v16.h[5]
+// CHECK-ERROR:                           ^
+
+//----------------------------------------------------------------------
+// Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    sqdmulh h0, s1, v0.h[0]
+    sqdmulh s25, s26, v27.h[3]
+    // invalid lane
+    sqdmulh s25, s26, v27.s[4]
+    // invalid vector index
+    sqdmulh s0, h1, v30.h[0]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmulh h0, s1, v0.h[0]
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmulh s25, s26, v27.h[3]
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          sqdmulh s25, s26, v27.s[4]
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqdmulh s0, h1, v30.h[0]
+// CHECK-ERROR:                      ^
+
+//----------------------------------------------------------------------
+// Scalar Signed saturating rounding doubling multiply
+// returning high half (scalar, by element)
+//----------------------------------------------------------------------
+    // mismatched and invalid vector types
+    sqrdmulh h31, h30, v14.s[2]
+    sqrdmulh s5, h6, v7.s[2]
+    // invalid lane
+    sqrdmulh h31, h30, v14.h[9]
+    // invalid vector index
+    sqrdmulh h31, h30, v20.h[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqrdmulh h31, h30, v14.s[2]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqrdmulh s5, h6, v7.s[2]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          sqrdmulh h31, h30, v14.h[9]
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          sqrdmulh h31, h30, v20.h[4]
+// CHECK-ERROR:                              ^
+
+//----------------------------------------------------------------------
+// Scalar Duplicate element (scalar)
+//----------------------------------------------------------------------
+      // mismatched and invalid vector types
+      dup b0, v1.d[0]
+      dup h0, v31.b[8]
+      dup s0, v2.h[4]
+      dup d0, v17.s[3]
+      // invalid  lane
+      dup d0, v17.d[4]
+      dup s0, v1.s[7]
+      dup h0, v31.h[16]
+      dup b1, v3.b[16]
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          dup b0, v1.d[0]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          dup h0, v31.b[8]
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          dup s0, v2.h[4]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          dup d0, v17.s[3]
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          dup d0, v17.d[4]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          dup s0, v1.s[7]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          dup h0, v31.h[16]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:          dup b1, v3.b[16]
+// CHECK-ERROR:                       ^
+
+//----------------------------------------------------------------------
+// Table look up
+//----------------------------------------------------------------------
+
+        tbl v0.8b, {v1.8b}, v2.8b
+        tbl v0.8b, {v1.8b, v2.8b}, v2.8b
+        tbl v0.8b, {v1.8b, v2.8b, v3.8b}, v2.8b
+        tbl v0.8b, {v1.8b, v2.8b, v3.8b, v4.8b}, v2.8b
+        tbl v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b, v5.16b}, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        tbl v0.8b, {v1.8b}, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        tbl v0.8b, {v1.8b, v2.8b}, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        tbl v0.8b, {v1.8b, v2.8b, v3.8b}, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        tbl v0.8b, {v1.8b, v2.8b, v3.8b, v4.8b}, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR:        tbl v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b, v5.16b}, v2.8b
+// CHECK-ERROR:                                                    ^
+
+        tbx v0.8b, {v1.8b}, v2.8b
+        tbx v0.8b, {v1.8b, v2.8b}, v2.8b
+        tbx v0.8b, {v1.8b, v2.8b, v3.8b}, v2.8b
+        tbx v0.8b, {v1.8b, v2.8b, v3.8b, v4.8b}, v2.8b
+        tbx v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b, v5.16b}, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        tbx v0.8b, {v1.8b}, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        tbx v0.8b, {v1.8b, v2.8b}, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        tbx v0.8b, {v1.8b, v2.8b, v3.8b}, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        tbx v0.8b, {v1.8b, v2.8b, v3.8b, v4.8b}, v2.8b
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid number of vectors
+// CHECK-ERROR:        tbx v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b, v5.16b}, v2.8b
+// CHECK-ERROR:                                                    ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Lower Precision Narrow, Rounding To
+// Odd
+//----------------------------------------------------------------------
+
+    fcvtxn s0, s1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtxn s0, s1
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Integer, Rounding To Nearest
+// With Ties To Away
+//----------------------------------------------------------------------
+
+    fcvtas s0, d0
+    fcvtas d0, s0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtas s0, d0
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtas d0, s0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Integer, Rounding To
+// Nearest With Ties To Away
+//----------------------------------------------------------------------
+
+    fcvtau s0, d0
+    fcvtau d0, s0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtau s0, d0
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtau d0, s0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Integer, Rounding Toward
+// Minus Infinity
+//----------------------------------------------------------------------
+
+    fcvtms s0, d0
+    fcvtms d0, s0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtms s0, d0
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtms d0, s0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Integer, Rounding Toward
+// Minus Infinity
+//----------------------------------------------------------------------
+
+    fcvtmu s0, d0
+    fcvtmu d0, s0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtmu s0, d0
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtmu d0, s0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Integer, Rounding To Nearest
+// With Ties To Even
+//----------------------------------------------------------------------
+
+    fcvtns s0, d0
+    fcvtns d0, s0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtns s0, d0
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtns d0, s0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Integer, Rounding To
+// Nearest With Ties To Even
+//----------------------------------------------------------------------
+
+    fcvtnu s0, d0
+    fcvtnu d0, s0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtnu s0, d0
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtnu d0, s0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Integer, Rounding Toward
+// Positive Infinity
+//----------------------------------------------------------------------
+
+    fcvtps s0, d0
+    fcvtps d0, s0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtps s0, d0
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtps d0, s0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Integer, Rounding Toward
+// Positive Infinity
+//----------------------------------------------------------------------
+
+    fcvtpu s0, d0
+    fcvtpu d0, s0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtpu s0, d0
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtpu d0, s0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Integer, Rounding Toward Zero
+//----------------------------------------------------------------------
+
+    fcvtzs s0, d0
+    fcvtzs d0, s0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtzs s0, d0
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtzs d0, s0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Integer, Rounding Toward 
+// Zero
+//----------------------------------------------------------------------
+
+    fcvtzu s0, d0
+    fcvtzu d0, s0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtzu s0, d0
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcvtzu d0, s0
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Absolute Difference
+//----------------------------------------------------------------------
+
+
+    fabd s29, d24, s20
+    fabd d29, s24, d20
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fabd s29, d24, s20
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fabd d29, s24, d20
+// CHECK-ERROR:                  ^
diff --git a/test/MC/AArch64/neon-extract.s b/test/MC/AArch64/neon-extract.s
new file mode 100644
index 0000000..2d58a75
--- /dev/null
+++ b/test/MC/AArch64/neon-extract.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Instructions for bitwise extract
+//------------------------------------------------------------------------------
+
+        ext v0.8b, v1.8b, v2.8b, #0x3
+        ext v0.16b, v1.16b, v2.16b, #0x3
+
+// CHECK: ext	v0.8b, v1.8b, v2.8b, #0x3  // encoding: [0x20,0x18,0x02,0x2e]
+// CHECK: ext	v0.16b, v1.16b, v2.16b, #0x3 // encoding: [0x20,0x18,0x02,0x6e]
diff --git a/test/MC/AArch64/neon-mov.s b/test/MC/AArch64/neon-mov.s
index 8331372..c2ca803 100644
--- a/test/MC/AArch64/neon-mov.s
+++ b/test/MC/AArch64/neon-mov.s
@@ -195,13 +195,15 @@
 //----------------------------------------------------------------------
 // Vector Move -  register
 //----------------------------------------------------------------------
+
+      // FIXME: these should all print with the "mov" syntax.
       mov v0.8b, v31.8b
       mov v15.16b, v16.16b
       orr v0.8b, v31.8b, v31.8b
       orr v15.16b, v16.16b, v16.16b
 
-// CHECK:   mov v0.8b, v31.8b     // encoding: [0xe0,0x1f,0xbf,0x0e]
-// CHECK:   mov v15.16b, v16.16b  // encoding: [0x0f,0x1e,0xb0,0x4e]
-// CHECK:   mov v0.8b, v31.8b     // encoding: [0xe0,0x1f,0xbf,0x0e]
-// CHECK:   mov v15.16b, v16.16b  // encoding: [0x0f,0x1e,0xb0,0x4e]
+// CHECK:   orr v0.8b, v31.8b, v31.8b      // encoding: [0xe0,0x1f,0xbf,0x0e]
+// CHECK:   orr v15.16b, v16.16b, v16.16b  // encoding: [0x0f,0x1e,0xb0,0x4e]
+// CHECK:   orr v0.8b, v31.8b, v31.8b      // encoding: [0xe0,0x1f,0xbf,0x0e]
+// CHECK:   orr v15.16b, v16.16b, v16.16b  // encoding: [0x0f,0x1e,0xb0,0x4e]
 
diff --git a/test/MC/AArch64/neon-perm.s b/test/MC/AArch64/neon-perm.s
new file mode 100644
index 0000000..20a4acde
--- /dev/null
+++ b/test/MC/AArch64/neon-perm.s
@@ -0,0 +1,103 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Instructions for permute
+//------------------------------------------------------------------------------
+
+        uzp1 v0.8b, v1.8b, v2.8b
+        uzp1 v0.16b, v1.16b, v2.16b
+        uzp1 v0.4h, v1.4h, v2.4h
+        uzp1 v0.8h, v1.8h, v2.8h
+        uzp1 v0.2s, v1.2s, v2.2s
+        uzp1 v0.4s, v1.4s, v2.4s
+        uzp1 v0.2d, v1.2d, v2.2d
+
+// CHECK: uzp1	v0.8b, v1.8b, v2.8b     // encoding: [0x20,0x18,0x02,0x0e]
+// CHECK: uzp1	v0.16b, v1.16b, v2.16b  // encoding: [0x20,0x18,0x02,0x4e]
+// CHECK: uzp1	v0.4h, v1.4h, v2.4h     // encoding: [0x20,0x18,0x42,0x0e]
+// CHECK: uzp1	v0.8h, v1.8h, v2.8h     // encoding: [0x20,0x18,0x42,0x4e]
+// CHECK: uzp1	v0.2s, v1.2s, v2.2s     // encoding: [0x20,0x18,0x82,0x0e]
+// CHECK: uzp1	v0.4s, v1.4s, v2.4s     // encoding: [0x20,0x18,0x82,0x4e]
+// CHECK: uzp1	v0.2d, v1.2d, v2.2d     // encoding: [0x20,0x18,0xc2,0x4e]
+
+        trn1 v0.8b, v1.8b, v2.8b
+        trn1 v0.16b, v1.16b, v2.16b
+        trn1 v0.4h, v1.4h, v2.4h
+        trn1 v0.8h, v1.8h, v2.8h
+        trn1 v0.2s, v1.2s, v2.2s
+        trn1 v0.4s, v1.4s, v2.4s
+        trn1 v0.2d, v1.2d, v2.2d
+
+// CHECK: trn1	v0.8b, v1.8b, v2.8b     // encoding: [0x20,0x28,0x02,0x0e]
+// CHECK: trn1	v0.16b, v1.16b, v2.16b  // encoding: [0x20,0x28,0x02,0x4e]
+// CHECK: trn1	v0.4h, v1.4h, v2.4h     // encoding: [0x20,0x28,0x42,0x0e]
+// CHECK: trn1	v0.8h, v1.8h, v2.8h     // encoding: [0x20,0x28,0x42,0x4e]
+// CHECK: trn1	v0.2s, v1.2s, v2.2s     // encoding: [0x20,0x28,0x82,0x0e]
+// CHECK: trn1	v0.4s, v1.4s, v2.4s     // encoding: [0x20,0x28,0x82,0x4e]
+// CHECK: trn1	v0.2d, v1.2d, v2.2d     // encoding: [0x20,0x28,0xc2,0x4e]
+
+        zip1 v0.8b, v1.8b, v2.8b
+        zip1 v0.16b, v1.16b, v2.16b
+        zip1 v0.4h, v1.4h, v2.4h
+        zip1 v0.8h, v1.8h, v2.8h
+        zip1 v0.2s, v1.2s, v2.2s
+        zip1 v0.4s, v1.4s, v2.4s
+        zip1 v0.2d, v1.2d, v2.2d
+
+// CHECK: zip1	v0.8b, v1.8b, v2.8b     // encoding: [0x20,0x38,0x02,0x0e]
+// CHECK: zip1	v0.16b, v1.16b, v2.16b  // encoding: [0x20,0x38,0x02,0x4e]
+// CHECK: zip1	v0.4h, v1.4h, v2.4h     // encoding: [0x20,0x38,0x42,0x0e]
+// CHECK: zip1	v0.8h, v1.8h, v2.8h     // encoding: [0x20,0x38,0x42,0x4e]
+// CHECK: zip1	v0.2s, v1.2s, v2.2s     // encoding: [0x20,0x38,0x82,0x0e]
+// CHECK: zip1	v0.4s, v1.4s, v2.4s     // encoding: [0x20,0x38,0x82,0x4e]
+// CHECK: zip1	v0.2d, v1.2d, v2.2d     // encoding: [0x20,0x38,0xc2,0x4e]
+
+        uzp2 v0.8b, v1.8b, v2.8b
+        uzp2 v0.16b, v1.16b, v2.16b
+        uzp2 v0.4h, v1.4h, v2.4h
+        uzp2 v0.8h, v1.8h, v2.8h
+        uzp2 v0.2s, v1.2s, v2.2s
+        uzp2 v0.4s, v1.4s, v2.4s
+        uzp2 v0.2d, v1.2d, v2.2d
+
+// CHECK: uzp2	v0.8b, v1.8b, v2.8b     // encoding: [0x20,0x58,0x02,0x0e]
+// CHECK: uzp2	v0.16b, v1.16b, v2.16b  // encoding: [0x20,0x58,0x02,0x4e]
+// CHECK: uzp2	v0.4h, v1.4h, v2.4h     // encoding: [0x20,0x58,0x42,0x0e]
+// CHECK: uzp2	v0.8h, v1.8h, v2.8h     // encoding: [0x20,0x58,0x42,0x4e]
+// CHECK: uzp2	v0.2s, v1.2s, v2.2s     // encoding: [0x20,0x58,0x82,0x0e]
+// CHECK: uzp2	v0.4s, v1.4s, v2.4s     // encoding: [0x20,0x58,0x82,0x4e]
+// CHECK: uzp2	v0.2d, v1.2d, v2.2d     // encoding: [0x20,0x58,0xc2,0x4e]
+
+        trn2 v0.8b, v1.8b, v2.8b
+        trn2 v0.16b, v1.16b, v2.16b
+        trn2 v0.4h, v1.4h, v2.4h
+        trn2 v0.8h, v1.8h, v2.8h
+        trn2 v0.2s, v1.2s, v2.2s
+        trn2 v0.4s, v1.4s, v2.4s
+        trn2 v0.2d, v1.2d, v2.2d
+
+// CHECK: trn2	v0.8b, v1.8b, v2.8b     // encoding: [0x20,0x68,0x02,0x0e]
+// CHECK: trn2	v0.16b, v1.16b, v2.16b  // encoding: [0x20,0x68,0x02,0x4e]
+// CHECK: trn2	v0.4h, v1.4h, v2.4h     // encoding: [0x20,0x68,0x42,0x0e]
+// CHECK: trn2	v0.8h, v1.8h, v2.8h     // encoding: [0x20,0x68,0x42,0x4e]
+// CHECK: trn2	v0.2s, v1.2s, v2.2s     // encoding: [0x20,0x68,0x82,0x0e]
+// CHECK: trn2	v0.4s, v1.4s, v2.4s     // encoding: [0x20,0x68,0x82,0x4e]
+// CHECK: trn2	v0.2d, v1.2d, v2.2d     // encoding: [0x20,0x68,0xc2,0x4e]
+
+        zip2 v0.8b, v1.8b, v2.8b
+        zip2 v0.16b, v1.16b, v2.16b
+        zip2 v0.4h, v1.4h, v2.4h
+        zip2 v0.8h, v1.8h, v2.8h
+        zip2 v0.2s, v1.2s, v2.2s
+        zip2 v0.4s, v1.4s, v2.4s
+        zip2 v0.2d, v1.2d, v2.2d
+
+// CHECK: zip2	v0.8b, v1.8b, v2.8b     // encoding: [0x20,0x78,0x02,0x0e]
+// CHECK: zip2	v0.16b, v1.16b, v2.16b  // encoding: [0x20,0x78,0x02,0x4e]
+// CHECK: zip2	v0.4h, v1.4h, v2.4h     // encoding: [0x20,0x78,0x42,0x0e]
+// CHECK: zip2	v0.8h, v1.8h, v2.8h     // encoding: [0x20,0x78,0x42,0x4e]
+// CHECK: zip2	v0.2s, v1.2s, v2.2s     // encoding: [0x20,0x78,0x82,0x0e]
+// CHECK: zip2	v0.4s, v1.4s, v2.4s     // encoding: [0x20,0x78,0x82,0x4e]
+// CHECK: zip2	v0.2d, v1.2d, v2.2d     // encoding: [0x20,0x78,0xc2,0x4e]
diff --git a/test/MC/AArch64/neon-rounding-shift.s b/test/MC/AArch64/neon-rounding-shift.s
index f3c70d7..e70f766 100644
--- a/test/MC/AArch64/neon-rounding-shift.s
+++ b/test/MC/AArch64/neon-rounding-shift.s
@@ -41,17 +41,5 @@
 // CHECK: urshl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x54,0xa2,0x6e]
 // CHECK: urshl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x54,0xe2,0x6e]
 
-//------------------------------------------------------------------------------
-// Scalar Integer Rounding Shift Lef (Signed)
-//------------------------------------------------------------------------------
-         srshl d17, d31, d8
-
-// CHECK: srshl d17, d31, d8      // encoding: [0xf1,0x57,0xe8,0x5e]
-
-//------------------------------------------------------------------------------
-// Scalar Integer Rounding Shift Lef (Unsigned)
-//------------------------------------------------------------------------------
-         urshl d17, d31, d8
 
-// CHECK: urshl d17, d31, d8      // encoding: [0xf1,0x57,0xe8,0x7e]
 
diff --git a/test/MC/AArch64/neon-saturating-add-sub.s b/test/MC/AArch64/neon-saturating-add-sub.s
index 1032ae4..4a7ed10 100644
--- a/test/MC/AArch64/neon-saturating-add-sub.s
+++ b/test/MC/AArch64/neon-saturating-add-sub.s
@@ -79,55 +79,4 @@
 // CHECK: uqsub v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x2c,0xa2,0x6e]
 // CHECK: uqsub v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x2c,0xe2,0x6e]
 
-//------------------------------------------------------------------------------
-// Scalar Integer Saturating Add (Signed)
-//------------------------------------------------------------------------------
-         sqadd b0, b1, b2
-         sqadd h10, h11, h12
-         sqadd s20, s21, s2
-         sqadd d17, d31, d8
-
-// CHECK: sqadd b0, b1, b2        // encoding: [0x20,0x0c,0x22,0x5e]
-// CHECK: sqadd h10, h11, h12     // encoding: [0x6a,0x0d,0x6c,0x5e]
-// CHECK: sqadd s20, s21, s2      // encoding: [0xb4,0x0e,0xa2,0x5e]
-// CHECK: sqadd d17, d31, d8      // encoding: [0xf1,0x0f,0xe8,0x5e]
-
-//------------------------------------------------------------------------------
-// Scalar Integer Saturating Add (Unsigned)
-//------------------------------------------------------------------------------
-         uqadd b0, b1, b2
-         uqadd h10, h11, h12
-         uqadd s20, s21, s2
-         uqadd d17, d31, d8
-
-// CHECK: uqadd b0, b1, b2        // encoding: [0x20,0x0c,0x22,0x7e]
-// CHECK: uqadd h10, h11, h12     // encoding: [0x6a,0x0d,0x6c,0x7e]
-// CHECK: uqadd s20, s21, s2      // encoding: [0xb4,0x0e,0xa2,0x7e]
-// CHECK: uqadd d17, d31, d8      // encoding: [0xf1,0x0f,0xe8,0x7e]
-
-//------------------------------------------------------------------------------
-// Scalar Integer Saturating Sub (Signed)
-//------------------------------------------------------------------------------
-         sqsub b0, b1, b2
-         sqsub h10, h11, h12
-         sqsub s20, s21, s2
-         sqsub d17, d31, d8
-
-// CHECK: sqsub b0, b1, b2        // encoding: [0x20,0x2c,0x22,0x5e]
-// CHECK: sqsub h10, h11, h12     // encoding: [0x6a,0x2d,0x6c,0x5e]
-// CHECK: sqsub s20, s21, s2      // encoding: [0xb4,0x2e,0xa2,0x5e]
-// CHECK: sqsub d17, d31, d8      // encoding: [0xf1,0x2f,0xe8,0x5e]
-
-//------------------------------------------------------------------------------
-// Scalar Integer Saturating Sub (Unsigned)
-//------------------------------------------------------------------------------
-         uqsub b0, b1, b2
-         uqsub h10, h11, h12
-         uqsub s20, s21, s2
-         uqsub d17, d31, d8
-
-// CHECK: uqsub b0, b1, b2        // encoding: [0x20,0x2c,0x22,0x7e]
-// CHECK: uqsub h10, h11, h12     // encoding: [0x6a,0x2d,0x6c,0x7e]
-// CHECK: uqsub s20, s21, s2      // encoding: [0xb4,0x2e,0xa2,0x7e]
-// CHECK: uqsub d17, d31, d8      // encoding: [0xf1,0x2f,0xe8,0x7e]
 
diff --git a/test/MC/AArch64/neon-saturating-rounding-shift.s b/test/MC/AArch64/neon-saturating-rounding-shift.s
index a36e689..9215c1c 100644
--- a/test/MC/AArch64/neon-saturating-rounding-shift.s
+++ b/test/MC/AArch64/neon-saturating-rounding-shift.s
@@ -41,30 +41,3 @@
 // CHECK: uqrshl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x5c,0xa2,0x6e]
 // CHECK: uqrshl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x5c,0xe2,0x6e]
 
-//------------------------------------------------------------------------------
-// Scalar Integer Saturating Rounding Shift Lef (Signed)
-//------------------------------------------------------------------------------
-         sqrshl b0, b1, b2
-         sqrshl h10, h11, h12
-         sqrshl s20, s21, s2
-         sqrshl d17, d31, d8
-
-// CHECK: sqrshl b0, b1, b2        // encoding: [0x20,0x5c,0x22,0x5e]
-// CHECK: sqrshl h10, h11, h12     // encoding: [0x6a,0x5d,0x6c,0x5e]
-// CHECK: sqrshl s20, s21, s2      // encoding: [0xb4,0x5e,0xa2,0x5e]
-// CHECK: sqrshl d17, d31, d8      // encoding: [0xf1,0x5f,0xe8,0x5e]
-
-//------------------------------------------------------------------------------
-// Scalar Integer Saturating Rounding Shift Lef (Unsigned)
-//------------------------------------------------------------------------------
-         uqrshl b0, b1, b2
-         uqrshl h10, h11, h12
-         uqrshl s20, s21, s2
-         uqrshl d17, d31, d8
-
-// CHECK: uqrshl b0, b1, b2        // encoding: [0x20,0x5c,0x22,0x7e]
-// CHECK: uqrshl h10, h11, h12     // encoding: [0x6a,0x5d,0x6c,0x7e]
-// CHECK: uqrshl s20, s21, s2      // encoding: [0xb4,0x5e,0xa2,0x7e]
-// CHECK: uqrshl d17, d31, d8      // encoding: [0xf1,0x5f,0xe8,0x7e]
-
-
diff --git a/test/MC/AArch64/neon-saturating-shift.s b/test/MC/AArch64/neon-saturating-shift.s
index 2c8456d..9ae393a 100644
--- a/test/MC/AArch64/neon-saturating-shift.s
+++ b/test/MC/AArch64/neon-saturating-shift.s
@@ -41,29 +41,3 @@
 // CHECK: uqshl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x4c,0xa2,0x6e]
 // CHECK: uqshl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x4c,0xe2,0x6e]
 
-//------------------------------------------------------------------------------
-// Scalar Integer Saturating Shift Lef (Signed)
-//------------------------------------------------------------------------------
-         sqshl b0, b1, b2
-         sqshl h10, h11, h12
-         sqshl s20, s21, s2
-         sqshl d17, d31, d8
-
-// CHECK: sqshl b0, b1, b2        // encoding: [0x20,0x4c,0x22,0x5e]
-// CHECK: sqshl h10, h11, h12     // encoding: [0x6a,0x4d,0x6c,0x5e]
-// CHECK: sqshl s20, s21, s2      // encoding: [0xb4,0x4e,0xa2,0x5e]
-// CHECK: sqshl d17, d31, d8      // encoding: [0xf1,0x4f,0xe8,0x5e]
-
-//------------------------------------------------------------------------------
-// Scalar Integer Saturating Shift Lef (Unsigned)
-//------------------------------------------------------------------------------
-         uqshl b0, b1, b2
-         uqshl h10, h11, h12
-         uqshl s20, s21, s2
-         uqshl d17, d31, d8
-
-// CHECK: uqshl b0, b1, b2        // encoding: [0x20,0x4c,0x22,0x7e]
-// CHECK: uqshl h10, h11, h12     // encoding: [0x6a,0x4d,0x6c,0x7e]
-// CHECK: uqshl s20, s21, s2      // encoding: [0xb4,0x4e,0xa2,0x7e]
-// CHECK: uqshl d17, d31, d8      // encoding: [0xf1,0x4f,0xe8,0x7e]
-
diff --git a/test/MC/AArch64/neon-scalar-abs.s b/test/MC/AArch64/neon-scalar-abs.s
new file mode 100644
index 0000000..d08756c
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-abs.s
@@ -0,0 +1,35 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Scalar Absolute Value
+//----------------------------------------------------------------------
+
+    abs d29, d24
+
+// CHECK: abs d29, d24    // encoding: [0x1d,0xbb,0xe0,0x5e]
+        
+//----------------------------------------------------------------------
+// Scalar Floating-point Absolute Difference
+//----------------------------------------------------------------------
+
+    fabd s29, s24, s20
+    fabd d29, d24, d20
+
+// CHECK: fabd s29, s24, s20  // encoding: [0x1d,0xd7,0xb4,0x7e]
+// CHECK: fabd d29, d24, d20  // encoding: [0x1d,0xd7,0xf4,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Signed Saturating Absolute Value
+//----------------------------------------------------------------------
+
+    sqabs b19, b14
+    sqabs h21, h15
+    sqabs s20, s12
+    sqabs d18, d12
+
+// CHECK: sqabs b19, b14    // encoding: [0xd3,0x79,0x20,0x5e]
+// CHECK: sqabs h21, h15    // encoding: [0xf5,0x79,0x60,0x5e]
+// CHECK: sqabs s20, s12    // encoding: [0x94,0x79,0xa0,0x5e]
+// CHECK: sqabs d18, d12    // encoding: [0x92,0x79,0xe0,0x5e]
diff --git a/test/MC/AArch64/neon-scalar-add-sub.s b/test/MC/AArch64/neon-scalar-add-sub.s
new file mode 100644
index 0000000..0a3eba7
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-add-sub.s
@@ -0,0 +1,16 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Scalar Integer Add
+//------------------------------------------------------------------------------
+         add d31, d0, d16
+
+// CHECK: add d31, d0, d16       // encoding: [0x1f,0x84,0xf0,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Sub
+//------------------------------------------------------------------------------
+         sub d1, d7, d8
+
+// CHECK: sub d1, d7, d8       // encoding: [0xe1,0x84,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/neon-scalar-by-elem-mla.s b/test/MC/AArch64/neon-scalar-by-elem-mla.s
new file mode 100644
index 0000000..fec9d12
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-by-elem-mla.s
@@ -0,0 +1,44 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Floating Point fused multiply-add (scalar, by element)
+//------------------------------------------------------------------------------
+    fmla    s0, s1, v1.s[0]
+    fmla    s30, s11, v1.s[1]
+    fmla    s4, s5, v7.s[2]
+    fmla    s16, s22, v16.s[3]
+    fmla    d0, d1, v1.d[0]
+    fmla    d30, d11, v1.d[1]
+
+// CHECK: fmla    s0, s1, v1.s[0]         // encoding: [0x20,0x10,0x81,0x5f]
+// CHECK: fmla    s30, s11, v1.s[1]       // encoding: [0x7e,0x11,0xa1,0x5f]
+// CHECK: fmla    s4, s5, v7.s[2]         // encoding: [0xa4,0x18,0x87,0x5f]
+// CHECK: fmla    s16, s22, v16.s[3]      // encoding: [0xd0,0x1a,0xb0,0x5f]
+// CHECK: fmla    d0, d1, v1.d[0]         // encoding: [0x20,0x10,0xc1,0x5f]
+// CHECK: fmla    d30, d11, v1.d[1]       // encoding: [0x7e,0x19,0xc1,0x5f]
+ 
+//------------------------------------------------------------------------------
+// Floating Point fused multiply-subtract (scalar, by element)
+//------------------------------------------------------------------------------
+
+    fmls    s2, s3, v4.s[0]
+    fmls    s29, s10, v28.s[1]      
+    fmls    s5, s12, v23.s[2]       
+    fmls    s7, s17, v26.s[3]       
+    fmls    d0, d1, v1.d[0]         
+    fmls    d30, d11, v1.d[1]       
+
+// CHECK: fmls    s2, s3, v4.s[0]     // encoding: [0x62,0x50,0x84,0x5f]
+// CHECK: fmls    s29, s10, v28.s[1]  // encoding: [0x5d,0x51,0xbc,0x5f]
+// CHECK: fmls    s5, s12, v23.s[2]   // encoding: [0x85,0x59,0x97,0x5f]
+// CHECK: fmls    s7, s17, v26.s[3]   // encoding: [0x27,0x5a,0xba,0x5f]
+// CHECK: fmls    d0, d1, v1.d[0]     // encoding: [0x20,0x50,0xc1,0x5f]
+// CHECK: fmls    d30, d11, v1.d[1]   // encoding: [0x7e,0x59,0xc1,0x5f]
+
+        
+        
+        
+
+        
+        
+
diff --git a/test/MC/AArch64/neon-scalar-by-elem-mul.s b/test/MC/AArch64/neon-scalar-by-elem-mul.s
new file mode 100644
index 0000000..8b8a3f5
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-by-elem-mul.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Floating Point  multiply (scalar, by element)
+//------------------------------------------------------------------------------
+    fmul    s0, s1, v1.s[0]
+    fmul    s30, s11, v1.s[1]
+    fmul    s4, s5, v7.s[2]
+    fmul    s16, s22, v16.s[3]
+    fmul    d0, d1, v1.d[0]
+    fmul    d30, d11, v1.d[1]
+
+// CHECK: fmul    s0, s1, v1.s[0]      // encoding: [0x20,0x90,0x81,0x5f]
+// CHECK: fmul    s30, s11, v1.s[1]    // encoding: [0x7e,0x91,0xa1,0x5f]
+// CHECK: fmul    s4, s5, v7.s[2]      // encoding: [0xa4,0x98,0x87,0x5f]
+// CHECK: fmul    s16, s22, v16.s[3]   // encoding: [0xd0,0x9a,0xb0,0x5f]
+// CHECK: fmul    d0, d1, v1.d[0]      // encoding: [0x20,0x90,0xc1,0x5f]
+// CHECK: fmul    d30, d11, v1.d[1]    // encoding: [0x7e,0x99,0xc1,0x5f]
+
+
+//------------------------------------------------------------------------------
+// Floating Point  multiply extended (scalar, by element)
+//------------------------------------------------------------------------------
+    fmulx   s6, s2, v8.s[0]
+    fmulx   s7, s3, v13.s[1]
+    fmulx   s9, s7, v9.s[2]
+    fmulx   s13, s21, v10.s[3]
+    fmulx   d15, d9, v7.d[0]
+    fmulx   d13, d12, v11.d[1]
+
+// CHECK: fmulx   s6, s2, v8.s[0]         // encoding: [0x46,0x90,0x88,0x7f]
+// CHECK: fmulx   s7, s3, v13.s[1]        // encoding: [0x67,0x90,0xad,0x7f]
+// CHECK: fmulx   s9, s7, v9.s[2]         // encoding: [0xe9,0x98,0x89,0x7f]
+// CHECK: fmulx   s13, s21, v10.s[3]      // encoding: [0xad,0x9a,0xaa,0x7f]
+// CHECK: fmulx   d15, d9, v7.d[0]        // encoding: [0x2f,0x91,0xc7,0x7f]
+// CHECK: fmulx   d13, d12, v11.d[1]      // encoding: [0x8d,0x99,0xcb,0x7f]
+
diff --git a/test/MC/AArch64/neon-scalar-by-elem-saturating-mla.s b/test/MC/AArch64/neon-scalar-by-elem-saturating-mla.s
new file mode 100644
index 0000000..e3d7e05
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-by-elem-saturating-mla.s
@@ -0,0 +1,46 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//-----------------------------------------------------------------------------
+// Signed saturating doubling multiply-add long (scalar, by element)
+//-----------------------------------------------------------------------------
+    sqdmlal s0, h0, v0.h[0]
+    sqdmlal s7, h1, v4.h[3]
+    sqdmlal s11, h16, v8.h[4]
+    sqdmlal s30, h30, v15.h[7]
+    sqdmlal d0, s0, v3.s[0]
+    sqdmlal d30, s30, v30.s[3]
+    sqdmlal d8, s9, v14.s[1]
+
+// CHECK: sqdmlal s0, h0, v0.h[0]       // encoding: [0x00,0x30,0x40,0x5f]
+// CHECK: sqdmlal s7, h1, v4.h[3]       // encoding: [0x27,0x30,0x74,0x5f]
+// CHECK: sqdmlal s11, h16, v8.h[4]     // encoding: [0x0b,0x3a,0x48,0x5f]
+// CHECK: sqdmlal s30, h30, v15.h[7]    // encoding: [0xde,0x3b,0x7f,0x5f]
+// CHECK: sqdmlal d0, s0, v3.s[0]       // encoding: [0x00,0x30,0x83,0x5f]
+// CHECK: sqdmlal d30, s30, v30.s[3]    // encoding: [0xde,0x3b,0xbe,0x5f]
+// CHECK: sqdmlal d8, s9, v14.s[1]      // encoding: [0x28,0x31,0xae,0x5f]
+ 
+//-----------------------------------------------------------------------------
+// Signed saturating doubling multiply-subtract long (scalar, by element)
+//-----------------------------------------------------------------------------
+    sqdmlsl s1, h1, v1.h[0]
+    sqdmlsl s8, h2, v5.h[1]
+    sqdmlsl s12, h13, v14.h[2]
+    sqdmlsl s29, h28, v11.h[7]
+    sqdmlsl d1, s1, v13.s[0]
+    sqdmlsl d31, s31, v31.s[2]
+    sqdmlsl d16, s18, v28.s[3]
+
+// CHECK: sqdmlsl s1, h1, v1.h[0]       // encoding: [0x21,0x70,0x41,0x5f]
+// CHECK: sqdmlsl s8, h2, v5.h[1]       // encoding: [0x48,0x70,0x55,0x5f]
+// CHECK: sqdmlsl s12, h13, v14.h[2]    // encoding: [0xac,0x71,0x6e,0x5f]
+// CHECK: sqdmlsl s29, h28, v11.h[7]    // encoding: [0x9d,0x7b,0x7b,0x5f]
+// CHECK: sqdmlsl d1, s1, v13.s[0]      // encoding: [0x21,0x70,0x8d,0x5f]
+// CHECK: sqdmlsl d31, s31, v31.s[2]    // encoding: [0xff,0x7b,0x9f,0x5f]
+// CHECK: sqdmlsl d16, s18, v28.s[3]    // encoding: [0x50,0x7a,0xbc,0x5f]
+
+
+        
+
+        
+        
+
diff --git a/test/MC/AArch64/neon-scalar-by-elem-saturating-mul.s b/test/MC/AArch64/neon-scalar-by-elem-saturating-mul.s
new file mode 100644
index 0000000..8a8405e
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-by-elem-saturating-mul.s
@@ -0,0 +1,58 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//-----------------------------------------------------------------------------
+// Signed saturating doubling multiply long (scalar, by element)
+//-----------------------------------------------------------------------------
+    sqdmull s1, h1, v1.h[1]
+    sqdmull s8, h2, v5.h[2]
+    sqdmull s12, h17, v9.h[3]
+    sqdmull s31, h31, v15.h[7]
+    sqdmull d1, s1, v4.s[0]
+    sqdmull d31, s31, v31.s[3]
+    sqdmull d9, s10, v15.s[0]
+
+
+// CHECK: sqdmull s1, h1, v1.h[1]       // encoding: [0x21,0xb0,0x51,0x5f]
+// CHECK: sqdmull s8, h2, v5.h[2]       // encoding: [0x48,0xb0,0x65,0x5f]
+// CHECK: sqdmull s12, h17, v9.h[3]     // encoding: [0x2c,0xb2,0x79,0x5f]
+// CHECK: sqdmull s31, h31, v15.h[7]    // encoding: [0xff,0xbb,0x7f,0x5f]
+// CHECK: sqdmull d1, s1, v4.s[0]       // encoding: [0x21,0xb0,0x84,0x5f]
+// CHECK: sqdmull d31, s31, v31.s[3]    // encoding: [0xff,0xbb,0xbf,0x5f]
+// CHECK: sqdmull d9, s10, v15.s[0]     // encoding: [0x49,0xb1,0x8f,0x5f]
+ 
+//-----------------------------------------------------------------------------
+// Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+//-----------------------------------------------------------------------------
+    sqdmulh h0, h1, v0.h[0]
+    sqdmulh h10, h11, v10.h[4]
+    sqdmulh h20, h21, v15.h[7]
+    sqdmulh s25, s26, v27.s[3]
+    sqdmulh s2, s6, v7.s[0]
+
+// CHECK: sqdmulh h0, h1, v0.h[0]       // encoding: [0x20,0xc0,0x40,0x5f]
+// CHECK: sqdmulh h10, h11, v10.h[4]    // encoding: [0x6a,0xc9,0x4a,0x5f]
+// CHECK: sqdmulh h20, h21, v15.h[7]    // encoding: [0xb4,0xca,0x7f,0x5f]
+// CHECK: sqdmulh s25, s26, v27.s[3]    // encoding: [0x59,0xcb,0xbb,0x5f]
+// CHECK: sqdmulh s2, s6, v7.s[0]       // encoding: [0xc2,0xc0,0x87,0x5f]
+
+//-----------------------------------------------------------------------------
+// Signed saturating rounding doubling multiply returning
+// high half (scalar, by element)
+//-----------------------------------------------------------------------------
+    sqrdmulh h31, h30, v14.h[2]
+    sqrdmulh h1, h1, v1.h[4]
+    sqrdmulh h21, h22, v15.h[7]
+    sqrdmulh s5, s6, v7.s[2]
+    sqrdmulh s20, s26, v27.s[1]
+
+// CHECK: sqrdmulh h31, h30, v14.h[2]   // encoding: [0xdf,0xd3,0x6e,0x5f]
+// CHECK: sqrdmulh h1, h1, v1.h[4]      // encoding: [0x21,0xd8,0x41,0x5f]
+// CHECK: sqrdmulh h21, h22, v15.h[7]   // encoding: [0xd5,0xda,0x7f,0x5f]
+// CHECK: sqrdmulh s5, s6, v7.s[2]      // encoding: [0xc5,0xd8,0x87,0x5f]
+// CHECK: sqrdmulh s20, s26, v27.s[1]   // encoding: [0x54,0xd3,0xbb,0x5f]
+        
+
+        
+        
+
diff --git a/test/MC/AArch64/neon-scalar-compare.s b/test/MC/AArch64/neon-scalar-compare.s
new file mode 100644
index 0000000..55ade0e
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-compare.s
@@ -0,0 +1,90 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Scalar Compare Bitwise Equal
+//----------------------------------------------------------------------
+
+         cmeq d20, d21, d22
+
+// CHECK: cmeq d20, d21, d22   // encoding: [0xb4,0x8e,0xf6,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Compare Bitwise Equal To Zero
+//----------------------------------------------------------------------
+
+         cmeq d20, d21, #0x0
+
+// CHECK: cmeq d20, d21, #0x0   // encoding: [0xb4,0x9a,0xe0,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Compare Unsigned Higher Or Same
+//----------------------------------------------------------------------
+
+         cmhs d20, d21, d22
+
+// CHECK: cmhs d20, d21, d22   // encoding: [0xb4,0x3e,0xf6,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Compare Signed Greather Than Or Equal
+//----------------------------------------------------------------------
+
+         cmge d20, d21, d22
+
+// CHECK: cmge d20, d21, d22    // encoding: [0xb4,0x3e,0xf6,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Compare Signed Greather Than Or Equal To Zero
+//----------------------------------------------------------------------
+
+         cmge d20, d21, #0x0
+
+// CHECK: cmge d20, d21, #0x0   // encoding: [0xb4,0x8a,0xe0,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Compare Unsigned Higher
+//----------------------------------------------------------------------
+
+         cmhi d20, d21, d22
+
+// CHECK: cmhi d20, d21, d22   // encoding: [0xb4,0x36,0xf6,0x7e]
+//----------------------------------------------------------------------
+// Scalar Compare Signed Greater Than
+//----------------------------------------------------------------------
+
+         cmgt d20, d21, d22
+
+// CHECK: cmgt d20, d21, d22   // encoding: [0xb4,0x36,0xf6,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Compare Signed Greater Than Zero
+//----------------------------------------------------------------------
+
+         cmgt d20, d21, #0x0
+
+// CHECK: cmgt d20, d21, #0x0   // encoding: [0xb4,0x8a,0xe0,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Compare Signed Less Than Or Equal To Zero
+//----------------------------------------------------------------------
+
+         cmle d20, d21, #0x0
+
+// CHECK: cmle d20, d21, #0x0   // encoding: [0xb4,0x9a,0xe0,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Compare Less Than Zero
+//----------------------------------------------------------------------
+
+         cmlt d20, d21, #0x0
+
+// CHECK: cmlt d20, d21, #0x0   // encoding: [0xb4,0xaa,0xe0,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Compare Bitwise Test Bits
+//----------------------------------------------------------------------
+
+         cmtst d20, d21, d22
+
+// CHECK: cmtst d20, d21, d22   // encoding: [0xb4,0x8e,0xf6,0x5e]
diff --git a/test/MC/AArch64/neon-scalar-cvt.s b/test/MC/AArch64/neon-scalar-cvt.s
new file mode 100644
index 0000000..97416da
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-cvt.s
@@ -0,0 +1,181 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Scalar Signed Integer Convert To Floating-point
+//----------------------------------------------------------------------
+
+    scvtf s22, s13
+    scvtf d21, d12
+
+// CHECK: scvtf s22, s13    // encoding: [0xb6,0xd9,0x21,0x5e]
+// CHECK: scvtf d21, d12    // encoding: [0x95,0xd9,0x61,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Integer Convert To Floating-point
+//----------------------------------------------------------------------
+
+    ucvtf s22, s13
+    ucvtf d21, d14
+
+// CHECK: ucvtf s22, s13    // encoding: [0xb6,0xd9,0x21,0x7e]
+// CHECK: ucvtf d21, d14    // encoding: [0xd5,0xd9,0x61,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
+//----------------------------------------------------------------------
+
+    scvtf s22, s13, #32
+    scvtf d21, d12, #64
+
+// CHECK: scvtf s22, s13, #32  // encoding: [0xb6,0xe5,0x20,0x5f]
+// CHECK: scvtf d21, d12, #64  // encoding: [0x95,0xe5,0x40,0x5f]    
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
+//----------------------------------------------------------------------
+
+    ucvtf s22, s13, #32
+    ucvtf d21, d14, #64
+
+// CHECK: ucvtf s22, s13, #32  // encoding: [0xb6,0xe5,0x20,0x7f]
+// CHECK: ucvtf d21, d14, #64  // encoding: [0xd5,0xe5,0x40,0x7f]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Fixed-point (Immediate)
+//----------------------------------------------------------------------
+
+    fcvtzs s21, s12, #1
+    fcvtzs d21, d12, #1
+
+// CHECK: fcvtzs s21, s12, #1  // encoding: [0x95,0xfd,0x3f,0x5f]
+// CHECK: fcvtzs d21, d12, #1  // encoding: [0x95,0xfd,0x7f,0x5f]
+        
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
+//----------------------------------------------------------------------
+
+    fcvtzu s21, s12, #1
+    fcvtzu d21, d12, #1
+
+// CHECK: fcvtzu s21, s12, #1  // encoding: [0x95,0xfd,0x3f,0x7f]
+// CHECK: fcvtzu d21, d12, #1  // encoding: [0x95,0xfd,0x7f,0x7f]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Lower Precision Narrow, Rounding To
+// Odd
+//----------------------------------------------------------------------
+
+    fcvtxn s22, d13
+
+// CHECK: fcvtxn s22, d13    // encoding: [0xb6,0x69,0x61,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Integer, Rounding To Nearest
+// With Ties To Away
+//----------------------------------------------------------------------
+
+    fcvtas s12, s13
+    fcvtas d21, d14
+
+// CHECK: fcvtas s12, s13    // encoding: [0xac,0xc9,0x21,0x5e]
+// CHECK: fcvtas d21, d14    // encoding: [0xd5,0xc9,0x61,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Integer, Rounding To
+// Nearest With Ties To Away
+//----------------------------------------------------------------------
+
+    fcvtau s12, s13
+    fcvtau d21, d14
+
+// CHECK: fcvtau s12, s13    // encoding: [0xac,0xc9,0x21,0x7e]
+// CHECK: fcvtau d21, d14    // encoding: [0xd5,0xc9,0x61,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Integer, Rounding Toward
+// Minus Infinity
+//----------------------------------------------------------------------
+
+    fcvtms s22, s13
+    fcvtms d21, d14
+
+// CHECK: fcvtms s22, s13    // encoding: [0xb6,0xb9,0x21,0x5e]
+// CHECK: fcvtms d21, d14    // encoding: [0xd5,0xb9,0x61,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Integer, Rounding Toward
+// Minus Infinity
+//----------------------------------------------------------------------
+
+    fcvtmu s12, s13
+    fcvtmu d21, d14
+
+// CHECK: fcvtmu s12, s13    // encoding: [0xac,0xb9,0x21,0x7e]
+// CHECK: fcvtmu d21, d14    // encoding: [0xd5,0xb9,0x61,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Integer, Rounding To Nearest
+// With Ties To Even
+//----------------------------------------------------------------------
+
+    fcvtns s22, s13
+    fcvtns d21, d14
+
+// CHECK: fcvtns s22, s13    // encoding: [0xb6,0xa9,0x21,0x5e]
+// CHECK: fcvtns d21, d14    // encoding: [0xd5,0xa9,0x61,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Integer, Rounding To
+// Nearest With Ties To Even
+//----------------------------------------------------------------------
+
+    fcvtnu s12, s13
+    fcvtnu d21, d14
+
+// CHECK: fcvtnu s12, s13    // encoding: [0xac,0xa9,0x21,0x7e]
+// CHECK: fcvtnu d21, d14    // encoding: [0xd5,0xa9,0x61,0x7e]
+        
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Integer, Rounding Toward
+// Positive Infinity
+//----------------------------------------------------------------------
+
+    fcvtps s22, s13
+    fcvtps d21, d14
+
+// CHECK: fcvtps s22, s13    // encoding: [0xb6,0xa9,0xa1,0x5e]
+// CHECK: fcvtps d21, d14    // encoding: [0xd5,0xa9,0xe1,0x5e]
+        
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Integer, Rounding Toward
+// Positive Infinity
+//----------------------------------------------------------------------
+
+    fcvtpu s12, s13
+    fcvtpu d21, d14
+
+// CHECK: fcvtpu s12, s13    // encoding: [0xac,0xa9,0xa1,0x7e]
+// CHECK: fcvtpu d21, d14    // encoding: [0xd5,0xa9,0xe1,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Signed Integer, Rounding Toward Zero
+//----------------------------------------------------------------------
+
+    fcvtzs s12, s13
+    fcvtzs d21, d14
+
+// CHECK: fcvtzs s12, s13    // encoding: [0xac,0xb9,0xa1,0x5e]
+// CHECK: fcvtzs d21, d14    // encoding: [0xd5,0xb9,0xe1,0x5e]
+        
+//----------------------------------------------------------------------
+// Scalar Floating-point Convert To Unsigned Integer, Rounding Toward 
+// Zero
+//----------------------------------------------------------------------
+
+    fcvtzu s12, s13
+    fcvtzu d21, d14
+
+// CHECK: fcvtzu s12, s13    // encoding: [0xac,0xb9,0xa1,0x7e]
+// CHECK: fcvtzu d21, d14    // encoding: [0xd5,0xb9,0xe1,0x7e]
diff --git a/test/MC/AArch64/neon-scalar-dup.s b/test/MC/AArch64/neon-scalar-dup.s
new file mode 100644
index 0000000..77c638d
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-dup.s
@@ -0,0 +1,55 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Duplicate element (scalar)
+//------------------------------------------------------------------------------
+         dup b0, v0.b[15]
+         dup b1, v0.b[7]
+         dup b17, v0.b[0]
+         dup h5, v31.h[7]
+         dup h9, v1.h[4]
+         dup h11, v17.h[0]
+         dup s2, v2.s[3]
+         dup s4, v21.s[0]
+         dup s31, v21.s[2]
+         dup d3, v5.d[0]
+         dup d6, v5.d[1]
+
+// CHECK: dup b0, v0.b[15]      // encoding: [0x00,0x04,0x1f,0x5e]
+// CHECK: dup b1, v0.b[7]       // encoding: [0x01,0x04,0x0f,0x5e]
+// CHECK: dup b17, v0.b[0]      // encoding: [0x11,0x04,0x01,0x5e]
+// CHECK: dup h5, v31.h[7]      // encoding: [0xe5,0x07,0x1e,0x5e]
+// CHECK: dup h9, v1.h[4]       // encoding: [0x29,0x04,0x12,0x5e]
+// CHECK: dup h11, v17.h[0]     // encoding: [0x2b,0x06,0x02,0x5e]
+// CHECK: dup s2, v2.s[3]       // encoding: [0x42,0x04,0x1c,0x5e]
+// CHECK: dup s4, v21.s[0]      // encoding: [0xa4,0x06,0x04,0x5e]
+// CHECK: dup s31, v21.s[2]     // encoding: [0xbf,0x06,0x14,0x5e]
+// CHECK: dup d3, v5.d[0]       // encoding: [0xa3,0x04,0x08,0x5e]
+// CHECK: dup d6, v5.d[1]       // encoding: [0xa6,0x04,0x18,0x5e]
+
+//------------------------------------------------------------------------------
+// Aliases for Duplicate element (scalar)
+//------------------------------------------------------------------------------
+         mov b0, v0.b[15]
+         mov b1, v0.b[7]
+         mov b17, v0.b[0]
+         mov h5, v31.h[7]
+         mov h9, v1.h[4]
+         mov h11, v17.h[0]
+         mov s2, v2.s[3]
+         mov s4, v21.s[0]
+         mov s31, v21.s[2]
+         mov d3, v5.d[0]
+         mov d6, v5.d[1]
+
+// CHECK: dup b0, v0.b[15]      // encoding: [0x00,0x04,0x1f,0x5e]
+// CHECK: dup b1, v0.b[7]       // encoding: [0x01,0x04,0x0f,0x5e]
+// CHECK: dup b17, v0.b[0]      // encoding: [0x11,0x04,0x01,0x5e]
+// CHECK: dup h5, v31.h[7]      // encoding: [0xe5,0x07,0x1e,0x5e]
+// CHECK: dup h9, v1.h[4]       // encoding: [0x29,0x04,0x12,0x5e]
+// CHECK: dup h11, v17.h[0]     // encoding: [0x2b,0x06,0x02,0x5e]
+// CHECK: dup s2, v2.s[3]       // encoding: [0x42,0x04,0x1c,0x5e]
+// CHECK: dup s4, v21.s[0]      // encoding: [0xa4,0x06,0x04,0x5e]
+// CHECK: dup s31, v21.s[2]     // encoding: [0xbf,0x06,0x14,0x5e]
+// CHECK: dup d3, v5.d[0]       // encoding: [0xa3,0x04,0x08,0x5e]
+// CHECK: dup d6, v5.d[1]       // encoding: [0xa6,0x04,0x18,0x5e]
diff --git a/test/MC/AArch64/neon-scalar-extract-narrow.s b/test/MC/AArch64/neon-scalar-extract-narrow.s
new file mode 100644
index 0000000..e25224e
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-extract-narrow.s
@@ -0,0 +1,40 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Scalar Signed Saturating Extract Unsigned Narrow
+//----------------------------------------------------------------------
+
+    sqxtun b19, h14
+    sqxtun h21, s15
+    sqxtun s20, d12
+
+// CHECK: sqxtun b19, h14  // encoding: [0xd3,0x29,0x21,0x7e]
+// CHECK: sqxtun h21, s15  // encoding: [0xf5,0x29,0x61,0x7e]
+// CHECK: sqxtun s20, d12  // encoding: [0x94,0x29,0xa1,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Signed Saturating Extract Signed Narrow
+//----------------------------------------------------------------------
+
+    sqxtn b18, h18
+    sqxtn h20, s17
+    sqxtn s19, d14
+
+// CHECK: sqxtn b18, h18  // encoding: [0x52,0x4a,0x21,0x5e]
+// CHECK: sqxtn h20, s17  // encoding: [0x34,0x4a,0x61,0x5e]
+// CHECK: sqxtn s19, d14  // encoding: [0xd3,0x49,0xa1,0x5e]
+
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Saturating Extract Narrow
+//----------------------------------------------------------------------
+
+    uqxtn b18, h18
+    uqxtn h20, s17
+    uqxtn s19, d14
+
+// CHECK: uqxtn b18, h18  // encoding: [0x52,0x4a,0x21,0x7e]
+// CHECK: uqxtn h20, s17  // encoding: [0x34,0x4a,0x61,0x7e]
+// CHECK: uqxtn s19, d14  // encoding: [0xd3,0x49,0xa1,0x7e]
diff --git a/test/MC/AArch64/neon-scalar-fp-compare.s b/test/MC/AArch64/neon-scalar-fp-compare.s
new file mode 100644
index 0000000..a59ec0d
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-fp-compare.s
@@ -0,0 +1,103 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Equal
+//----------------------------------------------------------------------
+
+         fcmeq s10, s11, s12
+         fcmeq d20, d21, d22
+
+// CHECK: fcmeq s10, s11, s12   // encoding: [0x6a,0xe5,0x2c,0x5e]
+// CHECK: fcmeq d20, d21, d22   // encoding: [0xb4,0xe6,0x76,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Equal To Zero
+//----------------------------------------------------------------------
+
+         fcmeq s10, s11, #0.0
+         fcmeq d20, d21, #0.0
+
+// CHECK: fcmeq s10, s11, #0.0   // encoding: [0x6a,0xd9,0xa0,0x5e]
+// CHECK: fcmeq d20, d21, #0.0   // encoding: [0xb4,0xda,0xe0,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Greater Than Or Equal
+//----------------------------------------------------------------------
+
+         fcmge s10, s11, s12
+         fcmge d20, d21, d22
+
+// CHECK: fcmge s10, s11, s12   // encoding: [0x6a,0xe5,0x2c,0x7e]
+// CHECK: fcmge d20, d21, d22   // encoding: [0xb4,0xe6,0x76,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
+//----------------------------------------------------------------------
+
+         fcmge s10, s11, #0.0
+         fcmge d20, d21, #0.0
+
+// CHECK: fcmge s10, s11, #0.0   // encoding: [0x6a,0xc9,0xa0,0x7e]
+// CHECK: fcmge d20, d21, #0.0   // encoding: [0xb4,0xca,0xe0,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Greather Than
+//----------------------------------------------------------------------
+
+         fcmgt s10, s11, s12
+         fcmgt d20, d21, d22
+
+// CHECK: fcmgt s10, s11, s12   // encoding: [0x6a,0xe5,0xac,0x7e]
+// CHECK: fcmgt d20, d21, d22   // encoding: [0xb4,0xe6,0xf6,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Greather Than Zero
+//----------------------------------------------------------------------
+
+         fcmgt s10, s11, #0.0
+         fcmgt d20, d21, #0.0
+
+// CHECK: fcmgt s10, s11, #0.0   // encoding: [0x6a,0xc9,0xa0,0x5e]
+// CHECK: fcmgt d20, d21, #0.0   // encoding: [0xb4,0xca,0xe0,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
+//----------------------------------------------------------------------
+
+         fcmle s10, s11, #0.0
+         fcmle d20, d21, #0.0
+
+// CHECK: fcmle s10, s11, #0.0   // encoding: [0x6a,0xd9,0xa0,0x7e]
+// CHECK: fcmle d20, d21, #0.0   // encoding: [0xb4,0xda,0xe0,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Compare Mask Less Than
+//----------------------------------------------------------------------
+
+         fcmlt s10, s11, #0.0
+         fcmlt d20, d21, #0.0
+
+// CHECK: fcmlt s10, s11, #0.0   // encoding: [0x6a,0xe9,0xa0,0x5e]
+// CHECK: fcmlt d20, d21, #0.0   // encoding: [0xb4,0xea,0xe0,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
+//----------------------------------------------------------------------
+
+         facge s10, s11, s12
+         facge d20, d21, d22
+
+// CHECK: facge s10, s11, s12    // encoding: [0x6a,0xed,0x2c,0x7e]
+// CHECK: facge d20, d21, d22    // encoding: [0xb4,0xee,0x76,0x7e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Absolute Compare Mask Greater Than
+//----------------------------------------------------------------------
+
+         facgt s10, s11, s12
+         facgt d20, d21, d22
+
+// CHECK: facgt s10, s11, s12   // encoding: [0x6a,0xed,0xac,0x7e]
+// CHECK: facgt d20, d21, d22   // encoding: [0xb4,0xee,0xf6,0x7e]
diff --git a/test/MC/AArch64/neon-scalar-mul.s b/test/MC/AArch64/neon-scalar-mul.s
new file mode 100644
index 0000000..e33bdad
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-mul.s
@@ -0,0 +1,63 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Doubling Multiply Half High
+//----------------------------------------------------------------------
+
+    sqdmulh h10, h11, h12
+    sqdmulh s20, s21, s2
+        
+// CHECK: sqdmulh h10, h11, h12     // encoding: [0x6a,0xb5,0x6c,0x5e]
+// CHECK: sqdmulh s20, s21, s2      // encoding: [0xb4,0xb6,0xa2,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Doubling Multiply Half High
+//----------------------------------------------------------------------
+
+    sqrdmulh h10, h11, h12
+    sqrdmulh s20, s21, s2
+        
+// CHECK: sqrdmulh h10, h11, h12     // encoding: [0x6a,0xb5,0x6c,0x7e]
+// CHECK: sqrdmulh s20, s21, s2      // encoding: [0xb4,0xb6,0xa2,0x7e]
+
+//----------------------------------------------------------------------
+// Floating-point Multiply Extended
+//----------------------------------------------------------------------
+
+    fmulx s20, s22, s15
+    fmulx d23, d11, d1
+
+// CHECK: fmulx s20, s22, s15   // encoding: [0xd4,0xde,0x2f,0x5e]
+// CHECK: fmulx d23, d11, d1    // encoding: [0x77,0xdd,0x61,0x5e]
+
+//----------------------------------------------------------------------
+// Signed Saturating Doubling Multiply-Add Long
+//----------------------------------------------------------------------
+
+    sqdmlal s17, h27, h12
+    sqdmlal d19, s24, s12
+
+// CHECK: sqdmlal s17, h27, h12  // encoding: [0x71,0x93,0x6c,0x5e]
+// CHECK: sqdmlal d19, s24, s12  // encoding: [0x13,0x93,0xac,0x5e]
+
+//----------------------------------------------------------------------
+// Signed Saturating Doubling Multiply-Subtract Long
+//----------------------------------------------------------------------
+
+    sqdmlsl s14, h12, h25
+    sqdmlsl d12, s23, s13
+
+// CHECK: sqdmlsl s14, h12, h25  // encoding: [0x8e,0xb1,0x79,0x5e]
+// CHECK: sqdmlsl d12, s23, s13  // encoding: [0xec,0xb2,0xad,0x5e]
+
+//----------------------------------------------------------------------
+// Signed Saturating Doubling Multiply Long
+//----------------------------------------------------------------------
+
+    sqdmull s12, h22, h12
+    sqdmull d15, s22, s12
+
+// CHECK: sqdmull s12, h22, h12  // encoding: [0xcc,0xd2,0x6c,0x5e]
+// CHECK: sqdmull d15, s22, s12  // encoding: [0xcf,0xd2,0xac,0x5e]
diff --git a/test/MC/AArch64/neon-scalar-neg.s b/test/MC/AArch64/neon-scalar-neg.s
new file mode 100644
index 0000000..8e5d61d
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-neg.s
@@ -0,0 +1,25 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Scalar Negate
+//----------------------------------------------------------------------
+
+    neg d29, d24
+
+// CHECK: neg d29, d24    // encoding: [0x1d,0xbb,0xe0,0x7e]
+        
+//----------------------------------------------------------------------
+// Scalar Signed Saturating Negate
+//----------------------------------------------------------------------
+
+    sqneg b19, b14
+    sqneg h21, h15
+    sqneg s20, s12
+    sqneg d18, d12
+
+// CHECK: sqneg b19, b14    // encoding: [0xd3,0x79,0x20,0x7e]
+// CHECK: sqneg h21, h15    // encoding: [0xf5,0x79,0x60,0x7e]
+// CHECK: sqneg s20, s12    // encoding: [0x94,0x79,0xa0,0x7e]
+// CHECK: sqneg d18, d12    // encoding: [0x92,0x79,0xe0,0x7e]
diff --git a/test/MC/AArch64/neon-scalar-recip.s b/test/MC/AArch64/neon-scalar-recip.s
new file mode 100644
index 0000000..7a886f3
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-recip.s
@@ -0,0 +1,53 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Floating-point Reciprocal Step
+//----------------------------------------------------------------------
+
+    frecps s21, s16, s13
+    frecps d22, d30, d21
+
+// CHECK: frecps s21, s16, s13   // encoding: [0x15,0xfe,0x2d,0x5e]
+// CHECK: frecps d22, d30, d21   // encoding: [0xd6,0xff,0x75,0x5e]
+
+//----------------------------------------------------------------------
+// Floating-point Reciprocal Square Root Step
+//----------------------------------------------------------------------
+
+    frsqrts s21, s5, s12
+    frsqrts d8, d22, d18
+
+// CHECK: frsqrts s21, s5, s12   // encoding: [0xb5,0xfc,0xac,0x5e]
+// CHECK: frsqrts d8, d22, d18   // encoding: [0xc8,0xfe,0xf2,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Reciprocal Estimate
+//----------------------------------------------------------------------
+
+    frecpe s19, s14
+    frecpe d13, d13
+
+// CHECK: frecpe s19, s14    // encoding: [0xd3,0xd9,0xa1,0x5e]
+// CHECK: frecpe d13, d13    // encoding: [0xad,0xd9,0xe1,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Reciprocal Exponent
+//----------------------------------------------------------------------
+
+    frecpx s18, s10
+    frecpx d16, d19
+
+// CHECK: frecpx s18, s10    // encoding: [0x52,0xf9,0xa1,0x5e]
+// CHECK: frecpx d16, d19    // encoding: [0x70,0xfa,0xe1,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Floating-point Reciprocal Square Root Estimate
+//----------------------------------------------------------------------
+
+    frsqrte s22, s13
+    frsqrte d21, d12
+
+// CHECK: frsqrte s22, s13    // encoding: [0xb6,0xd9,0xa1,0x7e]
+// CHECK: frsqrte d21, d12    // encoding: [0x95,0xd9,0xe1,0x7e]
diff --git a/test/MC/AArch64/neon-scalar-reduce-pairwise.s b/test/MC/AArch64/neon-scalar-reduce-pairwise.s
new file mode 100644
index 0000000..403a940
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-reduce-pairwise.s
@@ -0,0 +1,16 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//----------------------------------------------------------------------
+// Scalar Reduce Add Pairwise (Integer)
+//----------------------------------------------------------------------
+      addp d0, v1.2d
+
+// CHECK: addp d0, v1.2d     // encoding: [0x20,0xb8,0xf1,0x5e]
+
+//----------------------------------------------------------------------
+// Scalar Reduce Add Pairwise (Floating Point)
+//----------------------------------------------------------------------
+      faddp d20, v1.2d
+
+// CHECK: faddp d20, v1.2d     // encoding: [0x34,0xd8,0x70,0x7e]
+
diff --git a/test/MC/AArch64/neon-scalar-rounding-shift.s b/test/MC/AArch64/neon-scalar-rounding-shift.s
new file mode 100644
index 0000000..6113e09
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-rounding-shift.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+
+//------------------------------------------------------------------------------
+// Scalar Integer Rounding Shift Lef (Signed)
+//------------------------------------------------------------------------------
+         srshl d17, d31, d8
+
+// CHECK: srshl d17, d31, d8      // encoding: [0xf1,0x57,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Rounding Shift Lef (Unsigned)
+//------------------------------------------------------------------------------
+         urshl d17, d31, d8
+
+// CHECK: urshl d17, d31, d8      // encoding: [0xf1,0x57,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/neon-scalar-saturating-add-sub.s b/test/MC/AArch64/neon-scalar-saturating-add-sub.s
new file mode 100644
index 0000000..0bf2434
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-saturating-add-sub.s
@@ -0,0 +1,81 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Add (Signed)
+//------------------------------------------------------------------------------
+         sqadd b0, b1, b2
+         sqadd h10, h11, h12
+         sqadd s20, s21, s2
+         sqadd d17, d31, d8
+
+// CHECK: sqadd b0, b1, b2        // encoding: [0x20,0x0c,0x22,0x5e]
+// CHECK: sqadd h10, h11, h12     // encoding: [0x6a,0x0d,0x6c,0x5e]
+// CHECK: sqadd s20, s21, s2      // encoding: [0xb4,0x0e,0xa2,0x5e]
+// CHECK: sqadd d17, d31, d8      // encoding: [0xf1,0x0f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Add (Unsigned)
+//------------------------------------------------------------------------------
+         uqadd b0, b1, b2
+         uqadd h10, h11, h12
+         uqadd s20, s21, s2
+         uqadd d17, d31, d8
+
+// CHECK: uqadd b0, b1, b2        // encoding: [0x20,0x0c,0x22,0x7e]
+// CHECK: uqadd h10, h11, h12     // encoding: [0x6a,0x0d,0x6c,0x7e]
+// CHECK: uqadd s20, s21, s2      // encoding: [0xb4,0x0e,0xa2,0x7e]
+// CHECK: uqadd d17, d31, d8      // encoding: [0xf1,0x0f,0xe8,0x7e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Sub (Signed)
+//------------------------------------------------------------------------------
+         sqsub b0, b1, b2
+         sqsub h10, h11, h12
+         sqsub s20, s21, s2
+         sqsub d17, d31, d8
+
+// CHECK: sqsub b0, b1, b2        // encoding: [0x20,0x2c,0x22,0x5e]
+// CHECK: sqsub h10, h11, h12     // encoding: [0x6a,0x2d,0x6c,0x5e]
+// CHECK: sqsub s20, s21, s2      // encoding: [0xb4,0x2e,0xa2,0x5e]
+// CHECK: sqsub d17, d31, d8      // encoding: [0xf1,0x2f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Sub (Unsigned)
+//------------------------------------------------------------------------------
+         uqsub b0, b1, b2
+         uqsub h10, h11, h12
+         uqsub s20, s21, s2
+         uqsub d17, d31, d8
+
+// CHECK: uqsub b0, b1, b2        // encoding: [0x20,0x2c,0x22,0x7e]
+// CHECK: uqsub h10, h11, h12     // encoding: [0x6a,0x2d,0x6c,0x7e]
+// CHECK: uqsub s20, s21, s2      // encoding: [0xb4,0x2e,0xa2,0x7e]
+// CHECK: uqsub d17, d31, d8      // encoding: [0xf1,0x2f,0xe8,0x7e]
+
+//----------------------------------------------------------------------
+// Signed Saturating Accumulated of Unsigned Value
+//----------------------------------------------------------------------
+
+    suqadd b19, b14
+    suqadd h20, h15
+    suqadd s21, s12
+    suqadd d18, d22
+
+// CHECK: suqadd b19, b14    // encoding: [0xd3,0x39,0x20,0x5e]
+// CHECK: suqadd h20, h15    // encoding: [0xf4,0x39,0x60,0x5e]
+// CHECK: suqadd s21, s12    // encoding: [0x95,0x39,0xa0,0x5e]
+// CHECK: suqadd d18, d22    // encoding: [0xd2,0x3a,0xe0,0x5e]
+
+//----------------------------------------------------------------------
+// Unsigned Saturating Accumulated of Signed Value
+//----------------------------------------------------------------------
+
+    usqadd b19, b14
+    usqadd h20, h15
+    usqadd s21, s12
+    usqadd d18, d22
+
+// CHECK: usqadd b19, b14    // encoding: [0xd3,0x39,0x20,0x7e]
+// CHECK: usqadd h20, h15    // encoding: [0xf4,0x39,0x60,0x7e]
+// CHECK: usqadd s21, s12    // encoding: [0x95,0x39,0xa0,0x7e]
+// CHECK: usqadd d18, d22    // encoding: [0xd2,0x3a,0xe0,0x7e]
diff --git a/test/MC/AArch64/neon-scalar-saturating-rounding-shift.s b/test/MC/AArch64/neon-scalar-saturating-rounding-shift.s
new file mode 100644
index 0000000..b09a589
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-saturating-rounding-shift.s
@@ -0,0 +1,28 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Lef (Signed)
+//------------------------------------------------------------------------------
+         sqrshl b0, b1, b2
+         sqrshl h10, h11, h12
+         sqrshl s20, s21, s2
+         sqrshl d17, d31, d8
+
+// CHECK: sqrshl b0, b1, b2        // encoding: [0x20,0x5c,0x22,0x5e]
+// CHECK: sqrshl h10, h11, h12     // encoding: [0x6a,0x5d,0x6c,0x5e]
+// CHECK: sqrshl s20, s21, s2      // encoding: [0xb4,0x5e,0xa2,0x5e]
+// CHECK: sqrshl d17, d31, d8      // encoding: [0xf1,0x5f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Lef (Unsigned)
+//------------------------------------------------------------------------------
+         uqrshl b0, b1, b2
+         uqrshl h10, h11, h12
+         uqrshl s20, s21, s2
+         uqrshl d17, d31, d8
+
+// CHECK: uqrshl b0, b1, b2        // encoding: [0x20,0x5c,0x22,0x7e]
+// CHECK: uqrshl h10, h11, h12     // encoding: [0x6a,0x5d,0x6c,0x7e]
+// CHECK: uqrshl s20, s21, s2      // encoding: [0xb4,0x5e,0xa2,0x7e]
+// CHECK: uqrshl d17, d31, d8      // encoding: [0xf1,0x5f,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/neon-scalar-saturating-shift.s b/test/MC/AArch64/neon-scalar-saturating-shift.s
new file mode 100644
index 0000000..b53c9f0
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-saturating-shift.s
@@ -0,0 +1,29 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Shift Lef (Signed)
+//------------------------------------------------------------------------------
+         sqshl b0, b1, b2
+         sqshl h10, h11, h12
+         sqshl s20, s21, s2
+         sqshl d17, d31, d8
+
+// CHECK: sqshl b0, b1, b2        // encoding: [0x20,0x4c,0x22,0x5e]
+// CHECK: sqshl h10, h11, h12     // encoding: [0x6a,0x4d,0x6c,0x5e]
+// CHECK: sqshl s20, s21, s2      // encoding: [0xb4,0x4e,0xa2,0x5e]
+// CHECK: sqshl d17, d31, d8      // encoding: [0xf1,0x4f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Shift Lef (Unsigned)
+//------------------------------------------------------------------------------
+         uqshl b0, b1, b2
+         uqshl h10, h11, h12
+         uqshl s20, s21, s2
+         uqshl d17, d31, d8
+
+// CHECK: uqshl b0, b1, b2        // encoding: [0x20,0x4c,0x22,0x7e]
+// CHECK: uqshl h10, h11, h12     // encoding: [0x6a,0x4d,0x6c,0x7e]
+// CHECK: uqshl s20, s21, s2      // encoding: [0xb4,0x4e,0xa2,0x7e]
+// CHECK: uqshl d17, d31, d8      // encoding: [0xf1,0x4f,0xe8,0x7e]
+
+
diff --git a/test/MC/AArch64/neon-scalar-shift-imm.s b/test/MC/AArch64/neon-scalar-shift-imm.s
new file mode 100644
index 0000000..96cb815
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-shift-imm.s
@@ -0,0 +1,186 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Scalar Signed Shift Right (Immediate)
+//----------------------------------------------------------------------
+        sshr d15, d16, #12
+
+// CHECK: sshr d15, d16, #12  // encoding: [0x0f,0x06,0x74,0x5f]
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Shift Right (Immediate)
+//----------------------------------------------------------------------
+        ushr d10, d17, #18
+
+// CHECK: ushr d10, d17, #18  // encoding: [0x2a,0x06,0x6e,0x7f]
+
+//----------------------------------------------------------------------
+// Scalar Signed Rounding Shift Right (Immediate)
+//----------------------------------------------------------------------
+        srshr d19, d18, #7
+
+// CHECK: srshr d19, d18, #7  // encoding: [0x53,0x26,0x79,0x5f]
+
+//----------------------------------------------------------------------
+// Scalar Unigned Rounding Shift Right (Immediate)
+//----------------------------------------------------------------------
+        urshr d20, d23, #31
+
+// CHECK: urshr d20, d23, #31  // encoding: [0xf4,0x26,0x61,0x7f]
+
+//----------------------------------------------------------------------
+// Scalar Signed Shift Right and Accumulate (Immediate)
+//----------------------------------------------------------------------
+        ssra d18, d12, #21
+
+// CHECK: ssra d18, d12, #21  // encoding: [0x92,0x15,0x6b,0x5f]
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Shift Right and Accumulate (Immediate)
+//----------------------------------------------------------------------
+        usra d20, d13, #61
+
+// CHECK: usra d20, d13, #61  // encoding: [0xb4,0x15,0x43,0x7f]
+
+//----------------------------------------------------------------------
+// Scalar Signed Rounding Shift Right and Accumulate (Immediate)
+//----------------------------------------------------------------------
+        srsra d15, d11, #19
+
+// CHECK: srsra d15, d11, #19  // encoding: [0x6f,0x35,0x6d,0x5f]
+
+//----------------------------------------------------------------------
+// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
+//----------------------------------------------------------------------
+        ursra d18, d10, #13
+
+// CHECK: ursra d18, d10, #13  // encoding: [0x52,0x35,0x73,0x7f]
+
+//----------------------------------------------------------------------
+// Scalar Shift Left (Immediate)
+//----------------------------------------------------------------------
+        shl d7, d10, #12
+
+// CHECK: shl d7, d10, #12  // encoding: [0x47,0x55,0x4c,0x5f]
+
+//----------------------------------------------------------------------
+// Signed Saturating Shift Left (Immediate)
+//----------------------------------------------------------------------
+        sqshl b11, b19, #7
+        sqshl h13, h18, #11
+        sqshl s14, s17, #22
+        sqshl d15, d16, #51
+
+// CHECK: sqshl b11, b19, #7   // encoding: [0x6b,0x76,0x0f,0x5f]
+// CHECK: sqshl h13, h18, #11  // encoding: [0x4d,0x76,0x1b,0x5f]
+// CHECK: sqshl s14, s17, #22  // encoding: [0x2e,0x76,0x36,0x5f]
+// CHECK: sqshl d15, d16, #51  // encoding: [0x0f,0x76,0x73,0x5f]
+
+//----------------------------------------------------------------------
+// Unsigned Saturating Shift Left (Immediate)
+//----------------------------------------------------------------------
+        uqshl b18, b15, #6
+        uqshl h11, h18, #7
+        uqshl s14, s19, #18
+        uqshl d15, d12, #19
+
+// CHECK: uqshl b18, b15, #6   // encoding: [0xf2,0x75,0x0e,0x7f]
+// CHECK: uqshl h11, h18, #7   // encoding: [0x4b,0x76,0x17,0x7f]
+// CHECK: uqshl s14, s19, #18  // encoding: [0x6e,0x76,0x32,0x7f]
+// CHECK: uqshl d15, d12, #19  // encoding: [0x8f,0x75,0x53,0x7f]
+
+//----------------------------------------------------------------------
+// Signed Saturating Shift Left Unsigned (Immediate)
+//----------------------------------------------------------------------
+        sqshlu b15, b18, #6
+        sqshlu h19, h17, #6
+        sqshlu s16, s14, #25
+        sqshlu d11, d13, #32
+
+// CHECK: sqshlu  b15, b18, #6   // encoding: [0x4f,0x66,0x0e,0x7f]
+// CHECK: sqshlu  h19, h17, #6   // encoding: [0x33,0x66,0x16,0x7f]
+// CHECK: sqshlu  s16, s14, #25  // encoding: [0xd0,0x65,0x39,0x7f]
+// CHECK: sqshlu  d11, d13, #32  // encoding: [0xab,0x65,0x60,0x7f]
+
+//----------------------------------------------------------------------
+// Shift Right And Insert (Immediate)
+//----------------------------------------------------------------------
+        sri d10, d12, #14
+
+// CHECK: sri d10, d12, #14  // encoding: [0x8a,0x45,0x72,0x7f]
+
+//----------------------------------------------------------------------
+// Shift Left And Insert (Immediate)
+//----------------------------------------------------------------------
+        sli d10, d14, #12
+
+// CHECK: sli d10, d14, #12  // encoding: [0xca,0x55,0x4c,0x7f]
+
+//----------------------------------------------------------------------
+// Signed Saturating Shift Right Narrow (Immediate)
+//----------------------------------------------------------------------
+        sqshrn b10, h15, #5
+        sqshrn h17, s10, #4
+        sqshrn s18, d10, #31
+
+// CHECK: sqshrn  b10, h15, #5   // encoding: [0xea,0x95,0x0b,0x5f]
+// CHECK: sqshrn  h17, s10, #4   // encoding: [0x51,0x95,0x1c,0x5f]
+// CHECK: sqshrn  s18, d10, #31  // encoding: [0x52,0x95,0x21,0x5f]
+
+//----------------------------------------------------------------------
+// Unsigned Saturating Shift Right Narrow (Immediate)
+//----------------------------------------------------------------------
+        uqshrn b12, h10, #7
+        uqshrn h10, s14, #5
+        uqshrn s10, d12, #13
+
+// CHECK: uqshrn  b12, h10, #7   // encoding: [0x4c,0x95,0x09,0x7f]
+// CHECK: uqshrn  h10, s14, #5   // encoding: [0xca,0x95,0x1b,0x7f]
+// CHECK: uqshrn  s10, d12, #13  // encoding: [0x8a,0x95,0x33,0x7f]
+
+//----------------------------------------------------------------------
+// Signed Saturating Rounded Shift Right Narrow (Immediate)
+//----------------------------------------------------------------------
+        sqrshrn b10, h13, #2
+        sqrshrn h15, s10, #6
+        sqrshrn s15, d12, #9
+
+// CHECK: sqrshrn b10, h13, #2  // encoding: [0xaa,0x9d,0x0e,0x5f]
+// CHECK: sqrshrn h15, s10, #6  // encoding: [0x4f,0x9d,0x1a,0x5f]
+// CHECK: sqrshrn s15, d12, #9  // encoding: [0x8f,0x9d,0x37,0x5f]
+
+//----------------------------------------------------------------------
+// Unsigned Saturating Rounded Shift Right Narrow (Immediate)
+//----------------------------------------------------------------------
+        uqrshrn b10, h12, #5
+        uqrshrn h12, s10, #14
+        uqrshrn s10, d10, #25
+
+// CHECK: uqrshrn b10, h12, #5   // encoding: [0x8a,0x9d,0x0b,0x7f]
+// CHECK: uqrshrn h12, s10, #14  // encoding: [0x4c,0x9d,0x12,0x7f]
+// CHECK: uqrshrn s10, d10, #25  // encoding: [0x4a,0x9d,0x27,0x7f]
+
+//----------------------------------------------------------------------
+// Signed Saturating Shift Right Unsigned Narrow (Immediate)
+//----------------------------------------------------------------------
+        sqshrun b15, h10, #7
+        sqshrun h20, s14, #3
+        sqshrun s10, d15, #15
+
+// CHECK: sqshrun b15, h10, #7   // encoding: [0x4f,0x85,0x09,0x7f]
+// CHECK: sqshrun h20, s14, #3   // encoding: [0xd4,0x85,0x1d,0x7f]
+// CHECK: sqshrun s10, d15, #15  // encoding: [0xea,0x85,0x31,0x7f]
+
+//----------------------------------------------------------------------
+// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate)
+//----------------------------------------------------------------------
+
+        sqrshrun b17, h10, #6
+        sqrshrun h10, s13, #15
+        sqrshrun s22, d16, #31
+
+// CHECK: sqrshrun b17, h10, #6   // encoding: [0x51,0x8d,0x0a,0x7f]
+// CHECK: sqrshrun h10, s13, #15  // encoding: [0xaa,0x8d,0x11,0x7f]
+// CHECK: sqrshrun s22, d16, #31  // encoding: [0x16,0x8e,0x21,0x7f]
diff --git a/test/MC/AArch64/neon-scalar-shift.s b/test/MC/AArch64/neon-scalar-shift.s
new file mode 100644
index 0000000..366840a
--- /dev/null
+++ b/test/MC/AArch64/neon-scalar-shift.s
@@ -0,0 +1,16 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+//------------------------------------------------------------------------------
+// Scalar Integer Shift Lef (Signed)
+//------------------------------------------------------------------------------
+         sshl d17, d31, d8
+
+// CHECK: sshl d17, d31, d8      // encoding: [0xf1,0x47,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Shift Lef (Unsigned)
+//------------------------------------------------------------------------------
+         ushl d17, d31, d8
+
+// CHECK: ushl d17, d31, d8      // encoding: [0xf1,0x47,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/neon-shift-left-long.s b/test/MC/AArch64/neon-shift-left-long.s
new file mode 100644
index 0000000..9760458
--- /dev/null
+++ b/test/MC/AArch64/neon-shift-left-long.s
@@ -0,0 +1,37 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Integer shift left long (Signed)
+//------------------------------------------------------------------------------
+         sshll v0.8h, v1.8b, #3
+         sshll v0.4s, v1.4h, #3
+         sshll v0.2d, v1.2s, #3
+         sshll2 v0.8h, v1.16b, #3
+         sshll2 v0.4s, v1.8h, #3
+         sshll2 v0.2d, v1.4s, #3
+
+// CHECK: sshll v0.8h, v1.8b, #3         // encoding: [0x20,0xa4,0x0b,0x0f]
+// CHECK: sshll v0.4s, v1.4h, #3         // encoding: [0x20,0xa4,0x13,0x0f]
+// CHECK: sshll v0.2d, v1.2s, #3         // encoding: [0x20,0xa4,0x23,0x0f]
+// CHECK: sshll2 v0.8h, v1.16b, #3       // encoding: [0x20,0xa4,0x0b,0x4f]
+// CHECK: sshll2 v0.4s, v1.8h, #3        // encoding: [0x20,0xa4,0x13,0x4f]
+// CHECK: sshll2 v0.2d, v1.4s, #3        // encoding: [0x20,0xa4,0x23,0x4f]
+
+//------------------------------------------------------------------------------
+// Integer shift left long (Unsigned)
+//------------------------------------------------------------------------------
+         ushll v0.8h, v1.8b, #3
+         ushll v0.4s, v1.4h, #3
+         ushll v0.2d, v1.2s, #3
+         ushll2 v0.8h, v1.16b, #3
+         ushll2 v0.4s, v1.8h, #3
+         ushll2 v0.2d, v1.4s, #3
+
+// CHECK: ushll v0.8h, v1.8b, #3         // encoding: [0x20,0xa4,0x0b,0x2f]
+// CHECK: ushll v0.4s, v1.4h, #3         // encoding: [0x20,0xa4,0x13,0x2f]
+// CHECK: ushll v0.2d, v1.2s, #3         // encoding: [0x20,0xa4,0x23,0x2f]
+// CHECK: ushll2 v0.8h, v1.16b, #3       // encoding: [0x20,0xa4,0x0b,0x6f]
+// CHECK: ushll2 v0.4s, v1.8h, #3        // encoding: [0x20,0xa4,0x13,0x6f]
+// CHECK: ushll2 v0.2d, v1.4s, #3        // encoding: [0x20,0xa4,0x23,0x6f]
diff --git a/test/MC/AArch64/neon-shift.s b/test/MC/AArch64/neon-shift.s
index be1799e..614e6de 100644
--- a/test/MC/AArch64/neon-shift.s
+++ b/test/MC/AArch64/neon-shift.s
@@ -42,16 +42,20 @@
 // CHECK: ushl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x44,0xe2,0x6e]
 
 //------------------------------------------------------------------------------
-// Scalar Integer Shift Lef (Signed)
-//------------------------------------------------------------------------------
-         sshl d17, d31, d8
-
-// CHECK: sshl d17, d31, d8      // encoding: [0xf1,0x47,0xe8,0x5e]
-
-//------------------------------------------------------------------------------
-// Scalar Integer Shift Lef (Unsigned)
-//------------------------------------------------------------------------------
-         ushl d17, d31, d8
-
-// CHECK: ushl d17, d31, d8      // encoding: [0xf1,0x47,0xe8,0x7e]
-
+// Vector Integer Shift Left by Immediate
+//------------------------------------------------------------------------------
+         shl v0.8b, v1.8b, #3
+         shl v0.4h, v1.4h, #3
+         shl v0.2s, v1.2s, #3
+         shl v0.16b, v1.16b, #3
+         shl v0.8h, v1.8h, #3
+         shl v0.4s, v1.4s, #3
+         shl v0.2d, v1.2d, #3
+
+// CHECK: shl v0.8b, v1.8b, #3        // encoding: [0x20,0x54,0x0b,0x0f]
+// CHECK: shl v0.4h, v1.4h, #3        // encoding: [0x20,0x54,0x13,0x0f]
+// CHECK: shl v0.2s, v1.2s, #3        // encoding: [0x20,0x54,0x23,0x0f]
+// CHECK: shl v0.16b, v1.16b, #3      // encoding: [0x20,0x54,0x0b,0x4f]
+// CHECK: shl v0.8h, v1.8h, #3        // encoding: [0x20,0x54,0x13,0x4f]
+// CHECK: shl v0.4s, v1.4s, #3        // encoding: [0x20,0x54,0x23,0x4f]
+// CHECK: shl v0.2d, v1.2d, #3        // encoding: [0x20,0x54,0x43,0x4f]
diff --git a/test/MC/AArch64/neon-simd-copy.s b/test/MC/AArch64/neon-simd-copy.s
new file mode 100644
index 0000000..f254d65
--- /dev/null
+++ b/test/MC/AArch64/neon-simd-copy.s
@@ -0,0 +1,135 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Insert element (vector, from main)
+//------------------------------------------------------------------------------
+         ins v2.b[2], w1
+         ins v7.h[7], w14
+         ins v20.s[0], w30
+         ins v1.d[1], x7
+         
+         mov v2.b[2], w1
+         mov v7.h[7], w14
+         mov v20.s[0], w30
+         mov v1.d[1], x7
+
+// CHECK: ins	v2.b[2], w1           // encoding: [0x22,0x1c,0x05,0x4e]
+// CHECK: ins	v7.h[7], w14          // encoding: [0xc7,0x1d,0x1e,0x4e]
+// CHECK: ins	v20.s[0], w30         // encoding: [0xd4,0x1f,0x04,0x4e]
+// CHECK: ins	v1.d[1], x7           // encoding: [0xe1,0x1c,0x18,0x4e]
+
+// CHECK: ins v2.b[2], w1           // encoding: [0x22,0x1c,0x05,0x4e]
+// CHECK: ins v7.h[7], w14          // encoding: [0xc7,0x1d,0x1e,0x4e]
+// CHECK: ins v20.s[0], w30         // encoding: [0xd4,0x1f,0x04,0x4e]
+// CHECK: ins v1.d[1], x7           // encoding: [0xe1,0x1c,0x18,0x4e]
+
+
+//------------------------------------------------------------------------------
+// Signed integer move (main, from element)
+//------------------------------------------------------------------------------
+         smov w1, v0.b[15]
+         smov w14, v6.h[4]
+         smov x1, v0.b[15]
+         smov x14, v6.h[4]
+         smov x20, v9.s[2]
+
+// CHECK: smov	w1, v0.b[15]          // encoding: [0x01,0x2c,0x1f,0x0e]
+// CHECK: smov	w14, v6.h[4]          // encoding: [0xce,0x2c,0x12,0x0e]
+// CHECK: smov	x1, v0.b[15]          // encoding: [0x01,0x2c,0x1f,0x4e]
+// CHECK: smov	x14, v6.h[4]          // encoding: [0xce,0x2c,0x12,0x4e]
+// CHECK: smov	x20, v9.s[2]          // encoding: [0x34,0x2d,0x14,0x4e]
+
+
+//------------------------------------------------------------------------------
+// Unsigned integer move (main, from element)
+//------------------------------------------------------------------------------
+         umov w1, v0.b[15]
+         umov w14, v6.h[4]
+         umov w20, v9.s[2]
+         umov x7, v18.d[1]
+
+         mov w20, v9.s[2]
+         mov x7, v18.d[1]
+
+// CHECK: umov	w1, v0.b[15]          // encoding: [0x01,0x3c,0x1f,0x0e]
+// CHECK: umov	w14, v6.h[4]          // encoding: [0xce,0x3c,0x12,0x0e]
+// CHECK: umov	w20, v9.s[2]          // encoding: [0x34,0x3d,0x14,0x0e]
+// CHECK: umov	x7, v18.d[1]          // encoding: [0x47,0x3e,0x18,0x4e]
+
+// CHECK: umov  w20, v9.s[2]          // encoding: [0x34,0x3d,0x14,0x0e]
+// CHECK: umov  x7, v18.d[1]          // encoding: [0x47,0x3e,0x18,0x4e]
+
+//------------------------------------------------------------------------------
+// Insert element (vector, from element)
+//------------------------------------------------------------------------------
+
+         ins v1.b[14], v3.b[6]
+         ins v6.h[7], v7.h[5]
+         ins v15.s[3], v22.s[2]
+         ins v0.d[0], v4.d[1]
+
+         mov v1.b[14], v3.b[6]
+         mov v6.h[7], v7.h[5]
+         mov v15.s[3], v22.s[2]
+         mov v0.d[0], v4.d[1]
+
+// CHECK: ins	v1.b[14], v3.b[6]       // encoding: [0x61,0x34,0x1d,0x6e]
+// CHECK: ins	v6.h[7], v7.h[5]        // encoding: [0xe6,0x54,0x1e,0x6e]
+// CHECK: ins	v15.s[3], v22.s[2]      // encoding: [0xcf,0x46,0x1c,0x6e]
+// CHECK: ins	v0.d[0], v4.d[1]        // encoding: [0x80,0x44,0x08,0x6e]
+
+// CHECK: ins v1.b[14], v3.b[6]       // encoding: [0x61,0x34,0x1d,0x6e]
+// CHECK: ins v6.h[7], v7.h[5]        // encoding: [0xe6,0x54,0x1e,0x6e]
+// CHECK: ins v15.s[3], v22.s[2]      // encoding: [0xcf,0x46,0x1c,0x6e]
+// CHECK: ins v0.d[0], v4.d[1]        // encoding: [0x80,0x44,0x08,0x6e]
+
+//------------------------------------------------------------------------------
+// Duplicate to all lanes( vector, from element)
+//------------------------------------------------------------------------------
+         dup v1.8b, v2.b[2]
+         dup v11.4h, v7.h[7]
+         dup v17.2s, v20.s[0]
+         dup v1.16b, v2.b[2]
+         dup v11.8h, v7.h[7]
+         dup v17.4s, v20.s[0]
+         dup v5.2d, v1.d[1]         
+
+// CHECK: dup v1.8b, v2.b[2]        // encoding: [0x41,0x04,0x05,0x0e]
+// CHECK: dup v11.4h, v7.h[7]       // encoding: [0xeb,0x04,0x1e,0x0e]
+// CHECK: dup v17.2s, v20.s[0]      // encoding: [0x91,0x06,0x04,0x0e]
+// CHECK: dup v1.16b, v2.b[2]       // encoding: [0x41,0x04,0x05,0x4e]
+// CHECK: dup v11.8h, v7.h[7]       // encoding: [0xeb,0x04,0x1e,0x4e]
+// CHECK: dup v17.4s, v20.s[0]      // encoding: [0x91,0x06,0x04,0x4e]
+// CHECK: dup v5.2d, v1.d[1]        // encoding: [0x25,0x04,0x18,0x4e]
+
+//------------------------------------------------------------------------------
+// Duplicate to all lanes( vector, from main)
+//------------------------------------------------------------------------------
+         dup v1.8b, w1
+         dup v11.4h, w14
+         dup v17.2s, w30
+         dup v1.16b, w2
+         dup v11.8h, w16
+         dup v17.4s, w28
+         dup v5.2d, x0        
+
+// CHECK: dup	v1.8b, w1             // encoding: [0x21,0x0c,0x01,0x0e]
+// CHECK: dup	v11.4h, w14           // encoding: [0xcb,0x0d,0x02,0x0e]
+// CHECK: dup	v17.2s, w30           // encoding: [0xd1,0x0f,0x04,0x0e]
+// CHECK: dup	v1.16b, w2            // encoding: [0x41,0x0c,0x01,0x4e]
+// CHECK: dup	v11.8h, w16           // encoding: [0x0b,0x0e,0x02,0x4e]
+// CHECK: dup	v17.4s, w28           // encoding: [0x91,0x0f,0x04,0x4e]
+// CHECK: dup	v5.2d, x0             // encoding: [0x05,0x0c,0x08,0x4e]
+
+
+
+
+
+
+
+
+
+
diff --git a/test/MC/AArch64/neon-simd-ldst-multi-elem.s b/test/MC/AArch64/neon-simd-ldst-multi-elem.s
new file mode 100644
index 0000000..05fe4da
--- /dev/null
+++ b/test/MC/AArch64/neon-simd-ldst-multi-elem.s
@@ -0,0 +1,463 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Store multiple 1-element structures from one register
+//------------------------------------------------------------------------------
+         st1 {v0.16b}, [x0]
+         st1 {v15.8h}, [x15]
+         st1 {v31.4s}, [sp]
+         st1 {v0.2d}, [x0]
+         st1 {v0.8b}, [x0]
+         st1 {v15.4h}, [x15]
+         st1 {v31.2s}, [sp]
+         st1 {v0.1d}, [x0]
+// CHECK:	st1	{v0.16b}, [x0]          // encoding: [0x00,0x70,0x00,0x4c]
+// CHECK:	st1	{v15.8h}, [x15]         // encoding: [0xef,0x75,0x00,0x4c]
+// CHECK:	st1	{v31.4s}, [sp]          // encoding: [0xff,0x7b,0x00,0x4c]
+// CHECK:	st1	{v0.2d}, [x0]           // encoding: [0x00,0x7c,0x00,0x4c]
+// CHECK:	st1	{v0.8b}, [x0]           // encoding: [0x00,0x70,0x00,0x0c]
+// CHECK:	st1	{v15.4h}, [x15]         // encoding: [0xef,0x75,0x00,0x0c]
+// CHECK:	st1	{v31.2s}, [sp]          // encoding: [0xff,0x7b,0x00,0x0c]
+// CHECK:	st1	{v0.1d}, [x0]           // encoding: [0x00,0x7c,0x00,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 1-element structures from two consecutive registers
+//------------------------------------------------------------------------------
+         st1 {v0.16b, v1.16b}, [x0]
+         st1 {v15.8h, v16.8h}, [x15]
+         st1 {v31.4s, v0.4s}, [sp]
+         st1 {v0.2d, v1.2d}, [x0]
+         st1 {v0.8b, v1.8b}, [x0]
+         st1 {v15.4h, v16.4h}, [x15]
+         st1 {v31.2s, v0.2s}, [sp]
+         st1 {v0.1d, v1.1d}, [x0]
+// CHECK:	st1	{v0.16b, v1.16b}, [x0]  // encoding: [0x00,0xa0,0x00,0x4c]
+// CHECK:	st1	{v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x00,0x4c]
+// CHECK:	st1	{v31.4s, v0.4s}, [sp]   // encoding: [0xff,0xab,0x00,0x4c]
+// CHECK:	st1	{v0.2d, v1.2d}, [x0]    // encoding: [0x00,0xac,0x00,0x4c]
+// CHECK:	st1	{v0.8b, v1.8b}, [x0]    // encoding: [0x00,0xa0,0x00,0x0c]
+// CHECK:	st1	{v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x00,0x0c]
+// CHECK:	st1	{v31.2s, v0.2s}, [sp]   // encoding: [0xff,0xab,0x00,0x0c]
+// CHECK:	st1	{v0.1d, v1.1d}, [x0]    // encoding: [0x00,0xac,0x00,0x0c]
+
+         st1 {v0.16b-v1.16b}, [x0]
+         st1 {v15.8h-v16.8h}, [x15]
+         st1 {v31.4s-v0.4s}, [sp]
+         st1 {v0.2d-v1.2d}, [x0]
+         st1 {v0.8b-v1.8b}, [x0]
+         st1 {v15.4h-v16.4h}, [x15]
+         st1 {v31.2s-v0.2s}, [sp]
+         st1 {v0.1d-v1.1d}, [x0]
+// CHECK:	st1	{v0.16b, v1.16b}, [x0]  // encoding: [0x00,0xa0,0x00,0x4c]
+// CHECK:	st1	{v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x00,0x4c]
+// CHECK:	st1	{v31.4s, v0.4s}, [sp]   // encoding: [0xff,0xab,0x00,0x4c]
+// CHECK:	st1	{v0.2d, v1.2d}, [x0]    // encoding: [0x00,0xac,0x00,0x4c]
+// CHECK:	st1	{v0.8b, v1.8b}, [x0]    // encoding: [0x00,0xa0,0x00,0x0c]
+// CHECK:	st1	{v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x00,0x0c]
+// CHECK:	st1	{v31.2s, v0.2s}, [sp]   // encoding: [0xff,0xab,0x00,0x0c]
+// CHECK:	st1	{v0.1d, v1.1d}, [x0]    // encoding: [0x00,0xac,0x00,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 1-element structures from three consecutive registers
+//------------------------------------------------------------------------------
+         st1 {v0.16b, v1.16b, v2.16b}, [x0]
+         st1 {v15.8h, v16.8h, v17.8h}, [x15]
+         st1 {v31.4s, v0.4s, v1.4s}, [sp]
+         st1 {v0.2d, v1.2d, v2.2d}, [x0]
+         st1 {v0.8b, v1.8b, v2.8b}, [x0]
+         st1 {v15.4h, v16.4h, v17.4h}, [x15]
+         st1 {v31.2s, v0.2s, v1.2s}, [sp]
+         st1 {v0.1d, v1.1d, v2.1d}, [x0]
+// CHECK:	st1	{v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x00,0x4c]
+// CHECK:	st1	{v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x00,0x4c]
+// CHECK:	st1	{v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x00,0x4c]
+// CHECK:	st1	{v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x00,0x4c]
+// CHECK:	st1	{v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x00,0x0c]
+// CHECK:	st1	{v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x00,0x0c]
+// CHECK:	st1	{v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x00,0x0c]
+// CHECK:	st1	{v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x00,0x0c]
+
+         st1 {v0.16b-v2.16b}, [x0]
+         st1 {v15.8h-v17.8h}, [x15]
+         st1 {v31.4s-v1.4s}, [sp]
+         st1 {v0.2d-v2.2d}, [x0]
+         st1 {v0.8b-v2.8b}, [x0]
+         st1 {v15.4h-v17.4h}, [x15]
+         st1 {v31.2s-v1.2s}, [sp]
+         st1 {v0.1d-v2.1d}, [x0]
+// CHECK:	st1	{v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x00,0x4c]
+// CHECK:	st1	{v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x00,0x4c]
+// CHECK:	st1	{v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x00,0x4c]
+// CHECK:	st1	{v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x00,0x4c]
+// CHECK:	st1	{v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x00,0x0c]
+// CHECK:	st1	{v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x00,0x0c]
+// CHECK:	st1	{v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x00,0x0c]
+// CHECK:	st1	{v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x00,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 1-element structures from four consecutive registers
+//------------------------------------------------------------------------------
+         st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+         st1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
+         st1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
+         st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
+         st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+         st1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
+         st1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+         st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0]
+// CHECK:	st1	{v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x00,0x4c]
+// CHECK:	st1	{v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x00,0x4c]
+// CHECK:	st1	{v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x00,0x4c]
+// CHECK:	st1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x00,0x4c]
+// CHECK:	st1	{v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x00,0x0c]
+// CHECK:	st1	{v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x00,0x0c]
+// CHECK:	st1	{v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x00,0x0c]
+// CHECK:	st1	{v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x00,0x0c]
+
+         st1 {v0.16b-v3.16b}, [x0]
+         st1 {v15.8h-v18.8h}, [x15]
+         st1 {v31.4s-v2.4s}, [sp]
+         st1 {v0.2d-v3.2d}, [x0]
+         st1 {v0.8b-v3.8b}, [x0]
+         st1 {v15.4h-v18.4h}, [x15]
+         st1 {v31.2s-v2.2s}, [sp]
+         st1 {v0.1d-v3.1d}, [x0]
+// CHECK:	st1	{v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x00,0x4c]
+// CHECK:	st1	{v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x00,0x4c]
+// CHECK:	st1	{v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x00,0x4c]
+// CHECK:	st1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x00,0x4c]
+// CHECK:	st1	{v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x00,0x0c]
+// CHECK:	st1	{v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x00,0x0c]
+// CHECK:	st1	{v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x00,0x0c]
+// CHECK:	st1	{v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x00,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 2-element structures from two consecutive registers
+//------------------------------------------------------------------------------
+         st2 {v0.16b, v1.16b}, [x0]
+         st2 {v15.8h, v16.8h}, [x15]
+         st2 {v31.4s, v0.4s}, [sp]
+         st2 {v0.2d, v1.2d}, [x0]
+         st2 {v0.8b, v1.8b}, [x0]
+         st2 {v15.4h, v16.4h}, [x15]
+         st2 {v31.2s, v0.2s}, [sp]
+// CHECK:	st2	{v0.16b, v1.16b}, [x0]  // encoding: [0x00,0x80,0x00,0x4c]
+// CHECK:	st2	{v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x00,0x4c]
+// CHECK:	st2	{v31.4s, v0.4s}, [sp]   // encoding: [0xff,0x8b,0x00,0x4c]
+// CHECK:	st2	{v0.2d, v1.2d}, [x0]    // encoding: [0x00,0x8c,0x00,0x4c]
+// CHECK:	st2	{v0.8b, v1.8b}, [x0]    // encoding: [0x00,0x80,0x00,0x0c]
+// CHECK:	st2	{v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x00,0x0c]
+// CHECK:	st2	{v31.2s, v0.2s}, [sp]   // encoding: [0xff,0x8b,0x00,0x0c]
+
+         st2 {v0.16b-v1.16b}, [x0]
+         st2 {v15.8h-v16.8h}, [x15]
+         st2 {v31.4s-v0.4s}, [sp]
+         st2 {v0.2d-v1.2d}, [x0]
+         st2 {v0.8b-v1.8b}, [x0]
+         st2 {v15.4h-v16.4h}, [x15]
+         st2 {v31.2s-v0.2s}, [sp]
+// CHECK:	st2	{v0.16b, v1.16b}, [x0]  // encoding: [0x00,0x80,0x00,0x4c]
+// CHECK:	st2	{v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x00,0x4c]
+// CHECK:	st2	{v31.4s, v0.4s}, [sp]   // encoding: [0xff,0x8b,0x00,0x4c]
+// CHECK:	st2	{v0.2d, v1.2d}, [x0]    // encoding: [0x00,0x8c,0x00,0x4c]
+// CHECK:	st2	{v0.8b, v1.8b}, [x0]    // encoding: [0x00,0x80,0x00,0x0c]
+// CHECK:	st2	{v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x00,0x0c]
+// CHECK:	st2	{v31.2s, v0.2s}, [sp]   // encoding: [0xff,0x8b,0x00,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 3-element structures from three consecutive registers
+//------------------------------------------------------------------------------
+         st3 {v0.16b, v1.16b, v2.16b}, [x0]
+         st3 {v15.8h, v16.8h, v17.8h}, [x15]
+         st3 {v31.4s, v0.4s, v1.4s}, [sp]
+         st3 {v0.2d, v1.2d, v2.2d}, [x0]
+         st3 {v0.8b, v1.8b, v2.8b}, [x0]
+         st3 {v15.4h, v16.4h, v17.4h}, [x15]
+         st3 {v31.2s, v0.2s, v1.2s}, [sp]
+// CHECK:	st3	{v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x00,0x4c]
+// CHECK:	st3	{v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x00,0x4c]
+// CHECK:	st3	{v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x00,0x4c]
+// CHECK:	st3	{v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x00,0x4c]
+// CHECK:	st3	{v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x00,0x0c]
+// CHECK:	st3	{v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x00,0x0c]
+// CHECK:	st3	{v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x00,0x0c]
+
+         st3 {v0.16b-v2.16b}, [x0]
+         st3 {v15.8h-v17.8h}, [x15]
+         st3 {v31.4s-v1.4s}, [sp]
+         st3 {v0.2d-v2.2d}, [x0]
+         st3 {v0.8b-v2.8b}, [x0]
+         st3 {v15.4h-v17.4h}, [x15]
+         st3 {v31.2s-v1.2s}, [sp]
+// CHECK:	st3	{v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x00,0x4c]
+// CHECK:	st3	{v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x00,0x4c]
+// CHECK:	st3	{v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x00,0x4c]
+// CHECK:	st3	{v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x00,0x4c]
+// CHECK:	st3	{v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x00,0x0c]
+// CHECK:	st3	{v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x00,0x0c]
+// CHECK:	st3	{v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x00,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 4-element structures from four consecutive registers
+//------------------------------------------------------------------------------
+         st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+         st4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
+         st4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
+         st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
+         st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+         st4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
+         st4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+// CHECK:	st4	{v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x00,0x4c]
+// CHECK:	st4	{v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x00,0x4c]
+// CHECK:	st4	{v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x00,0x4c]
+// CHECK:	st4	{v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x00,0x4c]
+// CHECK:	st4	{v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x00,0x0c]
+// CHECK:	st4	{v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x00,0x0c]
+// CHECK:	st4	{v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x00,0x0c]
+
+         st4 {v0.16b-v3.16b}, [x0]
+         st4 {v15.8h-v18.8h}, [x15]
+         st4 {v31.4s-v2.4s}, [sp]
+         st4 {v0.2d-v3.2d}, [x0]
+         st4 {v0.8b-v3.8b}, [x0]
+         st4 {v15.4h-v18.4h}, [x15]
+         st4 {v31.2s-v2.2s}, [sp]
+// CHECK:	st4	{v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x00,0x4c]
+// CHECK:	st4	{v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x00,0x4c]
+// CHECK:	st4	{v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x00,0x4c]
+// CHECK:	st4	{v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x00,0x4c]
+// CHECK:	st4	{v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x00,0x0c]
+// CHECK:	st4	{v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x00,0x0c]
+// CHECK:	st4	{v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x00,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 1-element structures to one register
+//------------------------------------------------------------------------------
+         ld1 {v0.16b}, [x0]
+         ld1 {v15.8h}, [x15]
+         ld1 {v31.4s}, [sp]
+         ld1 {v0.2d}, [x0]
+         ld1 {v0.8b}, [x0]
+         ld1 {v15.4h}, [x15]
+         ld1 {v31.2s}, [sp]
+         ld1 {v0.1d}, [x0]
+// CHECK:	ld1	{v0.16b}, [x0]          // encoding: [0x00,0x70,0x40,0x4c]
+// CHECK:	ld1	{v15.8h}, [x15]         // encoding: [0xef,0x75,0x40,0x4c]
+// CHECK:	ld1	{v31.4s}, [sp]          // encoding: [0xff,0x7b,0x40,0x4c]
+// CHECK:	ld1	{v0.2d}, [x0]           // encoding: [0x00,0x7c,0x40,0x4c]
+// CHECK:	ld1	{v0.8b}, [x0]           // encoding: [0x00,0x70,0x40,0x0c]
+// CHECK:	ld1	{v15.4h}, [x15]         // encoding: [0xef,0x75,0x40,0x0c]
+// CHECK:	ld1	{v31.2s}, [sp]          // encoding: [0xff,0x7b,0x40,0x0c]
+// CHECK:	ld1	{v0.1d}, [x0]           // encoding: [0x00,0x7c,0x40,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 1-element structures to two consecutive registers
+//------------------------------------------------------------------------------
+         ld1 {v0.16b, v1.16b}, [x0]
+         ld1 {v15.8h, v16.8h}, [x15]
+         ld1 {v31.4s, v0.4s}, [sp]
+         ld1 {v0.2d, v1.2d}, [x0]
+         ld1 {v0.8b, v1.8b}, [x0]
+         ld1 {v15.4h, v16.4h}, [x15]
+         ld1 {v31.2s, v0.2s}, [sp]
+         ld1 {v0.1d, v1.1d}, [x0]
+// CHECK:	ld1	{v0.16b, v1.16b}, [x0]  // encoding: [0x00,0xa0,0x40,0x4c]
+// CHECK:	ld1	{v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x40,0x4c]
+// CHECK:	ld1	{v31.4s, v0.4s}, [sp]   // encoding: [0xff,0xab,0x40,0x4c]
+// CHECK:	ld1	{v0.2d, v1.2d}, [x0]    // encoding: [0x00,0xac,0x40,0x4c]
+// CHECK:	ld1	{v0.8b, v1.8b}, [x0]    // encoding: [0x00,0xa0,0x40,0x0c]
+// CHECK:	ld1	{v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x40,0x0c]
+// CHECK:	ld1	{v31.2s, v0.2s}, [sp]   // encoding: [0xff,0xab,0x40,0x0c]
+// CHECK:	ld1	{v0.1d, v1.1d}, [x0]    // encoding: [0x00,0xac,0x40,0x0c]
+
+         ld1 {v0.16b-v1.16b}, [x0]
+         ld1 {v15.8h-v16.8h}, [x15]
+         ld1 {v31.4s-v0.4s}, [sp]
+         ld1 {v0.2d-v1.2d}, [x0]
+         ld1 {v0.8b-v1.8b}, [x0]
+         ld1 {v15.4h-v16.4h}, [x15]
+         ld1 {v31.2s-v0.2s}, [sp]
+         ld1 {v0.1d-v1.1d}, [x0]
+// CHECK:	ld1	{v0.16b, v1.16b}, [x0]  // encoding: [0x00,0xa0,0x40,0x4c]
+// CHECK:	ld1	{v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x40,0x4c]
+// CHECK:	ld1	{v31.4s, v0.4s}, [sp]   // encoding: [0xff,0xab,0x40,0x4c]
+// CHECK:	ld1	{v0.2d, v1.2d}, [x0]    // encoding: [0x00,0xac,0x40,0x4c]
+// CHECK:	ld1	{v0.8b, v1.8b}, [x0]    // encoding: [0x00,0xa0,0x40,0x0c]
+// CHECK:	ld1	{v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x40,0x0c]
+// CHECK:	ld1	{v31.2s, v0.2s}, [sp]   // encoding: [0xff,0xab,0x40,0x0c]
+// CHECK:	ld1	{v0.1d, v1.1d}, [x0]    // encoding: [0x00,0xac,0x40,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 1-element structures to three consecutive registers
+//------------------------------------------------------------------------------
+         ld1 {v0.16b, v1.16b, v2.16b}, [x0]
+         ld1 {v15.8h, v16.8h, v17.8h}, [x15]
+         ld1 {v31.4s, v0.4s, v1.4s}, [sp]
+         ld1 {v0.2d, v1.2d, v2.2d}, [x0]
+         ld1 {v0.8b, v1.8b, v2.8b}, [x0]
+         ld1 {v15.4h, v16.4h, v17.4h}, [x15]
+         ld1 {v31.2s, v0.2s, v1.2s}, [sp]
+         ld1 {v0.1d, v1.1d, v2.1d}, [x0]
+// CHECK:	ld1	{v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x40,0x4c]
+// CHECK:	ld1	{v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x40,0x4c]
+// CHECK:	ld1	{v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x40,0x4c]
+// CHECK:	ld1	{v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x40,0x4c]
+// CHECK:	ld1	{v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x40,0x0c]
+// CHECK:	ld1	{v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x40,0x0c]
+// CHECK:	ld1	{v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x40,0x0c]
+// CHECK:	ld1	{v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x40,0x0c]
+
+         ld1 {v0.16b-v2.16b}, [x0]
+         ld1 {v15.8h-v17.8h}, [x15]
+         ld1 {v31.4s-v1.4s}, [sp]
+         ld1 {v0.2d-v2.2d}, [x0]
+         ld1 {v0.8b-v2.8b}, [x0]
+         ld1 {v15.4h-v17.4h}, [x15]
+         ld1 {v31.2s-v1.2s}, [sp]
+         ld1 {v0.1d-v2.1d}, [x0]
+// CHECK:	ld1	{v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x40,0x4c]
+// CHECK:	ld1	{v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x40,0x4c]
+// CHECK:	ld1	{v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x40,0x4c]
+// CHECK:	ld1	{v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x40,0x4c]
+// CHECK:	ld1	{v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x40,0x0c]
+// CHECK:	ld1	{v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x40,0x0c]
+// CHECK:	ld1	{v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x40,0x0c]
+// CHECK:	ld1	{v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x40,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 1-element structures to four consecutive registers
+//------------------------------------------------------------------------------
+         ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+         ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
+         ld1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
+         ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
+         ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+         ld1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
+         ld1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+         ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0]
+// CHECK:	ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x40,0x4c]
+// CHECK:	ld1	{v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x40,0x4c]
+// CHECK:	ld1	{v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x40,0x4c]
+// CHECK:	ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x40,0x4c]
+// CHECK:	ld1	{v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x40,0x0c]
+// CHECK:	ld1	{v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x40,0x0c]
+// CHECK:	ld1	{v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x40,0x0c]
+// CHECK:	ld1	{v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x40,0x0c]
+
+         ld1 {v0.16b-v3.16b}, [x0]
+         ld1 {v15.8h-v18.8h}, [x15]
+         ld1 {v31.4s-v2.4s}, [sp]
+         ld1 {v0.2d-v3.2d}, [x0]
+         ld1 {v0.8b-v3.8b}, [x0]
+         ld1 {v15.4h-v18.4h}, [x15]
+         ld1 {v31.2s-v2.2s}, [sp]
+         ld1 {v0.1d-v3.1d}, [x0]
+// CHECK:	ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x40,0x4c]
+// CHECK:	ld1	{v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x40,0x4c]
+// CHECK:	ld1	{v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x40,0x4c]
+// CHECK:	ld1	{v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x40,0x4c]
+// CHECK:	ld1	{v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x40,0x0c]
+// CHECK:	ld1	{v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x40,0x0c]
+// CHECK:	ld1	{v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x40,0x0c]
+// CHECK:	ld1	{v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x40,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 4-element structures to two consecutive registers
+//------------------------------------------------------------------------------
+         ld2 {v0.16b, v1.16b}, [x0]
+         ld2 {v15.8h, v16.8h}, [x15]
+         ld2 {v31.4s, v0.4s}, [sp]
+         ld2 {v0.2d, v1.2d}, [x0]
+         ld2 {v0.8b, v1.8b}, [x0]
+         ld2 {v15.4h, v16.4h}, [x15]
+         ld2 {v31.2s, v0.2s}, [sp]
+// CHECK:	ld2	{v0.16b, v1.16b}, [x0]  // encoding: [0x00,0x80,0x40,0x4c]
+// CHECK:	ld2	{v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x40,0x4c]
+// CHECK:	ld2	{v31.4s, v0.4s}, [sp]   // encoding: [0xff,0x8b,0x40,0x4c]
+// CHECK:	ld2	{v0.2d, v1.2d}, [x0]    // encoding: [0x00,0x8c,0x40,0x4c]
+// CHECK:	ld2	{v0.8b, v1.8b}, [x0]    // encoding: [0x00,0x80,0x40,0x0c]
+// CHECK:	ld2	{v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x40,0x0c]
+// CHECK:	ld2	{v31.2s, v0.2s}, [sp]   // encoding: [0xff,0x8b,0x40,0x0c]
+
+         ld2 {v0.16b-v1.16b}, [x0]
+         ld2 {v15.8h-v16.8h}, [x15]
+         ld2 {v31.4s-v0.4s}, [sp]
+         ld2 {v0.2d-v1.2d}, [x0]
+         ld2 {v0.8b-v1.8b}, [x0]
+         ld2 {v15.4h-v16.4h}, [x15]
+         ld2 {v31.2s-v0.2s}, [sp]
+// CHECK:	ld2	{v0.16b, v1.16b}, [x0]  // encoding: [0x00,0x80,0x40,0x4c]
+// CHECK:	ld2	{v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x40,0x4c]
+// CHECK:	ld2	{v31.4s, v0.4s}, [sp]   // encoding: [0xff,0x8b,0x40,0x4c]
+// CHECK:	ld2	{v0.2d, v1.2d}, [x0]    // encoding: [0x00,0x8c,0x40,0x4c]
+// CHECK:	ld2	{v0.8b, v1.8b}, [x0]    // encoding: [0x00,0x80,0x40,0x0c]
+// CHECK:	ld2	{v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x40,0x0c]
+// CHECK:	ld2	{v31.2s, v0.2s}, [sp]   // encoding: [0xff,0x8b,0x40,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 3-element structures to three consecutive registers
+//------------------------------------------------------------------------------
+         ld3 {v0.16b, v1.16b, v2.16b}, [x0]
+         ld3 {v15.8h, v16.8h, v17.8h}, [x15]
+         ld3 {v31.4s, v0.4s, v1.4s}, [sp]
+         ld3 {v0.2d, v1.2d, v2.2d}, [x0]
+         ld3 {v0.8b, v1.8b, v2.8b}, [x0]
+         ld3 {v15.4h, v16.4h, v17.4h}, [x15]
+         ld3 {v31.2s, v0.2s, v1.2s}, [sp]
+// CHECK:	ld3	{v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x40,0x4c]
+// CHECK:	ld3	{v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x40,0x4c]
+// CHECK:	ld3	{v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x40,0x4c]
+// CHECK:	ld3	{v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x40,0x4c]
+// CHECK:	ld3	{v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x40,0x0c]
+// CHECK:	ld3	{v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x40,0x0c]
+// CHECK:	ld3	{v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x40,0x0c]
+
+         ld3 {v0.16b-v2.16b}, [x0]
+         ld3 {v15.8h-v17.8h}, [x15]
+         ld3 {v31.4s-v1.4s}, [sp]
+         ld3 {v0.2d-v2.2d}, [x0]
+         ld3 {v0.8b-v2.8b}, [x0]
+         ld3 {v15.4h-v17.4h}, [x15]
+         ld3 {v31.2s-v1.2s}, [sp]
+// CHECK:	ld3	{v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x40,0x4c]
+// CHECK:	ld3	{v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x40,0x4c]
+// CHECK:	ld3	{v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x40,0x4c]
+// CHECK:	ld3	{v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x40,0x4c]
+// CHECK:	ld3	{v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x40,0x0c]
+// CHECK:	ld3	{v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x40,0x0c]
+// CHECK:	ld3	{v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x40,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 4-element structures to four consecutive registers
+//------------------------------------------------------------------------------
+         ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+         ld4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
+         ld4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
+         ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
+         ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+         ld4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
+         ld4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+// CHECK:	ld4	{v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x40,0x4c]
+// CHECK:	ld4	{v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x40,0x4c]
+// CHECK:	ld4	{v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x40,0x4c]
+// CHECK:	ld4	{v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x40,0x4c]
+// CHECK:	ld4	{v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x40,0x0c]
+// CHECK:	ld4	{v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x40,0x0c]
+// CHECK:	ld4	{v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x40,0x0c]
+
+         ld4 {v0.16b-v3.16b}, [x0]
+         ld4 {v15.8h-v18.8h}, [x15]
+         ld4 {v31.4s-v2.4s}, [sp]
+         ld4 {v0.2d-v3.2d}, [x0]
+         ld4 {v0.8b-v3.8b}, [x0]
+         ld4 {v15.4h-v18.4h}, [x15]
+         ld4 {v31.2s-v2.2s}, [sp]
+// CHECK:	ld4	{v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x40,0x4c]
+// CHECK:	ld4	{v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x40,0x4c]
+// CHECK:	ld4	{v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x40,0x4c]
+// CHECK:	ld4	{v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x40,0x4c]
+// CHECK:	ld4	{v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x40,0x0c]
+// CHECK:	ld4	{v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x40,0x0c]
+// CHECK:	ld4	{v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x40,0x0c]
diff --git a/test/MC/AArch64/neon-simd-ldst-one-elem.s b/test/MC/AArch64/neon-simd-ldst-one-elem.s
new file mode 100644
index 0000000..140d752
--- /dev/null
+++ b/test/MC/AArch64/neon-simd-ldst-one-elem.s
@@ -0,0 +1,325 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Load single 1-element structure to all lanes of 1 register
+//------------------------------------------------------------------------------
+         ld1r {v0.16b}, [x0]
+         ld1r {v15.8h}, [x15]
+         ld1r {v31.4s}, [sp]
+         ld1r {v0.2d}, [x0]
+         ld1r {v0.8b}, [x0]
+         ld1r {v15.4h}, [x15]
+         ld1r {v31.2s}, [sp]
+         ld1r {v0.1d}, [x0]
+// CHECK: ld1r {v0.16b}, [x0]          // encoding: [0x00,0xc0,0x40,0x4d]
+// CHECK: ld1r {v15.8h}, [x15]         // encoding: [0xef,0xc5,0x40,0x4d]
+// CHECK: ld1r {v31.4s}, [sp]          // encoding: [0xff,0xcb,0x40,0x4d]
+// CHECK: ld1r {v0.2d}, [x0]           // encoding: [0x00,0xcc,0x40,0x4d]
+// CHECK: ld1r {v0.8b}, [x0]           // encoding: [0x00,0xc0,0x40,0x0d]
+// CHECK: ld1r {v15.4h}, [x15]         // encoding: [0xef,0xc5,0x40,0x0d]
+// CHECK: ld1r {v31.2s}, [sp]          // encoding: [0xff,0xcb,0x40,0x0d]
+// CHECK: ld1r {v0.1d}, [x0]           // encoding: [0x00,0xcc,0x40,0x0d]
+
+//------------------------------------------------------------------------------
+// Load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+         ld2r {v0.16b, v1.16b}, [x0]
+         ld2r {v15.8h, v16.8h}, [x15]
+         ld2r {v31.4s, v0.4s}, [sp]
+         ld2r {v0.2d, v1.2d}, [x0]
+         ld2r {v0.8b, v1.8b}, [x0]
+         ld2r {v15.4h, v16.4h}, [x15]
+         ld2r {v31.2s, v0.2s}, [sp]
+         ld2r {v31.1d, v0.1d}, [sp]
+// CHECK: ld2r {v0.16b, v1.16b}, [x0]  // encoding: [0x00,0xc0,0x60,0x4d]
+// CHECK: ld2r {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xc5,0x60,0x4d]
+// CHECK: ld2r {v31.4s, v0.4s}, [sp]   // encoding: [0xff,0xcb,0x60,0x4d]
+// CHECK: ld2r {v0.2d, v1.2d}, [x0]    // encoding: [0x00,0xcc,0x60,0x4d]
+// CHECK: ld2r {v0.8b, v1.8b}, [x0]    // encoding: [0x00,0xc0,0x60,0x0d]
+// CHECK: ld2r {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xc5,0x60,0x0d]
+// CHECK: ld2r {v31.2s, v0.2s}, [sp]   // encoding: [0xff,0xcb,0x60,0x0d]
+// CHECK: ld2r {v31.1d, v0.1d}, [sp]   // encoding: [0xff,0xcf,0x60,0x0d]
+
+         ld3r {v0.16b, v1.16b, v2.16b}, [x0]
+         ld3r {v15.8h, v16.8h, v17.8h}, [x15]
+         ld3r {v31.4s, v0.4s, v1.4s}, [sp]
+         ld3r {v0.2d, v1.2d, v2.2d}, [x0]
+         ld3r {v0.8b, v1.8b, v2.8b}, [x0]
+         ld3r {v15.4h, v16.4h, v17.4h}, [x15]
+         ld3r {v31.2s, v0.2s, v1.2s}, [sp]
+         ld3r {v31.1d, v0.1d, v1.1d}, [sp]
+// CHECK: ld3r {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0xe0,0x40,0x4d]
+// CHECK: ld3r {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0xe5,0x40,0x4d]
+// CHECK: ld3r {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0xeb,0x40,0x4d]
+// CHECK: ld3r {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0xec,0x40,0x4d]
+// CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0xe0,0x40,0x0d]
+// CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0xe5,0x40,0x0d]
+// CHECK: ld3r {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0xeb,0x40,0x0d]
+// CHECK: ld3r {v31.1d, v0.1d, v1.1d}, [sp] // encoding: [0xff,0xef,0x40,0x0d]
+
+         ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+         ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
+         ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
+         ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
+         ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+         ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
+         ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+         ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp]
+// CHECK: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0xe0,0x60,0x4d]
+// CHECK: ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0xe5,0x60,0x4d]
+// CHECK: ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0xeb,0x60,0x4d]
+// CHECK: ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0xec,0x60,0x4d]
+// CHECK: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0xe0,0x60,0x0d]
+// CHECK: ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0xe5,0x60,0x0d]
+// CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0xeb,0x60,0x0d]
+// CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp] // encoding: [0xff,0xef,0x60,0x0d]
+
+//------------------------------------------------------------------------------
+// Load single 1-element structure to one lane of 1 register.
+//------------------------------------------------------------------------------
+         ld1 {v0.b}[9], [x0]
+         ld1 {v15.h}[7], [x15]
+         ld1 {v31.s}[3], [sp]
+         ld1 {v0.d}[1], [x0]
+// CHECK: ld1 {v0.b}[9], [x0]         // encoding: [0x00,0x04,0x40,0x4d]
+// CHECK: ld1 {v15.h}[7], [x15]       // encoding: [0xef,0x59,0x40,0x4d]
+// CHECK: ld1 {v31.s}[3], [sp]        // encoding: [0xff,0x93,0x40,0x4d]
+// CHECK: ld1 {v0.d}[1], [x0]         // encoding: [0x00,0x84,0x40,0x4d]
+
+//------------------------------------------------------------------------------
+// Load single N-element structure to one lane of N consecutive registers
+// (N = 2,3,4)
+//------------------------------------------------------------------------------
+         ld2 {v0.b, v1.b}[9], [x0]
+         ld2 {v15.h, v16.h}[7], [x15]
+         ld2 {v31.s, v0.s}[3], [sp]
+         ld2 {v0.d, v1.d}[1], [x0]
+// CHECK: ld2 {v0.b, v1.b}[9], [x0]   // encoding: [0x00,0x04,0x60,0x4d]
+// CHECK: ld2 {v15.h, v16.h}[7], [x15] // encoding: [0xef,0x59,0x60,0x4d]
+// CHECK: ld2 {v31.s, v0.s}[3], [sp]  // encoding: [0xff,0x93,0x60,0x4d]
+// CHECK: ld2 {v0.d, v1.d}[1], [x0]   // encoding: [0x00,0x84,0x60,0x4d]
+
+         ld3 {v0.b, v1.b, v2.b}[9], [x0]
+         ld3 {v15.h, v16.h, v17.h}[7], [x15]
+         ld3 {v31.s, v0.s, v1.s}[3], [sp]
+         ld3 {v0.d, v1.d, v2.d}[1], [x0]
+// CHECK: ld3 {v0.b, v1.b, v2.b}[9], [x0] // encoding: [0x00,0x24,0x40,0x4d]
+// CHECK: ld3 {v15.h, v16.h, v17.h}[7], [x15] // encoding: [0xef,0x79,0x40,0x4d]
+// CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp] // encoding: [0xff,0xb3,0x40,0x4d]
+// CHECK: ld3 {v0.d, v1.d, v2.d}[1], [x0] // encoding: [0x00,0xa4,0x40,0x4d]
+
+         ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0]
+         ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15]
+         ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp]
+         ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0]
+// CHECK: ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] // encoding: [0x00,0x24,0x60,0x4d]
+// CHECK: ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15] // encoding: [0xef,0x79,0x60,0x4d]
+// CHECK: ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp] // encoding: [0xff,0xb3,0x60,0x4d]
+// CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] // encoding: [0x00,0xa4,0x60,0x4d]
+
+//------------------------------------------------------------------------------
+// Store single 1-element structure from one lane of 1 register.
+//------------------------------------------------------------------------------
+         st1 {v0.b}[9], [x0]
+         st1 {v15.h}[7], [x15]
+         st1 {v31.s}[3], [sp]
+         st1 {v0.d}[1], [x0]
+// CHECK: st1 {v0.b}[9], [x0]         // encoding: [0x00,0x04,0x00,0x4d]
+// CHECK: st1 {v15.h}[7], [x15]       // encoding: [0xef,0x59,0x00,0x4d]
+// CHECK: st1 {v31.s}[3], [sp]        // encoding: [0xff,0x93,0x00,0x4d]
+// CHECK: st1 {v0.d}[1], [x0]         // encoding: [0x00,0x84,0x00,0x4d]
+
+//------------------------------------------------------------------------------
+// Store single N-element structure from one lane of N consecutive registers
+// (N = 2,3,4)
+//------------------------------------------------------------------------------
+         st2 {v0.b, v1.b}[9], [x0]
+         st2 {v15.h, v16.h}[7], [x15]
+         st2 {v31.s, v0.s}[3], [sp]
+         st2 {v0.d, v1.d}[1], [x0]
+// CHECK: st2 {v0.b, v1.b}[9], [x0]   // encoding: [0x00,0x04,0x20,0x4d]
+// CHECK: st2 {v15.h, v16.h}[7], [x15] // encoding: [0xef,0x59,0x20,0x4d]
+// CHECK: st2 {v31.s, v0.s}[3], [sp]  // encoding: [0xff,0x93,0x20,0x4d]
+// CHECK: st2 {v0.d, v1.d}[1], [x0]   // encoding: [0x00,0x84,0x20,0x4d]
+
+         st3 {v0.b, v1.b, v2.b}[9], [x0]
+         st3 {v15.h, v16.h, v17.h}[7], [x15]
+         st3 {v31.s, v0.s, v1.s}[3], [sp]
+         st3 {v0.d, v1.d, v2.d}[1], [x0]
+// CHECK: st3 {v0.b, v1.b, v2.b}[9], [x0] // encoding: [0x00,0x24,0x00,0x4d]
+// CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15] // encoding: [0xef,0x79,0x00,0x4d]
+// CHECK: st3 {v31.s, v0.s, v1.s}[3], [sp] // encoding: [0xff,0xb3,0x00,0x4d]
+// CHECK: st3 {v0.d, v1.d, v2.d}[1], [x0] // encoding: [0x00,0xa4,0x00,0x4d]
+
+         st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0]
+         st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15]
+         st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp]
+         st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0]
+// CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] // encoding: [0x00,0x24,0x20,0x4d]
+// CHECK: st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15] // encoding: [0xef,0x79,0x20,0x4d]
+// CHECK: st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp] // encoding: [0xff,0xb3,0x20,0x4d]
+// CHECK: st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] // encoding: [0x00,0xa4,0x20,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index oad single 1-element structure to all lanes of 1 register
+//------------------------------------------------------------------------------
+         ld1r {v0.16b}, [x0], #1
+         ld1r {v15.8h}, [x15], #2
+         ld1r {v31.4s}, [sp], #4
+         ld1r {v0.2d}, [x0], #8
+         ld1r {v0.8b}, [x0], x0
+         ld1r {v15.4h}, [x15], x1
+         ld1r {v31.2s}, [sp], x2
+         ld1r {v0.1d}, [x0], x3
+// CHECK: ld1r {v0.16b}, [x0], #1      // encoding: [0x00,0xc0,0xdf,0x4d]
+// CHECK: ld1r {v15.8h}, [x15], #2     // encoding: [0xef,0xc5,0xdf,0x4d]
+// CHECK: ld1r {v31.4s}, [sp], #4      // encoding: [0xff,0xcb,0xdf,0x4d]
+// CHECK: ld1r {v0.2d}, [x0], #8       // encoding: [0x00,0xcc,0xdf,0x4d]
+// CHECK: ld1r {v0.8b}, [x0], x0       // encoding: [0x00,0xc0,0xc0,0x0d]
+// CHECK: ld1r {v15.4h}, [x15], x1     // encoding: [0xef,0xc5,0xc1,0x0d]
+// CHECK: ld1r {v31.2s}, [sp], x2      // encoding: [0xff,0xcb,0xc2,0x0d]
+// CHECK: ld1r {v0.1d}, [x0], x3       // encoding: [0x00,0xcc,0xc3,0x0d]
+
+//------------------------------------------------------------------------------
+// Post-index load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+         ld2r {v0.16b, v1.16b}, [x0], #2
+         ld2r {v15.8h, v16.8h}, [x15], #4
+         ld2r {v31.4s, v0.4s}, [sp], #8
+         ld2r {v0.2d, v1.2d}, [x0], #16
+         ld2r {v0.8b, v1.8b}, [x0], x6
+         ld2r {v15.4h, v16.4h}, [x15], x7
+         ld2r {v31.2s, v0.2s}, [sp], x9
+         ld2r {v31.1d, v0.1d}, [x0], x5
+// CHECK: ld2r {v0.16b, v1.16b}, [x0], #2 // encoding: [0x00,0xc0,0xff,0x4d]
+// CHECK: ld2r {v15.8h, v16.8h}, [x15], #4 // encoding: [0xef,0xc5,0xff,0x4d]
+// CHECK: ld2r {v31.4s, v0.4s}, [sp], #8 // encoding: [0xff,0xcb,0xff,0x4d]
+// CHECK: ld2r {v0.2d, v1.2d}, [x0], #16 // encoding: [0x00,0xcc,0xff,0x4d]
+// CHECK: ld2r {v0.8b, v1.8b}, [x0], x6 // encoding: [0x00,0xc0,0xe6,0x0d]
+// CHECK: ld2r {v15.4h, v16.4h}, [x15], x7 // encoding: [0xef,0xc5,0xe7,0x0d]
+// CHECK: ld2r {v31.2s, v0.2s}, [sp], x9 // encoding: [0xff,0xcb,0xe9,0x0d]
+// CHECK: ld2r {v31.1d, v0.1d}, [x0], x5 // encoding: [0x1f,0xcc,0xe5,0x0d]
+
+         ld3r {v0.16b, v1.16b, v2.16b}, [x0], x9
+         ld3r {v15.8h, v16.8h, v17.8h}, [x15], x6
+         ld3r {v31.4s, v0.4s, v1.4s}, [sp], x7
+         ld3r {v0.2d, v1.2d, v2.2d}, [x0], x5
+         ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3
+         ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6
+         ld3r {v31.2s, v0.2s, v1.2s}, [sp], #12
+         ld3r {v31.1d, v0.1d, v1.1d}, [sp], #24
+// CHECK: ld3r {v0.16b, v1.16b, v2.16b}, [x0], x9 // encoding: [0x00,0xe0,0xc9,0x4d]
+// CHECK: ld3r {v15.8h, v16.8h, v17.8h}, [x15], x6 // encoding: [0xef,0xe5,0xc6,0x4d]
+// CHECK: ld3r {v31.4s, v0.4s, v1.4s}, [sp], x7 // encoding: [0xff,0xeb,0xc7,0x4d]
+// CHECK: ld3r {v0.2d, v1.2d, v2.2d}, [x0], x5 // encoding: [0x00,0xec,0xc5,0x4d]
+// CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3 // encoding: [0x00,0xe0,0xdf,0x0d]
+// CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6 // encoding: [0xef,0xe5,0xdf,0x0d]
+// CHECK: ld3r {v31.2s, v0.2s, v1.2s}, [sp], #12 // encoding: [0xff,0xeb,0xdf,0x0d]
+// CHECK: ld3r {v31.1d, v0.1d, v1.1d}, [sp], #24 // encoding: [0xff,0xef,0xdf,0x0d]
+
+         ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #4
+         ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], #8
+         ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #16
+         ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #32
+         ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x5
+         ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x9
+         ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30
+         ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7
+// CHECK: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #4 // encoding: [0x00,0xe0,0xff,0x4d]
+// CHECK: ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], #8 // encoding: [0xef,0xe5,0xff,0x4d]
+// CHECK: ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #16 // encoding: [0xff,0xeb,0xff,0x4d]
+// CHECK: ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #32 // encoding: [0x00,0xec,0xff,0x4d]
+// CHECK: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x5 // encoding: [0x00,0xe0,0xe5,0x0d]
+// CHECK: ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x9 // encoding: [0xef,0xe5,0xe9,0x0d]
+// CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30 // encoding: [0xff,0xeb,0xfe,0x0d]
+// CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7 // encoding: [0xff,0xef,0xe7,0x0d]
+
+//------------------------------------------------------------------------------
+// Post-index load single 1-element structure to one lane of 1 register.
+//------------------------------------------------------------------------------
+         ld1 {v0.b}[9], [x0], #1
+         ld1 {v15.h}[7], [x15], x9
+         ld1 {v31.s}[3], [sp], x6
+         ld1 {v0.d}[1], [x0], #8
+// CHECK: ld1 {v0.b}[9], [x0], #1     // encoding: [0x00,0x04,0xdf,0x4d]
+// CHECK: ld1 {v15.h}[7], [x15], x9   // encoding: [0xef,0x59,0xc9,0x4d]
+// CHECK: ld1 {v31.s}[3], [sp], x6    // encoding: [0xff,0x93,0xc6,0x4d]
+// CHECK: ld1 {v0.d}[1], [x0], #8     // encoding: [0x00,0x84,0xdf,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index load single N-element structure to one lane of N consecutive
+// registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+         ld2 {v0.b, v1.b}[9], [x0], x3
+         ld2 {v15.h, v16.h}[7], [x15], #4
+         ld2 {v31.s, v0.s}[3], [sp], #8
+         ld2 {v0.d, v1.d}[1], [x0], x0
+// CHECK: ld2 {v0.b, v1.b}[9], [x0], x3 // encoding: [0x00,0x04,0xe3,0x4d]
+// CHECK: ld2 {v15.h, v16.h}[7], [x15], #4 // encoding: [0xef,0x59,0xff,0x4d]
+// CHECK: ld2 {v31.s, v0.s}[3], [sp], #8 // encoding: [0xff,0x93,0xff,0x4d]
+// CHECK: ld2 {v0.d, v1.d}[1], [x0], x0 // encoding: [0x00,0x84,0xe0,0x4d]
+
+         ld3 {v0.b, v1.b, v2.b}[9], [x0], #3
+         ld3 {v15.h, v16.h, v17.h}[7], [x15], #6
+         ld3 {v31.s, v0.s, v1.s}[3], [sp], x3
+         ld3 {v0.d, v1.d, v2.d}[1], [x0], x6
+// CHECK: ld3 {v0.b, v1.b, v2.b}[9], [x0], #3 // encoding: [0x00,0x24,0xdf,0x4d]
+// CHECK: ld3 {v15.h, v16.h, v17.h}[7], [x15], #6 // encoding: [0xef,0x79,0xdf,0x4d]
+// CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp], x3 // encoding: [0xff,0xb3,0xc3,0x4d]
+// CHECK: ld3 {v0.d, v1.d, v2.d}[1], [x0], x6 // encoding: [0x00,0xa4,0xc6,0x4d]
+
+         ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5
+         ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7
+         ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16
+         ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
+// CHECK: ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 // encoding: [0x00,0x24,0xe5,0x4d]
+// CHECK: ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7 // encoding: [0xef,0x79,0xe7,0x4d]
+// CHECK: ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16 // encoding: [0xff,0xb3,0xff,0x4d]
+// CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 // encoding: [0x00,0xa4,0xff,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index store single 1-element structure from one lane of 1 register.
+//------------------------------------------------------------------------------
+         st1 {v0.b}[9], [x0], #1
+         st1 {v15.h}[7], [x15], x9
+         st1 {v31.s}[3], [sp], x6
+         st1 {v0.d}[1], [x0], #8
+// CHECK: st1 {v0.b}[9], [x0], #1     // encoding: [0x00,0x04,0x9f,0x4d]
+// CHECK: st1 {v15.h}[7], [x15], x9   // encoding: [0xef,0x59,0x89,0x4d]
+// CHECK: st1 {v31.s}[3], [sp], x6    // encoding: [0xff,0x93,0x86,0x4d]
+// CHECK: st1 {v0.d}[1], [x0], #8     // encoding: [0x00,0x84,0x9f,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index store single N-element structure from one lane of N consecutive
+// registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+         st2 {v0.b, v1.b}[9], [x0], x3
+         st2 {v15.h, v16.h}[7], [x15], #4
+         st2 {v31.s, v0.s}[3], [sp], #8
+         st2 {v0.d, v1.d}[1], [x0], x0
+// CHECK: st2 {v0.b, v1.b}[9], [x0], x3 // encoding: [0x00,0x04,0xa3,0x4d]
+// CHECK: st2 {v15.h, v16.h}[7], [x15], #4 // encoding: [0xef,0x59,0xbf,0x4d]
+// CHECK: st2 {v31.s, v0.s}[3], [sp], #8 // encoding: [0xff,0x93,0xbf,0x4d]
+// CHECK: st2 {v0.d, v1.d}[1], [x0], x0 // encoding: [0x00,0x84,0xa0,0x4d]
+
+         st3 {v0.b, v1.b, v2.b}[9], [x0], #3
+         st3 {v15.h, v16.h, v17.h}[7], [x15], #6
+         st3 {v31.s, v0.s, v1.s}[3], [sp], x3
+         st3 {v0.d, v1.d, v2.d}[1], [x0], x6
+// CHECK: st3 {v0.b, v1.b, v2.b}[9], [x0], #3 // encoding: [0x00,0x24,0x9f,0x4d]
+// CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15], #6 // encoding: [0xef,0x79,0x9f,0x4d]
+// CHECK: st3 {v31.s, v0.s, v1.s}[3], [sp], x3 // encoding: [0xff,0xb3,0x83,0x4d]
+// CHECK: st3 {v0.d, v1.d, v2.d}[1], [x0], x6 // encoding: [0x00,0xa4,0x86,0x4d]
+
+         st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5
+         st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7
+         st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16
+         st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
+// CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 // encoding: [0x00,0x24,0xa5,0x4d]
+// CHECK: st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7 // encoding: [0xef,0x79,0xa7,0x4d]
+// CHECK: st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16 // encoding: [0xff,0xb3,0xbf,0x4d]
+// CHECK: st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 // encoding: [0x00,0xa4,0xbf,0x4d]
diff --git a/test/MC/AArch64/neon-simd-misc.s b/test/MC/AArch64/neon-simd-misc.s
new file mode 100644
index 0000000..9e0f9c5
--- /dev/null
+++ b/test/MC/AArch64/neon-simd-misc.s
@@ -0,0 +1,646 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Element reverse
+//------------------------------------------------------------------------------
+         rev64 v0.16b, v31.16b
+         rev64 v2.8h, v4.8h
+         rev64 v6.4s, v8.4s
+         rev64 v1.8b, v9.8b
+         rev64 v13.4h, v21.4h
+         rev64 v4.2s, v0.2s
+
+// CHECK:	rev64	v0.16b, v31.16b         // encoding: [0xe0,0x0b,0x20,0x4e]
+// CHECK:	rev64	v2.8h, v4.8h            // encoding: [0x82,0x08,0x60,0x4e]
+// CHECK:	rev64	v6.4s, v8.4s            // encoding: [0x06,0x09,0xa0,0x4e]
+// CHECK:	rev64	v1.8b, v9.8b            // encoding: [0x21,0x09,0x20,0x0e]
+// CHECK:	rev64	v13.4h, v21.4h          // encoding: [0xad,0x0a,0x60,0x0e]
+// CHECK:	rev64	v4.2s, v0.2s            // encoding: [0x04,0x08,0xa0,0x0e]
+
+         rev32 v30.16b, v31.16b
+         rev32 v4.8h, v7.8h
+         rev32 v21.8b, v1.8b
+         rev32 v0.4h, v9.4h
+
+// CHECK:	rev32	v30.16b, v31.16b        // encoding: [0xfe,0x0b,0x20,0x6e]
+// CHECK:	rev32	v4.8h, v7.8h            // encoding: [0xe4,0x08,0x60,0x6e]
+// CHECK:	rev32	v21.8b, v1.8b           // encoding: [0x35,0x08,0x20,0x2e]
+// CHECK:	rev32	v0.4h, v9.4h            // encoding: [0x20,0x09,0x60,0x2e]
+
+         rev16 v30.16b, v31.16b
+         rev16 v21.8b, v1.8b
+
+// CHECK:	rev16	v30.16b, v31.16b        // encoding: [0xfe,0x1b,0x20,0x4e]
+// CHECK:	rev16	v21.8b, v1.8b           // encoding: [0x35,0x18,0x20,0x0e]
+
+//------------------------------------------------------------------------------
+// Signed integer pairwise add long
+//------------------------------------------------------------------------------
+
+         saddlp v3.8h, v21.16b
+         saddlp v8.4h, v5.8b
+         saddlp v9.4s, v1.8h
+         saddlp v0.2s, v1.4h
+         saddlp v12.2d, v4.4s
+         saddlp v17.1d, v28.2s
+
+// CHECK:	saddlp	v3.8h, v21.16b          // encoding: [0xa3,0x2a,0x20,0x4e]
+// CHECK:	saddlp	v8.4h, v5.8b            // encoding: [0xa8,0x28,0x20,0x0e]
+// CHECK:	saddlp	v9.4s, v1.8h            // encoding: [0x29,0x28,0x60,0x4e]
+// CHECK:	saddlp	v0.2s, v1.4h            // encoding: [0x20,0x28,0x60,0x0e]
+// CHECK:	saddlp	v12.2d, v4.4s           // encoding: [0x8c,0x28,0xa0,0x4e]
+// CHECK:	saddlp	v17.1d, v28.2s          // encoding: [0x91,0x2b,0xa0,0x0e]
+
+//------------------------------------------------------------------------------
+// Unsigned integer pairwise add long
+//------------------------------------------------------------------------------
+
+         uaddlp v3.8h, v21.16b
+         uaddlp v8.4h, v5.8b
+         uaddlp v9.4s, v1.8h
+         uaddlp v0.2s, v1.4h
+         uaddlp v12.2d, v4.4s
+         uaddlp v17.1d, v28.2s
+
+// CHECK:	uaddlp	v3.8h, v21.16b          // encoding: [0xa3,0x2a,0x20,0x6e]
+// CHECK:	uaddlp	v8.4h, v5.8b            // encoding: [0xa8,0x28,0x20,0x2e]
+// CHECK:	uaddlp	v9.4s, v1.8h            // encoding: [0x29,0x28,0x60,0x6e]
+// CHECK:	uaddlp	v0.2s, v1.4h            // encoding: [0x20,0x28,0x60,0x2e]
+// CHECK:	uaddlp	v12.2d, v4.4s           // encoding: [0x8c,0x28,0xa0,0x6e]
+// CHECK:	uaddlp	v17.1d, v28.2s          // encoding: [0x91,0x2b,0xa0,0x2e]
+
+//------------------------------------------------------------------------------
+// Signed integer pairwise add and accumulate long
+//------------------------------------------------------------------------------
+
+         sadalp v3.8h, v21.16b
+         sadalp v8.4h, v5.8b
+         sadalp v9.4s, v1.8h
+         sadalp v0.2s, v1.4h
+         sadalp v12.2d, v4.4s
+         sadalp v17.1d, v28.2s
+
+// CHECK:	sadalp	v3.8h, v21.16b          // encoding: [0xa3,0x6a,0x20,0x4e]
+// CHECK:	sadalp	v8.4h, v5.8b            // encoding: [0xa8,0x68,0x20,0x0e]
+// CHECK:	sadalp	v9.4s, v1.8h            // encoding: [0x29,0x68,0x60,0x4e]
+// CHECK:	sadalp	v0.2s, v1.4h            // encoding: [0x20,0x68,0x60,0x0e]
+// CHECK:	sadalp	v12.2d, v4.4s           // encoding: [0x8c,0x68,0xa0,0x4e]
+// CHECK:	sadalp	v17.1d, v28.2s          // encoding: [0x91,0x6b,0xa0,0x0e]
+
+//------------------------------------------------------------------------------
+// Unsigned integer pairwise add and accumulate long
+//------------------------------------------------------------------------------
+
+         uadalp v3.8h, v21.16b
+         uadalp v8.4h, v5.8b
+         uadalp v9.4s, v1.8h
+         uadalp v0.2s, v1.4h
+         uadalp v12.2d, v4.4s
+         uadalp v17.1d, v28.2s
+
+// CHECK:	uadalp	v3.8h, v21.16b          // encoding: [0xa3,0x6a,0x20,0x6e]
+// CHECK:	uadalp	v8.4h, v5.8b            // encoding: [0xa8,0x68,0x20,0x2e]
+// CHECK:	uadalp	v9.4s, v1.8h            // encoding: [0x29,0x68,0x60,0x6e]
+// CHECK:	uadalp	v0.2s, v1.4h            // encoding: [0x20,0x68,0x60,0x2e]
+// CHECK:	uadalp	v12.2d, v4.4s           // encoding: [0x8c,0x68,0xa0,0x6e]
+// CHECK:	uadalp	v17.1d, v28.2s          // encoding: [0x91,0x6b,0xa0,0x2e]
+
+//------------------------------------------------------------------------------
+// Signed integer saturating accumulate of unsigned value
+//------------------------------------------------------------------------------
+
+         suqadd v0.16b, v31.16b
+         suqadd v2.8h, v4.8h
+         suqadd v6.4s, v8.4s
+         suqadd v6.2d, v8.2d
+         suqadd v1.8b, v9.8b
+         suqadd v13.4h, v21.4h
+         suqadd v4.2s, v0.2s
+
+// CHECK:	suqadd	v0.16b, v31.16b         // encoding: [0xe0,0x3b,0x20,0x4e]
+// CHECK:	suqadd	v2.8h, v4.8h            // encoding: [0x82,0x38,0x60,0x4e]
+// CHECK:	suqadd	v6.4s, v8.4s            // encoding: [0x06,0x39,0xa0,0x4e]
+// CHECK:	suqadd	v6.2d, v8.2d            // encoding: [0x06,0x39,0xe0,0x4e]
+// CHECK:	suqadd	v1.8b, v9.8b            // encoding: [0x21,0x39,0x20,0x0e]
+// CHECK:	suqadd	v13.4h, v21.4h          // encoding: [0xad,0x3a,0x60,0x0e]
+// CHECK:	suqadd	v4.2s, v0.2s            // encoding: [0x04,0x38,0xa0,0x0e]
+
+//------------------------------------------------------------------------------
+// Unsigned integer saturating accumulate of signed value
+//------------------------------------------------------------------------------
+
+         usqadd v0.16b, v31.16b
+         usqadd v2.8h, v4.8h
+         usqadd v6.4s, v8.4s
+         usqadd v6.2d, v8.2d
+         usqadd v1.8b, v9.8b
+         usqadd v13.4h, v21.4h
+         usqadd v4.2s, v0.2s
+
+// CHECK:	usqadd	v0.16b, v31.16b         // encoding: [0xe0,0x3b,0x20,0x6e]
+// CHECK:	usqadd	v2.8h, v4.8h            // encoding: [0x82,0x38,0x60,0x6e]
+// CHECK:	usqadd	v6.4s, v8.4s            // encoding: [0x06,0x39,0xa0,0x6e]
+// CHECK:	usqadd	v6.2d, v8.2d            // encoding: [0x06,0x39,0xe0,0x6e]
+// CHECK:	usqadd	v1.8b, v9.8b            // encoding: [0x21,0x39,0x20,0x2e]
+// CHECK:	usqadd	v13.4h, v21.4h          // encoding: [0xad,0x3a,0x60,0x2e]
+// CHECK:	usqadd	v4.2s, v0.2s            // encoding: [0x04,0x38,0xa0,0x2e]
+
+//------------------------------------------------------------------------------
+// Integer saturating absolute
+//------------------------------------------------------------------------------
+
+         sqabs v0.16b, v31.16b
+         sqabs v2.8h, v4.8h
+         sqabs v6.4s, v8.4s
+         sqabs v6.2d, v8.2d
+         sqabs v1.8b, v9.8b
+         sqabs v13.4h, v21.4h
+         sqabs v4.2s, v0.2s
+
+// CHECK:	sqabs	v0.16b, v31.16b         // encoding: [0xe0,0x7b,0x20,0x4e]
+// CHECK:	sqabs	v2.8h, v4.8h            // encoding: [0x82,0x78,0x60,0x4e]
+// CHECK:	sqabs	v6.4s, v8.4s            // encoding: [0x06,0x79,0xa0,0x4e]
+// CHECK:	sqabs	v6.2d, v8.2d            // encoding: [0x06,0x79,0xe0,0x4e]
+// CHECK:	sqabs	v1.8b, v9.8b            // encoding: [0x21,0x79,0x20,0x0e]
+// CHECK:	sqabs	v13.4h, v21.4h          // encoding: [0xad,0x7a,0x60,0x0e]
+// CHECK:	sqabs	v4.2s, v0.2s            // encoding: [0x04,0x78,0xa0,0x0e]
+
+//------------------------------------------------------------------------------
+// Signed integer saturating negate
+//------------------------------------------------------------------------------
+
+         sqneg v0.16b, v31.16b
+         sqneg v2.8h, v4.8h
+         sqneg v6.4s, v8.4s
+         sqneg v6.2d, v8.2d
+         sqneg v1.8b, v9.8b
+         sqneg v13.4h, v21.4h
+         sqneg v4.2s, v0.2s
+
+// CHECK:	sqneg	v0.16b, v31.16b         // encoding: [0xe0,0x7b,0x20,0x6e]
+// CHECK:	sqneg	v2.8h, v4.8h            // encoding: [0x82,0x78,0x60,0x6e]
+// CHECK:	sqneg	v6.4s, v8.4s            // encoding: [0x06,0x79,0xa0,0x6e]
+// CHECK:	sqneg	v6.2d, v8.2d            // encoding: [0x06,0x79,0xe0,0x6e]
+// CHECK:	sqneg	v1.8b, v9.8b            // encoding: [0x21,0x79,0x20,0x2e]
+// CHECK:	sqneg	v13.4h, v21.4h          // encoding: [0xad,0x7a,0x60,0x2e]
+// CHECK:	sqneg	v4.2s, v0.2s            // encoding: [0x04,0x78,0xa0,0x2e]
+
+//------------------------------------------------------------------------------
+// Integer absolute
+//------------------------------------------------------------------------------
+
+         abs v0.16b, v31.16b
+         abs v2.8h, v4.8h
+         abs v6.4s, v8.4s
+         abs v6.2d, v8.2d
+         abs v1.8b, v9.8b
+         abs v13.4h, v21.4h
+         abs v4.2s, v0.2s
+
+// CHECK:	abs	v0.16b, v31.16b         // encoding: [0xe0,0xbb,0x20,0x4e]
+// CHECK:	abs	v2.8h, v4.8h            // encoding: [0x82,0xb8,0x60,0x4e]
+// CHECK:	abs	v6.4s, v8.4s            // encoding: [0x06,0xb9,0xa0,0x4e]
+// CHECK:	abs	v6.2d, v8.2d            // encoding: [0x06,0xb9,0xe0,0x4e]
+// CHECK:	abs	v1.8b, v9.8b            // encoding: [0x21,0xb9,0x20,0x0e]
+// CHECK:	abs	v13.4h, v21.4h          // encoding: [0xad,0xba,0x60,0x0e]
+// CHECK:	abs	v4.2s, v0.2s            // encoding: [0x04,0xb8,0xa0,0x0e]
+
+//------------------------------------------------------------------------------
+// Integer negate
+//------------------------------------------------------------------------------
+
+         neg v0.16b, v31.16b
+         neg v2.8h, v4.8h
+         neg v6.4s, v8.4s
+         neg v6.2d, v8.2d
+         neg v1.8b, v9.8b
+         neg v13.4h, v21.4h
+         neg v4.2s, v0.2s
+
+// CHECK:	neg	v0.16b, v31.16b         // encoding: [0xe0,0xbb,0x20,0x6e]
+// CHECK:	neg	v2.8h, v4.8h            // encoding: [0x82,0xb8,0x60,0x6e]
+// CHECK:	neg	v6.4s, v8.4s            // encoding: [0x06,0xb9,0xa0,0x6e]
+// CHECK:	neg	v6.2d, v8.2d            // encoding: [0x06,0xb9,0xe0,0x6e]
+// CHECK:	neg	v1.8b, v9.8b            // encoding: [0x21,0xb9,0x20,0x2e]
+// CHECK:	neg	v13.4h, v21.4h          // encoding: [0xad,0xba,0x60,0x2e]
+// CHECK:	neg	v4.2s, v0.2s            // encoding: [0x04,0xb8,0xa0,0x2e]
+
+//------------------------------------------------------------------------------
+// Integer count leading sign bits
+//------------------------------------------------------------------------------
+
+         cls v0.16b, v31.16b
+         cls v2.8h, v4.8h
+         cls v6.4s, v8.4s
+         cls v1.8b, v9.8b
+         cls v13.4h, v21.4h
+         cls v4.2s, v0.2s
+
+// CHECK:	cls	v0.16b, v31.16b         // encoding: [0xe0,0x4b,0x20,0x4e]
+// CHECK:	cls	v2.8h, v4.8h            // encoding: [0x82,0x48,0x60,0x4e]
+// CHECK:	cls	v6.4s, v8.4s            // encoding: [0x06,0x49,0xa0,0x4e]
+// CHECK:	cls	v1.8b, v9.8b            // encoding: [0x21,0x49,0x20,0x0e]
+// CHECK:	cls	v13.4h, v21.4h          // encoding: [0xad,0x4a,0x60,0x0e]
+// CHECK:	cls	v4.2s, v0.2s            // encoding: [0x04,0x48,0xa0,0x0e]
+
+//------------------------------------------------------------------------------
+// Integer count leading zeros
+//------------------------------------------------------------------------------
+
+         clz v0.16b, v31.16b
+         clz v2.8h, v4.8h
+         clz v6.4s, v8.4s
+         clz v1.8b, v9.8b
+         clz v13.4h, v21.4h
+         clz v4.2s, v0.2s
+
+// CHECK:	clz	v0.16b, v31.16b         // encoding: [0xe0,0x4b,0x20,0x6e]
+// CHECK:	clz	v2.8h, v4.8h            // encoding: [0x82,0x48,0x60,0x6e]
+// CHECK:	clz	v6.4s, v8.4s            // encoding: [0x06,0x49,0xa0,0x6e]
+// CHECK:	clz	v1.8b, v9.8b            // encoding: [0x21,0x49,0x20,0x2e]
+// CHECK:	clz	v13.4h, v21.4h          // encoding: [0xad,0x4a,0x60,0x2e]
+// CHECK:	clz	v4.2s, v0.2s            // encoding: [0x04,0x48,0xa0,0x2e]
+
+//------------------------------------------------------------------------------
+// Population count
+//------------------------------------------------------------------------------
+
+         cnt v0.16b, v31.16b
+         cnt v1.8b, v9.8b
+
+// CHECK:	cnt	v0.16b, v31.16b         // encoding: [0xe0,0x5b,0x20,0x4e]
+// CHECK:	cnt	v1.8b, v9.8b            // encoding: [0x21,0x59,0x20,0x0e]
+
+//------------------------------------------------------------------------------
+// Bitwise NOT
+//------------------------------------------------------------------------------
+
+         not v0.16b, v31.16b
+         not v1.8b, v9.8b
+
+// CHECK:	not	v0.16b, v31.16b         // encoding: [0xe0,0x5b,0x20,0x6e]
+// CHECK:	not	v1.8b, v9.8b            // encoding: [0x21,0x59,0x20,0x2e]
+
+//------------------------------------------------------------------------------
+// Bitwise reverse
+//------------------------------------------------------------------------------
+
+         rbit v0.16b, v31.16b
+         rbit v1.8b, v9.8b
+
+// CHECK:	rbit	v0.16b, v31.16b         // encoding: [0xe0,0x5b,0x60,0x6e]
+// CHECK:	rbit	v1.8b, v9.8b            // encoding: [0x21,0x59,0x60,0x2e]
+
+//------------------------------------------------------------------------------
+// Floating-point absolute
+//------------------------------------------------------------------------------
+
+         fabs v6.4s, v8.4s
+         fabs v6.2d, v8.2d
+         fabs v4.2s, v0.2s
+
+// CHECK:	fabs	v6.4s, v8.4s            // encoding: [0x06,0xf9,0xa0,0x4e]
+// CHECK:	fabs	v6.2d, v8.2d            // encoding: [0x06,0xf9,0xe0,0x4e]
+// CHECK:	fabs	v4.2s, v0.2s            // encoding: [0x04,0xf8,0xa0,0x0e]
+
+//------------------------------------------------------------------------------
+// Floating-point negate
+//------------------------------------------------------------------------------
+
+         fneg v6.4s, v8.4s
+         fneg v6.2d, v8.2d
+         fneg v4.2s, v0.2s
+
+// CHECK:	fneg	v6.4s, v8.4s            // encoding: [0x06,0xf9,0xa0,0x6e]
+// CHECK:	fneg	v6.2d, v8.2d            // encoding: [0x06,0xf9,0xe0,0x6e]
+// CHECK:	fneg	v4.2s, v0.2s            // encoding: [0x04,0xf8,0xa0,0x2e]
+
+//------------------------------------------------------------------------------
+// Integer extract and narrow
+//------------------------------------------------------------------------------
+
+         xtn2 v0.16b, v31.8h
+         xtn2 v2.8h, v4.4s
+         xtn2 v6.4s, v8.2d
+         xtn v1.8b, v9.8h
+         xtn v13.4h, v21.4s
+         xtn v4.2s, v0.2d
+
+// CHECK:	xtn2	v0.16b, v31.8h          // encoding: [0xe0,0x2b,0x21,0x4e]
+// CHECK:	xtn2	v2.8h, v4.4s            // encoding: [0x82,0x28,0x61,0x4e]
+// CHECK:	xtn2	v6.4s, v8.2d            // encoding: [0x06,0x29,0xa1,0x4e]
+// CHECK:	xtn	v1.8b, v9.8h            // encoding: [0x21,0x29,0x21,0x0e]
+// CHECK:	xtn	v13.4h, v21.4s          // encoding: [0xad,0x2a,0x61,0x0e]
+// CHECK:	xtn	v4.2s, v0.2d            // encoding: [0x04,0x28,0xa1,0x0e]
+
+//------------------------------------------------------------------------------
+// Signed integer saturating extract and unsigned narrow
+//------------------------------------------------------------------------------
+
+         sqxtun2 v0.16b, v31.8h
+         sqxtun2 v2.8h, v4.4s
+         sqxtun2 v6.4s, v8.2d
+         sqxtun v1.8b, v9.8h
+         sqxtun v13.4h, v21.4s
+         sqxtun v4.2s, v0.2d
+
+// CHECK:	sqxtun2	v0.16b, v31.8h          // encoding: [0xe0,0x2b,0x21,0x6e]
+// CHECK:	sqxtun2	v2.8h, v4.4s            // encoding: [0x82,0x28,0x61,0x6e]
+// CHECK:	sqxtun2	v6.4s, v8.2d            // encoding: [0x06,0x29,0xa1,0x6e]
+// CHECK:	sqxtun	v1.8b, v9.8h            // encoding: [0x21,0x29,0x21,0x2e]
+// CHECK:	sqxtun	v13.4h, v21.4s          // encoding: [0xad,0x2a,0x61,0x2e]
+// CHECK:	sqxtun	v4.2s, v0.2d            // encoding: [0x04,0x28,0xa1,0x2e]
+
+//------------------------------------------------------------------------------
+// Signed integer saturating extract and narrow
+//------------------------------------------------------------------------------
+
+         sqxtn2 v0.16b, v31.8h
+         sqxtn2 v2.8h, v4.4s
+         sqxtn2 v6.4s, v8.2d
+         sqxtn v1.8b, v9.8h
+         sqxtn v13.4h, v21.4s
+         sqxtn v4.2s, v0.2d
+
+// CHECK:	sqxtn2	v0.16b, v31.8h          // encoding: [0xe0,0x4b,0x21,0x4e]
+// CHECK:	sqxtn2	v2.8h, v4.4s            // encoding: [0x82,0x48,0x61,0x4e]
+// CHECK:	sqxtn2	v6.4s, v8.2d            // encoding: [0x06,0x49,0xa1,0x4e]
+// CHECK:	sqxtn	v1.8b, v9.8h            // encoding: [0x21,0x49,0x21,0x0e]
+// CHECK:	sqxtn	v13.4h, v21.4s          // encoding: [0xad,0x4a,0x61,0x0e]
+// CHECK:	sqxtn	v4.2s, v0.2d            // encoding: [0x04,0x48,0xa1,0x0e]
+
+//------------------------------------------------------------------------------
+// Unsigned integer saturating extract and narrow
+//------------------------------------------------------------------------------
+
+         uqxtn2 v0.16b, v31.8h
+         uqxtn2 v2.8h, v4.4s
+         uqxtn2 v6.4s, v8.2d
+         uqxtn v1.8b, v9.8h
+         uqxtn v13.4h, v21.4s
+         uqxtn v4.2s, v0.2d
+
+// CHECK:	uqxtn2	v0.16b, v31.8h          // encoding: [0xe0,0x4b,0x21,0x6e]
+// CHECK:	uqxtn2	v2.8h, v4.4s            // encoding: [0x82,0x48,0x61,0x6e]
+// CHECK:	uqxtn2	v6.4s, v8.2d            // encoding: [0x06,0x49,0xa1,0x6e]
+// CHECK:	uqxtn	v1.8b, v9.8h            // encoding: [0x21,0x49,0x21,0x2e]
+// CHECK:	uqxtn	v13.4h, v21.4s          // encoding: [0xad,0x4a,0x61,0x2e]
+// CHECK:	uqxtn	v4.2s, v0.2d            // encoding: [0x04,0x48,0xa1,0x2e]
+
+//------------------------------------------------------------------------------
+// Integer shift left long
+//------------------------------------------------------------------------------
+
+         shll2 v2.8h, v4.16b, #8
+         shll2 v6.4s, v8.8h, #16
+         shll2 v6.2d, v8.4s, #32
+         shll v2.8h, v4.8b, #8
+         shll v6.4s, v8.4h, #16
+         shll v6.2d, v8.2s, #32
+
+// CHECK:	shll2	v2.8h, v4.16b, #8      // encoding: [0x82,0x38,0x21,0x6e]
+// CHECK:	shll2	v6.4s, v8.8h, #16      // encoding: [0x06,0x39,0x61,0x6e]
+// CHECK:	shll2	v6.2d, v8.4s, #32      // encoding: [0x06,0x39,0xa1,0x6e]
+// CHECK:	shll	v2.8h, v4.8b, #8       // encoding: [0x82,0x38,0x21,0x2e]
+// CHECK:	shll	v6.4s, v8.4h, #16      // encoding: [0x06,0x39,0x61,0x2e]
+// CHECK:	shll	v6.2d, v8.2s, #32      // encoding: [0x06,0x39,0xa1,0x2e]
+
+//------------------------------------------------------------------------------
+// Floating-point convert downsize
+//------------------------------------------------------------------------------
+
+         fcvtn2 v2.8h, v4.4s
+         fcvtn2 v6.4s, v8.2d
+         fcvtn v13.4h, v21.4s
+         fcvtn v4.2s, v0.2d
+
+// CHECK:	fcvtn2	v2.8h, v4.4s            // encoding: [0x82,0x68,0x21,0x4e]
+// CHECK:	fcvtn2	v6.4s, v8.2d            // encoding: [0x06,0x69,0x61,0x4e]
+// CHECK:	fcvtn	v13.4h, v21.4s          // encoding: [0xad,0x6a,0x21,0x0e]
+// CHECK:	fcvtn	v4.2s, v0.2d            // encoding: [0x04,0x68,0x61,0x0e]
+
+//------------------------------------------------------------------------------
+// Floating-point convert downsize with inexact
+//------------------------------------------------------------------------------
+
+         fcvtxn2 v6.4s, v8.2d
+         fcvtxn v4.2s, v0.2d
+
+// CHECK:	fcvtxn2	v6.4s, v8.2d            // encoding: [0x06,0x69,0x61,0x6e]
+// CHECK:	fcvtxn	v4.2s, v0.2d            // encoding: [0x04,0x68,0x61,0x2e]
+
+//------------------------------------------------------------------------------
+// Floating-point convert upsize
+//------------------------------------------------------------------------------
+
+         fcvtl v9.4s, v1.4h
+         fcvtl v0.2d, v1.2s
+         fcvtl2 v12.4s, v4.8h
+         fcvtl2 v17.2d, v28.4s
+
+// CHECK:	fcvtl	v9.4s, v1.4h            // encoding: [0x29,0x78,0x21,0x0e]
+// CHECK:	fcvtl	v0.2d, v1.2s            // encoding: [0x20,0x78,0x61,0x0e]
+// CHECK:	fcvtl2	v12.4s, v4.8h           // encoding: [0x8c,0x78,0x21,0x4e]
+// CHECK:	fcvtl2	v17.2d, v28.4s          // encoding: [0x91,0x7b,0x61,0x4e]
+
+//------------------------------------------------------------------------------
+// Floating-point round to integral
+//------------------------------------------------------------------------------
+
+         frintn v6.4s, v8.4s
+         frintn v6.2d, v8.2d
+         frintn v4.2s, v0.2s
+
+// CHECK:	frintn	v6.4s, v8.4s            // encoding: [0x06,0x89,0x21,0x4e]
+// CHECK:	frintn	v6.2d, v8.2d            // encoding: [0x06,0x89,0x61,0x4e]
+// CHECK:	frintn	v4.2s, v0.2s            // encoding: [0x04,0x88,0x21,0x0e]
+
+         frinta v6.4s, v8.4s
+         frinta v6.2d, v8.2d
+         frinta v4.2s, v0.2s
+
+// CHECK:	frinta	v6.4s, v8.4s            // encoding: [0x06,0x89,0x21,0x6e]
+// CHECK:	frinta	v6.2d, v8.2d            // encoding: [0x06,0x89,0x61,0x6e]
+// CHECK:	frinta	v4.2s, v0.2s            // encoding: [0x04,0x88,0x21,0x2e]
+
+         frintp v6.4s, v8.4s
+         frintp v6.2d, v8.2d
+         frintp v4.2s, v0.2s
+
+// CHECK:	frintp	v6.4s, v8.4s            // encoding: [0x06,0x89,0xa1,0x4e]
+// CHECK:	frintp	v6.2d, v8.2d            // encoding: [0x06,0x89,0xe1,0x4e]
+// CHECK:	frintp	v4.2s, v0.2s            // encoding: [0x04,0x88,0xa1,0x0e]
+
+         frintm v6.4s, v8.4s
+         frintm v6.2d, v8.2d
+         frintm v4.2s, v0.2s
+
+// CHECK:	frintm	v6.4s, v8.4s            // encoding: [0x06,0x99,0x21,0x4e]
+// CHECK:	frintm	v6.2d, v8.2d            // encoding: [0x06,0x99,0x61,0x4e]
+// CHECK:	frintm	v4.2s, v0.2s            // encoding: [0x04,0x98,0x21,0x0e]
+
+         frintx v6.4s, v8.4s
+         frintx v6.2d, v8.2d
+         frintx v4.2s, v0.2s
+
+// CHECK:	frintx	v6.4s, v8.4s            // encoding: [0x06,0x99,0x21,0x6e]
+// CHECK:	frintx	v6.2d, v8.2d            // encoding: [0x06,0x99,0x61,0x6e]
+// CHECK:	frintx	v4.2s, v0.2s            // encoding: [0x04,0x98,0x21,0x2e]
+
+         frintz v6.4s, v8.4s
+         frintz v6.2d, v8.2d
+         frintz v4.2s, v0.2s
+
+// CHECK:	frintz	v6.4s, v8.4s            // encoding: [0x06,0x99,0xa1,0x4e]
+// CHECK:	frintz	v6.2d, v8.2d            // encoding: [0x06,0x99,0xe1,0x4e]
+// CHECK:	frintz	v4.2s, v0.2s            // encoding: [0x04,0x98,0xa1,0x0e]
+
+         frinti v6.4s, v8.4s
+         frinti v6.2d, v8.2d
+         frinti v4.2s, v0.2s
+
+// CHECK:	frinti	v6.4s, v8.4s            // encoding: [0x06,0x99,0xa1,0x6e]
+// CHECK:	frinti	v6.2d, v8.2d            // encoding: [0x06,0x99,0xe1,0x6e]
+// CHECK:	frinti	v4.2s, v0.2s            // encoding: [0x04,0x98,0xa1,0x2e]
+
+//------------------------------------------------------------------------------
+// Floating-point convert to integer
+//------------------------------------------------------------------------------
+
+         fcvtns v6.4s, v8.4s
+         fcvtns v6.2d, v8.2d
+         fcvtns v4.2s, v0.2s
+
+// CHECK:	fcvtns	v6.4s, v8.4s            // encoding: [0x06,0xa9,0x21,0x4e]
+// CHECK:	fcvtns	v6.2d, v8.2d            // encoding: [0x06,0xa9,0x61,0x4e]
+// CHECK:	fcvtns	v4.2s, v0.2s            // encoding: [0x04,0xa8,0x21,0x0e]
+
+         fcvtnu v6.4s, v8.4s
+         fcvtnu v6.2d, v8.2d
+         fcvtnu v4.2s, v0.2s
+
+// CHECK:	fcvtnu	v6.4s, v8.4s            // encoding: [0x06,0xa9,0x21,0x6e]
+// CHECK:	fcvtnu	v6.2d, v8.2d            // encoding: [0x06,0xa9,0x61,0x6e]
+// CHECK:	fcvtnu	v4.2s, v0.2s            // encoding: [0x04,0xa8,0x21,0x2e]
+
+         fcvtps v6.4s, v8.4s
+         fcvtps v6.2d, v8.2d
+         fcvtps v4.2s, v0.2s
+
+// CHECK:	fcvtps	v6.4s, v8.4s            // encoding: [0x06,0xa9,0xa1,0x4e]
+// CHECK:	fcvtps	v6.2d, v8.2d            // encoding: [0x06,0xa9,0xe1,0x4e]
+// CHECK:	fcvtps	v4.2s, v0.2s            // encoding: [0x04,0xa8,0xa1,0x0e]
+
+         fcvtpu v6.4s, v8.4s
+         fcvtpu v6.2d, v8.2d
+         fcvtpu v4.2s, v0.2s
+
+// CHECK:	fcvtpu	v6.4s, v8.4s            // encoding: [0x06,0xa9,0xa1,0x6e]
+// CHECK:	fcvtpu	v6.2d, v8.2d            // encoding: [0x06,0xa9,0xe1,0x6e]
+// CHECK:	fcvtpu	v4.2s, v0.2s            // encoding: [0x04,0xa8,0xa1,0x2e]
+
+         fcvtms v6.4s, v8.4s
+         fcvtms v6.2d, v8.2d
+         fcvtms v4.2s, v0.2s
+
+// CHECK:	fcvtms	v6.4s, v8.4s            // encoding: [0x06,0xb9,0x21,0x4e]
+// CHECK:	fcvtms	v6.2d, v8.2d            // encoding: [0x06,0xb9,0x61,0x4e]
+// CHECK:	fcvtms	v4.2s, v0.2s            // encoding: [0x04,0xb8,0x21,0x0e]
+
+         fcvtmu v6.4s, v8.4s
+         fcvtmu v6.2d, v8.2d
+         fcvtmu v4.2s, v0.2s
+
+// CHECK:	fcvtmu	v6.4s, v8.4s            // encoding: [0x06,0xb9,0x21,0x6e]
+// CHECK:	fcvtmu	v6.2d, v8.2d            // encoding: [0x06,0xb9,0x61,0x6e]
+// CHECK:	fcvtmu	v4.2s, v0.2s            // encoding: [0x04,0xb8,0x21,0x2e]
+
+         fcvtzs v6.4s, v8.4s
+         fcvtzs v6.2d, v8.2d
+         fcvtzs v4.2s, v0.2s
+
+// CHECK:	fcvtzs	v6.4s, v8.4s            // encoding: [0x06,0xb9,0xa1,0x4e]
+// CHECK:	fcvtzs	v6.2d, v8.2d            // encoding: [0x06,0xb9,0xe1,0x4e]
+// CHECK:	fcvtzs	v4.2s, v0.2s            // encoding: [0x04,0xb8,0xa1,0x0e]
+
+
+         fcvtzu v6.4s, v8.4s
+         fcvtzu v6.2d, v8.2d
+         fcvtzu v4.2s, v0.2s
+
+// CHECK:	fcvtzu	v6.4s, v8.4s            // encoding: [0x06,0xb9,0xa1,0x6e]
+// CHECK:	fcvtzu	v6.2d, v8.2d            // encoding: [0x06,0xb9,0xe1,0x6e]
+// CHECK:	fcvtzu	v4.2s, v0.2s            // encoding: [0x04,0xb8,0xa1,0x2e]
+
+         fcvtas v6.4s, v8.4s
+         fcvtas v6.2d, v8.2d
+         fcvtas v4.2s, v0.2s
+
+// CHECK:	fcvtas	v6.4s, v8.4s            // encoding: [0x06,0xc9,0x21,0x4e]
+// CHECK:	fcvtas	v6.2d, v8.2d            // encoding: [0x06,0xc9,0x61,0x4e]
+// CHECK:	fcvtas	v4.2s, v0.2s            // encoding: [0x04,0xc8,0x21,0x0e]
+
+         fcvtau v6.4s, v8.4s
+         fcvtau v6.2d, v8.2d
+         fcvtau v4.2s, v0.2s
+
+// CHECK:	fcvtau	v6.4s, v8.4s            // encoding: [0x06,0xc9,0x21,0x6e]
+// CHECK:	fcvtau	v6.2d, v8.2d            // encoding: [0x06,0xc9,0x61,0x6e]
+// CHECK:	fcvtau	v4.2s, v0.2s            // encoding: [0x04,0xc8,0x21,0x2e]
+
+         urecpe v6.4s, v8.4s
+         urecpe v4.2s, v0.2s
+
+// CHECK:	urecpe	v6.4s, v8.4s            // encoding: [0x06,0xc9,0xa1,0x4e]
+// CHECK:	urecpe	v4.2s, v0.2s            // encoding: [0x04,0xc8,0xa1,0x0e]
+
+         ursqrte v6.4s, v8.4s
+         ursqrte v4.2s, v0.2s
+
+// CHECK:	ursqrte	v6.4s, v8.4s            // encoding: [0x06,0xc9,0xa1,0x6e]
+// CHECK:	ursqrte	v4.2s, v0.2s            // encoding: [0x04,0xc8,0xa1,0x2e]
+
+         scvtf v6.4s, v8.4s
+         scvtf v6.2d, v8.2d
+         scvtf v4.2s, v0.2s
+
+// CHECK:	scvtf	v6.4s, v8.4s            // encoding: [0x06,0xd9,0x21,0x4e]
+// CHECK:	scvtf	v6.2d, v8.2d            // encoding: [0x06,0xd9,0x61,0x4e]
+// CHECK:	scvtf	v4.2s, v0.2s            // encoding: [0x04,0xd8,0x21,0x0e]
+
+         ucvtf v6.4s, v8.4s
+         ucvtf v6.2d, v8.2d
+         ucvtf v4.2s, v0.2s
+
+// CHECK:	ucvtf	v6.4s, v8.4s            // encoding: [0x06,0xd9,0x21,0x6e]
+// CHECK:	ucvtf	v6.2d, v8.2d            // encoding: [0x06,0xd9,0x61,0x6e]
+// CHECK:	ucvtf	v4.2s, v0.2s            // encoding: [0x04,0xd8,0x21,0x2e]
+
+         frecpe v6.4s, v8.4s
+         frecpe v6.2d, v8.2d
+         frecpe v4.2s, v0.2s
+
+// CHECK:	frecpe	v6.4s, v8.4s            // encoding: [0x06,0xd9,0xa1,0x4e]
+// CHECK:	frecpe	v6.2d, v8.2d            // encoding: [0x06,0xd9,0xe1,0x4e]
+// CHECK:	frecpe	v4.2s, v0.2s            // encoding: [0x04,0xd8,0xa1,0x0e]
+
+         frsqrte v6.4s, v8.4s
+         frsqrte v6.2d, v8.2d
+         frsqrte v4.2s, v0.2s
+
+// CHECK:	frsqrte	v6.4s, v8.4s            // encoding: [0x06,0xd9,0xa1,0x6e]
+// CHECK:	frsqrte	v6.2d, v8.2d            // encoding: [0x06,0xd9,0xe1,0x6e]
+// CHECK:	frsqrte	v4.2s, v0.2s            // encoding: [0x04,0xd8,0xa1,0x2e]
+
+         fsqrt v6.4s, v8.4s
+         fsqrt v6.2d, v8.2d
+         fsqrt v4.2s, v0.2s
+
+// CHECK:	fsqrt	v6.4s, v8.4s            // encoding: [0x06,0xf9,0xa1,0x6e]
+// CHECK:	fsqrt	v6.2d, v8.2d            // encoding: [0x06,0xf9,0xe1,0x6e]
+// CHECK:	fsqrt	v4.2s, v0.2s            // encoding: [0x04,0xf8,0xa1,0x2e]
+
+
diff --git a/test/MC/AArch64/neon-simd-post-ldst-multi-elem.s b/test/MC/AArch64/neon-simd-post-ldst-multi-elem.s
new file mode 100644
index 0000000..8dc271e
--- /dev/null
+++ b/test/MC/AArch64/neon-simd-post-ldst-multi-elem.s
@@ -0,0 +1,389 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Load multiple 1-element structures from one register (post-index)
+//------------------------------------------------------------------------------
+         ld1 {v0.16b}, [x0], x1
+         ld1 {v15.8h}, [x15], x2
+         ld1 {v31.4s}, [sp], #16
+         ld1 {v0.2d}, [x0], #16
+         ld1 {v0.8b}, [x0], x2
+         ld1 {v15.4h}, [x15], x3
+         ld1 {v31.2s}, [sp], #8
+         ld1 {v0.1d}, [x0], #8
+// CHECK: ld1 {v0.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x70,0xc1,0x4c]
+// CHECK: ld1 {v15.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x75,0xc2,0x4c]
+// CHECK: ld1 {v31.4s}, [sp], #16
+// CHECK:     // encoding: [0xff,0x7b,0xdf,0x4c]
+// CHECK: ld1 {v0.2d}, [x0], #16
+// CHECK:     // encoding: [0x00,0x7c,0xdf,0x4c]
+// CHECK: ld1 {v0.8b}, [x0], x2
+// CHECK:     // encoding: [0x00,0x70,0xc2,0x0c]
+// CHECK: ld1 {v15.4h}, [x15], x3
+// CHECK:     // encoding: [0xef,0x75,0xc3,0x0c]
+// CHECK: ld1 {v31.2s}, [sp], #8
+// CHECK:     // encoding: [0xff,0x7b,0xdf,0x0c]
+// CHECK: ld1 {v0.1d}, [x0], #8
+// CHECK:     // encoding: [0x00,0x7c,0xdf,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 1-element structures from two consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         ld1 {v0.16b, v1.16b}, [x0], x1
+         ld1 {v15.8h, v16.8h}, [x15], x2
+         ld1 {v31.4s, v0.4s}, [sp], #32
+         ld1 {v0.2d, v1.2d}, [x0], #32
+         ld1 {v0.8b, v1.8b}, [x0], x2
+         ld1 {v15.4h, v16.4h}, [x15], x3
+         ld1 {v31.2s, v0.2s}, [sp], #16
+         ld1 {v0.1d, v1.1d}, [x0], #16
+// CHECK: ld1 {v0.16b, v1.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0xa0,0xc1,0x4c]
+// CHECK: ld1 {v15.8h, v16.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0xa5,0xc2,0x4c]
+// CHECK: ld1 {v31.4s, v0.4s}, [sp], #32
+// CHECK:     // encoding: [0xff,0xab,0xdf,0x4c]
+// CHECK: ld1 {v0.2d, v1.2d}, [x0], #32
+// CHECK:     // encoding: [0x00,0xac,0xdf,0x4c]
+// CHECK: ld1 {v0.8b, v1.8b}, [x0], x2
+// CHECK:     // encoding: [0x00,0xa0,0xc2,0x0c]
+// CHECK: ld1 {v15.4h, v16.4h}, [x15], x3
+// CHECK:     // encoding: [0xef,0xa5,0xc3,0x0c]
+// CHECK: ld1 {v31.2s, v0.2s}, [sp], #16
+// CHECK:     // encoding: [0xff,0xab,0xdf,0x0c]
+// CHECK: ld1 {v0.1d, v1.1d}, [x0], #16
+// CHECK:     // encoding: [0x00,0xac,0xdf,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 1-element structures from three consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         ld1 {v0.16b, v1.16b, v2.16b}, [x0], x1
+         ld1 {v15.8h, v16.8h, v17.8h}, [x15], x2
+         ld1 {v31.4s, v0.4s, v1.4s}, [sp], #48
+         ld1 {v0.2d, v1.2d, v2.2d}, [x0], #48
+         ld1 {v0.8b, v1.8b, v2.8b}, [x0], x2
+         ld1 {v15.4h, v16.4h, v17.4h}, [x15], x3
+         ld1 {v31.2s, v0.2s, v1.2s}, [sp], #24
+         ld1 {v0.1d, v1.1d, v2.1d}, [x0], #24
+// CHECK: ld1 {v0.16b, v1.16b, v2.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x60,0xc1,0x4c]
+// CHECK: ld1 {v15.8h, v16.8h, v17.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x65,0xc2,0x4c]
+// CHECK: ld1 {v31.4s, v0.4s, v1.4s}, [sp], #48
+// CHECK:     // encoding: [0xff,0x6b,0xdf,0x4c]
+// CHECK: ld1 {v0.2d, v1.2d, v2.2d}, [x0], #48
+// CHECK:     // encoding: [0x00,0x6c,0xdf,0x4c]
+// CHECK: ld1 {v0.8b, v1.8b, v2.8b}, [x0], x2
+// CHECK:     // encoding: [0x00,0x60,0xc2,0x0c]
+// CHECK: ld1 {v15.4h, v16.4h, v17.4h}, [x15], x3
+// CHECK:     // encoding: [0xef,0x65,0xc3,0x0c]
+// CHECK: ld1 {v31.2s, v0.2s, v1.2s}, [sp], #24
+// CHECK:     // encoding: [0xff,0x6b,0xdf,0x0c]
+// CHECK: ld1 {v0.1d, v1.1d, v2.1d}, [x0], #24
+// CHECK:     // encoding: [0x00,0x6c,0xdf,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 1-element structures from four consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+         ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], x2
+         ld1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #64
+         ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
+         ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x3
+         ld1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x4
+         ld1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], #32
+         ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0], #32
+// CHECK: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x20,0xc1,0x4c]
+// CHECK: ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x25,0xc2,0x4c]
+// CHECK: ld1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #64
+// CHECK:     // encoding: [0xff,0x2b,0xdf,0x4c]
+// CHECK: ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
+// CHECK:     // encoding: [0x00,0x2c,0xdf,0x4c]
+// CHECK: ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x3
+// CHECK:     // encoding: [0x00,0x20,0xc3,0x0c]
+// CHECK: ld1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x4
+// CHECK:     // encoding: [0xef,0x25,0xc4,0x0c]
+// CHECK: ld1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], #32
+// CHECK:     // encoding: [0xff,0x2b,0xdf,0x0c]
+// CHECK: ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0], #32
+// CHECK:     // encoding: [0x00,0x2c,0xdf,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 2-element structures from two consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         ld2 {v0.16b, v1.16b}, [x0], x1
+         ld2 {v15.8h, v16.8h}, [x15], x2
+         ld2 {v31.4s, v0.4s}, [sp], #32
+         ld2 {v0.2d, v1.2d}, [x0], #32
+         ld2 {v0.8b, v1.8b}, [x0], x2
+         ld2 {v15.4h, v16.4h}, [x15], x3
+         ld2 {v31.2s, v0.2s}, [sp], #16
+// CHECK: ld2 {v0.16b, v1.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x80,0xc1,0x4c]
+// CHECK: ld2 {v15.8h, v16.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x85,0xc2,0x4c]
+// CHECK: ld2 {v31.4s, v0.4s}, [sp], #32
+// CHECK:     // encoding: [0xff,0x8b,0xdf,0x4c]
+// CHECK: ld2 {v0.2d, v1.2d}, [x0], #32
+// CHECK:     // encoding: [0x00,0x8c,0xdf,0x4c]
+// CHECK: ld2 {v0.8b, v1.8b}, [x0], x2
+// CHECK:     // encoding: [0x00,0x80,0xc2,0x0c]
+// CHECK: ld2 {v15.4h, v16.4h}, [x15], x3
+// CHECK:     // encoding: [0xef,0x85,0xc3,0x0c]
+// CHECK: ld2 {v31.2s, v0.2s}, [sp], #16
+// CHECK:     // encoding: [0xff,0x8b,0xdf,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 3-element structures from three consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         ld3 {v0.16b, v1.16b, v2.16b}, [x0], x1
+         ld3 {v15.8h, v16.8h, v17.8h}, [x15], x2
+         ld3 {v31.4s, v0.4s, v1.4s}, [sp], #48
+         ld3 {v0.2d, v1.2d, v2.2d}, [x0], #48
+         ld3 {v0.8b, v1.8b, v2.8b}, [x0], x2
+         ld3 {v15.4h, v16.4h, v17.4h}, [x15], x3
+         ld3 {v31.2s, v0.2s, v1.2s}, [sp], #24
+// CHECK: ld3 {v0.16b, v1.16b, v2.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x40,0xc1,0x4c]
+// CHECK: ld3 {v15.8h, v16.8h, v17.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x45,0xc2,0x4c]
+// CHECK: ld3 {v31.4s, v0.4s, v1.4s}, [sp], #48
+// CHECK:     // encoding: [0xff,0x4b,0xdf,0x4c]
+// CHECK: ld3 {v0.2d, v1.2d, v2.2d}, [x0], #48
+// CHECK:     // encoding: [0x00,0x4c,0xdf,0x4c]
+// CHECK: ld3 {v0.8b, v1.8b, v2.8b}, [x0], x2
+// CHECK:     // encoding: [0x00,0x40,0xc2,0x0c]
+// CHECK: ld3 {v15.4h, v16.4h, v17.4h}, [x15], x3
+// CHECK:     // encoding: [0xef,0x45,0xc3,0x0c]
+// CHECK: ld3 {v31.2s, v0.2s, v1.2s}, [sp], #24
+// CHECK:     // encoding: [0xff,0x4b,0xdf,0x0c]
+
+//------------------------------------------------------------------------------
+// Load multiple 4-element structures from four consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+         ld4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], x2
+         ld4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #64
+         ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
+         ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x3
+         ld4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x4
+         ld4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], #32
+// CHECK: ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x00,0xc1,0x4c]
+// CHECK: ld4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x05,0xc2,0x4c]
+// CHECK: ld4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #64
+// CHECK:     // encoding: [0xff,0x0b,0xdf,0x4c]
+// CHECK: ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
+// CHECK:     // encoding: [0x00,0x0c,0xdf,0x4c]
+// CHECK: ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x3
+// CHECK:     // encoding: [0x00,0x00,0xc3,0x0c]
+// CHECK: ld4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x4
+// CHECK:     // encoding: [0xef,0x05,0xc4,0x0c]
+// CHECK: ld4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], #32
+// CHECK:     // encoding: [0xff,0x0b,0xdf,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 1-element structures from one register (post-index)
+//------------------------------------------------------------------------------
+         st1 {v0.16b}, [x0], x1
+         st1 {v15.8h}, [x15], x2
+         st1 {v31.4s}, [sp], #16
+         st1 {v0.2d}, [x0], #16
+         st1 {v0.8b}, [x0], x2
+         st1 {v15.4h}, [x15], x3
+         st1 {v31.2s}, [sp], #8
+         st1 {v0.1d}, [x0], #8
+// CHECK: st1 {v0.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x70,0x81,0x4c]
+// CHECK: st1 {v15.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x75,0x82,0x4c]
+// CHECK: st1 {v31.4s}, [sp], #16
+// CHECK:     // encoding: [0xff,0x7b,0x9f,0x4c]
+// CHECK: st1 {v0.2d}, [x0], #16
+// CHECK:     // encoding: [0x00,0x7c,0x9f,0x4c]
+// CHECK: st1 {v0.8b}, [x0], x2
+// CHECK:     // encoding: [0x00,0x70,0x82,0x0c]
+// CHECK: st1 {v15.4h}, [x15], x3
+// CHECK:     // encoding: [0xef,0x75,0x83,0x0c]
+// CHECK: st1 {v31.2s}, [sp], #8
+// CHECK:     // encoding: [0xff,0x7b,0x9f,0x0c]
+// CHECK: st1 {v0.1d}, [x0], #8
+// CHECK:     // encoding: [0x00,0x7c,0x9f,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 1-element structures from two consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         st1 {v0.16b, v1.16b}, [x0], x1
+         st1 {v15.8h, v16.8h}, [x15], x2
+         st1 {v31.4s, v0.4s}, [sp], #32
+         st1 {v0.2d, v1.2d}, [x0], #32
+         st1 {v0.8b, v1.8b}, [x0], x2
+         st1 {v15.4h, v16.4h}, [x15], x3
+         st1 {v31.2s, v0.2s}, [sp], #16
+         st1 {v0.1d, v1.1d}, [x0], #16
+// CHECK: st1 {v0.16b, v1.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0xa0,0x81,0x4c]
+// CHECK: st1 {v15.8h, v16.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0xa5,0x82,0x4c]
+// CHECK: st1 {v31.4s, v0.4s}, [sp], #32
+// CHECK:     // encoding: [0xff,0xab,0x9f,0x4c]
+// CHECK: st1 {v0.2d, v1.2d}, [x0], #32
+// CHECK:     // encoding: [0x00,0xac,0x9f,0x4c]
+// CHECK: st1 {v0.8b, v1.8b}, [x0], x2
+// CHECK:     // encoding: [0x00,0xa0,0x82,0x0c]
+// CHECK: st1 {v15.4h, v16.4h}, [x15], x3
+// CHECK:     // encoding: [0xef,0xa5,0x83,0x0c]
+// CHECK: st1 {v31.2s, v0.2s}, [sp], #16
+// CHECK:     // encoding: [0xff,0xab,0x9f,0x0c]
+// CHECK: st1 {v0.1d, v1.1d}, [x0], #16
+// CHECK:     // encoding: [0x00,0xac,0x9f,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 1-element structures from three consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         st1 {v0.16b, v1.16b, v2.16b}, [x0], x1
+         st1 {v15.8h, v16.8h, v17.8h}, [x15], x2
+         st1 {v31.4s, v0.4s, v1.4s}, [sp], #48
+         st1 {v0.2d, v1.2d, v2.2d}, [x0], #48
+         st1 {v0.8b, v1.8b, v2.8b}, [x0], x2
+         st1 {v15.4h, v16.4h, v17.4h}, [x15], x3
+         st1 {v31.2s, v0.2s, v1.2s}, [sp], #24
+         st1 {v0.1d, v1.1d, v2.1d}, [x0], #24
+// CHECK: st1 {v0.16b, v1.16b, v2.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x60,0x81,0x4c]
+// CHECK: st1 {v15.8h, v16.8h, v17.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x65,0x82,0x4c]
+// CHECK: st1 {v31.4s, v0.4s, v1.4s}, [sp], #48
+// CHECK:     // encoding: [0xff,0x6b,0x9f,0x4c]
+// CHECK: st1 {v0.2d, v1.2d, v2.2d}, [x0], #48
+// CHECK:     // encoding: [0x00,0x6c,0x9f,0x4c]
+// CHECK: st1 {v0.8b, v1.8b, v2.8b}, [x0], x2
+// CHECK:     // encoding: [0x00,0x60,0x82,0x0c]
+// CHECK: st1 {v15.4h, v16.4h, v17.4h}, [x15], x3
+// CHECK:     // encoding: [0xef,0x65,0x83,0x0c]
+// CHECK: st1 {v31.2s, v0.2s, v1.2s}, [sp], #24
+// CHECK:     // encoding: [0xff,0x6b,0x9f,0x0c]
+// CHECK: st1 {v0.1d, v1.1d, v2.1d}, [x0], #24
+// CHECK:     // encoding: [0x00,0x6c,0x9f,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 1-element structures from four consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+         st1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], x2
+         st1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #64
+         st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
+         st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x3
+         st1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x4
+         st1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], #32
+         st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0], #32
+// CHECK: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x20,0x81,0x4c]
+// CHECK: st1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x25,0x82,0x4c]
+// CHECK: st1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #64
+// CHECK:     // encoding: [0xff,0x2b,0x9f,0x4c]
+// CHECK: st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
+// CHECK:     // encoding: [0x00,0x2c,0x9f,0x4c]
+// CHECK: st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x3
+// CHECK:     // encoding: [0x00,0x20,0x83,0x0c]
+// CHECK: st1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x4
+// CHECK:     // encoding: [0xef,0x25,0x84,0x0c]
+// CHECK: st1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], #32
+// CHECK:     // encoding: [0xff,0x2b,0x9f,0x0c]
+// CHECK: st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0], #32
+// CHECK:     // encoding: [0x00,0x2c,0x9f,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 2-element structures from two consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         st2 {v0.16b, v1.16b}, [x0], x1
+         st2 {v15.8h, v16.8h}, [x15], x2
+         st2 {v31.4s, v0.4s}, [sp], #32
+         st2 {v0.2d, v1.2d}, [x0], #32
+         st2 {v0.8b, v1.8b}, [x0], x2
+         st2 {v15.4h, v16.4h}, [x15], x3
+         st2 {v31.2s, v0.2s}, [sp], #16
+// CHECK: st2 {v0.16b, v1.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x80,0x81,0x4c]
+// CHECK: st2 {v15.8h, v16.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x85,0x82,0x4c]
+// CHECK: st2 {v31.4s, v0.4s}, [sp], #32
+// CHECK:     // encoding: [0xff,0x8b,0x9f,0x4c]
+// CHECK: st2 {v0.2d, v1.2d}, [x0], #32
+// CHECK:     // encoding: [0x00,0x8c,0x9f,0x4c]
+// CHECK: st2 {v0.8b, v1.8b}, [x0], x2
+// CHECK:     // encoding: [0x00,0x80,0x82,0x0c]
+// CHECK: st2 {v15.4h, v16.4h}, [x15], x3
+// CHECK:     // encoding: [0xef,0x85,0x83,0x0c]
+// CHECK: st2 {v31.2s, v0.2s}, [sp], #16
+// CHECK:     // encoding: [0xff,0x8b,0x9f,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 3-element structures from three consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         st3 {v0.16b, v1.16b, v2.16b}, [x0], x1
+         st3 {v15.8h, v16.8h, v17.8h}, [x15], x2
+         st3 {v31.4s, v0.4s, v1.4s}, [sp], #48
+         st3 {v0.2d, v1.2d, v2.2d}, [x0], #48
+         st3 {v0.8b, v1.8b, v2.8b}, [x0], x2
+         st3 {v15.4h, v16.4h, v17.4h}, [x15], x3
+         st3 {v31.2s, v0.2s, v1.2s}, [sp], #24
+// CHECK: st3 {v0.16b, v1.16b, v2.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x40,0x81,0x4c]
+// CHECK: st3 {v15.8h, v16.8h, v17.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x45,0x82,0x4c]
+// CHECK: st3 {v31.4s, v0.4s, v1.4s}, [sp], #48
+// CHECK:     // encoding: [0xff,0x4b,0x9f,0x4c]
+// CHECK: st3 {v0.2d, v1.2d, v2.2d}, [x0], #48
+// CHECK:     // encoding: [0x00,0x4c,0x9f,0x4c]
+// CHECK: st3 {v0.8b, v1.8b, v2.8b}, [x0], x2
+// CHECK:     // encoding: [0x00,0x40,0x82,0x0c]
+// CHECK: st3 {v15.4h, v16.4h, v17.4h}, [x15], x3
+// CHECK:     // encoding: [0xef,0x45,0x83,0x0c]
+// CHECK: st3 {v31.2s, v0.2s, v1.2s}, [sp], #24
+// CHECK:     // encoding: [0xff,0x4b,0x9f,0x0c]
+
+//------------------------------------------------------------------------------
+// Store multiple 4-element structures from four consecutive registers
+// (post-index)
+//------------------------------------------------------------------------------
+         st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+         st4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], x2
+         st4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #64
+         st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
+         st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x3
+         st4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x4
+         st4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], #32
+// CHECK: st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+// CHECK:     // encoding: [0x00,0x00,0x81,0x4c]
+// CHECK: st4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], x2
+// CHECK:     // encoding: [0xef,0x05,0x82,0x4c]
+// CHECK: st4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #64
+// CHECK:     // encoding: [0xff,0x0b,0x9f,0x4c]
+// CHECK: st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
+// CHECK:     // encoding: [0x00,0x0c,0x9f,0x4c]
+// CHECK: st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x3
+// CHECK:     // encoding: [0x00,0x00,0x83,0x0c]
+// CHECK: st4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x4
+// CHECK:     // encoding: [0xef,0x05,0x84,0x0c]
+// CHECK: st4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], #32
+// CHECK:     // encoding: [0xff,0x0b,0x9f,0x0c]
diff --git a/test/MC/AArch64/neon-simd-shift.s b/test/MC/AArch64/neon-simd-shift.s
new file mode 100644
index 0000000..a164323
--- /dev/null
+++ b/test/MC/AArch64/neon-simd-shift.s
@@ -0,0 +1,434 @@
+// RUN: llvm-mc -triple=aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Vector shift right by immediate
+//------------------------------------------------------------------------------
+         sshr v0.8b, v1.8b, #3
+         sshr v0.4h, v1.4h, #3
+         sshr v0.2s, v1.2s, #3
+         sshr v0.16b, v1.16b, #3
+         sshr v0.8h, v1.8h, #3
+         sshr v0.4s, v1.4s, #3
+         sshr v0.2d, v1.2d, #3
+// CHECK:	sshr	v0.8b, v1.8b, #3        // encoding: [0x20,0x04,0x0d,0x0f]
+// CHECK:	sshr	v0.4h, v1.4h, #3        // encoding: [0x20,0x04,0x1d,0x0f]
+// CHECK:	sshr	v0.2s, v1.2s, #3        // encoding: [0x20,0x04,0x3d,0x0f]
+// CHECK:	sshr	v0.16b, v1.16b, #3      // encoding: [0x20,0x04,0x0d,0x4f]
+// CHECK:	sshr	v0.8h, v1.8h, #3        // encoding: [0x20,0x04,0x1d,0x4f]
+// CHECK:	sshr	v0.4s, v1.4s, #3        // encoding: [0x20,0x04,0x3d,0x4f]
+// CHECK:	sshr	v0.2d, v1.2d, #3        // encoding: [0x20,0x04,0x7d,0x4f]
+
+//------------------------------------------------------------------------------
+// Vector  shift right by immediate
+//------------------------------------------------------------------------------
+         ushr v0.8b, v1.8b, #3
+         ushr v0.4h, v1.4h, #3
+         ushr v0.2s, v1.2s, #3
+         ushr v0.16b, v1.16b, #3
+         ushr v0.8h, v1.8h, #3
+         ushr v0.4s, v1.4s, #3
+         ushr v0.2d, v1.2d, #3
+
+// CHECK: 	ushr	v0.8b, v1.8b, #3        // encoding: [0x20,0x04,0x0d,0x2f]
+// CHECK: 	ushr	v0.4h, v1.4h, #3        // encoding: [0x20,0x04,0x1d,0x2f]
+// CHECK:	ushr	v0.2s, v1.2s, #3        // encoding: [0x20,0x04,0x3d,0x2f]
+// CHECK:	ushr	v0.16b, v1.16b, #3      // encoding: [0x20,0x04,0x0d,0x6f]
+// CHECK:	ushr	v0.8h, v1.8h, #3        // encoding: [0x20,0x04,0x1d,0x6f]
+// CHECK:	ushr	v0.4s, v1.4s, #3        // encoding: [0x20,0x04,0x3d,0x6f]
+// CHECK:	ushr	v0.2d, v1.2d, #3        // encoding: [0x20,0x04,0x7d,0x6f]
+
+//------------------------------------------------------------------------------
+// Vector shift right and accumulate by immediate
+//------------------------------------------------------------------------------
+         ssra v0.8b, v1.8b, #3
+         ssra v0.4h, v1.4h, #3
+         ssra v0.2s, v1.2s, #3
+         ssra v0.16b, v1.16b, #3
+         ssra v0.8h, v1.8h, #3
+         ssra v0.4s, v1.4s, #3
+         ssra v0.2d, v1.2d, #3
+
+// CHECK:	ssra	v0.8b, v1.8b, #3        // encoding: [0x20,0x14,0x0d,0x0f]
+// CHECK:	ssra	v0.4h, v1.4h, #3        // encoding: [0x20,0x14,0x1d,0x0f]
+// CHECK:	ssra	v0.2s, v1.2s, #3        // encoding: [0x20,0x14,0x3d,0x0f]
+// CHECK:	ssra	v0.16b, v1.16b, #3      // encoding: [0x20,0x14,0x0d,0x4f]
+// CHECK:	ssra	v0.8h, v1.8h, #3        // encoding: [0x20,0x14,0x1d,0x4f]
+// CHECK:	ssra	v0.4s, v1.4s, #3        // encoding: [0x20,0x14,0x3d,0x4f]
+// CHECK:	ssra	v0.2d, v1.2d, #3        // encoding: [0x20,0x14,0x7d,0x4f]
+
+//------------------------------------------------------------------------------
+// Vector  shift right and accumulate by immediate
+//------------------------------------------------------------------------------
+         usra v0.8b, v1.8b, #3
+         usra v0.4h, v1.4h, #3
+         usra v0.2s, v1.2s, #3
+         usra v0.16b, v1.16b, #3
+         usra v0.8h, v1.8h, #3
+         usra v0.4s, v1.4s, #3
+         usra v0.2d, v1.2d, #3
+
+// CHECK:	usra	v0.8b, v1.8b, #3        // encoding: [0x20,0x14,0x0d,0x2f]
+// CHECK:	usra	v0.4h, v1.4h, #3        // encoding: [0x20,0x14,0x1d,0x2f]
+// CHECK:	usra	v0.2s, v1.2s, #3        // encoding: [0x20,0x14,0x3d,0x2f]
+// CHECK:	usra	v0.16b, v1.16b, #3      // encoding: [0x20,0x14,0x0d,0x6f]
+// CHECK:	usra	v0.8h, v1.8h, #3        // encoding: [0x20,0x14,0x1d,0x6f]
+// CHECK:	usra	v0.4s, v1.4s, #3        // encoding: [0x20,0x14,0x3d,0x6f]
+// CHECK:	usra	v0.2d, v1.2d, #3        // encoding: [0x20,0x14,0x7d,0x6f]
+
+//------------------------------------------------------------------------------
+// Vector rounding shift right by immediate
+//------------------------------------------------------------------------------
+         srshr v0.8b, v1.8b, #3
+         srshr v0.4h, v1.4h, #3
+         srshr v0.2s, v1.2s, #3
+         srshr v0.16b, v1.16b, #3
+         srshr v0.8h, v1.8h, #3
+         srshr v0.4s, v1.4s, #3
+         srshr v0.2d, v1.2d, #3
+
+// CHECK:	srshr	v0.8b, v1.8b, #3        // encoding: [0x20,0x24,0x0d,0x0f]
+// CHECK:	srshr	v0.4h, v1.4h, #3        // encoding: [0x20,0x24,0x1d,0x0f]
+// CHECK:	srshr	v0.2s, v1.2s, #3        // encoding: [0x20,0x24,0x3d,0x0f]
+// CHECK:	srshr	v0.16b, v1.16b, #3      // encoding: [0x20,0x24,0x0d,0x4f]
+// CHECK:	srshr	v0.8h, v1.8h, #3        // encoding: [0x20,0x24,0x1d,0x4f]
+// CHECK:	srshr	v0.4s, v1.4s, #3        // encoding: [0x20,0x24,0x3d,0x4f]
+// CHECK:	srshr	v0.2d, v1.2d, #3        // encoding: [0x20,0x24,0x7d,0x4f]
+
+
+//------------------------------------------------------------------------------
+// Vecotr rounding shift right by immediate
+//------------------------------------------------------------------------------
+         urshr v0.8b, v1.8b, #3
+         urshr v0.4h, v1.4h, #3
+         urshr v0.2s, v1.2s, #3
+         urshr v0.16b, v1.16b, #3
+         urshr v0.8h, v1.8h, #3
+         urshr v0.4s, v1.4s, #3
+         urshr v0.2d, v1.2d, #3
+
+// CHECK:	urshr	v0.8b, v1.8b, #3        // encoding: [0x20,0x24,0x0d,0x2f]
+// CHECK:	urshr	v0.4h, v1.4h, #3        // encoding: [0x20,0x24,0x1d,0x2f]
+// CHECK:	urshr	v0.2s, v1.2s, #3        // encoding: [0x20,0x24,0x3d,0x2f]
+// CHECK:	urshr	v0.16b, v1.16b, #3      // encoding: [0x20,0x24,0x0d,0x6f]
+// CHECK:	urshr	v0.8h, v1.8h, #3        // encoding: [0x20,0x24,0x1d,0x6f]
+// CHECK:	urshr	v0.4s, v1.4s, #3        // encoding: [0x20,0x24,0x3d,0x6f]
+// CHECK:	urshr	v0.2d, v1.2d, #3        // encoding: [0x20,0x24,0x7d,0x6f]
+
+
+//------------------------------------------------------------------------------
+// Vector rounding shift right and accumulate by immediate
+//------------------------------------------------------------------------------
+         srsra v0.8b, v1.8b, #3
+         srsra v0.4h, v1.4h, #3
+         srsra v0.2s, v1.2s, #3
+         srsra v0.16b, v1.16b, #3
+         srsra v0.8h, v1.8h, #3
+         srsra v0.4s, v1.4s, #3
+         srsra v0.2d, v1.2d, #3
+
+// CHECK:	srsra	v0.8b, v1.8b, #3        // encoding: [0x20,0x34,0x0d,0x0f]
+// CHECK:	srsra	v0.4h, v1.4h, #3        // encoding: [0x20,0x34,0x1d,0x0f]
+// CHECK:	srsra	v0.2s, v1.2s, #3        // encoding: [0x20,0x34,0x3d,0x0f]
+// CHECK:	srsra	v0.16b, v1.16b, #3      // encoding: [0x20,0x34,0x0d,0x4f]
+// CHECK:	srsra	v0.8h, v1.8h, #3        // encoding: [0x20,0x34,0x1d,0x4f]
+// CHECK:	srsra	v0.4s, v1.4s, #3        // encoding: [0x20,0x34,0x3d,0x4f]
+// CHECK:	srsra	v0.2d, v1.2d, #3        // encoding: [0x20,0x34,0x7d,0x4f]
+
+
+//------------------------------------------------------------------------------
+// Vector rounding shift right and accumulate by immediate
+//------------------------------------------------------------------------------
+         ursra v0.8b, v1.8b, #3
+         ursra v0.4h, v1.4h, #3
+         ursra v0.2s, v1.2s, #3
+         ursra v0.16b, v1.16b, #3
+         ursra v0.8h, v1.8h, #3
+         ursra v0.4s, v1.4s, #3
+         ursra v0.2d, v1.2d, #3
+
+// CHECK:	ursra	v0.8b, v1.8b, #3        // encoding: [0x20,0x34,0x0d,0x2f]
+// CHECK:	ursra	v0.4h, v1.4h, #3        // encoding: [0x20,0x34,0x1d,0x2f]
+// CHECK:	ursra	v0.2s, v1.2s, #3        // encoding: [0x20,0x34,0x3d,0x2f]
+// CHECK:	ursra	v0.16b, v1.16b, #3      // encoding: [0x20,0x34,0x0d,0x6f]
+// CHECK:	ursra	v0.8h, v1.8h, #3        // encoding: [0x20,0x34,0x1d,0x6f]
+// CHECK:	ursra	v0.4s, v1.4s, #3        // encoding: [0x20,0x34,0x3d,0x6f]
+// CHECK:	ursra	v0.2d, v1.2d, #3        // encoding: [0x20,0x34,0x7d,0x6f]
+
+
+//------------------------------------------------------------------------------
+// Vector shift right and insert by immediate
+//------------------------------------------------------------------------------
+         sri v0.8b, v1.8b, #3
+         sri v0.4h, v1.4h, #3
+         sri v0.2s, v1.2s, #3
+         sri v0.16b, v1.16b, #3
+         sri v0.8h, v1.8h, #3
+         sri v0.4s, v1.4s, #3
+         sri v0.2d, v1.2d, #3
+
+// CHECK:	sri	v0.8b, v1.8b, #3        // encoding: [0x20,0x44,0x0d,0x2f]
+// CHECK:	sri	v0.4h, v1.4h, #3        // encoding: [0x20,0x44,0x1d,0x2f]
+// CHECK:	sri	v0.2s, v1.2s, #3        // encoding: [0x20,0x44,0x3d,0x2f]
+// CHECK:	sri	v0.16b, v1.16b, #3      // encoding: [0x20,0x44,0x0d,0x6f]
+// CHECK:	sri	v0.8h, v1.8h, #3        // encoding: [0x20,0x44,0x1d,0x6f]
+// CHECK:	sri	v0.4s, v1.4s, #3        // encoding: [0x20,0x44,0x3d,0x6f]
+
+
+//------------------------------------------------------------------------------
+// Vector shift left and insert by immediate
+//------------------------------------------------------------------------------
+         sli v0.8b, v1.8b, #3
+         sli v0.4h, v1.4h, #3
+         sli v0.2s, v1.2s, #3
+         sli v0.16b, v1.16b, #3
+         sli v0.8h, v1.8h, #3
+         sli v0.4s, v1.4s, #3
+         sli v0.2d, v1.2d, #3
+
+// CHECK:	sli	v0.8b, v1.8b, #3        // encoding: [0x20,0x54,0x0b,0x2f]
+// CHECK:	sli	v0.4h, v1.4h, #3        // encoding: [0x20,0x54,0x13,0x2f]
+// CHECK:	sli	v0.2s, v1.2s, #3        // encoding: [0x20,0x54,0x23,0x2f]
+// CHECK:	sli	v0.16b, v1.16b, #3      // encoding: [0x20,0x54,0x0b,0x6f]
+// CHECK:	sli	v0.8h, v1.8h, #3        // encoding: [0x20,0x54,0x13,0x6f]
+// CHECK:	sli	v0.4s, v1.4s, #3        // encoding: [0x20,0x54,0x23,0x6f]
+// CHECK:	sli	v0.2d, v1.2d, #3        // encoding: [0x20,0x54,0x43,0x6f]
+
+//------------------------------------------------------------------------------
+// Vector saturating shift left unsigned by immediate
+//------------------------------------------------------------------------------
+         sqshlu v0.8b, v1.8b, #3
+         sqshlu v0.4h, v1.4h, #3
+         sqshlu v0.2s, v1.2s, #3
+         sqshlu v0.16b, v1.16b, #3
+         sqshlu v0.8h, v1.8h, #3
+         sqshlu v0.4s, v1.4s, #3
+         sqshlu v0.2d, v1.2d, #3
+
+// CHECK:	sqshlu	v0.8b, v1.8b, #3        // encoding: [0x20,0x64,0x0b,0x2f]
+// CHECK:	sqshlu	v0.4h, v1.4h, #3        // encoding: [0x20,0x64,0x13,0x2f]
+// CHECK:	sqshlu	v0.2s, v1.2s, #3        // encoding: [0x20,0x64,0x23,0x2f]
+// CHECK:	sqshlu	v0.16b, v1.16b, #3      // encoding: [0x20,0x64,0x0b,0x6f]
+// CHECK:	sqshlu	v0.8h, v1.8h, #3        // encoding: [0x20,0x64,0x13,0x6f]
+// CHECK:	sqshlu	v0.4s, v1.4s, #3        // encoding: [0x20,0x64,0x23,0x6f]
+// CHECK:	sqshlu	v0.2d, v1.2d, #3        // encoding: [0x20,0x64,0x43,0x6f]
+
+
+//------------------------------------------------------------------------------
+// Vector saturating shift left by immediate
+//------------------------------------------------------------------------------
+         sqshl v0.8b, v1.8b, #3
+         sqshl v0.4h, v1.4h, #3
+         sqshl v0.2s, v1.2s, #3
+         sqshl v0.16b, v1.16b, #3
+         sqshl v0.8h, v1.8h, #3
+         sqshl v0.4s, v1.4s, #3
+         sqshl v0.2d, v1.2d, #3
+
+// CHECK:	sqshl	v0.8b, v1.8b, #3        // encoding: [0x20,0x74,0x0b,0x0f]
+// CHECK:	sqshl	v0.4h, v1.4h, #3        // encoding: [0x20,0x74,0x13,0x0f]
+// CHECK:	sqshl	v0.2s, v1.2s, #3        // encoding: [0x20,0x74,0x23,0x0f]
+// CHECK:	sqshl	v0.16b, v1.16b, #3      // encoding: [0x20,0x74,0x0b,0x4f]
+// CHECK:	sqshl	v0.8h, v1.8h, #3        // encoding: [0x20,0x74,0x13,0x4f]
+// CHECK:	sqshl	v0.4s, v1.4s, #3        // encoding: [0x20,0x74,0x23,0x4f]
+// CHECK:	sqshl	v0.2d, v1.2d, #3        // encoding: [0x20,0x74,0x43,0x4f]
+
+
+
+//------------------------------------------------------------------------------
+// Vector saturating shift left by immediate
+//------------------------------------------------------------------------------
+         uqshl v0.8b, v1.8b, #3
+         uqshl v0.4h, v1.4h, #3
+         uqshl v0.2s, v1.2s, #3
+         uqshl v0.16b, v1.16b, #3
+         uqshl v0.8h, v1.8h, #3
+         uqshl v0.4s, v1.4s, #3
+         uqshl v0.2d, v1.2d, #3
+
+// CHECK:	uqshl	v0.8b, v1.8b, #3        // encoding: [0x20,0x74,0x0b,0x2f]
+// CHECK:	uqshl	v0.4h, v1.4h, #3        // encoding: [0x20,0x74,0x13,0x2f]
+// CHECK:	uqshl	v0.2s, v1.2s, #3        // encoding: [0x20,0x74,0x23,0x2f]
+// CHECK:	uqshl	v0.16b, v1.16b, #3      // encoding: [0x20,0x74,0x0b,0x6f]
+// CHECK:	uqshl	v0.8h, v1.8h, #3        // encoding: [0x20,0x74,0x13,0x6f]
+// CHECK:	uqshl	v0.4s, v1.4s, #3        // encoding: [0x20,0x74,0x23,0x6f]
+// CHECK:	uqshl	v0.2d, v1.2d, #3        // encoding: [0x20,0x74,0x43,0x6f]
+
+
+//------------------------------------------------------------------------------
+// Vector shift right narrow by immediate
+//------------------------------------------------------------------------------
+         shrn v0.8b, v1.8h, #3
+         shrn v0.4h, v1.4s, #3
+         shrn v0.2s, v1.2d, #3
+         shrn2 v0.16b, v1.8h, #3
+         shrn2 v0.8h, v1.4s, #3
+         shrn2 v0.4s, v1.2d, #3
+
+// CHECK:	shrn	v0.8b, v1.8h, #3        // encoding: [0x20,0x84,0x0d,0x0f]
+// CHECK:	shrn	v0.4h, v1.4s, #3        // encoding: [0x20,0x84,0x1d,0x0f]
+// CHECK:	shrn	v0.2s, v1.2d, #3        // encoding: [0x20,0x84,0x3d,0x0f]
+// CHECK:	shrn2	v0.16b, v1.8h, #3       // encoding: [0x20,0x84,0x0d,0x4f]
+// CHECK:	shrn2	v0.8h, v1.4s, #3        // encoding: [0x20,0x84,0x1d,0x4f]
+// CHECK:	shrn2	v0.4s, v1.2d, #3        // encoding: [0x20,0x84,0x3d,0x4f]
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right unsigned narrow by immediate
+//------------------------------------------------------------------------------
+         sqshrun v0.8b, v1.8h, #3
+         sqshrun v0.4h, v1.4s, #3
+         sqshrun v0.2s, v1.2d, #3
+         sqshrun2 v0.16b, v1.8h, #3
+         sqshrun2 v0.8h, v1.4s, #3
+         sqshrun2 v0.4s, v1.2d, #3
+
+// CHECK:	sqshrun	v0.8b, v1.8h, #3        // encoding: [0x20,0x84,0x0d,0x2f]
+// CHECK:	sqshrun	v0.4h, v1.4s, #3        // encoding: [0x20,0x84,0x1d,0x2f]
+// CHECK:	sqshrun	v0.2s, v1.2d, #3        // encoding: [0x20,0x84,0x3d,0x2f]
+// CHECK:	sqshrun2	v0.16b, v1.8h, #3 	// encoding: [0x20,0x84,0x0d,0x6f]
+// CHECK:	sqshrun2	v0.8h, v1.4s, #3 	// encoding: [0x20,0x84,0x1d,0x6f]
+// CHECK:	sqshrun2	v0.4s, v1.2d, #3 	// encoding: [0x20,0x84,0x3d,0x6f]
+
+//------------------------------------------------------------------------------
+// Vector rounding shift right narrow by immediate
+//------------------------------------------------------------------------------
+         rshrn v0.8b, v1.8h, #3
+         rshrn v0.4h, v1.4s, #3
+         rshrn v0.2s, v1.2d, #3
+         rshrn2 v0.16b, v1.8h, #3
+         rshrn2 v0.8h, v1.4s, #3
+         rshrn2 v0.4s, v1.2d, #3
+
+// CHECK:	rshrn	v0.8b, v1.8h, #3        // encoding: [0x20,0x8c,0x0d,0x0f]
+// CHECK:	rshrn	v0.4h, v1.4s, #3        // encoding: [0x20,0x8c,0x1d,0x0f]
+// CHECK:	rshrn	v0.2s, v1.2d, #3        // encoding: [0x20,0x8c,0x3d,0x0f]
+// CHECK:	rshrn2	v0.16b, v1.8h, #3       // encoding: [0x20,0x8c,0x0d,0x4f]
+// CHECK:	rshrn2	v0.8h, v1.4s, #3        // encoding: [0x20,0x8c,0x1d,0x4f]
+// CHECK:	rshrn2	v0.4s, v1.2d, #3        // encoding: [0x20,0x8c,0x3d,0x4f]
+
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right rounded unsigned narrow by immediate
+//------------------------------------------------------------------------------
+         sqrshrun v0.8b, v1.8h, #3
+         sqrshrun v0.4h, v1.4s, #3
+         sqrshrun v0.2s, v1.2d, #3
+         sqrshrun2 v0.16b, v1.8h, #3
+         sqrshrun2 v0.8h, v1.4s, #3
+         sqrshrun2 v0.4s, v1.2d, #3
+
+// CHECK:	sqrshrun	v0.8b, v1.8h, #3    // encoding: [0x20,0x8c,0x0d,0x2f]
+// CHECK:	sqrshrun	v0.4h, v1.4s, #3    // encoding: [0x20,0x8c,0x1d,0x2f]
+// CHECK:	sqrshrun	v0.2s, v1.2d, #3    // encoding: [0x20,0x8c,0x3d,0x2f]
+// CHECK:	sqrshrun2	v0.16b, v1.8h, #3   // encoding: [0x20,0x8c,0x0d,0x6f]
+// CHECK:	sqrshrun2	v0.8h, v1.4s, #3    // encoding: [0x20,0x8c,0x1d,0x6f]
+// CHECK:	sqrshrun2	v0.4s, v1.2d, #3    // encoding: [0x20,0x8c,0x3d,0x6f]
+
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right narrow by immediate
+//------------------------------------------------------------------------------
+         sqshrn v0.8b, v1.8h, #3
+         sqshrn v0.4h, v1.4s, #3
+         sqshrn v0.2s, v1.2d, #3
+         sqshrn2 v0.16b, v1.8h, #3
+         sqshrn2 v0.8h, v1.4s, #3
+         sqshrn2 v0.4s, v1.2d, #3
+
+// CHECK:	sqshrn	v0.8b, v1.8h, #3        // encoding: [0x20,0x94,0x0d,0x0f]
+// CHECK:	sqshrn	v0.4h, v1.4s, #3        // encoding: [0x20,0x94,0x1d,0x0f]
+// CHECK:	sqshrn	v0.2s, v1.2d, #3        // encoding: [0x20,0x94,0x3d,0x0f]
+// CHECK:	sqshrn2	v0.16b, v1.8h, #3       // encoding: [0x20,0x94,0x0d,0x4f]
+// CHECK:	sqshrn2	v0.8h, v1.4s, #3        // encoding: [0x20,0x94,0x1d,0x4f]
+// CHECK:	sqshrn2	v0.4s, v1.2d, #3        // encoding: [0x20,0x94,0x3d,0x4f]
+
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right narrow by immediate
+//------------------------------------------------------------------------------
+         uqshrn v0.8b, v1.8h, #3
+         uqshrn v0.4h, v1.4s, #3
+         uqshrn v0.2s, v1.2d, #3
+         uqshrn2 v0.16b, v1.8h, #3
+         uqshrn2 v0.8h, v1.4s, #3
+         uqshrn2 v0.4s, v1.2d, #3
+
+// CHECK:	uqshrn	v0.8b, v1.8h, #3        // encoding: [0x20,0x94,0x0d,0x2f]
+// CHECK:	uqshrn	v0.4h, v1.4s, #3        // encoding: [0x20,0x94,0x1d,0x2f]
+// CHECK:	uqshrn	v0.2s, v1.2d, #3        // encoding: [0x20,0x94,0x3d,0x2f]
+// CHECK:	uqshrn2	v0.16b, v1.8h, #3       // encoding: [0x20,0x94,0x0d,0x6f]
+// CHECK:	uqshrn2	v0.8h, v1.4s, #3        // encoding: [0x20,0x94,0x1d,0x6f]
+// CHECK:	uqshrn2	v0.4s, v1.2d, #3        // encoding: [0x20,0x94,0x3d,0x6f]
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right rounded narrow by immediate
+//------------------------------------------------------------------------------
+         sqrshrn v0.8b, v1.8h, #3
+         sqrshrn v0.4h, v1.4s, #3
+         sqrshrn v0.2s, v1.2d, #3
+         sqrshrn2 v0.16b, v1.8h, #3
+         sqrshrn2 v0.8h, v1.4s, #3
+         sqrshrn2 v0.4s, v1.2d, #3
+
+// CHECK:	sqrshrn	v0.8b, v1.8h, #3        // encoding: [0x20,0x9c,0x0d,0x0f]
+// CHECK:	sqrshrn	v0.4h, v1.4s, #3        // encoding: [0x20,0x9c,0x1d,0x0f]
+// CHECK:	sqrshrn	v0.2s, v1.2d, #3        // encoding: [0x20,0x9c,0x3d,0x0f]
+// CHECK:	sqrshrn2	v0.16b, v1.8h, #3   // encoding: [0x20,0x9c,0x0d,0x4f]
+// CHECK:	sqrshrn2	v0.8h, v1.4s, #3    // encoding: [0x20,0x9c,0x1d,0x4f]
+// CHECK:	sqrshrn2	v0.4s, v1.2d, #3    // encoding: [0x20,0x9c,0x3d,0x4f]
+
+
+//------------------------------------------------------------------------------
+// Vector saturating shift right rounded narrow by immediate
+//------------------------------------------------------------------------------
+         uqrshrn v0.8b, v1.8h, #3
+         uqrshrn v0.4h, v1.4s, #3
+         uqrshrn v0.2s, v1.2d, #3
+         uqrshrn2 v0.16b, v1.8h, #3
+         uqrshrn2 v0.8h, v1.4s, #3
+         uqrshrn2 v0.4s, v1.2d, #3
+
+// CHECK:	uqrshrn	v0.8b, v1.8h, #3        // encoding: [0x20,0x9c,0x0d,0x2f]
+// CHECK:	uqrshrn	v0.4h, v1.4s, #3        // encoding: [0x20,0x9c,0x1d,0x2f]
+// CHECK:	uqrshrn	v0.2s, v1.2d, #3        // encoding: [0x20,0x9c,0x3d,0x2f]
+// CHECK:	uqrshrn2	v0.16b, v1.8h, #3   // encoding: [0x20,0x9c,0x0d,0x6f]
+// CHECK:	uqrshrn2	v0.8h, v1.4s, #3    // encoding: [0x20,0x9c,0x1d,0x6f]
+// CHECK:	uqrshrn2	v0.4s, v1.2d, #3    // encoding: [0x20,0x9c,0x3d,0x6f]
+
+
+//------------------------------------------------------------------------------
+// Fixed-point convert to floating-point
+//------------------------------------------------------------------------------
+         scvtf v0.2s, v1.2s, #3
+         scvtf v0.4s, v1.4s, #3
+         scvtf v0.2d, v1.2d, #3
+         ucvtf v0.2s, v1.2s, #3
+         ucvtf v0.4s, v1.4s, #3
+         ucvtf v0.2d, v1.2d, #3
+
+// CHECK:	scvtf	v0.2s, v1.2s, #3        // encoding: [0x20,0xe4,0x3d,0x0f]
+// CHECK:	scvtf	v0.4s, v1.4s, #3        // encoding: [0x20,0xe4,0x3d,0x4f]
+// CHECK:	scvtf	v0.2d, v1.2d, #3        // encoding: [0x20,0xe4,0x7d,0x4f]
+// CHECK:	ucvtf	v0.2s, v1.2s, #3        // encoding: [0x20,0xe4,0x3d,0x2f]
+// CHECK:	ucvtf	v0.4s, v1.4s, #3        // encoding: [0x20,0xe4,0x3d,0x6f]
+// CHECK:	ucvtf	v0.2d, v1.2d, #3        // encoding: [0x20,0xe4,0x7d,0x6f]
+
+//------------------------------------------------------------------------------
+// Floating-point convert to fixed-point
+//------------------------------------------------------------------------------
+         fcvtzs v0.2s, v1.2s, #3
+         fcvtzs v0.4s, v1.4s, #3
+         fcvtzs v0.2d, v1.2d, #3
+         fcvtzu v0.2s, v1.2s, #3
+         fcvtzu v0.4s, v1.4s, #3
+         fcvtzu v0.2d, v1.2d, #3
+
+
+// CHECK:	fcvtzs	v0.2s, v1.2s, #3        // encoding: [0x20,0xfc,0x3d,0x0f]
+// CHECK:	fcvtzs	v0.4s, v1.4s, #3        // encoding: [0x20,0xfc,0x3d,0x4f]
+// CHECK:	fcvtzs	v0.2d, v1.2d, #3        // encoding: [0x20,0xfc,0x7d,0x4f]
+// CHECK:	fcvtzu	v0.2s, v1.2s, #3        // encoding: [0x20,0xfc,0x3d,0x2f]
+// CHECK:	fcvtzu	v0.4s, v1.4s, #3        // encoding: [0x20,0xfc,0x3d,0x6f]
+// CHECK:	fcvtzu	v0.2d, v1.2d, #3        // encoding: [0x20,0xfc,0x7d,0x6f]
+
diff --git a/test/MC/AArch64/neon-tbl.s b/test/MC/AArch64/neon-tbl.s
new file mode 100644
index 0000000..ff3e86b
--- /dev/null
+++ b/test/MC/AArch64/neon-tbl.s
@@ -0,0 +1,56 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Instructions across vector registers
+//------------------------------------------------------------------------------
+
+        tbl v0.8b, {v1.16b}, v2.8b
+        tbl v0.8b, {v1.16b, v2.16b}, v2.8b
+        tbl v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b
+        tbl v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.8b
+        tbl v0.8b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.8b
+
+// CHECK: tbl	v0.8b, {v1.16b}, v2.8b  // encoding: [0x20,0x00,0x02,0x0e]
+// CHECK: tbl	v0.8b, {v1.16b, v2.16b}, v2.8b // encoding: [0x20,0x20,0x02,0x0e]
+// CHECK: tbl	v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b // encoding: [0x20,0x40,0x02,0x0e]
+// CHECK: tbl	v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.8b // encoding: [0x20,0x60,0x02,0x0e]
+// CHECK: tbl	v0.8b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.8b // encoding: [0xe0,0x63,0x02,0x0e]
+
+        tbl v0.16b, {v1.16b}, v2.16b
+        tbl v0.16b, {v1.16b, v2.16b}, v2.16b
+        tbl v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b
+        tbl v0.16b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.16b
+        tbl v0.16b, {v30.16b, v31.16b, v0.16b, v1.16b}, v2.16b
+
+// CHECK: tbl	v0.16b, {v1.16b}, v2.16b // encoding: [0x20,0x00,0x02,0x4e]
+// CHECK: tbl	v0.16b, {v1.16b, v2.16b}, v2.16b // encoding: [0x20,0x20,0x02,0x4e]
+// CHECK: tbl	v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b // encoding: [0x20,0x40,0x02,0x4e]
+// CHECK: tbl	v0.16b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.16b // encoding: [0x20,0x60,0x02,0x4e]
+// CHECK: tbl	v0.16b, {v30.16b, v31.16b, v0.16b, v1.16b}, v2.16b // encoding: [0xc0,0x63,0x02,0x4e]
+
+        tbx v0.8b, {v1.16b}, v2.8b
+        tbx v0.8b, {v1.16b, v2.16b}, v2.8b
+        tbx v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b
+        tbx v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.8b
+        tbx v0.8b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.8b
+
+// CHECK: tbx	v0.8b, {v1.16b}, v2.8b  // encoding: [0x20,0x10,0x02,0x0e]
+// CHECK: tbx	v0.8b, {v1.16b, v2.16b}, v2.8b // encoding: [0x20,0x30,0x02,0x0e]
+// CHECK: tbx	v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b // encoding: [0x20,0x50,0x02,0x0e]
+// CHECK: tbx	v0.8b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.8b // encoding: [0x20,0x70,0x02,0x0e]
+// CHECK: tbx	v0.8b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.8b // encoding: [0xe0,0x73,0x02,0x0e]
+
+        tbx v0.16b, {v1.16b}, v2.16b
+        tbx v0.16b, {v1.16b, v2.16b}, v2.16b
+        tbx v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b
+        tbx v0.16b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.16b
+        tbx v0.16b, {v30.16b, v31.16b, v0.16b, v1.16b}, v2.16b
+
+// CHECK: tbx	v0.16b, {v1.16b}, v2.16b // encoding: [0x20,0x10,0x02,0x4e]
+// CHECK: tbx	v0.16b, {v1.16b, v2.16b}, v2.16b // encoding: [0x20,0x30,0x02,0x4e]
+// CHECK: tbx	v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b // encoding: [0x20,0x50,0x02,0x4e]
+// CHECK: tbx	v0.16b, {v1.16b, v2.16b, v3.16b, v4.16b}, v2.16b // encoding: [0x20,0x70,0x02,0x4e]
+// CHECK: tbx	v0.16b, {v30.16b, v31.16b, v0.16b, v1.16b}, v2.16b // encoding: [0xc0,0x73,0x02,0x4e]
+
diff --git a/test/MC/ARM/2010-11-30-reloc-movt.s b/test/MC/ARM/2010-11-30-reloc-movt.s
new file mode 100644
index 0000000..9de88f0
--- /dev/null
+++ b/test/MC/ARM/2010-11-30-reloc-movt.s
@@ -0,0 +1,41 @@
+// RUN: llvm-mc  %s -triple=armv7-linux-gnueabi -filetype=obj -o - | \
+// RUN:    llvm-readobj -s -sr -sd | FileCheck  %s
+
+	.syntax unified
+	.eabi_attribute	6, 10
+	.eabi_attribute	8, 1
+	.eabi_attribute	9, 2
+	.fpu	neon
+	.eabi_attribute	20, 1
+	.eabi_attribute	21, 1
+	.eabi_attribute	23, 3
+	.eabi_attribute	24, 1
+	.eabi_attribute	25, 1
+	.file	"/home/espindola/llvm/llvm/test/CodeGen/ARM/2010-11-30-reloc-movt.ll"
+	.text
+	.globl	barf
+	.align	2
+	.type	barf,%function
+barf:                                   @ @barf
+@ BB#0:                                 @ %entry
+	push	{r11, lr}
+	movw	r0, :lower16:a
+	movt	r0, :upper16:a
+	bl	foo
+	pop	{r11, pc}
+.Ltmp0:
+	.size	barf, .Ltmp0-barf
+
+
+
+// CHECK:        Section {
+// CHECK:          Name: .text
+// CHECK:          SectionData (
+// CHECK-NEXT:       0000: 00482DE9 000000E3 000040E3 FEFFFFEB
+// CHECK-NEXT:       0010: 0088BDE8
+// CHECK-NEXT:     )
+// CHECK:          Relocations [
+// CHECK-NEXT:       0x4 R_ARM_MOVW_ABS_NC a
+// CHECK-NEXT:       0x8 R_ARM_MOVT_ABS
+// CHECK-NEXT:       0xC R_ARM_CALL foo
+// CHECK-NEXT:     ]
diff --git a/test/MC/ARM/AlignedBundling/lit.local.cfg b/test/MC/ARM/AlignedBundling/lit.local.cfg
index 6c49f08..ba763cf 100644
--- a/test/MC/ARM/AlignedBundling/lit.local.cfg
+++ b/test/MC/ARM/AlignedBundling/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/MC/ARM/align_arm_2_thumb.s b/test/MC/ARM/align_arm_2_thumb.s
new file mode 100644
index 0000000..120e964
--- /dev/null
+++ b/test/MC/ARM/align_arm_2_thumb.s
@@ -0,0 +1,15 @@
+@ RUN: llvm-mc -triple armv7-none-linux -filetype=obj -o %t.o %s
+@ RUN: llvm-objdump -triple thumbv7-none-linux -d %t.o | FileCheck --check-prefix=ARM_2_THUMB %s
+
+@ RUN: llvm-mc -triple armv7-apple-darwin -filetype=obj -o %t_darwin.o %s
+@ RUN: llvm-objdump -triple thumbv7-apple-darwin -d %t_darwin.o | FileCheck --check-prefix=ARM_2_THUMB %s
+
+.syntax unified
+.code 16
+@ ARM_2_THUMB-LABEL: foo
+foo:
+  add r0, r0
+.align 3
+@ ARM_2_THUMB: 2: 00 bf     nop
+  add r0, r0
+
diff --git a/test/MC/ARM/align_thumb_2_arm.s b/test/MC/ARM/align_thumb_2_arm.s
new file mode 100644
index 0000000..328bfab
--- /dev/null
+++ b/test/MC/ARM/align_thumb_2_arm.s
@@ -0,0 +1,15 @@
+@ RUN: llvm-mc -triple thumbv7-none-linux -filetype=obj -o %t.o %s
+@ RUN: llvm-objdump -triple armv7-none-linux -d %t.o | FileCheck --check-prefix=THUMB_2_ARM %s
+
+@ RUN: llvm-mc -triple thumbv7-apple-darwin -filetype=obj -o %t_darwin.o %s
+@ RUN: llvm-objdump -triple armv7-apple-darwin -d %t_darwin.o | FileCheck --check-prefix=THUMB_2_ARM %s
+
+.syntax unified
+.code 32
+@ THUMB_2_ARM-LABEL: foo
+foo:
+  add r0, r0
+.align 3
+@ THUMB_2_ARM: 4: 00 f0 20 e3    nop
+  add r0, r0
+
diff --git a/test/MC/ARM/arm-ldrd.s b/test/MC/ARM/arm-ldrd.s
new file mode 100644
index 0000000..c26ee25
--- /dev/null
+++ b/test/MC/ARM/arm-ldrd.s
@@ -0,0 +1,57 @@
+// RUN: not llvm-mc -arch arm -mattr=+v5te %s 2>&1 | FileCheck %s
+//
+// rdar://14479793
+
+ldrd r1, r2, [pc, #0]
+ldrd r1, r2, [r3, #4]
+ldrd r1, r2, [r3], #4
+ldrd r1, r2, [r3, #4]!
+ldrd r1, r2, [r3, -r4]!
+ldrd r1, r2, [r3, r4]
+ldrd r1, r2, [r3], r4
+// CHECK: error: Rt must be even-numbered
+// CHECK: error: Rt must be even-numbered
+// CHECK: error: Rt must be even-numbered
+// CHECK: error: Rt must be even-numbered
+// CHECK: error: Rt must be even-numbered
+// CHECK: error: Rt must be even-numbered
+// CHECK: error: Rt must be even-numbered
+
+ldrd r0, r3, [pc, #0]
+ldrd r0, r3, [r4, #4]
+ldrd r0, r3, [r4], #4
+ldrd r0, r3, [r4, #4]!
+ldrd r0, r3, [r4, -r5]!
+ldrd r0, r3, [r4, r5]
+ldrd r0, r3, [r4], r5
+// CHECK: error: destination operands must be sequential
+// CHECK: error: destination operands must be sequential
+// CHECK: error: destination operands must be sequential
+// CHECK: error: destination operands must be sequential
+// CHECK: error: destination operands must be sequential
+// CHECK: error: destination operands must be sequential
+// CHECK: error: destination operands must be sequential
+
+ldrd lr, pc, [pc, #0]
+ldrd lr, pc, [r3, #4]
+ldrd lr, pc, [r3], #4
+ldrd lr, pc, [r3, #4]!
+ldrd lr, pc, [r3, -r4]!
+ldrd lr, pc, [r3, r4]
+ldrd lr, pc, [r3], r4
+// CHECK: error: Rt can't be R14
+// CHECK: error: Rt can't be R14
+// CHECK: error: Rt can't be R14
+// CHECK: error: Rt can't be R14
+// CHECK: error: Rt can't be R14
+// CHECK: error: Rt can't be R14
+// CHECK: error: Rt can't be R14
+
+ldrd r0, r1, [r0], #4
+ldrd r0, r1, [r1], #4
+ldrd r0, r1, [r0, #4]!
+ldrd r0, r1, [r1, #4]!
+// CHECK: error: base register needs to be different from destination registers
+// CHECK: error: base register needs to be different from destination registers
+// CHECK: error: base register needs to be different from destination registers
+// CHECK: error: base register needs to be different from destination registers
diff --git a/test/MC/ARM/arm-memory-instructions.s b/test/MC/ARM/arm-memory-instructions.s
index e9f0c3d..ad35dd2 100644
--- a/test/MC/ARM/arm-memory-instructions.s
+++ b/test/MC/ARM/arm-memory-instructions.s
@@ -114,21 +114,21 @@ _func:
 @------------------------------------------------------------------------------
 @ LDRD (immediate)
 @------------------------------------------------------------------------------
-        ldrd r3, r4, [r5]
-        ldrd r7, r8, [r2, #15]
-        ldrd r1, r2, [r9, #32]!
+        ldrd r2, r3, [r5]
+        ldrd r6, r7, [r2, #15]
+        ldrd r0, r1, [r9, #32]!
         ldrd r6, r7, [r1], #8
-        ldrd r1, r2, [r8], #0
-        ldrd r1, r2, [r8], #+0
-        ldrd r1, r2, [r8], #-0
+        ldrd r0, r1, [r8], #0
+        ldrd r0, r1, [r8], #+0
+        ldrd r0, r1, [r8], #-0
 
-@ CHECK: ldrd	r3, r4, [r5]            @ encoding: [0xd0,0x30,0xc5,0xe1]
-@ CHECK: ldrd	r7, r8, [r2, #15]       @ encoding: [0xdf,0x70,0xc2,0xe1]
-@ CHECK: ldrd	r1, r2, [r9, #32]!      @ encoding: [0xd0,0x12,0xe9,0xe1]
-@ CHECK: ldrd	r6, r7, [r1], #8        @ encoding: [0xd8,0x60,0xc1,0xe0]
-@ CHECK: ldrd	r1, r2, [r8], #0        @ encoding: [0xd0,0x10,0xc8,0xe0]
-@ CHECK: ldrd	r1, r2, [r8], #0        @ encoding: [0xd0,0x10,0xc8,0xe0]
-@ CHECK: ldrd	r1, r2, [r8], #-0       @ encoding: [0xd0,0x10,0x48,0xe0]
+@ CHECK: ldrd r2, r3, [r5]            @ encoding: [0xd0,0x20,0xc5,0xe1]
+@ CHECK: ldrd r6, r7, [r2, #15]       @ encoding: [0xdf,0x60,0xc2,0xe1]
+@ CHECK: ldrd r0, r1, [r9, #32]!      @ encoding: [0xd0,0x02,0xe9,0xe1]
+@ CHECK: ldrd r6, r7, [r1], #8        @ encoding: [0xd8,0x60,0xc1,0xe0]
+@ CHECK: ldrd r0, r1, [r8], #0        @ encoding: [0xd0,0x00,0xc8,0xe0]
+@ CHECK: ldrd r0, r1, [r8], #0        @ encoding: [0xd0,0x00,0xc8,0xe0]
+@ CHECK: ldrd r0, r1, [r8], #-0       @ encoding: [0xd0,0x00,0x48,0xe0]
 
 
 @------------------------------------------------------------------------------
@@ -143,15 +143,15 @@ Lbaz: .quad 0
 @------------------------------------------------------------------------------
 @ LDRD (register)
 @------------------------------------------------------------------------------
-        ldrd r3, r4, [r1, r3]
+        ldrd r4, r5, [r1, r3]
         ldrd r4, r5, [r7, r2]!
-        ldrd r1, r2, [r8], r12
-        ldrd r1, r2, [r8], -r12
+        ldrd r0, r1, [r8], r12
+        ldrd r0, r1, [r8], -r12
 
-@ CHECK: ldrd	r3, r4, [r1, r3]        @ encoding: [0xd3,0x30,0x81,0xe1]
-@ CHECK: ldrd	r4, r5, [r7, r2]!       @ encoding: [0xd2,0x40,0xa7,0xe1]
-@ CHECK: ldrd	r1, r2, [r8], r12       @ encoding: [0xdc,0x10,0x88,0xe0]
-@ CHECK: ldrd	r1, r2, [r8], -r12      @ encoding: [0xdc,0x10,0x08,0xe0]
+@ CHECK: ldrd r4, r5, [r1, r3]        @ encoding: [0xd3,0x40,0x81,0xe1]
+@ CHECK: ldrd r4, r5, [r7, r2]!       @ encoding: [0xd2,0x40,0xa7,0xe1]
+@ CHECK: ldrd r0, r1, [r8], r12       @ encoding: [0xdc,0x00,0x88,0xe0]
+@ CHECK: ldrd r0, r1, [r8], -r12      @ encoding: [0xdc,0x00,0x08,0xe0]
 
 
 @------------------------------------------------------------------------------
diff --git a/test/MC/ARM/basic-arm-instructions-v8.s b/test/MC/ARM/basic-arm-instructions-v8.s
new file mode 100644
index 0000000..4ed83c1
--- /dev/null
+++ b/test/MC/ARM/basic-arm-instructions-v8.s
@@ -0,0 +1,59 @@
+@ New ARMv8 A32 encodings
+
+@ RUN: llvm-mc -triple armv8 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-V8
+@ RUN: not llvm-mc -triple armv7 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V7
+
+@ HLT
+        hlt  #0
+        hlt  #65535
+@ CHECK-V8: hlt  #0                       @ encoding: [0x70,0x00,0x00,0xe1]
+@ CHECK-V8: hlt  #65535                   @ encoding: [0x7f,0xff,0x0f,0xe1]
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+
+@ AL condition code allowable
+        hltal  #0
+@ CHECK-V8: hlt  #0                       @ encoding: [0x70,0x00,0x00,0xe1]
+@ CHECK-V7: error: instruction requires: armv8
+
+@------------------------------------------------------------------------------
+@ DMB (v8 barriers)
+@------------------------------------------------------------------------------
+        dmb ishld
+        dmb oshld
+        dmb nshld
+        dmb ld
+
+@ CHECK-V8: dmb ishld @ encoding: [0x59,0xf0,0x7f,0xf5]
+@ CHECK-V8: dmb oshld @ encoding: [0x51,0xf0,0x7f,0xf5]
+@ CHECK-V8: dmb nshld @ encoding: [0x55,0xf0,0x7f,0xf5]
+@ CHECK-V8: dmb ld @ encoding: [0x5d,0xf0,0x7f,0xf5]
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+
+@------------------------------------------------------------------------------
+@ DSB (v8 barriers)
+@------------------------------------------------------------------------------
+        dsb ishld
+        dsb oshld
+        dsb nshld
+        dsb ld
+
+@ CHECK-V8: dsb ishld @ encoding: [0x49,0xf0,0x7f,0xf5]
+@ CHECK-V8: dsb oshld @ encoding: [0x41,0xf0,0x7f,0xf5]
+@ CHECK-V8: dsb nshld @ encoding: [0x45,0xf0,0x7f,0xf5]
+@ CHECK-V8: dsb ld @ encoding: [0x4d,0xf0,0x7f,0xf5]
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+
+@------------------------------------------------------------------------------
+@ SEVL
+@------------------------------------------------------------------------------
+        sevl
+
+@ CHECK-V8: sevl @ encoding: [0x05,0xf0,0x20,0xe3]
+@ CHECK-V7: error: instruction requires: armv8
diff --git a/test/MC/ARM/basic-arm-instructions.s b/test/MC/ARM/basic-arm-instructions.s
index ead2ce1..29bc6c0 100644
--- a/test/MC/ARM/basic-arm-instructions.s
+++ b/test/MC/ARM/basic-arm-instructions.s
@@ -153,7 +153,6 @@ Lforward:
 @ CHECK: adr	r1, #301989888          @ encoding: [0x12,0x14,0x8f,0xe2]
 @ CHECK: adr	r1, #-2147483647        @ encoding: [0x06,0x11,0x8f,0xe2]
 
-
 @------------------------------------------------------------------------------
 @ ADD
 @------------------------------------------------------------------------------
@@ -187,6 +186,7 @@ Lforward:
 
 	add r0, #-4
 	add r4, r5, #-21
+        add r0, pc, #0xc0000000
 
 @ CHECK: add	r4, r5, #61440          @ encoding: [0x0f,0x4a,0x85,0xe2]
 @ CHECK: add	r4, r5, r6              @ encoding: [0x06,0x40,0x85,0xe0]
@@ -217,6 +217,7 @@ Lforward:
 
 @ CHECK: sub	r0, r0, #4              @ encoding: [0x04,0x00,0x40,0xe2]
 @ CHECK: sub	r4, r5, #21             @ encoding: [0x15,0x40,0x45,0xe2]
+@ CHECK: adr    r0, #-1073741824        @ encoding: [0x03,0x01,0x8f,0xe2]
 
     @ Test right shift by 32, which is encoded as 0
     add r3, r1, r2, lsr #32
@@ -459,11 +460,11 @@ Lforward:
 @------------------------------------------------------------------------------
         cdp  p7, #1, c1, c1, c1, #4
         cdp2  p7, #1, c1, c1, c1, #4
-        cdp2   p10, #0, c6, c12, c0, #7
+        cdp2   p12, #0, c6, c12, c0, #7
 
 @ CHECK: cdp  p7, #1, c1, c1, c1, #4     @ encoding: [0x81,0x17,0x11,0xee]
 @ CHECK: cdp2  p7, #1, c1, c1, c1, #4    @ encoding: [0x81,0x17,0x11,0xfe]
-@ CHECK: cdp2  p10, #0, c6, c12, c0, #7   @ encoding: [0xe0,0x6a,0x0c,0xfe]
+@ CHECK: cdp2  p12, #0, c6, c12, c0, #7   @ encoding: [0xe0,0x6c,0x0c,0xfe]
 
         cdpne  p7, #1, c1, c1, c1, #4
 @ CHECK: cdpne  p7, #1, c1, c1, c1, #4     @ encoding: [0x81,0x17,0x11,0x1e]
@@ -804,8 +805,8 @@ Lforward:
         ldc2l p7, c1, [r8]
         ldc2l p8, c0, [r9, #-224]
         ldc2l p9, c1, [r10, #-120]!
-        ldc2l p10, c2, [r11], #16
-        ldc2l p11, c3, [r12], #-72
+        ldc2l p0, c2, [r11], #16
+        ldc2l p1, c3, [r12], #-72
 
         ldc p12, c4, [r0, #4]
         ldc p13, c5, [r1]
@@ -845,8 +846,8 @@ Lforward:
 @ CHECK: ldc2l	p7, c1, [r8]            @ encoding: [0x00,0x17,0xd8,0xfd]
 @ CHECK: ldc2l	p8, c0, [r9, #-224]     @ encoding: [0x38,0x08,0x59,0xfd]
 @ CHECK: ldc2l	p9, c1, [r10, #-120]!   @ encoding: [0x1e,0x19,0x7a,0xfd]
-@ CHECK: ldc2l	p10, c2, [r11], #16     @ encoding: [0x04,0x2a,0xfb,0xfc]
-@ CHECK: ldc2l	p11, c3, [r12], #-72    @ encoding: [0x12,0x3b,0x7c,0xfc]
+@ CHECK: ldc2l	p0, c2, [r11], #16      @ encoding: [0x04,0x20,0xfb,0xfc]
+@ CHECK: ldc2l	p1, c3, [r12], #-72     @ encoding: [0x12,0x31,0x7c,0xfc]
 
 @ CHECK: ldc	p12, c4, [r0, #4]       @ encoding: [0x01,0x4c,0x90,0xed]
 @ CHECK: ldc	p13, c5, [r1]           @ encoding: [0x00,0x5d,0x91,0xed]
@@ -1077,12 +1078,12 @@ Lforward:
         mrc  p14, #0, r1, c1, c2, #4
         mrc  p15, #7, apsr_nzcv, c15, c6, #6
         mrc2  p14, #0, r1, c1, c2, #4
-        mrc2  p10, #7, apsr_nzcv, c15, c0, #1
+        mrc2  p9, #7, apsr_nzcv, c15, c0, #1
 
 @ CHECK: mrc  p14, #0, r1, c1, c2, #4             @ encoding: [0x92,0x1e,0x11,0xee]
 @ CHECK: mrc  p15, #7, apsr_nzcv, c15, c6, #6     @ encoding: [0xd6,0xff,0xff,0xee]
 @ CHECK: mrc2  p14, #0, r1, c1, c2, #4            @ encoding: [0x92,0x1e,0x11,0xfe]
-@ CHECK: mrc2  p10, #7, apsr_nzcv, c15, c0, #1    @ encoding: [0x30,0xfa,0xff,0xfe]
+@ CHECK: mrc2  p9, #7, apsr_nzcv, c15, c0, #1     @ encoding: [0x30,0xf9,0xff,0xfe]
 
          mrceq  p15, #7, apsr_nzcv, c15, c6, #6
 @ CHECK: mrceq  p15, #7, apsr_nzcv, c15, c6, #6   @ encoding: [0xd6,0xff,0xff,0x0e]
@@ -2239,8 +2240,8 @@ Lforward:
         stc2l p7, c1, [r8]
         stc2l p8, c0, [r9, #-224]
         stc2l p9, c1, [r10, #-120]!
-        stc2l p10, c2, [r11], #16
-        stc2l p11, c3, [r12], #-72
+        stc2l p0, c2, [r11], #16
+        stc2l p1, c3, [r12], #-72
 
         stc p12, c4, [r0, #4]
         stc p13, c5, [r1]
@@ -2280,8 +2281,8 @@ Lforward:
 @ CHECK: stc2l	p7, c1, [r8]            @ encoding: [0x00,0x17,0xc8,0xfd]
 @ CHECK: stc2l	p8, c0, [r9, #-224]     @ encoding: [0x38,0x08,0x49,0xfd]
 @ CHECK: stc2l	p9, c1, [r10, #-120]!   @ encoding: [0x1e,0x19,0x6a,0xfd]
-@ CHECK: stc2l	p10, c2, [r11], #16     @ encoding: [0x04,0x2a,0xeb,0xfc]
-@ CHECK: stc2l	p11, c3, [r12], #-72    @ encoding: [0x12,0x3b,0x6c,0xfc]
+@ CHECK: stc2l	p0, c2, [r11], #16      @ encoding: [0x04,0x20,0xeb,0xfc]
+@ CHECK: stc2l	p1, c3, [r12], #-72     @ encoding: [0x12,0x31,0x6c,0xfc]
 
 @ CHECK: stc	p12, c4, [r0, #4]       @ encoding: [0x01,0x4c,0x80,0xed]
 @ CHECK: stc	p13, c5, [r1]           @ encoding: [0x00,0x5d,0x81,0xed]
@@ -2927,6 +2928,7 @@ Lforward:
         hint #2
         hint #1
         hint #0
+        hintgt #239
 
 @ CHECK: wfe                            @ encoding: [0x02,0xf0,0x20,0xe3]
 @ CHECK: wfehi                          @ encoding: [0x02,0xf0,0x20,0x83]
@@ -2939,3 +2941,4 @@ Lforward:
 @ CHECK: wfe                            @ encoding: [0x02,0xf0,0x20,0xe3]
 @ CHECK: yield                          @ encoding: [0x01,0xf0,0x20,0xe3]
 @ CHECK: nop                            @ encoding: [0x00,0xf0,0x20,0xe3]
+@ CHECK: hintgt #239                    @ encoding: [0xef,0xf0,0x20,0xc3]
diff --git a/test/MC/ARM/basic-thumb-instructions.s b/test/MC/ARM/basic-thumb-instructions.s
index b48db9a..dec7f5b 100644
--- a/test/MC/ARM/basic-thumb-instructions.s
+++ b/test/MC/ARM/basic-thumb-instructions.s
@@ -128,7 +128,7 @@ _func:
         beq _bar
         b       #1838
         b       #-420
-        beq     #336
+        beq     #-256
         beq     #160
 
 @ CHECK: b	_baz                    @ encoding: [A,0xe0'A']
@@ -137,7 +137,7 @@ _func:
              @   fixup A - offset: 0, value: _bar, kind: fixup_arm_thumb_bcc
 @ CHECK: b       #1838                   @ encoding: [0x97,0xe3]
 @ CHECK: b       #-420                   @ encoding: [0x2e,0xe7]
-@ CHECK: beq     #336                    @ encoding: [0xa8,0xd0]
+@ CHECK: beq     #-256                   @ encoding: [0x80,0xd0]
 @ CHECK: beq     #160                    @ encoding: [0x50,0xd0]
 
 @------------------------------------------------------------------------------
@@ -216,6 +216,16 @@ _func:
 @ CHECK: cmp	r8, r1                  @ encoding: [0x88,0x45]
 
 @------------------------------------------------------------------------------
+@ CPS
+@------------------------------------------------------------------------------
+
+        cpsie f
+        cpsid a
+
+@ CHECK: cpsie f                        @ encoding: [0x61,0xb6]
+@ CHECK: cpsid a                        @ encoding: [0x74,0xb6]
+
+@------------------------------------------------------------------------------
 @ EOR
 @------------------------------------------------------------------------------
         eors r4, r5
diff --git a/test/MC/ARM/basic-thumb2-instructions-v8.s b/test/MC/ARM/basic-thumb2-instructions-v8.s
new file mode 100644
index 0000000..a7882ae
--- /dev/null
+++ b/test/MC/ARM/basic-thumb2-instructions-v8.s
@@ -0,0 +1,87 @@
+@ New ARMv8 T32 encodings
+
+@ RUN: llvm-mc -triple thumbv8 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-V8
+@ RUN: not llvm-mc -triple thumbv7 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V7
+
+@ HLT
+        hlt  #0
+        hlt  #63
+@ CHECK-V8: hlt  #0                       @ encoding: [0x80,0xba]
+@ CHECK-V8: hlt  #63                      @ encoding: [0xbf,0xba]
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+
+@ In IT block
+        it pl
+        hlt #24
+
+@ CHECK-V8: it pl                         @ encoding: [0x58,0xbf]
+@ CHECK-V8: hlt #24                       @ encoding: [0x98,0xba]
+@ CHECK-V7: error: instruction requires: armv8
+
+@ Can accept AL condition code
+        hltal #24
+@ CHECK-V8: hlt #24                       @ encoding: [0x98,0xba]
+@ CHECK-V7: error: instruction requires: armv8
+
+@ DCPS{1,2,3}
+        dcps1
+        dcps2
+        dcps3
+@ CHECK-V8: dcps1                         @ encoding: [0x8f,0xf7,0x01,0x80]
+@ CHECK-V8: dcps2                         @ encoding: [0x8f,0xf7,0x02,0x80]
+@ CHECK-V8: dcps3                         @ encoding: [0x8f,0xf7,0x03,0x80]
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+
+@------------------------------------------------------------------------------
+@ DMB (v8 barriers)
+@------------------------------------------------------------------------------
+        dmb ishld
+        dmb oshld
+        dmb nshld
+        dmb ld
+
+@ CHECK-V8: dmb ishld @ encoding: [0xbf,0xf3,0x59,0x8f]
+@ CHECK-V8: dmb oshld @ encoding: [0xbf,0xf3,0x51,0x8f]
+@ CHECK-V8: dmb nshld @ encoding: [0xbf,0xf3,0x55,0x8f]
+@ CHECK-V8: dmb ld @ encoding: [0xbf,0xf3,0x5d,0x8f]
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+
+@------------------------------------------------------------------------------
+@ DSB (v8 barriers)
+@------------------------------------------------------------------------------
+        dsb ishld
+        dsb oshld
+        dsb nshld
+        dsb ld
+
+@ CHECK-V8: dsb ishld @ encoding: [0xbf,0xf3,0x49,0x8f]
+@ CHECK-V8: dsb oshld @ encoding: [0xbf,0xf3,0x41,0x8f]
+@ CHECK-V8: dsb nshld @ encoding: [0xbf,0xf3,0x45,0x8f]
+@ CHECK-V8: dsb ld @ encoding: [0xbf,0xf3,0x4d,0x8f]
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+@ CHECK-V7: error: invalid operand for instruction
+
+@------------------------------------------------------------------------------
+@ SEVL
+@------------------------------------------------------------------------------
+        sevl
+        sevl.w
+        it ge
+        sevlge
+
+@ CHECK-V8: sevl @ encoding: [0x50,0xbf]
+@ CHECK-V8: sevl.w @ encoding: [0xaf,0xf3,0x05,0x80]
+@ CHECK-V8: it ge @ encoding: [0xa8,0xbf]
+@ CHECK-V8: sevlge @ encoding: [0x50,0xbf]
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error:
+@ CHECK-V7: error: instruction requires: armv8
diff --git a/test/MC/ARM/basic-thumb2-instructions.s b/test/MC/ARM/basic-thumb2-instructions.s
index b5d3966..3a5f488 100644
--- a/test/MC/ARM/basic-thumb2-instructions.s
+++ b/test/MC/ARM/basic-thumb2-instructions.s
@@ -80,6 +80,7 @@ _func:
         adds r2, r2, #56
         adds r2, #56
         add r1, r7, #0xcbcbcbcb
+        add sp, sp, #0x1fe0000
 
         adds.w r2, #-16
         adds.w r2, r2, #-16
@@ -103,6 +104,7 @@ _func:
 @ CHECK: adds	r2, #56                 @ encoding: [0x38,0x32]
 @ CHECK: adds	r2, #56                 @ encoding: [0x38,0x32]
 @ CHECK: add.w  r1, r7, #3419130827     @ encoding: [0x07,0xf1,0xcb,0x31]
+@ CHECK: add.w	sp, sp, #33423360       @ encoding: [0x0d,0xf1,0xff,0x7d]
 
 @ CHECK: subs.w	r2, r2, #16             @ encoding: [0xb2,0xf1,0x10,0x02]
 @ CHECK: subs.w	r2, r2, #16             @ encoding: [0xb2,0xf1,0x10,0x02]
@@ -152,12 +154,15 @@ _func:
         ands r3, r12, #0xf
         and r1, #0xff
         and r1, r1, #0xff
+        and r5, r4, #0xffffffff
+        ands r1, r9, #0xffffffff
 
 @ CHECK: and	r2, r5, #1044480        @ encoding: [0x05,0xf4,0x7f,0x22]
 @ CHECK: ands	r3, r12, #15            @ encoding: [0x1c,0xf0,0x0f,0x03]
 @ CHECK: and	r1, r1, #255            @ encoding: [0x01,0xf0,0xff,0x01]
 @ CHECK: and	r1, r1, #255            @ encoding: [0x01,0xf0,0xff,0x01]
-
+@ CHECK: and	r5, r4, #4294967295     @ encoding: [0x04,0xf0,0xff,0x35]
+@ CHECK: ands	r1, r9, #4294967295     @ encoding: [0x19,0xf0,0xff,0x31]
 
 @------------------------------------------------------------------------------
 @ AND (register)
@@ -259,6 +264,8 @@ _func:
 @ BIC
 @------------------------------------------------------------------------------
         bic r10, r1, #0xf
+        bic r5, r2, #0xffffffff
+        bics r11, r10, #0xffffffff
         bic r12, r3, r6
         bic r11, r2, r6, lsl #12
         bic r8, r4, r1, lsr #11
@@ -276,6 +283,8 @@ _func:
         bic r12, r6, ror #29
 
 @ CHECK: bic	r10, r1, #15            @ encoding: [0x21,0xf0,0x0f,0x0a]
+@ CHECK: bic	r5, r2, #4294967295     @ encoding: [0x22,0xf0,0xff,0x35]
+@ CHECK: bics	r11, r10, #4294967295   @ encoding: [0x3a,0xf0,0xff,0x3b]
 @ CHECK: bic.w	r12, r3, r6             @ encoding: [0x23,0xea,0x06,0x0c]
 @ CHECK: bic.w	r11, r2, r6, lsl #12    @ encoding: [0x22,0xea,0x06,0x3b]
 @ CHECK: bic.w	r8, r4, r1, lsr #11     @ encoding: [0x24,0xea,0xd1,0x28]
@@ -405,6 +414,31 @@ _func:
 @ CHECK: cmn.w	r2, #2                  @ encoding: [0x12,0xf1,0x02,0x0f]
 @ CHECK: cmp.w	r9, #1                  @ encoding: [0xb9,0xf1,0x01,0x0f]
 
+@------------------------------------------------------------------------------
+@ CPS
+@------------------------------------------------------------------------------
+
+        cpsie f
+        cpsid a
+        cpsie.w f
+        cpsid.w a
+        cpsie i, #3
+        cpsie.w i, #3
+        cpsid f, #9
+        cpsid.w f, #9
+        cps #0
+        cps.w #0
+
+@ CHECK: cpsie f                        @ encoding: [0x61,0xb6]
+@ CHECK: cpsid a                        @ encoding: [0x74,0xb6]
+@ CHECK: cpsie.w f                      @ encoding: [0xaf,0xf3,0x20,0x84]
+@ CHECK: cpsid.w a                      @ encoding: [0xaf,0xf3,0x80,0x86]
+@ CHECK: cpsie i, #3                    @ encoding: [0xaf,0xf3,0x43,0x85]
+@ CHECK: cpsie i, #3                    @ encoding: [0xaf,0xf3,0x43,0x85]
+@ CHECK: cpsid f, #9                    @ encoding: [0xaf,0xf3,0x29,0x87]
+@ CHECK: cpsid f, #9                    @ encoding: [0xaf,0xf3,0x29,0x87]
+@ CHECK: cps   #0                       @ encoding: [0xaf,0xf3,0x00,0x81]
+@ CHECK: cps   #0                       @ encoding: [0xaf,0xf3,0x00,0x81]
 
 @------------------------------------------------------------------------------
 @ DBG
@@ -628,8 +662,8 @@ _func:
         ldc2l p7, c1, [r8]
         ldc2l p8, c0, [r9, #-224]
         ldc2l p9, c1, [r10, #-120]!
-        ldc2l p10, c2, [r11], #16
-        ldc2l p11, c3, [r12], #-72
+        ldc2l p0, c2, [r11], #16
+        ldc2l p1, c3, [r12], #-72
 
         ldc p12, c4, [r0, #4]
         ldc p13, c5, [r1]
@@ -656,8 +690,8 @@ _func:
 @ CHECK: ldc2l	p7, c1, [r8]            @ encoding: [0xd8,0xfd,0x00,0x17]
 @ CHECK: ldc2l	p8, c0, [r9, #-224]     @ encoding: [0x59,0xfd,0x38,0x08]
 @ CHECK: ldc2l	p9, c1, [r10, #-120]!   @ encoding: [0x7a,0xfd,0x1e,0x19]
-@ CHECK: ldc2l	p10, c2, [r11], #16     @ encoding: [0xfb,0xfc,0x04,0x2a]
-@ CHECK: ldc2l	p11, c3, [r12], #-72    @ encoding: [0x7c,0xfc,0x12,0x3b]
+@ CHECK: ldc2l	p0, c2, [r11], #16      @ encoding: [0xfb,0xfc,0x04,0x20]
+@ CHECK: ldc2l	p1, c3, [r12], #-72     @ encoding: [0x7c,0xfc,0x12,0x31]
 
 @ CHECK: ldc	p12, c4, [r0, #4]       @ encoding: [0x90,0xed,0x01,0x4c]
 @ CHECK: ldc	p13, c5, [r1]           @ encoding: [0x91,0xed,0x00,0x5d]
@@ -772,23 +806,43 @@ _func:
 @ CHECK: ldr.w	lr, _strcmp-4           @ encoding: [0x5f'A',0xf8'A',A,0xe0'A']
 @ CHECK: @   fixup A - offset: 0, value: _strcmp-4, kind: fixup_t2_ldst_pcrel_12
 
+        ldr r7, [pc, #8]
+        ldr.n r7, [pc, #8]
+        ldr.w r7, [pc, #8]
         ldr r4, [pc, #1020]
         ldr r3, [pc, #-1020]
         ldr r6, [pc, #1024]
         ldr r0, [pc, #-1024]
         ldr r2, [pc, #4095]
         ldr r1, [pc, #-4095]
-        ldr.n r8, [pc, #132]
-        ldr.w r8, [pc, #132]
+        ldr r8, [pc, #132]
+        ldr pc, [pc, #256]
+        ldr pc, [pc, #-400]
 
+@ CHECK: ldr	r7, [pc, #8]            @ encoding: [0x02,0x4f]
+@ CHECK: ldr	r7, [pc, #8]            @ encoding: [0x02,0x4f]
+@ CHECK: ldr.w	r7, [pc, #8]            @ encoding: [0xdf,0xf8,0x08,0x70]
 @ CHECK: ldr	r4, [pc, #1020]       @ encoding: [0xff,0x4c]
-@ CHECK: ldr	r3, [pc, #-1020]      @ encoding: [0x01,0x4b]
+@ CHECK: ldr.w	r3, [pc, #-1020]        @ encoding: [0x5f,0xf8,0xfc,0x33]
 @ CHECK: ldr.w	r6, [pc, #1024]       @ encoding: [0xdf,0xf8,0x00,0x64]
 @ CHECK: ldr.w	r0, [pc, #-1024]      @ encoding: [0x5f,0xf8,0x00,0x04]
 @ CHECK: ldr.w	r2, [pc, #4095]       @ encoding: [0xdf,0xf8,0xff,0x2f]
 @ CHECK: ldr.w	r1, [pc, #-4095]      @ encoding: [0x5f,0xf8,0xff,0x1f]
-@ CHECK: ldr	r8, [pc, #132]        @ encoding: [0x21,0x48]
 @ CHECK: ldr.w	r8, [pc, #132]        @ encoding: [0xdf,0xf8,0x84,0x80]
+@ CHECK: ldr.w	pc, [pc, #256]          @ encoding: [0xdf,0xf8,0x00,0xf1]
+@ CHECK: ldr.w	pc, [pc, #-400]         @ encoding: [0x5f,0xf8,0x90,0xf1]
+
+        ldrb  r9, [pc, #-0]
+        ldrsb r11, [pc, #-0]
+        ldrh  r10, [pc, #-0]
+        ldrsh r1, [pc, #-0]
+        ldr   r5, [pc, #-0]
+
+@ CHECK: ldrb.w	r9, [pc, #-0]           @ encoding: [0x1f,0xf8,0x00,0x90]
+@ CHECK: ldrsb.w	r11, [pc, #-0]        @ encoding: [0x1f,0xf9,0x00,0xb0]
+@ CHECK: ldrh.w	r10, [pc, #-0]          @ encoding: [0x3f,0xf8,0x00,0xa0]
+@ CHECK: ldrsh.w	r1, [pc, #-0]         @ encoding: [0x3f,0xf9,0x00,0x10]
+@ CHECK: ldr.w	r5, [pc, #-0]           @ encoding: [0x5f,0xf8,0x00,0x50]
 
 @------------------------------------------------------------------------------
 @ LDR(register)
@@ -1269,8 +1323,15 @@ _func:
         movlo r1, #-1
 
         @ alias for mvn
-	mov r3, #-3
+        mov r3, #-3
+        mov r11, #0xabcd
+        movs r0, #1
+        it ne
+        movne r3, #15
 
+        itt eq
+        moveq r0, #255
+        moveq r1, #256
 
 @ CHECK: movs	r1, #21                 @ encoding: [0x15,0x21]
 @ CHECK: movs.w	r1, #21                 @ encoding: [0x5f,0xf0,0x15,0x01]
@@ -1289,6 +1350,14 @@ _func:
 @ CHECK: it	lo                      @ encoding: [0x38,0xbf]
 @ CHECK: movlo.w	r1, #-1         @ encoding: [0x4f,0xf0,0xff,0x31]
 @ CHECK: mvn	r3, #2                  @ encoding: [0x6f,0xf0,0x02,0x03]
+@ CHECK: movw	r11, #43981             @ encoding: [0x4a,0xf6,0xcd,0x3b]
+@ CHECK: movs	r0, #1                  @ encoding: [0x01,0x20]
+@ CHECK: it	ne                      @ encoding: [0x18,0xbf]
+@ CHECK: movne	r3, #15                 @ encoding: [0x0f,0x23]
+
+@ CHECK: itt    eq                      @ encoding: [0x04,0xbf]
+@ CHECK: moveq  r0, #255                @ encoding: [0xff,0x20]
+@ CHECK: movweq r1, #256                @ encoding: [0x40,0xf2,0x00,0x11]
 
 @------------------------------------------------------------------------------
 @ MOV(shifted register)
@@ -1348,17 +1417,17 @@ _func:
 @------------------------------------------------------------------------------
         mrc  p14, #0, r1, c1, c2, #4
         mrc  p15, #7, apsr_nzcv, c15, c6, #6
-        mrc  p11, #1, r1, c2, c2
+        mrc  p9, #1, r1, c2, c2
         mrc2 p12, #3, r3, c3, c4
         mrc2 p14, #0, r1, c1, c2, #4
-        mrc2 p10, #7, apsr_nzcv, c15, c0, #1
+        mrc2 p8, #7, apsr_nzcv, c15, c0, #1
  
 @ CHECK: mrc  p14, #0, r1, c1, c2, #4            @ encoding: [0x11,0xee,0x92,0x1e]
 @ CHECK: mrc  p15, #7, apsr_nzcv, c15, c6, #6    @ encoding: [0xff,0xee,0xd6,0xff]
-@ CHECK: mrc  p11, #1, r1, c2, c2, #0            @ encoding: [0x32,0xee,0x12,0x1b]
+@ CHECK: mrc  p9, #1, r1, c2, c2, #0             @ encoding: [0x32,0xee,0x12,0x19]
 @ CHECK: mrc2 p12, #3, r3, c3, c4, #0            @ encoding: [0x73,0xfe,0x14,0x3c]
 @ CHECK: mrc2 p14, #0, r1, c1, c2, #4            @ encoding: [0x11,0xfe,0x92,0x1e]
-@ CHECK: mrc2 p10, #7, apsr_nzcv, c15, c0, #1    @ encoding: [0xff,0xfe,0x30,0xfa]
+@ CHECK: mrc2 p8, #7, apsr_nzcv, c15, c0, #1     @ encoding: [0xff,0xfe,0x30,0xf8]
  
 @------------------------------------------------------------------------------
 @ MRRC/MRRC2
@@ -2485,8 +2554,8 @@ _func:
         stc2l p7, c1, [r8]
         stc2l p8, c0, [r9, #-224]
         stc2l p9, c1, [r10, #-120]!
-        stc2l p10, c2, [r11], #16
-        stc2l p11, c3, [r12], #-72
+        stc2l p0, c2, [r11], #16
+        stc2l p1, c3, [r12], #-72
 
         stc p12, c4, [r0, #4]
         stc p13, c5, [r1]
@@ -2513,8 +2582,8 @@ _func:
 @ CHECK: stc2l	p7, c1, [r8]            @ encoding: [0xc8,0xfd,0x00,0x17]
 @ CHECK: stc2l	p8, c0, [r9, #-224]     @ encoding: [0x49,0xfd,0x38,0x08]
 @ CHECK: stc2l	p9, c1, [r10, #-120]!   @ encoding: [0x6a,0xfd,0x1e,0x19]
-@ CHECK: stc2l	p10, c2, [r11], #16     @ encoding: [0xeb,0xfc,0x04,0x2a]
-@ CHECK: stc2l	p11, c3, [r12], #-72    @ encoding: [0x6c,0xfc,0x12,0x3b]
+@ CHECK: stc2l	p0, c2, [r11], #16      @ encoding: [0xeb,0xfc,0x04,0x20]
+@ CHECK: stc2l	p1, c3, [r12], #-72     @ encoding: [0x6c,0xfc,0x12,0x31]
 
 @ CHECK: stc	p12, c4, [r0, #4]       @ encoding: [0x80,0xed,0x01,0x4c]
 @ CHECK: stc	p13, c5, [r1]           @ encoding: [0x81,0xed,0x00,0x5d]
@@ -3524,11 +3593,21 @@ _func:
         wfige
         yieldlt
         hint.w #4
+        hint.w #3
+        hint.w #2
+        hint.w #1
+        hint.w #0
+        hint #4
         hint #3
         hint #2
         hint #1
         hint #0
 
+        itet lt
+        hintlt #15
+        hintge #16
+        hintlt #239
+
 @ CHECK: wfe                            @ encoding: [0x20,0xbf]
 @ CHECK: wfi                            @ encoding: [0x30,0xbf]
 @ CHECK: yield                          @ encoding: [0x10,0xbf]
@@ -3541,7 +3620,24 @@ _func:
 @ CHECK: wfe.w                          @ encoding: [0xaf,0xf3,0x02,0x80]
 @ CHECK: yield.w                        @ encoding: [0xaf,0xf3,0x01,0x80]
 @ CHECK: nop.w                          @ encoding: [0xaf,0xf3,0x00,0x80]
+@ CHECK: sev                            @ encoding: [0x40,0xbf]
+@ CHECK: wfi                            @ encoding: [0x30,0xbf]
+@ CHECK: wfe                            @ encoding: [0x20,0xbf]
+@ CHECK: yield                          @ encoding: [0x10,0xbf]
+@ CHECK: nop                            @ encoding: [0x00,0xbf]
 
+@ CHECK: itet	lt                      @ encoding: [0xb6,0xbf]
+@ CHECK: hintlt #15                     @ encoding: [0xf0,0xbf]
+@ CHECK: hintge.w #16                   @ encoding: [0xaf,0xf3,0x10,0x80]
+@ CHECK: hintlt.w #239                  @ encoding: [0xaf,0xf3,0xef,0x80]
+
+@------------------------------------------------------------------------------
+@ Unallocated wide/narrow hints
+@------------------------------------------------------------------------------
+        hint #7
+        hint.w #7
+@ CHECK: hint #7                        @ encoding: [0x70,0xbf]
+@ CHECK: hint.w #7                      @ encoding: [0xaf,0xf3,0x07,0x80]
 
 @------------------------------------------------------------------------------
 @ Alternate syntax for LDR*(literal) encodings
diff --git a/test/MC/ARM/crc32-thumb.s b/test/MC/ARM/crc32-thumb.s
new file mode 100644
index 0000000..3a0e7a9
--- /dev/null
+++ b/test/MC/ARM/crc32-thumb.s
@@ -0,0 +1,30 @@
+@ RUN: llvm-mc -triple=thumbv8 -show-encoding < %s | FileCheck %s
+@ RUN: not llvm-mc -triple=thumbv7 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V7
+@ RUN: not llvm-mc -triple=thumbv8 -mattr=-crc -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOCRC
+        crc32b  r0, r1, r2
+        crc32h  r0, r1, r2
+        crc32w  r0, r1, r2
+
+@ CHECK:  crc32b    r0, r1, r2              @ encoding: [0xc1,0xfa,0x82,0xf0]
+@ CHECK:  crc32h    r0, r1, r2              @ encoding: [0xc1,0xfa,0x92,0xf0]
+@ CHECK:  crc32w    r0, r1, r2              @ encoding: [0xc1,0xfa,0xa2,0xf0]
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-NOCRC: error: instruction requires: crc
+@ CHECK-NOCRC: error: instruction requires: crc
+@ CHECK-NOCRC: error: instruction requires: crc
+
+        crc32cb  r0, r1, r2
+        crc32ch  r0, r1, r2
+        crc32cw  r0, r1, r2
+
+@ CHECK:  crc32cb   r0, r1, r2              @ encoding: [0xd1,0xfa,0x82,0xf0]
+@ CHECK:  crc32ch   r0, r1, r2              @ encoding: [0xd1,0xfa,0x92,0xf0]
+@ CHECK:  crc32cw   r0, r1, r2              @ encoding: [0xd1,0xfa,0xa2,0xf0]
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-NOCRC: error: instruction requires: crc
+@ CHECK-NOCRC: error: instruction requires: crc
+@ CHECK-NOCRC: error: instruction requires: crc
diff --git a/test/MC/ARM/crc32.s b/test/MC/ARM/crc32.s
new file mode 100644
index 0000000..45a1f0c
--- /dev/null
+++ b/test/MC/ARM/crc32.s
@@ -0,0 +1,30 @@
+@ RUN: llvm-mc -triple=armv8 -show-encoding < %s | FileCheck %s
+@ RUN: not llvm-mc -triple=armv7 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V7
+@ RUN: not llvm-mc -triple=thumbv8 -mattr=-crc -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOCRC
+        crc32b  r0, r1, r2
+        crc32h  r0, r1, r2
+        crc32w  r0, r1, r2
+
+@ CHECK:  crc32b    r0, r1, r2              @ encoding: [0x42,0x00,0x01,0xe1]
+@ CHECK:  crc32h    r0, r1, r2              @ encoding: [0x42,0x00,0x21,0xe1]
+@ CHECK:  crc32w    r0, r1, r2              @ encoding: [0x42,0x00,0x41,0xe1]
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-NOCRC: error: instruction requires: crc
+@ CHECK-NOCRC: error: instruction requires: crc
+@ CHECK-NOCRC: error: instruction requires: crc
+
+        crc32cb  r0, r1, r2
+        crc32ch  r0, r1, r2
+        crc32cw  r0, r1, r2
+
+@ CHECK:  crc32cb   r0, r1, r2              @ encoding: [0x42,0x02,0x01,0xe1]
+@ CHECK:  crc32ch   r0, r1, r2              @ encoding: [0x42,0x02,0x21,0xe1]
+@ CHECK:  crc32cw   r0, r1, r2              @ encoding: [0x42,0x02,0x41,0xe1]
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-V7: error: instruction requires: crc armv8
+@ CHECK-NOCRC: error: instruction requires: crc
+@ CHECK-NOCRC: error: instruction requires: crc
+@ CHECK-NOCRC: error: instruction requires: crc
diff --git a/test/MC/ARM/deprecated-v8.s b/test/MC/ARM/deprecated-v8.s
index e509a35..aa72c2e 100644
--- a/test/MC/ARM/deprecated-v8.s
+++ b/test/MC/ARM/deprecated-v8.s
@@ -1,3 +1,51 @@
-@ RUN: llvm-mc -triple armv8 -show-encoding < %s 2>&1 | FileCheck %s
+@ RUN: llvm-mc -triple armv8 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ARMV8
+@ RUN: llvm-mc -triple thumbv8 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-THUMBV8
+@ RUN: llvm-mc -triple armv7 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ARMV7
+@ RUN: llvm-mc -triple thumbv7 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-THUMBV7
+@ RUN: llvm-mc -triple armv6 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-ARMV6
 setend be
-@ CHECK: warning: deprecated on armv8
+@ CHECK-ARMV8: warning: deprecated
+@ CHECK-THUMBV8: warning: deprecated
+@ CHECK-ARMV7-NOT: warning: deprecated
+@ CHECK-THUMBV7-NOT: warning: deprecated
+mcr p15, #0, r5, c7, c5, #4
+@ CHECK-ARMV8: warning: deprecated since v7, use 'isb'
+@ CHECK-THUMBV8: warning: deprecated since v7, use 'isb'
+@ CHECK-ARMV7: warning: deprecated since v7, use 'isb'
+@ CHECK-THUMBV7: warning: deprecated since v7, use 'isb'
+@ CHECK-ARMV6-NOT: warning: deprecated since v7, use 'isb'
+mcr p15, #0, r5, c7, c10, #4
+@ CHECK-ARMV8: warning: deprecated since v7, use 'dsb'
+@ CHECK-THUMBV8: warning: deprecated since v7, use 'dsb'
+@ CHECK-ARMV7: warning: deprecated since v7, use 'dsb'
+@ CHECK-THUMBV7: warning: deprecated since v7, use 'dsb'
+@ CHECK-ARMV6-NOT: warning: deprecated since v7, use 'dsb'
+mcr p15, #0, r5, c7, c10, #5
+@ CHECK-ARMV8: warning: deprecated since v7, use 'dmb'
+@ CHECK-THUMBV8: warning: deprecated since v7, use 'dmb'
+@ CHECK-ARMV7: warning: deprecated since v7, use 'dmb'
+@ CHECK-THUMBV7: warning: deprecated since v7, use 'dmb'
+@ CHECK-ARMV6-NOT: warning: deprecated since v7, use 'dmb'
+it ge
+movge r0, #4096
+@ CHECK-THUMBV8: warning: deprecated instruction in IT block
+@ CHECK-THUMBV7-NOT: warning
+ite ge
+addge r0, r1
+addlt r0, r2
+@ CHECK-ARMV8: warning: applying IT instruction to more than one subsequent instruction is deprecated
+@ CHECK-THUMBV8: warning: applying IT instruction to more than one subsequent instruction is deprecated
+@ CHECK-THUMBV7-NOT: warning
+it ge
+movge r0, pc // invalid operand
+@ CHECK-THUMBV8: warning: deprecated instruction in IT block
+@ CHECK-THUMBV7-NOT: warning
+it ge
+revge r0, r0 // invalid instruction
+@ CHECK-THUMBV8: warning: deprecated instruction in IT block
+@ CHECK-THUMBV7-NOT: warning
+it ge
+clzge r0, r0 // only has 32-bit form
+@ CHECK-THUMBV8: warning: deprecated instruction in IT block
+@ CHECK-THUMBV7-NOT: warning
+
diff --git a/test/MC/ARM/diagnostics-noneon.s b/test/MC/ARM/diagnostics-noneon.s
new file mode 100644
index 0000000..310344a
--- /dev/null
+++ b/test/MC/ARM/diagnostics-noneon.s
@@ -0,0 +1,7 @@
+@ RUN: not llvm-mc -triple=armv7-apple-darwin -mattr=-neon < %s 2> %t
+@ RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+        vmov d5, d10
+        vmov q4, q5
+@ CHECK-ERRORS: error: instruction requires: NEON
+@ CHECK-ERRORS: error: instruction requires: NEON
diff --git a/test/MC/ARM/diagnostics.s b/test/MC/ARM/diagnostics.s
index 1aea117..3c26f6d 100644
--- a/test/MC/ARM/diagnostics.s
+++ b/test/MC/ARM/diagnostics.s
@@ -1,5 +1,7 @@
 @ RUN: not llvm-mc -triple=armv7-apple-darwin < %s 2> %t
 @ RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+@ RUN: not llvm-mc -triple=armv8 < %s 2> %t
+@ RUN: FileCheck --check-prefix=CHECK-ERRORS-V8 < %t %s
 
 @ Check for various assembly diagnostic messages on invalid input.
 
@@ -93,6 +95,26 @@
 
 @ CHECK-ERRORS: error: invalid operand for instruction
 
+        @ Out of range immediates for v8 HLT instruction.
+        hlt #65536
+        hlt #-1
+@CHECK-ERRORS-V8: error: invalid operand for instruction
+@CHECK-ERRORS-V8:         hlt #65536
+@CHECK-ERRORS-V8:              ^
+@CHECK-ERRORS-V8: error: invalid operand for instruction
+@CHECK-ERRORS-V8:         hlt #-1
+@CHECK-ERRORS-V8:              ^
+
+        @ Illegal condition code for v8 HLT instruction.
+        hlteq #2
+        hltlt #23
+@CHECK-ERRORS-V8: error: instruction 'hlt' is not predicable, but condition code specified
+@CHECK-ERRORS-V8:        hlteq #2
+@CHECK-ERRORS-V8:        ^
+@CHECK-ERRORS-V8: error: instruction 'hlt' is not predicable, but condition code specified
+@CHECK-ERRORS-V8:        hltlt #23
+@CHECK-ERRORS-V8:        ^
+
         @ Out of range 4 and 3 bit immediates on CDP[2]
 
         @ Out of range immediates for CDP/CDP2
@@ -129,6 +151,11 @@
 @ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
 @ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
 
+        @ p10 and p11 are reserved for NEON
+        mcr p10, #2, r5, c1, c1, #4
+        mcrr p11, #8, r5, r4, c1
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: invalid operand for instruction
 
         @ Out of range immediate for MOV
         movw r9, 0x10000
@@ -407,3 +434,34 @@
 
         bkpteq #7
 @ CHECK-ERRORS: error: instruction 'bkpt' is not predicable, but condition code specified
+
+        ldm r2!, {r2, r3}
+        ldmdb r2!, {r2, r3}
+        ldmda r2!, {r2, r3}
+        popeq {sp}
+@ CHECK-ERRORS: error: writeback register not allowed in register list
+@ CHECK-ERRORS: error: writeback register not allowed in register list
+@ CHECK-ERRORS: error: writeback register not allowed in register list
+@ CHECK-ERRORS: error: writeback register not allowed in register list
+
+        vrintz.f32.f32 s0, s1
+        vrintr.f32 s0, s1
+        vrintx.f64.f64 d2, d5
+        vrintz.f64 d10, d9
+        vrinta.f32.f32 s6, s7
+        vrintn.f32 s8, s9
+        vrintp.f64.f64 d10, d11
+        vrintm.f64 d12, d13
+@ CHECK-ERRORS: error: instruction requires: FPARMv8
+@ CHECK-ERRORS: error: instruction requires: FPARMv8
+@ CHECK-ERRORS: error: instruction requires: FPARMv8
+@ CHECK-ERRORS: error: instruction requires: FPARMv8
+@ CHECK-ERRORS: error: instruction requires: FPARMv8
+@ CHECK-ERRORS: error: instruction requires: FPARMv8
+@ CHECK-ERRORS: error: instruction requires: FPARMv8
+@ CHECK-ERRORS: error: instruction requires: FPARMv8
+
+        stm sp!, {r0, pc}^
+        ldm sp!, {r0}^
+@ CHECK-ERRORS: error: system STM cannot have writeback register
+@ CHECK-ERRORS: error: writeback register only allowed on system LDM if PC in register-list
diff --git a/test/MC/ARM/directive-cpu.s b/test/MC/ARM/directive-cpu.s
new file mode 100644
index 0000000..952dd93
--- /dev/null
+++ b/test/MC/ARM/directive-cpu.s
@@ -0,0 +1,26 @@
+@ RUN: llvm-mc < %s -triple armv7-unknown-linux-gnueabi -filetype=obj -o - \
+@ RUN:   | llvm-readobj -s -sd | FileCheck %s
+
+@ CHECK: Name: .ARM.attribute
+@ CHECK: SectionData (
+
+@ <format-version>
+@ CHECK: 41
+
+@ <section-length>
+@ CHECK: 1A0000 00
+
+@ <vendor-name> "aeabi\0"
+@ CHECK: 616561 626900
+
+@ <file-tag>
+@ CHECK: 01
+
+@ <size>
+@ CHECK: 10000000
+
+	.cpu	cortex-a8
+@ CHECK: 05
+@ CHECK: 434F52 5445582D 413800
+
+@ CHECK: )
diff --git a/test/MC/ARM/directive-eabi_attribute.s b/test/MC/ARM/directive-eabi_attribute.s
new file mode 100644
index 0000000..c060b80
--- /dev/null
+++ b/test/MC/ARM/directive-eabi_attribute.s
@@ -0,0 +1,56 @@
+@ RUN: llvm-mc < %s -triple armv7-unknown-linux-gnueabi -filetype=obj -o - \
+@ RUN:   | llvm-readobj -s -sd | FileCheck %s
+
+@ CHECK: Name: .ARM.attribute
+@ CHECK: SectionData (
+
+@ <format-version>
+@ CHECK: 41
+
+@ <section-length>
+@ CHECK: 250000 00
+
+@ <vendor-name> "aeabi\0"
+@ CHECK: 616561 626900
+
+@ <file-tag>
+@ CHECK: 01
+
+@ <size>
+@ CHECK: 1B000000
+
+@ <attribute>*
+
+	.eabi_attribute 6, 10
+@ CHECK: 060A
+
+	.eabi_attribute 7, 65
+@ CHECK: 0741
+
+	.eabi_attribute 8, 1
+@ CHECK: 0801
+
+	.eabi_attribute 9, 2
+@ CHECK: 0902
+
+	.eabi_attribute 10, 3
+@ CHECK: 0A03
+
+	.eabi_attribute 12, 1
+@ CHECK: 0C01
+
+	.eabi_attribute 20, 1
+@ CHECK: 1401
+
+	.eabi_attribute 21, 1
+@ CHECK: 1501
+
+	.eabi_attribute 23, 3
+@ CHECK: 1703
+
+	.eabi_attribute 24, 1
+@ CHECK: 1801
+
+	.eabi_attribute 25, 1
+@ CHECK: 1901
+@ CHECK: )
diff --git a/test/MC/ARM/directive-fpu-multiple.s b/test/MC/ARM/directive-fpu-multiple.s
new file mode 100644
index 0000000..6a93f24
--- /dev/null
+++ b/test/MC/ARM/directive-fpu-multiple.s
@@ -0,0 +1,26 @@
+@ Check multiple .fpu directives.
+
+@ The later .fpu directive should overwrite the earlier one.
+@ See also: directive-fpu-multiple2.s.
+
+@ RUN: llvm-mc < %s -triple arm-unknown-linux-gnueabi -filetype=obj \
+@ RUN:   | llvm-readobj -s -sd | FileCheck %s
+
+	.fpu neon
+	.fpu vfpv4
+
+@ CHECK:      Name: .ARM.attributes
+@ CHECK-NEXT: Type: SHT_ARM_ATTRIBUTES (0x70000003)
+@ CHECK-NEXT: Flags [ (0x0)
+@ CHECK-NEXT: ]
+@ CHECK-NEXT: Address: 0x0
+@ CHECK-NEXT: Offset: 0x34
+@ CHECK-NEXT: Size: 18
+@ CHECK-NEXT: Link: 0
+@ CHECK-NEXT: Info: 0
+@ CHECK-NEXT: AddressAlignment: 1
+@ CHECK-NEXT: EntrySize: 0
+@ CHECK-NEXT: SectionData (
+@ CHECK-NEXT:   0000: 41110000 00616561 62690001 07000000
+@ CHECK-NEXT:   0010: 0A05
+@ CHECK-NEXT: )
diff --git a/test/MC/ARM/directive-fpu.s b/test/MC/ARM/directive-fpu.s
new file mode 100644
index 0000000..24e159c
--- /dev/null
+++ b/test/MC/ARM/directive-fpu.s
@@ -0,0 +1,26 @@
+@ RUN: llvm-mc < %s -triple armv7-unknown-linux-gnueabi -filetype=obj -o - \
+@ RUN:   | llvm-readobj -s -sd | FileCheck %s
+
+@ CHECK: Name: .ARM.attribute
+@ CHECK: SectionData (
+
+@ <format-version>
+@ CHECK: 41
+
+@ <section-length>
+@ CHECK: 130000 00
+
+@ <vendor-name> "aeabi\0"
+@ CHECK: 616561 626900
+
+@ <file-tag>
+@ CHECK: 01
+
+@ <size>
+@ CHECK: 09000000
+
+	.fpu	neon
+@ CHECK: 0A03
+@ CHECK: 0C01
+
+@ CHECK: )
diff --git a/test/MC/ARM/elf-eflags-eabi-cg.ll b/test/MC/ARM/elf-eflags-eabi-cg.ll
deleted file mode 100644
index 0b9de7f..0000000
--- a/test/MC/ARM/elf-eflags-eabi-cg.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; Codegen version to check for ELF header flags.
-;
-; RUN: llc %s -mtriple=thumbv7-linux-gnueabi -relocation-model=pic \
-; RUN: -filetype=obj -o - | llvm-readobj -h | \
-; RUN: FileCheck %s
-
-define void @bar() nounwind {
-entry:
-  ret void
-}
-
-; For now the only e_flag set is EF_ARM_EABI_VER5
-; CHECK: ElfHeader {
-; CHECK:   Flags [ (0x5000000)
diff --git a/test/MC/ARM/elf-thumbfunc.s b/test/MC/ARM/elf-thumbfunc.s
index 26f5f0b..0ea1182 100644
--- a/test/MC/ARM/elf-thumbfunc.s
+++ b/test/MC/ARM/elf-thumbfunc.s
@@ -5,9 +5,9 @@
 	.text
 	.globl	foo
 	.align	2
-	.type	foo,%function
 	.code	16
 	.thumb_func
+	.type	foo,%function
 foo:
 	bx	lr
 
diff --git a/test/MC/ARM/fp-armv8.s b/test/MC/ARM/fp-armv8.s
new file mode 100644
index 0000000..1ffd590
--- /dev/null
+++ b/test/MC/ARM/fp-armv8.s
@@ -0,0 +1,129 @@
+@ RUN: llvm-mc -triple armv8 -mattr=+fp-armv8 -show-encoding < %s | FileCheck %s
+
+@ VCVT{B,T}
+
+  vcvtt.f64.f16 d3, s1
+@ CHECK: vcvtt.f64.f16 d3, s1      @ encoding: [0xe0,0x3b,0xb2,0xee]
+  vcvtt.f16.f64 s5, d12
+@ CHECK: vcvtt.f16.f64 s5, d12     @ encoding: [0xcc,0x2b,0xf3,0xee]
+
+  vcvtb.f64.f16 d3, s1
+@ CHECK: vcvtb.f64.f16 d3, s1     @ encoding: [0x60,0x3b,0xb2,0xee]
+  vcvtb.f16.f64 s4, d1
+@ CHECK: vcvtb.f16.f64 s4, d1     @ encoding: [0x41,0x2b,0xb3,0xee]
+
+  vcvttge.f64.f16 d3, s1
+@ CHECK: vcvttge.f64.f16 d3, s1      @ encoding: [0xe0,0x3b,0xb2,0xae]
+  vcvttgt.f16.f64 s5, d12
+@ CHECK: vcvttgt.f16.f64 s5, d12     @ encoding: [0xcc,0x2b,0xf3,0xce]
+
+  vcvtbeq.f64.f16 d3, s1
+@ CHECK: vcvtbeq.f64.f16 d3, s1     @ encoding: [0x60,0x3b,0xb2,0x0e]
+  vcvtblt.f16.f64 s4, d1
+@ CHECK: vcvtblt.f16.f64 s4, d1     @ encoding: [0x41,0x2b,0xb3,0xbe]
+
+
+@ VCVT{A,N,P,M}
+
+  vcvta.s32.f32 s2, s3
+@ CHECK: vcvta.s32.f32 s2, s3     @ encoding: [0xe1,0x1a,0xbc,0xfe]
+  vcvta.s32.f64 s2, d3
+@ CHECK: vcvta.s32.f64 s2, d3     @ encoding: [0xc3,0x1b,0xbc,0xfe]
+  vcvtn.s32.f32 s6, s23
+@ CHECK: vcvtn.s32.f32 s6, s23     @ encoding: [0xeb,0x3a,0xbd,0xfe]
+  vcvtn.s32.f64 s6, d23
+@ CHECK: vcvtn.s32.f64 s6, d23     @ encoding: [0xe7,0x3b,0xbd,0xfe]
+  vcvtp.s32.f32 s0, s4
+@ CHECK: vcvtp.s32.f32 s0, s4     @ encoding: [0xc2,0x0a,0xbe,0xfe]
+  vcvtp.s32.f64 s0, d4
+@ CHECK: vcvtp.s32.f64 s0, d4     @ encoding: [0xc4,0x0b,0xbe,0xfe]
+  vcvtm.s32.f32 s17, s8
+@ CHECK: vcvtm.s32.f32 s17, s8     @ encoding: [0xc4,0x8a,0xff,0xfe]
+  vcvtm.s32.f64 s17, d8
+@ CHECK: vcvtm.s32.f64 s17, d8     @ encoding: [0xc8,0x8b,0xff,0xfe]
+
+  vcvta.u32.f32 s2, s3
+@ CHECK: vcvta.u32.f32 s2, s3     @ encoding: [0x61,0x1a,0xbc,0xfe]
+  vcvta.u32.f64 s2, d3
+@ CHECK: vcvta.u32.f64 s2, d3     @ encoding: [0x43,0x1b,0xbc,0xfe]
+  vcvtn.u32.f32 s6, s23
+@ CHECK: vcvtn.u32.f32 s6, s23     @ encoding: [0x6b,0x3a,0xbd,0xfe]
+  vcvtn.u32.f64 s6, d23
+@ CHECK: vcvtn.u32.f64 s6, d23     @ encoding: [0x67,0x3b,0xbd,0xfe]
+  vcvtp.u32.f32 s0, s4
+@ CHECK: vcvtp.u32.f32 s0, s4     @ encoding: [0x42,0x0a,0xbe,0xfe]
+  vcvtp.u32.f64 s0, d4
+@ CHECK: vcvtp.u32.f64 s0, d4     @ encoding: [0x44,0x0b,0xbe,0xfe]
+  vcvtm.u32.f32 s17, s8
+@ CHECK: vcvtm.u32.f32 s17, s8     @ encoding: [0x44,0x8a,0xff,0xfe]
+  vcvtm.u32.f64 s17, d8
+@ CHECK: vcvtm.u32.f64 s17, d8     @ encoding: [0x48,0x8b,0xff,0xfe]
+
+
+@ VSEL
+  vselge.f32 s4, s1, s23
+@ CHECK: vselge.f32 s4, s1, s23    @ encoding: [0xab,0x2a,0x20,0xfe]
+  vselge.f64 d30, d31, d23
+@ CHECK: vselge.f64 d30, d31, d23  @ encoding: [0xa7,0xeb,0x6f,0xfe]
+  vselgt.f32 s0, s1, s0
+@ CHECK: vselgt.f32 s0, s1, s0    @ encoding: [0x80,0x0a,0x30,0xfe]
+  vselgt.f64 d5, d10, d20
+@ CHECK: vselgt.f64 d5, d10, d20  @ encoding: [0x24,0x5b,0x3a,0xfe]
+  vseleq.f32 s30, s28, s23
+@ CHECK: vseleq.f32 s30, s28, s23 @ encoding: [0x2b,0xfa,0x0e,0xfe]
+  vseleq.f64 d2, d4, d8
+@ CHECK: vseleq.f64 d2, d4, d8    @ encoding: [0x08,0x2b,0x04,0xfe]
+  vselvs.f32 s21, s16, s14
+@ CHECK: vselvs.f32 s21, s16, s14 @ encoding: [0x07,0xaa,0x58,0xfe]
+  vselvs.f64 d0, d1, d31
+@ CHECK: vselvs.f64 d0, d1, d31   @ encoding: [0x2f,0x0b,0x11,0xfe]
+
+
+@ VMAXNM / VMINNM
+  vmaxnm.f32 s5, s12, s0
+@ CHECK: vmaxnm.f32 s5, s12, s0    @ encoding: [0x00,0x2a,0xc6,0xfe]
+  vmaxnm.f64 d5, d22, d30
+@ CHECK: vmaxnm.f64 d5, d22, d30   @ encoding: [0xae,0x5b,0x86,0xfe]
+  vminnm.f32 s0, s0, s12
+@ CHECK: vminnm.f32 s0, s0, s12    @ encoding: [0x46,0x0a,0x80,0xfe]
+  vminnm.f64 d4, d6, d9
+@ CHECK: vminnm.f64 d4, d6, d9     @ encoding: [0x49,0x4b,0x86,0xfe]
+
+@ VRINT{Z,R,X}
+
+  vrintzge.f64 d3, d12
+@ CHECK: vrintzge.f64 d3, d12   @ encoding: [0xcc,0x3b,0xb6,0xae]
+  vrintz.f32 s3, s24
+@ CHECK: vrintz.f32 s3, s24     @ encoding: [0xcc,0x1a,0xf6,0xee]
+  vrintrlt.f64 d5, d0
+@ CHECK: vrintrlt.f64 d5, d0    @ encoding: [0x40,0x5b,0xb6,0xbe]
+  vrintr.f32 s0, s9
+@ CHECK: vrintr.f32 s0, s9      @ encoding: [0x64,0x0a,0xb6,0xee]
+  vrintxeq.f64 d28, d30
+@ CHECK: vrintxeq.f64 d28, d30  @ encoding: [0x6e,0xcb,0xf7,0x0e]
+  vrintxvs.f32 s10, s14
+@ CHECK: vrintxvs.f32 s10, s14  @ encoding: [0x47,0x5a,0xb7,0x6e]
+
+@ VRINT{A,N,P,M}
+
+  vrinta.f64 d3, d4
+@ CHECK: vrinta.f64 d3, d4     @ encoding: [0x44,0x3b,0xb8,0xfe]
+  vrinta.f32 s12, s1
+@ CHECK: vrinta.f32 s12, s1    @ encoding: [0x60,0x6a,0xb8,0xfe]
+  vrintn.f64 d3, d4
+@ CHECK: vrintn.f64 d3, d4     @ encoding: [0x44,0x3b,0xb9,0xfe]
+  vrintn.f32 s12, s1
+@ CHECK: vrintn.f32 s12, s1    @ encoding: [0x60,0x6a,0xb9,0xfe]
+  vrintp.f64 d3, d4
+@ CHECK: vrintp.f64 d3, d4     @ encoding: [0x44,0x3b,0xba,0xfe]
+  vrintp.f32 s12, s1
+@ CHECK: vrintp.f32 s12, s1    @ encoding: [0x60,0x6a,0xba,0xfe]
+  vrintm.f64 d3, d4
+@ CHECK: vrintm.f64 d3, d4     @ encoding: [0x44,0x3b,0xbb,0xfe]
+  vrintm.f32 s12, s1
+@ CHECK: vrintm.f32 s12, s1    @ encoding: [0x60,0x6a,0xbb,0xfe]
+
+@ MVFR2
+
+  vmrs sp, mvfr2
+@ CHECK: vmrs sp, mvfr2        @ encoding: [0x10,0xda,0xf5,0xee]
diff --git a/test/MC/ARM/idiv.s b/test/MC/ARM/idiv.s
new file mode 100644
index 0000000..6238a0f
--- /dev/null
+++ b/test/MC/ARM/idiv.s
@@ -0,0 +1,33 @@
+@ RUN: llvm-mc -triple=armv7 -mcpu=cortex-a15 -show-encoding < %s 2>&1 | FileCheck -check-prefix A15-ARM %s
+@ RUN: llvm-mc -triple=thumbv7 -mcpu=cortex-a15 -show-encoding < %s 2>&1 | FileCheck -check-prefix A15-THUMB %s
+
+@ RUN: llvm-mc -triple=armv7 -mcpu=cortex-a15 -mattr=-hwdiv -show-encoding < %s 2>&1 | FileCheck -check-prefix A15-ARM-NOTHUMBHWDIV %s
+@ RUN: llvm-mc -triple=thumbv7 -mcpu=cortex-a15 -mattr=-hwdiv-arm -show-encoding < %s 2>&1 | FileCheck -check-prefix A15-THUMB-NOARMHWDIV %s
+
+@ RUN: llvm-mc -triple=armv8 -show-encoding < %s 2>&1 | FileCheck -check-prefix ARMV8 %s
+@ RUN: llvm-mc -triple=thumbv8 -show-encoding < %s 2>&1 | FileCheck -check-prefix THUMBV8 %s
+
+@ RUN: llvm-mc -triple=armv8 -mattr=-hwdiv -show-encoding < %s 2>&1 | FileCheck -check-prefix ARMV8-NOTHUMBHWDIV %s
+@ RUN: llvm-mc -triple=thumbv8 -mattr=-hwdiv-arm -show-encoding < %s 2>&1 | FileCheck -check-prefix THUMBV8-NOTHUMBHWDIV %s
+
+        sdiv  r1, r2, r3
+        udiv  r3, r4, r5
+@ A15-ARM:              sdiv   r1, r2, r3               @ encoding: [0x12,0xf3,0x11,0xe7]
+@ A15-ARM:              udiv   r3, r4, r5               @ encoding: [0x14,0xf5,0x33,0xe7]
+@ A15-THUMB:            sdiv   r1, r2, r3               @ encoding: [0x92,0xfb,0xf3,0xf1]
+@ A15-THUMB:            udiv   r3, r4, r5               @ encoding: [0xb4,0xfb,0xf5,0xf3]
+
+@ A15-ARM-NOTHUMBHWDIV: sdiv    r1, r2, r3              @ encoding: [0x12,0xf3,0x11,0xe7]
+@ A15-ARM-NOTHUMBHWDIV: udiv    r3, r4, r5              @ encoding: [0x14,0xf5,0x33,0xe7]
+@ A15-THUMB-NOARMHWDIV: sdiv    r1, r2, r3              @ encoding: [0x92,0xfb,0xf3,0xf1]
+@ A15-THUMB-NOARMHWDIV: udiv    r3, r4, r5              @ encoding: [0xb4,0xfb,0xf5,0xf3]
+
+@ ARMV8:                sdiv    r1, r2, r3              @ encoding: [0x12,0xf3,0x11,0xe7]
+@ ARMV8:                udiv    r3, r4, r5              @ encoding: [0x14,0xf5,0x33,0xe7]
+@ THUMBV8:              sdiv    r1, r2, r3              @ encoding: [0x92,0xfb,0xf3,0xf1]
+@ THUMBV8:              udiv    r3, r4, r5              @ encoding: [0xb4,0xfb,0xf5,0xf3]
+
+@ ARMV8-NOTHUMBHWDIV:   sdiv    r1, r2, r3              @ encoding: [0x12,0xf3,0x11,0xe7]
+@ ARMV8-NOTHUMBHWDIV:   udiv    r3, r4, r5              @ encoding: [0x14,0xf5,0x33,0xe7]
+@ THUMBV8-NOTHUMBHWDIV: sdiv    r1, r2, r3              @ encoding: [0x92,0xfb,0xf3,0xf1]
+@ THUMBV8-NOTHUMBHWDIV: udiv    r3, r4, r5              @ encoding: [0xb4,0xfb,0xf5,0xf3]
diff --git a/test/MC/ARM/invalid-barrier.s b/test/MC/ARM/invalid-barrier.s
new file mode 100644
index 0000000..29fcd8e
--- /dev/null
+++ b/test/MC/ARM/invalid-barrier.s
@@ -0,0 +1,28 @@
+@ RUN: not llvm-mc -triple=armv7   -show-encoding < %s 2>&1 | FileCheck %s
+@ RUN: not llvm-mc -triple=thumbv7 -show-encoding < %s 2>&1 | FileCheck %s
+
+@------------------------------------------------------------------------------
+@ DMB
+@------------------------------------------------------------------------------
+        dmb #0x10
+        dmb imaginary_scope
+
+@ CHECK: error: immediate value out of range
+@ CHECK: error: invalid operand for instruction
+
+@------------------------------------------------------------------------------
+@ DSB
+@------------------------------------------------------------------------------
+        dsb #0x10
+        dsb imaginary_scope
+@ CHECK: error: immediate value out of range
+@ CHECK: error: invalid operand for instruction
+
+@------------------------------------------------------------------------------
+@ ISB
+@------------------------------------------------------------------------------
+        isb #0x1f
+        isb imaginary_domain
+
+@ CHECK: error: immediate value out of range
+@ CHECK: error: invalid operand for instruction
diff --git a/test/MC/ARM/invalid-crc32.s b/test/MC/ARM/invalid-crc32.s
new file mode 100644
index 0000000..a541002
--- /dev/null
+++ b/test/MC/ARM/invalid-crc32.s
@@ -0,0 +1,16 @@
+@ RUN: not llvm-mc -triple=armv8 -show-encoding < %s 2>&1 | FileCheck %s
+@ RUN: not llvm-mc -triple=thumbv8 -show-encoding < %s 2>&1 | FileCheck %s
+
+        crc32cbeq  r0, r1, r2
+        crc32bne   r0, r1, r2
+        crc32chcc  r0, r1, r2
+        crc32hpl   r0, r1, r2
+        crc32cwgt  r0, r1, r2
+        crc32wle   r0, r1, r2
+
+@ CHECK: error: instruction 'crc32cb' is not predicable, but condition code specified
+@ CHECK: error: instruction 'crc32b' is not predicable, but condition code specified
+@ CHECK: error: instruction 'crc32ch' is not predicable, but condition code specified
+@ CHECK: error: instruction 'crc32h' is not predicable, but condition code specified
+@ CHECK: error: instruction 'crc32cw' is not predicable, but condition code specified
+@ CHECK: error: instruction 'crc32w' is not predicable, but condition code specified
diff --git a/test/MC/ARM/invalid-fp-armv8.s b/test/MC/ARM/invalid-fp-armv8.s
new file mode 100644
index 0000000..21adb7e
--- /dev/null
+++ b/test/MC/ARM/invalid-fp-armv8.s
@@ -0,0 +1,89 @@
+@ RUN: not llvm-mc -triple armv8 -show-encoding -mattr=-neon < %s 2>&1 | FileCheck %s --check-prefix=V8
+
+@ VCVT{B,T}
+
+  vcvtt.f64.f16 d3, s1
+@ V7-NOT: vcvtt.f64.f16 d3, s1      @ encoding: [0xe0,0x3b,0xb2,0xee]
+  vcvtt.f16.f64 s5, d12
+@ V7-NOT: vcvtt.f16.f64 s5, d12     @ encoding: [0xcc,0x2b,0xf3,0xee]
+
+  vsel.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vselne.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vselmi.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vselpl.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vselvc.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vselcs.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vselcc.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vselhs.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vsello.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vselhi.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vsells.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vsellt.f32 s3, s4, s6
+@ V8: error: invalid instruction
+  vselle.f32 s3, s4, s6
+@ V8: error: invalid instruction
+
+vseleq.f32 s0, d2, d1
+@ V8: error: invalid operand for instruction
+vselgt.f64 s3, s2, s1
+@ V8: error: invalid operand for instruction
+vselgt.f32 s0, q3, q1
+@ V8: error: invalid operand for instruction
+vselgt.f64 q0, s3, q1
+@ V8: error: invalid operand for instruction
+
+vmaxnm.f32 s0, d2, d1
+@ V8: error: invalid operand for instruction
+vminnm.f64 s3, s2, s1
+@ V8: error: invalid operand for instruction
+vmaxnm.f32 s0, q3, q1
+@ V8: error: invalid operand for instruction
+vmaxnm.f64 q0, s3, q1
+@ V8: error: invalid operand for instruction
+vmaxnmgt.f64 q0, s3, q1
+@ CHECK: error: instruction 'vmaxnm' is not predicable, but condition code specified
+
+vcvta.s32.f64 d3, s2
+@ V8: error: invalid operand for instruction
+vcvtp.s32.f32 d3, s2
+@ V8: error: invalid operand for instruction
+vcvtn.u32.f64 d3, s2
+@ V8: error: invalid operand for instruction
+vcvtm.u32.f32 d3, s2
+@ V8: error: invalid operand for instruction
+vcvtnge.u32.f64 d3, s2
+@ V8: error: instruction 'vcvtn' is not predicable, but condition code specified
+
+vcvtbgt.f64.f16 q0, d3
+@ V8: error: invalid operand for instruction
+vcvttlt.f64.f16 s0, s3
+@ V8: error: invalid operand for instruction
+vcvttvs.f16.f64 s0, s3
+@ V8: error: invalid operand for instruction
+vcvtthi.f16.f64 q0, d3
+@ V8: error: invalid operand for instruction
+
+vrintrlo.f32.f32 d3, q0
+@ V8: error: invalid operand for instruction
+vrintxcs.f32.f32 d3, d0
+@ V8: error: instruction requires: NEON
+
+vrinta.f64.f64 s3, q0
+@ V8: error: invalid operand for instruction
+vrintn.f32.f32 d3, d0
+@ V8: error: instruction requires: NEON
+vrintp.f32 q3, q0
+@ V8: error: instruction requires: NEON
+vrintmlt.f32 q3, q0
+@ V8: error: instruction 'vrintm' is not predicable, but condition code specified
diff --git a/test/MC/ARM/invalid-hint-arm.s b/test/MC/ARM/invalid-hint-arm.s
index 3608e95..49a2e5c 100644
--- a/test/MC/ARM/invalid-hint-arm.s
+++ b/test/MC/ARM/invalid-hint-arm.s
@@ -1,7 +1,8 @@
 @ RUN: not llvm-mc -triple=armv7-apple-darwin -mcpu=cortex-a8 < %s 2>&1 | FileCheck %s
 
-hint #5
-hint #100
+hint #240
+hint #1000
+
+@ CHECK: error: immediate operand must be in the range [0,239]
+@ CHECK: error: immediate operand must be in the range [0,239]
 
-@ CHECK: error: immediate operand must be in the range [0,4]
-@ CHECK: error: immediate operand must be in the range [0,4]
diff --git a/test/MC/ARM/invalid-hint-thumb.s b/test/MC/ARM/invalid-hint-thumb.s
index bde987c..d2b50c4 100644
--- a/test/MC/ARM/invalid-hint-thumb.s
+++ b/test/MC/ARM/invalid-hint-thumb.s
@@ -1,9 +1,8 @@
 @ RUN: not llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 < %s 2>&1 | FileCheck %s
 
-hint #5
-hint.w #5
-hint #100
+hint #240
+hint #1000
+
+@ CHECK: error: immediate operand must be in the range [0,239]
+@ CHECK: error: immediate operand must be in the range [0,239]
 
-@ CHECK: error: immediate operand must be in the range [0,4]
-@ CHECK: error: immediate operand must be in the range [0,4]
-@ CHECK: error: immediate operand must be in the range [0,4]
diff --git a/test/MC/ARM/invalid-idiv.s b/test/MC/ARM/invalid-idiv.s
new file mode 100644
index 0000000..a84e66a
--- /dev/null
+++ b/test/MC/ARM/invalid-idiv.s
@@ -0,0 +1,28 @@
+@ RUN: not llvm-mc -triple=armv7 -mcpu=cortex-a15 -mattr=-hwdiv-arm < %s 2> %t
+@ RUN: FileCheck --check-prefix=ARM-A15 < %t %s
+@ RUN: not llvm-mc -triple=thumbv7 -mcpu=cortex-a15 -mattr=-hwdiv < %s 2> %t
+@ RUN: FileCheck --check-prefix=THUMB-A15 < %t %s
+@ RUN: not llvm-mc -triple=armv7 < %s 2> %t
+@ RUN: FileCheck --check-prefix=ARM < %t %s
+@ RUN: not llvm-mc -triple=thumbv7 < %s 2> %t
+@ RUN: FileCheck --check-prefix=THUMB < %t %s
+
+        sdiv  r1, r2, r3
+        udiv  r3, r4, r5
+@ ARM-A15: error: instruction requires: divide in ARM
+@ ARM-A15: sdiv r1, r2, r3
+@ ARM-A15: error: instruction requires: divide in ARM
+@ ARM-A15: udiv r3, r4, r5
+@ THUMB-A15: error: instruction requires: arm-mode
+@ THUMB-A15: sdiv r1, r2, r3
+@ THUMB-A15: error: instruction requires: arm-mode
+@ THUMB-A15: udiv r3, r4, r5
+
+@ ARM: error: instruction requires: divide in ARM
+@ ARM: sdiv r1, r2, r3
+@ ARM: error: instruction requires: divide in ARM
+@ ARM: udiv r3, r4, r5
+@ THUMB: error: instruction requires: divide in THUMB
+@ THUMB: sdiv r1, r2, r3
+@ THUMB: error: instruction requires: divide in THUMB
+@ THUMB: udiv r3, r4, r5
diff --git a/test/MC/ARM/invalid-neon-v8.s b/test/MC/ARM/invalid-neon-v8.s
new file mode 100644
index 0000000..361946d
--- /dev/null
+++ b/test/MC/ARM/invalid-neon-v8.s
@@ -0,0 +1,70 @@
+@ RUN: not llvm-mc -triple armv8 -mattr=-fp-armv8 -show-encoding < %s 2>&1 | FileCheck %s
+
+vmaxnm.f32 s4, d5, q1
+@ CHECK: error: invalid operand for instruction
+vmaxnm.f64.f64 s4, d5, q1
+@ CHECK: error: invalid operand for instruction
+vmaxnmge.f64.f64 s4, d5, q1
+@ CHECK: error: instruction 'vmaxnm' is not predicable, but condition code specified
+
+vcvta.s32.f32 s1, s2
+@ CHECK: error: instruction requires: FPARMv8
+vcvtp.u32.f32 s1, d2
+@ CHECK: error: invalid operand for instruction
+vcvtp.f32.u32 d1, q2
+@ CHECK: error: invalid operand for instruction
+vcvtplo.f32.u32 s1, s2
+@ CHECK: error: instruction 'vcvtp' is not predicable, but condition code specified
+
+vrinta.f64.f64 s3, d12
+@ CHECK: error: invalid operand for instruction
+vrintn.f32 d3, q12
+@ CHECK: error: invalid operand for instruction
+vrintz.f32 d3, q12
+@ CHECK: error: invalid operand for instruction
+vrintmge.f32.f32 d3, d4
+@ CHECK: error: instruction 'vrintm' is not predicable, but condition code specified
+
+aesd.8  q0, s1
+@ CHECK: error: invalid operand for instruction
+aese.8  s0, q1
+@ CHECK: error: invalid operand for instruction
+aesimc.8  s0, q1
+@ CHECK: error: invalid operand for instruction
+aesmc.8  q0, d1
+@ CHECK: error: invalid operand for instruction
+aesdge.8 q0, q1
+@ CHECK: error: instruction 'aesd' is not predicable, but condition code specified
+
+sha1h.32  d0, q1
+@ CHECK: error: invalid operand for instruction
+sha1su1.32  q0, s1
+@ CHECK: error: invalid operand for instruction
+sha256su0.32  s0, q1
+@ CHECK: error: invalid operand for instruction
+sha1heq.32  q0, q1
+@ CHECK: error: instruction 'sha1h' is not predicable, but condition code specified
+
+sha1c.32  s0, d1, q2
+@ CHECK: error: invalid operand for instruction
+sha1m.32  q0, s1, q2
+@ CHECK: error: invalid operand for instruction
+sha1p.32  s0, q1, q2
+@ CHECK: error: invalid operand for instruction
+sha1su0.32  d0, q1, q2
+@ CHECK: error: invalid operand for instruction
+sha256h.32  q0, s1, q2
+@ CHECK: error: invalid operand for instruction
+sha256h2.32  q0, q1, s2
+@ CHECK: error: invalid operand for instruction
+sha256su1.32  s0, d1, q2
+@ CHECK: error: invalid operand for instruction
+sha256su1lt.32  q0, d1, q2
+@ CHECK: error: instruction 'sha256su1' is not predicable, but condition code specified
+
+vmull.p64 q0, s1, s3
+@ CHECK: error: invalid operand for instruction
+vmull.p64 s1, d2, d3
+@ CHECK: error: invalid operand for instruction
+vmullge.p64 q0, d16, d17
+@ CHECK: error: instruction 'vmull' is not predicable, but condition code specified
diff --git a/test/MC/ARM/invalid-v8fp.s b/test/MC/ARM/invalid-v8fp.s
deleted file mode 100644
index 4dff188..0000000
--- a/test/MC/ARM/invalid-v8fp.s
+++ /dev/null
@@ -1,10 +0,0 @@
-@ RUN: not llvm-mc -triple armv7 -show-encoding < %s | FileCheck %s
-
-@ VCVT{B,T}
-
-  vcvtt.f64.f16 d3, s1
-@ CHECK-NOT: vcvtt.f64.f16 d3, s1      @ encoding: [0xe0,0x3b,0xb2,0xee]
-  vcvtt.f16.f64 s5, d12
-@ CHECK-NOT: vcvtt.f16.f64 s5, d12     @ encoding: [0xcc,0x2b,0xf3,0xee]
-
-
diff --git a/test/MC/ARM/lit.local.cfg b/test/MC/ARM/lit.local.cfg
index 5700913..8a3ba96 100644
--- a/test/MC/ARM/lit.local.cfg
+++ b/test/MC/ARM/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'ARM' in targets:
     config.unsupported = True
diff --git a/test/MC/ARM/load-store-acquire-release-v8-thumb.s b/test/MC/ARM/load-store-acquire-release-v8-thumb.s
new file mode 100644
index 0000000..e34a263
--- /dev/null
+++ b/test/MC/ARM/load-store-acquire-release-v8-thumb.s
@@ -0,0 +1,48 @@
+@ RUN: llvm-mc -triple=thumbv8 -show-encoding < %s | FileCheck %s
+@ RUN: not llvm-mc -triple=thumbv7 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V7
+        ldaexb  r3, [r4]
+        ldaexh  r2, [r5]
+        ldaex  r1, [r7]
+        ldaexd  r6, r7, [r8]
+
+@ CHECK:  ldaexb	r3, [r4]                @ encoding: [0xd4,0xe8,0xcf,0x3f]
+@ CHECK:  ldaexh	r2, [r5]                @ encoding: [0xd5,0xe8,0xdf,0x2f]
+@ CHECK:  ldaex	r1, [r7]                @ encoding: [0xd7,0xe8,0xef,0x1f]
+@ CHECK:  ldaexd	r6, r7, [r8]            @ encoding: [0xd8,0xe8,0xff,0x67]
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+
+        stlexb  r1, r3, [r4]
+        stlexh  r4, r2, [r5]
+        stlex  r2, r1, [r7]
+        stlexd  r6, r2, r3, [r8]
+@ CHECK: stlexb r1, r3, [r4]            @ encoding: [0xc4,0xe8,0xc1,0x3f]
+@ CHECK: stlexh r4, r2, [r5]            @ encoding: [0xc5,0xe8,0xd4,0x2f]
+@ CHECK: stlex r2, r1, [r7]            @ encoding: [0xc7,0xe8,0xe2,0x1f]
+@ CHECK: stlexd r6, r2, r3, [r8]        @ encoding: [0xc8,0xe8,0xf6,0x23]
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+
+         lda r5, [r6]
+         ldab r5, [r6]
+         ldah r12, [r9]
+@ CHECK: lda r5, [r6]                   @ encoding: [0xd6,0xe8,0xaf,0x5f]
+@ CHECK: ldab r5, [r6]                  @ encoding: [0xd6,0xe8,0x8f,0x5f]
+@ CHECK: ldah r12, [r9]                 @ encoding: [0xd9,0xe8,0x9f,0xcf]
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+
+         stl r3, [r0]
+         stlb r2, [r1]
+         stlh r2, [r3]
+@ CHECK: stl r3, [r0]                   @ encoding: [0xc0,0xe8,0xaf,0x3f]
+@ CHECK: stlb r2, [r1]                  @ encoding: [0xc1,0xe8,0x8f,0x2f]
+@ CHECK: stlh r2, [r3]                  @ encoding: [0xc3,0xe8,0x9f,0x2f]
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
+@ CHECK-V7: error: instruction requires: armv8
diff --git a/test/MC/ARM/load-store-acquire-release-v8.s b/test/MC/ARM/load-store-acquire-release-v8.s
new file mode 100644
index 0000000..bc55364
--- /dev/null
+++ b/test/MC/ARM/load-store-acquire-release-v8.s
@@ -0,0 +1,48 @@
+@ RUN: llvm-mc -triple=armv8 -show-encoding < %s | FileCheck %s
+@ RUN: not llvm-mc -triple=armv7 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V7
+        ldaexb  r3, [r4]
+        ldaexh  r2, [r5]
+        ldaex  r1, [r7]
+        ldaexd  r6, r7, [r8]
+
+@ CHECK: ldaexb r3, [r4]                @ encoding: [0x9f,0x3e,0xd4,0xe1]
+@ CHECK: ldaexh r2, [r5]                @ encoding: [0x9f,0x2e,0xf5,0xe1]
+@ CHECK: ldaex r1, [r7]                @ encoding: [0x9f,0x1e,0x97,0xe1]
+@ CHECK: ldaexd r6, r7, [r8]            @ encoding: [0x9f,0x6e,0xb8,0xe1]
+@ CHECK-V7: instruction requires: armv8
+@ CHECK-V7: instruction requires: armv8
+@ CHECK-V7: instruction requires: armv8
+@ CHECK-V7: instruction requires: armv8
+
+        stlexb  r1, r3, [r4]
+        stlexh  r4, r2, [r5]
+        stlex  r2, r1, [r7]
+        stlexd  r6, r2, r3, [r8]
+@ CHECK: stlexb r1, r3, [r4]            @ encoding: [0x93,0x1e,0xc4,0xe1]
+@ CHECK: stlexh r4, r2, [r5]            @ encoding: [0x92,0x4e,0xe5,0xe1]
+@ CHECK: stlex r2, r1, [r7]            @ encoding: [0x91,0x2e,0x87,0xe1]
+@ CHECK: stlexd r6, r2, r3, [r8]        @ encoding: [0x92,0x6e,0xa8,0xe1]
+@ CHECK-V7: instruction requires: armv8
+@ CHECK-V7: instruction requires: armv8
+@ CHECK-V7: instruction requires: armv8
+@ CHECK-V7: instruction requires: armv8
+
+         lda r5, [r6]
+         ldab r5, [r6]
+         ldah r12, [r9]
+@ CHECK: lda r5, [r6]                   @ encoding: [0x9f,0x5c,0x96,0xe1]
+@ CHECK: ldab r5, [r6]                  @ encoding: [0x9f,0x5c,0xd6,0xe1]
+@ CHECK: ldah r12, [r9]                 @ encoding: [0x9f,0xcc,0xf9,0xe1]
+@ CHECK-V7: instruction requires: armv8
+@ CHECK-V7: instruction requires: armv8
+@ CHECK-V7: instruction requires: armv8
+
+         stl r3, [r0]
+         stlb r2, [r1]
+         stlh r2, [r3]
+@ CHECK: stl r3, [r0]                   @ encoding: [0x93,0xfc,0x80,0xe1]
+@ CHECK: stlb r2, [r1]                  @ encoding: [0x92,0xfc,0xc1,0xe1]
+@ CHECK: stlh r2, [r3]                  @ encoding: [0x92,0xfc,0xe3,0xe1]
+@ CHECK-V7: instruction requires: armv8
+@ CHECK-V7: instruction requires: armv8
+@ CHECK-V7: instruction requires: armv8
diff --git a/test/MC/ARM/neon-convert-encoding.s b/test/MC/ARM/neon-convert-encoding.s
index 1733c52..20c7895 100644
--- a/test/MC/ARM/neon-convert-encoding.s
+++ b/test/MC/ARM/neon-convert-encoding.s
@@ -18,20 +18,36 @@
 	vcvt.f32.u32	q8, q8
 @ CHECK: vcvt.s32.f32	d16, d16, #1    @ encoding: [0x30,0x0f,0xff,0xf2]
 	vcvt.s32.f32	d16, d16, #1
+@ CHECK: vcvt.s32.f32	d16, d16        @ encoding: [0x20,0x07,0xfb,0xf3]
+	vcvt.s32.f32	d16, d16, #0
 @ CHECK: vcvt.u32.f32	d16, d16, #1    @ encoding: [0x30,0x0f,0xff,0xf3]
 	vcvt.u32.f32	d16, d16, #1
+@ CHECK: vcvt.u32.f32	d16, d16        @ encoding: [0xa0,0x07,0xfb,0xf3]
+	vcvt.u32.f32	d16, d16, #0
 @ CHECK: vcvt.f32.s32	d16, d16, #1    @ encoding: [0x30,0x0e,0xff,0xf2]
 	vcvt.f32.s32	d16, d16, #1
+@ CHECK: vcvt.f32.s32	d16, d16        @ encoding: [0x20,0x06,0xfb,0xf3]
+	vcvt.f32.s32	d16, d16, #0
 @ CHECK: vcvt.f32.u32	d16, d16, #1    @ encoding: [0x30,0x0e,0xff,0xf3]
 	vcvt.f32.u32	d16, d16, #1
+@ CHECK: vcvt.f32.u32	d16, d16        @ encoding: [0xa0,0x06,0xfb,0xf3]
+	vcvt.f32.u32	d16, d16, #0
 @ CHECK: vcvt.s32.f32	q8, q8, #1      @ encoding: [0x70,0x0f,0xff,0xf2]
 	vcvt.s32.f32	q8, q8, #1
+@ CHECK: vcvt.s32.f32	q8, q8          @ encoding: [0x60,0x07,0xfb,0xf3]
+	vcvt.s32.f32	q8, q8, #0
 @ CHECK: vcvt.u32.f32	q8, q8, #1      @ encoding: [0x70,0x0f,0xff,0xf3]
 	vcvt.u32.f32	q8, q8, #1
+@ CHECK: vcvt.u32.f32	q8, q8          @ encoding: [0xe0,0x07,0xfb,0xf3]
+	vcvt.u32.f32	q8, q8, #0
 @ CHECK: vcvt.f32.s32	q8, q8, #1      @ encoding: [0x70,0x0e,0xff,0xf2]
 	vcvt.f32.s32	q8, q8, #1
+@ CHECK: vcvt.f32.s32	q8, q8          @ encoding: [0x60,0x06,0xfb,0xf3]
+	vcvt.f32.s32	q8, q8, #0
 @ CHECK: vcvt.f32.u32	q8, q8, #1      @ encoding: [0x70,0x0e,0xff,0xf3]
 	vcvt.f32.u32	q8, q8, #1
+@ CHECK: vcvt.f32.u32	q8, q8          @ encoding: [0xe0,0x06,0xfb,0xf3]
+	vcvt.f32.u32	q8, q8, #0
 @ CHECK: vcvt.f32.f16	q8, d16         @ encoding: [0x20,0x07,0xf6,0xf3]
 	vcvt.f32.f16	q8, d16
 @ CHECK: vcvt.f16.f32	d16, q8         @ encoding: [0x20,0x06,0xf6,0xf3]
diff --git a/test/MC/ARM/neon-crypto.s b/test/MC/ARM/neon-crypto.s
new file mode 100644
index 0000000..92d24da
--- /dev/null
+++ b/test/MC/ARM/neon-crypto.s
@@ -0,0 +1,51 @@
+@ RUN: llvm-mc -triple armv8 -mattr=+neon,+crypto -show-encoding < %s | FileCheck %s
+@ RUN: not llvm-mc -triple=armv7 -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V7
+
+aesd.8  q0, q1
+aese.8  q0, q1
+aesimc.8  q0, q1
+aesmc.8  q0, q1
+@ CHECK: aesd.8 q0, q1          @ encoding: [0x42,0x03,0xb0,0xf3]
+@ CHECK: aese.8 q0, q1          @ encoding: [0x02,0x03,0xb0,0xf3]
+@ CHECK: aesimc.8 q0, q1        @ encoding: [0xc2,0x03,0xb0,0xf3]
+@ CHECK: aesmc.8 q0, q1         @ encoding: [0x82,0x03,0xb0,0xf3]
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+
+sha1h.32  q0, q1
+sha1su1.32  q0, q1
+sha256su0.32  q0, q1
+@ CHECK: sha1h.32  q0, q1       @ encoding: [0xc2,0x02,0xb9,0xf3]
+@ CHECK: sha1su1.32 q0, q1      @ encoding: [0x82,0x03,0xba,0xf3]
+@ CHECK: sha256su0.32 q0, q1    @ encoding: [0xc2,0x03,0xba,0xf3]
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+
+sha1c.32  q0, q1, q2
+sha1m.32  q0, q1, q2
+sha1p.32  q0, q1, q2
+sha1su0.32  q0, q1, q2
+sha256h.32  q0, q1, q2
+sha256h2.32  q0, q1, q2
+sha256su1.32  q0, q1, q2
+@ CHECK: sha1c.32  q0, q1, q2   @ encoding: [0x44,0x0c,0x02,0xf2]
+@ CHECK: sha1m.32  q0, q1, q2   @ encoding: [0x44,0x0c,0x22,0xf2]
+@ CHECK: sha1p.32 q0, q1, q2    @ encoding: [0x44,0x0c,0x12,0xf2]
+@ CHECK: sha1su0.32  q0, q1, q2      @ encoding: [0x44,0x0c,0x32,0xf2]
+@ CHECK: sha256h.32  q0, q1, q2      @ encoding: [0x44,0x0c,0x02,0xf3]
+@ CHECK: sha256h2.32 q0, q1, q2      @ encoding: [0x44,0x0c,0x12,0xf3]
+@ CHECK: sha256su1.32 q0, q1, q2     @ encoding: [0x44,0x0c,0x22,0xf3]
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+@ CHECK-V7: instruction requires: crypto armv8
+
+vmull.p64 q8, d16, d17
+@ CHECK: vmull.p64  q8, d16, d17    @ encoding: [0xa1,0x0e,0xe0,0xf2]
+@ CHECK-V7: instruction requires: crypto armv8
diff --git a/test/MC/ARM/obsolete-v8.s b/test/MC/ARM/obsolete-v8.s
new file mode 100644
index 0000000..0d6176b
--- /dev/null
+++ b/test/MC/ARM/obsolete-v8.s
@@ -0,0 +1,7 @@
+@ RUN: not llvm-mc -triple=armv8 < %s 2>&1 | FileCheck %s
+
+swp r0, r1, [r2]
+@ CHECK: instruction requires: armv7 or earlier
+
+swpb r0, r1, [r2]
+@ CHECK: instruction requires: armv7 or earlier
diff --git a/test/MC/ARM/single-precision-fp.s b/test/MC/ARM/single-precision-fp.s
new file mode 100644
index 0000000..2ed0cfe
--- /dev/null
+++ b/test/MC/ARM/single-precision-fp.s
@@ -0,0 +1,194 @@
+@ RUN: not llvm-mc < %s -triple thumbv8-unknown-unknown -show-encoding -mattr=+fp-only-sp,-neon 2> %t > %t2
+@ RUN:     FileCheck %s < %t --check-prefix=CHECK-ERRORS
+@ RUN:     FileCheck %s < %t2
+
+        vadd.f64 d0, d1, d2
+        vsub.f64 d2, d3, d4
+        vdiv.f64 d4, d5, d6
+        vmul.f64 d6, d7, d8
+        vnmul.f64 d8, d9, d10
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vadd.f64 d0, d1, d2
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vsub.f64 d2, d3, d4
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vdiv.f64 d4, d5, d6
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vmul.f64 d6, d7, d8
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vnmul.f64 d8, d9, d10
+
+        vmla.f64 d11, d10, d9
+        vmls.f64 d8, d7, d6
+        vnmla.f64 d5, d4, d3
+        vnmls.f64 d2, d1, d0
+        vfma.f64 d1, d2, d3
+        vfms.f64 d4, d5, d6
+        vfnma.f64 d7, d8, d9
+        vfnms.f64 d10, d11, d12
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vmla.f64 d11, d10, d9
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vmls.f64 d8, d7, d6
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vnmla.f64 d5, d4, d3
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vnmls.f64 d2, d1, d0
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vfma.f64 d1, d2, d3
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vfms.f64 d4, d5, d6
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vfnma.f64 d7, d8, d9
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vfnms.f64 d10, d11, d12
+
+        vneg.f64 d15, d14
+        vsqrt.f64 d13, d12
+        vsqrt d13, d14
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vneg.f64 d15, d14
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vsqrt.f64 d13, d12
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vsqrt d13, d14
+
+        vcmpe.f64 d0, d1
+        vcmp.f64 d2, d3
+        vabs.f64 d4, d5
+        vcmpe.f64 d5, #0
+        vcmp.f64 d6, #0
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcmpe.f64 d0, d1
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcmp.f64 d2, d3
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vabs.f64 d4, d5
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcmpe.f64 d5, #0
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcmp.f64 d6, #0
+
+        @ FIXME: overlapping aliases and a probable TableGen indeterminacy mean
+        @ that the actual reason can vary by platform.
+        vmov.f64 d11, d10
+@ CHECK-ERRORS: error: instruction requires:
+@ CHECK-ERRORS-NEXT: vmov.f64 d11, d10
+
+        vcvt.f64.s32 d9, s8
+        vcvt.f64.u32 d7, s6
+        vcvt.s32.f64 s5, d4
+        vcvt.u32.f64 s3, d2
+        vcvtr.s32.f64 s1, d0
+        vcvtr.u32.f64 s1, d2
+        vcvt.s16.f64 d3, d4, #1
+        vcvt.u16.f64 d5, d6, #2
+        vcvt.s32.f64 d7, d8, #3
+        vcvt.u32.f64 d9, d10, #4
+        vcvt.f64.s16 d11, d12, #3
+        vcvt.f64.u16 d13, d14, #2
+        vcvt.f64.s32 d15, d14, #1
+        vcvt.f64.u32 d13, d12, #1
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.f64.s32 d9, s8
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.f64.u32 d7, s6
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.s32.f64 s5, d4
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.u32.f64 s3, d2
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvtr.s32.f64 s1, d0
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvtr.u32.f64 s1, d2
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.s16.f64 d3, d4, #1
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.u16.f64 d5, d6, #2
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.s32.f64 d7, d8, #3
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.u32.f64 d9, d10, #4
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.f64.s16 d11, d12, #3
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.f64.u16 d13, d14, #2
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.f64.s32 d15, d14, #1
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvt.f64.u32 d13, d12, #1
+
+        @ v8 operations, also double precision so make sure they're rejected.
+        vselgt.f64 d0, d1, d2
+        vselge.f64 d3, d4, d5
+        vseleq.f64 d6, d7, d8
+        vselvs.f64 d9, d10, d11
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vselgt.f64 d0, d1, d2
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vselge.f64 d3, d4, d5
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vseleq.f64 d6, d7, d8
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vselvs.f64 d9, d10, d11
+
+        vmaxnm.f64 d12, d13, d14
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vmaxnm.f64 d12, d13, d14
+
+        vcvtb.f64.f16 d7, s8
+        vcvtb.f16.f64 s9, d10
+        vcvtt.f64.f16 d11, s12
+        vcvtt.f16.f64 s13, d14
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvtb.f64.f16 d7, s8
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvtb.f16.f64 s9, d10
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvtt.f64.f16 d11, s12
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vcvtt.f16.f64 s13, d14
+
+        vrintz.f64 d15, d14
+        vrintr.f64.f64 d13, d12
+        vrintx.f64 d11, d10
+        vrinta.f64.f64 d9, d8
+        vrintn.f64 d7, d6
+        vrintp.f64.f64 d5, d4
+        vrintm.f64 d3, d2
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vrintz.f64 d15, d14
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vrintr.f64.f64 d13, d12
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vrintx.f64 d11, d10
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vrinta.f64.f64 d9, d8
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vrintn.f64 d7, d6
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vrintp.f64.f64 d5, d4
+@ CHECK-ERRORS: error: instruction requires: double precision VFP
+@ CHECK-ERRORS-NEXT: vrintm.f64 d3, d2
+
+        @ Double precisionish operations that actually *are* allowed.
+        vldr d0, [sp]
+        vstr d3, [sp]
+        vldm r0, {d0, d1}
+        vstm r4, {d3, d4}
+        vpush {d6, d7}
+        vpop {d8, d9}
+        vmov r1, r0, d1
+        vmov d2, r3, r4
+        vmov.f64 r5, r6, d7
+        vmov.f64 d8, r9, r10
+@ CHECK: vldr d0, [sp]
+@ CHECK: vstr d3, [sp]
+@ CHECK: vldmia r0, {d0, d1}
+@ CHECK: vstmia r4, {d3, d4}
+@ CHECK: vpush {d6, d7}
+@ CHECK: vpop {d8, d9}
+@ CHECK: vmov r1, r0, d1
+@ CHECK: vmov d2, r3, r4
+@ CHECK: vmov r5, r6, d7
+@ CHECK: vmov d8, r9, r10
diff --git a/test/MC/ARM/thumb-diagnostics.s b/test/MC/ARM/thumb-diagnostics.s
index a194ab4..19d17c2 100644
--- a/test/MC/ARM/thumb-diagnostics.s
+++ b/test/MC/ARM/thumb-diagnostics.s
@@ -2,6 +2,8 @@
 @ RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
 @ RUN: not llvm-mc -triple=thumbv5-apple-darwin < %s 2> %t
 @ RUN: FileCheck --check-prefix=CHECK-ERRORS-V5 < %t %s
+@ RUN: not llvm-mc -triple=thumbv8 < %s 2> %t
+@ RUN: FileCheck --check-prefix=CHECK-ERRORS-V8 < %t %s
 
 @ Check for various assembly diagnostic messages on invalid input.
 
@@ -38,10 +40,25 @@ error: invalid operand for instruction
         bkpt #-1
              ^
 
+@ Out of range immediates for v8 HLT instruction.
+        hlt #64
+        hlt #-1
+@CHECK-ERRORS: error: instruction requires: armv8 arm-mode
+@CHECK-ERRORS:        hlt #64
+@CHECK-ERRORS:        ^
+@CHECK-ERRORS-V8: error: instruction requires: arm-mode
+@CHECK-ERRORS-V8:         hlt #64
+@CHECK-ERRORS-V8:              ^
+@CHECK-ERRORS: error: invalid operand for instruction
+@CHECK-ERRORS:         hlt #-1
+@CHECK-ERRORS:              ^
+
 @ Invalid writeback and register lists for LDM
         ldm r2!, {r5, r8}
         ldm r2, {r5, r7}
         ldm r2!, {r2, r3, r4}
+        ldm r2!, {r2, r3, r4, r10}
+        ldmdb r2!, {r2, r3, r4}
 @ CHECK-ERRORS: error: registers must be in range r0-r7
 @ CHECK-ERRORS:         ldm r2!, {r5, r8}
 @ CHECK-ERRORS:                  ^
@@ -51,7 +68,12 @@ error: invalid operand for instruction
 @ CHECK-ERRORS: error: writeback operator '!' not allowed when base register in register list
 @ CHECK-ERRORS:         ldm r2!, {r2, r3, r4}
 @ CHECK-ERRORS:               ^
-
+@ CHECK-ERRORS-V8: error: writeback operator '!' not allowed when base register in register list
+@ CHECK-ERRORS-V8:         ldm r2!, {r2, r3, r4, r10}
+@ CHECK-ERRORS-V8:               ^
+@ CHECK-ERRORS-V8: error: writeback register not allowed in register list
+@ CHECK-ERRORS-V8:         ldmdb r2!, {r2, r3, r4}
+@ CHECK-ERRORS-V8:                 ^
 
 @ Invalid writeback and register lists for PUSH/POP
         pop {r1, r2, r10}
@@ -67,12 +89,20 @@ error: invalid operand for instruction
 @ Invalid writeback and register lists for STM
         stm r1, {r2, r6}
         stm r1!, {r2, r9}
+        stm r2!, {r2, r9}
+        stmdb r2!, {r0, r2}
 @ CHECK-ERRORS: error: instruction requires: thumb2
 @ CHECK-ERRORS:         stm r1, {r2, r6}
 @ CHECK-ERRORS:         ^
 @ CHECK-ERRORS: error: registers must be in range r0-r7
 @ CHECK-ERRORS:         stm r1!, {r2, r9}
 @ CHECK-ERRORS:                  ^
+@ CHECK-ERRORS-V8: error: writeback operator '!' not allowed when base register in register list
+@ CHECK-ERRORS-V8:         stm r2!, {r2, r9}
+@ CHECK-ERRORS-V8:                  ^
+@ CHECK-ERRORS-V8: error: writeback register not allowed in register list
+@ CHECK-ERRORS-V8:         stmdb r2!, {r0, r2}
+@ CHECK-ERRORS-V8:                  ^
 
 @ Out of range immediates for LSL instruction.
         lsls r4, r5, #-1
@@ -138,7 +168,26 @@ error: invalid operand for instruction
 @ CHECK-ERRORS: error: source register must be the same as destination
 @ CHECK-ERRORS:         add r2, sp, ip
 @ CHECK-ERRORS:                     ^
- 
+
+
+@------------------------------------------------------------------------------
+@ B/Bcc - out of range immediates for Thumb1 branches
+@------------------------------------------------------------------------------
+
+        beq    #-258
+        bne    #256
+        bgt    #13
+        b      #-1048578
+        b      #1048576
+        b      #10323
+
+@ CHECK-ERRORS: error: branch target out of range
+@ CHECK-ERRORS: error: branch target out of range
+@ CHECK-ERRORS: error: branch target out of range
+@ CHECK-ERRORS: error: branch target out of range
+@ CHECK-ERRORS: error: branch target out of range
+@ CHECK-ERRORS: error: branch target out of range
+
 @------------------------------------------------------------------------------
 @ WFE/WFI/YIELD - are not supported pre v6T2
 @------------------------------------------------------------------------------
@@ -146,13 +195,13 @@ error: invalid operand for instruction
         wfi
         yield
 
-@ CHECK-ERRORS: error: instruction requires: thumb2
+@ CHECK-ERRORS: error: instruction requires: armv6m or armv6t2
 @ CHECK-ERRORS: wfe
 @ CHECK-ERRORS: ^
-@ CHECK-ERRORS: error: instruction requires: thumb2
+@ CHECK-ERRORS: error: instruction requires: armv6m or armv6t2
 @ CHECK-ERRORS: wfi
 @ CHECK-ERRORS: ^
-@ CHECK-ERRORS: error: instruction requires: thumb2
+@ CHECK-ERRORS: error: instruction requires: armv6m or armv6t2
 @ CHECK-ERRORS: yield
 @ CHECK-ERRORS: ^
 
@@ -161,3 +210,11 @@ error: invalid operand for instruction
 @------------------------------------------------------------------------------
         pldw [r0, #4]
 @ CHECK-ERRORS: error: instruction requires: mp-extensions
+
+@------------------------------------------------------------------------------
+@ LDR(lit) - invalid offsets
+@------------------------------------------------------------------------------
+
+        ldr r4, [pc, #-12]
+@ CHECK-ERRORS: error: instruction requires: thumb2
+
diff --git a/test/MC/ARM/thumb-fp-armv8.s b/test/MC/ARM/thumb-fp-armv8.s
new file mode 100644
index 0000000..a730fa2
--- /dev/null
+++ b/test/MC/ARM/thumb-fp-armv8.s
@@ -0,0 +1,130 @@
+@ RUN: llvm-mc -triple thumbv8 -mattr=+fp-armv8 -show-encoding < %s | FileCheck %s
+
+@ VCVT{B,T}
+
+  vcvtt.f64.f16 d3, s1
+@ CHECK: vcvtt.f64.f16 d3, s1      @ encoding: [0xb2,0xee,0xe0,0x3b]
+  vcvtt.f16.f64 s5, d12
+@ CHECK: vcvtt.f16.f64 s5, d12     @ encoding: [0xf3,0xee,0xcc,0x2b]
+
+  vcvtb.f64.f16 d3, s1
+@ CHECK: vcvtb.f64.f16 d3, s1     @ encoding: [0xb2,0xee,0x60,0x3b]
+  vcvtb.f16.f64 s4, d1
+@ CHECK: vcvtb.f16.f64 s4, d1     @ encoding: [0xb3,0xee,0x41,0x2b]
+
+  it ge
+  vcvttge.f64.f16 d3, s1
+@ CHECK: vcvttge.f64.f16 d3, s1      @ encoding: [0xb2,0xee,0xe0,0x3b]
+  it gt
+  vcvttgt.f16.f64 s5, d12
+@ CHECK: vcvttgt.f16.f64 s5, d12     @ encoding: [0xf3,0xee,0xcc,0x2b]
+  it eq
+  vcvtbeq.f64.f16 d3, s1
+@ CHECK: vcvtbeq.f64.f16 d3, s1     @ encoding: [0xb2,0xee,0x60,0x3b]
+  it lt
+  vcvtblt.f16.f64 s4, d1
+@ CHECK: vcvtblt.f16.f64 s4, d1     @ encoding: [0xb3,0xee,0x41,0x2b]
+
+
+@ VCVT{A,N,P,M}
+
+  vcvta.s32.f32 s2, s3
+@ CHECK: vcvta.s32.f32 s2, s3     @ encoding: [0xbc,0xfe,0xe1,0x1a]
+  vcvta.s32.f64 s2, d3
+@ CHECK: vcvta.s32.f64 s2, d3     @ encoding: [0xbc,0xfe,0xc3,0x1b]
+  vcvtn.s32.f32 s6, s23
+@ CHECK: vcvtn.s32.f32 s6, s23     @ encoding: [0xbd,0xfe,0xeb,0x3a]
+  vcvtn.s32.f64 s6, d23
+@ CHECK: vcvtn.s32.f64 s6, d23     @ encoding: [0xbd,0xfe,0xe7,0x3b]
+  vcvtp.s32.f32 s0, s4
+@ CHECK: vcvtp.s32.f32 s0, s4     @ encoding: [0xbe,0xfe,0xc2,0x0a]
+  vcvtp.s32.f64 s0, d4
+@ CHECK: vcvtp.s32.f64 s0, d4     @ encoding: [0xbe,0xfe,0xc4,0x0b]
+  vcvtm.s32.f32 s17, s8
+@ CHECK: vcvtm.s32.f32 s17, s8     @ encoding: [0xff,0xfe,0xc4,0x8a]
+  vcvtm.s32.f64 s17, d8
+@ CHECK: vcvtm.s32.f64 s17, d8     @ encoding: [0xff,0xfe,0xc8,0x8b]
+
+  vcvta.u32.f32 s2, s3
+@ CHECK: vcvta.u32.f32 s2, s3     @ encoding: [0xbc,0xfe,0x61,0x1a]
+  vcvta.u32.f64 s2, d3
+@ CHECK: vcvta.u32.f64 s2, d3     @ encoding: [0xbc,0xfe,0x43,0x1b]
+  vcvtn.u32.f32 s6, s23
+@ CHECK: vcvtn.u32.f32 s6, s23     @ encoding: [0xbd,0xfe,0x6b,0x3a]
+  vcvtn.u32.f64 s6, d23
+@ CHECK: vcvtn.u32.f64 s6, d23     @ encoding: [0xbd,0xfe,0x67,0x3b]
+  vcvtp.u32.f32 s0, s4
+@ CHECK: vcvtp.u32.f32 s0, s4     @ encoding: [0xbe,0xfe,0x42,0x0a]
+  vcvtp.u32.f64 s0, d4
+@ CHECK: vcvtp.u32.f64 s0, d4     @ encoding: [0xbe,0xfe,0x44,0x0b]
+  vcvtm.u32.f32 s17, s8
+@ CHECK: vcvtm.u32.f32 s17, s8     @ encoding: [0xff,0xfe,0x44,0x8a]
+  vcvtm.u32.f64 s17, d8
+@ CHECK: vcvtm.u32.f64 s17, d8     @ encoding: [0xff,0xfe,0x48,0x8b]
+
+
+@ VSEL
+  vselge.f32 s4, s1, s23
+@ CHECK: vselge.f32 s4, s1, s23    @ encoding: [0x20,0xfe,0xab,0x2a]
+  vselge.f64 d30, d31, d23
+@ CHECK: vselge.f64 d30, d31, d23  @ encoding: [0x6f,0xfe,0xa7,0xeb]
+  vselgt.f32 s0, s1, s0
+@ CHECK: vselgt.f32 s0, s1, s0    @ encoding: [0x30,0xfe,0x80,0x0a]
+  vselgt.f64 d5, d10, d20
+@ CHECK: vselgt.f64 d5, d10, d20  @ encoding: [0x3a,0xfe,0x24,0x5b]
+  vseleq.f32 s30, s28, s23
+@ CHECK: vseleq.f32 s30, s28, s23 @ encoding: [0x0e,0xfe,0x2b,0xfa]
+  vseleq.f64 d2, d4, d8
+@ CHECK: vseleq.f64 d2, d4, d8    @ encoding: [0x04,0xfe,0x08,0x2b]
+  vselvs.f32 s21, s16, s14
+@ CHECK: vselvs.f32 s21, s16, s14 @ encoding: [0x58,0xfe,0x07,0xaa]
+  vselvs.f64 d0, d1, d31
+@ CHECK: vselvs.f64 d0, d1, d31   @ encoding: [0x11,0xfe,0x2f,0x0b]
+
+
+@ VMAXNM / VMINNM
+  vmaxnm.f32 s5, s12, s0
+@ CHECK: vmaxnm.f32 s5, s12, s0    @ encoding: [0xc6,0xfe,0x00,0x2a]
+  vmaxnm.f64 d5, d22, d30
+@ CHECK: vmaxnm.f64 d5, d22, d30   @ encoding: [0x86,0xfe,0xae,0x5b]
+  vminnm.f32 s0, s0, s12
+@ CHECK: vminnm.f32 s0, s0, s12    @ encoding: [0x80,0xfe,0x46,0x0a]
+  vminnm.f64 d4, d6, d9
+@ CHECK: vminnm.f64 d4, d6, d9     @ encoding: [0x86,0xfe,0x49,0x4b]
+
+@ VRINT{Z,R,X}
+  it ge
+  vrintzge.f64 d3, d12
+@ CHECK: vrintzge.f64 d3, d12   @ encoding: [0xb6,0xee,0xcc,0x3b]
+  vrintz.f32 s3, s24
+@ CHECK: vrintz.f32 s3, s24     @ encoding: [0xf6,0xee,0xcc,0x1a]
+  it lt
+  vrintrlt.f64 d5, d0
+@ CHECK: vrintrlt.f64 d5, d0    @ encoding: [0xb6,0xee,0x40,0x5b]
+  vrintr.f32 s0, s9
+@ CHECK: vrintr.f32 s0, s9      @ encoding: [0xb6,0xee,0x64,0x0a]
+  it eq
+  vrintxeq.f64 d28, d30
+@ CHECK: vrintxeq.f64 d28, d30  @ encoding: [0xf7,0xee,0x6e,0xcb]
+  it vs
+  vrintxvs.f32 s10, s14
+@ CHECK: vrintxvs.f32 s10, s14  @ encoding: [0xb7,0xee,0x47,0x5a]
+
+@ VRINT{A,N,P,M}
+
+  vrinta.f64 d3, d4
+@ CHECK: vrinta.f64 d3, d4     @ encoding: [0xb8,0xfe,0x44,0x3b]
+  vrinta.f32 s12, s1
+@ CHECK: vrinta.f32 s12, s1    @ encoding: [0xb8,0xfe,0x60,0x6a]
+  vrintn.f64 d3, d4
+@ CHECK: vrintn.f64 d3, d4     @ encoding: [0xb9,0xfe,0x44,0x3b]
+  vrintn.f32 s12, s1
+@ CHECK: vrintn.f32 s12, s1    @ encoding: [0xb9,0xfe,0x60,0x6a]
+  vrintp.f64 d3, d4
+@ CHECK: vrintp.f64 d3, d4     @ encoding: [0xba,0xfe,0x44,0x3b]
+  vrintp.f32 s12, s1
+@ CHECK: vrintp.f32 s12, s1    @ encoding: [0xba,0xfe,0x60,0x6a]
+  vrintm.f64 d3, d4
+@ CHECK: vrintm.f64 d3, d4     @ encoding: [0xbb,0xfe,0x44,0x3b]
+  vrintm.f32 s12, s1
+@ CHECK: vrintm.f32 s12, s1    @ encoding: [0xbb,0xfe,0x60,0x6a]
diff --git a/test/MC/ARM/thumb-hints.s b/test/MC/ARM/thumb-hints.s
new file mode 100644
index 0000000..b3c4cee
--- /dev/null
+++ b/test/MC/ARM/thumb-hints.s
@@ -0,0 +1,64 @@
+@ RUN: llvm-mc -triple=thumbv7-apple-darwin -show-encoding < %s | FileCheck %s
+@ RUN: llvm-mc -triple=thumbv6-apple-darwin -mcpu=cortex-m0 -show-encoding < %s | FileCheck %s 
+@ RUN: not llvm-mc -triple=thumbv6-apple-darwin -show-encoding < %s > %t 2> %t2
+@ RUN: FileCheck %s --check-prefix=CHECK-EVIL-PRE-UAL < %t
+@ RUN: FileCheck %s --check-prefix CHECK-ERROR < %t2
+
+  .syntax unified
+
+        nop
+        yield
+        wfe
+        wfi
+        sev
+@ CHECK: nop                            @ encoding: [0x00,0xbf]
+@ CHECK: yield                          @ encoding: [0x10,0xbf]
+@ CHECK: wfe                            @ encoding: [0x20,0xbf]
+@ CHECK: wfi                            @ encoding: [0x30,0xbf]
+@ CHECK: sev                            @ encoding: [0x40,0xbf]
+
+@ CHECK-EVIL-PRE-UAL: mov r8, r8                     @ encoding: [0xc0,0x46]
+
+        dmb sy
+        dmb
+        dsb sy
+        dsb
+        isb sy
+        isb
+@ CHECK: dmb	sy                      @ encoding: [0xbf,0xf3,0x5f,0x8f]
+@ CHECK: dmb	sy                      @ encoding: [0xbf,0xf3,0x5f,0x8f]
+@ CHECK: dsb	sy                      @ encoding: [0xbf,0xf3,0x4f,0x8f]
+@ CHECK: dsb	sy                      @ encoding: [0xbf,0xf3,0x4f,0x8f]
+@ CHECK: isb	sy                      @ encoding: [0xbf,0xf3,0x6f,0x8f]
+@ CHECK: isb	sy                      @ encoding: [0xbf,0xf3,0x6f,0x8f]
+
+
+@ CHECK-ERROR: error: instruction requires: armv6m or armv6t2
+@ CHECK-ERROR-NEXT: yield
+
+@ CHECK-ERROR: error: instruction requires: armv6m or armv6t2
+@ CHECK-ERROR-NEXT: wfe
+
+@ CHECK-ERROR: error: instruction requires: armv6m or armv6t2
+@ CHECK-ERROR-NEXT: wfi
+
+@ CHECK-ERROR: error: instruction requires: armv6m or armv6t2
+@ CHECK-ERROR-NEXT: sev
+
+@ CHECK-ERROR: error:
+@ CHECK-ERROR-NEXT: dmb sy
+
+@ CHECK-ERROR: error: instruction requires: data-barriers
+@ CHECK-ERROR-NEXT: dmb
+
+@ CHECK-ERROR: error:
+@ CHECK-ERROR-NEXT: dsb sy
+
+@ CHECK-ERROR: error: instruction requires: data-barriers
+@ CHECK-ERROR-NEXT: dsb
+
+@ CHECK-ERROR: error:
+@ CHECK-ERROR-NEXT: isb sy
+
+@ CHECK-ERROR: error: instruction requires: data-barriers
+@ CHECK-ERROR-NEXT: isb
diff --git a/test/MC/ARM/thumb-invalid-crypto.txt b/test/MC/ARM/thumb-invalid-crypto.txt
new file mode 100644
index 0000000..a5f9a19
--- /dev/null
+++ b/test/MC/ARM/thumb-invalid-crypto.txt
@@ -0,0 +1,42 @@
+@ RUN: not llvm-mc -triple thumbv8 -mattr=+neon,+crypto -show-encoding < %s 2>&1 | FileCheck %s
+
+iteee lo
+aesdlo.8  q0, q1
+@ CHECK: error: instruction 'aesd' is not predicable, but condition code specified
+aesimchs.8  q0, q1
+@ CHECK: error: instruction 'aesimc' is not predicable, but condition code specified
+aesmchs.8  q0, q1
+@ CHECK: error: instruction 'aesmc' is not predicable, but condition code specified
+aesehs.8 q0, q1
+@ CHECK: error: instruction 'aese' is not predicable, but condition code specified
+
+itee hs
+sha1hhs.32  q0, q1
+@ CHECK: error: instruction 'sha1h' is not predicable, but condition code specified
+sha1su1lo.32  q0, q1
+@ CHECK: error: instruction 'sha1su1' is not predicable, but condition code specified
+sha256su0lo.32  q0, q1
+@ CHECK: error: instruction 'sha256su0' is not predicable, but condition code specified
+
+iteee lo
+sha1clo.32  s0, d1, q2
+@ CHECK: error: instruction 'sha1c' is not predicable, but condition code specified
+sha1mhs.32  q0, s1, q2
+@ CHECK: error: instruction 'sha1m' is not predicable, but condition code specified
+sha1phs.32  s0, q1, q2
+@ CHECK: error: instruction 'sha1p' is not predicable, but condition code specified
+sha1su0hs.32  d0, q1, q2
+@ CHECK: error: instruction 'sha1su0' is not predicable, but condition code specified
+itee hs
+sha256hhs.32  q0, s1, q2
+@ CHECK: error: instruction 'sha256h' is not predicable, but condition code specified
+sha256h2lo.32  q0, q1, s2
+@ CHECK: error: instruction 'sha256h2' is not predicable, but condition code specified
+sha256su1lo.32  s0, d1, q2
+@ CHECK: error: instruction 'sha256su1' is not predicable, but condition code specified
+
+ite lo
+vmulllo.p64 q0, s1, s3
+@ CHECK: error: instruction 'vmull' is not predicable, but condition code specified
+vmullhs.p64 q0, d16, d17
+@ CHECK: error: instruction 'vmull' is not predicable, but condition code specified
diff --git a/test/MC/ARM/thumb-neon-crypto.s b/test/MC/ARM/thumb-neon-crypto.s
new file mode 100644
index 0000000..096e9e8
--- /dev/null
+++ b/test/MC/ARM/thumb-neon-crypto.s
@@ -0,0 +1,35 @@
+@ RUN: llvm-mc -triple thumbv8 -mattr=+neon,+crypto -show-encoding < %s | FileCheck %s
+
+aesd.8  q0, q1
+@ CHECK: aesd.8  q0, q1         @ encoding: [0xb0,0xff,0x42,0x03]
+aese.8  q0, q1
+@ CHECK: aese.8 q0, q1          @ encoding: [0xb0,0xff,0x02,0x03]
+aesimc.8  q0, q1
+@ CHECK: aesimc.8 q0, q1        @ encoding: [0xb0,0xff,0xc2,0x03]
+aesmc.8  q0, q1
+@ CHECK: aesmc.8 q0, q1         @ encoding: [0xb0,0xff,0x82,0x03]
+
+sha1h.32  q0, q1
+@ CHECK: sha1h.32  q0, q1       @ encoding: [0xb9,0xff,0xc2,0x02]
+sha1su1.32  q0, q1
+@ CHECK: sha1su1.32 q0, q1      @ encoding: [0xba,0xff,0x82,0x03]
+sha256su0.32  q0, q1
+@ CHECK: sha256su0.32 q0, q1    @ encoding: [0xba,0xff,0xc2,0x03]
+
+sha1c.32  q0, q1, q2
+@ CHECK: sha1c.32  q0, q1, q2   @ encoding: [0x02,0xef,0x44,0x0c]
+sha1m.32  q0, q1, q2
+@ CHECK: sha1m.32  q0, q1, q2   @ encoding: [0x22,0xef,0x44,0x0c]
+sha1p.32  q0, q1, q2
+@ CHECK: sha1p.32 q0, q1, q2    @ encoding: [0x12,0xef,0x44,0x0c]
+sha1su0.32  q0, q1, q2
+@ CHECK: sha1su0.32  q0, q1, q2      @ encoding: [0x32,0xef,0x44,0x0c]
+sha256h.32  q0, q1, q2
+@ CHECK: sha256h.32  q0, q1, q2      @ encoding: [0x02,0xff,0x44,0x0c]
+sha256h2.32  q0, q1, q2
+@ CHECK: sha256h2.32 q0, q1, q2      @ encoding: [0x12,0xff,0x44,0x0c]
+sha256su1.32  q0, q1, q2
+@ CHECK: sha256su1.32 q0, q1, q2     @ encoding: [0x22,0xff,0x44,0x0c]
+
+vmull.p64 q8, d16, d17
+@ CHECK: vmull.p64  q8, d16, d17    @ encoding: [0xe0,0xef,0xa1,0x0e]
diff --git a/test/MC/ARM/thumb-nop.s b/test/MC/ARM/thumb-nop.s
deleted file mode 100644
index 66f61a6..0000000
--- a/test/MC/ARM/thumb-nop.s
+++ /dev/null
@@ -1,9 +0,0 @@
-@ RUN: llvm-mc -triple=thumbv6-apple-darwin -show-encoding < %s | FileCheck %s -check-prefix=CHECK-V6
-@ RUN: llvm-mc -triple=thumbv7-apple-darwin -show-encoding < %s | FileCheck %s -check-prefix=CHECK-V7
-
-  .syntax unified
-
-        nop
-
-@ CHECK-V6: mov r8, r8                     @ encoding: [0xc0,0x46]
-@ CHECK-V7: nop                            @ encoding: [0x00,0xbf]
diff --git a/test/MC/ARM/thumb-only-conditionals.s b/test/MC/ARM/thumb-only-conditionals.s
index 6d13ce5..8693c24 100644
--- a/test/MC/ARM/thumb-only-conditionals.s
+++ b/test/MC/ARM/thumb-only-conditionals.s
@@ -40,10 +40,10 @@
 @ CHECK-NEXT: mcrr2gt  p7, #15, r5, r4, c1
 
         ite eq
-        mrceq p11, #1, r1, c2, c2
+        mrceq p9, #1, r1, c2, c2
         mrc2ne p12, #3, r3, c3, c4
 @ CHECK: ite eq
-@ CHECK-NEXT: mrceq p11, #1, r1, c2, c2
+@ CHECK-NEXT: mrceq p9, #1, r1, c2, c2
 @ CHECK-NEXT: mrc2ne p12, #3, r3, c3, c4
 
         itt lo
diff --git a/test/MC/ARM/thumb-v8fp.s b/test/MC/ARM/thumb-v8fp.s
deleted file mode 100644
index 50cd005..0000000
--- a/test/MC/ARM/thumb-v8fp.s
+++ /dev/null
@@ -1,130 +0,0 @@
-@ RUN: llvm-mc -triple thumbv8 -mattr=+v8fp -show-encoding < %s | FileCheck %s
-
-@ VCVT{B,T}
-
-  vcvtt.f64.f16 d3, s1
-@ CHECK: vcvtt.f64.f16 d3, s1      @ encoding: [0xb2,0xee,0xe0,0x3b]
-  vcvtt.f16.f64 s5, d12
-@ CHECK: vcvtt.f16.f64 s5, d12     @ encoding: [0xf3,0xee,0xcc,0x2b]
-
-  vcvtb.f64.f16 d3, s1
-@ CHECK: vcvtb.f64.f16 d3, s1     @ encoding: [0xb2,0xee,0x60,0x3b]
-  vcvtb.f16.f64 s4, d1
-@ CHECK: vcvtb.f16.f64 s4, d1     @ encoding: [0xb3,0xee,0x41,0x2b]
-
-  it ge
-  vcvttge.f64.f16 d3, s1
-@ CHECK: vcvttge.f64.f16 d3, s1      @ encoding: [0xb2,0xee,0xe0,0x3b]
-  it gt
-  vcvttgt.f16.f64 s5, d12
-@ CHECK: vcvttgt.f16.f64 s5, d12     @ encoding: [0xf3,0xee,0xcc,0x2b]
-  it eq
-  vcvtbeq.f64.f16 d3, s1
-@ CHECK: vcvtbeq.f64.f16 d3, s1     @ encoding: [0xb2,0xee,0x60,0x3b]
-  it lt
-  vcvtblt.f16.f64 s4, d1
-@ CHECK: vcvtblt.f16.f64 s4, d1     @ encoding: [0xb3,0xee,0x41,0x2b]
-
-
-@ VCVT{A,N,P,M}
-
-  vcvta.s32.f32 s2, s3
-@ CHECK: vcvta.s32.f32 s2, s3     @ encoding: [0xbc,0xfe,0xe1,0x1a]
-  vcvta.s32.f64 s2, d3
-@ CHECK: vcvta.s32.f64 s2, d3     @ encoding: [0xbc,0xfe,0xc3,0x1b]
-  vcvtn.s32.f32 s6, s23
-@ CHECK: vcvtn.s32.f32 s6, s23     @ encoding: [0xbd,0xfe,0xeb,0x3a]
-  vcvtn.s32.f64 s6, d23
-@ CHECK: vcvtn.s32.f64 s6, d23     @ encoding: [0xbd,0xfe,0xe7,0x3b]
-  vcvtp.s32.f32 s0, s4
-@ CHECK: vcvtp.s32.f32 s0, s4     @ encoding: [0xbe,0xfe,0xc2,0x0a]
-  vcvtp.s32.f64 s0, d4
-@ CHECK: vcvtp.s32.f64 s0, d4     @ encoding: [0xbe,0xfe,0xc4,0x0b]
-  vcvtm.s32.f32 s17, s8
-@ CHECK: vcvtm.s32.f32 s17, s8     @ encoding: [0xff,0xfe,0xc4,0x8a]
-  vcvtm.s32.f64 s17, d8
-@ CHECK: vcvtm.s32.f64 s17, d8     @ encoding: [0xff,0xfe,0xc8,0x8b]
-
-  vcvta.u32.f32 s2, s3
-@ CHECK: vcvta.u32.f32 s2, s3     @ encoding: [0xbc,0xfe,0x61,0x1a]
-  vcvta.u32.f64 s2, d3
-@ CHECK: vcvta.u32.f64 s2, d3     @ encoding: [0xbc,0xfe,0x43,0x1b]
-  vcvtn.u32.f32 s6, s23
-@ CHECK: vcvtn.u32.f32 s6, s23     @ encoding: [0xbd,0xfe,0x6b,0x3a]
-  vcvtn.u32.f64 s6, d23
-@ CHECK: vcvtn.u32.f64 s6, d23     @ encoding: [0xbd,0xfe,0x67,0x3b]
-  vcvtp.u32.f32 s0, s4
-@ CHECK: vcvtp.u32.f32 s0, s4     @ encoding: [0xbe,0xfe,0x42,0x0a]
-  vcvtp.u32.f64 s0, d4
-@ CHECK: vcvtp.u32.f64 s0, d4     @ encoding: [0xbe,0xfe,0x44,0x0b]
-  vcvtm.u32.f32 s17, s8
-@ CHECK: vcvtm.u32.f32 s17, s8     @ encoding: [0xff,0xfe,0x44,0x8a]
-  vcvtm.u32.f64 s17, d8
-@ CHECK: vcvtm.u32.f64 s17, d8     @ encoding: [0xff,0xfe,0x48,0x8b]
-
-
-@ VSEL
-  vselge.f32 s4, s1, s23
-@ CHECK: vselge.f32 s4, s1, s23    @ encoding: [0x20,0xfe,0xab,0x2a]
-  vselge.f64 d30, d31, d23
-@ CHECK: vselge.f64 d30, d31, d23  @ encoding: [0x6f,0xfe,0xa7,0xeb]
-  vselgt.f32 s0, s1, s0
-@ CHECK: vselgt.f32 s0, s1, s0    @ encoding: [0x30,0xfe,0x80,0x0a]
-  vselgt.f64 d5, d10, d20
-@ CHECK: vselgt.f64 d5, d10, d20  @ encoding: [0x3a,0xfe,0x24,0x5b]
-  vseleq.f32 s30, s28, s23
-@ CHECK: vseleq.f32 s30, s28, s23 @ encoding: [0x0e,0xfe,0x2b,0xfa]
-  vseleq.f64 d2, d4, d8
-@ CHECK: vseleq.f64 d2, d4, d8    @ encoding: [0x04,0xfe,0x08,0x2b]
-  vselvs.f32 s21, s16, s14
-@ CHECK: vselvs.f32 s21, s16, s14 @ encoding: [0x58,0xfe,0x07,0xaa]
-  vselvs.f64 d0, d1, d31
-@ CHECK: vselvs.f64 d0, d1, d31   @ encoding: [0x11,0xfe,0x2f,0x0b]
-
-
-@ VMAXNM / VMINNM
-  vmaxnm.f32 s5, s12, s0
-@ CHECK: vmaxnm.f32 s5, s12, s0    @ encoding: [0xc6,0xfe,0x00,0x2a]
-  vmaxnm.f64 d5, d22, d30
-@ CHECK: vmaxnm.f64 d5, d22, d30   @ encoding: [0x86,0xfe,0xae,0x5b]
-  vminnm.f32 s0, s0, s12
-@ CHECK: vminnm.f32 s0, s0, s12    @ encoding: [0x80,0xfe,0x46,0x0a]
-  vminnm.f64 d4, d6, d9
-@ CHECK: vminnm.f64 d4, d6, d9     @ encoding: [0x86,0xfe,0x49,0x4b]
-
-@ VRINT{Z,R,X}
-  it ge
-  vrintzge.f64 d3, d12
-@ CHECK: vrintzge.f64 d3, d12   @ encoding: [0xb6,0xee,0xcc,0x3b]
-  vrintz.f32 s3, s24
-@ CHECK: vrintz.f32 s3, s24     @ encoding: [0xf6,0xee,0xcc,0x1a]
-  it lt
-  vrintrlt.f64 d5, d0
-@ CHECK: vrintrlt.f64 d5, d0    @ encoding: [0xb6,0xee,0x40,0x5b]
-  vrintr.f32 s0, s9
-@ CHECK: vrintr.f32 s0, s9      @ encoding: [0xb6,0xee,0x64,0x0a]
-  it eq
-  vrintxeq.f64 d28, d30
-@ CHECK: vrintxeq.f64 d28, d30  @ encoding: [0xf7,0xee,0x6e,0xcb]
-  it vs
-  vrintxvs.f32 s10, s14
-@ CHECK: vrintxvs.f32 s10, s14  @ encoding: [0xb7,0xee,0x47,0x5a]
-
-@ VRINT{A,N,P,M}
-
-  vrinta.f64 d3, d4
-@ CHECK: vrinta.f64 d3, d4     @ encoding: [0xb8,0xfe,0x44,0x3b]
-  vrinta.f32 s12, s1
-@ CHECK: vrinta.f32 s12, s1    @ encoding: [0xb8,0xfe,0x60,0x6a]
-  vrintn.f64 d3, d4
-@ CHECK: vrintn.f64 d3, d4     @ encoding: [0xb9,0xfe,0x44,0x3b]
-  vrintn.f32 s12, s1
-@ CHECK: vrintn.f32 s12, s1    @ encoding: [0xb9,0xfe,0x60,0x6a]
-  vrintp.f64 d3, d4
-@ CHECK: vrintp.f64 d3, d4     @ encoding: [0xba,0xfe,0x44,0x3b]
-  vrintp.f32 s12, s1
-@ CHECK: vrintp.f32 s12, s1    @ encoding: [0xba,0xfe,0x60,0x6a]
-  vrintm.f64 d3, d4
-@ CHECK: vrintm.f64 d3, d4     @ encoding: [0xbb,0xfe,0x44,0x3b]
-  vrintm.f32 s12, s1
-@ CHECK: vrintm.f32 s12, s1    @ encoding: [0xbb,0xfe,0x60,0x6a]
diff --git a/test/MC/ARM/thumb2-b.w-encodingT4.s b/test/MC/ARM/thumb2-b.w-encodingT4.s
index be77b06..aff02e1 100644
--- a/test/MC/ARM/thumb2-b.w-encodingT4.s
+++ b/test/MC/ARM/thumb2-b.w-encodingT4.s
@@ -9,4 +9,4 @@ _foo:
 @------------------------------------------------------------------------------
         b.w   0x3680c
 
-@ CHECK: b.w	#223244                    @ encoding: [0x6d,0xf0,0x0c,0xb0]
+@ CHECK: b.w	#223244                    @ encoding: [0x36,0xf0,0x06,0xbc]
diff --git a/test/MC/ARM/thumb2-branches.s b/test/MC/ARM/thumb2-branches.s
new file mode 100644
index 0000000..9148233
--- /dev/null
+++ b/test/MC/ARM/thumb2-branches.s
@@ -0,0 +1,286 @@
+@ RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -show-encoding < %s | FileCheck %s
+
+@------------------------------------------------------------------------------
+@ unconditional branches accept narrow suffix and encode to short encodings
+@------------------------------------------------------------------------------
+
+         b.n    #-2048
+         b.n    #2046
+
+@ CHECK: b	#-2048                  @ encoding: [0x00,0xe4]
+@ CHECK: b	#2046                   @ encoding: [0xff,0xe3]
+
+@------------------------------------------------------------------------------
+@ unconditional branches accept wide suffix and encode to wide encodings
+@------------------------------------------------------------------------------
+
+         b.w    #-2048
+         b.w    #2046
+         b.w    #-1677216
+         b.w    #1677214
+
+@ CHECK: b.w	#-2048                  @ encoding: [0xff,0xf7,0x00,0xbc]
+@ CHECK: b.w	#2046                   @ encoding: [0x00,0xf0,0xff,0xbb]
+@ CHECK: b.w	#-1677216               @ encoding: [0x66,0xf6,0x30,0xbc]
+@ CHECK: b.w	#1677214                @ encoding: [0x99,0xf1,0xcf,0xbb]
+
+@------------------------------------------------------------------------------
+@ unconditional branches without width suffix encode depending of offset size
+@------------------------------------------------------------------------------
+
+         b      #-2048
+         b      #2046
+         b      #-2050
+         b      #2048
+         b      #-1677216
+         b      #1677214
+
+@ CHECK: b	#-2048                  @ encoding: [0x00,0xe4]
+@ CHECK: b	#2046                   @ encoding: [0xff,0xe3]
+@ CHECK: b.w	#-2050                  @ encoding: [0xff,0xf7,0xff,0xbb]
+@ CHECK: b.w	#2048                   @ encoding: [0x00,0xf0,0x00,0xbc]
+@ CHECK: b.w	#-1677216               @ encoding: [0x66,0xf6,0x30,0xbc]
+@ CHECK: b.w	#1677214                @ encoding: [0x99,0xf1,0xcf,0xbb]
+
+@------------------------------------------------------------------------------
+@ unconditional branches with width narrow suffix in IT block 
+@------------------------------------------------------------------------------
+
+         it     eq
+         beq.n  #-2048
+         it     ne
+         bne.n  #-2046
+
+@ CHECK: it	eq                      @ encoding: [0x08,0xbf]
+@ CHECK: beq	#-2048                  @ encoding: [0x00,0xe4] 
+@ CHECK: it	ne                      @ encoding: [0x18,0xbf] 
+@ CHECK: bne	#-2046                  @ encoding: [0x01,0xe4]
+
+@------------------------------------------------------------------------------
+@ unconditional branches with wide suffix in IT block
+@------------------------------------------------------------------------------
+
+         it     gt
+         bgt.w  #-2048
+         it     le
+         ble.w  #2046
+         it     ge
+         bge.w  #-1677216
+         it     lt
+         blt.w  #1677214
+
+@ CHECK: it	gt                      @ encoding: [0xc8,0xbf]
+@ CHECK: bgt.w	#-2048                  @ encoding: [0xff,0xf7,0x00,0xbc]
+@ CHECK: it	le                      @ encoding: [0xd8,0xbf]
+@ CHECK: ble.w	#2046                   @ encoding: [0x00,0xf0,0xff,0xbb]
+@ CHECK: it	ge                      @ encoding: [0xa8,0xbf]
+@ CHECK: bge.w	#-1677216               @ encoding: [0x66,0xf6,0x30,0xbc]
+@ CHECK: it	lt                      @ encoding: [0xb8,0xbf]
+@ CHECK: blt.w	#1677214                @ encoding: [0x99,0xf1,0xcf,0xbb]
+
+@------------------------------------------------------------------------------
+@ conditional branches accept narrow suffix and encode to short encodings
+@------------------------------------------------------------------------------
+
+         beq.n    #-256
+         bne.n    #254
+
+@ CHECK: beq	#-256                   @ encoding: [0x80,0xd0]
+@ CHECK: bne	#254                    @ encoding: [0x7f,0xd1]
+
+@------------------------------------------------------------------------------
+@ unconditional branches accept wide suffix and encode to wide encodings
+@------------------------------------------------------------------------------
+
+         bmi.w    #-256
+         bne.w    #254
+         blt.w    #-1048576
+         bge.w    #1048574
+
+@ CHECK: bmi.w	#-256                   @ encoding: [0x3f,0xf5,0x80,0xaf]
+@ CHECK: bne.w	#254                    @ encoding: [0x40,0xf0,0x7f,0x80]
+@ CHECK: blt.w	#-1048576               @ encoding: [0xc0,0xf6,0x00,0x80]
+@ CHECK: bge.w	#1048574                @ encoding: [0xbf,0xf2,0xff,0xaf]
+
+@------------------------------------------------------------------------------
+@ unconditional branches without width suffix encode depending of offset size
+@------------------------------------------------------------------------------
+
+         bne     #-256
+         bgt     #254
+         bne     #-258
+         bgt     #256
+         bne     #-1048576
+         bgt     #1048574
+
+@ CHECK: bne	#-256                   @ encoding: [0x80,0xd1]
+@ CHECK: bgt	#254                    @ encoding: [0x7f,0xdc]
+@ CHECK: bne.w	#-258                   @ encoding: [0x7f,0xf4,0x7f,0xaf]
+@ CHECK: bgt.w	#256                    @ encoding: [0x00,0xf3,0x80,0x80]
+@ CHECK: bne.w	#-1048576               @ encoding: [0x40,0xf4,0x00,0x80]
+@ CHECK: bgt.w	#1048574                @ encoding: [0x3f,0xf3,0xff,0xaf]
+
+@------------------------------------------------------------------------------
+@ same branch insturction encoding to conditional or unconditional depending
+@ on whether it is in an IT block or not
+@------------------------------------------------------------------------------
+
+         it     eq
+         addeq  r0, r1
+         bne    #128
+
+@ CHECK: it	eq                      @ encoding: [0x08,0xbf]
+@ CHECK: addeq	r0, r1                  @ encoding: [0x08,0x44]
+@ CHECK: bne	#128                    @ encoding: [0x40,0xd1]
+
+         ite    eq
+         addeq  r0, r1
+         bne    #128
+
+@ CHECK: ite	eq                      @ encoding: [0x0c,0xbf]
+@ CHECK: addeq	r0, r1                  @ encoding: [0x08,0x44]
+@ CHECK: bne	#128                    @ encoding: [0x40,0xe0]
+
+@ RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -show-encoding < %s | FileCheck %s
+
+@------------------------------------------------------------------------------
+@ unconditional branches accept narrow suffix and encode to short encodings
+@------------------------------------------------------------------------------
+
+         b.n    #-2048
+         b.n    #2046
+
+@ CHECK: b	#-2048                  @ encoding: [0x00,0xe4]
+@ CHECK: b	#2046                   @ encoding: [0xff,0xe3]
+
+@------------------------------------------------------------------------------
+@ unconditional branches accept wide suffix and encode to wide encodings
+@------------------------------------------------------------------------------
+
+         b.w    #-2048
+         b.w    #2046
+         b.w    #-1677216
+         b.w    #1677214
+
+@ CHECK: b.w	#-2048                  @ encoding: [0xff,0xf7,0x00,0xbc]
+@ CHECK: b.w	#2046                   @ encoding: [0x00,0xf0,0xff,0xbb]
+@ CHECK: b.w	#-1677216               @ encoding: [0x66,0xf6,0x30,0xbc]
+@ CHECK: b.w	#1677214                @ encoding: [0x99,0xf1,0xcf,0xbb]
+
+@------------------------------------------------------------------------------
+@ unconditional branches without width suffix encode depending of offset size
+@------------------------------------------------------------------------------
+
+         b      #-2048
+         b      #2046
+         b      #-2050
+         b      #2048
+         b      #-1677216
+         b      #1677214
+
+@ CHECK: b	#-2048                  @ encoding: [0x00,0xe4]
+@ CHECK: b	#2046                   @ encoding: [0xff,0xe3]
+@ CHECK: b.w	#-2050                  @ encoding: [0xff,0xf7,0xff,0xbb]
+@ CHECK: b.w	#2048                   @ encoding: [0x00,0xf0,0x00,0xbc]
+@ CHECK: b.w	#-1677216               @ encoding: [0x66,0xf6,0x30,0xbc]
+@ CHECK: b.w	#1677214                @ encoding: [0x99,0xf1,0xcf,0xbb]
+
+@------------------------------------------------------------------------------
+@ unconditional branches with width narrow suffix in IT block 
+@------------------------------------------------------------------------------
+
+         it     eq
+         beq.n  #-2048
+         it     ne
+         bne.n  #-2046
+
+@ CHECK: it	eq                      @ encoding: [0x08,0xbf]
+@ CHECK: beq	#-2048                  @ encoding: [0x00,0xe4] 
+@ CHECK: it	ne                      @ encoding: [0x18,0xbf] 
+@ CHECK: bne	#-2046                  @ encoding: [0x01,0xe4]
+
+@------------------------------------------------------------------------------
+@ unconditional branches with wide suffix in IT block
+@------------------------------------------------------------------------------
+
+         it     gt
+         bgt.w  #-2048
+         it     le
+         ble.w  #2046
+         it     ge
+         bge.w  #-1677216
+         it     lt
+         blt.w  #1677214
+
+@ CHECK: it	gt                      @ encoding: [0xc8,0xbf]
+@ CHECK: bgt.w	#-2048                  @ encoding: [0xff,0xf7,0x00,0xbc]
+@ CHECK: it	le                      @ encoding: [0xd8,0xbf]
+@ CHECK: ble.w	#2046                   @ encoding: [0x00,0xf0,0xff,0xbb]
+@ CHECK: it	ge                      @ encoding: [0xa8,0xbf]
+@ CHECK: bge.w	#-1677216               @ encoding: [0x66,0xf6,0x30,0xbc]
+@ CHECK: it	lt                      @ encoding: [0xb8,0xbf]
+@ CHECK: blt.w	#1677214                @ encoding: [0x99,0xf1,0xcf,0xbb]
+
+@------------------------------------------------------------------------------
+@ conditional branches accept narrow suffix and encode to short encodings
+@------------------------------------------------------------------------------
+
+         beq.n    #-256
+         bne.n    #254
+
+@ CHECK: beq	#-256                   @ encoding: [0x80,0xd0]
+@ CHECK: bne	#254                    @ encoding: [0x7f,0xd1]
+
+@------------------------------------------------------------------------------
+@ unconditional branches accept wide suffix and encode to wide encodings
+@------------------------------------------------------------------------------
+
+         bmi.w    #-256
+         bne.w    #254
+         blt.w    #-1048576
+         bge.w    #1048574
+
+@ CHECK: bmi.w	#-256                   @ encoding: [0x3f,0xf5,0x80,0xaf]
+@ CHECK: bne.w	#254                    @ encoding: [0x40,0xf0,0x7f,0x80]
+@ CHECK: blt.w	#-1048576               @ encoding: [0xc0,0xf6,0x00,0x80]
+@ CHECK: bge.w	#1048574                @ encoding: [0xbf,0xf2,0xff,0xaf]
+
+@------------------------------------------------------------------------------
+@ unconditional branches without width suffix encode depending of offset size
+@------------------------------------------------------------------------------
+
+         bne     #-256
+         bgt     #254
+         bne     #-258
+         bgt     #256
+         bne     #-1048576
+         bgt     #1048574
+
+@ CHECK: bne	#-256                   @ encoding: [0x80,0xd1]
+@ CHECK: bgt	#254                    @ encoding: [0x7f,0xdc]
+@ CHECK: bne.w	#-258                   @ encoding: [0x7f,0xf4,0x7f,0xaf]
+@ CHECK: bgt.w	#256                    @ encoding: [0x00,0xf3,0x80,0x80]
+@ CHECK: bne.w	#-1048576               @ encoding: [0x40,0xf4,0x00,0x80]
+@ CHECK: bgt.w	#1048574                @ encoding: [0x3f,0xf3,0xff,0xaf]
+
+@------------------------------------------------------------------------------
+@ same branch insturction encoding to conditional or unconditional depending
+@ on whether it is in an IT block or not
+@------------------------------------------------------------------------------
+
+         it     eq
+         addeq  r0, r1
+         bne    #128
+
+@ CHECK: it	eq                      @ encoding: [0x08,0xbf]
+@ CHECK: addeq	r0, r1                  @ encoding: [0x08,0x44]
+@ CHECK: bne	#128                    @ encoding: [0x40,0xd1]
+
+         ite    eq
+         addeq  r0, r1
+         bne    #128
+
+@ CHECK: ite	eq                      @ encoding: [0x0c,0xbf]
+@ CHECK: addeq	r0, r1                  @ encoding: [0x08,0x44]
+@ CHECK: bne	#128                    @ encoding: [0x40,0xe0]
+
diff --git a/test/MC/ARM/thumb2-diagnostics.s b/test/MC/ARM/thumb2-diagnostics.s
index e1c0058..6ac2db0 100644
--- a/test/MC/ARM/thumb2-diagnostics.s
+++ b/test/MC/ARM/thumb2-diagnostics.s
@@ -51,3 +51,22 @@
         itt eq
         bkpteq #1
 @ CHECK-ERRORS: error: instruction 'bkpt' is not predicable, but condition code specified
+
+        nopeq
+        nopeq
+
+@ out of range operands for Thumb2 targets
+
+        beq.w  #-1048578
+        bne.w  #1048576
+        blt.w  #1013411
+        b.w    #-16777218
+        b.w    #16777216
+        b.w    #1592313
+
+@ CHECK-ERRORS: error: branch target out of range
+@ CHECK-ERRORS: error: branch target out of range
+@ CHECK-ERRORS: error: branch target out of range
+@ CHECK-ERRORS: error: branch target out of range
+@ CHECK-ERRORS: error: branch target out of range
+@ CHECK-ERRORS: error: branch target out of range
diff --git a/test/MC/ARM/thumb2-ldrd.s b/test/MC/ARM/thumb2-ldrd.s
new file mode 100644
index 0000000..4463c21
--- /dev/null
+++ b/test/MC/ARM/thumb2-ldrd.s
@@ -0,0 +1,9 @@
+// RUN: not llvm-mc -arch thumb -mattr=+thumb2 \
+// RUN: < %s >/dev/null 2> %t
+// RUN: grep "error: destination operands can't be identical" %t | count 4
+// rdar://14479780
+
+ldrd r0, r0, [pc, #0]
+ldrd r0, r0, [r1, #4]
+ldrd r0, r0, [r1], #4
+ldrd r0, r0, [r1, #4]!
diff --git a/test/MC/ARM/v8_IT_manual.s b/test/MC/ARM/v8_IT_manual.s
new file mode 100644
index 0000000..4b63aa8
--- /dev/null
+++ b/test/MC/ARM/v8_IT_manual.s
@@ -0,0 +1,6739 @@
+@ RUN: llvm-mc -triple thumbv8 -show-encoding < %s 2>&1 | FileCheck %s
+
+@ ADD reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+addge r1, r2, r3
+@ ADD reg, encoding T2
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+addge r1, r2
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge r1, pc
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge pc, r2
+@ ADD reg, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge r11, r2, r3
+@ ADD imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+addge r1, r2, #3
+@ ADD imm, encoding T2
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+addge r1, #3
+@ ADD imm, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge r11, r2, #3
+@ ADD imm, encoding T4 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge r11, r2, #333
+@ ADD SP+imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+addge r1, sp, #32
+@ ADD SP+imm, encoding T2
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge sp, #32
+@ ADD SP+imm, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge r1, sp, #33
+@ ADD SP+imm, encoding T4 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge r1, sp, #333
+
+@ SUB reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+subge r4, r3, r2
+@ SUB reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge r14, r3, r2
+@ SUB imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+subge r4, r3, #2
+@ SUB imm, encoding T2
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+subge r4, #3
+@ SUB imm, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge r14, r3, #2
+@ SUB imm, encoding T4 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge r14, r3, #2222
+@ SUB SP-imm, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge sp, #32
+@ SUB SP-imm, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge r4, sp, #33
+@ SUB SP-imm, encoding T4 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge r4, sp, #3333
+
+@ MOV reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+movge r4, r5
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movge r4, pc
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movge pc, r5
+@ MOV reg, encoding T3 (32-bit) -- can only appear as MOVS or MOV.W
+@ MOV imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+movge r4, #5
+@ MOV imm, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movge r14, #5
+@ MOV imm, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movge r14, #555
+
+@ CMP reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+cmpge r3, r4
+@ CMP reg, encoding T2
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+cmpge r13, r4
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+cmpge r3, pc
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+cmpge pc, r4
+@ CMP reg, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+cmpge r3, r4, lsl #1 
+@ CMP imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+cmpge r3, #4
+@ CMP imm, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+cmpge r13, #4
+
+@ AND reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+andge r5, r6
+@ AND reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r9, r6
+
+@ EOR reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+eorge r7, r6
+@ EOR reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r7, r9
+
+@ LSL imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+lslge r7, r0, #1
+@ LSL imm, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+lslge r7, r10, #1
+@ LSL reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+lslge r7, r0
+@ LSL reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+lslge r7, r10
+
+@ LSR imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+lsrge r3, r2, #1
+@ LSR imm, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+lsrge r3, r12, #1
+@ LSR reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+lsrge r3, r2
+@ LSR reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+lsrge r3, r12
+
+@ ASR imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+asrge r2, r3, #4
+@ ASR imm, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+asrge r12, r3, #4
+@ ASR reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+asrge r2, r3
+@ ASR reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+asrge r12, r3
+
+@ ADC reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+adcge r5, r4
+@ ADC reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r5, r5, r14
+
+@ SBC reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+sbcge r5, r6
+@ SBC reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r9, r9, r6
+
+@ ROR reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+rorge r7, r6
+@ ROR reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rorge r7, r9
+
+@ TST reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+tstge r7, r0
+@ TST reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+tstge r7, r10
+
+@ RSB imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+rsbge r1, r0, #0
+@ RSB imm, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r11, r0, #0
+
+@ CMN reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+cmnge r1, r2
+@ CMN reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+cmnge r11, r2
+
+@ ORR reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+orrge r3, r2
+@ ORR reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r3, r12
+
+@ MUL reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+mulge r3, r4, r3
+@ MUL reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mulge r3, r4, r5
+
+@ BIC reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+bicge r5, r4
+@ BIC reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r5, r14
+
+@ MVN reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+mvnge r5, r6
+@ MVN reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mvnge r9, r6
+
+@ BX, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+bxge r6
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bxge pc
+
+@ BLX, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+blxge r7
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+blxge pc
+
+@ LDR reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrge r0, [r1, r2]
+@ LDR reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge r10, [r1, r2]
+@ LDR imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrge r0, [r1]
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrge r0, [r1, #8]
+@ LDR imm, encoding T2
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrge r0, [sp]
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrge r0, [sp, #8]
+@ LDR reg, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge r0, [r1, #2]
+@ LDR reg, encoding T4 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge r0, [r1, #-2]
+@ LDR lit, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge r0, [pc, #8]
+@ LDR lit, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge r10, [pc, #8]
+
+@ STR reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strge r1, [r2, r3]
+@ STR reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge r11, [r2, r3]
+@ STR imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strge r1, [r2]
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strge r1, [r2, #4]
+@ STR imm, encoding T2
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strge r1, [sp]
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strge r1, [sp, #4]
+@ STR imm, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge r1, [r2, #3]
+@ STR imm, encoding T4 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge r1, [r2, #-3]
+
+@ STRH reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strhge r4, [r3, r2]
+@ STRH reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge r14, [r3, r2]
+@ STRH imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strhge r4, [r3]
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strhge r4, [r3, #2]
+@ STRH imm, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge r4, [r3, #1]
+@ STRH imm, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge r4, [r3, #-2]
+
+@ STRB reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strbge r3, [r4, r5]
+@ STRB reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge r3, [r14, r5]
+@ STRB imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strbge r3, [r4]
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+strbge r3, [r4, #5]
+@ STRB reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge r3, [r14, #5]
+@ STRB reg, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge r3, [r4, #-5]
+
+@ LDRSB reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrsbge r6, [r5, r4]
+@ LDRSB reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge r9, [r5, r4]
+
+@ LDRH reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrhge r5, [r6, r7]
+@ LDRH reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge r5, [r9, r7]
+@ LDRH imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrhge r5, [r6]
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrhge r5, [r6, #8]
+@ LDRH imm, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge r5, [r6, #7]
+@ LDRH imm, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge r5, [r6, #-8]
+
+@ LDRB reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrbge r0, [r7, r6]
+@ LDRB reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge r10, [r7, r6]
+@ LDRB imm, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrbge r0, [r7]
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrbge r0, [r7, #6]
+@ LDRB reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge r10, [r7, #6]
+@ LDRB reg, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge r0, [r7, #-6]
+
+@ LDRSH reg, encoding T1
+@ CHECK-NOT: [[@LINE+2]]:1: warning
+it ge
+ldrshge r7, [r0, r1]
+@ LDRSH reg, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge r7, [r0, r11]
+
+@ ADR, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adrge r1, #24
+@ ADR, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adrge r1, #-23
+@ ADR, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adrge r1, #23
+
+@ SXTH, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sxthge r4, r3
+@ SXTH, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sxthge r4, r9
+
+@ SXTB, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sxtbge r4, r5
+@ SXTB, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sxtbge r14, r5
+
+@ UXTH, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+uxthge r6, r5
+@ UXTH, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+uxthge r9, r5
+
+@ UXTB, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+uxtbge r6, r7
+@ UXTB, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+uxtbge r6, r9
+
+@ PUSH, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+pushge {r1, r3, r7}
+@ PUSH, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+pushge {r1, r13, r7}
+@ PUSH, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+pushge {r13}
+
+@ REV, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+revge r7, r6
+@ REV, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+revge r9, r6
+
+@ REV16, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rev16ge r7, r0
+@ REV16, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rev16ge r7, r10
+
+@ REVSH, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+revshge r1, r0
+@ REVSH, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+revshge r11, r0
+
+@ POP, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+popge {r1, r0, r5}
+@ POP, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+popge {r1, r5, r10}
+@ POP, encoding T3 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+popge {r10}
+
+@ NOP, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+nopge
+@ NOP, encoding T2 (32-bit) -- can only appear as NOP.W
+
+@ STM, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stmge r1!, {r2, r3}
+@ STM, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stmge r1, {r2, r3}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stmge r1!, {r2, r13}
+
+@ LDM, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldmge r4!, {r2, r3}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldmge r4, {r2, r3}
+@ LDM, encoding T2 (32-bit)
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldmge r14!, {r2, r3}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldmge r14, {r2, r3}
+
+@ SVC, encoding T1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+svcge #55
+
+@ B, encoding T2
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bge #2014
+
+@ The following Thumb instructions only have 32-bit encodings.
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strexge r0, r0, [pc]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r0], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r1], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r2], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r3], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r4], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r5], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r6], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r7], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r8], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r9], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r10], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r11], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r12], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [sp], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [lr], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [pc], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r0], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r1], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r2], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r3], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r4], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r5], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r6], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r7], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r8], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r9], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r10], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r11], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r12], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [sp], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [lr], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [pc], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r0, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r1, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r2, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r3, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r4, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r5, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r6, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r7, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r8, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r9, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r10, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r11, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r12, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [sp, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [lr, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [pc, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r0, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r1, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r2, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r3, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r4, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r5, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r6, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r7, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r8, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r9, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r10, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r11, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r12, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [sp, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [lr, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [pc, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [pc]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r0, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r1, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r2, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r3, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r4, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r5, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r6, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r7, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r8, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r9, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r10, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r11, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [r12, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [sp, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [lr, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strdge r0, r0, [pc, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andsge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicsge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movge.w r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrsge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movsge.w r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mvnge.w r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mvnsge.w r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorsge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, sp, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, sp, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcsge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcsge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, sp, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, sp, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r0], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r1], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r2], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r3], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r4], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r5], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r6], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r7], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r8], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r9], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r10], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r11], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r12], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [sp], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [lr], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [pc], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r0], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r1], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r2], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r3], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r4], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r5], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r6], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r7], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r8], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r9], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r10], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r11], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r12], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [sp], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [lr], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [pc], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r0, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r1, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r2, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r3, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r4, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r5, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r6, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r7, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r8, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r9, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r10, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r11, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, r12, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, sp, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, lr, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mcrrge p0, #0, r0, pc, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r0, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r1, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r2, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r3, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r4, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r5, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r6, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r7, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r8, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r9, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r10, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r11, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, r12, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, sp, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, lr, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mrrcge p14, #0, r0, pc, c0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r0], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r1], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r2], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r3], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r4], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r5], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r6], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r7], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r8], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r9], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r10], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r11], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r12], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [sp], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [lr], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [pc], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r0], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r1], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r2], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r3], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r4], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r5], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r6], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r7], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r8], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r9], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r10], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r11], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r12], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [sp], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [lr], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [pc], #-0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r0], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r1], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r2], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r3], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r4], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r5], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r6], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r7], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r8], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r9], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r10], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r11], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r12], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [sp], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [lr], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [pc], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r0], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r1], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r2], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r3], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r4], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r5], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r6], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r7], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r8], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r9], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r10], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r11], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r12], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [sp], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [lr], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [pc], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r0], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r1], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r2], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r3], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r4], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r5], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r6], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r7], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r8], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r9], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r10], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r11], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r12], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [sp], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [lr], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [pc], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r0], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r1], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r2], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r3], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r4], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r5], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r6], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r7], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r8], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r9], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r10], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r11], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r12], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [sp], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [lr], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [pc], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r0], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r1], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r2], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r3], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r4], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r5], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r6], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r7], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r8], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r9], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r10], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r11], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r12], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [sp], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [lr], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [pc], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r0], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r1], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r2], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r3], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r4], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r5], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r6], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r7], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r8], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r9], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r10], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r11], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r12], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [sp], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [lr], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [pc], {0}
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r0], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r1], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r2], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r3], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r4], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r5], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r6], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r7], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r8], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r9], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r10], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r11], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r12], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [sp], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [lr], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [pc], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r0], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r1], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r2], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r3], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r4], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r5], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r6], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r7], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r8], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r9], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r10], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r11], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r12], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [sp], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [lr], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [pc], #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r0, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r1, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r2, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r3, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r4, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r5, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r6, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r7, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r8, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r9, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r10, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r11, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r12, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [sp, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [lr, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [pc, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r0, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r1, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r2, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r3, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r4, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r5, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r6, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r7, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r8, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r9, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r10, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r11, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r12, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [sp, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [lr, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [pc, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r0, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r1, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r2, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r3, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r4, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r5, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r6, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r7, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r8, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r9, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r10, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r11, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r12, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [sp, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [lr, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [pc, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r0, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r1, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r2, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r3, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r4, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r5, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r6, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r7, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r8, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r9, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r10, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r11, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r12, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [sp, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [lr, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [pc, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r0, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r1, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r2, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r3, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r4, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r5, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r6, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r7, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r8, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r9, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r10, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r11, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r12, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [sp, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [lr, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [pc, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r0, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r1, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r2, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r3, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r4, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r5, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r6, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r7, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r8, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r9, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r10, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r11, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r12, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [sp, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [lr, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [pc, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r0, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r1, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r2, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r3, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r4, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r5, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r6, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r7, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r8, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r9, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r10, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r11, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r12, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [sp, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [lr, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [pc, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r0, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r1, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r2, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r3, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r4, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r5, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r6, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r7, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r8, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r9, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r10, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r11, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r12, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [sp, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [lr, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [pc, #-0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [pc]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [pc]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r0, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r1, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r2, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r3, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r4, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r5, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r6, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r7, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r8, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r9, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r10, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r11, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [r12, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [sp, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [lr, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stcge p0, c0, [pc, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r0, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r1, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r2, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r3, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r4, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r5, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r6, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r7, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r8, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r9, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r10, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r11, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [r12, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [sp, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [lr, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldcge p0, c0, [pc, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [pc]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [pc]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r0, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r1, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r2, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r3, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r4, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r5, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r6, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r7, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r8, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r9, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r10, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r11, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [r12, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [sp, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [lr, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+stclge p0, c0, [pc, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r0, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r1, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r2, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r3, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r4, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r5, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r6, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r7, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r8, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r9, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r10, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r11, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [r12, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [sp, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [lr, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldclge p0, c0, [pc, #0]!
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movge.w r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movsge.w r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mvnge r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mvnge r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, sp, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, sp, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, sp, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, sp, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, sp, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, pc, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #4096
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #8192
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #12288
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #16384
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #20480
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #24576
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #28672
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #32768
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #36864
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #40960
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #45056
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #49152
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #53248
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #57344
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #61440
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r1, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r2, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r3, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r4, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r5, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r6, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r7, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r8, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r9, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r10, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r11, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r12, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, sp, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, lr, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, pc, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #4096
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #8192
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #12288
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #16384
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #20480
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #24576
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #28672
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #32768
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #36864
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #40960
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #45056
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #49152
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #53248
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #57344
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #61440
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r2
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r3
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r4
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r5
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r6
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r7
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r8
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r9
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r10
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r11
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r12
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, lr
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r0, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r1, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r2, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r3, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r4, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r5, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r6, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r7, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r8, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r9, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r10, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r11, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r12, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, lr, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r0, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r1, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r2, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r3, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r4, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r5, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r6, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r7, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r8, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r9, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r10, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r11, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, r12, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfige r0, lr, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bfcge r0, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r2
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r3
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r4
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r5
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r6
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r7
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r8
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r9
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r10
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r11
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r12
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, lr
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r0, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r1, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r2, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r3, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r4, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r5, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r6, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r7, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r8, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r9, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r10, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r11, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r12, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, lr, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+andge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+bicge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movge.w r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+orrge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movsge.w r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mvnge r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ornge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mvnge r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+eorge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, sp, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addge.w r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, sp, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addsge.w r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+adcge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbcge r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, sp, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subge.w r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, sp, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subsge.w r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbge.w r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r0, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r1, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r2, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r3, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r4, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r5, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r6, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r7, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r8, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r9, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r10, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r11, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, r12, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+rsbsge.w r0, lr, #8388608
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r0, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r1, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r2, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r3, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r4, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r5, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r6, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r7, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r8, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r9, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r10, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r11, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, r12, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, sp, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, lr, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+addwge r0, pc, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #6144
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #10240
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #14336
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #18432
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #22528
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #26624
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #30720
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #34816
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #38912
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #43008
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #47104
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #51200
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #55296
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #59392
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movwge r0, #63488
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r0, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r1, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r2, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r3, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r4, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r5, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r6, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r7, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r8, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r9, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r10, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r11, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, r12, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, sp, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, lr, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+subwge r0, pc, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #2048
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #6144
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #10240
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #14336
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #18432
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #22528
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #26624
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #30720
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #34816
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #38912
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #43008
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #47104
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #51200
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #55296
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #59392
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+movtge r0, #63488
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r2
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r3
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r4
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r5
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r6
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r7
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r8
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r9
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r10
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r11
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, r12
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ssatge r0, #1, lr
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r0, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r1, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r2, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r3, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r4, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r5, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r6, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r7, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r8, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r9, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r10, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r11, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, r12, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+sbfxge r0, lr, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r2
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r3
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r4
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r5
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r6
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r7
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r8
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r9
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r10
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r11
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, r12
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+usatge r0, #0, lr
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r0, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r1, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r2, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r3, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r4, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r5, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r6, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r7, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r8, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r9, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r10, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r11, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, r12, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ubfxge r0, lr, #0, #1
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r0, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r1, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r2, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r3, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r4, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r5, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r6, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r7, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r8, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r9, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r10, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r11, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r12, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [sp, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [lr, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r0, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r1, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r2, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r3, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r4, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r5, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r6, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r7, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r8, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r9, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r10, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r11, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r12, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [sp, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [lr, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [pc, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r0, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r1, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r2, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r3, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r4, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r5, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r6, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r7, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r8, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r9, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r10, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r11, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r12, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [sp, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [lr, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r0, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r1, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r2, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r3, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r4, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r5, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r6, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r7, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r8, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r9, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r10, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r11, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r12, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [sp, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [lr, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [pc, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r0, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r1, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r2, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r3, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r4, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r5, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r6, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r7, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r8, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r9, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r10, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r11, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r12, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [sp, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [lr, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r0, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r1, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r2, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r3, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r4, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r5, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r6, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r7, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r8, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r9, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r10, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r11, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r12, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [sp, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [lr, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [pc, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strbge.w r0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrbge.w r0, [pc, #0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strhge.w r0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrhge.w r0, [pc, #0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+strge.w r0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrge.w r0, [pc, #0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r0, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r1, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r2, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r3, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r4, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r5, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r6, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r7, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r8, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r9, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r10, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r11, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r12, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [sp, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [lr, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [pc, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r0, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r1, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r2, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r3, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r4, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r5, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r6, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r7, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r8, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r9, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r10, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r11, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r12, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [sp, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [lr, r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [pc, #-0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrsbge.w r0, [pc, #0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r1]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r2]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r3]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r4]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r5]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r6]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r7]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r8]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r9]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r10]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r11]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [r12]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [sp]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [lr]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+ldrshge.w r0, [pc, #0]
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r1, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r2, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r3, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r4, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r5, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r6, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r7, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r8, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r9, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r10, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r11, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, r12, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+mlage r0, lr, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smullge r0, r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umullge r0, r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+smlalge r0, r0, lr, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r0, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r1, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r2, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r3, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r4, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r5, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r6, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r7, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r8, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r9, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r10, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r11, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, r12, r0
+@ CHECK: [[@LINE+2]]:1: warning: deprecated instruction in IT block
+it ge
+umlalge r0, r0, lr, r0
diff --git a/test/MC/ARM/v8fp.s b/test/MC/ARM/v8fp.s
deleted file mode 100644
index b12d7e2..0000000
--- a/test/MC/ARM/v8fp.s
+++ /dev/null
@@ -1,124 +0,0 @@
-@ RUN: llvm-mc -triple armv8 -mattr=+v8fp -show-encoding < %s | FileCheck %s
-
-@ VCVT{B,T}
-
-  vcvtt.f64.f16 d3, s1
-@ CHECK: vcvtt.f64.f16 d3, s1      @ encoding: [0xe0,0x3b,0xb2,0xee]
-  vcvtt.f16.f64 s5, d12
-@ CHECK: vcvtt.f16.f64 s5, d12     @ encoding: [0xcc,0x2b,0xf3,0xee]
-
-  vcvtb.f64.f16 d3, s1
-@ CHECK: vcvtb.f64.f16 d3, s1     @ encoding: [0x60,0x3b,0xb2,0xee]
-  vcvtb.f16.f64 s4, d1
-@ CHECK: vcvtb.f16.f64 s4, d1     @ encoding: [0x41,0x2b,0xb3,0xee]
-
-  vcvttge.f64.f16 d3, s1
-@ CHECK: vcvttge.f64.f16 d3, s1      @ encoding: [0xe0,0x3b,0xb2,0xae]
-  vcvttgt.f16.f64 s5, d12
-@ CHECK: vcvttgt.f16.f64 s5, d12     @ encoding: [0xcc,0x2b,0xf3,0xce]
-
-  vcvtbeq.f64.f16 d3, s1
-@ CHECK: vcvtbeq.f64.f16 d3, s1     @ encoding: [0x60,0x3b,0xb2,0x0e]
-  vcvtblt.f16.f64 s4, d1
-@ CHECK: vcvtblt.f16.f64 s4, d1     @ encoding: [0x41,0x2b,0xb3,0xbe]
-
-
-@ VCVT{A,N,P,M}
-
-  vcvta.s32.f32 s2, s3
-@ CHECK: vcvta.s32.f32 s2, s3     @ encoding: [0xe1,0x1a,0xbc,0xfe]
-  vcvta.s32.f64 s2, d3
-@ CHECK: vcvta.s32.f64 s2, d3     @ encoding: [0xc3,0x1b,0xbc,0xfe]
-  vcvtn.s32.f32 s6, s23
-@ CHECK: vcvtn.s32.f32 s6, s23     @ encoding: [0xeb,0x3a,0xbd,0xfe]
-  vcvtn.s32.f64 s6, d23
-@ CHECK: vcvtn.s32.f64 s6, d23     @ encoding: [0xe7,0x3b,0xbd,0xfe]
-  vcvtp.s32.f32 s0, s4
-@ CHECK: vcvtp.s32.f32 s0, s4     @ encoding: [0xc2,0x0a,0xbe,0xfe]
-  vcvtp.s32.f64 s0, d4
-@ CHECK: vcvtp.s32.f64 s0, d4     @ encoding: [0xc4,0x0b,0xbe,0xfe]
-  vcvtm.s32.f32 s17, s8
-@ CHECK: vcvtm.s32.f32 s17, s8     @ encoding: [0xc4,0x8a,0xff,0xfe]
-  vcvtm.s32.f64 s17, d8
-@ CHECK: vcvtm.s32.f64 s17, d8     @ encoding: [0xc8,0x8b,0xff,0xfe]
-
-  vcvta.u32.f32 s2, s3
-@ CHECK: vcvta.u32.f32 s2, s3     @ encoding: [0x61,0x1a,0xbc,0xfe]
-  vcvta.u32.f64 s2, d3
-@ CHECK: vcvta.u32.f64 s2, d3     @ encoding: [0x43,0x1b,0xbc,0xfe]
-  vcvtn.u32.f32 s6, s23
-@ CHECK: vcvtn.u32.f32 s6, s23     @ encoding: [0x6b,0x3a,0xbd,0xfe]
-  vcvtn.u32.f64 s6, d23
-@ CHECK: vcvtn.u32.f64 s6, d23     @ encoding: [0x67,0x3b,0xbd,0xfe]
-  vcvtp.u32.f32 s0, s4
-@ CHECK: vcvtp.u32.f32 s0, s4     @ encoding: [0x42,0x0a,0xbe,0xfe]
-  vcvtp.u32.f64 s0, d4
-@ CHECK: vcvtp.u32.f64 s0, d4     @ encoding: [0x44,0x0b,0xbe,0xfe]
-  vcvtm.u32.f32 s17, s8
-@ CHECK: vcvtm.u32.f32 s17, s8     @ encoding: [0x44,0x8a,0xff,0xfe]
-  vcvtm.u32.f64 s17, d8
-@ CHECK: vcvtm.u32.f64 s17, d8     @ encoding: [0x48,0x8b,0xff,0xfe]
-
-
-@ VSEL
-  vselge.f32 s4, s1, s23
-@ CHECK: vselge.f32 s4, s1, s23    @ encoding: [0xab,0x2a,0x20,0xfe]
-  vselge.f64 d30, d31, d23
-@ CHECK: vselge.f64 d30, d31, d23  @ encoding: [0xa7,0xeb,0x6f,0xfe]
-  vselgt.f32 s0, s1, s0
-@ CHECK: vselgt.f32 s0, s1, s0    @ encoding: [0x80,0x0a,0x30,0xfe]
-  vselgt.f64 d5, d10, d20
-@ CHECK: vselgt.f64 d5, d10, d20  @ encoding: [0x24,0x5b,0x3a,0xfe]
-  vseleq.f32 s30, s28, s23
-@ CHECK: vseleq.f32 s30, s28, s23 @ encoding: [0x2b,0xfa,0x0e,0xfe]
-  vseleq.f64 d2, d4, d8
-@ CHECK: vseleq.f64 d2, d4, d8    @ encoding: [0x08,0x2b,0x04,0xfe]
-  vselvs.f32 s21, s16, s14
-@ CHECK: vselvs.f32 s21, s16, s14 @ encoding: [0x07,0xaa,0x58,0xfe]
-  vselvs.f64 d0, d1, d31
-@ CHECK: vselvs.f64 d0, d1, d31   @ encoding: [0x2f,0x0b,0x11,0xfe]
-
-
-@ VMAXNM / VMINNM
-  vmaxnm.f32 s5, s12, s0
-@ CHECK: vmaxnm.f32 s5, s12, s0    @ encoding: [0x00,0x2a,0xc6,0xfe]
-  vmaxnm.f64 d5, d22, d30
-@ CHECK: vmaxnm.f64 d5, d22, d30   @ encoding: [0xae,0x5b,0x86,0xfe]
-  vminnm.f32 s0, s0, s12
-@ CHECK: vminnm.f32 s0, s0, s12    @ encoding: [0x46,0x0a,0x80,0xfe]
-  vminnm.f64 d4, d6, d9
-@ CHECK: vminnm.f64 d4, d6, d9     @ encoding: [0x49,0x4b,0x86,0xfe]
-
-@ VRINT{Z,R,X}
-
-  vrintzge.f64 d3, d12
-@ CHECK: vrintzge.f64 d3, d12   @ encoding: [0xcc,0x3b,0xb6,0xae]
-  vrintz.f32 s3, s24
-@ CHECK: vrintz.f32 s3, s24     @ encoding: [0xcc,0x1a,0xf6,0xee]
-  vrintrlt.f64 d5, d0
-@ CHECK: vrintrlt.f64 d5, d0    @ encoding: [0x40,0x5b,0xb6,0xbe]
-  vrintr.f32 s0, s9
-@ CHECK: vrintr.f32 s0, s9      @ encoding: [0x64,0x0a,0xb6,0xee]
-  vrintxeq.f64 d28, d30
-@ CHECK: vrintxeq.f64 d28, d30  @ encoding: [0x6e,0xcb,0xf7,0x0e]
-  vrintxvs.f32 s10, s14
-@ CHECK: vrintxvs.f32 s10, s14  @ encoding: [0x47,0x5a,0xb7,0x6e]
-
-@ VRINT{A,N,P,M}
-
-  vrinta.f64 d3, d4
-@ CHECK: vrinta.f64 d3, d4     @ encoding: [0x44,0x3b,0xb8,0xfe]
-  vrinta.f32 s12, s1
-@ CHECK: vrinta.f32 s12, s1    @ encoding: [0x60,0x6a,0xb8,0xfe]
-  vrintn.f64 d3, d4
-@ CHECK: vrintn.f64 d3, d4     @ encoding: [0x44,0x3b,0xb9,0xfe]
-  vrintn.f32 s12, s1
-@ CHECK: vrintn.f32 s12, s1    @ encoding: [0x60,0x6a,0xb9,0xfe]
-  vrintp.f64 d3, d4
-@ CHECK: vrintp.f64 d3, d4     @ encoding: [0x44,0x3b,0xba,0xfe]
-  vrintp.f32 s12, s1
-@ CHECK: vrintp.f32 s12, s1    @ encoding: [0x60,0x6a,0xba,0xfe]
-  vrintm.f64 d3, d4
-@ CHECK: vrintm.f64 d3, d4     @ encoding: [0x44,0x3b,0xbb,0xfe]
-  vrintm.f32 s12, s1
-@ CHECK: vrintm.f32 s12, s1    @ encoding: [0x60,0x6a,0xbb,0xfe]
diff --git a/test/MC/ARM/vfp4.s b/test/MC/ARM/vfp4.s
index 0a1fe92..8b1b0e0 100644
--- a/test/MC/ARM/vfp4.s
+++ b/test/MC/ARM/vfp4.s
@@ -1,9 +1,13 @@
 @ RUN: llvm-mc < %s -triple armv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4   | FileCheck %s --check-prefix=ARM
 @ RUN: llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=THUMB
-@ RUN: llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mcpu=cortex-m4 | FileCheck %s --check-prefix=THUMB_V7EM
+@ RUN: not llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mcpu=cortex-m4 > %t 2> %t2
+@ RUN:     FileCheck %s < %t --check-prefix=THUMB_V7EM
+@ RUN:     FileCheck %s < %t2 --check-prefix=THUMB_V7EM-ERRORS
 
 @ ARM: vfma.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xe2,0xee]
 @ THUMB: vfma.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xa1,0x0b]
+@ THUMB_V7EM-ERRORS: error: instruction requires: double precision VFP
+@ THUMB_V7EM-ERRORS-NEXT: vfma.f64 d16, d18, d17
 vfma.f64 d16, d18, d17
 
 @ ARM: vfma.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0xa2,0xee]
@@ -13,14 +17,20 @@ vfma.f32 s2, s4, s0
 
 @ ARM: vfma.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x42,0xf2]
 @ THUMB: vfma.f32 d16, d18, d17 @ encoding: [0x42,0xef,0xb1,0x0c]
+@ THUMB_V7EM-ERRORS: error: instruction requires: NEON
+@ THUMB_V7EM-ERRORS-NEXT: vfma.f32 d16, d18, d17
 vfma.f32 d16, d18, d17
 
 @ ARM: vfma.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x08,0xf2]
 @ THUMB: vfma.f32	q2, q4, q0 @ encoding: [0x08,0xef,0x50,0x4c]
+@ THUMB_V7EM-ERRORS: error: instruction requires: NEON
+@ THUMB_V7EM-ERRORS-NEXT: vfma.f32 q2, q4, q0
 vfma.f32 q2, q4, q0
 
 @ ARM: vfnma.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xd2,0xee]
 @ THUMB: vfnma.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xe1,0x0b]
+@ THUMB_V7EM-ERRORS: error: instruction requires: double precision VFP
+@ THUMB_V7EM-ERRORS-NEXT: vfnma.f64 d16, d18, d17
 vfnma.f64 d16, d18, d17
 
 @ ARM: vfnma.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0x92,0xee]
@@ -30,6 +40,8 @@ vfnma.f32 s2, s4, s0
 
 @ ARM: vfms.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xe2,0xee]
 @ THUMB: vfms.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xe1,0x0b]
+@ THUMB_V7EM-ERRORS: error: instruction requires: double precision VFP
+@ THUMB_V7EM-ERRORS-NEXT: vfms.f64 d16, d18, d17
 vfms.f64 d16, d18, d17
 
 @ ARM: vfms.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0xa2,0xee]
@@ -39,16 +51,23 @@ vfms.f32 s2, s4, s0
 
 @ ARM: vfms.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x62,0xf2]
 @ THUMB: vfms.f32 d16, d18, d17 @ encoding: [0x62,0xef,0xb1,0x0c]
+@ THUMB_V7EM-ERRORS: error: instruction requires: NEON
+@ THUMB_V7EM-ERRORS-NEXT: vfms.f32 d16, d18, d17
 vfms.f32 d16, d18, d17
 
 @ ARM: vfms.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x28,0xf2]
 @ THUMB: vfms.f32	q2, q4, q0 @ encoding: [0x28,0xef,0x50,0x4c]
+@ THUMB_V7EM-ERRORS: error: instruction requires: NEON
+@ THUMB_V7EM-ERRORS-NEXT: vfms.f32 q2, q4, q0
 vfms.f32 q2, q4, q0
 
 @ ARM: vfnms.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xd2,0xee]
 @ THUMB: vfnms.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xa1,0x0b]
+@ THUMB_V7EM-ERRORS: error: instruction requires: double precision VFP
+@ THUMB_V7EM-ERRORS-NEXT: vfnms.f64 d16, d18, d17
 vfnms.f64 d16, d18, d17
 
 @ ARM: vfnms.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0x92,0xee]
 @ THUMB: vfnms.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x00,0x1a]
+@ THUMB_V7EM: vfnms.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x00,0x1a]
 vfnms.f32 s2, s4, s0
diff --git a/test/MC/ARM/xscale-attributes.ll b/test/MC/ARM/xscale-attributes.ll
index d1e9931..718fd8f 100644
--- a/test/MC/ARM/xscale-attributes.ll
+++ b/test/MC/ARM/xscale-attributes.ll
@@ -26,13 +26,14 @@ entry:
 ; OBJ-NEXT:     ]
 ; OBJ-NEXT:     Address: 0x0
 ; OBJ-NEXT:     Offset: 0x38
-; OBJ-NEXT:     Size: 32
+; OBJ-NEXT:     Size: 40
 ; OBJ-NEXT:     Link: 0
 ; OBJ-NEXT:     Info: 0
 ; OBJ-NEXT:     AddressAlignment: 1
 ; OBJ-NEXT:     EntrySize: 0
 ; OBJ-NEXT:     SectionData (
-; OBJ-NEXT:       0000: 411F0000 00616561 62690001 15000000
-; OBJ-NEXT:       0010: 06050801 09011401 15011703 18011901
+; OBJ-NEXT:       0000: 41270000 00616561 62690001 1D000000
+; OBJ-NEXT:       0010: 05585343 414C4500 06050801 09011401
+; OBJ-NEXT:       0020: 15011703 18011901
 ; OBJ-NEXT:     )
 ; OBJ-NEXT:   }
diff --git a/test/MC/AsmParser/cfi-window-save.s b/test/MC/AsmParser/cfi-window-save.s
new file mode 100644
index 0000000..c309436
--- /dev/null
+++ b/test/MC/AsmParser/cfi-window-save.s
@@ -0,0 +1,15 @@
+# RUN: llvm-mc -filetype=asm -triple x86_64-pc-linux-gnu <%s | FileCheck %s
+
+# Should use SPARC as the target to test this. However, SPARC does not support
+# asm parsing yet.
+
+# CHECK: .cfi_window_save
+
+
+f:
+        .cfi_startproc
+        nop
+        .cfi_window_save
+        nop
+        .cfi_endproc
+
diff --git a/test/MC/AsmParser/directive_file.s b/test/MC/AsmParser/directive_file.s
index 121890e..9b99e0f 100644
--- a/test/MC/AsmParser/directive_file.s
+++ b/test/MC/AsmParser/directive_file.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
 
         .file "hello"
-        .file 1 "world"
+        .file 1 "worl\144"   # "\144" is "d"
         .file 2 "directory" "file"
 
 # CHECK: .file "hello"
diff --git a/test/MC/AsmParser/directive_fill.s b/test/MC/AsmParser/directive_fill.s
index 60bd468..bb3ced0 100644
--- a/test/MC/AsmParser/directive_fill.s
+++ b/test/MC/AsmParser/directive_fill.s
@@ -15,3 +15,19 @@ TEST1:
 # CHECK: .quad 4
 TEST2:  
         .fill 1, 8, 4
+
+# CHECK: TEST3
+# CHECK: .byte 0
+# CHECK: .byte 0
+# CHECK: .byte 0
+# CHECK: .byte 0
+TEST3:
+	.fill 4
+
+# CHECK: TEST4
+# CHECK: .short 0
+# CHECK: .short 0
+# CHECK: .short 0
+# CHECK: .short 0
+TEST4:
+	.fill 4, 2
diff --git a/test/MC/AsmParser/directive_incbin.s b/test/MC/AsmParser/directive_incbin.s
index 55f9c79..ed4e27a 100644
--- a/test/MC/AsmParser/directive_incbin.s
+++ b/test/MC/AsmParser/directive_incbin.s
@@ -1,6 +1,6 @@
 # RUN: llvm-mc -triple i386-unknown-unknown %s -I %p | FileCheck %s
 
 .data
-.incbin "incbin_abcd"
+.incbin "incbin\137abcd"  # "\137" is underscore "_"
 
 # CHECK: .ascii	 "abcd\n"
diff --git a/test/MC/AsmParser/directive_include.s b/test/MC/AsmParser/directive_include.s
index fabd941..f53bc67 100644
--- a/test/MC/AsmParser/directive_include.s
+++ b/test/MC/AsmParser/directive_include.s
@@ -5,5 +5,5 @@
 # CHECK: a = 0
 # CHECK: TESTB:
 TESTA:  
-	.include       "directive_set.s"
+	.include       "directive\137set.s"   # "\137" is underscore "_"
 TESTB:
diff --git a/test/MC/AsmParser/directive_loc.s b/test/MC/AsmParser/directive_loc.s
index 164d42a..700a32c 100644
--- a/test/MC/AsmParser/directive_loc.s
+++ b/test/MC/AsmParser/directive_loc.s
@@ -6,3 +6,4 @@
         .loc 1 2
         .loc 1 2 3
         .loc 1 2 discriminator 1
+        .loc 1 0
diff --git a/test/MC/AsmParser/floating-literals.s b/test/MC/AsmParser/floating-literals.s
index d44bb98..6578e32 100644
--- a/test/MC/AsmParser/floating-literals.s
+++ b/test/MC/AsmParser/floating-literals.s
@@ -1,4 +1,5 @@
-# RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+# RUN: not llvm-mc -triple i386-unknown-unknown %s 2> /dev/null | FileCheck %s
+# RUN: not llvm-mc -triple i386-unknown-unknown %s 2>&1 > /dev/null| FileCheck %s --check-prefix=CHECK-ERROR
 
 # CHECK: .long	1067412619
 # CHECK: .long	1075000115
@@ -42,3 +43,40 @@
 // APFloat should reject these with an error, not crash:
 //.double -1.2e+
 //.double -1.2e
+
+# CHECK: .long 1310177520
+.float 0x12f7.1ep+17
+# CHECK: .long 1084227584
+.float 0x.ap+3
+# CHECK: .quad 4602678819172646912
+.double 0x2.p-2
+# CHECK: .long 1094713344
+.float 0x3p2
+# CHECK: .long 872284160
+.float 0x7fp-30
+# CHECK: .long 3212836864
+.float -0x1.0p0
+
+# CHECK-ERROR: invalid hexadecimal floating-point constant: expected at least one exponent digit
+# CHECK-ERROR: unexpected token in directive
+.float 0xa.apa
+
+# CHECK-ERROR: invalid hexadecimal floating-point constant: expected at least one exponent digit
+# CHECK-ERROR: unexpected token in directive
+.double -0x1.2p+
+
+# CHECK-ERROR: invalid hexadecimal floating-point constant: expected at least one exponent digit
+# CHECK-ERROR: unexpected token in directive
+.double -0x1.2p
+
+# CHECK-ERROR: invalid hexadecimal floating-point constant: expected at least one significand digit
+# CHECK-ERROR: unexpected token in directive
+.float 0xp2
+
+# CHECK-ERROR: invalid hexadecimal floating-point constant: expected at least one significand digit
+# CHECK-ERROR: unexpected token in directive
+.float 0x.p5
+
+# CHECK-ERROR: error: invalid hexadecimal floating-point constant: expected exponent part 'p'
+# CHECK-ERROR: unexpected token in directive
+.float 0x1.2
diff --git a/test/MC/AsmParser/lit.local.cfg b/test/MC/AsmParser/lit.local.cfg
index 6c49f08..ba763cf 100644
--- a/test/MC/AsmParser/lit.local.cfg
+++ b/test/MC/AsmParser/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/MC/COFF/bss_section.ll b/test/MC/COFF/bss_section.ll
new file mode 100644
index 0000000..60924f1
--- /dev/null
+++ b/test/MC/COFF/bss_section.ll
@@ -0,0 +1,6 @@
+; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck %s
+
+%struct.foo = type { i32, i32 }
+
+@"\01?thingy@@3Ufoo@@B" = global %struct.foo zeroinitializer, align 4
+; CHECK: .bss
diff --git a/test/MC/COFF/eh-frame.s b/test/MC/COFF/eh-frame.s
new file mode 100644
index 0000000..e606b76
--- /dev/null
+++ b/test/MC/COFF/eh-frame.s
@@ -0,0 +1,14 @@
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s -o - | llvm-readobj -s | FileCheck %s
+
+	.def	 _main;
+	.scl	2;
+	.type	32;
+	.endef
+	.text
+	.globl	_main
+_main:
+	.cfi_startproc
+	ret
+	.cfi_endproc
+
+// CHECK:    Name: .eh_frame
diff --git a/test/MC/COFF/feat00.s b/test/MC/COFF/feat00.s
new file mode 100644
index 0000000..d08f407
--- /dev/null
+++ b/test/MC/COFF/feat00.s
@@ -0,0 +1,14 @@
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s -o - | llvm-readobj -t | FileCheck %s
+
+"@feat.00" = 123
+.globl @feat.00
+
+// CHECK: Symbol {
+// CHECK:   Name: @feat.00
+// CHECK:   Value: 123
+// CHECK:   Section: (-1)
+// CHECK:   BaseType: Null (0x0)
+// CHECK:   ComplexType: Null (0x0)
+// CHECK:   StorageClass: External (0x2)
+// CHECK:   AuxSymbolCount: 0
+// CHECK: }
diff --git a/test/MC/COFF/lit.local.cfg b/test/MC/COFF/lit.local.cfg
index 41a8434..ba763cf 100644
--- a/test/MC/COFF/lit.local.cfg
+++ b/test/MC/COFF/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.s', '.ll']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/MC/COFF/lset0.s b/test/MC/COFF/lset0.s
new file mode 100755
index 0000000..f5020c8
--- /dev/null
+++ b/test/MC/COFF/lset0.s
@@ -0,0 +1,12 @@
+// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s -o - | llvm-nm | FileCheck %s
+
+not_global = 123
+global = 456
+.globl global
+.Llocal = 789
+
+// CHECK-NOT: not_global
+// CHECK-NOT: Llocal
+// CHECK: global
+// CHECK-NOT: not_global
+// CHECK-NOT: Llocal
diff --git a/test/MC/COFF/rdata.ll b/test/MC/COFF/rdata.ll
new file mode 100644
index 0000000..f041781
--- /dev/null
+++ b/test/MC/COFF/rdata.ll
@@ -0,0 +1,6 @@
+; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck %s
+
+%struct.foo = type { i32, i32 }
+
+@"\01?thingy@@3Ufoo@@B" = constant %struct.foo zeroinitializer, align 4
+; CHECK: .section        .rdata
diff --git a/test/MC/COFF/section-comdat.s b/test/MC/COFF/section-comdat.s
new file mode 100644
index 0000000..dd5be87
--- /dev/null
+++ b/test/MC/COFF/section-comdat.s
@@ -0,0 +1,188 @@
+// RUN: llvm-mc -triple i386-pc-win32 -filetype=obj %s | llvm-readobj -s -t | FileCheck %s
+// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | llvm-readobj -s -t | FileCheck %s
+
+.section assocSec
+.linkonce
+.long 1
+
+.section secName, "dr", discard, "Symbol1"
+.globl Symbol1
+Symbol1:
+.long 1
+
+.section secName, "dr", one_only, "Symbol2"
+.globl Symbol2
+Symbol2:
+.long 1
+
+.section SecName, "dr", same_size, "Symbol3"
+.globl Symbol3
+Symbol3:
+.long 1
+
+.section SecName, "dr", same_contents, "Symbol4"
+.globl Symbol4
+Symbol4:
+.long 1
+
+.section SecName, "dr", associative assocSec, "Symbol5"
+.globl Symbol5
+Symbol5:
+.long 1
+
+.section SecName, "dr", largest, "Symbol6"
+.globl Symbol6
+Symbol6:
+.long 1
+
+.section SecName, "dr", newest, "Symbol7"
+.globl Symbol7
+Symbol7:
+.long 1
+
+// CHECK: Sections [
+// CHECK:   Section {
+// CHECK:     Number: 1
+// CHECK:     Name: assocSec
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Number: 2
+// CHECK:     Name: secName
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Number: 3
+// CHECK:     Name: secName
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Number: 4
+// CHECK:     Name: SecName
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Number: 5
+// CHECK:     Name: SecName
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Number: 6
+// CHECK:     Name: SecName
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Number: 7
+// CHECK:     Name: SecName
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:   }
+// CHECK:   Section {
+// CHECK:     Number: 8
+// CHECK:     Name: SecName
+// CHECK:     Characteristics [
+// CHECK:       IMAGE_SCN_LNK_COMDAT
+// CHECK:     ]
+// CHECK:   }
+// CHECK: ]
+// CHECK: Symbols [
+// CHECK:   Symbol {
+// CHECK:     Name: assocSec
+// CHECK:     Section: assocSec (1)
+// CHECK:     AuxSectionDef {
+// CHECK:       Selection: Any
+// CHECK:     }
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: secName
+// CHECK:     Section: secName (2)
+// CHECK:     AuxSectionDef {
+// CHECK:       Selection: Any
+// CHECK:     }
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: secName
+// CHECK:     Section: secName (3)
+// CHECK:     AuxSectionDef {
+// CHECK:       Selection: NoDuplicates
+// CHECK:     }
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: SecName
+// CHECK:     Section: SecName (4)
+// CHECK:     AuxSectionDef {
+// CHECK:       Selection: SameSize
+// CHECK:     }
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: SecName
+// CHECK:     Section: SecName (5)
+// CHECK:     AuxSymbolCount: 1
+// CHECK:     AuxSectionDef {
+// CHECK:       Selection: ExactMatch
+// CHECK:     }
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: SecName
+// CHECK:     Section: SecName (6)
+// CHECK:     AuxSectionDef {
+// CHECK:       Selection: Associative
+// CHECK:       AssocSection: assocSec (1)
+// CHECK:     }
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: SecName
+// CHECK:     Section: SecName (7)
+// CHECK:     AuxSectionDef {
+// CHECK:       Selection: Largest
+// CHECK:     }
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: SecName
+// CHECK:     Section: SecName (8)
+// CHECK:     AuxSectionDef {
+// CHECK:       Selection: Newest (0x7)
+// CHECK:     }
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: Symbol1
+// CHECK:     Section: secName (2)
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: Symbol2
+// CHECK:     Section: secName (3)
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: Symbol3
+// CHECK:     Section: SecName (4)
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: Symbol4
+// CHECK:     Section: SecName (5)
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: Symbol5
+// CHECK:     Section: SecName (6)
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: Symbol6
+// CHECK:     Section: SecName (7)
+// CHECK:   }
+// CHECK:   Symbol {
+// CHECK:     Name: Symbol7
+// CHECK:     Section: SecName (8)
+// CHECK:   }
+// CHECK: ]
diff --git a/test/MC/COFF/seh-align1.s b/test/MC/COFF/seh-align1.s
new file mode 100644
index 0000000..aafc6ed
--- /dev/null
+++ b/test/MC/COFF/seh-align1.s
@@ -0,0 +1,65 @@
+// This test checks the alignment and padding of the unwind info.
+
+// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | llvm-readobj -s -sd -sr -u | FileCheck %s
+
+// CHECK:      Sections [
+// CHECK:        Section {
+// CHECK:          Name: .xdata
+// CHECK:          RawDataSize: 8
+// CHECK:          RelocationCount: 0
+// CHECK:          Characteristics [
+// CHECK-NEXT:       ALIGN_4BYTES
+// CHECK-NEXT:       CNT_INITIALIZED_DATA
+// CHECK-NEXT:       MEM_READ
+// CHECK-NEXT:     ]
+// CHECK:          Relocations [
+// CHECK-NEXT:     ]
+// CHECK:          SectionData (
+// CHECK-NEXT:       0000: 01000000 00000000
+// CHECK-NEXT:     )
+// CHECK-NEXT:   }
+// CHECK:        Section {
+// CHECK:          Name: .pdata
+// CHECK:          RawDataSize: 12
+// CHECK:          RelocationCount: 3
+// CHECK:          Characteristics [
+// CHECK-NEXT:       IMAGE_SCN_ALIGN_4BYTES
+// CHECK-NEXT:       IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK-NEXT:       IMAGE_SCN_MEM_READ
+// CHECK-NEXT:     ]
+// CHECK:          Relocations [
+// CHECK-NEXT:       [[BeginDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB smallFunc
+// CHECK-NEXT:       [[EndDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB smallFunc
+// CHECK-NEXT:       [[UnwindDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB .xdata
+// CHECK-NEXT:     ]
+// CHECK:          SectionData (
+// CHECK-NEXT:       0000: 00000000 01000000 00000000
+// CHECK-NEXT:     )
+// CHECK-NEXT:   }
+// CHECK:        UnwindInformation [
+// CHECK-NEXT:     RuntimeFunction {
+// CHECK-NEXT:     StartAddress: smallFunc {{(\+0x[A-F0-9]+ )?}}([[BeginDisp]])
+// CHECK-NEXT:     EndAddress: smallFunc {{(\+0x[A-F0-9]+ )?}}([[EndDisp]])
+// CHECK-NEXT:     UnwindInfoAddress: .xdata {{(\+0x[A-F0-9]+ )?}}([[UnwindDisp]])
+// CHECK-NEXT:     UnwindInfo {
+// CHECK-NEXT:       Version: 1
+// CHECK-NEXT:       Flags [
+// CHECK-NEXT:       ]
+// CHECK-NEXT:       PrologSize: 0
+// CHECK-NEXT:       FrameRegister: -
+// CHECK-NEXT:       FrameOffset: -
+// CHECK-NEXT:       UnwindCodeCount: 0
+// CHECK-NEXT:       UnwindCodes [
+// CHECK-NEXT:       ]
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+// Generate the minimal unwind info.
+// It contains only the version set to 1. All other bytes are 0.
+    .globl smallFunc
+    .def smallFunc; .scl 2; .type 32; .endef
+    .seh_proc smallFunc
+smallFunc:
+    ret
+    .seh_endproc
diff --git a/test/MC/COFF/seh-align2.s b/test/MC/COFF/seh-align2.s
new file mode 100644
index 0000000..5e6c49a
--- /dev/null
+++ b/test/MC/COFF/seh-align2.s
@@ -0,0 +1,78 @@
+// This test checks the alignment and padding of the unwind info.
+
+// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | llvm-readobj -s -sd -sr -u | FileCheck %s
+
+// CHECK:      Sections [
+// CHECK:        Section {
+// CHECK:          Name: .xdata
+// CHECK:          RawDataSize: 16
+// CHECK:          RelocationCount: 1
+// CHECK:          Characteristics [
+// CHECK-NEXT:       ALIGN_4BYTES
+// CHECK-NEXT:       CNT_INITIALIZED_DATA
+// CHECK-NEXT:       MEM_READ
+// CHECK-NEXT:     ]
+// CHECK:          Relocations [
+// CHECK-NEXT:       [[HandlerDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB __C_specific_handler
+// CHECK-NEXT:     ]
+// CHECK:          SectionData (
+// CHECK-NEXT:       0000: 09000100 04220000 00000000 BEBAFECA
+// CHECK-NEXT:     )
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Section {
+// CHECK:          Name: .pdata
+// CHECK:          RawDataSize: 12
+// CHECK:          RelocationCount: 3
+// CHECK:          Characteristics [
+// CHECK-NEXT:       IMAGE_SCN_ALIGN_4BYTES
+// CHECK-NEXT:       IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK-NEXT:       IMAGE_SCN_MEM_READ
+// CHECK-NEXT:     ]
+// CHECK:          Relocations [
+// CHECK-NEXT:       [[BeginDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB func
+// CHECK-NEXT:       [[EndDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB func
+// CHECK-NEXT:       [[UnwindDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB .xdata
+// CHECK-NEXT:     ]
+// CHECK:          SectionData (
+// CHECK-NEXT:       0000: FCFFFFFF 05000000 00000000
+// CHECK-NEXT:     )
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+// CHECK:      UnwindInformation [
+// CHECK-NEXT:   RuntimeFunction {
+// CHECK-NEXT:     StartAddress: func {{(\+0x[A-F0-9]+ )?}}([[BeginDisp]])
+// CHECK-NEXT:     EndAddress: func {{(\+0x[A-F0-9]+ )?}}([[EndDisp]])
+// CHECK-NEXT:     UnwindInfoAddress: .xdata {{(\+0x[A-F0-9]+ )?}}([[UnwindDisp]])
+// CHECK-NEXT:     UnwindInfo {
+// CHECK-NEXT:       Version: 1
+// CHECK-NEXT:       Flags [
+// CHECK-NEXT:         ExceptionHandler
+// CHECK-NEXT:       ]
+// CHECK-NEXT:       PrologSize: 0
+// CHECK-NEXT:       FrameRegister: -
+// CHECK-NEXT:       FrameOffset: -
+// CHECK-NEXT:       UnwindCodeCount: 1
+// CHECK-NEXT:       UnwindCodes [
+// CHECK-NEXT:         0x04: ALLOC_SMALL size=24
+// CHECK-NEXT:       ]
+// CHECK-NEXT:       Handler: __C_specific_handler ([[HandlerDisp]])
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+// Generates only one unwind code.
+// Requires padding of the unwind code array.
+    .globl func
+    .def func; .scl 2; .type 32; .endef
+    .seh_proc func
+    subq $24, %rsp
+    .seh_stackalloc 24
+    .seh_handler __C_specific_handler, @except
+    .seh_handlerdata
+    .long 0xcafebabe
+    .text
+    .seh_endprologue
+func:
+    addq $24, %rsp
+    ret
+    .seh_endproc
diff --git a/test/MC/COFF/seh-align3.s b/test/MC/COFF/seh-align3.s
new file mode 100644
index 0000000..238b5de
--- /dev/null
+++ b/test/MC/COFF/seh-align3.s
@@ -0,0 +1,83 @@
+// This test checks the alignment and padding of the unwind info.
+
+// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | llvm-readobj -s -sd -sr -u | FileCheck %s
+
+// CHECK:      Sections [
+// CHECK:        Section {
+// CHECK:          Name: .xdata
+// CHECK:          RawDataSize: 16
+// CHECK:          RelocationCount: 1
+// CHECK:          Characteristics [
+// CHECK-NEXT:       ALIGN_4BYTES
+// CHECK-NEXT:       CNT_INITIALIZED_DATA
+// CHECK-NEXT:       MEM_READ
+// CHECK-NEXT:     ]
+// CHECK:          Relocations [
+// CHECK-NEXT:       [[HandlerDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB __C_specific_handler
+// CHECK-NEXT:     ]
+// CHECK:          SectionData (
+// CHECK-NEXT:       0000: 19000200 04D002C0 00000000 BEBAFECA
+// CHECK-NEXT:     )
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Section {
+// CHECK:          Name: .pdata
+// CHECK:          RawDataSize: 12
+// CHECK:          RelocationCount: 3
+// CHECK:          Characteristics [
+// CHECK-NEXT:       IMAGE_SCN_ALIGN_4BYTES
+// CHECK-NEXT:       IMAGE_SCN_CNT_INITIALIZED_DATA
+// CHECK-NEXT:       IMAGE_SCN_MEM_READ
+// CHECK-NEXT:     ]
+// CHECK:          Relocations [
+// CHECK-NEXT:       [[BeginDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB func
+// CHECK-NEXT:       [[EndDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB func
+// CHECK-NEXT:       [[UnwindDisp:0x[A-F0-9]+]] IMAGE_REL_AMD64_ADDR32NB .xdata
+// CHECK-NEXT:     ]
+// CHECK:          SectionData (
+// CHECK-NEXT:       0000: FCFFFFFF 05000000 00000000
+// CHECK-NEXT:     )
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+// CHECK:      UnwindInformation [
+// CHECK-NEXT:   RuntimeFunction {
+// CHECK-NEXT:     StartAddress: func {{(\+0x[A-F0-9]+ )?}}([[BeginDisp]])
+// CHECK-NEXT:     EndAddress: func {{(\+0x[A-F0-9]+ )?}}([[EndDisp]])
+// CHECK-NEXT:     UnwindInfoAddress: .xdata {{(\+0x[A-F0-9]+ )?}}([[UnwindDisp]])
+// CHECK-NEXT:     UnwindInfo {
+// CHECK-NEXT:       Version: 1
+// CHECK-NEXT:       Flags [
+// CHECK-NEXT:         ExceptionHandler
+// CHECK-NEXT:         TerminateHandler
+// CHECK-NEXT:       ]
+// CHECK-NEXT:       PrologSize: 0
+// CHECK-NEXT:       FrameRegister: -
+// CHECK-NEXT:       FrameOffset: -
+// CHECK-NEXT:       UnwindCodeCount: 2
+// CHECK-NEXT:       UnwindCodes [
+// CHECK-NEXT:         0x04: PUSH_NONVOL reg=R13
+// CHECK-NEXT:         0x02: PUSH_NONVOL reg=R12
+// CHECK-NEXT:       ]
+// CHECK-NEXT:       Handler: __C_specific_handler ([[HandlerDisp]])
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+// Generates two unwind codes.
+// Requires no padding of the unwind code array.
+    .globl func
+    .def func; .scl 2; .type 32; .endef
+    .seh_proc func
+    push %r12
+    .seh_pushreg 12
+    push %r13
+    .seh_pushreg 13
+    .seh_handler __C_specific_handler, @except, @unwind
+    .seh_handlerdata
+    .long 0xcafebabe
+    .text
+    .seh_endprologue
+func:
+    pop %r13
+    pop %r12
+    ret
+    .seh_endproc
diff --git a/test/MC/COFF/seh.s b/test/MC/COFF/seh.s
index bef425e..72d42f4 100644
--- a/test/MC/COFF/seh.s
+++ b/test/MC/COFF/seh.s
@@ -1,8 +1,6 @@
 // This test checks that the SEH directives emit the correct unwind data.
 
-// TODO: Expected fail because SET_FPREG has a wrong offset.
-// XFAIL: *
-// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | llvm-readobj -s -u | FileCheck %s
+// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | llvm-readobj -s -u -r | FileCheck %s
 
 // CHECK:      Sections [
 // CHECK:        Section {
@@ -36,6 +34,27 @@
 // CHECK-NEXT:   }
 // CHECK-NEXT: ]
 
+// CHECK-NEXT: Relocations [
+// CHECK-NEXT:   Section (2) .xdata {
+// CHECK-NEXT:     0x14 IMAGE_REL_AMD64_ADDR32NB __C_specific_handler
+// CHECK-NEXT:     0x20 IMAGE_REL_AMD64_ADDR32NB func
+// CHECK-NEXT:     0x24 IMAGE_REL_AMD64_ADDR32NB func
+// CHECK-NEXT:     0x28 IMAGE_REL_AMD64_ADDR32NB .xdata
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Section (3) .pdata {
+// CHECK-NEXT:     0x0 IMAGE_REL_AMD64_ADDR32NB func
+// CHECK-NEXT:     0x4 IMAGE_REL_AMD64_ADDR32NB func
+// CHECK-NEXT:     0x8 IMAGE_REL_AMD64_ADDR32NB .xdata
+// CHECK-NEXT:     0xC IMAGE_REL_AMD64_ADDR32NB func
+// CHECK-NEXT:     0x10 IMAGE_REL_AMD64_ADDR32NB func
+// CHECK-NEXT:     0x14 IMAGE_REL_AMD64_ADDR32NB .xdata
+// CHECK-NEXT:     0x18 IMAGE_REL_AMD64_ADDR32NB smallFunc
+// CHECK-NEXT:     0x1C IMAGE_REL_AMD64_ADDR32NB smallFunc
+// CHECK-NEXT:     0x20 IMAGE_REL_AMD64_ADDR32NB .xdata
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+
 // CHECK:      UnwindInformation [
 // CHECK-NEXT:   RuntimeFunction {
 // CHECK-NEXT:     StartAddress: [[CodeSect1:[^ ]+]] [[BeginDisp1:(\+0x[A-F0-9]+)?]]
diff --git a/test/MC/COFF/tricky-names.ll b/test/MC/COFF/tricky-names.ll
new file mode 100644
index 0000000..6e041d3
--- /dev/null
+++ b/test/MC/COFF/tricky-names.ll
@@ -0,0 +1,38 @@
+; Check how tricky symbols are printed in the asm output.
+; RUN: llc -mtriple=i686-pc-win32 %s -o - | FileCheck %s --check-prefix=ASM
+
+; Check that we can roundtrip these names through our assembler.
+; RUN: llc -mtriple=i686-pc-win32 %s -o - | llvm-mc -triple i686-pc-win32 -filetype=obj | llvm-readobj -t | FileCheck %s --check-prefix=READOBJ
+
+
+@"\01??__E_Generic_object@?$_Error_objects@H@std@@YAXXZ" = global i32 0
+@"\01__ZL16ExceptionHandlerP19_EXCEPTION_POINTERS@4" = global i32 0
+@"\01@foo.bar" = global i32 0
+
+define weak i32 @"\01??_B?$num_put@_WV?$back_insert_iterator@V?$basic_string@_WU?$char_traits@_W@std@@V?$allocator@_W@2@@std@@@std@@@std@@51"() section ".text" {
+  %a = load i32* @"\01??__E_Generic_object@?$_Error_objects@H@std@@YAXXZ"
+  %b = load i32* @"\01__ZL16ExceptionHandlerP19_EXCEPTION_POINTERS@4"
+  %c = load i32* @"\01@foo.bar"
+  %x = add i32 %a, %b
+  %y = add i32 %x, %c
+  ret i32 %y
+}
+
+; Check that these symbols are not quoted. They occur in output that gets passed to GAS.
+; ASM: .globl __ZL16ExceptionHandlerP19_EXCEPTION_POINTERS@4
+; ASM-NOT: .globl "__ZL16ExceptionHandlerP19_EXCEPTION_POINTERS@4"
+; ASM: .globl @foo.bar
+; ASM-NOT: .globl "@foo.bar"
+
+; READOBJ: Symbol
+; READOBJ: Name: .text$??_B?$num_put@_WV?$back_insert_iterator@V?$basic_string@_WU?$char_traits@_W@std@@V?$allocator@_W@2@@std@@@std@@@std@@51
+; READOBJ: Section: .text$??_B?$num_put@_WV?$back_insert_iterator@V?$basic_string@_WU?$char_traits@_W@std@@V?$allocator@_W@2@@std@@@std@@@std@@51
+; READOBJ: Symbol
+; READOBJ: Name: ??_B?$num_put@_WV?$back_insert_iterator@V?$basic_string@_WU?$char_traits@_W@std@@V?$allocator@_W@2@@std@@@std@@@std@@51
+; READOBJ: Section: .text$??_B?$num_put@_WV?$back_insert_iterator@V?$basic_string@_WU?$char_traits@_W@std@@V?$allocator@_W@2@@std@@@std@@@std@@51
+; READOBJ: Symbol
+; READOBJ: Name: ??__E_Generic_object@?$_Error_objects@H@std@@YAXXZ
+; READOBJ: Symbol
+; READOBJ: Name: __ZL16ExceptionHandlerP19_EXCEPTION_POINTERS@4
+; READOBJ: Symbol
+; READOBJ: Name: @foo.bar
diff --git a/test/MC/Disassembler/AArch64/a64-ignored-fields.txt b/test/MC/Disassembler/AArch64/a64-ignored-fields.txt
index 966530d..799ecdf 100644
--- a/test/MC/Disassembler/AArch64/a64-ignored-fields.txt
+++ b/test/MC/Disassembler/AArch64/a64-ignored-fields.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=aarch64 -disassemble -show-encoding < %s | FileCheck %s
+# RUN: llvm-mc -triple=aarch64 -mattr=fp-armv8 -disassemble -show-encoding < %s | FileCheck %s
 
 # The "Rm" bits are ignored, but the canonical representation has them filled
 # with 0s. This is what we should produce even if the input bit-pattern had
diff --git a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
index 4fa2d50..40926b1 100644
--- a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
+++ b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=aarch64 -disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple=aarch64 -mattr=+fp-armv8 -disassemble < %s | FileCheck %s
 
 #------------------------------------------------------------------------------
 # Add/sub (immediate)
diff --git a/test/MC/Disassembler/AArch64/basic-a64-unpredictable.txt b/test/MC/Disassembler/AArch64/basic-a64-unpredictable.txt
index adb8f75..5363863 100644
--- a/test/MC/Disassembler/AArch64/basic-a64-unpredictable.txt
+++ b/test/MC/Disassembler/AArch64/basic-a64-unpredictable.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=aarch64 -disassemble < %s 2>&1 | FileCheck %s
+# RUN: llvm-mc -triple=aarch64 -mattr=+fp-armv8 -disassemble < %s 2>&1 | FileCheck %s
 
 #------------------------------------------------------------------------------
 # Load-store exclusive
diff --git a/test/MC/Disassembler/AArch64/ldp-postind.predictable.txt b/test/MC/Disassembler/AArch64/ldp-postind.predictable.txt
index 775660b..637ebdb 100644
--- a/test/MC/Disassembler/AArch64/ldp-postind.predictable.txt
+++ b/test/MC/Disassembler/AArch64/ldp-postind.predictable.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=aarch64 -disassemble < %s 2>&1 | FileCheck %s
+# RUN: llvm-mc -triple=aarch64 -mattr=+fp-armv8 -disassemble < %s 2>&1 | FileCheck %s
 
 # None of these instructions should be classified as unpredictable:
 
diff --git a/test/MC/Disassembler/AArch64/ldp-preind.predictable.txt b/test/MC/Disassembler/AArch64/ldp-preind.predictable.txt
index 48ea817..f52d37f 100644
--- a/test/MC/Disassembler/AArch64/ldp-preind.predictable.txt
+++ b/test/MC/Disassembler/AArch64/ldp-preind.predictable.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple=aarch64 -disassemble < %s 2>&1 | FileCheck %s
+# RUN: llvm-mc -triple=aarch64 -mattr=+fp-armv8 -disassemble < %s 2>&1 | FileCheck %s
 
 # None of these instructions should be classified as unpredictable:
 
diff --git a/test/MC/Disassembler/AArch64/lit.local.cfg b/test/MC/Disassembler/AArch64/lit.local.cfg
index f9df30e..9a66a00 100644
--- a/test/MC/Disassembler/AArch64/lit.local.cfg
+++ b/test/MC/Disassembler/AArch64/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.txt']
-
 targets = set(config.root.targets_to_build.split())
 if not 'AArch64' in targets:
     config.unsupported = True
diff --git a/test/MC/Disassembler/AArch64/neon-instructions.txt b/test/MC/Disassembler/AArch64/neon-instructions.txt
index 40d1f4c..863730a 100644
--- a/test/MC/Disassembler/AArch64/neon-instructions.txt
+++ b/test/MC/Disassembler/AArch64/neon-instructions.txt
@@ -131,8 +131,11 @@
 #------------------------------------------------------------------------------
 # Vector Move - register
 #------------------------------------------------------------------------------
-# CHECK: mov v1.16b, v15.16b
-# CHECK: mov v25.8b, v4.8b
+
+# FIXME: these should print as "mov", but TableGen can't handle it.
+
+# CHECK: orr v1.16b, v15.16b, v15.16b
+# CHECK: orr v25.8b, v4.8b, v4.8b
 0xe1 0x1d 0xaf 0x4e
 0x99 0x1c 0xa4 0x0e
 
@@ -671,3 +674,1965 @@
 0xf5 0xdd 0x23 0x4e
 0xab 0xdc 0x77 0x4e
 
+#----------------------------------------------------------------------
+# Vector Shift Left long 
+#----------------------------------------------------------------------
+# CHECK: shll2	v2.8h, v4.16b, #8
+# CHECK: shll2	v6.4s, v8.8h, #16
+# CHECK: shll2	v6.2d, v8.4s, #32
+# CHECK: shll	v2.8h, v4.8b, #8
+# CHECK: shll	v6.4s, v8.4h, #16
+# CHECK: shll	v6.2d, v8.2s, #32
+
+0x82,0x38,0x21,0x6e
+0x06,0x39,0x61,0x6e
+0x06,0x39,0xa1,0x6e
+0x82,0x38,0x21,0x2e
+0x06,0x39,0x61,0x2e
+0x06,0x39,0xa1,0x2e
+
+#----------------------------------------------------------------------
+# Vector Shift Left by Immediate
+#----------------------------------------------------------------------
+# CHECK: shl v0.4h, v1.4h, #3
+# CHECK: shl v0.16b, v1.16b, #3
+# CHECK: shl v0.4s, v1.4s, #3
+# CHECK: shl v0.2d, v1.2d, #3
+0x20,0x54,0x13,0x0f
+0x20,0x54,0x0b,0x4f
+0x20,0x54,0x23,0x4f
+0x20,0x54,0x43,0x4f
+
+#----------------------------------------------------------------------
+# Vector Shift Left Long (Signed, Unsigned) by Immediate
+#----------------------------------------------------------------------
+# CHECK: sshll v0.2d, v1.2s, #3
+# CHECK: sshll2 v0.4s, v1.8h, #3
+# CHECK: ushll v0.4s, v1.4h, #3
+# CHECK: ushll2 v0.8h, v1.16b, #3
+0x20 0xa4 0x23 0x0f
+0x20 0xa4 0x13 0x4f
+0x20 0xa4 0x13 0x2f
+0x20 0xa4 0x0b 0x6f
+
+#-----------------------------------------------------------------------------
+#Integer shift right (Signed)
+#-----------------------------------------------------------------------------
+# CHECK: sshr v0.8b, v1.8b, #3
+# CHECK: sshr v0.4h, v1.4h, #3
+# CHECK: sshr v0.2s, v1.2s, #3
+# CHECK: sshr v0.16b, v1.16b, #3
+# CHECK: sshr v0.8h, v1.8h, #3
+# CHECK: sshr v0.4s, v1.4s, #3
+# CHECK: sshr v0.2d, v1.2d, #3
+0x20,0x04,0x0d,0x0f
+0x20,0x04,0x1d,0x0f
+0x20,0x04,0x3d,0x0f
+0x20,0x04,0x0d,0x4f
+0x20,0x04,0x1d,0x4f
+0x20,0x04,0x3d,0x4f
+0x20,0x04,0x7d,0x4f
+
+#-----------------------------------------------------------------------------
+#Integer shift right (Unsigned)
+#-----------------------------------------------------------------------------
+# CHECK: ushr v0.8b, v1.8b, #3
+# CHECK: ushr v0.4h, v1.4h, #3
+# CHECK: ushr v0.2s, v1.2s, #3
+# CHECK: ushr v0.16b, v1.16b, #3
+# CHECK: ushr v0.8h, v1.8h, #3
+# CHECK: ushr v0.4s, v1.4s, #3
+# CHECK: ushr v0.2d, v1.2d, #3
+0x20,0x04,0x0d,0x2f
+0x20,0x04,0x1d,0x2f
+0x20,0x04,0x3d,0x2f
+0x20,0x04,0x0d,0x6f
+0x20,0x04,0x1d,0x6f
+0x20,0x04,0x3d,0x6f
+0x20,0x04,0x7d,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer shift right and accumulate (Signed)
+#-----------------------------------------------------------------------------
+# CHECK: ssra v0.8b, v1.8b, #3
+# CHECK: ssra v0.4h, v1.4h, #3
+# CHECK: ssra v0.2s, v1.2s, #3
+# CHECK: ssra v0.16b, v1.16b, #3
+# CHECK: ssra v0.8h, v1.8h, #3
+# CHECK: ssra v0.4s, v1.4s, #3
+# CHECK: ssra v0.2d, v1.2d, #3
+0x20,0x14,0x0d,0x0f
+0x20,0x14,0x1d,0x0f
+0x20,0x14,0x3d,0x0f
+0x20,0x14,0x0d,0x4f
+0x20,0x14,0x1d,0x4f
+0x20,0x14,0x3d,0x4f
+0x20,0x14,0x7d,0x4f
+
+#-----------------------------------------------------------------------------
+#Integer shift right and accumulate (Unsigned)
+#-----------------------------------------------------------------------------
+# CHECK: usra v0.8b, v1.8b, #3
+# CHECK: usra v0.4h, v1.4h, #3
+# CHECK: usra v0.2s, v1.2s, #3
+# CHECK: usra v0.16b, v1.16b, #3
+# CHECK: usra v0.8h, v1.8h, #3
+# CHECK: usra v0.4s, v1.4s, #3
+# CHECK: usra v0.2d, v1.2d, #3
+0x20,0x14,0x0d,0x2f
+0x20,0x14,0x1d,0x2f
+0x20,0x14,0x3d,0x2f
+0x20,0x14,0x0d,0x6f
+0x20,0x14,0x1d,0x6f
+0x20,0x14,0x3d,0x6f
+0x20,0x14,0x7d,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer rounding shift right (Signed)
+#-----------------------------------------------------------------------------
+# CHECK: srshr v0.8b, v1.8b, #3
+# CHECK: srshr v0.4h, v1.4h, #3
+# CHECK: srshr v0.2s, v1.2s, #3
+# CHECK: srshr v0.16b, v1.16b, #3
+# CHECK: srshr v0.8h, v1.8h, #3
+# CHECK: srshr v0.4s, v1.4s, #3
+# CHECK: srshr v0.2d, v1.2d, #3
+0x20,0x24,0x0d,0x0f
+0x20,0x24,0x1d,0x0f
+0x20,0x24,0x3d,0x0f
+0x20,0x24,0x0d,0x4f
+0x20,0x24,0x1d,0x4f
+0x20,0x24,0x3d,0x4f
+0x20,0x24,0x7d,0x4f
+
+#-----------------------------------------------------------------------------
+#Integer rounding shift right (Unsigned)
+#-----------------------------------------------------------------------------
+# CHECK: urshr v0.8b, v1.8b, #3
+# CHECK: urshr v0.4h, v1.4h, #3
+# CHECK: urshr v0.2s, v1.2s, #3
+# CHECK: urshr v0.16b, v1.16b, #3
+# CHECK: urshr v0.8h, v1.8h, #3
+# CHECK: urshr v0.4s, v1.4s, #3
+# CHECK: urshr v0.2d, v1.2d, #3
+0x20,0x24,0x0d,0x2f
+0x20,0x24,0x1d,0x2f
+0x20,0x24,0x3d,0x2f
+0x20,0x24,0x0d,0x6f
+0x20,0x24,0x1d,0x6f
+0x20,0x24,0x3d,0x6f
+0x20,0x24,0x7d,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer rounding shift right and accumulate (Signed)
+#-----------------------------------------------------------------------------
+# CHECK: srsra v0.8b, v1.8b, #3
+# CHECK: srsra v0.4h, v1.4h, #3
+# CHECK: srsra v0.2s, v1.2s, #3
+# CHECK: srsra v0.16b, v1.16b, #3
+# CHECK: srsra v0.8h, v1.8h, #3
+# CHECK: srsra v0.4s, v1.4s, #3
+# CHECK: srsra v0.2d, v1.2d, #3
+0x20,0x34,0x0d,0x0f
+0x20,0x34,0x1d,0x0f
+0x20,0x34,0x3d,0x0f
+0x20,0x34,0x0d,0x4f
+0x20,0x34,0x1d,0x4f
+0x20,0x34,0x3d,0x4f
+0x20,0x34,0x7d,0x4f
+
+#-----------------------------------------------------------------------------
+#Integer rounding shift right and accumulate (Unsigned)
+#-----------------------------------------------------------------------------
+# CHECK: ursra v0.8b, v1.8b, #3
+# CHECK: ursra v0.4h, v1.4h, #3
+# CHECK: ursra v0.2s, v1.2s, #3
+# CHECK: ursra v0.16b, v1.16b, #3
+# CHECK: ursra v0.8h, v1.8h, #3
+# CHECK: ursra v0.4s, v1.4s, #3
+# CHECK: ursra v0.2d, v1.2d, #3
+0x20,0x34,0x0d,0x2f
+0x20,0x34,0x1d,0x2f
+0x20,0x34,0x3d,0x2f
+0x20,0x34,0x0d,0x6f
+0x20,0x34,0x1d,0x6f
+0x20,0x34,0x3d,0x6f
+0x20,0x34,0x7d,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer shift right and insert
+#-----------------------------------------------------------------------------
+# CHECK: sri v0.8b, v1.8b, #3
+# CHECK: sri v0.4h, v1.4h, #3
+# CHECK: sri v0.2s, v1.2s, #3
+# CHECK: sri v0.16b, v1.16b, #3
+# CHECK: sri v0.8h, v1.8h, #3
+# CHECK: sri v0.4s, v1.4s, #3
+# CHECK: sri v0.2d, v1.2d, #3
+0x20,0x44,0x0d,0x2f
+0x20,0x44,0x1d,0x2f
+0x20,0x44,0x3d,0x2f
+0x20,0x44,0x0d,0x6f
+0x20,0x44,0x1d,0x6f
+0x20,0x44,0x3d,0x6f
+0x20,0x44,0x7d,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer shift left and insert
+#-----------------------------------------------------------------------------
+# CHECK: sli v0.8b, v1.8b, #3
+# CHECK: sli v0.4h, v1.4h, #3
+# CHECK: sli v0.2s, v1.2s, #3
+# CHECK: sli v0.16b, v1.16b, #3
+# CHECK: sli v0.8h, v1.8h, #3
+# CHECK: sli v0.4s, v1.4s, #3
+# CHECK: sli v0.2d, v1.2d, #3
+0x20,0x54,0x0b,0x2f
+0x20,0x54,0x13,0x2f
+0x20,0x54,0x23,0x2f
+0x20,0x54,0x0b,0x6f
+0x20,0x54,0x13,0x6f
+0x20,0x54,0x23,0x6f
+0x20,0x54,0x43,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer saturating shift left unsigned
+#-----------------------------------------------------------------------------
+# CHECK: sqshlu v0.8b, v1.8b, #3
+# CHECK: sqshlu v0.4h, v1.4h, #3
+# CHECK: sqshlu v0.2s, v1.2s, #3
+# CHECK: sqshlu v0.16b, v1.16b, #3
+# CHECK: sqshlu v0.8h, v1.8h, #3
+# CHECK: sqshlu v0.4s, v1.4s, #3
+# CHECK: sqshlu v0.2d, v1.2d, #3
+0x20,0x64,0x0b,0x2f
+0x20,0x64,0x13,0x2f
+0x20,0x64,0x23,0x2f
+0x20,0x64,0x0b,0x6f
+0x20,0x64,0x13,0x6f
+0x20,0x64,0x23,0x6f
+0x20,0x64,0x43,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer saturating shift left (Signed)
+#-----------------------------------------------------------------------------
+# CHECK: sqshl v0.8b, v1.8b, #3
+# CHECK: sqshl v0.4h, v1.4h, #3
+# CHECK: sqshl v0.2s, v1.2s, #3
+# CHECK: sqshl v0.16b, v1.16b, #3
+# CHECK: sqshl v0.8h, v1.8h, #3
+# CHECK: sqshl v0.4s, v1.4s, #3
+# CHECK: sqshl v0.2d, v1.2d, #3
+0x20,0x74,0x0b,0x0f
+0x20,0x74,0x13,0x0f
+0x20,0x74,0x23,0x0f
+0x20,0x74,0x0b,0x4f
+0x20,0x74,0x13,0x4f
+0x20,0x74,0x23,0x4f
+0x20,0x74,0x43,0x4f
+
+#-----------------------------------------------------------------------------
+#Integer saturating shift left (Unsigned)
+#-----------------------------------------------------------------------------
+# CHECK: uqshl v0.8b, v1.8b, #3
+# CHECK: uqshl v0.4h, v1.4h, #3
+# CHECK: uqshl v0.2s, v1.2s, #3
+# CHECK: uqshl v0.16b, v1.16b, #3
+# CHECK: uqshl v0.8h, v1.8h, #3
+# CHECK: uqshl v0.4s, v1.4s, #3
+# CHECK: uqshl v0.2d, v1.2d, #3
+0x20,0x74,0x0b,0x2f
+0x20,0x74,0x13,0x2f
+0x20,0x74,0x23,0x2f
+0x20,0x74,0x0b,0x6f
+0x20,0x74,0x13,0x6f
+0x20,0x74,0x23,0x6f
+0x20,0x74,0x43,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer shift right narrow
+#-----------------------------------------------------------------------------
+# CHECK: shrn v0.8b, v1.8h, #3
+# CHECK: shrn v0.4h, v1.4s, #3
+# CHECK: shrn v0.2s, v1.2d, #3
+# CHECK: shrn2 v0.16b, v1.8h, #3
+# CHECK: shrn2 v0.8h, v1.4s, #3
+# CHECK: shrn2 v0.4s, v1.2d, #3
+0x20,0x84,0x0d,0x0f
+0x20,0x84,0x1d,0x0f
+0x20,0x84,0x3d,0x0f
+0x20,0x84,0x0d,0x4f
+0x20,0x84,0x1d,0x4f
+0x20,0x84,0x3d,0x4f
+
+#-----------------------------------------------------------------------------
+#Integer saturating shift right unsigned narrow (Signed)
+#-----------------------------------------------------------------------------
+# CHECK: sqshrun v0.8b, v1.8h, #3
+# CHECK: sqshrun v0.4h, v1.4s, #3
+# CHECK: sqshrun v0.2s, v1.2d, #3
+# CHECK: sqshrun2 v0.16b, v1.8h, #3
+# CHECK: sqshrun2 v0.8h, v1.4s, #3
+# CHECK: sqshrun2 v0.4s, v1.2d, #3
+0x20,0x84,0x0d,0x2f
+0x20,0x84,0x1d,0x2f
+0x20,0x84,0x3d,0x2f
+0x20,0x84,0x0d,0x6f
+0x20,0x84,0x1d,0x6f
+0x20,0x84,0x3d,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer rounding shift right narrow
+#-----------------------------------------------------------------------------
+# CHECK: rshrn v0.8b, v1.8h, #3
+# CHECK: rshrn v0.4h, v1.4s, #3
+# CHECK: rshrn v0.2s, v1.2d, #3
+# CHECK: rshrn2 v0.16b, v1.8h, #3
+# CHECK: rshrn2 v0.8h, v1.4s, #3
+# CHECK: rshrn2 v0.4s, v1.2d, #3
+0x20,0x8c,0x0d,0x0f
+0x20,0x8c,0x1d,0x0f
+0x20,0x8c,0x3d,0x0f
+0x20,0x8c,0x0d,0x4f
+0x20,0x8c,0x1d,0x4f
+0x20,0x8c,0x3d,0x4f
+
+#-----------------------------------------------------------------------------
+#Integer saturating shift right rounded unsigned narrow (Signed)
+#-----------------------------------------------------------------------------
+# CHECK: sqrshrun v0.8b, v1.8h, #3
+# CHECK: sqrshrun v0.4h, v1.4s, #3
+# CHECK: sqrshrun v0.2s, v1.2d, #3
+# CHECK: sqrshrun2 v0.16b, v1.8h, #3
+# CHECK: sqrshrun2 v0.8h, v1.4s, #3
+# CHECK: sqrshrun2 v0.4s, v1.2d, #3
+0x20,0x8c,0x0d,0x2f
+0x20,0x8c,0x1d,0x2f
+0x20,0x8c,0x3d,0x2f
+0x20,0x8c,0x0d,0x6f
+0x20,0x8c,0x1d,0x6f
+0x20,0x8c,0x3d,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer saturating shift right narrow (Signed)
+#-----------------------------------------------------------------------------
+# CHECK: sqshrn v0.8b, v1.8h, #3
+# CHECK: sqshrn v0.4h, v1.4s, #3
+# CHECK: sqshrn v0.2s, v1.2d, #3
+# CHECK: sqshrn2 v0.16b, v1.8h, #3
+# CHECK: sqshrn2 v0.8h, v1.4s, #3
+# CHECK: sqshrn2 v0.4s, v1.2d, #3
+0x20,0x94,0x0d,0x0f
+0x20,0x94,0x1d,0x0f
+0x20,0x94,0x3d,0x0f
+0x20,0x94,0x0d,0x4f
+0x20,0x94,0x1d,0x4f
+0x20,0x94,0x3d,0x4f
+
+#-----------------------------------------------------------------------------
+#Integer saturating shift right narrow (Unsigned)
+#-----------------------------------------------------------------------------
+# CHECK: uqshrn v0.8b, v1.8h, #3
+# CHECK: uqshrn v0.4h, v1.4s, #3
+# CHECK: uqshrn v0.2s, v1.2d, #3
+# CHECK: uqshrn2 v0.16b, v1.8h, #3
+# CHECK: uqshrn2 v0.8h, v1.4s, #3
+# CHECK: uqshrn2 v0.4s, v1.2d, #3
+0x20,0x94,0x0d,0x2f
+0x20,0x94,0x1d,0x2f
+0x20,0x94,0x3d,0x2f
+0x20,0x94,0x0d,0x6f
+0x20,0x94,0x1d,0x6f
+0x20,0x94,0x3d,0x6f
+
+#-----------------------------------------------------------------------------
+#Integer saturating shift right rounded narrow (Signed)
+#-----------------------------------------------------------------------------
+# CHECK: sqrshrn v0.8b, v1.8h, #3
+# CHECK: sqrshrn v0.4h, v1.4s, #3
+# CHECK: sqrshrn v0.2s, v1.2d, #3
+# CHECK: sqrshrn2 v0.16b, v1.8h, #3
+# CHECK: sqrshrn2 v0.8h, v1.4s, #3
+# CHECK: sqrshrn2 v0.4s, v1.2d, #3
+0x20,0x9c,0x0d,0x0f
+0x20,0x9c,0x1d,0x0f
+0x20,0x9c,0x3d,0x0f
+0x20,0x9c,0x0d,0x4f
+0x20,0x9c,0x1d,0x4f
+0x20,0x9c,0x3d,0x4f
+
+#-----------------------------------------------------------------------------
+#Integer saturating shift right rounded narrow (Unsigned)
+#-----------------------------------------------------------------------------
+# CHECK: uqrshrn v0.8b, v1.8h, #3
+# CHECK: uqrshrn v0.4h, v1.4s, #3
+# CHECK: uqrshrn v0.2s, v1.2d, #3
+# CHECK: uqrshrn2 v0.16b, v1.8h, #3
+# CHECK: uqrshrn2 v0.8h, v1.4s, #3
+# CHECK: uqrshrn2 v0.4s, v1.2d, #3
+0x20,0x9c,0x0d,0x2f
+0x20,0x9c,0x1d,0x2f
+0x20,0x9c,0x3d,0x2f
+0x20,0x9c,0x0d,0x6f
+0x20,0x9c,0x1d,0x6f
+0x20,0x9c,0x3d,0x6f
+
+#-----------------------------------------------------------------------------
+#Fixed-point convert to floating-point
+#-----------------------------------------------------------------------------
+# CHECK: scvtf v0.2s, v1.2s, #3
+# CHECK: scvtf v0.4s, v1.4s, #3
+# CHECK: scvtf v0.2d, v1.2d, #3
+# CHECK: ucvtf v0.2s, v1.2s, #3
+# CHECK: ucvtf v0.4s, v1.4s, #3
+# CHECK: ucvtf v0.2d, v1.2d, #3
+
+0x20,0xe4,0x3d,0x0f
+0x20,0xe4,0x3d,0x4f
+0x20,0xe4,0x7d,0x4f
+0x20,0xe4,0x3d,0x2f
+0x20,0xe4,0x3d,0x6f
+0x20,0xe4,0x7d,0x6f
+
+#-----------------------------------------------------------------------------
+#Floating-point convert to fixed-point
+#-----------------------------------------------------------------------------
+# CHECK: fcvtzs v0.2s, v1.2s, #3
+# CHECK: fcvtzs v0.4s, v1.4s, #3
+# CHECK: fcvtzs v0.2d, v1.2d, #3
+# CHECK: fcvtzu v0.2s, v1.2s, #3
+# CHECK: fcvtzu v0.4s, v1.4s, #3
+# CHECK: fcvtzu v0.2d, v1.2d, #3
+0x20,0xfc,0x3d,0x0f
+0x20,0xfc,0x3d,0x4f
+0x20,0xfc,0x7d,0x4f
+0x20,0xfc,0x3d,0x2f
+0x20,0xfc,0x3d,0x6f
+0x20,0xfc,0x7d,0x6f
+
+
+#------------------------------------------------------------------------------
+# Vector with 3 operands having different data types
+#------------------------------------------------------------------------------
+
+#------------------------------------------------------------------------------
+# Long
+#------------------------------------------------------------------------------
+
+#------------------------------------------------------------------------------
+# Long - Variant 1
+#------------------------------------------------------------------------------
+
+# CHECK: saddl v0.8h, v1.8b, v2.8b
+# CHECK: saddl v0.4s, v1.4h, v2.4h
+# CHECK: saddl v0.2d, v1.2s, v2.2s
+0x20 0x00 0x22 0x0e
+0x20 0x00 0x62 0x0e
+0x20 0x00 0xa2 0x0e
+
+# CHECK: saddl2 v0.4s, v1.8h, v2.8h
+# CHECK: saddl2 v0.8h, v1.16b, v2.16b
+# CHECK: saddl2 v0.2d, v1.4s, v2.4s
+0x20 0x00 0x62 0x4e
+0x20 0x00 0x22 0x4e
+0x20 0x00 0xa2 0x4e
+
+# CHECK: uaddl v0.8h, v1.8b, v2.8b
+# CHECK: uaddl v0.4s, v1.4h, v2.4h
+# CHECK: uaddl v0.2d, v1.2s, v2.2s
+0x20 0x00 0x22 0x2e
+0x20 0x00 0x62 0x2e
+0x20 0x00 0xa2 0x2e
+
+# CHECK: uaddl2 v0.8h, v1.16b, v2.16b
+# CHECK: uaddl2 v0.4s, v1.8h, v2.8h
+# CHECK: uaddl2 v0.2d, v1.4s, v2.4s
+0x20 0x00 0x22 0x6e
+0x20 0x00 0x62 0x6e
+0x20 0x00 0xa2 0x6e
+
+# CHECK: ssubl v0.8h, v1.8b, v2.8b
+# CHECK: ssubl v0.4s, v1.4h, v2.4h
+# CHECK: ssubl v0.2d, v1.2s, v2.2s
+0x20 0x20 0x22 0x0e
+0x20 0x20 0x62 0x0e
+0x20 0x20 0xa2 0x0e
+
+# CHECK: ssubl2 v0.8h, v1.16b, v2.16b
+# CHECK: ssubl2 v0.4s, v1.8h, v2.8h
+# CHECK: ssubl2 v0.2d, v1.4s, v2.4s
+0x20 0x20 0x22 0x4e
+0x20 0x20 0x62 0x4e
+0x20 0x20 0xa2 0x4e
+
+# CHECK: usubl v0.8h, v1.8b, v2.8b
+# CHECK: usubl v0.4s, v1.4h, v2.4h
+# CHECK: usubl v0.2d, v1.2s, v2.2s
+0x20 0x20 0x22 0x2e
+0x20 0x20 0x62 0x2e
+0x20 0x20 0xa2 0x2e
+
+# CHECK: usubl2 v0.8h, v1.16b, v2.16b
+# CHECK: usubl2 v0.4s, v1.8h, v2.8h
+# CHECK: usubl2 v0.2d, v1.4s, v2.4s
+0x20 0x20 0x22 0x6e
+0x20 0x20 0x62 0x6e
+0x20 0x20 0xa2 0x6e
+
+# CHECK: sabal v0.8h, v1.8b, v2.8b
+# CHECK: sabal v0.4s, v1.4h, v2.4h
+# CHECK: sabal v0.2d, v1.2s, v2.2s
+0x20 0x50 0x22 0x0e
+0x20 0x50 0x62 0x0e
+0x20 0x50 0xa2 0x0e
+
+# CHECK: sabal2 v0.8h, v1.16b, v2.16b
+# CHECK: sabal2 v0.4s, v1.8h, v2.8h
+# CHECK: sabal2 v0.2d, v1.4s, v2.4s
+0x20 0x50 0x22 0x4e
+0x20 0x50 0x62 0x4e
+0x20 0x50 0xa2 0x4e
+
+# CHECK: uabal v0.8h, v1.8b, v2.8b
+# CHECK: uabal v0.4s, v1.4h, v2.4h
+# CHECK: uabal v0.2d, v1.2s, v2.2s
+0x20 0x50 0x22 0x2e
+0x20 0x50 0x62 0x2e
+0x20 0x50 0xa2 0x2e
+
+# CHECK: uabal2 v0.8h, v1.16b, v2.16b
+# CHECK: uabal2 v0.4s, v1.8h, v2.8h
+# CHECK: uabal2 v0.2d, v1.4s, v2.4s
+0x20 0x50 0x22 0x6e
+0x20 0x50 0x62 0x6e
+0x20 0x50 0xa2 0x6e
+
+# CHECK: sabdl v0.8h, v1.8b, v2.8b
+# CHECK: sabdl v0.4s, v1.4h, v2.4h
+# CHECK: sabdl v0.2d, v1.2s, v2.2s
+0x20 0x70 0x22 0x0e
+0x20 0x70 0x62 0x0e
+0x20 0x70 0xa2 0x0e
+
+# CHECK: sabdl2 v0.8h, v1.16b, v2.16b
+# CHECK: sabdl2 v0.4s, v1.8h, v2.8h
+# CHECK: sabdl2 v0.2d, v1.4s, v2.4s
+0x20 0x70 0x22 0x4e
+0x20 0x70 0x62 0x4e
+0x20 0x70 0xa2 0x4e
+
+# CHECK: uabdl v0.8h, v1.8b, v2.8b
+# CHECK: uabdl v0.4s, v1.4h, v2.4h
+# CHECK: uabdl v0.2d, v1.2s, v2.2s
+0x20 0x70 0x22 0x2e
+0x20 0x70 0x62 0x2e
+0x20 0x70 0xa2 0x2e
+
+# CHECK: uabdl2 v0.8h, v1.16b, v2.16b
+# CHECK: uabdl2 v0.4s, v1.8h, v2.8h
+# CHECK: uabdl2 v0.2d, v1.4s, v2.4s
+0x20 0x70 0x22 0x6e
+0x20 0x70 0x62 0x6e
+0x20 0x70 0xa2 0x6e
+
+# CHECK: smlal v0.8h, v1.8b, v2.8b
+# CHECK: smlal v0.4s, v1.4h, v2.4h
+# CHECK: smlal v0.2d, v1.2s, v2.2s
+0x20 0x80 0x22 0x0e
+0x20 0x80 0x62 0x0e
+0x20 0x80 0xa2 0x0e
+
+# CHECK: smlal2 v0.8h, v1.16b, v2.16b
+# CHECK: smlal2 v0.4s, v1.8h, v2.8h
+# CHECK: smlal2 v0.2d, v1.4s, v2.4s
+0x20 0x80 0x22 0x4e
+0x20 0x80 0x62 0x4e
+0x20 0x80 0xa2 0x4e
+
+# CHECK: umlal v0.8h, v1.8b, v2.8b
+# CHECK: umlal v0.4s, v1.4h, v2.4h
+# CHECK: umlal v0.2d, v1.2s, v2.2s
+
+0x20 0x80 0x22 0x2e
+0x20 0x80 0x62 0x2e
+0x20 0x80 0xa2 0x2e
+
+# CHECK: umlal2 v0.8h, v1.16b, v2.16b
+# CHECK: umlal2 v0.4s, v1.8h, v2.8h
+# CHECK: umlal2 v0.2d, v1.4s, v2.4s
+0x20 0x80 0x22 0x6e
+0x20 0x80 0x62 0x6e
+0x20 0x80 0xa2 0x6e
+
+# CHECK: smlsl v0.8h, v1.8b, v2.8b
+# CHECK: smlsl v0.4s, v1.4h, v2.4h
+# CHECK: smlsl v0.2d, v1.2s, v2.2s
+0x20 0xa0 0x22 0x0e
+0x20 0xa0 0x62 0x0e
+0x20 0xa0 0xa2 0x0e
+
+# CHECK: smlsl2 v0.8h, v1.16b, v2.16b
+# CHECK: smlsl2 v0.4s, v1.8h, v2.8h
+# CHECK: smlsl2 v0.2d, v1.4s, v2.4s
+0x20 0xa0 0x22 0x4e
+0x20 0xa0 0x62 0x4e
+0x20 0xa0 0xa2 0x4e
+
+# CHECK: umlsl v0.8h, v1.8b, v2.8b
+# CHECK: umlsl v0.4s, v1.4h, v2.4h
+# CHECK: umlsl v0.2d, v1.2s, v2.2s
+0x20 0xa0 0x22 0x2e
+0x20 0xa0 0x62 0x2e
+0x20 0xa0 0xa2 0x2e
+
+# CHECK: umlsl2 v0.8h, v1.16b, v2.16b
+# CHECK: umlsl2 v0.4s, v1.8h, v2.8h
+# CHECK: umlsl2 v0.2d, v1.4s, v2.4s
+0x20 0xa0 0x22 0x6e
+0x20 0xa0 0x62 0x6e
+0x20 0xa0 0xa2 0x6e
+
+# CHECK: smull v0.8h, v1.8b, v2.8b
+# CHECK: smull v0.4s, v1.4h, v2.4h
+# CHECK: smull v0.2d, v1.2s, v2.2s
+0x20 0xc0 0x22 0x0e
+0x20 0xc0 0x62 0x0e
+0x20 0xc0 0xa2 0x0e
+
+# CHECK: smull2 v0.8h, v1.16b, v2.16b
+# CHECK: smull2 v0.4s, v1.8h, v2.8h
+# CHECK: smull2 v0.2d, v1.4s, v2.4s
+0x20 0xc0 0x22 0x4e
+0x20 0xc0 0x62 0x4e
+0x20 0xc0 0xa2 0x4e
+
+# CHECK: umull v0.8h, v1.8b, v2.8b
+# CHECK: umull v0.4s, v1.4h, v2.4h
+# CHECK: umull v0.2d, v1.2s, v2.2s
+0x20 0xc0 0x22 0x2e
+0x20 0xc0 0x62 0x2e
+0x20 0xc0 0xa2 0x2e
+
+# CHECK: umull2 v0.8h, v1.16b, v2.16b
+# CHECK: umull2 v0.4s, v1.8h, v2.8h
+# CHECK: umull2 v0.2d, v1.4s, v2.4s
+0x20 0xc0 0x22 0x6e
+0x20 0xc0 0x62 0x6e
+0x20 0xc0 0xa2 0x6e
+
+#------------------------------------------------------------------------------
+# Long - Variant 2
+#------------------------------------------------------------------------------
+
+# CHECK: sqdmlal v0.4s, v1.4h, v2.4h
+# CHECK: sqdmlal v0.2d, v1.2s, v2.2s
+0x20 0x90 0x62 0x0e
+0x20 0x90 0xa2 0x0e
+
+# CHECK: sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK: sqdmlal2 v0.2d, v1.4s, v2.4s
+0x20 0x90 0x62 0x4e
+0x20 0x90 0xa2 0x4e
+
+# CHECK: sqdmlsl v0.4s, v1.4h, v2.4h
+# CHECK: sqdmlsl v0.2d, v1.2s, v2.2s
+0x20 0xb0 0x62 0x0e
+0x20 0xb0 0xa2 0x0e
+
+# CHECK: sqdmlsl2 v0.4s, v1.8h, v2.8h
+# CHECK: sqdmlsl2 v0.2d, v1.4s, v2.4s
+0x20 0xb0 0x62 0x4e
+0x20 0xb0 0xa2 0x4e
+
+# CHECK: sqdmull v0.4s, v1.4h, v2.4h
+# CHECK: sqdmull v0.2d, v1.2s, v2.2s
+0x20 0xd0 0x62 0x0e
+0x20 0xd0 0xa2 0x0e
+
+# CHECK: sqdmull2 v0.4s, v1.8h, v2.8h
+# CHECK: sqdmull2 v0.2d, v1.4s, v2.4s
+0x20 0xd0 0x62 0x4e
+0x20 0xd0 0xa2 0x4e
+
+#------------------------------------------------------------------------------
+# Long - Variant 3
+#------------------------------------------------------------------------------
+
+# CHECK: pmull v0.8h, v1.8b, v2.8b
+0x20 0xe0 0x22 0x0e
+
+# CHECK: pmull2 v0.8h, v1.16b, v2.16b
+0x20 0xe0 0x22 0x4e
+
+#------------------------------------------------------------------------------
+# Widen
+#------------------------------------------------------------------------------
+
+# CHECK: saddw v0.8h, v1.8h, v2.8b
+# CHECK: saddw v0.4s, v1.4s, v2.4h
+# CHECK: saddw v0.2d, v1.2d, v2.2s
+0x20 0x10 0x22 0x0e
+0x20 0x10 0x62 0x0e
+0x20 0x10 0xa2 0x0e
+
+# CHECK: saddw2 v0.8h, v1.8h, v2.16b
+# CHECK: saddw2 v0.4s, v1.4s, v2.8h
+# CHECK: saddw2 v0.2d, v1.2d, v2.4s
+0x20 0x10 0x22 0x4e
+0x20 0x10 0x62 0x4e
+0x20 0x10 0xa2 0x4e
+
+# CHECK: uaddw v0.8h, v1.8h, v2.8b
+# CHECK: uaddw v0.4s, v1.4s, v2.4h
+# CHECK: uaddw v0.2d, v1.2d, v2.2s
+0x20 0x10 0x22 0x2e
+0x20 0x10 0x62 0x2e
+0x20 0x10 0xa2 0x2e
+
+# CHECK: uaddw2 v0.8h, v1.8h, v2.16b
+# CHECK: uaddw2 v0.4s, v1.4s, v2.8h
+# CHECK: uaddw2 v0.2d, v1.2d, v2.4s
+0x20 0x10 0x22 0x6e
+0x20 0x10 0x62 0x6e
+0x20 0x10 0xa2 0x6e
+
+# CHECK: ssubw v0.8h, v1.8h, v2.8b
+# CHECK: ssubw v0.4s, v1.4s, v2.4h
+# CHECK: ssubw v0.2d, v1.2d, v2.2s
+0x20 0x30 0x22 0x0e
+0x20 0x30 0x62 0x0e
+0x20 0x30 0xa2 0x0e
+
+# CHECK: ssubw2 v0.8h, v1.8h, v2.16b
+# CHECK: ssubw2 v0.4s, v1.4s, v2.8h
+# CHECK: ssubw2 v0.2d, v1.2d, v2.4s
+0x20 0x30 0x22 0x4e
+0x20 0x30 0x62 0x4e
+0x20 0x30 0xa2 0x4e
+
+# CHECK: usubw v0.8h, v1.8h, v2.8b
+# CHECK: usubw v0.4s, v1.4s, v2.4h
+# CHECK: usubw v0.2d, v1.2d, v2.2s
+0x20 0x30 0x22 0x2e
+0x20 0x30 0x62 0x2e
+0x20 0x30 0xa2 0x2e
+
+# CHECK: usubw2 v0.8h, v1.8h, v2.16b
+# CHECK: usubw2 v0.4s, v1.4s, v2.8h
+# CHECK: usubw2 v0.2d, v1.2d, v2.4s
+0x20 0x30 0x22 0x6e
+0x20 0x30 0x62 0x6e
+0x20 0x30 0xa2 0x6e
+
+#------------------------------------------------------------------------------
+# Narrow
+#------------------------------------------------------------------------------
+
+# CHECK: addhn v0.8b, v1.8h, v2.8h
+# CHECK: addhn v0.4h, v1.4s, v2.4s
+# CHECK: addhn v0.2s, v1.2d, v2.2d
+0x20 0x40 0x22 0x0e
+0x20 0x40 0x62 0x0e
+0x20 0x40 0xa2 0x0e
+
+# CHECK: addhn2 v0.16b, v1.8h, v2.8h
+# CHECK: addhn2 v0.8h, v1.4s, v2.4s
+# CHECK: addhn2 v0.4s, v1.2d, v2.2d
+0x20 0x40 0x22 0x4e
+0x20 0x40 0x62 0x4e
+0x20 0x40 0xa2 0x4e
+
+# CHECK: raddhn v0.8b, v1.8h, v2.8h
+# CHECK: raddhn v0.4h, v1.4s, v2.4s
+# CHECK: raddhn v0.2s, v1.2d, v2.2d
+0x20 0x40 0x22 0x2e
+0x20 0x40 0x62 0x2e
+0x20 0x40 0xa2 0x2e
+
+# CHECK: raddhn2 v0.16b, v1.8h, v2.8h
+# CHECK: raddhn2 v0.8h, v1.4s, v2.4s
+# CHECK: raddhn2 v0.4s, v1.2d, v2.2d
+0x20 0x40 0x22 0x6e
+0x20 0x40 0x62 0x6e
+0x20 0x40 0xa2 0x6e
+
+# CHECK: rsubhn v0.8b, v1.8h, v2.8h
+# CHECK: rsubhn v0.4h, v1.4s, v2.4s
+# CHECK: rsubhn v0.2s, v1.2d, v2.2d
+0x20 0x60 0x22 0x2e
+0x20 0x60 0x62 0x2e
+0x20 0x60 0xa2 0x2e
+
+# CHECK: rsubhn2 v0.16b, v1.8h, v2.8h
+# CHECK: rsubhn2 v0.8h, v1.4s, v2.4s
+# CHECK: rsubhn2 v0.4s, v1.2d, v2.2d
+0x20 0x60 0x22 0x6e
+0x20 0x60 0x62 0x6e
+0x20 0x60 0xa2 0x6e
+
+#----------------------------------------------------------------------
+# Scalar Integer Saturating Doubling Multiply Half High
+#----------------------------------------------------------------------
+# CHECK: sqdmulh h10, h11, h12
+# CHECK: sqdmulh s20, s21, s2
+0x6a,0xb5,0x6c,0x5e
+0xb4,0xb6,0xa2,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Integer Saturating Rounding Doubling Multiply Half High
+#----------------------------------------------------------------------
+# CHECK: sqrdmulh h10, h11, h12
+# CHECK: sqrdmulh s20, s21, s2
+0x6a,0xb5,0x6c,0x7e
+0xb4,0xb6,0xa2,0x7e
+
+#----------------------------------------------------------------------
+# Floating-point multiply extended
+#----------------------------------------------------------------------
+# CHECK: fmulx s20, s22, s15
+# CHECK: fmulx d23, d11, d1
+0xd4,0xde,0x2f,0x5e
+0x77,0xdd,0x61,0x5e
+
+#----------------------------------------------------------------------
+# Floating-point Reciprocal Step
+#----------------------------------------------------------------------
+# CHECK: frecps s21, s16, s13
+# CHECK: frecps d22, d30, d21
+0x15,0xfe,0x2d,0x5e
+0xd6,0xff,0x75,0x5e
+
+#----------------------------------------------------------------------
+# Floating-point Reciprocal Square Root Step
+#----------------------------------------------------------------------
+# CHECK: frsqrts s21, s5, s12
+# CHECK: frsqrts d8, d22, d18
+0xb5,0xfc,0xac,0x5e
+0xc8,0xfe,0xf2,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Signed Integer Convert To Floating-point
+#----------------------------------------------------------------------
+# CHECK: scvtf s22, s13
+# CHECK: scvtf d21, d12
+0xb6,0xd9,0x21,0x5e
+0x95,0xd9,0x61,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Unsigned Integer Convert To Floating-point
+#----------------------------------------------------------------------
+# CHECK: ucvtf s22, s13
+# CHECK: ucvtf d21, d14
+0xb6,0xd9,0x21,0x7e
+0xd5,0xd9,0x61,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Reciprocal Estimate
+#----------------------------------------------------------------------
+# CHECK: frecpe s19, s14
+# CHECK: frecpe d13, d13
+0xd3,0xd9,0xa1,0x5e
+0xad,0xd9,0xe1,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Reciprocal Exponent
+#----------------------------------------------------------------------
+# CHECK: frecpx s18, s10
+# CHECK: frecpx d16, d19
+0x52,0xf9,0xa1,0x5e
+0x70,0xfa,0xe1,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Reciprocal Square Root Estimate
+#----------------------------------------------------------------------
+# CHECK: frsqrte s22, s13
+# CHECK: frsqrte d21, d12
+0xb6,0xd9,0xa1,0x7e
+0x95,0xd9,0xe1,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Compare Bitwise Equal
+#----------------------------------------------------------------------
+# CHECK: cmeq d20, d21, d22
+0xb4,0x8e,0xf6,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Compare Bitwise Equal To Zero
+#----------------------------------------------------------------------
+# CHECK: cmeq d20, d21, #0x0
+0xb4,0x9a,0xe0,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Compare Unsigned Higher Or Same
+#----------------------------------------------------------------------
+# CHECK: cmhs d20, d21, d22
+0xb4,0x3e,0xf6,0x7e
+
+        
+#----------------------------------------------------------------------
+# Scalar Compare Signed Greather Than Or Equal
+#----------------------------------------------------------------------
+# CHECK: cmge d20, d21, d22
+0xb4,0x3e,0xf6,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Compare Signed Greather Than Or Equal To Zero
+#----------------------------------------------------------------------
+# CHECK: cmge d20, d21, #0x0
+0xb4,0x8a,0xe0,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Compare Unsigned Higher
+#----------------------------------------------------------------------
+# CHECK: cmhi d20, d21, d22
+0xb4,0x36,0xf6,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Compare Signed Greater Than
+#----------------------------------------------------------------------
+# CHECK: cmgt d20, d21, d22
+0xb4,0x36,0xf6,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Compare Signed Greater Than Zero
+#----------------------------------------------------------------------
+# CHECK: cmgt d20, d21, #0x0
+0xb4,0x8a,0xe0,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Compare Signed Less Than Or Equal To Zero
+#----------------------------------------------------------------------
+# CHECK: cmle d20, d21, #0x0
+0xb4,0x9a,0xe0,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Compare Less Than Zero
+#----------------------------------------------------------------------
+# CHECK: cmlt d20, d21, #0x0
+0xb4,0xaa,0xe0,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Compare Bitwise Test Bits
+#----------------------------------------------------------------------
+# CHECK: cmtst d20, d21, d22
+0xb4,0x8e,0xf6,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Compare Mask Equal
+#----------------------------------------------------------------------
+# CHECK: fcmeq s10, s11, s12
+# CHECK: fcmeq d20, d21, d22
+0x6a,0xe5,0x2c,0x5e
+0xb4,0xe6,0x76,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Compare Mask Equal To Zero
+#----------------------------------------------------------------------
+# CHECK: fcmeq s10, s11, #0.0
+# CHECK: fcmeq d20, d21, #0.0
+0x6a,0xd9,0xa0,0x5e
+0xb4,0xda,0xe0,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Compare Mask Greater Than Or Equal
+#----------------------------------------------------------------------
+# CHECK: fcmge s10, s11, s12
+# CHECK: fcmge d20, d21, d22
+0x6a,0xe5,0x2c,0x7e
+0xb4,0xe6,0x76,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
+#----------------------------------------------------------------------
+# CHECK: fcmge s10, s11, #0.0
+# CHECK: fcmge d20, d21, #0.0
+0x6a,0xc9,0xa0,0x7e
+0xb4,0xca,0xe0,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Compare Mask Greather Than
+#----------------------------------------------------------------------
+# CHECK: fcmgt s10, s11, s12
+# CHECK: fcmgt d20, d21, d22
+0x6a,0xe5,0xac,0x7e
+0xb4,0xe6,0xf6,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Compare Mask Greather Than Zero
+#----------------------------------------------------------------------
+# CHECK: fcmgt s10, s11, #0.0
+# CHECK: fcmgt d20, d21, #0.0
+0x6a,0xc9,0xa0,0x5e
+0xb4,0xca,0xe0,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Compare Mask Less Than Or Equal To Zero
+#----------------------------------------------------------------------
+# CHECK: fcmle s10, s11, #0.0
+# CHECK: fcmle d20, d21, #0.0
+0x6a,0xd9,0xa0,0x7e
+0xb4,0xda,0xe0,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Compare Mask Less Than
+#----------------------------------------------------------------------
+# CHECK: fcmlt s10, s11, #0.0
+# CHECK: fcmlt d20, d21, #0.0
+0x6a,0xe9,0xa0,0x5e
+0xb4,0xea,0xe0,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
+#----------------------------------------------------------------------
+# CHECK: facge s10, s11, s12
+# CHECK: facge d20, d21, d22
+0x6a,0xed,0x2c,0x7e
+0xb4,0xee,0x76,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Absolute Compare Mask Greater Than
+#----------------------------------------------------------------------
+# CHECK: facgt s10, s11, s12   
+# CHECK: facgt d20, d21, d22   
+0x6a,0xed,0xac,0x7e
+0xb4,0xee,0xf6,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Absolute Value
+#----------------------------------------------------------------------
+# CHECK: abs d29, d24
+0x1d,0xbb,0xe0,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Signed Saturating Absolute Value
+#----------------------------------------------------------------------
+# CHECK: sqabs b19, b14
+# CHECK: sqabs h21, h15
+# CHECK: sqabs s20, s12
+# CHECK: sqabs d18, d12
+0xd3,0x79,0x20,0x5e
+0xf5,0x79,0x60,0x5e
+0x94,0x79,0xa0,0x5e
+0x92,0x79,0xe0,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Negate
+#----------------------------------------------------------------------
+# CHECK: neg d29, d24
+0x1d,0xbb,0xe0,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Signed Saturating Negate
+#----------------------------------------------------------------------
+# CHECK: sqneg b19, b14
+# CHECK: sqneg h21, h15
+# CHECK: sqneg s20, s12
+# CHECK: sqneg d18, d12
+0xd3,0x79,0x20,0x7e
+0xf5,0x79,0x60,0x7e
+0x94,0x79,0xa0,0x7e
+0x92,0x79,0xe0,0x7e
+
+#----------------------------------------------------------------------
+# Signed Saturating Accumulated of Unsigned Value
+#----------------------------------------------------------------------
+# CHECK: suqadd b19, b14
+# CHECK: suqadd h20, h15
+# CHECK: suqadd s21, s12
+# CHECK: suqadd d18, d22
+0xd3,0x39,0x20,0x5e
+0xf4,0x39,0x60,0x5e
+0x95,0x39,0xa0,0x5e
+0xd2,0x3a,0xe0,0x5e
+
+#----------------------------------------------------------------------
+# Unsigned Saturating Accumulated of Signed Value
+#----------------------------------------------------------------------
+# CHECK: usqadd b19, b14
+# CHECK: usqadd h20, h15
+# CHECK: usqadd s21, s12
+# CHECK: usqadd d18, d22
+0xd3,0x39,0x20,0x7e
+0xf4,0x39,0x60,0x7e
+0x95,0x39,0xa0,0x7e
+0xd2,0x3a,0xe0,0x7e
+
+#----------------------------------------------------------------------
+# Signed Saturating Doubling Multiply-Add Long
+#----------------------------------------------------------------------
+# CHECK: sqdmlal s17, h27, h12
+# CHECK: sqdmlal d19, s24, s12
+0x71,0x93,0x6c,0x5e
+0x13,0x93,0xac,0x5e
+        
+#----------------------------------------------------------------------
+# Signed Saturating Doubling Multiply-Subtract Long
+#----------------------------------------------------------------------
+# CHECK: sqdmlsl s14, h12, h25
+# CHECK: sqdmlsl d12, s23, s13
+0x8e,0xb1,0x79,0x5e
+0xec,0xb2,0xad,0x5e
+        
+#----------------------------------------------------------------------
+# Signed Saturating Doubling Multiply Long
+#----------------------------------------------------------------------
+# CHECK: sqdmull s12, h22, h12
+# CHECK: sqdmull d15, s22, s12
+0xcc,0xd2,0x6c,0x5e
+0xcf,0xd2,0xac,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Signed Saturating Extract Unsigned Narrow
+#----------------------------------------------------------------------
+# CHECK: sqxtun b19, h14
+# CHECK: sqxtun h21, s15
+# CHECK: sqxtun s20, d12
+0xd3,0x29,0x21,0x7e
+0xf5,0x29,0x61,0x7e
+0x94,0x29,0xa1,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Signed Saturating Extract Signed Narrow
+#----------------------------------------------------------------------
+# CHECK: sqxtn b18, h18
+# CHECK: sqxtn h20, s17
+# CHECK: sqxtn s19, d14
+0x52,0x4a,0x21,0x5e
+0x34,0x4a,0x61,0x5e
+0xd3,0x49,0xa1,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Unsigned Saturating Extract Narrow
+#----------------------------------------------------------------------
+# CHECK: uqxtn b18, h18
+# CHECK: uqxtn h20, s17
+# CHECK: uqxtn s19, d14
+0x52,0x4a,0x21,0x7e
+0x34,0x4a,0x61,0x7e
+0xd3,0x49,0xa1,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Signed Shift Right (Immediate)
+#----------------------------------------------------------------------
+# CHECK: sshr d15, d16, #12
+0x0f,0x06,0x74,0x5f
+
+#----------------------------------------------------------------------
+# Scalar Unsigned Shift Right (Immediate)
+#----------------------------------------------------------------------
+# CHECK: ushr d10, d17, #18
+0x2a,0x06,0x6e,0x7f
+
+#----------------------------------------------------------------------
+# Scalar Signed Rounding Shift Right (Immediate)
+#----------------------------------------------------------------------
+# CHECK: srshr d19, d18, #7
+0x53,0x26,0x79,0x5f
+
+#----------------------------------------------------------------------
+# Scalar Unigned Rounding Shift Right (Immediate)
+#----------------------------------------------------------------------
+# CHECK: urshr d20, d23, #31
+0xf4,0x26,0x61,0x7f
+
+#----------------------------------------------------------------------
+# Scalar Signed Shift Right and Accumulate (Immediate)
+#----------------------------------------------------------------------
+# CHECK: ssra d18, d12, #21
+0x92,0x15,0x6b,0x5f
+
+#----------------------------------------------------------------------
+# Scalar Unsigned Shift Right and Accumulate (Immediate)
+#----------------------------------------------------------------------
+# CHECK: usra d20, d13, #61
+0xb4,0x15,0x43,0x7f
+
+#----------------------------------------------------------------------
+# Scalar Signed Rounding Shift Right and Accumulate (Immediate)
+#----------------------------------------------------------------------
+# CHECK: srsra d15, d11, #19
+0x6f,0x35,0x6d,0x5f
+
+#----------------------------------------------------------------------
+# Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
+#----------------------------------------------------------------------
+# CHECK: ursra d18, d10, #13
+0x52,0x35,0x73,0x7f
+
+#----------------------------------------------------------------------
+# Scalar Shift Left (Immediate)
+#----------------------------------------------------------------------
+# CHECK: shl d7, d10, #12
+0x47,0x55,0x4c,0x5f
+
+#----------------------------------------------------------------------
+# Signed Saturating Shift Left (Immediate)
+#----------------------------------------------------------------------
+# CHECK: sqshl b11, b19, #7
+# CHECK: sqshl h13, h18, #11
+# CHECK: sqshl s14, s17, #22
+# CHECK: sqshl d15, d16, #51
+0x6b,0x76,0x0f,0x5f
+0x4d,0x76,0x1b,0x5f
+0x2e,0x76,0x36,0x5f
+0x0f,0x76,0x73,0x5f
+
+#----------------------------------------------------------------------
+# Unsigned Saturating Shift Left (Immediate)
+#----------------------------------------------------------------------
+# CHECK: uqshl b18, b15, #6
+# CHECK: uqshl h11, h18, #7
+# CHECK: uqshl s14, s19, #18
+# CHECK: uqshl d15, d12, #19
+0xf2,0x75,0x0e,0x7f
+0x4b,0x76,0x17,0x7f
+0x6e,0x76,0x32,0x7f
+0x8f,0x75,0x53,0x7f
+
+#----------------------------------------------------------------------
+# Signed Saturating Shift Left Unsigned (Immediate)
+#----------------------------------------------------------------------
+# CHECK: sqshlu b15, b18, #6
+# CHECK: sqshlu h19, h17, #6
+# CHECK: sqshlu s16, s14, #25
+# CHECK: sqshlu d11, d13, #32
+0x4f,0x66,0x0e,0x7f
+0x33,0x66,0x16,0x7f
+0xd0,0x65,0x39,0x7f
+0xab,0x65,0x60,0x7f
+
+#----------------------------------------------------------------------
+# Shift Right And Insert (Immediate)
+#----------------------------------------------------------------------
+# CHECK: sri d10, d12, #14
+0x8a,0x45,0x72,0x7f
+
+#----------------------------------------------------------------------
+# Shift Left And Insert (Immediate)
+#----------------------------------------------------------------------
+# CHECK: sli d10, d14, #12
+0xca,0x55,0x4c,0x7f
+
+#----------------------------------------------------------------------
+# Signed Saturating Shift Right Narrow (Immediate)
+#----------------------------------------------------------------------
+# CHECK: sqshrn b10, h15, #5
+# CHECK: sqshrn h17, s10, #4
+# CHECK: sqshrn s18, d10, #31
+0xea,0x95,0x0b,0x5f
+0x51,0x95,0x1c,0x5f
+0x52,0x95,0x21,0x5f
+
+#----------------------------------------------------------------------
+# Unsigned Saturating Shift Right Narrow (Immediate)
+#----------------------------------------------------------------------
+# CHECK: uqshrn b12, h10, #7
+# CHECK: uqshrn h10, s14, #5
+# CHECK: uqshrn s10, d12, #13
+0x4c,0x95,0x09,0x7f
+0xca,0x95,0x1b,0x7f
+0x8a,0x95,0x33,0x7f
+
+#----------------------------------------------------------------------
+# Signed Saturating Rounded Shift Right Narrow (Immediate)
+#----------------------------------------------------------------------
+# CHECK: sqrshrn b10, h13, #2
+# CHECK: sqrshrn h15, s10, #6
+# CHECK: sqrshrn s15, d12, #9
+0xaa,0x9d,0x0e,0x5f
+0x4f,0x9d,0x1a,0x5f
+0x8f,0x9d,0x37,0x5f
+
+#----------------------------------------------------------------------
+# Unsigned Saturating Rounded Shift Right Narrow (Immediate)
+#----------------------------------------------------------------------
+# CHECK: uqrshrn b10, h12, #5
+# CHECK: uqrshrn h12, s10, #14
+# CHECK: uqrshrn s10, d10, #25
+0x8a,0x9d,0x0b,0x7f
+0x4c,0x9d,0x12,0x7f
+0x4a,0x9d,0x27,0x7f
+
+#----------------------------------------------------------------------
+# Signed Saturating Shift Right Unsigned Narrow (Immediate)
+#----------------------------------------------------------------------
+# CHECK: sqshrun b15, h10, #7
+# CHECK: sqshrun h20, s14, #3
+# CHECK: sqshrun s10, d15, #15
+0x4f,0x85,0x09,0x7f
+0xd4,0x85,0x1d,0x7f
+0xea,0x85,0x31,0x7f
+
+#----------------------------------------------------------------------
+# Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate)
+#----------------------------------------------------------------------
+# CHECK: sqrshrun b17, h10, #6
+# CHECK: sqrshrun h10, s13, #15
+# CHECK: sqrshrun s22, d16, #31
+0x51,0x8d,0x0a,0x7f
+0xaa,0x8d,0x11,0x7f
+0x16,0x8e,0x21,0x7f
+
+#----------------------------------------------------------------------
+# Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
+#----------------------------------------------------------------------
+# CHECK: scvtf s22, s13, #32
+# CHECK: scvtf d21, d12, #64
+0xb6,0xe5,0x20,0x5f
+0x95,0xe5,0x40,0x5f
+
+#----------------------------------------------------------------------
+# Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
+#----------------------------------------------------------------------
+# CHECK: ucvtf s22, s13, #32
+# CHECK: ucvtf d21, d14, #64
+0xb6,0xe5,0x20,0x7f
+0xd5,0xe5,0x40,0x7f
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Signed Fixed-point (Immediate)
+#----------------------------------------------------------------------
+# CHECK: fcvtzs s21, s12, #1
+# CHECK: fcvtzs d21, d12, #1
+0x95,0xfd,0x3f,0x5f
+0x95,0xfd,0x7f,0x5f
+        
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
+#----------------------------------------------------------------------
+# CHECK: fcvtzu s21, s12, #1
+# CHECK: fcvtzu d21, d12, #1
+0x95,0xfd,0x3f,0x7f
+0x95,0xfd,0x7f,0x7f
+
+#----------------------------------------------------------------------
+# Vector load/store multiple N-element structure
+#----------------------------------------------------------------------
+# CHECK: ld1 {v0.16b}, [x0]
+# CHECK: ld1 {v15.8h, v16.8h}, [x15]
+# CHECK: ld1 {v31.4s, v0.4s, v1.4s}, [sp]
+# CHECK: ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
+0x00,0x70,0x40,0x4c
+0xef,0xa5,0x40,0x4c
+0xff,0x6b,0x40,0x4c
+0x00,0x2c,0x40,0x4c
+
+# CHECK: ld2 {v0.8b, v1.8b}, [x0]
+# CHECK: ld3 {v15.4h, v16.4h, v17.4h}, [x15]
+# CHECK: ld4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+0x00,0x80,0x40,0x0c
+0xef,0x45,0x40,0x0c
+0xff,0x0b,0x40,0x0c
+
+# CHECK: st1 {v0.16b}, [x0]
+# CHECK: st1 {v15.8h, v16.8h}, [x15]
+# CHECK: st1 {v31.4s, v0.4s, v1.4s}, [sp]
+# CHECK: st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
+0x00,0x70,0x00,0x4c
+0xef,0xa5,0x00,0x4c
+0xff,0x6b,0x00,0x4c
+0x00,0x2c,0x00,0x4c
+
+# CHECK: st2 {v0.8b, v1.8b}, [x0]
+# CHECK: st3 {v15.4h, v16.4h, v17.4h}, [x15]
+# CHECK: st4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+0x00,0x80,0x00,0x0c
+0xef,0x45,0x00,0x0c
+0xff,0x0b,0x00,0x0c
+
+#----------------------------------------------------------------------
+# Vector load/store multiple N-element structure (post-index)
+#----------------------------------------------------------------------
+# CHECK: ld1 {v15.8h}, [x15], x2
+# CHECK: ld1 {v31.4s, v0.4s}, [sp], #32
+# CHECK: ld1 {v0.2d, v1.2d, v2.2d}, [x0], #48
+# CHECK: ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x3
+0xef,0x75,0xc2,0x4c
+0xff,0xab,0xdf,0x4c
+0x00,0x6c,0xdf,0x4c
+0x00,0x20,0xc3,0x0c
+
+# CHECK: ld2 {v0.16b, v1.16b}, [x0], x1
+# CHECK: ld3 {v15.8h, v16.8h, v17.8h}, [x15], x2
+# CHECK: ld4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #64
+0x00,0x80,0xc1,0x4c
+0xef,0x45,0xc2,0x4c
+0xff,0x0b,0xdf,0x4c
+
+
+# CHECK: st1 {v15.8h}, [x15], x2
+# CHECK: st1 {v31.4s, v0.4s}, [sp], #32
+# CHECK: st1 {v0.2d, v1.2d, v2.2d}, [x0], #48
+# CHECK: st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x3
+0xef,0x75,0x82,0x4c
+0xff,0xab,0x9f,0x4c
+0x00,0x6c,0x9f,0x4c
+0x00,0x20,0x83,0x0c
+
+# CHECK: st2 {v0.16b, v1.16b}, [x0], x1
+# CHECK: st3 {v15.8h, v16.8h, v17.8h}, [x15], x2
+# CHECK: st4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #64
+0x00,0x80,0x81,0x4c
+0xef,0x45,0x82,0x4c
+0xff,0x0b,0x9f,0x4c
+
+#----------------------------------------------------------------------
+# Vector load single N-element structure to all lane of N
+# consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1r {v0.16b}, [x0]
+# CHECK: ld1r {v15.8h}, [x15]
+# CHECK: ld2r {v31.4s, v0.4s}, [sp]
+# CHECK: ld2r {v0.2d, v1.2d}, [x0]
+# CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0]
+# CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15]
+# CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+# CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp]
+0x00,0xc0,0x40,0x4d
+0xef,0xc5,0x40,0x4d
+0xff,0xcb,0x60,0x4d
+0x00,0xcc,0x60,0x4d
+0x00,0xe0,0x40,0x0d
+0xef,0xe5,0x40,0x0d
+0xff,0xeb,0x60,0x0d
+0xff,0xef,0x60,0x0d
+
+#----------------------------------------------------------------------
+# Vector load/store single N-element structure to/from one lane of N
+# consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1 {v0.b}[9], [x0]
+# CHECK: ld2 {v15.h, v16.h}[7], [x15]
+# CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp]
+# CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0]
+# CHECK: st1 {v0.d}[1], [x0]
+# CHECK: st2 {v31.s, v0.s}[3], [sp]
+# CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15]
+# CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0]
+0x00,0x04,0x40,0x4d
+0xef,0x59,0x60,0x4d
+0xff,0xb3,0x40,0x4d
+0x00,0xa4,0x60,0x4d
+0x00,0x84,0x00,0x4d
+0xff,0x93,0x20,0x4d
+0xef,0x79,0x00,0x4d
+0x00,0x24,0x20,0x4d
+
+#----------------------------------------------------------------------
+# Post-index of vector load single N-element structure to all lane of N
+# consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1r {v0.16b}, [x0], #1
+# CHECK: ld1r {v15.8h}, [x15], #2
+# CHECK: ld2r {v31.4s, v0.4s}, [sp], #8
+# CHECK: ld2r {v0.2d, v1.2d}, [x0], #16
+# CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3
+# CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6
+# CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30
+# CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7
+0x00,0xc0,0xdf,0x4d
+0xef,0xc5,0xdf,0x4d
+0xff,0xcb,0xff,0x4d
+0x00,0xcc,0xff,0x4d
+0x00,0xe0,0xdf,0x0d
+0xef,0xe5,0xdf,0x0d
+0xff,0xeb,0xfe,0x0d
+0xff,0xef,0xe7,0x0d
+
+#----------------------------------------------------------------------
+# Post-index of vector load/store single N-element structure to/from
+#  one lane of N consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1 {v0.b}[9], [x0], #1
+# CHECK: ld2 {v15.h, v16.h}[7], [x15], #4
+# CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp], x3
+# CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
+# CHECK: ld4 {v0.h, v1.h, v2.h, v3.h}[7], [x0], x0
+# CHECK: st1 {v0.d}[1], [x0], #8
+# CHECK: st2 {v31.s, v0.s}[3], [sp], #8
+# CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15], #6
+# CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5
+0x00,0x04,0xdf,0x4d
+0xef,0x59,0xff,0x4d
+0xff,0xb3,0xc3,0x4d
+0x00,0xa4,0xff,0x4d
+0x00,0x78,0xe0,0x4d
+0x00,0x84,0x9f,0x4d
+0xff,0x93,0xbf,0x4d
+0xef,0x79,0x9f,0x4d
+0x00,0x24,0xa5,0x4d
+
+#----------------------------------------------------------------------
+# Bitwise extract
+#----------------------------------------------------------------------
+0x20,0x18,0x02,0x2e
+0x20,0x18,0x02,0x6e
+# CHECK: ext v0.8b, v1.8b, v2.8b, #0x3
+# CHECK: ext v0.16b, v1.16b, v2.16b, #0x3
+
+#----------------------------------------------------------------------
+# unzip with 3 same vectors to get primary result
+#----------------------------------------------------------------------
+# CHECK: uzp1	v1.8b, v1.8b, v2.8b
+# CHECK: uzp1	v2.16b, v1.16b, v2.16b
+# CHECK: uzp1	v3.4h, v1.4h, v2.4h
+# CHECK: uzp1	v4.8h, v1.8h, v2.8h
+# CHECK: uzp1	v5.2s, v1.2s, v2.2s
+# CHECK: uzp1	v6.4s, v1.4s, v2.4s
+# CHECK: uzp1	v7.2d, v1.2d, v2.2d
+0x21,0x18,0x02,0x0e
+0x22,0x18,0x02,0x4e
+0x23,0x18,0x42,0x0e
+0x24,0x18,0x42,0x4e
+0x25,0x18,0x82,0x0e
+0x26,0x18,0x82,0x4e
+0x27,0x18,0xc2,0x4e
+
+#----------------------------------------------------------------------
+# transpose with 3 same vectors to get primary result
+#----------------------------------------------------------------------
+# CHECK: trn1	v8.8b, v1.8b, v2.8b
+# CHECK: trn1	v9.16b, v1.16b, v2.16b
+# CHECK: trn1	v10.4h, v1.4h, v2.4h
+# CHECK: trn1	v27.8h, v7.8h, v2.8h
+# CHECK: trn1	v12.2s, v7.2s, v2.2s
+# CHECK: trn1	v29.4s, v6.4s, v2.4s
+# CHECK: trn1	v14.2d, v6.2d, v2.2d
+0x28,0x28,0x02,0x0e
+0x29,0x28,0x02,0x4e
+0x2a,0x28,0x42,0x0e
+0xfb,0x28,0x42,0x4e
+0xec,0x28,0x82,0x0e
+0xdd,0x28,0x82,0x4e
+0xce,0x28,0xc2,0x4e
+
+#----------------------------------------------------------------------
+# zip with 3 same vectors to get primary result
+#----------------------------------------------------------------------
+# CHECK: zip1	v31.8b, v5.8b, v2.8b
+# CHECK: zip1	v0.16b, v5.16b, v2.16b
+# CHECK: zip1	v17.4h, v4.4h, v2.4h
+# CHECK: zip1	v2.8h, v4.8h, v2.8h
+# CHECK: zip1	v19.2s, v3.2s, v2.2s
+# CHECK: zip1	v4.4s, v3.4s, v2.4s
+# CHECK: zip1	v21.2d, v2.2d, v2.2d
+0xbf,0x38,0x02,0x0e
+0xa0,0x38,0x02,0x4e
+0x91,0x38,0x42,0x0e
+0x82,0x38,0x42,0x4e
+0x73,0x38,0x82,0x0e
+0x64,0x38,0x82,0x4e
+0x55,0x38,0xc2,0x4e
+
+#----------------------------------------------------------------------
+# unzip with 3 same vectors to get secondary result
+#----------------------------------------------------------------------
+# CHECK: uzp2	v6.8b, v2.8b, v2.8b
+# CHECK: uzp2	v23.16b, v1.16b, v2.16b
+# CHECK: uzp2	v8.4h, v1.4h, v2.4h
+# CHECK: uzp2	v25.8h, v0.8h, v2.8h
+# CHECK: uzp2	v10.2s, v0.2s, v2.2s
+# CHECK: uzp2	v27.4s, v7.4s, v2.4s
+# CHECK: uzp2	v12.2d, v7.2d, v2.2d
+0x46,0x58,0x02,0x0e
+0x37,0x58,0x02,0x4e
+0x28,0x58,0x42,0x0e
+0x19,0x58,0x42,0x4e
+0x0a,0x58,0x82,0x0e
+0xfb,0x58,0x82,0x4e
+0xec,0x58,0xc2,0x4e
+
+#----------------------------------------------------------------------
+# transpose with 3 same vectors to get secondary result
+#----------------------------------------------------------------------
+# CHECK: trn2	v29.8b, v6.8b, v2.8b
+# CHECK: trn2	v14.16b, v6.16b, v2.16b
+# CHECK: trn2	v31.4h, v5.4h, v2.4h
+# CHECK: trn2	v0.8h, v5.8h, v2.8h
+# CHECK: trn2	v17.2s, v4.2s, v2.2s
+# CHECK: trn2	v2.4s, v4.4s, v2.4s
+# CHECK: trn2	v19.2d, v3.2d, v2.2d
+0xdd,0x68,0x02,0x0e
+0xce,0x68,0x02,0x4e
+0xbf,0x68,0x42,0x0e
+0xa0,0x68,0x42,0x4e
+0x91,0x68,0x82,0x0e
+0x82,0x68,0x82,0x4e
+0x73,0x68,0xc2,0x4e
+
+#----------------------------------------------------------------------
+# zip with 3 same vectors to get secondary result
+#----------------------------------------------------------------------
+# CHECK: zip2	v4.8b, v3.8b, v2.8b
+# CHECK: zip2	v21.16b, v2.16b, v2.16b
+# CHECK: zip2	v6.4h, v2.4h, v2.4h
+# CHECK: zip2	v23.8h, v1.8h, v2.8h
+# CHECK: zip2	v8.2s, v1.2s, v2.2s
+# CHECK: zip2	v25.4s, v0.4s, v2.4s
+# CHECK: zip2	v10.2d, v0.2d, v2.2d
+0x64,0x78,0x02,0x0e
+0x55,0x78,0x02,0x4e
+0x46,0x78,0x42,0x0e
+0x37,0x78,0x42,0x4e
+0x28,0x78,0x82,0x0e
+0x19,0x78,0x82,0x4e
+0x0a,0x78,0xc2,0x4e
+
+#----------------------------------------------------------------------
+# Scalar Floating Point  multiply (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: fmul s0, s1, v1.s[0]
+# CHECK: fmul s0, s1, v1.s[3]
+# CHECK: fmul d0, d1, v1.d[0]
+# CHECK: fmul d0, d1, v1.d[1]
+# CHECK: fmul d15, d15, v15.d[1]
+0x20 0x90 0x81 0x5f
+0x20 0x98 0xa1 0x5f
+0x20 0x90 0xc1 0x5f
+0x20 0x98 0xc1 0x5f
+0xef 0x99 0xcf 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Floating Point  multiply extended (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: fmulx s3, s5, v7.s[0]
+# CHECK: fmulx s3, s5, v7.s[3]
+# CHECK: fmulx s3, s5, v15.s[3]
+# CHECK: fmulx d0, d4, v8.d[0]
+# CHECK: fmulx d0, d4, v8.d[1]
+0xa3 0x90 0x87 0x7f
+0xa3 0x98 0xa7 0x7f
+0xa3 0x98 0xaf 0x7f
+0x80 0x90 0xc8 0x7f
+0x80 0x98 0xc8 0x7f
+
+#----------------------------------------------------------------------
+# Scalar Floating Point fused multiply-add (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: fmla s0, s1, v1.s[0]
+# CHECK: fmla s0, s1, v1.s[3]
+# CHECK: fmla d0, d1, v1.d[0]
+# CHECK: fmla d0, d1, v1.d[1]
+# CHECK: fmla d15, d15, v15.d[1]
+0x20 0x10 0x81 0x5f
+0x20 0x18 0xa1 0x5f
+0x20 0x10 0xc1 0x5f
+0x20 0x18 0xc1 0x5f
+0xef 0x19 0xcf 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Floating Point fused multiply-sub (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: fmls s3, s5, v7.s[0]
+# CHECK: fmls s3, s5, v7.s[3]
+# CHECK: fmls s3, s5, v15.s[3]
+# CHECK: fmls d0, d4, v8.d[0]
+# CHECK: fmls d0, d4, v8.d[1]
+0xa3 0x50 0x87 0x5f
+0xa3 0x58 0xa7 0x5f
+0xa3 0x58 0xaf 0x5f
+0x80 0x50 0xc8 0x5f
+0x80 0x58 0xc8 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Signed saturating doubling
+# multiply-add long (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: sqdmlal s0, h0, v0.h[0]
+# CHECK: sqdmlal s0, h0, v0.h[1]
+# CHECK: sqdmlal s0, h0, v0.h[2]
+# CHECK: sqdmlal s0, h0, v0.h[3]
+# CHECK: sqdmlal s0, h0, v0.h[4]
+# CHECK: sqdmlal s0, h0, v0.h[5]
+# CHECK: sqdmlal s0, h0, v0.h[6]
+# CHECK: sqdmlal s0, h0, v0.h[7]
+# CHECK: sqdmlal d8, s9, v15.s[0]
+# CHECK: sqdmlal d8, s9, v15.s[1]
+# CHECK: sqdmlal d8, s9, v15.s[2]
+# CHECK: sqdmlal d8, s9, v15.s[3]
+0x00 0x30 0x40 0x5f
+0x00 0x30 0x50 0x5f
+0x00 0x30 0x60 0x5f
+0x00 0x30 0x70 0x5f
+0x00 0x38 0x40 0x5f
+0x00 0x38 0x50 0x5f
+0x00 0x38 0x60 0x5f
+0x00 0x38 0x70 0x5f
+0x28 0x31 0x8f 0x5f
+0x28 0x31 0xaf 0x5f
+0x28 0x39 0x8f 0x5f
+0x28 0x39 0xaf 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Signed saturating doubling
+# multiply-sub long (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: sqdmlsl s0, h0, v0.h[0]
+# CHECK: sqdmlsl s0, h0, v0.h[1]
+# CHECK: sqdmlsl s0, h0, v0.h[2]
+# CHECK: sqdmlsl s0, h0, v0.h[3]
+# CHECK: sqdmlsl s0, h0, v0.h[4]
+# CHECK: sqdmlsl s0, h0, v0.h[5]
+# CHECK: sqdmlsl s0, h0, v0.h[6]
+# CHECK: sqdmlsl s0, h0, v0.h[7]
+# CHECK: sqdmlsl d8, s9, v15.s[0]
+# CHECK: sqdmlsl d8, s9, v15.s[1]
+# CHECK: sqdmlsl d8, s9, v15.s[2]
+# CHECK: sqdmlsl d8, s9, v15.s[3]
+0x00 0x70 0x40 0x5f
+0x00 0x70 0x50 0x5f
+0x00 0x70 0x60 0x5f
+0x00 0x70 0x70 0x5f
+0x00 0x78 0x40 0x5f
+0x00 0x78 0x50 0x5f
+0x00 0x78 0x60 0x5f
+0x00 0x78 0x70 0x5f
+0x28 0x71 0x8f 0x5f
+0x28 0x71 0xaf 0x5f
+0x28 0x79 0x8f 0x5f
+0x28 0x79 0xaf 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Signed saturating doubling multiply long (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: sqdmull s1, h1, v1.h[0]
+# CHECK: sqdmull s1, h1, v1.h[1]
+# CHECK: sqdmull s1, h1, v1.h[2]
+# CHECK: sqdmull s1, h1, v1.h[3]
+# CHECK: sqdmull s1, h1, v1.h[4]
+# CHECK: sqdmull s1, h1, v1.h[5]
+# CHECK: sqdmull s1, h1, v1.h[6]
+# CHECK: sqdmull s1, h1, v1.h[7]
+# CHECK: sqdmull d1, s1, v4.s[0]
+# CHECK: sqdmull d1, s1, v4.s[1]
+# CHECK: sqdmull d1, s1, v4.s[2]
+# CHECK: sqdmull d1, s1, v4.s[3]
+0x21 0xb0 0x41 0x5f
+0x21 0xb0 0x51 0x5f
+0x21 0xb0 0x61 0x5f
+0x21 0xb0 0x71 0x5f
+0x21 0xb8 0x41 0x5f
+0x21 0xb8 0x51 0x5f
+0x21 0xb8 0x61 0x5f
+0x21 0xb8 0x71 0x5f
+0x21 0xb0 0x84 0x5f
+0x21 0xb0 0xa4 0x5f
+0x21 0xb8 0x84 0x5f
+0x21 0xb8 0xa4 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Signed saturating doubling multiply returning
+# high half (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: sqdmulh h7, h1, v14.h[0]
+# CHECK: sqdmulh h7, h15, v8.h[1]
+# CHECK: sqdmulh h7, h15, v8.h[2]
+# CHECK: sqdmulh h7, h15, v8.h[3]
+# CHECK: sqdmulh h7, h15, v8.h[4]
+# CHECK: sqdmulh h7, h15, v8.h[5]
+# CHECK: sqdmulh h7, h15, v8.h[6]
+# CHECK: sqdmulh h7, h15, v8.h[7]
+# CHECK: sqdmulh s15, s3, v4.s[0]
+# CHECK: sqdmulh s15, s14, v16.s[1]
+# CHECK: sqdmulh s15, s15, v16.s[2]
+# CHECK: sqdmulh s15, s16, v17.s[3]
+0x27 0xc0 0x4e 0x5f
+0xe7 0xc1 0x58 0x5f
+0xe7 0xc1 0x68 0x5f
+0xe7 0xc1 0x78 0x5f
+0xe7 0xc9 0x48 0x5f
+0xe7 0xc9 0x58 0x5f
+0xe7 0xc9 0x68 0x5f
+0xe7 0xc9 0x78 0x5f
+0x6f 0xc0 0x84 0x5f
+0xcf 0xc1 0xb0 0x5f
+0xef 0xc9 0x90 0x5f
+0x0f 0xca 0xb1 0x5f
+
+#----------------------------------------------------------------------
+# Scalar Signed saturating rounding doubling multiply
+# returning high half (scalar, by element)
+#----------------------------------------------------------------------
+# CHECK: sqrdmulh h7, h1, v14.h[0]
+# CHECK: sqrdmulh h7, h15, v8.h[1]
+# CHECK: sqrdmulh h7, h15, v8.h[2]
+# CHECK: sqrdmulh h7, h15, v8.h[3]
+# CHECK: sqrdmulh h7, h15, v8.h[4]
+# CHECK: sqrdmulh h7, h15, v8.h[5]
+# CHECK: sqrdmulh h7, h15, v8.h[6]
+# CHECK: sqrdmulh h7, h15, v8.h[7]
+# CHECK: sqrdmulh s15, s3, v4.s[0]
+# CHECK: sqrdmulh s15, s14, v16.s[1]
+# CHECK: sqrdmulh s15, s15, v16.s[2]
+# CHECK: sqrdmulh s15, s16, v17.s[3]
+0x27 0xd0 0x4e 0x5f
+0xe7 0xd1 0x58 0x5f
+0xe7 0xd1 0x68 0x5f
+0xe7 0xd1 0x78 0x5f
+0xe7 0xd9 0x48 0x5f
+0xe7 0xd9 0x58 0x5f
+0xe7 0xd9 0x68 0x5f
+0xe7 0xd9 0x78 0x5f
+0x6f 0xd0 0x84 0x5f
+0xcf 0xd1 0xb0 0x5f
+0xef 0xd9 0x90 0x5f
+0x0f 0xda 0xb1 0x5f
+
+#----------------------------------------------------------------------
+#Duplicate element (scalar)
+#----------------------------------------------------------------------
+# CHECK: dup b0, v0.b[15]
+# CHECK: dup h2, v31.h[5]
+# CHECK: dup s17, v2.s[2]
+# CHECK: dup d6, v12.d[1]
+0x00 0x04 0x1f 0x5e
+0xe2 0x07 0x16 0x5e
+0x51 0x04 0x14 0x5e
+0x86 0x05 0x18 0x5e
+
+#----------------------------------------------------------------------
+# Table look up
+#----------------------------------------------------------------------
+0x20,0x00,0x02,0x0e
+0xf0,0x23,0x02,0x0e
+0x20,0x40,0x02,0x0e
+0xf0,0x62,0x02,0x0e
+# CHECK: tbl v0.8b, {v1.16b}, v2.8b
+# CHECK: tbl v16.8b, {v31.16b, v0.16b}, v2.8b
+# CHECK: tbl v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b
+# CHECK: tbl v16.8b, {v23.16b, v24.16b, v25.16b, v26.16b}, v2.8b
+
+0x20,0x00,0x02,0x4e
+0xf0,0x23,0x02,0x4e
+0x20,0x40,0x02,0x4e
+0xe0,0x63,0x02,0x4e
+# CHECK: tbl v0.16b, {v1.16b}, v2.16b
+# CHECK: tbl v16.16b, {v31.16b, v0.16b}, v2.16b
+# CHECK: tbl v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b
+# CHECK: tbl v0.16b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.16b
+
+0x20,0x10,0x02,0x0e
+0xf0,0x33,0x02,0x0e
+0x20,0x50,0x02,0x0e
+0xf0,0x72,0x02,0x0e
+# CHECK: tbx v0.8b, {v1.16b}, v2.8b
+# CHECK: tbx v16.8b, {v31.16b, v0.16b}, v2.8b
+# CHECK: tbx v0.8b, {v1.16b, v2.16b, v3.16b}, v2.8b
+# CHECK: tbx v16.8b, {v23.16b, v24.16b, v25.16b, v26.16b}, v2.8b
+
+0x20,0x10,0x02,0x4e
+0xf0,0x33,0x02,0x4e
+0x20,0x50,0x02,0x4e
+0xf0,0x73,0x02,0x4e
+# CHECK: tbx v0.16b, {v1.16b}, v2.16b
+# CHECK: tbx v16.16b, {v31.16b, v0.16b}, v2.16b
+# CHECK: tbx v0.16b, {v1.16b, v2.16b, v3.16b}, v2.16b
+# CHECK: tbx v16.16b, {v31.16b, v0.16b, v1.16b, v2.16b}, v2.16b
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Lower Precision Narrow, Rounding To
+# Odd
+#----------------------------------------------------------------------
+# CHECK: fcvtxn s22, d13
+0xb6,0x69,0x61,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Signed Integer, Rounding To Nearest
+# With Ties To Away
+#----------------------------------------------------------------------
+# CHECK: fcvtas s12, s13
+# CHECK: fcvtas d21, d14
+
+0xac,0xc9,0x21,0x5e
+0xd5,0xc9,0x61,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Unsigned Integer, Rounding To
+# Nearest With Ties To Away
+#----------------------------------------------------------------------
+# CHECK: fcvtau s12, s13
+# CHECK: fcvtau d21, d14
+0xac,0xc9,0x21,0x7e
+0xd5,0xc9,0x61,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Signed Integer, Rounding Toward
+# Minus Infinity
+#----------------------------------------------------------------------
+# CHECK: fcvtms s22, s13
+# CHECK: fcvtms d21, d14
+0xb6,0xb9,0x21,0x5e
+0xd5,0xb9,0x61,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Unsigned Integer, Rounding Toward
+# Minus Infinity
+#----------------------------------------------------------------------
+# CHECK: fcvtmu s12, s13
+# CHECK: fcvtmu d21, d14
+0xac,0xb9,0x21,0x7e
+0xd5,0xb9,0x61,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Signed Integer, Rounding To Nearest
+# With Ties To Even
+#----------------------------------------------------------------------
+
+# CHECK: fcvtns s22, s13
+# CHECK: fcvtns d21, d14
+
+0xb6,0xa9,0x21,0x5e
+0xd5,0xa9,0x61,0x5e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Unsigned Integer, Rounding To
+# Nearest With Ties To Even
+#----------------------------------------------------------------------
+
+# CHECK: fcvtnu s12, s13
+# CHECK: fcvtnu d21, d14
+0xac,0xa9,0x21,0x7e
+0xd5,0xa9,0x61,0x7e
+        
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Signed Integer, Rounding Toward
+# Positive Infinity
+#----------------------------------------------------------------------
+# CHECK: fcvtps s22, s13
+# CHECK: fcvtps d21, d14
+0xb6,0xa9,0xa1,0x5e
+0xd5,0xa9,0xe1,0x5e
+        
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Unsigned Integer, Rounding Toward
+# Positive Infinity
+#----------------------------------------------------------------------
+# CHECK: fcvtpu s12, s13
+# CHECK: fcvtpu d21, d14
+0xac,0xa9,0xa1,0x7e
+0xd5,0xa9,0xe1,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Signed Integer, Rounding Toward Zero
+#----------------------------------------------------------------------
+# CHECK: fcvtzs s12, s13
+# CHECK: fcvtzs d21, d14
+0xac,0xb9,0xa1,0x5e
+0xd5,0xb9,0xe1,0x5e
+        
+#----------------------------------------------------------------------
+# Scalar Floating-point Convert To Unsigned Integer, Rounding Toward 
+# Zero
+#----------------------------------------------------------------------
+# CHECK: fcvtzu s12, s13
+# CHECK: fcvtzu d21, d14
+0xac,0xb9,0xa1,0x7e
+0xd5,0xb9,0xe1,0x7e
+
+#----------------------------------------------------------------------
+# Scalar Floating-point Absolute Difference
+#----------------------------------------------------------------------
+# CHECK: fabd s29, s24, s20
+# CHECK: fabd d29, d24, d20
+0x1d,0xd7,0xb4,0x7e
+0x1d,0xd7,0xf4,0x7e
diff --git a/test/MC/Disassembler/ARM/basic-arm-instructions-v8.txt b/test/MC/Disassembler/ARM/basic-arm-instructions-v8.txt
new file mode 100644
index 0000000..d9286bf
--- /dev/null
+++ b/test/MC/Disassembler/ARM/basic-arm-instructions-v8.txt
@@ -0,0 +1,58 @@
+# RUN: llvm-mc -disassemble -triple armv8 -mattr=+db -show-encoding < %s | FileCheck %s
+
+# New v8 ARM instructions
+
+# HLT
+
+0x70 0x00 0x00 0xe1
+# CHECK: hlt #0
+
+0x7f 0xff 0x0f 0xe1
+# CHECK: hlt #65535
+
+0x59 0xf0 0x7f 0xf5
+0x51 0xf0 0x7f 0xf5
+0x55 0xf0 0x7f 0xf5
+0x5d 0xf0 0x7f 0xf5
+# CHECK: dmb ishld
+# CHECK: dmb oshld
+# CHECK: dmb nshld
+# CHECK: dmb ld
+
+0x05 0xf0 0x20 0xe3
+# CHECK: sevl
+
+
+# These are the only coprocessor instructions that remain defined in ARMv8
+# (The operations on p10/p11 disassemble into FP/NEON instructions)
+
+0x10 0x0e 0x00 0xee
+# CHECK: mcr p14
+
+0x10 0x0f 0x00 0xee
+# CHECK: mcr p15
+
+0x10 0x0e 0x10 0xee
+# CHECK: mrc p14
+
+0x10 0x0f 0x10 0xee
+# CHECK: mrc p15
+
+0x00 0x0e 0x40 0xec
+# CHECK: mcrr p14
+
+0x00 0x0f 0x40 0xec
+# CHECK: mcrr p15
+
+0x00 0x0e 0x50 0xec
+# CHECK: mrrc p14
+
+0x00 0x0f 0x50 0xec
+# CHECK: mrrc p15
+
+0x00 0x0e 0x80 0xec
+# CHECK: stc p14
+
+0x00 0x0e 0x90 0xec
+# CHECK: ldc p14
+
diff --git a/test/MC/Disassembler/ARM/basic-arm-instructions.txt b/test/MC/Disassembler/ARM/basic-arm-instructions.txt
index fd36268..8bcf4e6 100644
--- a/test/MC/Disassembler/ARM/basic-arm-instructions.txt
+++ b/test/MC/Disassembler/ARM/basic-arm-instructions.txt
@@ -2420,6 +2420,7 @@
 # CHECK: wfilt
 # CHECK: yield
 # CHECK: yieldne
+# CHECK: hint #5
 
 0x02 0xf0 0x20 0xe3
 0x02 0xf0 0x20 0x83
@@ -2427,3 +2428,4 @@
 0x03 0xf0 0x20 0xb3
 0x01 0xf0 0x20 0xe3
 0x01 0xf0 0x20 0x13
+0x05 0xf0 0x20 0xe3
diff --git a/test/MC/Disassembler/ARM/crc32-thumb.txt b/test/MC/Disassembler/ARM/crc32-thumb.txt
new file mode 100644
index 0000000..2f83b58
--- /dev/null
+++ b/test/MC/Disassembler/ARM/crc32-thumb.txt
@@ -0,0 +1,15 @@
+# RUN: llvm-mc --disassemble %s -triple=thumbv8 2>&1 | FileCheck %s
+
+# CHECK:  crc32b  r0, r1, r2
+# CHECK:  crc32h  r0, r1, r2
+# CHECK:  crc32w  r0, r1, r2
+# CHECK:  crc32cb r0, r1, r2
+# CHECK:  crc32ch r0, r1, r2
+# CHECK:  crc32cw r0, r1, r2
+
+0xc1 0xfa 0x82 0xf0
+0xc1 0xfa 0x92 0xf0
+0xc1 0xfa 0xa2 0xf0
+0xd1 0xfa 0x82 0xf0
+0xd1 0xfa 0x92 0xf0
+0xd1 0xfa 0xa2 0xf0
diff --git a/test/MC/Disassembler/ARM/crc32.txt b/test/MC/Disassembler/ARM/crc32.txt
new file mode 100644
index 0000000..17bb032
--- /dev/null
+++ b/test/MC/Disassembler/ARM/crc32.txt
@@ -0,0 +1,15 @@
+# RUN: llvm-mc --disassemble %s -triple=armv8 2>&1 | FileCheck %s
+
+# CHECK:  crc32b  r0, r1, r2
+# CHECK:  crc32h  r0, r1, r2
+# CHECK:  crc32w  r0, r1, r2
+# CHECK:  crc32cb r0, r1, r2
+# CHECK:  crc32ch r0, r1, r2
+# CHECK:  crc32cw r0, r1, r2
+
+0x42 0x00 0x01 0xe1
+0x42 0x00 0x21 0xe1
+0x42 0x00 0x41 0xe1
+0x42 0x02 0x01 0xe1
+0x42 0x02 0x21 0xe1
+0x42 0x02 0x41 0xe1
diff --git a/test/MC/Disassembler/ARM/fp-armv8.txt b/test/MC/Disassembler/ARM/fp-armv8.txt
new file mode 100644
index 0000000..46a26f5
--- /dev/null
+++ b/test/MC/Disassembler/ARM/fp-armv8.txt
@@ -0,0 +1,160 @@
+# RUN: llvm-mc -disassemble -triple armv8 -mattr=+fp-armv8 -show-encoding < %s | FileCheck %s
+
+0xe0 0x3b 0xb2 0xee
+# CHECK: vcvtt.f64.f16 d3, s1
+
+0xcc 0x2b 0xf3 0xee
+# CHECK: vcvtt.f16.f64 s5, d12
+
+0x60 0x3b 0xb2 0xee
+# CHECK: vcvtb.f64.f16 d3, s1
+
+0x41 0x2b 0xb3 0xee
+# CHECK: vcvtb.f16.f64 s4, d1
+
+0xe0 0x3b 0xb2 0xae
+# CHECK: vcvttge.f64.f16 d3, s1
+
+0xcc 0x2b 0xf3 0xce
+# CHECK: vcvttgt.f16.f64 s5, d12
+
+0x60 0x3b 0xb2 0x0e
+# CHECK: vcvtbeq.f64.f16 d3, s1
+
+0x41 0x2b 0xb3 0xbe
+# CHECK: vcvtblt.f16.f64 s4, d1
+
+
+0xe1 0x1a 0xbc 0xfe
+# CHECK: vcvta.s32.f32 s2, s3
+
+0xc3 0x1b 0xbc 0xfe
+# CHECK: vcvta.s32.f64 s2, d3
+
+0xeb 0x3a 0xbd 0xfe
+# CHECK: vcvtn.s32.f32 s6, s23
+
+0xe7 0x3b 0xbd 0xfe
+# CHECK: vcvtn.s32.f64 s6, d23
+
+0xc2 0x0a 0xbe 0xfe
+# CHECK: vcvtp.s32.f32 s0, s4
+
+0xc4 0x0b 0xbe 0xfe
+# CHECK: vcvtp.s32.f64 s0, d4
+
+0xc4 0x8a 0xff 0xfe
+# CHECK: vcvtm.s32.f32 s17, s8
+
+0xc8 0x8b 0xff 0xfe
+# CHECK: vcvtm.s32.f64 s17, d8
+
+0x61 0x1a 0xbc 0xfe
+# CHECK: vcvta.u32.f32 s2, s3
+
+0x43 0x1b 0xbc 0xfe
+# CHECK: vcvta.u32.f64 s2, d3
+
+0x6b 0x3a 0xbd 0xfe
+# CHECK: vcvtn.u32.f32 s6, s23
+
+0x67 0x3b 0xbd 0xfe
+# CHECK: vcvtn.u32.f64 s6, d23
+
+0x42 0x0a 0xbe 0xfe
+# CHECK: vcvtp.u32.f32 s0, s4
+
+0x44 0x0b 0xbe 0xfe
+# CHECK: vcvtp.u32.f64 s0, d4
+
+0x44 0x8a 0xff 0xfe
+# CHECK: vcvtm.u32.f32 s17, s8
+
+0x48 0x8b 0xff 0xfe
+# CHECK: vcvtm.u32.f64 s17, d8
+
+
+0xab 0x2a 0x20 0xfe
+# CHECK: vselge.f32 s4, s1, s23
+
+0xa7 0xeb 0x6f 0xfe
+# CHECK: vselge.f64 d30, d31, d23
+
+0x80 0x0a 0x30 0xfe
+# CHECK: vselgt.f32 s0, s1, s0
+
+0x24 0x5b 0x3a 0xfe
+# CHECK: vselgt.f64 d5, d10, d20
+
+0x2b 0xfa 0x0e 0xfe
+# CHECK: vseleq.f32 s30, s28, s23
+
+0x08 0x2b 0x04 0xfe
+# CHECK: vseleq.f64 d2, d4, d8
+
+0x07 0xaa 0x58 0xfe
+# CHECK: vselvs.f32 s21, s16, s14
+
+0x2f 0x0b 0x11 0xfe
+# CHECK: vselvs.f64 d0, d1, d31
+
+
+0x00 0x2a 0xc6 0xfe
+# CHECK: vmaxnm.f32 s5, s12, s0
+
+0xae 0x5b 0x86 0xfe
+# CHECK: vmaxnm.f64 d5, d22, d30
+
+0x46 0x0a 0x80 0xfe
+# CHECK: vminnm.f32 s0, s0, s12
+
+0x49 0x4b 0x86 0xfe
+# CHECK: vminnm.f64 d4, d6, d9
+
+
+0xcc 0x3b 0xb6 0xae
+# CHECK: vrintzge.f64 d3, d12
+
+0xcc 0x1a 0xf6 0xee
+# CHECK: vrintz.f32 s3, s24
+
+0x40 0x5b 0xb6 0xbe
+# CHECK: vrintrlt.f64 d5, d0
+
+0x64 0x0a 0xb6 0xee
+# CHECK: vrintr.f32 s0, s9
+
+0x6e 0xcb 0xf7 0x0e
+# CHECK: vrintxeq.f64 d28, d30
+
+0x47 0x5a 0xb7 0x6e
+# CHECK: vrintxvs.f32 s10, s14
+
+0x44 0x3b 0xb8 0xfe
+# CHECK: vrinta.f64 d3, d4
+
+0x60 0x6a 0xb8 0xfe
+# CHECK: vrinta.f32 s12, s1
+
+0x44 0x3b 0xb9 0xfe
+# CHECK: vrintn.f64 d3, d4
+
+0x60 0x6a 0xb9 0xfe
+# CHECK: vrintn.f32 s12, s1
+
+0x44 0x3b 0xba 0xfe
+# CHECK: vrintp.f64 d3, d4
+
+0x60 0x6a 0xba 0xfe
+# CHECK: vrintp.f32 s12, s1
+
+0x44 0x3b 0xbb 0xfe
+# CHECK: vrintm.f64 d3, d4
+
+0x60 0x6a 0xbb 0xfe
+# CHECK: vrintm.f32 s12, s1
+
+
+0x10 0xa 0xf5 0xee
+# CHECK: vmrs r0, mvfr2
+
diff --git a/test/MC/Disassembler/ARM/invalid-armv7.txt b/test/MC/Disassembler/ARM/invalid-armv7.txt
index be79326..550173f 100644
--- a/test/MC/Disassembler/ARM/invalid-armv7.txt
+++ b/test/MC/Disassembler/ARM/invalid-armv7.txt
@@ -69,14 +69,6 @@
 # Undefined encoding space for hint instructions
 #------------------------------------------------------------------------------
 
-[0x05 0xf0 0x20 0xe3]
-# CHECK: invalid instruction encoding
-# CHECK-NEXT: [0x05 0xf0 0x20 0xe3]
-
-[0x41 0xf0 0x20 0xe3]
-# CHECK: invalid instruction encoding
-# CHECK-NEXT: [0x41 0xf0 0x20 0xe3]
-
 # FIXME: is it "dbg #14" or not????
 [0xfe 0xf0 0x20 0xe3]
 # CHCK: invalid instruction encoding
@@ -183,7 +175,7 @@
 # | 1: 1: 1: 1| 0: 0: 0: 1| 1: 0: 1: 1| 1: 1: 0: 0| 1: 1: 0: 1| 0: 0: 0: 1| 0: 0: 0: 0| 0: 0: 1: 0|
 # -------------------------------------------------------------------------------------------------
 # To qualify as an LSL (immediate) instruction, Inst{19-16} "should" be 0b0000, instead it is = 0b1100.
-# The instruction is UNPREDICTABLE, and is not a valid intruction.
+# The instruction is UNPREDICTABLE, and is not a valid instruction.
 #
 # See also
 # A8.6.88 LSL (immediate)
@@ -201,7 +193,7 @@
 # | 1: 1: 1: 1| 0: 0: 0: 1| 1: 0: 1: 1| 1: 1: 0: 0| 1: 1: 0: 1| 0: 0: 0: 0| 0: 0: 0: 0| 0: 0: 1: 0|
 # -------------------------------------------------------------------------------------------------
 # To qualify as a MOV (register) instruction, Inst{19-16} "should" be 0b0000, instead it is = 0b1100.
-# The instruction is UNPREDICTABLE, and is not a valid intruction.
+# The instruction is UNPREDICTABLE, and is not a valid instruction.
 #
 # See also
 # A8.6.97 MOV (register)
diff --git a/test/MC/Disassembler/ARM/invalid-armv8.txt b/test/MC/Disassembler/ARM/invalid-armv8.txt
new file mode 100644
index 0000000..772ff1d
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-armv8.txt
@@ -0,0 +1,167 @@
+# RUN: not llvm-mc -triple armv8 -show-encoding -disassemble %s 2>&1 | FileCheck %s
+
+# Coprocessors other than CP10, CP11, CP14 and CP15 are undefined in ARMv8;
+# but in ARMv7, all these instructions are valid
+
+# RUN: llvm-mc -triple armv7 -show-encoding -disassemble %s | FileCheck %s --check-prefix=CHECK-V7
+
+[0x00 0x01 0x00 0xee]
+# CHECK-V7: cdp
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x01 0x00 0xee]
+
+[0x00 0x0e 0x00 0xee]
+# CHECK-V7: cdp
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0e 0x00 0xee]
+
+[0x00 0x0f 0x00 0xee]
+# CHECK-V7: cdp
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0f 0x00 0xee]
+
+[0x00 0x01 0x00 0xfe]
+# CHECK-V7: cdp2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x01 0x00 0xfe]
+
+[0x00 0x0e 0x00 0xfe]
+# CHECK-V7: cdp2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0e 0x00 0xfe]
+
+[0x00 0x0f 0x00 0xfe]
+# CHECK-V7: cdp2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0f 0x00 0xfe]
+
+[0x10 0x01 0x00 0xee]
+# CHECK-V7: mcr
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0x01 0x00 0xee]
+
+[0x10 0x01 0x00 0xfe]
+# CHECK-V7: mcr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0x01 0x00 0xfe]
+
+[0x10 0x0e 0x00 0xfe]
+# CHECK-V7: mcr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0x0e 0x00 0xfe]
+
+[0x10 0x0f 0x00 0xfe]
+# CHECK-V7: mcr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0x0f 0x00 0xfe]
+
+[0x10 0x01 0x10 0xee]
+# CHECK-V7: mrc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0x01 0x10 0xee]
+
+[0x10 0x01 0x10 0xfe]
+# CHECK-V7: mrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0x01 0x10 0xfe]
+
+[0x10 0x0e 0x10 0xfe]
+# CHECK-V7: mrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0x0e 0x10 0xfe]
+
+[0x10 0x0f 0x10 0xfe]
+# CHECK-V7: mrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0x0f 0x10 0xfe]
+
+[0x00 0x01 0x40 0xec]
+# CHECK-V7: mcrr
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x01 0x40 0xec]
+
+[0x00 0x01 0x40 0xfc]
+# CHECK-V7: mcrr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x01 0x40 0xfc]
+
+[0x00 0x0e 0x40 0xfc]
+# CHECK-V7: mcrr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0e 0x40 0xfc]
+
+[0x00 0x0f 0x40 0xfc]
+# CHECK-V7: mcrr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0f 0x40 0xfc]
+
+[0x00 0x01 0x50 0xec]
+# CHECK-V7: mrrc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x01 0x50 0xec]
+
+[0x00 0x0e 0x50 0xfc]
+# CHECK-V7: mrrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0e 0x50 0xfc]
+
+[0x00 0x0f 0x50 0xfc]
+# CHECK-V7: mrrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0f 0x50 0xfc]
+
+[0x00 0x01 0x50 0xfc]
+# CHECK-V7: mrrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x01 0x50 0xfc]
+
+[0x00 0x01 0x80 0xec]
+# CHECK-V7: stc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x01 0x80 0xec]
+
+[0x00 0x0f 0x80 0xec]
+# CHECK-V7: stc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0f 0x80 0xec]
+
+[0x00 0x01 0x80 0xfc]
+# CHECK-V7: stc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x01 0x80 0xfc]
+
+[0x00 0x0e 0x80 0xfc]
+# CHECK-V7: stc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0e 0x80 0xfc]
+
+[0x00 0x0f 0x80 0xfc]
+# CHECK-V7: stc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0f 0x80 0xfc]
+
+[0x00 0x01 0x90 0xec]
+# CHECK-V7: ldc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x01 0x90 0xec]
+
+[0x00 0x0f 0x90 0xec]
+# CHECK-V7: ldc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0f 0x90 0xec]
+
+[0x00 0x01 0x90 0xfc]
+# CHECK-V7: ldc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x01 0x90 0xfc]
+
+[0x00 0x0e 0x90 0xfc]
+# CHECK-V7: ldc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0e 0x90 0xfc]
+
+[0x00 0x0f 0x90 0xfc]
+# CHECK-V7: ldc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0x0f 0x90 0xfc]
+
diff --git a/test/MC/Disassembler/ARM/invalid-because-armv7.txt b/test/MC/Disassembler/ARM/invalid-because-armv7.txt
index 4bf4833..beed8e4 100644
--- a/test/MC/Disassembler/ARM/invalid-because-armv7.txt
+++ b/test/MC/Disassembler/ARM/invalid-because-armv7.txt
@@ -18,3 +18,9 @@
 [0x41 0x2b 0xb3 0xbe]
 # CHECK: invalid instruction encoding
 # CHECK-NEXT: [0x41 0x2b 0xb3 0xbe]
+
+# Would be vmrs r0, mvfr2
+[0x10 0xa 0xf5 0xee]
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0xa 0xf5 0xee]
+
diff --git a/test/MC/Disassembler/ARM/invalid-thumbv7.txt b/test/MC/Disassembler/ARM/invalid-thumbv7.txt
index f465b3c..2c84b8a 100644
--- a/test/MC/Disassembler/ARM/invalid-thumbv7.txt
+++ b/test/MC/Disassembler/ARM/invalid-thumbv7.txt
@@ -32,16 +32,6 @@
 # CHECK: invalid instruction encoding
 # CHECK-NEXT: [0x6f 0xde]
 
-
-#------------------------------------------------------------------------------
-# Undefined encoding space for hint instructions
-#------------------------------------------------------------------------------
-
-[0xaf 0xf3 0x05 0x80]
-# CHECK: invalid instruction encoding
-# CHECK-NEXT: [0xaf 0xf3 0x05 0x80]
-
-
 #------------------------------------------------------------------------------
 # Undefined encoding for it
 #------------------------------------------------------------------------------
@@ -50,10 +40,7 @@
 # CHECK: potentially undefined instruction encoding
 # CHECK-NEXT: [0xff 0xbf 0x6b 0x80 0x00 0x75]
 
-# mask = 0
-[0x50 0xbf 0x00 0x00]
-# CHECK: invalid instruction encoding
-# CHECK-NEXT: [0x50 0xbf 0x00 0x00]
+[0x50 0xbf] # hint #5; legal as the third instruction for the iteee above
 
 # Two warnings from this block since there are two instructions in there
 [0xdb 0xbf 0x42 0xbb]
@@ -402,3 +389,19 @@
 [0x80 0xf9 0x30 0x0b]
 # CHECK: invalid instruction encoding
 # CHECK-NEXT: [0x80 0xf9 0x30 0x0b]
+
+
+#------------------------------------------------------------------------------
+# Unpredictable STMs
+#------------------------------------------------------------------------------
+
+# 32-bit Thumb STM instructions cannot have a writeback register which appears
+# in the list.
+
+[0xa1,0xe8,0x07,0x04]
+# CHECK: warning: potentially undefined instruction encoding
+# CHECK-NEXT: [0xa1,0xe8,0x07,0x04]
+
+[0x21,0xe9,0x07,0x04]
+# CHECK: warning: potentially undefined instruction encoding
+# CHECK-NEXT: [0x21,0xe9,0x07,0x04]
diff --git a/test/MC/Disassembler/ARM/invalid-thumbv8.txt b/test/MC/Disassembler/ARM/invalid-thumbv8.txt
new file mode 100644
index 0000000..4c6b249
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-thumbv8.txt
@@ -0,0 +1,167 @@
+# RUN: not llvm-mc -disassemble %s -show-encoding -triple thumbv8 2>&1 | FileCheck %s
+
+# Coprocessors other than CP10, CP11, CP14 and CP15 are undefined in ARMv8;
+# but in ARMv7, all these instructions are valid
+
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble %s | FileCheck %s --check-prefix=CHECK-V7
+
+[0x00 0xee 0x00 0x01]
+# CHECK-V7: cdp
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0xee 0x00 0x01]
+
+[0x00 0xee 0x00 0x0e]
+# CHECK-V7: cdp
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0xee 0x00 0x0e]
+
+[0x00 0xee 0x00 0x0f]
+# CHECK-V7: cdp
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0xee 0x00 0x0f]
+
+[0x00 0xfe 0x00 0x01]
+# CHECK-V7: cdp2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0xfe 0x00 0x01]
+
+[0x00 0xfe 0x00 0x0e]
+# CHECK-V7: cdp2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0xfe 0x00 0x0e]
+
+[0x00 0xfe 0x00 0x0f]
+# CHECK-V7: cdp2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0xfe 0x00 0x0f]
+
+[0x00 0xee 0x10 0x01]
+# CHECK-V7: mcr
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0xee 0x10 0x01]
+
+[0x00 0xfe 0x10 0x01]
+# CHECK-V7: mcr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0xfe 0x10 0x01]
+
+[0x00 0xfe 0x10 0x0e]
+# CHECK-V7: mcr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0xfe 0x10 0x0e]
+
+[0x00 0xfe 0x10 0x0f]
+# CHECK-V7: mcr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x00 0xfe 0x10 0x0f]
+
+[0x10 0xee 0x10 0x01]
+# CHECK-V7: mrc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0xee 0x10 0x01]
+
+[0x10 0xfe 0x10 0x01]
+# CHECK-V7: mrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0xfe 0x10 0x01]
+
+[0x10 0xfe 0x10 0x0e]
+# CHECK-V7: mrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0xfe 0x10 0x0e]
+
+[0x10 0xfe 0x10 0x0f]
+# CHECK-V7: mrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x10 0xfe 0x10 0x0f]
+
+[0x40 0xec 0x00 0x01]
+# CHECK-V7: mcrr
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x40 0xec 0x00 0x01]
+
+[0x40 0xfc 0x00 0x01]
+# CHECK-V7: mcrr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x40 0xfc 0x00 0x01]
+
+[0x40 0xfc 0x00 0x0e]
+# CHECK-V7: mcrr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x40 0xfc 0x00 0x0e]
+
+[0x40 0xfc 0x00 0x0f]
+# CHECK-V7: mcrr2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x40 0xfc 0x00 0x0f]
+
+[0x50 0xec 0x00 0x01]
+# CHECK-V7: mrrc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x50 0xec 0x00 0x01]
+
+[0x50 0xfc 0x00 0x0e]
+# CHECK-V7: mrrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x50 0xfc 0x00 0x0e]
+
+[0x50 0xfc 0x00 0x0f]
+# CHECK-V7: mrrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x50 0xfc 0x00 0x0f]
+
+[0x50 0xfc 0x00 0x01]
+# CHECK-V7: mrrc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x50 0xfc 0x00 0x01]
+
+[0x80 0xec 0x00 0x01]
+# CHECK-V7: stc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x80 0xec 0x00 0x01]
+
+[0x80 0xec 0x00 0x0f]
+# CHECK-V7: stc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x80 0xec 0x00 0x0f]
+
+[0x80 0xfc 0x00 0x01]
+# CHECK-V7: stc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x80 0xfc 0x00 0x01]
+
+[0x80 0xfc 0x00 0x0e]
+# CHECK-V7: stc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x80 0xfc 0x00 0x0e]
+
+[0x80 0xfc 0x00 0x0f]
+# CHECK-V7: stc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x80 0xfc 0x00 0x0f]
+
+[0x90 0xec 0x00 0x01]
+# CHECK-V7: ldc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x90 0xec 0x00 0x01]
+
+[0x90 0xec 0x00 0x0f]
+# CHECK-V7: ldc
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x90 0xec 0x00 0x0f]
+
+[0x90 0xfc 0x00 0x01]
+# CHECK-V7: ldc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x90 0xfc 0x00 0x01]
+
+[0x90 0xfc 0x00 0x0e]
+# CHECK-V7: ldc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x90 0xfc 0x00 0x0e]
+
+[0x90 0xfc 0x00 0x0f]
+# CHECK-V7: ldc2
+# CHECK: invalid instruction encoding
+# CHECK-NEXT: [0x90 0xfc 0x00 0x0f]
+
diff --git a/test/MC/Disassembler/ARM/lit.local.cfg b/test/MC/Disassembler/ARM/lit.local.cfg
index 22a76e5..8a3ba96 100644
--- a/test/MC/Disassembler/ARM/lit.local.cfg
+++ b/test/MC/Disassembler/ARM/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.txt']
-
 targets = set(config.root.targets_to_build.split())
 if not 'ARM' in targets:
     config.unsupported = True
diff --git a/test/MC/Disassembler/ARM/load-store-acquire-release-v8-thumb.txt b/test/MC/Disassembler/ARM/load-store-acquire-release-v8-thumb.txt
new file mode 100644
index 0000000..8a2ba74
--- /dev/null
+++ b/test/MC/Disassembler/ARM/load-store-acquire-release-v8-thumb.txt
@@ -0,0 +1,33 @@
+# RUN: llvm-mc -triple=thumbv8 -disassemble -show-encoding < %s | FileCheck %s
+
+0xd4 0xe8 0xcf 0x3f
+0xd5 0xe8 0xdf 0x2f
+0xd7 0xe8 0xef 0x1f
+0xd8 0xe8 0xff 0x67
+# CHECK:  ldaexb	r3, [r4]                @ encoding: [0xd4,0xe8,0xcf,0x3f]
+# CHECK:  ldaexh	r2, [r5]                @ encoding: [0xd5,0xe8,0xdf,0x2f]
+# CHECK:  ldaex	r1, [r7]                @ encoding: [0xd7,0xe8,0xef,0x1f]
+# CHECK:  ldaexd	r6, r7, [r8]            @ encoding: [0xd8,0xe8,0xff,0x67]
+
+0xc4 0xe8 0xc1 0x3f
+0xc5 0xe8 0xd4 0x2f
+0xc7 0xe8 0xe2 0x1f
+0xc8 0xe8 0xf6 0x23
+# CHECK: stlexb r1, r3, [r4]            @ encoding: [0xc4,0xe8,0xc1,0x3f]
+# CHECK: stlexh r4, r2, [r5]            @ encoding: [0xc5,0xe8,0xd4,0x2f]
+# CHECK: stlex r2, r1, [r7]            @ encoding: [0xc7,0xe8,0xe2,0x1f]
+# CHECK: stlexd r6, r2, r3, [r8]        @ encoding: [0xc8,0xe8,0xf6,0x23]
+
+0xd6 0xe8 0xaf 0x5f
+0xd6 0xe8 0x8f 0x5f
+0xd9 0xe8 0x9f 0xcf
+# CHECK: lda r5, [r6]                   @ encoding: [0xd6,0xe8,0xaf,0x5f]
+# CHECK: ldab r5, [r6]                  @ encoding: [0xd6,0xe8,0x8f,0x5f]
+# CHECK: ldah r12, [r9]                 @ encoding: [0xd9,0xe8,0x9f,0xcf]
+
+0xc0 0xe8 0xaf 0x3f
+0xc1 0xe8 0x8f 0x2f
+0xc3 0xe8 0x9f 0x2f
+# CHECK: stl r3, [r0]                   @ encoding: [0xc0,0xe8,0xaf,0x3f]
+# CHECK: stlb r2, [r1]                  @ encoding: [0xc1,0xe8,0x8f,0x2f]
+# CHECK: stlh r2, [r3]                  @ encoding: [0xc3,0xe8,0x9f,0x2f]
diff --git a/test/MC/Disassembler/ARM/load-store-acquire-release-v8.txt b/test/MC/Disassembler/ARM/load-store-acquire-release-v8.txt
new file mode 100644
index 0000000..058f9cc
--- /dev/null
+++ b/test/MC/Disassembler/ARM/load-store-acquire-release-v8.txt
@@ -0,0 +1,32 @@
+# RUN: llvm-mc -triple=armv8 -disassemble -show-encoding < %s | FileCheck %s
+0x9f 0x0e 0xd8 0xe1
+0x9f 0x1e 0xfc 0xe1
+0x9f 0x1e 0x90 0xe1
+0x9f 0x8e 0xbd 0xe1
+# CHECK: ldaexb r0, [r8]       @ encoding: [0x9f,0x0e,0xd8,0xe1]
+# CHECK: ldaexh r1, [r12]      @ encoding: [0x9f,0x1e,0xfc,0xe1]
+# CHECK: ldaex  r1, [r0]       @ encoding: [0x9f,0x1e,0x90,0xe1]
+# CHECK: ldaexd r8, r9, [sp]   @ encoding: [0x9f,0x8e,0xbd,0xe1]
+
+0x93 0x1e 0xc4 0xe1
+0x92 0x4e 0xe5 0xe1
+0x91 0x2e 0x87 0xe1
+0x92 0x6e 0xa8 0xe1
+# CHECK: stlexb r1, r3, [r4]            @ encoding: [0x93,0x1e,0xc4,0xe1]
+# CHECK: stlexh r4, r2, [r5]            @ encoding: [0x92,0x4e,0xe5,0xe1]
+# CHECK: stlex r2, r1, [r7]            @ encoding: [0x91,0x2e,0x87,0xe1]
+# CHECK: stlexd r6, r2, r3, [r8]        @ encoding: [0x92,0x6e,0xa8,0xe1]
+
+0x9f 0x5c 0x96 0xe1
+0x9f 0x5c 0xd6 0xe1
+0x9f 0xcc 0xf9 0xe1
+# CHECK: lda r5, [r6]          @ encoding: [0x9f,0x5c,0x96,0xe1]
+# CHECK: ldab r5, [r6]         @ encoding: [0x9f,0x5c,0xd6,0xe1]
+# CHECK: ldah r12, [r9]        @ encoding: [0x9f,0xcc,0xf9,0xe1]
+
+0x93 0xfc 0x80 0xe1
+0x92 0xfc 0xc1 0xe1
+0x92 0xfc 0xe3 0xe1
+# CHECK: stl r3, [r0]                   @ encoding: [0x93,0xfc,0x80,0xe1]
+# CHECK: stlb r2, [r1]                  @ encoding: [0x92,0xfc,0xc1,0xe1]
+# CHECK: stlh r2, [r3]                  @ encoding: [0x92,0xfc,0xe3,0xe1]
diff --git a/test/MC/Disassembler/ARM/neon-crypto.txt b/test/MC/Disassembler/ARM/neon-crypto.txt
new file mode 100644
index 0000000..086c781
--- /dev/null
+++ b/test/MC/Disassembler/ARM/neon-crypto.txt
@@ -0,0 +1,35 @@
+# RUN: llvm-mc -triple armv8-unknown-unknown -mattr=+neon,+crypto -disassemble < %s | FileCheck %s
+
+0x42,0x03,0xb0,0xf3
+# CHECK: aesd.8 q0, q1
+0x02,0x03,0xb0,0xf3
+# CHECK: aese.8 q0, q1
+0xc2,0x03,0xb0,0xf3
+# CHECK: aesimc.8 q0, q1
+0x82,0x03,0xb0,0xf3
+# CHECK: aesmc.8 q0, q1
+
+0xc2,0x02,0xb9,0xf3
+# CHECK: sha1h.32  q0, q1
+0x82,0x03,0xba,0xf3
+# CHECK: sha1su1.32 q0, q1
+0xc2,0x03,0xba,0xf3
+# CHECK: sha256su0.32 q0, q1
+
+0x44,0x0c,0x02,0xf2
+# CHECK: sha1c.32  q0, q1, q2
+0x44,0x0c,0x22,0xf2
+# CHECK: sha1m.32  q0, q1, q2
+0x44,0x0c,0x12,0xf2
+# CHECK: sha1p.32 q0, q1, q2
+0x44,0x0c,0x32,0xf2
+# CHECK: sha1su0.32  q0, q1, q2
+0x44,0x0c,0x02,0xf3
+# CHECK: sha256h.32  q0, q1, q2
+0x44,0x0c,0x12,0xf3
+# CHECK: sha256h2.32 q0, q1, q2
+0x44,0x0c,0x22,0xf3
+# CHECK: sha256su1.32 q0, q1, q2
+
+0xa1,0x0e,0xe0,0xf2
+# CHECK: vmull.p64  q8, d16, d17
diff --git a/test/MC/Disassembler/ARM/thumb-fp-armv8.txt b/test/MC/Disassembler/ARM/thumb-fp-armv8.txt
new file mode 100644
index 0000000..c90eed6
--- /dev/null
+++ b/test/MC/Disassembler/ARM/thumb-fp-armv8.txt
@@ -0,0 +1,163 @@
+# RUN: llvm-mc -disassemble -triple thumbv8 -mattr=+fp-armv8 -show-encoding < %s | FileCheck %s
+
+0xb2 0xee 0xe0 0x3b
+# CHECK: vcvtt.f64.f16 d3, s1
+
+0xf3 0xee 0xcc 0x2b
+# CHECK: vcvtt.f16.f64 s5, d12
+
+0xb2 0xee 0x60 0x3b
+# CHECK: vcvtb.f64.f16 d3, s1
+
+0xb3 0xee 0x41 0x2b
+# CHECK: vcvtb.f16.f64 s4, d1
+
+0xa8 0xbf # IT block
+0xb2 0xee 0xe0 0x3b
+# CHECK: vcvttge.f64.f16 d3, s1
+
+0xc8 0xbf # IT block
+0xf3 0xee 0xcc 0x2b
+# CHECK: vcvttgt.f16.f64 s5, d12
+
+0x08 0xbf # IT block
+0xb2 0xee 0x60 0x3b
+# CHECK: vcvtbeq.f64.f16 d3, s1
+
+0xb8 0xbf # IT block
+0xb3 0xee 0x41 0x2b
+# CHECK: vcvtblt.f16.f64 s4, d1
+
+
+0xbc 0xfe 0xe1 0x1a
+# CHECK: vcvta.s32.f32 s2, s3
+
+0xbc 0xfe 0xc3 0x1b
+# CHECK: vcvta.s32.f64 s2, d3
+
+0xbd 0xfe 0xeb 0x3a
+# CHECK: vcvtn.s32.f32 s6, s23
+
+0xbd 0xfe 0xe7 0x3b
+# CHECK: vcvtn.s32.f64 s6, d23
+
+0xbe 0xfe 0xc2 0x0a
+# CHECK: vcvtp.s32.f32 s0, s4
+
+0xbe 0xfe 0xc4 0x0b
+# CHECK: vcvtp.s32.f64 s0, d4
+
+0xff 0xfe 0xc4 0x8a
+# CHECK: vcvtm.s32.f32 s17, s8
+
+0xff 0xfe 0xc8 0x8b
+# CHECK: vcvtm.s32.f64 s17, d8
+
+0xbc 0xfe 0x61 0x1a
+# CHECK: vcvta.u32.f32 s2, s3
+
+0xbc 0xfe 0x43 0x1b
+# CHECK: vcvta.u32.f64 s2, d3
+
+0xbd 0xfe 0x6b 0x3a
+# CHECK: vcvtn.u32.f32 s6, s23
+
+0xbd 0xfe 0x67 0x3b
+# CHECK: vcvtn.u32.f64 s6, d23
+
+0xbe 0xfe 0x42 0x0a
+# CHECK: vcvtp.u32.f32 s0, s4
+
+0xbe 0xfe 0x44 0x0b
+# CHECK: vcvtp.u32.f64 s0, d4
+
+0xff 0xfe 0x44 0x8a
+# CHECK: vcvtm.u32.f32 s17, s8
+
+0xff 0xfe 0x48 0x8b
+# CHECK: vcvtm.u32.f64 s17, d8
+
+
+0x20 0xfe 0xab 0x2a
+# CHECK: vselge.f32 s4, s1, s23
+
+0x6f 0xfe 0xa7 0xeb
+# CHECK: vselge.f64 d30, d31, d23
+
+0x30 0xfe 0x80 0x0a
+# CHECK: vselgt.f32 s0, s1, s0
+
+0x3a 0xfe 0x24 0x5b
+# CHECK: vselgt.f64 d5, d10, d20
+
+0x0e 0xfe 0x2b 0xfa
+# CHECK: vseleq.f32 s30, s28, s23
+
+0x04 0xfe 0x08 0x2b
+# CHECK: vseleq.f64 d2, d4, d8
+
+0x58 0xfe 0x07 0xaa
+# CHECK: vselvs.f32 s21, s16, s14
+
+0x11 0xfe 0x2f 0x0b
+# CHECK: vselvs.f64 d0, d1, d31
+
+
+0xc6 0xfe 0x00 0x2a
+# CHECK: vmaxnm.f32 s5, s12, s0
+
+0x86 0xfe 0xae 0x5b
+# CHECK: vmaxnm.f64 d5, d22, d30
+
+0x80 0xfe 0x46 0x0a
+# CHECK: vminnm.f32 s0, s0, s12
+
+0x86 0xfe 0x49 0x4b
+# CHECK: vminnm.f64 d4, d6, d9
+
+
+0xa8 0xbf # IT block
+0xb6 0xee 0xcc 0x3b
+# CHECK: vrintzge.f64 d3, d12
+
+0xf6 0xee 0xcc 0x1a
+# CHECK: vrintz.f32 s3, s24
+
+0xb8 0xbf # IT block
+0xb6 0xee 0x40 0x5b
+# CHECK: vrintrlt.f64 d5, d0
+
+0xb6 0xee 0x64 0x0a
+# CHECK: vrintr.f32 s0, s9
+
+0x08 0xbf # IT block
+0xf7 0xee 0x6e 0xcb
+# CHECK: vrintxeq.f64 d28, d30
+
+0x68 0xbf # IT block
+0xb7 0xee 0x47 0x5a
+# CHECK: vrintxvs.f32 s10, s14
+
+0xb8 0xfe 0x44 0x3b
+# CHECK: vrinta.f64 d3, d4
+
+0xb8 0xfe 0x60 0x6a
+# CHECK: vrinta.f32 s12, s1
+
+0xb9 0xfe 0x44 0x3b
+# CHECK: vrintn.f64 d3, d4
+
+0xb9 0xfe 0x60 0x6a
+# CHECK: vrintn.f32 s12, s1
+
+0xba 0xfe 0x44 0x3b
+# CHECK: vrintp.f64 d3, d4
+
+0xba 0xfe 0x60 0x6a
+# CHECK: vrintp.f32 s12, s1
+
+0xbb 0xfe 0x44 0x3b
+# CHECK: vrintm.f64 d3, d4
+
+0xbb 0xfe 0x60 0x6a
+# CHECK: vrintm.f32 s12, s1
diff --git a/test/MC/Disassembler/ARM/thumb-neon-crypto.txt b/test/MC/Disassembler/ARM/thumb-neon-crypto.txt
new file mode 100644
index 0000000..c725c7f
--- /dev/null
+++ b/test/MC/Disassembler/ARM/thumb-neon-crypto.txt
@@ -0,0 +1,43 @@
+# RUN: llvm-mc -triple thumbv8-unknown-unknown -mattr=+neon,+crypto -disassemble < %s | FileCheck %s
+
+0xb0 0xff 0x42 0x03
+# CHECK: aesd.8  q0, q1
+0xb0 0xff 0x02 0x03
+# CHECK: aese.8 q0, q1
+0xb0 0xff 0xc2 0x03
+# CHECK: aesimc.8 q0, q1
+0xb0 0xff 0x82 0x03
+# CHECK: aesmc.8 q0, q1
+
+0xb9 0xff 0xc2 0x02
+# CHECK: sha1h.32  q0, q1
+0xba 0xff 0x82 0x03
+# CHECK: sha1su1.32 q0, q1
+0xba 0xff 0xc2 0x03
+# CHECK: sha256su0.32 q0, q1
+
+0x02 0xef 0x44 0x0c
+# CHECK: sha1c.32  q0, q1, q2
+0x22 0xef 0x44 0x0c
+# CHECK: sha1m.32  q0, q1, q2
+0x12 0xef 0x44 0x0c
+# CHECK: sha1p.32 q0, q1, q2
+0x32 0xef 0x44 0x0c
+# CHECK: sha1su0.32  q0, q1, q2
+0x02 0xff 0x44 0x0c
+# CHECK: sha256h.32  q0, q1, q2
+0x12 0xff 0x44 0x0c
+# CHECK: sha256h2.32 q0, q1, q2
+0x22 0xff 0x44 0x0c
+# CHECK: sha256su1.32 q0, q1, q2
+
+0xe0 0xef 0xa1 0x0e
+# CHECK: vmull.p64  q8, d16, d17
+
+# This used to be incorrectly decoded into an sha256h.32 [0x00,0xff,0x40,0x0c]
+# The other similar encodings are stc2 [0x00,0xfd,0x40,0x0c] and cdp2 [0x00,0xfe,0x40,0x0c]
+0x00 0xfc 0x40 0x0c
+# CHECK-NOT: sha256h.32
+# CHECK-NOT: stc2
+# CHECK-NOT: cdp2
+
diff --git a/test/MC/Disassembler/ARM/thumb-tests.txt b/test/MC/Disassembler/ARM/thumb-tests.txt
index 84dd075..df2bac1 100644
--- a/test/MC/Disassembler/ARM/thumb-tests.txt
+++ b/test/MC/Disassembler/ARM/thumb-tests.txt
@@ -125,7 +125,7 @@
 # CHECK: cps  #15
 0xaf 0xf3 0x0f 0x81
 
-# CHECK: cpsie.w  if, #10
+# CHECK: cpsie  if, #10
 0xaf 0xf3 0x6a 0x85
 
 # CHECK: cpsie aif
diff --git a/test/MC/Disassembler/ARM/thumb-v8.txt b/test/MC/Disassembler/ARM/thumb-v8.txt
new file mode 100644
index 0000000..eb5ffea
--- /dev/null
+++ b/test/MC/Disassembler/ARM/thumb-v8.txt
@@ -0,0 +1,28 @@
+# RUN: llvm-mc -disassemble -triple thumbv8 -mattr=+db -show-encoding < %s | FileCheck %s
+
+0x80 0xba
+# CHECK: hlt #0
+
+0xbf 0xba
+# CHECK: hlt #63
+
+# DCPS{1,2,3}
+
+0x8f 0xf7 0x01 0x80 
+# CHECK: dcps1
+
+0x8f 0xf7 0x02 0x80 
+# CHECK: dcps2
+
+0x8f 0xf7 0x03 0x80 
+# CHECK: dcps3
+
+0xbf 0xf3 0x59 0x8f
+0xbf 0xf3 0x51 0x8f
+0xbf 0xf3 0x55 0x8f
+0xbf 0xf3 0x5d 0x8f
+
+# CHECK: dmb ishld
+# CHECK: dmb oshld
+# CHECK: dmb nshld
+# CHECK: dmb ld
diff --git a/test/MC/Disassembler/ARM/thumb-v8fp.txt b/test/MC/Disassembler/ARM/thumb-v8fp.txt
deleted file mode 100644
index 3457192..0000000
--- a/test/MC/Disassembler/ARM/thumb-v8fp.txt
+++ /dev/null
@@ -1,163 +0,0 @@
-# RUN: llvm-mc -disassemble -triple thumbv8 -mattr=+v8fp -show-encoding < %s | FileCheck %s
-
-0xb2 0xee 0xe0 0x3b
-# CHECK: vcvtt.f64.f16 d3, s1
-
-0xf3 0xee 0xcc 0x2b
-# CHECK: vcvtt.f16.f64 s5, d12
-
-0xb2 0xee 0x60 0x3b
-# CHECK: vcvtb.f64.f16 d3, s1
-
-0xb3 0xee 0x41 0x2b
-# CHECK: vcvtb.f16.f64 s4, d1
-
-0xa8 0xbf # IT block
-0xb2 0xee 0xe0 0x3b
-# CHECK: vcvttge.f64.f16 d3, s1
-
-0xc8 0xbf # IT block
-0xf3 0xee 0xcc 0x2b
-# CHECK: vcvttgt.f16.f64 s5, d12
-
-0x08 0xbf # IT block
-0xb2 0xee 0x60 0x3b
-# CHECK: vcvtbeq.f64.f16 d3, s1
-
-0xb8 0xbf # IT block
-0xb3 0xee 0x41 0x2b
-# CHECK: vcvtblt.f16.f64 s4, d1
-
-
-0xbc 0xfe 0xe1 0x1a
-# CHECK: vcvta.s32.f32 s2, s3
-
-0xbc 0xfe 0xc3 0x1b
-# CHECK: vcvta.s32.f64 s2, d3
-
-0xbd 0xfe 0xeb 0x3a
-# CHECK: vcvtn.s32.f32 s6, s23
-
-0xbd 0xfe 0xe7 0x3b
-# CHECK: vcvtn.s32.f64 s6, d23
-
-0xbe 0xfe 0xc2 0x0a
-# CHECK: vcvtp.s32.f32 s0, s4
-
-0xbe 0xfe 0xc4 0x0b
-# CHECK: vcvtp.s32.f64 s0, d4
-
-0xff 0xfe 0xc4 0x8a
-# CHECK: vcvtm.s32.f32 s17, s8
-
-0xff 0xfe 0xc8 0x8b
-# CHECK: vcvtm.s32.f64 s17, d8
-
-0xbc 0xfe 0x61 0x1a
-# CHECK: vcvta.u32.f32 s2, s3
-
-0xbc 0xfe 0x43 0x1b
-# CHECK: vcvta.u32.f64 s2, d3
-
-0xbd 0xfe 0x6b 0x3a
-# CHECK: vcvtn.u32.f32 s6, s23
-
-0xbd 0xfe 0x67 0x3b
-# CHECK: vcvtn.u32.f64 s6, d23
-
-0xbe 0xfe 0x42 0x0a
-# CHECK: vcvtp.u32.f32 s0, s4
-
-0xbe 0xfe 0x44 0x0b
-# CHECK: vcvtp.u32.f64 s0, d4
-
-0xff 0xfe 0x44 0x8a
-# CHECK: vcvtm.u32.f32 s17, s8
-
-0xff 0xfe 0x48 0x8b
-# CHECK: vcvtm.u32.f64 s17, d8
-
-
-0x20 0xfe 0xab 0x2a
-# CHECK: vselge.f32 s4, s1, s23
-
-0x6f 0xfe 0xa7 0xeb
-# CHECK: vselge.f64 d30, d31, d23
-
-0x30 0xfe 0x80 0x0a
-# CHECK: vselgt.f32 s0, s1, s0
-
-0x3a 0xfe 0x24 0x5b
-# CHECK: vselgt.f64 d5, d10, d20
-
-0x0e 0xfe 0x2b 0xfa
-# CHECK: vseleq.f32 s30, s28, s23
-
-0x04 0xfe 0x08 0x2b
-# CHECK: vseleq.f64 d2, d4, d8
-
-0x58 0xfe 0x07 0xaa
-# CHECK: vselvs.f32 s21, s16, s14
-
-0x11 0xfe 0x2f 0x0b
-# CHECK: vselvs.f64 d0, d1, d31
-
-
-0xc6 0xfe 0x00 0x2a
-# CHECK: vmaxnm.f32 s5, s12, s0
-
-0x86 0xfe 0xae 0x5b
-# CHECK: vmaxnm.f64 d5, d22, d30
-
-0x80 0xfe 0x46 0x0a
-# CHECK: vminnm.f32 s0, s0, s12
-
-0x86 0xfe 0x49 0x4b
-# CHECK: vminnm.f64 d4, d6, d9
-
-
-0xa8 0xbf # IT block
-0xb6 0xee 0xcc 0x3b
-# CHECK: vrintzge.f64 d3, d12
-
-0xf6 0xee 0xcc 0x1a
-# CHECK: vrintz.f32 s3, s24
-
-0xb8 0xbf # IT block
-0xb6 0xee 0x40 0x5b
-# CHECK: vrintrlt.f64 d5, d0
-
-0xb6 0xee 0x64 0x0a
-# CHECK: vrintr.f32 s0, s9
-
-0x08 0xbf # IT block
-0xf7 0xee 0x6e 0xcb
-# CHECK: vrintxeq.f64 d28, d30
-
-0x68 0xbf # IT block
-0xb7 0xee 0x47 0x5a
-# CHECK: vrintxvs.f32 s10, s14
-
-0xb8 0xfe 0x44 0x3b
-# CHECK: vrinta.f64 d3, d4
-
-0xb8 0xfe 0x60 0x6a
-# CHECK: vrinta.f32 s12, s1
-
-0xb9 0xfe 0x44 0x3b
-# CHECK: vrintn.f64 d3, d4
-
-0xb9 0xfe 0x60 0x6a
-# CHECK: vrintn.f32 s12, s1
-
-0xba 0xfe 0x44 0x3b
-# CHECK: vrintp.f64 d3, d4
-
-0xba 0xfe 0x60 0x6a
-# CHECK: vrintp.f32 s12, s1
-
-0xbb 0xfe 0x44 0x3b
-# CHECK: vrintm.f64 d3, d4
-
-0xbb 0xfe 0x60 0x6a
-# CHECK: vrintm.f32 s12, s1
diff --git a/test/MC/Disassembler/ARM/thumb2-v8.txt b/test/MC/Disassembler/ARM/thumb2-v8.txt
new file mode 100644
index 0000000..1b2f095
--- /dev/null
+++ b/test/MC/Disassembler/ARM/thumb2-v8.txt
@@ -0,0 +1,40 @@
+# RUN: llvm-mc -triple=thumbv8 -disassemble < %s | FileCheck %s
+# CHECK: sevl
+# CHECK: sevl.w
+0x50 0xbf
+0xaf 0xf3 0x05 0x80
+
+
+# These are the only coprocessor instructions that remain defined in ARMv8
+# (The operations on p10/p11 disassemble into FP/NEON instructions)
+
+0x00 0xee 0x10 0x0e
+# CHECK: mcr p14
+
+0x00 0xee 0x10 0x0f
+# CHECK: mcr p15
+
+0x10 0xee 0x10 0x0e
+# CHECK: mrc p14
+
+0x10 0xee 0x10 0x0f
+# CHECK: mrc p15
+
+0x40 0xec 0x00 0x0e
+# CHECK: mcrr p14
+
+0x40 0xec 0x00 0x0f
+# CHECK: mcrr p15
+
+0x50 0xec 0x00 0x0e
+# CHECK: mrrc p14
+
+0x50 0xec 0x00 0x0f
+# CHECK: mrrc p15
+
+0x80 0xec 0x00 0x0e
+# CHECK: stc p14
+
+0x90 0xec 0x00 0x0e
+# CHECK: ldc p14
+
diff --git a/test/MC/Disassembler/ARM/thumb2.txt b/test/MC/Disassembler/ARM/thumb2.txt
index 9fc166f..c8b4080 100644
--- a/test/MC/Disassembler/ARM/thumb2.txt
+++ b/test/MC/Disassembler/ARM/thumb2.txt
@@ -2707,3 +2707,14 @@
 0x30 0xbf
 0x10 0xbf
 
+#------------------------------------------------------------------------------
+# Unallocated hints (They execute as NOPs, but software must not use them.)
+#------------------------------------------------------------------------------
+# CHECK: hint #6
+# CHECK: hint.w #6
+# CHECK: hint.w #102
+
+0x60 0xbf
+0xaf 0xf3 0x06 0x80
+0xaf 0xf3 0x66 0x80
+
diff --git a/test/MC/Disassembler/ARM/v8fp.txt b/test/MC/Disassembler/ARM/v8fp.txt
deleted file mode 100644
index a6e88b6..0000000
--- a/test/MC/Disassembler/ARM/v8fp.txt
+++ /dev/null
@@ -1,155 +0,0 @@
-# RUN: llvm-mc -disassemble -triple armv8 -mattr=+v8fp -show-encoding < %s | FileCheck %s
-
-0xe0 0x3b 0xb2 0xee
-# CHECK: vcvtt.f64.f16 d3, s1
-
-0xcc 0x2b 0xf3 0xee
-# CHECK: vcvtt.f16.f64 s5, d12
-
-0x60 0x3b 0xb2 0xee
-# CHECK: vcvtb.f64.f16 d3, s1
-
-0x41 0x2b 0xb3 0xee
-# CHECK: vcvtb.f16.f64 s4, d1
-
-0xe0 0x3b 0xb2 0xae
-# CHECK: vcvttge.f64.f16 d3, s1
-
-0xcc 0x2b 0xf3 0xce
-# CHECK: vcvttgt.f16.f64 s5, d12
-
-0x60 0x3b 0xb2 0x0e
-# CHECK: vcvtbeq.f64.f16 d3, s1
-
-0x41 0x2b 0xb3 0xbe
-# CHECK: vcvtblt.f16.f64 s4, d1
-
-
-0xe1 0x1a 0xbc 0xfe
-# CHECK: vcvta.s32.f32 s2, s3
-
-0xc3 0x1b 0xbc 0xfe
-# CHECK: vcvta.s32.f64 s2, d3
-
-0xeb 0x3a 0xbd 0xfe
-# CHECK: vcvtn.s32.f32 s6, s23
-
-0xe7 0x3b 0xbd 0xfe
-# CHECK: vcvtn.s32.f64 s6, d23
-
-0xc2 0x0a 0xbe 0xfe
-# CHECK: vcvtp.s32.f32 s0, s4
-
-0xc4 0x0b 0xbe 0xfe
-# CHECK: vcvtp.s32.f64 s0, d4
-
-0xc4 0x8a 0xff 0xfe
-# CHECK: vcvtm.s32.f32 s17, s8
-
-0xc8 0x8b 0xff 0xfe
-# CHECK: vcvtm.s32.f64 s17, d8
-
-0x61 0x1a 0xbc 0xfe
-# CHECK: vcvta.u32.f32 s2, s3
-
-0x43 0x1b 0xbc 0xfe
-# CHECK: vcvta.u32.f64 s2, d3
-
-0x6b 0x3a 0xbd 0xfe
-# CHECK: vcvtn.u32.f32 s6, s23
-
-0x67 0x3b 0xbd 0xfe
-# CHECK: vcvtn.u32.f64 s6, d23
-
-0x42 0x0a 0xbe 0xfe
-# CHECK: vcvtp.u32.f32 s0, s4
-
-0x44 0x0b 0xbe 0xfe
-# CHECK: vcvtp.u32.f64 s0, d4
-
-0x44 0x8a 0xff 0xfe
-# CHECK: vcvtm.u32.f32 s17, s8
-
-0x48 0x8b 0xff 0xfe
-# CHECK: vcvtm.u32.f64 s17, d8
-
-
-0xab 0x2a 0x20 0xfe
-# CHECK: vselge.f32 s4, s1, s23
-
-0xa7 0xeb 0x6f 0xfe
-# CHECK: vselge.f64 d30, d31, d23
-
-0x80 0x0a 0x30 0xfe
-# CHECK: vselgt.f32 s0, s1, s0
-
-0x24 0x5b 0x3a 0xfe
-# CHECK: vselgt.f64 d5, d10, d20
-
-0x2b 0xfa 0x0e 0xfe
-# CHECK: vseleq.f32 s30, s28, s23
-
-0x08 0x2b 0x04 0xfe
-# CHECK: vseleq.f64 d2, d4, d8
-
-0x07 0xaa 0x58 0xfe
-# CHECK: vselvs.f32 s21, s16, s14
-
-0x2f 0x0b 0x11 0xfe
-# CHECK: vselvs.f64 d0, d1, d31
-
-
-0x00 0x2a 0xc6 0xfe
-# CHECK: vmaxnm.f32 s5, s12, s0
-
-0xae 0x5b 0x86 0xfe
-# CHECK: vmaxnm.f64 d5, d22, d30
-
-0x46 0x0a 0x80 0xfe
-# CHECK: vminnm.f32 s0, s0, s12
-
-0x49 0x4b 0x86 0xfe
-# CHECK: vminnm.f64 d4, d6, d9
-
-
-0xcc 0x3b 0xb6 0xae
-# CHECK: vrintzge.f64 d3, d12
-
-0xcc 0x1a 0xf6 0xee
-# CHECK: vrintz.f32 s3, s24
-
-0x40 0x5b 0xb6 0xbe
-# CHECK: vrintrlt.f64 d5, d0
-
-0x64 0x0a 0xb6 0xee
-# CHECK: vrintr.f32 s0, s9
-
-0x6e 0xcb 0xf7 0x0e
-# CHECK: vrintxeq.f64 d28, d30
-
-0x47 0x5a 0xb7 0x6e
-# CHECK: vrintxvs.f32 s10, s14
-
-0x44 0x3b 0xb8 0xfe
-# CHECK: vrinta.f64 d3, d4
-
-0x60 0x6a 0xb8 0xfe
-# CHECK: vrinta.f32 s12, s1
-
-0x44 0x3b 0xb9 0xfe
-# CHECK: vrintn.f64 d3, d4
-
-0x60 0x6a 0xb9 0xfe
-# CHECK: vrintn.f32 s12, s1
-
-0x44 0x3b 0xba 0xfe
-# CHECK: vrintp.f64 d3, d4
-
-0x60 0x6a 0xba 0xfe
-# CHECK: vrintp.f32 s12, s1
-
-0x44 0x3b 0xbb 0xfe
-# CHECK: vrintm.f64 d3, d4
-
-0x60 0x6a 0xbb 0xfe
-# CHECK: vrintm.f32 s12, s1
diff --git a/test/MC/Disassembler/Mips/lit.local.cfg b/test/MC/Disassembler/Mips/lit.local.cfg
index 9b698b2..1fa54b4 100644
--- a/test/MC/Disassembler/Mips/lit.local.cfg
+++ b/test/MC/Disassembler/Mips/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.txt']
-
 targets = set(config.root.targets_to_build.split())
 if not 'Mips' in targets:
     config.unsupported = True
diff --git a/test/MC/Disassembler/Mips/micromips.txt b/test/MC/Disassembler/Mips/micromips.txt
new file mode 100644
index 0000000..b2d0cc0
--- /dev/null
+++ b/test/MC/Disassembler/Mips/micromips.txt
@@ -0,0 +1,287 @@
+# RUN: llvm-mc --disassemble %s -triple=mips-unknown-linux -mattr=micromips \
+# RUN: | FileCheck %s
+
+# CHECK: add $9, $6, $7
+0x00 0xe6 0x49 0x10
+
+# CHECK: addi $9, $6, 17767
+0x11 0x26 0x45 0x67
+
+# CHECK: addiu $9, $6, -15001
+0x31 0x26 0xc5 0x67
+
+# CHECK: addi $9, $6, 17767
+0x11 0x26 0x45 0x67
+
+# CHECK: addiu $9, $6, -15001
+0x31 0x26 0xc5 0x67
+
+# CHECK: addu $9, $6, $7
+0x00 0xe6 0x49 0x50
+
+# CHECK: sub $9, $6, $7
+0x00 0xe6 0x49 0x90
+
+# CHECK: subu $4, $3, $5
+0x00 0xa3 0x21 0xd0
+
+# CHECK: sub $6, $zero, $7
+0x00 0xe0 0x31 0x90
+
+# CHECK: subu $6, $zero, $7
+0x00 0xe0 0x31 0xd0
+
+# CHECK: addu $7, $8, $zero
+0x00 0x08 0x39 0x50
+
+# CHECK: slt $3, $3, $5
+0x00 0xa3 0x1b 0x50
+
+# CHECK: slti $3, $3, 103
+0x90 0x63 0x00 0x67
+
+# CHECK: slti $3, $3, 103
+0x90 0x63 0x00 0x67
+
+# CHECK: sltiu $3, $3, 103
+0xb0 0x63 0x00 0x67
+
+# CHECK: sltu $3, $3, $5
+0x00 0xa3 0x1b 0x90
+
+# CHECK: lui $9, 17767
+0x41 0xa9 0x45 0x67
+
+# CHECK: and $9, $6, $7
+0x00 0xe6 0x4a 0x50
+
+# CHECK: andi $9, $6, 17767
+0xd1 0x26 0x45 0x67
+
+# CHECK: andi $9, $6, 17767
+0xd1 0x26 0x45 0x67
+
+# CHECK: or $3, $4, $5
+0x00 0xa4 0x1a 0x90
+
+# CHECK: ori $9, $6, 17767
+0x51 0x26 0x45 0x67
+
+# CHECK: xor $3, $3, $5
+0x00 0xa3 0x1b 0x10
+
+# CHECK: xori $9, $6, 17767
+0x71 0x26 0x45 0x67
+
+# CHECK: xori $9, $6, 17767
+0x71 0x26 0x45 0x67
+
+# CHECK: nor $9, $6, $7
+0x00 0xe6 0x4a 0xd0
+
+# CHECK: not $7, $8
+0x00 0x08 0x3a 0xd0
+
+# CHECK: mul $9, $6, $7
+0x00 0xe6 0x4a 0x10
+
+# CHECK: mult $9, $7
+0x00 0xe9 0x8b 0x3c
+
+# CHECK: multu $9, $7
+0x00 0xe9 0x9b 0x3c
+
+# CHECK-EB: div $zero, $9, $7
+0x00 0xe9 0xab 0x3c
+
+# CHECK-EB: divu $zero, $9, $7
+0x00 0xe9 0xbb 0x3c
+
+# CHECK: sll $4, $3, 7
+0x00 0x83 0x38 0x00
+
+# CHECK: sllv $2, $3, $5
+0x00 0x65 0x10 0x10
+
+# CHECK: sra $4, $3, 7
+0x00 0x83 0x38 0x80
+
+# CHECK: srav $2, $3, $5
+0x00 0x65 0x10 0x90
+
+# CHECK: srl $4, $3, 7
+0x00 0x83 0x38 0x40
+
+# CHECK: srlv $2, $3, $5
+0x00 0x65 0x10 0x50
+
+# CHECK: rotr $9, $6, 7
+0x01 0x26 0x38 0xc0
+
+# CHECK: rotrv $9, $6, $7
+0x00 0xc7 0x48 0xd0
+
+# CHECK: lb $5, 8($4)
+0x1c 0xa4 0x00 0x08
+
+# CHECK: lbu $6, 8($4)
+0x14 0xc4 0x00 0x08
+
+# CHECK: lh $2, 8($4)
+0x3c 0x44 0x00 0x08
+
+# CHECK: lhu $4, 8($2)
+0x34 0x82 0x00 0x08
+
+# CHECK: lw  $6, 4($5)
+0xfc 0xc5 0x00 0x04
+
+# CHECK: sb $5, 8($4)
+0x18 0xa4 0x00 0x08
+
+# CHECK: sh  $2, 8($4)
+0x38 0x44 0x00 0x08
+
+# CHECK: sw  $5, 4($6)
+0xf8 0xa6 0x00 0x04
+
+# CHECK: lwl $4, 16($5)
+0x60 0x85 0x00 0x10
+
+# CHECK: lwr $4, 16($5)
+0x60 0x85 0x10 0x10
+
+# CHECK: swl $4, 16($5)
+0x60 0x85 0x80 0x10
+
+# CHECK: swr $4, 16($5)
+0x60 0x85 0x90 0x10
+
+# CHECK: movz $9, $6, $7
+0x00 0xe6 0x48 0x58
+
+# CHECK: movn $9, $6, $7
+0x00 0xe6 0x48 0x18
+
+# CHECK: movt $9, $6, $fcc0
+0x55 0x26 0x09 0x7b
+
+# CHECK: movf $9, $6, $fcc0
+0x55 0x26 0x01 0x7b
+
+# CHECK: mthi   $6
+0x00 0x06 0x2d 0x7c
+
+# CHECK: mfhi   $6
+0x00 0x06 0x0d 0x7c
+
+# CHECK: mtlo   $6
+0x00 0x06 0x3d 0x7c
+
+# CHECK: mflo   $6
+0x00 0x06 0x1d 0x7c
+
+# CHECK: madd   $4, $5
+0x00 0xa4 0xcb 0x3c
+
+# CHECK: maddu  $4, $5
+0x00 0xa4 0xdb 0x3c
+
+# CHECK: msub   $4, $5
+0x00 0xa4 0xeb 0x3c
+
+# CHECK: msubu  $4, $5
+0x00 0xa4 0xfb 0x3c
+
+# CHECK: clz $9, $6
+0x01 0x26 0x5b 0x3c
+
+# CHECK: clo $9, $6
+0x01 0x26 0x4b 0x3c
+
+# CHECK: seb $9, $6
+0x01 0x26 0x2b 0x3c
+
+# CHECK: seh $9, $6
+0x01 0x26 0x3b 0x3c
+
+# CHECK: wsbh $9, $6
+0x01 0x26 0x7b 0x3c
+
+# CHECK: ext $9, $6, 3, 7
+0x01 0x26 0x30 0xec
+
+# CHECK: ins $9, $6, 3, 7
+0x01 0x26 0x48 0xcc
+
+# CHECK: j 1328
+0xd4 0x00 0x02 0x98
+
+# CHECK: jal 1328
+0xf4 0x00 0x02 0x98
+
+# CHECK: jalr $ra, $6
+0x03 0xe6 0x0f 0x3c
+
+# CHECK: jr $7
+0x00 0x07 0x0f 0x3c
+
+# CHECK: beq $9, $6, 1332
+0x94 0xc9 0x02 0x9a
+
+# CHECK: bgez $6, 1332
+0x40 0x46 0x02 0x9a
+
+# CHECK: bgezal $6, 1332
+0x40 0x66 0x02 0x9a
+
+# CHECK: bltzal $6, 1332
+0x40 0x26 0x02 0x9a
+
+# CHECK: bgtz $6, 1332
+0x40 0xc6 0x02 0x9a
+
+# CHECK: blez $6, 1332
+0x40 0x86 0x02 0x9a
+
+# CHECK: bne $9, $6, 1332
+0xb4 0xc9 0x02 0x9a
+
+# CHECK: bltz $6, 1332
+0x40 0x06 0x02 0x9a
+
+# CHECK: teq $8, $9, 0
+0x01 0x28 0x00 0x3c
+
+# CHECK: tge $8, $9, 0
+0x01 0x28 0x02 0x3c
+
+# CHECK: tgeu $8, $9, 0
+0x01 0x28 0x04 0x3c
+
+# CHECK: tlt $8, $9, 0
+0x01 0x28 0x08 0x3c
+
+# CHECK: tltu $8, $9, 0
+0x01 0x28 0x0a 0x3c
+
+# CHECK: tne $8, $9, 0
+0x01 0x28 0x0c 0x3c
+
+# CHECK: teqi $9, 17767
+0x41,0xc9,0x45,0x67
+
+# CHECK: tgei $9, 17767
+0x41 0x29 0x45 0x67
+
+# CHECK: tgeiu $9, 17767
+0x41 0x69 0x45 0x67
+
+# CHECK: tlti $9, 17767
+0x41 0x09 0x45 0x67
+
+# CHECK: tltiu $9, 17767
+0x41 0x49 0x45 0x67
+
+# CHECK: tnei $9, 17767
+0x41 0x89 0x45 0x67
diff --git a/test/MC/Disassembler/Mips/micromips_le.txt b/test/MC/Disassembler/Mips/micromips_le.txt
new file mode 100644
index 0000000..5b2fe30
--- /dev/null
+++ b/test/MC/Disassembler/Mips/micromips_le.txt
@@ -0,0 +1,287 @@
+# RUN: llvm-mc --disassemble %s -triple=mipsel-unknown-linux -mattr=micromips \
+# RUN: | FileCheck %s
+
+# CHECK: add $9, $6, $7
+0xe6 0x00 0x10 0x49
+
+# CHECK: addi $9, $6, 17767
+0x26 0x11 0x67 0x45
+
+# CHECK: addiu $9, $6, -15001
+0x26 0x31 0x67 0xc5
+
+# CHECK: addi $9, $6, 17767
+0x26 0x11 0x67 0x45
+
+# CHECK: addiu $9, $6, -15001
+0x26 0x31 0x67 0xc5
+
+# CHECK: addu $9, $6, $7
+0xe6 0x00 0x50 0x49
+
+# CHECK: sub $9, $6, $7
+0xe6 0x00 0x90 0x49
+
+# CHECK: subu  $4, $3, $5
+0xa3 0x00 0xd0 0x21
+
+# CHECK: sub $6, $zero, $7
+0xe0 0x00 0x90 0x31
+
+# CHECK: subu $6, $zero, $7
+0xe0 0x00 0xd0 0x31
+
+# CHECK: addu $7, $8, $zero
+0x08 0x00 0x50 0x39
+
+# CHECK: slt $3, $3, $5
+0xa3 0x00 0x50 0x1b
+
+# CHECK: slti $3, $3, 103
+0x63 0x90 0x67 0x00
+
+# CHECK: slti $3, $3, 103
+0x63 0x90 0x67 0x00
+
+# CHECK: sltiu $3, $3, 103
+0x63 0xb0 0x67 0x00
+
+# CHECK: sltu $3, $3, $5
+0xa3 0x00 0x90 0x1b
+
+# CHECK: lui $9, 17767
+0xa9 0x41 0x67 0x45
+
+# CHECK: and $9, $6, $7
+0xe6 0x00 0x50 0x4a
+
+# CHECK: andi $9, $6, 17767
+0x26 0xd1 0x67 0x45
+
+# CHECK: andi $9, $6, 17767
+0x26 0xd1 0x67 0x45
+
+# CHECK: or $3, $4, $5
+0xa4 0x00 0x90 0x1a
+
+# CHECK: ori $9, $6, 17767
+0x26 0x51 0x67 0x45
+
+# CHECK: xor $3, $3, $5
+0xa3 0x00 0x10 0x1b
+
+# CHECK: xori $9, $6, 17767
+0x26 0x71 0x67 0x45
+
+# CHECK: xori $9, $6, 17767
+0x26 0x71 0x67 0x45
+
+# CHECK: nor $9, $6, $7
+0xe6 0x00 0xd0 0x4a
+
+# CHECK: not $7, $8
+0x08 0x00 0xd0 0x3a
+
+# CHECK: mul $9, $6, $7
+0xe6 0x00 0x10 0x4a
+
+# CHECK: mult $9, $7
+0xe9 0x00 0x3c 0x8b
+
+# CHECK: multu $9, $7
+0xe9 0x00 0x3c 0x9b
+
+# CHECK: div $zero, $9, $7
+0xe9 0x00 0x3c 0xab
+
+# CHECK: divu $zero, $9, $7
+0xe9 0x00 0x3c 0xbb
+
+# CHECK: sll $4, $3, 7
+0x83 0x00 0x00 0x38
+
+# CHECK: sllv $2, $3, $5
+0x65 0x00 0x10 0x10
+
+# CHECK: sra $4, $3, 7
+0x83 0x00 0x80 0x38
+
+# CHECK: srav $2, $3, $5
+0x65 0x00 0x90 0x10
+
+# CHECK: srl $4, $3, 7
+0x83 0x00 0x40 0x38
+
+# CHECK: srlv $2, $3, $5
+0x65 0x00 0x50 0x10
+
+# CHECK: rotr $9, $6, 7
+0x26 0x01 0xc0 0x38
+
+# CHECK: rotrv $9, $6, $7
+0xc7 0x00 0xd0 0x48
+
+# CHECK: lb $5, 8($4)
+0xa4 0x1c 0x08 0x00
+
+# CHECK: lbu $6, 8($4)
+0xc4 0x14 0x08 0x00
+
+# CHECK: lh $2, 8($4)
+0x44 0x3c 0x08 0x00
+
+# CHECK: lhu $4, 8($2)
+0x82 0x34 0x08 0x00
+
+# CHECK: lw $6, 4($5)
+0xc5 0xfc 0x04 0x00
+
+# CHECK: sb $5, 8($4)
+0xa4 0x18 0x08 0x00
+
+# CHECK: sh $2, 8($4)
+0x44 0x38 0x08 0x00
+
+# CHECK: sw $5, 4($6)
+0xa6 0xf8 0x04 0x00
+
+# CHECK: lwl $4, 16($5)
+0x85 0x60 0x10 0x00
+
+# CHECK: lwr $4, 16($5)
+0x85 0x60 0x10 0x10
+
+# CHECK: swl $4, 16($5)
+0x85 0x60 0x10 0x80
+
+# CHECK: swr $4, 16($5)
+0x85 0x60 0x10 0x90
+
+# CHECK: movz $9, $6, $7
+0xe6 0x00 0x58 0x48
+
+# CHECK: movn $9, $6, $7
+0xe6 0x00 0x18 0x48
+
+# CHECK: movt $9, $6, $fcc0
+0x26 0x55 0x7b 0x09
+
+# CHECK: movf $9, $6, $fcc0
+0x26 0x55 0x7b 0x01
+
+# CHECK: mthi $6
+0x06 0x00 0x7c 0x2d
+
+# CHECK: mfhi $6
+0x06 0x00 0x7c 0x0d
+
+# CHECK: mtlo $6
+0x06 0x00 0x7c 0x3d
+
+# CHECK: mflo $6
+0x06 0x00 0x7c 0x1d
+
+# CHECK: madd $4, $5
+0xa4 0x00 0x3c 0xcb
+
+# CHECK: maddu $4, $5
+0xa4 0x00 0x3c 0xdb
+
+# CHECK: msub $4, $5
+0xa4 0x00 0x3c 0xeb
+
+# CHECK: msubu $4, $5
+0xa4 0x00 0x3c 0xfb
+
+# CHECK: clz $9, $6
+0x26 0x01 0x3c 0x5b
+
+# CHECK: clo $9, $6
+0x26 0x01 0x3c 0x4b
+
+# CHECK: seb $9, $6
+0x26 0x01 0x3c 0x2b
+
+# CHECK: seh $9, $6
+0x26 0x01 0x3c 0x3b
+
+# CHECK: wsbh $9, $6
+0x26 0x01 0x3c 0x7b
+
+# CHECK: ext $9, $6, 3, 7
+0x26 0x01 0xec 0x30
+
+# CHECK: ins $9, $6, 3, 7
+0x26 0x01 0xcc 0x48
+
+# CHECK: j 1328
+0x00 0xd4 0x98 0x02
+
+# CHECK: jal 1328
+0x00 0xf4 0x98 0x02
+
+# CHECK: jalr $ra, $6
+0xe6 0x03 0x3c 0x0f
+
+# CHECK: jr $7
+0x07 0x00 0x3c 0x0f
+
+# CHECK: beq $9, $6, 1332
+0xc9 0x94 0x9a 0x02
+
+# CHECK: bgez $6, 1332
+0x46 0x40 0x9a 0x02
+
+# CHECK: bgezal $6, 1332
+0x66 0x40 0x9a 0x02
+
+# CHECK: bltzal $6, 1332
+0x26 0x40 0x9a 0x02
+
+# CHECK: bgtz $6, 1332
+0xc6 0x40 0x9a 0x02
+
+# CHECK: blez $6, 1332
+0x86 0x40 0x9a 0x02
+
+# CHECK: bne $9, $6, 1332
+0xc9 0xb4 0x9a 0x02
+
+# CHECK: bltz $6, 1332
+0x06 0x40 0x9a 0x02
+
+# CHECK: teq $8, $9, 0
+0x28 0x01 0x3c 0x00
+
+# CHECK: tge $8, $9, 0
+0x28 0x01 0x3c 0x02
+
+# CHECK: tgeu $8, $9, 0
+0x28 0x01 0x3c 0x04
+
+# CHECK: tlt $8, $9, 0
+0x28 0x01 0x3c 0x08
+
+# CHECK: tltu $8, $9, 0
+0x28 0x01 0x3c 0x0a
+
+# CHECK: tne $8, $9, 0
+0x28 0x01 0x3c 0x0c
+
+# CHECK: teqi $9, 17767
+0xc9 0x41 0x67 0x45
+
+# CHECK: tgei $9, 17767
+0x29 0x41 0x67 0x45
+
+# CHECK: tgeiu $9, 17767
+0x69 0x41 0x67 0x45
+
+# CHECK: tlti $9, 17767
+0x09 0x41 0x67 0x45
+
+# CHECK: tltiu $9, 17767
+0x49 0x41 0x67 0x45
+
+# CHECK: tnei $9, 17767
+0x89 0x41 0x67 0x45
diff --git a/test/MC/Disassembler/Mips/mips-dsp.txt b/test/MC/Disassembler/Mips/mips-dsp.txt
index d10e62c..3f60ae1 100644
--- a/test/MC/Disassembler/Mips/mips-dsp.txt
+++ b/test/MC/Disassembler/Mips/mips-dsp.txt
@@ -11,3 +11,12 @@
 
 # CHECK: mtlo $21, $ac3
 0x13 0x18 0xa0 0x02
+
+# CHECK: lbux $10, $20($26)
+0x8a 0x51 0x54 0x7f
+
+# CHECK: lhx  $11, $21($27)
+0x0a 0x59 0x75 0x7f
+
+# CHECK: lwx  $12, $22($gp)
+0x0a 0x60 0x96 0x7f
diff --git a/test/MC/Disassembler/Mips/mips32r2.txt b/test/MC/Disassembler/Mips/mips32r2.txt
index 48b6ad4..11d9058 100644
--- a/test/MC/Disassembler/Mips/mips32r2.txt
+++ b/test/MC/Disassembler/Mips/mips32r2.txt
@@ -242,6 +242,9 @@
 # CHECK: lui  $6, 17767
 0x3c 0x06 0x45 0x67
 
+# CHECK: luxc1 $f0, $6($5)
+0x4c 0xa6 0x00 0x05
+
 # CHECK: lw  $4, 24($5)
 0x8c 0xa4 0x00 0x18
 
@@ -254,6 +257,9 @@
 # CHECK: lwr   $3, 16($5)
 0x98 0xa3 0x00 0x10
 
+# CHECK: lwxc1 $f20, $12($14)
+0x4d 0xcc 0x05 0x00
+
 # CHECK: madd   $6,  $7
 0x70 0xc7 0x00 0x00
 
@@ -404,6 +410,9 @@
 # CHECK: subu  $4, $3, $5
 0x00 0x65 0x20 0x23
 
+# CHECK: suxc1 $f4, $24($5)
+0x4c 0xb8 0x20 0x0d
+
 # CHECK: sw  $4, 24($5)
 0xac 0xa4 0x00 0x18
 
@@ -416,6 +425,9 @@
 # CHECK: swr $6, 16($7)
 0xb8 0xe6 0x00 0x10
 
+# CHECK: swxc1 $f26, $18($22)
+0x4e 0xd2 0xd0 0x08
+
 # CHECK: sync  7
 0x00 0x00 0x01 0xcf
 
diff --git a/test/MC/Disassembler/Mips/mips32r2_le.txt b/test/MC/Disassembler/Mips/mips32r2_le.txt
index c62c695..adafcf1 100644
--- a/test/MC/Disassembler/Mips/mips32r2_le.txt
+++ b/test/MC/Disassembler/Mips/mips32r2_le.txt
@@ -242,6 +242,9 @@
 # CHECK: lui  $6, 17767
 0x67 0x45 0x06 0x3c
 
+# CHECK: luxc1 $f0, $6($5)
+0x05 0x00 0xa6 0x4c
+
 # CHECK: lw  $4, 24($5)
 0x18 0x00 0xa4 0x8c
 
@@ -254,6 +257,9 @@
 # CHECK: lwr   $3, 16($5)
 0x10 0x00 0xa3 0x98
 
+# CHECK: lwxc1 $f20, $12($14)
+0x00 0x05 0xcc 0x4d
+
 # CHECK: madd   $6,  $7
 0x00 0x00 0xc7 0x70
 
@@ -404,6 +410,9 @@
 # CHECK: subu  $4, $3, $5
 0x23 0x20 0x65 0x00
 
+# CHECK: suxc1 $f4, $24($5)
+0x0d 0x20 0xb8 0x4c
+
 # CHECK: sw  $4, 24($5)
 0x18 0x00 0xa4 0xac
 
@@ -416,6 +425,9 @@
 # CHECK: swr $6, 16($7)
 0x10 0x00 0xe6 0xb8
 
+# CHECK: swxc1 $f26, $18($22)
+0x08 0xd0 0xd2 0x4e
+
 # CHECK: sync  7
 0xcf 0x01 0x00 0x00
 
diff --git a/test/MC/Disassembler/Mips/mips64.txt b/test/MC/Disassembler/Mips/mips64.txt
index b887473..f3d2d10 100644
--- a/test/MC/Disassembler/Mips/mips64.txt
+++ b/test/MC/Disassembler/Mips/mips64.txt
@@ -2,6 +2,9 @@
 # CHECK: daddiu $11, $26, 31949
 0x67 0x4b 0x7c 0xcd
 
+# CHECK: daddiu $sp, $sp, -32
+0x67 0xbd 0xff 0xe0
+
 # CHECK: daddu $26, $1, $11
 0x00 0x2b 0xd0 0x2d
 
@@ -64,3 +67,21 @@
 
 # CHECK: sd $6, 17767($zero)
 0xfc 0x06 0x45 0x67
+
+# CHECK: luxc1 $f0, $6($5)
+0x4c 0xa6 0x00 0x05
+
+# CHECK: lwxc1 $f20, $12($14)
+0x4d 0xcc 0x05 0x00
+
+# CHECK: suxc1 $f4, $24($5)
+0x4c 0xb8 0x20 0x0d
+
+# CHECK: swxc1 $f26, $18($22)
+0x4e 0xd2 0xd0 0x08
+
+# CHECK: ldxc1 $f2, $2($10)
+0x4d 0x42 0x00 0x81
+
+# CHECK: sdxc1 $f8, $4($25)
+0x4f 0x24 0x40 0x09
diff --git a/test/MC/Disassembler/Mips/mips64_le.txt b/test/MC/Disassembler/Mips/mips64_le.txt
index ddc3c2b..0d3d2fa 100644
--- a/test/MC/Disassembler/Mips/mips64_le.txt
+++ b/test/MC/Disassembler/Mips/mips64_le.txt
@@ -64,3 +64,21 @@
 
 # CHECK: sd $6, 17767($zero)
 0x67 0x45 0x06 0xfc
+
+# CHECK: luxc1 $f0, $6($5)
+0x05 0x00 0xa6 0x4c
+
+# CHECK: lwxc1 $f20, $12($14)
+0x00 0x05 0xcc 0x4d
+
+# CHECK: suxc1 $f4, $24($5)
+0x0d 0x20 0xb8 0x4c
+
+# CHECK: swxc1 $f26, $18($22)
+0x08 0xd0 0xd2 0x4e
+
+# CHECK: ldxc1 $f2, $2($10)
+0x81 0x00 0x42 0x4d
+
+# CHECK: sdxc1 $f8, $4($25)
+0x09 0x40 0x24 0x4f
diff --git a/test/MC/Disassembler/SystemZ/insns-pcrel.txt b/test/MC/Disassembler/SystemZ/insns-pcrel.txt
index c565b6e..b7edab6 100644
--- a/test/MC/Disassembler/SystemZ/insns-pcrel.txt
+++ b/test/MC/Disassembler/SystemZ/insns-pcrel.txt
@@ -1330,3 +1330,403 @@
 # 0x0000077c:
 # CHECK: brctg %r15, 0x1077a
 0xa7 0xf7 0x7f 0xff
+
+# 0x00000780:
+# CHECK: pfdrl 0, 0x780
+0xc6 0x02 0x00 0x00 0x00 0x00
+
+# 0x00000786:
+# CHECK: pfdrl 15, 0x786
+0xc6 0xf2 0x00 0x00 0x00 0x00
+
+# 0x0000078c:
+# CHECK: pfdrl 0, 0x78a
+0xc6 0x02 0xff 0xff 0xff 0xff
+
+# 0x00000792:
+# CHECK: pfdrl 15, 0x790
+0xc6 0xf2 0xff 0xff 0xff 0xff
+
+# 0x00000798:
+# CHECK: pfdrl 0, 0xffffffff00000798
+0xc6 0x02 0x80 0x00 0x00 0x00
+
+# 0x0000079e:
+# CHECK: pfdrl 15, 0xffffffff0000079e
+0xc6 0xf2 0x80 0x00 0x00 0x00
+
+# 0x000007a4:
+# CHECK: pfdrl 0, 0x1000007a2
+0xc6 0x02 0x7f 0xff 0xff 0xff
+
+# 0x000007aa:
+# CHECK: pfdrl 15, 0x1000007a8
+0xc6 0xf2 0x7f 0xff 0xff 0xff
+
+# 0x000007b0:
+# CHECK: clgrj %r0, %r0, 0, 0x7b0
+0xec 0x00 0x00 0x00 0x00 0x65
+
+# 0x000007b6:
+# CHECK: clgrj %r0, %r15, 0, 0x7b6
+0xec 0x0f 0x00 0x00 0x00 0x65
+
+# 0x000007bc:
+# CHECK: clgrj %r15, %r0, 0, 0x7bc
+0xec 0xf0 0x00 0x00 0x00 0x65
+
+# 0x000007c2:
+# CHECK: clgrj %r7, %r8, 0, 0x7c2
+0xec 0x78 0x00 0x00 0x00 0x65
+
+# 0x000007c8:
+# CHECK: clgrj %r0, %r0, 0, 0x7c6
+0xec 0x00 0xff 0xff 0x00 0x65
+
+# 0x000007ce:
+# CHECK: clgrj %r0, %r0, 0, 0xffffffffffff07ce
+0xec 0x00 0x80 0x00 0x00 0x65
+
+# 0x000007d4:
+# CHECK: clgrj %r0, %r0, 0, 0x107d2
+0xec 0x00 0x7f 0xff 0x00 0x65
+
+# 0x000007da:
+# CHECK: clgrj %r0, %r0, 1, 0x7da
+0xec 0x00 0x00 0x00 0x10 0x65
+
+# 0x000007e0:
+# CHECK: clgrjh %r0, %r0, 0x7e0
+0xec 0x00 0x00 0x00 0x20 0x65
+
+# 0x000007e6:
+# CHECK: clgrj %r0, %r0, 3, 0x7e6
+0xec 0x00 0x00 0x00 0x30 0x65
+
+# 0x000007ec:
+# CHECK: clgrjl %r0, %r0, 0x7ec
+0xec 0x00 0x00 0x00 0x40 0x65
+
+# 0x000007f2:
+# CHECK: clgrj %r0, %r0, 5, 0x7f2
+0xec 0x00 0x00 0x00 0x50 0x65
+
+# 0x000007f8:
+# CHECK: clgrjlh %r0, %r0, 0x7f8
+0xec 0x00 0x00 0x00 0x60 0x65
+
+# 0x000007fe:
+# CHECK: clgrj %r0, %r0, 7, 0x7fe
+0xec 0x00 0x00 0x00 0x70 0x65
+
+# 0x00000804:
+# CHECK: clgrje %r0, %r0, 0x804
+0xec 0x00 0x00 0x00 0x80 0x65
+
+# 0x0000080a:
+# CHECK: clgrj %r0, %r0, 9, 0x80a
+0xec 0x00 0x00 0x00 0x90 0x65
+
+# 0x00000810:
+# CHECK: clgrjhe %r0, %r0, 0x810
+0xec 0x00 0x00 0x00 0xa0 0x65
+
+# 0x00000816:
+# CHECK: clgrj %r0, %r0, 11, 0x816
+0xec 0x00 0x00 0x00 0xb0 0x65
+
+# 0x0000081c:
+# CHECK: clgrjle %r0, %r0, 0x81c
+0xec 0x00 0x00 0x00 0xc0 0x65
+
+# 0x00000822:
+# CHECK: clgrj %r0, %r0, 13, 0x822
+0xec 0x00 0x00 0x00 0xd0 0x65
+
+# 0x00000828:
+# CHECK: clgrj %r0, %r0, 14, 0x828
+0xec 0x00 0x00 0x00 0xe0 0x65
+
+# 0x0000082e:
+# CHECK: clgrj %r0, %r0, 15, 0x82e
+0xec 0x00 0x00 0x00 0xf0 0x65
+
+# 0x00000834:
+# CHECK: clrj %r0, %r0, 0, 0x834
+0xec 0x00 0x00 0x00 0x00 0x77
+
+# 0x0000083a:
+# CHECK: clrj %r0, %r15, 0, 0x83a
+0xec 0x0f 0x00 0x00 0x00 0x77
+
+# 0x00000840:
+# CHECK: clrj %r15, %r0, 0, 0x840
+0xec 0xf0 0x00 0x00 0x00 0x77
+
+# 0x00000846:
+# CHECK: clrj %r7, %r8, 0, 0x846
+0xec 0x78 0x00 0x00 0x00 0x77
+
+# 0x0000084c:
+# CHECK: clrj %r0, %r0, 0, 0x84a
+0xec 0x00 0xff 0xff 0x00 0x77
+
+# 0x00000852:
+# CHECK: clrj %r0, %r0, 0, 0xffffffffffff0852
+0xec 0x00 0x80 0x00 0x00 0x77
+
+# 0x00000858:
+# CHECK: clrj %r0, %r0, 0, 0x10856
+0xec 0x00 0x7f 0xff 0x00 0x77
+
+# 0x0000085e:
+# CHECK: clrj %r0, %r0, 1, 0x85e
+0xec 0x00 0x00 0x00 0x10 0x77
+
+# 0x00000864:
+# CHECK: clrjh %r0, %r0, 0x864
+0xec 0x00 0x00 0x00 0x20 0x77
+
+# 0x0000086a:
+# CHECK: clrj %r0, %r0, 3, 0x86a
+0xec 0x00 0x00 0x00 0x30 0x77
+
+# 0x00000870:
+# CHECK: clrjl %r0, %r0, 0x870
+0xec 0x00 0x00 0x00 0x40 0x77
+
+# 0x00000876:
+# CHECK: clrj %r0, %r0, 5, 0x876
+0xec 0x00 0x00 0x00 0x50 0x77
+
+# 0x0000087c:
+# CHECK: clrjlh %r0, %r0, 0x87c
+0xec 0x00 0x00 0x00 0x60 0x77
+
+# 0x00000882:
+# CHECK: clrj %r0, %r0, 7, 0x882
+0xec 0x00 0x00 0x00 0x70 0x77
+
+# 0x00000888:
+# CHECK: clrje %r0, %r0, 0x888
+0xec 0x00 0x00 0x00 0x80 0x77
+
+# 0x0000088e:
+# CHECK: clrj %r0, %r0, 9, 0x88e
+0xec 0x00 0x00 0x00 0x90 0x77
+
+# 0x00000894:
+# CHECK: clrjhe %r0, %r0, 0x894
+0xec 0x00 0x00 0x00 0xa0 0x77
+
+# 0x0000089a:
+# CHECK: clrj %r0, %r0, 11, 0x89a
+0xec 0x00 0x00 0x00 0xb0 0x77
+
+# 0x000008a0:
+# CHECK: clrjle %r0, %r0, 0x8a0
+0xec 0x00 0x00 0x00 0xc0 0x77
+
+# 0x000008a6:
+# CHECK: clrj %r0, %r0, 13, 0x8a6
+0xec 0x00 0x00 0x00 0xd0 0x77
+
+# 0x000008ac:
+# CHECK: clrj %r0, %r0, 14, 0x8ac
+0xec 0x00 0x00 0x00 0xe0 0x77
+
+# 0x000008b2:
+# CHECK: clrj %r0, %r0, 15, 0x8b2
+0xec 0x00 0x00 0x00 0xf0 0x77
+
+# 0x000008b8:
+# CHECK: clgij %r0, 0, 0, 0x8b8
+0xec 0x00 0x00 0x00 0x00 0x7d
+
+# 0x000008be:
+# CHECK: clgij %r0, 127, 0, 0x8be
+0xec 0x00 0x00 0x00 0x7f 0x7d
+
+# 0x000008c4:
+# CHECK: clgij %r0, 128, 0, 0x8c4
+0xec 0x00 0x00 0x00 0x80 0x7d
+
+# 0x000008ca:
+# CHECK: clgij %r0, 255, 0, 0x8ca
+0xec 0x00 0x00 0x00 0xff 0x7d
+
+# 0x000008d0:
+# CHECK: clgij %r15, 0, 0, 0x8d0
+0xec 0xf0 0x00 0x00 0x00 0x7d
+
+# 0x000008d6:
+# CHECK: clgij %r7, 100, 0, 0x8d6
+0xec 0x70 0x00 0x00 0x64 0x7d
+
+# 0x000008dc:
+# CHECK: clgij %r0, 0, 0, 0x8da
+0xec 0x00 0xff 0xff 0x00 0x7d
+
+# 0x000008e2:
+# CHECK: clgij %r0, 0, 0, 0xffffffffffff08e2
+0xec 0x00 0x80 0x00 0x00 0x7d
+
+# 0x000008e8:
+# CHECK: clgij %r0, 0, 0, 0x108e6
+0xec 0x00 0x7f 0xff 0x00 0x7d
+
+# 0x000008ee:
+# CHECK: clgij %r0, 0, 1, 0x8ee
+0xec 0x01 0x00 0x00 0x00 0x7d
+
+# 0x000008f4:
+# CHECK: clgijh %r0, 0, 0x8f4
+0xec 0x02 0x00 0x00 0x00 0x7d
+
+# 0x000008fa:
+# CHECK: clgij %r0, 0, 3, 0x8fa
+0xec 0x03 0x00 0x00 0x00 0x7d
+
+# 0x00000900:
+# CHECK: clgijl %r0, 0, 0x900
+0xec 0x04 0x00 0x00 0x00 0x7d
+
+# 0x00000906:
+# CHECK: clgij %r0, 0, 5, 0x906
+0xec 0x05 0x00 0x00 0x00 0x7d
+
+# 0x0000090c:
+# CHECK: clgijlh %r0, 0, 0x90c
+0xec 0x06 0x00 0x00 0x00 0x7d
+
+# 0x00000912:
+# CHECK: clgij %r0, 0, 7, 0x912
+0xec 0x07 0x00 0x00 0x00 0x7d
+
+# 0x00000918:
+# CHECK: clgije %r0, 0, 0x918
+0xec 0x08 0x00 0x00 0x00 0x7d
+
+# 0x0000091e:
+# CHECK: clgij %r0, 0, 9, 0x91e
+0xec 0x09 0x00 0x00 0x00 0x7d
+
+# 0x00000924:
+# CHECK: clgijhe %r0, 0, 0x924
+0xec 0x0a 0x00 0x00 0x00 0x7d
+
+# 0x0000092a:
+# CHECK: clgij %r0, 0, 11, 0x92a
+0xec 0x0b 0x00 0x00 0x00 0x7d
+
+# 0x00000930:
+# CHECK: clgijle %r0, 0, 0x930
+0xec 0x0c 0x00 0x00 0x00 0x7d
+
+# 0x00000936:
+# CHECK: clgij %r0, 0, 13, 0x936
+0xec 0x0d 0x00 0x00 0x00 0x7d
+
+# 0x0000093c:
+# CHECK: clgij %r0, 0, 14, 0x93c
+0xec 0x0e 0x00 0x00 0x00 0x7d
+
+# 0x00000942:
+# CHECK: clgij %r0, 0, 15, 0x942
+0xec 0x0f 0x00 0x00 0x00 0x7d
+
+# 0x00000948:
+# CHECK: clij %r0, 0, 0, 0x948
+0xec 0x00 0x00 0x00 0x00 0x7f
+
+# 0x0000094e:
+# CHECK: clij %r0, 127, 0, 0x94e
+0xec 0x00 0x00 0x00 0x7f 0x7f
+
+# 0x00000954:
+# CHECK: clij %r0, 128, 0, 0x954
+0xec 0x00 0x00 0x00 0x80 0x7f
+
+# 0x0000095a:
+# CHECK: clij %r0, 255, 0, 0x95a
+0xec 0x00 0x00 0x00 0xff 0x7f
+
+# 0x00000960:
+# CHECK: clij %r15, 0, 0, 0x960
+0xec 0xf0 0x00 0x00 0x00 0x7f
+
+# 0x00000966:
+# CHECK: clij %r7, 100, 0, 0x966
+0xec 0x70 0x00 0x00 0x64 0x7f
+
+# 0x0000096c:
+# CHECK: clij %r0, 0, 0, 0x96a
+0xec 0x00 0xff 0xff 0x00 0x7f
+
+# 0x00000972:
+# CHECK: clij %r0, 0, 0, 0xffffffffffff0972
+0xec 0x00 0x80 0x00 0x00 0x7f
+
+# 0x00000978:
+# CHECK: clij %r0, 0, 0, 0x10976
+0xec 0x00 0x7f 0xff 0x00 0x7f
+
+# 0x0000097e:
+# CHECK: clij %r0, 0, 1, 0x97e
+0xec 0x01 0x00 0x00 0x00 0x7f
+
+# 0x00000984:
+# CHECK: clijh %r0, 0, 0x984
+0xec 0x02 0x00 0x00 0x00 0x7f
+
+# 0x0000098a:
+# CHECK: clij %r0, 0, 3, 0x98a
+0xec 0x03 0x00 0x00 0x00 0x7f
+
+# 0x00000990:
+# CHECK: clijl %r0, 0, 0x990
+0xec 0x04 0x00 0x00 0x00 0x7f
+
+# 0x00000996:
+# CHECK: clij %r0, 0, 5, 0x996
+0xec 0x05 0x00 0x00 0x00 0x7f
+
+# 0x0000099c:
+# CHECK: clijlh %r0, 0, 0x99c
+0xec 0x06 0x00 0x00 0x00 0x7f
+
+# 0x000009a2:
+# CHECK: clij %r0, 0, 7, 0x9a2
+0xec 0x07 0x00 0x00 0x00 0x7f
+
+# 0x000009a8:
+# CHECK: clije %r0, 0, 0x9a8
+0xec 0x08 0x00 0x00 0x00 0x7f
+
+# 0x000009ae:
+# CHECK: clij %r0, 0, 9, 0x9ae
+0xec 0x09 0x00 0x00 0x00 0x7f
+
+# 0x000009b4:
+# CHECK: clijhe %r0, 0, 0x9b4
+0xec 0x0a 0x00 0x00 0x00 0x7f
+
+# 0x000009ba:
+# CHECK: clij %r0, 0, 11, 0x9ba
+0xec 0x0b 0x00 0x00 0x00 0x7f
+
+# 0x000009c0:
+# CHECK: clijle %r0, 0, 0x9c0
+0xec 0x0c 0x00 0x00 0x00 0x7f
+
+# 0x000009c6:
+# CHECK: clij %r0, 0, 13, 0x9c6
+0xec 0x0d 0x00 0x00 0x00 0x7f
+
+# 0x000009cc:
+# CHECK: clij %r0, 0, 14, 0x9cc
+0xec 0x0e 0x00 0x00 0x00 0x7f
+
+# 0x000009d2:
+# CHECK: clij %r0, 0, 15, 0x9d2
+0xec 0x0f 0x00 0x00 0x00 0x7f
diff --git a/test/MC/Disassembler/SystemZ/insns.txt b/test/MC/Disassembler/SystemZ/insns.txt
index 51860cc..78d348d 100644
--- a/test/MC/Disassembler/SystemZ/insns.txt
+++ b/test/MC/Disassembler/SystemZ/insns.txt
@@ -349,6 +349,24 @@
 # CHECK: ahy %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x7a
 
+# CHECK: aih %r0, -2147483648
+0xcc 0x08 0x80 0x00 0x00 0x00
+
+# CHECK: aih %r0, -1
+0xcc 0x08 0xff 0xff 0xff 0xff
+
+# CHECK: aih %r0, 0
+0xcc 0x08 0x00 0x00 0x00 0x00
+
+# CHECK: aih %r0, 1
+0xcc 0x08 0x00 0x00 0x00 0x01
+
+# CHECK: aih %r0, 2147483647
+0xcc 0x08 0x7f 0xff 0xff 0xff
+
+# CHECK: aih %r15, 0
+0xcc 0xf8 0x00 0x00 0x00 0x00
+
 # CHECK: alcgr %r0, %r0
 0xb9 0x88 0x00 0x00
 
@@ -772,6 +790,51 @@
 # CHECK: basr %r15, %r1
 0x0d 0xf1
 
+# CHECK: bcr 0, %r14
+0x07 0x0e
+
+# CHECK: bor %r13
+0x07 0x1d
+
+# CHECK: bhr %r12
+0x07 0x2c
+
+# CHECK: bnler %r11
+0x07 0x3b
+
+# CHECK: blr %r10
+0x07 0x4a
+
+# CHECK: bnher %r9
+0x07 0x59
+
+# CHECK: blhr %r8
+0x07 0x68
+
+# CHECK: bner %r7
+0x07 0x77
+
+# CHECK: ber %r6
+0x07 0x86
+
+# CHECK: bnlhr %r5
+0x07 0x95
+
+# CHECK: bher %r4
+0x07 0xa4
+
+# CHECK: bnlr %r3
+0x07 0xb3
+
+# CHECK: bler %r2
+0x07 0xc2
+
+# CHECK: bnhr %r1
+0x07 0xd1
+
+# CHECK: bnor %r0
+0x07 0xe0
+
 # CHECK: br %r1
 0x07 0xf1
 
@@ -1198,6 +1261,36 @@
 # CHECK: cgxbr %r15, 0, %f0
 0xb3 0xaa 0x00 0xf0
 
+# CHECK: chf %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xcd
+
+# CHECK: chf %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xcd
+
+# CHECK: chf %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xcd
+
+# CHECK: chf %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xcd
+
+# CHECK: chf %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xcd
+
+# CHECK: chf %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xcd
+
+# CHECK: chf %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xcd
+
+# CHECK: chf %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xcd
+
+# CHECK: chf %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xcd
+
+# CHECK: chf %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xcd
+
 # CHECK: chhsi 0, 0
 0xe5 0x54 0x00 0x00 0x00 0x00
 
@@ -1333,6 +1426,60 @@
 # CHECK: chy %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x79
 
+# CHECK: cih %r0, -2147483648
+0xcc 0x0d 0x80 0x00 0x00 0x00
+
+# CHECK: cih %r0, -1
+0xcc 0x0d 0xff 0xff 0xff 0xff
+
+# CHECK: cih %r0, 0
+0xcc 0x0d 0x00 0x00 0x00 0x00
+
+# CHECK: cih %r0, 1
+0xcc 0x0d 0x00 0x00 0x00 0x01
+
+# CHECK: cih %r0, 2147483647
+0xcc 0x0d 0x7f 0xff 0xff 0xff
+
+# CHECK: cih %r15, 0
+0xcc 0xfd 0x00 0x00 0x00 0x00
+
+# CHECK: clc 0(1), 0
+0xd5 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: clc 0(1), 0(%r1)
+0xd5 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: clc 0(1), 0(%r15)
+0xd5 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: clc 0(1), 4095
+0xd5 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: clc 0(1), 4095(%r1)
+0xd5 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: clc 0(1), 4095(%r15)
+0xd5 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: clc 0(1,%r1), 0
+0xd5 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: clc 0(1,%r15), 0
+0xd5 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: clc 4095(1,%r1), 0
+0xd5 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: clc 4095(1,%r15), 0
+0xd5 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: clc 0(256,%r1), 0
+0xd5 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: clc 0(256,%r15), 0
+0xd5 0xff 0xf0 0x00 0x00 0x00
+
 # CHECK: clfhsi 0, 0
 0xe5 0x5d 0x00 0x00 0x00 0x00
 
@@ -1477,6 +1624,36 @@
 # CHECK: clg %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x21
 
+# CHECK: clhf %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xcf
+
+# CHECK: clhf %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xcf
+
+# CHECK: clhf %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xcf
+
+# CHECK: clhf %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xcf
+
+# CHECK: clhf %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xcf
+
+# CHECK: clhf %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xcf
+
+# CHECK: clhf %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xcf
+
+# CHECK: clhf %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xcf
+
+# CHECK: clhf %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xcf
+
+# CHECK: clhf %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xcf
+
 # CHECK: clhhsi 0, 0
 0xe5 0x55 0x00 0x00 0x00 0x00
 
@@ -1519,6 +1696,18 @@
 # CHECK: cli 4095(%r15), 42
 0x95 0x2a 0xff 0xff
 
+# CHECK: clih %r0, 0
+0xcc 0x0f 0x00 0x00 0x00 0x00
+
+# CHECK: clih %r0, 1
+0xcc 0x0f 0x00 0x00 0x00 0x01
+
+# CHECK: clih %r0, 4294967295
+0xcc 0x0f 0xff 0xff 0xff 0xff
+
+# CHECK: clih %r15, 0
+0xcc 0xff 0x00 0x00 0x00 0x00
+
 # CHECK: cliy -524288, 0
 0xeb 0x00 0x00 0x00 0x80 0x55
 
@@ -1561,6 +1750,18 @@
 # CHECK: clr %r7, %r8
 0x15 0x78
 
+# CHECK: clst %r0, %r0
+0xb2 0x5d 0x00 0x00
+
+# CHECK: clst %r0, %r15
+0xb2 0x5d 0x00 0x0f
+
+# CHECK: clst %r15, %r0
+0xb2 0x5d 0x00 0xf0
+
+# CHECK: clst %r7, %r8
+0xb2 0x5d 0x00 0x78
+
 # CHECK: cl %r0, 0
 0x55 0x00 0x00 0x00
 
@@ -2101,6 +2302,24 @@
 # CHECK: fidbr %f15, 0, %f0
 0xb3 0x5f 0x00 0xf0
 
+# CHECK: fidbra	%f0, 0, %f0, 1
+0xb3 0x5f 0x01 0x00
+
+# CHECK: fidbra	%f0, 0, %f0, 15
+0xb3 0x5f 0x0f 0x00
+
+# CHECK: fidbra	%f0, 0, %f15, 1
+0xb3 0x5f 0x01 0x0f
+
+# CHECK: fidbra	%f0, 15, %f0, 1
+0xb3 0x5f 0xf1 0x00
+
+# CHECK: fidbra	%f4, 5, %f6, 7
+0xb3 0x5f 0x57 0x46
+
+# CHECK: fidbra	%f15, 0, %f0, 1
+0xb3 0x5f 0x01 0xf0
+
 # CHECK: fiebr %f0, 0, %f0
 0xb3 0x57 0x00 0x00
 
@@ -2116,6 +2335,24 @@
 # CHECK: fiebr %f15, 0, %f0
 0xb3 0x57 0x00 0xf0
 
+# CHECK: fiebra	%f0, 0, %f0, 1
+0xb3 0x57 0x01 0x00
+
+# CHECK: fiebra	%f0, 0, %f0, 15
+0xb3 0x57 0x0f 0x00
+
+# CHECK: fiebra	%f0, 0, %f15, 1
+0xb3 0x57 0x01 0x0f
+
+# CHECK: fiebra	%f0, 15, %f0, 1
+0xb3 0x57 0xf1 0x00
+
+# CHECK: fiebra	%f4, 5, %f6, 7
+0xb3 0x57 0x57 0x46
+
+# CHECK: fiebra	%f15, 0, %f0, 1
+0xb3 0x57 0x01 0xf0
+
 # CHECK: fixbr %f0, 0, %f0
 0xb3 0x47 0x00 0x00
 
@@ -2131,6 +2368,24 @@
 # CHECK: fixbr %f13, 0, %f0
 0xb3 0x47 0x00 0xd0
 
+# CHECK: fixbra	%f0, 0, %f0, 1
+0xb3 0x47 0x01 0x00
+
+# CHECK: fixbra	%f0, 0, %f0, 15
+0xb3 0x47 0x0f 0x00
+
+# CHECK: fixbra	%f0, 0, %f13, 1
+0xb3 0x47 0x01 0x0d
+
+# CHECK: fixbra	%f0, 15, %f0, 1
+0xb3 0x47 0xf1 0x00
+
+# CHECK: fixbra	%f4, 5, %f8, 9
+0xb3 0x47 0x59 0x48
+
+# CHECK: fixbra	%f13, 0, %f0, 1
+0xb3 0x47 0x01 0xd0
+
 # CHECK: flogr %r0, %r0
 0xb9 0x83 0x00 0x00
 
@@ -2260,6 +2515,15 @@
 # CHECK: iill %r15, 0
 0xa5 0xf3 0x00 0x00
 
+# CHECK: ipm %r0
+0xb2 0x22 0x00 0x00
+
+# CHECK: ipm %r1
+0xb2 0x22 0x00 0x10
+
+# CHECK: ipm %r15
+0xb2 0x22 0x00 0xf0
+
 # CHECK: la %r0, 0
 0x41 0x00 0x00 0x00
 
@@ -2350,6 +2614,36 @@
 # CHECK: lb %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x76
 
+# CHECK: lbh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xc0
+
+# CHECK: lbh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xc0
+
+# CHECK: lbh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xc0
+
+# CHECK: lbh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xc0
+
+# CHECK: lbh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xc0
+
+# CHECK: lbh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xc0
+
+# CHECK: lbh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xc0
+
+# CHECK: lbh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xc0
+
+# CHECK: lbh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xc0
+
+# CHECK: lbh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xc0
+
 # CHECK: lcdbr %f0, %f9
 0xb3 0x13 0x00 0x09
 
@@ -2638,6 +2932,36 @@
 # CHECK: ley %f15, 0
 0xed 0xf0 0x00 0x00 0x00 0x64
 
+# CHECK: lfh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xca
+
+# CHECK: lfh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xca
+
+# CHECK: lfh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xca
+
+# CHECK: lfh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xca
+
+# CHECK: lfh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xca
+
+# CHECK: lfh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xca
+
+# CHECK: lfh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xca
+
+# CHECK: lfh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xca
+
+# CHECK: lfh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xca
+
+# CHECK: lfh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xca
+
 # CHECK: lgbr %r0, %r15
 0xb9 0x06 0x00 0x0f
 
@@ -2866,6 +3190,36 @@
 # CHECK: lhi %r15, 0
 0xa7 0xf8 0x00 0x00
 
+# CHECK: lhh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xc4
+
+# CHECK: lhh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xc4
+
+# CHECK: lhh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xc4
+
+# CHECK: lhh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xc4
+
+# CHECK: lhh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xc4
+
+# CHECK: lhh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xc4
+
+# CHECK: lhh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xc4
+
+# CHECK: lhh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xc4
+
+# CHECK: lhh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xc4
+
+# CHECK: lhh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xc4
+
 # CHECK: lhr %r0, %r15
 0xb9 0x27 0x00 0x0f
 
@@ -2965,6 +3319,36 @@
 # CHECK: llc %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x94
 
+# CHECK: llch %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xc2
+
+# CHECK: llch %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xc2
+
+# CHECK: llch %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xc2
+
+# CHECK: llch %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xc2
+
+# CHECK: llch %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xc2
+
+# CHECK: llch %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xc2
+
+# CHECK: llch %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xc2
+
+# CHECK: llch %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xc2
+
+# CHECK: llch %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xc2
+
+# CHECK: llch %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xc2
+
 # CHECK: llgcr %r0, %r15
 0xb9 0x84 0x00 0x0f
 
@@ -3121,6 +3505,36 @@
 # CHECK: llh %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x95
 
+# CHECK: llhh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xc6
+
+# CHECK: llhh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xc6
+
+# CHECK: llhh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xc6
+
+# CHECK: llhh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xc6
+
+# CHECK: llhh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xc6
+
+# CHECK: llhh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xc6
+
+# CHECK: llhh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xc6
+
+# CHECK: llhh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xc6
+
+# CHECK: llhh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xc6
+
+# CHECK: llhh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xc6
+
 # CHECK: llihf %r0, 0
 0xc0 0x0e 0x00 0x00 0x00 0x00
 
@@ -3250,6 +3664,42 @@
 # CHECK: lnebr %f15, %f9
 0xb3 0x01 0x00 0xf9
 
+# CHECK: lngfr %r0, %r0
+0xb9 0x11 0x00 0x00
+
+# CHECK: lngfr %r0, %r15
+0xb9 0x11 0x00 0x0f
+
+# CHECK: lngfr %r15, %r0
+0xb9 0x11 0x00 0xf0
+
+# CHECK: lngfr %r7, %r8
+0xb9 0x11 0x00 0x78
+
+# CHECK: lngr %r0, %r0
+0xb9 0x01 0x00 0x00
+
+# CHECK: lngr %r0, %r15
+0xb9 0x01 0x00 0x0f
+
+# CHECK: lngr %r15, %r0
+0xb9 0x01 0x00 0xf0
+
+# CHECK: lngr %r7, %r8
+0xb9 0x01 0x00 0x78
+
+# CHECK: lnr %r0, %r0
+0x11 0x00
+
+# CHECK: lnr %r0, %r15
+0x11 0x0f
+
+# CHECK: lnr %r15, %r0
+0x11 0xf0
+
+# CHECK: lnr %r7, %r8
+0x11 0x78
+
 # CHECK: lnxbr %f0, %f8
 0xb3 0x41 0x00 0x08
 
@@ -3478,6 +3928,42 @@
 # CHECK: lpebr %f15, %f9
 0xb3 0x00 0x00 0xf9
 
+# CHECK: lpgfr %r0, %r0
+0xb9 0x10 0x00 0x00
+
+# CHECK: lpgfr %r0, %r15
+0xb9 0x10 0x00 0x0f
+
+# CHECK: lpgfr %r15, %r0
+0xb9 0x10 0x00 0xf0
+
+# CHECK: lpgfr %r7, %r8
+0xb9 0x10 0x00 0x78
+
+# CHECK: lpgr %r0, %r0
+0xb9 0x00 0x00 0x00
+
+# CHECK: lpgr %r0, %r15
+0xb9 0x00 0x00 0x0f
+
+# CHECK: lpgr %r15, %r0
+0xb9 0x00 0x00 0xf0
+
+# CHECK: lpgr %r7, %r8
+0xb9 0x00 0x00 0x78
+
+# CHECK: lpr %r0, %r0
+0x10 0x00
+
+# CHECK: lpr %r0, %r15
+0x10 0x0f
+
+# CHECK: lpr %r15, %r0
+0x10 0xf0
+
+# CHECK: lpr %r7, %r8
+0x10 0x78
+
 # CHECK: lpxbr %f0, %f8
 0xb3 0x40 0x00 0x08
 
@@ -4435,6 +4921,42 @@
 # CHECK: msy %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x51
 
+# CHECK: mvc 0(1), 0
+0xd2 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: mvc 0(1), 0(%r1)
+0xd2 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: mvc 0(1), 0(%r15)
+0xd2 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: mvc 0(1), 4095
+0xd2 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: mvc 0(1), 4095(%r1)
+0xd2 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: mvc 0(1), 4095(%r15)
+0xd2 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: mvc 0(1,%r1), 0
+0xd2 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: mvc 0(1,%r15), 0
+0xd2 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: mvc 4095(1,%r1), 0
+0xd2 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: mvc 4095(1,%r15), 0
+0xd2 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: mvc 0(256,%r1), 0
+0xd2 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: mvc 0(256,%r15), 0
+0xd2 0xff 0xf0 0x00 0x00 0x00
+
 # CHECK: mvghi 0, 0
 0xe5 0x48 0x00 0x00 0x00 0x00
 
@@ -4585,6 +5107,18 @@
 # CHECK: mviy 524287(%r15), 42
 0xeb 0x2a 0xff 0xff 0x7f 0x52
 
+# CHECK: mvst %r0, %r0
+0xb2 0x55 0x00 0x00
+
+# CHECK: mvst %r0, %r15
+0xb2 0x55 0x00 0x0f
+
+# CHECK: mvst %r15, %r0
+0xb2 0x55 0x00 0xf0
+
+# CHECK: mvst %r7, %r8
+0xb2 0x55 0x00 0x78
+
 # CHECK: mxbr %f0, %f0
 0xb3 0x4c 0x00 0x00
 
@@ -4630,6 +5164,42 @@
 # CHECK: mxdb %f13, 0
 0xed 0xd0 0x00 0x00 0x00 0x07
 
+# CHECK: nc 0(1), 0
+0xd4 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: nc 0(1), 0(%r1)
+0xd4 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: nc 0(1), 0(%r15)
+0xd4 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: nc 0(1), 4095
+0xd4 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: nc 0(1), 4095(%r1)
+0xd4 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: nc 0(1), 4095(%r15)
+0xd4 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: nc 0(1,%r1), 0
+0xd4 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: nc 0(1,%r15), 0
+0xd4 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: nc 4095(1,%r1), 0
+0xd4 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: nc 4095(1,%r15), 0
+0xd4 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: nc 0(256,%r1), 0
+0xd4 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: nc 0(256,%r15), 0
+0xd4 0xff 0xf0 0x00 0x00 0x00
+
 # CHECK: ngr %r0, %r0
 0xb9 0x80 0x00 0x00
 
@@ -4864,6 +5434,42 @@
 # CHECK: ny %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x54
 
+# CHECK: oc 0(1), 0
+0xd6 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: oc 0(1), 0(%r1)
+0xd6 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: oc 0(1), 0(%r15)
+0xd6 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: oc 0(1), 4095
+0xd6 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: oc 0(1), 4095(%r1)
+0xd6 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: oc 0(1), 4095(%r15)
+0xd6 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: oc 0(1,%r1), 0
+0xd6 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: oc 0(1,%r15), 0
+0xd6 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: oc 4095(1,%r1), 0
+0xd6 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: oc 4095(1,%r15), 0
+0xd6 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: oc 0(256,%r1), 0
+0xd6 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: oc 0(256,%r15), 0
+0xd6 0xff 0xf0 0x00 0x00 0x00
+
 # CHECK: ogr %r0, %r0
 0xb9 0x81 0x00 0x00
 
@@ -5098,6 +5704,36 @@
 # CHECK: oy %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x56
 
+# CHECK: pfd 0, -524288
+0xe3 0x00 0x00 0x00 0x80 0x36
+
+# CHECK: pfd 0, -1
+0xe3 0x00 0x0f 0xff 0xff 0x36
+
+# CHECK: pfd 0, 0
+0xe3 0x00 0x00 0x00 0x00 0x36
+
+# CHECK: pfd 0, 1
+0xe3 0x00 0x00 0x01 0x00 0x36
+
+# CHECK: pfd 0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0x36
+
+# CHECK: pfd 0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0x36
+
+# CHECK: pfd 0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0x36
+
+# CHECK: pfd 0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0x36
+
+# CHECK: pfd 0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0x36
+
+# CHECK: pfd 15, 0
+0xe3 0xf0 0x00 0x00 0x00 0x36
+
 # CHECK: risbg %r0, %r0, 0, 0, 0
 0xec 0x00 0x00 0x00 0x00 0x55
 
@@ -6148,6 +6784,18 @@
 # CHECK: srk %r2, %r3, %r4
 0xb9 0xf9 0x40 0x23
 
+# CHECK: srst %r0, %r0
+0xb2 0x5e 0x00 0x00
+
+# CHECK: srst %r0, %r15
+0xb2 0x5e 0x00 0x0f
+
+# CHECK: srst %r15, %r0
+0xb2 0x5e 0x00 0xf0
+
+# CHECK: srst %r7, %r8
+0xb2 0x5e 0x00 0x78
+
 # CHECK: stc %r0, 0
 0x42 0x00 0x00 0x00
 
@@ -6169,6 +6817,36 @@
 # CHECK: stc %r15, 0
 0x42 0xf0 0x00 0x00
 
+# CHECK: stch %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xc3
+
+# CHECK: stch %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xc3
+
+# CHECK: stch %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xc3
+
+# CHECK: stch %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xc3
+
+# CHECK: stch %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xc3
+
+# CHECK: stch %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xc3
+
+# CHECK: stch %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xc3
+
+# CHECK: stch %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xc3
+
+# CHECK: stch %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xc3
+
+# CHECK: stch %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xc3
+
 # CHECK: stcy %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x72
 
@@ -6352,6 +7030,66 @@
 # CHECK: sth %r15, 0
 0x40 0xf0 0x00 0x00
 
+# CHECK: sthh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xc7
+
+# CHECK: sthh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xc7
+
+# CHECK: sthh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xc7
+
+# CHECK: sthh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xc7
+
+# CHECK: sthh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xc7
+
+# CHECK: sthh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xc7
+
+# CHECK: sthh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xc7
+
+# CHECK: sthh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xc7
+
+# CHECK: sthh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xc7
+
+# CHECK: sthh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xc7
+
+# CHECK: stfh %r0, -524288
+0xe3 0x00 0x00 0x00 0x80 0xcb
+
+# CHECK: stfh %r0, -1
+0xe3 0x00 0x0f 0xff 0xff 0xcb
+
+# CHECK: stfh %r0, 0
+0xe3 0x00 0x00 0x00 0x00 0xcb
+
+# CHECK: stfh %r0, 1
+0xe3 0x00 0x00 0x01 0x00 0xcb
+
+# CHECK: stfh %r0, 524287
+0xe3 0x00 0x0f 0xff 0x7f 0xcb
+
+# CHECK: stfh %r0, 0(%r1)
+0xe3 0x00 0x10 0x00 0x00 0xcb
+
+# CHECK: stfh %r0, 0(%r15)
+0xe3 0x00 0xf0 0x00 0x00 0xcb
+
+# CHECK: stfh %r0, 524287(%r1,%r15)
+0xe3 0x01 0xff 0xff 0x7f 0xcb
+
+# CHECK: stfh %r0, 524287(%r15,%r1)
+0xe3 0x0f 0x1f 0xff 0x7f 0xcb
+
+# CHECK: stfh %r15, 0
+0xe3 0xf0 0x00 0x00 0x00 0xcb
+
 # CHECK: sthy %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x70
 
@@ -6691,6 +7429,141 @@
 # CHECK: sy %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x5b
 
+# CHECK: tm 0, 0
+0x91 0x00 0x00 0x00
+
+# CHECK: tm 4095, 0
+0x91 0x00 0x0f 0xff
+
+# CHECK: tm 0, 255
+0x91 0xff 0x00 0x00
+
+# CHECK: tm 0(%r1), 42
+0x91 0x2a 0x10 0x00
+
+# CHECK: tm 0(%r15), 42
+0x91 0x2a 0xf0 0x00
+
+# CHECK: tm 4095(%r1), 42
+0x91 0x2a 0x1f 0xff
+
+# CHECK: tm 4095(%r15), 42
+0x91 0x2a 0xff 0xff
+
+# CHECK: tmhh %r0, 0
+0xa7 0x02 0x00 0x00
+
+# CHECK: tmhh %r0, 32768
+0xa7 0x02 0x80 0x00
+
+# CHECK: tmhh %r0, 65535
+0xa7 0x02 0xff 0xff
+
+# CHECK: tmhh %r15, 0
+0xa7 0xf2 0x00 0x00
+
+# CHECK: tmhl %r0, 0
+0xa7 0x03 0x00 0x00
+
+# CHECK: tmhl %r0, 32768
+0xa7 0x03 0x80 0x00
+
+# CHECK: tmhl %r0, 65535
+0xa7 0x03 0xff 0xff
+
+# CHECK: tmhl %r15, 0
+0xa7 0xf3 0x00 0x00
+
+# CHECK: tmlh %r0, 0
+0xa7 0x00 0x00 0x00
+
+# CHECK: tmlh %r0, 32768
+0xa7 0x00 0x80 0x00
+
+# CHECK: tmlh %r0, 65535
+0xa7 0x00 0xff 0xff
+
+# CHECK: tmlh %r15, 0
+0xa7 0xf0 0x00 0x00
+
+# CHECK: tmll %r0, 0
+0xa7 0x01 0x00 0x00
+
+# CHECK: tmll %r0, 32768
+0xa7 0x01 0x80 0x00
+
+# CHECK: tmll %r0, 65535
+0xa7 0x01 0xff 0xff
+
+# CHECK: tmll %r15, 0
+0xa7 0xf1 0x00 0x00
+
+# CHECK: tmy -524288, 0
+0xeb 0x00 0x00 0x00 0x80 0x51
+
+# CHECK: tmy -1, 0
+0xeb 0x00 0x0f 0xff 0xff 0x51
+
+# CHECK: tmy 0, 0
+0xeb 0x00 0x00 0x00 0x00 0x51
+
+# CHECK: tmy 1, 0
+0xeb 0x00 0x00 0x01 0x00 0x51
+
+# CHECK: tmy 524287, 0
+0xeb 0x00 0x0f 0xff 0x7f 0x51
+
+# CHECK: tmy 0, 255
+0xeb 0xff 0x00 0x00 0x00 0x51
+
+# CHECK: tmy 0(%r1), 42
+0xeb 0x2a 0x10 0x00 0x00 0x51
+
+# CHECK: tmy 0(%r15), 42
+0xeb 0x2a 0xf0 0x00 0x00 0x51
+
+# CHECK: tmy 524287(%r1), 42
+0xeb 0x2a 0x1f 0xff 0x7f 0x51
+
+# CHECK: tmy 524287(%r15), 42
+0xeb 0x2a 0xff 0xff 0x7f 0x51
+
+# CHECK: xc 0(1), 0
+0xd7 0x00 0x00 0x00 0x00 0x00
+
+# CHECK: xc 0(1), 0(%r1)
+0xd7 0x00 0x00 0x00 0x10 0x00
+
+# CHECK: xc 0(1), 0(%r15)
+0xd7 0x00 0x00 0x00 0xf0 0x00
+
+# CHECK: xc 0(1), 4095
+0xd7 0x00 0x00 0x00 0x0f 0xff
+
+# CHECK: xc 0(1), 4095(%r1)
+0xd7 0x00 0x00 0x00 0x1f 0xff
+
+# CHECK: xc 0(1), 4095(%r15)
+0xd7 0x00 0x00 0x00 0xff 0xff
+
+# CHECK: xc 0(1,%r1), 0
+0xd7 0x00 0x10 0x00 0x00 0x00
+
+# CHECK: xc 0(1,%r15), 0
+0xd7 0x00 0xf0 0x00 0x00 0x00
+
+# CHECK: xc 4095(1,%r1), 0
+0xd7 0x00 0x1f 0xff 0x00 0x00
+
+# CHECK: xc 4095(1,%r15), 0
+0xd7 0x00 0xff 0xff 0x00 0x00
+
+# CHECK: xc 0(256,%r1), 0
+0xd7 0xff 0x10 0x00 0x00 0x00
+
+# CHECK: xc 0(256,%r15), 0
+0xd7 0xff 0xf0 0x00 0x00 0x00
+
 # CHECK: xgr %r0, %r0
 0xb9 0x82 0x00 0x00
 
diff --git a/test/MC/Disassembler/SystemZ/lit.local.cfg b/test/MC/Disassembler/SystemZ/lit.local.cfg
index 1da00ea..b12af09 100644
--- a/test/MC/Disassembler/SystemZ/lit.local.cfg
+++ b/test/MC/Disassembler/SystemZ/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.txt']
-
 targets = set(config.root.targets_to_build.split())
 if not 'SystemZ' in targets:
     config.unsupported = True
diff --git a/test/MC/Disassembler/X86/intel-syntax-32.txt b/test/MC/Disassembler/X86/intel-syntax-32.txt
index 08bae6e..2298823 100644
--- a/test/MC/Disassembler/X86/intel-syntax-32.txt
+++ b/test/MC/Disassembler/X86/intel-syntax-32.txt
@@ -1,13 +1,31 @@
 # RUN: llvm-mc --disassemble %s -triple=i386 --output-asm-variant=1 | FileCheck %s
 
-# CHECK: sgdt
+# CHECK: sgdt opaque ptr [eax]
 0x0f 0x01 0x00
 
-# CHECK: sidt
+# CHECK: sidt opaque ptr [eax]
 0x0f 0x01 0x08
 
-# CHECK: lgdt
+# CHECK: lgdt opaque ptr [eax]
 0x0f 0x01 0x10
 
-# CHECK: lidt
+# CHECK: lidt opaque ptr [eax]
 0x0f 0x01 0x18
+
+# CHECK: mov al, byte ptr [878082192]
+0xa0 0x90 0x78 0x56 0x34
+
+# CHECK: mov ax, word ptr [878082192]
+0x66 0xa1 0x90 0x78 0x56 0x34
+
+# CHECK: mov eax, dword ptr [878082192]
+0xa1 0x90 0x78 0x56 0x34
+
+# CHECK: mov byte ptr [878082192], al
+0xa2 0x90 0x78 0x56 0x34
+
+# CHECK: mov word ptr [878082192], ax
+0x66 0xa3 0x90 0x78 0x56 0x34
+
+# CHECK: mov dword ptr [878082192], eax
+0xa3 0x90 0x78 0x56 0x34
diff --git a/test/MC/Disassembler/X86/intel-syntax.txt b/test/MC/Disassembler/X86/intel-syntax.txt
index 6c0c239..3689525 100644
--- a/test/MC/Disassembler/X86/intel-syntax.txt
+++ b/test/MC/Disassembler/X86/intel-syntax.txt
@@ -105,6 +105,9 @@
 # CHECK: retf
 0x66 0xcb
 
+# CHECK: vshufpd xmm0, xmm1, xmm2, 1
+0xc5 0xf1 0xc6 0xc2 0x01
+
 # CHECK: vpgatherqq ymm2, qword ptr [rdi + 2*ymm1], ymm0
 0xc4 0xe2 0xfd 0x91 0x14 0x4f
 
@@ -119,3 +122,33 @@
 
 # CHECK: xsaveopt64 opaque ptr [rax]
 0x48 0x0f 0xae 0x30
+
+# CHECK: movabs al, byte ptr [-6066930261531658096]
+0xa0 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: movabs al, byte ptr [-6066930261531658096]
+0x48 0xa0 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: movabs ax, word ptr [-6066930261531658096]
+0x66 0xa1 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: movabs eax, dword ptr [-6066930261531658096]
+0xa1 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: movabs rax, qword ptr [-6066930261531658096]
+0x48 0xa1 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: movabs byte ptr [-6066930261531658096], al
+0xa2 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: movabs byte ptr [-6066930261531658096], al
+0x48 0xa2 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: movabs word ptr [-6066930261531658096], ax
+0x66 0xa3 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: movabs dword ptr [-6066930261531658096], eax
+0xa3 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: movabs qword ptr [-6066930261531658096], rax
+0x48 0xa3 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
diff --git a/test/MC/Disassembler/X86/lit.local.cfg b/test/MC/Disassembler/X86/lit.local.cfg
index 6211b3e..ba763cf 100644
--- a/test/MC/Disassembler/X86/lit.local.cfg
+++ b/test/MC/Disassembler/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.txt']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/MC/Disassembler/X86/prefixes.txt b/test/MC/Disassembler/X86/prefixes.txt
new file mode 100644
index 0000000..56596e3
--- /dev/null
+++ b/test/MC/Disassembler/X86/prefixes.txt
@@ -0,0 +1,59 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s
+
+# CHECK: lock
+# CHECK-NEXT:	orl	$16, %fs:776
+0xf0 0x64 0x83 0x0c 0x25 0x08 0x03 0x00 0x00 0x10
+
+# CHECK: movq	%fs:768, %rdi
+0x64 0x48 0x8b 0x3c 0x25 0x00 0x03 0x00 0x00
+
+# CHECK: rep
+# CHECK-NEXT:		stosq
+0xf3 0x48 0xab
+
+# CHECK: rep
+# CHECK-NEXT:		stosl
+0xf3 0x67 0x48 0xab
+
+# CHECK: movl 32(%rbp), %eax
+0x8b 0x45 0x20
+
+# CHECK: movl %es:32(%rbp), %eax
+0x26 0x8b 0x45 0x20
+
+# CHECK: movl %es:32(%rbp), %eax
+0x2e 0x26 0x8b 0x45 0x20
+
+# Test that multiple prefixes stack.
+#    (todo- the correct disassembly is actually more like "es movl %cs:32(%rbp), %eax"
+#    but we don't support that)
+# CHECK: movl %cs:32(%rbp), %eax
+0x26 0x2e 0x8b 0x45 0x20
+
+# Test that 0xf3 as part of the opcode works.
+# CHECK: cvtdq2pd	(%rax), %xmm0
+0xf3 0x0f 0xe6 0x00
+
+# CHECK: pause
+0xf3 0x90
+
+# CHECK: nop
+0x90
+
+# CHECK: 		lock
+# CHECK-NEXT:	nop
+0xf0 0x90
+
+# Test that multiple redundant prefixes work (redundant, but valid x86).
+# CHECK: rep
+# CHECK-NEXT: rep
+# CHECK-NEXT: stosq
+0xf3 0xf3 0x48 0xab
+
+# Test that a prefix on it's own works. It's debatable as to if this is 
+# something that is considered valid, but however as LLVM's own disassembler
+# has decided to disassemble prefixes as being separate opcodes, it therefore 
+# should be capable of re-consuming it's own output.
+# CHECK: rep
+0xf3
+# ***IMPORTANT ^-- this must be at the end of the file to be a valid test *** 
diff --git a/test/MC/Disassembler/X86/simple-tests.txt b/test/MC/Disassembler/X86/simple-tests.txt
index 940b1f7..7ca0874 100644
--- a/test/MC/Disassembler/X86/simple-tests.txt
+++ b/test/MC/Disassembler/X86/simple-tests.txt
@@ -129,6 +129,9 @@
 # CHECK: vcvtsd2si %xmm0, %rax
 0xc4 0xe1 0xfb 0x2d 0xc0
 
+# CHECK: vcvtsd2si %xmm0, %rax
+0xc4 0xe1 0xff 0x2d 0xc0
+
 # CHECK: vmaskmovpd %xmm0, %xmm1, (%rax)
 0xc4 0xe2 0x71 0x2f 0x00
 
@@ -260,6 +263,9 @@
 # CHECK: vmovups %ymm0, %ymm1
 0xc5 0xfc 0x11 0xc1
 
+# CHECK: vmovups %ymm0, %ymm1
+0xc4 0xe1 0xfc 0x11 0xc1
+
 # CHECK: vmovaps %ymm1, %ymm0
 0xc5 0xfc 0x28 0xc1
 
@@ -722,9 +728,66 @@
 # CHECK: vfmaddss %xmm1, (%rcx), %xmm0, %xmm0
 0xc4 0xe3 0x79 0x6a 0x01 0x10
 
+# CHECK: vfmaddss (%rcx), %xmm1, %xmm0, %xmm0
+0xc4 0xe3 0xfd 0x6a 0x01 0x10
+
+# CHECK: vfmaddss %xmm1, (%rcx), %xmm0, %xmm0
+0xc4 0xe3 0x7d 0x6a 0x01 0x10
+
+# CHECK: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+0xc4 0xe3 0xf9 0x6a 0xc2 0x10
+
+# CHECK: vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
+0xc4 0xe3 0x79 0x6a 0xc2 0x10
+
+# CHECK: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+0xc4 0xe3 0xfd 0x6a 0xc2 0x10
+
+# CHECK: vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
+0xc4 0xe3 0x7d 0x6a 0xc2 0x10
+
+# CHECK: vfmaddps  (%rcx), %xmm1, %xmm0, %xmm0
+0xc4 0xe3 0xf9 0x68 0x01 0x10
+
+# CHECK: vfmaddps   %xmm1, (%rcx), %xmm0, %xmm0
+0xc4 0xe3 0x79 0x68 0x01 0x10
+
+# CHECK: vfmaddps   %xmm1, %xmm2, %xmm0, %xmm0
+0xc4 0xe3 0x79 0x68 0xc2 0x10
+
+# CHECK: vfmaddps   %xmm2, %xmm1, %xmm0, %xmm0
+0xc4 0xe3 0xf9 0x68 0xc2 0x10
+
+# CHECK: vfmaddps  (%rcx), %ymm1, %ymm0, %ymm0
+0xc4 0xe3 0xfd 0x68 0x01 0x10
+
+# CHECK: vfmaddps   %ymm1, (%rcx), %ymm0, %ymm0
+0xc4 0xe3 0x7d 0x68 0x01 0x10
+
+# CHECK: vfmaddps   %ymm1, %ymm2, %ymm0, %ymm0
+0xc4 0xe3 0x7d 0x68 0xc2 0x10
+
+# CHECK: vfmaddps   %ymm2, %ymm1, %ymm0, %ymm0
+0xc4 0xe3 0xfd 0x68 0xc2 0x10
+
+# CHECK: vpermil2ps $0, %xmm4, %xmm3, %xmm2, %xmm1
+0xc4 0xe3 0x69 0x48 0xcb 0x40
+
 # CHECK: vpermil2ps $1, 4(%rax), %xmm2, %xmm3, %xmm0
 0xc4 0xe3 0xe1 0x48 0x40 0x04 0x21
 
+# CHECK: vpermil2ps $2, (%rax), %ymm1, %ymm5, %ymm6
+0xc4 0xe3 0xd5 0x48 0x30 0x12
+
+# CHECK: vpermil2ps $3, %xmm1, (%rax), %xmm3, %xmm4
+0xc4 0xe3 0x61 0x48 0x20 0x13
+
+# CHECK: vpermil2ps $0, %ymm4, %ymm4, %ymm2, %ymm2
+0xc4 0xe3 0x6d 0x48 0xd4 0x40
+
+# CHECK: vpermil2pd $1, %ymm1, 4(%rax), %ymm1, %ymm0
+0xc4 0xe3 0x75 0x49 0x40 0x04 0x11
+
 # CHECK: vgatherdpd %xmm0, (%rdi,%xmm1,2), %xmm2
 0xc4 0xe2 0xf9 0x92 0x14 0x4f
 
@@ -796,3 +859,60 @@
 # CHECK: xacquire
 # CHECK-NEXT: xchgl %ebx, (%rax)
 0xf2 0x87 0x18
+
+# CHECK: bextr $2814, %edi, %eax
+0x8f 0xea 0x78 0x10 0xc7 0xfe 0x0a 0x00 0x00
+
+# CHECK: blci %rdi, %rax
+0x8f 0xe9 0xf8 0x02 0xf7
+
+# CHECK: vpcmov %xmm1, %xmm2, %xmm3, %xmm4
+0x8f 0xe8 0x60 0xa2 0xe2 0x10
+
+# CHECK: vpcmov (%rax), %xmm2, %xmm3, %xmm4
+0x8f 0xe8 0xe0 0xa2 0x20 0x20
+
+# CHECK: vpcmov %xmm1, (%rax), %xmm3, %xmm4
+0x8f 0xe8 0x60 0xa2 0x20 0x10
+
+# CHECK: vpcmov %ymm1, %ymm2, %ymm3, %ymm4
+0x8f 0xe8 0x64 0xa2 0xe2 0x10
+
+# CHECK: vpcmov (%rax), %ymm2, %ymm3, %ymm4
+0x8f 0xe8 0xe4 0xa2 0x20 0x20
+
+# CHECK: vpcmov %ymm1, (%rax), %ymm3, %ymm4
+0x8f 0xe8 0x64 0xa2 0x20 0x10
+
+# CHECK: vpcomb $55, %xmm6, %xmm4, %xmm2
+0x8f 0xe8 0x58 0xcc 0xd6 0x37
+
+# CHECK: vpcomb $56, 8(%rax), %xmm3, %xmm2
+0x8f 0xe8 0x60 0xcc 0x50 0x08 0x38
+
+# CHECK: vpmacsdd %xmm4, %xmm6, %xmm4, %xmm2
+0x8f 0xe8 0x58 0x9e 0xd6 0x40
+# CHECK: vpmacsdd %xmm4, (%rax,%rcx), %xmm4, %xmm3
+0x8f 0xe8 0x58 0x9e 0x1c 0x08 0x40
+
+# CHECK: vprotd (%rax), %xmm0, %xmm3
+0x8f 0xe9 0xf8 0x92 0x18
+# CHECK: vprotd %xmm2, (%rax,%rcx), %xmm4
+0x8f 0xe9 0x68 0x92 0x24 0x08
+# CHECK: vprotd %xmm5, %xmm3, %xmm2
+0x8f 0xe9 0x50 0x92 0xd3
+# CHECK: vprotd $43, (%rcx), %xmm6
+0x8f 0xe8 0x78 0xc2 0x31 0x2b
+# CHECK: vprotd $44, (%rax,%rcx), %xmm7
+0x8f 0xe8 0x78 0xc2 0x3c 0x08 0x2c
+# CHECK: vprotd $45, %xmm4, %xmm4
+0x8f 0xe8 0x78 0xc2 0xe4 0x2d
+
+# CHECK: vfrczps 4(%rax), %xmm3
+0x8f 0xe9 0x78 0x80 0x58 0x04
+# CHECK: vfrczps %xmm6, %xmm5
+0x8f 0xe9 0x78 0x80 0xee
+# CHECK: vfrczps (%rcx), %xmm1
+0x8f 0xe9 0x78 0x80 0x09
+# CHECK: vfrczps %ymm2, %ymm4
+0x8f 0xe9 0x7c 0x80 0xe2
diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt
index 76d67d3..b6a62c4 100644
--- a/test/MC/Disassembler/X86/x86-32.txt
+++ b/test/MC/Disassembler/X86/x86-32.txt
@@ -648,3 +648,51 @@
 
 # CHECK: adoxl (%eax), %eax
 0xf3 0x0f 0x38 0xf6 0x00
+
+# CHECK: movb 878082192, %al
+0xa0 0x90 0x78 0x56 0x34
+
+# CHECK: movw 878082192, %ax
+0x66 0xa1 0x90 0x78 0x56 0x34
+
+# CHECK: movl 878082192, %eax
+0xa1 0x90 0x78 0x56 0x34
+
+# CHECK: movb %al, 878082192
+0xa2 0x90 0x78 0x56 0x34
+
+# CHECK: movw %ax, 878082192
+0x66 0xa3 0x90 0x78 0x56 0x34
+
+# CHECK: movl %eax, 878082192
+0xa3 0x90 0x78 0x56 0x34
+
+# CHECK: incl %ecx
+0xff 0xc1
+
+# CHECK: decl %ecx
+0xff 0xc9
+
+# CHECK: incw %cx
+0x66 0xff 0xc1
+
+# CHECK: decw %cx
+0x66 0xff 0xc9
+
+# CHECK: incb %cl
+0xfe 0xc1
+
+# CHECK: decb %cl
+0xfe 0xc9
+
+# CHECK: incl %ecx
+0x41
+
+# CHECK: decl %ecx
+0x49
+
+# CHECK: movq %xmm0, %xmm0
+0xf3 0x0f 0x7e 0xc0
+
+# CHECK: vmovq %xmm0, %xmm0
+0xc5 0xfa 0x7e 0xc0
diff --git a/test/MC/Disassembler/X86/x86-64.txt b/test/MC/Disassembler/X86/x86-64.txt
index bf1fa21..8c6bc0e 100644
--- a/test/MC/Disassembler/X86/x86-64.txt
+++ b/test/MC/Disassembler/X86/x86-64.txt
@@ -157,3 +157,87 @@
 
 # CHECK: movabsq %rax, -6066930261531658096
 0x48 0xa3 0x90 0x78 0x56 0x34 0x12 0xef 0xcd 0xab
+
+# CHECK: sha1rnds4 $1, %xmm1, %xmm2
+0x0f 0x3a 0xcc 0xd1 0x01
+
+# CHECK: sha1rnds4 $1, (%rax), %xmm2
+0x0f 0x3a 0xcc 0x10 0x01
+
+# CHECK: sha1nexte %xmm1, %xmm2
+0x0f 0x38 0xc8 0xd1
+
+# CHECK: sha1nexte (%rax), %xmm2
+0x0f 0x38 0xc8 0x10
+
+# CHECK: sha1msg1 %xmm1, %xmm2
+0x0f 0x38 0xc9 0xd1
+
+# CHECK: sha1msg1 (%rax), %xmm2
+0x0f 0x38 0xc9 0x10
+
+# CHECK: sha1msg2 %xmm1, %xmm2
+0x0f 0x38 0xca 0xd1
+
+# CHECK: sha1msg2 (%rax), %xmm2
+0x0f 0x38 0xca 0x10
+
+# CHECK: sha256rnds2 (%rax), %xmm2
+0x0f 0x38 0xcb 0x10
+
+# CHECK: sha256rnds2 %xmm1, %xmm2
+0x0f 0x38 0xcb 0xd1
+
+# CHECK: sha256msg1 %xmm1, %xmm2
+0x0f 0x38 0xcc 0xd1
+
+# CHECK: sha256msg1 (%rax), %xmm2
+0x0f 0x38 0xcc 0x10
+
+# CHECK: sha256msg2 %xmm1, %xmm2
+0x0f 0x38 0xcd 0xd1
+
+# CHECK: sha256msg2 (%rax), %xmm2
+0x0f 0x38 0xcd 0x10
+
+# CHECK: incl %ecx
+0xff 0xc1
+
+# CHECK: decl %ecx
+0xff 0xc9
+
+# CHECK: incw %cx
+0x66 0xff 0xc1
+
+# CHECK: decw %cx
+0x66 0xff 0xc9
+
+# CHECK: incb %cl
+0xfe 0xc1
+
+# CHECK: decb %cl
+0xfe 0xc9
+
+# CHECK: incq %rcx
+0x48 0xff 0xc1
+
+# CHECK: decq %rcx
+0x48 0xff 0xc9
+
+# CHECK: movq %xmm0, %xmm0
+0xf3 0x0f 0x7e 0xc0
+
+# CHECK: vmovq %xmm0, %xmm0
+0xc5 0xfa 0x7e 0xc0
+
+# CHECK: vmovq %xmm0, %rax
+0xc4 0xe1 0xf9 0x7e 0xc0
+
+# CHECK: movd %xmm0, %rax
+0x66 0x48 0x0f 0x7e 0xc0
+
+# CHECK: pextrw $3, %xmm3, %ecx
+0x66 0x0f 0x3a 0x15 0xd9 0x03
+
+# CHECK: pextrw $3, %xmm3, (%rax)
+0x66 0x0f 0x3a 0x15 0x18 0x03
diff --git a/test/MC/Disassembler/XCore/lit.local.cfg b/test/MC/Disassembler/XCore/lit.local.cfg
index 15b6583..4d17d46 100644
--- a/test/MC/Disassembler/XCore/lit.local.cfg
+++ b/test/MC/Disassembler/XCore/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.txt']
-
 targets = set(config.root.targets_to_build.split())
 if not 'XCore' in targets:
     config.unsupported = True
diff --git a/test/MC/ELF/bad-relocation.s b/test/MC/ELF/bad-relocation.s
new file mode 100644
index 0000000..1a66744
--- /dev/null
+++ b/test/MC/ELF/bad-relocation.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc -filetype=obj -triple i386-pc-linux-gnu %s -o /dev/null 2>&1 | FileCheck  %s
+
+// CHECK: error: invalid variant 'BADRELOC'
+
+        .text
+foo:
+	leal	.Lfoo@BADRELOC(%ebx), %eax
diff --git a/test/MC/ELF/cfi-window-save.s b/test/MC/ELF/cfi-window-save.s
new file mode 100644
index 0000000..c7d438a
--- /dev/null
+++ b/test/MC/ELF/cfi-window-save.s
@@ -0,0 +1,51 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -s -sr -sd | FileCheck %s
+
+# Should use SPARC as the target to test this. However, SPARC does not
+# use MC yet.
+
+f:
+        .cfi_startproc
+        nop
+        .cfi_window_save
+        nop
+        .cfi_endproc
+
+// CHECK:        Section {
+// CHECK:          Name: .eh_frame
+// CHECK-NEXT:     Type: SHT_PROGBITS
+// CHECK-NEXT:     Flags [
+// CHECK-NEXT:       SHF_ALLOC
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Address: 0x0
+// CHECK-NEXT:     Offset: 0x48
+// CHECK-NEXT:     Size: 48
+// CHECK-NEXT:     Link: 0
+// CHECK-NEXT:     Info: 0
+// CHECK-NEXT:     AddressAlignment: 8
+// CHECK-NEXT:     EntrySize: 0
+// CHECK-NEXT:     Relocations [
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     SectionData (
+// CHECK-NEXT:       0000: 14000000 00000000 017A5200 01781001
+// CHECK-NEXT:       0010: 1B0C0708 90010000 14000000 1C000000
+// CHECK-NEXT:       0020: 00000000 02000000 00412D00 00000000
+// CHECK-NEXT:     )
+// CHECK-NEXT:   }
+
+// CHECK:        Section {
+// CHECK:          Name: .rela.eh_frame
+// CHECK-NEXT:     Type: SHT_RELA
+// CHECK-NEXT:     Flags [
+// CHECK-NEXT:     ]
+// CHECK-NEXT:     Address: 0x0
+// CHECK-NEXT:     Offset: 0x390
+// CHECK-NEXT:     Size: 24
+// CHECK-NEXT:     Link: 7
+// CHECK-NEXT:     Info: 4
+// CHECK-NEXT:     AddressAlignment: 8
+// CHECK-NEXT:     EntrySize: 24
+// CHECK-NEXT:     Relocations [
+// CHECK-NEXT:       0x20 R_X86_64_PC32 .text 0x0
+// CHECK-NEXT:     ]
+// CHECK:        }
+
diff --git a/test/MC/ELF/comdat-dup-group-name.s b/test/MC/ELF/comdat-dup-group-name.s
new file mode 100644
index 0000000..1181e2e
--- /dev/null
+++ b/test/MC/ELF/comdat-dup-group-name.s
@@ -0,0 +1,41 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -s -t | FileCheck %s
+
+// Test that we produce two foo sections, each in separate groups
+
+// CHECK: Index: 1
+// CHECK-NEXT: Name: .group
+
+// CHECK: Index: 2
+// CHECK-NEXT: Name: .group
+
+// CHECK: Index: 6
+// CHECK-NEXT: Name: .foo
+
+// CHECK: Index: 7
+// CHECK-NEXT: Name: .foo
+
+// CHECK: Symbols [
+
+// CHECK: Name: f1
+// CHECK-NOT: }
+// CHECK: Section: .group (0x1)
+
+// CHECK: Name: f2
+// CHECK-NOT: }
+// CHECK: Section: .group (0x2)
+
+// CHECK: Name: .foo
+// CHECK-NOT: }
+// CHECK: Section: .foo (0x6)
+
+// CHECK: Name: .foo
+// CHECK-NOT: }
+// CHECK: Section: .foo (0x7)
+
+
+	.section	.foo,"axG",@progbits,f1,comdat
+        nop
+
+	.section	.foo,"axG",@progbits,f2,comdat
+        nop
+
diff --git a/test/MC/ELF/comdat.s b/test/MC/ELF/comdat.s
index 6dbe583..3e4a001 100644
--- a/test/MC/ELF/comdat.s
+++ b/test/MC/ELF/comdat.s
@@ -82,7 +82,7 @@
 g1:
         nop
 
-        .section	.bar,"axG",@progbits,g1,comdat
+        .section	.bar,"ax?",@progbits
         nop
 
         .section	.zed,"axG",@progbits,g2,comdat
diff --git a/test/MC/ELF/comp-dir.s b/test/MC/ELF/comp-dir.s
index 59e3d7d..1b91f64 100644
--- a/test/MC/ELF/comp-dir.s
+++ b/test/MC/ELF/comp-dir.s
@@ -1,7 +1,17 @@
+// REQUIRES: shell
+// XFAIL: mingw
 // RUN: llvm-mc -triple=x86_64-linux-unknown -g -fdebug-compilation-dir=/test/comp/dir %s -filetype=obj -o %t.o
 // RUN: llvm-dwarfdump -debug-dump=info %t.o | FileCheck %s
 
 // CHECK: DW_AT_comp_dir [DW_FORM_string] ("{{([A-Za-z]:.*)?}}/test/comp/dir")
 
+// RUN: mkdir -p %t.foo
+// RUN: ln -sf %t.foo %t.bar
+// RUN: cd %t.foo
+// RUN: env PWD=%t.bar llvm-mc -triple=x86_64-linux-unknown -g %s -filetype=obj -o %t.o
+// RUN: llvm-dwarfdump -debug-dump=info %t.o | FileCheck --check-prefix=PWD %s
+// PWD: DW_AT_comp_dir [DW_FORM_string] ("{{.*}}.bar")
+
+
 f:
   nop
diff --git a/test/MC/ELF/debug-line.s b/test/MC/ELF/debug-line.s
index 6766f10..38ef828 100644
--- a/test/MC/ELF/debug-line.s
+++ b/test/MC/ELF/debug-line.s
@@ -1,6 +1,15 @@
 // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -s -sd | FileCheck %s
 
-// Test that .debug_line is populated.
+// Test that .debug_line is populated.  TODO: This test should really be using
+// llvm-dwarfdump, but it cannot parse this particular object file.  The content
+// of .debug_line was checked using GNU binutils:
+
+// $ objdump --dwarf=decodedline debug-line.o
+// [...]
+// File name                            Line number    Starting address
+// foo.c                                          4                   0
+// foo.c                                          5                 0x4
+// foo.c                                          6                 0x5
 
 // CHECK:        Section {
 // CHECK:          Name: .debug_line
@@ -8,18 +17,18 @@
 // CHECK-NEXT:     Flags [
 // CHECK-NEXT:     ]
 // CHECK-NEXT:     Address: 0x0
-// CHECK-NEXT:     Offset: 0x48
-// CHECK-NEXT:     Size: 72
+// CHECK-NEXT:     Offset: 0x50
+// CHECK-NEXT:     Size: 57
 // CHECK-NEXT:     Link: 0
 // CHECK-NEXT:     Info: 0
 // CHECK-NEXT:     AddressAlignment: 1
 // CHECK-NEXT:     EntrySize: 0
 // CHECK-NEXT:     SectionData (
-// CHECK-NEXT:       0000: 44000000 02001C00 00000101 FB0E0D00
+
+// CHECK-NEXT:       0000: 35000000 02001C00 00000101 FB0E0D00
 // CHECK-NEXT:       0010: 01010101 00000001 00000100 666F6F2E
 // CHECK-NEXT:       0020: 63000000 00000009 02000000 00000000
-// CHECK-NEXT:       0030: 00150205 00010100 09020000 00000000
-// CHECK-NEXT:       0040: 00001602 43000101
+// CHECK-NEXT:       0030: 00154B21 02080001 01
 // CHECK-NEXT:     )
 // CHECK-NEXT:   }
 
@@ -34,3 +43,7 @@
 
 	.loc 1 5 0
 	.byte 0xc3
+
+	.loc 1 6 0
+l:
+	.quad l
diff --git a/test/MC/ELF/file-double.s b/test/MC/ELF/file-double.s
new file mode 100644
index 0000000..b0731e6
--- /dev/null
+++ b/test/MC/ELF/file-double.s
@@ -0,0 +1,47 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -t | FileCheck %s
+
+// Test that a STT_FILE symbol and a symbol of the same name can coexist.
+
+.file "foo.c"
+.file "bar.c"
+	.globl foo.c
+foo.c:
+
+	.globl bar.c
+bar.c:
+
+// CHECK:        Symbol {
+// CHECK:          Name: foo.c (1)
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:     Size: 0
+// CHECK-NEXT:     Binding: Local
+// CHECK-NEXT:     Type: File
+// CHECK-NEXT:     Other: 0
+// CHECK-NEXT:     Section: (0xFFF1)
+// CHECK-NEXT:   }
+// CHECK:          Name: bar.c (7)
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:     Size: 0
+// CHECK-NEXT:     Binding: Local
+// CHECK-NEXT:     Type: File
+// CHECK-NEXT:     Other: 0
+// CHECK-NEXT:     Section: (0xFFF1)
+// CHECK-NEXT:   }
+// CHECK:        Symbol {
+// CHECK:        Name: bar.c (7)
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:     Size: 0
+// CHECK-NEXT:     Binding: Global
+// CHECK-NEXT:     Type: None
+// CHECK-NEXT:     Other: 0
+// CHECK-NEXT:     Section: .text (0x1)
+// CHECK-NEXT:   }
+// CHECK:        Symbol {
+// CHECK:        Name: foo.c (1)
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:     Size: 0
+// CHECK-NEXT:     Binding: Global
+// CHECK-NEXT:     Type: None
+// CHECK-NEXT:     Other: 0
+// CHECK-NEXT:     Section: .text (0x1)
+// CHECK-NEXT:   }
diff --git a/test/MC/ELF/lit.local.cfg b/test/MC/ELF/lit.local.cfg
index 56bf008..ba763cf 100644
--- a/test/MC/ELF/lit.local.cfg
+++ b/test/MC/ELF/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/MC/ELF/relocation.s b/test/MC/ELF/relocation.s
index d2c4f2e..6823075 100644
--- a/test/MC/ELF/relocation.s
+++ b/test/MC/ELF/relocation.s
@@ -14,11 +14,14 @@ bar:
         leaq	foo@TPOFF(%rax), %rax    # R_X86_64_TPOFF32
         leaq	foo@TLSLD(%rip), %rdi    # R_X86_64_TLSLD
         leaq	foo@dtpoff(%rax), %rcx   # R_X86_64_DTPOFF32
+        movabs  foo@GOT, %rax		 # R_X86_64_GOT64
+        movabs  foo@GOTOFF, %rax	 # R_X86_64_GOTOFF64
         pushq    $bar
         movq	foo(%rip), %rdx
         leaq    foo-bar(%r14),%r14
         addq	$bar,%rax         # R_X86_64_32S
 	.quad	foo@DTPOFF
+        movabsq	$baz@TPOFF, %rax
 
 // CHECK:        Section {
 // CHECK:          Name: .rela.text
@@ -34,11 +37,14 @@ bar:
 // CHECK-NEXT:       0x3B R_X86_64_TPOFF32  foo 0x0
 // CHECK-NEXT:       0x42 R_X86_64_TLSLD    foo 0xFFFFFFFFFFFFFFFC
 // CHECK-NEXT:       0x49 R_X86_64_DTPOFF32 foo 0x0
-// CHECK-NEXT:       0x4E R_X86_64_32S      .text 0x0
-// CHECK-NEXT:       0x55 R_X86_64_PC32     foo 0xFFFFFFFFFFFFFFFC
-// CHECK-NEXT:       0x5C R_X86_64_PC32     foo 0x5C
-// CHECK-NEXT:       0x63 R_X86_64_32S      .text 0x0
-// CHECK-NEXT:       0x67 R_X86_64_DTPOFF64 foo 0x0
+// CHECK-NEXT:       0x4F R_X86_64_GOT64 foo 0x0
+// CHECK-NEXT:       0x59 R_X86_64_GOTOFF64 foo 0x0
+// CHECK-NEXT:       0x62 R_X86_64_32S .text 0x0
+// CHECK-NEXT:       0x69 R_X86_64_PC32 foo 0xFFFFFFFFFFFFFFFC
+// CHECK-NEXT:       0x70 R_X86_64_PC32 foo 0x70
+// CHECK-NEXT:       0x77 R_X86_64_32S .text 0x0
+// CHECK-NEXT:       0x7B R_X86_64_DTPOFF64 foo 0x0
+// CHECK-NEXT:       0x85 R_X86_64_TPOFF64 baz 0x0
 // CHECK-NEXT:     ]
 // CHECK-NEXT:   }
 
diff --git a/test/MC/ELF/section.s b/test/MC/ELF/section.s
index a679403..7dc23c2 100644
--- a/test/MC/ELF/section.s
+++ b/test/MC/ELF/section.s
@@ -5,12 +5,12 @@
 .section	.note.GNU-stack,"",@progbits
 .section	.note.GNU-stack2,"",%progbits
 .section	.note.GNU-,"",@progbits
-.section	-.note.GNU,"",@progbits
+.section	-.note.GNU,"","progbits"
 
 // CHECK: Name: .note.GNU-stack (56)
-// CHECK: Name: .note.GNU-stack2 (143)
-// CHECK: Name: .note.GNU- (160)
-// CHECK: Name: -.note.GNU (132)
+// CHECK: Name: .note.GNU-stack2 (153)
+// CHECK: Name: .note.GNU- (170)
+// CHECK: Name: -.note.GNU (142)
 
 // Test that the defaults are used
 
@@ -120,11 +120,28 @@ bar:
 // CHECK-NEXT:     EntrySize: 0
 // CHECK-NEXT:   }
 
+.section .excluded,"e",@progbits
+
+// CHECK:      Section {
+// CHECK:        Name: .excluded (92)
+// CHECK-NEXT:   Type: SHT_PROGBITS (0x1)
+// CHECK-NEXT:   Flags [ (0x80000000)
+// CHECK-NEXT:     SHF_EXCLUDE (0x80000000)
+// CHECK-NEXT:   ]
+// CHECK-NEXT:   Address: 0x0
+// CHECK-NEXT:   Offset: 0x50
+// CHECK-NEXT:   Size: 0
+// CHECK-NEXT:   Link: 0
+// CHECK-NEXT:   Info: 0
+// CHECK-NEXT:   AddressAlignment: 1
+// CHECK-NEXT:   EntrySize: 0
+// CHECK-NEXT: }
+
 // Test that we handle the strings like gas
 .section bar-"foo"
 .section "foo"
 
 // CHECK:        Section {
-// CHECK:          Name: bar-"foo" (171)
+// CHECK:          Name: bar-"foo" (181)
 // CHECK:        Section {
 // CHECK:          Name: foo (52)
diff --git a/test/MC/ELF/symbol-names.s b/test/MC/ELF/symbol-names.s
new file mode 100644
index 0000000..6459ac9
--- /dev/null
+++ b/test/MC/ELF/symbol-names.s
@@ -0,0 +1,12 @@
+// RUN: llvm-mc -triple i686-pc-linux -filetype=obj %s -o - | llvm-readobj -t | FileCheck %s
+
+// MC allows ?'s in symbol names as an extension.
+
+.text
+.globl foo?bar
+.type foo?bar, @function
+foo?bar:
+ret
+
+// CHECK: Symbol
+// CHECK: Name: foo?bar
diff --git a/test/MC/ELF/type.s b/test/MC/ELF/type.s
index a5b9812..c2f3631 100644
--- a/test/MC/ELF/type.s
+++ b/test/MC/ELF/type.s
@@ -31,6 +31,16 @@ tls:
         .type tls,@tls_object
         .type tls,@gnu_indirect_function
 
+// Test that "<type>" is accepted.
+tls_quoted:
+        .global tls_quoted
+        .type tls_quoted,"tls_object"
+
+// Test that "<type>" is accepted.
+tls_upper_case:
+        .global tls_upper_case
+        .type tls_upper_case,STT_TLS
+
 // CHECK:        Symbol {
 // CHECK:          Name: bar
 // CHECK-NEXT:     Value: 0x0
@@ -85,3 +95,21 @@ tls:
 // CHECK-NEXT:     Other: 0
 // CHECK-NEXT:     Section: .text (0x1)
 // CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: tls_quoted
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:     Size: 0
+// CHECK-NEXT:     Binding: Global
+// CHECK-NEXT:     Type: TLS
+// CHECK-NEXT:     Other: 0
+// CHECK-NEXT:     Section: .text (0x1)
+// CHECK-NEXT:   }
+// CHECK-NEXT:   Symbol {
+// CHECK-NEXT:     Name: tls_upper_case
+// CHECK-NEXT:     Value: 0x0
+// CHECK-NEXT:     Size: 0
+// CHECK-NEXT:     Binding: Global
+// CHECK-NEXT:     Type: TLS
+// CHECK-NEXT:     Other: 0
+// CHECK-NEXT:     Section: .text (0x1)
+// CHECK-NEXT:   }
diff --git a/test/MC/MachO/ARM/lit.local.cfg b/test/MC/MachO/ARM/lit.local.cfg
index 9f0d39d..8a3ba96 100644
--- a/test/MC/MachO/ARM/lit.local.cfg
+++ b/test/MC/MachO/ARM/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.s', '.ll']
-
 targets = set(config.root.targets_to_build.split())
 if not 'ARM' in targets:
     config.unsupported = True
diff --git a/test/MC/MachO/bad-darwin-x86_64-32-bit-abs-addr.s b/test/MC/MachO/bad-darwin-x86_64-32-bit-abs-addr.s
new file mode 100644
index 0000000..5fcd316
--- /dev/null
+++ b/test/MC/MachO/bad-darwin-x86_64-32-bit-abs-addr.s
@@ -0,0 +1,5 @@
+// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - 2> %t.err > %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t.err %s
+
+mov $_f, %rsi
+// CHECK-ERROR: 32-bit absolute addressing is not supported in 64-bit mode
diff --git a/test/MC/MachO/bad-darwin-x86_64-diff-relocs.s b/test/MC/MachO/bad-darwin-x86_64-diff-relocs.s
new file mode 100644
index 0000000..1ccebc5
--- /dev/null
+++ b/test/MC/MachO/bad-darwin-x86_64-diff-relocs.s
@@ -0,0 +1,5 @@
+// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - 2> %t.err > %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t.err %s
+
+.quad _foo - _bar
+// CHECK-ERROR: unsupported relocation with subtraction expression
diff --git a/test/MC/MachO/bad-indirect-symbols.s b/test/MC/MachO/bad-indirect-symbols.s
new file mode 100644
index 0000000..7c16e90
--- /dev/null
+++ b/test/MC/MachO/bad-indirect-symbols.s
@@ -0,0 +1,5 @@
+// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - 2> %t.err > %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t.err %s
+
+x: .indirect_symbol _y
+// CHECK-ERROR: 4:4: error: indirect symbol not in a symbol pointer or stub section
diff --git a/test/MC/MachO/bss.s b/test/MC/MachO/bss.s
new file mode 100644
index 0000000..15d490a
--- /dev/null
+++ b/test/MC/MachO/bss.s
@@ -0,0 +1,17 @@
+// The purpose of this test is to verify that bss sections are emited correctly.
+
+// RUN: llvm-mc -filetype=obj -triple i686-apple-darwin9 %s | llvm-readobj -s | FileCheck %s
+// RUN: llvm-mc -filetype=obj -triple x86_64-apple-darwin9 %s | llvm-readobj -s | FileCheck %s
+
+    .bss
+    .globl _g0
+    .align 4
+_g0:
+    .long 0
+
+// CHECK:		Name: __bss (5F 5F 62 73 73 00 00 00 00 00 00 00 00 00 00 00)
+// CHECK-NEXT:	Segment: __DATA (5F 5F 44 41 54 41 00 00 00 00 00 00 00 00 00 00)
+// CHECK-NEXT:	Address: 0x0
+// CHECK-NEXT:	Size: 0x4
+// CHECK-NEXT:	Offset: 0
+// CHECK-NEXT:	Alignment: 4
diff --git a/test/MC/MachO/darwin-x86_64-diff-reloc-assign-2.s b/test/MC/MachO/darwin-x86_64-diff-reloc-assign-2.s
new file mode 100644
index 0000000..5d54879
--- /dev/null
+++ b/test/MC/MachO/darwin-x86_64-diff-reloc-assign-2.s
@@ -0,0 +1,38 @@
+// RUN: llvm-mc -triple x86_64-apple-darwin9 %s -filetype=obj -o - | macho-dump --dump-section-data | FileCheck %s
+
+// Test case for rdar://9356266
+
+// This tests that this expression does not cause a crash and produces these
+// four relocation entries:
+// Relocation information (__DATA,__data) 4 entries
+// address  pcrel length extern type    scattered symbolnum/value
+// 00000004 False long   False  SUB     False     2 (__DATA,__data)
+// 00000004 False long   False  UNSIGND False     2 (__DATA,__data)
+// 00000000 False long   False  SUB     False     2 (__DATA,__data)
+// 00000000 False long   False  UNSIGND False     2 (__DATA,__data)
+
+	.data
+L_var1:
+L_var2:
+// This was working fine
+	.long L_var2 - L_var1
+	
+	.set L_var3, .
+	.set L_var4, .
+// But this was causing a crash
+	.long L_var4 - L_var3
+
+// CHECK:  ('_relocations', [
+// CHECK:    # Relocation 0
+// CHECK:    (('word-0', 0x4),
+// CHECK:     ('word-1', 0x54000002)),
+// CHECK:    # Relocation 1
+// CHECK:    (('word-0', 0x4),
+// CHECK:     ('word-1', 0x4000002)),
+// CHECK:    # Relocation 2
+// CHECK:    (('word-0', 0x0),
+// CHECK:     ('word-1', 0x54000002)),
+// CHECK:    # Relocation 3
+// CHECK:    (('word-0', 0x0),
+// CHECK:     ('word-1', 0x4000002)),
+// CHECK:  ])
diff --git a/test/MC/MachO/lit.local.cfg b/test/MC/MachO/lit.local.cfg
index 41a8434..ba763cf 100644
--- a/test/MC/MachO/lit.local.cfg
+++ b/test/MC/MachO/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.s', '.ll']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/MC/Mips/elf_eflags.ll b/test/MC/Mips/elf_eflags.ll
index 91217bc..9432dcf 100644
--- a/test/MC/Mips/elf_eflags.ll
+++ b/test/MC/Mips/elf_eflags.ll
@@ -16,53 +16,53 @@
 ; Note that EF_MIPS_CPIC is set by -mabicalls which is the default on Linux
 ; TODO need to support -mno-abicalls
 
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-BE32 %s
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-BE32_PIC %s
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 -relocation-model=static %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-BE32R2 %s
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-BE32R2_PIC %s
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips -relocation-model=static %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-BE32R2-MICROMIPS %s
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-BE32R2-MICROMIPS_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32 -relocation-model=static %s -print-hack-directives -o - | FileCheck -check-prefix=CHECK-BE32 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32 -print-hack-directives %s -o - | FileCheck -check-prefix=CHECK-BE32_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -relocation-model=static %s -print-hack-directives -o - | FileCheck -check-prefix=CHECK-BE32R2 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -print-hack-directives %s -o - | FileCheck -check-prefix=CHECK-BE32R2_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips -relocation-model=static -print-hack-directives %s -o - | FileCheck -check-prefix=CHECK-BE32R2-MICROMIPS %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips -print-hack-directives %s -o - | FileCheck -check-prefix=CHECK-BE32R2-MICROMIPS_PIC %s
 
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips64 -relocation-model=static %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-BE64 %s
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips64 %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-BE64_PIC %s
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips64r2 -relocation-model=static %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-BE64R2 %s
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips64r2 %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-BE64R2_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64 -relocation-model=static %s -print-hack-directives -o - | FileCheck -check-prefix=CHECK-BE64 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64 %s -print-hack-directives -o - | FileCheck -check-prefix=CHECK-BE64_PIC %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64r2 -relocation-model=static -print-hack-directives %s -o - | FileCheck -check-prefix=CHECK-BE64R2 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips64r2 -print-hack-directives %s -o - | FileCheck -check-prefix=CHECK-BE64R2_PIC %s
 
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+mips16 -relocation-model=pic %s -o - | llvm-readobj -h | FileCheck -check-prefix=CHECK-LE32R2-MIPS16 %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+mips16 -relocation-model=pic -print-hack-directives %s -o - | FileCheck -check-prefix=CHECK-LE32R2-MIPS16 %s
  
 ; 32(R1) bit with NO_REORDER and static
-; CHECK-BE32: Flags [ (0x50001005)
+; CHECK-BE32: .mips_hack_elf_flags 0x50001005
 ;
 ; 32(R1) bit with NO_REORDER and PIC
-; CHECK-BE32_PIC: Flags [ (0x50001007)
+; CHECK-BE32_PIC: .mips_hack_elf_flags 0x50001007
 ;
 ; 32R2 bit with NO_REORDER and static
-; CHECK-BE32R2: Flags [ (0x70001005)
+; CHECK-BE32R2: .mips_hack_elf_flags 0x70001005
 ;
 ; 32R2 bit with NO_REORDER and PIC
-; CHECK-BE32R2_PIC: Flags [ (0x70001007)
+; CHECK-BE32R2_PIC: .mips_hack_elf_flags 0x70001007
 ;
 ; 32R2 bit MICROMIPS with NO_REORDER and static
-; CHECK-BE32R2-MICROMIPS: Flags [ (0x72001005)
+; CHECK-BE32R2-MICROMIPS: .mips_hack_elf_flags 0x72001005
 ;
 ; 32R2 bit MICROMIPS with NO_REORDER and PIC
-;CHECK-BE32R2-MICROMIPS_PIC: Flags [ (0x72001007)
+; CHECK-BE32R2-MICROMIPS_PIC: .mips_hack_elf_flags 0x72001007
 ;
 ; 64(R1) bit with NO_REORDER and static
-; CHECK-BE64: Flags [ (0x60000005)
+; CHECK-BE64: .mips_hack_elf_flags 0x60000005
 ;
 ; 64(R1) bit with NO_REORDER and PIC
-; CHECK-BE64_PIC: Flags [ (0x60000007)
+; CHECK-BE64_PIC: .mips_hack_elf_flags 0x60000007
 ;
 ; 64R2 bit with NO_REORDER and static
-; CHECK-BE64R2: Flags [ (0x80000005)
+; CHECK-BE64R2: .mips_hack_elf_flags 0x80000005
 ;
 ; 64R2 bit with NO_REORDER and PIC
-; CHECK-BE64R2_PIC: Flags [ (0x80000007)
+; CHECK-BE64R2_PIC: .mips_hack_elf_flags 0x80000007
 ;
 ; 32R2 bit MIPS16 with PIC
-; CHECK-LE32R2-MIPS16: Flags [ (0x74001006)
- 
+; CHECK-LE32R2-MIPS16: .mips_hack_elf_flags 0x74001006
+
 define i32 @main() nounwind {
 entry:
   ret i32 0
diff --git a/test/MC/Mips/elf_eflags.s b/test/MC/Mips/elf_eflags.s
new file mode 100644
index 0000000..c565964
--- /dev/null
+++ b/test/MC/Mips/elf_eflags.s
@@ -0,0 +1,5 @@
+// RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux %s -o -| llvm-readobj -h | FileCheck %s
+
+        .mips_hack_elf_flags 0x50001005
+
+// CHECK: Flags [ (0x50001005)
diff --git a/test/MC/Mips/elf_st_other.ll b/test/MC/Mips/elf_st_other.ll
index bc56c00..31294c8 100644
--- a/test/MC/Mips/elf_st_other.ll
+++ b/test/MC/Mips/elf_st_other.ll
@@ -1,12 +1,11 @@
 ; This tests value of ELF st_other field for function symbol table entries.
 ; For microMIPS value should be equal to STO_MIPS_MICROMIPS.
 
-; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips %s -o - | llvm-readobj -t | FileCheck %s
+; RUN: llc -mtriple mipsel-unknown-linux -mcpu=mips32r2 -mattr=+micromips -print-hack-directives %s -o - | FileCheck %s
 
 define i32 @main() nounwind {
 entry:
   ret i32 0
 }
 
-; CHECK:     Name: main
-; CHECK:     Other: 128
+; CHECK:     .mips_hack_stocg main, 128
diff --git a/test/MC/Mips/elf_st_other.s b/test/MC/Mips/elf_st_other.s
new file mode 100644
index 0000000..2d63288
--- /dev/null
+++ b/test/MC/Mips/elf_st_other.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -filetype=obj -triple mipsel-unknown-linux %s -o -| llvm-readobj -t | FileCheck %s
+
+        .text
+        .globl  main
+        .align  2
+        .type   main,@function
+        .set    nomips16                # @main
+        .ent    main
+        .mips_hack_stocg main, 128
+main:
+
+// CHECK:     Name: main
+// CHECK:     Other: 128
diff --git a/test/MC/Mips/lit.local.cfg b/test/MC/Mips/lit.local.cfg
index d2e3b28..1fa54b4 100644
--- a/test/MC/Mips/lit.local.cfg
+++ b/test/MC/Mips/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'Mips' in targets:
     config.unsupported = True
diff --git a/test/MC/Mips/micromips-alu-instructions.s b/test/MC/Mips/micromips-alu-instructions.s
index c541e1a..276a83e 100644
--- a/test/MC/Mips/micromips-alu-instructions.s
+++ b/test/MC/Mips/micromips-alu-instructions.s
@@ -1,38 +1,79 @@
-# RUN: llvm-mc %s -triple=mipsel -show-encoding -mattr=micromips | FileCheck %s
+# RUN: llvm-mc %s -triple=mipsel -show-encoding -mattr=micromips | FileCheck -check-prefix=CHECK-EL %s
+# RUN: llvm-mc %s -triple=mips -show-encoding -mattr=micromips | FileCheck -check-prefix=CHECK-EB %s
 # Check that the assembler can handle the documented syntax
 # for arithmetic and logical instructions.
 #------------------------------------------------------------------------------
 # Arithmetic and Logical Instructions
 #------------------------------------------------------------------------------
-# CHECK: add   $9, $6, $7      # encoding: [0x10,0x49,0xe6,0x00]
-# CHECK: addi  $9, $6, 17767   # encoding: [0x67,0x45,0x26,0x11]
-# CHECK: addiu $9, $6, -15001  # encoding: [0x67,0xc5,0x26,0x31]
-# CHECK: addi  $9, $6, 17767   # encoding: [0x67,0x45,0x26,0x11]
-# CHECK: addiu $9, $6, -15001  # encoding: [0x67,0xc5,0x26,0x31]
-# CHECK: addu  $9, $6, $7      # encoding: [0x50,0x49,0xe6,0x00]
-# CHECK: sub   $9, $6, $7      # encoding: [0x90,0x49,0xe6,0x00]
-# CHECK: subu  $4, $3, $5      # encoding: [0xd0,0x21,0xa3,0x00]
-# CHECK: neg   $6, $7          # encoding: [0x90,0x31,0xe0,0x00]
-# CHECK: negu  $6, $7          # encoding: [0xd0,0x31,0xe0,0x00]
-# CHECK: move  $7, $8          # encoding: [0x50,0x39,0x08,0x00]
-# CHECK: slt    $3, $3, $5     # encoding: [0x50,0x1b,0xa3,0x00]
-# CHECK: slti   $3, $3, 103    # encoding: [0x67,0x00,0x63,0x90]
-# CHECK: slti   $3, $3, 103    # encoding: [0x67,0x00,0x63,0x90]
-# CHECK: sltiu  $3, $3, 103    # encoding: [0x67,0x00,0x63,0xb0]
-# CHECK: sltu   $3, $3, $5     # encoding: [0x90,0x1b,0xa3,0x00]
-# CHECK: and    $9, $6, $7     # encoding: [0x50,0x4a,0xe6,0x00]
-# CHECK: andi   $9, $6, 17767  # encoding: [0x67,0x45,0x26,0xd1]
-# CHECK: andi   $9, $6, 17767  # encoding: [0x67,0x45,0x26,0xd1]
-# CHECK: or     $3, $4, $5     # encoding: [0x90,0x1a,0xa4,0x00]
-# CHECK: ori    $9, $6, 17767  # encoding: [0x67,0x45,0x26,0x51]
-# CHECK: xor    $3, $3, $5     # encoding: [0x10,0x1b,0xa3,0x00]
-# CHECK: xori   $9, $6, 17767  # encoding: [0x67,0x45,0x26,0x71]
-# CHECK: xori   $9, $6, 17767  # encoding: [0x67,0x45,0x26,0x71]
-# CHECK: nor    $9, $6, $7     # encoding: [0xd0,0x4a,0xe6,0x00]
-# CHECK: not    $7, $8         # encoding: [0xd0,0x3a,0x08,0x00]
-# CHECK: mul    $9, $6, $7     # encoding: [0x10,0x4a,0xe6,0x00]
-# CHECK: mult   $9, $7         # encoding: [0x3c,0x8b,0xe9,0x00]
-# CHECK: multu  $9, $7         # encoding: [0x3c,0x9b,0xe9,0x00]
+# Little endian
+#------------------------------------------------------------------------------
+# CHECK-EL: add   $9, $6, $7      # encoding: [0xe6,0x00,0x10,0x49]
+# CHECK-EL: addi  $9, $6, 17767   # encoding: [0x26,0x11,0x67,0x45]
+# CHECK-EL: addiu $9, $6, -15001  # encoding: [0x26,0x31,0x67,0xc5]
+# CHECK-EL: addi  $9, $6, 17767   # encoding: [0x26,0x11,0x67,0x45]
+# CHECK-EL: addiu $9, $6, -15001  # encoding: [0x26,0x31,0x67,0xc5]
+# CHECK-EL: addu  $9, $6, $7      # encoding: [0xe6,0x00,0x50,0x49]
+# CHECK-EL: sub   $9, $6, $7      # encoding: [0xe6,0x00,0x90,0x49]
+# CHECK-EL: subu  $4, $3, $5      # encoding: [0xa3,0x00,0xd0,0x21]
+# CHECK-EL: neg   $6, $7          # encoding: [0xe0,0x00,0x90,0x31]
+# CHECK-EL: negu  $6, $7          # encoding: [0xe0,0x00,0xd0,0x31]
+# CHECK-EL: move  $7, $8          # encoding: [0x08,0x00,0x50,0x39]
+# CHECK-EL: slt    $3, $3, $5     # encoding: [0xa3,0x00,0x50,0x1b]
+# CHECK-EL: slti   $3, $3, 103    # encoding: [0x63,0x90,0x67,0x00]
+# CHECK-EL: slti   $3, $3, 103    # encoding: [0x63,0x90,0x67,0x00]
+# CHECK-EL: sltiu  $3, $3, 103    # encoding: [0x63,0xb0,0x67,0x00]
+# CHECK-EL: sltu   $3, $3, $5     # encoding: [0xa3,0x00,0x90,0x1b]
+# CHECK-EL: lui    $9, 17767      # encoding: [0xa9,0x41,0x67,0x45]
+# CHECK-EL: and    $9, $6, $7     # encoding: [0xe6,0x00,0x50,0x4a]
+# CHECK-EL: andi   $9, $6, 17767  # encoding: [0x26,0xd1,0x67,0x45]
+# CHECK-EL: andi   $9, $6, 17767  # encoding: [0x26,0xd1,0x67,0x45]
+# CHECK-EL: or     $3, $4, $5     # encoding: [0xa4,0x00,0x90,0x1a]
+# CHECK-EL: ori    $9, $6, 17767  # encoding: [0x26,0x51,0x67,0x45]
+# CHECK-EL: xor    $3, $3, $5     # encoding: [0xa3,0x00,0x10,0x1b]
+# CHECK-EL: xori   $9, $6, 17767  # encoding: [0x26,0x71,0x67,0x45]
+# CHECK-EL: xori   $9, $6, 17767  # encoding: [0x26,0x71,0x67,0x45]
+# CHECK-EL: nor    $9, $6, $7     # encoding: [0xe6,0x00,0xd0,0x4a]
+# CHECK-EL: not    $7, $8         # encoding: [0x08,0x00,0xd0,0x3a]
+# CHECK-EL: mul    $9, $6, $7     # encoding: [0xe6,0x00,0x10,0x4a]
+# CHECK-EL: mult   $9, $7         # encoding: [0xe9,0x00,0x3c,0x8b]
+# CHECK-EL: multu  $9, $7         # encoding: [0xe9,0x00,0x3c,0x9b]
+# CHECK-EL: div    $zero, $9, $7  # encoding: [0xe9,0x00,0x3c,0xab]
+# CHECK-EL: divu   $zero, $9, $7  # encoding: [0xe9,0x00,0x3c,0xbb]
+#------------------------------------------------------------------------------
+# Big endian
+#------------------------------------------------------------------------------
+# CHECK-EB: add $9, $6, $7        # encoding: [0x00,0xe6,0x49,0x10]
+# CHECK-EB: addi  $9, $6, 17767   # encoding: [0x11,0x26,0x45,0x67]
+# CHECK-EB: addiu $9, $6, -15001  # encoding: [0x31,0x26,0xc5,0x67]
+# CHECK-EB: addi  $9, $6, 17767   # encoding: [0x11,0x26,0x45,0x67]
+# CHECK-EB: addiu $9, $6, -15001  # encoding: [0x31,0x26,0xc5,0x67]
+# CHECK-EB: addu  $9, $6, $7      # encoding: [0x00,0xe6,0x49,0x50]
+# CHECK-EB: sub $9, $6, $7        # encoding: [0x00,0xe6,0x49,0x90]
+# CHECK-EB: subu  $4, $3, $5      # encoding: [0x00,0xa3,0x21,0xd0]
+# CHECK-EB: neg $6, $7            # encoding: [0x00,0xe0,0x31,0x90]
+# CHECK-EB: negu  $6, $7          # encoding: [0x00,0xe0,0x31,0xd0]
+# CHECK-EB: move  $7, $8          # encoding: [0x00,0x08,0x39,0x50]
+# CHECK-EB: slt $3, $3, $5        # encoding: [0x00,0xa3,0x1b,0x50]
+# CHECK-EB: slti  $3, $3, 103     # encoding: [0x90,0x63,0x00,0x67]
+# CHECK-EB: slti  $3, $3, 103     # encoding: [0x90,0x63,0x00,0x67]
+# CHECK-EB: sltiu $3, $3, 103     # encoding: [0xb0,0x63,0x00,0x67]
+# CHECK-EB: sltu  $3, $3, $5      # encoding: [0x00,0xa3,0x1b,0x90]
+# CHECK-EB: lui $9, 17767         # encoding: [0x41,0xa9,0x45,0x67]
+# CHECK-EB: and $9, $6, $7        # encoding: [0x00,0xe6,0x4a,0x50]
+# CHECK-EB:  andi  $9, $6, 17767  # encoding: [0xd1,0x26,0x45,0x67]
+# CHECK-EB:  andi  $9, $6, 17767  # encoding: [0xd1,0x26,0x45,0x67]
+# CHECK-EB:  or  $3, $4, $5       # encoding: [0x00,0xa4,0x1a,0x90]
+# CHECK-EB:  ori $9, $6, 17767    # encoding: [0x51,0x26,0x45,0x67]
+# CHECK-EB:  xor $3, $3, $5       # encoding: [0x00,0xa3,0x1b,0x10]
+# CHECK-EB:  xori  $9, $6, 17767  # encoding: [0x71,0x26,0x45,0x67]
+# CHECK-EB:  xori  $9, $6, 17767  # encoding: [0x71,0x26,0x45,0x67]
+# CHECK-EB:  nor $9, $6, $7       # encoding: [0x00,0xe6,0x4a,0xd0]
+# CHECK-EB:  not $7, $8           # encoding: [0x00,0x08,0x3a,0xd0]
+# CHECK-EB:  mul $9, $6, $7       # encoding: [0x00,0xe6,0x4a,0x10]
+# CHECK-EB:  mult  $9, $7         # encoding: [0x00,0xe9,0x8b,0x3c]
+# CHECK-EB:  multu $9, $7         # encoding: [0x00,0xe9,0x9b,0x3c]
+# CHECK-EB: div  $zero, $9, $7    # encoding: [0x00,0xe9,0xab,0x3c]
+# CHECK-EB: divu $zero, $9, $7    # encoding: [0x00,0xe9,0xbb,0x3c]
     add    $9, $6, $7
     add    $9, $6, 17767
     addu   $9, $6, -15001
@@ -49,6 +90,7 @@
     slti   $3, $3, 103
     sltiu  $3, $3, 103
     sltu   $3, $3, $5
+    lui    $9, 17767
     and    $9, $6, $7
     and    $9, $6, 17767
     andi   $9, $6, 17767
@@ -62,3 +104,5 @@
     mul    $9, $6, $7
     mult   $9, $7
     multu  $9, $7
+    div    $0, $9, $7
+    divu   $0, $9, $7
diff --git a/test/MC/Mips/micromips-branch-instructions.s b/test/MC/Mips/micromips-branch-instructions.s
new file mode 100644
index 0000000..84df2a1
--- /dev/null
+++ b/test/MC/Mips/micromips-branch-instructions.s
@@ -0,0 +1,65 @@
+# RUN: llvm-mc %s -triple=mipsel -show-encoding -mattr=micromips \
+# RUN: | FileCheck %s -check-prefix=CHECK-EL
+# RUN: llvm-mc %s -triple=mips -show-encoding -mattr=micromips \
+# RUN: | FileCheck %s -check-prefix=CHECK-EB
+# Check that the assembler can handle the documented syntax
+# for arithmetic and logical instructions.
+#------------------------------------------------------------------------------
+# Branch Instructions
+#------------------------------------------------------------------------------
+# Little endian
+#------------------------------------------------------------------------------
+# CHECK-EL: b 1332               # encoding: [0x00,0x94,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: beq $9, $6, 1332     # encoding: [0xc9,0x94,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: bgez $6, 1332        # encoding: [0x46,0x40,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: bgezal $6, 1332      # encoding: [0x66,0x40,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: bltzal $6, 1332      # encoding: [0x26,0x40,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: bgtz $6, 1332        # encoding: [0xc6,0x40,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: blez $6, 1332        # encoding: [0x86,0x40,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: bne $9, $6, 1332     # encoding: [0xc9,0xb4,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: bal 1332             # encoding: [0x60,0x40,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: bltz $6, 1332        # encoding: [0x06,0x40,0x9a,0x02]
+# CHECK-EL: nop                  # encoding: [0x00,0x00,0x00,0x00]
+#------------------------------------------------------------------------------
+# Big endian
+#------------------------------------------------------------------------------
+# CHECK-EB: b 1332               # encoding: [0x94,0x00,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: beq $9, $6, 1332     # encoding: [0x94,0xc9,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: bgez $6, 1332        # encoding: [0x40,0x46,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: bgezal $6, 1332      # encoding: [0x40,0x66,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: bltzal $6, 1332      # encoding: [0x40,0x26,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: bgtz $6, 1332        # encoding: [0x40,0xc6,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: blez $6, 1332        # encoding: [0x40,0x86,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: bne $9, $6, 1332     # encoding: [0xb4,0xc9,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: bal 1332             # encoding: [0x40,0x60,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: bltz $6, 1332        # encoding: [0x40,0x06,0x02,0x9a]
+# CHECK-EB: nop                  # encoding: [0x00,0x00,0x00,0x00]
+
+     b      1332
+     beq    $9,$6,1332
+     bgez   $6,1332
+     bgezal $6,1332
+     bltzal $6,1332
+     bgtz   $6,1332
+     blez   $6,1332
+     bne    $9,$6,1332
+     bal    1332
+     bltz   $6,1332
diff --git a/test/MC/Mips/micromips-branch16.s b/test/MC/Mips/micromips-branch16.s
new file mode 100644
index 0000000..321ee86
--- /dev/null
+++ b/test/MC/Mips/micromips-branch16.s
@@ -0,0 +1,69 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \
+# RUN: -mattr=micromips | FileCheck %s -check-prefix=CHECK-FIXUP
+# RUN: llvm-mc %s -filetype=obj -triple=mipsel-unknown-linux \
+# RUN: -mattr=micromips | llvm-readobj -r \
+# RUN: | FileCheck %s -check-prefix=CHECK-ELF
+#------------------------------------------------------------------------------
+# Check that the assembler can handle the documented syntax
+# for relocations.
+#------------------------------------------------------------------------------
+# CHECK-FIXUP: b           bar # encoding: [A,0x94'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: beq $3, $4, bar # encoding: [0x83'A',0x94'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bne $3, $4, bar # encoding: [0x83'A',0xb4'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bgez    $4, bar # encoding: [0x44'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bgtz    $4, bar # encoding: [0xc4'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: blez    $4, bar # encoding: [0x84'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bltz    $4, bar # encoding: [0x04'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bgezal  $4, bar # encoding: [0x64'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-FIXUP: bltzal  $4, bar # encoding: [0x24'A',0x40'A',0x00,0x00]
+# CHECK-FIXUP:                 #   fixup A - offset: 0,
+# CHECK-FIXUP:                     value: bar, kind: fixup_MICROMIPS_PC16_S1
+# CHECK-FIXUP: nop             # encoding: [0x00,0x00,0x00,0x00]
+#------------------------------------------------------------------------------
+# Check that the appropriate relocations were created.
+#------------------------------------------------------------------------------
+# CHECK-ELF: Relocations [
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_PC16_S1
+# CHECK-ELF: ]
+
+  b       bar
+  beq     $3, $4, bar
+  bne     $3, $4, bar
+  bgez    $4, bar
+  bgtz    $4, bar
+  blez    $4, bar
+  bltz    $4, bar
+  bgezal  $4, bar
+  bltzal  $4, bar
diff --git a/test/MC/Mips/micromips-expansions.s b/test/MC/Mips/micromips-expansions.s
new file mode 100644
index 0000000..af4d3b5
--- /dev/null
+++ b/test/MC/Mips/micromips-expansions.s
@@ -0,0 +1,57 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \
+# RUN: -mcpu=mips32r2 -mattr=micromips | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for macro instructions
+#------------------------------------------------------------------------------
+# Load immediate instructions
+#------------------------------------------------------------------------------
+# CHECK: ori   $5, $zero, 123        # encoding: [0xa0,0x50,0x7b,0x00]
+# CHECK: addiu $6, $zero, -2345      # encoding: [0xc0,0x30,0xd7,0xf6]
+# CHECK: lui   $7, 1                 # encoding: [0xa7,0x41,0x01,0x00]
+# CHECK: ori   $7, $7, 2             # encoding: [0xe7,0x50,0x02,0x00]
+# CHECK: addiu $4, $zero, 20         # encoding: [0x80,0x30,0x14,0x00]
+# CHECK: lui   $7, 1                 # encoding: [0xa7,0x41,0x01,0x00]
+# CHECK: ori   $7, $7, 2             # encoding: [0xe7,0x50,0x02,0x00]
+# CHECK: addiu $4, $5, 20            # encoding: [0x85,0x30,0x14,0x00]
+# CHECK: lui   $7, 1                 # encoding: [0xa7,0x41,0x01,0x00]
+# CHECK: ori   $7, $7, 2             # encoding: [0xe7,0x50,0x02,0x00]
+# CHECK: addu  $7, $7, $8            # encoding: [0x07,0x01,0x50,0x39]
+# CHECK: lui   $10, %hi(symbol)      # encoding: [0xaa'A',0x41'A',0x00,0x00]
+# CHECK:                             # fixup A - offset: 0,
+# CHECK:                               value: symbol@ABS_HI,
+# CHECK:                               kind: fixup_MICROMIPS_HI16
+# CHECK: addu  $10, $10, $4          # encoding: [0x8a,0x00,0x50,0x51]
+# CHECK: lw    $10, %lo(symbol)($10) # encoding: [0x4a'A',0xfd'A',0x00,0x00]
+# CHECK:                             # fixup A - offset: 0,
+# CHECK:                               value: symbol@ABS_LO,
+# CHECK:                               kind: fixup_MICROMIPS_LO16
+# CHECK: lui   $1, %hi(symbol)       # encoding: [0xa1'A',0x41'A',0x00,0x00]
+# CHECK:                             # fixup A - offset: 0,
+# CHECK:                               value: symbol@ABS_HI,
+# CHECK:                               kind: fixup_MICROMIPS_HI16
+# CHECK: addu  $1, $1, $9            # encoding: [0x21,0x01,0x50,0x09]
+# CHECK: sw    $10, %lo(symbol)($1)  # encoding: [0x41'A',0xf9'A',0x00,0x00]
+# CHECK:                             # fixup A - offset: 0,
+# CHECK:                               value: symbol@ABS_LO,
+# CHECK:                               kind: fixup_MICROMIPS_LO16
+# CHECK: lui   $10, 10               # encoding: [0xaa,0x41,0x0a,0x00]
+# CHECK: addu  $10, $10, $4          # encoding: [0x8a,0x00,0x50,0x51]
+# CHECK: lw    $10, 123($10)         # encoding: [0x4a,0xfd,0x7b,0x00]
+# CHECK: lui   $1, 2                 # encoding: [0xa1,0x41,0x02,0x00]
+# CHECK: addu  $1, $1, $9            # encoding: [0x21,0x01,0x50,0x09]
+# CHECK: sw    $10, 57920($1)        # encoding: [0x41,0xf9,0x40,0xe2]
+
+    li $5,123
+    li $6,-2345
+    li $7,65538
+
+    la $a0, 20
+    la $7,65538
+    la $a0, 20($a1)
+    la $7,65538($8)
+
+    lw  $t2, symbol($a0)
+    sw  $t2, symbol($t1)
+
+    lw  $t2, 655483($a0)
+    sw  $t2, 123456($t1)
diff --git a/test/MC/Mips/micromips-expressions.s b/test/MC/Mips/micromips-expressions.s
new file mode 100644
index 0000000..509e980
--- /dev/null
+++ b/test/MC/Mips/micromips-expressions.s
@@ -0,0 +1,35 @@
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding \
+# RUN: -mcpu=mips32r2 -mattr=micromips | FileCheck %s
+# Check that the assembler can handle the expressions as operands.
+# CHECK:  .text
+# CHECK:  .globl  foo
+# CHECK:  foo:
+# CHECK:  lw   $4, %lo(foo)($4)    # encoding: [0x84'A',0xfc'A',0x00,0x00]
+# CHECK:                           # fixup A - offset: 0,
+# CHECK:                             value: foo@ABS_LO,
+# CHECK:                             kind: fixup_MICROMIPS_LO16
+# CHECK:  lw   $4, 56($4)          # encoding: [0x84,0xfc,0x38,0x00]
+# CHECK:  lw   $4, %lo(foo+8)($4)  # encoding: [0x84'A',0xfc'A',0x08,0x00]
+# CHECK:                           # fixup A - offset: 0,
+# CHECK:                             value: foo@ABS_LO,
+# CHECK:                             kind: fixup_MICROMIPS_LO16
+# CHECK:  lw   $4, %lo(foo+8)($4)  # encoding: [0x84'A',0xfc'A',0x08,0x00]
+# CHECK:                           # fixup A - offset: 0,
+# CHECK:                             value: foo@ABS_LO,
+# CHECK:                             kind: fixup_MICROMIPS_LO16
+# CHECK:  lw   $4, %lo(foo+8)($4)  # encoding: [0x84'A',0xfc'A',0x08,0x00]
+# CHECK:                           # fixup A - offset: 0,
+# CHECK:                             value: foo@ABS_LO,
+# CHECK:                             kind: fixup_MICROMIPS_LO16
+# CHECK:  .space  64
+
+  .globl  foo
+  .ent  foo
+foo:
+  lw  $4,%lo(foo)($4)
+  lw  $4,((10 + 4) * 4)($4)
+  lw  $4,%lo (2 * 4) + foo($4)
+  lw  $4,%lo((2 * 4) + foo)($4)
+  lw  $4,(((%lo ((2 * 4) + foo))))($4)
+  .space  64
+  .end  foo
diff --git a/test/MC/Mips/micromips-jump-instructions.s b/test/MC/Mips/micromips-jump-instructions.s
new file mode 100644
index 0000000..6f571b6
--- /dev/null
+++ b/test/MC/Mips/micromips-jump-instructions.s
@@ -0,0 +1,40 @@
+# RUN: llvm-mc %s -triple=mipsel -show-encoding -mattr=micromips \
+# RUN: | FileCheck %s -check-prefix=CHECK-EL
+# RUN: llvm-mc %s -triple=mips -show-encoding -mattr=micromips \
+# RUN: | FileCheck %s -check-prefix=CHECK-EB
+# Check that the assembler can handle the documented syntax
+# for jump and branch instructions.
+#------------------------------------------------------------------------------
+# Jump instructions
+#------------------------------------------------------------------------------
+# Little endian
+#------------------------------------------------------------------------------
+# CHECK-EL: j 1328      # encoding: [0x00,0xd4,0x98,0x02]
+# CHECK-EL: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: jal 1328    # encoding: [0x00,0xf4,0x98,0x02]
+# CHECK-EL: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: jalr $6     # encoding: [0xe6,0x03,0x3c,0x0f]
+# CHECK-EL: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: jr $7       # encoding: [0x07,0x00,0x3c,0x0f]
+# CHECK-EL: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EL: jr $7       # encoding: [0x07,0x00,0x3c,0x0f]
+# CHECK-EL: nop         # encoding: [0x00,0x00,0x00,0x00]
+#------------------------------------------------------------------------------
+# Big endian
+#------------------------------------------------------------------------------
+# CHECK-EB: j 1328      # encoding: [0xd4,0x00,0x02,0x98]
+# CHECK-EB: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: jal 1328    # encoding: [0xf4,0x00,0x02,0x98]
+# CHECK-EB: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: jalr $6     # encoding: [0x03,0xe6,0x0f,0x3c]
+# CHECK-EB: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: jr $7       # encoding: [0x00,0x07,0x0f,0x3c]
+# CHECK-EB: nop         # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-EB: jr $7       # encoding: [0x00,0x07,0x0f,0x3c]
+# CHECK-EB: nop         # encoding: [0x00,0x00,0x00,0x00]
+
+     j 1328
+     jal 1328
+     jalr $6
+     jr $7
+     j $7
diff --git a/test/MC/Mips/micromips-jump26.s b/test/MC/Mips/micromips-jump26.s
new file mode 100644
index 0000000..936a998
--- /dev/null
+++ b/test/MC/Mips/micromips-jump26.s
@@ -0,0 +1,23 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \
+# RUN: -mattr=micromips | FileCheck %s -check-prefix=CHECK-FIXUP
+# RUN: llvm-mc %s -filetype=obj -triple=mipsel-unknown-linux \
+# RUN: -mattr=micromips | llvm-readobj -r \
+# RUN: | FileCheck %s -check-prefix=CHECK-ELF
+#------------------------------------------------------------------------------
+# Check that the assembler can handle the documented syntax
+# for relocations.
+#------------------------------------------------------------------------------
+# CHECK-FIXUP: foo:
+# CHECK-FIXUP:   jal bar # encoding: [A,0xf4'A',A,0b000000AA]
+# CHECK-FIXUP:           #   fixup A - offset: 0,
+# CHECK-FIXUP:               value: bar, kind: fixup_MICROMIPS_26_S1
+# CHECK-FIXUP:   nop     # encoding: [0x00,0x00,0x00,0x00]
+#------------------------------------------------------------------------------
+# Check that the appropriate relocations were created.
+#------------------------------------------------------------------------------
+# CHECK-ELF: Relocations [
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_26_S1
+# CHECK-ELF: ]
+
+foo:
+  jal bar
diff --git a/test/MC/Mips/micromips-loadstore-instructions.s b/test/MC/Mips/micromips-loadstore-instructions.s
index 623e2ac..cc7514b 100644
--- a/test/MC/Mips/micromips-loadstore-instructions.s
+++ b/test/MC/Mips/micromips-loadstore-instructions.s
@@ -1,17 +1,31 @@
-# RUN: llvm-mc %s -triple=mipsel -show-encoding -mattr=micromips | FileCheck %s
+# RUN: llvm-mc %s -triple=mipsel -show-encoding -mattr=micromips | FileCheck -check-prefix=CHECK-EL %s
+# RUN: llvm-mc %s -triple=mips -show-encoding -mattr=micromips | FileCheck -check-prefix=CHECK-EB %s
 # Check that the assembler can handle the documented syntax
 # for load and store instructions.
 #------------------------------------------------------------------------------
 # Load and Store Instructions
 #------------------------------------------------------------------------------
-# CHECK: lb     $5, 8($4)      # encoding: [0x08,0x00,0xa4,0x1c]
-# CHECK: lbu    $6, 8($4)      # encoding: [0x08,0x00,0xc4,0x14]
-# CHECK: lh     $2, 8($4)      # encoding: [0x08,0x00,0x44,0x3c]
-# CHECK: lhu    $4, 8($2)      # encoding: [0x08,0x00,0x82,0x34]
-# CHECK: lw     $6, 4($5)      # encoding: [0x04,0x00,0xc5,0xfc]
-# CHECK: sb     $5, 8($4)      # encoding: [0x08,0x00,0xa4,0x18]
-# CHECK: sh     $2, 8($4)      # encoding: [0x08,0x00,0x44,0x38]
-# CHECK: sw     $5, 4($6)      # encoding: [0x04,0x00,0xa6,0xf8]
+# Little endian
+#------------------------------------------------------------------------------
+# CHECK-EL: lb     $5, 8($4)      # encoding: [0xa4,0x1c,0x08,0x00]
+# CHECK-EL: lbu    $6, 8($4)      # encoding: [0xc4,0x14,0x08,0x00]
+# CHECK-EL: lh     $2, 8($4)      # encoding: [0x44,0x3c,0x08,0x00]
+# CHECK-EL: lhu    $4, 8($2)      # encoding: [0x82,0x34,0x08,0x00]
+# CHECK-EL: lw     $6, 4($5)      # encoding: [0xc5,0xfc,0x04,0x00]
+# CHECK-EL: sb     $5, 8($4)      # encoding: [0xa4,0x18,0x08,0x00]
+# CHECK-EL: sh     $2, 8($4)      # encoding: [0x44,0x38,0x08,0x00]
+# CHECK-EL: sw     $5, 4($6)      # encoding: [0xa6,0xf8,0x04,0x00]
+#------------------------------------------------------------------------------
+# Big endian
+#------------------------------------------------------------------------------
+# CHECK-EB: lb     $5, 8($4)      # encoding: [0x1c,0xa4,0x00,0x08]
+# CHECK-EB: lbu    $6, 8($4)      # encoding: [0x14,0xc4,0x00,0x08]
+# CHECK-EB: lh     $2, 8($4)      # encoding: [0x3c,0x44,0x00,0x08]
+# CHECK-EB: lhu    $4, 8($2)      # encoding: [0x34,0x82,0x00,0x08]
+# CHECK-EB: lw     $6, 4($5)      # encoding: [0xfc,0xc5,0x00,0x04]
+# CHECK-EB: sb     $5, 8($4)      # encoding: [0x18,0xa4,0x00,0x08]
+# CHECK-EB: sh     $2, 8($4)      # encoding: [0x38,0x44,0x00,0x08]
+# CHECK-EB: sw     $5, 4($6)      # encoding: [0xf8,0xa6,0x00,0x04]
      lb     $5, 8($4)
      lbu    $6, 8($4)
      lh     $2, 8($4)
diff --git a/test/MC/Mips/micromips-loadstore-unaligned.s b/test/MC/Mips/micromips-loadstore-unaligned.s
new file mode 100644
index 0000000..ab1d8b9
--- /dev/null
+++ b/test/MC/Mips/micromips-loadstore-unaligned.s
@@ -0,0 +1,26 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \
+# RUN: -mattr=micromips | FileCheck -check-prefix=CHECK-EL %s
+# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding \
+# RUN: -mattr=micromips | FileCheck -check-prefix=CHECK-EB %s
+# Check that the assembler can handle the documented syntax
+# for loads and stores.
+#------------------------------------------------------------------------------
+# Load and Store unaligned instructions
+#------------------------------------------------------------------------------
+# Little endian
+#------------------------------------------------------------------------------
+# CHECK-EL: lwl $4, 16($5)   # encoding: [0x85,0x60,0x10,0x00]
+# CHECK-EL: lwr $4, 16($5)   # encoding: [0x85,0x60,0x10,0x10]
+# CHECK-EL: swl $4, 16($5)   # encoding: [0x85,0x60,0x10,0x80]
+# CHECK-EL: swr $4, 16($5)   # encoding: [0x85,0x60,0x10,0x90]
+#------------------------------------------------------------------------------
+# Big endian
+#------------------------------------------------------------------------------
+# CHECK-EB: lwl $4, 16($5)   # encoding: [0x60,0x85,0x00,0x10]
+# CHECK-EB: lwr $4, 16($5)   # encoding: [0x60,0x85,0x10,0x10]
+# CHECK-EB: swl $4, 16($5)   # encoding: [0x60,0x85,0x80,0x10]
+# CHECK-EB: swr $4, 16($5)   # encoding: [0x60,0x85,0x90,0x10]
+     lwl  $4, 16($5)
+     lwr  $4, 16($5)
+     swl  $4, 16($5)
+     swr  $4, 16($5)
diff --git a/test/MC/Mips/micromips-long-branch.ll b/test/MC/Mips/micromips-long-branch.ll
new file mode 100644
index 0000000..3267f4a
--- /dev/null
+++ b/test/MC/Mips/micromips-long-branch.ll
@@ -0,0 +1,16437 @@
+; RUN: llc %s -march=mipsel -mcpu=mips32r2 -mattr=micromips -filetype=asm \
+; RUN: -relocation-model=pic -O3 -o - | FileCheck %s
+
+@a = common global [10 x i32] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  %i = alloca i32, align 4
+  store i32 0, i32* %retval
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:
+  %0 = load i32* %i, align 4
+  %cmp = icmp slt i32 %0, 10
+  br i1 %cmp, label %for.body, label %for.end
+
+; CHECK:  addiu $sp, $sp, -8
+; CHECK:  sw  $ra, 0($sp)
+; CHECK:  lui $[[REG1:[0-9]+]], 2
+; CHECK:  addiu $[[REG1]], $[[REG1]], 0
+; CHECK:  addu  $[[REG1]], $ra, $[[REG1]]
+; CHECK:  lw  $ra, 0($sp)
+; CHECK:  jr  $[[REG1]]
+; CHECK:  addiu $sp, $sp, 8
+
+for.body:
+  %1 = load i32* %i, align 4
+  %2 = load i32* %i, align 4
+  %idxprom = sext i32 %2 to i64
+  %arrayidx = getelementptr inbounds [10 x i32]* @a, i32 0, i64 %idxprom
+  store i32 %1, i32* %arrayidx, align 4  %nop0 = alloca i1, i1 0
+  %nop1 = alloca i1, i1 0
+  %nop2 = alloca i1, i1 0
+  %nop3 = alloca i1, i1 0
+  %nop4 = alloca i1, i1 0
+  %nop5 = alloca i1, i1 0
+  %nop6 = alloca i1, i1 0
+  %nop7 = alloca i1, i1 0
+  %nop8 = alloca i1, i1 0
+  %nop9 = alloca i1, i1 0
+  %nop10 = alloca i1, i1 0
+  %nop11 = alloca i1, i1 0
+  %nop12 = alloca i1, i1 0
+  %nop13 = alloca i1, i1 0
+  %nop14 = alloca i1, i1 0
+  %nop15 = alloca i1, i1 0
+  %nop16 = alloca i1, i1 0
+  %nop17 = alloca i1, i1 0
+  %nop18 = alloca i1, i1 0
+  %nop19 = alloca i1, i1 0
+  %nop20 = alloca i1, i1 0
+  %nop21 = alloca i1, i1 0
+  %nop22 = alloca i1, i1 0
+  %nop23 = alloca i1, i1 0
+  %nop24 = alloca i1, i1 0
+  %nop25 = alloca i1, i1 0
+  %nop26 = alloca i1, i1 0
+  %nop27 = alloca i1, i1 0
+  %nop28 = alloca i1, i1 0
+  %nop29 = alloca i1, i1 0
+  %nop30 = alloca i1, i1 0
+  %nop31 = alloca i1, i1 0
+  %nop32 = alloca i1, i1 0
+  %nop33 = alloca i1, i1 0
+  %nop34 = alloca i1, i1 0
+  %nop35 = alloca i1, i1 0
+  %nop36 = alloca i1, i1 0
+  %nop37 = alloca i1, i1 0
+  %nop38 = alloca i1, i1 0
+  %nop39 = alloca i1, i1 0
+  %nop40 = alloca i1, i1 0
+  %nop41 = alloca i1, i1 0
+  %nop42 = alloca i1, i1 0
+  %nop43 = alloca i1, i1 0
+  %nop44 = alloca i1, i1 0
+  %nop45 = alloca i1, i1 0
+  %nop46 = alloca i1, i1 0
+  %nop47 = alloca i1, i1 0
+  %nop48 = alloca i1, i1 0
+  %nop49 = alloca i1, i1 0
+  %nop50 = alloca i1, i1 0
+  %nop51 = alloca i1, i1 0
+  %nop52 = alloca i1, i1 0
+  %nop53 = alloca i1, i1 0
+  %nop54 = alloca i1, i1 0
+  %nop55 = alloca i1, i1 0
+  %nop56 = alloca i1, i1 0
+  %nop57 = alloca i1, i1 0
+  %nop58 = alloca i1, i1 0
+  %nop59 = alloca i1, i1 0
+  %nop60 = alloca i1, i1 0
+  %nop61 = alloca i1, i1 0
+  %nop62 = alloca i1, i1 0
+  %nop63 = alloca i1, i1 0
+  %nop64 = alloca i1, i1 0
+  %nop65 = alloca i1, i1 0
+  %nop66 = alloca i1, i1 0
+  %nop67 = alloca i1, i1 0
+  %nop68 = alloca i1, i1 0
+  %nop69 = alloca i1, i1 0
+  %nop70 = alloca i1, i1 0
+  %nop71 = alloca i1, i1 0
+  %nop72 = alloca i1, i1 0
+  %nop73 = alloca i1, i1 0
+  %nop74 = alloca i1, i1 0
+  %nop75 = alloca i1, i1 0
+  %nop76 = alloca i1, i1 0
+  %nop77 = alloca i1, i1 0
+  %nop78 = alloca i1, i1 0
+  %nop79 = alloca i1, i1 0
+  %nop80 = alloca i1, i1 0
+  %nop81 = alloca i1, i1 0
+  %nop82 = alloca i1, i1 0
+  %nop83 = alloca i1, i1 0
+  %nop84 = alloca i1, i1 0
+  %nop85 = alloca i1, i1 0
+  %nop86 = alloca i1, i1 0
+  %nop87 = alloca i1, i1 0
+  %nop88 = alloca i1, i1 0
+  %nop89 = alloca i1, i1 0
+  %nop90 = alloca i1, i1 0
+  %nop91 = alloca i1, i1 0
+  %nop92 = alloca i1, i1 0
+  %nop93 = alloca i1, i1 0
+  %nop94 = alloca i1, i1 0
+  %nop95 = alloca i1, i1 0
+  %nop96 = alloca i1, i1 0
+  %nop97 = alloca i1, i1 0
+  %nop98 = alloca i1, i1 0
+  %nop99 = alloca i1, i1 0
+  %nop100 = alloca i1, i1 0
+  %nop101 = alloca i1, i1 0
+  %nop102 = alloca i1, i1 0
+  %nop103 = alloca i1, i1 0
+  %nop104 = alloca i1, i1 0
+  %nop105 = alloca i1, i1 0
+  %nop106 = alloca i1, i1 0
+  %nop107 = alloca i1, i1 0
+  %nop108 = alloca i1, i1 0
+  %nop109 = alloca i1, i1 0
+  %nop110 = alloca i1, i1 0
+  %nop111 = alloca i1, i1 0
+  %nop112 = alloca i1, i1 0
+  %nop113 = alloca i1, i1 0
+  %nop114 = alloca i1, i1 0
+  %nop115 = alloca i1, i1 0
+  %nop116 = alloca i1, i1 0
+  %nop117 = alloca i1, i1 0
+  %nop118 = alloca i1, i1 0
+  %nop119 = alloca i1, i1 0
+  %nop120 = alloca i1, i1 0
+  %nop121 = alloca i1, i1 0
+  %nop122 = alloca i1, i1 0
+  %nop123 = alloca i1, i1 0
+  %nop124 = alloca i1, i1 0
+  %nop125 = alloca i1, i1 0
+  %nop126 = alloca i1, i1 0
+  %nop127 = alloca i1, i1 0
+  %nop128 = alloca i1, i1 0
+  %nop129 = alloca i1, i1 0
+  %nop130 = alloca i1, i1 0
+  %nop131 = alloca i1, i1 0
+  %nop132 = alloca i1, i1 0
+  %nop133 = alloca i1, i1 0
+  %nop134 = alloca i1, i1 0
+  %nop135 = alloca i1, i1 0
+  %nop136 = alloca i1, i1 0
+  %nop137 = alloca i1, i1 0
+  %nop138 = alloca i1, i1 0
+  %nop139 = alloca i1, i1 0
+  %nop140 = alloca i1, i1 0
+  %nop141 = alloca i1, i1 0
+  %nop142 = alloca i1, i1 0
+  %nop143 = alloca i1, i1 0
+  %nop144 = alloca i1, i1 0
+  %nop145 = alloca i1, i1 0
+  %nop146 = alloca i1, i1 0
+  %nop147 = alloca i1, i1 0
+  %nop148 = alloca i1, i1 0
+  %nop149 = alloca i1, i1 0
+  %nop150 = alloca i1, i1 0
+  %nop151 = alloca i1, i1 0
+  %nop152 = alloca i1, i1 0
+  %nop153 = alloca i1, i1 0
+  %nop154 = alloca i1, i1 0
+  %nop155 = alloca i1, i1 0
+  %nop156 = alloca i1, i1 0
+  %nop157 = alloca i1, i1 0
+  %nop158 = alloca i1, i1 0
+  %nop159 = alloca i1, i1 0
+  %nop160 = alloca i1, i1 0
+  %nop161 = alloca i1, i1 0
+  %nop162 = alloca i1, i1 0
+  %nop163 = alloca i1, i1 0
+  %nop164 = alloca i1, i1 0
+  %nop165 = alloca i1, i1 0
+  %nop166 = alloca i1, i1 0
+  %nop167 = alloca i1, i1 0
+  %nop168 = alloca i1, i1 0
+  %nop169 = alloca i1, i1 0
+  %nop170 = alloca i1, i1 0
+  %nop171 = alloca i1, i1 0
+  %nop172 = alloca i1, i1 0
+  %nop173 = alloca i1, i1 0
+  %nop174 = alloca i1, i1 0
+  %nop175 = alloca i1, i1 0
+  %nop176 = alloca i1, i1 0
+  %nop177 = alloca i1, i1 0
+  %nop178 = alloca i1, i1 0
+  %nop179 = alloca i1, i1 0
+  %nop180 = alloca i1, i1 0
+  %nop181 = alloca i1, i1 0
+  %nop182 = alloca i1, i1 0
+  %nop183 = alloca i1, i1 0
+  %nop184 = alloca i1, i1 0
+  %nop185 = alloca i1, i1 0
+  %nop186 = alloca i1, i1 0
+  %nop187 = alloca i1, i1 0
+  %nop188 = alloca i1, i1 0
+  %nop189 = alloca i1, i1 0
+  %nop190 = alloca i1, i1 0
+  %nop191 = alloca i1, i1 0
+  %nop192 = alloca i1, i1 0
+  %nop193 = alloca i1, i1 0
+  %nop194 = alloca i1, i1 0
+  %nop195 = alloca i1, i1 0
+  %nop196 = alloca i1, i1 0
+  %nop197 = alloca i1, i1 0
+  %nop198 = alloca i1, i1 0
+  %nop199 = alloca i1, i1 0
+  %nop200 = alloca i1, i1 0
+  %nop201 = alloca i1, i1 0
+  %nop202 = alloca i1, i1 0
+  %nop203 = alloca i1, i1 0
+  %nop204 = alloca i1, i1 0
+  %nop205 = alloca i1, i1 0
+  %nop206 = alloca i1, i1 0
+  %nop207 = alloca i1, i1 0
+  %nop208 = alloca i1, i1 0
+  %nop209 = alloca i1, i1 0
+  %nop210 = alloca i1, i1 0
+  %nop211 = alloca i1, i1 0
+  %nop212 = alloca i1, i1 0
+  %nop213 = alloca i1, i1 0
+  %nop214 = alloca i1, i1 0
+  %nop215 = alloca i1, i1 0
+  %nop216 = alloca i1, i1 0
+  %nop217 = alloca i1, i1 0
+  %nop218 = alloca i1, i1 0
+  %nop219 = alloca i1, i1 0
+  %nop220 = alloca i1, i1 0
+  %nop221 = alloca i1, i1 0
+  %nop222 = alloca i1, i1 0
+  %nop223 = alloca i1, i1 0
+  %nop224 = alloca i1, i1 0
+  %nop225 = alloca i1, i1 0
+  %nop226 = alloca i1, i1 0
+  %nop227 = alloca i1, i1 0
+  %nop228 = alloca i1, i1 0
+  %nop229 = alloca i1, i1 0
+  %nop230 = alloca i1, i1 0
+  %nop231 = alloca i1, i1 0
+  %nop232 = alloca i1, i1 0
+  %nop233 = alloca i1, i1 0
+  %nop234 = alloca i1, i1 0
+  %nop235 = alloca i1, i1 0
+  %nop236 = alloca i1, i1 0
+  %nop237 = alloca i1, i1 0
+  %nop238 = alloca i1, i1 0
+  %nop239 = alloca i1, i1 0
+  %nop240 = alloca i1, i1 0
+  %nop241 = alloca i1, i1 0
+  %nop242 = alloca i1, i1 0
+  %nop243 = alloca i1, i1 0
+  %nop244 = alloca i1, i1 0
+  %nop245 = alloca i1, i1 0
+  %nop246 = alloca i1, i1 0
+  %nop247 = alloca i1, i1 0
+  %nop248 = alloca i1, i1 0
+  %nop249 = alloca i1, i1 0
+  %nop250 = alloca i1, i1 0
+  %nop251 = alloca i1, i1 0
+  %nop252 = alloca i1, i1 0
+  %nop253 = alloca i1, i1 0
+  %nop254 = alloca i1, i1 0
+  %nop255 = alloca i1, i1 0
+  %nop256 = alloca i1, i1 0
+  %nop257 = alloca i1, i1 0
+  %nop258 = alloca i1, i1 0
+  %nop259 = alloca i1, i1 0
+  %nop260 = alloca i1, i1 0
+  %nop261 = alloca i1, i1 0
+  %nop262 = alloca i1, i1 0
+  %nop263 = alloca i1, i1 0
+  %nop264 = alloca i1, i1 0
+  %nop265 = alloca i1, i1 0
+  %nop266 = alloca i1, i1 0
+  %nop267 = alloca i1, i1 0
+  %nop268 = alloca i1, i1 0
+  %nop269 = alloca i1, i1 0
+  %nop270 = alloca i1, i1 0
+  %nop271 = alloca i1, i1 0
+  %nop272 = alloca i1, i1 0
+  %nop273 = alloca i1, i1 0
+  %nop274 = alloca i1, i1 0
+  %nop275 = alloca i1, i1 0
+  %nop276 = alloca i1, i1 0
+  %nop277 = alloca i1, i1 0
+  %nop278 = alloca i1, i1 0
+  %nop279 = alloca i1, i1 0
+  %nop280 = alloca i1, i1 0
+  %nop281 = alloca i1, i1 0
+  %nop282 = alloca i1, i1 0
+  %nop283 = alloca i1, i1 0
+  %nop284 = alloca i1, i1 0
+  %nop285 = alloca i1, i1 0
+  %nop286 = alloca i1, i1 0
+  %nop287 = alloca i1, i1 0
+  %nop288 = alloca i1, i1 0
+  %nop289 = alloca i1, i1 0
+  %nop290 = alloca i1, i1 0
+  %nop291 = alloca i1, i1 0
+  %nop292 = alloca i1, i1 0
+  %nop293 = alloca i1, i1 0
+  %nop294 = alloca i1, i1 0
+  %nop295 = alloca i1, i1 0
+  %nop296 = alloca i1, i1 0
+  %nop297 = alloca i1, i1 0
+  %nop298 = alloca i1, i1 0
+  %nop299 = alloca i1, i1 0
+  %nop300 = alloca i1, i1 0
+  %nop301 = alloca i1, i1 0
+  %nop302 = alloca i1, i1 0
+  %nop303 = alloca i1, i1 0
+  %nop304 = alloca i1, i1 0
+  %nop305 = alloca i1, i1 0
+  %nop306 = alloca i1, i1 0
+  %nop307 = alloca i1, i1 0
+  %nop308 = alloca i1, i1 0
+  %nop309 = alloca i1, i1 0
+  %nop310 = alloca i1, i1 0
+  %nop311 = alloca i1, i1 0
+  %nop312 = alloca i1, i1 0
+  %nop313 = alloca i1, i1 0
+  %nop314 = alloca i1, i1 0
+  %nop315 = alloca i1, i1 0
+  %nop316 = alloca i1, i1 0
+  %nop317 = alloca i1, i1 0
+  %nop318 = alloca i1, i1 0
+  %nop319 = alloca i1, i1 0
+  %nop320 = alloca i1, i1 0
+  %nop321 = alloca i1, i1 0
+  %nop322 = alloca i1, i1 0
+  %nop323 = alloca i1, i1 0
+  %nop324 = alloca i1, i1 0
+  %nop325 = alloca i1, i1 0
+  %nop326 = alloca i1, i1 0
+  %nop327 = alloca i1, i1 0
+  %nop328 = alloca i1, i1 0
+  %nop329 = alloca i1, i1 0
+  %nop330 = alloca i1, i1 0
+  %nop331 = alloca i1, i1 0
+  %nop332 = alloca i1, i1 0
+  %nop333 = alloca i1, i1 0
+  %nop334 = alloca i1, i1 0
+  %nop335 = alloca i1, i1 0
+  %nop336 = alloca i1, i1 0
+  %nop337 = alloca i1, i1 0
+  %nop338 = alloca i1, i1 0
+  %nop339 = alloca i1, i1 0
+  %nop340 = alloca i1, i1 0
+  %nop341 = alloca i1, i1 0
+  %nop342 = alloca i1, i1 0
+  %nop343 = alloca i1, i1 0
+  %nop344 = alloca i1, i1 0
+  %nop345 = alloca i1, i1 0
+  %nop346 = alloca i1, i1 0
+  %nop347 = alloca i1, i1 0
+  %nop348 = alloca i1, i1 0
+  %nop349 = alloca i1, i1 0
+  %nop350 = alloca i1, i1 0
+  %nop351 = alloca i1, i1 0
+  %nop352 = alloca i1, i1 0
+  %nop353 = alloca i1, i1 0
+  %nop354 = alloca i1, i1 0
+  %nop355 = alloca i1, i1 0
+  %nop356 = alloca i1, i1 0
+  %nop357 = alloca i1, i1 0
+  %nop358 = alloca i1, i1 0
+  %nop359 = alloca i1, i1 0
+  %nop360 = alloca i1, i1 0
+  %nop361 = alloca i1, i1 0
+  %nop362 = alloca i1, i1 0
+  %nop363 = alloca i1, i1 0
+  %nop364 = alloca i1, i1 0
+  %nop365 = alloca i1, i1 0
+  %nop366 = alloca i1, i1 0
+  %nop367 = alloca i1, i1 0
+  %nop368 = alloca i1, i1 0
+  %nop369 = alloca i1, i1 0
+  %nop370 = alloca i1, i1 0
+  %nop371 = alloca i1, i1 0
+  %nop372 = alloca i1, i1 0
+  %nop373 = alloca i1, i1 0
+  %nop374 = alloca i1, i1 0
+  %nop375 = alloca i1, i1 0
+  %nop376 = alloca i1, i1 0
+  %nop377 = alloca i1, i1 0
+  %nop378 = alloca i1, i1 0
+  %nop379 = alloca i1, i1 0
+  %nop380 = alloca i1, i1 0
+  %nop381 = alloca i1, i1 0
+  %nop382 = alloca i1, i1 0
+  %nop383 = alloca i1, i1 0
+  %nop384 = alloca i1, i1 0
+  %nop385 = alloca i1, i1 0
+  %nop386 = alloca i1, i1 0
+  %nop387 = alloca i1, i1 0
+  %nop388 = alloca i1, i1 0
+  %nop389 = alloca i1, i1 0
+  %nop390 = alloca i1, i1 0
+  %nop391 = alloca i1, i1 0
+  %nop392 = alloca i1, i1 0
+  %nop393 = alloca i1, i1 0
+  %nop394 = alloca i1, i1 0
+  %nop395 = alloca i1, i1 0
+  %nop396 = alloca i1, i1 0
+  %nop397 = alloca i1, i1 0
+  %nop398 = alloca i1, i1 0
+  %nop399 = alloca i1, i1 0
+  %nop400 = alloca i1, i1 0
+  %nop401 = alloca i1, i1 0
+  %nop402 = alloca i1, i1 0
+  %nop403 = alloca i1, i1 0
+  %nop404 = alloca i1, i1 0
+  %nop405 = alloca i1, i1 0
+  %nop406 = alloca i1, i1 0
+  %nop407 = alloca i1, i1 0
+  %nop408 = alloca i1, i1 0
+  %nop409 = alloca i1, i1 0
+  %nop410 = alloca i1, i1 0
+  %nop411 = alloca i1, i1 0
+  %nop412 = alloca i1, i1 0
+  %nop413 = alloca i1, i1 0
+  %nop414 = alloca i1, i1 0
+  %nop415 = alloca i1, i1 0
+  %nop416 = alloca i1, i1 0
+  %nop417 = alloca i1, i1 0
+  %nop418 = alloca i1, i1 0
+  %nop419 = alloca i1, i1 0
+  %nop420 = alloca i1, i1 0
+  %nop421 = alloca i1, i1 0
+  %nop422 = alloca i1, i1 0
+  %nop423 = alloca i1, i1 0
+  %nop424 = alloca i1, i1 0
+  %nop425 = alloca i1, i1 0
+  %nop426 = alloca i1, i1 0
+  %nop427 = alloca i1, i1 0
+  %nop428 = alloca i1, i1 0
+  %nop429 = alloca i1, i1 0
+  %nop430 = alloca i1, i1 0
+  %nop431 = alloca i1, i1 0
+  %nop432 = alloca i1, i1 0
+  %nop433 = alloca i1, i1 0
+  %nop434 = alloca i1, i1 0
+  %nop435 = alloca i1, i1 0
+  %nop436 = alloca i1, i1 0
+  %nop437 = alloca i1, i1 0
+  %nop438 = alloca i1, i1 0
+  %nop439 = alloca i1, i1 0
+  %nop440 = alloca i1, i1 0
+  %nop441 = alloca i1, i1 0
+  %nop442 = alloca i1, i1 0
+  %nop443 = alloca i1, i1 0
+  %nop444 = alloca i1, i1 0
+  %nop445 = alloca i1, i1 0
+  %nop446 = alloca i1, i1 0
+  %nop447 = alloca i1, i1 0
+  %nop448 = alloca i1, i1 0
+  %nop449 = alloca i1, i1 0
+  %nop450 = alloca i1, i1 0
+  %nop451 = alloca i1, i1 0
+  %nop452 = alloca i1, i1 0
+  %nop453 = alloca i1, i1 0
+  %nop454 = alloca i1, i1 0
+  %nop455 = alloca i1, i1 0
+  %nop456 = alloca i1, i1 0
+  %nop457 = alloca i1, i1 0
+  %nop458 = alloca i1, i1 0
+  %nop459 = alloca i1, i1 0
+  %nop460 = alloca i1, i1 0
+  %nop461 = alloca i1, i1 0
+  %nop462 = alloca i1, i1 0
+  %nop463 = alloca i1, i1 0
+  %nop464 = alloca i1, i1 0
+  %nop465 = alloca i1, i1 0
+  %nop466 = alloca i1, i1 0
+  %nop467 = alloca i1, i1 0
+  %nop468 = alloca i1, i1 0
+  %nop469 = alloca i1, i1 0
+  %nop470 = alloca i1, i1 0
+  %nop471 = alloca i1, i1 0
+  %nop472 = alloca i1, i1 0
+  %nop473 = alloca i1, i1 0
+  %nop474 = alloca i1, i1 0
+  %nop475 = alloca i1, i1 0
+  %nop476 = alloca i1, i1 0
+  %nop477 = alloca i1, i1 0
+  %nop478 = alloca i1, i1 0
+  %nop479 = alloca i1, i1 0
+  %nop480 = alloca i1, i1 0
+  %nop481 = alloca i1, i1 0
+  %nop482 = alloca i1, i1 0
+  %nop483 = alloca i1, i1 0
+  %nop484 = alloca i1, i1 0
+  %nop485 = alloca i1, i1 0
+  %nop486 = alloca i1, i1 0
+  %nop487 = alloca i1, i1 0
+  %nop488 = alloca i1, i1 0
+  %nop489 = alloca i1, i1 0
+  %nop490 = alloca i1, i1 0
+  %nop491 = alloca i1, i1 0
+  %nop492 = alloca i1, i1 0
+  %nop493 = alloca i1, i1 0
+  %nop494 = alloca i1, i1 0
+  %nop495 = alloca i1, i1 0
+  %nop496 = alloca i1, i1 0
+  %nop497 = alloca i1, i1 0
+  %nop498 = alloca i1, i1 0
+  %nop499 = alloca i1, i1 0
+  %nop500 = alloca i1, i1 0
+  %nop501 = alloca i1, i1 0
+  %nop502 = alloca i1, i1 0
+  %nop503 = alloca i1, i1 0
+  %nop504 = alloca i1, i1 0
+  %nop505 = alloca i1, i1 0
+  %nop506 = alloca i1, i1 0
+  %nop507 = alloca i1, i1 0
+  %nop508 = alloca i1, i1 0
+  %nop509 = alloca i1, i1 0
+  %nop510 = alloca i1, i1 0
+  %nop511 = alloca i1, i1 0
+  %nop512 = alloca i1, i1 0
+  %nop513 = alloca i1, i1 0
+  %nop514 = alloca i1, i1 0
+  %nop515 = alloca i1, i1 0
+  %nop516 = alloca i1, i1 0
+  %nop517 = alloca i1, i1 0
+  %nop518 = alloca i1, i1 0
+  %nop519 = alloca i1, i1 0
+  %nop520 = alloca i1, i1 0
+  %nop521 = alloca i1, i1 0
+  %nop522 = alloca i1, i1 0
+  %nop523 = alloca i1, i1 0
+  %nop524 = alloca i1, i1 0
+  %nop525 = alloca i1, i1 0
+  %nop526 = alloca i1, i1 0
+  %nop527 = alloca i1, i1 0
+  %nop528 = alloca i1, i1 0
+  %nop529 = alloca i1, i1 0
+  %nop530 = alloca i1, i1 0
+  %nop531 = alloca i1, i1 0
+  %nop532 = alloca i1, i1 0
+  %nop533 = alloca i1, i1 0
+  %nop534 = alloca i1, i1 0
+  %nop535 = alloca i1, i1 0
+  %nop536 = alloca i1, i1 0
+  %nop537 = alloca i1, i1 0
+  %nop538 = alloca i1, i1 0
+  %nop539 = alloca i1, i1 0
+  %nop540 = alloca i1, i1 0
+  %nop541 = alloca i1, i1 0
+  %nop542 = alloca i1, i1 0
+  %nop543 = alloca i1, i1 0
+  %nop544 = alloca i1, i1 0
+  %nop545 = alloca i1, i1 0
+  %nop546 = alloca i1, i1 0
+  %nop547 = alloca i1, i1 0
+  %nop548 = alloca i1, i1 0
+  %nop549 = alloca i1, i1 0
+  %nop550 = alloca i1, i1 0
+  %nop551 = alloca i1, i1 0
+  %nop552 = alloca i1, i1 0
+  %nop553 = alloca i1, i1 0
+  %nop554 = alloca i1, i1 0
+  %nop555 = alloca i1, i1 0
+  %nop556 = alloca i1, i1 0
+  %nop557 = alloca i1, i1 0
+  %nop558 = alloca i1, i1 0
+  %nop559 = alloca i1, i1 0
+  %nop560 = alloca i1, i1 0
+  %nop561 = alloca i1, i1 0
+  %nop562 = alloca i1, i1 0
+  %nop563 = alloca i1, i1 0
+  %nop564 = alloca i1, i1 0
+  %nop565 = alloca i1, i1 0
+  %nop566 = alloca i1, i1 0
+  %nop567 = alloca i1, i1 0
+  %nop568 = alloca i1, i1 0
+  %nop569 = alloca i1, i1 0
+  %nop570 = alloca i1, i1 0
+  %nop571 = alloca i1, i1 0
+  %nop572 = alloca i1, i1 0
+  %nop573 = alloca i1, i1 0
+  %nop574 = alloca i1, i1 0
+  %nop575 = alloca i1, i1 0
+  %nop576 = alloca i1, i1 0
+  %nop577 = alloca i1, i1 0
+  %nop578 = alloca i1, i1 0
+  %nop579 = alloca i1, i1 0
+  %nop580 = alloca i1, i1 0
+  %nop581 = alloca i1, i1 0
+  %nop582 = alloca i1, i1 0
+  %nop583 = alloca i1, i1 0
+  %nop584 = alloca i1, i1 0
+  %nop585 = alloca i1, i1 0
+  %nop586 = alloca i1, i1 0
+  %nop587 = alloca i1, i1 0
+  %nop588 = alloca i1, i1 0
+  %nop589 = alloca i1, i1 0
+  %nop590 = alloca i1, i1 0
+  %nop591 = alloca i1, i1 0
+  %nop592 = alloca i1, i1 0
+  %nop593 = alloca i1, i1 0
+  %nop594 = alloca i1, i1 0
+  %nop595 = alloca i1, i1 0
+  %nop596 = alloca i1, i1 0
+  %nop597 = alloca i1, i1 0
+  %nop598 = alloca i1, i1 0
+  %nop599 = alloca i1, i1 0
+  %nop600 = alloca i1, i1 0
+  %nop601 = alloca i1, i1 0
+  %nop602 = alloca i1, i1 0
+  %nop603 = alloca i1, i1 0
+  %nop604 = alloca i1, i1 0
+  %nop605 = alloca i1, i1 0
+  %nop606 = alloca i1, i1 0
+  %nop607 = alloca i1, i1 0
+  %nop608 = alloca i1, i1 0
+  %nop609 = alloca i1, i1 0
+  %nop610 = alloca i1, i1 0
+  %nop611 = alloca i1, i1 0
+  %nop612 = alloca i1, i1 0
+  %nop613 = alloca i1, i1 0
+  %nop614 = alloca i1, i1 0
+  %nop615 = alloca i1, i1 0
+  %nop616 = alloca i1, i1 0
+  %nop617 = alloca i1, i1 0
+  %nop618 = alloca i1, i1 0
+  %nop619 = alloca i1, i1 0
+  %nop620 = alloca i1, i1 0
+  %nop621 = alloca i1, i1 0
+  %nop622 = alloca i1, i1 0
+  %nop623 = alloca i1, i1 0
+  %nop624 = alloca i1, i1 0
+  %nop625 = alloca i1, i1 0
+  %nop626 = alloca i1, i1 0
+  %nop627 = alloca i1, i1 0
+  %nop628 = alloca i1, i1 0
+  %nop629 = alloca i1, i1 0
+  %nop630 = alloca i1, i1 0
+  %nop631 = alloca i1, i1 0
+  %nop632 = alloca i1, i1 0
+  %nop633 = alloca i1, i1 0
+  %nop634 = alloca i1, i1 0
+  %nop635 = alloca i1, i1 0
+  %nop636 = alloca i1, i1 0
+  %nop637 = alloca i1, i1 0
+  %nop638 = alloca i1, i1 0
+  %nop639 = alloca i1, i1 0
+  %nop640 = alloca i1, i1 0
+  %nop641 = alloca i1, i1 0
+  %nop642 = alloca i1, i1 0
+  %nop643 = alloca i1, i1 0
+  %nop644 = alloca i1, i1 0
+  %nop645 = alloca i1, i1 0
+  %nop646 = alloca i1, i1 0
+  %nop647 = alloca i1, i1 0
+  %nop648 = alloca i1, i1 0
+  %nop649 = alloca i1, i1 0
+  %nop650 = alloca i1, i1 0
+  %nop651 = alloca i1, i1 0
+  %nop652 = alloca i1, i1 0
+  %nop653 = alloca i1, i1 0
+  %nop654 = alloca i1, i1 0
+  %nop655 = alloca i1, i1 0
+  %nop656 = alloca i1, i1 0
+  %nop657 = alloca i1, i1 0
+  %nop658 = alloca i1, i1 0
+  %nop659 = alloca i1, i1 0
+  %nop660 = alloca i1, i1 0
+  %nop661 = alloca i1, i1 0
+  %nop662 = alloca i1, i1 0
+  %nop663 = alloca i1, i1 0
+  %nop664 = alloca i1, i1 0
+  %nop665 = alloca i1, i1 0
+  %nop666 = alloca i1, i1 0
+  %nop667 = alloca i1, i1 0
+  %nop668 = alloca i1, i1 0
+  %nop669 = alloca i1, i1 0
+  %nop670 = alloca i1, i1 0
+  %nop671 = alloca i1, i1 0
+  %nop672 = alloca i1, i1 0
+  %nop673 = alloca i1, i1 0
+  %nop674 = alloca i1, i1 0
+  %nop675 = alloca i1, i1 0
+  %nop676 = alloca i1, i1 0
+  %nop677 = alloca i1, i1 0
+  %nop678 = alloca i1, i1 0
+  %nop679 = alloca i1, i1 0
+  %nop680 = alloca i1, i1 0
+  %nop681 = alloca i1, i1 0
+  %nop682 = alloca i1, i1 0
+  %nop683 = alloca i1, i1 0
+  %nop684 = alloca i1, i1 0
+  %nop685 = alloca i1, i1 0
+  %nop686 = alloca i1, i1 0
+  %nop687 = alloca i1, i1 0
+  %nop688 = alloca i1, i1 0
+  %nop689 = alloca i1, i1 0
+  %nop690 = alloca i1, i1 0
+  %nop691 = alloca i1, i1 0
+  %nop692 = alloca i1, i1 0
+  %nop693 = alloca i1, i1 0
+  %nop694 = alloca i1, i1 0
+  %nop695 = alloca i1, i1 0
+  %nop696 = alloca i1, i1 0
+  %nop697 = alloca i1, i1 0
+  %nop698 = alloca i1, i1 0
+  %nop699 = alloca i1, i1 0
+  %nop700 = alloca i1, i1 0
+  %nop701 = alloca i1, i1 0
+  %nop702 = alloca i1, i1 0
+  %nop703 = alloca i1, i1 0
+  %nop704 = alloca i1, i1 0
+  %nop705 = alloca i1, i1 0
+  %nop706 = alloca i1, i1 0
+  %nop707 = alloca i1, i1 0
+  %nop708 = alloca i1, i1 0
+  %nop709 = alloca i1, i1 0
+  %nop710 = alloca i1, i1 0
+  %nop711 = alloca i1, i1 0
+  %nop712 = alloca i1, i1 0
+  %nop713 = alloca i1, i1 0
+  %nop714 = alloca i1, i1 0
+  %nop715 = alloca i1, i1 0
+  %nop716 = alloca i1, i1 0
+  %nop717 = alloca i1, i1 0
+  %nop718 = alloca i1, i1 0
+  %nop719 = alloca i1, i1 0
+  %nop720 = alloca i1, i1 0
+  %nop721 = alloca i1, i1 0
+  %nop722 = alloca i1, i1 0
+  %nop723 = alloca i1, i1 0
+  %nop724 = alloca i1, i1 0
+  %nop725 = alloca i1, i1 0
+  %nop726 = alloca i1, i1 0
+  %nop727 = alloca i1, i1 0
+  %nop728 = alloca i1, i1 0
+  %nop729 = alloca i1, i1 0
+  %nop730 = alloca i1, i1 0
+  %nop731 = alloca i1, i1 0
+  %nop732 = alloca i1, i1 0
+  %nop733 = alloca i1, i1 0
+  %nop734 = alloca i1, i1 0
+  %nop735 = alloca i1, i1 0
+  %nop736 = alloca i1, i1 0
+  %nop737 = alloca i1, i1 0
+  %nop738 = alloca i1, i1 0
+  %nop739 = alloca i1, i1 0
+  %nop740 = alloca i1, i1 0
+  %nop741 = alloca i1, i1 0
+  %nop742 = alloca i1, i1 0
+  %nop743 = alloca i1, i1 0
+  %nop744 = alloca i1, i1 0
+  %nop745 = alloca i1, i1 0
+  %nop746 = alloca i1, i1 0
+  %nop747 = alloca i1, i1 0
+  %nop748 = alloca i1, i1 0
+  %nop749 = alloca i1, i1 0
+  %nop750 = alloca i1, i1 0
+  %nop751 = alloca i1, i1 0
+  %nop752 = alloca i1, i1 0
+  %nop753 = alloca i1, i1 0
+  %nop754 = alloca i1, i1 0
+  %nop755 = alloca i1, i1 0
+  %nop756 = alloca i1, i1 0
+  %nop757 = alloca i1, i1 0
+  %nop758 = alloca i1, i1 0
+  %nop759 = alloca i1, i1 0
+  %nop760 = alloca i1, i1 0
+  %nop761 = alloca i1, i1 0
+  %nop762 = alloca i1, i1 0
+  %nop763 = alloca i1, i1 0
+  %nop764 = alloca i1, i1 0
+  %nop765 = alloca i1, i1 0
+  %nop766 = alloca i1, i1 0
+  %nop767 = alloca i1, i1 0
+  %nop768 = alloca i1, i1 0
+  %nop769 = alloca i1, i1 0
+  %nop770 = alloca i1, i1 0
+  %nop771 = alloca i1, i1 0
+  %nop772 = alloca i1, i1 0
+  %nop773 = alloca i1, i1 0
+  %nop774 = alloca i1, i1 0
+  %nop775 = alloca i1, i1 0
+  %nop776 = alloca i1, i1 0
+  %nop777 = alloca i1, i1 0
+  %nop778 = alloca i1, i1 0
+  %nop779 = alloca i1, i1 0
+  %nop780 = alloca i1, i1 0
+  %nop781 = alloca i1, i1 0
+  %nop782 = alloca i1, i1 0
+  %nop783 = alloca i1, i1 0
+  %nop784 = alloca i1, i1 0
+  %nop785 = alloca i1, i1 0
+  %nop786 = alloca i1, i1 0
+  %nop787 = alloca i1, i1 0
+  %nop788 = alloca i1, i1 0
+  %nop789 = alloca i1, i1 0
+  %nop790 = alloca i1, i1 0
+  %nop791 = alloca i1, i1 0
+  %nop792 = alloca i1, i1 0
+  %nop793 = alloca i1, i1 0
+  %nop794 = alloca i1, i1 0
+  %nop795 = alloca i1, i1 0
+  %nop796 = alloca i1, i1 0
+  %nop797 = alloca i1, i1 0
+  %nop798 = alloca i1, i1 0
+  %nop799 = alloca i1, i1 0
+  %nop800 = alloca i1, i1 0
+  %nop801 = alloca i1, i1 0
+  %nop802 = alloca i1, i1 0
+  %nop803 = alloca i1, i1 0
+  %nop804 = alloca i1, i1 0
+  %nop805 = alloca i1, i1 0
+  %nop806 = alloca i1, i1 0
+  %nop807 = alloca i1, i1 0
+  %nop808 = alloca i1, i1 0
+  %nop809 = alloca i1, i1 0
+  %nop810 = alloca i1, i1 0
+  %nop811 = alloca i1, i1 0
+  %nop812 = alloca i1, i1 0
+  %nop813 = alloca i1, i1 0
+  %nop814 = alloca i1, i1 0
+  %nop815 = alloca i1, i1 0
+  %nop816 = alloca i1, i1 0
+  %nop817 = alloca i1, i1 0
+  %nop818 = alloca i1, i1 0
+  %nop819 = alloca i1, i1 0
+  %nop820 = alloca i1, i1 0
+  %nop821 = alloca i1, i1 0
+  %nop822 = alloca i1, i1 0
+  %nop823 = alloca i1, i1 0
+  %nop824 = alloca i1, i1 0
+  %nop825 = alloca i1, i1 0
+  %nop826 = alloca i1, i1 0
+  %nop827 = alloca i1, i1 0
+  %nop828 = alloca i1, i1 0
+  %nop829 = alloca i1, i1 0
+  %nop830 = alloca i1, i1 0
+  %nop831 = alloca i1, i1 0
+  %nop832 = alloca i1, i1 0
+  %nop833 = alloca i1, i1 0
+  %nop834 = alloca i1, i1 0
+  %nop835 = alloca i1, i1 0
+  %nop836 = alloca i1, i1 0
+  %nop837 = alloca i1, i1 0
+  %nop838 = alloca i1, i1 0
+  %nop839 = alloca i1, i1 0
+  %nop840 = alloca i1, i1 0
+  %nop841 = alloca i1, i1 0
+  %nop842 = alloca i1, i1 0
+  %nop843 = alloca i1, i1 0
+  %nop844 = alloca i1, i1 0
+  %nop845 = alloca i1, i1 0
+  %nop846 = alloca i1, i1 0
+  %nop847 = alloca i1, i1 0
+  %nop848 = alloca i1, i1 0
+  %nop849 = alloca i1, i1 0
+  %nop850 = alloca i1, i1 0
+  %nop851 = alloca i1, i1 0
+  %nop852 = alloca i1, i1 0
+  %nop853 = alloca i1, i1 0
+  %nop854 = alloca i1, i1 0
+  %nop855 = alloca i1, i1 0
+  %nop856 = alloca i1, i1 0
+  %nop857 = alloca i1, i1 0
+  %nop858 = alloca i1, i1 0
+  %nop859 = alloca i1, i1 0
+  %nop860 = alloca i1, i1 0
+  %nop861 = alloca i1, i1 0
+  %nop862 = alloca i1, i1 0
+  %nop863 = alloca i1, i1 0
+  %nop864 = alloca i1, i1 0
+  %nop865 = alloca i1, i1 0
+  %nop866 = alloca i1, i1 0
+  %nop867 = alloca i1, i1 0
+  %nop868 = alloca i1, i1 0
+  %nop869 = alloca i1, i1 0
+  %nop870 = alloca i1, i1 0
+  %nop871 = alloca i1, i1 0
+  %nop872 = alloca i1, i1 0
+  %nop873 = alloca i1, i1 0
+  %nop874 = alloca i1, i1 0
+  %nop875 = alloca i1, i1 0
+  %nop876 = alloca i1, i1 0
+  %nop877 = alloca i1, i1 0
+  %nop878 = alloca i1, i1 0
+  %nop879 = alloca i1, i1 0
+  %nop880 = alloca i1, i1 0
+  %nop881 = alloca i1, i1 0
+  %nop882 = alloca i1, i1 0
+  %nop883 = alloca i1, i1 0
+  %nop884 = alloca i1, i1 0
+  %nop885 = alloca i1, i1 0
+  %nop886 = alloca i1, i1 0
+  %nop887 = alloca i1, i1 0
+  %nop888 = alloca i1, i1 0
+  %nop889 = alloca i1, i1 0
+  %nop890 = alloca i1, i1 0
+  %nop891 = alloca i1, i1 0
+  %nop892 = alloca i1, i1 0
+  %nop893 = alloca i1, i1 0
+  %nop894 = alloca i1, i1 0
+  %nop895 = alloca i1, i1 0
+  %nop896 = alloca i1, i1 0
+  %nop897 = alloca i1, i1 0
+  %nop898 = alloca i1, i1 0
+  %nop899 = alloca i1, i1 0
+  %nop900 = alloca i1, i1 0
+  %nop901 = alloca i1, i1 0
+  %nop902 = alloca i1, i1 0
+  %nop903 = alloca i1, i1 0
+  %nop904 = alloca i1, i1 0
+  %nop905 = alloca i1, i1 0
+  %nop906 = alloca i1, i1 0
+  %nop907 = alloca i1, i1 0
+  %nop908 = alloca i1, i1 0
+  %nop909 = alloca i1, i1 0
+  %nop910 = alloca i1, i1 0
+  %nop911 = alloca i1, i1 0
+  %nop912 = alloca i1, i1 0
+  %nop913 = alloca i1, i1 0
+  %nop914 = alloca i1, i1 0
+  %nop915 = alloca i1, i1 0
+  %nop916 = alloca i1, i1 0
+  %nop917 = alloca i1, i1 0
+  %nop918 = alloca i1, i1 0
+  %nop919 = alloca i1, i1 0
+  %nop920 = alloca i1, i1 0
+  %nop921 = alloca i1, i1 0
+  %nop922 = alloca i1, i1 0
+  %nop923 = alloca i1, i1 0
+  %nop924 = alloca i1, i1 0
+  %nop925 = alloca i1, i1 0
+  %nop926 = alloca i1, i1 0
+  %nop927 = alloca i1, i1 0
+  %nop928 = alloca i1, i1 0
+  %nop929 = alloca i1, i1 0
+  %nop930 = alloca i1, i1 0
+  %nop931 = alloca i1, i1 0
+  %nop932 = alloca i1, i1 0
+  %nop933 = alloca i1, i1 0
+  %nop934 = alloca i1, i1 0
+  %nop935 = alloca i1, i1 0
+  %nop936 = alloca i1, i1 0
+  %nop937 = alloca i1, i1 0
+  %nop938 = alloca i1, i1 0
+  %nop939 = alloca i1, i1 0
+  %nop940 = alloca i1, i1 0
+  %nop941 = alloca i1, i1 0
+  %nop942 = alloca i1, i1 0
+  %nop943 = alloca i1, i1 0
+  %nop944 = alloca i1, i1 0
+  %nop945 = alloca i1, i1 0
+  %nop946 = alloca i1, i1 0
+  %nop947 = alloca i1, i1 0
+  %nop948 = alloca i1, i1 0
+  %nop949 = alloca i1, i1 0
+  %nop950 = alloca i1, i1 0
+  %nop951 = alloca i1, i1 0
+  %nop952 = alloca i1, i1 0
+  %nop953 = alloca i1, i1 0
+  %nop954 = alloca i1, i1 0
+  %nop955 = alloca i1, i1 0
+  %nop956 = alloca i1, i1 0
+  %nop957 = alloca i1, i1 0
+  %nop958 = alloca i1, i1 0
+  %nop959 = alloca i1, i1 0
+  %nop960 = alloca i1, i1 0
+  %nop961 = alloca i1, i1 0
+  %nop962 = alloca i1, i1 0
+  %nop963 = alloca i1, i1 0
+  %nop964 = alloca i1, i1 0
+  %nop965 = alloca i1, i1 0
+  %nop966 = alloca i1, i1 0
+  %nop967 = alloca i1, i1 0
+  %nop968 = alloca i1, i1 0
+  %nop969 = alloca i1, i1 0
+  %nop970 = alloca i1, i1 0
+  %nop971 = alloca i1, i1 0
+  %nop972 = alloca i1, i1 0
+  %nop973 = alloca i1, i1 0
+  %nop974 = alloca i1, i1 0
+  %nop975 = alloca i1, i1 0
+  %nop976 = alloca i1, i1 0
+  %nop977 = alloca i1, i1 0
+  %nop978 = alloca i1, i1 0
+  %nop979 = alloca i1, i1 0
+  %nop980 = alloca i1, i1 0
+  %nop981 = alloca i1, i1 0
+  %nop982 = alloca i1, i1 0
+  %nop983 = alloca i1, i1 0
+  %nop984 = alloca i1, i1 0
+  %nop985 = alloca i1, i1 0
+  %nop986 = alloca i1, i1 0
+  %nop987 = alloca i1, i1 0
+  %nop988 = alloca i1, i1 0
+  %nop989 = alloca i1, i1 0
+  %nop990 = alloca i1, i1 0
+  %nop991 = alloca i1, i1 0
+  %nop992 = alloca i1, i1 0
+  %nop993 = alloca i1, i1 0
+  %nop994 = alloca i1, i1 0
+  %nop995 = alloca i1, i1 0
+  %nop996 = alloca i1, i1 0
+  %nop997 = alloca i1, i1 0
+  %nop998 = alloca i1, i1 0
+  %nop999 = alloca i1, i1 0
+  %nop1000 = alloca i1, i1 0
+  %nop1001 = alloca i1, i1 0
+  %nop1002 = alloca i1, i1 0
+  %nop1003 = alloca i1, i1 0
+  %nop1004 = alloca i1, i1 0
+  %nop1005 = alloca i1, i1 0
+  %nop1006 = alloca i1, i1 0
+  %nop1007 = alloca i1, i1 0
+  %nop1008 = alloca i1, i1 0
+  %nop1009 = alloca i1, i1 0
+  %nop1010 = alloca i1, i1 0
+  %nop1011 = alloca i1, i1 0
+  %nop1012 = alloca i1, i1 0
+  %nop1013 = alloca i1, i1 0
+  %nop1014 = alloca i1, i1 0
+  %nop1015 = alloca i1, i1 0
+  %nop1016 = alloca i1, i1 0
+  %nop1017 = alloca i1, i1 0
+  %nop1018 = alloca i1, i1 0
+  %nop1019 = alloca i1, i1 0
+  %nop1020 = alloca i1, i1 0
+  %nop1021 = alloca i1, i1 0
+  %nop1022 = alloca i1, i1 0
+  %nop1023 = alloca i1, i1 0
+  %nop1024 = alloca i1, i1 0
+  %nop1025 = alloca i1, i1 0
+  %nop1026 = alloca i1, i1 0
+  %nop1027 = alloca i1, i1 0
+  %nop1028 = alloca i1, i1 0
+  %nop1029 = alloca i1, i1 0
+  %nop1030 = alloca i1, i1 0
+  %nop1031 = alloca i1, i1 0
+  %nop1032 = alloca i1, i1 0
+  %nop1033 = alloca i1, i1 0
+  %nop1034 = alloca i1, i1 0
+  %nop1035 = alloca i1, i1 0
+  %nop1036 = alloca i1, i1 0
+  %nop1037 = alloca i1, i1 0
+  %nop1038 = alloca i1, i1 0
+  %nop1039 = alloca i1, i1 0
+  %nop1040 = alloca i1, i1 0
+  %nop1041 = alloca i1, i1 0
+  %nop1042 = alloca i1, i1 0
+  %nop1043 = alloca i1, i1 0
+  %nop1044 = alloca i1, i1 0
+  %nop1045 = alloca i1, i1 0
+  %nop1046 = alloca i1, i1 0
+  %nop1047 = alloca i1, i1 0
+  %nop1048 = alloca i1, i1 0
+  %nop1049 = alloca i1, i1 0
+  %nop1050 = alloca i1, i1 0
+  %nop1051 = alloca i1, i1 0
+  %nop1052 = alloca i1, i1 0
+  %nop1053 = alloca i1, i1 0
+  %nop1054 = alloca i1, i1 0
+  %nop1055 = alloca i1, i1 0
+  %nop1056 = alloca i1, i1 0
+  %nop1057 = alloca i1, i1 0
+  %nop1058 = alloca i1, i1 0
+  %nop1059 = alloca i1, i1 0
+  %nop1060 = alloca i1, i1 0
+  %nop1061 = alloca i1, i1 0
+  %nop1062 = alloca i1, i1 0
+  %nop1063 = alloca i1, i1 0
+  %nop1064 = alloca i1, i1 0
+  %nop1065 = alloca i1, i1 0
+  %nop1066 = alloca i1, i1 0
+  %nop1067 = alloca i1, i1 0
+  %nop1068 = alloca i1, i1 0
+  %nop1069 = alloca i1, i1 0
+  %nop1070 = alloca i1, i1 0
+  %nop1071 = alloca i1, i1 0
+  %nop1072 = alloca i1, i1 0
+  %nop1073 = alloca i1, i1 0
+  %nop1074 = alloca i1, i1 0
+  %nop1075 = alloca i1, i1 0
+  %nop1076 = alloca i1, i1 0
+  %nop1077 = alloca i1, i1 0
+  %nop1078 = alloca i1, i1 0
+  %nop1079 = alloca i1, i1 0
+  %nop1080 = alloca i1, i1 0
+  %nop1081 = alloca i1, i1 0
+  %nop1082 = alloca i1, i1 0
+  %nop1083 = alloca i1, i1 0
+  %nop1084 = alloca i1, i1 0
+  %nop1085 = alloca i1, i1 0
+  %nop1086 = alloca i1, i1 0
+  %nop1087 = alloca i1, i1 0
+  %nop1088 = alloca i1, i1 0
+  %nop1089 = alloca i1, i1 0
+  %nop1090 = alloca i1, i1 0
+  %nop1091 = alloca i1, i1 0
+  %nop1092 = alloca i1, i1 0
+  %nop1093 = alloca i1, i1 0
+  %nop1094 = alloca i1, i1 0
+  %nop1095 = alloca i1, i1 0
+  %nop1096 = alloca i1, i1 0
+  %nop1097 = alloca i1, i1 0
+  %nop1098 = alloca i1, i1 0
+  %nop1099 = alloca i1, i1 0
+  %nop1100 = alloca i1, i1 0
+  %nop1101 = alloca i1, i1 0
+  %nop1102 = alloca i1, i1 0
+  %nop1103 = alloca i1, i1 0
+  %nop1104 = alloca i1, i1 0
+  %nop1105 = alloca i1, i1 0
+  %nop1106 = alloca i1, i1 0
+  %nop1107 = alloca i1, i1 0
+  %nop1108 = alloca i1, i1 0
+  %nop1109 = alloca i1, i1 0
+  %nop1110 = alloca i1, i1 0
+  %nop1111 = alloca i1, i1 0
+  %nop1112 = alloca i1, i1 0
+  %nop1113 = alloca i1, i1 0
+  %nop1114 = alloca i1, i1 0
+  %nop1115 = alloca i1, i1 0
+  %nop1116 = alloca i1, i1 0
+  %nop1117 = alloca i1, i1 0
+  %nop1118 = alloca i1, i1 0
+  %nop1119 = alloca i1, i1 0
+  %nop1120 = alloca i1, i1 0
+  %nop1121 = alloca i1, i1 0
+  %nop1122 = alloca i1, i1 0
+  %nop1123 = alloca i1, i1 0
+  %nop1124 = alloca i1, i1 0
+  %nop1125 = alloca i1, i1 0
+  %nop1126 = alloca i1, i1 0
+  %nop1127 = alloca i1, i1 0
+  %nop1128 = alloca i1, i1 0
+  %nop1129 = alloca i1, i1 0
+  %nop1130 = alloca i1, i1 0
+  %nop1131 = alloca i1, i1 0
+  %nop1132 = alloca i1, i1 0
+  %nop1133 = alloca i1, i1 0
+  %nop1134 = alloca i1, i1 0
+  %nop1135 = alloca i1, i1 0
+  %nop1136 = alloca i1, i1 0
+  %nop1137 = alloca i1, i1 0
+  %nop1138 = alloca i1, i1 0
+  %nop1139 = alloca i1, i1 0
+  %nop1140 = alloca i1, i1 0
+  %nop1141 = alloca i1, i1 0
+  %nop1142 = alloca i1, i1 0
+  %nop1143 = alloca i1, i1 0
+  %nop1144 = alloca i1, i1 0
+  %nop1145 = alloca i1, i1 0
+  %nop1146 = alloca i1, i1 0
+  %nop1147 = alloca i1, i1 0
+  %nop1148 = alloca i1, i1 0
+  %nop1149 = alloca i1, i1 0
+  %nop1150 = alloca i1, i1 0
+  %nop1151 = alloca i1, i1 0
+  %nop1152 = alloca i1, i1 0
+  %nop1153 = alloca i1, i1 0
+  %nop1154 = alloca i1, i1 0
+  %nop1155 = alloca i1, i1 0
+  %nop1156 = alloca i1, i1 0
+  %nop1157 = alloca i1, i1 0
+  %nop1158 = alloca i1, i1 0
+  %nop1159 = alloca i1, i1 0
+  %nop1160 = alloca i1, i1 0
+  %nop1161 = alloca i1, i1 0
+  %nop1162 = alloca i1, i1 0
+  %nop1163 = alloca i1, i1 0
+  %nop1164 = alloca i1, i1 0
+  %nop1165 = alloca i1, i1 0
+  %nop1166 = alloca i1, i1 0
+  %nop1167 = alloca i1, i1 0
+  %nop1168 = alloca i1, i1 0
+  %nop1169 = alloca i1, i1 0
+  %nop1170 = alloca i1, i1 0
+  %nop1171 = alloca i1, i1 0
+  %nop1172 = alloca i1, i1 0
+  %nop1173 = alloca i1, i1 0
+  %nop1174 = alloca i1, i1 0
+  %nop1175 = alloca i1, i1 0
+  %nop1176 = alloca i1, i1 0
+  %nop1177 = alloca i1, i1 0
+  %nop1178 = alloca i1, i1 0
+  %nop1179 = alloca i1, i1 0
+  %nop1180 = alloca i1, i1 0
+  %nop1181 = alloca i1, i1 0
+  %nop1182 = alloca i1, i1 0
+  %nop1183 = alloca i1, i1 0
+  %nop1184 = alloca i1, i1 0
+  %nop1185 = alloca i1, i1 0
+  %nop1186 = alloca i1, i1 0
+  %nop1187 = alloca i1, i1 0
+  %nop1188 = alloca i1, i1 0
+  %nop1189 = alloca i1, i1 0
+  %nop1190 = alloca i1, i1 0
+  %nop1191 = alloca i1, i1 0
+  %nop1192 = alloca i1, i1 0
+  %nop1193 = alloca i1, i1 0
+  %nop1194 = alloca i1, i1 0
+  %nop1195 = alloca i1, i1 0
+  %nop1196 = alloca i1, i1 0
+  %nop1197 = alloca i1, i1 0
+  %nop1198 = alloca i1, i1 0
+  %nop1199 = alloca i1, i1 0
+  %nop1200 = alloca i1, i1 0
+  %nop1201 = alloca i1, i1 0
+  %nop1202 = alloca i1, i1 0
+  %nop1203 = alloca i1, i1 0
+  %nop1204 = alloca i1, i1 0
+  %nop1205 = alloca i1, i1 0
+  %nop1206 = alloca i1, i1 0
+  %nop1207 = alloca i1, i1 0
+  %nop1208 = alloca i1, i1 0
+  %nop1209 = alloca i1, i1 0
+  %nop1210 = alloca i1, i1 0
+  %nop1211 = alloca i1, i1 0
+  %nop1212 = alloca i1, i1 0
+  %nop1213 = alloca i1, i1 0
+  %nop1214 = alloca i1, i1 0
+  %nop1215 = alloca i1, i1 0
+  %nop1216 = alloca i1, i1 0
+  %nop1217 = alloca i1, i1 0
+  %nop1218 = alloca i1, i1 0
+  %nop1219 = alloca i1, i1 0
+  %nop1220 = alloca i1, i1 0
+  %nop1221 = alloca i1, i1 0
+  %nop1222 = alloca i1, i1 0
+  %nop1223 = alloca i1, i1 0
+  %nop1224 = alloca i1, i1 0
+  %nop1225 = alloca i1, i1 0
+  %nop1226 = alloca i1, i1 0
+  %nop1227 = alloca i1, i1 0
+  %nop1228 = alloca i1, i1 0
+  %nop1229 = alloca i1, i1 0
+  %nop1230 = alloca i1, i1 0
+  %nop1231 = alloca i1, i1 0
+  %nop1232 = alloca i1, i1 0
+  %nop1233 = alloca i1, i1 0
+  %nop1234 = alloca i1, i1 0
+  %nop1235 = alloca i1, i1 0
+  %nop1236 = alloca i1, i1 0
+  %nop1237 = alloca i1, i1 0
+  %nop1238 = alloca i1, i1 0
+  %nop1239 = alloca i1, i1 0
+  %nop1240 = alloca i1, i1 0
+  %nop1241 = alloca i1, i1 0
+  %nop1242 = alloca i1, i1 0
+  %nop1243 = alloca i1, i1 0
+  %nop1244 = alloca i1, i1 0
+  %nop1245 = alloca i1, i1 0
+  %nop1246 = alloca i1, i1 0
+  %nop1247 = alloca i1, i1 0
+  %nop1248 = alloca i1, i1 0
+  %nop1249 = alloca i1, i1 0
+  %nop1250 = alloca i1, i1 0
+  %nop1251 = alloca i1, i1 0
+  %nop1252 = alloca i1, i1 0
+  %nop1253 = alloca i1, i1 0
+  %nop1254 = alloca i1, i1 0
+  %nop1255 = alloca i1, i1 0
+  %nop1256 = alloca i1, i1 0
+  %nop1257 = alloca i1, i1 0
+  %nop1258 = alloca i1, i1 0
+  %nop1259 = alloca i1, i1 0
+  %nop1260 = alloca i1, i1 0
+  %nop1261 = alloca i1, i1 0
+  %nop1262 = alloca i1, i1 0
+  %nop1263 = alloca i1, i1 0
+  %nop1264 = alloca i1, i1 0
+  %nop1265 = alloca i1, i1 0
+  %nop1266 = alloca i1, i1 0
+  %nop1267 = alloca i1, i1 0
+  %nop1268 = alloca i1, i1 0
+  %nop1269 = alloca i1, i1 0
+  %nop1270 = alloca i1, i1 0
+  %nop1271 = alloca i1, i1 0
+  %nop1272 = alloca i1, i1 0
+  %nop1273 = alloca i1, i1 0
+  %nop1274 = alloca i1, i1 0
+  %nop1275 = alloca i1, i1 0
+  %nop1276 = alloca i1, i1 0
+  %nop1277 = alloca i1, i1 0
+  %nop1278 = alloca i1, i1 0
+  %nop1279 = alloca i1, i1 0
+  %nop1280 = alloca i1, i1 0
+  %nop1281 = alloca i1, i1 0
+  %nop1282 = alloca i1, i1 0
+  %nop1283 = alloca i1, i1 0
+  %nop1284 = alloca i1, i1 0
+  %nop1285 = alloca i1, i1 0
+  %nop1286 = alloca i1, i1 0
+  %nop1287 = alloca i1, i1 0
+  %nop1288 = alloca i1, i1 0
+  %nop1289 = alloca i1, i1 0
+  %nop1290 = alloca i1, i1 0
+  %nop1291 = alloca i1, i1 0
+  %nop1292 = alloca i1, i1 0
+  %nop1293 = alloca i1, i1 0
+  %nop1294 = alloca i1, i1 0
+  %nop1295 = alloca i1, i1 0
+  %nop1296 = alloca i1, i1 0
+  %nop1297 = alloca i1, i1 0
+  %nop1298 = alloca i1, i1 0
+  %nop1299 = alloca i1, i1 0
+  %nop1300 = alloca i1, i1 0
+  %nop1301 = alloca i1, i1 0
+  %nop1302 = alloca i1, i1 0
+  %nop1303 = alloca i1, i1 0
+  %nop1304 = alloca i1, i1 0
+  %nop1305 = alloca i1, i1 0
+  %nop1306 = alloca i1, i1 0
+  %nop1307 = alloca i1, i1 0
+  %nop1308 = alloca i1, i1 0
+  %nop1309 = alloca i1, i1 0
+  %nop1310 = alloca i1, i1 0
+  %nop1311 = alloca i1, i1 0
+  %nop1312 = alloca i1, i1 0
+  %nop1313 = alloca i1, i1 0
+  %nop1314 = alloca i1, i1 0
+  %nop1315 = alloca i1, i1 0
+  %nop1316 = alloca i1, i1 0
+  %nop1317 = alloca i1, i1 0
+  %nop1318 = alloca i1, i1 0
+  %nop1319 = alloca i1, i1 0
+  %nop1320 = alloca i1, i1 0
+  %nop1321 = alloca i1, i1 0
+  %nop1322 = alloca i1, i1 0
+  %nop1323 = alloca i1, i1 0
+  %nop1324 = alloca i1, i1 0
+  %nop1325 = alloca i1, i1 0
+  %nop1326 = alloca i1, i1 0
+  %nop1327 = alloca i1, i1 0
+  %nop1328 = alloca i1, i1 0
+  %nop1329 = alloca i1, i1 0
+  %nop1330 = alloca i1, i1 0
+  %nop1331 = alloca i1, i1 0
+  %nop1332 = alloca i1, i1 0
+  %nop1333 = alloca i1, i1 0
+  %nop1334 = alloca i1, i1 0
+  %nop1335 = alloca i1, i1 0
+  %nop1336 = alloca i1, i1 0
+  %nop1337 = alloca i1, i1 0
+  %nop1338 = alloca i1, i1 0
+  %nop1339 = alloca i1, i1 0
+  %nop1340 = alloca i1, i1 0
+  %nop1341 = alloca i1, i1 0
+  %nop1342 = alloca i1, i1 0
+  %nop1343 = alloca i1, i1 0
+  %nop1344 = alloca i1, i1 0
+  %nop1345 = alloca i1, i1 0
+  %nop1346 = alloca i1, i1 0
+  %nop1347 = alloca i1, i1 0
+  %nop1348 = alloca i1, i1 0
+  %nop1349 = alloca i1, i1 0
+  %nop1350 = alloca i1, i1 0
+  %nop1351 = alloca i1, i1 0
+  %nop1352 = alloca i1, i1 0
+  %nop1353 = alloca i1, i1 0
+  %nop1354 = alloca i1, i1 0
+  %nop1355 = alloca i1, i1 0
+  %nop1356 = alloca i1, i1 0
+  %nop1357 = alloca i1, i1 0
+  %nop1358 = alloca i1, i1 0
+  %nop1359 = alloca i1, i1 0
+  %nop1360 = alloca i1, i1 0
+  %nop1361 = alloca i1, i1 0
+  %nop1362 = alloca i1, i1 0
+  %nop1363 = alloca i1, i1 0
+  %nop1364 = alloca i1, i1 0
+  %nop1365 = alloca i1, i1 0
+  %nop1366 = alloca i1, i1 0
+  %nop1367 = alloca i1, i1 0
+  %nop1368 = alloca i1, i1 0
+  %nop1369 = alloca i1, i1 0
+  %nop1370 = alloca i1, i1 0
+  %nop1371 = alloca i1, i1 0
+  %nop1372 = alloca i1, i1 0
+  %nop1373 = alloca i1, i1 0
+  %nop1374 = alloca i1, i1 0
+  %nop1375 = alloca i1, i1 0
+  %nop1376 = alloca i1, i1 0
+  %nop1377 = alloca i1, i1 0
+  %nop1378 = alloca i1, i1 0
+  %nop1379 = alloca i1, i1 0
+  %nop1380 = alloca i1, i1 0
+  %nop1381 = alloca i1, i1 0
+  %nop1382 = alloca i1, i1 0
+  %nop1383 = alloca i1, i1 0
+  %nop1384 = alloca i1, i1 0
+  %nop1385 = alloca i1, i1 0
+  %nop1386 = alloca i1, i1 0
+  %nop1387 = alloca i1, i1 0
+  %nop1388 = alloca i1, i1 0
+  %nop1389 = alloca i1, i1 0
+  %nop1390 = alloca i1, i1 0
+  %nop1391 = alloca i1, i1 0
+  %nop1392 = alloca i1, i1 0
+  %nop1393 = alloca i1, i1 0
+  %nop1394 = alloca i1, i1 0
+  %nop1395 = alloca i1, i1 0
+  %nop1396 = alloca i1, i1 0
+  %nop1397 = alloca i1, i1 0
+  %nop1398 = alloca i1, i1 0
+  %nop1399 = alloca i1, i1 0
+  %nop1400 = alloca i1, i1 0
+  %nop1401 = alloca i1, i1 0
+  %nop1402 = alloca i1, i1 0
+  %nop1403 = alloca i1, i1 0
+  %nop1404 = alloca i1, i1 0
+  %nop1405 = alloca i1, i1 0
+  %nop1406 = alloca i1, i1 0
+  %nop1407 = alloca i1, i1 0
+  %nop1408 = alloca i1, i1 0
+  %nop1409 = alloca i1, i1 0
+  %nop1410 = alloca i1, i1 0
+  %nop1411 = alloca i1, i1 0
+  %nop1412 = alloca i1, i1 0
+  %nop1413 = alloca i1, i1 0
+  %nop1414 = alloca i1, i1 0
+  %nop1415 = alloca i1, i1 0
+  %nop1416 = alloca i1, i1 0
+  %nop1417 = alloca i1, i1 0
+  %nop1418 = alloca i1, i1 0
+  %nop1419 = alloca i1, i1 0
+  %nop1420 = alloca i1, i1 0
+  %nop1421 = alloca i1, i1 0
+  %nop1422 = alloca i1, i1 0
+  %nop1423 = alloca i1, i1 0
+  %nop1424 = alloca i1, i1 0
+  %nop1425 = alloca i1, i1 0
+  %nop1426 = alloca i1, i1 0
+  %nop1427 = alloca i1, i1 0
+  %nop1428 = alloca i1, i1 0
+  %nop1429 = alloca i1, i1 0
+  %nop1430 = alloca i1, i1 0
+  %nop1431 = alloca i1, i1 0
+  %nop1432 = alloca i1, i1 0
+  %nop1433 = alloca i1, i1 0
+  %nop1434 = alloca i1, i1 0
+  %nop1435 = alloca i1, i1 0
+  %nop1436 = alloca i1, i1 0
+  %nop1437 = alloca i1, i1 0
+  %nop1438 = alloca i1, i1 0
+  %nop1439 = alloca i1, i1 0
+  %nop1440 = alloca i1, i1 0
+  %nop1441 = alloca i1, i1 0
+  %nop1442 = alloca i1, i1 0
+  %nop1443 = alloca i1, i1 0
+  %nop1444 = alloca i1, i1 0
+  %nop1445 = alloca i1, i1 0
+  %nop1446 = alloca i1, i1 0
+  %nop1447 = alloca i1, i1 0
+  %nop1448 = alloca i1, i1 0
+  %nop1449 = alloca i1, i1 0
+  %nop1450 = alloca i1, i1 0
+  %nop1451 = alloca i1, i1 0
+  %nop1452 = alloca i1, i1 0
+  %nop1453 = alloca i1, i1 0
+  %nop1454 = alloca i1, i1 0
+  %nop1455 = alloca i1, i1 0
+  %nop1456 = alloca i1, i1 0
+  %nop1457 = alloca i1, i1 0
+  %nop1458 = alloca i1, i1 0
+  %nop1459 = alloca i1, i1 0
+  %nop1460 = alloca i1, i1 0
+  %nop1461 = alloca i1, i1 0
+  %nop1462 = alloca i1, i1 0
+  %nop1463 = alloca i1, i1 0
+  %nop1464 = alloca i1, i1 0
+  %nop1465 = alloca i1, i1 0
+  %nop1466 = alloca i1, i1 0
+  %nop1467 = alloca i1, i1 0
+  %nop1468 = alloca i1, i1 0
+  %nop1469 = alloca i1, i1 0
+  %nop1470 = alloca i1, i1 0
+  %nop1471 = alloca i1, i1 0
+  %nop1472 = alloca i1, i1 0
+  %nop1473 = alloca i1, i1 0
+  %nop1474 = alloca i1, i1 0
+  %nop1475 = alloca i1, i1 0
+  %nop1476 = alloca i1, i1 0
+  %nop1477 = alloca i1, i1 0
+  %nop1478 = alloca i1, i1 0
+  %nop1479 = alloca i1, i1 0
+  %nop1480 = alloca i1, i1 0
+  %nop1481 = alloca i1, i1 0
+  %nop1482 = alloca i1, i1 0
+  %nop1483 = alloca i1, i1 0
+  %nop1484 = alloca i1, i1 0
+  %nop1485 = alloca i1, i1 0
+  %nop1486 = alloca i1, i1 0
+  %nop1487 = alloca i1, i1 0
+  %nop1488 = alloca i1, i1 0
+  %nop1489 = alloca i1, i1 0
+  %nop1490 = alloca i1, i1 0
+  %nop1491 = alloca i1, i1 0
+  %nop1492 = alloca i1, i1 0
+  %nop1493 = alloca i1, i1 0
+  %nop1494 = alloca i1, i1 0
+  %nop1495 = alloca i1, i1 0
+  %nop1496 = alloca i1, i1 0
+  %nop1497 = alloca i1, i1 0
+  %nop1498 = alloca i1, i1 0
+  %nop1499 = alloca i1, i1 0
+  %nop1500 = alloca i1, i1 0
+  %nop1501 = alloca i1, i1 0
+  %nop1502 = alloca i1, i1 0
+  %nop1503 = alloca i1, i1 0
+  %nop1504 = alloca i1, i1 0
+  %nop1505 = alloca i1, i1 0
+  %nop1506 = alloca i1, i1 0
+  %nop1507 = alloca i1, i1 0
+  %nop1508 = alloca i1, i1 0
+  %nop1509 = alloca i1, i1 0
+  %nop1510 = alloca i1, i1 0
+  %nop1511 = alloca i1, i1 0
+  %nop1512 = alloca i1, i1 0
+  %nop1513 = alloca i1, i1 0
+  %nop1514 = alloca i1, i1 0
+  %nop1515 = alloca i1, i1 0
+  %nop1516 = alloca i1, i1 0
+  %nop1517 = alloca i1, i1 0
+  %nop1518 = alloca i1, i1 0
+  %nop1519 = alloca i1, i1 0
+  %nop1520 = alloca i1, i1 0
+  %nop1521 = alloca i1, i1 0
+  %nop1522 = alloca i1, i1 0
+  %nop1523 = alloca i1, i1 0
+  %nop1524 = alloca i1, i1 0
+  %nop1525 = alloca i1, i1 0
+  %nop1526 = alloca i1, i1 0
+  %nop1527 = alloca i1, i1 0
+  %nop1528 = alloca i1, i1 0
+  %nop1529 = alloca i1, i1 0
+  %nop1530 = alloca i1, i1 0
+  %nop1531 = alloca i1, i1 0
+  %nop1532 = alloca i1, i1 0
+  %nop1533 = alloca i1, i1 0
+  %nop1534 = alloca i1, i1 0
+  %nop1535 = alloca i1, i1 0
+  %nop1536 = alloca i1, i1 0
+  %nop1537 = alloca i1, i1 0
+  %nop1538 = alloca i1, i1 0
+  %nop1539 = alloca i1, i1 0
+  %nop1540 = alloca i1, i1 0
+  %nop1541 = alloca i1, i1 0
+  %nop1542 = alloca i1, i1 0
+  %nop1543 = alloca i1, i1 0
+  %nop1544 = alloca i1, i1 0
+  %nop1545 = alloca i1, i1 0
+  %nop1546 = alloca i1, i1 0
+  %nop1547 = alloca i1, i1 0
+  %nop1548 = alloca i1, i1 0
+  %nop1549 = alloca i1, i1 0
+  %nop1550 = alloca i1, i1 0
+  %nop1551 = alloca i1, i1 0
+  %nop1552 = alloca i1, i1 0
+  %nop1553 = alloca i1, i1 0
+  %nop1554 = alloca i1, i1 0
+  %nop1555 = alloca i1, i1 0
+  %nop1556 = alloca i1, i1 0
+  %nop1557 = alloca i1, i1 0
+  %nop1558 = alloca i1, i1 0
+  %nop1559 = alloca i1, i1 0
+  %nop1560 = alloca i1, i1 0
+  %nop1561 = alloca i1, i1 0
+  %nop1562 = alloca i1, i1 0
+  %nop1563 = alloca i1, i1 0
+  %nop1564 = alloca i1, i1 0
+  %nop1565 = alloca i1, i1 0
+  %nop1566 = alloca i1, i1 0
+  %nop1567 = alloca i1, i1 0
+  %nop1568 = alloca i1, i1 0
+  %nop1569 = alloca i1, i1 0
+  %nop1570 = alloca i1, i1 0
+  %nop1571 = alloca i1, i1 0
+  %nop1572 = alloca i1, i1 0
+  %nop1573 = alloca i1, i1 0
+  %nop1574 = alloca i1, i1 0
+  %nop1575 = alloca i1, i1 0
+  %nop1576 = alloca i1, i1 0
+  %nop1577 = alloca i1, i1 0
+  %nop1578 = alloca i1, i1 0
+  %nop1579 = alloca i1, i1 0
+  %nop1580 = alloca i1, i1 0
+  %nop1581 = alloca i1, i1 0
+  %nop1582 = alloca i1, i1 0
+  %nop1583 = alloca i1, i1 0
+  %nop1584 = alloca i1, i1 0
+  %nop1585 = alloca i1, i1 0
+  %nop1586 = alloca i1, i1 0
+  %nop1587 = alloca i1, i1 0
+  %nop1588 = alloca i1, i1 0
+  %nop1589 = alloca i1, i1 0
+  %nop1590 = alloca i1, i1 0
+  %nop1591 = alloca i1, i1 0
+  %nop1592 = alloca i1, i1 0
+  %nop1593 = alloca i1, i1 0
+  %nop1594 = alloca i1, i1 0
+  %nop1595 = alloca i1, i1 0
+  %nop1596 = alloca i1, i1 0
+  %nop1597 = alloca i1, i1 0
+  %nop1598 = alloca i1, i1 0
+  %nop1599 = alloca i1, i1 0
+  %nop1600 = alloca i1, i1 0
+  %nop1601 = alloca i1, i1 0
+  %nop1602 = alloca i1, i1 0
+  %nop1603 = alloca i1, i1 0
+  %nop1604 = alloca i1, i1 0
+  %nop1605 = alloca i1, i1 0
+  %nop1606 = alloca i1, i1 0
+  %nop1607 = alloca i1, i1 0
+  %nop1608 = alloca i1, i1 0
+  %nop1609 = alloca i1, i1 0
+  %nop1610 = alloca i1, i1 0
+  %nop1611 = alloca i1, i1 0
+  %nop1612 = alloca i1, i1 0
+  %nop1613 = alloca i1, i1 0
+  %nop1614 = alloca i1, i1 0
+  %nop1615 = alloca i1, i1 0
+  %nop1616 = alloca i1, i1 0
+  %nop1617 = alloca i1, i1 0
+  %nop1618 = alloca i1, i1 0
+  %nop1619 = alloca i1, i1 0
+  %nop1620 = alloca i1, i1 0
+  %nop1621 = alloca i1, i1 0
+  %nop1622 = alloca i1, i1 0
+  %nop1623 = alloca i1, i1 0
+  %nop1624 = alloca i1, i1 0
+  %nop1625 = alloca i1, i1 0
+  %nop1626 = alloca i1, i1 0
+  %nop1627 = alloca i1, i1 0
+  %nop1628 = alloca i1, i1 0
+  %nop1629 = alloca i1, i1 0
+  %nop1630 = alloca i1, i1 0
+  %nop1631 = alloca i1, i1 0
+  %nop1632 = alloca i1, i1 0
+  %nop1633 = alloca i1, i1 0
+  %nop1634 = alloca i1, i1 0
+  %nop1635 = alloca i1, i1 0
+  %nop1636 = alloca i1, i1 0
+  %nop1637 = alloca i1, i1 0
+  %nop1638 = alloca i1, i1 0
+  %nop1639 = alloca i1, i1 0
+  %nop1640 = alloca i1, i1 0
+  %nop1641 = alloca i1, i1 0
+  %nop1642 = alloca i1, i1 0
+  %nop1643 = alloca i1, i1 0
+  %nop1644 = alloca i1, i1 0
+  %nop1645 = alloca i1, i1 0
+  %nop1646 = alloca i1, i1 0
+  %nop1647 = alloca i1, i1 0
+  %nop1648 = alloca i1, i1 0
+  %nop1649 = alloca i1, i1 0
+  %nop1650 = alloca i1, i1 0
+  %nop1651 = alloca i1, i1 0
+  %nop1652 = alloca i1, i1 0
+  %nop1653 = alloca i1, i1 0
+  %nop1654 = alloca i1, i1 0
+  %nop1655 = alloca i1, i1 0
+  %nop1656 = alloca i1, i1 0
+  %nop1657 = alloca i1, i1 0
+  %nop1658 = alloca i1, i1 0
+  %nop1659 = alloca i1, i1 0
+  %nop1660 = alloca i1, i1 0
+  %nop1661 = alloca i1, i1 0
+  %nop1662 = alloca i1, i1 0
+  %nop1663 = alloca i1, i1 0
+  %nop1664 = alloca i1, i1 0
+  %nop1665 = alloca i1, i1 0
+  %nop1666 = alloca i1, i1 0
+  %nop1667 = alloca i1, i1 0
+  %nop1668 = alloca i1, i1 0
+  %nop1669 = alloca i1, i1 0
+  %nop1670 = alloca i1, i1 0
+  %nop1671 = alloca i1, i1 0
+  %nop1672 = alloca i1, i1 0
+  %nop1673 = alloca i1, i1 0
+  %nop1674 = alloca i1, i1 0
+  %nop1675 = alloca i1, i1 0
+  %nop1676 = alloca i1, i1 0
+  %nop1677 = alloca i1, i1 0
+  %nop1678 = alloca i1, i1 0
+  %nop1679 = alloca i1, i1 0
+  %nop1680 = alloca i1, i1 0
+  %nop1681 = alloca i1, i1 0
+  %nop1682 = alloca i1, i1 0
+  %nop1683 = alloca i1, i1 0
+  %nop1684 = alloca i1, i1 0
+  %nop1685 = alloca i1, i1 0
+  %nop1686 = alloca i1, i1 0
+  %nop1687 = alloca i1, i1 0
+  %nop1688 = alloca i1, i1 0
+  %nop1689 = alloca i1, i1 0
+  %nop1690 = alloca i1, i1 0
+  %nop1691 = alloca i1, i1 0
+  %nop1692 = alloca i1, i1 0
+  %nop1693 = alloca i1, i1 0
+  %nop1694 = alloca i1, i1 0
+  %nop1695 = alloca i1, i1 0
+  %nop1696 = alloca i1, i1 0
+  %nop1697 = alloca i1, i1 0
+  %nop1698 = alloca i1, i1 0
+  %nop1699 = alloca i1, i1 0
+  %nop1700 = alloca i1, i1 0
+  %nop1701 = alloca i1, i1 0
+  %nop1702 = alloca i1, i1 0
+  %nop1703 = alloca i1, i1 0
+  %nop1704 = alloca i1, i1 0
+  %nop1705 = alloca i1, i1 0
+  %nop1706 = alloca i1, i1 0
+  %nop1707 = alloca i1, i1 0
+  %nop1708 = alloca i1, i1 0
+  %nop1709 = alloca i1, i1 0
+  %nop1710 = alloca i1, i1 0
+  %nop1711 = alloca i1, i1 0
+  %nop1712 = alloca i1, i1 0
+  %nop1713 = alloca i1, i1 0
+  %nop1714 = alloca i1, i1 0
+  %nop1715 = alloca i1, i1 0
+  %nop1716 = alloca i1, i1 0
+  %nop1717 = alloca i1, i1 0
+  %nop1718 = alloca i1, i1 0
+  %nop1719 = alloca i1, i1 0
+  %nop1720 = alloca i1, i1 0
+  %nop1721 = alloca i1, i1 0
+  %nop1722 = alloca i1, i1 0
+  %nop1723 = alloca i1, i1 0
+  %nop1724 = alloca i1, i1 0
+  %nop1725 = alloca i1, i1 0
+  %nop1726 = alloca i1, i1 0
+  %nop1727 = alloca i1, i1 0
+  %nop1728 = alloca i1, i1 0
+  %nop1729 = alloca i1, i1 0
+  %nop1730 = alloca i1, i1 0
+  %nop1731 = alloca i1, i1 0
+  %nop1732 = alloca i1, i1 0
+  %nop1733 = alloca i1, i1 0
+  %nop1734 = alloca i1, i1 0
+  %nop1735 = alloca i1, i1 0
+  %nop1736 = alloca i1, i1 0
+  %nop1737 = alloca i1, i1 0
+  %nop1738 = alloca i1, i1 0
+  %nop1739 = alloca i1, i1 0
+  %nop1740 = alloca i1, i1 0
+  %nop1741 = alloca i1, i1 0
+  %nop1742 = alloca i1, i1 0
+  %nop1743 = alloca i1, i1 0
+  %nop1744 = alloca i1, i1 0
+  %nop1745 = alloca i1, i1 0
+  %nop1746 = alloca i1, i1 0
+  %nop1747 = alloca i1, i1 0
+  %nop1748 = alloca i1, i1 0
+  %nop1749 = alloca i1, i1 0
+  %nop1750 = alloca i1, i1 0
+  %nop1751 = alloca i1, i1 0
+  %nop1752 = alloca i1, i1 0
+  %nop1753 = alloca i1, i1 0
+  %nop1754 = alloca i1, i1 0
+  %nop1755 = alloca i1, i1 0
+  %nop1756 = alloca i1, i1 0
+  %nop1757 = alloca i1, i1 0
+  %nop1758 = alloca i1, i1 0
+  %nop1759 = alloca i1, i1 0
+  %nop1760 = alloca i1, i1 0
+  %nop1761 = alloca i1, i1 0
+  %nop1762 = alloca i1, i1 0
+  %nop1763 = alloca i1, i1 0
+  %nop1764 = alloca i1, i1 0
+  %nop1765 = alloca i1, i1 0
+  %nop1766 = alloca i1, i1 0
+  %nop1767 = alloca i1, i1 0
+  %nop1768 = alloca i1, i1 0
+  %nop1769 = alloca i1, i1 0
+  %nop1770 = alloca i1, i1 0
+  %nop1771 = alloca i1, i1 0
+  %nop1772 = alloca i1, i1 0
+  %nop1773 = alloca i1, i1 0
+  %nop1774 = alloca i1, i1 0
+  %nop1775 = alloca i1, i1 0
+  %nop1776 = alloca i1, i1 0
+  %nop1777 = alloca i1, i1 0
+  %nop1778 = alloca i1, i1 0
+  %nop1779 = alloca i1, i1 0
+  %nop1780 = alloca i1, i1 0
+  %nop1781 = alloca i1, i1 0
+  %nop1782 = alloca i1, i1 0
+  %nop1783 = alloca i1, i1 0
+  %nop1784 = alloca i1, i1 0
+  %nop1785 = alloca i1, i1 0
+  %nop1786 = alloca i1, i1 0
+  %nop1787 = alloca i1, i1 0
+  %nop1788 = alloca i1, i1 0
+  %nop1789 = alloca i1, i1 0
+  %nop1790 = alloca i1, i1 0
+  %nop1791 = alloca i1, i1 0
+  %nop1792 = alloca i1, i1 0
+  %nop1793 = alloca i1, i1 0
+  %nop1794 = alloca i1, i1 0
+  %nop1795 = alloca i1, i1 0
+  %nop1796 = alloca i1, i1 0
+  %nop1797 = alloca i1, i1 0
+  %nop1798 = alloca i1, i1 0
+  %nop1799 = alloca i1, i1 0
+  %nop1800 = alloca i1, i1 0
+  %nop1801 = alloca i1, i1 0
+  %nop1802 = alloca i1, i1 0
+  %nop1803 = alloca i1, i1 0
+  %nop1804 = alloca i1, i1 0
+  %nop1805 = alloca i1, i1 0
+  %nop1806 = alloca i1, i1 0
+  %nop1807 = alloca i1, i1 0
+  %nop1808 = alloca i1, i1 0
+  %nop1809 = alloca i1, i1 0
+  %nop1810 = alloca i1, i1 0
+  %nop1811 = alloca i1, i1 0
+  %nop1812 = alloca i1, i1 0
+  %nop1813 = alloca i1, i1 0
+  %nop1814 = alloca i1, i1 0
+  %nop1815 = alloca i1, i1 0
+  %nop1816 = alloca i1, i1 0
+  %nop1817 = alloca i1, i1 0
+  %nop1818 = alloca i1, i1 0
+  %nop1819 = alloca i1, i1 0
+  %nop1820 = alloca i1, i1 0
+  %nop1821 = alloca i1, i1 0
+  %nop1822 = alloca i1, i1 0
+  %nop1823 = alloca i1, i1 0
+  %nop1824 = alloca i1, i1 0
+  %nop1825 = alloca i1, i1 0
+  %nop1826 = alloca i1, i1 0
+  %nop1827 = alloca i1, i1 0
+  %nop1828 = alloca i1, i1 0
+  %nop1829 = alloca i1, i1 0
+  %nop1830 = alloca i1, i1 0
+  %nop1831 = alloca i1, i1 0
+  %nop1832 = alloca i1, i1 0
+  %nop1833 = alloca i1, i1 0
+  %nop1834 = alloca i1, i1 0
+  %nop1835 = alloca i1, i1 0
+  %nop1836 = alloca i1, i1 0
+  %nop1837 = alloca i1, i1 0
+  %nop1838 = alloca i1, i1 0
+  %nop1839 = alloca i1, i1 0
+  %nop1840 = alloca i1, i1 0
+  %nop1841 = alloca i1, i1 0
+  %nop1842 = alloca i1, i1 0
+  %nop1843 = alloca i1, i1 0
+  %nop1844 = alloca i1, i1 0
+  %nop1845 = alloca i1, i1 0
+  %nop1846 = alloca i1, i1 0
+  %nop1847 = alloca i1, i1 0
+  %nop1848 = alloca i1, i1 0
+  %nop1849 = alloca i1, i1 0
+  %nop1850 = alloca i1, i1 0
+  %nop1851 = alloca i1, i1 0
+  %nop1852 = alloca i1, i1 0
+  %nop1853 = alloca i1, i1 0
+  %nop1854 = alloca i1, i1 0
+  %nop1855 = alloca i1, i1 0
+  %nop1856 = alloca i1, i1 0
+  %nop1857 = alloca i1, i1 0
+  %nop1858 = alloca i1, i1 0
+  %nop1859 = alloca i1, i1 0
+  %nop1860 = alloca i1, i1 0
+  %nop1861 = alloca i1, i1 0
+  %nop1862 = alloca i1, i1 0
+  %nop1863 = alloca i1, i1 0
+  %nop1864 = alloca i1, i1 0
+  %nop1865 = alloca i1, i1 0
+  %nop1866 = alloca i1, i1 0
+  %nop1867 = alloca i1, i1 0
+  %nop1868 = alloca i1, i1 0
+  %nop1869 = alloca i1, i1 0
+  %nop1870 = alloca i1, i1 0
+  %nop1871 = alloca i1, i1 0
+  %nop1872 = alloca i1, i1 0
+  %nop1873 = alloca i1, i1 0
+  %nop1874 = alloca i1, i1 0
+  %nop1875 = alloca i1, i1 0
+  %nop1876 = alloca i1, i1 0
+  %nop1877 = alloca i1, i1 0
+  %nop1878 = alloca i1, i1 0
+  %nop1879 = alloca i1, i1 0
+  %nop1880 = alloca i1, i1 0
+  %nop1881 = alloca i1, i1 0
+  %nop1882 = alloca i1, i1 0
+  %nop1883 = alloca i1, i1 0
+  %nop1884 = alloca i1, i1 0
+  %nop1885 = alloca i1, i1 0
+  %nop1886 = alloca i1, i1 0
+  %nop1887 = alloca i1, i1 0
+  %nop1888 = alloca i1, i1 0
+  %nop1889 = alloca i1, i1 0
+  %nop1890 = alloca i1, i1 0
+  %nop1891 = alloca i1, i1 0
+  %nop1892 = alloca i1, i1 0
+  %nop1893 = alloca i1, i1 0
+  %nop1894 = alloca i1, i1 0
+  %nop1895 = alloca i1, i1 0
+  %nop1896 = alloca i1, i1 0
+  %nop1897 = alloca i1, i1 0
+  %nop1898 = alloca i1, i1 0
+  %nop1899 = alloca i1, i1 0
+  %nop1900 = alloca i1, i1 0
+  %nop1901 = alloca i1, i1 0
+  %nop1902 = alloca i1, i1 0
+  %nop1903 = alloca i1, i1 0
+  %nop1904 = alloca i1, i1 0
+  %nop1905 = alloca i1, i1 0
+  %nop1906 = alloca i1, i1 0
+  %nop1907 = alloca i1, i1 0
+  %nop1908 = alloca i1, i1 0
+  %nop1909 = alloca i1, i1 0
+  %nop1910 = alloca i1, i1 0
+  %nop1911 = alloca i1, i1 0
+  %nop1912 = alloca i1, i1 0
+  %nop1913 = alloca i1, i1 0
+  %nop1914 = alloca i1, i1 0
+  %nop1915 = alloca i1, i1 0
+  %nop1916 = alloca i1, i1 0
+  %nop1917 = alloca i1, i1 0
+  %nop1918 = alloca i1, i1 0
+  %nop1919 = alloca i1, i1 0
+  %nop1920 = alloca i1, i1 0
+  %nop1921 = alloca i1, i1 0
+  %nop1922 = alloca i1, i1 0
+  %nop1923 = alloca i1, i1 0
+  %nop1924 = alloca i1, i1 0
+  %nop1925 = alloca i1, i1 0
+  %nop1926 = alloca i1, i1 0
+  %nop1927 = alloca i1, i1 0
+  %nop1928 = alloca i1, i1 0
+  %nop1929 = alloca i1, i1 0
+  %nop1930 = alloca i1, i1 0
+  %nop1931 = alloca i1, i1 0
+  %nop1932 = alloca i1, i1 0
+  %nop1933 = alloca i1, i1 0
+  %nop1934 = alloca i1, i1 0
+  %nop1935 = alloca i1, i1 0
+  %nop1936 = alloca i1, i1 0
+  %nop1937 = alloca i1, i1 0
+  %nop1938 = alloca i1, i1 0
+  %nop1939 = alloca i1, i1 0
+  %nop1940 = alloca i1, i1 0
+  %nop1941 = alloca i1, i1 0
+  %nop1942 = alloca i1, i1 0
+  %nop1943 = alloca i1, i1 0
+  %nop1944 = alloca i1, i1 0
+  %nop1945 = alloca i1, i1 0
+  %nop1946 = alloca i1, i1 0
+  %nop1947 = alloca i1, i1 0
+  %nop1948 = alloca i1, i1 0
+  %nop1949 = alloca i1, i1 0
+  %nop1950 = alloca i1, i1 0
+  %nop1951 = alloca i1, i1 0
+  %nop1952 = alloca i1, i1 0
+  %nop1953 = alloca i1, i1 0
+  %nop1954 = alloca i1, i1 0
+  %nop1955 = alloca i1, i1 0
+  %nop1956 = alloca i1, i1 0
+  %nop1957 = alloca i1, i1 0
+  %nop1958 = alloca i1, i1 0
+  %nop1959 = alloca i1, i1 0
+  %nop1960 = alloca i1, i1 0
+  %nop1961 = alloca i1, i1 0
+  %nop1962 = alloca i1, i1 0
+  %nop1963 = alloca i1, i1 0
+  %nop1964 = alloca i1, i1 0
+  %nop1965 = alloca i1, i1 0
+  %nop1966 = alloca i1, i1 0
+  %nop1967 = alloca i1, i1 0
+  %nop1968 = alloca i1, i1 0
+  %nop1969 = alloca i1, i1 0
+  %nop1970 = alloca i1, i1 0
+  %nop1971 = alloca i1, i1 0
+  %nop1972 = alloca i1, i1 0
+  %nop1973 = alloca i1, i1 0
+  %nop1974 = alloca i1, i1 0
+  %nop1975 = alloca i1, i1 0
+  %nop1976 = alloca i1, i1 0
+  %nop1977 = alloca i1, i1 0
+  %nop1978 = alloca i1, i1 0
+  %nop1979 = alloca i1, i1 0
+  %nop1980 = alloca i1, i1 0
+  %nop1981 = alloca i1, i1 0
+  %nop1982 = alloca i1, i1 0
+  %nop1983 = alloca i1, i1 0
+  %nop1984 = alloca i1, i1 0
+  %nop1985 = alloca i1, i1 0
+  %nop1986 = alloca i1, i1 0
+  %nop1987 = alloca i1, i1 0
+  %nop1988 = alloca i1, i1 0
+  %nop1989 = alloca i1, i1 0
+  %nop1990 = alloca i1, i1 0
+  %nop1991 = alloca i1, i1 0
+  %nop1992 = alloca i1, i1 0
+  %nop1993 = alloca i1, i1 0
+  %nop1994 = alloca i1, i1 0
+  %nop1995 = alloca i1, i1 0
+  %nop1996 = alloca i1, i1 0
+  %nop1997 = alloca i1, i1 0
+  %nop1998 = alloca i1, i1 0
+  %nop1999 = alloca i1, i1 0
+  %nop2000 = alloca i1, i1 0
+  %nop2001 = alloca i1, i1 0
+  %nop2002 = alloca i1, i1 0
+  %nop2003 = alloca i1, i1 0
+  %nop2004 = alloca i1, i1 0
+  %nop2005 = alloca i1, i1 0
+  %nop2006 = alloca i1, i1 0
+  %nop2007 = alloca i1, i1 0
+  %nop2008 = alloca i1, i1 0
+  %nop2009 = alloca i1, i1 0
+  %nop2010 = alloca i1, i1 0
+  %nop2011 = alloca i1, i1 0
+  %nop2012 = alloca i1, i1 0
+  %nop2013 = alloca i1, i1 0
+  %nop2014 = alloca i1, i1 0
+  %nop2015 = alloca i1, i1 0
+  %nop2016 = alloca i1, i1 0
+  %nop2017 = alloca i1, i1 0
+  %nop2018 = alloca i1, i1 0
+  %nop2019 = alloca i1, i1 0
+  %nop2020 = alloca i1, i1 0
+  %nop2021 = alloca i1, i1 0
+  %nop2022 = alloca i1, i1 0
+  %nop2023 = alloca i1, i1 0
+  %nop2024 = alloca i1, i1 0
+  %nop2025 = alloca i1, i1 0
+  %nop2026 = alloca i1, i1 0
+  %nop2027 = alloca i1, i1 0
+  %nop2028 = alloca i1, i1 0
+  %nop2029 = alloca i1, i1 0
+  %nop2030 = alloca i1, i1 0
+  %nop2031 = alloca i1, i1 0
+  %nop2032 = alloca i1, i1 0
+  %nop2033 = alloca i1, i1 0
+  %nop2034 = alloca i1, i1 0
+  %nop2035 = alloca i1, i1 0
+  %nop2036 = alloca i1, i1 0
+  %nop2037 = alloca i1, i1 0
+  %nop2038 = alloca i1, i1 0
+  %nop2039 = alloca i1, i1 0
+  %nop2040 = alloca i1, i1 0
+  %nop2041 = alloca i1, i1 0
+  %nop2042 = alloca i1, i1 0
+  %nop2043 = alloca i1, i1 0
+  %nop2044 = alloca i1, i1 0
+  %nop2045 = alloca i1, i1 0
+  %nop2046 = alloca i1, i1 0
+  %nop2047 = alloca i1, i1 0
+  %nop2048 = alloca i1, i1 0
+  %nop2049 = alloca i1, i1 0
+  %nop2050 = alloca i1, i1 0
+  %nop2051 = alloca i1, i1 0
+  %nop2052 = alloca i1, i1 0
+  %nop2053 = alloca i1, i1 0
+  %nop2054 = alloca i1, i1 0
+  %nop2055 = alloca i1, i1 0
+  %nop2056 = alloca i1, i1 0
+  %nop2057 = alloca i1, i1 0
+  %nop2058 = alloca i1, i1 0
+  %nop2059 = alloca i1, i1 0
+  %nop2060 = alloca i1, i1 0
+  %nop2061 = alloca i1, i1 0
+  %nop2062 = alloca i1, i1 0
+  %nop2063 = alloca i1, i1 0
+  %nop2064 = alloca i1, i1 0
+  %nop2065 = alloca i1, i1 0
+  %nop2066 = alloca i1, i1 0
+  %nop2067 = alloca i1, i1 0
+  %nop2068 = alloca i1, i1 0
+  %nop2069 = alloca i1, i1 0
+  %nop2070 = alloca i1, i1 0
+  %nop2071 = alloca i1, i1 0
+  %nop2072 = alloca i1, i1 0
+  %nop2073 = alloca i1, i1 0
+  %nop2074 = alloca i1, i1 0
+  %nop2075 = alloca i1, i1 0
+  %nop2076 = alloca i1, i1 0
+  %nop2077 = alloca i1, i1 0
+  %nop2078 = alloca i1, i1 0
+  %nop2079 = alloca i1, i1 0
+  %nop2080 = alloca i1, i1 0
+  %nop2081 = alloca i1, i1 0
+  %nop2082 = alloca i1, i1 0
+  %nop2083 = alloca i1, i1 0
+  %nop2084 = alloca i1, i1 0
+  %nop2085 = alloca i1, i1 0
+  %nop2086 = alloca i1, i1 0
+  %nop2087 = alloca i1, i1 0
+  %nop2088 = alloca i1, i1 0
+  %nop2089 = alloca i1, i1 0
+  %nop2090 = alloca i1, i1 0
+  %nop2091 = alloca i1, i1 0
+  %nop2092 = alloca i1, i1 0
+  %nop2093 = alloca i1, i1 0
+  %nop2094 = alloca i1, i1 0
+  %nop2095 = alloca i1, i1 0
+  %nop2096 = alloca i1, i1 0
+  %nop2097 = alloca i1, i1 0
+  %nop2098 = alloca i1, i1 0
+  %nop2099 = alloca i1, i1 0
+  %nop2100 = alloca i1, i1 0
+  %nop2101 = alloca i1, i1 0
+  %nop2102 = alloca i1, i1 0
+  %nop2103 = alloca i1, i1 0
+  %nop2104 = alloca i1, i1 0
+  %nop2105 = alloca i1, i1 0
+  %nop2106 = alloca i1, i1 0
+  %nop2107 = alloca i1, i1 0
+  %nop2108 = alloca i1, i1 0
+  %nop2109 = alloca i1, i1 0
+  %nop2110 = alloca i1, i1 0
+  %nop2111 = alloca i1, i1 0
+  %nop2112 = alloca i1, i1 0
+  %nop2113 = alloca i1, i1 0
+  %nop2114 = alloca i1, i1 0
+  %nop2115 = alloca i1, i1 0
+  %nop2116 = alloca i1, i1 0
+  %nop2117 = alloca i1, i1 0
+  %nop2118 = alloca i1, i1 0
+  %nop2119 = alloca i1, i1 0
+  %nop2120 = alloca i1, i1 0
+  %nop2121 = alloca i1, i1 0
+  %nop2122 = alloca i1, i1 0
+  %nop2123 = alloca i1, i1 0
+  %nop2124 = alloca i1, i1 0
+  %nop2125 = alloca i1, i1 0
+  %nop2126 = alloca i1, i1 0
+  %nop2127 = alloca i1, i1 0
+  %nop2128 = alloca i1, i1 0
+  %nop2129 = alloca i1, i1 0
+  %nop2130 = alloca i1, i1 0
+  %nop2131 = alloca i1, i1 0
+  %nop2132 = alloca i1, i1 0
+  %nop2133 = alloca i1, i1 0
+  %nop2134 = alloca i1, i1 0
+  %nop2135 = alloca i1, i1 0
+  %nop2136 = alloca i1, i1 0
+  %nop2137 = alloca i1, i1 0
+  %nop2138 = alloca i1, i1 0
+  %nop2139 = alloca i1, i1 0
+  %nop2140 = alloca i1, i1 0
+  %nop2141 = alloca i1, i1 0
+  %nop2142 = alloca i1, i1 0
+  %nop2143 = alloca i1, i1 0
+  %nop2144 = alloca i1, i1 0
+  %nop2145 = alloca i1, i1 0
+  %nop2146 = alloca i1, i1 0
+  %nop2147 = alloca i1, i1 0
+  %nop2148 = alloca i1, i1 0
+  %nop2149 = alloca i1, i1 0
+  %nop2150 = alloca i1, i1 0
+  %nop2151 = alloca i1, i1 0
+  %nop2152 = alloca i1, i1 0
+  %nop2153 = alloca i1, i1 0
+  %nop2154 = alloca i1, i1 0
+  %nop2155 = alloca i1, i1 0
+  %nop2156 = alloca i1, i1 0
+  %nop2157 = alloca i1, i1 0
+  %nop2158 = alloca i1, i1 0
+  %nop2159 = alloca i1, i1 0
+  %nop2160 = alloca i1, i1 0
+  %nop2161 = alloca i1, i1 0
+  %nop2162 = alloca i1, i1 0
+  %nop2163 = alloca i1, i1 0
+  %nop2164 = alloca i1, i1 0
+  %nop2165 = alloca i1, i1 0
+  %nop2166 = alloca i1, i1 0
+  %nop2167 = alloca i1, i1 0
+  %nop2168 = alloca i1, i1 0
+  %nop2169 = alloca i1, i1 0
+  %nop2170 = alloca i1, i1 0
+  %nop2171 = alloca i1, i1 0
+  %nop2172 = alloca i1, i1 0
+  %nop2173 = alloca i1, i1 0
+  %nop2174 = alloca i1, i1 0
+  %nop2175 = alloca i1, i1 0
+  %nop2176 = alloca i1, i1 0
+  %nop2177 = alloca i1, i1 0
+  %nop2178 = alloca i1, i1 0
+  %nop2179 = alloca i1, i1 0
+  %nop2180 = alloca i1, i1 0
+  %nop2181 = alloca i1, i1 0
+  %nop2182 = alloca i1, i1 0
+  %nop2183 = alloca i1, i1 0
+  %nop2184 = alloca i1, i1 0
+  %nop2185 = alloca i1, i1 0
+  %nop2186 = alloca i1, i1 0
+  %nop2187 = alloca i1, i1 0
+  %nop2188 = alloca i1, i1 0
+  %nop2189 = alloca i1, i1 0
+  %nop2190 = alloca i1, i1 0
+  %nop2191 = alloca i1, i1 0
+  %nop2192 = alloca i1, i1 0
+  %nop2193 = alloca i1, i1 0
+  %nop2194 = alloca i1, i1 0
+  %nop2195 = alloca i1, i1 0
+  %nop2196 = alloca i1, i1 0
+  %nop2197 = alloca i1, i1 0
+  %nop2198 = alloca i1, i1 0
+  %nop2199 = alloca i1, i1 0
+  %nop2200 = alloca i1, i1 0
+  %nop2201 = alloca i1, i1 0
+  %nop2202 = alloca i1, i1 0
+  %nop2203 = alloca i1, i1 0
+  %nop2204 = alloca i1, i1 0
+  %nop2205 = alloca i1, i1 0
+  %nop2206 = alloca i1, i1 0
+  %nop2207 = alloca i1, i1 0
+  %nop2208 = alloca i1, i1 0
+  %nop2209 = alloca i1, i1 0
+  %nop2210 = alloca i1, i1 0
+  %nop2211 = alloca i1, i1 0
+  %nop2212 = alloca i1, i1 0
+  %nop2213 = alloca i1, i1 0
+  %nop2214 = alloca i1, i1 0
+  %nop2215 = alloca i1, i1 0
+  %nop2216 = alloca i1, i1 0
+  %nop2217 = alloca i1, i1 0
+  %nop2218 = alloca i1, i1 0
+  %nop2219 = alloca i1, i1 0
+  %nop2220 = alloca i1, i1 0
+  %nop2221 = alloca i1, i1 0
+  %nop2222 = alloca i1, i1 0
+  %nop2223 = alloca i1, i1 0
+  %nop2224 = alloca i1, i1 0
+  %nop2225 = alloca i1, i1 0
+  %nop2226 = alloca i1, i1 0
+  %nop2227 = alloca i1, i1 0
+  %nop2228 = alloca i1, i1 0
+  %nop2229 = alloca i1, i1 0
+  %nop2230 = alloca i1, i1 0
+  %nop2231 = alloca i1, i1 0
+  %nop2232 = alloca i1, i1 0
+  %nop2233 = alloca i1, i1 0
+  %nop2234 = alloca i1, i1 0
+  %nop2235 = alloca i1, i1 0
+  %nop2236 = alloca i1, i1 0
+  %nop2237 = alloca i1, i1 0
+  %nop2238 = alloca i1, i1 0
+  %nop2239 = alloca i1, i1 0
+  %nop2240 = alloca i1, i1 0
+  %nop2241 = alloca i1, i1 0
+  %nop2242 = alloca i1, i1 0
+  %nop2243 = alloca i1, i1 0
+  %nop2244 = alloca i1, i1 0
+  %nop2245 = alloca i1, i1 0
+  %nop2246 = alloca i1, i1 0
+  %nop2247 = alloca i1, i1 0
+  %nop2248 = alloca i1, i1 0
+  %nop2249 = alloca i1, i1 0
+  %nop2250 = alloca i1, i1 0
+  %nop2251 = alloca i1, i1 0
+  %nop2252 = alloca i1, i1 0
+  %nop2253 = alloca i1, i1 0
+  %nop2254 = alloca i1, i1 0
+  %nop2255 = alloca i1, i1 0
+  %nop2256 = alloca i1, i1 0
+  %nop2257 = alloca i1, i1 0
+  %nop2258 = alloca i1, i1 0
+  %nop2259 = alloca i1, i1 0
+  %nop2260 = alloca i1, i1 0
+  %nop2261 = alloca i1, i1 0
+  %nop2262 = alloca i1, i1 0
+  %nop2263 = alloca i1, i1 0
+  %nop2264 = alloca i1, i1 0
+  %nop2265 = alloca i1, i1 0
+  %nop2266 = alloca i1, i1 0
+  %nop2267 = alloca i1, i1 0
+  %nop2268 = alloca i1, i1 0
+  %nop2269 = alloca i1, i1 0
+  %nop2270 = alloca i1, i1 0
+  %nop2271 = alloca i1, i1 0
+  %nop2272 = alloca i1, i1 0
+  %nop2273 = alloca i1, i1 0
+  %nop2274 = alloca i1, i1 0
+  %nop2275 = alloca i1, i1 0
+  %nop2276 = alloca i1, i1 0
+  %nop2277 = alloca i1, i1 0
+  %nop2278 = alloca i1, i1 0
+  %nop2279 = alloca i1, i1 0
+  %nop2280 = alloca i1, i1 0
+  %nop2281 = alloca i1, i1 0
+  %nop2282 = alloca i1, i1 0
+  %nop2283 = alloca i1, i1 0
+  %nop2284 = alloca i1, i1 0
+  %nop2285 = alloca i1, i1 0
+  %nop2286 = alloca i1, i1 0
+  %nop2287 = alloca i1, i1 0
+  %nop2288 = alloca i1, i1 0
+  %nop2289 = alloca i1, i1 0
+  %nop2290 = alloca i1, i1 0
+  %nop2291 = alloca i1, i1 0
+  %nop2292 = alloca i1, i1 0
+  %nop2293 = alloca i1, i1 0
+  %nop2294 = alloca i1, i1 0
+  %nop2295 = alloca i1, i1 0
+  %nop2296 = alloca i1, i1 0
+  %nop2297 = alloca i1, i1 0
+  %nop2298 = alloca i1, i1 0
+  %nop2299 = alloca i1, i1 0
+  %nop2300 = alloca i1, i1 0
+  %nop2301 = alloca i1, i1 0
+  %nop2302 = alloca i1, i1 0
+  %nop2303 = alloca i1, i1 0
+  %nop2304 = alloca i1, i1 0
+  %nop2305 = alloca i1, i1 0
+  %nop2306 = alloca i1, i1 0
+  %nop2307 = alloca i1, i1 0
+  %nop2308 = alloca i1, i1 0
+  %nop2309 = alloca i1, i1 0
+  %nop2310 = alloca i1, i1 0
+  %nop2311 = alloca i1, i1 0
+  %nop2312 = alloca i1, i1 0
+  %nop2313 = alloca i1, i1 0
+  %nop2314 = alloca i1, i1 0
+  %nop2315 = alloca i1, i1 0
+  %nop2316 = alloca i1, i1 0
+  %nop2317 = alloca i1, i1 0
+  %nop2318 = alloca i1, i1 0
+  %nop2319 = alloca i1, i1 0
+  %nop2320 = alloca i1, i1 0
+  %nop2321 = alloca i1, i1 0
+  %nop2322 = alloca i1, i1 0
+  %nop2323 = alloca i1, i1 0
+  %nop2324 = alloca i1, i1 0
+  %nop2325 = alloca i1, i1 0
+  %nop2326 = alloca i1, i1 0
+  %nop2327 = alloca i1, i1 0
+  %nop2328 = alloca i1, i1 0
+  %nop2329 = alloca i1, i1 0
+  %nop2330 = alloca i1, i1 0
+  %nop2331 = alloca i1, i1 0
+  %nop2332 = alloca i1, i1 0
+  %nop2333 = alloca i1, i1 0
+  %nop2334 = alloca i1, i1 0
+  %nop2335 = alloca i1, i1 0
+  %nop2336 = alloca i1, i1 0
+  %nop2337 = alloca i1, i1 0
+  %nop2338 = alloca i1, i1 0
+  %nop2339 = alloca i1, i1 0
+  %nop2340 = alloca i1, i1 0
+  %nop2341 = alloca i1, i1 0
+  %nop2342 = alloca i1, i1 0
+  %nop2343 = alloca i1, i1 0
+  %nop2344 = alloca i1, i1 0
+  %nop2345 = alloca i1, i1 0
+  %nop2346 = alloca i1, i1 0
+  %nop2347 = alloca i1, i1 0
+  %nop2348 = alloca i1, i1 0
+  %nop2349 = alloca i1, i1 0
+  %nop2350 = alloca i1, i1 0
+  %nop2351 = alloca i1, i1 0
+  %nop2352 = alloca i1, i1 0
+  %nop2353 = alloca i1, i1 0
+  %nop2354 = alloca i1, i1 0
+  %nop2355 = alloca i1, i1 0
+  %nop2356 = alloca i1, i1 0
+  %nop2357 = alloca i1, i1 0
+  %nop2358 = alloca i1, i1 0
+  %nop2359 = alloca i1, i1 0
+  %nop2360 = alloca i1, i1 0
+  %nop2361 = alloca i1, i1 0
+  %nop2362 = alloca i1, i1 0
+  %nop2363 = alloca i1, i1 0
+  %nop2364 = alloca i1, i1 0
+  %nop2365 = alloca i1, i1 0
+  %nop2366 = alloca i1, i1 0
+  %nop2367 = alloca i1, i1 0
+  %nop2368 = alloca i1, i1 0
+  %nop2369 = alloca i1, i1 0
+  %nop2370 = alloca i1, i1 0
+  %nop2371 = alloca i1, i1 0
+  %nop2372 = alloca i1, i1 0
+  %nop2373 = alloca i1, i1 0
+  %nop2374 = alloca i1, i1 0
+  %nop2375 = alloca i1, i1 0
+  %nop2376 = alloca i1, i1 0
+  %nop2377 = alloca i1, i1 0
+  %nop2378 = alloca i1, i1 0
+  %nop2379 = alloca i1, i1 0
+  %nop2380 = alloca i1, i1 0
+  %nop2381 = alloca i1, i1 0
+  %nop2382 = alloca i1, i1 0
+  %nop2383 = alloca i1, i1 0
+  %nop2384 = alloca i1, i1 0
+  %nop2385 = alloca i1, i1 0
+  %nop2386 = alloca i1, i1 0
+  %nop2387 = alloca i1, i1 0
+  %nop2388 = alloca i1, i1 0
+  %nop2389 = alloca i1, i1 0
+  %nop2390 = alloca i1, i1 0
+  %nop2391 = alloca i1, i1 0
+  %nop2392 = alloca i1, i1 0
+  %nop2393 = alloca i1, i1 0
+  %nop2394 = alloca i1, i1 0
+  %nop2395 = alloca i1, i1 0
+  %nop2396 = alloca i1, i1 0
+  %nop2397 = alloca i1, i1 0
+  %nop2398 = alloca i1, i1 0
+  %nop2399 = alloca i1, i1 0
+  %nop2400 = alloca i1, i1 0
+  %nop2401 = alloca i1, i1 0
+  %nop2402 = alloca i1, i1 0
+  %nop2403 = alloca i1, i1 0
+  %nop2404 = alloca i1, i1 0
+  %nop2405 = alloca i1, i1 0
+  %nop2406 = alloca i1, i1 0
+  %nop2407 = alloca i1, i1 0
+  %nop2408 = alloca i1, i1 0
+  %nop2409 = alloca i1, i1 0
+  %nop2410 = alloca i1, i1 0
+  %nop2411 = alloca i1, i1 0
+  %nop2412 = alloca i1, i1 0
+  %nop2413 = alloca i1, i1 0
+  %nop2414 = alloca i1, i1 0
+  %nop2415 = alloca i1, i1 0
+  %nop2416 = alloca i1, i1 0
+  %nop2417 = alloca i1, i1 0
+  %nop2418 = alloca i1, i1 0
+  %nop2419 = alloca i1, i1 0
+  %nop2420 = alloca i1, i1 0
+  %nop2421 = alloca i1, i1 0
+  %nop2422 = alloca i1, i1 0
+  %nop2423 = alloca i1, i1 0
+  %nop2424 = alloca i1, i1 0
+  %nop2425 = alloca i1, i1 0
+  %nop2426 = alloca i1, i1 0
+  %nop2427 = alloca i1, i1 0
+  %nop2428 = alloca i1, i1 0
+  %nop2429 = alloca i1, i1 0
+  %nop2430 = alloca i1, i1 0
+  %nop2431 = alloca i1, i1 0
+  %nop2432 = alloca i1, i1 0
+  %nop2433 = alloca i1, i1 0
+  %nop2434 = alloca i1, i1 0
+  %nop2435 = alloca i1, i1 0
+  %nop2436 = alloca i1, i1 0
+  %nop2437 = alloca i1, i1 0
+  %nop2438 = alloca i1, i1 0
+  %nop2439 = alloca i1, i1 0
+  %nop2440 = alloca i1, i1 0
+  %nop2441 = alloca i1, i1 0
+  %nop2442 = alloca i1, i1 0
+  %nop2443 = alloca i1, i1 0
+  %nop2444 = alloca i1, i1 0
+  %nop2445 = alloca i1, i1 0
+  %nop2446 = alloca i1, i1 0
+  %nop2447 = alloca i1, i1 0
+  %nop2448 = alloca i1, i1 0
+  %nop2449 = alloca i1, i1 0
+  %nop2450 = alloca i1, i1 0
+  %nop2451 = alloca i1, i1 0
+  %nop2452 = alloca i1, i1 0
+  %nop2453 = alloca i1, i1 0
+  %nop2454 = alloca i1, i1 0
+  %nop2455 = alloca i1, i1 0
+  %nop2456 = alloca i1, i1 0
+  %nop2457 = alloca i1, i1 0
+  %nop2458 = alloca i1, i1 0
+  %nop2459 = alloca i1, i1 0
+  %nop2460 = alloca i1, i1 0
+  %nop2461 = alloca i1, i1 0
+  %nop2462 = alloca i1, i1 0
+  %nop2463 = alloca i1, i1 0
+  %nop2464 = alloca i1, i1 0
+  %nop2465 = alloca i1, i1 0
+  %nop2466 = alloca i1, i1 0
+  %nop2467 = alloca i1, i1 0
+  %nop2468 = alloca i1, i1 0
+  %nop2469 = alloca i1, i1 0
+  %nop2470 = alloca i1, i1 0
+  %nop2471 = alloca i1, i1 0
+  %nop2472 = alloca i1, i1 0
+  %nop2473 = alloca i1, i1 0
+  %nop2474 = alloca i1, i1 0
+  %nop2475 = alloca i1, i1 0
+  %nop2476 = alloca i1, i1 0
+  %nop2477 = alloca i1, i1 0
+  %nop2478 = alloca i1, i1 0
+  %nop2479 = alloca i1, i1 0
+  %nop2480 = alloca i1, i1 0
+  %nop2481 = alloca i1, i1 0
+  %nop2482 = alloca i1, i1 0
+  %nop2483 = alloca i1, i1 0
+  %nop2484 = alloca i1, i1 0
+  %nop2485 = alloca i1, i1 0
+  %nop2486 = alloca i1, i1 0
+  %nop2487 = alloca i1, i1 0
+  %nop2488 = alloca i1, i1 0
+  %nop2489 = alloca i1, i1 0
+  %nop2490 = alloca i1, i1 0
+  %nop2491 = alloca i1, i1 0
+  %nop2492 = alloca i1, i1 0
+  %nop2493 = alloca i1, i1 0
+  %nop2494 = alloca i1, i1 0
+  %nop2495 = alloca i1, i1 0
+  %nop2496 = alloca i1, i1 0
+  %nop2497 = alloca i1, i1 0
+  %nop2498 = alloca i1, i1 0
+  %nop2499 = alloca i1, i1 0
+  %nop2500 = alloca i1, i1 0
+  %nop2501 = alloca i1, i1 0
+  %nop2502 = alloca i1, i1 0
+  %nop2503 = alloca i1, i1 0
+  %nop2504 = alloca i1, i1 0
+  %nop2505 = alloca i1, i1 0
+  %nop2506 = alloca i1, i1 0
+  %nop2507 = alloca i1, i1 0
+  %nop2508 = alloca i1, i1 0
+  %nop2509 = alloca i1, i1 0
+  %nop2510 = alloca i1, i1 0
+  %nop2511 = alloca i1, i1 0
+  %nop2512 = alloca i1, i1 0
+  %nop2513 = alloca i1, i1 0
+  %nop2514 = alloca i1, i1 0
+  %nop2515 = alloca i1, i1 0
+  %nop2516 = alloca i1, i1 0
+  %nop2517 = alloca i1, i1 0
+  %nop2518 = alloca i1, i1 0
+  %nop2519 = alloca i1, i1 0
+  %nop2520 = alloca i1, i1 0
+  %nop2521 = alloca i1, i1 0
+  %nop2522 = alloca i1, i1 0
+  %nop2523 = alloca i1, i1 0
+  %nop2524 = alloca i1, i1 0
+  %nop2525 = alloca i1, i1 0
+  %nop2526 = alloca i1, i1 0
+  %nop2527 = alloca i1, i1 0
+  %nop2528 = alloca i1, i1 0
+  %nop2529 = alloca i1, i1 0
+  %nop2530 = alloca i1, i1 0
+  %nop2531 = alloca i1, i1 0
+  %nop2532 = alloca i1, i1 0
+  %nop2533 = alloca i1, i1 0
+  %nop2534 = alloca i1, i1 0
+  %nop2535 = alloca i1, i1 0
+  %nop2536 = alloca i1, i1 0
+  %nop2537 = alloca i1, i1 0
+  %nop2538 = alloca i1, i1 0
+  %nop2539 = alloca i1, i1 0
+  %nop2540 = alloca i1, i1 0
+  %nop2541 = alloca i1, i1 0
+  %nop2542 = alloca i1, i1 0
+  %nop2543 = alloca i1, i1 0
+  %nop2544 = alloca i1, i1 0
+  %nop2545 = alloca i1, i1 0
+  %nop2546 = alloca i1, i1 0
+  %nop2547 = alloca i1, i1 0
+  %nop2548 = alloca i1, i1 0
+  %nop2549 = alloca i1, i1 0
+  %nop2550 = alloca i1, i1 0
+  %nop2551 = alloca i1, i1 0
+  %nop2552 = alloca i1, i1 0
+  %nop2553 = alloca i1, i1 0
+  %nop2554 = alloca i1, i1 0
+  %nop2555 = alloca i1, i1 0
+  %nop2556 = alloca i1, i1 0
+  %nop2557 = alloca i1, i1 0
+  %nop2558 = alloca i1, i1 0
+  %nop2559 = alloca i1, i1 0
+  %nop2560 = alloca i1, i1 0
+  %nop2561 = alloca i1, i1 0
+  %nop2562 = alloca i1, i1 0
+  %nop2563 = alloca i1, i1 0
+  %nop2564 = alloca i1, i1 0
+  %nop2565 = alloca i1, i1 0
+  %nop2566 = alloca i1, i1 0
+  %nop2567 = alloca i1, i1 0
+  %nop2568 = alloca i1, i1 0
+  %nop2569 = alloca i1, i1 0
+  %nop2570 = alloca i1, i1 0
+  %nop2571 = alloca i1, i1 0
+  %nop2572 = alloca i1, i1 0
+  %nop2573 = alloca i1, i1 0
+  %nop2574 = alloca i1, i1 0
+  %nop2575 = alloca i1, i1 0
+  %nop2576 = alloca i1, i1 0
+  %nop2577 = alloca i1, i1 0
+  %nop2578 = alloca i1, i1 0
+  %nop2579 = alloca i1, i1 0
+  %nop2580 = alloca i1, i1 0
+  %nop2581 = alloca i1, i1 0
+  %nop2582 = alloca i1, i1 0
+  %nop2583 = alloca i1, i1 0
+  %nop2584 = alloca i1, i1 0
+  %nop2585 = alloca i1, i1 0
+  %nop2586 = alloca i1, i1 0
+  %nop2587 = alloca i1, i1 0
+  %nop2588 = alloca i1, i1 0
+  %nop2589 = alloca i1, i1 0
+  %nop2590 = alloca i1, i1 0
+  %nop2591 = alloca i1, i1 0
+  %nop2592 = alloca i1, i1 0
+  %nop2593 = alloca i1, i1 0
+  %nop2594 = alloca i1, i1 0
+  %nop2595 = alloca i1, i1 0
+  %nop2596 = alloca i1, i1 0
+  %nop2597 = alloca i1, i1 0
+  %nop2598 = alloca i1, i1 0
+  %nop2599 = alloca i1, i1 0
+  %nop2600 = alloca i1, i1 0
+  %nop2601 = alloca i1, i1 0
+  %nop2602 = alloca i1, i1 0
+  %nop2603 = alloca i1, i1 0
+  %nop2604 = alloca i1, i1 0
+  %nop2605 = alloca i1, i1 0
+  %nop2606 = alloca i1, i1 0
+  %nop2607 = alloca i1, i1 0
+  %nop2608 = alloca i1, i1 0
+  %nop2609 = alloca i1, i1 0
+  %nop2610 = alloca i1, i1 0
+  %nop2611 = alloca i1, i1 0
+  %nop2612 = alloca i1, i1 0
+  %nop2613 = alloca i1, i1 0
+  %nop2614 = alloca i1, i1 0
+  %nop2615 = alloca i1, i1 0
+  %nop2616 = alloca i1, i1 0
+  %nop2617 = alloca i1, i1 0
+  %nop2618 = alloca i1, i1 0
+  %nop2619 = alloca i1, i1 0
+  %nop2620 = alloca i1, i1 0
+  %nop2621 = alloca i1, i1 0
+  %nop2622 = alloca i1, i1 0
+  %nop2623 = alloca i1, i1 0
+  %nop2624 = alloca i1, i1 0
+  %nop2625 = alloca i1, i1 0
+  %nop2626 = alloca i1, i1 0
+  %nop2627 = alloca i1, i1 0
+  %nop2628 = alloca i1, i1 0
+  %nop2629 = alloca i1, i1 0
+  %nop2630 = alloca i1, i1 0
+  %nop2631 = alloca i1, i1 0
+  %nop2632 = alloca i1, i1 0
+  %nop2633 = alloca i1, i1 0
+  %nop2634 = alloca i1, i1 0
+  %nop2635 = alloca i1, i1 0
+  %nop2636 = alloca i1, i1 0
+  %nop2637 = alloca i1, i1 0
+  %nop2638 = alloca i1, i1 0
+  %nop2639 = alloca i1, i1 0
+  %nop2640 = alloca i1, i1 0
+  %nop2641 = alloca i1, i1 0
+  %nop2642 = alloca i1, i1 0
+  %nop2643 = alloca i1, i1 0
+  %nop2644 = alloca i1, i1 0
+  %nop2645 = alloca i1, i1 0
+  %nop2646 = alloca i1, i1 0
+  %nop2647 = alloca i1, i1 0
+  %nop2648 = alloca i1, i1 0
+  %nop2649 = alloca i1, i1 0
+  %nop2650 = alloca i1, i1 0
+  %nop2651 = alloca i1, i1 0
+  %nop2652 = alloca i1, i1 0
+  %nop2653 = alloca i1, i1 0
+  %nop2654 = alloca i1, i1 0
+  %nop2655 = alloca i1, i1 0
+  %nop2656 = alloca i1, i1 0
+  %nop2657 = alloca i1, i1 0
+  %nop2658 = alloca i1, i1 0
+  %nop2659 = alloca i1, i1 0
+  %nop2660 = alloca i1, i1 0
+  %nop2661 = alloca i1, i1 0
+  %nop2662 = alloca i1, i1 0
+  %nop2663 = alloca i1, i1 0
+  %nop2664 = alloca i1, i1 0
+  %nop2665 = alloca i1, i1 0
+  %nop2666 = alloca i1, i1 0
+  %nop2667 = alloca i1, i1 0
+  %nop2668 = alloca i1, i1 0
+  %nop2669 = alloca i1, i1 0
+  %nop2670 = alloca i1, i1 0
+  %nop2671 = alloca i1, i1 0
+  %nop2672 = alloca i1, i1 0
+  %nop2673 = alloca i1, i1 0
+  %nop2674 = alloca i1, i1 0
+  %nop2675 = alloca i1, i1 0
+  %nop2676 = alloca i1, i1 0
+  %nop2677 = alloca i1, i1 0
+  %nop2678 = alloca i1, i1 0
+  %nop2679 = alloca i1, i1 0
+  %nop2680 = alloca i1, i1 0
+  %nop2681 = alloca i1, i1 0
+  %nop2682 = alloca i1, i1 0
+  %nop2683 = alloca i1, i1 0
+  %nop2684 = alloca i1, i1 0
+  %nop2685 = alloca i1, i1 0
+  %nop2686 = alloca i1, i1 0
+  %nop2687 = alloca i1, i1 0
+  %nop2688 = alloca i1, i1 0
+  %nop2689 = alloca i1, i1 0
+  %nop2690 = alloca i1, i1 0
+  %nop2691 = alloca i1, i1 0
+  %nop2692 = alloca i1, i1 0
+  %nop2693 = alloca i1, i1 0
+  %nop2694 = alloca i1, i1 0
+  %nop2695 = alloca i1, i1 0
+  %nop2696 = alloca i1, i1 0
+  %nop2697 = alloca i1, i1 0
+  %nop2698 = alloca i1, i1 0
+  %nop2699 = alloca i1, i1 0
+  %nop2700 = alloca i1, i1 0
+  %nop2701 = alloca i1, i1 0
+  %nop2702 = alloca i1, i1 0
+  %nop2703 = alloca i1, i1 0
+  %nop2704 = alloca i1, i1 0
+  %nop2705 = alloca i1, i1 0
+  %nop2706 = alloca i1, i1 0
+  %nop2707 = alloca i1, i1 0
+  %nop2708 = alloca i1, i1 0
+  %nop2709 = alloca i1, i1 0
+  %nop2710 = alloca i1, i1 0
+  %nop2711 = alloca i1, i1 0
+  %nop2712 = alloca i1, i1 0
+  %nop2713 = alloca i1, i1 0
+  %nop2714 = alloca i1, i1 0
+  %nop2715 = alloca i1, i1 0
+  %nop2716 = alloca i1, i1 0
+  %nop2717 = alloca i1, i1 0
+  %nop2718 = alloca i1, i1 0
+  %nop2719 = alloca i1, i1 0
+  %nop2720 = alloca i1, i1 0
+  %nop2721 = alloca i1, i1 0
+  %nop2722 = alloca i1, i1 0
+  %nop2723 = alloca i1, i1 0
+  %nop2724 = alloca i1, i1 0
+  %nop2725 = alloca i1, i1 0
+  %nop2726 = alloca i1, i1 0
+  %nop2727 = alloca i1, i1 0
+  %nop2728 = alloca i1, i1 0
+  %nop2729 = alloca i1, i1 0
+  %nop2730 = alloca i1, i1 0
+  %nop2731 = alloca i1, i1 0
+  %nop2732 = alloca i1, i1 0
+  %nop2733 = alloca i1, i1 0
+  %nop2734 = alloca i1, i1 0
+  %nop2735 = alloca i1, i1 0
+  %nop2736 = alloca i1, i1 0
+  %nop2737 = alloca i1, i1 0
+  %nop2738 = alloca i1, i1 0
+  %nop2739 = alloca i1, i1 0
+  %nop2740 = alloca i1, i1 0
+  %nop2741 = alloca i1, i1 0
+  %nop2742 = alloca i1, i1 0
+  %nop2743 = alloca i1, i1 0
+  %nop2744 = alloca i1, i1 0
+  %nop2745 = alloca i1, i1 0
+  %nop2746 = alloca i1, i1 0
+  %nop2747 = alloca i1, i1 0
+  %nop2748 = alloca i1, i1 0
+  %nop2749 = alloca i1, i1 0
+  %nop2750 = alloca i1, i1 0
+  %nop2751 = alloca i1, i1 0
+  %nop2752 = alloca i1, i1 0
+  %nop2753 = alloca i1, i1 0
+  %nop2754 = alloca i1, i1 0
+  %nop2755 = alloca i1, i1 0
+  %nop2756 = alloca i1, i1 0
+  %nop2757 = alloca i1, i1 0
+  %nop2758 = alloca i1, i1 0
+  %nop2759 = alloca i1, i1 0
+  %nop2760 = alloca i1, i1 0
+  %nop2761 = alloca i1, i1 0
+  %nop2762 = alloca i1, i1 0
+  %nop2763 = alloca i1, i1 0
+  %nop2764 = alloca i1, i1 0
+  %nop2765 = alloca i1, i1 0
+  %nop2766 = alloca i1, i1 0
+  %nop2767 = alloca i1, i1 0
+  %nop2768 = alloca i1, i1 0
+  %nop2769 = alloca i1, i1 0
+  %nop2770 = alloca i1, i1 0
+  %nop2771 = alloca i1, i1 0
+  %nop2772 = alloca i1, i1 0
+  %nop2773 = alloca i1, i1 0
+  %nop2774 = alloca i1, i1 0
+  %nop2775 = alloca i1, i1 0
+  %nop2776 = alloca i1, i1 0
+  %nop2777 = alloca i1, i1 0
+  %nop2778 = alloca i1, i1 0
+  %nop2779 = alloca i1, i1 0
+  %nop2780 = alloca i1, i1 0
+  %nop2781 = alloca i1, i1 0
+  %nop2782 = alloca i1, i1 0
+  %nop2783 = alloca i1, i1 0
+  %nop2784 = alloca i1, i1 0
+  %nop2785 = alloca i1, i1 0
+  %nop2786 = alloca i1, i1 0
+  %nop2787 = alloca i1, i1 0
+  %nop2788 = alloca i1, i1 0
+  %nop2789 = alloca i1, i1 0
+  %nop2790 = alloca i1, i1 0
+  %nop2791 = alloca i1, i1 0
+  %nop2792 = alloca i1, i1 0
+  %nop2793 = alloca i1, i1 0
+  %nop2794 = alloca i1, i1 0
+  %nop2795 = alloca i1, i1 0
+  %nop2796 = alloca i1, i1 0
+  %nop2797 = alloca i1, i1 0
+  %nop2798 = alloca i1, i1 0
+  %nop2799 = alloca i1, i1 0
+  %nop2800 = alloca i1, i1 0
+  %nop2801 = alloca i1, i1 0
+  %nop2802 = alloca i1, i1 0
+  %nop2803 = alloca i1, i1 0
+  %nop2804 = alloca i1, i1 0
+  %nop2805 = alloca i1, i1 0
+  %nop2806 = alloca i1, i1 0
+  %nop2807 = alloca i1, i1 0
+  %nop2808 = alloca i1, i1 0
+  %nop2809 = alloca i1, i1 0
+  %nop2810 = alloca i1, i1 0
+  %nop2811 = alloca i1, i1 0
+  %nop2812 = alloca i1, i1 0
+  %nop2813 = alloca i1, i1 0
+  %nop2814 = alloca i1, i1 0
+  %nop2815 = alloca i1, i1 0
+  %nop2816 = alloca i1, i1 0
+  %nop2817 = alloca i1, i1 0
+  %nop2818 = alloca i1, i1 0
+  %nop2819 = alloca i1, i1 0
+  %nop2820 = alloca i1, i1 0
+  %nop2821 = alloca i1, i1 0
+  %nop2822 = alloca i1, i1 0
+  %nop2823 = alloca i1, i1 0
+  %nop2824 = alloca i1, i1 0
+  %nop2825 = alloca i1, i1 0
+  %nop2826 = alloca i1, i1 0
+  %nop2827 = alloca i1, i1 0
+  %nop2828 = alloca i1, i1 0
+  %nop2829 = alloca i1, i1 0
+  %nop2830 = alloca i1, i1 0
+  %nop2831 = alloca i1, i1 0
+  %nop2832 = alloca i1, i1 0
+  %nop2833 = alloca i1, i1 0
+  %nop2834 = alloca i1, i1 0
+  %nop2835 = alloca i1, i1 0
+  %nop2836 = alloca i1, i1 0
+  %nop2837 = alloca i1, i1 0
+  %nop2838 = alloca i1, i1 0
+  %nop2839 = alloca i1, i1 0
+  %nop2840 = alloca i1, i1 0
+  %nop2841 = alloca i1, i1 0
+  %nop2842 = alloca i1, i1 0
+  %nop2843 = alloca i1, i1 0
+  %nop2844 = alloca i1, i1 0
+  %nop2845 = alloca i1, i1 0
+  %nop2846 = alloca i1, i1 0
+  %nop2847 = alloca i1, i1 0
+  %nop2848 = alloca i1, i1 0
+  %nop2849 = alloca i1, i1 0
+  %nop2850 = alloca i1, i1 0
+  %nop2851 = alloca i1, i1 0
+  %nop2852 = alloca i1, i1 0
+  %nop2853 = alloca i1, i1 0
+  %nop2854 = alloca i1, i1 0
+  %nop2855 = alloca i1, i1 0
+  %nop2856 = alloca i1, i1 0
+  %nop2857 = alloca i1, i1 0
+  %nop2858 = alloca i1, i1 0
+  %nop2859 = alloca i1, i1 0
+  %nop2860 = alloca i1, i1 0
+  %nop2861 = alloca i1, i1 0
+  %nop2862 = alloca i1, i1 0
+  %nop2863 = alloca i1, i1 0
+  %nop2864 = alloca i1, i1 0
+  %nop2865 = alloca i1, i1 0
+  %nop2866 = alloca i1, i1 0
+  %nop2867 = alloca i1, i1 0
+  %nop2868 = alloca i1, i1 0
+  %nop2869 = alloca i1, i1 0
+  %nop2870 = alloca i1, i1 0
+  %nop2871 = alloca i1, i1 0
+  %nop2872 = alloca i1, i1 0
+  %nop2873 = alloca i1, i1 0
+  %nop2874 = alloca i1, i1 0
+  %nop2875 = alloca i1, i1 0
+  %nop2876 = alloca i1, i1 0
+  %nop2877 = alloca i1, i1 0
+  %nop2878 = alloca i1, i1 0
+  %nop2879 = alloca i1, i1 0
+  %nop2880 = alloca i1, i1 0
+  %nop2881 = alloca i1, i1 0
+  %nop2882 = alloca i1, i1 0
+  %nop2883 = alloca i1, i1 0
+  %nop2884 = alloca i1, i1 0
+  %nop2885 = alloca i1, i1 0
+  %nop2886 = alloca i1, i1 0
+  %nop2887 = alloca i1, i1 0
+  %nop2888 = alloca i1, i1 0
+  %nop2889 = alloca i1, i1 0
+  %nop2890 = alloca i1, i1 0
+  %nop2891 = alloca i1, i1 0
+  %nop2892 = alloca i1, i1 0
+  %nop2893 = alloca i1, i1 0
+  %nop2894 = alloca i1, i1 0
+  %nop2895 = alloca i1, i1 0
+  %nop2896 = alloca i1, i1 0
+  %nop2897 = alloca i1, i1 0
+  %nop2898 = alloca i1, i1 0
+  %nop2899 = alloca i1, i1 0
+  %nop2900 = alloca i1, i1 0
+  %nop2901 = alloca i1, i1 0
+  %nop2902 = alloca i1, i1 0
+  %nop2903 = alloca i1, i1 0
+  %nop2904 = alloca i1, i1 0
+  %nop2905 = alloca i1, i1 0
+  %nop2906 = alloca i1, i1 0
+  %nop2907 = alloca i1, i1 0
+  %nop2908 = alloca i1, i1 0
+  %nop2909 = alloca i1, i1 0
+  %nop2910 = alloca i1, i1 0
+  %nop2911 = alloca i1, i1 0
+  %nop2912 = alloca i1, i1 0
+  %nop2913 = alloca i1, i1 0
+  %nop2914 = alloca i1, i1 0
+  %nop2915 = alloca i1, i1 0
+  %nop2916 = alloca i1, i1 0
+  %nop2917 = alloca i1, i1 0
+  %nop2918 = alloca i1, i1 0
+  %nop2919 = alloca i1, i1 0
+  %nop2920 = alloca i1, i1 0
+  %nop2921 = alloca i1, i1 0
+  %nop2922 = alloca i1, i1 0
+  %nop2923 = alloca i1, i1 0
+  %nop2924 = alloca i1, i1 0
+  %nop2925 = alloca i1, i1 0
+  %nop2926 = alloca i1, i1 0
+  %nop2927 = alloca i1, i1 0
+  %nop2928 = alloca i1, i1 0
+  %nop2929 = alloca i1, i1 0
+  %nop2930 = alloca i1, i1 0
+  %nop2931 = alloca i1, i1 0
+  %nop2932 = alloca i1, i1 0
+  %nop2933 = alloca i1, i1 0
+  %nop2934 = alloca i1, i1 0
+  %nop2935 = alloca i1, i1 0
+  %nop2936 = alloca i1, i1 0
+  %nop2937 = alloca i1, i1 0
+  %nop2938 = alloca i1, i1 0
+  %nop2939 = alloca i1, i1 0
+  %nop2940 = alloca i1, i1 0
+  %nop2941 = alloca i1, i1 0
+  %nop2942 = alloca i1, i1 0
+  %nop2943 = alloca i1, i1 0
+  %nop2944 = alloca i1, i1 0
+  %nop2945 = alloca i1, i1 0
+  %nop2946 = alloca i1, i1 0
+  %nop2947 = alloca i1, i1 0
+  %nop2948 = alloca i1, i1 0
+  %nop2949 = alloca i1, i1 0
+  %nop2950 = alloca i1, i1 0
+  %nop2951 = alloca i1, i1 0
+  %nop2952 = alloca i1, i1 0
+  %nop2953 = alloca i1, i1 0
+  %nop2954 = alloca i1, i1 0
+  %nop2955 = alloca i1, i1 0
+  %nop2956 = alloca i1, i1 0
+  %nop2957 = alloca i1, i1 0
+  %nop2958 = alloca i1, i1 0
+  %nop2959 = alloca i1, i1 0
+  %nop2960 = alloca i1, i1 0
+  %nop2961 = alloca i1, i1 0
+  %nop2962 = alloca i1, i1 0
+  %nop2963 = alloca i1, i1 0
+  %nop2964 = alloca i1, i1 0
+  %nop2965 = alloca i1, i1 0
+  %nop2966 = alloca i1, i1 0
+  %nop2967 = alloca i1, i1 0
+  %nop2968 = alloca i1, i1 0
+  %nop2969 = alloca i1, i1 0
+  %nop2970 = alloca i1, i1 0
+  %nop2971 = alloca i1, i1 0
+  %nop2972 = alloca i1, i1 0
+  %nop2973 = alloca i1, i1 0
+  %nop2974 = alloca i1, i1 0
+  %nop2975 = alloca i1, i1 0
+  %nop2976 = alloca i1, i1 0
+  %nop2977 = alloca i1, i1 0
+  %nop2978 = alloca i1, i1 0
+  %nop2979 = alloca i1, i1 0
+  %nop2980 = alloca i1, i1 0
+  %nop2981 = alloca i1, i1 0
+  %nop2982 = alloca i1, i1 0
+  %nop2983 = alloca i1, i1 0
+  %nop2984 = alloca i1, i1 0
+  %nop2985 = alloca i1, i1 0
+  %nop2986 = alloca i1, i1 0
+  %nop2987 = alloca i1, i1 0
+  %nop2988 = alloca i1, i1 0
+  %nop2989 = alloca i1, i1 0
+  %nop2990 = alloca i1, i1 0
+  %nop2991 = alloca i1, i1 0
+  %nop2992 = alloca i1, i1 0
+  %nop2993 = alloca i1, i1 0
+  %nop2994 = alloca i1, i1 0
+  %nop2995 = alloca i1, i1 0
+  %nop2996 = alloca i1, i1 0
+  %nop2997 = alloca i1, i1 0
+  %nop2998 = alloca i1, i1 0
+  %nop2999 = alloca i1, i1 0
+  %nop3000 = alloca i1, i1 0
+  %nop3001 = alloca i1, i1 0
+  %nop3002 = alloca i1, i1 0
+  %nop3003 = alloca i1, i1 0
+  %nop3004 = alloca i1, i1 0
+  %nop3005 = alloca i1, i1 0
+  %nop3006 = alloca i1, i1 0
+  %nop3007 = alloca i1, i1 0
+  %nop3008 = alloca i1, i1 0
+  %nop3009 = alloca i1, i1 0
+  %nop3010 = alloca i1, i1 0
+  %nop3011 = alloca i1, i1 0
+  %nop3012 = alloca i1, i1 0
+  %nop3013 = alloca i1, i1 0
+  %nop3014 = alloca i1, i1 0
+  %nop3015 = alloca i1, i1 0
+  %nop3016 = alloca i1, i1 0
+  %nop3017 = alloca i1, i1 0
+  %nop3018 = alloca i1, i1 0
+  %nop3019 = alloca i1, i1 0
+  %nop3020 = alloca i1, i1 0
+  %nop3021 = alloca i1, i1 0
+  %nop3022 = alloca i1, i1 0
+  %nop3023 = alloca i1, i1 0
+  %nop3024 = alloca i1, i1 0
+  %nop3025 = alloca i1, i1 0
+  %nop3026 = alloca i1, i1 0
+  %nop3027 = alloca i1, i1 0
+  %nop3028 = alloca i1, i1 0
+  %nop3029 = alloca i1, i1 0
+  %nop3030 = alloca i1, i1 0
+  %nop3031 = alloca i1, i1 0
+  %nop3032 = alloca i1, i1 0
+  %nop3033 = alloca i1, i1 0
+  %nop3034 = alloca i1, i1 0
+  %nop3035 = alloca i1, i1 0
+  %nop3036 = alloca i1, i1 0
+  %nop3037 = alloca i1, i1 0
+  %nop3038 = alloca i1, i1 0
+  %nop3039 = alloca i1, i1 0
+  %nop3040 = alloca i1, i1 0
+  %nop3041 = alloca i1, i1 0
+  %nop3042 = alloca i1, i1 0
+  %nop3043 = alloca i1, i1 0
+  %nop3044 = alloca i1, i1 0
+  %nop3045 = alloca i1, i1 0
+  %nop3046 = alloca i1, i1 0
+  %nop3047 = alloca i1, i1 0
+  %nop3048 = alloca i1, i1 0
+  %nop3049 = alloca i1, i1 0
+  %nop3050 = alloca i1, i1 0
+  %nop3051 = alloca i1, i1 0
+  %nop3052 = alloca i1, i1 0
+  %nop3053 = alloca i1, i1 0
+  %nop3054 = alloca i1, i1 0
+  %nop3055 = alloca i1, i1 0
+  %nop3056 = alloca i1, i1 0
+  %nop3057 = alloca i1, i1 0
+  %nop3058 = alloca i1, i1 0
+  %nop3059 = alloca i1, i1 0
+  %nop3060 = alloca i1, i1 0
+  %nop3061 = alloca i1, i1 0
+  %nop3062 = alloca i1, i1 0
+  %nop3063 = alloca i1, i1 0
+  %nop3064 = alloca i1, i1 0
+  %nop3065 = alloca i1, i1 0
+  %nop3066 = alloca i1, i1 0
+  %nop3067 = alloca i1, i1 0
+  %nop3068 = alloca i1, i1 0
+  %nop3069 = alloca i1, i1 0
+  %nop3070 = alloca i1, i1 0
+  %nop3071 = alloca i1, i1 0
+  %nop3072 = alloca i1, i1 0
+  %nop3073 = alloca i1, i1 0
+  %nop3074 = alloca i1, i1 0
+  %nop3075 = alloca i1, i1 0
+  %nop3076 = alloca i1, i1 0
+  %nop3077 = alloca i1, i1 0
+  %nop3078 = alloca i1, i1 0
+  %nop3079 = alloca i1, i1 0
+  %nop3080 = alloca i1, i1 0
+  %nop3081 = alloca i1, i1 0
+  %nop3082 = alloca i1, i1 0
+  %nop3083 = alloca i1, i1 0
+  %nop3084 = alloca i1, i1 0
+  %nop3085 = alloca i1, i1 0
+  %nop3086 = alloca i1, i1 0
+  %nop3087 = alloca i1, i1 0
+  %nop3088 = alloca i1, i1 0
+  %nop3089 = alloca i1, i1 0
+  %nop3090 = alloca i1, i1 0
+  %nop3091 = alloca i1, i1 0
+  %nop3092 = alloca i1, i1 0
+  %nop3093 = alloca i1, i1 0
+  %nop3094 = alloca i1, i1 0
+  %nop3095 = alloca i1, i1 0
+  %nop3096 = alloca i1, i1 0
+  %nop3097 = alloca i1, i1 0
+  %nop3098 = alloca i1, i1 0
+  %nop3099 = alloca i1, i1 0
+  %nop3100 = alloca i1, i1 0
+  %nop3101 = alloca i1, i1 0
+  %nop3102 = alloca i1, i1 0
+  %nop3103 = alloca i1, i1 0
+  %nop3104 = alloca i1, i1 0
+  %nop3105 = alloca i1, i1 0
+  %nop3106 = alloca i1, i1 0
+  %nop3107 = alloca i1, i1 0
+  %nop3108 = alloca i1, i1 0
+  %nop3109 = alloca i1, i1 0
+  %nop3110 = alloca i1, i1 0
+  %nop3111 = alloca i1, i1 0
+  %nop3112 = alloca i1, i1 0
+  %nop3113 = alloca i1, i1 0
+  %nop3114 = alloca i1, i1 0
+  %nop3115 = alloca i1, i1 0
+  %nop3116 = alloca i1, i1 0
+  %nop3117 = alloca i1, i1 0
+  %nop3118 = alloca i1, i1 0
+  %nop3119 = alloca i1, i1 0
+  %nop3120 = alloca i1, i1 0
+  %nop3121 = alloca i1, i1 0
+  %nop3122 = alloca i1, i1 0
+  %nop3123 = alloca i1, i1 0
+  %nop3124 = alloca i1, i1 0
+  %nop3125 = alloca i1, i1 0
+  %nop3126 = alloca i1, i1 0
+  %nop3127 = alloca i1, i1 0
+  %nop3128 = alloca i1, i1 0
+  %nop3129 = alloca i1, i1 0
+  %nop3130 = alloca i1, i1 0
+  %nop3131 = alloca i1, i1 0
+  %nop3132 = alloca i1, i1 0
+  %nop3133 = alloca i1, i1 0
+  %nop3134 = alloca i1, i1 0
+  %nop3135 = alloca i1, i1 0
+  %nop3136 = alloca i1, i1 0
+  %nop3137 = alloca i1, i1 0
+  %nop3138 = alloca i1, i1 0
+  %nop3139 = alloca i1, i1 0
+  %nop3140 = alloca i1, i1 0
+  %nop3141 = alloca i1, i1 0
+  %nop3142 = alloca i1, i1 0
+  %nop3143 = alloca i1, i1 0
+  %nop3144 = alloca i1, i1 0
+  %nop3145 = alloca i1, i1 0
+  %nop3146 = alloca i1, i1 0
+  %nop3147 = alloca i1, i1 0
+  %nop3148 = alloca i1, i1 0
+  %nop3149 = alloca i1, i1 0
+  %nop3150 = alloca i1, i1 0
+  %nop3151 = alloca i1, i1 0
+  %nop3152 = alloca i1, i1 0
+  %nop3153 = alloca i1, i1 0
+  %nop3154 = alloca i1, i1 0
+  %nop3155 = alloca i1, i1 0
+  %nop3156 = alloca i1, i1 0
+  %nop3157 = alloca i1, i1 0
+  %nop3158 = alloca i1, i1 0
+  %nop3159 = alloca i1, i1 0
+  %nop3160 = alloca i1, i1 0
+  %nop3161 = alloca i1, i1 0
+  %nop3162 = alloca i1, i1 0
+  %nop3163 = alloca i1, i1 0
+  %nop3164 = alloca i1, i1 0
+  %nop3165 = alloca i1, i1 0
+  %nop3166 = alloca i1, i1 0
+  %nop3167 = alloca i1, i1 0
+  %nop3168 = alloca i1, i1 0
+  %nop3169 = alloca i1, i1 0
+  %nop3170 = alloca i1, i1 0
+  %nop3171 = alloca i1, i1 0
+  %nop3172 = alloca i1, i1 0
+  %nop3173 = alloca i1, i1 0
+  %nop3174 = alloca i1, i1 0
+  %nop3175 = alloca i1, i1 0
+  %nop3176 = alloca i1, i1 0
+  %nop3177 = alloca i1, i1 0
+  %nop3178 = alloca i1, i1 0
+  %nop3179 = alloca i1, i1 0
+  %nop3180 = alloca i1, i1 0
+  %nop3181 = alloca i1, i1 0
+  %nop3182 = alloca i1, i1 0
+  %nop3183 = alloca i1, i1 0
+  %nop3184 = alloca i1, i1 0
+  %nop3185 = alloca i1, i1 0
+  %nop3186 = alloca i1, i1 0
+  %nop3187 = alloca i1, i1 0
+  %nop3188 = alloca i1, i1 0
+  %nop3189 = alloca i1, i1 0
+  %nop3190 = alloca i1, i1 0
+  %nop3191 = alloca i1, i1 0
+  %nop3192 = alloca i1, i1 0
+  %nop3193 = alloca i1, i1 0
+  %nop3194 = alloca i1, i1 0
+  %nop3195 = alloca i1, i1 0
+  %nop3196 = alloca i1, i1 0
+  %nop3197 = alloca i1, i1 0
+  %nop3198 = alloca i1, i1 0
+  %nop3199 = alloca i1, i1 0
+  %nop3200 = alloca i1, i1 0
+  %nop3201 = alloca i1, i1 0
+  %nop3202 = alloca i1, i1 0
+  %nop3203 = alloca i1, i1 0
+  %nop3204 = alloca i1, i1 0
+  %nop3205 = alloca i1, i1 0
+  %nop3206 = alloca i1, i1 0
+  %nop3207 = alloca i1, i1 0
+  %nop3208 = alloca i1, i1 0
+  %nop3209 = alloca i1, i1 0
+  %nop3210 = alloca i1, i1 0
+  %nop3211 = alloca i1, i1 0
+  %nop3212 = alloca i1, i1 0
+  %nop3213 = alloca i1, i1 0
+  %nop3214 = alloca i1, i1 0
+  %nop3215 = alloca i1, i1 0
+  %nop3216 = alloca i1, i1 0
+  %nop3217 = alloca i1, i1 0
+  %nop3218 = alloca i1, i1 0
+  %nop3219 = alloca i1, i1 0
+  %nop3220 = alloca i1, i1 0
+  %nop3221 = alloca i1, i1 0
+  %nop3222 = alloca i1, i1 0
+  %nop3223 = alloca i1, i1 0
+  %nop3224 = alloca i1, i1 0
+  %nop3225 = alloca i1, i1 0
+  %nop3226 = alloca i1, i1 0
+  %nop3227 = alloca i1, i1 0
+  %nop3228 = alloca i1, i1 0
+  %nop3229 = alloca i1, i1 0
+  %nop3230 = alloca i1, i1 0
+  %nop3231 = alloca i1, i1 0
+  %nop3232 = alloca i1, i1 0
+  %nop3233 = alloca i1, i1 0
+  %nop3234 = alloca i1, i1 0
+  %nop3235 = alloca i1, i1 0
+  %nop3236 = alloca i1, i1 0
+  %nop3237 = alloca i1, i1 0
+  %nop3238 = alloca i1, i1 0
+  %nop3239 = alloca i1, i1 0
+  %nop3240 = alloca i1, i1 0
+  %nop3241 = alloca i1, i1 0
+  %nop3242 = alloca i1, i1 0
+  %nop3243 = alloca i1, i1 0
+  %nop3244 = alloca i1, i1 0
+  %nop3245 = alloca i1, i1 0
+  %nop3246 = alloca i1, i1 0
+  %nop3247 = alloca i1, i1 0
+  %nop3248 = alloca i1, i1 0
+  %nop3249 = alloca i1, i1 0
+  %nop3250 = alloca i1, i1 0
+  %nop3251 = alloca i1, i1 0
+  %nop3252 = alloca i1, i1 0
+  %nop3253 = alloca i1, i1 0
+  %nop3254 = alloca i1, i1 0
+  %nop3255 = alloca i1, i1 0
+  %nop3256 = alloca i1, i1 0
+  %nop3257 = alloca i1, i1 0
+  %nop3258 = alloca i1, i1 0
+  %nop3259 = alloca i1, i1 0
+  %nop3260 = alloca i1, i1 0
+  %nop3261 = alloca i1, i1 0
+  %nop3262 = alloca i1, i1 0
+  %nop3263 = alloca i1, i1 0
+  %nop3264 = alloca i1, i1 0
+  %nop3265 = alloca i1, i1 0
+  %nop3266 = alloca i1, i1 0
+  %nop3267 = alloca i1, i1 0
+  %nop3268 = alloca i1, i1 0
+  %nop3269 = alloca i1, i1 0
+  %nop3270 = alloca i1, i1 0
+  %nop3271 = alloca i1, i1 0
+  %nop3272 = alloca i1, i1 0
+  %nop3273 = alloca i1, i1 0
+  %nop3274 = alloca i1, i1 0
+  %nop3275 = alloca i1, i1 0
+  %nop3276 = alloca i1, i1 0
+  %nop3277 = alloca i1, i1 0
+  %nop3278 = alloca i1, i1 0
+  %nop3279 = alloca i1, i1 0
+  %nop3280 = alloca i1, i1 0
+  %nop3281 = alloca i1, i1 0
+  %nop3282 = alloca i1, i1 0
+  %nop3283 = alloca i1, i1 0
+  %nop3284 = alloca i1, i1 0
+  %nop3285 = alloca i1, i1 0
+  %nop3286 = alloca i1, i1 0
+  %nop3287 = alloca i1, i1 0
+  %nop3288 = alloca i1, i1 0
+  %nop3289 = alloca i1, i1 0
+  %nop3290 = alloca i1, i1 0
+  %nop3291 = alloca i1, i1 0
+  %nop3292 = alloca i1, i1 0
+  %nop3293 = alloca i1, i1 0
+  %nop3294 = alloca i1, i1 0
+  %nop3295 = alloca i1, i1 0
+  %nop3296 = alloca i1, i1 0
+  %nop3297 = alloca i1, i1 0
+  %nop3298 = alloca i1, i1 0
+  %nop3299 = alloca i1, i1 0
+  %nop3300 = alloca i1, i1 0
+  %nop3301 = alloca i1, i1 0
+  %nop3302 = alloca i1, i1 0
+  %nop3303 = alloca i1, i1 0
+  %nop3304 = alloca i1, i1 0
+  %nop3305 = alloca i1, i1 0
+  %nop3306 = alloca i1, i1 0
+  %nop3307 = alloca i1, i1 0
+  %nop3308 = alloca i1, i1 0
+  %nop3309 = alloca i1, i1 0
+  %nop3310 = alloca i1, i1 0
+  %nop3311 = alloca i1, i1 0
+  %nop3312 = alloca i1, i1 0
+  %nop3313 = alloca i1, i1 0
+  %nop3314 = alloca i1, i1 0
+  %nop3315 = alloca i1, i1 0
+  %nop3316 = alloca i1, i1 0
+  %nop3317 = alloca i1, i1 0
+  %nop3318 = alloca i1, i1 0
+  %nop3319 = alloca i1, i1 0
+  %nop3320 = alloca i1, i1 0
+  %nop3321 = alloca i1, i1 0
+  %nop3322 = alloca i1, i1 0
+  %nop3323 = alloca i1, i1 0
+  %nop3324 = alloca i1, i1 0
+  %nop3325 = alloca i1, i1 0
+  %nop3326 = alloca i1, i1 0
+  %nop3327 = alloca i1, i1 0
+  %nop3328 = alloca i1, i1 0
+  %nop3329 = alloca i1, i1 0
+  %nop3330 = alloca i1, i1 0
+  %nop3331 = alloca i1, i1 0
+  %nop3332 = alloca i1, i1 0
+  %nop3333 = alloca i1, i1 0
+  %nop3334 = alloca i1, i1 0
+  %nop3335 = alloca i1, i1 0
+  %nop3336 = alloca i1, i1 0
+  %nop3337 = alloca i1, i1 0
+  %nop3338 = alloca i1, i1 0
+  %nop3339 = alloca i1, i1 0
+  %nop3340 = alloca i1, i1 0
+  %nop3341 = alloca i1, i1 0
+  %nop3342 = alloca i1, i1 0
+  %nop3343 = alloca i1, i1 0
+  %nop3344 = alloca i1, i1 0
+  %nop3345 = alloca i1, i1 0
+  %nop3346 = alloca i1, i1 0
+  %nop3347 = alloca i1, i1 0
+  %nop3348 = alloca i1, i1 0
+  %nop3349 = alloca i1, i1 0
+  %nop3350 = alloca i1, i1 0
+  %nop3351 = alloca i1, i1 0
+  %nop3352 = alloca i1, i1 0
+  %nop3353 = alloca i1, i1 0
+  %nop3354 = alloca i1, i1 0
+  %nop3355 = alloca i1, i1 0
+  %nop3356 = alloca i1, i1 0
+  %nop3357 = alloca i1, i1 0
+  %nop3358 = alloca i1, i1 0
+  %nop3359 = alloca i1, i1 0
+  %nop3360 = alloca i1, i1 0
+  %nop3361 = alloca i1, i1 0
+  %nop3362 = alloca i1, i1 0
+  %nop3363 = alloca i1, i1 0
+  %nop3364 = alloca i1, i1 0
+  %nop3365 = alloca i1, i1 0
+  %nop3366 = alloca i1, i1 0
+  %nop3367 = alloca i1, i1 0
+  %nop3368 = alloca i1, i1 0
+  %nop3369 = alloca i1, i1 0
+  %nop3370 = alloca i1, i1 0
+  %nop3371 = alloca i1, i1 0
+  %nop3372 = alloca i1, i1 0
+  %nop3373 = alloca i1, i1 0
+  %nop3374 = alloca i1, i1 0
+  %nop3375 = alloca i1, i1 0
+  %nop3376 = alloca i1, i1 0
+  %nop3377 = alloca i1, i1 0
+  %nop3378 = alloca i1, i1 0
+  %nop3379 = alloca i1, i1 0
+  %nop3380 = alloca i1, i1 0
+  %nop3381 = alloca i1, i1 0
+  %nop3382 = alloca i1, i1 0
+  %nop3383 = alloca i1, i1 0
+  %nop3384 = alloca i1, i1 0
+  %nop3385 = alloca i1, i1 0
+  %nop3386 = alloca i1, i1 0
+  %nop3387 = alloca i1, i1 0
+  %nop3388 = alloca i1, i1 0
+  %nop3389 = alloca i1, i1 0
+  %nop3390 = alloca i1, i1 0
+  %nop3391 = alloca i1, i1 0
+  %nop3392 = alloca i1, i1 0
+  %nop3393 = alloca i1, i1 0
+  %nop3394 = alloca i1, i1 0
+  %nop3395 = alloca i1, i1 0
+  %nop3396 = alloca i1, i1 0
+  %nop3397 = alloca i1, i1 0
+  %nop3398 = alloca i1, i1 0
+  %nop3399 = alloca i1, i1 0
+  %nop3400 = alloca i1, i1 0
+  %nop3401 = alloca i1, i1 0
+  %nop3402 = alloca i1, i1 0
+  %nop3403 = alloca i1, i1 0
+  %nop3404 = alloca i1, i1 0
+  %nop3405 = alloca i1, i1 0
+  %nop3406 = alloca i1, i1 0
+  %nop3407 = alloca i1, i1 0
+  %nop3408 = alloca i1, i1 0
+  %nop3409 = alloca i1, i1 0
+  %nop3410 = alloca i1, i1 0
+  %nop3411 = alloca i1, i1 0
+  %nop3412 = alloca i1, i1 0
+  %nop3413 = alloca i1, i1 0
+  %nop3414 = alloca i1, i1 0
+  %nop3415 = alloca i1, i1 0
+  %nop3416 = alloca i1, i1 0
+  %nop3417 = alloca i1, i1 0
+  %nop3418 = alloca i1, i1 0
+  %nop3419 = alloca i1, i1 0
+  %nop3420 = alloca i1, i1 0
+  %nop3421 = alloca i1, i1 0
+  %nop3422 = alloca i1, i1 0
+  %nop3423 = alloca i1, i1 0
+  %nop3424 = alloca i1, i1 0
+  %nop3425 = alloca i1, i1 0
+  %nop3426 = alloca i1, i1 0
+  %nop3427 = alloca i1, i1 0
+  %nop3428 = alloca i1, i1 0
+  %nop3429 = alloca i1, i1 0
+  %nop3430 = alloca i1, i1 0
+  %nop3431 = alloca i1, i1 0
+  %nop3432 = alloca i1, i1 0
+  %nop3433 = alloca i1, i1 0
+  %nop3434 = alloca i1, i1 0
+  %nop3435 = alloca i1, i1 0
+  %nop3436 = alloca i1, i1 0
+  %nop3437 = alloca i1, i1 0
+  %nop3438 = alloca i1, i1 0
+  %nop3439 = alloca i1, i1 0
+  %nop3440 = alloca i1, i1 0
+  %nop3441 = alloca i1, i1 0
+  %nop3442 = alloca i1, i1 0
+  %nop3443 = alloca i1, i1 0
+  %nop3444 = alloca i1, i1 0
+  %nop3445 = alloca i1, i1 0
+  %nop3446 = alloca i1, i1 0
+  %nop3447 = alloca i1, i1 0
+  %nop3448 = alloca i1, i1 0
+  %nop3449 = alloca i1, i1 0
+  %nop3450 = alloca i1, i1 0
+  %nop3451 = alloca i1, i1 0
+  %nop3452 = alloca i1, i1 0
+  %nop3453 = alloca i1, i1 0
+  %nop3454 = alloca i1, i1 0
+  %nop3455 = alloca i1, i1 0
+  %nop3456 = alloca i1, i1 0
+  %nop3457 = alloca i1, i1 0
+  %nop3458 = alloca i1, i1 0
+  %nop3459 = alloca i1, i1 0
+  %nop3460 = alloca i1, i1 0
+  %nop3461 = alloca i1, i1 0
+  %nop3462 = alloca i1, i1 0
+  %nop3463 = alloca i1, i1 0
+  %nop3464 = alloca i1, i1 0
+  %nop3465 = alloca i1, i1 0
+  %nop3466 = alloca i1, i1 0
+  %nop3467 = alloca i1, i1 0
+  %nop3468 = alloca i1, i1 0
+  %nop3469 = alloca i1, i1 0
+  %nop3470 = alloca i1, i1 0
+  %nop3471 = alloca i1, i1 0
+  %nop3472 = alloca i1, i1 0
+  %nop3473 = alloca i1, i1 0
+  %nop3474 = alloca i1, i1 0
+  %nop3475 = alloca i1, i1 0
+  %nop3476 = alloca i1, i1 0
+  %nop3477 = alloca i1, i1 0
+  %nop3478 = alloca i1, i1 0
+  %nop3479 = alloca i1, i1 0
+  %nop3480 = alloca i1, i1 0
+  %nop3481 = alloca i1, i1 0
+  %nop3482 = alloca i1, i1 0
+  %nop3483 = alloca i1, i1 0
+  %nop3484 = alloca i1, i1 0
+  %nop3485 = alloca i1, i1 0
+  %nop3486 = alloca i1, i1 0
+  %nop3487 = alloca i1, i1 0
+  %nop3488 = alloca i1, i1 0
+  %nop3489 = alloca i1, i1 0
+  %nop3490 = alloca i1, i1 0
+  %nop3491 = alloca i1, i1 0
+  %nop3492 = alloca i1, i1 0
+  %nop3493 = alloca i1, i1 0
+  %nop3494 = alloca i1, i1 0
+  %nop3495 = alloca i1, i1 0
+  %nop3496 = alloca i1, i1 0
+  %nop3497 = alloca i1, i1 0
+  %nop3498 = alloca i1, i1 0
+  %nop3499 = alloca i1, i1 0
+  %nop3500 = alloca i1, i1 0
+  %nop3501 = alloca i1, i1 0
+  %nop3502 = alloca i1, i1 0
+  %nop3503 = alloca i1, i1 0
+  %nop3504 = alloca i1, i1 0
+  %nop3505 = alloca i1, i1 0
+  %nop3506 = alloca i1, i1 0
+  %nop3507 = alloca i1, i1 0
+  %nop3508 = alloca i1, i1 0
+  %nop3509 = alloca i1, i1 0
+  %nop3510 = alloca i1, i1 0
+  %nop3511 = alloca i1, i1 0
+  %nop3512 = alloca i1, i1 0
+  %nop3513 = alloca i1, i1 0
+  %nop3514 = alloca i1, i1 0
+  %nop3515 = alloca i1, i1 0
+  %nop3516 = alloca i1, i1 0
+  %nop3517 = alloca i1, i1 0
+  %nop3518 = alloca i1, i1 0
+  %nop3519 = alloca i1, i1 0
+  %nop3520 = alloca i1, i1 0
+  %nop3521 = alloca i1, i1 0
+  %nop3522 = alloca i1, i1 0
+  %nop3523 = alloca i1, i1 0
+  %nop3524 = alloca i1, i1 0
+  %nop3525 = alloca i1, i1 0
+  %nop3526 = alloca i1, i1 0
+  %nop3527 = alloca i1, i1 0
+  %nop3528 = alloca i1, i1 0
+  %nop3529 = alloca i1, i1 0
+  %nop3530 = alloca i1, i1 0
+  %nop3531 = alloca i1, i1 0
+  %nop3532 = alloca i1, i1 0
+  %nop3533 = alloca i1, i1 0
+  %nop3534 = alloca i1, i1 0
+  %nop3535 = alloca i1, i1 0
+  %nop3536 = alloca i1, i1 0
+  %nop3537 = alloca i1, i1 0
+  %nop3538 = alloca i1, i1 0
+  %nop3539 = alloca i1, i1 0
+  %nop3540 = alloca i1, i1 0
+  %nop3541 = alloca i1, i1 0
+  %nop3542 = alloca i1, i1 0
+  %nop3543 = alloca i1, i1 0
+  %nop3544 = alloca i1, i1 0
+  %nop3545 = alloca i1, i1 0
+  %nop3546 = alloca i1, i1 0
+  %nop3547 = alloca i1, i1 0
+  %nop3548 = alloca i1, i1 0
+  %nop3549 = alloca i1, i1 0
+  %nop3550 = alloca i1, i1 0
+  %nop3551 = alloca i1, i1 0
+  %nop3552 = alloca i1, i1 0
+  %nop3553 = alloca i1, i1 0
+  %nop3554 = alloca i1, i1 0
+  %nop3555 = alloca i1, i1 0
+  %nop3556 = alloca i1, i1 0
+  %nop3557 = alloca i1, i1 0
+  %nop3558 = alloca i1, i1 0
+  %nop3559 = alloca i1, i1 0
+  %nop3560 = alloca i1, i1 0
+  %nop3561 = alloca i1, i1 0
+  %nop3562 = alloca i1, i1 0
+  %nop3563 = alloca i1, i1 0
+  %nop3564 = alloca i1, i1 0
+  %nop3565 = alloca i1, i1 0
+  %nop3566 = alloca i1, i1 0
+  %nop3567 = alloca i1, i1 0
+  %nop3568 = alloca i1, i1 0
+  %nop3569 = alloca i1, i1 0
+  %nop3570 = alloca i1, i1 0
+  %nop3571 = alloca i1, i1 0
+  %nop3572 = alloca i1, i1 0
+  %nop3573 = alloca i1, i1 0
+  %nop3574 = alloca i1, i1 0
+  %nop3575 = alloca i1, i1 0
+  %nop3576 = alloca i1, i1 0
+  %nop3577 = alloca i1, i1 0
+  %nop3578 = alloca i1, i1 0
+  %nop3579 = alloca i1, i1 0
+  %nop3580 = alloca i1, i1 0
+  %nop3581 = alloca i1, i1 0
+  %nop3582 = alloca i1, i1 0
+  %nop3583 = alloca i1, i1 0
+  %nop3584 = alloca i1, i1 0
+  %nop3585 = alloca i1, i1 0
+  %nop3586 = alloca i1, i1 0
+  %nop3587 = alloca i1, i1 0
+  %nop3588 = alloca i1, i1 0
+  %nop3589 = alloca i1, i1 0
+  %nop3590 = alloca i1, i1 0
+  %nop3591 = alloca i1, i1 0
+  %nop3592 = alloca i1, i1 0
+  %nop3593 = alloca i1, i1 0
+  %nop3594 = alloca i1, i1 0
+  %nop3595 = alloca i1, i1 0
+  %nop3596 = alloca i1, i1 0
+  %nop3597 = alloca i1, i1 0
+  %nop3598 = alloca i1, i1 0
+  %nop3599 = alloca i1, i1 0
+  %nop3600 = alloca i1, i1 0
+  %nop3601 = alloca i1, i1 0
+  %nop3602 = alloca i1, i1 0
+  %nop3603 = alloca i1, i1 0
+  %nop3604 = alloca i1, i1 0
+  %nop3605 = alloca i1, i1 0
+  %nop3606 = alloca i1, i1 0
+  %nop3607 = alloca i1, i1 0
+  %nop3608 = alloca i1, i1 0
+  %nop3609 = alloca i1, i1 0
+  %nop3610 = alloca i1, i1 0
+  %nop3611 = alloca i1, i1 0
+  %nop3612 = alloca i1, i1 0
+  %nop3613 = alloca i1, i1 0
+  %nop3614 = alloca i1, i1 0
+  %nop3615 = alloca i1, i1 0
+  %nop3616 = alloca i1, i1 0
+  %nop3617 = alloca i1, i1 0
+  %nop3618 = alloca i1, i1 0
+  %nop3619 = alloca i1, i1 0
+  %nop3620 = alloca i1, i1 0
+  %nop3621 = alloca i1, i1 0
+  %nop3622 = alloca i1, i1 0
+  %nop3623 = alloca i1, i1 0
+  %nop3624 = alloca i1, i1 0
+  %nop3625 = alloca i1, i1 0
+  %nop3626 = alloca i1, i1 0
+  %nop3627 = alloca i1, i1 0
+  %nop3628 = alloca i1, i1 0
+  %nop3629 = alloca i1, i1 0
+  %nop3630 = alloca i1, i1 0
+  %nop3631 = alloca i1, i1 0
+  %nop3632 = alloca i1, i1 0
+  %nop3633 = alloca i1, i1 0
+  %nop3634 = alloca i1, i1 0
+  %nop3635 = alloca i1, i1 0
+  %nop3636 = alloca i1, i1 0
+  %nop3637 = alloca i1, i1 0
+  %nop3638 = alloca i1, i1 0
+  %nop3639 = alloca i1, i1 0
+  %nop3640 = alloca i1, i1 0
+  %nop3641 = alloca i1, i1 0
+  %nop3642 = alloca i1, i1 0
+  %nop3643 = alloca i1, i1 0
+  %nop3644 = alloca i1, i1 0
+  %nop3645 = alloca i1, i1 0
+  %nop3646 = alloca i1, i1 0
+  %nop3647 = alloca i1, i1 0
+  %nop3648 = alloca i1, i1 0
+  %nop3649 = alloca i1, i1 0
+  %nop3650 = alloca i1, i1 0
+  %nop3651 = alloca i1, i1 0
+  %nop3652 = alloca i1, i1 0
+  %nop3653 = alloca i1, i1 0
+  %nop3654 = alloca i1, i1 0
+  %nop3655 = alloca i1, i1 0
+  %nop3656 = alloca i1, i1 0
+  %nop3657 = alloca i1, i1 0
+  %nop3658 = alloca i1, i1 0
+  %nop3659 = alloca i1, i1 0
+  %nop3660 = alloca i1, i1 0
+  %nop3661 = alloca i1, i1 0
+  %nop3662 = alloca i1, i1 0
+  %nop3663 = alloca i1, i1 0
+  %nop3664 = alloca i1, i1 0
+  %nop3665 = alloca i1, i1 0
+  %nop3666 = alloca i1, i1 0
+  %nop3667 = alloca i1, i1 0
+  %nop3668 = alloca i1, i1 0
+  %nop3669 = alloca i1, i1 0
+  %nop3670 = alloca i1, i1 0
+  %nop3671 = alloca i1, i1 0
+  %nop3672 = alloca i1, i1 0
+  %nop3673 = alloca i1, i1 0
+  %nop3674 = alloca i1, i1 0
+  %nop3675 = alloca i1, i1 0
+  %nop3676 = alloca i1, i1 0
+  %nop3677 = alloca i1, i1 0
+  %nop3678 = alloca i1, i1 0
+  %nop3679 = alloca i1, i1 0
+  %nop3680 = alloca i1, i1 0
+  %nop3681 = alloca i1, i1 0
+  %nop3682 = alloca i1, i1 0
+  %nop3683 = alloca i1, i1 0
+  %nop3684 = alloca i1, i1 0
+  %nop3685 = alloca i1, i1 0
+  %nop3686 = alloca i1, i1 0
+  %nop3687 = alloca i1, i1 0
+  %nop3688 = alloca i1, i1 0
+  %nop3689 = alloca i1, i1 0
+  %nop3690 = alloca i1, i1 0
+  %nop3691 = alloca i1, i1 0
+  %nop3692 = alloca i1, i1 0
+  %nop3693 = alloca i1, i1 0
+  %nop3694 = alloca i1, i1 0
+  %nop3695 = alloca i1, i1 0
+  %nop3696 = alloca i1, i1 0
+  %nop3697 = alloca i1, i1 0
+  %nop3698 = alloca i1, i1 0
+  %nop3699 = alloca i1, i1 0
+  %nop3700 = alloca i1, i1 0
+  %nop3701 = alloca i1, i1 0
+  %nop3702 = alloca i1, i1 0
+  %nop3703 = alloca i1, i1 0
+  %nop3704 = alloca i1, i1 0
+  %nop3705 = alloca i1, i1 0
+  %nop3706 = alloca i1, i1 0
+  %nop3707 = alloca i1, i1 0
+  %nop3708 = alloca i1, i1 0
+  %nop3709 = alloca i1, i1 0
+  %nop3710 = alloca i1, i1 0
+  %nop3711 = alloca i1, i1 0
+  %nop3712 = alloca i1, i1 0
+  %nop3713 = alloca i1, i1 0
+  %nop3714 = alloca i1, i1 0
+  %nop3715 = alloca i1, i1 0
+  %nop3716 = alloca i1, i1 0
+  %nop3717 = alloca i1, i1 0
+  %nop3718 = alloca i1, i1 0
+  %nop3719 = alloca i1, i1 0
+  %nop3720 = alloca i1, i1 0
+  %nop3721 = alloca i1, i1 0
+  %nop3722 = alloca i1, i1 0
+  %nop3723 = alloca i1, i1 0
+  %nop3724 = alloca i1, i1 0
+  %nop3725 = alloca i1, i1 0
+  %nop3726 = alloca i1, i1 0
+  %nop3727 = alloca i1, i1 0
+  %nop3728 = alloca i1, i1 0
+  %nop3729 = alloca i1, i1 0
+  %nop3730 = alloca i1, i1 0
+  %nop3731 = alloca i1, i1 0
+  %nop3732 = alloca i1, i1 0
+  %nop3733 = alloca i1, i1 0
+  %nop3734 = alloca i1, i1 0
+  %nop3735 = alloca i1, i1 0
+  %nop3736 = alloca i1, i1 0
+  %nop3737 = alloca i1, i1 0
+  %nop3738 = alloca i1, i1 0
+  %nop3739 = alloca i1, i1 0
+  %nop3740 = alloca i1, i1 0
+  %nop3741 = alloca i1, i1 0
+  %nop3742 = alloca i1, i1 0
+  %nop3743 = alloca i1, i1 0
+  %nop3744 = alloca i1, i1 0
+  %nop3745 = alloca i1, i1 0
+  %nop3746 = alloca i1, i1 0
+  %nop3747 = alloca i1, i1 0
+  %nop3748 = alloca i1, i1 0
+  %nop3749 = alloca i1, i1 0
+  %nop3750 = alloca i1, i1 0
+  %nop3751 = alloca i1, i1 0
+  %nop3752 = alloca i1, i1 0
+  %nop3753 = alloca i1, i1 0
+  %nop3754 = alloca i1, i1 0
+  %nop3755 = alloca i1, i1 0
+  %nop3756 = alloca i1, i1 0
+  %nop3757 = alloca i1, i1 0
+  %nop3758 = alloca i1, i1 0
+  %nop3759 = alloca i1, i1 0
+  %nop3760 = alloca i1, i1 0
+  %nop3761 = alloca i1, i1 0
+  %nop3762 = alloca i1, i1 0
+  %nop3763 = alloca i1, i1 0
+  %nop3764 = alloca i1, i1 0
+  %nop3765 = alloca i1, i1 0
+  %nop3766 = alloca i1, i1 0
+  %nop3767 = alloca i1, i1 0
+  %nop3768 = alloca i1, i1 0
+  %nop3769 = alloca i1, i1 0
+  %nop3770 = alloca i1, i1 0
+  %nop3771 = alloca i1, i1 0
+  %nop3772 = alloca i1, i1 0
+  %nop3773 = alloca i1, i1 0
+  %nop3774 = alloca i1, i1 0
+  %nop3775 = alloca i1, i1 0
+  %nop3776 = alloca i1, i1 0
+  %nop3777 = alloca i1, i1 0
+  %nop3778 = alloca i1, i1 0
+  %nop3779 = alloca i1, i1 0
+  %nop3780 = alloca i1, i1 0
+  %nop3781 = alloca i1, i1 0
+  %nop3782 = alloca i1, i1 0
+  %nop3783 = alloca i1, i1 0
+  %nop3784 = alloca i1, i1 0
+  %nop3785 = alloca i1, i1 0
+  %nop3786 = alloca i1, i1 0
+  %nop3787 = alloca i1, i1 0
+  %nop3788 = alloca i1, i1 0
+  %nop3789 = alloca i1, i1 0
+  %nop3790 = alloca i1, i1 0
+  %nop3791 = alloca i1, i1 0
+  %nop3792 = alloca i1, i1 0
+  %nop3793 = alloca i1, i1 0
+  %nop3794 = alloca i1, i1 0
+  %nop3795 = alloca i1, i1 0
+  %nop3796 = alloca i1, i1 0
+  %nop3797 = alloca i1, i1 0
+  %nop3798 = alloca i1, i1 0
+  %nop3799 = alloca i1, i1 0
+  %nop3800 = alloca i1, i1 0
+  %nop3801 = alloca i1, i1 0
+  %nop3802 = alloca i1, i1 0
+  %nop3803 = alloca i1, i1 0
+  %nop3804 = alloca i1, i1 0
+  %nop3805 = alloca i1, i1 0
+  %nop3806 = alloca i1, i1 0
+  %nop3807 = alloca i1, i1 0
+  %nop3808 = alloca i1, i1 0
+  %nop3809 = alloca i1, i1 0
+  %nop3810 = alloca i1, i1 0
+  %nop3811 = alloca i1, i1 0
+  %nop3812 = alloca i1, i1 0
+  %nop3813 = alloca i1, i1 0
+  %nop3814 = alloca i1, i1 0
+  %nop3815 = alloca i1, i1 0
+  %nop3816 = alloca i1, i1 0
+  %nop3817 = alloca i1, i1 0
+  %nop3818 = alloca i1, i1 0
+  %nop3819 = alloca i1, i1 0
+  %nop3820 = alloca i1, i1 0
+  %nop3821 = alloca i1, i1 0
+  %nop3822 = alloca i1, i1 0
+  %nop3823 = alloca i1, i1 0
+  %nop3824 = alloca i1, i1 0
+  %nop3825 = alloca i1, i1 0
+  %nop3826 = alloca i1, i1 0
+  %nop3827 = alloca i1, i1 0
+  %nop3828 = alloca i1, i1 0
+  %nop3829 = alloca i1, i1 0
+  %nop3830 = alloca i1, i1 0
+  %nop3831 = alloca i1, i1 0
+  %nop3832 = alloca i1, i1 0
+  %nop3833 = alloca i1, i1 0
+  %nop3834 = alloca i1, i1 0
+  %nop3835 = alloca i1, i1 0
+  %nop3836 = alloca i1, i1 0
+  %nop3837 = alloca i1, i1 0
+  %nop3838 = alloca i1, i1 0
+  %nop3839 = alloca i1, i1 0
+  %nop3840 = alloca i1, i1 0
+  %nop3841 = alloca i1, i1 0
+  %nop3842 = alloca i1, i1 0
+  %nop3843 = alloca i1, i1 0
+  %nop3844 = alloca i1, i1 0
+  %nop3845 = alloca i1, i1 0
+  %nop3846 = alloca i1, i1 0
+  %nop3847 = alloca i1, i1 0
+  %nop3848 = alloca i1, i1 0
+  %nop3849 = alloca i1, i1 0
+  %nop3850 = alloca i1, i1 0
+  %nop3851 = alloca i1, i1 0
+  %nop3852 = alloca i1, i1 0
+  %nop3853 = alloca i1, i1 0
+  %nop3854 = alloca i1, i1 0
+  %nop3855 = alloca i1, i1 0
+  %nop3856 = alloca i1, i1 0
+  %nop3857 = alloca i1, i1 0
+  %nop3858 = alloca i1, i1 0
+  %nop3859 = alloca i1, i1 0
+  %nop3860 = alloca i1, i1 0
+  %nop3861 = alloca i1, i1 0
+  %nop3862 = alloca i1, i1 0
+  %nop3863 = alloca i1, i1 0
+  %nop3864 = alloca i1, i1 0
+  %nop3865 = alloca i1, i1 0
+  %nop3866 = alloca i1, i1 0
+  %nop3867 = alloca i1, i1 0
+  %nop3868 = alloca i1, i1 0
+  %nop3869 = alloca i1, i1 0
+  %nop3870 = alloca i1, i1 0
+  %nop3871 = alloca i1, i1 0
+  %nop3872 = alloca i1, i1 0
+  %nop3873 = alloca i1, i1 0
+  %nop3874 = alloca i1, i1 0
+  %nop3875 = alloca i1, i1 0
+  %nop3876 = alloca i1, i1 0
+  %nop3877 = alloca i1, i1 0
+  %nop3878 = alloca i1, i1 0
+  %nop3879 = alloca i1, i1 0
+  %nop3880 = alloca i1, i1 0
+  %nop3881 = alloca i1, i1 0
+  %nop3882 = alloca i1, i1 0
+  %nop3883 = alloca i1, i1 0
+  %nop3884 = alloca i1, i1 0
+  %nop3885 = alloca i1, i1 0
+  %nop3886 = alloca i1, i1 0
+  %nop3887 = alloca i1, i1 0
+  %nop3888 = alloca i1, i1 0
+  %nop3889 = alloca i1, i1 0
+  %nop3890 = alloca i1, i1 0
+  %nop3891 = alloca i1, i1 0
+  %nop3892 = alloca i1, i1 0
+  %nop3893 = alloca i1, i1 0
+  %nop3894 = alloca i1, i1 0
+  %nop3895 = alloca i1, i1 0
+  %nop3896 = alloca i1, i1 0
+  %nop3897 = alloca i1, i1 0
+  %nop3898 = alloca i1, i1 0
+  %nop3899 = alloca i1, i1 0
+  %nop3900 = alloca i1, i1 0
+  %nop3901 = alloca i1, i1 0
+  %nop3902 = alloca i1, i1 0
+  %nop3903 = alloca i1, i1 0
+  %nop3904 = alloca i1, i1 0
+  %nop3905 = alloca i1, i1 0
+  %nop3906 = alloca i1, i1 0
+  %nop3907 = alloca i1, i1 0
+  %nop3908 = alloca i1, i1 0
+  %nop3909 = alloca i1, i1 0
+  %nop3910 = alloca i1, i1 0
+  %nop3911 = alloca i1, i1 0
+  %nop3912 = alloca i1, i1 0
+  %nop3913 = alloca i1, i1 0
+  %nop3914 = alloca i1, i1 0
+  %nop3915 = alloca i1, i1 0
+  %nop3916 = alloca i1, i1 0
+  %nop3917 = alloca i1, i1 0
+  %nop3918 = alloca i1, i1 0
+  %nop3919 = alloca i1, i1 0
+  %nop3920 = alloca i1, i1 0
+  %nop3921 = alloca i1, i1 0
+  %nop3922 = alloca i1, i1 0
+  %nop3923 = alloca i1, i1 0
+  %nop3924 = alloca i1, i1 0
+  %nop3925 = alloca i1, i1 0
+  %nop3926 = alloca i1, i1 0
+  %nop3927 = alloca i1, i1 0
+  %nop3928 = alloca i1, i1 0
+  %nop3929 = alloca i1, i1 0
+  %nop3930 = alloca i1, i1 0
+  %nop3931 = alloca i1, i1 0
+  %nop3932 = alloca i1, i1 0
+  %nop3933 = alloca i1, i1 0
+  %nop3934 = alloca i1, i1 0
+  %nop3935 = alloca i1, i1 0
+  %nop3936 = alloca i1, i1 0
+  %nop3937 = alloca i1, i1 0
+  %nop3938 = alloca i1, i1 0
+  %nop3939 = alloca i1, i1 0
+  %nop3940 = alloca i1, i1 0
+  %nop3941 = alloca i1, i1 0
+  %nop3942 = alloca i1, i1 0
+  %nop3943 = alloca i1, i1 0
+  %nop3944 = alloca i1, i1 0
+  %nop3945 = alloca i1, i1 0
+  %nop3946 = alloca i1, i1 0
+  %nop3947 = alloca i1, i1 0
+  %nop3948 = alloca i1, i1 0
+  %nop3949 = alloca i1, i1 0
+  %nop3950 = alloca i1, i1 0
+  %nop3951 = alloca i1, i1 0
+  %nop3952 = alloca i1, i1 0
+  %nop3953 = alloca i1, i1 0
+  %nop3954 = alloca i1, i1 0
+  %nop3955 = alloca i1, i1 0
+  %nop3956 = alloca i1, i1 0
+  %nop3957 = alloca i1, i1 0
+  %nop3958 = alloca i1, i1 0
+  %nop3959 = alloca i1, i1 0
+  %nop3960 = alloca i1, i1 0
+  %nop3961 = alloca i1, i1 0
+  %nop3962 = alloca i1, i1 0
+  %nop3963 = alloca i1, i1 0
+  %nop3964 = alloca i1, i1 0
+  %nop3965 = alloca i1, i1 0
+  %nop3966 = alloca i1, i1 0
+  %nop3967 = alloca i1, i1 0
+  %nop3968 = alloca i1, i1 0
+  %nop3969 = alloca i1, i1 0
+  %nop3970 = alloca i1, i1 0
+  %nop3971 = alloca i1, i1 0
+  %nop3972 = alloca i1, i1 0
+  %nop3973 = alloca i1, i1 0
+  %nop3974 = alloca i1, i1 0
+  %nop3975 = alloca i1, i1 0
+  %nop3976 = alloca i1, i1 0
+  %nop3977 = alloca i1, i1 0
+  %nop3978 = alloca i1, i1 0
+  %nop3979 = alloca i1, i1 0
+  %nop3980 = alloca i1, i1 0
+  %nop3981 = alloca i1, i1 0
+  %nop3982 = alloca i1, i1 0
+  %nop3983 = alloca i1, i1 0
+  %nop3984 = alloca i1, i1 0
+  %nop3985 = alloca i1, i1 0
+  %nop3986 = alloca i1, i1 0
+  %nop3987 = alloca i1, i1 0
+  %nop3988 = alloca i1, i1 0
+  %nop3989 = alloca i1, i1 0
+  %nop3990 = alloca i1, i1 0
+  %nop3991 = alloca i1, i1 0
+  %nop3992 = alloca i1, i1 0
+  %nop3993 = alloca i1, i1 0
+  %nop3994 = alloca i1, i1 0
+  %nop3995 = alloca i1, i1 0
+  %nop3996 = alloca i1, i1 0
+  %nop3997 = alloca i1, i1 0
+  %nop3998 = alloca i1, i1 0
+  %nop3999 = alloca i1, i1 0
+  %nop4000 = alloca i1, i1 0
+  %nop4001 = alloca i1, i1 0
+  %nop4002 = alloca i1, i1 0
+  %nop4003 = alloca i1, i1 0
+  %nop4004 = alloca i1, i1 0
+  %nop4005 = alloca i1, i1 0
+  %nop4006 = alloca i1, i1 0
+  %nop4007 = alloca i1, i1 0
+  %nop4008 = alloca i1, i1 0
+  %nop4009 = alloca i1, i1 0
+  %nop4010 = alloca i1, i1 0
+  %nop4011 = alloca i1, i1 0
+  %nop4012 = alloca i1, i1 0
+  %nop4013 = alloca i1, i1 0
+  %nop4014 = alloca i1, i1 0
+  %nop4015 = alloca i1, i1 0
+  %nop4016 = alloca i1, i1 0
+  %nop4017 = alloca i1, i1 0
+  %nop4018 = alloca i1, i1 0
+  %nop4019 = alloca i1, i1 0
+  %nop4020 = alloca i1, i1 0
+  %nop4021 = alloca i1, i1 0
+  %nop4022 = alloca i1, i1 0
+  %nop4023 = alloca i1, i1 0
+  %nop4024 = alloca i1, i1 0
+  %nop4025 = alloca i1, i1 0
+  %nop4026 = alloca i1, i1 0
+  %nop4027 = alloca i1, i1 0
+  %nop4028 = alloca i1, i1 0
+  %nop4029 = alloca i1, i1 0
+  %nop4030 = alloca i1, i1 0
+  %nop4031 = alloca i1, i1 0
+  %nop4032 = alloca i1, i1 0
+  %nop4033 = alloca i1, i1 0
+  %nop4034 = alloca i1, i1 0
+  %nop4035 = alloca i1, i1 0
+  %nop4036 = alloca i1, i1 0
+  %nop4037 = alloca i1, i1 0
+  %nop4038 = alloca i1, i1 0
+  %nop4039 = alloca i1, i1 0
+  %nop4040 = alloca i1, i1 0
+  %nop4041 = alloca i1, i1 0
+  %nop4042 = alloca i1, i1 0
+  %nop4043 = alloca i1, i1 0
+  %nop4044 = alloca i1, i1 0
+  %nop4045 = alloca i1, i1 0
+  %nop4046 = alloca i1, i1 0
+  %nop4047 = alloca i1, i1 0
+  %nop4048 = alloca i1, i1 0
+  %nop4049 = alloca i1, i1 0
+  %nop4050 = alloca i1, i1 0
+  %nop4051 = alloca i1, i1 0
+  %nop4052 = alloca i1, i1 0
+  %nop4053 = alloca i1, i1 0
+  %nop4054 = alloca i1, i1 0
+  %nop4055 = alloca i1, i1 0
+  %nop4056 = alloca i1, i1 0
+  %nop4057 = alloca i1, i1 0
+  %nop4058 = alloca i1, i1 0
+  %nop4059 = alloca i1, i1 0
+  %nop4060 = alloca i1, i1 0
+  %nop4061 = alloca i1, i1 0
+  %nop4062 = alloca i1, i1 0
+  %nop4063 = alloca i1, i1 0
+  %nop4064 = alloca i1, i1 0
+  %nop4065 = alloca i1, i1 0
+  %nop4066 = alloca i1, i1 0
+  %nop4067 = alloca i1, i1 0
+  %nop4068 = alloca i1, i1 0
+  %nop4069 = alloca i1, i1 0
+  %nop4070 = alloca i1, i1 0
+  %nop4071 = alloca i1, i1 0
+  %nop4072 = alloca i1, i1 0
+  %nop4073 = alloca i1, i1 0
+  %nop4074 = alloca i1, i1 0
+  %nop4075 = alloca i1, i1 0
+  %nop4076 = alloca i1, i1 0
+  %nop4077 = alloca i1, i1 0
+  %nop4078 = alloca i1, i1 0
+  %nop4079 = alloca i1, i1 0
+  %nop4080 = alloca i1, i1 0
+  %nop4081 = alloca i1, i1 0
+  %nop4082 = alloca i1, i1 0
+  %nop4083 = alloca i1, i1 0
+  %nop4084 = alloca i1, i1 0
+  %nop4085 = alloca i1, i1 0
+  %nop4086 = alloca i1, i1 0
+  %nop4087 = alloca i1, i1 0
+  %nop4088 = alloca i1, i1 0
+  %nop4089 = alloca i1, i1 0
+  %nop4090 = alloca i1, i1 0
+  %nop4091 = alloca i1, i1 0
+  %nop4092 = alloca i1, i1 0
+  %nop4093 = alloca i1, i1 0
+  %nop4094 = alloca i1, i1 0
+  %nop4095 = alloca i1, i1 0
+  %nop4096 = alloca i1, i1 0
+  %nop4097 = alloca i1, i1 0
+  %nop4098 = alloca i1, i1 0
+  %nop4099 = alloca i1, i1 0
+  %nop4100 = alloca i1, i1 0
+  %nop4101 = alloca i1, i1 0
+  %nop4102 = alloca i1, i1 0
+  %nop4103 = alloca i1, i1 0
+  %nop4104 = alloca i1, i1 0
+  %nop4105 = alloca i1, i1 0
+  %nop4106 = alloca i1, i1 0
+  %nop4107 = alloca i1, i1 0
+  %nop4108 = alloca i1, i1 0
+  %nop4109 = alloca i1, i1 0
+  %nop4110 = alloca i1, i1 0
+  %nop4111 = alloca i1, i1 0
+  %nop4112 = alloca i1, i1 0
+  %nop4113 = alloca i1, i1 0
+  %nop4114 = alloca i1, i1 0
+  %nop4115 = alloca i1, i1 0
+  %nop4116 = alloca i1, i1 0
+  %nop4117 = alloca i1, i1 0
+  %nop4118 = alloca i1, i1 0
+  %nop4119 = alloca i1, i1 0
+  %nop4120 = alloca i1, i1 0
+  %nop4121 = alloca i1, i1 0
+  %nop4122 = alloca i1, i1 0
+  %nop4123 = alloca i1, i1 0
+  %nop4124 = alloca i1, i1 0
+  %nop4125 = alloca i1, i1 0
+  %nop4126 = alloca i1, i1 0
+  %nop4127 = alloca i1, i1 0
+  %nop4128 = alloca i1, i1 0
+  %nop4129 = alloca i1, i1 0
+  %nop4130 = alloca i1, i1 0
+  %nop4131 = alloca i1, i1 0
+  %nop4132 = alloca i1, i1 0
+  %nop4133 = alloca i1, i1 0
+  %nop4134 = alloca i1, i1 0
+  %nop4135 = alloca i1, i1 0
+  %nop4136 = alloca i1, i1 0
+  %nop4137 = alloca i1, i1 0
+  %nop4138 = alloca i1, i1 0
+  %nop4139 = alloca i1, i1 0
+  %nop4140 = alloca i1, i1 0
+  %nop4141 = alloca i1, i1 0
+  %nop4142 = alloca i1, i1 0
+  %nop4143 = alloca i1, i1 0
+  %nop4144 = alloca i1, i1 0
+  %nop4145 = alloca i1, i1 0
+  %nop4146 = alloca i1, i1 0
+  %nop4147 = alloca i1, i1 0
+  %nop4148 = alloca i1, i1 0
+  %nop4149 = alloca i1, i1 0
+  %nop4150 = alloca i1, i1 0
+  %nop4151 = alloca i1, i1 0
+  %nop4152 = alloca i1, i1 0
+  %nop4153 = alloca i1, i1 0
+  %nop4154 = alloca i1, i1 0
+  %nop4155 = alloca i1, i1 0
+  %nop4156 = alloca i1, i1 0
+  %nop4157 = alloca i1, i1 0
+  %nop4158 = alloca i1, i1 0
+  %nop4159 = alloca i1, i1 0
+  %nop4160 = alloca i1, i1 0
+  %nop4161 = alloca i1, i1 0
+  %nop4162 = alloca i1, i1 0
+  %nop4163 = alloca i1, i1 0
+  %nop4164 = alloca i1, i1 0
+  %nop4165 = alloca i1, i1 0
+  %nop4166 = alloca i1, i1 0
+  %nop4167 = alloca i1, i1 0
+  %nop4168 = alloca i1, i1 0
+  %nop4169 = alloca i1, i1 0
+  %nop4170 = alloca i1, i1 0
+  %nop4171 = alloca i1, i1 0
+  %nop4172 = alloca i1, i1 0
+  %nop4173 = alloca i1, i1 0
+  %nop4174 = alloca i1, i1 0
+  %nop4175 = alloca i1, i1 0
+  %nop4176 = alloca i1, i1 0
+  %nop4177 = alloca i1, i1 0
+  %nop4178 = alloca i1, i1 0
+  %nop4179 = alloca i1, i1 0
+  %nop4180 = alloca i1, i1 0
+  %nop4181 = alloca i1, i1 0
+  %nop4182 = alloca i1, i1 0
+  %nop4183 = alloca i1, i1 0
+  %nop4184 = alloca i1, i1 0
+  %nop4185 = alloca i1, i1 0
+  %nop4186 = alloca i1, i1 0
+  %nop4187 = alloca i1, i1 0
+  %nop4188 = alloca i1, i1 0
+  %nop4189 = alloca i1, i1 0
+  %nop4190 = alloca i1, i1 0
+  %nop4191 = alloca i1, i1 0
+  %nop4192 = alloca i1, i1 0
+  %nop4193 = alloca i1, i1 0
+  %nop4194 = alloca i1, i1 0
+  %nop4195 = alloca i1, i1 0
+  %nop4196 = alloca i1, i1 0
+  %nop4197 = alloca i1, i1 0
+  %nop4198 = alloca i1, i1 0
+  %nop4199 = alloca i1, i1 0
+  %nop4200 = alloca i1, i1 0
+  %nop4201 = alloca i1, i1 0
+  %nop4202 = alloca i1, i1 0
+  %nop4203 = alloca i1, i1 0
+  %nop4204 = alloca i1, i1 0
+  %nop4205 = alloca i1, i1 0
+  %nop4206 = alloca i1, i1 0
+  %nop4207 = alloca i1, i1 0
+  %nop4208 = alloca i1, i1 0
+  %nop4209 = alloca i1, i1 0
+  %nop4210 = alloca i1, i1 0
+  %nop4211 = alloca i1, i1 0
+  %nop4212 = alloca i1, i1 0
+  %nop4213 = alloca i1, i1 0
+  %nop4214 = alloca i1, i1 0
+  %nop4215 = alloca i1, i1 0
+  %nop4216 = alloca i1, i1 0
+  %nop4217 = alloca i1, i1 0
+  %nop4218 = alloca i1, i1 0
+  %nop4219 = alloca i1, i1 0
+  %nop4220 = alloca i1, i1 0
+  %nop4221 = alloca i1, i1 0
+  %nop4222 = alloca i1, i1 0
+  %nop4223 = alloca i1, i1 0
+  %nop4224 = alloca i1, i1 0
+  %nop4225 = alloca i1, i1 0
+  %nop4226 = alloca i1, i1 0
+  %nop4227 = alloca i1, i1 0
+  %nop4228 = alloca i1, i1 0
+  %nop4229 = alloca i1, i1 0
+  %nop4230 = alloca i1, i1 0
+  %nop4231 = alloca i1, i1 0
+  %nop4232 = alloca i1, i1 0
+  %nop4233 = alloca i1, i1 0
+  %nop4234 = alloca i1, i1 0
+  %nop4235 = alloca i1, i1 0
+  %nop4236 = alloca i1, i1 0
+  %nop4237 = alloca i1, i1 0
+  %nop4238 = alloca i1, i1 0
+  %nop4239 = alloca i1, i1 0
+  %nop4240 = alloca i1, i1 0
+  %nop4241 = alloca i1, i1 0
+  %nop4242 = alloca i1, i1 0
+  %nop4243 = alloca i1, i1 0
+  %nop4244 = alloca i1, i1 0
+  %nop4245 = alloca i1, i1 0
+  %nop4246 = alloca i1, i1 0
+  %nop4247 = alloca i1, i1 0
+  %nop4248 = alloca i1, i1 0
+  %nop4249 = alloca i1, i1 0
+  %nop4250 = alloca i1, i1 0
+  %nop4251 = alloca i1, i1 0
+  %nop4252 = alloca i1, i1 0
+  %nop4253 = alloca i1, i1 0
+  %nop4254 = alloca i1, i1 0
+  %nop4255 = alloca i1, i1 0
+  %nop4256 = alloca i1, i1 0
+  %nop4257 = alloca i1, i1 0
+  %nop4258 = alloca i1, i1 0
+  %nop4259 = alloca i1, i1 0
+  %nop4260 = alloca i1, i1 0
+  %nop4261 = alloca i1, i1 0
+  %nop4262 = alloca i1, i1 0
+  %nop4263 = alloca i1, i1 0
+  %nop4264 = alloca i1, i1 0
+  %nop4265 = alloca i1, i1 0
+  %nop4266 = alloca i1, i1 0
+  %nop4267 = alloca i1, i1 0
+  %nop4268 = alloca i1, i1 0
+  %nop4269 = alloca i1, i1 0
+  %nop4270 = alloca i1, i1 0
+  %nop4271 = alloca i1, i1 0
+  %nop4272 = alloca i1, i1 0
+  %nop4273 = alloca i1, i1 0
+  %nop4274 = alloca i1, i1 0
+  %nop4275 = alloca i1, i1 0
+  %nop4276 = alloca i1, i1 0
+  %nop4277 = alloca i1, i1 0
+  %nop4278 = alloca i1, i1 0
+  %nop4279 = alloca i1, i1 0
+  %nop4280 = alloca i1, i1 0
+  %nop4281 = alloca i1, i1 0
+  %nop4282 = alloca i1, i1 0
+  %nop4283 = alloca i1, i1 0
+  %nop4284 = alloca i1, i1 0
+  %nop4285 = alloca i1, i1 0
+  %nop4286 = alloca i1, i1 0
+  %nop4287 = alloca i1, i1 0
+  %nop4288 = alloca i1, i1 0
+  %nop4289 = alloca i1, i1 0
+  %nop4290 = alloca i1, i1 0
+  %nop4291 = alloca i1, i1 0
+  %nop4292 = alloca i1, i1 0
+  %nop4293 = alloca i1, i1 0
+  %nop4294 = alloca i1, i1 0
+  %nop4295 = alloca i1, i1 0
+  %nop4296 = alloca i1, i1 0
+  %nop4297 = alloca i1, i1 0
+  %nop4298 = alloca i1, i1 0
+  %nop4299 = alloca i1, i1 0
+  %nop4300 = alloca i1, i1 0
+  %nop4301 = alloca i1, i1 0
+  %nop4302 = alloca i1, i1 0
+  %nop4303 = alloca i1, i1 0
+  %nop4304 = alloca i1, i1 0
+  %nop4305 = alloca i1, i1 0
+  %nop4306 = alloca i1, i1 0
+  %nop4307 = alloca i1, i1 0
+  %nop4308 = alloca i1, i1 0
+  %nop4309 = alloca i1, i1 0
+  %nop4310 = alloca i1, i1 0
+  %nop4311 = alloca i1, i1 0
+  %nop4312 = alloca i1, i1 0
+  %nop4313 = alloca i1, i1 0
+  %nop4314 = alloca i1, i1 0
+  %nop4315 = alloca i1, i1 0
+  %nop4316 = alloca i1, i1 0
+  %nop4317 = alloca i1, i1 0
+  %nop4318 = alloca i1, i1 0
+  %nop4319 = alloca i1, i1 0
+  %nop4320 = alloca i1, i1 0
+  %nop4321 = alloca i1, i1 0
+  %nop4322 = alloca i1, i1 0
+  %nop4323 = alloca i1, i1 0
+  %nop4324 = alloca i1, i1 0
+  %nop4325 = alloca i1, i1 0
+  %nop4326 = alloca i1, i1 0
+  %nop4327 = alloca i1, i1 0
+  %nop4328 = alloca i1, i1 0
+  %nop4329 = alloca i1, i1 0
+  %nop4330 = alloca i1, i1 0
+  %nop4331 = alloca i1, i1 0
+  %nop4332 = alloca i1, i1 0
+  %nop4333 = alloca i1, i1 0
+  %nop4334 = alloca i1, i1 0
+  %nop4335 = alloca i1, i1 0
+  %nop4336 = alloca i1, i1 0
+  %nop4337 = alloca i1, i1 0
+  %nop4338 = alloca i1, i1 0
+  %nop4339 = alloca i1, i1 0
+  %nop4340 = alloca i1, i1 0
+  %nop4341 = alloca i1, i1 0
+  %nop4342 = alloca i1, i1 0
+  %nop4343 = alloca i1, i1 0
+  %nop4344 = alloca i1, i1 0
+  %nop4345 = alloca i1, i1 0
+  %nop4346 = alloca i1, i1 0
+  %nop4347 = alloca i1, i1 0
+  %nop4348 = alloca i1, i1 0
+  %nop4349 = alloca i1, i1 0
+  %nop4350 = alloca i1, i1 0
+  %nop4351 = alloca i1, i1 0
+  %nop4352 = alloca i1, i1 0
+  %nop4353 = alloca i1, i1 0
+  %nop4354 = alloca i1, i1 0
+  %nop4355 = alloca i1, i1 0
+  %nop4356 = alloca i1, i1 0
+  %nop4357 = alloca i1, i1 0
+  %nop4358 = alloca i1, i1 0
+  %nop4359 = alloca i1, i1 0
+  %nop4360 = alloca i1, i1 0
+  %nop4361 = alloca i1, i1 0
+  %nop4362 = alloca i1, i1 0
+  %nop4363 = alloca i1, i1 0
+  %nop4364 = alloca i1, i1 0
+  %nop4365 = alloca i1, i1 0
+  %nop4366 = alloca i1, i1 0
+  %nop4367 = alloca i1, i1 0
+  %nop4368 = alloca i1, i1 0
+  %nop4369 = alloca i1, i1 0
+  %nop4370 = alloca i1, i1 0
+  %nop4371 = alloca i1, i1 0
+  %nop4372 = alloca i1, i1 0
+  %nop4373 = alloca i1, i1 0
+  %nop4374 = alloca i1, i1 0
+  %nop4375 = alloca i1, i1 0
+  %nop4376 = alloca i1, i1 0
+  %nop4377 = alloca i1, i1 0
+  %nop4378 = alloca i1, i1 0
+  %nop4379 = alloca i1, i1 0
+  %nop4380 = alloca i1, i1 0
+  %nop4381 = alloca i1, i1 0
+  %nop4382 = alloca i1, i1 0
+  %nop4383 = alloca i1, i1 0
+  %nop4384 = alloca i1, i1 0
+  %nop4385 = alloca i1, i1 0
+  %nop4386 = alloca i1, i1 0
+  %nop4387 = alloca i1, i1 0
+  %nop4388 = alloca i1, i1 0
+  %nop4389 = alloca i1, i1 0
+  %nop4390 = alloca i1, i1 0
+  %nop4391 = alloca i1, i1 0
+  %nop4392 = alloca i1, i1 0
+  %nop4393 = alloca i1, i1 0
+  %nop4394 = alloca i1, i1 0
+  %nop4395 = alloca i1, i1 0
+  %nop4396 = alloca i1, i1 0
+  %nop4397 = alloca i1, i1 0
+  %nop4398 = alloca i1, i1 0
+  %nop4399 = alloca i1, i1 0
+  %nop4400 = alloca i1, i1 0
+  %nop4401 = alloca i1, i1 0
+  %nop4402 = alloca i1, i1 0
+  %nop4403 = alloca i1, i1 0
+  %nop4404 = alloca i1, i1 0
+  %nop4405 = alloca i1, i1 0
+  %nop4406 = alloca i1, i1 0
+  %nop4407 = alloca i1, i1 0
+  %nop4408 = alloca i1, i1 0
+  %nop4409 = alloca i1, i1 0
+  %nop4410 = alloca i1, i1 0
+  %nop4411 = alloca i1, i1 0
+  %nop4412 = alloca i1, i1 0
+  %nop4413 = alloca i1, i1 0
+  %nop4414 = alloca i1, i1 0
+  %nop4415 = alloca i1, i1 0
+  %nop4416 = alloca i1, i1 0
+  %nop4417 = alloca i1, i1 0
+  %nop4418 = alloca i1, i1 0
+  %nop4419 = alloca i1, i1 0
+  %nop4420 = alloca i1, i1 0
+  %nop4421 = alloca i1, i1 0
+  %nop4422 = alloca i1, i1 0
+  %nop4423 = alloca i1, i1 0
+  %nop4424 = alloca i1, i1 0
+  %nop4425 = alloca i1, i1 0
+  %nop4426 = alloca i1, i1 0
+  %nop4427 = alloca i1, i1 0
+  %nop4428 = alloca i1, i1 0
+  %nop4429 = alloca i1, i1 0
+  %nop4430 = alloca i1, i1 0
+  %nop4431 = alloca i1, i1 0
+  %nop4432 = alloca i1, i1 0
+  %nop4433 = alloca i1, i1 0
+  %nop4434 = alloca i1, i1 0
+  %nop4435 = alloca i1, i1 0
+  %nop4436 = alloca i1, i1 0
+  %nop4437 = alloca i1, i1 0
+  %nop4438 = alloca i1, i1 0
+  %nop4439 = alloca i1, i1 0
+  %nop4440 = alloca i1, i1 0
+  %nop4441 = alloca i1, i1 0
+  %nop4442 = alloca i1, i1 0
+  %nop4443 = alloca i1, i1 0
+  %nop4444 = alloca i1, i1 0
+  %nop4445 = alloca i1, i1 0
+  %nop4446 = alloca i1, i1 0
+  %nop4447 = alloca i1, i1 0
+  %nop4448 = alloca i1, i1 0
+  %nop4449 = alloca i1, i1 0
+  %nop4450 = alloca i1, i1 0
+  %nop4451 = alloca i1, i1 0
+  %nop4452 = alloca i1, i1 0
+  %nop4453 = alloca i1, i1 0
+  %nop4454 = alloca i1, i1 0
+  %nop4455 = alloca i1, i1 0
+  %nop4456 = alloca i1, i1 0
+  %nop4457 = alloca i1, i1 0
+  %nop4458 = alloca i1, i1 0
+  %nop4459 = alloca i1, i1 0
+  %nop4460 = alloca i1, i1 0
+  %nop4461 = alloca i1, i1 0
+  %nop4462 = alloca i1, i1 0
+  %nop4463 = alloca i1, i1 0
+  %nop4464 = alloca i1, i1 0
+  %nop4465 = alloca i1, i1 0
+  %nop4466 = alloca i1, i1 0
+  %nop4467 = alloca i1, i1 0
+  %nop4468 = alloca i1, i1 0
+  %nop4469 = alloca i1, i1 0
+  %nop4470 = alloca i1, i1 0
+  %nop4471 = alloca i1, i1 0
+  %nop4472 = alloca i1, i1 0
+  %nop4473 = alloca i1, i1 0
+  %nop4474 = alloca i1, i1 0
+  %nop4475 = alloca i1, i1 0
+  %nop4476 = alloca i1, i1 0
+  %nop4477 = alloca i1, i1 0
+  %nop4478 = alloca i1, i1 0
+  %nop4479 = alloca i1, i1 0
+  %nop4480 = alloca i1, i1 0
+  %nop4481 = alloca i1, i1 0
+  %nop4482 = alloca i1, i1 0
+  %nop4483 = alloca i1, i1 0
+  %nop4484 = alloca i1, i1 0
+  %nop4485 = alloca i1, i1 0
+  %nop4486 = alloca i1, i1 0
+  %nop4487 = alloca i1, i1 0
+  %nop4488 = alloca i1, i1 0
+  %nop4489 = alloca i1, i1 0
+  %nop4490 = alloca i1, i1 0
+  %nop4491 = alloca i1, i1 0
+  %nop4492 = alloca i1, i1 0
+  %nop4493 = alloca i1, i1 0
+  %nop4494 = alloca i1, i1 0
+  %nop4495 = alloca i1, i1 0
+  %nop4496 = alloca i1, i1 0
+  %nop4497 = alloca i1, i1 0
+  %nop4498 = alloca i1, i1 0
+  %nop4499 = alloca i1, i1 0
+  %nop4500 = alloca i1, i1 0
+  %nop4501 = alloca i1, i1 0
+  %nop4502 = alloca i1, i1 0
+  %nop4503 = alloca i1, i1 0
+  %nop4504 = alloca i1, i1 0
+  %nop4505 = alloca i1, i1 0
+  %nop4506 = alloca i1, i1 0
+  %nop4507 = alloca i1, i1 0
+  %nop4508 = alloca i1, i1 0
+  %nop4509 = alloca i1, i1 0
+  %nop4510 = alloca i1, i1 0
+  %nop4511 = alloca i1, i1 0
+  %nop4512 = alloca i1, i1 0
+  %nop4513 = alloca i1, i1 0
+  %nop4514 = alloca i1, i1 0
+  %nop4515 = alloca i1, i1 0
+  %nop4516 = alloca i1, i1 0
+  %nop4517 = alloca i1, i1 0
+  %nop4518 = alloca i1, i1 0
+  %nop4519 = alloca i1, i1 0
+  %nop4520 = alloca i1, i1 0
+  %nop4521 = alloca i1, i1 0
+  %nop4522 = alloca i1, i1 0
+  %nop4523 = alloca i1, i1 0
+  %nop4524 = alloca i1, i1 0
+  %nop4525 = alloca i1, i1 0
+  %nop4526 = alloca i1, i1 0
+  %nop4527 = alloca i1, i1 0
+  %nop4528 = alloca i1, i1 0
+  %nop4529 = alloca i1, i1 0
+  %nop4530 = alloca i1, i1 0
+  %nop4531 = alloca i1, i1 0
+  %nop4532 = alloca i1, i1 0
+  %nop4533 = alloca i1, i1 0
+  %nop4534 = alloca i1, i1 0
+  %nop4535 = alloca i1, i1 0
+  %nop4536 = alloca i1, i1 0
+  %nop4537 = alloca i1, i1 0
+  %nop4538 = alloca i1, i1 0
+  %nop4539 = alloca i1, i1 0
+  %nop4540 = alloca i1, i1 0
+  %nop4541 = alloca i1, i1 0
+  %nop4542 = alloca i1, i1 0
+  %nop4543 = alloca i1, i1 0
+  %nop4544 = alloca i1, i1 0
+  %nop4545 = alloca i1, i1 0
+  %nop4546 = alloca i1, i1 0
+  %nop4547 = alloca i1, i1 0
+  %nop4548 = alloca i1, i1 0
+  %nop4549 = alloca i1, i1 0
+  %nop4550 = alloca i1, i1 0
+  %nop4551 = alloca i1, i1 0
+  %nop4552 = alloca i1, i1 0
+  %nop4553 = alloca i1, i1 0
+  %nop4554 = alloca i1, i1 0
+  %nop4555 = alloca i1, i1 0
+  %nop4556 = alloca i1, i1 0
+  %nop4557 = alloca i1, i1 0
+  %nop4558 = alloca i1, i1 0
+  %nop4559 = alloca i1, i1 0
+  %nop4560 = alloca i1, i1 0
+  %nop4561 = alloca i1, i1 0
+  %nop4562 = alloca i1, i1 0
+  %nop4563 = alloca i1, i1 0
+  %nop4564 = alloca i1, i1 0
+  %nop4565 = alloca i1, i1 0
+  %nop4566 = alloca i1, i1 0
+  %nop4567 = alloca i1, i1 0
+  %nop4568 = alloca i1, i1 0
+  %nop4569 = alloca i1, i1 0
+  %nop4570 = alloca i1, i1 0
+  %nop4571 = alloca i1, i1 0
+  %nop4572 = alloca i1, i1 0
+  %nop4573 = alloca i1, i1 0
+  %nop4574 = alloca i1, i1 0
+  %nop4575 = alloca i1, i1 0
+  %nop4576 = alloca i1, i1 0
+  %nop4577 = alloca i1, i1 0
+  %nop4578 = alloca i1, i1 0
+  %nop4579 = alloca i1, i1 0
+  %nop4580 = alloca i1, i1 0
+  %nop4581 = alloca i1, i1 0
+  %nop4582 = alloca i1, i1 0
+  %nop4583 = alloca i1, i1 0
+  %nop4584 = alloca i1, i1 0
+  %nop4585 = alloca i1, i1 0
+  %nop4586 = alloca i1, i1 0
+  %nop4587 = alloca i1, i1 0
+  %nop4588 = alloca i1, i1 0
+  %nop4589 = alloca i1, i1 0
+  %nop4590 = alloca i1, i1 0
+  %nop4591 = alloca i1, i1 0
+  %nop4592 = alloca i1, i1 0
+  %nop4593 = alloca i1, i1 0
+  %nop4594 = alloca i1, i1 0
+  %nop4595 = alloca i1, i1 0
+  %nop4596 = alloca i1, i1 0
+  %nop4597 = alloca i1, i1 0
+  %nop4598 = alloca i1, i1 0
+  %nop4599 = alloca i1, i1 0
+  %nop4600 = alloca i1, i1 0
+  %nop4601 = alloca i1, i1 0
+  %nop4602 = alloca i1, i1 0
+  %nop4603 = alloca i1, i1 0
+  %nop4604 = alloca i1, i1 0
+  %nop4605 = alloca i1, i1 0
+  %nop4606 = alloca i1, i1 0
+  %nop4607 = alloca i1, i1 0
+  %nop4608 = alloca i1, i1 0
+  %nop4609 = alloca i1, i1 0
+  %nop4610 = alloca i1, i1 0
+  %nop4611 = alloca i1, i1 0
+  %nop4612 = alloca i1, i1 0
+  %nop4613 = alloca i1, i1 0
+  %nop4614 = alloca i1, i1 0
+  %nop4615 = alloca i1, i1 0
+  %nop4616 = alloca i1, i1 0
+  %nop4617 = alloca i1, i1 0
+  %nop4618 = alloca i1, i1 0
+  %nop4619 = alloca i1, i1 0
+  %nop4620 = alloca i1, i1 0
+  %nop4621 = alloca i1, i1 0
+  %nop4622 = alloca i1, i1 0
+  %nop4623 = alloca i1, i1 0
+  %nop4624 = alloca i1, i1 0
+  %nop4625 = alloca i1, i1 0
+  %nop4626 = alloca i1, i1 0
+  %nop4627 = alloca i1, i1 0
+  %nop4628 = alloca i1, i1 0
+  %nop4629 = alloca i1, i1 0
+  %nop4630 = alloca i1, i1 0
+  %nop4631 = alloca i1, i1 0
+  %nop4632 = alloca i1, i1 0
+  %nop4633 = alloca i1, i1 0
+  %nop4634 = alloca i1, i1 0
+  %nop4635 = alloca i1, i1 0
+  %nop4636 = alloca i1, i1 0
+  %nop4637 = alloca i1, i1 0
+  %nop4638 = alloca i1, i1 0
+  %nop4639 = alloca i1, i1 0
+  %nop4640 = alloca i1, i1 0
+  %nop4641 = alloca i1, i1 0
+  %nop4642 = alloca i1, i1 0
+  %nop4643 = alloca i1, i1 0
+  %nop4644 = alloca i1, i1 0
+  %nop4645 = alloca i1, i1 0
+  %nop4646 = alloca i1, i1 0
+  %nop4647 = alloca i1, i1 0
+  %nop4648 = alloca i1, i1 0
+  %nop4649 = alloca i1, i1 0
+  %nop4650 = alloca i1, i1 0
+  %nop4651 = alloca i1, i1 0
+  %nop4652 = alloca i1, i1 0
+  %nop4653 = alloca i1, i1 0
+  %nop4654 = alloca i1, i1 0
+  %nop4655 = alloca i1, i1 0
+  %nop4656 = alloca i1, i1 0
+  %nop4657 = alloca i1, i1 0
+  %nop4658 = alloca i1, i1 0
+  %nop4659 = alloca i1, i1 0
+  %nop4660 = alloca i1, i1 0
+  %nop4661 = alloca i1, i1 0
+  %nop4662 = alloca i1, i1 0
+  %nop4663 = alloca i1, i1 0
+  %nop4664 = alloca i1, i1 0
+  %nop4665 = alloca i1, i1 0
+  %nop4666 = alloca i1, i1 0
+  %nop4667 = alloca i1, i1 0
+  %nop4668 = alloca i1, i1 0
+  %nop4669 = alloca i1, i1 0
+  %nop4670 = alloca i1, i1 0
+  %nop4671 = alloca i1, i1 0
+  %nop4672 = alloca i1, i1 0
+  %nop4673 = alloca i1, i1 0
+  %nop4674 = alloca i1, i1 0
+  %nop4675 = alloca i1, i1 0
+  %nop4676 = alloca i1, i1 0
+  %nop4677 = alloca i1, i1 0
+  %nop4678 = alloca i1, i1 0
+  %nop4679 = alloca i1, i1 0
+  %nop4680 = alloca i1, i1 0
+  %nop4681 = alloca i1, i1 0
+  %nop4682 = alloca i1, i1 0
+  %nop4683 = alloca i1, i1 0
+  %nop4684 = alloca i1, i1 0
+  %nop4685 = alloca i1, i1 0
+  %nop4686 = alloca i1, i1 0
+  %nop4687 = alloca i1, i1 0
+  %nop4688 = alloca i1, i1 0
+  %nop4689 = alloca i1, i1 0
+  %nop4690 = alloca i1, i1 0
+  %nop4691 = alloca i1, i1 0
+  %nop4692 = alloca i1, i1 0
+  %nop4693 = alloca i1, i1 0
+  %nop4694 = alloca i1, i1 0
+  %nop4695 = alloca i1, i1 0
+  %nop4696 = alloca i1, i1 0
+  %nop4697 = alloca i1, i1 0
+  %nop4698 = alloca i1, i1 0
+  %nop4699 = alloca i1, i1 0
+  %nop4700 = alloca i1, i1 0
+  %nop4701 = alloca i1, i1 0
+  %nop4702 = alloca i1, i1 0
+  %nop4703 = alloca i1, i1 0
+  %nop4704 = alloca i1, i1 0
+  %nop4705 = alloca i1, i1 0
+  %nop4706 = alloca i1, i1 0
+  %nop4707 = alloca i1, i1 0
+  %nop4708 = alloca i1, i1 0
+  %nop4709 = alloca i1, i1 0
+  %nop4710 = alloca i1, i1 0
+  %nop4711 = alloca i1, i1 0
+  %nop4712 = alloca i1, i1 0
+  %nop4713 = alloca i1, i1 0
+  %nop4714 = alloca i1, i1 0
+  %nop4715 = alloca i1, i1 0
+  %nop4716 = alloca i1, i1 0
+  %nop4717 = alloca i1, i1 0
+  %nop4718 = alloca i1, i1 0
+  %nop4719 = alloca i1, i1 0
+  %nop4720 = alloca i1, i1 0
+  %nop4721 = alloca i1, i1 0
+  %nop4722 = alloca i1, i1 0
+  %nop4723 = alloca i1, i1 0
+  %nop4724 = alloca i1, i1 0
+  %nop4725 = alloca i1, i1 0
+  %nop4726 = alloca i1, i1 0
+  %nop4727 = alloca i1, i1 0
+  %nop4728 = alloca i1, i1 0
+  %nop4729 = alloca i1, i1 0
+  %nop4730 = alloca i1, i1 0
+  %nop4731 = alloca i1, i1 0
+  %nop4732 = alloca i1, i1 0
+  %nop4733 = alloca i1, i1 0
+  %nop4734 = alloca i1, i1 0
+  %nop4735 = alloca i1, i1 0
+  %nop4736 = alloca i1, i1 0
+  %nop4737 = alloca i1, i1 0
+  %nop4738 = alloca i1, i1 0
+  %nop4739 = alloca i1, i1 0
+  %nop4740 = alloca i1, i1 0
+  %nop4741 = alloca i1, i1 0
+  %nop4742 = alloca i1, i1 0
+  %nop4743 = alloca i1, i1 0
+  %nop4744 = alloca i1, i1 0
+  %nop4745 = alloca i1, i1 0
+  %nop4746 = alloca i1, i1 0
+  %nop4747 = alloca i1, i1 0
+  %nop4748 = alloca i1, i1 0
+  %nop4749 = alloca i1, i1 0
+  %nop4750 = alloca i1, i1 0
+  %nop4751 = alloca i1, i1 0
+  %nop4752 = alloca i1, i1 0
+  %nop4753 = alloca i1, i1 0
+  %nop4754 = alloca i1, i1 0
+  %nop4755 = alloca i1, i1 0
+  %nop4756 = alloca i1, i1 0
+  %nop4757 = alloca i1, i1 0
+  %nop4758 = alloca i1, i1 0
+  %nop4759 = alloca i1, i1 0
+  %nop4760 = alloca i1, i1 0
+  %nop4761 = alloca i1, i1 0
+  %nop4762 = alloca i1, i1 0
+  %nop4763 = alloca i1, i1 0
+  %nop4764 = alloca i1, i1 0
+  %nop4765 = alloca i1, i1 0
+  %nop4766 = alloca i1, i1 0
+  %nop4767 = alloca i1, i1 0
+  %nop4768 = alloca i1, i1 0
+  %nop4769 = alloca i1, i1 0
+  %nop4770 = alloca i1, i1 0
+  %nop4771 = alloca i1, i1 0
+  %nop4772 = alloca i1, i1 0
+  %nop4773 = alloca i1, i1 0
+  %nop4774 = alloca i1, i1 0
+  %nop4775 = alloca i1, i1 0
+  %nop4776 = alloca i1, i1 0
+  %nop4777 = alloca i1, i1 0
+  %nop4778 = alloca i1, i1 0
+  %nop4779 = alloca i1, i1 0
+  %nop4780 = alloca i1, i1 0
+  %nop4781 = alloca i1, i1 0
+  %nop4782 = alloca i1, i1 0
+  %nop4783 = alloca i1, i1 0
+  %nop4784 = alloca i1, i1 0
+  %nop4785 = alloca i1, i1 0
+  %nop4786 = alloca i1, i1 0
+  %nop4787 = alloca i1, i1 0
+  %nop4788 = alloca i1, i1 0
+  %nop4789 = alloca i1, i1 0
+  %nop4790 = alloca i1, i1 0
+  %nop4791 = alloca i1, i1 0
+  %nop4792 = alloca i1, i1 0
+  %nop4793 = alloca i1, i1 0
+  %nop4794 = alloca i1, i1 0
+  %nop4795 = alloca i1, i1 0
+  %nop4796 = alloca i1, i1 0
+  %nop4797 = alloca i1, i1 0
+  %nop4798 = alloca i1, i1 0
+  %nop4799 = alloca i1, i1 0
+  %nop4800 = alloca i1, i1 0
+  %nop4801 = alloca i1, i1 0
+  %nop4802 = alloca i1, i1 0
+  %nop4803 = alloca i1, i1 0
+  %nop4804 = alloca i1, i1 0
+  %nop4805 = alloca i1, i1 0
+  %nop4806 = alloca i1, i1 0
+  %nop4807 = alloca i1, i1 0
+  %nop4808 = alloca i1, i1 0
+  %nop4809 = alloca i1, i1 0
+  %nop4810 = alloca i1, i1 0
+  %nop4811 = alloca i1, i1 0
+  %nop4812 = alloca i1, i1 0
+  %nop4813 = alloca i1, i1 0
+  %nop4814 = alloca i1, i1 0
+  %nop4815 = alloca i1, i1 0
+  %nop4816 = alloca i1, i1 0
+  %nop4817 = alloca i1, i1 0
+  %nop4818 = alloca i1, i1 0
+  %nop4819 = alloca i1, i1 0
+  %nop4820 = alloca i1, i1 0
+  %nop4821 = alloca i1, i1 0
+  %nop4822 = alloca i1, i1 0
+  %nop4823 = alloca i1, i1 0
+  %nop4824 = alloca i1, i1 0
+  %nop4825 = alloca i1, i1 0
+  %nop4826 = alloca i1, i1 0
+  %nop4827 = alloca i1, i1 0
+  %nop4828 = alloca i1, i1 0
+  %nop4829 = alloca i1, i1 0
+  %nop4830 = alloca i1, i1 0
+  %nop4831 = alloca i1, i1 0
+  %nop4832 = alloca i1, i1 0
+  %nop4833 = alloca i1, i1 0
+  %nop4834 = alloca i1, i1 0
+  %nop4835 = alloca i1, i1 0
+  %nop4836 = alloca i1, i1 0
+  %nop4837 = alloca i1, i1 0
+  %nop4838 = alloca i1, i1 0
+  %nop4839 = alloca i1, i1 0
+  %nop4840 = alloca i1, i1 0
+  %nop4841 = alloca i1, i1 0
+  %nop4842 = alloca i1, i1 0
+  %nop4843 = alloca i1, i1 0
+  %nop4844 = alloca i1, i1 0
+  %nop4845 = alloca i1, i1 0
+  %nop4846 = alloca i1, i1 0
+  %nop4847 = alloca i1, i1 0
+  %nop4848 = alloca i1, i1 0
+  %nop4849 = alloca i1, i1 0
+  %nop4850 = alloca i1, i1 0
+  %nop4851 = alloca i1, i1 0
+  %nop4852 = alloca i1, i1 0
+  %nop4853 = alloca i1, i1 0
+  %nop4854 = alloca i1, i1 0
+  %nop4855 = alloca i1, i1 0
+  %nop4856 = alloca i1, i1 0
+  %nop4857 = alloca i1, i1 0
+  %nop4858 = alloca i1, i1 0
+  %nop4859 = alloca i1, i1 0
+  %nop4860 = alloca i1, i1 0
+  %nop4861 = alloca i1, i1 0
+  %nop4862 = alloca i1, i1 0
+  %nop4863 = alloca i1, i1 0
+  %nop4864 = alloca i1, i1 0
+  %nop4865 = alloca i1, i1 0
+  %nop4866 = alloca i1, i1 0
+  %nop4867 = alloca i1, i1 0
+  %nop4868 = alloca i1, i1 0
+  %nop4869 = alloca i1, i1 0
+  %nop4870 = alloca i1, i1 0
+  %nop4871 = alloca i1, i1 0
+  %nop4872 = alloca i1, i1 0
+  %nop4873 = alloca i1, i1 0
+  %nop4874 = alloca i1, i1 0
+  %nop4875 = alloca i1, i1 0
+  %nop4876 = alloca i1, i1 0
+  %nop4877 = alloca i1, i1 0
+  %nop4878 = alloca i1, i1 0
+  %nop4879 = alloca i1, i1 0
+  %nop4880 = alloca i1, i1 0
+  %nop4881 = alloca i1, i1 0
+  %nop4882 = alloca i1, i1 0
+  %nop4883 = alloca i1, i1 0
+  %nop4884 = alloca i1, i1 0
+  %nop4885 = alloca i1, i1 0
+  %nop4886 = alloca i1, i1 0
+  %nop4887 = alloca i1, i1 0
+  %nop4888 = alloca i1, i1 0
+  %nop4889 = alloca i1, i1 0
+  %nop4890 = alloca i1, i1 0
+  %nop4891 = alloca i1, i1 0
+  %nop4892 = alloca i1, i1 0
+  %nop4893 = alloca i1, i1 0
+  %nop4894 = alloca i1, i1 0
+  %nop4895 = alloca i1, i1 0
+  %nop4896 = alloca i1, i1 0
+  %nop4897 = alloca i1, i1 0
+  %nop4898 = alloca i1, i1 0
+  %nop4899 = alloca i1, i1 0
+  %nop4900 = alloca i1, i1 0
+  %nop4901 = alloca i1, i1 0
+  %nop4902 = alloca i1, i1 0
+  %nop4903 = alloca i1, i1 0
+  %nop4904 = alloca i1, i1 0
+  %nop4905 = alloca i1, i1 0
+  %nop4906 = alloca i1, i1 0
+  %nop4907 = alloca i1, i1 0
+  %nop4908 = alloca i1, i1 0
+  %nop4909 = alloca i1, i1 0
+  %nop4910 = alloca i1, i1 0
+  %nop4911 = alloca i1, i1 0
+  %nop4912 = alloca i1, i1 0
+  %nop4913 = alloca i1, i1 0
+  %nop4914 = alloca i1, i1 0
+  %nop4915 = alloca i1, i1 0
+  %nop4916 = alloca i1, i1 0
+  %nop4917 = alloca i1, i1 0
+  %nop4918 = alloca i1, i1 0
+  %nop4919 = alloca i1, i1 0
+  %nop4920 = alloca i1, i1 0
+  %nop4921 = alloca i1, i1 0
+  %nop4922 = alloca i1, i1 0
+  %nop4923 = alloca i1, i1 0
+  %nop4924 = alloca i1, i1 0
+  %nop4925 = alloca i1, i1 0
+  %nop4926 = alloca i1, i1 0
+  %nop4927 = alloca i1, i1 0
+  %nop4928 = alloca i1, i1 0
+  %nop4929 = alloca i1, i1 0
+  %nop4930 = alloca i1, i1 0
+  %nop4931 = alloca i1, i1 0
+  %nop4932 = alloca i1, i1 0
+  %nop4933 = alloca i1, i1 0
+  %nop4934 = alloca i1, i1 0
+  %nop4935 = alloca i1, i1 0
+  %nop4936 = alloca i1, i1 0
+  %nop4937 = alloca i1, i1 0
+  %nop4938 = alloca i1, i1 0
+  %nop4939 = alloca i1, i1 0
+  %nop4940 = alloca i1, i1 0
+  %nop4941 = alloca i1, i1 0
+  %nop4942 = alloca i1, i1 0
+  %nop4943 = alloca i1, i1 0
+  %nop4944 = alloca i1, i1 0
+  %nop4945 = alloca i1, i1 0
+  %nop4946 = alloca i1, i1 0
+  %nop4947 = alloca i1, i1 0
+  %nop4948 = alloca i1, i1 0
+  %nop4949 = alloca i1, i1 0
+  %nop4950 = alloca i1, i1 0
+  %nop4951 = alloca i1, i1 0
+  %nop4952 = alloca i1, i1 0
+  %nop4953 = alloca i1, i1 0
+  %nop4954 = alloca i1, i1 0
+  %nop4955 = alloca i1, i1 0
+  %nop4956 = alloca i1, i1 0
+  %nop4957 = alloca i1, i1 0
+  %nop4958 = alloca i1, i1 0
+  %nop4959 = alloca i1, i1 0
+  %nop4960 = alloca i1, i1 0
+  %nop4961 = alloca i1, i1 0
+  %nop4962 = alloca i1, i1 0
+  %nop4963 = alloca i1, i1 0
+  %nop4964 = alloca i1, i1 0
+  %nop4965 = alloca i1, i1 0
+  %nop4966 = alloca i1, i1 0
+  %nop4967 = alloca i1, i1 0
+  %nop4968 = alloca i1, i1 0
+  %nop4969 = alloca i1, i1 0
+  %nop4970 = alloca i1, i1 0
+  %nop4971 = alloca i1, i1 0
+  %nop4972 = alloca i1, i1 0
+  %nop4973 = alloca i1, i1 0
+  %nop4974 = alloca i1, i1 0
+  %nop4975 = alloca i1, i1 0
+  %nop4976 = alloca i1, i1 0
+  %nop4977 = alloca i1, i1 0
+  %nop4978 = alloca i1, i1 0
+  %nop4979 = alloca i1, i1 0
+  %nop4980 = alloca i1, i1 0
+  %nop4981 = alloca i1, i1 0
+  %nop4982 = alloca i1, i1 0
+  %nop4983 = alloca i1, i1 0
+  %nop4984 = alloca i1, i1 0
+  %nop4985 = alloca i1, i1 0
+  %nop4986 = alloca i1, i1 0
+  %nop4987 = alloca i1, i1 0
+  %nop4988 = alloca i1, i1 0
+  %nop4989 = alloca i1, i1 0
+  %nop4990 = alloca i1, i1 0
+  %nop4991 = alloca i1, i1 0
+  %nop4992 = alloca i1, i1 0
+  %nop4993 = alloca i1, i1 0
+  %nop4994 = alloca i1, i1 0
+  %nop4995 = alloca i1, i1 0
+  %nop4996 = alloca i1, i1 0
+  %nop4997 = alloca i1, i1 0
+  %nop4998 = alloca i1, i1 0
+  %nop4999 = alloca i1, i1 0
+  %nop5000 = alloca i1, i1 0
+  %nop5001 = alloca i1, i1 0
+  %nop5002 = alloca i1, i1 0
+  %nop5003 = alloca i1, i1 0
+  %nop5004 = alloca i1, i1 0
+  %nop5005 = alloca i1, i1 0
+  %nop5006 = alloca i1, i1 0
+  %nop5007 = alloca i1, i1 0
+  %nop5008 = alloca i1, i1 0
+  %nop5009 = alloca i1, i1 0
+  %nop5010 = alloca i1, i1 0
+  %nop5011 = alloca i1, i1 0
+  %nop5012 = alloca i1, i1 0
+  %nop5013 = alloca i1, i1 0
+  %nop5014 = alloca i1, i1 0
+  %nop5015 = alloca i1, i1 0
+  %nop5016 = alloca i1, i1 0
+  %nop5017 = alloca i1, i1 0
+  %nop5018 = alloca i1, i1 0
+  %nop5019 = alloca i1, i1 0
+  %nop5020 = alloca i1, i1 0
+  %nop5021 = alloca i1, i1 0
+  %nop5022 = alloca i1, i1 0
+  %nop5023 = alloca i1, i1 0
+  %nop5024 = alloca i1, i1 0
+  %nop5025 = alloca i1, i1 0
+  %nop5026 = alloca i1, i1 0
+  %nop5027 = alloca i1, i1 0
+  %nop5028 = alloca i1, i1 0
+  %nop5029 = alloca i1, i1 0
+  %nop5030 = alloca i1, i1 0
+  %nop5031 = alloca i1, i1 0
+  %nop5032 = alloca i1, i1 0
+  %nop5033 = alloca i1, i1 0
+  %nop5034 = alloca i1, i1 0
+  %nop5035 = alloca i1, i1 0
+  %nop5036 = alloca i1, i1 0
+  %nop5037 = alloca i1, i1 0
+  %nop5038 = alloca i1, i1 0
+  %nop5039 = alloca i1, i1 0
+  %nop5040 = alloca i1, i1 0
+  %nop5041 = alloca i1, i1 0
+  %nop5042 = alloca i1, i1 0
+  %nop5043 = alloca i1, i1 0
+  %nop5044 = alloca i1, i1 0
+  %nop5045 = alloca i1, i1 0
+  %nop5046 = alloca i1, i1 0
+  %nop5047 = alloca i1, i1 0
+  %nop5048 = alloca i1, i1 0
+  %nop5049 = alloca i1, i1 0
+  %nop5050 = alloca i1, i1 0
+  %nop5051 = alloca i1, i1 0
+  %nop5052 = alloca i1, i1 0
+  %nop5053 = alloca i1, i1 0
+  %nop5054 = alloca i1, i1 0
+  %nop5055 = alloca i1, i1 0
+  %nop5056 = alloca i1, i1 0
+  %nop5057 = alloca i1, i1 0
+  %nop5058 = alloca i1, i1 0
+  %nop5059 = alloca i1, i1 0
+  %nop5060 = alloca i1, i1 0
+  %nop5061 = alloca i1, i1 0
+  %nop5062 = alloca i1, i1 0
+  %nop5063 = alloca i1, i1 0
+  %nop5064 = alloca i1, i1 0
+  %nop5065 = alloca i1, i1 0
+  %nop5066 = alloca i1, i1 0
+  %nop5067 = alloca i1, i1 0
+  %nop5068 = alloca i1, i1 0
+  %nop5069 = alloca i1, i1 0
+  %nop5070 = alloca i1, i1 0
+  %nop5071 = alloca i1, i1 0
+  %nop5072 = alloca i1, i1 0
+  %nop5073 = alloca i1, i1 0
+  %nop5074 = alloca i1, i1 0
+  %nop5075 = alloca i1, i1 0
+  %nop5076 = alloca i1, i1 0
+  %nop5077 = alloca i1, i1 0
+  %nop5078 = alloca i1, i1 0
+  %nop5079 = alloca i1, i1 0
+  %nop5080 = alloca i1, i1 0
+  %nop5081 = alloca i1, i1 0
+  %nop5082 = alloca i1, i1 0
+  %nop5083 = alloca i1, i1 0
+  %nop5084 = alloca i1, i1 0
+  %nop5085 = alloca i1, i1 0
+  %nop5086 = alloca i1, i1 0
+  %nop5087 = alloca i1, i1 0
+  %nop5088 = alloca i1, i1 0
+  %nop5089 = alloca i1, i1 0
+  %nop5090 = alloca i1, i1 0
+  %nop5091 = alloca i1, i1 0
+  %nop5092 = alloca i1, i1 0
+  %nop5093 = alloca i1, i1 0
+  %nop5094 = alloca i1, i1 0
+  %nop5095 = alloca i1, i1 0
+  %nop5096 = alloca i1, i1 0
+  %nop5097 = alloca i1, i1 0
+  %nop5098 = alloca i1, i1 0
+  %nop5099 = alloca i1, i1 0
+  %nop5100 = alloca i1, i1 0
+  %nop5101 = alloca i1, i1 0
+  %nop5102 = alloca i1, i1 0
+  %nop5103 = alloca i1, i1 0
+  %nop5104 = alloca i1, i1 0
+  %nop5105 = alloca i1, i1 0
+  %nop5106 = alloca i1, i1 0
+  %nop5107 = alloca i1, i1 0
+  %nop5108 = alloca i1, i1 0
+  %nop5109 = alloca i1, i1 0
+  %nop5110 = alloca i1, i1 0
+  %nop5111 = alloca i1, i1 0
+  %nop5112 = alloca i1, i1 0
+  %nop5113 = alloca i1, i1 0
+  %nop5114 = alloca i1, i1 0
+  %nop5115 = alloca i1, i1 0
+  %nop5116 = alloca i1, i1 0
+  %nop5117 = alloca i1, i1 0
+  %nop5118 = alloca i1, i1 0
+  %nop5119 = alloca i1, i1 0
+  %nop5120 = alloca i1, i1 0
+  %nop5121 = alloca i1, i1 0
+  %nop5122 = alloca i1, i1 0
+  %nop5123 = alloca i1, i1 0
+  %nop5124 = alloca i1, i1 0
+  %nop5125 = alloca i1, i1 0
+  %nop5126 = alloca i1, i1 0
+  %nop5127 = alloca i1, i1 0
+  %nop5128 = alloca i1, i1 0
+  %nop5129 = alloca i1, i1 0
+  %nop5130 = alloca i1, i1 0
+  %nop5131 = alloca i1, i1 0
+  %nop5132 = alloca i1, i1 0
+  %nop5133 = alloca i1, i1 0
+  %nop5134 = alloca i1, i1 0
+  %nop5135 = alloca i1, i1 0
+  %nop5136 = alloca i1, i1 0
+  %nop5137 = alloca i1, i1 0
+  %nop5138 = alloca i1, i1 0
+  %nop5139 = alloca i1, i1 0
+  %nop5140 = alloca i1, i1 0
+  %nop5141 = alloca i1, i1 0
+  %nop5142 = alloca i1, i1 0
+  %nop5143 = alloca i1, i1 0
+  %nop5144 = alloca i1, i1 0
+  %nop5145 = alloca i1, i1 0
+  %nop5146 = alloca i1, i1 0
+  %nop5147 = alloca i1, i1 0
+  %nop5148 = alloca i1, i1 0
+  %nop5149 = alloca i1, i1 0
+  %nop5150 = alloca i1, i1 0
+  %nop5151 = alloca i1, i1 0
+  %nop5152 = alloca i1, i1 0
+  %nop5153 = alloca i1, i1 0
+  %nop5154 = alloca i1, i1 0
+  %nop5155 = alloca i1, i1 0
+  %nop5156 = alloca i1, i1 0
+  %nop5157 = alloca i1, i1 0
+  %nop5158 = alloca i1, i1 0
+  %nop5159 = alloca i1, i1 0
+  %nop5160 = alloca i1, i1 0
+  %nop5161 = alloca i1, i1 0
+  %nop5162 = alloca i1, i1 0
+  %nop5163 = alloca i1, i1 0
+  %nop5164 = alloca i1, i1 0
+  %nop5165 = alloca i1, i1 0
+  %nop5166 = alloca i1, i1 0
+  %nop5167 = alloca i1, i1 0
+  %nop5168 = alloca i1, i1 0
+  %nop5169 = alloca i1, i1 0
+  %nop5170 = alloca i1, i1 0
+  %nop5171 = alloca i1, i1 0
+  %nop5172 = alloca i1, i1 0
+  %nop5173 = alloca i1, i1 0
+  %nop5174 = alloca i1, i1 0
+  %nop5175 = alloca i1, i1 0
+  %nop5176 = alloca i1, i1 0
+  %nop5177 = alloca i1, i1 0
+  %nop5178 = alloca i1, i1 0
+  %nop5179 = alloca i1, i1 0
+  %nop5180 = alloca i1, i1 0
+  %nop5181 = alloca i1, i1 0
+  %nop5182 = alloca i1, i1 0
+  %nop5183 = alloca i1, i1 0
+  %nop5184 = alloca i1, i1 0
+  %nop5185 = alloca i1, i1 0
+  %nop5186 = alloca i1, i1 0
+  %nop5187 = alloca i1, i1 0
+  %nop5188 = alloca i1, i1 0
+  %nop5189 = alloca i1, i1 0
+  %nop5190 = alloca i1, i1 0
+  %nop5191 = alloca i1, i1 0
+  %nop5192 = alloca i1, i1 0
+  %nop5193 = alloca i1, i1 0
+  %nop5194 = alloca i1, i1 0
+  %nop5195 = alloca i1, i1 0
+  %nop5196 = alloca i1, i1 0
+  %nop5197 = alloca i1, i1 0
+  %nop5198 = alloca i1, i1 0
+  %nop5199 = alloca i1, i1 0
+  %nop5200 = alloca i1, i1 0
+  %nop5201 = alloca i1, i1 0
+  %nop5202 = alloca i1, i1 0
+  %nop5203 = alloca i1, i1 0
+  %nop5204 = alloca i1, i1 0
+  %nop5205 = alloca i1, i1 0
+  %nop5206 = alloca i1, i1 0
+  %nop5207 = alloca i1, i1 0
+  %nop5208 = alloca i1, i1 0
+  %nop5209 = alloca i1, i1 0
+  %nop5210 = alloca i1, i1 0
+  %nop5211 = alloca i1, i1 0
+  %nop5212 = alloca i1, i1 0
+  %nop5213 = alloca i1, i1 0
+  %nop5214 = alloca i1, i1 0
+  %nop5215 = alloca i1, i1 0
+  %nop5216 = alloca i1, i1 0
+  %nop5217 = alloca i1, i1 0
+  %nop5218 = alloca i1, i1 0
+  %nop5219 = alloca i1, i1 0
+  %nop5220 = alloca i1, i1 0
+  %nop5221 = alloca i1, i1 0
+  %nop5222 = alloca i1, i1 0
+  %nop5223 = alloca i1, i1 0
+  %nop5224 = alloca i1, i1 0
+  %nop5225 = alloca i1, i1 0
+  %nop5226 = alloca i1, i1 0
+  %nop5227 = alloca i1, i1 0
+  %nop5228 = alloca i1, i1 0
+  %nop5229 = alloca i1, i1 0
+  %nop5230 = alloca i1, i1 0
+  %nop5231 = alloca i1, i1 0
+  %nop5232 = alloca i1, i1 0
+  %nop5233 = alloca i1, i1 0
+  %nop5234 = alloca i1, i1 0
+  %nop5235 = alloca i1, i1 0
+  %nop5236 = alloca i1, i1 0
+  %nop5237 = alloca i1, i1 0
+  %nop5238 = alloca i1, i1 0
+  %nop5239 = alloca i1, i1 0
+  %nop5240 = alloca i1, i1 0
+  %nop5241 = alloca i1, i1 0
+  %nop5242 = alloca i1, i1 0
+  %nop5243 = alloca i1, i1 0
+  %nop5244 = alloca i1, i1 0
+  %nop5245 = alloca i1, i1 0
+  %nop5246 = alloca i1, i1 0
+  %nop5247 = alloca i1, i1 0
+  %nop5248 = alloca i1, i1 0
+  %nop5249 = alloca i1, i1 0
+  %nop5250 = alloca i1, i1 0
+  %nop5251 = alloca i1, i1 0
+  %nop5252 = alloca i1, i1 0
+  %nop5253 = alloca i1, i1 0
+  %nop5254 = alloca i1, i1 0
+  %nop5255 = alloca i1, i1 0
+  %nop5256 = alloca i1, i1 0
+  %nop5257 = alloca i1, i1 0
+  %nop5258 = alloca i1, i1 0
+  %nop5259 = alloca i1, i1 0
+  %nop5260 = alloca i1, i1 0
+  %nop5261 = alloca i1, i1 0
+  %nop5262 = alloca i1, i1 0
+  %nop5263 = alloca i1, i1 0
+  %nop5264 = alloca i1, i1 0
+  %nop5265 = alloca i1, i1 0
+  %nop5266 = alloca i1, i1 0
+  %nop5267 = alloca i1, i1 0
+  %nop5268 = alloca i1, i1 0
+  %nop5269 = alloca i1, i1 0
+  %nop5270 = alloca i1, i1 0
+  %nop5271 = alloca i1, i1 0
+  %nop5272 = alloca i1, i1 0
+  %nop5273 = alloca i1, i1 0
+  %nop5274 = alloca i1, i1 0
+  %nop5275 = alloca i1, i1 0
+  %nop5276 = alloca i1, i1 0
+  %nop5277 = alloca i1, i1 0
+  %nop5278 = alloca i1, i1 0
+  %nop5279 = alloca i1, i1 0
+  %nop5280 = alloca i1, i1 0
+  %nop5281 = alloca i1, i1 0
+  %nop5282 = alloca i1, i1 0
+  %nop5283 = alloca i1, i1 0
+  %nop5284 = alloca i1, i1 0
+  %nop5285 = alloca i1, i1 0
+  %nop5286 = alloca i1, i1 0
+  %nop5287 = alloca i1, i1 0
+  %nop5288 = alloca i1, i1 0
+  %nop5289 = alloca i1, i1 0
+  %nop5290 = alloca i1, i1 0
+  %nop5291 = alloca i1, i1 0
+  %nop5292 = alloca i1, i1 0
+  %nop5293 = alloca i1, i1 0
+  %nop5294 = alloca i1, i1 0
+  %nop5295 = alloca i1, i1 0
+  %nop5296 = alloca i1, i1 0
+  %nop5297 = alloca i1, i1 0
+  %nop5298 = alloca i1, i1 0
+  %nop5299 = alloca i1, i1 0
+  %nop5300 = alloca i1, i1 0
+  %nop5301 = alloca i1, i1 0
+  %nop5302 = alloca i1, i1 0
+  %nop5303 = alloca i1, i1 0
+  %nop5304 = alloca i1, i1 0
+  %nop5305 = alloca i1, i1 0
+  %nop5306 = alloca i1, i1 0
+  %nop5307 = alloca i1, i1 0
+  %nop5308 = alloca i1, i1 0
+  %nop5309 = alloca i1, i1 0
+  %nop5310 = alloca i1, i1 0
+  %nop5311 = alloca i1, i1 0
+  %nop5312 = alloca i1, i1 0
+  %nop5313 = alloca i1, i1 0
+  %nop5314 = alloca i1, i1 0
+  %nop5315 = alloca i1, i1 0
+  %nop5316 = alloca i1, i1 0
+  %nop5317 = alloca i1, i1 0
+  %nop5318 = alloca i1, i1 0
+  %nop5319 = alloca i1, i1 0
+  %nop5320 = alloca i1, i1 0
+  %nop5321 = alloca i1, i1 0
+  %nop5322 = alloca i1, i1 0
+  %nop5323 = alloca i1, i1 0
+  %nop5324 = alloca i1, i1 0
+  %nop5325 = alloca i1, i1 0
+  %nop5326 = alloca i1, i1 0
+  %nop5327 = alloca i1, i1 0
+  %nop5328 = alloca i1, i1 0
+  %nop5329 = alloca i1, i1 0
+  %nop5330 = alloca i1, i1 0
+  %nop5331 = alloca i1, i1 0
+  %nop5332 = alloca i1, i1 0
+  %nop5333 = alloca i1, i1 0
+  %nop5334 = alloca i1, i1 0
+  %nop5335 = alloca i1, i1 0
+  %nop5336 = alloca i1, i1 0
+  %nop5337 = alloca i1, i1 0
+  %nop5338 = alloca i1, i1 0
+  %nop5339 = alloca i1, i1 0
+  %nop5340 = alloca i1, i1 0
+  %nop5341 = alloca i1, i1 0
+  %nop5342 = alloca i1, i1 0
+  %nop5343 = alloca i1, i1 0
+  %nop5344 = alloca i1, i1 0
+  %nop5345 = alloca i1, i1 0
+  %nop5346 = alloca i1, i1 0
+  %nop5347 = alloca i1, i1 0
+  %nop5348 = alloca i1, i1 0
+  %nop5349 = alloca i1, i1 0
+  %nop5350 = alloca i1, i1 0
+  %nop5351 = alloca i1, i1 0
+  %nop5352 = alloca i1, i1 0
+  %nop5353 = alloca i1, i1 0
+  %nop5354 = alloca i1, i1 0
+  %nop5355 = alloca i1, i1 0
+  %nop5356 = alloca i1, i1 0
+  %nop5357 = alloca i1, i1 0
+  %nop5358 = alloca i1, i1 0
+  %nop5359 = alloca i1, i1 0
+  %nop5360 = alloca i1, i1 0
+  %nop5361 = alloca i1, i1 0
+  %nop5362 = alloca i1, i1 0
+  %nop5363 = alloca i1, i1 0
+  %nop5364 = alloca i1, i1 0
+  %nop5365 = alloca i1, i1 0
+  %nop5366 = alloca i1, i1 0
+  %nop5367 = alloca i1, i1 0
+  %nop5368 = alloca i1, i1 0
+  %nop5369 = alloca i1, i1 0
+  %nop5370 = alloca i1, i1 0
+  %nop5371 = alloca i1, i1 0
+  %nop5372 = alloca i1, i1 0
+  %nop5373 = alloca i1, i1 0
+  %nop5374 = alloca i1, i1 0
+  %nop5375 = alloca i1, i1 0
+  %nop5376 = alloca i1, i1 0
+  %nop5377 = alloca i1, i1 0
+  %nop5378 = alloca i1, i1 0
+  %nop5379 = alloca i1, i1 0
+  %nop5380 = alloca i1, i1 0
+  %nop5381 = alloca i1, i1 0
+  %nop5382 = alloca i1, i1 0
+  %nop5383 = alloca i1, i1 0
+  %nop5384 = alloca i1, i1 0
+  %nop5385 = alloca i1, i1 0
+  %nop5386 = alloca i1, i1 0
+  %nop5387 = alloca i1, i1 0
+  %nop5388 = alloca i1, i1 0
+  %nop5389 = alloca i1, i1 0
+  %nop5390 = alloca i1, i1 0
+  %nop5391 = alloca i1, i1 0
+  %nop5392 = alloca i1, i1 0
+  %nop5393 = alloca i1, i1 0
+  %nop5394 = alloca i1, i1 0
+  %nop5395 = alloca i1, i1 0
+  %nop5396 = alloca i1, i1 0
+  %nop5397 = alloca i1, i1 0
+  %nop5398 = alloca i1, i1 0
+  %nop5399 = alloca i1, i1 0
+  %nop5400 = alloca i1, i1 0
+  %nop5401 = alloca i1, i1 0
+  %nop5402 = alloca i1, i1 0
+  %nop5403 = alloca i1, i1 0
+  %nop5404 = alloca i1, i1 0
+  %nop5405 = alloca i1, i1 0
+  %nop5406 = alloca i1, i1 0
+  %nop5407 = alloca i1, i1 0
+  %nop5408 = alloca i1, i1 0
+  %nop5409 = alloca i1, i1 0
+  %nop5410 = alloca i1, i1 0
+  %nop5411 = alloca i1, i1 0
+  %nop5412 = alloca i1, i1 0
+  %nop5413 = alloca i1, i1 0
+  %nop5414 = alloca i1, i1 0
+  %nop5415 = alloca i1, i1 0
+  %nop5416 = alloca i1, i1 0
+  %nop5417 = alloca i1, i1 0
+  %nop5418 = alloca i1, i1 0
+  %nop5419 = alloca i1, i1 0
+  %nop5420 = alloca i1, i1 0
+  %nop5421 = alloca i1, i1 0
+  %nop5422 = alloca i1, i1 0
+  %nop5423 = alloca i1, i1 0
+  %nop5424 = alloca i1, i1 0
+  %nop5425 = alloca i1, i1 0
+  %nop5426 = alloca i1, i1 0
+  %nop5427 = alloca i1, i1 0
+  %nop5428 = alloca i1, i1 0
+  %nop5429 = alloca i1, i1 0
+  %nop5430 = alloca i1, i1 0
+  %nop5431 = alloca i1, i1 0
+  %nop5432 = alloca i1, i1 0
+  %nop5433 = alloca i1, i1 0
+  %nop5434 = alloca i1, i1 0
+  %nop5435 = alloca i1, i1 0
+  %nop5436 = alloca i1, i1 0
+  %nop5437 = alloca i1, i1 0
+  %nop5438 = alloca i1, i1 0
+  %nop5439 = alloca i1, i1 0
+  %nop5440 = alloca i1, i1 0
+  %nop5441 = alloca i1, i1 0
+  %nop5442 = alloca i1, i1 0
+  %nop5443 = alloca i1, i1 0
+  %nop5444 = alloca i1, i1 0
+  %nop5445 = alloca i1, i1 0
+  %nop5446 = alloca i1, i1 0
+  %nop5447 = alloca i1, i1 0
+  %nop5448 = alloca i1, i1 0
+  %nop5449 = alloca i1, i1 0
+  %nop5450 = alloca i1, i1 0
+  %nop5451 = alloca i1, i1 0
+  %nop5452 = alloca i1, i1 0
+  %nop5453 = alloca i1, i1 0
+  %nop5454 = alloca i1, i1 0
+  %nop5455 = alloca i1, i1 0
+  %nop5456 = alloca i1, i1 0
+  %nop5457 = alloca i1, i1 0
+  %nop5458 = alloca i1, i1 0
+  %nop5459 = alloca i1, i1 0
+  %nop5460 = alloca i1, i1 0
+  %nop5461 = alloca i1, i1 0
+  %nop5462 = alloca i1, i1 0
+  %nop5463 = alloca i1, i1 0
+  %nop5464 = alloca i1, i1 0
+  %nop5465 = alloca i1, i1 0
+  %nop5466 = alloca i1, i1 0
+  %nop5467 = alloca i1, i1 0
+  %nop5468 = alloca i1, i1 0
+  %nop5469 = alloca i1, i1 0
+  %nop5470 = alloca i1, i1 0
+  %nop5471 = alloca i1, i1 0
+  %nop5472 = alloca i1, i1 0
+  %nop5473 = alloca i1, i1 0
+  %nop5474 = alloca i1, i1 0
+  %nop5475 = alloca i1, i1 0
+  %nop5476 = alloca i1, i1 0
+  %nop5477 = alloca i1, i1 0
+  %nop5478 = alloca i1, i1 0
+  %nop5479 = alloca i1, i1 0
+  %nop5480 = alloca i1, i1 0
+  %nop5481 = alloca i1, i1 0
+  %nop5482 = alloca i1, i1 0
+  %nop5483 = alloca i1, i1 0
+  %nop5484 = alloca i1, i1 0
+  %nop5485 = alloca i1, i1 0
+  %nop5486 = alloca i1, i1 0
+  %nop5487 = alloca i1, i1 0
+  %nop5488 = alloca i1, i1 0
+  %nop5489 = alloca i1, i1 0
+  %nop5490 = alloca i1, i1 0
+  %nop5491 = alloca i1, i1 0
+  %nop5492 = alloca i1, i1 0
+  %nop5493 = alloca i1, i1 0
+  %nop5494 = alloca i1, i1 0
+  %nop5495 = alloca i1, i1 0
+  %nop5496 = alloca i1, i1 0
+  %nop5497 = alloca i1, i1 0
+  %nop5498 = alloca i1, i1 0
+  %nop5499 = alloca i1, i1 0
+  %nop5500 = alloca i1, i1 0
+  %nop5501 = alloca i1, i1 0
+  %nop5502 = alloca i1, i1 0
+  %nop5503 = alloca i1, i1 0
+  %nop5504 = alloca i1, i1 0
+  %nop5505 = alloca i1, i1 0
+  %nop5506 = alloca i1, i1 0
+  %nop5507 = alloca i1, i1 0
+  %nop5508 = alloca i1, i1 0
+  %nop5509 = alloca i1, i1 0
+  %nop5510 = alloca i1, i1 0
+  %nop5511 = alloca i1, i1 0
+  %nop5512 = alloca i1, i1 0
+  %nop5513 = alloca i1, i1 0
+  %nop5514 = alloca i1, i1 0
+  %nop5515 = alloca i1, i1 0
+  %nop5516 = alloca i1, i1 0
+  %nop5517 = alloca i1, i1 0
+  %nop5518 = alloca i1, i1 0
+  %nop5519 = alloca i1, i1 0
+  %nop5520 = alloca i1, i1 0
+  %nop5521 = alloca i1, i1 0
+  %nop5522 = alloca i1, i1 0
+  %nop5523 = alloca i1, i1 0
+  %nop5524 = alloca i1, i1 0
+  %nop5525 = alloca i1, i1 0
+  %nop5526 = alloca i1, i1 0
+  %nop5527 = alloca i1, i1 0
+  %nop5528 = alloca i1, i1 0
+  %nop5529 = alloca i1, i1 0
+  %nop5530 = alloca i1, i1 0
+  %nop5531 = alloca i1, i1 0
+  %nop5532 = alloca i1, i1 0
+  %nop5533 = alloca i1, i1 0
+  %nop5534 = alloca i1, i1 0
+  %nop5535 = alloca i1, i1 0
+  %nop5536 = alloca i1, i1 0
+  %nop5537 = alloca i1, i1 0
+  %nop5538 = alloca i1, i1 0
+  %nop5539 = alloca i1, i1 0
+  %nop5540 = alloca i1, i1 0
+  %nop5541 = alloca i1, i1 0
+  %nop5542 = alloca i1, i1 0
+  %nop5543 = alloca i1, i1 0
+  %nop5544 = alloca i1, i1 0
+  %nop5545 = alloca i1, i1 0
+  %nop5546 = alloca i1, i1 0
+  %nop5547 = alloca i1, i1 0
+  %nop5548 = alloca i1, i1 0
+  %nop5549 = alloca i1, i1 0
+  %nop5550 = alloca i1, i1 0
+  %nop5551 = alloca i1, i1 0
+  %nop5552 = alloca i1, i1 0
+  %nop5553 = alloca i1, i1 0
+  %nop5554 = alloca i1, i1 0
+  %nop5555 = alloca i1, i1 0
+  %nop5556 = alloca i1, i1 0
+  %nop5557 = alloca i1, i1 0
+  %nop5558 = alloca i1, i1 0
+  %nop5559 = alloca i1, i1 0
+  %nop5560 = alloca i1, i1 0
+  %nop5561 = alloca i1, i1 0
+  %nop5562 = alloca i1, i1 0
+  %nop5563 = alloca i1, i1 0
+  %nop5564 = alloca i1, i1 0
+  %nop5565 = alloca i1, i1 0
+  %nop5566 = alloca i1, i1 0
+  %nop5567 = alloca i1, i1 0
+  %nop5568 = alloca i1, i1 0
+  %nop5569 = alloca i1, i1 0
+  %nop5570 = alloca i1, i1 0
+  %nop5571 = alloca i1, i1 0
+  %nop5572 = alloca i1, i1 0
+  %nop5573 = alloca i1, i1 0
+  %nop5574 = alloca i1, i1 0
+  %nop5575 = alloca i1, i1 0
+  %nop5576 = alloca i1, i1 0
+  %nop5577 = alloca i1, i1 0
+  %nop5578 = alloca i1, i1 0
+  %nop5579 = alloca i1, i1 0
+  %nop5580 = alloca i1, i1 0
+  %nop5581 = alloca i1, i1 0
+  %nop5582 = alloca i1, i1 0
+  %nop5583 = alloca i1, i1 0
+  %nop5584 = alloca i1, i1 0
+  %nop5585 = alloca i1, i1 0
+  %nop5586 = alloca i1, i1 0
+  %nop5587 = alloca i1, i1 0
+  %nop5588 = alloca i1, i1 0
+  %nop5589 = alloca i1, i1 0
+  %nop5590 = alloca i1, i1 0
+  %nop5591 = alloca i1, i1 0
+  %nop5592 = alloca i1, i1 0
+  %nop5593 = alloca i1, i1 0
+  %nop5594 = alloca i1, i1 0
+  %nop5595 = alloca i1, i1 0
+  %nop5596 = alloca i1, i1 0
+  %nop5597 = alloca i1, i1 0
+  %nop5598 = alloca i1, i1 0
+  %nop5599 = alloca i1, i1 0
+  %nop5600 = alloca i1, i1 0
+  %nop5601 = alloca i1, i1 0
+  %nop5602 = alloca i1, i1 0
+  %nop5603 = alloca i1, i1 0
+  %nop5604 = alloca i1, i1 0
+  %nop5605 = alloca i1, i1 0
+  %nop5606 = alloca i1, i1 0
+  %nop5607 = alloca i1, i1 0
+  %nop5608 = alloca i1, i1 0
+  %nop5609 = alloca i1, i1 0
+  %nop5610 = alloca i1, i1 0
+  %nop5611 = alloca i1, i1 0
+  %nop5612 = alloca i1, i1 0
+  %nop5613 = alloca i1, i1 0
+  %nop5614 = alloca i1, i1 0
+  %nop5615 = alloca i1, i1 0
+  %nop5616 = alloca i1, i1 0
+  %nop5617 = alloca i1, i1 0
+  %nop5618 = alloca i1, i1 0
+  %nop5619 = alloca i1, i1 0
+  %nop5620 = alloca i1, i1 0
+  %nop5621 = alloca i1, i1 0
+  %nop5622 = alloca i1, i1 0
+  %nop5623 = alloca i1, i1 0
+  %nop5624 = alloca i1, i1 0
+  %nop5625 = alloca i1, i1 0
+  %nop5626 = alloca i1, i1 0
+  %nop5627 = alloca i1, i1 0
+  %nop5628 = alloca i1, i1 0
+  %nop5629 = alloca i1, i1 0
+  %nop5630 = alloca i1, i1 0
+  %nop5631 = alloca i1, i1 0
+  %nop5632 = alloca i1, i1 0
+  %nop5633 = alloca i1, i1 0
+  %nop5634 = alloca i1, i1 0
+  %nop5635 = alloca i1, i1 0
+  %nop5636 = alloca i1, i1 0
+  %nop5637 = alloca i1, i1 0
+  %nop5638 = alloca i1, i1 0
+  %nop5639 = alloca i1, i1 0
+  %nop5640 = alloca i1, i1 0
+  %nop5641 = alloca i1, i1 0
+  %nop5642 = alloca i1, i1 0
+  %nop5643 = alloca i1, i1 0
+  %nop5644 = alloca i1, i1 0
+  %nop5645 = alloca i1, i1 0
+  %nop5646 = alloca i1, i1 0
+  %nop5647 = alloca i1, i1 0
+  %nop5648 = alloca i1, i1 0
+  %nop5649 = alloca i1, i1 0
+  %nop5650 = alloca i1, i1 0
+  %nop5651 = alloca i1, i1 0
+  %nop5652 = alloca i1, i1 0
+  %nop5653 = alloca i1, i1 0
+  %nop5654 = alloca i1, i1 0
+  %nop5655 = alloca i1, i1 0
+  %nop5656 = alloca i1, i1 0
+  %nop5657 = alloca i1, i1 0
+  %nop5658 = alloca i1, i1 0
+  %nop5659 = alloca i1, i1 0
+  %nop5660 = alloca i1, i1 0
+  %nop5661 = alloca i1, i1 0
+  %nop5662 = alloca i1, i1 0
+  %nop5663 = alloca i1, i1 0
+  %nop5664 = alloca i1, i1 0
+  %nop5665 = alloca i1, i1 0
+  %nop5666 = alloca i1, i1 0
+  %nop5667 = alloca i1, i1 0
+  %nop5668 = alloca i1, i1 0
+  %nop5669 = alloca i1, i1 0
+  %nop5670 = alloca i1, i1 0
+  %nop5671 = alloca i1, i1 0
+  %nop5672 = alloca i1, i1 0
+  %nop5673 = alloca i1, i1 0
+  %nop5674 = alloca i1, i1 0
+  %nop5675 = alloca i1, i1 0
+  %nop5676 = alloca i1, i1 0
+  %nop5677 = alloca i1, i1 0
+  %nop5678 = alloca i1, i1 0
+  %nop5679 = alloca i1, i1 0
+  %nop5680 = alloca i1, i1 0
+  %nop5681 = alloca i1, i1 0
+  %nop5682 = alloca i1, i1 0
+  %nop5683 = alloca i1, i1 0
+  %nop5684 = alloca i1, i1 0
+  %nop5685 = alloca i1, i1 0
+  %nop5686 = alloca i1, i1 0
+  %nop5687 = alloca i1, i1 0
+  %nop5688 = alloca i1, i1 0
+  %nop5689 = alloca i1, i1 0
+  %nop5690 = alloca i1, i1 0
+  %nop5691 = alloca i1, i1 0
+  %nop5692 = alloca i1, i1 0
+  %nop5693 = alloca i1, i1 0
+  %nop5694 = alloca i1, i1 0
+  %nop5695 = alloca i1, i1 0
+  %nop5696 = alloca i1, i1 0
+  %nop5697 = alloca i1, i1 0
+  %nop5698 = alloca i1, i1 0
+  %nop5699 = alloca i1, i1 0
+  %nop5700 = alloca i1, i1 0
+  %nop5701 = alloca i1, i1 0
+  %nop5702 = alloca i1, i1 0
+  %nop5703 = alloca i1, i1 0
+  %nop5704 = alloca i1, i1 0
+  %nop5705 = alloca i1, i1 0
+  %nop5706 = alloca i1, i1 0
+  %nop5707 = alloca i1, i1 0
+  %nop5708 = alloca i1, i1 0
+  %nop5709 = alloca i1, i1 0
+  %nop5710 = alloca i1, i1 0
+  %nop5711 = alloca i1, i1 0
+  %nop5712 = alloca i1, i1 0
+  %nop5713 = alloca i1, i1 0
+  %nop5714 = alloca i1, i1 0
+  %nop5715 = alloca i1, i1 0
+  %nop5716 = alloca i1, i1 0
+  %nop5717 = alloca i1, i1 0
+  %nop5718 = alloca i1, i1 0
+  %nop5719 = alloca i1, i1 0
+  %nop5720 = alloca i1, i1 0
+  %nop5721 = alloca i1, i1 0
+  %nop5722 = alloca i1, i1 0
+  %nop5723 = alloca i1, i1 0
+  %nop5724 = alloca i1, i1 0
+  %nop5725 = alloca i1, i1 0
+  %nop5726 = alloca i1, i1 0
+  %nop5727 = alloca i1, i1 0
+  %nop5728 = alloca i1, i1 0
+  %nop5729 = alloca i1, i1 0
+  %nop5730 = alloca i1, i1 0
+  %nop5731 = alloca i1, i1 0
+  %nop5732 = alloca i1, i1 0
+  %nop5733 = alloca i1, i1 0
+  %nop5734 = alloca i1, i1 0
+  %nop5735 = alloca i1, i1 0
+  %nop5736 = alloca i1, i1 0
+  %nop5737 = alloca i1, i1 0
+  %nop5738 = alloca i1, i1 0
+  %nop5739 = alloca i1, i1 0
+  %nop5740 = alloca i1, i1 0
+  %nop5741 = alloca i1, i1 0
+  %nop5742 = alloca i1, i1 0
+  %nop5743 = alloca i1, i1 0
+  %nop5744 = alloca i1, i1 0
+  %nop5745 = alloca i1, i1 0
+  %nop5746 = alloca i1, i1 0
+  %nop5747 = alloca i1, i1 0
+  %nop5748 = alloca i1, i1 0
+  %nop5749 = alloca i1, i1 0
+  %nop5750 = alloca i1, i1 0
+  %nop5751 = alloca i1, i1 0
+  %nop5752 = alloca i1, i1 0
+  %nop5753 = alloca i1, i1 0
+  %nop5754 = alloca i1, i1 0
+  %nop5755 = alloca i1, i1 0
+  %nop5756 = alloca i1, i1 0
+  %nop5757 = alloca i1, i1 0
+  %nop5758 = alloca i1, i1 0
+  %nop5759 = alloca i1, i1 0
+  %nop5760 = alloca i1, i1 0
+  %nop5761 = alloca i1, i1 0
+  %nop5762 = alloca i1, i1 0
+  %nop5763 = alloca i1, i1 0
+  %nop5764 = alloca i1, i1 0
+  %nop5765 = alloca i1, i1 0
+  %nop5766 = alloca i1, i1 0
+  %nop5767 = alloca i1, i1 0
+  %nop5768 = alloca i1, i1 0
+  %nop5769 = alloca i1, i1 0
+  %nop5770 = alloca i1, i1 0
+  %nop5771 = alloca i1, i1 0
+  %nop5772 = alloca i1, i1 0
+  %nop5773 = alloca i1, i1 0
+  %nop5774 = alloca i1, i1 0
+  %nop5775 = alloca i1, i1 0
+  %nop5776 = alloca i1, i1 0
+  %nop5777 = alloca i1, i1 0
+  %nop5778 = alloca i1, i1 0
+  %nop5779 = alloca i1, i1 0
+  %nop5780 = alloca i1, i1 0
+  %nop5781 = alloca i1, i1 0
+  %nop5782 = alloca i1, i1 0
+  %nop5783 = alloca i1, i1 0
+  %nop5784 = alloca i1, i1 0
+  %nop5785 = alloca i1, i1 0
+  %nop5786 = alloca i1, i1 0
+  %nop5787 = alloca i1, i1 0
+  %nop5788 = alloca i1, i1 0
+  %nop5789 = alloca i1, i1 0
+  %nop5790 = alloca i1, i1 0
+  %nop5791 = alloca i1, i1 0
+  %nop5792 = alloca i1, i1 0
+  %nop5793 = alloca i1, i1 0
+  %nop5794 = alloca i1, i1 0
+  %nop5795 = alloca i1, i1 0
+  %nop5796 = alloca i1, i1 0
+  %nop5797 = alloca i1, i1 0
+  %nop5798 = alloca i1, i1 0
+  %nop5799 = alloca i1, i1 0
+  %nop5800 = alloca i1, i1 0
+  %nop5801 = alloca i1, i1 0
+  %nop5802 = alloca i1, i1 0
+  %nop5803 = alloca i1, i1 0
+  %nop5804 = alloca i1, i1 0
+  %nop5805 = alloca i1, i1 0
+  %nop5806 = alloca i1, i1 0
+  %nop5807 = alloca i1, i1 0
+  %nop5808 = alloca i1, i1 0
+  %nop5809 = alloca i1, i1 0
+  %nop5810 = alloca i1, i1 0
+  %nop5811 = alloca i1, i1 0
+  %nop5812 = alloca i1, i1 0
+  %nop5813 = alloca i1, i1 0
+  %nop5814 = alloca i1, i1 0
+  %nop5815 = alloca i1, i1 0
+  %nop5816 = alloca i1, i1 0
+  %nop5817 = alloca i1, i1 0
+  %nop5818 = alloca i1, i1 0
+  %nop5819 = alloca i1, i1 0
+  %nop5820 = alloca i1, i1 0
+  %nop5821 = alloca i1, i1 0
+  %nop5822 = alloca i1, i1 0
+  %nop5823 = alloca i1, i1 0
+  %nop5824 = alloca i1, i1 0
+  %nop5825 = alloca i1, i1 0
+  %nop5826 = alloca i1, i1 0
+  %nop5827 = alloca i1, i1 0
+  %nop5828 = alloca i1, i1 0
+  %nop5829 = alloca i1, i1 0
+  %nop5830 = alloca i1, i1 0
+  %nop5831 = alloca i1, i1 0
+  %nop5832 = alloca i1, i1 0
+  %nop5833 = alloca i1, i1 0
+  %nop5834 = alloca i1, i1 0
+  %nop5835 = alloca i1, i1 0
+  %nop5836 = alloca i1, i1 0
+  %nop5837 = alloca i1, i1 0
+  %nop5838 = alloca i1, i1 0
+  %nop5839 = alloca i1, i1 0
+  %nop5840 = alloca i1, i1 0
+  %nop5841 = alloca i1, i1 0
+  %nop5842 = alloca i1, i1 0
+  %nop5843 = alloca i1, i1 0
+  %nop5844 = alloca i1, i1 0
+  %nop5845 = alloca i1, i1 0
+  %nop5846 = alloca i1, i1 0
+  %nop5847 = alloca i1, i1 0
+  %nop5848 = alloca i1, i1 0
+  %nop5849 = alloca i1, i1 0
+  %nop5850 = alloca i1, i1 0
+  %nop5851 = alloca i1, i1 0
+  %nop5852 = alloca i1, i1 0
+  %nop5853 = alloca i1, i1 0
+  %nop5854 = alloca i1, i1 0
+  %nop5855 = alloca i1, i1 0
+  %nop5856 = alloca i1, i1 0
+  %nop5857 = alloca i1, i1 0
+  %nop5858 = alloca i1, i1 0
+  %nop5859 = alloca i1, i1 0
+  %nop5860 = alloca i1, i1 0
+  %nop5861 = alloca i1, i1 0
+  %nop5862 = alloca i1, i1 0
+  %nop5863 = alloca i1, i1 0
+  %nop5864 = alloca i1, i1 0
+  %nop5865 = alloca i1, i1 0
+  %nop5866 = alloca i1, i1 0
+  %nop5867 = alloca i1, i1 0
+  %nop5868 = alloca i1, i1 0
+  %nop5869 = alloca i1, i1 0
+  %nop5870 = alloca i1, i1 0
+  %nop5871 = alloca i1, i1 0
+  %nop5872 = alloca i1, i1 0
+  %nop5873 = alloca i1, i1 0
+  %nop5874 = alloca i1, i1 0
+  %nop5875 = alloca i1, i1 0
+  %nop5876 = alloca i1, i1 0
+  %nop5877 = alloca i1, i1 0
+  %nop5878 = alloca i1, i1 0
+  %nop5879 = alloca i1, i1 0
+  %nop5880 = alloca i1, i1 0
+  %nop5881 = alloca i1, i1 0
+  %nop5882 = alloca i1, i1 0
+  %nop5883 = alloca i1, i1 0
+  %nop5884 = alloca i1, i1 0
+  %nop5885 = alloca i1, i1 0
+  %nop5886 = alloca i1, i1 0
+  %nop5887 = alloca i1, i1 0
+  %nop5888 = alloca i1, i1 0
+  %nop5889 = alloca i1, i1 0
+  %nop5890 = alloca i1, i1 0
+  %nop5891 = alloca i1, i1 0
+  %nop5892 = alloca i1, i1 0
+  %nop5893 = alloca i1, i1 0
+  %nop5894 = alloca i1, i1 0
+  %nop5895 = alloca i1, i1 0
+  %nop5896 = alloca i1, i1 0
+  %nop5897 = alloca i1, i1 0
+  %nop5898 = alloca i1, i1 0
+  %nop5899 = alloca i1, i1 0
+  %nop5900 = alloca i1, i1 0
+  %nop5901 = alloca i1, i1 0
+  %nop5902 = alloca i1, i1 0
+  %nop5903 = alloca i1, i1 0
+  %nop5904 = alloca i1, i1 0
+  %nop5905 = alloca i1, i1 0
+  %nop5906 = alloca i1, i1 0
+  %nop5907 = alloca i1, i1 0
+  %nop5908 = alloca i1, i1 0
+  %nop5909 = alloca i1, i1 0
+  %nop5910 = alloca i1, i1 0
+  %nop5911 = alloca i1, i1 0
+  %nop5912 = alloca i1, i1 0
+  %nop5913 = alloca i1, i1 0
+  %nop5914 = alloca i1, i1 0
+  %nop5915 = alloca i1, i1 0
+  %nop5916 = alloca i1, i1 0
+  %nop5917 = alloca i1, i1 0
+  %nop5918 = alloca i1, i1 0
+  %nop5919 = alloca i1, i1 0
+  %nop5920 = alloca i1, i1 0
+  %nop5921 = alloca i1, i1 0
+  %nop5922 = alloca i1, i1 0
+  %nop5923 = alloca i1, i1 0
+  %nop5924 = alloca i1, i1 0
+  %nop5925 = alloca i1, i1 0
+  %nop5926 = alloca i1, i1 0
+  %nop5927 = alloca i1, i1 0
+  %nop5928 = alloca i1, i1 0
+  %nop5929 = alloca i1, i1 0
+  %nop5930 = alloca i1, i1 0
+  %nop5931 = alloca i1, i1 0
+  %nop5932 = alloca i1, i1 0
+  %nop5933 = alloca i1, i1 0
+  %nop5934 = alloca i1, i1 0
+  %nop5935 = alloca i1, i1 0
+  %nop5936 = alloca i1, i1 0
+  %nop5937 = alloca i1, i1 0
+  %nop5938 = alloca i1, i1 0
+  %nop5939 = alloca i1, i1 0
+  %nop5940 = alloca i1, i1 0
+  %nop5941 = alloca i1, i1 0
+  %nop5942 = alloca i1, i1 0
+  %nop5943 = alloca i1, i1 0
+  %nop5944 = alloca i1, i1 0
+  %nop5945 = alloca i1, i1 0
+  %nop5946 = alloca i1, i1 0
+  %nop5947 = alloca i1, i1 0
+  %nop5948 = alloca i1, i1 0
+  %nop5949 = alloca i1, i1 0
+  %nop5950 = alloca i1, i1 0
+  %nop5951 = alloca i1, i1 0
+  %nop5952 = alloca i1, i1 0
+  %nop5953 = alloca i1, i1 0
+  %nop5954 = alloca i1, i1 0
+  %nop5955 = alloca i1, i1 0
+  %nop5956 = alloca i1, i1 0
+  %nop5957 = alloca i1, i1 0
+  %nop5958 = alloca i1, i1 0
+  %nop5959 = alloca i1, i1 0
+  %nop5960 = alloca i1, i1 0
+  %nop5961 = alloca i1, i1 0
+  %nop5962 = alloca i1, i1 0
+  %nop5963 = alloca i1, i1 0
+  %nop5964 = alloca i1, i1 0
+  %nop5965 = alloca i1, i1 0
+  %nop5966 = alloca i1, i1 0
+  %nop5967 = alloca i1, i1 0
+  %nop5968 = alloca i1, i1 0
+  %nop5969 = alloca i1, i1 0
+  %nop5970 = alloca i1, i1 0
+  %nop5971 = alloca i1, i1 0
+  %nop5972 = alloca i1, i1 0
+  %nop5973 = alloca i1, i1 0
+  %nop5974 = alloca i1, i1 0
+  %nop5975 = alloca i1, i1 0
+  %nop5976 = alloca i1, i1 0
+  %nop5977 = alloca i1, i1 0
+  %nop5978 = alloca i1, i1 0
+  %nop5979 = alloca i1, i1 0
+  %nop5980 = alloca i1, i1 0
+  %nop5981 = alloca i1, i1 0
+  %nop5982 = alloca i1, i1 0
+  %nop5983 = alloca i1, i1 0
+  %nop5984 = alloca i1, i1 0
+  %nop5985 = alloca i1, i1 0
+  %nop5986 = alloca i1, i1 0
+  %nop5987 = alloca i1, i1 0
+  %nop5988 = alloca i1, i1 0
+  %nop5989 = alloca i1, i1 0
+  %nop5990 = alloca i1, i1 0
+  %nop5991 = alloca i1, i1 0
+  %nop5992 = alloca i1, i1 0
+  %nop5993 = alloca i1, i1 0
+  %nop5994 = alloca i1, i1 0
+  %nop5995 = alloca i1, i1 0
+  %nop5996 = alloca i1, i1 0
+  %nop5997 = alloca i1, i1 0
+  %nop5998 = alloca i1, i1 0
+  %nop5999 = alloca i1, i1 0
+  %nop6000 = alloca i1, i1 0
+  %nop6001 = alloca i1, i1 0
+  %nop6002 = alloca i1, i1 0
+  %nop6003 = alloca i1, i1 0
+  %nop6004 = alloca i1, i1 0
+  %nop6005 = alloca i1, i1 0
+  %nop6006 = alloca i1, i1 0
+  %nop6007 = alloca i1, i1 0
+  %nop6008 = alloca i1, i1 0
+  %nop6009 = alloca i1, i1 0
+  %nop6010 = alloca i1, i1 0
+  %nop6011 = alloca i1, i1 0
+  %nop6012 = alloca i1, i1 0
+  %nop6013 = alloca i1, i1 0
+  %nop6014 = alloca i1, i1 0
+  %nop6015 = alloca i1, i1 0
+  %nop6016 = alloca i1, i1 0
+  %nop6017 = alloca i1, i1 0
+  %nop6018 = alloca i1, i1 0
+  %nop6019 = alloca i1, i1 0
+  %nop6020 = alloca i1, i1 0
+  %nop6021 = alloca i1, i1 0
+  %nop6022 = alloca i1, i1 0
+  %nop6023 = alloca i1, i1 0
+  %nop6024 = alloca i1, i1 0
+  %nop6025 = alloca i1, i1 0
+  %nop6026 = alloca i1, i1 0
+  %nop6027 = alloca i1, i1 0
+  %nop6028 = alloca i1, i1 0
+  %nop6029 = alloca i1, i1 0
+  %nop6030 = alloca i1, i1 0
+  %nop6031 = alloca i1, i1 0
+  %nop6032 = alloca i1, i1 0
+  %nop6033 = alloca i1, i1 0
+  %nop6034 = alloca i1, i1 0
+  %nop6035 = alloca i1, i1 0
+  %nop6036 = alloca i1, i1 0
+  %nop6037 = alloca i1, i1 0
+  %nop6038 = alloca i1, i1 0
+  %nop6039 = alloca i1, i1 0
+  %nop6040 = alloca i1, i1 0
+  %nop6041 = alloca i1, i1 0
+  %nop6042 = alloca i1, i1 0
+  %nop6043 = alloca i1, i1 0
+  %nop6044 = alloca i1, i1 0
+  %nop6045 = alloca i1, i1 0
+  %nop6046 = alloca i1, i1 0
+  %nop6047 = alloca i1, i1 0
+  %nop6048 = alloca i1, i1 0
+  %nop6049 = alloca i1, i1 0
+  %nop6050 = alloca i1, i1 0
+  %nop6051 = alloca i1, i1 0
+  %nop6052 = alloca i1, i1 0
+  %nop6053 = alloca i1, i1 0
+  %nop6054 = alloca i1, i1 0
+  %nop6055 = alloca i1, i1 0
+  %nop6056 = alloca i1, i1 0
+  %nop6057 = alloca i1, i1 0
+  %nop6058 = alloca i1, i1 0
+  %nop6059 = alloca i1, i1 0
+  %nop6060 = alloca i1, i1 0
+  %nop6061 = alloca i1, i1 0
+  %nop6062 = alloca i1, i1 0
+  %nop6063 = alloca i1, i1 0
+  %nop6064 = alloca i1, i1 0
+  %nop6065 = alloca i1, i1 0
+  %nop6066 = alloca i1, i1 0
+  %nop6067 = alloca i1, i1 0
+  %nop6068 = alloca i1, i1 0
+  %nop6069 = alloca i1, i1 0
+  %nop6070 = alloca i1, i1 0
+  %nop6071 = alloca i1, i1 0
+  %nop6072 = alloca i1, i1 0
+  %nop6073 = alloca i1, i1 0
+  %nop6074 = alloca i1, i1 0
+  %nop6075 = alloca i1, i1 0
+  %nop6076 = alloca i1, i1 0
+  %nop6077 = alloca i1, i1 0
+  %nop6078 = alloca i1, i1 0
+  %nop6079 = alloca i1, i1 0
+  %nop6080 = alloca i1, i1 0
+  %nop6081 = alloca i1, i1 0
+  %nop6082 = alloca i1, i1 0
+  %nop6083 = alloca i1, i1 0
+  %nop6084 = alloca i1, i1 0
+  %nop6085 = alloca i1, i1 0
+  %nop6086 = alloca i1, i1 0
+  %nop6087 = alloca i1, i1 0
+  %nop6088 = alloca i1, i1 0
+  %nop6089 = alloca i1, i1 0
+  %nop6090 = alloca i1, i1 0
+  %nop6091 = alloca i1, i1 0
+  %nop6092 = alloca i1, i1 0
+  %nop6093 = alloca i1, i1 0
+  %nop6094 = alloca i1, i1 0
+  %nop6095 = alloca i1, i1 0
+  %nop6096 = alloca i1, i1 0
+  %nop6097 = alloca i1, i1 0
+  %nop6098 = alloca i1, i1 0
+  %nop6099 = alloca i1, i1 0
+  %nop6100 = alloca i1, i1 0
+  %nop6101 = alloca i1, i1 0
+  %nop6102 = alloca i1, i1 0
+  %nop6103 = alloca i1, i1 0
+  %nop6104 = alloca i1, i1 0
+  %nop6105 = alloca i1, i1 0
+  %nop6106 = alloca i1, i1 0
+  %nop6107 = alloca i1, i1 0
+  %nop6108 = alloca i1, i1 0
+  %nop6109 = alloca i1, i1 0
+  %nop6110 = alloca i1, i1 0
+  %nop6111 = alloca i1, i1 0
+  %nop6112 = alloca i1, i1 0
+  %nop6113 = alloca i1, i1 0
+  %nop6114 = alloca i1, i1 0
+  %nop6115 = alloca i1, i1 0
+  %nop6116 = alloca i1, i1 0
+  %nop6117 = alloca i1, i1 0
+  %nop6118 = alloca i1, i1 0
+  %nop6119 = alloca i1, i1 0
+  %nop6120 = alloca i1, i1 0
+  %nop6121 = alloca i1, i1 0
+  %nop6122 = alloca i1, i1 0
+  %nop6123 = alloca i1, i1 0
+  %nop6124 = alloca i1, i1 0
+  %nop6125 = alloca i1, i1 0
+  %nop6126 = alloca i1, i1 0
+  %nop6127 = alloca i1, i1 0
+  %nop6128 = alloca i1, i1 0
+  %nop6129 = alloca i1, i1 0
+  %nop6130 = alloca i1, i1 0
+  %nop6131 = alloca i1, i1 0
+  %nop6132 = alloca i1, i1 0
+  %nop6133 = alloca i1, i1 0
+  %nop6134 = alloca i1, i1 0
+  %nop6135 = alloca i1, i1 0
+  %nop6136 = alloca i1, i1 0
+  %nop6137 = alloca i1, i1 0
+  %nop6138 = alloca i1, i1 0
+  %nop6139 = alloca i1, i1 0
+  %nop6140 = alloca i1, i1 0
+  %nop6141 = alloca i1, i1 0
+  %nop6142 = alloca i1, i1 0
+  %nop6143 = alloca i1, i1 0
+  %nop6144 = alloca i1, i1 0
+  %nop6145 = alloca i1, i1 0
+  %nop6146 = alloca i1, i1 0
+  %nop6147 = alloca i1, i1 0
+  %nop6148 = alloca i1, i1 0
+  %nop6149 = alloca i1, i1 0
+  %nop6150 = alloca i1, i1 0
+  %nop6151 = alloca i1, i1 0
+  %nop6152 = alloca i1, i1 0
+  %nop6153 = alloca i1, i1 0
+  %nop6154 = alloca i1, i1 0
+  %nop6155 = alloca i1, i1 0
+  %nop6156 = alloca i1, i1 0
+  %nop6157 = alloca i1, i1 0
+  %nop6158 = alloca i1, i1 0
+  %nop6159 = alloca i1, i1 0
+  %nop6160 = alloca i1, i1 0
+  %nop6161 = alloca i1, i1 0
+  %nop6162 = alloca i1, i1 0
+  %nop6163 = alloca i1, i1 0
+  %nop6164 = alloca i1, i1 0
+  %nop6165 = alloca i1, i1 0
+  %nop6166 = alloca i1, i1 0
+  %nop6167 = alloca i1, i1 0
+  %nop6168 = alloca i1, i1 0
+  %nop6169 = alloca i1, i1 0
+  %nop6170 = alloca i1, i1 0
+  %nop6171 = alloca i1, i1 0
+  %nop6172 = alloca i1, i1 0
+  %nop6173 = alloca i1, i1 0
+  %nop6174 = alloca i1, i1 0
+  %nop6175 = alloca i1, i1 0
+  %nop6176 = alloca i1, i1 0
+  %nop6177 = alloca i1, i1 0
+  %nop6178 = alloca i1, i1 0
+  %nop6179 = alloca i1, i1 0
+  %nop6180 = alloca i1, i1 0
+  %nop6181 = alloca i1, i1 0
+  %nop6182 = alloca i1, i1 0
+  %nop6183 = alloca i1, i1 0
+  %nop6184 = alloca i1, i1 0
+  %nop6185 = alloca i1, i1 0
+  %nop6186 = alloca i1, i1 0
+  %nop6187 = alloca i1, i1 0
+  %nop6188 = alloca i1, i1 0
+  %nop6189 = alloca i1, i1 0
+  %nop6190 = alloca i1, i1 0
+  %nop6191 = alloca i1, i1 0
+  %nop6192 = alloca i1, i1 0
+  %nop6193 = alloca i1, i1 0
+  %nop6194 = alloca i1, i1 0
+  %nop6195 = alloca i1, i1 0
+  %nop6196 = alloca i1, i1 0
+  %nop6197 = alloca i1, i1 0
+  %nop6198 = alloca i1, i1 0
+  %nop6199 = alloca i1, i1 0
+  %nop6200 = alloca i1, i1 0
+  %nop6201 = alloca i1, i1 0
+  %nop6202 = alloca i1, i1 0
+  %nop6203 = alloca i1, i1 0
+  %nop6204 = alloca i1, i1 0
+  %nop6205 = alloca i1, i1 0
+  %nop6206 = alloca i1, i1 0
+  %nop6207 = alloca i1, i1 0
+  %nop6208 = alloca i1, i1 0
+  %nop6209 = alloca i1, i1 0
+  %nop6210 = alloca i1, i1 0
+  %nop6211 = alloca i1, i1 0
+  %nop6212 = alloca i1, i1 0
+  %nop6213 = alloca i1, i1 0
+  %nop6214 = alloca i1, i1 0
+  %nop6215 = alloca i1, i1 0
+  %nop6216 = alloca i1, i1 0
+  %nop6217 = alloca i1, i1 0
+  %nop6218 = alloca i1, i1 0
+  %nop6219 = alloca i1, i1 0
+  %nop6220 = alloca i1, i1 0
+  %nop6221 = alloca i1, i1 0
+  %nop6222 = alloca i1, i1 0
+  %nop6223 = alloca i1, i1 0
+  %nop6224 = alloca i1, i1 0
+  %nop6225 = alloca i1, i1 0
+  %nop6226 = alloca i1, i1 0
+  %nop6227 = alloca i1, i1 0
+  %nop6228 = alloca i1, i1 0
+  %nop6229 = alloca i1, i1 0
+  %nop6230 = alloca i1, i1 0
+  %nop6231 = alloca i1, i1 0
+  %nop6232 = alloca i1, i1 0
+  %nop6233 = alloca i1, i1 0
+  %nop6234 = alloca i1, i1 0
+  %nop6235 = alloca i1, i1 0
+  %nop6236 = alloca i1, i1 0
+  %nop6237 = alloca i1, i1 0
+  %nop6238 = alloca i1, i1 0
+  %nop6239 = alloca i1, i1 0
+  %nop6240 = alloca i1, i1 0
+  %nop6241 = alloca i1, i1 0
+  %nop6242 = alloca i1, i1 0
+  %nop6243 = alloca i1, i1 0
+  %nop6244 = alloca i1, i1 0
+  %nop6245 = alloca i1, i1 0
+  %nop6246 = alloca i1, i1 0
+  %nop6247 = alloca i1, i1 0
+  %nop6248 = alloca i1, i1 0
+  %nop6249 = alloca i1, i1 0
+  %nop6250 = alloca i1, i1 0
+  %nop6251 = alloca i1, i1 0
+  %nop6252 = alloca i1, i1 0
+  %nop6253 = alloca i1, i1 0
+  %nop6254 = alloca i1, i1 0
+  %nop6255 = alloca i1, i1 0
+  %nop6256 = alloca i1, i1 0
+  %nop6257 = alloca i1, i1 0
+  %nop6258 = alloca i1, i1 0
+  %nop6259 = alloca i1, i1 0
+  %nop6260 = alloca i1, i1 0
+  %nop6261 = alloca i1, i1 0
+  %nop6262 = alloca i1, i1 0
+  %nop6263 = alloca i1, i1 0
+  %nop6264 = alloca i1, i1 0
+  %nop6265 = alloca i1, i1 0
+  %nop6266 = alloca i1, i1 0
+  %nop6267 = alloca i1, i1 0
+  %nop6268 = alloca i1, i1 0
+  %nop6269 = alloca i1, i1 0
+  %nop6270 = alloca i1, i1 0
+  %nop6271 = alloca i1, i1 0
+  %nop6272 = alloca i1, i1 0
+  %nop6273 = alloca i1, i1 0
+  %nop6274 = alloca i1, i1 0
+  %nop6275 = alloca i1, i1 0
+  %nop6276 = alloca i1, i1 0
+  %nop6277 = alloca i1, i1 0
+  %nop6278 = alloca i1, i1 0
+  %nop6279 = alloca i1, i1 0
+  %nop6280 = alloca i1, i1 0
+  %nop6281 = alloca i1, i1 0
+  %nop6282 = alloca i1, i1 0
+  %nop6283 = alloca i1, i1 0
+  %nop6284 = alloca i1, i1 0
+  %nop6285 = alloca i1, i1 0
+  %nop6286 = alloca i1, i1 0
+  %nop6287 = alloca i1, i1 0
+  %nop6288 = alloca i1, i1 0
+  %nop6289 = alloca i1, i1 0
+  %nop6290 = alloca i1, i1 0
+  %nop6291 = alloca i1, i1 0
+  %nop6292 = alloca i1, i1 0
+  %nop6293 = alloca i1, i1 0
+  %nop6294 = alloca i1, i1 0
+  %nop6295 = alloca i1, i1 0
+  %nop6296 = alloca i1, i1 0
+  %nop6297 = alloca i1, i1 0
+  %nop6298 = alloca i1, i1 0
+  %nop6299 = alloca i1, i1 0
+  %nop6300 = alloca i1, i1 0
+  %nop6301 = alloca i1, i1 0
+  %nop6302 = alloca i1, i1 0
+  %nop6303 = alloca i1, i1 0
+  %nop6304 = alloca i1, i1 0
+  %nop6305 = alloca i1, i1 0
+  %nop6306 = alloca i1, i1 0
+  %nop6307 = alloca i1, i1 0
+  %nop6308 = alloca i1, i1 0
+  %nop6309 = alloca i1, i1 0
+  %nop6310 = alloca i1, i1 0
+  %nop6311 = alloca i1, i1 0
+  %nop6312 = alloca i1, i1 0
+  %nop6313 = alloca i1, i1 0
+  %nop6314 = alloca i1, i1 0
+  %nop6315 = alloca i1, i1 0
+  %nop6316 = alloca i1, i1 0
+  %nop6317 = alloca i1, i1 0
+  %nop6318 = alloca i1, i1 0
+  %nop6319 = alloca i1, i1 0
+  %nop6320 = alloca i1, i1 0
+  %nop6321 = alloca i1, i1 0
+  %nop6322 = alloca i1, i1 0
+  %nop6323 = alloca i1, i1 0
+  %nop6324 = alloca i1, i1 0
+  %nop6325 = alloca i1, i1 0
+  %nop6326 = alloca i1, i1 0
+  %nop6327 = alloca i1, i1 0
+  %nop6328 = alloca i1, i1 0
+  %nop6329 = alloca i1, i1 0
+  %nop6330 = alloca i1, i1 0
+  %nop6331 = alloca i1, i1 0
+  %nop6332 = alloca i1, i1 0
+  %nop6333 = alloca i1, i1 0
+  %nop6334 = alloca i1, i1 0
+  %nop6335 = alloca i1, i1 0
+  %nop6336 = alloca i1, i1 0
+  %nop6337 = alloca i1, i1 0
+  %nop6338 = alloca i1, i1 0
+  %nop6339 = alloca i1, i1 0
+  %nop6340 = alloca i1, i1 0
+  %nop6341 = alloca i1, i1 0
+  %nop6342 = alloca i1, i1 0
+  %nop6343 = alloca i1, i1 0
+  %nop6344 = alloca i1, i1 0
+  %nop6345 = alloca i1, i1 0
+  %nop6346 = alloca i1, i1 0
+  %nop6347 = alloca i1, i1 0
+  %nop6348 = alloca i1, i1 0
+  %nop6349 = alloca i1, i1 0
+  %nop6350 = alloca i1, i1 0
+  %nop6351 = alloca i1, i1 0
+  %nop6352 = alloca i1, i1 0
+  %nop6353 = alloca i1, i1 0
+  %nop6354 = alloca i1, i1 0
+  %nop6355 = alloca i1, i1 0
+  %nop6356 = alloca i1, i1 0
+  %nop6357 = alloca i1, i1 0
+  %nop6358 = alloca i1, i1 0
+  %nop6359 = alloca i1, i1 0
+  %nop6360 = alloca i1, i1 0
+  %nop6361 = alloca i1, i1 0
+  %nop6362 = alloca i1, i1 0
+  %nop6363 = alloca i1, i1 0
+  %nop6364 = alloca i1, i1 0
+  %nop6365 = alloca i1, i1 0
+  %nop6366 = alloca i1, i1 0
+  %nop6367 = alloca i1, i1 0
+  %nop6368 = alloca i1, i1 0
+  %nop6369 = alloca i1, i1 0
+  %nop6370 = alloca i1, i1 0
+  %nop6371 = alloca i1, i1 0
+  %nop6372 = alloca i1, i1 0
+  %nop6373 = alloca i1, i1 0
+  %nop6374 = alloca i1, i1 0
+  %nop6375 = alloca i1, i1 0
+  %nop6376 = alloca i1, i1 0
+  %nop6377 = alloca i1, i1 0
+  %nop6378 = alloca i1, i1 0
+  %nop6379 = alloca i1, i1 0
+  %nop6380 = alloca i1, i1 0
+  %nop6381 = alloca i1, i1 0
+  %nop6382 = alloca i1, i1 0
+  %nop6383 = alloca i1, i1 0
+  %nop6384 = alloca i1, i1 0
+  %nop6385 = alloca i1, i1 0
+  %nop6386 = alloca i1, i1 0
+  %nop6387 = alloca i1, i1 0
+  %nop6388 = alloca i1, i1 0
+  %nop6389 = alloca i1, i1 0
+  %nop6390 = alloca i1, i1 0
+  %nop6391 = alloca i1, i1 0
+  %nop6392 = alloca i1, i1 0
+  %nop6393 = alloca i1, i1 0
+  %nop6394 = alloca i1, i1 0
+  %nop6395 = alloca i1, i1 0
+  %nop6396 = alloca i1, i1 0
+  %nop6397 = alloca i1, i1 0
+  %nop6398 = alloca i1, i1 0
+  %nop6399 = alloca i1, i1 0
+  %nop6400 = alloca i1, i1 0
+  %nop6401 = alloca i1, i1 0
+  %nop6402 = alloca i1, i1 0
+  %nop6403 = alloca i1, i1 0
+  %nop6404 = alloca i1, i1 0
+  %nop6405 = alloca i1, i1 0
+  %nop6406 = alloca i1, i1 0
+  %nop6407 = alloca i1, i1 0
+  %nop6408 = alloca i1, i1 0
+  %nop6409 = alloca i1, i1 0
+  %nop6410 = alloca i1, i1 0
+  %nop6411 = alloca i1, i1 0
+  %nop6412 = alloca i1, i1 0
+  %nop6413 = alloca i1, i1 0
+  %nop6414 = alloca i1, i1 0
+  %nop6415 = alloca i1, i1 0
+  %nop6416 = alloca i1, i1 0
+  %nop6417 = alloca i1, i1 0
+  %nop6418 = alloca i1, i1 0
+  %nop6419 = alloca i1, i1 0
+  %nop6420 = alloca i1, i1 0
+  %nop6421 = alloca i1, i1 0
+  %nop6422 = alloca i1, i1 0
+  %nop6423 = alloca i1, i1 0
+  %nop6424 = alloca i1, i1 0
+  %nop6425 = alloca i1, i1 0
+  %nop6426 = alloca i1, i1 0
+  %nop6427 = alloca i1, i1 0
+  %nop6428 = alloca i1, i1 0
+  %nop6429 = alloca i1, i1 0
+  %nop6430 = alloca i1, i1 0
+  %nop6431 = alloca i1, i1 0
+  %nop6432 = alloca i1, i1 0
+  %nop6433 = alloca i1, i1 0
+  %nop6434 = alloca i1, i1 0
+  %nop6435 = alloca i1, i1 0
+  %nop6436 = alloca i1, i1 0
+  %nop6437 = alloca i1, i1 0
+  %nop6438 = alloca i1, i1 0
+  %nop6439 = alloca i1, i1 0
+  %nop6440 = alloca i1, i1 0
+  %nop6441 = alloca i1, i1 0
+  %nop6442 = alloca i1, i1 0
+  %nop6443 = alloca i1, i1 0
+  %nop6444 = alloca i1, i1 0
+  %nop6445 = alloca i1, i1 0
+  %nop6446 = alloca i1, i1 0
+  %nop6447 = alloca i1, i1 0
+  %nop6448 = alloca i1, i1 0
+  %nop6449 = alloca i1, i1 0
+  %nop6450 = alloca i1, i1 0
+  %nop6451 = alloca i1, i1 0
+  %nop6452 = alloca i1, i1 0
+  %nop6453 = alloca i1, i1 0
+  %nop6454 = alloca i1, i1 0
+  %nop6455 = alloca i1, i1 0
+  %nop6456 = alloca i1, i1 0
+  %nop6457 = alloca i1, i1 0
+  %nop6458 = alloca i1, i1 0
+  %nop6459 = alloca i1, i1 0
+  %nop6460 = alloca i1, i1 0
+  %nop6461 = alloca i1, i1 0
+  %nop6462 = alloca i1, i1 0
+  %nop6463 = alloca i1, i1 0
+  %nop6464 = alloca i1, i1 0
+  %nop6465 = alloca i1, i1 0
+  %nop6466 = alloca i1, i1 0
+  %nop6467 = alloca i1, i1 0
+  %nop6468 = alloca i1, i1 0
+  %nop6469 = alloca i1, i1 0
+  %nop6470 = alloca i1, i1 0
+  %nop6471 = alloca i1, i1 0
+  %nop6472 = alloca i1, i1 0
+  %nop6473 = alloca i1, i1 0
+  %nop6474 = alloca i1, i1 0
+  %nop6475 = alloca i1, i1 0
+  %nop6476 = alloca i1, i1 0
+  %nop6477 = alloca i1, i1 0
+  %nop6478 = alloca i1, i1 0
+  %nop6479 = alloca i1, i1 0
+  %nop6480 = alloca i1, i1 0
+  %nop6481 = alloca i1, i1 0
+  %nop6482 = alloca i1, i1 0
+  %nop6483 = alloca i1, i1 0
+  %nop6484 = alloca i1, i1 0
+  %nop6485 = alloca i1, i1 0
+  %nop6486 = alloca i1, i1 0
+  %nop6487 = alloca i1, i1 0
+  %nop6488 = alloca i1, i1 0
+  %nop6489 = alloca i1, i1 0
+  %nop6490 = alloca i1, i1 0
+  %nop6491 = alloca i1, i1 0
+  %nop6492 = alloca i1, i1 0
+  %nop6493 = alloca i1, i1 0
+  %nop6494 = alloca i1, i1 0
+  %nop6495 = alloca i1, i1 0
+  %nop6496 = alloca i1, i1 0
+  %nop6497 = alloca i1, i1 0
+  %nop6498 = alloca i1, i1 0
+  %nop6499 = alloca i1, i1 0
+  %nop6500 = alloca i1, i1 0
+  %nop6501 = alloca i1, i1 0
+  %nop6502 = alloca i1, i1 0
+  %nop6503 = alloca i1, i1 0
+  %nop6504 = alloca i1, i1 0
+  %nop6505 = alloca i1, i1 0
+  %nop6506 = alloca i1, i1 0
+  %nop6507 = alloca i1, i1 0
+  %nop6508 = alloca i1, i1 0
+  %nop6509 = alloca i1, i1 0
+  %nop6510 = alloca i1, i1 0
+  %nop6511 = alloca i1, i1 0
+  %nop6512 = alloca i1, i1 0
+  %nop6513 = alloca i1, i1 0
+  %nop6514 = alloca i1, i1 0
+  %nop6515 = alloca i1, i1 0
+  %nop6516 = alloca i1, i1 0
+  %nop6517 = alloca i1, i1 0
+  %nop6518 = alloca i1, i1 0
+  %nop6519 = alloca i1, i1 0
+  %nop6520 = alloca i1, i1 0
+  %nop6521 = alloca i1, i1 0
+  %nop6522 = alloca i1, i1 0
+  %nop6523 = alloca i1, i1 0
+  %nop6524 = alloca i1, i1 0
+  %nop6525 = alloca i1, i1 0
+  %nop6526 = alloca i1, i1 0
+  %nop6527 = alloca i1, i1 0
+  %nop6528 = alloca i1, i1 0
+  %nop6529 = alloca i1, i1 0
+  %nop6530 = alloca i1, i1 0
+  %nop6531 = alloca i1, i1 0
+  %nop6532 = alloca i1, i1 0
+  %nop6533 = alloca i1, i1 0
+  %nop6534 = alloca i1, i1 0
+  %nop6535 = alloca i1, i1 0
+  %nop6536 = alloca i1, i1 0
+  %nop6537 = alloca i1, i1 0
+  %nop6538 = alloca i1, i1 0
+  %nop6539 = alloca i1, i1 0
+  %nop6540 = alloca i1, i1 0
+  %nop6541 = alloca i1, i1 0
+  %nop6542 = alloca i1, i1 0
+  %nop6543 = alloca i1, i1 0
+  %nop6544 = alloca i1, i1 0
+  %nop6545 = alloca i1, i1 0
+  %nop6546 = alloca i1, i1 0
+  %nop6547 = alloca i1, i1 0
+  %nop6548 = alloca i1, i1 0
+  %nop6549 = alloca i1, i1 0
+  %nop6550 = alloca i1, i1 0
+  %nop6551 = alloca i1, i1 0
+  %nop6552 = alloca i1, i1 0
+  %nop6553 = alloca i1, i1 0
+  %nop6554 = alloca i1, i1 0
+  %nop6555 = alloca i1, i1 0
+  %nop6556 = alloca i1, i1 0
+  %nop6557 = alloca i1, i1 0
+  %nop6558 = alloca i1, i1 0
+  %nop6559 = alloca i1, i1 0
+  %nop6560 = alloca i1, i1 0
+  %nop6561 = alloca i1, i1 0
+  %nop6562 = alloca i1, i1 0
+  %nop6563 = alloca i1, i1 0
+  %nop6564 = alloca i1, i1 0
+  %nop6565 = alloca i1, i1 0
+  %nop6566 = alloca i1, i1 0
+  %nop6567 = alloca i1, i1 0
+  %nop6568 = alloca i1, i1 0
+  %nop6569 = alloca i1, i1 0
+  %nop6570 = alloca i1, i1 0
+  %nop6571 = alloca i1, i1 0
+  %nop6572 = alloca i1, i1 0
+  %nop6573 = alloca i1, i1 0
+  %nop6574 = alloca i1, i1 0
+  %nop6575 = alloca i1, i1 0
+  %nop6576 = alloca i1, i1 0
+  %nop6577 = alloca i1, i1 0
+  %nop6578 = alloca i1, i1 0
+  %nop6579 = alloca i1, i1 0
+  %nop6580 = alloca i1, i1 0
+  %nop6581 = alloca i1, i1 0
+  %nop6582 = alloca i1, i1 0
+  %nop6583 = alloca i1, i1 0
+  %nop6584 = alloca i1, i1 0
+  %nop6585 = alloca i1, i1 0
+  %nop6586 = alloca i1, i1 0
+  %nop6587 = alloca i1, i1 0
+  %nop6588 = alloca i1, i1 0
+  %nop6589 = alloca i1, i1 0
+  %nop6590 = alloca i1, i1 0
+  %nop6591 = alloca i1, i1 0
+  %nop6592 = alloca i1, i1 0
+  %nop6593 = alloca i1, i1 0
+  %nop6594 = alloca i1, i1 0
+  %nop6595 = alloca i1, i1 0
+  %nop6596 = alloca i1, i1 0
+  %nop6597 = alloca i1, i1 0
+  %nop6598 = alloca i1, i1 0
+  %nop6599 = alloca i1, i1 0
+  %nop6600 = alloca i1, i1 0
+  %nop6601 = alloca i1, i1 0
+  %nop6602 = alloca i1, i1 0
+  %nop6603 = alloca i1, i1 0
+  %nop6604 = alloca i1, i1 0
+  %nop6605 = alloca i1, i1 0
+  %nop6606 = alloca i1, i1 0
+  %nop6607 = alloca i1, i1 0
+  %nop6608 = alloca i1, i1 0
+  %nop6609 = alloca i1, i1 0
+  %nop6610 = alloca i1, i1 0
+  %nop6611 = alloca i1, i1 0
+  %nop6612 = alloca i1, i1 0
+  %nop6613 = alloca i1, i1 0
+  %nop6614 = alloca i1, i1 0
+  %nop6615 = alloca i1, i1 0
+  %nop6616 = alloca i1, i1 0
+  %nop6617 = alloca i1, i1 0
+  %nop6618 = alloca i1, i1 0
+  %nop6619 = alloca i1, i1 0
+  %nop6620 = alloca i1, i1 0
+  %nop6621 = alloca i1, i1 0
+  %nop6622 = alloca i1, i1 0
+  %nop6623 = alloca i1, i1 0
+  %nop6624 = alloca i1, i1 0
+  %nop6625 = alloca i1, i1 0
+  %nop6626 = alloca i1, i1 0
+  %nop6627 = alloca i1, i1 0
+  %nop6628 = alloca i1, i1 0
+  %nop6629 = alloca i1, i1 0
+  %nop6630 = alloca i1, i1 0
+  %nop6631 = alloca i1, i1 0
+  %nop6632 = alloca i1, i1 0
+  %nop6633 = alloca i1, i1 0
+  %nop6634 = alloca i1, i1 0
+  %nop6635 = alloca i1, i1 0
+  %nop6636 = alloca i1, i1 0
+  %nop6637 = alloca i1, i1 0
+  %nop6638 = alloca i1, i1 0
+  %nop6639 = alloca i1, i1 0
+  %nop6640 = alloca i1, i1 0
+  %nop6641 = alloca i1, i1 0
+  %nop6642 = alloca i1, i1 0
+  %nop6643 = alloca i1, i1 0
+  %nop6644 = alloca i1, i1 0
+  %nop6645 = alloca i1, i1 0
+  %nop6646 = alloca i1, i1 0
+  %nop6647 = alloca i1, i1 0
+  %nop6648 = alloca i1, i1 0
+  %nop6649 = alloca i1, i1 0
+  %nop6650 = alloca i1, i1 0
+  %nop6651 = alloca i1, i1 0
+  %nop6652 = alloca i1, i1 0
+  %nop6653 = alloca i1, i1 0
+  %nop6654 = alloca i1, i1 0
+  %nop6655 = alloca i1, i1 0
+  %nop6656 = alloca i1, i1 0
+  %nop6657 = alloca i1, i1 0
+  %nop6658 = alloca i1, i1 0
+  %nop6659 = alloca i1, i1 0
+  %nop6660 = alloca i1, i1 0
+  %nop6661 = alloca i1, i1 0
+  %nop6662 = alloca i1, i1 0
+  %nop6663 = alloca i1, i1 0
+  %nop6664 = alloca i1, i1 0
+  %nop6665 = alloca i1, i1 0
+  %nop6666 = alloca i1, i1 0
+  %nop6667 = alloca i1, i1 0
+  %nop6668 = alloca i1, i1 0
+  %nop6669 = alloca i1, i1 0
+  %nop6670 = alloca i1, i1 0
+  %nop6671 = alloca i1, i1 0
+  %nop6672 = alloca i1, i1 0
+  %nop6673 = alloca i1, i1 0
+  %nop6674 = alloca i1, i1 0
+  %nop6675 = alloca i1, i1 0
+  %nop6676 = alloca i1, i1 0
+  %nop6677 = alloca i1, i1 0
+  %nop6678 = alloca i1, i1 0
+  %nop6679 = alloca i1, i1 0
+  %nop6680 = alloca i1, i1 0
+  %nop6681 = alloca i1, i1 0
+  %nop6682 = alloca i1, i1 0
+  %nop6683 = alloca i1, i1 0
+  %nop6684 = alloca i1, i1 0
+  %nop6685 = alloca i1, i1 0
+  %nop6686 = alloca i1, i1 0
+  %nop6687 = alloca i1, i1 0
+  %nop6688 = alloca i1, i1 0
+  %nop6689 = alloca i1, i1 0
+  %nop6690 = alloca i1, i1 0
+  %nop6691 = alloca i1, i1 0
+  %nop6692 = alloca i1, i1 0
+  %nop6693 = alloca i1, i1 0
+  %nop6694 = alloca i1, i1 0
+  %nop6695 = alloca i1, i1 0
+  %nop6696 = alloca i1, i1 0
+  %nop6697 = alloca i1, i1 0
+  %nop6698 = alloca i1, i1 0
+  %nop6699 = alloca i1, i1 0
+  %nop6700 = alloca i1, i1 0
+  %nop6701 = alloca i1, i1 0
+  %nop6702 = alloca i1, i1 0
+  %nop6703 = alloca i1, i1 0
+  %nop6704 = alloca i1, i1 0
+  %nop6705 = alloca i1, i1 0
+  %nop6706 = alloca i1, i1 0
+  %nop6707 = alloca i1, i1 0
+  %nop6708 = alloca i1, i1 0
+  %nop6709 = alloca i1, i1 0
+  %nop6710 = alloca i1, i1 0
+  %nop6711 = alloca i1, i1 0
+  %nop6712 = alloca i1, i1 0
+  %nop6713 = alloca i1, i1 0
+  %nop6714 = alloca i1, i1 0
+  %nop6715 = alloca i1, i1 0
+  %nop6716 = alloca i1, i1 0
+  %nop6717 = alloca i1, i1 0
+  %nop6718 = alloca i1, i1 0
+  %nop6719 = alloca i1, i1 0
+  %nop6720 = alloca i1, i1 0
+  %nop6721 = alloca i1, i1 0
+  %nop6722 = alloca i1, i1 0
+  %nop6723 = alloca i1, i1 0
+  %nop6724 = alloca i1, i1 0
+  %nop6725 = alloca i1, i1 0
+  %nop6726 = alloca i1, i1 0
+  %nop6727 = alloca i1, i1 0
+  %nop6728 = alloca i1, i1 0
+  %nop6729 = alloca i1, i1 0
+  %nop6730 = alloca i1, i1 0
+  %nop6731 = alloca i1, i1 0
+  %nop6732 = alloca i1, i1 0
+  %nop6733 = alloca i1, i1 0
+  %nop6734 = alloca i1, i1 0
+  %nop6735 = alloca i1, i1 0
+  %nop6736 = alloca i1, i1 0
+  %nop6737 = alloca i1, i1 0
+  %nop6738 = alloca i1, i1 0
+  %nop6739 = alloca i1, i1 0
+  %nop6740 = alloca i1, i1 0
+  %nop6741 = alloca i1, i1 0
+  %nop6742 = alloca i1, i1 0
+  %nop6743 = alloca i1, i1 0
+  %nop6744 = alloca i1, i1 0
+  %nop6745 = alloca i1, i1 0
+  %nop6746 = alloca i1, i1 0
+  %nop6747 = alloca i1, i1 0
+  %nop6748 = alloca i1, i1 0
+  %nop6749 = alloca i1, i1 0
+  %nop6750 = alloca i1, i1 0
+  %nop6751 = alloca i1, i1 0
+  %nop6752 = alloca i1, i1 0
+  %nop6753 = alloca i1, i1 0
+  %nop6754 = alloca i1, i1 0
+  %nop6755 = alloca i1, i1 0
+  %nop6756 = alloca i1, i1 0
+  %nop6757 = alloca i1, i1 0
+  %nop6758 = alloca i1, i1 0
+  %nop6759 = alloca i1, i1 0
+  %nop6760 = alloca i1, i1 0
+  %nop6761 = alloca i1, i1 0
+  %nop6762 = alloca i1, i1 0
+  %nop6763 = alloca i1, i1 0
+  %nop6764 = alloca i1, i1 0
+  %nop6765 = alloca i1, i1 0
+  %nop6766 = alloca i1, i1 0
+  %nop6767 = alloca i1, i1 0
+  %nop6768 = alloca i1, i1 0
+  %nop6769 = alloca i1, i1 0
+  %nop6770 = alloca i1, i1 0
+  %nop6771 = alloca i1, i1 0
+  %nop6772 = alloca i1, i1 0
+  %nop6773 = alloca i1, i1 0
+  %nop6774 = alloca i1, i1 0
+  %nop6775 = alloca i1, i1 0
+  %nop6776 = alloca i1, i1 0
+  %nop6777 = alloca i1, i1 0
+  %nop6778 = alloca i1, i1 0
+  %nop6779 = alloca i1, i1 0
+  %nop6780 = alloca i1, i1 0
+  %nop6781 = alloca i1, i1 0
+  %nop6782 = alloca i1, i1 0
+  %nop6783 = alloca i1, i1 0
+  %nop6784 = alloca i1, i1 0
+  %nop6785 = alloca i1, i1 0
+  %nop6786 = alloca i1, i1 0
+  %nop6787 = alloca i1, i1 0
+  %nop6788 = alloca i1, i1 0
+  %nop6789 = alloca i1, i1 0
+  %nop6790 = alloca i1, i1 0
+  %nop6791 = alloca i1, i1 0
+  %nop6792 = alloca i1, i1 0
+  %nop6793 = alloca i1, i1 0
+  %nop6794 = alloca i1, i1 0
+  %nop6795 = alloca i1, i1 0
+  %nop6796 = alloca i1, i1 0
+  %nop6797 = alloca i1, i1 0
+  %nop6798 = alloca i1, i1 0
+  %nop6799 = alloca i1, i1 0
+  %nop6800 = alloca i1, i1 0
+  %nop6801 = alloca i1, i1 0
+  %nop6802 = alloca i1, i1 0
+  %nop6803 = alloca i1, i1 0
+  %nop6804 = alloca i1, i1 0
+  %nop6805 = alloca i1, i1 0
+  %nop6806 = alloca i1, i1 0
+  %nop6807 = alloca i1, i1 0
+  %nop6808 = alloca i1, i1 0
+  %nop6809 = alloca i1, i1 0
+  %nop6810 = alloca i1, i1 0
+  %nop6811 = alloca i1, i1 0
+  %nop6812 = alloca i1, i1 0
+  %nop6813 = alloca i1, i1 0
+  %nop6814 = alloca i1, i1 0
+  %nop6815 = alloca i1, i1 0
+  %nop6816 = alloca i1, i1 0
+  %nop6817 = alloca i1, i1 0
+  %nop6818 = alloca i1, i1 0
+  %nop6819 = alloca i1, i1 0
+  %nop6820 = alloca i1, i1 0
+  %nop6821 = alloca i1, i1 0
+  %nop6822 = alloca i1, i1 0
+  %nop6823 = alloca i1, i1 0
+  %nop6824 = alloca i1, i1 0
+  %nop6825 = alloca i1, i1 0
+  %nop6826 = alloca i1, i1 0
+  %nop6827 = alloca i1, i1 0
+  %nop6828 = alloca i1, i1 0
+  %nop6829 = alloca i1, i1 0
+  %nop6830 = alloca i1, i1 0
+  %nop6831 = alloca i1, i1 0
+  %nop6832 = alloca i1, i1 0
+  %nop6833 = alloca i1, i1 0
+  %nop6834 = alloca i1, i1 0
+  %nop6835 = alloca i1, i1 0
+  %nop6836 = alloca i1, i1 0
+  %nop6837 = alloca i1, i1 0
+  %nop6838 = alloca i1, i1 0
+  %nop6839 = alloca i1, i1 0
+  %nop6840 = alloca i1, i1 0
+  %nop6841 = alloca i1, i1 0
+  %nop6842 = alloca i1, i1 0
+  %nop6843 = alloca i1, i1 0
+  %nop6844 = alloca i1, i1 0
+  %nop6845 = alloca i1, i1 0
+  %nop6846 = alloca i1, i1 0
+  %nop6847 = alloca i1, i1 0
+  %nop6848 = alloca i1, i1 0
+  %nop6849 = alloca i1, i1 0
+  %nop6850 = alloca i1, i1 0
+  %nop6851 = alloca i1, i1 0
+  %nop6852 = alloca i1, i1 0
+  %nop6853 = alloca i1, i1 0
+  %nop6854 = alloca i1, i1 0
+  %nop6855 = alloca i1, i1 0
+  %nop6856 = alloca i1, i1 0
+  %nop6857 = alloca i1, i1 0
+  %nop6858 = alloca i1, i1 0
+  %nop6859 = alloca i1, i1 0
+  %nop6860 = alloca i1, i1 0
+  %nop6861 = alloca i1, i1 0
+  %nop6862 = alloca i1, i1 0
+  %nop6863 = alloca i1, i1 0
+  %nop6864 = alloca i1, i1 0
+  %nop6865 = alloca i1, i1 0
+  %nop6866 = alloca i1, i1 0
+  %nop6867 = alloca i1, i1 0
+  %nop6868 = alloca i1, i1 0
+  %nop6869 = alloca i1, i1 0
+  %nop6870 = alloca i1, i1 0
+  %nop6871 = alloca i1, i1 0
+  %nop6872 = alloca i1, i1 0
+  %nop6873 = alloca i1, i1 0
+  %nop6874 = alloca i1, i1 0
+  %nop6875 = alloca i1, i1 0
+  %nop6876 = alloca i1, i1 0
+  %nop6877 = alloca i1, i1 0
+  %nop6878 = alloca i1, i1 0
+  %nop6879 = alloca i1, i1 0
+  %nop6880 = alloca i1, i1 0
+  %nop6881 = alloca i1, i1 0
+  %nop6882 = alloca i1, i1 0
+  %nop6883 = alloca i1, i1 0
+  %nop6884 = alloca i1, i1 0
+  %nop6885 = alloca i1, i1 0
+  %nop6886 = alloca i1, i1 0
+  %nop6887 = alloca i1, i1 0
+  %nop6888 = alloca i1, i1 0
+  %nop6889 = alloca i1, i1 0
+  %nop6890 = alloca i1, i1 0
+  %nop6891 = alloca i1, i1 0
+  %nop6892 = alloca i1, i1 0
+  %nop6893 = alloca i1, i1 0
+  %nop6894 = alloca i1, i1 0
+  %nop6895 = alloca i1, i1 0
+  %nop6896 = alloca i1, i1 0
+  %nop6897 = alloca i1, i1 0
+  %nop6898 = alloca i1, i1 0
+  %nop6899 = alloca i1, i1 0
+  %nop6900 = alloca i1, i1 0
+  %nop6901 = alloca i1, i1 0
+  %nop6902 = alloca i1, i1 0
+  %nop6903 = alloca i1, i1 0
+  %nop6904 = alloca i1, i1 0
+  %nop6905 = alloca i1, i1 0
+  %nop6906 = alloca i1, i1 0
+  %nop6907 = alloca i1, i1 0
+  %nop6908 = alloca i1, i1 0
+  %nop6909 = alloca i1, i1 0
+  %nop6910 = alloca i1, i1 0
+  %nop6911 = alloca i1, i1 0
+  %nop6912 = alloca i1, i1 0
+  %nop6913 = alloca i1, i1 0
+  %nop6914 = alloca i1, i1 0
+  %nop6915 = alloca i1, i1 0
+  %nop6916 = alloca i1, i1 0
+  %nop6917 = alloca i1, i1 0
+  %nop6918 = alloca i1, i1 0
+  %nop6919 = alloca i1, i1 0
+  %nop6920 = alloca i1, i1 0
+  %nop6921 = alloca i1, i1 0
+  %nop6922 = alloca i1, i1 0
+  %nop6923 = alloca i1, i1 0
+  %nop6924 = alloca i1, i1 0
+  %nop6925 = alloca i1, i1 0
+  %nop6926 = alloca i1, i1 0
+  %nop6927 = alloca i1, i1 0
+  %nop6928 = alloca i1, i1 0
+  %nop6929 = alloca i1, i1 0
+  %nop6930 = alloca i1, i1 0
+  %nop6931 = alloca i1, i1 0
+  %nop6932 = alloca i1, i1 0
+  %nop6933 = alloca i1, i1 0
+  %nop6934 = alloca i1, i1 0
+  %nop6935 = alloca i1, i1 0
+  %nop6936 = alloca i1, i1 0
+  %nop6937 = alloca i1, i1 0
+  %nop6938 = alloca i1, i1 0
+  %nop6939 = alloca i1, i1 0
+  %nop6940 = alloca i1, i1 0
+  %nop6941 = alloca i1, i1 0
+  %nop6942 = alloca i1, i1 0
+  %nop6943 = alloca i1, i1 0
+  %nop6944 = alloca i1, i1 0
+  %nop6945 = alloca i1, i1 0
+  %nop6946 = alloca i1, i1 0
+  %nop6947 = alloca i1, i1 0
+  %nop6948 = alloca i1, i1 0
+  %nop6949 = alloca i1, i1 0
+  %nop6950 = alloca i1, i1 0
+  %nop6951 = alloca i1, i1 0
+  %nop6952 = alloca i1, i1 0
+  %nop6953 = alloca i1, i1 0
+  %nop6954 = alloca i1, i1 0
+  %nop6955 = alloca i1, i1 0
+  %nop6956 = alloca i1, i1 0
+  %nop6957 = alloca i1, i1 0
+  %nop6958 = alloca i1, i1 0
+  %nop6959 = alloca i1, i1 0
+  %nop6960 = alloca i1, i1 0
+  %nop6961 = alloca i1, i1 0
+  %nop6962 = alloca i1, i1 0
+  %nop6963 = alloca i1, i1 0
+  %nop6964 = alloca i1, i1 0
+  %nop6965 = alloca i1, i1 0
+  %nop6966 = alloca i1, i1 0
+  %nop6967 = alloca i1, i1 0
+  %nop6968 = alloca i1, i1 0
+  %nop6969 = alloca i1, i1 0
+  %nop6970 = alloca i1, i1 0
+  %nop6971 = alloca i1, i1 0
+  %nop6972 = alloca i1, i1 0
+  %nop6973 = alloca i1, i1 0
+  %nop6974 = alloca i1, i1 0
+  %nop6975 = alloca i1, i1 0
+  %nop6976 = alloca i1, i1 0
+  %nop6977 = alloca i1, i1 0
+  %nop6978 = alloca i1, i1 0
+  %nop6979 = alloca i1, i1 0
+  %nop6980 = alloca i1, i1 0
+  %nop6981 = alloca i1, i1 0
+  %nop6982 = alloca i1, i1 0
+  %nop6983 = alloca i1, i1 0
+  %nop6984 = alloca i1, i1 0
+  %nop6985 = alloca i1, i1 0
+  %nop6986 = alloca i1, i1 0
+  %nop6987 = alloca i1, i1 0
+  %nop6988 = alloca i1, i1 0
+  %nop6989 = alloca i1, i1 0
+  %nop6990 = alloca i1, i1 0
+  %nop6991 = alloca i1, i1 0
+  %nop6992 = alloca i1, i1 0
+  %nop6993 = alloca i1, i1 0
+  %nop6994 = alloca i1, i1 0
+  %nop6995 = alloca i1, i1 0
+  %nop6996 = alloca i1, i1 0
+  %nop6997 = alloca i1, i1 0
+  %nop6998 = alloca i1, i1 0
+  %nop6999 = alloca i1, i1 0
+  %nop7000 = alloca i1, i1 0
+  %nop7001 = alloca i1, i1 0
+  %nop7002 = alloca i1, i1 0
+  %nop7003 = alloca i1, i1 0
+  %nop7004 = alloca i1, i1 0
+  %nop7005 = alloca i1, i1 0
+  %nop7006 = alloca i1, i1 0
+  %nop7007 = alloca i1, i1 0
+  %nop7008 = alloca i1, i1 0
+  %nop7009 = alloca i1, i1 0
+  %nop7010 = alloca i1, i1 0
+  %nop7011 = alloca i1, i1 0
+  %nop7012 = alloca i1, i1 0
+  %nop7013 = alloca i1, i1 0
+  %nop7014 = alloca i1, i1 0
+  %nop7015 = alloca i1, i1 0
+  %nop7016 = alloca i1, i1 0
+  %nop7017 = alloca i1, i1 0
+  %nop7018 = alloca i1, i1 0
+  %nop7019 = alloca i1, i1 0
+  %nop7020 = alloca i1, i1 0
+  %nop7021 = alloca i1, i1 0
+  %nop7022 = alloca i1, i1 0
+  %nop7023 = alloca i1, i1 0
+  %nop7024 = alloca i1, i1 0
+  %nop7025 = alloca i1, i1 0
+  %nop7026 = alloca i1, i1 0
+  %nop7027 = alloca i1, i1 0
+  %nop7028 = alloca i1, i1 0
+  %nop7029 = alloca i1, i1 0
+  %nop7030 = alloca i1, i1 0
+  %nop7031 = alloca i1, i1 0
+  %nop7032 = alloca i1, i1 0
+  %nop7033 = alloca i1, i1 0
+  %nop7034 = alloca i1, i1 0
+  %nop7035 = alloca i1, i1 0
+  %nop7036 = alloca i1, i1 0
+  %nop7037 = alloca i1, i1 0
+  %nop7038 = alloca i1, i1 0
+  %nop7039 = alloca i1, i1 0
+  %nop7040 = alloca i1, i1 0
+  %nop7041 = alloca i1, i1 0
+  %nop7042 = alloca i1, i1 0
+  %nop7043 = alloca i1, i1 0
+  %nop7044 = alloca i1, i1 0
+  %nop7045 = alloca i1, i1 0
+  %nop7046 = alloca i1, i1 0
+  %nop7047 = alloca i1, i1 0
+  %nop7048 = alloca i1, i1 0
+  %nop7049 = alloca i1, i1 0
+  %nop7050 = alloca i1, i1 0
+  %nop7051 = alloca i1, i1 0
+  %nop7052 = alloca i1, i1 0
+  %nop7053 = alloca i1, i1 0
+  %nop7054 = alloca i1, i1 0
+  %nop7055 = alloca i1, i1 0
+  %nop7056 = alloca i1, i1 0
+  %nop7057 = alloca i1, i1 0
+  %nop7058 = alloca i1, i1 0
+  %nop7059 = alloca i1, i1 0
+  %nop7060 = alloca i1, i1 0
+  %nop7061 = alloca i1, i1 0
+  %nop7062 = alloca i1, i1 0
+  %nop7063 = alloca i1, i1 0
+  %nop7064 = alloca i1, i1 0
+  %nop7065 = alloca i1, i1 0
+  %nop7066 = alloca i1, i1 0
+  %nop7067 = alloca i1, i1 0
+  %nop7068 = alloca i1, i1 0
+  %nop7069 = alloca i1, i1 0
+  %nop7070 = alloca i1, i1 0
+  %nop7071 = alloca i1, i1 0
+  %nop7072 = alloca i1, i1 0
+  %nop7073 = alloca i1, i1 0
+  %nop7074 = alloca i1, i1 0
+  %nop7075 = alloca i1, i1 0
+  %nop7076 = alloca i1, i1 0
+  %nop7077 = alloca i1, i1 0
+  %nop7078 = alloca i1, i1 0
+  %nop7079 = alloca i1, i1 0
+  %nop7080 = alloca i1, i1 0
+  %nop7081 = alloca i1, i1 0
+  %nop7082 = alloca i1, i1 0
+  %nop7083 = alloca i1, i1 0
+  %nop7084 = alloca i1, i1 0
+  %nop7085 = alloca i1, i1 0
+  %nop7086 = alloca i1, i1 0
+  %nop7087 = alloca i1, i1 0
+  %nop7088 = alloca i1, i1 0
+  %nop7089 = alloca i1, i1 0
+  %nop7090 = alloca i1, i1 0
+  %nop7091 = alloca i1, i1 0
+  %nop7092 = alloca i1, i1 0
+  %nop7093 = alloca i1, i1 0
+  %nop7094 = alloca i1, i1 0
+  %nop7095 = alloca i1, i1 0
+  %nop7096 = alloca i1, i1 0
+  %nop7097 = alloca i1, i1 0
+  %nop7098 = alloca i1, i1 0
+  %nop7099 = alloca i1, i1 0
+  %nop7100 = alloca i1, i1 0
+  %nop7101 = alloca i1, i1 0
+  %nop7102 = alloca i1, i1 0
+  %nop7103 = alloca i1, i1 0
+  %nop7104 = alloca i1, i1 0
+  %nop7105 = alloca i1, i1 0
+  %nop7106 = alloca i1, i1 0
+  %nop7107 = alloca i1, i1 0
+  %nop7108 = alloca i1, i1 0
+  %nop7109 = alloca i1, i1 0
+  %nop7110 = alloca i1, i1 0
+  %nop7111 = alloca i1, i1 0
+  %nop7112 = alloca i1, i1 0
+  %nop7113 = alloca i1, i1 0
+  %nop7114 = alloca i1, i1 0
+  %nop7115 = alloca i1, i1 0
+  %nop7116 = alloca i1, i1 0
+  %nop7117 = alloca i1, i1 0
+  %nop7118 = alloca i1, i1 0
+  %nop7119 = alloca i1, i1 0
+  %nop7120 = alloca i1, i1 0
+  %nop7121 = alloca i1, i1 0
+  %nop7122 = alloca i1, i1 0
+  %nop7123 = alloca i1, i1 0
+  %nop7124 = alloca i1, i1 0
+  %nop7125 = alloca i1, i1 0
+  %nop7126 = alloca i1, i1 0
+  %nop7127 = alloca i1, i1 0
+  %nop7128 = alloca i1, i1 0
+  %nop7129 = alloca i1, i1 0
+  %nop7130 = alloca i1, i1 0
+  %nop7131 = alloca i1, i1 0
+  %nop7132 = alloca i1, i1 0
+  %nop7133 = alloca i1, i1 0
+  %nop7134 = alloca i1, i1 0
+  %nop7135 = alloca i1, i1 0
+  %nop7136 = alloca i1, i1 0
+  %nop7137 = alloca i1, i1 0
+  %nop7138 = alloca i1, i1 0
+  %nop7139 = alloca i1, i1 0
+  %nop7140 = alloca i1, i1 0
+  %nop7141 = alloca i1, i1 0
+  %nop7142 = alloca i1, i1 0
+  %nop7143 = alloca i1, i1 0
+  %nop7144 = alloca i1, i1 0
+  %nop7145 = alloca i1, i1 0
+  %nop7146 = alloca i1, i1 0
+  %nop7147 = alloca i1, i1 0
+  %nop7148 = alloca i1, i1 0
+  %nop7149 = alloca i1, i1 0
+  %nop7150 = alloca i1, i1 0
+  %nop7151 = alloca i1, i1 0
+  %nop7152 = alloca i1, i1 0
+  %nop7153 = alloca i1, i1 0
+  %nop7154 = alloca i1, i1 0
+  %nop7155 = alloca i1, i1 0
+  %nop7156 = alloca i1, i1 0
+  %nop7157 = alloca i1, i1 0
+  %nop7158 = alloca i1, i1 0
+  %nop7159 = alloca i1, i1 0
+  %nop7160 = alloca i1, i1 0
+  %nop7161 = alloca i1, i1 0
+  %nop7162 = alloca i1, i1 0
+  %nop7163 = alloca i1, i1 0
+  %nop7164 = alloca i1, i1 0
+  %nop7165 = alloca i1, i1 0
+  %nop7166 = alloca i1, i1 0
+  %nop7167 = alloca i1, i1 0
+  %nop7168 = alloca i1, i1 0
+  %nop7169 = alloca i1, i1 0
+  %nop7170 = alloca i1, i1 0
+  %nop7171 = alloca i1, i1 0
+  %nop7172 = alloca i1, i1 0
+  %nop7173 = alloca i1, i1 0
+  %nop7174 = alloca i1, i1 0
+  %nop7175 = alloca i1, i1 0
+  %nop7176 = alloca i1, i1 0
+  %nop7177 = alloca i1, i1 0
+  %nop7178 = alloca i1, i1 0
+  %nop7179 = alloca i1, i1 0
+  %nop7180 = alloca i1, i1 0
+  %nop7181 = alloca i1, i1 0
+  %nop7182 = alloca i1, i1 0
+  %nop7183 = alloca i1, i1 0
+  %nop7184 = alloca i1, i1 0
+  %nop7185 = alloca i1, i1 0
+  %nop7186 = alloca i1, i1 0
+  %nop7187 = alloca i1, i1 0
+  %nop7188 = alloca i1, i1 0
+  %nop7189 = alloca i1, i1 0
+  %nop7190 = alloca i1, i1 0
+  %nop7191 = alloca i1, i1 0
+  %nop7192 = alloca i1, i1 0
+  %nop7193 = alloca i1, i1 0
+  %nop7194 = alloca i1, i1 0
+  %nop7195 = alloca i1, i1 0
+  %nop7196 = alloca i1, i1 0
+  %nop7197 = alloca i1, i1 0
+  %nop7198 = alloca i1, i1 0
+  %nop7199 = alloca i1, i1 0
+  %nop7200 = alloca i1, i1 0
+  %nop7201 = alloca i1, i1 0
+  %nop7202 = alloca i1, i1 0
+  %nop7203 = alloca i1, i1 0
+  %nop7204 = alloca i1, i1 0
+  %nop7205 = alloca i1, i1 0
+  %nop7206 = alloca i1, i1 0
+  %nop7207 = alloca i1, i1 0
+  %nop7208 = alloca i1, i1 0
+  %nop7209 = alloca i1, i1 0
+  %nop7210 = alloca i1, i1 0
+  %nop7211 = alloca i1, i1 0
+  %nop7212 = alloca i1, i1 0
+  %nop7213 = alloca i1, i1 0
+  %nop7214 = alloca i1, i1 0
+  %nop7215 = alloca i1, i1 0
+  %nop7216 = alloca i1, i1 0
+  %nop7217 = alloca i1, i1 0
+  %nop7218 = alloca i1, i1 0
+  %nop7219 = alloca i1, i1 0
+  %nop7220 = alloca i1, i1 0
+  %nop7221 = alloca i1, i1 0
+  %nop7222 = alloca i1, i1 0
+  %nop7223 = alloca i1, i1 0
+  %nop7224 = alloca i1, i1 0
+  %nop7225 = alloca i1, i1 0
+  %nop7226 = alloca i1, i1 0
+  %nop7227 = alloca i1, i1 0
+  %nop7228 = alloca i1, i1 0
+  %nop7229 = alloca i1, i1 0
+  %nop7230 = alloca i1, i1 0
+  %nop7231 = alloca i1, i1 0
+  %nop7232 = alloca i1, i1 0
+  %nop7233 = alloca i1, i1 0
+  %nop7234 = alloca i1, i1 0
+  %nop7235 = alloca i1, i1 0
+  %nop7236 = alloca i1, i1 0
+  %nop7237 = alloca i1, i1 0
+  %nop7238 = alloca i1, i1 0
+  %nop7239 = alloca i1, i1 0
+  %nop7240 = alloca i1, i1 0
+  %nop7241 = alloca i1, i1 0
+  %nop7242 = alloca i1, i1 0
+  %nop7243 = alloca i1, i1 0
+  %nop7244 = alloca i1, i1 0
+  %nop7245 = alloca i1, i1 0
+  %nop7246 = alloca i1, i1 0
+  %nop7247 = alloca i1, i1 0
+  %nop7248 = alloca i1, i1 0
+  %nop7249 = alloca i1, i1 0
+  %nop7250 = alloca i1, i1 0
+  %nop7251 = alloca i1, i1 0
+  %nop7252 = alloca i1, i1 0
+  %nop7253 = alloca i1, i1 0
+  %nop7254 = alloca i1, i1 0
+  %nop7255 = alloca i1, i1 0
+  %nop7256 = alloca i1, i1 0
+  %nop7257 = alloca i1, i1 0
+  %nop7258 = alloca i1, i1 0
+  %nop7259 = alloca i1, i1 0
+  %nop7260 = alloca i1, i1 0
+  %nop7261 = alloca i1, i1 0
+  %nop7262 = alloca i1, i1 0
+  %nop7263 = alloca i1, i1 0
+  %nop7264 = alloca i1, i1 0
+  %nop7265 = alloca i1, i1 0
+  %nop7266 = alloca i1, i1 0
+  %nop7267 = alloca i1, i1 0
+  %nop7268 = alloca i1, i1 0
+  %nop7269 = alloca i1, i1 0
+  %nop7270 = alloca i1, i1 0
+  %nop7271 = alloca i1, i1 0
+  %nop7272 = alloca i1, i1 0
+  %nop7273 = alloca i1, i1 0
+  %nop7274 = alloca i1, i1 0
+  %nop7275 = alloca i1, i1 0
+  %nop7276 = alloca i1, i1 0
+  %nop7277 = alloca i1, i1 0
+  %nop7278 = alloca i1, i1 0
+  %nop7279 = alloca i1, i1 0
+  %nop7280 = alloca i1, i1 0
+  %nop7281 = alloca i1, i1 0
+  %nop7282 = alloca i1, i1 0
+  %nop7283 = alloca i1, i1 0
+  %nop7284 = alloca i1, i1 0
+  %nop7285 = alloca i1, i1 0
+  %nop7286 = alloca i1, i1 0
+  %nop7287 = alloca i1, i1 0
+  %nop7288 = alloca i1, i1 0
+  %nop7289 = alloca i1, i1 0
+  %nop7290 = alloca i1, i1 0
+  %nop7291 = alloca i1, i1 0
+  %nop7292 = alloca i1, i1 0
+  %nop7293 = alloca i1, i1 0
+  %nop7294 = alloca i1, i1 0
+  %nop7295 = alloca i1, i1 0
+  %nop7296 = alloca i1, i1 0
+  %nop7297 = alloca i1, i1 0
+  %nop7298 = alloca i1, i1 0
+  %nop7299 = alloca i1, i1 0
+  %nop7300 = alloca i1, i1 0
+  %nop7301 = alloca i1, i1 0
+  %nop7302 = alloca i1, i1 0
+  %nop7303 = alloca i1, i1 0
+  %nop7304 = alloca i1, i1 0
+  %nop7305 = alloca i1, i1 0
+  %nop7306 = alloca i1, i1 0
+  %nop7307 = alloca i1, i1 0
+  %nop7308 = alloca i1, i1 0
+  %nop7309 = alloca i1, i1 0
+  %nop7310 = alloca i1, i1 0
+  %nop7311 = alloca i1, i1 0
+  %nop7312 = alloca i1, i1 0
+  %nop7313 = alloca i1, i1 0
+  %nop7314 = alloca i1, i1 0
+  %nop7315 = alloca i1, i1 0
+  %nop7316 = alloca i1, i1 0
+  %nop7317 = alloca i1, i1 0
+  %nop7318 = alloca i1, i1 0
+  %nop7319 = alloca i1, i1 0
+  %nop7320 = alloca i1, i1 0
+  %nop7321 = alloca i1, i1 0
+  %nop7322 = alloca i1, i1 0
+  %nop7323 = alloca i1, i1 0
+  %nop7324 = alloca i1, i1 0
+  %nop7325 = alloca i1, i1 0
+  %nop7326 = alloca i1, i1 0
+  %nop7327 = alloca i1, i1 0
+  %nop7328 = alloca i1, i1 0
+  %nop7329 = alloca i1, i1 0
+  %nop7330 = alloca i1, i1 0
+  %nop7331 = alloca i1, i1 0
+  %nop7332 = alloca i1, i1 0
+  %nop7333 = alloca i1, i1 0
+  %nop7334 = alloca i1, i1 0
+  %nop7335 = alloca i1, i1 0
+  %nop7336 = alloca i1, i1 0
+  %nop7337 = alloca i1, i1 0
+  %nop7338 = alloca i1, i1 0
+  %nop7339 = alloca i1, i1 0
+  %nop7340 = alloca i1, i1 0
+  %nop7341 = alloca i1, i1 0
+  %nop7342 = alloca i1, i1 0
+  %nop7343 = alloca i1, i1 0
+  %nop7344 = alloca i1, i1 0
+  %nop7345 = alloca i1, i1 0
+  %nop7346 = alloca i1, i1 0
+  %nop7347 = alloca i1, i1 0
+  %nop7348 = alloca i1, i1 0
+  %nop7349 = alloca i1, i1 0
+  %nop7350 = alloca i1, i1 0
+  %nop7351 = alloca i1, i1 0
+  %nop7352 = alloca i1, i1 0
+  %nop7353 = alloca i1, i1 0
+  %nop7354 = alloca i1, i1 0
+  %nop7355 = alloca i1, i1 0
+  %nop7356 = alloca i1, i1 0
+  %nop7357 = alloca i1, i1 0
+  %nop7358 = alloca i1, i1 0
+  %nop7359 = alloca i1, i1 0
+  %nop7360 = alloca i1, i1 0
+  %nop7361 = alloca i1, i1 0
+  %nop7362 = alloca i1, i1 0
+  %nop7363 = alloca i1, i1 0
+  %nop7364 = alloca i1, i1 0
+  %nop7365 = alloca i1, i1 0
+  %nop7366 = alloca i1, i1 0
+  %nop7367 = alloca i1, i1 0
+  %nop7368 = alloca i1, i1 0
+  %nop7369 = alloca i1, i1 0
+  %nop7370 = alloca i1, i1 0
+  %nop7371 = alloca i1, i1 0
+  %nop7372 = alloca i1, i1 0
+  %nop7373 = alloca i1, i1 0
+  %nop7374 = alloca i1, i1 0
+  %nop7375 = alloca i1, i1 0
+  %nop7376 = alloca i1, i1 0
+  %nop7377 = alloca i1, i1 0
+  %nop7378 = alloca i1, i1 0
+  %nop7379 = alloca i1, i1 0
+  %nop7380 = alloca i1, i1 0
+  %nop7381 = alloca i1, i1 0
+  %nop7382 = alloca i1, i1 0
+  %nop7383 = alloca i1, i1 0
+  %nop7384 = alloca i1, i1 0
+  %nop7385 = alloca i1, i1 0
+  %nop7386 = alloca i1, i1 0
+  %nop7387 = alloca i1, i1 0
+  %nop7388 = alloca i1, i1 0
+  %nop7389 = alloca i1, i1 0
+  %nop7390 = alloca i1, i1 0
+  %nop7391 = alloca i1, i1 0
+  %nop7392 = alloca i1, i1 0
+  %nop7393 = alloca i1, i1 0
+  %nop7394 = alloca i1, i1 0
+  %nop7395 = alloca i1, i1 0
+  %nop7396 = alloca i1, i1 0
+  %nop7397 = alloca i1, i1 0
+  %nop7398 = alloca i1, i1 0
+  %nop7399 = alloca i1, i1 0
+  %nop7400 = alloca i1, i1 0
+  %nop7401 = alloca i1, i1 0
+  %nop7402 = alloca i1, i1 0
+  %nop7403 = alloca i1, i1 0
+  %nop7404 = alloca i1, i1 0
+  %nop7405 = alloca i1, i1 0
+  %nop7406 = alloca i1, i1 0
+  %nop7407 = alloca i1, i1 0
+  %nop7408 = alloca i1, i1 0
+  %nop7409 = alloca i1, i1 0
+  %nop7410 = alloca i1, i1 0
+  %nop7411 = alloca i1, i1 0
+  %nop7412 = alloca i1, i1 0
+  %nop7413 = alloca i1, i1 0
+  %nop7414 = alloca i1, i1 0
+  %nop7415 = alloca i1, i1 0
+  %nop7416 = alloca i1, i1 0
+  %nop7417 = alloca i1, i1 0
+  %nop7418 = alloca i1, i1 0
+  %nop7419 = alloca i1, i1 0
+  %nop7420 = alloca i1, i1 0
+  %nop7421 = alloca i1, i1 0
+  %nop7422 = alloca i1, i1 0
+  %nop7423 = alloca i1, i1 0
+  %nop7424 = alloca i1, i1 0
+  %nop7425 = alloca i1, i1 0
+  %nop7426 = alloca i1, i1 0
+  %nop7427 = alloca i1, i1 0
+  %nop7428 = alloca i1, i1 0
+  %nop7429 = alloca i1, i1 0
+  %nop7430 = alloca i1, i1 0
+  %nop7431 = alloca i1, i1 0
+  %nop7432 = alloca i1, i1 0
+  %nop7433 = alloca i1, i1 0
+  %nop7434 = alloca i1, i1 0
+  %nop7435 = alloca i1, i1 0
+  %nop7436 = alloca i1, i1 0
+  %nop7437 = alloca i1, i1 0
+  %nop7438 = alloca i1, i1 0
+  %nop7439 = alloca i1, i1 0
+  %nop7440 = alloca i1, i1 0
+  %nop7441 = alloca i1, i1 0
+  %nop7442 = alloca i1, i1 0
+  %nop7443 = alloca i1, i1 0
+  %nop7444 = alloca i1, i1 0
+  %nop7445 = alloca i1, i1 0
+  %nop7446 = alloca i1, i1 0
+  %nop7447 = alloca i1, i1 0
+  %nop7448 = alloca i1, i1 0
+  %nop7449 = alloca i1, i1 0
+  %nop7450 = alloca i1, i1 0
+  %nop7451 = alloca i1, i1 0
+  %nop7452 = alloca i1, i1 0
+  %nop7453 = alloca i1, i1 0
+  %nop7454 = alloca i1, i1 0
+  %nop7455 = alloca i1, i1 0
+  %nop7456 = alloca i1, i1 0
+  %nop7457 = alloca i1, i1 0
+  %nop7458 = alloca i1, i1 0
+  %nop7459 = alloca i1, i1 0
+  %nop7460 = alloca i1, i1 0
+  %nop7461 = alloca i1, i1 0
+  %nop7462 = alloca i1, i1 0
+  %nop7463 = alloca i1, i1 0
+  %nop7464 = alloca i1, i1 0
+  %nop7465 = alloca i1, i1 0
+  %nop7466 = alloca i1, i1 0
+  %nop7467 = alloca i1, i1 0
+  %nop7468 = alloca i1, i1 0
+  %nop7469 = alloca i1, i1 0
+  %nop7470 = alloca i1, i1 0
+  %nop7471 = alloca i1, i1 0
+  %nop7472 = alloca i1, i1 0
+  %nop7473 = alloca i1, i1 0
+  %nop7474 = alloca i1, i1 0
+  %nop7475 = alloca i1, i1 0
+  %nop7476 = alloca i1, i1 0
+  %nop7477 = alloca i1, i1 0
+  %nop7478 = alloca i1, i1 0
+  %nop7479 = alloca i1, i1 0
+  %nop7480 = alloca i1, i1 0
+  %nop7481 = alloca i1, i1 0
+  %nop7482 = alloca i1, i1 0
+  %nop7483 = alloca i1, i1 0
+  %nop7484 = alloca i1, i1 0
+  %nop7485 = alloca i1, i1 0
+  %nop7486 = alloca i1, i1 0
+  %nop7487 = alloca i1, i1 0
+  %nop7488 = alloca i1, i1 0
+  %nop7489 = alloca i1, i1 0
+  %nop7490 = alloca i1, i1 0
+  %nop7491 = alloca i1, i1 0
+  %nop7492 = alloca i1, i1 0
+  %nop7493 = alloca i1, i1 0
+  %nop7494 = alloca i1, i1 0
+  %nop7495 = alloca i1, i1 0
+  %nop7496 = alloca i1, i1 0
+  %nop7497 = alloca i1, i1 0
+  %nop7498 = alloca i1, i1 0
+  %nop7499 = alloca i1, i1 0
+  %nop7500 = alloca i1, i1 0
+  %nop7501 = alloca i1, i1 0
+  %nop7502 = alloca i1, i1 0
+  %nop7503 = alloca i1, i1 0
+  %nop7504 = alloca i1, i1 0
+  %nop7505 = alloca i1, i1 0
+  %nop7506 = alloca i1, i1 0
+  %nop7507 = alloca i1, i1 0
+  %nop7508 = alloca i1, i1 0
+  %nop7509 = alloca i1, i1 0
+  %nop7510 = alloca i1, i1 0
+  %nop7511 = alloca i1, i1 0
+  %nop7512 = alloca i1, i1 0
+  %nop7513 = alloca i1, i1 0
+  %nop7514 = alloca i1, i1 0
+  %nop7515 = alloca i1, i1 0
+  %nop7516 = alloca i1, i1 0
+  %nop7517 = alloca i1, i1 0
+  %nop7518 = alloca i1, i1 0
+  %nop7519 = alloca i1, i1 0
+  %nop7520 = alloca i1, i1 0
+  %nop7521 = alloca i1, i1 0
+  %nop7522 = alloca i1, i1 0
+  %nop7523 = alloca i1, i1 0
+  %nop7524 = alloca i1, i1 0
+  %nop7525 = alloca i1, i1 0
+  %nop7526 = alloca i1, i1 0
+  %nop7527 = alloca i1, i1 0
+  %nop7528 = alloca i1, i1 0
+  %nop7529 = alloca i1, i1 0
+  %nop7530 = alloca i1, i1 0
+  %nop7531 = alloca i1, i1 0
+  %nop7532 = alloca i1, i1 0
+  %nop7533 = alloca i1, i1 0
+  %nop7534 = alloca i1, i1 0
+  %nop7535 = alloca i1, i1 0
+  %nop7536 = alloca i1, i1 0
+  %nop7537 = alloca i1, i1 0
+  %nop7538 = alloca i1, i1 0
+  %nop7539 = alloca i1, i1 0
+  %nop7540 = alloca i1, i1 0
+  %nop7541 = alloca i1, i1 0
+  %nop7542 = alloca i1, i1 0
+  %nop7543 = alloca i1, i1 0
+  %nop7544 = alloca i1, i1 0
+  %nop7545 = alloca i1, i1 0
+  %nop7546 = alloca i1, i1 0
+  %nop7547 = alloca i1, i1 0
+  %nop7548 = alloca i1, i1 0
+  %nop7549 = alloca i1, i1 0
+  %nop7550 = alloca i1, i1 0
+  %nop7551 = alloca i1, i1 0
+  %nop7552 = alloca i1, i1 0
+  %nop7553 = alloca i1, i1 0
+  %nop7554 = alloca i1, i1 0
+  %nop7555 = alloca i1, i1 0
+  %nop7556 = alloca i1, i1 0
+  %nop7557 = alloca i1, i1 0
+  %nop7558 = alloca i1, i1 0
+  %nop7559 = alloca i1, i1 0
+  %nop7560 = alloca i1, i1 0
+  %nop7561 = alloca i1, i1 0
+  %nop7562 = alloca i1, i1 0
+  %nop7563 = alloca i1, i1 0
+  %nop7564 = alloca i1, i1 0
+  %nop7565 = alloca i1, i1 0
+  %nop7566 = alloca i1, i1 0
+  %nop7567 = alloca i1, i1 0
+  %nop7568 = alloca i1, i1 0
+  %nop7569 = alloca i1, i1 0
+  %nop7570 = alloca i1, i1 0
+  %nop7571 = alloca i1, i1 0
+  %nop7572 = alloca i1, i1 0
+  %nop7573 = alloca i1, i1 0
+  %nop7574 = alloca i1, i1 0
+  %nop7575 = alloca i1, i1 0
+  %nop7576 = alloca i1, i1 0
+  %nop7577 = alloca i1, i1 0
+  %nop7578 = alloca i1, i1 0
+  %nop7579 = alloca i1, i1 0
+  %nop7580 = alloca i1, i1 0
+  %nop7581 = alloca i1, i1 0
+  %nop7582 = alloca i1, i1 0
+  %nop7583 = alloca i1, i1 0
+  %nop7584 = alloca i1, i1 0
+  %nop7585 = alloca i1, i1 0
+  %nop7586 = alloca i1, i1 0
+  %nop7587 = alloca i1, i1 0
+  %nop7588 = alloca i1, i1 0
+  %nop7589 = alloca i1, i1 0
+  %nop7590 = alloca i1, i1 0
+  %nop7591 = alloca i1, i1 0
+  %nop7592 = alloca i1, i1 0
+  %nop7593 = alloca i1, i1 0
+  %nop7594 = alloca i1, i1 0
+  %nop7595 = alloca i1, i1 0
+  %nop7596 = alloca i1, i1 0
+  %nop7597 = alloca i1, i1 0
+  %nop7598 = alloca i1, i1 0
+  %nop7599 = alloca i1, i1 0
+  %nop7600 = alloca i1, i1 0
+  %nop7601 = alloca i1, i1 0
+  %nop7602 = alloca i1, i1 0
+  %nop7603 = alloca i1, i1 0
+  %nop7604 = alloca i1, i1 0
+  %nop7605 = alloca i1, i1 0
+  %nop7606 = alloca i1, i1 0
+  %nop7607 = alloca i1, i1 0
+  %nop7608 = alloca i1, i1 0
+  %nop7609 = alloca i1, i1 0
+  %nop7610 = alloca i1, i1 0
+  %nop7611 = alloca i1, i1 0
+  %nop7612 = alloca i1, i1 0
+  %nop7613 = alloca i1, i1 0
+  %nop7614 = alloca i1, i1 0
+  %nop7615 = alloca i1, i1 0
+  %nop7616 = alloca i1, i1 0
+  %nop7617 = alloca i1, i1 0
+  %nop7618 = alloca i1, i1 0
+  %nop7619 = alloca i1, i1 0
+  %nop7620 = alloca i1, i1 0
+  %nop7621 = alloca i1, i1 0
+  %nop7622 = alloca i1, i1 0
+  %nop7623 = alloca i1, i1 0
+  %nop7624 = alloca i1, i1 0
+  %nop7625 = alloca i1, i1 0
+  %nop7626 = alloca i1, i1 0
+  %nop7627 = alloca i1, i1 0
+  %nop7628 = alloca i1, i1 0
+  %nop7629 = alloca i1, i1 0
+  %nop7630 = alloca i1, i1 0
+  %nop7631 = alloca i1, i1 0
+  %nop7632 = alloca i1, i1 0
+  %nop7633 = alloca i1, i1 0
+  %nop7634 = alloca i1, i1 0
+  %nop7635 = alloca i1, i1 0
+  %nop7636 = alloca i1, i1 0
+  %nop7637 = alloca i1, i1 0
+  %nop7638 = alloca i1, i1 0
+  %nop7639 = alloca i1, i1 0
+  %nop7640 = alloca i1, i1 0
+  %nop7641 = alloca i1, i1 0
+  %nop7642 = alloca i1, i1 0
+  %nop7643 = alloca i1, i1 0
+  %nop7644 = alloca i1, i1 0
+  %nop7645 = alloca i1, i1 0
+  %nop7646 = alloca i1, i1 0
+  %nop7647 = alloca i1, i1 0
+  %nop7648 = alloca i1, i1 0
+  %nop7649 = alloca i1, i1 0
+  %nop7650 = alloca i1, i1 0
+  %nop7651 = alloca i1, i1 0
+  %nop7652 = alloca i1, i1 0
+  %nop7653 = alloca i1, i1 0
+  %nop7654 = alloca i1, i1 0
+  %nop7655 = alloca i1, i1 0
+  %nop7656 = alloca i1, i1 0
+  %nop7657 = alloca i1, i1 0
+  %nop7658 = alloca i1, i1 0
+  %nop7659 = alloca i1, i1 0
+  %nop7660 = alloca i1, i1 0
+  %nop7661 = alloca i1, i1 0
+  %nop7662 = alloca i1, i1 0
+  %nop7663 = alloca i1, i1 0
+  %nop7664 = alloca i1, i1 0
+  %nop7665 = alloca i1, i1 0
+  %nop7666 = alloca i1, i1 0
+  %nop7667 = alloca i1, i1 0
+  %nop7668 = alloca i1, i1 0
+  %nop7669 = alloca i1, i1 0
+  %nop7670 = alloca i1, i1 0
+  %nop7671 = alloca i1, i1 0
+  %nop7672 = alloca i1, i1 0
+  %nop7673 = alloca i1, i1 0
+  %nop7674 = alloca i1, i1 0
+  %nop7675 = alloca i1, i1 0
+  %nop7676 = alloca i1, i1 0
+  %nop7677 = alloca i1, i1 0
+  %nop7678 = alloca i1, i1 0
+  %nop7679 = alloca i1, i1 0
+  %nop7680 = alloca i1, i1 0
+  %nop7681 = alloca i1, i1 0
+  %nop7682 = alloca i1, i1 0
+  %nop7683 = alloca i1, i1 0
+  %nop7684 = alloca i1, i1 0
+  %nop7685 = alloca i1, i1 0
+  %nop7686 = alloca i1, i1 0
+  %nop7687 = alloca i1, i1 0
+  %nop7688 = alloca i1, i1 0
+  %nop7689 = alloca i1, i1 0
+  %nop7690 = alloca i1, i1 0
+  %nop7691 = alloca i1, i1 0
+  %nop7692 = alloca i1, i1 0
+  %nop7693 = alloca i1, i1 0
+  %nop7694 = alloca i1, i1 0
+  %nop7695 = alloca i1, i1 0
+  %nop7696 = alloca i1, i1 0
+  %nop7697 = alloca i1, i1 0
+  %nop7698 = alloca i1, i1 0
+  %nop7699 = alloca i1, i1 0
+  %nop7700 = alloca i1, i1 0
+  %nop7701 = alloca i1, i1 0
+  %nop7702 = alloca i1, i1 0
+  %nop7703 = alloca i1, i1 0
+  %nop7704 = alloca i1, i1 0
+  %nop7705 = alloca i1, i1 0
+  %nop7706 = alloca i1, i1 0
+  %nop7707 = alloca i1, i1 0
+  %nop7708 = alloca i1, i1 0
+  %nop7709 = alloca i1, i1 0
+  %nop7710 = alloca i1, i1 0
+  %nop7711 = alloca i1, i1 0
+  %nop7712 = alloca i1, i1 0
+  %nop7713 = alloca i1, i1 0
+  %nop7714 = alloca i1, i1 0
+  %nop7715 = alloca i1, i1 0
+  %nop7716 = alloca i1, i1 0
+  %nop7717 = alloca i1, i1 0
+  %nop7718 = alloca i1, i1 0
+  %nop7719 = alloca i1, i1 0
+  %nop7720 = alloca i1, i1 0
+  %nop7721 = alloca i1, i1 0
+  %nop7722 = alloca i1, i1 0
+  %nop7723 = alloca i1, i1 0
+  %nop7724 = alloca i1, i1 0
+  %nop7725 = alloca i1, i1 0
+  %nop7726 = alloca i1, i1 0
+  %nop7727 = alloca i1, i1 0
+  %nop7728 = alloca i1, i1 0
+  %nop7729 = alloca i1, i1 0
+  %nop7730 = alloca i1, i1 0
+  %nop7731 = alloca i1, i1 0
+  %nop7732 = alloca i1, i1 0
+  %nop7733 = alloca i1, i1 0
+  %nop7734 = alloca i1, i1 0
+  %nop7735 = alloca i1, i1 0
+  %nop7736 = alloca i1, i1 0
+  %nop7737 = alloca i1, i1 0
+  %nop7738 = alloca i1, i1 0
+  %nop7739 = alloca i1, i1 0
+  %nop7740 = alloca i1, i1 0
+  %nop7741 = alloca i1, i1 0
+  %nop7742 = alloca i1, i1 0
+  %nop7743 = alloca i1, i1 0
+  %nop7744 = alloca i1, i1 0
+  %nop7745 = alloca i1, i1 0
+  %nop7746 = alloca i1, i1 0
+  %nop7747 = alloca i1, i1 0
+  %nop7748 = alloca i1, i1 0
+  %nop7749 = alloca i1, i1 0
+  %nop7750 = alloca i1, i1 0
+  %nop7751 = alloca i1, i1 0
+  %nop7752 = alloca i1, i1 0
+  %nop7753 = alloca i1, i1 0
+  %nop7754 = alloca i1, i1 0
+  %nop7755 = alloca i1, i1 0
+  %nop7756 = alloca i1, i1 0
+  %nop7757 = alloca i1, i1 0
+  %nop7758 = alloca i1, i1 0
+  %nop7759 = alloca i1, i1 0
+  %nop7760 = alloca i1, i1 0
+  %nop7761 = alloca i1, i1 0
+  %nop7762 = alloca i1, i1 0
+  %nop7763 = alloca i1, i1 0
+  %nop7764 = alloca i1, i1 0
+  %nop7765 = alloca i1, i1 0
+  %nop7766 = alloca i1, i1 0
+  %nop7767 = alloca i1, i1 0
+  %nop7768 = alloca i1, i1 0
+  %nop7769 = alloca i1, i1 0
+  %nop7770 = alloca i1, i1 0
+  %nop7771 = alloca i1, i1 0
+  %nop7772 = alloca i1, i1 0
+  %nop7773 = alloca i1, i1 0
+  %nop7774 = alloca i1, i1 0
+  %nop7775 = alloca i1, i1 0
+  %nop7776 = alloca i1, i1 0
+  %nop7777 = alloca i1, i1 0
+  %nop7778 = alloca i1, i1 0
+  %nop7779 = alloca i1, i1 0
+  %nop7780 = alloca i1, i1 0
+  %nop7781 = alloca i1, i1 0
+  %nop7782 = alloca i1, i1 0
+  %nop7783 = alloca i1, i1 0
+  %nop7784 = alloca i1, i1 0
+  %nop7785 = alloca i1, i1 0
+  %nop7786 = alloca i1, i1 0
+  %nop7787 = alloca i1, i1 0
+  %nop7788 = alloca i1, i1 0
+  %nop7789 = alloca i1, i1 0
+  %nop7790 = alloca i1, i1 0
+  %nop7791 = alloca i1, i1 0
+  %nop7792 = alloca i1, i1 0
+  %nop7793 = alloca i1, i1 0
+  %nop7794 = alloca i1, i1 0
+  %nop7795 = alloca i1, i1 0
+  %nop7796 = alloca i1, i1 0
+  %nop7797 = alloca i1, i1 0
+  %nop7798 = alloca i1, i1 0
+  %nop7799 = alloca i1, i1 0
+  %nop7800 = alloca i1, i1 0
+  %nop7801 = alloca i1, i1 0
+  %nop7802 = alloca i1, i1 0
+  %nop7803 = alloca i1, i1 0
+  %nop7804 = alloca i1, i1 0
+  %nop7805 = alloca i1, i1 0
+  %nop7806 = alloca i1, i1 0
+  %nop7807 = alloca i1, i1 0
+  %nop7808 = alloca i1, i1 0
+  %nop7809 = alloca i1, i1 0
+  %nop7810 = alloca i1, i1 0
+  %nop7811 = alloca i1, i1 0
+  %nop7812 = alloca i1, i1 0
+  %nop7813 = alloca i1, i1 0
+  %nop7814 = alloca i1, i1 0
+  %nop7815 = alloca i1, i1 0
+  %nop7816 = alloca i1, i1 0
+  %nop7817 = alloca i1, i1 0
+  %nop7818 = alloca i1, i1 0
+  %nop7819 = alloca i1, i1 0
+  %nop7820 = alloca i1, i1 0
+  %nop7821 = alloca i1, i1 0
+  %nop7822 = alloca i1, i1 0
+  %nop7823 = alloca i1, i1 0
+  %nop7824 = alloca i1, i1 0
+  %nop7825 = alloca i1, i1 0
+  %nop7826 = alloca i1, i1 0
+  %nop7827 = alloca i1, i1 0
+  %nop7828 = alloca i1, i1 0
+  %nop7829 = alloca i1, i1 0
+  %nop7830 = alloca i1, i1 0
+  %nop7831 = alloca i1, i1 0
+  %nop7832 = alloca i1, i1 0
+  %nop7833 = alloca i1, i1 0
+  %nop7834 = alloca i1, i1 0
+  %nop7835 = alloca i1, i1 0
+  %nop7836 = alloca i1, i1 0
+  %nop7837 = alloca i1, i1 0
+  %nop7838 = alloca i1, i1 0
+  %nop7839 = alloca i1, i1 0
+  %nop7840 = alloca i1, i1 0
+  %nop7841 = alloca i1, i1 0
+  %nop7842 = alloca i1, i1 0
+  %nop7843 = alloca i1, i1 0
+  %nop7844 = alloca i1, i1 0
+  %nop7845 = alloca i1, i1 0
+  %nop7846 = alloca i1, i1 0
+  %nop7847 = alloca i1, i1 0
+  %nop7848 = alloca i1, i1 0
+  %nop7849 = alloca i1, i1 0
+  %nop7850 = alloca i1, i1 0
+  %nop7851 = alloca i1, i1 0
+  %nop7852 = alloca i1, i1 0
+  %nop7853 = alloca i1, i1 0
+  %nop7854 = alloca i1, i1 0
+  %nop7855 = alloca i1, i1 0
+  %nop7856 = alloca i1, i1 0
+  %nop7857 = alloca i1, i1 0
+  %nop7858 = alloca i1, i1 0
+  %nop7859 = alloca i1, i1 0
+  %nop7860 = alloca i1, i1 0
+  %nop7861 = alloca i1, i1 0
+  %nop7862 = alloca i1, i1 0
+  %nop7863 = alloca i1, i1 0
+  %nop7864 = alloca i1, i1 0
+  %nop7865 = alloca i1, i1 0
+  %nop7866 = alloca i1, i1 0
+  %nop7867 = alloca i1, i1 0
+  %nop7868 = alloca i1, i1 0
+  %nop7869 = alloca i1, i1 0
+  %nop7870 = alloca i1, i1 0
+  %nop7871 = alloca i1, i1 0
+  %nop7872 = alloca i1, i1 0
+  %nop7873 = alloca i1, i1 0
+  %nop7874 = alloca i1, i1 0
+  %nop7875 = alloca i1, i1 0
+  %nop7876 = alloca i1, i1 0
+  %nop7877 = alloca i1, i1 0
+  %nop7878 = alloca i1, i1 0
+  %nop7879 = alloca i1, i1 0
+  %nop7880 = alloca i1, i1 0
+  %nop7881 = alloca i1, i1 0
+  %nop7882 = alloca i1, i1 0
+  %nop7883 = alloca i1, i1 0
+  %nop7884 = alloca i1, i1 0
+  %nop7885 = alloca i1, i1 0
+  %nop7886 = alloca i1, i1 0
+  %nop7887 = alloca i1, i1 0
+  %nop7888 = alloca i1, i1 0
+  %nop7889 = alloca i1, i1 0
+  %nop7890 = alloca i1, i1 0
+  %nop7891 = alloca i1, i1 0
+  %nop7892 = alloca i1, i1 0
+  %nop7893 = alloca i1, i1 0
+  %nop7894 = alloca i1, i1 0
+  %nop7895 = alloca i1, i1 0
+  %nop7896 = alloca i1, i1 0
+  %nop7897 = alloca i1, i1 0
+  %nop7898 = alloca i1, i1 0
+  %nop7899 = alloca i1, i1 0
+  %nop7900 = alloca i1, i1 0
+  %nop7901 = alloca i1, i1 0
+  %nop7902 = alloca i1, i1 0
+  %nop7903 = alloca i1, i1 0
+  %nop7904 = alloca i1, i1 0
+  %nop7905 = alloca i1, i1 0
+  %nop7906 = alloca i1, i1 0
+  %nop7907 = alloca i1, i1 0
+  %nop7908 = alloca i1, i1 0
+  %nop7909 = alloca i1, i1 0
+  %nop7910 = alloca i1, i1 0
+  %nop7911 = alloca i1, i1 0
+  %nop7912 = alloca i1, i1 0
+  %nop7913 = alloca i1, i1 0
+  %nop7914 = alloca i1, i1 0
+  %nop7915 = alloca i1, i1 0
+  %nop7916 = alloca i1, i1 0
+  %nop7917 = alloca i1, i1 0
+  %nop7918 = alloca i1, i1 0
+  %nop7919 = alloca i1, i1 0
+  %nop7920 = alloca i1, i1 0
+  %nop7921 = alloca i1, i1 0
+  %nop7922 = alloca i1, i1 0
+  %nop7923 = alloca i1, i1 0
+  %nop7924 = alloca i1, i1 0
+  %nop7925 = alloca i1, i1 0
+  %nop7926 = alloca i1, i1 0
+  %nop7927 = alloca i1, i1 0
+  %nop7928 = alloca i1, i1 0
+  %nop7929 = alloca i1, i1 0
+  %nop7930 = alloca i1, i1 0
+  %nop7931 = alloca i1, i1 0
+  %nop7932 = alloca i1, i1 0
+  %nop7933 = alloca i1, i1 0
+  %nop7934 = alloca i1, i1 0
+  %nop7935 = alloca i1, i1 0
+  %nop7936 = alloca i1, i1 0
+  %nop7937 = alloca i1, i1 0
+  %nop7938 = alloca i1, i1 0
+  %nop7939 = alloca i1, i1 0
+  %nop7940 = alloca i1, i1 0
+  %nop7941 = alloca i1, i1 0
+  %nop7942 = alloca i1, i1 0
+  %nop7943 = alloca i1, i1 0
+  %nop7944 = alloca i1, i1 0
+  %nop7945 = alloca i1, i1 0
+  %nop7946 = alloca i1, i1 0
+  %nop7947 = alloca i1, i1 0
+  %nop7948 = alloca i1, i1 0
+  %nop7949 = alloca i1, i1 0
+  %nop7950 = alloca i1, i1 0
+  %nop7951 = alloca i1, i1 0
+  %nop7952 = alloca i1, i1 0
+  %nop7953 = alloca i1, i1 0
+  %nop7954 = alloca i1, i1 0
+  %nop7955 = alloca i1, i1 0
+  %nop7956 = alloca i1, i1 0
+  %nop7957 = alloca i1, i1 0
+  %nop7958 = alloca i1, i1 0
+  %nop7959 = alloca i1, i1 0
+  %nop7960 = alloca i1, i1 0
+  %nop7961 = alloca i1, i1 0
+  %nop7962 = alloca i1, i1 0
+  %nop7963 = alloca i1, i1 0
+  %nop7964 = alloca i1, i1 0
+  %nop7965 = alloca i1, i1 0
+  %nop7966 = alloca i1, i1 0
+  %nop7967 = alloca i1, i1 0
+  %nop7968 = alloca i1, i1 0
+  %nop7969 = alloca i1, i1 0
+  %nop7970 = alloca i1, i1 0
+  %nop7971 = alloca i1, i1 0
+  %nop7972 = alloca i1, i1 0
+  %nop7973 = alloca i1, i1 0
+  %nop7974 = alloca i1, i1 0
+  %nop7975 = alloca i1, i1 0
+  %nop7976 = alloca i1, i1 0
+  %nop7977 = alloca i1, i1 0
+  %nop7978 = alloca i1, i1 0
+  %nop7979 = alloca i1, i1 0
+  %nop7980 = alloca i1, i1 0
+  %nop7981 = alloca i1, i1 0
+  %nop7982 = alloca i1, i1 0
+  %nop7983 = alloca i1, i1 0
+  %nop7984 = alloca i1, i1 0
+  %nop7985 = alloca i1, i1 0
+  %nop7986 = alloca i1, i1 0
+  %nop7987 = alloca i1, i1 0
+  %nop7988 = alloca i1, i1 0
+  %nop7989 = alloca i1, i1 0
+  %nop7990 = alloca i1, i1 0
+  %nop7991 = alloca i1, i1 0
+  %nop7992 = alloca i1, i1 0
+  %nop7993 = alloca i1, i1 0
+  %nop7994 = alloca i1, i1 0
+  %nop7995 = alloca i1, i1 0
+  %nop7996 = alloca i1, i1 0
+  %nop7997 = alloca i1, i1 0
+  %nop7998 = alloca i1, i1 0
+  %nop7999 = alloca i1, i1 0
+  %nop8000 = alloca i1, i1 0
+  %nop8001 = alloca i1, i1 0
+  %nop8002 = alloca i1, i1 0
+  %nop8003 = alloca i1, i1 0
+  %nop8004 = alloca i1, i1 0
+  %nop8005 = alloca i1, i1 0
+  %nop8006 = alloca i1, i1 0
+  %nop8007 = alloca i1, i1 0
+  %nop8008 = alloca i1, i1 0
+  %nop8009 = alloca i1, i1 0
+  %nop8010 = alloca i1, i1 0
+  %nop8011 = alloca i1, i1 0
+  %nop8012 = alloca i1, i1 0
+  %nop8013 = alloca i1, i1 0
+  %nop8014 = alloca i1, i1 0
+  %nop8015 = alloca i1, i1 0
+  %nop8016 = alloca i1, i1 0
+  %nop8017 = alloca i1, i1 0
+  %nop8018 = alloca i1, i1 0
+  %nop8019 = alloca i1, i1 0
+  %nop8020 = alloca i1, i1 0
+  %nop8021 = alloca i1, i1 0
+  %nop8022 = alloca i1, i1 0
+  %nop8023 = alloca i1, i1 0
+  %nop8024 = alloca i1, i1 0
+  %nop8025 = alloca i1, i1 0
+  %nop8026 = alloca i1, i1 0
+  %nop8027 = alloca i1, i1 0
+  %nop8028 = alloca i1, i1 0
+  %nop8029 = alloca i1, i1 0
+  %nop8030 = alloca i1, i1 0
+  %nop8031 = alloca i1, i1 0
+  %nop8032 = alloca i1, i1 0
+  %nop8033 = alloca i1, i1 0
+  %nop8034 = alloca i1, i1 0
+  %nop8035 = alloca i1, i1 0
+  %nop8036 = alloca i1, i1 0
+  %nop8037 = alloca i1, i1 0
+  %nop8038 = alloca i1, i1 0
+  %nop8039 = alloca i1, i1 0
+  %nop8040 = alloca i1, i1 0
+  %nop8041 = alloca i1, i1 0
+  %nop8042 = alloca i1, i1 0
+  %nop8043 = alloca i1, i1 0
+  %nop8044 = alloca i1, i1 0
+  %nop8045 = alloca i1, i1 0
+  %nop8046 = alloca i1, i1 0
+  %nop8047 = alloca i1, i1 0
+  %nop8048 = alloca i1, i1 0
+  %nop8049 = alloca i1, i1 0
+  %nop8050 = alloca i1, i1 0
+  %nop8051 = alloca i1, i1 0
+  %nop8052 = alloca i1, i1 0
+  %nop8053 = alloca i1, i1 0
+  %nop8054 = alloca i1, i1 0
+  %nop8055 = alloca i1, i1 0
+  %nop8056 = alloca i1, i1 0
+  %nop8057 = alloca i1, i1 0
+  %nop8058 = alloca i1, i1 0
+  %nop8059 = alloca i1, i1 0
+  %nop8060 = alloca i1, i1 0
+  %nop8061 = alloca i1, i1 0
+  %nop8062 = alloca i1, i1 0
+  %nop8063 = alloca i1, i1 0
+  %nop8064 = alloca i1, i1 0
+  %nop8065 = alloca i1, i1 0
+  %nop8066 = alloca i1, i1 0
+  %nop8067 = alloca i1, i1 0
+  %nop8068 = alloca i1, i1 0
+  %nop8069 = alloca i1, i1 0
+  %nop8070 = alloca i1, i1 0
+  %nop8071 = alloca i1, i1 0
+  %nop8072 = alloca i1, i1 0
+  %nop8073 = alloca i1, i1 0
+  %nop8074 = alloca i1, i1 0
+  %nop8075 = alloca i1, i1 0
+  %nop8076 = alloca i1, i1 0
+  %nop8077 = alloca i1, i1 0
+  %nop8078 = alloca i1, i1 0
+  %nop8079 = alloca i1, i1 0
+  %nop8080 = alloca i1, i1 0
+  %nop8081 = alloca i1, i1 0
+  %nop8082 = alloca i1, i1 0
+  %nop8083 = alloca i1, i1 0
+  %nop8084 = alloca i1, i1 0
+  %nop8085 = alloca i1, i1 0
+  %nop8086 = alloca i1, i1 0
+  %nop8087 = alloca i1, i1 0
+  %nop8088 = alloca i1, i1 0
+  %nop8089 = alloca i1, i1 0
+  %nop8090 = alloca i1, i1 0
+  %nop8091 = alloca i1, i1 0
+  %nop8092 = alloca i1, i1 0
+  %nop8093 = alloca i1, i1 0
+  %nop8094 = alloca i1, i1 0
+  %nop8095 = alloca i1, i1 0
+  %nop8096 = alloca i1, i1 0
+  %nop8097 = alloca i1, i1 0
+  %nop8098 = alloca i1, i1 0
+  %nop8099 = alloca i1, i1 0
+  %nop8100 = alloca i1, i1 0
+  %nop8101 = alloca i1, i1 0
+  %nop8102 = alloca i1, i1 0
+  %nop8103 = alloca i1, i1 0
+  %nop8104 = alloca i1, i1 0
+  %nop8105 = alloca i1, i1 0
+  %nop8106 = alloca i1, i1 0
+  %nop8107 = alloca i1, i1 0
+  %nop8108 = alloca i1, i1 0
+  %nop8109 = alloca i1, i1 0
+  %nop8110 = alloca i1, i1 0
+  %nop8111 = alloca i1, i1 0
+  %nop8112 = alloca i1, i1 0
+  %nop8113 = alloca i1, i1 0
+  %nop8114 = alloca i1, i1 0
+  %nop8115 = alloca i1, i1 0
+  %nop8116 = alloca i1, i1 0
+  %nop8117 = alloca i1, i1 0
+  %nop8118 = alloca i1, i1 0
+  %nop8119 = alloca i1, i1 0
+  %nop8120 = alloca i1, i1 0
+  %nop8121 = alloca i1, i1 0
+  %nop8122 = alloca i1, i1 0
+  %nop8123 = alloca i1, i1 0
+  %nop8124 = alloca i1, i1 0
+  %nop8125 = alloca i1, i1 0
+  %nop8126 = alloca i1, i1 0
+  %nop8127 = alloca i1, i1 0
+  %nop8128 = alloca i1, i1 0
+  %nop8129 = alloca i1, i1 0
+  %nop8130 = alloca i1, i1 0
+  %nop8131 = alloca i1, i1 0
+  %nop8132 = alloca i1, i1 0
+  %nop8133 = alloca i1, i1 0
+  %nop8134 = alloca i1, i1 0
+  %nop8135 = alloca i1, i1 0
+  %nop8136 = alloca i1, i1 0
+  %nop8137 = alloca i1, i1 0
+  %nop8138 = alloca i1, i1 0
+  %nop8139 = alloca i1, i1 0
+  %nop8140 = alloca i1, i1 0
+  %nop8141 = alloca i1, i1 0
+  %nop8142 = alloca i1, i1 0
+  %nop8143 = alloca i1, i1 0
+  %nop8144 = alloca i1, i1 0
+  %nop8145 = alloca i1, i1 0
+  %nop8146 = alloca i1, i1 0
+  %nop8147 = alloca i1, i1 0
+  %nop8148 = alloca i1, i1 0
+  %nop8149 = alloca i1, i1 0
+  %nop8150 = alloca i1, i1 0
+  %nop8151 = alloca i1, i1 0
+  %nop8152 = alloca i1, i1 0
+  %nop8153 = alloca i1, i1 0
+  %nop8154 = alloca i1, i1 0
+  %nop8155 = alloca i1, i1 0
+  %nop8156 = alloca i1, i1 0
+  %nop8157 = alloca i1, i1 0
+  %nop8158 = alloca i1, i1 0
+  %nop8159 = alloca i1, i1 0
+  %nop8160 = alloca i1, i1 0
+  %nop8161 = alloca i1, i1 0
+  %nop8162 = alloca i1, i1 0
+  %nop8163 = alloca i1, i1 0
+  %nop8164 = alloca i1, i1 0
+  %nop8165 = alloca i1, i1 0
+  %nop8166 = alloca i1, i1 0
+  %nop8167 = alloca i1, i1 0
+  %nop8168 = alloca i1, i1 0
+  %nop8169 = alloca i1, i1 0
+  %nop8170 = alloca i1, i1 0
+  %nop8171 = alloca i1, i1 0
+  %nop8172 = alloca i1, i1 0
+  %nop8173 = alloca i1, i1 0
+  %nop8174 = alloca i1, i1 0
+  %nop8175 = alloca i1, i1 0
+  %nop8176 = alloca i1, i1 0
+  %nop8177 = alloca i1, i1 0
+  %nop8178 = alloca i1, i1 0
+  %nop8179 = alloca i1, i1 0
+  %nop8180 = alloca i1, i1 0
+  %nop8181 = alloca i1, i1 0
+  %nop8182 = alloca i1, i1 0
+  %nop8183 = alloca i1, i1 0
+  %nop8184 = alloca i1, i1 0
+  %nop8185 = alloca i1, i1 0
+  %nop8186 = alloca i1, i1 0
+  %nop8187 = alloca i1, i1 0
+  %nop8188 = alloca i1, i1 0
+  %nop8189 = alloca i1, i1 0
+  %nop8190 = alloca i1, i1 0
+  %nop8191 = alloca i1, i1 0
+  %nop8192 = alloca i1, i1 0
+  %nop8193 = alloca i1, i1 0
+  %nop8194 = alloca i1, i1 0
+  %nop8195 = alloca i1, i1 0
+  %nop8196 = alloca i1, i1 0
+  %nop8197 = alloca i1, i1 0
+  %nop8198 = alloca i1, i1 0
+  %nop8199 = alloca i1, i1 0
+  %nop8200 = alloca i1, i1 0
+  %nop8201 = alloca i1, i1 0
+  %nop8202 = alloca i1, i1 0
+  %nop8203 = alloca i1, i1 0
+  %nop8204 = alloca i1, i1 0
+  %nop8205 = alloca i1, i1 0
+  %nop8206 = alloca i1, i1 0
+  %nop8207 = alloca i1, i1 0
+  %nop8208 = alloca i1, i1 0
+  %nop8209 = alloca i1, i1 0
+  %nop8210 = alloca i1, i1 0
+  %nop8211 = alloca i1, i1 0
+  %nop8212 = alloca i1, i1 0
+  %nop8213 = alloca i1, i1 0
+  %nop8214 = alloca i1, i1 0
+  %nop8215 = alloca i1, i1 0
+  %nop8216 = alloca i1, i1 0
+  %nop8217 = alloca i1, i1 0
+  %nop8218 = alloca i1, i1 0
+  %nop8219 = alloca i1, i1 0
+  %nop8220 = alloca i1, i1 0
+  %nop8221 = alloca i1, i1 0
+  %nop8222 = alloca i1, i1 0
+  %nop8223 = alloca i1, i1 0
+  %nop8224 = alloca i1, i1 0
+  %nop8225 = alloca i1, i1 0
+  %nop8226 = alloca i1, i1 0
+  %nop8227 = alloca i1, i1 0
+  %nop8228 = alloca i1, i1 0
+  %nop8229 = alloca i1, i1 0
+  %nop8230 = alloca i1, i1 0
+  %nop8231 = alloca i1, i1 0
+  %nop8232 = alloca i1, i1 0
+  %nop8233 = alloca i1, i1 0
+  %nop8234 = alloca i1, i1 0
+  %nop8235 = alloca i1, i1 0
+  %nop8236 = alloca i1, i1 0
+  %nop8237 = alloca i1, i1 0
+  %nop8238 = alloca i1, i1 0
+  %nop8239 = alloca i1, i1 0
+  %nop8240 = alloca i1, i1 0
+  %nop8241 = alloca i1, i1 0
+  %nop8242 = alloca i1, i1 0
+  %nop8243 = alloca i1, i1 0
+  %nop8244 = alloca i1, i1 0
+  %nop8245 = alloca i1, i1 0
+  %nop8246 = alloca i1, i1 0
+  %nop8247 = alloca i1, i1 0
+  %nop8248 = alloca i1, i1 0
+  %nop8249 = alloca i1, i1 0
+  %nop8250 = alloca i1, i1 0
+  %nop8251 = alloca i1, i1 0
+  %nop8252 = alloca i1, i1 0
+  %nop8253 = alloca i1, i1 0
+  %nop8254 = alloca i1, i1 0
+  %nop8255 = alloca i1, i1 0
+  %nop8256 = alloca i1, i1 0
+  %nop8257 = alloca i1, i1 0
+  %nop8258 = alloca i1, i1 0
+  %nop8259 = alloca i1, i1 0
+  %nop8260 = alloca i1, i1 0
+  %nop8261 = alloca i1, i1 0
+  %nop8262 = alloca i1, i1 0
+  %nop8263 = alloca i1, i1 0
+  %nop8264 = alloca i1, i1 0
+  %nop8265 = alloca i1, i1 0
+  %nop8266 = alloca i1, i1 0
+  %nop8267 = alloca i1, i1 0
+  %nop8268 = alloca i1, i1 0
+  %nop8269 = alloca i1, i1 0
+  %nop8270 = alloca i1, i1 0
+  %nop8271 = alloca i1, i1 0
+  %nop8272 = alloca i1, i1 0
+  %nop8273 = alloca i1, i1 0
+  %nop8274 = alloca i1, i1 0
+  %nop8275 = alloca i1, i1 0
+  %nop8276 = alloca i1, i1 0
+  %nop8277 = alloca i1, i1 0
+  %nop8278 = alloca i1, i1 0
+  %nop8279 = alloca i1, i1 0
+  %nop8280 = alloca i1, i1 0
+  %nop8281 = alloca i1, i1 0
+  %nop8282 = alloca i1, i1 0
+  %nop8283 = alloca i1, i1 0
+  %nop8284 = alloca i1, i1 0
+  %nop8285 = alloca i1, i1 0
+  %nop8286 = alloca i1, i1 0
+  %nop8287 = alloca i1, i1 0
+  %nop8288 = alloca i1, i1 0
+  %nop8289 = alloca i1, i1 0
+  %nop8290 = alloca i1, i1 0
+  %nop8291 = alloca i1, i1 0
+  %nop8292 = alloca i1, i1 0
+  %nop8293 = alloca i1, i1 0
+  %nop8294 = alloca i1, i1 0
+  %nop8295 = alloca i1, i1 0
+  %nop8296 = alloca i1, i1 0
+  %nop8297 = alloca i1, i1 0
+  %nop8298 = alloca i1, i1 0
+  %nop8299 = alloca i1, i1 0
+  %nop8300 = alloca i1, i1 0
+  %nop8301 = alloca i1, i1 0
+  %nop8302 = alloca i1, i1 0
+  %nop8303 = alloca i1, i1 0
+  %nop8304 = alloca i1, i1 0
+  %nop8305 = alloca i1, i1 0
+  %nop8306 = alloca i1, i1 0
+  %nop8307 = alloca i1, i1 0
+  %nop8308 = alloca i1, i1 0
+  %nop8309 = alloca i1, i1 0
+  %nop8310 = alloca i1, i1 0
+  %nop8311 = alloca i1, i1 0
+  %nop8312 = alloca i1, i1 0
+  %nop8313 = alloca i1, i1 0
+  %nop8314 = alloca i1, i1 0
+  %nop8315 = alloca i1, i1 0
+  %nop8316 = alloca i1, i1 0
+  %nop8317 = alloca i1, i1 0
+  %nop8318 = alloca i1, i1 0
+  %nop8319 = alloca i1, i1 0
+  %nop8320 = alloca i1, i1 0
+  %nop8321 = alloca i1, i1 0
+  %nop8322 = alloca i1, i1 0
+  %nop8323 = alloca i1, i1 0
+  %nop8324 = alloca i1, i1 0
+  %nop8325 = alloca i1, i1 0
+  %nop8326 = alloca i1, i1 0
+  %nop8327 = alloca i1, i1 0
+  %nop8328 = alloca i1, i1 0
+  %nop8329 = alloca i1, i1 0
+  %nop8330 = alloca i1, i1 0
+  %nop8331 = alloca i1, i1 0
+  %nop8332 = alloca i1, i1 0
+  %nop8333 = alloca i1, i1 0
+  %nop8334 = alloca i1, i1 0
+  %nop8335 = alloca i1, i1 0
+  %nop8336 = alloca i1, i1 0
+  %nop8337 = alloca i1, i1 0
+  %nop8338 = alloca i1, i1 0
+  %nop8339 = alloca i1, i1 0
+  %nop8340 = alloca i1, i1 0
+  %nop8341 = alloca i1, i1 0
+  %nop8342 = alloca i1, i1 0
+  %nop8343 = alloca i1, i1 0
+  %nop8344 = alloca i1, i1 0
+  %nop8345 = alloca i1, i1 0
+  %nop8346 = alloca i1, i1 0
+  %nop8347 = alloca i1, i1 0
+  %nop8348 = alloca i1, i1 0
+  %nop8349 = alloca i1, i1 0
+  %nop8350 = alloca i1, i1 0
+  %nop8351 = alloca i1, i1 0
+  %nop8352 = alloca i1, i1 0
+  %nop8353 = alloca i1, i1 0
+  %nop8354 = alloca i1, i1 0
+  %nop8355 = alloca i1, i1 0
+  %nop8356 = alloca i1, i1 0
+  %nop8357 = alloca i1, i1 0
+  %nop8358 = alloca i1, i1 0
+  %nop8359 = alloca i1, i1 0
+  %nop8360 = alloca i1, i1 0
+  %nop8361 = alloca i1, i1 0
+  %nop8362 = alloca i1, i1 0
+  %nop8363 = alloca i1, i1 0
+  %nop8364 = alloca i1, i1 0
+  %nop8365 = alloca i1, i1 0
+  %nop8366 = alloca i1, i1 0
+  %nop8367 = alloca i1, i1 0
+  %nop8368 = alloca i1, i1 0
+  %nop8369 = alloca i1, i1 0
+  %nop8370 = alloca i1, i1 0
+  %nop8371 = alloca i1, i1 0
+  %nop8372 = alloca i1, i1 0
+  %nop8373 = alloca i1, i1 0
+  %nop8374 = alloca i1, i1 0
+  %nop8375 = alloca i1, i1 0
+  %nop8376 = alloca i1, i1 0
+  %nop8377 = alloca i1, i1 0
+  %nop8378 = alloca i1, i1 0
+  %nop8379 = alloca i1, i1 0
+  %nop8380 = alloca i1, i1 0
+  %nop8381 = alloca i1, i1 0
+  %nop8382 = alloca i1, i1 0
+  %nop8383 = alloca i1, i1 0
+  %nop8384 = alloca i1, i1 0
+  %nop8385 = alloca i1, i1 0
+  %nop8386 = alloca i1, i1 0
+  %nop8387 = alloca i1, i1 0
+  %nop8388 = alloca i1, i1 0
+  %nop8389 = alloca i1, i1 0
+  %nop8390 = alloca i1, i1 0
+  %nop8391 = alloca i1, i1 0
+  %nop8392 = alloca i1, i1 0
+  %nop8393 = alloca i1, i1 0
+  %nop8394 = alloca i1, i1 0
+  %nop8395 = alloca i1, i1 0
+  %nop8396 = alloca i1, i1 0
+  %nop8397 = alloca i1, i1 0
+  %nop8398 = alloca i1, i1 0
+  %nop8399 = alloca i1, i1 0
+  %nop8400 = alloca i1, i1 0
+  %nop8401 = alloca i1, i1 0
+  %nop8402 = alloca i1, i1 0
+  %nop8403 = alloca i1, i1 0
+  %nop8404 = alloca i1, i1 0
+  %nop8405 = alloca i1, i1 0
+  %nop8406 = alloca i1, i1 0
+  %nop8407 = alloca i1, i1 0
+  %nop8408 = alloca i1, i1 0
+  %nop8409 = alloca i1, i1 0
+  %nop8410 = alloca i1, i1 0
+  %nop8411 = alloca i1, i1 0
+  %nop8412 = alloca i1, i1 0
+  %nop8413 = alloca i1, i1 0
+  %nop8414 = alloca i1, i1 0
+  %nop8415 = alloca i1, i1 0
+  %nop8416 = alloca i1, i1 0
+  %nop8417 = alloca i1, i1 0
+  %nop8418 = alloca i1, i1 0
+  %nop8419 = alloca i1, i1 0
+  %nop8420 = alloca i1, i1 0
+  %nop8421 = alloca i1, i1 0
+  %nop8422 = alloca i1, i1 0
+  %nop8423 = alloca i1, i1 0
+  %nop8424 = alloca i1, i1 0
+  %nop8425 = alloca i1, i1 0
+  %nop8426 = alloca i1, i1 0
+  %nop8427 = alloca i1, i1 0
+  %nop8428 = alloca i1, i1 0
+  %nop8429 = alloca i1, i1 0
+  %nop8430 = alloca i1, i1 0
+  %nop8431 = alloca i1, i1 0
+  %nop8432 = alloca i1, i1 0
+  %nop8433 = alloca i1, i1 0
+  %nop8434 = alloca i1, i1 0
+  %nop8435 = alloca i1, i1 0
+  %nop8436 = alloca i1, i1 0
+  %nop8437 = alloca i1, i1 0
+  %nop8438 = alloca i1, i1 0
+  %nop8439 = alloca i1, i1 0
+  %nop8440 = alloca i1, i1 0
+  %nop8441 = alloca i1, i1 0
+  %nop8442 = alloca i1, i1 0
+  %nop8443 = alloca i1, i1 0
+  %nop8444 = alloca i1, i1 0
+  %nop8445 = alloca i1, i1 0
+  %nop8446 = alloca i1, i1 0
+  %nop8447 = alloca i1, i1 0
+  %nop8448 = alloca i1, i1 0
+  %nop8449 = alloca i1, i1 0
+  %nop8450 = alloca i1, i1 0
+  %nop8451 = alloca i1, i1 0
+  %nop8452 = alloca i1, i1 0
+  %nop8453 = alloca i1, i1 0
+  %nop8454 = alloca i1, i1 0
+  %nop8455 = alloca i1, i1 0
+  %nop8456 = alloca i1, i1 0
+  %nop8457 = alloca i1, i1 0
+  %nop8458 = alloca i1, i1 0
+  %nop8459 = alloca i1, i1 0
+  %nop8460 = alloca i1, i1 0
+  %nop8461 = alloca i1, i1 0
+  %nop8462 = alloca i1, i1 0
+  %nop8463 = alloca i1, i1 0
+  %nop8464 = alloca i1, i1 0
+  %nop8465 = alloca i1, i1 0
+  %nop8466 = alloca i1, i1 0
+  %nop8467 = alloca i1, i1 0
+  %nop8468 = alloca i1, i1 0
+  %nop8469 = alloca i1, i1 0
+  %nop8470 = alloca i1, i1 0
+  %nop8471 = alloca i1, i1 0
+  %nop8472 = alloca i1, i1 0
+  %nop8473 = alloca i1, i1 0
+  %nop8474 = alloca i1, i1 0
+  %nop8475 = alloca i1, i1 0
+  %nop8476 = alloca i1, i1 0
+  %nop8477 = alloca i1, i1 0
+  %nop8478 = alloca i1, i1 0
+  %nop8479 = alloca i1, i1 0
+  %nop8480 = alloca i1, i1 0
+  %nop8481 = alloca i1, i1 0
+  %nop8482 = alloca i1, i1 0
+  %nop8483 = alloca i1, i1 0
+  %nop8484 = alloca i1, i1 0
+  %nop8485 = alloca i1, i1 0
+  %nop8486 = alloca i1, i1 0
+  %nop8487 = alloca i1, i1 0
+  %nop8488 = alloca i1, i1 0
+  %nop8489 = alloca i1, i1 0
+  %nop8490 = alloca i1, i1 0
+  %nop8491 = alloca i1, i1 0
+  %nop8492 = alloca i1, i1 0
+  %nop8493 = alloca i1, i1 0
+  %nop8494 = alloca i1, i1 0
+  %nop8495 = alloca i1, i1 0
+  %nop8496 = alloca i1, i1 0
+  %nop8497 = alloca i1, i1 0
+  %nop8498 = alloca i1, i1 0
+  %nop8499 = alloca i1, i1 0
+  %nop8500 = alloca i1, i1 0
+  %nop8501 = alloca i1, i1 0
+  %nop8502 = alloca i1, i1 0
+  %nop8503 = alloca i1, i1 0
+  %nop8504 = alloca i1, i1 0
+  %nop8505 = alloca i1, i1 0
+  %nop8506 = alloca i1, i1 0
+  %nop8507 = alloca i1, i1 0
+  %nop8508 = alloca i1, i1 0
+  %nop8509 = alloca i1, i1 0
+  %nop8510 = alloca i1, i1 0
+  %nop8511 = alloca i1, i1 0
+  %nop8512 = alloca i1, i1 0
+  %nop8513 = alloca i1, i1 0
+  %nop8514 = alloca i1, i1 0
+  %nop8515 = alloca i1, i1 0
+  %nop8516 = alloca i1, i1 0
+  %nop8517 = alloca i1, i1 0
+  %nop8518 = alloca i1, i1 0
+  %nop8519 = alloca i1, i1 0
+  %nop8520 = alloca i1, i1 0
+  %nop8521 = alloca i1, i1 0
+  %nop8522 = alloca i1, i1 0
+  %nop8523 = alloca i1, i1 0
+  %nop8524 = alloca i1, i1 0
+  %nop8525 = alloca i1, i1 0
+  %nop8526 = alloca i1, i1 0
+  %nop8527 = alloca i1, i1 0
+  %nop8528 = alloca i1, i1 0
+  %nop8529 = alloca i1, i1 0
+  %nop8530 = alloca i1, i1 0
+  %nop8531 = alloca i1, i1 0
+  %nop8532 = alloca i1, i1 0
+  %nop8533 = alloca i1, i1 0
+  %nop8534 = alloca i1, i1 0
+  %nop8535 = alloca i1, i1 0
+  %nop8536 = alloca i1, i1 0
+  %nop8537 = alloca i1, i1 0
+  %nop8538 = alloca i1, i1 0
+  %nop8539 = alloca i1, i1 0
+  %nop8540 = alloca i1, i1 0
+  %nop8541 = alloca i1, i1 0
+  %nop8542 = alloca i1, i1 0
+  %nop8543 = alloca i1, i1 0
+  %nop8544 = alloca i1, i1 0
+  %nop8545 = alloca i1, i1 0
+  %nop8546 = alloca i1, i1 0
+  %nop8547 = alloca i1, i1 0
+  %nop8548 = alloca i1, i1 0
+  %nop8549 = alloca i1, i1 0
+  %nop8550 = alloca i1, i1 0
+  %nop8551 = alloca i1, i1 0
+  %nop8552 = alloca i1, i1 0
+  %nop8553 = alloca i1, i1 0
+  %nop8554 = alloca i1, i1 0
+  %nop8555 = alloca i1, i1 0
+  %nop8556 = alloca i1, i1 0
+  %nop8557 = alloca i1, i1 0
+  %nop8558 = alloca i1, i1 0
+  %nop8559 = alloca i1, i1 0
+  %nop8560 = alloca i1, i1 0
+  %nop8561 = alloca i1, i1 0
+  %nop8562 = alloca i1, i1 0
+  %nop8563 = alloca i1, i1 0
+  %nop8564 = alloca i1, i1 0
+  %nop8565 = alloca i1, i1 0
+  %nop8566 = alloca i1, i1 0
+  %nop8567 = alloca i1, i1 0
+  %nop8568 = alloca i1, i1 0
+  %nop8569 = alloca i1, i1 0
+  %nop8570 = alloca i1, i1 0
+  %nop8571 = alloca i1, i1 0
+  %nop8572 = alloca i1, i1 0
+  %nop8573 = alloca i1, i1 0
+  %nop8574 = alloca i1, i1 0
+  %nop8575 = alloca i1, i1 0
+  %nop8576 = alloca i1, i1 0
+  %nop8577 = alloca i1, i1 0
+  %nop8578 = alloca i1, i1 0
+  %nop8579 = alloca i1, i1 0
+  %nop8580 = alloca i1, i1 0
+  %nop8581 = alloca i1, i1 0
+  %nop8582 = alloca i1, i1 0
+  %nop8583 = alloca i1, i1 0
+  %nop8584 = alloca i1, i1 0
+  %nop8585 = alloca i1, i1 0
+  %nop8586 = alloca i1, i1 0
+  %nop8587 = alloca i1, i1 0
+  %nop8588 = alloca i1, i1 0
+  %nop8589 = alloca i1, i1 0
+  %nop8590 = alloca i1, i1 0
+  %nop8591 = alloca i1, i1 0
+  %nop8592 = alloca i1, i1 0
+  %nop8593 = alloca i1, i1 0
+  %nop8594 = alloca i1, i1 0
+  %nop8595 = alloca i1, i1 0
+  %nop8596 = alloca i1, i1 0
+  %nop8597 = alloca i1, i1 0
+  %nop8598 = alloca i1, i1 0
+  %nop8599 = alloca i1, i1 0
+  %nop8600 = alloca i1, i1 0
+  %nop8601 = alloca i1, i1 0
+  %nop8602 = alloca i1, i1 0
+  %nop8603 = alloca i1, i1 0
+  %nop8604 = alloca i1, i1 0
+  %nop8605 = alloca i1, i1 0
+  %nop8606 = alloca i1, i1 0
+  %nop8607 = alloca i1, i1 0
+  %nop8608 = alloca i1, i1 0
+  %nop8609 = alloca i1, i1 0
+  %nop8610 = alloca i1, i1 0
+  %nop8611 = alloca i1, i1 0
+  %nop8612 = alloca i1, i1 0
+  %nop8613 = alloca i1, i1 0
+  %nop8614 = alloca i1, i1 0
+  %nop8615 = alloca i1, i1 0
+  %nop8616 = alloca i1, i1 0
+  %nop8617 = alloca i1, i1 0
+  %nop8618 = alloca i1, i1 0
+  %nop8619 = alloca i1, i1 0
+  %nop8620 = alloca i1, i1 0
+  %nop8621 = alloca i1, i1 0
+  %nop8622 = alloca i1, i1 0
+  %nop8623 = alloca i1, i1 0
+  %nop8624 = alloca i1, i1 0
+  %nop8625 = alloca i1, i1 0
+  %nop8626 = alloca i1, i1 0
+  %nop8627 = alloca i1, i1 0
+  %nop8628 = alloca i1, i1 0
+  %nop8629 = alloca i1, i1 0
+  %nop8630 = alloca i1, i1 0
+  %nop8631 = alloca i1, i1 0
+  %nop8632 = alloca i1, i1 0
+  %nop8633 = alloca i1, i1 0
+  %nop8634 = alloca i1, i1 0
+  %nop8635 = alloca i1, i1 0
+  %nop8636 = alloca i1, i1 0
+  %nop8637 = alloca i1, i1 0
+  %nop8638 = alloca i1, i1 0
+  %nop8639 = alloca i1, i1 0
+  %nop8640 = alloca i1, i1 0
+  %nop8641 = alloca i1, i1 0
+  %nop8642 = alloca i1, i1 0
+  %nop8643 = alloca i1, i1 0
+  %nop8644 = alloca i1, i1 0
+  %nop8645 = alloca i1, i1 0
+  %nop8646 = alloca i1, i1 0
+  %nop8647 = alloca i1, i1 0
+  %nop8648 = alloca i1, i1 0
+  %nop8649 = alloca i1, i1 0
+  %nop8650 = alloca i1, i1 0
+  %nop8651 = alloca i1, i1 0
+  %nop8652 = alloca i1, i1 0
+  %nop8653 = alloca i1, i1 0
+  %nop8654 = alloca i1, i1 0
+  %nop8655 = alloca i1, i1 0
+  %nop8656 = alloca i1, i1 0
+  %nop8657 = alloca i1, i1 0
+  %nop8658 = alloca i1, i1 0
+  %nop8659 = alloca i1, i1 0
+  %nop8660 = alloca i1, i1 0
+  %nop8661 = alloca i1, i1 0
+  %nop8662 = alloca i1, i1 0
+  %nop8663 = alloca i1, i1 0
+  %nop8664 = alloca i1, i1 0
+  %nop8665 = alloca i1, i1 0
+  %nop8666 = alloca i1, i1 0
+  %nop8667 = alloca i1, i1 0
+  %nop8668 = alloca i1, i1 0
+  %nop8669 = alloca i1, i1 0
+  %nop8670 = alloca i1, i1 0
+  %nop8671 = alloca i1, i1 0
+  %nop8672 = alloca i1, i1 0
+  %nop8673 = alloca i1, i1 0
+  %nop8674 = alloca i1, i1 0
+  %nop8675 = alloca i1, i1 0
+  %nop8676 = alloca i1, i1 0
+  %nop8677 = alloca i1, i1 0
+  %nop8678 = alloca i1, i1 0
+  %nop8679 = alloca i1, i1 0
+  %nop8680 = alloca i1, i1 0
+  %nop8681 = alloca i1, i1 0
+  %nop8682 = alloca i1, i1 0
+  %nop8683 = alloca i1, i1 0
+  %nop8684 = alloca i1, i1 0
+  %nop8685 = alloca i1, i1 0
+  %nop8686 = alloca i1, i1 0
+  %nop8687 = alloca i1, i1 0
+  %nop8688 = alloca i1, i1 0
+  %nop8689 = alloca i1, i1 0
+  %nop8690 = alloca i1, i1 0
+  %nop8691 = alloca i1, i1 0
+  %nop8692 = alloca i1, i1 0
+  %nop8693 = alloca i1, i1 0
+  %nop8694 = alloca i1, i1 0
+  %nop8695 = alloca i1, i1 0
+  %nop8696 = alloca i1, i1 0
+  %nop8697 = alloca i1, i1 0
+  %nop8698 = alloca i1, i1 0
+  %nop8699 = alloca i1, i1 0
+  %nop8700 = alloca i1, i1 0
+  %nop8701 = alloca i1, i1 0
+  %nop8702 = alloca i1, i1 0
+  %nop8703 = alloca i1, i1 0
+  %nop8704 = alloca i1, i1 0
+  %nop8705 = alloca i1, i1 0
+  %nop8706 = alloca i1, i1 0
+  %nop8707 = alloca i1, i1 0
+  %nop8708 = alloca i1, i1 0
+  %nop8709 = alloca i1, i1 0
+  %nop8710 = alloca i1, i1 0
+  %nop8711 = alloca i1, i1 0
+  %nop8712 = alloca i1, i1 0
+  %nop8713 = alloca i1, i1 0
+  %nop8714 = alloca i1, i1 0
+  %nop8715 = alloca i1, i1 0
+  %nop8716 = alloca i1, i1 0
+  %nop8717 = alloca i1, i1 0
+  %nop8718 = alloca i1, i1 0
+  %nop8719 = alloca i1, i1 0
+  %nop8720 = alloca i1, i1 0
+  %nop8721 = alloca i1, i1 0
+  %nop8722 = alloca i1, i1 0
+  %nop8723 = alloca i1, i1 0
+  %nop8724 = alloca i1, i1 0
+  %nop8725 = alloca i1, i1 0
+  %nop8726 = alloca i1, i1 0
+  %nop8727 = alloca i1, i1 0
+  %nop8728 = alloca i1, i1 0
+  %nop8729 = alloca i1, i1 0
+  %nop8730 = alloca i1, i1 0
+  %nop8731 = alloca i1, i1 0
+  %nop8732 = alloca i1, i1 0
+  %nop8733 = alloca i1, i1 0
+  %nop8734 = alloca i1, i1 0
+  %nop8735 = alloca i1, i1 0
+  %nop8736 = alloca i1, i1 0
+  %nop8737 = alloca i1, i1 0
+  %nop8738 = alloca i1, i1 0
+  %nop8739 = alloca i1, i1 0
+  %nop8740 = alloca i1, i1 0
+  %nop8741 = alloca i1, i1 0
+  %nop8742 = alloca i1, i1 0
+  %nop8743 = alloca i1, i1 0
+  %nop8744 = alloca i1, i1 0
+  %nop8745 = alloca i1, i1 0
+  %nop8746 = alloca i1, i1 0
+  %nop8747 = alloca i1, i1 0
+  %nop8748 = alloca i1, i1 0
+  %nop8749 = alloca i1, i1 0
+  %nop8750 = alloca i1, i1 0
+  %nop8751 = alloca i1, i1 0
+  %nop8752 = alloca i1, i1 0
+  %nop8753 = alloca i1, i1 0
+  %nop8754 = alloca i1, i1 0
+  %nop8755 = alloca i1, i1 0
+  %nop8756 = alloca i1, i1 0
+  %nop8757 = alloca i1, i1 0
+  %nop8758 = alloca i1, i1 0
+  %nop8759 = alloca i1, i1 0
+  %nop8760 = alloca i1, i1 0
+  %nop8761 = alloca i1, i1 0
+  %nop8762 = alloca i1, i1 0
+  %nop8763 = alloca i1, i1 0
+  %nop8764 = alloca i1, i1 0
+  %nop8765 = alloca i1, i1 0
+  %nop8766 = alloca i1, i1 0
+  %nop8767 = alloca i1, i1 0
+  %nop8768 = alloca i1, i1 0
+  %nop8769 = alloca i1, i1 0
+  %nop8770 = alloca i1, i1 0
+  %nop8771 = alloca i1, i1 0
+  %nop8772 = alloca i1, i1 0
+  %nop8773 = alloca i1, i1 0
+  %nop8774 = alloca i1, i1 0
+  %nop8775 = alloca i1, i1 0
+  %nop8776 = alloca i1, i1 0
+  %nop8777 = alloca i1, i1 0
+  %nop8778 = alloca i1, i1 0
+  %nop8779 = alloca i1, i1 0
+  %nop8780 = alloca i1, i1 0
+  %nop8781 = alloca i1, i1 0
+  %nop8782 = alloca i1, i1 0
+  %nop8783 = alloca i1, i1 0
+  %nop8784 = alloca i1, i1 0
+  %nop8785 = alloca i1, i1 0
+  %nop8786 = alloca i1, i1 0
+  %nop8787 = alloca i1, i1 0
+  %nop8788 = alloca i1, i1 0
+  %nop8789 = alloca i1, i1 0
+  %nop8790 = alloca i1, i1 0
+  %nop8791 = alloca i1, i1 0
+  %nop8792 = alloca i1, i1 0
+  %nop8793 = alloca i1, i1 0
+  %nop8794 = alloca i1, i1 0
+  %nop8795 = alloca i1, i1 0
+  %nop8796 = alloca i1, i1 0
+  %nop8797 = alloca i1, i1 0
+  %nop8798 = alloca i1, i1 0
+  %nop8799 = alloca i1, i1 0
+  %nop8800 = alloca i1, i1 0
+  %nop8801 = alloca i1, i1 0
+  %nop8802 = alloca i1, i1 0
+  %nop8803 = alloca i1, i1 0
+  %nop8804 = alloca i1, i1 0
+  %nop8805 = alloca i1, i1 0
+  %nop8806 = alloca i1, i1 0
+  %nop8807 = alloca i1, i1 0
+  %nop8808 = alloca i1, i1 0
+  %nop8809 = alloca i1, i1 0
+  %nop8810 = alloca i1, i1 0
+  %nop8811 = alloca i1, i1 0
+  %nop8812 = alloca i1, i1 0
+  %nop8813 = alloca i1, i1 0
+  %nop8814 = alloca i1, i1 0
+  %nop8815 = alloca i1, i1 0
+  %nop8816 = alloca i1, i1 0
+  %nop8817 = alloca i1, i1 0
+  %nop8818 = alloca i1, i1 0
+  %nop8819 = alloca i1, i1 0
+  %nop8820 = alloca i1, i1 0
+  %nop8821 = alloca i1, i1 0
+  %nop8822 = alloca i1, i1 0
+  %nop8823 = alloca i1, i1 0
+  %nop8824 = alloca i1, i1 0
+  %nop8825 = alloca i1, i1 0
+  %nop8826 = alloca i1, i1 0
+  %nop8827 = alloca i1, i1 0
+  %nop8828 = alloca i1, i1 0
+  %nop8829 = alloca i1, i1 0
+  %nop8830 = alloca i1, i1 0
+  %nop8831 = alloca i1, i1 0
+  %nop8832 = alloca i1, i1 0
+  %nop8833 = alloca i1, i1 0
+  %nop8834 = alloca i1, i1 0
+  %nop8835 = alloca i1, i1 0
+  %nop8836 = alloca i1, i1 0
+  %nop8837 = alloca i1, i1 0
+  %nop8838 = alloca i1, i1 0
+  %nop8839 = alloca i1, i1 0
+  %nop8840 = alloca i1, i1 0
+  %nop8841 = alloca i1, i1 0
+  %nop8842 = alloca i1, i1 0
+  %nop8843 = alloca i1, i1 0
+  %nop8844 = alloca i1, i1 0
+  %nop8845 = alloca i1, i1 0
+  %nop8846 = alloca i1, i1 0
+  %nop8847 = alloca i1, i1 0
+  %nop8848 = alloca i1, i1 0
+  %nop8849 = alloca i1, i1 0
+  %nop8850 = alloca i1, i1 0
+  %nop8851 = alloca i1, i1 0
+  %nop8852 = alloca i1, i1 0
+  %nop8853 = alloca i1, i1 0
+  %nop8854 = alloca i1, i1 0
+  %nop8855 = alloca i1, i1 0
+  %nop8856 = alloca i1, i1 0
+  %nop8857 = alloca i1, i1 0
+  %nop8858 = alloca i1, i1 0
+  %nop8859 = alloca i1, i1 0
+  %nop8860 = alloca i1, i1 0
+  %nop8861 = alloca i1, i1 0
+  %nop8862 = alloca i1, i1 0
+  %nop8863 = alloca i1, i1 0
+  %nop8864 = alloca i1, i1 0
+  %nop8865 = alloca i1, i1 0
+  %nop8866 = alloca i1, i1 0
+  %nop8867 = alloca i1, i1 0
+  %nop8868 = alloca i1, i1 0
+  %nop8869 = alloca i1, i1 0
+  %nop8870 = alloca i1, i1 0
+  %nop8871 = alloca i1, i1 0
+  %nop8872 = alloca i1, i1 0
+  %nop8873 = alloca i1, i1 0
+  %nop8874 = alloca i1, i1 0
+  %nop8875 = alloca i1, i1 0
+  %nop8876 = alloca i1, i1 0
+  %nop8877 = alloca i1, i1 0
+  %nop8878 = alloca i1, i1 0
+  %nop8879 = alloca i1, i1 0
+  %nop8880 = alloca i1, i1 0
+  %nop8881 = alloca i1, i1 0
+  %nop8882 = alloca i1, i1 0
+  %nop8883 = alloca i1, i1 0
+  %nop8884 = alloca i1, i1 0
+  %nop8885 = alloca i1, i1 0
+  %nop8886 = alloca i1, i1 0
+  %nop8887 = alloca i1, i1 0
+  %nop8888 = alloca i1, i1 0
+  %nop8889 = alloca i1, i1 0
+  %nop8890 = alloca i1, i1 0
+  %nop8891 = alloca i1, i1 0
+  %nop8892 = alloca i1, i1 0
+  %nop8893 = alloca i1, i1 0
+  %nop8894 = alloca i1, i1 0
+  %nop8895 = alloca i1, i1 0
+  %nop8896 = alloca i1, i1 0
+  %nop8897 = alloca i1, i1 0
+  %nop8898 = alloca i1, i1 0
+  %nop8899 = alloca i1, i1 0
+  %nop8900 = alloca i1, i1 0
+  %nop8901 = alloca i1, i1 0
+  %nop8902 = alloca i1, i1 0
+  %nop8903 = alloca i1, i1 0
+  %nop8904 = alloca i1, i1 0
+  %nop8905 = alloca i1, i1 0
+  %nop8906 = alloca i1, i1 0
+  %nop8907 = alloca i1, i1 0
+  %nop8908 = alloca i1, i1 0
+  %nop8909 = alloca i1, i1 0
+  %nop8910 = alloca i1, i1 0
+  %nop8911 = alloca i1, i1 0
+  %nop8912 = alloca i1, i1 0
+  %nop8913 = alloca i1, i1 0
+  %nop8914 = alloca i1, i1 0
+  %nop8915 = alloca i1, i1 0
+  %nop8916 = alloca i1, i1 0
+  %nop8917 = alloca i1, i1 0
+  %nop8918 = alloca i1, i1 0
+  %nop8919 = alloca i1, i1 0
+  %nop8920 = alloca i1, i1 0
+  %nop8921 = alloca i1, i1 0
+  %nop8922 = alloca i1, i1 0
+  %nop8923 = alloca i1, i1 0
+  %nop8924 = alloca i1, i1 0
+  %nop8925 = alloca i1, i1 0
+  %nop8926 = alloca i1, i1 0
+  %nop8927 = alloca i1, i1 0
+  %nop8928 = alloca i1, i1 0
+  %nop8929 = alloca i1, i1 0
+  %nop8930 = alloca i1, i1 0
+  %nop8931 = alloca i1, i1 0
+  %nop8932 = alloca i1, i1 0
+  %nop8933 = alloca i1, i1 0
+  %nop8934 = alloca i1, i1 0
+  %nop8935 = alloca i1, i1 0
+  %nop8936 = alloca i1, i1 0
+  %nop8937 = alloca i1, i1 0
+  %nop8938 = alloca i1, i1 0
+  %nop8939 = alloca i1, i1 0
+  %nop8940 = alloca i1, i1 0
+  %nop8941 = alloca i1, i1 0
+  %nop8942 = alloca i1, i1 0
+  %nop8943 = alloca i1, i1 0
+  %nop8944 = alloca i1, i1 0
+  %nop8945 = alloca i1, i1 0
+  %nop8946 = alloca i1, i1 0
+  %nop8947 = alloca i1, i1 0
+  %nop8948 = alloca i1, i1 0
+  %nop8949 = alloca i1, i1 0
+  %nop8950 = alloca i1, i1 0
+  %nop8951 = alloca i1, i1 0
+  %nop8952 = alloca i1, i1 0
+  %nop8953 = alloca i1, i1 0
+  %nop8954 = alloca i1, i1 0
+  %nop8955 = alloca i1, i1 0
+  %nop8956 = alloca i1, i1 0
+  %nop8957 = alloca i1, i1 0
+  %nop8958 = alloca i1, i1 0
+  %nop8959 = alloca i1, i1 0
+  %nop8960 = alloca i1, i1 0
+  %nop8961 = alloca i1, i1 0
+  %nop8962 = alloca i1, i1 0
+  %nop8963 = alloca i1, i1 0
+  %nop8964 = alloca i1, i1 0
+  %nop8965 = alloca i1, i1 0
+  %nop8966 = alloca i1, i1 0
+  %nop8967 = alloca i1, i1 0
+  %nop8968 = alloca i1, i1 0
+  %nop8969 = alloca i1, i1 0
+  %nop8970 = alloca i1, i1 0
+  %nop8971 = alloca i1, i1 0
+  %nop8972 = alloca i1, i1 0
+  %nop8973 = alloca i1, i1 0
+  %nop8974 = alloca i1, i1 0
+  %nop8975 = alloca i1, i1 0
+  %nop8976 = alloca i1, i1 0
+  %nop8977 = alloca i1, i1 0
+  %nop8978 = alloca i1, i1 0
+  %nop8979 = alloca i1, i1 0
+  %nop8980 = alloca i1, i1 0
+  %nop8981 = alloca i1, i1 0
+  %nop8982 = alloca i1, i1 0
+  %nop8983 = alloca i1, i1 0
+  %nop8984 = alloca i1, i1 0
+  %nop8985 = alloca i1, i1 0
+  %nop8986 = alloca i1, i1 0
+  %nop8987 = alloca i1, i1 0
+  %nop8988 = alloca i1, i1 0
+  %nop8989 = alloca i1, i1 0
+  %nop8990 = alloca i1, i1 0
+  %nop8991 = alloca i1, i1 0
+  %nop8992 = alloca i1, i1 0
+  %nop8993 = alloca i1, i1 0
+  %nop8994 = alloca i1, i1 0
+  %nop8995 = alloca i1, i1 0
+  %nop8996 = alloca i1, i1 0
+  %nop8997 = alloca i1, i1 0
+  %nop8998 = alloca i1, i1 0
+  %nop8999 = alloca i1, i1 0
+  %nop9000 = alloca i1, i1 0
+  %nop9001 = alloca i1, i1 0
+  %nop9002 = alloca i1, i1 0
+  %nop9003 = alloca i1, i1 0
+  %nop9004 = alloca i1, i1 0
+  %nop9005 = alloca i1, i1 0
+  %nop9006 = alloca i1, i1 0
+  %nop9007 = alloca i1, i1 0
+  %nop9008 = alloca i1, i1 0
+  %nop9009 = alloca i1, i1 0
+  %nop9010 = alloca i1, i1 0
+  %nop9011 = alloca i1, i1 0
+  %nop9012 = alloca i1, i1 0
+  %nop9013 = alloca i1, i1 0
+  %nop9014 = alloca i1, i1 0
+  %nop9015 = alloca i1, i1 0
+  %nop9016 = alloca i1, i1 0
+  %nop9017 = alloca i1, i1 0
+  %nop9018 = alloca i1, i1 0
+  %nop9019 = alloca i1, i1 0
+  %nop9020 = alloca i1, i1 0
+  %nop9021 = alloca i1, i1 0
+  %nop9022 = alloca i1, i1 0
+  %nop9023 = alloca i1, i1 0
+  %nop9024 = alloca i1, i1 0
+  %nop9025 = alloca i1, i1 0
+  %nop9026 = alloca i1, i1 0
+  %nop9027 = alloca i1, i1 0
+  %nop9028 = alloca i1, i1 0
+  %nop9029 = alloca i1, i1 0
+  %nop9030 = alloca i1, i1 0
+  %nop9031 = alloca i1, i1 0
+  %nop9032 = alloca i1, i1 0
+  %nop9033 = alloca i1, i1 0
+  %nop9034 = alloca i1, i1 0
+  %nop9035 = alloca i1, i1 0
+  %nop9036 = alloca i1, i1 0
+  %nop9037 = alloca i1, i1 0
+  %nop9038 = alloca i1, i1 0
+  %nop9039 = alloca i1, i1 0
+  %nop9040 = alloca i1, i1 0
+  %nop9041 = alloca i1, i1 0
+  %nop9042 = alloca i1, i1 0
+  %nop9043 = alloca i1, i1 0
+  %nop9044 = alloca i1, i1 0
+  %nop9045 = alloca i1, i1 0
+  %nop9046 = alloca i1, i1 0
+  %nop9047 = alloca i1, i1 0
+  %nop9048 = alloca i1, i1 0
+  %nop9049 = alloca i1, i1 0
+  %nop9050 = alloca i1, i1 0
+  %nop9051 = alloca i1, i1 0
+  %nop9052 = alloca i1, i1 0
+  %nop9053 = alloca i1, i1 0
+  %nop9054 = alloca i1, i1 0
+  %nop9055 = alloca i1, i1 0
+  %nop9056 = alloca i1, i1 0
+  %nop9057 = alloca i1, i1 0
+  %nop9058 = alloca i1, i1 0
+  %nop9059 = alloca i1, i1 0
+  %nop9060 = alloca i1, i1 0
+  %nop9061 = alloca i1, i1 0
+  %nop9062 = alloca i1, i1 0
+  %nop9063 = alloca i1, i1 0
+  %nop9064 = alloca i1, i1 0
+  %nop9065 = alloca i1, i1 0
+  %nop9066 = alloca i1, i1 0
+  %nop9067 = alloca i1, i1 0
+  %nop9068 = alloca i1, i1 0
+  %nop9069 = alloca i1, i1 0
+  %nop9070 = alloca i1, i1 0
+  %nop9071 = alloca i1, i1 0
+  %nop9072 = alloca i1, i1 0
+  %nop9073 = alloca i1, i1 0
+  %nop9074 = alloca i1, i1 0
+  %nop9075 = alloca i1, i1 0
+  %nop9076 = alloca i1, i1 0
+  %nop9077 = alloca i1, i1 0
+  %nop9078 = alloca i1, i1 0
+  %nop9079 = alloca i1, i1 0
+  %nop9080 = alloca i1, i1 0
+  %nop9081 = alloca i1, i1 0
+  %nop9082 = alloca i1, i1 0
+  %nop9083 = alloca i1, i1 0
+  %nop9084 = alloca i1, i1 0
+  %nop9085 = alloca i1, i1 0
+  %nop9086 = alloca i1, i1 0
+  %nop9087 = alloca i1, i1 0
+  %nop9088 = alloca i1, i1 0
+  %nop9089 = alloca i1, i1 0
+  %nop9090 = alloca i1, i1 0
+  %nop9091 = alloca i1, i1 0
+  %nop9092 = alloca i1, i1 0
+  %nop9093 = alloca i1, i1 0
+  %nop9094 = alloca i1, i1 0
+  %nop9095 = alloca i1, i1 0
+  %nop9096 = alloca i1, i1 0
+  %nop9097 = alloca i1, i1 0
+  %nop9098 = alloca i1, i1 0
+  %nop9099 = alloca i1, i1 0
+  %nop9100 = alloca i1, i1 0
+  %nop9101 = alloca i1, i1 0
+  %nop9102 = alloca i1, i1 0
+  %nop9103 = alloca i1, i1 0
+  %nop9104 = alloca i1, i1 0
+  %nop9105 = alloca i1, i1 0
+  %nop9106 = alloca i1, i1 0
+  %nop9107 = alloca i1, i1 0
+  %nop9108 = alloca i1, i1 0
+  %nop9109 = alloca i1, i1 0
+  %nop9110 = alloca i1, i1 0
+  %nop9111 = alloca i1, i1 0
+  %nop9112 = alloca i1, i1 0
+  %nop9113 = alloca i1, i1 0
+  %nop9114 = alloca i1, i1 0
+  %nop9115 = alloca i1, i1 0
+  %nop9116 = alloca i1, i1 0
+  %nop9117 = alloca i1, i1 0
+  %nop9118 = alloca i1, i1 0
+  %nop9119 = alloca i1, i1 0
+  %nop9120 = alloca i1, i1 0
+  %nop9121 = alloca i1, i1 0
+  %nop9122 = alloca i1, i1 0
+  %nop9123 = alloca i1, i1 0
+  %nop9124 = alloca i1, i1 0
+  %nop9125 = alloca i1, i1 0
+  %nop9126 = alloca i1, i1 0
+  %nop9127 = alloca i1, i1 0
+  %nop9128 = alloca i1, i1 0
+  %nop9129 = alloca i1, i1 0
+  %nop9130 = alloca i1, i1 0
+  %nop9131 = alloca i1, i1 0
+  %nop9132 = alloca i1, i1 0
+  %nop9133 = alloca i1, i1 0
+  %nop9134 = alloca i1, i1 0
+  %nop9135 = alloca i1, i1 0
+  %nop9136 = alloca i1, i1 0
+  %nop9137 = alloca i1, i1 0
+  %nop9138 = alloca i1, i1 0
+  %nop9139 = alloca i1, i1 0
+  %nop9140 = alloca i1, i1 0
+  %nop9141 = alloca i1, i1 0
+  %nop9142 = alloca i1, i1 0
+  %nop9143 = alloca i1, i1 0
+  %nop9144 = alloca i1, i1 0
+  %nop9145 = alloca i1, i1 0
+  %nop9146 = alloca i1, i1 0
+  %nop9147 = alloca i1, i1 0
+  %nop9148 = alloca i1, i1 0
+  %nop9149 = alloca i1, i1 0
+  %nop9150 = alloca i1, i1 0
+  %nop9151 = alloca i1, i1 0
+  %nop9152 = alloca i1, i1 0
+  %nop9153 = alloca i1, i1 0
+  %nop9154 = alloca i1, i1 0
+  %nop9155 = alloca i1, i1 0
+  %nop9156 = alloca i1, i1 0
+  %nop9157 = alloca i1, i1 0
+  %nop9158 = alloca i1, i1 0
+  %nop9159 = alloca i1, i1 0
+  %nop9160 = alloca i1, i1 0
+  %nop9161 = alloca i1, i1 0
+  %nop9162 = alloca i1, i1 0
+  %nop9163 = alloca i1, i1 0
+  %nop9164 = alloca i1, i1 0
+  %nop9165 = alloca i1, i1 0
+  %nop9166 = alloca i1, i1 0
+  %nop9167 = alloca i1, i1 0
+  %nop9168 = alloca i1, i1 0
+  %nop9169 = alloca i1, i1 0
+  %nop9170 = alloca i1, i1 0
+  %nop9171 = alloca i1, i1 0
+  %nop9172 = alloca i1, i1 0
+  %nop9173 = alloca i1, i1 0
+  %nop9174 = alloca i1, i1 0
+  %nop9175 = alloca i1, i1 0
+  %nop9176 = alloca i1, i1 0
+  %nop9177 = alloca i1, i1 0
+  %nop9178 = alloca i1, i1 0
+  %nop9179 = alloca i1, i1 0
+  %nop9180 = alloca i1, i1 0
+  %nop9181 = alloca i1, i1 0
+  %nop9182 = alloca i1, i1 0
+  %nop9183 = alloca i1, i1 0
+  %nop9184 = alloca i1, i1 0
+  %nop9185 = alloca i1, i1 0
+  %nop9186 = alloca i1, i1 0
+  %nop9187 = alloca i1, i1 0
+  %nop9188 = alloca i1, i1 0
+  %nop9189 = alloca i1, i1 0
+  %nop9190 = alloca i1, i1 0
+  %nop9191 = alloca i1, i1 0
+  %nop9192 = alloca i1, i1 0
+  %nop9193 = alloca i1, i1 0
+  %nop9194 = alloca i1, i1 0
+  %nop9195 = alloca i1, i1 0
+  %nop9196 = alloca i1, i1 0
+  %nop9197 = alloca i1, i1 0
+  %nop9198 = alloca i1, i1 0
+  %nop9199 = alloca i1, i1 0
+  %nop9200 = alloca i1, i1 0
+  %nop9201 = alloca i1, i1 0
+  %nop9202 = alloca i1, i1 0
+  %nop9203 = alloca i1, i1 0
+  %nop9204 = alloca i1, i1 0
+  %nop9205 = alloca i1, i1 0
+  %nop9206 = alloca i1, i1 0
+  %nop9207 = alloca i1, i1 0
+  %nop9208 = alloca i1, i1 0
+  %nop9209 = alloca i1, i1 0
+  %nop9210 = alloca i1, i1 0
+  %nop9211 = alloca i1, i1 0
+  %nop9212 = alloca i1, i1 0
+  %nop9213 = alloca i1, i1 0
+  %nop9214 = alloca i1, i1 0
+  %nop9215 = alloca i1, i1 0
+  %nop9216 = alloca i1, i1 0
+  %nop9217 = alloca i1, i1 0
+  %nop9218 = alloca i1, i1 0
+  %nop9219 = alloca i1, i1 0
+  %nop9220 = alloca i1, i1 0
+  %nop9221 = alloca i1, i1 0
+  %nop9222 = alloca i1, i1 0
+  %nop9223 = alloca i1, i1 0
+  %nop9224 = alloca i1, i1 0
+  %nop9225 = alloca i1, i1 0
+  %nop9226 = alloca i1, i1 0
+  %nop9227 = alloca i1, i1 0
+  %nop9228 = alloca i1, i1 0
+  %nop9229 = alloca i1, i1 0
+  %nop9230 = alloca i1, i1 0
+  %nop9231 = alloca i1, i1 0
+  %nop9232 = alloca i1, i1 0
+  %nop9233 = alloca i1, i1 0
+  %nop9234 = alloca i1, i1 0
+  %nop9235 = alloca i1, i1 0
+  %nop9236 = alloca i1, i1 0
+  %nop9237 = alloca i1, i1 0
+  %nop9238 = alloca i1, i1 0
+  %nop9239 = alloca i1, i1 0
+  %nop9240 = alloca i1, i1 0
+  %nop9241 = alloca i1, i1 0
+  %nop9242 = alloca i1, i1 0
+  %nop9243 = alloca i1, i1 0
+  %nop9244 = alloca i1, i1 0
+  %nop9245 = alloca i1, i1 0
+  %nop9246 = alloca i1, i1 0
+  %nop9247 = alloca i1, i1 0
+  %nop9248 = alloca i1, i1 0
+  %nop9249 = alloca i1, i1 0
+  %nop9250 = alloca i1, i1 0
+  %nop9251 = alloca i1, i1 0
+  %nop9252 = alloca i1, i1 0
+  %nop9253 = alloca i1, i1 0
+  %nop9254 = alloca i1, i1 0
+  %nop9255 = alloca i1, i1 0
+  %nop9256 = alloca i1, i1 0
+  %nop9257 = alloca i1, i1 0
+  %nop9258 = alloca i1, i1 0
+  %nop9259 = alloca i1, i1 0
+  %nop9260 = alloca i1, i1 0
+  %nop9261 = alloca i1, i1 0
+  %nop9262 = alloca i1, i1 0
+  %nop9263 = alloca i1, i1 0
+  %nop9264 = alloca i1, i1 0
+  %nop9265 = alloca i1, i1 0
+  %nop9266 = alloca i1, i1 0
+  %nop9267 = alloca i1, i1 0
+  %nop9268 = alloca i1, i1 0
+  %nop9269 = alloca i1, i1 0
+  %nop9270 = alloca i1, i1 0
+  %nop9271 = alloca i1, i1 0
+  %nop9272 = alloca i1, i1 0
+  %nop9273 = alloca i1, i1 0
+  %nop9274 = alloca i1, i1 0
+  %nop9275 = alloca i1, i1 0
+  %nop9276 = alloca i1, i1 0
+  %nop9277 = alloca i1, i1 0
+  %nop9278 = alloca i1, i1 0
+  %nop9279 = alloca i1, i1 0
+  %nop9280 = alloca i1, i1 0
+  %nop9281 = alloca i1, i1 0
+  %nop9282 = alloca i1, i1 0
+  %nop9283 = alloca i1, i1 0
+  %nop9284 = alloca i1, i1 0
+  %nop9285 = alloca i1, i1 0
+  %nop9286 = alloca i1, i1 0
+  %nop9287 = alloca i1, i1 0
+  %nop9288 = alloca i1, i1 0
+  %nop9289 = alloca i1, i1 0
+  %nop9290 = alloca i1, i1 0
+  %nop9291 = alloca i1, i1 0
+  %nop9292 = alloca i1, i1 0
+  %nop9293 = alloca i1, i1 0
+  %nop9294 = alloca i1, i1 0
+  %nop9295 = alloca i1, i1 0
+  %nop9296 = alloca i1, i1 0
+  %nop9297 = alloca i1, i1 0
+  %nop9298 = alloca i1, i1 0
+  %nop9299 = alloca i1, i1 0
+  %nop9300 = alloca i1, i1 0
+  %nop9301 = alloca i1, i1 0
+  %nop9302 = alloca i1, i1 0
+  %nop9303 = alloca i1, i1 0
+  %nop9304 = alloca i1, i1 0
+  %nop9305 = alloca i1, i1 0
+  %nop9306 = alloca i1, i1 0
+  %nop9307 = alloca i1, i1 0
+  %nop9308 = alloca i1, i1 0
+  %nop9309 = alloca i1, i1 0
+  %nop9310 = alloca i1, i1 0
+  %nop9311 = alloca i1, i1 0
+  %nop9312 = alloca i1, i1 0
+  %nop9313 = alloca i1, i1 0
+  %nop9314 = alloca i1, i1 0
+  %nop9315 = alloca i1, i1 0
+  %nop9316 = alloca i1, i1 0
+  %nop9317 = alloca i1, i1 0
+  %nop9318 = alloca i1, i1 0
+  %nop9319 = alloca i1, i1 0
+  %nop9320 = alloca i1, i1 0
+  %nop9321 = alloca i1, i1 0
+  %nop9322 = alloca i1, i1 0
+  %nop9323 = alloca i1, i1 0
+  %nop9324 = alloca i1, i1 0
+  %nop9325 = alloca i1, i1 0
+  %nop9326 = alloca i1, i1 0
+  %nop9327 = alloca i1, i1 0
+  %nop9328 = alloca i1, i1 0
+  %nop9329 = alloca i1, i1 0
+  %nop9330 = alloca i1, i1 0
+  %nop9331 = alloca i1, i1 0
+  %nop9332 = alloca i1, i1 0
+  %nop9333 = alloca i1, i1 0
+  %nop9334 = alloca i1, i1 0
+  %nop9335 = alloca i1, i1 0
+  %nop9336 = alloca i1, i1 0
+  %nop9337 = alloca i1, i1 0
+  %nop9338 = alloca i1, i1 0
+  %nop9339 = alloca i1, i1 0
+  %nop9340 = alloca i1, i1 0
+  %nop9341 = alloca i1, i1 0
+  %nop9342 = alloca i1, i1 0
+  %nop9343 = alloca i1, i1 0
+  %nop9344 = alloca i1, i1 0
+  %nop9345 = alloca i1, i1 0
+  %nop9346 = alloca i1, i1 0
+  %nop9347 = alloca i1, i1 0
+  %nop9348 = alloca i1, i1 0
+  %nop9349 = alloca i1, i1 0
+  %nop9350 = alloca i1, i1 0
+  %nop9351 = alloca i1, i1 0
+  %nop9352 = alloca i1, i1 0
+  %nop9353 = alloca i1, i1 0
+  %nop9354 = alloca i1, i1 0
+  %nop9355 = alloca i1, i1 0
+  %nop9356 = alloca i1, i1 0
+  %nop9357 = alloca i1, i1 0
+  %nop9358 = alloca i1, i1 0
+  %nop9359 = alloca i1, i1 0
+  %nop9360 = alloca i1, i1 0
+  %nop9361 = alloca i1, i1 0
+  %nop9362 = alloca i1, i1 0
+  %nop9363 = alloca i1, i1 0
+  %nop9364 = alloca i1, i1 0
+  %nop9365 = alloca i1, i1 0
+  %nop9366 = alloca i1, i1 0
+  %nop9367 = alloca i1, i1 0
+  %nop9368 = alloca i1, i1 0
+  %nop9369 = alloca i1, i1 0
+  %nop9370 = alloca i1, i1 0
+  %nop9371 = alloca i1, i1 0
+  %nop9372 = alloca i1, i1 0
+  %nop9373 = alloca i1, i1 0
+  %nop9374 = alloca i1, i1 0
+  %nop9375 = alloca i1, i1 0
+  %nop9376 = alloca i1, i1 0
+  %nop9377 = alloca i1, i1 0
+  %nop9378 = alloca i1, i1 0
+  %nop9379 = alloca i1, i1 0
+  %nop9380 = alloca i1, i1 0
+  %nop9381 = alloca i1, i1 0
+  %nop9382 = alloca i1, i1 0
+  %nop9383 = alloca i1, i1 0
+  %nop9384 = alloca i1, i1 0
+  %nop9385 = alloca i1, i1 0
+  %nop9386 = alloca i1, i1 0
+  %nop9387 = alloca i1, i1 0
+  %nop9388 = alloca i1, i1 0
+  %nop9389 = alloca i1, i1 0
+  %nop9390 = alloca i1, i1 0
+  %nop9391 = alloca i1, i1 0
+  %nop9392 = alloca i1, i1 0
+  %nop9393 = alloca i1, i1 0
+  %nop9394 = alloca i1, i1 0
+  %nop9395 = alloca i1, i1 0
+  %nop9396 = alloca i1, i1 0
+  %nop9397 = alloca i1, i1 0
+  %nop9398 = alloca i1, i1 0
+  %nop9399 = alloca i1, i1 0
+  %nop9400 = alloca i1, i1 0
+  %nop9401 = alloca i1, i1 0
+  %nop9402 = alloca i1, i1 0
+  %nop9403 = alloca i1, i1 0
+  %nop9404 = alloca i1, i1 0
+  %nop9405 = alloca i1, i1 0
+  %nop9406 = alloca i1, i1 0
+  %nop9407 = alloca i1, i1 0
+  %nop9408 = alloca i1, i1 0
+  %nop9409 = alloca i1, i1 0
+  %nop9410 = alloca i1, i1 0
+  %nop9411 = alloca i1, i1 0
+  %nop9412 = alloca i1, i1 0
+  %nop9413 = alloca i1, i1 0
+  %nop9414 = alloca i1, i1 0
+  %nop9415 = alloca i1, i1 0
+  %nop9416 = alloca i1, i1 0
+  %nop9417 = alloca i1, i1 0
+  %nop9418 = alloca i1, i1 0
+  %nop9419 = alloca i1, i1 0
+  %nop9420 = alloca i1, i1 0
+  %nop9421 = alloca i1, i1 0
+  %nop9422 = alloca i1, i1 0
+  %nop9423 = alloca i1, i1 0
+  %nop9424 = alloca i1, i1 0
+  %nop9425 = alloca i1, i1 0
+  %nop9426 = alloca i1, i1 0
+  %nop9427 = alloca i1, i1 0
+  %nop9428 = alloca i1, i1 0
+  %nop9429 = alloca i1, i1 0
+  %nop9430 = alloca i1, i1 0
+  %nop9431 = alloca i1, i1 0
+  %nop9432 = alloca i1, i1 0
+  %nop9433 = alloca i1, i1 0
+  %nop9434 = alloca i1, i1 0
+  %nop9435 = alloca i1, i1 0
+  %nop9436 = alloca i1, i1 0
+  %nop9437 = alloca i1, i1 0
+  %nop9438 = alloca i1, i1 0
+  %nop9439 = alloca i1, i1 0
+  %nop9440 = alloca i1, i1 0
+  %nop9441 = alloca i1, i1 0
+  %nop9442 = alloca i1, i1 0
+  %nop9443 = alloca i1, i1 0
+  %nop9444 = alloca i1, i1 0
+  %nop9445 = alloca i1, i1 0
+  %nop9446 = alloca i1, i1 0
+  %nop9447 = alloca i1, i1 0
+  %nop9448 = alloca i1, i1 0
+  %nop9449 = alloca i1, i1 0
+  %nop9450 = alloca i1, i1 0
+  %nop9451 = alloca i1, i1 0
+  %nop9452 = alloca i1, i1 0
+  %nop9453 = alloca i1, i1 0
+  %nop9454 = alloca i1, i1 0
+  %nop9455 = alloca i1, i1 0
+  %nop9456 = alloca i1, i1 0
+  %nop9457 = alloca i1, i1 0
+  %nop9458 = alloca i1, i1 0
+  %nop9459 = alloca i1, i1 0
+  %nop9460 = alloca i1, i1 0
+  %nop9461 = alloca i1, i1 0
+  %nop9462 = alloca i1, i1 0
+  %nop9463 = alloca i1, i1 0
+  %nop9464 = alloca i1, i1 0
+  %nop9465 = alloca i1, i1 0
+  %nop9466 = alloca i1, i1 0
+  %nop9467 = alloca i1, i1 0
+  %nop9468 = alloca i1, i1 0
+  %nop9469 = alloca i1, i1 0
+  %nop9470 = alloca i1, i1 0
+  %nop9471 = alloca i1, i1 0
+  %nop9472 = alloca i1, i1 0
+  %nop9473 = alloca i1, i1 0
+  %nop9474 = alloca i1, i1 0
+  %nop9475 = alloca i1, i1 0
+  %nop9476 = alloca i1, i1 0
+  %nop9477 = alloca i1, i1 0
+  %nop9478 = alloca i1, i1 0
+  %nop9479 = alloca i1, i1 0
+  %nop9480 = alloca i1, i1 0
+  %nop9481 = alloca i1, i1 0
+  %nop9482 = alloca i1, i1 0
+  %nop9483 = alloca i1, i1 0
+  %nop9484 = alloca i1, i1 0
+  %nop9485 = alloca i1, i1 0
+  %nop9486 = alloca i1, i1 0
+  %nop9487 = alloca i1, i1 0
+  %nop9488 = alloca i1, i1 0
+  %nop9489 = alloca i1, i1 0
+  %nop9490 = alloca i1, i1 0
+  %nop9491 = alloca i1, i1 0
+  %nop9492 = alloca i1, i1 0
+  %nop9493 = alloca i1, i1 0
+  %nop9494 = alloca i1, i1 0
+  %nop9495 = alloca i1, i1 0
+  %nop9496 = alloca i1, i1 0
+  %nop9497 = alloca i1, i1 0
+  %nop9498 = alloca i1, i1 0
+  %nop9499 = alloca i1, i1 0
+  %nop9500 = alloca i1, i1 0
+  %nop9501 = alloca i1, i1 0
+  %nop9502 = alloca i1, i1 0
+  %nop9503 = alloca i1, i1 0
+  %nop9504 = alloca i1, i1 0
+  %nop9505 = alloca i1, i1 0
+  %nop9506 = alloca i1, i1 0
+  %nop9507 = alloca i1, i1 0
+  %nop9508 = alloca i1, i1 0
+  %nop9509 = alloca i1, i1 0
+  %nop9510 = alloca i1, i1 0
+  %nop9511 = alloca i1, i1 0
+  %nop9512 = alloca i1, i1 0
+  %nop9513 = alloca i1, i1 0
+  %nop9514 = alloca i1, i1 0
+  %nop9515 = alloca i1, i1 0
+  %nop9516 = alloca i1, i1 0
+  %nop9517 = alloca i1, i1 0
+  %nop9518 = alloca i1, i1 0
+  %nop9519 = alloca i1, i1 0
+  %nop9520 = alloca i1, i1 0
+  %nop9521 = alloca i1, i1 0
+  %nop9522 = alloca i1, i1 0
+  %nop9523 = alloca i1, i1 0
+  %nop9524 = alloca i1, i1 0
+  %nop9525 = alloca i1, i1 0
+  %nop9526 = alloca i1, i1 0
+  %nop9527 = alloca i1, i1 0
+  %nop9528 = alloca i1, i1 0
+  %nop9529 = alloca i1, i1 0
+  %nop9530 = alloca i1, i1 0
+  %nop9531 = alloca i1, i1 0
+  %nop9532 = alloca i1, i1 0
+  %nop9533 = alloca i1, i1 0
+  %nop9534 = alloca i1, i1 0
+  %nop9535 = alloca i1, i1 0
+  %nop9536 = alloca i1, i1 0
+  %nop9537 = alloca i1, i1 0
+  %nop9538 = alloca i1, i1 0
+  %nop9539 = alloca i1, i1 0
+  %nop9540 = alloca i1, i1 0
+  %nop9541 = alloca i1, i1 0
+  %nop9542 = alloca i1, i1 0
+  %nop9543 = alloca i1, i1 0
+  %nop9544 = alloca i1, i1 0
+  %nop9545 = alloca i1, i1 0
+  %nop9546 = alloca i1, i1 0
+  %nop9547 = alloca i1, i1 0
+  %nop9548 = alloca i1, i1 0
+  %nop9549 = alloca i1, i1 0
+  %nop9550 = alloca i1, i1 0
+  %nop9551 = alloca i1, i1 0
+  %nop9552 = alloca i1, i1 0
+  %nop9553 = alloca i1, i1 0
+  %nop9554 = alloca i1, i1 0
+  %nop9555 = alloca i1, i1 0
+  %nop9556 = alloca i1, i1 0
+  %nop9557 = alloca i1, i1 0
+  %nop9558 = alloca i1, i1 0
+  %nop9559 = alloca i1, i1 0
+  %nop9560 = alloca i1, i1 0
+  %nop9561 = alloca i1, i1 0
+  %nop9562 = alloca i1, i1 0
+  %nop9563 = alloca i1, i1 0
+  %nop9564 = alloca i1, i1 0
+  %nop9565 = alloca i1, i1 0
+  %nop9566 = alloca i1, i1 0
+  %nop9567 = alloca i1, i1 0
+  %nop9568 = alloca i1, i1 0
+  %nop9569 = alloca i1, i1 0
+  %nop9570 = alloca i1, i1 0
+  %nop9571 = alloca i1, i1 0
+  %nop9572 = alloca i1, i1 0
+  %nop9573 = alloca i1, i1 0
+  %nop9574 = alloca i1, i1 0
+  %nop9575 = alloca i1, i1 0
+  %nop9576 = alloca i1, i1 0
+  %nop9577 = alloca i1, i1 0
+  %nop9578 = alloca i1, i1 0
+  %nop9579 = alloca i1, i1 0
+  %nop9580 = alloca i1, i1 0
+  %nop9581 = alloca i1, i1 0
+  %nop9582 = alloca i1, i1 0
+  %nop9583 = alloca i1, i1 0
+  %nop9584 = alloca i1, i1 0
+  %nop9585 = alloca i1, i1 0
+  %nop9586 = alloca i1, i1 0
+  %nop9587 = alloca i1, i1 0
+  %nop9588 = alloca i1, i1 0
+  %nop9589 = alloca i1, i1 0
+  %nop9590 = alloca i1, i1 0
+  %nop9591 = alloca i1, i1 0
+  %nop9592 = alloca i1, i1 0
+  %nop9593 = alloca i1, i1 0
+  %nop9594 = alloca i1, i1 0
+  %nop9595 = alloca i1, i1 0
+  %nop9596 = alloca i1, i1 0
+  %nop9597 = alloca i1, i1 0
+  %nop9598 = alloca i1, i1 0
+  %nop9599 = alloca i1, i1 0
+  %nop9600 = alloca i1, i1 0
+  %nop9601 = alloca i1, i1 0
+  %nop9602 = alloca i1, i1 0
+  %nop9603 = alloca i1, i1 0
+  %nop9604 = alloca i1, i1 0
+  %nop9605 = alloca i1, i1 0
+  %nop9606 = alloca i1, i1 0
+  %nop9607 = alloca i1, i1 0
+  %nop9608 = alloca i1, i1 0
+  %nop9609 = alloca i1, i1 0
+  %nop9610 = alloca i1, i1 0
+  %nop9611 = alloca i1, i1 0
+  %nop9612 = alloca i1, i1 0
+  %nop9613 = alloca i1, i1 0
+  %nop9614 = alloca i1, i1 0
+  %nop9615 = alloca i1, i1 0
+  %nop9616 = alloca i1, i1 0
+  %nop9617 = alloca i1, i1 0
+  %nop9618 = alloca i1, i1 0
+  %nop9619 = alloca i1, i1 0
+  %nop9620 = alloca i1, i1 0
+  %nop9621 = alloca i1, i1 0
+  %nop9622 = alloca i1, i1 0
+  %nop9623 = alloca i1, i1 0
+  %nop9624 = alloca i1, i1 0
+  %nop9625 = alloca i1, i1 0
+  %nop9626 = alloca i1, i1 0
+  %nop9627 = alloca i1, i1 0
+  %nop9628 = alloca i1, i1 0
+  %nop9629 = alloca i1, i1 0
+  %nop9630 = alloca i1, i1 0
+  %nop9631 = alloca i1, i1 0
+  %nop9632 = alloca i1, i1 0
+  %nop9633 = alloca i1, i1 0
+  %nop9634 = alloca i1, i1 0
+  %nop9635 = alloca i1, i1 0
+  %nop9636 = alloca i1, i1 0
+  %nop9637 = alloca i1, i1 0
+  %nop9638 = alloca i1, i1 0
+  %nop9639 = alloca i1, i1 0
+  %nop9640 = alloca i1, i1 0
+  %nop9641 = alloca i1, i1 0
+  %nop9642 = alloca i1, i1 0
+  %nop9643 = alloca i1, i1 0
+  %nop9644 = alloca i1, i1 0
+  %nop9645 = alloca i1, i1 0
+  %nop9646 = alloca i1, i1 0
+  %nop9647 = alloca i1, i1 0
+  %nop9648 = alloca i1, i1 0
+  %nop9649 = alloca i1, i1 0
+  %nop9650 = alloca i1, i1 0
+  %nop9651 = alloca i1, i1 0
+  %nop9652 = alloca i1, i1 0
+  %nop9653 = alloca i1, i1 0
+  %nop9654 = alloca i1, i1 0
+  %nop9655 = alloca i1, i1 0
+  %nop9656 = alloca i1, i1 0
+  %nop9657 = alloca i1, i1 0
+  %nop9658 = alloca i1, i1 0
+  %nop9659 = alloca i1, i1 0
+  %nop9660 = alloca i1, i1 0
+  %nop9661 = alloca i1, i1 0
+  %nop9662 = alloca i1, i1 0
+  %nop9663 = alloca i1, i1 0
+  %nop9664 = alloca i1, i1 0
+  %nop9665 = alloca i1, i1 0
+  %nop9666 = alloca i1, i1 0
+  %nop9667 = alloca i1, i1 0
+  %nop9668 = alloca i1, i1 0
+  %nop9669 = alloca i1, i1 0
+  %nop9670 = alloca i1, i1 0
+  %nop9671 = alloca i1, i1 0
+  %nop9672 = alloca i1, i1 0
+  %nop9673 = alloca i1, i1 0
+  %nop9674 = alloca i1, i1 0
+  %nop9675 = alloca i1, i1 0
+  %nop9676 = alloca i1, i1 0
+  %nop9677 = alloca i1, i1 0
+  %nop9678 = alloca i1, i1 0
+  %nop9679 = alloca i1, i1 0
+  %nop9680 = alloca i1, i1 0
+  %nop9681 = alloca i1, i1 0
+  %nop9682 = alloca i1, i1 0
+  %nop9683 = alloca i1, i1 0
+  %nop9684 = alloca i1, i1 0
+  %nop9685 = alloca i1, i1 0
+  %nop9686 = alloca i1, i1 0
+  %nop9687 = alloca i1, i1 0
+  %nop9688 = alloca i1, i1 0
+  %nop9689 = alloca i1, i1 0
+  %nop9690 = alloca i1, i1 0
+  %nop9691 = alloca i1, i1 0
+  %nop9692 = alloca i1, i1 0
+  %nop9693 = alloca i1, i1 0
+  %nop9694 = alloca i1, i1 0
+  %nop9695 = alloca i1, i1 0
+  %nop9696 = alloca i1, i1 0
+  %nop9697 = alloca i1, i1 0
+  %nop9698 = alloca i1, i1 0
+  %nop9699 = alloca i1, i1 0
+  %nop9700 = alloca i1, i1 0
+  %nop9701 = alloca i1, i1 0
+  %nop9702 = alloca i1, i1 0
+  %nop9703 = alloca i1, i1 0
+  %nop9704 = alloca i1, i1 0
+  %nop9705 = alloca i1, i1 0
+  %nop9706 = alloca i1, i1 0
+  %nop9707 = alloca i1, i1 0
+  %nop9708 = alloca i1, i1 0
+  %nop9709 = alloca i1, i1 0
+  %nop9710 = alloca i1, i1 0
+  %nop9711 = alloca i1, i1 0
+  %nop9712 = alloca i1, i1 0
+  %nop9713 = alloca i1, i1 0
+  %nop9714 = alloca i1, i1 0
+  %nop9715 = alloca i1, i1 0
+  %nop9716 = alloca i1, i1 0
+  %nop9717 = alloca i1, i1 0
+  %nop9718 = alloca i1, i1 0
+  %nop9719 = alloca i1, i1 0
+  %nop9720 = alloca i1, i1 0
+  %nop9721 = alloca i1, i1 0
+  %nop9722 = alloca i1, i1 0
+  %nop9723 = alloca i1, i1 0
+  %nop9724 = alloca i1, i1 0
+  %nop9725 = alloca i1, i1 0
+  %nop9726 = alloca i1, i1 0
+  %nop9727 = alloca i1, i1 0
+  %nop9728 = alloca i1, i1 0
+  %nop9729 = alloca i1, i1 0
+  %nop9730 = alloca i1, i1 0
+  %nop9731 = alloca i1, i1 0
+  %nop9732 = alloca i1, i1 0
+  %nop9733 = alloca i1, i1 0
+  %nop9734 = alloca i1, i1 0
+  %nop9735 = alloca i1, i1 0
+  %nop9736 = alloca i1, i1 0
+  %nop9737 = alloca i1, i1 0
+  %nop9738 = alloca i1, i1 0
+  %nop9739 = alloca i1, i1 0
+  %nop9740 = alloca i1, i1 0
+  %nop9741 = alloca i1, i1 0
+  %nop9742 = alloca i1, i1 0
+  %nop9743 = alloca i1, i1 0
+  %nop9744 = alloca i1, i1 0
+  %nop9745 = alloca i1, i1 0
+  %nop9746 = alloca i1, i1 0
+  %nop9747 = alloca i1, i1 0
+  %nop9748 = alloca i1, i1 0
+  %nop9749 = alloca i1, i1 0
+  %nop9750 = alloca i1, i1 0
+  %nop9751 = alloca i1, i1 0
+  %nop9752 = alloca i1, i1 0
+  %nop9753 = alloca i1, i1 0
+  %nop9754 = alloca i1, i1 0
+  %nop9755 = alloca i1, i1 0
+  %nop9756 = alloca i1, i1 0
+  %nop9757 = alloca i1, i1 0
+  %nop9758 = alloca i1, i1 0
+  %nop9759 = alloca i1, i1 0
+  %nop9760 = alloca i1, i1 0
+  %nop9761 = alloca i1, i1 0
+  %nop9762 = alloca i1, i1 0
+  %nop9763 = alloca i1, i1 0
+  %nop9764 = alloca i1, i1 0
+  %nop9765 = alloca i1, i1 0
+  %nop9766 = alloca i1, i1 0
+  %nop9767 = alloca i1, i1 0
+  %nop9768 = alloca i1, i1 0
+  %nop9769 = alloca i1, i1 0
+  %nop9770 = alloca i1, i1 0
+  %nop9771 = alloca i1, i1 0
+  %nop9772 = alloca i1, i1 0
+  %nop9773 = alloca i1, i1 0
+  %nop9774 = alloca i1, i1 0
+  %nop9775 = alloca i1, i1 0
+  %nop9776 = alloca i1, i1 0
+  %nop9777 = alloca i1, i1 0
+  %nop9778 = alloca i1, i1 0
+  %nop9779 = alloca i1, i1 0
+  %nop9780 = alloca i1, i1 0
+  %nop9781 = alloca i1, i1 0
+  %nop9782 = alloca i1, i1 0
+  %nop9783 = alloca i1, i1 0
+  %nop9784 = alloca i1, i1 0
+  %nop9785 = alloca i1, i1 0
+  %nop9786 = alloca i1, i1 0
+  %nop9787 = alloca i1, i1 0
+  %nop9788 = alloca i1, i1 0
+  %nop9789 = alloca i1, i1 0
+  %nop9790 = alloca i1, i1 0
+  %nop9791 = alloca i1, i1 0
+  %nop9792 = alloca i1, i1 0
+  %nop9793 = alloca i1, i1 0
+  %nop9794 = alloca i1, i1 0
+  %nop9795 = alloca i1, i1 0
+  %nop9796 = alloca i1, i1 0
+  %nop9797 = alloca i1, i1 0
+  %nop9798 = alloca i1, i1 0
+  %nop9799 = alloca i1, i1 0
+  %nop9800 = alloca i1, i1 0
+  %nop9801 = alloca i1, i1 0
+  %nop9802 = alloca i1, i1 0
+  %nop9803 = alloca i1, i1 0
+  %nop9804 = alloca i1, i1 0
+  %nop9805 = alloca i1, i1 0
+  %nop9806 = alloca i1, i1 0
+  %nop9807 = alloca i1, i1 0
+  %nop9808 = alloca i1, i1 0
+  %nop9809 = alloca i1, i1 0
+  %nop9810 = alloca i1, i1 0
+  %nop9811 = alloca i1, i1 0
+  %nop9812 = alloca i1, i1 0
+  %nop9813 = alloca i1, i1 0
+  %nop9814 = alloca i1, i1 0
+  %nop9815 = alloca i1, i1 0
+  %nop9816 = alloca i1, i1 0
+  %nop9817 = alloca i1, i1 0
+  %nop9818 = alloca i1, i1 0
+  %nop9819 = alloca i1, i1 0
+  %nop9820 = alloca i1, i1 0
+  %nop9821 = alloca i1, i1 0
+  %nop9822 = alloca i1, i1 0
+  %nop9823 = alloca i1, i1 0
+  %nop9824 = alloca i1, i1 0
+  %nop9825 = alloca i1, i1 0
+  %nop9826 = alloca i1, i1 0
+  %nop9827 = alloca i1, i1 0
+  %nop9828 = alloca i1, i1 0
+  %nop9829 = alloca i1, i1 0
+  %nop9830 = alloca i1, i1 0
+  %nop9831 = alloca i1, i1 0
+  %nop9832 = alloca i1, i1 0
+  %nop9833 = alloca i1, i1 0
+  %nop9834 = alloca i1, i1 0
+  %nop9835 = alloca i1, i1 0
+  %nop9836 = alloca i1, i1 0
+  %nop9837 = alloca i1, i1 0
+  %nop9838 = alloca i1, i1 0
+  %nop9839 = alloca i1, i1 0
+  %nop9840 = alloca i1, i1 0
+  %nop9841 = alloca i1, i1 0
+  %nop9842 = alloca i1, i1 0
+  %nop9843 = alloca i1, i1 0
+  %nop9844 = alloca i1, i1 0
+  %nop9845 = alloca i1, i1 0
+  %nop9846 = alloca i1, i1 0
+  %nop9847 = alloca i1, i1 0
+  %nop9848 = alloca i1, i1 0
+  %nop9849 = alloca i1, i1 0
+  %nop9850 = alloca i1, i1 0
+  %nop9851 = alloca i1, i1 0
+  %nop9852 = alloca i1, i1 0
+  %nop9853 = alloca i1, i1 0
+  %nop9854 = alloca i1, i1 0
+  %nop9855 = alloca i1, i1 0
+  %nop9856 = alloca i1, i1 0
+  %nop9857 = alloca i1, i1 0
+  %nop9858 = alloca i1, i1 0
+  %nop9859 = alloca i1, i1 0
+  %nop9860 = alloca i1, i1 0
+  %nop9861 = alloca i1, i1 0
+  %nop9862 = alloca i1, i1 0
+  %nop9863 = alloca i1, i1 0
+  %nop9864 = alloca i1, i1 0
+  %nop9865 = alloca i1, i1 0
+  %nop9866 = alloca i1, i1 0
+  %nop9867 = alloca i1, i1 0
+  %nop9868 = alloca i1, i1 0
+  %nop9869 = alloca i1, i1 0
+  %nop9870 = alloca i1, i1 0
+  %nop9871 = alloca i1, i1 0
+  %nop9872 = alloca i1, i1 0
+  %nop9873 = alloca i1, i1 0
+  %nop9874 = alloca i1, i1 0
+  %nop9875 = alloca i1, i1 0
+  %nop9876 = alloca i1, i1 0
+  %nop9877 = alloca i1, i1 0
+  %nop9878 = alloca i1, i1 0
+  %nop9879 = alloca i1, i1 0
+  %nop9880 = alloca i1, i1 0
+  %nop9881 = alloca i1, i1 0
+  %nop9882 = alloca i1, i1 0
+  %nop9883 = alloca i1, i1 0
+  %nop9884 = alloca i1, i1 0
+  %nop9885 = alloca i1, i1 0
+  %nop9886 = alloca i1, i1 0
+  %nop9887 = alloca i1, i1 0
+  %nop9888 = alloca i1, i1 0
+  %nop9889 = alloca i1, i1 0
+  %nop9890 = alloca i1, i1 0
+  %nop9891 = alloca i1, i1 0
+  %nop9892 = alloca i1, i1 0
+  %nop9893 = alloca i1, i1 0
+  %nop9894 = alloca i1, i1 0
+  %nop9895 = alloca i1, i1 0
+  %nop9896 = alloca i1, i1 0
+  %nop9897 = alloca i1, i1 0
+  %nop9898 = alloca i1, i1 0
+  %nop9899 = alloca i1, i1 0
+  %nop9900 = alloca i1, i1 0
+  %nop9901 = alloca i1, i1 0
+  %nop9902 = alloca i1, i1 0
+  %nop9903 = alloca i1, i1 0
+  %nop9904 = alloca i1, i1 0
+  %nop9905 = alloca i1, i1 0
+  %nop9906 = alloca i1, i1 0
+  %nop9907 = alloca i1, i1 0
+  %nop9908 = alloca i1, i1 0
+  %nop9909 = alloca i1, i1 0
+  %nop9910 = alloca i1, i1 0
+  %nop9911 = alloca i1, i1 0
+  %nop9912 = alloca i1, i1 0
+  %nop9913 = alloca i1, i1 0
+  %nop9914 = alloca i1, i1 0
+  %nop9915 = alloca i1, i1 0
+  %nop9916 = alloca i1, i1 0
+  %nop9917 = alloca i1, i1 0
+  %nop9918 = alloca i1, i1 0
+  %nop9919 = alloca i1, i1 0
+  %nop9920 = alloca i1, i1 0
+  %nop9921 = alloca i1, i1 0
+  %nop9922 = alloca i1, i1 0
+  %nop9923 = alloca i1, i1 0
+  %nop9924 = alloca i1, i1 0
+  %nop9925 = alloca i1, i1 0
+  %nop9926 = alloca i1, i1 0
+  %nop9927 = alloca i1, i1 0
+  %nop9928 = alloca i1, i1 0
+  %nop9929 = alloca i1, i1 0
+  %nop9930 = alloca i1, i1 0
+  %nop9931 = alloca i1, i1 0
+  %nop9932 = alloca i1, i1 0
+  %nop9933 = alloca i1, i1 0
+  %nop9934 = alloca i1, i1 0
+  %nop9935 = alloca i1, i1 0
+  %nop9936 = alloca i1, i1 0
+  %nop9937 = alloca i1, i1 0
+  %nop9938 = alloca i1, i1 0
+  %nop9939 = alloca i1, i1 0
+  %nop9940 = alloca i1, i1 0
+  %nop9941 = alloca i1, i1 0
+  %nop9942 = alloca i1, i1 0
+  %nop9943 = alloca i1, i1 0
+  %nop9944 = alloca i1, i1 0
+  %nop9945 = alloca i1, i1 0
+  %nop9946 = alloca i1, i1 0
+  %nop9947 = alloca i1, i1 0
+  %nop9948 = alloca i1, i1 0
+  %nop9949 = alloca i1, i1 0
+  %nop9950 = alloca i1, i1 0
+  %nop9951 = alloca i1, i1 0
+  %nop9952 = alloca i1, i1 0
+  %nop9953 = alloca i1, i1 0
+  %nop9954 = alloca i1, i1 0
+  %nop9955 = alloca i1, i1 0
+  %nop9956 = alloca i1, i1 0
+  %nop9957 = alloca i1, i1 0
+  %nop9958 = alloca i1, i1 0
+  %nop9959 = alloca i1, i1 0
+  %nop9960 = alloca i1, i1 0
+  %nop9961 = alloca i1, i1 0
+  %nop9962 = alloca i1, i1 0
+  %nop9963 = alloca i1, i1 0
+  %nop9964 = alloca i1, i1 0
+  %nop9965 = alloca i1, i1 0
+  %nop9966 = alloca i1, i1 0
+  %nop9967 = alloca i1, i1 0
+  %nop9968 = alloca i1, i1 0
+  %nop9969 = alloca i1, i1 0
+  %nop9970 = alloca i1, i1 0
+  %nop9971 = alloca i1, i1 0
+  %nop9972 = alloca i1, i1 0
+  %nop9973 = alloca i1, i1 0
+  %nop9974 = alloca i1, i1 0
+  %nop9975 = alloca i1, i1 0
+  %nop9976 = alloca i1, i1 0
+  %nop9977 = alloca i1, i1 0
+  %nop9978 = alloca i1, i1 0
+  %nop9979 = alloca i1, i1 0
+  %nop9980 = alloca i1, i1 0
+  %nop9981 = alloca i1, i1 0
+  %nop9982 = alloca i1, i1 0
+  %nop9983 = alloca i1, i1 0
+  %nop9984 = alloca i1, i1 0
+  %nop9985 = alloca i1, i1 0
+  %nop9986 = alloca i1, i1 0
+  %nop9987 = alloca i1, i1 0
+  %nop9988 = alloca i1, i1 0
+  %nop9989 = alloca i1, i1 0
+  %nop9990 = alloca i1, i1 0
+  %nop9991 = alloca i1, i1 0
+  %nop9992 = alloca i1, i1 0
+  %nop9993 = alloca i1, i1 0
+  %nop9994 = alloca i1, i1 0
+  %nop9995 = alloca i1, i1 0
+  %nop9996 = alloca i1, i1 0
+  %nop9997 = alloca i1, i1 0
+  %nop9998 = alloca i1, i1 0
+  %nop9999 = alloca i1, i1 0
+  %nop10000 = alloca i1, i1 0
+  %nop10001 = alloca i1, i1 0
+  %nop10002 = alloca i1, i1 0
+  %nop10003 = alloca i1, i1 0
+  %nop10004 = alloca i1, i1 0
+  %nop10005 = alloca i1, i1 0
+  %nop10006 = alloca i1, i1 0
+  %nop10007 = alloca i1, i1 0
+  %nop10008 = alloca i1, i1 0
+  %nop10009 = alloca i1, i1 0
+  %nop10010 = alloca i1, i1 0
+  %nop10011 = alloca i1, i1 0
+  %nop10012 = alloca i1, i1 0
+  %nop10013 = alloca i1, i1 0
+  %nop10014 = alloca i1, i1 0
+  %nop10015 = alloca i1, i1 0
+  %nop10016 = alloca i1, i1 0
+  %nop10017 = alloca i1, i1 0
+  %nop10018 = alloca i1, i1 0
+  %nop10019 = alloca i1, i1 0
+  %nop10020 = alloca i1, i1 0
+  %nop10021 = alloca i1, i1 0
+  %nop10022 = alloca i1, i1 0
+  %nop10023 = alloca i1, i1 0
+  %nop10024 = alloca i1, i1 0
+  %nop10025 = alloca i1, i1 0
+  %nop10026 = alloca i1, i1 0
+  %nop10027 = alloca i1, i1 0
+  %nop10028 = alloca i1, i1 0
+  %nop10029 = alloca i1, i1 0
+  %nop10030 = alloca i1, i1 0
+  %nop10031 = alloca i1, i1 0
+  %nop10032 = alloca i1, i1 0
+  %nop10033 = alloca i1, i1 0
+  %nop10034 = alloca i1, i1 0
+  %nop10035 = alloca i1, i1 0
+  %nop10036 = alloca i1, i1 0
+  %nop10037 = alloca i1, i1 0
+  %nop10038 = alloca i1, i1 0
+  %nop10039 = alloca i1, i1 0
+  %nop10040 = alloca i1, i1 0
+  %nop10041 = alloca i1, i1 0
+  %nop10042 = alloca i1, i1 0
+  %nop10043 = alloca i1, i1 0
+  %nop10044 = alloca i1, i1 0
+  %nop10045 = alloca i1, i1 0
+  %nop10046 = alloca i1, i1 0
+  %nop10047 = alloca i1, i1 0
+  %nop10048 = alloca i1, i1 0
+  %nop10049 = alloca i1, i1 0
+  %nop10050 = alloca i1, i1 0
+  %nop10051 = alloca i1, i1 0
+  %nop10052 = alloca i1, i1 0
+  %nop10053 = alloca i1, i1 0
+  %nop10054 = alloca i1, i1 0
+  %nop10055 = alloca i1, i1 0
+  %nop10056 = alloca i1, i1 0
+  %nop10057 = alloca i1, i1 0
+  %nop10058 = alloca i1, i1 0
+  %nop10059 = alloca i1, i1 0
+  %nop10060 = alloca i1, i1 0
+  %nop10061 = alloca i1, i1 0
+  %nop10062 = alloca i1, i1 0
+  %nop10063 = alloca i1, i1 0
+  %nop10064 = alloca i1, i1 0
+  %nop10065 = alloca i1, i1 0
+  %nop10066 = alloca i1, i1 0
+  %nop10067 = alloca i1, i1 0
+  %nop10068 = alloca i1, i1 0
+  %nop10069 = alloca i1, i1 0
+  %nop10070 = alloca i1, i1 0
+  %nop10071 = alloca i1, i1 0
+  %nop10072 = alloca i1, i1 0
+  %nop10073 = alloca i1, i1 0
+  %nop10074 = alloca i1, i1 0
+  %nop10075 = alloca i1, i1 0
+  %nop10076 = alloca i1, i1 0
+  %nop10077 = alloca i1, i1 0
+  %nop10078 = alloca i1, i1 0
+  %nop10079 = alloca i1, i1 0
+  %nop10080 = alloca i1, i1 0
+  %nop10081 = alloca i1, i1 0
+  %nop10082 = alloca i1, i1 0
+  %nop10083 = alloca i1, i1 0
+  %nop10084 = alloca i1, i1 0
+  %nop10085 = alloca i1, i1 0
+  %nop10086 = alloca i1, i1 0
+  %nop10087 = alloca i1, i1 0
+  %nop10088 = alloca i1, i1 0
+  %nop10089 = alloca i1, i1 0
+  %nop10090 = alloca i1, i1 0
+  %nop10091 = alloca i1, i1 0
+  %nop10092 = alloca i1, i1 0
+  %nop10093 = alloca i1, i1 0
+  %nop10094 = alloca i1, i1 0
+  %nop10095 = alloca i1, i1 0
+  %nop10096 = alloca i1, i1 0
+  %nop10097 = alloca i1, i1 0
+  %nop10098 = alloca i1, i1 0
+  %nop10099 = alloca i1, i1 0
+  %nop10100 = alloca i1, i1 0
+  %nop10101 = alloca i1, i1 0
+  %nop10102 = alloca i1, i1 0
+  %nop10103 = alloca i1, i1 0
+  %nop10104 = alloca i1, i1 0
+  %nop10105 = alloca i1, i1 0
+  %nop10106 = alloca i1, i1 0
+  %nop10107 = alloca i1, i1 0
+  %nop10108 = alloca i1, i1 0
+  %nop10109 = alloca i1, i1 0
+  %nop10110 = alloca i1, i1 0
+  %nop10111 = alloca i1, i1 0
+  %nop10112 = alloca i1, i1 0
+  %nop10113 = alloca i1, i1 0
+  %nop10114 = alloca i1, i1 0
+  %nop10115 = alloca i1, i1 0
+  %nop10116 = alloca i1, i1 0
+  %nop10117 = alloca i1, i1 0
+  %nop10118 = alloca i1, i1 0
+  %nop10119 = alloca i1, i1 0
+  %nop10120 = alloca i1, i1 0
+  %nop10121 = alloca i1, i1 0
+  %nop10122 = alloca i1, i1 0
+  %nop10123 = alloca i1, i1 0
+  %nop10124 = alloca i1, i1 0
+  %nop10125 = alloca i1, i1 0
+  %nop10126 = alloca i1, i1 0
+  %nop10127 = alloca i1, i1 0
+  %nop10128 = alloca i1, i1 0
+  %nop10129 = alloca i1, i1 0
+  %nop10130 = alloca i1, i1 0
+  %nop10131 = alloca i1, i1 0
+  %nop10132 = alloca i1, i1 0
+  %nop10133 = alloca i1, i1 0
+  %nop10134 = alloca i1, i1 0
+  %nop10135 = alloca i1, i1 0
+  %nop10136 = alloca i1, i1 0
+  %nop10137 = alloca i1, i1 0
+  %nop10138 = alloca i1, i1 0
+  %nop10139 = alloca i1, i1 0
+  %nop10140 = alloca i1, i1 0
+  %nop10141 = alloca i1, i1 0
+  %nop10142 = alloca i1, i1 0
+  %nop10143 = alloca i1, i1 0
+  %nop10144 = alloca i1, i1 0
+  %nop10145 = alloca i1, i1 0
+  %nop10146 = alloca i1, i1 0
+  %nop10147 = alloca i1, i1 0
+  %nop10148 = alloca i1, i1 0
+  %nop10149 = alloca i1, i1 0
+  %nop10150 = alloca i1, i1 0
+  %nop10151 = alloca i1, i1 0
+  %nop10152 = alloca i1, i1 0
+  %nop10153 = alloca i1, i1 0
+  %nop10154 = alloca i1, i1 0
+  %nop10155 = alloca i1, i1 0
+  %nop10156 = alloca i1, i1 0
+  %nop10157 = alloca i1, i1 0
+  %nop10158 = alloca i1, i1 0
+  %nop10159 = alloca i1, i1 0
+  %nop10160 = alloca i1, i1 0
+  %nop10161 = alloca i1, i1 0
+  %nop10162 = alloca i1, i1 0
+  %nop10163 = alloca i1, i1 0
+  %nop10164 = alloca i1, i1 0
+  %nop10165 = alloca i1, i1 0
+  %nop10166 = alloca i1, i1 0
+  %nop10167 = alloca i1, i1 0
+  %nop10168 = alloca i1, i1 0
+  %nop10169 = alloca i1, i1 0
+  %nop10170 = alloca i1, i1 0
+  %nop10171 = alloca i1, i1 0
+  %nop10172 = alloca i1, i1 0
+  %nop10173 = alloca i1, i1 0
+  %nop10174 = alloca i1, i1 0
+  %nop10175 = alloca i1, i1 0
+  %nop10176 = alloca i1, i1 0
+  %nop10177 = alloca i1, i1 0
+  %nop10178 = alloca i1, i1 0
+  %nop10179 = alloca i1, i1 0
+  %nop10180 = alloca i1, i1 0
+  %nop10181 = alloca i1, i1 0
+  %nop10182 = alloca i1, i1 0
+  %nop10183 = alloca i1, i1 0
+  %nop10184 = alloca i1, i1 0
+  %nop10185 = alloca i1, i1 0
+  %nop10186 = alloca i1, i1 0
+  %nop10187 = alloca i1, i1 0
+  %nop10188 = alloca i1, i1 0
+  %nop10189 = alloca i1, i1 0
+  %nop10190 = alloca i1, i1 0
+  %nop10191 = alloca i1, i1 0
+  %nop10192 = alloca i1, i1 0
+  %nop10193 = alloca i1, i1 0
+  %nop10194 = alloca i1, i1 0
+  %nop10195 = alloca i1, i1 0
+  %nop10196 = alloca i1, i1 0
+  %nop10197 = alloca i1, i1 0
+  %nop10198 = alloca i1, i1 0
+  %nop10199 = alloca i1, i1 0
+  %nop10200 = alloca i1, i1 0
+  %nop10201 = alloca i1, i1 0
+  %nop10202 = alloca i1, i1 0
+  %nop10203 = alloca i1, i1 0
+  %nop10204 = alloca i1, i1 0
+  %nop10205 = alloca i1, i1 0
+  %nop10206 = alloca i1, i1 0
+  %nop10207 = alloca i1, i1 0
+  %nop10208 = alloca i1, i1 0
+  %nop10209 = alloca i1, i1 0
+  %nop10210 = alloca i1, i1 0
+  %nop10211 = alloca i1, i1 0
+  %nop10212 = alloca i1, i1 0
+  %nop10213 = alloca i1, i1 0
+  %nop10214 = alloca i1, i1 0
+  %nop10215 = alloca i1, i1 0
+  %nop10216 = alloca i1, i1 0
+  %nop10217 = alloca i1, i1 0
+  %nop10218 = alloca i1, i1 0
+  %nop10219 = alloca i1, i1 0
+  %nop10220 = alloca i1, i1 0
+  %nop10221 = alloca i1, i1 0
+  %nop10222 = alloca i1, i1 0
+  %nop10223 = alloca i1, i1 0
+  %nop10224 = alloca i1, i1 0
+  %nop10225 = alloca i1, i1 0
+  %nop10226 = alloca i1, i1 0
+  %nop10227 = alloca i1, i1 0
+  %nop10228 = alloca i1, i1 0
+  %nop10229 = alloca i1, i1 0
+  %nop10230 = alloca i1, i1 0
+  %nop10231 = alloca i1, i1 0
+  %nop10232 = alloca i1, i1 0
+  %nop10233 = alloca i1, i1 0
+  %nop10234 = alloca i1, i1 0
+  %nop10235 = alloca i1, i1 0
+  %nop10236 = alloca i1, i1 0
+  %nop10237 = alloca i1, i1 0
+  %nop10238 = alloca i1, i1 0
+  %nop10239 = alloca i1, i1 0
+  %nop10240 = alloca i1, i1 0
+  %nop10241 = alloca i1, i1 0
+  %nop10242 = alloca i1, i1 0
+  %nop10243 = alloca i1, i1 0
+  %nop10244 = alloca i1, i1 0
+  %nop10245 = alloca i1, i1 0
+  %nop10246 = alloca i1, i1 0
+  %nop10247 = alloca i1, i1 0
+  %nop10248 = alloca i1, i1 0
+  %nop10249 = alloca i1, i1 0
+  %nop10250 = alloca i1, i1 0
+  %nop10251 = alloca i1, i1 0
+  %nop10252 = alloca i1, i1 0
+  %nop10253 = alloca i1, i1 0
+  %nop10254 = alloca i1, i1 0
+  %nop10255 = alloca i1, i1 0
+  %nop10256 = alloca i1, i1 0
+  %nop10257 = alloca i1, i1 0
+  %nop10258 = alloca i1, i1 0
+  %nop10259 = alloca i1, i1 0
+  %nop10260 = alloca i1, i1 0
+  %nop10261 = alloca i1, i1 0
+  %nop10262 = alloca i1, i1 0
+  %nop10263 = alloca i1, i1 0
+  %nop10264 = alloca i1, i1 0
+  %nop10265 = alloca i1, i1 0
+  %nop10266 = alloca i1, i1 0
+  %nop10267 = alloca i1, i1 0
+  %nop10268 = alloca i1, i1 0
+  %nop10269 = alloca i1, i1 0
+  %nop10270 = alloca i1, i1 0
+  %nop10271 = alloca i1, i1 0
+  %nop10272 = alloca i1, i1 0
+  %nop10273 = alloca i1, i1 0
+  %nop10274 = alloca i1, i1 0
+  %nop10275 = alloca i1, i1 0
+  %nop10276 = alloca i1, i1 0
+  %nop10277 = alloca i1, i1 0
+  %nop10278 = alloca i1, i1 0
+  %nop10279 = alloca i1, i1 0
+  %nop10280 = alloca i1, i1 0
+  %nop10281 = alloca i1, i1 0
+  %nop10282 = alloca i1, i1 0
+  %nop10283 = alloca i1, i1 0
+  %nop10284 = alloca i1, i1 0
+  %nop10285 = alloca i1, i1 0
+  %nop10286 = alloca i1, i1 0
+  %nop10287 = alloca i1, i1 0
+  %nop10288 = alloca i1, i1 0
+  %nop10289 = alloca i1, i1 0
+  %nop10290 = alloca i1, i1 0
+  %nop10291 = alloca i1, i1 0
+  %nop10292 = alloca i1, i1 0
+  %nop10293 = alloca i1, i1 0
+  %nop10294 = alloca i1, i1 0
+  %nop10295 = alloca i1, i1 0
+  %nop10296 = alloca i1, i1 0
+  %nop10297 = alloca i1, i1 0
+  %nop10298 = alloca i1, i1 0
+  %nop10299 = alloca i1, i1 0
+  %nop10300 = alloca i1, i1 0
+  %nop10301 = alloca i1, i1 0
+  %nop10302 = alloca i1, i1 0
+  %nop10303 = alloca i1, i1 0
+  %nop10304 = alloca i1, i1 0
+  %nop10305 = alloca i1, i1 0
+  %nop10306 = alloca i1, i1 0
+  %nop10307 = alloca i1, i1 0
+  %nop10308 = alloca i1, i1 0
+  %nop10309 = alloca i1, i1 0
+  %nop10310 = alloca i1, i1 0
+  %nop10311 = alloca i1, i1 0
+  %nop10312 = alloca i1, i1 0
+  %nop10313 = alloca i1, i1 0
+  %nop10314 = alloca i1, i1 0
+  %nop10315 = alloca i1, i1 0
+  %nop10316 = alloca i1, i1 0
+  %nop10317 = alloca i1, i1 0
+  %nop10318 = alloca i1, i1 0
+  %nop10319 = alloca i1, i1 0
+  %nop10320 = alloca i1, i1 0
+  %nop10321 = alloca i1, i1 0
+  %nop10322 = alloca i1, i1 0
+  %nop10323 = alloca i1, i1 0
+  %nop10324 = alloca i1, i1 0
+  %nop10325 = alloca i1, i1 0
+  %nop10326 = alloca i1, i1 0
+  %nop10327 = alloca i1, i1 0
+  %nop10328 = alloca i1, i1 0
+  %nop10329 = alloca i1, i1 0
+  %nop10330 = alloca i1, i1 0
+  %nop10331 = alloca i1, i1 0
+  %nop10332 = alloca i1, i1 0
+  %nop10333 = alloca i1, i1 0
+  %nop10334 = alloca i1, i1 0
+  %nop10335 = alloca i1, i1 0
+  %nop10336 = alloca i1, i1 0
+  %nop10337 = alloca i1, i1 0
+  %nop10338 = alloca i1, i1 0
+  %nop10339 = alloca i1, i1 0
+  %nop10340 = alloca i1, i1 0
+  %nop10341 = alloca i1, i1 0
+  %nop10342 = alloca i1, i1 0
+  %nop10343 = alloca i1, i1 0
+  %nop10344 = alloca i1, i1 0
+  %nop10345 = alloca i1, i1 0
+  %nop10346 = alloca i1, i1 0
+  %nop10347 = alloca i1, i1 0
+  %nop10348 = alloca i1, i1 0
+  %nop10349 = alloca i1, i1 0
+  %nop10350 = alloca i1, i1 0
+  %nop10351 = alloca i1, i1 0
+  %nop10352 = alloca i1, i1 0
+  %nop10353 = alloca i1, i1 0
+  %nop10354 = alloca i1, i1 0
+  %nop10355 = alloca i1, i1 0
+  %nop10356 = alloca i1, i1 0
+  %nop10357 = alloca i1, i1 0
+  %nop10358 = alloca i1, i1 0
+  %nop10359 = alloca i1, i1 0
+  %nop10360 = alloca i1, i1 0
+  %nop10361 = alloca i1, i1 0
+  %nop10362 = alloca i1, i1 0
+  %nop10363 = alloca i1, i1 0
+  %nop10364 = alloca i1, i1 0
+  %nop10365 = alloca i1, i1 0
+  %nop10366 = alloca i1, i1 0
+  %nop10367 = alloca i1, i1 0
+  %nop10368 = alloca i1, i1 0
+  %nop10369 = alloca i1, i1 0
+  %nop10370 = alloca i1, i1 0
+  %nop10371 = alloca i1, i1 0
+  %nop10372 = alloca i1, i1 0
+  %nop10373 = alloca i1, i1 0
+  %nop10374 = alloca i1, i1 0
+  %nop10375 = alloca i1, i1 0
+  %nop10376 = alloca i1, i1 0
+  %nop10377 = alloca i1, i1 0
+  %nop10378 = alloca i1, i1 0
+  %nop10379 = alloca i1, i1 0
+  %nop10380 = alloca i1, i1 0
+  %nop10381 = alloca i1, i1 0
+  %nop10382 = alloca i1, i1 0
+  %nop10383 = alloca i1, i1 0
+  %nop10384 = alloca i1, i1 0
+  %nop10385 = alloca i1, i1 0
+  %nop10386 = alloca i1, i1 0
+  %nop10387 = alloca i1, i1 0
+  %nop10388 = alloca i1, i1 0
+  %nop10389 = alloca i1, i1 0
+  %nop10390 = alloca i1, i1 0
+  %nop10391 = alloca i1, i1 0
+  %nop10392 = alloca i1, i1 0
+  %nop10393 = alloca i1, i1 0
+  %nop10394 = alloca i1, i1 0
+  %nop10395 = alloca i1, i1 0
+  %nop10396 = alloca i1, i1 0
+  %nop10397 = alloca i1, i1 0
+  %nop10398 = alloca i1, i1 0
+  %nop10399 = alloca i1, i1 0
+  %nop10400 = alloca i1, i1 0
+  %nop10401 = alloca i1, i1 0
+  %nop10402 = alloca i1, i1 0
+  %nop10403 = alloca i1, i1 0
+  %nop10404 = alloca i1, i1 0
+  %nop10405 = alloca i1, i1 0
+  %nop10406 = alloca i1, i1 0
+  %nop10407 = alloca i1, i1 0
+  %nop10408 = alloca i1, i1 0
+  %nop10409 = alloca i1, i1 0
+  %nop10410 = alloca i1, i1 0
+  %nop10411 = alloca i1, i1 0
+  %nop10412 = alloca i1, i1 0
+  %nop10413 = alloca i1, i1 0
+  %nop10414 = alloca i1, i1 0
+  %nop10415 = alloca i1, i1 0
+  %nop10416 = alloca i1, i1 0
+  %nop10417 = alloca i1, i1 0
+  %nop10418 = alloca i1, i1 0
+  %nop10419 = alloca i1, i1 0
+  %nop10420 = alloca i1, i1 0
+  %nop10421 = alloca i1, i1 0
+  %nop10422 = alloca i1, i1 0
+  %nop10423 = alloca i1, i1 0
+  %nop10424 = alloca i1, i1 0
+  %nop10425 = alloca i1, i1 0
+  %nop10426 = alloca i1, i1 0
+  %nop10427 = alloca i1, i1 0
+  %nop10428 = alloca i1, i1 0
+  %nop10429 = alloca i1, i1 0
+  %nop10430 = alloca i1, i1 0
+  %nop10431 = alloca i1, i1 0
+  %nop10432 = alloca i1, i1 0
+  %nop10433 = alloca i1, i1 0
+  %nop10434 = alloca i1, i1 0
+  %nop10435 = alloca i1, i1 0
+  %nop10436 = alloca i1, i1 0
+  %nop10437 = alloca i1, i1 0
+  %nop10438 = alloca i1, i1 0
+  %nop10439 = alloca i1, i1 0
+  %nop10440 = alloca i1, i1 0
+  %nop10441 = alloca i1, i1 0
+  %nop10442 = alloca i1, i1 0
+  %nop10443 = alloca i1, i1 0
+  %nop10444 = alloca i1, i1 0
+  %nop10445 = alloca i1, i1 0
+  %nop10446 = alloca i1, i1 0
+  %nop10447 = alloca i1, i1 0
+  %nop10448 = alloca i1, i1 0
+  %nop10449 = alloca i1, i1 0
+  %nop10450 = alloca i1, i1 0
+  %nop10451 = alloca i1, i1 0
+  %nop10452 = alloca i1, i1 0
+  %nop10453 = alloca i1, i1 0
+  %nop10454 = alloca i1, i1 0
+  %nop10455 = alloca i1, i1 0
+  %nop10456 = alloca i1, i1 0
+  %nop10457 = alloca i1, i1 0
+  %nop10458 = alloca i1, i1 0
+  %nop10459 = alloca i1, i1 0
+  %nop10460 = alloca i1, i1 0
+  %nop10461 = alloca i1, i1 0
+  %nop10462 = alloca i1, i1 0
+  %nop10463 = alloca i1, i1 0
+  %nop10464 = alloca i1, i1 0
+  %nop10465 = alloca i1, i1 0
+  %nop10466 = alloca i1, i1 0
+  %nop10467 = alloca i1, i1 0
+  %nop10468 = alloca i1, i1 0
+  %nop10469 = alloca i1, i1 0
+  %nop10470 = alloca i1, i1 0
+  %nop10471 = alloca i1, i1 0
+  %nop10472 = alloca i1, i1 0
+  %nop10473 = alloca i1, i1 0
+  %nop10474 = alloca i1, i1 0
+  %nop10475 = alloca i1, i1 0
+  %nop10476 = alloca i1, i1 0
+  %nop10477 = alloca i1, i1 0
+  %nop10478 = alloca i1, i1 0
+  %nop10479 = alloca i1, i1 0
+  %nop10480 = alloca i1, i1 0
+  %nop10481 = alloca i1, i1 0
+  %nop10482 = alloca i1, i1 0
+  %nop10483 = alloca i1, i1 0
+  %nop10484 = alloca i1, i1 0
+  %nop10485 = alloca i1, i1 0
+  %nop10486 = alloca i1, i1 0
+  %nop10487 = alloca i1, i1 0
+  %nop10488 = alloca i1, i1 0
+  %nop10489 = alloca i1, i1 0
+  %nop10490 = alloca i1, i1 0
+  %nop10491 = alloca i1, i1 0
+  %nop10492 = alloca i1, i1 0
+  %nop10493 = alloca i1, i1 0
+  %nop10494 = alloca i1, i1 0
+  %nop10495 = alloca i1, i1 0
+  %nop10496 = alloca i1, i1 0
+  %nop10497 = alloca i1, i1 0
+  %nop10498 = alloca i1, i1 0
+  %nop10499 = alloca i1, i1 0
+  %nop10500 = alloca i1, i1 0
+  %nop10501 = alloca i1, i1 0
+  %nop10502 = alloca i1, i1 0
+  %nop10503 = alloca i1, i1 0
+  %nop10504 = alloca i1, i1 0
+  %nop10505 = alloca i1, i1 0
+  %nop10506 = alloca i1, i1 0
+  %nop10507 = alloca i1, i1 0
+  %nop10508 = alloca i1, i1 0
+  %nop10509 = alloca i1, i1 0
+  %nop10510 = alloca i1, i1 0
+  %nop10511 = alloca i1, i1 0
+  %nop10512 = alloca i1, i1 0
+  %nop10513 = alloca i1, i1 0
+  %nop10514 = alloca i1, i1 0
+  %nop10515 = alloca i1, i1 0
+  %nop10516 = alloca i1, i1 0
+  %nop10517 = alloca i1, i1 0
+  %nop10518 = alloca i1, i1 0
+  %nop10519 = alloca i1, i1 0
+  %nop10520 = alloca i1, i1 0
+  %nop10521 = alloca i1, i1 0
+  %nop10522 = alloca i1, i1 0
+  %nop10523 = alloca i1, i1 0
+  %nop10524 = alloca i1, i1 0
+  %nop10525 = alloca i1, i1 0
+  %nop10526 = alloca i1, i1 0
+  %nop10527 = alloca i1, i1 0
+  %nop10528 = alloca i1, i1 0
+  %nop10529 = alloca i1, i1 0
+  %nop10530 = alloca i1, i1 0
+  %nop10531 = alloca i1, i1 0
+  %nop10532 = alloca i1, i1 0
+  %nop10533 = alloca i1, i1 0
+  %nop10534 = alloca i1, i1 0
+  %nop10535 = alloca i1, i1 0
+  %nop10536 = alloca i1, i1 0
+  %nop10537 = alloca i1, i1 0
+  %nop10538 = alloca i1, i1 0
+  %nop10539 = alloca i1, i1 0
+  %nop10540 = alloca i1, i1 0
+  %nop10541 = alloca i1, i1 0
+  %nop10542 = alloca i1, i1 0
+  %nop10543 = alloca i1, i1 0
+  %nop10544 = alloca i1, i1 0
+  %nop10545 = alloca i1, i1 0
+  %nop10546 = alloca i1, i1 0
+  %nop10547 = alloca i1, i1 0
+  %nop10548 = alloca i1, i1 0
+  %nop10549 = alloca i1, i1 0
+  %nop10550 = alloca i1, i1 0
+  %nop10551 = alloca i1, i1 0
+  %nop10552 = alloca i1, i1 0
+  %nop10553 = alloca i1, i1 0
+  %nop10554 = alloca i1, i1 0
+  %nop10555 = alloca i1, i1 0
+  %nop10556 = alloca i1, i1 0
+  %nop10557 = alloca i1, i1 0
+  %nop10558 = alloca i1, i1 0
+  %nop10559 = alloca i1, i1 0
+  %nop10560 = alloca i1, i1 0
+  %nop10561 = alloca i1, i1 0
+  %nop10562 = alloca i1, i1 0
+  %nop10563 = alloca i1, i1 0
+  %nop10564 = alloca i1, i1 0
+  %nop10565 = alloca i1, i1 0
+  %nop10566 = alloca i1, i1 0
+  %nop10567 = alloca i1, i1 0
+  %nop10568 = alloca i1, i1 0
+  %nop10569 = alloca i1, i1 0
+  %nop10570 = alloca i1, i1 0
+  %nop10571 = alloca i1, i1 0
+  %nop10572 = alloca i1, i1 0
+  %nop10573 = alloca i1, i1 0
+  %nop10574 = alloca i1, i1 0
+  %nop10575 = alloca i1, i1 0
+  %nop10576 = alloca i1, i1 0
+  %nop10577 = alloca i1, i1 0
+  %nop10578 = alloca i1, i1 0
+  %nop10579 = alloca i1, i1 0
+  %nop10580 = alloca i1, i1 0
+  %nop10581 = alloca i1, i1 0
+  %nop10582 = alloca i1, i1 0
+  %nop10583 = alloca i1, i1 0
+  %nop10584 = alloca i1, i1 0
+  %nop10585 = alloca i1, i1 0
+  %nop10586 = alloca i1, i1 0
+  %nop10587 = alloca i1, i1 0
+  %nop10588 = alloca i1, i1 0
+  %nop10589 = alloca i1, i1 0
+  %nop10590 = alloca i1, i1 0
+  %nop10591 = alloca i1, i1 0
+  %nop10592 = alloca i1, i1 0
+  %nop10593 = alloca i1, i1 0
+  %nop10594 = alloca i1, i1 0
+  %nop10595 = alloca i1, i1 0
+  %nop10596 = alloca i1, i1 0
+  %nop10597 = alloca i1, i1 0
+  %nop10598 = alloca i1, i1 0
+  %nop10599 = alloca i1, i1 0
+  %nop10600 = alloca i1, i1 0
+  %nop10601 = alloca i1, i1 0
+  %nop10602 = alloca i1, i1 0
+  %nop10603 = alloca i1, i1 0
+  %nop10604 = alloca i1, i1 0
+  %nop10605 = alloca i1, i1 0
+  %nop10606 = alloca i1, i1 0
+  %nop10607 = alloca i1, i1 0
+  %nop10608 = alloca i1, i1 0
+  %nop10609 = alloca i1, i1 0
+  %nop10610 = alloca i1, i1 0
+  %nop10611 = alloca i1, i1 0
+  %nop10612 = alloca i1, i1 0
+  %nop10613 = alloca i1, i1 0
+  %nop10614 = alloca i1, i1 0
+  %nop10615 = alloca i1, i1 0
+  %nop10616 = alloca i1, i1 0
+  %nop10617 = alloca i1, i1 0
+  %nop10618 = alloca i1, i1 0
+  %nop10619 = alloca i1, i1 0
+  %nop10620 = alloca i1, i1 0
+  %nop10621 = alloca i1, i1 0
+  %nop10622 = alloca i1, i1 0
+  %nop10623 = alloca i1, i1 0
+  %nop10624 = alloca i1, i1 0
+  %nop10625 = alloca i1, i1 0
+  %nop10626 = alloca i1, i1 0
+  %nop10627 = alloca i1, i1 0
+  %nop10628 = alloca i1, i1 0
+  %nop10629 = alloca i1, i1 0
+  %nop10630 = alloca i1, i1 0
+  %nop10631 = alloca i1, i1 0
+  %nop10632 = alloca i1, i1 0
+  %nop10633 = alloca i1, i1 0
+  %nop10634 = alloca i1, i1 0
+  %nop10635 = alloca i1, i1 0
+  %nop10636 = alloca i1, i1 0
+  %nop10637 = alloca i1, i1 0
+  %nop10638 = alloca i1, i1 0
+  %nop10639 = alloca i1, i1 0
+  %nop10640 = alloca i1, i1 0
+  %nop10641 = alloca i1, i1 0
+  %nop10642 = alloca i1, i1 0
+  %nop10643 = alloca i1, i1 0
+  %nop10644 = alloca i1, i1 0
+  %nop10645 = alloca i1, i1 0
+  %nop10646 = alloca i1, i1 0
+  %nop10647 = alloca i1, i1 0
+  %nop10648 = alloca i1, i1 0
+  %nop10649 = alloca i1, i1 0
+  %nop10650 = alloca i1, i1 0
+  %nop10651 = alloca i1, i1 0
+  %nop10652 = alloca i1, i1 0
+  %nop10653 = alloca i1, i1 0
+  %nop10654 = alloca i1, i1 0
+  %nop10655 = alloca i1, i1 0
+  %nop10656 = alloca i1, i1 0
+  %nop10657 = alloca i1, i1 0
+  %nop10658 = alloca i1, i1 0
+  %nop10659 = alloca i1, i1 0
+  %nop10660 = alloca i1, i1 0
+  %nop10661 = alloca i1, i1 0
+  %nop10662 = alloca i1, i1 0
+  %nop10663 = alloca i1, i1 0
+  %nop10664 = alloca i1, i1 0
+  %nop10665 = alloca i1, i1 0
+  %nop10666 = alloca i1, i1 0
+  %nop10667 = alloca i1, i1 0
+  %nop10668 = alloca i1, i1 0
+  %nop10669 = alloca i1, i1 0
+  %nop10670 = alloca i1, i1 0
+  %nop10671 = alloca i1, i1 0
+  %nop10672 = alloca i1, i1 0
+  %nop10673 = alloca i1, i1 0
+  %nop10674 = alloca i1, i1 0
+  %nop10675 = alloca i1, i1 0
+  %nop10676 = alloca i1, i1 0
+  %nop10677 = alloca i1, i1 0
+  %nop10678 = alloca i1, i1 0
+  %nop10679 = alloca i1, i1 0
+  %nop10680 = alloca i1, i1 0
+  %nop10681 = alloca i1, i1 0
+  %nop10682 = alloca i1, i1 0
+  %nop10683 = alloca i1, i1 0
+  %nop10684 = alloca i1, i1 0
+  %nop10685 = alloca i1, i1 0
+  %nop10686 = alloca i1, i1 0
+  %nop10687 = alloca i1, i1 0
+  %nop10688 = alloca i1, i1 0
+  %nop10689 = alloca i1, i1 0
+  %nop10690 = alloca i1, i1 0
+  %nop10691 = alloca i1, i1 0
+  %nop10692 = alloca i1, i1 0
+  %nop10693 = alloca i1, i1 0
+  %nop10694 = alloca i1, i1 0
+  %nop10695 = alloca i1, i1 0
+  %nop10696 = alloca i1, i1 0
+  %nop10697 = alloca i1, i1 0
+  %nop10698 = alloca i1, i1 0
+  %nop10699 = alloca i1, i1 0
+  %nop10700 = alloca i1, i1 0
+  %nop10701 = alloca i1, i1 0
+  %nop10702 = alloca i1, i1 0
+  %nop10703 = alloca i1, i1 0
+  %nop10704 = alloca i1, i1 0
+  %nop10705 = alloca i1, i1 0
+  %nop10706 = alloca i1, i1 0
+  %nop10707 = alloca i1, i1 0
+  %nop10708 = alloca i1, i1 0
+  %nop10709 = alloca i1, i1 0
+  %nop10710 = alloca i1, i1 0
+  %nop10711 = alloca i1, i1 0
+  %nop10712 = alloca i1, i1 0
+  %nop10713 = alloca i1, i1 0
+  %nop10714 = alloca i1, i1 0
+  %nop10715 = alloca i1, i1 0
+  %nop10716 = alloca i1, i1 0
+  %nop10717 = alloca i1, i1 0
+  %nop10718 = alloca i1, i1 0
+  %nop10719 = alloca i1, i1 0
+  %nop10720 = alloca i1, i1 0
+  %nop10721 = alloca i1, i1 0
+  %nop10722 = alloca i1, i1 0
+  %nop10723 = alloca i1, i1 0
+  %nop10724 = alloca i1, i1 0
+  %nop10725 = alloca i1, i1 0
+  %nop10726 = alloca i1, i1 0
+  %nop10727 = alloca i1, i1 0
+  %nop10728 = alloca i1, i1 0
+  %nop10729 = alloca i1, i1 0
+  %nop10730 = alloca i1, i1 0
+  %nop10731 = alloca i1, i1 0
+  %nop10732 = alloca i1, i1 0
+  %nop10733 = alloca i1, i1 0
+  %nop10734 = alloca i1, i1 0
+  %nop10735 = alloca i1, i1 0
+  %nop10736 = alloca i1, i1 0
+  %nop10737 = alloca i1, i1 0
+  %nop10738 = alloca i1, i1 0
+  %nop10739 = alloca i1, i1 0
+  %nop10740 = alloca i1, i1 0
+  %nop10741 = alloca i1, i1 0
+  %nop10742 = alloca i1, i1 0
+  %nop10743 = alloca i1, i1 0
+  %nop10744 = alloca i1, i1 0
+  %nop10745 = alloca i1, i1 0
+  %nop10746 = alloca i1, i1 0
+  %nop10747 = alloca i1, i1 0
+  %nop10748 = alloca i1, i1 0
+  %nop10749 = alloca i1, i1 0
+  %nop10750 = alloca i1, i1 0
+  %nop10751 = alloca i1, i1 0
+  %nop10752 = alloca i1, i1 0
+  %nop10753 = alloca i1, i1 0
+  %nop10754 = alloca i1, i1 0
+  %nop10755 = alloca i1, i1 0
+  %nop10756 = alloca i1, i1 0
+  %nop10757 = alloca i1, i1 0
+  %nop10758 = alloca i1, i1 0
+  %nop10759 = alloca i1, i1 0
+  %nop10760 = alloca i1, i1 0
+  %nop10761 = alloca i1, i1 0
+  %nop10762 = alloca i1, i1 0
+  %nop10763 = alloca i1, i1 0
+  %nop10764 = alloca i1, i1 0
+  %nop10765 = alloca i1, i1 0
+  %nop10766 = alloca i1, i1 0
+  %nop10767 = alloca i1, i1 0
+  %nop10768 = alloca i1, i1 0
+  %nop10769 = alloca i1, i1 0
+  %nop10770 = alloca i1, i1 0
+  %nop10771 = alloca i1, i1 0
+  %nop10772 = alloca i1, i1 0
+  %nop10773 = alloca i1, i1 0
+  %nop10774 = alloca i1, i1 0
+  %nop10775 = alloca i1, i1 0
+  %nop10776 = alloca i1, i1 0
+  %nop10777 = alloca i1, i1 0
+  %nop10778 = alloca i1, i1 0
+  %nop10779 = alloca i1, i1 0
+  %nop10780 = alloca i1, i1 0
+  %nop10781 = alloca i1, i1 0
+  %nop10782 = alloca i1, i1 0
+  %nop10783 = alloca i1, i1 0
+  %nop10784 = alloca i1, i1 0
+  %nop10785 = alloca i1, i1 0
+  %nop10786 = alloca i1, i1 0
+  %nop10787 = alloca i1, i1 0
+  %nop10788 = alloca i1, i1 0
+  %nop10789 = alloca i1, i1 0
+  %nop10790 = alloca i1, i1 0
+  %nop10791 = alloca i1, i1 0
+  %nop10792 = alloca i1, i1 0
+  %nop10793 = alloca i1, i1 0
+  %nop10794 = alloca i1, i1 0
+  %nop10795 = alloca i1, i1 0
+  %nop10796 = alloca i1, i1 0
+  %nop10797 = alloca i1, i1 0
+  %nop10798 = alloca i1, i1 0
+  %nop10799 = alloca i1, i1 0
+  %nop10800 = alloca i1, i1 0
+  %nop10801 = alloca i1, i1 0
+  %nop10802 = alloca i1, i1 0
+  %nop10803 = alloca i1, i1 0
+  %nop10804 = alloca i1, i1 0
+  %nop10805 = alloca i1, i1 0
+  %nop10806 = alloca i1, i1 0
+  %nop10807 = alloca i1, i1 0
+  %nop10808 = alloca i1, i1 0
+  %nop10809 = alloca i1, i1 0
+  %nop10810 = alloca i1, i1 0
+  %nop10811 = alloca i1, i1 0
+  %nop10812 = alloca i1, i1 0
+  %nop10813 = alloca i1, i1 0
+  %nop10814 = alloca i1, i1 0
+  %nop10815 = alloca i1, i1 0
+  %nop10816 = alloca i1, i1 0
+  %nop10817 = alloca i1, i1 0
+  %nop10818 = alloca i1, i1 0
+  %nop10819 = alloca i1, i1 0
+  %nop10820 = alloca i1, i1 0
+  %nop10821 = alloca i1, i1 0
+  %nop10822 = alloca i1, i1 0
+  %nop10823 = alloca i1, i1 0
+  %nop10824 = alloca i1, i1 0
+  %nop10825 = alloca i1, i1 0
+  %nop10826 = alloca i1, i1 0
+  %nop10827 = alloca i1, i1 0
+  %nop10828 = alloca i1, i1 0
+  %nop10829 = alloca i1, i1 0
+  %nop10830 = alloca i1, i1 0
+  %nop10831 = alloca i1, i1 0
+  %nop10832 = alloca i1, i1 0
+  %nop10833 = alloca i1, i1 0
+  %nop10834 = alloca i1, i1 0
+  %nop10835 = alloca i1, i1 0
+  %nop10836 = alloca i1, i1 0
+  %nop10837 = alloca i1, i1 0
+  %nop10838 = alloca i1, i1 0
+  %nop10839 = alloca i1, i1 0
+  %nop10840 = alloca i1, i1 0
+  %nop10841 = alloca i1, i1 0
+  %nop10842 = alloca i1, i1 0
+  %nop10843 = alloca i1, i1 0
+  %nop10844 = alloca i1, i1 0
+  %nop10845 = alloca i1, i1 0
+  %nop10846 = alloca i1, i1 0
+  %nop10847 = alloca i1, i1 0
+  %nop10848 = alloca i1, i1 0
+  %nop10849 = alloca i1, i1 0
+  %nop10850 = alloca i1, i1 0
+  %nop10851 = alloca i1, i1 0
+  %nop10852 = alloca i1, i1 0
+  %nop10853 = alloca i1, i1 0
+  %nop10854 = alloca i1, i1 0
+  %nop10855 = alloca i1, i1 0
+  %nop10856 = alloca i1, i1 0
+  %nop10857 = alloca i1, i1 0
+  %nop10858 = alloca i1, i1 0
+  %nop10859 = alloca i1, i1 0
+  %nop10860 = alloca i1, i1 0
+  %nop10861 = alloca i1, i1 0
+  %nop10862 = alloca i1, i1 0
+  %nop10863 = alloca i1, i1 0
+  %nop10864 = alloca i1, i1 0
+  %nop10865 = alloca i1, i1 0
+  %nop10866 = alloca i1, i1 0
+  %nop10867 = alloca i1, i1 0
+  %nop10868 = alloca i1, i1 0
+  %nop10869 = alloca i1, i1 0
+  %nop10870 = alloca i1, i1 0
+  %nop10871 = alloca i1, i1 0
+  %nop10872 = alloca i1, i1 0
+  %nop10873 = alloca i1, i1 0
+  %nop10874 = alloca i1, i1 0
+  %nop10875 = alloca i1, i1 0
+  %nop10876 = alloca i1, i1 0
+  %nop10877 = alloca i1, i1 0
+  %nop10878 = alloca i1, i1 0
+  %nop10879 = alloca i1, i1 0
+  %nop10880 = alloca i1, i1 0
+  %nop10881 = alloca i1, i1 0
+  %nop10882 = alloca i1, i1 0
+  %nop10883 = alloca i1, i1 0
+  %nop10884 = alloca i1, i1 0
+  %nop10885 = alloca i1, i1 0
+  %nop10886 = alloca i1, i1 0
+  %nop10887 = alloca i1, i1 0
+  %nop10888 = alloca i1, i1 0
+  %nop10889 = alloca i1, i1 0
+  %nop10890 = alloca i1, i1 0
+  %nop10891 = alloca i1, i1 0
+  %nop10892 = alloca i1, i1 0
+  %nop10893 = alloca i1, i1 0
+  %nop10894 = alloca i1, i1 0
+  %nop10895 = alloca i1, i1 0
+  %nop10896 = alloca i1, i1 0
+  %nop10897 = alloca i1, i1 0
+  %nop10898 = alloca i1, i1 0
+  %nop10899 = alloca i1, i1 0
+  %nop10900 = alloca i1, i1 0
+  %nop10901 = alloca i1, i1 0
+  %nop10902 = alloca i1, i1 0
+  %nop10903 = alloca i1, i1 0
+  %nop10904 = alloca i1, i1 0
+  %nop10905 = alloca i1, i1 0
+  %nop10906 = alloca i1, i1 0
+  %nop10907 = alloca i1, i1 0
+  %nop10908 = alloca i1, i1 0
+  %nop10909 = alloca i1, i1 0
+  %nop10910 = alloca i1, i1 0
+  %nop10911 = alloca i1, i1 0
+  %nop10912 = alloca i1, i1 0
+  %nop10913 = alloca i1, i1 0
+  %nop10914 = alloca i1, i1 0
+  %nop10915 = alloca i1, i1 0
+  %nop10916 = alloca i1, i1 0
+  %nop10917 = alloca i1, i1 0
+  %nop10918 = alloca i1, i1 0
+  %nop10919 = alloca i1, i1 0
+  %nop10920 = alloca i1, i1 0
+  %nop10921 = alloca i1, i1 0
+  %nop10922 = alloca i1, i1 0
+  %nop10923 = alloca i1, i1 0
+  %nop10924 = alloca i1, i1 0
+  %nop10925 = alloca i1, i1 0
+  %nop10926 = alloca i1, i1 0
+  %nop10927 = alloca i1, i1 0
+  %nop10928 = alloca i1, i1 0
+  %nop10929 = alloca i1, i1 0
+  %nop10930 = alloca i1, i1 0
+  %nop10931 = alloca i1, i1 0
+  %nop10932 = alloca i1, i1 0
+  %nop10933 = alloca i1, i1 0
+  %nop10934 = alloca i1, i1 0
+  %nop10935 = alloca i1, i1 0
+  %nop10936 = alloca i1, i1 0
+  %nop10937 = alloca i1, i1 0
+  %nop10938 = alloca i1, i1 0
+  %nop10939 = alloca i1, i1 0
+  %nop10940 = alloca i1, i1 0
+  %nop10941 = alloca i1, i1 0
+  %nop10942 = alloca i1, i1 0
+  %nop10943 = alloca i1, i1 0
+  %nop10944 = alloca i1, i1 0
+  %nop10945 = alloca i1, i1 0
+  %nop10946 = alloca i1, i1 0
+  %nop10947 = alloca i1, i1 0
+  %nop10948 = alloca i1, i1 0
+  %nop10949 = alloca i1, i1 0
+  %nop10950 = alloca i1, i1 0
+  %nop10951 = alloca i1, i1 0
+  %nop10952 = alloca i1, i1 0
+  %nop10953 = alloca i1, i1 0
+  %nop10954 = alloca i1, i1 0
+  %nop10955 = alloca i1, i1 0
+  %nop10956 = alloca i1, i1 0
+  %nop10957 = alloca i1, i1 0
+  %nop10958 = alloca i1, i1 0
+  %nop10959 = alloca i1, i1 0
+  %nop10960 = alloca i1, i1 0
+  %nop10961 = alloca i1, i1 0
+  %nop10962 = alloca i1, i1 0
+  %nop10963 = alloca i1, i1 0
+  %nop10964 = alloca i1, i1 0
+  %nop10965 = alloca i1, i1 0
+  %nop10966 = alloca i1, i1 0
+  %nop10967 = alloca i1, i1 0
+  %nop10968 = alloca i1, i1 0
+  %nop10969 = alloca i1, i1 0
+  %nop10970 = alloca i1, i1 0
+  %nop10971 = alloca i1, i1 0
+  %nop10972 = alloca i1, i1 0
+  %nop10973 = alloca i1, i1 0
+  %nop10974 = alloca i1, i1 0
+  %nop10975 = alloca i1, i1 0
+  %nop10976 = alloca i1, i1 0
+  %nop10977 = alloca i1, i1 0
+  %nop10978 = alloca i1, i1 0
+  %nop10979 = alloca i1, i1 0
+  %nop10980 = alloca i1, i1 0
+  %nop10981 = alloca i1, i1 0
+  %nop10982 = alloca i1, i1 0
+  %nop10983 = alloca i1, i1 0
+  %nop10984 = alloca i1, i1 0
+  %nop10985 = alloca i1, i1 0
+  %nop10986 = alloca i1, i1 0
+  %nop10987 = alloca i1, i1 0
+  %nop10988 = alloca i1, i1 0
+  %nop10989 = alloca i1, i1 0
+  %nop10990 = alloca i1, i1 0
+  %nop10991 = alloca i1, i1 0
+  %nop10992 = alloca i1, i1 0
+  %nop10993 = alloca i1, i1 0
+  %nop10994 = alloca i1, i1 0
+  %nop10995 = alloca i1, i1 0
+  %nop10996 = alloca i1, i1 0
+  %nop10997 = alloca i1, i1 0
+  %nop10998 = alloca i1, i1 0
+  %nop10999 = alloca i1, i1 0
+  %nop11000 = alloca i1, i1 0
+  %nop11001 = alloca i1, i1 0
+  %nop11002 = alloca i1, i1 0
+  %nop11003 = alloca i1, i1 0
+  %nop11004 = alloca i1, i1 0
+  %nop11005 = alloca i1, i1 0
+  %nop11006 = alloca i1, i1 0
+  %nop11007 = alloca i1, i1 0
+  %nop11008 = alloca i1, i1 0
+  %nop11009 = alloca i1, i1 0
+  %nop11010 = alloca i1, i1 0
+  %nop11011 = alloca i1, i1 0
+  %nop11012 = alloca i1, i1 0
+  %nop11013 = alloca i1, i1 0
+  %nop11014 = alloca i1, i1 0
+  %nop11015 = alloca i1, i1 0
+  %nop11016 = alloca i1, i1 0
+  %nop11017 = alloca i1, i1 0
+  %nop11018 = alloca i1, i1 0
+  %nop11019 = alloca i1, i1 0
+  %nop11020 = alloca i1, i1 0
+  %nop11021 = alloca i1, i1 0
+  %nop11022 = alloca i1, i1 0
+  %nop11023 = alloca i1, i1 0
+  %nop11024 = alloca i1, i1 0
+  %nop11025 = alloca i1, i1 0
+  %nop11026 = alloca i1, i1 0
+  %nop11027 = alloca i1, i1 0
+  %nop11028 = alloca i1, i1 0
+  %nop11029 = alloca i1, i1 0
+  %nop11030 = alloca i1, i1 0
+  %nop11031 = alloca i1, i1 0
+  %nop11032 = alloca i1, i1 0
+  %nop11033 = alloca i1, i1 0
+  %nop11034 = alloca i1, i1 0
+  %nop11035 = alloca i1, i1 0
+  %nop11036 = alloca i1, i1 0
+  %nop11037 = alloca i1, i1 0
+  %nop11038 = alloca i1, i1 0
+  %nop11039 = alloca i1, i1 0
+  %nop11040 = alloca i1, i1 0
+  %nop11041 = alloca i1, i1 0
+  %nop11042 = alloca i1, i1 0
+  %nop11043 = alloca i1, i1 0
+  %nop11044 = alloca i1, i1 0
+  %nop11045 = alloca i1, i1 0
+  %nop11046 = alloca i1, i1 0
+  %nop11047 = alloca i1, i1 0
+  %nop11048 = alloca i1, i1 0
+  %nop11049 = alloca i1, i1 0
+  %nop11050 = alloca i1, i1 0
+  %nop11051 = alloca i1, i1 0
+  %nop11052 = alloca i1, i1 0
+  %nop11053 = alloca i1, i1 0
+  %nop11054 = alloca i1, i1 0
+  %nop11055 = alloca i1, i1 0
+  %nop11056 = alloca i1, i1 0
+  %nop11057 = alloca i1, i1 0
+  %nop11058 = alloca i1, i1 0
+  %nop11059 = alloca i1, i1 0
+  %nop11060 = alloca i1, i1 0
+  %nop11061 = alloca i1, i1 0
+  %nop11062 = alloca i1, i1 0
+  %nop11063 = alloca i1, i1 0
+  %nop11064 = alloca i1, i1 0
+  %nop11065 = alloca i1, i1 0
+  %nop11066 = alloca i1, i1 0
+  %nop11067 = alloca i1, i1 0
+  %nop11068 = alloca i1, i1 0
+  %nop11069 = alloca i1, i1 0
+  %nop11070 = alloca i1, i1 0
+  %nop11071 = alloca i1, i1 0
+  %nop11072 = alloca i1, i1 0
+  %nop11073 = alloca i1, i1 0
+  %nop11074 = alloca i1, i1 0
+  %nop11075 = alloca i1, i1 0
+  %nop11076 = alloca i1, i1 0
+  %nop11077 = alloca i1, i1 0
+  %nop11078 = alloca i1, i1 0
+  %nop11079 = alloca i1, i1 0
+  %nop11080 = alloca i1, i1 0
+  %nop11081 = alloca i1, i1 0
+  %nop11082 = alloca i1, i1 0
+  %nop11083 = alloca i1, i1 0
+  %nop11084 = alloca i1, i1 0
+  %nop11085 = alloca i1, i1 0
+  %nop11086 = alloca i1, i1 0
+  %nop11087 = alloca i1, i1 0
+  %nop11088 = alloca i1, i1 0
+  %nop11089 = alloca i1, i1 0
+  %nop11090 = alloca i1, i1 0
+  %nop11091 = alloca i1, i1 0
+  %nop11092 = alloca i1, i1 0
+  %nop11093 = alloca i1, i1 0
+  %nop11094 = alloca i1, i1 0
+  %nop11095 = alloca i1, i1 0
+  %nop11096 = alloca i1, i1 0
+  %nop11097 = alloca i1, i1 0
+  %nop11098 = alloca i1, i1 0
+  %nop11099 = alloca i1, i1 0
+  %nop11100 = alloca i1, i1 0
+  %nop11101 = alloca i1, i1 0
+  %nop11102 = alloca i1, i1 0
+  %nop11103 = alloca i1, i1 0
+  %nop11104 = alloca i1, i1 0
+  %nop11105 = alloca i1, i1 0
+  %nop11106 = alloca i1, i1 0
+  %nop11107 = alloca i1, i1 0
+  %nop11108 = alloca i1, i1 0
+  %nop11109 = alloca i1, i1 0
+  %nop11110 = alloca i1, i1 0
+  %nop11111 = alloca i1, i1 0
+  %nop11112 = alloca i1, i1 0
+  %nop11113 = alloca i1, i1 0
+  %nop11114 = alloca i1, i1 0
+  %nop11115 = alloca i1, i1 0
+  %nop11116 = alloca i1, i1 0
+  %nop11117 = alloca i1, i1 0
+  %nop11118 = alloca i1, i1 0
+  %nop11119 = alloca i1, i1 0
+  %nop11120 = alloca i1, i1 0
+  %nop11121 = alloca i1, i1 0
+  %nop11122 = alloca i1, i1 0
+  %nop11123 = alloca i1, i1 0
+  %nop11124 = alloca i1, i1 0
+  %nop11125 = alloca i1, i1 0
+  %nop11126 = alloca i1, i1 0
+  %nop11127 = alloca i1, i1 0
+  %nop11128 = alloca i1, i1 0
+  %nop11129 = alloca i1, i1 0
+  %nop11130 = alloca i1, i1 0
+  %nop11131 = alloca i1, i1 0
+  %nop11132 = alloca i1, i1 0
+  %nop11133 = alloca i1, i1 0
+  %nop11134 = alloca i1, i1 0
+  %nop11135 = alloca i1, i1 0
+  %nop11136 = alloca i1, i1 0
+  %nop11137 = alloca i1, i1 0
+  %nop11138 = alloca i1, i1 0
+  %nop11139 = alloca i1, i1 0
+  %nop11140 = alloca i1, i1 0
+  %nop11141 = alloca i1, i1 0
+  %nop11142 = alloca i1, i1 0
+  %nop11143 = alloca i1, i1 0
+  %nop11144 = alloca i1, i1 0
+  %nop11145 = alloca i1, i1 0
+  %nop11146 = alloca i1, i1 0
+  %nop11147 = alloca i1, i1 0
+  %nop11148 = alloca i1, i1 0
+  %nop11149 = alloca i1, i1 0
+  %nop11150 = alloca i1, i1 0
+  %nop11151 = alloca i1, i1 0
+  %nop11152 = alloca i1, i1 0
+  %nop11153 = alloca i1, i1 0
+  %nop11154 = alloca i1, i1 0
+  %nop11155 = alloca i1, i1 0
+  %nop11156 = alloca i1, i1 0
+  %nop11157 = alloca i1, i1 0
+  %nop11158 = alloca i1, i1 0
+  %nop11159 = alloca i1, i1 0
+  %nop11160 = alloca i1, i1 0
+  %nop11161 = alloca i1, i1 0
+  %nop11162 = alloca i1, i1 0
+  %nop11163 = alloca i1, i1 0
+  %nop11164 = alloca i1, i1 0
+  %nop11165 = alloca i1, i1 0
+  %nop11166 = alloca i1, i1 0
+  %nop11167 = alloca i1, i1 0
+  %nop11168 = alloca i1, i1 0
+  %nop11169 = alloca i1, i1 0
+  %nop11170 = alloca i1, i1 0
+  %nop11171 = alloca i1, i1 0
+  %nop11172 = alloca i1, i1 0
+  %nop11173 = alloca i1, i1 0
+  %nop11174 = alloca i1, i1 0
+  %nop11175 = alloca i1, i1 0
+  %nop11176 = alloca i1, i1 0
+  %nop11177 = alloca i1, i1 0
+  %nop11178 = alloca i1, i1 0
+  %nop11179 = alloca i1, i1 0
+  %nop11180 = alloca i1, i1 0
+  %nop11181 = alloca i1, i1 0
+  %nop11182 = alloca i1, i1 0
+  %nop11183 = alloca i1, i1 0
+  %nop11184 = alloca i1, i1 0
+  %nop11185 = alloca i1, i1 0
+  %nop11186 = alloca i1, i1 0
+  %nop11187 = alloca i1, i1 0
+  %nop11188 = alloca i1, i1 0
+  %nop11189 = alloca i1, i1 0
+  %nop11190 = alloca i1, i1 0
+  %nop11191 = alloca i1, i1 0
+  %nop11192 = alloca i1, i1 0
+  %nop11193 = alloca i1, i1 0
+  %nop11194 = alloca i1, i1 0
+  %nop11195 = alloca i1, i1 0
+  %nop11196 = alloca i1, i1 0
+  %nop11197 = alloca i1, i1 0
+  %nop11198 = alloca i1, i1 0
+  %nop11199 = alloca i1, i1 0
+  %nop11200 = alloca i1, i1 0
+  %nop11201 = alloca i1, i1 0
+  %nop11202 = alloca i1, i1 0
+  %nop11203 = alloca i1, i1 0
+  %nop11204 = alloca i1, i1 0
+  %nop11205 = alloca i1, i1 0
+  %nop11206 = alloca i1, i1 0
+  %nop11207 = alloca i1, i1 0
+  %nop11208 = alloca i1, i1 0
+  %nop11209 = alloca i1, i1 0
+  %nop11210 = alloca i1, i1 0
+  %nop11211 = alloca i1, i1 0
+  %nop11212 = alloca i1, i1 0
+  %nop11213 = alloca i1, i1 0
+  %nop11214 = alloca i1, i1 0
+  %nop11215 = alloca i1, i1 0
+  %nop11216 = alloca i1, i1 0
+  %nop11217 = alloca i1, i1 0
+  %nop11218 = alloca i1, i1 0
+  %nop11219 = alloca i1, i1 0
+  %nop11220 = alloca i1, i1 0
+  %nop11221 = alloca i1, i1 0
+  %nop11222 = alloca i1, i1 0
+  %nop11223 = alloca i1, i1 0
+  %nop11224 = alloca i1, i1 0
+  %nop11225 = alloca i1, i1 0
+  %nop11226 = alloca i1, i1 0
+  %nop11227 = alloca i1, i1 0
+  %nop11228 = alloca i1, i1 0
+  %nop11229 = alloca i1, i1 0
+  %nop11230 = alloca i1, i1 0
+  %nop11231 = alloca i1, i1 0
+  %nop11232 = alloca i1, i1 0
+  %nop11233 = alloca i1, i1 0
+  %nop11234 = alloca i1, i1 0
+  %nop11235 = alloca i1, i1 0
+  %nop11236 = alloca i1, i1 0
+  %nop11237 = alloca i1, i1 0
+  %nop11238 = alloca i1, i1 0
+  %nop11239 = alloca i1, i1 0
+  %nop11240 = alloca i1, i1 0
+  %nop11241 = alloca i1, i1 0
+  %nop11242 = alloca i1, i1 0
+  %nop11243 = alloca i1, i1 0
+  %nop11244 = alloca i1, i1 0
+  %nop11245 = alloca i1, i1 0
+  %nop11246 = alloca i1, i1 0
+  %nop11247 = alloca i1, i1 0
+  %nop11248 = alloca i1, i1 0
+  %nop11249 = alloca i1, i1 0
+  %nop11250 = alloca i1, i1 0
+  %nop11251 = alloca i1, i1 0
+  %nop11252 = alloca i1, i1 0
+  %nop11253 = alloca i1, i1 0
+  %nop11254 = alloca i1, i1 0
+  %nop11255 = alloca i1, i1 0
+  %nop11256 = alloca i1, i1 0
+  %nop11257 = alloca i1, i1 0
+  %nop11258 = alloca i1, i1 0
+  %nop11259 = alloca i1, i1 0
+  %nop11260 = alloca i1, i1 0
+  %nop11261 = alloca i1, i1 0
+  %nop11262 = alloca i1, i1 0
+  %nop11263 = alloca i1, i1 0
+  %nop11264 = alloca i1, i1 0
+  %nop11265 = alloca i1, i1 0
+  %nop11266 = alloca i1, i1 0
+  %nop11267 = alloca i1, i1 0
+  %nop11268 = alloca i1, i1 0
+  %nop11269 = alloca i1, i1 0
+  %nop11270 = alloca i1, i1 0
+  %nop11271 = alloca i1, i1 0
+  %nop11272 = alloca i1, i1 0
+  %nop11273 = alloca i1, i1 0
+  %nop11274 = alloca i1, i1 0
+  %nop11275 = alloca i1, i1 0
+  %nop11276 = alloca i1, i1 0
+  %nop11277 = alloca i1, i1 0
+  %nop11278 = alloca i1, i1 0
+  %nop11279 = alloca i1, i1 0
+  %nop11280 = alloca i1, i1 0
+  %nop11281 = alloca i1, i1 0
+  %nop11282 = alloca i1, i1 0
+  %nop11283 = alloca i1, i1 0
+  %nop11284 = alloca i1, i1 0
+  %nop11285 = alloca i1, i1 0
+  %nop11286 = alloca i1, i1 0
+  %nop11287 = alloca i1, i1 0
+  %nop11288 = alloca i1, i1 0
+  %nop11289 = alloca i1, i1 0
+  %nop11290 = alloca i1, i1 0
+  %nop11291 = alloca i1, i1 0
+  %nop11292 = alloca i1, i1 0
+  %nop11293 = alloca i1, i1 0
+  %nop11294 = alloca i1, i1 0
+  %nop11295 = alloca i1, i1 0
+  %nop11296 = alloca i1, i1 0
+  %nop11297 = alloca i1, i1 0
+  %nop11298 = alloca i1, i1 0
+  %nop11299 = alloca i1, i1 0
+  %nop11300 = alloca i1, i1 0
+  %nop11301 = alloca i1, i1 0
+  %nop11302 = alloca i1, i1 0
+  %nop11303 = alloca i1, i1 0
+  %nop11304 = alloca i1, i1 0
+  %nop11305 = alloca i1, i1 0
+  %nop11306 = alloca i1, i1 0
+  %nop11307 = alloca i1, i1 0
+  %nop11308 = alloca i1, i1 0
+  %nop11309 = alloca i1, i1 0
+  %nop11310 = alloca i1, i1 0
+  %nop11311 = alloca i1, i1 0
+  %nop11312 = alloca i1, i1 0
+  %nop11313 = alloca i1, i1 0
+  %nop11314 = alloca i1, i1 0
+  %nop11315 = alloca i1, i1 0
+  %nop11316 = alloca i1, i1 0
+  %nop11317 = alloca i1, i1 0
+  %nop11318 = alloca i1, i1 0
+  %nop11319 = alloca i1, i1 0
+  %nop11320 = alloca i1, i1 0
+  %nop11321 = alloca i1, i1 0
+  %nop11322 = alloca i1, i1 0
+  %nop11323 = alloca i1, i1 0
+  %nop11324 = alloca i1, i1 0
+  %nop11325 = alloca i1, i1 0
+  %nop11326 = alloca i1, i1 0
+  %nop11327 = alloca i1, i1 0
+  %nop11328 = alloca i1, i1 0
+  %nop11329 = alloca i1, i1 0
+  %nop11330 = alloca i1, i1 0
+  %nop11331 = alloca i1, i1 0
+  %nop11332 = alloca i1, i1 0
+  %nop11333 = alloca i1, i1 0
+  %nop11334 = alloca i1, i1 0
+  %nop11335 = alloca i1, i1 0
+  %nop11336 = alloca i1, i1 0
+  %nop11337 = alloca i1, i1 0
+  %nop11338 = alloca i1, i1 0
+  %nop11339 = alloca i1, i1 0
+  %nop11340 = alloca i1, i1 0
+  %nop11341 = alloca i1, i1 0
+  %nop11342 = alloca i1, i1 0
+  %nop11343 = alloca i1, i1 0
+  %nop11344 = alloca i1, i1 0
+  %nop11345 = alloca i1, i1 0
+  %nop11346 = alloca i1, i1 0
+  %nop11347 = alloca i1, i1 0
+  %nop11348 = alloca i1, i1 0
+  %nop11349 = alloca i1, i1 0
+  %nop11350 = alloca i1, i1 0
+  %nop11351 = alloca i1, i1 0
+  %nop11352 = alloca i1, i1 0
+  %nop11353 = alloca i1, i1 0
+  %nop11354 = alloca i1, i1 0
+  %nop11355 = alloca i1, i1 0
+  %nop11356 = alloca i1, i1 0
+  %nop11357 = alloca i1, i1 0
+  %nop11358 = alloca i1, i1 0
+  %nop11359 = alloca i1, i1 0
+  %nop11360 = alloca i1, i1 0
+  %nop11361 = alloca i1, i1 0
+  %nop11362 = alloca i1, i1 0
+  %nop11363 = alloca i1, i1 0
+  %nop11364 = alloca i1, i1 0
+  %nop11365 = alloca i1, i1 0
+  %nop11366 = alloca i1, i1 0
+  %nop11367 = alloca i1, i1 0
+  %nop11368 = alloca i1, i1 0
+  %nop11369 = alloca i1, i1 0
+  %nop11370 = alloca i1, i1 0
+  %nop11371 = alloca i1, i1 0
+  %nop11372 = alloca i1, i1 0
+  %nop11373 = alloca i1, i1 0
+  %nop11374 = alloca i1, i1 0
+  %nop11375 = alloca i1, i1 0
+  %nop11376 = alloca i1, i1 0
+  %nop11377 = alloca i1, i1 0
+  %nop11378 = alloca i1, i1 0
+  %nop11379 = alloca i1, i1 0
+  %nop11380 = alloca i1, i1 0
+  %nop11381 = alloca i1, i1 0
+  %nop11382 = alloca i1, i1 0
+  %nop11383 = alloca i1, i1 0
+  %nop11384 = alloca i1, i1 0
+  %nop11385 = alloca i1, i1 0
+  %nop11386 = alloca i1, i1 0
+  %nop11387 = alloca i1, i1 0
+  %nop11388 = alloca i1, i1 0
+  %nop11389 = alloca i1, i1 0
+  %nop11390 = alloca i1, i1 0
+  %nop11391 = alloca i1, i1 0
+  %nop11392 = alloca i1, i1 0
+  %nop11393 = alloca i1, i1 0
+  %nop11394 = alloca i1, i1 0
+  %nop11395 = alloca i1, i1 0
+  %nop11396 = alloca i1, i1 0
+  %nop11397 = alloca i1, i1 0
+  %nop11398 = alloca i1, i1 0
+  %nop11399 = alloca i1, i1 0
+  %nop11400 = alloca i1, i1 0
+  %nop11401 = alloca i1, i1 0
+  %nop11402 = alloca i1, i1 0
+  %nop11403 = alloca i1, i1 0
+  %nop11404 = alloca i1, i1 0
+  %nop11405 = alloca i1, i1 0
+  %nop11406 = alloca i1, i1 0
+  %nop11407 = alloca i1, i1 0
+  %nop11408 = alloca i1, i1 0
+  %nop11409 = alloca i1, i1 0
+  %nop11410 = alloca i1, i1 0
+  %nop11411 = alloca i1, i1 0
+  %nop11412 = alloca i1, i1 0
+  %nop11413 = alloca i1, i1 0
+  %nop11414 = alloca i1, i1 0
+  %nop11415 = alloca i1, i1 0
+  %nop11416 = alloca i1, i1 0
+  %nop11417 = alloca i1, i1 0
+  %nop11418 = alloca i1, i1 0
+  %nop11419 = alloca i1, i1 0
+  %nop11420 = alloca i1, i1 0
+  %nop11421 = alloca i1, i1 0
+  %nop11422 = alloca i1, i1 0
+  %nop11423 = alloca i1, i1 0
+  %nop11424 = alloca i1, i1 0
+  %nop11425 = alloca i1, i1 0
+  %nop11426 = alloca i1, i1 0
+  %nop11427 = alloca i1, i1 0
+  %nop11428 = alloca i1, i1 0
+  %nop11429 = alloca i1, i1 0
+  %nop11430 = alloca i1, i1 0
+  %nop11431 = alloca i1, i1 0
+  %nop11432 = alloca i1, i1 0
+  %nop11433 = alloca i1, i1 0
+  %nop11434 = alloca i1, i1 0
+  %nop11435 = alloca i1, i1 0
+  %nop11436 = alloca i1, i1 0
+  %nop11437 = alloca i1, i1 0
+  %nop11438 = alloca i1, i1 0
+  %nop11439 = alloca i1, i1 0
+  %nop11440 = alloca i1, i1 0
+  %nop11441 = alloca i1, i1 0
+  %nop11442 = alloca i1, i1 0
+  %nop11443 = alloca i1, i1 0
+  %nop11444 = alloca i1, i1 0
+  %nop11445 = alloca i1, i1 0
+  %nop11446 = alloca i1, i1 0
+  %nop11447 = alloca i1, i1 0
+  %nop11448 = alloca i1, i1 0
+  %nop11449 = alloca i1, i1 0
+  %nop11450 = alloca i1, i1 0
+  %nop11451 = alloca i1, i1 0
+  %nop11452 = alloca i1, i1 0
+  %nop11453 = alloca i1, i1 0
+  %nop11454 = alloca i1, i1 0
+  %nop11455 = alloca i1, i1 0
+  %nop11456 = alloca i1, i1 0
+  %nop11457 = alloca i1, i1 0
+  %nop11458 = alloca i1, i1 0
+  %nop11459 = alloca i1, i1 0
+  %nop11460 = alloca i1, i1 0
+  %nop11461 = alloca i1, i1 0
+  %nop11462 = alloca i1, i1 0
+  %nop11463 = alloca i1, i1 0
+  %nop11464 = alloca i1, i1 0
+  %nop11465 = alloca i1, i1 0
+  %nop11466 = alloca i1, i1 0
+  %nop11467 = alloca i1, i1 0
+  %nop11468 = alloca i1, i1 0
+  %nop11469 = alloca i1, i1 0
+  %nop11470 = alloca i1, i1 0
+  %nop11471 = alloca i1, i1 0
+  %nop11472 = alloca i1, i1 0
+  %nop11473 = alloca i1, i1 0
+  %nop11474 = alloca i1, i1 0
+  %nop11475 = alloca i1, i1 0
+  %nop11476 = alloca i1, i1 0
+  %nop11477 = alloca i1, i1 0
+  %nop11478 = alloca i1, i1 0
+  %nop11479 = alloca i1, i1 0
+  %nop11480 = alloca i1, i1 0
+  %nop11481 = alloca i1, i1 0
+  %nop11482 = alloca i1, i1 0
+  %nop11483 = alloca i1, i1 0
+  %nop11484 = alloca i1, i1 0
+  %nop11485 = alloca i1, i1 0
+  %nop11486 = alloca i1, i1 0
+  %nop11487 = alloca i1, i1 0
+  %nop11488 = alloca i1, i1 0
+  %nop11489 = alloca i1, i1 0
+  %nop11490 = alloca i1, i1 0
+  %nop11491 = alloca i1, i1 0
+  %nop11492 = alloca i1, i1 0
+  %nop11493 = alloca i1, i1 0
+  %nop11494 = alloca i1, i1 0
+  %nop11495 = alloca i1, i1 0
+  %nop11496 = alloca i1, i1 0
+  %nop11497 = alloca i1, i1 0
+  %nop11498 = alloca i1, i1 0
+  %nop11499 = alloca i1, i1 0
+  %nop11500 = alloca i1, i1 0
+  %nop11501 = alloca i1, i1 0
+  %nop11502 = alloca i1, i1 0
+  %nop11503 = alloca i1, i1 0
+  %nop11504 = alloca i1, i1 0
+  %nop11505 = alloca i1, i1 0
+  %nop11506 = alloca i1, i1 0
+  %nop11507 = alloca i1, i1 0
+  %nop11508 = alloca i1, i1 0
+  %nop11509 = alloca i1, i1 0
+  %nop11510 = alloca i1, i1 0
+  %nop11511 = alloca i1, i1 0
+  %nop11512 = alloca i1, i1 0
+  %nop11513 = alloca i1, i1 0
+  %nop11514 = alloca i1, i1 0
+  %nop11515 = alloca i1, i1 0
+  %nop11516 = alloca i1, i1 0
+  %nop11517 = alloca i1, i1 0
+  %nop11518 = alloca i1, i1 0
+  %nop11519 = alloca i1, i1 0
+  %nop11520 = alloca i1, i1 0
+  %nop11521 = alloca i1, i1 0
+  %nop11522 = alloca i1, i1 0
+  %nop11523 = alloca i1, i1 0
+  %nop11524 = alloca i1, i1 0
+  %nop11525 = alloca i1, i1 0
+  %nop11526 = alloca i1, i1 0
+  %nop11527 = alloca i1, i1 0
+  %nop11528 = alloca i1, i1 0
+  %nop11529 = alloca i1, i1 0
+  %nop11530 = alloca i1, i1 0
+  %nop11531 = alloca i1, i1 0
+  %nop11532 = alloca i1, i1 0
+  %nop11533 = alloca i1, i1 0
+  %nop11534 = alloca i1, i1 0
+  %nop11535 = alloca i1, i1 0
+  %nop11536 = alloca i1, i1 0
+  %nop11537 = alloca i1, i1 0
+  %nop11538 = alloca i1, i1 0
+  %nop11539 = alloca i1, i1 0
+  %nop11540 = alloca i1, i1 0
+  %nop11541 = alloca i1, i1 0
+  %nop11542 = alloca i1, i1 0
+  %nop11543 = alloca i1, i1 0
+  %nop11544 = alloca i1, i1 0
+  %nop11545 = alloca i1, i1 0
+  %nop11546 = alloca i1, i1 0
+  %nop11547 = alloca i1, i1 0
+  %nop11548 = alloca i1, i1 0
+  %nop11549 = alloca i1, i1 0
+  %nop11550 = alloca i1, i1 0
+  %nop11551 = alloca i1, i1 0
+  %nop11552 = alloca i1, i1 0
+  %nop11553 = alloca i1, i1 0
+  %nop11554 = alloca i1, i1 0
+  %nop11555 = alloca i1, i1 0
+  %nop11556 = alloca i1, i1 0
+  %nop11557 = alloca i1, i1 0
+  %nop11558 = alloca i1, i1 0
+  %nop11559 = alloca i1, i1 0
+  %nop11560 = alloca i1, i1 0
+  %nop11561 = alloca i1, i1 0
+  %nop11562 = alloca i1, i1 0
+  %nop11563 = alloca i1, i1 0
+  %nop11564 = alloca i1, i1 0
+  %nop11565 = alloca i1, i1 0
+  %nop11566 = alloca i1, i1 0
+  %nop11567 = alloca i1, i1 0
+  %nop11568 = alloca i1, i1 0
+  %nop11569 = alloca i1, i1 0
+  %nop11570 = alloca i1, i1 0
+  %nop11571 = alloca i1, i1 0
+  %nop11572 = alloca i1, i1 0
+  %nop11573 = alloca i1, i1 0
+  %nop11574 = alloca i1, i1 0
+  %nop11575 = alloca i1, i1 0
+  %nop11576 = alloca i1, i1 0
+  %nop11577 = alloca i1, i1 0
+  %nop11578 = alloca i1, i1 0
+  %nop11579 = alloca i1, i1 0
+  %nop11580 = alloca i1, i1 0
+  %nop11581 = alloca i1, i1 0
+  %nop11582 = alloca i1, i1 0
+  %nop11583 = alloca i1, i1 0
+  %nop11584 = alloca i1, i1 0
+  %nop11585 = alloca i1, i1 0
+  %nop11586 = alloca i1, i1 0
+  %nop11587 = alloca i1, i1 0
+  %nop11588 = alloca i1, i1 0
+  %nop11589 = alloca i1, i1 0
+  %nop11590 = alloca i1, i1 0
+  %nop11591 = alloca i1, i1 0
+  %nop11592 = alloca i1, i1 0
+  %nop11593 = alloca i1, i1 0
+  %nop11594 = alloca i1, i1 0
+  %nop11595 = alloca i1, i1 0
+  %nop11596 = alloca i1, i1 0
+  %nop11597 = alloca i1, i1 0
+  %nop11598 = alloca i1, i1 0
+  %nop11599 = alloca i1, i1 0
+  %nop11600 = alloca i1, i1 0
+  %nop11601 = alloca i1, i1 0
+  %nop11602 = alloca i1, i1 0
+  %nop11603 = alloca i1, i1 0
+  %nop11604 = alloca i1, i1 0
+  %nop11605 = alloca i1, i1 0
+  %nop11606 = alloca i1, i1 0
+  %nop11607 = alloca i1, i1 0
+  %nop11608 = alloca i1, i1 0
+  %nop11609 = alloca i1, i1 0
+  %nop11610 = alloca i1, i1 0
+  %nop11611 = alloca i1, i1 0
+  %nop11612 = alloca i1, i1 0
+  %nop11613 = alloca i1, i1 0
+  %nop11614 = alloca i1, i1 0
+  %nop11615 = alloca i1, i1 0
+  %nop11616 = alloca i1, i1 0
+  %nop11617 = alloca i1, i1 0
+  %nop11618 = alloca i1, i1 0
+  %nop11619 = alloca i1, i1 0
+  %nop11620 = alloca i1, i1 0
+  %nop11621 = alloca i1, i1 0
+  %nop11622 = alloca i1, i1 0
+  %nop11623 = alloca i1, i1 0
+  %nop11624 = alloca i1, i1 0
+  %nop11625 = alloca i1, i1 0
+  %nop11626 = alloca i1, i1 0
+  %nop11627 = alloca i1, i1 0
+  %nop11628 = alloca i1, i1 0
+  %nop11629 = alloca i1, i1 0
+  %nop11630 = alloca i1, i1 0
+  %nop11631 = alloca i1, i1 0
+  %nop11632 = alloca i1, i1 0
+  %nop11633 = alloca i1, i1 0
+  %nop11634 = alloca i1, i1 0
+  %nop11635 = alloca i1, i1 0
+  %nop11636 = alloca i1, i1 0
+  %nop11637 = alloca i1, i1 0
+  %nop11638 = alloca i1, i1 0
+  %nop11639 = alloca i1, i1 0
+  %nop11640 = alloca i1, i1 0
+  %nop11641 = alloca i1, i1 0
+  %nop11642 = alloca i1, i1 0
+  %nop11643 = alloca i1, i1 0
+  %nop11644 = alloca i1, i1 0
+  %nop11645 = alloca i1, i1 0
+  %nop11646 = alloca i1, i1 0
+  %nop11647 = alloca i1, i1 0
+  %nop11648 = alloca i1, i1 0
+  %nop11649 = alloca i1, i1 0
+  %nop11650 = alloca i1, i1 0
+  %nop11651 = alloca i1, i1 0
+  %nop11652 = alloca i1, i1 0
+  %nop11653 = alloca i1, i1 0
+  %nop11654 = alloca i1, i1 0
+  %nop11655 = alloca i1, i1 0
+  %nop11656 = alloca i1, i1 0
+  %nop11657 = alloca i1, i1 0
+  %nop11658 = alloca i1, i1 0
+  %nop11659 = alloca i1, i1 0
+  %nop11660 = alloca i1, i1 0
+  %nop11661 = alloca i1, i1 0
+  %nop11662 = alloca i1, i1 0
+  %nop11663 = alloca i1, i1 0
+  %nop11664 = alloca i1, i1 0
+  %nop11665 = alloca i1, i1 0
+  %nop11666 = alloca i1, i1 0
+  %nop11667 = alloca i1, i1 0
+  %nop11668 = alloca i1, i1 0
+  %nop11669 = alloca i1, i1 0
+  %nop11670 = alloca i1, i1 0
+  %nop11671 = alloca i1, i1 0
+  %nop11672 = alloca i1, i1 0
+  %nop11673 = alloca i1, i1 0
+  %nop11674 = alloca i1, i1 0
+  %nop11675 = alloca i1, i1 0
+  %nop11676 = alloca i1, i1 0
+  %nop11677 = alloca i1, i1 0
+  %nop11678 = alloca i1, i1 0
+  %nop11679 = alloca i1, i1 0
+  %nop11680 = alloca i1, i1 0
+  %nop11681 = alloca i1, i1 0
+  %nop11682 = alloca i1, i1 0
+  %nop11683 = alloca i1, i1 0
+  %nop11684 = alloca i1, i1 0
+  %nop11685 = alloca i1, i1 0
+  %nop11686 = alloca i1, i1 0
+  %nop11687 = alloca i1, i1 0
+  %nop11688 = alloca i1, i1 0
+  %nop11689 = alloca i1, i1 0
+  %nop11690 = alloca i1, i1 0
+  %nop11691 = alloca i1, i1 0
+  %nop11692 = alloca i1, i1 0
+  %nop11693 = alloca i1, i1 0
+  %nop11694 = alloca i1, i1 0
+  %nop11695 = alloca i1, i1 0
+  %nop11696 = alloca i1, i1 0
+  %nop11697 = alloca i1, i1 0
+  %nop11698 = alloca i1, i1 0
+  %nop11699 = alloca i1, i1 0
+  %nop11700 = alloca i1, i1 0
+  %nop11701 = alloca i1, i1 0
+  %nop11702 = alloca i1, i1 0
+  %nop11703 = alloca i1, i1 0
+  %nop11704 = alloca i1, i1 0
+  %nop11705 = alloca i1, i1 0
+  %nop11706 = alloca i1, i1 0
+  %nop11707 = alloca i1, i1 0
+  %nop11708 = alloca i1, i1 0
+  %nop11709 = alloca i1, i1 0
+  %nop11710 = alloca i1, i1 0
+  %nop11711 = alloca i1, i1 0
+  %nop11712 = alloca i1, i1 0
+  %nop11713 = alloca i1, i1 0
+  %nop11714 = alloca i1, i1 0
+  %nop11715 = alloca i1, i1 0
+  %nop11716 = alloca i1, i1 0
+  %nop11717 = alloca i1, i1 0
+  %nop11718 = alloca i1, i1 0
+  %nop11719 = alloca i1, i1 0
+  %nop11720 = alloca i1, i1 0
+  %nop11721 = alloca i1, i1 0
+  %nop11722 = alloca i1, i1 0
+  %nop11723 = alloca i1, i1 0
+  %nop11724 = alloca i1, i1 0
+  %nop11725 = alloca i1, i1 0
+  %nop11726 = alloca i1, i1 0
+  %nop11727 = alloca i1, i1 0
+  %nop11728 = alloca i1, i1 0
+  %nop11729 = alloca i1, i1 0
+  %nop11730 = alloca i1, i1 0
+  %nop11731 = alloca i1, i1 0
+  %nop11732 = alloca i1, i1 0
+  %nop11733 = alloca i1, i1 0
+  %nop11734 = alloca i1, i1 0
+  %nop11735 = alloca i1, i1 0
+  %nop11736 = alloca i1, i1 0
+  %nop11737 = alloca i1, i1 0
+  %nop11738 = alloca i1, i1 0
+  %nop11739 = alloca i1, i1 0
+  %nop11740 = alloca i1, i1 0
+  %nop11741 = alloca i1, i1 0
+  %nop11742 = alloca i1, i1 0
+  %nop11743 = alloca i1, i1 0
+  %nop11744 = alloca i1, i1 0
+  %nop11745 = alloca i1, i1 0
+  %nop11746 = alloca i1, i1 0
+  %nop11747 = alloca i1, i1 0
+  %nop11748 = alloca i1, i1 0
+  %nop11749 = alloca i1, i1 0
+  %nop11750 = alloca i1, i1 0
+  %nop11751 = alloca i1, i1 0
+  %nop11752 = alloca i1, i1 0
+  %nop11753 = alloca i1, i1 0
+  %nop11754 = alloca i1, i1 0
+  %nop11755 = alloca i1, i1 0
+  %nop11756 = alloca i1, i1 0
+  %nop11757 = alloca i1, i1 0
+  %nop11758 = alloca i1, i1 0
+  %nop11759 = alloca i1, i1 0
+  %nop11760 = alloca i1, i1 0
+  %nop11761 = alloca i1, i1 0
+  %nop11762 = alloca i1, i1 0
+  %nop11763 = alloca i1, i1 0
+  %nop11764 = alloca i1, i1 0
+  %nop11765 = alloca i1, i1 0
+  %nop11766 = alloca i1, i1 0
+  %nop11767 = alloca i1, i1 0
+  %nop11768 = alloca i1, i1 0
+  %nop11769 = alloca i1, i1 0
+  %nop11770 = alloca i1, i1 0
+  %nop11771 = alloca i1, i1 0
+  %nop11772 = alloca i1, i1 0
+  %nop11773 = alloca i1, i1 0
+  %nop11774 = alloca i1, i1 0
+  %nop11775 = alloca i1, i1 0
+  %nop11776 = alloca i1, i1 0
+  %nop11777 = alloca i1, i1 0
+  %nop11778 = alloca i1, i1 0
+  %nop11779 = alloca i1, i1 0
+  %nop11780 = alloca i1, i1 0
+  %nop11781 = alloca i1, i1 0
+  %nop11782 = alloca i1, i1 0
+  %nop11783 = alloca i1, i1 0
+  %nop11784 = alloca i1, i1 0
+  %nop11785 = alloca i1, i1 0
+  %nop11786 = alloca i1, i1 0
+  %nop11787 = alloca i1, i1 0
+  %nop11788 = alloca i1, i1 0
+  %nop11789 = alloca i1, i1 0
+  %nop11790 = alloca i1, i1 0
+  %nop11791 = alloca i1, i1 0
+  %nop11792 = alloca i1, i1 0
+  %nop11793 = alloca i1, i1 0
+  %nop11794 = alloca i1, i1 0
+  %nop11795 = alloca i1, i1 0
+  %nop11796 = alloca i1, i1 0
+  %nop11797 = alloca i1, i1 0
+  %nop11798 = alloca i1, i1 0
+  %nop11799 = alloca i1, i1 0
+  %nop11800 = alloca i1, i1 0
+  %nop11801 = alloca i1, i1 0
+  %nop11802 = alloca i1, i1 0
+  %nop11803 = alloca i1, i1 0
+  %nop11804 = alloca i1, i1 0
+  %nop11805 = alloca i1, i1 0
+  %nop11806 = alloca i1, i1 0
+  %nop11807 = alloca i1, i1 0
+  %nop11808 = alloca i1, i1 0
+  %nop11809 = alloca i1, i1 0
+  %nop11810 = alloca i1, i1 0
+  %nop11811 = alloca i1, i1 0
+  %nop11812 = alloca i1, i1 0
+  %nop11813 = alloca i1, i1 0
+  %nop11814 = alloca i1, i1 0
+  %nop11815 = alloca i1, i1 0
+  %nop11816 = alloca i1, i1 0
+  %nop11817 = alloca i1, i1 0
+  %nop11818 = alloca i1, i1 0
+  %nop11819 = alloca i1, i1 0
+  %nop11820 = alloca i1, i1 0
+  %nop11821 = alloca i1, i1 0
+  %nop11822 = alloca i1, i1 0
+  %nop11823 = alloca i1, i1 0
+  %nop11824 = alloca i1, i1 0
+  %nop11825 = alloca i1, i1 0
+  %nop11826 = alloca i1, i1 0
+  %nop11827 = alloca i1, i1 0
+  %nop11828 = alloca i1, i1 0
+  %nop11829 = alloca i1, i1 0
+  %nop11830 = alloca i1, i1 0
+  %nop11831 = alloca i1, i1 0
+  %nop11832 = alloca i1, i1 0
+  %nop11833 = alloca i1, i1 0
+  %nop11834 = alloca i1, i1 0
+  %nop11835 = alloca i1, i1 0
+  %nop11836 = alloca i1, i1 0
+  %nop11837 = alloca i1, i1 0
+  %nop11838 = alloca i1, i1 0
+  %nop11839 = alloca i1, i1 0
+  %nop11840 = alloca i1, i1 0
+  %nop11841 = alloca i1, i1 0
+  %nop11842 = alloca i1, i1 0
+  %nop11843 = alloca i1, i1 0
+  %nop11844 = alloca i1, i1 0
+  %nop11845 = alloca i1, i1 0
+  %nop11846 = alloca i1, i1 0
+  %nop11847 = alloca i1, i1 0
+  %nop11848 = alloca i1, i1 0
+  %nop11849 = alloca i1, i1 0
+  %nop11850 = alloca i1, i1 0
+  %nop11851 = alloca i1, i1 0
+  %nop11852 = alloca i1, i1 0
+  %nop11853 = alloca i1, i1 0
+  %nop11854 = alloca i1, i1 0
+  %nop11855 = alloca i1, i1 0
+  %nop11856 = alloca i1, i1 0
+  %nop11857 = alloca i1, i1 0
+  %nop11858 = alloca i1, i1 0
+  %nop11859 = alloca i1, i1 0
+  %nop11860 = alloca i1, i1 0
+  %nop11861 = alloca i1, i1 0
+  %nop11862 = alloca i1, i1 0
+  %nop11863 = alloca i1, i1 0
+  %nop11864 = alloca i1, i1 0
+  %nop11865 = alloca i1, i1 0
+  %nop11866 = alloca i1, i1 0
+  %nop11867 = alloca i1, i1 0
+  %nop11868 = alloca i1, i1 0
+  %nop11869 = alloca i1, i1 0
+  %nop11870 = alloca i1, i1 0
+  %nop11871 = alloca i1, i1 0
+  %nop11872 = alloca i1, i1 0
+  %nop11873 = alloca i1, i1 0
+  %nop11874 = alloca i1, i1 0
+  %nop11875 = alloca i1, i1 0
+  %nop11876 = alloca i1, i1 0
+  %nop11877 = alloca i1, i1 0
+  %nop11878 = alloca i1, i1 0
+  %nop11879 = alloca i1, i1 0
+  %nop11880 = alloca i1, i1 0
+  %nop11881 = alloca i1, i1 0
+  %nop11882 = alloca i1, i1 0
+  %nop11883 = alloca i1, i1 0
+  %nop11884 = alloca i1, i1 0
+  %nop11885 = alloca i1, i1 0
+  %nop11886 = alloca i1, i1 0
+  %nop11887 = alloca i1, i1 0
+  %nop11888 = alloca i1, i1 0
+  %nop11889 = alloca i1, i1 0
+  %nop11890 = alloca i1, i1 0
+  %nop11891 = alloca i1, i1 0
+  %nop11892 = alloca i1, i1 0
+  %nop11893 = alloca i1, i1 0
+  %nop11894 = alloca i1, i1 0
+  %nop11895 = alloca i1, i1 0
+  %nop11896 = alloca i1, i1 0
+  %nop11897 = alloca i1, i1 0
+  %nop11898 = alloca i1, i1 0
+  %nop11899 = alloca i1, i1 0
+  %nop11900 = alloca i1, i1 0
+  %nop11901 = alloca i1, i1 0
+  %nop11902 = alloca i1, i1 0
+  %nop11903 = alloca i1, i1 0
+  %nop11904 = alloca i1, i1 0
+  %nop11905 = alloca i1, i1 0
+  %nop11906 = alloca i1, i1 0
+  %nop11907 = alloca i1, i1 0
+  %nop11908 = alloca i1, i1 0
+  %nop11909 = alloca i1, i1 0
+  %nop11910 = alloca i1, i1 0
+  %nop11911 = alloca i1, i1 0
+  %nop11912 = alloca i1, i1 0
+  %nop11913 = alloca i1, i1 0
+  %nop11914 = alloca i1, i1 0
+  %nop11915 = alloca i1, i1 0
+  %nop11916 = alloca i1, i1 0
+  %nop11917 = alloca i1, i1 0
+  %nop11918 = alloca i1, i1 0
+  %nop11919 = alloca i1, i1 0
+  %nop11920 = alloca i1, i1 0
+  %nop11921 = alloca i1, i1 0
+  %nop11922 = alloca i1, i1 0
+  %nop11923 = alloca i1, i1 0
+  %nop11924 = alloca i1, i1 0
+  %nop11925 = alloca i1, i1 0
+  %nop11926 = alloca i1, i1 0
+  %nop11927 = alloca i1, i1 0
+  %nop11928 = alloca i1, i1 0
+  %nop11929 = alloca i1, i1 0
+  %nop11930 = alloca i1, i1 0
+  %nop11931 = alloca i1, i1 0
+  %nop11932 = alloca i1, i1 0
+  %nop11933 = alloca i1, i1 0
+  %nop11934 = alloca i1, i1 0
+  %nop11935 = alloca i1, i1 0
+  %nop11936 = alloca i1, i1 0
+  %nop11937 = alloca i1, i1 0
+  %nop11938 = alloca i1, i1 0
+  %nop11939 = alloca i1, i1 0
+  %nop11940 = alloca i1, i1 0
+  %nop11941 = alloca i1, i1 0
+  %nop11942 = alloca i1, i1 0
+  %nop11943 = alloca i1, i1 0
+  %nop11944 = alloca i1, i1 0
+  %nop11945 = alloca i1, i1 0
+  %nop11946 = alloca i1, i1 0
+  %nop11947 = alloca i1, i1 0
+  %nop11948 = alloca i1, i1 0
+  %nop11949 = alloca i1, i1 0
+  %nop11950 = alloca i1, i1 0
+  %nop11951 = alloca i1, i1 0
+  %nop11952 = alloca i1, i1 0
+  %nop11953 = alloca i1, i1 0
+  %nop11954 = alloca i1, i1 0
+  %nop11955 = alloca i1, i1 0
+  %nop11956 = alloca i1, i1 0
+  %nop11957 = alloca i1, i1 0
+  %nop11958 = alloca i1, i1 0
+  %nop11959 = alloca i1, i1 0
+  %nop11960 = alloca i1, i1 0
+  %nop11961 = alloca i1, i1 0
+  %nop11962 = alloca i1, i1 0
+  %nop11963 = alloca i1, i1 0
+  %nop11964 = alloca i1, i1 0
+  %nop11965 = alloca i1, i1 0
+  %nop11966 = alloca i1, i1 0
+  %nop11967 = alloca i1, i1 0
+  %nop11968 = alloca i1, i1 0
+  %nop11969 = alloca i1, i1 0
+  %nop11970 = alloca i1, i1 0
+  %nop11971 = alloca i1, i1 0
+  %nop11972 = alloca i1, i1 0
+  %nop11973 = alloca i1, i1 0
+  %nop11974 = alloca i1, i1 0
+  %nop11975 = alloca i1, i1 0
+  %nop11976 = alloca i1, i1 0
+  %nop11977 = alloca i1, i1 0
+  %nop11978 = alloca i1, i1 0
+  %nop11979 = alloca i1, i1 0
+  %nop11980 = alloca i1, i1 0
+  %nop11981 = alloca i1, i1 0
+  %nop11982 = alloca i1, i1 0
+  %nop11983 = alloca i1, i1 0
+  %nop11984 = alloca i1, i1 0
+  %nop11985 = alloca i1, i1 0
+  %nop11986 = alloca i1, i1 0
+  %nop11987 = alloca i1, i1 0
+  %nop11988 = alloca i1, i1 0
+  %nop11989 = alloca i1, i1 0
+  %nop11990 = alloca i1, i1 0
+  %nop11991 = alloca i1, i1 0
+  %nop11992 = alloca i1, i1 0
+  %nop11993 = alloca i1, i1 0
+  %nop11994 = alloca i1, i1 0
+  %nop11995 = alloca i1, i1 0
+  %nop11996 = alloca i1, i1 0
+  %nop11997 = alloca i1, i1 0
+  %nop11998 = alloca i1, i1 0
+  %nop11999 = alloca i1, i1 0
+  %nop12000 = alloca i1, i1 0
+  %nop12001 = alloca i1, i1 0
+  %nop12002 = alloca i1, i1 0
+  %nop12003 = alloca i1, i1 0
+  %nop12004 = alloca i1, i1 0
+  %nop12005 = alloca i1, i1 0
+  %nop12006 = alloca i1, i1 0
+  %nop12007 = alloca i1, i1 0
+  %nop12008 = alloca i1, i1 0
+  %nop12009 = alloca i1, i1 0
+  %nop12010 = alloca i1, i1 0
+  %nop12011 = alloca i1, i1 0
+  %nop12012 = alloca i1, i1 0
+  %nop12013 = alloca i1, i1 0
+  %nop12014 = alloca i1, i1 0
+  %nop12015 = alloca i1, i1 0
+  %nop12016 = alloca i1, i1 0
+  %nop12017 = alloca i1, i1 0
+  %nop12018 = alloca i1, i1 0
+  %nop12019 = alloca i1, i1 0
+  %nop12020 = alloca i1, i1 0
+  %nop12021 = alloca i1, i1 0
+  %nop12022 = alloca i1, i1 0
+  %nop12023 = alloca i1, i1 0
+  %nop12024 = alloca i1, i1 0
+  %nop12025 = alloca i1, i1 0
+  %nop12026 = alloca i1, i1 0
+  %nop12027 = alloca i1, i1 0
+  %nop12028 = alloca i1, i1 0
+  %nop12029 = alloca i1, i1 0
+  %nop12030 = alloca i1, i1 0
+  %nop12031 = alloca i1, i1 0
+  %nop12032 = alloca i1, i1 0
+  %nop12033 = alloca i1, i1 0
+  %nop12034 = alloca i1, i1 0
+  %nop12035 = alloca i1, i1 0
+  %nop12036 = alloca i1, i1 0
+  %nop12037 = alloca i1, i1 0
+  %nop12038 = alloca i1, i1 0
+  %nop12039 = alloca i1, i1 0
+  %nop12040 = alloca i1, i1 0
+  %nop12041 = alloca i1, i1 0
+  %nop12042 = alloca i1, i1 0
+  %nop12043 = alloca i1, i1 0
+  %nop12044 = alloca i1, i1 0
+  %nop12045 = alloca i1, i1 0
+  %nop12046 = alloca i1, i1 0
+  %nop12047 = alloca i1, i1 0
+  %nop12048 = alloca i1, i1 0
+  %nop12049 = alloca i1, i1 0
+  %nop12050 = alloca i1, i1 0
+  %nop12051 = alloca i1, i1 0
+  %nop12052 = alloca i1, i1 0
+  %nop12053 = alloca i1, i1 0
+  %nop12054 = alloca i1, i1 0
+  %nop12055 = alloca i1, i1 0
+  %nop12056 = alloca i1, i1 0
+  %nop12057 = alloca i1, i1 0
+  %nop12058 = alloca i1, i1 0
+  %nop12059 = alloca i1, i1 0
+  %nop12060 = alloca i1, i1 0
+  %nop12061 = alloca i1, i1 0
+  %nop12062 = alloca i1, i1 0
+  %nop12063 = alloca i1, i1 0
+  %nop12064 = alloca i1, i1 0
+  %nop12065 = alloca i1, i1 0
+  %nop12066 = alloca i1, i1 0
+  %nop12067 = alloca i1, i1 0
+  %nop12068 = alloca i1, i1 0
+  %nop12069 = alloca i1, i1 0
+  %nop12070 = alloca i1, i1 0
+  %nop12071 = alloca i1, i1 0
+  %nop12072 = alloca i1, i1 0
+  %nop12073 = alloca i1, i1 0
+  %nop12074 = alloca i1, i1 0
+  %nop12075 = alloca i1, i1 0
+  %nop12076 = alloca i1, i1 0
+  %nop12077 = alloca i1, i1 0
+  %nop12078 = alloca i1, i1 0
+  %nop12079 = alloca i1, i1 0
+  %nop12080 = alloca i1, i1 0
+  %nop12081 = alloca i1, i1 0
+  %nop12082 = alloca i1, i1 0
+  %nop12083 = alloca i1, i1 0
+  %nop12084 = alloca i1, i1 0
+  %nop12085 = alloca i1, i1 0
+  %nop12086 = alloca i1, i1 0
+  %nop12087 = alloca i1, i1 0
+  %nop12088 = alloca i1, i1 0
+  %nop12089 = alloca i1, i1 0
+  %nop12090 = alloca i1, i1 0
+  %nop12091 = alloca i1, i1 0
+  %nop12092 = alloca i1, i1 0
+  %nop12093 = alloca i1, i1 0
+  %nop12094 = alloca i1, i1 0
+  %nop12095 = alloca i1, i1 0
+  %nop12096 = alloca i1, i1 0
+  %nop12097 = alloca i1, i1 0
+  %nop12098 = alloca i1, i1 0
+  %nop12099 = alloca i1, i1 0
+  %nop12100 = alloca i1, i1 0
+  %nop12101 = alloca i1, i1 0
+  %nop12102 = alloca i1, i1 0
+  %nop12103 = alloca i1, i1 0
+  %nop12104 = alloca i1, i1 0
+  %nop12105 = alloca i1, i1 0
+  %nop12106 = alloca i1, i1 0
+  %nop12107 = alloca i1, i1 0
+  %nop12108 = alloca i1, i1 0
+  %nop12109 = alloca i1, i1 0
+  %nop12110 = alloca i1, i1 0
+  %nop12111 = alloca i1, i1 0
+  %nop12112 = alloca i1, i1 0
+  %nop12113 = alloca i1, i1 0
+  %nop12114 = alloca i1, i1 0
+  %nop12115 = alloca i1, i1 0
+  %nop12116 = alloca i1, i1 0
+  %nop12117 = alloca i1, i1 0
+  %nop12118 = alloca i1, i1 0
+  %nop12119 = alloca i1, i1 0
+  %nop12120 = alloca i1, i1 0
+  %nop12121 = alloca i1, i1 0
+  %nop12122 = alloca i1, i1 0
+  %nop12123 = alloca i1, i1 0
+  %nop12124 = alloca i1, i1 0
+  %nop12125 = alloca i1, i1 0
+  %nop12126 = alloca i1, i1 0
+  %nop12127 = alloca i1, i1 0
+  %nop12128 = alloca i1, i1 0
+  %nop12129 = alloca i1, i1 0
+  %nop12130 = alloca i1, i1 0
+  %nop12131 = alloca i1, i1 0
+  %nop12132 = alloca i1, i1 0
+  %nop12133 = alloca i1, i1 0
+  %nop12134 = alloca i1, i1 0
+  %nop12135 = alloca i1, i1 0
+  %nop12136 = alloca i1, i1 0
+  %nop12137 = alloca i1, i1 0
+  %nop12138 = alloca i1, i1 0
+  %nop12139 = alloca i1, i1 0
+  %nop12140 = alloca i1, i1 0
+  %nop12141 = alloca i1, i1 0
+  %nop12142 = alloca i1, i1 0
+  %nop12143 = alloca i1, i1 0
+  %nop12144 = alloca i1, i1 0
+  %nop12145 = alloca i1, i1 0
+  %nop12146 = alloca i1, i1 0
+  %nop12147 = alloca i1, i1 0
+  %nop12148 = alloca i1, i1 0
+  %nop12149 = alloca i1, i1 0
+  %nop12150 = alloca i1, i1 0
+  %nop12151 = alloca i1, i1 0
+  %nop12152 = alloca i1, i1 0
+  %nop12153 = alloca i1, i1 0
+  %nop12154 = alloca i1, i1 0
+  %nop12155 = alloca i1, i1 0
+  %nop12156 = alloca i1, i1 0
+  %nop12157 = alloca i1, i1 0
+  %nop12158 = alloca i1, i1 0
+  %nop12159 = alloca i1, i1 0
+  %nop12160 = alloca i1, i1 0
+  %nop12161 = alloca i1, i1 0
+  %nop12162 = alloca i1, i1 0
+  %nop12163 = alloca i1, i1 0
+  %nop12164 = alloca i1, i1 0
+  %nop12165 = alloca i1, i1 0
+  %nop12166 = alloca i1, i1 0
+  %nop12167 = alloca i1, i1 0
+  %nop12168 = alloca i1, i1 0
+  %nop12169 = alloca i1, i1 0
+  %nop12170 = alloca i1, i1 0
+  %nop12171 = alloca i1, i1 0
+  %nop12172 = alloca i1, i1 0
+  %nop12173 = alloca i1, i1 0
+  %nop12174 = alloca i1, i1 0
+  %nop12175 = alloca i1, i1 0
+  %nop12176 = alloca i1, i1 0
+  %nop12177 = alloca i1, i1 0
+  %nop12178 = alloca i1, i1 0
+  %nop12179 = alloca i1, i1 0
+  %nop12180 = alloca i1, i1 0
+  %nop12181 = alloca i1, i1 0
+  %nop12182 = alloca i1, i1 0
+  %nop12183 = alloca i1, i1 0
+  %nop12184 = alloca i1, i1 0
+  %nop12185 = alloca i1, i1 0
+  %nop12186 = alloca i1, i1 0
+  %nop12187 = alloca i1, i1 0
+  %nop12188 = alloca i1, i1 0
+  %nop12189 = alloca i1, i1 0
+  %nop12190 = alloca i1, i1 0
+  %nop12191 = alloca i1, i1 0
+  %nop12192 = alloca i1, i1 0
+  %nop12193 = alloca i1, i1 0
+  %nop12194 = alloca i1, i1 0
+  %nop12195 = alloca i1, i1 0
+  %nop12196 = alloca i1, i1 0
+  %nop12197 = alloca i1, i1 0
+  %nop12198 = alloca i1, i1 0
+  %nop12199 = alloca i1, i1 0
+  %nop12200 = alloca i1, i1 0
+  %nop12201 = alloca i1, i1 0
+  %nop12202 = alloca i1, i1 0
+  %nop12203 = alloca i1, i1 0
+  %nop12204 = alloca i1, i1 0
+  %nop12205 = alloca i1, i1 0
+  %nop12206 = alloca i1, i1 0
+  %nop12207 = alloca i1, i1 0
+  %nop12208 = alloca i1, i1 0
+  %nop12209 = alloca i1, i1 0
+  %nop12210 = alloca i1, i1 0
+  %nop12211 = alloca i1, i1 0
+  %nop12212 = alloca i1, i1 0
+  %nop12213 = alloca i1, i1 0
+  %nop12214 = alloca i1, i1 0
+  %nop12215 = alloca i1, i1 0
+  %nop12216 = alloca i1, i1 0
+  %nop12217 = alloca i1, i1 0
+  %nop12218 = alloca i1, i1 0
+  %nop12219 = alloca i1, i1 0
+  %nop12220 = alloca i1, i1 0
+  %nop12221 = alloca i1, i1 0
+  %nop12222 = alloca i1, i1 0
+  %nop12223 = alloca i1, i1 0
+  %nop12224 = alloca i1, i1 0
+  %nop12225 = alloca i1, i1 0
+  %nop12226 = alloca i1, i1 0
+  %nop12227 = alloca i1, i1 0
+  %nop12228 = alloca i1, i1 0
+  %nop12229 = alloca i1, i1 0
+  %nop12230 = alloca i1, i1 0
+  %nop12231 = alloca i1, i1 0
+  %nop12232 = alloca i1, i1 0
+  %nop12233 = alloca i1, i1 0
+  %nop12234 = alloca i1, i1 0
+  %nop12235 = alloca i1, i1 0
+  %nop12236 = alloca i1, i1 0
+  %nop12237 = alloca i1, i1 0
+  %nop12238 = alloca i1, i1 0
+  %nop12239 = alloca i1, i1 0
+  %nop12240 = alloca i1, i1 0
+  %nop12241 = alloca i1, i1 0
+  %nop12242 = alloca i1, i1 0
+  %nop12243 = alloca i1, i1 0
+  %nop12244 = alloca i1, i1 0
+  %nop12245 = alloca i1, i1 0
+  %nop12246 = alloca i1, i1 0
+  %nop12247 = alloca i1, i1 0
+  %nop12248 = alloca i1, i1 0
+  %nop12249 = alloca i1, i1 0
+  %nop12250 = alloca i1, i1 0
+  %nop12251 = alloca i1, i1 0
+  %nop12252 = alloca i1, i1 0
+  %nop12253 = alloca i1, i1 0
+  %nop12254 = alloca i1, i1 0
+  %nop12255 = alloca i1, i1 0
+  %nop12256 = alloca i1, i1 0
+  %nop12257 = alloca i1, i1 0
+  %nop12258 = alloca i1, i1 0
+  %nop12259 = alloca i1, i1 0
+  %nop12260 = alloca i1, i1 0
+  %nop12261 = alloca i1, i1 0
+  %nop12262 = alloca i1, i1 0
+  %nop12263 = alloca i1, i1 0
+  %nop12264 = alloca i1, i1 0
+  %nop12265 = alloca i1, i1 0
+  %nop12266 = alloca i1, i1 0
+  %nop12267 = alloca i1, i1 0
+  %nop12268 = alloca i1, i1 0
+  %nop12269 = alloca i1, i1 0
+  %nop12270 = alloca i1, i1 0
+  %nop12271 = alloca i1, i1 0
+  %nop12272 = alloca i1, i1 0
+  %nop12273 = alloca i1, i1 0
+  %nop12274 = alloca i1, i1 0
+  %nop12275 = alloca i1, i1 0
+  %nop12276 = alloca i1, i1 0
+  %nop12277 = alloca i1, i1 0
+  %nop12278 = alloca i1, i1 0
+  %nop12279 = alloca i1, i1 0
+  %nop12280 = alloca i1, i1 0
+  %nop12281 = alloca i1, i1 0
+  %nop12282 = alloca i1, i1 0
+  %nop12283 = alloca i1, i1 0
+  %nop12284 = alloca i1, i1 0
+  %nop12285 = alloca i1, i1 0
+  %nop12286 = alloca i1, i1 0
+  %nop12287 = alloca i1, i1 0
+  %nop12288 = alloca i1, i1 0
+  %nop12289 = alloca i1, i1 0
+  %nop12290 = alloca i1, i1 0
+  %nop12291 = alloca i1, i1 0
+  %nop12292 = alloca i1, i1 0
+  %nop12293 = alloca i1, i1 0
+  %nop12294 = alloca i1, i1 0
+  %nop12295 = alloca i1, i1 0
+  %nop12296 = alloca i1, i1 0
+  %nop12297 = alloca i1, i1 0
+  %nop12298 = alloca i1, i1 0
+  %nop12299 = alloca i1, i1 0
+  %nop12300 = alloca i1, i1 0
+  %nop12301 = alloca i1, i1 0
+  %nop12302 = alloca i1, i1 0
+  %nop12303 = alloca i1, i1 0
+  %nop12304 = alloca i1, i1 0
+  %nop12305 = alloca i1, i1 0
+  %nop12306 = alloca i1, i1 0
+  %nop12307 = alloca i1, i1 0
+  %nop12308 = alloca i1, i1 0
+  %nop12309 = alloca i1, i1 0
+  %nop12310 = alloca i1, i1 0
+  %nop12311 = alloca i1, i1 0
+  %nop12312 = alloca i1, i1 0
+  %nop12313 = alloca i1, i1 0
+  %nop12314 = alloca i1, i1 0
+  %nop12315 = alloca i1, i1 0
+  %nop12316 = alloca i1, i1 0
+  %nop12317 = alloca i1, i1 0
+  %nop12318 = alloca i1, i1 0
+  %nop12319 = alloca i1, i1 0
+  %nop12320 = alloca i1, i1 0
+  %nop12321 = alloca i1, i1 0
+  %nop12322 = alloca i1, i1 0
+  %nop12323 = alloca i1, i1 0
+  %nop12324 = alloca i1, i1 0
+  %nop12325 = alloca i1, i1 0
+  %nop12326 = alloca i1, i1 0
+  %nop12327 = alloca i1, i1 0
+  %nop12328 = alloca i1, i1 0
+  %nop12329 = alloca i1, i1 0
+  %nop12330 = alloca i1, i1 0
+  %nop12331 = alloca i1, i1 0
+  %nop12332 = alloca i1, i1 0
+  %nop12333 = alloca i1, i1 0
+  %nop12334 = alloca i1, i1 0
+  %nop12335 = alloca i1, i1 0
+  %nop12336 = alloca i1, i1 0
+  %nop12337 = alloca i1, i1 0
+  %nop12338 = alloca i1, i1 0
+  %nop12339 = alloca i1, i1 0
+  %nop12340 = alloca i1, i1 0
+  %nop12341 = alloca i1, i1 0
+  %nop12342 = alloca i1, i1 0
+  %nop12343 = alloca i1, i1 0
+  %nop12344 = alloca i1, i1 0
+  %nop12345 = alloca i1, i1 0
+  %nop12346 = alloca i1, i1 0
+  %nop12347 = alloca i1, i1 0
+  %nop12348 = alloca i1, i1 0
+  %nop12349 = alloca i1, i1 0
+  %nop12350 = alloca i1, i1 0
+  %nop12351 = alloca i1, i1 0
+  %nop12352 = alloca i1, i1 0
+  %nop12353 = alloca i1, i1 0
+  %nop12354 = alloca i1, i1 0
+  %nop12355 = alloca i1, i1 0
+  %nop12356 = alloca i1, i1 0
+  %nop12357 = alloca i1, i1 0
+  %nop12358 = alloca i1, i1 0
+  %nop12359 = alloca i1, i1 0
+  %nop12360 = alloca i1, i1 0
+  %nop12361 = alloca i1, i1 0
+  %nop12362 = alloca i1, i1 0
+  %nop12363 = alloca i1, i1 0
+  %nop12364 = alloca i1, i1 0
+  %nop12365 = alloca i1, i1 0
+  %nop12366 = alloca i1, i1 0
+  %nop12367 = alloca i1, i1 0
+  %nop12368 = alloca i1, i1 0
+  %nop12369 = alloca i1, i1 0
+  %nop12370 = alloca i1, i1 0
+  %nop12371 = alloca i1, i1 0
+  %nop12372 = alloca i1, i1 0
+  %nop12373 = alloca i1, i1 0
+  %nop12374 = alloca i1, i1 0
+  %nop12375 = alloca i1, i1 0
+  %nop12376 = alloca i1, i1 0
+  %nop12377 = alloca i1, i1 0
+  %nop12378 = alloca i1, i1 0
+  %nop12379 = alloca i1, i1 0
+  %nop12380 = alloca i1, i1 0
+  %nop12381 = alloca i1, i1 0
+  %nop12382 = alloca i1, i1 0
+  %nop12383 = alloca i1, i1 0
+  %nop12384 = alloca i1, i1 0
+  %nop12385 = alloca i1, i1 0
+  %nop12386 = alloca i1, i1 0
+  %nop12387 = alloca i1, i1 0
+  %nop12388 = alloca i1, i1 0
+  %nop12389 = alloca i1, i1 0
+  %nop12390 = alloca i1, i1 0
+  %nop12391 = alloca i1, i1 0
+  %nop12392 = alloca i1, i1 0
+  %nop12393 = alloca i1, i1 0
+  %nop12394 = alloca i1, i1 0
+  %nop12395 = alloca i1, i1 0
+  %nop12396 = alloca i1, i1 0
+  %nop12397 = alloca i1, i1 0
+  %nop12398 = alloca i1, i1 0
+  %nop12399 = alloca i1, i1 0
+  %nop12400 = alloca i1, i1 0
+  %nop12401 = alloca i1, i1 0
+  %nop12402 = alloca i1, i1 0
+  %nop12403 = alloca i1, i1 0
+  %nop12404 = alloca i1, i1 0
+  %nop12405 = alloca i1, i1 0
+  %nop12406 = alloca i1, i1 0
+  %nop12407 = alloca i1, i1 0
+  %nop12408 = alloca i1, i1 0
+  %nop12409 = alloca i1, i1 0
+  %nop12410 = alloca i1, i1 0
+  %nop12411 = alloca i1, i1 0
+  %nop12412 = alloca i1, i1 0
+  %nop12413 = alloca i1, i1 0
+  %nop12414 = alloca i1, i1 0
+  %nop12415 = alloca i1, i1 0
+  %nop12416 = alloca i1, i1 0
+  %nop12417 = alloca i1, i1 0
+  %nop12418 = alloca i1, i1 0
+  %nop12419 = alloca i1, i1 0
+  %nop12420 = alloca i1, i1 0
+  %nop12421 = alloca i1, i1 0
+  %nop12422 = alloca i1, i1 0
+  %nop12423 = alloca i1, i1 0
+  %nop12424 = alloca i1, i1 0
+  %nop12425 = alloca i1, i1 0
+  %nop12426 = alloca i1, i1 0
+  %nop12427 = alloca i1, i1 0
+  %nop12428 = alloca i1, i1 0
+  %nop12429 = alloca i1, i1 0
+  %nop12430 = alloca i1, i1 0
+  %nop12431 = alloca i1, i1 0
+  %nop12432 = alloca i1, i1 0
+  %nop12433 = alloca i1, i1 0
+  %nop12434 = alloca i1, i1 0
+  %nop12435 = alloca i1, i1 0
+  %nop12436 = alloca i1, i1 0
+  %nop12437 = alloca i1, i1 0
+  %nop12438 = alloca i1, i1 0
+  %nop12439 = alloca i1, i1 0
+  %nop12440 = alloca i1, i1 0
+  %nop12441 = alloca i1, i1 0
+  %nop12442 = alloca i1, i1 0
+  %nop12443 = alloca i1, i1 0
+  %nop12444 = alloca i1, i1 0
+  %nop12445 = alloca i1, i1 0
+  %nop12446 = alloca i1, i1 0
+  %nop12447 = alloca i1, i1 0
+  %nop12448 = alloca i1, i1 0
+  %nop12449 = alloca i1, i1 0
+  %nop12450 = alloca i1, i1 0
+  %nop12451 = alloca i1, i1 0
+  %nop12452 = alloca i1, i1 0
+  %nop12453 = alloca i1, i1 0
+  %nop12454 = alloca i1, i1 0
+  %nop12455 = alloca i1, i1 0
+  %nop12456 = alloca i1, i1 0
+  %nop12457 = alloca i1, i1 0
+  %nop12458 = alloca i1, i1 0
+  %nop12459 = alloca i1, i1 0
+  %nop12460 = alloca i1, i1 0
+  %nop12461 = alloca i1, i1 0
+  %nop12462 = alloca i1, i1 0
+  %nop12463 = alloca i1, i1 0
+  %nop12464 = alloca i1, i1 0
+  %nop12465 = alloca i1, i1 0
+  %nop12466 = alloca i1, i1 0
+  %nop12467 = alloca i1, i1 0
+  %nop12468 = alloca i1, i1 0
+  %nop12469 = alloca i1, i1 0
+  %nop12470 = alloca i1, i1 0
+  %nop12471 = alloca i1, i1 0
+  %nop12472 = alloca i1, i1 0
+  %nop12473 = alloca i1, i1 0
+  %nop12474 = alloca i1, i1 0
+  %nop12475 = alloca i1, i1 0
+  %nop12476 = alloca i1, i1 0
+  %nop12477 = alloca i1, i1 0
+  %nop12478 = alloca i1, i1 0
+  %nop12479 = alloca i1, i1 0
+  %nop12480 = alloca i1, i1 0
+  %nop12481 = alloca i1, i1 0
+  %nop12482 = alloca i1, i1 0
+  %nop12483 = alloca i1, i1 0
+  %nop12484 = alloca i1, i1 0
+  %nop12485 = alloca i1, i1 0
+  %nop12486 = alloca i1, i1 0
+  %nop12487 = alloca i1, i1 0
+  %nop12488 = alloca i1, i1 0
+  %nop12489 = alloca i1, i1 0
+  %nop12490 = alloca i1, i1 0
+  %nop12491 = alloca i1, i1 0
+  %nop12492 = alloca i1, i1 0
+  %nop12493 = alloca i1, i1 0
+  %nop12494 = alloca i1, i1 0
+  %nop12495 = alloca i1, i1 0
+  %nop12496 = alloca i1, i1 0
+  %nop12497 = alloca i1, i1 0
+  %nop12498 = alloca i1, i1 0
+  %nop12499 = alloca i1, i1 0
+  %nop12500 = alloca i1, i1 0
+  %nop12501 = alloca i1, i1 0
+  %nop12502 = alloca i1, i1 0
+  %nop12503 = alloca i1, i1 0
+  %nop12504 = alloca i1, i1 0
+  %nop12505 = alloca i1, i1 0
+  %nop12506 = alloca i1, i1 0
+  %nop12507 = alloca i1, i1 0
+  %nop12508 = alloca i1, i1 0
+  %nop12509 = alloca i1, i1 0
+  %nop12510 = alloca i1, i1 0
+  %nop12511 = alloca i1, i1 0
+  %nop12512 = alloca i1, i1 0
+  %nop12513 = alloca i1, i1 0
+  %nop12514 = alloca i1, i1 0
+  %nop12515 = alloca i1, i1 0
+  %nop12516 = alloca i1, i1 0
+  %nop12517 = alloca i1, i1 0
+  %nop12518 = alloca i1, i1 0
+  %nop12519 = alloca i1, i1 0
+  %nop12520 = alloca i1, i1 0
+  %nop12521 = alloca i1, i1 0
+  %nop12522 = alloca i1, i1 0
+  %nop12523 = alloca i1, i1 0
+  %nop12524 = alloca i1, i1 0
+  %nop12525 = alloca i1, i1 0
+  %nop12526 = alloca i1, i1 0
+  %nop12527 = alloca i1, i1 0
+  %nop12528 = alloca i1, i1 0
+  %nop12529 = alloca i1, i1 0
+  %nop12530 = alloca i1, i1 0
+  %nop12531 = alloca i1, i1 0
+  %nop12532 = alloca i1, i1 0
+  %nop12533 = alloca i1, i1 0
+  %nop12534 = alloca i1, i1 0
+  %nop12535 = alloca i1, i1 0
+  %nop12536 = alloca i1, i1 0
+  %nop12537 = alloca i1, i1 0
+  %nop12538 = alloca i1, i1 0
+  %nop12539 = alloca i1, i1 0
+  %nop12540 = alloca i1, i1 0
+  %nop12541 = alloca i1, i1 0
+  %nop12542 = alloca i1, i1 0
+  %nop12543 = alloca i1, i1 0
+  %nop12544 = alloca i1, i1 0
+  %nop12545 = alloca i1, i1 0
+  %nop12546 = alloca i1, i1 0
+  %nop12547 = alloca i1, i1 0
+  %nop12548 = alloca i1, i1 0
+  %nop12549 = alloca i1, i1 0
+  %nop12550 = alloca i1, i1 0
+  %nop12551 = alloca i1, i1 0
+  %nop12552 = alloca i1, i1 0
+  %nop12553 = alloca i1, i1 0
+  %nop12554 = alloca i1, i1 0
+  %nop12555 = alloca i1, i1 0
+  %nop12556 = alloca i1, i1 0
+  %nop12557 = alloca i1, i1 0
+  %nop12558 = alloca i1, i1 0
+  %nop12559 = alloca i1, i1 0
+  %nop12560 = alloca i1, i1 0
+  %nop12561 = alloca i1, i1 0
+  %nop12562 = alloca i1, i1 0
+  %nop12563 = alloca i1, i1 0
+  %nop12564 = alloca i1, i1 0
+  %nop12565 = alloca i1, i1 0
+  %nop12566 = alloca i1, i1 0
+  %nop12567 = alloca i1, i1 0
+  %nop12568 = alloca i1, i1 0
+  %nop12569 = alloca i1, i1 0
+  %nop12570 = alloca i1, i1 0
+  %nop12571 = alloca i1, i1 0
+  %nop12572 = alloca i1, i1 0
+  %nop12573 = alloca i1, i1 0
+  %nop12574 = alloca i1, i1 0
+  %nop12575 = alloca i1, i1 0
+  %nop12576 = alloca i1, i1 0
+  %nop12577 = alloca i1, i1 0
+  %nop12578 = alloca i1, i1 0
+  %nop12579 = alloca i1, i1 0
+  %nop12580 = alloca i1, i1 0
+  %nop12581 = alloca i1, i1 0
+  %nop12582 = alloca i1, i1 0
+  %nop12583 = alloca i1, i1 0
+  %nop12584 = alloca i1, i1 0
+  %nop12585 = alloca i1, i1 0
+  %nop12586 = alloca i1, i1 0
+  %nop12587 = alloca i1, i1 0
+  %nop12588 = alloca i1, i1 0
+  %nop12589 = alloca i1, i1 0
+  %nop12590 = alloca i1, i1 0
+  %nop12591 = alloca i1, i1 0
+  %nop12592 = alloca i1, i1 0
+  %nop12593 = alloca i1, i1 0
+  %nop12594 = alloca i1, i1 0
+  %nop12595 = alloca i1, i1 0
+  %nop12596 = alloca i1, i1 0
+  %nop12597 = alloca i1, i1 0
+  %nop12598 = alloca i1, i1 0
+  %nop12599 = alloca i1, i1 0
+  %nop12600 = alloca i1, i1 0
+  %nop12601 = alloca i1, i1 0
+  %nop12602 = alloca i1, i1 0
+  %nop12603 = alloca i1, i1 0
+  %nop12604 = alloca i1, i1 0
+  %nop12605 = alloca i1, i1 0
+  %nop12606 = alloca i1, i1 0
+  %nop12607 = alloca i1, i1 0
+  %nop12608 = alloca i1, i1 0
+  %nop12609 = alloca i1, i1 0
+  %nop12610 = alloca i1, i1 0
+  %nop12611 = alloca i1, i1 0
+  %nop12612 = alloca i1, i1 0
+  %nop12613 = alloca i1, i1 0
+  %nop12614 = alloca i1, i1 0
+  %nop12615 = alloca i1, i1 0
+  %nop12616 = alloca i1, i1 0
+  %nop12617 = alloca i1, i1 0
+  %nop12618 = alloca i1, i1 0
+  %nop12619 = alloca i1, i1 0
+  %nop12620 = alloca i1, i1 0
+  %nop12621 = alloca i1, i1 0
+  %nop12622 = alloca i1, i1 0
+  %nop12623 = alloca i1, i1 0
+  %nop12624 = alloca i1, i1 0
+  %nop12625 = alloca i1, i1 0
+  %nop12626 = alloca i1, i1 0
+  %nop12627 = alloca i1, i1 0
+  %nop12628 = alloca i1, i1 0
+  %nop12629 = alloca i1, i1 0
+  %nop12630 = alloca i1, i1 0
+  %nop12631 = alloca i1, i1 0
+  %nop12632 = alloca i1, i1 0
+  %nop12633 = alloca i1, i1 0
+  %nop12634 = alloca i1, i1 0
+  %nop12635 = alloca i1, i1 0
+  %nop12636 = alloca i1, i1 0
+  %nop12637 = alloca i1, i1 0
+  %nop12638 = alloca i1, i1 0
+  %nop12639 = alloca i1, i1 0
+  %nop12640 = alloca i1, i1 0
+  %nop12641 = alloca i1, i1 0
+  %nop12642 = alloca i1, i1 0
+  %nop12643 = alloca i1, i1 0
+  %nop12644 = alloca i1, i1 0
+  %nop12645 = alloca i1, i1 0
+  %nop12646 = alloca i1, i1 0
+  %nop12647 = alloca i1, i1 0
+  %nop12648 = alloca i1, i1 0
+  %nop12649 = alloca i1, i1 0
+  %nop12650 = alloca i1, i1 0
+  %nop12651 = alloca i1, i1 0
+  %nop12652 = alloca i1, i1 0
+  %nop12653 = alloca i1, i1 0
+  %nop12654 = alloca i1, i1 0
+  %nop12655 = alloca i1, i1 0
+  %nop12656 = alloca i1, i1 0
+  %nop12657 = alloca i1, i1 0
+  %nop12658 = alloca i1, i1 0
+  %nop12659 = alloca i1, i1 0
+  %nop12660 = alloca i1, i1 0
+  %nop12661 = alloca i1, i1 0
+  %nop12662 = alloca i1, i1 0
+  %nop12663 = alloca i1, i1 0
+  %nop12664 = alloca i1, i1 0
+  %nop12665 = alloca i1, i1 0
+  %nop12666 = alloca i1, i1 0
+  %nop12667 = alloca i1, i1 0
+  %nop12668 = alloca i1, i1 0
+  %nop12669 = alloca i1, i1 0
+  %nop12670 = alloca i1, i1 0
+  %nop12671 = alloca i1, i1 0
+  %nop12672 = alloca i1, i1 0
+  %nop12673 = alloca i1, i1 0
+  %nop12674 = alloca i1, i1 0
+  %nop12675 = alloca i1, i1 0
+  %nop12676 = alloca i1, i1 0
+  %nop12677 = alloca i1, i1 0
+  %nop12678 = alloca i1, i1 0
+  %nop12679 = alloca i1, i1 0
+  %nop12680 = alloca i1, i1 0
+  %nop12681 = alloca i1, i1 0
+  %nop12682 = alloca i1, i1 0
+  %nop12683 = alloca i1, i1 0
+  %nop12684 = alloca i1, i1 0
+  %nop12685 = alloca i1, i1 0
+  %nop12686 = alloca i1, i1 0
+  %nop12687 = alloca i1, i1 0
+  %nop12688 = alloca i1, i1 0
+  %nop12689 = alloca i1, i1 0
+  %nop12690 = alloca i1, i1 0
+  %nop12691 = alloca i1, i1 0
+  %nop12692 = alloca i1, i1 0
+  %nop12693 = alloca i1, i1 0
+  %nop12694 = alloca i1, i1 0
+  %nop12695 = alloca i1, i1 0
+  %nop12696 = alloca i1, i1 0
+  %nop12697 = alloca i1, i1 0
+  %nop12698 = alloca i1, i1 0
+  %nop12699 = alloca i1, i1 0
+  %nop12700 = alloca i1, i1 0
+  %nop12701 = alloca i1, i1 0
+  %nop12702 = alloca i1, i1 0
+  %nop12703 = alloca i1, i1 0
+  %nop12704 = alloca i1, i1 0
+  %nop12705 = alloca i1, i1 0
+  %nop12706 = alloca i1, i1 0
+  %nop12707 = alloca i1, i1 0
+  %nop12708 = alloca i1, i1 0
+  %nop12709 = alloca i1, i1 0
+  %nop12710 = alloca i1, i1 0
+  %nop12711 = alloca i1, i1 0
+  %nop12712 = alloca i1, i1 0
+  %nop12713 = alloca i1, i1 0
+  %nop12714 = alloca i1, i1 0
+  %nop12715 = alloca i1, i1 0
+  %nop12716 = alloca i1, i1 0
+  %nop12717 = alloca i1, i1 0
+  %nop12718 = alloca i1, i1 0
+  %nop12719 = alloca i1, i1 0
+  %nop12720 = alloca i1, i1 0
+  %nop12721 = alloca i1, i1 0
+  %nop12722 = alloca i1, i1 0
+  %nop12723 = alloca i1, i1 0
+  %nop12724 = alloca i1, i1 0
+  %nop12725 = alloca i1, i1 0
+  %nop12726 = alloca i1, i1 0
+  %nop12727 = alloca i1, i1 0
+  %nop12728 = alloca i1, i1 0
+  %nop12729 = alloca i1, i1 0
+  %nop12730 = alloca i1, i1 0
+  %nop12731 = alloca i1, i1 0
+  %nop12732 = alloca i1, i1 0
+  %nop12733 = alloca i1, i1 0
+  %nop12734 = alloca i1, i1 0
+  %nop12735 = alloca i1, i1 0
+  %nop12736 = alloca i1, i1 0
+  %nop12737 = alloca i1, i1 0
+  %nop12738 = alloca i1, i1 0
+  %nop12739 = alloca i1, i1 0
+  %nop12740 = alloca i1, i1 0
+  %nop12741 = alloca i1, i1 0
+  %nop12742 = alloca i1, i1 0
+  %nop12743 = alloca i1, i1 0
+  %nop12744 = alloca i1, i1 0
+  %nop12745 = alloca i1, i1 0
+  %nop12746 = alloca i1, i1 0
+  %nop12747 = alloca i1, i1 0
+  %nop12748 = alloca i1, i1 0
+  %nop12749 = alloca i1, i1 0
+  %nop12750 = alloca i1, i1 0
+  %nop12751 = alloca i1, i1 0
+  %nop12752 = alloca i1, i1 0
+  %nop12753 = alloca i1, i1 0
+  %nop12754 = alloca i1, i1 0
+  %nop12755 = alloca i1, i1 0
+  %nop12756 = alloca i1, i1 0
+  %nop12757 = alloca i1, i1 0
+  %nop12758 = alloca i1, i1 0
+  %nop12759 = alloca i1, i1 0
+  %nop12760 = alloca i1, i1 0
+  %nop12761 = alloca i1, i1 0
+  %nop12762 = alloca i1, i1 0
+  %nop12763 = alloca i1, i1 0
+  %nop12764 = alloca i1, i1 0
+  %nop12765 = alloca i1, i1 0
+  %nop12766 = alloca i1, i1 0
+  %nop12767 = alloca i1, i1 0
+  %nop12768 = alloca i1, i1 0
+  %nop12769 = alloca i1, i1 0
+  %nop12770 = alloca i1, i1 0
+  %nop12771 = alloca i1, i1 0
+  %nop12772 = alloca i1, i1 0
+  %nop12773 = alloca i1, i1 0
+  %nop12774 = alloca i1, i1 0
+  %nop12775 = alloca i1, i1 0
+  %nop12776 = alloca i1, i1 0
+  %nop12777 = alloca i1, i1 0
+  %nop12778 = alloca i1, i1 0
+  %nop12779 = alloca i1, i1 0
+  %nop12780 = alloca i1, i1 0
+  %nop12781 = alloca i1, i1 0
+  %nop12782 = alloca i1, i1 0
+  %nop12783 = alloca i1, i1 0
+  %nop12784 = alloca i1, i1 0
+  %nop12785 = alloca i1, i1 0
+  %nop12786 = alloca i1, i1 0
+  %nop12787 = alloca i1, i1 0
+  %nop12788 = alloca i1, i1 0
+  %nop12789 = alloca i1, i1 0
+  %nop12790 = alloca i1, i1 0
+  %nop12791 = alloca i1, i1 0
+  %nop12792 = alloca i1, i1 0
+  %nop12793 = alloca i1, i1 0
+  %nop12794 = alloca i1, i1 0
+  %nop12795 = alloca i1, i1 0
+  %nop12796 = alloca i1, i1 0
+  %nop12797 = alloca i1, i1 0
+  %nop12798 = alloca i1, i1 0
+  %nop12799 = alloca i1, i1 0
+  %nop12800 = alloca i1, i1 0
+  %nop12801 = alloca i1, i1 0
+  %nop12802 = alloca i1, i1 0
+  %nop12803 = alloca i1, i1 0
+  %nop12804 = alloca i1, i1 0
+  %nop12805 = alloca i1, i1 0
+  %nop12806 = alloca i1, i1 0
+  %nop12807 = alloca i1, i1 0
+  %nop12808 = alloca i1, i1 0
+  %nop12809 = alloca i1, i1 0
+  %nop12810 = alloca i1, i1 0
+  %nop12811 = alloca i1, i1 0
+  %nop12812 = alloca i1, i1 0
+  %nop12813 = alloca i1, i1 0
+  %nop12814 = alloca i1, i1 0
+  %nop12815 = alloca i1, i1 0
+  %nop12816 = alloca i1, i1 0
+  %nop12817 = alloca i1, i1 0
+  %nop12818 = alloca i1, i1 0
+  %nop12819 = alloca i1, i1 0
+  %nop12820 = alloca i1, i1 0
+  %nop12821 = alloca i1, i1 0
+  %nop12822 = alloca i1, i1 0
+  %nop12823 = alloca i1, i1 0
+  %nop12824 = alloca i1, i1 0
+  %nop12825 = alloca i1, i1 0
+  %nop12826 = alloca i1, i1 0
+  %nop12827 = alloca i1, i1 0
+  %nop12828 = alloca i1, i1 0
+  %nop12829 = alloca i1, i1 0
+  %nop12830 = alloca i1, i1 0
+  %nop12831 = alloca i1, i1 0
+  %nop12832 = alloca i1, i1 0
+  %nop12833 = alloca i1, i1 0
+  %nop12834 = alloca i1, i1 0
+  %nop12835 = alloca i1, i1 0
+  %nop12836 = alloca i1, i1 0
+  %nop12837 = alloca i1, i1 0
+  %nop12838 = alloca i1, i1 0
+  %nop12839 = alloca i1, i1 0
+  %nop12840 = alloca i1, i1 0
+  %nop12841 = alloca i1, i1 0
+  %nop12842 = alloca i1, i1 0
+  %nop12843 = alloca i1, i1 0
+  %nop12844 = alloca i1, i1 0
+  %nop12845 = alloca i1, i1 0
+  %nop12846 = alloca i1, i1 0
+  %nop12847 = alloca i1, i1 0
+  %nop12848 = alloca i1, i1 0
+  %nop12849 = alloca i1, i1 0
+  %nop12850 = alloca i1, i1 0
+  %nop12851 = alloca i1, i1 0
+  %nop12852 = alloca i1, i1 0
+  %nop12853 = alloca i1, i1 0
+  %nop12854 = alloca i1, i1 0
+  %nop12855 = alloca i1, i1 0
+  %nop12856 = alloca i1, i1 0
+  %nop12857 = alloca i1, i1 0
+  %nop12858 = alloca i1, i1 0
+  %nop12859 = alloca i1, i1 0
+  %nop12860 = alloca i1, i1 0
+  %nop12861 = alloca i1, i1 0
+  %nop12862 = alloca i1, i1 0
+  %nop12863 = alloca i1, i1 0
+  %nop12864 = alloca i1, i1 0
+  %nop12865 = alloca i1, i1 0
+  %nop12866 = alloca i1, i1 0
+  %nop12867 = alloca i1, i1 0
+  %nop12868 = alloca i1, i1 0
+  %nop12869 = alloca i1, i1 0
+  %nop12870 = alloca i1, i1 0
+  %nop12871 = alloca i1, i1 0
+  %nop12872 = alloca i1, i1 0
+  %nop12873 = alloca i1, i1 0
+  %nop12874 = alloca i1, i1 0
+  %nop12875 = alloca i1, i1 0
+  %nop12876 = alloca i1, i1 0
+  %nop12877 = alloca i1, i1 0
+  %nop12878 = alloca i1, i1 0
+  %nop12879 = alloca i1, i1 0
+  %nop12880 = alloca i1, i1 0
+  %nop12881 = alloca i1, i1 0
+  %nop12882 = alloca i1, i1 0
+  %nop12883 = alloca i1, i1 0
+  %nop12884 = alloca i1, i1 0
+  %nop12885 = alloca i1, i1 0
+  %nop12886 = alloca i1, i1 0
+  %nop12887 = alloca i1, i1 0
+  %nop12888 = alloca i1, i1 0
+  %nop12889 = alloca i1, i1 0
+  %nop12890 = alloca i1, i1 0
+  %nop12891 = alloca i1, i1 0
+  %nop12892 = alloca i1, i1 0
+  %nop12893 = alloca i1, i1 0
+  %nop12894 = alloca i1, i1 0
+  %nop12895 = alloca i1, i1 0
+  %nop12896 = alloca i1, i1 0
+  %nop12897 = alloca i1, i1 0
+  %nop12898 = alloca i1, i1 0
+  %nop12899 = alloca i1, i1 0
+  %nop12900 = alloca i1, i1 0
+  %nop12901 = alloca i1, i1 0
+  %nop12902 = alloca i1, i1 0
+  %nop12903 = alloca i1, i1 0
+  %nop12904 = alloca i1, i1 0
+  %nop12905 = alloca i1, i1 0
+  %nop12906 = alloca i1, i1 0
+  %nop12907 = alloca i1, i1 0
+  %nop12908 = alloca i1, i1 0
+  %nop12909 = alloca i1, i1 0
+  %nop12910 = alloca i1, i1 0
+  %nop12911 = alloca i1, i1 0
+  %nop12912 = alloca i1, i1 0
+  %nop12913 = alloca i1, i1 0
+  %nop12914 = alloca i1, i1 0
+  %nop12915 = alloca i1, i1 0
+  %nop12916 = alloca i1, i1 0
+  %nop12917 = alloca i1, i1 0
+  %nop12918 = alloca i1, i1 0
+  %nop12919 = alloca i1, i1 0
+  %nop12920 = alloca i1, i1 0
+  %nop12921 = alloca i1, i1 0
+  %nop12922 = alloca i1, i1 0
+  %nop12923 = alloca i1, i1 0
+  %nop12924 = alloca i1, i1 0
+  %nop12925 = alloca i1, i1 0
+  %nop12926 = alloca i1, i1 0
+  %nop12927 = alloca i1, i1 0
+  %nop12928 = alloca i1, i1 0
+  %nop12929 = alloca i1, i1 0
+  %nop12930 = alloca i1, i1 0
+  %nop12931 = alloca i1, i1 0
+  %nop12932 = alloca i1, i1 0
+  %nop12933 = alloca i1, i1 0
+  %nop12934 = alloca i1, i1 0
+  %nop12935 = alloca i1, i1 0
+  %nop12936 = alloca i1, i1 0
+  %nop12937 = alloca i1, i1 0
+  %nop12938 = alloca i1, i1 0
+  %nop12939 = alloca i1, i1 0
+  %nop12940 = alloca i1, i1 0
+  %nop12941 = alloca i1, i1 0
+  %nop12942 = alloca i1, i1 0
+  %nop12943 = alloca i1, i1 0
+  %nop12944 = alloca i1, i1 0
+  %nop12945 = alloca i1, i1 0
+  %nop12946 = alloca i1, i1 0
+  %nop12947 = alloca i1, i1 0
+  %nop12948 = alloca i1, i1 0
+  %nop12949 = alloca i1, i1 0
+  %nop12950 = alloca i1, i1 0
+  %nop12951 = alloca i1, i1 0
+  %nop12952 = alloca i1, i1 0
+  %nop12953 = alloca i1, i1 0
+  %nop12954 = alloca i1, i1 0
+  %nop12955 = alloca i1, i1 0
+  %nop12956 = alloca i1, i1 0
+  %nop12957 = alloca i1, i1 0
+  %nop12958 = alloca i1, i1 0
+  %nop12959 = alloca i1, i1 0
+  %nop12960 = alloca i1, i1 0
+  %nop12961 = alloca i1, i1 0
+  %nop12962 = alloca i1, i1 0
+  %nop12963 = alloca i1, i1 0
+  %nop12964 = alloca i1, i1 0
+  %nop12965 = alloca i1, i1 0
+  %nop12966 = alloca i1, i1 0
+  %nop12967 = alloca i1, i1 0
+  %nop12968 = alloca i1, i1 0
+  %nop12969 = alloca i1, i1 0
+  %nop12970 = alloca i1, i1 0
+  %nop12971 = alloca i1, i1 0
+  %nop12972 = alloca i1, i1 0
+  %nop12973 = alloca i1, i1 0
+  %nop12974 = alloca i1, i1 0
+  %nop12975 = alloca i1, i1 0
+  %nop12976 = alloca i1, i1 0
+  %nop12977 = alloca i1, i1 0
+  %nop12978 = alloca i1, i1 0
+  %nop12979 = alloca i1, i1 0
+  %nop12980 = alloca i1, i1 0
+  %nop12981 = alloca i1, i1 0
+  %nop12982 = alloca i1, i1 0
+  %nop12983 = alloca i1, i1 0
+  %nop12984 = alloca i1, i1 0
+  %nop12985 = alloca i1, i1 0
+  %nop12986 = alloca i1, i1 0
+  %nop12987 = alloca i1, i1 0
+  %nop12988 = alloca i1, i1 0
+  %nop12989 = alloca i1, i1 0
+  %nop12990 = alloca i1, i1 0
+  %nop12991 = alloca i1, i1 0
+  %nop12992 = alloca i1, i1 0
+  %nop12993 = alloca i1, i1 0
+  %nop12994 = alloca i1, i1 0
+  %nop12995 = alloca i1, i1 0
+  %nop12996 = alloca i1, i1 0
+  %nop12997 = alloca i1, i1 0
+  %nop12998 = alloca i1, i1 0
+  %nop12999 = alloca i1, i1 0
+  %nop13000 = alloca i1, i1 0
+  %nop13001 = alloca i1, i1 0
+  %nop13002 = alloca i1, i1 0
+  %nop13003 = alloca i1, i1 0
+  %nop13004 = alloca i1, i1 0
+  %nop13005 = alloca i1, i1 0
+  %nop13006 = alloca i1, i1 0
+  %nop13007 = alloca i1, i1 0
+  %nop13008 = alloca i1, i1 0
+  %nop13009 = alloca i1, i1 0
+  %nop13010 = alloca i1, i1 0
+  %nop13011 = alloca i1, i1 0
+  %nop13012 = alloca i1, i1 0
+  %nop13013 = alloca i1, i1 0
+  %nop13014 = alloca i1, i1 0
+  %nop13015 = alloca i1, i1 0
+  %nop13016 = alloca i1, i1 0
+  %nop13017 = alloca i1, i1 0
+  %nop13018 = alloca i1, i1 0
+  %nop13019 = alloca i1, i1 0
+  %nop13020 = alloca i1, i1 0
+  %nop13021 = alloca i1, i1 0
+  %nop13022 = alloca i1, i1 0
+  %nop13023 = alloca i1, i1 0
+  %nop13024 = alloca i1, i1 0
+  %nop13025 = alloca i1, i1 0
+  %nop13026 = alloca i1, i1 0
+  %nop13027 = alloca i1, i1 0
+  %nop13028 = alloca i1, i1 0
+  %nop13029 = alloca i1, i1 0
+  %nop13030 = alloca i1, i1 0
+  %nop13031 = alloca i1, i1 0
+  %nop13032 = alloca i1, i1 0
+  %nop13033 = alloca i1, i1 0
+  %nop13034 = alloca i1, i1 0
+  %nop13035 = alloca i1, i1 0
+  %nop13036 = alloca i1, i1 0
+  %nop13037 = alloca i1, i1 0
+  %nop13038 = alloca i1, i1 0
+  %nop13039 = alloca i1, i1 0
+  %nop13040 = alloca i1, i1 0
+  %nop13041 = alloca i1, i1 0
+  %nop13042 = alloca i1, i1 0
+  %nop13043 = alloca i1, i1 0
+  %nop13044 = alloca i1, i1 0
+  %nop13045 = alloca i1, i1 0
+  %nop13046 = alloca i1, i1 0
+  %nop13047 = alloca i1, i1 0
+  %nop13048 = alloca i1, i1 0
+  %nop13049 = alloca i1, i1 0
+  %nop13050 = alloca i1, i1 0
+  %nop13051 = alloca i1, i1 0
+  %nop13052 = alloca i1, i1 0
+  %nop13053 = alloca i1, i1 0
+  %nop13054 = alloca i1, i1 0
+  %nop13055 = alloca i1, i1 0
+  %nop13056 = alloca i1, i1 0
+  %nop13057 = alloca i1, i1 0
+  %nop13058 = alloca i1, i1 0
+  %nop13059 = alloca i1, i1 0
+  %nop13060 = alloca i1, i1 0
+  %nop13061 = alloca i1, i1 0
+  %nop13062 = alloca i1, i1 0
+  %nop13063 = alloca i1, i1 0
+  %nop13064 = alloca i1, i1 0
+  %nop13065 = alloca i1, i1 0
+  %nop13066 = alloca i1, i1 0
+  %nop13067 = alloca i1, i1 0
+  %nop13068 = alloca i1, i1 0
+  %nop13069 = alloca i1, i1 0
+  %nop13070 = alloca i1, i1 0
+  %nop13071 = alloca i1, i1 0
+  %nop13072 = alloca i1, i1 0
+  %nop13073 = alloca i1, i1 0
+  %nop13074 = alloca i1, i1 0
+  %nop13075 = alloca i1, i1 0
+  %nop13076 = alloca i1, i1 0
+  %nop13077 = alloca i1, i1 0
+  %nop13078 = alloca i1, i1 0
+  %nop13079 = alloca i1, i1 0
+  %nop13080 = alloca i1, i1 0
+  %nop13081 = alloca i1, i1 0
+  %nop13082 = alloca i1, i1 0
+  %nop13083 = alloca i1, i1 0
+  %nop13084 = alloca i1, i1 0
+  %nop13085 = alloca i1, i1 0
+  %nop13086 = alloca i1, i1 0
+  %nop13087 = alloca i1, i1 0
+  %nop13088 = alloca i1, i1 0
+  %nop13089 = alloca i1, i1 0
+  %nop13090 = alloca i1, i1 0
+  %nop13091 = alloca i1, i1 0
+  %nop13092 = alloca i1, i1 0
+  %nop13093 = alloca i1, i1 0
+  %nop13094 = alloca i1, i1 0
+  %nop13095 = alloca i1, i1 0
+  %nop13096 = alloca i1, i1 0
+  %nop13097 = alloca i1, i1 0
+  %nop13098 = alloca i1, i1 0
+  %nop13099 = alloca i1, i1 0
+  %nop13100 = alloca i1, i1 0
+  %nop13101 = alloca i1, i1 0
+  %nop13102 = alloca i1, i1 0
+  %nop13103 = alloca i1, i1 0
+  %nop13104 = alloca i1, i1 0
+  %nop13105 = alloca i1, i1 0
+  %nop13106 = alloca i1, i1 0
+  %nop13107 = alloca i1, i1 0
+  %nop13108 = alloca i1, i1 0
+  %nop13109 = alloca i1, i1 0
+  %nop13110 = alloca i1, i1 0
+  %nop13111 = alloca i1, i1 0
+  %nop13112 = alloca i1, i1 0
+  %nop13113 = alloca i1, i1 0
+  %nop13114 = alloca i1, i1 0
+  %nop13115 = alloca i1, i1 0
+  %nop13116 = alloca i1, i1 0
+  %nop13117 = alloca i1, i1 0
+  %nop13118 = alloca i1, i1 0
+  %nop13119 = alloca i1, i1 0
+  %nop13120 = alloca i1, i1 0
+  %nop13121 = alloca i1, i1 0
+  %nop13122 = alloca i1, i1 0
+  %nop13123 = alloca i1, i1 0
+  %nop13124 = alloca i1, i1 0
+  %nop13125 = alloca i1, i1 0
+  %nop13126 = alloca i1, i1 0
+  %nop13127 = alloca i1, i1 0
+  %nop13128 = alloca i1, i1 0
+  %nop13129 = alloca i1, i1 0
+  %nop13130 = alloca i1, i1 0
+  %nop13131 = alloca i1, i1 0
+  %nop13132 = alloca i1, i1 0
+  %nop13133 = alloca i1, i1 0
+  %nop13134 = alloca i1, i1 0
+  %nop13135 = alloca i1, i1 0
+  %nop13136 = alloca i1, i1 0
+  %nop13137 = alloca i1, i1 0
+  %nop13138 = alloca i1, i1 0
+  %nop13139 = alloca i1, i1 0
+  %nop13140 = alloca i1, i1 0
+  %nop13141 = alloca i1, i1 0
+  %nop13142 = alloca i1, i1 0
+  %nop13143 = alloca i1, i1 0
+  %nop13144 = alloca i1, i1 0
+  %nop13145 = alloca i1, i1 0
+  %nop13146 = alloca i1, i1 0
+  %nop13147 = alloca i1, i1 0
+  %nop13148 = alloca i1, i1 0
+  %nop13149 = alloca i1, i1 0
+  %nop13150 = alloca i1, i1 0
+  %nop13151 = alloca i1, i1 0
+  %nop13152 = alloca i1, i1 0
+  %nop13153 = alloca i1, i1 0
+  %nop13154 = alloca i1, i1 0
+  %nop13155 = alloca i1, i1 0
+  %nop13156 = alloca i1, i1 0
+  %nop13157 = alloca i1, i1 0
+  %nop13158 = alloca i1, i1 0
+  %nop13159 = alloca i1, i1 0
+  %nop13160 = alloca i1, i1 0
+  %nop13161 = alloca i1, i1 0
+  %nop13162 = alloca i1, i1 0
+  %nop13163 = alloca i1, i1 0
+  %nop13164 = alloca i1, i1 0
+  %nop13165 = alloca i1, i1 0
+  %nop13166 = alloca i1, i1 0
+  %nop13167 = alloca i1, i1 0
+  %nop13168 = alloca i1, i1 0
+  %nop13169 = alloca i1, i1 0
+  %nop13170 = alloca i1, i1 0
+  %nop13171 = alloca i1, i1 0
+  %nop13172 = alloca i1, i1 0
+  %nop13173 = alloca i1, i1 0
+  %nop13174 = alloca i1, i1 0
+  %nop13175 = alloca i1, i1 0
+  %nop13176 = alloca i1, i1 0
+  %nop13177 = alloca i1, i1 0
+  %nop13178 = alloca i1, i1 0
+  %nop13179 = alloca i1, i1 0
+  %nop13180 = alloca i1, i1 0
+  %nop13181 = alloca i1, i1 0
+  %nop13182 = alloca i1, i1 0
+  %nop13183 = alloca i1, i1 0
+  %nop13184 = alloca i1, i1 0
+  %nop13185 = alloca i1, i1 0
+  %nop13186 = alloca i1, i1 0
+  %nop13187 = alloca i1, i1 0
+  %nop13188 = alloca i1, i1 0
+  %nop13189 = alloca i1, i1 0
+  %nop13190 = alloca i1, i1 0
+  %nop13191 = alloca i1, i1 0
+  %nop13192 = alloca i1, i1 0
+  %nop13193 = alloca i1, i1 0
+  %nop13194 = alloca i1, i1 0
+  %nop13195 = alloca i1, i1 0
+  %nop13196 = alloca i1, i1 0
+  %nop13197 = alloca i1, i1 0
+  %nop13198 = alloca i1, i1 0
+  %nop13199 = alloca i1, i1 0
+  %nop13200 = alloca i1, i1 0
+  %nop13201 = alloca i1, i1 0
+  %nop13202 = alloca i1, i1 0
+  %nop13203 = alloca i1, i1 0
+  %nop13204 = alloca i1, i1 0
+  %nop13205 = alloca i1, i1 0
+  %nop13206 = alloca i1, i1 0
+  %nop13207 = alloca i1, i1 0
+  %nop13208 = alloca i1, i1 0
+  %nop13209 = alloca i1, i1 0
+  %nop13210 = alloca i1, i1 0
+  %nop13211 = alloca i1, i1 0
+  %nop13212 = alloca i1, i1 0
+  %nop13213 = alloca i1, i1 0
+  %nop13214 = alloca i1, i1 0
+  %nop13215 = alloca i1, i1 0
+  %nop13216 = alloca i1, i1 0
+  %nop13217 = alloca i1, i1 0
+  %nop13218 = alloca i1, i1 0
+  %nop13219 = alloca i1, i1 0
+  %nop13220 = alloca i1, i1 0
+  %nop13221 = alloca i1, i1 0
+  %nop13222 = alloca i1, i1 0
+  %nop13223 = alloca i1, i1 0
+  %nop13224 = alloca i1, i1 0
+  %nop13225 = alloca i1, i1 0
+  %nop13226 = alloca i1, i1 0
+  %nop13227 = alloca i1, i1 0
+  %nop13228 = alloca i1, i1 0
+  %nop13229 = alloca i1, i1 0
+  %nop13230 = alloca i1, i1 0
+  %nop13231 = alloca i1, i1 0
+  %nop13232 = alloca i1, i1 0
+  %nop13233 = alloca i1, i1 0
+  %nop13234 = alloca i1, i1 0
+  %nop13235 = alloca i1, i1 0
+  %nop13236 = alloca i1, i1 0
+  %nop13237 = alloca i1, i1 0
+  %nop13238 = alloca i1, i1 0
+  %nop13239 = alloca i1, i1 0
+  %nop13240 = alloca i1, i1 0
+  %nop13241 = alloca i1, i1 0
+  %nop13242 = alloca i1, i1 0
+  %nop13243 = alloca i1, i1 0
+  %nop13244 = alloca i1, i1 0
+  %nop13245 = alloca i1, i1 0
+  %nop13246 = alloca i1, i1 0
+  %nop13247 = alloca i1, i1 0
+  %nop13248 = alloca i1, i1 0
+  %nop13249 = alloca i1, i1 0
+  %nop13250 = alloca i1, i1 0
+  %nop13251 = alloca i1, i1 0
+  %nop13252 = alloca i1, i1 0
+  %nop13253 = alloca i1, i1 0
+  %nop13254 = alloca i1, i1 0
+  %nop13255 = alloca i1, i1 0
+  %nop13256 = alloca i1, i1 0
+  %nop13257 = alloca i1, i1 0
+  %nop13258 = alloca i1, i1 0
+  %nop13259 = alloca i1, i1 0
+  %nop13260 = alloca i1, i1 0
+  %nop13261 = alloca i1, i1 0
+  %nop13262 = alloca i1, i1 0
+  %nop13263 = alloca i1, i1 0
+  %nop13264 = alloca i1, i1 0
+  %nop13265 = alloca i1, i1 0
+  %nop13266 = alloca i1, i1 0
+  %nop13267 = alloca i1, i1 0
+  %nop13268 = alloca i1, i1 0
+  %nop13269 = alloca i1, i1 0
+  %nop13270 = alloca i1, i1 0
+  %nop13271 = alloca i1, i1 0
+  %nop13272 = alloca i1, i1 0
+  %nop13273 = alloca i1, i1 0
+  %nop13274 = alloca i1, i1 0
+  %nop13275 = alloca i1, i1 0
+  %nop13276 = alloca i1, i1 0
+  %nop13277 = alloca i1, i1 0
+  %nop13278 = alloca i1, i1 0
+  %nop13279 = alloca i1, i1 0
+  %nop13280 = alloca i1, i1 0
+  %nop13281 = alloca i1, i1 0
+  %nop13282 = alloca i1, i1 0
+  %nop13283 = alloca i1, i1 0
+  %nop13284 = alloca i1, i1 0
+  %nop13285 = alloca i1, i1 0
+  %nop13286 = alloca i1, i1 0
+  %nop13287 = alloca i1, i1 0
+  %nop13288 = alloca i1, i1 0
+  %nop13289 = alloca i1, i1 0
+  %nop13290 = alloca i1, i1 0
+  %nop13291 = alloca i1, i1 0
+  %nop13292 = alloca i1, i1 0
+  %nop13293 = alloca i1, i1 0
+  %nop13294 = alloca i1, i1 0
+  %nop13295 = alloca i1, i1 0
+  %nop13296 = alloca i1, i1 0
+  %nop13297 = alloca i1, i1 0
+  %nop13298 = alloca i1, i1 0
+  %nop13299 = alloca i1, i1 0
+  %nop13300 = alloca i1, i1 0
+  %nop13301 = alloca i1, i1 0
+  %nop13302 = alloca i1, i1 0
+  %nop13303 = alloca i1, i1 0
+  %nop13304 = alloca i1, i1 0
+  %nop13305 = alloca i1, i1 0
+  %nop13306 = alloca i1, i1 0
+  %nop13307 = alloca i1, i1 0
+  %nop13308 = alloca i1, i1 0
+  %nop13309 = alloca i1, i1 0
+  %nop13310 = alloca i1, i1 0
+  %nop13311 = alloca i1, i1 0
+  %nop13312 = alloca i1, i1 0
+  %nop13313 = alloca i1, i1 0
+  %nop13314 = alloca i1, i1 0
+  %nop13315 = alloca i1, i1 0
+  %nop13316 = alloca i1, i1 0
+  %nop13317 = alloca i1, i1 0
+  %nop13318 = alloca i1, i1 0
+  %nop13319 = alloca i1, i1 0
+  %nop13320 = alloca i1, i1 0
+  %nop13321 = alloca i1, i1 0
+  %nop13322 = alloca i1, i1 0
+  %nop13323 = alloca i1, i1 0
+  %nop13324 = alloca i1, i1 0
+  %nop13325 = alloca i1, i1 0
+  %nop13326 = alloca i1, i1 0
+  %nop13327 = alloca i1, i1 0
+  %nop13328 = alloca i1, i1 0
+  %nop13329 = alloca i1, i1 0
+  %nop13330 = alloca i1, i1 0
+  %nop13331 = alloca i1, i1 0
+  %nop13332 = alloca i1, i1 0
+  %nop13333 = alloca i1, i1 0
+  %nop13334 = alloca i1, i1 0
+  %nop13335 = alloca i1, i1 0
+  %nop13336 = alloca i1, i1 0
+  %nop13337 = alloca i1, i1 0
+  %nop13338 = alloca i1, i1 0
+  %nop13339 = alloca i1, i1 0
+  %nop13340 = alloca i1, i1 0
+  %nop13341 = alloca i1, i1 0
+  %nop13342 = alloca i1, i1 0
+  %nop13343 = alloca i1, i1 0
+  %nop13344 = alloca i1, i1 0
+  %nop13345 = alloca i1, i1 0
+  %nop13346 = alloca i1, i1 0
+  %nop13347 = alloca i1, i1 0
+  %nop13348 = alloca i1, i1 0
+  %nop13349 = alloca i1, i1 0
+  %nop13350 = alloca i1, i1 0
+  %nop13351 = alloca i1, i1 0
+  %nop13352 = alloca i1, i1 0
+  %nop13353 = alloca i1, i1 0
+  %nop13354 = alloca i1, i1 0
+  %nop13355 = alloca i1, i1 0
+  %nop13356 = alloca i1, i1 0
+  %nop13357 = alloca i1, i1 0
+  %nop13358 = alloca i1, i1 0
+  %nop13359 = alloca i1, i1 0
+  %nop13360 = alloca i1, i1 0
+  %nop13361 = alloca i1, i1 0
+  %nop13362 = alloca i1, i1 0
+  %nop13363 = alloca i1, i1 0
+  %nop13364 = alloca i1, i1 0
+  %nop13365 = alloca i1, i1 0
+  %nop13366 = alloca i1, i1 0
+  %nop13367 = alloca i1, i1 0
+  %nop13368 = alloca i1, i1 0
+  %nop13369 = alloca i1, i1 0
+  %nop13370 = alloca i1, i1 0
+  %nop13371 = alloca i1, i1 0
+  %nop13372 = alloca i1, i1 0
+  %nop13373 = alloca i1, i1 0
+  %nop13374 = alloca i1, i1 0
+  %nop13375 = alloca i1, i1 0
+  %nop13376 = alloca i1, i1 0
+  %nop13377 = alloca i1, i1 0
+  %nop13378 = alloca i1, i1 0
+  %nop13379 = alloca i1, i1 0
+  %nop13380 = alloca i1, i1 0
+  %nop13381 = alloca i1, i1 0
+  %nop13382 = alloca i1, i1 0
+  %nop13383 = alloca i1, i1 0
+  %nop13384 = alloca i1, i1 0
+  %nop13385 = alloca i1, i1 0
+  %nop13386 = alloca i1, i1 0
+  %nop13387 = alloca i1, i1 0
+  %nop13388 = alloca i1, i1 0
+  %nop13389 = alloca i1, i1 0
+  %nop13390 = alloca i1, i1 0
+  %nop13391 = alloca i1, i1 0
+  %nop13392 = alloca i1, i1 0
+  %nop13393 = alloca i1, i1 0
+  %nop13394 = alloca i1, i1 0
+  %nop13395 = alloca i1, i1 0
+  %nop13396 = alloca i1, i1 0
+  %nop13397 = alloca i1, i1 0
+  %nop13398 = alloca i1, i1 0
+  %nop13399 = alloca i1, i1 0
+  %nop13400 = alloca i1, i1 0
+  %nop13401 = alloca i1, i1 0
+  %nop13402 = alloca i1, i1 0
+  %nop13403 = alloca i1, i1 0
+  %nop13404 = alloca i1, i1 0
+  %nop13405 = alloca i1, i1 0
+  %nop13406 = alloca i1, i1 0
+  %nop13407 = alloca i1, i1 0
+  %nop13408 = alloca i1, i1 0
+  %nop13409 = alloca i1, i1 0
+  %nop13410 = alloca i1, i1 0
+  %nop13411 = alloca i1, i1 0
+  %nop13412 = alloca i1, i1 0
+  %nop13413 = alloca i1, i1 0
+  %nop13414 = alloca i1, i1 0
+  %nop13415 = alloca i1, i1 0
+  %nop13416 = alloca i1, i1 0
+  %nop13417 = alloca i1, i1 0
+  %nop13418 = alloca i1, i1 0
+  %nop13419 = alloca i1, i1 0
+  %nop13420 = alloca i1, i1 0
+  %nop13421 = alloca i1, i1 0
+  %nop13422 = alloca i1, i1 0
+  %nop13423 = alloca i1, i1 0
+  %nop13424 = alloca i1, i1 0
+  %nop13425 = alloca i1, i1 0
+  %nop13426 = alloca i1, i1 0
+  %nop13427 = alloca i1, i1 0
+  %nop13428 = alloca i1, i1 0
+  %nop13429 = alloca i1, i1 0
+  %nop13430 = alloca i1, i1 0
+  %nop13431 = alloca i1, i1 0
+  %nop13432 = alloca i1, i1 0
+  %nop13433 = alloca i1, i1 0
+  %nop13434 = alloca i1, i1 0
+  %nop13435 = alloca i1, i1 0
+  %nop13436 = alloca i1, i1 0
+  %nop13437 = alloca i1, i1 0
+  %nop13438 = alloca i1, i1 0
+  %nop13439 = alloca i1, i1 0
+  %nop13440 = alloca i1, i1 0
+  %nop13441 = alloca i1, i1 0
+  %nop13442 = alloca i1, i1 0
+  %nop13443 = alloca i1, i1 0
+  %nop13444 = alloca i1, i1 0
+  %nop13445 = alloca i1, i1 0
+  %nop13446 = alloca i1, i1 0
+  %nop13447 = alloca i1, i1 0
+  %nop13448 = alloca i1, i1 0
+  %nop13449 = alloca i1, i1 0
+  %nop13450 = alloca i1, i1 0
+  %nop13451 = alloca i1, i1 0
+  %nop13452 = alloca i1, i1 0
+  %nop13453 = alloca i1, i1 0
+  %nop13454 = alloca i1, i1 0
+  %nop13455 = alloca i1, i1 0
+  %nop13456 = alloca i1, i1 0
+  %nop13457 = alloca i1, i1 0
+  %nop13458 = alloca i1, i1 0
+  %nop13459 = alloca i1, i1 0
+  %nop13460 = alloca i1, i1 0
+  %nop13461 = alloca i1, i1 0
+  %nop13462 = alloca i1, i1 0
+  %nop13463 = alloca i1, i1 0
+  %nop13464 = alloca i1, i1 0
+  %nop13465 = alloca i1, i1 0
+  %nop13466 = alloca i1, i1 0
+  %nop13467 = alloca i1, i1 0
+  %nop13468 = alloca i1, i1 0
+  %nop13469 = alloca i1, i1 0
+  %nop13470 = alloca i1, i1 0
+  %nop13471 = alloca i1, i1 0
+  %nop13472 = alloca i1, i1 0
+  %nop13473 = alloca i1, i1 0
+  %nop13474 = alloca i1, i1 0
+  %nop13475 = alloca i1, i1 0
+  %nop13476 = alloca i1, i1 0
+  %nop13477 = alloca i1, i1 0
+  %nop13478 = alloca i1, i1 0
+  %nop13479 = alloca i1, i1 0
+  %nop13480 = alloca i1, i1 0
+  %nop13481 = alloca i1, i1 0
+  %nop13482 = alloca i1, i1 0
+  %nop13483 = alloca i1, i1 0
+  %nop13484 = alloca i1, i1 0
+  %nop13485 = alloca i1, i1 0
+  %nop13486 = alloca i1, i1 0
+  %nop13487 = alloca i1, i1 0
+  %nop13488 = alloca i1, i1 0
+  %nop13489 = alloca i1, i1 0
+  %nop13490 = alloca i1, i1 0
+  %nop13491 = alloca i1, i1 0
+  %nop13492 = alloca i1, i1 0
+  %nop13493 = alloca i1, i1 0
+  %nop13494 = alloca i1, i1 0
+  %nop13495 = alloca i1, i1 0
+  %nop13496 = alloca i1, i1 0
+  %nop13497 = alloca i1, i1 0
+  %nop13498 = alloca i1, i1 0
+  %nop13499 = alloca i1, i1 0
+  %nop13500 = alloca i1, i1 0
+  %nop13501 = alloca i1, i1 0
+  %nop13502 = alloca i1, i1 0
+  %nop13503 = alloca i1, i1 0
+  %nop13504 = alloca i1, i1 0
+  %nop13505 = alloca i1, i1 0
+  %nop13506 = alloca i1, i1 0
+  %nop13507 = alloca i1, i1 0
+  %nop13508 = alloca i1, i1 0
+  %nop13509 = alloca i1, i1 0
+  %nop13510 = alloca i1, i1 0
+  %nop13511 = alloca i1, i1 0
+  %nop13512 = alloca i1, i1 0
+  %nop13513 = alloca i1, i1 0
+  %nop13514 = alloca i1, i1 0
+  %nop13515 = alloca i1, i1 0
+  %nop13516 = alloca i1, i1 0
+  %nop13517 = alloca i1, i1 0
+  %nop13518 = alloca i1, i1 0
+  %nop13519 = alloca i1, i1 0
+  %nop13520 = alloca i1, i1 0
+  %nop13521 = alloca i1, i1 0
+  %nop13522 = alloca i1, i1 0
+  %nop13523 = alloca i1, i1 0
+  %nop13524 = alloca i1, i1 0
+  %nop13525 = alloca i1, i1 0
+  %nop13526 = alloca i1, i1 0
+  %nop13527 = alloca i1, i1 0
+  %nop13528 = alloca i1, i1 0
+  %nop13529 = alloca i1, i1 0
+  %nop13530 = alloca i1, i1 0
+  %nop13531 = alloca i1, i1 0
+  %nop13532 = alloca i1, i1 0
+  %nop13533 = alloca i1, i1 0
+  %nop13534 = alloca i1, i1 0
+  %nop13535 = alloca i1, i1 0
+  %nop13536 = alloca i1, i1 0
+  %nop13537 = alloca i1, i1 0
+  %nop13538 = alloca i1, i1 0
+  %nop13539 = alloca i1, i1 0
+  %nop13540 = alloca i1, i1 0
+  %nop13541 = alloca i1, i1 0
+  %nop13542 = alloca i1, i1 0
+  %nop13543 = alloca i1, i1 0
+  %nop13544 = alloca i1, i1 0
+  %nop13545 = alloca i1, i1 0
+  %nop13546 = alloca i1, i1 0
+  %nop13547 = alloca i1, i1 0
+  %nop13548 = alloca i1, i1 0
+  %nop13549 = alloca i1, i1 0
+  %nop13550 = alloca i1, i1 0
+  %nop13551 = alloca i1, i1 0
+  %nop13552 = alloca i1, i1 0
+  %nop13553 = alloca i1, i1 0
+  %nop13554 = alloca i1, i1 0
+  %nop13555 = alloca i1, i1 0
+  %nop13556 = alloca i1, i1 0
+  %nop13557 = alloca i1, i1 0
+  %nop13558 = alloca i1, i1 0
+  %nop13559 = alloca i1, i1 0
+  %nop13560 = alloca i1, i1 0
+  %nop13561 = alloca i1, i1 0
+  %nop13562 = alloca i1, i1 0
+  %nop13563 = alloca i1, i1 0
+  %nop13564 = alloca i1, i1 0
+  %nop13565 = alloca i1, i1 0
+  %nop13566 = alloca i1, i1 0
+  %nop13567 = alloca i1, i1 0
+  %nop13568 = alloca i1, i1 0
+  %nop13569 = alloca i1, i1 0
+  %nop13570 = alloca i1, i1 0
+  %nop13571 = alloca i1, i1 0
+  %nop13572 = alloca i1, i1 0
+  %nop13573 = alloca i1, i1 0
+  %nop13574 = alloca i1, i1 0
+  %nop13575 = alloca i1, i1 0
+  %nop13576 = alloca i1, i1 0
+  %nop13577 = alloca i1, i1 0
+  %nop13578 = alloca i1, i1 0
+  %nop13579 = alloca i1, i1 0
+  %nop13580 = alloca i1, i1 0
+  %nop13581 = alloca i1, i1 0
+  %nop13582 = alloca i1, i1 0
+  %nop13583 = alloca i1, i1 0
+  %nop13584 = alloca i1, i1 0
+  %nop13585 = alloca i1, i1 0
+  %nop13586 = alloca i1, i1 0
+  %nop13587 = alloca i1, i1 0
+  %nop13588 = alloca i1, i1 0
+  %nop13589 = alloca i1, i1 0
+  %nop13590 = alloca i1, i1 0
+  %nop13591 = alloca i1, i1 0
+  %nop13592 = alloca i1, i1 0
+  %nop13593 = alloca i1, i1 0
+  %nop13594 = alloca i1, i1 0
+  %nop13595 = alloca i1, i1 0
+  %nop13596 = alloca i1, i1 0
+  %nop13597 = alloca i1, i1 0
+  %nop13598 = alloca i1, i1 0
+  %nop13599 = alloca i1, i1 0
+  %nop13600 = alloca i1, i1 0
+  %nop13601 = alloca i1, i1 0
+  %nop13602 = alloca i1, i1 0
+  %nop13603 = alloca i1, i1 0
+  %nop13604 = alloca i1, i1 0
+  %nop13605 = alloca i1, i1 0
+  %nop13606 = alloca i1, i1 0
+  %nop13607 = alloca i1, i1 0
+  %nop13608 = alloca i1, i1 0
+  %nop13609 = alloca i1, i1 0
+  %nop13610 = alloca i1, i1 0
+  %nop13611 = alloca i1, i1 0
+  %nop13612 = alloca i1, i1 0
+  %nop13613 = alloca i1, i1 0
+  %nop13614 = alloca i1, i1 0
+  %nop13615 = alloca i1, i1 0
+  %nop13616 = alloca i1, i1 0
+  %nop13617 = alloca i1, i1 0
+  %nop13618 = alloca i1, i1 0
+  %nop13619 = alloca i1, i1 0
+  %nop13620 = alloca i1, i1 0
+  %nop13621 = alloca i1, i1 0
+  %nop13622 = alloca i1, i1 0
+  %nop13623 = alloca i1, i1 0
+  %nop13624 = alloca i1, i1 0
+  %nop13625 = alloca i1, i1 0
+  %nop13626 = alloca i1, i1 0
+  %nop13627 = alloca i1, i1 0
+  %nop13628 = alloca i1, i1 0
+  %nop13629 = alloca i1, i1 0
+  %nop13630 = alloca i1, i1 0
+  %nop13631 = alloca i1, i1 0
+  %nop13632 = alloca i1, i1 0
+  %nop13633 = alloca i1, i1 0
+  %nop13634 = alloca i1, i1 0
+  %nop13635 = alloca i1, i1 0
+  %nop13636 = alloca i1, i1 0
+  %nop13637 = alloca i1, i1 0
+  %nop13638 = alloca i1, i1 0
+  %nop13639 = alloca i1, i1 0
+  %nop13640 = alloca i1, i1 0
+  %nop13641 = alloca i1, i1 0
+  %nop13642 = alloca i1, i1 0
+  %nop13643 = alloca i1, i1 0
+  %nop13644 = alloca i1, i1 0
+  %nop13645 = alloca i1, i1 0
+  %nop13646 = alloca i1, i1 0
+  %nop13647 = alloca i1, i1 0
+  %nop13648 = alloca i1, i1 0
+  %nop13649 = alloca i1, i1 0
+  %nop13650 = alloca i1, i1 0
+  %nop13651 = alloca i1, i1 0
+  %nop13652 = alloca i1, i1 0
+  %nop13653 = alloca i1, i1 0
+  %nop13654 = alloca i1, i1 0
+  %nop13655 = alloca i1, i1 0
+  %nop13656 = alloca i1, i1 0
+  %nop13657 = alloca i1, i1 0
+  %nop13658 = alloca i1, i1 0
+  %nop13659 = alloca i1, i1 0
+  %nop13660 = alloca i1, i1 0
+  %nop13661 = alloca i1, i1 0
+  %nop13662 = alloca i1, i1 0
+  %nop13663 = alloca i1, i1 0
+  %nop13664 = alloca i1, i1 0
+  %nop13665 = alloca i1, i1 0
+  %nop13666 = alloca i1, i1 0
+  %nop13667 = alloca i1, i1 0
+  %nop13668 = alloca i1, i1 0
+  %nop13669 = alloca i1, i1 0
+  %nop13670 = alloca i1, i1 0
+  %nop13671 = alloca i1, i1 0
+  %nop13672 = alloca i1, i1 0
+  %nop13673 = alloca i1, i1 0
+  %nop13674 = alloca i1, i1 0
+  %nop13675 = alloca i1, i1 0
+  %nop13676 = alloca i1, i1 0
+  %nop13677 = alloca i1, i1 0
+  %nop13678 = alloca i1, i1 0
+  %nop13679 = alloca i1, i1 0
+  %nop13680 = alloca i1, i1 0
+  %nop13681 = alloca i1, i1 0
+  %nop13682 = alloca i1, i1 0
+  %nop13683 = alloca i1, i1 0
+  %nop13684 = alloca i1, i1 0
+  %nop13685 = alloca i1, i1 0
+  %nop13686 = alloca i1, i1 0
+  %nop13687 = alloca i1, i1 0
+  %nop13688 = alloca i1, i1 0
+  %nop13689 = alloca i1, i1 0
+  %nop13690 = alloca i1, i1 0
+  %nop13691 = alloca i1, i1 0
+  %nop13692 = alloca i1, i1 0
+  %nop13693 = alloca i1, i1 0
+  %nop13694 = alloca i1, i1 0
+  %nop13695 = alloca i1, i1 0
+  %nop13696 = alloca i1, i1 0
+  %nop13697 = alloca i1, i1 0
+  %nop13698 = alloca i1, i1 0
+  %nop13699 = alloca i1, i1 0
+  %nop13700 = alloca i1, i1 0
+  %nop13701 = alloca i1, i1 0
+  %nop13702 = alloca i1, i1 0
+  %nop13703 = alloca i1, i1 0
+  %nop13704 = alloca i1, i1 0
+  %nop13705 = alloca i1, i1 0
+  %nop13706 = alloca i1, i1 0
+  %nop13707 = alloca i1, i1 0
+  %nop13708 = alloca i1, i1 0
+  %nop13709 = alloca i1, i1 0
+  %nop13710 = alloca i1, i1 0
+  %nop13711 = alloca i1, i1 0
+  %nop13712 = alloca i1, i1 0
+  %nop13713 = alloca i1, i1 0
+  %nop13714 = alloca i1, i1 0
+  %nop13715 = alloca i1, i1 0
+  %nop13716 = alloca i1, i1 0
+  %nop13717 = alloca i1, i1 0
+  %nop13718 = alloca i1, i1 0
+  %nop13719 = alloca i1, i1 0
+  %nop13720 = alloca i1, i1 0
+  %nop13721 = alloca i1, i1 0
+  %nop13722 = alloca i1, i1 0
+  %nop13723 = alloca i1, i1 0
+  %nop13724 = alloca i1, i1 0
+  %nop13725 = alloca i1, i1 0
+  %nop13726 = alloca i1, i1 0
+  %nop13727 = alloca i1, i1 0
+  %nop13728 = alloca i1, i1 0
+  %nop13729 = alloca i1, i1 0
+  %nop13730 = alloca i1, i1 0
+  %nop13731 = alloca i1, i1 0
+  %nop13732 = alloca i1, i1 0
+  %nop13733 = alloca i1, i1 0
+  %nop13734 = alloca i1, i1 0
+  %nop13735 = alloca i1, i1 0
+  %nop13736 = alloca i1, i1 0
+  %nop13737 = alloca i1, i1 0
+  %nop13738 = alloca i1, i1 0
+  %nop13739 = alloca i1, i1 0
+  %nop13740 = alloca i1, i1 0
+  %nop13741 = alloca i1, i1 0
+  %nop13742 = alloca i1, i1 0
+  %nop13743 = alloca i1, i1 0
+  %nop13744 = alloca i1, i1 0
+  %nop13745 = alloca i1, i1 0
+  %nop13746 = alloca i1, i1 0
+  %nop13747 = alloca i1, i1 0
+  %nop13748 = alloca i1, i1 0
+  %nop13749 = alloca i1, i1 0
+  %nop13750 = alloca i1, i1 0
+  %nop13751 = alloca i1, i1 0
+  %nop13752 = alloca i1, i1 0
+  %nop13753 = alloca i1, i1 0
+  %nop13754 = alloca i1, i1 0
+  %nop13755 = alloca i1, i1 0
+  %nop13756 = alloca i1, i1 0
+  %nop13757 = alloca i1, i1 0
+  %nop13758 = alloca i1, i1 0
+  %nop13759 = alloca i1, i1 0
+  %nop13760 = alloca i1, i1 0
+  %nop13761 = alloca i1, i1 0
+  %nop13762 = alloca i1, i1 0
+  %nop13763 = alloca i1, i1 0
+  %nop13764 = alloca i1, i1 0
+  %nop13765 = alloca i1, i1 0
+  %nop13766 = alloca i1, i1 0
+  %nop13767 = alloca i1, i1 0
+  %nop13768 = alloca i1, i1 0
+  %nop13769 = alloca i1, i1 0
+  %nop13770 = alloca i1, i1 0
+  %nop13771 = alloca i1, i1 0
+  %nop13772 = alloca i1, i1 0
+  %nop13773 = alloca i1, i1 0
+  %nop13774 = alloca i1, i1 0
+  %nop13775 = alloca i1, i1 0
+  %nop13776 = alloca i1, i1 0
+  %nop13777 = alloca i1, i1 0
+  %nop13778 = alloca i1, i1 0
+  %nop13779 = alloca i1, i1 0
+  %nop13780 = alloca i1, i1 0
+  %nop13781 = alloca i1, i1 0
+  %nop13782 = alloca i1, i1 0
+  %nop13783 = alloca i1, i1 0
+  %nop13784 = alloca i1, i1 0
+  %nop13785 = alloca i1, i1 0
+  %nop13786 = alloca i1, i1 0
+  %nop13787 = alloca i1, i1 0
+  %nop13788 = alloca i1, i1 0
+  %nop13789 = alloca i1, i1 0
+  %nop13790 = alloca i1, i1 0
+  %nop13791 = alloca i1, i1 0
+  %nop13792 = alloca i1, i1 0
+  %nop13793 = alloca i1, i1 0
+  %nop13794 = alloca i1, i1 0
+  %nop13795 = alloca i1, i1 0
+  %nop13796 = alloca i1, i1 0
+  %nop13797 = alloca i1, i1 0
+  %nop13798 = alloca i1, i1 0
+  %nop13799 = alloca i1, i1 0
+  %nop13800 = alloca i1, i1 0
+  %nop13801 = alloca i1, i1 0
+  %nop13802 = alloca i1, i1 0
+  %nop13803 = alloca i1, i1 0
+  %nop13804 = alloca i1, i1 0
+  %nop13805 = alloca i1, i1 0
+  %nop13806 = alloca i1, i1 0
+  %nop13807 = alloca i1, i1 0
+  %nop13808 = alloca i1, i1 0
+  %nop13809 = alloca i1, i1 0
+  %nop13810 = alloca i1, i1 0
+  %nop13811 = alloca i1, i1 0
+  %nop13812 = alloca i1, i1 0
+  %nop13813 = alloca i1, i1 0
+  %nop13814 = alloca i1, i1 0
+  %nop13815 = alloca i1, i1 0
+  %nop13816 = alloca i1, i1 0
+  %nop13817 = alloca i1, i1 0
+  %nop13818 = alloca i1, i1 0
+  %nop13819 = alloca i1, i1 0
+  %nop13820 = alloca i1, i1 0
+  %nop13821 = alloca i1, i1 0
+  %nop13822 = alloca i1, i1 0
+  %nop13823 = alloca i1, i1 0
+  %nop13824 = alloca i1, i1 0
+  %nop13825 = alloca i1, i1 0
+  %nop13826 = alloca i1, i1 0
+  %nop13827 = alloca i1, i1 0
+  %nop13828 = alloca i1, i1 0
+  %nop13829 = alloca i1, i1 0
+  %nop13830 = alloca i1, i1 0
+  %nop13831 = alloca i1, i1 0
+  %nop13832 = alloca i1, i1 0
+  %nop13833 = alloca i1, i1 0
+  %nop13834 = alloca i1, i1 0
+  %nop13835 = alloca i1, i1 0
+  %nop13836 = alloca i1, i1 0
+  %nop13837 = alloca i1, i1 0
+  %nop13838 = alloca i1, i1 0
+  %nop13839 = alloca i1, i1 0
+  %nop13840 = alloca i1, i1 0
+  %nop13841 = alloca i1, i1 0
+  %nop13842 = alloca i1, i1 0
+  %nop13843 = alloca i1, i1 0
+  %nop13844 = alloca i1, i1 0
+  %nop13845 = alloca i1, i1 0
+  %nop13846 = alloca i1, i1 0
+  %nop13847 = alloca i1, i1 0
+  %nop13848 = alloca i1, i1 0
+  %nop13849 = alloca i1, i1 0
+  %nop13850 = alloca i1, i1 0
+  %nop13851 = alloca i1, i1 0
+  %nop13852 = alloca i1, i1 0
+  %nop13853 = alloca i1, i1 0
+  %nop13854 = alloca i1, i1 0
+  %nop13855 = alloca i1, i1 0
+  %nop13856 = alloca i1, i1 0
+  %nop13857 = alloca i1, i1 0
+  %nop13858 = alloca i1, i1 0
+  %nop13859 = alloca i1, i1 0
+  %nop13860 = alloca i1, i1 0
+  %nop13861 = alloca i1, i1 0
+  %nop13862 = alloca i1, i1 0
+  %nop13863 = alloca i1, i1 0
+  %nop13864 = alloca i1, i1 0
+  %nop13865 = alloca i1, i1 0
+  %nop13866 = alloca i1, i1 0
+  %nop13867 = alloca i1, i1 0
+  %nop13868 = alloca i1, i1 0
+  %nop13869 = alloca i1, i1 0
+  %nop13870 = alloca i1, i1 0
+  %nop13871 = alloca i1, i1 0
+  %nop13872 = alloca i1, i1 0
+  %nop13873 = alloca i1, i1 0
+  %nop13874 = alloca i1, i1 0
+  %nop13875 = alloca i1, i1 0
+  %nop13876 = alloca i1, i1 0
+  %nop13877 = alloca i1, i1 0
+  %nop13878 = alloca i1, i1 0
+  %nop13879 = alloca i1, i1 0
+  %nop13880 = alloca i1, i1 0
+  %nop13881 = alloca i1, i1 0
+  %nop13882 = alloca i1, i1 0
+  %nop13883 = alloca i1, i1 0
+  %nop13884 = alloca i1, i1 0
+  %nop13885 = alloca i1, i1 0
+  %nop13886 = alloca i1, i1 0
+  %nop13887 = alloca i1, i1 0
+  %nop13888 = alloca i1, i1 0
+  %nop13889 = alloca i1, i1 0
+  %nop13890 = alloca i1, i1 0
+  %nop13891 = alloca i1, i1 0
+  %nop13892 = alloca i1, i1 0
+  %nop13893 = alloca i1, i1 0
+  %nop13894 = alloca i1, i1 0
+  %nop13895 = alloca i1, i1 0
+  %nop13896 = alloca i1, i1 0
+  %nop13897 = alloca i1, i1 0
+  %nop13898 = alloca i1, i1 0
+  %nop13899 = alloca i1, i1 0
+  %nop13900 = alloca i1, i1 0
+  %nop13901 = alloca i1, i1 0
+  %nop13902 = alloca i1, i1 0
+  %nop13903 = alloca i1, i1 0
+  %nop13904 = alloca i1, i1 0
+  %nop13905 = alloca i1, i1 0
+  %nop13906 = alloca i1, i1 0
+  %nop13907 = alloca i1, i1 0
+  %nop13908 = alloca i1, i1 0
+  %nop13909 = alloca i1, i1 0
+  %nop13910 = alloca i1, i1 0
+  %nop13911 = alloca i1, i1 0
+  %nop13912 = alloca i1, i1 0
+  %nop13913 = alloca i1, i1 0
+  %nop13914 = alloca i1, i1 0
+  %nop13915 = alloca i1, i1 0
+  %nop13916 = alloca i1, i1 0
+  %nop13917 = alloca i1, i1 0
+  %nop13918 = alloca i1, i1 0
+  %nop13919 = alloca i1, i1 0
+  %nop13920 = alloca i1, i1 0
+  %nop13921 = alloca i1, i1 0
+  %nop13922 = alloca i1, i1 0
+  %nop13923 = alloca i1, i1 0
+  %nop13924 = alloca i1, i1 0
+  %nop13925 = alloca i1, i1 0
+  %nop13926 = alloca i1, i1 0
+  %nop13927 = alloca i1, i1 0
+  %nop13928 = alloca i1, i1 0
+  %nop13929 = alloca i1, i1 0
+  %nop13930 = alloca i1, i1 0
+  %nop13931 = alloca i1, i1 0
+  %nop13932 = alloca i1, i1 0
+  %nop13933 = alloca i1, i1 0
+  %nop13934 = alloca i1, i1 0
+  %nop13935 = alloca i1, i1 0
+  %nop13936 = alloca i1, i1 0
+  %nop13937 = alloca i1, i1 0
+  %nop13938 = alloca i1, i1 0
+  %nop13939 = alloca i1, i1 0
+  %nop13940 = alloca i1, i1 0
+  %nop13941 = alloca i1, i1 0
+  %nop13942 = alloca i1, i1 0
+  %nop13943 = alloca i1, i1 0
+  %nop13944 = alloca i1, i1 0
+  %nop13945 = alloca i1, i1 0
+  %nop13946 = alloca i1, i1 0
+  %nop13947 = alloca i1, i1 0
+  %nop13948 = alloca i1, i1 0
+  %nop13949 = alloca i1, i1 0
+  %nop13950 = alloca i1, i1 0
+  %nop13951 = alloca i1, i1 0
+  %nop13952 = alloca i1, i1 0
+  %nop13953 = alloca i1, i1 0
+  %nop13954 = alloca i1, i1 0
+  %nop13955 = alloca i1, i1 0
+  %nop13956 = alloca i1, i1 0
+  %nop13957 = alloca i1, i1 0
+  %nop13958 = alloca i1, i1 0
+  %nop13959 = alloca i1, i1 0
+  %nop13960 = alloca i1, i1 0
+  %nop13961 = alloca i1, i1 0
+  %nop13962 = alloca i1, i1 0
+  %nop13963 = alloca i1, i1 0
+  %nop13964 = alloca i1, i1 0
+  %nop13965 = alloca i1, i1 0
+  %nop13966 = alloca i1, i1 0
+  %nop13967 = alloca i1, i1 0
+  %nop13968 = alloca i1, i1 0
+  %nop13969 = alloca i1, i1 0
+  %nop13970 = alloca i1, i1 0
+  %nop13971 = alloca i1, i1 0
+  %nop13972 = alloca i1, i1 0
+  %nop13973 = alloca i1, i1 0
+  %nop13974 = alloca i1, i1 0
+  %nop13975 = alloca i1, i1 0
+  %nop13976 = alloca i1, i1 0
+  %nop13977 = alloca i1, i1 0
+  %nop13978 = alloca i1, i1 0
+  %nop13979 = alloca i1, i1 0
+  %nop13980 = alloca i1, i1 0
+  %nop13981 = alloca i1, i1 0
+  %nop13982 = alloca i1, i1 0
+  %nop13983 = alloca i1, i1 0
+  %nop13984 = alloca i1, i1 0
+  %nop13985 = alloca i1, i1 0
+  %nop13986 = alloca i1, i1 0
+  %nop13987 = alloca i1, i1 0
+  %nop13988 = alloca i1, i1 0
+  %nop13989 = alloca i1, i1 0
+  %nop13990 = alloca i1, i1 0
+  %nop13991 = alloca i1, i1 0
+  %nop13992 = alloca i1, i1 0
+  %nop13993 = alloca i1, i1 0
+  %nop13994 = alloca i1, i1 0
+  %nop13995 = alloca i1, i1 0
+  %nop13996 = alloca i1, i1 0
+  %nop13997 = alloca i1, i1 0
+  %nop13998 = alloca i1, i1 0
+  %nop13999 = alloca i1, i1 0
+  %nop14000 = alloca i1, i1 0
+  %nop14001 = alloca i1, i1 0
+  %nop14002 = alloca i1, i1 0
+  %nop14003 = alloca i1, i1 0
+  %nop14004 = alloca i1, i1 0
+  %nop14005 = alloca i1, i1 0
+  %nop14006 = alloca i1, i1 0
+  %nop14007 = alloca i1, i1 0
+  %nop14008 = alloca i1, i1 0
+  %nop14009 = alloca i1, i1 0
+  %nop14010 = alloca i1, i1 0
+  %nop14011 = alloca i1, i1 0
+  %nop14012 = alloca i1, i1 0
+  %nop14013 = alloca i1, i1 0
+  %nop14014 = alloca i1, i1 0
+  %nop14015 = alloca i1, i1 0
+  %nop14016 = alloca i1, i1 0
+  %nop14017 = alloca i1, i1 0
+  %nop14018 = alloca i1, i1 0
+  %nop14019 = alloca i1, i1 0
+  %nop14020 = alloca i1, i1 0
+  %nop14021 = alloca i1, i1 0
+  %nop14022 = alloca i1, i1 0
+  %nop14023 = alloca i1, i1 0
+  %nop14024 = alloca i1, i1 0
+  %nop14025 = alloca i1, i1 0
+  %nop14026 = alloca i1, i1 0
+  %nop14027 = alloca i1, i1 0
+  %nop14028 = alloca i1, i1 0
+  %nop14029 = alloca i1, i1 0
+  %nop14030 = alloca i1, i1 0
+  %nop14031 = alloca i1, i1 0
+  %nop14032 = alloca i1, i1 0
+  %nop14033 = alloca i1, i1 0
+  %nop14034 = alloca i1, i1 0
+  %nop14035 = alloca i1, i1 0
+  %nop14036 = alloca i1, i1 0
+  %nop14037 = alloca i1, i1 0
+  %nop14038 = alloca i1, i1 0
+  %nop14039 = alloca i1, i1 0
+  %nop14040 = alloca i1, i1 0
+  %nop14041 = alloca i1, i1 0
+  %nop14042 = alloca i1, i1 0
+  %nop14043 = alloca i1, i1 0
+  %nop14044 = alloca i1, i1 0
+  %nop14045 = alloca i1, i1 0
+  %nop14046 = alloca i1, i1 0
+  %nop14047 = alloca i1, i1 0
+  %nop14048 = alloca i1, i1 0
+  %nop14049 = alloca i1, i1 0
+  %nop14050 = alloca i1, i1 0
+  %nop14051 = alloca i1, i1 0
+  %nop14052 = alloca i1, i1 0
+  %nop14053 = alloca i1, i1 0
+  %nop14054 = alloca i1, i1 0
+  %nop14055 = alloca i1, i1 0
+  %nop14056 = alloca i1, i1 0
+  %nop14057 = alloca i1, i1 0
+  %nop14058 = alloca i1, i1 0
+  %nop14059 = alloca i1, i1 0
+  %nop14060 = alloca i1, i1 0
+  %nop14061 = alloca i1, i1 0
+  %nop14062 = alloca i1, i1 0
+  %nop14063 = alloca i1, i1 0
+  %nop14064 = alloca i1, i1 0
+  %nop14065 = alloca i1, i1 0
+  %nop14066 = alloca i1, i1 0
+  %nop14067 = alloca i1, i1 0
+  %nop14068 = alloca i1, i1 0
+  %nop14069 = alloca i1, i1 0
+  %nop14070 = alloca i1, i1 0
+  %nop14071 = alloca i1, i1 0
+  %nop14072 = alloca i1, i1 0
+  %nop14073 = alloca i1, i1 0
+  %nop14074 = alloca i1, i1 0
+  %nop14075 = alloca i1, i1 0
+  %nop14076 = alloca i1, i1 0
+  %nop14077 = alloca i1, i1 0
+  %nop14078 = alloca i1, i1 0
+  %nop14079 = alloca i1, i1 0
+  %nop14080 = alloca i1, i1 0
+  %nop14081 = alloca i1, i1 0
+  %nop14082 = alloca i1, i1 0
+  %nop14083 = alloca i1, i1 0
+  %nop14084 = alloca i1, i1 0
+  %nop14085 = alloca i1, i1 0
+  %nop14086 = alloca i1, i1 0
+  %nop14087 = alloca i1, i1 0
+  %nop14088 = alloca i1, i1 0
+  %nop14089 = alloca i1, i1 0
+  %nop14090 = alloca i1, i1 0
+  %nop14091 = alloca i1, i1 0
+  %nop14092 = alloca i1, i1 0
+  %nop14093 = alloca i1, i1 0
+  %nop14094 = alloca i1, i1 0
+  %nop14095 = alloca i1, i1 0
+  %nop14096 = alloca i1, i1 0
+  %nop14097 = alloca i1, i1 0
+  %nop14098 = alloca i1, i1 0
+  %nop14099 = alloca i1, i1 0
+  %nop14100 = alloca i1, i1 0
+  %nop14101 = alloca i1, i1 0
+  %nop14102 = alloca i1, i1 0
+  %nop14103 = alloca i1, i1 0
+  %nop14104 = alloca i1, i1 0
+  %nop14105 = alloca i1, i1 0
+  %nop14106 = alloca i1, i1 0
+  %nop14107 = alloca i1, i1 0
+  %nop14108 = alloca i1, i1 0
+  %nop14109 = alloca i1, i1 0
+  %nop14110 = alloca i1, i1 0
+  %nop14111 = alloca i1, i1 0
+  %nop14112 = alloca i1, i1 0
+  %nop14113 = alloca i1, i1 0
+  %nop14114 = alloca i1, i1 0
+  %nop14115 = alloca i1, i1 0
+  %nop14116 = alloca i1, i1 0
+  %nop14117 = alloca i1, i1 0
+  %nop14118 = alloca i1, i1 0
+  %nop14119 = alloca i1, i1 0
+  %nop14120 = alloca i1, i1 0
+  %nop14121 = alloca i1, i1 0
+  %nop14122 = alloca i1, i1 0
+  %nop14123 = alloca i1, i1 0
+  %nop14124 = alloca i1, i1 0
+  %nop14125 = alloca i1, i1 0
+  %nop14126 = alloca i1, i1 0
+  %nop14127 = alloca i1, i1 0
+  %nop14128 = alloca i1, i1 0
+  %nop14129 = alloca i1, i1 0
+  %nop14130 = alloca i1, i1 0
+  %nop14131 = alloca i1, i1 0
+  %nop14132 = alloca i1, i1 0
+  %nop14133 = alloca i1, i1 0
+  %nop14134 = alloca i1, i1 0
+  %nop14135 = alloca i1, i1 0
+  %nop14136 = alloca i1, i1 0
+  %nop14137 = alloca i1, i1 0
+  %nop14138 = alloca i1, i1 0
+  %nop14139 = alloca i1, i1 0
+  %nop14140 = alloca i1, i1 0
+  %nop14141 = alloca i1, i1 0
+  %nop14142 = alloca i1, i1 0
+  %nop14143 = alloca i1, i1 0
+  %nop14144 = alloca i1, i1 0
+  %nop14145 = alloca i1, i1 0
+  %nop14146 = alloca i1, i1 0
+  %nop14147 = alloca i1, i1 0
+  %nop14148 = alloca i1, i1 0
+  %nop14149 = alloca i1, i1 0
+  %nop14150 = alloca i1, i1 0
+  %nop14151 = alloca i1, i1 0
+  %nop14152 = alloca i1, i1 0
+  %nop14153 = alloca i1, i1 0
+  %nop14154 = alloca i1, i1 0
+  %nop14155 = alloca i1, i1 0
+  %nop14156 = alloca i1, i1 0
+  %nop14157 = alloca i1, i1 0
+  %nop14158 = alloca i1, i1 0
+  %nop14159 = alloca i1, i1 0
+  %nop14160 = alloca i1, i1 0
+  %nop14161 = alloca i1, i1 0
+  %nop14162 = alloca i1, i1 0
+  %nop14163 = alloca i1, i1 0
+  %nop14164 = alloca i1, i1 0
+  %nop14165 = alloca i1, i1 0
+  %nop14166 = alloca i1, i1 0
+  %nop14167 = alloca i1, i1 0
+  %nop14168 = alloca i1, i1 0
+  %nop14169 = alloca i1, i1 0
+  %nop14170 = alloca i1, i1 0
+  %nop14171 = alloca i1, i1 0
+  %nop14172 = alloca i1, i1 0
+  %nop14173 = alloca i1, i1 0
+  %nop14174 = alloca i1, i1 0
+  %nop14175 = alloca i1, i1 0
+  %nop14176 = alloca i1, i1 0
+  %nop14177 = alloca i1, i1 0
+  %nop14178 = alloca i1, i1 0
+  %nop14179 = alloca i1, i1 0
+  %nop14180 = alloca i1, i1 0
+  %nop14181 = alloca i1, i1 0
+  %nop14182 = alloca i1, i1 0
+  %nop14183 = alloca i1, i1 0
+  %nop14184 = alloca i1, i1 0
+  %nop14185 = alloca i1, i1 0
+  %nop14186 = alloca i1, i1 0
+  %nop14187 = alloca i1, i1 0
+  %nop14188 = alloca i1, i1 0
+  %nop14189 = alloca i1, i1 0
+  %nop14190 = alloca i1, i1 0
+  %nop14191 = alloca i1, i1 0
+  %nop14192 = alloca i1, i1 0
+  %nop14193 = alloca i1, i1 0
+  %nop14194 = alloca i1, i1 0
+  %nop14195 = alloca i1, i1 0
+  %nop14196 = alloca i1, i1 0
+  %nop14197 = alloca i1, i1 0
+  %nop14198 = alloca i1, i1 0
+  %nop14199 = alloca i1, i1 0
+  %nop14200 = alloca i1, i1 0
+  %nop14201 = alloca i1, i1 0
+  %nop14202 = alloca i1, i1 0
+  %nop14203 = alloca i1, i1 0
+  %nop14204 = alloca i1, i1 0
+  %nop14205 = alloca i1, i1 0
+  %nop14206 = alloca i1, i1 0
+  %nop14207 = alloca i1, i1 0
+  %nop14208 = alloca i1, i1 0
+  %nop14209 = alloca i1, i1 0
+  %nop14210 = alloca i1, i1 0
+  %nop14211 = alloca i1, i1 0
+  %nop14212 = alloca i1, i1 0
+  %nop14213 = alloca i1, i1 0
+  %nop14214 = alloca i1, i1 0
+  %nop14215 = alloca i1, i1 0
+  %nop14216 = alloca i1, i1 0
+  %nop14217 = alloca i1, i1 0
+  %nop14218 = alloca i1, i1 0
+  %nop14219 = alloca i1, i1 0
+  %nop14220 = alloca i1, i1 0
+  %nop14221 = alloca i1, i1 0
+  %nop14222 = alloca i1, i1 0
+  %nop14223 = alloca i1, i1 0
+  %nop14224 = alloca i1, i1 0
+  %nop14225 = alloca i1, i1 0
+  %nop14226 = alloca i1, i1 0
+  %nop14227 = alloca i1, i1 0
+  %nop14228 = alloca i1, i1 0
+  %nop14229 = alloca i1, i1 0
+  %nop14230 = alloca i1, i1 0
+  %nop14231 = alloca i1, i1 0
+  %nop14232 = alloca i1, i1 0
+  %nop14233 = alloca i1, i1 0
+  %nop14234 = alloca i1, i1 0
+  %nop14235 = alloca i1, i1 0
+  %nop14236 = alloca i1, i1 0
+  %nop14237 = alloca i1, i1 0
+  %nop14238 = alloca i1, i1 0
+  %nop14239 = alloca i1, i1 0
+  %nop14240 = alloca i1, i1 0
+  %nop14241 = alloca i1, i1 0
+  %nop14242 = alloca i1, i1 0
+  %nop14243 = alloca i1, i1 0
+  %nop14244 = alloca i1, i1 0
+  %nop14245 = alloca i1, i1 0
+  %nop14246 = alloca i1, i1 0
+  %nop14247 = alloca i1, i1 0
+  %nop14248 = alloca i1, i1 0
+  %nop14249 = alloca i1, i1 0
+  %nop14250 = alloca i1, i1 0
+  %nop14251 = alloca i1, i1 0
+  %nop14252 = alloca i1, i1 0
+  %nop14253 = alloca i1, i1 0
+  %nop14254 = alloca i1, i1 0
+  %nop14255 = alloca i1, i1 0
+  %nop14256 = alloca i1, i1 0
+  %nop14257 = alloca i1, i1 0
+  %nop14258 = alloca i1, i1 0
+  %nop14259 = alloca i1, i1 0
+  %nop14260 = alloca i1, i1 0
+  %nop14261 = alloca i1, i1 0
+  %nop14262 = alloca i1, i1 0
+  %nop14263 = alloca i1, i1 0
+  %nop14264 = alloca i1, i1 0
+  %nop14265 = alloca i1, i1 0
+  %nop14266 = alloca i1, i1 0
+  %nop14267 = alloca i1, i1 0
+  %nop14268 = alloca i1, i1 0
+  %nop14269 = alloca i1, i1 0
+  %nop14270 = alloca i1, i1 0
+  %nop14271 = alloca i1, i1 0
+  %nop14272 = alloca i1, i1 0
+  %nop14273 = alloca i1, i1 0
+  %nop14274 = alloca i1, i1 0
+  %nop14275 = alloca i1, i1 0
+  %nop14276 = alloca i1, i1 0
+  %nop14277 = alloca i1, i1 0
+  %nop14278 = alloca i1, i1 0
+  %nop14279 = alloca i1, i1 0
+  %nop14280 = alloca i1, i1 0
+  %nop14281 = alloca i1, i1 0
+  %nop14282 = alloca i1, i1 0
+  %nop14283 = alloca i1, i1 0
+  %nop14284 = alloca i1, i1 0
+  %nop14285 = alloca i1, i1 0
+  %nop14286 = alloca i1, i1 0
+  %nop14287 = alloca i1, i1 0
+  %nop14288 = alloca i1, i1 0
+  %nop14289 = alloca i1, i1 0
+  %nop14290 = alloca i1, i1 0
+  %nop14291 = alloca i1, i1 0
+  %nop14292 = alloca i1, i1 0
+  %nop14293 = alloca i1, i1 0
+  %nop14294 = alloca i1, i1 0
+  %nop14295 = alloca i1, i1 0
+  %nop14296 = alloca i1, i1 0
+  %nop14297 = alloca i1, i1 0
+  %nop14298 = alloca i1, i1 0
+  %nop14299 = alloca i1, i1 0
+  %nop14300 = alloca i1, i1 0
+  %nop14301 = alloca i1, i1 0
+  %nop14302 = alloca i1, i1 0
+  %nop14303 = alloca i1, i1 0
+  %nop14304 = alloca i1, i1 0
+  %nop14305 = alloca i1, i1 0
+  %nop14306 = alloca i1, i1 0
+  %nop14307 = alloca i1, i1 0
+  %nop14308 = alloca i1, i1 0
+  %nop14309 = alloca i1, i1 0
+  %nop14310 = alloca i1, i1 0
+  %nop14311 = alloca i1, i1 0
+  %nop14312 = alloca i1, i1 0
+  %nop14313 = alloca i1, i1 0
+  %nop14314 = alloca i1, i1 0
+  %nop14315 = alloca i1, i1 0
+  %nop14316 = alloca i1, i1 0
+  %nop14317 = alloca i1, i1 0
+  %nop14318 = alloca i1, i1 0
+  %nop14319 = alloca i1, i1 0
+  %nop14320 = alloca i1, i1 0
+  %nop14321 = alloca i1, i1 0
+  %nop14322 = alloca i1, i1 0
+  %nop14323 = alloca i1, i1 0
+  %nop14324 = alloca i1, i1 0
+  %nop14325 = alloca i1, i1 0
+  %nop14326 = alloca i1, i1 0
+  %nop14327 = alloca i1, i1 0
+  %nop14328 = alloca i1, i1 0
+  %nop14329 = alloca i1, i1 0
+  %nop14330 = alloca i1, i1 0
+  %nop14331 = alloca i1, i1 0
+  %nop14332 = alloca i1, i1 0
+  %nop14333 = alloca i1, i1 0
+  %nop14334 = alloca i1, i1 0
+  %nop14335 = alloca i1, i1 0
+  %nop14336 = alloca i1, i1 0
+  %nop14337 = alloca i1, i1 0
+  %nop14338 = alloca i1, i1 0
+  %nop14339 = alloca i1, i1 0
+  %nop14340 = alloca i1, i1 0
+  %nop14341 = alloca i1, i1 0
+  %nop14342 = alloca i1, i1 0
+  %nop14343 = alloca i1, i1 0
+  %nop14344 = alloca i1, i1 0
+  %nop14345 = alloca i1, i1 0
+  %nop14346 = alloca i1, i1 0
+  %nop14347 = alloca i1, i1 0
+  %nop14348 = alloca i1, i1 0
+  %nop14349 = alloca i1, i1 0
+  %nop14350 = alloca i1, i1 0
+  %nop14351 = alloca i1, i1 0
+  %nop14352 = alloca i1, i1 0
+  %nop14353 = alloca i1, i1 0
+  %nop14354 = alloca i1, i1 0
+  %nop14355 = alloca i1, i1 0
+  %nop14356 = alloca i1, i1 0
+  %nop14357 = alloca i1, i1 0
+  %nop14358 = alloca i1, i1 0
+  %nop14359 = alloca i1, i1 0
+  %nop14360 = alloca i1, i1 0
+  %nop14361 = alloca i1, i1 0
+  %nop14362 = alloca i1, i1 0
+  %nop14363 = alloca i1, i1 0
+  %nop14364 = alloca i1, i1 0
+  %nop14365 = alloca i1, i1 0
+  %nop14366 = alloca i1, i1 0
+  %nop14367 = alloca i1, i1 0
+  %nop14368 = alloca i1, i1 0
+  %nop14369 = alloca i1, i1 0
+  %nop14370 = alloca i1, i1 0
+  %nop14371 = alloca i1, i1 0
+  %nop14372 = alloca i1, i1 0
+  %nop14373 = alloca i1, i1 0
+  %nop14374 = alloca i1, i1 0
+  %nop14375 = alloca i1, i1 0
+  %nop14376 = alloca i1, i1 0
+  %nop14377 = alloca i1, i1 0
+  %nop14378 = alloca i1, i1 0
+  %nop14379 = alloca i1, i1 0
+  %nop14380 = alloca i1, i1 0
+  %nop14381 = alloca i1, i1 0
+  %nop14382 = alloca i1, i1 0
+  %nop14383 = alloca i1, i1 0
+  %nop14384 = alloca i1, i1 0
+  %nop14385 = alloca i1, i1 0
+  %nop14386 = alloca i1, i1 0
+  %nop14387 = alloca i1, i1 0
+  %nop14388 = alloca i1, i1 0
+  %nop14389 = alloca i1, i1 0
+  %nop14390 = alloca i1, i1 0
+  %nop14391 = alloca i1, i1 0
+  %nop14392 = alloca i1, i1 0
+  %nop14393 = alloca i1, i1 0
+  %nop14394 = alloca i1, i1 0
+  %nop14395 = alloca i1, i1 0
+  %nop14396 = alloca i1, i1 0
+  %nop14397 = alloca i1, i1 0
+  %nop14398 = alloca i1, i1 0
+  %nop14399 = alloca i1, i1 0
+  %nop14400 = alloca i1, i1 0
+  %nop14401 = alloca i1, i1 0
+  %nop14402 = alloca i1, i1 0
+  %nop14403 = alloca i1, i1 0
+  %nop14404 = alloca i1, i1 0
+  %nop14405 = alloca i1, i1 0
+  %nop14406 = alloca i1, i1 0
+  %nop14407 = alloca i1, i1 0
+  %nop14408 = alloca i1, i1 0
+  %nop14409 = alloca i1, i1 0
+  %nop14410 = alloca i1, i1 0
+  %nop14411 = alloca i1, i1 0
+  %nop14412 = alloca i1, i1 0
+  %nop14413 = alloca i1, i1 0
+  %nop14414 = alloca i1, i1 0
+  %nop14415 = alloca i1, i1 0
+  %nop14416 = alloca i1, i1 0
+  %nop14417 = alloca i1, i1 0
+  %nop14418 = alloca i1, i1 0
+  %nop14419 = alloca i1, i1 0
+  %nop14420 = alloca i1, i1 0
+  %nop14421 = alloca i1, i1 0
+  %nop14422 = alloca i1, i1 0
+  %nop14423 = alloca i1, i1 0
+  %nop14424 = alloca i1, i1 0
+  %nop14425 = alloca i1, i1 0
+  %nop14426 = alloca i1, i1 0
+  %nop14427 = alloca i1, i1 0
+  %nop14428 = alloca i1, i1 0
+  %nop14429 = alloca i1, i1 0
+  %nop14430 = alloca i1, i1 0
+  %nop14431 = alloca i1, i1 0
+  %nop14432 = alloca i1, i1 0
+  %nop14433 = alloca i1, i1 0
+  %nop14434 = alloca i1, i1 0
+  %nop14435 = alloca i1, i1 0
+  %nop14436 = alloca i1, i1 0
+  %nop14437 = alloca i1, i1 0
+  %nop14438 = alloca i1, i1 0
+  %nop14439 = alloca i1, i1 0
+  %nop14440 = alloca i1, i1 0
+  %nop14441 = alloca i1, i1 0
+  %nop14442 = alloca i1, i1 0
+  %nop14443 = alloca i1, i1 0
+  %nop14444 = alloca i1, i1 0
+  %nop14445 = alloca i1, i1 0
+  %nop14446 = alloca i1, i1 0
+  %nop14447 = alloca i1, i1 0
+  %nop14448 = alloca i1, i1 0
+  %nop14449 = alloca i1, i1 0
+  %nop14450 = alloca i1, i1 0
+  %nop14451 = alloca i1, i1 0
+  %nop14452 = alloca i1, i1 0
+  %nop14453 = alloca i1, i1 0
+  %nop14454 = alloca i1, i1 0
+  %nop14455 = alloca i1, i1 0
+  %nop14456 = alloca i1, i1 0
+  %nop14457 = alloca i1, i1 0
+  %nop14458 = alloca i1, i1 0
+  %nop14459 = alloca i1, i1 0
+  %nop14460 = alloca i1, i1 0
+  %nop14461 = alloca i1, i1 0
+  %nop14462 = alloca i1, i1 0
+  %nop14463 = alloca i1, i1 0
+  %nop14464 = alloca i1, i1 0
+  %nop14465 = alloca i1, i1 0
+  %nop14466 = alloca i1, i1 0
+  %nop14467 = alloca i1, i1 0
+  %nop14468 = alloca i1, i1 0
+  %nop14469 = alloca i1, i1 0
+  %nop14470 = alloca i1, i1 0
+  %nop14471 = alloca i1, i1 0
+  %nop14472 = alloca i1, i1 0
+  %nop14473 = alloca i1, i1 0
+  %nop14474 = alloca i1, i1 0
+  %nop14475 = alloca i1, i1 0
+  %nop14476 = alloca i1, i1 0
+  %nop14477 = alloca i1, i1 0
+  %nop14478 = alloca i1, i1 0
+  %nop14479 = alloca i1, i1 0
+  %nop14480 = alloca i1, i1 0
+  %nop14481 = alloca i1, i1 0
+  %nop14482 = alloca i1, i1 0
+  %nop14483 = alloca i1, i1 0
+  %nop14484 = alloca i1, i1 0
+  %nop14485 = alloca i1, i1 0
+  %nop14486 = alloca i1, i1 0
+  %nop14487 = alloca i1, i1 0
+  %nop14488 = alloca i1, i1 0
+  %nop14489 = alloca i1, i1 0
+  %nop14490 = alloca i1, i1 0
+  %nop14491 = alloca i1, i1 0
+  %nop14492 = alloca i1, i1 0
+  %nop14493 = alloca i1, i1 0
+  %nop14494 = alloca i1, i1 0
+  %nop14495 = alloca i1, i1 0
+  %nop14496 = alloca i1, i1 0
+  %nop14497 = alloca i1, i1 0
+  %nop14498 = alloca i1, i1 0
+  %nop14499 = alloca i1, i1 0
+  %nop14500 = alloca i1, i1 0
+  %nop14501 = alloca i1, i1 0
+  %nop14502 = alloca i1, i1 0
+  %nop14503 = alloca i1, i1 0
+  %nop14504 = alloca i1, i1 0
+  %nop14505 = alloca i1, i1 0
+  %nop14506 = alloca i1, i1 0
+  %nop14507 = alloca i1, i1 0
+  %nop14508 = alloca i1, i1 0
+  %nop14509 = alloca i1, i1 0
+  %nop14510 = alloca i1, i1 0
+  %nop14511 = alloca i1, i1 0
+  %nop14512 = alloca i1, i1 0
+  %nop14513 = alloca i1, i1 0
+  %nop14514 = alloca i1, i1 0
+  %nop14515 = alloca i1, i1 0
+  %nop14516 = alloca i1, i1 0
+  %nop14517 = alloca i1, i1 0
+  %nop14518 = alloca i1, i1 0
+  %nop14519 = alloca i1, i1 0
+  %nop14520 = alloca i1, i1 0
+  %nop14521 = alloca i1, i1 0
+  %nop14522 = alloca i1, i1 0
+  %nop14523 = alloca i1, i1 0
+  %nop14524 = alloca i1, i1 0
+  %nop14525 = alloca i1, i1 0
+  %nop14526 = alloca i1, i1 0
+  %nop14527 = alloca i1, i1 0
+  %nop14528 = alloca i1, i1 0
+  %nop14529 = alloca i1, i1 0
+  %nop14530 = alloca i1, i1 0
+  %nop14531 = alloca i1, i1 0
+  %nop14532 = alloca i1, i1 0
+  %nop14533 = alloca i1, i1 0
+  %nop14534 = alloca i1, i1 0
+  %nop14535 = alloca i1, i1 0
+  %nop14536 = alloca i1, i1 0
+  %nop14537 = alloca i1, i1 0
+  %nop14538 = alloca i1, i1 0
+  %nop14539 = alloca i1, i1 0
+  %nop14540 = alloca i1, i1 0
+  %nop14541 = alloca i1, i1 0
+  %nop14542 = alloca i1, i1 0
+  %nop14543 = alloca i1, i1 0
+  %nop14544 = alloca i1, i1 0
+  %nop14545 = alloca i1, i1 0
+  %nop14546 = alloca i1, i1 0
+  %nop14547 = alloca i1, i1 0
+  %nop14548 = alloca i1, i1 0
+  %nop14549 = alloca i1, i1 0
+  %nop14550 = alloca i1, i1 0
+  %nop14551 = alloca i1, i1 0
+  %nop14552 = alloca i1, i1 0
+  %nop14553 = alloca i1, i1 0
+  %nop14554 = alloca i1, i1 0
+  %nop14555 = alloca i1, i1 0
+  %nop14556 = alloca i1, i1 0
+  %nop14557 = alloca i1, i1 0
+  %nop14558 = alloca i1, i1 0
+  %nop14559 = alloca i1, i1 0
+  %nop14560 = alloca i1, i1 0
+  %nop14561 = alloca i1, i1 0
+  %nop14562 = alloca i1, i1 0
+  %nop14563 = alloca i1, i1 0
+  %nop14564 = alloca i1, i1 0
+  %nop14565 = alloca i1, i1 0
+  %nop14566 = alloca i1, i1 0
+  %nop14567 = alloca i1, i1 0
+  %nop14568 = alloca i1, i1 0
+  %nop14569 = alloca i1, i1 0
+  %nop14570 = alloca i1, i1 0
+  %nop14571 = alloca i1, i1 0
+  %nop14572 = alloca i1, i1 0
+  %nop14573 = alloca i1, i1 0
+  %nop14574 = alloca i1, i1 0
+  %nop14575 = alloca i1, i1 0
+  %nop14576 = alloca i1, i1 0
+  %nop14577 = alloca i1, i1 0
+  %nop14578 = alloca i1, i1 0
+  %nop14579 = alloca i1, i1 0
+  %nop14580 = alloca i1, i1 0
+  %nop14581 = alloca i1, i1 0
+  %nop14582 = alloca i1, i1 0
+  %nop14583 = alloca i1, i1 0
+  %nop14584 = alloca i1, i1 0
+  %nop14585 = alloca i1, i1 0
+  %nop14586 = alloca i1, i1 0
+  %nop14587 = alloca i1, i1 0
+  %nop14588 = alloca i1, i1 0
+  %nop14589 = alloca i1, i1 0
+  %nop14590 = alloca i1, i1 0
+  %nop14591 = alloca i1, i1 0
+  %nop14592 = alloca i1, i1 0
+  %nop14593 = alloca i1, i1 0
+  %nop14594 = alloca i1, i1 0
+  %nop14595 = alloca i1, i1 0
+  %nop14596 = alloca i1, i1 0
+  %nop14597 = alloca i1, i1 0
+  %nop14598 = alloca i1, i1 0
+  %nop14599 = alloca i1, i1 0
+  %nop14600 = alloca i1, i1 0
+  %nop14601 = alloca i1, i1 0
+  %nop14602 = alloca i1, i1 0
+  %nop14603 = alloca i1, i1 0
+  %nop14604 = alloca i1, i1 0
+  %nop14605 = alloca i1, i1 0
+  %nop14606 = alloca i1, i1 0
+  %nop14607 = alloca i1, i1 0
+  %nop14608 = alloca i1, i1 0
+  %nop14609 = alloca i1, i1 0
+  %nop14610 = alloca i1, i1 0
+  %nop14611 = alloca i1, i1 0
+  %nop14612 = alloca i1, i1 0
+  %nop14613 = alloca i1, i1 0
+  %nop14614 = alloca i1, i1 0
+  %nop14615 = alloca i1, i1 0
+  %nop14616 = alloca i1, i1 0
+  %nop14617 = alloca i1, i1 0
+  %nop14618 = alloca i1, i1 0
+  %nop14619 = alloca i1, i1 0
+  %nop14620 = alloca i1, i1 0
+  %nop14621 = alloca i1, i1 0
+  %nop14622 = alloca i1, i1 0
+  %nop14623 = alloca i1, i1 0
+  %nop14624 = alloca i1, i1 0
+  %nop14625 = alloca i1, i1 0
+  %nop14626 = alloca i1, i1 0
+  %nop14627 = alloca i1, i1 0
+  %nop14628 = alloca i1, i1 0
+  %nop14629 = alloca i1, i1 0
+  %nop14630 = alloca i1, i1 0
+  %nop14631 = alloca i1, i1 0
+  %nop14632 = alloca i1, i1 0
+  %nop14633 = alloca i1, i1 0
+  %nop14634 = alloca i1, i1 0
+  %nop14635 = alloca i1, i1 0
+  %nop14636 = alloca i1, i1 0
+  %nop14637 = alloca i1, i1 0
+  %nop14638 = alloca i1, i1 0
+  %nop14639 = alloca i1, i1 0
+  %nop14640 = alloca i1, i1 0
+  %nop14641 = alloca i1, i1 0
+  %nop14642 = alloca i1, i1 0
+  %nop14643 = alloca i1, i1 0
+  %nop14644 = alloca i1, i1 0
+  %nop14645 = alloca i1, i1 0
+  %nop14646 = alloca i1, i1 0
+  %nop14647 = alloca i1, i1 0
+  %nop14648 = alloca i1, i1 0
+  %nop14649 = alloca i1, i1 0
+  %nop14650 = alloca i1, i1 0
+  %nop14651 = alloca i1, i1 0
+  %nop14652 = alloca i1, i1 0
+  %nop14653 = alloca i1, i1 0
+  %nop14654 = alloca i1, i1 0
+  %nop14655 = alloca i1, i1 0
+  %nop14656 = alloca i1, i1 0
+  %nop14657 = alloca i1, i1 0
+  %nop14658 = alloca i1, i1 0
+  %nop14659 = alloca i1, i1 0
+  %nop14660 = alloca i1, i1 0
+  %nop14661 = alloca i1, i1 0
+  %nop14662 = alloca i1, i1 0
+  %nop14663 = alloca i1, i1 0
+  %nop14664 = alloca i1, i1 0
+  %nop14665 = alloca i1, i1 0
+  %nop14666 = alloca i1, i1 0
+  %nop14667 = alloca i1, i1 0
+  %nop14668 = alloca i1, i1 0
+  %nop14669 = alloca i1, i1 0
+  %nop14670 = alloca i1, i1 0
+  %nop14671 = alloca i1, i1 0
+  %nop14672 = alloca i1, i1 0
+  %nop14673 = alloca i1, i1 0
+  %nop14674 = alloca i1, i1 0
+  %nop14675 = alloca i1, i1 0
+  %nop14676 = alloca i1, i1 0
+  %nop14677 = alloca i1, i1 0
+  %nop14678 = alloca i1, i1 0
+  %nop14679 = alloca i1, i1 0
+  %nop14680 = alloca i1, i1 0
+  %nop14681 = alloca i1, i1 0
+  %nop14682 = alloca i1, i1 0
+  %nop14683 = alloca i1, i1 0
+  %nop14684 = alloca i1, i1 0
+  %nop14685 = alloca i1, i1 0
+  %nop14686 = alloca i1, i1 0
+  %nop14687 = alloca i1, i1 0
+  %nop14688 = alloca i1, i1 0
+  %nop14689 = alloca i1, i1 0
+  %nop14690 = alloca i1, i1 0
+  %nop14691 = alloca i1, i1 0
+  %nop14692 = alloca i1, i1 0
+  %nop14693 = alloca i1, i1 0
+  %nop14694 = alloca i1, i1 0
+  %nop14695 = alloca i1, i1 0
+  %nop14696 = alloca i1, i1 0
+  %nop14697 = alloca i1, i1 0
+  %nop14698 = alloca i1, i1 0
+  %nop14699 = alloca i1, i1 0
+  %nop14700 = alloca i1, i1 0
+  %nop14701 = alloca i1, i1 0
+  %nop14702 = alloca i1, i1 0
+  %nop14703 = alloca i1, i1 0
+  %nop14704 = alloca i1, i1 0
+  %nop14705 = alloca i1, i1 0
+  %nop14706 = alloca i1, i1 0
+  %nop14707 = alloca i1, i1 0
+  %nop14708 = alloca i1, i1 0
+  %nop14709 = alloca i1, i1 0
+  %nop14710 = alloca i1, i1 0
+  %nop14711 = alloca i1, i1 0
+  %nop14712 = alloca i1, i1 0
+  %nop14713 = alloca i1, i1 0
+  %nop14714 = alloca i1, i1 0
+  %nop14715 = alloca i1, i1 0
+  %nop14716 = alloca i1, i1 0
+  %nop14717 = alloca i1, i1 0
+  %nop14718 = alloca i1, i1 0
+  %nop14719 = alloca i1, i1 0
+  %nop14720 = alloca i1, i1 0
+  %nop14721 = alloca i1, i1 0
+  %nop14722 = alloca i1, i1 0
+  %nop14723 = alloca i1, i1 0
+  %nop14724 = alloca i1, i1 0
+  %nop14725 = alloca i1, i1 0
+  %nop14726 = alloca i1, i1 0
+  %nop14727 = alloca i1, i1 0
+  %nop14728 = alloca i1, i1 0
+  %nop14729 = alloca i1, i1 0
+  %nop14730 = alloca i1, i1 0
+  %nop14731 = alloca i1, i1 0
+  %nop14732 = alloca i1, i1 0
+  %nop14733 = alloca i1, i1 0
+  %nop14734 = alloca i1, i1 0
+  %nop14735 = alloca i1, i1 0
+  %nop14736 = alloca i1, i1 0
+  %nop14737 = alloca i1, i1 0
+  %nop14738 = alloca i1, i1 0
+  %nop14739 = alloca i1, i1 0
+  %nop14740 = alloca i1, i1 0
+  %nop14741 = alloca i1, i1 0
+  %nop14742 = alloca i1, i1 0
+  %nop14743 = alloca i1, i1 0
+  %nop14744 = alloca i1, i1 0
+  %nop14745 = alloca i1, i1 0
+  %nop14746 = alloca i1, i1 0
+  %nop14747 = alloca i1, i1 0
+  %nop14748 = alloca i1, i1 0
+  %nop14749 = alloca i1, i1 0
+  %nop14750 = alloca i1, i1 0
+  %nop14751 = alloca i1, i1 0
+  %nop14752 = alloca i1, i1 0
+  %nop14753 = alloca i1, i1 0
+  %nop14754 = alloca i1, i1 0
+  %nop14755 = alloca i1, i1 0
+  %nop14756 = alloca i1, i1 0
+  %nop14757 = alloca i1, i1 0
+  %nop14758 = alloca i1, i1 0
+  %nop14759 = alloca i1, i1 0
+  %nop14760 = alloca i1, i1 0
+  %nop14761 = alloca i1, i1 0
+  %nop14762 = alloca i1, i1 0
+  %nop14763 = alloca i1, i1 0
+  %nop14764 = alloca i1, i1 0
+  %nop14765 = alloca i1, i1 0
+  %nop14766 = alloca i1, i1 0
+  %nop14767 = alloca i1, i1 0
+  %nop14768 = alloca i1, i1 0
+  %nop14769 = alloca i1, i1 0
+  %nop14770 = alloca i1, i1 0
+  %nop14771 = alloca i1, i1 0
+  %nop14772 = alloca i1, i1 0
+  %nop14773 = alloca i1, i1 0
+  %nop14774 = alloca i1, i1 0
+  %nop14775 = alloca i1, i1 0
+  %nop14776 = alloca i1, i1 0
+  %nop14777 = alloca i1, i1 0
+  %nop14778 = alloca i1, i1 0
+  %nop14779 = alloca i1, i1 0
+  %nop14780 = alloca i1, i1 0
+  %nop14781 = alloca i1, i1 0
+  %nop14782 = alloca i1, i1 0
+  %nop14783 = alloca i1, i1 0
+  %nop14784 = alloca i1, i1 0
+  %nop14785 = alloca i1, i1 0
+  %nop14786 = alloca i1, i1 0
+  %nop14787 = alloca i1, i1 0
+  %nop14788 = alloca i1, i1 0
+  %nop14789 = alloca i1, i1 0
+  %nop14790 = alloca i1, i1 0
+  %nop14791 = alloca i1, i1 0
+  %nop14792 = alloca i1, i1 0
+  %nop14793 = alloca i1, i1 0
+  %nop14794 = alloca i1, i1 0
+  %nop14795 = alloca i1, i1 0
+  %nop14796 = alloca i1, i1 0
+  %nop14797 = alloca i1, i1 0
+  %nop14798 = alloca i1, i1 0
+  %nop14799 = alloca i1, i1 0
+  %nop14800 = alloca i1, i1 0
+  %nop14801 = alloca i1, i1 0
+  %nop14802 = alloca i1, i1 0
+  %nop14803 = alloca i1, i1 0
+  %nop14804 = alloca i1, i1 0
+  %nop14805 = alloca i1, i1 0
+  %nop14806 = alloca i1, i1 0
+  %nop14807 = alloca i1, i1 0
+  %nop14808 = alloca i1, i1 0
+  %nop14809 = alloca i1, i1 0
+  %nop14810 = alloca i1, i1 0
+  %nop14811 = alloca i1, i1 0
+  %nop14812 = alloca i1, i1 0
+  %nop14813 = alloca i1, i1 0
+  %nop14814 = alloca i1, i1 0
+  %nop14815 = alloca i1, i1 0
+  %nop14816 = alloca i1, i1 0
+  %nop14817 = alloca i1, i1 0
+  %nop14818 = alloca i1, i1 0
+  %nop14819 = alloca i1, i1 0
+  %nop14820 = alloca i1, i1 0
+  %nop14821 = alloca i1, i1 0
+  %nop14822 = alloca i1, i1 0
+  %nop14823 = alloca i1, i1 0
+  %nop14824 = alloca i1, i1 0
+  %nop14825 = alloca i1, i1 0
+  %nop14826 = alloca i1, i1 0
+  %nop14827 = alloca i1, i1 0
+  %nop14828 = alloca i1, i1 0
+  %nop14829 = alloca i1, i1 0
+  %nop14830 = alloca i1, i1 0
+  %nop14831 = alloca i1, i1 0
+  %nop14832 = alloca i1, i1 0
+  %nop14833 = alloca i1, i1 0
+  %nop14834 = alloca i1, i1 0
+  %nop14835 = alloca i1, i1 0
+  %nop14836 = alloca i1, i1 0
+  %nop14837 = alloca i1, i1 0
+  %nop14838 = alloca i1, i1 0
+  %nop14839 = alloca i1, i1 0
+  %nop14840 = alloca i1, i1 0
+  %nop14841 = alloca i1, i1 0
+  %nop14842 = alloca i1, i1 0
+  %nop14843 = alloca i1, i1 0
+  %nop14844 = alloca i1, i1 0
+  %nop14845 = alloca i1, i1 0
+  %nop14846 = alloca i1, i1 0
+  %nop14847 = alloca i1, i1 0
+  %nop14848 = alloca i1, i1 0
+  %nop14849 = alloca i1, i1 0
+  %nop14850 = alloca i1, i1 0
+  %nop14851 = alloca i1, i1 0
+  %nop14852 = alloca i1, i1 0
+  %nop14853 = alloca i1, i1 0
+  %nop14854 = alloca i1, i1 0
+  %nop14855 = alloca i1, i1 0
+  %nop14856 = alloca i1, i1 0
+  %nop14857 = alloca i1, i1 0
+  %nop14858 = alloca i1, i1 0
+  %nop14859 = alloca i1, i1 0
+  %nop14860 = alloca i1, i1 0
+  %nop14861 = alloca i1, i1 0
+  %nop14862 = alloca i1, i1 0
+  %nop14863 = alloca i1, i1 0
+  %nop14864 = alloca i1, i1 0
+  %nop14865 = alloca i1, i1 0
+  %nop14866 = alloca i1, i1 0
+  %nop14867 = alloca i1, i1 0
+  %nop14868 = alloca i1, i1 0
+  %nop14869 = alloca i1, i1 0
+  %nop14870 = alloca i1, i1 0
+  %nop14871 = alloca i1, i1 0
+  %nop14872 = alloca i1, i1 0
+  %nop14873 = alloca i1, i1 0
+  %nop14874 = alloca i1, i1 0
+  %nop14875 = alloca i1, i1 0
+  %nop14876 = alloca i1, i1 0
+  %nop14877 = alloca i1, i1 0
+  %nop14878 = alloca i1, i1 0
+  %nop14879 = alloca i1, i1 0
+  %nop14880 = alloca i1, i1 0
+  %nop14881 = alloca i1, i1 0
+  %nop14882 = alloca i1, i1 0
+  %nop14883 = alloca i1, i1 0
+  %nop14884 = alloca i1, i1 0
+  %nop14885 = alloca i1, i1 0
+  %nop14886 = alloca i1, i1 0
+  %nop14887 = alloca i1, i1 0
+  %nop14888 = alloca i1, i1 0
+  %nop14889 = alloca i1, i1 0
+  %nop14890 = alloca i1, i1 0
+  %nop14891 = alloca i1, i1 0
+  %nop14892 = alloca i1, i1 0
+  %nop14893 = alloca i1, i1 0
+  %nop14894 = alloca i1, i1 0
+  %nop14895 = alloca i1, i1 0
+  %nop14896 = alloca i1, i1 0
+  %nop14897 = alloca i1, i1 0
+  %nop14898 = alloca i1, i1 0
+  %nop14899 = alloca i1, i1 0
+  %nop14900 = alloca i1, i1 0
+  %nop14901 = alloca i1, i1 0
+  %nop14902 = alloca i1, i1 0
+  %nop14903 = alloca i1, i1 0
+  %nop14904 = alloca i1, i1 0
+  %nop14905 = alloca i1, i1 0
+  %nop14906 = alloca i1, i1 0
+  %nop14907 = alloca i1, i1 0
+  %nop14908 = alloca i1, i1 0
+  %nop14909 = alloca i1, i1 0
+  %nop14910 = alloca i1, i1 0
+  %nop14911 = alloca i1, i1 0
+  %nop14912 = alloca i1, i1 0
+  %nop14913 = alloca i1, i1 0
+  %nop14914 = alloca i1, i1 0
+  %nop14915 = alloca i1, i1 0
+  %nop14916 = alloca i1, i1 0
+  %nop14917 = alloca i1, i1 0
+  %nop14918 = alloca i1, i1 0
+  %nop14919 = alloca i1, i1 0
+  %nop14920 = alloca i1, i1 0
+  %nop14921 = alloca i1, i1 0
+  %nop14922 = alloca i1, i1 0
+  %nop14923 = alloca i1, i1 0
+  %nop14924 = alloca i1, i1 0
+  %nop14925 = alloca i1, i1 0
+  %nop14926 = alloca i1, i1 0
+  %nop14927 = alloca i1, i1 0
+  %nop14928 = alloca i1, i1 0
+  %nop14929 = alloca i1, i1 0
+  %nop14930 = alloca i1, i1 0
+  %nop14931 = alloca i1, i1 0
+  %nop14932 = alloca i1, i1 0
+  %nop14933 = alloca i1, i1 0
+  %nop14934 = alloca i1, i1 0
+  %nop14935 = alloca i1, i1 0
+  %nop14936 = alloca i1, i1 0
+  %nop14937 = alloca i1, i1 0
+  %nop14938 = alloca i1, i1 0
+  %nop14939 = alloca i1, i1 0
+  %nop14940 = alloca i1, i1 0
+  %nop14941 = alloca i1, i1 0
+  %nop14942 = alloca i1, i1 0
+  %nop14943 = alloca i1, i1 0
+  %nop14944 = alloca i1, i1 0
+  %nop14945 = alloca i1, i1 0
+  %nop14946 = alloca i1, i1 0
+  %nop14947 = alloca i1, i1 0
+  %nop14948 = alloca i1, i1 0
+  %nop14949 = alloca i1, i1 0
+  %nop14950 = alloca i1, i1 0
+  %nop14951 = alloca i1, i1 0
+  %nop14952 = alloca i1, i1 0
+  %nop14953 = alloca i1, i1 0
+  %nop14954 = alloca i1, i1 0
+  %nop14955 = alloca i1, i1 0
+  %nop14956 = alloca i1, i1 0
+  %nop14957 = alloca i1, i1 0
+  %nop14958 = alloca i1, i1 0
+  %nop14959 = alloca i1, i1 0
+  %nop14960 = alloca i1, i1 0
+  %nop14961 = alloca i1, i1 0
+  %nop14962 = alloca i1, i1 0
+  %nop14963 = alloca i1, i1 0
+  %nop14964 = alloca i1, i1 0
+  %nop14965 = alloca i1, i1 0
+  %nop14966 = alloca i1, i1 0
+  %nop14967 = alloca i1, i1 0
+  %nop14968 = alloca i1, i1 0
+  %nop14969 = alloca i1, i1 0
+  %nop14970 = alloca i1, i1 0
+  %nop14971 = alloca i1, i1 0
+  %nop14972 = alloca i1, i1 0
+  %nop14973 = alloca i1, i1 0
+  %nop14974 = alloca i1, i1 0
+  %nop14975 = alloca i1, i1 0
+  %nop14976 = alloca i1, i1 0
+  %nop14977 = alloca i1, i1 0
+  %nop14978 = alloca i1, i1 0
+  %nop14979 = alloca i1, i1 0
+  %nop14980 = alloca i1, i1 0
+  %nop14981 = alloca i1, i1 0
+  %nop14982 = alloca i1, i1 0
+  %nop14983 = alloca i1, i1 0
+  %nop14984 = alloca i1, i1 0
+  %nop14985 = alloca i1, i1 0
+  %nop14986 = alloca i1, i1 0
+  %nop14987 = alloca i1, i1 0
+  %nop14988 = alloca i1, i1 0
+  %nop14989 = alloca i1, i1 0
+  %nop14990 = alloca i1, i1 0
+  %nop14991 = alloca i1, i1 0
+  %nop14992 = alloca i1, i1 0
+  %nop14993 = alloca i1, i1 0
+  %nop14994 = alloca i1, i1 0
+  %nop14995 = alloca i1, i1 0
+  %nop14996 = alloca i1, i1 0
+  %nop14997 = alloca i1, i1 0
+  %nop14998 = alloca i1, i1 0
+  %nop14999 = alloca i1, i1 0
+  %nop15000 = alloca i1, i1 0
+  %nop15001 = alloca i1, i1 0
+  %nop15002 = alloca i1, i1 0
+  %nop15003 = alloca i1, i1 0
+  %nop15004 = alloca i1, i1 0
+  %nop15005 = alloca i1, i1 0
+  %nop15006 = alloca i1, i1 0
+  %nop15007 = alloca i1, i1 0
+  %nop15008 = alloca i1, i1 0
+  %nop15009 = alloca i1, i1 0
+  %nop15010 = alloca i1, i1 0
+  %nop15011 = alloca i1, i1 0
+  %nop15012 = alloca i1, i1 0
+  %nop15013 = alloca i1, i1 0
+  %nop15014 = alloca i1, i1 0
+  %nop15015 = alloca i1, i1 0
+  %nop15016 = alloca i1, i1 0
+  %nop15017 = alloca i1, i1 0
+  %nop15018 = alloca i1, i1 0
+  %nop15019 = alloca i1, i1 0
+  %nop15020 = alloca i1, i1 0
+  %nop15021 = alloca i1, i1 0
+  %nop15022 = alloca i1, i1 0
+  %nop15023 = alloca i1, i1 0
+  %nop15024 = alloca i1, i1 0
+  %nop15025 = alloca i1, i1 0
+  %nop15026 = alloca i1, i1 0
+  %nop15027 = alloca i1, i1 0
+  %nop15028 = alloca i1, i1 0
+  %nop15029 = alloca i1, i1 0
+  %nop15030 = alloca i1, i1 0
+  %nop15031 = alloca i1, i1 0
+  %nop15032 = alloca i1, i1 0
+  %nop15033 = alloca i1, i1 0
+  %nop15034 = alloca i1, i1 0
+  %nop15035 = alloca i1, i1 0
+  %nop15036 = alloca i1, i1 0
+  %nop15037 = alloca i1, i1 0
+  %nop15038 = alloca i1, i1 0
+  %nop15039 = alloca i1, i1 0
+  %nop15040 = alloca i1, i1 0
+  %nop15041 = alloca i1, i1 0
+  %nop15042 = alloca i1, i1 0
+  %nop15043 = alloca i1, i1 0
+  %nop15044 = alloca i1, i1 0
+  %nop15045 = alloca i1, i1 0
+  %nop15046 = alloca i1, i1 0
+  %nop15047 = alloca i1, i1 0
+  %nop15048 = alloca i1, i1 0
+  %nop15049 = alloca i1, i1 0
+  %nop15050 = alloca i1, i1 0
+  %nop15051 = alloca i1, i1 0
+  %nop15052 = alloca i1, i1 0
+  %nop15053 = alloca i1, i1 0
+  %nop15054 = alloca i1, i1 0
+  %nop15055 = alloca i1, i1 0
+  %nop15056 = alloca i1, i1 0
+  %nop15057 = alloca i1, i1 0
+  %nop15058 = alloca i1, i1 0
+  %nop15059 = alloca i1, i1 0
+  %nop15060 = alloca i1, i1 0
+  %nop15061 = alloca i1, i1 0
+  %nop15062 = alloca i1, i1 0
+  %nop15063 = alloca i1, i1 0
+  %nop15064 = alloca i1, i1 0
+  %nop15065 = alloca i1, i1 0
+  %nop15066 = alloca i1, i1 0
+  %nop15067 = alloca i1, i1 0
+  %nop15068 = alloca i1, i1 0
+  %nop15069 = alloca i1, i1 0
+  %nop15070 = alloca i1, i1 0
+  %nop15071 = alloca i1, i1 0
+  %nop15072 = alloca i1, i1 0
+  %nop15073 = alloca i1, i1 0
+  %nop15074 = alloca i1, i1 0
+  %nop15075 = alloca i1, i1 0
+  %nop15076 = alloca i1, i1 0
+  %nop15077 = alloca i1, i1 0
+  %nop15078 = alloca i1, i1 0
+  %nop15079 = alloca i1, i1 0
+  %nop15080 = alloca i1, i1 0
+  %nop15081 = alloca i1, i1 0
+  %nop15082 = alloca i1, i1 0
+  %nop15083 = alloca i1, i1 0
+  %nop15084 = alloca i1, i1 0
+  %nop15085 = alloca i1, i1 0
+  %nop15086 = alloca i1, i1 0
+  %nop15087 = alloca i1, i1 0
+  %nop15088 = alloca i1, i1 0
+  %nop15089 = alloca i1, i1 0
+  %nop15090 = alloca i1, i1 0
+  %nop15091 = alloca i1, i1 0
+  %nop15092 = alloca i1, i1 0
+  %nop15093 = alloca i1, i1 0
+  %nop15094 = alloca i1, i1 0
+  %nop15095 = alloca i1, i1 0
+  %nop15096 = alloca i1, i1 0
+  %nop15097 = alloca i1, i1 0
+  %nop15098 = alloca i1, i1 0
+  %nop15099 = alloca i1, i1 0
+  %nop15100 = alloca i1, i1 0
+  %nop15101 = alloca i1, i1 0
+  %nop15102 = alloca i1, i1 0
+  %nop15103 = alloca i1, i1 0
+  %nop15104 = alloca i1, i1 0
+  %nop15105 = alloca i1, i1 0
+  %nop15106 = alloca i1, i1 0
+  %nop15107 = alloca i1, i1 0
+  %nop15108 = alloca i1, i1 0
+  %nop15109 = alloca i1, i1 0
+  %nop15110 = alloca i1, i1 0
+  %nop15111 = alloca i1, i1 0
+  %nop15112 = alloca i1, i1 0
+  %nop15113 = alloca i1, i1 0
+  %nop15114 = alloca i1, i1 0
+  %nop15115 = alloca i1, i1 0
+  %nop15116 = alloca i1, i1 0
+  %nop15117 = alloca i1, i1 0
+  %nop15118 = alloca i1, i1 0
+  %nop15119 = alloca i1, i1 0
+  %nop15120 = alloca i1, i1 0
+  %nop15121 = alloca i1, i1 0
+  %nop15122 = alloca i1, i1 0
+  %nop15123 = alloca i1, i1 0
+  %nop15124 = alloca i1, i1 0
+  %nop15125 = alloca i1, i1 0
+  %nop15126 = alloca i1, i1 0
+  %nop15127 = alloca i1, i1 0
+  %nop15128 = alloca i1, i1 0
+  %nop15129 = alloca i1, i1 0
+  %nop15130 = alloca i1, i1 0
+  %nop15131 = alloca i1, i1 0
+  %nop15132 = alloca i1, i1 0
+  %nop15133 = alloca i1, i1 0
+  %nop15134 = alloca i1, i1 0
+  %nop15135 = alloca i1, i1 0
+  %nop15136 = alloca i1, i1 0
+  %nop15137 = alloca i1, i1 0
+  %nop15138 = alloca i1, i1 0
+  %nop15139 = alloca i1, i1 0
+  %nop15140 = alloca i1, i1 0
+  %nop15141 = alloca i1, i1 0
+  %nop15142 = alloca i1, i1 0
+  %nop15143 = alloca i1, i1 0
+  %nop15144 = alloca i1, i1 0
+  %nop15145 = alloca i1, i1 0
+  %nop15146 = alloca i1, i1 0
+  %nop15147 = alloca i1, i1 0
+  %nop15148 = alloca i1, i1 0
+  %nop15149 = alloca i1, i1 0
+  %nop15150 = alloca i1, i1 0
+  %nop15151 = alloca i1, i1 0
+  %nop15152 = alloca i1, i1 0
+  %nop15153 = alloca i1, i1 0
+  %nop15154 = alloca i1, i1 0
+  %nop15155 = alloca i1, i1 0
+  %nop15156 = alloca i1, i1 0
+  %nop15157 = alloca i1, i1 0
+  %nop15158 = alloca i1, i1 0
+  %nop15159 = alloca i1, i1 0
+  %nop15160 = alloca i1, i1 0
+  %nop15161 = alloca i1, i1 0
+  %nop15162 = alloca i1, i1 0
+  %nop15163 = alloca i1, i1 0
+  %nop15164 = alloca i1, i1 0
+  %nop15165 = alloca i1, i1 0
+  %nop15166 = alloca i1, i1 0
+  %nop15167 = alloca i1, i1 0
+  %nop15168 = alloca i1, i1 0
+  %nop15169 = alloca i1, i1 0
+  %nop15170 = alloca i1, i1 0
+  %nop15171 = alloca i1, i1 0
+  %nop15172 = alloca i1, i1 0
+  %nop15173 = alloca i1, i1 0
+  %nop15174 = alloca i1, i1 0
+  %nop15175 = alloca i1, i1 0
+  %nop15176 = alloca i1, i1 0
+  %nop15177 = alloca i1, i1 0
+  %nop15178 = alloca i1, i1 0
+  %nop15179 = alloca i1, i1 0
+  %nop15180 = alloca i1, i1 0
+  %nop15181 = alloca i1, i1 0
+  %nop15182 = alloca i1, i1 0
+  %nop15183 = alloca i1, i1 0
+  %nop15184 = alloca i1, i1 0
+  %nop15185 = alloca i1, i1 0
+  %nop15186 = alloca i1, i1 0
+  %nop15187 = alloca i1, i1 0
+  %nop15188 = alloca i1, i1 0
+  %nop15189 = alloca i1, i1 0
+  %nop15190 = alloca i1, i1 0
+  %nop15191 = alloca i1, i1 0
+  %nop15192 = alloca i1, i1 0
+  %nop15193 = alloca i1, i1 0
+  %nop15194 = alloca i1, i1 0
+  %nop15195 = alloca i1, i1 0
+  %nop15196 = alloca i1, i1 0
+  %nop15197 = alloca i1, i1 0
+  %nop15198 = alloca i1, i1 0
+  %nop15199 = alloca i1, i1 0
+  %nop15200 = alloca i1, i1 0
+  %nop15201 = alloca i1, i1 0
+  %nop15202 = alloca i1, i1 0
+  %nop15203 = alloca i1, i1 0
+  %nop15204 = alloca i1, i1 0
+  %nop15205 = alloca i1, i1 0
+  %nop15206 = alloca i1, i1 0
+  %nop15207 = alloca i1, i1 0
+  %nop15208 = alloca i1, i1 0
+  %nop15209 = alloca i1, i1 0
+  %nop15210 = alloca i1, i1 0
+  %nop15211 = alloca i1, i1 0
+  %nop15212 = alloca i1, i1 0
+  %nop15213 = alloca i1, i1 0
+  %nop15214 = alloca i1, i1 0
+  %nop15215 = alloca i1, i1 0
+  %nop15216 = alloca i1, i1 0
+  %nop15217 = alloca i1, i1 0
+  %nop15218 = alloca i1, i1 0
+  %nop15219 = alloca i1, i1 0
+  %nop15220 = alloca i1, i1 0
+  %nop15221 = alloca i1, i1 0
+  %nop15222 = alloca i1, i1 0
+  %nop15223 = alloca i1, i1 0
+  %nop15224 = alloca i1, i1 0
+  %nop15225 = alloca i1, i1 0
+  %nop15226 = alloca i1, i1 0
+  %nop15227 = alloca i1, i1 0
+  %nop15228 = alloca i1, i1 0
+  %nop15229 = alloca i1, i1 0
+  %nop15230 = alloca i1, i1 0
+  %nop15231 = alloca i1, i1 0
+  %nop15232 = alloca i1, i1 0
+  %nop15233 = alloca i1, i1 0
+  %nop15234 = alloca i1, i1 0
+  %nop15235 = alloca i1, i1 0
+  %nop15236 = alloca i1, i1 0
+  %nop15237 = alloca i1, i1 0
+  %nop15238 = alloca i1, i1 0
+  %nop15239 = alloca i1, i1 0
+  %nop15240 = alloca i1, i1 0
+  %nop15241 = alloca i1, i1 0
+  %nop15242 = alloca i1, i1 0
+  %nop15243 = alloca i1, i1 0
+  %nop15244 = alloca i1, i1 0
+  %nop15245 = alloca i1, i1 0
+  %nop15246 = alloca i1, i1 0
+  %nop15247 = alloca i1, i1 0
+  %nop15248 = alloca i1, i1 0
+  %nop15249 = alloca i1, i1 0
+  %nop15250 = alloca i1, i1 0
+  %nop15251 = alloca i1, i1 0
+  %nop15252 = alloca i1, i1 0
+  %nop15253 = alloca i1, i1 0
+  %nop15254 = alloca i1, i1 0
+  %nop15255 = alloca i1, i1 0
+  %nop15256 = alloca i1, i1 0
+  %nop15257 = alloca i1, i1 0
+  %nop15258 = alloca i1, i1 0
+  %nop15259 = alloca i1, i1 0
+  %nop15260 = alloca i1, i1 0
+  %nop15261 = alloca i1, i1 0
+  %nop15262 = alloca i1, i1 0
+  %nop15263 = alloca i1, i1 0
+  %nop15264 = alloca i1, i1 0
+  %nop15265 = alloca i1, i1 0
+  %nop15266 = alloca i1, i1 0
+  %nop15267 = alloca i1, i1 0
+  %nop15268 = alloca i1, i1 0
+  %nop15269 = alloca i1, i1 0
+  %nop15270 = alloca i1, i1 0
+  %nop15271 = alloca i1, i1 0
+  %nop15272 = alloca i1, i1 0
+  %nop15273 = alloca i1, i1 0
+  %nop15274 = alloca i1, i1 0
+  %nop15275 = alloca i1, i1 0
+  %nop15276 = alloca i1, i1 0
+  %nop15277 = alloca i1, i1 0
+  %nop15278 = alloca i1, i1 0
+  %nop15279 = alloca i1, i1 0
+  %nop15280 = alloca i1, i1 0
+  %nop15281 = alloca i1, i1 0
+  %nop15282 = alloca i1, i1 0
+  %nop15283 = alloca i1, i1 0
+  %nop15284 = alloca i1, i1 0
+  %nop15285 = alloca i1, i1 0
+  %nop15286 = alloca i1, i1 0
+  %nop15287 = alloca i1, i1 0
+  %nop15288 = alloca i1, i1 0
+  %nop15289 = alloca i1, i1 0
+  %nop15290 = alloca i1, i1 0
+  %nop15291 = alloca i1, i1 0
+  %nop15292 = alloca i1, i1 0
+  %nop15293 = alloca i1, i1 0
+  %nop15294 = alloca i1, i1 0
+  %nop15295 = alloca i1, i1 0
+  %nop15296 = alloca i1, i1 0
+  %nop15297 = alloca i1, i1 0
+  %nop15298 = alloca i1, i1 0
+  %nop15299 = alloca i1, i1 0
+  %nop15300 = alloca i1, i1 0
+  %nop15301 = alloca i1, i1 0
+  %nop15302 = alloca i1, i1 0
+  %nop15303 = alloca i1, i1 0
+  %nop15304 = alloca i1, i1 0
+  %nop15305 = alloca i1, i1 0
+  %nop15306 = alloca i1, i1 0
+  %nop15307 = alloca i1, i1 0
+  %nop15308 = alloca i1, i1 0
+  %nop15309 = alloca i1, i1 0
+  %nop15310 = alloca i1, i1 0
+  %nop15311 = alloca i1, i1 0
+  %nop15312 = alloca i1, i1 0
+  %nop15313 = alloca i1, i1 0
+  %nop15314 = alloca i1, i1 0
+  %nop15315 = alloca i1, i1 0
+  %nop15316 = alloca i1, i1 0
+  %nop15317 = alloca i1, i1 0
+  %nop15318 = alloca i1, i1 0
+  %nop15319 = alloca i1, i1 0
+  %nop15320 = alloca i1, i1 0
+  %nop15321 = alloca i1, i1 0
+  %nop15322 = alloca i1, i1 0
+  %nop15323 = alloca i1, i1 0
+  %nop15324 = alloca i1, i1 0
+  %nop15325 = alloca i1, i1 0
+  %nop15326 = alloca i1, i1 0
+  %nop15327 = alloca i1, i1 0
+  %nop15328 = alloca i1, i1 0
+  %nop15329 = alloca i1, i1 0
+  %nop15330 = alloca i1, i1 0
+  %nop15331 = alloca i1, i1 0
+  %nop15332 = alloca i1, i1 0
+  %nop15333 = alloca i1, i1 0
+  %nop15334 = alloca i1, i1 0
+  %nop15335 = alloca i1, i1 0
+  %nop15336 = alloca i1, i1 0
+  %nop15337 = alloca i1, i1 0
+  %nop15338 = alloca i1, i1 0
+  %nop15339 = alloca i1, i1 0
+  %nop15340 = alloca i1, i1 0
+  %nop15341 = alloca i1, i1 0
+  %nop15342 = alloca i1, i1 0
+  %nop15343 = alloca i1, i1 0
+  %nop15344 = alloca i1, i1 0
+  %nop15345 = alloca i1, i1 0
+  %nop15346 = alloca i1, i1 0
+  %nop15347 = alloca i1, i1 0
+  %nop15348 = alloca i1, i1 0
+  %nop15349 = alloca i1, i1 0
+  %nop15350 = alloca i1, i1 0
+  %nop15351 = alloca i1, i1 0
+  %nop15352 = alloca i1, i1 0
+  %nop15353 = alloca i1, i1 0
+  %nop15354 = alloca i1, i1 0
+  %nop15355 = alloca i1, i1 0
+  %nop15356 = alloca i1, i1 0
+  %nop15357 = alloca i1, i1 0
+  %nop15358 = alloca i1, i1 0
+  %nop15359 = alloca i1, i1 0
+  %nop15360 = alloca i1, i1 0
+  %nop15361 = alloca i1, i1 0
+  %nop15362 = alloca i1, i1 0
+  %nop15363 = alloca i1, i1 0
+  %nop15364 = alloca i1, i1 0
+  %nop15365 = alloca i1, i1 0
+  %nop15366 = alloca i1, i1 0
+  %nop15367 = alloca i1, i1 0
+  %nop15368 = alloca i1, i1 0
+  %nop15369 = alloca i1, i1 0
+  %nop15370 = alloca i1, i1 0
+  %nop15371 = alloca i1, i1 0
+  %nop15372 = alloca i1, i1 0
+  %nop15373 = alloca i1, i1 0
+  %nop15374 = alloca i1, i1 0
+  %nop15375 = alloca i1, i1 0
+  %nop15376 = alloca i1, i1 0
+  %nop15377 = alloca i1, i1 0
+  %nop15378 = alloca i1, i1 0
+  %nop15379 = alloca i1, i1 0
+  %nop15380 = alloca i1, i1 0
+  %nop15381 = alloca i1, i1 0
+  %nop15382 = alloca i1, i1 0
+  %nop15383 = alloca i1, i1 0
+  %nop15384 = alloca i1, i1 0
+  %nop15385 = alloca i1, i1 0
+  %nop15386 = alloca i1, i1 0
+  %nop15387 = alloca i1, i1 0
+  %nop15388 = alloca i1, i1 0
+  %nop15389 = alloca i1, i1 0
+  %nop15390 = alloca i1, i1 0
+  %nop15391 = alloca i1, i1 0
+  %nop15392 = alloca i1, i1 0
+  %nop15393 = alloca i1, i1 0
+  %nop15394 = alloca i1, i1 0
+  %nop15395 = alloca i1, i1 0
+  %nop15396 = alloca i1, i1 0
+  %nop15397 = alloca i1, i1 0
+  %nop15398 = alloca i1, i1 0
+  %nop15399 = alloca i1, i1 0
+  %nop15400 = alloca i1, i1 0
+  %nop15401 = alloca i1, i1 0
+  %nop15402 = alloca i1, i1 0
+  %nop15403 = alloca i1, i1 0
+  %nop15404 = alloca i1, i1 0
+  %nop15405 = alloca i1, i1 0
+  %nop15406 = alloca i1, i1 0
+  %nop15407 = alloca i1, i1 0
+  %nop15408 = alloca i1, i1 0
+  %nop15409 = alloca i1, i1 0
+  %nop15410 = alloca i1, i1 0
+  %nop15411 = alloca i1, i1 0
+  %nop15412 = alloca i1, i1 0
+  %nop15413 = alloca i1, i1 0
+  %nop15414 = alloca i1, i1 0
+  %nop15415 = alloca i1, i1 0
+  %nop15416 = alloca i1, i1 0
+  %nop15417 = alloca i1, i1 0
+  %nop15418 = alloca i1, i1 0
+  %nop15419 = alloca i1, i1 0
+  %nop15420 = alloca i1, i1 0
+  %nop15421 = alloca i1, i1 0
+  %nop15422 = alloca i1, i1 0
+  %nop15423 = alloca i1, i1 0
+  %nop15424 = alloca i1, i1 0
+  %nop15425 = alloca i1, i1 0
+  %nop15426 = alloca i1, i1 0
+  %nop15427 = alloca i1, i1 0
+  %nop15428 = alloca i1, i1 0
+  %nop15429 = alloca i1, i1 0
+  %nop15430 = alloca i1, i1 0
+  %nop15431 = alloca i1, i1 0
+  %nop15432 = alloca i1, i1 0
+  %nop15433 = alloca i1, i1 0
+  %nop15434 = alloca i1, i1 0
+  %nop15435 = alloca i1, i1 0
+  %nop15436 = alloca i1, i1 0
+  %nop15437 = alloca i1, i1 0
+  %nop15438 = alloca i1, i1 0
+  %nop15439 = alloca i1, i1 0
+  %nop15440 = alloca i1, i1 0
+  %nop15441 = alloca i1, i1 0
+  %nop15442 = alloca i1, i1 0
+  %nop15443 = alloca i1, i1 0
+  %nop15444 = alloca i1, i1 0
+  %nop15445 = alloca i1, i1 0
+  %nop15446 = alloca i1, i1 0
+  %nop15447 = alloca i1, i1 0
+  %nop15448 = alloca i1, i1 0
+  %nop15449 = alloca i1, i1 0
+  %nop15450 = alloca i1, i1 0
+  %nop15451 = alloca i1, i1 0
+  %nop15452 = alloca i1, i1 0
+  %nop15453 = alloca i1, i1 0
+  %nop15454 = alloca i1, i1 0
+  %nop15455 = alloca i1, i1 0
+  %nop15456 = alloca i1, i1 0
+  %nop15457 = alloca i1, i1 0
+  %nop15458 = alloca i1, i1 0
+  %nop15459 = alloca i1, i1 0
+  %nop15460 = alloca i1, i1 0
+  %nop15461 = alloca i1, i1 0
+  %nop15462 = alloca i1, i1 0
+  %nop15463 = alloca i1, i1 0
+  %nop15464 = alloca i1, i1 0
+  %nop15465 = alloca i1, i1 0
+  %nop15466 = alloca i1, i1 0
+  %nop15467 = alloca i1, i1 0
+  %nop15468 = alloca i1, i1 0
+  %nop15469 = alloca i1, i1 0
+  %nop15470 = alloca i1, i1 0
+  %nop15471 = alloca i1, i1 0
+  %nop15472 = alloca i1, i1 0
+  %nop15473 = alloca i1, i1 0
+  %nop15474 = alloca i1, i1 0
+  %nop15475 = alloca i1, i1 0
+  %nop15476 = alloca i1, i1 0
+  %nop15477 = alloca i1, i1 0
+  %nop15478 = alloca i1, i1 0
+  %nop15479 = alloca i1, i1 0
+  %nop15480 = alloca i1, i1 0
+  %nop15481 = alloca i1, i1 0
+  %nop15482 = alloca i1, i1 0
+  %nop15483 = alloca i1, i1 0
+  %nop15484 = alloca i1, i1 0
+  %nop15485 = alloca i1, i1 0
+  %nop15486 = alloca i1, i1 0
+  %nop15487 = alloca i1, i1 0
+  %nop15488 = alloca i1, i1 0
+  %nop15489 = alloca i1, i1 0
+  %nop15490 = alloca i1, i1 0
+  %nop15491 = alloca i1, i1 0
+  %nop15492 = alloca i1, i1 0
+  %nop15493 = alloca i1, i1 0
+  %nop15494 = alloca i1, i1 0
+  %nop15495 = alloca i1, i1 0
+  %nop15496 = alloca i1, i1 0
+  %nop15497 = alloca i1, i1 0
+  %nop15498 = alloca i1, i1 0
+  %nop15499 = alloca i1, i1 0
+  %nop15500 = alloca i1, i1 0
+  %nop15501 = alloca i1, i1 0
+  %nop15502 = alloca i1, i1 0
+  %nop15503 = alloca i1, i1 0
+  %nop15504 = alloca i1, i1 0
+  %nop15505 = alloca i1, i1 0
+  %nop15506 = alloca i1, i1 0
+  %nop15507 = alloca i1, i1 0
+  %nop15508 = alloca i1, i1 0
+  %nop15509 = alloca i1, i1 0
+  %nop15510 = alloca i1, i1 0
+  %nop15511 = alloca i1, i1 0
+  %nop15512 = alloca i1, i1 0
+  %nop15513 = alloca i1, i1 0
+  %nop15514 = alloca i1, i1 0
+  %nop15515 = alloca i1, i1 0
+  %nop15516 = alloca i1, i1 0
+  %nop15517 = alloca i1, i1 0
+  %nop15518 = alloca i1, i1 0
+  %nop15519 = alloca i1, i1 0
+  %nop15520 = alloca i1, i1 0
+  %nop15521 = alloca i1, i1 0
+  %nop15522 = alloca i1, i1 0
+  %nop15523 = alloca i1, i1 0
+  %nop15524 = alloca i1, i1 0
+  %nop15525 = alloca i1, i1 0
+  %nop15526 = alloca i1, i1 0
+  %nop15527 = alloca i1, i1 0
+  %nop15528 = alloca i1, i1 0
+  %nop15529 = alloca i1, i1 0
+  %nop15530 = alloca i1, i1 0
+  %nop15531 = alloca i1, i1 0
+  %nop15532 = alloca i1, i1 0
+  %nop15533 = alloca i1, i1 0
+  %nop15534 = alloca i1, i1 0
+  %nop15535 = alloca i1, i1 0
+  %nop15536 = alloca i1, i1 0
+  %nop15537 = alloca i1, i1 0
+  %nop15538 = alloca i1, i1 0
+  %nop15539 = alloca i1, i1 0
+  %nop15540 = alloca i1, i1 0
+  %nop15541 = alloca i1, i1 0
+  %nop15542 = alloca i1, i1 0
+  %nop15543 = alloca i1, i1 0
+  %nop15544 = alloca i1, i1 0
+  %nop15545 = alloca i1, i1 0
+  %nop15546 = alloca i1, i1 0
+  %nop15547 = alloca i1, i1 0
+  %nop15548 = alloca i1, i1 0
+  %nop15549 = alloca i1, i1 0
+  %nop15550 = alloca i1, i1 0
+  %nop15551 = alloca i1, i1 0
+  %nop15552 = alloca i1, i1 0
+  %nop15553 = alloca i1, i1 0
+  %nop15554 = alloca i1, i1 0
+  %nop15555 = alloca i1, i1 0
+  %nop15556 = alloca i1, i1 0
+  %nop15557 = alloca i1, i1 0
+  %nop15558 = alloca i1, i1 0
+  %nop15559 = alloca i1, i1 0
+  %nop15560 = alloca i1, i1 0
+  %nop15561 = alloca i1, i1 0
+  %nop15562 = alloca i1, i1 0
+  %nop15563 = alloca i1, i1 0
+  %nop15564 = alloca i1, i1 0
+  %nop15565 = alloca i1, i1 0
+  %nop15566 = alloca i1, i1 0
+  %nop15567 = alloca i1, i1 0
+  %nop15568 = alloca i1, i1 0
+  %nop15569 = alloca i1, i1 0
+  %nop15570 = alloca i1, i1 0
+  %nop15571 = alloca i1, i1 0
+  %nop15572 = alloca i1, i1 0
+  %nop15573 = alloca i1, i1 0
+  %nop15574 = alloca i1, i1 0
+  %nop15575 = alloca i1, i1 0
+  %nop15576 = alloca i1, i1 0
+  %nop15577 = alloca i1, i1 0
+  %nop15578 = alloca i1, i1 0
+  %nop15579 = alloca i1, i1 0
+  %nop15580 = alloca i1, i1 0
+  %nop15581 = alloca i1, i1 0
+  %nop15582 = alloca i1, i1 0
+  %nop15583 = alloca i1, i1 0
+  %nop15584 = alloca i1, i1 0
+  %nop15585 = alloca i1, i1 0
+  %nop15586 = alloca i1, i1 0
+  %nop15587 = alloca i1, i1 0
+  %nop15588 = alloca i1, i1 0
+  %nop15589 = alloca i1, i1 0
+  %nop15590 = alloca i1, i1 0
+  %nop15591 = alloca i1, i1 0
+  %nop15592 = alloca i1, i1 0
+  %nop15593 = alloca i1, i1 0
+  %nop15594 = alloca i1, i1 0
+  %nop15595 = alloca i1, i1 0
+  %nop15596 = alloca i1, i1 0
+  %nop15597 = alloca i1, i1 0
+  %nop15598 = alloca i1, i1 0
+  %nop15599 = alloca i1, i1 0
+  %nop15600 = alloca i1, i1 0
+  %nop15601 = alloca i1, i1 0
+  %nop15602 = alloca i1, i1 0
+  %nop15603 = alloca i1, i1 0
+  %nop15604 = alloca i1, i1 0
+  %nop15605 = alloca i1, i1 0
+  %nop15606 = alloca i1, i1 0
+  %nop15607 = alloca i1, i1 0
+  %nop15608 = alloca i1, i1 0
+  %nop15609 = alloca i1, i1 0
+  %nop15610 = alloca i1, i1 0
+  %nop15611 = alloca i1, i1 0
+  %nop15612 = alloca i1, i1 0
+  %nop15613 = alloca i1, i1 0
+  %nop15614 = alloca i1, i1 0
+  %nop15615 = alloca i1, i1 0
+  %nop15616 = alloca i1, i1 0
+  %nop15617 = alloca i1, i1 0
+  %nop15618 = alloca i1, i1 0
+  %nop15619 = alloca i1, i1 0
+  %nop15620 = alloca i1, i1 0
+  %nop15621 = alloca i1, i1 0
+  %nop15622 = alloca i1, i1 0
+  %nop15623 = alloca i1, i1 0
+  %nop15624 = alloca i1, i1 0
+  %nop15625 = alloca i1, i1 0
+  %nop15626 = alloca i1, i1 0
+  %nop15627 = alloca i1, i1 0
+  %nop15628 = alloca i1, i1 0
+  %nop15629 = alloca i1, i1 0
+  %nop15630 = alloca i1, i1 0
+  %nop15631 = alloca i1, i1 0
+  %nop15632 = alloca i1, i1 0
+  %nop15633 = alloca i1, i1 0
+  %nop15634 = alloca i1, i1 0
+  %nop15635 = alloca i1, i1 0
+  %nop15636 = alloca i1, i1 0
+  %nop15637 = alloca i1, i1 0
+  %nop15638 = alloca i1, i1 0
+  %nop15639 = alloca i1, i1 0
+  %nop15640 = alloca i1, i1 0
+  %nop15641 = alloca i1, i1 0
+  %nop15642 = alloca i1, i1 0
+  %nop15643 = alloca i1, i1 0
+  %nop15644 = alloca i1, i1 0
+  %nop15645 = alloca i1, i1 0
+  %nop15646 = alloca i1, i1 0
+  %nop15647 = alloca i1, i1 0
+  %nop15648 = alloca i1, i1 0
+  %nop15649 = alloca i1, i1 0
+  %nop15650 = alloca i1, i1 0
+  %nop15651 = alloca i1, i1 0
+  %nop15652 = alloca i1, i1 0
+  %nop15653 = alloca i1, i1 0
+  %nop15654 = alloca i1, i1 0
+  %nop15655 = alloca i1, i1 0
+  %nop15656 = alloca i1, i1 0
+  %nop15657 = alloca i1, i1 0
+  %nop15658 = alloca i1, i1 0
+  %nop15659 = alloca i1, i1 0
+  %nop15660 = alloca i1, i1 0
+  %nop15661 = alloca i1, i1 0
+  %nop15662 = alloca i1, i1 0
+  %nop15663 = alloca i1, i1 0
+  %nop15664 = alloca i1, i1 0
+  %nop15665 = alloca i1, i1 0
+  %nop15666 = alloca i1, i1 0
+  %nop15667 = alloca i1, i1 0
+  %nop15668 = alloca i1, i1 0
+  %nop15669 = alloca i1, i1 0
+  %nop15670 = alloca i1, i1 0
+  %nop15671 = alloca i1, i1 0
+  %nop15672 = alloca i1, i1 0
+  %nop15673 = alloca i1, i1 0
+  %nop15674 = alloca i1, i1 0
+  %nop15675 = alloca i1, i1 0
+  %nop15676 = alloca i1, i1 0
+  %nop15677 = alloca i1, i1 0
+  %nop15678 = alloca i1, i1 0
+  %nop15679 = alloca i1, i1 0
+  %nop15680 = alloca i1, i1 0
+  %nop15681 = alloca i1, i1 0
+  %nop15682 = alloca i1, i1 0
+  %nop15683 = alloca i1, i1 0
+  %nop15684 = alloca i1, i1 0
+  %nop15685 = alloca i1, i1 0
+  %nop15686 = alloca i1, i1 0
+  %nop15687 = alloca i1, i1 0
+  %nop15688 = alloca i1, i1 0
+  %nop15689 = alloca i1, i1 0
+  %nop15690 = alloca i1, i1 0
+  %nop15691 = alloca i1, i1 0
+  %nop15692 = alloca i1, i1 0
+  %nop15693 = alloca i1, i1 0
+  %nop15694 = alloca i1, i1 0
+  %nop15695 = alloca i1, i1 0
+  %nop15696 = alloca i1, i1 0
+  %nop15697 = alloca i1, i1 0
+  %nop15698 = alloca i1, i1 0
+  %nop15699 = alloca i1, i1 0
+  %nop15700 = alloca i1, i1 0
+  %nop15701 = alloca i1, i1 0
+  %nop15702 = alloca i1, i1 0
+  %nop15703 = alloca i1, i1 0
+  %nop15704 = alloca i1, i1 0
+  %nop15705 = alloca i1, i1 0
+  %nop15706 = alloca i1, i1 0
+  %nop15707 = alloca i1, i1 0
+  %nop15708 = alloca i1, i1 0
+  %nop15709 = alloca i1, i1 0
+  %nop15710 = alloca i1, i1 0
+  %nop15711 = alloca i1, i1 0
+  %nop15712 = alloca i1, i1 0
+  %nop15713 = alloca i1, i1 0
+  %nop15714 = alloca i1, i1 0
+  %nop15715 = alloca i1, i1 0
+  %nop15716 = alloca i1, i1 0
+  %nop15717 = alloca i1, i1 0
+  %nop15718 = alloca i1, i1 0
+  %nop15719 = alloca i1, i1 0
+  %nop15720 = alloca i1, i1 0
+  %nop15721 = alloca i1, i1 0
+  %nop15722 = alloca i1, i1 0
+  %nop15723 = alloca i1, i1 0
+  %nop15724 = alloca i1, i1 0
+  %nop15725 = alloca i1, i1 0
+  %nop15726 = alloca i1, i1 0
+  %nop15727 = alloca i1, i1 0
+  %nop15728 = alloca i1, i1 0
+  %nop15729 = alloca i1, i1 0
+  %nop15730 = alloca i1, i1 0
+  %nop15731 = alloca i1, i1 0
+  %nop15732 = alloca i1, i1 0
+  %nop15733 = alloca i1, i1 0
+  %nop15734 = alloca i1, i1 0
+  %nop15735 = alloca i1, i1 0
+  %nop15736 = alloca i1, i1 0
+  %nop15737 = alloca i1, i1 0
+  %nop15738 = alloca i1, i1 0
+  %nop15739 = alloca i1, i1 0
+  %nop15740 = alloca i1, i1 0
+  %nop15741 = alloca i1, i1 0
+  %nop15742 = alloca i1, i1 0
+  %nop15743 = alloca i1, i1 0
+  %nop15744 = alloca i1, i1 0
+  %nop15745 = alloca i1, i1 0
+  %nop15746 = alloca i1, i1 0
+  %nop15747 = alloca i1, i1 0
+  %nop15748 = alloca i1, i1 0
+  %nop15749 = alloca i1, i1 0
+  %nop15750 = alloca i1, i1 0
+  %nop15751 = alloca i1, i1 0
+  %nop15752 = alloca i1, i1 0
+  %nop15753 = alloca i1, i1 0
+  %nop15754 = alloca i1, i1 0
+  %nop15755 = alloca i1, i1 0
+  %nop15756 = alloca i1, i1 0
+  %nop15757 = alloca i1, i1 0
+  %nop15758 = alloca i1, i1 0
+  %nop15759 = alloca i1, i1 0
+  %nop15760 = alloca i1, i1 0
+  %nop15761 = alloca i1, i1 0
+  %nop15762 = alloca i1, i1 0
+  %nop15763 = alloca i1, i1 0
+  %nop15764 = alloca i1, i1 0
+  %nop15765 = alloca i1, i1 0
+  %nop15766 = alloca i1, i1 0
+  %nop15767 = alloca i1, i1 0
+  %nop15768 = alloca i1, i1 0
+  %nop15769 = alloca i1, i1 0
+  %nop15770 = alloca i1, i1 0
+  %nop15771 = alloca i1, i1 0
+  %nop15772 = alloca i1, i1 0
+  %nop15773 = alloca i1, i1 0
+  %nop15774 = alloca i1, i1 0
+  %nop15775 = alloca i1, i1 0
+  %nop15776 = alloca i1, i1 0
+  %nop15777 = alloca i1, i1 0
+  %nop15778 = alloca i1, i1 0
+  %nop15779 = alloca i1, i1 0
+  %nop15780 = alloca i1, i1 0
+  %nop15781 = alloca i1, i1 0
+  %nop15782 = alloca i1, i1 0
+  %nop15783 = alloca i1, i1 0
+  %nop15784 = alloca i1, i1 0
+  %nop15785 = alloca i1, i1 0
+  %nop15786 = alloca i1, i1 0
+  %nop15787 = alloca i1, i1 0
+  %nop15788 = alloca i1, i1 0
+  %nop15789 = alloca i1, i1 0
+  %nop15790 = alloca i1, i1 0
+  %nop15791 = alloca i1, i1 0
+  %nop15792 = alloca i1, i1 0
+  %nop15793 = alloca i1, i1 0
+  %nop15794 = alloca i1, i1 0
+  %nop15795 = alloca i1, i1 0
+  %nop15796 = alloca i1, i1 0
+  %nop15797 = alloca i1, i1 0
+  %nop15798 = alloca i1, i1 0
+  %nop15799 = alloca i1, i1 0
+  %nop15800 = alloca i1, i1 0
+  %nop15801 = alloca i1, i1 0
+  %nop15802 = alloca i1, i1 0
+  %nop15803 = alloca i1, i1 0
+  %nop15804 = alloca i1, i1 0
+  %nop15805 = alloca i1, i1 0
+  %nop15806 = alloca i1, i1 0
+  %nop15807 = alloca i1, i1 0
+  %nop15808 = alloca i1, i1 0
+  %nop15809 = alloca i1, i1 0
+  %nop15810 = alloca i1, i1 0
+  %nop15811 = alloca i1, i1 0
+  %nop15812 = alloca i1, i1 0
+  %nop15813 = alloca i1, i1 0
+  %nop15814 = alloca i1, i1 0
+  %nop15815 = alloca i1, i1 0
+  %nop15816 = alloca i1, i1 0
+  %nop15817 = alloca i1, i1 0
+  %nop15818 = alloca i1, i1 0
+  %nop15819 = alloca i1, i1 0
+  %nop15820 = alloca i1, i1 0
+  %nop15821 = alloca i1, i1 0
+  %nop15822 = alloca i1, i1 0
+  %nop15823 = alloca i1, i1 0
+  %nop15824 = alloca i1, i1 0
+  %nop15825 = alloca i1, i1 0
+  %nop15826 = alloca i1, i1 0
+  %nop15827 = alloca i1, i1 0
+  %nop15828 = alloca i1, i1 0
+  %nop15829 = alloca i1, i1 0
+  %nop15830 = alloca i1, i1 0
+  %nop15831 = alloca i1, i1 0
+  %nop15832 = alloca i1, i1 0
+  %nop15833 = alloca i1, i1 0
+  %nop15834 = alloca i1, i1 0
+  %nop15835 = alloca i1, i1 0
+  %nop15836 = alloca i1, i1 0
+  %nop15837 = alloca i1, i1 0
+  %nop15838 = alloca i1, i1 0
+  %nop15839 = alloca i1, i1 0
+  %nop15840 = alloca i1, i1 0
+  %nop15841 = alloca i1, i1 0
+  %nop15842 = alloca i1, i1 0
+  %nop15843 = alloca i1, i1 0
+  %nop15844 = alloca i1, i1 0
+  %nop15845 = alloca i1, i1 0
+  %nop15846 = alloca i1, i1 0
+  %nop15847 = alloca i1, i1 0
+  %nop15848 = alloca i1, i1 0
+  %nop15849 = alloca i1, i1 0
+  %nop15850 = alloca i1, i1 0
+  %nop15851 = alloca i1, i1 0
+  %nop15852 = alloca i1, i1 0
+  %nop15853 = alloca i1, i1 0
+  %nop15854 = alloca i1, i1 0
+  %nop15855 = alloca i1, i1 0
+  %nop15856 = alloca i1, i1 0
+  %nop15857 = alloca i1, i1 0
+  %nop15858 = alloca i1, i1 0
+  %nop15859 = alloca i1, i1 0
+  %nop15860 = alloca i1, i1 0
+  %nop15861 = alloca i1, i1 0
+  %nop15862 = alloca i1, i1 0
+  %nop15863 = alloca i1, i1 0
+  %nop15864 = alloca i1, i1 0
+  %nop15865 = alloca i1, i1 0
+  %nop15866 = alloca i1, i1 0
+  %nop15867 = alloca i1, i1 0
+  %nop15868 = alloca i1, i1 0
+  %nop15869 = alloca i1, i1 0
+  %nop15870 = alloca i1, i1 0
+  %nop15871 = alloca i1, i1 0
+  %nop15872 = alloca i1, i1 0
+  %nop15873 = alloca i1, i1 0
+  %nop15874 = alloca i1, i1 0
+  %nop15875 = alloca i1, i1 0
+  %nop15876 = alloca i1, i1 0
+  %nop15877 = alloca i1, i1 0
+  %nop15878 = alloca i1, i1 0
+  %nop15879 = alloca i1, i1 0
+  %nop15880 = alloca i1, i1 0
+  %nop15881 = alloca i1, i1 0
+  %nop15882 = alloca i1, i1 0
+  %nop15883 = alloca i1, i1 0
+  %nop15884 = alloca i1, i1 0
+  %nop15885 = alloca i1, i1 0
+  %nop15886 = alloca i1, i1 0
+  %nop15887 = alloca i1, i1 0
+  %nop15888 = alloca i1, i1 0
+  %nop15889 = alloca i1, i1 0
+  %nop15890 = alloca i1, i1 0
+  %nop15891 = alloca i1, i1 0
+  %nop15892 = alloca i1, i1 0
+  %nop15893 = alloca i1, i1 0
+  %nop15894 = alloca i1, i1 0
+  %nop15895 = alloca i1, i1 0
+  %nop15896 = alloca i1, i1 0
+  %nop15897 = alloca i1, i1 0
+  %nop15898 = alloca i1, i1 0
+  %nop15899 = alloca i1, i1 0
+  %nop15900 = alloca i1, i1 0
+  %nop15901 = alloca i1, i1 0
+  %nop15902 = alloca i1, i1 0
+  %nop15903 = alloca i1, i1 0
+  %nop15904 = alloca i1, i1 0
+  %nop15905 = alloca i1, i1 0
+  %nop15906 = alloca i1, i1 0
+  %nop15907 = alloca i1, i1 0
+  %nop15908 = alloca i1, i1 0
+  %nop15909 = alloca i1, i1 0
+  %nop15910 = alloca i1, i1 0
+  %nop15911 = alloca i1, i1 0
+  %nop15912 = alloca i1, i1 0
+  %nop15913 = alloca i1, i1 0
+  %nop15914 = alloca i1, i1 0
+  %nop15915 = alloca i1, i1 0
+  %nop15916 = alloca i1, i1 0
+  %nop15917 = alloca i1, i1 0
+  %nop15918 = alloca i1, i1 0
+  %nop15919 = alloca i1, i1 0
+  %nop15920 = alloca i1, i1 0
+  %nop15921 = alloca i1, i1 0
+  %nop15922 = alloca i1, i1 0
+  %nop15923 = alloca i1, i1 0
+  %nop15924 = alloca i1, i1 0
+  %nop15925 = alloca i1, i1 0
+  %nop15926 = alloca i1, i1 0
+  %nop15927 = alloca i1, i1 0
+  %nop15928 = alloca i1, i1 0
+  %nop15929 = alloca i1, i1 0
+  %nop15930 = alloca i1, i1 0
+  %nop15931 = alloca i1, i1 0
+  %nop15932 = alloca i1, i1 0
+  %nop15933 = alloca i1, i1 0
+  %nop15934 = alloca i1, i1 0
+  %nop15935 = alloca i1, i1 0
+  %nop15936 = alloca i1, i1 0
+  %nop15937 = alloca i1, i1 0
+  %nop15938 = alloca i1, i1 0
+  %nop15939 = alloca i1, i1 0
+  %nop15940 = alloca i1, i1 0
+  %nop15941 = alloca i1, i1 0
+  %nop15942 = alloca i1, i1 0
+  %nop15943 = alloca i1, i1 0
+  %nop15944 = alloca i1, i1 0
+  %nop15945 = alloca i1, i1 0
+  %nop15946 = alloca i1, i1 0
+  %nop15947 = alloca i1, i1 0
+  %nop15948 = alloca i1, i1 0
+  %nop15949 = alloca i1, i1 0
+  %nop15950 = alloca i1, i1 0
+  %nop15951 = alloca i1, i1 0
+  %nop15952 = alloca i1, i1 0
+  %nop15953 = alloca i1, i1 0
+  %nop15954 = alloca i1, i1 0
+  %nop15955 = alloca i1, i1 0
+  %nop15956 = alloca i1, i1 0
+  %nop15957 = alloca i1, i1 0
+  %nop15958 = alloca i1, i1 0
+  %nop15959 = alloca i1, i1 0
+  %nop15960 = alloca i1, i1 0
+  %nop15961 = alloca i1, i1 0
+  %nop15962 = alloca i1, i1 0
+  %nop15963 = alloca i1, i1 0
+  %nop15964 = alloca i1, i1 0
+  %nop15965 = alloca i1, i1 0
+  %nop15966 = alloca i1, i1 0
+  %nop15967 = alloca i1, i1 0
+  %nop15968 = alloca i1, i1 0
+  %nop15969 = alloca i1, i1 0
+  %nop15970 = alloca i1, i1 0
+  %nop15971 = alloca i1, i1 0
+  %nop15972 = alloca i1, i1 0
+  %nop15973 = alloca i1, i1 0
+  %nop15974 = alloca i1, i1 0
+  %nop15975 = alloca i1, i1 0
+  %nop15976 = alloca i1, i1 0
+  %nop15977 = alloca i1, i1 0
+  %nop15978 = alloca i1, i1 0
+  %nop15979 = alloca i1, i1 0
+  %nop15980 = alloca i1, i1 0
+  %nop15981 = alloca i1, i1 0
+  %nop15982 = alloca i1, i1 0
+  %nop15983 = alloca i1, i1 0
+  %nop15984 = alloca i1, i1 0
+  %nop15985 = alloca i1, i1 0
+  %nop15986 = alloca i1, i1 0
+  %nop15987 = alloca i1, i1 0
+  %nop15988 = alloca i1, i1 0
+  %nop15989 = alloca i1, i1 0
+  %nop15990 = alloca i1, i1 0
+  %nop15991 = alloca i1, i1 0
+  %nop15992 = alloca i1, i1 0
+  %nop15993 = alloca i1, i1 0
+  %nop15994 = alloca i1, i1 0
+  %nop15995 = alloca i1, i1 0
+  %nop15996 = alloca i1, i1 0
+  %nop15997 = alloca i1, i1 0
+  %nop15998 = alloca i1, i1 0
+  %nop15999 = alloca i1, i1 0
+  %nop16000 = alloca i1, i1 0
+  %nop16001 = alloca i1, i1 0
+  %nop16002 = alloca i1, i1 0
+  %nop16003 = alloca i1, i1 0
+  %nop16004 = alloca i1, i1 0
+  %nop16005 = alloca i1, i1 0
+  %nop16006 = alloca i1, i1 0
+  %nop16007 = alloca i1, i1 0
+  %nop16008 = alloca i1, i1 0
+  %nop16009 = alloca i1, i1 0
+  %nop16010 = alloca i1, i1 0
+  %nop16011 = alloca i1, i1 0
+  %nop16012 = alloca i1, i1 0
+  %nop16013 = alloca i1, i1 0
+  %nop16014 = alloca i1, i1 0
+  %nop16015 = alloca i1, i1 0
+  %nop16016 = alloca i1, i1 0
+  %nop16017 = alloca i1, i1 0
+  %nop16018 = alloca i1, i1 0
+  %nop16019 = alloca i1, i1 0
+  %nop16020 = alloca i1, i1 0
+  %nop16021 = alloca i1, i1 0
+  %nop16022 = alloca i1, i1 0
+  %nop16023 = alloca i1, i1 0
+  %nop16024 = alloca i1, i1 0
+  %nop16025 = alloca i1, i1 0
+  %nop16026 = alloca i1, i1 0
+  %nop16027 = alloca i1, i1 0
+  %nop16028 = alloca i1, i1 0
+  %nop16029 = alloca i1, i1 0
+  %nop16030 = alloca i1, i1 0
+  %nop16031 = alloca i1, i1 0
+  %nop16032 = alloca i1, i1 0
+  %nop16033 = alloca i1, i1 0
+  %nop16034 = alloca i1, i1 0
+  %nop16035 = alloca i1, i1 0
+  %nop16036 = alloca i1, i1 0
+  %nop16037 = alloca i1, i1 0
+  %nop16038 = alloca i1, i1 0
+  %nop16039 = alloca i1, i1 0
+  %nop16040 = alloca i1, i1 0
+  %nop16041 = alloca i1, i1 0
+  %nop16042 = alloca i1, i1 0
+  %nop16043 = alloca i1, i1 0
+  %nop16044 = alloca i1, i1 0
+  %nop16045 = alloca i1, i1 0
+  %nop16046 = alloca i1, i1 0
+  %nop16047 = alloca i1, i1 0
+  %nop16048 = alloca i1, i1 0
+  %nop16049 = alloca i1, i1 0
+  %nop16050 = alloca i1, i1 0
+  %nop16051 = alloca i1, i1 0
+  %nop16052 = alloca i1, i1 0
+  %nop16053 = alloca i1, i1 0
+  %nop16054 = alloca i1, i1 0
+  %nop16055 = alloca i1, i1 0
+  %nop16056 = alloca i1, i1 0
+  %nop16057 = alloca i1, i1 0
+  %nop16058 = alloca i1, i1 0
+  %nop16059 = alloca i1, i1 0
+  %nop16060 = alloca i1, i1 0
+  %nop16061 = alloca i1, i1 0
+  %nop16062 = alloca i1, i1 0
+  %nop16063 = alloca i1, i1 0
+  %nop16064 = alloca i1, i1 0
+  %nop16065 = alloca i1, i1 0
+  %nop16066 = alloca i1, i1 0
+  %nop16067 = alloca i1, i1 0
+  %nop16068 = alloca i1, i1 0
+  %nop16069 = alloca i1, i1 0
+  %nop16070 = alloca i1, i1 0
+  %nop16071 = alloca i1, i1 0
+  %nop16072 = alloca i1, i1 0
+  %nop16073 = alloca i1, i1 0
+  %nop16074 = alloca i1, i1 0
+  %nop16075 = alloca i1, i1 0
+  %nop16076 = alloca i1, i1 0
+  %nop16077 = alloca i1, i1 0
+  %nop16078 = alloca i1, i1 0
+  %nop16079 = alloca i1, i1 0
+  %nop16080 = alloca i1, i1 0
+  %nop16081 = alloca i1, i1 0
+  %nop16082 = alloca i1, i1 0
+  %nop16083 = alloca i1, i1 0
+  %nop16084 = alloca i1, i1 0
+  %nop16085 = alloca i1, i1 0
+  %nop16086 = alloca i1, i1 0
+  %nop16087 = alloca i1, i1 0
+  %nop16088 = alloca i1, i1 0
+  %nop16089 = alloca i1, i1 0
+  %nop16090 = alloca i1, i1 0
+  %nop16091 = alloca i1, i1 0
+  %nop16092 = alloca i1, i1 0
+  %nop16093 = alloca i1, i1 0
+  %nop16094 = alloca i1, i1 0
+  %nop16095 = alloca i1, i1 0
+  %nop16096 = alloca i1, i1 0
+  %nop16097 = alloca i1, i1 0
+  %nop16098 = alloca i1, i1 0
+  %nop16099 = alloca i1, i1 0
+  %nop16100 = alloca i1, i1 0
+  %nop16101 = alloca i1, i1 0
+  %nop16102 = alloca i1, i1 0
+  %nop16103 = alloca i1, i1 0
+  %nop16104 = alloca i1, i1 0
+  %nop16105 = alloca i1, i1 0
+  %nop16106 = alloca i1, i1 0
+  %nop16107 = alloca i1, i1 0
+  %nop16108 = alloca i1, i1 0
+  %nop16109 = alloca i1, i1 0
+  %nop16110 = alloca i1, i1 0
+  %nop16111 = alloca i1, i1 0
+  %nop16112 = alloca i1, i1 0
+  %nop16113 = alloca i1, i1 0
+  %nop16114 = alloca i1, i1 0
+  %nop16115 = alloca i1, i1 0
+  %nop16116 = alloca i1, i1 0
+  %nop16117 = alloca i1, i1 0
+  %nop16118 = alloca i1, i1 0
+  %nop16119 = alloca i1, i1 0
+  %nop16120 = alloca i1, i1 0
+  %nop16121 = alloca i1, i1 0
+  %nop16122 = alloca i1, i1 0
+  %nop16123 = alloca i1, i1 0
+  %nop16124 = alloca i1, i1 0
+  %nop16125 = alloca i1, i1 0
+  %nop16126 = alloca i1, i1 0
+  %nop16127 = alloca i1, i1 0
+  %nop16128 = alloca i1, i1 0
+  %nop16129 = alloca i1, i1 0
+  %nop16130 = alloca i1, i1 0
+  %nop16131 = alloca i1, i1 0
+  %nop16132 = alloca i1, i1 0
+  %nop16133 = alloca i1, i1 0
+  %nop16134 = alloca i1, i1 0
+  %nop16135 = alloca i1, i1 0
+  %nop16136 = alloca i1, i1 0
+  %nop16137 = alloca i1, i1 0
+  %nop16138 = alloca i1, i1 0
+  %nop16139 = alloca i1, i1 0
+  %nop16140 = alloca i1, i1 0
+  %nop16141 = alloca i1, i1 0
+  %nop16142 = alloca i1, i1 0
+  %nop16143 = alloca i1, i1 0
+  %nop16144 = alloca i1, i1 0
+  %nop16145 = alloca i1, i1 0
+  %nop16146 = alloca i1, i1 0
+  %nop16147 = alloca i1, i1 0
+  %nop16148 = alloca i1, i1 0
+  %nop16149 = alloca i1, i1 0
+  %nop16150 = alloca i1, i1 0
+  %nop16151 = alloca i1, i1 0
+  %nop16152 = alloca i1, i1 0
+  %nop16153 = alloca i1, i1 0
+  %nop16154 = alloca i1, i1 0
+  %nop16155 = alloca i1, i1 0
+  %nop16156 = alloca i1, i1 0
+  %nop16157 = alloca i1, i1 0
+  %nop16158 = alloca i1, i1 0
+  %nop16159 = alloca i1, i1 0
+  %nop16160 = alloca i1, i1 0
+  %nop16161 = alloca i1, i1 0
+  %nop16162 = alloca i1, i1 0
+  %nop16163 = alloca i1, i1 0
+  %nop16164 = alloca i1, i1 0
+  %nop16165 = alloca i1, i1 0
+  %nop16166 = alloca i1, i1 0
+  %nop16167 = alloca i1, i1 0
+  %nop16168 = alloca i1, i1 0
+  %nop16169 = alloca i1, i1 0
+  %nop16170 = alloca i1, i1 0
+  %nop16171 = alloca i1, i1 0
+  %nop16172 = alloca i1, i1 0
+  %nop16173 = alloca i1, i1 0
+  %nop16174 = alloca i1, i1 0
+  %nop16175 = alloca i1, i1 0
+  %nop16176 = alloca i1, i1 0
+  %nop16177 = alloca i1, i1 0
+  %nop16178 = alloca i1, i1 0
+  %nop16179 = alloca i1, i1 0
+  %nop16180 = alloca i1, i1 0
+  %nop16181 = alloca i1, i1 0
+  %nop16182 = alloca i1, i1 0
+  %nop16183 = alloca i1, i1 0
+  %nop16184 = alloca i1, i1 0
+  %nop16185 = alloca i1, i1 0
+  %nop16186 = alloca i1, i1 0
+  %nop16187 = alloca i1, i1 0
+  %nop16188 = alloca i1, i1 0
+  %nop16189 = alloca i1, i1 0
+  %nop16190 = alloca i1, i1 0
+  %nop16191 = alloca i1, i1 0
+  %nop16192 = alloca i1, i1 0
+  %nop16193 = alloca i1, i1 0
+  %nop16194 = alloca i1, i1 0
+  %nop16195 = alloca i1, i1 0
+  %nop16196 = alloca i1, i1 0
+  %nop16197 = alloca i1, i1 0
+  %nop16198 = alloca i1, i1 0
+  %nop16199 = alloca i1, i1 0
+  %nop16200 = alloca i1, i1 0
+  %nop16201 = alloca i1, i1 0
+  %nop16202 = alloca i1, i1 0
+  %nop16203 = alloca i1, i1 0
+  %nop16204 = alloca i1, i1 0
+  %nop16205 = alloca i1, i1 0
+  %nop16206 = alloca i1, i1 0
+  %nop16207 = alloca i1, i1 0
+  %nop16208 = alloca i1, i1 0
+  %nop16209 = alloca i1, i1 0
+  %nop16210 = alloca i1, i1 0
+  %nop16211 = alloca i1, i1 0
+  %nop16212 = alloca i1, i1 0
+  %nop16213 = alloca i1, i1 0
+  %nop16214 = alloca i1, i1 0
+  %nop16215 = alloca i1, i1 0
+  %nop16216 = alloca i1, i1 0
+  %nop16217 = alloca i1, i1 0
+  %nop16218 = alloca i1, i1 0
+  %nop16219 = alloca i1, i1 0
+  %nop16220 = alloca i1, i1 0
+  %nop16221 = alloca i1, i1 0
+  %nop16222 = alloca i1, i1 0
+  %nop16223 = alloca i1, i1 0
+  %nop16224 = alloca i1, i1 0
+  %nop16225 = alloca i1, i1 0
+  %nop16226 = alloca i1, i1 0
+  %nop16227 = alloca i1, i1 0
+  %nop16228 = alloca i1, i1 0
+  %nop16229 = alloca i1, i1 0
+  %nop16230 = alloca i1, i1 0
+  %nop16231 = alloca i1, i1 0
+  %nop16232 = alloca i1, i1 0
+  %nop16233 = alloca i1, i1 0
+  %nop16234 = alloca i1, i1 0
+  %nop16235 = alloca i1, i1 0
+  %nop16236 = alloca i1, i1 0
+  %nop16237 = alloca i1, i1 0
+  %nop16238 = alloca i1, i1 0
+  %nop16239 = alloca i1, i1 0
+  %nop16240 = alloca i1, i1 0
+  %nop16241 = alloca i1, i1 0
+  %nop16242 = alloca i1, i1 0
+  %nop16243 = alloca i1, i1 0
+  %nop16244 = alloca i1, i1 0
+  %nop16245 = alloca i1, i1 0
+  %nop16246 = alloca i1, i1 0
+  %nop16247 = alloca i1, i1 0
+  %nop16248 = alloca i1, i1 0
+  %nop16249 = alloca i1, i1 0
+  %nop16250 = alloca i1, i1 0
+  %nop16251 = alloca i1, i1 0
+  %nop16252 = alloca i1, i1 0
+  %nop16253 = alloca i1, i1 0
+  %nop16254 = alloca i1, i1 0
+  %nop16255 = alloca i1, i1 0
+  %nop16256 = alloca i1, i1 0
+  %nop16257 = alloca i1, i1 0
+  %nop16258 = alloca i1, i1 0
+  %nop16259 = alloca i1, i1 0
+  %nop16260 = alloca i1, i1 0
+  %nop16261 = alloca i1, i1 0
+  %nop16262 = alloca i1, i1 0
+  %nop16263 = alloca i1, i1 0
+  %nop16264 = alloca i1, i1 0
+  %nop16265 = alloca i1, i1 0
+  %nop16266 = alloca i1, i1 0
+  %nop16267 = alloca i1, i1 0
+  %nop16268 = alloca i1, i1 0
+  %nop16269 = alloca i1, i1 0
+  %nop16270 = alloca i1, i1 0
+  %nop16271 = alloca i1, i1 0
+  %nop16272 = alloca i1, i1 0
+  %nop16273 = alloca i1, i1 0
+  %nop16274 = alloca i1, i1 0
+  %nop16275 = alloca i1, i1 0
+  %nop16276 = alloca i1, i1 0
+  %nop16277 = alloca i1, i1 0
+  %nop16278 = alloca i1, i1 0
+  %nop16279 = alloca i1, i1 0
+  %nop16280 = alloca i1, i1 0
+  %nop16281 = alloca i1, i1 0
+  %nop16282 = alloca i1, i1 0
+  %nop16283 = alloca i1, i1 0
+  %nop16284 = alloca i1, i1 0
+  %nop16285 = alloca i1, i1 0
+  %nop16286 = alloca i1, i1 0
+  %nop16287 = alloca i1, i1 0
+  %nop16288 = alloca i1, i1 0
+  %nop16289 = alloca i1, i1 0
+  %nop16290 = alloca i1, i1 0
+  %nop16291 = alloca i1, i1 0
+  %nop16292 = alloca i1, i1 0
+  %nop16293 = alloca i1, i1 0
+  %nop16294 = alloca i1, i1 0
+  %nop16295 = alloca i1, i1 0
+  %nop16296 = alloca i1, i1 0
+  %nop16297 = alloca i1, i1 0
+  %nop16298 = alloca i1, i1 0
+  %nop16299 = alloca i1, i1 0
+  %nop16300 = alloca i1, i1 0
+  %nop16301 = alloca i1, i1 0
+  %nop16302 = alloca i1, i1 0
+  %nop16303 = alloca i1, i1 0
+  %nop16304 = alloca i1, i1 0
+  %nop16305 = alloca i1, i1 0
+  %nop16306 = alloca i1, i1 0
+  %nop16307 = alloca i1, i1 0
+  %nop16308 = alloca i1, i1 0
+  %nop16309 = alloca i1, i1 0
+  %nop16310 = alloca i1, i1 0
+  %nop16311 = alloca i1, i1 0
+  %nop16312 = alloca i1, i1 0
+  %nop16313 = alloca i1, i1 0
+  %nop16314 = alloca i1, i1 0
+  %nop16315 = alloca i1, i1 0
+  %nop16316 = alloca i1, i1 0
+  %nop16317 = alloca i1, i1 0
+  %nop16318 = alloca i1, i1 0
+  %nop16319 = alloca i1, i1 0
+  %nop16320 = alloca i1, i1 0
+  %nop16321 = alloca i1, i1 0
+  %nop16322 = alloca i1, i1 0
+  %nop16323 = alloca i1, i1 0
+  %nop16324 = alloca i1, i1 0
+  %nop16325 = alloca i1, i1 0
+  %nop16326 = alloca i1, i1 0
+  %nop16327 = alloca i1, i1 0
+  %nop16328 = alloca i1, i1 0
+  %nop16329 = alloca i1, i1 0
+  %nop16330 = alloca i1, i1 0
+  %nop16331 = alloca i1, i1 0
+  %nop16332 = alloca i1, i1 0
+  %nop16333 = alloca i1, i1 0
+  %nop16334 = alloca i1, i1 0
+  %nop16335 = alloca i1, i1 0
+  %nop16336 = alloca i1, i1 0
+  %nop16337 = alloca i1, i1 0
+  %nop16338 = alloca i1, i1 0
+  %nop16339 = alloca i1, i1 0
+  %nop16340 = alloca i1, i1 0
+  %nop16341 = alloca i1, i1 0
+  %nop16342 = alloca i1, i1 0
+  %nop16343 = alloca i1, i1 0
+  %nop16344 = alloca i1, i1 0
+  %nop16345 = alloca i1, i1 0
+  %nop16346 = alloca i1, i1 0
+  %nop16347 = alloca i1, i1 0
+  %nop16348 = alloca i1, i1 0
+  %nop16349 = alloca i1, i1 0
+  %nop16350 = alloca i1, i1 0
+  %nop16351 = alloca i1, i1 0
+  %nop16352 = alloca i1, i1 0
+  %nop16353 = alloca i1, i1 0
+  %nop16354 = alloca i1, i1 0
+  %nop16355 = alloca i1, i1 0
+  %nop16356 = alloca i1, i1 0
+  %nop16357 = alloca i1, i1 0
+  %nop16358 = alloca i1, i1 0
+  %nop16359 = alloca i1, i1 0
+  %nop16360 = alloca i1, i1 0
+  %nop16361 = alloca i1, i1 0
+  %nop16362 = alloca i1, i1 0
+  %nop16363 = alloca i1, i1 0
+  %nop16364 = alloca i1, i1 0
+  %nop16365 = alloca i1, i1 0
+  %nop16366 = alloca i1, i1 0
+  %nop16367 = alloca i1, i1 0
+  %nop16368 = alloca i1, i1 0
+  %nop16369 = alloca i1, i1 0
+  %nop16370 = alloca i1, i1 0
+  %nop16371 = alloca i1, i1 0
+  %nop16372 = alloca i1, i1 0
+  %nop16373 = alloca i1, i1 0
+  %nop16374 = alloca i1, i1 0
+  %nop16375 = alloca i1, i1 0
+  %nop16376 = alloca i1, i1 0
+  %nop16377 = alloca i1, i1 0
+  br label %for.inc
+
+for.inc:
+  %3 = load i32* %i, align 4
+  %inc = add nsw i32 %3, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+; CHECK:  addiu $sp, $sp, -8
+; CHECK:  sw  $ra, 0($sp)
+; CHECK:  lui $[[REG1:[0-9]+]], 65534
+; CHECK:  addiu $[[REG1]], $[[REG1]], -12
+; CHECK:  addu  $[[REG1]], $ra, $[[REG1]]
+; CHECK:  lw  $ra, 0($sp)
+; CHECK:  jr  $[[REG1]]
+; CHECK:  addiu $sp, $sp, 8
+
+for.end:
+  ret i32 0
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false"
+  "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"
+  "no-infs-fp-math"="false" "no-nans-fp-math"="false"
+  "stack-protector-buffer-size"="8" "unsafe-fp-math"="false"
+  "use-soft-float"="false" }
diff --git a/test/MC/Mips/micromips-movcond-instructions.s b/test/MC/Mips/micromips-movcond-instructions.s
new file mode 100644
index 0000000..5da8702
--- /dev/null
+++ b/test/MC/Mips/micromips-movcond-instructions.s
@@ -0,0 +1,26 @@
+# RUN: llvm-mc %s -triple=mipsel -show-encoding -mattr=micromips \
+# RUN: | FileCheck -check-prefix=CHECK-EL %s
+# RUN: llvm-mc %s -triple=mips -show-encoding -mattr=micromips \
+# RUN: | FileCheck -check-prefix=CHECK-EB %s
+# Check that the assembler can handle the documented syntax
+# for move conditional instructions.
+#------------------------------------------------------------------------------
+# Move Conditional
+#------------------------------------------------------------------------------
+# Little endian
+#------------------------------------------------------------------------------
+# CHECK-EL: movz    $9, $6, $7        # encoding: [0xe6,0x00,0x58,0x48]
+# CHECK-EL: movn    $9, $6, $7        # encoding: [0xe6,0x00,0x18,0x48]
+# CHECK-EL: movt    $9, $6, $fcc0     # encoding: [0x26,0x55,0x7b,0x09]
+# CHECK-EL: movf    $9, $6, $fcc0     # encoding: [0x26,0x55,0x7b,0x01]
+#------------------------------------------------------------------------------
+# Big endian
+#------------------------------------------------------------------------------
+# CHECK-EB: movz    $9, $6, $7        # encoding: [0x00,0xe6,0x48,0x58]
+# CHECK-EB: movn    $9, $6, $7        # encoding: [0x00,0xe6,0x48,0x18]
+# CHECK-EB: movt    $9, $6, $fcc0     # encoding: [0x55,0x26,0x09,0x7b]
+# CHECK-EB: movf    $9, $6, $fcc0     # encoding: [0x55,0x26,0x01,0x7b]
+     movz    $9, $6, $7
+     movn    $9, $6, $7
+     movt    $9, $6, $fcc0
+     movf    $9, $6, $fcc0
diff --git a/test/MC/Mips/micromips-multiply-instructions.s b/test/MC/Mips/micromips-multiply-instructions.s
new file mode 100644
index 0000000..7c3c518
--- /dev/null
+++ b/test/MC/Mips/micromips-multiply-instructions.s
@@ -0,0 +1,26 @@
+# RUN: llvm-mc %s -triple=mipsel -show-encoding -mattr=micromips \
+# RUN: | FileCheck -check-prefix=CHECK-EL %s
+# RUN: llvm-mc %s -triple=mips -show-encoding -mattr=micromips \
+# RUN: | FileCheck -check-prefix=CHECK-EB %s
+# Check that the assembler can handle the documented syntax
+# for Multiply Add/Sub instructions.
+#------------------------------------------------------------------------------
+# Multiply Add/Sub Instructions
+#------------------------------------------------------------------------------
+# Little endian
+#------------------------------------------------------------------------------
+# CHECK-EL: madd   $4, $5    # encoding: [0xa4,0x00,0x3c,0xcb]
+# CHECK-EL: maddu  $4, $5    # encoding: [0xa4,0x00,0x3c,0xdb]
+# CHECK-EL: msub   $4, $5    # encoding: [0xa4,0x00,0x3c,0xeb]
+# CHECK-EL: msubu  $4, $5    # encoding: [0xa4,0x00,0x3c,0xfb]
+#------------------------------------------------------------------------------
+# Big endian
+#------------------------------------------------------------------------------
+# CHECK-EB: madd   $4, $5    # encoding: [0x00,0xa4,0xcb,0x3c]
+# CHECK-EB: maddu  $4, $5    # encoding: [0x00,0xa4,0xdb,0x3c]
+# CHECK-EB: msub   $4, $5    # encoding: [0x00,0xa4,0xeb,0x3c]
+# CHECK-EB: msubu  $4, $5    # encoding: [0x00,0xa4,0xfb,0x3c]
+    madd     $4, $5
+    maddu    $4, $5
+    msub     $4, $5
+    msubu    $4, $5
diff --git a/test/MC/Mips/micromips-relocations.s b/test/MC/Mips/micromips-relocations.s
new file mode 100644
index 0000000..804dd2f
--- /dev/null
+++ b/test/MC/Mips/micromips-relocations.s
@@ -0,0 +1,99 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \
+# RUN: -mattr=micromips | FileCheck %s -check-prefix=CHECK-FIXUP
+# RUN: llvm-mc %s -filetype=obj -triple=mipsel-unknown-linux \
+# RUN: -mattr=micromips | llvm-readobj -r \
+# RUN: | FileCheck %s -check-prefix=CHECK-ELF
+#------------------------------------------------------------------------------
+# Check that the assembler can handle the documented syntax
+# for relocations.
+#------------------------------------------------------------------------------
+# CHECK-FIXUP: lui $2, %hi(_gp_disp)
+# CHECK-FIXUP:        # encoding: [0xa2'A',0x41'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: _gp_disp@ABS_HI,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_HI16
+# CHECK-FIXUP: addiu $2, $2, %lo(_gp_disp)
+# CHECK-FIXUP:        # encoding: [0x42'A',0x30'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: _gp_disp@ABS_LO,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_LO16
+# CHECK-FIXUP: lw $25, %call16(strchr)($gp)
+# CHECK-FIXUP:        # encoding: [0x3c'A',0xff'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: strchr@GOT_CALL,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_CALL16
+# CHECK-FIXUP: lw $3, %got(loop_1)($2)
+# CHECK-FIXUP:        # encoding: [0x62'A',0xfc'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: loop_1@GOT,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_GOT16
+# CHECK-FIXUP: lui $2, %dtprel_hi(_gp_disp)
+# CHECK-FIXUP:        # encoding: [0xa2'A',0x41'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: _gp_disp@DTPREL_HI,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_TLS_DTPREL_HI16
+# CHECK-FIXUP: addiu $2, $2, %dtprel_lo(_gp_disp)
+# CHECK-FIXUP:        # encoding: [0x42'A',0x30'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: _gp_disp@DTPREL_LO,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_TLS_DTPREL_LO16
+# CHECK-FIXUP: lw $3, %got(loop_1)($2)
+# CHECK-FIXUP:        # encoding: [0x62'A',0xfc'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: loop_1@GOT,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_GOT16
+# CHECK-FIXUP: lw $4, %got_disp(loop_2)($3)
+# CHECK-FIXUP:        # encoding: [0x83'A',0xfc'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: loop_2@GOT_DISP,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_GOT_DISP
+# CHECK-FIXUP: lw $5, %got_page(loop_3)($4)
+# CHECK-FIXUP:        # encoding: [0xa4'A',0xfc'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: loop_3@GOT_PAGE,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_GOT_PAGE
+# CHECK-FIXUP: lw $6, %got_ofst(loop_4)($5)
+# CHECK-FIXUP:        # encoding: [0xc5'A',0xfc'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: loop_4@GOT_OFST,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_GOT_OFST
+# CHECK-FIXUP: lui $2, %tprel_hi(_gp_disp)
+# CHECK-FIXUP:        # encoding: [0xa2'A',0x41'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: _gp_disp@TPREL_HI,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_TLS_TPREL_HI16
+# CHECK-FIXUP: addiu $2, $2, %tprel_lo(_gp_disp)
+# CHECK-FIXUP:        # encoding: [0x42'A',0x30'A',0x00,0x00]
+# CHECK-FIXUP:        # fixup A - offset: 0,
+# CHECK-FIXUP:          value: _gp_disp@TPREL_LO,
+# CHECK-FIXUP:          kind: fixup_MICROMIPS_TLS_TPREL_LO16
+#------------------------------------------------------------------------------
+# Check that the appropriate relocations were created.
+#------------------------------------------------------------------------------
+# CHECK-ELF: Relocations [
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_HI16
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_LO16
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_CALL16
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_GOT16
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_TLS_DTPREL_HI16
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_TLS_DTPREL_LO16
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_GOT16
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_GOT_DISP
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_GOT_PAGE
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_GOT_OFST
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_TLS_TPREL_HI16
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_TLS_TPREL_LO16
+# CHECK-ELF: ]
+
+    lui    $2, %hi(_gp_disp)
+    addiu  $2, $2, %lo(_gp_disp)
+    lw     $25, %call16(strchr)($gp)
+    lw     $3, %got(loop_1)($2)
+    lui    $2, %dtprel_hi(_gp_disp)
+    addiu  $2, $2, %dtprel_lo(_gp_disp)
+    lw     $3, %got(loop_1)($2)
+    lw     $4, %got_disp(loop_2)($3)
+    lw     $5, %got_page(loop_3)($4)
+    lw     $6, %got_ofst(loop_4)($5)
+    lui    $2, %tprel_hi(_gp_disp)
+    addiu  $2, $2, %tprel_lo(_gp_disp)
diff --git a/test/MC/Mips/micromips-shift-instructions.s b/test/MC/Mips/micromips-shift-instructions.s
index 3b5060f..bbb71ac 100644
--- a/test/MC/Mips/micromips-shift-instructions.s
+++ b/test/MC/Mips/micromips-shift-instructions.s
@@ -1,17 +1,31 @@
-# RUN: llvm-mc %s -triple=mipsel -show-encoding -mcpu=mips32r2 -mattr=micromips | FileCheck %s
+# RUN: llvm-mc %s -triple=mipsel -show-encoding -mcpu=mips32r2 -mattr=micromips | FileCheck -check-prefix=CHECK-EL %s
+# RUN: llvm-mc %s -triple=mips -show-encoding -mcpu=mips32r2 -mattr=micromips | FileCheck -check-prefix=CHECK-EB %s
 # Check that the assembler can handle the documented syntax
 # for shift instructions.
 #------------------------------------------------------------------------------
 # Shift Instructions
 #------------------------------------------------------------------------------
-# CHECK: sll    $4, $3, 7      # encoding: [0x00,0x38,0x83,0x00]
-# CHECK: sllv   $2, $3, $5     # encoding: [0x10,0x10,0x65,0x00]
-# CHECK: sra    $4, $3, 7      # encoding: [0x80,0x38,0x83,0x00]
-# CHECK: srav   $2, $3, $5     # encoding: [0x90,0x10,0x65,0x00]
-# CHECK: srl    $4, $3, 7      # encoding: [0x40,0x38,0x83,0x00]
-# CHECK: srlv   $2, $3, $5     # encoding: [0x50,0x10,0x65,0x00]
-# CHECK: rotr   $9, $6, 7      # encoding: [0xc0,0x38,0x26,0x01]
-# CHECK: rotrv  $9, $6, $7     # encoding: [0xd0,0x48,0xc7,0x00]
+# Little endian
+#------------------------------------------------------------------------------
+# CHECK-EL: sll    $4, $3, 7      # encoding: [0x83,0x00,0x00,0x38]
+# CHECK-EL: sllv   $2, $3, $5     # encoding: [0x65,0x00,0x10,0x10]
+# CHECK-EL: sra    $4, $3, 7      # encoding: [0x83,0x00,0x80,0x38]
+# CHECK-EL: srav   $2, $3, $5     # encoding: [0x65,0x00,0x90,0x10]
+# CHECK-EL: srl    $4, $3, 7      # encoding: [0x83,0x00,0x40,0x38]
+# CHECK-EL: srlv   $2, $3, $5     # encoding: [0x65,0x00,0x50,0x10]
+# CHECK-EL: rotr   $9, $6, 7      # encoding: [0x26,0x01,0xc0,0x38]
+# CHECK-EL: rotrv  $9, $6, $7     # encoding: [0xc7,0x00,0xd0,0x48]
+#------------------------------------------------------------------------------
+# Big endian
+#------------------------------------------------------------------------------
+# CHECK-EB: sll $4, $3, 7         # encoding: [0x00,0x83,0x38,0x00]
+# CHECK-EB: sllv  $2, $3, $5      # encoding: [0x00,0x65,0x10,0x10]
+# CHECK-EB: sra $4, $3, 7         # encoding: [0x00,0x83,0x38,0x80]
+# CHECK-EB: srav  $2, $3, $5      # encoding: [0x00,0x65,0x10,0x90]
+# CHECK-EB: srl $4, $3, 7         # encoding: [0x00,0x83,0x38,0x40]
+# CHECK-EB: srlv  $2, $3, $5      # encoding: [0x00,0x65,0x10,0x50]
+# CHECK-EB: rotr  $9, $6, 7       # encoding: [0x01,0x26,0x38,0xc0]
+# CHECK-EB: rotrv $9, $6, $7      # encoding: [0x00,0xc7,0x48,0xd0]
      sll    $4, $3, 7
      sllv   $2, $3, $5
      sra    $4, $3, 7
diff --git a/test/MC/Mips/micromips-tailr.s b/test/MC/Mips/micromips-tailr.s
new file mode 100644
index 0000000..0c21a7b
--- /dev/null
+++ b/test/MC/Mips/micromips-tailr.s
@@ -0,0 +1,26 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding \
+# RUN: -mattr=micromips | FileCheck %s -check-prefix=CHECK-FIXUP
+# RUN: llvm-mc %s -filetype=obj -triple=mipsel-unknown-linux \
+# RUN: -mattr=micromips | llvm-readobj -r \
+# RUN: | FileCheck %s -check-prefix=CHECK-ELF
+#------------------------------------------------------------------------------
+# Check that the assembler can handle the documented syntax
+# for relocations.
+#------------------------------------------------------------------------------
+# CHECK-FIXUP: foo:
+# CHECK-FIXUP:   addiu $2, $zero, 1332
+# CHECK-FIXUP:         # encoding: [0x40,0x30,0x34,0x05]
+# CHECK-FIXUP:   j foo # encoding: [A,0xd4'A',A,0b000000AA]
+# CHECK-FIXUP:         #   fixup A - offset: 0,
+# CHECK-FIXUP:             value: foo, kind: fixup_MICROMIPS_26_S1
+# CHECK-FIXUP:   nop   # encoding: [0x00,0x00,0x00,0x00]
+#------------------------------------------------------------------------------
+# Check that the appropriate relocations were created.
+#------------------------------------------------------------------------------
+# CHECK-ELF: Relocations [
+# CHECK-ELF:     0x{{[0-9,A-F]+}} R_MICROMIPS_26_S1
+# CHECK-ELF: ]
+
+foo:
+  addiu $2, $0, 1332
+  j foo
diff --git a/test/MC/Mips/micromips-trap-instructions.s b/test/MC/Mips/micromips-trap-instructions.s
new file mode 100644
index 0000000..404006c
--- /dev/null
+++ b/test/MC/Mips/micromips-trap-instructions.s
@@ -0,0 +1,50 @@
+# RUN: llvm-mc %s -triple=mipsel -show-encoding -mattr=micromips \
+# RUN: | FileCheck -check-prefix=CHECK-EL %s
+# RUN: llvm-mc %s -triple=mips -show-encoding -mattr=micromips \
+# RUN: | FileCheck -check-prefix=CHECK-EB %s
+# Check that the assembler can handle the documented syntax
+# for miscellaneous instructions
+#------------------------------------------------------------------------------
+# Miscellaneous Instructions
+#------------------------------------------------------------------------------
+# Little endian
+#------------------------------------------------------------------------------
+# CHECK-EL: teq     $8, $9       # encoding: [0x28,0x01,0x3c,0x00]
+# CHECK-EL: tge     $8, $9       # encoding: [0x28,0x01,0x3c,0x02]
+# CHECK-EL: tgeu    $8, $9       # encoding: [0x28,0x01,0x3c,0x04]
+# CHECK-EL: tlt     $8, $9       # encoding: [0x28,0x01,0x3c,0x08]
+# CHECK-EL: tltu    $8, $9       # encoding: [0x28,0x01,0x3c,0x0a]
+# CHECK-EL: tne     $8, $9       # encoding: [0x28,0x01,0x3c,0x0c]
+# CHECK-EL: teqi    $9, 17767    # encoding: [0xc9,0x41,0x67,0x45]
+# CHECK-EL: tgei    $9, 17767    # encoding: [0x29,0x41,0x67,0x45]
+# CHECK-EL: tgeiu   $9, 17767    # encoding: [0x69,0x41,0x67,0x45]
+# CHECK-EL: tlti    $9, 17767    # encoding: [0x09,0x41,0x67,0x45]
+# CHECK-EL: tltiu   $9, 17767    # encoding: [0x49,0x41,0x67,0x45]
+# CHECK-EL: tnei    $9, 17767    # encoding: [0x89,0x41,0x67,0x45]
+#------------------------------------------------------------------------------
+# Big endian
+#------------------------------------------------------------------------------
+# CHECK-EB: teq     $8, $9       # encoding: [0x01,0x28,0x00,0x3c]
+# CHECK-EB: tge     $8, $9       # encoding: [0x01,0x28,0x02,0x3c]
+# CHECK-EB: tgeu    $8, $9       # encoding: [0x01,0x28,0x04,0x3c]
+# CHECK-EB: tlt     $8, $9       # encoding: [0x01,0x28,0x08,0x3c]
+# CHECK-EB: tltu    $8, $9       # encoding: [0x01,0x28,0x0a,0x3c]
+# CHECK-EB: tne     $8, $9       # encoding: [0x01,0x28,0x0c,0x3c]
+# CHECK-EB: teqi    $9, 17767    # encoding: [0x41,0xc9,0x45,0x67]
+# CHECK-EB: tgei    $9, 17767    # encoding: [0x41,0x29,0x45,0x67]
+# CHECK-EB: tgeiu   $9, 17767    # encoding: [0x41,0x69,0x45,0x67]
+# CHECK-EB: tlti    $9, 17767    # encoding: [0x41,0x09,0x45,0x67]
+# CHECK-EB: tltiu   $9, 17767    # encoding: [0x41,0x49,0x45,0x67]
+# CHECK-EB: tnei    $9, 17767    # encoding: [0x41,0x89,0x45,0x67]
+    teq     $8, $9, 0
+    tge     $8, $9, 0
+    tgeu    $8, $9, 0
+    tlt     $8, $9, 0
+    tltu    $8, $9, 0
+    tne     $8, $9, 0
+    teqi    $9, 17767
+    tgei    $9, 17767
+    tgeiu   $9, 17767
+    tlti    $9, 17767
+    tltiu   $9, 17767
+    tnei    $9, 17767
diff --git a/test/MC/Mips/mips-alu-instructions.s b/test/MC/Mips/mips-alu-instructions.s
index eccc288..68a8da0 100644
--- a/test/MC/Mips/mips-alu-instructions.s
+++ b/test/MC/Mips/mips-alu-instructions.s
@@ -86,7 +86,9 @@
 # CHECK:  mult   $3, $5          # encoding: [0x18,0x00,0x65,0x00]
 # CHECK:  multu  $3, $5          # encoding: [0x19,0x00,0x65,0x00]
 # CHECK:  sub    $9, $6, $7      # encoding: [0x22,0x48,0xc7,0x00]
+# CHECK:  addi   $sp, $sp, -56   # encoding: [0xc8,0xff,0xbd,0x23]
 # CHECK:  subu   $4, $3, $5      # encoding: [0x23,0x20,0x65,0x00]
+# CHECK:  addiu   $sp, $sp, -40  # encoding: [0xd8,0xff,0xbd,0x27]
 # CHECK:  neg     $6, $7         # encoding: [0x22,0x30,0x07,0x00]
 # CHECK:  negu    $6, $7         # encoding: [0x23,0x30,0x07,0x00]
 # CHECK:  move    $7, $8         # encoding: [0x21,0x38,0x00,0x01]
@@ -109,7 +111,9 @@
     mult   $3,$5
     multu  $3,$5
     sub    $9,$6,$7
+    sub    $sp,$sp,56
     subu   $4,$3,$5
+    subu    $sp,$sp,40
     neg    $6,$7
     negu   $6,$7
     move   $7,$8
diff --git a/test/MC/Mips/mips-control-instructions.s b/test/MC/Mips/mips-control-instructions.s
index ee70940..4a16c53 100644
--- a/test/MC/Mips/mips-control-instructions.s
+++ b/test/MC/Mips/mips-control-instructions.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc %s -triple=mips-unknown-unknown -show-encoding -mcpu=mips32r2 | \
 # RUN: FileCheck -check-prefix=CHECK32  %s
-# RUN: llvm-mc %s -triple=mips-unknown-unknown -show-encoding -mcpu=mips64r2 | \
-# RUN: FileCheck -check-prefix=CHECK64  %s
+# RUN: llvm-mc %s -triple=mips64-unknown-unknown -show-encoding -mcpu=mips64r2 \
+# RUN: | FileCheck -check-prefix=CHECK64  %s
 
 # CHECK32:    break                      # encoding: [0x00,0x00,0x00,0x0d]
 # CHECK32:    break   7, 0               # encoding: [0x00,0x07,0x00,0x0d]
@@ -10,6 +10,31 @@
 # CHECK32:    syscall 13396              # encoding: [0x00,0x0d,0x15,0x0c]
 # CHECK32:    eret                       # encoding: [0x42,0x00,0x00,0x18]
 # CHECK32:    deret                      # encoding: [0x42,0x00,0x00,0x1f]
+# CHECK32:    di                         # encoding: [0x41,0x60,0x60,0x00]
+# CHECK32:    di                         # encoding: [0x41,0x60,0x60,0x00]
+# CHECK32:    di      $10                # encoding: [0x41,0x6a,0x60,0x00]
+# CHECK32:    ei                         # encoding: [0x41,0x60,0x60,0x20]
+# CHECK32:    ei                         # encoding: [0x41,0x60,0x60,0x20]
+# CHECK32:    ei      $10                # encoding: [0x41,0x6a,0x60,0x20]
+# CHECK32:    wait                       # encoding: [0x42,0x00,0x00,0x20]
+# CHECK32:    teq     $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+# CHECK32:    teq     $zero, $3, 1       # encoding: [0x00,0x03,0x00,0x74]
+# CHECK32:    teqi    $3, 1              # encoding: [0x04,0x6c,0x00,0x01]
+# CHECK32:    tge     $zero, $3          # encoding: [0x00,0x03,0x00,0x30]
+# CHECK32:    tge     $zero, $3, 3       # encoding: [0x00,0x03,0x00,0xf0]
+# CHECK32:    tgei    $3, 3              # encoding: [0x04,0x68,0x00,0x03]
+# CHECK32:    tgeu    $zero, $3          # encoding: [0x00,0x03,0x00,0x31]
+# CHECK32:    tgeu    $zero, $3, 7       # encoding: [0x00,0x03,0x01,0xf1]
+# CHECK32:    tgeiu   $3, 7              # encoding: [0x04,0x69,0x00,0x07]
+# CHECK32:    tlt     $zero, $3          # encoding: [0x00,0x03,0x00,0x32]
+# CHECK32:    tlt     $zero, $3, 31      # encoding: [0x00,0x03,0x07,0xf2]
+# CHECK32:    tlti    $3, 31             # encoding: [0x04,0x6a,0x00,0x1f]
+# CHECK32:    tltu    $zero, $3          # encoding: [0x00,0x03,0x00,0x33]
+# CHECK32:    tltu    $zero, $3, 255     # encoding: [0x00,0x03,0x3f,0xf3]
+# CHECK32:    tltiu   $3, 255            # encoding: [0x04,0x6b,0x00,0xff]
+# CHECK32:    tne     $zero, $3          # encoding: [0x00,0x03,0x00,0x36]
+# CHECK32:    tne     $zero, $3, 1023    # encoding: [0x00,0x03,0xff,0xf6]
+# CHECK32:    tnei    $3, 1023           # encoding: [0x04,0x6e,0x03,0xff]
 
 # CHECK64:    break                      # encoding: [0x00,0x00,0x00,0x0d]
 # CHECK64:    break   7, 0               # encoding: [0x00,0x07,0x00,0x0d]
@@ -18,6 +43,31 @@
 # CHECK64:    syscall 13396              # encoding: [0x00,0x0d,0x15,0x0c]
 # CHECK64:    eret                       # encoding: [0x42,0x00,0x00,0x18]
 # CHECK64:    deret                      # encoding: [0x42,0x00,0x00,0x1f]
+# CHECK64:    di                         # encoding: [0x41,0x60,0x60,0x00]
+# CHECK64:    di                         # encoding: [0x41,0x60,0x60,0x00]
+# CHECK64:    di      $10                # encoding: [0x41,0x6a,0x60,0x00]
+# CHECK64:    ei                         # encoding: [0x41,0x60,0x60,0x20]
+# CHECK64:    ei                         # encoding: [0x41,0x60,0x60,0x20]
+# CHECK64:    ei      $10                # encoding: [0x41,0x6a,0x60,0x20]
+# CHECK64:    wait                       # encoding: [0x42,0x00,0x00,0x20]
+# CHECK64:    teq     $zero, $3          # encoding: [0x00,0x03,0x00,0x34]
+# CHECK64:    teq     $zero, $3, 1       # encoding: [0x00,0x03,0x00,0x74]
+# CHECK64:    teqi    $3, 1              # encoding: [0x04,0x6c,0x00,0x01]
+# CHECK64:    tge     $zero, $3          # encoding: [0x00,0x03,0x00,0x30]
+# CHECK64:    tge     $zero, $3, 3       # encoding: [0x00,0x03,0x00,0xf0]
+# CHECK64:    tgei    $3, 3              # encoding: [0x04,0x68,0x00,0x03]
+# CHECK64:    tgeu    $zero, $3          # encoding: [0x00,0x03,0x00,0x31]
+# CHECK64:    tgeu    $zero, $3, 7       # encoding: [0x00,0x03,0x01,0xf1]
+# CHECK64:    tgeiu   $3, 7              # encoding: [0x04,0x69,0x00,0x07]
+# CHECK64:    tlt     $zero, $3          # encoding: [0x00,0x03,0x00,0x32]
+# CHECK64:    tlt     $zero, $3, 31      # encoding: [0x00,0x03,0x07,0xf2]
+# CHECK64:    tlti    $3, 31             # encoding: [0x04,0x6a,0x00,0x1f]
+# CHECK64:    tltu    $zero, $3          # encoding: [0x00,0x03,0x00,0x33]
+# CHECK64:    tltu    $zero, $3, 255     # encoding: [0x00,0x03,0x3f,0xf3]
+# CHECK64:    tltiu   $3, 255            # encoding: [0x04,0x6b,0x00,0xff]
+# CHECK64:    tne     $zero, $3          # encoding: [0x00,0x03,0x00,0x36]
+# CHECK64:    tne     $zero, $3, 1023    # encoding: [0x00,0x03,0xff,0xf6]
+# CHECK64:    tnei    $3, 1023           # encoding: [0x04,0x6e,0x03,0xff]
 
     break
     break 7
@@ -26,3 +76,31 @@
     syscall 0x3454
     eret
     deret
+    di
+    di  $0
+    di  $10
+
+    ei
+    ei  $0
+    ei  $10
+
+    wait
+
+    teq   $0,$3
+    teq   $0,$3,1
+    teqi  $3,1
+    tge   $0,$3
+    tge   $0,$3,3
+    tgei  $3,3
+    tgeu  $0,$3
+    tgeu  $0,$3,7
+    tgeiu $3,7
+    tlt   $0,$3
+    tlt   $0,$3,31
+    tlti  $3,31
+    tltu  $0,$3
+    tltu  $0,$3,255
+    tltiu $3,255
+    tne   $0,$3
+    tne   $0,$3,1023
+    tnei  $3,1023
diff --git a/test/MC/Mips/mips-dsp-instructions.s b/test/MC/Mips/mips-dsp-instructions.s
index 4de88ce..5a9e8ea 100644
--- a/test/MC/Mips/mips-dsp-instructions.s
+++ b/test/MC/Mips/mips-dsp-instructions.s
@@ -22,6 +22,33 @@
 # CHECK:   precr_sra_r.ph.w  $25, $26, 0     # encoding: [0x7f,0x59,0x07,0xd1]
 # CHECK:   precr_sra_r.ph.w  $25, $26, 31    # encoding: [0x7f,0x59,0xff,0xd1]
 
+# CHECK:   lbux $10, $20($26)                # encoding: [0x7f,0x54,0x51,0x8a]
+# CHECK:   lhx  $11, $21($27)                # encoding: [0x7f,0x75,0x59,0x0a]
+# CHECK:   lwx  $12, $22($gp)                # encoding: [0x7f,0x96,0x60,0x0a]
+
+# CHECK:    mult $ac3, $2, $3               # encoding: [0x00,0x43,0x18,0x18]
+# CHECK:    multu $ac2, $4, $5              # encoding: [0x00,0x85,0x10,0x19]
+# CHECK:    madd $ac1, $6, $7               # encoding: [0x70,0xc7,0x08,0x00]
+# CHECK:    maddu $ac0, $8, $9              # encoding: [0x71,0x09,0x00,0x01]
+# CHECK:    msub $ac3, $10, $11             # encoding: [0x71,0x4b,0x18,0x04]
+# CHECK:    msubu $ac2, $12, $13            # encoding: [0x71,0x8d,0x10,0x05]
+# CHECK:    mfhi $14, $ac1                  # encoding: [0x00,0x20,0x70,0x10]
+# CHECK:    mflo $15, $ac0                  # encoding: [0x00,0x00,0x78,0x12]
+# CHECK:    mthi $16, $ac3                  # encoding: [0x02,0x00,0x18,0x11]
+# CHECK:    mtlo $17, $ac2                  # encoding: [0x02,0x20,0x10,0x13]
+
+# CHECK:    mult $2, $3                      # encoding: [0x00,0x43,0x00,0x18]
+# CHECK:    multu $4, $5                     # encoding: [0x00,0x85,0x00,0x19]
+# CHECK:    madd $6, $7                      # encoding: [0x70,0xc7,0x00,0x00]
+# CHECK:    maddu $8, $9                     # encoding: [0x71,0x09,0x00,0x01]
+# CHECK:    msub $10, $11                    # encoding: [0x71,0x4b,0x00,0x04]
+# CHECK:    msubu $12, $13                   # encoding: [0x71,0x8d,0x00,0x05]
+# CHECK:    mfhi $14                         # encoding: [0x00,0x00,0x70,0x10]
+# CHECK:    mflo $15                         # encoding: [0x00,0x00,0x78,0x12]
+# CHECK:    mthi $16                         # encoding: [0x02,0x00,0x00,0x11]
+# CHECK:    mtlo $17                         # encoding: [0x02,0x20,0x00,0x13]
+
+
   precrq.qb.ph    $16,$17,$18
   precrq.ph.w     $17,$18,$19
   precrq_rs.ph.w  $18,$19,$20
@@ -42,3 +69,29 @@
   precr_sra.ph.w  $24,$25,31
   precr_sra_r.ph.w  $25,$26,0
   precr_sra_r.ph.w  $25,$26,31
+
+  lbux $10, $s4($26)
+  lhx  $11, $s5($27)
+  lwx  $12, $s6($28)
+
+  mult $ac3, $2, $3
+  multu $ac2, $4, $5
+  madd $ac1, $6, $7
+  maddu $ac0, $8, $9
+  msub $ac3, $10, $11
+  msubu $ac2, $12, $13
+  mfhi $14, $ac1
+  mflo $15, $ac0
+  mthi $16, $ac3
+  mtlo $17, $ac2
+
+  mult $2, $3
+  multu $4, $5
+  madd $6, $7
+  maddu $8, $9
+  msub $10, $11
+  msubu $12, $13
+  mfhi $14
+  mflo $15
+  mthi $16
+  mtlo $17
diff --git a/test/MC/Mips/mips-fpu-instructions.s b/test/MC/Mips/mips-fpu-instructions.s
index dc52676..bfaef9e 100644
--- a/test/MC/Mips/mips-fpu-instructions.s
+++ b/test/MC/Mips/mips-fpu-instructions.s
@@ -1,4 +1,5 @@
 # RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# RUN: llvm-mc  %s -triple=mips64el-unknown-linux -show-encoding -mcpu=mips64r2 | FileCheck %s
 # Check that the assembler can handle the documented syntax
 # for FPU instructions.
 #------------------------------------------------------------------------------
@@ -137,6 +138,8 @@
 #------------------------------------------------------------------------------
 # FP move instructions
 #------------------------------------------------------------------------------
+# CHECK: bc1f    $BB_1                 # encoding: [A,A,0x00,0x45]
+# CHECK: #   fixup A - offset: 0, value: ($BB_1), kind: fixup_Mips_PC16
 
 # CHECK:  cfc1    $6, $0               # encoding: [0x00,0x00,0x46,0x44]
 # CHECK:  ctc1    $10, $31             # encoding: [0x00,0xf8,0xca,0x44]
@@ -164,7 +167,15 @@
 # CHECK:  movf.s  $f4, $f6, $fcc5         # encoding: [0x11,0x31,0x14,0x46]
 # CHECK:  luxc1   $f0, $6($5)             # encoding: [0x05,0x00,0xa6,0x4c]
 # CHECK:  suxc1   $f4, $24($5)            # encoding: [0x0d,0x20,0xb8,0x4c]
-
+# CHECK:  lwxc1   $f20, $12($14)          # encoding: [0x00,0x05,0xcc,0x4d]
+# CHECK:  swxc1   $f26, $18($22)          # encoding: [0x08,0xd0,0xd2,0x4e]
+# CHECK:  mfhc1   $17, $f4                # encoding: [0x00,0x20,0x71,0x44]
+# CHECK:  mthc1   $17, $f6                # encoding: [0x00,0x30,0xf1,0x44]
+# CHECK:  swc2    $4, 16($sp)             # encoding: [0x10,0x00,0xa4,0xeb]
+# CHECK:  sdc2    $4, 16($sp)             # encoding: [0x10,0x00,0xa4,0xfb]
+# CHECK:  lwc2    $11, 12($ra)            # encoding: [0x0c,0x00,0xeb,0xcb]
+# CHECK:  ldc2    $11, 12($ra)            # encoding: [0x0c,0x00,0xeb,0xdb]
+   bc1f    $fcc0, $BB_1
    cfc1    $a2,$0
    ctc1    $10,$31
    mfc1    $a2,$f7
@@ -189,5 +200,13 @@
    movt    $4, $5, $fcc4
    movf.d  $f4, $f6, $fcc2
    movf.s  $f4, $f6, $fcc5
-   luxc1 $f0, $a2($a1)
-   suxc1 $f4, $t8($a1)
-\ No newline at end of file
+   luxc1   $f0, $a2($a1)
+   suxc1   $f4, $t8($a1)
+   lwxc1   $f20, $12($14)
+   swxc1   $f26, $s2($s6)
+   mfhc1   $17, $f4
+   mthc1   $17, $f6
+   swc2    $4, 16($sp)
+   sdc2    $4, 16($sp)
+   lwc2    $11, 12($ra)
+   ldc2    $11, 12($ra)
diff --git a/test/MC/Mips/mips64-alu-instructions.s b/test/MC/Mips/mips64-alu-instructions.s
index c33ba70..8262a46 100644
--- a/test/MC/Mips/mips64-alu-instructions.s
+++ b/test/MC/Mips/mips64-alu-instructions.s
@@ -1,4 +1,4 @@
-# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64r2 | FileCheck %s
+# RUN: llvm-mc %s -triple=mips64el-unknown-linux -show-encoding -mcpu=mips64r2 | FileCheck %s
 # Check that the assembler can handle the documented syntax
 # for arithmetic and logical instructions.
 #------------------------------------------------------------------------------
@@ -73,6 +73,8 @@
 # CHECK:  daddiu  $9, $6, -15001  # encoding: [0x67,0xc5,0xc9,0x64]
 # CHECK:  daddiu  $9, $9, -15001  # encoding: [0x67,0xc5,0x29,0x65]
 # CHECK:  daddu   $9, $6, $7      # encoding: [0x2d,0x48,0xc7,0x00]
+# CHECK:  drotr   $9, $6, 20      # encoding: [0x3a,0x4d,0x26,0x00]
+# CHECK:  drotr32 $9, $6, 52      # encoding: [0x3e,0x4d,0x26,0x00]
 # CHECK:  madd   $6, $7          # encoding: [0x00,0x00,0xc7,0x70]
 # CHECK:  maddu  $6, $7          # encoding: [0x01,0x00,0xc7,0x70]
 # CHECK:  msub   $6, $7          # encoding: [0x04,0x00,0xc7,0x70]
@@ -94,6 +96,8 @@
     daddiu  $9,$6,-15001
     daddiu  $9,-15001
     daddu   $9,$6,$7
+    drotr   $9, $6, 20
+    drotr32 $9, $6, 52
     madd   $6,$7
     maddu  $6,$7
     msub   $6,$7
diff --git a/test/MC/Mips/mips64-instructions.s b/test/MC/Mips/mips64-instructions.s
new file mode 100644
index 0000000..74e9d13
--- /dev/null
+++ b/test/MC/Mips/mips64-instructions.s
@@ -0,0 +1,7 @@
+# RUN: llvm-mc  %s -triple=mips64el-unknown-linux -show-encoding -mcpu=mips64r2 | FileCheck %s
+
+# CHECK: ldxc1 $f2, $2($10)           # encoding: [0x81,0x00,0x42,0x4d]
+# CHECK: sdxc1 $f8, $4($25)           # encoding: [0x09,0x40,0x24,0x4f]
+
+  ldxc1 $f2, $2($10)
+  sdxc1 $f8, $a0($t9)
diff --git a/test/MC/Mips/mips_directives.s b/test/MC/Mips/mips_directives.s
index bbb2616..44e707c 100644
--- a/test/MC/Mips/mips_directives.s
+++ b/test/MC/Mips/mips_directives.s
@@ -19,9 +19,11 @@ $BB0_2:
     .set    noat
 $JTI0_0:
     .gpword    ($BB0_2)
+
     .word 0x77fffffc
 # CHECK: $JTI0_0:
-# CHECK-NEXT:     .4byte    2013265916
+# CHECK: .gpword ($BB0_2)
+# CHECK:     .4byte    2013265916
     .set  at=$12
     .set macro
 # CHECK:   b 1332               # encoding: [0x10,0x00,0x01,0x4d]
diff --git a/test/MC/Mips/msa/test_2r.s b/test/MC/Mips/msa/test_2r.s
new file mode 100644
index 0000000..67a2b6f
--- /dev/null
+++ b/test/MC/Mips/msa/test_2r.s
@@ -0,0 +1,51 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        fill.b  $w30, $9                # encoding: [0x7b,0x00,0x4f,0x9e]
+# CHECK:        fill.h  $w31, $23               # encoding: [0x7b,0x01,0xbf,0xde]
+# CHECK:        fill.w  $w16, $24               # encoding: [0x7b,0x02,0xc4,0x1e]
+# CHECK:        nloc.b  $w21, $w0               # encoding: [0x7b,0x08,0x05,0x5e]
+# CHECK:        nloc.h  $w18, $w31              # encoding: [0x7b,0x09,0xfc,0x9e]
+# CHECK:        nloc.w  $w2, $w23               # encoding: [0x7b,0x0a,0xb8,0x9e]
+# CHECK:        nloc.d  $w4, $w10               # encoding: [0x7b,0x0b,0x51,0x1e]
+# CHECK:        nlzc.b  $w31, $w2               # encoding: [0x7b,0x0c,0x17,0xde]
+# CHECK:        nlzc.h  $w27, $w22              # encoding: [0x7b,0x0d,0xb6,0xde]
+# CHECK:        nlzc.w  $w10, $w29              # encoding: [0x7b,0x0e,0xea,0x9e]
+# CHECK:        nlzc.d  $w25, $w9               # encoding: [0x7b,0x0f,0x4e,0x5e]
+# CHECK:        pcnt.b  $w20, $w18              # encoding: [0x7b,0x04,0x95,0x1e]
+# CHECK:        pcnt.h  $w0, $w8                # encoding: [0x7b,0x05,0x40,0x1e]
+# CHECK:        pcnt.w  $w23, $w9               # encoding: [0x7b,0x06,0x4d,0xde]
+# CHECK:        pcnt.d  $w21, $w24              # encoding: [0x7b,0x07,0xc5,0x5e]
+
+# CHECKOBJDUMP:        fill.b  $w30, $9
+# CHECKOBJDUMP:        fill.h  $w31, $23
+# CHECKOBJDUMP:        fill.w  $w16, $24
+# CHECKOBJDUMP:        nloc.b  $w21, $w0
+# CHECKOBJDUMP:        nloc.h  $w18, $w31
+# CHECKOBJDUMP:        nloc.w  $w2, $w23
+# CHECKOBJDUMP:        nloc.d  $w4, $w10
+# CHECKOBJDUMP:        nlzc.b  $w31, $w2
+# CHECKOBJDUMP:        nlzc.h  $w27, $w22
+# CHECKOBJDUMP:        nlzc.w  $w10, $w29
+# CHECKOBJDUMP:        nlzc.d  $w25, $w9
+# CHECKOBJDUMP:        pcnt.b  $w20, $w18
+# CHECKOBJDUMP:        pcnt.h  $w0, $w8
+# CHECKOBJDUMP:        pcnt.w  $w23, $w9
+# CHECKOBJDUMP:        pcnt.d  $w21, $w24
+
+                fill.b  $w30, $9
+                fill.h  $w31, $23
+                fill.w  $w16, $24
+                nloc.b  $w21, $w0
+                nloc.h  $w18, $w31
+                nloc.w  $w2, $w23
+                nloc.d  $w4, $w10
+                nlzc.b  $w31, $w2
+                nlzc.h  $w27, $w22
+                nlzc.w  $w10, $w29
+                nlzc.d  $w25, $w9
+                pcnt.b  $w20, $w18
+                pcnt.h  $w0, $w8
+                pcnt.w  $w23, $w9
+                pcnt.d  $w21, $w24
diff --git a/test/MC/Mips/msa/test_2rf.s b/test/MC/Mips/msa/test_2rf.s
new file mode 100644
index 0000000..64025a4
--- /dev/null
+++ b/test/MC/Mips/msa/test_2rf.s
@@ -0,0 +1,102 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        fclass.w        $w26, $w12              # encoding: [0x7b,0x20,0x66,0x9e]
+# CHECK:        fclass.d        $w24, $w17              # encoding: [0x7b,0x21,0x8e,0x1e]
+# CHECK:        fexupl.w        $w8, $w0                # encoding: [0x7b,0x30,0x02,0x1e]
+# CHECK:        fexupl.d        $w17, $w29              # encoding: [0x7b,0x31,0xec,0x5e]
+# CHECK:        fexupr.w        $w13, $w4               # encoding: [0x7b,0x32,0x23,0x5e]
+# CHECK:        fexupr.d        $w5, $w2                # encoding: [0x7b,0x33,0x11,0x5e]
+# CHECK:        ffint_s.w       $w20, $w29              # encoding: [0x7b,0x3c,0xed,0x1e]
+# CHECK:        ffint_s.d       $w12, $w15              # encoding: [0x7b,0x3d,0x7b,0x1e]
+# CHECK:        ffint_u.w       $w7, $w27               # encoding: [0x7b,0x3e,0xd9,0xde]
+# CHECK:        ffint_u.d       $w19, $w16              # encoding: [0x7b,0x3f,0x84,0xde]
+# CHECK:        ffql.w          $w31, $w13              # encoding: [0x7b,0x34,0x6f,0xde]
+# CHECK:        ffql.d          $w12, $w13              # encoding: [0x7b,0x35,0x6b,0x1e]
+# CHECK:        ffqr.w          $w27, $w30              # encoding: [0x7b,0x36,0xf6,0xde]
+# CHECK:        ffqr.d          $w30, $w15              # encoding: [0x7b,0x37,0x7f,0x9e]
+# CHECK:        flog2.w         $w25, $w31              # encoding: [0x7b,0x2e,0xfe,0x5e]
+# CHECK:        flog2.d         $w18, $w10              # encoding: [0x7b,0x2f,0x54,0x9e]
+# CHECK:        frint.w         $w7, $w15               # encoding: [0x7b,0x2c,0x79,0xde]
+# CHECK:        frint.d         $w21, $w22              # encoding: [0x7b,0x2d,0xb5,0x5e]
+# CHECK:        frcp.w          $w19, $w0               # encoding: [0x7b,0x2a,0x04,0xde]
+# CHECK:        frcp.d          $w4, $w14               # encoding: [0x7b,0x2b,0x71,0x1e]
+# CHECK:        frsqrt.w        $w12, $w17              # encoding: [0x7b,0x28,0x8b,0x1e]
+# CHECK:        frsqrt.d        $w23, $w11              # encoding: [0x7b,0x29,0x5d,0xde]
+# CHECK:        fsqrt.w         $w0, $w11               # encoding: [0x7b,0x26,0x58,0x1e]
+# CHECK:        fsqrt.d         $w15, $w12              # encoding: [0x7b,0x27,0x63,0xde]
+# CHECK:        ftint_s.w       $w30, $w5               # encoding: [0x7b,0x38,0x2f,0x9e]
+# CHECK:        ftint_s.d       $w5, $w23               # encoding: [0x7b,0x39,0xb9,0x5e]
+# CHECK:        ftint_u.w       $w20, $w14              # encoding: [0x7b,0x3a,0x75,0x1e]
+# CHECK:        ftint_u.d       $w23, $w21              # encoding: [0x7b,0x3b,0xad,0xde]
+# CHECK:        ftrunc_s.w      $w29, $w17              # encoding: [0x7b,0x22,0x8f,0x5e]
+# CHECK:        ftrunc_s.d      $w12, $w27              # encoding: [0x7b,0x23,0xdb,0x1e]
+# CHECK:        ftrunc_u.w      $w17, $w15              # encoding: [0x7b,0x24,0x7c,0x5e]
+# CHECK:        ftrunc_u.d      $w5, $w27               # encoding: [0x7b,0x25,0xd9,0x5e]
+
+# CHECKOBJDUMP:        fclass.w        $w26, $w12
+# CHECKOBJDUMP:        fclass.d        $w24, $w17
+# CHECKOBJDUMP:        fexupl.w        $w8, $w0
+# CHECKOBJDUMP:        fexupl.d        $w17, $w29
+# CHECKOBJDUMP:        fexupr.w        $w13, $w4
+# CHECKOBJDUMP:        fexupr.d        $w5, $w2
+# CHECKOBJDUMP:        ffint_s.w       $w20, $w29
+# CHECKOBJDUMP:        ffint_s.d       $w12, $w15
+# CHECKOBJDUMP:        ffint_u.w       $w7, $w27
+# CHECKOBJDUMP:        ffint_u.d       $w19, $w16
+# CHECKOBJDUMP:        ffql.w          $w31, $w13
+# CHECKOBJDUMP:        ffql.d          $w12, $w13
+# CHECKOBJDUMP:        ffqr.w          $w27, $w30
+# CHECKOBJDUMP:        ffqr.d          $w30, $w15
+# CHECKOBJDUMP:        flog2.w         $w25, $w31
+# CHECKOBJDUMP:        flog2.d         $w18, $w10
+# CHECKOBJDUMP:        frint.w         $w7, $w15
+# CHECKOBJDUMP:        frint.d         $w21, $w22
+# CHECKOBJDUMP:        frcp.w          $w19, $w0
+# CHECKOBJDUMP:        frcp.d          $w4, $w14
+# CHECKOBJDUMP:        frsqrt.w        $w12, $w17
+# CHECKOBJDUMP:        frsqrt.d        $w23, $w11
+# CHECKOBJDUMP:        fsqrt.w         $w0, $w11
+# CHECKOBJDUMP:        fsqrt.d         $w15, $w12
+# CHECKOBJDUMP:        ftint_s.w       $w30, $w5
+# CHECKOBJDUMP:        ftint_s.d       $w5, $w23
+# CHECKOBJDUMP:        ftint_u.w       $w20, $w14
+# CHECKOBJDUMP:        ftint_u.d       $w23, $w21
+# CHECKOBJDUMP:        ftrunc_s.w      $w29, $w17
+# CHECKOBJDUMP:        ftrunc_s.d      $w12, $w27
+# CHECKOBJDUMP:        ftrunc_u.w      $w17, $w15
+# CHECKOBJDUMP:        ftrunc_u.d      $w5, $w27
+
+                fclass.w        $w26, $w12
+                fclass.d        $w24, $w17
+                fexupl.w        $w8, $w0
+                fexupl.d        $w17, $w29
+                fexupr.w        $w13, $w4
+                fexupr.d        $w5, $w2
+                ffint_s.w       $w20, $w29
+                ffint_s.d       $w12, $w15
+                ffint_u.w       $w7, $w27
+                ffint_u.d       $w19, $w16
+                ffql.w          $w31, $w13
+                ffql.d          $w12, $w13
+                ffqr.w          $w27, $w30
+                ffqr.d          $w30, $w15
+                flog2.w         $w25, $w31
+                flog2.d         $w18, $w10
+                frint.w         $w7, $w15
+                frint.d         $w21, $w22
+                frcp.w          $w19, $w0
+                frcp.d          $w4, $w14
+                frsqrt.w        $w12, $w17
+                frsqrt.d        $w23, $w11
+                fsqrt.w         $w0, $w11
+                fsqrt.d         $w15, $w12
+                ftint_s.w       $w30, $w5
+                ftint_s.d       $w5, $w23
+                ftint_u.w       $w20, $w14
+                ftint_u.d       $w23, $w21
+                ftrunc_s.w      $w29, $w17
+                ftrunc_s.d      $w12, $w27
+                ftrunc_u.w      $w17, $w15
+                ftrunc_u.d      $w5, $w27
diff --git a/test/MC/Mips/msa/test_3r.s b/test/MC/Mips/msa/test_3r.s
new file mode 100644
index 0000000..3047ecb
--- /dev/null
+++ b/test/MC/Mips/msa/test_3r.s
@@ -0,0 +1,732 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        add_a.b         $w26, $w9, $w4                  # encoding: [0x78,0x04,0x4e,0x90]
+# CHECK:        add_a.h         $w23, $w27, $w31                # encoding: [0x78,0x3f,0xdd,0xd0]
+# CHECK:        add_a.w         $w11, $w6, $w22                 # encoding: [0x78,0x56,0x32,0xd0]
+# CHECK:        add_a.d         $w6, $w10, $w0                  # encoding: [0x78,0x60,0x51,0x90]
+# CHECK:        adds_a.b        $w19, $w24, $w19                # encoding: [0x78,0x93,0xc4,0xd0]
+# CHECK:        adds_a.h        $w25, $w6, $w4                  # encoding: [0x78,0xa4,0x36,0x50]
+# CHECK:        adds_a.w        $w25, $w17, $w27                # encoding: [0x78,0xdb,0x8e,0x50]
+# CHECK:        adds_a.d        $w15, $w18, $w26                # encoding: [0x78,0xfa,0x93,0xd0]
+# CHECK:        adds_s.b        $w29, $w11, $w19                # encoding: [0x79,0x13,0x5f,0x50]
+# CHECK:        adds_s.h        $w5, $w23, $w26                 # encoding: [0x79,0x3a,0xb9,0x50]
+# CHECK:        adds_s.w        $w16, $w14, $w13                # encoding: [0x79,0x4d,0x74,0x10]
+# CHECK:        adds_s.d        $w2, $w14, $w28                 # encoding: [0x79,0x7c,0x70,0x90]
+# CHECK:        adds_u.b        $w3, $w17, $w14                 # encoding: [0x79,0x8e,0x88,0xd0]
+# CHECK:        adds_u.h        $w10, $w30, $w4                 # encoding: [0x79,0xa4,0xf2,0x90]
+# CHECK:        adds_u.w        $w15, $w18, $w20                # encoding: [0x79,0xd4,0x93,0xd0]
+# CHECK:        adds_u.d        $w30, $w10, $w9                 # encoding: [0x79,0xe9,0x57,0x90]
+# CHECK:        addv.b          $w24, $w20, $w21                # encoding: [0x78,0x15,0xa6,0x0e]
+# CHECK:        addv.h          $w4, $w13, $w27                 # encoding: [0x78,0x3b,0x69,0x0e]
+# CHECK:        addv.w          $w19, $w11, $w14                # encoding: [0x78,0x4e,0x5c,0xce]
+# CHECK:        addv.d          $w2, $w21, $w31                 # encoding: [0x78,0x7f,0xa8,0x8e]
+# CHECK:        asub_s.b        $w23, $w16, $w3                 # encoding: [0x7a,0x03,0x85,0xd1]
+# CHECK:        asub_s.h        $w22, $w17, $w25                # encoding: [0x7a,0x39,0x8d,0x91]
+# CHECK:        asub_s.w        $w24, $w1, $w9                  # encoding: [0x7a,0x49,0x0e,0x11]
+# CHECK:        asub_s.d        $w13, $w12, $w12                # encoding: [0x7a,0x6c,0x63,0x51]
+# CHECK:        asub_u.b        $w10, $w29, $w11                # encoding: [0x7a,0x8b,0xea,0x91]
+# CHECK:        asub_u.h        $w18, $w9, $w15                 # encoding: [0x7a,0xaf,0x4c,0x91]
+# CHECK:        asub_u.w        $w10, $w19, $w31                # encoding: [0x7a,0xdf,0x9a,0x91]
+# CHECK:        asub_u.d        $w17, $w10, $w0                 # encoding: [0x7a,0xe0,0x54,0x51]
+# CHECK:        ave_s.b         $w2, $w5, $w1                   # encoding: [0x7a,0x01,0x28,0x90]
+# CHECK:        ave_s.h         $w16, $w19, $w9                 # encoding: [0x7a,0x29,0x9c,0x10]
+# CHECK:        ave_s.w         $w17, $w31, $w5                 # encoding: [0x7a,0x45,0xfc,0x50]
+# CHECK:        ave_s.d         $w27, $w25, $w10                # encoding: [0x7a,0x6a,0xce,0xd0]
+# CHECK:        ave_u.b         $w16, $w19, $w9                 # encoding: [0x7a,0x89,0x9c,0x10]
+# CHECK:        ave_u.h         $w28, $w28, $w11                # encoding: [0x7a,0xab,0xe7,0x10]
+# CHECK:        ave_u.w         $w11, $w12, $w11                # encoding: [0x7a,0xcb,0x62,0xd0]
+# CHECK:        ave_u.d         $w30, $w19, $w28                # encoding: [0x7a,0xfc,0x9f,0x90]
+# CHECK:        aver_s.b        $w26, $w16, $w2                 # encoding: [0x7b,0x02,0x86,0x90]
+# CHECK:        aver_s.h        $w31, $w27, $w27                # encoding: [0x7b,0x3b,0xdf,0xd0]
+# CHECK:        aver_s.w        $w28, $w18, $w25                # encoding: [0x7b,0x59,0x97,0x10]
+# CHECK:        aver_s.d        $w29, $w21, $w27                # encoding: [0x7b,0x7b,0xaf,0x50]
+# CHECK:        aver_u.b        $w29, $w26, $w3                 # encoding: [0x7b,0x83,0xd7,0x50]
+# CHECK:        aver_u.h        $w18, $w18, $w9                 # encoding: [0x7b,0xa9,0x94,0x90]
+# CHECK:        aver_u.w        $w17, $w25, $w29                # encoding: [0x7b,0xdd,0xcc,0x50]
+# CHECK:        aver_u.d        $w22, $w22, $w19                # encoding: [0x7b,0xf3,0xb5,0x90]
+# CHECK:        bclr.b          $w2, $w15, $w29                 # encoding: [0x79,0x9d,0x78,0x8d]
+# CHECK:        bclr.h          $w16, $w21, $w28                # encoding: [0x79,0xbc,0xac,0x0d]
+# CHECK:        bclr.w          $w19, $w2, $w9                  # encoding: [0x79,0xc9,0x14,0xcd]
+# CHECK:        bclr.d          $w27, $w31, $w4                 # encoding: [0x79,0xe4,0xfe,0xcd]
+# CHECK:        binsl.b         $w5, $w16, $w24                 # encoding: [0x7b,0x18,0x81,0x4d]
+# CHECK:        binsl.h         $w30, $w5, $w10                 # encoding: [0x7b,0x2a,0x2f,0x8d]
+# CHECK:        binsl.w         $w14, $w15, $w13                # encoding: [0x7b,0x4d,0x7b,0x8d]
+# CHECK:        binsl.d         $w23, $w20, $w12                # encoding: [0x7b,0x6c,0xa5,0xcd]
+# CHECK:        binsr.b         $w22, $w11, $w2                 # encoding: [0x7b,0x82,0x5d,0x8d]
+# CHECK:        binsr.h         $w0, $w26, $w6                  # encoding: [0x7b,0xa6,0xd0,0x0d]
+# CHECK:        binsr.w         $w26, $w3, $w28                 # encoding: [0x7b,0xdc,0x1e,0x8d]
+# CHECK:        binsr.d         $w0, $w0, $w21                  # encoding: [0x7b,0xf5,0x00,0x0d]
+# CHECK:        bneg.b          $w0, $w11, $w24                 # encoding: [0x7a,0x98,0x58,0x0d]
+# CHECK:        bneg.h          $w28, $w16, $w4                 # encoding: [0x7a,0xa4,0x87,0x0d]
+# CHECK:        bneg.w          $w3, $w26, $w19                 # encoding: [0x7a,0xd3,0xd0,0xcd]
+# CHECK:        bneg.d          $w13, $w29, $w15                # encoding: [0x7a,0xef,0xeb,0x4d]
+# CHECK:        bset.b          $w31, $w5, $w31                 # encoding: [0x7a,0x1f,0x2f,0xcd]
+# CHECK:        bset.h          $w14, $w12, $w6                 # encoding: [0x7a,0x26,0x63,0x8d]
+# CHECK:        bset.w          $w31, $w9, $w12                 # encoding: [0x7a,0x4c,0x4f,0xcd]
+# CHECK:        bset.d          $w5, $w22, $w5                  # encoding: [0x7a,0x65,0xb1,0x4d]
+# CHECK:        ceq.b           $w31, $w31, $w18                # encoding: [0x78,0x12,0xff,0xcf]
+# CHECK:        ceq.h           $w10, $w27, $w9                 # encoding: [0x78,0x29,0xda,0x8f]
+# CHECK:        ceq.w           $w9, $w5, $w14                  # encoding: [0x78,0x4e,0x2a,0x4f]
+# CHECK:        ceq.d           $w5, $w17, $w0                  # encoding: [0x78,0x60,0x89,0x4f]
+# CHECK:        cle_s.b         $w23, $w4, $w9                  # encoding: [0x7a,0x09,0x25,0xcf]
+# CHECK:        cle_s.h         $w22, $w27, $w19                # encoding: [0x7a,0x33,0xdd,0x8f]
+# CHECK:        cle_s.w         $w30, $w26, $w10                # encoding: [0x7a,0x4a,0xd7,0x8f]
+# CHECK:        cle_s.d         $w18, $w5, $w10                 # encoding: [0x7a,0x6a,0x2c,0x8f]
+# CHECK:        cle_u.b         $w1, $w25, $w0                  # encoding: [0x7a,0x80,0xc8,0x4f]
+# CHECK:        cle_u.h         $w7, $w0, $w29                  # encoding: [0x7a,0xbd,0x01,0xcf]
+# CHECK:        cle_u.w         $w25, $w18, $w1                 # encoding: [0x7a,0xc1,0x96,0x4f]
+# CHECK:        cle_u.d         $w6, $w0, $w30                  # encoding: [0x7a,0xfe,0x01,0x8f]
+# CHECK:        clt_s.b         $w25, $w2, $w21                 # encoding: [0x79,0x15,0x16,0x4f]
+# CHECK:        clt_s.h         $w2, $w19, $w9                  # encoding: [0x79,0x29,0x98,0x8f]
+# CHECK:        clt_s.w         $w23, $w8, $w16                 # encoding: [0x79,0x50,0x45,0xcf]
+# CHECK:        clt_s.d         $w7, $w30, $w12                 # encoding: [0x79,0x6c,0xf1,0xcf]
+# CHECK:        clt_u.b         $w2, $w31, $w13                 # encoding: [0x79,0x8d,0xf8,0x8f]
+# CHECK:        clt_u.h         $w16, $w31, $w23                # encoding: [0x79,0xb7,0xfc,0x0f]
+# CHECK:        clt_u.w         $w3, $w24, $w9                  # encoding: [0x79,0xc9,0xc0,0xcf]
+# CHECK:        clt_u.d         $w7, $w0, $w1                   # encoding: [0x79,0xe1,0x01,0xcf]
+# CHECK:        div_s.b         $w29, $w3, $w18                 # encoding: [0x7a,0x12,0x1f,0x52]
+# CHECK:        div_s.h         $w17, $w16, $w13                # encoding: [0x7a,0x2d,0x84,0x52]
+# CHECK:        div_s.w         $w4, $w25, $w30                 # encoding: [0x7a,0x5e,0xc9,0x12]
+# CHECK:        div_s.d         $w31, $w9, $w20                 # encoding: [0x7a,0x74,0x4f,0xd2]
+# CHECK:        div_u.b         $w6, $w29, $w10                 # encoding: [0x7a,0x8a,0xe9,0x92]
+# CHECK:        div_u.h         $w24, $w21, $w14                # encoding: [0x7a,0xae,0xae,0x12]
+# CHECK:        div_u.w         $w29, $w14, $w25                # encoding: [0x7a,0xd9,0x77,0x52]
+# CHECK:        div_u.d         $w31, $w1, $w21                 # encoding: [0x7a,0xf5,0x0f,0xd2]
+# CHECK:        dotp_s.h        $w23, $w22, $w25                # encoding: [0x78,0x39,0xb5,0xd3]
+# CHECK:        dotp_s.w        $w20, $w14, $w5                 # encoding: [0x78,0x45,0x75,0x13]
+# CHECK:        dotp_s.d        $w17, $w2, $w22                 # encoding: [0x78,0x76,0x14,0x53]
+# CHECK:        dotp_u.h        $w13, $w2, $w6                  # encoding: [0x78,0xa6,0x13,0x53]
+# CHECK:        dotp_u.w        $w15, $w22, $w21                # encoding: [0x78,0xd5,0xb3,0xd3]
+# CHECK:        dotp_u.d        $w4, $w16, $w26                 # encoding: [0x78,0xfa,0x81,0x13]
+# CHECK:        dpadd_s.h       $w1, $w28, $w22                 # encoding: [0x79,0x36,0xe0,0x53]
+# CHECK:        dpadd_s.w       $w10, $w1, $w12                 # encoding: [0x79,0x4c,0x0a,0x93]
+# CHECK:        dpadd_s.d       $w3, $w21, $w27                 # encoding: [0x79,0x7b,0xa8,0xd3]
+# CHECK:        dpadd_u.h       $w17, $w5, $w20                 # encoding: [0x79,0xb4,0x2c,0x53]
+# CHECK:        dpadd_u.w       $w24, $w8, $w16                 # encoding: [0x79,0xd0,0x46,0x13]
+# CHECK:        dpadd_u.d       $w15, $w29, $w16                # encoding: [0x79,0xf0,0xeb,0xd3]
+# CHECK:        dpsub_s.h       $w4, $w11, $w12                 # encoding: [0x7a,0x2c,0x59,0x13]
+# CHECK:        dpsub_s.w       $w4, $w7, $w6                   # encoding: [0x7a,0x46,0x39,0x13]
+# CHECK:        dpsub_s.d       $w31, $w12, $w28                # encoding: [0x7a,0x7c,0x67,0xd3]
+# CHECK:        dpsub_u.h       $w4, $w25, $w17                 # encoding: [0x7a,0xb1,0xc9,0x13]
+# CHECK:        dpsub_u.w       $w19, $w25, $w16                # encoding: [0x7a,0xd0,0xcc,0xd3]
+# CHECK:        dpsub_u.d       $w7, $w10, $w26                 # encoding: [0x7a,0xfa,0x51,0xd3]
+# CHECK:        hadd_s.h        $w28, $w24, $w2                 # encoding: [0x7a,0x22,0xc7,0x15]
+# CHECK:        hadd_s.w        $w24, $w17, $w11                # encoding: [0x7a,0x4b,0x8e,0x15]
+# CHECK:        hadd_s.d        $w17, $w15, $w20                # encoding: [0x7a,0x74,0x7c,0x55]
+# CHECK:        hadd_u.h        $w12, $w29, $w17                # encoding: [0x7a,0xb1,0xeb,0x15]
+# CHECK:        hadd_u.w        $w9, $w5, $w6                   # encoding: [0x7a,0xc6,0x2a,0x55]
+# CHECK:        hadd_u.d        $w1, $w20, $w6                  # encoding: [0x7a,0xe6,0xa0,0x55]
+# CHECK:        hsub_s.h        $w16, $w14, $w29                # encoding: [0x7b,0x3d,0x74,0x15]
+# CHECK:        hsub_s.w        $w9, $w13, $w11                 # encoding: [0x7b,0x4b,0x6a,0x55]
+# CHECK:        hsub_s.d        $w30, $w18, $w14                # encoding: [0x7b,0x6e,0x97,0x95]
+# CHECK:        hsub_u.h        $w7, $w12, $w14                 # encoding: [0x7b,0xae,0x61,0xd5]
+# CHECK:        hsub_u.w        $w21, $w5, $w5                  # encoding: [0x7b,0xc5,0x2d,0x55]
+# CHECK:        hsub_u.d        $w11, $w12, $w31                # encoding: [0x7b,0xff,0x62,0xd5]
+# CHECK:        ilvev.b         $w18, $w16, $w30                # encoding: [0x7b,0x1e,0x84,0x94]
+# CHECK:        ilvev.h         $w14, $w0, $w13                 # encoding: [0x7b,0x2d,0x03,0x94]
+# CHECK:        ilvev.w         $w12, $w25, $w22                # encoding: [0x7b,0x56,0xcb,0x14]
+# CHECK:        ilvev.d         $w30, $w27, $w3                 # encoding: [0x7b,0x63,0xdf,0x94]
+# CHECK:        ilvl.b          $w29, $w3, $w21                 # encoding: [0x7a,0x15,0x1f,0x54]
+# CHECK:        ilvl.h          $w27, $w10, $w17                # encoding: [0x7a,0x31,0x56,0xd4]
+# CHECK:        ilvl.w          $w6, $w1, $w0                   # encoding: [0x7a,0x40,0x09,0x94]
+# CHECK:        ilvl.d          $w3, $w16, $w24                 # encoding: [0x7a,0x78,0x80,0xd4]
+# CHECK:        ilvod.b         $w11, $w5, $w20                 # encoding: [0x7b,0x94,0x2a,0xd4]
+# CHECK:        ilvod.h         $w18, $w13, $w31                # encoding: [0x7b,0xbf,0x6c,0x94]
+# CHECK:        ilvod.w         $w29, $w16, $w24                # encoding: [0x7b,0xd8,0x87,0x54]
+# CHECK:        ilvod.d         $w22, $w12, $w29                # encoding: [0x7b,0xfd,0x65,0x94]
+# CHECK:        ilvr.b          $w4, $w30, $w6                  # encoding: [0x7a,0x86,0xf1,0x14]
+# CHECK:        ilvr.h          $w28, $w19, $w29                # encoding: [0x7a,0xbd,0x9f,0x14]
+# CHECK:        ilvr.w          $w18, $w20, $w21                # encoding: [0x7a,0xd5,0xa4,0x94]
+# CHECK:        ilvr.d          $w23, $w30, $w12                # encoding: [0x7a,0xec,0xf5,0xd4]
+# CHECK:        maddv.b         $w17, $w31, $w29                # encoding: [0x78,0x9d,0xfc,0x52]
+# CHECK:        maddv.h         $w7, $w24, $w9                  # encoding: [0x78,0xa9,0xc1,0xd2]
+# CHECK:        maddv.w         $w22, $w22, $w20                # encoding: [0x78,0xd4,0xb5,0x92]
+# CHECK:        maddv.d         $w30, $w26, $w20                # encoding: [0x78,0xf4,0xd7,0x92]
+# CHECK:        max_a.b         $w23, $w11, $w23                # encoding: [0x7b,0x17,0x5d,0xce]
+# CHECK:        max_a.h         $w20, $w5, $w30                 # encoding: [0x7b,0x3e,0x2d,0x0e]
+# CHECK:        max_a.w         $w7, $w18, $w30                 # encoding: [0x7b,0x5e,0x91,0xce]
+# CHECK:        max_a.d         $w8, $w8, $w31                  # encoding: [0x7b,0x7f,0x42,0x0e]
+# CHECK:        max_s.b         $w10, $w1, $w19                 # encoding: [0x79,0x13,0x0a,0x8e]
+# CHECK:        max_s.h         $w15, $w29, $w17                # encoding: [0x79,0x31,0xeb,0xce]
+# CHECK:        max_s.w         $w15, $w29, $w14                # encoding: [0x79,0x4e,0xeb,0xce]
+# CHECK:        max_s.d         $w25, $w24, $w3                 # encoding: [0x79,0x63,0xc6,0x4e]
+# CHECK:        max_u.b         $w12, $w24, $w5                 # encoding: [0x79,0x85,0xc3,0x0e]
+# CHECK:        max_u.h         $w5, $w6, $w7                   # encoding: [0x79,0xa7,0x31,0x4e]
+# CHECK:        max_u.w         $w16, $w4, $w7                  # encoding: [0x79,0xc7,0x24,0x0e]
+# CHECK:        max_u.d         $w26, $w12, $w24                # encoding: [0x79,0xf8,0x66,0x8e]
+# CHECK:        min_a.b         $w4, $w26, $w1                  # encoding: [0x7b,0x81,0xd1,0x0e]
+# CHECK:        min_a.h         $w12, $w13, $w31                # encoding: [0x7b,0xbf,0x6b,0x0e]
+# CHECK:        min_a.w         $w28, $w20, $w0                 # encoding: [0x7b,0xc0,0xa7,0x0e]
+# CHECK:        min_a.d         $w12, $w20, $w19                # encoding: [0x7b,0xf3,0xa3,0x0e]
+# CHECK:        min_s.b         $w19, $w3, $w14                 # encoding: [0x7a,0x0e,0x1c,0xce]
+# CHECK:        min_s.h         $w27, $w21, $w8                 # encoding: [0x7a,0x28,0xae,0xce]
+# CHECK:        min_s.w         $w0, $w14, $w30                 # encoding: [0x7a,0x5e,0x70,0x0e]
+# CHECK:        min_s.d         $w6, $w8, $w21                  # encoding: [0x7a,0x75,0x41,0x8e]
+# CHECK:        min_u.b         $w22, $w26, $w8                 # encoding: [0x7a,0x88,0xd5,0x8e]
+# CHECK:        min_u.h         $w7, $w27, $w12                 # encoding: [0x7a,0xac,0xd9,0xce]
+# CHECK:        min_u.w         $w8, $w20, $w14                 # encoding: [0x7a,0xce,0xa2,0x0e]
+# CHECK:        min_u.d         $w26, $w14, $w15                # encoding: [0x7a,0xef,0x76,0x8e]
+# CHECK:        mod_s.b         $w18, $w1, $w26                 # encoding: [0x7b,0x1a,0x0c,0x92]
+# CHECK:        mod_s.h         $w31, $w30, $w28                # encoding: [0x7b,0x3c,0xf7,0xd2]
+# CHECK:        mod_s.w         $w2, $w6, $w13                  # encoding: [0x7b,0x4d,0x30,0x92]
+# CHECK:        mod_s.d         $w21, $w27, $w22                # encoding: [0x7b,0x76,0xdd,0x52]
+# CHECK:        mod_u.b         $w16, $w7, $w13                 # encoding: [0x7b,0x8d,0x3c,0x12]
+# CHECK:        mod_u.h         $w24, $w8, $w7                  # encoding: [0x7b,0xa7,0x46,0x12]
+# CHECK:        mod_u.w         $w30, $w2, $w17                 # encoding: [0x7b,0xd1,0x17,0x92]
+# CHECK:        mod_u.d         $w31, $w2, $w25                 # encoding: [0x7b,0xf9,0x17,0xd2]
+# CHECK:        msubv.b         $w14, $w5, $w12                 # encoding: [0x79,0x0c,0x2b,0x92]
+# CHECK:        msubv.h         $w6, $w7, $w30                  # encoding: [0x79,0x3e,0x39,0x92]
+# CHECK:        msubv.w         $w13, $w2, $w21                 # encoding: [0x79,0x55,0x13,0x52]
+# CHECK:        msubv.d         $w16, $w14, $w27                # encoding: [0x79,0x7b,0x74,0x12]
+# CHECK:        mulv.b          $w20, $w3, $w13                 # encoding: [0x78,0x0d,0x1d,0x12]
+# CHECK:        mulv.h          $w27, $w26, $w14                # encoding: [0x78,0x2e,0xd6,0xd2]
+# CHECK:        mulv.w          $w10, $w29, $w3                 # encoding: [0x78,0x43,0xea,0x92]
+# CHECK:        mulv.d          $w7, $w19, $w29                 # encoding: [0x78,0x7d,0x99,0xd2]
+# CHECK:        pckev.b         $w5, $w27, $w7                  # encoding: [0x79,0x07,0xd9,0x54]
+# CHECK:        pckev.h         $w1, $w4, $w27                  # encoding: [0x79,0x3b,0x20,0x54]
+# CHECK:        pckev.w         $w30, $w20, $w0                 # encoding: [0x79,0x40,0xa7,0x94]
+# CHECK:        pckev.d         $w6, $w1, $w15                  # encoding: [0x79,0x6f,0x09,0x94]
+# CHECK:        pckod.b         $w18, $w28, $w30                # encoding: [0x79,0x9e,0xe4,0x94]
+# CHECK:        pckod.h         $w26, $w5, $w8                  # encoding: [0x79,0xa8,0x2e,0x94]
+# CHECK:        pckod.w         $w9, $w4, $w2                   # encoding: [0x79,0xc2,0x22,0x54]
+# CHECK:        pckod.d         $w30, $w22, $w20                # encoding: [0x79,0xf4,0xb7,0x94]
+# CHECK:        sld.b           $w5, $w23[$12]                  # encoding: [0x78,0x0c,0xb9,0x54]
+# CHECK:        sld.h           $w1, $w23[$3]                   # encoding: [0x78,0x23,0xb8,0x54]
+# CHECK:        sld.w           $w20, $w8[$9]                   # encoding: [0x78,0x49,0x45,0x14]
+# CHECK:        sld.d           $w7, $w23[$fp]                  # encoding: [0x78,0x7e,0xb9,0xd4]
+# CHECK:        sll.b           $w3, $w0, $w17                  # encoding: [0x78,0x11,0x00,0xcd]
+# CHECK:        sll.h           $w17, $w27, $w3                 # encoding: [0x78,0x23,0xdc,0x4d]
+# CHECK:        sll.w           $w16, $w7, $w6                  # encoding: [0x78,0x46,0x3c,0x0d]
+# CHECK:        sll.d           $w9, $w0, $w26                  # encoding: [0x78,0x7a,0x02,0x4d]
+# CHECK:        splat.b         $w28, $w1[$1]                   # encoding: [0x78,0x81,0x0f,0x14]
+# CHECK:        splat.h         $w2, $w11[$11]                  # encoding: [0x78,0xab,0x58,0x94]
+# CHECK:        splat.w         $w22, $w0[$11]                  # encoding: [0x78,0xcb,0x05,0x94]
+# CHECK:        splat.d         $w0, $w0[$2]                    # encoding: [0x78,0xe2,0x00,0x14]
+# CHECK:        sra.b           $w28, $w4, $w17                 # encoding: [0x78,0x91,0x27,0x0d]
+# CHECK:        sra.h           $w13, $w9, $w3                  # encoding: [0x78,0xa3,0x4b,0x4d]
+# CHECK:        sra.w           $w27, $w21, $w19                # encoding: [0x78,0xd3,0xae,0xcd]
+# CHECK:        sra.d           $w30, $w8, $w23                 # encoding: [0x78,0xf7,0x47,0x8d]
+# CHECK:        srar.b          $w19, $w18, $w18                # encoding: [0x78,0x92,0x94,0xd5]
+# CHECK:        srar.h          $w7, $w23, $w8                  # encoding: [0x78,0xa8,0xb9,0xd5]
+# CHECK:        srar.w          $w1, $w12, $w2                  # encoding: [0x78,0xc2,0x60,0x55]
+# CHECK:        srar.d          $w21, $w7, $w14                 # encoding: [0x78,0xee,0x3d,0x55]
+# CHECK:        srl.b           $w12, $w3, $w19                 # encoding: [0x79,0x13,0x1b,0x0d]
+# CHECK:        srl.h           $w23, $w31, $w20                # encoding: [0x79,0x34,0xfd,0xcd]
+# CHECK:        srl.w           $w18, $w27, $w11                # encoding: [0x79,0x4b,0xdc,0x8d]
+# CHECK:        srl.d           $w3, $w12, $w26                 # encoding: [0x79,0x7a,0x60,0xcd]
+# CHECK:        srlr.b          $w15, $w21, $w11                # encoding: [0x79,0x0b,0xab,0xd5]
+# CHECK:        srlr.h          $w21, $w13, $w19                # encoding: [0x79,0x33,0x6d,0x55]
+# CHECK:        srlr.w          $w6, $w30, $w3                  # encoding: [0x79,0x43,0xf1,0x95]
+# CHECK:        srlr.d          $w1, $w2, $w14                  # encoding: [0x79,0x6e,0x10,0x55]
+# CHECK:        subs_s.b        $w25, $w15, $w1                 # encoding: [0x78,0x01,0x7e,0x51]
+# CHECK:        subs_s.h        $w28, $w25, $w22                # encoding: [0x78,0x36,0xcf,0x11]
+# CHECK:        subs_s.w        $w10, $w12, $w21                # encoding: [0x78,0x55,0x62,0x91]
+# CHECK:        subs_s.d        $w4, $w20, $w18                 # encoding: [0x78,0x72,0xa1,0x11]
+# CHECK:        subs_u.b        $w21, $w6, $w25                 # encoding: [0x78,0x99,0x35,0x51]
+# CHECK:        subs_u.h        $w3, $w10, $w7                  # encoding: [0x78,0xa7,0x50,0xd1]
+# CHECK:        subs_u.w        $w9, $w15, $w10                 # encoding: [0x78,0xca,0x7a,0x51]
+# CHECK:        subs_u.d        $w7, $w19, $w10                 # encoding: [0x78,0xea,0x99,0xd1]
+# CHECK:        subsus_u.b      $w6, $w7, $w12                  # encoding: [0x79,0x0c,0x39,0x91]
+# CHECK:        subsus_u.h      $w6, $w29, $w19                 # encoding: [0x79,0x33,0xe9,0x91]
+# CHECK:        subsus_u.w      $w7, $w15, $w7                  # encoding: [0x79,0x47,0x79,0xd1]
+# CHECK:        subsus_u.d      $w9, $w3, $w15                  # encoding: [0x79,0x6f,0x1a,0x51]
+# CHECK:        subsuu_s.b      $w22, $w3, $w31                 # encoding: [0x79,0x9f,0x1d,0x91]
+# CHECK:        subsuu_s.h      $w19, $w23, $w22                # encoding: [0x79,0xb6,0xbc,0xd1]
+# CHECK:        subsuu_s.w      $w9, $w10, $w13                 # encoding: [0x79,0xcd,0x52,0x51]
+# CHECK:        subsuu_s.d      $w5, $w6, $w0                   # encoding: [0x79,0xe0,0x31,0x51]
+# CHECK:        subv.b          $w6, $w13, $w19                 # encoding: [0x78,0x93,0x69,0x8e]
+# CHECK:        subv.h          $w4, $w25, $w12                 # encoding: [0x78,0xac,0xc9,0x0e]
+# CHECK:        subv.w          $w27, $w27, $w11                # encoding: [0x78,0xcb,0xde,0xce]
+# CHECK:        subv.d          $w9, $w24, $w10                 # encoding: [0x78,0xea,0xc2,0x4e]
+# CHECK:        vshf.b          $w3, $w16, $w5                  # encoding: [0x78,0x05,0x80,0xd5]
+# CHECK:        vshf.h          $w20, $w19, $w8                 # encoding: [0x78,0x28,0x9d,0x15]
+# CHECK:        vshf.w          $w16, $w30, $w25                # encoding: [0x78,0x59,0xf4,0x15]
+# CHECK:        vshf.d          $w19, $w11, $w15                # encoding: [0x78,0x6f,0x5c,0xd5]
+
+# CHECKOBJDUMP:        add_a.b         $w26, $w9, $w4
+# CHECKOBJDUMP:        add_a.h         $w23, $w27, $w31
+# CHECKOBJDUMP:        add_a.w         $w11, $w6, $w22
+# CHECKOBJDUMP:        add_a.d         $w6, $w10, $w0
+# CHECKOBJDUMP:        adds_a.b        $w19, $w24, $w19
+# CHECKOBJDUMP:        adds_a.h        $w25, $w6, $w4
+# CHECKOBJDUMP:        adds_a.w        $w25, $w17, $w27
+# CHECKOBJDUMP:        adds_a.d        $w15, $w18, $w26
+# CHECKOBJDUMP:        adds_s.b        $w29, $w11, $w19
+# CHECKOBJDUMP:        adds_s.h        $w5, $w23, $w26
+# CHECKOBJDUMP:        adds_s.w        $w16, $w14, $w13
+# CHECKOBJDUMP:        adds_s.d        $w2, $w14, $w28
+# CHECKOBJDUMP:        adds_u.b        $w3, $w17, $w14
+# CHECKOBJDUMP:        adds_u.h        $w10, $w30, $w4
+# CHECKOBJDUMP:        adds_u.w        $w15, $w18, $w20
+# CHECKOBJDUMP:        adds_u.d        $w30, $w10, $w9
+# CHECKOBJDUMP:        addv.b          $w24, $w20, $w21
+# CHECKOBJDUMP:        addv.h          $w4, $w13, $w27
+# CHECKOBJDUMP:        addv.w          $w19, $w11, $w14
+# CHECKOBJDUMP:        addv.d          $w2, $w21, $w31
+# CHECKOBJDUMP:        asub_s.b        $w23, $w16, $w3
+# CHECKOBJDUMP:        asub_s.h        $w22, $w17, $w25
+# CHECKOBJDUMP:        asub_s.w        $w24, $w1, $w9
+# CHECKOBJDUMP:        asub_s.d        $w13, $w12, $w12
+# CHECKOBJDUMP:        asub_u.b        $w10, $w29, $w11
+# CHECKOBJDUMP:        asub_u.h        $w18, $w9, $w15
+# CHECKOBJDUMP:        asub_u.w        $w10, $w19, $w31
+# CHECKOBJDUMP:        asub_u.d        $w17, $w10, $w0
+# CHECKOBJDUMP:        ave_s.b         $w2, $w5, $w1
+# CHECKOBJDUMP:        ave_s.h         $w16, $w19, $w9
+# CHECKOBJDUMP:        ave_s.w         $w17, $w31, $w5
+# CHECKOBJDUMP:        ave_s.d         $w27, $w25, $w10
+# CHECKOBJDUMP:        ave_u.b         $w16, $w19, $w9
+# CHECKOBJDUMP:        ave_u.h         $w28, $w28, $w11
+# CHECKOBJDUMP:        ave_u.w         $w11, $w12, $w11
+# CHECKOBJDUMP:        ave_u.d         $w30, $w19, $w28
+# CHECKOBJDUMP:        aver_s.b        $w26, $w16, $w2
+# CHECKOBJDUMP:        aver_s.h        $w31, $w27, $w27
+# CHECKOBJDUMP:        aver_s.w        $w28, $w18, $w25
+# CHECKOBJDUMP:        aver_s.d        $w29, $w21, $w27
+# CHECKOBJDUMP:        aver_u.b        $w29, $w26, $w3
+# CHECKOBJDUMP:        aver_u.h        $w18, $w18, $w9
+# CHECKOBJDUMP:        aver_u.w        $w17, $w25, $w29
+# CHECKOBJDUMP:        aver_u.d        $w22, $w22, $w19
+# CHECKOBJDUMP:        bclr.b          $w2, $w15, $w29
+# CHECKOBJDUMP:        bclr.h          $w16, $w21, $w28
+# CHECKOBJDUMP:        bclr.w          $w19, $w2, $w9
+# CHECKOBJDUMP:        bclr.d          $w27, $w31, $w4
+# CHECKOBJDUMP:        binsl.b         $w5, $w16, $w24
+# CHECKOBJDUMP:        binsl.h         $w30, $w5, $w10
+# CHECKOBJDUMP:        binsl.w         $w14, $w15, $w13
+# CHECKOBJDUMP:        binsl.d         $w23, $w20, $w12
+# CHECKOBJDUMP:        binsr.b         $w22, $w11, $w2
+# CHECKOBJDUMP:        binsr.h         $w0, $w26, $w6
+# CHECKOBJDUMP:        binsr.w         $w26, $w3, $w28
+# CHECKOBJDUMP:        binsr.d         $w0, $w0, $w21
+# CHECKOBJDUMP:        bneg.b          $w0, $w11, $w24
+# CHECKOBJDUMP:        bneg.h          $w28, $w16, $w4
+# CHECKOBJDUMP:        bneg.w          $w3, $w26, $w19
+# CHECKOBJDUMP:        bneg.d          $w13, $w29, $w15
+# CHECKOBJDUMP:        bset.b          $w31, $w5, $w31
+# CHECKOBJDUMP:        bset.h          $w14, $w12, $w6
+# CHECKOBJDUMP:        bset.w          $w31, $w9, $w12
+# CHECKOBJDUMP:        bset.d          $w5, $w22, $w5
+# CHECKOBJDUMP:        ceq.b           $w31, $w31, $w18
+# CHECKOBJDUMP:        ceq.h           $w10, $w27, $w9
+# CHECKOBJDUMP:        ceq.w           $w9, $w5, $w14
+# CHECKOBJDUMP:        ceq.d           $w5, $w17, $w0
+# CHECKOBJDUMP:        cle_s.b         $w23, $w4, $w9
+# CHECKOBJDUMP:        cle_s.h         $w22, $w27, $w19
+# CHECKOBJDUMP:        cle_s.w         $w30, $w26, $w10
+# CHECKOBJDUMP:        cle_s.d         $w18, $w5, $w10
+# CHECKOBJDUMP:        cle_u.b         $w1, $w25, $w0
+# CHECKOBJDUMP:        cle_u.h         $w7, $w0, $w29
+# CHECKOBJDUMP:        cle_u.w         $w25, $w18, $w1
+# CHECKOBJDUMP:        cle_u.d         $w6, $w0, $w30
+# CHECKOBJDUMP:        clt_s.b         $w25, $w2, $w21
+# CHECKOBJDUMP:        clt_s.h         $w2, $w19, $w9
+# CHECKOBJDUMP:        clt_s.w         $w23, $w8, $w16
+# CHECKOBJDUMP:        clt_s.d         $w7, $w30, $w12
+# CHECKOBJDUMP:        clt_u.b         $w2, $w31, $w13
+# CHECKOBJDUMP:        clt_u.h         $w16, $w31, $w23
+# CHECKOBJDUMP:        clt_u.w         $w3, $w24, $w9
+# CHECKOBJDUMP:        clt_u.d         $w7, $w0, $w1
+# CHECKOBJDUMP:        div_s.b         $w29, $w3, $w18
+# CHECKOBJDUMP:        div_s.h         $w17, $w16, $w13
+# CHECKOBJDUMP:        div_s.w         $w4, $w25, $w30
+# CHECKOBJDUMP:        div_s.d         $w31, $w9, $w20
+# CHECKOBJDUMP:        div_u.b         $w6, $w29, $w10
+# CHECKOBJDUMP:        div_u.h         $w24, $w21, $w14
+# CHECKOBJDUMP:        div_u.w         $w29, $w14, $w25
+# CHECKOBJDUMP:        div_u.d         $w31, $w1, $w21
+# CHECKOBJDUMP:        dotp_s.h        $w23, $w22, $w25
+# CHECKOBJDUMP:        dotp_s.w        $w20, $w14, $w5
+# CHECKOBJDUMP:        dotp_s.d        $w17, $w2, $w22
+# CHECKOBJDUMP:        dotp_u.h        $w13, $w2, $w6
+# CHECKOBJDUMP:        dotp_u.w        $w15, $w22, $w21
+# CHECKOBJDUMP:        dotp_u.d        $w4, $w16, $w26
+# CHECKOBJDUMP:        dpadd_s.h       $w1, $w28, $w22
+# CHECKOBJDUMP:        dpadd_s.w       $w10, $w1, $w12
+# CHECKOBJDUMP:        dpadd_s.d       $w3, $w21, $w27
+# CHECKOBJDUMP:        dpadd_u.h       $w17, $w5, $w20
+# CHECKOBJDUMP:        dpadd_u.w       $w24, $w8, $w16
+# CHECKOBJDUMP:        dpadd_u.d       $w15, $w29, $w16
+# CHECKOBJDUMP:        dpsub_s.h       $w4, $w11, $w12
+# CHECKOBJDUMP:        dpsub_s.w       $w4, $w7, $w6
+# CHECKOBJDUMP:        dpsub_s.d       $w31, $w12, $w28
+# CHECKOBJDUMP:        dpsub_u.h       $w4, $w25, $w17
+# CHECKOBJDUMP:        dpsub_u.w       $w19, $w25, $w16
+# CHECKOBJDUMP:        dpsub_u.d       $w7, $w10, $w26
+# CHECKOBJDUMP:        hadd_s.h        $w28, $w24, $w2
+# CHECKOBJDUMP:        hadd_s.w        $w24, $w17, $w11
+# CHECKOBJDUMP:        hadd_s.d        $w17, $w15, $w20
+# CHECKOBJDUMP:        hadd_u.h        $w12, $w29, $w17
+# CHECKOBJDUMP:        hadd_u.w        $w9, $w5, $w6
+# CHECKOBJDUMP:        hadd_u.d        $w1, $w20, $w6
+# CHECKOBJDUMP:        hsub_s.h        $w16, $w14, $w29
+# CHECKOBJDUMP:        hsub_s.w        $w9, $w13, $w11
+# CHECKOBJDUMP:        hsub_s.d        $w30, $w18, $w14
+# CHECKOBJDUMP:        hsub_u.h        $w7, $w12, $w14
+# CHECKOBJDUMP:        hsub_u.w        $w21, $w5, $w5
+# CHECKOBJDUMP:        hsub_u.d        $w11, $w12, $w31
+# CHECKOBJDUMP:        ilvev.b         $w18, $w16, $w30
+# CHECKOBJDUMP:        ilvev.h         $w14, $w0, $w13
+# CHECKOBJDUMP:        ilvev.w         $w12, $w25, $w22
+# CHECKOBJDUMP:        ilvev.d         $w30, $w27, $w3
+# CHECKOBJDUMP:        ilvl.b          $w29, $w3, $w21
+# CHECKOBJDUMP:        ilvl.h          $w27, $w10, $w17
+# CHECKOBJDUMP:        ilvl.w          $w6, $w1, $w0
+# CHECKOBJDUMP:        ilvl.d          $w3, $w16, $w24
+# CHECKOBJDUMP:        ilvod.b         $w11, $w5, $w20
+# CHECKOBJDUMP:        ilvod.h         $w18, $w13, $w31
+# CHECKOBJDUMP:        ilvod.w         $w29, $w16, $w24
+# CHECKOBJDUMP:        ilvod.d         $w22, $w12, $w29
+# CHECKOBJDUMP:        ilvr.b          $w4, $w30, $w6
+# CHECKOBJDUMP:        ilvr.h          $w28, $w19, $w29
+# CHECKOBJDUMP:        ilvr.w          $w18, $w20, $w21
+# CHECKOBJDUMP:        ilvr.d          $w23, $w30, $w12
+# CHECKOBJDUMP:        maddv.b         $w17, $w31, $w29
+# CHECKOBJDUMP:        maddv.h         $w7, $w24, $w9
+# CHECKOBJDUMP:        maddv.w         $w22, $w22, $w20
+# CHECKOBJDUMP:        maddv.d         $w30, $w26, $w20
+# CHECKOBJDUMP:        max_a.b         $w23, $w11, $w23
+# CHECKOBJDUMP:        max_a.h         $w20, $w5, $w30
+# CHECKOBJDUMP:        max_a.w         $w7, $w18, $w30
+# CHECKOBJDUMP:        max_a.d         $w8, $w8, $w31
+# CHECKOBJDUMP:        max_s.b         $w10, $w1, $w19
+# CHECKOBJDUMP:        max_s.h         $w15, $w29, $w17
+# CHECKOBJDUMP:        max_s.w         $w15, $w29, $w14
+# CHECKOBJDUMP:        max_s.d         $w25, $w24, $w3
+# CHECKOBJDUMP:        max_u.b         $w12, $w24, $w5
+# CHECKOBJDUMP:        max_u.h         $w5, $w6, $w7
+# CHECKOBJDUMP:        max_u.w         $w16, $w4, $w7
+# CHECKOBJDUMP:        max_u.d         $w26, $w12, $w24
+# CHECKOBJDUMP:        min_a.b         $w4, $w26, $w1
+# CHECKOBJDUMP:        min_a.h         $w12, $w13, $w31
+# CHECKOBJDUMP:        min_a.w         $w28, $w20, $w0
+# CHECKOBJDUMP:        min_a.d         $w12, $w20, $w19
+# CHECKOBJDUMP:        min_s.b         $w19, $w3, $w14
+# CHECKOBJDUMP:        min_s.h         $w27, $w21, $w8
+# CHECKOBJDUMP:        min_s.w         $w0, $w14, $w30
+# CHECKOBJDUMP:        min_s.d         $w6, $w8, $w21
+# CHECKOBJDUMP:        min_u.b         $w22, $w26, $w8
+# CHECKOBJDUMP:        min_u.h         $w7, $w27, $w12
+# CHECKOBJDUMP:        min_u.w         $w8, $w20, $w14
+# CHECKOBJDUMP:        min_u.d         $w26, $w14, $w15
+# CHECKOBJDUMP:        mod_s.b         $w18, $w1, $w26
+# CHECKOBJDUMP:        mod_s.h         $w31, $w30, $w28
+# CHECKOBJDUMP:        mod_s.w         $w2, $w6, $w13
+# CHECKOBJDUMP:        mod_s.d         $w21, $w27, $w22
+# CHECKOBJDUMP:        mod_u.b         $w16, $w7, $w13
+# CHECKOBJDUMP:        mod_u.h         $w24, $w8, $w7
+# CHECKOBJDUMP:        mod_u.w         $w30, $w2, $w17
+# CHECKOBJDUMP:        mod_u.d         $w31, $w2, $w25
+# CHECKOBJDUMP:        msubv.b         $w14, $w5, $w12
+# CHECKOBJDUMP:        msubv.h         $w6, $w7, $w30
+# CHECKOBJDUMP:        msubv.w         $w13, $w2, $w21
+# CHECKOBJDUMP:        msubv.d         $w16, $w14, $w27
+# CHECKOBJDUMP:        mulv.b          $w20, $w3, $w13
+# CHECKOBJDUMP:        mulv.h          $w27, $w26, $w14
+# CHECKOBJDUMP:        mulv.w          $w10, $w29, $w3
+# CHECKOBJDUMP:        mulv.d          $w7, $w19, $w29
+# CHECKOBJDUMP:        pckev.b         $w5, $w27, $w7
+# CHECKOBJDUMP:        pckev.h         $w1, $w4, $w27
+# CHECKOBJDUMP:        pckev.w         $w30, $w20, $w0
+# CHECKOBJDUMP:        pckev.d         $w6, $w1, $w15
+# CHECKOBJDUMP:        pckod.b         $w18, $w28, $w30
+# CHECKOBJDUMP:        pckod.h         $w26, $w5, $w8
+# CHECKOBJDUMP:        pckod.w         $w9, $w4, $w2
+# CHECKOBJDUMP:        pckod.d         $w30, $w22, $w20
+# CHECKOBJDUMP:        sld.b           $w5, $w23[$12]
+# CHECKOBJDUMP:        sld.h           $w1, $w23[$3]
+# CHECKOBJDUMP:        sld.w           $w20, $w8[$9]
+# CHECKOBJDUMP:        sld.d           $w7, $w23[$fp]
+# CHECKOBJDUMP:        sll.b           $w3, $w0, $w17
+# CHECKOBJDUMP:        sll.h           $w17, $w27, $w3
+# CHECKOBJDUMP:        sll.w           $w16, $w7, $w6
+# CHECKOBJDUMP:        sll.d           $w9, $w0, $w26
+# CHECKOBJDUMP:        splat.b         $w28, $w1[$1]
+# CHECKOBJDUMP:        splat.h         $w2, $w11[$11]
+# CHECKOBJDUMP:        splat.w         $w22, $w0[$11]
+# CHECKOBJDUMP:        splat.d         $w0, $w0[$2]
+# CHECKOBJDUMP:        sra.b           $w28, $w4, $w17
+# CHECKOBJDUMP:        sra.h           $w13, $w9, $w3
+# CHECKOBJDUMP:        sra.w           $w27, $w21, $w19
+# CHECKOBJDUMP:        sra.d           $w30, $w8, $w23
+# CHECKOBJDUMP:        srar.b          $w19, $w18, $w18
+# CHECKOBJDUMP:        srar.h          $w7, $w23, $w8
+# CHECKOBJDUMP:        srar.w          $w1, $w12, $w2
+# CHECKOBJDUMP:        srar.d          $w21, $w7, $w14
+# CHECKOBJDUMP:        srl.b           $w12, $w3, $w19
+# CHECKOBJDUMP:        srl.h           $w23, $w31, $w20
+# CHECKOBJDUMP:        srl.w           $w18, $w27, $w11
+# CHECKOBJDUMP:        srl.d           $w3, $w12, $w26
+# CHECKOBJDUMP:        srlr.b          $w15, $w21, $w11
+# CHECKOBJDUMP:        srlr.h          $w21, $w13, $w19
+# CHECKOBJDUMP:        srlr.w          $w6, $w30, $w3
+# CHECKOBJDUMP:        srlr.d          $w1, $w2, $w14
+# CHECKOBJDUMP:        subs_s.b        $w25, $w15, $w1
+# CHECKOBJDUMP:        subs_s.h        $w28, $w25, $w22
+# CHECKOBJDUMP:        subs_s.w        $w10, $w12, $w21
+# CHECKOBJDUMP:        subs_s.d        $w4, $w20, $w18
+# CHECKOBJDUMP:        subs_u.b        $w21, $w6, $w25
+# CHECKOBJDUMP:        subs_u.h        $w3, $w10, $w7
+# CHECKOBJDUMP:        subs_u.w        $w9, $w15, $w10
+# CHECKOBJDUMP:        subs_u.d        $w7, $w19, $w10
+# CHECKOBJDUMP:        subsus_u.b      $w6, $w7, $w12
+# CHECKOBJDUMP:        subsus_u.h      $w6, $w29, $w19
+# CHECKOBJDUMP:        subsus_u.w      $w7, $w15, $w7
+# CHECKOBJDUMP:        subsus_u.d      $w9, $w3, $w15
+# CHECKOBJDUMP:        subsuu_s.b      $w22, $w3, $w31
+# CHECKOBJDUMP:        subsuu_s.h      $w19, $w23, $w22
+# CHECKOBJDUMP:        subsuu_s.w      $w9, $w10, $w13
+# CHECKOBJDUMP:        subsuu_s.d      $w5, $w6, $w0
+# CHECKOBJDUMP:        subv.b          $w6, $w13, $w19
+# CHECKOBJDUMP:        subv.h          $w4, $w25, $w12
+# CHECKOBJDUMP:        subv.w          $w27, $w27, $w11
+# CHECKOBJDUMP:        subv.d          $w9, $w24, $w10
+# CHECKOBJDUMP:        vshf.b          $w3, $w16, $w5
+# CHECKOBJDUMP:        vshf.h          $w20, $w19, $w8
+# CHECKOBJDUMP:        vshf.w          $w16, $w30, $w25
+# CHECKOBJDUMP:        vshf.d          $w19, $w11, $w15
+
+                add_a.b         $w26, $w9, $w4
+                add_a.h         $w23, $w27, $w31
+                add_a.w         $w11, $w6, $w22
+                add_a.d         $w6, $w10, $w0
+                adds_a.b        $w19, $w24, $w19
+                adds_a.h        $w25, $w6, $w4
+                adds_a.w        $w25, $w17, $w27
+                adds_a.d        $w15, $w18, $w26
+                adds_s.b        $w29, $w11, $w19
+                adds_s.h        $w5, $w23, $w26
+                adds_s.w        $w16, $w14, $w13
+                adds_s.d        $w2, $w14, $w28
+                adds_u.b        $w3, $w17, $w14
+                adds_u.h        $w10, $w30, $w4
+                adds_u.w        $w15, $w18, $w20
+                adds_u.d        $w30, $w10, $w9
+                addv.b          $w24, $w20, $w21
+                addv.h          $w4, $w13, $w27
+                addv.w          $w19, $w11, $w14
+                addv.d          $w2, $w21, $w31
+                asub_s.b        $w23, $w16, $w3
+                asub_s.h        $w22, $w17, $w25
+                asub_s.w        $w24, $w1, $w9
+                asub_s.d        $w13, $w12, $w12
+                asub_u.b        $w10, $w29, $w11
+                asub_u.h        $w18, $w9, $w15
+                asub_u.w        $w10, $w19, $w31
+                asub_u.d        $w17, $w10, $w0
+                ave_s.b         $w2, $w5, $w1
+                ave_s.h         $w16, $w19, $w9
+                ave_s.w         $w17, $w31, $w5
+                ave_s.d         $w27, $w25, $w10
+                ave_u.b         $w16, $w19, $w9
+                ave_u.h         $w28, $w28, $w11
+                ave_u.w         $w11, $w12, $w11
+                ave_u.d         $w30, $w19, $w28
+                aver_s.b        $w26, $w16, $w2
+                aver_s.h        $w31, $w27, $w27
+                aver_s.w        $w28, $w18, $w25
+                aver_s.d        $w29, $w21, $w27
+                aver_u.b        $w29, $w26, $w3
+                aver_u.h        $w18, $w18, $w9
+                aver_u.w        $w17, $w25, $w29
+                aver_u.d        $w22, $w22, $w19
+                bclr.b          $w2, $w15, $w29
+                bclr.h          $w16, $w21, $w28
+                bclr.w          $w19, $w2, $w9
+                bclr.d          $w27, $w31, $w4
+                binsl.b         $w5, $w16, $w24
+                binsl.h         $w30, $w5, $w10
+                binsl.w         $w14, $w15, $w13
+                binsl.d         $w23, $w20, $w12
+                binsr.b         $w22, $w11, $w2
+                binsr.h         $w0, $w26, $w6
+                binsr.w         $w26, $w3, $w28
+                binsr.d         $w0, $w0, $w21
+                bneg.b          $w0, $w11, $w24
+                bneg.h          $w28, $w16, $w4
+                bneg.w          $w3, $w26, $w19
+                bneg.d          $w13, $w29, $w15
+                bset.b          $w31, $w5, $w31
+                bset.h          $w14, $w12, $w6
+                bset.w          $w31, $w9, $w12
+                bset.d          $w5, $w22, $w5
+                ceq.b           $w31, $w31, $w18
+                ceq.h           $w10, $w27, $w9
+                ceq.w           $w9, $w5, $w14
+                ceq.d           $w5, $w17, $w0
+                cle_s.b         $w23, $w4, $w9
+                cle_s.h         $w22, $w27, $w19
+                cle_s.w         $w30, $w26, $w10
+                cle_s.d         $w18, $w5, $w10
+                cle_u.b         $w1, $w25, $w0
+                cle_u.h         $w7, $w0, $w29
+                cle_u.w         $w25, $w18, $w1
+                cle_u.d         $w6, $w0, $w30
+                clt_s.b         $w25, $w2, $w21
+                clt_s.h         $w2, $w19, $w9
+                clt_s.w         $w23, $w8, $w16
+                clt_s.d         $w7, $w30, $w12
+                clt_u.b         $w2, $w31, $w13
+                clt_u.h         $w16, $w31, $w23
+                clt_u.w         $w3, $w24, $w9
+                clt_u.d         $w7, $w0, $w1
+                div_s.b         $w29, $w3, $w18
+                div_s.h         $w17, $w16, $w13
+                div_s.w         $w4, $w25, $w30
+                div_s.d         $w31, $w9, $w20
+                div_u.b         $w6, $w29, $w10
+                div_u.h         $w24, $w21, $w14
+                div_u.w         $w29, $w14, $w25
+                div_u.d         $w31, $w1, $w21
+                dotp_s.h        $w23, $w22, $w25
+                dotp_s.w        $w20, $w14, $w5
+                dotp_s.d        $w17, $w2, $w22
+                dotp_u.h        $w13, $w2, $w6
+                dotp_u.w        $w15, $w22, $w21
+                dotp_u.d        $w4, $w16, $w26
+                dpadd_s.h       $w1, $w28, $w22
+                dpadd_s.w       $w10, $w1, $w12
+                dpadd_s.d       $w3, $w21, $w27
+                dpadd_u.h       $w17, $w5, $w20
+                dpadd_u.w       $w24, $w8, $w16
+                dpadd_u.d       $w15, $w29, $w16
+                dpsub_s.h       $w4, $w11, $w12
+                dpsub_s.w       $w4, $w7, $w6
+                dpsub_s.d       $w31, $w12, $w28
+                dpsub_u.h       $w4, $w25, $w17
+                dpsub_u.w       $w19, $w25, $w16
+                dpsub_u.d       $w7, $w10, $w26
+                hadd_s.h        $w28, $w24, $w2
+                hadd_s.w        $w24, $w17, $w11
+                hadd_s.d        $w17, $w15, $w20
+                hadd_u.h        $w12, $w29, $w17
+                hadd_u.w        $w9, $w5, $w6
+                hadd_u.d        $w1, $w20, $w6
+                hsub_s.h        $w16, $w14, $w29
+                hsub_s.w        $w9, $w13, $w11
+                hsub_s.d        $w30, $w18, $w14
+                hsub_u.h        $w7, $w12, $w14
+                hsub_u.w        $w21, $w5, $w5
+                hsub_u.d        $w11, $w12, $w31
+                ilvev.b         $w18, $w16, $w30
+                ilvev.h         $w14, $w0, $w13
+                ilvev.w         $w12, $w25, $w22
+                ilvev.d         $w30, $w27, $w3
+                ilvl.b          $w29, $w3, $w21
+                ilvl.h          $w27, $w10, $w17
+                ilvl.w          $w6, $w1, $w0
+                ilvl.d          $w3, $w16, $w24
+                ilvod.b         $w11, $w5, $w20
+                ilvod.h         $w18, $w13, $w31
+                ilvod.w         $w29, $w16, $w24
+                ilvod.d         $w22, $w12, $w29
+                ilvr.b          $w4, $w30, $w6
+                ilvr.h          $w28, $w19, $w29
+                ilvr.w          $w18, $w20, $w21
+                ilvr.d          $w23, $w30, $w12
+                maddv.b         $w17, $w31, $w29
+                maddv.h         $w7, $w24, $w9
+                maddv.w         $w22, $w22, $w20
+                maddv.d         $w30, $w26, $w20
+                max_a.b         $w23, $w11, $w23
+                max_a.h         $w20, $w5, $w30
+                max_a.w         $w7, $w18, $w30
+                max_a.d         $w8, $w8, $w31
+                max_s.b         $w10, $w1, $w19
+                max_s.h         $w15, $w29, $w17
+                max_s.w         $w15, $w29, $w14
+                max_s.d         $w25, $w24, $w3
+                max_u.b         $w12, $w24, $w5
+                max_u.h         $w5, $w6, $w7
+                max_u.w         $w16, $w4, $w7
+                max_u.d         $w26, $w12, $w24
+                min_a.b         $w4, $w26, $w1
+                min_a.h         $w12, $w13, $w31
+                min_a.w         $w28, $w20, $w0
+                min_a.d         $w12, $w20, $w19
+                min_s.b         $w19, $w3, $w14
+                min_s.h         $w27, $w21, $w8
+                min_s.w         $w0, $w14, $w30
+                min_s.d         $w6, $w8, $w21
+                min_u.b         $w22, $w26, $w8
+                min_u.h         $w7, $w27, $w12
+                min_u.w         $w8, $w20, $w14
+                min_u.d         $w26, $w14, $w15
+                mod_s.b         $w18, $w1, $w26
+                mod_s.h         $w31, $w30, $w28
+                mod_s.w         $w2, $w6, $w13
+                mod_s.d         $w21, $w27, $w22
+                mod_u.b         $w16, $w7, $w13
+                mod_u.h         $w24, $w8, $w7
+                mod_u.w         $w30, $w2, $w17
+                mod_u.d         $w31, $w2, $w25
+                msubv.b         $w14, $w5, $w12
+                msubv.h         $w6, $w7, $w30
+                msubv.w         $w13, $w2, $w21
+                msubv.d         $w16, $w14, $w27
+                mulv.b          $w20, $w3, $w13
+                mulv.h          $w27, $w26, $w14
+                mulv.w          $w10, $w29, $w3
+                mulv.d          $w7, $w19, $w29
+                pckev.b         $w5, $w27, $w7
+                pckev.h         $w1, $w4, $w27
+                pckev.w         $w30, $w20, $w0
+                pckev.d         $w6, $w1, $w15
+                pckod.b         $w18, $w28, $w30
+                pckod.h         $w26, $w5, $w8
+                pckod.w         $w9, $w4, $w2
+                pckod.d         $w30, $w22, $w20
+                sld.b           $w5, $w23[$12]
+                sld.h           $w1, $w23[$3]
+                sld.w           $w20, $w8[$9]
+                sld.d           $w7, $w23[$30]
+                sll.b           $w3, $w0, $w17
+                sll.h           $w17, $w27, $w3
+                sll.w           $w16, $w7, $w6
+                sll.d           $w9, $w0, $w26
+                splat.b         $w28, $w1[$1]
+                splat.h         $w2, $w11[$11]
+                splat.w         $w22, $w0[$11]
+                splat.d         $w0, $w0[$2]
+                sra.b           $w28, $w4, $w17
+                sra.h           $w13, $w9, $w3
+                sra.w           $w27, $w21, $w19
+                sra.d           $w30, $w8, $w23
+                srar.b          $w19, $w18, $w18
+                srar.h          $w7, $w23, $w8
+                srar.w          $w1, $w12, $w2
+                srar.d          $w21, $w7, $w14
+                srl.b           $w12, $w3, $w19
+                srl.h           $w23, $w31, $w20
+                srl.w           $w18, $w27, $w11
+                srl.d           $w3, $w12, $w26
+                srlr.b          $w15, $w21, $w11
+                srlr.h          $w21, $w13, $w19
+                srlr.w          $w6, $w30, $w3
+                srlr.d          $w1, $w2, $w14
+                subs_s.b        $w25, $w15, $w1
+                subs_s.h        $w28, $w25, $w22
+                subs_s.w        $w10, $w12, $w21
+                subs_s.d        $w4, $w20, $w18
+                subs_u.b        $w21, $w6, $w25
+                subs_u.h        $w3, $w10, $w7
+                subs_u.w        $w9, $w15, $w10
+                subs_u.d        $w7, $w19, $w10
+                subsus_u.b      $w6, $w7, $w12
+                subsus_u.h      $w6, $w29, $w19
+                subsus_u.w      $w7, $w15, $w7
+                subsus_u.d      $w9, $w3, $w15
+                subsuu_s.b      $w22, $w3, $w31
+                subsuu_s.h      $w19, $w23, $w22
+                subsuu_s.w      $w9, $w10, $w13
+                subsuu_s.d      $w5, $w6, $w0
+                subv.b          $w6, $w13, $w19
+                subv.h          $w4, $w25, $w12
+                subv.w          $w27, $w27, $w11
+                subv.d          $w9, $w24, $w10
+                vshf.b          $w3, $w16, $w5
+                vshf.h          $w20, $w19, $w8
+                vshf.w          $w16, $w30, $w25
+                vshf.d          $w19, $w11, $w15
diff --git a/test/MC/Mips/msa/test_3rf.s b/test/MC/Mips/msa/test_3rf.s
new file mode 100644
index 0000000..f45557e
--- /dev/null
+++ b/test/MC/Mips/msa/test_3rf.s
@@ -0,0 +1,252 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        fadd.w          $w28, $w19, $w28        # encoding: [0x78,0x1c,0x9f,0x1b]
+# CHECK:        fadd.d          $w13, $w2, $w29         # encoding: [0x78,0x3d,0x13,0x5b]
+# CHECK:        fcaf.w          $w14, $w11, $w25        # encoding: [0x78,0x19,0x5b,0x9a]
+# CHECK:        fcaf.d          $w1, $w1, $w19          # encoding: [0x78,0x33,0x08,0x5a]
+# CHECK:        fceq.w          $w1, $w23, $w16         # encoding: [0x78,0x90,0xb8,0x5a]
+# CHECK:        fceq.d          $w0, $w8, $w16          # encoding: [0x78,0xb0,0x40,0x1a]
+# CHECK:        fcle.w          $w16, $w9, $w24         # encoding: [0x79,0x98,0x4c,0x1a]
+# CHECK:        fcle.d          $w27, $w14, $w1         # encoding: [0x79,0xa1,0x76,0xda]
+# CHECK:        fclt.w          $w28, $w8, $w8          # encoding: [0x79,0x08,0x47,0x1a]
+# CHECK:        fclt.d          $w30, $w25, $w11        # encoding: [0x79,0x2b,0xcf,0x9a]
+# CHECK:        fcne.w          $w2, $w18, $w23         # encoding: [0x78,0xd7,0x90,0x9c]
+# CHECK:        fcne.d          $w14, $w20, $w15        # encoding: [0x78,0xef,0xa3,0x9c]
+# CHECK:        fcor.w          $w10, $w18, $w25        # encoding: [0x78,0x59,0x92,0x9c]
+# CHECK:        fcor.d          $w17, $w25, $w11        # encoding: [0x78,0x6b,0xcc,0x5c]
+# CHECK:        fcueq.w         $w14, $w2, $w21         # encoding: [0x78,0xd5,0x13,0x9a]
+# CHECK:        fcueq.d         $w29, $w3, $w7          # encoding: [0x78,0xe7,0x1f,0x5a]
+# CHECK:        fcule.w         $w17, $w5, $w3          # encoding: [0x79,0xc3,0x2c,0x5a]
+# CHECK:        fcule.d         $w31, $w1, $w30         # encoding: [0x79,0xfe,0x0f,0xda]
+# CHECK:        fcult.w         $w6, $w25, $w9          # encoding: [0x79,0x49,0xc9,0x9a]
+# CHECK:        fcult.d         $w27, $w8, $w17         # encoding: [0x79,0x71,0x46,0xda]
+# CHECK:        fcun.w          $w4, $w20, $w8          # encoding: [0x78,0x48,0xa1,0x1a]
+# CHECK:        fcun.d          $w29, $w11, $w3         # encoding: [0x78,0x63,0x5f,0x5a]
+# CHECK:        fcune.w         $w13, $w18, $w19        # encoding: [0x78,0x93,0x93,0x5c]
+# CHECK:        fcune.d         $w16, $w26, $w21        # encoding: [0x78,0xb5,0xd4,0x1c]
+# CHECK:        fdiv.w          $w13, $w24, $w2         # encoding: [0x78,0xc2,0xc3,0x5b]
+# CHECK:        fdiv.d          $w19, $w4, $w25         # encoding: [0x78,0xf9,0x24,0xdb]
+# CHECK:        fexdo.h         $w8, $w0, $w16          # encoding: [0x7a,0x10,0x02,0x1b]
+# CHECK:        fexdo.w         $w0, $w13, $w27         # encoding: [0x7a,0x3b,0x68,0x1b]
+# CHECK:        fexp2.w         $w17, $w0, $w3          # encoding: [0x79,0xc3,0x04,0x5b]
+# CHECK:        fexp2.d         $w22, $w0, $w10         # encoding: [0x79,0xea,0x05,0x9b]
+# CHECK:        fmadd.w         $w29, $w6, $w23         # encoding: [0x79,0x17,0x37,0x5b]
+# CHECK:        fmadd.d         $w11, $w28, $w21        # encoding: [0x79,0x35,0xe2,0xdb]
+# CHECK:        fmax.w          $w0, $w23, $w13         # encoding: [0x7b,0x8d,0xb8,0x1b]
+# CHECK:        fmax.d          $w26, $w18, $w8         # encoding: [0x7b,0xa8,0x96,0x9b]
+# CHECK:        fmax_a.w        $w10, $w16, $w10        # encoding: [0x7b,0xca,0x82,0x9b]
+# CHECK:        fmax_a.d        $w30, $w9, $w22         # encoding: [0x7b,0xf6,0x4f,0x9b]
+# CHECK:        fmin.w          $w24, $w1, $w30         # encoding: [0x7b,0x1e,0x0e,0x1b]
+# CHECK:        fmin.d          $w27, $w27, $w10        # encoding: [0x7b,0x2a,0xde,0xdb]
+# CHECK:        fmin_a.w        $w10, $w29, $w20        # encoding: [0x7b,0x54,0xea,0x9b]
+# CHECK:        fmin_a.d        $w13, $w30, $w24        # encoding: [0x7b,0x78,0xf3,0x5b]
+# CHECK:        fmsub.w         $w17, $w25, $w0         # encoding: [0x79,0x40,0xcc,0x5b]
+# CHECK:        fmsub.d         $w8, $w18, $w16         # encoding: [0x79,0x70,0x92,0x1b]
+# CHECK:        fmul.w          $w3, $w15, $w15         # encoding: [0x78,0x8f,0x78,0xdb]
+# CHECK:        fmul.d          $w9, $w30, $w10         # encoding: [0x78,0xaa,0xf2,0x5b]
+# CHECK:        fsaf.w          $w25, $w5, $w10         # encoding: [0x7a,0x0a,0x2e,0x5a]
+# CHECK:        fsaf.d          $w25, $w3, $w29         # encoding: [0x7a,0x3d,0x1e,0x5a]
+# CHECK:        fseq.w          $w11, $w17, $w13        # encoding: [0x7a,0x8d,0x8a,0xda]
+# CHECK:        fseq.d          $w29, $w0, $w31         # encoding: [0x7a,0xbf,0x07,0x5a]
+# CHECK:        fsle.w          $w30, $w31, $w31        # encoding: [0x7b,0x9f,0xff,0x9a]
+# CHECK:        fsle.d          $w18, $w23, $w24        # encoding: [0x7b,0xb8,0xbc,0x9a]
+# CHECK:        fslt.w          $w12, $w5, $w6          # encoding: [0x7b,0x06,0x2b,0x1a]
+# CHECK:        fslt.d          $w16, $w26, $w21        # encoding: [0x7b,0x35,0xd4,0x1a]
+# CHECK:        fsne.w          $w30, $w1, $w12         # encoding: [0x7a,0xcc,0x0f,0x9c]
+# CHECK:        fsne.d          $w14, $w13, $w23        # encoding: [0x7a,0xf7,0x6b,0x9c]
+# CHECK:        fsor.w          $w27, $w13, $w27        # encoding: [0x7a,0x5b,0x6e,0xdc]
+# CHECK:        fsor.d          $w12, $w24, $w11        # encoding: [0x7a,0x6b,0xc3,0x1c]
+# CHECK:        fsub.w          $w31, $w26, $w1         # encoding: [0x78,0x41,0xd7,0xdb]
+# CHECK:        fsub.d          $w19, $w17, $w27        # encoding: [0x78,0x7b,0x8c,0xdb]
+# CHECK:        fsueq.w         $w16, $w24, $w25        # encoding: [0x7a,0xd9,0xc4,0x1a]
+# CHECK:        fsueq.d         $w18, $w14, $w14        # encoding: [0x7a,0xee,0x74,0x9a]
+# CHECK:        fsule.w         $w23, $w30, $w13        # encoding: [0x7b,0xcd,0xf5,0xda]
+# CHECK:        fsule.d         $w2, $w11, $w26         # encoding: [0x7b,0xfa,0x58,0x9a]
+# CHECK:        fsult.w         $w11, $w26, $w22        # encoding: [0x7b,0x56,0xd2,0xda]
+# CHECK:        fsult.d         $w6, $w23, $w30         # encoding: [0x7b,0x7e,0xb9,0x9a]
+# CHECK:        fsun.w          $w3, $w18, $w28         # encoding: [0x7a,0x5c,0x90,0xda]
+# CHECK:        fsun.d          $w18, $w11, $w19        # encoding: [0x7a,0x73,0x5c,0x9a]
+# CHECK:        fsune.w         $w16, $w31, $w2         # encoding: [0x7a,0x82,0xfc,0x1c]
+# CHECK:        fsune.d         $w3, $w26, $w17         # encoding: [0x7a,0xb1,0xd0,0xdc]
+# CHECK:        ftq.h           $w16, $w4, $w24         # encoding: [0x7a,0x98,0x24,0x1b]
+# CHECK:        ftq.w           $w5, $w5, $w25          # encoding: [0x7a,0xb9,0x29,0x5b]
+# CHECK:        madd_q.h        $w16, $w20, $w10        # encoding: [0x79,0x4a,0xa4,0x1c]
+# CHECK:        madd_q.w        $w28, $w2, $w9          # encoding: [0x79,0x69,0x17,0x1c]
+# CHECK:        maddr_q.h       $w8, $w18, $w9          # encoding: [0x7b,0x49,0x92,0x1c]
+# CHECK:        maddr_q.w       $w29, $w12, $w16        # encoding: [0x7b,0x70,0x67,0x5c]
+# CHECK:        msub_q.h        $w24, $w26, $w10        # encoding: [0x79,0x8a,0xd6,0x1c]
+# CHECK:        msub_q.w        $w13, $w30, $w28        # encoding: [0x79,0xbc,0xf3,0x5c]
+# CHECK:        msubr_q.h       $w12, $w21, $w11        # encoding: [0x7b,0x8b,0xab,0x1c]
+# CHECK:        msubr_q.w       $w1, $w14, $w20         # encoding: [0x7b,0xb4,0x70,0x5c]
+# CHECK:        mul_q.h         $w6, $w16, $w30         # encoding: [0x79,0x1e,0x81,0x9c]
+# CHECK:        mul_q.w         $w16, $w1, $w4          # encoding: [0x79,0x24,0x0c,0x1c]
+# CHECK:        mulr_q.h        $w6, $w20, $w19         # encoding: [0x7b,0x13,0xa1,0x9c]
+# CHECK:        mulr_q.w        $w27, $w1, $w20         # encoding: [0x7b,0x34,0x0e,0xdc]
+
+# CHECKOBJDUMP:        fadd.w          $w28, $w19, $w28
+# CHECKOBJDUMP:        fadd.d          $w13, $w2, $w29
+# CHECKOBJDUMP:        fcaf.w          $w14, $w11, $w25
+# CHECKOBJDUMP:        fcaf.d          $w1, $w1, $w19
+# CHECKOBJDUMP:        fceq.w          $w1, $w23, $w16
+# CHECKOBJDUMP:        fceq.d          $w0, $w8, $w16
+# CHECKOBJDUMP:        fcle.w          $w16, $w9, $w24
+# CHECKOBJDUMP:        fcle.d          $w27, $w14, $w1
+# CHECKOBJDUMP:        fclt.w          $w28, $w8, $w8
+# CHECKOBJDUMP:        fclt.d          $w30, $w25, $w11
+# CHECKOBJDUMP:        fcne.w          $w2, $w18, $w23
+# CHECKOBJDUMP:        fcne.d          $w14, $w20, $w15
+# CHECKOBJDUMP:        fcor.w          $w10, $w18, $w25
+# CHECKOBJDUMP:        fcor.d          $w17, $w25, $w11
+# CHECKOBJDUMP:        fcueq.w         $w14, $w2, $w21
+# CHECKOBJDUMP:        fcueq.d         $w29, $w3, $w7
+# CHECKOBJDUMP:        fcule.w         $w17, $w5, $w3
+# CHECKOBJDUMP:        fcule.d         $w31, $w1, $w30
+# CHECKOBJDUMP:        fcult.w         $w6, $w25, $w9
+# CHECKOBJDUMP:        fcult.d         $w27, $w8, $w17
+# CHECKOBJDUMP:        fcun.w          $w4, $w20, $w8
+# CHECKOBJDUMP:        fcun.d          $w29, $w11, $w3
+# CHECKOBJDUMP:        fcune.w         $w13, $w18, $w19
+# CHECKOBJDUMP:        fcune.d         $w16, $w26, $w21
+# CHECKOBJDUMP:        fdiv.w          $w13, $w24, $w2
+# CHECKOBJDUMP:        fdiv.d          $w19, $w4, $w25
+# CHECKOBJDUMP:        fexdo.h         $w8, $w0, $w16
+# CHECKOBJDUMP:        fexdo.w         $w0, $w13, $w27
+# CHECKOBJDUMP:        fexp2.w         $w17, $w0, $w3
+# CHECKOBJDUMP:        fexp2.d         $w22, $w0, $w10
+# CHECKOBJDUMP:        fmadd.w         $w29, $w6, $w23
+# CHECKOBJDUMP:        fmadd.d         $w11, $w28, $w21
+# CHECKOBJDUMP:        fmax.w          $w0, $w23, $w13
+# CHECKOBJDUMP:        fmax.d          $w26, $w18, $w8
+# CHECKOBJDUMP:        fmax_a.w        $w10, $w16, $w10
+# CHECKOBJDUMP:        fmax_a.d        $w30, $w9, $w22
+# CHECKOBJDUMP:        fmin.w          $w24, $w1, $w30
+# CHECKOBJDUMP:        fmin.d          $w27, $w27, $w10
+# CHECKOBJDUMP:        fmin_a.w        $w10, $w29, $w20
+# CHECKOBJDUMP:        fmin_a.d        $w13, $w30, $w24
+# CHECKOBJDUMP:        fmsub.w         $w17, $w25, $w0
+# CHECKOBJDUMP:        fmsub.d         $w8, $w18, $w16
+# CHECKOBJDUMP:        fmul.w          $w3, $w15, $w15
+# CHECKOBJDUMP:        fmul.d          $w9, $w30, $w10
+# CHECKOBJDUMP:        fsaf.w          $w25, $w5, $w10
+# CHECKOBJDUMP:        fsaf.d          $w25, $w3, $w29
+# CHECKOBJDUMP:        fseq.w          $w11, $w17, $w13
+# CHECKOBJDUMP:        fseq.d          $w29, $w0, $w31
+# CHECKOBJDUMP:        fsle.w          $w30, $w31, $w31
+# CHECKOBJDUMP:        fsle.d          $w18, $w23, $w24
+# CHECKOBJDUMP:        fslt.w          $w12, $w5, $w6
+# CHECKOBJDUMP:        fslt.d          $w16, $w26, $w21
+# CHECKOBJDUMP:        fsne.w          $w30, $w1, $w12
+# CHECKOBJDUMP:        fsne.d          $w14, $w13, $w23
+# CHECKOBJDUMP:        fsor.w          $w27, $w13, $w27
+# CHECKOBJDUMP:        fsor.d          $w12, $w24, $w11
+# CHECKOBJDUMP:        fsub.w          $w31, $w26, $w1
+# CHECKOBJDUMP:        fsub.d          $w19, $w17, $w27
+# CHECKOBJDUMP:        fsueq.w         $w16, $w24, $w25
+# CHECKOBJDUMP:        fsueq.d         $w18, $w14, $w14
+# CHECKOBJDUMP:        fsule.w         $w23, $w30, $w13
+# CHECKOBJDUMP:        fsule.d         $w2, $w11, $w26
+# CHECKOBJDUMP:        fsult.w         $w11, $w26, $w22
+# CHECKOBJDUMP:        fsult.d         $w6, $w23, $w30
+# CHECKOBJDUMP:        fsun.w          $w3, $w18, $w28
+# CHECKOBJDUMP:        fsun.d          $w18, $w11, $w19
+# CHECKOBJDUMP:        fsune.w         $w16, $w31, $w2
+# CHECKOBJDUMP:        fsune.d         $w3, $w26, $w17
+# CHECKOBJDUMP:        ftq.h           $w16, $w4, $w24
+# CHECKOBJDUMP:        ftq.w           $w5, $w5, $w25
+# CHECKOBJDUMP:        madd_q.h        $w16, $w20, $w10
+# CHECKOBJDUMP:        madd_q.w        $w28, $w2, $w9
+# CHECKOBJDUMP:        maddr_q.h       $w8, $w18, $w9
+# CHECKOBJDUMP:        maddr_q.w       $w29, $w12, $w16
+# CHECKOBJDUMP:        msub_q.h        $w24, $w26, $w10
+# CHECKOBJDUMP:        msub_q.w        $w13, $w30, $w28
+# CHECKOBJDUMP:        msubr_q.h       $w12, $w21, $w11
+# CHECKOBJDUMP:        msubr_q.w       $w1, $w14, $w20
+# CHECKOBJDUMP:        mul_q.h         $w6, $w16, $w30
+# CHECKOBJDUMP:        mul_q.w         $w16, $w1, $w4
+# CHECKOBJDUMP:        mulr_q.h        $w6, $w20, $w19
+# CHECKOBJDUMP:        mulr_q.w        $w27, $w1, $w20
+
+                fadd.w          $w28, $w19, $w28
+                fadd.d          $w13, $w2, $w29
+                fcaf.w          $w14, $w11, $w25
+                fcaf.d          $w1, $w1, $w19
+                fceq.w          $w1, $w23, $w16
+                fceq.d          $w0, $w8, $w16
+                fcle.w          $w16, $w9, $w24
+                fcle.d          $w27, $w14, $w1
+                fclt.w          $w28, $w8, $w8
+                fclt.d          $w30, $w25, $w11
+                fcne.w          $w2, $w18, $w23
+                fcne.d          $w14, $w20, $w15
+                fcor.w          $w10, $w18, $w25
+                fcor.d          $w17, $w25, $w11
+                fcueq.w         $w14, $w2, $w21
+                fcueq.d         $w29, $w3, $w7
+                fcule.w         $w17, $w5, $w3
+                fcule.d         $w31, $w1, $w30
+                fcult.w         $w6, $w25, $w9
+                fcult.d         $w27, $w8, $w17
+                fcun.w          $w4, $w20, $w8
+                fcun.d          $w29, $w11, $w3
+                fcune.w         $w13, $w18, $w19
+                fcune.d         $w16, $w26, $w21
+                fdiv.w          $w13, $w24, $w2
+                fdiv.d          $w19, $w4, $w25
+                fexdo.h         $w8, $w0, $w16
+                fexdo.w         $w0, $w13, $w27
+                fexp2.w         $w17, $w0, $w3
+                fexp2.d         $w22, $w0, $w10
+                fmadd.w         $w29, $w6, $w23
+                fmadd.d         $w11, $w28, $w21
+                fmax.w          $w0, $w23, $w13
+                fmax.d          $w26, $w18, $w8
+                fmax_a.w        $w10, $w16, $w10
+                fmax_a.d        $w30, $w9, $w22
+                fmin.w          $w24, $w1, $w30
+                fmin.d          $w27, $w27, $w10
+                fmin_a.w        $w10, $w29, $w20
+                fmin_a.d        $w13, $w30, $w24
+                fmsub.w         $w17, $w25, $w0
+                fmsub.d         $w8, $w18, $w16
+                fmul.w          $w3, $w15, $w15
+                fmul.d          $w9, $w30, $w10
+                fsaf.w          $w25, $w5, $w10
+                fsaf.d          $w25, $w3, $w29
+                fseq.w          $w11, $w17, $w13
+                fseq.d          $w29, $w0, $w31
+                fsle.w          $w30, $w31, $w31
+                fsle.d          $w18, $w23, $w24
+                fslt.w          $w12, $w5, $w6
+                fslt.d          $w16, $w26, $w21
+                fsne.w          $w30, $w1, $w12
+                fsne.d          $w14, $w13, $w23
+                fsor.w          $w27, $w13, $w27
+                fsor.d          $w12, $w24, $w11
+                fsub.w          $w31, $w26, $w1
+                fsub.d          $w19, $w17, $w27
+                fsueq.w         $w16, $w24, $w25
+                fsueq.d         $w18, $w14, $w14
+                fsule.w         $w23, $w30, $w13
+                fsule.d         $w2, $w11, $w26
+                fsult.w         $w11, $w26, $w22
+                fsult.d         $w6, $w23, $w30
+                fsun.w          $w3, $w18, $w28
+                fsun.d          $w18, $w11, $w19
+                fsune.w         $w16, $w31, $w2
+                fsune.d         $w3, $w26, $w17
+                ftq.h           $w16, $w4, $w24
+                ftq.w           $w5, $w5, $w25
+                madd_q.h        $w16, $w20, $w10
+                madd_q.w        $w28, $w2, $w9
+                maddr_q.h       $w8, $w18, $w9
+                maddr_q.w       $w29, $w12, $w16
+                msub_q.h        $w24, $w26, $w10
+                msub_q.w        $w13, $w30, $w28
+                msubr_q.h       $w12, $w21, $w11
+                msubr_q.w       $w1, $w14, $w20
+                mul_q.h         $w6, $w16, $w30
+                mul_q.w         $w16, $w1, $w4
+                mulr_q.h        $w6, $w20, $w19
+                mulr_q.w        $w27, $w1, $w20
diff --git a/test/MC/Mips/msa/test_bit.s b/test/MC/Mips/msa/test_bit.s
new file mode 100644
index 0000000..7c23131
--- /dev/null
+++ b/test/MC/Mips/msa/test_bit.s
@@ -0,0 +1,150 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        bclri.b         $w21, $w30, 2           # encoding: [0x79,0xf2,0xf5,0x49]
+# CHECK:        bclri.h         $w24, $w21, 0           # encoding: [0x79,0xe0,0xae,0x09]
+# CHECK:        bclri.w         $w23, $w30, 3           # encoding: [0x79,0xc3,0xf5,0xc9]
+# CHECK:        bclri.d         $w9, $w11, 0            # encoding: [0x79,0x80,0x5a,0x49]
+# CHECK:        binsli.b        $w25, $w12, 1           # encoding: [0x7b,0x71,0x66,0x49]
+# CHECK:        binsli.h        $w21, $w22, 0           # encoding: [0x7b,0x60,0xb5,0x49]
+# CHECK:        binsli.w        $w22, $w4, 0            # encoding: [0x7b,0x40,0x25,0x89]
+# CHECK:        binsli.d        $w6, $w2, 6             # encoding: [0x7b,0x06,0x11,0x89]
+# CHECK:        binsri.b        $w15, $w19, 0           # encoding: [0x7b,0xf0,0x9b,0xc9]
+# CHECK:        binsri.h        $w8, $w30, 1            # encoding: [0x7b,0xe1,0xf2,0x09]
+# CHECK:        binsri.w        $w2, $w19, 5            # encoding: [0x7b,0xc5,0x98,0x89]
+# CHECK:        binsri.d        $w18, $w20, 1           # encoding: [0x7b,0x81,0xa4,0x89]
+# CHECK:        bnegi.b         $w24, $w19, 0           # encoding: [0x7a,0xf0,0x9e,0x09]
+# CHECK:        bnegi.h         $w28, $w11, 3           # encoding: [0x7a,0xe3,0x5f,0x09]
+# CHECK:        bnegi.w         $w1, $w27, 5            # encoding: [0x7a,0xc5,0xd8,0x49]
+# CHECK:        bnegi.d         $w4, $w21, 1            # encoding: [0x7a,0x81,0xa9,0x09]
+# CHECK:        bseti.b         $w18, $w8, 0            # encoding: [0x7a,0x70,0x44,0x89]
+# CHECK:        bseti.h         $w24, $w14, 2           # encoding: [0x7a,0x62,0x76,0x09]
+# CHECK:        bseti.w         $w9, $w18, 4            # encoding: [0x7a,0x44,0x92,0x49]
+# CHECK:        bseti.d         $w7, $w15, 1            # encoding: [0x7a,0x01,0x79,0xc9]
+# CHECK:        sat_s.b         $w31, $w31, 2           # encoding: [0x78,0x72,0xff,0xca]
+# CHECK:        sat_s.h         $w19, $w19, 0           # encoding: [0x78,0x60,0x9c,0xca]
+# CHECK:        sat_s.w         $w19, $w29, 0           # encoding: [0x78,0x40,0xec,0xca]
+# CHECK:        sat_s.d         $w11, $w22, 0           # encoding: [0x78,0x00,0xb2,0xca]
+# CHECK:        sat_u.b         $w1, $w13, 3            # encoding: [0x78,0xf3,0x68,0x4a]
+# CHECK:        sat_u.h         $w30, $w24, 4           # encoding: [0x78,0xe4,0xc7,0x8a]
+# CHECK:        sat_u.w         $w31, $w13, 0           # encoding: [0x78,0xc0,0x6f,0xca]
+# CHECK:        sat_u.d         $w29, $w16, 5           # encoding: [0x78,0x85,0x87,0x4a]
+# CHECK:        slli.b          $w23, $w10, 1           # encoding: [0x78,0x71,0x55,0xc9]
+# CHECK:        slli.h          $w9, $w18, 1            # encoding: [0x78,0x61,0x92,0x49]
+# CHECK:        slli.w          $w11, $w29, 4           # encoding: [0x78,0x44,0xea,0xc9]
+# CHECK:        slli.d          $w25, $w20, 1           # encoding: [0x78,0x01,0xa6,0x49]
+# CHECK:        srai.b          $w24, $w29, 1           # encoding: [0x78,0xf1,0xee,0x09]
+# CHECK:        srai.h          $w1, $w6, 0             # encoding: [0x78,0xe0,0x30,0x49]
+# CHECK:        srai.w          $w7, $w26, 1            # encoding: [0x78,0xc1,0xd1,0xc9]
+# CHECK:        srai.d          $w20, $w25, 3           # encoding: [0x78,0x83,0xcd,0x09]
+# CHECK:        srari.b         $w5, $w25, 0            # encoding: [0x79,0x70,0xc9,0x4a]
+# CHECK:        srari.h         $w7, $w6, 4             # encoding: [0x79,0x64,0x31,0xca]
+# CHECK:        srari.w         $w17, $w11, 5           # encoding: [0x79,0x45,0x5c,0x4a]
+# CHECK:        srari.d         $w21, $w25, 5           # encoding: [0x79,0x05,0xcd,0x4a]
+# CHECK:        srli.b          $w2, $w0, 2             # encoding: [0x79,0x72,0x00,0x89]
+# CHECK:        srli.h          $w31, $w31, 2           # encoding: [0x79,0x62,0xff,0xc9]
+# CHECK:        srli.w          $w5, $w9, 4             # encoding: [0x79,0x44,0x49,0x49]
+# CHECK:        srli.d          $w27, $w26, 5           # encoding: [0x79,0x05,0xd6,0xc9]
+# CHECK:        srlri.b         $w18, $w3, 0            # encoding: [0x79,0xf0,0x1c,0x8a]
+# CHECK:        srlri.h         $w1, $w2, 3             # encoding: [0x79,0xe3,0x10,0x4a]
+# CHECK:        srlri.w         $w11, $w22, 2           # encoding: [0x79,0xc2,0xb2,0xca]
+# CHECK:        srlri.d         $w24, $w10, 6           # encoding: [0x79,0x86,0x56,0x0a]
+
+# CHECKOBJDUMP:        bclri.b         $w21, $w30, 2
+# CHECKOBJDUMP:        bclri.h         $w24, $w21, 0
+# CHECKOBJDUMP:        bclri.w         $w23, $w30, 3
+# CHECKOBJDUMP:        bclri.d         $w9, $w11, 0
+# CHECKOBJDUMP:        binsli.b        $w25, $w12, 1
+# CHECKOBJDUMP:        binsli.h        $w21, $w22, 0
+# CHECKOBJDUMP:        binsli.w        $w22, $w4, 0
+# CHECKOBJDUMP:        binsli.d        $w6, $w2, 6
+# CHECKOBJDUMP:        binsri.b        $w15, $w19, 0
+# CHECKOBJDUMP:        binsri.h        $w8, $w30, 1
+# CHECKOBJDUMP:        binsri.w        $w2, $w19, 5
+# CHECKOBJDUMP:        binsri.d        $w18, $w20, 1
+# CHECKOBJDUMP:        bnegi.b         $w24, $w19, 0
+# CHECKOBJDUMP:        bnegi.h         $w28, $w11, 3
+# CHECKOBJDUMP:        bnegi.w         $w1, $w27, 5
+# CHECKOBJDUMP:        bnegi.d         $w4, $w21, 1
+# CHECKOBJDUMP:        bseti.b         $w18, $w8, 0
+# CHECKOBJDUMP:        bseti.h         $w24, $w14, 2
+# CHECKOBJDUMP:        bseti.w         $w9, $w18, 4
+# CHECKOBJDUMP:        bseti.d         $w7, $w15, 1
+# CHECKOBJDUMP:        sat_s.b         $w31, $w31, 2
+# CHECKOBJDUMP:        sat_s.h         $w19, $w19, 0
+# CHECKOBJDUMP:        sat_s.w         $w19, $w29, 0
+# CHECKOBJDUMP:        sat_s.d         $w11, $w22, 0
+# CHECKOBJDUMP:        sat_u.b         $w1, $w13, 3
+# CHECKOBJDUMP:        sat_u.h         $w30, $w24, 4
+# CHECKOBJDUMP:        sat_u.w         $w31, $w13, 0
+# CHECKOBJDUMP:        sat_u.d         $w29, $w16, 5
+# CHECKOBJDUMP:        slli.b          $w23, $w10, 1
+# CHECKOBJDUMP:        slli.h          $w9, $w18, 1
+# CHECKOBJDUMP:        slli.w          $w11, $w29, 4
+# CHECKOBJDUMP:        slli.d          $w25, $w20, 1
+# CHECKOBJDUMP:        srai.b          $w24, $w29, 1
+# CHECKOBJDUMP:        srai.h          $w1, $w6, 0
+# CHECKOBJDUMP:        srai.w          $w7, $w26, 1
+# CHECKOBJDUMP:        srai.d          $w20, $w25, 3
+# CHECKOBJDUMP:        srari.b         $w5, $w25, 0
+# CHECKOBJDUMP:        srari.h         $w7, $w6, 4
+# CHECKOBJDUMP:        srari.w         $w17, $w11, 5
+# CHECKOBJDUMP:        srari.d         $w21, $w25, 5
+# CHECKOBJDUMP:        srli.b          $w2, $w0, 2
+# CHECKOBJDUMP:        srli.h          $w31, $w31, 2
+# CHECKOBJDUMP:        srli.w          $w5, $w9, 4
+# CHECKOBJDUMP:        srli.d          $w27, $w26, 5
+# CHECKOBJDUMP:        srlri.b         $w18, $w3, 0
+# CHECKOBJDUMP:        srlri.h         $w1, $w2, 3
+# CHECKOBJDUMP:        srlri.w         $w11, $w22, 2
+# CHECKOBJDUMP:        srlri.d         $w24, $w10, 6
+
+                bclri.b         $w21, $w30, 2
+                bclri.h         $w24, $w21, 0
+                bclri.w         $w23, $w30, 3
+                bclri.d         $w9, $w11, 0
+                binsli.b        $w25, $w12, 1
+                binsli.h        $w21, $w22, 0
+                binsli.w        $w22, $w4, 0
+                binsli.d        $w6, $w2, 6
+                binsri.b        $w15, $w19, 0
+                binsri.h        $w8, $w30, 1
+                binsri.w        $w2, $w19, 5
+                binsri.d        $w18, $w20, 1
+                bnegi.b         $w24, $w19, 0
+                bnegi.h         $w28, $w11, 3
+                bnegi.w         $w1, $w27, 5
+                bnegi.d         $w4, $w21, 1
+                bseti.b         $w18, $w8, 0
+                bseti.h         $w24, $w14, 2
+                bseti.w         $w9, $w18, 4
+                bseti.d         $w7, $w15, 1
+                sat_s.b         $w31, $w31, 2
+                sat_s.h         $w19, $w19, 0
+                sat_s.w         $w19, $w29, 0
+                sat_s.d         $w11, $w22, 0
+                sat_u.b         $w1, $w13, 3
+                sat_u.h         $w30, $w24, 4
+                sat_u.w         $w31, $w13, 0
+                sat_u.d         $w29, $w16, 5
+                slli.b          $w23, $w10, 1
+                slli.h          $w9, $w18, 1
+                slli.w          $w11, $w29, 4
+                slli.d          $w25, $w20, 1
+                srai.b          $w24, $w29, 1
+                srai.h          $w1, $w6, 0
+                srai.w          $w7, $w26, 1
+                srai.d          $w20, $w25, 3
+                srari.b         $w5, $w25, 0
+                srari.h         $w7, $w6, 4
+                srari.w         $w17, $w11, 5
+                srari.d         $w21, $w25, 5
+                srli.b          $w2, $w0, 2
+                srli.h          $w31, $w31, 2
+                srli.w          $w5, $w9, 4
+                srli.d          $w27, $w26, 5
+                srlri.b         $w18, $w3, 0
+                srlri.h         $w1, $w2, 3
+                srlri.w         $w11, $w22, 2
+                srlri.d         $w24, $w10, 6
diff --git a/test/MC/Mips/msa/test_cbranch.s b/test/MC/Mips/msa/test_cbranch.s
new file mode 100644
index 0000000..2fc65af
--- /dev/null
+++ b/test/MC/Mips/msa/test_cbranch.s
@@ -0,0 +1,78 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+#CHECK:      bnz.b        $w0, 4        # encoding: [0x47,0x80,0x00,0x01]
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bnz.h        $w1, 16       # encoding: [0x47,0xa1,0x00,0x04]
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bnz.w        $w2, 128      # encoding: [0x47,0xc2,0x00,0x20]
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bnz.d        $w3, -128     # encoding: [0x47,0xe3,0xff,0xe0]
+#CHECK:      bnz.b        $w0, SYMBOL0  # encoding: [0x47'A',0x80'A',0x00,0x00]
+                                        #   fixup A - offset: 0, value: SYMBOL0, kind: fixup_Mips_PC16
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bnz.h        $w1, SYMBOL1  # encoding: [0x47'A',0xa1'A',0x00,0x00]
+                                        #   fixup A - offset: 0, value: SYMBOL1, kind: fixup_Mips_PC16
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bnz.w        $w2, SYMBOL2  # encoding: [0x47'A',0xc2'A',0x00,0x00]
+                                        #   fixup A - offset: 0, value: SYMBOL2, kind: fixup_Mips_PC16
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bnz.d        $w3, SYMBOL3  # encoding: [0x47'A',0xe3'A',0x00,0x00]
+                                        #   fixup A - offset: 0, value: SYMBOL3, kind: fixup_Mips_PC16
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+
+#CHECK:      bnz.v        $w0, 4        # encoding: [0x45,0xe0,0x00,0x01]
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bnz.v        $w0, SYMBOL0  # encoding: [0x45'A',0xe0'A',0x00,0x00]
+                                        #   fixup A - offset: 0, value: SYMBOL0, kind: fixup_Mips_PC16
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+
+#CHECK:      bz.b         $w0, 128      # encoding: [0x47,0x00,0x00,0x20]
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bz.h         $w1, 256      # encoding: [0x47,0x21,0x00,0x40]
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bz.w         $w2, 512      # encoding: [0x47,0x42,0x00,0x80]
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bz.d         $w3, -1024    # encoding: [0x47,0x63,0xff,0x00]
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bz.b         $w0, SYMBOL0  # encoding: [0x47'A',A,0x00,0x00]
+                                        #   fixup A - offset: 0, value: SYMBOL0, kind: fixup_Mips_PC16
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bz.h         $w1, SYMBOL1  # encoding: [0x47'A',0x21'A',0x00,0x00]
+                                        #   fixup A - offset: 0, value: SYMBOL1, kind: fixup_Mips_PC16
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bz.w         $w2, SYMBOL2  # encoding: [0x47'A',0x42'A',0x00,0x00]
+                                        #   fixup A - offset: 0, value: SYMBOL2, kind: fixup_Mips_PC16
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bz.d         $w3, SYMBOL3  # encoding: [0x47'A',0x63'A',0x00,0x00]
+                                        #   fixup A - offset: 0, value: SYMBOL3, kind: fixup_Mips_PC16
+#CHECK:      nop                        # encoding: [0x00,0x00,0x00,0x00]
+
+#CHECK:      bz.v        $w0, 4        # encoding: [0x45,0x60,0x00,0x01]
+#CHECK:      nop                       # encoding: [0x00,0x00,0x00,0x00]
+#CHECK:      bz.v        $w0, SYMBOL0  # encoding: [0x45'A',0x60'A',0x00,0x00]
+                                       #   fixup A - offset: 0, value: SYMBOL0, kind: fixup_Mips_PC16
+#CHECK:      nop                       # encoding: [0x00,0x00,0x00,0x00]
+
+bnz.b        $w0, 4
+bnz.h        $w1, 16
+bnz.w        $w2, 128
+bnz.d        $w3, -128
+bnz.b        $w0, SYMBOL0
+bnz.h        $w1, SYMBOL1
+bnz.w        $w2, SYMBOL2
+bnz.d        $w3, SYMBOL3
+
+bnz.v        $w0, 4
+bnz.v        $w0, SYMBOL0
+
+bz.b        $w0, 128
+bz.h        $w1, 256
+bz.w        $w2, 512
+bz.d        $w3, -1024
+bz.b        $w0, SYMBOL0
+bz.h        $w1, SYMBOL1
+bz.w        $w2, SYMBOL2
+bz.d        $w3, SYMBOL3
+
+bz.v        $w0, 4
+bz.v        $w0, SYMBOL0
diff --git a/test/MC/Mips/msa/test_ctrlregs.s b/test/MC/Mips/msa/test_ctrlregs.s
new file mode 100644
index 0000000..f8f4f9e
--- /dev/null
+++ b/test/MC/Mips/msa/test_ctrlregs.s
@@ -0,0 +1,105 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+#CHECK:  cfcmsa       $1, $0                  # encoding: [0x78,0x7e,0x00,0x59]
+#CHECK:  cfcmsa       $1, $0                  # encoding: [0x78,0x7e,0x00,0x59]
+#CHECK:  cfcmsa       $2, $1                  # encoding: [0x78,0x7e,0x08,0x99]
+#CHECK:  cfcmsa       $2, $1                  # encoding: [0x78,0x7e,0x08,0x99]
+#CHECK:  cfcmsa       $3, $2                  # encoding: [0x78,0x7e,0x10,0xd9]
+#CHECK:  cfcmsa       $3, $2                  # encoding: [0x78,0x7e,0x10,0xd9]
+#CHECK:  cfcmsa       $4, $3                  # encoding: [0x78,0x7e,0x19,0x19]
+#CHECK:  cfcmsa       $4, $3                  # encoding: [0x78,0x7e,0x19,0x19]
+#CHECK:  cfcmsa       $5, $4                  # encoding: [0x78,0x7e,0x21,0x59]
+#CHECK:  cfcmsa       $5, $4                  # encoding: [0x78,0x7e,0x21,0x59]
+#CHECK:  cfcmsa       $6, $5                  # encoding: [0x78,0x7e,0x29,0x99]
+#CHECK:  cfcmsa       $6, $5                  # encoding: [0x78,0x7e,0x29,0x99]
+#CHECK:  cfcmsa       $7, $6                  # encoding: [0x78,0x7e,0x31,0xd9]
+#CHECK:  cfcmsa       $7, $6                  # encoding: [0x78,0x7e,0x31,0xd9]
+#CHECK:  cfcmsa       $8, $7                  # encoding: [0x78,0x7e,0x3a,0x19]
+#CHECK:  cfcmsa       $8, $7                  # encoding: [0x78,0x7e,0x3a,0x19]
+
+#CHECK:  ctcmsa       $0, $1                  # encoding: [0x78,0x3e,0x08,0x19]
+#CHECK:  ctcmsa       $0, $1                  # encoding: [0x78,0x3e,0x08,0x19]
+#CHECK:  ctcmsa       $1, $2                  # encoding: [0x78,0x3e,0x10,0x59]
+#CHECK:  ctcmsa       $1, $2                  # encoding: [0x78,0x3e,0x10,0x59]
+#CHECK:  ctcmsa       $2, $3                  # encoding: [0x78,0x3e,0x18,0x99]
+#CHECK:  ctcmsa       $2, $3                  # encoding: [0x78,0x3e,0x18,0x99]
+#CHECK:  ctcmsa       $3, $4                  # encoding: [0x78,0x3e,0x20,0xd9]
+#CHECK:  ctcmsa       $3, $4                  # encoding: [0x78,0x3e,0x20,0xd9]
+#CHECK:  ctcmsa       $4, $5                  # encoding: [0x78,0x3e,0x29,0x19]
+#CHECK:  ctcmsa       $4, $5                  # encoding: [0x78,0x3e,0x29,0x19]
+#CHECK:  ctcmsa       $5, $6                  # encoding: [0x78,0x3e,0x31,0x59]
+#CHECK:  ctcmsa       $5, $6                  # encoding: [0x78,0x3e,0x31,0x59]
+#CHECK:  ctcmsa       $6, $7                  # encoding: [0x78,0x3e,0x39,0x99]
+#CHECK:  ctcmsa       $6, $7                  # encoding: [0x78,0x3e,0x39,0x99]
+#CHECK:  ctcmsa       $7, $8                  # encoding: [0x78,0x3e,0x41,0xd9]
+#CHECK:  ctcmsa       $7, $8                  # encoding: [0x78,0x3e,0x41,0xd9]
+
+#CHECKOBJDUMP:  cfcmsa       $1, $0
+#CHECKOBJDUMP:  cfcmsa       $1, $0
+#CHECKOBJDUMP:  cfcmsa       $2, $1
+#CHECKOBJDUMP:  cfcmsa       $2, $1
+#CHECKOBJDUMP:  cfcmsa       $3, $2
+#CHECKOBJDUMP:  cfcmsa       $3, $2
+#CHECKOBJDUMP:  cfcmsa       $4, $3
+#CHECKOBJDUMP:  cfcmsa       $4, $3
+#CHECKOBJDUMP:  cfcmsa       $5, $4
+#CHECKOBJDUMP:  cfcmsa       $5, $4
+#CHECKOBJDUMP:  cfcmsa       $6, $5
+#CHECKOBJDUMP:  cfcmsa       $6, $5
+#CHECKOBJDUMP:  cfcmsa       $7, $6
+#CHECKOBJDUMP:  cfcmsa       $7, $6
+#CHECKOBJDUMP:  cfcmsa       $8, $7
+#CHECKOBJDUMP:  cfcmsa       $8, $7
+
+#CHECKOBJDUMP:  ctcmsa       $0, $1
+#CHECKOBJDUMP:  ctcmsa       $0, $1
+#CHECKOBJDUMP:  ctcmsa       $1, $2
+#CHECKOBJDUMP:  ctcmsa       $1, $2
+#CHECKOBJDUMP:  ctcmsa       $2, $3
+#CHECKOBJDUMP:  ctcmsa       $2, $3
+#CHECKOBJDUMP:  ctcmsa       $3, $4
+#CHECKOBJDUMP:  ctcmsa       $3, $4
+#CHECKOBJDUMP:  ctcmsa       $4, $5
+#CHECKOBJDUMP:  ctcmsa       $4, $5
+#CHECKOBJDUMP:  ctcmsa       $5, $6
+#CHECKOBJDUMP:  ctcmsa       $5, $6
+#CHECKOBJDUMP:  ctcmsa       $6, $7
+#CHECKOBJDUMP:  ctcmsa       $6, $7
+#CHECKOBJDUMP:  ctcmsa       $7, $8
+#CHECKOBJDUMP:  ctcmsa       $7, $8
+
+cfcmsa       $1, $msair
+cfcmsa       $1, $0
+cfcmsa       $2, $msacsr
+cfcmsa       $2, $1
+cfcmsa       $3, $msaaccess
+cfcmsa       $3, $2
+cfcmsa       $4, $msasave
+cfcmsa       $4, $3
+cfcmsa       $5, $msamodify
+cfcmsa       $5, $4
+cfcmsa       $6, $msarequest
+cfcmsa       $6, $5
+cfcmsa       $7, $msamap
+cfcmsa       $7, $6
+cfcmsa       $8, $msaunmap
+cfcmsa       $8, $7
+
+ctcmsa       $msair, $1
+ctcmsa       $0, $1
+ctcmsa       $msacsr, $2
+ctcmsa       $1, $2
+ctcmsa       $msaaccess, $3
+ctcmsa       $2, $3
+ctcmsa       $msasave, $4
+ctcmsa       $3, $4
+ctcmsa       $msamodify, $5
+ctcmsa       $4, $5
+ctcmsa       $msarequest, $6
+ctcmsa       $5, $6
+ctcmsa       $msamap, $7
+ctcmsa       $6, $7
+ctcmsa       $msaunmap, $8
+ctcmsa       $7, $8
diff --git a/test/MC/Mips/msa/test_elm.s b/test/MC/Mips/msa/test_elm.s
new file mode 100644
index 0000000..1d04838
--- /dev/null
+++ b/test/MC/Mips/msa/test_elm.s
@@ -0,0 +1,51 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        copy_s.b        $13, $w8[2]             # encoding: [0x78,0x82,0x43,0x59]
+# CHECK:        copy_s.h        $1, $w25[0]             # encoding: [0x78,0xa0,0xc8,0x59]
+# CHECK:        copy_s.w        $22, $w5[1]             # encoding: [0x78,0xb1,0x2d,0x99]
+# CHECK:        copy_u.b        $22, $w20[4]            # encoding: [0x78,0xc4,0xa5,0x99]
+# CHECK:        copy_u.h        $20, $w4[0]             # encoding: [0x78,0xe0,0x25,0x19]
+# CHECK:        copy_u.w        $fp, $w13[2]            # encoding: [0x78,0xf2,0x6f,0x99]
+# CHECK:        sldi.b          $w0, $w29[4]            # encoding: [0x78,0x04,0xe8,0x19]
+# CHECK:        sldi.h          $w8, $w17[0]            # encoding: [0x78,0x20,0x8a,0x19]
+# CHECK:        sldi.w          $w20, $w27[2]           # encoding: [0x78,0x32,0xdd,0x19]
+# CHECK:        sldi.d          $w4, $w12[0]            # encoding: [0x78,0x38,0x61,0x19]
+# CHECK:        splati.b        $w25, $w3[2]            # encoding: [0x78,0x42,0x1e,0x59]
+# CHECK:        splati.h        $w24, $w28[1]           # encoding: [0x78,0x61,0xe6,0x19]
+# CHECK:        splati.w        $w13, $w18[0]           # encoding: [0x78,0x70,0x93,0x59]
+# CHECK:        splati.d        $w28, $w1[0]            # encoding: [0x78,0x78,0x0f,0x19]
+# CHECK:        move.v          $w23, $w24              # encoding: [0x78,0xbe,0xc5,0xd9]
+
+# CHECKOBJDUMP:        copy_s.b        $13, $w8[2]
+# CHECKOBJDUMP:        copy_s.h        $1, $w25[0]
+# CHECKOBJDUMP:        copy_s.w        $22, $w5[1]
+# CHECKOBJDUMP:        copy_u.b        $22, $w20[4]
+# CHECKOBJDUMP:        copy_u.h        $20, $w4[0]
+# CHECKOBJDUMP:        copy_u.w        $fp, $w13[2]
+# CHECKOBJDUMP:        sldi.b          $w0, $w29[4]
+# CHECKOBJDUMP:        sldi.h          $w8, $w17[0]
+# CHECKOBJDUMP:        sldi.w          $w20, $w27[2]
+# CHECKOBJDUMP:        sldi.d          $w4, $w12[0]
+# CHECKOBJDUMP:        splati.b        $w25, $w3[2]
+# CHECKOBJDUMP:        splati.h        $w24, $w28[1]
+# CHECKOBJDUMP:        splati.w        $w13, $w18[0]
+# CHECKOBJDUMP:        splati.d        $w28, $w1[0]
+# CHECKOBJDUMP:        move.v          $w23, $w24
+
+                copy_s.b        $13, $w8[2]
+                copy_s.h        $1, $w25[0]
+                copy_s.w        $22, $w5[1]
+                copy_u.b        $22, $w20[4]
+                copy_u.h        $20, $w4[0]
+                copy_u.w        $30, $w13[2]
+                sldi.b          $w0, $w29[4]
+                sldi.h          $w8, $w17[0]
+                sldi.w          $w20, $w27[2]
+                sldi.d          $w4, $w12[0]
+                splati.b        $w25, $w3[2]
+                splati.h        $w24, $w28[1]
+                splati.w        $w13, $w18[0]
+                splati.d        $w28, $w1[0]
+                move.v          $w23, $w24
diff --git a/test/MC/Mips/msa/test_elm_insert.s b/test/MC/Mips/msa/test_elm_insert.s
new file mode 100644
index 0000000..5fc55f3
--- /dev/null
+++ b/test/MC/Mips/msa/test_elm_insert.s
@@ -0,0 +1,15 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        insert.b        $w23[3], $sp            # encoding: [0x79,0x03,0xed,0xd9]
+# CHECK:        insert.h        $w20[2], $5             # encoding: [0x79,0x22,0x2d,0x19]
+# CHECK:        insert.w        $w8[2], $15             # encoding: [0x79,0x32,0x7a,0x19]
+
+# CHECKOBJDUMP:        insert.b        $w23[3], $sp
+# CHECKOBJDUMP:        insert.h        $w20[2], $5
+# CHECKOBJDUMP:        insert.w        $w8[2], $15
+
+                insert.b        $w23[3], $sp
+                insert.h        $w20[2], $5
+                insert.w        $w8[2], $15
diff --git a/test/MC/Mips/msa/test_elm_insve.s b/test/MC/Mips/msa/test_elm_insve.s
new file mode 100644
index 0000000..d63d687
--- /dev/null
+++ b/test/MC/Mips/msa/test_elm_insve.s
@@ -0,0 +1,18 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        insve.b $w25[3], $w9[0]         # encoding: [0x79,0x43,0x4e,0x59]
+# CHECK:        insve.h $w24[2], $w2[0]         # encoding: [0x79,0x62,0x16,0x19]
+# CHECK:        insve.w $w0[2], $w13[0]         # encoding: [0x79,0x72,0x68,0x19]
+# CHECK:        insve.d $w3[0], $w18[0]         # encoding: [0x79,0x78,0x90,0xd9]
+
+# CHECKOBJDUMP:        insve.b $w25[3], $w9[0]
+# CHECKOBJDUMP:        insve.h $w24[2], $w2[0]
+# CHECKOBJDUMP:        insve.w $w0[2], $w13[0]
+# CHECKOBJDUMP:        insve.d $w3[0], $w18[0]
+
+                insve.b $w25[3], $w9[0]
+                insve.h $w24[2], $w2[0]
+                insve.w $w0[2], $w13[0]
+                insve.d $w3[0], $w18[0]
diff --git a/test/MC/Mips/msa/test_i10.s b/test/MC/Mips/msa/test_i10.s
new file mode 100644
index 0000000..828ebb5
--- /dev/null
+++ b/test/MC/Mips/msa/test_i10.s
@@ -0,0 +1,19 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+
+# CHECK:        ldi.b   $w8, 198                # encoding: [0x7b,0x06,0x32,0x07]
+# CHECK:        ldi.h   $w20, 313               # encoding: [0x7b,0x29,0xcd,0x07]
+# CHECK:        ldi.w   $w24, 492               # encoding: [0x7b,0x4f,0x66,0x07]
+# CHECK:        ldi.d   $w27, -180              # encoding: [0x7b,0x7a,0x66,0xc7]
+
+# CHECKOBJDUMP:        ldi.b   $w8, 198
+# CHECKOBJDUMP:        ldi.h   $w20, 313
+# CHECKOBJDUMP:        ldi.w   $w24, 492
+# CHECKOBJDUMP:        ldi.d   $w27, 844
+
+                ldi.b   $w8, 198
+                ldi.h   $w20, 313
+                ldi.w   $w24, 492
+                ldi.d   $w27, -180
diff --git a/test/MC/Mips/msa/test_i5.s b/test/MC/Mips/msa/test_i5.s
new file mode 100644
index 0000000..992bfe1
--- /dev/null
+++ b/test/MC/Mips/msa/test_i5.s
@@ -0,0 +1,138 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        addvi.b         $w3, $w31, 30           # encoding: [0x78,0x1e,0xf8,0xc6]
+# CHECK:        addvi.h         $w24, $w13, 26          # encoding: [0x78,0x3a,0x6e,0x06]
+# CHECK:        addvi.w         $w26, $w20, 26          # encoding: [0x78,0x5a,0xa6,0x86]
+# CHECK:        addvi.d         $w16, $w1, 21           # encoding: [0x78,0x75,0x0c,0x06]
+# CHECK:        ceqi.b          $w24, $w21, -8          # encoding: [0x78,0x18,0xae,0x07]
+# CHECK:        ceqi.h          $w31, $w15, 2           # encoding: [0x78,0x22,0x7f,0xc7]
+# CHECK:        ceqi.w          $w12, $w1, -1           # encoding: [0x78,0x5f,0x0b,0x07]
+# CHECK:        ceqi.d          $w24, $w22, 7           # encoding: [0x78,0x67,0xb6,0x07]
+# CHECK:        clei_s.b        $w12, $w16, 1           # encoding: [0x7a,0x01,0x83,0x07]
+# CHECK:        clei_s.h        $w2, $w10, -9           # encoding: [0x7a,0x37,0x50,0x87]
+# CHECK:        clei_s.w        $w4, $w11, -10          # encoding: [0x7a,0x56,0x59,0x07]
+# CHECK:        clei_s.d        $w0, $w29, -10          # encoding: [0x7a,0x76,0xe8,0x07]
+# CHECK:        clei_u.b        $w21, $w17, 3           # encoding: [0x7a,0x83,0x8d,0x47]
+# CHECK:        clei_u.h        $w29, $w7, 17           # encoding: [0x7a,0xb1,0x3f,0x47]
+# CHECK:        clei_u.w        $w1, $w1, 2             # encoding: [0x7a,0xc2,0x08,0x47]
+# CHECK:        clei_u.d        $w27, $w27, 29          # encoding: [0x7a,0xfd,0xde,0xc7]
+# CHECK:        clti_s.b        $w19, $w13, -7          # encoding: [0x79,0x19,0x6c,0xc7]
+# CHECK:        clti_s.h        $w15, $w10, -12         # encoding: [0x79,0x34,0x53,0xc7]
+# CHECK:        clti_s.w        $w12, $w12, 11          # encoding: [0x79,0x4b,0x63,0x07]
+# CHECK:        clti_s.d        $w29, $w20, -15         # encoding: [0x79,0x71,0xa7,0x47]
+# CHECK:        clti_u.b        $w14, $w9, 29           # encoding: [0x79,0x9d,0x4b,0x87]
+# CHECK:        clti_u.h        $w24, $w25, 25          # encoding: [0x79,0xb9,0xce,0x07]
+# CHECK:        clti_u.w        $w1, $w1, 22            # encoding: [0x79,0xd6,0x08,0x47]
+# CHECK:        clti_u.d        $w21, $w25, 1           # encoding: [0x79,0xe1,0xcd,0x47]
+# CHECK:        maxi_s.b        $w22, $w21, 1           # encoding: [0x79,0x01,0xad,0x86]
+# CHECK:        maxi_s.h        $w29, $w5, -8           # encoding: [0x79,0x38,0x2f,0x46]
+# CHECK:        maxi_s.w        $w1, $w10, -12          # encoding: [0x79,0x54,0x50,0x46]
+# CHECK:        maxi_s.d        $w13, $w29, -16         # encoding: [0x79,0x70,0xeb,0x46]
+# CHECK:        maxi_u.b        $w20, $w0, 12           # encoding: [0x79,0x8c,0x05,0x06]
+# CHECK:        maxi_u.h        $w1, $w14, 3            # encoding: [0x79,0xa3,0x70,0x46]
+# CHECK:        maxi_u.w        $w27, $w22, 11          # encoding: [0x79,0xcb,0xb6,0xc6]
+# CHECK:        maxi_u.d        $w26, $w6, 4            # encoding: [0x79,0xe4,0x36,0x86]
+# CHECK:        mini_s.b        $w4, $w1, 1             # encoding: [0x7a,0x01,0x09,0x06]
+# CHECK:        mini_s.h        $w27, $w27, -9          # encoding: [0x7a,0x37,0xde,0xc6]
+# CHECK:        mini_s.w        $w28, $w11, 9           # encoding: [0x7a,0x49,0x5f,0x06]
+# CHECK:        mini_s.d        $w11, $w10, 10          # encoding: [0x7a,0x6a,0x52,0xc6]
+# CHECK:        mini_u.b        $w18, $w23, 27          # encoding: [0x7a,0x9b,0xbc,0x86]
+# CHECK:        mini_u.h        $w7, $w26, 18           # encoding: [0x7a,0xb2,0xd1,0xc6]
+# CHECK:        mini_u.w        $w11, $w12, 26          # encoding: [0x7a,0xda,0x62,0xc6]
+# CHECK:        mini_u.d        $w11, $w15, 2           # encoding: [0x7a,0xe2,0x7a,0xc6]
+# CHECK:        subvi.b         $w24, $w20, 19          # encoding: [0x78,0x93,0xa6,0x06]
+# CHECK:        subvi.h         $w11, $w19, 4           # encoding: [0x78,0xa4,0x9a,0xc6]
+# CHECK:        subvi.w         $w12, $w10, 11          # encoding: [0x78,0xcb,0x53,0x06]
+# CHECK:        subvi.d         $w19, $w16, 7           # encoding: [0x78,0xe7,0x84,0xc6]
+
+# CHECKOBJDUMP:        addvi.b         $w3, $w31, 30
+# CHECKOBJDUMP:        addvi.h         $w24, $w13, 26
+# CHECKOBJDUMP:        addvi.w         $w26, $w20, 26
+# CHECKOBJDUMP:        addvi.d         $w16, $w1, 21
+# CHECKOBJDUMP:        ceqi.b          $w24, $w21, 24
+# CHECKOBJDUMP:        ceqi.h          $w31, $w15, 2
+# CHECKOBJDUMP:        ceqi.w          $w12, $w1, 31
+# CHECKOBJDUMP:        ceqi.d          $w24, $w22, 7
+# CHECKOBJDUMP:        clei_s.b        $w12, $w16, 1
+# CHECKOBJDUMP:        clei_s.h        $w2, $w10, 23
+# CHECKOBJDUMP:        clei_s.w        $w4, $w11, 22
+# CHECKOBJDUMP:        clei_s.d        $w0, $w29, 22
+# CHECKOBJDUMP:        clei_u.b        $w21, $w17, 3
+# CHECKOBJDUMP:        clei_u.h        $w29, $w7, 17
+# CHECKOBJDUMP:        clei_u.w        $w1, $w1, 2
+# CHECKOBJDUMP:        clei_u.d        $w27, $w27, 29
+# CHECKOBJDUMP:        clti_s.b        $w19, $w13, 25
+# CHECKOBJDUMP:        clti_s.h        $w15, $w10, 20
+# CHECKOBJDUMP:        clti_s.w        $w12, $w12, 11
+# CHECKOBJDUMP:        clti_s.d        $w29, $w20, 17
+# CHECKOBJDUMP:        clti_u.b        $w14, $w9, 29
+# CHECKOBJDUMP:        clti_u.h        $w24, $w25, 25
+# CHECKOBJDUMP:        clti_u.w        $w1, $w1, 22
+# CHECKOBJDUMP:        clti_u.d        $w21, $w25, 1
+# CHECKOBJDUMP:        maxi_s.b        $w22, $w21, 1
+# CHECKOBJDUMP:        maxi_s.h        $w29, $w5, 24
+# CHECKOBJDUMP:        maxi_s.w        $w1, $w10, 20
+# CHECKOBJDUMP:        maxi_s.d        $w13, $w29, 16
+# CHECKOBJDUMP:        maxi_u.b        $w20, $w0, 12
+# CHECKOBJDUMP:        maxi_u.h        $w1, $w14, 3
+# CHECKOBJDUMP:        maxi_u.w        $w27, $w22, 11
+# CHECKOBJDUMP:        maxi_u.d        $w26, $w6, 4
+# CHECKOBJDUMP:        mini_s.b        $w4, $w1, 1
+# CHECKOBJDUMP:        mini_s.h        $w27, $w27, 23
+# CHECKOBJDUMP:        mini_s.w        $w28, $w11, 9
+# CHECKOBJDUMP:        mini_s.d        $w11, $w10, 10
+# CHECKOBJDUMP:        mini_u.b        $w18, $w23, 27
+# CHECKOBJDUMP:        mini_u.h        $w7, $w26, 18
+# CHECKOBJDUMP:        mini_u.w        $w11, $w12, 26
+# CHECKOBJDUMP:        mini_u.d        $w11, $w15, 2
+# CHECKOBJDUMP:        subvi.b         $w24, $w20, 19
+# CHECKOBJDUMP:        subvi.h         $w11, $w19, 4
+# CHECKOBJDUMP:        subvi.w         $w12, $w10, 11
+# CHECKOBJDUMP:        subvi.d         $w19, $w16, 7
+
+                addvi.b         $w3, $w31, 30
+                addvi.h         $w24, $w13, 26
+                addvi.w         $w26, $w20, 26
+                addvi.d         $w16, $w1, 21
+                ceqi.b          $w24, $w21, -8
+                ceqi.h          $w31, $w15, 2
+                ceqi.w          $w12, $w1, -1
+                ceqi.d          $w24, $w22, 7
+                clei_s.b        $w12, $w16, 1
+                clei_s.h        $w2, $w10, -9
+                clei_s.w        $w4, $w11, -10
+                clei_s.d        $w0, $w29, -10
+                clei_u.b        $w21, $w17, 3
+                clei_u.h        $w29, $w7, 17
+                clei_u.w        $w1, $w1, 2
+                clei_u.d        $w27, $w27, 29
+                clti_s.b        $w19, $w13, -7
+                clti_s.h        $w15, $w10, -12
+                clti_s.w        $w12, $w12, 11
+                clti_s.d        $w29, $w20, -15
+                clti_u.b        $w14, $w9, 29
+                clti_u.h        $w24, $w25, 25
+                clti_u.w        $w1, $w1, 22
+                clti_u.d        $w21, $w25, 1
+                maxi_s.b        $w22, $w21, 1
+                maxi_s.h        $w29, $w5, -8
+                maxi_s.w        $w1, $w10, -12
+                maxi_s.d        $w13, $w29, -16
+                maxi_u.b        $w20, $w0, 12
+                maxi_u.h        $w1, $w14, 3
+                maxi_u.w        $w27, $w22, 11
+                maxi_u.d        $w26, $w6, 4
+                mini_s.b        $w4, $w1, 1
+                mini_s.h        $w27, $w27, -9
+                mini_s.w        $w28, $w11, 9
+                mini_s.d        $w11, $w10, 10
+                mini_u.b        $w18, $w23, 27
+                mini_u.h        $w7, $w26, 18
+                mini_u.w        $w11, $w12, 26
+                mini_u.d        $w11, $w15, 2
+                subvi.b         $w24, $w20, 19
+                subvi.h         $w11, $w19, 4
+                subvi.w         $w12, $w10, 11
+                subvi.d         $w19, $w16, 7
diff --git a/test/MC/Mips/msa/test_i8.s b/test/MC/Mips/msa/test_i8.s
new file mode 100644
index 0000000..2604be0
--- /dev/null
+++ b/test/MC/Mips/msa/test_i8.s
@@ -0,0 +1,36 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        andi.b  $w2, $w29, 48           # encoding: [0x78,0x30,0xe8,0x80]
+# CHECK:        bmnzi.b $w6, $w22, 126          # encoding: [0x78,0x7e,0xb1,0x81]
+# CHECK:        bmzi.b  $w27, $w1, 88           # encoding: [0x79,0x58,0x0e,0xc1]
+# CHECK:        bseli.b $w29, $w3, 189          # encoding: [0x7a,0xbd,0x1f,0x41]
+# CHECK:        nori.b  $w1, $w17, 56           # encoding: [0x7a,0x38,0x88,0x40]
+# CHECK:        ori.b   $w26, $w20, 135         # encoding: [0x79,0x87,0xa6,0x80]
+# CHECK:        shf.b   $w19, $w30, 105         # encoding: [0x78,0x69,0xf4,0xc2]
+# CHECK:        shf.h   $w17, $w8, 76           # encoding: [0x79,0x4c,0x44,0x42]
+# CHECK:        shf.w   $w14, $w3, 93           # encoding: [0x7a,0x5d,0x1b,0x82]
+# CHECK:        xori.b  $w16, $w10, 20          # encoding: [0x7b,0x14,0x54,0x00]
+
+# CHECKOBJDUMP:        andi.b  $w2, $w29, 48
+# CHECKOBJDUMP:        bmnzi.b $w6, $w22, 126
+# CHECKOBJDUMP:        bmzi.b  $w27, $w1, 88
+# CHECKOBJDUMP:        bseli.b $w29, $w3, 189
+# CHECKOBJDUMP:        nori.b  $w1, $w17, 56
+# CHECKOBJDUMP:        ori.b   $w26, $w20, 135
+# CHECKOBJDUMP:        shf.b   $w19, $w30, 105
+# CHECKOBJDUMP:        shf.h   $w17, $w8, 76
+# CHECKOBJDUMP:        shf.w   $w14, $w3, 93
+# CHECKOBJDUMP:        xori.b  $w16, $w10, 20
+
+                andi.b  $w2, $w29, 48
+                bmnzi.b $w6, $w22, 126
+                bmzi.b  $w27, $w1, 88
+                bseli.b $w29, $w3, 189
+                nori.b  $w1, $w17, 56
+                ori.b   $w26, $w20, 135
+                shf.b   $w19, $w30, 105
+                shf.h   $w17, $w8, 76
+                shf.w   $w14, $w3, 93
+                xori.b  $w16, $w10, 20
diff --git a/test/MC/Mips/msa/test_lsa.s b/test/MC/Mips/msa/test_lsa.s
new file mode 100644
index 0000000..6d1d868
--- /dev/null
+++ b/test/MC/Mips/msa/test_lsa.s
@@ -0,0 +1,18 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        lsa        $8, $9, $10, 1              # encoding: [0x01,0x2a,0x40,0x05]
+# CHECK:        lsa        $8, $9, $10, 2              # encoding: [0x01,0x2a,0x40,0x45]
+# CHECK:        lsa        $8, $9, $10, 3              # encoding: [0x01,0x2a,0x40,0x85]
+# CHECK:        lsa        $8, $9, $10, 4              # encoding: [0x01,0x2a,0x40,0xc5]
+
+# CHECKOBJDUMP: lsa        $8, $9, $10, 1
+# CHECKOBJDUMP: lsa        $8, $9, $10, 2
+# CHECKOBJDUMP: lsa        $8, $9, $10, 3
+# CHECKOBJDUMP: lsa        $8, $9, $10, 4
+
+                lsa        $8, $9, $10, 1
+                lsa        $8, $9, $10, 2
+                lsa        $8, $9, $10, 3
+                lsa        $8, $9, $10, 4
diff --git a/test/MC/Mips/msa/test_mi10.s b/test/MC/Mips/msa/test_mi10.s
new file mode 100644
index 0000000..80257cd
--- /dev/null
+++ b/test/MC/Mips/msa/test_mi10.s
@@ -0,0 +1,30 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        ld.b    $w2, 1($7)              # encoding: [0x78,0x01,0x38,0xa0]
+# CHECK:        ld.h    $w16, -9($zero)         # encoding: [0x7b,0xf7,0x04,0x21]
+# CHECK:        ld.w    $w13, -6($4)            # encoding: [0x7b,0xfa,0x23,0x62]
+# CHECK:        ld.d    $w1, -5($16)            # encoding: [0x7b,0xfb,0x80,0x63]
+# CHECK:        st.b    $w29, 1($14)            # encoding: [0x78,0x01,0x77,0x64]
+# CHECK:        st.h    $w6, -1($8)             # encoding: [0x7b,0xff,0x41,0xa5]
+# CHECK:        st.w    $w18, 8($15)            # encoding: [0x78,0x08,0x7c,0xa6]
+# CHECK:        st.d    $w3, -14($18)           # encoding: [0x7b,0xf2,0x90,0xe7]
+
+# CHECKOBJDUMP:        ld.b    $w2, 1($7)
+# CHECKOBJDUMP:        ld.h    $w16, -9($zero)
+# CHECKOBJDUMP:        ld.w    $w13, -6($4)
+# CHECKOBJDUMP:        ld.d    $w1, -5($16)
+# CHECKOBJDUMP:        st.b    $w29, 1($14)
+# CHECKOBJDUMP:        st.h    $w6, -1($8)
+# CHECKOBJDUMP:        st.w    $w18, 8($15)
+# CHECKOBJDUMP:        st.d    $w3, -14($18)
+
+                ld.b    $w2, 1($7)
+                ld.h    $w16, -9($zero)
+                ld.w    $w13, -6($4)
+                ld.d    $w1, -5($16)
+                st.b    $w29, 1($14)
+                st.h    $w6, -1($8)
+                st.w    $w18, 8($15)
+                st.d    $w3, -14($18)
diff --git a/test/MC/Mips/msa/test_vec.s b/test/MC/Mips/msa/test_vec.s
new file mode 100644
index 0000000..9294f37
--- /dev/null
+++ b/test/MC/Mips/msa/test_vec.s
@@ -0,0 +1,27 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -mattr=+msa -arch=mips | FileCheck %s
+#
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -mcpu=mips32r2 -mattr=+msa -arch=mips -filetype=obj -o - | llvm-objdump -d -triple=mipsel-unknown-linux -mattr=+msa -arch=mips - | FileCheck %s -check-prefix=CHECKOBJDUMP
+#
+# CHECK:        and.v   $w25, $w20, $w27        # encoding: [0x78,0x1b,0xa6,0x5e]
+# CHECK:        bmnz.v  $w17, $w6, $w7          # encoding: [0x78,0x87,0x34,0x5e]
+# CHECK:        bmz.v   $w3, $w17, $w9          # encoding: [0x78,0xa9,0x88,0xde]
+# CHECK:        bsel.v  $w8, $w0, $w14          # encoding: [0x78,0xce,0x02,0x1e]
+# CHECK:        nor.v   $w7, $w31, $w0          # encoding: [0x78,0x40,0xf9,0xde]
+# CHECK:        or.v    $w24, $w26, $w30        # encoding: [0x78,0x3e,0xd6,0x1e]
+# CHECK:        xor.v   $w7, $w27, $w15         # encoding: [0x78,0x6f,0xd9,0xde]
+
+# CHECKOBJDUMP:        and.v   $w25, $w20, $w27
+# CHECKOBJDUMP:        bmnz.v  $w17, $w6, $w7
+# CHECKOBJDUMP:        bmz.v   $w3, $w17, $w9
+# CHECKOBJDUMP:        bsel.v  $w8, $w0, $w14
+# CHECKOBJDUMP:        nor.v   $w7, $w31, $w0
+# CHECKOBJDUMP:        or.v    $w24, $w26, $w30
+# CHECKOBJDUMP:        xor.v   $w7, $w27, $w15
+
+                and.v   $w25, $w20, $w27
+                bmnz.v  $w17, $w6, $w7
+                bmz.v   $w3, $w17, $w9
+                bsel.v  $w8, $w0, $w14
+                nor.v   $w7, $w31, $w0
+                or.v    $w24, $w26, $w30
+                xor.v   $w7, $w27, $w15
diff --git a/test/MC/Mips/xgot.ll b/test/MC/Mips/xgot.ll
index e2a500f..cc33678 100644
--- a/test/MC/Mips/xgot.ll
+++ b/test/MC/Mips/xgot.ll
@@ -14,10 +14,10 @@ entry:
 ; CHECK:     0x{{[0-9,A-F]+}} R_MIPS_LO16
 ; CHECK:     0x{{[0-9,A-F]+}} R_MIPS_GOT_HI16
 ; CHECK:     0x{{[0-9,A-F]+}} R_MIPS_GOT_LO16
-; CHECK:     0x{{[0-9,A-F]+}} R_MIPS_GOT
-; CHECK:     0x{{[0-9,A-F]+}} R_MIPS_LO16
 ; CHECK:     0x{{[0-9,A-F]+}} R_MIPS_CALL_HI16
 ; CHECK:     0x{{[0-9,A-F]+}} R_MIPS_CALL_LO16
+; CHECK:     0x{{[0-9,A-F]+}} R_MIPS_GOT
+; CHECK:     0x{{[0-9,A-F]+}} R_MIPS_LO16
 ; CHECK: ]
 
   %0 = load i32* @ext_1, align 4
diff --git a/test/MC/PowerPC/deprecated-p7.s b/test/MC/PowerPC/deprecated-p7.s
new file mode 100644
index 0000000..ded9923
--- /dev/null
+++ b/test/MC/PowerPC/deprecated-p7.s
@@ -0,0 +1,12 @@
+# RUN: llvm-mc -triple powerpc64-unknown-linux-gnu -mcpu=pwr7 -show-encoding < %s 2>&1 | FileCheck %s
+# RUN: llvm-mc -triple powerpc-unknown-linux-gnu -mcpu=601 -show-encoding < %s 2>&1 | FileCheck -check-prefix=CHECK-OLD %s
+
+         mftb 3
+# CHECK: warning: deprecated
+# CHECK: mftb 3
+
+# CHECK-OLD-NOT: warning: deprecated
+# CHECK-OLD: mftb 3
+
+# FIXME: Test dst and friends once we can parse them.
+
diff --git a/test/MC/PowerPC/lit.local.cfg b/test/MC/PowerPC/lit.local.cfg
index 88488cd..193ebeb 100644
--- a/test/MC/PowerPC/lit.local.cfg
+++ b/test/MC/PowerPC/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'PowerPC' in targets:
     config.unsupported = True
diff --git a/test/MC/PowerPC/ppc-nop.s b/test/MC/PowerPC/ppc-nop.s
index 567943c..50afae2 100644
--- a/test/MC/PowerPC/ppc-nop.s
+++ b/test/MC/PowerPC/ppc-nop.s
@@ -5,5 +5,8 @@ blr
 .p2align 3
 blr
 
-# CHECK:  0000: 4E800020 60000000 4E800020
+.byte 0x42
+.p2align 2
+
+# CHECK:  0000: 4E800020 60000000 4E800020 42000000
 
diff --git a/test/MC/PowerPC/ppc64-encoding-bookIII.s b/test/MC/PowerPC/ppc64-encoding-bookIII.s
new file mode 100644
index 0000000..318c30b
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-encoding-bookIII.s
@@ -0,0 +1,107 @@
+# RUN: llvm-mc -triple powerpc64-unknown-unknown --show-encoding %s | FileCheck %s
+
+# CHECK: mtmsr 4, 0                       # encoding: [0x7c,0x80,0x01,0x24]
+         mtmsr %r4
+
+# CHECK: mtmsr 4, 1                       # encoding: [0x7c,0x81,0x01,0x24]
+         mtmsr %r4, 1
+
+# CHECK: mfmsr 4                         # encoding: [0x7c,0x80,0x00,0xa6]
+         mfmsr %r4
+
+# CHECK: mtmsrd 4, 0                     # encoding: [0x7c,0x80,0x01,0x64]
+         mtmsrd %r4
+
+# CHECK: mtmsrd 4, 1                     # encoding: [0x7c,0x81,0x01,0x64]
+         mtmsrd %r4, 1
+
+# CHECK: mfspr 4, 272                    # encoding: [0x7c,0x90,0x42,0xa6]
+         mfsprg %r4, 0
+
+# CHECK: mfspr 4, 273                    # encoding: [0x7c,0x91,0x42,0xa6]
+         mfsprg %r4, 1
+
+# CHECK: mfspr 4, 274                    # encoding: [0x7c,0x92,0x42,0xa6]
+         mfsprg %r4, 2
+
+# CHECK: mfspr 4, 275                    # encoding: [0x7c,0x93,0x42,0xa6]
+         mfsprg %r4, 3
+
+# CHECK: mtspr 272, 4                    # encoding: [0x7c,0x90,0x43,0xa6]
+         mtsprg 0, %r4
+
+# CHECK: mtspr 273, 4                    # encoding: [0x7c,0x91,0x43,0xa6]
+         mtsprg 1, %r4
+
+# CHECK: mtspr 274, 4                    # encoding: [0x7c,0x92,0x43,0xa6]
+         mtsprg 2, %r4
+
+# CHECK: mtspr 275, 4                    # encoding: [0x7c,0x93,0x43,0xa6]
+         mtsprg 3, %r4
+
+# CHECK: mtspr 272, 4                    # encoding: [0x7c,0x90,0x43,0xa6]
+         mtsprg0 %r4
+
+# CHECK: mtspr 273, 4                    # encoding: [0x7c,0x91,0x43,0xa6]
+         mtsprg1 %r4
+
+# CHECK: mtspr 274, 4                    # encoding: [0x7c,0x92,0x43,0xa6]
+         mtsprg2 %r4
+
+# CHECK: mtspr 275, 4                    # encoding: [0x7c,0x93,0x43,0xa6]
+         mtsprg3 %r4
+
+# CHECK: mtspr 280, 4                    # encoding: [0x7c,0x98,0x43,0xa6]
+         mtasr %r4
+
+# CHECK: mfspr 4, 22                     # encoding: [0x7c,0x96,0x02,0xa6]
+         mfdec %r4
+
+# CHECK: mtspr 22, 4                     # encoding: [0x7c,0x96,0x03,0xa6]
+         mtdec %r4
+
+# CHECK: mfspr 4, 287                    # encoding: [0x7c,0x9f,0x42,0xa6]
+         mfpvr %r4
+
+# CHECK: mfspr 4, 25                     # encoding: [0x7c,0x99,0x02,0xa6]
+         mfsdr1 %r4
+
+# CHECK: mtspr 25, 4                     # encoding: [0x7c,0x99,0x03,0xa6]
+         mtsdr1 %r4
+
+# CHECK: mfspr 4, 26                     # encoding: [0x7c,0x9a,0x02,0xa6]
+         mfsrr0 %r4
+
+# CHECK: mtspr 26, 4                     # encoding: [0x7c,0x9a,0x03,0xa6]
+         mtsrr0 %r4
+
+# CHECK: mfspr 4, 27                     # encoding: [0x7c,0x9b,0x02,0xa6]
+         mfsrr1 %r4
+
+# CHECK: mtspr 27, 4                     # encoding: [0x7c,0x9b,0x03,0xa6]
+         mtsrr1 %r4
+
+# CHECK: slbie 4                         # encoding: [0x7c,0x00,0x23,0x64]
+         slbie %r4
+
+# CHECK: slbmte 4, 5                     # encoding: [0x7c,0x80,0x2b,0x24]
+         slbmte %r4, %r5
+
+# CHECK: slbmfee 4, 5                    # encoding: [0x7c,0x80,0x2f,0x26]
+         slbmfee %r4, %r5
+
+# CHECK: slbia                           # encoding: [0x7c,0x00,0x03,0xe4]
+         slbia
+
+# CHECK: tlbsync                         # encoding: [0x7c,0x00,0x04,0x6c]
+         tlbsync
+
+# CHECK: tlbiel 4                        # encoding: [0x7c,0x00,0x22,0x24]
+         tlbiel %r4
+
+# CHECK: tlbie 4,0                       # encoding: [0x7c,0x00,0x22,0x64]
+         tlbie %r4, 0
+
+# CHECK: tlbie 4,0                       # encoding: [0x7c,0x00,0x22,0x64]
+         tlbie %r4
+
diff --git a/test/MC/PowerPC/ppc64-encoding-fp.s b/test/MC/PowerPC/ppc64-encoding-fp.s
index ae0e286..f9bdee1 100644
--- a/test/MC/PowerPC/ppc64-encoding-fp.s
+++ b/test/MC/PowerPC/ppc64-encoding-fp.s
@@ -65,8 +65,10 @@
          fnabs 2, 3
 # CHECK: fnabs. 2, 3                     # encoding: [0xfc,0x40,0x19,0x11]
          fnabs. 2, 3
-# FIXME: fcpsgn 2, 3
-# FIXME: fcpsgn. 2, 3
+# CHECK: fcpsgn 2, 3, 4                  # encoding: [0xfc,0x43,0x20,0x10]
+         fcpsgn 2, 3, 4
+# CHECK: fcpsgn. 2, 3, 4                 # encoding: [0xfc,0x43,0x20,0x11]
+         fcpsgn. 2, 3, 4
 
 # Floating-point arithmetic instructions
 
@@ -171,8 +173,10 @@
 # CHECK: frsp. 2, 3                      # encoding: [0xfc,0x40,0x18,0x19]
          frsp. 2, 3
 
-# FIXME: fctid 2, 3
-# FIXME: fctid. 2, 3
+# CHECK: fctid 2, 3                      # encoding: [0xfc,0x40,0x1e,0x5c]
+         fctid 2, 3
+# CHECK: fctid. 2, 3                     # encoding: [0xfc,0x40,0x1e,0x5d]
+         fctid. 2, 3
 # CHECK: fctidz 2, 3                     # encoding: [0xfc,0x40,0x1e,0x5e]
          fctidz 2, 3
 # CHECK: fctidz. 2, 3                    # encoding: [0xfc,0x40,0x1e,0x5f]
@@ -183,8 +187,10 @@
          fctiduz 2, 3
 # CHECK: fctiduz. 2, 3                   # encoding: [0xfc,0x40,0x1f,0x5f]
          fctiduz. 2, 3
-# FIXME: fctiw 2, 3
-# FIXME: fctiw. 2, 3
+# CHECK: fctiw 2, 3                      # encoding: [0xfc,0x40,0x18,0x1c]
+         fctiw 2, 3
+# CHECK: fctiw. 2, 3                     # encoding: [0xfc,0x40,0x18,0x1d]
+         fctiw. 2, 3
 # CHECK: fctiwz 2, 3                     # encoding: [0xfc,0x40,0x18,0x1e]
          fctiwz 2, 3
 # CHECK: fctiwz. 2, 3                    # encoding: [0xfc,0x40,0x18,0x1f]
diff --git a/test/MC/PowerPC/ppc64-errors.s b/test/MC/PowerPC/ppc64-errors.s
index bc8c95c..53197ba 100644
--- a/test/MC/PowerPC/ppc64-errors.s
+++ b/test/MC/PowerPC/ppc64-errors.s
@@ -96,3 +96,6 @@
 # CHECK-NEXT: ld 1, 32768(2)
               ld 1, 32768(2)
 
+# CHECK: error: invalid modifier 'got' (no symbols present)
+         addi 4, 3, 123@got
+# CHECK-NEXT: addi 4, 3, 123@got
diff --git a/test/MC/PowerPC/ppc64-fixup-explicit.s b/test/MC/PowerPC/ppc64-fixup-explicit.s
index 217e057..7c56fe8 100644
--- a/test/MC/PowerPC/ppc64-fixup-explicit.s
+++ b/test/MC/PowerPC/ppc64-fixup-explicit.s
@@ -2,7 +2,7 @@
 # RUN: llvm-mc -triple powerpc64-unknown-unknown --show-encoding %s | FileCheck %s
 
 # RUN: llvm-mc -triple powerpc64-unknown-unknown -filetype=obj %s | \
-# RUN: llvm-readobj -r | FileCheck %s -check-prefix=REL
+# RUN: llvm-readobj -r | FileCheck %s -check-prefix=CHECK-REL
 
 # GOT references must result in explicit relocations
 # even if the target symbol is local.
diff --git a/test/MC/PowerPC/ppc64-fixups.s b/test/MC/PowerPC/ppc64-fixups.s
index 56f99d8..a075066 100644
--- a/test/MC/PowerPC/ppc64-fixups.s
+++ b/test/MC/PowerPC/ppc64-fixups.s
@@ -2,7 +2,7 @@
 # RUN: llvm-mc -triple powerpc64-unknown-unknown --show-encoding %s | FileCheck %s
 
 # RUN: llvm-mc -triple powerpc64-unknown-unknown -filetype=obj %s | \
-# RUN: llvm-readobj -r | FileCheck %s -check-prefix=REL
+# RUN: llvm-readobj -r | FileCheck %s -check-prefix=CHECK-REL
 
 # CHECK: b target                        # encoding: [0b010010AA,A,A,0bAAAAAA00]
 # CHECK-NEXT:                            #   fixup A - offset: 0, value: target, kind: fixup_ppc_br24
@@ -442,3 +442,7 @@ base:
 # CHECK-REL: 0x{{[0-9A-F]*[08]}} R_PPC64_DTPREL64 target 0x0
 	.quad target@dtprel
 
+# Constant fixup
+        ori 1, 2, 131071@l
+# CHECK: ori 1, 2, 131071@l              # encoding: [0x60,0x41,A,A]
+# CHECK-NEXT:                            #   fixup A - offset: 2, value: 131071@l, kind: fixup_ppc_half16
diff --git a/test/MC/PowerPC/ppc64-operands.s b/test/MC/PowerPC/ppc64-operands.s
index cb96fd4..fc1cbeb 100644
--- a/test/MC/PowerPC/ppc64-operands.s
+++ b/test/MC/PowerPC/ppc64-operands.s
@@ -108,3 +108,8 @@
 # CHECK: beqa 0, 1024                    # encoding: [0x41,0x82,0x04,0x02]
          beqa 1024
 
+# CHECK:                                 # encoding: [0x42,0x9f,A,0bAAAAAA01]
+         bcl 20, 31, $+4
+
+# CHECK:                                 # encoding: [0x42,0x00,A,0bAAAAAA00]
+         bdnz $-8
diff --git a/test/MC/PowerPC/tls-gd-obj.s b/test/MC/PowerPC/tls-gd-obj.s
new file mode 100644
index 0000000..63d47ee
--- /dev/null
+++ b/test/MC/PowerPC/tls-gd-obj.s
@@ -0,0 +1,56 @@
+// RUN: llvm-mc -triple=powerpc64-pc-linux -filetype=obj %s -o - | \
+// RUN: llvm-readobj -r | FileCheck %s
+
+// Test correct relocation generation for thread-local storage using
+// the general dynamic model and integrated assembly.
+
+
+	.file	"/home/espindola/llvm/llvm/test/CodeGen/PowerPC/tls-gd-obj.ll"
+	.text
+	.globl	main
+	.align	2
+	.type	main,@function
+	.section	.opd,"aw",@progbits
+main:                                   # @main
+	.align	3
+	.quad	.L.main
+	.quad	.TOC.@tocbase
+	.quad	0
+	.text
+.L.main:
+# BB#0:                                 # %entry
+	addis 3, 2, a@got@tlsgd@ha
+	addi 3, 3, a@got@tlsgd@l
+	li 4, 0
+	bl __tls_get_addr(a@tlsgd)
+	nop
+	stw 4, -4(1)
+	lwz 4, 0(3)
+	extsw 3, 4
+	blr
+	.long	0
+	.quad	0
+.Ltmp0:
+	.size	main, .Ltmp0-.L.main
+
+	.type	a,@object               # @a
+	.section	.tbss,"awT",@nobits
+	.globl	a
+	.align	2
+a:
+	.long	0                       # 0x0
+	.size	a, 4
+
+
+// Verify generation of R_PPC64_GOT_TLSGD16_HA, R_PPC64_GOT_TLSGD16_LO,
+// and R_PPC64_TLSGD for accessing external variable a, and R_PPC64_REL24
+// for the call to __tls_get_addr.
+//
+// CHECK: Relocations [
+// CHECK:   Section (2) .rela.text {
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSGD16_HA a
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSGD16_LO a
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_TLSGD          a
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_REL24          __tls_get_addr
+// CHECK:   }
+// CHECK: ]
diff --git a/test/MC/PowerPC/tls-ie-obj.s b/test/MC/PowerPC/tls-ie-obj.s
new file mode 100644
index 0000000..c8c5d91
--- /dev/null
+++ b/test/MC/PowerPC/tls-ie-obj.s
@@ -0,0 +1,44 @@
+// RUN: llvm-mc -triple=powerpc64-pc-linux -filetype=obj %s -o - | \
+// RUN: llvm-readobj -r | FileCheck %s
+
+// Test correct relocation generation for thread-local storage
+// using the initial-exec model and integrated assembly.
+
+	.file	"/home/espindola/llvm/llvm/test/CodeGen/PowerPC/tls-ie-obj.ll"
+	.text
+	.globl	main
+	.align	2
+	.type	main,@function
+	.section	.opd,"aw",@progbits
+main:                                   # @main
+	.align	3
+	.quad	.L.main
+	.quad	.TOC.@tocbase
+	.quad	0
+	.text
+.L.main:
+# BB#0:                                 # %entry
+	li 3, 0
+	addis 4, 2, a@got@tprel@ha
+	ld 4, a@got@tprel@l(4)
+	add 4, 4, a@tls
+	stw 3, -4(1)
+	lwz 3, 0(4)
+	extsw 3, 3
+	blr
+	.long	0
+	.quad	0
+.Ltmp0:
+	.size	main, .Ltmp0-.L.main
+
+
+// Verify generation of R_PPC64_GOT_TPREL16_DS and R_PPC64_TLS for
+// accessing external variable a.
+//
+// CHECK: Relocations [
+// CHECK:   Section (2) .rela.text {
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TPREL16_HA    a
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TPREL16_LO_DS a
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_TLS               a
+// CHECK:   }
+// CHECK: ]
diff --git a/test/MC/PowerPC/tls-ld-obj.s b/test/MC/PowerPC/tls-ld-obj.s
new file mode 100644
index 0000000..b0c4a7a
--- /dev/null
+++ b/test/MC/PowerPC/tls-ld-obj.s
@@ -0,0 +1,61 @@
+// RUN: llvm-mc -triple=powerpc64-pc-linux -filetype=obj %s -o - | \
+// RUN: llvm-readobj -r | FileCheck %s
+
+// Test correct relocation generation for thread-local storage using
+// the local dynamic model.
+
+	.file	"/home/espindola/llvm/llvm/test/CodeGen/PowerPC/tls-ld-obj.ll"
+	.text
+	.globl	main
+	.align	2
+	.type	main,@function
+	.section	.opd,"aw",@progbits
+main:                                   # @main
+	.align	3
+	.quad	.L.main
+	.quad	.TOC.@tocbase
+	.quad	0
+	.text
+.L.main:
+# BB#0:                                 # %entry
+	addis 3, 2, a@got@tlsld@ha
+	addi 3, 3, a@got@tlsld@l
+	li 4, 0
+	bl __tls_get_addr(a@tlsld)
+	nop
+	stw 4, -4(1)
+	addis 3, 3, a@dtprel@ha
+	addi 3, 3, a@dtprel@l
+	lwz 4, 0(3)
+	extsw 3, 4
+	blr
+	.long	0
+	.quad	0
+.Ltmp0:
+	.size	main, .Ltmp0-.L.main
+
+	.hidden	a                       # @a
+	.type	a,@object
+	.section	.tbss,"awT",@nobits
+	.globl	a
+	.align	2
+a:
+	.long	0                       # 0x0
+	.size	a, 4
+
+
+// Verify generation of R_PPC64_GOT_TLSLD16_HA, R_PPC64_GOT_TLSLD16_LO,
+// R_PPC64_TLSLD, R_PPC64_DTPREL16_HA, and R_PPC64_DTPREL16_LO for
+// accessing external variable a, and R_PPC64_REL24 for the call to
+// __tls_get_addr.
+//
+// CHECK: Relocations [
+// CHECK:   Section (2) .rela.text {
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSLD16_HA a
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSLD16_LO a
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_TLSLD          a
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_REL24          __tls_get_addr
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_DTPREL16_HA    a
+// CHECK:     0x{{[0-9,A-F]+}} R_PPC64_DTPREL16_LO    a
+// CHECK:   }
+// CHECK: ]
diff --git a/test/MC/SystemZ/insn-bad-z196.s b/test/MC/SystemZ/insn-bad-z196.s
index ec90c89..089d9b5 100644
--- a/test/MC/SystemZ/insn-bad-z196.s
+++ b/test/MC/SystemZ/insn-bad-z196.s
@@ -25,6 +25,134 @@
 	ahik	%r0, %r1, foo
 
 #CHECK: error: invalid operand
+#CHECK: aih	%r0, (-1 << 31) - 1
+#CHECK: error: invalid operand
+#CHECK: aih	%r0, (1 << 31)
+
+	aih	%r0, (-1 << 31) - 1
+	aih	%r0, (1 << 31)
+
+#CHECK: error: invalid operand
+#CHECK: chf	%r0, -524289
+#CHECK: error: invalid operand
+#CHECK: chf	%r0, 524288
+
+	chf	%r0, -524289
+	chf	%r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: cih	%r0, (-1 << 31) - 1
+#CHECK: error: invalid operand
+#CHECK: cih	%r0, (1 << 31)
+
+	cih	%r0, (-1 << 31) - 1
+	cih	%r0, (1 << 31)
+
+#CHECK: error: invalid operand
+#CHECK: clhf	%r0, -524289
+#CHECK: error: invalid operand
+#CHECK: clhf	%r0, 524288
+
+	clhf	%r0, -524289
+	clhf	%r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: clih	%r0, -1
+#CHECK: error: invalid operand
+#CHECK: clih	%r0, (1 << 32)
+
+	clih	%r0, -1
+	clih	%r0, (1 << 32)
+
+#CHECK: error: invalid operand
+#CHECK: fidbra	%f0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: fidbra	%f0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: fidbra	%f0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: fidbra	%f0, 16, %f0, 0
+
+	fidbra	%f0, 0, %f0, -1
+	fidbra	%f0, 0, %f0, 16
+	fidbra	%f0, -1, %f0, 0
+	fidbra	%f0, 16, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: fiebra	%f0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: fiebra	%f0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: fiebra	%f0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: fiebra	%f0, 16, %f0, 0
+
+	fiebra	%f0, 0, %f0, -1
+	fiebra	%f0, 0, %f0, 16
+	fiebra	%f0, -1, %f0, 0
+	fiebra	%f0, 16, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: fixbra	%f0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: fixbra	%f0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: fixbra	%f0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: fixbra	%f0, 16, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: fixbra	%f0, 0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: fixbra	%f2, 0, %f0, 0
+
+	fixbra	%f0, 0, %f0, -1
+	fixbra	%f0, 0, %f0, 16
+	fixbra	%f0, -1, %f0, 0
+	fixbra	%f0, 16, %f0, 0
+	fixbra	%f0, 0, %f2, 0
+	fixbra	%f2, 0, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: lbh	%r0, -524289
+#CHECK: error: invalid operand
+#CHECK: lbh	%r0, 524288
+
+	lbh	%r0, -524289
+	lbh	%r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: lfh	%r0, -524289
+#CHECK: error: invalid operand
+#CHECK: lfh	%r0, 524288
+
+	lfh	%r0, -524289
+	lfh	%r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: lhh	%r0, -524289
+#CHECK: error: invalid operand
+#CHECK: lhh	%r0, 524288
+
+	lhh	%r0, -524289
+	lhh	%r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: llch	%r0, -524289
+#CHECK: error: invalid operand
+#CHECK: llch	%r0, 524288
+
+	llch	%r0, -524289
+	llch	%r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: llhh	%r0, -524289
+#CHECK: error: invalid operand
+#CHECK: llhh	%r0, 524288
+
+	llhh	%r0, -524289
+	llhh	%r0, 524288
+
+#CHECK: error: invalid operand
 #CHECK: loc	%r0,0,-1
 #CHECK: error: invalid operand
 #CHECK: loc	%r0,0,16
@@ -157,6 +285,30 @@
 	srlk	%r0,%r0,0(%r1,%r2)
 
 #CHECK: error: invalid operand
+#CHECK: stch	%r0, -524289
+#CHECK: error: invalid operand
+#CHECK: stch	%r0, 524288
+
+	stch	%r0, -524289
+	stch	%r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: sthh	%r0, -524289
+#CHECK: error: invalid operand
+#CHECK: sthh	%r0, 524288
+
+	sthh	%r0, -524289
+	sthh	%r0, 524288
+
+#CHECK: error: invalid operand
+#CHECK: stfh	%r0, -524289
+#CHECK: error: invalid operand
+#CHECK: stfh	%r0, 524288
+
+	stfh	%r0, -524289
+	stfh	%r0, 524288
+
+#CHECK: error: invalid operand
 #CHECK: stoc	%r0,0,-1
 #CHECK: error: invalid operand
 #CHECK: stoc	%r0,0,16
diff --git a/test/MC/SystemZ/insn-bad.s b/test/MC/SystemZ/insn-bad.s
index b730637..2a3fb98 100644
--- a/test/MC/SystemZ/insn-bad.s
+++ b/test/MC/SystemZ/insn-bad.s
@@ -128,6 +128,11 @@
 	ahy	%r0, -524289
 	ahy	%r0, 524288
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: aih	%r0, 0
+
+	aih	%r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: al	%r0, -1
 #CHECK: error: invalid operand
@@ -251,6 +256,14 @@
 	ay	%r0, -524289
 	ay	%r0, 524288
 
+#CHECK: error: invalid operand
+#CHECK: bcr	-1, %r1
+#CHECK: error: invalid operand
+#CHECK: bcr	16, %r1
+
+	bcr	-1, %r1
+	bcr	16, %r1
+
 #CHECK: error: offset out of range
 #CHECK: bras	%r0, -0x100002
 #CHECK: error: offset out of range
@@ -607,6 +620,11 @@
 	ch	%r0, -1
 	ch	%r0, 4096
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: chf	%r0, 0
+
+	chf	%r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: chhsi	-1, 0
 #CHECK: error: invalid operand
@@ -674,6 +692,11 @@
 	chy	%r0, -524289
 	chy	%r0, 524288
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: cih	%r0, 0
+
+	cih	%r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cij	%r0, -129, 0, 0
 #CHECK: error: invalid operand
@@ -712,6 +735,55 @@
 	cl	%r0, -1
 	cl	%r0, 4096
 
+#CHECK: error: missing length in address
+#CHECK: clc	0, 0
+#CHECK: error: missing length in address
+#CHECK: clc	0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: clc	0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: clc	0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: clc	0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: clc	-1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: clc	4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: clc	0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: clc	0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: clc	0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: clc	0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: clc	0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: clc	0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: clc	0(-), 0
+
+	clc	0, 0
+	clc	0(%r1), 0(%r1)
+	clc	0(1,%r1), 0(2,%r1)
+	clc	0(0,%r1), 0(%r1)
+	clc	0(257,%r1), 0(%r1)
+	clc	-1(1,%r1), 0(%r1)
+	clc	4096(1,%r1), 0(%r1)
+	clc	0(1,%r1), -1(%r1)
+	clc	0(1,%r1), 4096(%r1)
+	clc	0(1,%r0), 0(%r1)
+	clc	0(1,%r1), 0(%r0)
+	clc	0(%r1,%r2), 0(%r1)
+	clc	0(1,%r2), 0(%r1,%r2)
+	clc	0(-), 0
+
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: clhf	%r0, 0
+
+	clhf	%r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: clfhsi	-1, 0
 #CHECK: error: invalid operand
@@ -806,6 +878,50 @@
 	clghsi	0, -1
 	clghsi	0, 65536
 
+#CHECK: error: invalid operand
+#CHECK: clgij	%r0, -1, 0, 0
+#CHECK: error: invalid operand
+#CHECK: clgij	%r0, 256, 0, 0
+
+	clgij	%r0, -1, 0, 0
+	clgij	%r0, 256, 0, 0
+
+#CHECK: error: offset out of range
+#CHECK: clgij	%r0, 0, 0, -0x100002
+#CHECK: error: offset out of range
+#CHECK: clgij	%r0, 0, 0, -1
+#CHECK: error: offset out of range
+#CHECK: clgij	%r0, 0, 0, 1
+#CHECK: error: offset out of range
+#CHECK: clgij	%r0, 0, 0, 0x10000
+
+	clgij	%r0, 0, 0, -0x100002
+	clgij	%r0, 0, 0, -1
+	clgij	%r0, 0, 0, 1
+	clgij	%r0, 0, 0, 0x10000
+
+#CHECK: error: invalid instruction
+#CHECK:	clgijo	%r0, 0, 0, 0
+#CHECK: error: invalid instruction
+#CHECK:	clgijno	%r0, 0, 0, 0
+
+	clgijo	%r0, 0, 0, 0
+	clgijno	%r0, 0, 0, 0
+
+#CHECK: error: offset out of range
+#CHECK: clgrj	%r0, %r0, 0, -0x100002
+#CHECK: error: offset out of range
+#CHECK: clgrj	%r0, %r0, 0, -1
+#CHECK: error: offset out of range
+#CHECK: clgrj	%r0, %r0, 0, 1
+#CHECK: error: offset out of range
+#CHECK: clgrj	%r0, %r0, 0, 0x10000
+
+	clgrj	%r0, %r0, 0, -0x100002
+	clgrj	%r0, %r0, 0, -1
+	clgrj	%r0, %r0, 0, 1
+	clgrj	%r0, %r0, 0, 0x10000
+
 #CHECK: error: offset out of range
 #CHECK: clgrl	%r0, -0x1000000002
 #CHECK: error: offset out of range
@@ -868,6 +984,41 @@
 	cli	0, -1
 	cli	0, 256
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: clih	%r0, 0
+
+	clih	%r0, 0
+
+#CHECK: error: invalid operand
+#CHECK: clij	%r0, -1, 0, 0
+#CHECK: error: invalid operand
+#CHECK: clij	%r0, 256, 0, 0
+
+	clij	%r0, -1, 0, 0
+	clij	%r0, 256, 0, 0
+
+#CHECK: error: offset out of range
+#CHECK: clij	%r0, 0, 0, -0x100002
+#CHECK: error: offset out of range
+#CHECK: clij	%r0, 0, 0, -1
+#CHECK: error: offset out of range
+#CHECK: clij	%r0, 0, 0, 1
+#CHECK: error: offset out of range
+#CHECK: clij	%r0, 0, 0, 0x10000
+
+	clij	%r0, 0, 0, -0x100002
+	clij	%r0, 0, 0, -1
+	clij	%r0, 0, 0, 1
+	clij	%r0, 0, 0, 0x10000
+
+#CHECK: error: invalid instruction
+#CHECK:	clijo	%r0, 0, 0, 0
+#CHECK: error: invalid instruction
+#CHECK:	clijno	%r0, 0, 0, 0
+
+	clijo	%r0, 0, 0, 0
+	clijno	%r0, 0, 0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cliy	-524289, 0
 #CHECK: error: invalid operand
@@ -886,6 +1037,28 @@
 	cliy	0, 256
 
 #CHECK: error: offset out of range
+#CHECK: clrj	%r0, %r0, 0, -0x100002
+#CHECK: error: offset out of range
+#CHECK: clrj	%r0, %r0, 0, -1
+#CHECK: error: offset out of range
+#CHECK: clrj	%r0, %r0, 0, 1
+#CHECK: error: offset out of range
+#CHECK: clrj	%r0, %r0, 0, 0x10000
+
+	clrj	%r0, %r0, 0, -0x100002
+	clrj	%r0, %r0, 0, -1
+	clrj	%r0, %r0, 0, 1
+	clrj	%r0, %r0, 0, 0x10000
+
+#CHECK: error: invalid instruction
+#CHECK:	clrjo	%r0, %r0, 0, 0
+#CHECK: error: invalid instruction
+#CHECK:	clrjno	%r0, %r0, 0, 0
+
+	clrjo	%r0, %r0, 0, 0
+	clrjno	%r0, %r0, 0, 0
+
+#CHECK: error: offset out of range
 #CHECK: clrl	%r0, -0x1000000002
 #CHECK: error: offset out of range
 #CHECK: clrl	%r0, -1
@@ -1098,6 +1271,11 @@
 	fidbr	%f0, -1, %f0
 	fidbr	%f0, 16, %f0
 
+#CHECK: error: {{(instruction requires: fp-extension)?}}
+#CHECK: fidbra	%f0, 0, %f0, 0
+
+	fidbra	%f0, 0, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: fiebr	%f0, -1, %f0
 #CHECK: error: invalid operand
@@ -1106,6 +1284,11 @@
 	fiebr	%f0, -1, %f0
 	fiebr	%f0, 16, %f0
 
+#CHECK: error: {{(instruction requires: fp-extension)?}}
+#CHECK: fiebra	%f0, 0, %f0, 0
+
+	fiebra	%f0, 0, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: fixbr	%f0, -1, %f0
 #CHECK: error: invalid operand
@@ -1120,6 +1303,11 @@
 	fixbr	%f0, 0, %f2
 	fixbr	%f2, 0, %f0
 
+#CHECK: error: {{(instruction requires: fp-extension)?}}
+#CHECK: fixbra	%f0, 0, %f0, 0
+
+	fixbra	%f0, 0, %f0, 0
+
 #CHECK: error: invalid register pair
 #CHECK: flogr	%r1, %r0
 
@@ -1235,6 +1423,11 @@
 	lb	%r0, -524289
 	lb	%r0, 524288
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: lbh	%r0, 0
+
+	lbh	%r0, 0
+
 #CHECK: error: invalid register pair
 #CHECK: lcxbr	%f0, %f2
 #CHECK: error: invalid register pair
@@ -1299,6 +1492,11 @@
 	ley	%f0, -524289
 	ley	%f0, 524288
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: lfh	%r0, 0
+
+	lfh	%r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: lg	%r0, -524289
 #CHECK: error: invalid operand
@@ -1400,6 +1598,11 @@
 	lh	%r0, -1
 	lh	%r0, 4096
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: lhh	%r0, 0
+
+	lhh	%r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: lhi	%r0, -32769
 #CHECK: error: invalid operand
@@ -1441,6 +1644,11 @@
 	llc	%r0, -524289
 	llc	%r0, 524288
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: llch	%r0, 0
+
+	llch	%r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: llgc	%r0, -524289
 #CHECK: error: invalid operand
@@ -1501,6 +1709,11 @@
 	llh	%r0, -524289
 	llh	%r0, 524288
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: llhh	%r0, 0
+
+	llhh	%r0, 0
+
 #CHECK: error: offset out of range
 #CHECK: llhrl	%r0, -0x1000000002
 #CHECK: error: offset out of range
@@ -1992,6 +2205,50 @@
 	n	%r0, -1
 	n	%r0, 4096
 
+#CHECK: error: missing length in address
+#CHECK: nc	0, 0
+#CHECK: error: missing length in address
+#CHECK: nc	0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: nc	0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: nc	0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: nc	0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: nc	-1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: nc	4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: nc	0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: nc	0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: nc	0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: nc	0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: nc	0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: nc	0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: nc	0(-), 0
+
+	nc	0, 0
+	nc	0(%r1), 0(%r1)
+	nc	0(1,%r1), 0(2,%r1)
+	nc	0(0,%r1), 0(%r1)
+	nc	0(257,%r1), 0(%r1)
+	nc	-1(1,%r1), 0(%r1)
+	nc	4096(1,%r1), 0(%r1)
+	nc	0(1,%r1), -1(%r1)
+	nc	0(1,%r1), 4096(%r1)
+	nc	0(1,%r0), 0(%r1)
+	nc	0(1,%r1), 0(%r0)
+	nc	0(%r1,%r2), 0(%r1)
+	nc	0(1,%r2), 0(%r1,%r2)
+	nc	0(-), 0
+
 #CHECK: error: invalid operand
 #CHECK: ng	%r0, -524289
 #CHECK: error: invalid operand
@@ -2108,6 +2365,50 @@
 	o	%r0, -1
 	o	%r0, 4096
 
+#CHECK: error: missing length in address
+#CHECK: oc	0, 0
+#CHECK: error: missing length in address
+#CHECK: oc	0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: oc	0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: oc	0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: oc	0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: oc	-1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: oc	4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: oc	0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: oc	0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: oc	0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: oc	0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: oc	0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: oc	0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: oc	0(-), 0
+
+	oc	0, 0
+	oc	0(%r1), 0(%r1)
+	oc	0(1,%r1), 0(2,%r1)
+	oc	0(0,%r1), 0(%r1)
+	oc	0(257,%r1), 0(%r1)
+	oc	-1(1,%r1), 0(%r1)
+	oc	4096(1,%r1), 0(%r1)
+	oc	0(1,%r1), -1(%r1)
+	oc	0(1,%r1), 4096(%r1)
+	oc	0(1,%r0), 0(%r1)
+	oc	0(1,%r1), 0(%r0)
+	oc	0(%r1,%r2), 0(%r1)
+	oc	0(1,%r2), 0(%r1,%r2)
+	oc	0(-), 0
+
 #CHECK: error: invalid operand
 #CHECK: og	%r0, -524289
 #CHECK: error: invalid operand
@@ -2217,6 +2518,40 @@
 	oy	%r0, 524288
 
 #CHECK: error: invalid operand
+#CHECK: pfd	-1, 0
+#CHECK: error: invalid operand
+#CHECK: pfd	16, 0
+#CHECK: error: invalid operand
+#CHECK: pfd	1, -524289
+#CHECK: error: invalid operand
+#CHECK: pfd	1, 524288
+
+	pfd	-1, 0
+	pfd	16, 0
+	pfd	1, -524289
+	pfd	1, 524288
+
+#CHECK: error: invalid operand
+#CHECK: pfdrl	-1, 0
+#CHECK: error: invalid operand
+#CHECK: pfdrl	16, 0
+#CHECK: error: offset out of range
+#CHECK: pfdrl	1, -0x1000000002
+#CHECK: error: offset out of range
+#CHECK: pfdrl	1, -1
+#CHECK: error: offset out of range
+#CHECK: pfdrl	1, 1
+#CHECK: error: offset out of range
+#CHECK: pfdrl	1, 0x100000000
+
+	pfdrl	-1, 0
+	pfdrl	16, 0
+	pfdrl	1, -0x1000000002
+	pfdrl	1, -1
+	pfdrl	1, 1
+	pfdrl	1, 0x100000000
+
+#CHECK: error: invalid operand
 #CHECK: risbg	%r0,%r0,0,0,-1
 #CHECK: error: invalid operand
 #CHECK: risbg	%r0,%r0,0,0,64
@@ -2613,6 +2948,11 @@
 	stc	%r0, -1
 	stc	%r0, 4096
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: stch	%r0, 0
+
+	stch	%r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: stcy	%r0, -524289
 #CHECK: error: invalid operand
@@ -2683,6 +3023,11 @@
 	sth	%r0, -1
 	sth	%r0, 4096
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: sthh	%r0, 0
+
+	sthh	%r0, 0
+
 #CHECK: error: offset out of range
 #CHECK: sthrl	%r0, -0x1000000002
 #CHECK: error: offset out of range
@@ -2705,6 +3050,11 @@
 	sthy	%r0, -524289
 	sthy	%r0, 524288
 
+#CHECK: error: {{(instruction requires: high-word)?}}
+#CHECK: stfh	%r0, 0
+
+	stfh	%r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: stmg	%r0, %r0, -524289
 #CHECK: error: invalid operand
@@ -2771,6 +3121,72 @@
 	sy	%r0, 524288
 
 #CHECK: error: invalid operand
+#CHECK: tm	-1, 0
+#CHECK: error: invalid operand
+#CHECK: tm	4096, 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: tm	0(%r1,%r2), 0
+#CHECK: error: invalid operand
+#CHECK: tm	0, -1
+#CHECK: error: invalid operand
+#CHECK: tm	0, 256
+
+	tm	-1, 0
+	tm	4096, 0
+	tm	0(%r1,%r2), 0
+	tm	0, -1
+	tm	0, 256
+
+#CHECK: error: invalid operand
+#CHECK: tmhh	%r0, -1
+#CHECK: error: invalid operand
+#CHECK: tmhh	%r0, 0x10000
+
+	tmhh	%r0, -1
+	tmhh	%r0, 0x10000
+
+#CHECK: error: invalid operand
+#CHECK: tmhl	%r0, -1
+#CHECK: error: invalid operand
+#CHECK: tmhl	%r0, 0x10000
+
+	tmhl	%r0, -1
+	tmhl	%r0, 0x10000
+
+#CHECK: error: invalid operand
+#CHECK: tmlh	%r0, -1
+#CHECK: error: invalid operand
+#CHECK: tmlh	%r0, 0x10000
+
+	tmlh	%r0, -1
+	tmlh	%r0, 0x10000
+
+#CHECK: error: invalid operand
+#CHECK: tmll	%r0, -1
+#CHECK: error: invalid operand
+#CHECK: tmll	%r0, 0x10000
+
+	tmll	%r0, -1
+	tmll	%r0, 0x10000
+
+#CHECK: error: invalid operand
+#CHECK: tmy	-524289, 0
+#CHECK: error: invalid operand
+#CHECK: tmy	524288, 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: tmy	0(%r1,%r2), 0
+#CHECK: error: invalid operand
+#CHECK: tmy	0, -1
+#CHECK: error: invalid operand
+#CHECK: tmy	0, 256
+
+	tmy	-524289, 0
+	tmy	524288, 0
+	tmy	0(%r1,%r2), 0
+	tmy	0, -1
+	tmy	0, 256
+
+#CHECK: error: invalid operand
 #CHECK: x	%r0, -1
 #CHECK: error: invalid operand
 #CHECK: x	%r0, 4096
@@ -2778,6 +3194,50 @@
 	x	%r0, -1
 	x	%r0, 4096
 
+#CHECK: error: missing length in address
+#CHECK: xc	0, 0
+#CHECK: error: missing length in address
+#CHECK: xc	0(%r1), 0(%r1)
+#CHECK: error: invalid use of length addressing
+#CHECK: xc	0(1,%r1), 0(2,%r1)
+#CHECK: error: invalid operand
+#CHECK: xc	0(0,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: xc	0(257,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: xc	-1(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: xc	4096(1,%r1), 0(%r1)
+#CHECK: error: invalid operand
+#CHECK: xc	0(1,%r1), -1(%r1)
+#CHECK: error: invalid operand
+#CHECK: xc	0(1,%r1), 4096(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: xc	0(1,%r0), 0(%r1)
+#CHECK: error: %r0 used in an address
+#CHECK: xc	0(1,%r1), 0(%r0)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: xc	0(%r1,%r2), 0(%r1)
+#CHECK: error: invalid use of indexed addressing
+#CHECK: xc	0(1,%r2), 0(%r1,%r2)
+#CHECK: error: unknown token in expression
+#CHECK: xc	0(-), 0
+
+	xc	0, 0
+	xc	0(%r1), 0(%r1)
+	xc	0(1,%r1), 0(2,%r1)
+	xc	0(0,%r1), 0(%r1)
+	xc	0(257,%r1), 0(%r1)
+	xc	-1(1,%r1), 0(%r1)
+	xc	4096(1,%r1), 0(%r1)
+	xc	0(1,%r1), -1(%r1)
+	xc	0(1,%r1), 4096(%r1)
+	xc	0(1,%r0), 0(%r1)
+	xc	0(1,%r1), 0(%r0)
+	xc	0(%r1,%r2), 0(%r1)
+	xc	0(1,%r2), 0(%r1,%r2)
+	xc	0(-), 0
+
 #CHECK: error: invalid operand
 #CHECK: xg	%r0, -524289
 #CHECK: error: invalid operand
diff --git a/test/MC/SystemZ/insn-good-z196.s b/test/MC/SystemZ/insn-good-z196.s
index 5f7c277..258e06f 100644
--- a/test/MC/SystemZ/insn-good-z196.s
+++ b/test/MC/SystemZ/insn-good-z196.s
@@ -49,6 +49,20 @@
 	ahik	%r15, %r0, 0
 	ahik	%r7, %r8, -16
 
+#CHECK: aih	%r0, -2147483648        # encoding: [0xcc,0x08,0x80,0x00,0x00,0x00]
+#CHECK: aih	%r0, -1                 # encoding: [0xcc,0x08,0xff,0xff,0xff,0xff]
+#CHECK: aih	%r0, 0                  # encoding: [0xcc,0x08,0x00,0x00,0x00,0x00]
+#CHECK: aih	%r0, 1                  # encoding: [0xcc,0x08,0x00,0x00,0x00,0x01]
+#CHECK: aih	%r0, 2147483647         # encoding: [0xcc,0x08,0x7f,0xff,0xff,0xff]
+#CHECK: aih	%r15, 0                 # encoding: [0xcc,0xf8,0x00,0x00,0x00,0x00]
+
+	aih	%r0, -1 << 31
+	aih	%r0, -1
+	aih	%r0, 0
+	aih	%r0, 1
+	aih	%r0, (1 << 31) - 1
+	aih	%r15, 0
+
 #CHECK: alghsik	%r0, %r0, -32768        # encoding: [0xec,0x00,0x80,0x00,0x00,0xdb]
 #CHECK: alghsik	%r0, %r0, -1            # encoding: [0xec,0x00,0xff,0xff,0x00,0xdb]
 #CHECK: alghsik	%r0, %r0, 0             # encoding: [0xec,0x00,0x00,0x00,0x00,0xdb]
@@ -121,6 +135,226 @@
 	ark	%r15,%r0,%r0
 	ark	%r7,%r8,%r9
 
+#CHECK: chf	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xcd]
+#CHECK: chf	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xcd]
+#CHECK: chf	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xcd]
+#CHECK: chf	%r0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0xcd]
+#CHECK: chf	%r0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xcd]
+#CHECK: chf	%r0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0xcd]
+#CHECK: chf	%r0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xcd]
+#CHECK: chf	%r0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xcd]
+#CHECK: chf	%r0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xcd]
+#CHECK: chf	%r15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xcd]
+
+	chf	%r0, -524288
+	chf	%r0, -1
+	chf	%r0, 0
+	chf	%r0, 1
+	chf	%r0, 524287
+	chf	%r0, 0(%r1)
+	chf	%r0, 0(%r15)
+	chf	%r0, 524287(%r1,%r15)
+	chf	%r0, 524287(%r15,%r1)
+	chf	%r15, 0
+
+#CHECK: cih	%r0, -2147483648        # encoding: [0xcc,0x0d,0x80,0x00,0x00,0x00]
+#CHECK: cih	%r0, -1                 # encoding: [0xcc,0x0d,0xff,0xff,0xff,0xff]
+#CHECK: cih	%r0, 0                  # encoding: [0xcc,0x0d,0x00,0x00,0x00,0x00]
+#CHECK: cih	%r0, 1                  # encoding: [0xcc,0x0d,0x00,0x00,0x00,0x01]
+#CHECK: cih	%r0, 2147483647         # encoding: [0xcc,0x0d,0x7f,0xff,0xff,0xff]
+#CHECK: cih	%r15, 0                 # encoding: [0xcc,0xfd,0x00,0x00,0x00,0x00]
+
+	cih	%r0, -1 << 31
+	cih	%r0, -1
+	cih	%r0, 0
+	cih	%r0, 1
+	cih	%r0, (1 << 31) - 1
+	cih	%r15, 0
+
+#CHECK: clhf	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xcf]
+#CHECK: clhf	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xcf]
+#CHECK: clhf	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xcf]
+#CHECK: clhf	%r0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0xcf]
+#CHECK: clhf	%r0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xcf]
+#CHECK: clhf	%r0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0xcf]
+#CHECK: clhf	%r0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xcf]
+#CHECK: clhf	%r0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xcf]
+#CHECK: clhf	%r0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xcf]
+#CHECK: clhf	%r15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xcf]
+
+	clhf	%r0, -524288
+	clhf	%r0, -1
+	clhf	%r0, 0
+	clhf	%r0, 1
+	clhf	%r0, 524287
+	clhf	%r0, 0(%r1)
+	clhf	%r0, 0(%r15)
+	clhf	%r0, 524287(%r1,%r15)
+	clhf	%r0, 524287(%r15,%r1)
+	clhf	%r15, 0
+
+#CHECK: clih	%r0, 0                  # encoding: [0xcc,0x0f,0x00,0x00,0x00,0x00]
+#CHECK: clih	%r0, 1                  # encoding: [0xcc,0x0f,0x00,0x00,0x00,0x01]
+#CHECK: clih	%r0, 4294967295         # encoding: [0xcc,0x0f,0xff,0xff,0xff,0xff]
+#CHECK: clih	%r15, 0                 # encoding: [0xcc,0xff,0x00,0x00,0x00,0x00]
+
+	clih	%r0, 0
+	clih	%r0, 1
+	clih	%r0, (1 << 32) - 1
+	clih	%r15, 0
+
+#CHECK: fidbra	%f0, 0, %f0, 0          # encoding: [0xb3,0x5f,0x00,0x00]
+#CHECK: fidbra	%f0, 0, %f0, 15         # encoding: [0xb3,0x5f,0x0f,0x00]
+#CHECK: fidbra	%f0, 0, %f15, 0         # encoding: [0xb3,0x5f,0x00,0x0f]
+#CHECK: fidbra	%f0, 15, %f0, 0         # encoding: [0xb3,0x5f,0xf0,0x00]
+#CHECK: fidbra	%f4, 5, %f6, 7          # encoding: [0xb3,0x5f,0x57,0x46]
+#CHECK: fidbra	%f15, 0, %f0, 0         # encoding: [0xb3,0x5f,0x00,0xf0]
+
+	fidbra	%f0, 0, %f0, 0
+	fidbra	%f0, 0, %f0, 15
+	fidbra	%f0, 0, %f15, 0
+	fidbra	%f0, 15, %f0, 0
+	fidbra	%f4, 5, %f6, 7
+	fidbra	%f15, 0, %f0, 0
+
+#CHECK: fiebra	%f0, 0, %f0, 0          # encoding: [0xb3,0x57,0x00,0x00]
+#CHECK: fiebra	%f0, 0, %f0, 15         # encoding: [0xb3,0x57,0x0f,0x00]
+#CHECK: fiebra	%f0, 0, %f15, 0         # encoding: [0xb3,0x57,0x00,0x0f]
+#CHECK: fiebra	%f0, 15, %f0, 0         # encoding: [0xb3,0x57,0xf0,0x00]
+#CHECK: fiebra	%f4, 5, %f6, 7          # encoding: [0xb3,0x57,0x57,0x46]
+#CHECK: fiebra	%f15, 0, %f0, 0         # encoding: [0xb3,0x57,0x00,0xf0]
+
+	fiebra	%f0, 0, %f0, 0
+	fiebra	%f0, 0, %f0, 15
+	fiebra	%f0, 0, %f15, 0
+	fiebra	%f0, 15, %f0, 0
+	fiebra	%f4, 5, %f6, 7
+	fiebra	%f15, 0, %f0, 0
+
+#CHECK: fixbra	%f0, 0, %f0, 0          # encoding: [0xb3,0x47,0x00,0x00]
+#CHECK: fixbra	%f0, 0, %f0, 15         # encoding: [0xb3,0x47,0x0f,0x00]
+#CHECK: fixbra	%f0, 0, %f13, 0         # encoding: [0xb3,0x47,0x00,0x0d]
+#CHECK: fixbra	%f0, 15, %f0, 0         # encoding: [0xb3,0x47,0xf0,0x00]
+#CHECK: fixbra	%f4, 5, %f8, 9          # encoding: [0xb3,0x47,0x59,0x48]
+#CHECK: fixbra	%f13, 0, %f0, 0         # encoding: [0xb3,0x47,0x00,0xd0]
+
+	fixbra	%f0, 0, %f0, 0
+	fixbra	%f0, 0, %f0, 15
+	fixbra	%f0, 0, %f13, 0
+	fixbra	%f0, 15, %f0, 0
+	fixbra	%f4, 5, %f8, 9
+	fixbra	%f13, 0, %f0, 0
+
+#CHECK: lbh	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xc0]
+#CHECK: lbh	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xc0]
+#CHECK: lbh	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xc0]
+#CHECK: lbh	%r0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0xc0]
+#CHECK: lbh	%r0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xc0]
+#CHECK: lbh	%r0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0xc0]
+#CHECK: lbh	%r0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xc0]
+#CHECK: lbh	%r0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xc0]
+#CHECK: lbh	%r0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xc0]
+#CHECK: lbh	%r15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xc0]
+
+	lbh	%r0, -524288
+	lbh	%r0, -1
+	lbh	%r0, 0
+	lbh	%r0, 1
+	lbh	%r0, 524287
+	lbh	%r0, 0(%r1)
+	lbh	%r0, 0(%r15)
+	lbh	%r0, 524287(%r1,%r15)
+	lbh	%r0, 524287(%r15,%r1)
+	lbh	%r15, 0
+
+#CHECK: lfh	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xca]
+#CHECK: lfh	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xca]
+#CHECK: lfh	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xca]
+#CHECK: lfh	%r0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0xca]
+#CHECK: lfh	%r0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xca]
+#CHECK: lfh	%r0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0xca]
+#CHECK: lfh	%r0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xca]
+#CHECK: lfh	%r0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xca]
+#CHECK: lfh	%r0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xca]
+#CHECK: lfh	%r15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xca]
+
+	lfh	%r0, -524288
+	lfh	%r0, -1
+	lfh	%r0, 0
+	lfh	%r0, 1
+	lfh	%r0, 524287
+	lfh	%r0, 0(%r1)
+	lfh	%r0, 0(%r15)
+	lfh	%r0, 524287(%r1,%r15)
+	lfh	%r0, 524287(%r15,%r1)
+	lfh	%r15, 0
+
+#CHECK: lhh	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xc4]
+#CHECK: lhh	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xc4]
+#CHECK: lhh	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xc4]
+#CHECK: lhh	%r0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0xc4]
+#CHECK: lhh	%r0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xc4]
+#CHECK: lhh	%r0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0xc4]
+#CHECK: lhh	%r0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xc4]
+#CHECK: lhh	%r0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xc4]
+#CHECK: lhh	%r0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xc4]
+#CHECK: lhh	%r15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xc4]
+
+	lhh	%r0, -524288
+	lhh	%r0, -1
+	lhh	%r0, 0
+	lhh	%r0, 1
+	lhh	%r0, 524287
+	lhh	%r0, 0(%r1)
+	lhh	%r0, 0(%r15)
+	lhh	%r0, 524287(%r1,%r15)
+	lhh	%r0, 524287(%r15,%r1)
+	lhh	%r15, 0
+
+#CHECK: llch	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xc2]
+#CHECK: llch	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xc2]
+#CHECK: llch	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xc2]
+#CHECK: llch	%r0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0xc2]
+#CHECK: llch	%r0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xc2]
+#CHECK: llch	%r0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0xc2]
+#CHECK: llch	%r0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xc2]
+#CHECK: llch	%r0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xc2]
+#CHECK: llch	%r0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xc2]
+#CHECK: llch	%r15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xc2]
+
+	llch	%r0, -524288
+	llch	%r0, -1
+	llch	%r0, 0
+	llch	%r0, 1
+	llch	%r0, 524287
+	llch	%r0, 0(%r1)
+	llch	%r0, 0(%r15)
+	llch	%r0, 524287(%r1,%r15)
+	llch	%r0, 524287(%r15,%r1)
+	llch	%r15, 0
+
+#CHECK: llhh	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xc6]
+#CHECK: llhh	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xc6]
+#CHECK: llhh	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xc6]
+#CHECK: llhh	%r0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0xc6]
+#CHECK: llhh	%r0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xc6]
+#CHECK: llhh	%r0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0xc6]
+#CHECK: llhh	%r0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xc6]
+#CHECK: llhh	%r0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xc6]
+#CHECK: llhh	%r0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xc6]
+#CHECK: llhh	%r15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xc6]
+
+	llhh	%r0, -524288
+	llhh	%r0, -1
+	llhh	%r0, 0
+	llhh	%r0, 1
+	llhh	%r0, 524287
+	llhh	%r0, 0(%r1)
+	llhh	%r0, 0(%r15)
+	llhh	%r0, 524287(%r1,%r15)
+	llhh	%r0, 524287(%r15,%r1)
+	llhh	%r15, 0
+
 #CHECK: loc	%r0, 0, 0               # encoding: [0xeb,0x00,0x00,0x00,0x00,0xf2]
 #CHECK: loc	%r0, 0, 15              # encoding: [0xeb,0x0f,0x00,0x00,0x00,0xf2]
 #CHECK: loc	%r0, -524288, 0         # encoding: [0xeb,0x00,0x00,0x00,0x80,0xf2]
@@ -495,6 +729,72 @@
 	srlk	%r0,%r0,524287(%r1)
 	srlk	%r0,%r0,524287(%r15)
 
+#CHECK: stch	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xc3]
+#CHECK: stch	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xc3]
+#CHECK: stch	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xc3]
+#CHECK: stch	%r0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0xc3]
+#CHECK: stch	%r0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xc3]
+#CHECK: stch	%r0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0xc3]
+#CHECK: stch	%r0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xc3]
+#CHECK: stch	%r0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xc3]
+#CHECK: stch	%r0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xc3]
+#CHECK: stch	%r15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xc3]
+
+	stch	%r0, -524288
+	stch	%r0, -1
+	stch	%r0, 0
+	stch	%r0, 1
+	stch	%r0, 524287
+	stch	%r0, 0(%r1)
+	stch	%r0, 0(%r15)
+	stch	%r0, 524287(%r1,%r15)
+	stch	%r0, 524287(%r15,%r1)
+	stch	%r15, 0
+
+#CHECK: sthh	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xc7]
+#CHECK: sthh	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xc7]
+#CHECK: sthh	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xc7]
+#CHECK: sthh	%r0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0xc7]
+#CHECK: sthh	%r0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xc7]
+#CHECK: sthh	%r0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0xc7]
+#CHECK: sthh	%r0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xc7]
+#CHECK: sthh	%r0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xc7]
+#CHECK: sthh	%r0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xc7]
+#CHECK: sthh	%r15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xc7]
+
+	sthh	%r0, -524288
+	sthh	%r0, -1
+	sthh	%r0, 0
+	sthh	%r0, 1
+	sthh	%r0, 524287
+	sthh	%r0, 0(%r1)
+	sthh	%r0, 0(%r15)
+	sthh	%r0, 524287(%r1,%r15)
+	sthh	%r0, 524287(%r15,%r1)
+	sthh	%r15, 0
+
+#CHECK: stfh	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xcb]
+#CHECK: stfh	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xcb]
+#CHECK: stfh	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xcb]
+#CHECK: stfh	%r0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0xcb]
+#CHECK: stfh	%r0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0xcb]
+#CHECK: stfh	%r0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0xcb]
+#CHECK: stfh	%r0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0xcb]
+#CHECK: stfh	%r0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0xcb]
+#CHECK: stfh	%r0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0xcb]
+#CHECK: stfh	%r15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0xcb]
+
+	stfh	%r0, -524288
+	stfh	%r0, -1
+	stfh	%r0, 0
+	stfh	%r0, 1
+	stfh	%r0, 524287
+	stfh	%r0, 0(%r1)
+	stfh	%r0, 0(%r15)
+	stfh	%r0, 524287(%r1,%r15)
+	stfh	%r0, 524287(%r15,%r1)
+	stfh	%r15, 0
+
 #CHECK: stoc	%r0, 0, 0               # encoding: [0xeb,0x00,0x00,0x00,0x00,0xf3]
 #CHECK: stoc	%r0, 0, 15              # encoding: [0xeb,0x0f,0x00,0x00,0x00,0xf3]
 #CHECK: stoc	%r0, -524288, 0         # encoding: [0xeb,0x00,0x00,0x00,0x80,0xf3]
diff --git a/test/MC/SystemZ/insn-good.s b/test/MC/SystemZ/insn-good.s
index c997271..23bd68a 100644
--- a/test/MC/SystemZ/insn-good.s
+++ b/test/MC/SystemZ/insn-good.s
@@ -527,11 +527,102 @@
 	basr	%r14,%r9
 	basr	%r15,%r1
 
+#CHECK: bcr	0, %r0			# encoding: [0x07,0x00]
+#CHECK:	bcr	0, %r15			# encoding: [0x07,0x0f]
 
+	bcr	0, %r0
+	bcr	0, %r15
+
+#CHECK:	bcr	1, %r7			# encoding: [0x07,0x17]
+#CHECK:	bor	%r15			# encoding: [0x07,0x1f]
+
+	bcr	1, %r7
+	bor	%r15
+
+#CHECK:	bcr	2, %r7			# encoding: [0x07,0x27]
+#CHECK:	bhr	%r15			# encoding: [0x07,0x2f]
+
+	bcr	2, %r7
+	bhr	%r15
+
+#CHECK:	bcr	3, %r7			# encoding: [0x07,0x37]
+#CHECK:	bnler	%r15			# encoding: [0x07,0x3f]
+
+	bcr	3, %r7
+	bnler	%r15
+
+#CHECK:	bcr	4, %r7			# encoding: [0x07,0x47]
+#CHECK:	blr	%r15			# encoding: [0x07,0x4f]
+
+	bcr	4, %r7
+	blr	%r15
+
+#CHECK:	bcr	5, %r7			# encoding: [0x07,0x57]
+#CHECK:	bnher	%r15			# encoding: [0x07,0x5f]
+
+	bcr	5, %r7
+	bnher	%r15
+
+#CHECK:	bcr	6, %r7			# encoding: [0x07,0x67]
+#CHECK:	blhr	%r15			# encoding: [0x07,0x6f]
+
+	bcr	6, %r7
+	blhr	%r15
+
+#CHECK:	bcr	7, %r7			# encoding: [0x07,0x77]
+#CHECK:	bner	%r15			# encoding: [0x07,0x7f]
+
+	bcr	7, %r7
+	bner	%r15
+
+#CHECK:	bcr	8, %r7			# encoding: [0x07,0x87]
+#CHECK:	ber	%r15			# encoding: [0x07,0x8f]
+
+	bcr	8, %r7
+	ber	%r15
+
+#CHECK:	bcr	9, %r7			# encoding: [0x07,0x97]
+#CHECK:	bnlhr	%r15			# encoding: [0x07,0x9f]
+
+	bcr	9, %r7
+	bnlhr	%r15
+
+#CHECK:	bcr	10, %r7			# encoding: [0x07,0xa7]
+#CHECK:	bher	%r15			# encoding: [0x07,0xaf]
+
+	bcr	10, %r7
+	bher	%r15
+
+#CHECK:	bcr	11, %r7			# encoding: [0x07,0xb7]
+#CHECK:	bnlr	%r15			# encoding: [0x07,0xbf]
+
+	bcr	11, %r7
+	bnlr	%r15
+
+#CHECK:	bcr	12, %r7			# encoding: [0x07,0xc7]
+#CHECK:	bler	%r15			# encoding: [0x07,0xcf]
+
+	bcr	12, %r7
+	bler	%r15
+
+#CHECK:	bcr	13, %r7			# encoding: [0x07,0xd7]
+#CHECK:	bnhr	%r15			# encoding: [0x07,0xdf]
+
+	bcr	13, %r7
+	bnhr	%r15
+
+#CHECK:	bcr	14, %r7			# encoding: [0x07,0xe7]
+#CHECK:	bnor	%r15			# encoding: [0x07,0xef]
+
+	bcr	14, %r7
+	bnor	%r15
+
+#CHECK:	bcr	15, %r7			# encoding: [0x07,0xf7]
 #CHECK: br	%r1                     # encoding: [0x07,0xf1]
 #CHECK: br	%r14                    # encoding: [0x07,0xfe]
 #CHECK: br	%r15                    # encoding: [0x07,0xff]
 
+	bcr	15, %r7
 	br	%r1
 	br	%r14
 	br	%r15
@@ -2454,6 +2545,32 @@
 	cl	%r0, 4095(%r15,%r1)
 	cl	%r15, 0
 
+#CHECK: clc	0(1), 0                 # encoding: [0xd5,0x00,0x00,0x00,0x00,0x00]
+#CHECK: clc	0(1), 0(%r1)            # encoding: [0xd5,0x00,0x00,0x00,0x10,0x00]
+#CHECK: clc	0(1), 0(%r15)           # encoding: [0xd5,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: clc	0(1), 4095              # encoding: [0xd5,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: clc	0(1), 4095(%r1)         # encoding: [0xd5,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: clc	0(1), 4095(%r15)        # encoding: [0xd5,0x00,0x00,0x00,0xff,0xff]
+#CHECK: clc	0(1,%r1), 0             # encoding: [0xd5,0x00,0x10,0x00,0x00,0x00]
+#CHECK: clc	0(1,%r15), 0            # encoding: [0xd5,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: clc	4095(1,%r1), 0          # encoding: [0xd5,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: clc	4095(1,%r15), 0         # encoding: [0xd5,0x00,0xff,0xff,0x00,0x00]
+#CHECK: clc	0(256,%r1), 0           # encoding: [0xd5,0xff,0x10,0x00,0x00,0x00]
+#CHECK: clc	0(256,%r15), 0          # encoding: [0xd5,0xff,0xf0,0x00,0x00,0x00]
+
+	clc	0(1), 0
+	clc	0(1), 0(%r1)
+	clc	0(1), 0(%r15)
+	clc	0(1), 4095
+	clc	0(1), 4095(%r1)
+	clc	0(1), 4095(%r15)
+	clc	0(1,%r1), 0
+	clc	0(1,%r15), 0
+	clc	4095(1,%r1), 0
+	clc	4095(1,%r15), 0
+	clc	0(256,%r1), 0
+	clc	0(256,%r15), 0
+
 #CHECK: clfhsi	0, 0                    # encoding: [0xe5,0x5d,0x00,0x00,0x00,0x00]
 #CHECK: clfhsi	4095, 0                 # encoding: [0xe5,0x5d,0x0f,0xff,0x00,0x00]
 #CHECK: clfhsi	0, 65535                # encoding: [0xe5,0x5d,0x00,0x00,0xff,0xff]
@@ -2630,6 +2747,233 @@
 	clghsi	4095(%r1), 42
 	clghsi	4095(%r15), 42
 
+#CHECK: clgij	%r0, 0, 0, .[[LAB:L.*]]	# encoding: [0xec,0x00,A,A,0x00,0x7d]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+#CHECK: clgij	%r0, 255, 0, .[[LAB:L.*]]	# encoding: [0xec,0x00,A,A,0xff,0x7d]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+#CHECK: clgij	%r15, 0, 0, .[[LAB:L.*]]	# encoding: [0xec,0xf0,A,A,0x00,0x7d]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+	clgij	%r0, 0, 0, 0
+	clgij	%r0, 255, 0, 0
+	clgij	%r15, 0, 0, 0
+
+#CHECK: clgij	%r1, 193, 0, .[[LAB:L.*]]-65536	# encoding: [0xec,0x10,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 0, -0x10000
+#CHECK: clgij	%r1, 193, 0, .[[LAB:L.*]]-2	# encoding: [0xec,0x10,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 0, -2
+#CHECK: clgij	%r1, 193, 0, .[[LAB:L.*]]		# encoding: [0xec,0x10,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 0, 0
+#CHECK: clgij	%r1, 193, 0, .[[LAB:L.*]]+65534	# encoding: [0xec,0x10,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 0, 0xfffe
+
+#CHECK: clgij	%r1, 193, 0, foo                  # encoding: [0xec,0x10,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 0, foo
+
+#CHECK: clgij	%r1, 193, 1, foo                  # encoding: [0xec,0x11,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 1, foo
+
+#CHECK: clgij	%r1, 193, 2, foo                  # encoding: [0xec,0x12,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijh	%r1, 193, foo                     # encoding: [0xec,0x12,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijnle	%r1, 193, foo                     # encoding: [0xec,0x12,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 2, foo
+	clgijh	%r1, 193, foo
+	clgijnle	%r1, 193, foo
+
+#CHECK: clgij	%r1, 193, 3, foo                  # encoding: [0xec,0x13,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 3, foo
+
+#CHECK: clgij	%r1, 193, 4, foo                  # encoding: [0xec,0x14,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijl	%r1, 193, foo                     # encoding: [0xec,0x14,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijnhe	%r1, 193, foo                     # encoding: [0xec,0x14,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 4, foo
+	clgijl	%r1, 193, foo
+	clgijnhe	%r1, 193, foo
+
+#CHECK: clgij	%r1, 193, 5, foo                  # encoding: [0xec,0x15,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 5, foo
+
+#CHECK: clgij	%r1, 193, 6, foo                  # encoding: [0xec,0x16,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijlh	%r1, 193, foo                     # encoding: [0xec,0x16,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijne	%r1, 193, foo                     # encoding: [0xec,0x16,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 6, foo
+	clgijlh	%r1, 193, foo
+	clgijne	%r1, 193, foo
+
+#CHECK: clgij	%r1, 193, 7, foo                  # encoding: [0xec,0x17,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 7, foo
+
+#CHECK: clgij	%r1, 193, 8, foo                  # encoding: [0xec,0x18,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgije	%r1, 193, foo                     # encoding: [0xec,0x18,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijnlh	%r1, 193, foo                     # encoding: [0xec,0x18,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 8, foo
+	clgije	%r1, 193, foo
+	clgijnlh	%r1, 193, foo
+
+#CHECK: clgij	%r1, 193, 9, foo                  # encoding: [0xec,0x19,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 9, foo
+
+#CHECK: clgij	%r1, 193, 10, foo                 # encoding: [0xec,0x1a,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijhe	%r1, 193, foo                     # encoding: [0xec,0x1a,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijnl	%r1, 193, foo                     # encoding: [0xec,0x1a,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 10, foo
+	clgijhe	%r1, 193, foo
+	clgijnl	%r1, 193, foo
+
+#CHECK: clgij	%r1, 193, 11, foo                 # encoding: [0xec,0x1b,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 11, foo
+
+#CHECK: clgij	%r1, 193, 12, foo                 # encoding: [0xec,0x1c,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijle	%r1, 193, foo                     # encoding: [0xec,0x1c,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgijnh	%r1, 193, foo                     # encoding: [0xec,0x1c,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 12, foo
+	clgijle	%r1, 193, foo
+	clgijnh	%r1, 193, foo
+
+#CHECK: clgij	%r1, 193, 13, foo                 # encoding: [0xec,0x1d,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 13, foo
+
+#CHECK: clgij	%r1, 193, 14, foo                 # encoding: [0xec,0x1e,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 14, foo
+
+#CHECK: clgij	%r1, 193, 15, foo                 # encoding: [0xec,0x1f,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 15, foo
+
+#CHECK: clgij	%r1, 193, 0, bar+100              # encoding: [0xec,0x10,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 0, bar+100
+
+#CHECK: clgijh	%r1, 193, bar+100                 # encoding: [0xec,0x12,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijh	%r1, 193, bar+100
+
+#CHECK: clgijnle	%r1, 193, bar+100                 # encoding: [0xec,0x12,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijnle	%r1, 193, bar+100
+
+#CHECK: clgijl	%r1, 193, bar+100                 # encoding: [0xec,0x14,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijl	%r1, 193, bar+100
+
+#CHECK: clgijnhe	%r1, 193, bar+100                 # encoding: [0xec,0x14,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijnhe	%r1, 193, bar+100
+
+#CHECK: clgijlh	%r1, 193, bar+100                 # encoding: [0xec,0x16,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijlh	%r1, 193, bar+100
+
+#CHECK: clgijne	%r1, 193, bar+100                 # encoding: [0xec,0x16,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijne	%r1, 193, bar+100
+
+#CHECK: clgije	%r1, 193, bar+100                 # encoding: [0xec,0x18,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgije	%r1, 193, bar+100
+
+#CHECK: clgijnlh	%r1, 193, bar+100                 # encoding: [0xec,0x18,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijnlh	%r1, 193, bar+100
+
+#CHECK: clgijhe	%r1, 193, bar+100                 # encoding: [0xec,0x1a,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijhe	%r1, 193, bar+100
+
+#CHECK: clgijnl	%r1, 193, bar+100                 # encoding: [0xec,0x1a,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijnl	%r1, 193, bar+100
+
+#CHECK: clgijle	%r1, 193, bar+100                 # encoding: [0xec,0x1c,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijle	%r1, 193, bar+100
+
+#CHECK: clgijnh	%r1, 193, bar+100                 # encoding: [0xec,0x1c,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgijnh	%r1, 193, bar+100
+
+#CHECK: clgij	%r1, 193, 0, bar@PLT              # encoding: [0xec,0x10,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgij	%r1, 193, 0, bar@PLT
+
+#CHECK: clgijh	%r1, 193, bar@PLT                 # encoding: [0xec,0x12,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijh	%r1, 193, bar@PLT
+
+#CHECK: clgijnle	%r1, 193, bar@PLT                 # encoding: [0xec,0x12,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijnle	%r1, 193, bar@PLT
+
+#CHECK: clgijl	%r1, 193, bar@PLT                 # encoding: [0xec,0x14,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijl	%r1, 193, bar@PLT
+
+#CHECK: clgijnhe	%r1, 193, bar@PLT                 # encoding: [0xec,0x14,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijnhe	%r1, 193, bar@PLT
+
+#CHECK: clgijlh	%r1, 193, bar@PLT                 # encoding: [0xec,0x16,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijlh	%r1, 193, bar@PLT
+
+#CHECK: clgijne	%r1, 193, bar@PLT                 # encoding: [0xec,0x16,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijne	%r1, 193, bar@PLT
+
+#CHECK: clgije	%r1, 193, bar@PLT                 # encoding: [0xec,0x18,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgije	%r1, 193, bar@PLT
+
+#CHECK: clgijnlh	%r1, 193, bar@PLT                 # encoding: [0xec,0x18,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijnlh	%r1, 193, bar@PLT
+
+#CHECK: clgijhe	%r1, 193, bar@PLT                 # encoding: [0xec,0x1a,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijhe	%r1, 193, bar@PLT
+
+#CHECK: clgijnl	%r1, 193, bar@PLT                 # encoding: [0xec,0x1a,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijnl	%r1, 193, bar@PLT
+
+#CHECK: clgijle	%r1, 193, bar@PLT                 # encoding: [0xec,0x1c,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijle	%r1, 193, bar@PLT
+
+#CHECK: clgijnh	%r1, 193, bar@PLT                 # encoding: [0xec,0x1c,A,A,0xc1,0x7d]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgijnh	%r1, 193, bar@PLT
+
 #CHECK: clgr	%r0, %r0                # encoding: [0xb9,0x21,0x00,0x00]
 #CHECK: clgr	%r0, %r15               # encoding: [0xb9,0x21,0x00,0x0f]
 #CHECK: clgr	%r15, %r0               # encoding: [0xb9,0x21,0x00,0xf0]
@@ -2640,6 +2984,236 @@
 	clgr	%r15,%r0
 	clgr	%r7,%r8
 
+#CHECK: clgrj	%r0, %r0, 0, .[[LAB:L.*]]	# encoding: [0xec,0x00,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+#CHECK: clgrj	%r0, %r15, 0, .[[LAB:L.*]]	# encoding: [0xec,0x0f,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+#CHECK: clgrj	%r15, %r0, 0, .[[LAB:L.*]]	# encoding: [0xec,0xf0,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+#CHECK: clgrj	%r7, %r8, 0, .[[LAB:L.*]]	# encoding: [0xec,0x78,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+	clgrj	%r0,%r0,0,0
+	clgrj	%r0,%r15,0,0
+	clgrj	%r15,%r0,0,0
+	clgrj	%r7,%r8,0,0
+
+#CHECK: clgrj	%r1, %r2, 0, .[[LAB:L.*]]-65536	# encoding: [0xec,0x12,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 0, -0x10000
+#CHECK: clgrj	%r1, %r2, 0, .[[LAB:L.*]]-2	# encoding: [0xec,0x12,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 0, -2
+#CHECK: clgrj	%r1, %r2, 0, .[[LAB:L.*]]		# encoding: [0xec,0x12,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 0, 0
+#CHECK: clgrj	%r1, %r2, 0, .[[LAB:L.*]]+65534	# encoding: [0xec,0x12,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 0, 0xfffe
+
+#CHECK: clgrj	%r1, %r2, 0, foo                  # encoding: [0xec,0x12,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 0, foo
+
+#CHECK: clgrj	%r1, %r2, 1, foo                  # encoding: [0xec,0x12,A,A,0x10,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 1, foo
+
+#CHECK: clgrj	%r1, %r2, 2, foo                  # encoding: [0xec,0x12,A,A,0x20,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjh	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x20,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjnle	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x20,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 2, foo
+	clgrjh	%r1, %r2, foo
+	clgrjnle	%r1, %r2, foo
+
+#CHECK: clgrj	%r1, %r2, 3, foo                  # encoding: [0xec,0x12,A,A,0x30,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 3, foo
+
+#CHECK: clgrj	%r1, %r2, 4, foo                  # encoding: [0xec,0x12,A,A,0x40,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjl	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x40,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjnhe	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x40,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 4, foo
+	clgrjl	%r1, %r2, foo
+	clgrjnhe	%r1, %r2, foo
+
+#CHECK: clgrj	%r1, %r2, 5, foo                  # encoding: [0xec,0x12,A,A,0x50,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 5, foo
+
+#CHECK: clgrj	%r1, %r2, 6, foo                  # encoding: [0xec,0x12,A,A,0x60,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjlh	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x60,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjne	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x60,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 6, foo
+	clgrjlh	%r1, %r2, foo
+	clgrjne	%r1, %r2, foo
+
+#CHECK: clgrj	%r1, %r2, 7, foo                  # encoding: [0xec,0x12,A,A,0x70,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 7, foo
+
+#CHECK: clgrj	%r1, %r2, 8, foo                  # encoding: [0xec,0x12,A,A,0x80,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrje	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x80,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjnlh	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x80,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 8, foo
+	clgrje	%r1, %r2, foo
+	clgrjnlh	%r1, %r2, foo
+
+#CHECK: clgrj	%r1, %r2, 9, foo                  # encoding: [0xec,0x12,A,A,0x90,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 9, foo
+
+#CHECK: clgrj	%r1, %r2, 10, foo                 # encoding: [0xec,0x12,A,A,0xa0,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjhe	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0xa0,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjnl	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0xa0,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 10, foo
+	clgrjhe	%r1, %r2, foo
+	clgrjnl	%r1, %r2, foo
+
+#CHECK: clgrj	%r1, %r2, 11, foo                 # encoding: [0xec,0x12,A,A,0xb0,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 11, foo
+
+#CHECK: clgrj	%r1, %r2, 12, foo                 # encoding: [0xec,0x12,A,A,0xc0,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjle	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0xc0,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clgrjnh	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0xc0,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 12, foo
+	clgrjle	%r1, %r2, foo
+	clgrjnh	%r1, %r2, foo
+
+#CHECK: clgrj	%r1, %r2, 13, foo                 # encoding: [0xec,0x12,A,A,0xd0,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 13, foo
+
+#CHECK: clgrj	%r1, %r2, 14, foo                 # encoding: [0xec,0x12,A,A,0xe0,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 14, foo
+
+#CHECK: clgrj	%r1, %r2, 15, foo                 # encoding: [0xec,0x12,A,A,0xf0,0x65]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 15, foo
+
+#CHECK: clgrj	%r1, %r2, 0, bar+100              # encoding: [0xec,0x12,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 0, bar+100
+
+#CHECK: clgrjh	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x20,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjh	%r1, %r2, bar+100
+
+#CHECK: clgrjnle	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x20,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjnle	%r1, %r2, bar+100
+
+#CHECK: clgrjl	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x40,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjl	%r1, %r2, bar+100
+
+#CHECK: clgrjnhe	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x40,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjnhe	%r1, %r2, bar+100
+
+#CHECK: clgrjlh	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x60,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjlh	%r1, %r2, bar+100
+
+#CHECK: clgrjne	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x60,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjne	%r1, %r2, bar+100
+
+#CHECK: clgrje	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x80,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrje	%r1, %r2, bar+100
+
+#CHECK: clgrjnlh	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x80,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjnlh	%r1, %r2, bar+100
+
+#CHECK: clgrjhe	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0xa0,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjhe	%r1, %r2, bar+100
+
+#CHECK: clgrjnl	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0xa0,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjnl	%r1, %r2, bar+100
+
+#CHECK: clgrjle	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0xc0,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjle	%r1, %r2, bar+100
+
+#CHECK: clgrjnh	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0xc0,0x65]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clgrjnh	%r1, %r2, bar+100
+
+#CHECK: clgrj	%r1, %r2, 0, bar@PLT              # encoding: [0xec,0x12,A,A,0x00,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrj	%r1, %r2, 0, bar@PLT
+
+#CHECK: clgrjh	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x20,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjh	%r1, %r2, bar@PLT
+
+#CHECK: clgrjnle	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x20,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjnle	%r1, %r2, bar@PLT
+
+#CHECK: clgrjl	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x40,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjl	%r1, %r2, bar@PLT
+
+#CHECK: clgrjnhe	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x40,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjnhe	%r1, %r2, bar@PLT
+
+#CHECK: clgrjlh	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x60,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjlh	%r1, %r2, bar@PLT
+
+#CHECK: clgrjne	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x60,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjne	%r1, %r2, bar@PLT
+
+#CHECK: clgrje	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x80,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrje	%r1, %r2, bar@PLT
+
+#CHECK: clgrjnlh	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x80,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjnlh	%r1, %r2, bar@PLT
+
+#CHECK: clgrjhe	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0xa0,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjhe	%r1, %r2, bar@PLT
+
+#CHECK: clgrjnl	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0xa0,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjnl	%r1, %r2, bar@PLT
+
+#CHECK: clgrjle	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0xc0,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjle	%r1, %r2, bar@PLT
+
+#CHECK: clgrjnh	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0xc0,0x65]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clgrjnh	%r1, %r2, bar@PLT
+
 #CHECK: clgrl	%r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x0a,A,A,A,A]
 #CHECK:  fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL
 	clgrl	%r0, -0x100000000
@@ -2746,6 +3320,233 @@
 	cli	4095(%r1), 42
 	cli	4095(%r15), 42
 
+#CHECK: clij	%r0, 0, 0, .[[LAB:L.*]]	# encoding: [0xec,0x00,A,A,0x00,0x7f]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+#CHECK: clij	%r0, 255, 0, .[[LAB:L.*]]	# encoding: [0xec,0x00,A,A,0xff,0x7f]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+#CHECK: clij	%r15, 0, 0, .[[LAB:L.*]]	# encoding: [0xec,0xf0,A,A,0x00,0x7f]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+	clij	%r0, 0, 0, 0
+	clij	%r0, 255, 0, 0
+	clij	%r15, 0, 0, 0
+
+#CHECK: clij	%r1, 193, 0, .[[LAB:L.*]]-65536	# encoding: [0xec,0x10,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 0, -0x10000
+#CHECK: clij	%r1, 193, 0, .[[LAB:L.*]]-2	# encoding: [0xec,0x10,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 0, -2
+#CHECK: clij	%r1, 193, 0, .[[LAB:L.*]]		# encoding: [0xec,0x10,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 0, 0
+#CHECK: clij	%r1, 193, 0, .[[LAB:L.*]]+65534	# encoding: [0xec,0x10,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 0, 0xfffe
+
+#CHECK: clij	%r1, 193, 0, foo                  # encoding: [0xec,0x10,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 0, foo
+
+#CHECK: clij	%r1, 193, 1, foo                  # encoding: [0xec,0x11,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 1, foo
+
+#CHECK: clij	%r1, 193, 2, foo                  # encoding: [0xec,0x12,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijh	%r1, 193, foo                     # encoding: [0xec,0x12,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijnle	%r1, 193, foo                     # encoding: [0xec,0x12,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 2, foo
+	clijh	%r1, 193, foo
+	clijnle	%r1, 193, foo
+
+#CHECK: clij	%r1, 193, 3, foo                  # encoding: [0xec,0x13,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 3, foo
+
+#CHECK: clij	%r1, 193, 4, foo                  # encoding: [0xec,0x14,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijl	%r1, 193, foo                     # encoding: [0xec,0x14,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijnhe	%r1, 193, foo                     # encoding: [0xec,0x14,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 4, foo
+	clijl	%r1, 193, foo
+	clijnhe	%r1, 193, foo
+
+#CHECK: clij	%r1, 193, 5, foo                  # encoding: [0xec,0x15,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 5, foo
+
+#CHECK: clij	%r1, 193, 6, foo                  # encoding: [0xec,0x16,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijlh	%r1, 193, foo                     # encoding: [0xec,0x16,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijne	%r1, 193, foo                     # encoding: [0xec,0x16,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 6, foo
+	clijlh	%r1, 193, foo
+	clijne	%r1, 193, foo
+
+#CHECK: clij	%r1, 193, 7, foo                  # encoding: [0xec,0x17,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 7, foo
+
+#CHECK: clij	%r1, 193, 8, foo                  # encoding: [0xec,0x18,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clije	%r1, 193, foo                     # encoding: [0xec,0x18,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijnlh	%r1, 193, foo                     # encoding: [0xec,0x18,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 8, foo
+	clije	%r1, 193, foo
+	clijnlh	%r1, 193, foo
+
+#CHECK: clij	%r1, 193, 9, foo                  # encoding: [0xec,0x19,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 9, foo
+
+#CHECK: clij	%r1, 193, 10, foo                 # encoding: [0xec,0x1a,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijhe	%r1, 193, foo                     # encoding: [0xec,0x1a,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijnl	%r1, 193, foo                     # encoding: [0xec,0x1a,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 10, foo
+	clijhe	%r1, 193, foo
+	clijnl	%r1, 193, foo
+
+#CHECK: clij	%r1, 193, 11, foo                 # encoding: [0xec,0x1b,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 11, foo
+
+#CHECK: clij	%r1, 193, 12, foo                 # encoding: [0xec,0x1c,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijle	%r1, 193, foo                     # encoding: [0xec,0x1c,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clijnh	%r1, 193, foo                     # encoding: [0xec,0x1c,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 12, foo
+	clijle	%r1, 193, foo
+	clijnh	%r1, 193, foo
+
+#CHECK: clij	%r1, 193, 13, foo                 # encoding: [0xec,0x1d,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 13, foo
+
+#CHECK: clij	%r1, 193, 14, foo                 # encoding: [0xec,0x1e,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 14, foo
+
+#CHECK: clij	%r1, 193, 15, foo                 # encoding: [0xec,0x1f,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 15, foo
+
+#CHECK: clij	%r1, 193, 0, bar+100              # encoding: [0xec,0x10,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 0, bar+100
+
+#CHECK: clijh	%r1, 193, bar+100                 # encoding: [0xec,0x12,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijh	%r1, 193, bar+100
+
+#CHECK: clijnle	%r1, 193, bar+100                 # encoding: [0xec,0x12,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijnle	%r1, 193, bar+100
+
+#CHECK: clijl	%r1, 193, bar+100                 # encoding: [0xec,0x14,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijl	%r1, 193, bar+100
+
+#CHECK: clijnhe	%r1, 193, bar+100                 # encoding: [0xec,0x14,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijnhe	%r1, 193, bar+100
+
+#CHECK: clijlh	%r1, 193, bar+100                 # encoding: [0xec,0x16,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijlh	%r1, 193, bar+100
+
+#CHECK: clijne	%r1, 193, bar+100                 # encoding: [0xec,0x16,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijne	%r1, 193, bar+100
+
+#CHECK: clije	%r1, 193, bar+100                 # encoding: [0xec,0x18,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clije	%r1, 193, bar+100
+
+#CHECK: clijnlh	%r1, 193, bar+100                 # encoding: [0xec,0x18,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijnlh	%r1, 193, bar+100
+
+#CHECK: clijhe	%r1, 193, bar+100                 # encoding: [0xec,0x1a,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijhe	%r1, 193, bar+100
+
+#CHECK: clijnl	%r1, 193, bar+100                 # encoding: [0xec,0x1a,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijnl	%r1, 193, bar+100
+
+#CHECK: clijle	%r1, 193, bar+100                 # encoding: [0xec,0x1c,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijle	%r1, 193, bar+100
+
+#CHECK: clijnh	%r1, 193, bar+100                 # encoding: [0xec,0x1c,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clijnh	%r1, 193, bar+100
+
+#CHECK: clij	%r1, 193, 0, bar@PLT              # encoding: [0xec,0x10,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clij	%r1, 193, 0, bar@PLT
+
+#CHECK: clijh	%r1, 193, bar@PLT                 # encoding: [0xec,0x12,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijh	%r1, 193, bar@PLT
+
+#CHECK: clijnle	%r1, 193, bar@PLT                 # encoding: [0xec,0x12,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijnle	%r1, 193, bar@PLT
+
+#CHECK: clijl	%r1, 193, bar@PLT                 # encoding: [0xec,0x14,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijl	%r1, 193, bar@PLT
+
+#CHECK: clijnhe	%r1, 193, bar@PLT                 # encoding: [0xec,0x14,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijnhe	%r1, 193, bar@PLT
+
+#CHECK: clijlh	%r1, 193, bar@PLT                 # encoding: [0xec,0x16,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijlh	%r1, 193, bar@PLT
+
+#CHECK: clijne	%r1, 193, bar@PLT                 # encoding: [0xec,0x16,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijne	%r1, 193, bar@PLT
+
+#CHECK: clije	%r1, 193, bar@PLT                 # encoding: [0xec,0x18,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clije	%r1, 193, bar@PLT
+
+#CHECK: clijnlh	%r1, 193, bar@PLT                 # encoding: [0xec,0x18,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijnlh	%r1, 193, bar@PLT
+
+#CHECK: clijhe	%r1, 193, bar@PLT                 # encoding: [0xec,0x1a,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijhe	%r1, 193, bar@PLT
+
+#CHECK: clijnl	%r1, 193, bar@PLT                 # encoding: [0xec,0x1a,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijnl	%r1, 193, bar@PLT
+
+#CHECK: clijle	%r1, 193, bar@PLT                 # encoding: [0xec,0x1c,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijle	%r1, 193, bar@PLT
+
+#CHECK: clijnh	%r1, 193, bar@PLT                 # encoding: [0xec,0x1c,A,A,0xc1,0x7f]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clijnh	%r1, 193, bar@PLT
+
 #CHECK: cliy	-524288, 0              # encoding: [0xeb,0x00,0x00,0x00,0x80,0x55]
 #CHECK: cliy	-1, 0                   # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x55]
 #CHECK: cliy	0, 0                    # encoding: [0xeb,0x00,0x00,0x00,0x00,0x55]
@@ -2778,6 +3579,236 @@
 	clr	%r15,%r0
 	clr	%r7,%r8
 
+#CHECK: clrj	%r0, %r0, 0, .[[LAB:L.*]]	# encoding: [0xec,0x00,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+#CHECK: clrj	%r0, %r15, 0, .[[LAB:L.*]]	# encoding: [0xec,0x0f,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+#CHECK: clrj	%r15, %r0, 0, .[[LAB:L.*]]	# encoding: [0xec,0xf0,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+#CHECK: clrj	%r7, %r8, 0, .[[LAB:L.*]]	# encoding: [0xec,0x78,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+	clrj	%r0,%r0,0,0
+	clrj	%r0,%r15,0,0
+	clrj	%r15,%r0,0,0
+	clrj	%r7,%r8,0,0
+
+#CHECK: clrj	%r1, %r2, 0, .[[LAB:L.*]]-65536	# encoding: [0xec,0x12,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 0, -0x10000
+#CHECK: clrj	%r1, %r2, 0, .[[LAB:L.*]]-2	# encoding: [0xec,0x12,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 0, -2
+#CHECK: clrj	%r1, %r2, 0, .[[LAB:L.*]]		# encoding: [0xec,0x12,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 0, 0
+#CHECK: clrj	%r1, %r2, 0, .[[LAB:L.*]]+65534	# encoding: [0xec,0x12,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 0, 0xfffe
+
+#CHECK: clrj	%r1, %r2, 0, foo                  # encoding: [0xec,0x12,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 0, foo
+
+#CHECK: clrj	%r1, %r2, 1, foo                  # encoding: [0xec,0x12,A,A,0x10,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 1, foo
+
+#CHECK: clrj	%r1, %r2, 2, foo                  # encoding: [0xec,0x12,A,A,0x20,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjh	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x20,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjnle	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x20,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 2, foo
+	clrjh	%r1, %r2, foo
+	clrjnle	%r1, %r2, foo
+
+#CHECK: clrj	%r1, %r2, 3, foo                  # encoding: [0xec,0x12,A,A,0x30,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 3, foo
+
+#CHECK: clrj	%r1, %r2, 4, foo                  # encoding: [0xec,0x12,A,A,0x40,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjl	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x40,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjnhe	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x40,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 4, foo
+	clrjl	%r1, %r2, foo
+	clrjnhe	%r1, %r2, foo
+
+#CHECK: clrj	%r1, %r2, 5, foo                  # encoding: [0xec,0x12,A,A,0x50,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 5, foo
+
+#CHECK: clrj	%r1, %r2, 6, foo                  # encoding: [0xec,0x12,A,A,0x60,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjlh	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x60,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjne	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x60,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 6, foo
+	clrjlh	%r1, %r2, foo
+	clrjne	%r1, %r2, foo
+
+#CHECK: clrj	%r1, %r2, 7, foo                  # encoding: [0xec,0x12,A,A,0x70,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 7, foo
+
+#CHECK: clrj	%r1, %r2, 8, foo                  # encoding: [0xec,0x12,A,A,0x80,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrje	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x80,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjnlh	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0x80,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 8, foo
+	clrje	%r1, %r2, foo
+	clrjnlh	%r1, %r2, foo
+
+#CHECK: clrj	%r1, %r2, 9, foo                  # encoding: [0xec,0x12,A,A,0x90,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 9, foo
+
+#CHECK: clrj	%r1, %r2, 10, foo                 # encoding: [0xec,0x12,A,A,0xa0,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjhe	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0xa0,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjnl	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0xa0,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 10, foo
+	clrjhe	%r1, %r2, foo
+	clrjnl	%r1, %r2, foo
+
+#CHECK: clrj	%r1, %r2, 11, foo                 # encoding: [0xec,0x12,A,A,0xb0,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 11, foo
+
+#CHECK: clrj	%r1, %r2, 12, foo                 # encoding: [0xec,0x12,A,A,0xc0,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjle	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0xc0,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+#CHECK: clrjnh	%r1, %r2, foo                     # encoding: [0xec,0x12,A,A,0xc0,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 12, foo
+	clrjle	%r1, %r2, foo
+	clrjnh	%r1, %r2, foo
+
+#CHECK: clrj	%r1, %r2, 13, foo                 # encoding: [0xec,0x12,A,A,0xd0,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 13, foo
+
+#CHECK: clrj	%r1, %r2, 14, foo                 # encoding: [0xec,0x12,A,A,0xe0,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 14, foo
+
+#CHECK: clrj	%r1, %r2, 15, foo                 # encoding: [0xec,0x12,A,A,0xf0,0x77]
+#CHECK:  fixup A - offset: 2, value: foo+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 15, foo
+
+#CHECK: clrj	%r1, %r2, 0, bar+100              # encoding: [0xec,0x12,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 0, bar+100
+
+#CHECK: clrjh	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x20,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjh	%r1, %r2, bar+100
+
+#CHECK: clrjnle	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x20,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjnle	%r1, %r2, bar+100
+
+#CHECK: clrjl	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x40,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjl	%r1, %r2, bar+100
+
+#CHECK: clrjnhe	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x40,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjnhe	%r1, %r2, bar+100
+
+#CHECK: clrjlh	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x60,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjlh	%r1, %r2, bar+100
+
+#CHECK: clrjne	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x60,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjne	%r1, %r2, bar+100
+
+#CHECK: clrje	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x80,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrje	%r1, %r2, bar+100
+
+#CHECK: clrjnlh	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0x80,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjnlh	%r1, %r2, bar+100
+
+#CHECK: clrjhe	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0xa0,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjhe	%r1, %r2, bar+100
+
+#CHECK: clrjnl	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0xa0,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjnl	%r1, %r2, bar+100
+
+#CHECK: clrjle	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0xc0,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjle	%r1, %r2, bar+100
+
+#CHECK: clrjnh	%r1, %r2, bar+100                 # encoding: [0xec,0x12,A,A,0xc0,0x77]
+#CHECK:  fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+	clrjnh	%r1, %r2, bar+100
+
+#CHECK: clrj	%r1, %r2, 0, bar@PLT              # encoding: [0xec,0x12,A,A,0x00,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrj	%r1, %r2, 0, bar@PLT
+
+#CHECK: clrjh	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x20,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjh	%r1, %r2, bar@PLT
+
+#CHECK: clrjnle	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x20,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjnle	%r1, %r2, bar@PLT
+
+#CHECK: clrjl	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x40,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjl	%r1, %r2, bar@PLT
+
+#CHECK: clrjnhe	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x40,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjnhe	%r1, %r2, bar@PLT
+
+#CHECK: clrjlh	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x60,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjlh	%r1, %r2, bar@PLT
+
+#CHECK: clrjne	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x60,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjne	%r1, %r2, bar@PLT
+
+#CHECK: clrje	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x80,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrje	%r1, %r2, bar@PLT
+
+#CHECK: clrjnlh	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0x80,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjnlh	%r1, %r2, bar@PLT
+
+#CHECK: clrjhe	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0xa0,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjhe	%r1, %r2, bar@PLT
+
+#CHECK: clrjnl	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0xa0,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjnl	%r1, %r2, bar@PLT
+
+#CHECK: clrjle	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0xc0,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjle	%r1, %r2, bar@PLT
+
+#CHECK: clrjnh	%r1, %r2, bar@PLT                 # encoding: [0xec,0x12,A,A,0xc0,0x77]
+#CHECK:  fixup A - offset: 2, value: bar@PLT+2, kind: FK_390_PC16DBL
+	clrjnh	%r1, %r2, bar@PLT
+
 #CHECK: clrl	%r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x0f,A,A,A,A]
 #CHECK:  fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL
 	clrl	%r0, -0x100000000
@@ -2815,6 +3846,16 @@
 	clrl	%r7,frob@PLT
 	clrl	%r8,frob@PLT
 
+#CHECK: clst	%r0, %r0                # encoding: [0xb2,0x5d,0x00,0x00]
+#CHECK: clst	%r0, %r15               # encoding: [0xb2,0x5d,0x00,0x0f]
+#CHECK: clst	%r15, %r0               # encoding: [0xb2,0x5d,0x00,0xf0]
+#CHECK: clst	%r7, %r8                # encoding: [0xb2,0x5d,0x00,0x78]
+
+	clst	%r0,%r0
+	clst	%r0,%r15
+	clst	%r15,%r0
+	clst	%r7,%r8
+
 #CHECK: cly	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x55]
 #CHECK: cly	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x55]
 #CHECK: cly	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x55]
@@ -3593,6 +4634,14 @@
 	iill	%r0, 0xffff
 	iill	%r15, 0
 
+#CHECK: ipm	%r0                     # encoding: [0xb2,0x22,0x00,0x00]
+#CHECK: ipm	%r1                     # encoding: [0xb2,0x22,0x00,0x10]
+#CHECK: ipm	%r15                    # encoding: [0xb2,0x22,0x00,0xf0]
+
+	ipm	%r0
+	ipm	%r1
+	ipm	%r15
+
 #CHECK: l	%r0, 0                  # encoding: [0x58,0x00,0x00,0x00]
 #CHECK: l	%r0, 4095               # encoding: [0x58,0x00,0x0f,0xff]
 #CHECK: l	%r0, 0(%r1)             # encoding: [0x58,0x00,0x10,0x00]
@@ -4683,6 +5732,36 @@
 	lnebr	%f15,%f0
 	lnebr	%f15,%f9
 
+#CHECK: lngfr	%r0, %r0                # encoding: [0xb9,0x11,0x00,0x00]
+#CHECK: lngfr	%r0, %r15               # encoding: [0xb9,0x11,0x00,0x0f]
+#CHECK: lngfr	%r15, %r0               # encoding: [0xb9,0x11,0x00,0xf0]
+#CHECK: lngfr	%r7, %r8                # encoding: [0xb9,0x11,0x00,0x78]
+
+	lngfr	%r0,%r0
+	lngfr	%r0,%r15
+	lngfr	%r15,%r0
+	lngfr	%r7,%r8
+
+#CHECK: lngr	%r0, %r0                # encoding: [0xb9,0x01,0x00,0x00]
+#CHECK: lngr	%r0, %r15               # encoding: [0xb9,0x01,0x00,0x0f]
+#CHECK: lngr	%r15, %r0               # encoding: [0xb9,0x01,0x00,0xf0]
+#CHECK: lngr	%r7, %r8                # encoding: [0xb9,0x01,0x00,0x78]
+
+	lngr	%r0,%r0
+	lngr	%r0,%r15
+	lngr	%r15,%r0
+	lngr	%r7,%r8
+
+#CHECK: lnr	%r0, %r0                # encoding: [0x11,0x00]
+#CHECK: lnr	%r0, %r15               # encoding: [0x11,0x0f]
+#CHECK: lnr	%r15, %r0               # encoding: [0x11,0xf0]
+#CHECK: lnr	%r7, %r8                # encoding: [0x11,0x78]
+
+	lnr	%r0,%r0
+	lnr	%r0,%r15
+	lnr	%r15,%r0
+	lnr	%r7,%r8
+
 #CHECK: lnxbr	%f0, %f8                # encoding: [0xb3,0x41,0x00,0x08]
 #CHECK: lnxbr	%f0, %f13               # encoding: [0xb3,0x41,0x00,0x0d]
 #CHECK: lnxbr	%f13, %f0               # encoding: [0xb3,0x41,0x00,0xd0]
@@ -4713,6 +5792,36 @@
 	lpebr	%f15,%f0
 	lpebr	%f15,%f9
 
+#CHECK: lpgfr	%r0, %r0                # encoding: [0xb9,0x10,0x00,0x00]
+#CHECK: lpgfr	%r0, %r15               # encoding: [0xb9,0x10,0x00,0x0f]
+#CHECK: lpgfr	%r15, %r0               # encoding: [0xb9,0x10,0x00,0xf0]
+#CHECK: lpgfr	%r7, %r8                # encoding: [0xb9,0x10,0x00,0x78]
+
+	lpgfr	%r0,%r0
+	lpgfr	%r0,%r15
+	lpgfr	%r15,%r0
+	lpgfr	%r7,%r8
+
+#CHECK: lpgr	%r0, %r0                # encoding: [0xb9,0x00,0x00,0x00]
+#CHECK: lpgr	%r0, %r15               # encoding: [0xb9,0x00,0x00,0x0f]
+#CHECK: lpgr	%r15, %r0               # encoding: [0xb9,0x00,0x00,0xf0]
+#CHECK: lpgr	%r7, %r8                # encoding: [0xb9,0x00,0x00,0x78]
+
+	lpgr	%r0,%r0
+	lpgr	%r0,%r15
+	lpgr	%r15,%r0
+	lpgr	%r7,%r8
+
+#CHECK: lpr	%r0, %r0                # encoding: [0x10,0x00]
+#CHECK: lpr	%r0, %r15               # encoding: [0x10,0x0f]
+#CHECK: lpr	%r15, %r0               # encoding: [0x10,0xf0]
+#CHECK: lpr	%r7, %r8                # encoding: [0x10,0x78]
+
+	lpr	%r0,%r0
+	lpr	%r0,%r15
+	lpr	%r15,%r0
+	lpr	%r7,%r8
+
 #CHECK: lpxbr	%f0, %f8                # encoding: [0xb3,0x40,0x00,0x08]
 #CHECK: lpxbr	%f0, %f13               # encoding: [0xb3,0x40,0x00,0x0d]
 #CHECK: lpxbr	%f13, %f0               # encoding: [0xb3,0x40,0x00,0xd0]
@@ -5608,6 +6717,16 @@
 	mviy	524287(%r1), 42
 	mviy	524287(%r15), 42
 
+#CHECK: mvst	%r0, %r0                # encoding: [0xb2,0x55,0x00,0x00]
+#CHECK: mvst	%r0, %r15               # encoding: [0xb2,0x55,0x00,0x0f]
+#CHECK: mvst	%r15, %r0               # encoding: [0xb2,0x55,0x00,0xf0]
+#CHECK: mvst	%r7, %r8                # encoding: [0xb2,0x55,0x00,0x78]
+
+	mvst	%r0,%r0
+	mvst	%r0,%r15
+	mvst	%r15,%r0
+	mvst	%r7,%r8
+
 #CHECK: mxbr	%f0, %f0                # encoding: [0xb3,0x4c,0x00,0x00]
 #CHECK: mxbr	%f0, %f13               # encoding: [0xb3,0x4c,0x00,0x0d]
 #CHECK: mxbr	%f8, %f5                # encoding: [0xb3,0x4c,0x00,0x85]
@@ -5660,6 +6779,32 @@
 	n	%r0, 4095(%r15,%r1)
 	n	%r15, 0
 
+#CHECK: nc	0(1), 0                 # encoding: [0xd4,0x00,0x00,0x00,0x00,0x00]
+#CHECK: nc	0(1), 0(%r1)            # encoding: [0xd4,0x00,0x00,0x00,0x10,0x00]
+#CHECK: nc	0(1), 0(%r15)           # encoding: [0xd4,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: nc	0(1), 4095              # encoding: [0xd4,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: nc	0(1), 4095(%r1)         # encoding: [0xd4,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: nc	0(1), 4095(%r15)        # encoding: [0xd4,0x00,0x00,0x00,0xff,0xff]
+#CHECK: nc	0(1,%r1), 0             # encoding: [0xd4,0x00,0x10,0x00,0x00,0x00]
+#CHECK: nc	0(1,%r15), 0            # encoding: [0xd4,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: nc	4095(1,%r1), 0          # encoding: [0xd4,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: nc	4095(1,%r15), 0         # encoding: [0xd4,0x00,0xff,0xff,0x00,0x00]
+#CHECK: nc	0(256,%r1), 0           # encoding: [0xd4,0xff,0x10,0x00,0x00,0x00]
+#CHECK: nc	0(256,%r15), 0          # encoding: [0xd4,0xff,0xf0,0x00,0x00,0x00]
+
+	nc	0(1), 0
+	nc	0(1), 0(%r1)
+	nc	0(1), 0(%r15)
+	nc	0(1), 4095
+	nc	0(1), 4095(%r1)
+	nc	0(1), 4095(%r15)
+	nc	0(1,%r1), 0
+	nc	0(1,%r15), 0
+	nc	4095(1,%r1), 0
+	nc	4095(1,%r15), 0
+	nc	0(256,%r1), 0
+	nc	0(256,%r15), 0
+
 #CHECK: ng	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x80]
 #CHECK: ng	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x80]
 #CHECK: ng	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x80]
@@ -5834,6 +6979,32 @@
 	o	%r0, 4095(%r15,%r1)
 	o	%r15, 0
 
+#CHECK: oc	0(1), 0                 # encoding: [0xd6,0x00,0x00,0x00,0x00,0x00]
+#CHECK: oc	0(1), 0(%r1)            # encoding: [0xd6,0x00,0x00,0x00,0x10,0x00]
+#CHECK: oc	0(1), 0(%r15)           # encoding: [0xd6,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: oc	0(1), 4095              # encoding: [0xd6,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: oc	0(1), 4095(%r1)         # encoding: [0xd6,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: oc	0(1), 4095(%r15)        # encoding: [0xd6,0x00,0x00,0x00,0xff,0xff]
+#CHECK: oc	0(1,%r1), 0             # encoding: [0xd6,0x00,0x10,0x00,0x00,0x00]
+#CHECK: oc	0(1,%r15), 0            # encoding: [0xd6,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: oc	4095(1,%r1), 0          # encoding: [0xd6,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: oc	4095(1,%r15), 0         # encoding: [0xd6,0x00,0xff,0xff,0x00,0x00]
+#CHECK: oc	0(256,%r1), 0           # encoding: [0xd6,0xff,0x10,0x00,0x00,0x00]
+#CHECK: oc	0(256,%r15), 0          # encoding: [0xd6,0xff,0xf0,0x00,0x00,0x00]
+
+	oc	0(1), 0
+	oc	0(1), 0(%r1)
+	oc	0(1), 0(%r15)
+	oc	0(1), 4095
+	oc	0(1), 4095(%r1)
+	oc	0(1), 4095(%r15)
+	oc	0(1,%r1), 0
+	oc	0(1,%r15), 0
+	oc	4095(1,%r1), 0
+	oc	4095(1,%r15), 0
+	oc	0(256,%r1), 0
+	oc	0(256,%r15), 0
+
 #CHECK: og	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x81]
 #CHECK: og	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x81]
 #CHECK: og	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x81]
@@ -5992,6 +7163,65 @@
 	oy	%r0, 524287(%r15,%r1)
 	oy	%r15, 0
 
+#CHECK: pfd	0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x36]
+#CHECK: pfd	0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x36]
+#CHECK: pfd	0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x36]
+#CHECK: pfd	0, 1                  # encoding: [0xe3,0x00,0x00,0x01,0x00,0x36]
+#CHECK: pfd	0, 524287             # encoding: [0xe3,0x00,0x0f,0xff,0x7f,0x36]
+#CHECK: pfd	0, 0(%r1)             # encoding: [0xe3,0x00,0x10,0x00,0x00,0x36]
+#CHECK: pfd	0, 0(%r15)            # encoding: [0xe3,0x00,0xf0,0x00,0x00,0x36]
+#CHECK: pfd	0, 524287(%r1,%r15)   # encoding: [0xe3,0x01,0xff,0xff,0x7f,0x36]
+#CHECK: pfd	0, 524287(%r15,%r1)   # encoding: [0xe3,0x0f,0x1f,0xff,0x7f,0x36]
+#CHECK: pfd	15, 0                 # encoding: [0xe3,0xf0,0x00,0x00,0x00,0x36]
+
+	pfd	0, -524288
+	pfd	0, -1
+	pfd	0, 0
+	pfd	0, 1
+	pfd	0, 524287
+	pfd	0, 0(%r1)
+	pfd	0, 0(%r15)
+	pfd	0, 524287(%r1,%r15)
+	pfd	0, 524287(%r15,%r1)
+	pfd	15, 0
+
+#CHECK: pfdrl	0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x02,A,A,A,A]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL
+	pfdrl	0, -0x100000000
+#CHECK: pfdrl	0, .[[LAB:L.*]]-2	# encoding: [0xc6,0x02,A,A,A,A]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL
+	pfdrl	0, -2
+#CHECK: pfdrl	0, .[[LAB:L.*]]	# encoding: [0xc6,0x02,A,A,A,A]
+#CHECK:  fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL
+	pfdrl	0, 0
+#CHECK: pfdrl	0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x02,A,A,A,A]
+#CHECK:  fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL
+	pfdrl	0, 0xfffffffe
+
+#CHECK: pfdrl	0, foo                # encoding: [0xc6,0x02,A,A,A,A]
+# fixup A - offset: 2, value: foo+2, kind: FK_390_PC32DBL
+#CHECK: pfdrl	15, foo               # encoding: [0xc6,0xf2,A,A,A,A]
+# fixup A - offset: 2, value: foo+2, kind: FK_390_PC32DBL
+
+	pfdrl	0, foo
+	pfdrl	15, foo
+
+#CHECK: pfdrl	3, bar+100            # encoding: [0xc6,0x32,A,A,A,A]
+# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL
+#CHECK: pfdrl	4, bar+100            # encoding: [0xc6,0x42,A,A,A,A]
+# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL
+
+	pfdrl	3, bar+100
+	pfdrl	4, bar+100
+
+#CHECK: pfdrl	7, frob@PLT           # encoding: [0xc6,0x72,A,A,A,A]
+# fixup A - offset: 2, value: frob@PLT+2, kind: FK_390_PC32DBL
+#CHECK: pfdrl	8, frob@PLT           # encoding: [0xc6,0x82,A,A,A,A]
+# fixup A - offset: 2, value: frob@PLT+2, kind: FK_390_PC32DBL
+
+	pfdrl	7, frob@PLT
+	pfdrl	8, frob@PLT
+
 #CHECK: risbg	%r0, %r0, 0, 0, 0       # encoding: [0xec,0x00,0x00,0x00,0x00,0x55]
 #CHECK: risbg	%r0, %r0, 0, 0, 63      # encoding: [0xec,0x00,0x00,0x00,0x3f,0x55]
 #CHECK: risbg	%r0, %r0, 0, 255, 0     # encoding: [0xec,0x00,0x00,0xff,0x00,0x55]
@@ -6674,6 +7904,16 @@
 	srlg	%r0,%r0,524287(%r1)
 	srlg	%r0,%r0,524287(%r15)
 
+#CHECK: srst	%r0, %r0                # encoding: [0xb2,0x5e,0x00,0x00]
+#CHECK: srst	%r0, %r15               # encoding: [0xb2,0x5e,0x00,0x0f]
+#CHECK: srst	%r15, %r0               # encoding: [0xb2,0x5e,0x00,0xf0]
+#CHECK: srst	%r7, %r8                # encoding: [0xb2,0x5e,0x00,0x78]
+
+	srst	%r0,%r0
+	srst	%r0,%r15
+	srst	%r15,%r0
+	srst	%r7,%r8
+
 #CHECK: st	%r0, 0                  # encoding: [0x50,0x00,0x00,0x00]
 #CHECK: st	%r0, 4095               # encoding: [0x50,0x00,0x0f,0xff]
 #CHECK: st	%r0, 0(%r1)             # encoding: [0x50,0x00,0x10,0x00]
@@ -7101,6 +8341,84 @@
 	sy	%r0, 524287(%r15,%r1)
 	sy	%r15, 0
 
+#CHECK: tm	0, 0                    # encoding: [0x91,0x00,0x00,0x00]
+#CHECK: tm	4095, 0                 # encoding: [0x91,0x00,0x0f,0xff]
+#CHECK: tm	0, 255                  # encoding: [0x91,0xff,0x00,0x00]
+#CHECK: tm	0(%r1), 42              # encoding: [0x91,0x2a,0x10,0x00]
+#CHECK: tm	0(%r15), 42             # encoding: [0x91,0x2a,0xf0,0x00]
+#CHECK: tm	4095(%r1), 42           # encoding: [0x91,0x2a,0x1f,0xff]
+#CHECK: tm	4095(%r15), 42          # encoding: [0x91,0x2a,0xff,0xff]
+
+	tm	0, 0
+	tm	4095, 0
+	tm	0, 255
+	tm	0(%r1), 42
+	tm	0(%r15), 42
+	tm	4095(%r1), 42
+	tm	4095(%r15), 42
+
+#CHECK: tmhh	%r0, 0                  # encoding: [0xa7,0x02,0x00,0x00]
+#CHECK: tmhh	%r0, 32768              # encoding: [0xa7,0x02,0x80,0x00]
+#CHECK: tmhh	%r0, 65535              # encoding: [0xa7,0x02,0xff,0xff]
+#CHECK: tmhh	%r15, 0                 # encoding: [0xa7,0xf2,0x00,0x00]
+
+	tmhh	%r0, 0
+	tmhh	%r0, 0x8000
+	tmhh	%r0, 0xffff
+	tmhh	%r15, 0
+
+#CHECK: tmhl	%r0, 0                  # encoding: [0xa7,0x03,0x00,0x00]
+#CHECK: tmhl	%r0, 32768              # encoding: [0xa7,0x03,0x80,0x00]
+#CHECK: tmhl	%r0, 65535              # encoding: [0xa7,0x03,0xff,0xff]
+#CHECK: tmhl	%r15, 0                 # encoding: [0xa7,0xf3,0x00,0x00]
+
+	tmhl	%r0, 0
+	tmhl	%r0, 0x8000
+	tmhl	%r0, 0xffff
+	tmhl	%r15, 0
+
+#CHECK: tmlh	%r0, 0                  # encoding: [0xa7,0x00,0x00,0x00]
+#CHECK: tmlh	%r0, 32768              # encoding: [0xa7,0x00,0x80,0x00]
+#CHECK: tmlh	%r0, 65535              # encoding: [0xa7,0x00,0xff,0xff]
+#CHECK: tmlh	%r15, 0                 # encoding: [0xa7,0xf0,0x00,0x00]
+
+	tmlh	%r0, 0
+	tmlh	%r0, 0x8000
+	tmlh	%r0, 0xffff
+	tmlh	%r15, 0
+
+#CHECK: tmll	%r0, 0                  # encoding: [0xa7,0x01,0x00,0x00]
+#CHECK: tmll	%r0, 32768              # encoding: [0xa7,0x01,0x80,0x00]
+#CHECK: tmll	%r0, 65535              # encoding: [0xa7,0x01,0xff,0xff]
+#CHECK: tmll	%r15, 0                 # encoding: [0xa7,0xf1,0x00,0x00]
+
+	tmll	%r0, 0
+	tmll	%r0, 0x8000
+	tmll	%r0, 0xffff
+	tmll	%r15, 0
+
+#CHECK: tmy	-524288, 0              # encoding: [0xeb,0x00,0x00,0x00,0x80,0x51]
+#CHECK: tmy	-1, 0                   # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x51]
+#CHECK: tmy	0, 0                    # encoding: [0xeb,0x00,0x00,0x00,0x00,0x51]
+#CHECK: tmy	1, 0                    # encoding: [0xeb,0x00,0x00,0x01,0x00,0x51]
+#CHECK: tmy	524287, 0               # encoding: [0xeb,0x00,0x0f,0xff,0x7f,0x51]
+#CHECK: tmy	0, 255                  # encoding: [0xeb,0xff,0x00,0x00,0x00,0x51]
+#CHECK: tmy	0(%r1), 42              # encoding: [0xeb,0x2a,0x10,0x00,0x00,0x51]
+#CHECK: tmy	0(%r15), 42             # encoding: [0xeb,0x2a,0xf0,0x00,0x00,0x51]
+#CHECK: tmy	524287(%r1), 42         # encoding: [0xeb,0x2a,0x1f,0xff,0x7f,0x51]
+#CHECK: tmy	524287(%r15), 42        # encoding: [0xeb,0x2a,0xff,0xff,0x7f,0x51]
+
+	tmy	-524288, 0
+	tmy	-1, 0
+	tmy	0, 0
+	tmy	1, 0
+	tmy	524287, 0
+	tmy	0, 255
+	tmy	0(%r1), 42
+	tmy	0(%r15), 42
+	tmy	524287(%r1), 42
+	tmy	524287(%r15), 42
+
 #CHECK: x	%r0, 0                  # encoding: [0x57,0x00,0x00,0x00]
 #CHECK: x	%r0, 4095               # encoding: [0x57,0x00,0x0f,0xff]
 #CHECK: x	%r0, 0(%r1)             # encoding: [0x57,0x00,0x10,0x00]
@@ -7117,6 +8435,32 @@
 	x	%r0, 4095(%r15,%r1)
 	x	%r15, 0
 
+#CHECK: xc	0(1), 0                 # encoding: [0xd7,0x00,0x00,0x00,0x00,0x00]
+#CHECK: xc	0(1), 0(%r1)            # encoding: [0xd7,0x00,0x00,0x00,0x10,0x00]
+#CHECK: xc	0(1), 0(%r15)           # encoding: [0xd7,0x00,0x00,0x00,0xf0,0x00]
+#CHECK: xc	0(1), 4095              # encoding: [0xd7,0x00,0x00,0x00,0x0f,0xff]
+#CHECK: xc	0(1), 4095(%r1)         # encoding: [0xd7,0x00,0x00,0x00,0x1f,0xff]
+#CHECK: xc	0(1), 4095(%r15)        # encoding: [0xd7,0x00,0x00,0x00,0xff,0xff]
+#CHECK: xc	0(1,%r1), 0             # encoding: [0xd7,0x00,0x10,0x00,0x00,0x00]
+#CHECK: xc	0(1,%r15), 0            # encoding: [0xd7,0x00,0xf0,0x00,0x00,0x00]
+#CHECK: xc	4095(1,%r1), 0          # encoding: [0xd7,0x00,0x1f,0xff,0x00,0x00]
+#CHECK: xc	4095(1,%r15), 0         # encoding: [0xd7,0x00,0xff,0xff,0x00,0x00]
+#CHECK: xc	0(256,%r1), 0           # encoding: [0xd7,0xff,0x10,0x00,0x00,0x00]
+#CHECK: xc	0(256,%r15), 0          # encoding: [0xd7,0xff,0xf0,0x00,0x00,0x00]
+
+	xc	0(1), 0
+	xc	0(1), 0(%r1)
+	xc	0(1), 0(%r15)
+	xc	0(1), 4095
+	xc	0(1), 4095(%r1)
+	xc	0(1), 4095(%r15)
+	xc	0(1,%r1), 0
+	xc	0(1,%r15), 0
+	xc	4095(1,%r1), 0
+	xc	4095(1,%r15), 0
+	xc	0(256,%r1), 0
+	xc	0(256,%r15), 0
+
 #CHECK: xg	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x82]
 #CHECK: xg	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x82]
 #CHECK: xg	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x82]
diff --git a/test/MC/SystemZ/lit.local.cfg b/test/MC/SystemZ/lit.local.cfg
index abb6974..b12af09 100644
--- a/test/MC/SystemZ/lit.local.cfg
+++ b/test/MC/SystemZ/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'SystemZ' in targets:
     config.unsupported = True
diff --git a/test/MC/X86/AlignedBundling/lit.local.cfg b/test/MC/X86/AlignedBundling/lit.local.cfg
index 6c49f08..ba763cf 100644
--- a/test/MC/X86/AlignedBundling/lit.local.cfg
+++ b/test/MC/X86/AlignedBundling/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/MC/X86/avx512-encodings.s b/test/MC/X86/avx512-encodings.s
index 26a77c1..38f9190 100644
--- a/test/MC/X86/avx512-encodings.s
+++ b/test/MC/X86/avx512-encodings.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=knl --show-encoding %s | FileCheck %s
 
 // CHECK: vinserti32x4
 // CHECK: encoding: [0x62,0xa3,0x55,0x48,0x38,0xcd,0x01]
@@ -19,3 +19,27 @@ vextracti64x4  $1, %zmm9, %ymm17
 // CHECK: vextracti64x4
 // CHECK: encoding: [0x62,0x73,0xfd,0x48,0x3b,0x4f,0x10,0x01]
 vextracti64x4  $1, %zmm9, 512(%rdi)
+
+// CHECK: vpsrad
+// CHECK: encoding: [0x62,0xb1,0x35,0x40,0x72,0xe1,0x02]
+vpsrad $2, %zmm17, %zmm25
+
+// CHECK: vpsrad
+// CHECK: encoding: [0x62,0xf1,0x35,0x40,0x72,0x64,0xb7,0x08,0x02]
+vpsrad $2, 512(%rdi, %rsi, 4), %zmm25
+
+// CHECK: vpsrad
+// CHECK: encoding: [0x62,0x21,0x1d,0x48,0xe2,0xc9]
+vpsrad %xmm17, %zmm12, %zmm25
+
+// CHECK: vpsrad
+// CHECK: encoding: [0x62,0x61,0x1d,0x48,0xe2,0x4c,0xb7,0x20]
+vpsrad 512(%rdi, %rsi, 4), %zmm12, %zmm25
+
+// CHECK: vpbroadcastd {{.*}} {%k1} {z}
+// CHECK: encoding: [0x62,0xf2,0x7d,0xc9,0x58,0xc8]
+vpbroadcastd  %xmm0, %zmm1 {%k1} {z}
+
+// CHECK: vmovdqu64 {{.*}} {%k3}
+// CHECK: encoding: [0x62,0xf1,0xfe,0x4b,0x6f,0xc8]
+vmovdqu64 %zmm0, %zmm1 {%k3}
diff --git a/test/MC/X86/cfi_def_cfa-crash.s b/test/MC/X86/cfi_def_cfa-crash.s
new file mode 100644
index 0000000..9d22d6e
--- /dev/null
+++ b/test/MC/X86/cfi_def_cfa-crash.s
@@ -0,0 +1,73 @@
+// RUN: llvm-mc -triple x86_64-apple-darwin -filetype=obj %s -o - | macho-dump | FileCheck %s
+
+// We were trying to generate compact unwind info for assembly like this.
+// The .cfi_def_cfa directive, however, throws a wrench into that and was
+// causing an llvm_unreachable() failure. Make sure the assembler can handle
+// the input. The actual eh_frames created using these directives are checked
+// elsewhere. This test is a simpler "does the code assemble" check.
+
+// rdar://15406518
+
+.macro SaveRegisters
+
+ push %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset rbp, -16
+
+ mov %rsp, %rbp
+ .cfi_def_cfa_register rbp
+
+ sub $$0x80+8, %rsp
+
+ movdqa %xmm0, -0x80(%rbp)
+ push %rax
+ movdqa %xmm1, -0x70(%rbp)
+ push %rdi
+ movdqa %xmm2, -0x60(%rbp)
+ push %rsi
+ movdqa %xmm3, -0x50(%rbp)
+ push %rdx
+ movdqa %xmm4, -0x40(%rbp)
+ push %rcx
+ movdqa %xmm5, -0x30(%rbp)
+ push %r8
+ movdqa %xmm6, -0x20(%rbp)
+ push %r9
+ movdqa %xmm7, -0x10(%rbp)
+
+.endmacro
+.macro RestoreRegisters
+
+ movdqa -0x80(%rbp), %xmm0
+ pop %r9
+ movdqa -0x70(%rbp), %xmm1
+ pop %r8
+ movdqa -0x60(%rbp), %xmm2
+ pop %rcx
+ movdqa -0x50(%rbp), %xmm3
+ pop %rdx
+ movdqa -0x40(%rbp), %xmm4
+ pop %rsi
+ movdqa -0x30(%rbp), %xmm5
+ pop %rdi
+ movdqa -0x20(%rbp), %xmm6
+ pop %rax
+ movdqa -0x10(%rbp), %xmm7
+
+ leave
+ .cfi_def_cfa rsp, 8
+ .cfi_same_value rbp
+
+.endmacro
+
+_foo:
+.cfi_startproc
+  SaveRegisters
+
+  RestoreRegisters
+  ret
+ .cfi_endproc
+
+
+
+// CHECK: 'section_name', '__eh_frame\x00
diff --git a/test/MC/X86/intel-syntax.s b/test/MC/X86/intel-syntax.s
index ff86e8d..9677da7 100644
--- a/test/MC/X86/intel-syntax.s
+++ b/test/MC/X86/intel-syntax.s
@@ -63,6 +63,14 @@ _main:
     mov ECX, DWORD PTR [4*ECX + _fnan]
 // CHECK: movq %fs:320, %rax
     mov RAX, QWORD PTR FS:[320]
+// CHECK: movq %fs:320, %rax
+    mov RAX, QWORD PTR FS:320
+// CHECK: movq %rax, %fs:320
+    mov QWORD PTR FS:320, RAX
+// CHECK: movq %rax, %fs:20(%rbx)
+    mov QWORD PTR FS:20[rbx], RAX
+// CHECK: vshufpd $1, %xmm2, %xmm1, %xmm0
+    vshufpd XMM0, XMM1, XMM2, 1
 // CHECK: vpgatherdd %xmm8, (%r15,%xmm9,2), %xmm1
     vpgatherdd XMM10, DWORD PTR [R15 + 2*XMM9], XMM8
 // CHECK: movsd	-8, %xmm5
diff --git a/test/MC/X86/lit.local.cfg b/test/MC/X86/lit.local.cfg
index ad280c7..19840aa 100644
--- a/test/MC/X86/lit.local.cfg
+++ b/test/MC/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s
index c0eac5e..6b41f48 100644
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@@ -549,8 +549,8 @@ cvttpd2dq	0xdeadbeef(%ebx,%ecx,8),%xmm5
 
 // rdar://8490728 - llvm-mc rejects 'movmskpd'
 movmskpd	%xmm6, %rax
-// CHECK: movmskpd	%xmm6, %rax
-// CHECK: encoding: [0x66,0x48,0x0f,0x50,0xc6]
+// CHECK: movmskpd	%xmm6, %eax
+// CHECK: encoding: [0x66,0x0f,0x50,0xc6]
 movmskpd	%xmm6, %eax
 // CHECK: movmskpd	%xmm6, %eax
 // CHECK: encoding: [0x66,0x0f,0x50,0xc6]
@@ -1375,3 +1375,16 @@ fsub %st(1)
 fsubr %st(1)
 fdiv %st(1)
 fdivr %st(1)
+
+// CHECK: movd %xmm0, %eax
+// CHECK: movd %xmm0, %rax
+// CHECK: movd %xmm0, %rax
+// CHECK: vmovd %xmm0, %eax
+// CHECK: vmovq %xmm0, %rax
+// CHECK: vmovq %xmm0, %rax
+movd %xmm0, %eax
+movd %xmm0, %rax
+movq %xmm0, %rax
+vmovd %xmm0, %eax
+vmovd %xmm0, %rax
+vmovq %xmm0, %rax
diff --git a/test/MC/X86/x86_64-avx-encoding.s b/test/MC/X86/x86_64-avx-encoding.s
index 6da9e21..5ba8064 100644
--- a/test/MC/X86/x86_64-avx-encoding.s
+++ b/test/MC/X86/x86_64-avx-encoding.s
@@ -2212,11 +2212,11 @@ vdivpd  -4(%rcx,%rbx,8), %xmm10, %xmm11
 // CHECK: encoding: [0xc5,0x79,0x7e,0x30]
           vmovd  %xmm14, (%rax)
 
-// CHECK: vmovd  %rax, %xmm14
+// CHECK: vmovq  %rax, %xmm14
 // CHECK: encoding: [0xc4,0x61,0xf9,0x6e,0xf0]
           vmovd  %rax, %xmm14
 
-// CHECK: vmovd %xmm0, %rax
+// CHECK: vmovq %xmm0, %rax
 // CHECK: encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
           vmovd %xmm0, %rax
 
@@ -4044,43 +4044,43 @@ vdivpd  -4(%rcx,%rbx,8), %xmm10, %xmm11
 // CHECK: encoding: [0xc4,0x43,0x79,0x17,0xc0,0x0a]
           vextractps   $10, %xmm8, %r8
 
-// CHECK: vextractps   $7, %xmm4, %rcx
+// CHECK: vextractps   $7, %xmm4, %ecx
 // CHECK: encoding: [0xc4,0xe3,0x79,0x17,0xe1,0x07]
           vextractps   $7, %xmm4, %rcx
 
-// CHECK: vmovd  %xmm4, %rcx
+// CHECK: vmovq  %xmm4, %rcx
 // CHECK: encoding: [0xc4,0xe1,0xf9,0x7e,0xe1]
           vmovd  %xmm4, %rcx
 
-// CHECK: vmovmskpd  %xmm4, %rcx
+// CHECK: vmovmskpd  %xmm4, %ecx
 // CHECK: encoding: [0xc5,0xf9,0x50,0xcc]
           vmovmskpd  %xmm4, %rcx
 
-// CHECK: vmovmskpd  %ymm4, %rcx
+// CHECK: vmovmskpd  %ymm4, %ecx
 // CHECK: encoding: [0xc5,0xfd,0x50,0xcc]
           vmovmskpd  %ymm4, %rcx
 
-// CHECK: vmovmskps  %xmm4, %rcx
+// CHECK: vmovmskps  %xmm4, %ecx
 // CHECK: encoding: [0xc5,0xf8,0x50,0xcc]
           vmovmskps  %xmm4, %rcx
 
-// CHECK: vmovmskps  %ymm4, %rcx
+// CHECK: vmovmskps  %ymm4, %ecx
 // CHECK: encoding: [0xc5,0xfc,0x50,0xcc]
           vmovmskps  %ymm4, %rcx
 
-// CHECK: vpextrb  $7, %xmm4, %rcx
+// CHECK: vpextrb  $7, %xmm4, %ecx
 // CHECK: encoding: [0xc4,0xe3,0x79,0x14,0xe1,0x07]
           vpextrb  $7, %xmm4, %rcx
 
-// CHECK: vpinsrw  $7, %r8, %xmm15, %xmm8
+// CHECK: vpinsrw  $7, %r8d, %xmm15, %xmm8
 // CHECK: encoding: [0xc4,0x41,0x01,0xc4,0xc0,0x07]
           vpinsrw  $7, %r8, %xmm15, %xmm8
 
-// CHECK: vpinsrw  $7, %rcx, %xmm4, %xmm6
+// CHECK: vpinsrw  $7, %ecx, %xmm4, %xmm6
 // CHECK: encoding: [0xc5,0xd9,0xc4,0xf1,0x07]
           vpinsrw  $7, %rcx, %xmm4, %xmm6
 
-// CHECK: vpmovmskb  %xmm4, %rcx
+// CHECK: vpmovmskb  %xmm4, %ecx
 // CHECK: encoding: [0xc5,0xf9,0xd7,0xcc]
           vpmovmskb  %xmm4, %rcx
 
@@ -4185,3 +4185,59 @@ _foo2:
 // CHECK: vpgatherqd %xmm8, (%r15,%ymm9,2), %xmm10
 // CHECK: encoding: [0xc4,0x02,0x3d,0x91,0x14,0x4f]
           vpgatherqd %xmm8, (%r15,%ymm9,2), %xmm10
+
+// CHECK: vmovaps %xmm0, %xmm8
+// CHECK: encoding: [0xc5,0x78,0x28,0xc0]
+          vmovaps %xmm0, %xmm8
+
+// CHECK: vmovaps %xmm8, %xmm0
+// CHECK: encoding: [0xc5,0x78,0x29,0xc0]
+          vmovaps %xmm8, %xmm0
+
+// CHECK: vmovaps %ymm0, %ymm8
+// CHECK: encoding: [0xc5,0x7c,0x28,0xc0]
+          vmovaps %ymm0, %ymm8
+
+// CHECK: vmovaps %ymm8, %ymm0
+// CHECK: encoding: [0xc5,0x7c,0x29,0xc0]
+          vmovaps %ymm8, %ymm0
+
+// CHECK: vmovups %xmm0, %xmm8
+// CHECK: encoding: [0xc5,0x78,0x10,0xc0]
+          vmovups %xmm0, %xmm8
+
+// CHECK: vmovups %xmm8, %xmm0
+// CHECK: encoding: [0xc5,0x78,0x11,0xc0]
+          vmovups %xmm8, %xmm0
+
+// CHECK: vmovups %ymm0, %ymm8
+// CHECK: encoding: [0xc5,0x7c,0x10,0xc0]
+          vmovups %ymm0, %ymm8
+
+// CHECK: vmovups %ymm8, %ymm0
+// CHECK: encoding: [0xc5,0x7c,0x11,0xc0]
+          vmovups %ymm8, %ymm0
+
+// CHECK: vmovss %xmm0, %xmm0, %xmm8
+// CHECK: encoding: [0xc5,0x7a,0x10,0xc0]
+          vmovss %xmm0, %xmm0, %xmm8
+
+// CHECK: vmovss %xmm0, %xmm8, %xmm0
+// CHECK: encoding: [0xc5,0xba,0x10,0xc0]
+          vmovss %xmm0, %xmm8, %xmm0
+
+// CHECK: vmovss %xmm8, %xmm0, %xmm0
+// CHECK: encoding: [0xc5,0x7a,0x11,0xc0]
+          vmovss %xmm8, %xmm0, %xmm0
+
+// CHECK: vmovsd %xmm0, %xmm0, %xmm8
+// CHECK: encoding: [0xc5,0x7b,0x10,0xc0]
+          vmovsd %xmm0, %xmm0, %xmm8
+
+// CHECK: vmovsd %xmm0, %xmm8, %xmm0
+// CHECK: encoding: [0xc5,0xbb,0x10,0xc0]
+          vmovsd %xmm0, %xmm8, %xmm0
+
+// CHECK: vmovsd %xmm8, %xmm0, %xmm0
+// CHECK: encoding: [0xc5,0x7b,0x11,0xc0]
+          vmovsd %xmm8, %xmm0, %xmm0
diff --git a/test/MC/X86/x86_64-encoding.s b/test/MC/X86/x86_64-encoding.s
index cfdf87f..40b93f0 100644
--- a/test/MC/X86/x86_64-encoding.s
+++ b/test/MC/X86/x86_64-encoding.s
@@ -120,6 +120,66 @@ movd %mm1, %edx
 // CHECK:  fixup A - offset: 5, value: CPI1_0-4
 pshufb	CPI1_0(%rip), %xmm1
 
+// CHECK: sha1rnds4 $1, %xmm1, %xmm2
+// CHECK:   encoding: [0x0f,0x3a,0xcc,0xd1,0x01]
+sha1rnds4 $1, %xmm1, %xmm2
+
+// CHECK: sha1rnds4 $1, (%rax), %xmm2
+// CHECK:   encoding: [0x0f,0x3a,0xcc,0x10,0x01]
+sha1rnds4 $1, (%rax), %xmm2
+
+// CHECK: sha1nexte %xmm1, %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xc8,0xd1]
+sha1nexte %xmm1, %xmm2
+
+// CHECK: sha1msg1 %xmm1, %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xc9,0xd1]
+sha1msg1 %xmm1, %xmm2
+
+// CHECK: sha1msg1 (%rax), %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xc9,0x10]
+sha1msg1 (%rax), %xmm2
+
+// CHECK: sha1msg2 %xmm1, %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xca,0xd1]
+sha1msg2 %xmm1, %xmm2
+
+// CHECK: sha1msg2 (%rax), %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xca,0x10]
+sha1msg2 (%rax), %xmm2
+
+// CHECK: sha256rnds2 (%rax), %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xcb,0x10]
+sha256rnds2 (%rax), %xmm2
+
+// CHECK: sha256rnds2 %xmm1, %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xcb,0xd1]
+sha256rnds2 %xmm1, %xmm2
+
+// CHECK: sha256rnds2 (%rax), %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xcb,0x10]
+sha256rnds2 %xmm0, (%rax), %xmm2
+
+// CHECK: sha256rnds2 %xmm1, %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xcb,0xd1]
+sha256rnds2 %xmm0, %xmm1, %xmm2
+
+// CHECK: sha256msg1 %xmm1, %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xcc,0xd1]
+sha256msg1 %xmm1, %xmm2
+
+// CHECK: sha256msg1 (%rax), %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xcc,0x10]
+sha256msg1 (%rax), %xmm2
+
+// CHECK: sha256msg2 %xmm1, %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xcd,0xd1]
+sha256msg2 %xmm1, %xmm2
+
+// CHECK: sha256msg2 (%rax), %xmm2
+// CHECK:   encoding: [0x0f,0x38,0xcd,0x10]
+sha256msg2 (%rax), %xmm2
+
 // CHECK: movq  57005(,%riz), %rbx
 // CHECK: encoding: [0x48,0x8b,0x1c,0x25,0xad,0xde,0x00,0x00]
           movq  57005(,%riz), %rbx
@@ -171,3 +231,15 @@ pshufb	CPI1_0(%rip), %xmm1
 // CHECK: filds	(%rdi)
 // CHECK:  encoding: [0xdf,0x07]
         	filds	(%rdi)
+
+// CHECK: pmovmskb	%xmm5, %ecx
+// CHECK:  encoding: [0x66,0x0f,0xd7,0xcd]
+        	pmovmskb	%xmm5,%rcx
+
+// CHECK: pinsrw $3, %ecx, %xmm5
+// CHECK: encoding: [0x66,0x0f,0xc4,0xe9,0x03]
+          pinsrw $3, %ecx, %xmm5
+
+// CHECK: pinsrw $3, %ecx, %xmm5
+// CHECK: encoding: [0x66,0x0f,0xc4,0xe9,0x03]
+          pinsrw $3, %rcx, %xmm5
diff --git a/test/MC/X86/x86_64-tbm-encoding.s b/test/MC/X86/x86_64-tbm-encoding.s
new file mode 100644
index 0000000..180578b
--- /dev/null
+++ b/test/MC/X86/x86_64-tbm-encoding.s
@@ -0,0 +1,196 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// bextri 32 reg
+// CHECK: bextr   $2814, %edi, %eax
+// CHECK: encoding: [0x8f,0xea,0x78,0x10,0xc7,0xfe,0x0a,0x00,0x00]
+          bextr   $2814, %edi, %eax
+
+// bextri 32 mem
+// CHECK: bextr   $2814, (%rdi), %eax
+// CHECK: encoding: [0x8f,0xea,0x78,0x10,0x07,0xfe,0x0a,0x00,0x00]
+          bextr   $2814, (%rdi), %eax
+
+// bextri 64 reg
+// CHECK: bextr   $2814, %rdi, %rax
+// CHECK: encoding: [0x8f,0xea,0xf8,0x10,0xc7,0xfe,0x0a,0x00,0x00]
+          bextr   $2814, %rdi, %rax
+
+// bextri 64 mem
+// CHECK: bextr   $2814, (%rdi), %rax
+// CHECK: encoding: [0x8f,0xea,0xf8,0x10,0x07,0xfe,0x0a,0x00,0x00]
+          bextr   $2814, (%rdi), %rax
+
+// blcfill 32 reg
+// CHECK: blcfill %edi, %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0xcf]
+          blcfill %edi, %eax
+
+// blcfill 32 mem
+// CHECK: blcfill (%rdi), %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0x0f]
+          blcfill (%rdi), %eax
+
+// blcfill 64 reg
+// CHECK: blcfill %rdi, %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0xcf]
+          blcfill %rdi, %rax
+
+// blcfill 64 mem
+// CHECK: blcfill (%rdi), %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0x0f]
+          blcfill (%rdi), %rax
+
+// blci   32 reg
+// CHECK: blci    %edi, %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x02,0xf7]
+          blci    %edi, %eax
+
+// blci   32 mem
+// CHECK: blci    (%rdi), %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x02,0x37]
+          blci    (%rdi), %eax
+
+// blci   64 reg
+// CHECK: blci    %rdi, %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x02,0xf7]
+          blci    %rdi, %rax
+
+// blci   64 mem
+// CHECK: blci    (%rdi), %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x02,0x37]
+          blci    (%rdi), %rax
+
+// blcic  32 reg
+// CHECK: blcic   %edi, %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0xef]
+          blcic   %edi, %eax
+
+// blcic  32 mem
+// CHECK: blcic   (%rdi), %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0x2f]
+          blcic   (%rdi), %eax
+
+// blcic  64 reg
+// CHECK: blcic   %rdi, %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0xef]
+          blcic   %rdi, %rax
+
+// blcic  64 mem
+// CHECK: blcic   (%rdi), %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0x2f]
+          blcic   (%rdi), %rax
+
+// blcmsk 32 reg
+// CHECK: blcmsk  %edi, %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x02,0xcf]
+          blcmsk  %edi, %eax
+
+// blcmsk 32 mem
+// CHECK: blcmsk  (%rdi), %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x02,0x0f]
+          blcmsk  (%rdi), %eax
+
+// blcmsk 64 reg
+// CHECK: blcmsk  %rdi, %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x02,0xcf]
+          blcmsk  %rdi, %rax
+
+// blcmsk 64 mem
+// CHECK: blcmsk  (%rdi), %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x02,0x0f]
+          blcmsk  (%rdi), %rax
+
+// blcs   32 reg
+// CHECK: blcs    %edi, %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0xdf]
+          blcs    %edi, %eax
+
+// blcs   32 mem
+// CHECK: blcs    (%rdi), %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0x1f]
+          blcs    (%rdi), %eax
+
+// blcs   64 reg
+// CHECK: blcs    %rdi, %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0xdf]
+          blcs    %rdi, %rax
+
+// blcs   64 mem
+// CHECK: blcs    (%rdi), %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0x1f]
+          blcs    (%rdi), %rax
+
+// blsfill 32 reg
+// CHECK: blsfill %edi, %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0xd7]
+          blsfill %edi, %eax
+
+// blsfill 32 mem
+// CHECK: blsfill (%rdi), %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0x17]
+          blsfill (%rdi), %eax
+
+// blsfill 64 reg
+// CHECK: blsfill %rdi, %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0xd7]
+          blsfill %rdi, %rax
+
+// blsfill 64 mem
+// CHECK: blsfill (%rdi), %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0x17]
+          blsfill (%rdi), %rax
+
+// blsic  32 reg
+// CHECK: blsic   %edi, %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0xf7]
+          blsic   %edi, %eax
+
+// blsic  32 mem
+// CHECK: blsic   (%rdi), %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0x37]
+          blsic   (%rdi), %eax
+
+// blsic  64 reg
+// CHECK: blsic   %rdi, %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0xf7]
+          blsic   %rdi, %rax
+
+// t1mskc 32 reg
+// CHECK: t1mskc  %edi, %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0xff]
+          t1mskc  %edi, %eax
+
+// t1mskc 32 mem
+// CHECK: t1mskc  (%rdi), %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0x3f]
+          t1mskc  (%rdi), %eax
+
+// t1mskc 64 reg
+// CHECK: t1mskc  %rdi, %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0xff]
+          t1mskc  %rdi, %rax
+
+// t1mskc 64 mem
+// CHECK: t1mskc  (%rdi), %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0x3f]
+          t1mskc  (%rdi), %rax
+
+// tzmsk  32 reg
+// CHECK: tzmsk   %edi, %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0xe7]
+          tzmsk   %edi, %eax
+
+// tzmsk  32 mem
+// CHECK: tzmsk   (%rdi), %eax
+// CHECK: encoding: [0x8f,0xe9,0x78,0x01,0x27]
+          tzmsk   (%rdi), %eax
+
+// tzmsk  64 reg
+// CHECK: tzmsk   %rdi, %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0xe7]
+          tzmsk   %rdi, %rax
+
+// tzmsk  64 mem
+// CHECK: tzmsk   (%rdi), %rax
+// CHECK: encoding: [0x8f,0xe9,0xf8,0x01,0x27]
+          tzmsk   (%rdi), %rax
diff --git a/test/MC/X86/x86_nop.s b/test/MC/X86/x86_nop.s
index 396e302..059f591 100644
--- a/test/MC/X86/x86_nop.s
+++ b/test/MC/X86/x86_nop.s
@@ -1,13 +1,36 @@
-# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=generic %s | llvm-objdump -d - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i386 %s | llvm-objdump -d - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i486 %s | llvm-objdump -d - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i586 %s | llvm-objdump -d - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=pentium %s | llvm-objdump -d - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=pentium-mmx %s | llvm-objdump -d - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=geode %s | llvm-objdump -d - | FileCheck %s
-# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i686 %s | llvm-objdump -d - | not FileCheck %s
-
-# CHECK-NOT: nop{{[lw]}}
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=generic %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=i386 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=i486 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=i586 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=pentium %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=pentium-mmx %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=geode %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=i686 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=k6 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=k6-2 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=k6-3 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=winchip-c6 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=winchip2 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=c3 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=c3-2 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=i686-pc-linux -mcpu=core2 %s | llvm-objdump -d - | FileCheck --check-prefix=NOPL %s
+
+
 inc %eax
 .align 8
 inc %eax
+
+// CHECK: 0:	40                                           	incl	%eax
+// CHECK: 1:	90                                           	nop
+// CHECK: 2:	90                                           	nop
+// CHECK: 3:	90                                           	nop
+// CHECK: 4:	90                                           	nop
+// CHECK: 5:	90                                           	nop
+// CHECK: 6:	90                                           	nop
+// CHECK: 7:	90                                           	nop
+// CHECK: 8:	40                                           	incl	%eax
+
+
+// NOPL: 0:	40                                           	incl	%eax
+// NOPL: 1:	0f 1f 80 00 00 00 00                         	nopl	(%eax)
+// NOPL: 8:	40                                           	incl	%eax
diff --git a/test/Makefile b/test/Makefile
index 7204147..d3227dd 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -31,16 +31,10 @@ endif
 
 ifdef TESTSUITE
 LIT_TESTSUITE := $(TESTSUITE)
-CLEANED_TESTSUITE := $(patsubst %/,%,$(TESTSUITE))
-CLEANED_TESTSUITE := $(patsubst test/%,%,$(CLEANED_TESTSUITE))
 else
 LIT_TESTSUITE := .
 endif
 
-ifdef VG
-VALGRIND := valgrind --tool=memcheck --quiet --trace-children=yes --error-exitcode=3 --leak-check=full $(VALGRIND_EXTRA_ARGS)
-endif
-
 # Check what to run for -all.
 LIT_ALL_TESTSUITES := $(LIT_TESTSUITE)
 
@@ -122,16 +116,6 @@ else
 ENABLE_ASSERTIONS=1
 endif
 
-# Derive whether or not LTO is enabled by checking the extra options.
-LTO_IS_ENABLED := 0
-ifneq ($(findstring -flto,$(CompileCommonOpts)),)
-LTO_IS_ENABLED := 1
-else
-ifneq ($(findstring -O4,$(CompileCommonOpts)),)
-LTO_IS_ENABLED := 1
-endif
-endif
-
 lit.site.cfg: FORCE
 	@echo "Making LLVM 'lit.site.cfg' file..."
 	@$(ECHOPATH) s=@LLVM_HOST_TRIPLE@=$(HOST_TRIPLE)=g > lit.tmp
@@ -142,10 +126,9 @@ lit.site.cfg: FORCE
 	@$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> lit.tmp
 	@$(ECHOPATH) s=@SHLIBEXT@=$(SHLIBEXT)=g >> lit.tmp
 	@$(ECHOPATH) s=@PYTHON_EXECUTABLE@=$(PYTHON)=g >> lit.tmp
-	@$(ECHOPATH) s=@OCAMLOPT@=$(OCAMLOPT) -cc $(subst *,'\\\"',*$(subst =,"\\=",$(CXX_FOR_OCAMLOPT))*) -I $(LibDir)/ocaml=g >> lit.tmp
+	@$(ECHOPATH) s=@OCAMLOPT@=$(OCAMLOPT) -cc $(subst *,'\\\"',*$(subst =,"\\=",$(CXX_FOR_OCAMLOPT))*) -cclib -L$(LibDir) -I $(LibDir)/ocaml=g >> lit.tmp
 	@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp
 	@$(ECHOPATH) s=@ENABLE_ASSERTIONS@=$(ENABLE_ASSERTIONS)=g >> lit.tmp
-	@$(ECHOPATH) s=@LTO_IS_ENABLED@=$(LTO_IS_ENABLED)=g >> lit.tmp
 	@$(ECHOPATH) s=@TARGETS_TO_BUILD@=$(TARGETS_TO_BUILD)=g >> lit.tmp
 	@$(ECHOPATH) s=@LLVM_BINDINGS@=$(BINDINGS_TO_BUILD)=g >> lit.tmp
 	@$(ECHOPATH) s=@HOST_OS@=$(HOST_OS)=g >> lit.tmp
diff --git a/test/Object/Inputs/corrupt-version.elf-x86_64 b/test/Object/Inputs/corrupt-version.elf-x86_64
new file mode 100644
index 0000000..1241a27
--- /dev/null
+++ b/test/Object/Inputs/corrupt-version.elf-x86_64
diff --git a/test/Object/Inputs/corrupt.elf-x86-64 b/test/Object/Inputs/corrupt.elf-x86-64
new file mode 100644
index 0000000..8ae5f17
--- /dev/null
+++ b/test/Object/Inputs/corrupt.elf-x86-64
diff --git a/test/Object/Inputs/program-headers.mips b/test/Object/Inputs/program-headers.mips
new file mode 100755
index 0000000..54ebfea
--- /dev/null
+++ b/test/Object/Inputs/program-headers.mips
diff --git a/test/Object/Inputs/trivial-executable-test.macho-x86-64 b/test/Object/Inputs/trivial-executable-test.macho-x86-64
new file mode 100755
index 0000000..50a6bab
--- /dev/null
+++ b/test/Object/Inputs/trivial-executable-test.macho-x86-64
diff --git a/test/Object/Inputs/weak-global-symbol.macho-i386 b/test/Object/Inputs/weak-global-symbol.macho-i386
new file mode 100644
index 0000000..a9c8e0c
--- /dev/null
+++ b/test/Object/Inputs/weak-global-symbol.macho-i386
diff --git a/test/Object/Mips/lit.local.cfg b/test/Object/Mips/lit.local.cfg
index 1499317..88262fb 100644
--- a/test/Object/Mips/lit.local.cfg
+++ b/test/Object/Mips/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.test']
-
 targets = set(config.root.targets_to_build.split())
 if not 'Mips' in targets:
     config.unsupported = True
diff --git a/test/Object/X86/lit.local.cfg b/test/Object/X86/lit.local.cfg
index 6a29e92..ba763cf 100644
--- a/test/Object/X86/lit.local.cfg
+++ b/test/Object/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.test']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Object/X86/objdump-cfg-invalid-opcode.yaml b/test/Object/X86/objdump-cfg-invalid-opcode.yaml
new file mode 100644
index 0000000..56ab1d2
--- /dev/null
+++ b/test/Object/X86/objdump-cfg-invalid-opcode.yaml
@@ -0,0 +1,58 @@
+# RUN: yaml2obj -format=elf %s | llvm-objdump -d -yaml-cfg=%t - && FileCheck --check-prefix=CFG < %t %s
+# REQUIRES: shell
+#
+# Generated from:
+# main:
+# .LBL0_1:
+# 	movq	8(%rsi), %rax
+# 	<invalid opcode: 06>
+# 	nop
+
+!ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name: .text
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Content: "488B46080690"
+
+## 0000000000000000 <main>:
+
+#CFG: Atoms:
+#CFG:   - StartAddress:    0x0000000000000000
+#CFG:     Size:            4
+#CFG:     Type:            Text
+
+##    0:   48 8b 46 08             mov    0x8(%rsi),%rax
+#CFG:       - Inst:            MOV64rm
+#CFG:         Size:            4
+#CFG:         Ops:             [ RRAX, RRSI, I1, R, I8, R ]
+
+
+#CFG:   - StartAddress:    0x0000000000000004
+#CFG:     Size:            1
+#CFG:     Type:            Data
+
+##    4:   06                      (bad)
+#CFG:     Content:         06
+
+#CFG:   - StartAddress:    0x0000000000000005
+#CFG:     Size:            1
+#CFG:     Type:            Text
+
+##    5:   90                      nop
+#CFG:       - Inst:            NOOP
+#CFG:         Size:            1
+#CFG:         Ops:             [  ]
+
+Symbols:
+  Global:
+    - Name: main
+      Type: STT_FUNC
+      Section: .text
+      Value: 0x0
+      Size: 6
diff --git a/test/Object/X86/objdump-cfg-textatomsize.yaml b/test/Object/X86/objdump-cfg-textatomsize.yaml
new file mode 100644
index 0000000..87cb4e1
--- /dev/null
+++ b/test/Object/X86/objdump-cfg-textatomsize.yaml
@@ -0,0 +1,39 @@
+# RUN: yaml2obj -format=elf %s | llvm-objdump -d -yaml-cfg=%t - && FileCheck --check-prefix=CFG < %t %s
+# REQUIRES: shell
+#
+# Generated from:
+# main:
+# .LBL0_1:
+# 	jmp	.LBL0_1
+#
+
+!ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name: .text
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Content: "EBFE"
+
+## 0000000000000000 <main>:
+
+#CFG: Atoms:
+#CFG:   - StartAddress:    0x0000000000000000
+#CFG:     Size:            2
+
+##    0:   eb fe          jmp $-2
+#CFG:       - Inst:            JMP_1
+#CFG:         Size:            2
+#CFG:         Ops:             [ I-2 ]
+
+Symbols:
+  Global:
+    - Name: main
+      Type: STT_FUNC
+      Section: .text
+      Value: 0x0
+      Size: 2
diff --git a/test/Object/X86/objdump-cfg.yaml b/test/Object/X86/objdump-cfg.yaml
new file mode 100644
index 0000000..c5bff03
--- /dev/null
+++ b/test/Object/X86/objdump-cfg.yaml
@@ -0,0 +1,86 @@
+# RUN: yaml2obj -format=elf %s | llvm-objdump -d -yaml-cfg=%t - && FileCheck --check-prefix=CFG < %t %s
+# REQUIRES: shell
+#
+# Generated from:
+# main:
+# 	movl	$48, %eax
+# 	cmpl	$3, %edi
+# 	jl	.LBB0_2
+# 	movq	8(%rsi), %rax
+# 	movsbl	(%rax), %eax
+# .LBB0_2:
+# 	ret
+#
+
+!ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name: .text
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+    Content: "B83000000083FF037C07488B46080FBE00C3"
+
+## 0000000000000000 <main>:
+
+#CFG: Atoms:
+#CFG:   - StartAddress:    0x0000000000000000
+#CFG:     Size:            10
+
+##    0:   b8 30 00 00 00          mov    $0x30,%eax
+#CFG:       - Inst:            MOV32ri
+#CFG:         Size:            5
+#CFG:         Ops:             [ REAX, I48 ]
+
+##    5:   83 ff 03                cmp    $0x3,%edi
+#CFG:       - Inst:            CMP32ri8
+#CFG:         Size:            3
+#CFG:         Ops:             [ REDI, I3 ]
+
+##    8:   7c 07                   jl     11 <main+0x11>
+#CFG:       - Inst:            JL_1
+#CFG:         Size:            2
+#CFG:         Ops:             [ I7 ]
+
+#CFG:   - StartAddress:    0x000000000000000A
+#CFG:     Size:            7
+
+##    a:   48 8b 46 08             mov    0x8(%rsi),%rax
+#CFG:       - Inst:            MOV64rm
+#CFG:         Size:            4
+#CFG:         Ops:             [ RRAX, RRSI, I1, R, I8, R ]
+
+##    e:   0f be 00                movsbl (%rax),%eax
+#CFG:       - Inst:            MOVSX32rm8
+#CFG:         Size:            3
+#CFG:         Ops:             [ REAX, RRAX, I1, R, I0, R ]
+#CFG:   - StartAddress:    0x0000000000000011
+#CFG:     Size:            1
+
+##   11:   c3                      retq
+#CFG:       - Inst:            RET
+#CFG:         Size:            1
+#CFG:         Ops:             [  ]
+
+Symbols:
+  Global:
+    - Name: main
+      Type: STT_FUNC
+      Section: .text
+      Value: 0x0
+      Size: 18
+
+#CFG: Functions:
+#CFG:     BasicBlocks:
+#CFG:       - Address:         0x0000000000000000
+#CFG:         Preds:           [  ]
+#CFG:         Succs:           [ 0x0000000000000011, 0x000000000000000A ]
+#CFG:       - Address:         0x0000000000000011
+#CFG:         Preds:           [ 0x0000000000000000, 0x000000000000000A ]
+#CFG:         Succs:           [  ]
+#CFG:       - Address:         0x000000000000000A
+#CFG:         Preds:           [ 0x0000000000000000 ]
+#CFG:         Succs:           [ 0x0000000000000011 ]
diff --git a/test/Object/X86/objdump-disassembly-symbolic.test b/test/Object/X86/objdump-disassembly-symbolic.test
index 667bce9..858653e 100644
--- a/test/Object/X86/objdump-disassembly-symbolic.test
+++ b/test/Object/X86/objdump-disassembly-symbolic.test
@@ -3,6 +3,11 @@ RUN:              | FileCheck %s -check-prefix ELF-x86-64
 RUN: llvm-objdump -d -symbolize %p/../Inputs/trivial-object-test.macho-x86-64 \
 RUN:              | FileCheck %s -check-prefix MACHO-x86-64
 
+# Generate this using:
+#   ld trivial-object-test.macho-x86-64 -undefined dynamic_lookup
+RUN: llvm-objdump -d -symbolize %p/../Inputs/trivial-executable-test.macho-x86-64 \
+RUN:              | FileCheck %s -check-prefix MACHO-STUBS-x86-64
+
 ELF-x86-64: file format ELF64-x86-64
 ELF-x86-64: Disassembly of section .text:
 ELF-x86-64: main:
@@ -28,3 +33,16 @@ MACHO-x86-64:       1a:	e8 00 00 00 00                               	callq	_Som
 MACHO-x86-64:       1f:	8b 44 24 04                                  	movl	4(%rsp), %eax
 MACHO-x86-64:       23:	48 83 c4 08                                  	addq	$8, %rsp
 MACHO-x86-64:       27:	c3                                           	ret
+
+MACHO-STUBS-x86-64: file format Mach-O 64-bit x86-64
+MACHO-STUBS-x86-64: Disassembly of section __TEXT,__text:
+MACHO-STUBS-x86-64: _main:
+MACHO-STUBS-x86-64:     1f90:       48 83 ec 08                                     subq    $8, %rsp
+MACHO-STUBS-x86-64:     1f94:       c7 44 24 04 00 00 00 00                         movl    $0, 4(%rsp)
+MACHO-STUBS-x86-64:     1f9c:       48 8d 3d 45 00 00 00                            leaq    69(%rip), %rdi ## literal pool for: Hello World!
+MACHO-STUBS-x86-64:     1fa3:       e8 16 00 00 00                                  callq   puts
+MACHO-STUBS-x86-64:     1fa8:       30 c0                                           xorb    %al, %al
+MACHO-STUBS-x86-64:     1faa:       e8 09 00 00 00                                  callq   SomeOtherFunction
+MACHO-STUBS-x86-64:     1faf:       8b 44 24 04                                     movl    4(%rsp), %eax
+MACHO-STUBS-x86-64:     1fb3:       48 83 c4 08                                     addq    $8, %rsp
+MACHO-STUBS-x86-64:     1fb7:       c3                                              ret
diff --git a/test/Object/archive-symtab.test b/test/Object/archive-symtab.test
index 0d2504d..6379504 100644
--- a/test/Object/archive-symtab.test
+++ b/test/Object/archive-symtab.test
@@ -48,3 +48,12 @@ CORRUPT-NEXT: 00000016 T main
 check that the we *don't* update the symbol table.
 RUN: llvm-ar s %t.a
 RUN: llvm-nm -s %t.a | FileCheck %s --check-prefix=CORRUPT
+
+repeate the test with llvm-ranlib
+
+RUN: rm -f %t.a
+RUN: llvm-ar rcS %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
+RUN: llvm-nm -s %t.a | FileCheck %s --check-prefix=NOMAP
+
+RUN: llvm-ranlib %t.a
+RUN: llvm-nm -s %t.a | FileCheck %s
diff --git a/test/Object/corrupt.test b/test/Object/corrupt.test
new file mode 100644
index 0000000..ef72a09
--- /dev/null
+++ b/test/Object/corrupt.test
@@ -0,0 +1,24 @@
+// Section name offset overflows section name string table.
+RUN: not llvm-readobj %p/Inputs/corrupt.elf-x86-64 -sections \
+RUN:     2>&1 | FileCheck --check-prefix=SECNAME %s
+
+// Section data offset past end of file.
+RUN: not llvm-readobj %p/Inputs/corrupt.elf-x86-64 -sections -section-data \
+RUN:     2>&1 | FileCheck --check-prefix=SECDATA %s
+
+// Symbol name offset overflows string table.
+RUN: not llvm-readobj %p/Inputs/corrupt.elf-x86-64 -symbols \
+RUN:     2>&1 | FileCheck --check-prefix=SYMNAME %s
+
+// Version index in .gnu.version overflows the version map.
+RUN: not llvm-readobj %p/Inputs/corrupt-version.elf-x86_64 -dt \
+RUN:     2>&1 | FileCheck --check-prefix=VER %s
+
+SECNAME: Error reading file: Invalid data was encountered while parsing the file.
+
+SECDATA: Error reading file: Invalid data was encountered while parsing the file.
+SECDATA: Error reading file: Invalid data was encountered while parsing the file.
+
+SYMNAME: Error reading file: Invalid data was encountered while parsing the file.
+
+VER: Error reading file: Invalid data was encountered while parsing the file.
diff --git a/test/Object/directory.ll b/test/Object/directory.ll
index bf8ff32..48eefcb 100644
--- a/test/Object/directory.ll
+++ b/test/Object/directory.ll
@@ -2,9 +2,6 @@
 ;RUN: not llvm-ar r %T/test.a . 2>&1 | FileCheck %s
 ;CHECK: .: Is a directory
 
-; Opening a directory works on cygwin and freebsd.
-;XFAIL: freebsd, cygwin
-
 ;RUN: rm -f %T/test.a
 ;RUN: touch %T/a-very-long-file-name
 ;RUN: llvm-ar r %T/test.a %s %T/a-very-long-file-name
diff --git a/test/Object/nm-trivial-object.test b/test/Object/nm-trivial-object.test
index d517745..748d6f2 100644
--- a/test/Object/nm-trivial-object.test
+++ b/test/Object/nm-trivial-object.test
@@ -25,10 +25,10 @@ ELF:          U puts
 
 
 macho: 00000000 U _SomeOtherFunction
-macho: 00000000 s _main
+macho: 00000000 T _main
 macho: 00000000 U _puts
 
 macho64: 00000028 s L_.str
-macho64: 00000000 u _SomeOtherFunction
-macho64: 00000000 s _main
-macho64: 00000000 u _puts
+macho64: 00000000 U _SomeOtherFunction
+macho64: 00000000 T _main
+macho64: 00000000 U _puts
diff --git a/test/Object/nm-weak-global-macho.test b/test/Object/nm-weak-global-macho.test
new file mode 100644
index 0000000..ede2609
--- /dev/null
+++ b/test/Object/nm-weak-global-macho.test
@@ -0,0 +1,3 @@
+RUN: llvm-nm %p/Inputs/weak-global-symbol.macho-i386 | FileCheck %s
+
+CHECK: 00000000 S _a
diff --git a/test/Other/X86/lit.local.cfg b/test/Other/X86/lit.local.cfg
index da2db5a..ba763cf 100644
--- a/test/Other/X86/lit.local.cfg
+++ b/test/Other/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Other/attribute-comment.ll b/test/Other/attribute-comment.ll
index 7354e7f..d12b179 100644
--- a/test/Other/attribute-comment.ll
+++ b/test/Other/attribute-comment.ll
@@ -6,4 +6,4 @@ define void @test1() #0 {
   ret void
 }
 
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" uwtable "no-frame-pointer-elim"="true" readnone "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" uwtable "no-frame-pointer-elim"="true" readnone "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Other/close-stderr.ll b/test/Other/close-stderr.ll
index 6e180cd..b310bc2 100644
--- a/test/Other/close-stderr.ll
+++ b/test/Other/close-stderr.ll
@@ -9,8 +9,5 @@
 ; XFAIL: vg_leak
 ; REQUIRES: shell
 
-; opt will fail to open /dev/null on native win32.
-; XFAIL: win32
-
 ; Test that the error handling when writing to stderr fails exits the
 ; program cleanly rather than aborting.
diff --git a/test/Other/constant-fold-gep-address-spaces.ll b/test/Other/constant-fold-gep-address-spaces.ll
new file mode 100644
index 0000000..f6abe74
--- /dev/null
+++ b/test/Other/constant-fold-gep-address-spaces.ll
@@ -0,0 +1,235 @@
+; "PLAIN" - No optimizations. This tests the target-independent
+; constant folder.
+; RUN: opt -S -o - %s | FileCheck --check-prefix=PLAIN %s
+
+target datalayout = "e-p:128:128:128-p1:32:32:32-p2:8:8:8-p3:16:16:16-p4:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32"
+
+; The automatic constant folder in opt does not have targetdata access, so
+; it can't fold gep arithmetic, in general. However, the constant folder run
+; from instcombine and global opt can use targetdata.
+; PLAIN: @G8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -1)
+@G8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -1)
+; PLAIN: @G1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i8 1 to i1 addrspace(2)*), i8 -1)
+@G1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i8 1 to i1 addrspace(2)*), i8 -1)
+; PLAIN: @F8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -2)
+@F8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -2)
+; PLAIN: @F1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i8 1 to i1 addrspace(2)*), i8 -2)
+@F1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i8 1 to i1 addrspace(2)*), i8 -2)
+; PLAIN: @H8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* null, i32 -1)
+@H8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 0 to i8 addrspace(1)*), i32 -1)
+; PLAIN: @H1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* null, i8 -1)
+@H1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i8 0 to i1 addrspace(2)*), i8 -1)
+
+
+; The target-independent folder should be able to do some clever
+; simplifications on sizeof, alignof, and offsetof expressions. The
+; target-dependent folder should fold these down to constants.
+; PLAIN-X: @a = constant i64 mul (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 2310)
+@a = constant i64 mul (i64 3, i64 mul (i64 ptrtoint ({[7 x double], [7 x double]} addrspace(4)* getelementptr ({[7 x double], [7 x double]} addrspace(4)* null, i64 11) to i64), i64 5))
+
+; PLAIN-X: @b = constant i64 ptrtoint (double addrspace(4)* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64)
+@b = constant i64 ptrtoint ([13 x double] addrspace(4)* getelementptr ({i1, [13 x double]} addrspace(4)* null, i64 0, i32 1) to i64)
+
+; PLAIN-X: @c = constant i64 mul nuw (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 2)
+@c = constant i64 ptrtoint (double addrspace(4)* getelementptr ({double, double, double, double} addrspace(4)* null, i64 0, i32 2) to i64)
+
+; PLAIN-X: @d = constant i64 mul nuw (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 11)
+@d = constant i64 ptrtoint (double addrspace(4)* getelementptr ([13 x double] addrspace(4)* null, i64 0, i32 11) to i64)
+
+; PLAIN-X: @e = constant i64 ptrtoint (double addrspace(4)* getelementptr ({ double, float, double, double }* null, i64 0, i32 2) to i64)
+@e = constant i64 ptrtoint (double addrspace(4)* getelementptr ({double, float, double, double} addrspace(4)* null, i64 0, i32 2) to i64)
+
+; PLAIN-X: @f = constant i64 1
+@f = constant i64 ptrtoint (<{ i16, i128 }> addrspace(4)* getelementptr ({i1, <{ i16, i128 }>} addrspace(4)* null, i64 0, i32 1) to i64)
+
+; PLAIN-X: @g = constant i64 ptrtoint (double addrspace(4)* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64)
+@g = constant i64 ptrtoint ({double, double} addrspace(4)* getelementptr ({i1, {double, double}} addrspace(4)* null, i64 0, i32 1) to i64)
+
+; PLAIN-X: @h = constant i64 ptrtoint (i1 addrspace(2)* getelementptr (i1 addrspace(2)* null, i32 1) to i64)
+@h = constant i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i64 1) to i64)
+
+; PLAIN-X: @i = constant i64 ptrtoint (i1 addrspace(2)* getelementptr ({ i1, i1 addrspace(2)* }* null, i64 0, i32 1) to i64)
+@i = constant i64 ptrtoint (double addrspace(4)* getelementptr ({i1, double} addrspace(4)* null, i64 0, i32 1) to i64)
+
+; The target-dependent folder should cast GEP indices to integer-sized pointers.
+
+; PLAIN: @M = constant i64 addrspace(4)* getelementptr (i64 addrspace(4)* null, i32 1)
+; PLAIN: @N = constant i64 addrspace(4)* getelementptr ({ i64, i64 } addrspace(4)* null, i32 0, i32 1)
+; PLAIN: @O = constant i64 addrspace(4)* getelementptr ([2 x i64] addrspace(4)* null, i32 0, i32 1)
+
+@M = constant i64 addrspace(4)* getelementptr (i64 addrspace(4)* null, i32 1)
+@N = constant i64 addrspace(4)* getelementptr ({ i64, i64 } addrspace(4)* null, i32 0, i32 1)
+@O = constant i64 addrspace(4)* getelementptr ([2 x i64] addrspace(4)* null, i32 0, i32 1)
+
+; Fold GEP of a GEP. Very simple cases are folded.
+
+; PLAIN-X: @Y = global [3 x { i32, i32 }]addrspace(3)* getelementptr inbounds ([3 x { i32, i32 }]addrspace(3)* @ext, i64 2)
+@ext = external addrspace(3) global [3 x { i32, i32 }]
+@Y = global [3 x { i32, i32 }]addrspace(3)* getelementptr inbounds ([3 x { i32, i32 }]addrspace(3)* getelementptr inbounds ([3 x { i32, i32 }]addrspace(3)* @ext, i64 1), i64 1)
+
+; PLAIN-X: @Z = global i32addrspace(3)* getelementptr inbounds (i32addrspace(3)* getelementptr inbounds ([3 x { i32, i32 }]addrspace(3)* @ext, i64 0, i64 1, i32 0), i64 1)
+@Z = global i32addrspace(3)* getelementptr inbounds (i32addrspace(3)* getelementptr inbounds ([3 x { i32, i32 }]addrspace(3)* @ext, i64 0, i64 1, i32 0), i64 1)
+
+
+; Duplicate all of the above as function return values rather than
+; global initializers.
+
+; PLAIN: define i8 addrspace(1)* @goo8() #0 {
+; PLAIN:   %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -1) to i8 addrspace(1)*
+; PLAIN:   ret i8 addrspace(1)* %t
+; PLAIN: }
+; PLAIN: define i1 addrspace(2)* @goo1() #0 {
+; PLAIN:   %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i32 1 to i1 addrspace(2)*), i32 -1) to i1 addrspace(2)*
+; PLAIN:   ret i1 addrspace(2)* %t
+; PLAIN: }
+; PLAIN: define i8 addrspace(1)* @foo8() #0 {
+; PLAIN:   %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -2) to i8 addrspace(1)*
+; PLAIN:   ret i8 addrspace(1)* %t
+; PLAIN: }
+; PLAIN: define i1 addrspace(2)* @foo1() #0 {
+; PLAIN:   %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i32 1 to i1 addrspace(2)*), i32 -2) to i1 addrspace(2)*
+; PLAIN:   ret i1 addrspace(2)* %t
+; PLAIN: }
+; PLAIN: define i8 addrspace(1)* @hoo8() #0 {
+; PLAIN:   %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* null, i32 -1) to i8 addrspace(1)*
+; PLAIN:   ret i8 addrspace(1)* %t
+; PLAIN: }
+; PLAIN: define i1 addrspace(2)* @hoo1() #0 {
+; PLAIN:   %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* null, i32 -1) to i1 addrspace(2)*
+; PLAIN:   ret i1 addrspace(2)* %t
+; PLAIN: }
+define i8 addrspace(1)* @goo8() #0 {
+  %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -1) to i8 addrspace(1)*
+  ret i8 addrspace(1)* %t
+}
+define i1 addrspace(2)* @goo1() #0 {
+  %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i32 1 to i1 addrspace(2)*), i32 -1) to i1 addrspace(2)*
+  ret i1 addrspace(2)* %t
+}
+define i8 addrspace(1)* @foo8() #0 {
+  %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -2) to i8 addrspace(1)*
+  ret i8 addrspace(1)* %t
+}
+define i1 addrspace(2)* @foo1() #0 {
+  %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i32 1 to i1 addrspace(2)*), i32 -2) to i1 addrspace(2)*
+  ret i1 addrspace(2)* %t
+}
+define i8 addrspace(1)* @hoo8() #0 {
+  %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 0 to i8 addrspace(1)*), i32 -1) to i8 addrspace(1)*
+  ret i8 addrspace(1)* %t
+}
+define i1 addrspace(2)* @hoo1() #0 {
+  %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i32 0 to i1 addrspace(2)*), i32 -1) to i1 addrspace(2)*
+  ret i1 addrspace(2)* %t
+}
+
+; PLAIN-X: define i64 @fa() #0 {
+; PLAIN-X:   %t = bitcast i64 mul (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 2310) to i64
+; PLAIN-X:   ret i64 %t
+; PLAIN-X: }
+; PLAIN-X: define i64 @fb() #0 {
+; PLAIN-X:   %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64) to i64
+; PLAIN-X:   ret i64 %t
+; PLAIN-X: }
+; PLAIN-X: define i64 @fc() #0 {
+; PLAIN-X:   %t = bitcast i64 mul nuw (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 2) to i64
+; PLAIN-X:   ret i64 %t
+; PLAIN-X: }
+; PLAIN-X: define i64 @fd() #0 {
+; PLAIN-X:   %t = bitcast i64 mul nuw (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 11) to i64
+; PLAIN-X:   ret i64 %t
+; PLAIN-X: }
+; PLAIN-X: define i64 @fe() #0 {
+; PLAIN-X:   %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({ double, float, double, double }* null, i64 0, i32 2) to i64) to i64
+; PLAIN-X:   ret i64 %t
+; PLAIN-X: }
+; PLAIN-X: define i64 @ff() #0 {
+; PLAIN-X:   %t = bitcast i64 1 to i64
+; PLAIN-X:   ret i64 %t
+; PLAIN-X: }
+; PLAIN-X: define i64 @fg() #0 {
+; PLAIN-X:   %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64) to i64
+; PLAIN-X:   ret i64 %t
+; PLAIN-X: }
+; PLAIN-X: define i64 @fh() #0 {
+; PLAIN-X:   %t = bitcast i64 ptrtoint (i1 addrspace(2)* getelementptr (i1 addrspace(2)* null, i32 1) to i64) to i64
+; PLAIN-X:   ret i64 %t
+; PLAIN-X: }
+; PLAIN-X: define i64 @fi() #0 {
+; PLAIN-X:   %t = bitcast i64 ptrtoint (i1 addrspace(2)* getelementptr ({ i1, i1 addrspace(2)* }* null, i64 0, i32 1) to i64) to i64
+; PLAIN-X:   ret i64 %t
+; PLAIN-X: }
+define i64 @fa() #0 {
+  %t = bitcast i64 mul (i64 3, i64 mul (i64 ptrtoint ({[7 x double], [7 x double]}* getelementptr ({[7 x double], [7 x double]}* null, i64 11) to i64), i64 5)) to i64
+  ret i64 %t
+}
+define i64 @fb() #0 {
+  %t = bitcast i64 ptrtoint ([13 x double] addrspace(4)* getelementptr ({i1, [13 x double]} addrspace(4)* null, i64 0, i32 1) to i64) to i64
+  ret i64 %t
+}
+define i64 @fc() #0 {
+  %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({double, double, double, double} addrspace(4)* null, i64 0, i32 2) to i64) to i64
+  ret i64 %t
+}
+define i64 @fd() #0 {
+  %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ([13 x double] addrspace(4)* null, i64 0, i32 11) to i64) to i64
+  ret i64 %t
+}
+define i64 @fe() #0 {
+  %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({double, float, double, double} addrspace(4)* null, i64 0, i32 2) to i64) to i64
+  ret i64 %t
+}
+define i64 @ff() #0 {
+  %t = bitcast i64 ptrtoint (<{ i16, i128 }> addrspace(4)* getelementptr ({i1, <{ i16, i128 }>} addrspace(4)* null, i64 0, i32 1) to i64) to i64
+  ret i64 %t
+}
+define i64 @fg() #0 {
+  %t = bitcast i64 ptrtoint ({double, double} addrspace(4)* getelementptr ({i1, {double, double}} addrspace(4)* null, i64 0, i32 1) to i64) to i64
+  ret i64 %t
+}
+define i64 @fh() #0 {
+  %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64) to i64
+  ret i64 %t
+}
+define i64 @fi() #0 {
+  %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({i1, double}addrspace(4)* null, i64 0, i32 1) to i64) to i64
+  ret i64 %t
+}
+
+; PLAIN: define i64* @fM() #0 {
+; PLAIN:   %t = bitcast i64* getelementptr (i64* null, i32 1) to i64*
+; PLAIN:   ret i64* %t
+; PLAIN: }
+; PLAIN: define i64* @fN() #0 {
+; PLAIN:   %t = bitcast i64* getelementptr ({ i64, i64 }* null, i32 0, i32 1) to i64*
+; PLAIN:   ret i64* %t
+; PLAIN: }
+; PLAIN: define i64* @fO() #0 {
+; PLAIN:   %t = bitcast i64* getelementptr ([2 x i64]* null, i32 0, i32 1) to i64*
+; PLAIN:   ret i64* %t
+; PLAIN: }
+
+define i64* @fM() #0 {
+  %t = bitcast i64* getelementptr (i64* null, i32 1) to i64*
+  ret i64* %t
+}
+define i64* @fN() #0 {
+  %t = bitcast i64* getelementptr ({ i64, i64 }* null, i32 0, i32 1) to i64*
+  ret i64* %t
+}
+define i64* @fO() #0 {
+  %t = bitcast i64* getelementptr ([2 x i64]* null, i32 0, i32 1) to i64*
+  ret i64* %t
+}
+
+; PLAIN: define i32 addrspace(1)* @fZ() #0 {
+; PLAIN:   %t = bitcast i32 addrspace(1)* getelementptr inbounds (i32 addrspace(1)* getelementptr inbounds ([3 x { i32, i32 }] addrspace(1)* @ext2, i64 0, i64 1, i32 0), i64 1) to i32 addrspace(1)*
+; PLAIN:   ret i32 addrspace(1)* %t
+; PLAIN: }
+@ext2 = external addrspace(1) global [3 x { i32, i32 }]
+define i32 addrspace(1)* @fZ() #0 {
+  %t = bitcast i32 addrspace(1)* getelementptr inbounds (i32 addrspace(1)* getelementptr inbounds ([3 x { i32, i32 }] addrspace(1)* @ext2, i64 0, i64 1, i32 0), i64 1) to i32 addrspace(1)*
+  ret i32 addrspace(1)* %t
+}
+
+attributes #0 = { nounwind }
diff --git a/test/Other/constant-fold-gep.ll b/test/Other/constant-fold-gep.ll
index 44b6628..aed4145 100644
--- a/test/Other/constant-fold-gep.ll
+++ b/test/Other/constant-fold-gep.ll
@@ -454,10 +454,10 @@ define i32* @fZ() nounwind {
 
 define i8* @different_addrspace() nounwind noinline {
 ; OPT: different_addrspace
-  %p = getelementptr inbounds i8* bitcast ([4 x i8] addrspace(12)* @p12 to i8*),
+  %p = getelementptr inbounds i8* addrspacecast ([4 x i8] addrspace(12)* @p12 to i8*),
                                   i32 2
   ret i8* %p
-; OPT: ret i8* getelementptr (i8* bitcast ([4 x i8] addrspace(12)* @p12 to i8*), i32 2)
+; OPT: ret i8* getelementptr (i8* addrspacecast ([4 x i8] addrspace(12)* @p12 to i8*), i32 2)
 }
 
 define i8* @same_addrspace() nounwind noinline {
diff --git a/test/Other/extract-alias.ll b/test/Other/extract-alias.ll
index d5bab4b..d1e4af5 100644
--- a/test/Other/extract-alias.ll
+++ b/test/Other/extract-alias.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-extract -func foo -S < %s | FileCheck %s
 ; RUN: llvm-extract -delete -func foo -S < %s | FileCheck --check-prefix=DELETE %s
 ; RUN: llvm-extract -alias zeda0 -S < %s | FileCheck --check-prefix=ALIAS %s
-; RUN: llvm-extract -ralias .*bar -S < %s | FileCheck --check-prefix=ALIASRE %s
+; RUN: llvm-extract -ralias '.*bar' -S < %s | FileCheck --check-prefix=ALIASRE %s
 
 ; Both aliases should be converted to declarations
 ; CHECK:      @zeda0 = external global i32
diff --git a/test/Other/extract-linkonce.ll b/test/Other/extract-linkonce.ll
index 31fbf3a..4c6b6b7 100644
--- a/test/Other/extract-linkonce.ll
+++ b/test/Other/extract-linkonce.ll
@@ -1,15 +1,16 @@
 ; RUN: llvm-extract -func foo -S < %s | FileCheck %s
 ; RUN: llvm-extract -delete -func foo -S < %s | FileCheck --check-prefix=DELETE %s
 
-; Test that we don't convert weak_odr to external definitions.
+; Test that linkonce definitions are mapped to weak so that they are not
+; dropped.
 
-; CHECK:      @bar = external hidden global i32
-; CHECK:      define hidden i32* @foo() {
+; CHECK:      @bar = external global i32
+; CHECK:      define weak i32* @foo() {
 ; CHECK-NEXT:  ret i32* @bar
 ; CHECK-NEXT: }
 
-; DELETE: @bar = hidden global i32 42
-; DELETE: declare hidden i32* @foo()
+; DELETE: @bar = weak global i32 42
+; DELETE: declare i32* @foo()
 
 @bar = linkonce global i32 42
 
diff --git a/test/Other/lit.local.cfg b/test/Other/lit.local.cfg
deleted file mode 100644
index 67c7ec7..0000000
--- a/test/Other/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.txt', '.test']
diff --git a/test/Other/optimize-options.ll b/test/Other/optimize-options.ll
index 888a78f..22dd842 100644
--- a/test/Other/optimize-options.ll
+++ b/test/Other/optimize-options.ll
@@ -1,8 +1,8 @@
-;RUN: opt -S -O1 -debug-pass=Arguments 2>&1 | FileCheck %s
-;RUN: opt -S -O2 -debug-pass=Arguments 2>&1 | FileCheck %s
-;RUN: opt -S -Os -debug-pass=Arguments 2>&1 | FileCheck %s
-;RUN: opt -S -Oz -debug-pass=Arguments 2>&1 | FileCheck %s
-;RUN: opt -S -O3 -debug-pass=Arguments 2>&1 | FileCheck %s
+;RUN: opt -S -O1 -debug-pass=Arguments %s 2>&1 | FileCheck %s
+;RUN: opt -S -O2 -debug-pass=Arguments %s 2>&1 | FileCheck %s
+;RUN: opt -S -Os -debug-pass=Arguments %s 2>&1 | FileCheck %s
+;RUN: opt -S -Oz -debug-pass=Arguments %s 2>&1 | FileCheck %s
+;RUN: opt -S -O3 -debug-pass=Arguments %s 2>&1 | FileCheck %s
 
-; Just check that we get a non-empty set of passes for each -O opton.
+; Just check that we get a non-empty set of passes for each -O option.
 ;CHECK: Pass Arguments: {{.*}} -print-module
diff --git a/test/TableGen/2003-08-03-PassCode.td b/test/TableGen/2003-08-03-PassCode.td
index de7d626..b851a15 100644
--- a/test/TableGen/2003-08-03-PassCode.td
+++ b/test/TableGen/2003-08-03-PassCode.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen %s
-// XFAIL: vg_leak
 
 class test<code C> {
   code Code = C;
diff --git a/test/TableGen/2006-09-18-LargeInt.td b/test/TableGen/2006-09-18-LargeInt.td
index 94cd1ec..5380212 100644
--- a/test/TableGen/2006-09-18-LargeInt.td
+++ b/test/TableGen/2006-09-18-LargeInt.td
@@ -1,4 +1,6 @@
-// RUN: llvm-tblgen %s | grep -- 4294901760
+// RUN: llvm-tblgen %s | FileCheck %s
+
+// CHECK: 4294901760
 
 def X {
   int Y = 0xFFFF0000;
diff --git a/test/TableGen/2010-03-24-PrematureDefaults.td b/test/TableGen/2010-03-24-PrematureDefaults.td
index 716a1d5..24f6c93 100644
--- a/test/TableGen/2010-03-24-PrematureDefaults.td
+++ b/test/TableGen/2010-03-24-PrematureDefaults.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class A<int k, bits<2> x = 1> {
   int K = k;
diff --git a/test/TableGen/CStyleComment.td b/test/TableGen/CStyleComment.td
index 55fb0e7..9c50f7e 100644
--- a/test/TableGen/CStyleComment.td
+++ b/test/TableGen/CStyleComment.td
@@ -1,7 +1,6 @@
 // Test that multiline, nested, comments work correctly.
 //
 // RUN: llvm-tblgen < %s
-// XFAIL: vg_leak
 
 /* Foo
   bar
diff --git a/test/TableGen/Dag.td b/test/TableGen/Dag.td
index 14d616b..fea3aee 100644
--- a/test/TableGen/Dag.td
+++ b/test/TableGen/Dag.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 //===----------------------------------------------------------------------===//
 // Substitution of an int.
diff --git a/test/TableGen/DefmInherit.td b/test/TableGen/DefmInherit.td
index b52a709..bfbb435 100644
--- a/test/TableGen/DefmInherit.td
+++ b/test/TableGen/DefmInherit.td
@@ -1,4 +1,11 @@
-// RUN: llvm-tblgen %s | grep "zing = 4" | count 4
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK-NOT: zing = 4
 
 class C1<int A, string B> { 
   int bar = A;
diff --git a/test/TableGen/DefmInsideMultiClass.td b/test/TableGen/DefmInsideMultiClass.td
index 0aea212..d34974d 100644
--- a/test/TableGen/DefmInsideMultiClass.td
+++ b/test/TableGen/DefmInsideMultiClass.td
@@ -1,4 +1,8 @@
-// RUN: llvm-tblgen %s | grep ADDPSrr | count 1
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+// CHECK: ADDPSrr
+// CHECK-NOT: ADDPSrr
 
 class Instruction<bits<4> opc, string Name> {
   bits<4> opcode = opc;
diff --git a/test/TableGen/ForeachList.td b/test/TableGen/ForeachList.td
index 9bc76e0..99b7e14 100644
--- a/test/TableGen/ForeachList.td
+++ b/test/TableGen/ForeachList.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Register<string name, int idx> {
   string Name = name;
diff --git a/test/TableGen/ForeachLoop.td b/test/TableGen/ForeachLoop.td
index a49a60b..4aacc74 100644
--- a/test/TableGen/ForeachLoop.td
+++ b/test/TableGen/ForeachLoop.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Register<string name, int idx> {
   string Name = name;
diff --git a/test/TableGen/Include.td b/test/TableGen/Include.td
index 8783638..1cb779f 100644
--- a/test/TableGen/Include.td
+++ b/test/TableGen/Include.td
@@ -1,5 +1,5 @@
 // RUN: llvm-tblgen -I %p %s
-// XFAIL: vg_leak
+
 def BeforeInclude;
 
 include "Include.inc"
diff --git a/test/TableGen/IntBitInit.td b/test/TableGen/IntBitInit.td
index 83713a3..4e150f1 100644
--- a/test/TableGen/IntBitInit.td
+++ b/test/TableGen/IntBitInit.td
@@ -1,5 +1,5 @@
 // RUN: llvm-tblgen %s
-// XFAIL: vg_leak
+
 def {
   bit A = 1;
   int B = A;
diff --git a/test/TableGen/LazyChange.td b/test/TableGen/LazyChange.td
index 919a1a7..2ad6191 100644
--- a/test/TableGen/LazyChange.td
+++ b/test/TableGen/LazyChange.td
@@ -1,4 +1,6 @@
-// RUN: llvm-tblgen %s | grep "int Y = 3"
+// RUN: llvm-tblgen %s | FileCheck %s
+
+// CHECK: int Y = 3
 
 class C {
   int X = 4;
diff --git a/test/TableGen/LetInsideMultiClasses.td b/test/TableGen/LetInsideMultiClasses.td
index 72f48b6..095f37b 100644
--- a/test/TableGen/LetInsideMultiClasses.td
+++ b/test/TableGen/LetInsideMultiClasses.td
@@ -1,4 +1,10 @@
-// RUN: llvm-tblgen %s | grep "bit IsDouble = 1;" | count 3
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+// CHECK: bit IsDouble = 1;
+// CHECK: bit IsDouble = 1;
+// CHECK: bit IsDouble = 1;
+// CHECK-NOT: bit IsDouble = 1;
 
 class Instruction<bits<4> opc, string Name> {
   bits<4> opcode = opc;
diff --git a/test/TableGen/ListOfList.td b/test/TableGen/ListOfList.td
index adf9fe4..56f964e 100644
--- a/test/TableGen/ListOfList.td
+++ b/test/TableGen/ListOfList.td
@@ -1,6 +1,5 @@
-// RUN llvm-tblgen %s | FileCheck %s
-
-// RUN: llvm-tblgen %s | grep "foo" | count 1
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Base<string t> {
   string text = t;
@@ -11,3 +10,4 @@ class Derived<list<list<string>> thetext> : Base<thetext[0][0]>;
 def FOO : Derived<[["foo"]]>;
 
 // CHECK: text = "foo"
+// CHECK-NOT: text = "foo"
diff --git a/test/TableGen/LoLoL.td b/test/TableGen/LoLoL.td
index f758e1b..778c960 100644
--- a/test/TableGen/LoLoL.td
+++ b/test/TableGen/LoLoL.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Base<list<int> v> {
   list<int> values = v;
diff --git a/test/TableGen/MultiClass.td b/test/TableGen/MultiClass.td
index ef320cf..9c39963 100644
--- a/test/TableGen/MultiClass.td
+++ b/test/TableGen/MultiClass.td
@@ -1,4 +1,9 @@
-// RUN: llvm-tblgen %s | grep "zing = 4" | count 2
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK-NOT: zing = 4
 
 class C1<int A, string B> { 
   int bar = A;
diff --git a/test/TableGen/MultiClassDefName.td b/test/TableGen/MultiClassDefName.td
index 75d6af5..d3c6de7 100644
--- a/test/TableGen/MultiClassDefName.td
+++ b/test/TableGen/MultiClassDefName.td
@@ -1,4 +1,8 @@
-// RUN: llvm-tblgen %s | grep WorldHelloCC | count 1
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+// CHECK: WorldHelloCC
+// CHECK-NOT: WorldHelloCC
 
 class C<string n> {
   string name = n;
diff --git a/test/TableGen/MultiClassInherit.td b/test/TableGen/MultiClassInherit.td
index 9d1470a..04fef2c 100644
--- a/test/TableGen/MultiClassInherit.td
+++ b/test/TableGen/MultiClassInherit.td
@@ -1,4 +1,36 @@
-// RUN: llvm-tblgen %s | grep "zing = 4" | count 28
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+// "zing = 4" x 28
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK: zing = 4
+// CHECK-NOT: zing = 4
 
 class C1<int A, string B> { 
   int bar = A;
diff --git a/test/TableGen/MultiPat.td b/test/TableGen/MultiPat.td
index b49b06c..b379277 100644
--- a/test/TableGen/MultiPat.td
+++ b/test/TableGen/MultiPat.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class ValueType<int size, int value> {
   int Size = size;
diff --git a/test/TableGen/NestedForeach.td b/test/TableGen/NestedForeach.td
index 5b63175..e8c16f7 100644
--- a/test/TableGen/NestedForeach.td
+++ b/test/TableGen/NestedForeach.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Droid<string series, int release, string model, int patchlevel> {
   string Series = series;
diff --git a/test/TableGen/Paste.td b/test/TableGen/Paste.td
index 33d61cc..a7e2a5b 100644
--- a/test/TableGen/Paste.td
+++ b/test/TableGen/Paste.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Instr<int i> {
   int index = i;
diff --git a/test/TableGen/SetTheory.td b/test/TableGen/SetTheory.td
index f26b9e6..7613323 100644
--- a/test/TableGen/SetTheory.td
+++ b/test/TableGen/SetTheory.td
@@ -1,5 +1,6 @@
 // Test evaluation of set operations in dags.
 // RUN: llvm-tblgen -print-sets %s | FileCheck %s
+// XFAIL: vg_leak
 //
 // The -print-sets driver configures a primitive SetTheory instance that
 // understands these sets:
diff --git a/test/TableGen/SiblingForeach.td b/test/TableGen/SiblingForeach.td
index e4c4704..a11f6f8 100644
--- a/test/TableGen/SiblingForeach.td
+++ b/test/TableGen/SiblingForeach.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Set<int i = 0, int j = 0, int k = 0> {
   int I = i;
diff --git a/test/TableGen/Slice.td b/test/TableGen/Slice.td
index 7a35d31..89deaef 100644
--- a/test/TableGen/Slice.td
+++ b/test/TableGen/Slice.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class ValueType<int size, int value> {
   int Size = size;
diff --git a/test/TableGen/String.td b/test/TableGen/String.td
index c71ed50..576ba81 100644
--- a/test/TableGen/String.td
+++ b/test/TableGen/String.td
@@ -1,5 +1,5 @@
 // RUN: llvm-tblgen %s 
-// XFAIL: vg_leak
+
 class x {
   string y = "missing terminating '\"' character";
 }
diff --git a/test/TableGen/TargetInstrSpec.td b/test/TableGen/TargetInstrSpec.td
index bf2d257..32253a3 100644
--- a/test/TableGen/TargetInstrSpec.td
+++ b/test/TableGen/TargetInstrSpec.td
@@ -1,5 +1,11 @@
-// RUN: llvm-tblgen %s | grep '\[(set VR128:$dst, (int_x86_sse2_add_pd VR128:$src1, VR128:$src2))\]' | count 1
-// RUN: llvm-tblgen %s | grep '\[(set VR128:$dst, (int_x86_sse2_add_ps VR128:$src1, VR128:$src2))\]' | count 1
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+// CHECK: [(set VR128:$dst, (int_x86_sse2_add_pd VR128:$src1, VR128:$src2))]
+// CHECK-NOT: [(set VR128:$dst, (int_x86_sse2_add_pd VR128:$src1, VR128:$src2))]
+
+// CHECK: [(set VR128:$dst, (int_x86_sse2_add_ps VR128:$src1, VR128:$src2))]
+// CHECK-NOT: [(set VR128:$dst, (int_x86_sse2_add_ps VR128:$src1, VR128:$src2))]
 
 class ValueType<int size, int value> {
   int Size = size;
diff --git a/test/TableGen/TwoLevelName.td b/test/TableGen/TwoLevelName.td
index e886962..9c502f4 100644
--- a/test/TableGen/TwoLevelName.td
+++ b/test/TableGen/TwoLevelName.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Type<string name, int length, int width> {
   string Name = name;
diff --git a/test/TableGen/cast.td b/test/TableGen/cast.td
index b9e4b37..a8bd207 100644
--- a/test/TableGen/cast.td
+++ b/test/TableGen/cast.td
@@ -1,4 +1,10 @@
-// RUN: llvm-tblgen %s | grep "add_ps" | count 3
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+// CHECK: add_ps
+// CHECK: add_ps
+// CHECK: add_ps
+// CHECK-NOT: add_ps
 
 class ValueType<int size, int value> {
   int Size = size;
diff --git a/test/TableGen/defmclass.td b/test/TableGen/defmclass.td
index 6198c00..80f03b3 100644
--- a/test/TableGen/defmclass.td
+++ b/test/TableGen/defmclass.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class XD { bits<4> Prefix = 11; }
 // CHECK: Prefix = { 1, 1, 0, 0 };
diff --git a/test/TableGen/eq.td b/test/TableGen/eq.td
index fc3ad42..f8daf88 100644
--- a/test/TableGen/eq.td
+++ b/test/TableGen/eq.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 // CHECK: Value = 0
 // CHECK: Value = 1
 
diff --git a/test/TableGen/eqbit.td b/test/TableGen/eqbit.td
index b77b1a2..1d58fa0 100644
--- a/test/TableGen/eqbit.td
+++ b/test/TableGen/eqbit.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 // CHECK: a = 6
 // CHECK: a = 5
 
diff --git a/test/TableGen/foreach.td b/test/TableGen/foreach.td
index 7b7c199..541da49 100644
--- a/test/TableGen/foreach.td
+++ b/test/TableGen/foreach.td
@@ -1,6 +1,14 @@
-// RUN: llvm-tblgen %s | grep 'Jr' | count 2
-// RUN: llvm-tblgen %s | grep 'Sr' | count 2
-// RUN: llvm-tblgen %s | grep '"NAME"' | count 1
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+// CHECK: Classes
+// CHECK: Sr
+// CHECK: Jr
+// CHECK: "NAME"
+
+// CHECK: Defs
+// CHECK: Jr
+// CHECK: Sr
 
 // Variables for foreach
 class decls {
diff --git a/test/TableGen/if.td b/test/TableGen/if.td
index e4df74f..1d8d623 100644
--- a/test/TableGen/if.td
+++ b/test/TableGen/if.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 // Support for an `!if' operator as part of a `let' statement.
 // CHECK:      class C
diff --git a/test/TableGen/ifbit.td b/test/TableGen/ifbit.td
index e334121..88f575e 100644
--- a/test/TableGen/ifbit.td
+++ b/test/TableGen/ifbit.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 // CHECK: a = 6
 // CHECK: a = 5
 
diff --git a/test/TableGen/intrinsic-order.td b/test/TableGen/intrinsic-order.td
index 5eadf60..13c2db2 100644
--- a/test/TableGen/intrinsic-order.td
+++ b/test/TableGen/intrinsic-order.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen -gen-intrinsic %s | FileCheck %s
+// XFAIL: vg_leak
 
 class IntrinsicProperty;
 
diff --git a/test/TableGen/intrinsic-varargs.td b/test/TableGen/intrinsic-varargs.td
new file mode 100644
index 0000000..3e48f8d
--- /dev/null
+++ b/test/TableGen/intrinsic-varargs.td
@@ -0,0 +1,30 @@
+// RUN: llvm-tblgen -gen-intrinsic %s | FileCheck %s
+// XFAIL: vg_leak
+
+class IntrinsicProperty;
+
+class ValueType<int size, int value> {
+  string Namespace = "MVT";
+  int Size = size;
+  int Value = value;
+}
+
+class LLVMType<ValueType vt> {
+  ValueType VT = vt;
+}
+
+class Intrinsic<string name, list<LLVMType> param_types = []> {
+  string LLVMName = name;
+  bit isTarget = 0;
+  string TargetPrefix = "";
+  list<LLVMType> RetTypes = [];
+  list<LLVMType> ParamTypes = param_types;
+  list<IntrinsicProperty> Properties = [];
+}
+
+// isVoid needs to match the definition in ValueTypes.td
+def isVoid : ValueType<0, 56>;   // Produces no value
+def llvm_vararg_ty : LLVMType<isVoid>;   // this means vararg here
+
+// CHECK: /* 0 */ 0, 27, 0,
+def int_foo : Intrinsic<"llvm.foo", [llvm_vararg_ty]>;
diff --git a/test/TableGen/lisp.td b/test/TableGen/lisp.td
index efe0002..9e58605 100644
--- a/test/TableGen/lisp.td
+++ b/test/TableGen/lisp.td
@@ -1,4 +1,20 @@
-// RUN: llvm-tblgen %s | grep ""
+// RUN: llvm-tblgen %s
+// XFAIL: vg_leak
+
+// CHECK:      def One {
+// CHECK-NEXT:   list<string> names = ["Jeffrey Sinclair"];
+// CHECK-NEXT:   string element = "Jeffrey Sinclair";
+// CHECK-NEXT:   list<string> rest = [];
+// CHECK-NEXT:   int null = 1;
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: def Three {
+// CHECK-NEXT:   list<string> names = ["Tom", "Dick", "Harry"];
+// CHECK-NEXT:   string element = "Tom";
+// CHECK-NEXT:   list<string> rest = ["Dick", "Harry"];
+// CHECK-NEXT:   int null = 0;
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
 
 class List<list<string> n> {
   list<string> names = n;
diff --git a/test/TableGen/list-element-bitref.td b/test/TableGen/list-element-bitref.td
index 7db3d31..4622f28 100644
--- a/test/TableGen/list-element-bitref.td
+++ b/test/TableGen/list-element-bitref.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class C<list<bits<8>> L> {
   bits<2> V0 = L[0]{1-0};
diff --git a/test/TableGen/math.td b/test/TableGen/math.td
index bde267a..59d16ae 100644
--- a/test/TableGen/math.td
+++ b/test/TableGen/math.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Int<int value> {
   int Value = value;
diff --git a/test/TableGen/nested-comment.td b/test/TableGen/nested-comment.td
index bf030e7..f8581ce 100644
--- a/test/TableGen/nested-comment.td
+++ b/test/TableGen/nested-comment.td
@@ -1,5 +1,4 @@
 // RUN: llvm-tblgen < %s
-// XFAIL: vg_leak
 
 /* foo
 
diff --git a/test/TableGen/pr8330.td b/test/TableGen/pr8330.td
index e672014..7779b63 100644
--- a/test/TableGen/pr8330.td
+++ b/test/TableGen/pr8330.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Or4<bits<8> Val> {
   bits<8> V = {Val{7}, Val{6}, Val{5}, Val{4}, Val{3}, 1, Val{1}, Val{0} };
diff --git a/test/TableGen/strconcat.td b/test/TableGen/strconcat.td
index 0173c49..dfb1a94 100644
--- a/test/TableGen/strconcat.td
+++ b/test/TableGen/strconcat.td
@@ -1,4 +1,6 @@
-// RUN: llvm-tblgen %s | grep fufoo
+// RUN: llvm-tblgen %s | FileCheck %s
+
+// CHECK: fufoo
 
 class Y<string S> {
   string T = !strconcat(S, "foo");
diff --git a/test/TableGen/subst.td b/test/TableGen/subst.td
index e265b44..34818af 100644
--- a/test/TableGen/subst.td
+++ b/test/TableGen/subst.td
@@ -1,9 +1,5 @@
-// RUN: llvm-tblgen %s | grep "Smith" | count 7
-// RUN: llvm-tblgen %s | grep "Johnson" | count 2
-// RUN: llvm-tblgen %s | grep "FIRST" | count 1
-// RUN: llvm-tblgen %s | grep "LAST" | count 1
-// RUN: llvm-tblgen %s | grep "TVAR" | count 2
-// RUN: llvm-tblgen %s | grep "Bogus" | count 1
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Honorific<string t> {
   string honorific = t;
@@ -27,3 +23,56 @@ def JohnSmith : AName<"FIRST LAST", TVAR>;
 def JaneSmith : AName<"Jane LAST", Ms>;
 def JohnSmithJones : AName<"FIRST LAST-Jones", Mr>;
 def JimmyJohnson : AName<"Jimmy Johnson", Mr>;
+
+// CHECK:      ------------- Classes -----------------
+// CHECK-NEXT: class AName<string AName:name = ?, Honorific AName:honorific = ?> {
+// CHECK-NEXT:   string name = !subst("FIRST", "John", !subst("LAST", "Smith", AName:name));
+// CHECK-NEXT:   Honorific honorific = !subst(TVAR, Mr, AName:honorific);
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: class Honorific<string Honorific:t = ?> {
+// CHECK-NEXT:   string honorific = Honorific:t;
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: class Name<string Name:n = ?, Honorific Name:t = ?> {
+// CHECK-NEXT:   string name = Name:n;
+// CHECK-NEXT:   Honorific honorific = Name:t;
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: ------------- Defs -----------------
+// CHECK-NEXT: def JaneSmith {
+// CHECK-NEXT:   string name = "Jane Smith";
+// CHECK-NEXT:   Honorific honorific = Ms;
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: def JimmyJohnson {
+// CHECK-NEXT:   string name = "Jimmy Johnson";
+// CHECK-NEXT:   Honorific honorific = Mr;
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: def JohnSmith {
+// CHECK-NEXT:   string name = "John Smith";
+// CHECK-NEXT:   Honorific honorific = Mr;
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: def JohnSmithJones {
+// CHECK-NEXT:   string name = "John Smith-Jones";
+// CHECK-NEXT:   Honorific honorific = Mr;
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: def Mr 
+// CHECK-NEXT:   string honorific = "Mr.";
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: def Mrs {
+// CHECK-NEXT:   string honorific = "Mrs.";
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: def Ms {
+// CHECK-NEXT:   string honorific = "Ms.";
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
+// CHECK-NEXT: def TVAR {
+// CHECK-NEXT:   string honorific = "Bogus";
+// CHECK-NEXT:   string NAME = ?;
+// CHECK-NEXT: }
diff --git a/test/TableGen/subst2.td b/test/TableGen/subst2.td
index ce73077..7c007f7 100644
--- a/test/TableGen/subst2.td
+++ b/test/TableGen/subst2.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 // CHECK: No subst
 // CHECK: No foo
 // CHECK: RECURSE foo
diff --git a/test/TableGen/usevalname.td b/test/TableGen/usevalname.td
index a80ba12..d85b98a 100644
--- a/test/TableGen/usevalname.td
+++ b/test/TableGen/usevalname.td
@@ -1,4 +1,5 @@
 // RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
 
 class Instr<list<dag> pat> {
   list<dag> Pattern = pat;
diff --git a/test/Transforms/ADCE/lit.local.cfg b/test/Transforms/ADCE/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/ADCE/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/ArgumentPromotion/lit.local.cfg b/test/Transforms/ArgumentPromotion/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/ArgumentPromotion/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/ArgumentPromotion/reserve-tbaa.ll b/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
new file mode 100644
index 0000000..4688a83
--- /dev/null
+++ b/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -argpromotion -S
+
+; PR17906
+; When we promote two arguments in a single function with different types,
+; before the fix, we used the same tag for the newly-created two loads.
+; This testing case makes sure that we correctly transfer the tbaa tags from the
+; original loads to the newly-created loads when promoting pointer arguments.
+
+@a = global i32* null, align 8
+@e = global i32** @a, align 8
+@g = global i32 0, align 4
+@c = global i64 0, align 8
+@d = global i8 0, align 1
+
+define internal fastcc void @fn(i32* nocapture readonly %p1, i64* nocapture readonly %p2) {
+entry:
+  %0 = load i64* %p2, align 8, !tbaa !1
+  %conv = trunc i64 %0 to i32
+  %1 = load i32* %p1, align 4, !tbaa !5
+  %conv1 = trunc i32 %1 to i8
+  store i8 %conv1, i8* @d, align 1, !tbaa !7
+  ret void
+}
+
+define i32 @main() {
+entry:
+; CHECK-LABEL: main
+; CHECK: store i32 1, i32* %{{.*}}, align 4, !tbaa ![[I32:[0-9]+]]
+; CHECK: %g.val = load i32* @g, align 4, !tbaa ![[I32]]
+; CHECK: %c.val = load i64* @c, align 8, !tbaa ![[LONG:[0-9]+]]
+  %0 = load i32*** @e, align 8, !tbaa !8
+  store i32* @g, i32** %0, align 8, !tbaa !8
+  %1 = load i32** @a, align 8, !tbaa !8
+  store i32 1, i32* %1, align 4, !tbaa !5
+  call fastcc void @fn(i32* @g, i64* @c)
+
+  ret i32 0
+}
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"long", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !6, metadata !6, i64 0}
+!6 = metadata !{metadata !"int", metadata !3, i64 0}
+!7 = metadata !{metadata !3, metadata !3, i64 0}
+!8 = metadata !{metadata !9, metadata !9, i64 0}
+!9 = metadata !{metadata !"any pointer", metadata !3, i64 0}
+; CHECK: ![[I32]] = metadata !{metadata ![[I32_TYPE:[0-9]+]], metadata ![[I32_TYPE]], i64 0}
+; CHECK: ![[I32_TYPE]] = metadata !{metadata !"int", metadata !{{.*}}, i64 0}
+; CHECK: ![[LONG]] = metadata !{metadata ![[LONG_TYPE:[0-9]+]], metadata ![[LONG_TYPE]], i64 0}
+; CHECK: ![[LONG_TYPE]] = metadata !{metadata !"long", metadata !{{.*}}, i64 0}
diff --git a/test/Transforms/BBVectorize/X86/pr15289.ll b/test/Transforms/BBVectorize/X86/pr15289.ll
index 07cc5d8..42bd0ff 100644
--- a/test/Transforms/BBVectorize/X86/pr15289.ll
+++ b/test/Transforms/BBVectorize/X86/pr15289.ll
@@ -45,7 +45,7 @@ entry:
   %13 = fmul double %3, %12
   %14 = fmul double %3, undef
   %15 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 0, i32 0
-  store double %13, double* %15, align 8, !tbaa !0
+  store double %13, double* %15, align 8
   %16 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 0, i32 1
   %17 = fmul double undef, %8
   %18 = fmul double %17, undef
@@ -54,7 +54,7 @@ entry:
   %21 = fmul double %3, %19
   %22 = fsub double -0.000000e+00, %21
   %23 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 1, i32 0
-  store double %22, double* %23, align 8, !tbaa !0
+  store double %22, double* %23, align 8
   %24 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 1, i32 1
   %25 = fmul double undef, 0x3FE42F601A8C6794
   %26 = fmul double undef, 2.000000e+00
@@ -62,7 +62,7 @@ entry:
   %28 = fmul double %6, undef
   %29 = fsub double undef, %28
   %30 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 2, i32 0
-  store double undef, double* %30, align 8, !tbaa !0
+  store double undef, double* %30, align 8
   %31 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 2, i32 1
   %32 = fmul double undef, %17
   %33 = fmul double undef, %17
@@ -71,7 +71,7 @@ entry:
   %36 = fsub double undef, %35
   %37 = fmul double %3, %34
   %38 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 3, i32 0
-  store double %37, double* %38, align 8, !tbaa !0
+  store double %37, double* %38, align 8
   %39 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 3, i32 1
   %40 = fmul double undef, %8
   %41 = fmul double undef, %40
@@ -79,20 +79,17 @@ entry:
   %43 = fsub double undef, %42
   %44 = fmul double %3, %43
   %45 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 4, i32 0
-  store double %13, double* %45, align 8, !tbaa !0
+  store double %13, double* %45, align 8
   %46 = getelementptr inbounds [5 x { double, double }]* %c2ten, i64 0, i64 4, i32 1
   %47 = fsub double -0.000000e+00, %14
-  store double %47, double* %16, align 8, !tbaa !0
-  store double undef, double* %24, align 8, !tbaa !0
-  store double -0.000000e+00, double* %31, align 8, !tbaa !0
-  store double undef, double* %39, align 8, !tbaa !0
-  store double undef, double* %46, align 8, !tbaa !0
+  store double %47, double* %16, align 8
+  store double undef, double* %24, align 8
+  store double -0.000000e+00, double* %31, align 8
+  store double undef, double* %39, align 8
+  store double undef, double* %46, align 8
   ret void
 }
 
 attributes #0 = { nounwind uwtable }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
-
-!0 = metadata !{metadata !"alias set 17: real(kind=8)", metadata !1}
-!1 = metadata !{metadata !1}
diff --git a/test/Transforms/BBVectorize/X86/wr-aliases.ll b/test/Transforms/BBVectorize/X86/wr-aliases.ll
new file mode 100644
index 0000000..34b1d4e
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/wr-aliases.ll
@@ -0,0 +1,144 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -bb-vectorize -S < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%class.QBezier.15 = type { double, double, double, double, double, double, double, double }
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+; Function Attrs: uwtable
+declare fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval nocapture readonly align 8) #1
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+define void @main_arrayctor.cont([10 x %class.QBezier.15]* %beziers, %class.QBezier.15* %agg.tmp.i, %class.QBezier.15* %agg.tmp55.i, %class.QBezier.15* %agg.tmp56.i) {
+newFuncRoot:
+  br label %arrayctor.cont
+
+arrayctor.cont.ret.exitStub:                      ; preds = %arrayctor.cont
+  ret void
+
+; CHECK-LABEL: @main_arrayctor.cont
+; CHECK: <2 x double>
+; CHECK: @_ZL12printQBezier7QBezier
+; CHECK: store double %mul8.i, double* %x3.i, align 16
+; CHECK: load double* %x3.i, align 16
+; CHECK: ret
+
+arrayctor.cont:                                   ; preds = %newFuncRoot
+  %ref.tmp.sroa.0.0.idx = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 0
+  store double 1.000000e+01, double* %ref.tmp.sroa.0.0.idx, align 16
+  %ref.tmp.sroa.2.0.idx1 = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 1
+  store double 2.000000e+01, double* %ref.tmp.sroa.2.0.idx1, align 8
+  %ref.tmp.sroa.3.0.idx2 = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 2
+  store double 3.000000e+01, double* %ref.tmp.sroa.3.0.idx2, align 16
+  %ref.tmp.sroa.4.0.idx3 = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 3
+  store double 4.000000e+01, double* %ref.tmp.sroa.4.0.idx3, align 8
+  %ref.tmp.sroa.5.0.idx4 = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 4
+  store double 5.000000e+01, double* %ref.tmp.sroa.5.0.idx4, align 16
+  %ref.tmp.sroa.6.0.idx5 = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 5
+  store double 6.000000e+01, double* %ref.tmp.sroa.6.0.idx5, align 8
+  %ref.tmp.sroa.7.0.idx6 = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 6
+  store double 7.000000e+01, double* %ref.tmp.sroa.7.0.idx6, align 16
+  %ref.tmp.sroa.8.0.idx7 = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 7
+  store double 8.000000e+01, double* %ref.tmp.sroa.8.0.idx7, align 8
+  %add.ptr = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 1
+  %v0 = bitcast %class.QBezier.15* %agg.tmp.i to i8*
+  call void @llvm.lifetime.start(i64 64, i8* %v0)
+  %v1 = bitcast %class.QBezier.15* %agg.tmp55.i to i8*
+  call void @llvm.lifetime.start(i64 64, i8* %v1)
+  %v2 = bitcast %class.QBezier.15* %agg.tmp56.i to i8*
+  call void @llvm.lifetime.start(i64 64, i8* %v2)
+  %v3 = bitcast [10 x %class.QBezier.15]* %beziers to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v0, i8* %v3, i64 64, i32 8, i1 false)
+  call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp.i)
+  %x2.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 2
+  %v4 = load double* %x2.i, align 16
+  %x3.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 4
+  %v5 = load double* %x3.i, align 16
+  %add.i = fadd double %v4, %v5
+  %mul.i = fmul double 5.000000e-01, %add.i
+  %x1.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 0
+  %v6 = load double* %x1.i, align 16
+  %add3.i = fadd double %v4, %v6
+  %mul4.i = fmul double 5.000000e-01, %add3.i
+  %x25.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 2
+  store double %mul4.i, double* %x25.i, align 16
+  %v7 = load double* %x3.i, align 16
+  %x4.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 6
+  %v8 = load double* %x4.i, align 16
+  %add7.i = fadd double %v7, %v8
+  %mul8.i = fmul double 5.000000e-01, %add7.i
+  store double %mul8.i, double* %x3.i, align 16
+  %v9 = load double* %x1.i, align 16
+  %x111.i = getelementptr inbounds %class.QBezier.15* %add.ptr, i64 0, i32 0
+  store double %v9, double* %x111.i, align 16
+  %v10 = load double* %x25.i, align 16
+  %add15.i = fadd double %mul.i, %v10
+  %mul16.i = fmul double 5.000000e-01, %add15.i
+  %x317.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 4
+  store double %mul16.i, double* %x317.i, align 16
+  %v11 = load double* %x3.i, align 16
+  %add19.i = fadd double %mul.i, %v11
+  %mul20.i = fmul double 5.000000e-01, %add19.i
+  store double %mul20.i, double* %x2.i, align 16
+  %v12 = load double* %x317.i, align 16
+  %add24.i = fadd double %v12, %mul20.i
+  %mul25.i = fmul double 5.000000e-01, %add24.i
+  store double %mul25.i, double* %x1.i, align 16
+  %x427.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 6
+  store double %mul25.i, double* %x427.i, align 16
+  %y2.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 3
+  %v13 = load double* %y2.i, align 8
+  %y3.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 5
+  %v14 = load double* %y3.i, align 8
+  %add28.i = fadd double %v13, %v14
+  %div.i = fmul double 5.000000e-01, %add28.i
+  %y1.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 1
+  %v15 = load double* %y1.i, align 8
+  %add30.i = fadd double %v13, %v15
+  %mul31.i = fmul double 5.000000e-01, %add30.i
+  %y232.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 3
+  store double %mul31.i, double* %y232.i, align 8
+  %v16 = load double* %y3.i, align 8
+  %y4.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 7
+  %v17 = load double* %y4.i, align 8
+  %add34.i = fadd double %v16, %v17
+  %mul35.i = fmul double 5.000000e-01, %add34.i
+  store double %mul35.i, double* %y3.i, align 8
+  %v18 = load double* %y1.i, align 8
+  %y138.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 1
+  store double %v18, double* %y138.i, align 8
+  %v19 = load double* %y232.i, align 8
+  %add42.i = fadd double %div.i, %v19
+  %mul43.i = fmul double 5.000000e-01, %add42.i
+  %y344.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 5
+  store double %mul43.i, double* %y344.i, align 8
+  %v20 = load double* %y3.i, align 8
+  %add46.i = fadd double %div.i, %v20
+  %mul47.i = fmul double 5.000000e-01, %add46.i
+  store double %mul47.i, double* %y2.i, align 8
+  %v21 = load double* %y344.i, align 8
+  %add51.i = fadd double %v21, %mul47.i
+  %mul52.i = fmul double 5.000000e-01, %add51.i
+  store double %mul52.i, double* %y1.i, align 8
+  %y454.i = getelementptr inbounds [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 7
+  store double %mul52.i, double* %y454.i, align 8
+  %v22 = bitcast %class.QBezier.15* %add.ptr to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v1, i8* %v22, i64 64, i32 8, i1 false)
+  call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp55.i)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v2, i8* %v3, i64 64, i32 8, i1 false)
+  call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp56.i)
+  call void @llvm.lifetime.end(i64 64, i8* %v0)
+  call void @llvm.lifetime.end(i64 64, i8* %v1)
+  call void @llvm.lifetime.end(i64 64, i8* %v2)
+  br label %arrayctor.cont.ret.exitStub
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/BBVectorize/lit.local.cfg b/test/Transforms/BBVectorize/lit.local.cfg
index a8ad0f1..ba763cf 100644
--- a/test/Transforms/BBVectorize/lit.local.cfg
+++ b/test/Transforms/BBVectorize/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Transforms/BBVectorize/xcore/no-vector-registers.ll b/test/Transforms/BBVectorize/xcore/no-vector-registers.ll
new file mode 100644
index 0000000..9ebdb73
--- /dev/null
+++ b/test/Transforms/BBVectorize/xcore/no-vector-registers.ll
@@ -0,0 +1,18 @@
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S -mtriple=xcore | FileCheck %s
+
+target datalayout = "e-p:32:32:32-a0:0:32-n32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f16:16:32-f32:32:32-f64:32:32"
+target triple = "xcore"
+
+; Basic depth-3 chain
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+; CHECK-LABEL: @test1(
+; CHECK-NOT: <2 x double>
+  %X1 = fsub double %A1, %B1
+  %X2 = fsub double %A2, %B2
+  %Y1 = fmul double %X1, %A1
+  %Y2 = fmul double %X2, %A2
+  %Z1 = fadd double %Y1, %B1
+  %Z2 = fadd double %Y2, %B2
+  %R  = fmul double %Z1, %Z2
+  ret double %R
+}
diff --git a/test/Transforms/BlockPlacement/basictest.ll b/test/Transforms/BlockPlacement/basictest.ll
deleted file mode 100644
index 47b5079..0000000
--- a/test/Transforms/BlockPlacement/basictest.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; RUN: opt < %s -block-placement -disable-output -print-function 2> /dev/null
-
-define i32 @test() {
-        br i1 true, label %X, label %Y
-
-A:              ; preds = %Y, %X
-        ret i32 0
-
-X:              ; preds = %0
-        br label %A
-
-Y:              ; preds = %0
-        br label %A
-}
-
diff --git a/test/Transforms/BlockPlacement/lit.local.cfg b/test/Transforms/BlockPlacement/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/BlockPlacement/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/CodeExtractor/lit.local.cfg b/test/Transforms/CodeExtractor/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/CodeExtractor/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/CodeGenPrepare/lit.local.cfg b/test/Transforms/CodeGenPrepare/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Transforms/CodeGenPrepare/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Transforms/ConstProp/lit.local.cfg b/test/Transforms/ConstProp/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/ConstProp/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/ConstProp/loads.ll b/test/Transforms/ConstProp/loads.ll
index 795dc07..d05db47 100644
--- a/test/Transforms/ConstProp/loads.ll
+++ b/test/Transforms/ConstProp/loads.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -default-data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=LE
-; RUN: opt < %s -default-data-layout="E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=BE
+; RUN: opt < %s -default-data-layout="e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -default-data-layout="E-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=BE
 
 ; {{ 0xDEADBEEF, 0xBA }, 0xCAFEBABE}
 @g1 = constant {{i32,i8},i32} {{i32,i8} { i32 -559038737, i8 186 }, i32 -889275714 }
@@ -155,7 +155,7 @@ entry:
 @test12g = private constant [6 x i8] c"a\00b\00\00\00"
 
 define i16 @test12() {
-  %a = load i16* getelementptr inbounds ([3 x i16]* bitcast ([6 x i8]* @test12g to [3 x i16]*), i32 0, i64 1) 
+  %a = load i16* getelementptr inbounds ([3 x i16]* bitcast ([6 x i8]* @test12g to [3 x i16]*), i32 0, i64 1)
   ret i16 %a
 
 ; 0x0062
@@ -194,6 +194,20 @@ entry:
 ; BE: ret i64 1
 }
 
+; Check with address space pointers
+@g6_as1 = constant [2 x i8 addrspace(1)*] [i8 addrspace(1)* inttoptr (i16 1 to i8 addrspace(1)*), i8 addrspace(1)* inttoptr (i16 2 to i8 addrspace(1)*)]
+define i16 @test14_as1() nounwind {
+entry:
+  %tmp = load i16* bitcast ([2 x i8 addrspace(1)*]* @g6_as1 to i16*)
+  ret i16 %tmp
+
+; LE: @test14_as1
+; LE: ret i16 1
+
+; BE: @test14_as1
+; BE: ret i16 1
+}
+
 define i64 @test15() nounwind {
 entry:
   %tmp = load i64* bitcast (i8** getelementptr inbounds ([2 x i8*]* @g6, i32 0, i64 1) to i64*)
diff --git a/test/Transforms/ConstantMerge/align.ll b/test/Transforms/ConstantMerge/align.ll
new file mode 100644
index 0000000..c1cbcb3
--- /dev/null
+++ b/test/Transforms/ConstantMerge/align.ll
@@ -0,0 +1,28 @@
+; RUN: opt -constmerge -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+
+; Test that with a TD we do merge and mark the alignment as 4
+@T1A = internal unnamed_addr constant i32 1
+@T1B = internal unnamed_addr constant i32 1, align 2
+; CHECK: @T1B = internal unnamed_addr constant i32 1, align 4
+
+define void @test1(i32** %P1, i32** %P2) {
+  store i32* @T1A, i32** %P1
+  store i32* @T1B, i32** %P2
+  ret void
+}
+
+
+; Test that even with a TD we set the alignment to the maximum if both constants
+; have explicit alignments.
+@T2A = internal unnamed_addr constant i32 2, align 1
+@T2B = internal unnamed_addr constant i32 2, align 2
+; CHECK: @T2B = internal unnamed_addr constant i32 2, align 2
+
+define void @test2(i32** %P1, i32** %P2) {
+  store i32* @T2A, i32** %P1
+  store i32* @T2B, i32** %P2
+  ret void
+}
diff --git a/test/Transforms/ConstantMerge/lit.local.cfg b/test/Transforms/ConstantMerge/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/ConstantMerge/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/CorrelatedValuePropagation/lit.local.cfg b/test/Transforms/CorrelatedValuePropagation/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Transforms/CorrelatedValuePropagation/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
index 4cb742d..26982db 100644
--- a/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
+++ b/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll
@@ -44,11 +44,12 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 ; CHECK: attributes [[NUW]] = { nounwind }
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!30}
 !0 = metadata !{i32 524545, metadata !1, metadata !"name", metadata !2, i32 8, metadata !6} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 524334, metadata !28, metadata !2, metadata !"vfs_addname", metadata !"vfs_addname", metadata !"vfs_addname", i32 12, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 524329, metadata !28} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 524305, metadata !28, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !29, metadata !29, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !28, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 524309, metadata !28, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !6, metadata !9, metadata !9, metadata !9}
 !6 = metadata !{i32 524303, metadata !28, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ]
 !7 = metadata !{i32 524326, metadata !28, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ]
@@ -61,7 +62,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !14 = metadata !{i32 524299, metadata !28, metadata !1, i32 12, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{i32 524545, metadata !16, metadata !"name", metadata !2, i32 17, metadata !6} ; [ DW_TAG_arg_variable ]
 !16 = metadata !{i32 524334, metadata !28, metadata !2, metadata !"add_name_internal", metadata !"add_name_internal", metadata !"add_name_internal", i32 22, metadata !17, i1 true, i1 true, i32 0, i32 0, null, i1 false, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 524309, metadata !28, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!17 = metadata !{i32 524309, metadata !28, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !18 = metadata !{metadata !6, metadata !6, metadata !9, metadata !9, metadata !19, metadata !9}
 !19 = metadata !{i32 524324, metadata !28, metadata !2, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
 !20 = metadata !{i32 524545, metadata !16, metadata !"len", metadata !2, i32 18, metadata !9} ; [ DW_TAG_arg_variable ]
@@ -74,3 +75,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !27 = metadata !{i32 26, i32 0, metadata !25, null}
 !28 = metadata !{metadata !"tail.c", metadata !"/Users/echeng/LLVM/radars/r7927803/"}
 !29 = metadata !{i32 0}
+!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/DeadArgElim/dbginfo.ll b/test/Transforms/DeadArgElim/dbginfo.ll
index 21de114..7bdcbf5 100644
--- a/test/Transforms/DeadArgElim/dbginfo.ll
+++ b/test/Transforms/DeadArgElim/dbginfo.ll
@@ -35,13 +35,14 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21}
 
 !0 = metadata !{i32 786449, metadata !20, i32 4, metadata !"clang version 3.2 (trunk 165305)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1,  metadata !1, metadata !""} ; [ DW_TAG_compile_unit ] [/home/samsonov/tmp/clang-di/test.cc] [DW_LANG_C_plus_plus]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !8, metadata !9}
 !5 = metadata !{i32 786478, metadata !20, metadata !6, metadata !"run", metadata !"run", metadata !"", i32 8, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3runv, null, null, metadata !1, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [run]
 !6 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !1, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !1, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{i32 786478, metadata !20, metadata !6, metadata !"dead_vararg", metadata !"dead_vararg", metadata !"", i32 5, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (...)* @_ZN12_GLOBAL__N_111dead_varargEz, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [dead_vararg]
 
 ; CHECK: metadata !"dead_vararg"{{.*}}void ()* @_ZN12_GLOBAL__N_111dead_varargEz
@@ -61,3 +62,4 @@ entry:
 !18 = metadata !{i32 786443, metadata !20, metadata !8, i32 5, i32 23, i32 1} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc]
 !19 = metadata !{i32 5, i32 30, metadata !18, null}
 !20 = metadata !{metadata !"test.cc", metadata !"/home/samsonov/tmp/clang-di"}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/DeadArgElim/linkage.ll b/test/Transforms/DeadArgElim/linkage.ll
new file mode 100644
index 0000000..f475484
--- /dev/null
+++ b/test/Transforms/DeadArgElim/linkage.ll
@@ -0,0 +1,21 @@
+; RUN: opt < %s -deadargelim -S | FileCheck %s
+
+; rdar://11546243
+%struct.A = type { i8 }
+
+define available_externally void @_Z17externallyDefinedP1A(%struct.A* %a) {
+entry:
+  call void @_Z3foov()
+  ret void
+}
+
+declare void @_Z3foov()
+
+define void @_Z4testP1A(%struct.A* %a) {
+; CHECK: @_Z4testP1A
+; CHECK: @_Z17externallyDefinedP1A(%struct.A* %a)
+
+entry:
+  call void @_Z17externallyDefinedP1A(%struct.A* %a)
+  ret void
+}
diff --git a/test/Transforms/DeadArgElim/lit.local.cfg b/test/Transforms/DeadArgElim/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/DeadArgElim/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/DeadStoreElimination/inst-limits.ll b/test/Transforms/DeadStoreElimination/inst-limits.ll
new file mode 100644
index 0000000..9df8801
--- /dev/null
+++ b/test/Transforms/DeadStoreElimination/inst-limits.ll
@@ -0,0 +1,261 @@
+; RUN: opt -S -dse < %s | FileCheck %s
+
+; If there are two stores to the same location, DSE should be able to remove
+; the first store if the two stores are separated by no more than 98
+; instructions. The existence of debug intrinsics between the stores should
+; not affect this instruction limit.
+
+@x = global i32 0, align 4
+
+; Function Attrs: nounwind
+define i32 @test_within_limit() {
+entry:
+  ; The first store; later there is a second store to the same location,
+  ; so this store should be optimized away by DSE.
+  ; CHECK-NOT: store i32 1, i32* @x, align 4
+  store i32 1, i32* @x, align 4
+
+  ; Insert 98 dummy instructions between the two stores
+  %0 = bitcast i32 0 to i32
+  %1 = bitcast i32 0 to i32
+  %2 = bitcast i32 0 to i32
+  %3 = bitcast i32 0 to i32
+  %4 = bitcast i32 0 to i32
+  %5 = bitcast i32 0 to i32
+  %6 = bitcast i32 0 to i32
+  %7 = bitcast i32 0 to i32
+  %8 = bitcast i32 0 to i32
+  %9 = bitcast i32 0 to i32
+  %10 = bitcast i32 0 to i32
+  %11 = bitcast i32 0 to i32
+  %12 = bitcast i32 0 to i32
+  %13 = bitcast i32 0 to i32
+  %14 = bitcast i32 0 to i32
+  %15 = bitcast i32 0 to i32
+  %16 = bitcast i32 0 to i32
+  %17 = bitcast i32 0 to i32
+  %18 = bitcast i32 0 to i32
+  %19 = bitcast i32 0 to i32
+  %20 = bitcast i32 0 to i32
+  %21 = bitcast i32 0 to i32
+  %22 = bitcast i32 0 to i32
+  %23 = bitcast i32 0 to i32
+  %24 = bitcast i32 0 to i32
+  %25 = bitcast i32 0 to i32
+  %26 = bitcast i32 0 to i32
+  %27 = bitcast i32 0 to i32
+  %28 = bitcast i32 0 to i32
+  %29 = bitcast i32 0 to i32
+  %30 = bitcast i32 0 to i32
+  %31 = bitcast i32 0 to i32
+  %32 = bitcast i32 0 to i32
+  %33 = bitcast i32 0 to i32
+  %34 = bitcast i32 0 to i32
+  %35 = bitcast i32 0 to i32
+  %36 = bitcast i32 0 to i32
+  %37 = bitcast i32 0 to i32
+  %38 = bitcast i32 0 to i32
+  %39 = bitcast i32 0 to i32
+  %40 = bitcast i32 0 to i32
+  %41 = bitcast i32 0 to i32
+  %42 = bitcast i32 0 to i32
+  %43 = bitcast i32 0 to i32
+  %44 = bitcast i32 0 to i32
+  %45 = bitcast i32 0 to i32
+  %46 = bitcast i32 0 to i32
+  %47 = bitcast i32 0 to i32
+  %48 = bitcast i32 0 to i32
+  %49 = bitcast i32 0 to i32
+  %50 = bitcast i32 0 to i32
+  %51 = bitcast i32 0 to i32
+  %52 = bitcast i32 0 to i32
+  %53 = bitcast i32 0 to i32
+  %54 = bitcast i32 0 to i32
+  %55 = bitcast i32 0 to i32
+  %56 = bitcast i32 0 to i32
+  %57 = bitcast i32 0 to i32
+  %58 = bitcast i32 0 to i32
+  %59 = bitcast i32 0 to i32
+  %60 = bitcast i32 0 to i32
+  %61 = bitcast i32 0 to i32
+  %62 = bitcast i32 0 to i32
+  %63 = bitcast i32 0 to i32
+  %64 = bitcast i32 0 to i32
+  %65 = bitcast i32 0 to i32
+  %66 = bitcast i32 0 to i32
+  %67 = bitcast i32 0 to i32
+  %68 = bitcast i32 0 to i32
+  %69 = bitcast i32 0 to i32
+  %70 = bitcast i32 0 to i32
+  %71 = bitcast i32 0 to i32
+  %72 = bitcast i32 0 to i32
+  %73 = bitcast i32 0 to i32
+  %74 = bitcast i32 0 to i32
+  %75 = bitcast i32 0 to i32
+  %76 = bitcast i32 0 to i32
+  %77 = bitcast i32 0 to i32
+  %78 = bitcast i32 0 to i32
+  %79 = bitcast i32 0 to i32
+  %80 = bitcast i32 0 to i32
+  %81 = bitcast i32 0 to i32
+  %82 = bitcast i32 0 to i32
+  %83 = bitcast i32 0 to i32
+  %84 = bitcast i32 0 to i32
+  %85 = bitcast i32 0 to i32
+  %86 = bitcast i32 0 to i32
+  %87 = bitcast i32 0 to i32
+  %88 = bitcast i32 0 to i32
+  %89 = bitcast i32 0 to i32
+  %90 = bitcast i32 0 to i32
+  %91 = bitcast i32 0 to i32
+  %92 = bitcast i32 0 to i32
+  %93 = bitcast i32 0 to i32
+  %94 = bitcast i32 0 to i32
+  %95 = bitcast i32 0 to i32
+  %96 = bitcast i32 0 to i32
+  %97 = bitcast i32 0 to i32
+
+  ; Insert a meaningless dbg.value intrinsic; it should have no
+  ; effect on the working of DSE in any way.
+  call void @llvm.dbg.value(metadata !12, i64 0, metadata !10)
+
+  ; CHECK:  store i32 -1, i32* @x, align 4
+  store i32 -1, i32* @x, align 4
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+define i32 @test_outside_limit() {
+entry:
+  ; The first store; later there is a second store to the same location
+  ; CHECK: store i32 1, i32* @x, align 4
+  store i32 1, i32* @x, align 4
+
+  ; Insert 99 dummy instructions between the two stores; this is
+  ; one too many instruction for the DSE to take place.
+  %0 = bitcast i32 0 to i32
+  %1 = bitcast i32 0 to i32
+  %2 = bitcast i32 0 to i32
+  %3 = bitcast i32 0 to i32
+  %4 = bitcast i32 0 to i32
+  %5 = bitcast i32 0 to i32
+  %6 = bitcast i32 0 to i32
+  %7 = bitcast i32 0 to i32
+  %8 = bitcast i32 0 to i32
+  %9 = bitcast i32 0 to i32
+  %10 = bitcast i32 0 to i32
+  %11 = bitcast i32 0 to i32
+  %12 = bitcast i32 0 to i32
+  %13 = bitcast i32 0 to i32
+  %14 = bitcast i32 0 to i32
+  %15 = bitcast i32 0 to i32
+  %16 = bitcast i32 0 to i32
+  %17 = bitcast i32 0 to i32
+  %18 = bitcast i32 0 to i32
+  %19 = bitcast i32 0 to i32
+  %20 = bitcast i32 0 to i32
+  %21 = bitcast i32 0 to i32
+  %22 = bitcast i32 0 to i32
+  %23 = bitcast i32 0 to i32
+  %24 = bitcast i32 0 to i32
+  %25 = bitcast i32 0 to i32
+  %26 = bitcast i32 0 to i32
+  %27 = bitcast i32 0 to i32
+  %28 = bitcast i32 0 to i32
+  %29 = bitcast i32 0 to i32
+  %30 = bitcast i32 0 to i32
+  %31 = bitcast i32 0 to i32
+  %32 = bitcast i32 0 to i32
+  %33 = bitcast i32 0 to i32
+  %34 = bitcast i32 0 to i32
+  %35 = bitcast i32 0 to i32
+  %36 = bitcast i32 0 to i32
+  %37 = bitcast i32 0 to i32
+  %38 = bitcast i32 0 to i32
+  %39 = bitcast i32 0 to i32
+  %40 = bitcast i32 0 to i32
+  %41 = bitcast i32 0 to i32
+  %42 = bitcast i32 0 to i32
+  %43 = bitcast i32 0 to i32
+  %44 = bitcast i32 0 to i32
+  %45 = bitcast i32 0 to i32
+  %46 = bitcast i32 0 to i32
+  %47 = bitcast i32 0 to i32
+  %48 = bitcast i32 0 to i32
+  %49 = bitcast i32 0 to i32
+  %50 = bitcast i32 0 to i32
+  %51 = bitcast i32 0 to i32
+  %52 = bitcast i32 0 to i32
+  %53 = bitcast i32 0 to i32
+  %54 = bitcast i32 0 to i32
+  %55 = bitcast i32 0 to i32
+  %56 = bitcast i32 0 to i32
+  %57 = bitcast i32 0 to i32
+  %58 = bitcast i32 0 to i32
+  %59 = bitcast i32 0 to i32
+  %60 = bitcast i32 0 to i32
+  %61 = bitcast i32 0 to i32
+  %62 = bitcast i32 0 to i32
+  %63 = bitcast i32 0 to i32
+  %64 = bitcast i32 0 to i32
+  %65 = bitcast i32 0 to i32
+  %66 = bitcast i32 0 to i32
+  %67 = bitcast i32 0 to i32
+  %68 = bitcast i32 0 to i32
+  %69 = bitcast i32 0 to i32
+  %70 = bitcast i32 0 to i32
+  %71 = bitcast i32 0 to i32
+  %72 = bitcast i32 0 to i32
+  %73 = bitcast i32 0 to i32
+  %74 = bitcast i32 0 to i32
+  %75 = bitcast i32 0 to i32
+  %76 = bitcast i32 0 to i32
+  %77 = bitcast i32 0 to i32
+  %78 = bitcast i32 0 to i32
+  %79 = bitcast i32 0 to i32
+  %80 = bitcast i32 0 to i32
+  %81 = bitcast i32 0 to i32
+  %82 = bitcast i32 0 to i32
+  %83 = bitcast i32 0 to i32
+  %84 = bitcast i32 0 to i32
+  %85 = bitcast i32 0 to i32
+  %86 = bitcast i32 0 to i32
+  %87 = bitcast i32 0 to i32
+  %88 = bitcast i32 0 to i32
+  %89 = bitcast i32 0 to i32
+  %90 = bitcast i32 0 to i32
+  %91 = bitcast i32 0 to i32
+  %92 = bitcast i32 0 to i32
+  %93 = bitcast i32 0 to i32
+  %94 = bitcast i32 0 to i32
+  %95 = bitcast i32 0 to i32
+  %96 = bitcast i32 0 to i32
+  %97 = bitcast i32 0 to i32
+  %98 = bitcast i32 0 to i32
+
+  ; CHECK:  store i32 -1, i32* @x, align 4
+  store i32 -1, i32* @x, align 4
+  ret i32 0
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11, !13}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/tmp/test.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"test.c", metadata !"/home/tmp"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"test_within_limit", metadata !"test_within_limit", metadata !"", i32 3, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @test_within_limit, null, null, metadata !2, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [test]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/home/tmp/test.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{metadata !10}
+!10 = metadata !{i32 786484, i32 0, null, metadata !"x", metadata !"x", metadata !"", metadata !5, i32 1, metadata !8, i32 0, i32 1, i32* @x, null} ; [ DW_TAG_variable ] [x] [line 1] [def]
+!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!12 = metadata !{i32* undef}
+
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/DeadStoreElimination/lit.local.cfg b/test/Transforms/DeadStoreElimination/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/DeadStoreElimination/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/DebugIR/lit.local.cfg b/test/Transforms/DebugIR/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Transforms/DebugIR/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Transforms/DebugIR/simple-addrspace.ll b/test/Transforms/DebugIR/simple-addrspace.ll
new file mode 100644
index 0000000..6bea9b2
--- /dev/null
+++ b/test/Transforms/DebugIR/simple-addrspace.ll
@@ -0,0 +1,13 @@
+; RUN: opt -debug-ir -S %s -o - | FileCheck %s
+
+target datalayout = "e-p:64:64:64-p1:16:16:16"
+
+define void @foo(i32 addrspace(1)*) nounwind {
+  ret void
+}
+
+; Make sure the pointer size is 16
+
+; CHECK: metadata !"i32 addrspace(1)*", i32 0, i64 16, i64 2, i64 0, i32 0
+
+
diff --git a/test/Transforms/EarlyCSE/lit.local.cfg b/test/Transforms/EarlyCSE/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Transforms/EarlyCSE/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Transforms/FunctionAttrs/annotate-1.ll b/test/Transforms/FunctionAttrs/annotate-1.ll
index adb7bce..9fba7a9 100644
--- a/test/Transforms/FunctionAttrs/annotate-1.ll
+++ b/test/Transforms/FunctionAttrs/annotate-1.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -functionattrs -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -functionattrs -S | FileCheck -check-prefix=POSIX %s
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -functionattrs -S | FileCheck -check-prefix=CHECK-POSIX %s
 
 declare i8* @fopen(i8*, i8*)
 ; CHECK: declare noalias i8* @fopen(i8* nocapture readonly, i8* nocapture readonly) [[G0:#[0-9]]] 
diff --git a/test/Transforms/FunctionAttrs/lit.local.cfg b/test/Transforms/FunctionAttrs/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/FunctionAttrs/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/GCOVProfiling/linkagename.ll b/test/Transforms/GCOVProfiling/linkagename.ll
index 9453e1e..ed3a5bd 100644
--- a/test/Transforms/GCOVProfiling/linkagename.ll
+++ b/test/Transforms/GCOVProfiling/linkagename.ll
@@ -12,6 +12,7 @@ entry:
 }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10}
 !llvm.gcov = !{!9}
 
 !0 = metadata !{i32 786449, metadata !2, i32 4, metadata !"clang version 3.3 (trunk 177323)", i1 false, metadata !"", i32 0, metadata !3, metadata !3, metadata !4, metadata !3,  metadata !3, metadata !""} ; [ DW_TAG_compile_unit ] [/home/nlewycky/hello.cc] [DW_LANG_C_plus_plus]
@@ -20,8 +21,9 @@ entry:
 !3 = metadata !{i32 0}
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !1, metadata !1, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3foov, null, null, metadata !3, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
-!6 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{null}
 !8 = metadata !{i32 1, i32 0, metadata !5, null}
 
 
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/GCOVProfiling/lit.local.cfg b/test/Transforms/GCOVProfiling/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/GCOVProfiling/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/GCOVProfiling/version.ll b/test/Transforms/GCOVProfiling/version.ll
index a90290f..2f1bd70 100644
--- a/test/Transforms/GCOVProfiling/version.ll
+++ b/test/Transforms/GCOVProfiling/version.ll
@@ -16,6 +16,7 @@ define void @test() {
 
 !llvm.gcov = !{!9}
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12}
 
 !0 = metadata !{i32 786449, metadata !11, i32 4, metadata !"clang version 3.3 (trunk 176994)", i1 false, metadata !"", i32 0, metadata !3, metadata !3, metadata !4, metadata !3, null, metadata !""} ; [ DW_TAG_compile_unit ] [./version] [DW_LANG_C_plus_plus]
 !2 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
@@ -23,8 +24,9 @@ define void @test() {
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786478, metadata !10, metadata !6, metadata !"test", metadata !"test", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @test, null, null, metadata !3, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [test]
 !6 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !3, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !3, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{i32 1, i32 0, metadata !5, null}
 ;; !9 is added through the echo line at the top.
 !10 = metadata !{metadata !"<stdin>", metadata !"."}
 !11 = metadata !{metadata !"version", metadata !"/usr/local/google/home/nlewycky"}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll b/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll
index a1cc008..5a15f0e 100644
--- a/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll
+++ b/test/Transforms/GVN/2007-07-26-InterlockingLoops.ll
@@ -2,7 +2,7 @@
 
 @last = external global [65 x i32*]
 
-define i32 @NextRootMove(i32 %wtm) {
+define i32 @NextRootMove(i32 %wtm, i32 %x, i32 %y, i32 %z) {
 entry:
         %A = alloca i32*
 	%tmp17618 = load i32** getelementptr ([65 x i32*]* @last, i32 0, i32 1), align 4
@@ -15,12 +15,14 @@ entry:
 	br label %cond_true116
 
 cond_true116:
-	br i1 false, label %cond_true128, label %cond_true145
+   %cmp = icmp eq i32 %x, %y
+	br i1 %cmp, label %cond_true128, label %cond_true145
 
 cond_true128:
 	%tmp17625 = load i32** getelementptr ([65 x i32*]* @last, i32 0, i32 1), align 4
         store i32* %tmp17625, i32** %A
-	br i1 false, label %bb98.backedge, label %return.loopexit
+   %cmp1 = icmp eq i32 %x, %z
+	br i1 %cmp1 , label %bb98.backedge, label %return.loopexit
 
 bb98.backedge:
 	br label %cond_true116
diff --git a/test/Transforms/GVN/2008-07-02-Unreachable.ll b/test/Transforms/GVN/2008-07-02-Unreachable.ll
index 4f07868..ce83fa4 100644
--- a/test/Transforms/GVN/2008-07-02-Unreachable.ll
+++ b/test/Transforms/GVN/2008-07-02-Unreachable.ll
@@ -3,10 +3,11 @@
 
 @g_3 = external global i8		; <i8*> [#uses=2]
 
-define i8 @func_1() nounwind  {
+define i8 @func_1(i32 %x, i32 %y) nounwind  {
 entry:
   %A = alloca i8
-	br i1 false, label %ifelse, label %ifthen
+    %cmp = icmp eq i32 %x, %y
+	br i1 %cmp, label %ifelse, label %ifthen
 
 ifthen:		; preds = %entry
 	br label %ifend
@@ -14,9 +15,6 @@ ifthen:		; preds = %entry
 ifelse:		; preds = %entry
 	%tmp3 = load i8* @g_3		; <i8> [#uses=0]
         store i8 %tmp3, i8* %A
-	br label %forcond.thread
-
-forcond.thread:		; preds = %ifelse
 	br label %afterfor
 
 forcond:		; preds = %forinc
diff --git a/test/Transforms/GVN/2011-06-01-NonLocalMemdepMiscompile.ll b/test/Transforms/GVN/2011-06-01-NonLocalMemdepMiscompile.ll
index 4613bc4..298f274 100644
--- a/test/Transforms/GVN/2011-06-01-NonLocalMemdepMiscompile.ll
+++ b/test/Transforms/GVN/2011-06-01-NonLocalMemdepMiscompile.ll
@@ -19,10 +19,10 @@ bb1:
   br i1 undef, label %bb3, label %bb15
 
 ; CHECK: bb1:
-; CHECK: %tmp16 = phi i8* [ getelementptr (i8* null, i64 undef), %bb10 ], [ null, %bb ]
+; CHECK: [[TMP:%.*]] = phi i8* [ getelementptr (i8* null, i64 undef), %bb10 ], [ null, %bb ]
 
 ; CHECK: bb1.bb15_crit_edge:
-; CHECK: %tmp17.pre = load i8* %tmp16, align 1
+; CHECK: %tmp17.pre = load i8* [[TMP]], align 1
 
 bb3:
   call void @isalnum()
diff --git a/test/Transforms/GVN/cond_br.ll b/test/Transforms/GVN/cond_br.ll
new file mode 100644
index 0000000..918e7d4
--- /dev/null
+++ b/test/Transforms/GVN/cond_br.ll
@@ -0,0 +1,55 @@
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+@y = external global i32
+@z = external global i32
+
+; Function Attrs: nounwind ssp uwtable
+define void @foo(i32 %x) {
+; CHECK: @foo(i32 %x)
+; CHECK: %.pre = load i32* @y
+; CHECK: call void @bar(i32 %.pre)
+
+  %t = sub i32 %x, %x
+  %.pre = load i32* @y, align 4
+  %cmp = icmp sgt i32 %t, 2
+  br i1 %cmp, label %if.then, label %entry.if.end_crit_edge
+
+entry.if.end_crit_edge:                           ; preds = %entry
+  br label %if.end
+
+if.then:                                          ; preds = %entry
+  %add = add nsw i32 %x, 3
+  store i32 %add, i32* @y, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry.if.end_crit_edge, %if.then
+  %1 = phi i32 [ %.pre, %entry.if.end_crit_edge ], [ %add, %if.then ]
+  tail call void @bar(i32 %1)
+  ret void
+}
+
+define void @foo2(i32 %x) {
+; CHECK: @foo2(i32 %x)
+; CHECK: %.pre = load i32* @y
+; CHECK: tail call void @bar(i32 %.pre)
+entry:
+  %t = sub i32 %x, %x
+  %.pre = load i32* @y, align 4
+  %cmp = icmp sgt i32 %t, 2
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %add = add nsw i32 %x, 3
+  store i32 %add, i32* @y, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  store i32 1, i32* @z, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %0 = phi i32 [ %.pre, %if.else ], [ %add, %if.then ]
+  tail call void @bar(i32 %0)
+  ret void
+}
+
+declare void @bar(i32)
diff --git a/test/Transforms/GVN/cond_br2.ll b/test/Transforms/GVN/cond_br2.ll
new file mode 100644
index 0000000..27e6f75
--- /dev/null
+++ b/test/Transforms/GVN/cond_br2.ll
@@ -0,0 +1,140 @@
+; RUN: opt -basicaa -gvn -S < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+%"class.llvm::SmallVector" = type { %"class.llvm::SmallVectorImpl", [1 x %"union.llvm::SmallVectorBase::U"] }
+%"class.llvm::SmallVectorImpl" = type { %"class.llvm::SmallVectorTemplateBase" }
+%"class.llvm::SmallVectorTemplateBase" = type { %"class.llvm::SmallVectorTemplateCommon" }
+%"class.llvm::SmallVectorTemplateCommon" = type { %"class.llvm::SmallVectorBase" }
+%"class.llvm::SmallVectorBase" = type { i8*, i8*, i8*, %"union.llvm::SmallVectorBase::U" }
+%"union.llvm::SmallVectorBase::U" = type { x86_fp80 }
+
+; Function Attrs: ssp uwtable
+define void @_Z4testv() #0 {
+; CHECK: @_Z4testv()
+; CHECK: invoke.cont:
+; CHECK: br i1 true, label %new.notnull.i11, label %if.end.i14
+; CHECK: Retry.i10:
+
+entry:
+  %sv = alloca %"class.llvm::SmallVector", align 16
+  %0 = bitcast %"class.llvm::SmallVector"* %sv to i8*
+  call void @llvm.lifetime.start(i64 64, i8* %0) #1
+  %BeginX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0
+  %FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
+  %1 = bitcast %"union.llvm::SmallVectorBase::U"* %FirstEl.i.i.i.i.i.i to i8*
+  store i8* %1, i8** %BeginX.i.i.i.i.i.i, align 16, !tbaa !4
+  %EndX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
+  store i8* %1, i8** %EndX.i.i.i.i.i.i, align 8, !tbaa !4
+  %CapacityX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 2
+  %add.ptr.i.i.i.i2.i.i = getelementptr inbounds %"union.llvm::SmallVectorBase::U"* %FirstEl.i.i.i.i.i.i, i64 2
+  %add.ptr.i.i.i.i.i.i = bitcast %"union.llvm::SmallVectorBase::U"* %add.ptr.i.i.i.i2.i.i to i8*
+  store i8* %add.ptr.i.i.i.i.i.i, i8** %CapacityX.i.i.i.i.i.i, align 16, !tbaa !4
+  %EndX.i = getelementptr inbounds %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 1
+  %2 = load i8** %EndX.i, align 8, !tbaa !4
+  %CapacityX.i = getelementptr inbounds %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 2
+  %cmp.i = icmp ult i8* %2, %add.ptr.i.i.i.i.i.i
+  br i1 %cmp.i, label %Retry.i, label %if.end.i
+
+Retry.i:                                          ; preds = %.noexc, %entry
+  %3 = phi i8* [ %2, %entry ], [ %.pre.i, %.noexc ]
+  %new.isnull.i = icmp eq i8* %3, null
+  br i1 %new.isnull.i, label %invoke.cont, label %new.notnull.i
+
+new.notnull.i:                                    ; preds = %Retry.i
+  %4 = bitcast i8* %3 to i32*
+  store i32 1, i32* %4, align 4, !tbaa !5
+  br label %invoke.cont
+
+if.end.i:                                         ; preds = %entry
+  %5 = getelementptr inbounds %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0
+  invoke void @_ZN4llvm15SmallVectorBase8grow_podEmm(%"class.llvm::SmallVectorBase"* %5, i64 0, i64 4)
+          to label %.noexc unwind label %lpad
+
+.noexc:                                           ; preds = %if.end.i
+  %.pre.i = load i8** %EndX.i, align 8, !tbaa !4
+  br label %Retry.i
+
+invoke.cont:                                      ; preds = %new.notnull.i, %Retry.i
+  %add.ptr.i = getelementptr inbounds i8* %3, i64 4
+  store i8* %add.ptr.i, i8** %EndX.i, align 8, !tbaa !4
+  %6 = load i8** %CapacityX.i, align 16, !tbaa !4
+  %cmp.i8 = icmp ult i8* %add.ptr.i, %6
+  br i1 %cmp.i8, label %new.notnull.i11, label %if.end.i14
+
+Retry.i10:                                        ; preds = %if.end.i14
+  %.pre.i13 = load i8** %EndX.i, align 8, !tbaa !4
+  %new.isnull.i9 = icmp eq i8* %.pre.i13, null
+  br i1 %new.isnull.i9, label %invoke.cont2, label %new.notnull.i11
+
+new.notnull.i11:                                  ; preds = %invoke.cont, %Retry.i10
+  %7 = phi i8* [ %.pre.i13, %Retry.i10 ], [ %add.ptr.i, %invoke.cont ]
+  %8 = bitcast i8* %7 to i32*
+  store i32 2, i32* %8, align 4, !tbaa !5
+  br label %invoke.cont2
+
+if.end.i14:                                       ; preds = %invoke.cont
+  %9 = getelementptr inbounds %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0
+  invoke void @_ZN4llvm15SmallVectorBase8grow_podEmm(%"class.llvm::SmallVectorBase"* %9, i64 0, i64 4)
+          to label %Retry.i10 unwind label %lpad
+
+invoke.cont2:                                     ; preds = %new.notnull.i11, %Retry.i10
+  %10 = phi i8* [ null, %Retry.i10 ], [ %7, %new.notnull.i11 ]
+  %add.ptr.i12 = getelementptr inbounds i8* %10, i64 4
+  store i8* %add.ptr.i12, i8** %EndX.i, align 8, !tbaa !4
+  invoke void @_Z1gRN4llvm11SmallVectorIiLj8EEE(%"class.llvm::SmallVector"* %sv)
+          to label %invoke.cont3 unwind label %lpad
+
+invoke.cont3:                                     ; preds = %invoke.cont2
+  %11 = load i8** %BeginX.i.i.i.i.i.i, align 16, !tbaa !4
+  %cmp.i.i.i.i19 = icmp eq i8* %11, %1
+  br i1 %cmp.i.i.i.i19, label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21, label %if.then.i.i.i20
+
+if.then.i.i.i20:                                  ; preds = %invoke.cont3
+  call void @free(i8* %11) #1
+  br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21
+
+_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21:          ; preds = %invoke.cont3, %if.then.i.i.i20
+  call void @llvm.lifetime.end(i64 64, i8* %0) #1
+  ret void
+
+lpad:                                             ; preds = %if.end.i14, %if.end.i, %invoke.cont2
+  %12 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  %13 = load i8** %BeginX.i.i.i.i.i.i, align 16, !tbaa !4
+  %cmp.i.i.i.i = icmp eq i8* %13, %1
+  br i1 %cmp.i.i.i.i, label %eh.resume, label %if.then.i.i.i
+
+if.then.i.i.i:                                    ; preds = %lpad
+  call void @free(i8* %13) #1
+  br label %eh.resume
+
+eh.resume:                                        ; preds = %if.then.i.i.i, %lpad
+  resume { i8*, i32 } %12
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(%"class.llvm::SmallVector"*) #2
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(%"class.llvm::SmallVectorBase"*, i64, i64) #2
+
+; Function Attrs: nounwind
+declare void @free(i8* nocapture) #3
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"int", metadata !1}
+!4 = metadata !{metadata !0, metadata !0, i64 0}
+!5 = metadata !{metadata !3, metadata !3, i64 0}
diff --git a/test/Transforms/GVN/lit.local.cfg b/test/Transforms/GVN/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/GVN/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/GVN/local-pre.ll b/test/Transforms/GVN/local-pre.ll
index 1d0dadf..2c92699 100644
--- a/test/Transforms/GVN/local-pre.ll
+++ b/test/Transforms/GVN/local-pre.ll
@@ -1,9 +1,9 @@
 ; RUN: opt < %s -gvn -enable-pre -S | grep "b.pre"
 
-define i32 @main(i32 %p) {
+define i32 @main(i32 %p, i32 %q) {
 block1:
-  
-	br i1 true, label %block2, label %block3
+    %cmp = icmp eq i32 %p, %q 
+	br i1 %cmp, label %block2, label %block3
 
 block2:
  %a = add i32 %p, 1
diff --git a/test/Transforms/GVN/malloc-load-removal.ll b/test/Transforms/GVN/malloc-load-removal.ll
index e93a62a..d2d2fd7 100644
--- a/test/Transforms/GVN/malloc-load-removal.ll
+++ b/test/Transforms/GVN/malloc-load-removal.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 declare i8* @malloc(i64) nounwind
 
-define noalias i8* @test() nounwind uwtable ssp {
+define noalias i8* @test1() nounwind uwtable ssp {
 entry:
   %call = tail call i8* @malloc(i64 100) nounwind
   %0 = load i8* %call, align 1
@@ -21,11 +21,36 @@ if.then:                                          ; preds = %entry
 if.end:                                           ; preds = %if.then, %entry
   ret i8* %call
 
-; CHECK-LABEL: @test(
+; CHECK-LABEL: @test1(
 ; CHECK-NOT: load
 ; CHECK-NOT: icmp
 
-; CHECK_NO_LIBCALLS-LABEL: @test(
+; CHECK_NO_LIBCALLS-LABEL: @test1(
+; CHECK_NO_LIBCALLS: load
+; CHECK_NO_LIBCALLS: icmp
+}
+
+declare i8* @_Znwm(i64) nounwind
+
+define noalias i8* @test2() nounwind uwtable ssp {
+entry:
+  %call = tail call i8* @_Znwm(i64 100) nounwind
+  %0 = load i8* %call, align 1
+  %tobool = icmp eq i8 %0, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  store i8 0, i8* %call, align 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret i8* %call
+
+; CHECK-LABEL: @test2(
+; CHECK-NOT: load
+; CHECK-NOT: icmp
+
+; CHECK_NO_LIBCALLS-LABEL: @test2(
 ; CHECK_NO_LIBCALLS: load
 ; CHECK_NO_LIBCALLS: icmp
 }
diff --git a/test/Transforms/GVN/pr17732.ll b/test/Transforms/GVN/pr17732.ll
new file mode 100644
index 0000000..606a195
--- /dev/null
+++ b/test/Transforms/GVN/pr17732.ll
@@ -0,0 +1,30 @@
+; RUN: opt -gvn -S -o - < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.with_array = type { [2 x i8], i32, i8 }
+%struct.with_vector = type { <2 x i8>, i32, i8 }
+
+@main.obj_with_array = private unnamed_addr constant { [2 x i8], i32, i8, [3 x i8] } { [2 x i8] zeroinitializer, i32 0, i8 1, [3 x i8] undef }, align 4
+@array_with_zeroinit = common global %struct.with_array zeroinitializer, align 4
+
+@main.obj_with_vector = private unnamed_addr constant { <2 x i8>, i32, i8, [3 x i8] } { <2 x i8> zeroinitializer, i32 0, i8 1, [3 x i8] undef }, align 4
+@vector_with_zeroinit = common global %struct.with_vector zeroinitializer, align 4
+
+define i32 @main() {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds (%struct.with_array* @array_with_zeroinit, i64 0, i32 0, i64 0), i8* getelementptr inbounds ({ [2 x i8], i32, i8, [3 x i8] }* @main.obj_with_array, i64 0, i32 0, i64 0), i64 12, i32 4, i1 false)
+  %0 = load i8* getelementptr inbounds (%struct.with_array* @array_with_zeroinit, i64 0, i32 2), align 4
+
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds (%struct.with_vector* @vector_with_zeroinit, i64 0, i32 0, i64 0), i8* getelementptr inbounds ({ <2 x i8>, i32, i8, [3 x i8] }* @main.obj_with_vector, i64 0, i32 0, i64 0), i64 12, i32 4, i1 false)
+  %1 = load i8* getelementptr inbounds (%struct.with_vector* @vector_with_zeroinit, i64 0, i32 2), align 4
+  %conv0 = sext i8 %0 to i32
+  %conv1 = sext i8 %1 to i32
+  %and = and i32 %conv0, %conv1
+  ret i32 %and
+; CHECK-LABEL: define i32 @main(
+; CHECK: ret i32 1
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
diff --git a/test/Transforms/GVN/pr17852.ll b/test/Transforms/GVN/pr17852.ll
new file mode 100644
index 0000000..e95ff7f
--- /dev/null
+++ b/test/Transforms/GVN/pr17852.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s -basicaa -gvn
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+%struct.S0 = type { [2 x i8], [2 x i8], [4 x i8], [2 x i8], i32, i32, i32, i32 }
+define void @fn1(%struct.S0* byval align 8 %p1) {
+  br label %for.cond
+for.cond:                                         ; preds = %1, %0
+  br label %for.end
+  %f2 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 2
+  %f9 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 7
+  br label %for.cond
+for.end:                                          ; preds = %for.cond
+  br i1 true, label %if.else, label %if.then
+if.then:                                          ; preds = %for.end
+  %f22 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 2
+  %f7 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 5
+  %tmp7 = load i32* %f7, align 8
+  br label %if.end40
+if.else:                                          ; preds = %for.end
+  br i1 false, label %for.cond18, label %if.then6
+if.then6:                                         ; preds = %if.else
+  %f3 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 2
+  %tmp10 = bitcast %struct.S0* %p1 to i16*
+  %f5 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 3
+  %tmp11 = bitcast [2 x i8]* %f5 to i16*
+  %bf.load13 = load i16* %tmp11, align 8
+  br label %if.end36
+for.cond18:                                       ; preds = %if.else
+  call void @fn4()
+  br i1 true, label %if.end, label %if.end36
+if.end:                                           ; preds = %for.cond18
+  %f321 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 2
+  %f925 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 7
+  %f526 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 3
+  %tmp15 = bitcast [2 x i8]* %f526 to i16*
+  %bf.load27 = load i16* %tmp15, align 8
+  %tmp16 = bitcast %struct.S0* %p1 to i16*
+  br label %if.end36
+if.end36:                                         ; preds = %if.end, %for.cond18, %if.then6
+  %f537 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 3
+  %tmp17 = bitcast [2 x i8]* %f537 to i16*
+  %bf.load38 = load i16* %tmp17, align 8
+  %bf.clear39 = and i16 %bf.load38, -16384
+  br label %if.end40
+if.end40:                                         ; preds = %if.end36, %if.then
+  %f6 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 4
+  %tmp18 = load i32* %f6, align 4
+  call void @fn2(i32 %tmp18)
+  %f8 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 6
+  %tmp19 = load i32* %f8, align 4
+  %tobool41 = icmp eq i32 %tmp19, 0
+  br i1 true, label %if.end50, label %if.then42
+if.then42:                                        ; preds = %if.end40
+  %tmp20 = bitcast %struct.S0* %p1 to i16*
+  %f547 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 3
+  %tmp21 = bitcast [2 x i8]* %f547 to i16*
+  %bf.load48 = load i16* %tmp21, align 8
+  br label %if.end50
+if.end50:                                         ; preds = %if.then42, %if.end40
+  %f551 = getelementptr inbounds %struct.S0* %p1, i64 0, i32 3
+  %tmp22 = bitcast [2 x i8]* %f551 to i16*
+  %bf.load52 = load i16* %tmp22, align 8
+  %bf.clear53 = and i16 %bf.load52, -16384
+  ret void
+}
+declare void @fn2(i32)
+declare void @fn4()
diff --git a/test/Transforms/GVN/preserve-tbaa.ll b/test/Transforms/GVN/preserve-tbaa.ll
index e52772b..c52ed96 100644
--- a/test/Transforms/GVN/preserve-tbaa.ll
+++ b/test/Transforms/GVN/preserve-tbaa.ll
@@ -25,6 +25,7 @@ for.end:                                          ; preds = %for.body, %entry
   ret void
 }
 
-!0 = metadata !{metadata !"short", metadata !1}
+!0 = metadata !{metadata !3, metadata !3, i64 0}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA", null}
+!3 = metadata !{metadata !"short", metadata !1}
diff --git a/test/Transforms/GVN/rle-nonlocal.ll b/test/Transforms/GVN/rle-nonlocal.ll
index 6b74e9a..8229aaa 100644
--- a/test/Transforms/GVN/rle-nonlocal.ll
+++ b/test/Transforms/GVN/rle-nonlocal.ll
@@ -1,8 +1,9 @@
 ; RUN: opt < %s -basicaa -gvn -S | FileCheck %s
 
-define i32 @main(i32** %p) {
+define i32 @main(i32** %p, i32 %x, i32 %y) {
 block1:
-	br i1 true, label %block2, label %block3
+    %cmp = icmp eq i32 %x, %y
+	br i1 %cmp , label %block2, label %block3
 
 block2:
  %a = load i32** %p
diff --git a/test/Transforms/GVN/rle-semidominated.ll b/test/Transforms/GVN/rle-semidominated.ll
index 71aa548..923cd03 100644
--- a/test/Transforms/GVN/rle-semidominated.ll
+++ b/test/Transforms/GVN/rle-semidominated.ll
@@ -1,9 +1,10 @@
 ; RUN: opt < %s -basicaa -gvn -S | grep "DEAD = phi i32 "
 
-define i32 @main(i32* %p) {
+define i32 @main(i32* %p, i32 %x, i32 %y) {
 block1:
   %z = load i32* %p
-	br i1 true, label %block2, label %block3
+  %cmp = icmp eq i32 %x, %y
+	br i1 %cmp, label %block2, label %block3
 
 block2:
  br label %block4
diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll
index 8787dd5..8d289b0 100644
--- a/test/Transforms/GVN/rle.ll
+++ b/test/Transforms/GVN/rle.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -default-data-layout="e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
-; RUN: opt < %s -default-data-layout="E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32"      -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -default-data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -default-data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32"      -basicaa -gvn -S -die | FileCheck %s
 
 ;; Trivial RLE test.
 define i32 @test0(i32 %V, i32* %P) {
@@ -195,6 +195,7 @@ Cont:
 }
 
 @GCst = constant {i32, float, i32 } { i32 42, float 14., i32 97 }
+@GCst_as1 = addrspace(1) constant {i32, float, i32 } { i32 42, float 14., i32 97 }
 
 ; memset -> float forwarding.
 define float @memcpy_to_float_local(float* %A) nounwind ssp {
@@ -209,7 +210,18 @@ entry:
 ; CHECK: ret float 1.400000e+01
 }
 
-
+; memcpy from address space 1
+define float @memcpy_to_float_local_as1(float* %A) nounwind ssp {
+entry:
+  %conv = bitcast float* %A to i8*                ; <i8*> [#uses=1]
+  tail call void @llvm.memcpy.p0i8.p1i8.i64(i8* %conv, i8 addrspace(1)* bitcast ({i32, float, i32 } addrspace(1)* @GCst_as1 to i8 addrspace(1)*), i64 12, i32 1, i1 false)
+  %arrayidx = getelementptr inbounds float* %A, i64 1 ; <float*> [#uses=1]
+  %tmp2 = load float* %arrayidx                   ; <float> [#uses=1]
+  ret float %tmp2
+; CHECK-LABEL: @memcpy_to_float_local_as1(
+; CHECK-NOT: load
+; CHECK: ret float 1.400000e+01
+}
 
 ;; non-local i32/float -> i8 load forwarding.
 define i8 @coerce_mustalias_nonlocal0(i32* %P, i1 %cond) {
@@ -357,13 +369,14 @@ Cont:
 ; CHECK: ret i8 %A
 }
 
-define i32 @chained_load(i32** %p) {
+define i32 @chained_load(i32** %p, i32 %x, i32 %y) {
 block1:
   %A = alloca i32*
 
   %z = load i32** %p
   store i32* %z, i32** %A
-  br i1 true, label %block2, label %block3
+  %cmp = icmp eq i32 %x, %y
+  br i1 %cmp, label %block2, label %block3
 
 block2:
  %a = load i32** %p
@@ -427,10 +440,11 @@ TY:
   ret i32 0
 }
 
-define i32 @phi_trans3(i32* %p) {
+define i32 @phi_trans3(i32* %p, i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @phi_trans3(
 block1:
-  br i1 true, label %block2, label %block3
+  %cmpxy = icmp eq i32 %x, %y
+  br i1 %cmpxy, label %block2, label %block3
 
 block2:
  store i32 87, i32* %p
@@ -443,7 +457,7 @@ block3:
 
 block4:
   %A = phi i32 [-1, %block2], [42, %block3]
-  br i1 true, label %block5, label %exit
+  br i1 %cmpxy, label %block5, label %exit
   
 ; CHECK: block4:
 ; CHECK-NEXT: %D = phi i32 [ 87, %block2 ], [ 97, %block3 ]  
@@ -451,11 +465,11 @@ block4:
 
 block5:
   %B = add i32 %A, 1
-  br i1 true, label %block6, label %exit
+  br i1 %cmpxy, label %block6, label %exit
   
 block6:
   %C = getelementptr i32* %p, i32 %B
-  br i1 true, label %block7, label %exit
+  br i1 %cmpxy, label %block7, label %exit
   
 block7:
   %D = load i32* %C
@@ -645,6 +659,8 @@ entry:
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
+
 
 ;;===----------------------------------------------------------------------===;;
 ;; Load -> Store dependency which isn't interfered with by a call that happens
diff --git a/test/Transforms/GVN/tbaa.ll b/test/Transforms/GVN/tbaa.ll
index 85fe39a..d6412fc 100644
--- a/test/Transforms/GVN/tbaa.ll
+++ b/test/Transforms/GVN/tbaa.ll
@@ -13,7 +13,7 @@ define i32 @test1(i8* %p, i8* %q) {
 
 define i32 @test2(i8* %p, i8* %q) {
 ; CHECK: @test2(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa !0
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGC:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !0
   %b = call i32 @foo(i8* %p), !tbaa !0
@@ -23,7 +23,7 @@ define i32 @test2(i8* %p, i8* %q) {
 
 define i32 @test3(i8* %p, i8* %q) {
 ; CHECK: @test3(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa !3
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGB:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !3
   %b = call i32 @foo(i8* %p), !tbaa !3
@@ -33,7 +33,7 @@ define i32 @test3(i8* %p, i8* %q) {
 
 define i32 @test4(i8* %p, i8* %q) {
 ; CHECK: @test4(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa !1
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !1
   %b = call i32 @foo(i8* %p), !tbaa !0
@@ -43,7 +43,7 @@ define i32 @test4(i8* %p, i8* %q) {
 
 define i32 @test5(i8* %p, i8* %q) {
 ; CHECK: @test5(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa !1
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !0
   %b = call i32 @foo(i8* %p), !tbaa !1
@@ -53,7 +53,7 @@ define i32 @test5(i8* %p, i8* %q) {
 
 define i32 @test6(i8* %p, i8* %q) {
 ; CHECK: @test6(i8* %p, i8* %q)
-; CHECK: call i32 @foo(i8* %p), !tbaa !1
+; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]]
 ; CHECK: %c = add i32 %a, %a
   %a = call i32 @foo(i8* %p), !tbaa !0
   %b = call i32 @foo(i8* %p), !tbaa !3
@@ -74,8 +74,18 @@ define i32 @test7(i8* %p, i8* %q) {
 
 declare i32 @foo(i8*) readonly
 
-!0 = metadata !{metadata !"C", metadata !1}
-!1 = metadata !{metadata !"A", metadata !2}
+; CHECK: [[TAGC]] = metadata !{metadata [[TYPEC:!.*]], metadata [[TYPEC]], i64 0}
+; CHECK: [[TYPEC]] = metadata !{metadata !"C", metadata [[TYPEA:!.*]]}
+; CHECK: [[TYPEA]] = metadata !{metadata !"A", metadata !{{.*}}}
+; CHECK: [[TAGB]] = metadata !{metadata [[TYPEB:!.*]], metadata [[TYPEB]], i64 0}
+; CHECK: [[TYPEB]] = metadata !{metadata !"B", metadata [[TYPEA]]}
+; CHECK: [[TAGA]] = metadata !{metadata [[TYPEA]], metadata [[TYPEA]], i64 0}
+!0 = metadata !{metadata !5, metadata !5, i64 0}
+!1 = metadata !{metadata !6, metadata !6, i64 0}
 !2 = metadata !{metadata !"tbaa root", null}
-!3 = metadata !{metadata !"B", metadata !1}
-!4 = metadata !{metadata !"another root", null}
+!3 = metadata !{metadata !7, metadata !7, i64 0}
+!4 = metadata !{metadata !8, metadata !8, i64 0}
+!5 = metadata !{metadata !"C", metadata !6}
+!6 = metadata !{metadata !"A", metadata !2}
+!7 = metadata !{metadata !"B", metadata !6}
+!8 = metadata !{metadata !"another root", null}
diff --git a/test/Transforms/GlobalDCE/lit.local.cfg b/test/Transforms/GlobalDCE/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/GlobalDCE/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/GlobalOpt/2008-07-17-addrspace.ll b/test/Transforms/GlobalOpt/2008-07-17-addrspace.ll
index 390e77a..0867ca9 100644
--- a/test/Transforms/GlobalOpt/2008-07-17-addrspace.ll
+++ b/test/Transforms/GlobalOpt/2008-07-17-addrspace.ll
@@ -4,7 +4,7 @@
 
 ; RUN: opt < %s -globalopt -S > %t
 ; Check that the new global values still have their address space
-; RUN: cat %t | grep addrspace.*global
+; RUN: cat %t | grep 'addrspace.*global'
 
 @struct = internal addrspace(1) global { i32, i32 } zeroinitializer
 @array = internal addrspace(1) global [ 2 x i32 ] zeroinitializer 
diff --git a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
index e08320b..0108960 100644
--- a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
+++ b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
@@ -60,7 +60,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !2 = metadata !{i32 458788, null, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !3 = metadata !{i32 459009, metadata !4, metadata !"i", metadata !1, i32 4, metadata !2} ; [ DW_TAG_arg_variable ]
 !4 = metadata !{i32 458798, i32 0, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 4, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!5 = metadata !{i32 458773, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{i32 458773, metadata !1, null, metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !2, metadata !2}
 !7 = metadata !{i32 5, i32 0, metadata !8, null}
 !8 = metadata !{i32 458763, metadata !20, metadata !4, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
@@ -71,7 +71,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !13 = metadata !{i32 14, i32 0, metadata !14, null}
 !14 = metadata !{i32 458763, metadata !20, metadata !15, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !15 = metadata !{i32 458798, i32 0, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 13, metadata !16, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!16 = metadata !{i32 458773, metadata !1, metadata !"", metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!16 = metadata !{i32 458773, metadata !1, null, metadata !1, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !17 = metadata !{metadata !2}
 !18 = metadata !{i32 15, i32 0, metadata !14, null}
 !19 = metadata !{i32 16, i32 0, metadata !14, null}
diff --git a/test/Transforms/GlobalOpt/array-elem-refs.ll b/test/Transforms/GlobalOpt/array-elem-refs.ll
new file mode 100644
index 0000000..ec472b0
--- /dev/null
+++ b/test/Transforms/GlobalOpt/array-elem-refs.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -S -globalopt | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.S = type { i8, i8 }
+
+@c = internal global i8** bitcast (i8* getelementptr (i8* bitcast ([8 x i8*]* @b to i8*), i64 48) to i8**), align 8
+@b = internal global [8 x i8*] [i8* null, i8* null, i8* null, i8* null, i8* null, i8* null, i8* getelementptr inbounds (%struct.S* @a, i32 0, i32 0), i8* getelementptr (i8* getelementptr inbounds (%struct.S* @a, i32 0, i32 0), i64 1)], align 16
+@a = internal global %struct.S zeroinitializer, align 1
+
+; Function Attrs: nounwind uwtable
+define signext i8 @foo() #0 {
+entry:
+  %0 = load i8*** @c, align 8
+  %1 = load i8** %0, align 8
+  %2 = load i8* %1, align 1
+  ret i8 %2
+
+; CHECK-LABEL: @foo
+; CHECK: ret i8 0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval
+  ret i32 0
+}
+
+attributes #0 = { nounwind uwtable }
+
diff --git a/test/Transforms/GlobalOpt/atomic.ll b/test/Transforms/GlobalOpt/atomic.ll
index 4c3f439..ac05bfd 100644
--- a/test/Transforms/GlobalOpt/atomic.ll
+++ b/test/Transforms/GlobalOpt/atomic.ll
@@ -1,10 +1,25 @@
 ; RUN: opt -globalopt < %s -S -o - | FileCheck %s
 
 @GV1 = internal global i64 1
+@GV2 = internal global i32 0
+
 ; CHECK: @GV1 = internal unnamed_addr constant i64 1
+; CHECK: @GV2 = internal unnamed_addr global i32 0
 
 define void @test1() {
 entry:
   %0 = load atomic i8* bitcast (i64* @GV1 to i8*) acquire, align 8
   ret void
 }
+
+; PR17163
+define void @test2a() {
+entry:
+  store atomic i32 10, i32* @GV2 seq_cst, align 4
+  ret void
+}
+define i32 @test2b() {
+entry:
+  %atomic-load = load atomic i32* @GV2 seq_cst, align 4
+  ret i32 %atomic-load
+}
diff --git a/test/Transforms/GlobalOpt/lit.local.cfg b/test/Transforms/GlobalOpt/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/GlobalOpt/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/IPConstantProp/lit.local.cfg b/test/Transforms/IPConstantProp/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/IPConstantProp/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll b/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll
new file mode 100644
index 0000000..e4c31d1
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/lftr-address-space-pointers.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -indvars -o - %s | FileCheck %s
+target datalayout = "e-p:32:32:32-p1:64:64:64-p2:8:8:8-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-n8:16:32:64"
+
+; Derived from ptriv in lftr-reuse.ll
+define void @ptriv_as2(i8 addrspace(2)* %base, i32 %n) nounwind {
+; CHECK-LABEL: @ptriv_as2(
+entry:
+  %idx.trunc = trunc i32 %n to i8
+  %add.ptr = getelementptr inbounds i8 addrspace(2)* %base, i8 %idx.trunc
+  %cmp1 = icmp ult i8 addrspace(2)* %base, %add.ptr
+  br i1 %cmp1, label %for.body, label %for.end
+
+; Make sure the added GEP has the right index type
+; CHECK: %lftr.limit = getelementptr i8 addrspace(2)* %base, i8 %0
+
+; CHECK: for.body:
+; CHECK: phi i8 addrspace(2)*
+; CHECK-NOT: phi
+; CHECK-NOT: add{{^rspace}}
+; CHECK: icmp ne i8 addrspace(2)*
+; CHECK: br i1
+for.body:
+  %p.02 = phi i8 addrspace(2)* [ %base, %entry ], [ %incdec.ptr, %for.body ]
+  ; cruft to make the IV useful
+  %sub.ptr.lhs.cast = ptrtoint i8 addrspace(2)* %p.02 to i8
+  %sub.ptr.rhs.cast = ptrtoint i8 addrspace(2)* %base to i8
+  %sub.ptr.sub = sub i8 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  store i8 %sub.ptr.sub, i8 addrspace(2)* %p.02
+  %incdec.ptr = getelementptr inbounds i8 addrspace(2)* %p.02, i32 1
+  %cmp = icmp ult i8 addrspace(2)* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+define void @ptriv_as3(i8 addrspace(3)* %base, i32 %n) nounwind {
+; CHECK-LABEL: @ptriv_as3(
+entry:
+  %idx.trunc = trunc i32 %n to i16
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %base, i16 %idx.trunc
+  %cmp1 = icmp ult i8 addrspace(3)* %base, %add.ptr
+  br i1 %cmp1, label %for.body, label %for.end
+
+; Make sure the added GEP has the right index type
+; CHECK: %lftr.limit = getelementptr i8 addrspace(3)* %base, i16 %0
+
+; CHECK: for.body:
+; CHECK: phi i8 addrspace(3)*
+; CHECK-NOT: phi
+; CHECK-NOT: add{{^rspace}}
+; CHECK: icmp ne i8 addrspace(3)*
+; CHECK: br i1
+for.body:
+  %p.02 = phi i8 addrspace(3)* [ %base, %entry ], [ %incdec.ptr, %for.body ]
+  ; cruft to make the IV useful
+  %sub.ptr.lhs.cast = ptrtoint i8 addrspace(3)* %p.02 to i16
+  %sub.ptr.rhs.cast = ptrtoint i8 addrspace(3)* %base to i16
+  %sub.ptr.sub = sub i16 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  %conv = trunc i16 %sub.ptr.sub to i8
+  store i8 %conv, i8 addrspace(3)* %p.02
+  %incdec.ptr = getelementptr inbounds i8 addrspace(3)* %p.02, i32 1
+  %cmp = icmp ult i8 addrspace(3)* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
diff --git a/test/Transforms/IndVarSimplify/lftr-zext.ll b/test/Transforms/IndVarSimplify/lftr-zext.ll
new file mode 100644
index 0000000..32fa61a
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/lftr-zext.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+@data = common global [240 x i8] zeroinitializer, align 16
+
+define void @foo(i8* %a) nounwind uwtable ssp {
+; CHECK: %exitcond
+; CHECK-NOT: ([240 x i8]* @data, i64 0, i64 -16)
+  br label %1
+
+; <label>:1                                       ; preds = %0, %1
+  %i.0 = phi i8 [ 0, %0 ], [ %5, %1 ]
+  %p.0 = phi i8* [ getelementptr inbounds ([240 x i8]* @data, i64 0, i64 0), %0 ], [ %4, %1 ]
+  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
+  %2 = getelementptr inbounds i8* %.0, i64 1
+  %3 = load i8* %.0, align 1
+  %4 = getelementptr inbounds i8* %p.0, i64 1
+  store i8 %3, i8* %p.0, align 1
+  %5 = add i8 %i.0, 1
+  %6 = icmp ult i8 %5, -16
+  br i1 %6, label %1, label %7
+
+; <label>:7                                       ; preds = %1
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/lit.local.cfg b/test/Transforms/IndVarSimplify/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/IndVarSimplify/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/IndVarSimplify/loop_evaluate_1.ll b/test/Transforms/IndVarSimplify/loop_evaluate_1.ll
index abf1bc3..5d2c8c7 100644
--- a/test/Transforms/IndVarSimplify/loop_evaluate_1.ll
+++ b/test/Transforms/IndVarSimplify/loop_evaluate_1.ll
@@ -1,8 +1,9 @@
-; RUN: opt < %s -indvars -loop-deletion -simplifycfg -S | not grep br
-;
-; Testcase distilled from 256.bzip2
+; RUN: opt < %s -indvars -loop-deletion -simplifycfg -S | FileCheck %s
 
-define i32 @main() {
+; Testcase distilled from 256.bzip2
+; CHECK-LABEL: @test1
+; CHECK-NOT: br
+define i32 @test1() {
 entry:
         br label %loopentry
 
@@ -19,3 +20,28 @@ loopexit:               ; preds = %loopentry
         ret i32 %tmp.2
 }
 
+
+; PR12377
+; CHECK-LABEL: @test2
+; CHECK: [[VAR1:%.+]] = add i32 %arg, -11
+; CHECK: [[VAR2:%.+]] = lshr i32 [[VAR1]], 1
+; CHECK: [[VAR3:%.+]] = add i32 [[VAR2]], 1
+; CHECK: [[VAR4:%.+]] = phi i32 [ 0, %bb ], [ [[VAR3]], %bb1.preheader ]
+; CHECK: ret i32 [[VAR4]]
+define i32 @test2(i32 %arg) {
+bb:
+  %tmp = icmp ugt i32 %arg, 10
+  br i1 %tmp, label %bb1, label %bb7
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp2 = phi i32 [ %tmp5, %bb1 ], [ 0, %bb ]
+  %tmp3 = phi i32 [ %tmp4, %bb1 ], [ %arg, %bb ]
+  %tmp4 = add i32 %tmp3, -2
+  %tmp5 = add i32 %tmp2, 1
+  %tmp6 = icmp ugt i32 %tmp4, 10
+  br i1 %tmp6, label %bb1, label %bb7
+
+bb7:                                              ; preds = %bb1, %bb
+  %tmp8 = phi i32 [ 0, %bb ], [ %tmp5, %bb1 ]
+  ret i32 %tmp8
+}
diff --git a/test/Transforms/IndVarSimplify/loop_evaluate_6.ll b/test/Transforms/IndVarSimplify/loop_evaluate_6.ll
index da38de5..af01fe5 100644
--- a/test/Transforms/IndVarSimplify/loop_evaluate_6.ll
+++ b/test/Transforms/IndVarSimplify/loop_evaluate_6.ll
@@ -1,9 +1,4 @@
 ; RUN: opt < %s -indvars -loop-deletion -S | grep phi | count 1
-; XFAIL: *
-
-; Indvars can't evaluate this loop, because ScalarEvolution can't compute
-; an exact trip count, because it doesn't know if dividing by the stride will
-; have a remainder. It could be done with more aggressive VRP though.
 
 define i32 @test(i32 %x_offs) nounwind readnone {
 entry:
diff --git a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
index 507f695..0576692 100644
--- a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
+++ b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
@@ -223,13 +223,18 @@ entry:
   %halfLim = ashr i32 %limit, 2
   br label %loop
 
-; Test cloning an or, which is not an OverflowBinaryOperator.
+; This test originally checked that the OR instruction was cloned. Now the
+; ScalarEvolution is able to understand the loop evolution and that '%iv' at the
+; end of the loop is an even value. Thus '%val' is computed at the end of the
+; loop and the OR instruction is replaced by an ADD keeping the result
+; equivalent.
 ;
 ; CHECK: loop:
 ; CHECK: phi i64
 ; CHECK-NOT: sext
-; CHECK: or i64
+; CHECK: icmp slt i32
 ; CHECK: exit:
+; CHECK: add i64
 loop:
   %iv = phi i32 [ 0, %entry], [ %iv.next, %loop ]
   %t1 = sext i32 %iv to i64
diff --git a/test/Transforms/Inline/alloca-merge-align-nodl.ll b/test/Transforms/Inline/alloca-merge-align-nodl.ll
index 203f52b..301505f 100644
--- a/test/Transforms/Inline/alloca-merge-align-nodl.ll
+++ b/test/Transforms/Inline/alloca-merge-align-nodl.ll
@@ -8,13 +8,13 @@ define void @foo(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32], align 4
   %a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %0 = load i32* %a1, align 4, !tbaa !0
+  %0 = load i32* %a1, align 4
   %arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
-  store i32 %0, i32* %arrayidx, align 4, !tbaa !0
+  store i32 %0, i32* %arrayidx, align 4
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %1 = load i32* %b, align 4, !tbaa !0
+  %1 = load i32* %b, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+  store i32 %1, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx) #2
   ret void
 }
@@ -23,13 +23,13 @@ define void @foo0(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32]
   %a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %0 = load i32* %a1, align 4, !tbaa !0
+  %0 = load i32* %a1, align 4
   %arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
-  store i32 %0, i32* %arrayidx, align 4, !tbaa !0
+  store i32 %0, i32* %arrayidx, align 4
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %1 = load i32* %b, align 4, !tbaa !0
+  %1 = load i32* %b, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+  store i32 %1, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx) #2
   ret void
 }
@@ -40,13 +40,13 @@ define void @goo(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32], align 32
   %a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %0 = load i32* %a1, align 4, !tbaa !0
+  %0 = load i32* %a1, align 4
   %arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
-  store i32 %0, i32* %arrayidx, align 32, !tbaa !0
+  store i32 %0, i32* %arrayidx, align 32
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %1 = load i32* %b, align 4, !tbaa !0
+  %1 = load i32* %b, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+  store i32 %1, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx) #2
   ret void
 }
@@ -62,9 +62,9 @@ entry:
   %tmpcast = bitcast i64* %a to %struct.s*
   store i64 0, i64* %a, align 8
   %a1 = bitcast i64* %a to i32*
-  store i32 1, i32* %a1, align 8, !tbaa !0
+  store i32 1, i32* %a1, align 8
   call void @foo(%struct.s* byval %tmpcast)
-  store i32 2, i32* %a1, align 8, !tbaa !0
+  store i32 2, i32* %a1, align 8
   call void @goo(%struct.s* byval %tmpcast)
   ret i32 0
 }
@@ -80,14 +80,9 @@ entry:
   %tmpcast = bitcast i64* %a to %struct.s*
   store i64 0, i64* %a, align 8
   %a1 = bitcast i64* %a to i32*
-  store i32 1, i32* %a1, align 8, !tbaa !0
+  store i32 1, i32* %a1, align 8
   call void @foo0(%struct.s* byval %tmpcast)
-  store i32 2, i32* %a1, align 8, !tbaa !0
+  store i32 2, i32* %a1, align 8
   call void @goo(%struct.s* byval %tmpcast)
   ret i32 0
 }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-
diff --git a/test/Transforms/Inline/alloca-merge-align.ll b/test/Transforms/Inline/alloca-merge-align.ll
index d789c79..d357b3c 100644
--- a/test/Transforms/Inline/alloca-merge-align.ll
+++ b/test/Transforms/Inline/alloca-merge-align.ll
@@ -9,13 +9,13 @@ define void @foo(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32], align 4
   %a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %0 = load i32* %a1, align 4, !tbaa !0
+  %0 = load i32* %a1, align 4
   %arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
-  store i32 %0, i32* %arrayidx, align 4, !tbaa !0
+  store i32 %0, i32* %arrayidx, align 4
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %1 = load i32* %b, align 4, !tbaa !0
+  %1 = load i32* %b, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+  store i32 %1, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx) #2
   ret void
 }
@@ -24,13 +24,13 @@ define void @foo0(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32]
   %a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %0 = load i32* %a1, align 4, !tbaa !0
+  %0 = load i32* %a1, align 4
   %arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
-  store i32 %0, i32* %arrayidx, align 4, !tbaa !0
+  store i32 %0, i32* %arrayidx, align 4
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %1 = load i32* %b, align 4, !tbaa !0
+  %1 = load i32* %b, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+  store i32 %1, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx) #2
   ret void
 }
@@ -39,13 +39,13 @@ define void @foo1(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32], align 1
   %a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %0 = load i32* %a1, align 4, !tbaa !0
+  %0 = load i32* %a1, align 4
   %arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
-  store i32 %0, i32* %arrayidx, align 4, !tbaa !0
+  store i32 %0, i32* %arrayidx, align 4
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %1 = load i32* %b, align 4, !tbaa !0
+  %1 = load i32* %b, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+  store i32 %1, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx) #2
   ret void
 }
@@ -56,13 +56,13 @@ define void @goo(%struct.s* byval nocapture readonly %a) {
 entry:
   %x = alloca [2 x i32], align 32
   %a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
-  %0 = load i32* %a1, align 4, !tbaa !0
+  %0 = load i32* %a1, align 4
   %arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
-  store i32 %0, i32* %arrayidx, align 32, !tbaa !0
+  store i32 %0, i32* %arrayidx, align 32
   %b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
-  %1 = load i32* %b, align 4, !tbaa !0
+  %1 = load i32* %b, align 4
   %arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
-  store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+  store i32 %1, i32* %arrayidx2, align 4
   call void @bar(i32* %arrayidx) #2
   ret void
 }
@@ -78,9 +78,9 @@ entry:
   %tmpcast = bitcast i64* %a to %struct.s*
   store i64 0, i64* %a, align 8
   %a1 = bitcast i64* %a to i32*
-  store i32 1, i32* %a1, align 8, !tbaa !0
+  store i32 1, i32* %a1, align 8
   call void @foo(%struct.s* byval %tmpcast)
-  store i32 2, i32* %a1, align 8, !tbaa !0
+  store i32 2, i32* %a1, align 8
   call void @goo(%struct.s* byval %tmpcast)
   ret i32 0
 }
@@ -96,9 +96,9 @@ entry:
   %tmpcast = bitcast i64* %a to %struct.s*
   store i64 0, i64* %a, align 8
   %a1 = bitcast i64* %a to i32*
-  store i32 1, i32* %a1, align 8, !tbaa !0
+  store i32 1, i32* %a1, align 8
   call void @foo0(%struct.s* byval %tmpcast)
-  store i32 2, i32* %a1, align 8, !tbaa !0
+  store i32 2, i32* %a1, align 8
   call void @goo(%struct.s* byval %tmpcast)
   ret i32 0
 }
@@ -114,14 +114,9 @@ entry:
   %tmpcast = bitcast i64* %a to %struct.s*
   store i64 0, i64* %a, align 8
   %a1 = bitcast i64* %a to i32*
-  store i32 1, i32* %a1, align 8, !tbaa !0
+  store i32 1, i32* %a1, align 8
   call void @foo0(%struct.s* byval %tmpcast)
-  store i32 2, i32* %a1, align 8, !tbaa !0
+  store i32 2, i32* %a1, align 8
   call void @foo1(%struct.s* byval %tmpcast)
   ret i32 0
 }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-
diff --git a/test/Transforms/Inline/attributes.ll b/test/Transforms/Inline/attributes.ll
new file mode 100644
index 0000000..53fb13f
--- /dev/null
+++ b/test/Transforms/Inline/attributes.ll
@@ -0,0 +1,112 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+
+define i32 @noattr_callee(i32 %i) {
+  ret i32 %i
+}
+
+define i32 @sanitize_address_callee(i32 %i) sanitize_address {
+  ret i32 %i
+}
+
+define i32 @sanitize_thread_callee(i32 %i) sanitize_thread {
+  ret i32 %i
+}
+
+define i32 @sanitize_memory_callee(i32 %i) sanitize_memory {
+  ret i32 %i
+}
+
+define i32 @alwaysinline_callee(i32 %i) alwaysinline {
+  ret i32 %i
+}
+
+define i32 @alwaysinline_sanitize_address_callee(i32 %i) alwaysinline sanitize_address {
+  ret i32 %i
+}
+
+define i32 @alwaysinline_sanitize_thread_callee(i32 %i) alwaysinline sanitize_thread {
+  ret i32 %i
+}
+
+define i32 @alwaysinline_sanitize_memory_callee(i32 %i) alwaysinline sanitize_memory {
+  ret i32 %i
+}
+
+
+; Check that:
+;  * noattr callee is inlined into noattr caller,
+;  * sanitize_(address|memory|thread) callee is not inlined into noattr caller,
+;  * alwaysinline callee is always inlined no matter what sanitize_* attributes are present.
+
+define i32 @test_no_sanitize_address(i32 %arg) {
+  %x1 = call i32 @noattr_callee(i32 %arg)
+  %x2 = call i32 @sanitize_address_callee(i32 %x1)
+  %x3 = call i32 @alwaysinline_callee(i32 %x2)
+  %x4 = call i32 @alwaysinline_sanitize_address_callee(i32 %x3)
+  ret i32 %x4
+; CHECK-LABEL: @test_no_sanitize_address(
+; CHECK-NEXT: @sanitize_address_callee
+; CHECK-NEXT: ret i32
+}
+
+define i32 @test_no_sanitize_memory(i32 %arg) {
+  %x1 = call i32 @noattr_callee(i32 %arg)
+  %x2 = call i32 @sanitize_memory_callee(i32 %x1)
+  %x3 = call i32 @alwaysinline_callee(i32 %x2)
+  %x4 = call i32 @alwaysinline_sanitize_memory_callee(i32 %x3)
+  ret i32 %x4
+; CHECK-LABEL: @test_no_sanitize_memory(
+; CHECK-NEXT: @sanitize_memory_callee
+; CHECK-NEXT: ret i32
+}
+
+define i32 @test_no_sanitize_thread(i32 %arg) {
+  %x1 = call i32 @noattr_callee(i32 %arg)
+  %x2 = call i32 @sanitize_thread_callee(i32 %x1)
+  %x3 = call i32 @alwaysinline_callee(i32 %x2)
+  %x4 = call i32 @alwaysinline_sanitize_thread_callee(i32 %x3)
+  ret i32 %x4
+; CHECK-LABEL: @test_no_sanitize_thread(
+; CHECK-NEXT: @sanitize_thread_callee
+; CHECK-NEXT: ret i32
+}
+
+
+; Check that:
+;  * noattr callee is not inlined into sanitize_(address|memory|thread) caller,
+;  * sanitize_(address|memory|thread) callee is inlined into the caller with the same attribute,
+;  * alwaysinline callee is always inlined no matter what sanitize_* attributes are present.
+
+define i32 @test_sanitize_address(i32 %arg) sanitize_address {
+  %x1 = call i32 @noattr_callee(i32 %arg)
+  %x2 = call i32 @sanitize_address_callee(i32 %x1)
+  %x3 = call i32 @alwaysinline_callee(i32 %x2)
+  %x4 = call i32 @alwaysinline_sanitize_address_callee(i32 %x3)
+  ret i32 %x4
+; CHECK-LABEL: @test_sanitize_address(
+; CHECK-NEXT: @noattr_callee
+; CHECK-NEXT: ret i32
+}
+
+define i32 @test_sanitize_memory(i32 %arg) sanitize_memory {
+  %x1 = call i32 @noattr_callee(i32 %arg)
+  %x2 = call i32 @sanitize_memory_callee(i32 %x1)
+  %x3 = call i32 @alwaysinline_callee(i32 %x2)
+  %x4 = call i32 @alwaysinline_sanitize_memory_callee(i32 %x3)
+  ret i32 %x4
+; CHECK-LABEL: @test_sanitize_memory(
+; CHECK-NEXT: @noattr_callee
+; CHECK-NEXT: ret i32
+}
+
+define i32 @test_sanitize_thread(i32 %arg) sanitize_thread {
+  %x1 = call i32 @noattr_callee(i32 %arg)
+  %x2 = call i32 @sanitize_thread_callee(i32 %x1)
+  %x3 = call i32 @alwaysinline_callee(i32 %x2)
+  %x4 = call i32 @alwaysinline_sanitize_thread_callee(i32 %x3)
+  ret i32 %x4
+; CHECK-LABEL: @test_sanitize_thread(
+; CHECK-NEXT: @noattr_callee
+; CHECK-NEXT: ret i32
+}
diff --git a/test/Transforms/Inline/byval.ll b/test/Transforms/Inline/byval.ll
index e601faf..d7597ad 100644
--- a/test/Transforms/Inline/byval.ll
+++ b/test/Transforms/Inline/byval.ll
@@ -104,3 +104,26 @@ entry:
 ; CHECK: ret i32 4
 }
 
+%struct.S0 = type { i32 }
+
+@b = global %struct.S0 { i32 1 }, align 4
+@a = common global i32 0, align 4
+
+define internal void @f5(%struct.S0* byval nocapture readonly align 4 %p) {
+entry:
+	store i32 0, i32* getelementptr inbounds (%struct.S0* @b, i64 0, i32 0), align 4
+	%f2 = getelementptr inbounds %struct.S0* %p, i64 0, i32 0
+	%0 = load i32* %f2, align 4
+	store i32 %0, i32* @a, align 4
+	ret void
+}
+
+define i32 @test5() {
+entry:
+	tail call void @f5(%struct.S0* byval align 4 @b)
+	%0 = load i32* @a, align 4
+	ret i32 %0
+; CHECK: @test5()
+; CHECK: store i32 0, i32* getelementptr inbounds (%struct.S0* @b, i64 0, i32 0), align 4
+; CHECK-NOT: load i32* getelementptr inbounds (%struct.S0* @b, i64 0, i32 0), align 4
+}
diff --git a/test/Transforms/Inline/delete-call.ll b/test/Transforms/Inline/delete-call.ll
index 97c52af..7f30ffb 100644
--- a/test/Transforms/Inline/delete-call.ll
+++ b/test/Transforms/Inline/delete-call.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -S -inline -stats < %s 2>&1 | FileCheck %s
 ; CHECK: Number of functions inlined
 
-; RUN: opt -S -inline -functionattrs -stats < %s 2>&1 | FileCheck -check-prefix=FUNCTIONATTRS %s
+; RUN: opt -S -inline -functionattrs -stats < %s 2>&1 | FileCheck -check-prefix=CHECK-FUNCTIONATTRS %s
 ; CHECK-FUNCTIONATTRS: Number of call sites deleted, not inlined
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
diff --git a/test/Transforms/Inline/inline-invoke-with-asm-call.ll b/test/Transforms/Inline/inline-invoke-with-asm-call.ll
new file mode 100644
index 0000000..876f8d7
--- /dev/null
+++ b/test/Transforms/Inline/inline-invoke-with-asm-call.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+target triple = "x86_64-apple-darwin"
+
+; In inliner, we assume that inline asm does not throw. This testing case makes
+; sure that the inliner does not convert "call asm" to "invoke asm".
+; rdar://15317907
+; CHECK-LABEL: @caller
+; Make sure we are generating "call asm" instead of "invoke asm".
+; CHECK: call void asm
+; CHECK-LABEL: @callee_with_asm
+define void @caller() {
+  br i1 undef, label %1, label %4
+
+; <label>:1
+  invoke void @callee_with_asm()
+          to label %4 unwind label %2
+
+; <label>:2
+  %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*)
+          cleanup
+  resume { i8*, i32 } undef
+
+; <label>:4
+  ret void
+}
+
+define void @callee_with_asm() {
+  call void asm sideeffect "mov\09r7, r7\09\09@ marker for objc_retainAutoreleaseReturnValue", ""()
+  ret void
+}
+
+declare i32 @__objc_personality_v0(...)
diff --git a/test/Transforms/Inline/inline-optnone.ll b/test/Transforms/Inline/inline-optnone.ll
new file mode 100644
index 0000000..9b99c45
--- /dev/null
+++ b/test/Transforms/Inline/inline-optnone.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+
+; Test that functions with attribute optnone are not inlined.
+; Also test that only functions with attribute alwaysinline are
+; valid candidates for inlining if the caller has the optnone attribute.
+
+; Function Attrs: alwaysinline nounwind readnone uwtable
+define i32 @alwaysInlineFunction(i32 %a) #0 {
+entry:
+  %mul = mul i32 %a, %a
+  ret i32 %mul
+}
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @simpleFunction(i32 %a) #1 {
+entry:
+  %add = add i32 %a, %a
+  ret i32 %add
+}
+
+; Function Attrs: nounwind noinline optnone readnone uwtable
+define i32 @OptnoneFunction(i32 %a) #2 {
+entry:
+  %0 = tail call i32 @alwaysInlineFunction(i32 %a)
+  %1 = tail call i32 @simpleFunction(i32 %a)
+  %add = add i32 %0, %1
+  ret i32 %add
+}
+
+; CHECK-LABEL: @OptnoneFunction
+; CHECK-NOT: call i32 @alwaysInlineFunction(i32 %a)
+; CHECK: call i32 @simpleFunction(i32 %a)
+; CHECK: ret
+
+; Function Attrs: nounwind readnone uwtable
+define i32 @bar(i32 %a) #1 {
+entry:
+  %0 = tail call i32 @OptnoneFunction(i32 5)
+  %1 = tail call i32 @simpleFunction(i32 6)
+  %add = add i32 %0, %1
+  ret i32 %add
+}
+
+; CHECK-LABEL: @bar
+; CHECK: call i32 @OptnoneFunction(i32 5)
+; CHECK-NOT: call i32 @simpleFunction(i32 6)
+; CHECK: ret
+
+
+attributes #0 = { alwaysinline nounwind readnone uwtable }
+attributes #1 = { nounwind readnone uwtable }
+attributes #2 = { nounwind noinline optnone readnone uwtable }
diff --git a/test/Transforms/Inline/inline_returns_twice.ll b/test/Transforms/Inline/inline_returns_twice.ll
index f316c91..678ee82 100644
--- a/test/Transforms/Inline/inline_returns_twice.ll
+++ b/test/Transforms/Inline/inline_returns_twice.ll
@@ -4,38 +4,81 @@
 ; if they are themselve marked as such.
 
 declare i32 @a() returns_twice
-declare i32 @b() returns_twice
 
-define i32 @f() {
+define i32 @inner1() {
 entry:
   %call = call i32 @a() returns_twice
   %add = add nsw i32 1, %call
   ret i32 %add
 }
 
-define i32 @g() {
+define i32 @outer1() {
 entry:
-; CHECK-LABEL: define i32 @g(
-; CHECK: call i32 @f()
-; CHECK-NOT: call i32 @a()
-  %call = call i32 @f()
+; CHECK-LABEL: define i32 @outer1(
+; CHECK: call i32 @inner1()
+  %call = call i32 @inner1()
   %add = add nsw i32 1, %call
   ret i32 %add
 }
 
-define i32 @h() returns_twice {
+define i32 @inner2() returns_twice {
 entry:
-  %call = call i32 @b() returns_twice
+  %call = call i32 @a() returns_twice
+  %add = add nsw i32 1, %call
+  ret i32 %add
+}
+
+define i32 @outer2() {
+entry:
+; CHECK-LABEL: define i32 @outer2(
+; CHECK: call i32 @a()
+  %call = call i32 @inner2() returns_twice
   %add = add nsw i32 1, %call
   ret i32 %add
 }
 
-define i32 @i() {
+define i32 @inner3() {
+entry:
+  %invoke = invoke i32 @a() returns_twice
+      to label %cont unwind label %lpad
+
+cont:
+  %add = add nsw i32 1, %invoke
+  ret i32 %add
+
+lpad:
+  %lp = landingpad i32 personality i8* null cleanup
+  resume i32 %lp
+}
+
+define i32 @outer3() {
+entry:
+; CHECK-LABEL: define i32 @outer3(
+; CHECK: call i32 @inner3()
+  %call = call i32 @inner3()
+  %add = add nsw i32 1, %call
+  ret i32 %add
+}
+
+define i32 @inner4() returns_twice {
+entry:
+  %invoke = invoke i32 @a() returns_twice
+      to label %cont unwind label %lpad
+
+cont:
+  %add = add nsw i32 1, %invoke
+  ret i32 %add
+
+lpad:
+  %lp = landingpad i32 personality i8* null cleanup
+  resume i32 %lp
+}
+
+define i32 @outer4() {
 entry:
-; CHECK-LABEL: define i32 @i(
-; CHECK: call i32 @b()
-; CHECK-NOT: call i32 @h()
-  %call = call i32 @h() returns_twice
+; CHECK-LABEL: define i32 @outer4(
+; CHECK: invoke i32 @a()
+  %call = call i32 @inner4() returns_twice
   %add = add nsw i32 1, %call
   ret i32 %add
 }
diff --git a/test/Transforms/Inline/invoke-cost.ll b/test/Transforms/Inline/invoke-cost.ll
new file mode 100644
index 0000000..84d33ad
--- /dev/null
+++ b/test/Transforms/Inline/invoke-cost.ll
@@ -0,0 +1,45 @@
+; RUN: opt -inline < %s -S -o - -inline-threshold=100 | FileCheck %s
+
+target datalayout = "p:32:32"
+
+@glbl = external global i32
+
+declare void @f()
+declare i32 @__gxx_personality_v0(...)
+declare i8* @__cxa_begin_catch(i8*)
+declare void @__cxa_end_catch()
+declare void @_ZSt9terminatev()
+
+define void @inner1() {
+entry:
+  invoke void @f() to label %cont1 unwind label %terminate.lpad
+
+cont1:
+  invoke void @f() to label %cont2 unwind label %terminate.lpad
+
+cont2:
+  invoke void @f() to label %cont3 unwind label %terminate.lpad
+
+cont3:
+  invoke void @f() to label %cont4 unwind label %terminate.lpad
+
+cont4:
+  ret void
+
+terminate.lpad:
+  landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
+            catch i8* null
+  call void @_ZSt9terminatev() noreturn nounwind
+  unreachable
+}
+
+define void @outer1() {
+; CHECK-LABEL: @outer1(
+;
+; This call should not get inlined because inner1 actually calls a function
+; many times, but it only does so through invoke as opposed to call.
+;
+; CHECK: call void @inner1
+  call void @inner1()
+  ret void
+}
diff --git a/test/Transforms/Inline/lit.local.cfg b/test/Transforms/Inline/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/Inline/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/Inline/ptr-diff.ll b/test/Transforms/Inline/ptr-diff.ll
index 01b42da..af42bc7 100644
--- a/test/Transforms/Inline/ptr-diff.ll
+++ b/test/Transforms/Inline/ptr-diff.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -inline < %s -S -o - -inline-threshold=10 | FileCheck %s
 
-target datalayout = "p:32:32"
+target datalayout = "p:32:32-p1:64:64-p2:16:16-n16:32:64"
 
 define i32 @outer1() {
 ; CHECK-LABEL: @outer1(
@@ -56,3 +56,46 @@ else:
   %t = load i32* %begin
   ret i32 %t
 }
+
+; The inttoptrs are free since it is a smaller integer to a larger
+; pointer size
+define i32 @inttoptr_free_cost(i32 %a, i32 %b, i32 %c) {
+  %p1 = inttoptr i32 %a to i32 addrspace(1)*
+  %p2 = inttoptr i32 %b to i32 addrspace(1)*
+  %p3 = inttoptr i32 %c to i32 addrspace(1)*
+  %t1 = load i32 addrspace(1)* %p1
+  %t2 = load i32 addrspace(1)* %p2
+  %t3 = load i32 addrspace(1)* %p3
+  %s = add i32 %t1, %t2
+  %s1 = add i32 %s, %t3
+  ret i32 %s1
+}
+
+define i32 @inttoptr_free_cost_user(i32 %begin, i32 %end) {
+; CHECK-LABEL: @inttoptr_free_cost_user(
+; CHECK-NOT: call
+  %x = call i32 @inttoptr_free_cost(i32 %begin, i32 %end, i32 9)
+  ret i32 %x
+}
+
+; The inttoptrs have a cost since it is a larger integer to a smaller
+; pointer size
+define i32 @inttoptr_cost_smaller_ptr(i32 %a, i32 %b, i32 %c) {
+  %p1 = inttoptr i32 %a to i32 addrspace(2)*
+  %p2 = inttoptr i32 %b to i32 addrspace(2)*
+  %p3 = inttoptr i32 %c to i32 addrspace(2)*
+  %t1 = load i32 addrspace(2)* %p1
+  %t2 = load i32 addrspace(2)* %p2
+  %t3 = load i32 addrspace(2)* %p3
+  %s = add i32 %t1, %t2
+  %s1 = add i32 %s, %t3
+  ret i32 %s1
+}
+
+define i32 @inttoptr_cost_smaller_ptr_user(i32 %begin, i32 %end) {
+; CHECK-LABEL: @inttoptr_cost_smaller_ptr_user(
+; CHECK: call
+  %x = call i32 @inttoptr_cost_smaller_ptr(i32 %begin, i32 %end, i32 9)
+  ret i32 %x
+}
+
diff --git a/test/Transforms/InstCombine/2002-05-14-SubFailure.ll b/test/Transforms/InstCombine/2002-05-14-SubFailure.ll
index d2b2b00..854ec60 100644
--- a/test/Transforms/InstCombine/2002-05-14-SubFailure.ll
+++ b/test/Transforms/InstCombine/2002-05-14-SubFailure.ll
@@ -1,7 +1,8 @@
 ; Instcombine was missing a test that caused it to make illegal transformations
 ; sometimes.  In this case, it transforms the sub into an add:
-; RUN: opt < %s -instcombine -S | grep sub
-;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: sub
+
 define i32 @test(i32 %i, i32 %j) {
         %A = mul i32 %i, %j
         %B = sub i32 2, %A
diff --git a/test/Transforms/InstCombine/2002-12-05-MissedConstProp.ll b/test/Transforms/InstCombine/2002-12-05-MissedConstProp.ll
index 22574f7..49e55c6 100644
--- a/test/Transforms/InstCombine/2002-12-05-MissedConstProp.ll
+++ b/test/Transforms/InstCombine/2002-12-05-MissedConstProp.ll
@@ -1,4 +1,6 @@
-; RUN: opt < %s -instcombine -S | not grep add
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; CHECK-NOT: add
 
 define i32 @test(i32 %A) {
         %A.neg = sub i32 0, %A          ; <i32> [#uses=1]
diff --git a/test/Transforms/InstCombine/2003-08-12-AllocaNonNull.ll b/test/Transforms/InstCombine/2003-08-12-AllocaNonNull.ll
index c02d33c..bb9a818 100644
--- a/test/Transforms/InstCombine/2003-08-12-AllocaNonNull.ll
+++ b/test/Transforms/InstCombine/2003-08-12-AllocaNonNull.ll
@@ -1,6 +1,7 @@
-; This testcase can be simplified by "realizing" that alloca can never return 
+; This testcase can be simplified by "realizing" that alloca can never return
 ; null.
-; RUN: opt < %s -instcombine -simplifycfg -S | not grep br
+; RUN: opt < %s -instcombine -simplifycfg -S | FileCheck %s
+; CHECK-NOT: br
 
 declare i32 @bitmap_clear(...)
 
diff --git a/test/Transforms/InstCombine/2006-10-20-mask.ll b/test/Transforms/InstCombine/2006-10-20-mask.ll
index 0aaa5e8..e9797ae 100644
--- a/test/Transforms/InstCombine/2006-10-20-mask.ll
+++ b/test/Transforms/InstCombine/2006-10-20-mask.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -instcombine -S | \
-; RUN:    grep and
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: and
 
 define i64 @foo(i64 %tmp, i64 %tmp2) {
         %tmp.upgrd.1 = trunc i64 %tmp to i32            ; <i32> [#uses=1]
diff --git a/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll b/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll
index d3ba1e2..8ab50e2 100644
--- a/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll
+++ b/test/Transforms/InstCombine/2006-10-26-VectorReassoc.ll
@@ -1,5 +1,6 @@
-; RUN: opt < %s -instcombine -S | \
-; RUN:   grep mul | count 2
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: mul
+; CHECK: mul
 
 define <4 x float> @test(<4 x float> %V) {
         %Y = fmul <4 x float> %V, < float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00 >                ; <<4 x float>> [#uses=1]
diff --git a/test/Transforms/InstCombine/2006-12-08-Select-ICmp.ll b/test/Transforms/InstCombine/2006-12-08-Select-ICmp.ll
index 2665791..272753c 100644
--- a/test/Transforms/InstCombine/2006-12-08-Select-ICmp.ll
+++ b/test/Transforms/InstCombine/2006-12-08-Select-ICmp.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -instcombine -S | grep select
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: select
 ; END.
 
 target datalayout = "e-p:32:32"
diff --git a/test/Transforms/InstCombine/2008-01-27-FloatSelect.ll b/test/Transforms/InstCombine/2008-01-27-FloatSelect.ll
index c161bcc..6b4e89d 100644
--- a/test/Transforms/InstCombine/2008-01-27-FloatSelect.ll
+++ b/test/Transforms/InstCombine/2008-01-27-FloatSelect.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -instcombine -S | grep select
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: select
 
 define double @fold(i1 %a, double %b) {
 %s = select i1 %a, double 0., double 1.
diff --git a/test/Transforms/InstCombine/2008-02-13-MulURem.ll b/test/Transforms/InstCombine/2008-02-13-MulURem.ll
index a88c510..d85ef97 100644
--- a/test/Transforms/InstCombine/2008-02-13-MulURem.ll
+++ b/test/Transforms/InstCombine/2008-02-13-MulURem.ll
@@ -1,6 +1,8 @@
-; RUN: opt < %s -instcombine -S | grep rem
+; RUN: opt < %s -instcombine -S | FileCheck %s
 ; PR1933
 
+; CHECK: rem
+
 define i32 @fold(i32 %a) {
   %s = mul i32 %a, 3
   %c = urem i32 %s, 3
diff --git a/test/Transforms/InstCombine/2008-05-31-AddBool.ll b/test/Transforms/InstCombine/2008-05-31-AddBool.ll
index ed20690..31b1719 100644
--- a/test/Transforms/InstCombine/2008-05-31-AddBool.ll
+++ b/test/Transforms/InstCombine/2008-05-31-AddBool.ll
@@ -1,6 +1,8 @@
-; RUN: opt < %s -instcombine -S | grep "xor"
+; RUN: opt < %s -instcombine -S | FileCheck %s
 ; PR2389
 
+; CHECK: xor
+
 define i1 @test(i1 %a, i1 %b) {
   %A = add i1 %a, %b
   ret i1 %A
diff --git a/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll b/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll
index 949fc59..e354311 100644
--- a/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll
+++ b/test/Transforms/InstCombine/2009-01-16-PointerAddrSpace.ll
@@ -5,7 +5,7 @@ target triple = "i386-apple-darwin9.6"
 
 define i32 @test(i32* %P) nounwind {
 entry:
-  %Q = bitcast i32* %P to i32 addrspace(1)*
+  %Q = addrspacecast i32* %P to i32 addrspace(1)*
   store i32 0, i32 addrspace(1)* %Q, align 4
   ret i32 0
 }
diff --git a/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll b/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll
index 6f3df5b..4d185bf 100644
--- a/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll
+++ b/test/Transforms/InstCombine/2012-07-30-addrsp-bitcast.ll
@@ -1,10 +1,10 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-; CHECK: bitcast
+; CHECK: addrspacecast
 
 @base = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 16
 declare void @foo(i32*)
 
 define void @test() nounwind {
-  call void @foo(i32* getelementptr (i32* bitcast ([16 x i32] addrspace(3)* @base to i32*), i64 2147483647)) nounwind
+  call void @foo(i32* getelementptr (i32* addrspacecast ([16 x i32] addrspace(3)* @base to i32*), i64 2147483647)) nounwind
   ret void
 }
diff --git a/test/Transforms/InstCombine/addrspacecast.ll b/test/Transforms/InstCombine/addrspacecast.ll
new file mode 100644
index 0000000..d908b55
--- /dev/null
+++ b/test/Transforms/InstCombine/addrspacecast.ll
@@ -0,0 +1,69 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-p1:32:32:32-p2:16:16:16-n8:16:32:64"
+
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p1i8.i32(i8*, i8 addrspace(1)*, i32, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p2i8.i32(i8*, i8 addrspace(2)*, i32, i32, i1) nounwind
+
+
+define i32* @combine_redundant_addrspacecast(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_redundant_addrspacecast(
+; CHECK: addrspacecast i32 addrspace(1)* %x to i32*
+; CHECK-NEXT: ret
+  %y = addrspacecast i32 addrspace(1)* %x to i32 addrspace(3)*
+  %z = addrspacecast i32 addrspace(3)* %y to i32*
+  ret i32* %z
+}
+
+define <4 x i32*> @combine_redundant_addrspacecast_vector(<4 x i32 addrspace(1)*> %x) nounwind {
+; CHECK-LABEL: @combine_redundant_addrspacecast_vector(
+; CHECK: addrspacecast <4 x i32 addrspace(1)*> %x to <4 x i32*>
+; CHECK-NEXT: ret
+  %y = addrspacecast <4 x i32 addrspace(1)*> %x to <4 x i32 addrspace(3)*>
+  %z = addrspacecast <4 x i32 addrspace(3)*> %y to <4 x i32*>
+  ret <4 x i32*> %z
+}
+
+define float* @combine_redundant_addrspacecast_types(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_redundant_addrspacecast_types(
+; CHECK: addrspacecast i32 addrspace(1)* %x to float*
+; CHECK-NEXT: ret
+  %y = addrspacecast i32 addrspace(1)* %x to i32 addrspace(3)*
+  %z = addrspacecast i32 addrspace(3)* %y to float*
+  ret float* %z
+}
+
+@const_array = addrspace(2) constant [60 x i8] [i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22,
+                                                i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22,
+                                                i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22,
+                                                i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22,
+                                                i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22 ]
+
+declare void @foo(i8*) nounwind
+
+; A copy from a constant addrspacecast'ed global
+; CHECK-LABEL: @memcpy_addrspacecast(
+; CHECK-NOT:  call void @llvm.memcpy
+define i32 @memcpy_addrspacecast() nounwind {
+entry:
+  %alloca = alloca i8, i32 48
+  call void @llvm.memcpy.p0i8.p1i8.i32(i8* %alloca, i8 addrspace(1)* addrspacecast (i8 addrspace(2)* getelementptr inbounds ([60 x i8] addrspace(2)* @const_array, i16 0, i16 4) to i8 addrspace(1)*), i32 48, i32 4, i1 false) nounwind
+  br label %loop.body
+
+loop.body:
+  %i = phi i32 [ 0, %entry ], [ %i.inc, %loop.body ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.inc, %loop.body]
+  %ptr = getelementptr i8* %alloca, i32 %i
+  %load = load i8* %ptr
+  %ext = zext i8 %load to i32
+  %sum.inc = add i32 %sum, %ext
+  %i.inc = add i32 %i, 1
+  %cmp = icmp ne i32 %i, 48
+  br i1 %cmp, label %loop.body, label %end
+
+end:
+  ret i32 %sum.inc
+}
+
diff --git a/test/Transforms/InstCombine/align-addr.ll b/test/Transforms/InstCombine/align-addr.ll
index e33ee9f..4d22c2c 100644
--- a/test/Transforms/InstCombine/align-addr.ll
+++ b/test/Transforms/InstCombine/align-addr.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+target datalayout = "E-p:64:64:64-p1:32:32:32-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
 ; Instcombine should be able to prove vector alignment in the
 ; presence of a few mild address computation tricks.
@@ -47,6 +47,27 @@ entry:
 	ret <16 x i8> %tmp
 }
 
+@GLOBAL_as1 = internal addrspace(1) global [4 x i32] zeroinitializer
+
+define <16 x i8> @test1_as1(<2 x i64> %x) {
+; CHECK-LABEL: @test1_as1(
+; CHECK: tmp = load
+; CHECK: GLOBAL_as1{{.*}}align 16
+  %tmp = load <16 x i8> addrspace(1)* bitcast ([4 x i32] addrspace(1)* @GLOBAL_as1 to <16 x i8> addrspace(1)*), align 1
+  ret <16 x i8> %tmp
+}
+
+@GLOBAL_as1_gep = internal addrspace(1) global [8 x i32] zeroinitializer
+
+define <16 x i8> @test1_as1_gep(<2 x i64> %x) {
+; CHECK-LABEL: @test1_as1_gep(
+; CHECK: tmp = load
+; CHECK: GLOBAL_as1_gep{{.*}}align 16
+  %tmp = load <16 x i8> addrspace(1)* bitcast (i32 addrspace(1)* getelementptr ([8 x i32] addrspace(1)* @GLOBAL_as1_gep, i16 0, i16 4) to <16 x i8> addrspace(1)*), align 1
+  ret <16 x i8> %tmp
+}
+
+
 ; When a load or store lacks an explicit alignment, add one.
 
 ; CHECK-LABEL: @test2(
diff --git a/test/Transforms/InstCombine/alloca.ll b/test/Transforms/InstCombine/alloca.ll
index 9a80ad9..ae1cfa1 100644
--- a/test/Transforms/InstCombine/alloca.ll
+++ b/test/Transforms/InstCombine/alloca.ll
@@ -1,7 +1,7 @@
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+; RUN: opt < %s -instcombine -S -default-data-layout="E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" | FileCheck %s
+; RUN: opt < %s -instcombine -S -default-data-layout="E-p:32:32:32-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" | FileCheck %s -check-prefix=P32
+; RUN: opt < %s -instcombine -S | FileCheck %s -check-prefix=NODL
 
-; RUN: opt < %s -instcombine -S | FileCheck %s
-; END.
 
 declare void @use(...)
 
@@ -110,3 +110,22 @@ entry:
 }
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+
+; Check that the GEP indices use the pointer size, or 64 if unknown
+define void @test8() {
+; CHECK-LABEL: @test8(
+; CHECK: alloca [100 x i32]
+; CHECK: getelementptr inbounds [100 x i32]* %x1, i64 0, i64 0
+
+; P32-LABEL: @test8(
+; P32: alloca [100 x i32]
+; P32: getelementptr inbounds [100 x i32]* %x1, i32 0, i32 0
+
+; NODL-LABEL: @test8(
+; NODL: alloca [100 x i32]
+; NODL: getelementptr inbounds [100 x i32]* %x1, i64 0, i64 0
+  %x = alloca i32, i32 100
+  call void (...)* @use(i32* %x)
+  ret void
+}
diff --git a/test/Transforms/InstCombine/and2.ll b/test/Transforms/InstCombine/and2.ll
index 504391a..e88fd59 100644
--- a/test/Transforms/InstCombine/and2.ll
+++ b/test/Transforms/InstCombine/and2.ll
@@ -42,3 +42,15 @@ define <4 x i32> @test5(<4 x i32> %A) {
   %2 = and <4 x i32> <i32 1, i32 2, i32 3, i32 4>, %1
   ret <4 x i32> %2
 }
+
+; Check that we combine "if x!=0 && x!=-1" into "if x+1u>1"
+define i32 @test6(i64 %x) nounwind {
+; CHECK: @test6
+; CHECK-NEXT: add i64 %x, 1
+; CHECK-NEXT: icmp ugt i64 %x.off, 1
+  %cmp1 = icmp ne i64 %x, -1
+  %not.cmp = icmp ne i64 %x, 0
+  %.cmp1 = and i1 %cmp1, %not.cmp
+  %land.ext = zext i1 %.cmp1 to i32
+  ret i32 %land.ext
+}
diff --git a/test/Transforms/InstCombine/apint-select.ll b/test/Transforms/InstCombine/apint-select.ll
index f2ea601..cf24a44 100644
--- a/test/Transforms/InstCombine/apint-select.ll
+++ b/test/Transforms/InstCombine/apint-select.ll
@@ -1,6 +1,7 @@
 ; This test makes sure that these instructions are properly eliminated.
 
-; RUN: opt < %s -instcombine -S | not grep select
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK-NOT: select
 
 
 define i41 @test1(i1 %C) {
@@ -37,7 +38,7 @@ define i41 @test5(i41 %X) {
 
 define i1023 @test6(i1023 %X) {
     ;; ((X & 27) ? 27 : 0)
-    %Y = and i1023 %X, 64 
+    %Y = and i1023 %X, 64
     %t = icmp ne i1023 %Y, 0
     %V = select i1 %t, i1023 64, i1023 0
     ret i1023 %V
diff --git a/test/Transforms/InstCombine/bitcast-bigendian.ll b/test/Transforms/InstCombine/bitcast-bigendian.ll
index 28b0e9a..ed812e1 100644
--- a/test/Transforms/InstCombine/bitcast-bigendian.ll
+++ b/test/Transforms/InstCombine/bitcast-bigendian.ll
@@ -48,3 +48,44 @@ define float @test3(<2 x float> %A, <2 x i64> %B) {
 ; CHECK-NEXT:  ret float %add
 }
 
+define <2 x i32> @test4(i32 %A, i32 %B){
+  %tmp38 = zext i32 %A to i64
+  %tmp32 = zext i32 %B to i64
+  %tmp33 = shl i64 %tmp32, 32
+  %ins35 = or i64 %tmp33, %tmp38
+  %tmp43 = bitcast i64 %ins35 to <2 x i32>
+  ret <2 x i32> %tmp43
+  ; CHECK-LABEL: @test4(
+  ; CHECK-NEXT: insertelement <2 x i32> undef, i32 %B, i32 0
+  ; CHECK-NEXT: insertelement <2 x i32> {{.*}}, i32 %A, i32 1
+  ; CHECK-NEXT: ret <2 x i32>
+
+}
+
+define <2 x float> @test5(float %A, float %B) {
+  %tmp37 = bitcast float %A to i32
+  %tmp38 = zext i32 %tmp37 to i64
+  %tmp31 = bitcast float %B to i32
+  %tmp32 = zext i32 %tmp31 to i64
+  %tmp33 = shl i64 %tmp32, 32
+  %ins35 = or i64 %tmp33, %tmp38
+  %tmp43 = bitcast i64 %ins35 to <2 x float>
+  ret <2 x float> %tmp43
+  ; CHECK-LABEL: @test5(
+  ; CHECK-NEXT: insertelement <2 x float> undef, float %B, i32 0
+  ; CHECK-NEXT: insertelement <2 x float> {{.*}}, float %A, i32 1
+  ; CHECK-NEXT: ret <2 x float>
+}
+
+define <2 x float> @test6(float %A){
+  %tmp23 = bitcast float %A to i32              ; <i32> [#uses=1]
+  %tmp24 = zext i32 %tmp23 to i64                 ; <i64> [#uses=1]
+  %tmp25 = shl i64 %tmp24, 32                     ; <i64> [#uses=1]
+  %mask20 = or i64 %tmp25, 1109917696             ; <i64> [#uses=1]
+  %tmp35 = bitcast i64 %mask20 to <2 x float>     ; <<2 x float>> [#uses=1]
+  ret <2 x float> %tmp35
+; CHECK-LABEL: @test6(
+; CHECK-NEXT: insertelement <2 x float> undef, float %A, i32 0
+; CHECK-NEXT: insertelement <2 x float> {{.*}}, float 4.200000e+01, i32 1
+; CHECK: ret
+}
diff --git a/test/Transforms/InstCombine/bitcast.ll b/test/Transforms/InstCombine/bitcast.ll
index 4ef8790..c7a520b 100644
--- a/test/Transforms/InstCombine/bitcast.ll
+++ b/test/Transforms/InstCombine/bitcast.ll
@@ -144,3 +144,13 @@ define <2 x i16> @BitcastInsert(i32 %a) {
 ; CHECK-LABEL: @BitcastInsert(
 ; CHECK: bitcast i32 %a to <2 x i16>
 }
+
+; PR17293
+define <2 x i64> @test7(<2 x i8*>* %arg) nounwind {
+  %cast = bitcast <2 x i8*>* %arg to <2 x i64>*
+  %load = load <2 x i64>* %cast, align 16
+  ret <2 x i64> %load
+; CHECK: @test7
+; CHECK: bitcast
+; CHECK: load
+}
diff --git a/test/Transforms/InstCombine/call.ll b/test/Transforms/InstCombine/call.ll
index 55833fb..e68c0ad 100644
--- a/test/Transforms/InstCombine/call.ll
+++ b/test/Transforms/InstCombine/call.ll
@@ -1,7 +1,7 @@
 ; Ignore stderr, we expect warnings there
 ; RUN: opt < %s -instcombine 2> /dev/null -S | FileCheck %s
 
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+target datalayout = "E-p:64:64:64-p1:16:16:16-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
 ; Simple case, argument translatable without changing the value
 declare void @test1a(i8*)
@@ -15,6 +15,28 @@ define void @test1(i32* %A) {
   ret void
 }
 
+
+; Should not do because of change in address space of the parameter
+define void @test1_as1_illegal(i32 addrspace(1)* %A) {
+; CHECK-LABEL: @test1_as1_illegal(
+; CHECK: call void bitcast
+  call void bitcast (void (i8*)* @test1a to void (i32 addrspace(1)*)*)(i32 addrspace(1)* %A)
+  ret void
+}
+
+; Test1, but the argument has a different sized address-space
+declare void @test1a_as1(i8 addrspace(1)*)
+
+; This one is OK to perform
+define void @test1_as1(i32 addrspace(1)* %A) {
+; CHECK-LABEL: @test1_as1(
+; CHECK: %1 = bitcast i32 addrspace(1)* %A to i8 addrspace(1)*
+; CHECK: call void @test1a_as1(i8 addrspace(1)* %1)
+; CHECK: ret void
+  call void bitcast (void (i8 addrspace(1)*)* @test1a_as1 to void (i32 addrspace(1)*)*)(i32 addrspace(1)* %A )
+  ret void
+}
+
 ; More complex case, translate argument because of resolution.  This is safe
 ; because we have the body of the function
 define void @test2a(i8 %A) {
@@ -135,3 +157,122 @@ entry:
 ; CHECK: call i8* bitcast
 }
 
+
+; Parameter that's a vector of pointers
+declare void @test10a(<2 x i8*>)
+
+define void @test10(<2 x i32*> %A) {
+; CHECK-LABEL: @test10(
+; CHECK: %1 = bitcast <2 x i32*> %A to <2 x i8*>
+; CHECK: call void @test10a(<2 x i8*> %1)
+; CHECK: ret void
+  call void bitcast (void (<2 x i8*>)* @test10a to void (<2 x i32*>)*)(<2 x i32*> %A)
+  ret void
+}
+
+; Don't transform because different address spaces
+declare void @test10a_mixed_as(<2 x i8 addrspace(1)*>)
+
+define void @test10_mixed_as(<2 x i8*> %A) {
+; CHECK-LABEL: @test10_mixed_as(
+; CHECK: call void bitcast
+  call void bitcast (void (<2 x i8 addrspace(1)*>)* @test10a_mixed_as to void (<2 x i8*>)*)(<2 x i8*> %A)
+  ret void
+}
+
+; Return type that's a pointer
+define i8* @test11a() {
+  ret i8* zeroinitializer
+}
+
+define i32* @test11() {
+; CHECK-LABEL: @test11(
+; CHECK: %X = call i8* @test11a()
+; CHECK: %1 = bitcast i8* %X to i32*
+  %X = call i32* bitcast (i8* ()* @test11a to i32* ()*)()
+  ret i32* %X
+}
+
+; Return type that's a pointer with a different address space
+define i8 addrspace(1)* @test11a_mixed_as() {
+  ret i8 addrspace(1)* zeroinitializer
+}
+
+define i8* @test11_mixed_as() {
+; CHECK-LABEL: @test11_mixed_as(
+; CHECK: call i8* bitcast
+  %X = call i8* bitcast (i8 addrspace(1)* ()* @test11a_mixed_as to i8* ()*)()
+  ret i8* %X
+}
+
+; Return type that's a vector of pointers
+define <2 x i8*> @test12a() {
+  ret <2 x i8*> zeroinitializer
+}
+
+define <2 x i32*> @test12() {
+; CHECK-LABEL: @test12(
+; CHECK: %X = call <2 x i8*> @test12a()
+; CHECK: %1 = bitcast <2 x i8*> %X to <2 x i32*>
+  %X = call <2 x i32*> bitcast (<2 x i8*> ()* @test12a to <2 x i32*> ()*)()
+  ret <2 x i32*> %X
+}
+
+define <2 x i8 addrspace(1)*> @test12a_mixed_as() {
+  ret <2 x i8 addrspace(1)*> zeroinitializer
+}
+
+define <2 x i8*> @test12_mixed_as() {
+; CHECK-LABEL: @test12_mixed_as(
+; CHECK: call <2 x i8*> bitcast
+  %X = call <2 x i8*> bitcast (<2 x i8 addrspace(1)*> ()* @test12a_mixed_as to <2 x i8*> ()*)()
+  ret <2 x i8*> %X
+}
+
+
+; Mix parameter that's a vector of integers and pointers of the same size
+declare void @test13a(<2 x i64>)
+
+define void @test13(<2 x i32*> %A) {
+; CHECK-LABEL: @test13(
+; CHECK: call void bitcast
+  call void bitcast (void (<2 x i64>)* @test13a to void (<2 x i32*>)*)(<2 x i32*> %A)
+  ret void
+}
+
+; Mix parameter that's a vector of integers and pointers of the same
+; size, but the other way around
+declare void @test14a(<2 x i8*>)
+
+define void @test14(<2 x i64> %A) {
+; CHECK-LABEL: @test14(
+; CHECK: call void bitcast
+  call void bitcast (void (<2 x i8*>)* @test14a to void (<2 x i64>)*)(<2 x i64> %A)
+  ret void
+}
+
+
+; Return type that's a vector
+define <2 x i16> @test15a() {
+  ret <2 x i16> zeroinitializer
+}
+
+define i32 @test15() {
+; CHECK-LABEL: @test15(
+; CHECK: %X = call <2 x i16> @test15a()
+; CHECK: %1 = bitcast <2 x i16> %X to i32
+  %X = call i32 bitcast (<2 x i16> ()* @test15a to i32 ()*)( )
+  ret i32 %X
+}
+
+define i32 @test16a() {
+  ret i32 0
+}
+
+define <2 x i16> @test16() {
+; CHECK-LABEL: @test16(
+; CHECK: %X = call i32 @test16a()
+; CHECK: %1 = bitcast i32 %X to <2 x i16>
+  %X = call <2 x i16> bitcast (i32 ()* @test16a to <2 x i16> ()*)( )
+  ret <2 x i16> %X
+}
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index 52ea7b9..cac0ec1 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -1,6 +1,6 @@
 ; Tests to make sure elimination of casts is working correctly
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128-n8:16:32:64"
+target datalayout = "E-p:64:64:64-p1:32:32:32-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128-n8:16:32:64"
 
 @inbuf = external global [32832 x i8]           ; <[32832 x i8]*> [#uses=1]
 
@@ -708,6 +708,19 @@ define %s @test68(%s *%p, i64 %i) {
 ; CHECK-NEXT: ret %s
 }
 
+define %s @test68_as1(%s addrspace(1)* %p, i32 %i) {
+; CHECK-LABEL: @test68_as1(
+  %o = mul i32 %i, 12
+  %q = bitcast %s addrspace(1)* %p to i8 addrspace(1)*
+  %pp = getelementptr inbounds i8 addrspace(1)* %q, i32 %o
+; CHECK-NEXT: getelementptr %s addrspace(1)*
+  %r = bitcast i8 addrspace(1)* %pp to %s addrspace(1)*
+  %l = load %s addrspace(1)* %r
+; CHECK-NEXT: load %s addrspace(1)*
+  ret %s %l
+; CHECK-NEXT: ret %s
+}
+
 define double @test69(double *%p, i64 %i) {
 ; CHECK-LABEL: @test69(
   %o = shl nsw i64 %i, 3
@@ -890,6 +903,20 @@ define double @test80([100 x double]* %p, i32 %i) {
 ; CHECK-NEXT: ret double
 }
 
+define double @test80_as1([100 x double] addrspace(1)* %p, i16 %i) {
+; CHECK-LABEL: @test80_as1(
+  %tmp = mul nsw i16 %i, 8
+; CHECK-NEXT: sext i16 %i to i32
+  %q = bitcast [100 x double] addrspace(1)* %p to i8 addrspace(1)*
+  %pp = getelementptr i8 addrspace(1)* %q, i16 %tmp
+; CHECK-NEXT: getelementptr [100 x double] addrspace(1)*
+  %r = bitcast i8 addrspace(1)* %pp to double addrspace(1)*
+  %l = load double addrspace(1)* %r
+; CHECK-NEXT: load double addrspace(1)*
+  ret double %l
+; CHECK-NEXT: ret double
+}
+
 define double @test81(double *%p, float %f) {
   %i = fptosi float %f to i64
   %q = bitcast double* %p to i8*
diff --git a/test/Transforms/InstCombine/cast_ptr.ll b/test/Transforms/InstCombine/cast_ptr.ll
index 7910ea3..23006a8 100644
--- a/test/Transforms/InstCombine/cast_ptr.ll
+++ b/test/Transforms/InstCombine/cast_ptr.ll
@@ -1,7 +1,7 @@
 ; Tests to make sure elimination of casts is working correctly
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-target datalayout = "p:32:32"
+target datalayout = "p:32:32-p1:32:32-p2:16:16"
 
 ; This shouldn't convert to getelementptr because the relationship
 ; between the arithmetic and the layout of allocated memory is
@@ -27,6 +27,26 @@ define i1 @test2(i8* %a, i8* %b) {
         ret i1 %r
 }
 
+; These casts should be folded away.
+; CHECK-LABEL: @test2_as2_same_int(
+; CHECK: icmp eq i8 addrspace(2)* %a, %b
+define i1 @test2_as2_same_int(i8 addrspace(2)* %a, i8 addrspace(2)* %b) {
+  %tmpa = ptrtoint i8 addrspace(2)* %a to i16
+  %tmpb = ptrtoint i8 addrspace(2)* %b to i16
+  %r = icmp eq i16 %tmpa, %tmpb
+  ret i1 %r
+}
+
+; These casts should be folded away.
+; CHECK-LABEL: @test2_as2_larger(
+; CHECK: icmp eq i8 addrspace(2)* %a, %b
+define i1 @test2_as2_larger(i8 addrspace(2)* %a, i8 addrspace(2)* %b) {
+  %tmpa = ptrtoint i8 addrspace(2)* %a to i32
+  %tmpb = ptrtoint i8 addrspace(2)* %b to i32
+  %r = icmp eq i32 %tmpa, %tmpb
+  ret i1 %r
+}
+
 ; These casts should also be folded away.
 ; CHECK-LABEL: @test3(
 ; CHECK: icmp eq i8* %a, @global
@@ -43,11 +63,20 @@ define i1 @test4(i32 %A) {
   ret i1 %C
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT: %C = icmp eq i32 %A, 0
-; CHECK-NEXT: ret i1 %C 
+; CHECK-NEXT: ret i1 %C
 }
 
+define i1 @test4_as2(i16 %A) {
+; CHECK-LABEL: @test4_as2(
+; CHECK-NEXT: %C = icmp eq i16 %A, 0
+; CHECK-NEXT: ret i1 %C
+  %B = inttoptr i16 %A to i8 addrspace(2)*
+  %C = icmp eq i8 addrspace(2)* %B, null
+  ret i1 %C
+}
 
-; Pulling the cast out of the load allows us to eliminate the load, and then 
+
+; Pulling the cast out of the load allows us to eliminate the load, and then
 ; the whole array.
 
         %op = type { float }
@@ -69,11 +98,11 @@ define %unop* @test5(%op* %O) {
 ; InstCombine can not 'load (cast P)' -> cast (load P)' if the cast changes
 ; the address space.
 
-define i8 @test6(i8 addrspace(1)* %source) {                                                                                        
-entry: 
-  %arrayidx223 = bitcast i8 addrspace(1)* %source to i8*
+define i8 @test6(i8 addrspace(1)* %source) {
+entry:
+  %arrayidx223 = addrspacecast i8 addrspace(1)* %source to i8*
   %tmp4 = load i8* %arrayidx223
   ret i8 %tmp4
 ; CHECK-LABEL: @test6(
 ; CHECK: load i8* %arrayidx223
-} 
+}
diff --git a/test/Transforms/InstCombine/compare-signs.ll b/test/Transforms/InstCombine/compare-signs.ll
index cdf95ab..62cd5b3 100644
--- a/test/Transforms/InstCombine/compare-signs.ll
+++ b/test/Transforms/InstCombine/compare-signs.ll
@@ -24,9 +24,9 @@
 define i32 @test3(i32 %a, i32 %b) nounwind readnone {
 ; CHECK-LABEL: @test3(
 entry:
-; CHECK: xor i32 %a, %b
-; CHECK: lshr i32 %0, 31
-; CHECK: xor i32 %1, 1
+; CHECK: [[XOR1:%.*]] = xor i32 %a, %b
+; CHECK: [[SHIFT:%.*]] = lshr i32 [[XOR1]], 31
+; CHECK: [[XOR2:%.*]] = xor i32 [[SHIFT]], 1
         %0 = lshr i32 %a, 31            ; <i32> [#uses=1]
         %1 = lshr i32 %b, 31            ; <i32> [#uses=1]
         %2 = icmp eq i32 %0, %1         ; <i1> [#uses=1]
@@ -34,7 +34,7 @@ entry:
         ret i32 %3
 ; CHECK-NOT: icmp
 ; CHECK-NOT: zext
-; CHECK: ret i32 %2
+; CHECK: ret i32 [[XOR2]]
 }
 
 ; Variation on @test3: checking the 2nd bit in a situation where the 5th bit
diff --git a/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll b/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll
new file mode 100644
index 0000000..9f21d54
--- /dev/null
+++ b/test/Transforms/InstCombine/constant-fold-address-space-pointer.ll
@@ -0,0 +1,232 @@
+; RUN: opt -S -instcombine %s -o - | FileCheck %s
+target datalayout = "e-p:32:32:32-p1:64:64:64-p2:8:8:8-p3:16:16:16-p4:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32"
+
+@g = addrspace(3) global i32 89
+
+@const_zero_i8_as1 = addrspace(1) constant i8 0
+@const_zero_i32_as1 = addrspace(1) constant i32 0
+
+@const_zero_i8_as2 = addrspace(2) constant i8 0
+@const_zero_i32_as2 = addrspace(2) constant i32 0
+
+@const_zero_i8_as3 = addrspace(3) constant i8 0
+@const_zero_i32_as3 = addrspace(3) constant i32 0
+
+; Test constant folding of inttoptr (ptrtoint constantexpr)
+; The intermediate integer size is the same as the pointer size
+define i32 addrspace(3)* @test_constant_fold_inttoptr_as_pointer_same_size() {
+; CHECK-LABEL: @test_constant_fold_inttoptr_as_pointer_same_size(
+; CHECK-NEXT: ret i32 addrspace(3)* @const_zero_i32_as3
+  %x = ptrtoint i32 addrspace(3)* @const_zero_i32_as3 to i32
+  %y = inttoptr i32 %x to i32 addrspace(3)*
+  ret i32 addrspace(3)* %y
+}
+
+; The intermediate integer size is larger than the pointer size
+define i32 addrspace(2)* @test_constant_fold_inttoptr_as_pointer_smaller() {
+; CHECK-LABEL: @test_constant_fold_inttoptr_as_pointer_smaller(
+; CHECK-NEXT: ret i32 addrspace(2)* @const_zero_i32_as2
+  %x = ptrtoint i32 addrspace(2)* @const_zero_i32_as2 to i16
+  %y = inttoptr i16 %x to i32 addrspace(2)*
+  ret i32 addrspace(2)* %y
+}
+
+; Different address spaces that are the same size, but they are
+; different so nothing should happen
+define i32 addrspace(4)* @test_constant_fold_inttoptr_as_pointer_smaller_different_as() {
+; CHECK-LABEL: @test_constant_fold_inttoptr_as_pointer_smaller_different_as(
+; CHECK-NEXT: ret i32 addrspace(4)* inttoptr (i16 ptrtoint (i32 addrspace(3)* @const_zero_i32_as3 to i16) to i32 addrspace(4)*)
+  %x = ptrtoint i32 addrspace(3)* @const_zero_i32_as3 to i16
+  %y = inttoptr i16 %x to i32 addrspace(4)*
+  ret i32 addrspace(4)* %y
+}
+
+; Make sure we don't introduce a bitcast between different sized
+; address spaces when folding this
+define i32 addrspace(2)* @test_constant_fold_inttoptr_as_pointer_smaller_different_size_as() {
+; CHECK-LABEL: @test_constant_fold_inttoptr_as_pointer_smaller_different_size_as(
+; CHECK-NEXT: ret i32 addrspace(2)* inttoptr (i32 ptrtoint (i32 addrspace(3)* @const_zero_i32_as3 to i32) to i32 addrspace(2)*)
+  %x = ptrtoint i32 addrspace(3)* @const_zero_i32_as3 to i32
+  %y = inttoptr i32 %x to i32 addrspace(2)*
+  ret i32 addrspace(2)* %y
+}
+
+; The intermediate integer size is too small, nothing should happen
+define i32 addrspace(3)* @test_constant_fold_inttoptr_as_pointer_larger() {
+; CHECK-LABEL: @test_constant_fold_inttoptr_as_pointer_larger(
+; CHECK-NEXT: ret i32 addrspace(3)* inttoptr (i8 ptrtoint (i32 addrspace(3)* @const_zero_i32_as3 to i8) to i32 addrspace(3)*)
+  %x = ptrtoint i32 addrspace(3)* @const_zero_i32_as3 to i8
+  %y = inttoptr i8 %x to i32 addrspace(3)*
+  ret i32 addrspace(3)* %y
+}
+
+define i8 @const_fold_ptrtoint() {
+; CHECK-LABEL: @const_fold_ptrtoint(
+; CHECK-NEXT: ret i8 4
+  ret i8 ptrtoint (i32 addrspace(2)* inttoptr (i4 4 to i32 addrspace(2)*) to i8)
+}
+
+; Test that mask happens when the destination pointer is smaller than
+; the original
+define i8 @const_fold_ptrtoint_mask() {
+; CHECK-LABEL: @const_fold_ptrtoint_mask(
+; CHECK-NEXT: ret i8 1
+  ret i8 ptrtoint (i32 addrspace(3)* inttoptr (i32 257 to i32 addrspace(3)*) to i8)
+}
+
+; Address space 0 is too small for the correct mask, should mask with
+; 64-bits instead of 32
+define i64 @const_fold_ptrtoint_mask_small_as0() {
+; CHECK-LABEL: @const_fold_ptrtoint_mask_small_as0(
+; CHECK: ret i64 -1
+  ret i64 ptrtoint (i32 addrspace(1)* inttoptr (i128 -1 to i32 addrspace(1)*) to i64)
+}
+
+define i32 addrspace(3)* @const_inttoptr() {
+; CHECK-LABEL: @const_inttoptr(
+; CHECK-NEXT: ret i32 addrspace(3)* inttoptr (i16 4 to i32 addrspace(3)*)
+  %p = inttoptr i16 4 to i32 addrspace(3)*
+  ret i32 addrspace(3)* %p
+}
+
+define i16 @const_ptrtoint() {
+; CHECK-LABEL: @const_ptrtoint(
+; CHECK-NEXT: ret i16 ptrtoint (i32 addrspace(3)* @g to i16)
+  %i = ptrtoint i32 addrspace(3)* @g to i16
+  ret i16 %i
+}
+
+define i16 @const_inttoptr_ptrtoint() {
+; CHECK-LABEL: @const_inttoptr_ptrtoint(
+; CHECK-NEXT: ret i16 9
+  ret i16 ptrtoint (i32 addrspace(3)* inttoptr (i16 9 to i32 addrspace(3)*) to i16)
+}
+
+define i1 @constant_fold_cmp_constantexpr_inttoptr() {
+; CHECK-LABEL: @constant_fold_cmp_constantexpr_inttoptr(
+; CHECK-NEXT: ret i1 true
+  %x = icmp eq i32 addrspace(3)* inttoptr (i16 0 to i32 addrspace(3)*), null
+  ret i1 %x
+}
+
+define i1 @constant_fold_inttoptr_null(i16 %i) {
+; CHECK-LABEL: @constant_fold_inttoptr_null(
+; CHECK-NEXT: ret i1 false
+  %x = icmp eq i32 addrspace(3)* inttoptr (i16 99 to i32 addrspace(3)*), inttoptr (i16 0 to i32 addrspace(3)*)
+  ret i1 %x
+}
+
+define i1 @constant_fold_ptrtoint_null() {
+; CHECK-LABEL: @constant_fold_ptrtoint_null(
+; CHECK-NEXT: ret i1 false
+  %x = icmp eq i16 ptrtoint (i32 addrspace(3)* @g to i16), ptrtoint (i32 addrspace(3)* null to i16)
+  ret i1 %x
+}
+
+define i1 @constant_fold_ptrtoint_null_2() {
+; CHECK-LABEL: @constant_fold_ptrtoint_null_2(
+; CHECK-NEXT: ret i1 false
+  %x = icmp eq i16 ptrtoint (i32 addrspace(3)* null to i16), ptrtoint (i32 addrspace(3)* @g to i16)
+  ret i1 %x
+}
+
+define i1 @constant_fold_ptrtoint() {
+; CHECK-LABEL: @constant_fold_ptrtoint(
+; CHECK-NEXT: ret i1 true
+  %x = icmp eq i16 ptrtoint (i32 addrspace(3)* @g to i16), ptrtoint (i32 addrspace(3)* @g to i16)
+  ret i1 %x
+}
+
+define i1 @constant_fold_inttoptr() {
+; CHECK-LABEL: @constant_fold_inttoptr(
+; CHECK-NEXT: ret i1 false
+  %x = icmp eq i32 addrspace(3)* inttoptr (i16 99 to i32 addrspace(3)*), inttoptr (i16 27 to i32 addrspace(3)*)
+  ret i1 %x
+}
+
+@g_float_as3 = addrspace(3) global float zeroinitializer
+@g_v4f_as3 = addrspace(3) global <4 x float> zeroinitializer
+
+define float @constant_fold_bitcast_ftoi_load() {
+; CHECK-LABEL: @constant_fold_bitcast_ftoi_load(
+; CHECK: load float addrspace(3)* bitcast (i32 addrspace(3)* @g to float addrspace(3)*), align 4
+  %a = load float addrspace(3)* bitcast (i32 addrspace(3)* @g to float addrspace(3)*), align 4
+  ret float %a
+}
+
+define i32 @constant_fold_bitcast_itof_load() {
+; CHECK-LABEL: @constant_fold_bitcast_itof_load(
+; CHECK: load i32 addrspace(3)* bitcast (float addrspace(3)* @g_float_as3 to i32 addrspace(3)*), align 4
+  %a = load i32 addrspace(3)* bitcast (float addrspace(3)* @g_float_as3 to i32 addrspace(3)*), align 4
+  ret i32 %a
+}
+
+define <4 x i32> @constant_fold_bitcast_vector_as() {
+; CHECK-LABEL: @constant_fold_bitcast_vector_as(
+; CHECK: load <4 x float> addrspace(3)* @g_v4f_as3, align 16
+; CHECK: bitcast <4 x float> %1 to <4 x i32>
+  %a = load <4 x i32> addrspace(3)* bitcast (<4 x float> addrspace(3)* @g_v4f_as3 to <4 x i32> addrspace(3)*), align 4
+  ret <4 x i32> %a
+}
+
+@i32_array_as3 = addrspace(3) global [10 x i32] zeroinitializer
+
+define i32 @test_cast_gep_small_indices_as() {
+; CHECK-LABEL: @test_cast_gep_small_indices_as(
+; CHECK: load i32 addrspace(3)* getelementptr inbounds ([10 x i32] addrspace(3)* @i32_array_as3, i16 0, i16 0), align 16
+   %p = getelementptr [10 x i32] addrspace(3)* @i32_array_as3, i7 0, i7 0
+   %x = load i32 addrspace(3)* %p, align 4
+   ret i32 %x
+}
+
+%struct.foo = type { float, float, [4 x i32], i32 addrspace(3)* }
+
+@constant_fold_global_ptr = addrspace(3) global %struct.foo {
+  float 0.0,
+  float 0.0,
+  [4 x i32] zeroinitializer,
+  i32 addrspace(3)* getelementptr ([10 x i32] addrspace(3)* @i32_array_as3, i64 0, i64 0)
+}
+
+define i32 @test_cast_gep_large_indices_as() {
+; CHECK-LABEL: @test_cast_gep_large_indices_as(
+; CHECK: load i32 addrspace(3)* getelementptr inbounds ([10 x i32] addrspace(3)* @i32_array_as3, i16 0, i16 0), align 16
+   %p = getelementptr [10 x i32] addrspace(3)* @i32_array_as3, i64 0, i64 0
+   %x = load i32 addrspace(3)* %p, align 4
+   ret i32 %x
+}
+
+define i32 @test_constant_cast_gep_struct_indices_as() {
+; CHECK-LABEL: @test_constant_cast_gep_struct_indices_as(
+; CHECK: load i32 addrspace(3)* getelementptr inbounds (%struct.foo addrspace(3)* @constant_fold_global_ptr, i16 0, i32 2, i16 2), align 8
+  %x = getelementptr %struct.foo addrspace(3)* @constant_fold_global_ptr, i18 0, i32 2, i12 2
+  %y = load i32 addrspace(3)* %x, align 4
+  ret i32 %y
+}
+
+@constant_data_as3 = addrspace(3) constant [5 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5]
+
+define i32 @test_read_data_from_global_as3() {
+; CHECK-LABEL: @test_read_data_from_global_as3(
+; CHECK-NEXT: ret i32 2
+  %x = getelementptr [5 x i32] addrspace(3)* @constant_data_as3, i32 0, i32 1
+  %y = load i32 addrspace(3)* %x, align 4
+  ret i32 %y
+}
+
+@a = addrspace(1) constant i32 9
+@b = addrspace(1) constant i32 23
+@c = addrspace(1) constant i32 34
+@d = addrspace(1) constant i32 99
+
+@ptr_array = addrspace(2) constant [4 x i32 addrspace(1)*] [ i32 addrspace(1)* @a, i32 addrspace(1)* @b, i32 addrspace(1)* @c, i32 addrspace(1)* @d]
+@indirect = addrspace(0) constant i32 addrspace(1)* addrspace(2)* getelementptr inbounds ([4 x i32 addrspace(1)*] addrspace(2)* @ptr_array, i1 0, i32 2)
+
+define i32 @constant_through_array_as_ptrs() {
+; CHECK-LABEL: @constant_through_array_as_ptrs(
+; CHECK-NEXT: ret i32 34
+  %p = load i32 addrspace(1)* addrspace(2)* addrspace(0)* @indirect, align 4
+  %a = load i32 addrspace(1)* addrspace(2)* %p, align 4
+  %b = load i32 addrspace(1)* %a, align 4
+  ret i32 %b
+}
diff --git a/test/Transforms/InstCombine/constant-fold-gep.ll b/test/Transforms/InstCombine/constant-fold-gep.ll
index 9f82e66..5fb5602 100644
--- a/test/Transforms/InstCombine/constant-fold-gep.ll
+++ b/test/Transforms/InstCombine/constant-fold-gep.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
+target datalayout = "E-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
 
 ; Constant folding should fix notionally out-of-bounds indices
 ; and add inbounds keywords.
@@ -72,3 +72,21 @@ entry:
   ret i64 %E
   ; CHECK: ret i64 1000
 }
+
+@X_as1 = addrspace(1) global [1000 x i8] zeroinitializer, align 16
+
+define i16 @test2_as1() {
+; CHECK-LABEL: @test2_as1(
+  ; CHECK: ret i16 1000
+
+entry:
+  %A = bitcast i8 addrspace(1)* getelementptr inbounds ([1000 x i8] addrspace(1)* @X_as1, i64 1, i64 0) to i8 addrspace(1)*
+  %B = bitcast i8 addrspace(1)* getelementptr inbounds ([1000 x i8] addrspace(1)* @X_as1, i64 0, i64 0) to i8 addrspace(1)*
+
+  %B2 = ptrtoint i8 addrspace(1)* %B to i16
+  %C = sub i16 0, %B2
+  %D = getelementptr i8 addrspace(1)* %A, i16 %C
+  %E = ptrtoint i8 addrspace(1)* %D to i16
+
+  ret i16 %E
+}
diff --git a/test/Transforms/InstCombine/debug-line.ll b/test/Transforms/InstCombine/debug-line.ll
index a76c353..2e3785f 100644
--- a/test/Transforms/InstCombine/debug-line.ll
+++ b/test/Transforms/InstCombine/debug-line.ll
@@ -12,15 +12,17 @@ define void @foo() nounwind ssp {
 declare i32 @printf(i8*, ...)
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!10}
 !llvm.dbg.sp = !{!0}
 
 !0 = metadata !{i32 589870, metadata !8, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 589865, metadata !8} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 589841, metadata !8, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !8, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 589845, metadata !8, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 5, i32 2, metadata !6, null}
 !6 = metadata !{i32 589835, metadata !8, metadata !0, i32 4, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
 !7 = metadata !{i32 6, i32 1, metadata !6, null}
 !8 = metadata !{metadata !"m.c", metadata !"/private/tmp"}
 !9 = metadata !{metadata !0}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/InstCombine/debuginfo.ll b/test/Transforms/InstCombine/debuginfo.ll
index 2f080bf..75082dc 100644
--- a/test/Transforms/InstCombine/debuginfo.ll
+++ b/test/Transforms/InstCombine/debuginfo.ll
@@ -2,7 +2,7 @@
 
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
-declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) nounwind readnone
 
 declare i8* @foo(i8*, i32, i64, i64) nounwind
 
@@ -23,18 +23,19 @@ entry:
   %tmp1 = load i32* %__val.addr, align 4, !dbg !21
   %tmp2 = load i64* %__len.addr, align 8, !dbg !21
   %tmp3 = load i8** %__dest.addr, align 8, !dbg !21
-  %0 = call i64 @llvm.objectsize.i64(i8* %tmp3, i1 false), !dbg !21
+  %0 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp3, i1 false), !dbg !21
   %call = call i8* @foo(i8* %tmp, i32 %tmp1, i64 %tmp2, i64 %0), !dbg !21
   ret i8* %call, !dbg !21
 }
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!30}
 
 !0 = metadata !{i32 786689, metadata !1, metadata !"__dest", metadata !2, i32 16777294, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !27, metadata !2, metadata !"foobar", metadata !"foobar", metadata !"", i32 79, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i8* (i8*, i32, i64)* @foobar, null, null, metadata !25, i32 79} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !27, metadata !2, metadata !"foobar", metadata !"foobar", metadata !"", i32 79, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8* (i8*, i32, i64)* @foobar, null, null, metadata !25, i32 79} ; [ DW_TAG_subprogram ] [line 79] [local] [def] [foobar]
 !2 = metadata !{i32 786473, metadata !27} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !28, i32 12, metadata !"clang version 3.0 (trunk 127710)", i1 true, metadata !"", i32 0, metadata !29, metadata !29, metadata !24, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !27, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !27, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6}
 !6 = metadata !{i32 786447, null, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
 !7 = metadata !{i32 786689, metadata !1, metadata !"__val", metadata !2, i32 33554510, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -55,3 +56,4 @@ entry:
 !27 = metadata !{metadata !"string.h", metadata !"Game"}
 !28 = metadata !{metadata !"bits.c", metadata !"Game"}
 !29 = metadata !{i32 0}
+!30 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/InstCombine/double-float-shrink-1.ll b/test/Transforms/InstCombine/double-float-shrink-1.ll
index e5448ee..5cacb59 100644
--- a/test/Transforms/InstCombine/double-float-shrink-1.ll
+++ b/test/Transforms/InstCombine/double-float-shrink-1.ll
@@ -263,6 +263,7 @@ define double @sin_test2(float %f) nounwind readnone {
    ret double %call
 ; CHECK: call double @sin(double %conv)
 }
+
 define float @sqrt_test(float %f) nounwind readnone {
 ; CHECK: sqrt_test
    %conv = fpext float %f to double
@@ -272,6 +273,15 @@ define float @sqrt_test(float %f) nounwind readnone {
 ; CHECK: call float @sqrtf(float %f)
 }
 
+define float @sqrt_int_test(float %f) nounwind readnone {
+; CHECK: sqrt_int_test
+   %conv = fpext float %f to double
+   %call = call double @llvm.sqrt.f64(double %conv)
+   %conv1 = fptrunc double %call to float
+   ret float %conv1
+; CHECK: call float @llvm.sqrt.f32(float %f)
+}
+
 define double @sqrt_test2(float %f) nounwind readnone {
 ; CHECK: sqrt_test2
    %conv = fpext float %f to double
@@ -331,3 +341,6 @@ declare double @acos(double) nounwind readnone
 declare double @acosh(double) nounwind readnone
 declare double @asin(double) nounwind readnone
 declare double @asinh(double) nounwind readnone
+
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
diff --git a/test/Transforms/InstCombine/enforce-known-alignment.ll b/test/Transforms/InstCombine/enforce-known-alignment.ll
index 6645d99..46bb605 100644
--- a/test/Transforms/InstCombine/enforce-known-alignment.ll
+++ b/test/Transforms/InstCombine/enforce-known-alignment.ll
@@ -1,8 +1,12 @@
-; RUN: opt < %s -instcombine -S | grep alloca | grep "align 16"
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+; RUN: opt  -instcombine -S %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin9.6"
 
 define void @foo(i32) {
+; CHECK-LABEL: @foo(
+; CHECK: alloca
+; CHECK: align 16
 	%2 = alloca [3 x <{ { { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } } }>], align 16		; <[3 x <{ { { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } } }>]*> [#uses=1]
 	%3 = getelementptr [3 x <{ { { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } } }>]* %2, i32 0, i32 0		; <<{ { { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } } }>*> [#uses=1]
 	%4 = getelementptr <{ { { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } } }>* %3, i32 0, i32 0		; <{ { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } }*> [#uses=1]
@@ -11,8 +15,24 @@ define void @foo(i32) {
 	%7 = getelementptr { [8 x i16] }* %6, i32 0, i32 0		; <[8 x i16]*> [#uses=1]
 	%8 = getelementptr [8 x i16]* %7, i32 0, i32 0		; <i16*> [#uses=1]
 	store i16 0, i16* %8, align 16
-        call void @bar(i16* %8)
+    call void @bar(i16* %8)
 	ret void
 }
 
 declare void @bar(i16*)
+
+define void @foo_as1(i32 %a, [3 x <{ { { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } } }>] addrspace(1)* %b) {
+; CHECK-LABEL: @foo_as1(
+; CHECK: align 16
+  %1 = getelementptr [3 x <{ { { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } } }>] addrspace(1)* %b, i32 0, i32 0        ; <<{ { { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } } }>*> [#uses=1]
+  %2 = getelementptr <{ { { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } } }> addrspace(1)* %1, i32 0, i32 0      ; <{ { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } }*> [#uses=1]
+  %3 = getelementptr { { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } } addrspace(1)* %2, i32 0, i32 0        ; <{ [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 }*> [#uses=1]
+  %4 = bitcast { [2 x { { i32 } }], [2 x i8], { i16 }, [2 x i8], i8, i8 } addrspace(1)* %3 to { [8 x i16] } addrspace(1)*     ; <{ [8 x i16] }*> [#uses=1]
+  %5 = getelementptr { [8 x i16] } addrspace(1)* %4, i32 0, i32 0     ; <[8 x i16]*> [#uses=1]
+  %6 = getelementptr [8 x i16] addrspace(1)* %5, i32 0, i32 0     ; <i16*> [#uses=1]
+  store i16 0, i16 addrspace(1)* %6, align 16
+  call void @bar_as1(i16 addrspace(1)* %6)
+  ret void
+}
+
+declare void @bar_as1(i16 addrspace(1)*)
diff --git a/test/Transforms/InstCombine/err-rep-cold.ll b/test/Transforms/InstCombine/err-rep-cold.ll
new file mode 100644
index 0000000..0cbafc4
--- /dev/null
+++ b/test/Transforms/InstCombine/err-rep-cold.ll
@@ -0,0 +1,77 @@
+; Test the static branch probability heuristics for error-reporting functions.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+
+@stdout = external global %struct._IO_FILE*
+@stderr = external global %struct._IO_FILE*
+@.str = private unnamed_addr constant [13 x i8] c"an error: %d\00", align 1
+@.str1 = private unnamed_addr constant [9 x i8] c"an error\00", align 1
+
+define i32 @test1(i32 %a) #0 {
+; CHECK-LABEL: @test1
+entry:
+  %cmp = icmp sgt i32 %a, 8
+  br i1 %cmp, label %if.then, label %return
+
+if.then:                                          ; preds = %entry
+  %0 = load %struct._IO_FILE** @stderr, align 8
+  %call = tail call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([13 x i8]* @.str, i64 0, i64 0), i32 %a) #1
+  br label %return
+
+; CHECK: %call = tail call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([13 x i8]* @.str, i64 0, i64 0), i32 %a) #[[AT1:[0-9]+]]
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ 1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+declare i32 @fprintf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) #1
+
+define i32 @test2(i32 %a) #0 {
+; CHECK-LABEL: @test2
+entry:
+  %cmp = icmp sgt i32 %a, 8
+  br i1 %cmp, label %if.then, label %return
+
+if.then:                                          ; preds = %entry
+  %0 = load %struct._IO_FILE** @stderr, align 8
+  %1 = tail call i64 @fwrite(i8* getelementptr inbounds ([9 x i8]* @.str1, i64 0, i64 0), i64 8, i64 1, %struct._IO_FILE* %0)
+  br label %return
+
+; CHECK: tail call i64 @fwrite(i8* getelementptr inbounds ([9 x i8]* @.str1, i64 0, i64 0), i64 8, i64 1, %struct._IO_FILE* %0) #[[AT2:[0-9]+]]
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ 1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
+
+define i32 @test3(i32 %a) #0 {
+; CHECK-LABEL: @test3
+entry:
+  %cmp = icmp sgt i32 %a, 8
+  br i1 %cmp, label %if.then, label %return
+
+if.then:                                          ; preds = %entry
+  %0 = load %struct._IO_FILE** @stdout, align 8
+  %1 = tail call i64 @fwrite(i8* getelementptr inbounds ([9 x i8]* @.str1, i64 0, i64 0), i64 8, i64 1, %struct._IO_FILE* %0)
+  br label %return
+
+; CHECK-NOT: tail call i64 @fwrite(i8* getelementptr inbounds ([9 x i8]* @.str1, i64 0, i64 0), i64 8, i64 1, %struct._IO_FILE* %0) #[[AT2]]
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ 1, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
+; CHECK: attributes #[[AT1]] = { cold nounwind }
+; CHECK: attributes #[[AT2]] = { cold }
+
diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll
index a9a7015..d8ba2a5 100644
--- a/test/Transforms/InstCombine/fast-math.ll
+++ b/test/Transforms/InstCombine/fast-math.ll
@@ -202,6 +202,18 @@ define float @fmul2(float %f1) {
 ; CHECK: fdiv fast float 1.200000e+07, %f1
 }
 
+; X/C1 * C2 => X * (C2/C1) is disabled if X/C1 has multiple uses
+@fmul2_external = external global float
+define float @fmul2_disable(float %f1) {
+  %div = fdiv fast float 1.000000e+00, %f1 
+  store float %div, float* @fmul2_external
+  %mul = fmul fast float %div, 2.000000e+00
+  ret float %mul
+; CHECK-LABEL: @fmul2_disable
+; CHECK: store
+; CHECK: fmul fast
+}
+
 ; X/C1 * C2 => X * (C2/C1) (if C2/C1 is normal Fp)
 define float @fmul3(float %f1, float %f2) {
   %t1 = fdiv float %f1, 2.0e+3
diff --git a/test/Transforms/InstCombine/ffs-1.ll b/test/Transforms/InstCombine/ffs-1.ll
index 8f0b38f..1dec11d 100644
--- a/test/Transforms/InstCombine/ffs-1.ll
+++ b/test/Transforms/InstCombine/ffs-1.ll
@@ -1,7 +1,7 @@
 ; Test that the ffs* library call simplifier works correctly.
 ;
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple i386-pc-linux -instcombine -S | FileCheck %s -check-prefix=LINUX
+; RUN: opt < %s -mtriple i386-pc-linux -instcombine -S | FileCheck %s -check-prefix=CHECK-LINUX
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 
diff --git a/test/Transforms/InstCombine/fmul.ll b/test/Transforms/InstCombine/fmul.ll
index cf57bed..402ee52 100644
--- a/test/Transforms/InstCombine/fmul.ll
+++ b/test/Transforms/InstCombine/fmul.ll
@@ -70,3 +70,26 @@ define float @test7(float %x, float %y) {
 ; CHECK-LABEL: @test7(
 ; CHECK: fsub float -0.000000e+00, %x
 }
+
+; Don't crash when attempting to cast a constant FMul to an instruction.
+define void @test8(i32* %inout) {
+entry:
+  %0 = load i32* %inout, align 4
+  %conv = uitofp i32 %0 to float
+  %vecinit = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, float %conv, i32 3
+  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vecinit
+  %1 = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %mul = fmul <4 x float> zeroinitializer, %1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %local_var_7.0 = phi <4 x float> [ %mul, %entry ], [ %2, %for.body ]
+  br i1 undef, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = insertelement <4 x float> %local_var_7.0, float 0.000000e+00, i32 2
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/test/Transforms/InstCombine/fold-vector-select.ll b/test/Transforms/InstCombine/fold-vector-select.ll
index 2cb970b..b58d9dc 100644
--- a/test/Transforms/InstCombine/fold-vector-select.ll
+++ b/test/Transforms/InstCombine/fold-vector-select.ll
@@ -1,4 +1,6 @@
-; RUN: opt < %s -instcombine -S | not grep select
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; CHECK-NOT: select
 
 define void @foo(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D,
                  <4 x i32> *%E, <4 x i32> *%F, <4 x i32> *%G, <4 x i32> *%H,
diff --git a/test/Transforms/InstCombine/fpcast.ll b/test/Transforms/InstCombine/fpcast.ll
index 09f0532..05d1b48 100644
--- a/test/Transforms/InstCombine/fpcast.ll
+++ b/test/Transforms/InstCombine/fpcast.ll
@@ -31,4 +31,16 @@ define half @test4(float %a) {
   ret half %c
 }
 
+; CHECK: test5
+define half @test5(float %a, float %b, float %c) {
+; CHECK: fcmp ogt
+; CHECK: fptrunc
+; CHECK: select
+; CHECK: half 0xH3C00
+  %d = fcmp ogt float %a, %b
+  %e = select i1 %d, float %c, float 1.0
+  %f = fptrunc float %e to half
+  ret half %f
+}
+
 declare float @llvm.fabs.f32(float) nounwind readonly
diff --git a/test/Transforms/InstCombine/fprintf-1.ll b/test/Transforms/InstCombine/fprintf-1.ll
index 1b7c104..3f6a314 100644
--- a/test/Transforms/InstCombine/fprintf-1.ll
+++ b/test/Transforms/InstCombine/fprintf-1.ll
@@ -1,7 +1,7 @@
 ; Test that the fprintf library call simplifier works correctly.
 ;
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple xcore-xmos-elf -instcombine -S | FileCheck %s -check-prefix=IPRINTF
+; RUN: opt < %s -mtriple xcore-xmos-elf -instcombine -S | FileCheck %s -check-prefix=CHECK-IPRINTF
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 
diff --git a/test/Transforms/InstCombine/gep-addrspace.ll b/test/Transforms/InstCombine/gep-addrspace.ll
index dfe12db..24c355d 100644
--- a/test/Transforms/InstCombine/gep-addrspace.ll
+++ b/test/Transforms/InstCombine/gep-addrspace.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-pc-win32"
 define void @func(%myStruct addrspace(1)* nocapture %p) nounwind {
 ST:
   %A = getelementptr inbounds %myStruct addrspace(1)* %p, i64 0
-  %B = bitcast %myStruct addrspace(1)* %A to %myStruct*
+  %B = addrspacecast %myStruct addrspace(1)* %A to %myStruct*
   %C = getelementptr inbounds %myStruct* %B, i32 0, i32 1
   %D = getelementptr inbounds [3 x float]* %C, i32 0, i32 2
   %E = load float* %D, align 4
diff --git a/test/Transforms/InstCombine/getelementptr.ll b/test/Transforms/InstCombine/getelementptr.ll
index 90f144a..c29a7dc 100644
--- a/test/Transforms/InstCombine/getelementptr.ll
+++ b/test/Transforms/InstCombine/getelementptr.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-target datalayout = "e-p:64:64"
+target datalayout = "e-p:64:64-p1:16:16-p2:32:32:32"
+
 %intstruct = type { i32 }
 %pair = type { i32, i32 }
 %struct.B = type { double }
@@ -8,15 +9,23 @@ target datalayout = "e-p:64:64"
 
 
 @Global = constant [10 x i8] c"helloworld"
+@Global_as1 = addrspace(1) constant [10 x i8] c"helloworld"
 
 ; Test noop elimination
 define i32* @test1(i32* %I) {
-        %A = getelementptr i32* %I, i64 0 
+        %A = getelementptr i32* %I, i64 0
         ret i32* %A
 ; CHECK-LABEL: @test1(
 ; CHECK: ret i32* %I
 }
 
+define i32 addrspace(1)* @test1_as1(i32 addrspace(1)* %I) {
+  %A = getelementptr i32 addrspace(1)* %I, i64 0
+  ret i32 addrspace(1)* %A
+; CHECK-LABEL: @test1_as1(
+; CHECK: ret i32 addrspace(1)* %I
+}
+
 ; Test noop elimination
 define i32* @test2(i32* %I) {
         %A = getelementptr i32* %I
@@ -36,7 +45,7 @@ define i32* @test3(i32* %I) {
 
 ; Test that two getelementptr insts fold
 define i32* @test4({ i32 }* %I) {
-        %A = getelementptr { i32 }* %I, i64 1 
+        %A = getelementptr { i32 }* %I, i64 1
         %B = getelementptr { i32 }* %A, i64 0, i32 0
         ret i32* %B
 ; CHECK-LABEL: @test4(
@@ -45,17 +54,53 @@ define i32* @test4({ i32 }* %I) {
 
 define void @test5(i8 %B) {
         ; This should be turned into a constexpr instead of being an instruction
-        %A = getelementptr [10 x i8]* @Global, i64 0, i64 4 
+        %A = getelementptr [10 x i8]* @Global, i64 0, i64 4
         store i8 %B, i8* %A
         ret void
 ; CHECK-LABEL: @test5(
 ; CHECK: store i8 %B, i8* getelementptr inbounds ([10 x i8]* @Global, i64 0, i64 4)
 }
 
+define void @test5_as1(i8 %B) {
+        ; This should be turned into a constexpr instead of being an instruction
+        %A = getelementptr [10 x i8] addrspace(1)* @Global_as1, i16 0, i16 4
+        store i8 %B, i8 addrspace(1)* %A
+        ret void
+; CHECK-LABEL: @test5_as1(
+; CHECK: store i8 %B, i8 addrspace(1)* getelementptr inbounds ([10 x i8] addrspace(1)* @Global_as1, i16 0, i16 4)
+}
+
+%as1_ptr_struct = type { i32 addrspace(1)* }
+%as2_ptr_struct = type { i32 addrspace(2)* }
+
+@global_as2 = addrspace(2) global i32 zeroinitializer
+@global_as1_as2_ptr = addrspace(1) global %as2_ptr_struct { i32 addrspace(2)* @global_as2 }
+
+; This should be turned into a constexpr instead of being an instruction
+define void @test_evaluate_gep_nested_as_ptrs(i32 addrspace(2)* %B) {
+; CHECK-LABEL: @test_evaluate_gep_nested_as_ptrs(
+; CHECK-NEXT: store i32 addrspace(2)* %B, i32 addrspace(2)* addrspace(1)* getelementptr inbounds (%as2_ptr_struct addrspace(1)* @global_as1_as2_ptr, i16 0, i32 0), align 8
+; CHECK-NEXT: ret void
+  %A = getelementptr %as2_ptr_struct addrspace(1)* @global_as1_as2_ptr, i16 0, i32 0
+  store i32 addrspace(2)* %B, i32 addrspace(2)* addrspace(1)* %A
+  ret void
+}
+
+@arst = addrspace(1) global [4 x i8 addrspace(2)*] zeroinitializer
+
+define void @test_evaluate_gep_as_ptrs_array(i8 addrspace(2)* %B) {
+; CHECK-LABEL: @test_evaluate_gep_as_ptrs_array(
+; CHECK-NEXT: store i8 addrspace(2)* %B, i8 addrspace(2)* addrspace(1)* getelementptr inbounds ([4 x i8 addrspace(2)*] addrspace(1)* @arst, i16 0, i16 2), align 4
+
+; CHECK-NEXT: ret void
+  %A = getelementptr [4 x i8 addrspace(2)*] addrspace(1)* @arst, i16 0, i16 2
+  store i8 addrspace(2)* %B, i8 addrspace(2)* addrspace(1)* %A
+  ret void
+}
 
 define i32* @test7(i32* %I, i64 %C, i64 %D) {
-        %A = getelementptr i32* %I, i64 %C 
-        %B = getelementptr i32* %A, i64 %D 
+        %A = getelementptr i32* %I, i64 %C
+        %B = getelementptr i32* %A, i64 %D
         ret i32* %B
 ; CHECK-LABEL: @test7(
 ; CHECK: %A.sum = add i64 %C, %D
@@ -64,8 +109,8 @@ define i32* @test7(i32* %I, i64 %C, i64 %D) {
 
 define i8* @test8([10 x i32]* %X) {
         ;; Fold into the cast.
-        %A = getelementptr [10 x i32]* %X, i64 0, i64 0 
-        %B = bitcast i32* %A to i8*     
+        %A = getelementptr [10 x i32]* %X, i64 0, i64 0
+        %B = bitcast i32* %A to i8*
         ret i8* %B
 ; CHECK-LABEL: @test8(
 ; CHECK: bitcast [10 x i32]* %X to i8*
@@ -73,7 +118,7 @@ define i8* @test8([10 x i32]* %X) {
 
 define i32 @test9() {
         %A = getelementptr { i32, double }* null, i32 0, i32 1
-        %B = ptrtoint double* %A to i32        
+        %B = ptrtoint double* %A to i32
         ret i32 %B
 ; CHECK-LABEL: @test9(
 ; CHECK: ret i32 8
@@ -83,15 +128,15 @@ define i1 @test10({ i32, i32 }* %x, { i32, i32 }* %y) {
         %tmp.1 = getelementptr { i32, i32 }* %x, i32 0, i32 1
         %tmp.3 = getelementptr { i32, i32 }* %y, i32 0, i32 1
         ;; seteq x, y
-        %tmp.4 = icmp eq i32* %tmp.1, %tmp.3       
+        %tmp.4 = icmp eq i32* %tmp.1, %tmp.3
         ret i1 %tmp.4
 ; CHECK-LABEL: @test10(
 ; CHECK: icmp eq { i32, i32 }* %x, %y
 }
 
 define i1 @test11({ i32, i32 }* %X) {
-        %P = getelementptr { i32, i32 }* %X, i32 0, i32 0 
-        %Q = icmp eq i32* %P, null             
+        %P = getelementptr { i32, i32 }* %X, i32 0, i32 0
+        %Q = icmp eq i32* %P, null
         ret i1 %Q
 ; CHECK-LABEL: @test11(
 ; CHECK: icmp eq { i32, i32 }* %X, null
@@ -105,11 +150,11 @@ entry:
   store i32 10, i32* %g3, align 4
 
   %g4 = getelementptr %struct.A* %a, i32 0, i32 0
-  
+
   %new_a = bitcast %struct.B* %g4 to %struct.A*
 
-  %g5 = getelementptr %struct.A* %new_a, i32 0, i32 1	
-  %a_a = load i32* %g5, align 4	
+  %g5 = getelementptr %struct.A* %new_a, i32 0, i32 1
+  %a_a = load i32* %g5, align 4
   ret i32 %a_a
 ; CHECK-LABEL:      @test12(
 ; CHECK:      getelementptr %struct.A* %a, i64 0, i32 1
@@ -129,8 +174,68 @@ define i1 @test13(i64 %X, %S* %P) {
 ; CHECK:    %C = icmp eq i64 %X, -1
 }
 
-
-@G = external global [3 x i8]      
+define <2 x i1> @test13_vector(<2 x i64> %X, <2 x %S*> %P) nounwind {
+; CHECK-LABEL: @test13_vector(
+; CHECK-NEXT: shl nuw <2 x i64> %X, <i64 2, i64 2>
+; CHECK-NEXT: add <2 x i64> %A.idx, <i64 4, i64 4>
+; CHECK-NEXT: icmp eq <2 x i64> %A.offs, zeroinitializer
+  %A = getelementptr inbounds <2 x %S*> %P, <2 x i64> zeroinitializer, <2 x i32> <i32 1, i32 1>, <2 x i64> %X
+  %B = getelementptr inbounds <2 x %S*> %P, <2 x i64> <i64 0, i64 0>, <2 x i32> <i32 0, i32 0>
+  %C = icmp eq <2 x i32*> %A, %B
+  ret <2 x i1> %C
+}
+
+define i1 @test13_as1(i16 %X, %S addrspace(1)* %P) {
+; CHECK-LABEL: @test13_as1(
+; CHECK-NEXT:  %C = icmp eq i16 %X, -1
+; CHECK-NEXT: ret i1 %C
+  %A = getelementptr inbounds %S addrspace(1)* %P, i16 0, i32 1, i16 %X
+  %B = getelementptr inbounds %S addrspace(1)* %P, i16 0, i32 0
+  %C = icmp eq i32 addrspace(1)* %A, %B
+  ret i1 %C
+}
+
+define <2 x i1> @test13_vector_as1(<2 x i16> %X, <2 x %S addrspace(1)*> %P) {
+; CHECK-LABEL: @test13_vector_as1(
+; CHECK-NEXT: shl nuw <2 x i16> %X, <i16 2, i16 2>
+; CHECK-NEXT: add <2 x i16> %A.idx, <i16 4, i16 4>
+; CHECK-NEXT: icmp eq <2 x i16> %A.offs, zeroinitializer
+; CHECK-NEXT: ret <2 x i1>
+  %A = getelementptr inbounds <2 x %S addrspace(1)*> %P, <2 x i16> <i16 0, i16 0>, <2 x i32> <i32 1, i32 1>, <2 x i16> %X
+  %B = getelementptr inbounds <2 x %S addrspace(1)*> %P, <2 x i16> <i16 0, i16 0>, <2 x i32> <i32 0, i32 0>
+  %C = icmp eq <2 x i32 addrspace(1)*> %A, %B
+  ret <2 x i1> %C
+}
+
+define i1 @test13_i32(i32 %X, %S* %P) {
+; CHECK-LABEL: @test13_i32(
+; CHECK: %C = icmp eq i32 %X, -1
+  %A = getelementptr inbounds %S* %P, i32 0, i32 1, i32 %X
+  %B = getelementptr inbounds %S* %P, i32 0, i32 0
+  %C = icmp eq i32* %A, %B
+  ret i1 %C
+}
+
+define i1 @test13_i16(i16 %X, %S* %P) {
+; CHECK-LABEL: @test13_i16(
+; CHECK: %C = icmp eq i16 %X, -1
+  %A = getelementptr inbounds %S* %P, i16 0, i32 1, i16 %X
+  %B = getelementptr inbounds %S* %P, i16 0, i32 0
+  %C = icmp eq i32* %A, %B
+  ret i1 %C
+}
+
+define i1 @test13_i128(i128 %X, %S* %P) {
+; CHECK-LABEL: @test13_i128(
+; CHECK: %C = icmp eq i64 %1, -1
+  %A = getelementptr inbounds %S* %P, i128 0, i32 1, i128 %X
+  %B = getelementptr inbounds %S* %P, i128 0, i32 0
+  %C = icmp eq i32* %A, %B
+  ret i1 %C
+}
+
+
+@G = external global [3 x i8]
 define i8* @test14(i32 %Idx) {
         %idx = zext i32 %Idx to i64
         %tmp = getelementptr i8* getelementptr ([3 x i8]* @G, i32 0, i32 0), i64 %idx
@@ -151,7 +256,7 @@ define i32 *@test15(i64 %X) {
 
 
 define i32* @test16(i32* %X, i32 %Idx) {
-        %R = getelementptr i32* %X, i32 %Idx       
+        %R = getelementptr i32* %X, i32 %Idx
         ret i32* %R
 ; CHECK-LABEL: @test16(
 ; CHECK: sext i32 %Idx to i64
@@ -164,7 +269,7 @@ define i1 @test17(i16* %P, i32 %I, i32 %J) {
         %C = icmp ult i16* %X, %Y
         ret i1 %C
 ; CHECK-LABEL: @test17(
-; CHECK: %C = icmp slt i32 %I, %J 
+; CHECK: %C = icmp slt i32 %I, %J
 }
 
 define i1 @test18(i16* %P, i32 %I) {
@@ -175,6 +280,55 @@ define i1 @test18(i16* %P, i32 %I) {
 ; CHECK: %C = icmp slt i32 %I, 0
 }
 
+; Larger than the pointer size for a non-zero address space
+define i1 @test18_as1(i16 addrspace(1)* %P, i32 %I) {
+; CHECK-LABEL: @test18_as1(
+; CHECK-NEXT: %1 = trunc i32 %I to i16
+; CHECK-NEXT: %C = icmp slt i16 %1, 0
+; CHECK-NEXT: ret i1 %C
+  %X = getelementptr inbounds i16 addrspace(1)* %P, i32 %I
+  %C = icmp ult i16 addrspace(1)* %X, %P
+  ret i1 %C
+}
+
+; Smaller than the pointer size for a non-zero address space
+define i1 @test18_as1_i32(i16 addrspace(1)* %P, i32 %I) {
+; CHECK-LABEL: @test18_as1_i32(
+; CHECK-NEXT: %1 = trunc i32 %I to i16
+; CHECK-NEXT: %C = icmp slt i16 %1, 0
+; CHECK-NEXT: ret i1 %C
+  %X = getelementptr inbounds i16 addrspace(1)* %P, i32 %I
+  %C = icmp ult i16 addrspace(1)* %X, %P
+  ret i1 %C
+}
+
+; Smaller than pointer size
+define i1 @test18_i16(i16* %P, i16 %I) {
+; CHECK-LABEL: @test18_i16(
+; CHECK: %C = icmp slt i16 %I, 0
+  %X = getelementptr inbounds i16* %P, i16 %I
+  %C = icmp ult i16* %X, %P
+  ret i1 %C
+}
+
+; Same as pointer size
+define i1 @test18_i64(i16* %P, i64 %I) {
+; CHECK-LABEL: @test18_i64(
+; CHECK: %C = icmp slt i64 %I, 0
+  %X = getelementptr inbounds i16* %P, i64 %I
+  %C = icmp ult i16* %X, %P
+  ret i1 %C
+}
+
+; Larger than the pointer size
+define i1 @test18_i128(i16* %P, i128 %I) {
+; CHECK-LABEL: @test18_i128(
+; CHECK: %C = icmp slt i64 %1, 0
+  %X = getelementptr inbounds i16* %P, i128 %I
+  %C = icmp ult i16* %X, %P
+  ret i1 %C
+}
+
 define i32 @test19(i32* %P, i32 %A, i32 %B) {
         %tmp.4 = getelementptr inbounds i32* %P, i32 %A
         %tmp.9 = getelementptr inbounds i32* %P, i32 %B
@@ -194,6 +348,15 @@ define i32 @test20(i32* %P, i32 %A, i32 %B) {
 ; CHECK: icmp eq i32 %A, 0
 }
 
+define i32 @test20_as1(i32 addrspace(1)* %P, i32 %A, i32 %B) {
+  %tmp.4 = getelementptr inbounds i32 addrspace(1)* %P, i32 %A
+  %tmp.6 = icmp eq i32 addrspace(1)* %tmp.4, %P
+  %tmp.7 = zext i1 %tmp.6 to i32
+  ret i32 %tmp.7
+; CHECK-LABEL: @test20_as1(
+; CHECK: icmp eq i16 %1, 0
+}
+
 
 define i32 @test21() {
         %pbob1 = alloca %intstruct
@@ -210,8 +373,8 @@ define i32 @test21() {
 @B = global i32 2               ; <i32*> [#uses=1]
 
 define i1 @test22() {
-        %C = icmp ult i32* getelementptr (i32* @A, i64 1), 
-                           getelementptr (i32* @B, i64 2) 
+        %C = icmp ult i32* getelementptr (i32* @A, i64 1),
+                           getelementptr (i32* @B, i64 2)
         ret i1 %C
 ; CHECK-LABEL: @test22(
 ; CHECK: icmp ult (i32* getelementptr inbounds (i32* @A, i64 1), i32* getelementptr (i32* @B, i64 2))
@@ -262,15 +425,15 @@ define i1 @test26(i8* %arr) {
 
 define i32 @test27(%struct.compat_siginfo* %to, %struct.siginfo_t* %from) {
 entry:
-	%from_addr = alloca %struct.siginfo_t*	
-	%tmp344 = load %struct.siginfo_t** %from_addr, align 8	
+	%from_addr = alloca %struct.siginfo_t*
+	%tmp344 = load %struct.siginfo_t** %from_addr, align 8
 	%tmp345 = getelementptr %struct.siginfo_t* %tmp344, i32 0, i32 3
 	%tmp346 = getelementptr { { i32, i32, [0 x i8], %struct.sigval_t, i32 }, [88 x i8] }* %tmp345, i32 0, i32 0
-	%tmp346347 = bitcast { i32, i32, [0 x i8], %struct.sigval_t, i32 }* %tmp346 to { i32, i32, %struct.sigval_t }*	
+	%tmp346347 = bitcast { i32, i32, [0 x i8], %struct.sigval_t, i32 }* %tmp346 to { i32, i32, %struct.sigval_t }*
 	%tmp348 = getelementptr { i32, i32, %struct.sigval_t }* %tmp346347, i32 0, i32 2
 	%tmp349 = getelementptr %struct.sigval_t* %tmp348, i32 0, i32 0
 	%tmp349350 = bitcast i8** %tmp349 to i32*
-	%tmp351 = load i32* %tmp349350, align 8	
+	%tmp351 = load i32* %tmp349350, align 8
 	%tmp360 = call i32 asm sideeffect "...",
         "=r,ir,*m,i,0,~{dirflag},~{fpsr},~{flags}"( i32 %tmp351,
          %struct.__large_struct* null, i32 -14, i32 0 )
@@ -280,28 +443,28 @@ entry:
 
 ; PR1978
 	%struct.x = type <{ i8 }>
-@.str = internal constant [6 x i8] c"Main!\00"	
-@.str1 = internal constant [12 x i8] c"destroy %p\0A\00"	
+@.str = internal constant [6 x i8] c"Main!\00"
+@.str1 = internal constant [12 x i8] c"destroy %p\0A\00"
 
 define i32 @test28() nounwind  {
 entry:
 	%orientations = alloca [1 x [1 x %struct.x]]
-	%tmp3 = call i32 @puts( i8* getelementptr ([6 x i8]* @.str, i32 0, i32 0) ) nounwind 
+	%tmp3 = call i32 @puts( i8* getelementptr ([6 x i8]* @.str, i32 0, i32 0) ) nounwind
 	%tmp45 = getelementptr inbounds [1 x [1 x %struct.x]]* %orientations, i32 1, i32 0, i32 0
 	%orientations62 = getelementptr [1 x [1 x %struct.x]]* %orientations, i32 0, i32 0, i32 0
 	br label %bb10
 
 bb10:
 	%indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb10 ]
-	%tmp.0.reg2mem.0.rec = mul i32 %indvar, -1	
-	%tmp12.rec = add i32 %tmp.0.reg2mem.0.rec, -1	
+	%tmp.0.reg2mem.0.rec = mul i32 %indvar, -1
+	%tmp12.rec = add i32 %tmp.0.reg2mem.0.rec, -1
 	%tmp12 = getelementptr inbounds %struct.x* %tmp45, i32 %tmp12.rec
 	%tmp16 = call i32 (i8*, ...)* @printf( i8* getelementptr ([12 x i8]* @.str1, i32 0, i32 0), %struct.x* %tmp12 ) nounwind
 	%tmp84 = icmp eq %struct.x* %tmp12, %orientations62
 	%indvar.next = add i32 %indvar, 1
 	br i1 %tmp84, label %bb17, label %bb10
 
-bb17:	
+bb17:
 	ret i32 0
 ; CHECK-LABEL: @test28(
 ; CHECK: icmp eq i32 %indvar, 0
@@ -318,7 +481,7 @@ declare i32 @printf(i8*, ...)
 	%T = type <{ i64, i64, i64 }>
 define i32 @test29(i8* %start, i32 %X) nounwind {
 entry:
-	%tmp3 = load i64* null		
+	%tmp3 = load i64* null
 	%add.ptr = getelementptr i8* %start, i64 %tmp3
 	%tmp158 = load i32* null
 	%add.ptr159 = getelementptr %T* null, i32 %tmp158
@@ -356,7 +519,7 @@ declare void @test30f(i32*)
 define i1 @test31(i32* %A) {
         %B = getelementptr i32* %A, i32 1
         %C = getelementptr i32* %A, i64 1
-        %V = icmp eq i32* %B, %C 
+        %V = icmp eq i32* %B, %C
         ret i1 %V
 ; CHECK-LABEL: @test31(
 ; CHECK: ret i1 true
@@ -372,7 +535,7 @@ define i8* @test32(i8* %v) {
 	%D = getelementptr { [16 x i8] }* %C, i32 0, i32 0, i32 8
 	%E = bitcast i8* %D to i8**
 	store i8* %v, i8** %E
-	%F = getelementptr [4 x i8*]* %A, i32 0, i32 2	
+	%F = getelementptr [4 x i8*]* %A, i32 0, i32 2
 	%G = load i8** %F
 	ret i8* %G
 ; CHECK-LABEL: @test32(
@@ -384,23 +547,46 @@ define i8* @test32(i8* %v) {
 %struct.Key = type { { i32, i32 } }
 %struct.anon = type <{ i8, [3 x i8], i32 }>
 
-define i32 *@test33(%struct.Key *%A) {
-	%B = bitcast %struct.Key* %A to %struct.anon*
-        %C = getelementptr %struct.anon* %B, i32 0, i32 2 
-	ret i32 *%C
+define i32* @test33(%struct.Key* %A) {
 ; CHECK-LABEL: @test33(
 ; CHECK: getelementptr %struct.Key* %A, i64 0, i32 0, i32 1
+  %B = bitcast %struct.Key* %A to %struct.anon*
+  %C = getelementptr %struct.anon* %B, i32 0, i32 2
+  ret i32* %C
 }
 
+define i32 addrspace(1)* @test33_as1(%struct.Key addrspace(1)* %A) {
+; CHECK-LABEL: @test33_as1(
+; CHECK: getelementptr %struct.Key addrspace(1)* %A, i16 0, i32 0, i32 1
+  %B = bitcast %struct.Key addrspace(1)* %A to %struct.anon addrspace(1)*
+  %C = getelementptr %struct.anon addrspace(1)* %B, i32 0, i32 2
+  ret i32 addrspace(1)* %C
+}
 
+define i32 addrspace(1)* @test33_array_as1([10 x i32] addrspace(1)* %A) {
+; CHECK-LABEL: @test33_array_as1(
+; CHECK: getelementptr [10 x i32] addrspace(1)* %A, i16 0, i16 2
+  %B = bitcast [10 x i32] addrspace(1)* %A to [5 x i32] addrspace(1)*
+  %C = getelementptr [5 x i32] addrspace(1)* %B, i32 0, i32 2
+  ret i32 addrspace(1)* %C
+}
+
+; Make sure the GEP indices use the right pointer sized integer
+define i32 addrspace(1)* @test33_array_struct_as1([10 x %struct.Key] addrspace(1)* %A) {
+; CHECK-LABEL: @test33_array_struct_as1(
+; CHECK: getelementptr [10 x %struct.Key] addrspace(1)* %A, i16 0, i16 1, i32 0, i32 0
+  %B = bitcast [10 x %struct.Key] addrspace(1)* %A to [20 x i32] addrspace(1)*
+  %C = getelementptr [20 x i32] addrspace(1)* %B, i32 0, i32 2
+  ret i32 addrspace(1)* %C
+}
 
 	%T2 = type { i8*, i8 }
 define i8* @test34(i8* %Val, i64 %V) nounwind {
 entry:
-	%A = alloca %T2, align 8	
+	%A = alloca %T2, align 8
 	%mrv_gep = bitcast %T2* %A to i64*
 	%B = getelementptr %T2* %A, i64 0, i32 0
-        
+
       	store i64 %V, i64* %mrv_gep
 	%C = load i8** %B, align 8
 	ret i8* %C
@@ -519,4 +705,88 @@ define i1 @pr16483([1 x i8]* %a, [1 x i8]* %b) {
 ; CHECK-NEXT: icmp ult  [1 x i8]* %a, %b
 }
 
+define i8 @test_gep_bitcast_as1(i32 addrspace(1)* %arr, i16 %N) {
+; CHECK-LABEL: @test_gep_bitcast_as1(
+; CHECK: getelementptr i32 addrspace(1)* %arr, i16 %N
+; CHECK: bitcast
+  %cast = bitcast i32 addrspace(1)* %arr to i8 addrspace(1)*
+  %V = mul i16 %N, 4
+  %t = getelementptr i8 addrspace(1)* %cast, i16 %V
+  %x = load i8 addrspace(1)* %t
+  ret i8 %x
+}
+
+; The element size of the array matches the element size of the pointer
+define i64 @test_gep_bitcast_array_same_size_element([100 x double]* %arr, i64 %N) {
+; CHECK-LABEL: @test_gep_bitcast_array_same_size_element(
+; CHECK: getelementptr [100 x double]* %arr, i64 0, i64 %V
+; CHECK: bitcast
+  %cast = bitcast [100 x double]* %arr to i64*
+  %V = mul i64 %N, 8
+  %t = getelementptr i64* %cast, i64 %V
+  %x = load i64* %t
+  ret i64 %x
+}
+
+; The element size of the array is different the element size of the pointer
+define i8 @test_gep_bitcast_array_different_size_element([100 x double]* %arr, i64 %N) {
+; CHECK-LABEL: @test_gep_bitcast_array_different_size_element(
+; CHECK: getelementptr [100 x double]* %arr, i64 0, i64 %N
+; CHECK: bitcast
+  %cast = bitcast [100 x double]* %arr to i8*
+  %V = mul i64 %N, 8
+  %t = getelementptr i8* %cast, i64 %V
+  %x = load i8* %t
+  ret i8 %x
+}
+
+define i64 @test_gep_bitcast_array_same_size_element_as1([100 x double] addrspace(1)* %arr, i16 %N) {
+; CHECK-LABEL: @test_gep_bitcast_array_same_size_element_as1(
+; CHECK: getelementptr [100 x double] addrspace(1)* %arr, i16 0, i16 %V
+; CHECK: bitcast
+  %cast = bitcast [100 x double] addrspace(1)* %arr to i64 addrspace(1)*
+  %V = mul i16 %N, 8
+  %t = getelementptr i64 addrspace(1)* %cast, i16 %V
+  %x = load i64 addrspace(1)* %t
+  ret i64 %x
+}
+
+define i8 @test_gep_bitcast_array_different_size_element_as1([100 x double] addrspace(1)* %arr, i16 %N) {
+; CHECK-LABEL: @test_gep_bitcast_array_different_size_element_as1(
+; CHECK: getelementptr [100 x double] addrspace(1)* %arr, i16 0, i16 %N
+; CHECK: bitcast
+  %cast = bitcast [100 x double] addrspace(1)* %arr to i8 addrspace(1)*
+  %V = mul i16 %N, 8
+  %t = getelementptr i8 addrspace(1)* %cast, i16 %V
+  %x = load i8 addrspace(1)* %t
+  ret i8 %x
+}
+
+define i64 @test40() {
+  %array = alloca [3 x i32], align 4
+  %gep = getelementptr inbounds [3 x i32]* %array, i64 0, i64 2
+  %gepi8 = bitcast i32* %gep to i8*
+  %p = ptrtoint [3 x i32]* %array to i64
+  %np = sub i64 0, %p
+  %gep2 = getelementptr i8* %gepi8, i64 %np
+  %ret = ptrtoint i8* %gep2 to i64
+  ret i64 %ret
+
+; CHECK-LABEL: @test40
+; CHECK-NEXT: ret i64 8
+}
+
+define i16 @test41([3 x i32] addrspace(1)* %array) {
+  %gep = getelementptr inbounds [3 x i32] addrspace(1)* %array, i16 0, i16 2
+  %gepi8 = bitcast i32 addrspace(1)* %gep to i8 addrspace(1)*
+  %p = ptrtoint [3 x i32] addrspace(1)* %array to i16
+  %np = sub i16 0, %p
+  %gep2 = getelementptr i8 addrspace(1)* %gepi8, i16 %np
+  %ret = ptrtoint i8 addrspace(1)* %gep2 to i16
+  ret i16 %ret
+
+; CHECK-LABEL: @test41(
+; CHECK-NEXT: ret i16 8
+}
+
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/InstCombine/icmp-logical.ll b/test/Transforms/InstCombine/icmp-logical.ll
new file mode 100644
index 0000000..d5d8cbc
--- /dev/null
+++ b/test/Transforms/InstCombine/icmp-logical.ll
@@ -0,0 +1,152 @@
+; RUN: opt -instcombine -S -o - %s | FileCheck %s
+
+define i1 @masked_and_notallzeroes(i32 %A) {
+; CHECK-LABEL: @masked_and_notallzeroes
+; CHECK: [[MASK:%.*]] = and i32 %A, 7
+; CHECK: icmp ne i32 [[MASK]], 0
+; CHECK-NOT: and i32 %A, 39
+; CHECK: ret i1
+
+  %mask1 = and i32 %A, 7
+  %tst1 = icmp ne i32 %mask1, 0
+
+  %mask2 = and i32 %A, 39
+  %tst2 = icmp ne i32 %mask2, 0
+
+  %res = and i1 %tst1, %tst2
+  ret i1 %res
+}
+
+define i1 @masked_or_allzeroes(i32 %A) {
+; CHECK-LABEL: @masked_or_allzeroes
+; CHECK: [[MASK:%.*]] = and i32 %A, 7
+; CHECK: icmp eq i32 [[MASK]], 0
+; CHECK-NOT: and i32 %A, 39
+; CHECK: ret i1
+
+  %mask1 = and i32 %A, 7
+  %tst1 = icmp eq i32 %mask1, 0
+
+  %mask2 = and i32 %A, 39
+  %tst2 = icmp eq i32 %mask2, 0
+
+  %res = or i1 %tst1, %tst2
+  ret i1 %res
+}
+
+define i1 @masked_and_notallones(i32 %A) {
+; CHECK-LABEL: @masked_and_notallones
+; CHECK: [[MASK:%.*]] = and i32 %A, 7
+; CHECK: icmp ne i32 [[MASK]], 7
+; CHECK-NOT: and i32 %A, 39
+; CHECK: ret i1
+
+  %mask1 = and i32 %A, 7
+  %tst1 = icmp ne i32 %mask1, 7
+
+  %mask2 = and i32 %A, 39
+  %tst2 = icmp ne i32 %mask2, 39
+
+  %res = and i1 %tst1, %tst2
+  ret i1 %res
+}
+
+define i1 @masked_or_allones(i32 %A) {
+; CHECK-LABEL: @masked_or_allones
+; CHECK: [[MASK:%.*]] = and i32 %A, 7
+; CHECK: icmp eq i32 [[MASK]], 7
+; CHECK-NOT: and i32 %A, 39
+; CHECK: ret i1
+
+  %mask1 = and i32 %A, 7
+  %tst1 = icmp eq i32 %mask1, 7
+
+  %mask2 = and i32 %A, 39
+  %tst2 = icmp eq i32 %mask2, 39
+
+  %res = or i1 %tst1, %tst2
+  ret i1 %res
+}
+
+define i1 @masked_and_notA(i32 %A) {
+; CHECK-LABEL: @masked_and_notA
+; CHECK: [[MASK:%.*]] = and i32 %A, 39
+; CHECK: icmp ne i32 [[MASK]], %A
+; CHECK-NOT: and i32 %A, 7
+; CHECK: ret i1
+
+  %mask1 = and i32 %A, 7
+  %tst1 = icmp ne i32 %mask1, %A
+
+  %mask2 = and i32 %A, 39
+  %tst2 = icmp ne i32 %mask2, %A
+
+  %res = and i1 %tst1, %tst2
+  ret i1 %res
+}
+
+define i1 @masked_or_A(i32 %A) {
+; CHECK-LABEL: @masked_or_A
+; CHECK: [[MASK:%.*]] = and i32 %A, 39
+; CHECK: icmp eq i32 [[MASK]], %A
+; CHECK-NOT: and i32 %A, 7
+; CHECK: ret i1
+
+  %mask1 = and i32 %A, 7
+  %tst1 = icmp eq i32 %mask1, %A
+
+  %mask2 = and i32 %A, 39
+  %tst2 = icmp eq i32 %mask2, %A
+
+  %res = or i1 %tst1, %tst2
+  ret i1 %res
+}
+
+define i1 @masked_or_allzeroes_notoptimised(i32 %A) {
+; CHECK-LABEL: @masked_or_allzeroes_notoptimised
+; CHECK: [[MASK:%.*]] = and i32 %A, 15
+; CHECK: icmp eq i32 [[MASK]], 0
+; CHECK: [[MASK:%.*]] = and i32 %A, 39
+; CHECK: icmp eq i32 [[MASK]], 0
+; CHECK: ret i1
+
+  %mask1 = and i32 %A, 15
+  %tst1 = icmp eq i32 %mask1, 0
+
+  %mask2 = and i32 %A, 39
+  %tst2 = icmp eq i32 %mask2, 0
+
+  %res = or i1 %tst1, %tst2
+  ret i1 %res
+}
+
+define i1 @nomask_lhs(i32 %in) {
+; CHECK-LABEL: @nomask_lhs
+; CHECK: [[MASK:%.*]] = and i32 %in, 1
+; CHECK: icmp eq i32 [[MASK]], 0
+; CHECK-NOT: icmp
+; CHECK: ret i1
+  %tst1 = icmp eq i32 %in, 0
+
+  %masked = and i32 %in, 1
+  %tst2 = icmp eq i32 %masked, 0
+
+  %val = or i1 %tst1, %tst2
+  ret i1 %val
+}
+
+
+define i1 @nomask_rhs(i32 %in) {
+; CHECK-LABEL: @nomask_rhs
+; CHECK: [[MASK:%.*]] = and i32 %in, 1
+; CHECK: icmp eq i32 [[MASK]], 0
+; CHECK-NOT: icmp
+; CHECK: ret i1
+  %masked = and i32 %in, 1
+  %tst1 = icmp eq i32 %masked, 0
+
+  %tst2 = icmp eq i32 %in, 0
+
+  %val = or i1 %tst1, %tst2
+  ret i1 %val
+}
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index dfeac67..12a4744 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 target datalayout =
-"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+"e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 
 define i32 @test1(i32 %X) {
 entry:
@@ -79,7 +79,7 @@ entry:
 
 define i1 @test8(i32 %x){
 entry:
-  %a = add i32 %x, -1 
+  %a = add i32 %x, -1
   %b = icmp eq i32 %a, %x
   ret i1 %b
 ; CHECK-LABEL: @test8(
@@ -89,7 +89,7 @@ entry:
 define i1 @test9(i32 %x)  {
 entry:
   %a = add i32 %x, -2
-  %b = icmp ugt i32 %x, %a 
+  %b = icmp ugt i32 %x, %a
   ret i1 %b
 ; CHECK-LABEL: @test9(
 ; CHECK: icmp ugt i32 %x, 1
@@ -98,10 +98,9 @@ entry:
 
 define i1 @test10(i32 %x){
 entry:
-  %a = add i32 %x, -1      
-  %b = icmp slt i32 %a, %x 
+  %a = add i32 %x, -1
+  %b = icmp slt i32 %a, %x
   ret i1 %b
-  
 ; CHECK-LABEL: @test10(
 ; CHECK: %b = icmp ne i32 %x, -2147483648
 ; CHECK: ret i1 %b
@@ -234,6 +233,18 @@ define i1 @test24(i64 %i) {
   ret i1 %cmp
 }
 
+@X_as1 = addrspace(1) global [1000 x i32] zeroinitializer
+
+; CHECK: @test24_as1
+; CHECK: trunc i64 %i to i16
+; CHECK: %cmp = icmp eq i16 %1, 1000
+; CHECK: ret i1 %cmp
+define i1 @test24_as1(i64 %i) {
+  %p1 = getelementptr inbounds i32 addrspace(1)* getelementptr inbounds ([1000 x i32] addrspace(1)* @X_as1, i64 0, i64 0), i64 %i
+  %cmp = icmp eq i32 addrspace(1)* %p1, getelementptr inbounds ([1000 x i32] addrspace(1)* @X_as1, i64 1, i64 0)
+  ret i1 %cmp
+}
+
 ; CHECK-LABEL: @test25(
 ; X + Z > Y + Z -> X > Y if there is no overflow.
 ; CHECK: %c = icmp sgt i32 %x, %y
@@ -473,7 +484,7 @@ define <2 x i1> @test49(<2 x i32> %tmp3) {
 entry:
   %tmp11 = and <2 x i32> %tmp3, <i32 3, i32 3>
   %cmp = icmp ult <2 x i32> %tmp11, <i32 4, i32 4>
-  ret <2 x i1> %cmp  
+  ret <2 x i1> %cmp
 }
 
 ; PR9343 #7
@@ -512,12 +523,12 @@ define i1 @test52(i32 %x1) nounwind {
 
 ; PR9838
 ; CHECK-LABEL: @test53(
-; CHECK-NEXT: ashr exact
-; CHECK-NEXT: ashr
+; CHECK-NEXT: sdiv exact
+; CHECK-NEXT: sdiv
 ; CHECK-NEXT: icmp
 define i1 @test53(i32 %a, i32 %b) nounwind {
- %x = ashr exact i32 %a, 30
- %y = ashr i32 %b, 30
+ %x = sdiv exact i32 %a, 30
+ %y = sdiv i32 %b, 30
  %z = icmp eq i32 %x, %y
  ret i1 %z
 }
@@ -603,6 +614,21 @@ define i1 @test59(i8* %foo) {
 ; CHECK: ret i1 true
 }
 
+define i1 @test59_as1(i8 addrspace(1)* %foo) {
+  %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)*
+  %gep1 = getelementptr inbounds i32 addrspace(1)* %bit, i64 2
+  %gep2 = getelementptr inbounds i8 addrspace(1)* %foo, i64 10
+  %cast1 = bitcast i32 addrspace(1)* %gep1 to i8 addrspace(1)*
+  %cmp = icmp ult i8 addrspace(1)* %cast1, %gep2
+  %use = ptrtoint i8 addrspace(1)* %cast1 to i64
+  %call = call i32 @test58_d(i64 %use) nounwind
+  ret i1 %cmp
+; CHECK: @test59_as1
+; CHECK: %[[GEP:.+]] = getelementptr inbounds i8 addrspace(1)* %foo, i16 8
+; CHECK: ptrtoint i8 addrspace(1)* %[[GEP]] to i16
+; CHECK: ret i1 true
+}
+
 define i1 @test60(i8* %foo, i64 %i, i64 %j) {
   %bit = bitcast i8* %foo to i32*
   %gep1 = getelementptr inbounds i32* %bit, i64 %i
@@ -616,6 +642,21 @@ define i1 @test60(i8* %foo, i64 %i, i64 %j) {
 ; CHECK-NEXT: ret i1
 }
 
+define i1 @test60_as1(i8 addrspace(1)* %foo, i64 %i, i64 %j) {
+  %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)*
+  %gep1 = getelementptr inbounds i32 addrspace(1)* %bit, i64 %i
+  %gep2 = getelementptr inbounds i8 addrspace(1)* %foo, i64 %j
+  %cast1 = bitcast i32 addrspace(1)* %gep1 to i8 addrspace(1)*
+  %cmp = icmp ult i8 addrspace(1)* %cast1, %gep2
+  ret i1 %cmp
+; CHECK: @test60_as1
+; CHECK: trunc i64 %i to i16
+; CHECK: trunc i64 %j to i16
+; CHECK: %gep1.idx = shl nuw i16 %{{.+}}, 2
+; CHECK-NEXT: icmp sgt i16 %{{.+}}, %gep1.idx
+; CHECK-NEXT: ret i1
+}
+
 define i1 @test61(i8* %foo, i64 %i, i64 %j) {
   %bit = bitcast i8* %foo to i32*
   %gep1 = getelementptr i32* %bit, i64 %i
@@ -629,6 +670,19 @@ define i1 @test61(i8* %foo, i64 %i, i64 %j) {
 ; CHECK-NEXT: ret i1
 }
 
+define i1 @test61_as1(i8 addrspace(1)* %foo, i16 %i, i16 %j) {
+  %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)*
+  %gep1 = getelementptr i32 addrspace(1)* %bit, i16 %i
+  %gep2 = getelementptr i8 addrspace(1)* %foo, i16 %j
+  %cast1 = bitcast i32 addrspace(1)* %gep1 to i8 addrspace(1)*
+  %cmp = icmp ult i8 addrspace(1)* %cast1, %gep2
+  ret i1 %cmp
+; Don't transform non-inbounds GEPs.
+; CHECK: @test61_as1
+; CHECK: icmp ult i8 addrspace(1)* %cast1, %gep2
+; CHECK-NEXT: ret i1
+}
+
 define i1 @test62(i8* %a) {
   %arrayidx1 = getelementptr inbounds i8* %a, i64 1
   %arrayidx2 = getelementptr inbounds i8* %a, i64 10
@@ -638,6 +692,15 @@ define i1 @test62(i8* %a) {
 ; CHECK-NEXT: ret i1 true
 }
 
+define i1 @test62_as1(i8 addrspace(1)* %a) {
+; CHECK-LABEL: @test62_as1(
+; CHECK-NEXT: ret i1 true
+  %arrayidx1 = getelementptr inbounds i8 addrspace(1)* %a, i64 1
+  %arrayidx2 = getelementptr inbounds i8 addrspace(1)* %a, i64 10
+  %cmp = icmp slt i8 addrspace(1)* %arrayidx1, %arrayidx2
+  ret i1 %cmp
+}
+
 define i1 @test63(i8 %a, i32 %b) nounwind {
   %z = zext i8 %a to i32
   %t = and i32 %b, 255
@@ -999,6 +1062,15 @@ define i1 @test71(i8* %x) {
   ret i1 %c
 }
 
+define i1 @test71_as1(i8 addrspace(1)* %x) {
+; CHECK-LABEL: @test71_as1(
+; CHECK-NEXT: ret i1 false
+  %a = getelementptr i8 addrspace(1)* %x, i64 8
+  %b = getelementptr inbounds i8 addrspace(1)* %x, i64 8
+  %c = icmp ugt i8 addrspace(1)* %a, %b
+  ret i1 %c
+}
+
 ; CHECK-LABEL: @icmp_shl_1_V_ult_32(
 ; CHECK-NEXT: [[CMP:%[a-z0-9]+]] = icmp ult i32 %V, 5
 ; CHECK-NEXT: ret i1 [[CMP]]
@@ -1199,3 +1271,88 @@ define i1 @icmp_sub_-1_X_uge_4(i32 %X) {
   %cmp = icmp uge i32 %sub, 4
   ret i1 %cmp
 }
+
+; CHECK-LABEL: @icmp_swap_operands_for_cse
+; CHECK: [[CMP:%[a-z0-9]+]] = icmp ult i32 %X, %Y
+; CHECK-NEXT: br i1 [[CMP]], label %true, label %false
+; CHECK: ret i1
+define i1 @icmp_swap_operands_for_cse(i32 %X, i32 %Y) {
+entry:
+  %sub = sub i32 %X, %Y
+  %cmp = icmp ugt i32 %Y, %X
+  br i1 %cmp, label %true, label %false
+true:
+  %restrue = trunc i32 %sub to i1
+  br label %end
+false:
+  %shift = lshr i32 %sub, 4
+  %resfalse = trunc i32 %shift to i1
+  br label %end
+end:
+  %res = phi i1 [%restrue, %true], [%resfalse, %false]
+  ret i1 %res
+}
+
+; CHECK-LABEL: @icmp_swap_operands_for_cse2
+; CHECK: [[CMP:%[a-z0-9]+]] = icmp ult i32 %X, %Y
+; CHECK-NEXT: br i1 [[CMP]], label %true, label %false
+; CHECK: ret i1
+define i1 @icmp_swap_operands_for_cse2(i32 %X, i32 %Y) {
+entry:
+  %cmp = icmp ugt i32 %Y, %X
+  br i1 %cmp, label %true, label %false
+true:
+  %sub = sub i32 %X, %Y
+  %sub1 = sub i32 %X, %Y
+  %add = add i32 %sub, %sub1
+  %restrue = trunc i32 %add to i1
+  br label %end
+false:
+  %sub2 = sub i32 %Y, %X
+  %resfalse = trunc i32 %sub2 to i1
+  br label %end
+end:
+  %res = phi i1 [%restrue, %true], [%resfalse, %false]
+  ret i1 %res
+}
+
+; CHECK-LABEL: @icmp_do_not_swap_operands_for_cse
+; CHECK: [[CMP:%[a-z0-9]+]] = icmp ugt i32 %Y, %X
+; CHECK-NEXT: br i1 [[CMP]], label %true, label %false
+; CHECK: ret i1
+define i1 @icmp_do_not_swap_operands_for_cse(i32 %X, i32 %Y) {
+entry:
+  %cmp = icmp ugt i32 %Y, %X
+  br i1 %cmp, label %true, label %false
+true:
+  %sub = sub i32 %X, %Y
+  %restrue = trunc i32 %sub to i1
+  br label %end
+false:
+  %sub2 = sub i32 %Y, %X
+  %resfalse = trunc i32 %sub2 to i1
+  br label %end
+end:
+  %res = phi i1 [%restrue, %true], [%resfalse, %false]
+  ret i1 %res
+}
+
+; CHECK-LABEL: @icmp_lshr_lshr_eq
+; CHECK: %z.unshifted = xor i32 %a, %b
+; CHECK: %z = icmp ult i32 %z.unshifted, 1073741824
+define i1 @icmp_lshr_lshr_eq(i32 %a, i32 %b) nounwind {
+ %x = lshr i32 %a, 30
+ %y = lshr i32 %b, 30
+ %z = icmp eq i32 %x, %y
+ ret i1 %z
+}
+
+; CHECK-LABEL: @icmp_ashr_ashr_ne
+; CHECK: %z.unshifted = xor i32 %a, %b
+; CHECK: %z = icmp ugt i32 %z.unshifted, 255
+define i1 @icmp_ashr_ashr_ne(i32 %a, i32 %b) nounwind {
+ %x = ashr i32 %a, 8
+ %y = ashr i32 %b, 8
+ %z = icmp ne i32 %x, %y
+ ret i1 %z
+}
diff --git a/test/Transforms/InstCombine/lit.local.cfg b/test/Transforms/InstCombine/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/InstCombine/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/InstCombine/load-cmp.ll b/test/Transforms/InstCombine/load-cmp.ll
index 95dc48c..9810026 100644
--- a/test/Transforms/InstCombine/load-cmp.ll
+++ b/test/Transforms/InstCombine/load-cmp.ll
@@ -1,18 +1,75 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt -instcombine -S < %s | FileCheck -check-prefix=NODL %s
+; RUN: opt -instcombine -S -default-data-layout="p:32:32:32-p1:16:16:16-n8:16:32:64" < %s | FileCheck -check-prefix=P32 %s
 
-@G16 = internal constant [10 x i16] [i16 35, i16 82, i16 69, i16 81, i16 85, 
+@G16 = internal constant [10 x i16] [i16 35, i16 82, i16 69, i16 81, i16 85,
                                      i16 73, i16 82, i16 69, i16 68, i16 0]
+
+@G16_as1 = internal addrspace(1) constant [10 x i16] [i16 35, i16 82, i16 69, i16 81, i16 85,
+                                                      i16 73, i16 82, i16 69, i16 68, i16 0]
+
 @GD = internal constant [6 x double]
    [double -10.0, double 1.0, double 4.0, double 2.0, double -20.0, double -40.0]
 
+%Foo = type { i32, i32, i32, i32 }
+
+@GS = internal constant %Foo { i32 1, i32 4, i32 9, i32 14 }
+
+@GStructArr = internal constant [4 x %Foo] [ %Foo { i32 1, i32 4, i32 9, i32 14 },
+                                             %Foo { i32 5, i32 4, i32 6, i32 11 },
+                                             %Foo { i32 6, i32 5, i32 9, i32 20 },
+                                             %Foo { i32 12, i32 3, i32 9, i32 8 } ]
+
+
 define i1 @test1(i32 %X) {
   %P = getelementptr inbounds [10 x i16]* @G16, i32 0, i32 %X
   %Q = load i16* %P
   %R = icmp eq i16 %Q, 0
   ret i1 %R
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: %R = icmp eq i32 %X, 9
-; CHECK-NEXT: ret i1 %R
+; NODL-LABEL: @test1(
+; NODL-NEXT: %R = icmp eq i32 %X, 9
+; NODL-NEXT: ret i1 %R
+
+; P32-LABEL: @test1(
+; P32-NEXT: %R = icmp eq i32 %X, 9
+; P32-NEXT: ret i1 %R
+}
+
+define i1 @test1_noinbounds(i32 %X) {
+  %P = getelementptr [10 x i16]* @G16, i32 0, i32 %X
+  %Q = load i16* %P
+  %R = icmp eq i16 %Q, 0
+  ret i1 %R
+; NODL-LABEL: @test1_noinbounds(
+; NODL-NEXT: %P = getelementptr [10 x i16]* @G16, i32 0, i32 %X
+
+; P32-LABEL: @test1_noinbounds(
+; P32-NEXT: %R = icmp eq i32 %X, 9
+; P32-NEXT: ret i1 %R
+}
+
+define i1 @test1_noinbounds_i64(i64 %X) {
+  %P = getelementptr [10 x i16]* @G16, i64 0, i64 %X
+  %Q = load i16* %P
+  %R = icmp eq i16 %Q, 0
+  ret i1 %R
+; NODL-LABEL: @test1_noinbounds_i64(
+; NODL-NEXT: %P = getelementptr [10 x i16]* @G16, i64 0, i64 %X
+
+; P32-LABEL: @test1_noinbounds_i64(
+; P32: %R = icmp eq i32 %1, 9
+; P32-NEXT: ret i1 %R
+}
+
+define i1 @test1_noinbounds_as1(i32 %x) {
+  %p = getelementptr [10 x i16] addrspace(1)* @G16_as1, i16 0, i32 %x
+  %q = load i16 addrspace(1)* %p
+  %r = icmp eq i16 %q, 0
+  ret i1 %r
+
+; P32-LABEL: @test1_noinbounds_as1(
+; P32-NEXT: trunc i32 %x to i16
+; P32-NEXT: %r = icmp eq i16 %1, 9
+; P32-NEXT: ret i1 %r
 }
 
 define i1 @test2(i32 %X) {
@@ -20,9 +77,9 @@ define i1 @test2(i32 %X) {
   %Q = load i16* %P
   %R = icmp slt i16 %Q, 85
   ret i1 %R
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: %R = icmp ne i32 %X, 4
-; CHECK-NEXT: ret i1 %R
+; NODL-LABEL: @test2(
+; NODL-NEXT: %R = icmp ne i32 %X, 4
+; NODL-NEXT: ret i1 %R
 }
 
 define i1 @test3(i32 %X) {
@@ -30,9 +87,14 @@ define i1 @test3(i32 %X) {
   %Q = load double* %P
   %R = fcmp oeq double %Q, 1.0
   ret i1 %R
-; CHECK-LABEL: @test3(
-; CHECK-NEXT: %R = icmp eq i32 %X, 1
-; CHECK-NEXT: ret i1 %R
+; NODL-LABEL: @test3(
+; NODL-NEXT: %R = icmp eq i32 %X, 1
+; NODL-NEXT: ret i1 %R
+
+; P32-LABEL: @test3(
+; P32-NEXT: %R = icmp eq i32 %X, 1
+; P32-NEXT: ret i1 %R
+
 }
 
 define i1 @test4(i32 %X) {
@@ -40,11 +102,17 @@ define i1 @test4(i32 %X) {
   %Q = load i16* %P
   %R = icmp sle i16 %Q, 73
   ret i1 %R
-; CHECK-LABEL: @test4(
-; CHECK-NEXT: lshr i32 933, %X
-; CHECK-NEXT: and i32 {{.*}}, 1
-; CHECK-NEXT: %R = icmp ne i32 {{.*}}, 0
-; CHECK-NEXT: ret i1 %R
+; NODL-LABEL: @test4(
+; NODL-NEXT: lshr i32 933, %X
+; NODL-NEXT: and i32 {{.*}}, 1
+; NODL-NEXT: %R = icmp ne i32 {{.*}}, 0
+; NODL-NEXT: ret i1 %R
+
+; P32-LABEL: @test4(
+; P32-NEXT: lshr i32 933, %X
+; P32-NEXT: and i32 {{.*}}, 1
+; P32-NEXT: %R = icmp ne i32 {{.*}}, 0
+; P32-NEXT: ret i1 %R
 }
 
 define i1 @test4_i16(i16 %X) {
@@ -52,11 +120,19 @@ define i1 @test4_i16(i16 %X) {
   %Q = load i16* %P
   %R = icmp sle i16 %Q, 73
   ret i1 %R
-; CHECK-LABEL: @test4_i16(
-; CHECK-NEXT: lshr i16 933, %X
-; CHECK-NEXT: and i16 {{.*}}, 1
-; CHECK-NEXT: %R = icmp ne i16 {{.*}}, 0
-; CHECK-NEXT: ret i1 %R
+
+; NODL-LABEL: @test4_i16(
+; NODL-NEXT: lshr i16 933, %X
+; NODL-NEXT: and i16 {{.*}}, 1
+; NODL-NEXT: %R = icmp ne i16 {{.*}}, 0
+; NODL-NEXT: ret i1 %R
+
+; P32-LABEL: @test4_i16(
+; P32-NEXT: sext i16 %X to i32
+; P32-NEXT: lshr i32 933, %1
+; P32-NEXT: and i32 {{.*}}, 1
+; P32-NEXT: %R = icmp ne i32 {{.*}}, 0
+; P32-NEXT: ret i1 %R
 }
 
 define i1 @test5(i32 %X) {
@@ -64,11 +140,17 @@ define i1 @test5(i32 %X) {
   %Q = load i16* %P
   %R = icmp eq i16 %Q, 69
   ret i1 %R
-; CHECK-LABEL: @test5(
-; CHECK-NEXT: icmp eq i32 %X, 2
-; CHECK-NEXT: icmp eq i32 %X, 7
-; CHECK-NEXT: %R = or i1
-; CHECK-NEXT: ret i1 %R
+; NODL-LABEL: @test5(
+; NODL-NEXT: icmp eq i32 %X, 2
+; NODL-NEXT: icmp eq i32 %X, 7
+; NODL-NEXT: %R = or i1
+; NODL-NEXT: ret i1 %R
+
+; P32-LABEL: @test5(
+; P32-NEXT: icmp eq i32 %X, 2
+; P32-NEXT: icmp eq i32 %X, 7
+; P32-NEXT: %R = or i1
+; P32-NEXT: ret i1 %R
 }
 
 define i1 @test6(i32 %X) {
@@ -76,10 +158,15 @@ define i1 @test6(i32 %X) {
   %Q = load double* %P
   %R = fcmp ogt double %Q, 0.0
   ret i1 %R
-; CHECK-LABEL: @test6(
-; CHECK-NEXT: add i32 %X, -1
-; CHECK-NEXT: %R = icmp ult i32 {{.*}}, 3
-; CHECK-NEXT: ret i1 %R
+; NODL-LABEL: @test6(
+; NODL-NEXT: add i32 %X, -1
+; NODL-NEXT: %R = icmp ult i32 {{.*}}, 3
+; NODL-NEXT: ret i1 %R
+
+; P32-LABEL: @test6(
+; P32-NEXT: add i32 %X, -1
+; P32-NEXT: %R = icmp ult i32 {{.*}}, 3
+; P32-NEXT: ret i1 %R
 }
 
 define i1 @test7(i32 %X) {
@@ -87,10 +174,15 @@ define i1 @test7(i32 %X) {
   %Q = load double* %P
   %R = fcmp olt double %Q, 0.0
   ret i1 %R
-; CHECK-LABEL: @test7(
-; CHECK-NEXT: add i32 %X, -1
-; CHECK-NEXT: %R = icmp ugt i32 {{.*}}, 2
-; CHECK-NEXT: ret i1 %R
+; NODL-LABEL: @test7(
+; NODL-NEXT: add i32 %X, -1
+; NODL-NEXT: %R = icmp ugt i32 {{.*}}, 2
+; NODL-NEXT: ret i1 %R
+
+; P32-LABEL: @test7(
+; P32-NEXT: add i32 %X, -1
+; P32-NEXT: %R = icmp ugt i32 {{.*}}, 2
+; P32-NEXT: ret i1 %R
 }
 
 define i1 @test8(i32 %X) {
@@ -99,10 +191,15 @@ define i1 @test8(i32 %X) {
   %R = and i16 %Q, 3
   %S = icmp eq i16 %R, 0
   ret i1 %S
-; CHECK-LABEL: @test8(
-; CHECK-NEXT: and i32 %X, -2
-; CHECK-NEXT: icmp eq i32 {{.*}}, 8
-; CHECK-NEXT: ret i1
+; NODL-LABEL: @test8(
+; NODL-NEXT: and i32 %X, -2
+; NODL-NEXT: icmp eq i32 {{.*}}, 8
+; NODL-NEXT: ret i1
+
+; P32-LABEL: @test8(
+; P32-NEXT: and i32 %X, -2
+; P32-NEXT: icmp eq i32 {{.*}}, 8
+; P32-NEXT: ret i1
 }
 
 @GA = internal constant [4 x { i32, i32 } ] [
@@ -117,8 +214,161 @@ define i1 @test9(i32 %X) {
   %Q = load i32* %P
   %R = icmp eq i32 %Q, 1
   ret i1 %R
-; CHECK-LABEL: @test9(
-; CHECK-NEXT: add i32 %X, -1
-; CHECK-NEXT: icmp ult i32 {{.*}}, 2
-; CHECK-NEXT: ret i1
+; NODL-LABEL: @test9(
+; NODL-NEXT: add i32 %X, -1
+; NODL-NEXT: icmp ult i32 {{.*}}, 2
+; NODL-NEXT: ret i1
+
+; P32-LABEL: @test9(
+; P32-NEXT: add i32 %X, -1
+; P32-NEXT: icmp ult i32 {{.*}}, 2
+; P32-NEXT: ret i1
+}
+
+define i1 @test10_struct(i32 %x) {
+; NODL-LABEL: @test10_struct(
+; NODL: getelementptr inbounds %Foo* @GS, i32 %x, i32 0
+
+; P32-LABEL: @test10_struct(
+; P32: getelementptr inbounds %Foo* @GS, i32 %x, i32 0
+  %p = getelementptr inbounds %Foo* @GS, i32 %x, i32 0
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 9
+  ret i1 %r
+}
+
+define i1 @test10_struct_noinbounds(i32 %x) {
+; NODL-LABEL: @test10_struct_noinbounds(
+; NODL: getelementptr %Foo* @GS, i32 %x, i32 0
+
+; P32-LABEL: @test10_struct_noinbounds(
+; P32: getelementptr %Foo* @GS, i32 %x, i32 0
+  %p = getelementptr %Foo* @GS, i32 %x, i32 0
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 9
+  ret i1 %r
+}
+
+; Test that the GEP indices are converted before we ever get here
+; Index < ptr size
+define i1 @test10_struct_i16(i16 %x){
+; NODL-LABEL: @test10_struct_i16(
+; NODL: getelementptr inbounds %Foo* @GS, i16 %x, i32 0
+
+; P32-LABEL: @test10_struct_i16(
+; P32: %1 = sext i16 %x to i32
+; P32: getelementptr inbounds %Foo* @GS, i32 %1, i32 0
+  %p = getelementptr inbounds %Foo* @GS, i16 %x, i32 0
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 0
+  ret i1 %r
+}
+
+; Test that the GEP indices are converted before we ever get here
+; Index > ptr size
+define i1 @test10_struct_i64(i64 %x){
+; NODL-LABEL: @test10_struct_i64(
+; NODL: getelementptr inbounds %Foo* @GS, i64 %x, i32 0
+
+; P32-LABEL: @test10_struct_i64(
+; P32: %1 = trunc i64 %x to i32
+; P32: getelementptr inbounds %Foo* @GS, i32 %1, i32 0
+  %p = getelementptr inbounds %Foo* @GS, i64 %x, i32 0
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 0
+  ret i1 %r
+}
+
+define i1 @test10_struct_noinbounds_i16(i16 %x) {
+; NODL-LABEL: @test10_struct_noinbounds_i16(
+; NODL: getelementptr %Foo* @GS, i16 %x, i32 0
+
+; P32-LABEL: @test10_struct_noinbounds_i16(
+; P32: %1 = sext i16 %x to i32
+; P32: getelementptr %Foo* @GS, i32 %1, i32 0
+  %p = getelementptr %Foo* @GS, i16 %x, i32 0
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 0
+  ret i1 %r
+}
+
+define i1 @test10_struct_arr(i32 %x) {
+; NODL-LABEL: @test10_struct_arr(
+; NODL-NEXT: %r = icmp ne i32 %x, 1
+; NODL-NEXT: ret i1 %r
+
+; P32-LABEL: @test10_struct_arr(
+; P32-NEXT: %r = icmp ne i32 %x, 1
+; P32-NEXT: ret i1 %r
+  %p = getelementptr inbounds [4 x %Foo]* @GStructArr, i32 0, i32 %x, i32 2
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 9
+  ret i1 %r
+}
+
+define i1 @test10_struct_arr_noinbounds(i32 %x) {
+; NODL-LABEL: @test10_struct_arr_noinbounds(
+; NODL-NEXT  %p = getelementptr [4 x %Foo]* @GStructArr, i32 0, i32 %x, i32 2
+
+; P32-LABEL: @test10_struct_arr_noinbounds(
+; P32-NEXT  %p = getelementptr [4 x %Foo]* @GStructArr, i32 0, i32 %x, i32 2
+  %p = getelementptr [4 x %Foo]* @GStructArr, i32 0, i32 %x, i32 2
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 9
+  ret i1 %r
+}
+
+define i1 @test10_struct_arr_i16(i16 %x) {
+; NODL-LABEL: @test10_struct_arr_i16(
+; NODL-NEXT: %r = icmp ne i16 %x, 1
+; NODL-NEXT: ret i1 %r
+
+; P32-LABEL: @test10_struct_arr_i16(
+; P32-NEXT: %r = icmp ne i16 %x, 1
+; P32-NEXT: ret i1 %r
+  %p = getelementptr inbounds [4 x %Foo]* @GStructArr, i16 0, i16 %x, i32 2
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 9
+  ret i1 %r
+}
+
+define i1 @test10_struct_arr_i64(i64 %x) {
+; NODL-LABEL: @test10_struct_arr_i64(
+; NODL-NEXT: %r = icmp ne i64 %x, 1
+; NODL-NEXT: ret i1 %r
+
+; P32-LABEL: @test10_struct_arr_i64(
+; P32-NEXT: trunc i64 %x to i32
+; P32-NEXT: %r = icmp ne i32 %1, 1
+; P32-NEXT: ret i1 %r
+  %p = getelementptr inbounds [4 x %Foo]* @GStructArr, i64 0, i64 %x, i32 2
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 9
+  ret i1 %r
+}
+
+define i1 @test10_struct_arr_noinbounds_i16(i16 %x) {
+; NODL-LABEL: @test10_struct_arr_noinbounds_i16(
+; NODL-NEXT:  %p = getelementptr [4 x %Foo]* @GStructArr, i32 0, i16 %x, i32 2
+
+; P32-LABEL: @test10_struct_arr_noinbounds_i16(
+; P32-NEXT: %r = icmp ne i16 %x, 1
+  %p = getelementptr [4 x %Foo]* @GStructArr, i32 0, i16 %x, i32 2
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 9
+  ret i1 %r
+}
+
+define i1 @test10_struct_arr_noinbounds_i64(i64 %x) {
+; FIXME: Should be no trunc?
+; NODL-LABEL: @test10_struct_arr_noinbounds_i64(
+; NODL-NEXT:  %p = getelementptr [4 x %Foo]* @GStructArr, i32 0, i64 %x, i32 2
+
+; P32-LABEL: @test10_struct_arr_noinbounds_i64(
+; P32: %r = icmp ne i32 %1, 1
+; P32-NEXT: ret i1 %r
+  %p = getelementptr [4 x %Foo]* @GStructArr, i32 0, i64 %x, i32 2
+  %q = load i32* %p
+  %r = icmp eq i32 %q, 9
+  ret i1 %r
 }
diff --git a/test/Transforms/InstCombine/multi-size-address-space-pointer.ll b/test/Transforms/InstCombine/multi-size-address-space-pointer.ll
new file mode 100644
index 0000000..2d88bed
--- /dev/null
+++ b/test/Transforms/InstCombine/multi-size-address-space-pointer.ll
@@ -0,0 +1,112 @@
+; RUN: opt -S -instcombine %s -o - | FileCheck %s
+target datalayout = "e-p:32:32:32-p1:64:64:64-p2:8:8:8-p3:16:16:16-p4:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32"
+
+
+define i32 @test_as0(i32 addrspace(0)* %a) {
+; CHECK-LABEL: @test_as0(
+; CHECK: %arrayidx = getelementptr i32* %a, i32 1
+  %arrayidx = getelementptr i32 addrspace(0)* %a, i64 1
+  %y = load i32 addrspace(0)* %arrayidx, align 4
+  ret i32 %y
+}
+
+define i32 @test_as1(i32 addrspace(1)* %a) {
+; CHECK-LABEL: @test_as1(
+; CHECK: %arrayidx = getelementptr i32 addrspace(1)* %a, i64 1
+  %arrayidx = getelementptr i32 addrspace(1)* %a, i32 1
+  %y = load i32 addrspace(1)* %arrayidx, align 4
+  ret i32 %y
+}
+
+define i32 @test_as2(i32 addrspace(2)* %a) {
+; CHECK-LABEL: @test_as2(
+; CHECK: %arrayidx = getelementptr i32 addrspace(2)* %a, i8 1
+  %arrayidx = getelementptr i32 addrspace(2)* %a, i32 1
+  %y = load i32 addrspace(2)* %arrayidx, align 4
+  ret i32 %y
+}
+
+define i32 @test_as3(i32 addrspace(3)* %a) {
+; CHECK-LABEL: @test_as3(
+; CHECK: %arrayidx = getelementptr i32 addrspace(3)* %a, i16 1
+  %arrayidx = getelementptr i32 addrspace(3)* %a, i32 1
+  %y = load i32 addrspace(3)* %arrayidx, align 4
+  ret i32 %y
+}
+
+define i32 @test_combine_ptrtoint(i32 addrspace(2)* %a) {
+; CHECK-LABEL: @test_combine_ptrtoint(
+; CHECK-NEXT: %y = load i32 addrspace(2)* %a
+; CHECK-NEXT: ret i32 %y
+  %cast = ptrtoint i32 addrspace(2)* %a to i8
+  %castback = inttoptr i8 %cast to i32 addrspace(2)*
+  %y = load i32 addrspace(2)* %castback, align 4
+  ret i32 %y
+}
+
+define i8 @test_combine_inttoptr(i8 %a) {
+; CHECK-LABEL: @test_combine_inttoptr(
+; CHECK-NEXT: ret i8 %a
+  %cast = inttoptr i8 %a to i32 addrspace(2)*
+  %castback = ptrtoint i32 addrspace(2)* %cast to i8
+  ret i8 %castback
+}
+
+define i32 @test_combine_vector_ptrtoint(<2 x i32 addrspace(2)*> %a) {
+; CHECK-LABEL: @test_combine_vector_ptrtoint(
+; CHECK-NEXT: %p = extractelement <2 x i32 addrspace(2)*> %a, i32 0
+; CHECK-NEXT: %y = load i32 addrspace(2)* %p, align 4
+; CHECK-NEXT: ret i32 %y
+  %cast = ptrtoint <2 x i32 addrspace(2)*> %a to <2 x i8>
+  %castback = inttoptr <2 x i8> %cast to <2 x i32 addrspace(2)*>
+  %p = extractelement <2 x i32 addrspace(2)*> %castback, i32 0
+  %y = load i32 addrspace(2)* %p, align 4
+  ret i32 %y
+}
+
+define <2 x i8> @test_combine_vector_inttoptr(<2 x i8> %a) {
+; CHECK-LABEL: @test_combine_vector_inttoptr(
+; CHECK-NEXT: ret <2 x i8> %a
+  %cast = inttoptr <2 x i8> %a to <2 x i32 addrspace(2)*>
+  %castback = ptrtoint <2 x i32 addrspace(2)*> %cast to <2 x i8>
+  ret <2 x i8> %castback
+}
+
+; Check that the GEP index is changed to the address space integer type (i64 -> i8)
+define i32 addrspace(2)* @shrink_gep_constant_index_64_as2(i32 addrspace(2)* %p) {
+; CHECK-LABEL: @shrink_gep_constant_index_64_as2(
+; CHECK-NEXT: getelementptr i32 addrspace(2)* %p, i8 1
+  %ret = getelementptr i32 addrspace(2)* %p, i64 1
+  ret i32 addrspace(2)* %ret
+}
+
+define i32 addrspace(2)* @shrink_gep_constant_index_32_as2(i32 addrspace(2)* %p) {
+; CHECK-LABEL: @shrink_gep_constant_index_32_as2(
+; CHECK-NEXT: getelementptr i32 addrspace(2)* %p, i8 1
+  %ret = getelementptr i32 addrspace(2)* %p, i32 1
+  ret i32 addrspace(2)* %ret
+}
+
+define i32 addrspace(3)* @shrink_gep_constant_index_64_as3(i32 addrspace(3)* %p) {
+; CHECK-LABEL: @shrink_gep_constant_index_64_as3(
+; CHECK-NEXT: getelementptr i32 addrspace(3)* %p, i16 1
+  %ret = getelementptr i32 addrspace(3)* %p, i64 1
+  ret i32 addrspace(3)* %ret
+}
+
+define i32 addrspace(2)* @shrink_gep_variable_index_64_as2(i32 addrspace(2)* %p, i64 %idx) {
+; CHECK-LABEL: @shrink_gep_variable_index_64_as2(
+; CHECK-NEXT: %1 = trunc i64 %idx to i8
+; CHECK-NEXT: getelementptr i32 addrspace(2)* %p, i8 %1
+  %ret = getelementptr i32 addrspace(2)* %p, i64 %idx
+  ret i32 addrspace(2)* %ret
+}
+
+define i32 addrspace(1)* @grow_gep_variable_index_8_as1(i32 addrspace(1)* %p, i8 %idx) {
+; CHECK-LABEL: @grow_gep_variable_index_8_as1(
+; CHECK-NEXT: %1 = sext i8 %idx to i64
+; CHECK-NEXT: getelementptr i32 addrspace(1)* %p, i64 %1
+  %ret = getelementptr i32 addrspace(1)* %p, i8 %idx
+  ret i32 addrspace(1)* %ret
+}
+
diff --git a/test/Transforms/InstCombine/objsize-address-space.ll b/test/Transforms/InstCombine/objsize-address-space.ll
new file mode 100644
index 0000000..9cb6884
--- /dev/null
+++ b/test/Transforms/InstCombine/objsize-address-space.ll
@@ -0,0 +1,80 @@
+; RUN: opt -S -instcombine -o - %s | FileCheck %s
+target datalayout = "e-p:32:32:32-p1:64:64:64-p2:8:8:8-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32"
+
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readonly
+declare i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)*, i1) nounwind readonly
+declare i32 @llvm.objectsize.i32.p2i8(i8 addrspace(2)*, i1) nounwind readonly
+declare i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)*, i1) nounwind readonly
+declare i16 @llvm.objectsize.i16.p3i8(i8 addrspace(3)*, i1) nounwind readonly
+
+@array_as2 = private addrspace(2) global [60 x i8] zeroinitializer, align 4
+
+@array_as1_pointers = private global [10 x i32 addrspace(1)*] zeroinitializer, align 4
+@array_as2_pointers = private global [24 x i32 addrspace(2)*] zeroinitializer, align 4
+@array_as3_pointers = private global [42 x i32 addrspace(3)*] zeroinitializer, align 4
+
+@array_as2_as1_pointer_pointers = private global [16 x i32 addrspace(2)* addrspace(1)*] zeroinitializer, align 4
+
+
+@a_as3 = private addrspace(3) global [60 x i8] zeroinitializer, align 1
+
+define i32 @foo_as3() nounwind {
+; CHECK-LABEL: @foo_as3(
+; CHECK-NEXT: ret i32 60
+  %1 = call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* getelementptr inbounds ([60 x i8] addrspace(3)* @a_as3, i32 0, i32 0), i1 false)
+  ret i32 %1
+}
+
+define i16 @foo_as3_i16() nounwind {
+; CHECK-LABEL: @foo_as3_i16(
+; CHECK-NEXT: ret i16 60
+  %1 = call i16 @llvm.objectsize.i16.p3i8(i8 addrspace(3)* getelementptr inbounds ([60 x i8] addrspace(3)* @a_as3, i32 0, i32 0), i1 false)
+  ret i16 %1
+}
+
+@a_alias = alias weak [60 x i8] addrspace(3)* @a_as3
+define i32 @foo_alias() nounwind {
+  %1 = call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* getelementptr inbounds ([60 x i8] addrspace(3)* @a_alias, i32 0, i32 0), i1 false)
+  ret i32 %1
+}
+
+define i32 @array_as2_size() {
+; CHECK-LABEL: @array_as2_size(
+; CHECK-NEXT: ret i32 60
+  %bc = bitcast [60 x i8] addrspace(2)* @array_as2 to i8 addrspace(2)*
+  %1 = call i32 @llvm.objectsize.i32.p2i8(i8 addrspace(2)* %bc, i1 false)
+  ret i32 %1
+}
+
+define i32 @pointer_array_as1() {
+; CHECK-LABEL: @pointer_array_as1(
+; CHECK-NEXT: ret i32 80
+  %bc = addrspacecast [10 x i32 addrspace(1)*]* @array_as1_pointers to i8 addrspace(1)*
+  %1 = call i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)* %bc, i1 false)
+  ret i32 %1
+}
+
+define i32 @pointer_array_as2() {
+; CHECK-LABEL: @pointer_array_as2(
+; CHECK-NEXT: ret i32 24
+  %bc = bitcast [24 x i32 addrspace(2)*]* @array_as2_pointers to i8*
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false)
+  ret i32 %1
+}
+
+define i32 @pointer_array_as3() {
+; CHECK-LABEL: @pointer_array_as3(
+; CHECK-NEXT: ret i32 84
+  %bc = bitcast [42 x i32 addrspace(3)*]* @array_as3_pointers to i8*
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false)
+  ret i32 %1
+}
+
+define i32 @pointer_pointer_array_as2_as1() {
+; CHECK-LABEL: @pointer_pointer_array_as2_as1(
+; CHECK-NEXT: ret i32 128
+  %bc = bitcast [16 x i32 addrspace(2)* addrspace(1)*]* @array_as2_as1_pointer_pointers to i8*
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false)
+  ret i32 %1
+}
+
diff --git a/test/Transforms/InstCombine/objsize.ll b/test/Transforms/InstCombine/objsize.ll
index b5351e9..6459032 100644
--- a/test/Transforms/InstCombine/objsize.ll
+++ b/test/Transforms/InstCombine/objsize.ll
@@ -5,11 +5,10 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 
 @a = private global [60 x i8] zeroinitializer, align 1 ; <[60 x i8]*>
 @.str = private constant [8 x i8] c"abcdefg\00"   ; <[8 x i8]*>
-
 define i32 @foo() nounwind {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT: ret i32 60
-  %1 = call i32 @llvm.objectsize.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
   ret i32 %1
 }
 
@@ -17,7 +16,7 @@ define i8* @bar() nounwind {
 ; CHECK-LABEL: @bar(
 entry:
   %retval = alloca i8*
-  %0 = call i32 @llvm.objectsize.i32(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
+  %0 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i1 false)
   %cmp = icmp ne i32 %0, -1
 ; CHECK: br i1 true
   br i1 %cmp, label %cond.true, label %cond.false
@@ -34,7 +33,7 @@ cond.false:
 define i32 @f() nounwind {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT: ret i32 0
-  %1 = call i32 @llvm.objectsize.i32(i8* getelementptr ([60 x i8]* @a, i32 1, i32 0), i1 false)
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr ([60 x i8]* @a, i32 1, i32 0), i1 false)
   ret i32 %1
 }
 
@@ -43,16 +42,16 @@ define i32 @f() nounwind {
 define i1 @baz() nounwind {
 ; CHECK-LABEL: @baz(
 ; CHECK-NEXT: objectsize
-  %1 = tail call i32 @llvm.objectsize.i32(i8* getelementptr inbounds ([0 x i8]* @window, i32 0, i32 0), i1 false)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([0 x i8]* @window, i32 0, i32 0), i1 false)
   %2 = icmp eq i32 %1, -1
   ret i1 %2
 }
 
 define void @test1(i8* %q, i32 %x) nounwind noinline {
 ; CHECK-LABEL: @test1(
-; CHECK: objectsize.i32
+; CHECK: objectsize.i32.p0i8
 entry:
-  %0 = call i32 @llvm.objectsize.i32(i8* getelementptr inbounds ([0 x i8]* @window, i32 0, i32 10), i1 false) ; <i64> [#uses=1]
+  %0 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([0 x i8]* @window, i32 0, i32 10), i1 false) ; <i64> [#uses=1]
   %1 = icmp eq i32 %0, -1                         ; <i1> [#uses=1]
   br i1 %1, label %"47", label %"46"
 
@@ -68,7 +67,7 @@ entry:
 define i32 @test2() nounwind {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT: ret i32 34
-  %1 = call i32 @llvm.objectsize.i32(i8* getelementptr (i8* bitcast ([9 x i32]* @.str5 to i8*), i32 2), i1 false)
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr (i8* bitcast ([9 x i32]* @.str5 to i8*), i32 2), i1 false)
   ret i32 %1
 }
 
@@ -77,7 +76,7 @@ define i32 @test2() nounwind {
 
 declare i8* @__memcpy_chk(i8*, i8*, i32, i32) nounwind
 
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readonly
 
 declare i8* @__inline_memcpy_chk(i8*, i8*, i32) nounwind inlinehint
 
@@ -89,7 +88,7 @@ entry:
 bb11:
   %0 = getelementptr inbounds float* getelementptr inbounds ([480 x float]* @array, i32 0, i32 128), i32 -127 ; <float*> [#uses=1]
   %1 = bitcast float* %0 to i8*                   ; <i8*> [#uses=1]
-  %2 = call i32 @llvm.objectsize.i32(i8* %1, i1 false) ; <i32> [#uses=1]
+  %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false) ; <i32> [#uses=1]
   %3 = call i8* @__memcpy_chk(i8* undef, i8* undef, i32 512, i32 %2) nounwind ; <i8*> [#uses=0]
 ; CHECK: unreachable
   unreachable
@@ -111,7 +110,7 @@ define i32 @test4(i8** %esc) nounwind ssp {
 entry:
   %0 = alloca %struct.data, align 8
   %1 = bitcast %struct.data* %0 to i8*
-  %2 = call i32 @llvm.objectsize.i32(i8* %1, i1 false) nounwind
+  %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false) nounwind
 ; CHECK-NOT: @llvm.objectsize
 ; CHECK: @llvm.memset.p0i8.i32(i8* %1, i8 0, i32 1824, i32 8, i1 false)
   %3 = call i8* @__memset_chk(i8* %1, i32 0, i32 1824, i32 %2) nounwind
@@ -126,7 +125,7 @@ define i8* @test5(i32 %n) nounwind ssp {
 ; CHECK-LABEL: @test5(
 entry:
   %0 = tail call noalias i8* @malloc(i32 20) nounwind
-  %1 = tail call i32 @llvm.objectsize.i32(i8* %0, i1 false)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %0, i1 false)
   %2 = load i8** @s, align 8
 ; CHECK-NOT: @llvm.objectsize
 ; CHECK: @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 10, i32 1, i1 false)
@@ -138,7 +137,7 @@ define void @test6(i32 %n) nounwind ssp {
 ; CHECK-LABEL: @test6(
 entry:
   %0 = tail call noalias i8* @malloc(i32 20) nounwind
-  %1 = tail call i32 @llvm.objectsize.i32(i8* %0, i1 false)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %0, i1 false)
   %2 = load i8** @s, align 8
 ; CHECK-NOT: @llvm.objectsize
 ; CHECK: @__memcpy_chk(i8* %0, i8* %1, i32 30, i32 20)
@@ -155,7 +154,7 @@ define i32 @test7(i8** %esc) {
   %alloc = call noalias i8* @malloc(i32 48) nounwind
   store i8* %alloc, i8** %esc
   %gep = getelementptr inbounds i8* %alloc, i32 16
-  %objsize = call i32 @llvm.objectsize.i32(i8* %gep, i1 false) nounwind readonly
+  %objsize = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 false) nounwind readonly
 ; CHECK: ret i32 32
   ret i32 %objsize
 }
@@ -167,7 +166,7 @@ define i32 @test8(i8** %esc) {
   %alloc = call noalias i8* @calloc(i32 5, i32 7) nounwind
   store i8* %alloc, i8** %esc
   %gep = getelementptr inbounds i8* %alloc, i32 5
-  %objsize = call i32 @llvm.objectsize.i32(i8* %gep, i1 false) nounwind readonly
+  %objsize = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 false) nounwind readonly
 ; CHECK: ret i32 30
   ret i32 %objsize
 }
@@ -179,7 +178,7 @@ declare noalias i8* @strndup(i8* nocapture, i32) nounwind
 define i32 @test9(i8** %esc) {
   %call = tail call i8* @strdup(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0)) nounwind
   store i8* %call, i8** %esc, align 8
-  %1 = tail call i32 @llvm.objectsize.i32(i8* %call, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true)
 ; CHECK: ret i32 8
   ret i32 %1
 }
@@ -188,7 +187,7 @@ define i32 @test9(i8** %esc) {
 define i32 @test10(i8** %esc) {
   %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0), i32 3) nounwind
   store i8* %call, i8** %esc, align 8
-  %1 = tail call i32 @llvm.objectsize.i32(i8* %call, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true)
 ; CHECK: ret i32 4
   ret i32 %1
 }
@@ -197,7 +196,7 @@ define i32 @test10(i8** %esc) {
 define i32 @test11(i8** %esc) {
   %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0), i32 7) nounwind
   store i8* %call, i8** %esc, align 8
-  %1 = tail call i32 @llvm.objectsize.i32(i8* %call, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true)
 ; CHECK: ret i32 8
   ret i32 %1
 }
@@ -206,7 +205,7 @@ define i32 @test11(i8** %esc) {
 define i32 @test12(i8** %esc) {
   %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0), i32 8) nounwind
   store i8* %call, i8** %esc, align 8
-  %1 = tail call i32 @llvm.objectsize.i32(i8* %call, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true)
 ; CHECK: ret i32 8
   ret i32 %1
 }
@@ -215,7 +214,7 @@ define i32 @test12(i8** %esc) {
 define i32 @test13(i8** %esc) {
   %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8]* @.str, i64 0, i64 0), i32 57) nounwind
   store i8* %call, i8** %esc, align 8
-  %1 = tail call i32 @llvm.objectsize.i32(i8* %call, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true)
 ; CHECK: ret i32 8
   ret i32 %1
 }
@@ -229,8 +228,8 @@ entry:
 xpto:
   %select = select i1 %bool, i8* %select, i8* %a
   %select2 = select i1 %bool, i8* %a, i8* %select2
-  %0 = tail call i32 @llvm.objectsize.i32(i8* %select, i1 true)
-  %1 = tail call i32 @llvm.objectsize.i32(i8* %select2, i1 true)
+  %0 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %select, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %select2, i1 true)
   %2 = add i32 %0, %1
 ; CHECK: ret i32 undef
   ret i32 %2
@@ -249,7 +248,7 @@ entry:
 xpto:
   %gep2 = getelementptr i8* %gep, i32 1
   %gep = getelementptr i8* %gep2, i32 1
-  %o = call i32 @llvm.objectsize.i32(i8* %gep, i1 true)
+  %o = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 true)
 ; CHECK: ret i32 undef
   ret i32 %o
 
@@ -263,7 +262,7 @@ return:
 ; CHECK-NEXT: ret i32 60
 define i32 @test18() {
   %bc = bitcast [60 x i8]* @globalalias to i8*
-  %1 = call i32 @llvm.objectsize.i32(i8* %bc, i1 false)
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false)
   ret i32 %1
 }
 
@@ -273,6 +272,7 @@ define i32 @test18() {
 ; CHECK: llvm.objectsize
 define i32 @test19() {
   %bc = bitcast [60 x i8]* @globalalias2 to i8*
-  %1 = call i32 @llvm.objectsize.i32(i8* %bc, i1 false)
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false)
   ret i32 %1
 }
+
diff --git a/test/Transforms/InstCombine/onehot_merge.ll b/test/Transforms/InstCombine/onehot_merge.ll
new file mode 100644
index 0000000..51f955c
--- /dev/null
+++ b/test/Transforms/InstCombine/onehot_merge.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+;CHECK: @and_consts
+;CHECK: and i32 %k, 12
+;CHECK: icmp ne i32 %0, 12
+;CHECK: ret
+define i1 @and_consts(i32 %k, i32 %c1, i32 %c2) {
+bb:
+  %tmp1 = and i32 4, %k
+  %tmp2 = icmp eq i32 %tmp1, 0
+  %tmp5 = and i32 8, %k
+  %tmp6 = icmp eq i32 %tmp5, 0
+  %or = or i1 %tmp2, %tmp6
+  ret i1 %or
+}
+
+;CHECK: @foo1_and
+;CHECK:  shl i32 1, %c1
+;CHECK-NEXT:  shl i32 1, %c2
+;CHECK-NEXT:  or i32
+;CHECK-NEXT:  and i32
+;CHECK-NEXT:  icmp ne i32 %1, %0
+;CHECK: ret
+define i1 @foo1_and(i32 %k, i32 %c1, i32 %c2) {
+bb:
+  %tmp = shl i32 1, %c1
+  %tmp4 = shl i32 1, %c2
+  %tmp1 = and i32 %tmp, %k
+  %tmp2 = icmp eq i32 %tmp1, 0
+  %tmp5 = and i32 %tmp4, %k
+  %tmp6 = icmp eq i32 %tmp5, 0
+  %or = or i1 %tmp2, %tmp6
+  ret i1 %or
+}
+
diff --git a/test/Transforms/InstCombine/phi-select-constexpr.ll b/test/Transforms/InstCombine/phi-select-constexpr.ll
new file mode 100644
index 0000000..054e069
--- /dev/null
+++ b/test/Transforms/InstCombine/phi-select-constexpr.ll
@@ -0,0 +1,19 @@
+; RUN: opt < %s -S -instcombine | FileCheck %s
+@A = extern_weak global i32, align 4
+@B = extern_weak global i32, align 4
+
+define i32 @foo(i1 %which) {
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+; CHECK-LABEL: final:
+; CHECK: phi i32 [ 1, %entry ], [ select (i1 icmp eq (i32* @A, i32* @B), i32 2, i32 1), %delay ]
+final:
+  %use2 = phi i1 [ false, %entry ], [ icmp eq (i32* @A, i32* @B), %delay ]
+  %value = select i1 %use2, i32 2, i32 1
+  ret i32 %value
+}
+
diff --git a/test/Transforms/InstCombine/pow-1.ll b/test/Transforms/InstCombine/pow-1.ll
index 0fdafeb..9f1d073 100644
--- a/test/Transforms/InstCombine/pow-1.ll
+++ b/test/Transforms/InstCombine/pow-1.ll
@@ -151,4 +151,17 @@ define double @test_simplify16(double %x) {
 ; CHECK-NEXT: ret double [[RECIPROCAL]]
 }
 
+declare double @llvm.pow.f64(double %Val, double %Power)
+define double @test_simplify17(double %x) {
+; CHECK-LABEL: @test_simplify17(
+  %retval = call double @llvm.pow.f64(double %x, double 0.5)
+; CHECK-NEXT: [[SQRT:%[a-z0-9]+]] = call double @sqrt(double %x) [[NUW_RO]]
+; CHECK-NEXT: [[FABS:%[a-z0-9]+]] = call double @fabs(double [[SQRT]]) [[NUW_RO]]
+; CHECK-NEXT: [[FCMP:%[a-z0-9]+]] = fcmp oeq double %x, 0xFFF0000000000000
+; CHECK-NEXT: [[SELECT:%[a-z0-9]+]] = select i1 [[FCMP]], double 0x7FF0000000000000, double [[FABS]]
+  ret double %retval
+; CHECK-NEXT: ret double [[SELECT]]
+}
+
 ; CHECK: attributes [[NUW_RO]] = { nounwind readonly }
+
diff --git a/test/Transforms/InstCombine/pow-3.ll b/test/Transforms/InstCombine/pow-3.ll
new file mode 100644
index 0000000..1c5cf91
--- /dev/null
+++ b/test/Transforms/InstCombine/pow-3.ll
@@ -0,0 +1,12 @@
+; Test that the pow won't get simplified to sqrt(fabs) when they are not available.
+;
+; RUN: opt < %s -disable-simplify-libcalls -instcombine -S | FileCheck %s
+
+declare double @llvm.pow.f64(double %Val, double %Power)
+
+define double @test_simplify_unavailable(double %x) {
+; CHECK-LABEL: @test_simplify_unavailable(
+  %retval = call double @llvm.pow.f64(double %x, double 0.5)
+; CHECK-NEXT: call double @llvm.pow.f64(double %x, double 5.000000e-01)
+  ret double %retval
+}
diff --git a/test/Transforms/InstCombine/pr17827.ll b/test/Transforms/InstCombine/pr17827.ll
new file mode 100644
index 0000000..a8b5926
--- /dev/null
+++ b/test/Transforms/InstCombine/pr17827.ll
@@ -0,0 +1,74 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; With left shift, the comparison should not be modified.
+; CHECK-LABEL: @test_shift_and_cmp_not_changed1(
+; CHECK: icmp slt i8 %andp, 32
+define i1 @test_shift_and_cmp_not_changed1(i8 %p) #0 {
+entry:
+  %shlp = shl i8 %p, 5
+  %andp = and i8 %shlp, -64
+  %cmp = icmp slt i8 %andp, 32
+  ret i1 %cmp
+}
+
+; With arithmetic right shift, the comparison should not be modified.
+; CHECK-LABEL: @test_shift_and_cmp_not_changed2(
+; CHECK: icmp slt i8 %andp, 32
+define i1 @test_shift_and_cmp_not_changed2(i8 %p) #0 {
+entry:
+  %shlp = ashr i8 %p, 5
+  %andp = and i8 %shlp, -64
+  %cmp = icmp slt i8 %andp, 32
+  ret i1 %cmp
+}
+
+; This should simplify functionally to the left shift case.
+; The extra input parameter should be optimized away.
+; CHECK-LABEL: @test_shift_and_cmp_changed1(
+; CHECK:  %andp = shl i8 %p, 5
+; CHECK-NEXT: %shl = and i8 %andp, -64
+; CHECK-NEXT:  %cmp = icmp slt i8 %shl, 32
+define i1 @test_shift_and_cmp_changed1(i8 %p, i8 %q) #0 {
+entry:
+  %andp = and i8 %p, 6
+  %andq = and i8 %q, 8
+  %or = or i8 %andq, %andp
+  %shl = shl i8 %or, 5
+  %ashr = ashr i8 %shl, 5
+  %cmp = icmp slt i8 %ashr, 1
+  ret i1 %cmp
+}
+
+; Unsigned compare allows a transformation to compare against 0.
+; CHECK-LABEL: @test_shift_and_cmp_changed2(
+; CHECK: icmp eq i8 %andp, 0
+define i1 @test_shift_and_cmp_changed2(i8 %p) #0 {
+entry:
+  %shlp = shl i8 %p, 5
+  %andp = and i8 %shlp, -64
+  %cmp = icmp ult i8 %andp, 32
+  ret i1 %cmp
+}
+
+; nsw on the shift should not affect the comparison.
+; CHECK-LABEL: @test_shift_and_cmp_changed3(
+; CHECK: icmp slt i8 %andp, 32
+define i1 @test_shift_and_cmp_changed3(i8 %p) #0 {
+entry:
+  %shlp = shl nsw i8 %p, 5
+  %andp = and i8 %shlp, -64
+  %cmp = icmp slt i8 %andp, 32
+  ret i1 %cmp
+}
+
+; Logical shift right allows a return true because the 'and' guarantees no bits are set.
+; CHECK-LABEL: @test_shift_and_cmp_changed4(
+; CHECK: ret i1 true
+define i1 @test_shift_and_cmp_changed4(i8 %p) #0 {
+entry:
+  %shlp = lshr i8 %p, 5
+  %andp = and i8 %shlp, -64
+  %cmp = icmp slt i8 %andp, 32
+  ret i1 %cmp
+}
+
diff --git a/test/Transforms/InstCombine/printf-1.ll b/test/Transforms/InstCombine/printf-1.ll
index 59d0f16..c98ddd5 100644
--- a/test/Transforms/InstCombine/printf-1.ll
+++ b/test/Transforms/InstCombine/printf-1.ll
@@ -1,7 +1,7 @@
 ; Test that the printf library call simplifier works correctly.
 ;
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple xcore-xmos-elf -instcombine -S | FileCheck %s -check-prefix=IPRINTF
+; RUN: opt < %s -mtriple xcore-xmos-elf -instcombine -S | FileCheck %s -check-prefix=CHECK-IPRINTF
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 
diff --git a/test/Transforms/InstCombine/select-2.ll b/test/Transforms/InstCombine/select-2.ll
index a76addc..5b9deb4 100644
--- a/test/Transforms/InstCombine/select-2.ll
+++ b/test/Transforms/InstCombine/select-2.ll
@@ -1,4 +1,7 @@
-; RUN: opt < %s -instcombine -S | grep select | count 2
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; CHECK: select
+; CHECK: select
 
 ; Make sure instcombine don't fold select into operands. We don't want to emit
 ; select of two integers unless it's selecting 0 / 1.
diff --git a/test/Transforms/InstCombine/select-extractelement.ll b/test/Transforms/InstCombine/select-extractelement.ll
new file mode 100644
index 0000000..e7ea851
--- /dev/null
+++ b/test/Transforms/InstCombine/select-extractelement.ll
@@ -0,0 +1,102 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+declare void @v4float_user(<4 x float>) #0
+
+
+
+define float @extract_one_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+; CHECK-LABEL: @extract_one_select(
+; CHECK-NOT: select i1 {{.*}}, <4 x float>
+  %cmp = icmp ne i32 %c, 0
+  %sel = select i1 %cmp, <4 x float> %a, <4 x float> %b
+  %extract = extractelement <4 x float> %sel, i32 2
+  ret float %extract
+}
+
+; Multiple extractelements
+define <2 x float> @extract_two_select(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+; CHECK-LABEL: @extract_two_select(
+; CHECK: select i1 {{.*}}, <4 x float>
+  %cmp = icmp ne i32 %c, 0
+  %sel = select i1 %cmp, <4 x float> %a, <4 x float> %b
+  %extract1 = extractelement <4 x float> %sel, i32 1
+  %extract2 = extractelement <4 x float> %sel, i32 2
+  %build1 = insertelement <2 x float> undef, float %extract1, i32 0
+  %build2 = insertelement <2 x float> %build1, float %extract2, i32 1
+  ret <2 x float> %build2
+}
+
+; Select has an extra non-extractelement user, don't change it
+define float @extract_one_select_user(<4 x float> %a, <4 x float> %b, i32 %c) #0 {
+; CHECK-LABEL: @extract_one_select_user(
+; CHECK: select i1 {{.*}}, <4 x float>
+  %cmp = icmp ne i32 %c, 0
+  %sel = select i1 %cmp, <4 x float> %a, <4 x float> %b
+  %extract = extractelement <4 x float> %sel, i32 2
+  call void @v4float_user(<4 x float> %sel)
+  ret float %extract
+}
+
+define float @extract_one_vselect_user(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @extract_one_vselect_user(
+; CHECK: select <4 x i1> {{.*}}, <4 x float>
+  %cmp = icmp ne <4 x i32> %c, zeroinitializer
+  %sel = select <4 x i1> %cmp, <4 x float> %a, <4 x float> %b
+  %extract = extractelement <4 x float> %sel, i32 2
+  call void @v4float_user(<4 x float> %sel)
+  ret float %extract
+}
+
+; Extract from a vector select
+define float @extract_one_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @extract_one_vselect(
+; CHECK-NOT: select <4 x i1>
+  %cmp = icmp ne <4 x i32> %c, zeroinitializer
+  %select = select <4 x i1> %cmp, <4 x float> %a, <4 x float> %b
+  %extract = extractelement <4 x float> %select, i32 0
+  ret float %extract
+}
+
+; Multiple extractelements from a vector select
+define <2 x float> @extract_two_vselect(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @extract_two_vselect(
+; CHECK-NOT: select i1 {{.*}}, <4 x float>
+  %cmp = icmp ne <4 x i32> %c, zeroinitializer
+  %sel = select <4 x i1> %cmp, <4 x float> %a, <4 x float> %b
+  %extract1 = extractelement <4 x float> %sel, i32 1
+  %extract2 = extractelement <4 x float> %sel, i32 2
+  %build1 = insertelement <2 x float> undef, float %extract1, i32 0
+  %build2 = insertelement <2 x float> %build1, float %extract2, i32 1
+  ret <2 x float> %build2
+}
+
+; All the vector selects should be decomposed into scalar selects
+; Test multiple extractelements
+define <4 x float> @simple_vector_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_vector_select(
+; CHECK-NOT: select i1 {{.*}}, <4 x float>
+entry:
+  %0 = extractelement <4 x i32> %c, i32 0
+  %tobool = icmp ne i32 %0, 0
+  %a.sink = select i1 %tobool, <4 x float> %a, <4 x float> %b
+  %1 = extractelement <4 x float> %a.sink, i32 0
+  %2 = insertelement <4 x float> undef, float %1, i32 0
+  %3 = extractelement <4 x i32> %c, i32 1
+  %tobool1 = icmp ne i32 %3, 0
+  %a.sink1 = select i1 %tobool1, <4 x float> %a, <4 x float> %b
+  %4 = extractelement <4 x float> %a.sink1, i32 1
+  %5 = insertelement <4 x float> %2, float %4, i32 1
+  %6 = extractelement <4 x i32> %c, i32 2
+  %tobool6 = icmp ne i32 %6, 0
+  %a.sink2 = select i1 %tobool6, <4 x float> %a, <4 x float> %b
+  %7 = extractelement <4 x float> %a.sink2, i32 2
+  %8 = insertelement <4 x float> %5, float %7, i32 2
+  %9 = extractelement <4 x i32> %c, i32 3
+  %tobool11 = icmp ne i32 %9, 0
+  %a.sink3 = select i1 %tobool11, <4 x float> %a, <4 x float> %b
+  %10 = extractelement <4 x float> %a.sink3, i32 3
+  %11 = insertelement <4 x float> %8, float %10, i32 3
+  ret <4 x float> %11
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index c7809f7..1458bde 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -986,6 +986,16 @@ define i32 @select_icmp_ne_0_and_8_or_1073741824(i8 %x, i32 %y) {
   ret i32 %select
 }
 
+; We can't combine here, because the cmp is scalar and the or vector.
+; Just make sure we don't assert.
+define <2 x i32> @select_icmp_eq_and_1_0_or_vector_of_2s(i32 %x, <2 x i32> %y) {
+  %and = and i32 %x, 1
+  %cmp = icmp eq i32 %and, 0
+  %or = or <2 x i32> %y, <i32 2, i32 2>
+  %select = select i1 %cmp, <2 x i32> %y, <2 x i32> %or
+  ret <2 x i32> %select
+}
+
 define i32 @test65(i64 %x) {
   %1 = and i64 %x, 16
   %2 = icmp ne i64 %1, 0
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index 0bdab13..b1082f0 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -744,3 +744,39 @@ define i32 @test62(i32 %x) {
 ; CHECK-LABEL: @test62(
 ; CHECK: ashr exact i32 %x, 3
 }
+
+; PR17026
+; CHECK-LABEL: @test63(
+; CHECK-NOT: sh
+; CHECK: ret
+define void @test63(i128 %arg) {
+bb:
+  br i1 undef, label %bb1, label %bb12
+
+bb1:                                              ; preds = %bb11, %bb
+  br label %bb2
+
+bb2:                                              ; preds = %bb7, %bb1
+  br i1 undef, label %bb3, label %bb7
+
+bb3:                                              ; preds = %bb2
+  %tmp = lshr i128 %arg, 36893488147419103232
+  %tmp4 = shl i128 %tmp, 0
+  %tmp5 = or i128 %tmp4, undef
+  %tmp6 = trunc i128 %tmp5 to i16
+  br label %bb8
+
+bb7:                                              ; preds = %bb2
+  br i1 undef, label %bb8, label %bb2
+
+bb8:                                              ; preds = %bb7, %bb3
+  %tmp9 = phi i16 [ %tmp6, %bb3 ], [ undef, %bb7 ]
+  %tmp10 = icmp eq i16 %tmp9, 0
+  br i1 %tmp10, label %bb11, label %bb12
+
+bb11:                                             ; preds = %bb8
+  br i1 undef, label %bb1, label %bb12
+
+bb12:                                             ; preds = %bb11, %bb8, %bb
+  ret void
+}
diff --git a/test/Transforms/InstCombine/sincospi.ll b/test/Transforms/InstCombine/sincospi.ll
new file mode 100644
index 0000000..0d1a602
--- /dev/null
+++ b/test/Transforms/InstCombine/sincospi.ll
@@ -0,0 +1,91 @@
+; RUN: opt -instcombine -S < %s -mtriple=x86_64-apple-macosx10.9 | FileCheck %s --check-prefix=CHECK-FLOAT-IN-VEC
+; RUN: opt -instcombine -S < %s -mtriple=arm-apple-ios7.0 | FileCheck %s
+; RUN: opt -instcombine -S < %s -mtriple=x86_64-apple-macosx10.8 | FileCheck %s --check-prefix=CHECK-NO-SINCOS
+; RUN: opt -instcombine -S < %s -mtriple=arm-apple-ios6.0 | FileCheck %s --check-prefix=CHECK-NO-SINCOS
+; RUN: opt -instcombine -S < %s -mtriple=x86_64-none-linux-gnu | FileCheck %s --check-prefix=CHECK-NO-SINCOS
+
+
+attributes #0 = { readnone nounwind }
+
+declare float @__sinpif(float %x) #0
+declare float @__cospif(float %x) #0 
+
+declare double @__sinpi(double %x) #0
+declare double @__cospi(double %x) #0 
+
+@var32 = global float 0.0
+@var64 = global double 0.0
+
+define float @test_instbased_f32() {
+       %val = load float* @var32
+       %sin = call float @__sinpif(float %val) #0
+       %cos = call float @__cospif(float %val) #0
+       %res = fadd float %sin, %cos
+       ret float %res
+; CHECK-FLOAT-IN-VEC: [[VAL:%[a-z0-9]+]] = load float* @var32
+; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call <2 x float> @__sincospi_stretf(float [[VAL]])
+; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 0
+; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 1
+
+; CHECK: [[VAL:%[a-z0-9]+]] = load float* @var32
+; CHECK: [[SINCOS:%[a-z0-9]+]] = call { float, float } @__sincospi_stretf(float [[VAL]])
+; CHECK: extractvalue { float, float } [[SINCOS]], 0
+; CHECK: extractvalue { float, float } [[SINCOS]], 1
+
+; CHECK-NO-SINCOS: call float @__sinpif
+; CHECK-NO-SINCOS: call float @__cospif
+}
+
+define float @test_constant_f32() {
+       %sin = call float @__sinpif(float 1.0) #0
+       %cos = call float @__cospif(float 1.0) #0
+       %res = fadd float %sin, %cos
+       ret float %res
+; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call <2 x float> @__sincospi_stretf(float 1.000000e+00)
+; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 0
+; CHECK-FLOAT-IN-VEC: extractelement <2 x float> [[SINCOS]], i32 1
+
+; CHECK: [[SINCOS:%[a-z0-9]+]] = call { float, float } @__sincospi_stretf(float 1.000000e+00)
+; CHECK: extractvalue { float, float } [[SINCOS]], 0
+; CHECK: extractvalue { float, float } [[SINCOS]], 1
+
+; CHECK-NO-SINCOS: call float @__sinpif
+; CHECK-NO-SINCOS: call float @__cospif
+}
+
+define double @test_instbased_f64() {
+       %val = load double* @var64
+       %sin = call double @__sinpi(double %val) #0
+       %cos = call double @__cospi(double %val) #0
+       %res = fadd double %sin, %cos
+       ret double %res
+; CHECK-FLOAT-IN-VEC: [[VAL:%[a-z0-9]+]] = load double* @var64
+; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double [[VAL]])
+; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 0
+; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 1
+
+; CHECK: [[VAL:%[a-z0-9]+]] = load double* @var64
+; CHECK: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double [[VAL]])
+; CHECK: extractvalue { double, double } [[SINCOS]], 0
+; CHECK: extractvalue { double, double } [[SINCOS]], 1
+
+; CHECK-NO-SINCOS: call double @__sinpi
+; CHECK-NO-SINCOS: call double @__cospi
+}
+
+define double @test_constant_f64() {
+       %sin = call double @__sinpi(double 1.0) #0
+       %cos = call double @__cospi(double 1.0) #0
+       %res = fadd double %sin, %cos
+       ret double %res
+; CHECK-FLOAT-IN-VEC: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double 1.000000e+00)
+; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 0
+; CHECK-FLOAT-IN-VEC: extractvalue { double, double } [[SINCOS]], 1
+
+; CHECK: [[SINCOS:%[a-z0-9]+]] = call { double, double } @__sincospi_stret(double 1.000000e+00)
+; CHECK: extractvalue { double, double } [[SINCOS]], 0
+; CHECK: extractvalue { double, double } [[SINCOS]], 1
+
+; CHECK-NO-SINCOS: call double @__sinpi
+; CHECK-NO-SINCOS: call double @__cospi
+}
diff --git a/test/Transforms/InstCombine/sprintf-1.ll b/test/Transforms/InstCombine/sprintf-1.ll
index 6d0ab13..78dd7aa 100644
--- a/test/Transforms/InstCombine/sprintf-1.ll
+++ b/test/Transforms/InstCombine/sprintf-1.ll
@@ -1,7 +1,7 @@
 ; Test that the sprintf library call simplifier works correctly.
 ;
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple xcore-xmos-elf -instcombine -S | FileCheck %s -check-prefix=IPRINTF
+; RUN: opt < %s -mtriple xcore-xmos-elf -instcombine -S | FileCheck %s -check-prefix=CHECK-IPRINTF
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 
diff --git a/test/Transforms/InstCombine/store.ll b/test/Transforms/InstCombine/store.ll
index 9b666b9..b64c800 100644
--- a/test/Transforms/InstCombine/store.ll
+++ b/test/Transforms/InstCombine/store.ll
@@ -113,7 +113,8 @@ for.end:                                          ; preds = %for.cond
 ; CHECK-NEXT: store i32 %storemerge, i32* %gi, align 4, !tbaa !0
 }
 
-!0 = metadata !{metadata !"int", metadata !1}
+!0 = metadata !{metadata !4, metadata !4, i64 0}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA"}
 !3 = metadata !{metadata !"float", metadata !1}
+!4 = metadata !{metadata !"int", metadata !1}
diff --git a/test/Transforms/InstCombine/stpcpy_chk-1.ll b/test/Transforms/InstCombine/stpcpy_chk-1.ll
index a6d5585..8a02529 100644
--- a/test/Transforms/InstCombine/stpcpy_chk-1.ll
+++ b/test/Transforms/InstCombine/stpcpy_chk-1.ll
@@ -61,7 +61,7 @@ define i8* @test_simplify5() {
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
 ; CHECK: @__memcpy_chk
-  %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
   %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 %len)
 ; CHECK: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
   ret i8* %ret
@@ -75,7 +75,7 @@ define i8* @test_simplify6() {
 
 ; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen
 ; CHECK-NEXT: getelementptr inbounds [60 x i8]* @a, i32 0, i32 [[LEN]]
-  %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
   %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %dst, i32 %len)
   ret i8* %ret
 }
@@ -93,4 +93,4 @@ define void @test_no_simplify1() {
 }
 
 declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readonly
diff --git a/test/Transforms/InstCombine/strchr-1.ll b/test/Transforms/InstCombine/strchr-1.ll
index 5efab9e..d2c9894 100644
--- a/test/Transforms/InstCombine/strchr-1.ll
+++ b/test/Transforms/InstCombine/strchr-1.ll
@@ -52,3 +52,14 @@ define void @test_simplify4(i32 %chr) {
   store i8* %dst, i8** @chp
   ret void
 }
+
+define void @test_simplify5() {
+; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 13)
+; CHECK-NOT: call i8* @strchr
+; CHECK: ret void
+
+  %src = getelementptr [14 x i8]* @hello, i32 0, i32 0
+  %dst = call i8* @strchr(i8* %src, i32 65280)
+  store i8* %dst, i8** @chp
+  ret void
+}
diff --git a/test/Transforms/InstCombine/strcpy_chk-1.ll b/test/Transforms/InstCombine/strcpy_chk-1.ll
index 5b98cf8..8e7fec7 100644
--- a/test/Transforms/InstCombine/strcpy_chk-1.ll
+++ b/test/Transforms/InstCombine/strcpy_chk-1.ll
@@ -61,7 +61,7 @@ define void @test_simplify5() {
   %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
 
 ; CHECK: @__memcpy_chk
-  %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
   call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len)
   ret void
 }
@@ -73,7 +73,7 @@ define i8* @test_simplify6() {
   %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
 
 ; CHECK: getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
-  %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
   %ret = call i8* @__strcpy_chk(i8* %dst, i8* %dst, i32 %len)
   ret i8* %ret
 }
@@ -91,4 +91,4 @@ define void @test_no_simplify1() {
 }
 
 declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readonly
diff --git a/test/Transforms/InstCombine/strrchr-1.ll b/test/Transforms/InstCombine/strrchr-1.ll
index a0bdb22..4615f5f 100644
--- a/test/Transforms/InstCombine/strrchr-1.ll
+++ b/test/Transforms/InstCombine/strrchr-1.ll
@@ -42,6 +42,17 @@ define void @test_simplify3() {
   ret void
 }
 
+define void @test_simplify4() {
+; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 13)
+; CHECK-NOT: call i8* @strrchr
+; CHECK: ret void
+
+  %src = getelementptr [14 x i8]* @hello, i32 0, i32 0
+  %dst = call i8* @strrchr(i8* %src, i32 65280)
+  store i8* %dst, i8** @chp
+  ret void
+}
+
 define void @test_nosimplify1(i32 %chr) {
 ; CHECK-LABEL: @test_nosimplify1(
 ; CHECK: call i8* @strrchr
diff --git a/test/Transforms/InstCombine/struct-assign-tbaa.ll b/test/Transforms/InstCombine/struct-assign-tbaa.ll
index d7a26fa..c80e31a 100644
--- a/test/Transforms/InstCombine/struct-assign-tbaa.ll
+++ b/test/Transforms/InstCombine/struct-assign-tbaa.ll
@@ -35,10 +35,12 @@ define i32 (i8*, i32*, double*)*** @test2() {
   ret i32 (i8*, i32*, double*)*** %tmp2
 }
 
-; CHECK: !0 = metadata !{metadata !"float", metadata !1}
+; CHECK: !0 = metadata !{metadata !1, metadata !1, i64 0}
+; CHECK: !1 = metadata !{metadata !"float", metadata !2}
 
 !0 = metadata !{metadata !"Simple C/C++ TBAA"}
 !1 = metadata !{metadata !"omnipotent char", metadata !0}
-!2 = metadata !{metadata !"float", metadata !0}
+!2 = metadata !{metadata !5, metadata !5, i64 0}
 !3 = metadata !{i64 0, i64 4, metadata !2}
 !4 = metadata !{i64 0, i64 8, null}
+!5 = metadata !{metadata !"float", metadata !0}
diff --git a/test/Transforms/InstCombine/sub.ll b/test/Transforms/InstCombine/sub.ll
index 5449656..36c523b 100644
--- a/test/Transforms/InstCombine/sub.ll
+++ b/test/Transforms/InstCombine/sub.ll
@@ -1,34 +1,34 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 ; Optimize subtracts.
 ;
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 define i32 @test1(i32 %A) {
-	%B = sub i32 %A, %A	
+	%B = sub i32 %A, %A
 	ret i32 %B
 ; CHECK-LABEL: @test1(
 ; CHECK: ret i32 0
 }
 
 define i32 @test2(i32 %A) {
-	%B = sub i32 %A, 0	
+	%B = sub i32 %A, 0
 	ret i32 %B
 ; CHECK-LABEL: @test2(
 ; CHECK: ret i32 %A
 }
 
 define i32 @test3(i32 %A) {
-	%B = sub i32 0, %A	
-	%C = sub i32 0, %B	
+	%B = sub i32 0, %A
+	%C = sub i32 0, %B
 	ret i32 %C
 ; CHECK-LABEL: @test3(
 ; CHECK: ret i32 %A
 }
 
 define i32 @test4(i32 %A, i32 %x) {
-	%B = sub i32 0, %A	
-	%C = sub i32 %x, %B	
+	%B = sub i32 0, %A
+	%C = sub i32 %x, %B
 	ret i32 %C
 ; CHECK-LABEL: @test4(
 ; CHECK: %C = add i32 %x, %A
@@ -36,8 +36,8 @@ define i32 @test4(i32 %A, i32 %x) {
 }
 
 define i32 @test5(i32 %A, i32 %B, i32 %C) {
-	%D = sub i32 %B, %C	
-	%E = sub i32 %A, %D	
+	%D = sub i32 %B, %C
+	%E = sub i32 %A, %D
 	ret i32 %E
 ; CHECK-LABEL: @test5(
 ; CHECK: %D1 = sub i32 %C, %B
@@ -46,17 +46,17 @@ define i32 @test5(i32 %A, i32 %B, i32 %C) {
 }
 
 define i32 @test6(i32 %A, i32 %B) {
-	%C = and i32 %A, %B	
-	%D = sub i32 %A, %C	
+	%C = and i32 %A, %B
+	%D = sub i32 %A, %C
 	ret i32 %D
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT: xor i32 %B, -1
-; CHECK-NEXT: %D = and i32 
+; CHECK-NEXT: %D = and i32
 ; CHECK-NEXT: ret i32 %D
 }
 
 define i32 @test7(i32 %A) {
-	%B = sub i32 -1, %A	
+	%B = sub i32 -1, %A
 	ret i32 %B
 ; CHECK-LABEL: @test7(
 ; CHECK: %B = xor i32 %A, -1
@@ -64,8 +64,8 @@ define i32 @test7(i32 %A) {
 }
 
 define i32 @test8(i32 %A) {
-	%B = mul i32 9, %A	
-	%C = sub i32 %B, %A	
+	%B = mul i32 9, %A
+	%C = sub i32 %B, %A
 	ret i32 %C
 ; CHECK-LABEL: @test8(
 ; CHECK: %C = shl i32 %A, 3
@@ -73,8 +73,8 @@ define i32 @test8(i32 %A) {
 }
 
 define i32 @test9(i32 %A) {
-	%B = mul i32 3, %A	
-	%C = sub i32 %A, %B	
+	%B = mul i32 3, %A
+	%C = sub i32 %A, %B
 	ret i32 %C
 ; CHECK-LABEL: @test9(
 ; CHECK: %C = mul i32 %A, -2
@@ -82,9 +82,9 @@ define i32 @test9(i32 %A) {
 }
 
 define i32 @test10(i32 %A, i32 %B) {
-	%C = sub i32 0, %A	
-	%D = sub i32 0, %B	
-	%E = mul i32 %C, %D	
+	%C = sub i32 0, %A
+	%D = sub i32 0, %B
+	%E = mul i32 %C, %D
 	ret i32 %E
 ; CHECK-LABEL: @test10(
 ; CHECK: %E = mul i32 %A, %B
@@ -92,8 +92,8 @@ define i32 @test10(i32 %A, i32 %B) {
 }
 
 define i32 @test10a(i32 %A) {
-	%C = sub i32 0, %A	
-	%E = mul i32 %C, 7	
+	%C = sub i32 0, %A
+	%E = mul i32 %C, 7
 	ret i32 %E
 ; CHECK-LABEL: @test10a(
 ; CHECK: %E = mul i32 %A, -7
@@ -101,8 +101,8 @@ define i32 @test10a(i32 %A) {
 }
 
 define i1 @test11(i8 %A, i8 %B) {
-	%C = sub i8 %A, %B	
-	%cD = icmp ne i8 %C, 0	
+	%C = sub i8 %A, %B
+	%cD = icmp ne i8 %C, 0
 	ret i1 %cD
 ; CHECK-LABEL: @test11(
 ; CHECK: %cD = icmp ne i8 %A, %B
@@ -110,8 +110,8 @@ define i1 @test11(i8 %A, i8 %B) {
 }
 
 define i32 @test12(i32 %A) {
-	%B = ashr i32 %A, 31	
-	%C = sub i32 0, %B	
+	%B = ashr i32 %A, 31
+	%C = sub i32 0, %B
 	ret i32 %C
 ; CHECK-LABEL: @test12(
 ; CHECK: %C = lshr i32 %A, 31
@@ -119,8 +119,8 @@ define i32 @test12(i32 %A) {
 }
 
 define i32 @test13(i32 %A) {
-	%B = lshr i32 %A, 31	
-	%C = sub i32 0, %B	
+	%B = lshr i32 %A, 31
+	%C = sub i32 0, %B
 	ret i32 %C
 ; CHECK-LABEL: @test13(
 ; CHECK: %C = ashr i32 %A, 31
@@ -128,9 +128,9 @@ define i32 @test13(i32 %A) {
 }
 
 define i32 @test14(i32 %A) {
-	%B = lshr i32 %A, 31	
-	%C = bitcast i32 %B to i32	
-	%D = sub i32 0, %C	
+	%B = lshr i32 %A, 31
+	%C = bitcast i32 %B to i32
+	%D = sub i32 0, %C
 	ret i32 %D
 ; CHECK-LABEL: @test14(
 ; CHECK: %D = ashr i32 %A, 31
@@ -138,17 +138,17 @@ define i32 @test14(i32 %A) {
 }
 
 define i32 @test15(i32 %A, i32 %B) {
-	%C = sub i32 0, %A	
-	%D = srem i32 %B, %C	
+	%C = sub i32 0, %A
+	%D = srem i32 %B, %C
 	ret i32 %D
 ; CHECK-LABEL: @test15(
-; CHECK: %D = srem i32 %B, %A 
+; CHECK: %D = srem i32 %B, %A
 ; CHECK: ret i32 %D
 }
 
 define i32 @test16(i32 %A) {
-	%X = sdiv i32 %A, 1123	
-	%Y = sub i32 0, %X	
+	%X = sdiv i32 %A, 1123
+	%Y = sub i32 0, %X
 	ret i32 %Y
 ; CHECK-LABEL: @test16(
 ; CHECK: %Y = sdiv i32 %A, -1123
@@ -158,8 +158,8 @@ define i32 @test16(i32 %A) {
 ; Can't fold subtract here because negation it might oveflow.
 ; PR3142
 define i32 @test17(i32 %A) {
-	%B = sub i32 0, %A	
-	%C = sdiv i32 %B, 1234	
+	%B = sub i32 0, %A
+	%C = sdiv i32 %B, 1234
 	ret i32 %C
 ; CHECK-LABEL: @test17(
 ; CHECK: %B = sub i32 0, %A
@@ -168,25 +168,25 @@ define i32 @test17(i32 %A) {
 }
 
 define i64 @test18(i64 %Y) {
-	%tmp.4 = shl i64 %Y, 2	
-	%tmp.12 = shl i64 %Y, 2	
-	%tmp.8 = sub i64 %tmp.4, %tmp.12	
+	%tmp.4 = shl i64 %Y, 2
+	%tmp.12 = shl i64 %Y, 2
+	%tmp.8 = sub i64 %tmp.4, %tmp.12
 	ret i64 %tmp.8
 ; CHECK-LABEL: @test18(
 ; CHECK: ret i64 0
 }
 
 define i32 @test19(i32 %X, i32 %Y) {
-	%Z = sub i32 %X, %Y	
-	%Q = add i32 %Z, %Y	
+	%Z = sub i32 %X, %Y
+	%Q = add i32 %Z, %Y
 	ret i32 %Q
 ; CHECK-LABEL: @test19(
 ; CHECK: ret i32 %X
 }
 
 define i1 @test20(i32 %g, i32 %h) {
-	%tmp.2 = sub i32 %g, %h	
-	%tmp.4 = icmp ne i32 %tmp.2, %g	
+	%tmp.2 = sub i32 %g, %h
+	%tmp.4 = icmp ne i32 %tmp.2, %g
 	ret i1 %tmp.4
 ; CHECK-LABEL: @test20(
 ; CHECK: %tmp.4 = icmp ne i32 %h, 0
@@ -194,8 +194,8 @@ define i1 @test20(i32 %g, i32 %h) {
 }
 
 define i1 @test21(i32 %g, i32 %h) {
-	%tmp.2 = sub i32 %g, %h	
-	%tmp.4 = icmp ne i32 %tmp.2, %g		
+	%tmp.2 = sub i32 %g, %h
+	%tmp.4 = icmp ne i32 %tmp.2, %g
         ret i1 %tmp.4
 ; CHECK-LABEL: @test21(
 ; CHECK: %tmp.4 = icmp ne i32 %h, 0
@@ -204,9 +204,9 @@ define i1 @test21(i32 %g, i32 %h) {
 
 ; PR2298
 define zeroext i1 @test22(i32 %a, i32 %b)  nounwind  {
-	%tmp2 = sub i32 0, %a	
-	%tmp4 = sub i32 0, %b	
-	%tmp5 = icmp eq i32 %tmp2, %tmp4	
+	%tmp2 = sub i32 0, %a
+	%tmp4 = sub i32 0, %b
+	%tmp5 = icmp eq i32 %tmp2, %tmp4
 	ret i1 %tmp5
 ; CHECK-LABEL: @test22(
 ; CHECK: %tmp5 = icmp eq i32 %b, %a
@@ -227,6 +227,19 @@ define i32 @test23(i8* %P, i64 %A){
 ; CHECK-NEXT: ret i32
 }
 
+define i8 @test23_as1(i8 addrspace(1)* %P, i16 %A) {
+; CHECK: @test23_as1
+; CHECK-NEXT: = trunc i16 %A to i8
+; CHECK-NEXT: ret i8
+  %B = getelementptr inbounds i8 addrspace(1)* %P, i16 %A
+  %C = ptrtoint i8 addrspace(1)* %B to i16
+  %D = trunc i16 %C to i8
+  %E = ptrtoint i8 addrspace(1)* %P to i16
+  %F = trunc i16 %E to i8
+  %G = sub i8 %D, %F
+  ret i8 %G
+}
+
 define i64 @test24(i8* %P, i64 %A){
   %B = getelementptr inbounds i8* %P, i64 %A
   %C = ptrtoint i8* %B to i64
@@ -237,6 +250,16 @@ define i64 @test24(i8* %P, i64 %A){
 ; CHECK-NEXT: ret i64 %A
 }
 
+define i16 @test24_as1(i8 addrspace(1)* %P, i16 %A) {
+; CHECK: @test24_as1
+; CHECK-NEXT: ret i16 %A
+  %B = getelementptr inbounds i8 addrspace(1)* %P, i16 %A
+  %C = ptrtoint i8 addrspace(1)* %B to i16
+  %E = ptrtoint i8 addrspace(1)* %P to i16
+  %G = sub i16 %C, %E
+  ret i16 %G
+}
+
 define i64 @test24a(i8* %P, i64 %A){
   %B = getelementptr inbounds i8* %P, i64 %A
   %C = ptrtoint i8* %B to i64
@@ -245,9 +268,21 @@ define i64 @test24a(i8* %P, i64 %A){
   ret i64 %G
 ; CHECK-LABEL: @test24a(
 ; CHECK-NEXT: sub i64 0, %A
-; CHECK-NEXT: ret i64 
+; CHECK-NEXT: ret i64
 }
 
+define i16 @test24a_as1(i8 addrspace(1)* %P, i16 %A) {
+; CHECK: @test24a_as1
+; CHECK-NEXT: sub i16 0, %A
+; CHECK-NEXT: ret i16
+  %B = getelementptr inbounds i8 addrspace(1)* %P, i16 %A
+  %C = ptrtoint i8 addrspace(1)* %B to i16
+  %E = ptrtoint i8 addrspace(1)* %P to i16
+  %G = sub i16 %E, %C
+  ret i16 %G
+}
+
+
 @Arr = external global [42 x i16]
 
 define i64 @test24b(i8* %P, i64 %A){
@@ -257,7 +292,7 @@ define i64 @test24b(i8* %P, i64 %A){
   ret i64 %G
 ; CHECK-LABEL: @test24b(
 ; CHECK-NEXT: shl nuw i64 %A, 1
-; CHECK-NEXT: ret i64 
+; CHECK-NEXT: ret i64
 }
 
 
@@ -269,7 +304,21 @@ define i64 @test25(i8* %P, i64 %A){
 ; CHECK-LABEL: @test25(
 ; CHECK-NEXT: shl nuw i64 %A, 1
 ; CHECK-NEXT: add i64 {{.*}}, -84
-; CHECK-NEXT: ret i64 
+; CHECK-NEXT: ret i64
+}
+
+@Arr_as1 = external addrspace(1) global [42 x i16]
+
+define i16 @test25_as1(i8 addrspace(1)* %P, i64 %A) {
+; CHECK: @test25_as1
+; CHECK-NEXT: %1 = trunc i64 %A to i16
+; CHECK-NEXT: shl nuw i16 %1, 1
+; CHECK-NEXT: add i16 {{.*}}, -84
+; CHECK-NEXT: ret i16
+  %B = getelementptr inbounds [42 x i16] addrspace(1)* @Arr_as1, i64 0, i64 %A
+  %C = ptrtoint i16 addrspace(1)* %B to i16
+  %G = sub i16 %C, ptrtoint (i16 addrspace(1)* getelementptr ([42 x i16] addrspace(1)* @Arr_as1, i64 1, i64 0) to i16)
+  ret i16 %G
 }
 
 define i32 @test26(i32 %x) {
@@ -327,3 +376,19 @@ define i64 @test30(i8* %foo, i64 %i, i64 %j) {
 ; CHECK-NEXT: sub i64 %gep1.idx, %j
 ; CHECK-NEXT: ret i64
 }
+
+define i16 @test30_as1(i8 addrspace(1)* %foo, i16 %i, i16 %j) {
+; CHECK-LABEL: @test30_as1(
+; CHECK-NEXT: %gep1.idx = shl nuw i16 %i, 2
+; CHECK-NEXT: sub i16 %gep1.idx, %j
+; CHECK-NEXT: ret i16
+  %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)*
+  %gep1 = getelementptr inbounds i32 addrspace(1)* %bit, i16 %i
+  %gep2 = getelementptr inbounds i8 addrspace(1)* %foo, i16 %j
+  %cast1 = ptrtoint i32 addrspace(1)* %gep1 to i16
+  %cast2 = ptrtoint i8 addrspace(1)* %gep2 to i16
+  %sub = sub i16 %cast1, %cast2
+  ret i16 %sub
+}
+
+
diff --git a/test/Transforms/InstCombine/vec_extract_elt.ll b/test/Transforms/InstCombine/vec_extract_elt.ll
index 166066a..3daf72e 100644
--- a/test/Transforms/InstCombine/vec_extract_elt.ll
+++ b/test/Transforms/InstCombine/vec_extract_elt.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -instcombine -S | not grep extractelement
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK-NOT: extractelement
 
 define i32 @test(float %f) {
         %tmp7 = insertelement <4 x float> undef, float %f, i32 0                ; <<4 x float>> [#uses=1]
diff --git a/test/Transforms/InstCombine/vec_insertelt.ll b/test/Transforms/InstCombine/vec_insertelt.ll
index e35fa5e..3b94920 100644
--- a/test/Transforms/InstCombine/vec_insertelt.ll
+++ b/test/Transforms/InstCombine/vec_insertelt.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -instcombine -S | grep "ret <4 x i32> %A"
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; CHECK: ret <4 x i32> %A
 
 ; PR1286
 define <4 x i32> @test1(<4 x i32> %A) {
diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll
index 738e05b..3ee43dc 100644
--- a/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/test/Transforms/InstCombine/vec_shuffle.ll
@@ -200,3 +200,31 @@ define void @test14(i16 %conv10) {
   %sext = sext <2 x i1> %cmp to <2 x i16>
   ret void
 }
+
+; Check that sequences of insert/extract element are 
+; collapsed into valid shuffle instruction with correct shuffle indexes.
+ 
+define <4 x float> @test15a(<4 x float> %LHS, <4 x float> %RHS) {
+; CHECK-LABEL: @test15a
+; CHECK-NEXT: shufflevector <4 x float> %LHS, <4 x float> %RHS, <4 x i32> <i32 4, i32 0, i32 6, i32 6>
+; CHECK-NEXT: ret <4 x float> %tmp4
+  %tmp1 = extractelement <4 x float> %LHS, i32 0
+  %tmp2 = insertelement <4 x float> %RHS, float %tmp1, i32 1
+  %tmp3 = extractelement <4 x float> %RHS, i32 2
+  %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 3
+  ret <4 x float> %tmp4
+}
+ 
+define <4 x float> @test15b(<4 x float> %LHS, <4 x float> %RHS) {
+; CHECK-LABEL: @test15b
+; CHECK-NEXT: shufflevector <4 x float> %LHS, <4 x float> %RHS, <4 x i32> <i32 4, i32 3, i32 6, i32 6>
+; CHECK-NEXT: ret <4 x float> %tmp5
+  %tmp0 = extractelement <4 x float> %LHS, i32 3
+  %tmp1 = insertelement <4 x float> %RHS, float %tmp0, i32 0
+  %tmp2 = extractelement <4 x float> %tmp1, i32 0
+  %tmp3 = insertelement <4 x float> %RHS, float %tmp2, i32 1
+  %tmp4 = extractelement <4 x float> %RHS, i32 2
+  %tmp5 = insertelement <4 x float> %tmp3, float %tmp4, i32 3
+  ret <4 x float> %tmp5
+}
+
diff --git a/test/Transforms/InstCombine/win-math.ll b/test/Transforms/InstCombine/win-math.ll
index df3ac93..e6e79e2 100644
--- a/test/Transforms/InstCombine/win-math.ll
+++ b/test/Transforms/InstCombine/win-math.ll
@@ -273,3 +273,23 @@ define float @float_round(float %x) nounwind readnone {
     ret float %3
 }
 
+declare float @powf(float, float)
+; win32 lacks sqrtf&fabsf, win64 lacks fabsf
+define float @float_powsqrt(float %x) nounwind readnone {
+; WIN32-LABEL: @float_powsqrt(
+; WIN32-NOT: float @sqrtf
+; WIN32: float @powf
+; WIN64-LABEL: @float_powsqrt(
+; WIN64-NOT: float @sqrtf
+; WIN64: float @powf
+; MINGW32-LABEL: @float_powsqrt(
+; MINGW32: float @sqrtf
+; MINGW32: float @fabsf
+; MINGW32-NOT: float @powf
+; MINGW64-LABEL: @float_powsqrt(
+; MINGW64: float @sqrtf
+; MINGW64: float @fabsf
+; MINGW64-NOT: float @powf
+    %1 = call float @powf(float %x, float 0.5)
+    ret float %1
+}
diff --git a/test/Transforms/InstSimplify/call.ll b/test/Transforms/InstSimplify/call.ll
index 3e1621c..fd854c5 100644
--- a/test/Transforms/InstSimplify/call.ll
+++ b/test/Transforms/InstSimplify/call.ll
@@ -101,3 +101,66 @@ define float @test_idempotence(float %a) {
 
   ret float %r4
 }
+
+define i8* @operator_new() {
+entry:
+  %call = tail call noalias i8* @_Znwm(i64 8)
+  %cmp = icmp eq i8* %call, null
+  br i1 %cmp, label %cast.end, label %cast.notnull
+
+cast.notnull:                                     ; preds = %entry
+  %add.ptr = getelementptr inbounds i8* %call, i64 4
+  br label %cast.end
+
+cast.end:                                         ; preds = %cast.notnull, %entry
+  %cast.result = phi i8* [ %add.ptr, %cast.notnull ], [ null, %entry ]
+  ret i8* %cast.result
+
+; CHECK-LABEL: @operator_new
+; CHECK: br i1 false, label %cast.end, label %cast.notnull
+}
+
+declare noalias i8* @_Znwm(i64)
+
+%"struct.std::nothrow_t" = type { i8 }
+@_ZSt7nothrow = external global %"struct.std::nothrow_t"
+
+define i8* @operator_new_nothrow_t() {
+entry:
+  %call = tail call noalias i8* @_ZnamRKSt9nothrow_t(i64 8, %"struct.std::nothrow_t"* @_ZSt7nothrow)
+  %cmp = icmp eq i8* %call, null
+  br i1 %cmp, label %cast.end, label %cast.notnull
+
+cast.notnull:                                     ; preds = %entry
+  %add.ptr = getelementptr inbounds i8* %call, i64 4
+  br label %cast.end
+
+cast.end:                                         ; preds = %cast.notnull, %entry
+  %cast.result = phi i8* [ %add.ptr, %cast.notnull ], [ null, %entry ]
+  ret i8* %cast.result
+
+; CHECK-LABEL: @operator_new_nothrow_t
+; CHECK: br i1 %cmp, label %cast.end, label %cast.notnull
+}
+
+declare i8* @_ZnamRKSt9nothrow_t(i64, %"struct.std::nothrow_t"*) nounwind
+
+define i8* @malloc_can_return_null() {
+entry:
+  %call = tail call noalias i8* @malloc(i64 8)
+  %cmp = icmp eq i8* %call, null
+  br i1 %cmp, label %cast.end, label %cast.notnull
+
+cast.notnull:                                     ; preds = %entry
+  %add.ptr = getelementptr inbounds i8* %call, i64 4
+  br label %cast.end
+
+cast.end:                                         ; preds = %cast.notnull, %entry
+  %cast.result = phi i8* [ %add.ptr, %cast.notnull ], [ null, %entry ]
+  ret i8* %cast.result
+
+; CHECK-LABEL: @malloc_can_return_null
+; CHECK: br i1 %cmp, label %cast.end, label %cast.notnull
+}
+
+declare noalias i8* @malloc(i64)
diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll
index 0957949..abb3869 100644
--- a/test/Transforms/InstSimplify/compare.ll
+++ b/test/Transforms/InstSimplify/compare.ll
@@ -717,3 +717,25 @@ define i1 @alloca_gep(i64 %a, i64 %b) {
   ret i1 %cmp
 ; CHECK-NEXT: ret i1 false
 }
+
+define i1 @non_inbounds_gep_compare(i64* %a) {
+; CHECK-LABEL: @non_inbounds_gep_compare(
+; Equality compares with non-inbounds GEPs can be folded.
+  %x = getelementptr i64* %a, i64 42
+  %y = getelementptr inbounds i64* %x, i64 -42
+  %z = getelementptr i64* %a, i64 -42
+  %w = getelementptr inbounds i64* %z, i64 42
+  %cmp = icmp eq i64* %y, %w
+  ret i1 %cmp
+; CHECK-NEXT: ret i1 true
+}
+
+define i1 @non_inbounds_gep_compare2(i64* %a) {
+; CHECK-LABEL: @non_inbounds_gep_compare2(
+; Equality compares with non-inbounds GEPs can be folded.
+  %x = getelementptr i64* %a, i64 4294967297
+  %y = getelementptr i64* %a, i64 1
+  %cmp = icmp eq i64* %y, %y
+  ret i1 %cmp
+; CHECK-NEXT: ret i1 true
+}
diff --git a/test/Transforms/InstSimplify/lit.local.cfg b/test/Transforms/InstSimplify/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/InstSimplify/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/Internalize/2008-05-09-AllButMain.ll b/test/Transforms/Internalize/2008-05-09-AllButMain.ll
deleted file mode 100644
index f75e80d..0000000
--- a/test/Transforms/Internalize/2008-05-09-AllButMain.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; No arguments means internalize everything
-; RUN: opt < %s -internalize -S | FileCheck --check-prefix=NOARGS %s
-
-; Internalize all but foo and j
-; RUN: opt < %s -internalize -internalize-public-api-list foo -internalize-public-api-list j -S | FileCheck --check-prefix=LIST %s
-
-; Non existent files should be treated as if they were empty (so internalize
-; everything)
-; RUN: opt < %s -internalize -internalize-public-api-file /nonexistent/file 2> /dev/null -S | FileCheck --check-prefix=EMPTYFILE %s
-
-; RUN: opt < %s -S -internalize -internalize-public-api-list bar -internalize-public-api-list foo -internalize-public-api-file /nonexistent/file  2> /dev/null | FileCheck --check-prefix=LIST2 %s
-
-; -file and -list options should be merged, the .apifile contains foo and j
-; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-file %s.apifile -S | FileCheck --check-prefix=MERGE %s
-
-; NOARGS: @i = internal global
-; LIST: @i = internal global
-; EMPTYFILE: @i = internal global
-; LIST2: @i = internal global
-; MERGE: @i = internal global
-@i = global i32 0
-
-; NOARGS: @j = internal global
-; LIST: @j = global
-; EMPTYFILE: @j = internal global
-; LIST2: @j = internal global
-; MERGE: @j = global
-@j = global i32 0
-
-; NOARGS-LABEL: define internal void @main(
-; LIST-LABEL: define internal void @main(
-; EMPTYFILE-LABEL: define internal void @main(
-; LIST2-LABEL: define internal void @main(
-; MERGE-LABEL: define internal void @main(
-define void @main() {
-        ret void
-}
-
-; NOARGS-LABEL: define internal void @foo(
-; LIST-LABEL: define void @foo(
-; EMPTYFILE-LABEL: define internal void @foo(
-; LIST2-LABEL: define void @foo(
-; MERGE-LABEL: define void @foo(
-define void @foo() {
-        ret void
-}
-
-; NOARGS-LABEL: define internal void @bar(
-; LIST-LABEL: define internal void @bar(
-; EMPTYFILE-LABEL: define internal void @bar(
-; LIST2-LABEL: define void @bar(
-; MERGE-LABEL: define void @bar(
-define void @bar() {
-        ret void
-}
diff --git a/test/Transforms/Internalize/2008-05-09-AllButMain.ll.apifile b/test/Transforms/Internalize/apifile
index f6c58b8..f6c58b8 100644
--- a/test/Transforms/Internalize/2008-05-09-AllButMain.ll.apifile
+++ b/test/Transforms/Internalize/apifile
diff --git a/test/Transforms/Internalize/available_externally.ll b/test/Transforms/Internalize/available_externally.ll
deleted file mode 100644
index bb89603..0000000
--- a/test/Transforms/Internalize/available_externally.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: opt < %s -internalize -internalize-public-api-list foo -S | FileCheck %s
-
-; CHECK-LABEL: define void @foo(
-define void @foo() {
-  ret void
-}
-
-; CHECK-LABEL: define internal void @zed(
-define void @zed() {
-  ret void
-}
-
-; CHECK-LABEL: define available_externally void @bar(
-define available_externally void @bar() {
-  ret void
-}
diff --git a/test/Transforms/Internalize/lists.ll b/test/Transforms/Internalize/lists.ll
new file mode 100644
index 0000000..83e441a2
--- /dev/null
+++ b/test/Transforms/Internalize/lists.ll
@@ -0,0 +1,50 @@
+; No arguments means internalize everything
+; RUN: opt < %s -internalize -S | FileCheck --check-prefix=ALL %s
+
+; Non existent files should be treated as if they were empty (so internalize
+; everything)
+; RUN: opt < %s -internalize -internalize-public-api-file /nonexistent/file 2> /dev/null -S | FileCheck --check-prefix=ALL %s
+
+; Internalize all but foo and j
+; RUN: opt < %s -internalize -internalize-public-api-list foo -internalize-public-api-list j -S | FileCheck --check-prefix=FOO_AND_J %s
+
+; RUN: opt < %s -S -internalize -internalize-public-api-list bar -internalize-public-api-list foo -internalize-public-api-file /nonexistent/file  2> /dev/null | FileCheck --check-prefix=FOO_AND_BAR %s
+
+; -file and -list options should be merged, the apifile contains foo and j
+; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-file %S/apifile -S | FileCheck --check-prefix=FOO_J_AND_BAR %s
+
+; ALL: @i = internal global
+; FOO_AND_J: @i = internal global
+; FOO_AND_BAR: @i = internal global
+; FOO_J_AND_BAR: @i = internal global
+@i = global i32 0
+
+; ALL: @j = internal global
+; FOO_AND_J: @j = global
+; FOO_AND_BAR: @j = internal global
+; FOO_J_AND_BAR: @j = global
+@j = global i32 0
+
+; ALL: define internal void @main() {
+; FOO_AND_J: define internal void @main() {
+; FOO_AND_BAR: define internal void @main() {
+; FOO_J_AND_BAR: define internal void @main() {
+define void @main() {
+        ret void
+}
+
+; ALL: define internal void @foo() {
+; FOO_AND_J: define void @foo() {
+; FOO_AND_BAR: define void @foo() {
+; FOO_J_AND_BAR: define void @foo() {
+define void @foo() {
+        ret void
+}
+
+; ALL: define available_externally void @bar() {
+; FOO_AND_J: define available_externally void @bar() {
+; FOO_AND_BAR: define available_externally void @bar() {
+; FOO_J_AND_BAR: define available_externally void @bar() {
+define available_externally void @bar() {
+  ret void
+}
diff --git a/test/Transforms/Internalize/lit.local.cfg b/test/Transforms/Internalize/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/Internalize/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/JumpThreading/landing-pad.ll b/test/Transforms/JumpThreading/landing-pad.ll
new file mode 100644
index 0000000..9ee0526
--- /dev/null
+++ b/test/Transforms/JumpThreading/landing-pad.ll
@@ -0,0 +1,203 @@
+; RUN: opt < %s -disable-output -jump-threading
+
+%class.E = type { i32 (...)**, %class.C }
+%class.C = type { %class.A }
+%class.A = type { i32 }
+%class.D = type { %class.F }
+%class.F = type { %class.E }
+%class.B = type { %class.D* }
+
+@_ZTV1D = unnamed_addr constant [3 x i8*] [i8* null, i8* bitcast ({ i8*, i8*, i8* }* @_ZTI1D to i8*), i8* bitcast (void (%class.D*)* @_ZN1D7doApplyEv to i8*)]
+@_ZTI1D = external unnamed_addr constant { i8*, i8*, i8* }
+
+define void @_ZN15EditCommandImpl5applyEv(%class.E* %this) uwtable align 2 {
+entry:
+  %0 = bitcast %class.E* %this to void (%class.E*)***
+  %vtable = load void (%class.E*)*** %0, align 8
+  %1 = load void (%class.E*)** %vtable, align 8
+  call void %1(%class.E* %this)
+  ret void
+}
+
+define void @_ZN1DC1Ev(%class.D* nocapture %this) unnamed_addr uwtable align 2 {
+entry:
+  call void @_ZN24CompositeEditCommandImplC2Ev()
+  %0 = getelementptr inbounds %class.D* %this, i64 0, i32 0, i32 0, i32 0
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*]* @_ZTV1D, i64 0, i64 2) to i32 (...)**), i32 (...)*** %0, align 8
+  ret void
+}
+
+define void @_ZN1DC2Ev(%class.D* nocapture %this) unnamed_addr uwtable align 2 {
+entry:
+  call void @_ZN24CompositeEditCommandImplC2Ev()
+  %0 = getelementptr inbounds %class.D* %this, i64 0, i32 0, i32 0, i32 0
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*]* @_ZTV1D, i64 0, i64 2) to i32 (...)**), i32 (...)*** %0, align 8
+  ret void
+}
+
+declare void @_ZN24CompositeEditCommandImplC2Ev() #1
+
+define void @_ZN1D7doApplyEv(%class.D* nocapture %this) unnamed_addr nounwind readnone uwtable align 2 {
+entry:
+  ret void
+}
+
+define void @_Z3fn1v() uwtable {
+entry:
+  %call = call noalias i8* @_Znwm() #8
+  invoke void @_ZN24CompositeEditCommandImplC2Ev()
+          to label %_ZN1DC1Ev.exit unwind label %lpad
+
+_ZN1DC1Ev.exit:                                   ; preds = %entry
+  %0 = bitcast i8* %call to i32 (...)***
+  store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*]* @_ZTV1D, i64 0, i64 2) to i32 (...)**), i32 (...)*** %0, align 8
+  %_ref.i.i.i = getelementptr inbounds i8* %call, i64 8
+  %1 = bitcast i8* %_ref.i.i.i to i32*
+  %2 = load i32* %1, align 4
+  %inc.i.i.i = add nsw i32 %2, 1
+  store i32 %inc.i.i.i, i32* %1, align 4
+  %3 = bitcast i8* %call to %class.D*
+  invoke void @_ZN1D7doApplyEv(%class.D* %3)
+          to label %_ZN15EditCommandImpl5applyEv.exit unwind label %lpad1
+
+_ZN15EditCommandImpl5applyEv.exit:                ; preds = %_ZN1DC1Ev.exit
+  invoke void @_ZN1D16deleteKeyPressedEv()
+          to label %invoke.cont7 unwind label %lpad1
+
+invoke.cont7:                                     ; preds = %_ZN15EditCommandImpl5applyEv.exit
+  ret void
+
+lpad:                                             ; preds = %entry
+  %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  call void @_ZdlPv() #9
+  unreachable
+
+lpad1:                                            ; preds = %_ZN1DC1Ev.exit, %_ZN15EditCommandImpl5applyEv.exit
+  %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          cleanup
+  %6 = load i32* %1, align 4
+  %tobool.i.i.i = icmp eq i32 %6, 0
+  br i1 %tobool.i.i.i, label %_ZN1BI1DED1Ev.exit, label %if.then.i.i.i
+
+if.then.i.i.i:                                    ; preds = %lpad1
+  br i1 undef, label %_ZN1BI1DED1Ev.exit, label %delete.notnull.i.i.i
+
+delete.notnull.i.i.i:                             ; preds = %if.then.i.i.i
+  call void @_ZdlPv() #9
+  unreachable
+
+_ZN1BI1DED1Ev.exit:                               ; preds = %lpad1, %if.then.i.i.i
+  resume { i8*, i32 } undef
+
+terminate.lpad:                                   ; No predecessors!
+  %7 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+          catch i8* null
+  unreachable
+}
+
+define void @_ZN1BI1DEC1EPS0_(%class.B* nocapture %this, %class.D* %p1) unnamed_addr uwtable align 2 {
+entry:
+  %m_ptr.i = getelementptr inbounds %class.B* %this, i64 0, i32 0
+  store %class.D* %p1, %class.D** %m_ptr.i, align 8
+  %_ref.i.i = getelementptr inbounds %class.D* %p1, i64 0, i32 0, i32 0, i32 1, i32 0, i32 0
+  %0 = load i32* %_ref.i.i, align 4
+  %inc.i.i = add nsw i32 %0, 1
+  store i32 %inc.i.i, i32* %_ref.i.i, align 4
+  ret void
+}
+
+declare noalias i8* @_Znwm()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZdlPv()
+
+define %class.D* @_ZN1BI1DEptEv(%class.B* nocapture readonly %this) nounwind readonly uwtable align 2 {
+entry:
+  %m_ptr = getelementptr inbounds %class.B* %this, i64 0, i32 0
+  %0 = load %class.D** %m_ptr, align 8
+  ret %class.D* %0
+}
+
+declare void @_ZN1D16deleteKeyPressedEv()
+
+define void @_ZN1BI1DED1Ev(%class.B* nocapture readonly %this) unnamed_addr uwtable align 2 {
+entry:
+  %m_ptr.i = getelementptr inbounds %class.B* %this, i64 0, i32 0
+  %0 = load %class.D** %m_ptr.i, align 8
+  %_ref.i.i = getelementptr inbounds %class.D* %0, i64 0, i32 0, i32 0, i32 1, i32 0, i32 0
+  %1 = load i32* %_ref.i.i, align 4
+  %tobool.i.i = icmp eq i32 %1, 0
+  br i1 %tobool.i.i, label %_ZN1BI1DED2Ev.exit, label %if.then.i.i
+
+if.then.i.i:                                      ; preds = %entry
+  br i1 undef, label %_ZN1BI1DED2Ev.exit, label %delete.notnull.i.i
+
+delete.notnull.i.i:                               ; preds = %if.then.i.i
+  call void @_ZdlPv() #9
+  unreachable
+
+_ZN1BI1DED2Ev.exit:                               ; preds = %entry, %if.then.i.i
+  ret void
+}
+
+declare hidden void @__clang_call_terminate()
+
+define void @_ZN1BI1DED2Ev(%class.B* nocapture readonly %this) unnamed_addr uwtable align 2 {
+entry:
+  %m_ptr = getelementptr inbounds %class.B* %this, i64 0, i32 0
+  %0 = load %class.D** %m_ptr, align 8
+  %_ref.i = getelementptr inbounds %class.D* %0, i64 0, i32 0, i32 0, i32 1, i32 0, i32 0
+  %1 = load i32* %_ref.i, align 4
+  %tobool.i = icmp eq i32 %1, 0
+  br i1 %tobool.i, label %_ZN1AI1CE5derefEv.exit, label %if.then.i
+
+if.then.i:                                        ; preds = %entry
+  br i1 undef, label %_ZN1AI1CE5derefEv.exit, label %delete.notnull.i
+
+delete.notnull.i:                                 ; preds = %if.then.i
+  call void @_ZdlPv() #9
+  unreachable
+
+_ZN1AI1CE5derefEv.exit:                           ; preds = %entry, %if.then.i
+  ret void
+}
+
+define void @_ZN1AI1CE5derefEv(%class.A* nocapture readonly %this) nounwind uwtable align 2 {
+entry:
+  %_ref = getelementptr inbounds %class.A* %this, i64 0, i32 0
+  %0 = load i32* %_ref, align 4
+  %tobool = icmp eq i32 %0, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  br i1 undef, label %if.end, label %delete.notnull
+
+delete.notnull:                                   ; preds = %if.then
+  call void @_ZdlPv() #9
+  unreachable
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+define void @_ZN1BI1DEC2EPS0_(%class.B* nocapture %this, %class.D* %p1) unnamed_addr uwtable align 2 {
+entry:
+  %m_ptr = getelementptr inbounds %class.B* %this, i64 0, i32 0
+  store %class.D* %p1, %class.D** %m_ptr, align 8
+  %_ref.i = getelementptr inbounds %class.D* %p1, i64 0, i32 0, i32 0, i32 1, i32 0, i32 0
+  %0 = load i32* %_ref.i, align 4
+  %inc.i = add nsw i32 %0, 1
+  store i32 %inc.i, i32* %_ref.i, align 4
+  ret void
+}
+
+define void @_ZN1AI1CE3refEv(%class.A* nocapture %this) nounwind uwtable align 2 {
+entry:
+  %_ref = getelementptr inbounds %class.A* %this, i64 0, i32 0
+  %0 = load i32* %_ref, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %_ref, align 4
+  ret void
+}
diff --git a/test/Transforms/JumpThreading/lit.local.cfg b/test/Transforms/JumpThreading/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Transforms/JumpThreading/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Transforms/JumpThreading/thread-loads.ll b/test/Transforms/JumpThreading/thread-loads.ll
index e651f9a..e5bf64b 100644
--- a/test/Transforms/JumpThreading/thread-loads.ll
+++ b/test/Transforms/JumpThreading/thread-loads.ll
@@ -75,7 +75,7 @@ bb3:		; preds = %bb1
 	ret i32 %res.0
 }
 
-!0 = metadata !{metadata !"int", metadata !1}
+!0 = metadata !{metadata !3, metadata !3, i64 0}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-
+!3 = metadata !{metadata !"int", metadata !1}
diff --git a/test/Transforms/LCSSA/lit.local.cfg b/test/Transforms/LCSSA/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LCSSA/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll b/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll
index 86c2679..7cf7a32 100644
--- a/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll
+++ b/test/Transforms/LICM/2011-04-06-PromoteResultOfPromotion.ll
@@ -30,8 +30,10 @@ for.end:                                          ; preds = %for.inc
   ret void
 }
 
-!0 = metadata !{metadata !"any pointer", metadata !1}
+!0 = metadata !{metadata !5, metadata !5, i64 0}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA", null}
 !3 = metadata !{metadata !"short", metadata !1}
-!4 = metadata !{metadata !"int", metadata !1}
+!4 = metadata !{metadata !6, metadata !6, i64 0}
+!5 = metadata !{metadata !"any pointer", metadata !1}
+!6 = metadata !{metadata !"int", metadata !1}
diff --git a/test/Transforms/LICM/debug-value.ll b/test/Transforms/LICM/debug-value.ll
index 3c70064..e5c774f 100644
--- a/test/Transforms/LICM/debug-value.ll
+++ b/test/Transforms/LICM/debug-value.ll
@@ -33,19 +33,20 @@ for.end104:                                       ; preds = %for.cond.backedge
 
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
+!llvm.module.flags = !{!26}
 !llvm.dbg.sp = !{!0, !6, !9, !10}
 
 !0 = metadata !{i32 589870, metadata !25, metadata !1, metadata !"idamax", metadata !"idamax", metadata !"", i32 112, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 589865, metadata !25} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 589841, metadata !25, i32 12, metadata !"clang version 2.9 (trunk 127169)", i1 true, metadata !"", i32 0, metadata !8, metadata !8, metadata !8, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !25, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 589845, metadata !25, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 589860, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 589870, metadata !25, metadata !1, metadata !"dscal", metadata !"dscal", metadata !"", i32 206, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 589845, metadata !25, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 589845, metadata !25, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{null}
 !9 = metadata !{i32 589870, metadata !25, metadata !1, metadata !"daxpy", metadata !"daxpy", metadata !"", i32 230, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 589870, metadata !25, metadata !1, metadata !"dgefa", metadata !"dgefa", metadata !"", i32 267, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!10 = metadata !{i32 589870, metadata !25, metadata !1, metadata !"dgefa", metadata !"dgefa", metadata !"", i32 267, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 267] [def] [scope 0] [dgefa]
 !11 = metadata !{i32 281, i32 9, metadata !12, null}
 !12 = metadata !{i32 589835, metadata !25, metadata !13, i32 272, i32 5, i32 32} ; [ DW_TAG_lexical_block ]
 !13 = metadata !{i32 589835, metadata !25, metadata !14, i32 271, i32 5, i32 31} ; [ DW_TAG_lexical_block ]
@@ -61,3 +62,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !23 = metadata !{i32 296, i32 13, metadata !17, null}
 !24 = metadata !{i32 313, i32 1, metadata !14, null}
 !25 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/Benchmarks/CoyoteBench/lpbench.c", metadata !"/private/tmp"}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/LICM/lit.local.cfg b/test/Transforms/LICM/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LICM/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LICM/promote-order.ll b/test/Transforms/LICM/promote-order.ll
index b016265..86f11fe 100644
--- a/test/Transforms/LICM/promote-order.ll
+++ b/test/Transforms/LICM/promote-order.ll
@@ -37,5 +37,7 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 }
 
 !0 = metadata !{metadata !"minimal TBAA"}
-!1 = metadata !{metadata !"float", metadata !0}
-!2 = metadata !{metadata !"int", metadata !0}
+!1 = metadata !{metadata !3, metadata !3, i64 0}
+!2 = metadata !{metadata !4, metadata !4, i64 0}
+!3 = metadata !{metadata !"float", metadata !0}
+!4 = metadata !{metadata !"int", metadata !0}
diff --git a/test/Transforms/LICM/scalar_promote.ll b/test/Transforms/LICM/scalar_promote.ll
index b3e45c5..92ef155 100644
--- a/test/Transforms/LICM/scalar_promote.ll
+++ b/test/Transforms/LICM/scalar_promote.ll
@@ -181,7 +181,9 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 ; CHECK-NEXT:  store i32 %inc, i32* %gi, align 4, !tbaa !0
 }
 
-!0 = metadata !{metadata !"int", metadata !1}
+!0 = metadata !{metadata !4, metadata !4, i64 0}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"float", metadata !1}
+!3 = metadata !{metadata !5, metadata !5, i64 0}
+!4 = metadata !{metadata !"int", metadata !1}
+!5 = metadata !{metadata !"float", metadata !1}
diff --git a/test/Transforms/LICM/volatile-alias.ll b/test/Transforms/LICM/volatile-alias.ll
new file mode 100644
index 0000000..886d7f2
--- /dev/null
+++ b/test/Transforms/LICM/volatile-alias.ll
@@ -0,0 +1,54 @@
+; RUN: opt -basicaa -sroa -loop-rotate -licm -S < %s | FileCheck %s
+; The objects *p and *q are aliased to each other, but even though *q is
+; volatile, *p can be considered invariant in the loop. Check if it is moved
+; out of the loop.
+; CHECK: load i32* %p
+; CHECK: for.body:
+; CHECK; load volatile i32* %q
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind uwtable
+define i32 @foo(i32* %p, i32* %q, i32 %n) #0 {
+entry:
+  %p.addr = alloca i32*, align 8
+  %q.addr = alloca i32*, align 8
+  %n.addr = alloca i32, align 4
+  %i = alloca i32, align 4
+  %s = alloca i32, align 4
+  store i32* %p, i32** %p.addr, align 8
+  store i32* %q, i32** %q.addr, align 8
+  store i32 %n, i32* %n.addr, align 4
+  store i32 0, i32* %s, align 4
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32* %i, align 4
+  %1 = load i32* %n.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32** %p.addr, align 8
+  %3 = load i32* %2, align 4
+  %4 = load i32** %q.addr, align 8
+  %5 = load volatile i32* %4, align 4
+  %add = add nsw i32 %3, %5
+  %6 = load i32* %s, align 4
+  %add1 = add nsw i32 %6, %add
+  store i32 %add1, i32* %s, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %7 = load i32* %i, align 4
+  %inc = add nsw i32 %7, 1
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %8 = load i32* %s, align 4
+  ret i32 %8
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/LoopDeletion/lit.local.cfg b/test/Transforms/LoopDeletion/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LoopDeletion/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LoopIdiom/X86/lit.local.cfg b/test/Transforms/LoopIdiom/X86/lit.local.cfg
index a8ad0f1..ba763cf 100644
--- a/test/Transforms/LoopIdiom/X86/lit.local.cfg
+++ b/test/Transforms/LoopIdiom/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Transforms/LoopIdiom/basic-address-space.ll b/test/Transforms/LoopIdiom/basic-address-space.ll
new file mode 100644
index 0000000..697ab37
--- /dev/null
+++ b/test/Transforms/LoopIdiom/basic-address-space.ll
@@ -0,0 +1,91 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-p1:64:64:64-p2:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+; Two dimensional nested loop should be promoted to one big memset.
+define void @test10(i8 addrspace(2)* %X) nounwind ssp {
+; CHECK-LABEL: @test10(
+; CHECK: entry:
+; CHECK-NEXT: call void @llvm.memset.p2i8.i16(i8 addrspace(2)* %X, i8 0, i16 10000, i32 1, i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+
+entry:
+  br label %bb.nph
+
+bb.nph:                                           ; preds = %entry, %for.inc10
+  %i.04 = phi i16 [ 0, %entry ], [ %inc12, %for.inc10 ]
+  br label %for.body5
+
+for.body5:                                        ; preds = %for.body5, %bb.nph
+  %j.02 = phi i16 [ 0, %bb.nph ], [ %inc, %for.body5 ]
+  %mul = mul nsw i16 %i.04, 100
+  %add = add nsw i16 %j.02, %mul
+  %arrayidx = getelementptr inbounds i8 addrspace(2)* %X, i16 %add
+  store i8 0, i8 addrspace(2)* %arrayidx, align 1
+  %inc = add nsw i16 %j.02, 1
+  %cmp4 = icmp eq i16 %inc, 100
+  br i1 %cmp4, label %for.inc10, label %for.body5
+
+for.inc10:                                        ; preds = %for.body5
+  %inc12 = add nsw i16 %i.04, 1
+  %cmp = icmp eq i16 %inc12, 100
+  br i1 %cmp, label %for.end13, label %bb.nph
+
+for.end13:                                        ; preds = %for.inc10
+  ret void
+}
+
+define void @test11_pattern(i32 addrspace(2)* nocapture %P) nounwind ssp {
+; CHECK-LABEL: @test11_pattern(
+; CHECK-NOT: memset_pattern
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
+  %arrayidx = getelementptr i32 addrspace(2)* %P, i64 %indvar
+  store i32 1, i32 addrspace(2)* %arrayidx, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; PR9815 - This is a partial overlap case that cannot be safely transformed
+; into a memcpy.
+@g_50 = addrspace(2) global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16
+
+
+define i32 @test14() nounwind {
+; CHECK-LABEL: @test14(
+; CHECK: for.body:
+; CHECK: load i32
+; CHECK: store i32
+; CHECK: br i1 %cmp
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %for.body.lr.ph
+  %tmp5 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %add = add nsw i32 %tmp5, 4
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds [7 x i32] addrspace(2)* @g_50, i32 0, i64 %idxprom
+  %tmp2 = load i32 addrspace(2)* %arrayidx, align 4
+  %add4 = add nsw i32 %tmp5, 5
+  %idxprom5 = sext i32 %add4 to i64
+  %arrayidx6 = getelementptr inbounds [7 x i32] addrspace(2)* @g_50, i32 0, i64 %idxprom5
+  store i32 %tmp2, i32 addrspace(2)* %arrayidx6, align 4
+  %inc = add nsw i32 %tmp5, 1
+  %cmp = icmp slt i32 %inc, 2
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  %tmp8 = load i32 addrspace(2)* getelementptr inbounds ([7 x i32] addrspace(2)* @g_50, i32 0, i64 6), align 4
+  ret i32 %tmp8
+}
+
diff --git a/test/Transforms/LoopIdiom/debug-line.ll b/test/Transforms/LoopIdiom/debug-line.ll
index 2337590..ef4a478 100644
--- a/test/Transforms/LoopIdiom/debug-line.ll
+++ b/test/Transforms/LoopIdiom/debug-line.ll
@@ -27,12 +27,13 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
+!llvm.module.flags = !{!19}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !18, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (double*)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 589870, metadata !18, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (double*)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
 !1 = metadata !{i32 589865, metadata !18} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 589841, metadata !18, i32 12, metadata !"clang version 2.9 (trunk 127165:127174)", i1 true, metadata !"", i32 0, metadata !9, metadata !9, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !18, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 589845, metadata !18, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 590081, metadata !0, metadata !"a", metadata !1, i32 16777218, metadata !6, i32 0} ; [ DW_TAG_arg_variable ]
 !6 = metadata !{i32 589839, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ]
@@ -48,3 +49,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !16 = metadata !{i32 3, i32 29, metadata !11, null}
 !17 = metadata !{i32 5, i32 1, metadata !12, null}
 !18 = metadata !{metadata !"li.c", metadata !"/private/tmp"}
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/LoopIdiom/lit.local.cfg b/test/Transforms/LoopIdiom/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LoopIdiom/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LoopReroll/basic.ll b/test/Transforms/LoopReroll/basic.ll
new file mode 100644
index 0000000..314a149
--- /dev/null
+++ b/test/Transforms/LoopReroll/basic.ll
@@ -0,0 +1,327 @@
+; RUN: opt < %s -loop-reroll -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; int foo(int a);
+; void bar(int *x) {
+;   for (int i = 0; i < 500; i += 3) {
+;     foo(i);
+;     foo(i+1);
+;     foo(i+2);
+;   }
+; }
+
+; Function Attrs: nounwind uwtable
+define void @bar(i32* nocapture readnone %x) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %call = tail call i32 @foo(i32 %i.08) #1
+  %add = add nsw i32 %i.08, 1
+  %call1 = tail call i32 @foo(i32 %add) #1
+  %add2 = add nsw i32 %i.08, 2
+  %call3 = tail call i32 @foo(i32 %add2) #1
+  %add3 = add nsw i32 %i.08, 3
+  %exitcond = icmp eq i32 %add3, 500
+  br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK-LABEL: @bar
+
+; CHECK: for.body:
+; CHECK: %indvar = phi i32 [ %indvar.next, %for.body ], [ 0, %entry ]
+; CHECK: %call = tail call i32 @foo(i32 %indvar) #1
+; CHECK: %indvar.next = add i32 %indvar, 1
+; CHECK: %exitcond1 = icmp eq i32 %indvar.next, 498
+; CHECK: br i1 %exitcond1, label %for.end, label %for.body
+
+; CHECK: ret
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare i32 @foo(i32)
+
+; void hi1(int *x) {
+;   for (int i = 0; i < 1500; i += 3) {
+;     x[i] = foo(0);
+;     x[i+1] = foo(0);
+;     x[i+2] = foo(0);
+;   }
+; }
+
+; Function Attrs: nounwind uwtable
+define void @hi1(i32* nocapture %x) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %call = tail call i32 @foo(i32 0) #1
+  %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv
+  store i32 %call, i32* %arrayidx, align 4
+  %call1 = tail call i32 @foo(i32 0) #1
+  %0 = add nsw i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds i32* %x, i64 %0
+  store i32 %call1, i32* %arrayidx3, align 4
+  %call4 = tail call i32 @foo(i32 0) #1
+  %1 = add nsw i64 %indvars.iv, 2
+  %arrayidx7 = getelementptr inbounds i32* %x, i64 %1
+  store i32 %call4, i32* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3
+  %2 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %2, 1500
+  br i1 %cmp, label %for.body, label %for.end
+
+; CHECK-LABEL: @hi1
+
+; CHECK: for.body:
+; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
+; CHECK: %call = tail call i32 @foo(i32 0) #1
+; CHECK: %arrayidx = getelementptr inbounds i32* %x, i64 %indvar
+; CHECK: store i32 %call, i32* %arrayidx, align 4
+; CHECK: %indvar.next = add i64 %indvar, 1
+; CHECK: %exitcond = icmp eq i64 %indvar.next, 1500
+; CHECK: br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK: ret
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; void hi2(int *x) {
+;   for (int i = 0; i < 500; ++i) {
+;     x[3*i] = foo(0);
+;     x[3*i+1] = foo(0);
+;     x[3*i+2] = foo(0);
+;   }
+; }
+
+; Function Attrs: nounwind uwtable
+define void @hi2(i32* nocapture %x) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %call = tail call i32 @foo(i32 0) #1
+  %0 = mul nsw i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds i32* %x, i64 %0
+  store i32 %call, i32* %arrayidx, align 4
+  %call1 = tail call i32 @foo(i32 0) #1
+  %1 = add nsw i64 %0, 1
+  %arrayidx4 = getelementptr inbounds i32* %x, i64 %1
+  store i32 %call1, i32* %arrayidx4, align 4
+  %call5 = tail call i32 @foo(i32 0) #1
+  %2 = add nsw i64 %0, 2
+  %arrayidx9 = getelementptr inbounds i32* %x, i64 %2
+  store i32 %call5, i32* %arrayidx9, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 500
+  br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK-LABEL: @hi2
+
+; CHECK: for.body:
+; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK: %call = tail call i32 @foo(i32 0) #1
+; CHECK: %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv
+; CHECK: store i32 %call, i32* %arrayidx, align 4
+; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK: %exitcond1 = icmp eq i64 %indvars.iv.next, 1500
+; CHECK: br i1 %exitcond1, label %for.end, label %for.body
+
+; CHECK: ret
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; void goo(float alpha, float *a, float *b) {
+;   for (int i = 0; i < 3200; i += 5) {
+;     a[i] += alpha * b[i];
+;     a[i + 1] += alpha * b[i + 1];
+;     a[i + 2] += alpha * b[i + 2];
+;     a[i + 3] += alpha * b[i + 3];
+;     a[i + 4] += alpha * b[i + 4];
+;   }
+; }
+
+; Function Attrs: nounwind uwtable
+define void @goo(float %alpha, float* nocapture %a, float* nocapture readonly %b) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %mul = fmul float %0, %alpha
+  %arrayidx2 = getelementptr inbounds float* %a, i64 %indvars.iv
+  %1 = load float* %arrayidx2, align 4
+  %add = fadd float %1, %mul
+  store float %add, float* %arrayidx2, align 4
+  %2 = add nsw i64 %indvars.iv, 1
+  %arrayidx5 = getelementptr inbounds float* %b, i64 %2
+  %3 = load float* %arrayidx5, align 4
+  %mul6 = fmul float %3, %alpha
+  %arrayidx9 = getelementptr inbounds float* %a, i64 %2
+  %4 = load float* %arrayidx9, align 4
+  %add10 = fadd float %4, %mul6
+  store float %add10, float* %arrayidx9, align 4
+  %5 = add nsw i64 %indvars.iv, 2
+  %arrayidx13 = getelementptr inbounds float* %b, i64 %5
+  %6 = load float* %arrayidx13, align 4
+  %mul14 = fmul float %6, %alpha
+  %arrayidx17 = getelementptr inbounds float* %a, i64 %5
+  %7 = load float* %arrayidx17, align 4
+  %add18 = fadd float %7, %mul14
+  store float %add18, float* %arrayidx17, align 4
+  %8 = add nsw i64 %indvars.iv, 3
+  %arrayidx21 = getelementptr inbounds float* %b, i64 %8
+  %9 = load float* %arrayidx21, align 4
+  %mul22 = fmul float %9, %alpha
+  %arrayidx25 = getelementptr inbounds float* %a, i64 %8
+  %10 = load float* %arrayidx25, align 4
+  %add26 = fadd float %10, %mul22
+  store float %add26, float* %arrayidx25, align 4
+  %11 = add nsw i64 %indvars.iv, 4
+  %arrayidx29 = getelementptr inbounds float* %b, i64 %11
+  %12 = load float* %arrayidx29, align 4
+  %mul30 = fmul float %12, %alpha
+  %arrayidx33 = getelementptr inbounds float* %a, i64 %11
+  %13 = load float* %arrayidx33, align 4
+  %add34 = fadd float %13, %mul30
+  store float %add34, float* %arrayidx33, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
+  %14 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %14, 3200
+  br i1 %cmp, label %for.body, label %for.end
+
+; CHECK-LABEL: @goo
+
+; CHECK: for.body:
+; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
+; CHECK: %arrayidx = getelementptr inbounds float* %b, i64 %indvar
+; CHECK: %0 = load float* %arrayidx, align 4
+; CHECK: %mul = fmul float %0, %alpha
+; CHECK: %arrayidx2 = getelementptr inbounds float* %a, i64 %indvar
+; CHECK: %1 = load float* %arrayidx2, align 4
+; CHECK: %add = fadd float %1, %mul
+; CHECK: store float %add, float* %arrayidx2, align 4
+; CHECK: %indvar.next = add i64 %indvar, 1
+; CHECK: %exitcond = icmp eq i64 %indvar.next, 3200
+; CHECK: br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK: ret
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; void hoo(float alpha, float *a, float *b, int *ip) {
+;   for (int i = 0; i < 3200; i += 5) {
+;     a[i] += alpha * b[ip[i]];
+;     a[i + 1] += alpha * b[ip[i + 1]];
+;     a[i + 2] += alpha * b[ip[i + 2]];
+;     a[i + 3] += alpha * b[ip[i + 3]];
+;     a[i + 4] += alpha * b[ip[i + 4]];
+;   }
+; }
+
+; Function Attrs: nounwind uwtable
+define void @hoo(float %alpha, float* nocapture %a, float* nocapture readonly %b, i32* nocapture readonly %ip) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %ip, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %0 to i64
+  %arrayidx2 = getelementptr inbounds float* %b, i64 %idxprom1
+  %1 = load float* %arrayidx2, align 4
+  %mul = fmul float %1, %alpha
+  %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv
+  %2 = load float* %arrayidx4, align 4
+  %add = fadd float %2, %mul
+  store float %add, float* %arrayidx4, align 4
+  %3 = add nsw i64 %indvars.iv, 1
+  %arrayidx7 = getelementptr inbounds i32* %ip, i64 %3
+  %4 = load i32* %arrayidx7, align 4
+  %idxprom8 = sext i32 %4 to i64
+  %arrayidx9 = getelementptr inbounds float* %b, i64 %idxprom8
+  %5 = load float* %arrayidx9, align 4
+  %mul10 = fmul float %5, %alpha
+  %arrayidx13 = getelementptr inbounds float* %a, i64 %3
+  %6 = load float* %arrayidx13, align 4
+  %add14 = fadd float %6, %mul10
+  store float %add14, float* %arrayidx13, align 4
+  %7 = add nsw i64 %indvars.iv, 2
+  %arrayidx17 = getelementptr inbounds i32* %ip, i64 %7
+  %8 = load i32* %arrayidx17, align 4
+  %idxprom18 = sext i32 %8 to i64
+  %arrayidx19 = getelementptr inbounds float* %b, i64 %idxprom18
+  %9 = load float* %arrayidx19, align 4
+  %mul20 = fmul float %9, %alpha
+  %arrayidx23 = getelementptr inbounds float* %a, i64 %7
+  %10 = load float* %arrayidx23, align 4
+  %add24 = fadd float %10, %mul20
+  store float %add24, float* %arrayidx23, align 4
+  %11 = add nsw i64 %indvars.iv, 3
+  %arrayidx27 = getelementptr inbounds i32* %ip, i64 %11
+  %12 = load i32* %arrayidx27, align 4
+  %idxprom28 = sext i32 %12 to i64
+  %arrayidx29 = getelementptr inbounds float* %b, i64 %idxprom28
+  %13 = load float* %arrayidx29, align 4
+  %mul30 = fmul float %13, %alpha
+  %arrayidx33 = getelementptr inbounds float* %a, i64 %11
+  %14 = load float* %arrayidx33, align 4
+  %add34 = fadd float %14, %mul30
+  store float %add34, float* %arrayidx33, align 4
+  %15 = add nsw i64 %indvars.iv, 4
+  %arrayidx37 = getelementptr inbounds i32* %ip, i64 %15
+  %16 = load i32* %arrayidx37, align 4
+  %idxprom38 = sext i32 %16 to i64
+  %arrayidx39 = getelementptr inbounds float* %b, i64 %idxprom38
+  %17 = load float* %arrayidx39, align 4
+  %mul40 = fmul float %17, %alpha
+  %arrayidx43 = getelementptr inbounds float* %a, i64 %15
+  %18 = load float* %arrayidx43, align 4
+  %add44 = fadd float %18, %mul40
+  store float %add44, float* %arrayidx43, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
+  %19 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %19, 3200
+  br i1 %cmp, label %for.body, label %for.end
+
+; CHECK-LABEL: @hoo
+
+; CHECK: for.body:
+; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
+; CHECK: %arrayidx = getelementptr inbounds i32* %ip, i64 %indvar
+; CHECK: %0 = load i32* %arrayidx, align 4
+; CHECK: %idxprom1 = sext i32 %0 to i64
+; CHECK: %arrayidx2 = getelementptr inbounds float* %b, i64 %idxprom1
+; CHECK: %1 = load float* %arrayidx2, align 4
+; CHECK: %mul = fmul float %1, %alpha
+; CHECK: %arrayidx4 = getelementptr inbounds float* %a, i64 %indvar
+; CHECK: %2 = load float* %arrayidx4, align 4
+; CHECK: %add = fadd float %2, %mul
+; CHECK: store float %add, float* %arrayidx4, align 4
+; CHECK: %indvar.next = add i64 %indvar, 1
+; CHECK: %exitcond = icmp eq i64 %indvar.next, 3200
+; CHECK: br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK: ret
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
diff --git a/test/Transforms/LoopReroll/reduction.ll b/test/Transforms/LoopReroll/reduction.ll
new file mode 100644
index 0000000..aed7670
--- /dev/null
+++ b/test/Transforms/LoopReroll/reduction.ll
@@ -0,0 +1,96 @@
+; RUN: opt < %s -loop-reroll -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo(i32* nocapture readonly %x) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.029 = phi i32 [ 0, %entry ], [ %add12, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %r.029
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds i32* %x, i64 %1
+  %2 = load i32* %arrayidx3, align 4
+  %add4 = add nsw i32 %add, %2
+  %3 = or i64 %indvars.iv, 2
+  %arrayidx7 = getelementptr inbounds i32* %x, i64 %3
+  %4 = load i32* %arrayidx7, align 4
+  %add8 = add nsw i32 %add4, %4
+  %5 = or i64 %indvars.iv, 3
+  %arrayidx11 = getelementptr inbounds i32* %x, i64 %5
+  %6 = load i32* %arrayidx11, align 4
+  %add12 = add nsw i32 %add8, %6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %7 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %7, 400
+  br i1 %cmp, label %for.body, label %for.end
+
+; CHECK-LABEL: @foo
+
+; CHECK: for.body:
+; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
+; CHECK: %r.029 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+; CHECK: %arrayidx = getelementptr inbounds i32* %x, i64 %indvar
+; CHECK: %0 = load i32* %arrayidx, align 4
+; CHECK: %add = add nsw i32 %0, %r.029
+; CHECK: %indvar.next = add i64 %indvar, 1
+; CHECK: %exitcond = icmp eq i64 %indvar.next, 400
+; CHECK: br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK: ret
+
+for.end:                                          ; preds = %for.body
+  ret i32 %add12
+}
+
+define float @bar(float* nocapture readonly %x) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.029 = phi float [ 0.0, %entry ], [ %add12, %for.body ]
+  %arrayidx = getelementptr inbounds float* %x, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %add = fadd float %0, %r.029
+  %1 = or i64 %indvars.iv, 1
+  %arrayidx3 = getelementptr inbounds float* %x, i64 %1
+  %2 = load float* %arrayidx3, align 4
+  %add4 = fadd float %add, %2
+  %3 = or i64 %indvars.iv, 2
+  %arrayidx7 = getelementptr inbounds float* %x, i64 %3
+  %4 = load float* %arrayidx7, align 4
+  %add8 = fadd float %add4, %4
+  %5 = or i64 %indvars.iv, 3
+  %arrayidx11 = getelementptr inbounds float* %x, i64 %5
+  %6 = load float* %arrayidx11, align 4
+  %add12 = fadd float %add8, %6
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %7 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %7, 400
+  br i1 %cmp, label %for.body, label %for.end
+
+; CHECK-LABEL: @bar
+
+; CHECK: for.body:
+; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
+; CHECK: %r.029 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+; CHECK: %arrayidx = getelementptr inbounds float* %x, i64 %indvar
+; CHECK: %0 = load float* %arrayidx, align 4
+; CHECK: %add = fadd float %0, %r.029
+; CHECK: %indvar.next = add i64 %indvar, 1
+; CHECK: %exitcond = icmp eq i64 %indvar.next, 400
+; CHECK: br i1 %exitcond, label %for.end, label %for.body
+
+; CHECK: ret
+
+for.end:                                          ; preds = %for.body
+  ret float %add12
+}
+
+attributes #0 = { nounwind readonly uwtable }
+
diff --git a/test/Transforms/LoopRotate/dbgvalue.ll b/test/Transforms/LoopRotate/dbgvalue.ll
index 3434cdc..9461980 100644
--- a/test/Transforms/LoopRotate/dbgvalue.ll
+++ b/test/Transforms/LoopRotate/dbgvalue.ll
@@ -77,12 +77,13 @@ for.end:
   ret void
 }
 
+!llvm.module.flags = !{!20}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !18, metadata !1, metadata !"tak", metadata !"tak", metadata !"", i32 32, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32, i32, i32)* @tak, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 589870, metadata !18, metadata !1, metadata !"tak", metadata !"tak", metadata !"", i32 32, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i32, i32)* @tak, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 32] [def] [scope 0] [tak]
 !1 = metadata !{i32 589865, metadata !18} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 589841, metadata !18, i32 12, metadata !"clang version 2.9 (trunk 125492)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !18, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 589845, metadata !18, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 589860, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 590081, metadata !0, metadata !"x", metadata !1, i32 32, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
@@ -99,3 +100,4 @@ for.end:
 !17 = metadata !{i32 37, i32 1, metadata !13, null}
 !18 = metadata !{metadata !"/Volumes/Lalgate/cj/llvm/projects/llvm-test/SingleSource/Benchmarks/BenchmarkGame/recursive.c", metadata !"/Volumes/Lalgate/cj/D/projects/llvm-test/SingleSource/Benchmarks/BenchmarkGame"}
 !19 = metadata !{i32 0}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/LoopRotate/lit.local.cfg b/test/Transforms/LoopRotate/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LoopRotate/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LoopSimplify/dup-preds.ll b/test/Transforms/LoopSimplify/dup-preds.ll
new file mode 100644
index 0000000..3d1f149
--- /dev/null
+++ b/test/Transforms/LoopSimplify/dup-preds.ll
@@ -0,0 +1,46 @@
+; RUN: opt -loop-simplify -S %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define fastcc void @do_update_md([3 x float]* nocapture readonly %x) #0 {
+entry:
+  br i1 undef, label %if.end365, label %lor.lhs.false134
+
+lor.lhs.false134:                                 ; preds = %entry
+  br i1 undef, label %lor.lhs.false138, label %if.end365
+
+lor.lhs.false138:                                 ; preds = %lor.lhs.false134
+  br i1 undef, label %lor.lhs.false142, label %if.end365
+
+lor.lhs.false142:                                 ; preds = %lor.lhs.false138
+  br i1 undef, label %for.body276.lr.ph, label %if.end365
+
+for.body276.lr.ph:                                ; preds = %lor.lhs.false142
+  switch i16 undef, label %if.then288 [
+    i16 4, label %for.body344
+    i16 2, label %for.body344
+  ]
+
+if.then288:                                       ; preds = %for.body276.lr.ph
+  br label %for.body305
+
+for.body305:                                      ; preds = %for.body305, %if.then288
+  br label %for.body305
+
+for.body344:                                      ; preds = %for.body344, %for.body276.lr.ph, %for.body276.lr.ph
+  %indvar = phi i64 [ %indvar.next, %for.body344 ], [ 0, %for.body276.lr.ph ]
+  %indvars.iv552 = phi i64 [ %indvars.iv.next553, %for.body344 ], [ 0, %for.body276.lr.ph ], [ 0, %for.body276.lr.ph ]
+  %indvars.iv.next553 = add nuw nsw i64 %indvars.iv552, 1
+  %indvar.next = add i64 %indvar, 1
+  br label %for.body344
+
+; CHECK-LABEL: @do_update_md
+; CHECK: %indvars.iv552 = phi i64 [ %indvars.iv.next553, %for.body344 ], [ 0, %for.body344.preheader ]
+; CHECK: ret
+
+if.end365:                                        ; preds = %lor.lhs.false142, %lor.lhs.false138, %lor.lhs.false134, %entry
+  ret void
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/Transforms/LoopSimplify/lit.local.cfg b/test/Transforms/LoopSimplify/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LoopSimplify/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg b/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg
index bac2ffa..8a3ba96 100644
--- a/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg
+++ b/test/Transforms/LoopStrengthReduce/ARM/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll']
-
 targets = set(config.root.targets_to_build.split())
 if not 'ARM' in targets:
     config.unsupported = True
diff --git a/test/Transforms/LoopStrengthReduce/X86/2011-12-04-loserreg.ll b/test/Transforms/LoopStrengthReduce/X86/2011-12-04-loserreg.ll
index fad5241..001a1d6 100644
--- a/test/Transforms/LoopStrengthReduce/X86/2011-12-04-loserreg.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/2011-12-04-loserreg.ll
@@ -1,6 +1,6 @@
 ; RUN: opt < %s -loop-reduce -S | FileCheck %s
 ;
-; Test LSR's ability to prune formulae that refer to nonexistant
+; Test LSR's ability to prune formulae that refer to nonexistent
 ; AddRecs in other loops.
 ;
 ; Unable to reduce this case further because it requires LSR to exceed
diff --git a/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg b/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg
index da2db5a..ba763cf 100644
--- a/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg
+++ b/test/Transforms/LoopStrengthReduce/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Transforms/LoopStrengthReduce/addrec-gep-address-space.ll b/test/Transforms/LoopStrengthReduce/addrec-gep-address-space.ll
new file mode 100644
index 0000000..6333291
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/addrec-gep-address-space.ll
@@ -0,0 +1,88 @@
+; RUN: opt < %s -loop-reduce -S | FileCheck %s
+; CHECK: bb1:
+; CHECK: load double addrspace(1)* [[IV:%[^,]+]]
+; CHECK: store double {{.*}}, double addrspace(1)* [[IV]]
+
+; CHECK-NOT: cast
+; Make sure the GEP has the right index type
+; CHECK: getelementptr double addrspace(1)* [[IV]], i16 1
+; CHECK: br {{.*}} label %bb1
+
+; Make sure the GEP has the right index type
+; CHECK: getelementptr double addrspace(1)* {{.*}}, i16
+
+
+; This test tests several things. The load and store should use the
+; same address instead of having it computed twice, and SCEVExpander should
+; be able to reconstruct the full getelementptr, despite it having a few
+; obstacles set in its way.
+; We only check that the inner loop (bb1-bb2) is "reduced" because LSR
+; currently only operates on inner loops.
+
+target datalayout = "e-p:64:64:64-p1:16:16:16-n16:32:64"
+
+define void @foo(i64 %n, i64 %m, i64 %o, i64 %q, double addrspace(1)* nocapture %p) nounwind {
+entry:
+	%tmp = icmp sgt i64 %n, 0		; <i1> [#uses=1]
+	br i1 %tmp, label %bb.nph3, label %return
+
+bb.nph:		; preds = %bb2.preheader
+	%tmp1 = mul i64 %tmp16, %i.02		; <i64> [#uses=1]
+	%tmp2 = mul i64 %tmp19, %i.02		; <i64> [#uses=1]
+	br label %bb1
+
+bb1:		; preds = %bb2, %bb.nph
+	%j.01 = phi i64 [ %tmp9, %bb2 ], [ 0, %bb.nph ]		; <i64> [#uses=3]
+	%tmp3 = add i64 %j.01, %tmp1		; <i64> [#uses=1]
+	%tmp4 = add i64 %j.01, %tmp2		; <i64> [#uses=1]
+        %z0 = add i64 %tmp3, 5203
+	%tmp5 = getelementptr double addrspace(1)* %p, i64 %z0		; <double addrspace(1)*> [#uses=1]
+	%tmp6 = load double addrspace(1)* %tmp5, align 8		; <double> [#uses=1]
+	%tmp7 = fdiv double %tmp6, 2.100000e+00		; <double> [#uses=1]
+        %z1 = add i64 %tmp4, 5203
+	%tmp8 = getelementptr double addrspace(1)* %p, i64 %z1		; <double addrspace(1)*> [#uses=1]
+	store double %tmp7, double addrspace(1)* %tmp8, align 8
+	%tmp9 = add i64 %j.01, 1		; <i64> [#uses=2]
+	br label %bb2
+
+bb2:		; preds = %bb1
+	%tmp10 = icmp slt i64 %tmp9, %m		; <i1> [#uses=1]
+	br i1 %tmp10, label %bb1, label %bb2.bb3_crit_edge
+
+bb2.bb3_crit_edge:		; preds = %bb2
+	br label %bb3
+
+bb3:		; preds = %bb2.preheader, %bb2.bb3_crit_edge
+	%tmp11 = add i64 %i.02, 1		; <i64> [#uses=2]
+	br label %bb4
+
+bb4:		; preds = %bb3
+	%tmp12 = icmp slt i64 %tmp11, %n		; <i1> [#uses=1]
+	br i1 %tmp12, label %bb2.preheader, label %bb4.return_crit_edge
+
+bb4.return_crit_edge:		; preds = %bb4
+	br label %bb4.return_crit_edge.split
+
+bb4.return_crit_edge.split:		; preds = %bb.nph3, %bb4.return_crit_edge
+	br label %return
+
+bb.nph3:		; preds = %entry
+	%tmp13 = icmp sgt i64 %m, 0		; <i1> [#uses=1]
+	%tmp14 = mul i64 %n, 37		; <i64> [#uses=1]
+	%tmp15 = mul i64 %tmp14, %o		; <i64> [#uses=1]
+	%tmp16 = mul i64 %tmp15, %q		; <i64> [#uses=1]
+	%tmp17 = mul i64 %n, 37		; <i64> [#uses=1]
+	%tmp18 = mul i64 %tmp17, %o		; <i64> [#uses=1]
+	%tmp19 = mul i64 %tmp18, %q		; <i64> [#uses=1]
+	br i1 %tmp13, label %bb.nph3.split, label %bb4.return_crit_edge.split
+
+bb.nph3.split:		; preds = %bb.nph3
+	br label %bb2.preheader
+
+bb2.preheader:		; preds = %bb.nph3.split, %bb4
+	%i.02 = phi i64 [ %tmp11, %bb4 ], [ 0, %bb.nph3.split ]		; <i64> [#uses=3]
+	br i1 true, label %bb.nph, label %bb3
+
+return:		; preds = %bb4.return_crit_edge.split, %entry
+	ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/address-space-loop.ll b/test/Transforms/LoopStrengthReduce/address-space-loop.ll
new file mode 100644
index 0000000..9c1b213
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/address-space-loop.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -loop-reduce < %s | FileCheck %s
+
+; LSR shouldn't consider %t8 to be an interesting user of %t6, and it
+; should be able to form pretty GEPs.
+
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; Copy of uglygep with a different address space
+; This tests expandAddToGEP uses the right smaller integer type for
+; another address space
+define void @Z4() nounwind {
+; CHECK-LABEL: @Z4(
+bb:
+  br label %bb3
+
+bb1:                                              ; preds = %bb3
+  br i1 undef, label %bb10, label %bb2
+
+bb2:                                              ; preds = %bb1
+  %t = add i16 %t4, 1                         ; <i16> [#uses=1]
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  %t4 = phi i16 [ %t, %bb2 ], [ 0, %bb ]      ; <i16> [#uses=3]
+  br label %bb1
+
+; CHECK: bb10:
+; CHECK-NEXT: %t7 = icmp eq i16 %t4, 0
+; Host %t2 computation outside the loop.
+; CHECK-NEXT: [[SCEVGEP:%[^ ]+]] = getelementptr i8 addrspace(1)* undef, i16 %t4
+; CHECK-NEXT: br label %bb14
+bb10:                                             ; preds = %bb9
+  %t7 = icmp eq i16 %t4, 0                    ; <i1> [#uses=1]
+  %t3 = add i16 %t4, 16                     ; <i16> [#uses=1]
+  br label %bb14
+
+; CHECK: bb14:
+; CHECK-NEXT: store i8 undef, i8 addrspace(1)* [[SCEVGEP]]
+; CHECK-NEXT: %t6 = load float addrspace(1)* addrspace(1)* undef
+; Fold %t3's add within the address.
+; CHECK-NEXT: [[SCEVGEP1:%[^ ]+]] = getelementptr float addrspace(1)* %t6, i16 4
+; CHECK-NEXT: [[SCEVGEP2:%[^ ]+]] = bitcast float addrspace(1)* [[SCEVGEP1]] to i8 addrspace(1)*
+; Use the induction variable (%t4) to access the right element
+; CHECK-NEXT: [[ADDRESS:%[^ ]+]] = getelementptr i8 addrspace(1)* [[SCEVGEP2]], i16 %t4
+; CHECK-NEXT: store i8 undef, i8 addrspace(1)* [[ADDRESS]]
+; CHECK-NEXT: br label %bb14
+bb14:                                             ; preds = %bb14, %bb10
+  %t2 = getelementptr inbounds i8 addrspace(1)* undef, i16 %t4 ; <i8*> [#uses=1]
+  store i8 undef, i8 addrspace(1)* %t2
+  %t6 = load float addrspace(1)* addrspace(1)* undef
+  %t8 = bitcast float addrspace(1)* %t6 to i8 addrspace(1)*              ; <i8*> [#uses=1]
+  %t9 = getelementptr inbounds i8 addrspace(1)* %t8, i16 %t3 ; <i8*> [#uses=1]
+  store i8 undef, i8 addrspace(1)* %t9
+  br label %bb14
+}
+
diff --git a/test/Transforms/LoopStrengthReduce/dominate-assert.ll b/test/Transforms/LoopStrengthReduce/dominate-assert.ll
index ff8cab8..3ba93ff 100644
--- a/test/Transforms/LoopStrengthReduce/dominate-assert.ll
+++ b/test/Transforms/LoopStrengthReduce/dominate-assert.ll
@@ -68,3 +68,46 @@ bb7:
           catch i8* null
   ret void
 }
+
+; PR17425
+define void @i() {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %c.0 = phi i16* [ undef, %entry ], [ %incdec.ptr, %while.cond ]
+  %incdec.ptr = getelementptr inbounds i16* %c.0, i64 1
+  br i1 undef, label %while.cond1, label %while.cond
+
+while.cond1:                                      ; preds = %while.cond1, %while.cond
+  %c.1 = phi i16* [ %incdec.ptr5, %while.cond1 ], [ %c.0, %while.cond ]
+  %incdec.ptr5 = getelementptr inbounds i16* %c.1, i64 1
+  br i1 undef, label %while.cond7, label %while.cond1
+
+while.cond7:                                      ; preds = %while.cond7, %while.cond1
+  %0 = phi i16* [ %incdec.ptr10, %while.cond7 ], [ %c.1, %while.cond1 ]
+  %incdec.ptr10 = getelementptr inbounds i16* %0, i64 1
+  br i1 undef, label %while.cond12.preheader, label %while.cond7
+
+while.cond12.preheader:                           ; preds = %while.cond7
+  br i1 undef, label %while.end16, label %while.body13.lr.ph
+
+while.body13:                                     ; preds = %if.else, %while.body13.lr.ph
+  %1 = phi i16* [ %2, %while.body13.lr.ph ], [ %incdec.ptr15, %if.else ]
+  br i1 undef, label %while.cond12.outer.loopexit, label %if.else
+
+while.cond12.outer.loopexit:                      ; preds = %while.body13
+  br i1 undef, label %while.end16, label %while.body13.lr.ph
+
+while.body13.lr.ph:                               ; preds = %while.cond12.outer.loopexit, %while.cond12.preheader
+  %2 = phi i16* [ %1, %while.cond12.outer.loopexit ], [ undef, %while.cond12.preheader ]
+  br label %while.body13
+
+if.else:                                          ; preds = %while.body13
+  %incdec.ptr15 = getelementptr inbounds i16* %1, i64 1
+  %cmp = icmp eq i16* %incdec.ptr15, %0
+  br i1 %cmp, label %while.end16, label %while.body13
+
+while.end16:                                      ; preds = %if.else, %while.cond12.outer.loopexit, %while.cond12.preheader
+  ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/lit.local.cfg b/test/Transforms/LoopStrengthReduce/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LoopStrengthReduce/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LoopStrengthReduce/lsr-expand-quadratic.ll b/test/Transforms/LoopStrengthReduce/lsr-expand-quadratic.ll
new file mode 100644
index 0000000..255cf41
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/lsr-expand-quadratic.ll
@@ -0,0 +1,42 @@
+; RUN: opt -loop-reduce -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; PR15470: LSR miscompile. The test2 function should return '1'.
+;
+; SCEV expander cannot expand quadratic recurrences outside of the
+; loop. This recurrence depends on %sub.us, so can't be expanded.
+;
+; CHECK-LABEL: @test2
+; CHECK-LABEL: test2.loop:
+; CHECK: %lsr.iv = phi i32 [ %lsr.iv.next, %test2.loop ], [ -16777216, %entry ]
+; CHECK: %lsr.iv.next = add nsw i32 %lsr.iv, 16777216
+;
+; CHECK=LABEL: for.end:
+; CHECK: %sub.cond.us = sub nsw i32 %inc1115.us, %sub.us
+; CHECK: %sext.us = mul i32 %lsr.iv.next, %sub.cond.us
+; CHECK: %f = ashr i32 %sext.us, 24
+; CHECK: ret i32 %f
+define i32 @test2() {
+entry:
+  br label %test2.loop
+
+test2.loop:
+  %inc1115.us = phi i32 [ 0, %entry ], [ %inc11.us, %test2.loop ]
+  %inc11.us = add nsw i32 %inc1115.us, 1
+  %cmp.us = icmp slt i32 %inc11.us, 2
+  br i1 %cmp.us, label %test2.loop, label %for.end
+
+for.end:
+  %tobool.us = icmp eq i32 %inc1115.us, 0
+  %sub.us = select i1 %tobool.us, i32 0, i32 0
+  %mul.us = shl i32 %inc1115.us, 24
+  %sub.cond.us = sub nsw i32 %inc1115.us, %sub.us
+  %sext.us = mul i32 %mul.us, %sub.cond.us
+  %f = ashr i32 %sext.us, 24
+  br label %exit
+
+exit:
+  ret i32 %f
+}
diff --git a/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll b/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll
index 0118241..65aa61f 100644
--- a/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll
+++ b/test/Transforms/LoopStrengthReduce/quadradic-exit-value.ll
@@ -1,18 +1,50 @@
-; RUN: opt < %s -analyze -iv-users | grep "{1,+,3,+,2}<%loop> (post-inc with loop %loop)"
+; RUN: opt < %s -analyze -iv-users | FileCheck %s
 
 ; The value of %r is dependent on a polynomial iteration expression.
-
+;
+; CHECK-LABEL: IV Users for loop %foo.loop
+; CHECK: {1,+,3,+,2}<%foo.loop>
 define i64 @foo(i64 %n) {
 entry:
-  br label %loop
+  br label %foo.loop
 
-loop:
-  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ]
+foo.loop:
+  %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %foo.loop ]
   %indvar.next = add i64 %indvar, 1
   %c = icmp eq i64 %indvar.next, %n
-  br i1 %c, label %exit, label %loop
+  br i1 %c, label %exit, label %foo.loop
 
 exit:
   %r = mul i64 %indvar.next, %indvar.next
   ret i64 %r
 }
+
+; PR15470: LSR miscompile. The test2 function should return '1'.
+;
+; SCEV does not know how to denormalize chained recurrences, so make
+; sure they aren't marked as post-inc users.
+;
+; CHECK-LABEL: IV Users for loop %test2.loop
+; CHECK: %sext.us = {0,+,(16777216 + (-16777216 * %sub.us)),+,33554432}<%test2.loop> in %f = ashr i32 %sext.us, 24
+define i32 @test2() {
+entry:
+  br label %test2.loop
+
+test2.loop:
+  %inc1115.us = phi i32 [ 0, %entry ], [ %inc11.us, %test2.loop ]
+  %inc11.us = add nsw i32 %inc1115.us, 1
+  %cmp.us = icmp slt i32 %inc11.us, 2
+  br i1 %cmp.us, label %test2.loop, label %for.end
+
+for.end:
+  %tobool.us = icmp eq i32 %inc1115.us, 0
+  %sub.us = select i1 %tobool.us, i32 0, i32 0
+  %mul.us = shl i32 %inc1115.us, 24
+  %sub.cond.us = sub nsw i32 %inc1115.us, %sub.us
+  %sext.us = mul i32 %mul.us, %sub.cond.us
+  %f = ashr i32 %sext.us, 24
+  br label %exit
+
+exit:
+  ret i32 %f
+}
diff --git a/test/Transforms/LoopStrengthReduce/uglygep-address-space.ll b/test/Transforms/LoopStrengthReduce/uglygep-address-space.ll
new file mode 100644
index 0000000..2c65261
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/uglygep-address-space.ll
@@ -0,0 +1,56 @@
+; RUN: opt < %s -loop-reduce -S | FileCheck %s
+
+; LSR shouldn't consider %t8 to be an interesting user of %t6, and it
+; should be able to form pretty GEPs.
+
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+; Copy of uglygep with a different address space
+; This tests expandAddToGEP uses the right smaller integer type for
+; another address space
+define void @Z4() nounwind {
+; CHECK: define void @Z4
+bb:
+  br label %bb3
+
+bb1:                                              ; preds = %bb3
+  br i1 undef, label %bb10, label %bb2
+
+bb2:                                              ; preds = %bb1
+  %t = add i16 %t4, 1                         ; <i16> [#uses=1]
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  %t4 = phi i16 [ %t, %bb2 ], [ 0, %bb ]      ; <i16> [#uses=3]
+  br label %bb1
+
+; CHECK: bb10:
+; CHECK-NEXT: %t7 = icmp eq i16 %t4, 0
+; Host %t2 computation outside the loop.
+; CHECK-NEXT: [[SCEVGEP:%[^ ]+]] = getelementptr i8 addrspace(1)* undef, i16 %t4
+; CHECK-NEXT: br label %bb14
+bb10:                                             ; preds = %bb9
+  %t7 = icmp eq i16 %t4, 0                    ; <i1> [#uses=1]
+  %t3 = add i16 %t4, 16                     ; <i16> [#uses=1]
+  br label %bb14
+
+; CHECK: bb14:
+; CHECK-NEXT: store i8 undef, i8 addrspace(1)* [[SCEVGEP]]
+; CHECK-NEXT: %t6 = load float addrspace(1)* addrspace(1)* undef
+; Fold %t3's add within the address.
+; CHECK-NEXT: [[SCEVGEP1:%[^ ]+]] = getelementptr float addrspace(1)* %t6, i16 4
+; CHECK-NEXT: [[SCEVGEP2:%[^ ]+]] = bitcast float addrspace(1)* [[SCEVGEP1]] to i8 addrspace(1)*
+; Use the induction variable (%t4) to access the right element
+; CHECK-NEXT: [[ADDRESS:%[^ ]+]] = getelementptr i8 addrspace(1)* [[SCEVGEP2]], i16 %t4
+; CHECK-NEXT: store i8 undef, i8 addrspace(1)* [[ADDRESS]]
+; CHECK-NEXT: br label %bb14
+bb14:                                             ; preds = %bb14, %bb10
+  %t2 = getelementptr inbounds i8 addrspace(1)* undef, i16 %t4 ; <i8*> [#uses=1]
+  store i8 undef, i8 addrspace(1)* %t2
+  %t6 = load float addrspace(1)* addrspace(1)* undef
+  %t8 = bitcast float addrspace(1)* %t6 to i8 addrspace(1)*              ; <i8*> [#uses=1]
+  %t9 = getelementptr inbounds i8 addrspace(1)* %t8, i16 %t3 ; <i8*> [#uses=1]
+  store i8 undef, i8 addrspace(1)* %t9
+  br label %bb14
+}
+
diff --git a/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll b/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll
new file mode 100644
index 0000000..17c91e5
--- /dev/null
+++ b/test/Transforms/LoopUnroll/PowerPC/a2-unrolling.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -S -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -loop-unroll | FileCheck %s
+define void @unroll_opt_for_size() nounwind optsize {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
+  %inc = add i32 %iv, 1
+  %exitcnd = icmp uge i32 %inc, 1024
+  br i1 %exitcnd, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @unroll_opt_for_size
+; CHECK:      add
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK: icmp
+
+define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
+entry:
+  %cmp1 = icmp eq i32 %n, 0
+  br i1 %cmp1, label %for.end, label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.02
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %sum.0.lcssa
+}
+
+; CHECK-LABEL: @test
+; CHECK: unr.cmp{{.*}}:
+; CHECK: for.body.unr{{.*}}:
+; CHECK: for.body:
+; CHECK: br i1 %exitcond.7, label %for.end.loopexit{{.*}}, label %for.body
+
diff --git a/test/Transforms/LoopUnroll/PowerPC/lit.local.cfg b/test/Transforms/LoopUnroll/PowerPC/lit.local.cfg
new file mode 100644
index 0000000..2e46300
--- /dev/null
+++ b/test/Transforms/LoopUnroll/PowerPC/lit.local.cfg
@@ -0,0 +1,4 @@
+targets = set(config.root.targets_to_build.split())
+if not 'PowerPC' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/LoopUnroll/lit.local.cfg b/test/Transforms/LoopUnroll/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LoopUnroll/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LoopUnswitch/lit.local.cfg b/test/Transforms/LoopUnswitch/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LoopUnswitch/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/test/Transforms/LoopVectorize/ARM/lit.local.cfg
index cb77b09..8a3ba96 100644
--- a/test/Transforms/LoopVectorize/ARM/lit.local.cfg
+++ b/test/Transforms/LoopVectorize/ARM/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'ARM' in targets:
     config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/ARM/width-detect.ll b/test/Transforms/LoopVectorize/ARM/width-detect.ll
index c0795b6..99d7fa7 100644
--- a/test/Transforms/LoopVectorize/ARM/width-detect.ll
+++ b/test/Transforms/LoopVectorize/ARM/width-detect.ll
@@ -3,27 +3,27 @@
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios3.0.0"
 
-;CHECK:foo_F64
-;CHECK: <2 x double>
+;CHECK:foo_F32
+;CHECK: <4 x float>
 ;CHECK:ret
-define double @foo_F64(double* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+define float @foo_F32(float* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
 
 .lr.ph:                                           ; preds = %0, %.lr.ph
   %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %prod.01 = phi double [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
-  %2 = getelementptr inbounds double* %A, i64 %indvars.iv
-  %3 = load double* %2, align 8
-  %4 = fmul fast double %prod.01, %3
+  %prod.01 = phi float [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
+  %2 = getelementptr inbounds float* %A, i64 %indvars.iv
+  %3 = load float* %2, align 8
+  %4 = fmul fast float %prod.01, %3
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %n
   br i1 %exitcond, label %._crit_edge, label %.lr.ph
 
 ._crit_edge:                                      ; preds = %.lr.ph, %0
-  %prod.0.lcssa = phi double [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
-  ret double %prod.0.lcssa
+  %prod.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
+  ret float %prod.0.lcssa
 }
 
 ;CHECK:foo_I8
diff --git a/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
new file mode 100644
index 0000000..885418c
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/already-vectorized.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; We want to make sure that we don't even try to vectorize loops again
+; The vectorizer used to mark the un-vectorized loop only as already vectorized
+; thus, trying to vectorize the vectorized loop again
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global [255 x i32]
+
+; Function Attrs: nounwind readonly uwtable
+define i32 @vect() {
+; CHECK: LV: Checking a loop in "vect"
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; We need to make sure we did vectorize the loop
+; CHECK: LV: Found a loop: for.body
+; CHECK: LV: We can vectorize this loop!
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [255 x i32]* @a, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %red.05
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 255
+  br i1 %exitcond, label %for.end, label %for.body
+
+; If it did, we have two loops:
+; CHECK: vector.body:
+; CHECK: br {{.*}} label %vector.body, !llvm.loop [[vect:![0-9]+]]
+; CHECK: for.body:
+; CHECK: br {{.*}} label %for.body, !llvm.loop [[scalar:![0-9]+]]
+
+for.end:                                          ; preds = %for.body
+  ret i32 %add
+}
+
+; Now, we check for the Hint metadata
+; CHECK: [[vect]] = metadata !{metadata [[vect]], metadata [[width:![0-9]+]], metadata [[unroll:![0-9]+]]}
+; CHECK: [[width]] = metadata !{metadata !"llvm.vectorizer.width", i32 1}
+; CHECK: [[unroll]] = metadata !{metadata !"llvm.vectorizer.unroll", i32 1}
+; CHECK: [[scalar]] = metadata !{metadata [[scalar]], metadata [[width]], metadata [[unroll]]}
+
diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll
index b7f479a..98718e1 100644
--- a/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 @a = common global [2048 x i32] zeroinitializer, align 16
 
 ; The program below gathers and scatters data. We better not vectorize it.
-;CHECK: cost_model_1
+;CHECK-LABEL: @cost_model_1(
 ;CHECK-NOT: <2 x i32>
 ;CHECK-NOT: <4 x i32>
 ;CHECK-NOT: <8 x i32>
diff --git a/test/Transforms/LoopVectorize/X86/lit.local.cfg b/test/Transforms/LoopVectorize/X86/lit.local.cfg
index a8ad0f1..ba763cf 100644
--- a/test/Transforms/LoopVectorize/X86/lit.local.cfg
+++ b/test/Transforms/LoopVectorize/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/X86/rauw-bug.ll b/test/Transforms/LoopVectorize/X86/rauw-bug.ll
new file mode 100644
index 0000000..4284fba
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/rauw-bug.ll
@@ -0,0 +1,33 @@
+; RUN: opt -slp-vectorizer -S %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; This test used to fail under libgmalloc. Because we would try to access a
+; pointer that was already deleted.
+;
+; llvm-lit -v --param use_gmalloc=1 --param
+;   gmalloc_path=/usr/lib/libgmalloc.dylib
+;   test/Transforms/LoopVectorize/X86/rauw-bug.ll
+;
+; radar://15498655
+
+; CHECK: reduced
+define void @reduced()  {
+entry:
+  br i1 undef, label %while.body, label %while.cond63.preheader.while.end76_crit_edge
+
+while.cond63.preheader.while.end76_crit_edge:
+  ret void
+
+while.body:
+  %d2_fx.015 = phi double [ %sub52, %while.body ], [ undef, %entry ]
+  %d2_fy.014 = phi double [ %sub58, %while.body ], [ undef, %entry ]
+  %d3_fy.013 = phi double [ %div56, %while.body ], [ undef, %entry ]
+  %d3_fx.012 = phi double [ %div50, %while.body ], [ undef, %entry ]
+  %div50 = fmul double %d3_fx.012, 1.250000e-01
+  %sub52 = fsub double 0.000000e+00, %div50
+  %div56 = fmul double %d3_fy.013, 1.250000e-01
+  %sub58 = fsub double 0.000000e+00, %div56
+  br label %while.body
+}
diff --git a/test/Transforms/LoopVectorize/X86/tripcount.ll b/test/Transforms/LoopVectorize/X86/tripcount.ll
new file mode 100644
index 0000000..6b38bac
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/tripcount.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -mcpu=prescott < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-unknown-freebsd11.0"
+
+@big = external global [0 x i32]
+
+; PR18049
+; We need to truncate the exit count to i32. This is legal because the
+; arithmetic is signed (%inc is nsw).
+
+; CHECK-LABEL: tripcount
+; CHECK: trunc i64 %count to i32
+
+define void @tripcount(i64 %count) {
+entry:
+  %cmp6 = icmp sgt i64 %count, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [0 x i32]* @big, i32 0, i32 %i.07
+  %0 = load i32* %arrayidx, align 4
+  %neg = xor i32 %0, -1
+  store i32 %neg, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.07, 1
+  %conv = sext i32 %inc to i64
+  %cmp = icmp slt i64 %conv, %count
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/test/Transforms/LoopVectorize/X86/unroll-pm.ll
new file mode 100644
index 0000000..5064fec
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/unroll-pm.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -disable-loop-unrolling -S | FileCheck %s -check-prefix=CHECK-NOUNRL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+;CHECK-LABEL: @bar(
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+;CHECK-NOUNRL-LABEL: @bar(
+;CHECK-NOUNRL: store <4 x i32>
+;CHECK-NOUNRL-NOT: store <4 x i32>
+;CHECK-NOUNRL: ret
+define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = add nsw i32 %3, 6
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/XCore/lit.local.cfg b/test/Transforms/LoopVectorize/XCore/lit.local.cfg
new file mode 100644
index 0000000..4d17d46
--- /dev/null
+++ b/test/Transforms/LoopVectorize/XCore/lit.local.cfg
@@ -0,0 +1,3 @@
+targets = set(config.root.targets_to_build.split())
+if not 'XCore' in targets:
+    config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll b/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll
new file mode 100644
index 0000000..a099daa
--- /dev/null
+++ b/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=2 -S -mtriple=xcore | FileCheck %s
+
+target datalayout = "e-p:32:32:32-a0:0:32-n32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f16:16:32-f32:32:32-f64:32:32"
+target triple = "xcore"
+; The xcore target has no vector registers, so loop should not be vectorized.
+;CHECK-LABEL: @f(
+;CHECK: entry:
+;CHECK-NOT: vector.body
+;CHECK-NEXT: br label %do.body
+define void @f(i8* nocapture %ptr, i32 %len) {
+entry:
+  br label %do.body
+do.body:
+  %ptr.addr.0 = phi i8* [ %ptr, %entry ], [ %incdec.ptr, %do.body ]
+  %len.addr.0 = phi i32 [ %len, %entry ], [ %dec, %do.body ]
+  %incdec.ptr = getelementptr inbounds i8* %ptr.addr.0, i32 1
+  store i8 0, i8* %ptr.addr.0, align 1
+  %dec = add nsw i32 %len.addr.0, -1
+  %tobool = icmp eq i32 %len.addr.0, 0
+  br i1 %tobool, label %do.end, label %do.body
+do.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/align.ll b/test/Transforms/LoopVectorize/align.ll
new file mode 100644
index 0000000..84b0361
--- /dev/null
+++ b/test/Transforms/LoopVectorize/align.ll
@@ -0,0 +1,33 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Make sure we output the abi alignment if no alignment is specified.
+
+;CHECK-LABEL: @align
+;CHECK: load <4 x i32>* {{.*}} align  4
+;CHECK: load <4 x i32>* {{.*}} align  4
+;CHECK: store <4 x i32> {{.*}} align  4
+
+define void @align(i32* %a, i32* %b, i32* %c) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i32* %b, i64 %indvars.iv
+  %3 = load i32* %2
+  %4 = getelementptr inbounds i32* %c, i64 %indvars.iv
+  %5 = load i32* %4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds i32* %a, i64 %indvars.iv
+  store i32 %6, i32* %7
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 128 
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/bsd_regex.ll b/test/Transforms/LoopVectorize/bsd_regex.ll
index a14b92d..7b71272 100644
--- a/test/Transforms/LoopVectorize/bsd_regex.ll
+++ b/test/Transforms/LoopVectorize/bsd_regex.ll
@@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ;PR 15830.
 
-;CHECK: foo
+;CHECK-LABEL: @foo(
 ; When scalarizing stores we need to preserve the original order.
 ; Make sure that we are extracting in the correct order (0101, and not 0011).
 ;CHECK: extractelement <2 x i64> {{.*}}, i32 0
diff --git a/test/Transforms/LoopVectorize/dbg.value.ll b/test/Transforms/LoopVectorize/dbg.value.ll
index b69e72f..2497b25 100644
--- a/test/Transforms/LoopVectorize/dbg.value.ll
+++ b/test/Transforms/LoopVectorize/dbg.value.ll
@@ -42,13 +42,14 @@ attributes #0 = { nounwind ssp uwtable "fp-contract-model"="standard" "no-frame-
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!26}
 
 !0 = metadata !{i32 786449, metadata !25, i32 4, metadata !"clang", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !11, null, metadata !""}
 !1 = metadata !{i32 0}
 !2 = metadata !{metadata !3}
 !3 = metadata !{i32 786478, metadata !25, metadata !4, metadata !"test", metadata !"test", metadata !"test", i32 5, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @test, null, null, metadata !8, i32 5}
 !4 = metadata !{i32 786473, metadata !25}
-!5 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, i32 0}
+!5 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !6 = metadata !{metadata !7}
 !7 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
 !8 = metadata !{metadata !9}
@@ -56,7 +57,7 @@ attributes #1 = { nounwind readnone }
 !10 = metadata !{i32 786443, metadata !25, metadata !3, i32 6, i32 0, i32 0}
 !11 = metadata !{metadata !12, metadata !16, metadata !17}
 !12 = metadata !{i32 786484, i32 0, null, metadata !"A", metadata !"A", metadata !"", metadata !4, i32 1, metadata !13, i32 0, i32 1, [1024 x i32]* @A, null}
-!13 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 32768, i64 32, i32 0, i32 0, metadata !7, metadata !14, i32 0, i32 0}
+!13 = metadata !{i32 786433, null, null, null, i32 0, i64 32768, i64 32, i32 0, i32 0, metadata !7, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32768, align 32, offset 0] [from int]
 !14 = metadata !{metadata !15}
 !15 = metadata !{i32 786465, i64 0, i64 1024}
 !16 = metadata !{i32 786484, i32 0, null, metadata !"B", metadata !"B", metadata !"", metadata !4, i32 2, metadata !13, i32 0, i32 1, [1024 x i32]* @B, null}
@@ -66,3 +67,4 @@ attributes #1 = { nounwind readnone }
 !20 = metadata !{i32 786443, metadata !25, metadata !10, i32 6, i32 0, i32 1}
 !24 = metadata !{i32 9, i32 0, metadata !3, null}
 !25 = metadata !{metadata !"test", metadata !"/path/to/somewhere"}
+!26 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/LoopVectorize/debugloc.ll b/test/Transforms/LoopVectorize/debugloc.ll
index 0a6fc4e..bf0b418 100644
--- a/test/Transforms/LoopVectorize/debugloc.ll
+++ b/test/Transforms/LoopVectorize/debugloc.ll
@@ -5,17 +5,17 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; Make sure we are preserving debug info in the vectorized code.
 
 ; CHECK: for.body.lr.ph
-; CHECK:   cmp.zero = icmp eq i64 {{.*}}, 0, !dbg !21
+; CHECK:   cmp.zero = icmp eq i64 {{.*}}, 0, !dbg ![[LOC:[0-9]+]]
 ; CHECK: vector.body
-; CHECK:   index {{.*}}, !dbg !21
-; CHECK:   getelementptr inbounds i32* %a, {{.*}}, !dbg !22
-; CHECK:   load <2 x i32>* {{.*}}, !dbg !22
-; CHECK:   add <2 x i32> {{.*}}, !dbg !22
-; CHECK:   add i64 %index, 2, !dbg !21
-; CHECK:   icmp eq i64 %index.next, %end.idx.rnd.down, !dbg !21
+; CHECK:   index {{.*}}, !dbg ![[LOC]]
+; CHECK:   getelementptr inbounds i32* %a, {{.*}}, !dbg ![[LOC2:[0-9]+]]
+; CHECK:   load <2 x i32>* {{.*}}, !dbg ![[LOC2]]
+; CHECK:   add <2 x i32> {{.*}}, !dbg ![[LOC2]]
+; CHECK:   add i64 %index, 2, !dbg ![[LOC]]
+; CHECK:   icmp eq i64 %index.next, %end.idx.rnd.down, !dbg ![[LOC]]
 ; CHECK: middle.block
-; CHECK:   add <2 x i32> %rdx.vec.exit.phi, %rdx.shuf, !dbg !22
-; CHECK:   extractelement <2 x i32> %bin.rdx, i32 0, !dbg !22
+; CHECK:   add <2 x i32> %rdx.vec.exit.phi, %rdx.shuf, !dbg ![[LOC2]]
+; CHECK:   extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[LOC2]]
 
 define i32 @f(i32* nocapture %a, i32 %size) #0 {
 entry:
@@ -33,7 +33,7 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
   %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
   %sum.05 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
   %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv, !dbg !22
-  %0 = load i32* %arrayidx, align 4, !dbg !22, !tbaa !23
+  %0 = load i32* %arrayidx, align 4, !dbg !22
   %add = add i32 %0, %sum.05, !dbg !22
   tail call void @llvm.dbg.value(metadata !{i32 %add.lcssa}, i64 0, metadata !15), !dbg !22
   %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !21
@@ -61,7 +61,7 @@ attributes #0 = { nounwind readonly ssp uwtable "less-precise-fpmad"="false" "no
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!18}
+!llvm.module.flags = !{!18, !27}
 
 !0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 185038) (llvm/trunk 185097)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Data/backedup/dev/os/llvm/debug/-] [DW_LANG_C99]
 !1 = metadata !{metadata !"-", metadata !"/Volumes/Data/backedup/dev/os/llvm/debug"}
@@ -70,7 +70,7 @@ attributes #1 = { nounwind readnone }
 !4 = metadata !{i32 786478, metadata !5, metadata !6, metadata !"f", metadata !"f", metadata !"", i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32*, i32)* @f, null, null, metadata !12, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [f]
 !5 = metadata !{metadata !"<stdin>", metadata !"/Volumes/Data/backedup/dev/os/llvm/debug"}
 !6 = metadata !{i32 786473, metadata !5}          ; [ DW_TAG_file_type ] [/Volumes/Data/backedup/dev/os/llvm/debug/<stdin>]
-!7 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9, metadata !10, metadata !11}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from int]
@@ -86,7 +86,5 @@ attributes #1 = { nounwind readnone }
 !20 = metadata !{i32 4, i32 0, metadata !4, null}
 !21 = metadata !{i32 5, i32 0, metadata !17, null}
 !22 = metadata !{i32 6, i32 0, metadata !17, null}
-!23 = metadata !{metadata !"int", metadata !24}
-!24 = metadata !{metadata !"omnipotent char", metadata !25}
-!25 = metadata !{metadata !"Simple C/C++ TBAA"}
 !26 = metadata !{i32 7, i32 0, metadata !4, null}
+!27 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/LoopVectorize/ee-crash.ll b/test/Transforms/LoopVectorize/ee-crash.ll
new file mode 100644
index 0000000..8a4f8ce
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ee-crash.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; This test checks that we deal with an in-loop extractelement (for now, this
+; means not crashing by not vectorizing).
+; CHECK-LABEL: @_Z4foo1Pii(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+define i32 @_Z4foo1Pii(i32* %A, i32 %n, <2 x i32> %q) #0 {
+entry:
+  %idx.ext = sext i32 %n to i64
+  %add.ptr = getelementptr inbounds i32* %A, i64 %idx.ext
+  %cmp3.i = icmp eq i32 %n, 0
+  br i1 %cmp3.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+for.body.i:                                       ; preds = %entry, %for.body.i
+  %__init.addr.05.i = phi i32 [ %add.i, %for.body.i ], [ 0, %entry ]
+  %__first.addr.04.i = phi i32* [ %incdec.ptr.i, %for.body.i ], [ %A, %entry ]
+  %0 = load i32* %__first.addr.04.i, align 4
+  %q1 = extractelement <2 x i32> %q, i32 %n
+  %q2 = add nsw i32 %0, %q1
+  %add.i = add nsw i32 %q2, %__init.addr.05.i
+  %incdec.ptr.i = getelementptr inbounds i32* %__first.addr.04.i, i64 1
+  %cmp.i = icmp eq i32* %incdec.ptr.i, %add.ptr
+  br i1 %cmp.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %for.body.i, %entry
+  %__init.addr.0.lcssa.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
+  ret i32 %__init.addr.0.lcssa.i
+}
+
+attributes #0 = { nounwind readonly ssp uwtable }
+
diff --git a/test/Transforms/LoopVectorize/funcall.ll b/test/Transforms/LoopVectorize/funcall.ll
index 0fb929f..f1f068c 100644
--- a/test/Transforms/LoopVectorize/funcall.ll
+++ b/test/Transforms/LoopVectorize/funcall.ll
@@ -7,7 +7,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; contain a limited set of function calls and none of them sets the rounding
 ; mode, so vectorizing them is safe.
 
-; CHECK: test
+; CHECK-LABEL: @test(
 ; CHECK: <2 x double>
 
 define void @test(double* %d, double %t) {
diff --git a/test/Transforms/LoopVectorize/global_alias.ll b/test/Transforms/LoopVectorize/global_alias.ll
index ae72d3c..0118fb4 100644
--- a/test/Transforms/LoopVectorize/global_alias.ll
+++ b/test/Transforms/LoopVectorize/global_alias.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -O3 -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -O1 -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
 
@@ -336,9 +336,8 @@ for.end:                                          ; preds = %for.cond
 ;   return Foo.A[a];
 ; }
 ; CHECK-LABEL: define i32 @noAlias07(
-; CHECK: sub nsw <4 x i32>
+; CHECK: store <4 x i32>
 ; CHECK: ret
-
 define i32 @noAlias07(i32 %a) #0 {
 entry:
   %a.addr = alloca i32, align 4
@@ -552,7 +551,7 @@ for.end:                                          ; preds = %for.cond
 ;   return Bar.A[N][a];
 ; }
 ; CHECK-LABEL: define i32 @noAlias11(
-; CHECK: sub nsw <4 x i32>
+; CHECK: store <4 x i32>
 ; CHECK: ret
 
 define i32 @noAlias11(i32 %a) #0 {
@@ -612,7 +611,7 @@ for.end:                                          ; preds = %for.cond
 ;   return Bar.A[N][a];
 ; }
 ; CHECK-LABEL: define i32 @noAlias12(
-; CHECK: sub nsw <4 x i32>
+; CHECK: store <4 x i32>
 ; CHECK: ret
 
 define i32 @noAlias12(i32 %a) #0 {
diff --git a/test/Transforms/LoopVectorize/hoist-loads.ll b/test/Transforms/LoopVectorize/hoist-loads.ll
index fad1735..765e14d 100644
--- a/test/Transforms/LoopVectorize/hoist-loads.ll
+++ b/test/Transforms/LoopVectorize/hoist-loads.ll
@@ -6,7 +6,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 @B = common global [1024 x float] zeroinitializer, align 16
 
 ; Make sure we can vectorize in the presence of hoistable conditional loads.
-; CHECK: hoist_cond_load
+; CHECK-LABEL: @hoist_cond_load(
 ; CHECK: load <2 x float>
 
 define void @hoist_cond_load() {
@@ -38,7 +38,7 @@ for.end:
 
 ; However, we can't hoist loads whose address we have not seen unconditionally
 ; accessed.
-; CHECK:     dont_hoist_cond_load
+; CHECK-LABEL: @dont_hoist_cond_load(
 ; CHECK-NOT: load <2 x float>
 
 define void @dont_hoist_cond_load() {
diff --git a/test/Transforms/LoopVectorize/if-conversion.ll b/test/Transforms/LoopVectorize/if-conversion.ll
index 88e56b2..dbe0243 100644
--- a/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/test/Transforms/LoopVectorize/if-conversion.ll
@@ -106,3 +106,66 @@ for.end:                                          ; preds = %for.inc, %entry
   ret i32 %sum.0.lcssa
 }
 
+@a = common global [1 x i32*] zeroinitializer, align 8
+@c = common global i32* null, align 8
+
+; We use to if convert this loop. This is not safe because there is a trapping
+; constant expression.
+; PR16729
+
+; CHECK-LABEL: trapping_constant_expression
+; CHECK-NOT: or <4 x i32>
+
+define i32 @trapping_constant_expression() {
+entry:
+  br label %for.body
+
+for.body:
+  %inc3 = phi i32 [ 0, %entry ], [ %inc, %cond.end ]
+  %or2 = phi i32 [ 0, %entry ], [ %or, %cond.end ]
+  br i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 0), i32** @c), label %cond.false, label %cond.end
+
+cond.false:
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 0), i32** @c) to i32)), %cond.false ], [ 0, %for.body ]
+  %or = or i32 %or2, %cond
+  %inc = add nsw i32 %inc3, 1
+  %cmp = icmp slt i32 %inc, 128
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret i32 %or
+}
+
+; Neither should we if-convert if there is an instruction operand that is a
+; trapping constant expression.
+; PR16729
+
+; CHECK-LABEL: trapping_constant_expression2
+; CHECK-NOT: or <4 x i32>
+
+define i32 @trapping_constant_expression2() {
+entry:
+  br label %for.body
+
+for.body:
+  %inc3 = phi i32 [ 0, %entry ], [ %inc, %cond.end ]
+  %or2 = phi i32 [ 0, %entry ], [ %or, %cond.end ]
+  br i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 0), i32** @c), label %cond.false, label %cond.end
+
+cond.false:
+  %cond.1 = or i32 %inc3, sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*]* @a, i64 0, i64 0), i32** @c) to i32))
+  br label %cond.end
+
+cond.end:
+  %cond = phi i32 [ %cond.1, %cond.false ], [ %inc3, %for.body ]
+  %or = or i32 %or2, %cond
+  %inc = add nsw i32 %inc3, 1
+  %cmp = icmp slt i32 %inc, 128
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret i32 %or
+}
diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll
index 48bb438..50c3b6b 100644
--- a/test/Transforms/LoopVectorize/induction.ll
+++ b/test/Transforms/LoopVectorize/induction.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Make sure that we can handle multiple integer induction variables.
-; CHECK: multi_int_induction
+; CHECK-LABEL: @multi_int_induction(
 ; CHECK: vector.body:
 ; CHECK:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK:  %normalized.idx = sub i64 %index, 0
@@ -28,3 +28,83 @@ for.end:
   ret void
 }
 
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
+
+; Make sure we remove unneeded vectorization of induction variables.
+; In order for instcombine to cleanup the vectorized induction variables that we
+; create in the loop vectorizer we need to perform some form of redundancy
+; elimination to get rid of multiple uses.
+
+; IND-LABEL: scalar_use
+
+; IND:     br label %vector.body
+; IND:     vector.body:
+;   Vectorized induction variable.
+; IND-NOT:  insertelement <2 x i64>
+; IND-NOT:  shufflevector <2 x i64>
+; IND:     br {{.*}}, label %vector.body
+
+define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %ind.sum = add i64 %iv, %offset
+  %arr.idx = getelementptr inbounds float* %a, i64 %ind.sum
+  %l1 = load float* %arr.idx, align 4
+  %ind.sum2 = add i64 %iv, %offset2
+  %arr.idx2 = getelementptr inbounds float* %a, i64 %ind.sum2
+  %l2 = load float* %arr.idx2, align 4
+  %m = fmul fast float %b, %l2
+  %ad = fadd fast float %l1, %m
+  store float %ad, float* %arr.idx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %loopexit, label %for.body
+
+loopexit:
+  ret void
+}
+
+
+; Make sure that the loop exit count computation does not overflow for i8 and
+; i16. The exit count of these loops is i8/i16 max + 1. If we don't cast the
+; induction variable to a bigger type the exit count computation will overflow
+; to 0.
+; PR17532
+
+; CHECK-LABEL: i8_loop
+; CHECK; icmp eq i32 {{.*}}, 256
+define i32 @i8_loop() nounwind readnone ssp uwtable {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ]
+  %b.0 = phi i8 [ 0, %0 ], [ %3, %1 ]
+  %2 = and i32 %a.0, 4
+  %3 = add i8 %b.0, -1
+  %4 = icmp eq i8 %3, 0
+  br i1 %4, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 %2
+}
+
+; CHECK-LABEL: i16_loop
+; CHECK; icmp eq i32 {{.*}}, 65536
+
+define i32 @i16_loop() nounwind readnone ssp uwtable {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ]
+  %b.0 = phi i16 [ 0, %0 ], [ %3, %1 ]
+  %2 = and i32 %a.0, 4
+  %3 = add i16 %b.0, -1
+  %4 = icmp eq i16 %3, 0
+  br i1 %4, label %5, label %1
+
+; <label>:5                                       ; preds = %1
+  ret i32 %2
+}
diff --git a/test/Transforms/LoopVectorize/induction_plus.ll b/test/Transforms/LoopVectorize/induction_plus.ll
index 6141c39..9c8201a 100644
--- a/test/Transforms/LoopVectorize/induction_plus.ll
+++ b/test/Transforms/LoopVectorize/induction_plus.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 @array = common global [1024 x i32] zeroinitializer, align 16
 
 ;CHECK-LABEL: @array_at_plus_one(
-;CHECK: trunc i64
 ;CHECK: add i64 %index, 12
+;CHECK: trunc i64
 ;CHECK: ret i32
 define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
diff --git a/test/Transforms/LoopVectorize/infiniteloop.ll b/test/Transforms/LoopVectorize/infiniteloop.ll
index f6ab564..5c5e1a3 100644
--- a/test/Transforms/LoopVectorize/infiniteloop.ll
+++ b/test/Transforms/LoopVectorize/infiniteloop.ll
@@ -14,7 +14,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 ;   // return SCEVCouldNotCompute.
 ; For an infinite loop SE can return any number.
 
-; CHECK: fn1
+; CHECK-LABEL: @fn1(
 define void @fn1()  {
 entry:
   store i64 0, i64* @a, align 8
diff --git a/test/Transforms/LoopVectorize/intrinsic.ll b/test/Transforms/LoopVectorize/intrinsic.ll
index 95b53b7..c3d570c 100644
--- a/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/test/Transforms/LoopVectorize/intrinsic.ll
@@ -468,6 +468,59 @@ for.end:                                          ; preds = %for.body, %entry
 
 declare double @llvm.fabs(double) nounwind readnone
 
+;CHECK-LABEL: @copysign_f32(
+;CHECK: llvm.copysign.v4f32
+;CHECK: ret void
+define void @copysign_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float* %y, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float* %z, i64 %indvars.iv
+  %1 = load float* %arrayidx1, align 4
+  %call = tail call float @llvm.copysign.f32(float %0, float %1) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.copysign.f32(float, float) nounwind readnone
+
+define void @copysign_f64(i32 %n, double* noalias %y, double* noalias %x, double* noalias %z) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds double* %z, i64 %indvars.iv
+  %1 = load double* %arrayidx, align 8
+  %call = tail call double @llvm.copysign(double %0, double %1) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.copysign(double, double) nounwind readnone
+
 ;CHECK-LABEL: @floor_f32(
 ;CHECK: llvm.floor.v4f32
 ;CHECK: ret void
@@ -728,6 +781,58 @@ for.end:                                          ; preds = %for.body, %entry
 
 declare double @llvm.nearbyint.f64(double) nounwind readnone
 
+;CHECK-LABEL: @round_f32(
+;CHECK: llvm.round.v4f32
+;CHECK: ret void
+define void @round_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float* %y, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %call = tail call float @llvm.round.f32(float %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.round.f32(float) nounwind readnone
+
+;CHECK-LABEL: @round_f64(
+;CHECK: llvm.round.v4f64
+;CHECK: ret void
+define void @round_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds double* %y, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 8
+  %call = tail call double @llvm.round.f64(double %0) nounwind readnone
+  %arrayidx2 = getelementptr inbounds double* %x, i64 %indvars.iv
+  store double %call, double* %arrayidx2, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare double @llvm.round.f64(double) nounwind readnone
+
 ;CHECK-LABEL: @fma_f32(
 ;CHECK: llvm.fma.v4f32
 ;CHECK: ret void
@@ -927,3 +1032,61 @@ for.end:                                          ; preds = %for.body
 declare float @fabsf(float) nounwind readnone
 
 declare double @llvm.pow.f64(double, double) nounwind readnone
+
+
+
+; Make sure we don't replace calls to functions with standard library function
+; signatures but defined with internal linkage.
+
+define internal float @roundf(float %x) nounwind readnone {
+  ret float 0.00000000
+}
+; CHECK-LABEL: internal_round
+; CHECK-NOT:  load <4 x float>
+
+define void @internal_round(float* nocapture %x) nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float* %x, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %call = tail call float @roundf(float %0) nounwind readnone
+  store float %call, float* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Make sure we don't replace calls to functions with standard library names but
+; different signatures.
+
+declare void @round(double %f)
+
+; CHECK-LABEL: wrong_signature
+; CHECK-NOT:  load <4 x double>
+
+define void @wrong_signature(double* nocapture %x) nounwind {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double* %x, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 4
+  store double %0, double* %arrayidx, align 4
+  tail call void @round(double %0) nounwind readnone
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/lifetime.ll b/test/Transforms/LoopVectorize/lifetime.ll
index 87006ed..4f6f3b8 100644
--- a/test/Transforms/LoopVectorize/lifetime.ll
+++ b/test/Transforms/LoopVectorize/lifetime.ll
@@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ; Make sure we can vectorize loops which contain lifetime markers.
 
-; CHECK-LABEL: test
+; CHECK-LABEL: @test(
 ; CHECK: call void @llvm.lifetime.end
 ; CHECK: store <2 x i32>
 ; CHECK: call void @llvm.lifetime.start
@@ -33,7 +33,7 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: testbitcast
+; CHECK-LABEL: @testbitcast(
 ; CHECK: call void @llvm.lifetime.end
 ; CHECK: store <2 x i32>
 ; CHECK: call void @llvm.lifetime.start
@@ -63,7 +63,7 @@ for.end:
   ret void
 }
 
-; CHECK-LABEL: testloopvariant
+; CHECK-LABEL: @testloopvariant(
 ; CHECK: call void @llvm.lifetime.end
 ; CHECK: store <2 x i32>
 ; CHECK: call void @llvm.lifetime.start
diff --git a/test/Transforms/LoopVectorize/lit.local.cfg b/test/Transforms/LoopVectorize/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LoopVectorize/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LoopVectorize/memdep.ll b/test/Transforms/LoopVectorize/memdep.ll
index b6d9e2e..21cb703 100644
--- a/test/Transforms/LoopVectorize/memdep.ll
+++ b/test/Transforms/LoopVectorize/memdep.ll
@@ -9,7 +9,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;  for (i = 0; i < 1024; ++i)
 ;    A[i] = A[i + 1] + 1;
 
-; CHECK: f1_vec
+; CHECK-LABEL: @f1_vec(
 ; CHECK: <2 x i32>
 
 define void @f1_vec(i32* %A) {
@@ -35,7 +35,7 @@ for.end:
 ;  for (i = 0; i < 1024; ++i)
 ;    A[i+1] = A[i] + 1;
 
-; CHECK: f2_novec
+; CHECK-LABEL: @f2_novec(
 ; CHECK-NOT: <2 x i32>
 
 define void @f2_novec(i32* %A) {
@@ -61,7 +61,7 @@ for.end:
 ;  for (i = 0; i < 1024; ++i)
 ;    A[i+2] = A[i] + 1;
 
-; CHECK: f3_vec_len
+; CHECK-LABEL: @f3_vec_len(
 ; CHECK: <2 x i32>
 
 ; WIDTH: f3_vec_len
@@ -96,7 +96,7 @@ for.end:
 ;     A[i] = B[i + 1];
 ;   }
 
-; CHECK: f5
+; CHECK-LABEL: @f5(
 ; CHECK-NOT: <2 x i32>
 
 define void @f5(i32*  %A, i32* %B) {
@@ -127,7 +127,7 @@ for.end:
 ;     tmp = a[i];
 ;   }
 
-; CHECK: f6
+; CHECK-LABEL: @f6
 ; CHECK-NOT: <2 x i32>
 
 define i32 @f6(i32* %a, i32 %tmp) {
diff --git a/test/Transforms/LoopVectorize/minmax_reduction.ll b/test/Transforms/LoopVectorize/minmax_reduction.ll
index bade561..0e47260 100644
--- a/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -17,7 +17,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp sgt <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @max_red(i32 %max) {
 entry:
@@ -46,7 +46,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp sgt <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @max_red_inverse_select(i32 %max) {
 entry:
@@ -74,7 +74,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp slt <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @min_red(i32 %max) {
 entry:
@@ -103,7 +103,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp slt <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @min_red_inverse_select(i32 %max) {
 entry:
@@ -133,7 +133,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp ugt <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @umax_red(i32 %max) {
 entry:
@@ -162,7 +162,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp ugt <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @umax_red_inverse_select(i32 %max) {
 entry:
@@ -190,7 +190,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp ult <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @umin_red(i32 %max) {
 entry:
@@ -219,7 +219,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp ult <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @umin_red_inverse_select(i32 %max) {
 entry:
@@ -248,7 +248,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp slt <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @sge_min_red(i32 %max) {
 entry:
@@ -277,7 +277,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp sgt <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @sle_min_red(i32 %max) {
 entry:
@@ -306,7 +306,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp ult <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @uge_min_red(i32 %max) {
 entry:
@@ -335,7 +335,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: icmp ugt <2 x i32>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define i32 @ule_min_red(i32 %max) {
 entry:
@@ -416,7 +416,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @max_red_float(float %max) #0 {
 entry:
@@ -442,7 +442,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @max_red_float_ge(float %max) #0 {
 entry:
@@ -468,7 +468,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @inverted_max_red_float(float %max) #0 {
 entry:
@@ -494,7 +494,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @inverted_max_red_float_le(float %max) #0 {
 entry:
@@ -515,12 +515,12 @@ for.end:
   ret float %max.red.0
 }
 
-; CHECK: @unordered_max_red
+; CHECK-LABEL: @unordered_max_red_float(
 ; CHECK: fcmp ugt <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @unordered_max_red_float(float %max) #0 {
 entry:
@@ -546,7 +546,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @unordered_max_red_float_ge(float %max) #0 {
 entry:
@@ -567,12 +567,12 @@ for.end:
   ret float %max.red.0
 }
 
-; CHECK: @inverted_unordered_max_red
+; CHECK-LABEL: @inverted_unordered_max_red_float(
 ; CHECK: fcmp ult <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @inverted_unordered_max_red_float(float %max) #0 {
 entry:
@@ -598,7 +598,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp ogt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @inverted_unordered_max_red_float_le(float %max) #0 {
 entry:
@@ -627,7 +627,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @min_red_float(float %min) #0 {
 entry:
@@ -653,7 +653,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @min_red_float_le(float %min) #0 {
 entry:
@@ -679,7 +679,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @inverted_min_red_float(float %min) #0 {
 entry:
@@ -705,7 +705,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @inverted_min_red_float_ge(float %min) #0 {
 entry:
@@ -726,12 +726,12 @@ for.end:
   ret float %min.red.0
 }
 
-; CHECK: @unordered_min_red
+; CHECK-LABEL: @unordered_min_red_float(
 ; CHECK: fcmp ult <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @unordered_min_red_float(float %min) #0 {
 entry:
@@ -757,7 +757,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @unordered_min_red_float_le(float %min) #0 {
 entry:
@@ -778,12 +778,12 @@ for.end:
   ret float %min.red.0
 }
 
-; CHECK: @inverted_unordered_min_red
+; CHECK-LABEL: @inverted_unordered_min_red_float(
 ; CHECK: fcmp ugt <2 x float>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @inverted_unordered_min_red_float(float %min) #0 {
 entry:
@@ -809,7 +809,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x float>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define float @inverted_unordered_min_red_float_ge(float %min) #0 {
 entry:
@@ -836,7 +836,7 @@ for.end:
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
 ; CHECK: fcmp olt <2 x double>
-; CHECK: select <2 x i1>
+; CHECK: select i1
 
 define double @min_red_double(double %min) #0 {
 entry:
@@ -882,4 +882,4 @@ for.end:
 }
 
 
-attributes #0 = { "no-nans-fp-math"="true" } 
+attributes #0 = { "no-nans-fp-math"="true" }
diff --git a/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/test/Transforms/LoopVectorize/multiple-address-spaces.ll
index 6906195..7d836de 100644
--- a/test/Transforms/LoopVectorize/multiple-address-spaces.ll
+++ b/test/Transforms/LoopVectorize/multiple-address-spaces.ll
@@ -28,10 +28,10 @@ entry:
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %arrayidx = getelementptr inbounds [40000 x i8] addrspace(1)* @Y, i64 0, i64 %indvars.iv
-  %0 = load i8 addrspace(1)* %arrayidx, align 1, !tbaa !0
+  %0 = load i8 addrspace(1)* %arrayidx, align 1
   %add = add i8 %0, 1
   %arrayidx3 = getelementptr inbounds [40000 x i8]* @X, i64 0, i64 %indvars.iv
-  store i8 %add, i8* %arrayidx3, align 1, !tbaa !0
+  store i8 %add, i8* %arrayidx3, align 1
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, 40000
@@ -42,6 +42,3 @@ for.end:                                          ; preds = %for.body
 }
 
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"omnipotent char", metadata !1}
-!1 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoopVectorize/no_int_induction.ll b/test/Transforms/LoopVectorize/no_int_induction.ll
index 66d5301..e572d1a 100644
--- a/test/Transforms/LoopVectorize/no_int_induction.ll
+++ b/test/Transforms/LoopVectorize/no_int_induction.ll
@@ -4,10 +4,10 @@
 ;  return std::accumulate(A, A + n, 0);
 ; }
 
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.8.0"
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"
 
 ;CHECK-LABEL: @sum_array(
+;CHECK: phi i64
 ;CHECK: phi <4 x i32>
 ;CHECK: load <4 x i32>
 ;CHECK: add nsw <4 x i32>
@@ -31,3 +31,30 @@ _ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %.lr.ph.i, %0
   %.01.lcssa.i = phi i32 [ 0, %0 ], [ %5, %.lr.ph.i ]
   ret i32 %.01.lcssa.i
 }
+
+; Same, but use a pointer with a different size.
+;CHECK-LABEL: @sum_array_as1(
+;CHECK: phi i16
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: ret i32
+define i32 @sum_array_as1(i32 addrspace(1)* %A, i32 %n) nounwind uwtable readonly noinline ssp {
+  %1 = sext i32 %n to i64
+  %2 = getelementptr inbounds i32 addrspace(1)* %A, i64 %1
+  %3 = icmp eq i32 %n, 0
+  br i1 %3, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %.lr.ph.i
+
+.lr.ph.i:                                         ; preds = %0, %.lr.ph.i
+  %.03.i = phi i32 addrspace(1)* [ %6, %.lr.ph.i ], [ %A, %0 ]
+  %.012.i = phi i32 [ %5, %.lr.ph.i ], [ 0, %0 ]
+  %4 = load i32 addrspace(1)* %.03.i, align 4
+  %5 = add nsw i32 %4, %.012.i
+  %6 = getelementptr inbounds i32 addrspace(1)* %.03.i, i64 1
+  %7 = icmp eq i32 addrspace(1)* %6, %2
+  br i1 %7, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %.lr.ph.i
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %.lr.ph.i, %0
+  %.01.lcssa.i = phi i32 [ 0, %0 ], [ %5, %.lr.ph.i ]
+  ret i32 %.01.lcssa.i
+}
diff --git a/test/Transforms/LoopVectorize/no_outside_user.ll b/test/Transforms/LoopVectorize/no_outside_user.ll
index 6f0357c..1f891ad 100644
--- a/test/Transforms/LoopVectorize/no_outside_user.ll
+++ b/test/Transforms/LoopVectorize/no_outside_user.ll
@@ -12,6 +12,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 ; We used to vectorize this loop. But it has a value that is used outside of the
 ; and is not a recognized reduction variable "tmp17".
 
+; CHECK-LABEL: @main(
 ; CHECK-NOT: <2 x i32>
 
 define i32 @main()  {
@@ -38,4 +39,33 @@ f1.exit.loopexit:
   ret i32 %.lcssa
 }
 
+; Don't vectorize this loop. Its phi node (induction variable) has an outside
+; loop user. We currently don't handle this case.
+; PR17179
 
+; CHECK-LABEL: @test2(
+; CHECK-NOT:  <2 x
+
+@x1 = common global i32 0, align 4
+@x2 = common global i32 0, align 4
+@x0 = common global i32 0, align 4
+
+define i32 @test2()  {
+entry:
+  store i32 0, i32* @x1, align 4
+  %0 = load i32* @x0, align 4
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %inc7 = phi i32 [ 0, %entry ], [ %inc, %for.cond1.preheader ]
+  %inc = add nsw i32 %inc7, 1
+  %cmp = icmp eq i32 %inc, 52
+  br i1 %cmp, label %for.end5, label %for.cond1.preheader
+
+for.end5:
+  %inc7.lcssa = phi i32 [ %inc7, %for.cond1.preheader ]
+  %xor = xor i32 %inc7.lcssa, %0
+  store i32 52, i32* @x1, align 4
+  store i32 1, i32* @x2, align 4
+  ret i32 %xor
+}
diff --git a/test/Transforms/LoopVectorize/opt.ll b/test/Transforms/LoopVectorize/opt.ll
new file mode 100644
index 0000000..27030a2
--- /dev/null
+++ b/test/Transforms/LoopVectorize/opt.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -O3 -force-vector-width=2 -force-vector-unroll=1 < %s | FileCheck --check-prefix=LOOPVEC %s
+; RUN: opt -S -O3 -disable-loop-vectorization -force-vector-width=2 -force-vector-unroll=1 < %s | FileCheck --check-prefix=NOLOOPVEC %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Make sure we can disable vectorization in opt.
+
+; LOOPVEC:       add <2 x i32>
+; NOLOOPVEC-NOT: add <2 x i32>
+
+define i32 @vect(i32* %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %red.05
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 255
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add
+}
diff --git a/test/Transforms/LoopVectorize/ptr_loops.ll b/test/Transforms/LoopVectorize/ptr_loops.ll
index 25599f8..15983f0 100644
--- a/test/Transforms/LoopVectorize/ptr_loops.ll
+++ b/test/Transforms/LoopVectorize/ptr_loops.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 @A = global [36 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35], align 16
 @B = global [36 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35], align 16
 
-;CHECK:_Z5test1v
+;CHECK-LABEL:@_Z5test1v(
 ;CHECK: load <4 x i32>
 ;CHECK: shufflevector <4 x i32>
 ;CHECK: store <4 x i32>
@@ -29,7 +29,7 @@ define i32 @_Z5test1v() nounwind uwtable ssp {
   ret i32 0
 }
 
-;CHECK:_Z5test2v
+;CHECK-LABEL: @_Z5test2v(
 ;CHECK: load <4 x i32>
 ;CHECK: shufflevector <4 x i32>
 ;CHECK: store <4 x i32>
diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll
index 18a0a93..791fce1 100644
--- a/test/Transforms/LoopVectorize/reduction.ll
+++ b/test/Transforms/LoopVectorize/reduction.ll
@@ -467,3 +467,30 @@ for.body:
 for.end:
   ret i32 %p.addr.02
 }
+
+; Don't vectorize a reduction value that is not the last in a reduction cyle. We
+; would loose iterations (VF-1) on the operations after that use.
+; PR17498
+
+; CHECK-LABEL: not_last_operation
+; CHECK-NOT: x i32>
+define i32 @not_last_operation(i32 %p, i32 %val) {
+entry:
+  %tobool = icmp eq i32 %p, 0
+  br label %for.body
+
+for.body:
+  %inc613.1 = phi i32 [ 0, %entry ], [ %inc6.1, %for.body ]
+  %inc511.1 = phi i32 [ %val, %entry ], [ %inc5.1, %for.body ]
+  %0 = zext i1 %tobool to i32
+  %inc4.1 = xor i32 %0, 1
+  %inc511.1.inc4.1 = add nsw i32 %inc511.1, %inc4.1
+  %inc5.1 = add nsw i32 %inc511.1.inc4.1, 1
+  %inc6.1 = add nsw i32 %inc613.1, 1
+  %exitcond.1 = icmp eq i32 %inc6.1, 22
+  br i1 %exitcond.1, label %exit, label %for.body
+
+exit:
+  %inc.2 = add nsw i32 %inc511.1.inc4.1, 2
+  ret i32 %inc.2
+}
diff --git a/test/Transforms/LoopVectorize/reverse_induction.ll b/test/Transforms/LoopVectorize/reverse_induction.ll
index 9e8c1b1..65ef95d 100644
--- a/test/Transforms/LoopVectorize/reverse_induction.ll
+++ b/test/Transforms/LoopVectorize/reverse_induction.ll
@@ -5,7 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; Make sure consecutive vector generates correct negative indices.
 ; PR15882
 
-; CHECK: reverse_induction_i64
+; CHECK-LABEL: @reverse_induction_i64(
 ; CHECK: add <4 x i64> %[[SPLAT:.*]], <i64 0, i64 -1, i64 -2, i64 -3>
 ; CHECK: add <4 x i64> %[[SPLAT]], <i64 -4, i64 -5, i64 -6, i64 -7>
 
@@ -29,7 +29,7 @@ loopend:
   ret i32 %inc.redux
 }
 
-; CHECK: reverse_induction_i128
+; CHECK-LABEL: @reverse_induction_i128(
 ; CHECK: add <4 x i128> %[[SPLAT:.*]], <i128 0, i128 -1, i128 -2, i128 -3>
 ; CHECK: add <4 x i128> %[[SPLAT]], <i128 -4, i128 -5, i128 -6, i128 -7>
 define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
@@ -52,7 +52,7 @@ loopend:
   ret i32 %inc.redux
 }
 
-; CHECK: reverse_induction_i16
+; CHECK-LABEL: @reverse_induction_i16(
 ; CHECK: add <4 x i16> %[[SPLAT:.*]], <i16 0, i16 -1, i16 -2, i16 -3>
 ; CHECK: add <4 x i16> %[[SPLAT]], <i16 -4, i16 -5, i16 -6, i16 -7>
 
@@ -93,7 +93,7 @@ loopend:
 ;   }
 ; }
 
-; CHECK: reverse_forward_induction_i64_i8
+; CHECK-LABEL: @reverse_forward_induction_i64_i8(
 ; CHECK: vector.body
 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %normalized.idx = sub i64 %index, 0
@@ -120,7 +120,7 @@ while.end:
   ret void
 }
 
-; CHECK: reverse_forward_induction_i64_i8_signed
+; CHECK-LABEL: @reverse_forward_induction_i64_i8_signed(
 ; CHECK: vector.body:
 ; CHECK:  %index = phi i64 [ 129, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK:  %normalized.idx = sub i64 %index, 129
diff --git a/test/Transforms/LoopVectorize/runtime-check-address-space.ll b/test/Transforms/LoopVectorize/runtime-check-address-space.ll
new file mode 100644
index 0000000..6c86561
--- /dev/null
+++ b/test/Transforms/LoopVectorize/runtime-check-address-space.ll
@@ -0,0 +1,235 @@
+; RUN: opt -S -march=r600 -mcpu=cayman -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
+
+; Check vectorization that would ordinarily require a runtime bounds
+; check on the pointers when mixing address spaces. For now we cannot
+; assume address spaces do not alias, and we can't assume that
+; different pointers are directly comparable.
+;
+; These all test this basic loop for different combinations of address
+; spaces, and swapping in globals or adding noalias.
+;
+;void foo(int addrspace(N)* [noalias] a, int addrspace(M)* [noalias] b, int n)
+;{
+;    for (int i = 0; i < n; ++i)
+;    {
+;        a[i] = 3 * b[i];
+;    }
+;}
+
+; Artificial datalayout
+target datalayout = "e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+
+
+@g_as1 = common addrspace(1) global [1024 x i32] zeroinitializer, align 16
+@q_as2 = common addrspace(2) global [1024 x i32] zeroinitializer, align 16
+
+; Both parameters are unidentified objects with the same address
+; space, so this should vectorize normally.
+define void @foo(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %n) #0 {
+; CHECK-LABEL: @foo(
+; CHECK: <4 x i32>
+; CHECK: ret
+
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %idxprom
+  %0 = load i32 addrspace(1)* %arrayidx, align 4
+  %mul = mul nsw i32 %0, 3
+  %idxprom1 = sext i32 %i.0 to i64
+  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %a, i64 %idxprom1
+  store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Parameters are unidentified and different address spaces, so cannot vectorize.
+define void @bar0(i32* %a, i32 addrspace(1)* %b, i32 %n) #0 {
+; CHECK-LABEL: @bar0(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %idxprom
+  %0 = load i32 addrspace(1)* %arrayidx, align 4
+  %mul = mul nsw i32 %0, 3
+  %idxprom1 = sext i32 %i.0 to i64
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %idxprom1
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Swapped arguments should be the same
+define void @bar1(i32 addrspace(1)* %a, i32* %b, i32 %n) #0 {
+; CHECK-LABEL: @bar1(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32* %b, i64 %idxprom
+  %0 = load i32* %arrayidx, align 4
+  %mul = mul nsw i32 %0, 3
+  %idxprom1 = sext i32 %i.0 to i64
+  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %a, i64 %idxprom1
+  store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; We should still be able to vectorize with noalias even if the
+; address spaces are different.
+define void @bar2(i32* noalias %a, i32 addrspace(1)* noalias %b, i32 %n) #0 {
+; CHECK-LABEL: @bar2(
+; CHECK: <4 x i32>
+; CHECK: ret
+
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %idxprom
+  %0 = load i32 addrspace(1)* %arrayidx, align 4
+  %mul = mul nsw i32 %0, 3
+  %idxprom1 = sext i32 %i.0 to i64
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %idxprom1
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Store to identified global with different address space. This isn't
+; generally safe and shouldn't be vectorized.
+define void @arst0(i32* %b, i32 %n) #0 {
+; CHECK-LABEL: @arst0(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds i32* %b, i64 %idxprom
+  %0 = load i32* %arrayidx, align 4
+  %mul = mul nsw i32 %0, 3
+  %idxprom1 = sext i32 %i.0 to i64
+  %arrayidx2 = getelementptr inbounds [1024 x i32] addrspace(1)* @g_as1, i64 0, i64 %idxprom1
+  store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+
+; Load from identified global with different address space.
+; This isn't generally safe and shouldn't be vectorized.
+define void @arst1(i32* %b, i32 %n) #0 {
+; CHECK-LABEL: @arst1(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds [1024 x i32] addrspace(1)* @g_as1, i64 0, i64 %idxprom
+  %0 = load i32 addrspace(1)* %arrayidx, align 4
+  %mul = mul nsw i32 %0, 3
+  %idxprom1 = sext i32 %i.0 to i64
+  %arrayidx2 = getelementptr inbounds i32* %b, i64 %idxprom1
+  store i32 %mul, i32* %arrayidx2, align 4
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Read and write to 2 identified globals in different address
+; spaces. This should be vectorized.
+define void @aoeu(i32 %n) #0 {
+; CHECK-LABEL: @aoeu(
+; CHECK: <4 x i32>
+; CHECK: ret
+
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %idxprom = sext i32 %i.0 to i64
+  %arrayidx = getelementptr inbounds [1024 x i32] addrspace(2)* @q_as2, i64 0, i64 %idxprom
+  %0 = load i32 addrspace(2)* %arrayidx, align 4
+  %mul = mul nsw i32 %0, 3
+  %idxprom1 = sext i32 %i.0 to i64
+  %arrayidx2 = getelementptr inbounds [1024 x i32] addrspace(1)* @g_as1, i64 0, i64 %idxprom1
+  store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll b/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
new file mode 100644
index 0000000..212b37c
--- /dev/null
+++ b/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
@@ -0,0 +1,142 @@
+; RUN: opt -S -march=r600 -mcpu=cayman -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
+
+; Artificial datalayout
+target datalayout = "e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+
+
+define void @add_ints_1_1_1(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #0 {
+; CHECK-LABEL: @add_ints_1_1_1(
+; CHECK: <4 x i32>
+; CHECK: ret
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp ult i64 %i.0, 200
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.0
+  %0 = load i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %c, i64 %i.0
+  %1 = load i32 addrspace(1)* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %a, i64 %i.0
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 4
+  %inc = add i64 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define void @add_ints_as_1_0_0(i32 addrspace(1)* %a, i32* %b, i32* %c) #0 {
+; CHECK-LABEL: @add_ints_as_1_0_0(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp ult i64 %i.0, 200
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32* %b, i64 %i.0
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %c, i64 %i.0
+  %1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %a, i64 %i.0
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 4
+  %inc = add i64 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define void @add_ints_as_0_1_0(i32* %a, i32 addrspace(1)* %b, i32* %c) #0 {
+; CHECK-LABEL: @add_ints_as_0_1_0(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp ult i64 %i.0, 200
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.0
+  %0 = load i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %c, i64 %i.0
+  %1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %i.0
+  store i32 %add, i32* %arrayidx2, align 4
+  %inc = add i64 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define void @add_ints_as_0_1_1(i32* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #0 {
+; CHECK-LABEL: @add_ints_as_0_1_1(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp ult i64 %i.0, 200
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.0
+  %0 = load i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %c, i64 %i.0
+  %1 = load i32 addrspace(1)* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %i.0
+  store i32 %add, i32* %arrayidx2, align 4
+  %inc = add i64 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define void @add_ints_as_0_1_2(i32* %a, i32 addrspace(1)* %b, i32 addrspace(2)* %c) #0 {
+; CHECK-LABEL: @add_ints_as_0_1_2(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %cmp = icmp ult i64 %i.0, 200
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32 addrspace(1)* %b, i64 %i.0
+  %0 = load i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32 addrspace(2)* %c, i64 %i.0
+  %1 = load i32 addrspace(2)* %arrayidx1, align 4
+  %add = add nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32* %a, i64 %i.0
+  store i32 %add, i32* %arrayidx2, align 4
+  %inc = add i64 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/test/Transforms/LoopVectorize/runtime-check-readonly.ll
index 4145d13..a2b9ad9 100644
--- a/test/Transforms/LoopVectorize/runtime-check-readonly.ll
+++ b/test/Transforms/LoopVectorize/runtime-check-readonly.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-;CHECK: add_ints
+;CHECK-LABEL: @add_ints(
 ;CHECK: br
 ;CHECK: getelementptr
 ;CHECK-NEXT: getelementptr
diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll
index 4772256..d15479d 100644
--- a/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@@ -34,3 +34,31 @@ for.body:                                         ; preds = %entry, %for.body
 for.end:                                          ; preds = %for.body, %entry
   ret i32 undef
 }
+
+; Make sure that we try to vectorize loops with a runtime check if the
+; dependency check fails.
+
+; CHECK-LABEL: test_runtime_check
+; CHECK:      <4 x float>
+define void @test_runtime_check(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %ind.sum = add i64 %iv, %offset
+  %arr.idx = getelementptr inbounds float* %a, i64 %ind.sum
+  %l1 = load float* %arr.idx, align 4
+  %ind.sum2 = add i64 %iv, %offset2
+  %arr.idx2 = getelementptr inbounds float* %a, i64 %ind.sum2
+  %l2 = load float* %arr.idx2, align 4
+  %m = fmul fast float %b, %l2
+  %ad = fadd fast float %l1, %m
+  store float %ad, float* %arr.idx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %loopexit, label %for.body
+
+loopexit:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/runtime-limit.ll b/test/Transforms/LoopVectorize/runtime-limit.ll
index d783974..7370a6f 100644
--- a/test/Transforms/LoopVectorize/runtime-limit.ll
+++ b/test/Transforms/LoopVectorize/runtime-limit.ll
@@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.8.0"
 
 ; We are vectorizing with 6 runtime checks.
-;CHECK: func1x6
+;CHECK-LABEL: func1x6(
 ;CHECK: <4 x i32>
 ;CHECK: ret
 define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
@@ -38,7 +38,7 @@ for.end:                                          ; preds = %for.body
 }
 
 ; We are not vectorizing with 12 runtime checks.
-;CHECK: func2x6
+;CHECK-LABEL: func2x6(
 ;CHECK-NOT: <4 x i32>
 ;CHECK: ret
 define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
diff --git a/test/Transforms/LoopVectorize/safegep.ll b/test/Transforms/LoopVectorize/safegep.ll
index 46ec28b..c950860 100644
--- a/test/Transforms/LoopVectorize/safegep.ll
+++ b/test/Transforms/LoopVectorize/safegep.ll
@@ -8,7 +8,7 @@ target datalayout = "e-p:32:32:32-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:
 
 ; PR16592
 
-; CHECK: safe
+; CHECK-LABEL: @safe(
 ; CHECK: <4 x float>
 
 define void @safe(float* %A, float* %B, float %K) {
@@ -34,7 +34,7 @@ return:
 
 ; In a non-default address space we don't have this rule.
 
-; CHECK: notsafe
+; CHECK-LABEL: @notsafe(
 ; CHECK-NOT: <4 x float>
 
 define void @notsafe(float addrspace(5) * %A, float* %B, float %K) {
diff --git a/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
index 7687738..683621a 100644
--- a/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
+++ b/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-apple-macosx"
 @e = common global i32* null, align 8
 @c = common global i32 0, align 4
 
-; CHECK-LABEL-LABEL: @fn1(
+; CHECK-LABEL: @fn1(
 ; CHECK: vector.body
 define void @fn1() #0 {
 entry:
@@ -29,14 +29,14 @@ for.cond4.preheader:                              ; preds = %for.cond
   br i1 %cmp514, label %for.cond7.preheader.lr.ph, label %for.end26
 
 for.cond7.preheader.lr.ph:                        ; preds = %for.cond4.preheader
-  %0 = load i32** @e, align 8, !tbaa !0
+  %0 = load i32** @e, align 8, !tbaa !4
   br label %for.cond7.preheader
 
 for.cond7.preheader:                              ; preds = %for.cond7.preheader.lr.ph, %for.inc23
   %y.017 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %inc24, %for.inc23 ]
   %i.116 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %i.2.lcssa, %for.inc23 ]
   %n.015 = phi i32 [ undef, %for.cond7.preheader.lr.ph ], [ %inc25, %for.inc23 ]
-  %1 = load i32* @b, align 4, !tbaa !3
+  %1 = load i32* @b, align 4, !tbaa !5
   %tobool11 = icmp eq i32 %1, 0
   br i1 %tobool11, label %for.inc23, label %for.body8.lr.ph
 
@@ -49,7 +49,7 @@ for.body8:                                        ; preds = %for.body8.lr.ph, %f
   %i.213 = phi i32 [ %i.116, %for.body8.lr.ph ], [ 0, %for.inc19 ]
   %2 = trunc i64 %indvars.iv19 to i32
   %add10 = add i32 %add9, %2
-  store i32 %add10, i32* @f, align 4, !tbaa !3
+  store i32 %add10, i32* @f, align 4, !tbaa !5
   %idx.ext = sext i32 %add10 to i64
   %add.ptr = getelementptr inbounds i32* @a, i64 %idx.ext
   %tobool129 = icmp eq i32 %i.213, 0
@@ -63,9 +63,9 @@ for.body13:                                       ; preds = %for.body13.lr.ph, %
   %indvars.iv = phi i64 [ %3, %for.body13.lr.ph ], [ %indvars.iv.next, %for.body13 ]
   %add.ptr.sum = add i64 %idx.ext, %indvars.iv
   %arrayidx = getelementptr inbounds i32* @a, i64 %add.ptr.sum
-  %4 = load i32* %arrayidx, align 4, !tbaa !3
+  %4 = load i32* %arrayidx, align 4, !tbaa !5
   %arrayidx15 = getelementptr inbounds i32* %0, i64 %indvars.iv
-  store i32 %4, i32* %arrayidx15, align 4, !tbaa !3
+  store i32 %4, i32* %arrayidx15, align 4, !tbaa !5
   %indvars.iv.next = add i64 %indvars.iv, 1
   %5 = trunc i64 %indvars.iv.next to i32
   %tobool12 = icmp eq i32 %5, 0
@@ -75,17 +75,17 @@ for.cond11.for.inc19_crit_edge:                   ; preds = %for.body13
   br label %for.inc19
 
 for.inc19:                                        ; preds = %for.cond11.for.inc19_crit_edge, %for.body8
-  %6 = load i32* @c, align 4, !tbaa !3
+  %6 = load i32* @c, align 4, !tbaa !5
   %inc20 = add nsw i32 %6, 1
-  store i32 %inc20, i32* @c, align 4, !tbaa !3
+  store i32 %inc20, i32* @c, align 4, !tbaa !5
   %indvars.iv.next20 = add i64 %indvars.iv19, 1
-  %7 = load i32* @b, align 4, !tbaa !3
+  %7 = load i32* @b, align 4, !tbaa !5
   %tobool = icmp eq i32 %7, 0
   br i1 %tobool, label %for.cond7.for.inc23_crit_edge, label %for.body8
 
 for.cond7.for.inc23_crit_edge:                    ; preds = %for.inc19
   %add.ptr.lcssa = phi i32* [ %add.ptr, %for.inc19 ]
-  store i32* %add.ptr.lcssa, i32** @d, align 8, !tbaa !0
+  store i32* %add.ptr.lcssa, i32** @d, align 8, !tbaa !4
   br label %for.inc23
 
 for.inc23:                                        ; preds = %for.cond7.for.inc23_crit_edge, %for.cond7.preheader
@@ -110,4 +110,5 @@ attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA"}
 !3 = metadata !{metadata !"double", metadata !1}
-!4 = metadata !{metadata !"any pointer", metadata !1}
+!4 = metadata !{metadata !0, metadata !0, i64 0}
+!5 = metadata !{metadata !3, metadata !3, i64 0}
diff --git a/test/Transforms/LoopVectorize/struct_access.ll b/test/Transforms/LoopVectorize/struct_access.ll
index 0cfaabe..75beae8 100644
--- a/test/Transforms/LoopVectorize/struct_access.ll
+++ b/test/Transforms/LoopVectorize/struct_access.ll
@@ -44,3 +44,45 @@ for.end:                                          ; preds = %for.body, %entry
   %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
   ret i32 %sum.0.lcssa
 }
+
+%struct.lit = type { i32 }
+
+; Verify that we still vectorize the access if the struct has the same size as
+; the loaded element.
+; struct lit {
+;  int x;
+; };
+;
+;
+; int bar(struct lit *A, int n) {
+;
+;   int sum = 0;
+;   for (int i = 0; i < n; ++i)
+;     sum += A[i].x;
+;
+;   return sum;
+; }
+
+;CHECK-LABEL: @bar(
+;CHECK: load <4 x i32>
+;CHECK: ret
+define i32 @bar(%struct.lit* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %x = getelementptr inbounds %struct.lit* %A, i64 %indvars.iv, i32 0
+  %0 = load i32* %x, align 4
+  %add = add nsw i32 %0, %sum.05
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %sum.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/unroll_novec.ll b/test/Transforms/LoopVectorize/unroll_novec.ll
new file mode 100644
index 0000000..33f128d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/unroll_novec.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=1 -force-vector-unroll=2 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+;  for (i=0; i<n; i++){
+;    a[i] += i;
+;  }
+;CHECK-LABEL: @inc(
+;CHECK: load i32*
+;CHECK: load i32*
+;CHECK: add nsw i32
+;CHECK: add nsw i32
+;CHECK: store i32
+;CHECK: store i32
+;CHECK: ret void
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = trunc i64 %indvars.iv to i32
+  %5 = add nsw i32 %3, %4
+  store i32 %5, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/value-ptr-bug.ll b/test/Transforms/LoopVectorize/value-ptr-bug.ll
index f376656..e8d3728 100644
--- a/test/Transforms/LoopVectorize/value-ptr-bug.ll
+++ b/test/Transforms/LoopVectorize/value-ptr-bug.ll
@@ -9,7 +9,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; SCEVExpander::expandCodeFor would change a value (the start value of an
 ; induction) that we cached in the induction variable list.
 
-; CHECK: test_vh
+; CHECK-LABEL: @test_vh(
 ; CHECK-NOT: store <4 x i8> undef
 
 define void @test_vh(i32* %ptr265, i32* %ptr266, i32 %sub267) {
diff --git a/test/Transforms/LoopVectorize/vectorize-once.ll b/test/Transforms/LoopVectorize/vectorize-once.ll
index 2b8f3fd..7800469 100644
--- a/test/Transforms/LoopVectorize/vectorize-once.ll
+++ b/test/Transforms/LoopVectorize/vectorize-once.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 
 ; This test checks that we add metadata to vectorized loops
-; CHECK: _Z4foo1Pii
+; CHECK-LABEL: @_Z4foo1Pii(
 ; CHECK: <4 x i32>
 ; CHECK: llvm.loop
 ; CHECK: ret
@@ -41,7 +41,7 @@ _ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %for.body.i, %entry
 }
 
 ; This test checks that we don't vectorize loops that are marked with the "width" == 1 metadata.
-; CHECK: _Z4foo2Pii
+; CHECK-LABEL: @_Z4foo2Pii(
 ; CHECK-NOT: <4 x i32>
 ; CHECK: llvm.loop
 ; CHECK: ret
@@ -68,9 +68,10 @@ _ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %for.body.i, %entry
 
 attributes #0 = { nounwind readonly ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="pic" "ssp-buffers-size"="8" }
 
-; CHECK: !0 = metadata !{metadata !0, metadata !1}
+; CHECK: !0 = metadata !{metadata !0, metadata !1, metadata !2}
 ; CHECK: !1 = metadata !{metadata !"llvm.vectorizer.width", i32 1}
-; CHECK: !2 = metadata !{metadata !2, metadata !1}
+; CHECK: !2 = metadata !{metadata !"llvm.vectorizer.unroll", i32 1}
+; CHECK: !3 = metadata !{metadata !3, metadata !1, metadata !2}
 
 !0 = metadata !{metadata !0, metadata !1}
 !1 = metadata !{metadata !"llvm.vectorizer.width", i32 1}
diff --git a/test/Transforms/LowerAtomic/lit.local.cfg b/test/Transforms/LowerAtomic/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LowerAtomic/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LowerExpectIntrinsic/lit.local.cfg b/test/Transforms/LowerExpectIntrinsic/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Transforms/LowerExpectIntrinsic/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Transforms/LowerInvoke/lit.local.cfg b/test/Transforms/LowerInvoke/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LowerInvoke/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LowerSwitch/feature.ll b/test/Transforms/LowerSwitch/feature.ll
index cc77d3c..e85f03e 100644
--- a/test/Transforms/LowerSwitch/feature.ll
+++ b/test/Transforms/LowerSwitch/feature.ll
@@ -7,88 +7,88 @@
 ;CHECK-NEXT:   br label %NodeBlock37
 
 ;CHECK:      NodeBlock37:                                      ; preds = %entry
-;CHECK-NEXT:   %Pivot38 = icmp ult i32 %tmp158, 11
+;CHECK-NEXT:   %Pivot38 = icmp slt i32 %tmp158, 10
 ;CHECK-NEXT:   br i1 %Pivot38, label %NodeBlock13, label %NodeBlock35
 
 ;CHECK:      NodeBlock35:                                      ; preds = %NodeBlock37
-;CHECK-NEXT:   %Pivot36 = icmp ult i32 %tmp158, 14
+;CHECK-NEXT:   %Pivot36 = icmp slt i32 %tmp158, 13
 ;CHECK-NEXT:   br i1 %Pivot36, label %NodeBlock23, label %NodeBlock33
 
 ;CHECK:      NodeBlock33:                                      ; preds = %NodeBlock35
-;CHECK-NEXT:   %Pivot34 = icmp ult i32 %tmp158, 15
+;CHECK-NEXT:   %Pivot34 = icmp slt i32 %tmp158, 14
 ;CHECK-NEXT:   br i1 %Pivot34, label %LeafBlock25, label %NodeBlock31
 
 ;CHECK:      NodeBlock31:                                      ; preds = %NodeBlock33
-;CHECK-NEXT:   %Pivot32 = icmp ult i32 %tmp158, -6
+;CHECK-NEXT:   %Pivot32 = icmp slt i32 %tmp158, 15
 ;CHECK-NEXT:   br i1 %Pivot32, label %LeafBlock27, label %LeafBlock29
 
 ;CHECK:      LeafBlock29:                                      ; preds = %NodeBlock31
-;CHECK-NEXT:   %tmp158.off = add i32 %tmp158, 6
-;CHECK-NEXT:   %SwitchLeaf30 = icmp ule i32 %tmp158.off, 4
-;CHECK-NEXT:   br i1 %SwitchLeaf30, label %bb338, label %NewDefault
+;CHECK-NEXT:   %SwitchLeaf30 = icmp eq i32 %tmp158, 15
+;CHECK-NEXT:   br i1 %SwitchLeaf30, label %bb334, label %NewDefault
 
 ;CHECK:      LeafBlock27:                                      ; preds = %NodeBlock31
-;CHECK-NEXT:   %SwitchLeaf28 = icmp eq i32 %tmp158, 15
-;CHECK-NEXT:   br i1 %SwitchLeaf28, label %bb334, label %NewDefault
+;CHECK-NEXT:   %SwitchLeaf28 = icmp eq i32 %tmp158, 14
+;CHECK-NEXT:   br i1 %SwitchLeaf28, label %bb332, label %NewDefault
 
 ;CHECK:      LeafBlock25:                                      ; preds = %NodeBlock33
-;CHECK-NEXT:   %SwitchLeaf26 = icmp eq i32 %tmp158, 14
-;CHECK-NEXT:   br i1 %SwitchLeaf26, label %bb332, label %NewDefault
+;CHECK-NEXT:   %SwitchLeaf26 = icmp eq i32 %tmp158, 13
+;CHECK-NEXT:   br i1 %SwitchLeaf26, label %bb330, label %NewDefault
 
 ;CHECK:      NodeBlock23:                                      ; preds = %NodeBlock35
-;CHECK-NEXT:   %Pivot24 = icmp ult i32 %tmp158, 12
+;CHECK-NEXT:   %Pivot24 = icmp slt i32 %tmp158, 11
 ;CHECK-NEXT:   br i1 %Pivot24, label %LeafBlock15, label %NodeBlock21
 
 ;CHECK:      NodeBlock21:                                      ; preds = %NodeBlock23
-;CHECK-NEXT:   %Pivot22 = icmp ult i32 %tmp158, 13
+;CHECK-NEXT:   %Pivot22 = icmp slt i32 %tmp158, 12
 ;CHECK-NEXT:   br i1 %Pivot22, label %LeafBlock17, label %LeafBlock19
 
 ;CHECK:      LeafBlock19:                                      ; preds = %NodeBlock21
-;CHECK-NEXT:   %SwitchLeaf20 = icmp eq i32 %tmp158, 13
-;CHECK-NEXT:   br i1 %SwitchLeaf20, label %bb330, label %NewDefault
+;CHECK-NEXT:   %SwitchLeaf20 = icmp eq i32 %tmp158, 12
+;CHECK-NEXT:   br i1 %SwitchLeaf20, label %bb328, label %NewDefault
 
 ;CHECK:      LeafBlock17:                                      ; preds = %NodeBlock21
-;CHECK-NEXT:   %SwitchLeaf18 = icmp eq i32 %tmp158, 12
-;CHECK-NEXT:   br i1 %SwitchLeaf18, label %bb328, label %NewDefault
+;CHECK-NEXT:   %SwitchLeaf18 = icmp eq i32 %tmp158, 11
+;CHECK-NEXT:   br i1 %SwitchLeaf18, label %bb326, label %NewDefault
 
 ;CHECK:      LeafBlock15:                                      ; preds = %NodeBlock23
-;CHECK-NEXT:   %SwitchLeaf16 = icmp eq i32 %tmp158, 11
-;CHECK-NEXT:   br i1 %SwitchLeaf16, label %bb326, label %NewDefault
+;CHECK-NEXT:   %SwitchLeaf16 = icmp eq i32 %tmp158, 10
+;CHECK-NEXT:   br i1 %SwitchLeaf16, label %bb324, label %NewDefault
 
 ;CHECK:      NodeBlock13:                                      ; preds = %NodeBlock37
-;CHECK-NEXT:   %Pivot14 = icmp ult i32 %tmp158, 8
+;CHECK-NEXT:   %Pivot14 = icmp slt i32 %tmp158, 7
 ;CHECK-NEXT:   br i1 %Pivot14, label %NodeBlock, label %NodeBlock11
 
 ;CHECK:      NodeBlock11:                                      ; preds = %NodeBlock13
-;CHECK-NEXT:   %Pivot12 = icmp ult i32 %tmp158, 9
+;CHECK-NEXT:   %Pivot12 = icmp slt i32 %tmp158, 8
 ;CHECK-NEXT:   br i1 %Pivot12, label %LeafBlock3, label %NodeBlock9
 
 ;CHECK:      NodeBlock9:                                       ; preds = %NodeBlock11
-;CHECK-NEXT:   %Pivot10 = icmp ult i32 %tmp158, 10
+;CHECK-NEXT:   %Pivot10 = icmp slt i32 %tmp158, 9
 ;CHECK-NEXT:   br i1 %Pivot10, label %LeafBlock5, label %LeafBlock7
 
 ;CHECK:      LeafBlock7:                                       ; preds = %NodeBlock9
-;CHECK-NEXT:   %SwitchLeaf8 = icmp eq i32 %tmp158, 10
-;CHECK-NEXT:   br i1 %SwitchLeaf8, label %bb324, label %NewDefault
+;CHECK-NEXT:   %SwitchLeaf8 = icmp eq i32 %tmp158, 9
+;CHECK-NEXT:   br i1 %SwitchLeaf8, label %bb322, label %NewDefault
 
 ;CHECK:      LeafBlock5:                                       ; preds = %NodeBlock9
-;CHECK-NEXT:   %SwitchLeaf6 = icmp eq i32 %tmp158, 9
-;CHECK-NEXT:   br i1 %SwitchLeaf6, label %bb322, label %NewDefault
+;CHECK-NEXT:   %SwitchLeaf6 = icmp eq i32 %tmp158, 8
+;CHECK-NEXT:   br i1 %SwitchLeaf6, label %bb338, label %NewDefault
 
 ;CHECK:      LeafBlock3:                                       ; preds = %NodeBlock11
-;CHECK-NEXT:   %SwitchLeaf4 = icmp eq i32 %tmp158, 8
-;CHECK-NEXT:   br i1 %SwitchLeaf4, label %bb338, label %NewDefault
+;CHECK-NEXT:   %SwitchLeaf4 = icmp eq i32 %tmp158, 7
+;CHECK-NEXT:   br i1 %SwitchLeaf4, label %bb, label %NewDefault
 
 ;CHECK:      NodeBlock:                                        ; preds = %NodeBlock13
-;CHECK-NEXT:   %Pivot = icmp ult i32 %tmp158, 7
+;CHECK-NEXT:   %Pivot = icmp slt i32 %tmp158, 0
 ;CHECK-NEXT:   br i1 %Pivot, label %LeafBlock, label %LeafBlock1
 
 ;CHECK:      LeafBlock1:                                       ; preds = %NodeBlock
-;CHECK-NEXT:   %SwitchLeaf2 = icmp eq i32 %tmp158, 7
-;CHECK-NEXT:   br i1 %SwitchLeaf2, label %bb, label %NewDefault
+;CHECK-NEXT:   %SwitchLeaf2 = icmp ule i32 %tmp158, 6
+;CHECK-NEXT:   br i1 %SwitchLeaf2, label %bb338, label %NewDefault
 
 ;CHECK:      LeafBlock:                                        ; preds = %NodeBlock
-;CHECK-NEXT:   %SwitchLeaf = icmp ule i32 %tmp158, 6
+;CHECK-NEXT:   %tmp158.off = add i32 %tmp158, 6
+;CHECK-NEXT:   %SwitchLeaf = icmp ule i32 %tmp158.off, 4
 ;CHECK-NEXT:   br i1 %SwitchLeaf, label %bb338, label %NewDefault
 
 define i32 @main(i32 %tmp158) {
diff --git a/test/Transforms/LowerSwitch/lit.local.cfg b/test/Transforms/LowerSwitch/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/LowerSwitch/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/Mem2Reg/ConvertDebugInfo.ll b/test/Transforms/Mem2Reg/ConvertDebugInfo.ll
index 5754fcd..33eaed6 100644
--- a/test/Transforms/Mem2Reg/ConvertDebugInfo.ll
+++ b/test/Transforms/Mem2Reg/ConvertDebugInfo.ll
@@ -33,12 +33,13 @@ return:                                           ; preds = %entry
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!14}
 
 !0 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 2, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 786478, metadata !12, metadata !2, metadata !"testfunc", metadata !"testfunc", metadata !"testfunc", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (i32, double)* @testfunc, null, null, null, i32 2} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !12, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !13, metadata !13, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !12, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !12, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{metadata !6, metadata !7, metadata !6}
 !6 = metadata !{i32 786468, metadata !12, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 786468, metadata !12, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
@@ -48,3 +49,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !11 = metadata !{i32 786443, metadata !12, metadata !1, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
 !12 = metadata !{metadata !"testfunc.c", metadata !"/tmp"}
 !13 = metadata !{i32 0}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll b/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll
index 49dcb04..32acdd6 100644
--- a/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll
+++ b/test/Transforms/Mem2Reg/ConvertDebugInfo2.ll
@@ -31,18 +31,19 @@ return:                                           ; preds = %entry
 }
 
 !llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!22}
 !0 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 8, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
 !1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"baz", metadata !"baz", metadata !"baz", i32 8, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void (i32)* @baz, null, null, null, i32 8} ; [ DW_TAG_subprogram ]
 !2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
 !3 = metadata !{i32 786449, metadata !20, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !5 = metadata !{null, metadata !6}
 !6 = metadata !{i32 786468, metadata !20, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !7 = metadata !{i32 8, i32 0, metadata !1, null}
 !8 = metadata !{i32 9, i32 0, metadata !1, null}
 !9 = metadata !{i32 786689, metadata !10, metadata !"x", metadata !2, i32 4, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
 !10 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, metadata !11, i1 true, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 4} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!11 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !12 = metadata !{null, metadata !6, metadata !13, metadata !14}
 !13 = metadata !{i32 786468, metadata !20, metadata !2, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !14 = metadata !{i32 786447, metadata !20, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
@@ -53,3 +54,4 @@ return:                                           ; preds = %entry
 !19 = metadata !{i32 10, i32 0, metadata !1, null}
 !20 = metadata !{metadata !"bar.c", metadata !"/tmp/"}
 !21 = metadata !{i32 0}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/Mem2Reg/ignore-lifetime.ll b/test/Transforms/Mem2Reg/ignore-lifetime.ll
new file mode 100644
index 0000000..5e4f9bf
--- /dev/null
+++ b/test/Transforms/Mem2Reg/ignore-lifetime.ll
@@ -0,0 +1,26 @@
+; RUN: opt -mem2reg -S -o - < %s | FileCheck %s
+
+declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr)
+declare void @llvm.lifetime.end(i64 %size, i8* nocapture %ptr)
+
+define void @test1() {
+; CHECK: test1
+; CHECK-NOT: alloca
+  %A = alloca i32
+  %B = bitcast i32* %A to i8*
+  call void @llvm.lifetime.start(i64 2, i8* %B)
+  store i32 1, i32* %A
+  call void @llvm.lifetime.end(i64 2, i8* %B)
+  ret void
+}
+
+define void @test2() {
+; CHECK: test2
+; CHECK-NOT: alloca
+  %A = alloca {i8, i16}
+  %B = getelementptr {i8, i16}* %A, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 2, i8* %B)
+  store {i8, i16} zeroinitializer, {i8, i16}* %A
+  call void @llvm.lifetime.end(i64 2, i8* %B)
+  ret void
+}
diff --git a/test/Transforms/Mem2Reg/lit.local.cfg b/test/Transforms/Mem2Reg/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/Mem2Reg/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/Mem2Reg/use-analysis.ll b/test/Transforms/Mem2Reg/use-analysis.ll
deleted file mode 100644
index b08b1f1..0000000
--- a/test/Transforms/Mem2Reg/use-analysis.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; RUN: opt -mem2reg -S -o - < %s | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
-
-declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr)
-declare void @llvm.lifetime.end(i64 %size, i8* nocapture %ptr)
-
-define void @test1() {
-; Ensure we can look through a bitcast to i8* and the addition of lifetime
-; markers.
-;
-; CHECK-LABEL: @test1(
-; CHECK-NOT: alloca
-; CHECK: ret void
-
-  %A = alloca i32
-  %B = bitcast i32* %A to i8*
-  call void @llvm.lifetime.start(i64 2, i8* %B)
-  store i32 1, i32* %A
-  call void @llvm.lifetime.end(i64 2, i8* %B)
-  ret void
-}
-
-define void @test2() {
-; Ensure we can look through a GEP to i8* and the addition of lifetime
-; markers.
-;
-; CHECK-LABEL: @test2(
-; CHECK-NOT: alloca
-; CHECK: ret void
-
-  %A = alloca {i8, i16}
-  %B = getelementptr {i8, i16}* %A, i32 0, i32 0
-  call void @llvm.lifetime.start(i64 2, i8* %B)
-  store {i8, i16} zeroinitializer, {i8, i16}* %A
-  call void @llvm.lifetime.end(i64 2, i8* %B)
-  ret void
-}
-
-define i32 @test3(i32 %x) {
-; CHECK-LABEL: @test3(
-;
-; Check that we recursively walk the uses of the alloca and thus can see
-; through round trip bitcasts, dead bitcasts, GEPs, multiple GEPs, and lifetime
-; markers.
-entry:
-  %a = alloca i32
-; CHECK-NOT: alloca
-
-  %b = bitcast i32* %a to i8*
-  %b2 = getelementptr inbounds i8* %b, i32 0
-  %b3 = getelementptr inbounds i8* %b2, i32 0
-  call void @llvm.lifetime.start(i64 -1, i8* %b3)
-; CHECK-NOT: call void @llvm.lifetime.start
-
-  store i32 %x, i32* %a
-; CHECK-NOT: store
-
-  %dead = bitcast i32* %a to i4096*
-  %dead1 = bitcast i4096* %dead to i42*
-  %dead2 = getelementptr inbounds i32* %a, i32 %x
-; CHECK-NOT: bitcast
-; CHECK-NOT: getelementptr
-
-  %ret = load i32* %a
-; CHECK-NOT: load
-
-  ret i32 %ret
-; CHECK: ret i32 %x
-}
diff --git a/test/Transforms/MemCpyOpt/lit.local.cfg b/test/Transforms/MemCpyOpt/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/MemCpyOpt/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/MergeFunc/address-spaces.ll b/test/Transforms/MergeFunc/address-spaces.ll
new file mode 100644
index 0000000..0d66b82
--- /dev/null
+++ b/test/Transforms/MergeFunc/address-spaces.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+
+target datalayout = "p:32:32:32-p1:32:32:32-p2:16:16:16"
+
+declare void @foo(i32) nounwind
+
+; None of these functions should be merged
+
+define i32 @store_as0(i32* %x) {
+; CHECK-LABEL: @store_as0(
+; CHECK: call void @foo(
+  %gep = getelementptr i32* %x, i32 4
+  %y = load i32* %gep
+  call void @foo(i32 %y) nounwind
+  ret i32 %y
+}
+
+define i32 @store_as1(i32 addrspace(1)* %x) {
+; CHECK-LABEL: @store_as1(
+; CHECK: call void @foo(
+  %gep = getelementptr i32 addrspace(1)* %x, i32 4
+  %y = load i32 addrspace(1)* %gep
+  call void @foo(i32 %y) nounwind
+  ret i32 %y
+}
+
+define i32 @store_as2(i32 addrspace(2)* %x) {
+; CHECK-LABEL: @store_as2(
+; CHECK: call void @foo(
+  %gep = getelementptr i32 addrspace(2)* %x, i32 4
+  %y = load i32 addrspace(2)* %gep
+  call void @foo(i32 %y) nounwind
+  ret i32 %y
+}
+
diff --git a/test/Transforms/MergeFunc/inttoptr-address-space.ll b/test/Transforms/MergeFunc/inttoptr-address-space.ll
new file mode 100644
index 0000000..0d834bc
--- /dev/null
+++ b/test/Transforms/MergeFunc/inttoptr-address-space.ll
@@ -0,0 +1,29 @@
+; RUN: opt -mergefunc -S < %s | FileCheck %s
+target datalayout = "e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-n8:16:32-S128"
+
+%.qux.2496 = type { i32, %.qux.2497 }
+%.qux.2497 = type { i8, i32 }
+%.qux.2585 = type { i32, i32, i8* }
+
+@g2 = external addrspace(1) constant [9 x i8], align 1
+@g3 = internal hidden unnamed_addr constant [1 x i8*] [i8* bitcast (i8* (%.qux.2585 addrspace(1)*)* @func35 to i8*)]
+
+
+define internal hidden i32 @func10(%.qux.2496 addrspace(1)* nocapture %this) align 2 {
+bb:
+  %tmp = getelementptr inbounds %.qux.2496 addrspace(1)* %this, i32 0, i32 1, i32 1
+  %tmp1 = load i32 addrspace(1)* %tmp, align 4
+  ret i32 %tmp1
+}
+
+; Check for pointer bitwidth equal assertion failure
+define internal hidden i8* @func35(%.qux.2585 addrspace(1)* nocapture %this) align 2 {
+bb:
+; CHECK-LABEL: @func35(
+; CHECK: %[[V2:.+]] = bitcast %.qux.2585 addrspace(1)* %{{.*}} to %.qux.2496 addrspace(1)*
+; CHECK: %[[V3:.+]] = tail call i32 @func10(%.qux.2496 addrspace(1)* %[[V2]])
+; CHECK: %{{.*}} = inttoptr i32 %[[V3]] to i8*
+  %tmp = getelementptr inbounds %.qux.2585 addrspace(1)* %this, i32 0, i32 2
+  %tmp1 = load i8* addrspace(1)* %tmp, align 4
+  ret i8* %tmp1
+}
diff --git a/test/Transforms/MergeFunc/inttoptr.ll b/test/Transforms/MergeFunc/inttoptr.ll
index 93250fa..6a69e3f 100644
--- a/test/Transforms/MergeFunc/inttoptr.ll
+++ b/test/Transforms/MergeFunc/inttoptr.ll
@@ -46,6 +46,7 @@ bb:
 
 define internal hidden i8* @func35(%.qux.2585* nocapture %this) align 2 {
 bb:
+; CHECK-LABEL: @func35(
 ; CHECK: %[[V2:.+]] = bitcast %.qux.2585* %{{.*}} to %.qux.2496*
 ; CHECK: %[[V3:.+]] = tail call i32 @func10(%.qux.2496* %[[V2]])
 ; CHECK: %{{.*}} = inttoptr i32 %[[V3]] to i8*
diff --git a/test/Transforms/MergeFunc/lit.local.cfg b/test/Transforms/MergeFunc/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/MergeFunc/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/MergeFunc/merge-ptr-and-int.ll b/test/Transforms/MergeFunc/merge-ptr-and-int.ll
new file mode 100644
index 0000000..4e887ce
--- /dev/null
+++ b/test/Transforms/MergeFunc/merge-ptr-and-int.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+
+declare void @stuff()
+
+; CHECK-LABEL: @f0(
+define void @f0(i64 %p0) {
+entry:
+  call void @stuff()
+  call void @stuff()
+  call void @stuff()
+  ret void
+}
+
+; CHECK-LABEL: @f1(
+; CHECK: ptrtoint i64*
+; CHECK: tail call void @f0(i64
+
+define void @f1(i64* %p0) {
+entry:
+  call void @stuff()
+  call void @stuff()
+  call void @stuff()
+  ret void
+}
+
diff --git a/test/Transforms/MergeFunc/ptr-int-transitivity-1.ll b/test/Transforms/MergeFunc/ptr-int-transitivity-1.ll
new file mode 100644
index 0000000..d6ff10f
--- /dev/null
+++ b/test/Transforms/MergeFunc/ptr-int-transitivity-1.ll
@@ -0,0 +1,21 @@
+; RUN: opt -S -mergefunc < %s | not grep "functions merged"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare void @stuff()
+
+define void @f0(i64 %p0) {
+entry:
+  call void @stuff()
+  call void @stuff()
+  call void @stuff()
+  ret void
+}
+
+define void @f2(i64 addrspace(1)* %p0) {
+entry:
+  call void @stuff()
+  call void @stuff()
+  call void @stuff()
+  ret void
+}
+
diff --git a/test/Transforms/MergeFunc/ptr-int-transitivity-2.ll b/test/Transforms/MergeFunc/ptr-int-transitivity-2.ll
new file mode 100644
index 0000000..c9fb6a6
--- /dev/null
+++ b/test/Transforms/MergeFunc/ptr-int-transitivity-2.ll
@@ -0,0 +1,25 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare void @stuff()
+
+define void @f0(i64 %p0) {
+entry:
+  call void @stuff()
+  call void @stuff()
+  call void @stuff()
+  ret void
+}
+
+; CHECK-LABEL: @f0
+; CHECK:  %2 = ptrtoint i64* %0 to i64
+; CHECK:  tail call void @f0(i64 %2)
+; CHECK:  ret void
+define void @f1(i64 addrspace(0)* %p0) {
+entry:
+  call void @stuff()
+  call void @stuff()
+  call void @stuff()
+  ret void
+}
+
diff --git a/test/Transforms/MergeFunc/ptr-int-transitivity-3.ll b/test/Transforms/MergeFunc/ptr-int-transitivity-3.ll
new file mode 100644
index 0000000..8f00f03
--- /dev/null
+++ b/test/Transforms/MergeFunc/ptr-int-transitivity-3.ll
@@ -0,0 +1,21 @@
+; RUN: opt -S -mergefunc < %s | not grep "functions merged"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare void @stuff()
+
+define void @f0(i64 addrspace(0)* %p0) {
+entry:
+  call void @stuff()
+  call void @stuff()
+  call void @stuff()
+  ret void
+}
+
+define void @f2(i64 addrspace(1)* %p0) {
+entry:
+  call void @stuff()
+  call void @stuff()
+  call void @stuff()
+  ret void
+}
+
diff --git a/test/Transforms/MergeFunc/too-small.ll b/test/Transforms/MergeFunc/too-small.ll
new file mode 100644
index 0000000..1a526ff
--- /dev/null
+++ b/test/Transforms/MergeFunc/too-small.ll
@@ -0,0 +1,14 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+
+define void @foo(i32 %x) {
+; CHECK-LABEL: @foo(
+; CHECK-NOT: call
+  ret void
+}
+
+define void @bar(i32 %x) {
+; CHECK-LABEL: @bar(
+; CHECK-NOT: call
+  ret void
+}
+
diff --git a/test/Transforms/MetaRenamer/lit.local.cfg b/test/Transforms/MetaRenamer/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Transforms/MetaRenamer/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
index 12af354..885935c 100644
--- a/test/Transforms/ObjCARC/basic.ll
+++ b/test/Transforms/ObjCARC/basic.ll
@@ -182,7 +182,7 @@ if.end5:                                          ; preds = %if.then3, %if.end
 ; CHECK:   tail call i8* @objc_retain(i8* %x) [[NUW:#[0-9]+]]
 ; CHECK-NOT: @objc_
 ; CHECK: if.end5:
-; CHECK:   tail call void @objc_release(i8* %x) [[NUW]], !clang.imprecise_release !0
+; CHECK:   tail call void @objc_release(i8* %x) [[NUW]], !clang.imprecise_release ![[RELEASE:[0-9]+]]
 ; CHECK-NOT: @objc_
 ; CHECK: }
 define void @test1b_imprecise(i8* %x, i1 %p, i1 %q) {
@@ -1357,55 +1357,6 @@ C:
   ret void
 }
 
-; Optimize objc_retainBlock.
-
-; CHECK-LABEL: define void @test23(
-; CHECK-NOT: @objc_
-; CHECK: }
-%block0 = type { i64, i64, i8*, i8* }
-%block1 = type { i8**, i32, i32, i32 (%struct.__block_literal_1*)*, %block0* }
-%struct.__block_descriptor = type { i64, i64 }
-%struct.__block_literal_1 = type { i8**, i32, i32, i8**, %struct.__block_descriptor* }
-@__block_holder_tmp_1 = external constant %block1
-define void @test23() {
-entry:
-  %0 = call i8* @objc_retainBlock(i8* bitcast (%block1* @__block_holder_tmp_1 to i8*)) nounwind, !clang.arc.copy_on_escape !0
-  call void @bar(i32 ()* bitcast (%block1* @__block_holder_tmp_1 to i32 ()*))
-  call void @bar(i32 ()* bitcast (%block1* @__block_holder_tmp_1 to i32 ()*))
-  call void @objc_release(i8* bitcast (%block1* @__block_holder_tmp_1 to i8*)) nounwind
-  ret void
-}
-
-; Don't optimize objc_retainBlock, but do strength reduce it.
-
-; CHECK: define void @test23b(i8* %p) {
-; CHECK: @objc_retain
-; CHECK: @objc_release
-; CHECK: }
-define void @test23b(i8* %p) {
-entry:
-  %0 = call i8* @objc_retainBlock(i8* %p) nounwind, !clang.arc.copy_on_escape !0
-  call void @callee()
-  call void @use_pointer(i8* %p)
-  call void @objc_release(i8* %p) nounwind
-  ret void
-}
-
-; Don't optimize objc_retainBlock, because there's no copy_on_escape metadata.
-
-; CHECK-LABEL: define void @test23c(
-; CHECK: @objc_retainBlock
-; CHECK: @objc_release
-; CHECK: }
-define void @test23c() {
-entry:
-  %0 = call i8* @objc_retainBlock(i8* bitcast (%block1* @__block_holder_tmp_1 to i8*)) nounwind
-  call void @bar(i32 ()* bitcast (%block1* @__block_holder_tmp_1 to i32 ()*))
-  call void @bar(i32 ()* bitcast (%block1* @__block_holder_tmp_1 to i32 ()*))
-  call void @objc_release(i8* bitcast (%block1* @__block_holder_tmp_1 to i8*)) nounwind
-  ret void
-}
-
 ; Any call can decrement a retain count.
 
 ; CHECK-LABEL: define void @test24(
@@ -2251,7 +2202,7 @@ define void @test53(void ()** %zz, i8** %pp) {
 
 ; CHECK-LABEL: define void @test54(
 ; CHECK: call i8* @returner()
-; CHECK-NEXT: call void @objc_release(i8* %t) [[NUW]], !clang.imprecise_release !0
+; CHECK-NEXT: call void @objc_release(i8* %t) [[NUW]], !clang.imprecise_release ![[RELEASE]]
 ; CHECK-NEXT: ret void
 ; CHECK: }
 define void @test54() {
@@ -2285,7 +2236,7 @@ entry:
 ; CHECK-NEXT: %0 = tail call i8* @objc_retain(i8* %x) [[NUW]]
 ; CHECK-NEXT: tail call void @use_pointer(i8* %x)
 ; CHECK-NEXT: tail call void @use_pointer(i8* %x)
-; CHECK-NEXT: tail call void @objc_release(i8* %x) [[NUW]], !clang.imprecise_release !0
+; CHECK-NEXT: tail call void @objc_release(i8* %x) [[NUW]], !clang.imprecise_release ![[RELEASE]]
 ; CHECK-NEXT: br label %if.end
 ; CHECK-NOT: @objc
 ; CHECK: }
@@ -3058,7 +3009,11 @@ define void @test67(i8* %x) {
   ret void
 }
 
+!llvm.module.flags = !{!1}
+
 !0 = metadata !{}
+!1 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
 
 ; CHECK: attributes #0 = { nounwind readnone }
 ; CHECK: attributes [[NUW]] = { nounwind }
+; CHECK: ![[RELEASE]] = metadata !{}
diff --git a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
index 96a7d3e..0728617 100644
--- a/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
+++ b/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll
@@ -111,14 +111,14 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 ; CHECK: attributes [[NUW]] = { nounwind }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!33, !34, !35, !36}
+!llvm.module.flags = !{!33, !34, !35, !36, !61}
 
 !0 = metadata !{i32 786449, metadata !60, i32 16, metadata !"clang version 3.3 ", i1 true, metadata !"", i32 2, metadata !1, metadata !1, metadata !3, metadata !1, null, metadata !""} ; [ DW_TAG_compile_unit ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m] [DW_LANG_ObjC]
 !1 = metadata !{i32 0}
 !3 = metadata !{metadata !5, metadata !27}
 !5 = metadata !{i32 786478, metadata !60, metadata !6, metadata !"main", metadata !"main", metadata !"", i32 9, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !10, i32 10} ; [ DW_TAG_subprogram ] [line 9] [def] [scope 10] [main]
 !6 = metadata !{i32 786473, metadata !60} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !8 = metadata !{metadata !9}
 !9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !10 = metadata !{metadata !11}
@@ -127,11 +127,11 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !13 = metadata !{i32 786443, metadata !60, metadata !5, i32 10, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
 !14 = metadata !{i32 786454, metadata !60, null, metadata !"id", i32 11, i64 0, i64 0, i64 0, i32 0, metadata !15} ; [ DW_TAG_typedef ] [id] [line 11, size 0, align 0, offset 0] [from ]
 !15 = metadata !{i32 786447, metadata !60, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from objc_object]
-!16 = metadata !{i32 786451, metadata !60, null, metadata !"objc_object", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, i32 0, i32 0} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [from ]
+!16 = metadata !{i32 786451, metadata !60, null, metadata !"objc_object", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [objc_object] [line 0, size 0, align 0, offset 0] [def] [from ]
 !17 = metadata !{metadata !18}
 !18 = metadata !{i32 786445, metadata !60, metadata !16, metadata !"isa", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !19} ; [ DW_TAG_member ] [isa] [line 0, size 64, align 0, offset 0] [from ]
 !19 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 0, offset 0] [from objc_class]
-!20 = metadata !{i32 786451, metadata !60, null, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [fwd] [from ]
+!20 = metadata !{i32 786451, metadata !60, null, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
 !21 = metadata !{i32 786688, metadata !22, metadata !"ok", metadata !6, i32 13, metadata !23, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ok] [line 13]
 !22 = metadata !{i32 786443, metadata !60, metadata !13, i32 12, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
 !23 = metadata !{i32 786454, metadata !60, null, metadata !"BOOL", i32 62, i64 0, i64 0, i64 0, i32 0, metadata !24} ; [ DW_TAG_typedef ] [BOOL] [line 62, size 0, align 0, offset 0] [from signed char]
@@ -139,7 +139,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !25 = metadata !{i32 786688, metadata !26, metadata !"obj2", metadata !6, i32 15, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [obj2] [line 15]
 !26 = metadata !{i32 786443, metadata !60, metadata !22, i32 14, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
 !27 = metadata !{i32 786478, metadata !60, metadata !6, metadata !"ThrowFunc", metadata !"ThrowFunc", metadata !"", i32 4, metadata !28, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i8*)* @ThrowFunc, null, null, metadata !30, i32 5} ; [ DW_TAG_subprogram ] [line 4] [local] [def] [scope 5] [ThrowFunc]
-!28 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !29, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !29 = metadata !{null, metadata !14}
 !30 = metadata !{metadata !31}
 !31 = metadata !{metadata !32}
@@ -171,3 +171,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !57 = metadata !{i32 786443, metadata !60, metadata !27, i32 5, i32 0, i32 7} ; [ DW_TAG_lexical_block ] [/Volumes/Files/gottesmmcab/Radar/12906997/test.m]
 !58 = metadata !{i32 7, i32 0, metadata !57, null}
 !60 = metadata !{metadata !"test.m", metadata !"/Volumes/Files/gottesmmcab/Radar/12906997"}
+!61 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/ObjCARC/lit.local.cfg b/test/Transforms/ObjCARC/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/ObjCARC/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/ObjCARC/no-objc-arc-exceptions.ll b/test/Transforms/ObjCARC/no-objc-arc-exceptions.ll
deleted file mode 100644
index 2a56371..0000000
--- a/test/Transforms/ObjCARC/no-objc-arc-exceptions.ll
+++ /dev/null
@@ -1,123 +0,0 @@
-; RUN: opt -S -objc-arc < %s | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-%struct.__block_byref_x = type { i8*, %struct.__block_byref_x*, i32, i32, i32 }
-%struct.__block_descriptor = type { i64, i64 }
-@_NSConcreteStackBlock = external global i8*
-@__block_descriptor_tmp = external hidden constant { i64, i64, i8*, i8*, i8*, i8* }
-
-; The optimizer should make use of the !clang.arc.no_objc_arc_exceptions
-; metadata and eliminate the retainBlock+release pair here.
-; rdar://10803830.
-
-; CHECK-LABEL: define void @test0(
-; CHECK-NOT: @objc
-; CHECK: }
-define void @test0() {
-entry:
-  %x = alloca %struct.__block_byref_x, align 8
-  %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>, align 8
-  %byref.isa = getelementptr inbounds %struct.__block_byref_x* %x, i64 0, i32 0
-  store i8* null, i8** %byref.isa, align 8
-  %byref.forwarding = getelementptr inbounds %struct.__block_byref_x* %x, i64 0, i32 1
-  store %struct.__block_byref_x* %x, %struct.__block_byref_x** %byref.forwarding, align 8
-  %byref.flags = getelementptr inbounds %struct.__block_byref_x* %x, i64 0, i32 2
-  store i32 0, i32* %byref.flags, align 8
-  %byref.size = getelementptr inbounds %struct.__block_byref_x* %x, i64 0, i32 3
-  store i32 32, i32* %byref.size, align 4
-  %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 0
-  store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8
-  %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 1
-  store i32 1107296256, i32* %block.flags, align 8
-  %block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 2
-  store i32 0, i32* %block.reserved, align 4
-  %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 3
-  store i8* bitcast (void (i8*)* @__foo_block_invoke_0 to i8*), i8** %block.invoke, align 8
-  %block.descriptor = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 4
-  store %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_descriptor_tmp to %struct.__block_descriptor*), %struct.__block_descriptor** %block.descriptor, align 8
-  %block.captured = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 5
-  %t1 = bitcast %struct.__block_byref_x* %x to i8*
-  store i8* %t1, i8** %block.captured, align 8
-  %t2 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block to i8*
-  %t3 = call i8* @objc_retainBlock(i8* %t2) nounwind, !clang.arc.copy_on_escape !4
-  %t4 = getelementptr inbounds i8* %t3, i64 16
-  %t5 = bitcast i8* %t4 to i8**
-  %t6 = load i8** %t5, align 8
-  %t7 = bitcast i8* %t6 to void (i8*)*
-  invoke void %t7(i8* %t3)
-          to label %invoke.cont unwind label %lpad, !clang.arc.no_objc_arc_exceptions !4
-
-invoke.cont:                                      ; preds = %entry
-  call void @objc_release(i8* %t3) nounwind, !clang.imprecise_release !4
-  call void @_Block_object_dispose(i8* %t1, i32 8)
-  ret void
-
-lpad:                                             ; preds = %entry
-  %t8 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*)
-          cleanup
-  call void @_Block_object_dispose(i8* %t1, i32 8)
-  resume { i8*, i32 } %t8
-}
-
-; There is no !clang.arc.no_objc_arc_exceptions metadata here, so the optimizer
-; shouldn't eliminate anything, but *CAN* strength reduce the objc_retainBlock
-; to an objc_retain.
-
-; CHECK-LABEL: define void @test0_no_metadata(
-; CHECK: call i8* @objc_retain(
-; CHECK: invoke
-; CHECK: call void @objc_release(
-; CHECK: }
-define void @test0_no_metadata() {
-entry:
-  %x = alloca %struct.__block_byref_x, align 8
-  %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>, align 8
-  %byref.isa = getelementptr inbounds %struct.__block_byref_x* %x, i64 0, i32 0
-  store i8* null, i8** %byref.isa, align 8
-  %byref.forwarding = getelementptr inbounds %struct.__block_byref_x* %x, i64 0, i32 1
-  store %struct.__block_byref_x* %x, %struct.__block_byref_x** %byref.forwarding, align 8
-  %byref.flags = getelementptr inbounds %struct.__block_byref_x* %x, i64 0, i32 2
-  store i32 0, i32* %byref.flags, align 8
-  %byref.size = getelementptr inbounds %struct.__block_byref_x* %x, i64 0, i32 3
-  store i32 32, i32* %byref.size, align 4
-  %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 0
-  store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8
-  %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 1
-  store i32 1107296256, i32* %block.flags, align 8
-  %block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 2
-  store i32 0, i32* %block.reserved, align 4
-  %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 3
-  store i8* bitcast (void (i8*)* @__foo_block_invoke_0 to i8*), i8** %block.invoke, align 8
-  %block.descriptor = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 4
-  store %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_descriptor_tmp to %struct.__block_descriptor*), %struct.__block_descriptor** %block.descriptor, align 8
-  %block.captured = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block, i64 0, i32 5
-  %t1 = bitcast %struct.__block_byref_x* %x to i8*
-  store i8* %t1, i8** %block.captured, align 8
-  %t2 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i8* }>* %block to i8*
-  %t3 = call i8* @objc_retainBlock(i8* %t2) nounwind, !clang.arc.copy_on_escape !4
-  %t4 = getelementptr inbounds i8* %t3, i64 16
-  %t5 = bitcast i8* %t4 to i8**
-  %t6 = load i8** %t5, align 8
-  %t7 = bitcast i8* %t6 to void (i8*)*
-  invoke void %t7(i8* %t3)
-          to label %invoke.cont unwind label %lpad
-
-invoke.cont:                                      ; preds = %entry
-  call void @objc_release(i8* %t3) nounwind, !clang.imprecise_release !4
-  call void @_Block_object_dispose(i8* %t1, i32 8)
-  ret void
-
-lpad:                                             ; preds = %entry
-  %t8 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*)
-          cleanup
-  call void @_Block_object_dispose(i8* %t1, i32 8)
-  resume { i8*, i32 } %t8
-}
-
-declare i8* @objc_retainBlock(i8*)
-declare void @objc_release(i8*)
-declare void @_Block_object_dispose(i8*, i32)
-declare i32 @__objc_personality_v0(...)
-declare void @__foo_block_invoke_0(i8* nocapture) uwtable ssp
-
-!4 = metadata !{}
diff --git a/test/Transforms/ObjCARC/path-overflow.ll b/test/Transforms/ObjCARC/path-overflow.ll
index 605e860..3c14353 100644
--- a/test/Transforms/ObjCARC/path-overflow.ll
+++ b/test/Transforms/ObjCARC/path-overflow.ll
@@ -1,6 +1,8 @@
 ; RUN: opt -objc-arc -S < %s
 ; rdar://12277446
 ; rdar://12480535
+; rdar://14590914
+; rdar://15377890
 
 ; The total number of paths grows exponentially with the number of branches, and a
 ; computation of this number can overflow any reasonable fixed-sized
@@ -10,14 +12,22 @@
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios5.0.0"
 
-%struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768 = type { i32*, i32, i8*, i32 }
+%struct.NSConstantString = type { i32*, i32, i8*, i32 }
+%struct.CGPoint = type { float, float }
 
-@_unnamed_cfstring_591 = external constant %struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768, section "__DATA,__cfstring"
+@_unnamed_cfstring = external constant %struct.NSConstantString, section "__DATA,__cfstring"
+@_unnamed_cfstring_2 = external constant %struct.NSConstantString, section "__DATA,__cfstring"
 
 declare i8* @objc_retain(i8*) nonlazybind
 declare i8* @objc_retainAutoreleasedReturnValue(i8*) nonlazybind
 declare void @objc_release(i8*) nonlazybind
 declare i8* @returner()
+declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
+declare void @NSLog(i8*, ...)
+declare void @objc_msgSend_stret(i8*, i8*, ...)
+declare i32 @__gxx_personality_sj0(...)
+declare i32 @__objc_personality_v0(...)
+
 
 define hidden void @test1() {
 entry:
@@ -30,7 +40,7 @@ msgSend.nullinit:                                 ; preds = %entry
   br label %msgSend.cont
 
 msgSend.cont:                                     ; preds = %msgSend.nullinit, %msgSend.call
-  %0 = bitcast %struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768* @_unnamed_cfstring_591 to i8*
+  %0 = bitcast %struct.NSConstantString* @_unnamed_cfstring to i8*
   %1 = call i8* @objc_retain(i8* %0) nounwind
   br i1 undef, label %msgSend.nullinit33, label %msgSend.call32
 
@@ -853,5 +863,1331 @@ bb222:                                            ; preds = %bb20, %bb19
   ret void
 }
 
+; Function Attrs: ssp
+define void @test3() #1 {
+entry:
+  %call2 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %entry
+  %call5 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont4 unwind label %lpad3
+
+invoke.cont4:                                     ; preds = %invoke.cont
+  br i1 undef, label %land.end, label %land.rhs
+
+land.rhs:                                         ; preds = %invoke.cont4
+  %call7 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %land.end unwind label %lpad3
+
+land.end:                                         ; preds = %land.rhs, %invoke.cont4
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i unwind label %lpad.i
+
+invoke.cont.i:                                    ; preds = %land.end
+  br i1 undef, label %invoke.cont8, label %if.then.i
+
+if.then.i:                                        ; preds = %invoke.cont.i
+  br label %invoke.cont8
+
+lpad.i:                                           ; preds = %land.end
+  %tmp13 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont8:                                     ; preds = %if.then.i, %invoke.cont.i
+  %call18 = invoke i8* (i8*, i8*, i8*, ...)* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*, ...)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef)
+          to label %invoke.cont17 unwind label %lpad16
+
+invoke.cont17:                                    ; preds = %invoke.cont8
+  %call22 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont21 unwind label %lpad20
+
+invoke.cont21:                                    ; preds = %invoke.cont17
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i1980 unwind label %lpad.i1982
+
+invoke.cont.i1980:                                ; preds = %invoke.cont21
+  br i1 undef, label %invoke.cont24, label %if.then.i1981
+
+if.then.i1981:                                    ; preds = %invoke.cont.i1980
+  br label %invoke.cont24
+
+lpad.i1982:                                       ; preds = %invoke.cont21
+  %tmp28 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont24:                                    ; preds = %if.then.i1981, %invoke.cont.i1980
+  %call37 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont36 unwind label %lpad35
+
+invoke.cont36:                                    ; preds = %invoke.cont24
+  br i1 undef, label %land.end43, label %land.rhs39
+
+land.rhs39:                                       ; preds = %invoke.cont36
+  %call41 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %land.end43 unwind label %lpad35
+
+land.end43:                                       ; preds = %land.rhs39, %invoke.cont36
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i1986 unwind label %lpad.i1988
+
+invoke.cont.i1986:                                ; preds = %land.end43
+  br i1 undef, label %invoke.cont44, label %if.then.i1987
+
+if.then.i1987:                                    ; preds = %invoke.cont.i1986
+  br label %invoke.cont44
+
+lpad.i1988:                                       ; preds = %land.end43
+  %tmp42 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont44:                                    ; preds = %if.then.i1987, %invoke.cont.i1986
+  %call53 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont52 unwind label %lpad51
+
+invoke.cont52:                                    ; preds = %invoke.cont44
+  br i1 undef, label %land.end70, label %land.rhs58
+
+land.rhs58:                                       ; preds = %invoke.cont52
+  %call63 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 42)
+          to label %invoke.cont62 unwind label %lpad61
+
+invoke.cont62:                                    ; preds = %land.rhs58
+  %call68 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
+          to label %land.end70 unwind label %lpad66.body.thread
+
+land.end70:                                       ; preds = %invoke.cont62, %invoke.cont52
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i1992 unwind label %lpad66.body
+
+invoke.cont.i1992:                                ; preds = %land.end70
+  br i1 undef, label %invoke.cont71, label %if.then.i1993
+
+if.then.i1993:                                    ; preds = %invoke.cont.i1992
+  br label %invoke.cont71
+
+invoke.cont71:                                    ; preds = %if.then.i1993, %invoke.cont.i1992
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i1998 unwind label %lpad.i2000
+
+invoke.cont.i1998:                                ; preds = %invoke.cont71
+  br i1 undef, label %invoke.cont91, label %if.then.i1999
+
+if.then.i1999:                                    ; preds = %invoke.cont.i1998
+  br label %invoke.cont91
+
+lpad.i2000:                                       ; preds = %invoke.cont71
+  %tmp74 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup102
+
+invoke.cont91:                                    ; preds = %if.then.i1999, %invoke.cont.i1998
+  %call96 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont95 unwind label %lpad94
+
+invoke.cont95:                                    ; preds = %invoke.cont91
+  %call98 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* %call96)
+          to label %invoke.cont97 unwind label %lpad94
+
+invoke.cont97:                                    ; preds = %invoke.cont95
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2004 unwind label %lpad.i2006
+
+invoke.cont.i2004:                                ; preds = %invoke.cont97
+  br i1 undef, label %invoke.cont100, label %if.then.i2005
+
+if.then.i2005:                                    ; preds = %invoke.cont.i2004
+  br label %invoke.cont100
+
+lpad.i2006:                                       ; preds = %invoke.cont97
+  %tmp82 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont100:                                   ; preds = %if.then.i2005, %invoke.cont.i2004
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont110 unwind label %lpad109
+
+invoke.cont110:                                   ; preds = %invoke.cont100
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2010 unwind label %lpad.i2012
+
+invoke.cont.i2010:                                ; preds = %invoke.cont110
+  br i1 undef, label %invoke.cont117, label %if.then.i2011
+
+if.then.i2011:                                    ; preds = %invoke.cont.i2010
+  br label %invoke.cont117
+
+lpad.i2012:                                       ; preds = %invoke.cont110
+  %tmp98 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont117:                                   ; preds = %if.then.i2011, %invoke.cont.i2010
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2022 unwind label %lpad156.body
+
+lpad:                                             ; preds = %entry
+  %tmp118 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup
+
+lpad3:                                            ; preds = %land.rhs, %invoke.cont
+  %tmp119 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup
+
+ehcleanup:                                        ; preds = %lpad3, %lpad
+  unreachable
+
+lpad16:                                           ; preds = %invoke.cont8
+  %tmp121 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup26
+
+lpad20:                                           ; preds = %invoke.cont17
+  %tmp122 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup26
+
+ehcleanup26:                                      ; preds = %lpad20, %lpad16
+  unreachable
+
+lpad35:                                           ; preds = %land.rhs39, %invoke.cont24
+  %tmp124 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad51:                                           ; preds = %invoke.cont44
+  %tmp125 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad61:                                           ; preds = %land.rhs58
+  %tmp127 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad66.body.thread:                               ; preds = %invoke.cont62
+  %tmp128 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad66.body:                                      ; preds = %land.end70
+  %tmp129 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad94:                                           ; preds = %invoke.cont95, %invoke.cont91
+  %tmp133 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup102
+
+ehcleanup102:                                     ; preds = %lpad94, %lpad.i2000
+  unreachable
+
+lpad109:                                          ; preds = %invoke.cont100
+  %tmp134 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont.i2022:                                ; preds = %invoke.cont117
+  br i1 undef, label %invoke.cont157, label %if.then.i2023
+
+if.then.i2023:                                    ; preds = %invoke.cont.i2022
+  br label %invoke.cont157
+
+invoke.cont157:                                   ; preds = %if.then.i2023, %invoke.cont.i2022
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2028 unwind label %lpad164.body
+
+invoke.cont.i2028:                                ; preds = %invoke.cont157
+  br i1 undef, label %invoke.cont165, label %if.then.i2029
+
+if.then.i2029:                                    ; preds = %invoke.cont.i2028
+  br label %invoke.cont165
+
+invoke.cont165:                                   ; preds = %if.then.i2029, %invoke.cont.i2028
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, void (i8*, i8*)*)*)(i8* undef, i8* undef, void (i8*, i8*)* undef)
+          to label %invoke.cont184 unwind label %lpad183
+
+invoke.cont184:                                   ; preds = %invoke.cont165
+  %call186 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont185 unwind label %lpad183
+
+invoke.cont185:                                   ; preds = %invoke.cont184
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2034 unwind label %lpad.i2036
+
+invoke.cont.i2034:                                ; preds = %invoke.cont185
+  br i1 undef, label %invoke.cont190, label %if.then.i2035
+
+if.then.i2035:                                    ; preds = %invoke.cont.i2034
+  br label %invoke.cont190
+
+lpad.i2036:                                       ; preds = %invoke.cont185
+  %tmp168 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %lpad183.body
+
+invoke.cont190:                                   ; preds = %if.then.i2035, %invoke.cont.i2034
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont197 unwind label %lpad196
+
+invoke.cont197:                                   ; preds = %invoke.cont190
+  %call202 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont201 unwind label %lpad200
+
+invoke.cont201:                                   ; preds = %invoke.cont197
+  %call205 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont204 unwind label %lpad203
+
+invoke.cont204:                                   ; preds = %invoke.cont201
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2040 unwind label %lpad.i2042
+
+invoke.cont.i2040:                                ; preds = %invoke.cont204
+  br i1 undef, label %invoke.cont207, label %if.then.i2041
+
+if.then.i2041:                                    ; preds = %invoke.cont.i2040
+  br label %invoke.cont207
+
+lpad.i2042:                                       ; preds = %invoke.cont204
+  %tmp181 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont207:                                   ; preds = %if.then.i2041, %invoke.cont.i2040
+  %call209 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont208 unwind label %lpad203
+
+invoke.cont208:                                   ; preds = %invoke.cont207
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2046 unwind label %lpad212.body
+
+invoke.cont.i2046:                                ; preds = %invoke.cont208
+  br i1 undef, label %invoke.cont213, label %if.then.i2047
+
+if.then.i2047:                                    ; preds = %invoke.cont.i2046
+  br label %invoke.cont213
+
+invoke.cont213:                                   ; preds = %if.then.i2047, %invoke.cont.i2046
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont221 unwind label %lpad220
+
+invoke.cont221:                                   ; preds = %invoke.cont213
+  %call229 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont228 unwind label %lpad227
+
+invoke.cont228:                                   ; preds = %invoke.cont221
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2052 unwind label %lpad.i2054
+
+invoke.cont.i2052:                                ; preds = %invoke.cont228
+  br i1 undef, label %invoke.cont231, label %if.then.i2053
+
+if.then.i2053:                                    ; preds = %invoke.cont.i2052
+  br label %invoke.cont231
+
+lpad.i2054:                                       ; preds = %invoke.cont228
+  %tmp198 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont231:                                   ; preds = %if.then.i2053, %invoke.cont.i2052
+  %call233 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont232 unwind label %lpad227
+
+invoke.cont232:                                   ; preds = %invoke.cont231
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2058 unwind label %lpad236.body
+
+invoke.cont.i2058:                                ; preds = %invoke.cont232
+  br i1 undef, label %invoke.cont237, label %if.then.i2059
+
+if.then.i2059:                                    ; preds = %invoke.cont.i2058
+  br label %invoke.cont237
+
+invoke.cont237:                                   ; preds = %if.then.i2059, %invoke.cont.i2058
+  %call246 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont245 unwind label %lpad244
+
+invoke.cont245:                                   ; preds = %invoke.cont237
+  %call248 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 13)
+          to label %invoke.cont247 unwind label %lpad244
+
+invoke.cont247:                                   ; preds = %invoke.cont245
+  %call251 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 2)
+          to label %invoke.cont250 unwind label %lpad249
+
+invoke.cont250:                                   ; preds = %invoke.cont247
+  %call254 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 7)
+          to label %invoke.cont253 unwind label %lpad252
+
+invoke.cont253:                                   ; preds = %invoke.cont250
+  %call257 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8**, i32)*)(i8* undef, i8* undef, i8** undef, i32 3)
+          to label %invoke.cont256 unwind label %lpad255
+
+invoke.cont256:                                   ; preds = %invoke.cont253
+  %call260 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* undef)
+          to label %invoke.cont259 unwind label %lpad258
+
+invoke.cont259:                                   ; preds = %invoke.cont256
+  %call267 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont266 unwind label %lpad265
+
+invoke.cont266:                                   ; preds = %invoke.cont259
+  %call275 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
+          to label %invoke.cont274 unwind label %lpad273
+
+invoke.cont274:                                   ; preds = %invoke.cont266
+  %call279 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont278 unwind label %lpad277
+
+invoke.cont278:                                   ; preds = %invoke.cont274
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2064 unwind label %lpad.i2066
+
+invoke.cont.i2064:                                ; preds = %invoke.cont278
+  br i1 undef, label %invoke.cont281, label %if.then.i2065
+
+if.then.i2065:                                    ; preds = %invoke.cont.i2064
+  br label %invoke.cont281
+
+lpad.i2066:                                       ; preds = %invoke.cont278
+  %tmp253 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont281:                                   ; preds = %if.then.i2065, %invoke.cont.i2064
+  %call291 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont290 unwind label %lpad289
+
+invoke.cont290:                                   ; preds = %invoke.cont281
+  %call303 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 8)
+          to label %invoke.cont302 unwind label %lpad301
+
+invoke.cont302:                                   ; preds = %invoke.cont290
+  %call310 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, double)*)(i8* undef, i8* undef, double 5.000000e-01)
+          to label %invoke.cont309 unwind label %lpad308
+
+invoke.cont309:                                   ; preds = %invoke.cont302
+  %call313 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 42)
+          to label %invoke.cont312 unwind label %lpad311
+
+invoke.cont312:                                   ; preds = %invoke.cont309
+  %call316 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8**, i8**, i32)*)(i8* undef, i8* undef, i8** undef, i8** undef, i32 2)
+          to label %invoke.cont315 unwind label %lpad314
+
+invoke.cont315:                                   ; preds = %invoke.cont312
+  %call322 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
+          to label %invoke.cont321 unwind label %lpad320
+
+invoke.cont321:                                   ; preds = %invoke.cont315
+  br i1 undef, label %land.end344, label %land.rhs335
+
+land.rhs335:                                      ; preds = %invoke.cont321
+  %call342 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %land.end344 unwind label %lpad340.body.thread
+
+land.end344:                                      ; preds = %land.rhs335, %invoke.cont321
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2070 unwind label %lpad340.body
+
+invoke.cont.i2070:                                ; preds = %land.end344
+  br i1 undef, label %invoke.cont345, label %if.then.i2071
+
+if.then.i2071:                                    ; preds = %invoke.cont.i2070
+  br label %invoke.cont345
+
+invoke.cont345:                                   ; preds = %if.then.i2071, %invoke.cont.i2070
+  %call362 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef)
+          to label %invoke.cont361 unwind label %lpad360
+
+invoke.cont361:                                   ; preds = %invoke.cont345
+  %call365 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont364 unwind label %lpad363
+
+invoke.cont364:                                   ; preds = %invoke.cont361
+  %call371 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont370 unwind label %lpad369
+
+invoke.cont370:                                   ; preds = %invoke.cont364
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2076 unwind label %lpad.i2078
+
+invoke.cont.i2076:                                ; preds = %invoke.cont370
+  br i1 undef, label %invoke.cont373, label %if.then.i2077
+
+if.then.i2077:                                    ; preds = %invoke.cont.i2076
+  br label %invoke.cont373
+
+lpad.i2078:                                       ; preds = %invoke.cont370
+  %tmp340 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont373:                                   ; preds = %if.then.i2077, %invoke.cont.i2076
+  %call377 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32, i8*)*)(i8* undef, i8* undef, i32 42, i8* undef)
+          to label %invoke.cont376 unwind label %lpad363
+
+invoke.cont376:                                   ; preds = %invoke.cont373
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 5)
+          to label %invoke.cont382 unwind label %lpad381
+
+invoke.cont382:                                   ; preds = %invoke.cont376
+  %call384 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont383 unwind label %lpad381
+
+invoke.cont383:                                   ; preds = %invoke.cont382
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2082 unwind label %lpad.i2084
+
+invoke.cont.i2082:                                ; preds = %invoke.cont383
+  br i1 undef, label %invoke.cont392, label %if.then.i2083
+
+if.then.i2083:                                    ; preds = %invoke.cont.i2082
+  br label %invoke.cont392
+
+lpad.i2084:                                       ; preds = %invoke.cont383
+  %tmp360 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont392:                                   ; preds = %if.then.i2083, %invoke.cont.i2082
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 -2)
+          to label %invoke.cont395 unwind label %lpad381
+
+invoke.cont395:                                   ; preds = %invoke.cont392
+  %call397 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont396 unwind label %lpad381
+
+invoke.cont396:                                   ; preds = %invoke.cont395
+  %call400 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont399 unwind label %lpad398
+
+invoke.cont399:                                   ; preds = %invoke.cont396
+  %call403 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont402 unwind label %lpad401
+
+invoke.cont402:                                   ; preds = %invoke.cont399
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2088 unwind label %lpad.i2090
+
+invoke.cont.i2088:                                ; preds = %invoke.cont402
+  br i1 undef, label %invoke.cont405, label %if.then.i2089
+
+if.then.i2089:                                    ; preds = %invoke.cont.i2088
+  br label %invoke.cont405
+
+lpad.i2090:                                       ; preds = %invoke.cont402
+  %tmp370 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont405:                                   ; preds = %if.then.i2089, %invoke.cont.i2088
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 -1)
+          to label %invoke.cont408 unwind label %lpad381
+
+invoke.cont408:                                   ; preds = %invoke.cont405
+  %call410 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont409 unwind label %lpad381
+
+invoke.cont409:                                   ; preds = %invoke.cont408
+  %call413 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont412 unwind label %lpad411
+
+invoke.cont412:                                   ; preds = %invoke.cont409
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2094 unwind label %lpad.i2096
+
+invoke.cont.i2094:                                ; preds = %invoke.cont412
+  br i1 undef, label %invoke.cont418, label %if.then.i2095
+
+if.then.i2095:                                    ; preds = %invoke.cont.i2094
+  br label %invoke.cont418
+
+lpad.i2096:                                       ; preds = %invoke.cont412
+  %tmp380 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont418:                                   ; preds = %if.then.i2095, %invoke.cont.i2094
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i32)*)(i8* undef, i8* undef, i8* undef, i32 0)
+          to label %invoke.cont422 unwind label %lpad381
+
+invoke.cont422:                                   ; preds = %invoke.cont418
+  %call424 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont423 unwind label %lpad381
+
+invoke.cont423:                                   ; preds = %invoke.cont422
+  %call427 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont426 unwind label %lpad425
+
+invoke.cont426:                                   ; preds = %invoke.cont423
+  %call430 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont429 unwind label %lpad428
+
+invoke.cont429:                                   ; preds = %invoke.cont426
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2100 unwind label %lpad.i2102
+
+invoke.cont.i2100:                                ; preds = %invoke.cont429
+  br i1 undef, label %invoke.cont432, label %if.then.i2101
+
+if.then.i2101:                                    ; preds = %invoke.cont.i2100
+  br label %invoke.cont432
+
+lpad.i2102:                                       ; preds = %invoke.cont429
+  %tmp390 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont432:                                   ; preds = %if.then.i2101, %invoke.cont.i2100
+  %call436 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 0)
+          to label %invoke.cont435 unwind label %lpad381
+
+invoke.cont435:                                   ; preds = %invoke.cont432
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2106 unwind label %lpad.i2108
+
+invoke.cont.i2106:                                ; preds = %invoke.cont435
+  %call444 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 5)
+          to label %invoke.cont443 unwind label %lpad381
+
+lpad.i2108:                                       ; preds = %invoke.cont435
+  %tmp396 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont443:                                   ; preds = %invoke.cont.i2106
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2112 unwind label %lpad.i2114
+
+invoke.cont.i2112:                                ; preds = %invoke.cont443
+  br i1 undef, label %invoke.cont449, label %if.then.i2113
+
+if.then.i2113:                                    ; preds = %invoke.cont.i2112
+  br label %invoke.cont449
+
+lpad.i2114:                                       ; preds = %invoke.cont443
+  %tmp402 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont449:                                   ; preds = %if.then.i2113, %invoke.cont.i2112
+  %call453 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 -2)
+          to label %invoke.cont452 unwind label %lpad381
+
+invoke.cont452:                                   ; preds = %invoke.cont449
+  %call456 = invoke i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont455 unwind label %lpad454
+
+invoke.cont455:                                   ; preds = %invoke.cont452
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2118 unwind label %lpad.i2120
+
+invoke.cont.i2118:                                ; preds = %invoke.cont455
+  br i1 undef, label %invoke.cont458, label %if.then.i2119
+
+if.then.i2119:                                    ; preds = %invoke.cont.i2118
+  br label %invoke.cont458
+
+lpad.i2120:                                       ; preds = %invoke.cont455
+  %tmp408 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont458:                                   ; preds = %if.then.i2119, %invoke.cont.i2118
+  %call461 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 -1)
+          to label %invoke.cont460 unwind label %lpad381
+
+invoke.cont460:                                   ; preds = %invoke.cont458
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2124 unwind label %lpad.i2126
+
+invoke.cont.i2124:                                ; preds = %invoke.cont460
+  br i1 undef, label %invoke.cont466, label %if.then.i2125
+
+if.then.i2125:                                    ; preds = %invoke.cont.i2124
+  br label %invoke.cont466
+
+lpad.i2126:                                       ; preds = %invoke.cont460
+  %tmp414 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup477
+
+invoke.cont466:                                   ; preds = %if.then.i2125, %invoke.cont.i2124
+  %call470 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 0)
+          to label %invoke.cont469 unwind label %lpad381
+
+invoke.cont469:                                   ; preds = %invoke.cont466
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2130 unwind label %lpad.i2132
+
+invoke.cont.i2130:                                ; preds = %invoke.cont469
+  br i1 undef, label %invoke.cont475, label %if.then.i2131
+
+if.then.i2131:                                    ; preds = %invoke.cont.i2130
+  br label %invoke.cont475
+
+lpad.i2132:                                       ; preds = %invoke.cont469
+  %tmp420 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup477
+
+invoke.cont475:                                   ; preds = %if.then.i2131, %invoke.cont.i2130
+  %call491 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 1)
+          to label %invoke.cont490 unwind label %lpad489
+
+invoke.cont490:                                   ; preds = %invoke.cont475
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont499 unwind label %lpad498
+
+invoke.cont499:                                   ; preds = %invoke.cont490
+  %call504 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont503 unwind label %lpad489
+
+invoke.cont503:                                   ; preds = %invoke.cont499
+  %call507 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* undef, i8* undef, i32 3)
+          to label %invoke.cont506 unwind label %lpad505
+
+invoke.cont506:                                   ; preds = %invoke.cont503
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont509 unwind label %lpad508
+
+invoke.cont509:                                   ; preds = %invoke.cont506
+  %call513 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont512 unwind label %lpad489
+
+invoke.cont512:                                   ; preds = %invoke.cont509
+  br i1 undef, label %msgSend.null-receiver, label %msgSend.call
+
+msgSend.call:                                     ; preds = %invoke.cont512
+  invoke void bitcast (void (i8*, i8*, ...)* @objc_msgSend_stret to void (%struct.CGPoint*, i8*, i8*)*)(%struct.CGPoint* sret undef, i8* undef, i8* undef)
+          to label %msgSend.cont unwind label %lpad514
+
+msgSend.null-receiver:                            ; preds = %invoke.cont512
+  br label %msgSend.cont
+
+msgSend.cont:                                     ; preds = %msgSend.null-receiver, %msgSend.call
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2136 unwind label %lpad.i2138
+
+invoke.cont.i2136:                                ; preds = %msgSend.cont
+  br i1 undef, label %invoke.cont521, label %if.then.i2137
+
+if.then.i2137:                                    ; preds = %invoke.cont.i2136
+  br label %invoke.cont521
+
+lpad.i2138:                                       ; preds = %msgSend.cont
+  %tmp468 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont521:                                   ; preds = %if.then.i2137, %invoke.cont.i2136
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef)
+          to label %invoke.cont528 unwind label %lpad527
+
+invoke.cont528:                                   ; preds = %invoke.cont521
+  %call532 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont531 unwind label %lpad489
+
+invoke.cont531:                                   ; preds = %invoke.cont528
+  %call535 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont534 unwind label %lpad533
+
+invoke.cont534:                                   ; preds = %invoke.cont531
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2142 unwind label %lpad.i2144
+
+invoke.cont.i2142:                                ; preds = %invoke.cont534
+  br i1 undef, label %invoke.cont540, label %if.then.i2143
+
+if.then.i2143:                                    ; preds = %invoke.cont.i2142
+  br label %invoke.cont540
+
+lpad.i2144:                                       ; preds = %invoke.cont534
+  %tmp486 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+invoke.cont540:                                   ; preds = %if.then.i2143, %invoke.cont.i2142
+  %call544 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i32)*)(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef, i32 3)
+          to label %invoke.cont543 unwind label %lpad489
+
+invoke.cont543:                                   ; preds = %invoke.cont540
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* undef)
+          to label %invoke.cont546 unwind label %lpad545
+
+invoke.cont546:                                   ; preds = %invoke.cont543
+  %call549 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont548 unwind label %lpad489
+
+invoke.cont548:                                   ; preds = %invoke.cont546
+  %call555 = invoke signext i8 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8 (i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont554 unwind label %lpad553
+
+invoke.cont554:                                   ; preds = %invoke.cont548
+  %tmp499 = call i8* @objc_retain(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*)) #3
+  invoke void (i8*, ...)* @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i8* %tmp499, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont.i2148 unwind label %lpad.i2150
+
+invoke.cont.i2148:                                ; preds = %invoke.cont554
+  call void @objc_release(i8* %tmp499) #3, !clang.imprecise_release !0
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont566 unwind label %lpad565
+
+lpad.i2150:                                       ; preds = %invoke.cont554
+  %tmp500 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  call void @objc_release(i8* %tmp499) #3, !clang.imprecise_release !0
+  unreachable
+
+invoke.cont566:                                   ; preds = %invoke.cont.i2148
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*, i8*)*)(i8* undef, i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*))
+          to label %invoke.cont572 unwind label %lpad571
+
+invoke.cont572:                                   ; preds = %invoke.cont566
+  %call582 = invoke i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont581 unwind label %lpad580
+
+invoke.cont581:                                   ; preds = %invoke.cont572
+  unreachable
+
+lpad156.body:                                     ; preds = %invoke.cont117
+  %tmp1157 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad164.body:                                     ; preds = %invoke.cont157
+  %tmp1158 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad183:                                          ; preds = %invoke.cont184, %invoke.cont165
+  %tmp1159 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %lpad183.body
+
+lpad183.body:                                     ; preds = %lpad183, %lpad.i2036
+  unreachable
+
+lpad196:                                          ; preds = %invoke.cont190
+  %tmp1160 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad200:                                          ; preds = %invoke.cont197
+  %tmp1161 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad203:                                          ; preds = %invoke.cont207, %invoke.cont201
+  %tmp1162 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad212.body:                                     ; preds = %invoke.cont208
+  %tmp1163 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad220:                                          ; preds = %invoke.cont213
+  %tmp1164 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %eh.resume
+
+lpad227:                                          ; preds = %invoke.cont231, %invoke.cont221
+  %tmp1166 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup239
+
+lpad236.body:                                     ; preds = %invoke.cont232
+  %tmp1167 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup239
+
+ehcleanup239:                                     ; preds = %lpad236.body, %lpad227
+  unreachable
+
+lpad244:                                          ; preds = %invoke.cont245, %invoke.cont237
+  %tmp1168 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad249:                                          ; preds = %invoke.cont247
+  %tmp1169 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad252:                                          ; preds = %invoke.cont250
+  %tmp1170 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup263
+
+lpad255:                                          ; preds = %invoke.cont253
+  %tmp1171 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup263
+
+lpad258:                                          ; preds = %invoke.cont256
+  %tmp1172 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+ehcleanup263:                                     ; preds = %lpad255, %lpad252
+  unreachable
+
+lpad265:                                          ; preds = %invoke.cont259
+  %tmp1173 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad273:                                          ; preds = %invoke.cont266
+  %tmp1175 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad277:                                          ; preds = %invoke.cont274
+  %tmp1176 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad289:                                          ; preds = %invoke.cont281
+  %tmp1177 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad301:                                          ; preds = %invoke.cont290
+  %tmp1180 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad308:                                          ; preds = %invoke.cont302
+  %tmp1182 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad311:                                          ; preds = %invoke.cont309
+  %tmp1183 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad314:                                          ; preds = %invoke.cont312
+  %tmp1184 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad320:                                          ; preds = %invoke.cont315
+  %tmp1186 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad340.body.thread:                              ; preds = %land.rhs335
+  %tmp1188 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad340.body:                                     ; preds = %land.end344
+  %tmp1189 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad360:                                          ; preds = %invoke.cont345
+  %tmp1191 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %eh.resume
+
+lpad363:                                          ; preds = %invoke.cont373, %invoke.cont361
+  %tmp1192 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad369:                                          ; preds = %invoke.cont364
+  %tmp1194 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad381:                                          ; preds = %invoke.cont466, %invoke.cont458, %invoke.cont449, %invoke.cont.i2106, %invoke.cont432, %invoke.cont422, %invoke.cont418, %invoke.cont408, %invoke.cont405, %invoke.cont395, %invoke.cont392, %invoke.cont382, %invoke.cont376
+  %tmp1196 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup477
+
+lpad398:                                          ; preds = %invoke.cont396
+  %tmp1199 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad401:                                          ; preds = %invoke.cont399
+  %tmp1200 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad411:                                          ; preds = %invoke.cont409
+  %tmp1201 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad425:                                          ; preds = %invoke.cont423
+  %tmp1203 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup477
+
+lpad428:                                          ; preds = %invoke.cont426
+  %tmp1204 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad454:                                          ; preds = %invoke.cont452
+  %tmp1207 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+ehcleanup477:                                     ; preds = %lpad425, %lpad381, %lpad.i2132, %lpad.i2126
+  unreachable
+
+lpad489:                                          ; preds = %invoke.cont546, %invoke.cont540, %invoke.cont528, %invoke.cont509, %invoke.cont499, %invoke.cont475
+  %tmp1211 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup560
+
+lpad498:                                          ; preds = %invoke.cont490
+  %tmp1214 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad505:                                          ; preds = %invoke.cont503
+  %tmp1215 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad508:                                          ; preds = %invoke.cont506
+  %tmp1216 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad514:                                          ; preds = %msgSend.call
+  %tmp1217 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad527:                                          ; preds = %invoke.cont521
+  %tmp1219 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %ehcleanup560
+
+lpad533:                                          ; preds = %invoke.cont531
+  %tmp1220 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad545:                                          ; preds = %invoke.cont543
+  %tmp1222 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad553:                                          ; preds = %invoke.cont548
+  %tmp1224 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+ehcleanup560:                                     ; preds = %lpad527, %lpad489
+  br label %eh.resume
+
+lpad565:                                          ; preds = %invoke.cont.i2148
+  %tmp1225 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad571:                                          ; preds = %invoke.cont566
+  %tmp1227 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  unreachable
+
+lpad580:                                          ; preds = %invoke.cont572
+  %tmp1228 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*)
+          cleanup
+  br label %eh.resume
+
+eh.resume:                                        ; preds = %lpad580, %ehcleanup560, %lpad360, %lpad220
+  resume { i8*, i32 } undef
+}
+
+@"OBJC_EHTYPE_$_NSException" = external global i8
+
+define void @test4() {
+entry:
+  br i1 undef, label %if.end13, label %if.then10
+
+if.then10:                                        ; preds = %entry
+  br label %if.end13
+
+if.end13:                                         ; preds = %if.then10, %entry
+  %0 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, i8*, i64, i8*, i8)*)(i8* undef, i8* undef, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring to i8*), i64 2, i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring_2 to i8*), i8 signext 0), !clang.arc.no_objc_arc_exceptions !0
+  br i1 undef, label %if.then17, label %if.end18
+
+if.then17:                                        ; preds = %if.end13
+  br label %if.end18
+
+if.end18:                                         ; preds = %if.then17, %if.end13
+  br i1 undef, label %if.then64, label %if.end73
+
+if.then64:                                        ; preds = %if.end18
+  br i1 undef, label %cond.end71, label %cond.true68
+
+cond.true68:                                      ; preds = %if.then64
+  br label %cond.end71
+
+cond.end71:                                       ; preds = %cond.true68, %if.then64
+  br i1 undef, label %cleanup.action, label %cleanup.done
+
+cleanup.action:                                   ; preds = %cond.end71
+  br label %cleanup.done
+
+cleanup.done:                                     ; preds = %cleanup.action, %cond.end71
+  br label %if.end73
+
+if.end73:                                         ; preds = %cleanup.done, %if.end18
+  br i1 undef, label %forcoll.empty, label %forcoll.loopinit
+
+forcoll.loopinit:                                 ; preds = %if.end73
+  br label %forcoll.loopbody.outer
+
+forcoll.loopbody.outer:                           ; preds = %forcoll.refetch, %forcoll.loopinit
+  br label %forcoll.loopbody
+
+forcoll.loopbody:                                 ; preds = %forcoll.notmutated, %forcoll.loopbody.outer
+  br i1 undef, label %forcoll.notmutated, label %forcoll.mutated
+
+forcoll.mutated:                                  ; preds = %forcoll.loopbody
+  br label %forcoll.notmutated
+
+forcoll.notmutated:                               ; preds = %forcoll.mutated, %forcoll.loopbody
+  br i1 undef, label %forcoll.loopbody, label %forcoll.refetch
+
+forcoll.refetch:                                  ; preds = %forcoll.notmutated
+  br i1 undef, label %forcoll.empty, label %forcoll.loopbody.outer
+
+forcoll.empty:                                    ; preds = %forcoll.refetch, %if.end73
+  br i1 undef, label %if.end85, label %if.then82
+
+if.then82:                                        ; preds = %forcoll.empty
+  br label %if.end85
+
+if.end85:                                         ; preds = %if.then82, %forcoll.empty
+  br i1 undef, label %if.then87, label %if.end102
+
+if.then87:                                        ; preds = %if.end85
+  br i1 undef, label %if.end94, label %if.then91
+
+if.then91:                                        ; preds = %if.then87
+  br label %if.end94
+
+if.end94:                                         ; preds = %if.then91, %if.then87
+  br i1 undef, label %if.end101, label %if.then98
+
+if.then98:                                        ; preds = %if.end94
+  br label %if.end101
+
+if.end101:                                        ; preds = %if.then98, %if.end94
+  br label %if.end102
+
+if.end102:                                        ; preds = %if.end101, %if.end85
+  br i1 undef, label %do.body113, label %if.then107
+
+if.then107:                                       ; preds = %if.end102
+  br label %do.body113
+
+do.body113:                                       ; preds = %if.then107, %if.end102
+  br i1 undef, label %if.then116, label %if.end117
+
+if.then116:                                       ; preds = %do.body113
+  br label %if.end117
+
+if.end117:                                        ; preds = %if.then116, %do.body113
+  br i1 undef, label %if.then125, label %if.end126
+
+if.then125:                                       ; preds = %if.end117
+  br label %if.end126
+
+if.end126:                                        ; preds = %if.then125, %if.end117
+  br i1 undef, label %do.end166, label %cond.true132
+
+cond.true132:                                     ; preds = %if.end126
+  br i1 undef, label %do.body148, label %cond.true151
+
+do.body148:                                       ; preds = %cond.true132
+  br i1 undef, label %do.end166, label %cond.true151
+
+cond.true151:                                     ; preds = %do.body148, %cond.true132
+  br i1 undef, label %if.then162, label %do.end166
+
+if.then162:                                       ; preds = %cond.true151
+  br label %do.end166
+
+do.end166:                                        ; preds = %if.then162, %cond.true151, %do.body148, %if.end126
+  br i1 undef, label %if.then304, label %if.then170
+
+if.then170:                                       ; preds = %do.end166
+  br i1 undef, label %do.end193, label %cond.true179
+
+cond.true179:                                     ; preds = %if.then170
+  br i1 undef, label %if.then190, label %do.end193
+
+if.then190:                                       ; preds = %cond.true179
+  br label %do.end193
+
+do.end193:                                        ; preds = %if.then190, %cond.true179, %if.then170
+  br i1 undef, label %do.body200, label %do.body283
+
+do.body200:                                       ; preds = %do.end193
+  br i1 undef, label %do.end254, label %cond.true203
+
+cond.true203:                                     ; preds = %do.body200
+  br i1 undef, label %do.body218, label %cond.true221
+
+do.body218:                                       ; preds = %cond.true203
+  br i1 undef, label %do.end254, label %cond.true221
+
+cond.true221:                                     ; preds = %do.body218, %cond.true203
+  br i1 undef, label %if.then232, label %do.body236
+
+if.then232:                                       ; preds = %cond.true221
+  br label %do.body236
+
+do.body236:                                       ; preds = %if.then232, %cond.true221
+  br i1 undef, label %do.end254, label %cond.true239
+
+cond.true239:                                     ; preds = %do.body236
+  br i1 undef, label %if.then250, label %do.end254
+
+if.then250:                                       ; preds = %cond.true239
+  br label %do.end254
+
+do.end254:                                        ; preds = %if.then250, %cond.true239, %do.body236, %do.body218, %do.body200
+  br i1 undef, label %do.end277, label %cond.true263
+
+cond.true263:                                     ; preds = %do.end254
+  br i1 undef, label %if.then274, label %do.end277
+
+if.then274:                                       ; preds = %cond.true263
+  unreachable
+
+do.end277:                                        ; preds = %cond.true263, %do.end254
+  br i1 undef, label %if.then280, label %do.body283
+
+if.then280:                                       ; preds = %do.end277
+  br label %do.body283
+
+do.body283:                                       ; preds = %if.then280, %do.end277, %do.end193
+  br i1 undef, label %if.end301, label %cond.true286
+
+cond.true286:                                     ; preds = %do.body283
+  br i1 undef, label %if.then297, label %if.end301
+
+if.then297:                                       ; preds = %cond.true286
+  br label %if.end301
+
+if.end301:                                        ; preds = %if.then297, %cond.true286, %do.body283
+  br i1 undef, label %if.then304, label %do.body351
+
+if.then304:                                       ; preds = %if.end301, %do.end166
+  br i1 undef, label %do.body309.lr.ph, label %do.body351
+
+do.body309.lr.ph:                                 ; preds = %if.then304
+  br label %do.body309
+
+do.body309:                                       ; preds = %for.cond.backedge, %do.body309.lr.ph
+  br i1 undef, label %do.end328, label %cond.true312
+
+cond.true312:                                     ; preds = %do.body309
+  br i1 undef, label %if.then323, label %do.end328
+
+if.then323:                                       ; preds = %cond.true312
+  br label %do.end328
+
+do.end328:                                        ; preds = %if.then323, %cond.true312, %do.body309
+  br i1 undef, label %for.cond.backedge, label %cond.true335
+
+for.cond.backedge:                                ; preds = %if.then346, %cond.true335, %do.end328
+  br i1 undef, label %do.body309, label %do.body351
+
+cond.true335:                                     ; preds = %do.end328
+  br i1 undef, label %if.then346, label %for.cond.backedge
+
+if.then346:                                       ; preds = %cond.true335
+  br label %for.cond.backedge
+
+do.body351:                                       ; preds = %for.cond.backedge, %if.then304, %if.end301
+  br i1 undef, label %if.then354, label %if.end355
+
+if.then354:                                       ; preds = %do.body351
+  br label %if.end355
+
+if.end355:                                        ; preds = %if.then354, %do.body351
+  br i1 undef, label %if.else, label %if.then364
+
+if.then364:                                       ; preds = %if.end355
+  br label %do.body366
+
+if.else:                                          ; preds = %if.end355
+  br label %do.body366
+
+do.body366:                                       ; preds = %if.else, %if.then364
+  br i1 undef, label %if.then369, label %if.end377.critedge
+
+if.then369:                                       ; preds = %do.body366
+  br label %if.end377
+
+if.end377.critedge:                               ; preds = %do.body366
+  br label %if.end377
+
+if.end377:                                        ; preds = %if.end377.critedge, %if.then369
+  br i1 undef, label %if.then383, label %if.end392.critedge
+
+if.then383:                                       ; preds = %if.end377
+  br label %if.end392
+
+if.end392.critedge:                               ; preds = %if.end377
+  br label %if.end392
+
+if.end392:                                        ; preds = %if.end392.critedge, %if.then383
+  br i1 undef, label %if.then398, label %if.end399
+
+if.then398:                                       ; preds = %if.end392
+  br label %if.end399
+
+if.end399:                                        ; preds = %if.then398, %if.end392
+  invoke void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %eh.cont unwind label %lpad, !clang.arc.no_objc_arc_exceptions !0
+
+eh.cont:                                          ; preds = %if.end399
+  br i1 undef, label %if.then430, label %if.end439.critedge
+
+if.then430:                                       ; preds = %eh.cont
+  %1 = call i8* @objc_retain(i8* %0)
+  br label %if.end439
+
+lpad:                                             ; preds = %if.end399
+  %2 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__objc_personality_v0 to i8*)
+          catch i8* @"OBJC_EHTYPE_$_NSException"
+  unreachable
+
+if.end439.critedge:                               ; preds = %eh.cont
+  %3 = call i8* @objc_retain(i8* %0)
+  br label %if.end439
+
+if.end439:                                        ; preds = %if.end439.critedge, %if.then430
+  call void @objc_release(i8* %0), !clang.imprecise_release !0
+  unreachable
+
+return:                                           ; No predecessors!
+  ret void
+}
+
 
 !0 = metadata !{}
diff --git a/test/Transforms/ObjCARC/retain-block-alloca.ll b/test/Transforms/ObjCARC/retain-block-alloca.ll
deleted file mode 100644
index 6b1578a..0000000
--- a/test/Transforms/ObjCARC/retain-block-alloca.ll
+++ /dev/null
@@ -1,94 +0,0 @@
-; RUN: opt -S -objc-arc < %s | FileCheck %s
-; rdar://10209613
-
-%0 = type opaque
-%struct.__block_descriptor = type { i64, i64 }
-
-@_NSConcreteStackBlock = external global i8*
-@__block_descriptor_tmp = external hidden constant { i64, i64, i8*, i8*, i8*, i8* }
-@"\01L_OBJC_SELECTOR_REFERENCES_" = external hidden global i8*, section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
-
-; CHECK-LABEL: define void @test(
-; CHECK: %3 = call i8* @objc_retainBlock(i8* %2) [[NUW:#[0-9]+]]
-; CHECK: @objc_msgSend
-; CHECK-NEXT: @objc_release(i8* %3)
-define void @test(%0* %array) uwtable {
-entry:
-  %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>, align 8
-  %0 = bitcast %0* %array to i8*
-  %1 = tail call i8* @objc_retain(i8* %0) nounwind
-  %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 0
-  store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8
-  %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 1
-  store i32 1107296256, i32* %block.flags, align 8
-  %block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 2
-  store i32 0, i32* %block.reserved, align 4
-  %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 3
-  store i8* bitcast (void (i8*)* @__test_block_invoke_0 to i8*), i8** %block.invoke, align 8
-  %block.descriptor = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 4
-  store %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_descriptor_tmp to %struct.__block_descriptor*), %struct.__block_descriptor** %block.descriptor, align 8
-  %block.captured = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 5
-  store %0* %array, %0** %block.captured, align 8
-  %2 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block to i8*
-  %3 = call i8* @objc_retainBlock(i8* %2) nounwind
-  %tmp2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*)*)(i8* %0, i8* %tmp2, i8* %3)
-  call void @objc_release(i8* %3) nounwind
-  %strongdestroy = load %0** %block.captured, align 8
-  %4 = bitcast %0* %strongdestroy to i8*
-  call void @objc_release(i8* %4) nounwind, !clang.imprecise_release !0
-  ret void
-}
-
-; Same as test, but the objc_retainBlock has a clang.arc.copy_on_escape
-; tag so it's safe to delete.
-
-; CHECK-LABEL: define void @test_with_COE(
-; CHECK-NOT: @objc_retainBlock
-; CHECK: @objc_msgSend
-; CHECK: @objc_release
-; CHECK-NOT: @objc_release
-; CHECK: }
-define void @test_with_COE(%0* %array) uwtable {
-entry:
-  %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>, align 8
-  %0 = bitcast %0* %array to i8*
-  %1 = tail call i8* @objc_retain(i8* %0) nounwind
-  %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 0
-  store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8
-  %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 1
-  store i32 1107296256, i32* %block.flags, align 8
-  %block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 2
-  store i32 0, i32* %block.reserved, align 4
-  %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 3
-  store i8* bitcast (void (i8*)* @__test_block_invoke_0 to i8*), i8** %block.invoke, align 8
-  %block.descriptor = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 4
-  store %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_descriptor_tmp to %struct.__block_descriptor*), %struct.__block_descriptor** %block.descriptor, align 8
-  %block.captured = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block, i64 0, i32 5
-  store %0* %array, %0** %block.captured, align 8
-  %2 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, %0* }>* %block to i8*
-  %3 = call i8* @objc_retainBlock(i8* %2) nounwind, !clang.arc.copy_on_escape !0
-  %tmp2 = load i8** @"\01L_OBJC_SELECTOR_REFERENCES_", align 8
-  call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, i8*)*)(i8* %0, i8* %tmp2, i8* %3)
-  call void @objc_release(i8* %3) nounwind
-  %strongdestroy = load %0** %block.captured, align 8
-  %4 = bitcast %0* %strongdestroy to i8*
-  call void @objc_release(i8* %4) nounwind, !clang.imprecise_release !0
-  ret void
-}
-
-declare i8* @objc_retain(i8*)
-
-declare void @__test_block_invoke_0(i8* nocapture) uwtable
-
-declare i8* @objc_retainBlock(i8*)
-
-declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
-
-declare void @objc_release(i8*)
-
-; CHECK: attributes #0 = { uwtable }
-; CHECK: attributes #1 = { nonlazybind }
-; CHECK: attributes [[NUW]] = { nounwind }
-
-!0 = metadata !{}
diff --git a/test/Transforms/ObjCARC/retain-block-escape-analysis.ll b/test/Transforms/ObjCARC/retain-block-escape-analysis.ll
deleted file mode 100644
index 7914bb8..0000000
--- a/test/Transforms/ObjCARC/retain-block-escape-analysis.ll
+++ /dev/null
@@ -1,215 +0,0 @@
-; RUN: opt -S -objc-arc < %s | FileCheck %s
-
-declare i8* @objc_retain(i8*) nonlazybind
-declare void @objc_release(i8*) nonlazybind
-declare i8* @objc_retainBlock(i8*)
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Use by an instruction which copies the value is an escape if the             ;
-; result is an escape. The current instructions with this property are:        ;
-;                                                                              ;
-; 1. BitCast.                                                                  ;
-; 2. GEP.                                                                      ;
-; 3. PhiNode.                                                                  ;
-; 4. SelectInst.                                                               ;
-;                                                                              ;
-; Make sure that such instructions do not confuse the optimizer into removing  ;
-; an objc_retainBlock that is needed.                                          ;
-;                                                                              ;
-; rdar://13273675. (With extra test cases to handle bitcast, phi, and select.  ;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define void @bitcasttest(i8* %storage, void (...)* %block)  {
-; CHECK-LABEL: define void @bitcasttest(
-entry:
-  %t1 = bitcast void (...)* %block to i8*
-; CHECK: tail call i8* @objc_retain
-  %t2 = tail call i8* @objc_retain(i8* %t1)
-; CHECK: tail call i8* @objc_retainBlock
-  %t3 = tail call i8* @objc_retainBlock(i8* %t1), !clang.arc.copy_on_escape !0
-  %t4 = bitcast i8* %storage to void (...)**
-  %t5 = bitcast i8* %t3 to void (...)*
-  store void (...)* %t5, void (...)** %t4, align 8
-; CHECK: call void @objc_release
-  call void @objc_release(i8* %t1)
-  ret void
-; CHECK: }
-}
-
-define void @bitcasttest_a(i8* %storage, void (...)* %block)  {
-; CHECK-LABEL: define void @bitcasttest_a(
-entry:
-  %t1 = bitcast void (...)* %block to i8*
-; CHECK-NOT: tail call i8* @objc_retain
-  %t2 = tail call i8* @objc_retain(i8* %t1)
-; CHECK: tail call i8* @objc_retainBlock
-  %t3 = tail call i8* @objc_retainBlock(i8* %t1), !clang.arc.copy_on_escape !0
-  %t4 = bitcast i8* %storage to void (...)**
-  %t5 = bitcast i8* %t3 to void (...)*
-  store void (...)* %t5, void (...)** %t4, align 8
-; CHECK-NOT: call void @objc_release
-  call void @objc_release(i8* %t1), !clang.imprecise_release !0
-  ret void
-; CHECK: }
-}
-
-define void @geptest(void (...)** %storage_array, void (...)* %block)  {
-; CHECK-LABEL: define void @geptest(
-entry:
-  %t1 = bitcast void (...)* %block to i8*
-; CHECK: tail call i8* @objc_retain
-  %t2 = tail call i8* @objc_retain(i8* %t1)
-; CHECK: tail call i8* @objc_retainBlock
-  %t3 = tail call i8* @objc_retainBlock(i8* %t1), !clang.arc.copy_on_escape !0
-  %t4 = bitcast i8* %t3 to void (...)*
-  
-  %storage = getelementptr inbounds void (...)** %storage_array, i64 0
-  
-  store void (...)* %t4, void (...)** %storage, align 8
-; CHECK: call void @objc_release
-  call void @objc_release(i8* %t1)
-  ret void
-; CHECK: }
-}
-
-define void @geptest_a(void (...)** %storage_array, void (...)* %block)  {
-; CHECK-LABEL: define void @geptest_a(
-entry:
-  %t1 = bitcast void (...)* %block to i8*
-; CHECK-NOT: tail call i8* @objc_retain
-  %t2 = tail call i8* @objc_retain(i8* %t1)
-; CHECK: tail call i8* @objc_retainBlock
-  %t3 = tail call i8* @objc_retainBlock(i8* %t1), !clang.arc.copy_on_escape !0
-  %t4 = bitcast i8* %t3 to void (...)*
-  
-  %storage = getelementptr inbounds void (...)** %storage_array, i64 0
-  
-  store void (...)* %t4, void (...)** %storage, align 8
-; CHECK-NOT: call void @objc_release
-  call void @objc_release(i8* %t1), !clang.imprecise_release !0
-  ret void
-; CHECK: }
-}
-
-define void @selecttest(void (...)** %store1, void (...)** %store2,
-                        void (...)* %block) {
-; CHECK-LABEL: define void @selecttest(
-entry:
-  %t1 = bitcast void (...)* %block to i8*
-; CHECK: tail call i8* @objc_retain
-  %t2 = tail call i8* @objc_retain(i8* %t1)
-; CHECK: tail call i8* @objc_retainBlock
-  %t3 = tail call i8* @objc_retainBlock(i8* %t1), !clang.arc.copy_on_escape !0
-  %t4 = bitcast i8* %t3 to void (...)*
-  %store = select i1 undef, void (...)** %store1, void (...)** %store2
-  store void (...)* %t4, void (...)** %store, align 8
-; CHECK: call void @objc_release
-  call void @objc_release(i8* %t1)
-  ret void
-; CHECK: }
-}
-
-define void @selecttest_a(void (...)** %store1, void (...)** %store2,
-                          void (...)* %block) {
-; CHECK-LABEL: define void @selecttest_a(
-entry:
-  %t1 = bitcast void (...)* %block to i8*
-; CHECK-NOT: tail call i8* @objc_retain
-  %t2 = tail call i8* @objc_retain(i8* %t1)
-; CHECK: tail call i8* @objc_retainBlock
-  %t3 = tail call i8* @objc_retainBlock(i8* %t1), !clang.arc.copy_on_escape !0
-  %t4 = bitcast i8* %t3 to void (...)*
-  %store = select i1 undef, void (...)** %store1, void (...)** %store2
-  store void (...)* %t4, void (...)** %store, align 8
-; CHECK-NOT: call void @objc_release
-  call void @objc_release(i8* %t1), !clang.imprecise_release !0
-  ret void
-; CHECK: }
-}
-
-define void @phinodetest(void (...)** %storage1,
-                         void (...)** %storage2,
-                         void (...)* %block) {
-; CHECK-LABEL: define void @phinodetest(
-entry:
-  %t1 = bitcast void (...)* %block to i8*
-; CHECK: tail call i8* @objc_retain
-  %t2 = tail call i8* @objc_retain(i8* %t1)
-; CHECK: tail call i8* @objc_retainBlock
-  %t3 = tail call i8* @objc_retainBlock(i8* %t1), !clang.arc.copy_on_escape !0
-  %t4 = bitcast i8* %t3 to void (...)*
-  br i1 undef, label %store1_set, label %store2_set
-; CHECK: store1_set:
-
-store1_set:
-  br label %end
-
-store2_set:
-  br label %end
-
-end:
-; CHECK: end:
-  %storage = phi void (...)** [ %storage1, %store1_set ], [ %storage2, %store2_set]
-  store void (...)* %t4, void (...)** %storage, align 8
-; CHECK: call void @objc_release
-  call void @objc_release(i8* %t1)
-  ret void
-; CHECK: }
-}
-
-define void @phinodetest_a(void (...)** %storage1,
-                           void (...)** %storage2,
-                           void (...)* %block) {
-; CHECK-LABEL: define void @phinodetest_a(
-entry:
-  %t1 = bitcast void (...)* %block to i8*
-; CHECK-NOT: tail call i8* @objc_retain
-  %t2 = tail call i8* @objc_retain(i8* %t1)
-; CHECK: tail call i8* @objc_retainBlock
-  %t3 = tail call i8* @objc_retainBlock(i8* %t1), !clang.arc.copy_on_escape !0
-  %t4 = bitcast i8* %t3 to void (...)*
-  br i1 undef, label %store1_set, label %store2_set
-
-store1_set:
-  br label %end
-
-store2_set:
-  br label %end
-
-end:
-  %storage = phi void (...)** [ %storage1, %store1_set ], [ %storage2, %store2_set]
-  store void (...)* %t4, void (...)** %storage, align 8
-; CHECK-NOT: call void @objc_release
-  call void @objc_release(i8* %t1), !clang.imprecise_release !0
-  ret void
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; This test makes sure that we do not hang clang when visiting a use ;
-; cycle caused by phi nodes during objc-arc analysis. *NOTE* This    ;
-; test case looks a little convoluted since it was produced by	     ;
-; bugpoint.							     ;
-; 								     ;
-; bugzilla://14551						     ;
-; rdar://12851911						     ;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define void @phinode_use_cycle(i8* %block) uwtable optsize ssp {
-; CHECK: define void @phinode_use_cycle(i8* %block)
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %if.then, %for.body, %entry
-  %block.05 = phi void (...)* [ null, %entry ], [ %1, %if.then ], [ %block.05, %for.body ]
-  br i1 undef, label %for.body, label %if.then
-
-if.then:                                          ; preds = %for.body
-  %0 = call i8* @objc_retainBlock(i8* %block), !clang.arc.copy_on_escape !0
-  %1 = bitcast i8* %0 to void (...)*
-  %2 = bitcast void (...)* %block.05 to i8*
-  call void @objc_release(i8* %2) nounwind, !clang.imprecise_release !0
-  br label %for.body
-}
-
-!0 = metadata !{}
diff --git a/test/Transforms/ObjCARC/retain-block-load.ll b/test/Transforms/ObjCARC/retain-block-load.ll
deleted file mode 100644
index a5170e3..0000000
--- a/test/Transforms/ObjCARC/retain-block-load.ll
+++ /dev/null
@@ -1,51 +0,0 @@
-; RUN: opt -objc-arc -S < %s | FileCheck %s
-
-; rdar://10803830
-; The optimizer should be able to prove that the block does not
-; "escape", so the retainBlock+release pair can be eliminated.
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-
-%struct.__block_descriptor = type { i64, i64 }
-
-@_NSConcreteStackBlock = external global i8*
-@__block_descriptor_tmp = external global { i64, i64, i8*, i8* }
-
-; CHECK: define void @test() {
-; CHECK-NOT: @objc
-; CHECK: declare i8* @objc_retainBlock(i8*)
-; CHECK: declare void @objc_release(i8*)
-
-define void @test() {
-entry:
-  %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>, align 8
-  %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>* %block, i64 0, i32 0
-  store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8
-  %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>* %block, i64 0, i32 1
-  store i32 1073741824, i32* %block.flags, align 8
-  %block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>* %block, i64 0, i32 2
-  store i32 0, i32* %block.reserved, align 4
-  %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>* %block, i64 0, i32 3
-  store i8* bitcast (i32 (i8*)* @__test_block_invoke_0 to i8*), i8** %block.invoke, align 8
-  %block.descriptor = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>* %block, i64 0, i32 4
-  store %struct.__block_descriptor* bitcast ({ i64, i64, i8*, i8* }* @__block_descriptor_tmp to %struct.__block_descriptor*), %struct.__block_descriptor** %block.descriptor, align 8
-  %block.captured = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>* %block, i64 0, i32 5
-  store i32 4, i32* %block.captured, align 8
-  %tmp = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>* %block to i8*
-  %tmp1 = call i8* @objc_retainBlock(i8* %tmp) nounwind, !clang.arc.copy_on_escape !0
-  %tmp2 = getelementptr inbounds i8* %tmp1, i64 16
-  %tmp3 = bitcast i8* %tmp2 to i8**
-  %tmp4 = load i8** %tmp3, align 8
-  %tmp5 = bitcast i8* %tmp4 to i32 (i8*)*
-  %call = call i32 %tmp5(i8* %tmp1)
-  call void @objc_release(i8* %tmp1) nounwind, !clang.imprecise_release !0
-  ret void
-}
-
-declare i32 @__test_block_invoke_0(i8* nocapture %.block_descriptor) nounwind readonly
-
-declare i8* @objc_retainBlock(i8*)
-
-declare void @objc_release(i8*)
-
-!0 = metadata !{}
diff --git a/test/Transforms/ObjCARC/retain-block.ll b/test/Transforms/ObjCARC/retain-block.ll
deleted file mode 100644
index 1bb3f02..0000000
--- a/test/Transforms/ObjCARC/retain-block.ll
+++ /dev/null
@@ -1,140 +0,0 @@
-; RUN: opt -objc-arc -S < %s | FileCheck %s
-
-target datalayout = "e-p:64:64:64"
-
-!0 = metadata !{}
-
-declare i8* @objc_retain(i8*)
-declare void @callee(i8)
-declare void @use_pointer(i8*)
-declare void @objc_release(i8*)
-declare i8* @objc_retainBlock(i8*)
-declare i8* @objc_autorelease(i8*)
-
-; Basic retainBlock+release elimination.
-
-; CHECK: define void @test0(i8* %tmp) {
-; CHECK-NOT: @objc
-; CHECK: }
-define void @test0(i8* %tmp) {
-entry:
-  %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind, !clang.arc.copy_on_escape !0
-  tail call void @use_pointer(i8* %tmp2)
-  tail call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0
-  ret void
-}
-
-; Same as test0, but there's no copy_on_escape metadata, so there's no
-; optimization possible.
-
-; CHECK: define void @test0_no_metadata(i8* %tmp) {
-; CHECK: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) [[NUW:#[0-9]+]]
-; CHECK: tail call void @objc_release(i8* %tmp2) [[NUW]], !clang.imprecise_release !0
-; CHECK: }
-define void @test0_no_metadata(i8* %tmp) {
-entry:
-  %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind
-  tail call void @use_pointer(i8* %tmp2)
-  tail call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0
-  ret void
-}
-
-; Same as test0, but the pointer escapes, so there's no
-; optimization possible.
-
-; CHECK: define void @test0_escape(i8* %tmp, i8** %z) {
-; CHECK: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) [[NUW]], !clang.arc.copy_on_escape !0
-; CHECK: tail call void @objc_release(i8* %tmp2) [[NUW]], !clang.imprecise_release !0
-; CHECK: }
-define void @test0_escape(i8* %tmp, i8** %z) {
-entry:
-  %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind, !clang.arc.copy_on_escape !0
-  store i8* %tmp2, i8** %z
-  tail call void @use_pointer(i8* %tmp2)
-  tail call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0
-  ret void
-}
-
-; Same as test0_escape, but there's no intervening call.
-
-; CHECK: define void @test0_just_escape(i8* %tmp, i8** %z) {
-; CHECK: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) [[NUW]], !clang.arc.copy_on_escape !0
-; CHECK: tail call void @objc_release(i8* %tmp2) [[NUW]], !clang.imprecise_release !0
-; CHECK: }
-define void @test0_just_escape(i8* %tmp, i8** %z) {
-entry:
-  %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind, !clang.arc.copy_on_escape !0
-  store i8* %tmp2, i8** %z
-  tail call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0
-  ret void
-}
-
-; Basic nested retainBlock+release elimination.
-
-; CHECK: define void @test1(i8* %tmp) {
-; CHECK-NOT: @objc
-; CHECK: tail call i8* @objc_retain(i8* %tmp) [[NUW]]
-; CHECK-NOT: @objc
-; CHECK: tail call void @objc_release(i8* %tmp) [[NUW]], !clang.imprecise_release !0
-; CHECK-NOT: @objc
-; CHECK: }
-define void @test1(i8* %tmp) {
-entry:
-  %tmp1 = tail call i8* @objc_retain(i8* %tmp) nounwind
-  %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind, !clang.arc.copy_on_escape !0
-  tail call void @use_pointer(i8* %tmp2)
-  tail call void @use_pointer(i8* %tmp2)
-  tail call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0
-  tail call void @objc_release(i8* %tmp) nounwind, !clang.imprecise_release !0
-  ret void
-}
-
-; Same as test1, but there's no copy_on_escape metadata, so there's no
-; retainBlock+release optimization possible. But we can still eliminate
-; the outer retain+release.
-
-; CHECK: define void @test1_no_metadata(i8* %tmp) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: tail call i8* @objc_retainBlock(i8* %tmp) [[NUW]]
-; CHECK-NEXT: @use_pointer(i8* %tmp2)
-; CHECK-NEXT: @use_pointer(i8* %tmp2)
-; CHECK-NEXT: tail call void @objc_release(i8* %tmp2) [[NUW]], !clang.imprecise_release !0
-; CHECK-NOT: @objc
-; CHECK: }
-define void @test1_no_metadata(i8* %tmp) {
-entry:
-  %tmp1 = tail call i8* @objc_retain(i8* %tmp) nounwind
-  %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind
-  tail call void @use_pointer(i8* %tmp2)
-  tail call void @use_pointer(i8* %tmp2)
-  tail call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0
-  tail call void @objc_release(i8* %tmp) nounwind, !clang.imprecise_release !0
-  ret void
-}
-
-; Same as test1, but the pointer escapes, so there's no
-; retainBlock+release optimization possible. But we can still eliminate
-; the outer retain+release
-
-; CHECK: define void @test1_escape(i8* %tmp, i8** %z) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) [[NUW]], !clang.arc.copy_on_escape !0
-; CHECK-NEXT: store i8* %tmp2, i8** %z
-; CHECK-NEXT: @use_pointer(i8* %tmp2)
-; CHECK-NEXT: @use_pointer(i8* %tmp2)
-; CHECK-NEXT: tail call void @objc_release(i8* %tmp2) [[NUW]], !clang.imprecise_release !0
-; CHECK-NOT: @objc
-; CHECK: }
-define void @test1_escape(i8* %tmp, i8** %z) {
-entry:
-  %tmp1 = tail call i8* @objc_retain(i8* %tmp) nounwind
-  %tmp2 = tail call i8* @objc_retainBlock(i8* %tmp) nounwind, !clang.arc.copy_on_escape !0
-  store i8* %tmp2, i8** %z
-  tail call void @use_pointer(i8* %tmp2)
-  tail call void @use_pointer(i8* %tmp2)
-  tail call void @objc_release(i8* %tmp2) nounwind, !clang.imprecise_release !0
-  tail call void @objc_release(i8* %tmp) nounwind, !clang.imprecise_release !0
-  ret void
-}
-
-; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/PhaseOrdering/lit.local.cfg b/test/Transforms/PhaseOrdering/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/PhaseOrdering/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/PruneEH/lit.local.cfg b/test/Transforms/PruneEH/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/PruneEH/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/Reassociate/lit.local.cfg b/test/Transforms/Reassociate/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/Reassociate/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/Reg2Mem/lit.local.cfg b/test/Transforms/Reg2Mem/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/Reg2Mem/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/SCCP/lit.local.cfg b/test/Transforms/SCCP/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/SCCP/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/SLPVectorizer/ARM/lit.local.cfg b/test/Transforms/SLPVectorizer/ARM/lit.local.cfg
new file mode 100644
index 0000000..5fc35d8
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/ARM/lit.local.cfg
@@ -0,0 +1,3 @@
+targets = set(config.root.targets_to_build.split())
+if not 'ARM' in targets:
+    config.unsupported = True
diff --git a/test/Transforms/SLPVectorizer/ARM/memory.ll b/test/Transforms/SLPVectorizer/ARM/memory.ll
new file mode 100644
index 0000000..383c808
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/ARM/memory.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+; On swift unaligned <2 x double> stores need 4uops and it is there for cheaper
+; to do this scalar.
+
+; CHECK-LABEL: expensive_double_store
+; CHECK-NOT: load <2 x double>
+; CHECK-NOT: store <2 x double>
+define void @expensive_double_store(double* noalias %dst, double* noalias %src, i64 %count) {
+entry:
+  %0 = load double* %src, align 8
+  store double %0, double* %dst, align 8
+  %arrayidx2 = getelementptr inbounds double* %src, i64 1
+  %1 = load double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double* %dst, i64 1
+  store double %1, double* %arrayidx3, align 8
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/ARM/sroa.ll b/test/Transforms/SLPVectorizer/ARM/sroa.ll
new file mode 100644
index 0000000..e0c75b1
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/ARM/sroa.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S -mcpu=swift -mtriple=thumbv7-apple-ios -basicaa -slp-vectorizer < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+%class.Complex = type { double, double }
+
+; Code like this is the result of SROA. Make sure we don't vectorize this
+; because the in the scalar version of this the shl/or are handled by the
+; backend and disappear, the vectorized code stays.
+
+; CHECK-LABEL: SROAed
+; CHECK-NOT: shl <2 x i64>
+; CHECK-NOT: or <2 x i64>
+
+define void @SROAed(%class.Complex* noalias nocapture sret %agg.result, [4 x i32] %a.coerce, [4 x i32] %b.coerce) {
+entry:
+  %a.coerce.fca.0.extract = extractvalue [4 x i32] %a.coerce, 0
+  %a.sroa.0.0.insert.ext = zext i32 %a.coerce.fca.0.extract to i64
+  %a.coerce.fca.1.extract = extractvalue [4 x i32] %a.coerce, 1
+  %a.sroa.0.4.insert.ext = zext i32 %a.coerce.fca.1.extract to i64
+  %a.sroa.0.4.insert.shift = shl nuw i64 %a.sroa.0.4.insert.ext, 32
+  %a.sroa.0.4.insert.insert = or i64 %a.sroa.0.4.insert.shift, %a.sroa.0.0.insert.ext
+  %0 = bitcast i64 %a.sroa.0.4.insert.insert to double
+  %a.coerce.fca.2.extract = extractvalue [4 x i32] %a.coerce, 2
+  %a.sroa.3.8.insert.ext = zext i32 %a.coerce.fca.2.extract to i64
+  %a.coerce.fca.3.extract = extractvalue [4 x i32] %a.coerce, 3
+  %a.sroa.3.12.insert.ext = zext i32 %a.coerce.fca.3.extract to i64
+  %a.sroa.3.12.insert.shift = shl nuw i64 %a.sroa.3.12.insert.ext, 32
+  %a.sroa.3.12.insert.insert = or i64 %a.sroa.3.12.insert.shift, %a.sroa.3.8.insert.ext
+  %1 = bitcast i64 %a.sroa.3.12.insert.insert to double
+  %b.coerce.fca.0.extract = extractvalue [4 x i32] %b.coerce, 0
+  %b.sroa.0.0.insert.ext = zext i32 %b.coerce.fca.0.extract to i64
+  %b.coerce.fca.1.extract = extractvalue [4 x i32] %b.coerce, 1
+  %b.sroa.0.4.insert.ext = zext i32 %b.coerce.fca.1.extract to i64
+  %b.sroa.0.4.insert.shift = shl nuw i64 %b.sroa.0.4.insert.ext, 32
+  %b.sroa.0.4.insert.insert = or i64 %b.sroa.0.4.insert.shift, %b.sroa.0.0.insert.ext
+  %2 = bitcast i64 %b.sroa.0.4.insert.insert to double
+  %b.coerce.fca.2.extract = extractvalue [4 x i32] %b.coerce, 2
+  %b.sroa.3.8.insert.ext = zext i32 %b.coerce.fca.2.extract to i64
+  %b.coerce.fca.3.extract = extractvalue [4 x i32] %b.coerce, 3
+  %b.sroa.3.12.insert.ext = zext i32 %b.coerce.fca.3.extract to i64
+  %b.sroa.3.12.insert.shift = shl nuw i64 %b.sroa.3.12.insert.ext, 32
+  %b.sroa.3.12.insert.insert = or i64 %b.sroa.3.12.insert.shift, %b.sroa.3.8.insert.ext
+  %3 = bitcast i64 %b.sroa.3.12.insert.insert to double
+  %add = fadd double %0, %2
+  %add3 = fadd double %1, %3
+  %re.i.i = getelementptr inbounds %class.Complex* %agg.result, i32 0, i32 0
+  store double %add, double* %re.i.i, align 4
+  %im.i.i = getelementptr inbounds %class.Complex* %agg.result, i32 0, i32 1
+  store double %add3, double* %im.i.i, align 4
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/R600/lit.local.cfg b/test/Transforms/SLPVectorizer/R600/lit.local.cfg
new file mode 100644
index 0000000..9e0ab99
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/R600/lit.local.cfg
@@ -0,0 +1,4 @@
+targets = set(config.root.targets_to_build.split())
+if not 'R600' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/SLPVectorizer/R600/simplebb.ll b/test/Transforms/SLPVectorizer/R600/simplebb.ll
new file mode 100644
index 0000000..b6d794b
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/R600/simplebb.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -slp-vectorizer -dce < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+
+
+; Simple 3-pair chain with loads and stores
+define void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
+; CHECK-LABEL: @test1_as_3_3_3(
+; CHECK: load <2 x double> addrspace(3)*
+; CHECK: load <2 x double> addrspace(3)*
+; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* %
+; CHECK: ret
+  %i0 = load double addrspace(3)* %a, align 8
+  %i1 = load double addrspace(3)* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double addrspace(3)* %a, i64 1
+  %i3 = load double addrspace(3)* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double addrspace(3)* %b, i64 1
+  %i4 = load double addrspace(3)* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double addrspace(3)* %c, align 8
+  %arrayidx5 = getelementptr inbounds double addrspace(3)* %c, i64 1
+  store double %mul5, double addrspace(3)* %arrayidx5, align 8
+  ret void
+}
+
+define void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test1_as_3_0_0(
+; CHECK: load <2 x double> addrspace(3)*
+; CHECK: load <2 x double>*
+; CHECK: store <2 x double> %{{.*}}, <2 x double>* %
+; CHECK: ret
+  %i0 = load double addrspace(3)* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double addrspace(3)* %a, i64 1
+  %i3 = load double addrspace(3)* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
+define void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
+; CHECK-LABEL: @test1_as_0_0_3(
+; CHECK: load <2 x double>*
+; CHECK: load <2 x double>*
+; CHECK: store <2 x double> %{{.*}}, <2 x double> addrspace(3)* %
+; CHECK: ret
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double addrspace(3)* %c, align 8
+  %arrayidx5 = getelementptr inbounds double addrspace(3)* %c, i64 1
+  store double %mul5, double addrspace(3)* %arrayidx5, align 8
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
index 931195e..25c6545 100644
--- a/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
+++ b/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
@@ -60,9 +60,9 @@ if.end332:                                        ; preds = %if.then329, %if.end
   %sub334 = fsub float %add294, %dx272.1
   %sub338 = fsub float %add297, %dy276.1
   %arrayidx.i.i606 = getelementptr inbounds %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* %vertices, i64 0, i32 0, i64 0
-  store float %sub334, float* %arrayidx.i.i606, align 4, !tbaa !0
+  store float %sub334, float* %arrayidx.i.i606, align 4
   %arrayidx3.i607 = getelementptr inbounds %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* %vertices, i64 0, i32 0, i64 1
-  store float %sub338, float* %arrayidx3.i607, align 4, !tbaa !0
+  store float %sub338, float* %arrayidx3.i607, align 4
   br label %return
 
 return:                                           ; preds = %if.end332, %for.end271, %entry
@@ -82,7 +82,3 @@ if.end22.2:                                       ; preds = %if.then17.2, %if.en
 }
 
 attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"float", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll b/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
new file mode 100644
index 0000000..8da3c34
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
@@ -0,0 +1,41 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.DState = type { i32, i32 }
+
+@b = common global %struct.DState zeroinitializer, align 4
+@d = common global i32 0, align 4
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+@e = common global i32 0, align 4
+
+define i32 @fn1() {
+entry:
+  %0 = load i32* getelementptr inbounds (%struct.DState* @b, i32 0, i32 0), align 4
+  %1 = load i32* getelementptr inbounds (%struct.DState* @b, i32 0, i32 1), align 4
+  %2 = load i32* @d, align 4
+  %cond = icmp eq i32 %2, 0
+  br i1 %cond, label %sw.bb, label %save_state_and_return
+
+sw.bb:                                            ; preds = %entry
+  %3 = load i32* @c, align 4
+  %and = and i32 %3, 7
+  store i32 %and, i32* @a, align 4
+  switch i32 %and, label %if.end [
+    i32 7, label %save_state_and_return
+    i32 0, label %save_state_and_return
+  ]
+
+if.end:                                           ; preds = %sw.bb
+  br label %save_state_and_return
+
+save_state_and_return:                            ; preds = %sw.bb, %sw.bb, %if.end, %entry
+  %t.0 = phi i32 [ 0, %if.end ], [ %0, %entry ], [ %0, %sw.bb ], [ %0, %sw.bb ]
+  %f.0 = phi i32 [ 0, %if.end ], [ %1, %entry ], [ 0, %sw.bb ], [ 0, %sw.bb ]
+  store i32 %t.0, i32* getelementptr inbounds (%struct.DState* @b, i32 0, i32 0), align 4
+  store i32 %f.0, i32* getelementptr inbounds (%struct.DState* @b, i32 0, i32 1), align 4
+  ret i32 undef
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/debug_info.ll b/test/Transforms/SLPVectorizer/X86/debug_info.ll
index b408913..f4e68f2 100644
--- a/test/Transforms/SLPVectorizer/X86/debug_info.ll
+++ b/test/Transforms/SLPVectorizer/X86/debug_info.ll
@@ -13,13 +13,13 @@ target triple = "x86_64-apple-macosx10.7.0"
 ; }
 
 ;CHECK: @depth
-;CHECK: getelementptr inbounds {{.*}}, !dbg !24
-;CHECK: bitcast double* {{.*}}, !dbg !24
-;CHECK: load <2 x double>* {{.*}}, !dbg !24
-;CHECK: store <2 x double> {{.*}}, !dbg !26
+;CHECK: getelementptr inbounds {{.*}}, !dbg ![[LOC:[0-9]+]]
+;CHECK: bitcast double* {{.*}}, !dbg ![[LOC]]
+;CHECK: load <2 x double>* {{.*}}, !dbg ![[LOC]]
+;CHECK: store <2 x double> {{.*}}, !dbg ![[LOC2:[0-9]+]]
 ;CHECK: ret
-;CHECK: !24 = metadata !{i32 4, i32 0,
-;CHECK: !26 = metadata !{i32 7, i32 0,
+;CHECK: ![[LOC]] = metadata !{i32 4, i32 0,
+;CHECK: ![[LOC2]] = metadata !{i32 7, i32 0,
 
 define i32 @depth(double* nocapture %A, i32 %m) #0 {
 entry:
@@ -33,18 +33,18 @@ entry:
 
 for.body.lr.ph:                                   ; preds = %entry
   %arrayidx = getelementptr inbounds double* %A, i64 4, !dbg !24
-  %0 = load double* %arrayidx, align 8, !dbg !24, !tbaa !26
+  %0 = load double* %arrayidx, align 8, !dbg !24
   %arrayidx1 = getelementptr inbounds double* %A, i64 5, !dbg !29
-  %1 = load double* %arrayidx1, align 8, !dbg !29, !tbaa !26
+  %1 = load double* %arrayidx1, align 8, !dbg !29
   br label %for.end, !dbg !23
 
 for.end:                                          ; preds = %for.body.lr.ph, %entry
   %y1.0.lcssa = phi double [ %1, %for.body.lr.ph ], [ 1.000000e+00, %entry ]
   %y0.0.lcssa = phi double [ %0, %for.body.lr.ph ], [ 0.000000e+00, %entry ]
   %arrayidx2 = getelementptr inbounds double* %A, i64 8, !dbg !30
-  store double %y0.0.lcssa, double* %arrayidx2, align 8, !dbg !30, !tbaa !26
+  store double %y0.0.lcssa, double* %arrayidx2, align 8, !dbg !30
   %arrayidx3 = getelementptr inbounds double* %A, i64 9, !dbg !30
-  store double %y1.0.lcssa, double* %arrayidx3, align 8, !dbg !30, !tbaa !26
+  store double %y1.0.lcssa, double* %arrayidx3, align 8, !dbg !30
   ret i32 undef, !dbg !31
 }
 
@@ -55,7 +55,7 @@ attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-po
 attributes #1 = { nounwind readnone }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!18}
+!llvm.module.flags = !{!18, !32}
 
 !0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 187335) (llvm/trunk 187335:187340M)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/nadav/file.c] [DW_LANG_C99]
 !1 = metadata !{metadata !"file.c", metadata !"/Users/nadav"}
@@ -63,7 +63,7 @@ attributes #1 = { nounwind readnone }
 !3 = metadata !{metadata !4}
 !4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"depth", metadata !"depth", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (double*, i32)* @depth, null, null, metadata !11, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [depth]
 !5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [/Users/nadav/file.c]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !7 = metadata !{metadata !8, metadata !9, metadata !8}
 !8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
 !9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
@@ -83,9 +83,7 @@ attributes #1 = { nounwind readnone }
 !23 = metadata !{i32 3, i32 0, metadata !17, null}
 !24 = metadata !{i32 4, i32 0, metadata !25, null}
 !25 = metadata !{i32 786443, metadata !1, metadata !17, i32 3, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [/Users/nadav/file.c]
-!26 = metadata !{metadata !"double", metadata !27}
-!27 = metadata !{metadata !"omnipotent char", metadata !28}
-!28 = metadata !{metadata !"Simple C/C++ TBAA"}
 !29 = metadata !{i32 5, i32 0, metadata !25, null}
 !30 = metadata !{i32 7, i32 0, metadata !4, null}
 !31 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!32 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/SLPVectorizer/X86/external_user.ll b/test/Transforms/SLPVectorizer/X86/external_user.ll
index 22f0e64..6d09aa6 100644
--- a/test/Transforms/SLPVectorizer/X86/external_user.ll
+++ b/test/Transforms/SLPVectorizer/X86/external_user.ll
@@ -59,3 +59,38 @@ for.end:                                          ; preds = %for.body
   ret double %mul3
 }
 
+; A need-to-gather entry cannot be an external use of the scalar element.
+; Instead the insertelement instructions of the need-to-gather entry are the
+; external users.
+; This test would assert because we would keep the scalar fpext and fadd alive.
+; PR18129
+
+; CHECK-LABEL: needtogather
+define i32 @needtogather(double *noalias %a, i32 *noalias %b,  float * noalias %c,
+                i32 * noalias %d) {
+entry:
+  %0 = load i32* %d, align 4
+  %conv = sitofp i32 %0 to float
+  %1 = load float* %c
+  %sub = fsub float 0.000000e+00, %1
+  %mul = fmul float %sub, 0.000000e+00
+  %add = fadd float %conv, %mul
+  %conv1 = fpext float %add to double
+  %sub3 = fsub float 1.000000e+00, %1
+  %mul4 = fmul float %sub3, 0.000000e+00
+  %add5 = fadd float %conv, %mul4
+  %conv6 = fpext float %add5 to double
+  %tobool = fcmp une float %add, 0.000000e+00
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+  br label %if.end
+
+if.end:
+  %storemerge = phi double [ %conv6, %if.then ], [ %conv1, %entry ]
+  %e.0 = phi double [ %conv1, %if.then ], [ %conv6, %entry ]
+  store double %storemerge, double* %a, align 8
+  %conv7 = fptosi double %e.0 to i32
+  store i32 %conv7, i32* %b, align 4
+  ret i32 undef
+}
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll
new file mode 100644
index 0000000..8f91951
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -0,0 +1,417 @@
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSTORE
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; #include <stdint.h>
+;
+; int foo(float *A, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += 7*A[i*4  ] +
+;            7*A[i*4+1] +
+;            7*A[i*4+2] +
+;            7*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; NOSTORE-LABEL: add_red
+; NOSTORE: fmul <4 x float>
+; NOSTORE: shufflevector <4 x float>
+
+define i32 @add_red(float* %A, i32 %n) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
+  %mul = shl nsw i64 %i.033, 2
+  %arrayidx = getelementptr inbounds float* %A, i64 %mul
+  %1 = load float* %arrayidx, align 4
+  %mul2 = fmul float %1, 7.000000e+00
+  %add28 = or i64 %mul, 1
+  %arrayidx4 = getelementptr inbounds float* %A, i64 %add28
+  %2 = load float* %arrayidx4, align 4
+  %mul5 = fmul float %2, 7.000000e+00
+  %add6 = fadd fast float %mul2, %mul5
+  %add829 = or i64 %mul, 2
+  %arrayidx9 = getelementptr inbounds float* %A, i64 %add829
+  %3 = load float* %arrayidx9, align 4
+  %mul10 = fmul float %3, 7.000000e+00
+  %add11 = fadd fast float %add6, %mul10
+  %add1330 = or i64 %mul, 3
+  %arrayidx14 = getelementptr inbounds float* %A, i64 %add1330
+  %4 = load float* %arrayidx14, align 4
+  %mul15 = fmul float %4, 7.000000e+00
+  %add16 = fadd fast float %add11, %mul15
+  %add17 = fadd fast float %sum.032, %add16
+  %inc = add nsw i64 %i.033, 1
+  %exitcond = icmp eq i64 %inc, %0
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %add17 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum *= B[0]*A[i*4  ] +
+;       B[1]*A[i*4+1] +
+;       B[2]*A[i*4+2] +
+;       B[3]*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: mul_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
+entry:
+  %cmp38 = icmp sgt i32 %n, 0
+  br i1 %cmp38, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load float* %B, align 4
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %1 = load float* %arrayidx4, align 4
+  %arrayidx9 = getelementptr inbounds float* %B, i64 2
+  %2 = load float* %arrayidx9, align 4
+  %arrayidx15 = getelementptr inbounds float* %B, i64 3
+  %3 = load float* %arrayidx15, align 4
+  %4 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
+  %mul = shl nsw i64 %i.040, 2
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %5 = load float* %arrayidx2, align 4
+  %mul3 = fmul float %0, %5
+  %add35 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds float* %A, i64 %add35
+  %6 = load float* %arrayidx6, align 4
+  %mul7 = fmul float %1, %6
+  %add8 = fadd fast float %mul3, %mul7
+  %add1136 = or i64 %mul, 2
+  %arrayidx12 = getelementptr inbounds float* %A, i64 %add1136
+  %7 = load float* %arrayidx12, align 4
+  %mul13 = fmul float %2, %7
+  %add14 = fadd fast float %add8, %mul13
+  %add1737 = or i64 %mul, 3
+  %arrayidx18 = getelementptr inbounds float* %A, i64 %add1737
+  %8 = load float* %arrayidx18, align 4
+  %mul19 = fmul float %3, %8
+  %add20 = fadd fast float %add14, %mul19
+  %mul21 = fmul float %sum.039, %add20
+  %inc = add nsw i64 %i.040, 1
+  %exitcond = icmp eq i64 %inc, %4
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %mul21 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += B[0]*A[i*6  ] +
+;            B[1]*A[i*6+1] +
+;            B[2]*A[i*6+2] +
+;            B[3]*A[i*6+3] +
+;            B[4]*A[i*6+4] +
+;            B[5]*A[i*6+5] +
+;            B[6]*A[i*6+6] +
+;            B[7]*A[i*6+7] +
+;            B[8]*A[i*6+8];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: long_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
+entry:
+  %cmp81 = icmp sgt i32 %n, 0
+  br i1 %cmp81, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load float* %B, align 4
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %1 = load float* %arrayidx4, align 4
+  %arrayidx9 = getelementptr inbounds float* %B, i64 2
+  %2 = load float* %arrayidx9, align 4
+  %arrayidx15 = getelementptr inbounds float* %B, i64 3
+  %3 = load float* %arrayidx15, align 4
+  %arrayidx21 = getelementptr inbounds float* %B, i64 4
+  %4 = load float* %arrayidx21, align 4
+  %arrayidx27 = getelementptr inbounds float* %B, i64 5
+  %5 = load float* %arrayidx27, align 4
+  %arrayidx33 = getelementptr inbounds float* %B, i64 6
+  %6 = load float* %arrayidx33, align 4
+  %arrayidx39 = getelementptr inbounds float* %B, i64 7
+  %7 = load float* %arrayidx39, align 4
+  %arrayidx45 = getelementptr inbounds float* %B, i64 8
+  %8 = load float* %arrayidx45, align 4
+  %9 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
+  %mul = mul nsw i64 %i.083, 6
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %10 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %10
+  %add80 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds float* %A, i64 %add80
+  %11 = load float* %arrayidx6, align 4
+  %mul7 = fmul fast float %1, %11
+  %add8 = fadd fast float %mul3, %mul7
+  %add11 = add nsw i64 %mul, 2
+  %arrayidx12 = getelementptr inbounds float* %A, i64 %add11
+  %12 = load float* %arrayidx12, align 4
+  %mul13 = fmul fast float %2, %12
+  %add14 = fadd fast float %add8, %mul13
+  %add17 = add nsw i64 %mul, 3
+  %arrayidx18 = getelementptr inbounds float* %A, i64 %add17
+  %13 = load float* %arrayidx18, align 4
+  %mul19 = fmul fast float %3, %13
+  %add20 = fadd fast float %add14, %mul19
+  %add23 = add nsw i64 %mul, 4
+  %arrayidx24 = getelementptr inbounds float* %A, i64 %add23
+  %14 = load float* %arrayidx24, align 4
+  %mul25 = fmul fast float %4, %14
+  %add26 = fadd fast float %add20, %mul25
+  %add29 = add nsw i64 %mul, 5
+  %arrayidx30 = getelementptr inbounds float* %A, i64 %add29
+  %15 = load float* %arrayidx30, align 4
+  %mul31 = fmul fast float %5, %15
+  %add32 = fadd fast float %add26, %mul31
+  %add35 = add nsw i64 %mul, 6
+  %arrayidx36 = getelementptr inbounds float* %A, i64 %add35
+  %16 = load float* %arrayidx36, align 4
+  %mul37 = fmul fast float %6, %16
+  %add38 = fadd fast float %add32, %mul37
+  %add41 = add nsw i64 %mul, 7
+  %arrayidx42 = getelementptr inbounds float* %A, i64 %add41
+  %17 = load float* %arrayidx42, align 4
+  %mul43 = fmul fast float %7, %17
+  %add44 = fadd fast float %add38, %mul43
+  %add47 = add nsw i64 %mul, 8
+  %arrayidx48 = getelementptr inbounds float* %A, i64 %add47
+  %18 = load float* %arrayidx48, align 4
+  %mul49 = fmul fast float %8, %18
+  %add50 = fadd fast float %add44, %mul49
+  %add51 = fadd fast float %sum.082, %add50
+  %inc = add nsw i64 %i.083, 1
+  %exitcond = icmp eq i64 %inc, %9
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %add51 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     sum += B[0]*A[i*4  ];
+;     sum += B[1]*A[i*4+1];
+;     sum += B[2]*A[i*4+2];
+;     sum += B[3]*A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: chain_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
+entry:
+  %cmp41 = icmp sgt i32 %n, 0
+  br i1 %cmp41, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load float* %B, align 4
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %1 = load float* %arrayidx4, align 4
+  %arrayidx10 = getelementptr inbounds float* %B, i64 2
+  %2 = load float* %arrayidx10, align 4
+  %arrayidx16 = getelementptr inbounds float* %B, i64 3
+  %3 = load float* %arrayidx16, align 4
+  %4 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
+  %mul = shl nsw i64 %i.043, 2
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %5 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %0, %5
+  %add = fadd fast float %sum.042, %mul3
+  %add638 = or i64 %mul, 1
+  %arrayidx7 = getelementptr inbounds float* %A, i64 %add638
+  %6 = load float* %arrayidx7, align 4
+  %mul8 = fmul fast float %1, %6
+  %add9 = fadd fast float %add, %mul8
+  %add1239 = or i64 %mul, 2
+  %arrayidx13 = getelementptr inbounds float* %A, i64 %add1239
+  %7 = load float* %arrayidx13, align 4
+  %mul14 = fmul fast float %2, %7
+  %add15 = fadd fast float %add9, %mul14
+  %add1840 = or i64 %mul, 3
+  %arrayidx19 = getelementptr inbounds float* %A, i64 %add1840
+  %8 = load float* %arrayidx19, align 4
+  %mul20 = fmul fast float %3, %8
+  %add21 = fadd fast float %add15, %mul20
+  %inc = add nsw i64 %i.043, 1
+  %exitcond = icmp eq i64 %inc, %4
+  br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %phitmp = fptosi float %add21 to i32
+  br label %for.end
+
+for.end:
+  %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+  ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
+;   float sum = 0;
+;   for (intptr_t i=0; i < n; ++i) {
+;     C[i] = B[0] *A[i*4  ] +
+;          B[1] *A[i*4+1] +
+;          B[2] *A[i*4+2] +
+;          B[3] *A[i*4+3];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: store_red
+; CHECK: fmul <4 x float>
+; CHECK: shufflevector <4 x float>
+
+define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
+entry:
+  %cmp37 = icmp sgt i32 %n, 0
+  br i1 %cmp37, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %arrayidx4 = getelementptr inbounds float* %B, i64 1
+  %arrayidx9 = getelementptr inbounds float* %B, i64 2
+  %arrayidx15 = getelementptr inbounds float* %B, i64 3
+  %0 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+  %1 = load float* %B, align 4
+  %mul = shl nsw i64 %i.039, 2
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %mul
+  %2 = load float* %arrayidx2, align 4
+  %mul3 = fmul fast float %1, %2
+  %3 = load float* %arrayidx4, align 4
+  %add34 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds float* %A, i64 %add34
+  %4 = load float* %arrayidx6, align 4
+  %mul7 = fmul fast float %3, %4
+  %add8 = fadd fast float %mul3, %mul7
+  %5 = load float* %arrayidx9, align 4
+  %add1135 = or i64 %mul, 2
+  %arrayidx12 = getelementptr inbounds float* %A, i64 %add1135
+  %6 = load float* %arrayidx12, align 4
+  %mul13 = fmul fast float %5, %6
+  %add14 = fadd fast float %add8, %mul13
+  %7 = load float* %arrayidx15, align 4
+  %add1736 = or i64 %mul, 3
+  %arrayidx18 = getelementptr inbounds float* %A, i64 %add1736
+  %8 = load float* %arrayidx18, align 4
+  %mul19 = fmul fast float %7, %8
+  %add20 = fadd fast float %add14, %mul19
+  store float %add20, float* %C.addr.038, align 4
+  %incdec.ptr = getelementptr inbounds float* %C.addr.038, i64 1
+  %inc = add nsw i64 %i.039, 1
+  %exitcond = icmp eq i64 %inc, %0
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 0
+}
+
+
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S <  %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
+
+; void foo(double * restrict A, double * restrict B, double * restrict C,
+;          int n) {
+;   for (intptr_t i=0; i < n; ++i) {
+;     C[i] = B[0] *A[i*4  ] + B[1] *A[i*4+1];
+;   }
+; }
+
+; STORE-LABEL: store_red_double
+; STORE: fmul <2 x double>
+; STORE: extractelement <2 x double>
+; STORE: extractelement <2 x double>
+
+define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
+entry:
+  %cmp17 = icmp sgt i32 %n, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %0 = load double* %B, align 8
+  %arrayidx4 = getelementptr inbounds double* %B, i64 1
+  %1 = load double* %arrayidx4, align 8
+  %2 = sext i32 %n to i64
+  br label %for.body
+
+for.body:
+  %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %mul = shl nsw i64 %i.018, 2
+  %arrayidx2 = getelementptr inbounds double* %A, i64 %mul
+  %3 = load double* %arrayidx2, align 8
+  %mul3 = fmul fast double %0, %3
+  %add16 = or i64 %mul, 1
+  %arrayidx6 = getelementptr inbounds double* %A, i64 %add16
+  %4 = load double* %arrayidx6, align 8
+  %mul7 = fmul fast double %1, %4
+  %add8 = fadd fast double %mul3, %mul7
+  %arrayidx9 = getelementptr inbounds double* %C, i64 %i.018
+  store double %add8, double* %arrayidx9, align 8
+  %inc = add nsw i64 %i.018, 1
+  %exitcond = icmp eq i64 %inc, %2
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
new file mode 100644
index 0000000..43f7aed
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -0,0 +1,197 @@
+; RUN: opt -S -slp-vectorizer -slp-threshold=-10000 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"
+
+target triple = "x86_64-apple-macosx10.8.0"
+
+define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select(
+; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
+; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> undef, float %s0, i32 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  ret <4 x float> %rd
+}
+
+; Insert in an order different from the vector indices to make sure it
+; doesn't matter
+define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_insert_out_of_order(
+; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
+; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> undef, float %s0, i32 2
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 0
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  ret <4 x float> %rd
+}
+
+declare void @v4f32_user(<4 x float>) #0
+declare void @f32_user(float) #0
+
+; Multiple users of the final constructed vector
+define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_users(
+; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
+; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> undef, float %s0, i32 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  call void @v4f32_user(<4 x float> %rd) #0
+  ret <4 x float> %rd
+}
+
+; Unused insertelement
+define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_no_users(
+; CHECK-NOT: icmp ne <4 x i32>
+; CHECK-NOT: select <4 x i1>
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %a2 = extractelement <4 x float> %a, i32 2
+  %a3 = extractelement <4 x float> %a, i32 3
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %cmp2 = icmp ne i32 %c2, 0
+  %cmp3 = icmp ne i32 %c3, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %s2 = select i1 %cmp2, float %a2, float %b2
+  %s3 = select i1 %cmp3, float %a3, float %b3
+  %ra = insertelement <4 x float> undef, float %s0, i32 0
+  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+  %rc = insertelement <4 x float> undef, float %s2, i32 2
+  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+  ret <4 x float> %rd
+}
+
+; Make sure infinite loop doesn't happen which I ran into when trying
+; to do this backwards this backwards
+define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
+; CHECK-LABEL: @reconstruct(
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %c2 = extractelement <4 x i32> %c, i32 2
+  %c3 = extractelement <4 x i32> %c, i32 3
+  %ra = insertelement <4 x i32> undef, i32 %c0, i32 0
+  %rb = insertelement <4 x i32> %ra, i32 %c1, i32 1
+  %rc = insertelement <4 x i32> %rb, i32 %c2, i32 2
+  %rd = insertelement <4 x i32> %rc, i32 %c3, i32 3
+  ret <4 x i32> %rd
+}
+
+define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_v2(
+; CHECK: icmp ne <2 x i32>
+; CHECK: select <2 x i1>
+  %c0 = extractelement <2 x i32> %c, i32 0
+  %c1 = extractelement <2 x i32> %c, i32 1
+  %a0 = extractelement <2 x float> %a, i32 0
+  %a1 = extractelement <2 x float> %a, i32 1
+  %b0 = extractelement <2 x float> %b, i32 0
+  %b1 = extractelement <2 x float> %b, i32 1
+  %cmp0 = icmp ne i32 %c0, 0
+  %cmp1 = icmp ne i32 %c1, 0
+  %s0 = select i1 %cmp0, float %a0, float %b0
+  %s1 = select i1 %cmp1, float %a1, float %b1
+  %ra = insertelement <2 x float> undef, float %s0, i32 0
+  %rb = insertelement <2 x float> %ra, float %s1, i32 1
+  ret <2 x float> %rb
+}
+
+; Make sure when we construct partial vectors, we don't keep
+; re-visiting the insertelement chains starting with undef
+; (low cost threshold needed to force this to happen)
+define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+  %c0 = extractelement <4 x i32> %c, i32 0
+  %c1 = extractelement <4 x i32> %c, i32 1
+  %a0 = extractelement <4 x float> %a, i32 0
+  %a1 = extractelement <4 x float> %a, i32 1
+  %b0 = extractelement <4 x float> %b, i32 0
+  %b1 = extractelement <4 x float> %b, i32 1
+  %1 = insertelement <2 x i32> undef, i32 %c0, i32 0
+  %2 = insertelement <2 x i32> %1, i32 %c1, i32 1
+  %3 = icmp ne <2 x i32> %2, zeroinitializer
+  %4 = insertelement <2 x float> undef, float %a0, i32 0
+  %5 = insertelement <2 x float> %4, float %a1, i32 1
+  %6 = insertelement <2 x float> undef, float %b0, i32 0
+  %7 = insertelement <2 x float> %6, float %b1, i32 1
+  %8 = select <2 x i1> %3, <2 x float> %5, <2 x float> %7
+  %9 = extractelement <2 x float> %8, i32 0
+  %ra = insertelement <4 x float> undef, float %9, i32 0
+  %10 = extractelement <2 x float> %8, i32 1
+  %rb = insertelement <4 x float> %ra, float %10, i32 1
+  ret <4 x float> %rb
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/SLPVectorizer/X86/lit.local.cfg b/test/Transforms/SLPVectorizer/X86/lit.local.cfg
index a8ad0f1..ba763cf 100644
--- a/test/Transforms/SLPVectorizer/X86/lit.local.cfg
+++ b/test/Transforms/SLPVectorizer/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Transforms/SLPVectorizer/X86/operandorder.ll b/test/Transforms/SLPVectorizer/X86/operandorder.ll
new file mode 100644
index 0000000..c5322a8
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -0,0 +1,234 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -instcombine -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+
+
+
+; Make sure we order the operands of commutative operations so that we get
+; bigger vectorizable trees.
+
+; CHECK-LABEL: shuffle_operands1
+; CHECK:         load <2 x double>
+; CHECK:         fadd <2 x double>
+
+define void @shuffle_operands1(double * noalias %from, double * noalias %to,
+                               double %v1, double %v2) {
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %v0_1, %v1
+  %v1_2 = fadd double %v2, %v0_2
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+  ret void
+}
+
+; CHECK-LABEL: shuffle_preserve_broadcast
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %v0_1, %p
+  %v1_2 = fadd double %v0_1, %v0_2
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+; CHECK-LABEL: shuffle_preserve_broadcast2
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast2(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %p, %v0_1
+  %v1_2 = fadd double %v0_2, %v0_1
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+; CHECK-LABEL: shuffle_preserve_broadcast3
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast3(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %p, %v0_1
+  %v1_2 = fadd double %v0_1, %v0_2
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+
+; CHECK-LABEL: shuffle_preserve_broadcast4
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast4(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %v0_2, %v0_1
+  %v1_2 = fadd double %p, %v0_1
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+; CHECK-LABEL: shuffle_preserve_broadcast5
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast5(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %v0_1, %v0_2
+  %v1_2 = fadd double %p, %v0_1
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+
+; CHECK-LABEL: shuffle_preserve_broadcast6
+; CHECK: %[[BCAST:[a-z0-9]+]] = insertelement <2 x double> undef, double %v0_1
+; CHECK:                      = insertelement <2 x double> %[[BCAST]], double %v0_1
+define void @shuffle_preserve_broadcast6(double * noalias %from,
+                                        double * noalias %to,
+                                        double %v1, double %v2) {
+entry:
+br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double *%from, i64 1
+  %v0_1 = load double * %from
+  %v0_2 = load double * %from_1
+  %v1_1 = fadd double %v0_1, %v0_2
+  %v1_2 = fadd double %v0_1, %p
+  %to_2 = getelementptr double * %to, i64 1
+  store double %v1_1, double *%to
+  store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+  ret void
+}
+
+; Make sure we don't scramble operands when we reorder them and destroy
+; 'good' source order.
+
+; CHECK-LABEL: good_load_order
+
+; CHECK: %[[V1:[0-9]+]] = load <4 x float>*
+; CHECK: %[[V2:[0-9]+]] = insertelement <4 x float> undef, float %1, i32 0
+; CHECK: %[[V3:[0-9]+]] = shufflevector <4 x float> %[[V2]], <4 x float> %[[V1]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+; CHECK:                = fmul <4 x float> %[[V1]], %[[V3]]
+
+@a = common global [32000 x float] zeroinitializer, align 16
+
+define void @good_load_order() {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %0 = load float* getelementptr inbounds ([32000 x float]* @a, i64 0, i64 0), align 16
+  br label %for.body3
+
+for.body3:
+  %1 = phi float [ %0, %for.cond1.preheader ], [ %10, %for.body3 ]
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+  %2 = add nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %2
+  %3 = load float* %arrayidx, align 4
+  %arrayidx5 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %indvars.iv
+  %mul6 = fmul float %3, %1
+  store float %mul6, float* %arrayidx5, align 4
+  %4 = add nsw i64 %indvars.iv, 2
+  %arrayidx11 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %4
+  %5 = load float* %arrayidx11, align 4
+  %mul15 = fmul float %5, %3
+  store float %mul15, float* %arrayidx, align 4
+  %6 = add nsw i64 %indvars.iv, 3
+  %arrayidx21 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %6
+  %7 = load float* %arrayidx21, align 4
+  %mul25 = fmul float %7, %5
+  store float %mul25, float* %arrayidx11, align 4
+  %8 = add nsw i64 %indvars.iv, 4
+  %arrayidx31 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %8
+  %9 = load float* %arrayidx31, align 4
+  %mul35 = fmul float %9, %7
+  store float %mul35, float* %arrayidx21, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
+  %arrayidx41 = getelementptr inbounds [32000 x float]* @a, i64 0, i64 %indvars.iv.next
+  %10 = load float* %arrayidx41, align 4
+  %mul45 = fmul float %10, %9
+  store float %mul45, float* %arrayidx31, align 4
+  %11 = trunc i64 %indvars.iv.next to i32
+  %cmp2 = icmp slt i32 %11, 31995
+  br i1 %cmp2, label %for.body3, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/opt.ll b/test/Transforms/SLPVectorizer/X86/opt.ll
new file mode 100644
index 0000000..14137c1
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/opt.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -O3 -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=SLP
+; RUN: opt < %s -O3 -disable-slp-vectorization -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSLP
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Make sure we can disable slp vectorization in opt.
+
+; SLP-LABEL: test1
+; SLP: store <2 x double>
+
+; NOSLP-LABEL: test1
+; NOSLP-NOT: store <2 x double>
+
+
+define void @test1(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/ordering.ll b/test/Transforms/SLPVectorizer/X86/ordering.ll
index 588e115..d2ecd45 100644
--- a/test/Transforms/SLPVectorizer/X86/ordering.ll
+++ b/test/Transforms/SLPVectorizer/X86/ordering.ll
@@ -17,3 +17,65 @@ entry:
   %cmp11 = fcmp olt double %add, 0.000000e+00
   ret void
 }
+
+declare i8* @objc_msgSend(i8*, i8*, ...)
+declare i32 @personality_v0(...)
+
+define void @invoketest() {
+entry:
+  br i1 undef, label %cond.true, label %cond.false
+
+cond.true:
+  %call49 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef) 
+          to label %cond.true54 unwind label %lpad
+
+cond.false:
+  %call51 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %cond.false57 unwind label %lpad
+
+cond.true54:
+  %call56 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef) 
+          to label %cond.end60 unwind label %lpad
+
+cond.false57:
+  %call59 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %cond.end60 unwind label %lpad
+
+; Make sure we don't vectorize these phis - they have invokes as inputs.
+
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+; CHECK-LABEL: invoketest
+
+; CHECK-LABEL: cond.end60
+; CHECK-NEXT-NOT: phi <2 x double>
+; CHECK: insertelement
+; CHECK-LABEL: if.then63
+
+cond.end60:
+  %cond126 = phi double [ %call49, %cond.true54 ], [ %call51, %cond.false57 ]
+  %cond61 = phi double [ %call56, %cond.true54 ], [ %call59, %cond.false57 ]
+  br i1 undef, label %if.end98, label %if.then63
+
+if.then63:
+  %conv69 = fptrunc double undef to float
+  %conv70 = fpext float %conv69 to double
+  %div71 = fdiv double %cond126, %conv70
+  %conv78 = fptrunc double undef to float
+  %conv79 = fpext float %conv78 to double
+  %div80 = fdiv double %cond61, %conv79
+  br label %if.end98
+
+lpad:
+  %l = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @personality_v0 to i8*)
+          cleanup
+  resume { i8*, i32 } %l
+
+if.end98:
+  %dimensionsResult.sroa.0.0 = phi double [ %div71, %if.then63 ], [ %cond126, %cond.end60 ]
+  %dimensionsResult.sroa.6.0 = phi double [ %div80, %if.then63 ], [ %cond61, %cond.end60 ]
+  br label %if.end99
+
+if.end99:
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/phi.ll b/test/Transforms/SLPVectorizer/X86/phi.ll
index 1c7f9cc..964e0e4 100644
--- a/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
 target triple = "i386-apple-macosx10.9.0"
@@ -95,3 +95,154 @@ for.end:                                          ; preds = %for.body
   ret i32 0
 }
 
+; float foo3(float *A) {
+;
+;   float R = A[0];
+;   float G = A[1];
+;   float B = A[2];
+;   float Y = A[3];
+;   float P = A[4];
+;   for (int i=0; i < 121; i+=3) {
+;     R+=A[i+0]*7;
+;     G+=A[i+1]*8;
+;     B+=A[i+2]*9;
+;     Y+=A[i+3]*10;
+;     P+=A[i+4]*11;
+;   }
+;
+;   return R+G+B+Y+P;
+; }
+
+;CHECK: foo3
+;CHECK: phi <4 x float>
+;CHECK: fmul <4 x float>
+;CHECK: fadd <4 x float>
+;CHECK-NOT: phi <5 x float>
+;CHECK-NOT: fmul <5 x float>
+;CHECK-NOT: fadd <5 x float>
+
+define float @foo3(float* nocapture readonly %A) #0 {
+entry:
+  %0 = load float* %A, align 4
+  %arrayidx1 = getelementptr inbounds float* %A, i64 1
+  %1 = load float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float* %A, i64 2
+  %2 = load float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float* %A, i64 3
+  %3 = load float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float* %A, i64 4
+  %4 = load float* %arrayidx4, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %P.056 = phi float [ %4, %entry ], [ %add26, %for.body ]
+  %Y.055 = phi float [ %3, %entry ], [ %add21, %for.body ]
+  %B.054 = phi float [ %2, %entry ], [ %add16, %for.body ]
+  %G.053 = phi float [ %1, %entry ], [ %add11, %for.body ]
+  %R.052 = phi float [ %0, %entry ], [ %add6, %for.body ]
+  %5 = phi float [ %1, %entry ], [ %11, %for.body ]
+  %6 = phi float [ %0, %entry ], [ %9, %for.body ]
+  %mul = fmul float %6, 7.000000e+00
+  %add6 = fadd float %R.052, %mul
+  %mul10 = fmul float %5, 8.000000e+00
+  %add11 = fadd float %G.053, %mul10
+  %7 = add nsw i64 %indvars.iv, 2
+  %arrayidx14 = getelementptr inbounds float* %A, i64 %7
+  %8 = load float* %arrayidx14, align 4
+  %mul15 = fmul float %8, 9.000000e+00
+  %add16 = fadd float %B.054, %mul15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx19 = getelementptr inbounds float* %A, i64 %indvars.iv.next
+  %9 = load float* %arrayidx19, align 4
+  %mul20 = fmul float %9, 1.000000e+01
+  %add21 = fadd float %Y.055, %mul20
+  %10 = add nsw i64 %indvars.iv, 4
+  %arrayidx24 = getelementptr inbounds float* %A, i64 %10
+  %11 = load float* %arrayidx24, align 4
+  %mul25 = fmul float %11, 1.100000e+01
+  %add26 = fadd float %P.056, %mul25
+  %12 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %12, 121
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add28 = fadd float %add6, %add11
+  %add29 = fadd float %add28, %add16
+  %add30 = fadd float %add29, %add21
+  %add31 = fadd float %add30, %add26
+  ret float %add31
+}
+
+; Make sure the order of phi nodes of different types does not prevent
+; vectorization of same typed phi nodes.
+; CHECK-LABEL: sort_phi_type
+; CHECK: phi <4 x float>
+; CHECK: fmul <4 x float>
+
+define float @sort_phi_type(float* nocapture readonly %A) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %Y = phi float [ 1.000000e+01, %entry ], [ %mul10, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %B = phi float [ 1.000000e+01, %entry ], [ %mul15, %for.body ]
+  %G = phi float [ 1.000000e+01, %entry ], [ %mul20, %for.body ]
+  %R = phi float [ 1.000000e+01, %entry ], [ %mul25, %for.body ]
+  %mul10 = fmul float %Y, 8.000000e+00
+  %mul15 = fmul float %B, 9.000000e+00
+  %mul20 = fmul float %R, 10.000000e+01
+  %mul25 = fmul float %G, 11.100000e+01
+  %indvars.iv.next = add nsw i64 %indvars.iv, 4
+  %cmp = icmp slt i64 %indvars.iv.next, 128
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %add28 = fadd float 1.000000e+01, %mul10
+  %add29 = fadd float %mul10, %mul15
+  %add30 = fadd float %add29, %mul20
+  %add31 = fadd float %add30, %mul25
+  ret float %add31
+}
+
+define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
+; CHECK-LABEL: @test(
+;
+; Test that we correctly recognize the discontiguous memory in arrays where the
+; size is less than the alignment, and through various different GEP formations.
+;
+; We disable the vectorization of x86_fp80 for now. 
+
+entry:
+  %i1.0 = load x86_fp80* %i1, align 16
+  %i1.gep1 = getelementptr x86_fp80* %i1, i64 1
+  %i1.1 = load x86_fp80* %i1.gep1, align 16
+; CHECK: load x86_fp80*
+; CHECK: load x86_fp80*
+; CHECK-NOT: insertelement <2 x x86_fp80>
+; CHECK_NOT: insertelement <2 x x86_fp80>
+  br i1 undef, label %then, label %end
+
+then:
+  %i2.gep0 = getelementptr inbounds x86_fp80* %i2, i64 0
+  %i2.0 = load x86_fp80* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds x86_fp80* %i2, i64 1
+  %i2.1 = load x86_fp80* %i2.gep1, align 16
+; CHECK: load x86_fp80*
+; CHECK: load x86_fp80*
+; CHECK-NOT: insertelement <2 x x86_fp80>
+; CHECK-NOT: insertelement <2 x x86_fp80>
+  br label %end
+
+end:
+  %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
+  %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
+; CHECK-NOT: phi <2 x x86_fp80>
+; CHECK-NOT: extractelement <2 x x86_fp80>
+; CHECK-NOT: extractelement <2 x x86_fp80>
+  store x86_fp80 %phi0, x86_fp80* %o, align 16
+  %o.gep1 = getelementptr inbounds x86_fp80* %o, i64 1
+  store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll b/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll
new file mode 100644
index 0000000..6d2d5e3
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-apple-macosx10.9.0 -disable-output
+
+target datalayout = "f64:64:64-v64:64:64"
+
+define void @test_phi_in_landingpad() {
+entry:
+  invoke void @foo()
+          to label %inner unwind label %lpad
+
+inner:
+  %x0 = fsub double undef, undef
+  %y0 = fsub double undef, undef
+  invoke void @foo()
+          to label %done unwind label %lpad
+
+lpad:
+  %x1 = phi double [ undef, %entry ], [ undef, %inner ]
+  %y1 = phi double [ undef, %entry ], [ undef, %inner ]
+  landingpad { i8*, i32 } personality i8*
+          bitcast (i32 (...)* @__gxx_personality_v0 to i8*) catch i8* null
+  br label %done
+
+done:
+  phi double [ %x0, %inner ], [ %x1, %lpad ]
+  phi double [ %y0, %inner ], [ %y1, %lpad ]
+  ret void
+}
+
+declare void @foo()
+
+declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
new file mode 100644
index 0000000..520e672
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+; We purposely over-align f64 to 128bit here. 
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:128:128-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+
+define void @test(double* %i1, double* %i2, double* %o) {
+; CHECK-LABEL: @test(
+;
+; Test that we correctly recognize the discontiguous memory in arrays where the
+; size is less than the alignment, and through various different GEP formations.
+
+entry:
+  %i1.0 = load double* %i1, align 16
+  %i1.gep1 = getelementptr double* %i1, i64 1
+  %i1.1 = load double* %i1.gep1, align 16
+; CHECK: load double*
+; CHECK: load double*
+; CHECK: insertelement <2 x double>
+; CHECK: insertelement <2 x double>
+  br i1 undef, label %then, label %end
+
+then:
+  %i2.gep0 = getelementptr inbounds double* %i2, i64 0
+  %i2.0 = load double* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds double* %i2, i64 1
+  %i2.1 = load double* %i2.gep1, align 16
+; CHECK: load double*
+; CHECK: load double*
+; CHECK: insertelement <2 x double>
+; CHECK: insertelement <2 x double>
+  br label %end
+
+end:
+  %phi0 = phi double [ %i1.0, %entry ], [ %i2.0, %then ]
+  %phi1 = phi double [ %i1.1, %entry ], [ %i2.1, %then ]
+; CHECK: phi <2 x double>
+; CHECK: extractelement <2 x double>
+; CHECK: extractelement <2 x double>
+  store double %phi0, double* %o, align 16
+  %o.gep1 = getelementptr inbounds double* %o, i64 1
+  store double %phi1, double* %o.gep1, align 16
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/pr16899.ll b/test/Transforms/SLPVectorizer/X86/pr16899.ll
new file mode 100644
index 0000000..8631bc9
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/pr16899.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s  -slp-vectorizer -S -mtriple=i386--netbsd -mcpu=i486
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+@a = common global i32* null, align 4
+
+; Function Attrs: noreturn nounwind readonly
+define i32 @fn1() #0 {
+entry:
+  %0 = load i32** @a, align 4, !tbaa !4
+  %1 = load i32* %0, align 4, !tbaa !5
+  %arrayidx1 = getelementptr inbounds i32* %0, i32 1
+  %2 = load i32* %arrayidx1, align 4, !tbaa !5
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %c.0 = phi i32 [ %2, %entry ], [ %add2, %do.body ]
+  %b.0 = phi i32 [ %1, %entry ], [ %add, %do.body ]
+  %add = add nsw i32 %b.0, %c.0
+  %add2 = add nsw i32 %add, 1
+  br label %do.body
+}
+
+attributes #0 = { noreturn nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"int", metadata !1}
+!4 = metadata !{metadata !0, metadata !0, i64 0}
+!5 = metadata !{metadata !3, metadata !3, i64 0}
diff --git a/test/Transforms/SLPVectorizer/X86/pr18060.ll b/test/Transforms/SLPVectorizer/X86/pr18060.ll
new file mode 100644
index 0000000..e6813f3
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/pr18060.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -slp-vectorizer -S -mtriple=i386-pc-linux
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-pc-linux"
+
+; Function Attrs: nounwind
+define i32 @_Z16adjustFixupValueyj(i64 %Value, i32 %Kind) {
+entry:
+  %extract.t = trunc i64 %Value to i32
+  %extract = lshr i64 %Value, 12
+  %extract.t6 = trunc i64 %extract to i32
+  switch i32 %Kind, label %sw.default [
+    i32 0, label %return
+    i32 1, label %return
+    i32 129, label %sw.bb1
+    i32 130, label %sw.bb2
+  ]
+
+sw.default:                                       ; preds = %entry
+  call void @_Z25llvm_unreachable_internalv()
+  unreachable
+
+sw.bb1:                                           ; preds = %entry
+  %shr = lshr i64 %Value, 16
+  %extract.t5 = trunc i64 %shr to i32
+  %extract7 = lshr i64 %Value, 28
+  %extract.t8 = trunc i64 %extract7 to i32
+  br label %sw.bb2
+
+sw.bb2:                                           ; preds = %sw.bb1, %entry
+  %Value.addr.0.off0 = phi i32 [ %extract.t, %entry ], [ %extract.t5, %sw.bb1 ]
+  %Value.addr.0.off12 = phi i32 [ %extract.t6, %entry ], [ %extract.t8, %sw.bb1 ]
+  %conv6 = and i32 %Value.addr.0.off0, 4095
+  %conv4 = shl i32 %Value.addr.0.off12, 16
+  %shl = and i32 %conv4, 983040
+  %or = or i32 %shl, %conv6
+  %or11 = or i32 %or, 8388608
+  br label %return
+
+return:                                           ; preds = %sw.bb2, %entry, %entry
+  %retval.0 = phi i32 [ %or11, %sw.bb2 ], [ %extract.t, %entry ], [ %extract.t, %entry ]
+  ret i32 %retval.0
+}
+
+; Function Attrs: noreturn
+declare void @_Z25llvm_unreachable_internalv()
+
diff --git a/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
index 3235fd9..6aea5d3 100644
--- a/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
+++ b/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
@@ -3,6 +3,8 @@
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
 target triple = "i386-apple-macosx10.9.0"
 
+; We disable the vectorization of <3 x float> for now
+
 ; float foo(float *A) {
 ;
 ;   float R = A[0];
@@ -19,14 +21,14 @@ target triple = "i386-apple-macosx10.9.0"
 
 ;CHECK-LABEL: @foo(
 ;CHECK: br
-;CHECK: phi <3 x float>
-;CHECK: fmul <3 x float>
-;CHECK: fadd <3 x float>
+;CHECK-NOT: phi <3 x float>
+;CHECK-NOT: fmul <3 x float>
+;CHECK-NOT: fadd <3 x float>
 ; At the moment we don't sink extractelements.
 ;CHECK: br
-;CHECK: extractelement
-;CHECK: extractelement
-;CHECK: extractelement
+;CHECK-NOT: extractelement
+;CHECK-NOT: extractelement
+;CHECK-NOT: extractelement
 ;CHECK: ret
 
 define float @foo(float* nocapture readonly %A) {
diff --git a/test/Transforms/SLPVectorizer/X86/simplebb.ll b/test/Transforms/SLPVectorizer/X86/simplebb.ll
index cd0b99e..7d682e5 100644
--- a/test/Transforms/SLPVectorizer/X86/simplebb.ll
+++ b/test/Transforms/SLPVectorizer/X86/simplebb.ll
@@ -23,3 +23,67 @@ entry:
   ret void
 }
 
+; Simple 3-pair chain with loads and stores, obfuscated with bitcasts
+; CHECK: test2
+; CHECK: store <2 x double>
+; CHECK: ret
+define void @test2(double* %a, double* %b, i8* %e) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %c = bitcast i8* %e to double*
+  store double %mul, double* %c, align 8
+  %carrayidx5 = getelementptr inbounds i8* %e, i64 8
+  %arrayidx5 = bitcast i8* %carrayidx5 to double*
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
+; Don't vectorize volatile loads.
+; CHECK: test_volatile_load
+; CHECK-NOT: load <2 x double>
+; CHECK: store <2 x double>
+; CHECK: ret
+define void @test_volatile_load(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load volatile double* %a, align 8
+  %i1 = load volatile double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
+; Don't vectorize volatile stores.
+; CHECK: test_volatile_store
+; CHECK-NOT: store <2 x double>
+; CHECK: ret
+define void @test_volatile_store(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store volatile double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store volatile double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
+
diff --git a/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
new file mode 100644
index 0000000..2747a1f
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -0,0 +1,140 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+
+; CHECK: tiny_tree_fully_vectorizable
+; CHECK: load <2 x double>
+; CHECK: store <2 x double>
+; CHECK: ret 
+
+define void @tiny_tree_fully_vectorizable(double* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %count) #0 {
+entry:
+  %cmp12 = icmp eq i64 %count, 0
+  br i1 %cmp12, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.015 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.014 = phi double* [ %add.ptr4, %for.body ], [ %dst, %entry ]
+  %src.addr.013 = phi double* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load double* %src.addr.013, align 8
+  store double %0, double* %dst.addr.014, align 8
+  %arrayidx2 = getelementptr inbounds double* %src.addr.013, i64 1
+  %1 = load double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double* %dst.addr.014, i64 1
+  store double %1, double* %arrayidx3, align 8
+  %add.ptr = getelementptr inbounds double* %src.addr.013, i64 %i.015
+  %add.ptr4 = getelementptr inbounds double* %dst.addr.014, i64 %i.015
+  %inc = add i64 %i.015, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; CHECK: tiny_tree_fully_vectorizable2
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+; CHECK: ret
+
+define void @tiny_tree_fully_vectorizable2(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %count) #0 {
+entry:
+  %cmp20 = icmp eq i64 %count, 0
+  br i1 %cmp20, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.023 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.022 = phi float* [ %add.ptr8, %for.body ], [ %dst, %entry ]
+  %src.addr.021 = phi float* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load float* %src.addr.021, align 4
+  store float %0, float* %dst.addr.022, align 4
+  %arrayidx2 = getelementptr inbounds float* %src.addr.021, i64 1
+  %1 = load float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float* %dst.addr.022, i64 1
+  store float %1, float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float* %src.addr.021, i64 2
+  %2 = load float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds float* %dst.addr.022, i64 2
+  store float %2, float* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds float* %src.addr.021, i64 3
+  %3 = load float* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds float* %dst.addr.022, i64 3
+  store float %3, float* %arrayidx7, align 4
+  %add.ptr = getelementptr inbounds float* %src.addr.021, i64 %i.023
+  %add.ptr8 = getelementptr inbounds float* %dst.addr.022, i64 %i.023
+  %inc = add i64 %i.023, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; We do not vectorize the tiny tree which is not fully vectorizable. 
+; CHECK: tiny_tree_not_fully_vectorizable
+; CHECK-NOT: <2 x double>
+; CHECK: ret 
+
+define void @tiny_tree_not_fully_vectorizable(double* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %count) #0 {
+entry:
+  %cmp12 = icmp eq i64 %count, 0
+  br i1 %cmp12, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.015 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.014 = phi double* [ %add.ptr4, %for.body ], [ %dst, %entry ]
+  %src.addr.013 = phi double* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load double* %src.addr.013, align 8
+  store double %0, double* %dst.addr.014, align 8
+  %arrayidx2 = getelementptr inbounds double* %src.addr.013, i64 2
+  %1 = load double* %arrayidx2, align 8
+  %arrayidx3 = getelementptr inbounds double* %dst.addr.014, i64 1 
+  store double %1, double* %arrayidx3, align 8
+  %add.ptr = getelementptr inbounds double* %src.addr.013, i64 %i.015
+  %add.ptr4 = getelementptr inbounds double* %dst.addr.014, i64 %i.015
+  %inc = add i64 %i.015, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+
+; CHECK: tiny_tree_not_fully_vectorizable2
+; CHECK-NOT: <2 x double>
+; CHECK: ret
+
+define void @tiny_tree_not_fully_vectorizable2(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %count) #0 {
+entry:
+  %cmp20 = icmp eq i64 %count, 0
+  br i1 %cmp20, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.023 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %dst.addr.022 = phi float* [ %add.ptr8, %for.body ], [ %dst, %entry ]
+  %src.addr.021 = phi float* [ %add.ptr, %for.body ], [ %src, %entry ]
+  %0 = load float* %src.addr.021, align 4
+  store float %0, float* %dst.addr.022, align 4
+  %arrayidx2 = getelementptr inbounds float* %src.addr.021, i64 4 
+  %1 = load float* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds float* %dst.addr.022, i64 1
+  store float %1, float* %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds float* %src.addr.021, i64 2
+  %2 = load float* %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds float* %dst.addr.022, i64 2
+  store float %2, float* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds float* %src.addr.021, i64 3
+  %3 = load float* %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds float* %dst.addr.022, i64 3
+  store float %3, float* %arrayidx7, align 4
+  %add.ptr = getelementptr inbounds float* %src.addr.021, i64 %i.023
+  %add.ptr8 = getelementptr inbounds float* %dst.addr.022, i64 %i.023
+  %inc = add i64 %i.023, 1
+  %exitcond = icmp eq i64 %inc, %count
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/XCore/lit.local.cfg b/test/Transforms/SLPVectorizer/XCore/lit.local.cfg
new file mode 100644
index 0000000..4d17d46
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/XCore/lit.local.cfg
@@ -0,0 +1,3 @@
+targets = set(config.root.targets_to_build.split())
+if not 'XCore' in targets:
+    config.unsupported = True
diff --git a/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll b/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll
new file mode 100644
index 0000000..66392e7
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=xcore  | FileCheck %s
+
+target datalayout = "e-p:32:32:32-a0:0:32-n32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f16:16:32-f32:32:32-f64:32:32"
+target triple = "xcore"
+
+; Simple 3-pair chain with loads and stores
+; CHECK: test1
+; CHECK-NOT: <2 x double>
+define void @test1(double* %a, double* %b, double* %c) {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+}
+
diff --git a/test/Transforms/SLPVectorizer/lit.local.cfg b/test/Transforms/SLPVectorizer/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/SLPVectorizer/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
index 458b0df..5d3e4b5 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -1181,7 +1181,6 @@ entry:
   store i1 %x, i1* %b.i1, align 8
   %b.i8 = bitcast <{ i1 }>* %b to i8*
   %foo = load i8* %b.i8, align 1
-; CHECK-NEXT: {{.*}} = zext i1 %x to i8
 ; CHECK-NEXT: %[[ext:.*]] = zext i1 %x to i8
 ; CHECK-NEXT: store i8 %[[ext]], i8* %[[a]], align 8
 ; CHECK-NEXT: {{.*}} = load i8* %[[a]], align 8
diff --git a/test/Transforms/SROA/lit.local.cfg b/test/Transforms/SROA/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Transforms/SROA/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Transforms/SROA/vector-conversion.ll b/test/Transforms/SROA/vector-conversion.ll
new file mode 100644
index 0000000..08d7960
--- /dev/null
+++ b/test/Transforms/SROA/vector-conversion.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define <4 x i64> @vector_ptrtoint({<2 x i32*>, <2 x i32*>} %x) {
+; CHECK-LABEL: @vector_ptrtoint
+  %a = alloca {<2 x i32*>, <2 x i32*>}
+; CHECK-NOT: alloca
+
+  store {<2 x i32*>, <2 x i32*>} %x, {<2 x i32*>, <2 x i32*>}* %a
+; CHECK-NOT: store
+
+  %cast = bitcast {<2 x i32*>, <2 x i32*>}* %a to <4 x i64>*
+  %vec = load <4 x i64>* %cast
+; CHECK-NOT: load
+; CHECK: ptrtoint
+
+  ret <4 x i64> %vec
+}
+
+define <4 x i32*> @vector_inttoptr({<2 x i64>, <2 x i64>} %x) {
+; CHECK-LABEL: @vector_inttoptr
+  %a = alloca {<2 x i64>, <2 x i64>}
+; CHECK-NOT: alloca
+
+  store {<2 x i64>, <2 x i64>} %x, {<2 x i64>, <2 x i64>}* %a
+; CHECK-NOT: store
+
+  %cast = bitcast {<2 x i64>, <2 x i64>}* %a to <4 x i32*>*
+  %vec = load <4 x i32*>* %cast
+; CHECK-NOT: load
+; CHECK: inttoptr
+
+  ret <4 x i32*> %vec
+}
+
+define <2 x i64> @vector_ptrtointbitcast({<1 x i32*>, <1 x i32*>} %x) {
+; CHECK-LABEL: @vector_ptrtointbitcast
+  %a = alloca {<1 x i32*>, <1 x i32*>}
+; CHECK-NOT: alloca
+
+  store {<1 x i32*>, <1 x i32*>} %x, {<1 x i32*>, <1 x i32*>}* %a
+; CHECK-NOT: store
+
+  %cast = bitcast {<1 x i32*>, <1 x i32*>}* %a to <2 x i64>*
+  %vec = load <2 x i64>* %cast
+; CHECK-NOT: load
+; CHECK: ptrtoint
+; CHECK: bitcast
+; CHECK: ptrtoint
+; CHECK: bitcast
+
+  ret <2 x i64> %vec
+}
diff --git a/test/Transforms/SampleProfile/Inputs/branch.prof b/test/Transforms/SampleProfile/Inputs/branch.prof
new file mode 100644
index 0000000..d19894d
--- /dev/null
+++ b/test/Transforms/SampleProfile/Inputs/branch.prof
@@ -0,0 +1,11 @@
+symbol table
+1
+main
+main:15680:0:7
+0: 0
+4: 0
+7: 0
+9: 10226
+10: 2243
+16: 0
+18: 0
diff --git a/test/Transforms/SampleProfile/branch.ll b/test/Transforms/SampleProfile/branch.ll
new file mode 100644
index 0000000..5167627
--- /dev/null
+++ b/test/Transforms/SampleProfile/branch.ll
@@ -0,0 +1,143 @@
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/branch.prof | opt -analyze -branch-prob | FileCheck %s
+
+; Original C++ code for this test case:
+;
+; #include <stdio.h>
+; #include <stdlib.h>
+;
+; int main(int argc, char *argv[]) {
+;   if (argc < 2)
+;     return 1;
+;   double result;
+;   int limit = atoi(argv[1]);
+;   if (limit > 100) {
+;     double s = 23.041968;
+;     for (int u = 0; u < limit; u++) {
+;       double x = s;
+;       s = x + 3.049 + (double)u;
+;       s -= s + 3.94 / x * 0.32;
+;     }
+;     result = s;
+;   } else {
+;     result = 0;
+;   }
+;   printf("result is %lf\n", result);
+;   return 0;
+; }
+
+@.str = private unnamed_addr constant [15 x i8] c"result is %lf\0A\00", align 1
+
+; Function Attrs: nounwind uwtable
+define i32 @main(i32 %argc, i8** nocapture readonly %argv) #0 {
+; CHECK: Printing analysis 'Branch Probability Analysis' for function 'main':
+
+entry:
+  tail call void @llvm.dbg.value(metadata !{i32 %argc}, i64 0, metadata !13), !dbg !27
+  tail call void @llvm.dbg.value(metadata !{i8** %argv}, i64 0, metadata !14), !dbg !27
+  %cmp = icmp slt i32 %argc, 2, !dbg !28
+  br i1 %cmp, label %return, label %if.end, !dbg !28
+; CHECK: edge entry -> return probability is 1 / 2 = 50%
+; CHECK: edge entry -> if.end probability is 1 / 2 = 50%
+
+if.end:                                           ; preds = %entry
+  %arrayidx = getelementptr inbounds i8** %argv, i64 1, !dbg !30
+  %0 = load i8** %arrayidx, align 8, !dbg !30, !tbaa !31
+  %call = tail call i32 @atoi(i8* %0) #4, !dbg !30
+  tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !17), !dbg !30
+  %cmp1 = icmp sgt i32 %call, 100, !dbg !35
+  br i1 %cmp1, label %for.body, label %if.end6, !dbg !35
+; CHECK: edge if.end -> for.body probability is 2243 / 2244 = 99.9554% [HOT edge]
+; CHECK: edge if.end -> if.end6 probability is 1 / 2244 = 0.0445633%
+
+for.body:                                         ; preds = %if.end, %for.body
+  %u.016 = phi i32 [ %inc, %for.body ], [ 0, %if.end ]
+  %s.015 = phi double [ %sub, %for.body ], [ 0x40370ABE6A337A81, %if.end ]
+  %add = fadd double %s.015, 3.049000e+00, !dbg !36
+  %conv = sitofp i32 %u.016 to double, !dbg !36
+  %add4 = fadd double %add, %conv, !dbg !36
+  tail call void @llvm.dbg.value(metadata !{double %add4}, i64 0, metadata !18), !dbg !36
+  %div = fdiv double 3.940000e+00, %s.015, !dbg !37
+  %mul = fmul double %div, 3.200000e-01, !dbg !37
+  %add5 = fadd double %add4, %mul, !dbg !37
+  %sub = fsub double %add4, %add5, !dbg !37
+  tail call void @llvm.dbg.value(metadata !{double %sub}, i64 0, metadata !18), !dbg !37
+  %inc = add nsw i32 %u.016, 1, !dbg !38
+  tail call void @llvm.dbg.value(metadata !{i32 %inc}, i64 0, metadata !21), !dbg !38
+  %exitcond = icmp eq i32 %inc, %call, !dbg !38
+  br i1 %exitcond, label %if.end6, label %for.body, !dbg !38
+; CHECK: edge for.body -> if.end6 probability is 1 / 2244 = 0.0445633%
+; CHECK: edge for.body -> for.body probability is 2243 / 2244 = 99.9554% [HOT edge]
+
+if.end6:                                          ; preds = %for.body, %if.end
+  %result.0 = phi double [ 0.000000e+00, %if.end ], [ %sub, %for.body ]
+  %call7 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([15 x i8]* @.str, i64 0, i64 0), double %result.0), !dbg !39
+  br label %return, !dbg !40
+; CHECK: edge if.end6 -> return probability is 16 / 16 = 100% [HOT edge]
+
+return:                                           ; preds = %entry, %if.end6
+  %retval.0 = phi i32 [ 0, %if.end6 ], [ 1, %entry ]
+  ret i32 %retval.0, !dbg !41
+}
+
+; Function Attrs: nounwind readonly
+declare i32 @atoi(i8* nocapture) #1
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture readonly, ...) #2
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #3
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind readnone }
+attributes #4 = { nounwind readonly }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!25, !42}
+!llvm.ident = !{!26}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 (trunk 192896) (llvm/trunk 192895)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [./branch.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"branch.cc", metadata !"."}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 4, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !12, i32 4} ; [ DW_TAG_subprogram ] [line 4] [def] [main]
+!5 = metadata !{i32 786473, metadata !1}          ; [ DW_TAG_file_type ] [./branch.cc]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !8, metadata !9}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !17, metadata !18, metadata !21, metadata !23}
+!13 = metadata !{i32 786689, metadata !4, metadata !"argc", metadata !5, i32 16777220, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 4]
+!14 = metadata !{i32 786689, metadata !4, metadata !"argv", metadata !5, i32 33554436, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 4]
+!15 = metadata !{i32 786688, metadata !4, metadata !"result", metadata !5, i32 7, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [result] [line 7]
+!16 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!17 = metadata !{i32 786688, metadata !4, metadata !"limit", metadata !5, i32 8, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [limit] [line 8]
+!18 = metadata !{i32 786688, metadata !19, metadata !"s", metadata !5, i32 10, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [s] [line 10]
+!19 = metadata !{i32 786443, metadata !1, metadata !20, i32 9, i32 0, i32 2} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!20 = metadata !{i32 786443, metadata !1, metadata !4, i32 9, i32 0, i32 1} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!21 = metadata !{i32 786688, metadata !22, metadata !"u", metadata !5, i32 11, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [u] [line 11]
+!22 = metadata !{i32 786443, metadata !1, metadata !19, i32 11, i32 0, i32 3} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!23 = metadata !{i32 786688, metadata !24, metadata !"x", metadata !5, i32 12, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [x] [line 12]
+!24 = metadata !{i32 786443, metadata !1, metadata !22, i32 11, i32 0, i32 4} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!25 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!26 = metadata !{metadata !"clang version 3.4 (trunk 192896) (llvm/trunk 192895)"}
+!27 = metadata !{i32 4, i32 0, metadata !4, null}
+!28 = metadata !{i32 5, i32 0, metadata !29, null}
+!29 = metadata !{i32 786443, metadata !1, metadata !4, i32 5, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [./branch.cc]
+!30 = metadata !{i32 8, i32 0, metadata !4, null} ; [ DW_TAG_imported_declaration ]
+!31 = metadata !{metadata !32, metadata !32, i64 0}
+!32 = metadata !{metadata !"any pointer", metadata !33, i64 0}
+!33 = metadata !{metadata !"omnipotent char", metadata !34, i64 0}
+!34 = metadata !{metadata !"Simple C/C++ TBAA"}
+!35 = metadata !{i32 9, i32 0, metadata !20, null}
+!36 = metadata !{i32 13, i32 0, metadata !24, null}
+!37 = metadata !{i32 14, i32 0, metadata !24, null}
+!38 = metadata !{i32 11, i32 0, metadata !22, null}
+!39 = metadata !{i32 20, i32 0, metadata !4, null}
+!40 = metadata !{i32 21, i32 0, metadata !4, null}
+!41 = metadata !{i32 22, i32 0, metadata !4, null}
+!42 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/ScalarRepl/debuginfo-preserved.ll b/test/Transforms/ScalarRepl/debuginfo-preserved.ll
index 27e6670..71bf22a 100644
--- a/test/Transforms/ScalarRepl/debuginfo-preserved.ll
+++ b/test/Transforms/ScalarRepl/debuginfo-preserved.ll
@@ -40,11 +40,12 @@ entry:
 declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!20}
 
 !0 = metadata !{i32 786449, metadata !18, i32 12, metadata !"clang version 3.0 (trunk 131941)", i1 false, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32, i32)* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"f", metadata !"f", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32, i32)* @f, null, null, null, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [f]
 !2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 16777217, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -61,3 +62,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
 !17 = metadata !{metadata !1}
 !18 = metadata !{metadata !"/d/j/debug-test.c", metadata !"/Volumes/Data/b"}
 !19 = metadata !{i32 0}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/ScalarRepl/lit.local.cfg b/test/Transforms/ScalarRepl/lit.local.cfg
deleted file mode 100644
index c6106e4..0000000
--- a/test/Transforms/ScalarRepl/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll']
diff --git a/test/Transforms/ScalarRepl/union-pointer.ll b/test/Transforms/ScalarRepl/union-pointer.ll
index 03d25ac..f0dc141 100644
--- a/test/Transforms/ScalarRepl/union-pointer.ll
+++ b/test/Transforms/ScalarRepl/union-pointer.ll
@@ -1,13 +1,16 @@
 ; PR892
-; RUN: opt < %s -scalarrepl -S | \
-; RUN:   not grep alloca
-; RUN: opt < %s -scalarrepl -S | grep "ret i8"
+; RUN: opt < %s -scalarrepl -S | FileCheck %s
 
-target datalayout = "e-p:32:32-n8:16:32"
+
+target datalayout = "e-p:32:32-p1:16:16-n8:16:32"
 target triple = "i686-apple-darwin8.7.2"
-	%struct.Val = type { i32*, i32 }
+
+%struct.Val = type { i32*, i32 }
 
 define i8* @test(i16* %X) {
+; CHECK-LABEL: @test(
+; CHECK-NOT: alloca
+; CHECK: ret i8*
 	%X_addr = alloca i16*		; <i16**> [#uses=2]
 	store i16* %X, i16** %X_addr
 	%X_addr.upgrd.1 = bitcast i16** %X_addr to i8**		; <i8**> [#uses=1]
@@ -15,7 +18,37 @@ define i8* @test(i16* %X) {
 	ret i8* %tmp
 }
 
+define i8 addrspace(1)* @test_as1(i16 addrspace(1)* %x) {
+; CHECK-LABEL: @test_as1(
+; CHECK-NEXT: %1 = ptrtoint i16 addrspace(1)* %x to i16
+; CHECK-NEXT: %2 = inttoptr i16 %1 to i8 addrspace(1)*
+; CHECK-NEXT: ret i8 addrspace(1)* %2
+    %x_addr = alloca i16 addrspace(1)*
+	store i16 addrspace(1)* %x, i16 addrspace(1)** %x_addr
+	%x_addr.upgrd.1 = bitcast i16 addrspace(1)** %x_addr to i8 addrspace(1)**
+	%tmp = load i8 addrspace(1)** %x_addr.upgrd.1
+	ret i8 addrspace(1)* %tmp
+}
+
+define i8 addrspace(1)* @test_as1_array(i16 addrspace(1)* %x) {
+; CHECK-LABEL: @test_as1_array(
+; CHECK-NEXT: %1 = ptrtoint i16 addrspace(1)* %x to i16
+; CHECK-NEXT: %2 = inttoptr i16 %1 to i8 addrspace(1)*
+; CHECK-NEXT: ret i8 addrspace(1)* %2
+  %as_ptr_array = alloca [4 x i16 addrspace(1)*]
+  %elem1 = getelementptr [4 x i16 addrspace(1)*]* %as_ptr_array, i32 0, i32 1
+  store i16 addrspace(1)* %x, i16 addrspace(1)** %elem1
+  %elem1.cast = bitcast i16 addrspace(1)** %elem1 to i8 addrspace(1)**
+  %tmp = load i8 addrspace(1)** %elem1.cast
+  ret i8 addrspace(1)* %tmp
+}
+
+
 define void @test2(i64 %Op.0) {
+; CHECK-LABEL: @test2(
+; CHECK-NOT: alloca
+; CHECK: ret void
+
 	%tmp = alloca %struct.Val, align 8		; <%struct.Val*> [#uses=3]
 	%tmp1 = alloca %struct.Val, align 8		; <%struct.Val*> [#uses=3]
 	%tmp.upgrd.2 = call i64 @_Z3foov( )		; <i64> [#uses=1]
diff --git a/test/Transforms/SimplifyCFG/CoveredLookupTable.ll b/test/Transforms/SimplifyCFG/CoveredLookupTable.ll
new file mode 100644
index 0000000..8b45a59
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/CoveredLookupTable.ll
@@ -0,0 +1,48 @@
+; RUN: opt -simplifycfg -S %s | FileCheck %s
+; rdar://15268442
+
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin12.0.0"
+
+; CHECK-LABEL: define i3 @coveredswitch_test(
+; CHECK: entry:
+; CHECK-NEXT: sub i3 %input, -4
+; CHECK-NEXT: zext i3 %switch.tableidx to i24
+; CHECK-NEXT: mul i24 %switch.cast, 3
+; CHECK-NEXT: lshr i24 7507338, %switch.shiftamt
+; CHECK-NEXT: trunc i24 %switch.downshift to i3
+; CHECK-NEXT: ret i3 %switch.masked
+
+define i3 @coveredswitch_test(i3 %input) {
+entry:
+  switch i3 %input, label %bb8 [
+    i3 0, label %bb7
+    i3 1, label %bb
+    i3 2, label %bb3
+    i3 3, label %bb4
+    i3 4, label %bb5
+    i3 5, label %bb6
+  ]
+
+bb:                                               ; preds = %entry
+  br label %bb8
+
+bb3:                                              ; preds = %entry
+  br label %bb8
+
+bb4:                                              ; preds = %entry
+  br label %bb8
+
+bb5:                                              ; preds = %entry
+  br label %bb8
+
+bb6:                                              ; preds = %entry
+  br label %bb8
+
+bb7:                                              ; preds = %entry
+  br label %bb8
+
+bb8:                                              ; preds = %bb7, %bb6, %bb5, %bb4, %bb3, %bb, %entry
+  %result = phi i3 [ 0, %bb7 ], [ 1, %bb6 ], [ 2, %bb5 ], [ 3, %bb4 ], [ 4, %bb3 ], [ 5, %bb ], [ 6, %entry ]
+  ret i3 %result
+}
diff --git a/test/Transforms/SimplifyCFG/MagicPointer.ll b/test/Transforms/SimplifyCFG/MagicPointer.ll
index 93b9a27..b8b8cbd 100644
--- a/test/Transforms/SimplifyCFG/MagicPointer.ll
+++ b/test/Transforms/SimplifyCFG/MagicPointer.ll
@@ -2,15 +2,7 @@
 ;
 ; RUN: opt < %s -simplifycfg -S | FileCheck %s
 
-; CHECK: switch i64 %magicptr
-; CHECK: i64 0, label
-; CHECK: i64 1, label
-; CHECK: i64 2, label
-; CHECK: i64 3, label
-; CHECK: i64 4, label
-; CHECK: }
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
 
 @.str = private constant [5 x i8] c"null\00"      ; <[5 x i8]*> [#uses=2]
@@ -18,7 +10,24 @@ target triple = "x86_64-apple-darwin10.0.0"
 @.str2 = private constant [4 x i8] c"two\00"      ; <[4 x i8]*> [#uses=2]
 @.str3 = private constant [5 x i8] c"four\00"     ; <[5 x i8]*> [#uses=2]
 
+@.str_as1 = private addrspace(1) constant [5 x i8] c"null\00"      ; <[5 x i8]*> [#uses=2]
+@.str1_as1 = private addrspace(1) constant [4 x i8] c"one\00"      ; <[4 x i8]*> [#uses=2]
+@.str2_as1 = private addrspace(1) constant [4 x i8] c"two\00"      ; <[4 x i8]*> [#uses=2]
+@.str3_as1 = private addrspace(1) constant [5 x i8] c"four\00"     ; <[5 x i8]*> [#uses=2]
+
+declare i32 @puts(i8*)
+declare i32 @puts_as1(i8 addrspace(1)*)
+
 define void @f(i8* %x) nounwind ssp {
+; CHECK-LABEL: @f(
+; CHECK: switch i64 %magicptr
+; CHECK: i64 0, label
+; CHECK: i64 1, label
+; CHECK: i64 2, label
+; CHECK: i64 3, label
+; CHECK: i64 4, label
+; CHECK: }
+
 entry:
   %tobool = icmp eq i8* %x, null                  ; <i1> [#uses=1]
   br i1 %tobool, label %if.then, label %if.else
@@ -72,4 +81,69 @@ if.end21:                                         ; preds = %if.end20, %if.then
   ret void
 }
 
-declare i32 @puts(i8*)
+; Is it useful to test a version where the ptrtoints are to the same
+; size?
+define void @f_as1(i8 addrspace(1)* %x) nounwind ssp {
+; CHECK-LABEL: @f_as1(
+; CHECK: ptrtoint i8 addrspace(1)* %x to i16
+; CHECK: switch i16 %magicptr
+; CHECK: i16 0, label
+; CHECK: i16 1, label
+; CHECK: i16 2, label
+; CHECK: i16 3, label
+; CHECK: i16 4, label
+; CHECK: }
+
+entry:
+  %tobool = icmp eq i8 addrspace(1)* %x, null                  ; <i1> [#uses=1]
+  br i1 %tobool, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %call = call i32 @puts_as1(i8 addrspace(1)* getelementptr inbounds ([5 x i8] addrspace(1)* @.str_as1, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
+  br label %if.end21
+
+if.else:                                          ; preds = %entry
+  %cmp = icmp eq i8 addrspace(1)* %x, inttoptr (i64 1 to i8 addrspace(1)*)  ; <i1> [#uses=1]
+  br i1 %cmp, label %if.then2, label %if.else4
+
+if.then2:                                         ; preds = %if.else
+  %call3 = call i32 @puts_as1(i8 addrspace(1)* getelementptr inbounds ([4 x i8] addrspace(1)* @.str1_as1, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
+  br label %if.end20
+
+if.else4:                                         ; preds = %if.else
+  %cmp6 = icmp eq i8 addrspace(1)* %x, inttoptr (i64 2 to i8 addrspace(1)*) ; <i1> [#uses=1]
+  br i1 %cmp6, label %if.then9, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %if.else4
+  %cmp8 = icmp eq i8 addrspace(1)* %x, inttoptr (i64 3 to i8 addrspace(1)*) ; <i1> [#uses=1]
+  br i1 %cmp8, label %if.then9, label %if.else11
+
+if.then9:                                         ; preds = %lor.lhs.false, %if.else4
+  %call10 = call i32 @puts_as1(i8 addrspace(1)* getelementptr inbounds ([4 x i8] addrspace(1)* @.str2_as1, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
+  br label %if.end19
+
+if.else11:                                        ; preds = %lor.lhs.false
+  %cmp13 = icmp eq i8 addrspace(1)* %x, inttoptr (i64 4 to i8 addrspace(1)*) ; <i1> [#uses=1]
+  br i1 %cmp13, label %if.then14, label %if.else16
+
+if.then14:                                        ; preds = %if.else11
+  %call15 = call i32 @puts_as1(i8 addrspace(1)* getelementptr inbounds ([5 x i8] addrspace(1)* @.str3_as1, i64 0, i64 0)) nounwind ; <i32> [#uses=0]
+  br label %if.end
+
+if.else16:                                        ; preds = %if.else11
+  %call18 = call i32 @puts_as1(i8 addrspace(1)* %x) nounwind       ; <i32> [#uses=0]
+  br label %if.end
+
+if.end:                                           ; preds = %if.else16, %if.then14
+  br label %if.end19
+
+if.end19:                                         ; preds = %if.end, %if.then9
+  br label %if.end20
+
+if.end20:                                         ; preds = %if.end19, %if.then2
+  br label %if.end21
+
+if.end21:                                         ; preds = %if.end20, %if.then
+  ret void
+}
+
diff --git a/test/Transforms/SimplifyCFG/R600/lit.local.cfg b/test/Transforms/SimplifyCFG/R600/lit.local.cfg
deleted file mode 100644
index e69de29..0000000
--- a/test/Transforms/SimplifyCFG/R600/lit.local.cfg
+++ /dev/null
diff --git a/test/Transforms/SimplifyCFG/R600/parallelandifcollapse.ll b/test/Transforms/SimplifyCFG/R600/parallelandifcollapse.ll
deleted file mode 100644
index e69de29..0000000
--- a/test/Transforms/SimplifyCFG/R600/parallelandifcollapse.ll
+++ /dev/null
diff --git a/test/Transforms/SimplifyCFG/R600/parallelorifcollapse.ll b/test/Transforms/SimplifyCFG/R600/parallelorifcollapse.ll
deleted file mode 100644
index e69de29..0000000
--- a/test/Transforms/SimplifyCFG/R600/parallelorifcollapse.ll
+++ /dev/null
diff --git a/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg b/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg
index 786fee9..4d344fa 100644
--- a/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg
+++ b/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'Sparc' in targets:
     config.unsupported = True
diff --git a/test/Transforms/SimplifyCFG/X86/lit.local.cfg b/test/Transforms/SimplifyCFG/X86/lit.local.cfg
index a8ad0f1..ba763cf 100644
--- a/test/Transforms/SimplifyCFG/X86/lit.local.cfg
+++ b/test/Transforms/SimplifyCFG/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index 71259c9..3687327 100644
--- a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
+++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -711,7 +711,7 @@ return:
   ret i32 %retval.0
 }
 
-define i32 @cprop(i32 %x) {
+define i32 @cprop(i32 %x, i32 %y) {
 entry:
   switch i32 %x, label %sw.default [
     i32 1, label %return
@@ -727,7 +727,8 @@ sw.bb1: br label %return
 
 sw.bb2:
   %and = and i32 %x, 1
-  %tobool = icmp ne i32 %and, 0
+  %and.ptr = inttoptr i32 %and to i8*
+  %tobool = icmp ne i8* %and.ptr, null
   %cond = select i1 %tobool, i32 -123, i32 456
   %sub = sub nsw i32 %x, %cond
   br label %return
@@ -735,13 +736,15 @@ sw.bb2:
 sw.bb3:
   %trunc = trunc i32 %x to i8
   %sext = sext i8 %trunc to i32
+  %select.i = icmp sgt i32 %sext, 0
+  %select = select i1 %select.i, i32 %sext, i32 %y
   br label %return
 
 sw.default:
   br label %return
 
 return:
-  %retval.0 = phi i32 [ 123, %sw.default ], [ %sext, %sw.bb3 ], [ %sub, %sw.bb2 ], [ 42, %sw.bb1 ], [ 5, %entry ]
+  %retval.0 = phi i32 [ 123, %sw.default ], [ %select, %sw.bb3 ], [ %sub, %sw.bb2 ], [ 42, %sw.bb1 ], [ 5, %entry ]
   ret i32 %retval.0
 
 ; CHECK-LABEL: @cprop(
diff --git a/test/Transforms/SimplifyCFG/attr-noduplicate.ll b/test/Transforms/SimplifyCFG/attr-noduplicate.ll
new file mode 100644
index 0000000..523aa51
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/attr-noduplicate.ll
@@ -0,0 +1,37 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+; This test checks that the SimplifyCFG pass won't duplicate a call to a
+; function marked noduplicate.
+;
+; CHECK-LABEL: @noduplicate
+; CHECK: call void @barrier
+; CHECK-NOT: call void @barrier
+define void @noduplicate(i32 %cond, i32* %out) {
+entry:
+  %out1 = getelementptr i32* %out, i32 1
+  %out2 = getelementptr i32* %out, i32 2
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 5, i32* %out
+  br label %if.end
+
+if.end:
+  call void @barrier() #0
+  br i1 %cmp, label %cond.end, label %cond.false
+
+cond.false:
+  store i32 5, i32* %out1
+  br label %cond.end
+
+cond.end:
+  %value = phi i32 [ 1, %cond.false ], [ 0, %if.end ]
+  store i32 %value, i32* %out2
+  ret void
+}
+
+; Function Attrs: noduplicate nounwind
+declare void @barrier() #0
+
+attributes #0 = { noduplicate nounwind }
diff --git a/test/Transforms/SimplifyCFG/branch-fold-dbg.ll b/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
index 7fc0cbd..9d8086c 100644
--- a/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
+++ b/test/Transforms/SimplifyCFG/branch-fold-dbg.ll
@@ -41,10 +41,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !15, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 231, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (i32)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 589870, metadata !15, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 231, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i32)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 231] [def] [scope 0] [foo]
 !1 = metadata !{i32 589865, metadata !15} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 589841, metadata !15, i32 12, metadata !"clang (trunk 129006)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !15, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 589845, metadata !15, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 131, i32 2, metadata !0, null}
 !6 = metadata !{i32 134, i32 2, metadata !0, null}
diff --git a/test/Transforms/SimplifyCFG/common-dest-folding.ll b/test/Transforms/SimplifyCFG/common-dest-folding.ll
new file mode 100644
index 0000000..0aa3b2c
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/common-dest-folding.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+;CHECK: @foo
+;CHECK: and i32 %c1, %k
+;CHECK: icmp eq i32
+;CHECK: and i32 %c2, %k
+;CHECK: icmp eq i32
+;CHECK: or i1
+;CHECK: ret
+define i32 @foo(i32 %k, i32 %c1, i32 %c2) {
+  %1 = and i32 %c1, %k
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %8, label %3
+
+; <label>:3                                       ; preds = %0
+  %4 = and i32 %c2, %k
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %8, label %6
+
+; <label>:6                                       ; preds = %3
+  %7 = tail call i32 (...)* @bar() nounwind
+  br label %8
+
+; <label>:8                                       ; preds = %3, %0, %6
+  ret i32 undef
+}
+
+;CHECK: @conduse
+;CHECK: shl i32 1, %c1
+;CHECK-NEXT: shl i32 1, %c2
+;CHECK-NEXT: and i32
+;CHECK-NEXT: icmp eq i32
+;CHECK-NEXT: and i32
+;CHECK-NEXT: icmp eq i32
+;CHECK: ret
+define i32 @conduse(i32 %k, i32 %c1, i32 %c2) #0 {
+bb:
+  %tmp = shl i32 1, %c1
+  %tmp4 = shl i32 1, %c2
+  %tmp1 = and i32 %tmp, %k
+  %tmp2 = icmp eq i32 %tmp1, 0
+  br i1 %tmp2, label %bb9, label %bb3
+
+bb3:                                              ; preds = %bb
+  %tmp5 = and i32 %tmp4, %k
+  %tmp6 = icmp eq i32 %tmp5, 0
+  br i1 %tmp6, label %bb9, label %bb7
+
+bb7:                                              ; preds = %bb3
+  %tmp8 = tail call i32 (...)* @bar() #1
+  br label %bb9
+
+bb9:                                              ; preds = %bb7, %bb3, %bb
+  ret i32 undef
+}
+
+declare i32 @bar(...)
diff --git a/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll b/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
index 0e36066..0547fa9 100644
--- a/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
+++ b/test/Transforms/SimplifyCFG/hoist-dbgvalue.ll
@@ -29,12 +29,13 @@ declare i32 @bar(...)
 
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
+!llvm.module.flags = !{!21}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !20, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 589870, metadata !20, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
 !1 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !8, metadata !8, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !20, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 589845, metadata !20, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 589860, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
 !6 = metadata !{i32 590081, metadata !0, metadata !"i", metadata !1, i32 16777218, metadata !5, i32 0} ; [ DW_TAG_arg_variable ]
@@ -52,3 +53,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !18 = metadata !{i32 8, i32 3, metadata !17, null}
 !19 = metadata !{i32 9, i32 3, metadata !10, null}
 !20 = metadata !{metadata !"b.c", metadata !"/private/tmp"}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/SimplifyCFG/lit.local.cfg b/test/Transforms/SimplifyCFG/lit.local.cfg
deleted file mode 100644
index e69de29..0000000
--- a/test/Transforms/SimplifyCFG/lit.local.cfg
+++ /dev/null
diff --git a/test/Transforms/SimplifyCFG/switch_create.ll b/test/Transforms/SimplifyCFG/switch_create.ll
index 5500ba2..e1e9157 100644
--- a/test/Transforms/SimplifyCFG/switch_create.ll
+++ b/test/Transforms/SimplifyCFG/switch_create.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -simplifycfg -S | FileCheck %s
+; RUN: opt -S -simplifycfg < %s | FileCheck -check-prefix=CHECK %s
+; RUN: opt -S -default-data-layout="p:32:32-p1:16:16" -simplifycfg < %s | FileCheck -check-prefix=CHECK -check-prefix=DL %s
 
 declare void @foo1()
 
@@ -22,6 +23,44 @@ F:              ; preds = %0
 ; CHECK:  ]
 }
 
+define void @test1_ptr(i32* %V) {
+        %C1 = icmp eq i32* %V, inttoptr (i32 4 to i32*)
+        %C2 = icmp eq i32* %V, inttoptr (i32 17 to i32*)
+        %CN = or i1 %C1, %C2            ; <i1> [#uses=1]
+        br i1 %CN, label %T, label %F
+T:              ; preds = %0
+        call void @foo1( )
+        ret void
+F:              ; preds = %0
+        call void @foo2( )
+        ret void
+; CHECK-LABEL: @test1_ptr(
+; DL:  %magicptr = ptrtoint i32* %V to i32
+; DL:  switch i32 %magicptr, label %F [
+; DL:    i32 17, label %T
+; DL:    i32 4, label %T
+; DL:  ]
+}
+
+define void @test1_ptr_as1(i32 addrspace(1)* %V) {
+        %C1 = icmp eq i32 addrspace(1)* %V, inttoptr (i32 4 to i32 addrspace(1)*)
+        %C2 = icmp eq i32 addrspace(1)* %V, inttoptr (i32 17 to i32 addrspace(1)*)
+        %CN = or i1 %C1, %C2            ; <i1> [#uses=1]
+        br i1 %CN, label %T, label %F
+T:              ; preds = %0
+        call void @foo1( )
+        ret void
+F:              ; preds = %0
+        call void @foo2( )
+        ret void
+; CHECK-LABEL: @test1_ptr_as1(
+; DL:  %magicptr = ptrtoint i32 addrspace(1)* %V to i16
+; DL:  switch i16 %magicptr, label %F [
+; DL:    i16 17, label %T
+; DL:    i16 4, label %T
+; DL:  ]
+}
+
 define void @test2(i32 %V) {
         %C1 = icmp ne i32 %V, 4         ; <i1> [#uses=1]
         %C2 = icmp ne i32 %V, 17                ; <i1> [#uses=1]
@@ -79,7 +118,7 @@ lor.end:                                          ; preds = %lor.rhs, %lor.lhs.f
   %0 = phi i1 [ true, %lor.lhs.false ], [ true, %entry ], [ %cmp8, %lor.rhs ]
   %lor.ext = zext i1 %0 to i32
   ret i32 %lor.ext
-  
+
 ; CHECK-LABEL: @test4(
 ; CHECK:  switch i8 %c, label %lor.rhs [
 ; CHECK:    i8 62, label %lor.end
@@ -139,7 +178,7 @@ shortcirc_done.4:               ; preds = %shortcirc_next.3, %shortcirc_next.2,
 UnifiedReturnBlock:             ; preds = %shortcirc_done.4, %shortcirc_next.4
         %UnifiedRetVal = phi i1 [ %tmp.26, %shortcirc_next.4 ], [ true, %shortcirc_done.4 ]             ; <i1> [#uses=1]
         ret i1 %UnifiedRetVal
-        
+
 ; CHECK-LABEL: @test6(
 ; CHECK: %tmp.2.i.off = add i32 %tmp.2.i, -14
 ; CHECK: %switch = icmp ult i32 %tmp.2.i.off, 6
@@ -160,7 +199,7 @@ if.then:                                          ; preds = %entry
 
 if.end:                                           ; preds = %entry
   ret void
-  
+
 ; CHECK-LABEL: @test7(
 ; CHECK:   %cmp = icmp ult i32 %x, 32
 ; CHECK:   br i1 %cmp, label %if.then, label %switch.early.test
@@ -189,7 +228,7 @@ if.then:                                          ; preds = %entry
 
 if.end:                                           ; preds = %entry
   ret i32 0
-  
+
 ; CHECK-LABEL: @test8(
 ; CHECK: switch.early.test:
 ; CHECK:   switch i8 %c, label %if.end [
@@ -245,7 +284,7 @@ lor.end:                                          ; preds = %lor.rhs, %lor.lhs.f
   %0 = phi i1 [ true, %lor.lhs.false36 ], [ true, %lor.lhs.false31 ], [ true, %lor.lhs.false26 ], [ true, %lor.lhs.false21 ], [ true, %lor.lhs.false16 ], [ true, %lor.lhs.false11 ], [ true, %lor.lhs.false6 ], [ true, %lor.lhs.false ], [ true, %entry ], [ %cmp43, %lor.rhs ]
   %conv46 = zext i1 %0 to i32
   ret i32 %conv46
-  
+
 ; CHECK-LABEL: @test9(
 ; CHECK:   %cmp = icmp ult i8 %c, 33
 ; CHECK:   br i1 %cmp, label %lor.end, label %switch.early.test
diff --git a/test/Transforms/SimplifyCFG/trap-debugloc.ll b/test/Transforms/SimplifyCFG/trap-debugloc.ll
index 953557ff..3b449cb 100644
--- a/test/Transforms/SimplifyCFG/trap-debugloc.ll
+++ b/test/Transforms/SimplifyCFG/trap-debugloc.ll
@@ -8,15 +8,17 @@ define void @foo() nounwind ssp {
 }
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!10}
 !llvm.dbg.sp = !{!0}
 
-!0 = metadata !{i32 589870, metadata !8, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 589870, metadata !8, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 0] [foo]
 !1 = metadata !{i32 589865, metadata !8} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 589841, metadata !8, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-206.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 589845, metadata !8, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 589845, metadata !8, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 4, i32 2, metadata !6, null}
 !6 = metadata !{i32 589835, metadata !8, metadata !0, i32 3, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
 !7 = metadata !{i32 5, i32 1, metadata !6, null}
 !8 = metadata !{metadata !"foo.c", metadata !"/private/tmp"}
 !9 = metadata !{metadata !0}
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/Sink/lit.local.cfg b/test/Transforms/Sink/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/Sink/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/StripSymbols/2007-01-15-llvm.used.ll b/test/Transforms/StripSymbols/2007-01-15-llvm.used.ll
index 69febc3..438fa96 100644
--- a/test/Transforms/StripSymbols/2007-01-15-llvm.used.ll
+++ b/test/Transforms/StripSymbols/2007-01-15-llvm.used.ll
@@ -1,5 +1,10 @@
-; RUN: opt < %s -strip -S | grep foo | count 2
-; RUN: opt < %s -strip -S | grep bar | count 2
+; RUN: opt < %s -strip -S | FileCheck %s
+
+; CHECK: foo
+; CHECK: bar
+; CHECK: foo
+; CHECK: bar
+
 @llvm.used = appending global [2 x i8*] [ i8* bitcast (i32* @foo to i8*), i8* bitcast (i32 ()* @bar to i8*) ], section "llvm.metadata"		; <[2 x i8*]*> [#uses=0]
 @foo = internal constant i32 41		; <i32*> [#uses=1]
 
diff --git a/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll b/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll
index 0181c9b..5353744 100644
--- a/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll
+++ b/test/Transforms/StripSymbols/2010-06-30-StripDebug.ll
@@ -1,4 +1,6 @@
-; RUN: opt -strip-debug < %s | llvm-dis | grep -v llvm.dbg
+; RUN: opt -strip-debug < %s -S | FileCheck %s
+
+; CHECK-NOT: llvm.dbg
 
 @x = common global i32 0                          ; <i32*> [#uses=0]
 
@@ -11,6 +13,7 @@ entry:
 declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!13}
 !llvm.dbg.sp = !{!0}
 !llvm.dbg.lv.foo = !{!5}
 !llvm.dbg.gv = !{!8}
@@ -18,7 +21,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !0 = metadata !{i32 524334, metadata !12, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 524329, metadata !12} ; [ DW_TAG_file_type ]
 !2 = metadata !{i32 524305, metadata !12, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 524309, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 524309, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{null}
 !5 = metadata !{i32 524544, metadata !6, metadata !"y", metadata !1, i32 3, metadata !7} ; [ DW_TAG_auto_variable ]
 !6 = metadata !{i32 524299, metadata !12, metadata !0, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
@@ -28,3 +31,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 !10 = metadata !{i32 3, i32 0, metadata !6, null}
 !11 = metadata !{i32 4, i32 0, metadata !6, null}
 !12 = metadata !{metadata !"b.c", metadata !"/tmp"}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/StripSymbols/2010-07-01-DeadDbgInfo.ll b/test/Transforms/StripSymbols/2010-07-01-DeadDbgInfo.ll
deleted file mode 100644
index b893410..0000000
--- a/test/Transforms/StripSymbols/2010-07-01-DeadDbgInfo.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; RUN: opt -strip-dead-debug-info < %s | llvm-dis -o %t.ll
-; RUN: grep -v bar %t.ll
-; RUN: grep -v abcd %t.ll
-
-@xyz = global i32 2                               ; <i32*> [#uses=1]
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-define i32 @fn() nounwind readnone ssp {
-entry:
-  ret i32 0, !dbg !17
-}
-
-define i32 @foo(i32 %i) nounwind readonly ssp {
-entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !14), !dbg !19
-  %.0 = load i32* @xyz, align 4                   ; <i32> [#uses=1]
-  ret i32 %.0, !dbg !20
-}
-
-!llvm.dbg.cu = !{!2}
-!llvm.dbg.sp = !{!0, !5, !9}
-!llvm.dbg.lv.bar = !{!12}
-!llvm.dbg.lv.foo = !{!14}
-!llvm.dbg.gv = !{!15, !16}
-
-!0 = metadata !{i32 524334, metadata !22, null, metadata !"bar", metadata !"bar", metadata !"", i32 5, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 524329, metadata !22} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 524305, metadata !22, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 524309, metadata !22, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{null}
-!5 = metadata !{i32 524334, metadata !22, null, metadata !"fn", metadata !"fn", metadata !"fn", i32 6, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @fn, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 524309, metadata !22, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!7 = metadata !{metadata !8}
-!8 = metadata !{i32 524324, metadata !22, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 524334, metadata !22, null, metadata !"foo", metadata !"foo", metadata !"foo", i32 7, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 524309, metadata !22, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!11 = metadata !{metadata !8, metadata !8}
-!12 = metadata !{i32 524544, metadata !13, metadata !"bb", metadata !1, i32 5, metadata !8} ; [ DW_TAG_auto_variable ]
-!13 = metadata !{i32 524299, metadata !22, metadata !0, i32 5, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!14 = metadata !{i32 524545, metadata !9, metadata !"i", metadata !1, i32 7, metadata !8} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 524340, i32 0, metadata !1, metadata !"abcd", metadata !"abcd", metadata !"", metadata !1, i32 2, metadata !8, i1 true, i1 true, null} ; [ DW_TAG_variable ]
-!16 = metadata !{i32 524340, i32 0, metadata !1, metadata !"xyz", metadata !"xyz", metadata !"", metadata !1, i32 3, metadata !8, i1 false, i1 true, i32* @xyz} ; [ DW_TAG_variable ]
-!17 = metadata !{i32 6, i32 0, metadata !18, null}
-!18 = metadata !{i32 524299, metadata !22, metadata !5, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{i32 7, i32 0, metadata !9, null}
-!20 = metadata !{i32 10, i32 0, metadata !21, null}
-!21 = metadata !{i32 524299, metadata !22, metadata !9, i32 7, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!22 = metadata !{metadata !"g.c", metadata !"/tmp/"}
diff --git a/test/Transforms/StripSymbols/2010-08-25-crash.ll b/test/Transforms/StripSymbols/2010-08-25-crash.ll
index e480f43..2878468 100644
--- a/test/Transforms/StripSymbols/2010-08-25-crash.ll
+++ b/test/Transforms/StripSymbols/2010-08-25-crash.ll
@@ -5,18 +5,20 @@ entry:
 }
 
 !llvm.dbg.cu = !{!2}
-!llvm.dbg.sp = !{!0}
-!llvm.dbg.gv = !{!6}
+!llvm.module.flags = !{!14}
 
 !0 = metadata !{i32 524334, metadata !10, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
 !1 = metadata !{i32 524329, metadata !10} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 524305, metadata !10, i32 12, metadata !"clang version 2.8 (trunk 112062)", i1 true, metadata !"", i32 0, metadata !11, metadata !11, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 524309, metadata !10, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!2 = metadata !{i32 524305, metadata !10, i32 12, metadata !"clang version 2.8 (trunk 112062)", i1 true, metadata !"", i32 0, metadata !11, metadata !11, metadata !12, metadata !13, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!3 = metadata !{i32 524309, metadata !10, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
 !4 = metadata !{metadata !5}
 !5 = metadata !{i32 524324, metadata !10, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 524340, i32 0, metadata !1, metadata !"i", metadata !"i", metadata !"i", metadata !1, i32 2, metadata !7, i1 true, i1 true, i32 0} ; [ DW_TAG_variable ]
+!6 = metadata !{i32 524340, i32 0, metadata !1, metadata !"i", metadata !"i", metadata !"i", metadata !1, i32 2, metadata !7, i1 true, i1 true, i32 0, null} ; [ DW_TAG_variable ]
 !7 = metadata !{i32 524326, metadata !10, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !5} ; [ DW_TAG_const_type ]
 !8 = metadata !{i32 3, i32 13, metadata !9, null}
 !9 = metadata !{i32 524299, metadata !10, metadata !0, i32 3, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
 !10 = metadata !{metadata !"/tmp/a.c", metadata !"/Volumes/Lalgate/clean/D.CW"}
 !11 = metadata !{i32 0}
+!12 = metadata !{metadata !0}
+!13 = metadata !{metadata !6}
+!14 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/StripSymbols/lit.local.cfg b/test/Transforms/StripSymbols/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/StripSymbols/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/StripSymbols/strip-dead-debug-info.ll b/test/Transforms/StripSymbols/strip-dead-debug-info.ll
new file mode 100644
index 0000000..2d687ae
--- /dev/null
+++ b/test/Transforms/StripSymbols/strip-dead-debug-info.ll
@@ -0,0 +1,58 @@
+; RUN: opt -strip-dead-debug-info -verify %s -S | FileCheck %s
+
+; CHECK: ModuleID = '{{.*}}'
+; CHECK-NOT: bar
+; CHECK-NOT: abcd
+
+@xyz = global i32 2
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #0
+
+; Function Attrs: nounwind readnone ssp
+define i32 @fn() #1 {
+entry:
+  ret i32 0, !dbg !18
+}
+
+; Function Attrs: nounwind readonly ssp
+define i32 @foo(i32 %i) #2 {
+entry:
+  tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !15), !dbg !20
+  %.0 = load i32* @xyz, align 4
+  ret i32 %.0, !dbg !21
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind readnone ssp }
+attributes #2 = { nounwind readonly ssp }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!25}
+
+!0 = metadata !{i32 524305, metadata !1, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !23, metadata !24, null, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp//g.c] [DW_LANG_C89]
+!1 = metadata !{metadata !"g.c", metadata !"/tmp/"}
+!2 = metadata !{null}
+!3 = metadata !{i32 524334, metadata !1, null, metadata !"bar", metadata !"bar", metadata !"", i32 5, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [scope 0] [bar]
+!4 = metadata !{i32 524309, metadata !1, metadata !5, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = metadata !{i32 524329, metadata !1}          ; [ DW_TAG_file_type ] [/tmp//g.c]
+!6 = metadata !{i32 524334, metadata !1, null, metadata !"fn", metadata !"fn", metadata !"fn", i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @fn, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 6] [def] [scope 0] [fn]
+!7 = metadata !{i32 524309, metadata !1, metadata !5, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 524324, metadata !1, metadata !5, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 524334, metadata !1, null, metadata !"foo", metadata !"foo", metadata !"foo", i32 7, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 0] [foo]
+!11 = metadata !{i32 524309, metadata !1, metadata !5, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !9, metadata !9}
+!13 = metadata !{i32 524544, metadata !14, metadata !"bb", metadata !5, i32 5, metadata !9}
+!14 = metadata !{i32 524299, metadata !1, metadata !3, i32 5, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
+!15 = metadata !{i32 524545, metadata !10, metadata !"i", metadata !5, i32 7, metadata !9}
+!16 = metadata !{i32 524340, i32 0, metadata !5, metadata !"abcd", metadata !"abcd", metadata !"", metadata !5, i32 2, metadata !9, i1 true, i1 true, null, null}
+!17 = metadata !{i32 524340, i32 0, metadata !5, metadata !"xyz", metadata !"xyz", metadata !"", metadata !5, i32 3, metadata !9, i1 false, i1 true, i32* @xyz, null}
+!18 = metadata !{i32 6, i32 0, metadata !19, null}
+!19 = metadata !{i32 524299, metadata !1, metadata !6, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
+!20 = metadata !{i32 7, i32 0, metadata !10, null}
+!21 = metadata !{i32 10, i32 0, metadata !22, null}
+!22 = metadata !{i32 524299, metadata !1, metadata !10, i32 7, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/tmp//g.c]
+!23 = metadata !{metadata !3, metadata !6, metadata !10}
+!24 = metadata !{metadata !16, metadata !17}
+!25 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/Transforms/StructurizeCFG/branch-on-argument.ll b/test/Transforms/StructurizeCFG/branch-on-argument.ll
new file mode 100644
index 0000000..4eba0cd
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/branch-on-argument.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S -o - -structurizecfg < %s | FileCheck %s
+
+; CHECK-LABEL: @invert_branch_on_arg_inf_loop(
+; CHECK: entry:
+; CHECK: %arg.inv = xor i1 %arg, true
+; CHECK: phi i1 [ false, %Flow1 ], [ %arg.inv, %entry ]
+define void @invert_branch_on_arg_inf_loop(i32 addrspace(1)* %out, i1 %arg) {
+entry:
+  br i1 %arg, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  store i32 999, i32 addrspace(1)* %out, align 4
+  br label %for.body
+
+for.end:                                          ; preds = %Flow
+  ret void
+}
+
+
+; CHECK-LABEL: @invert_branch_on_arg_jump_into_loop(
+; CHECK: entry:
+; CHECK: %arg.inv = xor i1 %arg, true
+; CHECK: Flow:
+; CHECK: Flow1:
+define void @invert_branch_on_arg_jump_into_loop(i32 addrspace(1)* %out, i32 %n, i1 %arg) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i32 [0, %entry], [%i.inc, %end.loop]
+  %ptr = getelementptr i32 addrspace(1)* %out, i32 %i
+  store i32 %i, i32 addrspace(1)* %ptr, align 4
+  br i1 %arg, label %mid.loop, label %end.loop
+
+mid.loop:
+  store i32 333, i32 addrspace(1)* %out, align 4
+  br label %for.end
+
+end.loop:
+  %i.inc = add i32 %i, 1
+  %cmp = icmp ne i32 %i.inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
diff --git a/test/Transforms/StructurizeCFG/lit.local.cfg b/test/Transforms/StructurizeCFG/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/StructurizeCFG/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/StructurizeCFG/no-branch-to-entry.ll b/test/Transforms/StructurizeCFG/no-branch-to-entry.ll
new file mode 100644
index 0000000..2e22c87
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/no-branch-to-entry.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -o - -structurizecfg < %s | FileCheck %s
+
+; CHECK-LABEL: @no_branch_to_entry_undef(
+; CHECK: entry:
+; CHECK-NEXT: br label %entry.orig
+define void @no_branch_to_entry_undef(i32 addrspace(1)* %out) {
+entry:
+  br i1 undef, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  store i32 999, i32 addrspace(1)* %out, align 4
+  br label %for.body
+
+for.end:                                          ; preds = %Flow
+  ret void
+}
+
+; CHECK-LABEL: @no_branch_to_entry_true(
+; CHECK: entry:
+; CHECK-NEXT: br label %entry.orig
+define void @no_branch_to_entry_true(i32 addrspace(1)* %out) {
+entry:
+  br i1 true, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  store i32 999, i32 addrspace(1)* %out, align 4
+  br label %for.body
+
+for.end:                                          ; preds = %Flow
+  ret void
+}
diff --git a/test/Transforms/StructurizeCFG/switch.ll b/test/Transforms/StructurizeCFG/switch.ll
new file mode 100644
index 0000000..316df57
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/switch.ll
@@ -0,0 +1,23 @@
+; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
+
+; The structurizecfg pass cannot handle switch instructions, so we need to
+; make sure the lower switch pass is always run before structurizecfg.
+
+; CHECK-LABEL: @switch
+define void @switch(i32 addrspace(1)* %out, i32 %cond) nounwind {
+entry:
+; CHECK: icmp
+  switch i32 %cond, label %done [ i32 0, label %zero]
+
+; CHECK: zero:
+zero:
+; CHECK: store i32 7, i32 addrspace(1)* %out
+  store i32 7, i32 addrspace(1)* %out
+; CHECK: br label %done
+  br label %done
+
+; CHECK: done:
+done:
+; CHECK: ret void
+  ret void
+}
diff --git a/test/Transforms/TailCallElim/lit.local.cfg b/test/Transforms/TailCallElim/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Transforms/TailCallElim/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/TailDup/X86/lit.local.cfg b/test/Transforms/TailDup/X86/lit.local.cfg
index da2db5a..ba763cf 100644
--- a/test/Transforms/TailDup/X86/lit.local.cfg
+++ b/test/Transforms/TailDup/X86/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Transforms/TailDup/lit.local.cfg b/test/Transforms/TailDup/lit.local.cfg
index 18c604a..19840aa 100644
--- a/test/Transforms/TailDup/lit.local.cfg
+++ b/test/Transforms/TailDup/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
diff --git a/test/Unit/lit.cfg b/test/Unit/lit.cfg
index 15cf626..68ba0b3 100644
--- a/test/Unit/lit.cfg
+++ b/test/Unit/lit.cfg
@@ -4,6 +4,8 @@
 
 import os
 
+import lit.formats
+
 # name: The name of this test suite.
 config.name = 'LLVM-Unit'
 
@@ -43,9 +45,9 @@ if config.test_exec_root is None:
     # out-of-tree build situation).
 
     # Check for 'llvm_unit_site_config' user parameter, and use that if available.
-    site_cfg = lit.params.get('llvm_unit_site_config', None)
+    site_cfg = lit_config.params.get('llvm_unit_site_config', None)
     if site_cfg and os.path.exists(site_cfg):
-        lit.load_config(config, site_cfg)
+        lit_config.load_config(config, site_cfg)
         raise SystemExit
 
     # Try to detect the situation where we are using an out-of-tree build by
@@ -58,7 +60,7 @@ if config.test_exec_root is None:
 
     llvm_config = lit.util.which('llvm-config', config.environment['PATH'])
     if not llvm_config:
-        lit.fatal('No site specific configuration available!')
+        lit_config.fatal('No site specific configuration available!')
 
     # Get the source and object roots.
     llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip()
@@ -67,16 +69,16 @@ if config.test_exec_root is None:
     # Validate that we got a tree which points to here.
     this_src_root = os.path.join(os.path.dirname(__file__),'..','..')
     if os.path.realpath(llvm_src_root) != os.path.realpath(this_src_root):
-        lit.fatal('No site specific configuration available!')
+        lit_config.fatal('No site specific configuration available!')
 
     # Check that the site specific configuration exists.
     site_cfg = os.path.join(llvm_obj_root, 'test', 'Unit', 'lit.site.cfg')
     if not os.path.exists(site_cfg):
-        lit.fatal('No site specific configuration available!')
+        lit_config.fatal('No site specific configuration available!')
 
     # Okay, that worked. Notify the user of the automagic, and reconfigure.
-    lit.note('using out-of-tree build at %r' % llvm_obj_root)
-    lit.load_config(config, site_cfg)
+    lit_config.note('using out-of-tree build at %r' % llvm_obj_root)
+    lit_config.load_config(config, site_cfg)
     raise SystemExit
 
 # If necessary, point the dynamic loader at libLLVM.so.
diff --git a/test/Unit/lit.site.cfg.in b/test/Unit/lit.site.cfg.in
index 65e98d0..7ff8155 100644
--- a/test/Unit/lit.site.cfg.in
+++ b/test/Unit/lit.site.cfg.in
@@ -1,3 +1,5 @@
+import sys
+
 ## Autogenerated by LLVM/Clang configuration.
 # Do not edit!
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
@@ -11,11 +13,12 @@ config.shlibpath_var = "@SHLIBPATH_VAR@"
 # Support substitution of the tools_dir and build_mode with user parameters.
 # This is used when we can't determine the tool dir at configuration time.
 try:
-    config.llvm_tools_dir = config.llvm_tools_dir % lit.params
-    config.llvm_build_mode = config.llvm_build_mode % lit.params
-except KeyError,e:
+    config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params
+    config.llvm_build_mode = config.llvm_build_mode % lit_config.params
+except KeyError:
+    e = sys.exc_info()[1]
     key, = e.args
-    lit.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key))
+    lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key))
 
 # Let the main config do the real work.
-lit.load_config(config, "@LLVM_SOURCE_DIR@/test/Unit/lit.cfg")
+lit_config.load_config(config, "@LLVM_SOURCE_DIR@/test/Unit/lit.cfg")
diff --git a/test/Verifier/ident-meta1.ll b/test/Verifier/ident-meta1.ll
new file mode 100644
index 0000000..fb247a8
--- /dev/null
+++ b/test/Verifier/ident-meta1.ll
@@ -0,0 +1,12 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; Verify that llvm.ident is properly structured.
+; llvm.ident takes a list of metadata entries.
+; Each metadata entry can have only one string.
+
+!llvm.ident = !{!0, !1}
+!0 = metadata !{metadata !"version string"}
+!1 = metadata !{metadata !"string1", metadata !"string2"}
+; CHECK: assembly parsed, but does not verify as correct!
+; CHECK-NEXT: incorrect number of operands in llvm.ident metadata
+; CHECK-NEXT: metadata !1
+
diff --git a/test/Verifier/ident-meta2.ll b/test/Verifier/ident-meta2.ll
new file mode 100644
index 0000000..e86f18a
--- /dev/null
+++ b/test/Verifier/ident-meta2.ll
@@ -0,0 +1,13 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; Verify that llvm.ident is properly structured.
+; llvm.ident takes a list of metadata entries.
+; Each metadata entry can contain one string only.
+
+!llvm.ident = !{!0, !1, !2, !3}
+!0 = metadata !{metadata !"str1"}
+!1 = metadata !{metadata !"str2"}
+!2 = metadata !{metadata !"str3"}
+!3 = metadata !{i32 1}
+; CHECK: assembly parsed, but does not verify as correct!
+; CHECK-NEXT: invalid value for llvm.ident metadata entry operand(the operand should be a string)
+; CHECK-NEXT: i32 1
diff --git a/test/Verifier/ident-meta3.ll b/test/Verifier/ident-meta3.ll
new file mode 100644
index 0000000..a847b46
--- /dev/null
+++ b/test/Verifier/ident-meta3.ll
@@ -0,0 +1,10 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+; Verify that llvm.ident is properly structured.
+; llvm.ident takes a list of metadata entries.
+; Each metadata entry can contain one string only.
+
+!llvm.ident = !{!0}
+!0 = metadata !{metadata !{metadata !"nested metadata"}}
+; CHECK: assembly parsed, but does not verify as correct!
+; CHECK-NEXT: invalid value for llvm.ident metadata entry operand(the operand should be a string)
+; CHECK-NEXT: metadata !1
diff --git a/test/Verifier/lit.local.cfg b/test/Verifier/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/Verifier/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Verifier/varargs-intrinsic.ll b/test/Verifier/varargs-intrinsic.ll
new file mode 100644
index 0000000..f6d0a70
--- /dev/null
+++ b/test/Verifier/varargs-intrinsic.ll
@@ -0,0 +1,16 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare void @llvm.experimental.stackmap(i32, i32)
+declare void @llvm.donothing(...)
+
+define void @foo1() {
+  call void @llvm.experimental.stackmap(i32 0, i32 12)
+; CHECK: Callsite was not defined with variable arguments!
+  ret void
+}
+
+define void @foo2() {
+  call void (...)* @llvm.donothing(i32 0, i64 1)
+; CHECK: Intrinsic was not defined with variable arguments!
+  ret void
+}
diff --git a/test/YAMLParser/spec-02-24.data b/test/YAMLParser/spec-02-24.data
index 01ca7f5..56b25cb 100644
--- a/test/YAMLParser/spec-02-24.data
+++ b/test/YAMLParser/spec-02-24.data
@@ -1,4 +1,4 @@
-# RUN: yaml-bench -canonical %s
+# RUN: yaml-bench -canonical %s | FileCheck %s
 
 %TAG ! tag:clarkevans.com,2002:
 --- !shape
@@ -14,3 +14,8 @@
   start: *ORIGIN
   color: 0xFFEEBB
   text: Pretty vector drawing.
+
+#CHECK: !<tag:clarkevans.com,2002:shape>
+#CHECK:   !<tag:clarkevans.com,2002:circle>
+#CHECK:   !<tag:clarkevans.com,2002:line>
+#CHECK:   !<tag:clarkevans.com,2002:label>
diff --git a/test/YAMLParser/spec-07-04.data b/test/YAMLParser/spec-07-04.data
index beba7d0..2c8b2ec 100644
--- a/test/YAMLParser/spec-07-04.data
+++ b/test/YAMLParser/spec-07-04.data
@@ -1,5 +1,7 @@
-# RUN: yaml-bench -canonical %s
+# RUN: yaml-bench -canonical %s | FileCheck %s
 
 %TAG !yaml! tag:yaml.org,2002:
 ---
 !yaml!str "foo"
+
+#CHECK: !!str "foo"
diff --git a/test/YAMLParser/yaml.data b/test/YAMLParser/yaml.data
index 3ce5e4b..4f9b294 100644
--- a/test/YAMLParser/yaml.data
+++ b/test/YAMLParser/yaml.data
@@ -1,5 +1,11 @@
-# RUN: yaml-bench -canonical %s
+# RUN: yaml-bench -canonical %s | FileCheck %s
 
 - !!yaml '!'
 - !!yaml '&'
 - !!yaml '*'
+
+# CHECK: !!seq [
+# CHECK:   !!yaml "!",
+# CHECK:   !!yaml "&",
+# CHECK:   !!yaml "*",
+# CHECK: ]
diff --git a/test/lit.cfg b/test/lit.cfg
index c018674..df1f4a1 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -7,6 +7,9 @@ import sys
 import re
 import platform
 
+import lit.util
+import lit.formats
+
 # name: The name of this test suite.
 config.name = 'LLVM'
 
@@ -14,9 +17,9 @@ config.name = 'LLVM'
 if sys.platform in ['win32']:
     # Seek sane tools in directories and set to $PATH.
     path = getattr(config, 'lit_tools_dir', None)
-    path = lit.getToolsPath(path,
-                            config.environment['PATH'],
-                            ['cmp.exe', 'grep.exe', 'sed.exe'])
+    path = lit_config.getToolsPath(path,
+                                   config.environment['PATH'],
+                                   ['cmp.exe', 'grep.exe', 'sed.exe'])
     if path is not None:
         path = os.path.pathsep.join((path,
                                      config.environment['PATH']))
@@ -36,17 +39,14 @@ else:
 # testFormat: The test format to use to interpret tests.
 config.test_format = lit.formats.ShTest(execute_external)
 
-# To ignore test output on stderr so it doesn't trigger failures uncomment this:
-#config.test_format = lit.formats.TclTest(ignoreStdErr=True)
-
-# suffixes: A list of file extensions to treat as test files, this is actually
-# set by on_clone().
-config.suffixes = []
+# suffixes: A list of file extensions to treat as test files. This is overriden
+# by individual lit.local.cfg files in the test subdirectories.
+config.suffixes = ['.ll', '.c', '.cpp', '.test', '.txt', '.s']
 
 # excludes: A list of directories to exclude from the testsuite. The 'Inputs'
 # subdirectories contain auxiliary inputs for various tests in their parent
 # directories.
-config.excludes = ['Inputs']
+config.excludes = ['Inputs', 'CMakeLists.txt', 'README.txt', 'LICENSE.txt']
 
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)
@@ -60,7 +60,7 @@ if llvm_obj_root is not None:
 if llvm_obj_root is not None:
     llvm_tools_dir = getattr(config, 'llvm_tools_dir', None)
     if not llvm_tools_dir:
-        lit.fatal('No LLVM tools dir set!')
+        lit_config.fatal('No LLVM tools dir set!')
     path = os.path.pathsep.join((llvm_tools_dir, config.environment['PATH']))
     config.environment['PATH'] = path
 
@@ -107,9 +107,9 @@ if config.test_exec_root is None:
     # out-of-tree build situation).
 
     # Check for 'llvm_site_config' user parameter, and use that if available.
-    site_cfg = lit.params.get('llvm_site_config', None)
+    site_cfg = lit_config.params.get('llvm_site_config', None)
     if site_cfg and os.path.exists(site_cfg):
-        lit.load_config(config, site_cfg)
+        lit_config.load_config(config, site_cfg)
         raise SystemExit
 
     # Try to detect the situation where we are using an out-of-tree build by
@@ -122,7 +122,7 @@ if config.test_exec_root is None:
 
     llvm_config = lit.util.which('llvm-config', config.environment['PATH'])
     if not llvm_config:
-        lit.fatal('No site specific configuration available!')
+        lit_config.fatal('No site specific configuration available!')
 
     # Get the source and object roots.
     llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip()
@@ -131,16 +131,16 @@ if config.test_exec_root is None:
     # Validate that we got a tree which points to here.
     this_src_root = os.path.dirname(config.test_source_root)
     if os.path.realpath(llvm_src_root) != os.path.realpath(this_src_root):
-        lit.fatal('No site specific configuration available!')
+        lit_config.fatal('No site specific configuration available!')
 
     # Check that the site specific configuration exists.
     site_cfg = os.path.join(llvm_obj_root, 'test', 'lit.site.cfg')
     if not os.path.exists(site_cfg):
-        lit.fatal('No site specific configuration available!')
+        lit_config.fatal('No site specific configuration available!')
 
     # Okay, that worked. Notify the user of the automagic, and reconfigure.
-    lit.note('using out-of-tree build at %r' % llvm_obj_root)
-    lit.load_config(config, site_cfg)
+    lit_config.note('using out-of-tree build at %r' % llvm_obj_root)
+    lit_config.load_config(config, site_cfg)
     raise SystemExit
 
 ###
@@ -169,11 +169,11 @@ else:
 config.substitutions.append( ('%defaultjit', '-use-mcjit='+defaultIsMCJIT) )
 
 # Process jit implementation option
-jit_impl_cfg = lit.params.get('jit_impl', None)
+jit_impl_cfg = lit_config.params.get('jit_impl', None)
 if jit_impl_cfg == 'mcjit':
   # When running with mcjit, mangle -mcjit into target triple
   # and add -use-mcjit flag to lli invocation
-  if 'i686' in config.target_triple:
+  if 'i386' in config.target_triple or 'i686' in config.target_triple:
     config.target_triple += jit_impl_cfg + '-ia32'
   elif 'x86_64' in config.target_triple:
     config.target_triple += jit_impl_cfg + '-ia64'
@@ -204,32 +204,53 @@ else:
 # Regex to reject matching a hyphen
 NOHYPHEN = r"(?<!-)"
 
-for pattern in [r"\bbugpoint\b(?!-)",   r"(?<!/|-)\bclang\b(?!-)",
+for pattern in [r"\bbugpoint\b(?!-)",
+                r"(?<!/|-)\bclang\b(?!-)",
                 r"\bgold\b",
                 # Match llc but not -llc
                 NOHYPHEN + r"\bllc\b",
                 r"\blli\b",
-                r"\bllvm-ar\b",         r"\bllvm-as\b",
-                r"\bllvm-bcanalyzer\b", r"\bllvm-config\b",
-                r"\bllvm-cov\b",        r"\bllvm-diff\b",
-                r"\bllvm-dis\b",        r"\bllvm-dwarfdump\b",
-                r"\bllvm-extract\b",    r"\bllvm-jistlistener\b",
-                r"\bllvm-link\b",       r"\bllvm-mc\b",
-                r"\bllvm-nm\b",         r"\bllvm-objdump\b",
-                r"\bllvm-prof\b",       r"\bllvm-size\b",
-                r"\bllvm-rtdyld\b",     r"\bllvm-shlib\b",
+                r"\bllvm-PerfectShuffle\b",
+                r"\bllvm-ar\b",
+                r"\bllvm-as\b",
+                r"\bllvm-bcanalyzer\b",
+                r"\bllvm-config\b",
+                r"\bllvm-cov\b",
+                r"\bllvm-diff\b",
+                r"\bllvm-dis\b",
+                r"\bllvm-dwarfdump\b",
+                r"\bllvm-extract\b",
+                r"\bllvm-jistlistener\b",
+                r"\bllvm-link\b",
+                r"\bllvm-lto\b",
+                r"\bllvm-mc\b",
+                r"\bllvm-mcmarkup\b",
+                r"\bllvm-nm\b",
+                r"\bllvm-objdump\b",
+                r"\bllvm-ranlib\b",
+                r"\bllvm-readobj\b",
+                r"\bllvm-rtdyld\b",
+                r"\bllvm-shlib\b",
+                r"\bllvm-size\b",
+                r"\bllvm-tblgen\b",
+                r"\bllvm-c-test\b",
                 # Match llvmc but not -llvmc
                 NOHYPHEN + r"\bllvmc\b",
-                r"\blto\b",
-                                        # Don't match '.opt', '-opt',
-                                        # '^opt' or '/opt'.
-                r"\bmacho-dump\b",      r"(?<!\.|-|\^|/)\bopt\b",
-                r"\bllvm-tblgen\b",     r"\bFileCheck\b",
-                r"\bFileUpdate\b",      r"\bc-index-test\b",
-                r"\bfpcmp\b",           r"\bllvm-PerfectShuffle\b",
+                # Match lto but not -lto
+                NOHYPHEN + r"\blto\b",
+                r"\bmacho-dump\b",
+                # Don't match '.opt', '-opt', '^opt' or '/opt'.
+                r"(?<!\.|-|\^|/)\bopt\b",
+                r"\bFileCheck\b",
+                r"\bFileUpdate\b",
+                r"\bc-index-test\b",
+                r"\bfpcmp\b",
+                r"\bobj2yaml\b",
+                r"\byaml2obj\b",
                 # Handle these specially as they are strings searched
                 # for during testing.
-                r"\| \bcount\b",         r"\| \bnot\b"]:
+                r"\| \bcount\b",
+                r"\| \bnot\b"]:
     # Extract the tool name from the pattern.  This relies on the tool
     # name being surrounded by \b word match operators.  If the
     # pattern starts with "| ", include it in the string to be
@@ -278,35 +299,53 @@ if not 'hexagon' in config.target_triple:
 if config.have_zlib == "1":
     config.available_features.add("zlib")
 
+# Native compilation: host arch == target arch
+# FIXME: Consider cases that target can be executed
+# even if host_triple were different from target_triple.
+if config.host_triple == config.target_triple:
+    config.available_features.add("native")
+
 # llc knows whether he is compiled with -DNDEBUG.
 import subprocess
 try:
     llc_cmd = subprocess.Popen([os.path.join(llvm_tools_dir, 'llc'), '-version'],
                            stdout = subprocess.PIPE)
-except OSError, why:
-    print "Could not find llc in " + llvm_tools_dir
+except OSError:
+    print("Could not find llc in " + llvm_tools_dir)
     exit(42)
 
-if re.search(r'with assertions', llc_cmd.stdout.read()):
+if re.search(r'with assertions', llc_cmd.stdout.read().decode('ascii')):
     config.available_features.add('asserts')
 llc_cmd.wait()
 
+if 'darwin' == sys.platform:
+    try:
+        sysctl_cmd = subprocess.Popen(['sysctl', 'hw.optional.fma'],
+                                    stdout = subprocess.PIPE)
+    except OSError:
+        print("Could not exec sysctl")
+    result = sysctl_cmd.stdout.read().decode('ascii')
+    if -1 != result.find("hw.optional.fma: 1"):
+        config.available_features.add('fma3')
+    sysctl_cmd.wait()
+
 # Check if we should use gmalloc.
-use_gmalloc_str = lit.params.get('use_gmalloc', None)
+use_gmalloc_str = lit_config.params.get('use_gmalloc', None)
 if use_gmalloc_str is not None:
     if use_gmalloc_str.lower() in ('1', 'true'):
         use_gmalloc = True
     elif use_gmalloc_str.lower() in ('', '0', 'false'):
         use_gmalloc = False
     else:
-        lit.fatal('user parameter use_gmalloc should be 0 or 1')
+        lit_config.fatal('user parameter use_gmalloc should be 0 or 1')
 else:
     # Default to not using gmalloc
     use_gmalloc = False
 
 # Allow use of an explicit path for gmalloc library.
 # Will default to '/usr/lib/libgmalloc.dylib' if not set.
-gmalloc_path_str = lit.params.get('gmalloc_path', '/usr/lib/libgmalloc.dylib')
+gmalloc_path_str = lit_config.params.get('gmalloc_path',
+                                         '/usr/lib/libgmalloc.dylib')
 
 if use_gmalloc:
      config.environment.update({'DYLD_INSERT_LIBRARIES' : gmalloc_path_str})
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 1ae99eb..72fd9c9 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -1,3 +1,5 @@
+import sys
+
 ## Autogenerated by LLVM/Clang configuration.
 # Do not edit!
 config.host_triple = "@LLVM_HOST_TRIPLE@"
@@ -12,7 +14,6 @@ config.python_executable = "@PYTHON_EXECUTABLE@"
 config.ocamlopt_executable = "@OCAMLOPT@"
 config.enable_shared = @ENABLE_SHARED@
 config.enable_assertions = @ENABLE_ASSERTIONS@
-config.lto_is_enabled = "@LTO_IS_ENABLED@"
 config.targets_to_build = "@TARGETS_TO_BUILD@"
 config.llvm_bindings = "@LLVM_BINDINGS@"
 config.host_os = "@HOST_OS@"
@@ -24,10 +25,11 @@ config.have_zlib = "@HAVE_LIBZ@"
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
 try:
-    config.llvm_tools_dir = config.llvm_tools_dir % lit.params
-except KeyError,e:
+    config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params
+except KeyError:
+    e = sys.exc_info()[1]
     key, = e.args
-    lit.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key))
+    lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key))
 
 # Let the main config do the real work.
-lit.load_config(config, "@LLVM_SOURCE_DIR@/test/lit.cfg")
+lit_config.load_config(config, "@LLVM_SOURCE_DIR@/test/lit.cfg")
diff --git a/test/tools/llvm-cov/Inputs/README b/test/tools/llvm-cov/Inputs/README
new file mode 100644
index 0000000..2cfb191
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/README
@@ -0,0 +1,7 @@
+These inputs were pre-generated to allow for easier testing of llvm-cov.
+
+test.gcno and test.gcda were create by running clang:
+  clang++ -g -ftest-coverage -fprofile-arcs test.cpp
+
+test.cpp.gcov was created by running gcov 4.2.1:
+  gcov test.cpp
diff --git a/test/tools/llvm-cov/Inputs/test.cpp b/test/tools/llvm-cov/Inputs/test.cpp
new file mode 100644
index 0000000..07bc3f2
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/test.cpp
@@ -0,0 +1,77 @@
+#include <cstdlib>
+
+bool on = false;
+int len = 42;
+double grid[10][10] = {0};
+const char * hello = "world";
+const char * world = "hello";
+
+struct A {
+  virtual void B();
+};
+
+void A::B() {}
+
+void useless() {}
+
+double more_useless() {
+  return 0;
+}
+
+int foo() {
+  on = true;
+  return 3;
+}
+
+int bar() {
+  len--;
+  return foo() + 45;
+}
+
+void assign(int ii, int jj) {
+  grid[ii][jj] = (ii+1) * (jj+1);
+}
+
+void initialize_grid() {
+  for (int ii = 0; ii < 2; ii++)
+    for (int jj = 0; jj < 2; jj++)
+      assign(ii, jj);
+}
+
+int main() {
+  initialize_grid();
+
+  int a = 2;
+  on = rand() % 2;
+  if (on) {
+    foo();
+    ++a;
+  } else {
+    bar();
+    a += rand();
+  }
+
+  for (int ii = 0; ii < 10; ++ii) {
+    switch (rand() % 5) {
+      case 0:
+        a += rand();
+        break;
+      case 1:
+      case 2:
+        a += rand() / rand();
+        break;
+      case 3:
+        a -= rand();
+        break;
+      default:
+        a = -1;
+    }
+  }
+
+  A thing;
+  for (uint64_t ii = 0; ii < 4294967296; ++ii)
+    thing.B();
+
+  return a + 8 + grid[2][3] + len;
+  return more_useless();
+}
diff --git a/test/tools/llvm-cov/Inputs/test.cpp.gcov b/test/tools/llvm-cov/Inputs/test.cpp.gcov
new file mode 100644
index 0000000..a3dacc2
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/test.cpp.gcov
@@ -0,0 +1,82 @@
+        -:    0:Source:test.cpp
+        -:    0:Graph:test.gcno
+        -:    0:Data:test.gcda
+        -:    0:Runs:2
+        -:    0:Programs:1
+        -:    1:#include <cstdlib>
+        -:    2:
+        -:    3:bool on = false;
+        -:    4:int len = 42;
+        -:    5:double grid[10][10] = {0};
+        -:    6:const char * hello = "world";
+        -:    7:const char * world = "hello";
+        -:    8:
+        4:    9:struct A {
+        -:   10:  virtual void B();
+        -:   11:};
+        -:   12:
+8589934592:   13:void A::B() {}
+        -:   14:
+    #####:   15:void useless() {}
+        -:   16:
+        -:   17:double more_useless() {
+    #####:   18:  return 0;
+        -:   19:}
+        -:   20:
+        -:   21:int foo() {
+        2:   22:  on = true;
+        2:   23:  return 3;
+        -:   24:}
+        -:   25:
+        -:   26:int bar() {
+    #####:   27:  len--;
+    #####:   28:  return foo() + 45;
+        -:   29:}
+        -:   30:
+        8:   31:void assign(int ii, int jj) {
+        8:   32:  grid[ii][jj] = (ii+1) * (jj+1);
+        8:   33:}
+        -:   34:
+        -:   35:void initialize_grid() {
+       12:   36:  for (int ii = 0; ii < 2; ii++)
+       24:   37:    for (int jj = 0; jj < 2; jj++)
+       12:   38:      assign(ii, jj);
+        2:   39:}
+        -:   40:
+        -:   41:int main() {
+        2:   42:  initialize_grid();
+        -:   43:
+        2:   44:  int a = 2;
+        2:   45:  on = rand() % 2;
+        2:   46:  if (on) {
+        2:   47:    foo();
+        2:   48:    ++a;
+        2:   49:  } else {
+    #####:   50:    bar();
+    #####:   51:    a += rand();
+        -:   52:  }
+        -:   53:
+       44:   54:  for (int ii = 0; ii < 10; ++ii) {
+       20:   55:    switch (rand() % 5) {
+        -:   56:      case 0:
+        4:   57:        a += rand();
+        4:   58:        break;
+        -:   59:      case 1:
+        -:   60:      case 2:
+        2:   61:        a += rand() / rand();
+        2:   62:        break;
+        -:   63:      case 3:
+        6:   64:        a -= rand();
+        6:   65:        break;
+        -:   66:      default:
+        8:   67:        a = -1;
+        8:   68:    }
+       20:   69:  }
+        -:   70:
+        2:   71:  A thing;
+17179869188:   72:  for (uint64_t ii = 0; ii < 4294967296; ++ii)
+8589934592:   73:    thing.B();
+        -:   74:
+        2:   75:  return a + 8 + grid[2][3] + len;
+        -:   76:  return more_useless();
+        -:   77:}
diff --git a/test/tools/llvm-cov/Inputs/test.gcda b/test/tools/llvm-cov/Inputs/test.gcda
new file mode 100644
index 0000000..23d03bd
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/test.gcda
diff --git a/test/tools/llvm-cov/Inputs/test.gcno b/test/tools/llvm-cov/Inputs/test.gcno
new file mode 100644
index 0000000..6162604
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/test.gcno
diff --git a/test/tools/llvm-cov/Inputs/test_read_fail.gcno b/test/tools/llvm-cov/Inputs/test_read_fail.gcno
new file mode 100644
index 0000000..63b5d71
--- /dev/null
+++ b/test/tools/llvm-cov/Inputs/test_read_fail.gcno
diff --git a/test/tools/llvm-readobj/lit.local.cfg b/test/tools/llvm-cov/lit.local.cfg
index df9b335..df9b335 100644
--- a/test/tools/llvm-readobj/lit.local.cfg
+++ b/test/tools/llvm-cov/lit.local.cfg
diff --git a/test/tools/llvm-cov/llvm-cov.test b/test/tools/llvm-cov/llvm-cov.test
new file mode 100644
index 0000000..28738a7
--- /dev/null
+++ b/test/tools/llvm-cov/llvm-cov.test
@@ -0,0 +1,10 @@
+RUN: cd %p/Inputs
+# "cd" is unsupported in lit internal runner.
+REQUIRES: shell
+
+RUN: llvm-cov -gcno=test.gcno -gcda=test.gcda \
+RUN:   | diff -aub test.cpp.gcov -
+
+RUN: not llvm-cov -gcno=test_read_fail.gcno -gcda=test.gcda
+
+XFAIL: powerpc64, s390x
diff --git a/test/tools/llvm-lit/lit.local.cfg b/test/tools/llvm-lit/lit.local.cfg
deleted file mode 100644
index 856a549..0000000
--- a/test/tools/llvm-lit/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.c']
diff --git a/test/tools/llvm-objdump/Inputs/nop.exe.coff-i386 b/test/tools/llvm-objdump/Inputs/nop.exe.coff-i386
new file mode 100644
index 0000000..68c9d3d
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/nop.exe.coff-i386
diff --git a/test/tools/llvm-objdump/Inputs/trivial.obj.elf-i386 b/test/tools/llvm-objdump/Inputs/trivial.obj.elf-i386
new file mode 100644
index 0000000..fdc4874
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/trivial.obj.elf-i386
diff --git a/test/tools/llvm-objdump/Inputs/win64-unwind.exe.coff-x86_64 b/test/tools/llvm-objdump/Inputs/win64-unwind.exe.coff-x86_64
new file mode 100644
index 0000000..63460e7
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/win64-unwind.exe.coff-x86_64
diff --git a/test/tools/llvm-objdump/Inputs/win64-unwind.exe.coff-x86_64.asm b/test/tools/llvm-objdump/Inputs/win64-unwind.exe.coff-x86_64.asm
new file mode 100644
index 0000000..4d47fa4
--- /dev/null
+++ b/test/tools/llvm-objdump/Inputs/win64-unwind.exe.coff-x86_64.asm
@@ -0,0 +1,53 @@
+    .text
+    .globl func
+    .def func; .scl 2; .type 32; .endef
+    .seh_proc func
+func:
+    .seh_pushframe @code
+    subq $24, %rsp
+    .seh_stackalloc 24
+    movq %rsi, 16(%rsp)
+    .seh_savereg %rsi, 16
+    movups %xmm8, (%rsp)
+    .seh_savexmm %xmm8, 0
+    pushq %rbx
+    .seh_pushreg 3
+    mov %rsp, %rbx
+    .seh_setframe 3, 0
+    .seh_endprologue
+    .seh_handler __C_specific_handler, @except
+    .seh_handlerdata
+    .long 0
+    .text
+    .seh_startchained
+    .seh_endprologue
+    .seh_endchained
+    lea (%rbx), %rsp
+    pop %rbx
+    addq $24, %rsp
+    ret
+    .seh_endproc
+
+// Test emission of small functions.
+    .globl smallFunc
+    .def smallFunc; .scl 2; .type 32; .endef
+    .seh_proc smallFunc
+smallFunc:
+    ret
+    .seh_endproc
+
+// Function with big stack allocation.
+    .globl allocFunc
+    .def allocFunc; .scl 2; .type 32; .endef
+    .seh_proc allocFunc
+allocFunc:
+    .seh_pushframe @code
+    subq $65520, %rsp
+    .seh_stackalloc 65520
+    sub $8454128, %rsp
+    .seh_stackalloc 8454128
+    .seh_endprologue
+    add $8454128, %rsp
+    addq $65520, %rsp
+    ret
+    .seh_endproc
diff --git a/test/tools/llvm-objdump/coff-private-headers.test b/test/tools/llvm-objdump/coff-private-headers.test
new file mode 100644
index 0000000..d36c148
--- /dev/null
+++ b/test/tools/llvm-objdump/coff-private-headers.test
@@ -0,0 +1,9 @@
+// RUN: llvm-objdump -p %p/Inputs/nop.exe.coff-i386 | FileCheck %s
+
+CHECK:       The Import Tables:
+CHECK-NEXT:  lookup 00005028 time 00000000 fwd 00000000 name 00005096 addr 00005058
+CHECK:       DLL Name: KERNEL32.dll
+CHECK-NEXT:     Hint/Ord  Name
+CHECK-NEXT:          365  ExitProcess
+
+
diff --git a/test/tools/llvm-objdump/disassembly-show-raw.s b/test/tools/llvm-objdump/disassembly-show-raw.s
deleted file mode 100644
index 32fcad4..0000000
--- a/test/tools/llvm-objdump/disassembly-show-raw.s
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: llvm-mc -filetype=obj -arch=x86 %s | llvm-objdump -d - \
-// RUN:                                    | FileCheck %s -check-prefix=WITHRAW
-// RUN: llvm-mc -filetype=obj -arch=x86 %s | llvm-objdump -d -no-show-raw-insn - \
-// RUN:                                    | FileCheck %s -check-prefix=NORAW
-
-// Expect to find the raw incoding when run with raw output (default), but not
-// when run explicitly with -no-show-raw-insn
-
-movl 0, %eax
-// WITHRAW: a1 00 00 00 00 movl
-
-// NORAW: movl
-// NORAW-NOT: a1 00
-
-
diff --git a/test/tools/llvm-objdump/disassembly-show-raw.test b/test/tools/llvm-objdump/disassembly-show-raw.test
new file mode 100644
index 0000000..e9956a5
--- /dev/null
+++ b/test/tools/llvm-objdump/disassembly-show-raw.test
@@ -0,0 +1,14 @@
+// RUN: llvm-objdump -d %p/Inputs/trivial.obj.elf-i386 \
+// RUN:     | FileCheck %s -check-prefix=WITHRAW
+// RUN: llvm-objdump -d -no-show-raw-insn %p/Inputs/trivial.obj.elf-i386 \
+// RUN:     | FileCheck %s -check-prefix=NORAW
+
+// Expect to find the raw incoding when run with raw output (default), but not
+// when run explicitly with -no-show-raw-insn
+
+WITHRAW: a1 00 00 00 00 movl
+
+NORAW: movl
+NORAW-NOT: a1 00
+
+
diff --git a/test/tools/llvm-objdump/lit.local.cfg b/test/tools/llvm-objdump/lit.local.cfg
index 56bf008..19840aa 100644
--- a/test/tools/llvm-objdump/lit.local.cfg
+++ b/test/tools/llvm-objdump/lit.local.cfg
@@ -1,6 +1,3 @@
-config.suffixes = ['.ll', '.s']
-
 targets = set(config.root.targets_to_build.split())
 if not 'X86' in targets:
     config.unsupported = True
-
diff --git a/test/tools/llvm-objdump/win64-unwind-data.s b/test/tools/llvm-objdump/win64-unwind-data.s
deleted file mode 100644
index 1e4c742..0000000
--- a/test/tools/llvm-objdump/win64-unwind-data.s
+++ /dev/null
@@ -1,106 +0,0 @@
-// This test checks that the unwind data is dumped by llvm-objdump.
-// RUN: llvm-mc -triple x86_64-pc-win32 -filetype=obj %s | llvm-objdump -u - | FileCheck %s
-
-// CHECK:      Unwind info:
-// CHECK:      Function Table:
-// CHECK-NEXT: Start Address: .text
-// CHECK-NEXT: End Address: .text + 0x001b
-// CHECK-NEXT: Unwind Info Address: .xdata
-// CHECK-NEXT: Version: 1
-// CHECK-NEXT: Flags: 1 UNW_ExceptionHandler
-// CHECK-NEXT: Size of prolog: 18
-// CHECK-NEXT: Number of Codes: 8
-// CHECK-NEXT: Frame register: RBX
-// CHECK-NEXT: Frame offset: 0
-// CHECK-NEXT: Unwind Codes:
-// CHECK-NEXT: 0x00: UOP_SetFPReg
-// CHECK-NEXT: 0x0f: UOP_PushNonVol RBX
-// CHECK-NEXT: 0x0e: UOP_SaveXMM128 XMM8 [0x0000]
-// CHECK-NEXT: 0x09: UOP_SaveNonVol RSI [0x0010]
-// CHECK-NEXT: 0x04: UOP_AllocSmall 24
-// CHECK-NEXT: 0x00: UOP_PushMachFrame w/o error code
-// CHECK:      Function Table:
-// CHECK-NEXT: Start Address: .text + 0x0012
-// CHECK-NEXT: End Address: .text + 0x0012
-// CHECK-NEXT: Unwind Info Address: .xdata + 0x001c
-// CHECK-NEXT: Version: 1
-// CHECK-NEXT: Flags: 4 UNW_ChainInfo
-// CHECK-NEXT: Size of prolog: 0
-// CHECK-NEXT: Number of Codes: 0
-// CHECK-NEXT: No frame pointer used
-// CHECK:      Function Table:
-// CHECK-NEXT: Start Address: .text + 0x001b
-// CHECK-NEXT: End Address: .text + 0x001c
-// CHECK-NEXT: Unwind Info Address: .xdata + 0x002c
-// CHECK-NEXT: Version: 1
-// CHECK-NEXT: Flags: 0
-// CHECK-NEXT: Size of prolog: 0
-// CHECK-NEXT: Number of Codes: 0
-// CHECK-NEXT: No frame pointer used
-// CHECK:      Function Table:
-// CHECK-NEXT: Start Address: .text + 0x001c
-// CHECK-NEXT: End Address: .text + 0x0039
-// CHECK-NEXT: Unwind Info Address: .xdata + 0x0034
-// CHECK-NEXT: Version: 1
-// CHECK-NEXT: Flags: 0
-// CHECK-NEXT: Size of prolog: 14
-// CHECK-NEXT: Number of Codes: 6
-// CHECK-NEXT: No frame pointer used
-// CHECK-NEXT: Unwind Codes:
-// CHECK-NEXT: 0x0e: UOP_AllocLarge 8454128
-// CHECK-NEXT: 0x07: UOP_AllocLarge 8190
-// CHECK-NEXT: 0x00: UOP_PushMachFrame w/o error code
-
-    .text
-    .globl func
-    .def func; .scl 2; .type 32; .endef
-    .seh_proc func
-func:
-    .seh_pushframe @code
-    subq $24, %rsp
-    .seh_stackalloc 24
-    movq %rsi, 16(%rsp)
-    .seh_savereg %rsi, 16
-    movups %xmm8, (%rsp)
-    .seh_savexmm %xmm8, 0
-    pushq %rbx
-    .seh_pushreg 3
-    mov %rsp, %rbx
-    .seh_setframe 3, 0
-    .seh_endprologue
-    .seh_handler __C_specific_handler, @except
-    .seh_handlerdata
-    .long 0
-    .text
-    .seh_startchained
-    .seh_endprologue
-    .seh_endchained
-    lea (%rbx), %rsp
-    pop %rbx
-    addq $24, %rsp
-    ret
-    .seh_endproc
-
-// Test emission of small functions.
-    .globl smallFunc
-    .def smallFunc; .scl 2; .type 32; .endef
-    .seh_proc smallFunc
-smallFunc:
-    ret
-    .seh_endproc
-
-// Function with big stack allocation.
-    .globl smallFunc
-    .def allocFunc; .scl 2; .type 32; .endef
-    .seh_proc smallFunc
-allocFunc:
-    .seh_pushframe @code
-    subq $65520, %rsp
-    .seh_stackalloc 65520
-    sub $8454128, %rsp
-    .seh_stackalloc 8454128
-    .seh_endprologue
-    add $8454128, %rsp
-    addq $65520, %rsp
-    ret
-    .seh_endproc
diff --git a/test/tools/llvm-objdump/win64-unwind-data.test b/test/tools/llvm-objdump/win64-unwind-data.test
new file mode 100644
index 0000000..a723ffe
--- /dev/null
+++ b/test/tools/llvm-objdump/win64-unwind-data.test
@@ -0,0 +1,52 @@
+// This test checks that the unwind data is dumped by llvm-objdump.
+// RUN: llvm-objdump -u %p/Inputs/win64-unwind.exe.coff-x86_64 | FileCheck %s
+
+CHECK:      Unwind info:
+CHECK:      Function Table:
+CHECK-NEXT: Start Address: func
+CHECK-NEXT: End Address: func + 0x001b
+CHECK-NEXT: Unwind Info Address: .xdata
+CHECK-NEXT: Version: 1
+CHECK-NEXT: Flags: 1 UNW_ExceptionHandler
+CHECK-NEXT: Size of prolog: 18
+CHECK-NEXT: Number of Codes: 8
+CHECK-NEXT: Frame register: RBX
+CHECK-NEXT: Frame offset: 0
+CHECK-NEXT: Unwind Codes:
+CHECK-NEXT: 0x12: UOP_SetFPReg
+CHECK-NEXT: 0x0f: UOP_PushNonVol RBX
+CHECK-NEXT: 0x0e: UOP_SaveXMM128 XMM8 [0x0000]
+CHECK-NEXT: 0x09: UOP_SaveNonVol RSI [0x0010]
+CHECK-NEXT: 0x04: UOP_AllocSmall 24
+CHECK-NEXT: 0x00: UOP_PushMachFrame w/o error code
+CHECK:      Function Table:
+CHECK-NEXT: Start Address: func + 0x0012
+CHECK-NEXT: End Address: func + 0x0012
+CHECK-NEXT: Unwind Info Address: .xdata + 0x001c
+CHECK-NEXT: Version: 1
+CHECK-NEXT: Flags: 4 UNW_ChainInfo
+CHECK-NEXT: Size of prolog: 0
+CHECK-NEXT: Number of Codes: 0
+CHECK-NEXT: No frame pointer used
+CHECK:      Function Table:
+CHECK-NEXT: Start Address: smallFunc
+CHECK-NEXT: End Address: smallFunc + 0x0001
+CHECK-NEXT: Unwind Info Address: .xdata + 0x002c
+CHECK-NEXT: Version: 1
+CHECK-NEXT: Flags: 0
+CHECK-NEXT: Size of prolog: 0
+CHECK-NEXT: Number of Codes: 0
+CHECK-NEXT: No frame pointer used
+CHECK:      Function Table:
+CHECK-NEXT: Start Address: allocFunc
+CHECK-NEXT: End Address: allocFunc + 0x001d
+CHECK-NEXT: Unwind Info Address: .xdata + 0x0034
+CHECK-NEXT: Version: 1
+CHECK-NEXT: Flags: 0
+CHECK-NEXT: Size of prolog: 14
+CHECK-NEXT: Number of Codes: 6
+CHECK-NEXT: No frame pointer used
+CHECK-NEXT: Unwind Codes:
+CHECK-NEXT: 0x0e: UOP_AllocLarge 8454128
+CHECK-NEXT: 0x07: UOP_AllocLarge 8190
+CHECK-NEXT: 0x00: UOP_PushMachFrame w/o error code
diff --git a/test/tools/llvm-readobj/Inputs/dynamic-table.c b/test/tools/llvm-readobj/Inputs/dynamic-table.c
new file mode 100644
index 0000000..6d36e8a
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/dynamic-table.c
@@ -0,0 +1,7 @@
+// clang -target mipsel-linux-gnu -shared -fPIC -lc dynamic-table.c \
+//       -o dynamic-table.mips
+int puts(const char *);
+
+void foo(void) {
+  puts("Hello, World");
+}
diff --git a/test/tools/llvm-readobj/Inputs/dynamic-table.mips b/test/tools/llvm-readobj/Inputs/dynamic-table.mips
new file mode 100644
index 0000000..ab36cee
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/dynamic-table.mips
diff --git a/test/tools/llvm-readobj/Inputs/magic.coff-importlib b/test/tools/llvm-readobj/Inputs/magic.coff-importlib
new file mode 100644
index 0000000..b934afb
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/magic.coff-importlib
diff --git a/test/tools/llvm-readobj/Inputs/magic.coff-unknown b/test/tools/llvm-readobj/Inputs/magic.coff-unknown
new file mode 100644
index 0000000..7b3b461
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/magic.coff-unknown
diff --git a/test/tools/llvm-readobj/Inputs/rpath.exe.elf-x86_64 b/test/tools/llvm-readobj/Inputs/rpath.exe.elf-x86_64
new file mode 100644
index 0000000..8c01c50
--- /dev/null
+++ b/test/tools/llvm-readobj/Inputs/rpath.exe.elf-x86_64
diff --git a/test/tools/llvm-readobj/dynamic.test b/test/tools/llvm-readobj/dynamic.test
new file mode 100644
index 0000000..78a9b3b
--- /dev/null
+++ b/test/tools/llvm-readobj/dynamic.test
@@ -0,0 +1,33 @@
+RUN: llvm-readobj -dynamic-table %p/Inputs/dynamic-table.mips \
+RUN:     | FileCheck %s -check-prefix ELF-MIPS
+
+ELF-MIPS: Format: ELF32-mips
+ELF-MIPS: Arch: mipsel
+ELF-MIPS: AddressSize: 32bit
+ELF-MIPS: LoadName:
+ELF-MIPS: DynamicSection [ (23 entries)
+ELF-MIPS:   Tag        Type                 Name/Value
+ELF-MIPS:   0x00000001 NEEDED               SharedLibrary (libc.so.6)
+ELF-MIPS:   0x0000000C INIT                 0x528
+ELF-MIPS:   0x0000000D FINI                 0x860
+ELF-MIPS:   0x00000004 HASH                 0x210
+ELF-MIPS:   0x00000005 STRTAB               0x3D8
+ELF-MIPS:   0x00000006 SYMTAB               0x2A8
+ELF-MIPS:   0x0000000A STRSZ                231 (bytes)
+ELF-MIPS:   0x0000000B SYMENT               16 (bytes)
+ELF-MIPS:   0x00000003 PLTGOT               0x108E0
+ELF-MIPS:   0x00000011 REL                  0x518
+ELF-MIPS:   0x00000012 RELSZ                16 (bytes)
+ELF-MIPS:   0x00000013 RELENT               8 (bytes)
+ELF-MIPS:   0x70000001 MIPS_RLD_VERSION     1
+ELF-MIPS:   0x70000005 MIPS_FLAGS           0x2
+ELF-MIPS:   0x70000006 MIPS_BASE_ADDRESS    0x0
+ELF-MIPS:   0x7000000A MIPS_LOCAL_GOTNO     10
+ELF-MIPS:   0x70000011 MIPS_SYMTABNO        19
+ELF-MIPS:   0x70000012 MIPS_UNREFEXTNO      26
+ELF-MIPS:   0x70000013 MIPS_GOTSYM          0xD
+ELF-MIPS:   0x6FFFFFFE VERNEED              0x4E8
+ELF-MIPS:   0x6FFFFFFF VERNEEDNUM           1
+ELF-MIPS:   0x6FFFFFF0 VERSYM               0x4C0
+ELF-MIPS:   0x00000000 NULL                 0x0
+ELF-MIPS: ]
diff --git a/test/tools/llvm-readobj/file-headers.test b/test/tools/llvm-readobj/file-headers.test
index b900e36..b2b4547 100644
--- a/test/tools/llvm-readobj/file-headers.test
+++ b/test/tools/llvm-readobj/file-headers.test
@@ -8,6 +8,10 @@ RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-i386 \
 RUN:   | FileCheck %s -check-prefix ELF32
 RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-x86-64 \
 RUN:   | FileCheck %s -check-prefix ELF64
+RUN: llvm-readobj -h %p/Inputs/magic.coff-unknown \
+RUN:   | FileCheck %s -check-prefix COFF-UNKNOWN
+RUN: llvm-readobj -h %p/Inputs/magic.coff-importlib \
+RUN:   | FileCheck %s -check-prefix COFF-IMPORTLIB
 
 COFF32:      File: {{(.*[/\\])?}}trivial.obj.coff-i386
 COFF32-NEXT: Format: COFF-i386
@@ -183,3 +187,32 @@ PE32-NEXT:     ReservedRVA: 0x0
 PE32-NEXT:     ReservedSize: 0x0
 PE32-NEXT:   }
 PE32-NEXT: }
+
+COFF-UNKNOWN:      Format: COFF-<unknown arch>
+COFF-UNKNOWN-NEXT: Arch: unknown
+COFF-UNKNOWN-NEXT: AddressSize: 32bit
+COFF-UNKNOWN-NEXT: ImageFileHeader {
+COFF-UNKNOWN-NEXT:   Machine: IMAGE_FILE_MACHINE_UNKNOWN (0x0)
+COFF-UNKNOWN-NEXT:   SectionCount: 3
+COFF-UNKNOWN-NEXT:   TimeDateStamp: 2013-11-14 21:19:28 (0x52853E60)
+COFF-UNKNOWN-NEXT:   PointerToSymbolTable: 0xF8
+COFF-UNKNOWN-NEXT:   SymbolCount: 11
+COFF-UNKNOWN-NEXT:   OptionalHeaderSize: 0
+COFF-UNKNOWN-NEXT:   Characteristics [ (0x0)
+COFF-UNKNOWN-NEXT:   ]
+COFF-UNKNOWN-NEXT: }
+
+COFF-IMPORTLIB:      Format: COFF-<unknown arch>
+COFF-IMPORTLIB-NEXT: Arch: unknown
+COFF-IMPORTLIB-NEXT: AddressSize: 32bit
+COFF-IMPORTLIB-NEXT: ImageFileHeader {
+COFF-IMPORTLIB-NEXT:   Machine: IMAGE_FILE_MACHINE_UNKNOWN (0x0)
+COFF-IMPORTLIB-NEXT:   SectionCount: 65535
+COFF-IMPORTLIB-NEXT:   TimeDateStamp: 1970-09-09 19:52:32 (0x14C0000)
+COFF-IMPORTLIB-NEXT:   PointerToSymbolTable: 0x528542EB
+COFF-IMPORTLIB-NEXT:   SymbolCount: 20
+COFF-IMPORTLIB-NEXT:   OptionalHeaderSize: 0
+COFF-IMPORTLIB-NEXT:   Characteristics [ (0x8)
+COFF-IMPORTLIB-NEXT:     IMAGE_FILE_LOCAL_SYMS_STRIPPED (0x8)
+COFF-IMPORTLIB-NEXT:   ]
+COFF-IMPORTLIB-NEXT: }
diff --git a/test/tools/llvm-readobj/program-headers.test b/test/tools/llvm-readobj/program-headers.test
index 2a574bb..7c22f2b 100644
--- a/test/tools/llvm-readobj/program-headers.test
+++ b/test/tools/llvm-readobj/program-headers.test
@@ -2,6 +2,8 @@ RUN: llvm-readobj -program-headers %p/../../Object/Inputs/program-headers.elf-i3
 RUN:     | FileCheck %s -check-prefix ELF-I386
 RUN: llvm-readobj -program-headers %p/../../Object/Inputs/program-headers.elf-x86-64 \
 RUN:     | FileCheck %s -check-prefix ELF-X86-64
+RUN: llvm-readobj -program-headers %p/../../Object/Inputs/program-headers.mips \
+RUN:     | FileCheck %s -check-prefix ELF-MIPS
 
 ELF-I386:      ProgramHeaders [
 ELF-I386-NEXT:   ProgramHeader {
@@ -72,3 +74,31 @@ ELF-X86-64-NEXT:     ]
 ELF-X86-64-NEXT:     Alignment: 8
 ELF-X86-64-NEXT:   }
 ELF-X86-64-NEXT: ]
+
+ELF-MIPS:      ProgramHeaders [
+ELF-MIPS-NEXT:   ProgramHeader {
+ELF-MIPS-NEXT:     Type: PT_MIPS_REGINFO (0x70000000)
+ELF-MIPS-NEXT:     Offset: 0x74
+ELF-MIPS-NEXT:     VirtualAddress: 0x400074
+ELF-MIPS-NEXT:     PhysicalAddress: 0x400074
+ELF-MIPS-NEXT:     FileSize: 24
+ELF-MIPS-NEXT:     MemSize: 24
+ELF-MIPS-NEXT:     Flags [ (0x4)
+ELF-MIPS-NEXT:       PF_R (0x4)
+ELF-MIPS-NEXT:     ]
+ELF-MIPS-NEXT:     Alignment: 4
+ELF-MIPS-NEXT:   }
+ELF-MIPS-NEXT:   ProgramHeader {
+ELF-MIPS-NEXT:     Type: PT_LOAD (0x1)
+ELF-MIPS-NEXT:     Offset: 0x0
+ELF-MIPS-NEXT:     VirtualAddress: 0x400000
+ELF-MIPS-NEXT:     PhysicalAddress: 0x400000
+ELF-MIPS-NEXT:     FileSize: 160
+ELF-MIPS-NEXT:     MemSize: 160
+ELF-MIPS-NEXT:     Flags [ (0x5)
+ELF-MIPS-NEXT:       PF_R (0x4)
+ELF-MIPS-NEXT:       PF_X (0x1)
+ELF-MIPS-NEXT:     ]
+ELF-MIPS-NEXT:     Alignment: 65536
+ELF-MIPS-NEXT:   }
+ELF-MIPS-NEXT: ]
diff --git a/test/tools/llvm-readobj/rpath.test b/test/tools/llvm-readobj/rpath.test
new file mode 100644
index 0000000..600938e
--- /dev/null
+++ b/test/tools/llvm-readobj/rpath.test
@@ -0,0 +1,4 @@
+RUN: llvm-readobj -dynamic-table %p/Inputs/rpath.exe.elf-x86_64 \
+RUN:   | FileCheck %s
+
+CHECK: 0x000000000000000F RPATH /usr/local/lib
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index e663781..12e10fd 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -2,59 +2,66 @@
 # three small executables. This is done to minimize memory load in parallel
 # builds.  Please retain this ordering.
 
-if( NOT WIN32 OR MSYS OR CYGWIN )
-  # We currently require 'sed' to build llvm-config, so don't try to build it
-  # on pure Win32.
-  add_subdirectory(llvm-config)
-endif()
+add_llvm_tool_subdirectory(llvm-config)
 
-add_subdirectory(opt)
-add_subdirectory(llvm-as)
-add_subdirectory(llvm-dis)
-add_subdirectory(llvm-mc)
+add_llvm_tool_subdirectory(opt)
+add_llvm_tool_subdirectory(llvm-as)
+add_llvm_tool_subdirectory(llvm-dis)
+add_llvm_tool_subdirectory(llvm-mc)
 
-add_subdirectory(llc)
-add_subdirectory(llvm-ar)
-add_subdirectory(llvm-nm)
-add_subdirectory(llvm-size)
+add_llvm_tool_subdirectory(llc)
+add_llvm_tool_subdirectory(llvm-ar)
+add_llvm_tool_subdirectory(llvm-nm)
+add_llvm_tool_subdirectory(llvm-size)
 
-add_subdirectory(llvm-cov)
-add_subdirectory(llvm-prof)
-add_subdirectory(llvm-link)
-add_subdirectory(lli)
+add_llvm_tool_subdirectory(llvm-cov)
+add_llvm_tool_subdirectory(llvm-link)
+add_llvm_tool_subdirectory(lli)
 
-add_subdirectory(llvm-extract)
-add_subdirectory(llvm-diff)
-add_subdirectory(macho-dump)
-add_subdirectory(llvm-objdump)
-add_subdirectory(llvm-readobj)
-add_subdirectory(llvm-rtdyld)
-add_subdirectory(llvm-dwarfdump)
+add_llvm_tool_subdirectory(llvm-extract)
+add_llvm_tool_subdirectory(llvm-diff)
+add_llvm_tool_subdirectory(macho-dump)
+add_llvm_tool_subdirectory(llvm-objdump)
+add_llvm_tool_subdirectory(llvm-readobj)
+add_llvm_tool_subdirectory(llvm-rtdyld)
+add_llvm_tool_subdirectory(llvm-dwarfdump)
 if( LLVM_USE_INTEL_JITEVENTS )
-  add_subdirectory(llvm-jitlistener)
+  add_llvm_tool_subdirectory(llvm-jitlistener)
+else()
+  ignore_llvm_tool_subdirectory(llvm-jitlistener)
 endif( LLVM_USE_INTEL_JITEVENTS )
 
-add_subdirectory(bugpoint)
-add_subdirectory(bugpoint-passes)
-add_subdirectory(llvm-bcanalyzer)
-add_subdirectory(llvm-stress)
-add_subdirectory(llvm-mcmarkup)
+add_llvm_tool_subdirectory(bugpoint)
+add_llvm_tool_subdirectory(bugpoint-passes)
+add_llvm_tool_subdirectory(llvm-bcanalyzer)
+add_llvm_tool_subdirectory(llvm-stress)
+add_llvm_tool_subdirectory(llvm-mcmarkup)
+
+add_llvm_tool_subdirectory(llvm-symbolizer)
 
-add_subdirectory(llvm-symbolizer)
+add_llvm_tool_subdirectory(llvm-c-test)
 
-add_subdirectory(obj2yaml)
-add_subdirectory(yaml2obj)
+add_llvm_tool_subdirectory(obj2yaml)
+add_llvm_tool_subdirectory(yaml2obj)
 
-if( NOT WIN32 )
-  add_subdirectory(lto)
+if( NOT CYGWIN )
+  add_llvm_tool_subdirectory(lto)
+  add_llvm_tool_subdirectory(llvm-lto)
+else()
+  ignore_llvm_tool_subdirectory(lto)
+  ignore_llvm_tool_subdirectory(llvm-lto)
 endif()
 
 if( LLVM_ENABLE_PIC )
   # TODO: support other systems:
   if( (CMAKE_SYSTEM_NAME STREQUAL "Linux")
       OR (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") )
-    add_subdirectory(gold)
+    add_llvm_tool_subdirectory(gold)
+  else()
+    ignore_llvm_tool_subdirectory(gold)
   endif()
+else()
+  ignore_llvm_tool_subdirectory(gold)
 endif()
 
 add_llvm_external_project(clang)
@@ -63,6 +70,10 @@ if( NOT LLVM_INCLUDE_TOOLS STREQUAL "bootstrap-only" )
   add_llvm_external_project(lld)
   add_llvm_external_project(lldb)
   add_llvm_external_project(polly)
+
+  # Automatically add remaining sub-directories containing a 'CMakeLists.txt'
+  # file as external projects.
+  add_llvm_implicit_external_projects()
 endif()
 
 set(LLVM_COMMON_DEPENDS ${LLVM_COMMON_DEPENDS} PARENT_SCOPE)
diff --git a/tools/LLVMBuild.txt b/tools/LLVMBuild.txt
index 9ec89f3..93b8d98 100644
--- a/tools/LLVMBuild.txt
+++ b/tools/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = bugpoint llc lli llvm-ar llvm-as llvm-bcanalyzer llvm-cov llvm-diff llvm-dis llvm-dwarfdump llvm-extract llvm-jitlistener llvm-link llvm-mc llvm-nm llvm-objdump llvm-prof llvm-rtdyld llvm-size macho-dump opt llvm-mcmarkup
+subdirectories = bugpoint llc lli llvm-ar llvm-as llvm-bcanalyzer llvm-cov llvm-diff llvm-dis llvm-dwarfdump llvm-extract llvm-jitlistener llvm-link llvm-lto llvm-mc llvm-nm llvm-objdump llvm-rtdyld llvm-size macho-dump opt llvm-mcmarkup
 
 [component_0]
 type = Group
diff --git a/tools/Makefile b/tools/Makefile
index b7375c9..be87254 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -27,15 +27,11 @@ OPTIONAL_DIRS := lldb
 # large and three small executables. This is done to minimize memory load
 # in parallel builds.  Please retain this ordering.
 DIRS := llvm-config
-PARALLEL_DIRS := opt llvm-as llvm-dis \
-                 llc llvm-ar llvm-nm \
-                 llvm-prof llvm-link \
-                 lli llvm-extract llvm-mc \
-                 bugpoint llvm-bcanalyzer \
-                 llvm-diff macho-dump llvm-objdump llvm-readobj \
-	         llvm-rtdyld llvm-dwarfdump llvm-cov \
-	         llvm-size llvm-stress llvm-mcmarkup \
-	         llvm-symbolizer obj2yaml yaml2obj
+PARALLEL_DIRS := opt llvm-as llvm-dis llc llvm-ar llvm-nm llvm-link \
+                 lli llvm-extract llvm-mc bugpoint llvm-bcanalyzer llvm-diff \
+                 macho-dump llvm-objdump llvm-readobj llvm-rtdyld \
+                 llvm-dwarfdump llvm-cov llvm-size llvm-stress llvm-mcmarkup \
+                 llvm-symbolizer obj2yaml yaml2obj llvm-c-test
 
 # If Intel JIT Events support is configured, build an extra tool to test it.
 ifeq ($(USE_INTEL_JITEVENTS), 1)
@@ -54,11 +50,10 @@ endif
 ifndef ONLY_TOOLS
 ifeq ($(ENABLE_PIC),1)
   # gold only builds if binutils is around.  It requires "lto" to build before
-  # it so it is added to DIRS.
+  # it so it is added to DIRS. llvm-lto also requires lto
+  DIRS += lto llvm-lto
   ifdef BINUTILS_INCDIR
-    DIRS += lto gold
-  else
-    PARALLEL_DIRS += lto
+    DIRS += gold
   endif
 
   PARALLEL_DIRS += bugpoint-passes
diff --git a/tools/bugpoint/CrashDebugger.cpp b/tools/bugpoint/CrashDebugger.cpp
index c8b4f6f..b90fc61 100644
--- a/tools/bugpoint/CrashDebugger.cpp
+++ b/tools/bugpoint/CrashDebugger.cpp
@@ -195,10 +195,10 @@ namespace {
 }
 
 bool ReduceCrashingFunctions::TestFuncs(std::vector<Function*> &Funcs) {
-
-  //if main isn't present, claim there is no problem
-  if (KeepMain && find(Funcs.begin(), Funcs.end(),
-                       BD.getProgram()->getFunction("main")) == Funcs.end())
+  // If main isn't present, claim there is no problem.
+  if (KeepMain && std::find(Funcs.begin(), Funcs.end(),
+                            BD.getProgram()->getFunction("main")) ==
+                      Funcs.end())
     return false;
 
   // Clone the program to try hacking it apart...
diff --git a/tools/bugpoint/ExecutionDriver.cpp b/tools/bugpoint/ExecutionDriver.cpp
index 3d3dac3..c05c8d7 100644
--- a/tools/bugpoint/ExecutionDriver.cpp
+++ b/tools/bugpoint/ExecutionDriver.cpp
@@ -301,7 +301,6 @@ std::string BugDriver::executeProgram(const Module *Program,
   if (AI == 0) AI = Interpreter;
   assert(AI && "Interpreter should have been created already!");
   bool CreatedBitcode = false;
-  std::string ErrMsg;
   if (BitcodeFile.empty()) {
     // Emit the program to a bitcode file...
     SmallString<128> UniqueFilename;
diff --git a/tools/bugpoint/ToolRunner.cpp b/tools/bugpoint/ToolRunner.cpp
index 107d0db..254d997 100644
--- a/tools/bugpoint/ToolRunner.cpp
+++ b/tools/bugpoint/ToolRunner.cpp
@@ -662,7 +662,7 @@ static bool IsARMArchitecture(std::vector<const char*> Args) {
          I = Args.begin(), E = Args.end(); I != E; ++I) {
     if (StringRef(*I).equals_lower("-arch")) {
       ++I;
-      if (I != E && StringRef(*I).substr(0, strlen("arm")).equals_lower("arm"))
+      if (I != E && StringRef(*I).startswith_lower("arm"))
         return true;
     }
   }
diff --git a/tools/bugpoint/bugpoint.cpp b/tools/bugpoint/bugpoint.cpp
index 5e8fdd1..9bc592e 100644
--- a/tools/bugpoint/bugpoint.cpp
+++ b/tools/bugpoint/bugpoint.cpp
@@ -49,8 +49,8 @@ TimeoutValue("timeout", cl::init(300), cl::value_desc("seconds"),
 
 static cl::opt<int>
 MemoryLimit("mlimit", cl::init(-1), cl::value_desc("MBytes"),
-             cl::desc("Maximum amount of memory to use. 0 disables check."
-                      " Defaults to 100MB (800MB under valgrind)."));
+            cl::desc("Maximum amount of memory to use. 0 disables check."
+                     " Defaults to 300MB (800MB under valgrind)."));
 
 static cl::opt<bool>
 UseValgrind("enable-valgrind",
@@ -152,7 +152,7 @@ int main(int argc, char **argv) {
     if (sys::RunningOnValgrind() || UseValgrind)
       MemoryLimit = 800;
     else
-      MemoryLimit = 100;
+      MemoryLimit = 300;
   }
 
   BugDriver D(argv[0], FindBugs, TimeoutValue, MemoryLimit,
diff --git a/tools/gold/README.txt b/tools/gold/README.txt
index a906a90..286d9af 100644
--- a/tools/gold/README.txt
+++ b/tools/gold/README.txt
@@ -1,21 +1,13 @@
+The LLVM Gold LTO Plugin
+========================
+
 This directory contains a plugin that is designed to work with binutils
 gold linker. At present time, this is not the default linker in
 binutils, and the default build of gold does not support plugins.
 
-Obtaining binutils:
-
-  cvs -z 9 -d :pserver:anoncvs@sourceware.org:/cvs/src login
-  {enter "anoncvs" as the password}
-  cvs -z 9 -d :pserver:anoncvs@sourceware.org:/cvs/src co binutils
-
-This will create a src/ directory. Make a build/ directory and from
-there configure binutils with "../src/configure --enable-gold --enable-plugins".
-Then build binutils with "make all-gold".
+See docs/GoldPlugin.html for complete build and usage instructions.
 
-To build the LLVMgold plugin, configure LLVM with the option
---with-binutils-include=/path/to/binutils/src/include/ --enable-pic. To use the
-plugin, run "ld-new --plugin /path/to/LLVMgold.so".
-Without PIC libLTO and LLVMgold are not being built (because they would fail
-link on x86-64 with a relocation error: PIC and non-PIC can't be combined).
+NOTE: libLTO and LLVMgold aren't built without PIC because they would fail
+to link on x86-64 with a relocation error: PIC and non-PIC can't be combined.
 As an alternative to passing --enable-pic, you can use 'make ENABLE_PIC=1' in
 your entire LLVM build.
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index 7771709..0e3bad2 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Config/config.h" // plugin-api.h requires HAVE_STDINT_H
 #include "plugin-api.h"
 #include "llvm-c/lto.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/Support/Errno.h"
 #include "llvm/Support/FileSystem.h"
@@ -67,6 +68,7 @@ namespace {
   std::list<claimed_file> Modules;
   std::vector<std::string> Cleanup;
   lto_code_gen_t code_gen = NULL;
+  StringSet<> CannotBeHidden;
 }
 
 namespace options {
@@ -197,7 +199,7 @@ ld_plugin_status onload(ld_plugin_tv *tv) {
       case LDPT_ADD_SYMBOLS:
         add_symbols = tv->tv_u.tv_add_symbols;
         break;
-      case LDPT_GET_SYMBOLS:
+      case LDPT_GET_SYMBOLS_V2:
         get_symbols = tv->tv_u.tv_get_symbols;
         break;
       case LDPT_ADD_INPUT_FILE:
@@ -297,6 +299,9 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
     sym.version = NULL;
 
     int scope = attrs & LTO_SYMBOL_SCOPE_MASK;
+    bool CanBeHidden = scope == LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN;
+    if (!CanBeHidden)
+      CannotBeHidden.insert(sym.name);
     switch (scope) {
       case LTO_SYMBOL_SCOPE_HIDDEN:
         sym.visibility = LDPV_HIDDEN;
@@ -306,6 +311,7 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
         break;
       case 0: // extern
       case LTO_SYMBOL_SCOPE_DEFAULT:
+      case LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN:
         sym.visibility = LDPV_DEFAULT;
         break;
       default:
@@ -351,14 +357,27 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
     }
   }
 
-  if (code_gen)
-    lto_codegen_add_module(code_gen, M);
+  if (code_gen) {
+    if (lto_codegen_add_module(code_gen, M)) {
+      (*message)(LDPL_ERROR, "Error linking module: %s",
+                 lto_get_error_message());
+      return LDPS_ERR;
+    }
+  }
 
   lto_module_dispose(M);
 
   return LDPS_OK;
 }
 
+static bool mustPreserve(const claimed_file &F, int i) {
+  if (F.syms[i].resolution == LDPR_PREVAILING_DEF)
+    return true;
+  if (F.syms[i].resolution == LDPR_PREVAILING_DEF_IRONLY_EXP)
+    return CannotBeHidden.count(F.syms[i].name);
+  return false;
+}
+
 /// all_symbols_read_hook - gold informs us that all symbols have been read.
 /// At this point, we use get_symbols to see if any of our definitions have
 /// been overridden by a native object file. Then, perform optimization and
@@ -381,7 +400,7 @@ static ld_plugin_status all_symbols_read_hook(void) {
       continue;
     (*get_symbols)(I->handle, I->syms.size(), &I->syms[0]);
     for (unsigned i = 0, e = I->syms.size(); i != e; i++) {
-      if (I->syms[i].resolution == LDPR_PREVAILING_DEF) {
+      if (mustPreserve(*I, i)) {
         lto_codegen_add_must_preserve_symbol(code_gen, I->syms[i].name);
 
         if (options::generate_api_file)
@@ -417,12 +436,19 @@ static ld_plugin_status all_symbols_read_hook(void) {
     bool err = lto_codegen_write_merged_modules(code_gen, path.c_str());
     if (err)
       (*message)(LDPL_FATAL, "Failed to write the output file.");
-    if (options::generate_bc_file == options::BC_ONLY)
+    if (options::generate_bc_file == options::BC_ONLY) {
+      lto_codegen_dispose(code_gen);
       exit(0);
+    }
   }
-  const char *objPath;
-  if (lto_codegen_compile_to_file(code_gen, &objPath)) {
-    (*message)(LDPL_ERROR, "Could not produce a combined object file\n");
+
+  std::string ObjPath;
+  {
+    const char *Temp;
+    if (lto_codegen_compile_to_file(code_gen, &Temp)) {
+      (*message)(LDPL_ERROR, "Could not produce a combined object file\n");
+    }
+    ObjPath = Temp;
   }
 
   lto_codegen_dispose(code_gen);
@@ -434,9 +460,9 @@ static ld_plugin_status all_symbols_read_hook(void) {
     }
   }
 
-  if ((*add_input_file)(objPath) != LDPS_OK) {
+  if ((*add_input_file)(ObjPath.c_str()) != LDPS_OK) {
     (*message)(LDPL_ERROR, "Unable to add .o file to the link.");
-    (*message)(LDPL_ERROR, "File left behind in: %s", objPath);
+    (*message)(LDPL_ERROR, "File left behind in: %s", ObjPath.c_str());
     return LDPS_ERR;
   }
 
@@ -447,7 +473,7 @@ static ld_plugin_status all_symbols_read_hook(void) {
   }
 
   if (options::obj_path.empty())
-    Cleanup.push_back(objPath);
+    Cleanup.push_back(ObjPath);
 
   return LDPS_OK;
 }
diff --git a/tools/lli/CMakeLists.txt b/tools/lli/CMakeLists.txt
index 98f411d..5f8c7c9 100644
--- a/tools/lli/CMakeLists.txt
+++ b/tools/lli/CMakeLists.txt
@@ -1,6 +1,8 @@
 
 set(LLVM_LINK_COMPONENTS mcjit jit interpreter nativecodegen bitreader asmparser irreader selectiondag native instrumentation)
 
+add_subdirectory(ChildTarget)
+
 if( LLVM_USE_OPROFILE )
   set(LLVM_LINK_COMPONENTS
     ${LLVM_LINK_COMPONENTS}
@@ -19,6 +21,7 @@ endif( LLVM_USE_INTEL_JITEVENTS )
 
 add_llvm_tool(lli
   lli.cpp
-  RecordingMemoryManager.cpp
+  RemoteMemoryManager.cpp
   RemoteTarget.cpp
+  RemoteTargetExternal.cpp
   )
diff --git a/tools/lli/ChildTarget/CMakeLists.txt b/tools/lli/ChildTarget/CMakeLists.txt
new file mode 100644
index 0000000..fd1ac24
--- /dev/null
+++ b/tools/lli/ChildTarget/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_tool(lli-child-target
+  ChildTarget.cpp
+  )
diff --git a/tools/lli/ChildTarget/ChildTarget.cpp b/tools/lli/ChildTarget/ChildTarget.cpp
new file mode 100644
index 0000000..55fcae9
--- /dev/null
+++ b/tools/lli/ChildTarget/ChildTarget.cpp
@@ -0,0 +1,242 @@
+#include "llvm/Config/config.h"
+
+#include "../RemoteTargetMessage.h"
+#include <assert.h>
+#include <map>
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+class LLIChildTarget {
+public:
+  ~LLIChildTarget(); // OS-specific destructor
+  void initialize();
+  LLIMessageType waitForIncomingMessage();
+  void handleMessage(LLIMessageType messageType);
+
+private:
+  // Incoming message handlers
+  void handleAllocateSpace();
+  void handleLoadSection(bool IsCode);
+  void handleExecute();
+  void handleTerminate();
+
+  // Outgoing message handlers
+  void sendChildActive();
+  void sendAllocationResult(uint64_t Addr);
+  void sendLoadComplete();
+  void sendExecutionComplete(uint64_t Result);
+
+  // OS-specific functions
+  void initializeConnection();
+  int WriteBytes(const void *Data, size_t Size);
+  int ReadBytes(void *Data, size_t Size);
+  uint64_t allocate(uint32_t Alignment, uint32_t Size);
+  void makeSectionExecutable(uint64_t Addr, uint32_t Size);
+  void InvalidateInstructionCache(const void *Addr, size_t Len);
+  void releaseMemory(uint64_t Addr, uint32_t Size);
+
+  // Store a map of allocated buffers to sizes.
+  typedef std::map<uint64_t, uint32_t> AllocMapType;
+  AllocMapType m_AllocatedBufferMap;
+
+  // Communication handles (OS-specific)
+  void *ConnectionData;
+};
+
+int main() {
+  LLIChildTarget  ThisChild;
+  ThisChild.initialize();
+  LLIMessageType MsgType;
+  do {
+    MsgType = ThisChild.waitForIncomingMessage();
+    ThisChild.handleMessage(MsgType);
+  } while (MsgType != LLI_Terminate &&
+           MsgType != LLI_Error);
+  return 0;
+}
+
+// Public methods
+void LLIChildTarget::initialize() {
+  initializeConnection();
+  sendChildActive();
+}
+
+LLIMessageType LLIChildTarget::waitForIncomingMessage() {
+  int32_t MsgType = -1;
+  if (ReadBytes(&MsgType, 4) > 0)
+    return (LLIMessageType)MsgType;
+  return LLI_Error;
+}
+
+void LLIChildTarget::handleMessage(LLIMessageType messageType) {
+  switch (messageType) {
+    case LLI_AllocateSpace:
+      handleAllocateSpace();
+      break;
+    case LLI_LoadCodeSection:
+      handleLoadSection(true);
+      break;
+    case LLI_LoadDataSection:
+      handleLoadSection(false);
+      break;
+    case LLI_Execute:
+      handleExecute();
+      break;
+    case LLI_Terminate:
+      handleTerminate();
+      break;
+    default:
+      // FIXME: Handle error!
+      break;
+  }
+}
+
+// Incoming message handlers
+void LLIChildTarget::handleAllocateSpace() {
+  // Read and verify the message data size.
+  uint32_t DataSize;
+  int rc = ReadBytes(&DataSize, 4);
+  assert(rc == 4);
+  assert(DataSize == 8);
+
+  // Read the message arguments.
+  uint32_t Alignment;
+  uint32_t AllocSize;
+  rc = ReadBytes(&Alignment, 4);
+  assert(rc == 4);
+  rc = ReadBytes(&AllocSize, 4);
+  assert(rc == 4);
+
+  // Allocate the memory.
+  uint64_t Addr = allocate(Alignment, AllocSize);
+
+  // Send AllocationResult message.
+  sendAllocationResult(Addr);
+}
+
+void LLIChildTarget::handleLoadSection(bool IsCode) {
+  // Read the message data size.
+  uint32_t DataSize;
+  int rc = ReadBytes(&DataSize, 4);
+  assert(rc == 4);
+
+  // Read the target load address.
+  uint64_t Addr;
+  rc = ReadBytes(&Addr, 8);
+  assert(rc == 8);
+
+  size_t BufferSize = DataSize - 8;
+
+  // FIXME: Verify that this is in allocated space
+
+  // Read section data into previously allocated buffer
+  rc = ReadBytes((void*)Addr, DataSize - 8);
+  assert(rc == (int)(BufferSize));
+
+  // If IsCode, mark memory executable
+  if (IsCode)
+    makeSectionExecutable(Addr, BufferSize);
+
+  // Send MarkLoadComplete message.
+  sendLoadComplete();
+}
+
+void LLIChildTarget::handleExecute() {
+  // Read the message data size.
+  uint32_t DataSize;
+  int rc = ReadBytes(&DataSize, 4);
+  assert(rc == 4);
+  assert(DataSize == 8);
+
+  // Read the target address.
+  uint64_t Addr;
+  rc = ReadBytes(&Addr, 8);
+  assert(rc == 8);
+
+  // Call function
+  int Result;
+  int (*fn)(void) = (int(*)(void))Addr;
+  Result = fn();
+
+  // Send ExecutionResult message.
+  sendExecutionComplete((int64_t)Result);
+}
+
+void LLIChildTarget::handleTerminate() {
+  // Release all allocated memory
+  AllocMapType::iterator Begin = m_AllocatedBufferMap.begin();
+  AllocMapType::iterator End = m_AllocatedBufferMap.end();
+  for (AllocMapType::iterator It = Begin; It != End; ++It) {
+    releaseMemory(It->first, It->second);
+  }
+  m_AllocatedBufferMap.clear();
+}
+
+// Outgoing message handlers
+void LLIChildTarget::sendChildActive() {
+  // Write the message type.
+  uint32_t MsgType = (uint32_t)LLI_ChildActive;
+  int rc = WriteBytes(&MsgType, 4);
+  assert(rc == 4);
+
+  // Write the data size.
+  uint32_t DataSize = 0;
+  rc = WriteBytes(&DataSize, 4);
+  assert(rc == 4);
+}
+
+void LLIChildTarget::sendAllocationResult(uint64_t Addr) {
+  // Write the message type.
+  uint32_t MsgType = (uint32_t)LLI_AllocationResult;
+  int rc = WriteBytes(&MsgType, 4);
+  assert(rc == 4);
+
+  // Write the data size.
+  uint32_t DataSize = 8;
+  rc = WriteBytes(&DataSize, 4);
+  assert(rc == 4);
+
+  // Write the allocated address.
+  rc = WriteBytes(&Addr, 8);
+  assert(rc == 8);
+}
+
+void LLIChildTarget::sendLoadComplete() {
+  // Write the message type.
+  uint32_t MsgType = (uint32_t)LLI_LoadComplete;
+  int rc = WriteBytes(&MsgType, 4);
+  assert(rc == 4);
+
+  // Write the data size.
+  uint32_t DataSize = 0;
+  rc = WriteBytes(&DataSize, 4);
+  assert(rc == 4);
+}
+
+void LLIChildTarget::sendExecutionComplete(uint64_t Result) {
+  // Write the message type.
+  uint32_t MsgType = (uint32_t)LLI_ExecutionResult;
+  int rc = WriteBytes(&MsgType, 4);
+  assert(rc == 4);
+
+
+  // Write the data size.
+  uint32_t DataSize = 8;
+  rc = WriteBytes(&DataSize, 4);
+  assert(rc == 4);
+
+  // Write the result.
+  rc = WriteBytes(&Result, 8);
+  assert(rc == 8);
+}
+
+#ifdef LLVM_ON_UNIX
+#include "Unix/ChildTarget.inc"
+#endif
+
+#ifdef LLVM_ON_WIN32
+#include "Windows/ChildTarget.inc"
+#endif
diff --git a/tools/lli/ChildTarget/LLVMBuild.txt b/tools/lli/ChildTarget/LLVMBuild.txt
new file mode 100644
index 0000000..daf6df1
--- /dev/null
+++ b/tools/lli/ChildTarget/LLVMBuild.txt
@@ -0,0 +1,21 @@
+;===- ./tools/lli/ChildTarget/LLVMBuild.txt --------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = lli-child-target
+parent = lli
diff --git a/tools/lli/ChildTarget/Makefile b/tools/lli/ChildTarget/Makefile
new file mode 100644
index 0000000..f77b144
--- /dev/null
+++ b/tools/lli/ChildTarget/Makefile
@@ -0,0 +1,17 @@
+##===- tools/lli/Makefile ------------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../../..
+TOOLNAME := lli-child-target
+
+include $(LEVEL)/Makefile.config
+
+LINK_COMPONENTS :=
+
+include $(LLVM_SRC_ROOT)/Makefile.rules
diff --git a/tools/lli/ChildTarget/Unix/ChildTarget.inc b/tools/lli/ChildTarget/Unix/ChildTarget.inc
new file mode 100644
index 0000000..cc95810
--- /dev/null
+++ b/tools/lli/ChildTarget/Unix/ChildTarget.inc
@@ -0,0 +1,166 @@
+//===- ChildTarget.inc - Child process for external JIT execution for Unix -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the Unix-specific parts of the ChildTarget class
+// which executes JITed code in a separate process from where it was built.
+//
+//===----------------------------------------------------------------------===//
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifdef __APPLE__
+#include <mach/mach.h>
+#endif
+
+#if defined(__mips__)
+#  if defined(__OpenBSD__)
+#    include <mips64/sysarch.h>
+#  else
+#    include <sys/cachectl.h>
+#  endif
+#endif
+
+#ifdef __APPLE__
+extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+#else
+extern "C" void __clear_cache(void *, void*);
+#endif
+
+namespace {
+
+struct ConnectionData_t {
+  int InputPipe;
+  int OutputPipe;
+
+  ConnectionData_t(int in, int out) : InputPipe(in), OutputPipe(out) {}
+};
+
+} // namespace
+
+LLIChildTarget::~LLIChildTarget() {
+  delete static_cast<ConnectionData_t *>(ConnectionData);
+}
+
+// OS-specific methods
+void LLIChildTarget::initializeConnection() {
+  // Store the parent ends of the pipes
+  ConnectionData = (void*)new ConnectionData_t(STDIN_FILENO, STDOUT_FILENO);
+}
+
+int LLIChildTarget::WriteBytes(const void *Data, size_t Size) {
+  return write(((ConnectionData_t*)ConnectionData)->OutputPipe, Data, Size);
+}
+
+int LLIChildTarget::ReadBytes(void *Data, size_t Size) {
+  return read(((ConnectionData_t*)ConnectionData)->InputPipe, Data, Size);
+}
+
+// The functions below duplicate functionality that is implemented in
+// Support/Memory.cpp with the goal of avoiding a dependency on any
+// llvm libraries.
+
+uint64_t LLIChildTarget::allocate(uint32_t Alignment, uint32_t Size) {
+  if (!Alignment)
+    Alignment = 16;
+
+  static const size_t PageSize = getpagesize();
+  const size_t NumPages = (Size+PageSize-1)/PageSize;
+  Size = NumPages*PageSize;
+
+  int fd = -1;
+#ifdef NEED_DEV_ZERO_FOR_MMAP
+  static int zero_fd = open("/dev/zero", O_RDWR);
+  if (zero_fd == -1)
+    return 0;
+  fd = zero_fd;
+#endif
+
+  int MMFlags = MAP_PRIVATE |
+#ifdef HAVE_MMAP_ANONYMOUS
+  MAP_ANONYMOUS
+#else
+  MAP_ANON
+#endif
+  ; // Ends statement above
+
+  uint64_t Addr = (uint64_t)::mmap(0, Size, PROT_READ | PROT_WRITE, MMFlags, fd, 0);
+  if (Addr == (uint64_t)MAP_FAILED)
+    return 0;
+
+  // Align the address.
+  Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1);
+
+  m_AllocatedBufferMap[Addr] = Size;
+
+  // Return aligned address
+  return Addr;
+}
+
+void LLIChildTarget::makeSectionExecutable(uint64_t Addr, uint32_t Size) {
+  // FIXME: We have to mark the memory as RWX because multiple code chunks may
+  // be on the same page.  The RemoteTarget interface should be changed to
+  // work around that.
+  int Result = ::mprotect((void*)Addr, Size, PROT_READ | PROT_WRITE | PROT_EXEC);
+  if (Result != 0)
+    InvalidateInstructionCache((const void *)Addr, Size);
+}
+
+/// InvalidateInstructionCache - Before the JIT can run a block of code
+/// that has been emitted it must invalidate the instruction cache on some
+/// platforms.
+void LLIChildTarget::InvalidateInstructionCache(const void *Addr,
+                                        size_t Len) {
+
+// icache invalidation for PPC and ARM.
+#if defined(__APPLE__)
+
+#  if (defined(__POWERPC__) || defined (__ppc__) || \
+     defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__)
+  sys_icache_invalidate(const_cast<void *>(Addr), Len);
+#  endif
+
+#else
+
+#  if (defined(__POWERPC__) || defined (__ppc__) || \
+       defined(_POWER) || defined(_ARCH_PPC)) && defined(__GNUC__)
+  const size_t LineSize = 32;
+
+  const intptr_t Mask = ~(LineSize - 1);
+  const intptr_t StartLine = ((intptr_t) Addr) & Mask;
+  const intptr_t EndLine = ((intptr_t) Addr + Len + LineSize - 1) & Mask;
+
+  for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+    asm volatile("dcbf 0, %0" : : "r"(Line));
+  asm volatile("sync");
+
+  for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+    asm volatile("icbi 0, %0" : : "r"(Line));
+  asm volatile("isync");
+#  elif defined(__arm__) && defined(__GNUC__)
+  // FIXME: Can we safely always call this for __GNUC__ everywhere?
+  const char *Start = static_cast<const char *>(Addr);
+  const char *End = Start + Len;
+  __clear_cache(const_cast<char *>(Start), const_cast<char *>(End));
+#  elif defined(__mips__)
+  const char *Start = static_cast<const char *>(Addr);
+  cacheflush(const_cast<char *>(Start), Len, BCACHE);
+#  endif
+
+#endif  // end apple
+}
+
+void LLIChildTarget::releaseMemory(uint64_t Addr, uint32_t Size) {
+  ::munmap((void*)Addr, Size);
+}
diff --git a/tools/lli/ChildTarget/Windows/ChildTarget.inc b/tools/lli/ChildTarget/Windows/ChildTarget.inc
new file mode 100644
index 0000000..45db2b0
--- /dev/null
+++ b/tools/lli/ChildTarget/Windows/ChildTarget.inc
@@ -0,0 +1,44 @@
+//=- ChildTarget.inc - Child process for external JIT execution for Windows -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Non-implementation of the Windows-specific parts of the ChildTarget class
+// which executes JITed code in a separate process from where it was built.
+//
+//===----------------------------------------------------------------------===//
+
+LLIChildTarget::~LLIChildTarget() {
+}
+
+// The RemoteTargetExternal implementation should prevent us from ever getting
+// here on Windows, but nothing prevents a user from running this directly.
+void LLIChildTarget::initializeConnection() {
+  assert(0 && "lli-child-target is not implemented for Windows");
+}
+
+int LLIChildTarget::WriteBytes(const void *Data, size_t Size) {
+  return 0;
+}
+
+int LLIChildTarget::ReadBytes(void *Data, size_t Size) {
+  return 0;
+}
+
+uint64_t LLIChildTarget::allocate(uint32_t Alignment, uint32_t Size) {
+  return 0;
+}
+
+void LLIChildTarget::makeSectionExecutable(uint64_t Addr, uint32_t Size) {
+}
+
+void LLIChildTarget::InvalidateInstructionCache(const void *Addr,
+                                        size_t Len) {
+}
+
+void LLIChildTarget::releaseMemory(uint64_t Addr, uint32_t Size) {
+}
diff --git a/tools/lli/LLVMBuild.txt b/tools/lli/LLVMBuild.txt
index c96a9e8..aab2a20 100644
--- a/tools/lli/LLVMBuild.txt
+++ b/tools/lli/LLVMBuild.txt
@@ -15,6 +15,9 @@
 ;
 ;===------------------------------------------------------------------------===;
 
+[common]
+subdirectories = ChildTarget
+
 [component_0]
 type = Tool
 name = lli
diff --git a/tools/lli/Makefile b/tools/lli/Makefile
index 7a40427..eca5d83 100644
--- a/tools/lli/Makefile
+++ b/tools/lli/Makefile
@@ -10,6 +10,8 @@
 LEVEL := ../..
 TOOLNAME := lli
 
+PARALLEL_DIRS := ChildTarget
+
 include $(LEVEL)/Makefile.config
 
 LINK_COMPONENTS := mcjit jit instrumentation interpreter nativecodegen bitreader asmparser irreader selectiondag native
diff --git a/tools/lli/RecordingMemoryManager.cpp b/tools/lli/RecordingMemoryManager.cpp
deleted file mode 100644
index 1fa8176..0000000
--- a/tools/lli/RecordingMemoryManager.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//===- RecordingMemoryManager.cpp - Recording memory manager --------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This memory manager allocates local storage and keeps a record of each
-// allocation. Iterators are provided for all data and code allocations.
-//
-//===----------------------------------------------------------------------===//
-
-#include "RecordingMemoryManager.h"
-using namespace llvm;
-
-RecordingMemoryManager::~RecordingMemoryManager() {
-  for (SmallVectorImpl<Allocation>::iterator
-         I = AllocatedCodeMem.begin(), E = AllocatedCodeMem.end();
-       I != E; ++I)
-    sys::Memory::releaseMappedMemory(I->first);
-  for (SmallVectorImpl<Allocation>::iterator
-         I = AllocatedDataMem.begin(), E = AllocatedDataMem.end();
-       I != E; ++I)
-    sys::Memory::releaseMappedMemory(I->first);
-}
-
-uint8_t *RecordingMemoryManager::
-allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) {
-  // The recording memory manager is just a local copy of the remote target.
-  // The alignment requirement is just stored here for later use. Regular
-  // heap storage is sufficient here, but we're using mapped memory to work
-  // around a bug in MCJIT.
-  sys::MemoryBlock Block = allocateSection(Size);
-  AllocatedCodeMem.push_back(Allocation(Block, Alignment));
-  return (uint8_t*)Block.base();
-}
-
-uint8_t *RecordingMemoryManager::
-allocateDataSection(uintptr_t Size, unsigned Alignment,
-                    unsigned SectionID, bool IsReadOnly) {
-  // The recording memory manager is just a local copy of the remote target.
-  // The alignment requirement is just stored here for later use. Regular
-  // heap storage is sufficient here, but we're using mapped memory to work
-  // around a bug in MCJIT.
-  sys::MemoryBlock Block = allocateSection(Size);
-  AllocatedDataMem.push_back(Allocation(Block, Alignment));
-  return (uint8_t*)Block.base();
-}
-
-sys::MemoryBlock RecordingMemoryManager::allocateSection(uintptr_t Size) {
-  error_code ec;
-  sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(Size,
-                                                          &Near,
-                                                          sys::Memory::MF_READ |
-                                                          sys::Memory::MF_WRITE,
-                                                          ec);
-  assert(!ec && MB.base());
-
-  // FIXME: This is part of a work around to keep sections near one another
-  // when MCJIT performs relocations after code emission but before
-  // the generated code is moved to the remote target.
-  // Save this address as the basis for our next request
-  Near = MB;
-  return MB;
-}
-
-void RecordingMemoryManager::setMemoryWritable() { llvm_unreachable("Unexpected!"); }
-void RecordingMemoryManager::setMemoryExecutable() { llvm_unreachable("Unexpected!"); }
-void RecordingMemoryManager::setPoisonMemory(bool poison) { llvm_unreachable("Unexpected!"); }
-void RecordingMemoryManager::AllocateGOT() { llvm_unreachable("Unexpected!"); }
-uint8_t *RecordingMemoryManager::getGOTBase() const {
-  llvm_unreachable("Unexpected!");
-  return 0;
-}
-uint8_t *RecordingMemoryManager::startFunctionBody(const Function *F, uintptr_t &ActualSize){
-  llvm_unreachable("Unexpected!");
-  return 0;
-}
-uint8_t *RecordingMemoryManager::allocateStub(const GlobalValue* F, unsigned StubSize,
-                                              unsigned Alignment) {
-  llvm_unreachable("Unexpected!");
-  return 0;
-}
-void RecordingMemoryManager::endFunctionBody(const Function *F, uint8_t *FunctionStart,
-                                             uint8_t *FunctionEnd) {
-  llvm_unreachable("Unexpected!");
-}
-uint8_t *RecordingMemoryManager::allocateSpace(intptr_t Size, unsigned Alignment) {
-  llvm_unreachable("Unexpected!");
-  return 0;
-}
-uint8_t *RecordingMemoryManager::allocateGlobal(uintptr_t Size, unsigned Alignment) {
-  llvm_unreachable("Unexpected!");
-  return 0;
-}
-void RecordingMemoryManager::deallocateFunctionBody(void *Body) {
-  llvm_unreachable("Unexpected!");
-}
-
-static int jit_noop() {
-  return 0;
-}
-
-void *RecordingMemoryManager::getPointerToNamedFunction(const std::string &Name,
-                                                        bool AbortOnFailure) {
-  // We should not invoke parent's ctors/dtors from generated main()!
-  // On Mingw and Cygwin, the symbol __main is resolved to
-  // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors
-  // (and register wrong callee's dtors with atexit(3)).
-  // We expect ExecutionEngine::runStaticConstructorsDestructors()
-  // is called before ExecutionEngine::runFunctionAsMain() is called.
-  if (Name == "__main") return (void*)(intptr_t)&jit_noop;
-
-  return NULL;
-}
diff --git a/tools/lli/RecordingMemoryManager.h b/tools/lli/RecordingMemoryManager.h
deleted file mode 100644
index b2919c3..0000000
--- a/tools/lli/RecordingMemoryManager.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//===- RecordingMemoryManager.h - LLI MCJIT recording memory manager ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This memory manager allocates local storage and keeps a record of each
-// allocation. Iterators are provided for all data and code allocations.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef RECORDINGMEMORYMANAGER_H
-#define RECORDINGMEMORYMANAGER_H
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Memory.h"
-#include <utility>
-
-namespace llvm {
-
-class RecordingMemoryManager : public JITMemoryManager {
-public:
-  typedef std::pair<sys::MemoryBlock, unsigned> Allocation;
-
-private:
-  SmallVector<Allocation, 16> AllocatedDataMem;
-  SmallVector<Allocation, 16> AllocatedCodeMem;
-
-  // FIXME: This is part of a work around to keep sections near one another
-  // when MCJIT performs relocations after code emission but before
-  // the generated code is moved to the remote target.
-  sys::MemoryBlock Near;
-  sys::MemoryBlock allocateSection(uintptr_t Size);
-
-public:
-  RecordingMemoryManager() {}
-  virtual ~RecordingMemoryManager();
-
-  typedef SmallVectorImpl<Allocation>::const_iterator const_data_iterator;
-  typedef SmallVectorImpl<Allocation>::const_iterator const_code_iterator;
-
-  const_data_iterator data_begin() const { return AllocatedDataMem.begin(); }
-  const_data_iterator   data_end() const { return AllocatedDataMem.end(); }
-  const_code_iterator code_begin() const { return AllocatedCodeMem.begin(); }
-  const_code_iterator   code_end() const { return AllocatedCodeMem.end(); }
-
-  uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
-                                       unsigned SectionID);
-
-  uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                       unsigned SectionID, bool IsReadOnly);
-
-  void *getPointerToNamedFunction(const std::string &Name,
-                                  bool AbortOnFailure = true);
-
-  bool finalizeMemory(std::string *ErrMsg) { return false; }
-
-  // The following obsolete JITMemoryManager calls are stubbed out for
-  // this model.
-  void setMemoryWritable();
-  void setMemoryExecutable();
-  void setPoisonMemory(bool poison);
-  void AllocateGOT();
-  uint8_t *getGOTBase() const;
-  uint8_t *startFunctionBody(const Function *F, uintptr_t &ActualSize);
-  uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
-                        unsigned Alignment);
-  void endFunctionBody(const Function *F, uint8_t *FunctionStart,
-                       uint8_t *FunctionEnd);
-  uint8_t *allocateSpace(intptr_t Size, unsigned Alignment);
-  uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment);
-  void deallocateFunctionBody(void *Body);
-};
-
-} // end namespace llvm
-
-#endif
diff --git a/tools/lli/RemoteMemoryManager.cpp b/tools/lli/RemoteMemoryManager.cpp
new file mode 100644
index 0000000..04fc40e
--- /dev/null
+++ b/tools/lli/RemoteMemoryManager.cpp
@@ -0,0 +1,206 @@
+//===---- RemoteMemoryManager.cpp - Recording memory manager --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This memory manager allocates local storage and keeps a record of each
+// allocation. Iterators are provided for all data and code allocations.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lli"
+#include "RemoteMemoryManager.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+RemoteMemoryManager::~RemoteMemoryManager() {
+  for (SmallVector<Allocation, 2>::iterator
+         I = AllocatedSections.begin(), E = AllocatedSections.end();
+       I != E; ++I)
+    sys::Memory::releaseMappedMemory(I->MB);
+}
+
+uint8_t *RemoteMemoryManager::
+allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID,
+                    StringRef SectionName) {
+  // The recording memory manager is just a local copy of the remote target.
+  // The alignment requirement is just stored here for later use. Regular
+  // heap storage is sufficient here, but we're using mapped memory to work
+  // around a bug in MCJIT.
+  sys::MemoryBlock Block = allocateSection(Size);
+  // AllocatedSections will own this memory.
+  AllocatedSections.push_back( Allocation(Block, Alignment, true) );
+  // UnmappedSections has the same information but does not own the memory.
+  UnmappedSections.push_back( Allocation(Block, Alignment, true) );
+  return (uint8_t*)Block.base();
+}
+
+uint8_t *RemoteMemoryManager::
+allocateDataSection(uintptr_t Size, unsigned Alignment,
+                    unsigned SectionID, StringRef SectionName,
+                    bool IsReadOnly) {
+  // The recording memory manager is just a local copy of the remote target.
+  // The alignment requirement is just stored here for later use. Regular
+  // heap storage is sufficient here, but we're using mapped memory to work
+  // around a bug in MCJIT.
+  sys::MemoryBlock Block = allocateSection(Size);
+  // AllocatedSections will own this memory.
+  AllocatedSections.push_back( Allocation(Block, Alignment, false) );
+  // UnmappedSections has the same information but does not own the memory.
+  UnmappedSections.push_back( Allocation(Block, Alignment, false) );
+  return (uint8_t*)Block.base();
+}
+
+sys::MemoryBlock RemoteMemoryManager::allocateSection(uintptr_t Size) {
+  error_code ec;
+  sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(Size,
+                                                          &Near,
+                                                          sys::Memory::MF_READ |
+                                                          sys::Memory::MF_WRITE,
+                                                          ec);
+  assert(!ec && MB.base());
+
+  // FIXME: This is part of a work around to keep sections near one another
+  // when MCJIT performs relocations after code emission but before
+  // the generated code is moved to the remote target.
+  // Save this address as the basis for our next request
+  Near = MB;
+  return MB;
+}
+
+void RemoteMemoryManager::notifyObjectLoaded(ExecutionEngine *EE,
+                                                const ObjectImage *Obj) {
+  // The client should have called setRemoteTarget() before triggering any
+  // code generation.
+  assert(Target);
+  if (!Target)
+    return;
+
+  // FIXME: Make this function thread safe.
+
+  // Lay out our sections in order, with all the code sections first, then
+  // all the data sections.
+  uint64_t CurOffset = 0;
+  unsigned MaxAlign = Target->getPageAlignment();
+  SmallVector<std::pair<Allocation, uint64_t>, 16> Offsets;
+  unsigned NumSections = UnmappedSections.size();
+  // We're going to go through the list twice to separate code and data, but
+  // it's a very small list, so that's OK.
+  for (size_t i = 0, e = NumSections; i != e; ++i) {
+    Allocation &Section = UnmappedSections[i];
+    if (Section.IsCode) {
+      unsigned Size = Section.MB.size();
+      unsigned Align = Section.Alignment;
+      DEBUG(dbgs() << "code region: size " << Size
+                  << ", alignment " << Align << "\n");
+      // Align the current offset up to whatever is needed for the next
+      // section.
+      CurOffset = (CurOffset + Align - 1) / Align * Align;
+      // Save off the address of the new section and allocate its space.
+      Offsets.push_back(std::pair<Allocation,uint64_t>(Section, CurOffset));
+      CurOffset += Size;
+    }
+  }
+  // Adjust to keep code and data aligned on seperate pages.
+  CurOffset = (CurOffset + MaxAlign - 1) / MaxAlign * MaxAlign;
+  for (size_t i = 0, e = NumSections; i != e; ++i) {
+    Allocation &Section = UnmappedSections[i];
+    if (!Section.IsCode) {
+      unsigned Size = Section.MB.size();
+      unsigned Align = Section.Alignment;
+      DEBUG(dbgs() << "data region: size " << Size
+                  << ", alignment " << Align << "\n");
+      // Align the current offset up to whatever is needed for the next
+      // section.
+      CurOffset = (CurOffset + Align - 1) / Align * Align;
+      // Save off the address of the new section and allocate its space.
+      Offsets.push_back(std::pair<Allocation,uint64_t>(Section, CurOffset));
+      CurOffset += Size;
+    }
+  }
+
+  // Allocate space in the remote target.
+  uint64_t RemoteAddr;
+  if (Target->allocateSpace(CurOffset, MaxAlign, RemoteAddr))
+    report_fatal_error(Target->getErrorMsg());
+
+  // Map the section addresses so relocations will get updated in the local
+  // copies of the sections.
+  for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
+    uint64_t Addr = RemoteAddr + Offsets[i].second;
+    EE->mapSectionAddress(const_cast<void*>(Offsets[i].first.MB.base()), Addr);
+
+    DEBUG(dbgs() << "  Mapping local: " << Offsets[i].first.MB.base()
+                 << " to remote: 0x" << format("%llx", Addr) << "\n");
+
+    MappedSections[Addr] = Offsets[i].first;
+  }
+
+  UnmappedSections.clear();
+}
+
+bool RemoteMemoryManager::finalizeMemory(std::string *ErrMsg) {
+  // FIXME: Make this function thread safe.
+  for (DenseMap<uint64_t, Allocation>::iterator
+         I = MappedSections.begin(), E = MappedSections.end();
+       I != E; ++I) {
+    uint64_t RemoteAddr = I->first;
+    const Allocation &Section = I->second;
+    if (Section.IsCode) {
+      Target->loadCode(RemoteAddr, Section.MB.base(), Section.MB.size());
+
+      DEBUG(dbgs() << "  loading code: " << Section.MB.base()
+            << " to remote: 0x" << format("%llx", RemoteAddr) << "\n");
+    } else {
+      Target->loadData(RemoteAddr, Section.MB.base(), Section.MB.size());
+
+      DEBUG(dbgs() << "  loading data: " << Section.MB.base()
+            << " to remote: 0x" << format("%llx", RemoteAddr) << "\n");
+    }
+  }
+
+  MappedSections.clear();
+
+  return false;
+}
+
+void RemoteMemoryManager::setMemoryWritable() { llvm_unreachable("Unexpected!"); }
+void RemoteMemoryManager::setMemoryExecutable() { llvm_unreachable("Unexpected!"); }
+void RemoteMemoryManager::setPoisonMemory(bool poison) { llvm_unreachable("Unexpected!"); }
+void RemoteMemoryManager::AllocateGOT() { llvm_unreachable("Unexpected!"); }
+uint8_t *RemoteMemoryManager::getGOTBase() const {
+  llvm_unreachable("Unexpected!");
+  return 0;
+}
+uint8_t *RemoteMemoryManager::startFunctionBody(const Function *F, uintptr_t &ActualSize){
+  llvm_unreachable("Unexpected!");
+  return 0;
+}
+uint8_t *RemoteMemoryManager::allocateStub(const GlobalValue* F, unsigned StubSize,
+                                              unsigned Alignment) {
+  llvm_unreachable("Unexpected!");
+  return 0;
+}
+void RemoteMemoryManager::endFunctionBody(const Function *F, uint8_t *FunctionStart,
+                                             uint8_t *FunctionEnd) {
+  llvm_unreachable("Unexpected!");
+}
+uint8_t *RemoteMemoryManager::allocateSpace(intptr_t Size, unsigned Alignment) {
+  llvm_unreachable("Unexpected!");
+  return 0;
+}
+uint8_t *RemoteMemoryManager::allocateGlobal(uintptr_t Size, unsigned Alignment) {
+  llvm_unreachable("Unexpected!");
+  return 0;
+}
+void RemoteMemoryManager::deallocateFunctionBody(void *Body) {
+  llvm_unreachable("Unexpected!");
+}
diff --git a/tools/lli/RemoteMemoryManager.h b/tools/lli/RemoteMemoryManager.h
new file mode 100644
index 0000000..5d0456f
--- /dev/null
+++ b/tools/lli/RemoteMemoryManager.h
@@ -0,0 +1,114 @@
+//===- RemoteMemoryManager.h - LLI MCJIT recording memory manager ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This memory manager allocates local storage and keeps a record of each
+// allocation. Iterators are provided for all data and code allocations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef REMOTEMEMORYMANAGER_H
+#define REMOTEMEMORYMANAGER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include <utility>
+
+#include "RemoteTarget.h"
+
+namespace llvm {
+
+class RemoteMemoryManager : public JITMemoryManager {
+public:
+  // Notice that this structure takes ownership of the memory allocated.
+  struct Allocation {
+    Allocation() {}
+    Allocation(sys::MemoryBlock mb, unsigned a, bool code)
+      : MB(mb), Alignment(a), IsCode(code) {}
+
+    sys::MemoryBlock  MB;
+    unsigned          Alignment;
+    bool              IsCode;
+  };
+
+private:
+  // This vector contains Allocation objects for all sections which we have
+  // allocated.  This vector effectively owns the memory associated with the
+  // allocations.
+  SmallVector<Allocation, 2>  AllocatedSections;
+
+  // This vector contains pointers to Allocation objects for any sections we
+  // have allocated locally but have not yet remapped for the remote target.
+  // When we receive notification of a completed module load, we will map
+  // these sections into the remote target.
+  SmallVector<Allocation, 2>  UnmappedSections;
+
+  // This map tracks the sections we have remapped for the remote target
+  // but have not yet copied to the target.
+  DenseMap<uint64_t, Allocation>  MappedSections;
+
+  // FIXME: This is part of a work around to keep sections near one another
+  // when MCJIT performs relocations after code emission but before
+  // the generated code is moved to the remote target.
+  sys::MemoryBlock Near;
+  sys::MemoryBlock allocateSection(uintptr_t Size);
+
+  RemoteTarget *Target;
+
+public:
+  RemoteMemoryManager() : Target(NULL) {}
+  virtual ~RemoteMemoryManager();
+
+  uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+                               unsigned SectionID, StringRef SectionName);
+
+  uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                               unsigned SectionID, StringRef SectionName,
+                               bool IsReadOnly);
+
+  // For now, remote symbol resolution is not support in lli.  The MCJIT
+  // interface does support this, but clients must provide their own
+  // mechanism for finding remote symbol addresses.  MCJIT will resolve
+  // symbols from Modules it contains.
+  uint64_t getSymbolAddress(const std::string &Name) { return 0; }
+
+  void notifyObjectLoaded(ExecutionEngine *EE, const ObjectImage *Obj);
+
+  bool finalizeMemory(std::string *ErrMsg);
+
+  // For now, remote EH frame registration isn't supported.  Remote symbol
+  // resolution is a prerequisite to supporting remote EH frame registration.
+  void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {}
+  void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {}
+
+  // This is a non-interface function used by lli
+  void setRemoteTarget(RemoteTarget *T) { Target = T; }
+
+  // The following obsolete JITMemoryManager calls are stubbed out for
+  // this model.
+  void setMemoryWritable();
+  void setMemoryExecutable();
+  void setPoisonMemory(bool poison);
+  void AllocateGOT();
+  uint8_t *getGOTBase() const;
+  uint8_t *startFunctionBody(const Function *F, uintptr_t &ActualSize);
+  uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
+                        unsigned Alignment);
+  void endFunctionBody(const Function *F, uint8_t *FunctionStart,
+                       uint8_t *FunctionEnd);
+  uint8_t *allocateSpace(intptr_t Size, unsigned Alignment);
+  uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment);
+  void deallocateFunctionBody(void *Body);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/tools/lli/RemoteTarget.cpp b/tools/lli/RemoteTarget.cpp
index 212bdfd..5c74e6e 100644
--- a/tools/lli/RemoteTarget.cpp
+++ b/tools/lli/RemoteTarget.cpp
@@ -13,13 +13,44 @@
 //===----------------------------------------------------------------------===//
 
 #include "RemoteTarget.h"
+#include "RemoteTargetExternal.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Memory.h"
 #include <stdlib.h>
 #include <string>
+
 using namespace llvm;
 
+// Static methods
+RemoteTarget *RemoteTarget::createRemoteTarget() {
+  return new RemoteTarget;
+}
+
+RemoteTarget *RemoteTarget::createExternalRemoteTarget(std::string &ChildName) {
+#ifdef LLVM_ON_UNIX
+  return new RemoteTargetExternal(ChildName);
+#else
+  return 0;
+#endif
+}
+
+bool RemoteTarget::hostSupportsExternalRemoteTarget() {
+#ifdef LLVM_ON_UNIX
+  return true;
+#else
+  return false;
+#endif
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Simulated remote execution
+//
+// This implementation will simply move generated code and data to a new memory
+// location in the current executable and let it run from there.
+////////////////////////////////////////////////////////////////////////////////
+
 bool RemoteTarget::allocateSpace(size_t Size, unsigned Alignment,
                                  uint64_t &Address) {
   sys::MemoryBlock *Prev = Allocations.size() ? &Allocations.back() : NULL;
diff --git a/tools/lli/RemoteTarget.h b/tools/lli/RemoteTarget.h
index b2a6d0e..c95fbd1 100644
--- a/tools/lli/RemoteTarget.h
+++ b/tools/lli/RemoteTarget.h
@@ -41,7 +41,9 @@ public:
   ///
   /// @returns False on success. On failure, ErrorMsg is updated with
   ///          descriptive text of the encountered error.
-  bool allocateSpace(size_t Size, unsigned Alignment, uint64_t &Address);
+  virtual bool allocateSpace(size_t Size,
+                             unsigned Alignment,
+                             uint64_t &Address);
 
   /// Load data into the target address space.
   ///
@@ -51,7 +53,9 @@ public:
   ///
   /// @returns False on success. On failure, ErrorMsg is updated with
   ///          descriptive text of the encountered error.
-  bool loadData(uint64_t Address, const void *Data, size_t Size);
+  virtual bool loadData(uint64_t Address,
+                        const void *Data,
+                        size_t Size);
 
   /// Load code into the target address space and prepare it for execution.
   ///
@@ -61,7 +65,9 @@ public:
   ///
   /// @returns False on success. On failure, ErrorMsg is updated with
   ///          descriptive text of the encountered error.
-  bool loadCode(uint64_t Address, const void *Data, size_t Size);
+  virtual bool loadCode(uint64_t Address,
+                        const void *Data,
+                        size_t Size);
 
   /// Execute code in the target process. The called function is required
   /// to be of signature int "(*)(void)".
@@ -72,24 +78,29 @@ public:
   ///
   /// @returns False on success. On failure, ErrorMsg is updated with
   ///          descriptive text of the encountered error.
-  bool executeCode(uint64_t Address, int &RetVal);
+  virtual bool executeCode(uint64_t Address,
+                           int &RetVal);
 
   /// Minimum alignment for memory permissions. Used to seperate code and
   /// data regions to make sure data doesn't get marked as code or vice
   /// versa.
   ///
   /// @returns Page alignment return value. Default of 4k.
-  unsigned getPageAlignment() { return 4096; }
+  virtual unsigned getPageAlignment() { return 4096; }
 
   /// Start the remote process.
-  void create();
+  virtual void create();
 
   /// Terminate the remote process.
-  void stop();
+  virtual void stop();
 
   RemoteTarget() : ErrorMsg(""), IsRunning(false) {}
-  ~RemoteTarget() { if (IsRunning) stop(); }
+  virtual ~RemoteTarget() { if (IsRunning) stop(); }
 
+  // Create an instance of the system-specific remote target class.
+  static RemoteTarget *createRemoteTarget();
+  static RemoteTarget *createExternalRemoteTarget(std::string &ChildName);
+  static bool hostSupportsExternalRemoteTarget(); 
 private:
   // Main processing function for the remote target process. Command messages
   // are received on file descriptor CmdFD and responses come back on OutFD.
diff --git a/tools/lli/RemoteTargetExternal.cpp b/tools/lli/RemoteTargetExternal.cpp
new file mode 100644
index 0000000..742a948
--- /dev/null
+++ b/tools/lli/RemoteTargetExternal.cpp
@@ -0,0 +1,162 @@
+//===---- RemoteTargetExternal.cpp - LLVM out-of-process JIT execution ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the RemoteTargetExternal class which executes JITed code
+// in a separate process from where it was built.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+
+#include "RemoteTarget.h"
+#include "RemoteTargetExternal.h"
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+using namespace llvm;
+
+bool RemoteTargetExternal::allocateSpace(size_t Size, unsigned Alignment,
+                                 uint64_t &Address) {
+  SendAllocateSpace(Alignment, Size);
+  Receive(LLI_AllocationResult, Address);
+  return false;
+}
+
+bool RemoteTargetExternal::loadData(uint64_t Address, const void *Data, size_t Size) {
+  SendLoadSection(Address, Data, (uint32_t)Size, false);
+  Receive(LLI_LoadComplete);
+  return false;
+}
+
+bool RemoteTargetExternal::loadCode(uint64_t Address, const void *Data, size_t Size) {
+  SendLoadSection(Address, Data, (uint32_t)Size, true);
+  Receive(LLI_LoadComplete);
+  return false;
+}
+
+bool RemoteTargetExternal::executeCode(uint64_t Address, int &RetVal) {
+  SendExecute(Address);
+
+  Receive(LLI_ExecutionResult, RetVal);
+  return false;
+}
+
+void RemoteTargetExternal::stop() {
+  SendTerminate();
+  Wait();
+}
+
+void RemoteTargetExternal::SendAllocateSpace(uint32_t Alignment, uint32_t Size) {
+  int rc;
+  uint32_t MsgType = (uint32_t)LLI_AllocateSpace;
+  rc = WriteBytes(&MsgType, 4);
+  assert(rc == 4 && "Error writing message type.");
+
+  uint32_t DataSize = 8;
+  rc = WriteBytes(&DataSize, 4);
+  assert(rc == 4 && "Error writing data size.");
+
+  rc = WriteBytes(&Alignment, 4);
+  assert(rc == 4 && "Error writing alignment data.");
+
+  rc = WriteBytes(&Size, 4);
+  assert(rc == 4 && "Error writing size data.");
+}
+
+void RemoteTargetExternal::SendLoadSection(uint64_t Addr,
+                                       const void *Data,
+                                       uint32_t Size,
+                                       bool IsCode) {
+  int rc;
+  uint32_t MsgType = IsCode ? LLI_LoadCodeSection : LLI_LoadDataSection;
+  rc = WriteBytes(&MsgType, 4);
+  assert(rc == 4 && "Error writing message type.");
+
+  uint32_t DataSize = Size + 8;
+  rc = WriteBytes(&DataSize, 4);
+  assert(rc == 4 && "Error writing data size.");
+
+  rc = WriteBytes(&Addr, 8);
+  assert(rc == 8 && "Error writing data.");
+
+  rc = WriteBytes(Data, Size);
+  assert(rc == (int)Size && "Error writing data.");
+}
+
+void RemoteTargetExternal::SendExecute(uint64_t Addr) {
+  int rc;
+  uint32_t MsgType = (uint32_t)LLI_Execute;
+  rc = WriteBytes(&MsgType, 4);
+  assert(rc == 4 && "Error writing message type.");
+
+  uint32_t DataSize = 8;
+  rc = WriteBytes(&DataSize, 4);
+  assert(rc == 4 && "Error writing data size.");
+
+  rc = WriteBytes(&Addr, 8);
+  assert(rc == 8 && "Error writing data.");
+}
+
+void RemoteTargetExternal::SendTerminate() {
+  int rc;
+  uint32_t MsgType = (uint32_t)LLI_Terminate;
+  rc = WriteBytes(&MsgType, 4);
+  assert(rc == 4 && "Error writing message type.");
+
+  // No data or data size is sent with Terminate
+}
+
+
+void RemoteTargetExternal::Receive(LLIMessageType ExpectedMsgType) {
+  int rc;
+  uint32_t MsgType;
+  rc = ReadBytes(&MsgType, 4);
+  assert(rc == 4 && "Error reading message type.");
+  assert(MsgType == (uint32_t)ExpectedMsgType && "Error: received unexpected message type.");
+
+  uint32_t DataSize;
+  rc = ReadBytes(&DataSize, 4);
+  assert(rc == 4 && "Error reading data size.");
+  assert(DataSize == 0 && "Error: unexpected data size.");
+}
+
+void RemoteTargetExternal::Receive(LLIMessageType ExpectedMsgType, int &Data) {
+  uint64_t Temp;
+  Receive(ExpectedMsgType, Temp);
+  Data = (int)(int64_t)Temp;
+}
+
+void RemoteTargetExternal::Receive(LLIMessageType ExpectedMsgType, uint64_t &Data) {
+  int rc;
+  uint32_t MsgType;
+  rc = ReadBytes(&MsgType, 4);
+  assert(rc == 4 && "Error reading message type.");
+  assert(MsgType == (uint32_t)ExpectedMsgType && "Error: received unexpected message type.");
+
+  uint32_t DataSize;
+  rc = ReadBytes(&DataSize, 4);
+  assert(rc == 4 && "Error reading data size.");
+  assert(DataSize == 8 && "Error: unexpected data size.");
+
+  rc = ReadBytes(&Data, 8);
+  assert(DataSize == 8 && "Error: unexpected data.");
+}
+
+#ifdef LLVM_ON_UNIX
+#include "Unix/RemoteTargetExternal.inc"
+#endif
+
+#ifdef LLVM_ON_WIN32
+#include "Windows/RemoteTargetExternal.inc"
+#endif
diff --git a/tools/lli/RemoteTargetExternal.h b/tools/lli/RemoteTargetExternal.h
new file mode 100644
index 0000000..a4bfad2
--- /dev/null
+++ b/tools/lli/RemoteTargetExternal.h
@@ -0,0 +1,118 @@
+//===----- RemoteTargetExternal.h - LLVM out-of-process JIT execution -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of the RemoteTargetExternal class which executes JITed code in a
+// separate process from where it was built.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLI_REMOTETARGETEXTERNAL_H
+#define LLI_REMOTETARGETEXTERNAL_H
+
+#include "llvm/Config/config.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Memory.h"
+#include <stdlib.h>
+#include <string>
+
+#include "RemoteTarget.h"
+#include "RemoteTargetMessage.h"
+
+namespace llvm {
+
+class RemoteTargetExternal : public RemoteTarget {
+public:
+  /// Allocate space in the remote target address space.
+  ///
+  /// @param      Size      Amount of space, in bytes, to allocate.
+  /// @param      Alignment Required minimum alignment for allocated space.
+  /// @param[out] Address   Remote address of the allocated memory.
+  ///
+  /// @returns False on success. On failure, ErrorMsg is updated with
+  ///          descriptive text of the encountered error.
+  virtual bool allocateSpace(size_t Size,
+                             unsigned Alignment,
+                             uint64_t &Address);
+
+  /// Load data into the target address space.
+  ///
+  /// @param      Address   Destination address in the target process.
+  /// @param      Data      Source address in the host process.
+  /// @param      Size      Number of bytes to copy.
+  ///
+  /// @returns False on success. On failure, ErrorMsg is updated with
+  ///          descriptive text of the encountered error.
+  virtual bool loadData(uint64_t Address, const void *Data, size_t Size);
+
+  /// Load code into the target address space and prepare it for execution.
+  ///
+  /// @param      Address   Destination address in the target process.
+  /// @param      Data      Source address in the host process.
+  /// @param      Size      Number of bytes to copy.
+  ///
+  /// @returns False on success. On failure, ErrorMsg is updated with
+  ///          descriptive text of the encountered error.
+  virtual bool loadCode(uint64_t Address, const void *Data, size_t Size);
+
+  /// Execute code in the target process. The called function is required
+  /// to be of signature int "(*)(void)".
+  ///
+  /// @param      Address   Address of the loaded function in the target
+  ///                       process.
+  /// @param[out] RetVal    The integer return value of the called function.
+  ///
+  /// @returns False on success. On failure, ErrorMsg is updated with
+  ///          descriptive text of the encountered error.
+  virtual bool executeCode(uint64_t Address, int &RetVal);
+
+  /// Minimum alignment for memory permissions. Used to seperate code and
+  /// data regions to make sure data doesn't get marked as code or vice
+  /// versa.
+  ///
+  /// @returns Page alignment return value. Default of 4k.
+  virtual unsigned getPageAlignment() { return 4096; }
+
+  /// Start the remote process.
+  virtual void create();
+
+  /// Terminate the remote process.
+  virtual void stop();
+
+  RemoteTargetExternal(std::string &Name) : RemoteTarget(), ChildName(Name) {}
+  virtual ~RemoteTargetExternal();
+
+private:
+  std::string ChildName;
+
+  // This will get filled in as a point to an OS-specific structure.
+  void *ConnectionData;
+
+  void SendAllocateSpace(uint32_t Alignment, uint32_t Size);
+  void SendLoadSection(uint64_t Addr,
+                       const void *Data,
+                       uint32_t Size,
+                       bool IsCode);
+  void SendExecute(uint64_t Addr);
+  void SendTerminate();
+
+  void Receive(LLIMessageType Msg);
+  void Receive(LLIMessageType Msg, int &Data);
+  void Receive(LLIMessageType Msg, uint64_t &Data);
+
+  int WriteBytes(const void *Data, size_t Size);
+  int ReadBytes(void *Data, size_t Size);
+  void Wait();
+};
+
+} // end namespace llvm
+
+#endif // LLI_REMOTETARGETEXTERNAL_H
diff --git a/tools/lli/RemoteTargetMessage.h b/tools/lli/RemoteTargetMessage.h
new file mode 100644
index 0000000..12cfa9a
--- /dev/null
+++ b/tools/lli/RemoteTargetMessage.h
@@ -0,0 +1,45 @@
+//===---- RemoteTargetMessage.h - LLI out-of-process message protocol -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of the LLIMessageType enum which is used for communication with a
+// child process for remote execution.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLI_REMOTETARGETMESSAGE_H
+#define LLI_REMOTETARGETMESSAGE_H
+
+namespace llvm {
+
+// LLI messages from parent-to-child or vice versa follow an exceedingly simple
+// protocol where the first four bytes represent the message type, the next
+// four bytes represent the size of data for the command and following bytes
+// represent the actual data.
+//
+// The protocol is not intended to be robust, secure or fault-tolerant.  It is
+// only here for testing purposes and is therefore intended to be the simplest
+// implementation that will work.  It is assumed that the parent and child
+// share characteristics like endianness.
+
+enum LLIMessageType {
+  LLI_Error = -1,
+  LLI_ChildActive = 0,        // Data = not used
+  LLI_AllocateSpace,          // Data = struct { uint_32t Align, uint_32t Size }
+  LLI_AllocationResult,       // Data = uint64_t AllocAddress (in Child memory space)
+  LLI_LoadCodeSection,        // Data = uint32_t Addr, followed by section contests
+  LLI_LoadDataSection,        // Data = uint32_t Addr, followed by section contents
+  LLI_LoadComplete,           // Data = not used
+  LLI_Execute,                // Data = Address of function to execute
+  LLI_ExecutionResult,        // Data = uint64_t Result
+  LLI_Terminate               // Data = not used
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/tools/lli/Unix/RemoteTargetExternal.inc b/tools/lli/Unix/RemoteTargetExternal.inc
new file mode 100644
index 0000000..9c1a4cc
--- /dev/null
+++ b/tools/lli/Unix/RemoteTargetExternal.inc
@@ -0,0 +1,96 @@
+//=- RemoteTargetExternal.inc - LLVM out-of-process JIT execution for Unix --=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the Unix-specific parts of the RemoteTargetExternal class
+// which executes JITed code in a separate process from where it was built.
+//
+//===----------------------------------------------------------------------===//
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+
+namespace {
+
+struct ConnectionData_t {
+  int InputPipe;
+  int OutputPipe;
+
+  ConnectionData_t(int in, int out) : InputPipe(in), OutputPipe(out) {}
+};
+
+} // namespace
+
+namespace llvm {
+
+void RemoteTargetExternal::create() {
+  int PipeFD[2][2];
+  pid_t ChildPID;
+
+  // Create two pipes.
+  if (pipe(PipeFD[0]) != 0 || pipe(PipeFD[1]) != 0)
+    perror("Error creating pipe: ");
+
+  ChildPID = fork();
+
+  if (ChildPID == 0) {
+    // In the child...
+
+    // Close the parent ends of the pipes
+    close(PipeFD[0][1]);
+    close(PipeFD[1][0]);
+
+    // Use our pipes as stdin and stdout
+    if (PipeFD[0][0] != STDIN_FILENO) {
+      dup2(PipeFD[0][0], STDIN_FILENO);
+      close(PipeFD[0][0]);
+    }
+    if (PipeFD[1][1] != STDOUT_FILENO) {
+      dup2(PipeFD[1][1], STDOUT_FILENO);
+      close(PipeFD[1][1]);
+    }
+
+    // Execute the child process.
+    char *args[1] = { NULL };
+    int rc = execv(ChildName.c_str(), args);
+    if (rc != 0)
+      perror("Error executing child process: ");
+  }
+  else {
+    // In the parent...
+
+    // Close the child ends of the pipes
+    close(PipeFD[0][0]);
+    close(PipeFD[1][1]);
+
+    // Store the parent ends of the pipes
+    ConnectionData = (void*)new ConnectionData_t(PipeFD[1][0], PipeFD[0][1]);
+
+    Receive(LLI_ChildActive);
+  }
+}
+
+int RemoteTargetExternal::WriteBytes(const void *Data, size_t Size) {
+  return write(((ConnectionData_t*)ConnectionData)->OutputPipe, Data, Size);
+}
+
+int RemoteTargetExternal::ReadBytes(void *Data, size_t Size) {
+  return read(((ConnectionData_t*)ConnectionData)->InputPipe, Data, Size);
+}
+
+void RemoteTargetExternal::Wait() {
+  wait(NULL);
+}
+
+RemoteTargetExternal::~RemoteTargetExternal() {
+  delete static_cast<ConnectionData_t *>(ConnectionData);
+}
+
+} // namespace llvm
diff --git a/tools/lli/Windows/RemoteTargetExternal.inc b/tools/lli/Windows/RemoteTargetExternal.inc
new file mode 100644
index 0000000..aef4627
--- /dev/null
+++ b/tools/lli/Windows/RemoteTargetExternal.inc
@@ -0,0 +1,35 @@
+//= RemoteTargetExternal.inc - LLVM out-of-process JIT execution for Windows =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of the Windows-specific parts of the RemoteTargetExternal class
+// which is meant to execute JITed code in a separate process from where it was
+// built.  To support this functionality on Windows, implement these functions.
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void RemoteTargetExternal::create() {
+}
+
+int RemoteTargetExternal::WriteBytes(const void *Data, size_t Size) {
+  return 0;
+}
+
+int RemoteTargetExternal::ReadBytes(void *Data, size_t Size) {
+  return 0;
+}
+
+void RemoteTargetExternal::Wait() {
+}
+
+RemoteTargetExternal::~RemoteTargetExternal() {
+}
+
+} // namespace llvm
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 8d74b23..f317566 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -15,7 +15,7 @@
 
 #define DEBUG_TYPE "lli"
 #include "llvm/IR/LLVMContext.h"
-#include "RecordingMemoryManager.h"
+#include "RemoteMemoryManager.h"
 #include "RemoteTarget.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Bitcode/ReaderWriter.h"
@@ -27,8 +27,10 @@
 #include "llvm/ExecutionEngine/JITMemoryManager.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
+#include "llvm/IR/TypeBuilder.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -41,6 +43,7 @@
 #include "llvm/Support/PluginLoader.h"
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
@@ -83,6 +86,18 @@ namespace {
     cl::desc("Execute MCJIT'ed code in a separate process."),
     cl::init(false));
 
+  // Manually specify the child process for remote execution. This overrides
+  // the simulated remote execution that allocates address space for child
+  // execution. The child process will be executed and will communicate with
+  // lli via stdin/stdout pipes.
+  cl::opt<std::string>
+  MCJITRemoteProcess("mcjit-remote-process",
+            cl::desc("Specify the filename of the process to launch "
+                     "for remote MCJIT execution.  If none is specified,"
+                     "\n\tremote execution will be simulated in-process."),
+            cl::value_desc("filename"),
+            cl::init(""));
+
   // Determine optimization level.
   cl::opt<char>
   OptLevel("O",
@@ -118,6 +133,11 @@ namespace {
             cl::value_desc("function"),
             cl::init("main"));
 
+  cl::list<std::string>
+  ExtraModules("extra-module",
+         cl::desc("Extra modules to be loaded"),
+         cl::value_desc("input bitcode"));
+
   cl::opt<std::string>
   FakeArgv0("fake-argv0",
             cl::desc("Override the 'argv[0]' value passed into the executing"
@@ -209,82 +229,46 @@ static void do_shutdown() {
 #endif
 }
 
-void layoutRemoteTargetMemory(RemoteTarget *T, RecordingMemoryManager *JMM) {
-  // Lay out our sections in order, with all the code sections first, then
-  // all the data sections.
-  uint64_t CurOffset = 0;
-  unsigned MaxAlign = T->getPageAlignment();
-  SmallVector<std::pair<const void*, uint64_t>, 16> Offsets;
-  SmallVector<unsigned, 16> Sizes;
-  for (RecordingMemoryManager::const_code_iterator I = JMM->code_begin(),
-                                                   E = JMM->code_end();
-       I != E; ++I) {
-    DEBUG(dbgs() << "code region: size " << I->first.size()
-                 << ", alignment " << I->second << "\n");
-    // Align the current offset up to whatever is needed for the next
-    // section.
-    unsigned Align = I->second;
-    CurOffset = (CurOffset + Align - 1) / Align * Align;
-    // Save off the address of the new section and allocate its space.
-    Offsets.push_back(std::pair<const void*,uint64_t>(I->first.base(), CurOffset));
-    Sizes.push_back(I->first.size());
-    CurOffset += I->first.size();
-  }
-  // Adjust to keep code and data aligned on seperate pages.
-  CurOffset = (CurOffset + MaxAlign - 1) / MaxAlign * MaxAlign;
-  unsigned FirstDataIndex = Offsets.size();
-  for (RecordingMemoryManager::const_data_iterator I = JMM->data_begin(),
-                                                   E = JMM->data_end();
-       I != E; ++I) {
-    DEBUG(dbgs() << "data region: size " << I->first.size()
-                 << ", alignment " << I->second << "\n");
-    // Align the current offset up to whatever is needed for the next
-    // section.
-    unsigned Align = I->second;
-    CurOffset = (CurOffset + Align - 1) / Align * Align;
-    // Save off the address of the new section and allocate its space.
-    Offsets.push_back(std::pair<const void*,uint64_t>(I->first.base(), CurOffset));
-    Sizes.push_back(I->first.size());
-    CurOffset += I->first.size();
-  }
-
-  // Allocate space in the remote target.
-  uint64_t RemoteAddr;
-  if (T->allocateSpace(CurOffset, MaxAlign, RemoteAddr))
-    report_fatal_error(T->getErrorMsg());
-  // Map the section addresses so relocations will get updated in the local
-  // copies of the sections.
-  for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
-    uint64_t Addr = RemoteAddr + Offsets[i].second;
-    EE->mapSectionAddress(const_cast<void*>(Offsets[i].first), Addr);
-
-    DEBUG(dbgs() << "  Mapping local: " << Offsets[i].first
-                 << " to remote: 0x" << format("%llx", Addr) << "\n");
-
-  }
-
-  // Trigger application of relocations
-  EE->finalizeObject();
-
-  // Now load it all to the target.
-  for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
-    uint64_t Addr = RemoteAddr + Offsets[i].second;
-
-    if (i < FirstDataIndex) {
-      T->loadCode(Addr, Offsets[i].first, Sizes[i]);
-
-      DEBUG(dbgs() << "  loading code: " << Offsets[i].first
-            << " to remote: 0x" << format("%llx", Addr) << "\n");
-    } else {
-      T->loadData(Addr, Offsets[i].first, Sizes[i]);
-
-      DEBUG(dbgs() << "  loading data: " << Offsets[i].first
-            << " to remote: 0x" << format("%llx", Addr) << "\n");
-    }
-
+// On Mingw and Cygwin, an external symbol named '__main' is called from the
+// generated 'main' function to allow static intialization.  To avoid linking
+// problems with remote targets (because lli's remote target support does not
+// currently handle external linking) we add a secondary module which defines
+// an empty '__main' function.
+static void addCygMingExtraModule(ExecutionEngine *EE,
+                                  LLVMContext &Context,
+                                  StringRef TargetTripleStr) {
+  IRBuilder<> Builder(Context);
+  Triple TargetTriple(TargetTripleStr);
+
+  // Create a new module.
+  Module *M = new Module("CygMingHelper", Context);
+  M->setTargetTriple(TargetTripleStr);
+
+  // Create an empty function named "__main".
+  Function *Result;
+  if (TargetTriple.isArch64Bit()) {
+    Result = Function::Create(
+      TypeBuilder<int64_t(void), false>::get(Context),
+      GlobalValue::ExternalLinkage, "__main", M);
+  } else {
+    Result = Function::Create(
+      TypeBuilder<int32_t(void), false>::get(Context),
+      GlobalValue::ExternalLinkage, "__main", M);
   }
+  BasicBlock *BB = BasicBlock::Create(Context, "__main", Result);
+  Builder.SetInsertPoint(BB);
+  Value *ReturnVal;
+  if (TargetTriple.isArch64Bit())
+    ReturnVal = ConstantInt::get(Context, APInt(64, 0));
+  else
+    ReturnVal = ConstantInt::get(Context, APInt(32, 0));
+  Builder.CreateRet(ReturnVal);
+
+  // Add this new module to the ExecutionEngine.
+  EE->addModule(M);
 }
 
+
 //===----------------------------------------------------------------------===//
 // main Driver function
 //
@@ -357,7 +341,7 @@ int main(int argc, char **argv, char * const *envp) {
   if (UseMCJIT && !ForceInterpreter) {
     builder.setUseMCJIT(true);
     if (RemoteMCJIT)
-      RTDyldMM = new RecordingMemoryManager();
+      RTDyldMM = new RemoteMemoryManager();
     else
       RTDyldMM = new SectionMemoryManager();
     builder.setMCJITMemoryManager(RTDyldMM);
@@ -407,6 +391,22 @@ int main(int argc, char **argv, char * const *envp) {
     exit(1);
   }
 
+  // Load any additional modules specified on the command line.
+  for (unsigned i = 0, e = ExtraModules.size(); i != e; ++i) {
+    Module *XMod = ParseIRFile(ExtraModules[i], Err, Context);
+    if (!XMod) {
+      Err.print(argv[0], errs());
+      return 1;
+    }
+    EE->addModule(XMod);
+  }
+
+  // If the target is Cygwin/MingW and we are generating remote code, we
+  // need an extra module to help out with linking.
+  if (RemoteMCJIT && Triple(Mod->getTargetTriple()).isOSCygMing()) {
+    addCygMingExtraModule(EE, Context, Mod->getTargetTriple());
+  }
+
   // The following functions have no effect if their respective profiling
   // support wasn't enabled in the build configuration.
   EE->RegisterJITEventListener(
@@ -445,68 +445,33 @@ int main(int argc, char **argv, char * const *envp) {
     return -1;
   }
 
-  // If the program doesn't explicitly call exit, we will need the Exit
-  // function later on to make an explicit call, so get the function now.
-  Constant *Exit = Mod->getOrInsertFunction("exit", Type::getVoidTy(Context),
-                                                    Type::getInt32Ty(Context),
-                                                    NULL);
-
   // Reset errno to zero on entry to main.
   errno = 0;
 
-  // Remote target MCJIT doesn't (yet) support static constructors. No reason
-  // it couldn't. This is a limitation of the LLI implemantation, not the
-  // MCJIT itself. FIXME.
-  //
-  // Run static constructors.
-  if (!RemoteMCJIT) {
-      if (UseMCJIT && !ForceInterpreter) {
-        // Give MCJIT a chance to apply relocations and set page permissions.
-        EE->finalizeObject();
-      }
-      EE->runStaticConstructorsDestructors(false);
-  }
-
-  if (NoLazyCompilation) {
-    for (Module::iterator I = Mod->begin(), E = Mod->end(); I != E; ++I) {
-      Function *Fn = &*I;
-      if (Fn != EntryFn && !Fn->isDeclaration())
-        EE->getPointerToFunction(Fn);
-    }
-  }
-
   int Result;
-  if (RemoteMCJIT) {
-    RecordingMemoryManager *MM = static_cast<RecordingMemoryManager*>(RTDyldMM);
-    // Everything is prepared now, so lay out our program for the target
-    // address space, assign the section addresses to resolve any relocations,
-    // and send it to the target.
-    RemoteTarget Target;
-    Target.create();
-
-    // Ask for a pointer to the entry function. This triggers the actual
-    // compilation.
-    (void)EE->getPointerToFunction(EntryFn);
 
-    // Enough has been compiled to execute the entry function now, so
-    // layout the target memory.
-    layoutRemoteTargetMemory(&Target, MM);
-
-    // Since we're executing in a (at least simulated) remote address space,
-    // we can't use the ExecutionEngine::runFunctionAsMain(). We have to
-    // grab the function address directly here and tell the remote target
-    // to execute the function.
-    // FIXME: argv and envp handling.
-    uint64_t Entry = (uint64_t)EE->getPointerToFunction(EntryFn);
-
-    DEBUG(dbgs() << "Executing '" << EntryFn->getName() << "' at 0x"
-                 << format("%llx", Entry) << "\n");
+  if (!RemoteMCJIT) {
+    // If the program doesn't explicitly call exit, we will need the Exit
+    // function later on to make an explicit call, so get the function now.
+    Constant *Exit = Mod->getOrInsertFunction("exit", Type::getVoidTy(Context),
+                                                      Type::getInt32Ty(Context),
+                                                      NULL);
+
+    // Run static constructors.
+    if (UseMCJIT && !ForceInterpreter) {
+      // Give MCJIT a chance to apply relocations and set page permissions.
+      EE->finalizeObject();
+    }
+    EE->runStaticConstructorsDestructors(false);
 
-    if (Target.executeCode(Entry, Result))
-      errs() << "ERROR: " << Target.getErrorMsg() << "\n";
+    if (!UseMCJIT && NoLazyCompilation) {
+      for (Module::iterator I = Mod->begin(), E = Mod->end(); I != E; ++I) {
+        Function *Fn = &*I;
+        if (Fn != EntryFn && !Fn->isDeclaration())
+          EE->getPointerToFunction(Fn);
+      }
+    }
 
-    Target.stop();
-  } else {
     // Trigger compilation separately so code regions that need to be 
     // invalidated will be known.
     (void)EE->getPointerToFunction(EntryFn);
@@ -516,11 +481,7 @@ int main(int argc, char **argv, char * const *envp) {
 
     // Run main.
     Result = EE->runFunctionAsMain(EntryFn, InputArgv, envp);
-  }
 
-  // Like static constructors, the remote target MCJIT support doesn't handle
-  // this yet. It could. FIXME.
-  if (!RemoteMCJIT) {
     // Run static destructors.
     EE->runStaticConstructorsDestructors(true);
 
@@ -538,6 +499,67 @@ int main(int argc, char **argv, char * const *envp) {
       errs() << "ERROR: exit defined with wrong prototype!\n";
       abort();
     }
+  } else {
+    // else == "if (RemoteMCJIT)"
+
+    // Remote target MCJIT doesn't (yet) support static constructors. No reason
+    // it couldn't. This is a limitation of the LLI implemantation, not the
+    // MCJIT itself. FIXME.
+    //
+    RemoteMemoryManager *MM = static_cast<RemoteMemoryManager*>(RTDyldMM);
+    // Everything is prepared now, so lay out our program for the target
+    // address space, assign the section addresses to resolve any relocations,
+    // and send it to the target.
+
+    OwningPtr<RemoteTarget> Target;
+    if (!MCJITRemoteProcess.empty()) { // Remote execution on a child process
+      if (!RemoteTarget::hostSupportsExternalRemoteTarget()) {
+        errs() << "Warning: host does not support external remote targets.\n"
+               << "  Defaulting to simulated remote execution\n";
+        Target.reset(RemoteTarget::createRemoteTarget());
+      } else {
+        std::string ChildEXE = sys::FindProgramByName(MCJITRemoteProcess);
+        if (ChildEXE == "") {
+          errs() << "Unable to find child target: '\''" << MCJITRemoteProcess << "\'\n";
+          return -1;
+        }
+        Target.reset(RemoteTarget::createExternalRemoteTarget(ChildEXE));
+      }
+    } else {
+      // No child process name provided, use simulated remote execution.
+      Target.reset(RemoteTarget::createRemoteTarget());
+    }
+
+    // Give the memory manager a pointer to our remote target interface object.
+    MM->setRemoteTarget(Target.get());
+
+    // Create the remote target.
+    Target->create();
+
+    // Since we're executing in a (at least simulated) remote address space,
+    // we can't use the ExecutionEngine::runFunctionAsMain(). We have to
+    // grab the function address directly here and tell the remote target
+    // to execute the function.
+    //
+    // Our memory manager will map generated code into the remote address
+    // space as it is loaded and copy the bits over during the finalizeMemory
+    // operation.
+    //
+    // FIXME: argv and envp handling.
+    uint64_t Entry = EE->getFunctionAddress(EntryFn->getName().str());
+
+    DEBUG(dbgs() << "Executing '" << EntryFn->getName() << "' at 0x"
+                 << format("%llx", Entry) << "\n");
+
+    if (Target->executeCode(Entry, Result))
+      errs() << "ERROR: " << Target->getErrorMsg() << "\n";
+
+    // Like static constructors, the remote target MCJIT support doesn't handle
+    // this yet. It could. FIXME.
+
+    // Stop the remote target
+    Target->stop();
   }
+
   return Result;
 }
diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt
index 503999c..f15a1e2 100644
--- a/tools/llvm-ar/CMakeLists.txt
+++ b/tools/llvm-ar/CMakeLists.txt
@@ -4,4 +4,22 @@ add_llvm_tool(llvm-ar
   llvm-ar.cpp
   )
 
+# FIXME: this is duplicated from the clang CMakeLists.txt
+# FIXME: bin/llvm-ranlib is not a valid build target with this setup (pr17024)
+
+if(UNIX)
+  set(LLVM_LINK_OR_COPY create_symlink)
+  set(llvm_ar_binary "llvm-ar${CMAKE_EXECUTABLE_SUFFIX}")
+else()
+  set(LLVM_LINK_OR_COPY copy)
+  set(llvm_ar_binary "${LLVM_BINARY_DIR}/bin/${CMAKE_CFG_INTDIR}/llvm-ar${CMAKE_EXECUTABLE_SUFFIX}")
+endif()
+
+set(llvm_ranlib "${LLVM_BINARY_DIR}/bin/${CMAKE_CFG_INTDIR}/llvm-ranlib${CMAKE_EXECUTABLE_SUFFIX}")
+add_custom_command(TARGET llvm-ar POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E ${LLVM_LINK_OR_COPY} "${llvm_ar_binary}" "${llvm_ranlib}")
+
+set_property(DIRECTORY APPEND
+  PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${llvm_ranlib})
+
 # TODO: Support check-local.
diff --git a/tools/llvm-ar/Makefile b/tools/llvm-ar/Makefile
index 15fb090..16a8283 100644
--- a/tools/llvm-ar/Makefile
+++ b/tools/llvm-ar/Makefile
@@ -9,6 +9,7 @@
 
 LEVEL := ../..
 TOOLNAME := llvm-ar
+TOOLALIAS = llvm-ranlib
 LINK_COMPONENTS := bitreader support object
 
 # This tool has no plugins, optimize startup time.
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index 6026fa7..d70db72 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -63,20 +63,13 @@ static void failIfError(error_code EC, Twine Context = "") {
   fail(Context + ": " + EC.message());
 }
 
-// Option for compatibility with AIX, not used but must allow it to be present.
-static cl::opt<bool>
-X32Option ("X32_64", cl::Hidden,
-            cl::desc("Ignored option for compatibility with AIX"));
-
-// llvm-ar operation code and modifier flags. This must come first.
-static cl::opt<std::string>
-Options(cl::Positional, cl::Required, cl::desc("{operation}[modifiers]..."));
-
-// llvm-ar remaining positional arguments.
+// llvm-ar/llvm-ranlib remaining positional arguments.
 static cl::list<std::string>
 RestOfArgs(cl::Positional, cl::OneOrMore,
     cl::desc("[relpos] [count] <archive-file> [members]..."));
 
+std::string Options;
+
 // MoreHelp - Provide additional help output explaining the operations and
 // modifiers of llvm-ar. This object instructs the CommandLine library
 // to print the text of the constructor when the --help option is given.
@@ -156,6 +149,13 @@ static void getRelPos() {
   RestOfArgs.erase(RestOfArgs.begin());
 }
 
+static void getOptions() {
+  if(RestOfArgs.size() == 0)
+    show_help("Expected options");
+  Options = RestOfArgs[0];
+  RestOfArgs.erase(RestOfArgs.begin());
+}
+
 // getArchive - Get the archive file name from the command line
 static void getArchive() {
   if(RestOfArgs.size() == 0)
@@ -175,6 +175,7 @@ static void getMembers() {
 // operation specified. Process all modifiers and check to make sure that
 // constraints on modifier/operation pairs have not been violated.
 static ArchiveOperation parseCommandLine() {
+  getOptions();
 
   // Keep track of number of operations. We can only specify one
   // per execution.
@@ -781,6 +782,13 @@ static void performWriteOperation(ArchiveOperation Operation,
       sys::fs::file_status Status;
       failIfError(sys::fs::status(FD, Status), FileName);
 
+      // Opening a directory doesn't make sense. Let it failed.
+      // Linux cannot open directories with open(2), although
+      // cygwin and *bsd can.
+      if (Status.type() == sys::fs::file_type::directory_file)
+        failIfError(error_code(errc::is_a_directory, posix_category()),
+                    FileName);
+
       OwningPtr<MemoryBuffer> File;
       failIfError(MemoryBuffer::getOpenFile(FD, FileName, File,
                                             Status.getSize(), false),
@@ -857,6 +865,9 @@ static void performOperation(ArchiveOperation Operation,
   llvm_unreachable("Unknown operation.");
 }
 
+static int ar_main(char **argv);
+static int ranlib_main();
+
 // main - main program for llvm-ar .. see comments in the code
 int main(int argc, char **argv) {
   ToolName = argv[0];
@@ -872,15 +883,36 @@ int main(int argc, char **argv) {
     "  This program archives bitcode files into single libraries\n"
   );
 
+  StringRef Stem = sys::path::stem(ToolName);
+  if (Stem.find("ar") != StringRef::npos)
+    return ar_main(argv);
+  if (Stem.find("ranlib") != StringRef::npos)
+    return ranlib_main();
+  fail("Not ranlib or ar!");
+}
+
+static int performOperation(ArchiveOperation Operation);
+
+int ranlib_main() {
+  if (RestOfArgs.size() != 1)
+    fail(ToolName + "takes just one archive as argument");
+  ArchiveName = RestOfArgs[0];
+  return performOperation(CreateSymTab);
+}
+
+int ar_main(char **argv) {
   // Do our own parsing of the command line because the CommandLine utility
   // can't handle the grouped positional parameters without a dash.
   ArchiveOperation Operation = parseCommandLine();
+  return performOperation(Operation);
+}
 
+static int performOperation(ArchiveOperation Operation) {
   // Create or open the archive object.
   OwningPtr<MemoryBuffer> Buf;
   error_code EC = MemoryBuffer::getFile(ArchiveName, Buf, -1, false);
   if (EC && EC != llvm::errc::no_such_file_or_directory) {
-    errs() << argv[0] << ": error opening '" << ArchiveName
+    errs() << ToolName << ": error opening '" << ArchiveName
            << "': " << EC.message() << "!\n";
     return 1;
   }
@@ -889,7 +921,7 @@ int main(int argc, char **argv) {
     object::Archive Archive(Buf.take(), EC);
 
     if (EC) {
-      errs() << argv[0] << ": error loading '" << ArchiveName
+      errs() << ToolName << ": error loading '" << ArchiveName
              << "': " << EC.message() << "!\n";
       return 1;
     }
@@ -904,7 +936,7 @@ int main(int argc, char **argv) {
   } else {
     if (!Create) {
       // Produce a warning if we should and we're creating the archive
-      errs() << argv[0] << ": creating " << ArchiveName << "\n";
+      errs() << ToolName << ": creating " << ArchiveName << "\n";
     }
   }
 
diff --git a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
index 186eea9..da40da2 100644
--- a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
@@ -41,6 +41,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/system_error.h"
 #include <algorithm>
+#include <cctype>
 #include <map>
 using namespace llvm;
 
diff --git a/tools/llvm-c-test/CMakeLists.txt b/tools/llvm-c-test/CMakeLists.txt
new file mode 100644
index 0000000..2926d9d
--- /dev/null
+++ b/tools/llvm-c-test/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(LLVM_LINK_COMPONENTS all)
+
+if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 -Wstrict-prototypes")
+endif ()
+
+add_llvm_tool(llvm-c-test
+  calc.c
+  disassemble.c
+  helpers.c
+  include-all.c
+  main.c
+  module.c
+  object.c
+  targets.c
+  )
diff --git a/tools/llvm-c-test/Makefile b/tools/llvm-c-test/Makefile
new file mode 100644
index 0000000..08be7c3
--- /dev/null
+++ b/tools/llvm-c-test/Makefile
@@ -0,0 +1,29 @@
+##===- tools/llvm-c-test -----------------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+TOOLNAME = llvm-c-test
+
+TOOL_NO_EXPORTS = 1
+NO_INSTALL = 1
+
+
+# If there is no shared lib, link all components...
+ifneq ($(ENABLE_SHARED),1)
+LINK_COMPONENTS = all
+endif
+
+include $(LEVEL)/Makefile.common
+
+CFLAGS += -std=c99 -Wall -Wstrict-prototypes
+
+# ...but if it is built - use it
+ifeq ($(ENABLE_SHARED),1)
+LIBS = -lLLVM-$(LLVMVersion)
+endif
diff --git a/tools/llvm-c-test/calc.c b/tools/llvm-c-test/calc.c
new file mode 100644
index 0000000..3119ccf
--- /dev/null
+++ b/tools/llvm-c-test/calc.c
@@ -0,0 +1,148 @@
+/*===-- calc.c - tool for testing libLLVM and llvm-c API ------------------===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file implements the --calc command in llvm-c-test. --calc reads lines *|
+|* from stdin, parses them as a name and an expression in reverse polish      *|
+|* notation and prints a module with a function with the expression.          *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c-test.h"
+#include "llvm-c/Core.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+typedef LLVMValueRef (*binop_func_t)(LLVMBuilderRef, LLVMValueRef LHS,
+                                     LLVMValueRef RHS, const char *Name);
+
+static LLVMOpcode op_to_opcode(char op) {
+  switch (op) {
+  case '+': return LLVMAdd;
+  case '-': return LLVMSub;
+  case '*': return LLVMMul;
+  case '/': return LLVMSDiv;
+  case '&': return LLVMAnd;
+  case '|': return LLVMOr;
+  case '^': return LLVMXor;
+  }
+  assert(0 && "unknown operation");
+  return 0;
+}
+
+#define MAX_DEPTH 32
+
+static LLVMValueRef build_from_tokens(char **tokens, int ntokens,
+                                      LLVMBuilderRef builder,
+                                      LLVMValueRef param) {
+  LLVMValueRef stack[MAX_DEPTH];
+  int depth = 0;
+  int i;
+
+  for (i = 0; i < ntokens; i++) {
+    char tok = tokens[i][0];
+    switch (tok) {
+    case '+':
+    case '-':
+    case '*':
+    case '/':
+    case '&':
+    case '|':
+    case '^':
+      if (depth < 2) {
+        printf("stack underflow\n");
+        return NULL;
+      }
+
+      stack[depth - 2] = LLVMBuildBinOp(builder, op_to_opcode(tok),
+                                        stack[depth - 1], stack[depth - 2], "");
+      depth--;
+
+      break;
+
+    case '@': {
+      LLVMValueRef off;
+
+      if (depth < 1) {
+        printf("stack underflow\n");
+        return NULL;
+      }
+
+      off = LLVMBuildGEP(builder, param, &stack[depth - 1], 1, "");
+      stack[depth - 1] = LLVMBuildLoad(builder, off, "");
+
+      break;
+    }
+
+    default: {
+      char *end;
+      long val = strtol(tokens[i], &end, 0);
+      if (end[0] != '\0') {
+        printf("error parsing number\n");
+        return NULL;
+      }
+
+      if (depth >= MAX_DEPTH) {
+        printf("stack overflow\n");
+        return NULL;
+      }
+
+      stack[depth++] = LLVMConstInt(LLVMInt64Type(), val, 1);
+      break;
+    }
+    }
+  }
+
+  if (depth < 1) {
+    printf("stack underflow at return\n");
+    return NULL;
+  }
+
+  LLVMBuildRet(builder, stack[depth - 1]);
+
+  return stack[depth - 1];
+}
+
+static void handle_line(char **tokens, int ntokens) {
+  char *name = tokens[0];
+  LLVMValueRef param;
+  LLVMValueRef res;
+
+  LLVMModuleRef M = LLVMModuleCreateWithName(name);
+
+  LLVMTypeRef I64ty = LLVMInt64Type();
+  LLVMTypeRef I64Ptrty = LLVMPointerType(I64ty, 0);
+  LLVMTypeRef Fty = LLVMFunctionType(I64ty, &I64Ptrty, 1, 0);
+
+  LLVMValueRef F = LLVMAddFunction(M, name, Fty);
+  LLVMBuilderRef builder = LLVMCreateBuilder();
+  LLVMPositionBuilderAtEnd(builder, LLVMAppendBasicBlock(F, "entry"));
+
+  LLVMGetParams(F, &param);
+  LLVMSetValueName(param, "in");
+
+  res = build_from_tokens(tokens + 1, ntokens - 1, builder, param);
+  if (res) {
+    char *irstr = LLVMPrintModuleToString(M);
+    puts(irstr);
+    LLVMDisposeMessage(irstr);
+  }
+
+  LLVMDisposeBuilder(builder);
+
+  LLVMDisposeModule(M);
+}
+
+int calc(void) {
+
+  tokenize_stdin(handle_line);
+
+  return 0;
+}
diff --git a/tools/llvm-c-test/disassemble.c b/tools/llvm-c-test/disassemble.c
new file mode 100644
index 0000000..eb40bf3
--- /dev/null
+++ b/tools/llvm-c-test/disassemble.c
@@ -0,0 +1,88 @@
+/*===-- disassemble.c - tool for testing libLLVM and llvm-c API -----------===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file implements the --disassemble command in llvm-c-test.             *|
+|* --disassemble reads lines from stdin, parses them as a triple and hex      *|
+|*  machine code, and prints disassembly of the machine code.                 *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c-test.h"
+#include "llvm-c/Disassembler.h"
+#include "llvm-c/Target.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+static void pprint(int pos, unsigned char *buf, int len, const char *disasm) {
+  int i;
+  printf("%04x:  ", pos);
+  for (i = 0; i < 8; i++) {
+    if (i < len) {
+      printf("%02x ", buf[i]);
+    } else {
+      printf("   ");
+    }
+  }
+
+  printf("   %s\n", disasm);
+}
+
+static void do_disassemble(const char *triple, unsigned char *buf, int siz) {
+  LLVMDisasmContextRef D = LLVMCreateDisasm(triple, NULL, 0, NULL, NULL);
+  char outline[1024];
+  int pos;
+
+  if (!D) {
+    printf("ERROR: Couldn't create disassebler for triple %s\n", triple);
+    return;
+  }
+
+  pos = 0;
+  while (pos < siz) {
+    size_t l = LLVMDisasmInstruction(D, buf + pos, siz - pos, 0, outline,
+                                     sizeof(outline));
+    if (!l) {
+      pprint(pos, buf + pos, 1, "\t???");
+      pos++;
+    } else {
+      pprint(pos, buf + pos, l, outline);
+      pos += l;
+    }
+  }
+
+  LLVMDisasmDispose(D);
+}
+
+static void handle_line(char **tokens, int ntokens) {
+  unsigned char disbuf[128];
+  size_t disbuflen = 0;
+  char *triple = tokens[0];
+  int i;
+
+  printf("triple: %s\n", triple);
+
+  for (i = 1; i < ntokens; i++) {
+    disbuf[disbuflen++] = strtol(tokens[i], NULL, 16);
+    if (disbuflen >= sizeof(disbuf)) {
+      fprintf(stderr, "Warning: Too long line, truncating\n");
+      break;
+    }
+  }
+  do_disassemble(triple, disbuf, disbuflen);
+}
+
+int disassemble(void) {
+  LLVMInitializeAllTargetInfos();
+  LLVMInitializeAllTargetMCs();
+  LLVMInitializeAllDisassemblers();
+
+  tokenize_stdin(handle_line);
+
+  return 0;
+}
diff --git a/tools/llvm-c-test/helpers.c b/tools/llvm-c-test/helpers.c
new file mode 100644
index 0000000..1ea8a4f
--- /dev/null
+++ b/tools/llvm-c-test/helpers.c
@@ -0,0 +1,40 @@
+/*===-- helpers.c - tool for testing libLLVM and llvm-c API ---------------===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* Helper functions                                                           *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c-test.h"
+#include <stdio.h>
+#include <string.h>
+
+#define MAX_TOKENS 512
+#define MAX_LINE_LEN 1024
+
+void tokenize_stdin(void (*cb)(char **tokens, int ntokens)) {
+  char line[MAX_LINE_LEN];
+  char *tokbuf[MAX_TOKENS];
+
+  while (fgets(line, sizeof(line), stdin)) {
+    int c = 0;
+
+    if (line[0] == ';' || line[0] == '\n')
+      continue;
+
+    while (c < MAX_TOKENS) {
+      tokbuf[c] = strtok(c ? NULL : line, " \n");
+      if (!tokbuf[c])
+        break;
+      c++;
+    }
+    if (c)
+      cb(tokbuf, c);
+  }
+}
diff --git a/tools/llvm-c-test/include-all.c b/tools/llvm-c-test/include-all.c
new file mode 100644
index 0000000..17b9917
--- /dev/null
+++ b/tools/llvm-c-test/include-all.c
@@ -0,0 +1,33 @@
+/*===-- include-all.c - tool for testing libLLVM and llvm-c API -----------===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file doesn't have any actual code. It just make sure that all         *|
+|* the llvm-c include files are good and doesn't generate any warnings        *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+// FIXME: Autogenerate this list
+
+#include "llvm-c/Analysis.h"
+#include "llvm-c/BitReader.h"
+#include "llvm-c/BitWriter.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/Disassembler.h"
+#include "llvm-c/ExecutionEngine.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/LinkTimeOptimizer.h"
+#include "llvm-c/Linker.h"
+#include "llvm-c/Object.h"
+#include "llvm-c/Target.h"
+#include "llvm-c/TargetMachine.h"
+#include "llvm-c/Transforms/IPO.h"
+#include "llvm-c/Transforms/PassManagerBuilder.h"
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm-c/lto.h"
diff --git a/tools/llvm-c-test/llvm-c-test.h b/tools/llvm-c-test/llvm-c-test.h
new file mode 100644
index 0000000..0a25aa6
--- /dev/null
+++ b/tools/llvm-c-test/llvm-c-test.h
@@ -0,0 +1,37 @@
+/*===-- llvm-c-test.h - tool for testing libLLVM and llvm-c API -----------===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* Header file for llvm-c-test                                                *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+#ifndef LLVM_C_TEST_H
+#define LLVM_C_TEST_H
+
+// helpers.c
+void tokenize_stdin(void (*cb)(char **tokens, int ntokens));
+
+// module.c
+int module_dump(void);
+int module_list_functions(void);
+int module_list_globals(void);
+
+// calc.c
+int calc(void);
+
+// disassemble.c
+int disassemble(void);
+
+// object.c
+int object_list_sections(void);
+int object_list_symbols(void);
+
+// targets.c
+int targets_list(void);
+
+#endif
diff --git a/tools/llvm-c-test/main.c b/tools/llvm-c-test/main.c
new file mode 100644
index 0000000..72f8b04
--- /dev/null
+++ b/tools/llvm-c-test/main.c
@@ -0,0 +1,73 @@
+/*===-- main.c - tool for testing libLLVM and llvm-c API ------------------===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* Main file for llvm-c-tests. "Parses" arguments and dispatches.             *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c-test.h"
+#include "llvm-c/BitReader.h"
+#include "llvm-c/Core.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static void print_usage(void) {
+  fprintf(stderr, "llvm-c-test command\n\n");
+  fprintf(stderr, " Commands:\n");
+  fprintf(stderr, "  * --module-dump\n");
+  fprintf(stderr, "    Read bytecode from stdin - print disassembly\n\n");
+  fprintf(stderr, "  * --module-list-functions\n");
+  fprintf(stderr,
+          "    Read bytecode from stdin - list summary of functions\n\n");
+  fprintf(stderr, "  * --module-list-globals\n");
+  fprintf(stderr, "    Read bytecode from stdin - list summary of globals\n\n");
+  fprintf(stderr, "  * --targets-list\n");
+  fprintf(stderr, "    List available targets\n\n");
+  fprintf(stderr, "  * --object-list-sections\n");
+  fprintf(stderr, "    Read object file form stdin - list sections\n\n");
+  fprintf(stderr, "  * --object-list-symbols\n");
+  fprintf(stderr,
+          "    Read object file form stdin - list symbols (like nm)\n\n");
+  fprintf(stderr, "  * --disassemble\n");
+  fprintf(stderr, "    Read lines of triple, hex ascii machine code from stdin "
+                  "- print disassembly\n\n");
+  fprintf(stderr, "  * --calc\n");
+  fprintf(
+      stderr,
+      "    Read lines of name, rpn from stdin - print generated module\n\n");
+}
+
+int main(int argc, char **argv) {
+  LLVMPassRegistryRef pr = LLVMGetGlobalPassRegistry();
+
+  LLVMInitializeCore(pr);
+
+  if (argc == 2 && !strcmp(argv[1], "--module-dump")) {
+    return module_dump();
+  } else if (argc == 2 && !strcmp(argv[1], "--module-list-functions")) {
+    return module_list_functions();
+  } else if (argc == 2 && !strcmp(argv[1], "--module-list-globals")) {
+    return module_list_globals();
+  } else if (argc == 2 && !strcmp(argv[1], "--targets-list")) {
+    return targets_list();
+  } else if (argc == 2 && !strcmp(argv[1], "--object-list-sections")) {
+    return object_list_sections();
+  } else if (argc == 2 && !strcmp(argv[1], "--object-list-symbols")) {
+    return object_list_symbols();
+  } else if (argc == 2 && !strcmp(argv[1], "--disassemble")) {
+    return disassemble();
+  } else if (argc == 2 && !strcmp(argv[1], "--calc")) {
+    return calc();
+  } else {
+    print_usage();
+  }
+
+  return 1;
+}
diff --git a/tools/llvm-c-test/module.c b/tools/llvm-c-test/module.c
new file mode 100644
index 0000000..2661fc8
--- /dev/null
+++ b/tools/llvm-c-test/module.c
@@ -0,0 +1,116 @@
+/*===-- module.c - tool for testing libLLVM and llvm-c API ----------------===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file implements the --module-dump, --module-list-functions and        *|
+|* --module-list-globals commands in llvm-c-test.                             *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c-test.h"
+#include "llvm-c/BitReader.h"
+#include "llvm-c/Core.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static LLVMModuleRef load_module(void) {
+  LLVMMemoryBufferRef MB;
+  LLVMModuleRef M;
+  char *msg = NULL;
+
+  if (LLVMCreateMemoryBufferWithSTDIN(&MB, &msg)) {
+    fprintf(stderr, "Error reading file: %s\n", msg);
+    exit(1);
+  }
+
+  if (LLVMParseBitcode(MB, &M, &msg)) {
+    fprintf(stderr, "Error parsing bitcode: %s\n", msg);
+    LLVMDisposeMemoryBuffer(MB);
+    exit(1);
+  }
+
+  LLVMDisposeMemoryBuffer(MB);
+  return M;
+}
+
+int module_dump(void) {
+  LLVMModuleRef M = load_module();
+
+  char *irstr = LLVMPrintModuleToString(M);
+  puts(irstr);
+  LLVMDisposeMessage(irstr);
+
+  LLVMDisposeModule(M);
+
+  return 0;
+}
+
+int module_list_functions(void) {
+  LLVMModuleRef M = load_module();
+  LLVMValueRef f;
+
+  f = LLVMGetFirstFunction(M);
+  while (f) {
+    if (LLVMIsDeclaration(f)) {
+      printf("FunctionDeclaration: %s\n", LLVMGetValueName(f));
+    } else {
+      LLVMBasicBlockRef bb;
+      LLVMValueRef isn;
+      unsigned nisn = 0;
+      unsigned nbb = 0;
+
+      printf("FunctionDefinition: %s [#bb=%u]\n", LLVMGetValueName(f),
+             LLVMCountBasicBlocks(f));
+
+      for (bb = LLVMGetFirstBasicBlock(f); bb;
+           bb = LLVMGetNextBasicBlock(bb)) {
+        nbb++;
+        for (isn = LLVMGetFirstInstruction(bb); isn;
+             isn = LLVMGetNextInstruction(isn)) {
+          nisn++;
+          if (LLVMIsACallInst(isn)) {
+            LLVMValueRef callee =
+                LLVMGetOperand(isn, LLVMGetNumOperands(isn) - 1);
+            printf(" calls: %s\n", LLVMGetValueName(callee));
+          }
+        }
+      }
+      printf(" #isn: %u\n", nisn);
+      printf(" #bb: %u\n\n", nbb);
+    }
+    f = LLVMGetNextFunction(f);
+  }
+
+  LLVMDisposeModule(M);
+
+  return 0;
+}
+
+int module_list_globals(void) {
+  LLVMModuleRef M = load_module();
+  LLVMValueRef g;
+
+  g = LLVMGetFirstGlobal(M);
+  while (g) {
+    LLVMTypeRef T = LLVMTypeOf(g);
+    char *s = LLVMPrintTypeToString(T);
+
+    printf("Global%s: %s %s\n",
+           LLVMIsDeclaration(g) ? "Declaration" : "Definition",
+           LLVMGetValueName(g), s);
+
+    LLVMDisposeMessage(s);
+
+    g = LLVMGetNextGlobal(g);
+  }
+
+  LLVMDisposeModule(M);
+
+  return 0;
+}
diff --git a/tools/llvm-c-test/object.c b/tools/llvm-c-test/object.c
new file mode 100644
index 0000000..2792928
--- /dev/null
+++ b/tools/llvm-c-test/object.c
@@ -0,0 +1,88 @@
+/*===-- object.c - tool for testing libLLVM and llvm-c API ----------------===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file implements the --object-list-sections and --object-list-symbols  *|
+|* commands in llvm-c-test.                                                   *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c-test.h"
+#include "llvm-c/Object.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+int object_list_sections(void) {
+  LLVMMemoryBufferRef MB;
+  LLVMObjectFileRef O;
+  LLVMSectionIteratorRef sect;
+  char *msg = NULL;
+
+  if (LLVMCreateMemoryBufferWithSTDIN(&MB, &msg)) {
+    fprintf(stderr, "Error reading file: %s\n", msg);
+    exit(1);
+  }
+
+  O = LLVMCreateObjectFile(MB);
+  if (!O) {
+    fprintf(stderr, "Error reading object\n");
+    exit(1);
+  }
+
+  sect = LLVMGetSections(O);
+  while (!LLVMIsSectionIteratorAtEnd(O, sect)) {
+    printf("'%s': @0x%08" PRIx64 " +%" PRIu64 "\n", LLVMGetSectionName(sect),
+           LLVMGetSectionAddress(sect), LLVMGetSectionSize(sect));
+
+    LLVMMoveToNextSection(sect);
+  }
+
+  LLVMDisposeSectionIterator(sect);
+
+  LLVMDisposeObjectFile(O);
+
+  return 0;
+}
+
+int object_list_symbols(void) {
+  LLVMMemoryBufferRef MB;
+  LLVMObjectFileRef O;
+  LLVMSectionIteratorRef sect;
+  LLVMSymbolIteratorRef sym;
+  char *msg = NULL;
+
+  if (LLVMCreateMemoryBufferWithSTDIN(&MB, &msg)) {
+    fprintf(stderr, "Error reading file: %s\n", msg);
+    exit(1);
+  }
+
+  O = LLVMCreateObjectFile(MB);
+  if (!O) {
+    fprintf(stderr, "Error reading object\n");
+    exit(1);
+  }
+
+  sect = LLVMGetSections(O);
+  sym = LLVMGetSymbols(O);
+  while (!LLVMIsSymbolIteratorAtEnd(O, sym)) {
+
+    LLVMMoveToContainingSection(sect, sym);
+    printf("%s @0x%08" PRIx64 "/0x%08" PRIx64 " +%" PRIu64 " (%s)\n",
+           LLVMGetSymbolName(sym), LLVMGetSymbolAddress(sym),
+           LLVMGetSymbolFileOffset(sym), LLVMGetSymbolSize(sym),
+           LLVMGetSectionName(sect));
+
+    LLVMMoveToNextSymbol(sym);
+  }
+
+  LLVMDisposeSymbolIterator(sym);
+
+  LLVMDisposeObjectFile(O);
+
+  return 0;
+}
diff --git a/tools/llvm-c-test/targets.c b/tools/llvm-c-test/targets.c
new file mode 100644
index 0000000..252c2e0
--- /dev/null
+++ b/tools/llvm-c-test/targets.c
@@ -0,0 +1,30 @@
+/*===-- targets.c - tool for testing libLLVM and llvm-c API ---------------===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This file implements the --targets command in llvm-c-test.                 *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#include "llvm-c/TargetMachine.h"
+#include <stdio.h>
+
+int targets_list(void) {
+  LLVMTargetRef t;
+  LLVMInitializeAllTargetInfos();
+  LLVMInitializeAllTargets();
+
+  for (t = LLVMGetFirstTarget(); t; t = LLVMGetNextTarget(t)) {
+    printf("%s", LLVMGetTargetName(t));
+    if (LLVMTargetHasJIT(t))
+      printf(" (+jit)");
+    printf("\n - %s\n", LLVMGetTargetDescription(t));
+  }
+
+  return 0;
+}
diff --git a/tools/llvm-config/CMakeLists.txt b/tools/llvm-config/CMakeLists.txt
index 5ad58bf..c651833 100644
--- a/tools/llvm-config/CMakeLists.txt
+++ b/tools/llvm-config/CMakeLists.txt
@@ -1,37 +1,26 @@
 set(LLVM_LINK_COMPONENTS support)
 
-# We need to generate the BuildVariables.inc file containing values which are
-# only defined when under certain build modes. Unfortunately, that precludes
-# doing this inside CMake so we have to shell out to sed. For now, that means we
-# can't expect to build llvm-config on Window.s
 set(BUILDVARIABLES_SRCPATH ${CMAKE_CURRENT_SOURCE_DIR}/BuildVariables.inc.in)
 set(BUILDVARIABLES_OBJPATH ${CMAKE_CURRENT_BINARY_DIR}/BuildVariables.inc)
-set(SEDSCRIPT_OBJPATH ${CMAKE_CURRENT_BINARY_DIR}/BuildVariables.configure.sed)
 
 # Compute the substitution values for various items.
 get_system_libs(LLVM_SYSTEM_LIBS_LIST)
 foreach(l ${LLVM_SYSTEM_LIBS_LIST})
   set(SYSTEM_LIBS ${SYSTEM_LIBS} "-l${l}")
 endforeach()
-set(C_FLGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}")
-set(CXX_FLGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}")
-set(CPP_FLGS "${CMAKE_CPP_FLAGS} ${CMAKE_CPP_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}")
+string(REPLACE ";" " " SYSTEM_LIBS "${SYSTEM_LIBS}")
 
-add_custom_command(OUTPUT ${BUILDVARIABLES_OBJPATH}
-  COMMAND echo s!@LLVM_SRC_ROOT@!${LLVM_MAIN_SRC_DIR}! > ${SEDSCRIPT_OBJPATH}
-  COMMAND echo s!@LLVM_OBJ_ROOT@!${LLVM_BINARY_DIR}! >> ${SEDSCRIPT_OBJPATH}
-  COMMAND echo s!@LLVM_CPPFLAGS@!${CPP_FLGS}! >> ${SEDSCRIPT_OBJPATH}
-  COMMAND echo s!@LLVM_CFLAGS@!${C_FLGS}! >> ${SEDSCRIPT_OBJPATH}
-  COMMAND echo s!@LLVM_CXXFLAGS@!${CXX_FLGS}! >> ${SEDSCRIPT_OBJPATH}
-  # TODO: Use general flags for linking! not just for shared libs:
-  COMMAND echo s!@LLVM_LDFLAGS@!${CMAKE_SHARED_LINKER_FLAGS}! >> ${SEDSCRIPT_OBJPATH}
-  COMMAND echo s!@LLVM_BUILDMODE@!${CMAKE_BUILD_TYPE}! >> ${SEDSCRIPT_OBJPATH}
-  COMMAND echo s!@LLVM_SYSTEM_LIBS@!${SYSTEM_LIBS}! >> ${SEDSCRIPT_OBJPATH}
-  COMMAND echo s!@LLVM_TARGETS_BUILT@!${LLVM_TARGETS_TO_BUILD}! >> ${SEDSCRIPT_OBJPATH}
-  COMMAND sed -f ${SEDSCRIPT_OBJPATH} < ${BUILDVARIABLES_SRCPATH} > ${BUILDVARIABLES_OBJPATH}
-  VERBATIM
-  COMMENT "Building BuildVariables.inc include."
-  )
+# Use configure_file to create BuildVariables.inc.
+set(LLVM_SRC_ROOT ${LLVM_MAIN_SRC_DIR})
+set(LLVM_OBJ_ROOT ${LLVM_BINARY_DIR})
+set(LLVM_CPPFLAGS "${CMAKE_CPP_FLAGS} ${CMAKE_CPP_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}")
+set(LLVM_CFLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}")
+set(LLVM_CXXFLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${LLVM_DEFINITIONS}")
+set(LLVM_LDFLAGS ${CMAKE_SHARED_LINKER_FLAGS})
+set(LLVM_BUILDMODE ${CMAKE_BUILD_TYPE})
+set(LLVM_SYSTEM_LIBS ${SYSTEM_LIBS})
+string(REPLACE ";" " " LLVM_TARGETS_BUILT "${LLVM_TARGETS_TO_BUILD}")
+configure_file(${BUILDVARIABLES_SRCPATH} ${BUILDVARIABLES_OBJPATH} @ONLY)
 
 # Add the llvm-config tool.
 add_llvm_tool(llvm-config
diff --git a/tools/llvm-cov/CMakeLists.txt b/tools/llvm-cov/CMakeLists.txt
index 7184b9e..67cea71 100644
--- a/tools/llvm-cov/CMakeLists.txt
+++ b/tools/llvm-cov/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_LINK_COMPONENTS instrumentation )
+set(LLVM_LINK_COMPONENTS core support )
 
 add_llvm_tool(llvm-cov
   llvm-cov.cpp
diff --git a/tools/llvm-cov/Makefile b/tools/llvm-cov/Makefile
index 2d47ce4..efed6cc 100644
--- a/tools/llvm-cov/Makefile
+++ b/tools/llvm-cov/Makefile
@@ -9,7 +9,7 @@
 
 LEVEL := ../..
 TOOLNAME := llvm-cov
-LINK_COMPONENTS := instrumentation
+LINK_COMPONENTS := core support
 
 # This tool has no plugins, optimize startup time.
 TOOL_NO_EXPORTS := 1
diff --git a/tools/llvm-cov/llvm-cov.cpp b/tools/llvm-cov/llvm-cov.cpp
index 7b21c5b..5f6999e 100644
--- a/tools/llvm-cov/llvm-cov.cpp
+++ b/tools/llvm-cov/llvm-cov.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/system_error.h"
 using namespace llvm;
@@ -30,6 +31,9 @@ InputGCNO("gcno", cl::desc("<input gcno file>"), cl::init(""));
 static cl::opt<std::string>
 InputGCDA("gcda", cl::desc("<input gcda file>"), cl::init(""));
 
+static cl::opt<std::string>
+OutputFile("o", cl::desc("<output llvm-cov file>"), cl::init("-"));
+
 
 //===----------------------------------------------------------------------===//
 int main(int argc, char **argv) {
@@ -38,7 +42,12 @@ int main(int argc, char **argv) {
   PrettyStackTraceProgram X(argc, argv);
   llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
 
-  cl::ParseCommandLineOptions(argc, argv, "llvm cov\n");
+  cl::ParseCommandLineOptions(argc, argv, "llvm coverage tool\n");
+
+  std::string ErrorInfo;
+  raw_fd_ostream OS(OutputFile.c_str(), ErrorInfo);
+  if (!ErrorInfo.empty())
+    errs() << ErrorInfo << "\n";
 
   GCOVFile GF;
   if (InputGCNO.empty())
@@ -49,7 +58,7 @@ int main(int argc, char **argv) {
     errs() << InputGCNO << ": " << ec.message() << "\n";
     return 1;
   }
-  GCOVBuffer GCNO_GB(GCNO_Buff.take());
+  GCOVBuffer GCNO_GB(GCNO_Buff.get());
   if (!GF.read(GCNO_GB)) {
     errs() << "Invalid .gcno File!\n";
     return 1;
@@ -61,7 +70,7 @@ int main(int argc, char **argv) {
       errs() << InputGCDA << ": " << ec.message() << "\n";
       return 1;
     }
-    GCOVBuffer GCDA_GB(GCDA_Buff.take());
+    GCOVBuffer GCDA_GB(GCDA_Buff.get());
     if (!GF.read(GCDA_GB)) {
       errs() << "Invalid .gcda File!\n";
       return 1;
@@ -74,5 +83,6 @@ int main(int argc, char **argv) {
 
   FileInfo FI;
   GF.collectLineCounts(FI);
+  FI.print(OS, InputGCNO, InputGCDA);
   return 0;
 }
diff --git a/tools/llvm-diff/DifferenceEngine.cpp b/tools/llvm-diff/DifferenceEngine.cpp
index 4b11315..768b94b 100644
--- a/tools/llvm-diff/DifferenceEngine.cpp
+++ b/tools/llvm-diff/DifferenceEngine.cpp
@@ -195,8 +195,6 @@ class FunctionDifferenceEngine {
     BasicBlock::iterator LI = L->begin(), LE = L->end();
     BasicBlock::iterator RI = R->begin();
 
-    llvm::SmallVector<std::pair<Instruction*,Instruction*>, 20> TentativePairs;
-
     do {
       assert(LI != LE && RI != R->end());
       Instruction *LeftI = &*LI, *RightI = &*RI;
@@ -316,15 +314,15 @@ class FunctionDifferenceEngine {
 
       bool Difference = false;
 
-      DenseMap<Constant*, BasicBlock*> LCases;
+      DenseMap<ConstantInt*,BasicBlock*> LCases;
       
       for (SwitchInst::CaseIt I = LI->case_begin(), E = LI->case_end();
            I != E; ++I)
-        LCases[I.getCaseValueEx()] = I.getCaseSuccessor();
+        LCases[I.getCaseValue()] = I.getCaseSuccessor();
         
       for (SwitchInst::CaseIt I = RI->case_begin(), E = RI->case_end();
            I != E; ++I) {
-        IntegersSubset CaseValue = I.getCaseValueEx();
+        ConstantInt *CaseValue = I.getCaseValue();
         BasicBlock *LCase = LCases[CaseValue];
         if (LCase) {
           if (TryUnify) tryUnify(LCase, I.getCaseSuccessor());
@@ -336,7 +334,7 @@ class FunctionDifferenceEngine {
         }
       }
       if (!Difference)
-        for (DenseMap<Constant*, BasicBlock*>::iterator
+        for (DenseMap<ConstantInt*,BasicBlock*>::iterator
                I = LCases.begin(), E = LCases.end(); I != E; ++I) {
           if (Complain)
             Engine.logf("left switch has extra case %l") << I->first;
diff --git a/tools/llvm-diff/llvm-diff.cpp b/tools/llvm-diff/llvm-diff.cpp
index 6eca1e2..f70219e 100644
--- a/tools/llvm-diff/llvm-diff.cpp
+++ b/tools/llvm-diff/llvm-diff.cpp
@@ -70,7 +70,7 @@ int main(int argc, char **argv) {
   cl::ParseCommandLineOptions(argc, argv);
 
   LLVMContext Context;
-  
+
   // Load both modules.  Die if that fails.
   Module *LModule = ReadModule(Context, LeftFilename);
   Module *RModule = ReadModule(Context, RightFilename);
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index eef6f79..413a50b 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -62,11 +62,15 @@ DumpType("debug-dump", cl::init(DIDT_All),
         clEnumValN(DIDT_Aranges, "aranges", ".debug_aranges"),
         clEnumValN(DIDT_Info, "info", ".debug_info"),
         clEnumValN(DIDT_InfoDwo, "info.dwo", ".debug_info.dwo"),
+        clEnumValN(DIDT_Types, "types", ".debug_types"),
         clEnumValN(DIDT_Line, "line", ".debug_line"),
         clEnumValN(DIDT_Loc, "loc", ".debug_loc"),
         clEnumValN(DIDT_Frames, "frames", ".debug_frame"),
         clEnumValN(DIDT_Ranges, "ranges", ".debug_ranges"),
         clEnumValN(DIDT_Pubnames, "pubnames", ".debug_pubnames"),
+        clEnumValN(DIDT_Pubtypes, "pubtypes", ".debug_pubtypes"),
+        clEnumValN(DIDT_GnuPubnames, "gnu_pubnames", ".debug_gnu_pubnames"),
+        clEnumValN(DIDT_GnuPubtypes, "gnu_pubtypes", ".debug_gnu_pubtypes"),
         clEnumValN(DIDT_Str, "str", ".debug_str"),
         clEnumValN(DIDT_StrDwo, "str.dwo", ".debug_str.dwo"),
         clEnumValN(DIDT_StrOffsetsDwo, "str_offsets.dwo", ".debug_str_offsets.dwo"),
diff --git a/tools/llvm-extract/llvm-extract.cpp b/tools/llvm-extract/llvm-extract.cpp
index 9ba68b4..dc1a410 100644
--- a/tools/llvm-extract/llvm-extract.cpp
+++ b/tools/llvm-extract/llvm-extract.cpp
@@ -53,7 +53,7 @@ static cl::list<std::string>
 ExtractFuncs("func", cl::desc("Specify function to extract"),
              cl::ZeroOrMore, cl::value_desc("function"));
 
-// ExtractRegExpFuncs - The functions, matched via regular expression, to 
+// ExtractRegExpFuncs - The functions, matched via regular expression, to
 // extract from the module.
 static cl::list<std::string>
 ExtractRegExpFuncs("rfunc", cl::desc("Specify function(s) to extract using a "
diff --git a/tools/llvm-link/llvm-link.cpp b/tools/llvm-link/llvm-link.cpp
index 652c414..99cca23 100644
--- a/tools/llvm-link/llvm-link.cpp
+++ b/tools/llvm-link/llvm-link.cpp
@@ -70,7 +70,7 @@ int main(int argc, char **argv) {
   // Print a stack trace if we signal out.
   sys::PrintStackTraceOnErrorSignal();
   PrettyStackTraceProgram X(argc, argv);
-  
+
   LLVMContext &Context = getGlobalContext();
   llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
   cl::ParseCommandLineOptions(argc, argv, "llvm linker\n");
diff --git a/tools/llvm-lto/CMakeLists.txt b/tools/llvm-lto/CMakeLists.txt
new file mode 100644
index 0000000..348976c
--- /dev/null
+++ b/tools/llvm-lto/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} lto support)
+
+add_llvm_tool(llvm-lto
+  llvm-lto.cpp
+  )
+
diff --git a/tools/llvm-lto/LLVMBuild.txt b/tools/llvm-lto/LLVMBuild.txt
new file mode 100644
index 0000000..c1613a3
--- /dev/null
+++ b/tools/llvm-lto/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./tools/llvm-lto/LLVMBuild.txt ----------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = llvm-lto
+parent = Tools
+required_libraries = LTO Support all-targets
diff --git a/tools/llvm-lto/Makefile b/tools/llvm-lto/Makefile
new file mode 100644
index 0000000..f1801b4
--- /dev/null
+++ b/tools/llvm-lto/Makefile
@@ -0,0 +1,19 @@
+##===- tools/llvm-lto/Makefile -----------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../..
+TOOLNAME := llvm-lto
+LINK_COMPONENTS := lto ipo scalaropts linker bitreader bitwriter mcdisassembler support target vectorize all-targets
+
+# This tool has no plugins, optimize startup time.
+TOOL_NO_EXPORTS := 1
+
+NO_INSTALL := 1
+
+include $(LEVEL)/Makefile.common
diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp
new file mode 100644
index 0000000..0fc68ae
--- /dev/null
+++ b/tools/llvm-lto/llvm-lto.cpp
@@ -0,0 +1,187 @@
+//===-- llvm-lto: a simple command-line program to link modules with LTO --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This program takes in a list of bitcode files, links them, performs link-time
+// optimization, and outputs an object file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringSet.h"
+#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/LTO/LTOCodeGenerator.h"
+#include "llvm/LTO/LTOModule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetSelect.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableOpt("disable-opt", cl::init(false),
+  cl::desc("Do not run any optimization passes"));
+
+static cl::opt<bool>
+DisableInline("disable-inlining", cl::init(false),
+  cl::desc("Do not run the inliner pass"));
+
+static cl::opt<bool>
+DisableGVNLoadPRE("disable-gvn-loadpre", cl::init(false),
+  cl::desc("Do not run the GVN load PRE pass"));
+
+static cl::list<std::string>
+InputFilenames(cl::Positional, cl::OneOrMore,
+  cl::desc("<input bitcode files>"));
+
+static cl::opt<std::string>
+OutputFilename("o", cl::init(""),
+  cl::desc("Override output filename"),
+  cl::value_desc("filename"));
+
+static cl::list<std::string>
+ExportedSymbols("exported-symbol",
+  cl::desc("Symbol to export from the resulting object file"),
+  cl::ZeroOrMore);
+
+static cl::list<std::string>
+DSOSymbols("dso-symbol",
+  cl::desc("Symbol to put in the symtab in the resulting dso"),
+  cl::ZeroOrMore);
+
+namespace {
+struct ModuleInfo {
+  std::vector<bool> CanBeHidden;
+};
+}
+
+int main(int argc, char **argv) {
+  // Print a stack trace if we signal out.
+  sys::PrintStackTraceOnErrorSignal();
+  PrettyStackTraceProgram X(argc, argv);
+
+  llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+  cl::ParseCommandLineOptions(argc, argv, "llvm LTO linker\n");
+
+  // Initialize the configured targets.
+  InitializeAllTargets();
+  InitializeAllTargetMCs();
+  InitializeAllAsmPrinters();
+  InitializeAllAsmParsers();
+
+  // set up the TargetOptions for the machine
+  TargetOptions Options;
+  Options.LessPreciseFPMADOption = EnableFPMAD;
+  Options.NoFramePointerElim = DisableFPElim;
+  Options.AllowFPOpFusion = FuseFPOps;
+  Options.UnsafeFPMath = EnableUnsafeFPMath;
+  Options.NoInfsFPMath = EnableNoInfsFPMath;
+  Options.NoNaNsFPMath = EnableNoNaNsFPMath;
+  Options.HonorSignDependentRoundingFPMathOption =
+    EnableHonorSignDependentRoundingFPMath;
+  Options.UseSoftFloat = GenerateSoftFloatCalls;
+  if (FloatABIForCalls != FloatABI::Default)
+    Options.FloatABIType = FloatABIForCalls;
+  Options.NoZerosInBSS = DontPlaceZerosInBSS;
+  Options.GuaranteedTailCallOpt = EnableGuaranteedTailCallOpt;
+  Options.DisableTailCalls = DisableTailCalls;
+  Options.StackAlignmentOverride = OverrideStackAlignment;
+  Options.TrapFuncName = TrapFuncName;
+  Options.PositionIndependentExecutable = EnablePIE;
+  Options.EnableSegmentedStacks = SegmentedStacks;
+  Options.UseInitArray = UseInitArray;
+
+  unsigned BaseArg = 0;
+
+  LTOCodeGenerator CodeGen;
+
+  CodeGen.setCodePICModel(LTO_CODEGEN_PIC_MODEL_DYNAMIC);
+  CodeGen.setDebugInfo(LTO_DEBUG_MODEL_DWARF);
+  CodeGen.setTargetOptions(Options);
+
+  llvm::StringSet<llvm::MallocAllocator> DSOSymbolsSet;
+  for (unsigned i = 0; i < DSOSymbols.size(); ++i)
+    DSOSymbolsSet.insert(DSOSymbols[i]);
+
+  std::vector<std::string> KeptDSOSyms;
+
+  for (unsigned i = BaseArg; i < InputFilenames.size(); ++i) {
+    std::string error;
+    OwningPtr<LTOModule> Module(LTOModule::makeLTOModule(InputFilenames[i].c_str(),
+                                                         Options, error));
+    if (!error.empty()) {
+      errs() << argv[0] << ": error loading file '" << InputFilenames[i]
+             << "': " << error << "\n";
+      return 1;
+    }
+
+
+    if (!CodeGen.addModule(Module.get(), error)) {
+      errs() << argv[0] << ": error adding file '" << InputFilenames[i]
+             << "': " << error << "\n";
+      return 1;
+    }
+
+    unsigned NumSyms = Module->getSymbolCount();
+    for (unsigned I = 0; I < NumSyms; ++I) {
+      StringRef Name = Module->getSymbolName(I);
+      if (!DSOSymbolsSet.count(Name))
+        continue;
+      lto_symbol_attributes Attrs = Module->getSymbolAttributes(I);
+      unsigned Scope = Attrs & LTO_SYMBOL_SCOPE_MASK;
+      if (Scope != LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN)
+        KeptDSOSyms.push_back(Name);
+    }
+  }
+
+  // Add all the exported symbols to the table of symbols to preserve.
+  for (unsigned i = 0; i < ExportedSymbols.size(); ++i)
+    CodeGen.addMustPreserveSymbol(ExportedSymbols[i].c_str());
+
+  // Add all the dso symbols to the table of symbols to expose.
+  for (unsigned i = 0; i < KeptDSOSyms.size(); ++i)
+    CodeGen.addMustPreserveSymbol(KeptDSOSyms[i].c_str());
+
+  if (!OutputFilename.empty()) {
+    size_t len = 0;
+    std::string ErrorInfo;
+    const void *Code = CodeGen.compile(&len, DisableOpt, DisableInline,
+                                       DisableGVNLoadPRE, ErrorInfo);
+    if (Code == NULL) {
+      errs() << argv[0]
+             << ": error compiling the code: " << ErrorInfo << "\n";
+      return 1;
+    }
+
+    raw_fd_ostream FileStream(OutputFilename.c_str(), ErrorInfo,
+                              sys::fs::F_Binary);
+    if (!ErrorInfo.empty()) {
+      errs() << argv[0] << ": error opening the file '" << OutputFilename
+             << "': " << ErrorInfo << "\n";
+      return 1;
+    }
+
+    FileStream.write(reinterpret_cast<const char *>(Code), len);
+  } else {
+    std::string ErrorInfo;
+    const char *OutputName = NULL;
+    if (!CodeGen.compile_to_file(&OutputName, DisableOpt, DisableInline,
+                                 DisableGVNLoadPRE, ErrorInfo)) {
+      errs() << argv[0]
+             << ": error compiling the code: " << ErrorInfo
+             << "\n";
+      return 1;
+    }
+
+    outs() << "Wrote native object file '" << OutputName << "'\n";
+  }
+
+  return 0;
+}
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index f10a614..7ec2bba 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -319,10 +319,10 @@ static int AsLexInput(SourceMgr &SrcMgr, MCAsmInfo &MAI, tool_output_file *Out)
 
 static int AssembleInput(const char *ProgName, const Target *TheTarget, 
                          SourceMgr &SrcMgr, MCContext &Ctx, MCStreamer &Str,
-                         MCAsmInfo &MAI, MCSubtargetInfo &STI) {
+                         MCAsmInfo &MAI, MCSubtargetInfo &STI, MCInstrInfo &MCII) {
   OwningPtr<MCAsmParser> Parser(createMCAsmParser(SrcMgr, Ctx,
                                                   Str, MAI));
-  OwningPtr<MCTargetAsmParser> TAP(TheTarget->createMCAsmParser(STI, *Parser));
+  OwningPtr<MCTargetAsmParser> TAP(TheTarget->createMCAsmParser(STI, *Parser, MCII));
   if (!TAP) {
     errs() << ProgName
            << ": error: this target does not support assembly parsing.\n";
@@ -432,7 +432,7 @@ int main(int argc, char **argv) {
     MCAsmBackend *MAB = 0;
     if (ShowEncoding) {
       CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, *STI, Ctx);
-      MAB = TheTarget->createMCAsmBackend(TripleName, MCPU);
+      MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, MCPU);
     }
     bool UseCFI = !DisableCFI;
     Str.reset(TheTarget->createAsmStreamer(Ctx, FOS, /*asmverbose*/true,
@@ -446,7 +446,7 @@ int main(int argc, char **argv) {
   } else {
     assert(FileType == OFT_ObjectFile && "Invalid file type!");
     MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, *STI, Ctx);
-    MCAsmBackend *MAB = TheTarget->createMCAsmBackend(TripleName, MCPU);
+    MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, MCPU);
     Str.reset(TheTarget->createMCObjectStreamer(TripleName, Ctx, *MAB,
                                                 FOS, CE, RelaxAll,
                                                 NoExecStack));
@@ -459,7 +459,7 @@ int main(int argc, char **argv) {
     Res = AsLexInput(SrcMgr, *MAI, Out.get());
     break;
   case AC_Assemble:
-    Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, *Str, *MAI, *STI);
+    Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, *Str, *MAI, *STI, *MCII);
     break;
   case AC_MDisassemble:
     assert(IP && "Expected assembly output");
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 01dd1c3..8449c29 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -20,6 +20,9 @@
 #include "llvm/Bitcode/ReaderWriter.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Object/Archive.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/MachO.h"
 #include "llvm/Object/MachOUniversal.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CommandLine.h"
@@ -303,6 +306,226 @@ static void DumpSymbolNamesFromModule(Module *M) {
   SortAndPrintSymbolList();
 }
 
+template <class ELFT>
+error_code getSymbolNMTypeChar(ELFObjectFile<ELFT> &Obj, symbol_iterator I,
+                               char &Result) {
+  typedef typename ELFObjectFile<ELFT>::Elf_Sym Elf_Sym;
+  typedef typename ELFObjectFile<ELFT>::Elf_Shdr Elf_Shdr;
+
+  DataRefImpl Symb = I->getRawDataRefImpl();
+  const Elf_Sym *ESym = Obj.getSymbol(Symb);
+  const ELFFile<ELFT> &EF = *Obj.getELFFile();
+  const Elf_Shdr *ESec = EF.getSection(ESym);
+
+  char ret = '?';
+
+  if (ESec) {
+    switch (ESec->sh_type) {
+    case ELF::SHT_PROGBITS:
+    case ELF::SHT_DYNAMIC:
+      switch (ESec->sh_flags) {
+      case(ELF::SHF_ALLOC | ELF::SHF_EXECINSTR) :
+        ret = 't';
+        break;
+      case(ELF::SHF_ALLOC | ELF::SHF_WRITE) :
+        ret = 'd';
+        break;
+      case ELF::SHF_ALLOC:
+      case(ELF::SHF_ALLOC | ELF::SHF_MERGE) :
+      case(ELF::SHF_ALLOC | ELF::SHF_MERGE | ELF::SHF_STRINGS) :
+        ret = 'r';
+        break;
+      }
+      break;
+    case ELF::SHT_NOBITS:
+      ret = 'b';
+    }
+  }
+
+  switch (EF.getSymbolTableIndex(ESym)) {
+  case ELF::SHN_UNDEF:
+    if (ret == '?')
+      ret = 'U';
+    break;
+  case ELF::SHN_ABS:
+    ret = 'a';
+    break;
+  case ELF::SHN_COMMON:
+    ret = 'c';
+    break;
+  }
+
+  switch (ESym->getBinding()) {
+  case ELF::STB_GLOBAL:
+    ret = ::toupper(ret);
+    break;
+  case ELF::STB_WEAK:
+    if (EF.getSymbolTableIndex(ESym) == ELF::SHN_UNDEF)
+      ret = 'w';
+    else if (ESym->getType() == ELF::STT_OBJECT)
+      ret = 'V';
+    else
+      ret = 'W';
+  }
+
+  if (ret == '?' && ESym->getType() == ELF::STT_SECTION) {
+    StringRef Name;
+    error_code EC = I->getName(Name);
+    if (EC)
+      return EC;
+    Result = StringSwitch<char>(Name)
+                 .StartsWith(".debug", 'N')
+                 .StartsWith(".note", 'n')
+                 .Default('?');
+    return object_error::success;
+  }
+
+  Result = ret;
+  return object_error::success;
+}
+
+static error_code getSymbolNMTypeChar(COFFObjectFile &Obj, symbol_iterator I,
+                                      char &Result) {
+  const coff_symbol *symb = Obj.getCOFFSymbol(I);
+  StringRef name;
+  if (error_code ec = I->getName(name))
+    return ec;
+  char ret = StringSwitch<char>(name)
+                 .StartsWith(".debug", 'N')
+                 .StartsWith(".sxdata", 'N')
+                 .Default('?');
+
+  if (ret != '?') {
+    Result = ret;
+    return object_error::success;
+  }
+
+  uint32_t Characteristics = 0;
+  if (symb->SectionNumber > 0) {
+    section_iterator SecI = Obj.end_sections();
+    if (error_code ec = I->getSection(SecI))
+      return ec;
+    const coff_section *Section = Obj.getCOFFSection(SecI);
+    Characteristics = Section->Characteristics;
+  }
+
+  switch (symb->SectionNumber) {
+  case COFF::IMAGE_SYM_UNDEFINED:
+    // Check storage classes.
+    if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL) {
+      Result = 'w';
+      return object_error::success; // Don't do ::toupper.
+    } else if (symb->Value != 0)    // Check for common symbols.
+      ret = 'c';
+    else
+      ret = 'u';
+    break;
+  case COFF::IMAGE_SYM_ABSOLUTE:
+    ret = 'a';
+    break;
+  case COFF::IMAGE_SYM_DEBUG:
+    ret = 'n';
+    break;
+  default:
+    // Check section type.
+    if (Characteristics & COFF::IMAGE_SCN_CNT_CODE)
+      ret = 't';
+    else if (Characteristics & COFF::IMAGE_SCN_MEM_READ &&
+             ~Characteristics & COFF::IMAGE_SCN_MEM_WRITE) // Read only.
+      ret = 'r';
+    else if (Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
+      ret = 'd';
+    else if (Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
+      ret = 'b';
+    else if (Characteristics & COFF::IMAGE_SCN_LNK_INFO)
+      ret = 'i';
+
+    // Check for section symbol.
+    else if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_STATIC &&
+             symb->Value == 0)
+      ret = 's';
+  }
+
+  if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL)
+    ret = ::toupper(static_cast<unsigned char>(ret));
+
+  Result = ret;
+  return object_error::success;
+}
+
+static uint8_t getNType(MachOObjectFile &Obj, DataRefImpl Symb) {
+  if (Obj.is64Bit()) {
+    MachO::nlist_64 STE = Obj.getSymbol64TableEntry(Symb);
+    return STE.n_type;
+  }
+  MachO::nlist STE = Obj.getSymbolTableEntry(Symb);
+  return STE.n_type;
+}
+
+static error_code getSymbolNMTypeChar(MachOObjectFile &Obj, symbol_iterator I,
+                                      char &Res) {
+  DataRefImpl Symb = I->getRawDataRefImpl();
+  uint8_t NType = getNType(Obj, Symb);
+
+  char Char;
+  switch (NType & MachO::N_TYPE) {
+  case MachO::N_UNDF:
+    Char = 'u';
+    break;
+  case MachO::N_ABS:
+    Char = 's';
+    break;
+  case MachO::N_SECT: {
+    section_iterator Sec = Obj.end_sections();
+    Obj.getSymbolSection(Symb, Sec);
+    DataRefImpl Ref = Sec->getRawDataRefImpl();
+    StringRef SectionName;
+    Obj.getSectionName(Ref, SectionName);
+    StringRef SegmentName = Obj.getSectionFinalSegmentName(Ref);
+    if (SegmentName == "__TEXT" && SectionName == "__text")
+      Char = 't';
+    else
+      Char = 's';
+  } break;
+  default:
+    Char = '?';
+    break;
+  }
+
+  if (NType & (MachO::N_EXT | MachO::N_PEXT))
+    Char = toupper(static_cast<unsigned char>(Char));
+  Res = Char;
+  return object_error::success;
+}
+
+static char getNMTypeChar(ObjectFile *Obj, symbol_iterator I) {
+  char Res = '?';
+  if (COFFObjectFile *COFF = dyn_cast<COFFObjectFile>(Obj)) {
+    error(getSymbolNMTypeChar(*COFF, I, Res));
+    return Res;
+  }
+  if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(Obj)) {
+    error(getSymbolNMTypeChar(*MachO, I, Res));
+    return Res;
+  }
+
+  if (ELF32LEObjectFile *ELF = dyn_cast<ELF32LEObjectFile>(Obj)) {
+    error(getSymbolNMTypeChar(*ELF, I, Res));
+    return Res;
+  }
+  if (ELF64LEObjectFile *ELF = dyn_cast<ELF64LEObjectFile>(Obj)) {
+    error(getSymbolNMTypeChar(*ELF, I, Res));
+    return Res;
+  }
+  if (ELF32BEObjectFile *ELF = dyn_cast<ELF32BEObjectFile>(Obj)) {
+    error(getSymbolNMTypeChar(*ELF, I, Res));
+    return Res;
+  }
+  ELF64BEObjectFile *ELF = cast<ELF64BEObjectFile>(Obj);
+  error(getSymbolNMTypeChar(*ELF, I, Res));
+  return Res;
+}
+
 static void DumpSymbolNamesFromObject(ObjectFile *obj) {
   error_code ec;
   symbol_iterator ibegin = obj->begin_symbols();
@@ -325,7 +548,7 @@ static void DumpSymbolNamesFromObject(ObjectFile *obj) {
     }
     if (PrintAddress)
       if (error(i->getAddress(s.Address))) break;
-    if (error(i->getNMTypeChar(s.TypeChar))) break;
+    s.TypeChar = getNMTypeChar(obj, i);
     if (error(i->getName(s.Name))) break;
     SymbolList.push_back(s);
   }
diff --git a/tools/llvm-objdump/COFFDump.cpp b/tools/llvm-objdump/COFFDump.cpp
index bca6fc9..5f0bcbb 100644
--- a/tools/llvm-objdump/COFFDump.cpp
+++ b/tools/llvm-objdump/COFFDump.cpp
@@ -227,6 +227,48 @@ static void printCOFFSymbolAddress(llvm::raw_ostream &Out,
     Out << format(" + 0x%04x", Disp);
 }
 
+// Prints import tables. The import table is a table containing the list of
+// DLL name and symbol names which will be linked by the loader.
+static void printImportTables(const COFFObjectFile *Obj) {
+  outs() << "The Import Tables:\n";
+  error_code ec;
+  for (import_directory_iterator i = Obj->import_directory_begin(),
+                                 e = Obj->import_directory_end();
+       i != e; i = i.increment(ec)) {
+    if (ec)
+      return;
+
+    const import_directory_table_entry *Dir;
+    StringRef Name;
+    if (i->getImportTableEntry(Dir)) return;
+    if (i->getName(Name)) return;
+
+    outs() << format("  lookup %08x time %08x fwd %08x name %08x addr %08x\n\n",
+                     static_cast<uint32_t>(Dir->ImportLookupTableRVA),
+                     static_cast<uint32_t>(Dir->TimeDateStamp),
+                     static_cast<uint32_t>(Dir->ForwarderChain),
+                     static_cast<uint32_t>(Dir->NameRVA),
+                     static_cast<uint32_t>(Dir->ImportAddressTableRVA));
+    outs() << "    DLL Name: " << Name << "\n";
+    outs() << "    Hint/Ord  Name\n";
+    const import_lookup_table_entry32 *entry;
+    if (i->getImportLookupEntry(entry))
+      return;
+    for (; entry->data; ++entry) {
+      if (entry->isOrdinal()) {
+        outs() << format("      % 6d\n", entry->getOrdinal());
+        continue;
+      }
+      uint16_t Hint;
+      StringRef Name;
+      if (Obj->getHintName(entry->getHintNameRVA(), Hint, Name))
+        return;
+      outs() << format("      % 6d  ", Hint) << Name << "\n";
+    }
+    outs() << "\n";
+  }
+}
+
 void llvm::printCOFFUnwindInfo(const COFFObjectFile *Obj) {
   const coff_file_header *Header;
   if (error(Obj->getCOFFHeader(Header))) return;
@@ -353,3 +395,7 @@ void llvm::printCOFFUnwindInfo(const COFFObjectFile *Obj) {
     }
   }
 }
+
+void llvm::printCOFFFileHeader(const object::ObjectFile *Obj) {
+  printImportTables(dyn_cast<const COFFObjectFile>(Obj));
+}
diff --git a/tools/llvm-objdump/ELFDump.cpp b/tools/llvm-objdump/ELFDump.cpp
index ef1f0e9..9c091a41 100644
--- a/tools/llvm-objdump/ELFDump.cpp
+++ b/tools/llvm-objdump/ELFDump.cpp
@@ -13,7 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm-objdump.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
@@ -21,10 +21,8 @@
 using namespace llvm;
 using namespace llvm::object;
 
-template<class ELFT>
-void printProgramHeaders(
-    const ELFObjectFile<ELFT> *o) {
-  typedef ELFObjectFile<ELFT> ELFO;
+template <class ELFT> void printProgramHeaders(const ELFFile<ELFT> *o) {
+  typedef ELFFile<ELFT> ELFO;
   outs() << "Program Header:\n";
   for (typename ELFO::Elf_Phdr_Iter pi = o->begin_program_headers(),
                                     pe = o->end_program_headers();
@@ -80,17 +78,17 @@ void printProgramHeaders(
 void llvm::printELFFileHeader(const object::ObjectFile *Obj) {
   // Little-endian 32-bit
   if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
-    printProgramHeaders(ELFObj);
+    printProgramHeaders(ELFObj->getELFFile());
 
   // Big-endian 32-bit
   if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
-    printProgramHeaders(ELFObj);
+    printProgramHeaders(ELFObj->getELFFile());
 
   // Little-endian 64-bit
   if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
-    printProgramHeaders(ELFObj);
+    printProgramHeaders(ELFObj->getELFFile());
 
   // Big-endian 64-bit
   if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
-    printProgramHeaders(ELFObj);
+    printProgramHeaders(ELFObj->getELFFile());
 }
diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp
index e0ec9cc..86923fd 100644
--- a/tools/llvm-objdump/MachODump.cpp
+++ b/tools/llvm-objdump/MachODump.cpp
@@ -104,7 +104,7 @@ static void DumpDataInCode(const char *bytes, uint64_t Size,
   uint64_t Value;
 
   switch (Kind) {
-  case macho::Data:
+  case MachO::DICE_KIND_DATA:
     switch (Size) {
     case 4:
       Value = bytes[3] << 24 |
@@ -125,16 +125,16 @@ static void DumpDataInCode(const char *bytes, uint64_t Size,
     }
     outs() << "\t@ KIND_DATA\n";
     break;
-  case macho::JumpTable8:
+  case MachO::DICE_KIND_JUMP_TABLE8:
     Value = bytes[0];
     outs() << "\t.byte " << Value << "\t@ KIND_JUMP_TABLE8";
     break;
-  case macho::JumpTable16:
+  case MachO::DICE_KIND_JUMP_TABLE16:
     Value = bytes[1] << 8 |
             bytes[0];
     outs() << "\t.short " << Value << "\t@ KIND_JUMP_TABLE16";
     break;
-  case macho::JumpTable32:
+  case MachO::DICE_KIND_JUMP_TABLE32:
     Value = bytes[3] << 24 |
             bytes[2] << 16 |
             bytes[1] << 8 |
@@ -148,7 +148,7 @@ static void DumpDataInCode(const char *bytes, uint64_t Size,
 }
 
 static void
-getSectionsAndSymbols(const macho::Header Header,
+getSectionsAndSymbols(const MachO::mach_header Header,
                       MachOObjectFile *MachOObj,
                       std::vector<SectionRef> &Sections,
                       std::vector<SymbolRef> &Symbols,
@@ -171,25 +171,25 @@ getSectionsAndSymbols(const macho::Header Header,
     MachOObj->getFirstLoadCommandInfo();
   bool BaseSegmentAddressSet = false;
   for (unsigned i = 0; ; ++i) {
-    if (Command.C.Type == macho::LCT_FunctionStarts) {
+    if (Command.C.cmd == MachO::LC_FUNCTION_STARTS) {
       // We found a function starts segment, parse the addresses for later
       // consumption.
-      macho::LinkeditDataLoadCommand LLC =
+      MachO::linkedit_data_command LLC =
         MachOObj->getLinkeditDataLoadCommand(Command);
 
-      MachOObj->ReadULEB128s(LLC.DataOffset, FoundFns);
+      MachOObj->ReadULEB128s(LLC.dataoff, FoundFns);
     }
-    else if (Command.C.Type == macho::LCT_Segment) {
-      macho::SegmentLoadCommand SLC =
+    else if (Command.C.cmd == MachO::LC_SEGMENT) {
+      MachO::segment_command SLC =
         MachOObj->getSegmentLoadCommand(Command);
-      StringRef SegName = SLC.Name;
+      StringRef SegName = SLC.segname;
       if(!BaseSegmentAddressSet && SegName != "__PAGEZERO") {
         BaseSegmentAddressSet = true;
-        BaseSegmentAddress = SLC.VMAddress;
+        BaseSegmentAddress = SLC.vmaddr;
       }
     }
 
-    if (i == Header.NumLoadCommands - 1)
+    if (i == Header.ncmds - 1)
       break;
     else
       Command = MachOObj->getNextLoadCommandInfo(Command);
@@ -244,7 +244,7 @@ static void DisassembleInputMachO2(StringRef Filename,
 
   outs() << '\n' << Filename << ":\n\n";
 
-  macho::Header Header = MachOOF->getHeader();
+  MachO::mach_header Header = MachOOF->getHeader();
 
   // FIXME: FoundFns isn't used anymore. Using symbols/LC_FUNCTION_STARTS to
   // determine function locations will eventually go in MCObjectDisassembler.
@@ -260,14 +260,12 @@ static void DisassembleInputMachO2(StringRef Filename,
   getSectionsAndSymbols(Header, MachOOF, Sections, Symbols, FoundFns,
                         BaseSegmentAddress);
 
-  // Make a copy of the unsorted symbol list. FIXME: duplication
-  std::vector<SymbolRef> UnsortedSymbols(Symbols);
   // Sort the symbols by address, just in case they didn't come in that way.
   std::sort(Symbols.begin(), Symbols.end(), SymbolSorter());
 
   // Build a data in code table that is sorted on by the address of each entry.
   uint64_t BaseAddress = 0;
-  if (Header.FileType == macho::HFT_Object)
+  if (Header.filetype == MachO::MH_OBJECT)
     Sections[0].getAddress(BaseAddress);
   else
     BaseAddress = BaseSegmentAddress;
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 122ac83..9bc092e 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -31,6 +31,7 @@
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCModule.h"
+#include "llvm/MC/MCModuleYAML.h"
 #include "llvm/MC/MCObjectDisassembler.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectSymbolizer.h"
@@ -61,6 +62,7 @@
 #include <algorithm>
 #include <cctype>
 #include <cstring>
+
 using namespace llvm;
 using namespace object;
 
@@ -139,6 +141,12 @@ static cl::opt<bool>
 CFG("cfg", cl::desc("Create a CFG for every function found in the object"
                       " and write it to a graphviz file"));
 
+// FIXME: Does it make sense to have a dedicated tool for yaml cfg output?
+static cl::opt<std::string>
+YAMLCFG("yaml-cfg",
+        cl::desc("Create a CFG and write it as a YAML MCModule."),
+        cl::value_desc("yaml output file"));
+
 static StringRef ToolName;
 
 bool llvm::error(error_code ec) {
@@ -178,6 +186,7 @@ static const Target *getTarget(const ObjectFile *Obj = NULL) {
 }
 
 // Write a graphviz file for the CFG inside an MCFunction.
+// FIXME: Use GraphWriter
 static void emitDOTFile(const char *FileName, const MCFunction &f,
                         MCInstPrinter *IP) {
   // Start a new dot file.
@@ -333,7 +342,7 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
     return;
   }
 
-  if (CFG) {
+  if (CFG || !YAMLCFG.empty()) {
     OwningPtr<MCObjectDisassembler> OD(
       new MCObjectDisassembler(*Obj, *DisAsm, *MIA));
     OwningPtr<MCModule> Mod(OD->buildModule(/* withCFG */ true));
@@ -350,14 +359,25 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
         }
       }
     }
-    for (MCModule::const_func_iterator FI = Mod->func_begin(),
-                                       FE = Mod->func_end();
-                                       FI != FE; ++FI) {
-      static int filenum = 0;
-      emitDOTFile((Twine((*FI)->getName()) + "_" +
-                   utostr(filenum) + ".dot").str().c_str(),
-                    **FI, IP.get());
-      ++filenum;
+    if (CFG) {
+      for (MCModule::const_func_iterator FI = Mod->func_begin(),
+                                         FE = Mod->func_end();
+                                         FI != FE; ++FI) {
+        static int filenum = 0;
+        emitDOTFile((Twine((*FI)->getName()) + "_" +
+                     utostr(filenum) + ".dot").str().c_str(),
+                      **FI, IP.get());
+        ++filenum;
+      }
+    }
+    if (!YAMLCFG.empty()) {
+      std::string Error;
+      raw_fd_ostream YAMLOut(YAMLCFG.c_str(), Error);
+      if (!Error.empty()) {
+        errs() << ToolName << ": warning: " << Error << '\n';
+        return;
+      }
+      mcmodule2yaml(YAMLOut, *Mod, *MII, *MRI);
     }
   }
 
@@ -750,6 +770,14 @@ static void PrintUnwindInfo(const ObjectFile *o) {
   }
 }
 
+static void printPrivateFileHeader(const ObjectFile *o) {
+  if (o->isELF()) {
+    printELFFileHeader(o);
+  } else if (o->isCOFF()) {
+    printCOFFFileHeader(o);
+  }
+}
+
 static void DumpObject(const ObjectFile *o) {
   outs() << '\n';
   outs() << o->getFileName()
@@ -767,8 +795,8 @@ static void DumpObject(const ObjectFile *o) {
     PrintSymbolTable(o);
   if (UnwindInfo)
     PrintUnwindInfo(o);
-  if (PrivateHeaders && o->isELF())
-    printELFFileHeader(o);
+  if (PrivateHeaders)
+    printPrivateFileHeader(o);
 }
 
 /// @brief Dump each object file in \a a;
diff --git a/tools/llvm-objdump/llvm-objdump.h b/tools/llvm-objdump/llvm-objdump.h
index 87f19ba..b716a26 100644
--- a/tools/llvm-objdump/llvm-objdump.h
+++ b/tools/llvm-objdump/llvm-objdump.h
@@ -34,7 +34,8 @@ void DumpBytes(StringRef bytes);
 void DisassembleInputMachO(StringRef Filename);
 void printCOFFUnwindInfo(const object::COFFObjectFile* o);
 void printELFFileHeader(const object::ObjectFile *o);
+void printCOFFFileHeader(const object::ObjectFile *o);
 
-}
+} // end namespace llvm
 
 #endif
diff --git a/tools/llvm-prof/CMakeLists.txt b/tools/llvm-prof/CMakeLists.txt
deleted file mode 100644
index 442112b..0000000
--- a/tools/llvm-prof/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(LLVM_LINK_COMPONENTS bitreader analysis)
-
-add_llvm_tool(llvm-prof
-  llvm-prof.cpp
-  )
diff --git a/tools/llvm-prof/LLVMBuild.txt b/tools/llvm-prof/LLVMBuild.txt
deleted file mode 100644
index d59127c..0000000
--- a/tools/llvm-prof/LLVMBuild.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-;===- ./tools/llvm-prof/LLVMBuild.txt --------------------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Tool
-name = llvm-prof
-parent = Tools
-required_libraries = Analysis BitReader
diff --git a/tools/llvm-prof/Makefile b/tools/llvm-prof/Makefile
deleted file mode 100644
index f829786..0000000
--- a/tools/llvm-prof/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-##===- tools/llvm-prof/Makefile ----------------------------*- Makefile -*-===##
-# 
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-# 
-##===----------------------------------------------------------------------===##
-
-LEVEL := ../..
-TOOLNAME := llvm-prof
-LINK_COMPONENTS := bitreader analysis
-
-# This tool has no plugins, optimize startup time.
-TOOL_NO_EXPORTS = 1
-
-include $(LEVEL)/Makefile.common
diff --git a/tools/llvm-prof/llvm-prof.cpp b/tools/llvm-prof/llvm-prof.cpp
deleted file mode 100644
index b2c3f06..0000000
--- a/tools/llvm-prof/llvm-prof.cpp
+++ /dev/null
@@ -1,293 +0,0 @@
-//===- llvm-prof.cpp - Read in and process llvmprof.out data files --------===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This tools is meant for use with the various LLVM profiling instrumentation
-// passes.  It reads in the data file produced by executing an instrumented
-// program, and outputs a nice report.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/Assembly/AssemblyAnnotationWriter.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Signals.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
-#include <algorithm>
-#include <iomanip>
-#include <map>
-#include <set>
-
-using namespace llvm;
-
-namespace {
-  cl::opt<std::string>
-  BitcodeFile(cl::Positional, cl::desc("<program bitcode file>"),
-              cl::Required);
-
-  cl::opt<std::string>
-  ProfileDataFile(cl::Positional, cl::desc("<llvmprof.out file>"),
-                  cl::Optional, cl::init("llvmprof.out"));
-
-  cl::opt<bool>
-  PrintAnnotatedLLVM("annotated-llvm",
-                     cl::desc("Print LLVM code with frequency annotations"));
-  cl::alias PrintAnnotated2("A", cl::desc("Alias for --annotated-llvm"),
-                            cl::aliasopt(PrintAnnotatedLLVM));
-  cl::opt<bool>
-  PrintAllCode("print-all-code",
-               cl::desc("Print annotated code for the entire program"));
-}
-
-// PairSecondSort - A sorting predicate to sort by the second element of a pair.
-template<class T>
-struct PairSecondSortReverse
-  : public std::binary_function<std::pair<T, double>,
-                                std::pair<T, double>, bool> {
-  bool operator()(const std::pair<T, double> &LHS,
-                  const std::pair<T, double> &RHS) const {
-    return LHS.second > RHS.second;
-  }
-};
-
-static double ignoreMissing(double w) {
-  if (w == ProfileInfo::MissingValue) return 0;
-  return w;
-}
-
-namespace {
-  class ProfileAnnotator : public AssemblyAnnotationWriter {
-    ProfileInfo &PI;
-  public:
-    ProfileAnnotator(ProfileInfo &pi) : PI(pi) {}
-
-    virtual void emitFunctionAnnot(const Function *F,
-                                   formatted_raw_ostream &OS) {
-      double w = PI.getExecutionCount(F);
-      if (w != ProfileInfo::MissingValue) {
-        OS << ";;; %" << F->getName() << " called "<<(unsigned)w
-           <<" times.\n;;;\n";
-      }
-    }
-    virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
-                                          formatted_raw_ostream &OS) {
-      double w = PI.getExecutionCount(BB);
-      if (w != ProfileInfo::MissingValue) {
-        if (w != 0) {
-          OS << "\t;;; Basic block executed " << (unsigned)w << " times.\n";
-        } else {
-          OS << "\t;;; Never executed!\n";
-        }
-      }
-    }
-
-    virtual void emitBasicBlockEndAnnot(const BasicBlock *BB,
-                                        formatted_raw_ostream &OS) {
-      // Figure out how many times each successor executed.
-      std::vector<std::pair<ProfileInfo::Edge, double> > SuccCounts;
-
-      const TerminatorInst *TI = BB->getTerminator();
-      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-        BasicBlock* Succ = TI->getSuccessor(s);
-        double w = ignoreMissing(PI.getEdgeWeight(std::make_pair(BB, Succ)));
-        if (w != 0)
-          SuccCounts.push_back(std::make_pair(std::make_pair(BB, Succ), w));
-      }
-      if (!SuccCounts.empty()) {
-        OS << "\t;;; Out-edge counts:";
-        for (unsigned i = 0, e = SuccCounts.size(); i != e; ++i)
-          OS << " [" << (SuccCounts[i]).second << " -> "
-             << (SuccCounts[i]).first.second->getName() << "]";
-        OS << "\n";
-      }
-    }
-  };
-}
-
-namespace {
-  /// ProfileInfoPrinterPass - Helper pass to dump the profile information for
-  /// a module.
-  //
-  // FIXME: This should move elsewhere.
-  class ProfileInfoPrinterPass : public ModulePass {
-    ProfileInfoLoader &PIL;
-  public:
-    static char ID; // Class identification, replacement for typeinfo.
-    explicit ProfileInfoPrinterPass(ProfileInfoLoader &_PIL) 
-      : ModulePass(ID), PIL(_PIL) {}
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-      AU.addRequired<ProfileInfo>();
-    }
-
-    bool runOnModule(Module &M);
-  };
-}
-
-char ProfileInfoPrinterPass::ID = 0;
-
-bool ProfileInfoPrinterPass::runOnModule(Module &M) {
-  ProfileInfo &PI = getAnalysis<ProfileInfo>();
-  std::map<const Function  *, unsigned> FuncFreqs;
-  std::map<const BasicBlock*, unsigned> BlockFreqs;
-  std::map<ProfileInfo::Edge, unsigned> EdgeFreqs;
-
-  // Output a report. Eventually, there will be multiple reports selectable on
-  // the command line, for now, just keep things simple.
-
-  // Emit the most frequent function table...
-  std::vector<std::pair<Function*, double> > FunctionCounts;
-  std::vector<std::pair<BasicBlock*, double> > Counts;
-  for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
-    if (FI->isDeclaration()) continue;
-    double w = ignoreMissing(PI.getExecutionCount(FI));
-    FunctionCounts.push_back(std::make_pair(FI, w));
-    for (Function::iterator BB = FI->begin(), BBE = FI->end(); 
-         BB != BBE; ++BB) {
-      double w = ignoreMissing(PI.getExecutionCount(BB));
-      Counts.push_back(std::make_pair(BB, w));
-    }
-  }
-
-  // Sort by the frequency, backwards.
-  sort(FunctionCounts.begin(), FunctionCounts.end(),
-            PairSecondSortReverse<Function*>());
-
-  double TotalExecutions = 0;
-  for (unsigned i = 0, e = FunctionCounts.size(); i != e; ++i)
-    TotalExecutions += FunctionCounts[i].second;
-
-  outs() << "===" << std::string(73, '-') << "===\n"
-         << "LLVM profiling output for execution";
-  if (PIL.getNumExecutions() != 1) outs() << "s";
-  outs() << ":\n";
-
-  for (unsigned i = 0, e = PIL.getNumExecutions(); i != e; ++i) {
-    outs() << "  ";
-    if (e != 1) outs() << i+1 << ". ";
-    outs() << PIL.getExecution(i) << "\n";
-  }
-
-  outs() << "\n===" << std::string(73, '-') << "===\n";
-  outs() << "Function execution frequencies:\n\n";
-
-  // Print out the function frequencies...
-  outs() << " ##   Frequency\n";
-  for (unsigned i = 0, e = FunctionCounts.size(); i != e; ++i) {
-    if (FunctionCounts[i].second == 0) {
-      outs() << "\n  NOTE: " << e-i << " function" 
-        << (e-i-1 ? "s were" : " was") << " never executed!\n";
-      break;
-    }
-
-    outs() << format("%3d", i+1) << ". "
-           << format("%5.2g", FunctionCounts[i].second) << "/"
-           << format("%g", TotalExecutions) << " "
-           << FunctionCounts[i].first->getName() << "\n";
-  }
-
-  std::set<Function*> FunctionsToPrint;
-
-  TotalExecutions = 0;
-  for (unsigned i = 0, e = Counts.size(); i != e; ++i)
-    TotalExecutions += Counts[i].second;
-  
-  // Sort by the frequency, backwards.
-  sort(Counts.begin(), Counts.end(),
-       PairSecondSortReverse<BasicBlock*>());
-  
-  outs() << "\n===" << std::string(73, '-') << "===\n";
-  outs() << "Top 20 most frequently executed basic blocks:\n\n";
-  
-  // Print out the function frequencies...
-  outs() <<" ##      %% \tFrequency\n";
-  unsigned BlocksToPrint = Counts.size();
-  if (BlocksToPrint > 20) BlocksToPrint = 20;
-  for (unsigned i = 0; i != BlocksToPrint; ++i) {
-    if (Counts[i].second == 0) break;
-    Function *F = Counts[i].first->getParent();
-    outs() << format("%3d", i+1) << ". "
-           << format("%5g", Counts[i].second/(double)TotalExecutions*100)<<"% "
-           << format("%5.0f", Counts[i].second) << "/"
-           << format("%g", TotalExecutions) << "\t"
-           << F->getName() << "() - "
-           << Counts[i].first->getName() << "\n";
-    FunctionsToPrint.insert(F);
-  }
-
-  if (PrintAnnotatedLLVM || PrintAllCode) {
-    outs() << "\n===" << std::string(73, '-') << "===\n";
-    outs() << "Annotated LLVM code for the module:\n\n";
-  
-    ProfileAnnotator PA(PI);
-
-    if (FunctionsToPrint.empty() || PrintAllCode)
-      M.print(outs(), &PA);
-    else
-      // Print just a subset of the functions.
-      for (std::set<Function*>::iterator I = FunctionsToPrint.begin(),
-             E = FunctionsToPrint.end(); I != E; ++I)
-        (*I)->print(outs(), &PA);
-  }
-
-  return false;
-}
-
-int main(int argc, char **argv) {
-  // Print a stack trace if we signal out.
-  sys::PrintStackTraceOnErrorSignal();
-  PrettyStackTraceProgram X(argc, argv);
-
-  LLVMContext &Context = getGlobalContext();
-  llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
-  
-  cl::ParseCommandLineOptions(argc, argv, "llvm profile dump decoder\n");
-
-  // Read in the bitcode file...
-  std::string ErrorMessage;
-  OwningPtr<MemoryBuffer> Buffer;
-  error_code ec;
-  Module *M = 0;
-  if (!(ec = MemoryBuffer::getFileOrSTDIN(BitcodeFile, Buffer))) {
-    M = ParseBitcodeFile(Buffer.get(), Context, &ErrorMessage);
-  } else
-    ErrorMessage = ec.message();
-  if (M == 0) {
-    errs() << argv[0] << ": " << BitcodeFile << ": "
-      << ErrorMessage << "\n";
-    return 1;
-  }
-
-  // Read the profiling information. This is redundant since we load it again
-  // using the standard profile info provider pass, but for now this gives us
-  // access to additional information not exposed via the ProfileInfo
-  // interface.
-  ProfileInfoLoader PIL(argv[0], ProfileDataFile);
-
-  // Run the printer pass.
-  PassManager PassMgr;
-  PassMgr.add(createProfileLoaderPass(ProfileDataFile));
-  PassMgr.add(new ProfileInfoPrinterPass(PIL));
-  PassMgr.run(*M);
-
-  return 0;
-}
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index 3628c3b..07a9083 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -18,7 +18,7 @@
 #include "StreamWriter.h"
 
 #include "llvm/ADT/SmallString.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
@@ -28,7 +28,6 @@ using namespace llvm;
 using namespace llvm::object;
 using namespace ELF;
 
-
 #define LLVM_READOBJ_ENUM_CASE(ns, enum) \
   case ns::enum: return #enum;
 
@@ -37,9 +36,8 @@ namespace {
 template<typename ELFT>
 class ELFDumper : public ObjDumper {
 public:
-  ELFDumper(const ELFObjectFile<ELFT> *Obj, StreamWriter& Writer)
-    : ObjDumper(Writer)
-    , Obj(Obj) { }
+  ELFDumper(const ELFFile<ELFT> *Obj, StreamWriter &Writer)
+      : ObjDumper(Writer), Obj(Obj) {}
 
   virtual void printFileHeaders() LLVM_OVERRIDE;
   virtual void printSections() LLVM_OVERRIDE;
@@ -53,24 +51,32 @@ public:
   virtual void printProgramHeaders() LLVM_OVERRIDE;
 
 private:
-  typedef ELFObjectFile<ELFT> ELFO;
+  typedef ELFFile<ELFT> ELFO;
   typedef typename ELFO::Elf_Shdr Elf_Shdr;
   typedef typename ELFO::Elf_Sym Elf_Sym;
 
-  void printSymbol(symbol_iterator SymI, bool IsDynamic = false);
+  void printSymbol(typename ELFO::Elf_Sym_Iter Symbol);
 
-  void printRelocation(section_iterator SecI, relocation_iterator RelI);
+  void printRelocations(const Elf_Shdr *Sec);
+  void printRelocation(const Elf_Shdr *Sec, typename ELFO::Elf_Rela Rel);
 
   const ELFO *Obj;
 };
 
-} // namespace
+template <class T> T errorOrDefault(ErrorOr<T> Val, T Default = T()) {
+  if (!Val) {
+    error(Val);
+    return Default;
+  }
 
+  return *Val;
+}
+} // namespace
 
 namespace llvm {
 
 template <class ELFT>
-static error_code createELFDumper(const ELFObjectFile<ELFT> *Obj,
+static error_code createELFDumper(const ELFFile<ELFT> *Obj,
                                   StreamWriter &Writer,
                                   OwningPtr<ObjDumper> &Result) {
   Result.reset(new ELFDumper<ELFT>(Obj, Writer));
@@ -82,26 +88,25 @@ error_code createELFDumper(const object::ObjectFile *Obj,
                            OwningPtr<ObjDumper> &Result) {
   // Little-endian 32-bit
   if (const ELF32LEObjectFile *ELFObj = dyn_cast<ELF32LEObjectFile>(Obj))
-    return createELFDumper(ELFObj, Writer, Result);
+    return createELFDumper(ELFObj->getELFFile(), Writer, Result);
 
   // Big-endian 32-bit
   if (const ELF32BEObjectFile *ELFObj = dyn_cast<ELF32BEObjectFile>(Obj))
-    return createELFDumper(ELFObj, Writer, Result);
+    return createELFDumper(ELFObj->getELFFile(), Writer, Result);
 
   // Little-endian 64-bit
   if (const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj))
-    return createELFDumper(ELFObj, Writer, Result);
+    return createELFDumper(ELFObj->getELFFile(), Writer, Result);
 
   // Big-endian 64-bit
   if (const ELF64BEObjectFile *ELFObj = dyn_cast<ELF64BEObjectFile>(Obj))
-    return createELFDumper(ELFObj, Writer, Result);
+    return createELFDumper(ELFObj->getELFFile(), Writer, Result);
 
   return readobj_error::unsupported_obj_file_format;
 }
 
 } // namespace llvm
 
-
 static const EnumEntry<unsigned> ElfClass[] = {
   { "None",   ELF::ELFCLASSNONE },
   { "32-bit", ELF::ELFCLASS32   },
@@ -322,7 +327,7 @@ static const EnumEntry<unsigned> ElfSymbolTypes[] = {
 
 static const char *getElfSectionType(unsigned Arch, unsigned Type) {
   switch (Arch) {
-  case Triple::arm:
+  case ELF::EM_ARM:
     switch (Type) {
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_ARM_EXIDX);
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_ARM_PREEMPTMAP);
@@ -330,16 +335,12 @@ static const char *getElfSectionType(unsigned Arch, unsigned Type) {
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_ARM_DEBUGOVERLAY);
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_ARM_OVERLAYSECTION);
     }
-  case Triple::hexagon:
-    switch (Type) {
-    LLVM_READOBJ_ENUM_CASE(ELF, SHT_HEX_ORDERED);
-    }
-  case Triple::x86_64:
-    switch (Type) {
-    LLVM_READOBJ_ENUM_CASE(ELF, SHT_X86_64_UNWIND);
-    }
-  case Triple::mips:
-  case Triple::mipsel:
+  case ELF::EM_HEXAGON:
+    switch (Type) { LLVM_READOBJ_ENUM_CASE(ELF, SHT_HEX_ORDERED); }
+  case ELF::EM_X86_64:
+    switch (Type) { LLVM_READOBJ_ENUM_CASE(ELF, SHT_X86_64_UNWIND); }
+  case ELF::EM_MIPS:
+  case ELF::EM_MIPS_RS3_LE:
     switch (Type) {
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_MIPS_REGINFO);
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_MIPS_OPTIONS);
@@ -376,6 +377,7 @@ static const char *getElfSectionType(unsigned Arch, unsigned Type) {
 static const EnumEntry<unsigned> ElfSectionFlags[] = {
   LLVM_READOBJ_ENUM_ENT(ELF, SHF_WRITE           ),
   LLVM_READOBJ_ENUM_ENT(ELF, SHF_ALLOC           ),
+  LLVM_READOBJ_ENUM_ENT(ELF, SHF_EXCLUDE         ),
   LLVM_READOBJ_ENUM_ENT(ELF, SHF_EXECINSTR       ),
   LLVM_READOBJ_ENUM_ENT(ELF, SHF_MERGE           ),
   LLVM_READOBJ_ENUM_ENT(ELF, SHF_STRINGS         ),
@@ -389,26 +391,41 @@ static const EnumEntry<unsigned> ElfSectionFlags[] = {
   LLVM_READOBJ_ENUM_ENT(ELF, SHF_MIPS_NOSTRIP    )
 };
 
-static const EnumEntry<unsigned> ElfSegmentTypes[] = {
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_NULL   ),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_LOAD   ),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_DYNAMIC),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_INTERP ),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_NOTE   ),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_SHLIB  ),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_PHDR   ),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_TLS    ),
-
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_GNU_EH_FRAME),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_SUNW_EH_FRAME),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_SUNW_UNWIND),
-
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_GNU_STACK),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_GNU_RELRO),
-
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_ARM_EXIDX),
-  LLVM_READOBJ_ENUM_ENT(ELF, PT_ARM_UNWIND)
-};
+static const char *getElfSegmentType(unsigned Arch, unsigned Type) {
+  // Check potentially overlapped processor-specific
+  // program header type.
+  switch (Arch) {
+  case ELF::EM_ARM:
+    switch (Type) {
+    LLVM_READOBJ_ENUM_CASE(ELF, PT_ARM_EXIDX);
+    }
+  case ELF::EM_MIPS:
+  case ELF::EM_MIPS_RS3_LE:
+    switch (Type) {
+    LLVM_READOBJ_ENUM_CASE(ELF, PT_MIPS_REGINFO);
+    LLVM_READOBJ_ENUM_CASE(ELF, PT_MIPS_RTPROC);
+    LLVM_READOBJ_ENUM_CASE(ELF, PT_MIPS_OPTIONS);
+    }
+  }
+
+  switch (Type) {
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_NULL   );
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_LOAD   );
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_DYNAMIC);
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_INTERP );
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_NOTE   );
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_SHLIB  );
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_PHDR   );
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_TLS    );
+
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_GNU_EH_FRAME);
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_SUNW_UNWIND);
+
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_GNU_STACK);
+  LLVM_READOBJ_ENUM_CASE(ELF, PT_GNU_RELRO);
+  default: return "";
+  }
+}
 
 static const EnumEntry<unsigned> ElfSegmentFlags[] = {
   LLVM_READOBJ_ENUM_ENT(ELF, PF_X),
@@ -416,12 +433,9 @@ static const EnumEntry<unsigned> ElfSegmentFlags[] = {
   LLVM_READOBJ_ENUM_ENT(ELF, PF_R)
 };
 
-
 template<class ELFT>
 void ELFDumper<ELFT>::printFileHeaders() {
-  error_code EC;
-
-  const typename ELFO::Elf_Ehdr *Header = Obj->getElfHeader();
+  const typename ELFO::Elf_Ehdr *Header = Obj->getHeader();
 
   {
     DictScope D(W, "ElfHeader");
@@ -461,24 +475,20 @@ void ELFDumper<ELFT>::printSections() {
   ListScope SectionsD(W, "Sections");
 
   int SectionIndex = -1;
-  error_code EC;
-  for (section_iterator SecI = Obj->begin_sections(),
-                        SecE = Obj->end_sections();
-                        SecI != SecE; SecI.increment(EC)) {
-    if (error(EC)) break;
-
+  for (typename ELFO::Elf_Shdr_Iter SecI = Obj->begin_sections(),
+                                    SecE = Obj->end_sections();
+       SecI != SecE; ++SecI) {
     ++SectionIndex;
 
-    const Elf_Shdr *Section = Obj->getElfSection(SecI);
-    StringRef Name;
-    if (error(SecI->getName(Name)))
-        Name = "";
+    const Elf_Shdr *Section = &*SecI;
+    StringRef Name = errorOrDefault(Obj->getSectionName(Section));
 
     DictScope SectionD(W, "Section");
     W.printNumber("Index", SectionIndex);
     W.printNumber("Name", Name, Section->sh_name);
-    W.printHex   ("Type", getElfSectionType(Obj->getArch(), Section->sh_type),
-                    Section->sh_type);
+    W.printHex("Type",
+               getElfSectionType(Obj->getHeader()->e_machine, Section->sh_type),
+               Section->sh_type);
     W.printFlags ("Flags", Section->sh_flags, makeArrayRef(ElfSectionFlags));
     W.printHex   ("Address", Section->sh_addr);
     W.printHex   ("Offset", Section->sh_offset);
@@ -490,35 +500,23 @@ void ELFDumper<ELFT>::printSections() {
 
     if (opts::SectionRelocations) {
       ListScope D(W, "Relocations");
-      for (relocation_iterator RelI = SecI->begin_relocations(),
-                               RelE = SecI->end_relocations();
-                               RelI != RelE; RelI.increment(EC)) {
-        if (error(EC)) break;
-
-        printRelocation(SecI, RelI);
-      }
+      printRelocations(Section);
     }
 
     if (opts::SectionSymbols) {
       ListScope D(W, "Symbols");
-      for (symbol_iterator SymI = Obj->begin_symbols(),
-                           SymE = Obj->end_symbols();
-                           SymI != SymE; SymI.increment(EC)) {
-        if (error(EC)) break;
-
-        bool Contained = false;
-        if (SecI->containsSymbol(*SymI, Contained) || !Contained)
-          continue;
-
-        printSymbol(SymI);
+      for (typename ELFO::Elf_Sym_Iter SymI = Obj->begin_symbols(),
+                                       SymE = Obj->end_symbols();
+           SymI != SymE; ++SymI) {
+        if (Obj->getSection(&*SymI) == Section)
+          printSymbol(SymI);
       }
     }
 
     if (opts::SectionData) {
-      StringRef Data;
-      if (error(SecI->getContents(Data))) break;
-
-      W.printBinaryBlock("SectionData", Data);
+      ArrayRef<uint8_t> Data = errorOrDefault(Obj->getSectionContents(Section));
+      W.printBinaryBlock("SectionData",
+                         StringRef((const char *)Data.data(), Data.size()));
     }
   }
 }
@@ -527,72 +525,74 @@ template<class ELFT>
 void ELFDumper<ELFT>::printRelocations() {
   ListScope D(W, "Relocations");
 
-  error_code EC;
   int SectionNumber = -1;
-  for (section_iterator SecI = Obj->begin_sections(),
-                        SecE = Obj->end_sections();
-                        SecI != SecE; SecI.increment(EC)) {
-    if (error(EC)) break;
-
+  for (typename ELFO::Elf_Shdr_Iter SecI = Obj->begin_sections(),
+                                    SecE = Obj->end_sections();
+       SecI != SecE; ++SecI) {
     ++SectionNumber;
-    StringRef Name;
-    if (error(SecI->getName(Name)))
+
+    if (SecI->sh_type != ELF::SHT_REL && SecI->sh_type != ELF::SHT_RELA)
       continue;
 
-    bool PrintedGroup = false;
-    for (relocation_iterator RelI = SecI->begin_relocations(),
-                             RelE = SecI->end_relocations();
-                             RelI != RelE; RelI.increment(EC)) {
-      if (error(EC)) break;
+    StringRef Name = errorOrDefault(Obj->getSectionName(&*SecI));
 
-      if (!PrintedGroup) {
-        W.startLine() << "Section (" << SectionNumber << ") " << Name << " {\n";
-        W.indent();
-        PrintedGroup = true;
-      }
+    W.startLine() << "Section (" << SectionNumber << ") " << Name << " {\n";
+    W.indent();
 
-      printRelocation(SecI, RelI);
-    }
+    printRelocations(&*SecI);
 
-    if (PrintedGroup) {
-      W.unindent();
-      W.startLine() << "}\n";
+    W.unindent();
+    W.startLine() << "}\n";
+  }
+}
+
+template <class ELFT>
+void ELFDumper<ELFT>::printRelocations(const Elf_Shdr *Sec) {
+  switch (Sec->sh_type) {
+  case ELF::SHT_REL:
+    for (typename ELFO::Elf_Rel_Iter RI = Obj->begin_rel(Sec),
+                                     RE = Obj->end_rel(Sec);
+         RI != RE; ++RI) {
+      typename ELFO::Elf_Rela Rela;
+      Rela.r_offset = RI->r_offset;
+      Rela.r_info = RI->r_info;
+      Rela.r_addend = 0;
+      printRelocation(Sec, Rela);
     }
+    break;
+  case ELF::SHT_RELA:
+    for (typename ELFO::Elf_Rela_Iter RI = Obj->begin_rela(Sec),
+                                      RE = Obj->end_rela(Sec);
+         RI != RE; ++RI) {
+      printRelocation(Sec, *RI);
+    }
+    break;
   }
 }
 
-template<class ELFT>
-void ELFDumper<ELFT>::printRelocation(section_iterator Sec,
-                                      relocation_iterator RelI) {
-  uint64_t Offset;
-  uint64_t RelocType;
+template <class ELFT>
+void ELFDumper<ELFT>::printRelocation(const Elf_Shdr *Sec,
+                                      typename ELFO::Elf_Rela Rel) {
   SmallString<32> RelocName;
-  int64_t Addend;
+  Obj->getRelocationTypeName(Rel.getType(Obj->isMips64EL()), RelocName);
   StringRef SymbolName;
-  if (Obj->getElfHeader()->e_type == ELF::ET_REL){
-    if (error(RelI->getOffset(Offset))) return;
-  } else {
-    if (error(RelI->getAddress(Offset))) return;
-  }
-  if (error(RelI->getType(RelocType))) return;
-  if (error(RelI->getTypeName(RelocName))) return;
-  if (error(getELFRelocationAddend(*RelI, Addend))) return;
-  symbol_iterator Symbol = RelI->getSymbol();
-  if (Symbol != Obj->end_symbols() && error(Symbol->getName(SymbolName)))
-    return;
+  std::pair<const Elf_Shdr *, const Elf_Sym *> Sym =
+      Obj->getRelocationSymbol(Sec, &Rel);
+  if (Sym.first)
+    SymbolName = errorOrDefault(Obj->getSymbolName(Sym.first, Sym.second));
 
   if (opts::ExpandRelocs) {
     DictScope Group(W, "Relocation");
-    W.printHex("Offset", Offset);
-    W.printNumber("Type", RelocName, RelocType);
+    W.printHex("Offset", Rel.r_offset);
+    W.printNumber("Type", RelocName, (int)Rel.getType(Obj->isMips64EL()));
     W.printString("Symbol", SymbolName.size() > 0 ? SymbolName : "-");
-    W.printHex("Addend", Addend);
+    W.printHex("Addend", Rel.r_addend);
   } else {
     raw_ostream& OS = W.startLine();
-    OS << W.hex(Offset)
+    OS << W.hex(Rel.r_offset)
        << " " << RelocName
        << " " << (SymbolName.size() > 0 ? SymbolName : "-")
-       << " " << W.hex(Addend)
+       << " " << W.hex(Rel.r_addend)
        << "\n";
   }
 }
@@ -600,12 +600,9 @@ void ELFDumper<ELFT>::printRelocation(section_iterator Sec,
 template<class ELFT>
 void ELFDumper<ELFT>::printSymbols() {
   ListScope Group(W, "Symbols");
-
-  error_code EC;
-  for (symbol_iterator SymI = Obj->begin_symbols(), SymE = Obj->end_symbols();
-                       SymI != SymE; SymI.increment(EC)) {
-    if (error(EC)) break;
-
+  for (typename ELFO::Elf_Sym_Iter SymI = Obj->begin_symbols(),
+                                   SymE = Obj->end_symbols();
+       SymI != SymE; ++SymI) {
     printSymbol(SymI);
   }
 }
@@ -614,41 +611,27 @@ template<class ELFT>
 void ELFDumper<ELFT>::printDynamicSymbols() {
   ListScope Group(W, "DynamicSymbols");
 
-  error_code EC;
-  for (symbol_iterator SymI = Obj->begin_dynamic_symbols(),
-                       SymE = Obj->end_dynamic_symbols();
-                       SymI != SymE; SymI.increment(EC)) {
-    if (error(EC)) break;
-
-    printSymbol(SymI, true);
+  for (typename ELFO::Elf_Sym_Iter SymI = Obj->begin_dynamic_symbols(),
+                                   SymE = Obj->end_dynamic_symbols();
+       SymI != SymE; ++SymI) {
+    printSymbol(SymI);
   }
 }
 
-template<class ELFT>
-void ELFDumper<ELFT>::printSymbol(symbol_iterator SymI, bool IsDynamic) {
-  error_code EC;
-
-  const Elf_Sym *Symbol = Obj->getElfSymbol(SymI);
-  const Elf_Shdr *Section = Obj->getSection(Symbol);
-
-  StringRef SymbolName;
-  if (SymI->getName(SymbolName))
-    SymbolName = "";
-
-  StringRef SectionName = "";
-  if (Section)
-    Obj->getSectionName(Section, SectionName);
-
+template <class ELFT>
+void ELFDumper<ELFT>::printSymbol(typename ELFO::Elf_Sym_Iter Symbol) {
+  StringRef SymbolName = errorOrDefault(Obj->getSymbolName(Symbol));
+  const Elf_Shdr *Sec = Obj->getSection(&*Symbol);
+  StringRef SectionName = Sec ? errorOrDefault(Obj->getSectionName(Sec)) : "";
   std::string FullSymbolName(SymbolName);
-  if (IsDynamic) {
-    StringRef Version;
+  if (Symbol.isDynamic()) {
     bool IsDefault;
-    if (error(Obj->getSymbolVersion(*SymI, Version, IsDefault)))
-      return;
-    if (!Version.empty()) {
+    ErrorOr<StringRef> Version = Obj->getSymbolVersion(0, &*Symbol, IsDefault);
+    if (Version) {
       FullSymbolName += (IsDefault ? "@@" : "@");
-      FullSymbolName += Version;
-    }
+      FullSymbolName += *Version;
+    } else
+      error(Version);
   }
 
   DictScope D(W, "Symbol");
@@ -700,15 +683,25 @@ static const char *getTypeString(uint64_t Type) {
   LLVM_READOBJ_TYPE_CASE(SYMENT);
   LLVM_READOBJ_TYPE_CASE(SYMTAB);
   LLVM_READOBJ_TYPE_CASE(TEXTREL);
+  LLVM_READOBJ_TYPE_CASE(VERNEED);
+  LLVM_READOBJ_TYPE_CASE(VERNEEDNUM);
+  LLVM_READOBJ_TYPE_CASE(VERSYM);
+  LLVM_READOBJ_TYPE_CASE(MIPS_RLD_VERSION);
+  LLVM_READOBJ_TYPE_CASE(MIPS_FLAGS);
+  LLVM_READOBJ_TYPE_CASE(MIPS_BASE_ADDRESS);
+  LLVM_READOBJ_TYPE_CASE(MIPS_LOCAL_GOTNO);
+  LLVM_READOBJ_TYPE_CASE(MIPS_SYMTABNO);
+  LLVM_READOBJ_TYPE_CASE(MIPS_UNREFEXTNO);
+  LLVM_READOBJ_TYPE_CASE(MIPS_GOTSYM);
   default: return "unknown";
   }
 }
 
 #undef LLVM_READOBJ_TYPE_CASE
 
-template<class ELFT>
-static void printValue(const ELFObjectFile<ELFT> *O, uint64_t Type,
-                       uint64_t Value, bool Is64, raw_ostream &OS) {
+template <class ELFT>
+static void printValue(const ELFFile<ELFT> *O, uint64_t Type, uint64_t Value,
+                       bool Is64, raw_ostream &OS) {
   switch (Type) {
   case DT_PLTREL:
     if (Value == DT_REL) {
@@ -732,9 +725,21 @@ static void printValue(const ELFObjectFile<ELFT> *O, uint64_t Type,
   case DT_FINI_ARRAY:
   case DT_PREINIT_ARRAY:
   case DT_DEBUG:
+  case DT_VERNEED:
+  case DT_VERSYM:
   case DT_NULL:
+  case DT_MIPS_FLAGS:
+  case DT_MIPS_BASE_ADDRESS:
+  case DT_MIPS_GOTSYM:
     OS << format("0x%" PRIX64, Value);
     break;
+  case DT_VERNEEDNUM:
+  case DT_MIPS_RLD_VERSION:
+  case DT_MIPS_LOCAL_GOTNO:
+  case DT_MIPS_SYMTABNO:
+  case DT_MIPS_UNREFEXTNO:
+    OS << Value;
+    break;
   case DT_PLTRELSZ:
   case DT_RELASZ:
   case DT_RELAENT:
@@ -748,12 +753,14 @@ static void printValue(const ELFObjectFile<ELFT> *O, uint64_t Type,
     OS << Value << " (bytes)";
     break;
   case DT_NEEDED:
-    OS << "SharedLibrary ("
-       << O->getString(O->getDynamicStringTableSectionHeader(), Value) << ")";
+    OS << "SharedLibrary (" << O->getDynamicString(Value) << ")";
     break;
   case DT_SONAME:
-    OS << "LibrarySoname ("
-       << O->getString(O->getDynamicStringTableSectionHeader(), Value) << ")";
+    OS << "LibrarySoname (" << O->getDynamicString(Value) << ")";
+    break;
+  case DT_RPATH:
+  case DT_RUNPATH:
+    OS << O->getDynamicString(Value);
     break;
   }
 }
@@ -765,9 +772,8 @@ void ELFDumper<ELFT>::printUnwindInfo() {
 
 template<class ELFT>
 void ELFDumper<ELFT>::printDynamicTable() {
-  typedef typename ELFO::Elf_Dyn_iterator EDI;
-  EDI Start = Obj->begin_dynamic_table(),
-      End = Obj->end_dynamic_table(true);
+  typedef typename ELFO::Elf_Dyn_Iter EDI;
+  EDI Start = Obj->begin_dynamic_table(), End = Obj->end_dynamic_table(true);
 
   if (Start == End)
     return;
@@ -776,7 +782,7 @@ void ELFDumper<ELFT>::printDynamicTable() {
   raw_ostream &OS = W.getOStream();
   W.startLine() << "DynamicSection [ (" << Total << " entries)\n";
 
-  bool Is64 = Obj->getBytesInAddress() == 8;
+  bool Is64 = ELFT::Is64Bits;
 
   W.startLine()
      << "  Tag" << (Is64 ? "                " : "        ") << "Type"
@@ -793,38 +799,23 @@ void ELFDumper<ELFT>::printDynamicTable() {
   W.startLine() << "]\n";
 }
 
-static bool compareLibraryName(const LibraryRef &L, const LibraryRef &R) {
-  StringRef LPath, RPath;
-  L.getPath(LPath);
-  R.getPath(RPath);
-  return LPath < RPath;
-}
-
 template<class ELFT>
 void ELFDumper<ELFT>::printNeededLibraries() {
   ListScope D(W, "NeededLibraries");
 
-  error_code EC;
-
-  typedef std::vector<LibraryRef> LibsTy;
+  typedef std::vector<StringRef> LibsTy;
   LibsTy Libs;
 
-  for (library_iterator I = Obj->begin_libraries_needed(),
-                        E = Obj->end_libraries_needed();
-                        I != E; I.increment(EC)) {
-    if (EC)
-      report_fatal_error("Needed libraries iteration failed");
-
-    Libs.push_back(*I);
-  }
+  for (typename ELFO::Elf_Dyn_Iter DynI = Obj->begin_dynamic_table(),
+                                   DynE = Obj->end_dynamic_table();
+       DynI != DynE; ++DynI)
+    if (DynI->d_tag == ELF::DT_NEEDED)
+      Libs.push_back(Obj->getDynamicString(DynI->d_un.d_val));
 
-  std::sort(Libs.begin(), Libs.end(), &compareLibraryName);
+  std::stable_sort(Libs.begin(), Libs.end());
 
-  for (LibsTy::const_iterator I = Libs.begin(), E = Libs.end();
-                                  I != E; ++I) {
-    StringRef Path;
-    I->getPath(Path);
-    outs() << "  " << Path << "\n";
+  for (LibsTy::const_iterator I = Libs.begin(), E = Libs.end(); I != E; ++I) {
+    outs() << "  " << *I << "\n";
   }
 }
 
@@ -836,7 +827,9 @@ void ELFDumper<ELFT>::printProgramHeaders() {
                                     PE = Obj->end_program_headers();
                                     PI != PE; ++PI) {
     DictScope P(W, "ProgramHeader");
-    W.printEnum  ("Type", PI->p_type, makeArrayRef(ElfSegmentTypes));
+    W.printHex   ("Type",
+                  getElfSegmentType(Obj->getHeader()->e_machine, PI->p_type),
+                  PI->p_type);
     W.printHex   ("Offset", PI->p_offset);
     W.printHex   ("VirtualAddress", PI->p_vaddr);
     W.printHex   ("PhysicalAddress", PI->p_paddr);
diff --git a/tools/llvm-readobj/MachODumper.cpp b/tools/llvm-readobj/MachODumper.cpp
index 8df6fd6..b97166b 100644
--- a/tools/llvm-readobj/MachODumper.cpp
+++ b/tools/llvm-readobj/MachODumper.cpp
@@ -166,28 +166,28 @@ static void getSection(const MachOObjectFile *Obj,
                        DataRefImpl Sec,
                        MachOSection &Section) {
   if (!Obj->is64Bit()) {
-    macho::Section Sect = Obj->getSection(Sec);
-    Section.Address     = Sect.Address;
-    Section.Size        = Sect.Size;
-    Section.Offset      = Sect.Offset;
-    Section.Alignment   = Sect.Align;
-    Section.RelocationTableOffset = Sect.RelocationTableOffset;
-    Section.NumRelocationTableEntries = Sect.NumRelocationTableEntries;
-    Section.Flags       = Sect.Flags;
-    Section.Reserved1   = Sect.Reserved1;
-    Section.Reserved2   = Sect.Reserved2;
+    MachO::section Sect = Obj->getSection(Sec);
+    Section.Address     = Sect.addr;
+    Section.Size        = Sect.size;
+    Section.Offset      = Sect.offset;
+    Section.Alignment   = Sect.align;
+    Section.RelocationTableOffset = Sect.reloff;
+    Section.NumRelocationTableEntries = Sect.nreloc;
+    Section.Flags       = Sect.flags;
+    Section.Reserved1   = Sect.reserved1;
+    Section.Reserved2   = Sect.reserved2;
     return;
   }
-  macho::Section64 Sect = Obj->getSection64(Sec);
-  Section.Address     = Sect.Address;
-  Section.Size        = Sect.Size;
-  Section.Offset      = Sect.Offset;
-  Section.Alignment   = Sect.Align;
-  Section.RelocationTableOffset = Sect.RelocationTableOffset;
-  Section.NumRelocationTableEntries = Sect.NumRelocationTableEntries;
-  Section.Flags       = Sect.Flags;
-  Section.Reserved1   = Sect.Reserved1;
-  Section.Reserved2   = Sect.Reserved2;
+  MachO::section_64 Sect = Obj->getSection64(Sec);
+  Section.Address     = Sect.addr;
+  Section.Size        = Sect.size;
+  Section.Offset      = Sect.offset;
+  Section.Alignment   = Sect.align;
+  Section.RelocationTableOffset = Sect.reloff;
+  Section.NumRelocationTableEntries = Sect.nreloc;
+  Section.Flags       = Sect.flags;
+  Section.Reserved1   = Sect.reserved1;
+  Section.Reserved2   = Sect.reserved2;
 }
 
 
@@ -195,20 +195,20 @@ static void getSymbol(const MachOObjectFile *Obj,
                       DataRefImpl DRI,
                       MachOSymbol &Symbol) {
   if (!Obj->is64Bit()) {
-    macho::SymbolTableEntry Entry = Obj->getSymbolTableEntry(DRI);
-    Symbol.StringIndex  = Entry.StringIndex;
-    Symbol.Type         = Entry.Type;
-    Symbol.SectionIndex = Entry.SectionIndex;
-    Symbol.Flags        = Entry.Flags;
-    Symbol.Value        = Entry.Value;
+    MachO::nlist Entry = Obj->getSymbolTableEntry(DRI);
+    Symbol.StringIndex  = Entry.n_strx;
+    Symbol.Type         = Entry.n_type;
+    Symbol.SectionIndex = Entry.n_sect;
+    Symbol.Flags        = Entry.n_desc;
+    Symbol.Value        = Entry.n_value;
     return;
   }
-  macho::Symbol64TableEntry Entry = Obj->getSymbol64TableEntry(DRI);
-  Symbol.StringIndex  = Entry.StringIndex;
-  Symbol.Type         = Entry.Type;
-  Symbol.SectionIndex = Entry.SectionIndex;
-  Symbol.Flags        = Entry.Flags;
-  Symbol.Value        = Entry.Value;
+  MachO::nlist_64 Entry = Obj->getSymbol64TableEntry(DRI);
+  Symbol.StringIndex  = Entry.n_strx;
+  Symbol.Type         = Entry.n_type;
+  Symbol.SectionIndex = Entry.n_sect;
+  Symbol.Flags        = Entry.n_desc;
+  Symbol.Value        = Entry.n_value;
 }
 
 void MachODumper::printFileHeaders() {
@@ -349,7 +349,7 @@ void MachODumper::printRelocation(const MachOObjectFile *Obj,
     return;
 
   DataRefImpl DR = RelI->getRawDataRefImpl();
-  macho::RelocationEntry RE = Obj->getRelocation(DR);
+  MachO::any_relocation_info RE = Obj->getRelocation(DR);
   bool IsScattered = Obj->isRelocationScattered(RE);
 
   if (opts::ExpandRelocs) {
@@ -398,8 +398,6 @@ void MachODumper::printDynamicSymbols() {
 }
 
 void MachODumper::printSymbol(symbol_iterator SymI) {
-  error_code EC;
-
   StringRef SymbolName;
   if (SymI->getName(SymbolName))
     SymbolName = "";
diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp
index 2e95b6b..f84a72f 100644
--- a/tools/llvm-readobj/llvm-readobj.cpp
+++ b/tools/llvm-readobj/llvm-readobj.cpp
@@ -130,12 +130,15 @@ namespace opts {
     cl::desc("Expand each shown relocation to multiple lines"));
 } // namespace opts
 
+static int ReturnValue = EXIT_SUCCESS;
+
 namespace llvm {
 
 bool error(error_code EC) {
   if (!EC)
     return false;
 
+  ReturnValue = EXIT_FAILURE;
   outs() << "\nError reading file: " << EC.message() << ".\n";
   outs().flush();
   return true;
@@ -157,6 +160,7 @@ static void reportError(StringRef Input, error_code EC) {
 
   errs() << Input << ": " << EC.message() << "\n";
   errs().flush();
+  ReturnValue = EXIT_FAILURE;
 }
 
 static void reportError(StringRef Input, StringRef Message) {
@@ -164,6 +168,7 @@ static void reportError(StringRef Input, StringRef Message) {
     Input = "<stdin>";
 
   errs() << Input << ": " << Message << "\n";
+  ReturnValue = EXIT_FAILURE;
 }
 
 /// @brief Creates an format-specific object file dumper.
@@ -289,5 +294,5 @@ int main(int argc, const char *argv[]) {
   std::for_each(opts::InputFilenames.begin(), opts::InputFilenames.end(),
                 dumpInput);
 
-  return 0;
+  return ReturnValue;
 }
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index 7f042d2..531595e 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -22,6 +22,8 @@
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Memory.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/system_error.h"
 using namespace llvm;
@@ -60,9 +62,10 @@ public:
   SmallVector<sys::MemoryBlock, 16> DataMemory;
 
   uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
-                               unsigned SectionID);
+                               unsigned SectionID, StringRef SectionName);
   uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                               unsigned SectionID, bool IsReadOnly);
+                               unsigned SectionID, StringRef SectionName,
+                               bool IsReadOnly);
 
   virtual void *getPointerToNamedFunction(const std::string &Name,
                                           bool AbortOnFailure = true) {
@@ -80,7 +83,8 @@ public:
 
 uint8_t *TrivialMemoryManager::allocateCodeSection(uintptr_t Size,
                                                    unsigned Alignment,
-                                                   unsigned SectionID) {
+                                                   unsigned SectionID,
+                                                   StringRef SectionName) {
   sys::MemoryBlock MB = sys::Memory::AllocateRWX(Size, 0, 0);
   FunctionMemory.push_back(MB);
   return (uint8_t*)MB.base();
@@ -89,6 +93,7 @@ uint8_t *TrivialMemoryManager::allocateCodeSection(uintptr_t Size,
 uint8_t *TrivialMemoryManager::allocateDataSection(uintptr_t Size,
                                                    unsigned Alignment,
                                                    unsigned SectionID,
+                                                   StringRef SectionName,
                                                    bool IsReadOnly) {
   sys::MemoryBlock MB = sys::Memory::AllocateRWX(Size, 0, 0);
   DataMemory.push_back(MB);
@@ -236,6 +241,9 @@ static int executeInput() {
 }
 
 int main(int argc, char **argv) {
+  sys::PrintStackTraceOnErrorSignal();
+  PrettyStackTraceProgram X(argc, argv);
+
   ProgramName = argv[0];
   llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
 
diff --git a/tools/llvm-shlib/Makefile b/tools/llvm-shlib/Makefile
index 1d9053b..92f3132 100644
--- a/tools/llvm-shlib/Makefile
+++ b/tools/llvm-shlib/Makefile
@@ -62,13 +62,13 @@ ifeq ($(HOST_OS),Darwin)
     endif
 endif
 
-ifeq ($(HOST_OS), $(filter $(HOST_OS), Linux FreeBSD GNU/kFreeBSD OpenBSD GNU Bitrig))
+ifeq ($(HOST_OS), $(filter $(HOST_OS), DragonFly Linux FreeBSD GNU/kFreeBSD OpenBSD GNU Bitrig))
     # Include everything from the .a's into the shared library.
     LLVMLibsOptions := -Wl,--whole-archive $(LLVMLibsOptions) \
                        -Wl,--no-whole-archive
 endif
 
-ifeq ($(HOST_OS), $(filter $(HOST_OS), Linux FreeBSD GNU/kFreeBSD GNU))
+ifeq ($(HOST_OS), $(filter $(HOST_OS), DragonFly Linux FreeBSD GNU/kFreeBSD GNU))
     # Add soname to the library.
     LLVMLibsOptions += -Wl,--soname,lib$(LIBRARYNAME)$(SHLIBEXT)
 endif
diff --git a/tools/llvm-stress/llvm-stress.cpp b/tools/llvm-stress/llvm-stress.cpp
index 15f7abf..fd10baf 100644
--- a/tools/llvm-stress/llvm-stress.cpp
+++ b/tools/llvm-stress/llvm-stress.cpp
@@ -52,6 +52,7 @@ static cl::opt<bool> GenPPCFP128("generate-ppc-fp128",
 static cl::opt<bool> GenX86MMX("generate-x86-mmx",
   cl::desc("Generate X86 MMX floating-point values"), cl::init(false));
 
+namespace {
 /// A utility class to provide a pseudo-random number generator which is
 /// the same across all platforms. This is somewhat close to the libc
 /// implementation. Note: This is not a cryptographically secure pseudorandom
@@ -607,7 +608,9 @@ struct CmpModifier: public Modifier {
   }
 };
 
-void FillFunction(Function *F, Random &R) {
+} // end anonymous namespace
+
+static void FillFunction(Function *F, Random &R) {
   // Create a legal entry block.
   BasicBlock *BB = BasicBlock::Create(F->getContext(), "BB", F);
   ReturnInst::Create(F->getContext(), BB);
@@ -654,7 +657,7 @@ void FillFunction(Function *F, Random &R) {
   SM->ActN(5); // Throw in a few stores.
 }
 
-void IntroduceControlFlow(Function *F, Random &R) {
+static void IntroduceControlFlow(Function *F, Random &R) {
   std::vector<Instruction*> BoolInst;
   for (BasicBlock::iterator it = F->begin()->begin(),
        e = F->begin()->end(); it != e; ++it) {
diff --git a/tools/llvm-symbolizer/LLVMSymbolize.cpp b/tools/llvm-symbolizer/LLVMSymbolize.cpp
index 0346fb2..320ab3f 100644
--- a/tools/llvm-symbolizer/LLVMSymbolize.cpp
+++ b/tools/llvm-symbolizer/LLVMSymbolize.cpp
@@ -13,12 +13,17 @@
 
 #include "LLVMSymbolize.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Config/config.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compression.h"
+#include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 
 #include <sstream>
+#include <stdlib.h>
 
 namespace llvm {
 namespace symbolize {
@@ -191,7 +196,7 @@ std::string LLVMSymbolizer::symbolizeData(const std::string &ModuleName,
   if (Opts.UseSymbolTable) {
     if (ModuleInfo *Info = getOrCreateModuleInfo(ModuleName)) {
       if (Info->symbolizeData(ModuleOffset, Name, Start, Size) && Opts.Demangle)
-        Name = DemangleName(Name);
+        Name = DemangleGlobalName(Name);
     }
   }
   std::stringstream ss;
@@ -215,6 +220,81 @@ static std::string getDarwinDWARFResourceForPath(const std::string &Path) {
   return ResourceName.str();
 }
 
+static bool checkFileCRC(StringRef Path, uint32_t CRCHash) {
+  OwningPtr<MemoryBuffer> MB;
+  if (MemoryBuffer::getFileOrSTDIN(Path, MB))
+    return false;
+  return !zlib::isAvailable() || CRCHash == zlib::crc32(MB->getBuffer());
+}
+
+static bool findDebugBinary(const std::string &OrigPath,
+                            const std::string &DebuglinkName, uint32_t CRCHash,
+                            std::string &Result) {
+  std::string OrigRealPath = OrigPath;
+#if defined(HAVE_REALPATH)
+  if (char *RP = realpath(OrigPath.c_str(), NULL)) {
+    OrigRealPath = RP;
+    free(RP);
+  }
+#endif
+  SmallString<16> OrigDir(OrigRealPath);
+  llvm::sys::path::remove_filename(OrigDir);
+  SmallString<16> DebugPath = OrigDir;
+  // Try /path/to/original_binary/debuglink_name
+  llvm::sys::path::append(DebugPath, DebuglinkName);
+  if (checkFileCRC(DebugPath, CRCHash)) {
+    Result = DebugPath.str();
+    return true;
+  }
+  // Try /path/to/original_binary/.debug/debuglink_name
+  DebugPath = OrigRealPath;
+  llvm::sys::path::append(DebugPath, ".debug", DebuglinkName);
+  if (checkFileCRC(DebugPath, CRCHash)) {
+    Result = DebugPath.str();
+    return true;
+  }
+  // Try /usr/lib/debug/path/to/original_binary/debuglink_name
+  DebugPath = "/usr/lib/debug";
+  llvm::sys::path::append(DebugPath, llvm::sys::path::relative_path(OrigDir),
+                          DebuglinkName);
+  if (checkFileCRC(DebugPath, CRCHash)) {
+    Result = DebugPath.str();
+    return true;
+  }
+  return false;
+}
+
+static bool getGNUDebuglinkContents(const Binary *Bin, std::string &DebugName,
+                                    uint32_t &CRCHash) {
+  const ObjectFile *Obj = dyn_cast<ObjectFile>(Bin);
+  if (!Obj)
+    return false;
+  error_code EC;
+  for (section_iterator I = Obj->begin_sections(), E = Obj->end_sections();
+       I != E; I.increment(EC)) {
+    StringRef Name;
+    I->getName(Name);
+    Name = Name.substr(Name.find_first_not_of("._"));
+    if (Name == "gnu_debuglink") {
+      StringRef Data;
+      I->getContents(Data);
+      DataExtractor DE(Data, Obj->isLittleEndian(), 0);
+      uint32_t Offset = 0;
+      if (const char *DebugNameStr = DE.getCStr(&Offset)) {
+        // 4-byte align the offset.
+        Offset = (Offset + 3) & ~0x3;
+        if (DE.isValidOffsetForDataOfSize(Offset, 4)) {
+          DebugName = DebugNameStr;
+          CRCHash = DE.getU32(&Offset);
+          return true;
+        }
+      }
+      break;
+    }
+  }
+  return false;
+}
+
 LLVMSymbolizer::BinaryPair
 LLVMSymbolizer::getOrCreateBinary(const std::string &Path) {
   BinaryMapTy::iterator I = BinaryForPath.find(Path);
@@ -241,6 +321,18 @@ LLVMSymbolizer::getOrCreateBinary(const std::string &Path) {
         ParsedBinariesAndObjects.push_back(DbgBin);
       }
     }
+    // Try to locate the debug binary using .gnu_debuglink section.
+    if (DbgBin == 0) {
+      std::string DebuglinkName;
+      uint32_t CRCHash;
+      std::string DebugBinaryPath;
+      if (getGNUDebuglinkContents(Bin, DebuglinkName, CRCHash) &&
+          findDebugBinary(Path, DebuglinkName, CRCHash, DebugBinaryPath) &&
+          !error(createBinary(DebugBinaryPath, ParsedDbgBinary))) {
+        DbgBin = ParsedDbgBinary.take();
+        ParsedBinariesAndObjects.push_back(DbgBin);
+      }
+    }
   }
   if (DbgBin == 0)
     DbgBin = Bin;
@@ -344,5 +436,11 @@ std::string LLVMSymbolizer::DemangleName(const std::string &Name) {
 #endif
 }
 
+std::string LLVMSymbolizer::DemangleGlobalName(const std::string &Name) {
+  // We can spoil names of globals with C linkage, so use an heuristic
+  // approach to check if the name should be demangled.
+  return (Name.substr(0, 2) == "_Z") ? DemangleName(Name) : Name;
+}
+
 } // namespace symbolize
 } // namespace llvm
diff --git a/tools/llvm-symbolizer/LLVMSymbolize.h b/tools/llvm-symbolizer/LLVMSymbolize.h
index 03c765c..eb2666a 100644
--- a/tools/llvm-symbolizer/LLVMSymbolize.h
+++ b/tools/llvm-symbolizer/LLVMSymbolize.h
@@ -71,6 +71,7 @@ private:
   ObjectFile *getObjectFileFromBinary(Binary *Bin, const std::string &ArchName);
 
   std::string printDILineInfo(DILineInfo LineInfo) const;
+  static std::string DemangleGlobalName(const std::string &Name);
 
   // Owns all the parsed binaries and object files.
   SmallVector<Binary*, 4> ParsedBinariesAndObjects;
diff --git a/tools/lto/CMakeLists.txt b/tools/lto/CMakeLists.txt
index 5820b14..957a9f0 100644
--- a/tools/lto/CMakeLists.txt
+++ b/tools/lto/CMakeLists.txt
@@ -1,22 +1,38 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
-  ipo scalaropts linker bitreader bitwriter mcdisassembler vectorize)
+  ipo scalaropts linker bitreader bitwriter lto mcdisassembler vectorize)
 
 add_definitions( -DLLVM_VERSION_INFO=\"${PACKAGE_VERSION}\" )
 
 set(SOURCES
-  LTOCodeGenerator.cpp
   LTODisassembler.cpp
   lto.cpp
-  LTOModule.cpp
   )
 
-set(LLVM_COMMON_DEPENDS intrinsics_gen)
+if( NOT CYGWIN AND LLVM_ENABLE_PIC )
+  if ( WIN32 )
+    # Create .def file containing a list of exports preceeded by
+    # 'EXPORTS'.  The file "lto.exports" already contains the list, so we
+    # massage it into the correct format here to create "lto.exports.def".
+    set(LTO_EXPORTS_DEF ${CMAKE_CURRENT_BINARY_DIR}/lto.exports.def)
+    set(LTO_EXPORTS_DEF_TEMP ${LTO_EXPORTS_DEF}.txt)
+    file(READ "lto.exports" exports_list)
+    file(WRITE ${LTO_EXPORTS_DEF_TEMP} "LIBRARY LTO\n")
+    file(APPEND ${LTO_EXPORTS_DEF_TEMP} "EXPORTS\n")
+    file(APPEND ${LTO_EXPORTS_DEF_TEMP} ${exports_list})
+
+    # Copy the file only if it has changed.
+    execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
+      ${LTO_EXPORTS_DEF_TEMP} ${LTO_EXPORTS_DEF})
+
+    set(SHARED_LIB_SOURCES ${SOURCES} ${LTO_EXPORTS_DEF})
+  else()
+    set(SHARED_LIB_SOURCES ${SOURCES})
+  endif()
 
-if( NOT WIN32 AND LLVM_ENABLE_PIC )
   set(bsl ${BUILD_SHARED_LIBS})
   set(BUILD_SHARED_LIBS ON)
-  add_llvm_library(LTO ${SOURCES})
+  add_llvm_library(LTO ${SHARED_LIB_SOURCES})
   set_property(TARGET LTO PROPERTY OUTPUT_NAME "LTO")
   set(BUILD_SHARED_LIBS ${bsl})
   set(LTO_STATIC_TARGET_NAME LTO_static)
@@ -28,3 +44,8 @@ if( NOT BUILD_SHARED_LIBS )
   add_llvm_library(${LTO_STATIC_TARGET_NAME} ${SOURCES})
   set_property(TARGET ${LTO_STATIC_TARGET_NAME} PROPERTY OUTPUT_NAME "LTO")
 endif()
+
+if( NOT CYGWIN )
+  install(FILES ${LLVM_MAIN_INCLUDE_DIR}/llvm-c/lto.h
+          DESTINATION include/llvm-c)
+endif()
diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp
deleted file mode 100644
index 758227d..0000000
--- a/tools/lto/LTOCodeGenerator.cpp
+++ /dev/null
@@ -1,459 +0,0 @@
-//===-LTOCodeGenerator.cpp - LLVM Link Time Optimizer ---------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Link Time Optimization library. This library is
-// intended to be used by linker to optimize code at link time.
-//
-//===----------------------------------------------------------------------===//
-
-#include "LTOCodeGenerator.h"
-#include "LTOModule.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/Config/config.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Linker.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/PassManager.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Signals.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/system_error.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/ObjCARC.h"
-using namespace llvm;
-
-static cl::opt<bool>
-DisableOpt("disable-opt", cl::init(false),
-  cl::desc("Do not run any optimization passes"));
-
-static cl::opt<bool>
-DisableInline("disable-inlining", cl::init(false),
-  cl::desc("Do not run the inliner pass"));
-
-static cl::opt<bool>
-DisableGVNLoadPRE("disable-gvn-loadpre", cl::init(false),
-  cl::desc("Do not run the GVN load PRE pass"));
-
-const char* LTOCodeGenerator::getVersionString() {
-#ifdef LLVM_VERSION_INFO
-  return PACKAGE_NAME " version " PACKAGE_VERSION ", " LLVM_VERSION_INFO;
-#else
-  return PACKAGE_NAME " version " PACKAGE_VERSION;
-#endif
-}
-
-LTOCodeGenerator::LTOCodeGenerator()
-  : _context(getGlobalContext()),
-    _linker(new Module("ld-temp.o", _context)), _target(NULL),
-    _emitDwarfDebugInfo(false), _scopeRestrictionsDone(false),
-    _codeModel(LTO_CODEGEN_PIC_MODEL_DYNAMIC),
-    _nativeObjectFile(NULL) {
-  InitializeAllTargets();
-  InitializeAllTargetMCs();
-  InitializeAllAsmPrinters();
-  initializeLTOPasses();
-}
-
-LTOCodeGenerator::~LTOCodeGenerator() {
-  delete _target;
-  delete _nativeObjectFile;
-  delete _linker.getModule();
-
-  for (std::vector<char*>::iterator I = _codegenOptions.begin(),
-         E = _codegenOptions.end(); I != E; ++I)
-    free(*I);
-}
-
-// Initialize LTO passes. Please keep this funciton in sync with
-// PassManagerBuilder::populateLTOPassManager(), and make sure all LTO
-// passes are initialized. 
-//
-void LTOCodeGenerator::initializeLTOPasses() {
-  PassRegistry &R = *PassRegistry::getPassRegistry();
-
-  initializeInternalizePassPass(R);
-  initializeIPSCCPPass(R);
-  initializeGlobalOptPass(R);
-  initializeConstantMergePass(R);
-  initializeDAHPass(R);
-  initializeInstCombinerPass(R);
-  initializeSimpleInlinerPass(R);
-  initializePruneEHPass(R);
-  initializeGlobalDCEPass(R);
-  initializeArgPromotionPass(R);
-  initializeJumpThreadingPass(R);
-  initializeSROAPass(R);
-  initializeSROA_DTPass(R);
-  initializeSROA_SSAUpPass(R);
-  initializeFunctionAttrsPass(R);
-  initializeGlobalsModRefPass(R);
-  initializeLICMPass(R);
-  initializeGVNPass(R);
-  initializeMemCpyOptPass(R);
-  initializeDCEPass(R);
-  initializeCFGSimplifyPassPass(R);
-}
-
-bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) {
-  bool ret = _linker.linkInModule(mod->getLLVVMModule(), &errMsg);
-
-  const std::vector<const char*> &undefs = mod->getAsmUndefinedRefs();
-  for (int i = 0, e = undefs.size(); i != e; ++i)
-    _asmUndefinedRefs[undefs[i]] = 1;
-
-  return !ret;
-}
-
-void LTOCodeGenerator::setDebugInfo(lto_debug_model debug) {
-  switch (debug) {
-  case LTO_DEBUG_MODEL_NONE:
-    _emitDwarfDebugInfo = false;
-    return;
-
-  case LTO_DEBUG_MODEL_DWARF:
-    _emitDwarfDebugInfo = true;
-    return;
-  }
-  llvm_unreachable("Unknown debug format!");
-}
-
-void LTOCodeGenerator::setCodePICModel(lto_codegen_model model) {
-  switch (model) {
-  case LTO_CODEGEN_PIC_MODEL_STATIC:
-  case LTO_CODEGEN_PIC_MODEL_DYNAMIC:
-  case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC:
-    _codeModel = model;
-    return;
-  }
-  llvm_unreachable("Unknown PIC model!");
-}
-
-bool LTOCodeGenerator::writeMergedModules(const char *path,
-                                          std::string &errMsg) {
-  if (!determineTarget(errMsg))
-    return false;
-
-  // Run the verifier on the merged modules.
-  PassManager passes;
-  passes.add(createVerifierPass());
-  passes.run(*_linker.getModule());
-
-  // create output file
-  std::string ErrInfo;
-  tool_output_file Out(path, ErrInfo, sys::fs::F_Binary);
-  if (!ErrInfo.empty()) {
-    errMsg = "could not open bitcode file for writing: ";
-    errMsg += path;
-    return false;
-  }
-
-  // write bitcode to it
-  WriteBitcodeToFile(_linker.getModule(), Out.os());
-  Out.os().close();
-
-  if (Out.os().has_error()) {
-    errMsg = "could not write bitcode file: ";
-    errMsg += path;
-    Out.os().clear_error();
-    return false;
-  }
-
-  Out.keep();
-  return true;
-}
-
-bool LTOCodeGenerator::compile_to_file(const char** name, std::string& errMsg) {
-  // make unique temp .o file to put generated object file
-  SmallString<128> Filename;
-  int FD;
-  error_code EC = sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename);
-  if (EC) {
-    errMsg = EC.message();
-    return false;
-  }
-
-  // generate object file
-  tool_output_file objFile(Filename.c_str(), FD);
-
-  bool genResult = generateObjectFile(objFile.os(), errMsg);
-  objFile.os().close();
-  if (objFile.os().has_error()) {
-    objFile.os().clear_error();
-    sys::fs::remove(Twine(Filename));
-    return false;
-  }
-
-  objFile.keep();
-  if (!genResult) {
-    sys::fs::remove(Twine(Filename));
-    return false;
-  }
-
-  _nativeObjectPath = Filename.c_str();
-  *name = _nativeObjectPath.c_str();
-  return true;
-}
-
-const void* LTOCodeGenerator::compile(size_t* length, std::string& errMsg) {
-  const char *name;
-  if (!compile_to_file(&name, errMsg))
-    return NULL;
-
-  // remove old buffer if compile() called twice
-  delete _nativeObjectFile;
-
-  // read .o file into memory buffer
-  OwningPtr<MemoryBuffer> BuffPtr;
-  if (error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) {
-    errMsg = ec.message();
-    sys::fs::remove(_nativeObjectPath);
-    return NULL;
-  }
-  _nativeObjectFile = BuffPtr.take();
-
-  // remove temp files
-  sys::fs::remove(_nativeObjectPath);
-
-  // return buffer, unless error
-  if (_nativeObjectFile == NULL)
-    return NULL;
-  *length = _nativeObjectFile->getBufferSize();
-  return _nativeObjectFile->getBufferStart();
-}
-
-bool LTOCodeGenerator::determineTarget(std::string &errMsg) {
-  if (_target != NULL)
-    return true;
-
-  // if options were requested, set them
-  if (!_codegenOptions.empty())
-    cl::ParseCommandLineOptions(_codegenOptions.size(),
-                                const_cast<char **>(&_codegenOptions[0]));
-
-  std::string TripleStr = _linker.getModule()->getTargetTriple();
-  if (TripleStr.empty())
-    TripleStr = sys::getDefaultTargetTriple();
-  llvm::Triple Triple(TripleStr);
-
-  // create target machine from info for merged modules
-  const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
-  if (march == NULL)
-    return false;
-
-  // The relocation model is actually a static member of TargetMachine and
-  // needs to be set before the TargetMachine is instantiated.
-  Reloc::Model RelocModel = Reloc::Default;
-  switch (_codeModel) {
-  case LTO_CODEGEN_PIC_MODEL_STATIC:
-    RelocModel = Reloc::Static;
-    break;
-  case LTO_CODEGEN_PIC_MODEL_DYNAMIC:
-    RelocModel = Reloc::PIC_;
-    break;
-  case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC:
-    RelocModel = Reloc::DynamicNoPIC;
-    break;
-  }
-
-  // construct LTOModule, hand over ownership of module and target
-  SubtargetFeatures Features;
-  Features.getDefaultSubtargetFeatures(Triple);
-  std::string FeatureStr = Features.getString();
-  // Set a default CPU for Darwin triples.
-  if (_mCpu.empty() && Triple.isOSDarwin()) {
-    if (Triple.getArch() == llvm::Triple::x86_64)
-      _mCpu = "core2";
-    else if (Triple.getArch() == llvm::Triple::x86)
-      _mCpu = "yonah";
-  }
-  TargetOptions Options;
-  LTOModule::getTargetOptions(Options);
-  _target = march->createTargetMachine(TripleStr, _mCpu, FeatureStr, Options,
-                                       RelocModel, CodeModel::Default,
-                                       CodeGenOpt::Aggressive);
-  return true;
-}
-
-void LTOCodeGenerator::
-applyRestriction(GlobalValue &GV,
-                 std::vector<const char*> &mustPreserveList,
-                 SmallPtrSet<GlobalValue*, 8> &asmUsed,
-                 Mangler &mangler) {
-  SmallString<64> Buffer;
-  mangler.getNameWithPrefix(Buffer, &GV, false);
-
-  if (GV.isDeclaration())
-    return;
-  if (_mustPreserveSymbols.count(Buffer))
-    mustPreserveList.push_back(GV.getName().data());
-  if (_asmUndefinedRefs.count(Buffer))
-    asmUsed.insert(&GV);
-}
-
-static void findUsedValues(GlobalVariable *LLVMUsed,
-                           SmallPtrSet<GlobalValue*, 8> &UsedValues) {
-  if (LLVMUsed == 0) return;
-
-  ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
-  for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
-    if (GlobalValue *GV =
-        dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
-      UsedValues.insert(GV);
-}
-
-void LTOCodeGenerator::applyScopeRestrictions() {
-  if (_scopeRestrictionsDone) return;
-  Module *mergedModule = _linker.getModule();
-
-  // Start off with a verification pass.
-  PassManager passes;
-  passes.add(createVerifierPass());
-
-  // mark which symbols can not be internalized
-  MCContext Context(_target->getMCAsmInfo(), _target->getRegisterInfo(), NULL);
-  Mangler mangler(Context, _target);
-  std::vector<const char*> mustPreserveList;
-  SmallPtrSet<GlobalValue*, 8> asmUsed;
-
-  for (Module::iterator f = mergedModule->begin(),
-         e = mergedModule->end(); f != e; ++f)
-    applyRestriction(*f, mustPreserveList, asmUsed, mangler);
-  for (Module::global_iterator v = mergedModule->global_begin(),
-         e = mergedModule->global_end(); v !=  e; ++v)
-    applyRestriction(*v, mustPreserveList, asmUsed, mangler);
-  for (Module::alias_iterator a = mergedModule->alias_begin(),
-         e = mergedModule->alias_end(); a != e; ++a)
-    applyRestriction(*a, mustPreserveList, asmUsed, mangler);
-
-  GlobalVariable *LLVMCompilerUsed =
-    mergedModule->getGlobalVariable("llvm.compiler.used");
-  findUsedValues(LLVMCompilerUsed, asmUsed);
-  if (LLVMCompilerUsed)
-    LLVMCompilerUsed->eraseFromParent();
-
-  if (!asmUsed.empty()) {
-    llvm::Type *i8PTy = llvm::Type::getInt8PtrTy(_context);
-    std::vector<Constant*> asmUsed2;
-    for (SmallPtrSet<GlobalValue*, 16>::const_iterator i = asmUsed.begin(),
-           e = asmUsed.end(); i !=e; ++i) {
-      GlobalValue *GV = *i;
-      Constant *c = ConstantExpr::getBitCast(GV, i8PTy);
-      asmUsed2.push_back(c);
-    }
-
-    llvm::ArrayType *ATy = llvm::ArrayType::get(i8PTy, asmUsed2.size());
-    LLVMCompilerUsed =
-      new llvm::GlobalVariable(*mergedModule, ATy, false,
-                               llvm::GlobalValue::AppendingLinkage,
-                               llvm::ConstantArray::get(ATy, asmUsed2),
-                               "llvm.compiler.used");
-
-    LLVMCompilerUsed->setSection("llvm.metadata");
-  }
-
-  passes.add(createInternalizePass(mustPreserveList));
-
-  // apply scope restrictions
-  passes.run(*mergedModule);
-
-  _scopeRestrictionsDone = true;
-}
-
-/// Optimize merged modules using various IPO passes
-bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
-                                          std::string &errMsg) {
-  if (!this->determineTarget(errMsg))
-    return false;
-
-  Module* mergedModule = _linker.getModule();
-
-  // Mark which symbols can not be internalized
-  this->applyScopeRestrictions();
-
-  // Instantiate the pass manager to organize the passes.
-  PassManager passes;
-
-  // Start off with a verification pass.
-  passes.add(createVerifierPass());
-
-  // Add an appropriate DataLayout instance for this module...
-  passes.add(new DataLayout(*_target->getDataLayout()));
-  _target->addAnalysisPasses(passes);
-
-  // Enabling internalize here would use its AllButMain variant. It
-  // keeps only main if it exists and does nothing for libraries. Instead
-  // we create the pass ourselves with the symbol list provided by the linker.
-  if (!DisableOpt)
-    PassManagerBuilder().populateLTOPassManager(passes,
-                                              /*Internalize=*/false,
-                                              !DisableInline,
-                                              DisableGVNLoadPRE);
-
-  // Make sure everything is still good.
-  passes.add(createVerifierPass());
-
-  PassManager codeGenPasses;
-
-  codeGenPasses.add(new DataLayout(*_target->getDataLayout()));
-  _target->addAnalysisPasses(codeGenPasses);
-
-  formatted_raw_ostream Out(out);
-
-  // If the bitcode files contain ARC code and were compiled with optimization,
-  // the ObjCARCContractPass must be run, so do it unconditionally here.
-  codeGenPasses.add(createObjCARCContractPass());
-
-  if (_target->addPassesToEmitFile(codeGenPasses, Out,
-                                   TargetMachine::CGFT_ObjectFile)) {
-    errMsg = "target file type not supported";
-    return false;
-  }
-
-  // Run our queue of passes all at once now, efficiently.
-  passes.run(*mergedModule);
-
-  // Run the code generator, and write assembly file
-  codeGenPasses.run(*mergedModule);
-
-  return true;
-}
-
-/// setCodeGenDebugOptions - Set codegen debugging options to aid in debugging
-/// LTO problems.
-void LTOCodeGenerator::setCodeGenDebugOptions(const char *options) {
-  for (std::pair<StringRef, StringRef> o = getToken(options);
-       !o.first.empty(); o = getToken(o.second)) {
-    // ParseCommandLineOptions() expects argv[0] to be program name. Lazily add
-    // that.
-    if (_codegenOptions.empty())
-      _codegenOptions.push_back(strdup("libLTO"));
-    _codegenOptions.push_back(strdup(o.first.str().c_str()));
-  }
-}
diff --git a/tools/lto/LTOCodeGenerator.h b/tools/lto/LTOCodeGenerator.h
deleted file mode 100644
index 8f37cf0..0000000
--- a/tools/lto/LTOCodeGenerator.h
+++ /dev/null
@@ -1,132 +0,0 @@
-//===-LTOCodeGenerator.h - LLVM Link Time Optimizer -----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the LTOCodeGenerator class.
-//
-//   LTO compilation consists of three phases: Pre-IPO, IPO and Post-IPO. 
-//
-//   The Pre-IPO phase compiles source code into bitcode file. The resulting
-// bitcode files, along with object files and libraries, will be fed to the
-// linker to through the IPO and Post-IPO phases. By using obj-file extension,
-// the resulting bitcode file disguises itself as an object file, and therefore
-// obviates the need of writing a special set of the make-rules only for LTO
-// compilation.
-//
-//   The IPO phase perform inter-procedural analyses and optimizations, and
-// the Post-IPO consists two sub-phases: intra-procedural scalar optimizations
-// (SOPT), and intra-procedural target-dependent code generator (CG).
-// 
-//   As of this writing, we don't separate IPO and the Post-IPO SOPT. They
-// are intermingled together, and are driven by a single pass manager (see
-// PassManagerBuilder::populateLTOPassManager()).
-// 
-//   The "LTOCodeGenerator" is the driver for the IPO and Post-IPO stages. 
-// The "CodeGenerator" here is bit confusing. Don't confuse the "CodeGenerator"
-// with the machine specific code generator.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LTO_CODE_GENERATOR_H
-#define LTO_CODE_GENERATOR_H
-
-#include "llvm-c/lto.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/Linker.h"
-#include <string>
-#include <vector>
-
-namespace llvm {
-  class LLVMContext;
-  class GlobalValue;
-  class Mangler;
-  class MemoryBuffer;
-  class TargetMachine;
-  class raw_ostream;
-}
-
-//===----------------------------------------------------------------------===//
-/// LTOCodeGenerator - C++ class which implements the opaque lto_code_gen_t
-/// type.
-///
-struct LTOCodeGenerator {
-  static const char *getVersionString();
-
-  LTOCodeGenerator();
-  ~LTOCodeGenerator();
-
-  // Merge given module, return true on success.
-  bool addModule(struct LTOModule*, std::string &errMsg);
-
-  void setDebugInfo(lto_debug_model);
-  void setCodePICModel(lto_codegen_model);
-
-  void setCpu(const char* mCpu) { _mCpu = mCpu; }
-
-  void addMustPreserveSymbol(const char* sym) {
-    _mustPreserveSymbols[sym] = 1;
-  }
-
-  // To pass options to the driver and optimization passes. These options are
-  // not necessarily for debugging purpose (The function name is misleading).
-  // This function should be called before LTOCodeGenerator::compilexxx(),
-  // and LTOCodeGenerator::writeMergedModules().
-  //
-  void setCodeGenDebugOptions(const char *opts);
-
-  // Write the merged module to the file specified by the given path.
-  // Return true on success.
-  bool writeMergedModules(const char *path, std::string &errMsg);
-
-  // Compile the merged module into a *single* object file; the path to object
-  // file is returned to the caller via argument "name". Return true on
-  // success.
-  //
-  // NOTE that it is up to the linker to remove the intermediate object file.
-  //  Do not try to remove the object file in LTOCodeGenerator's destructor
-  //  as we don't who (LTOCodeGenerator or the obj file) will last longer.
-  // 
-  bool compile_to_file(const char **name, std::string &errMsg);
-
-  // As with compile_to_file(), this function compiles the merged module into
-  // single object file. Instead of returning the object-file-path to the caller
-  // (linker), it brings the object to a buffer, and return the buffer to the
-  // caller. This function should delete intermediate object file once its content
-  // is brought to memory. Return NULL is the compilation was not successful. 
-  //
-  const void *compile(size_t *length, std::string &errMsg);
-
-private:
-  void initializeLTOPasses();
-
-  bool generateObjectFile(llvm::raw_ostream &out, std::string &errMsg);
-  void applyScopeRestrictions();
-  void applyRestriction(llvm::GlobalValue &GV,
-                        std::vector<const char*> &mustPreserveList,
-                        llvm::SmallPtrSet<llvm::GlobalValue*, 8> &asmUsed,
-                        llvm::Mangler &mangler);
-  bool determineTarget(std::string &errMsg);
-
-  typedef llvm::StringMap<uint8_t> StringSet;
-
-  llvm::LLVMContext&          _context;
-  llvm::Linker                _linker;
-  llvm::TargetMachine*        _target;
-  bool                        _emitDwarfDebugInfo;
-  bool                        _scopeRestrictionsDone;
-  lto_codegen_model           _codeModel;
-  StringSet                   _mustPreserveSymbols;
-  StringSet                   _asmUndefinedRefs;
-  llvm::MemoryBuffer*         _nativeObjectFile;
-  std::vector<char*>          _codegenOptions;
-  std::string                 _mCpu;
-  std::string                 _nativeObjectPath;
-};
-
-#endif // LTO_CODE_GENERATOR_H
diff --git a/tools/lto/LTOModule.cpp b/tools/lto/LTOModule.cpp
deleted file mode 100644
index e89733f..0000000
--- a/tools/lto/LTOModule.cpp
+++ /dev/null
@@ -1,902 +0,0 @@
-//===-- LTOModule.cpp - LLVM Link Time Optimizer --------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Link Time Optimization library. This library is
-// intended to be used by linker to optimize code at link time.
-//
-//===----------------------------------------------------------------------===//
-
-#include "LTOModule.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCParser/MCAsmParser.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/system_error.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-using namespace llvm;
-
-static cl::opt<bool>
-EnableFPMAD("enable-fp-mad",
-  cl::desc("Enable less precise MAD instructions to be generated"),
-  cl::init(false));
-
-static cl::opt<bool>
-DisableFPElim("disable-fp-elim",
-  cl::desc("Disable frame pointer elimination optimization"),
-  cl::init(false));
-
-static cl::opt<bool>
-EnableUnsafeFPMath("enable-unsafe-fp-math",
-  cl::desc("Enable optimizations that may decrease FP precision"),
-  cl::init(false));
-
-static cl::opt<bool>
-EnableNoInfsFPMath("enable-no-infs-fp-math",
-  cl::desc("Enable FP math optimizations that assume no +-Infs"),
-  cl::init(false));
-
-static cl::opt<bool>
-EnableNoNaNsFPMath("enable-no-nans-fp-math",
-  cl::desc("Enable FP math optimizations that assume no NaNs"),
-  cl::init(false));
-
-static cl::opt<bool>
-EnableHonorSignDependentRoundingFPMath("enable-sign-dependent-rounding-fp-math",
-  cl::Hidden,
-  cl::desc("Force codegen to assume rounding mode can change dynamically"),
-  cl::init(false));
-
-static cl::opt<bool>
-GenerateSoftFloatCalls("soft-float",
-  cl::desc("Generate software floating point library calls"),
-  cl::init(false));
-
-static cl::opt<llvm::FloatABI::ABIType>
-FloatABIForCalls("float-abi",
-  cl::desc("Choose float ABI type"),
-  cl::init(FloatABI::Default),
-  cl::values(
-    clEnumValN(FloatABI::Default, "default",
-               "Target default float ABI type"),
-    clEnumValN(FloatABI::Soft, "soft",
-               "Soft float ABI (implied by -soft-float)"),
-    clEnumValN(FloatABI::Hard, "hard",
-               "Hard float ABI (uses FP registers)"),
-    clEnumValEnd));
-
-static cl::opt<llvm::FPOpFusion::FPOpFusionMode>
-FuseFPOps("fp-contract",
-  cl::desc("Enable aggresive formation of fused FP ops"),
-  cl::init(FPOpFusion::Standard),
-  cl::values(
-    clEnumValN(FPOpFusion::Fast, "fast",
-               "Fuse FP ops whenever profitable"),
-    clEnumValN(FPOpFusion::Standard, "on",
-               "Only fuse 'blessed' FP ops."),
-    clEnumValN(FPOpFusion::Strict, "off",
-               "Only fuse FP ops when the result won't be effected."),
-    clEnumValEnd));
-
-static cl::opt<bool>
-DontPlaceZerosInBSS("nozero-initialized-in-bss",
-  cl::desc("Don't place zero-initialized symbols into bss section"),
-  cl::init(false));
-
-static cl::opt<bool>
-EnableGuaranteedTailCallOpt("tailcallopt",
-  cl::desc("Turn fastcc calls into tail calls by (potentially) changing ABI."),
-  cl::init(false));
-
-static cl::opt<bool>
-DisableTailCalls("disable-tail-calls",
-  cl::desc("Never emit tail calls"),
-  cl::init(false));
-
-static cl::opt<unsigned>
-OverrideStackAlignment("stack-alignment",
-  cl::desc("Override default stack alignment"),
-  cl::init(0));
-
-static cl::opt<std::string>
-TrapFuncName("trap-func", cl::Hidden,
-  cl::desc("Emit a call to trap function rather than a trap instruction"),
-  cl::init(""));
-
-static cl::opt<bool>
-EnablePIE("enable-pie",
-  cl::desc("Assume the creation of a position independent executable."),
-  cl::init(false));
-
-static cl::opt<bool>
-SegmentedStacks("segmented-stacks",
-  cl::desc("Use segmented stacks if possible."),
-  cl::init(false));
-
-static cl::opt<bool>
-UseInitArray("use-init-array",
-  cl::desc("Use .init_array instead of .ctors."),
-  cl::init(false));
-
-LTOModule::LTOModule(llvm::Module *m, llvm::TargetMachine *t)
-  : _module(m), _target(t),
-    _context(_target->getMCAsmInfo(), _target->getRegisterInfo(), NULL),
-    _mangler(_context, t) {}
-
-/// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM
-/// bitcode.
-bool LTOModule::isBitcodeFile(const void *mem, size_t length) {
-  return sys::fs::identify_magic(StringRef((const char *)mem, length)) ==
-         sys::fs::file_magic::bitcode;
-}
-
-bool LTOModule::isBitcodeFile(const char *path) {
-  sys::fs::file_magic type;
-  if (sys::fs::identify_magic(path, type))
-    return false;
-  return type == sys::fs::file_magic::bitcode;
-}
-
-/// isBitcodeFileForTarget - Returns 'true' if the file (or memory contents) is
-/// LLVM bitcode for the specified triple.
-bool LTOModule::isBitcodeFileForTarget(const void *mem, size_t length,
-                                       const char *triplePrefix) {
-  MemoryBuffer *buffer = makeBuffer(mem, length);
-  if (!buffer)
-    return false;
-  return isTargetMatch(buffer, triplePrefix);
-}
-
-bool LTOModule::isBitcodeFileForTarget(const char *path,
-                                       const char *triplePrefix) {
-  OwningPtr<MemoryBuffer> buffer;
-  if (MemoryBuffer::getFile(path, buffer))
-    return false;
-  return isTargetMatch(buffer.take(), triplePrefix);
-}
-
-/// isTargetMatch - Returns 'true' if the memory buffer is for the specified
-/// target triple.
-bool LTOModule::isTargetMatch(MemoryBuffer *buffer, const char *triplePrefix) {
-  std::string Triple = getBitcodeTargetTriple(buffer, getGlobalContext());
-  delete buffer;
-  return strncmp(Triple.c_str(), triplePrefix, strlen(triplePrefix)) == 0;
-}
-
-/// makeLTOModule - Create an LTOModule. N.B. These methods take ownership of
-/// the buffer.
-LTOModule *LTOModule::makeLTOModule(const char *path, std::string &errMsg) {
-  OwningPtr<MemoryBuffer> buffer;
-  if (error_code ec = MemoryBuffer::getFile(path, buffer)) {
-    errMsg = ec.message();
-    return NULL;
-  }
-  return makeLTOModule(buffer.take(), errMsg);
-}
-
-LTOModule *LTOModule::makeLTOModule(int fd, const char *path,
-                                    size_t size, std::string &errMsg) {
-  return makeLTOModule(fd, path, size, 0, errMsg);
-}
-
-LTOModule *LTOModule::makeLTOModule(int fd, const char *path,
-                                    size_t map_size,
-                                    off_t offset,
-                                    std::string &errMsg) {
-  OwningPtr<MemoryBuffer> buffer;
-  if (error_code ec =
-          MemoryBuffer::getOpenFileSlice(fd, path, buffer, map_size, offset)) {
-    errMsg = ec.message();
-    return NULL;
-  }
-  return makeLTOModule(buffer.take(), errMsg);
-}
-
-LTOModule *LTOModule::makeLTOModule(const void *mem, size_t length,
-                                    std::string &errMsg) {
-  OwningPtr<MemoryBuffer> buffer(makeBuffer(mem, length));
-  if (!buffer)
-    return NULL;
-  return makeLTOModule(buffer.take(), errMsg);
-}
-
-void LTOModule::getTargetOptions(TargetOptions &Options) {
-  Options.LessPreciseFPMADOption = EnableFPMAD;
-  Options.NoFramePointerElim = DisableFPElim;
-  Options.AllowFPOpFusion = FuseFPOps;
-  Options.UnsafeFPMath = EnableUnsafeFPMath;
-  Options.NoInfsFPMath = EnableNoInfsFPMath;
-  Options.NoNaNsFPMath = EnableNoNaNsFPMath;
-  Options.HonorSignDependentRoundingFPMathOption =
-    EnableHonorSignDependentRoundingFPMath;
-  Options.UseSoftFloat = GenerateSoftFloatCalls;
-  if (FloatABIForCalls != FloatABI::Default)
-    Options.FloatABIType = FloatABIForCalls;
-  Options.NoZerosInBSS = DontPlaceZerosInBSS;
-  Options.GuaranteedTailCallOpt = EnableGuaranteedTailCallOpt;
-  Options.DisableTailCalls = DisableTailCalls;
-  Options.StackAlignmentOverride = OverrideStackAlignment;
-  Options.TrapFuncName = TrapFuncName;
-  Options.PositionIndependentExecutable = EnablePIE;
-  Options.EnableSegmentedStacks = SegmentedStacks;
-  Options.UseInitArray = UseInitArray;
-}
-
-LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
-                                    std::string &errMsg) {
-  static bool Initialized = false;
-  if (!Initialized) {
-    InitializeAllTargets();
-    InitializeAllTargetMCs();
-    InitializeAllAsmParsers();
-    Initialized = true;
-  }
-
-  // parse bitcode buffer
-  OwningPtr<Module> m(getLazyBitcodeModule(buffer, getGlobalContext(),
-                                           &errMsg));
-  if (!m) {
-    delete buffer;
-    return NULL;
-  }
-
-  std::string TripleStr = m->getTargetTriple();
-  if (TripleStr.empty())
-    TripleStr = sys::getDefaultTargetTriple();
-  llvm::Triple Triple(TripleStr);
-
-  // find machine architecture for this module
-  const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
-  if (!march)
-    return NULL;
-
-  // construct LTOModule, hand over ownership of module and target
-  SubtargetFeatures Features;
-  Features.getDefaultSubtargetFeatures(Triple);
-  std::string FeatureStr = Features.getString();
-  // Set a default CPU for Darwin triples.
-  std::string CPU;
-  if (Triple.isOSDarwin()) {
-    if (Triple.getArch() == llvm::Triple::x86_64)
-      CPU = "core2";
-    else if (Triple.getArch() == llvm::Triple::x86)
-      CPU = "yonah";
-  }
-  TargetOptions Options;
-  getTargetOptions(Options);
-  TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr,
-                                                     Options);
-  LTOModule *Ret = new LTOModule(m.take(), target);
-  if (Ret->parseSymbols(errMsg)) {
-    delete Ret;
-    return NULL;
-  }
-
-  return Ret;
-}
-
-/// makeBuffer - Create a MemoryBuffer from a memory range.
-MemoryBuffer *LTOModule::makeBuffer(const void *mem, size_t length) {
-  const char *startPtr = (const char*)mem;
-  return MemoryBuffer::getMemBuffer(StringRef(startPtr, length), "", false);
-}
-
-/// objcClassNameFromExpression - Get string that the data pointer points to.
-bool
-LTOModule::objcClassNameFromExpression(const Constant *c, std::string &name) {
-  if (const ConstantExpr *ce = dyn_cast<ConstantExpr>(c)) {
-    Constant *op = ce->getOperand(0);
-    if (GlobalVariable *gvn = dyn_cast<GlobalVariable>(op)) {
-      Constant *cn = gvn->getInitializer();
-      if (ConstantDataArray *ca = dyn_cast<ConstantDataArray>(cn)) {
-        if (ca->isCString()) {
-          name = ".objc_class_name_" + ca->getAsCString().str();
-          return true;
-        }
-      }
-    }
-  }
-  return false;
-}
-
-/// addObjCClass - Parse i386/ppc ObjC class data structure.
-void LTOModule::addObjCClass(const GlobalVariable *clgv) {
-  const ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer());
-  if (!c) return;
-
-  // second slot in __OBJC,__class is pointer to superclass name
-  std::string superclassName;
-  if (objcClassNameFromExpression(c->getOperand(1), superclassName)) {
-    NameAndAttributes info;
-    StringMap<NameAndAttributes>::value_type &entry =
-      _undefines.GetOrCreateValue(superclassName);
-    if (!entry.getValue().name) {
-      const char *symbolName = entry.getKey().data();
-      info.name = symbolName;
-      info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
-      info.isFunction = false;
-      info.symbol = clgv;
-      entry.setValue(info);
-    }
-  }
-
-  // third slot in __OBJC,__class is pointer to class name
-  std::string className;
-  if (objcClassNameFromExpression(c->getOperand(2), className)) {
-    StringSet::value_type &entry = _defines.GetOrCreateValue(className);
-    entry.setValue(1);
-
-    NameAndAttributes info;
-    info.name = entry.getKey().data();
-    info.attributes = LTO_SYMBOL_PERMISSIONS_DATA |
-      LTO_SYMBOL_DEFINITION_REGULAR | LTO_SYMBOL_SCOPE_DEFAULT;
-    info.isFunction = false;
-    info.symbol = clgv;
-    _symbols.push_back(info);
-  }
-}
-
-/// addObjCCategory - Parse i386/ppc ObjC category data structure.
-void LTOModule::addObjCCategory(const GlobalVariable *clgv) {
-  const ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer());
-  if (!c) return;
-
-  // second slot in __OBJC,__category is pointer to target class name
-  std::string targetclassName;
-  if (!objcClassNameFromExpression(c->getOperand(1), targetclassName))
-    return;
-
-  NameAndAttributes info;
-  StringMap<NameAndAttributes>::value_type &entry =
-    _undefines.GetOrCreateValue(targetclassName);
-
-  if (entry.getValue().name)
-    return;
-
-  const char *symbolName = entry.getKey().data();
-  info.name = symbolName;
-  info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
-  info.isFunction = false;
-  info.symbol = clgv;
-  entry.setValue(info);
-}
-
-/// addObjCClassRef - Parse i386/ppc ObjC class list data structure.
-void LTOModule::addObjCClassRef(const GlobalVariable *clgv) {
-  std::string targetclassName;
-  if (!objcClassNameFromExpression(clgv->getInitializer(), targetclassName))
-    return;
-
-  NameAndAttributes info;
-  StringMap<NameAndAttributes>::value_type &entry =
-    _undefines.GetOrCreateValue(targetclassName);
-  if (entry.getValue().name)
-    return;
-
-  const char *symbolName = entry.getKey().data();
-  info.name = symbolName;
-  info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
-  info.isFunction = false;
-  info.symbol = clgv;
-  entry.setValue(info);
-}
-
-/// addDefinedDataSymbol - Add a data symbol as defined to the list.
-void LTOModule::addDefinedDataSymbol(const GlobalValue *v) {
-  // Add to list of defined symbols.
-  addDefinedSymbol(v, false);
-
-  if (!v->hasSection() /* || !isTargetDarwin */)
-    return;
-
-  // Special case i386/ppc ObjC data structures in magic sections:
-  // The issue is that the old ObjC object format did some strange
-  // contortions to avoid real linker symbols.  For instance, the
-  // ObjC class data structure is allocated statically in the executable
-  // that defines that class.  That data structures contains a pointer to
-  // its superclass.  But instead of just initializing that part of the
-  // struct to the address of its superclass, and letting the static and
-  // dynamic linkers do the rest, the runtime works by having that field
-  // instead point to a C-string that is the name of the superclass.
-  // At runtime the objc initialization updates that pointer and sets
-  // it to point to the actual super class.  As far as the linker
-  // knows it is just a pointer to a string.  But then someone wanted the
-  // linker to issue errors at build time if the superclass was not found.
-  // So they figured out a way in mach-o object format to use an absolute
-  // symbols (.objc_class_name_Foo = 0) and a floating reference
-  // (.reference .objc_class_name_Bar) to cause the linker into erroring when
-  // a class was missing.
-  // The following synthesizes the implicit .objc_* symbols for the linker
-  // from the ObjC data structures generated by the front end.
-
-  // special case if this data blob is an ObjC class definition
-  if (v->getSection().compare(0, 15, "__OBJC,__class,") == 0) {
-    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
-      addObjCClass(gv);
-    }
-  }
-
-  // special case if this data blob is an ObjC category definition
-  else if (v->getSection().compare(0, 18, "__OBJC,__category,") == 0) {
-    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
-      addObjCCategory(gv);
-    }
-  }
-
-  // special case if this data blob is the list of referenced classes
-  else if (v->getSection().compare(0, 18, "__OBJC,__cls_refs,") == 0) {
-    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
-      addObjCClassRef(gv);
-    }
-  }
-}
-
-/// addDefinedFunctionSymbol - Add a function symbol as defined to the list.
-void LTOModule::addDefinedFunctionSymbol(const Function *f) {
-  // add to list of defined symbols
-  addDefinedSymbol(f, true);
-}
-
-/// addDefinedSymbol - Add a defined symbol to the list.
-void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) {
-  // ignore all llvm.* symbols
-  if (def->getName().startswith("llvm."))
-    return;
-
-  // string is owned by _defines
-  SmallString<64> Buffer;
-  _mangler.getNameWithPrefix(Buffer, def, false);
-
-  // set alignment part log2() can have rounding errors
-  uint32_t align = def->getAlignment();
-  uint32_t attr = align ? countTrailingZeros(def->getAlignment()) : 0;
-
-  // set permissions part
-  if (isFunction) {
-    attr |= LTO_SYMBOL_PERMISSIONS_CODE;
-  } else {
-    const GlobalVariable *gv = dyn_cast<GlobalVariable>(def);
-    if (gv && gv->isConstant())
-      attr |= LTO_SYMBOL_PERMISSIONS_RODATA;
-    else
-      attr |= LTO_SYMBOL_PERMISSIONS_DATA;
-  }
-
-  // set definition part
-  if (def->hasWeakLinkage() || def->hasLinkOnceLinkage() ||
-      def->hasLinkerPrivateWeakLinkage())
-    attr |= LTO_SYMBOL_DEFINITION_WEAK;
-  else if (def->hasCommonLinkage())
-    attr |= LTO_SYMBOL_DEFINITION_TENTATIVE;
-  else
-    attr |= LTO_SYMBOL_DEFINITION_REGULAR;
-
-  // set scope part
-  if (def->hasHiddenVisibility())
-    attr |= LTO_SYMBOL_SCOPE_HIDDEN;
-  else if (def->hasProtectedVisibility())
-    attr |= LTO_SYMBOL_SCOPE_PROTECTED;
-  else if (def->hasExternalLinkage() || def->hasWeakLinkage() ||
-           def->hasLinkOnceLinkage() || def->hasCommonLinkage() ||
-           def->hasLinkerPrivateWeakLinkage())
-    attr |= LTO_SYMBOL_SCOPE_DEFAULT;
-  else if (def->hasLinkOnceODRAutoHideLinkage())
-    attr |= LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN;
-  else
-    attr |= LTO_SYMBOL_SCOPE_INTERNAL;
-
-  StringSet::value_type &entry = _defines.GetOrCreateValue(Buffer);
-  entry.setValue(1);
-
-  // fill information structure
-  NameAndAttributes info;
-  StringRef Name = entry.getKey();
-  info.name = Name.data();
-  assert(info.name[Name.size()] == '\0');
-  info.attributes = attr;
-  info.isFunction = isFunction;
-  info.symbol = def;
-
-  // add to table of symbols
-  _symbols.push_back(info);
-}
-
-/// addAsmGlobalSymbol - Add a global symbol from module-level ASM to the
-/// defined list.
-void LTOModule::addAsmGlobalSymbol(const char *name,
-                                   lto_symbol_attributes scope) {
-  StringSet::value_type &entry = _defines.GetOrCreateValue(name);
-
-  // only add new define if not already defined
-  if (entry.getValue())
-    return;
-
-  entry.setValue(1);
-
-  NameAndAttributes &info = _undefines[entry.getKey().data()];
-
-  if (info.symbol == 0) {
-    // FIXME: This is trying to take care of module ASM like this:
-    //
-    //   module asm ".zerofill __FOO, __foo, _bar_baz_qux, 0"
-    //
-    // but is gross and its mother dresses it funny. Have the ASM parser give us
-    // more details for this type of situation so that we're not guessing so
-    // much.
-
-    // fill information structure
-    info.name = entry.getKey().data();
-    info.attributes =
-      LTO_SYMBOL_PERMISSIONS_DATA | LTO_SYMBOL_DEFINITION_REGULAR | scope;
-    info.isFunction = false;
-    info.symbol = 0;
-
-    // add to table of symbols
-    _symbols.push_back(info);
-    return;
-  }
-
-  if (info.isFunction)
-    addDefinedFunctionSymbol(cast<Function>(info.symbol));
-  else
-    addDefinedDataSymbol(info.symbol);
-
-  _symbols.back().attributes &= ~LTO_SYMBOL_SCOPE_MASK;
-  _symbols.back().attributes |= scope;
-}
-
-/// addAsmGlobalSymbolUndef - Add a global symbol from module-level ASM to the
-/// undefined list.
-void LTOModule::addAsmGlobalSymbolUndef(const char *name) {
-  StringMap<NameAndAttributes>::value_type &entry =
-    _undefines.GetOrCreateValue(name);
-
-  _asm_undefines.push_back(entry.getKey().data());
-
-  // we already have the symbol
-  if (entry.getValue().name)
-    return;
-
-  uint32_t attr = LTO_SYMBOL_DEFINITION_UNDEFINED;;
-  attr |= LTO_SYMBOL_SCOPE_DEFAULT;
-  NameAndAttributes info;
-  info.name = entry.getKey().data();
-  info.attributes = attr;
-  info.isFunction = false;
-  info.symbol = 0;
-
-  entry.setValue(info);
-}
-
-/// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet to a
-/// list to be resolved later.
-void
-LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) {
-  // ignore all llvm.* symbols
-  if (decl->getName().startswith("llvm."))
-    return;
-
-  // ignore all aliases
-  if (isa<GlobalAlias>(decl))
-    return;
-
-  SmallString<64> name;
-  _mangler.getNameWithPrefix(name, decl, false);
-
-  StringMap<NameAndAttributes>::value_type &entry =
-    _undefines.GetOrCreateValue(name);
-
-  // we already have the symbol
-  if (entry.getValue().name)
-    return;
-
-  NameAndAttributes info;
-
-  info.name = entry.getKey().data();
-
-  if (decl->hasExternalWeakLinkage())
-    info.attributes = LTO_SYMBOL_DEFINITION_WEAKUNDEF;
-  else
-    info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
-
-  info.isFunction = isFunc;
-  info.symbol = decl;
-
-  entry.setValue(info);
-}
-
-namespace {
-  class RecordStreamer : public MCStreamer {
-  public:
-    enum State { NeverSeen, Global, Defined, DefinedGlobal, Used };
-
-  private:
-    StringMap<State> Symbols;
-
-    void markDefined(const MCSymbol &Symbol) {
-      State &S = Symbols[Symbol.getName()];
-      switch (S) {
-      case DefinedGlobal:
-      case Global:
-        S = DefinedGlobal;
-        break;
-      case NeverSeen:
-      case Defined:
-      case Used:
-        S = Defined;
-        break;
-      }
-    }
-    void markGlobal(const MCSymbol &Symbol) {
-      State &S = Symbols[Symbol.getName()];
-      switch (S) {
-      case DefinedGlobal:
-      case Defined:
-        S = DefinedGlobal;
-        break;
-
-      case NeverSeen:
-      case Global:
-      case Used:
-        S = Global;
-        break;
-      }
-    }
-    void markUsed(const MCSymbol &Symbol) {
-      State &S = Symbols[Symbol.getName()];
-      switch (S) {
-      case DefinedGlobal:
-      case Defined:
-      case Global:
-        break;
-
-      case NeverSeen:
-      case Used:
-        S = Used;
-        break;
-      }
-    }
-
-    // FIXME: mostly copied for the obj streamer.
-    void AddValueSymbols(const MCExpr *Value) {
-      switch (Value->getKind()) {
-      case MCExpr::Target:
-        // FIXME: What should we do in here?
-        break;
-
-      case MCExpr::Constant:
-        break;
-
-      case MCExpr::Binary: {
-        const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
-        AddValueSymbols(BE->getLHS());
-        AddValueSymbols(BE->getRHS());
-        break;
-      }
-
-      case MCExpr::SymbolRef:
-        markUsed(cast<MCSymbolRefExpr>(Value)->getSymbol());
-        break;
-
-      case MCExpr::Unary:
-        AddValueSymbols(cast<MCUnaryExpr>(Value)->getSubExpr());
-        break;
-      }
-    }
-
-  public:
-    typedef StringMap<State>::const_iterator const_iterator;
-
-    const_iterator begin() {
-      return Symbols.begin();
-    }
-
-    const_iterator end() {
-      return Symbols.end();
-    }
-
-    RecordStreamer(MCContext &Context)
-        : MCStreamer(SK_RecordStreamer, Context) {}
-
-    virtual void EmitInstruction(const MCInst &Inst) {
-      // Scan for values.
-      for (unsigned i = Inst.getNumOperands(); i--; )
-        if (Inst.getOperand(i).isExpr())
-          AddValueSymbols(Inst.getOperand(i).getExpr());
-    }
-    virtual void EmitLabel(MCSymbol *Symbol) {
-      Symbol->setSection(*getCurrentSection().first);
-      markDefined(*Symbol);
-    }
-    virtual void EmitDebugLabel(MCSymbol *Symbol) {
-      EmitLabel(Symbol);
-    }
-    virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
-      // FIXME: should we handle aliases?
-      markDefined(*Symbol);
-    }
-    virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
-      if (Attribute == MCSA_Global)
-        markGlobal(*Symbol);
-    }
-    virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
-                              uint64_t Size , unsigned ByteAlignment) {
-      markDefined(*Symbol);
-    }
-    virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
-                                  unsigned ByteAlignment) {
-      markDefined(*Symbol);
-    }
-
-    virtual void EmitBundleAlignMode(unsigned AlignPow2) {}
-    virtual void EmitBundleLock(bool AlignToEnd) {}
-    virtual void EmitBundleUnlock() {}
-
-    // Noop calls.
-    virtual void ChangeSection(const MCSection *Section,
-                               const MCExpr *Subsection) {}
-    virtual void InitToTextSection() {}
-    virtual void InitSections() {}
-    virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {}
-    virtual void EmitThumbFunc(MCSymbol *Func) {}
-    virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
-    virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {}
-    virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
-    virtual void EmitCOFFSymbolStorageClass(int StorageClass) {}
-    virtual void EmitCOFFSymbolType(int Type) {}
-    virtual void EndCOFFSymbolDef() {}
-    virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
-    virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
-                                       unsigned ByteAlignment) {}
-    virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
-                                uint64_t Size, unsigned ByteAlignment) {}
-    virtual void EmitBytes(StringRef Data) {}
-    virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {}
-    virtual void EmitULEB128Value(const MCExpr *Value) {}
-    virtual void EmitSLEB128Value(const MCExpr *Value) {}
-    virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
-                                      unsigned ValueSize,
-                                      unsigned MaxBytesToEmit) {}
-    virtual void EmitCodeAlignment(unsigned ByteAlignment,
-                                   unsigned MaxBytesToEmit) {}
-    virtual bool EmitValueToOffset(const MCExpr *Offset,
-                                   unsigned char Value ) { return false; }
-    virtual void EmitFileDirective(StringRef Filename) {}
-    virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
-                                          const MCSymbol *LastLabel,
-                                          const MCSymbol *Label,
-                                          unsigned PointerSize) {}
-    virtual void FinishImpl() {}
-
-    static bool classof(const MCStreamer *S) {
-      return S->getKind() == SK_RecordStreamer;
-    }
-  };
-} // end anonymous namespace
-
-/// addAsmGlobalSymbols - Add global symbols from module-level ASM to the
-/// defined or undefined lists.
-bool LTOModule::addAsmGlobalSymbols(std::string &errMsg) {
-  const std::string &inlineAsm = _module->getModuleInlineAsm();
-  if (inlineAsm.empty())
-    return false;
-
-  OwningPtr<RecordStreamer> Streamer(new RecordStreamer(_context));
-  MemoryBuffer *Buffer = MemoryBuffer::getMemBuffer(inlineAsm);
-  SourceMgr SrcMgr;
-  SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
-  OwningPtr<MCAsmParser> Parser(createMCAsmParser(SrcMgr,
-                                                  _context, *Streamer,
-                                                  *_target->getMCAsmInfo()));
-  const Target &T = _target->getTarget();
-  OwningPtr<MCSubtargetInfo>
-    STI(T.createMCSubtargetInfo(_target->getTargetTriple(),
-                                _target->getTargetCPU(),
-                                _target->getTargetFeatureString()));
-  OwningPtr<MCTargetAsmParser> TAP(T.createMCAsmParser(*STI, *Parser.get()));
-  if (!TAP) {
-    errMsg = "target " + std::string(T.getName()) +
-      " does not define AsmParser.";
-    return true;
-  }
-
-  Parser->setTargetParser(*TAP);
-  if (Parser->Run(false))
-    return true;
-
-  for (RecordStreamer::const_iterator i = Streamer->begin(),
-         e = Streamer->end(); i != e; ++i) {
-    StringRef Key = i->first();
-    RecordStreamer::State Value = i->second;
-    if (Value == RecordStreamer::DefinedGlobal)
-      addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_DEFAULT);
-    else if (Value == RecordStreamer::Defined)
-      addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_INTERNAL);
-    else if (Value == RecordStreamer::Global ||
-             Value == RecordStreamer::Used)
-      addAsmGlobalSymbolUndef(Key.data());
-  }
-
-  return false;
-}
-
-/// isDeclaration - Return 'true' if the global value is a declaration.
-static bool isDeclaration(const GlobalValue &V) {
-  if (V.hasAvailableExternallyLinkage())
-    return true;
-
-  if (V.isMaterializable())
-    return false;
-
-  return V.isDeclaration();
-}
-
-/// parseSymbols - Parse the symbols from the module and model-level ASM and add
-/// them to either the defined or undefined lists.
-bool LTOModule::parseSymbols(std::string &errMsg) {
-  // add functions
-  for (Module::iterator f = _module->begin(), e = _module->end(); f != e; ++f) {
-    if (isDeclaration(*f))
-      addPotentialUndefinedSymbol(f, true);
-    else
-      addDefinedFunctionSymbol(f);
-  }
-
-  // add data
-  for (Module::global_iterator v = _module->global_begin(),
-         e = _module->global_end(); v !=  e; ++v) {
-    if (isDeclaration(*v))
-      addPotentialUndefinedSymbol(v, false);
-    else
-      addDefinedDataSymbol(v);
-  }
-
-  // add asm globals
-  if (addAsmGlobalSymbols(errMsg))
-    return true;
-
-  // add aliases
-  for (Module::alias_iterator a = _module->alias_begin(),
-         e = _module->alias_end(); a != e; ++a) {
-    if (isDeclaration(*a->getAliasedGlobal()))
-      // Is an alias to a declaration.
-      addPotentialUndefinedSymbol(a, false);
-    else
-      addDefinedDataSymbol(a);
-  }
-
-  // make symbols for all undefines
-  for (StringMap<NameAndAttributes>::iterator u =_undefines.begin(),
-         e = _undefines.end(); u != e; ++u) {
-    // If this symbol also has a definition, then don't make an undefine because
-    // it is a tentative definition.
-    if (_defines.count(u->getKey())) continue;
-    NameAndAttributes info = u->getValue();
-    _symbols.push_back(info);
-  }
-
-  return false;
-}
diff --git a/tools/lto/LTOModule.h b/tools/lto/LTOModule.h
deleted file mode 100644
index 902e9c5..0000000
--- a/tools/lto/LTOModule.h
+++ /dev/null
@@ -1,190 +0,0 @@
-//===-LTOModule.h - LLVM Link Time Optimizer ------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the LTOModule class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LTO_MODULE_H
-#define LTO_MODULE_H
-
-#include "llvm-c/lto.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetMachine.h"
-#include <string>
-#include <vector>
-
-// Forward references to llvm classes.
-namespace llvm {
-  class Function;
-  class GlobalValue;
-  class MemoryBuffer;
-  class TargetOptions;
-  class Value;
-}
-
-//===----------------------------------------------------------------------===//
-/// LTOModule - C++ class which implements the opaque lto_module_t type.
-///
-struct LTOModule {
-private:
-  typedef llvm::StringMap<uint8_t> StringSet;
-
-  struct NameAndAttributes {
-    const char        *name;
-    uint32_t           attributes;
-    bool               isFunction;
-    const llvm::GlobalValue *symbol;
-  };
-
-  llvm::OwningPtr<llvm::Module>           _module;
-  llvm::OwningPtr<llvm::TargetMachine>    _target;
-  std::vector<NameAndAttributes>          _symbols;
-
-  // _defines and _undefines only needed to disambiguate tentative definitions
-  StringSet                               _defines;
-  llvm::StringMap<NameAndAttributes>      _undefines;
-  std::vector<const char*>                _asm_undefines;
-  llvm::MCContext                         _context;
-
-  // Use mangler to add GlobalPrefix to names to match linker names.
-  llvm::Mangler                           _mangler;
-
-  LTOModule(llvm::Module *m, llvm::TargetMachine *t);
-public:
-  /// isBitcodeFile - Returns 'true' if the file or memory contents is LLVM
-  /// bitcode.
-  static bool isBitcodeFile(const void *mem, size_t length);
-  static bool isBitcodeFile(const char *path);
-
-  /// isBitcodeFileForTarget - Returns 'true' if the file or memory contents
-  /// is LLVM bitcode for the specified triple.
-  static bool isBitcodeFileForTarget(const void *mem,
-                                     size_t length,
-                                     const char *triplePrefix);
-  static bool isBitcodeFileForTarget(const char *path,
-                                     const char *triplePrefix);
-
-  /// makeLTOModule - Create an LTOModule. N.B. These methods take ownership
-  /// of the buffer.
-  static LTOModule *makeLTOModule(const char* path,
-                                  std::string &errMsg);
-  static LTOModule *makeLTOModule(int fd, const char *path,
-                                  size_t size, std::string &errMsg);
-  static LTOModule *makeLTOModule(int fd, const char *path,
-                                  size_t map_size,
-                                  off_t offset,
-                                  std::string& errMsg);
-  static LTOModule *makeLTOModule(const void *mem, size_t length,
-                                  std::string &errMsg);
-
-  /// getTargetTriple - Return the Module's target triple.
-  const char *getTargetTriple() {
-    return _module->getTargetTriple().c_str();
-  }
-
-  /// setTargetTriple - Set the Module's target triple.
-  void setTargetTriple(const char *triple) {
-    _module->setTargetTriple(triple);
-  }
-
-  /// getSymbolCount - Get the number of symbols
-  uint32_t getSymbolCount() {
-    return _symbols.size();
-  }
-
-  /// getSymbolAttributes - Get the attributes for a symbol at the specified
-  /// index.
-  lto_symbol_attributes getSymbolAttributes(uint32_t index) {
-    if (index < _symbols.size())
-      return lto_symbol_attributes(_symbols[index].attributes);
-    return lto_symbol_attributes(0);
-  }
-
-  /// getSymbolName - Get the name of the symbol at the specified index.
-  const char *getSymbolName(uint32_t index) {
-    if (index < _symbols.size())
-      return _symbols[index].name;
-    return NULL;
-  }
-
-  /// getLLVVMModule - Return the Module.
-  llvm::Module *getLLVVMModule() { return _module.get(); }
-
-  /// getAsmUndefinedRefs -
-  const std::vector<const char*> &getAsmUndefinedRefs() {
-    return _asm_undefines;
-  }
-
-  /// getTargetOptions - Fill the TargetOptions object with the options
-  /// specified on the command line.
-  static void getTargetOptions(llvm::TargetOptions &Options);
-
-private:
-  /// parseSymbols - Parse the symbols from the module and model-level ASM and
-  /// add them to either the defined or undefined lists.
-  bool parseSymbols(std::string &errMsg);
-
-  /// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet
-  /// to a list to be resolved later.
-  void addPotentialUndefinedSymbol(const llvm::GlobalValue *dcl, bool isFunc);
-
-  /// addDefinedSymbol - Add a defined symbol to the list.
-  void addDefinedSymbol(const llvm::GlobalValue *def, bool isFunction);
-
-  /// addDefinedFunctionSymbol - Add a function symbol as defined to the list.
-  void addDefinedFunctionSymbol(const llvm::Function *f);
-
-  /// addDefinedDataSymbol - Add a data symbol as defined to the list.
-  void addDefinedDataSymbol(const llvm::GlobalValue *v);
-
-  /// addAsmGlobalSymbols - Add global symbols from module-level ASM to the
-  /// defined or undefined lists.
-  bool addAsmGlobalSymbols(std::string &errMsg);
-
-  /// addAsmGlobalSymbol - Add a global symbol from module-level ASM to the
-  /// defined list.
-  void addAsmGlobalSymbol(const char *, lto_symbol_attributes scope);
-
-  /// addAsmGlobalSymbolUndef - Add a global symbol from module-level ASM to
-  /// the undefined list.
-  void addAsmGlobalSymbolUndef(const char *);
-
-  /// addObjCClass - Parse i386/ppc ObjC class data structure.
-  void addObjCClass(const llvm::GlobalVariable *clgv);
-
-  /// addObjCCategory - Parse i386/ppc ObjC category data structure.
-  void addObjCCategory(const llvm::GlobalVariable *clgv);
-
-  /// addObjCClassRef - Parse i386/ppc ObjC class list data structure.
-  void addObjCClassRef(const llvm::GlobalVariable *clgv);
-
-  /// objcClassNameFromExpression - Get string that the data pointer points
-  /// to.
-  bool objcClassNameFromExpression(const llvm::Constant* c, std::string &name);
-
-  /// isTargetMatch - Returns 'true' if the memory buffer is for the specified
-  /// target triple.
-  static bool isTargetMatch(llvm::MemoryBuffer *memBuffer,
-                            const char *triplePrefix);
-
-  /// makeLTOModule - Create an LTOModule (private version). N.B. This
-  /// method takes ownership of the buffer.
-  static LTOModule *makeLTOModule(llvm::MemoryBuffer *buffer,
-                                  std::string &errMsg);
-
-  /// makeBuffer - Create a MemoryBuffer from a memory range.
-  static llvm::MemoryBuffer *makeBuffer(const void *mem, size_t length);
-};
-
-#endif // LTO_MODULE_H
diff --git a/tools/lto/Makefile b/tools/lto/Makefile
index 56c67df..cedbee1 100644
--- a/tools/lto/Makefile
+++ b/tools/lto/Makefile
@@ -10,7 +10,7 @@
 LEVEL := ../..
 LIBRARYNAME := LTO
 LINK_COMPONENTS := all-targets ipo scalaropts linker bitreader bitwriter \
-                   mcdisassembler vectorize
+                   lto mcdisassembler vectorize
 LINK_LIBS_IN_SHARED := 1
 SHARED_LIBRARY := 1
 
@@ -46,7 +46,7 @@ ifeq ($(HOST_OS),Darwin)
     ifneq ($(DARWIN_VERS),8)
        LLVMLibsOptions    := $(LLVMLibsOptions)  \
                             -Wl,-install_name \
-                            -Wl,"@rpath/lib$(LIBRARYNAME)$(SHLIBEXT)"
+                            -Wl,"@executable_path/../lib/lib$(LIBRARYNAME)$(SHLIBEXT)"
     endif
 
     # If we're doing an Apple-style build, add the LTO object path.
diff --git a/tools/lto/lto.cpp b/tools/lto/lto.cpp
index db7147c..7bfddcd 100644
--- a/tools/lto/lto.cpp
+++ b/tools/lto/lto.cpp
@@ -13,15 +13,71 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm-c/lto.h"
-#include "LTOCodeGenerator.h"
-#include "LTOModule.h"
+#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/LTO/LTOCodeGenerator.h"
+#include "llvm/LTO/LTOModule.h"
 #include "llvm-c/Core.h"
+#include "llvm-c/Target.h"
 
+// extra command-line flags needed for LTOCodeGenerator
+static cl::opt<bool>
+DisableOpt("disable-opt", cl::init(false),
+  cl::desc("Do not run any optimization passes"));
+
+static cl::opt<bool>
+DisableInline("disable-inlining", cl::init(false),
+  cl::desc("Do not run the inliner pass"));
+
+static cl::opt<bool>
+DisableGVNLoadPRE("disable-gvn-loadpre", cl::init(false),
+  cl::desc("Do not run the GVN load PRE pass"));
 
 // Holds most recent error string.
 // *** Not thread safe ***
 static std::string sLastErrorString;
 
+// Holds the initialization state of the LTO module.
+// *** Not thread safe ***
+static bool initialized = false;
+
+// Holds the command-line option parsing state of the LTO module.
+static bool parsedOptions = false;
+
+// Initialize the configured targets if they have not been initialized.
+static void lto_initialize() {
+  if (!initialized) {
+    LLVMInitializeAllTargetInfos();
+    LLVMInitializeAllTargets();
+    LLVMInitializeAllTargetMCs();
+    LLVMInitializeAllAsmParsers();
+    LLVMInitializeAllAsmPrinters();
+    LLVMInitializeAllDisassemblers();
+    initialized = true;
+  }
+}
+
+static void lto_set_target_options(llvm::TargetOptions &Options) {
+  Options.LessPreciseFPMADOption = EnableFPMAD;
+  Options.NoFramePointerElim = DisableFPElim;
+  Options.AllowFPOpFusion = FuseFPOps;
+  Options.UnsafeFPMath = EnableUnsafeFPMath;
+  Options.NoInfsFPMath = EnableNoInfsFPMath;
+  Options.NoNaNsFPMath = EnableNoNaNsFPMath;
+  Options.HonorSignDependentRoundingFPMathOption =
+    EnableHonorSignDependentRoundingFPMath;
+  Options.UseSoftFloat = GenerateSoftFloatCalls;
+  if (FloatABIForCalls != llvm::FloatABI::Default)
+    Options.FloatABIType = FloatABIForCalls;
+  Options.NoZerosInBSS = DontPlaceZerosInBSS;
+  Options.GuaranteedTailCallOpt = EnableGuaranteedTailCallOpt;
+  Options.DisableTailCalls = DisableTailCalls;
+  Options.StackAlignmentOverride = OverrideStackAlignment;
+  Options.TrapFuncName = TrapFuncName;
+  Options.PositionIndependentExecutable = EnablePIE;
+  Options.EnableSegmentedStacks = SegmentedStacks;
+  Options.UseInitArray = UseInitArray;
+}
+
 /// lto_get_version - Returns a printable string.
 extern const char* lto_get_version() {
   return LTOCodeGenerator::getVersionString();
@@ -63,13 +119,19 @@ lto_module_is_object_file_in_memory_for_target(const void* mem,
 /// lto_module_create - Loads an object file from disk. Returns NULL on error
 /// (check lto_get_error_message() for details).
 lto_module_t lto_module_create(const char* path) {
-  return LTOModule::makeLTOModule(path, sLastErrorString);
+  lto_initialize();
+  llvm::TargetOptions Options;
+  lto_set_target_options(Options);
+  return LTOModule::makeLTOModule(path, Options, sLastErrorString);
 }
 
 /// lto_module_create_from_fd - Loads an object file from disk. Returns NULL on
 /// error (check lto_get_error_message() for details).
 lto_module_t lto_module_create_from_fd(int fd, const char *path, size_t size) {
-  return LTOModule::makeLTOModule(fd, path, size, sLastErrorString);
+  lto_initialize();
+  llvm::TargetOptions Options;
+  lto_set_target_options(Options);
+  return LTOModule::makeLTOModule(fd, path, size, Options, sLastErrorString);
 }
 
 /// lto_module_create_from_fd_at_offset - Loads an object file from disk.
@@ -78,13 +140,20 @@ lto_module_t lto_module_create_from_fd_at_offset(int fd, const char *path,
                                                  size_t file_size,
                                                  size_t map_size,
                                                  off_t offset) {
-  return LTOModule::makeLTOModule(fd, path, map_size, offset, sLastErrorString);
+  lto_initialize();
+  llvm::TargetOptions Options;
+  lto_set_target_options(Options);
+  return LTOModule::makeLTOModule(fd, path, map_size, offset, Options,
+                                  sLastErrorString);
 }
 
 /// lto_module_create_from_memory - Loads an object file from memory. Returns
 /// NULL on error (check lto_get_error_message() for details).
 lto_module_t lto_module_create_from_memory(const void* mem, size_t length) {
-  return LTOModule::makeLTOModule(mem, length, sLastErrorString);
+  lto_initialize();
+  llvm::TargetOptions Options;
+  lto_set_target_options(Options);
+  return LTOModule::makeLTOModule(mem, length, Options, sLastErrorString);
 }
 
 /// lto_module_dispose - Frees all memory for a module. Upon return the
@@ -127,7 +196,15 @@ lto_symbol_attributes lto_module_get_symbol_attribute(lto_module_t mod,
 /// lto_codegen_create - Instantiates a code generator. Returns NULL if there
 /// is an error.
 lto_code_gen_t lto_codegen_create(void) {
-  return new LTOCodeGenerator();
+  lto_initialize();
+
+  TargetOptions Options;
+  lto_set_target_options(Options);
+
+  LTOCodeGenerator *CodeGen = new LTOCodeGenerator();
+  if (CodeGen)
+    CodeGen->setTargetOptions(Options);
+  return CodeGen;
 }
 
 /// lto_codegen_dispose - Frees all memory for a code generator. Upon return the
@@ -187,6 +264,10 @@ void lto_codegen_add_must_preserve_symbol(lto_code_gen_t cg,
 /// that contains the merged contents of all modules added so far. Returns true
 /// on error (check lto_get_error_message() for details).
 bool lto_codegen_write_merged_modules(lto_code_gen_t cg, const char *path) {
+  if (!parsedOptions) {
+    cg->parseCodeGenDebugOptions();
+    parsedOptions = true;
+  }
   return !cg->writeMergedModules(path, sLastErrorString);
 }
 
@@ -197,14 +278,24 @@ bool lto_codegen_write_merged_modules(lto_code_gen_t cg, const char *path) {
 /// lto_codegen_compile() is called again. On failure, returns NULL (check
 /// lto_get_error_message() for details).
 const void *lto_codegen_compile(lto_code_gen_t cg, size_t *length) {
-  return cg->compile(length, sLastErrorString);
+  if (!parsedOptions) {
+    cg->parseCodeGenDebugOptions();
+    parsedOptions = true;
+  }
+  return cg->compile(length, DisableOpt, DisableInline, DisableGVNLoadPRE,
+                     sLastErrorString);
 }
 
 /// lto_codegen_compile_to_file - Generates code for all added modules into one
 /// native object file. The name of the file is written to name. Returns true on
 /// error.
 bool lto_codegen_compile_to_file(lto_code_gen_t cg, const char **name) {
-  return !cg->compile_to_file(name, sLastErrorString);
+  if (!parsedOptions) {
+    cg->parseCodeGenDebugOptions();
+    parsedOptions = true;
+  }
+  return !cg->compile_to_file(name, DisableOpt, DisableInline, DisableGVNLoadPRE,
+                              sLastErrorString);
 }
 
 /// lto_codegen_debug_options - Used to pass extra options to the code
diff --git a/tools/macho-dump/macho-dump.cpp b/tools/macho-dump/macho-dump.cpp
index 897a785..0dfbd5f 100644
--- a/tools/macho-dump/macho-dump.cpp
+++ b/tools/macho-dump/macho-dump.cpp
@@ -97,12 +97,12 @@ static int DumpSectionData(const MachOObjectFile &Obj, unsigned Index,
   outs() << "  ('_relocations', [\n";
   unsigned RelNum = 0;
   error_code EC;
-  for (relocation_iterator I = Obj.getSectionRelBegin(Index),
-         E = Obj.getSectionRelEnd(Index); I != E; I.increment(EC), ++RelNum) {
-    macho::RelocationEntry RE = Obj.getRelocation(I->getRawDataRefImpl());
+  for (relocation_iterator I = Obj.section_rel_begin(Index),
+         E = Obj.section_rel_end(Index); I != E; I.increment(EC), ++RelNum) {
+    MachO::any_relocation_info RE = Obj.getRelocation(I->getRawDataRefImpl());
     outs() << "    # Relocation " << RelNum << "\n";
-    outs() << "    (('word-0', " << format("0x%x", RE.Word0) << "),\n";
-    outs() << "     ('word-1', " << format("0x%x", RE.Word1) << ")),\n";
+    outs() << "    (('word-0', " << format("0x%x", RE.r_word0) << "),\n";
+    outs() << "     ('word-1', " << format("0x%x", RE.r_word1) << ")),\n";
   }
   outs() << "  ])\n";
 
@@ -124,23 +124,21 @@ static int DumpSectionData(const MachOObjectFile &Obj, unsigned Index,
 
 static int DumpSegmentCommand(const MachOObjectFile &Obj,
                               const MachOObjectFile::LoadCommandInfo &LCI) {
-  macho::SegmentLoadCommand SLC = Obj.getSegmentLoadCommand(LCI);
+  MachO::segment_command SLC = Obj.getSegmentLoadCommand(LCI);
 
-  DumpSegmentCommandData(StringRef(SLC.Name, 16), SLC.VMAddress,
-                         SLC.VMSize, SLC.FileOffset, SLC.FileSize,
-                         SLC.MaxVMProtection, SLC.InitialVMProtection,
-                         SLC.NumSections, SLC.Flags);
+  DumpSegmentCommandData(StringRef(SLC.segname, 16), SLC.vmaddr,
+                         SLC.vmsize, SLC.fileoff, SLC.filesize,
+                         SLC.maxprot, SLC.initprot, SLC.nsects, SLC.flags);
 
   // Dump the sections.
   outs() << "  ('sections', [\n";
-  for (unsigned i = 0; i != SLC.NumSections; ++i) {
-    macho::Section Sect = Obj.getSection(LCI, i);
-    DumpSectionData(Obj, i, StringRef(Sect.Name, 16),
-                    StringRef(Sect.SegmentName, 16), Sect.Address,
-                    Sect.Size, Sect.Offset, Sect.Align,
-                    Sect.RelocationTableOffset,
-                    Sect.NumRelocationTableEntries, Sect.Flags,
-                    Sect.Reserved1, Sect.Reserved2);
+  for (unsigned i = 0; i != SLC.nsects; ++i) {
+    MachO::section Sect = Obj.getSection(LCI, i);
+    DumpSectionData(Obj, i, StringRef(Sect.sectname, 16),
+                    StringRef(Sect.segname, 16), Sect.addr,
+                    Sect.size, Sect.offset, Sect.align,
+                    Sect.reloff, Sect.nreloc, Sect.flags,
+                    Sect.reserved1, Sect.reserved2);
   }
   outs() << "  ])\n";
 
@@ -149,24 +147,22 @@ static int DumpSegmentCommand(const MachOObjectFile &Obj,
 
 static int DumpSegment64Command(const MachOObjectFile &Obj,
                                 const MachOObjectFile::LoadCommandInfo &LCI) {
-  macho::Segment64LoadCommand SLC = Obj.getSegment64LoadCommand(LCI);
-  DumpSegmentCommandData(StringRef(SLC.Name, 16), SLC.VMAddress,
-                          SLC.VMSize, SLC.FileOffset, SLC.FileSize,
-                          SLC.MaxVMProtection, SLC.InitialVMProtection,
-                          SLC.NumSections, SLC.Flags);
+  MachO::segment_command_64 SLC = Obj.getSegment64LoadCommand(LCI);
+  DumpSegmentCommandData(StringRef(SLC.segname, 16), SLC.vmaddr,
+                         SLC.vmsize, SLC.fileoff, SLC.filesize,
+                         SLC.maxprot, SLC.initprot, SLC.nsects, SLC.flags);
 
   // Dump the sections.
   outs() << "  ('sections', [\n";
-  for (unsigned i = 0; i != SLC.NumSections; ++i) {
-    macho::Section64 Sect = Obj.getSection64(LCI, i);
-
-    DumpSectionData(Obj, i, StringRef(Sect.Name, 16),
-                    StringRef(Sect.SegmentName, 16), Sect.Address,
-                    Sect.Size, Sect.Offset, Sect.Align,
-                    Sect.RelocationTableOffset,
-                    Sect.NumRelocationTableEntries, Sect.Flags,
-                    Sect.Reserved1, Sect.Reserved2,
-                    Sect.Reserved3);
+  for (unsigned i = 0; i != SLC.nsects; ++i) {
+    MachO::section_64 Sect = Obj.getSection64(LCI, i);
+
+    DumpSectionData(Obj, i, StringRef(Sect.sectname, 16),
+                    StringRef(Sect.segname, 16), Sect.addr,
+                    Sect.size, Sect.offset, Sect.align,
+                    Sect.reloff, Sect.nreloc, Sect.flags,
+                    Sect.reserved1, Sect.reserved2,
+                    Sect.reserved3);
   }
   outs() << "  ])\n";
 
@@ -190,12 +186,12 @@ static void DumpSymbolTableEntryData(const MachOObjectFile &Obj,
 }
 
 static int DumpSymtabCommand(const MachOObjectFile &Obj) {
-  macho::SymtabLoadCommand SLC = Obj.getSymtabLoadCommand();
+  MachO::symtab_command SLC = Obj.getSymtabLoadCommand();
 
-  outs() << "  ('symoff', " << SLC.SymbolTableOffset << ")\n";
-  outs() << "  ('nsyms', " << SLC.NumSymbolTableEntries << ")\n";
-  outs() << "  ('stroff', " << SLC.StringTableOffset << ")\n";
-  outs() << "  ('strsize', " << SLC.StringTableSize << ")\n";
+  outs() << "  ('symoff', " << SLC.symoff << ")\n";
+  outs() << "  ('nsyms', " << SLC.nsyms << ")\n";
+  outs() << "  ('stroff', " << SLC.stroff << ")\n";
+  outs() << "  ('strsize', " << SLC.strsize << ")\n";
 
   // Dump the string data.
   outs() << "  ('_string_data', '";
@@ -211,14 +207,14 @@ static int DumpSymtabCommand(const MachOObjectFile &Obj) {
        I.increment(EC), ++SymNum) {
     DataRefImpl DRI = I->getRawDataRefImpl();
     if (Obj.is64Bit()) {
-      macho::Symbol64TableEntry STE = Obj.getSymbol64TableEntry(DRI);
-      DumpSymbolTableEntryData(Obj, SymNum, STE.StringIndex, STE.Type,
-                               STE.SectionIndex, STE.Flags, STE.Value,
+      MachO::nlist_64 STE = Obj.getSymbol64TableEntry(DRI);
+      DumpSymbolTableEntryData(Obj, SymNum, STE.n_strx, STE.n_type,
+                               STE.n_sect, STE.n_desc, STE.n_value,
                                StringTable);
     } else {
-      macho::SymbolTableEntry STE = Obj.getSymbolTableEntry(DRI);
-      DumpSymbolTableEntryData(Obj, SymNum, STE.StringIndex, STE.Type,
-                               STE.SectionIndex, STE.Flags, STE.Value,
+      MachO::nlist STE = Obj.getSymbolTableEntry(DRI);
+      DumpSymbolTableEntryData(Obj, SymNum, STE.n_strx, STE.n_type,
+                               STE.n_sect, STE.n_desc, STE.n_value,
                                StringTable);
     }
   }
@@ -228,37 +224,33 @@ static int DumpSymtabCommand(const MachOObjectFile &Obj) {
 }
 
 static int DumpDysymtabCommand(const MachOObjectFile &Obj) {
-  macho::DysymtabLoadCommand DLC = Obj.getDysymtabLoadCommand();
-
-  outs() << "  ('ilocalsym', " << DLC.LocalSymbolsIndex << ")\n";
-  outs() << "  ('nlocalsym', " << DLC.NumLocalSymbols << ")\n";
-  outs() << "  ('iextdefsym', " << DLC.ExternalSymbolsIndex << ")\n";
-  outs() << "  ('nextdefsym', " << DLC.NumExternalSymbols << ")\n";
-  outs() << "  ('iundefsym', " << DLC.UndefinedSymbolsIndex << ")\n";
-  outs() << "  ('nundefsym', " << DLC.NumUndefinedSymbols << ")\n";
-  outs() << "  ('tocoff', " << DLC.TOCOffset << ")\n";
-  outs() << "  ('ntoc', " << DLC.NumTOCEntries << ")\n";
-  outs() << "  ('modtaboff', " << DLC.ModuleTableOffset << ")\n";
-  outs() << "  ('nmodtab', " << DLC.NumModuleTableEntries << ")\n";
-  outs() << "  ('extrefsymoff', " << DLC.ReferenceSymbolTableOffset << ")\n";
-  outs() << "  ('nextrefsyms', "
-         << DLC.NumReferencedSymbolTableEntries << ")\n";
-  outs() << "  ('indirectsymoff', " << DLC.IndirectSymbolTableOffset << ")\n";
-  outs() << "  ('nindirectsyms', "
-         << DLC.NumIndirectSymbolTableEntries << ")\n";
-  outs() << "  ('extreloff', " << DLC.ExternalRelocationTableOffset << ")\n";
-  outs() << "  ('nextrel', " << DLC.NumExternalRelocationTableEntries << ")\n";
-  outs() << "  ('locreloff', " << DLC.LocalRelocationTableOffset << ")\n";
-  outs() << "  ('nlocrel', " << DLC.NumLocalRelocationTableEntries << ")\n";
+  MachO::dysymtab_command DLC = Obj.getDysymtabLoadCommand();
+
+  outs() << "  ('ilocalsym', " << DLC.ilocalsym << ")\n";
+  outs() << "  ('nlocalsym', " << DLC.nlocalsym << ")\n";
+  outs() << "  ('iextdefsym', " << DLC.iextdefsym << ")\n";
+  outs() << "  ('nextdefsym', " << DLC.nextdefsym << ")\n";
+  outs() << "  ('iundefsym', " << DLC.iundefsym << ")\n";
+  outs() << "  ('nundefsym', " << DLC.nundefsym << ")\n";
+  outs() << "  ('tocoff', " << DLC.tocoff << ")\n";
+  outs() << "  ('ntoc', " << DLC.ntoc << ")\n";
+  outs() << "  ('modtaboff', " << DLC.modtaboff << ")\n";
+  outs() << "  ('nmodtab', " << DLC.nmodtab << ")\n";
+  outs() << "  ('extrefsymoff', " << DLC.extrefsymoff << ")\n";
+  outs() << "  ('nextrefsyms', " << DLC.nextrefsyms << ")\n";
+  outs() << "  ('indirectsymoff', " << DLC.indirectsymoff << ")\n";
+  outs() << "  ('nindirectsyms', " << DLC.nindirectsyms << ")\n";
+  outs() << "  ('extreloff', " << DLC.extreloff << ")\n";
+  outs() << "  ('nextrel', " << DLC.nextrel << ")\n";
+  outs() << "  ('locreloff', " << DLC.locreloff << ")\n";
+  outs() << "  ('nlocrel', " << DLC.nlocrel << ")\n";
 
   // Dump the indirect symbol table.
   outs() << "  ('_indirect_symbols', [\n";
-  for (unsigned i = 0; i != DLC.NumIndirectSymbolTableEntries; ++i) {
-    macho::IndirectSymbolTableEntry ISTE =
-      Obj.getIndirectSymbolTableEntry(DLC, i);
+  for (unsigned i = 0; i != DLC.nindirectsyms; ++i) {
+    uint32_t ISTE = Obj.getIndirectSymbolTableEntry(DLC, i);
     outs() << "    # Indirect Symbol " << i << "\n";
-    outs() << "    (('symbol_index', "
-           << format("0x%x", ISTE.Index) << "),),\n";
+    outs() << "    (('symbol_index', " << format("0x%x", ISTE) << "),),\n";
   }
   outs() << "  ])\n";
 
@@ -268,13 +260,13 @@ static int DumpDysymtabCommand(const MachOObjectFile &Obj) {
 static int
 DumpLinkeditDataCommand(const MachOObjectFile &Obj,
                         const MachOObjectFile::LoadCommandInfo &LCI) {
-  macho::LinkeditDataLoadCommand LLC = Obj.getLinkeditDataLoadCommand(LCI);
-  outs() << "  ('dataoff', " << LLC.DataOffset << ")\n"
-         << "  ('datasize', " << LLC.DataSize << ")\n"
+  MachO::linkedit_data_command LLC = Obj.getLinkeditDataLoadCommand(LCI);
+  outs() << "  ('dataoff', " << LLC.dataoff << ")\n"
+         << "  ('datasize', " << LLC.datasize << ")\n"
          << "  ('_addresses', [\n";
 
   SmallVector<uint64_t, 8> Addresses;
-  Obj.ReadULEB128s(LLC.DataOffset, Addresses);
+  Obj.ReadULEB128s(LLC.dataoff, Addresses);
   for (unsigned i = 0, e = Addresses.size(); i != e; ++i)
     outs() << "    # Address " << i << '\n'
            << "    ('address', " << format("0x%x", Addresses[i]) << "),\n";
@@ -287,19 +279,18 @@ DumpLinkeditDataCommand(const MachOObjectFile &Obj,
 static int
 DumpDataInCodeDataCommand(const MachOObjectFile &Obj,
                           const MachOObjectFile::LoadCommandInfo &LCI) {
-  macho::LinkeditDataLoadCommand LLC = Obj.getLinkeditDataLoadCommand(LCI);
-  outs() << "  ('dataoff', " << LLC.DataOffset << ")\n"
-         << "  ('datasize', " << LLC.DataSize << ")\n"
+  MachO::linkedit_data_command LLC = Obj.getLinkeditDataLoadCommand(LCI);
+  outs() << "  ('dataoff', " << LLC.dataoff << ")\n"
+         << "  ('datasize', " << LLC.datasize << ")\n"
          << "  ('_data_regions', [\n";
 
-  unsigned NumRegions = LLC.DataSize / sizeof(macho::DataInCodeTableEntry);
+  unsigned NumRegions = LLC.datasize / sizeof(MachO::data_in_code_entry);
   for (unsigned i = 0; i < NumRegions; ++i) {
-    macho::DataInCodeTableEntry DICE =
-      Obj.getDataInCodeTableEntry(LLC.DataOffset, i);
+    MachO::data_in_code_entry DICE= Obj.getDataInCodeTableEntry(LLC.dataoff, i);
     outs() << "    # DICE " << i << "\n"
-           << "    ('offset', " << DICE.Offset << ")\n"
-           << "    ('length', " << DICE.Length << ")\n"
-           << "    ('kind', " << DICE.Kind << ")\n";
+           << "    ('offset', " << DICE.offset << ")\n"
+           << "    ('length', " << DICE.length << ")\n"
+           << "    ('kind', " << DICE.kind << ")\n";
   }
 
   outs() <<"  ])\n";
@@ -310,46 +301,46 @@ DumpDataInCodeDataCommand(const MachOObjectFile &Obj,
 static int
 DumpLinkerOptionsCommand(const MachOObjectFile &Obj,
                          const MachOObjectFile::LoadCommandInfo &LCI) {
-  macho::LinkerOptionsLoadCommand LOLC = Obj.getLinkerOptionsLoadCommand(LCI);
-   outs() << "  ('count', " << LOLC.Count << ")\n"
-          << "  ('_strings', [\n";
-
-   uint64_t DataSize = LOLC.Size - sizeof(macho::LinkerOptionsLoadCommand);
-   const char *P = LCI.Ptr + sizeof(macho::LinkerOptionsLoadCommand);
-   StringRef Data(P, DataSize);
-   for (unsigned i = 0; i != LOLC.Count; ++i) {
-     std::pair<StringRef,StringRef> Split = Data.split('\0');
-     outs() << "\t\"";
-     outs().write_escaped(Split.first);
-     outs() << "\",\n";
-     Data = Split.second;
-   }
-   outs() <<"  ])\n";
+  MachO::linker_options_command LOLC = Obj.getLinkerOptionsLoadCommand(LCI);
+  outs() << "  ('count', " << LOLC.count << ")\n"
+         << "  ('_strings', [\n";
+
+  uint64_t DataSize = LOLC.cmdsize - sizeof(MachO::linker_options_command);
+  const char *P = LCI.Ptr + sizeof(MachO::linker_options_command);
+  StringRef Data(P, DataSize);
+  for (unsigned i = 0; i != LOLC.count; ++i) {
+    std::pair<StringRef,StringRef> Split = Data.split('\0');
+    outs() << "\t\"";
+    outs().write_escaped(Split.first);
+    outs() << "\",\n";
+    Data = Split.second;
+  }
+  outs() <<"  ])\n";
 
   return 0;
 }
 
 static int DumpLoadCommand(const MachOObjectFile &Obj,
                            MachOObjectFile::LoadCommandInfo &LCI) {
-  switch (LCI.C.Type) {
-  case macho::LCT_Segment:
+  switch (LCI.C.cmd) {
+  case MachO::LC_SEGMENT:
     return DumpSegmentCommand(Obj, LCI);
-  case macho::LCT_Segment64:
+  case MachO::LC_SEGMENT_64:
     return DumpSegment64Command(Obj, LCI);
-  case macho::LCT_Symtab:
+  case MachO::LC_SYMTAB:
     return DumpSymtabCommand(Obj);
-  case macho::LCT_Dysymtab:
+  case MachO::LC_DYSYMTAB:
     return DumpDysymtabCommand(Obj);
-  case macho::LCT_CodeSignature:
-  case macho::LCT_SegmentSplitInfo:
-  case macho::LCT_FunctionStarts:
+  case MachO::LC_CODE_SIGNATURE:
+  case MachO::LC_SEGMENT_SPLIT_INFO:
+  case MachO::LC_FUNCTION_STARTS:
     return DumpLinkeditDataCommand(Obj, LCI);
-  case macho::LCT_DataInCode:
+  case MachO::LC_DATA_IN_CODE:
     return DumpDataInCodeDataCommand(Obj, LCI);
-  case macho::LCT_LinkerOptions:
+  case MachO::LC_LINKER_OPTIONS:
     return DumpLinkerOptionsCommand(Obj, LCI);
   default:
-    Warning("unknown load command: " + Twine(LCI.C.Type));
+    Warning("unknown load command: " + Twine(LCI.C.cmd));
     return 0;
   }
 }
@@ -358,26 +349,27 @@ static int DumpLoadCommand(const MachOObjectFile &Obj,
 static int DumpLoadCommand(const MachOObjectFile &Obj, unsigned Index,
                            MachOObjectFile::LoadCommandInfo &LCI) {
   outs() << "  # Load Command " << Index << "\n"
-         << " (('command', " << LCI.C.Type << ")\n"
-         << "  ('size', " << LCI.C.Size << ")\n";
+         << " (('command', " << LCI.C.cmd << ")\n"
+         << "  ('size', " << LCI.C.cmdsize << ")\n";
   int Res = DumpLoadCommand(Obj, LCI);
   outs() << " ),\n";
   return Res;
 }
 
 static void printHeader(const MachOObjectFile *Obj,
-                        const macho::Header &Header) {
-  outs() << "('cputype', " << Header.CPUType << ")\n";
-  outs() << "('cpusubtype', " << Header.CPUSubtype << ")\n";
-  outs() << "('filetype', " << Header.FileType << ")\n";
-  outs() << "('num_load_commands', " << Header.NumLoadCommands << ")\n";
-  outs() << "('load_commands_size', " << Header.SizeOfLoadCommands << ")\n";
-  outs() << "('flag', " << Header.Flags << ")\n";
+                        const MachO::mach_header &Header) {
+  outs() << "('cputype', " << Header.cputype << ")\n";
+  outs() << "('cpusubtype', " << Header.cpusubtype << ")\n";
+  outs() << "('filetype', " << Header.filetype << ")\n";
+  outs() << "('num_load_commands', " << Header.ncmds << ")\n";
+  outs() << "('load_commands_size', " << Header.sizeofcmds << ")\n";
+  outs() << "('flag', " << Header.flags << ")\n";
 
   // Print extended header if 64-bit.
   if (Obj->is64Bit()) {
-    macho::Header64Ext Header64Ext = Obj->getHeader64Ext();
-    outs() << "('reserved', " << Header64Ext.Reserved << ")\n";
+    const MachO::mach_header_64 *Header64 =
+      reinterpret_cast<const MachO::mach_header_64 *>(&Header);
+    outs() << "('reserved', " << Header64->reserved << ")\n";
   }
 }
 
@@ -396,8 +388,13 @@ int main(int argc, char **argv) {
     return Error("Not a MachO object");
 
   // Print the header
-  macho::Header Header = InputObject->getHeader();
-  printHeader(InputObject, Header);
+  MachO::mach_header_64 Header64;
+  MachO::mach_header *Header = reinterpret_cast<MachO::mach_header*>(&Header64);
+  if (InputObject->is64Bit())
+    Header64 = InputObject->getHeader64();
+  else
+    *Header = InputObject->getHeader();
+  printHeader(InputObject, *Header);
 
   // Print the load commands.
   int Res = 0;
@@ -408,7 +405,7 @@ int main(int argc, char **argv) {
     if (DumpLoadCommand(*InputObject, i, Command))
       break;
 
-    if (i == Header.NumLoadCommands - 1)
+    if (i == Header->ncmds - 1)
       break;
     Command = InputObject->getNextLoadCommandInfo(Command);
   }
diff --git a/tools/msbuild/CMakeLists.txt b/tools/msbuild/CMakeLists.txt
new file mode 100644
index 0000000..08b8aee
--- /dev/null
+++ b/tools/msbuild/CMakeLists.txt
@@ -0,0 +1,42 @@
+if (WIN32)
+  set(prop_file_in "Microsoft.Cpp.Win32.llvm.props.in")
+  set(prop_file_v100 "Microsoft.Cpp.Win32.LLVM-vs2010.props")
+  set(prop_file_v110 "Microsoft.Cpp.Win32.LLVM-vs2012.props")
+  set(prop_file_v110_xp "Microsoft.Cpp.Win32.LLVM-vs2012_xp.props")
+  set(prop_file_v120 "toolset-vs2013.props")
+  set(prop_file_v120_xp "toolset-vs2013_xp.props")
+
+  # CPack will install a registry key in this format that we wish to reference.
+  set(REG_KEY "${CPACK_PACKAGE_INSTALL_REGISTRY_KEY}")
+
+  set(VS_VERSION "v100")
+  set(MSC_VERSION "1600")
+  configure_file(${prop_file_in} ${prop_file_v100})
+  set(VS_VERSION "v110")
+  set(MSC_VERSION "1700")
+  configure_file(${prop_file_in} ${prop_file_v110})
+  set(VS_VERSION "v110_xp")
+  configure_file(${prop_file_in} ${prop_file_v110_xp})
+  set(VS_VERSION "v120")
+  set(MSC_VERSION "1800")
+  configure_file(${prop_file_in} ${prop_file_v120})
+  set(VS_VERSION "v120_xp")
+  configure_file(${prop_file_in} ${prop_file_v120_xp})
+
+  set(REG_KEY)
+  set(VS_VERSION)
+  set(MSC_VERSION)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${prop_file_v100}" DESTINATION tools/msbuild)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${prop_file_v110}" DESTINATION tools/msbuild)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${prop_file_v110_xp}" DESTINATION tools/msbuild)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${prop_file_v120}" DESTINATION tools/msbuild)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${prop_file_v120_xp}" DESTINATION tools/msbuild)
+
+  install(DIRECTORY .
+    DESTINATION tools/msbuild
+    FILES_MATCHING
+    PATTERN "*.targets"
+    PATTERN "*.bat"
+    PATTERN ".svn" EXCLUDE
+    )
+endif()
diff --git a/tools/msbuild/Microsoft.Cpp.Win32.LLVM-vs2010.targets b/tools/msbuild/Microsoft.Cpp.Win32.LLVM-vs2010.targets
new file mode 100644
index 0000000..df41a84
--- /dev/null
+++ b/tools/msbuild/Microsoft.Cpp.Win32.LLVM-vs2010.targets
@@ -0,0 +1,2 @@
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+</Project>
diff --git a/tools/msbuild/Microsoft.Cpp.Win32.LLVM-vs2012.targets b/tools/msbuild/Microsoft.Cpp.Win32.LLVM-vs2012.targets
new file mode 100644
index 0000000..f7432f2
--- /dev/null
+++ b/tools/msbuild/Microsoft.Cpp.Win32.LLVM-vs2012.targets
@@ -0,0 +1,3 @@
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$(VCTargetsPath)\Microsoft.CppCommon.targets" />
+</Project>
diff --git a/tools/msbuild/Microsoft.Cpp.Win32.LLVM-vs2012_xp.targets b/tools/msbuild/Microsoft.Cpp.Win32.LLVM-vs2012_xp.targets
new file mode 100644
index 0000000..e8250d8
--- /dev/null
+++ b/tools/msbuild/Microsoft.Cpp.Win32.LLVM-vs2012_xp.targets
@@ -0,0 +1,21 @@
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <!-- Force TargetFrameworkVersion to v4.0 to support XP-->
+  <PropertyGroup>
+    <TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
+    <BeforeClCompileTargets>NoSupportCodeAnalysisXP;$(BeforeClCompileTargets)</BeforeClCompileTargets>
+  </PropertyGroup>
+
+  <Import Project="$(VCTargetsPath)\Microsoft.CppCommon.targets" />
+
+  <Target Name="NoSupportCodeAnalysisXP" Condition="'$(ErrorNoSupportCodeAnalysisXP)' != 'false'">
+    <VCMessage Condition="'$(DesignTimeBuild)' != 'true' and '@(ClCompile->AnyHaveMetadataValue('EnablePREfast', 'true'))'=='true'" Code="MSB8026" Type="Error"/>
+  </Target>
+
+  <PropertyGroup>
+    <PrepareForBuildDependsOn>CheckWindowsSDK71A;$(PrepareForBuildDependsOn)</PrepareForBuildDependsOn>
+  </PropertyGroup>
+
+  <Target Name="CheckWindowsSDK71A">
+    <VCMessage Code="MSB8003" Type="Warning" Arguments="WindowsSdkDir_71A" Condition="'$(WindowsSdkDir_71A)'=='' and '$(UseEnv)' != 'true'" />
+  </Target>
+</Project>
diff --git a/tools/msbuild/Microsoft.Cpp.Win32.llvm.props.in b/tools/msbuild/Microsoft.Cpp.Win32.llvm.props.in
new file mode 100644
index 0000000..a6ef4ea
--- /dev/null
+++ b/tools/msbuild/Microsoft.Cpp.Win32.llvm.props.in
@@ -0,0 +1,18 @@
+<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">  
+  <Import Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\@VS_VERSION@\Microsoft.Cpp.$(Platform).@VS_VERSION@.props" Condition="Exists('$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\@VS_VERSION@\Microsoft.Cpp.$(Platform).@VS_VERSION@.props')"/>
+  <Import Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\@VS_VERSION@\Toolset.props" Condition="Exists('$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\@VS_VERSION@\Toolset.props')"/>
+
+  <PropertyGroup>
+    <LLVMInstallDir>$(Registry:HKEY_LOCAL_MACHINE\SOFTWARE\LLVM\@REG_KEY@)</LLVMInstallDir>
+    <LLVMInstallDir Condition="'$(LLVMInstallDir)' == ''">$(Registry:HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\LLVM\@REG_KEY@)</LLVMInstallDir>
+    <ExecutablePath>$(LLVMInstallDir)\msbuild-bin;$(ExecutablePath)</ExecutablePath>
+    <LibraryPath>$(LLVMInstallDir)\lib\clang\3.4\lib\windows;$(LibraryPath)</LibraryPath>
+  </PropertyGroup>
+
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <!-- Set the value of _MSC_VER to claim for compatibility -->
+      <AdditionalOptions>-fmsc-version=@MSC_VERSION@ %(AdditionalOptions)</AdditionalOptions>
+    </ClCompile>
+  </ItemDefinitionGroup>
+</Project>
diff --git a/tools/msbuild/install.bat b/tools/msbuild/install.bat
new file mode 100644
index 0000000..c4c61ac
--- /dev/null
+++ b/tools/msbuild/install.bat
@@ -0,0 +1,81 @@
+@echo off
+
+echo Installing MSVC integration...
+set SUCCESS=0
+
+REM Change to the directory of this batch file.
+cd /d %~dp0
+
+REM Search for the MSBuild toolsets directory.
+SET D="%ProgramFiles%\MSBuild\Microsoft.Cpp\v4.0\Platforms\Win32\PlatformToolsets"
+IF EXIST %D% GOTO FOUND_V100
+SET D="%ProgramFiles(x86)%\MSBuild\Microsoft.Cpp\v4.0\Platforms\Win32\PlatformToolsets"
+IF EXIST %D% GOTO FOUND_V100
+
+:TRY_V110
+SET D="%ProgramFiles%\MSBuild\Microsoft.Cpp\v4.0\V110\Platforms\Win32\PlatformToolsets"
+IF EXIST %D% GOTO FOUND_V110
+SET D="%ProgramFiles(x86)%\MSBuild\Microsoft.Cpp\v4.0\V110\Platforms\Win32\PlatformToolsets"
+IF EXIST %D% GOTO FOUND_V110
+
+:TRY_V120
+SET D="%ProgramFiles%\MSBuild\Microsoft.Cpp\v4.0\V120\Platforms\Win32\PlatformToolsets"
+IF EXIST %D% GOTO FOUND_V120
+SET D="%ProgramFiles(x86)%\MSBuild\Microsoft.Cpp\v4.0\V120\Platforms\Win32\PlatformToolsets"
+IF EXIST %D% GOTO FOUND_V120
+
+IF %SUCCESS% == 1 goto DONE
+echo Failed to find MSBuild toolsets directory.
+goto FAILED
+
+
+:FOUND_V100
+IF NOT EXIST %D%\LLVM-vs2010 mkdir %D%\LLVM-vs2010
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+copy Microsoft.Cpp.Win32.LLVM-vs2010.props %D%\LLVM-vs2010
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+copy Microsoft.Cpp.Win32.LLVM-vs2010.targets %D%\LLVM-vs2010
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+set SUCCESS=1
+GOTO TRY_V110
+
+:FOUND_V110
+IF NOT EXIST %D%\LLVM-vs2012 mkdir %D%\LLVM-vs2012
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+copy Microsoft.Cpp.Win32.LLVM-vs2012.props %D%\LLVM-vs2012
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+copy Microsoft.Cpp.Win32.LLVM-vs2012.targets %D%\LLVM-vs2012
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+IF NOT EXIST %D%\LLVM-vs2012_xp mkdir %D%\LLVM-vs2012_xp
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+copy Microsoft.Cpp.Win32.LLVM-vs2012_xp.props %D%\LLVM-vs2012_xp
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+copy Microsoft.Cpp.Win32.LLVM-vs2012_xp.targets %D%\LLVM-vs2012_xp
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+set SUCCESS=1
+GOTO TRY_V120
+
+:FOUND_V120
+IF NOT EXIST %D%\LLVM-vs2013 mkdir %D%\LLVM-vs2013
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+copy toolset-vs2013.props %D%\LLVM-vs2013\toolset.props
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+copy toolset-vs2013.targets %D%\LLVM-vs2013\toolset.targets
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+IF NOT EXIST %D%\LLVM-vs2013_xp mkdir %D%\LLVM-vs2013_xp
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+copy toolset-vs2013_xp.props %D%\LLVM-vs2013_xp\toolset.props
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+copy toolset-vs2013_xp.targets %D%\LLVM-vs2013_xp\toolset.targets
+IF NOT %ERRORLEVEL% == 0 GOTO FAILED
+
+:DONE
+echo Done!
+goto END
+
+:FAILED
+echo MSVC integration install failed.
+pause
+goto END
+
+:END
diff --git a/tools/msbuild/toolset-vs2013.targets b/tools/msbuild/toolset-vs2013.targets
new file mode 100644
index 0000000..a6efac4
--- /dev/null
+++ b/tools/msbuild/toolset-vs2013.targets
@@ -0,0 +1,3 @@
+<Project ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$(VCTargetsPath)\Microsoft.CppCommon.targets" />
+</Project>
diff --git a/tools/msbuild/toolset-vs2013_xp.targets b/tools/msbuild/toolset-vs2013_xp.targets
new file mode 100644
index 0000000..e719681
--- /dev/null
+++ b/tools/msbuild/toolset-vs2013_xp.targets
@@ -0,0 +1,21 @@
+<Project ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <!-- Force TargetFrameworkVersion to v4.0 to support XP-->
+  <PropertyGroup>
+    <TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
+    <BeforeClCompileTargets>NoSupportCodeAnalysisXP;$(BeforeClCompileTargets)</BeforeClCompileTargets>
+  </PropertyGroup>
+
+  <Import Project="$(VCTargetsPath)\Microsoft.CppCommon.targets" />
+
+  <Target Name="NoSupportCodeAnalysisXP" Condition="'$(ErrorNoSupportCodeAnalysisXP)' != 'false'">
+    <VCMessage Condition="'$(DesignTimeBuild)' != 'true' and '@(ClCompile->AnyHaveMetadataValue('EnablePREfast', 'true'))'=='true'" Code="MSB8026" Type="Error"/>
+  </Target>
+
+  <PropertyGroup>
+    <PrepareForBuildDependsOn>CheckWindowsSDK71A;$(PrepareForBuildDependsOn)</PrepareForBuildDependsOn>
+  </PropertyGroup>
+
+  <Target Name="CheckWindowsSDK71A">
+    <VCMessage Code="MSB8003" Type="Warning" Arguments="WindowsSdkDir_71A" Condition="'$(WindowsSdkDir_71A)'=='' and '$(UseEnv)' != 'true'" />
+  </Target>
+</Project>
diff --git a/tools/msbuild/uninstall.bat b/tools/msbuild/uninstall.bat
new file mode 100644
index 0000000..7e94f87
--- /dev/null
+++ b/tools/msbuild/uninstall.bat
@@ -0,0 +1,50 @@
+@echo off
+
+echo Uninstalling MSVC integration...
+
+REM CD to the directory of this batch file.
+cd /d %~dp0
+
+SET D="%ProgramFiles%\MSBuild\Microsoft.Cpp\v4.0\Platforms\Win32\PlatformToolsets"
+IF EXIST %D%\LLVM-vs2010 del %D%\LLVM-vs2010\Microsoft.Cpp.Win32.LLVM-vs2010.props
+IF EXIST %D%\LLVM-vs2010 del %D%\LLVM-vs2010\Microsoft.Cpp.Win32.LLVM-vs2010.targets
+IF EXIST %D%\LLVM-vs2010 rmdir %D%\LLVM-vs2010
+
+SET D="%ProgramFiles(x86)%\MSBuild\Microsoft.Cpp\v4.0\Platforms\Win32\PlatformToolsets"
+IF EXIST %D%\LLVM-vs2010 del %D%\LLVM-vs2010\Microsoft.Cpp.Win32.LLVM-vs2010.props
+IF EXIST %D%\LLVM-vs2010 del %D%\LLVM-vs2010\Microsoft.Cpp.Win32.LLVM-vs2010.targets
+IF EXIST %D%\LLVM-vs2010 rmdir %D%\LLVM-vs2010
+
+SET D="%ProgramFiles%\MSBuild\Microsoft.Cpp\v4.0\V110\Platforms\Win32\PlatformToolsets"
+IF EXIST %D%\LLVM-vs2012 del %D%\LLVM-vs2012\Microsoft.Cpp.Win32.LLVM-vs2012.props
+IF EXIST %D%\LLVM-vs2012 del %D%\LLVM-vs2012\Microsoft.Cpp.Win32.LLVM-vs2012.targets
+IF EXIST %D%\LLVM-vs2012 rmdir %D%\LLVM-vs2012
+IF EXIST %D%\LLVM-vs2012_xp del %D%\LLVM-vs2012_xp\Microsoft.Cpp.Win32.LLVM-vs2012_xp.props
+IF EXIST %D%\LLVM-vs2012_xp del %D%\LLVM-vs2012_xp\Microsoft.Cpp.Win32.LLVM-vs2012_xp.targets
+IF EXIST %D%\LLVM-vs2012_xp rmdir %D%\LLVM-vs2012_xp
+
+SET D="%ProgramFiles(x86)%\MSBuild\Microsoft.Cpp\v4.0\V110\Platforms\Win32\PlatformToolsets"
+IF EXIST %D%\LLVM-vs2012 del %D%\LLVM-vs2012\Microsoft.Cpp.Win32.LLVM-vs2012.props
+IF EXIST %D%\LLVM-vs2012 del %D%\LLVM-vs2012\Microsoft.Cpp.Win32.LLVM-vs2012.targets
+IF EXIST %D%\LLVM-vs2012 rmdir %D%\LLVM-vs2012
+IF EXIST %D%\LLVM-vs2012_xp del %D%\LLVM-vs2012_xp\Microsoft.Cpp.Win32.LLVM-vs2012_xp.props
+IF EXIST %D%\LLVM-vs2012_xp del %D%\LLVM-vs2012_xp\Microsoft.Cpp.Win32.LLVM-vs2012_xp.targets
+IF EXIST %D%\LLVM-vs2012_xp rmdir %D%\LLVM-vs2012_xp
+
+SET D="%ProgramFiles%\MSBuild\Microsoft.Cpp\v4.0\V120\Platforms\Win32\PlatformToolsets"
+IF EXIST %D%\LLVM-vs2013 del %D%\LLVM-vs2013\toolset.props
+IF EXIST %D%\LLVM-vs2013 del %D%\LLVM-vs2013\toolset.targets
+IF EXIST %D%\LLVM-vs2013 rmdir %D%\LLVM-vs2013
+IF EXIST %D%\LLVM-vs2013_xp del %D%\LLVM-vs2013_xp\toolset.props
+IF EXIST %D%\LLVM-vs2013_xp del %D%\LLVM-vs2013_xp\toolset.targets
+IF EXIST %D%\LLVM-vs2013_xp rmdir %D%\LLVM-vs2013_xp
+
+SET D="%ProgramFiles(x86)%\MSBuild\Microsoft.Cpp\v4.0\V120\Platforms\Win32\PlatformToolsets"
+IF EXIST %D%\LLVM-vs2013 del %D%\LLVM-vs2013\toolset.props
+IF EXIST %D%\LLVM-vs2013 del %D%\LLVM-vs2013\toolset.targets
+IF EXIST %D%\LLVM-vs2013 rmdir %D%\LLVM-vs2013
+IF EXIST %D%\LLVM-vs2013_xp del %D%\LLVM-vs2013_xp\toolset.props
+IF EXIST %D%\LLVM-vs2013_xp del %D%\LLVM-vs2013_xp\toolset.targets
+IF EXIST %D%\LLVM-vs2013_xp rmdir %D%\LLVM-vs2013_xp
+
+echo Done!
diff --git a/tools/obj2yaml/coff2yaml.cpp b/tools/obj2yaml/coff2yaml.cpp
index 1e28c4e..02d7ebf 100644
--- a/tools/obj2yaml/coff2yaml.cpp
+++ b/tools/obj2yaml/coff2yaml.cpp
@@ -72,7 +72,6 @@ void COFFDumper::dumpSections(unsigned NumSections) {
       const object::coff_relocation *reloc = Obj.getCOFFRelocation(rIter);
       COFFYAML::Relocation Rel;
       object::symbol_iterator Sym = rIter->getSymbol();
-      StringRef Name;
       Sym->getName(Rel.SymbolName);
       Rel.VirtualAddress = reloc->VirtualAddress;
       Rel.Type = reloc->Type;
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 37637ca..dba16f7 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -136,6 +136,21 @@ UnitAtATime("funit-at-a-time",
             cl::init(true));
 
 static cl::opt<bool>
+DisableLoopUnrolling("disable-loop-unrolling",
+                     cl::desc("Disable loop unrolling in all relevant passes"),
+                     cl::init(false));
+static cl::opt<bool>
+DisableLoopVectorization("disable-loop-vectorization",
+                     cl::desc("Disable the loop vectorization pass"),
+                     cl::init(false));
+
+static cl::opt<bool>
+DisableSLPVectorization("disable-slp-vectorization",
+                        cl::desc("Disable the slp vectorization pass"),
+                        cl::init(false));
+
+
+static cl::opt<bool>
 DisableSimplifyLibCalls("disable-simplify-libcalls",
                         cl::desc("Disable simplify-libcalls"));
 
@@ -362,6 +377,7 @@ char BasicBlockPassPrinter::ID = 0;
 struct BreakpointPrinter : public ModulePass {
   raw_ostream &Out;
   static char ID;
+  DITypeIdentifierMap TypeIdentifierMap;
 
   BreakpointPrinter(raw_ostream &out)
     : ModulePass(ID), Out(out) {
@@ -377,13 +393,18 @@ struct BreakpointPrinter : public ModulePass {
     } else if (Context.isType()) {
       DIType TY(Context);
       if (!TY.getName().empty()) {
-        getContextName(TY.getContext(), N);
+        getContextName(TY.getContext().resolve(TypeIdentifierMap), N);
         N = N + TY.getName().str() + "::";
       }
     }
   }
 
   virtual bool runOnModule(Module &M) {
+    TypeIdentifierMap.clear();
+    NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu");
+    if (CU_Nodes)
+      TypeIdentifierMap = generateDITypeIdentifierMap(CU_Nodes);
+
     StringSet<> Processed;
     if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.sp"))
       for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
@@ -393,7 +414,7 @@ struct BreakpointPrinter : public ModulePass {
           "A MDNode in llvm.dbg.sp should be null or a DISubprogram.");
         if (!SP)
           continue;
-        getContextName(SP.getContext(), Name);
+        getContextName(SP.getContext().resolve(TypeIdentifierMap), Name);
         Name = Name + SP.getDisplayName().str();
         if (!Name.empty() && Processed.insert(Name)) {
           Out << Name << "\n";
@@ -406,7 +427,7 @@ struct BreakpointPrinter : public ModulePass {
     AU.setPreservesAll();
   }
 };
- 
+
 } // anonymous namespace
 
 char BreakpointPrinter::ID = 0;
@@ -447,8 +468,14 @@ static void AddOptimizationPasses(PassManagerBase &MPM,FunctionPassManager &FPM,
     Builder.Inliner = createAlwaysInlinerPass();
   }
   Builder.DisableUnitAtATime = !UnitAtATime;
-  Builder.DisableUnrollLoops = OptLevel == 0;
-  
+  Builder.DisableUnrollLoops = (DisableLoopUnrolling.getNumOccurrences() > 0) ?
+                               DisableLoopUnrolling : OptLevel == 0;
+
+  Builder.LoopVectorize =
+      DisableLoopVectorization ? false : OptLevel > 1 && SizeLevel < 2;
+  Builder.SLPVectorize =
+      DisableSLPVectorization ? false : OptLevel > 1 && SizeLevel < 2;
+
   Builder.populateFunctionPassManager(FPM);
   Builder.populateModulePassManager(MPM);
 }
diff --git a/tools/yaml2obj/yaml2coff.cpp b/tools/yaml2obj/yaml2coff.cpp
index 11aae0e..c757eb6 100644
--- a/tools/yaml2obj/yaml2coff.cpp
+++ b/tools/yaml2obj/yaml2coff.cpp
@@ -32,7 +32,7 @@ struct COFFParser {
   COFFParser(COFFYAML::Object &Obj) : Obj(Obj) {
     // A COFF string table always starts with a 4 byte size field. Offsets into
     // it include this size, so allocate it now.
-    StringTable.append(4, 0);
+    StringTable.append(4, char(0));
   }
 
   bool parseSections() {
diff --git a/tools/yaml2obj/yaml2elf.cpp b/tools/yaml2obj/yaml2elf.cpp
index 61252a4..d46e154 100644
--- a/tools/yaml2obj/yaml2elf.cpp
+++ b/tools/yaml2obj/yaml2elf.cpp
@@ -13,7 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "yaml2obj.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ELFYAML.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -119,13 +120,13 @@ public:
 } // end anonymous namespace
 
 template <class T>
-static size_t vectorDataSize(const std::vector<T> &Vec) {
-  return Vec.size() * sizeof(T);
+static size_t arrayDataSize(ArrayRef<T> A) {
+  return A.size() * sizeof(T);
 }
 
 template <class T>
-static void writeVectorData(raw_ostream &OS, const std::vector<T> &Vec) {
-  OS.write((const char *)Vec.data(), vectorDataSize(Vec));
+static void writeArrayData(raw_ostream &OS, ArrayRef<T> A) {
+  OS.write((const char *)A.data(), arrayDataSize(A));
 }
 
 template <class T>
@@ -157,7 +158,7 @@ class ELFState {
   unsigned DotStrtabSecNo;
   /// \brief The accumulated contents of all sections so far.
   ContiguousBlobAccumulator &SectionContentAccum;
-  typedef typename object::ELFObjectFile<ELFT>::Elf_Ehdr Elf_Ehdr;
+  typedef typename object::ELFFile<ELFT>::Elf_Ehdr Elf_Ehdr;
   /// \brief The ELF file header.
   Elf_Ehdr &Header;
 
@@ -185,9 +186,9 @@ public:
 template <class ELFT>
 static void
 addSymbols(const std::vector<ELFYAML::Symbol> &Symbols, ELFState<ELFT> &State,
-           std::vector<typename object::ELFObjectFile<ELFT>::Elf_Sym> &Syms,
+           std::vector<typename object::ELFFile<ELFT>::Elf_Sym> &Syms,
            unsigned SymbolBinding) {
-  typedef typename object::ELFObjectFile<ELFT>::Elf_Sym Elf_Sym;
+  typedef typename object::ELFFile<ELFT>::Elf_Sym Elf_Sym;
   for (unsigned i = 0, e = Symbols.size(); i != e; ++i) {
     const ELFYAML::Symbol &Sym = Symbols[i];
     Elf_Sym Symbol;
@@ -211,11 +212,12 @@ addSymbols(const std::vector<ELFYAML::Symbol> &Symbols, ELFState<ELFT> &State,
 }
 
 template <class ELFT>
-static void handleSymtabSectionHeader(
-    const ELFYAML::LocalGlobalWeakSymbols &Symbols, ELFState<ELFT> &State,
-    typename object::ELFObjectFile<ELFT>::Elf_Shdr &SHeader) {
+static void
+handleSymtabSectionHeader(const ELFYAML::LocalGlobalWeakSymbols &Symbols,
+                          ELFState<ELFT> &State,
+                          typename object::ELFFile<ELFT>::Elf_Shdr &SHeader) {
 
-  typedef typename object::ELFObjectFile<ELFT>::Elf_Sym Elf_Sym;
+  typedef typename object::ELFFile<ELFT>::Elf_Sym Elf_Sym;
   SHeader.sh_type = ELF::SHT_SYMTAB;
   SHeader.sh_link = State.getDotStrTabSecNo();
   // One greater than symbol table index of the last local symbol.
@@ -234,15 +236,16 @@ static void handleSymtabSectionHeader(
   addSymbols(Symbols.Weak, State, Syms, ELF::STB_WEAK);
 
   ContiguousBlobAccumulator &CBA = State.getSectionContentAccum();
-  writeVectorData(CBA.getOSAndAlignedOffset(SHeader.sh_offset), Syms);
-  SHeader.sh_size = vectorDataSize(Syms);
+  writeArrayData(CBA.getOSAndAlignedOffset(SHeader.sh_offset),
+                 makeArrayRef(Syms));
+  SHeader.sh_size = arrayDataSize(makeArrayRef(Syms));
 }
 
 template <class ELFT>
 static int writeELF(raw_ostream &OS, const ELFYAML::Object &Doc) {
   using namespace llvm::ELF;
-  typedef typename object::ELFObjectFile<ELFT>::Elf_Ehdr Elf_Ehdr;
-  typedef typename object::ELFObjectFile<ELFT>::Elf_Shdr Elf_Shdr;
+  typedef typename object::ELFFile<ELFT>::Elf_Ehdr Elf_Ehdr;
+  typedef typename object::ELFFile<ELFT>::Elf_Shdr Elf_Shdr;
 
   const ELFYAML::FileHeader &Hdr = Doc.Header;
 
@@ -358,7 +361,7 @@ static int writeELF(raw_ostream &OS, const ELFYAML::Object &Doc) {
   SHeaders.push_back(SHStrTabSHeader);
 
   OS.write((const char *)&Header, sizeof(Header));
-  writeVectorData(OS, SHeaders);
+  writeArrayData(OS, makeArrayRef(SHeaders));
   CBA.writeBlobToStream(OS);
   return 0;
 }
diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index 3b69de2..e57c8d4 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp
@@ -866,10 +866,11 @@ TEST(APFloatTest, toString) {
   ASSERT_EQ("0.0101", convertToString(1.01E-2, 5, 2));
   ASSERT_EQ("0.0101", convertToString(1.01E-2, 4, 2));
   ASSERT_EQ("1.01E-2", convertToString(1.01E-2, 5, 1));
-  ASSERT_EQ("0.7853981633974483", convertToString(0.78539816339744830961, 0, 3));
-  ASSERT_EQ("4.940656458412465E-324", convertToString(4.9406564584124654e-324, 0, 3));
-  ASSERT_EQ("873.1834", convertToString(873.1834, 0, 1));
-  ASSERT_EQ("8.731834E+2", convertToString(873.1834, 0, 0));
+  ASSERT_EQ("0.78539816339744828", convertToString(0.78539816339744830961, 0, 3));
+  ASSERT_EQ("4.9406564584124654E-324", convertToString(4.9406564584124654e-324, 0, 3));
+  ASSERT_EQ("873.18340000000001", convertToString(873.1834, 0, 1));
+  ASSERT_EQ("8.7318340000000001E+2", convertToString(873.1834, 0, 0));
+  ASSERT_EQ("1.7976931348623157E+308", convertToString(1.7976931348623157E+308, 0, 0));
 }
 
 TEST(APFloatTest, toInteger) {
diff --git a/unittests/ADT/CMakeLists.txt b/unittests/ADT/CMakeLists.txt
index 9aad793..8ad303a 100644
--- a/unittests/ADT/CMakeLists.txt
+++ b/unittests/ADT/CMakeLists.txt
@@ -21,6 +21,7 @@ set(ADTSources
   MapVectorTest.cpp
   OptionalTest.cpp
   PackedVectorTest.cpp
+  PointerUnionTest.cpp
   SCCIteratorTest.cpp
   SmallPtrSetTest.cpp
   SmallStringTest.cpp
@@ -34,6 +35,7 @@ set(ADTSources
   TripleTest.cpp
   TwineTest.cpp
   VariadicFunctionTest.cpp
+  polymorphic_ptr_test.cpp
  )
 
 # They cannot be compiled on MSVC9 due to its bug.
diff --git a/unittests/ADT/IntrusiveRefCntPtrTest.cpp b/unittests/ADT/IntrusiveRefCntPtrTest.cpp
index 0c8c4ca..c67ec13 100644
--- a/unittests/ADT/IntrusiveRefCntPtrTest.cpp
+++ b/unittests/ADT/IntrusiveRefCntPtrTest.cpp
@@ -10,11 +10,13 @@
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "gtest/gtest.h"
 
-namespace llvm {
-
-struct VirtualRefCounted : public RefCountedBaseVPTR {
+namespace {
+struct VirtualRefCounted : public llvm::RefCountedBaseVPTR {
   virtual void f() {}
 };
+}
+
+namespace llvm {
 
 // Run this test with valgrind to detect memory leaks.
 TEST(IntrusiveRefCntPtr, RefCountedBaseVPTRCopyDoesNotLeak) {
diff --git a/unittests/ADT/PointerUnionTest.cpp b/unittests/ADT/PointerUnionTest.cpp
new file mode 100644
index 0000000..7eb7181
--- /dev/null
+++ b/unittests/ADT/PointerUnionTest.cpp
@@ -0,0 +1,64 @@
+//===- llvm/unittest/ADT/PointerUnionTest.cpp - Optional unit tests -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+#include "llvm/ADT/PointerUnion.h"
+using namespace llvm;
+
+namespace {
+
+typedef PointerUnion<int*, float*> PU;
+
+// Test fixture
+class PointerUnionTest : public testing::Test {
+};
+
+float f = 3.14f;
+int i = 42;
+
+const PU a(&f);
+const PU b(&i);
+const PU n;
+
+TEST_F(PointerUnionTest, Comparison) {
+  EXPECT_TRUE(a != b);
+  EXPECT_FALSE(a == b);
+  EXPECT_TRUE(b != n);
+  EXPECT_FALSE(b == n);
+}
+
+TEST_F(PointerUnionTest, Null) {
+  EXPECT_FALSE(a.isNull());
+  EXPECT_FALSE(b.isNull());
+  EXPECT_TRUE(n.isNull());
+  EXPECT_FALSE(!a);
+  EXPECT_FALSE(!b);
+  EXPECT_TRUE(!n);
+  // workaround an issue with EXPECT macros and explicit bool
+  EXPECT_TRUE((bool)a);
+  EXPECT_TRUE((bool)b);
+  EXPECT_FALSE(n);
+}
+
+TEST_F(PointerUnionTest, Is) {
+  EXPECT_FALSE(a.is<int*>());
+  EXPECT_TRUE(a.is<float*>());
+  EXPECT_TRUE(b.is<int*>());
+  EXPECT_FALSE(b.is<float*>());
+  EXPECT_TRUE(n.is<int*>());
+  EXPECT_FALSE(n.is<float*>());
+}
+
+TEST_F(PointerUnionTest, Get) {
+  EXPECT_EQ(a.get<float*>(), &f);
+  EXPECT_EQ(b.get<int*>(), &i);
+  EXPECT_EQ(n.get<int*>(), (int*)0);
+}
+
+} // end anonymous namespace
diff --git a/unittests/ADT/StringRefTest.cpp b/unittests/ADT/StringRefTest.cpp
index fa87cd0..88691ae 100644
--- a/unittests/ADT/StringRefTest.cpp
+++ b/unittests/ADT/StringRefTest.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/raw_ostream.h"
@@ -60,6 +61,9 @@ TEST(StringRefTest, StringOps) {
   EXPECT_EQ( 0, StringRef("AaB").compare_lower("aab"));
   EXPECT_EQ( 1, StringRef("AaB").compare_lower("AAA"));
   EXPECT_EQ(-1, StringRef("AaB").compare_lower("aaBb"));
+  EXPECT_EQ(-1, StringRef("AaB").compare_lower("bb"));
+  EXPECT_EQ( 1, StringRef("aaBb").compare_lower("AaB"));
+  EXPECT_EQ( 1, StringRef("bb").compare_lower("AaB"));
   EXPECT_EQ( 1, StringRef("AaB").compare_lower("aA"));
   EXPECT_EQ( 1, StringRef("\xFF").compare_lower("\1"));
 
@@ -247,19 +251,41 @@ TEST(StringRefTest, Trim) {
 
 TEST(StringRefTest, StartsWith) {
   StringRef Str("hello");
+  EXPECT_TRUE(Str.startswith(""));
   EXPECT_TRUE(Str.startswith("he"));
   EXPECT_FALSE(Str.startswith("helloworld"));
   EXPECT_FALSE(Str.startswith("hi"));
 }
 
+TEST(StringRefTest, StartsWithLower) {
+  StringRef Str("heLLo");
+  EXPECT_TRUE(Str.startswith_lower(""));
+  EXPECT_TRUE(Str.startswith_lower("he"));
+  EXPECT_TRUE(Str.startswith_lower("hell"));
+  EXPECT_TRUE(Str.startswith_lower("HELlo"));
+  EXPECT_FALSE(Str.startswith_lower("helloworld"));
+  EXPECT_FALSE(Str.startswith_lower("hi"));
+}
+
 TEST(StringRefTest, EndsWith) {
   StringRef Str("hello");
+  EXPECT_TRUE(Str.endswith(""));
   EXPECT_TRUE(Str.endswith("lo"));
   EXPECT_FALSE(Str.endswith("helloworld"));
   EXPECT_FALSE(Str.endswith("worldhello"));
   EXPECT_FALSE(Str.endswith("so"));
 }
 
+TEST(StringRefTest, EndsWithLower) {
+  StringRef Str("heLLo");
+  EXPECT_TRUE(Str.endswith_lower(""));
+  EXPECT_TRUE(Str.endswith_lower("lo"));
+  EXPECT_TRUE(Str.endswith_lower("LO"));
+  EXPECT_TRUE(Str.endswith_lower("ELlo"));
+  EXPECT_FALSE(Str.endswith_lower("helloworld"));
+  EXPECT_FALSE(Str.endswith_lower("hi"));
+}
+
 TEST(StringRefTest, Find) {
   StringRef Str("hello");
   EXPECT_EQ(2U, Str.find('l'));
@@ -477,6 +503,32 @@ TEST(StringRefTest, getAsUnsignedIntegerBadStrings) {
   }
 }
 
+static const char *join_input[] = { "a", "b", "c" };
+static const char join_result1[] = "a";
+static const char join_result2[] = "a:b:c";
+static const char join_result3[] = "a::b::c";
+
+TEST(StringRefTest, joinStrings) {
+  std::vector<StringRef> v1;
+  std::vector<std::string> v2;
+  for (size_t i = 0; i < array_lengthof(join_input); ++i) {
+    v1.push_back(join_input[i]);
+    v2.push_back(join_input[i]);
+  }
 
+  bool v1_join1 = join(v1.begin(), v1.begin() + 1, ":") == join_result1;
+  EXPECT_TRUE(v1_join1);
+  bool v1_join2 = join(v1.begin(), v1.end(), ":") == join_result2;
+  EXPECT_TRUE(v1_join2);
+  bool v1_join3 = join(v1.begin(), v1.end(), "::") == join_result3;
+  EXPECT_TRUE(v1_join3);
+
+  bool v2_join1 = join(v2.begin(), v2.begin() + 1, ":") == join_result1;
+  EXPECT_TRUE(v2_join1);
+  bool v2_join2 = join(v2.begin(), v2.end(), ":") == join_result2;
+  EXPECT_TRUE(v2_join2);
+  bool v2_join3 = join(v2.begin(), v2.end(), "::") == join_result3;
+  EXPECT_TRUE(v2_join3);
+}
 
 } // end anonymous namespace
diff --git a/unittests/ADT/polymorphic_ptr_test.cpp b/unittests/ADT/polymorphic_ptr_test.cpp
new file mode 100644
index 0000000..bd5d838
--- /dev/null
+++ b/unittests/ADT/polymorphic_ptr_test.cpp
@@ -0,0 +1,129 @@
+//===- llvm/unittest/ADT/polymorphic_ptr.h - polymorphic_ptr<T> tests -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+#include "llvm/ADT/polymorphic_ptr.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+struct S {
+  S(int x) : x(x) {}
+  S *clone() { return new S(*this); }
+  int x;
+};
+
+// A function that forces the return of a copy.
+template <typename T>
+T dummy_copy(const T &arg) { return arg; }
+
+TEST(polymorphic_ptr_test, Basic) {
+  polymorphic_ptr<S> null;
+  EXPECT_FALSE((bool)null);
+  EXPECT_TRUE(!null);
+  EXPECT_EQ((S*)0, null.get());
+
+  S *s = new S(42);
+  polymorphic_ptr<S> p(s);
+  EXPECT_TRUE((bool)p);
+  EXPECT_FALSE(!p);
+  EXPECT_TRUE(p != null);
+  EXPECT_FALSE(p == null);
+  EXPECT_TRUE(p == s);
+  EXPECT_TRUE(s == p);
+  EXPECT_FALSE(p != s);
+  EXPECT_FALSE(s != p);
+  EXPECT_EQ(s, &*p);
+  EXPECT_EQ(s, p.operator->());
+  EXPECT_EQ(s, p.get());
+  EXPECT_EQ(42, p->x);
+
+  EXPECT_EQ(s, p.take());
+  EXPECT_FALSE((bool)p);
+  EXPECT_TRUE(!p);
+  p = s;
+  EXPECT_TRUE((bool)p);
+  EXPECT_FALSE(!p);
+  EXPECT_EQ(s, &*p);
+  EXPECT_EQ(s, p.operator->());
+  EXPECT_EQ(s, p.get());
+  EXPECT_EQ(42, p->x);
+
+  polymorphic_ptr<S> p2((llvm_move(p)));
+#if !LLVM_HAS_RVALUE_REFERENCES
+  // 'p' may not have been moved from in C++98, fake it for the test.
+  p2 = p.take();
+#endif
+  EXPECT_FALSE((bool)p);
+  EXPECT_TRUE(!p);
+  EXPECT_TRUE((bool)p2);
+  EXPECT_FALSE(!p2);
+  EXPECT_EQ(s, &*p2);
+
+  using std::swap;
+  swap(p, p2);
+  EXPECT_TRUE((bool)p);
+  EXPECT_FALSE(!p);
+  EXPECT_EQ(s, &*p);
+  EXPECT_FALSE((bool)p2);
+  EXPECT_TRUE(!p2);
+
+  // Force copies and that everything survives.
+  polymorphic_ptr<S> p3 = dummy_copy(polymorphic_ptr<S>(p));
+  EXPECT_TRUE((bool)p3);
+  EXPECT_FALSE(!p3);
+  EXPECT_NE(s, &*p3);
+  EXPECT_EQ(42, p3->x);
+
+  // Force copies of null without trying to dereference anything.
+  polymorphic_ptr<S> null_copy = dummy_copy(polymorphic_ptr<S>(null));
+  EXPECT_FALSE((bool)null_copy);
+  EXPECT_TRUE(!null_copy);
+  EXPECT_EQ(null, null_copy);
+}
+
+struct Base {
+  virtual ~Base() {}
+  virtual Base *clone() = 0;
+  virtual StringRef name() { return "Base"; }
+};
+
+struct DerivedA : Base {
+  virtual DerivedA *clone() { return new DerivedA(); }
+  virtual StringRef name() { return "DerivedA"; }
+};
+struct DerivedB : Base {
+  virtual DerivedB *clone() { return new DerivedB(); }
+  virtual StringRef name() { return "DerivedB"; }
+};
+
+TEST(polymorphic_ptr_test, Polymorphism) {
+  polymorphic_ptr<Base> a(new DerivedA());
+  polymorphic_ptr<Base> b(new DerivedB());
+
+  EXPECT_EQ("DerivedA", a->name());
+  EXPECT_EQ("DerivedB", b->name());
+
+  polymorphic_ptr<Base> copy = dummy_copy(a);
+  EXPECT_NE(a, copy);
+  EXPECT_EQ("DerivedA", copy->name());
+
+  copy = dummy_copy(b);
+  EXPECT_NE(b, copy);
+  EXPECT_EQ("DerivedB", copy->name());
+
+  // Test creating a copy out of a temporary directly.
+  copy = dummy_copy<polymorphic_ptr<Base> >(new DerivedA());
+  EXPECT_NE(a, copy);
+  EXPECT_EQ("DerivedA", copy->name());
+}
+
+}
diff --git a/unittests/Analysis/CFGTest.cpp b/unittests/Analysis/CFGTest.cpp
index 2358803..e931709 100644
--- a/unittests/Analysis/CFGTest.cpp
+++ b/unittests/Analysis/CFGTest.cpp
@@ -147,6 +147,23 @@ TEST_F(IsPotentiallyReachableTest, SameBlockPath) {
   ExpectPath(true);
 }
 
+TEST_F(IsPotentiallyReachableTest, SameBlockNoLoop) {
+  ParseAssembly(
+      "define void @test() {\n"
+      "entry:\n"
+      "  br label %middle\n"
+      "middle:\n"
+      "  %B = bitcast i8 undef to i8\n"
+      "  bitcast i8 undef to i8\n"
+      "  bitcast i8 undef to i8\n"
+      "  %A = bitcast i8 undef to i8\n"
+      "  br label %nextblock\n"
+      "nextblock:\n"
+      "  ret void\n"
+      "}\n");
+  ExpectPath(false);
+}
+
 TEST_F(IsPotentiallyReachableTest, StraightNoPath) {
   ParseAssembly(
       "define void @test() {\n"
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index 53b7e9c..52702ba 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -7,11 +7,13 @@ endfunction()
 
 add_subdirectory(ADT)
 add_subdirectory(Analysis)
-add_subdirectory(ExecutionEngine)
 add_subdirectory(Bitcode)
+add_subdirectory(CodeGen)
+add_subdirectory(DebugInfo)
+add_subdirectory(ExecutionEngine)
+add_subdirectory(IR)
+add_subdirectory(MC)
+add_subdirectory(Object)
 add_subdirectory(Option)
 add_subdirectory(Support)
 add_subdirectory(Transforms)
-add_subdirectory(IR)
-add_subdirectory(DebugInfo)
-add_subdirectory(Object)
diff --git a/unittests/CodeGen/CMakeLists.txt b/unittests/CodeGen/CMakeLists.txt
new file mode 100644
index 0000000..5973bae
--- /dev/null
+++ b/unittests/CodeGen/CMakeLists.txt
@@ -0,0 +1,13 @@
+set(LLVM_LINK_COMPONENTS
+  asmprinter
+  codegen
+  support
+  )
+
+set(CodeGenSources
+  DIEHashTest.cpp
+  )
+
+add_llvm_unittest(CodeGenTests
+  ${CodeGenSources}
+  )
diff --git a/unittests/CodeGen/DIEHashTest.cpp b/unittests/CodeGen/DIEHashTest.cpp
new file mode 100644
index 0000000..8d8fc39
--- /dev/null
+++ b/unittests/CodeGen/DIEHashTest.cpp
@@ -0,0 +1,517 @@
+//===- llvm/unittest/DebugInfo/DWARFFormValueTest.cpp ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../lib/CodeGen/AsmPrinter/DIE.h"
+#include "../lib/CodeGen/AsmPrinter/DIEHash.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+TEST(DIEHashTest, Data1) {
+  DIEHash Hash;
+  DIE Die(dwarf::DW_TAG_base_type);
+  DIEInteger Size(4);
+  Die.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Size);
+  uint64_t MD5Res = Hash.computeTypeSignature(Die);
+  ASSERT_EQ(0x1AFE116E83701108ULL, MD5Res);
+}
+
+// struct {};
+TEST(DIEHashTest, TrivialType) {
+  DIE Unnamed(dwarf::DW_TAG_structure_type);
+  DIEInteger One(1);
+  Unnamed.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &One);
+
+  // Line and file number are ignored.
+  Unnamed.addValue(dwarf::DW_AT_decl_file, dwarf::DW_FORM_data1, &One);
+  Unnamed.addValue(dwarf::DW_AT_decl_line, dwarf::DW_FORM_data1, &One);
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Unnamed);
+
+  // The exact same hash GCC produces for this DIE.
+  ASSERT_EQ(0x715305ce6cfd9ad1ULL, MD5Res);
+}
+
+// struct foo { };
+TEST(DIEHashTest, NamedType) {
+  DIE Foo(dwarf::DW_TAG_structure_type);
+  DIEInteger One(1);
+  DIEString FooStr(&One, "foo");
+  Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+  Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &One);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Foo);
+
+  // The exact same hash GCC produces for this DIE.
+  ASSERT_EQ(0xd566dbd2ca5265ffULL, MD5Res);
+}
+
+// namespace space { struct foo { }; }
+TEST(DIEHashTest, NamespacedType) {
+  DIE CU(dwarf::DW_TAG_compile_unit);
+
+  DIE *Space = new DIE(dwarf::DW_TAG_namespace);
+  DIEInteger One(1);
+  DIEString SpaceStr(&One, "space");
+  Space->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &SpaceStr);
+  // DW_AT_declaration is ignored.
+  Space->addValue(dwarf::DW_AT_declaration, dwarf::DW_FORM_flag_present, &One);
+  // sibling?
+
+  DIE *Foo = new DIE(dwarf::DW_TAG_structure_type);
+  DIEString FooStr(&One, "foo");
+  Foo->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+  Foo->addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &One);
+
+  Space->addChild(Foo);
+  CU.addChild(Space);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(*Foo);
+
+  // The exact same hash GCC produces for this DIE.
+  ASSERT_EQ(0x7b80381fd17f1e33ULL, MD5Res);
+}
+
+// struct { int member; };
+TEST(DIEHashTest, TypeWithMember) {
+  DIE Unnamed(dwarf::DW_TAG_structure_type);
+  DIEInteger Four(4);
+  Unnamed.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Four);
+
+  DIE *Member = new DIE(dwarf::DW_TAG_member);
+  DIEString MemberStr(&Four, "member");
+  Member->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemberStr);
+  DIEInteger Zero(0);
+  Member->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1, &Zero);
+
+  Unnamed.addChild(Member);
+
+  DIE Int(dwarf::DW_TAG_base_type);
+  DIEString IntStr(&Four, "int");
+  Int.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &IntStr);
+  Int.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Four);
+  DIEInteger Five(5);
+  Int.addValue(dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, &Five);
+
+  DIEEntry IntRef(&Int);
+  Member->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &IntRef);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Unnamed);
+
+  ASSERT_EQ(0x5646aa436b7e07c6ULL, MD5Res);
+}
+
+// struct foo { int mem1, mem2; };
+TEST(DIEHashTest, ReusedType) {
+  DIE Unnamed(dwarf::DW_TAG_structure_type);
+  DIEInteger Eight(8);
+  Unnamed.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+
+  DIE *Mem1 = new DIE(dwarf::DW_TAG_member);
+  DIEInteger Four(4);
+  DIEString Mem1Str(&Four, "mem1");
+  Mem1->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &Mem1Str);
+  DIEInteger Zero(0);
+  Mem1->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1, &Zero);
+
+  Unnamed.addChild(Mem1);
+
+  DIE *Mem2 = new DIE(dwarf::DW_TAG_member);
+  DIEString Mem2Str(&Four, "mem2");
+  Mem2->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &Mem2Str);
+  Mem2->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1, &Four);
+
+  Unnamed.addChild(Mem2);
+
+  DIE Int(dwarf::DW_TAG_base_type);
+  DIEString IntStr(&Four, "int");
+  Int.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &IntStr);
+  Int.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Four);
+  DIEInteger Five(5);
+  Int.addValue(dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, &Five);
+
+  DIEEntry IntRef(&Int);
+  Mem1->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &IntRef);
+  Mem2->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &IntRef);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Unnamed);
+
+  ASSERT_EQ(0x3a7dc3ed7b76b2f8ULL, MD5Res);
+}
+
+// struct foo { static foo f; };
+TEST(DIEHashTest, RecursiveType) {
+  DIE Foo(dwarf::DW_TAG_structure_type);
+  DIEInteger One(1);
+  Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &One);
+  DIEString FooStr(&One, "foo");
+  Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+
+  DIE *Mem = new DIE(dwarf::DW_TAG_member);
+  DIEString MemStr(&One, "mem");
+  Mem->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemStr);
+  DIEEntry FooRef(&Foo);
+  Mem->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &FooRef);
+  // DW_AT_external and DW_AT_declaration are ignored anyway, so skip them.
+
+  Foo.addChild(Mem);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Foo);
+
+  ASSERT_EQ(0x73d8b25aef227b06ULL, MD5Res);
+}
+
+// struct foo { foo *mem; };
+TEST(DIEHashTest, Pointer) {
+  DIE Foo(dwarf::DW_TAG_structure_type);
+  DIEInteger Eight(8);
+  Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+  DIEString FooStr(&Eight, "foo");
+  Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+
+  DIE *Mem = new DIE(dwarf::DW_TAG_member);
+  DIEString MemStr(&Eight, "mem");
+  Mem->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemStr);
+  DIEInteger Zero(0);
+  Mem->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1, &Zero);
+
+  DIE FooPtr(dwarf::DW_TAG_pointer_type);
+  FooPtr.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+  DIEEntry FooRef(&Foo);
+  FooPtr.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &FooRef);
+
+  DIEEntry FooPtrRef(&FooPtr);
+  Mem->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &FooPtrRef);
+
+  Foo.addChild(Mem);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Foo);
+
+  ASSERT_EQ(0x74ea73862e8708d2ULL, MD5Res);
+}
+
+// struct foo { foo &mem; };
+TEST(DIEHashTest, Reference) {
+  DIE Foo(dwarf::DW_TAG_structure_type);
+  DIEInteger Eight(8);
+  Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+  DIEString FooStr(&Eight, "foo");
+  Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+
+  DIE *Mem = new DIE(dwarf::DW_TAG_member);
+  DIEString MemStr(&Eight, "mem");
+  Mem->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemStr);
+  DIEInteger Zero(0);
+  Mem->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1, &Zero);
+
+  DIE FooRef(dwarf::DW_TAG_reference_type);
+  FooRef.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+  DIEEntry FooEntry(&Foo);
+  FooRef.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &FooEntry);
+
+  DIE FooRefConst(dwarf::DW_TAG_const_type);
+  DIEEntry FooRefRef(&FooRef);
+  FooRefConst.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &FooRefRef);
+
+  DIEEntry FooRefConstRef(&FooRefConst);
+  Mem->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &FooRefConstRef);
+
+  Foo.addChild(Mem);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Foo);
+
+  ASSERT_EQ(0xa0b15f467ad4525bULL, MD5Res);
+}
+
+// struct foo { foo &&mem; };
+TEST(DIEHashTest, RValueReference) {
+  DIE Foo(dwarf::DW_TAG_structure_type);
+  DIEInteger Eight(8);
+  Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+  DIEString FooStr(&Eight, "foo");
+  Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+
+  DIE *Mem = new DIE(dwarf::DW_TAG_member);
+  DIEString MemStr(&Eight, "mem");
+  Mem->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemStr);
+  DIEInteger Zero(0);
+  Mem->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1, &Zero);
+
+  DIE FooRef(dwarf::DW_TAG_rvalue_reference_type);
+  FooRef.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+  DIEEntry FooEntry(&Foo);
+  FooRef.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &FooEntry);
+
+  DIE FooRefConst(dwarf::DW_TAG_const_type);
+  DIEEntry FooRefRef(&FooRef);
+  FooRefConst.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &FooRefRef);
+
+  DIEEntry FooRefConstRef(&FooRefConst);
+  Mem->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &FooRefConstRef);
+
+  Foo.addChild(Mem);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Foo);
+
+  ASSERT_EQ(0xad211c8c3b31e57ULL, MD5Res);
+}
+
+// struct foo { foo foo::*mem; };
+TEST(DIEHashTest, PtrToMember) {
+  DIE Foo(dwarf::DW_TAG_structure_type);
+  DIEInteger Eight(8);
+  Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+  DIEString FooStr(&Eight, "foo");
+  Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+
+  DIE *Mem = new DIE(dwarf::DW_TAG_member);
+  DIEString MemStr(&Eight, "mem");
+  Mem->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemStr);
+  DIEInteger Zero(0);
+  Mem->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1, &Zero);
+
+  DIE PtrToFooMem(dwarf::DW_TAG_ptr_to_member_type);
+  DIEEntry FooEntry(&Foo);
+  PtrToFooMem.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &FooEntry);
+  PtrToFooMem.addValue(dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, &FooEntry);
+
+  DIEEntry PtrToFooMemRef(&PtrToFooMem);
+  Mem->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &PtrToFooMemRef);
+
+  Foo.addChild(Mem);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Foo);
+
+  ASSERT_EQ(0x852e0c9ff7c04ebULL, MD5Res);
+}
+
+// Check that the hash for a pointer-to-member matches regardless of whether the
+// pointed-to type is a declaration or a definition.
+//
+//   struct bar; // { };
+//   struct foo { bar foo::*mem; };
+TEST(DIEHashTest, PtrToMemberDeclDefMatch) {
+  DIEInteger Zero(0);
+  DIEInteger One(1);
+  DIEInteger Eight(8);
+  DIEString FooStr(&Eight, "foo");
+  DIEString BarStr(&Eight, "bar");
+  DIEString MemStr(&Eight, "mem");
+  uint64_t MD5ResDecl;
+  {
+    DIE Bar(dwarf::DW_TAG_structure_type);
+    Bar.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &BarStr);
+    Bar.addValue(dwarf::DW_AT_declaration, dwarf::DW_FORM_flag_present, &One);
+
+    DIE Foo(dwarf::DW_TAG_structure_type);
+    Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+    Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+
+    DIE *Mem = new DIE(dwarf::DW_TAG_member);
+    Mem->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemStr);
+    Mem->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1,
+                  &Zero);
+
+    DIE PtrToFooMem(dwarf::DW_TAG_ptr_to_member_type);
+    DIEEntry BarEntry(&Bar);
+    PtrToFooMem.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &BarEntry);
+    DIEEntry FooEntry(&Foo);
+    PtrToFooMem.addValue(dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
+                         &FooEntry);
+
+    DIEEntry PtrToFooMemRef(&PtrToFooMem);
+    Mem->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &PtrToFooMemRef);
+
+    Foo.addChild(Mem);
+
+    MD5ResDecl = DIEHash().computeTypeSignature(Foo);
+  }
+  uint64_t MD5ResDef;
+  {
+    DIE Bar(dwarf::DW_TAG_structure_type);
+    Bar.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &BarStr);
+    Bar.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &One);
+
+    DIE Foo(dwarf::DW_TAG_structure_type);
+    Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+    Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+
+    DIE *Mem = new DIE(dwarf::DW_TAG_member);
+    Mem->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemStr);
+    Mem->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1,
+                  &Zero);
+
+    DIE PtrToFooMem(dwarf::DW_TAG_ptr_to_member_type);
+    DIEEntry BarEntry(&Bar);
+    PtrToFooMem.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &BarEntry);
+    DIEEntry FooEntry(&Foo);
+    PtrToFooMem.addValue(dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
+                         &FooEntry);
+
+    DIEEntry PtrToFooMemRef(&PtrToFooMem);
+    Mem->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &PtrToFooMemRef);
+
+    Foo.addChild(Mem);
+
+    MD5ResDef = DIEHash().computeTypeSignature(Foo);
+  }
+  ASSERT_EQ(MD5ResDef, MD5ResDecl);
+}
+
+// Check that the hash for a pointer-to-member matches regardless of whether the
+// pointed-to type is a declaration or a definition.
+//
+//   struct bar; // { };
+//   struct foo { bar bar::*mem; };
+TEST(DIEHashTest, PtrToMemberDeclDefMisMatch) {
+  DIEInteger Zero(0);
+  DIEInteger One(1);
+  DIEInteger Eight(8);
+  DIEString FooStr(&Eight, "foo");
+  DIEString BarStr(&Eight, "bar");
+  DIEString MemStr(&Eight, "mem");
+  uint64_t MD5ResDecl;
+  {
+    DIE Bar(dwarf::DW_TAG_structure_type);
+    Bar.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &BarStr);
+    Bar.addValue(dwarf::DW_AT_declaration, dwarf::DW_FORM_flag_present, &One);
+
+    DIE Foo(dwarf::DW_TAG_structure_type);
+    Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+    Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+
+    DIE *Mem = new DIE(dwarf::DW_TAG_member);
+    Mem->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemStr);
+    Mem->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1,
+                  &Zero);
+
+    DIE PtrToFooMem(dwarf::DW_TAG_ptr_to_member_type);
+    DIEEntry BarEntry(&Bar);
+    PtrToFooMem.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &BarEntry);
+    PtrToFooMem.addValue(dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
+                         &BarEntry);
+
+    DIEEntry PtrToFooMemRef(&PtrToFooMem);
+    Mem->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &PtrToFooMemRef);
+
+    Foo.addChild(Mem);
+
+    MD5ResDecl = DIEHash().computeTypeSignature(Foo);
+  }
+  uint64_t MD5ResDef;
+  {
+    DIE Bar(dwarf::DW_TAG_structure_type);
+    Bar.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &BarStr);
+    Bar.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &One);
+
+    DIE Foo(dwarf::DW_TAG_structure_type);
+    Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+    Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+
+    DIE *Mem = new DIE(dwarf::DW_TAG_member);
+    Mem->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemStr);
+    Mem->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1,
+                  &Zero);
+
+    DIE PtrToFooMem(dwarf::DW_TAG_ptr_to_member_type);
+    DIEEntry BarEntry(&Bar);
+    PtrToFooMem.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &BarEntry);
+    PtrToFooMem.addValue(dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
+                         &BarEntry);
+
+    DIEEntry PtrToFooMemRef(&PtrToFooMem);
+    Mem->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &PtrToFooMemRef);
+
+    Foo.addChild(Mem);
+
+    MD5ResDef = DIEHash().computeTypeSignature(Foo);
+  }
+  // FIXME: This seems to be a bug in the DWARF type hashing specification that
+  // only uses the brief name hashing for types referenced via DW_AT_type. In
+  // this case the type is referenced via DW_AT_containing_type and full hashing
+  // causes a hash to differ when the containing type is a declaration in one TU
+  // and a definition in another.
+  ASSERT_NE(MD5ResDef, MD5ResDecl);
+}
+
+// struct { } a;
+// struct foo { decltype(a) mem; };
+TEST(DIEHashTest, RefUnnamedType) {
+  DIEInteger Zero(0);
+  DIEInteger One(1);
+  DIEInteger Eight(8);
+  DIEString FooStr(&Zero, "foo");
+  DIEString MemStr(&Zero, "mem");
+
+  DIE Unnamed(dwarf::DW_TAG_structure_type);
+  Unnamed.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &One);
+
+  DIE Foo(dwarf::DW_TAG_structure_type);
+  Foo.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+  Foo.addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+
+  DIE *Mem = new DIE(dwarf::DW_TAG_member);
+  Mem->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &MemStr);
+  Mem->addValue(dwarf::DW_AT_data_member_location, dwarf::DW_FORM_data1, &Zero);
+
+  DIE UnnamedPtr(dwarf::DW_TAG_pointer_type);
+  UnnamedPtr.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &Eight);
+  DIEEntry UnnamedRef(&Unnamed);
+  UnnamedPtr.addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &UnnamedRef);
+
+  DIEEntry UnnamedPtrRef(&UnnamedPtr);
+  Mem->addValue(dwarf::DW_AT_type, dwarf::DW_FORM_ref4, &UnnamedPtrRef);
+
+  Foo.addChild(Mem);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Foo);
+
+  ASSERT_EQ(0x954e026f01c02529ULL, MD5Res);
+}
+
+// struct { struct bar { }; };
+TEST(DIEHashTest, NestedType) {
+  DIE Unnamed(dwarf::DW_TAG_structure_type);
+  DIEInteger One(1);
+  Unnamed.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &One);
+
+  DIE *Foo = new DIE(dwarf::DW_TAG_structure_type);
+  DIEString FooStr(&One, "foo");
+  Foo->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FooStr);
+  Foo->addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &One);
+
+  Unnamed.addChild(Foo);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Unnamed);
+
+  // The exact same hash GCC produces for this DIE.
+  ASSERT_EQ(0xde8a3b7b43807f4aULL, MD5Res);
+}
+
+// struct { static void func(); };
+TEST(DIEHashTest, MemberFunc) {
+  DIE Unnamed(dwarf::DW_TAG_structure_type);
+  DIEInteger One(1);
+  Unnamed.addValue(dwarf::DW_AT_byte_size, dwarf::DW_FORM_data1, &One);
+
+  DIE *Func = new DIE(dwarf::DW_TAG_subprogram);
+  DIEString FuncStr(&One, "func");
+  Func->addValue(dwarf::DW_AT_name, dwarf::DW_FORM_strp, &FuncStr);
+
+  Unnamed.addChild(Func);
+
+  uint64_t MD5Res = DIEHash().computeTypeSignature(Unnamed);
+
+  // The exact same hash GCC produces for this DIE.
+  ASSERT_EQ(0xd36a1b6dfb604ba0ULL, MD5Res);
+}
+}
diff --git a/unittests/CodeGen/Makefile b/unittests/CodeGen/Makefile
new file mode 100644
index 0000000..4f07017
--- /dev/null
+++ b/unittests/CodeGen/Makefile
@@ -0,0 +1,16 @@
+##===- unittests/DebugInfo/Makefile ------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+TESTNAME = CodeGen
+LINK_COMPONENTS := asmprinter codegen support
+
+include $(LEVEL)/Makefile.config
+
+include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/DebugInfo/DWARFFormValueTest.cpp b/unittests/DebugInfo/DWARFFormValueTest.cpp
index 04b859b..38b932e 100644
--- a/unittests/DebugInfo/DWARFFormValueTest.cpp
+++ b/unittests/DebugInfo/DWARFFormValueTest.cpp
@@ -18,14 +18,32 @@ namespace {
 TEST(DWARFFormValue, FixedFormSizes) {
   // Size of DW_FORM_addr and DW_FORM_ref_addr are equal in DWARF2,
   // DW_FORM_ref_addr is always 4 bytes in DWARF32 starting from DWARF3.
-  const uint8_t *sizes = DWARFFormValue::getFixedFormSizes(4, 2);
+  ArrayRef<uint8_t> sizes = DWARFFormValue::getFixedFormSizes(4, 2);
   EXPECT_EQ(sizes[DW_FORM_addr], sizes[DW_FORM_ref_addr]);
   sizes = DWARFFormValue::getFixedFormSizes(8, 2);
   EXPECT_EQ(sizes[DW_FORM_addr], sizes[DW_FORM_ref_addr]);
   sizes = DWARFFormValue::getFixedFormSizes(8, 3);
   EXPECT_EQ(4, sizes[DW_FORM_ref_addr]);
   // Check that we don't have fixed form sizes for weird address sizes.
-  EXPECT_EQ(0, DWARFFormValue::getFixedFormSizes(16, 2));
+  EXPECT_EQ(0U, DWARFFormValue::getFixedFormSizes(16, 2).size());
+}
+
+bool isFormClass(uint16_t Form, DWARFFormValue::FormClass FC) {
+  return DWARFFormValue(Form).isFormClass(FC);
+}
+
+TEST(DWARFFormValue, FormClass) {
+  EXPECT_TRUE(isFormClass(DW_FORM_addr, DWARFFormValue::FC_Address));
+  EXPECT_FALSE(isFormClass(DW_FORM_data8, DWARFFormValue::FC_Address));
+  EXPECT_TRUE(isFormClass(DW_FORM_data8, DWARFFormValue::FC_Constant));
+  EXPECT_TRUE(isFormClass(DW_FORM_data8, DWARFFormValue::FC_SectionOffset));
+  EXPECT_TRUE(
+      isFormClass(DW_FORM_sec_offset, DWARFFormValue::FC_SectionOffset));
+  EXPECT_TRUE(isFormClass(DW_FORM_GNU_str_index, DWARFFormValue::FC_String));
+  EXPECT_TRUE(isFormClass(DW_FORM_GNU_addr_index, DWARFFormValue::FC_Address));
+  EXPECT_FALSE(isFormClass(DW_FORM_ref_addr, DWARFFormValue::FC_Address));
+  EXPECT_TRUE(isFormClass(DW_FORM_ref_addr, DWARFFormValue::FC_Reference));
+  EXPECT_TRUE(isFormClass(DW_FORM_ref_sig8, DWARFFormValue::FC_Reference));
 }
 
 } // end anonymous namespace
diff --git a/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp b/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
index 68ca53d..731f780 100644
--- a/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
+++ b/unittests/ExecutionEngine/JIT/JITMemoryManagerTest.cpp
@@ -281,11 +281,11 @@ TEST(JITMemoryManagerTest, TestManyStubs) {
 TEST(JITMemoryManagerTest, AllocateSection) {
   OwningPtr<JITMemoryManager> MemMgr(
       JITMemoryManager::CreateDefaultMemManager());
-  uint8_t *code1 = MemMgr->allocateCodeSection(256, 0, 1);
-  uint8_t *data1 = MemMgr->allocateDataSection(256, 16, 2, true);
-  uint8_t *code2 = MemMgr->allocateCodeSection(257, 32, 3);
-  uint8_t *data2 = MemMgr->allocateDataSection(256, 64, 4, false);
-  uint8_t *code3 = MemMgr->allocateCodeSection(258, 64, 5);
+  uint8_t *code1 = MemMgr->allocateCodeSection(256, 0, 1, StringRef());
+  uint8_t *data1 = MemMgr->allocateDataSection(256, 16, 2, StringRef(), true);
+  uint8_t *code2 = MemMgr->allocateCodeSection(257, 32, 3, StringRef());
+  uint8_t *data2 = MemMgr->allocateDataSection(256, 64, 4, StringRef(), false);
+  uint8_t *code3 = MemMgr->allocateCodeSection(258, 64, 5, StringRef());
 
   EXPECT_NE((uint8_t*)0, code1);
   EXPECT_NE((uint8_t*)0, code2);
diff --git a/unittests/ExecutionEngine/JIT/JITTest.cpp b/unittests/ExecutionEngine/JIT/JITTest.cpp
index 3b94d76..4c7b1e2 100644
--- a/unittests/ExecutionEngine/JIT/JITTest.cpp
+++ b/unittests/ExecutionEngine/JIT/JITTest.cpp
@@ -135,13 +135,17 @@ public:
       EndFunctionBodyCall(F, FunctionStart, FunctionEnd));
     Base->endFunctionBody(F, FunctionStart, FunctionEnd);
   }
-  virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                       unsigned SectionID, bool IsReadOnly) {
-    return Base->allocateDataSection(Size, Alignment, SectionID, IsReadOnly);
+  virtual uint8_t *allocateDataSection(
+    uintptr_t Size, unsigned Alignment, unsigned SectionID,
+    StringRef SectionName, bool IsReadOnly) {
+    return Base->allocateDataSection(
+      Size, Alignment, SectionID, SectionName, IsReadOnly);
   }
-  virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
-                                       unsigned SectionID) {
-    return Base->allocateCodeSection(Size, Alignment, SectionID);
+  virtual uint8_t *allocateCodeSection(
+    uintptr_t Size, unsigned Alignment, unsigned SectionID,
+    StringRef SectionName) {
+    return Base->allocateCodeSection(
+      Size, Alignment, SectionID, SectionName);
   }
   virtual bool finalizeMemory(std::string *ErrMsg) { return false; }
   virtual uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) {
@@ -717,16 +721,4 @@ TEST(LazyLoadedJITTest, EagerCompiledRecursionThroughGhost) {
 }
 #endif // !defined(__arm__) && !defined(__powerpc__) && !defined(__s390__)
 
-// This code is copied from JITEventListenerTest, but it only runs once for all
-// the tests in this directory.  Everything seems fine, but that's strange
-// behavior.
-class JITEnvironment : public testing::Environment {
-  virtual void SetUp() {
-    // Required to create a JIT.
-    InitializeNativeTarget();
-  }
-};
-testing::Environment* const jit_env =
-  testing::AddGlobalTestEnvironment(new JITEnvironment);
-
 }
diff --git a/unittests/ExecutionEngine/MCJIT/CMakeLists.txt b/unittests/ExecutionEngine/MCJIT/CMakeLists.txt
index 922cb7e..ed43099 100644
--- a/unittests/ExecutionEngine/MCJIT/CMakeLists.txt
+++ b/unittests/ExecutionEngine/MCJIT/CMakeLists.txt
@@ -2,7 +2,6 @@ set(LLVM_LINK_COMPONENTS
   asmparser
   bitreader
   bitwriter
-  jit
   mcjit
   nativecodegen
   )
@@ -11,6 +10,7 @@ set(MCJITTestsSources
   MCJITTest.cpp
   MCJITCAPITest.cpp
   MCJITMemoryManagerTest.cpp
+  MCJITMultipleModuleTest.cpp
   MCJITObjectCacheTest.cpp
   )
 
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
index c434a7c..46d6d9b 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITCAPITest.cpp
@@ -28,18 +28,20 @@ static bool didCallAllocateCodeSection;
 
 static uint8_t *roundTripAllocateCodeSection(void *object, uintptr_t size,
                                              unsigned alignment,
-                                             unsigned sectionID) {
+                                             unsigned sectionID,
+                                             const char *sectionName) {
   didCallAllocateCodeSection = true;
   return static_cast<SectionMemoryManager*>(object)->allocateCodeSection(
-    size, alignment, sectionID);
+    size, alignment, sectionID, sectionName);
 }
 
 static uint8_t *roundTripAllocateDataSection(void *object, uintptr_t size,
                                              unsigned alignment,
                                              unsigned sectionID,
+                                             const char *sectionName,
                                              LLVMBool isReadOnly) {
   return static_cast<SectionMemoryManager*>(object)->allocateDataSection(
-    size, alignment, sectionID, isReadOnly);
+    size, alignment, sectionID, sectionName, isReadOnly);
 }
 
 static LLVMBool roundTripFinalizeMemory(void *object, char **errMsg) {
@@ -57,6 +59,7 @@ static void roundTripDestroy(void *object) {
   delete static_cast<SectionMemoryManager*>(object);
 }
 
+namespace {
 class MCJITCAPITest : public testing::Test, public MCJITTestAPICommon {
 protected:
   MCJITCAPITest() {
@@ -154,6 +157,7 @@ protected:
   LLVMExecutionEngineRef Engine;
   char *Error;
 };
+} // end anonymous namespace
 
 TEST_F(MCJITCAPITest, simple_function) {
   SKIP_UNSUPPORTED_PLATFORM;
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp
index f6dbf98..c24346d 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITMemoryManagerTest.cpp
@@ -19,10 +19,10 @@ namespace {
 TEST(MCJITMemoryManagerTest, BasicAllocations) {
   OwningPtr<SectionMemoryManager> MemMgr(new SectionMemoryManager());
 
-  uint8_t *code1 = MemMgr->allocateCodeSection(256, 0, 1);
-  uint8_t *data1 = MemMgr->allocateDataSection(256, 0, 2, true);
-  uint8_t *code2 = MemMgr->allocateCodeSection(256, 0, 3);
-  uint8_t *data2 = MemMgr->allocateDataSection(256, 0, 4, false);
+  uint8_t *code1 = MemMgr->allocateCodeSection(256, 0, 1, "");
+  uint8_t *data1 = MemMgr->allocateDataSection(256, 0, 2, "", true);
+  uint8_t *code2 = MemMgr->allocateCodeSection(256, 0, 3, "");
+  uint8_t *data2 = MemMgr->allocateDataSection(256, 0, 4, "", false);
 
   EXPECT_NE((uint8_t*)0, code1);
   EXPECT_NE((uint8_t*)0, code2);
@@ -52,10 +52,10 @@ TEST(MCJITMemoryManagerTest, BasicAllocations) {
 TEST(MCJITMemoryManagerTest, LargeAllocations) {
   OwningPtr<SectionMemoryManager> MemMgr(new SectionMemoryManager());
 
-  uint8_t *code1 = MemMgr->allocateCodeSection(0x100000, 0, 1);
-  uint8_t *data1 = MemMgr->allocateDataSection(0x100000, 0, 2, true);
-  uint8_t *code2 = MemMgr->allocateCodeSection(0x100000, 0, 3);
-  uint8_t *data2 = MemMgr->allocateDataSection(0x100000, 0, 4, false);
+  uint8_t *code1 = MemMgr->allocateCodeSection(0x100000, 0, 1, "");
+  uint8_t *data1 = MemMgr->allocateDataSection(0x100000, 0, 2, "", true);
+  uint8_t *code2 = MemMgr->allocateCodeSection(0x100000, 0, 3, "");
+  uint8_t *data2 = MemMgr->allocateDataSection(0x100000, 0, 4, "", false);
 
   EXPECT_NE((uint8_t*)0, code1);
   EXPECT_NE((uint8_t*)0, code2);
@@ -91,8 +91,8 @@ TEST(MCJITMemoryManagerTest, ManyAllocations) {
   for (unsigned i = 0; i < 10000; ++i) {
     const bool isReadOnly = i % 2 == 0;
 
-    code[i] = MemMgr->allocateCodeSection(32, 0, 1);
-    data[i] = MemMgr->allocateDataSection(32, 0, 2, isReadOnly);
+    code[i] = MemMgr->allocateCodeSection(32, 0, 1, "");
+    data[i] = MemMgr->allocateDataSection(32, 0, 2, "", isReadOnly);
 
     for (unsigned j = 0; j < 32; j++) {
       code[i][j] = 1 + (i % 254);
@@ -130,8 +130,8 @@ TEST(MCJITMemoryManagerTest, ManyVariedAllocations) {
     bool isReadOnly = i % 3 == 0;
     unsigned Align = 8 << (i % 4);
 
-    code[i] = MemMgr->allocateCodeSection(CodeSize, Align, i);
-    data[i] = MemMgr->allocateDataSection(DataSize, Align, i + 10000,
+    code[i] = MemMgr->allocateCodeSection(CodeSize, Align, i, "");
+    data[i] = MemMgr->allocateDataSection(DataSize, Align, i + 10000, "",
                                           isReadOnly);
 
     for (unsigned j = 0; j < CodeSize; j++) {
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp
new file mode 100644
index 0000000..7c3239e
--- /dev/null
+++ b/unittests/ExecutionEngine/MCJIT/MCJITMultipleModuleTest.cpp
@@ -0,0 +1,395 @@
+//===- MCJITMultipeModuleTest.cpp - Unit tests for the MCJIT---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This test suite verifies MCJIT for handling multiple modules in a single
+// ExecutionEngine by building multiple modules, making function calls across
+// modules, accessing global variables, etc.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "MCJITTestBase.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+class MCJITMultipleModuleTest : public testing::Test, public MCJITTestBase {};
+
+// FIXME: ExecutionEngine has no support empty modules
+/*
+TEST_F(MCJITMultipleModuleTest, multiple_empty_modules) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  createJIT(M.take());
+  // JIT-compile
+  EXPECT_NE(0, TheJIT->getObjectImage())
+    << "Unable to generate executable loaded object image";
+
+  TheJIT->addModule(createEmptyModule("<other module>"));
+  TheJIT->addModule(createEmptyModule("<other other module>"));
+
+  // JIT again
+  EXPECT_NE(0, TheJIT->getObjectImage())
+    << "Unable to generate executable loaded object image";
+}
+*/
+
+// Helper Function to test add operation
+void checkAdd(uint64_t ptr) {
+  ASSERT_TRUE(ptr != 0) << "Unable to get pointer to function.";
+  int (*AddPtr)(int, int) = (int (*)(int, int))ptr;
+  EXPECT_EQ(0, AddPtr(0, 0));
+  EXPECT_EQ(1, AddPtr(1, 0));
+  EXPECT_EQ(3, AddPtr(1, 2));
+  EXPECT_EQ(-5, AddPtr(-2, -3));
+  EXPECT_EQ(30, AddPtr(10, 20));
+  EXPECT_EQ(-30, AddPtr(-10, -20));
+  EXPECT_EQ(-40, AddPtr(-10, -30));
+}
+
+void checkAccumulate(uint64_t ptr) {
+  ASSERT_TRUE(ptr != 0) << "Unable to get pointer to function.";
+  int32_t (*FPtr)(int32_t) = (int32_t (*)(int32_t))(intptr_t)ptr;
+  EXPECT_EQ(0, FPtr(0));
+  EXPECT_EQ(1, FPtr(1));
+  EXPECT_EQ(3, FPtr(2));
+  EXPECT_EQ(6, FPtr(3));
+  EXPECT_EQ(10, FPtr(4));
+  EXPECT_EQ(15, FPtr(5));
+}
+
+// FIXME: ExecutionEngine has no support empty modules
+/*
+TEST_F(MCJITMultipleModuleTest, multiple_empty_modules) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  createJIT(M.take());
+  // JIT-compile
+  EXPECT_NE(0, TheJIT->getObjectImage())
+    << "Unable to generate executable loaded object image";
+
+  TheJIT->addModule(createEmptyModule("<other module>"));
+  TheJIT->addModule(createEmptyModule("<other other module>"));
+
+  // JIT again
+  EXPECT_NE(0, TheJIT->getObjectImage())
+    << "Unable to generate executable loaded object image";
+}
+*/
+
+// Module A { Function FA },
+// Module B { Function FB },
+// execute FA then FB
+TEST_F(MCJITMultipleModuleTest, two_module_case) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B;
+  Function *FA, *FB;
+  createTwoModuleCase(A, FA, B, FB);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FA->getName().str());
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FB->getName().str());
+  checkAdd(ptr);
+}
+
+// Module A { Function FA },
+// Module B { Function FB },
+// execute FB then FA
+TEST_F(MCJITMultipleModuleTest, two_module_reverse_case) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B;
+  Function *FA, *FB;
+  createTwoModuleCase(A, FA, B, FB);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FB->getName().str());
+  TheJIT->finalizeObject();
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FA->getName().str());
+  checkAdd(ptr);
+}
+
+// Module A { Function FA },
+// Module B { Extern FA, Function FB which calls FA },
+// execute FB then FA
+TEST_F(MCJITMultipleModuleTest, two_module_extern_reverse_case) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B;
+  Function *FA, *FB;
+  createTwoModuleExternCase(A, FA, B, FB);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FB->getName().str());
+  TheJIT->finalizeObject();
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FA->getName().str());
+  checkAdd(ptr);
+}
+
+// Module A { Function FA },
+// Module B { Extern FA, Function FB which calls FA },
+// execute FA then FB
+TEST_F(MCJITMultipleModuleTest, two_module_extern_case) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B;
+  Function *FA, *FB;
+  createTwoModuleExternCase(A, FA, B, FB);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FA->getName().str());
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FB->getName().str());
+  checkAdd(ptr);
+}
+
+// Module A { Function FA1, Function FA2 which calls FA1 },
+// Module B { Extern FA1, Function FB which calls FA1 },
+// execute FB then FA2
+TEST_F(MCJITMultipleModuleTest, two_module_consecutive_call_case) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B;
+  Function *FA1, *FA2, *FB;
+  createTwoModuleExternCase(A, FA1, B, FB);
+  FA2 = insertSimpleCallFunction<int32_t(int32_t, int32_t)>(A.get(), FA1);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FB->getName().str());
+  TheJIT->finalizeObject();
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FA2->getName().str());
+  checkAdd(ptr);
+}
+
+// TODO:
+// Module A { Extern Global GVB, Global Variable GVA, Function FA loads GVB },
+// Module B { Extern Global GVA, Global Variable GVB, Function FB loads GVA },
+
+
+// Module A { Global Variable GVA, Function FA loads GVA },
+// Module B { Global Variable GVB, Function FB loads GVB },
+// execute FB then FA
+TEST_F(MCJITMultipleModuleTest, two_module_global_variables_case) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B;
+  Function *FA, *FB;
+  GlobalVariable *GVA, *GVB;
+  A.reset(createEmptyModule("A"));
+  B.reset(createEmptyModule("B"));
+
+  int32_t initialNum = 7;
+  GVA = insertGlobalInt32(A.get(), "GVA", initialNum);
+  GVB = insertGlobalInt32(B.get(), "GVB", initialNum);
+  FA = startFunction<int32_t(void)>(A.get(), "FA");
+  endFunctionWithRet(FA, Builder.CreateLoad(GVA));
+  FB = startFunction<int32_t(void)>(B.get(), "FB");
+  endFunctionWithRet(FB, Builder.CreateLoad(GVB));
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+
+  uint64_t FBPtr = TheJIT->getFunctionAddress(FB->getName().str());
+  TheJIT->finalizeObject();
+  EXPECT_TRUE(0 != FBPtr);
+  int32_t(*FuncPtr)(void) = (int32_t(*)(void))FBPtr;
+  EXPECT_EQ(initialNum, FuncPtr())
+    << "Invalid value for global returned from JITted function in module B";
+
+  uint64_t FAPtr = TheJIT->getFunctionAddress(FA->getName().str());
+  EXPECT_TRUE(0 != FAPtr);
+  FuncPtr = (int32_t(*)(void))FAPtr;
+  EXPECT_EQ(initialNum, FuncPtr())
+    << "Invalid value for global returned from JITted function in module A";
+}
+
+// Module A { Function FA },
+// Module B { Extern FA, Function FB which calls FA },
+// Module C { Extern FA, Function FC which calls FA },
+// execute FC, FB, FA
+TEST_F(MCJITMultipleModuleTest, three_module_case) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B, C;
+  Function *FA, *FB, *FC;
+  createThreeModuleCase(A, FA, B, FB, C, FC);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+  TheJIT->addModule(C.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FC->getName().str());
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FB->getName().str());
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FA->getName().str());
+  checkAdd(ptr);
+}
+
+// Module A { Function FA },
+// Module B { Extern FA, Function FB which calls FA },
+// Module C { Extern FA, Function FC which calls FA },
+// execute FA, FB, FC
+TEST_F(MCJITMultipleModuleTest, three_module_case_reverse_order) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B, C;
+  Function *FA, *FB, *FC;
+  createThreeModuleCase(A, FA, B, FB, C, FC);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+  TheJIT->addModule(C.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FA->getName().str());
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FB->getName().str());
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FC->getName().str());
+  checkAdd(ptr);
+}
+
+// Module A { Function FA },
+// Module B { Extern FA, Function FB which calls FA },
+// Module C { Extern FB, Function FC which calls FB },
+// execute FC, FB, FA
+TEST_F(MCJITMultipleModuleTest, three_module_chain_case) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B, C;
+  Function *FA, *FB, *FC;
+  createThreeModuleChainedCallsCase(A, FA, B, FB, C, FC);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+  TheJIT->addModule(C.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FC->getName().str());
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FB->getName().str());
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FA->getName().str());
+  checkAdd(ptr);
+}
+
+// Module A { Function FA },
+// Module B { Extern FA, Function FB which calls FA },
+// Module C { Extern FB, Function FC which calls FB },
+// execute FA, FB, FC
+TEST_F(MCJITMultipleModuleTest, three_modules_chain_case_reverse_order) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B, C;
+  Function *FA, *FB, *FC;
+  createThreeModuleChainedCallsCase(A, FA, B, FB, C, FC);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+  TheJIT->addModule(C.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FA->getName().str());
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FB->getName().str());
+  checkAdd(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FC->getName().str());
+  checkAdd(ptr);
+}
+
+// Module A { Extern FB, Function FA which calls FB1 },
+// Module B { Extern FA, Function FB1, Function FB2 which calls FA },
+// execute FA, then FB1
+// FIXME: this test case is not supported by MCJIT
+TEST_F(MCJITMultipleModuleTest, cross_module_dependency_case) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B;
+  Function *FA, *FB1, *FB2;
+  createCrossModuleRecursiveCase(A, FA, B, FB1, FB2);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FA->getName().str());
+  checkAccumulate(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FB1->getName().str());
+  checkAccumulate(ptr);
+}
+
+// Module A { Extern FB, Function FA which calls FB1 },
+// Module B { Extern FA, Function FB1, Function FB2 which calls FA },
+// execute FB1 then FA
+// FIXME: this test case is not supported by MCJIT
+TEST_F(MCJITMultipleModuleTest, cross_module_dependency_case_reverse_order) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B;
+  Function *FA, *FB1, *FB2;
+  createCrossModuleRecursiveCase(A, FA, B, FB1, FB2);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FB1->getName().str());
+  checkAccumulate(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FA->getName().str());
+  checkAccumulate(ptr);
+}
+
+// Module A { Extern FB1, Function FA which calls FB1 },
+// Module B { Extern FA, Function FB1, Function FB2 which calls FA },
+// execute FB1 then FB2
+// FIXME: this test case is not supported by MCJIT
+TEST_F(MCJITMultipleModuleTest, cross_module_dependency_case3) {
+  SKIP_UNSUPPORTED_PLATFORM;
+
+  OwningPtr<Module> A, B;
+  Function *FA, *FB1, *FB2;
+  createCrossModuleRecursiveCase(A, FA, B, FB1, FB2);
+
+  createJIT(A.take());
+  TheJIT->addModule(B.take());
+
+  uint64_t ptr = TheJIT->getFunctionAddress(FB1->getName().str());
+  checkAccumulate(ptr);
+
+  ptr = TheJIT->getFunctionAddress(FB2->getName().str());
+  checkAccumulate(ptr);
+}
+}
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
index 43a8298..fab8155 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
@@ -18,23 +18,27 @@
 
 using namespace llvm;
 
+namespace {
+
 class MCJITTest : public testing::Test, public MCJITTestBase {
 protected:
-
-  virtual void SetUp() {
-    M.reset(createEmptyModule("<main>"));
-  }
+  virtual void SetUp() { M.reset(createEmptyModule("<main>")); }
 };
 
-namespace {
+// FIXME: Ensure creating an execution engine does not crash when constructed
+//        with a null module.
+/*
+TEST_F(MCJITTest, null_module) {
+  createJIT(0);
+}
+*/
 
 // FIXME: In order to JIT an empty module, there needs to be
 // an interface to ExecutionEngine that forces compilation but
-// does require retrieval of a pointer to a function/global.
+// does not require retrieval of a pointer to a function/global.
 /*
 TEST_F(MCJITTest, empty_module) {
   createJIT(M.take());
-  TheJIT->finalizeObject();
   //EXPECT_NE(0, TheJIT->getObjectImage())
   //  << "Unable to generate executable loaded object image";
 }
@@ -47,7 +51,6 @@ TEST_F(MCJITTest, global_variable) {
   GlobalValue *Global = insertGlobalInt32(M.get(), "test_global", initialValue);
   createJIT(M.take());
   void *globalPtr =  TheJIT->getPointerToGlobal(Global);
-  TheJIT->finalizeObject();
   EXPECT_TRUE(0 != globalPtr)
     << "Unable to get pointer to global value from JIT";
 
@@ -60,15 +63,19 @@ TEST_F(MCJITTest, add_function) {
 
   Function *F = insertAddFunction(M.get());
   createJIT(M.take());
-  void *addPtr = TheJIT->getPointerToFunction(F);
-  TheJIT->finalizeObject();
+  uint64_t addPtr = TheJIT->getFunctionAddress(F->getName().str());
   EXPECT_TRUE(0 != addPtr)
     << "Unable to get pointer to function from JIT";
 
-  int (*AddPtrTy)(int, int) = (int(*)(int, int))(intptr_t)addPtr;
-  EXPECT_EQ(0, AddPtrTy(0, 0));
-  EXPECT_EQ(3, AddPtrTy(1, 2));
-  EXPECT_EQ(-5, AddPtrTy(-2, -3));
+  ASSERT_TRUE(addPtr != 0) << "Unable to get pointer to function .";
+  int (*AddPtr)(int, int) = (int(*)(int, int))addPtr ;
+  EXPECT_EQ(0,   AddPtr(0, 0));
+  EXPECT_EQ(1,   AddPtr(1, 0));
+  EXPECT_EQ(3,   AddPtr(1, 2));
+  EXPECT_EQ(-5,  AddPtr(-2, -3));
+  EXPECT_EQ(30,  AddPtr(10, 20));
+  EXPECT_EQ(-30, AddPtr(-10, -20));
+  EXPECT_EQ(-40, AddPtr(-10, -30));
 }
 
 TEST_F(MCJITTest, run_main) {
@@ -77,12 +84,11 @@ TEST_F(MCJITTest, run_main) {
   int rc = 6;
   Function *Main = insertMainFunction(M.get(), 6);
   createJIT(M.take());
-  void *vPtr = TheJIT->getPointerToFunction(Main);
-  TheJIT->finalizeObject();
-  EXPECT_TRUE(0 != vPtr)
+  uint64_t ptr = TheJIT->getFunctionAddress(Main->getName().str());
+  EXPECT_TRUE(0 != ptr)
     << "Unable to get pointer to main() from JIT";
 
-  int (*FuncPtr)(void) = (int(*)(void))(intptr_t)vPtr;
+  int (*FuncPtr)(void) = (int(*)(void))ptr;
   int returnCode = FuncPtr();
   EXPECT_EQ(returnCode, rc);
 }
@@ -99,11 +105,10 @@ TEST_F(MCJITTest, return_global) {
   endFunctionWithRet(ReturnGlobal, ReadGlobal);
 
   createJIT(M.take());
-  void *rgvPtr = TheJIT->getPointerToFunction(ReturnGlobal);
-  TheJIT->finalizeObject();
+  uint64_t rgvPtr = TheJIT->getFunctionAddress(ReturnGlobal->getName().str());
   EXPECT_TRUE(0 != rgvPtr);
 
-  int32_t(*FuncPtr)(void) = (int32_t(*)(void))(intptr_t)rgvPtr;
+  int32_t(*FuncPtr)(void) = (int32_t(*)(void))rgvPtr;
   EXPECT_EQ(initialNum, FuncPtr())
     << "Invalid value for global returned from JITted function";
 }
@@ -131,10 +136,9 @@ TEST_F(MCJITTest, increment_global) {
 
   createJIT(M.take());
   void *gvPtr = TheJIT->getPointerToGlobal(GV);
-  TheJIT->finalizeObject();
   EXPECT_EQ(initialNum, *(int32_t*)gvPtr);
 
-  void *vPtr = TheJIT->getPointerToFunction(IncrementGlobal);
+  void *vPtr = TheJIT->getFunctionAddress(IncrementGlobal->getName().str());
   EXPECT_TRUE(0 != vPtr)
     << "Unable to get pointer to main() from JIT";
 
@@ -172,67 +176,15 @@ TEST_F(MCJITTest, multiple_functions) {
   }
 
   createJIT(M.take());
-  void *vPtr = TheJIT->getPointerToFunction(Outer);
-  TheJIT->finalizeObject();
-  EXPECT_TRUE(0 != vPtr)
+  uint64_t ptr = TheJIT->getFunctionAddress(Outer->getName().str());
+  EXPECT_TRUE(0 != ptr)
     << "Unable to get pointer to outer function from JIT";
 
-  int32_t(*FuncPtr)(void) = (int32_t(*)(void))(intptr_t)vPtr;
+  int32_t(*FuncPtr)(void) = (int32_t(*)(void))ptr;
   EXPECT_EQ(innerRetVal, FuncPtr())
     << "Incorrect result returned from function";
 }
 
 #endif /*!defined(__arm__)*/
 
-// FIXME: ExecutionEngine has no support empty modules
-/*
-TEST_F(MCJITTest, multiple_empty_modules) {
-  SKIP_UNSUPPORTED_PLATFORM;
-
-  createJIT(M.take());
-  // JIT-compile
-  EXPECT_NE(0, TheJIT->getObjectImage())
-    << "Unable to generate executable loaded object image";
-
-  TheJIT->addModule(createEmptyModule("<other module>"));
-  TheJIT->addModule(createEmptyModule("<other other module>"));
-
-  // JIT again
-  EXPECT_NE(0, TheJIT->getObjectImage())
-    << "Unable to generate executable loaded object image";
-}
-*/
-
-// FIXME: MCJIT must support multiple modules
-/*
-TEST_F(MCJITTest, multiple_modules) {
-  SKIP_UNSUPPORTED_PLATFORM;
-
-  Function *Callee = insertAddFunction(M.get());
-  createJIT(M.take());
-
-  // caller function is defined in a different module
-  M.reset(createEmptyModule("<caller module>"));
-
-  Function *CalleeRef = insertExternalReferenceToFunction(M.get(), Callee);
-  Function *Caller = insertSimpleCallFunction(M.get(), CalleeRef);
-
-  TheJIT->addModule(M.take());
-
-  // get a function pointer in a module that was not used in EE construction
-  void *vPtr = TheJIT->getPointerToFunction(Caller);
-  TheJIT->finalizeObject();
-  EXPECT_NE(0, vPtr)
-    << "Unable to get pointer to caller function from JIT";
-
-  int(*FuncPtr)(int, int) = (int(*)(int, int))(intptr_t)vPtr;
-  EXPECT_EQ(0, FuncPtr(0, 0));
-  EXPECT_EQ(30, FuncPtr(10, 20));
-  EXPECT_EQ(-30, FuncPtr(-10, -20));
-
-  // ensure caller is destroyed before callee (free use before def)
-  M.reset();
-}
-*/
-
 }
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h b/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h
index 5debb8b..b42a9c0 100644
--- a/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h
@@ -119,12 +119,13 @@ protected:
   // Inserts an declaration to a function defined elsewhere
   Function *insertExternalReferenceToFunction(Module *M, Function *Func) {
     Function *Result = Function::Create(Func->getFunctionType(),
-                                        GlobalValue::AvailableExternallyLinkage,
+                                        GlobalValue::ExternalLinkage,
                                         Func->getName(), M);
     return Result;
   }
 
   // Inserts a global variable of type int32
+  // FIXME: make this a template function to support any type
   GlobalVariable *insertGlobalInt32(Module *M,
                                     StringRef name,
                                     int32_t InitialValue) {
@@ -138,11 +139,148 @@ protected:
                                                 name);
     return Global;
   }
+
+  // Inserts a function
+  //   int32_t recursive_add(int32_t num) {
+  //     if (num == 0) {
+  //       return num;
+  //     } else {
+  //       int32_t recursive_param = num - 1;
+  //       return num + Helper(recursive_param);
+  //     }
+  //   }
+  // NOTE: if Helper is left as the default parameter, Helper == recursive_add.
+  Function *insertAccumulateFunction(Module *M,
+                                              Function *Helper = 0,
+                                              StringRef Name = "accumulate") {
+    Function *Result = startFunction<int32_t(int32_t)>(M, Name);
+    if (Helper == 0)
+      Helper = Result;
+
+    BasicBlock *BaseCase = BasicBlock::Create(Context, "", Result);
+    BasicBlock *RecursiveCase = BasicBlock::Create(Context, "", Result);
+
+    // if (num == 0)
+    Value *Param = Result->arg_begin();
+    Value *Zero = ConstantInt::get(Context, APInt(32, 0));
+    Builder.CreateCondBr(Builder.CreateICmpEQ(Param, Zero),
+                         BaseCase, RecursiveCase);
+
+    //   return num;
+    Builder.SetInsertPoint(BaseCase);
+    Builder.CreateRet(Param);
+
+    //   int32_t recursive_param = num - 1;
+    //   return Helper(recursive_param);
+    Builder.SetInsertPoint(RecursiveCase);
+    Value *One = ConstantInt::get(Context, APInt(32, 1));
+    Value *RecursiveParam = Builder.CreateSub(Param, One);
+    Value *RecursiveReturn = Builder.CreateCall(Helper, RecursiveParam);
+    Value *Accumulator = Builder.CreateAdd(Param, RecursiveReturn);
+    Builder.CreateRet(Accumulator);
+
+    return Result;
+  }
+
+  // Populates Modules A and B:
+  // Module A { Extern FB1, Function FA which calls FB1 },
+  // Module B { Extern FA, Function FB1, Function FB2 which calls FA },
+  void createCrossModuleRecursiveCase(OwningPtr<Module> &A,
+                                      Function *&FA,
+                                      OwningPtr<Module> &B,
+                                      Function *&FB1,
+                                      Function *&FB2) {
+    // Define FB1 in B.
+    B.reset(createEmptyModule("B"));
+    FB1 = insertAccumulateFunction(B.get(), 0, "FB1");
+
+    // Declare FB1 in A (as an external).
+    A.reset(createEmptyModule("A"));
+    Function *FB1Extern = insertExternalReferenceToFunction(A.get(), FB1);
+
+    // Define FA in A (with a call to FB1).
+    FA = insertAccumulateFunction(A.get(), FB1Extern, "FA");
+
+    // Declare FA in B (as an external)
+    Function *FAExtern = insertExternalReferenceToFunction(B.get(), FA);
+
+    // Define FB2 in B (with a call to FA)
+    FB2 = insertAccumulateFunction(B.get(), FAExtern, "FB2");
+  }
+
+  // Module A { Function FA },
+  // Module B { Extern FA, Function FB which calls FA },
+  // Module C { Extern FB, Function FC which calls FB },
+  void createThreeModuleChainedCallsCase(OwningPtr<Module> &A,
+                             Function *&FA,
+                             OwningPtr<Module> &B,
+                             Function *&FB,
+                             OwningPtr<Module> &C,
+                             Function *&FC) {
+    A.reset(createEmptyModule("A"));
+    FA = insertAddFunction(A.get());
+
+    B.reset(createEmptyModule("B"));
+    Function *FAExtern_in_B = insertExternalReferenceToFunction(B.get(), FA);
+    FB = insertSimpleCallFunction<int32_t(int32_t, int32_t)>(B.get(), FAExtern_in_B);
+
+    C.reset(createEmptyModule("C"));
+    Function *FBExtern_in_C = insertExternalReferenceToFunction(C.get(), FB);
+    FC = insertSimpleCallFunction<int32_t(int32_t, int32_t)>(C.get(), FBExtern_in_C);
+  }
+
+
+  // Module A { Function FA },
+  // Populates Modules A and B:
+  // Module B { Function FB }
+  void createTwoModuleCase(OwningPtr<Module> &A, Function *&FA,
+                           OwningPtr<Module> &B, Function *&FB) {
+    A.reset(createEmptyModule("A"));
+    FA = insertAddFunction(A.get());
+
+    B.reset(createEmptyModule("B"));
+    FB = insertAddFunction(B.get());
+  }
+
+  // Module A { Function FA },
+  // Module B { Extern FA, Function FB which calls FA }
+  void createTwoModuleExternCase(OwningPtr<Module> &A, Function *&FA,
+                                 OwningPtr<Module> &B, Function *&FB) {
+    A.reset(createEmptyModule("A"));
+    FA = insertAddFunction(A.get());
+
+    B.reset(createEmptyModule("B"));
+    Function *FAExtern_in_B = insertExternalReferenceToFunction(B.get(), FA);
+    FB = insertSimpleCallFunction<int32_t(int32_t, int32_t)>(B.get(),
+                                                             FAExtern_in_B);
+  }
+
+  // Module A { Function FA },
+  // Module B { Extern FA, Function FB which calls FA },
+  // Module C { Extern FB, Function FC which calls FA },
+  void createThreeModuleCase(OwningPtr<Module> &A,
+                             Function *&FA,
+                             OwningPtr<Module> &B,
+                             Function *&FB,
+                             OwningPtr<Module> &C,
+                             Function *&FC) {
+    A.reset(createEmptyModule("A"));
+    FA = insertAddFunction(A.get());
+
+    B.reset(createEmptyModule("B"));
+    Function *FAExtern_in_B = insertExternalReferenceToFunction(B.get(), FA);
+    FB = insertSimpleCallFunction<int32_t(int32_t, int32_t)>(B.get(), FAExtern_in_B);
+
+    C.reset(createEmptyModule("C"));
+    Function *FAExtern_in_C = insertExternalReferenceToFunction(C.get(), FA);
+    FC = insertSimpleCallFunction<int32_t(int32_t, int32_t)>(C.get(), FAExtern_in_C);
+  }
 };
 
+
 class MCJITTestBase : public MCJITTestAPICommon, public TrivialModuleBuilder {
 protected:
-  
+
   MCJITTestBase()
     : TrivialModuleBuilder(HostTriple)
     , OptLevel(CodeGenOpt::None)
diff --git a/unittests/ExecutionEngine/MCJIT/Makefile b/unittests/ExecutionEngine/MCJIT/Makefile
index 454f830..33b043b 100644
--- a/unittests/ExecutionEngine/MCJIT/Makefile
+++ b/unittests/ExecutionEngine/MCJIT/Makefile
@@ -9,7 +9,7 @@
 
 LEVEL = ../../..
 TESTNAME = MCJIT
-LINK_COMPONENTS := core jit mcjit native support
+LINK_COMPONENTS := core mcjit native support
 
 include $(LEVEL)/Makefile.config
 include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt
index c53043e..fd0831f 100644
--- a/unittests/IR/CMakeLists.txt
+++ b/unittests/IR/CMakeLists.txt
@@ -10,6 +10,7 @@ set(IRSources
   DominatorTreeTest.cpp
   IRBuilderTest.cpp
   InstructionsTest.cpp
+  LegacyPassManagerTest.cpp
   MDBuilderTest.cpp
   MetadataTest.cpp
   PassManagerTest.cpp
diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index fecc4a4..2f390f7 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp
@@ -25,12 +25,12 @@ namespace {
 class IRBuilderTest : public testing::Test {
 protected:
   virtual void SetUp() {
-    M.reset(new Module("MyModule", getGlobalContext()));
-    FunctionType *FTy = FunctionType::get(Type::getVoidTy(getGlobalContext()),
+    M.reset(new Module("MyModule", Ctx));
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
                                           /*isVarArg=*/false);
     F = Function::Create(FTy, Function::ExternalLinkage, "", M.get());
-    BB = BasicBlock::Create(getGlobalContext(), "", F);
-    GV = new GlobalVariable(*M, Type::getFloatTy(getGlobalContext()), true,
+    BB = BasicBlock::Create(Ctx, "", F);
+    GV = new GlobalVariable(*M, Type::getFloatTy(Ctx), true,
                             GlobalValue::ExternalLinkage, 0);
   }
 
@@ -39,6 +39,7 @@ protected:
     M.reset();
   }
 
+  LLVMContext Ctx;
   OwningPtr<Module> M;
   Function *F;
   BasicBlock *BB;
@@ -78,8 +79,8 @@ TEST_F(IRBuilderTest, Lifetime) {
 
 TEST_F(IRBuilderTest, CreateCondBr) {
   IRBuilder<> Builder(BB);
-  BasicBlock *TBB = BasicBlock::Create(getGlobalContext(), "", F);
-  BasicBlock *FBB = BasicBlock::Create(getGlobalContext(), "", F);
+  BasicBlock *TBB = BasicBlock::Create(Ctx, "", F);
+  BasicBlock *FBB = BasicBlock::Create(Ctx, "", F);
 
   BranchInst *BI = Builder.CreateCondBr(Builder.getTrue(), TBB, FBB);
   TerminatorInst *TI = BB->getTerminator();
@@ -89,7 +90,7 @@ TEST_F(IRBuilderTest, CreateCondBr) {
   EXPECT_EQ(FBB, TI->getSuccessor(1));
 
   BI->eraseFromParent();
-  MDNode *Weights = MDBuilder(getGlobalContext()).createBranchWeights(42, 13);
+  MDNode *Weights = MDBuilder(Ctx).createBranchWeights(42, 13);
   BI = Builder.CreateCondBr(Builder.getTrue(), TBB, FBB, Weights);
   TI = BB->getTerminator();
   EXPECT_EQ(BI, TI);
@@ -109,12 +110,12 @@ TEST_F(IRBuilderTest, LandingPadName) {
 TEST_F(IRBuilderTest, GetIntTy) {
   IRBuilder<> Builder(BB);
   IntegerType *Ty1 = Builder.getInt1Ty();
-  EXPECT_EQ(Ty1, IntegerType::get(getGlobalContext(), 1));
+  EXPECT_EQ(Ty1, IntegerType::get(Ctx, 1));
 
   DataLayout* DL = new DataLayout(M.get());
   IntegerType *IntPtrTy = Builder.getIntPtrTy(DL);
   unsigned IntPtrBitSize =  DL->getPointerSizeInBits(0);
-  EXPECT_EQ(IntPtrTy, IntegerType::get(getGlobalContext(), IntPtrBitSize));
+  EXPECT_EQ(IntPtrTy, IntegerType::get(Ctx, IntPtrBitSize));
   delete DL;
 }
 
@@ -182,4 +183,40 @@ TEST_F(IRBuilderTest, FastMathFlags) {
 
 }
 
+TEST_F(IRBuilderTest, RAIIHelpersTest) {
+  IRBuilder<> Builder(BB);
+  EXPECT_FALSE(Builder.getFastMathFlags().allowReciprocal());
+  MDBuilder MDB(M->getContext());
+
+  MDNode *FPMathA = MDB.createFPMath(0.01f);
+  MDNode *FPMathB = MDB.createFPMath(0.1f);
+
+  Builder.SetDefaultFPMathTag(FPMathA);
+
+  {
+    IRBuilder<>::FastMathFlagGuard Guard(Builder);
+    FastMathFlags FMF;
+    FMF.setAllowReciprocal();
+    Builder.SetFastMathFlags(FMF);
+    Builder.SetDefaultFPMathTag(FPMathB);
+    EXPECT_TRUE(Builder.getFastMathFlags().allowReciprocal());
+    EXPECT_EQ(FPMathB, Builder.getDefaultFPMathTag());
+  }
+
+  EXPECT_FALSE(Builder.getFastMathFlags().allowReciprocal());
+  EXPECT_EQ(FPMathA, Builder.getDefaultFPMathTag());
+
+  Value *F = Builder.CreateLoad(GV);
+
+  {
+    IRBuilder<>::InsertPointGuard Guard(Builder);
+    Builder.SetInsertPoint(cast<Instruction>(F));
+    EXPECT_EQ(F, Builder.GetInsertPoint());
+  }
+
+  EXPECT_EQ(BB->end(), Builder.GetInsertPoint());
+  EXPECT_EQ(BB, Builder.GetInsertBlock());
+}
+
+
 }
diff --git a/unittests/IR/InstructionsTest.cpp b/unittests/IR/InstructionsTest.cpp
index ce6c465..65f85ba 100644
--- a/unittests/IR/InstructionsTest.cpp
+++ b/unittests/IR/InstructionsTest.cpp
@@ -149,6 +149,8 @@ TEST(InstructionsTest, CastInst) {
   const Constant* c8 = Constant::getNullValue(V8x8Ty);
   const Constant* c64 = Constant::getNullValue(V8x64Ty);
 
+  const Constant *v2ptr32 = Constant::getNullValue(V2Int32PtrTy);
+
   EXPECT_TRUE(CastInst::isCastable(V8x8Ty, X86MMXTy));
   EXPECT_TRUE(CastInst::isCastable(X86MMXTy, V8x8Ty));
   EXPECT_FALSE(CastInst::isCastable(Int64Ty, X86MMXTy));
@@ -169,6 +171,10 @@ TEST(InstructionsTest, CastInst) {
   EXPECT_FALSE(CastInst::isBitCastable(V2Int32PtrTy, V2Int32PtrAS1Ty));
   EXPECT_FALSE(CastInst::isBitCastable(V2Int32PtrAS1Ty, V2Int32PtrTy));
   EXPECT_TRUE(CastInst::isBitCastable(V2Int32PtrAS1Ty, V2Int64PtrAS1Ty));
+  EXPECT_TRUE(CastInst::isCastable(V2Int32PtrAS1Ty, V2Int32PtrTy));
+  EXPECT_EQ(CastInst::AddrSpaceCast, CastInst::getCastOpcode(v2ptr32, true,
+                                                             V2Int32PtrAS1Ty,
+                                                             true));
 
   // Test mismatched number of elements for pointers
   EXPECT_FALSE(CastInst::isBitCastable(V2Int32PtrAS1Ty, V4Int64PtrAS1Ty));
@@ -371,7 +377,6 @@ TEST(InstructionsTest, isEliminableCastPair) {
                                            0, Int32Ty, 0),
             0U);
 
-
   // Test that we don't eliminate bitcasts between different address spaces,
   // or if we don't have available pointer size information.
   DataLayout DL("e-p:32:32:32-p1:16:16:16-p2:64:64:64-i1:8:8-i8:8:8-i16:16:16"
@@ -384,26 +389,18 @@ TEST(InstructionsTest, isEliminableCastPair) {
   IntegerType *Int16SizePtr = DL.getIntPtrType(C, 1);
   IntegerType *Int64SizePtr = DL.getIntPtrType(C, 2);
 
-  // Fail since the ptr int types are not provided
+  // Cannot simplify inttoptr, addrspacecast
   EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
-                                           CastInst::BitCast,
+                                           CastInst::AddrSpaceCast,
                                            Int16Ty, Int64PtrTyAS1, Int64PtrTyAS2,
-                                           0, 0, 0),
-            0U);
-
-  // Fail since the the bitcast is between different sized address spaces
-  EXPECT_EQ(CastInst::isEliminableCastPair(
-              CastInst::IntToPtr,
-              CastInst::BitCast,
-              Int16Ty, Int64PtrTyAS1, Int64PtrTyAS2,
-              0, Int16SizePtr, Int64SizePtr),
+                                           0, Int16SizePtr, Int64SizePtr),
             0U);
 
-  // Fail since the the bitcast is between different sized address spaces
-  EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
-                                           CastInst::BitCast,
-                                           Int16Ty, Int64PtrTyAS1, Int64PtrTyAS2,
-                                           0, Int16SizePtr, Int64SizePtr),
+  // Cannot simplify addrspacecast, ptrtoint
+  EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::AddrSpaceCast,
+                                           CastInst::PtrToInt,
+                                           Int64PtrTyAS1, Int64PtrTyAS2, Int16Ty,
+                                           Int64SizePtr, Int16SizePtr, 0),
             0U);
 
   // Pass since the bitcast address spaces are the same
@@ -413,28 +410,6 @@ TEST(InstructionsTest, isEliminableCastPair) {
                                            0, 0, 0),
             CastInst::IntToPtr);
 
-
-  // Fail without known pointer sizes and different address spaces
-  EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::BitCast,
-                                           CastInst::PtrToInt,
-                                           Int64PtrTyAS1, Int64PtrTyAS2, Int16Ty,
-                                           0, 0, 0),
-            0U);
-
-  // Pass since the address spaces are the same, even though the pointer sizes
-  // are unknown
-  EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::BitCast,
-                                           CastInst::PtrToInt,
-                                           Int64PtrTyAS1, Int64PtrTyAS1, Int32Ty,
-                                           0, 0, 0),
-            Instruction::PtrToInt);
-
-  // Fail since the bitcast is the wrong size
-  EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::BitCast,
-                                           CastInst::PtrToInt,
-                                           Int64PtrTyAS1, Int64PtrTyAS2, Int64Ty,
-                                           Int16SizePtr, Int64SizePtr, 0),
-            0U);
 }
 
 }  // end anonymous namespace
diff --git a/unittests/IR/LegacyPassManagerTest.cpp b/unittests/IR/LegacyPassManagerTest.cpp
new file mode 100644
index 0000000..11841bd
--- /dev/null
+++ b/unittests/IR/LegacyPassManagerTest.cpp
@@ -0,0 +1,559 @@
+//===- llvm/unittest/IR/LegacyPassManager.cpp - Legacy PassManager tests --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This unit test exercises the legacy pass manager infrastructure. We use the
+// old names as well to ensure that the source-level compatibility wrapper
+// works for out-of-tree code that expects to include llvm/PassManager.h and
+// subclass the core pass classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/PassManager.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace llvm {
+  void initializeModuleNDMPass(PassRegistry&);
+  void initializeFPassPass(PassRegistry&);
+  void initializeCGPassPass(PassRegistry&);
+  void initializeLPassPass(PassRegistry&);
+  void initializeBPassPass(PassRegistry&);
+
+  namespace {
+    // ND = no deps
+    // NM = no modifications
+    struct ModuleNDNM: public ModulePass {
+    public:
+      static char run;
+      static char ID;
+      ModuleNDNM() : ModulePass(ID) { }
+      virtual bool runOnModule(Module &M) {
+        run++;
+        return false;
+      }
+      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+        AU.setPreservesAll();
+      }
+    };
+    char ModuleNDNM::ID=0;
+    char ModuleNDNM::run=0;
+
+    struct ModuleNDM : public ModulePass {
+    public:
+      static char run;
+      static char ID;
+      ModuleNDM() : ModulePass(ID) {}
+      virtual bool runOnModule(Module &M) {
+        run++;
+        return true;
+      }
+    };
+    char ModuleNDM::ID=0;
+    char ModuleNDM::run=0;
+
+    struct ModuleNDM2 : public ModulePass {
+    public:
+      static char run;
+      static char ID;
+      ModuleNDM2() : ModulePass(ID) {}
+      virtual bool runOnModule(Module &M) {
+        run++;
+        return true;
+      }
+    };
+    char ModuleNDM2::ID=0;
+    char ModuleNDM2::run=0;
+
+    struct ModuleDNM : public ModulePass {
+    public:
+      static char run;
+      static char ID;
+      ModuleDNM() : ModulePass(ID) {
+        initializeModuleNDMPass(*PassRegistry::getPassRegistry());
+      }
+      virtual bool runOnModule(Module &M) {
+        EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
+        run++;
+        return false;
+      }
+      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+        AU.addRequired<ModuleNDM>();
+        AU.setPreservesAll();
+      }
+    };
+    char ModuleDNM::ID=0;
+    char ModuleDNM::run=0;
+
+    template<typename P>
+    struct PassTestBase : public P {
+    protected:
+      static int runc;
+      static bool initialized;
+      static bool finalized;
+      int allocated;
+      void run() {
+        EXPECT_TRUE(initialized);
+        EXPECT_FALSE(finalized);
+        EXPECT_EQ(0, allocated);
+        allocated++;
+        runc++;
+      }
+    public:
+      static char ID;
+      static void finishedOK(int run) {
+        EXPECT_GT(runc, 0);
+        EXPECT_TRUE(initialized);
+        EXPECT_TRUE(finalized);
+        EXPECT_EQ(run, runc);
+      }
+      PassTestBase() : P(ID), allocated(0) {
+        initialized = false;
+        finalized = false;
+        runc = 0;
+      }
+
+      virtual void releaseMemory() {
+        EXPECT_GT(runc, 0);
+        EXPECT_GT(allocated, 0);
+        allocated--;
+      }
+    };
+    template<typename P> char PassTestBase<P>::ID;
+    template<typename P> int PassTestBase<P>::runc;
+    template<typename P> bool PassTestBase<P>::initialized;
+    template<typename P> bool PassTestBase<P>::finalized;
+
+    template<typename T, typename P>
+    struct PassTest : public PassTestBase<P> {
+    public:
+#ifndef _MSC_VER // MSVC complains that Pass is not base class.
+      using llvm::Pass::doInitialization;
+      using llvm::Pass::doFinalization;
+#endif
+      virtual bool doInitialization(T &t) {
+        EXPECT_FALSE(PassTestBase<P>::initialized);
+        PassTestBase<P>::initialized = true;
+        return false;
+      }
+      virtual bool doFinalization(T &t) {
+        EXPECT_FALSE(PassTestBase<P>::finalized);
+        PassTestBase<P>::finalized = true;
+        EXPECT_EQ(0, PassTestBase<P>::allocated);
+        return false;
+      }
+    };
+
+    struct CGPass : public PassTest<CallGraph, CallGraphSCCPass> {
+    public:
+      CGPass() {
+        initializeCGPassPass(*PassRegistry::getPassRegistry());
+      }
+      virtual bool runOnSCC(CallGraphSCC &SCMM) {
+        EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
+        run();
+        return false;
+      }
+    };
+
+    struct FPass : public PassTest<Module, FunctionPass> {
+    public:
+      virtual bool runOnFunction(Function &F) {
+        // FIXME: PR4112
+        // EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
+        run();
+        return false;
+      }
+    };
+
+    struct LPass : public PassTestBase<LoopPass> {
+    private:
+      static int initcount;
+      static int fincount;
+    public:
+      LPass() {
+        initializeLPassPass(*PassRegistry::getPassRegistry());
+        initcount = 0; fincount=0;
+        EXPECT_FALSE(initialized);
+      }
+      static void finishedOK(int run, int finalized) {
+        PassTestBase<LoopPass>::finishedOK(run);
+        EXPECT_EQ(run, initcount);
+        EXPECT_EQ(finalized, fincount);
+      }
+      using llvm::Pass::doInitialization;
+      using llvm::Pass::doFinalization;
+      virtual bool doInitialization(Loop* L, LPPassManager &LPM) {
+        initialized = true;
+        initcount++;
+        return false;
+      }
+      virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+        EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
+        run();
+        return false;
+      }
+      virtual bool doFinalization() {
+        fincount++;
+        finalized = true;
+        return false;
+      }
+    };
+    int LPass::initcount=0;
+    int LPass::fincount=0;
+
+    struct BPass : public PassTestBase<BasicBlockPass> {
+    private:
+      static int inited;
+      static int fin;
+    public:
+      static void finishedOK(int run, int N) {
+        PassTestBase<BasicBlockPass>::finishedOK(run);
+        EXPECT_EQ(inited, N);
+        EXPECT_EQ(fin, N);
+      }
+      BPass() {
+        inited = 0;
+        fin = 0;
+      }
+      virtual bool doInitialization(Module &M) {
+        EXPECT_FALSE(initialized);
+        initialized = true;
+        return false;
+      }
+      virtual bool doInitialization(Function &F) {
+        inited++;
+        return false;
+      }
+      virtual bool runOnBasicBlock(BasicBlock &BB) {
+        EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
+        run();
+        return false;
+      }
+      virtual bool doFinalization(Function &F) {
+        fin++;
+        return false;
+      }
+      virtual bool doFinalization(Module &M) {
+        EXPECT_FALSE(finalized);
+        finalized = true;
+        EXPECT_EQ(0, allocated);
+        return false;
+      }
+    };
+    int BPass::inited=0;
+    int BPass::fin=0;
+
+    struct OnTheFlyTest: public ModulePass {
+    public:
+      static char ID;
+      OnTheFlyTest() : ModulePass(ID) {
+        initializeFPassPass(*PassRegistry::getPassRegistry());
+      }
+      virtual bool runOnModule(Module &M) {
+        EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
+        for (Module::iterator I=M.begin(),E=M.end(); I != E; ++I) {
+          Function &F = *I;
+          {
+            SCOPED_TRACE("Running on the fly function pass");
+            getAnalysis<FPass>(F);
+          }
+        }
+        return false;
+      }
+      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+        AU.addRequired<FPass>();
+      }
+    };
+    char OnTheFlyTest::ID=0;
+
+    TEST(PassManager, RunOnce) {
+      Module M("test-once", getGlobalContext());
+      struct ModuleNDNM *mNDNM = new ModuleNDNM();
+      struct ModuleDNM *mDNM = new ModuleDNM();
+      struct ModuleNDM *mNDM = new ModuleNDM();
+      struct ModuleNDM2 *mNDM2 = new ModuleNDM2();
+
+      mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0;
+
+      PassManager Passes;
+      Passes.add(new DataLayout(&M));
+      Passes.add(mNDM2);
+      Passes.add(mNDM);
+      Passes.add(mNDNM);
+      Passes.add(mDNM);
+
+      Passes.run(M);
+      // each pass must be run exactly once, since nothing invalidates them
+      EXPECT_EQ(1, mNDM->run);
+      EXPECT_EQ(1, mNDNM->run);
+      EXPECT_EQ(1, mDNM->run);
+      EXPECT_EQ(1, mNDM2->run);
+    }
+
+    TEST(PassManager, ReRun) {
+      Module M("test-rerun", getGlobalContext());
+      struct ModuleNDNM *mNDNM = new ModuleNDNM();
+      struct ModuleDNM *mDNM = new ModuleDNM();
+      struct ModuleNDM *mNDM = new ModuleNDM();
+      struct ModuleNDM2 *mNDM2 = new ModuleNDM2();
+
+      mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0;
+
+      PassManager Passes;
+      Passes.add(new DataLayout(&M));
+      Passes.add(mNDM);
+      Passes.add(mNDNM);
+      Passes.add(mNDM2);// invalidates mNDM needed by mDNM
+      Passes.add(mDNM);
+
+      Passes.run(M);
+      // Some passes must be rerun because a pass that modified the
+      // module/function was run in between
+      EXPECT_EQ(2, mNDM->run);
+      EXPECT_EQ(1, mNDNM->run);
+      EXPECT_EQ(1, mNDM2->run);
+      EXPECT_EQ(1, mDNM->run);
+    }
+
+    Module* makeLLVMModule();
+
+    template<typename T>
+    void MemoryTestHelper(int run) {
+      OwningPtr<Module> M(makeLLVMModule());
+      T *P = new T();
+      PassManager Passes;
+      Passes.add(new DataLayout(M.get()));
+      Passes.add(P);
+      Passes.run(*M);
+      T::finishedOK(run);
+    }
+
+    template<typename T>
+    void MemoryTestHelper(int run, int N) {
+      Module *M = makeLLVMModule();
+      T *P = new T();
+      PassManager Passes;
+      Passes.add(new DataLayout(M));
+      Passes.add(P);
+      Passes.run(*M);
+      T::finishedOK(run, N);
+      delete M;
+    }
+
+    TEST(PassManager, Memory) {
+      // SCC#1: test1->test2->test3->test1
+      // SCC#2: test4
+      // SCC#3: indirect call node
+      {
+        SCOPED_TRACE("Callgraph pass");
+        MemoryTestHelper<CGPass>(3);
+      }
+
+      {
+        SCOPED_TRACE("Function pass");
+        MemoryTestHelper<FPass>(4);// 4 functions
+      }
+
+      {
+        SCOPED_TRACE("Loop pass");
+        MemoryTestHelper<LPass>(2, 1); //2 loops, 1 function
+      }
+      {
+        SCOPED_TRACE("Basic block pass");
+        MemoryTestHelper<BPass>(7, 4); //9 basic blocks
+      }
+
+    }
+
+    TEST(PassManager, MemoryOnTheFly) {
+      Module *M = makeLLVMModule();
+      {
+        SCOPED_TRACE("Running OnTheFlyTest");
+        struct OnTheFlyTest *O = new OnTheFlyTest();
+        PassManager Passes;
+        Passes.add(new DataLayout(M));
+        Passes.add(O);
+        Passes.run(*M);
+
+        FPass::finishedOK(4);
+      }
+      delete M;
+    }
+
+    Module* makeLLVMModule() {
+      // Module Construction
+      Module* mod = new Module("test-mem", getGlobalContext());
+      mod->setDataLayout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-"
+                         "i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-"
+                         "a0:0:64-s0:64:64-f80:128:128");
+      mod->setTargetTriple("x86_64-unknown-linux-gnu");
+
+      // Type Definitions
+      std::vector<Type*>FuncTy_0_args;
+      FunctionType* FuncTy_0 = FunctionType::get(
+        /*Result=*/IntegerType::get(getGlobalContext(), 32),
+        /*Params=*/FuncTy_0_args,
+        /*isVarArg=*/false);
+
+      std::vector<Type*>FuncTy_2_args;
+      FuncTy_2_args.push_back(IntegerType::get(getGlobalContext(), 1));
+      FunctionType* FuncTy_2 = FunctionType::get(
+        /*Result=*/Type::getVoidTy(getGlobalContext()),
+        /*Params=*/FuncTy_2_args,
+        /*isVarArg=*/false);
+
+
+      // Function Declarations
+
+      Function* func_test1 = Function::Create(
+        /*Type=*/FuncTy_0,
+        /*Linkage=*/GlobalValue::ExternalLinkage,
+        /*Name=*/"test1", mod);
+      func_test1->setCallingConv(CallingConv::C);
+      AttributeSet func_test1_PAL;
+      func_test1->setAttributes(func_test1_PAL);
+
+      Function* func_test2 = Function::Create(
+        /*Type=*/FuncTy_0,
+        /*Linkage=*/GlobalValue::ExternalLinkage,
+        /*Name=*/"test2", mod);
+      func_test2->setCallingConv(CallingConv::C);
+      AttributeSet func_test2_PAL;
+      func_test2->setAttributes(func_test2_PAL);
+
+      Function* func_test3 = Function::Create(
+        /*Type=*/FuncTy_0,
+        /*Linkage=*/GlobalValue::ExternalLinkage,
+        /*Name=*/"test3", mod);
+      func_test3->setCallingConv(CallingConv::C);
+      AttributeSet func_test3_PAL;
+      func_test3->setAttributes(func_test3_PAL);
+
+      Function* func_test4 = Function::Create(
+        /*Type=*/FuncTy_2,
+        /*Linkage=*/GlobalValue::ExternalLinkage,
+        /*Name=*/"test4", mod);
+      func_test4->setCallingConv(CallingConv::C);
+      AttributeSet func_test4_PAL;
+      func_test4->setAttributes(func_test4_PAL);
+
+      // Global Variable Declarations
+
+
+      // Constant Definitions
+
+      // Global Variable Definitions
+
+      // Function Definitions
+
+      // Function: test1 (func_test1)
+      {
+
+        BasicBlock* label_entry = BasicBlock::Create(getGlobalContext(), "entry",func_test1,0);
+
+        // Block entry (label_entry)
+        CallInst* int32_3 = CallInst::Create(func_test2, "", label_entry);
+        int32_3->setCallingConv(CallingConv::C);
+        int32_3->setTailCall(false);AttributeSet int32_3_PAL;
+        int32_3->setAttributes(int32_3_PAL);
+
+        ReturnInst::Create(getGlobalContext(), int32_3, label_entry);
+
+      }
+
+      // Function: test2 (func_test2)
+      {
+
+        BasicBlock* label_entry_5 = BasicBlock::Create(getGlobalContext(), "entry",func_test2,0);
+
+        // Block entry (label_entry_5)
+        CallInst* int32_6 = CallInst::Create(func_test3, "", label_entry_5);
+        int32_6->setCallingConv(CallingConv::C);
+        int32_6->setTailCall(false);AttributeSet int32_6_PAL;
+        int32_6->setAttributes(int32_6_PAL);
+
+        ReturnInst::Create(getGlobalContext(), int32_6, label_entry_5);
+
+      }
+
+      // Function: test3 (func_test3)
+      {
+
+        BasicBlock* label_entry_8 = BasicBlock::Create(getGlobalContext(), "entry",func_test3,0);
+
+        // Block entry (label_entry_8)
+        CallInst* int32_9 = CallInst::Create(func_test1, "", label_entry_8);
+        int32_9->setCallingConv(CallingConv::C);
+        int32_9->setTailCall(false);AttributeSet int32_9_PAL;
+        int32_9->setAttributes(int32_9_PAL);
+
+        ReturnInst::Create(getGlobalContext(), int32_9, label_entry_8);
+
+      }
+
+      // Function: test4 (func_test4)
+      {
+        Function::arg_iterator args = func_test4->arg_begin();
+        Value* int1_f = args++;
+        int1_f->setName("f");
+
+        BasicBlock* label_entry_11 = BasicBlock::Create(getGlobalContext(), "entry",func_test4,0);
+        BasicBlock* label_bb = BasicBlock::Create(getGlobalContext(), "bb",func_test4,0);
+        BasicBlock* label_bb1 = BasicBlock::Create(getGlobalContext(), "bb1",func_test4,0);
+        BasicBlock* label_return = BasicBlock::Create(getGlobalContext(), "return",func_test4,0);
+
+        // Block entry (label_entry_11)
+        BranchInst::Create(label_bb, label_entry_11);
+
+        // Block bb (label_bb)
+        BranchInst::Create(label_bb, label_bb1, int1_f, label_bb);
+
+        // Block bb1 (label_bb1)
+        BranchInst::Create(label_bb1, label_return, int1_f, label_bb1);
+
+        // Block return (label_return)
+        ReturnInst::Create(getGlobalContext(), label_return);
+
+      }
+      return mod;
+    }
+
+  }
+}
+
+INITIALIZE_PASS(ModuleNDM, "mndm", "mndm", false, false)
+INITIALIZE_PASS_BEGIN(CGPass, "cgp","cgp", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_END(CGPass, "cgp","cgp", false, false)
+INITIALIZE_PASS(FPass, "fp","fp", false, false)
+INITIALIZE_PASS_BEGIN(LPass, "lp","lp", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(LPass, "lp","lp", false, false)
+INITIALIZE_PASS(BPass, "bp","bp", false, false)
diff --git a/unittests/IR/PassManagerTest.cpp b/unittests/IR/PassManagerTest.cpp
index 1097da6..7b60e38 100644
--- a/unittests/IR/PassManagerTest.cpp
+++ b/unittests/IR/PassManagerTest.cpp
@@ -1,4 +1,4 @@
-//===- llvm/unittest/IR/PassManager.cpp - PassManager unit tests ----------===//
+//===- llvm/unittest/IR/PassManager.cpp - PassManager tests ---------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,546 +7,125 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/PassManager.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Assembly/PrintModulePass.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Assembly/Parser.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
 
-namespace llvm {
-  void initializeModuleNDMPass(PassRegistry&);
-  void initializeFPassPass(PassRegistry&);
-  void initializeCGPassPass(PassRegistry&);
-  void initializeLPassPass(PassRegistry&);
-  void initializeBPassPass(PassRegistry&);
-
-  namespace {
-    // ND = no deps
-    // NM = no modifications
-    struct ModuleNDNM: public ModulePass {
-    public:
-      static char run;
-      static char ID;
-      ModuleNDNM() : ModulePass(ID) { }
-      virtual bool runOnModule(Module &M) {
-        run++;
-        return false;
-      }
-      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-        AU.setPreservesAll();
-      }
-    };
-    char ModuleNDNM::ID=0;
-    char ModuleNDNM::run=0;
-
-    struct ModuleNDM : public ModulePass {
-    public:
-      static char run;
-      static char ID;
-      ModuleNDM() : ModulePass(ID) {}
-      virtual bool runOnModule(Module &M) {
-        run++;
-        return true;
-      }
-    };
-    char ModuleNDM::ID=0;
-    char ModuleNDM::run=0;
-
-    struct ModuleNDM2 : public ModulePass {
-    public:
-      static char run;
-      static char ID;
-      ModuleNDM2() : ModulePass(ID) {}
-      virtual bool runOnModule(Module &M) {
-        run++;
-        return true;
-      }
-    };
-    char ModuleNDM2::ID=0;
-    char ModuleNDM2::run=0;
-
-    struct ModuleDNM : public ModulePass {
-    public:
-      static char run;
-      static char ID;
-      ModuleDNM() : ModulePass(ID) {
-        initializeModuleNDMPass(*PassRegistry::getPassRegistry());
-      }
-      virtual bool runOnModule(Module &M) {
-        EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
-        run++;
-        return false;
-      }
-      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-        AU.addRequired<ModuleNDM>();
-        AU.setPreservesAll();
-      }
-    };
-    char ModuleDNM::ID=0;
-    char ModuleDNM::run=0;
-
-    template<typename P>
-    struct PassTestBase : public P {
-    protected:
-      static int runc;
-      static bool initialized;
-      static bool finalized;
-      int allocated;
-      void run() {
-        EXPECT_TRUE(initialized);
-        EXPECT_FALSE(finalized);
-        EXPECT_EQ(0, allocated);
-        allocated++;
-        runc++;
-      }
-    public:
-      static char ID;
-      static void finishedOK(int run) {
-        EXPECT_GT(runc, 0);
-        EXPECT_TRUE(initialized);
-        EXPECT_TRUE(finalized);
-        EXPECT_EQ(run, runc);
-      }
-      PassTestBase() : P(ID), allocated(0) {
-        initialized = false;
-        finalized = false;
-        runc = 0;
-      }
-
-      virtual void releaseMemory() {
-        EXPECT_GT(runc, 0);
-        EXPECT_GT(allocated, 0);
-        allocated--;
-      }
-    };
-    template<typename P> char PassTestBase<P>::ID;
-    template<typename P> int PassTestBase<P>::runc;
-    template<typename P> bool PassTestBase<P>::initialized;
-    template<typename P> bool PassTestBase<P>::finalized;
-
-    template<typename T, typename P>
-    struct PassTest : public PassTestBase<P> {
-    public:
-#ifndef _MSC_VER // MSVC complains that Pass is not base class.
-      using llvm::Pass::doInitialization;
-      using llvm::Pass::doFinalization;
-#endif
-      virtual bool doInitialization(T &t) {
-        EXPECT_FALSE(PassTestBase<P>::initialized);
-        PassTestBase<P>::initialized = true;
-        return false;
-      }
-      virtual bool doFinalization(T &t) {
-        EXPECT_FALSE(PassTestBase<P>::finalized);
-        PassTestBase<P>::finalized = true;
-        EXPECT_EQ(0, PassTestBase<P>::allocated);
-        return false;
-      }
-    };
-
-    struct CGPass : public PassTest<CallGraph, CallGraphSCCPass> {
-    public:
-      CGPass() {
-        initializeCGPassPass(*PassRegistry::getPassRegistry());
-      }
-      virtual bool runOnSCC(CallGraphSCC &SCMM) {
-        EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
-        run();
-        return false;
-      }
-    };
-
-    struct FPass : public PassTest<Module, FunctionPass> {
-    public:
-      virtual bool runOnFunction(Function &F) {
-        // FIXME: PR4112
-        // EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
-        run();
-        return false;
-      }
-    };
-
-    struct LPass : public PassTestBase<LoopPass> {
-    private:
-      static int initcount;
-      static int fincount;
-    public:
-      LPass() {
-        initializeLPassPass(*PassRegistry::getPassRegistry());
-        initcount = 0; fincount=0;
-        EXPECT_FALSE(initialized);
-      }
-      static void finishedOK(int run, int finalized) {
-        PassTestBase<LoopPass>::finishedOK(run);
-        EXPECT_EQ(run, initcount);
-        EXPECT_EQ(finalized, fincount);
-      }
-      using llvm::Pass::doInitialization;
-      using llvm::Pass::doFinalization;
-      virtual bool doInitialization(Loop* L, LPPassManager &LPM) {
-        initialized = true;
-        initcount++;
-        return false;
-      }
-      virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
-        EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
-        run();
-        return false;
-      }
-      virtual bool doFinalization() {
-        fincount++;
-        finalized = true;
-        return false;
-      }
-    };
-    int LPass::initcount=0;
-    int LPass::fincount=0;
-
-    struct BPass : public PassTestBase<BasicBlockPass> {
-    private:
-      static int inited;
-      static int fin;
-    public:
-      static void finishedOK(int run, int N) {
-        PassTestBase<BasicBlockPass>::finishedOK(run);
-        EXPECT_EQ(inited, N);
-        EXPECT_EQ(fin, N);
-      }
-      BPass() {
-        inited = 0;
-        fin = 0;
-      }
-      virtual bool doInitialization(Module &M) {
-        EXPECT_FALSE(initialized);
-        initialized = true;
-        return false;
-      }
-      virtual bool doInitialization(Function &F) {
-        inited++;
-        return false;
-      }
-      virtual bool runOnBasicBlock(BasicBlock &BB) {
-        EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
-        run();
-        return false;
-      }
-      virtual bool doFinalization(Function &F) {
-        fin++;
-        return false;
-      }
-      virtual bool doFinalization(Module &M) {
-        EXPECT_FALSE(finalized);
-        finalized = true;
-        EXPECT_EQ(0, allocated);
-        return false;
-      }
-    };
-    int BPass::inited=0;
-    int BPass::fin=0;
-
-    struct OnTheFlyTest: public ModulePass {
-    public:
-      static char ID;
-      OnTheFlyTest() : ModulePass(ID) {
-        initializeFPassPass(*PassRegistry::getPassRegistry());
-      }
-      virtual bool runOnModule(Module &M) {
-        EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
-        for (Module::iterator I=M.begin(),E=M.end(); I != E; ++I) {
-          Function &F = *I;
-          {
-            SCOPED_TRACE("Running on the fly function pass");
-            getAnalysis<FPass>(F);
-          }
-        }
-        return false;
-      }
-      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-        AU.addRequired<FPass>();
-      }
-    };
-    char OnTheFlyTest::ID=0;
-
-    TEST(PassManager, RunOnce) {
-      Module M("test-once", getGlobalContext());
-      struct ModuleNDNM *mNDNM = new ModuleNDNM();
-      struct ModuleDNM *mDNM = new ModuleDNM();
-      struct ModuleNDM *mNDM = new ModuleNDM();
-      struct ModuleNDM2 *mNDM2 = new ModuleNDM2();
-
-      mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0;
-
-      PassManager Passes;
-      Passes.add(new DataLayout(&M));
-      Passes.add(mNDM2);
-      Passes.add(mNDM);
-      Passes.add(mNDNM);
-      Passes.add(mDNM);
-
-      Passes.run(M);
-      // each pass must be run exactly once, since nothing invalidates them
-      EXPECT_EQ(1, mNDM->run);
-      EXPECT_EQ(1, mNDNM->run);
-      EXPECT_EQ(1, mDNM->run);
-      EXPECT_EQ(1, mNDM2->run);
-    }
-
-    TEST(PassManager, ReRun) {
-      Module M("test-rerun", getGlobalContext());
-      struct ModuleNDNM *mNDNM = new ModuleNDNM();
-      struct ModuleDNM *mDNM = new ModuleDNM();
-      struct ModuleNDM *mNDM = new ModuleNDM();
-      struct ModuleNDM2 *mNDM2 = new ModuleNDM2();
-
-      mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0;
-
-      PassManager Passes;
-      Passes.add(new DataLayout(&M));
-      Passes.add(mNDM);
-      Passes.add(mNDNM);
-      Passes.add(mNDM2);// invalidates mNDM needed by mDNM
-      Passes.add(mDNM);
-
-      Passes.run(M);
-      // Some passes must be rerun because a pass that modified the
-      // module/function was run in between
-      EXPECT_EQ(2, mNDM->run);
-      EXPECT_EQ(1, mNDNM->run);
-      EXPECT_EQ(1, mNDM2->run);
-      EXPECT_EQ(1, mDNM->run);
-    }
-
-    Module* makeLLVMModule();
-
-    template<typename T>
-    void MemoryTestHelper(int run) {
-      OwningPtr<Module> M(makeLLVMModule());
-      T *P = new T();
-      PassManager Passes;
-      Passes.add(new DataLayout(M.get()));
-      Passes.add(P);
-      Passes.run(*M);
-      T::finishedOK(run);
-    }
-
-    template<typename T>
-    void MemoryTestHelper(int run, int N) {
-      Module *M = makeLLVMModule();
-      T *P = new T();
-      PassManager Passes;
-      Passes.add(new DataLayout(M));
-      Passes.add(P);
-      Passes.run(*M);
-      T::finishedOK(run, N);
-      delete M;
-    }
-
-    TEST(PassManager, Memory) {
-      // SCC#1: test1->test2->test3->test1
-      // SCC#2: test4
-      // SCC#3: indirect call node
-      {
-        SCOPED_TRACE("Callgraph pass");
-        MemoryTestHelper<CGPass>(3);
-      }
-
-      {
-        SCOPED_TRACE("Function pass");
-        MemoryTestHelper<FPass>(4);// 4 functions
-      }
-
-      {
-        SCOPED_TRACE("Loop pass");
-        MemoryTestHelper<LPass>(2, 1); //2 loops, 1 function
-      }
-      {
-        SCOPED_TRACE("Basic block pass");
-        MemoryTestHelper<BPass>(7, 4); //9 basic blocks
-      }
-
-    }
-
-    TEST(PassManager, MemoryOnTheFly) {
-      Module *M = makeLLVMModule();
-      {
-        SCOPED_TRACE("Running OnTheFlyTest");
-        struct OnTheFlyTest *O = new OnTheFlyTest();
-        PassManager Passes;
-        Passes.add(new DataLayout(M));
-        Passes.add(O);
-        Passes.run(*M);
-
-        FPass::finishedOK(4);
-      }
-      delete M;
-    }
-
-    Module* makeLLVMModule() {
-      // Module Construction
-      Module* mod = new Module("test-mem", getGlobalContext());
-      mod->setDataLayout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-"
-                         "i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-"
-                         "a0:0:64-s0:64:64-f80:128:128");
-      mod->setTargetTriple("x86_64-unknown-linux-gnu");
-
-      // Type Definitions
-      std::vector<Type*>FuncTy_0_args;
-      FunctionType* FuncTy_0 = FunctionType::get(
-        /*Result=*/IntegerType::get(getGlobalContext(), 32),
-        /*Params=*/FuncTy_0_args,
-        /*isVarArg=*/false);
-
-      std::vector<Type*>FuncTy_2_args;
-      FuncTy_2_args.push_back(IntegerType::get(getGlobalContext(), 1));
-      FunctionType* FuncTy_2 = FunctionType::get(
-        /*Result=*/Type::getVoidTy(getGlobalContext()),
-        /*Params=*/FuncTy_2_args,
-        /*isVarArg=*/false);
-
-
-      // Function Declarations
-
-      Function* func_test1 = Function::Create(
-        /*Type=*/FuncTy_0,
-        /*Linkage=*/GlobalValue::ExternalLinkage,
-        /*Name=*/"test1", mod);
-      func_test1->setCallingConv(CallingConv::C);
-      AttributeSet func_test1_PAL;
-      func_test1->setAttributes(func_test1_PAL);
-
-      Function* func_test2 = Function::Create(
-        /*Type=*/FuncTy_0,
-        /*Linkage=*/GlobalValue::ExternalLinkage,
-        /*Name=*/"test2", mod);
-      func_test2->setCallingConv(CallingConv::C);
-      AttributeSet func_test2_PAL;
-      func_test2->setAttributes(func_test2_PAL);
-
-      Function* func_test3 = Function::Create(
-        /*Type=*/FuncTy_0,
-        /*Linkage=*/GlobalValue::ExternalLinkage,
-        /*Name=*/"test3", mod);
-      func_test3->setCallingConv(CallingConv::C);
-      AttributeSet func_test3_PAL;
-      func_test3->setAttributes(func_test3_PAL);
-
-      Function* func_test4 = Function::Create(
-        /*Type=*/FuncTy_2,
-        /*Linkage=*/GlobalValue::ExternalLinkage,
-        /*Name=*/"test4", mod);
-      func_test4->setCallingConv(CallingConv::C);
-      AttributeSet func_test4_PAL;
-      func_test4->setAttributes(func_test4_PAL);
-
-      // Global Variable Declarations
-
-
-      // Constant Definitions
-
-      // Global Variable Definitions
-
-      // Function Definitions
-
-      // Function: test1 (func_test1)
-      {
-
-        BasicBlock* label_entry = BasicBlock::Create(getGlobalContext(), "entry",func_test1,0);
-
-        // Block entry (label_entry)
-        CallInst* int32_3 = CallInst::Create(func_test2, "", label_entry);
-        int32_3->setCallingConv(CallingConv::C);
-        int32_3->setTailCall(false);AttributeSet int32_3_PAL;
-        int32_3->setAttributes(int32_3_PAL);
-
-        ReturnInst::Create(getGlobalContext(), int32_3, label_entry);
-
-      }
-
-      // Function: test2 (func_test2)
-      {
-
-        BasicBlock* label_entry_5 = BasicBlock::Create(getGlobalContext(), "entry",func_test2,0);
-
-        // Block entry (label_entry_5)
-        CallInst* int32_6 = CallInst::Create(func_test3, "", label_entry_5);
-        int32_6->setCallingConv(CallingConv::C);
-        int32_6->setTailCall(false);AttributeSet int32_6_PAL;
-        int32_6->setAttributes(int32_6_PAL);
-
-        ReturnInst::Create(getGlobalContext(), int32_6, label_entry_5);
-
-      }
-
-      // Function: test3 (func_test3)
-      {
+namespace {
+
+class TestAnalysisPass {
+public:
+  typedef Function IRUnitT;
+
+  struct Result {
+    Result(int Count) : InstructionCount(Count) {}
+    bool invalidate(Function *) { return true; }
+    int InstructionCount;
+  };
+
+  /// \brief Returns an opaque, unique ID for this pass type.
+  static void *ID() { return (void *)&PassID; }
+
+  /// \brief Run the analysis pass over the function and return a result.
+  Result run(Function *F) {
+    int Count = 0;
+    for (Function::iterator BBI = F->begin(), BBE = F->end(); BBI != BBE; ++BBI)
+      for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
+           ++II)
+        ++Count;
+    return Result(Count);
+  }
 
-        BasicBlock* label_entry_8 = BasicBlock::Create(getGlobalContext(), "entry",func_test3,0);
+private:
+  /// \brief Private static data to provide unique ID.
+  static char PassID;
+};
 
-        // Block entry (label_entry_8)
-        CallInst* int32_9 = CallInst::Create(func_test1, "", label_entry_8);
-        int32_9->setCallingConv(CallingConv::C);
-        int32_9->setTailCall(false);AttributeSet int32_9_PAL;
-        int32_9->setAttributes(int32_9_PAL);
+char TestAnalysisPass::PassID;
 
-        ReturnInst::Create(getGlobalContext(), int32_9, label_entry_8);
+struct TestModulePass {
+  TestModulePass(int &RunCount) : RunCount(RunCount) {}
 
-      }
+  bool run(Module *M) {
+    ++RunCount;
+    return true;
+  }
 
-      // Function: test4 (func_test4)
-      {
-        Function::arg_iterator args = func_test4->arg_begin();
-        Value* int1_f = args++;
-        int1_f->setName("f");
+  int &RunCount;
+};
 
-        BasicBlock* label_entry_11 = BasicBlock::Create(getGlobalContext(), "entry",func_test4,0);
-        BasicBlock* label_bb = BasicBlock::Create(getGlobalContext(), "bb",func_test4,0);
-        BasicBlock* label_bb1 = BasicBlock::Create(getGlobalContext(), "bb1",func_test4,0);
-        BasicBlock* label_return = BasicBlock::Create(getGlobalContext(), "return",func_test4,0);
+struct TestFunctionPass {
+  TestFunctionPass(AnalysisManager &AM, int &RunCount, int &AnalyzedInstrCount)
+      : AM(AM), RunCount(RunCount), AnalyzedInstrCount(AnalyzedInstrCount) {
+  }
 
-        // Block entry (label_entry_11)
-        BranchInst::Create(label_bb, label_entry_11);
+  bool run(Function *F) {
+    ++RunCount;
 
-        // Block bb (label_bb)
-        BranchInst::Create(label_bb, label_bb1, int1_f, label_bb);
+    const TestAnalysisPass::Result &AR = AM.getResult<TestAnalysisPass>(F);
+    AnalyzedInstrCount += AR.InstructionCount;
 
-        // Block bb1 (label_bb1)
-        BranchInst::Create(label_bb1, label_return, int1_f, label_bb1);
+    return true;
+  }
 
-        // Block return (label_return)
-        ReturnInst::Create(getGlobalContext(), label_return);
+  AnalysisManager &AM;
+  int &RunCount;
+  int &AnalyzedInstrCount;
+};
 
-      }
-      return mod;
-    }
+Module *parseIR(const char *IR) {
+  LLVMContext &C = getGlobalContext();
+  SMDiagnostic Err;
+  return ParseAssemblyString(IR, 0, Err, C);
+}
 
-  }
+class PassManagerTest : public ::testing::Test {
+protected:
+  OwningPtr<Module> M;
+
+public:
+  PassManagerTest()
+      : M(parseIR("define void @f() {\n"
+                  "entry:\n"
+                  "  call void @g()\n"
+                  "  call void @h()\n"
+                  "  ret void\n"
+                  "}\n"
+                  "define void @g() {\n"
+                  "  ret void\n"
+                  "}\n"
+                  "define void @h() {\n"
+                  "  ret void\n"
+                  "}\n")) {}
+};
+
+TEST_F(PassManagerTest, Basic) {
+  AnalysisManager AM(M.get());
+  AM.registerAnalysisPass(TestAnalysisPass());
+
+  ModulePassManager MPM(M.get(), &AM);
+  FunctionPassManager FPM(&AM);
+
+  // Count the runs over a module.
+  int ModulePassRunCount = 0;
+  MPM.addPass(TestModulePass(ModulePassRunCount));
+
+  // Count the runs over a Function.
+  int FunctionPassRunCount = 0;
+  int AnalyzedInstrCount = 0;
+  FPM.addPass(TestFunctionPass(AM, FunctionPassRunCount, AnalyzedInstrCount));
+  MPM.addPass(FPM);
+
+  MPM.run();
+  EXPECT_EQ(1, ModulePassRunCount);
+  EXPECT_EQ(3, FunctionPassRunCount);
+  EXPECT_EQ(5, AnalyzedInstrCount);
 }
 
-INITIALIZE_PASS(ModuleNDM, "mndm", "mndm", false, false)
-INITIALIZE_PASS_BEGIN(CGPass, "cgp","cgp", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
-INITIALIZE_PASS_END(CGPass, "cgp","cgp", false, false)
-INITIALIZE_PASS(FPass, "fp","fp", false, false)
-INITIALIZE_PASS_BEGIN(LPass, "lp","lp", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
-INITIALIZE_PASS_END(LPass, "lp","lp", false, false)
-INITIALIZE_PASS(BPass, "bp","bp", false, false)
+}
diff --git a/unittests/IR/ValueTest.cpp b/unittests/IR/ValueTest.cpp
index 52efb1a..ebe23e8 100644
--- a/unittests/IR/ValueTest.cpp
+++ b/unittests/IR/ValueTest.cpp
@@ -43,4 +43,44 @@ TEST(ValueTest, UsedInBasicBlock) {
   EXPECT_TRUE(F->arg_begin()->isUsedInBasicBlock(F->begin()));
 }
 
+TEST(GlobalTest, CreateAddressSpace) {
+  LLVMContext &Ctx = getGlobalContext();
+  OwningPtr<Module> M(new Module("TestModule", Ctx));
+  Type *Int8Ty = Type::getInt8Ty(Ctx);
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+
+  GlobalVariable *Dummy0
+    = new GlobalVariable(*M,
+                         Int32Ty,
+                         true,
+                         GlobalValue::ExternalLinkage,
+                         Constant::getAllOnesValue(Int32Ty),
+                         "dummy",
+                         0,
+                         GlobalVariable::NotThreadLocal,
+                         1);
+
+  // Make sure the address space isn't dropped when returning this.
+  Constant *Dummy1 = M->getOrInsertGlobal("dummy", Int32Ty);
+  EXPECT_EQ(Dummy0, Dummy1);
+  EXPECT_EQ(1u, Dummy1->getType()->getPointerAddressSpace());
+
+
+  // This one requires a bitcast, but the address space must also stay the same.
+  GlobalVariable *DummyCast0
+    = new GlobalVariable(*M,
+                         Int32Ty,
+                         true,
+                         GlobalValue::ExternalLinkage,
+                         Constant::getAllOnesValue(Int32Ty),
+                         "dummy_cast",
+                         0,
+                         GlobalVariable::NotThreadLocal,
+                         1);
+
+  // Make sure the address space isn't dropped when returning this.
+  Constant *DummyCast1 = M->getOrInsertGlobal("dummy_cast", Int8Ty);
+  EXPECT_EQ(1u, DummyCast1->getType()->getPointerAddressSpace());
+  EXPECT_NE(DummyCast0, DummyCast1) << *DummyCast1;
+}
 } // end anonymous namespace
diff --git a/unittests/IR/VerifierTest.cpp b/unittests/IR/VerifierTest.cpp
index 2848cb8..31936c3 100644
--- a/unittests/IR/VerifierTest.cpp
+++ b/unittests/IR/VerifierTest.cpp
@@ -24,10 +24,11 @@ namespace {
 
 TEST(VerifierTest, Branch_i1) {
   LLVMContext &C = getGlobalContext();
+  Module M("M", C);
   FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg=*/false);
-  OwningPtr<Function> F(Function::Create(FTy, GlobalValue::ExternalLinkage));
-  BasicBlock *Entry = BasicBlock::Create(C, "entry", F.get());
-  BasicBlock *Exit = BasicBlock::Create(C, "exit", F.get());
+  Function *F = cast<Function>(M.getOrInsertFunction("foo", FTy));
+  BasicBlock *Entry = BasicBlock::Create(C, "entry", F);
+  BasicBlock *Exit = BasicBlock::Create(C, "exit", F);
   ReturnInst::Create(C, Exit);
 
   // To avoid triggering an assertion in BranchInst::Create, we first create
diff --git a/unittests/IR/WaymarkTest.cpp b/unittests/IR/WaymarkTest.cpp
index cf7d76d..9a9b4a2 100644
--- a/unittests/IR/WaymarkTest.cpp
+++ b/unittests/IR/WaymarkTest.cpp
@@ -9,6 +9,7 @@
 
 // we perform white-box tests
 //
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
diff --git a/unittests/MC/CMakeLists.txt b/unittests/MC/CMakeLists.txt
new file mode 100644
index 0000000..0e4782c
--- /dev/null
+++ b/unittests/MC/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(LLVM_LINK_COMPONENTS
+  MC
+  )
+
+set(MCSources
+  MCAtomTest.cpp
+  )
+
+add_llvm_unittest(MCTests
+  ${MCSources}
+  )
diff --git a/unittests/MC/MCAtomTest.cpp b/unittests/MC/MCAtomTest.cpp
new file mode 100644
index 0000000..17b056c
--- /dev/null
+++ b/unittests/MC/MCAtomTest.cpp
@@ -0,0 +1,31 @@
+//===- llvm/unittest/MC/MCAtomTest.cpp - Instructions unit tests ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCModule.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace {
+
+TEST(MCAtomTest, MCDataSize) {
+  MCModule M;
+  MCDataAtom *Atom = M.createDataAtom(0, 0);
+  EXPECT_EQ(uint64_t(0), Atom->getEndAddr());
+  Atom->addData(0);
+  EXPECT_EQ(uint64_t(0), Atom->getEndAddr());
+  Atom->addData(1);
+  EXPECT_EQ(uint64_t(1), Atom->getEndAddr());
+  Atom->addData(2);
+  EXPECT_EQ(uint64_t(2), Atom->getEndAddr());
+  EXPECT_EQ(size_t(3), Atom->getData().size());
+}
+
+}  // end anonymous namespace
+}  // end namespace llvm
diff --git a/unittests/MC/Makefile b/unittests/MC/Makefile
new file mode 100644
index 0000000..4c25697
--- /dev/null
+++ b/unittests/MC/Makefile
@@ -0,0 +1,15 @@
+##===- unittests/IR/Makefile -------------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+TESTNAME = MC
+LINK_COMPONENTS := MC
+
+include $(LEVEL)/Makefile.config
+include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/Makefile b/unittests/Makefile
index 36b82da..06ba932 100644
--- a/unittests/Makefile
+++ b/unittests/Makefile
@@ -9,8 +9,8 @@
 
 LEVEL = ..
 
-PARALLEL_DIRS = ADT Analysis Bitcode DebugInfo ExecutionEngine IR Object \
-		Option Support Transforms
+PARALLEL_DIRS = ADT Analysis Bitcode CodeGen DebugInfo ExecutionEngine IR \
+		MC Object Option Support Transforms
 
 include $(LEVEL)/Makefile.common
 
diff --git a/unittests/Option/OptionParsingTest.cpp b/unittests/Option/OptionParsingTest.cpp
index 5a76d65..11d6d1e 100644
--- a/unittests/Option/OptionParsingTest.cpp
+++ b/unittests/Option/OptionParsingTest.cpp
@@ -20,7 +20,7 @@ using namespace llvm::opt;
 enum ID {
   OPT_INVALID = 0, // This is not an option ID.
 #define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \
-              HELPTEXT, METAVAR) OPT_##ID,
+               HELPTEXT, METAVAR) OPT_##ID,
 #include "Opts.inc"
   LastOption
 #undef OPTION
@@ -48,8 +48,8 @@ static const OptTable::Info InfoTable[] = {
 namespace {
 class TestOptTable : public OptTable {
 public:
-  TestOptTable()
-    : OptTable(InfoTable, array_lengthof(InfoTable)) {}
+  TestOptTable(bool IgnoreCase = false)
+    : OptTable(InfoTable, array_lengthof(InfoTable), IgnoreCase) {}
 };
 }
 
@@ -157,15 +157,49 @@ TEST(Option, AliasArgs) {
   EXPECT_EQ(AL->getAllArgValues(OPT_B)[1], "bar");
 }
 
-TEST(Option, DashDash) {
+TEST(Option, IgnoreCase) {
+  TestOptTable T(true);
+  unsigned MAI, MAC;
+
+  const char *MyArgs[] = { "-a", "-joo" };
+  OwningPtr<InputArgList> AL(T.ParseArgs(MyArgs, array_endof(MyArgs), MAI, MAC));
+  EXPECT_TRUE(AL->hasArg(OPT_A));
+  EXPECT_TRUE(AL->hasArg(OPT_B));
+}
+
+TEST(Option, DoNotIgnoreCase) {
+  TestOptTable T;
+  unsigned MAI, MAC;
+
+  const char *MyArgs[] = { "-a", "-joo" };
+  OwningPtr<InputArgList> AL(T.ParseArgs(MyArgs, array_endof(MyArgs), MAI, MAC));
+  EXPECT_FALSE(AL->hasArg(OPT_A));
+  EXPECT_FALSE(AL->hasArg(OPT_B));
+}
+
+TEST(Option, SlurpEmpty) {
+  TestOptTable T;
+  unsigned MAI, MAC;
+
+  const char *MyArgs[] = { "-A", "-slurp" };
+  OwningPtr<InputArgList> AL(T.ParseArgs(MyArgs, array_endof(MyArgs), MAI, MAC));
+  EXPECT_TRUE(AL->hasArg(OPT_A));
+  EXPECT_TRUE(AL->hasArg(OPT_Slurp));
+  EXPECT_EQ(AL->getAllArgValues(OPT_Slurp).size(), 0U);
+}
+
+TEST(Option, Slurp) {
   TestOptTable T;
   unsigned MAI, MAC;
 
-  const char *MyArgs[] = { "-A", "--", "-B", "--" };
+  const char *MyArgs[] = { "-A", "-slurp", "-B", "--", "foo" };
   OwningPtr<InputArgList> AL(T.ParseArgs(MyArgs, array_endof(MyArgs), MAI, MAC));
+  EXPECT_EQ(AL->size(), 2U);
   EXPECT_TRUE(AL->hasArg(OPT_A));
   EXPECT_FALSE(AL->hasArg(OPT_B));
-  EXPECT_EQ(AL->getAllArgValues(OPT_INPUT).size(), 2U);
-  EXPECT_EQ(AL->getAllArgValues(OPT_INPUT)[0], "-B");
-  EXPECT_EQ(AL->getAllArgValues(OPT_INPUT)[1], "--");
+  EXPECT_TRUE(AL->hasArg(OPT_Slurp));
+  EXPECT_EQ(AL->getAllArgValues(OPT_Slurp).size(), 3U);
+  EXPECT_EQ(AL->getAllArgValues(OPT_Slurp)[0], "-B");
+  EXPECT_EQ(AL->getAllArgValues(OPT_Slurp)[1], "--");
+  EXPECT_EQ(AL->getAllArgValues(OPT_Slurp)[2], "foo");
 }
diff --git a/unittests/Option/Opts.td b/unittests/Option/Opts.td
index 986b312..aaed6b2 100644
--- a/unittests/Option/Opts.td
+++ b/unittests/Option/Opts.td
@@ -22,3 +22,5 @@ def I : Flag<["-"], "I">, Alias<H>, Group<my_group>;
 
 def J : Flag<["-"], "J">, Alias<B>, AliasArgs<["foo"]>;
 def Joo : Flag<["-"], "Joo">, Alias<B>, AliasArgs<["bar"]>;
+
+def Slurp : Option<["-"], "slurp", KIND_REMAINING_ARGS>;
diff --git a/unittests/Support/BlockFrequencyTest.cpp b/unittests/Support/BlockFrequencyTest.cpp
index ca420fc..ffdea2c 100644
--- a/unittests/Support/BlockFrequencyTest.cpp
+++ b/unittests/Support/BlockFrequencyTest.cpp
@@ -13,6 +13,11 @@ TEST(BlockFrequencyTest, OneToZero) {
   BranchProbability Prob(UINT32_MAX - 1, UINT32_MAX);
   Freq *= Prob;
   EXPECT_EQ(Freq.getFrequency(), 0u);
+
+  Freq = BlockFrequency(1);
+  uint32_t Remainder = Freq.scale(Prob);
+  EXPECT_EQ(Freq.getFrequency(), 0u);
+  EXPECT_EQ(Remainder, UINT32_MAX - 1);
 }
 
 TEST(BlockFrequencyTest, OneToOne) {
@@ -20,6 +25,11 @@ TEST(BlockFrequencyTest, OneToOne) {
   BranchProbability Prob(UINT32_MAX, UINT32_MAX);
   Freq *= Prob;
   EXPECT_EQ(Freq.getFrequency(), 1u);
+
+  Freq = BlockFrequency(1);
+  uint32_t Remainder = Freq.scale(Prob);
+  EXPECT_EQ(Freq.getFrequency(), 1u);
+  EXPECT_EQ(Remainder, 0u);
 }
 
 TEST(BlockFrequencyTest, ThreeToOne) {
@@ -27,6 +37,11 @@ TEST(BlockFrequencyTest, ThreeToOne) {
   BranchProbability Prob(3000000, 9000000);
   Freq *= Prob;
   EXPECT_EQ(Freq.getFrequency(), 1u);
+
+  Freq = BlockFrequency(3);
+  uint32_t Remainder = Freq.scale(Prob);
+  EXPECT_EQ(Freq.getFrequency(), 1u);
+  EXPECT_EQ(Remainder, 0u);
 }
 
 TEST(BlockFrequencyTest, MaxToHalfMax) {
@@ -34,6 +49,11 @@ TEST(BlockFrequencyTest, MaxToHalfMax) {
   BranchProbability Prob(UINT32_MAX / 2, UINT32_MAX);
   Freq *= Prob;
   EXPECT_EQ(Freq.getFrequency(), 9223372034707292159ULL);
+
+  Freq = BlockFrequency(UINT64_MAX);
+  uint32_t Remainder = Freq.scale(Prob);
+  EXPECT_EQ(Freq.getFrequency(), 9223372034707292159ULL);
+  EXPECT_EQ(Remainder, 0u);
 }
 
 TEST(BlockFrequencyTest, BigToBig) {
@@ -43,6 +63,11 @@ TEST(BlockFrequencyTest, BigToBig) {
   BranchProbability Prob(P, P);
   Freq *= Prob;
   EXPECT_EQ(Freq.getFrequency(), Big);
+
+  Freq = BlockFrequency(Big);
+  uint32_t Remainder = Freq.scale(Prob);
+  EXPECT_EQ(Freq.getFrequency(), Big);
+  EXPECT_EQ(Remainder, 0u);
 }
 
 TEST(BlockFrequencyTest, MaxToMax) {
@@ -50,6 +75,114 @@ TEST(BlockFrequencyTest, MaxToMax) {
   BranchProbability Prob(UINT32_MAX, UINT32_MAX);
   Freq *= Prob;
   EXPECT_EQ(Freq.getFrequency(), UINT64_MAX);
+
+  // This additionally makes sure if we have a value equal to our saturating
+  // value, we do not signal saturation if the result equals said value, but
+  // saturating does not occur.
+  Freq = BlockFrequency(UINT64_MAX);
+  uint32_t Remainder = Freq.scale(Prob);
+  EXPECT_EQ(Freq.getFrequency(), UINT64_MAX);
+  EXPECT_EQ(Remainder, 0u);
+}
+
+TEST(BlockFrequencyTest, ScaleResultRemainderTest) {
+  struct {
+    uint64_t Freq;
+    uint32_t Prob[2];
+    uint64_t ExpectedFreq;
+    uint32_t ExpectedRemainder;
+  } Tests[80] = {
+    // Data for scaling that results in <= 64 bit division.
+    { 0x1423e2a50ULL, { 0x64819521, 0x7765dd13 }, 0x10f418889ULL, 0x92b9d25 },
+    { 0x35ef14ceULL, { 0x28ade3c7, 0x304532ae }, 0x2d73c33aULL, 0x2c0fd0b6 },
+    { 0xd03dbfbe24ULL, { 0x790079, 0xe419f3 }, 0x6e776fc1fdULL, 0x4a06dd },
+    { 0x21d67410bULL, { 0x302a9dc2, 0x3ddb4442 }, 0x1a5948fd6ULL, 0x265d1c2a },
+    { 0x8664aeadULL, { 0x3d523513, 0x403523b1 }, 0x805a04cfULL, 0x324c27b8 },
+    { 0x201db0cf4ULL, { 0x35112a7b, 0x79fc0c74 }, 0xdf8b07f6ULL, 0x490c1dc4 },
+    { 0x13f1e4430aULL, { 0x21c92bf, 0x21e63aae }, 0x13e0cba15ULL, 0x1df47c30 },
+    { 0x16c83229ULL, { 0x3793f66f, 0x53180dea }, 0xf3ce7b6ULL, 0x1d0c1b6b },
+    { 0xc62415be8ULL, { 0x9cc4a63, 0x4327ae9b }, 0x1ce8b71caULL, 0x3f2c696a },
+    { 0x6fac5e434ULL, { 0xe5f9170, 0x1115e10b }, 0x5df23dd4cULL, 0x4dafc7c },
+    { 0x1929375f2ULL, { 0x3a851375, 0x76c08456 }, 0xc662b082ULL, 0x343589ee },
+    { 0x243c89db6ULL, { 0x354ebfc0, 0x450ef197 }, 0x1bf8c1661ULL, 0x4948e49 },
+    { 0x310e9b31aULL, { 0x1b1b8acf, 0x2d3629f0 }, 0x1d69c93f9ULL, 0x73e3b96 },
+    { 0xa1fae921dULL, { 0xa7a098c, 0x10469f44 }, 0x684413d6cULL, 0x86a882c },
+    { 0xc1582d957ULL, { 0x498e061, 0x59856bc }, 0x9edc5f4e7ULL, 0x29b0653 },
+    { 0x57cfee75ULL, { 0x1d061dc3, 0x7c8bfc17 }, 0x1476a220ULL, 0x2383d33f },
+    { 0x139220080ULL, { 0x294a6c71, 0x2a2b07c9 }, 0x1329e1c76ULL, 0x7aa5da },
+    { 0x1665d353cULL, { 0x7080db5, 0xde0d75c }, 0xb590d9fbULL, 0x7ba8c38 },
+    { 0xe8f14541ULL, { 0x5188e8b2, 0x736527ef }, 0xa4971be5ULL, 0x6b612167 },
+    { 0x2f4775f29ULL, { 0x254ef0fe, 0x435fcf50 }, 0x1a2e449c1ULL, 0x28bbf5e },
+    { 0x27b85d8d7ULL, { 0x304c8220, 0x5de678f2 }, 0x146e3bef9ULL, 0x4b27097e },
+    { 0x1d362e36bULL, { 0x36c85b12, 0x37a66f55 }, 0x1cc19b8e6ULL, 0x688e828 },
+    { 0x155fd48c7ULL, { 0xf5894d, 0x1256108 }, 0x11e383602ULL, 0x111f0cb },
+    { 0xb5db2d15ULL, { 0x39bb26c5, 0x5bdcda3e }, 0x72499259ULL, 0x59c4939b },
+    { 0x153990298ULL, { 0x48921c09, 0x706eb817 }, 0xdb3268e8ULL, 0x66bb8a80 },
+    { 0x28a7c3ed7ULL, { 0x1f776fd7, 0x349f7a70 }, 0x184f73ae1ULL, 0x28910321 },
+    { 0x724dbeabULL, { 0x1bd149f5, 0x253a085e }, 0x5569c0b3ULL, 0xff8e2ed },
+    { 0xd8f0c513ULL, { 0x18c8cc4c, 0x1b72bad0 }, 0xc3e30643ULL, 0xd85e134 },
+    { 0x17ce3dcbULL, { 0x1e4c6260, 0x233b359e }, 0x1478f4afULL, 0x49ea31e },
+    { 0x1ce036ce0ULL, { 0x29e3c8af, 0x5318dd4a }, 0xe8e76196ULL, 0x11d5b9c4 },
+    { 0x1473ae2aULL, { 0x29b897ba, 0x2be29378 }, 0x13718185ULL, 0x6f93b2c },
+    { 0x1dd41aa68ULL, { 0x3d0a4441, 0x5a0e8f12 }, 0x1437b6bbfULL, 0x54b09ffa },
+    { 0x1b49e4a53ULL, { 0x3430c1fe, 0x5a204aed }, 0xfcd6852fULL, 0x15ad6ed7 },
+    { 0x217941b19ULL, { 0x12ced2bd, 0x21b68310 }, 0x12aca65b1ULL, 0x1b2a9565 },
+    { 0xac6a4dc8ULL, { 0x3ed68da8, 0x6fdca34c }, 0x60da926dULL, 0x22ff53e4 },
+    { 0x1c503a4e7ULL, { 0xfcbbd32, 0x11e48d17 }, 0x18fec7d38ULL, 0xa8aa816 },
+    { 0x1c885855ULL, { 0x213e919d, 0x25941897 }, 0x193de743ULL, 0x4ea09c },
+    { 0x29b9c168eULL, { 0x2b644aea, 0x45725ee7 }, 0x1a122e5d5ULL, 0xbee1099 },
+    { 0x806a33f2ULL, { 0x30a80a23, 0x5063733a }, 0x4db9a264ULL, 0x1eaed76e },
+    { 0x282afc96bULL, { 0x143ae554, 0x1a9863ff }, 0x1e8de5204ULL, 0x158d9020 },
+    // Data for scaling that results in > 64 bit division.
+    { 0x23ca5f2f672ca41cULL, { 0xecbc641, 0x111373f7 }, 0x1f0301e5e8295ab5ULL, 0xf627f79 },
+    { 0x5e4f2468142265e3ULL, { 0x1ddf5837, 0x32189233 }, 0x383ca7ba9fdd2c8cULL, 0x1c8f33e1 },
+    { 0x277a1a6f6b266bf6ULL, { 0x415d81a8, 0x61eb5e1e }, 0x1a5a3e1d41b30c0fULL, 0x29cde3ae },
+    { 0x1bdbb49a237035cbULL, { 0xea5bf17, 0x1d25ffb3 }, 0xdffc51c53d44b93ULL, 0x5170574 },
+    { 0x2bce6d29b64fb8ULL, { 0x3bfd5631, 0x7525c9bb }, 0x166ebedda7ac57ULL, 0x3026dfab },
+    { 0x3a02116103df5013ULL, { 0x2ee18a83, 0x3299aea8 }, 0x35be8922ab1e2a84ULL, 0x298d9919 },
+    { 0x7b5762390799b18cULL, { 0x12f8e5b9, 0x2563bcd4 }, 0x3e960077aca01209ULL, 0x93afeb8 },
+    { 0x69cfd72537021579ULL, { 0x4c35f468, 0x6a40feee }, 0x4be4cb3848be98a3ULL, 0x4ff96b9e },
+    { 0x49dfdf835120f1c1ULL, { 0x8cb3759, 0x559eb891 }, 0x79663f7120edadeULL, 0x51b1fb5b },
+    { 0x74b5be5c27676381ULL, { 0x47e4c5e0, 0x7c7b19ff }, 0x4367d2dff36a1028ULL, 0x7a7b5608 },
+    { 0x4f50f97075e7f431ULL, { 0x9a50a17, 0x11cd1185 }, 0x2af952b34c032df4ULL, 0xfddc6a3 },
+    { 0x2f8b0d712e393be4ULL, { 0x1487e386, 0x15aa356e }, 0x2d0df36478a776aaULL, 0x14e2564c },
+    { 0x224c1c75999d3deULL, { 0x3b2df0ea, 0x4523b100 }, 0x1d5b481d145f08aULL, 0x15145eec },
+    { 0x2bcbcea22a399a76ULL, { 0x28b58212, 0x48dd013e }, 0x187814d084c47cabULL, 0x3a38ebe2 },
+    { 0x1dbfca91257cb2d1ULL, { 0x1a8c04d9, 0x5e92502c }, 0x859cf7d00f77545ULL, 0x7431f4d },
+    { 0x7f20039b57cda935ULL, { 0xeccf651, 0x323f476e }, 0x25720cd976461a77ULL, 0x202817a3 },
+    { 0x40512c6a586aa087ULL, { 0x113b0423, 0x398c9eab }, 0x1341c03de8696a7eULL, 0x1e27284b },
+    { 0x63d802693f050a11ULL, { 0xf50cdd6, 0xfce2a44 }, 0x60c0177bb5e46846ULL, 0xf7ad89e },
+    { 0x2d956b422838de77ULL, { 0xb2d345b, 0x1321e557 }, 0x1aa0ed16b6aa5319ULL, 0xfe1a5ce },
+    { 0x5a1cdf0c1657bc91ULL, { 0x1d77bb0c, 0x1f991ff1 }, 0x54097ee94ff87560ULL, 0x11c4a26c },
+    { 0x3801b26d7e00176bULL, { 0xeed25da, 0x1a819d8b }, 0x1f89e96a3a639526ULL, 0xcd51e7c },
+    { 0x37655e74338e1e45ULL, { 0x300e170a, 0x5a1595fe }, 0x1d8cfb55fddc0441ULL, 0x3df05434 },
+    { 0x7b38703f2a84e6ULL, { 0x66d9053, 0xc79b6b9 }, 0x3f7d4c91774094ULL, 0x26d939e },
+    { 0x2245063c0acb3215ULL, { 0x30ce2f5b, 0x610e7271 }, 0x113b916468389235ULL, 0x1b588512 },
+    { 0x6bc195877b7b8a7eULL, { 0x392004aa, 0x4a24e60c }, 0x530594fb17db6ba5ULL, 0x35c0a5f0 },
+    { 0x40a3fde23c7b43dbULL, { 0x4e712195, 0x6553e56e }, 0x320a799bc76a466aULL, 0x5e23a5eb },
+    { 0x1d3dfc2866fbccbaULL, { 0x5075b517, 0x5fc42245 }, 0x18917f0061595bc3ULL, 0x3fcf4527 },
+    { 0x19aeb14045a61121ULL, { 0x1bf6edec, 0x707e2f4b }, 0x6626672a070bcc7ULL, 0x3607801f },
+    { 0x44ff90486c531e9fULL, { 0x66598a, 0x8a90dc }, 0x32f6f2b0525199b0ULL, 0x5ab576 },
+    { 0x3f3e7121092c5bcbULL, { 0x1c754df7, 0x5951a1b9 }, 0x14267f50b7ef375dULL, 0x221220a8 },
+    { 0x60e2dafb7e50a67eULL, { 0x4d96c66e, 0x65bd878d }, 0x49e31715ac393f8bULL, 0x4e97b195 },
+    { 0x656286667e0e6e29ULL, { 0x9d971a2, 0xacda23b }, 0x5c6ee315ead6cb4fULL, 0x516f5bd },
+    { 0x1114e0974255d507ULL, { 0x1c693, 0x2d6ff }, 0xaae42e4b35f6e60ULL, 0x8b65 },
+    { 0x508c8baf3a70ff5aULL, { 0x3b26b779, 0x6ad78745 }, 0x2c98387636c4b365ULL, 0x11dc6a51 },
+    { 0x5b47bc666bf1f9cfULL, { 0x10a87ed6, 0x187d358a }, 0x3e1767155848368bULL, 0xfb871c },
+    { 0x50954e3744460395ULL, { 0x7a42263, 0xcdaa048 }, 0x2fe739f0aee1fee1ULL, 0xb8add57 },
+    { 0x20020b406550dd8fULL, { 0x3318539, 0x42eead0 }, 0x186f326325fa346bULL, 0x10d3ae7 },
+    { 0x5bcb0b872439ffd5ULL, { 0x6f61fb2, 0x9af7344 }, 0x41fa1e3bec3c1b30ULL, 0x4fee45a },
+    { 0x7a670f365db87a53ULL, { 0x417e102, 0x3bb54c67 }, 0x8642a558304fd9eULL, 0x3b65f514 },
+    { 0x1ef0db1e7bab1cd0ULL, { 0x2b60cf38, 0x4188f78f }, 0x147ae0d6226b2ee6ULL, 0x336b6106 }
+  };
+
+  for (unsigned i = 0; i < 80; i++) {
+    BlockFrequency Freq(Tests[i].Freq);
+    uint32_t Remainder = Freq.scale(BranchProbability(Tests[i].Prob[0],
+                                                      Tests[i].Prob[1]));
+    EXPECT_EQ(Tests[i].ExpectedFreq, Freq.getFrequency());
+    EXPECT_EQ(Tests[i].ExpectedRemainder, Remainder);
+  }
 }
 
 TEST(BlockFrequency, Divide) {
diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt
index 197561e..0abc2ff 100644
--- a/unittests/Support/CMakeLists.txt
+++ b/unittests/Support/CMakeLists.txt
@@ -17,9 +17,7 @@ add_llvm_unittest(SupportTests
   EndianTest.cpp
   ErrorOrTest.cpp
   FileOutputBufferTest.cpp
-  IntegersSubsetTest.cpp
   LeakDetectorTest.cpp
-  LocaleTest.cpp
   LockFileManagerTest.cpp
   ManagedStatic.cpp
   MathExtrasTest.cpp
@@ -30,8 +28,11 @@ add_llvm_unittest(SupportTests
   ProcessTest.cpp
   ProgramTest.cpp
   RegexTest.cpp
+  SourceMgrTest.cpp
   SwapByteOrderTest.cpp
+  ThreadLocalTest.cpp
   TimeValueTest.cpp
+  UnicodeTest.cpp
   ValueHandleTest.cpp
   YAMLIOTest.cpp
   YAMLParserTest.cpp
diff --git a/unittests/Support/CompressionTest.cpp b/unittests/Support/CompressionTest.cpp
index c8e2cd9..c0a9ada 100644
--- a/unittests/Support/CompressionTest.cpp
+++ b/unittests/Support/CompressionTest.cpp
@@ -63,6 +63,12 @@ TEST(CompressionTest, Zlib) {
   TestZlibCompression(BinaryDataStr, zlib::DefaultCompression);
 }
 
+TEST(CompressionTest, ZlibCRC32) {
+  EXPECT_EQ(
+      0x414FA339U,
+      zlib::crc32(StringRef("The quick brown fox jumps over the lazy dog")));
+}
+
 #endif
 
 }
diff --git a/unittests/Support/ConstantRangeTest.cpp b/unittests/Support/ConstantRangeTest.cpp
index 4d6bbf6..3e0a085 100644
--- a/unittests/Support/ConstantRangeTest.cpp
+++ b/unittests/Support/ConstantRangeTest.cpp
@@ -216,6 +216,9 @@ TEST_F(ConstantRangeTest, SExt) {
 
   EXPECT_EQ(ConstantRange(APInt(8, 120), APInt(8, 140)).signExtend(16),
             ConstantRange(APInt(16, -128), APInt(16, 128)));
+
+  EXPECT_EQ(ConstantRange(APInt(16, 0x0200), APInt(16, 0x8000)).signExtend(19),
+            ConstantRange(APInt(19, 0x0200), APInt(19, 0x8000)));
 }
 
 TEST_F(ConstantRangeTest, IntersectWith) {
diff --git a/unittests/Support/ErrorOrTest.cpp b/unittests/Support/ErrorOrTest.cpp
index 4853426..feb6a08 100644
--- a/unittests/Support/ErrorOrTest.cpp
+++ b/unittests/Support/ErrorOrTest.cpp
@@ -45,9 +45,6 @@ TEST(ErrorOr, Types) {
   *a = 42;
   EXPECT_EQ(42, x);
 
-  EXPECT_FALSE(ErrorOr<void>(errc::broken_pipe));
-  EXPECT_TRUE(ErrorOr<void>(errc::success));
-
 #if LLVM_HAS_CXX11_STDLIB
   // Move only types.
   EXPECT_EQ(3, **t3());
@@ -67,38 +64,3 @@ TEST(ErrorOr, Covariant) {
 #endif
 }
 } // end anon namespace
-
-struct InvalidArgError {
-  InvalidArgError() {}
-  InvalidArgError(std::string S) : ArgName(S) {}
-  std::string ArgName;
-};
-
-namespace llvm {
-template<>
-struct ErrorOrUserDataTraits<InvalidArgError> : true_type {
-  static error_code error() {
-    return make_error_code(errc::invalid_argument);
-  }
-};
-} // end namespace llvm
-
-ErrorOr<int> t4() {
-  return InvalidArgError("adena");
-}
-
-ErrorOr<void> t5() {
-  return InvalidArgError("pie");
-}
-
-namespace {
-TEST(ErrorOr, UserErrorData) {
-  ErrorOr<int> a = t4();
-  EXPECT_EQ(errc::invalid_argument, a);
-  EXPECT_EQ("adena", t4().getError<InvalidArgError>().ArgName);
-  
-  ErrorOr<void> b = t5();
-  EXPECT_EQ(errc::invalid_argument, b);
-  EXPECT_EQ("pie", b.getError<InvalidArgError>().ArgName);
-}
-} // end anon namespace
diff --git a/unittests/Support/IntegersSubsetTest.cpp b/unittests/Support/IntegersSubsetTest.cpp
deleted file mode 100644
index f4298bf..0000000
--- a/unittests/Support/IntegersSubsetTest.cpp
+++ /dev/null
@@ -1,326 +0,0 @@
-//===- llvm/unittest/Support/IntegersSubsetTest.cpp - IntegersSubset tests ===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/IntegersSubset.h" 
-#include "llvm/ADT/APInt.h"
-#include "llvm/Support/IntegersSubsetMapping.h"
-#include "gtest/gtest.h"
-#include <vector>
-
-using namespace llvm;
-
-namespace {
-  
-  class Int : public APInt {
-  public:
-    Int() {}
-    Int(uint64_t V) : APInt(64, V) {}
-    Int(const APInt& Src) : APInt(Src) {}
-    bool operator < (const APInt& RHS) const { return ult(RHS); }
-    bool operator > (const APInt& RHS) const { return ugt(RHS); }
-    bool operator <= (const APInt& RHS) const { return ule(RHS); }
-    bool operator >= (const APInt& RHS) const { return uge(RHS); }
-  };
-  
-  typedef IntRange<Int> Range;
-  typedef IntegersSubsetGeneric<Int> Subset;
-  typedef IntegersSubsetMapping<unsigned,Subset,Int> Mapping;
-  
-  TEST(IntegersSubsetTest, GeneralTest) {
-    
-    // Test construction.
-
-    std::vector<Range> Ranges;
-    Ranges.reserve(3);
-
-    // Initialize Subset as union of three pairs:
-    // { {0, 8}, {10, 18}, {20, 28} }
-    for (unsigned i = 0; i < 3; ++i)
-      Ranges.push_back(Range(Int(i*10), Int(i*10 + 8)));
-
-    Subset TheSubset(Ranges);
-    
-    for (unsigned i = 0; i < 3; ++i) {
-      EXPECT_EQ(TheSubset.getItem(i).getLow(), Int(i*10));
-      EXPECT_EQ(TheSubset.getItem(i).getHigh(), Int(i*10 + 8));
-    }
-    
-    EXPECT_EQ(TheSubset.getNumItems(), 3ULL);
-    
-    // Test belonging to range.
-    
-    EXPECT_TRUE(TheSubset.isSatisfies(Int(5)));
-    EXPECT_FALSE(TheSubset.isSatisfies(Int(9)));
-    
-    // Test when subset contains the only item.
-    
-    Ranges.clear();
-    Ranges.push_back(Range(Int(10), Int(10)));
-    
-    Subset TheSingleNumber(Ranges);
-    
-    EXPECT_TRUE(TheSingleNumber.isSingleNumber());
-    
-    Ranges.push_back(Range(Int(12), Int(15)));
-    
-    Subset NotASingleNumber(Ranges);
-    
-    EXPECT_FALSE(NotASingleNumber.isSingleNumber());
-    
-    // Test when subset contains items that are not a ranges but
-    // the single numbers.
-    
-    Ranges.clear();
-    Ranges.push_back(Range(Int(10), Int(10)));
-    Ranges.push_back(Range(Int(15), Int(19)));
-    
-    Subset WithSingleNumberItems(Ranges);
-    
-    EXPECT_TRUE(WithSingleNumberItems.isSingleNumber(0));
-    EXPECT_FALSE(WithSingleNumberItems.isSingleNumber(1));
-    
-    // Test size of subset. Note subset itself may be not optimized (improper),
-    // so it may contain duplicates, and the size of subset { {0, 9} {5, 9} }
-    // will 15 instead of 10.
-    
-    Ranges.clear();
-    Ranges.push_back(Range(Int(0), Int(9)));
-    Ranges.push_back(Range(Int(5), Int(9)));
-    
-    Subset NotOptimizedSubset(Ranges);
-    
-    EXPECT_EQ(NotOptimizedSubset.getSize(), 15ULL);
-
-    // Test access to a single value.
-    // getSingleValue(idx) method represents subset as flat numbers collection,
-    // so subset { {0, 3}, {8, 10} } will represented as array
-    // { 0, 1, 2, 3, 8, 9, 10 }.
-    
-    Ranges.clear();
-    Ranges.push_back(Range(Int(0), Int(3)));
-    Ranges.push_back(Range(Int(8), Int(10)));
-    
-    Subset OneMoreSubset(Ranges);
-    
-    EXPECT_EQ(OneMoreSubset.getSingleValue(5), Int(9));
-  }
-  
-  TEST(IntegersSubsetTest, MappingTest) {
-
-    Mapping::Cases TheCases;
-    
-    unsigned Successors[3] = {0, 1, 2};
-    
-    // Test construction.
-    
-    Mapping TheMapping;
-    for (unsigned i = 0; i < 3; ++i)
-      TheMapping.add(Int(10*i), Int(10*i + 9), Successors + i);
-    TheMapping.add(Int(111), Int(222), Successors);
-    TheMapping.removeItem(--TheMapping.end());
-    
-    TheMapping.getCases(TheCases);
-    
-    EXPECT_EQ(TheCases.size(), 3ULL);
-    
-    for (unsigned i = 0; i < 3; ++i) {
-      Mapping::Cases::iterator CaseIt = TheCases.begin();
-      std::advance(CaseIt, i);  
-      EXPECT_EQ(CaseIt->first, Successors + i);
-      EXPECT_EQ(CaseIt->second.getNumItems(), 1ULL);
-      EXPECT_EQ(CaseIt->second.getItem(0), Range(Int(10*i), Int(10*i + 9)));
-    }
-    
-    // Test verification.
-    
-    Mapping ImproperMapping;
-    ImproperMapping.add(Int(10), Int(11), Successors + 0);
-    ImproperMapping.add(Int(11), Int(12), Successors + 1);
-    
-    Mapping::RangeIterator ErrItem;
-    EXPECT_FALSE(ImproperMapping.verify(ErrItem));
-    EXPECT_EQ(ErrItem, --ImproperMapping.end());
-    
-    Mapping ProperMapping;
-    ProperMapping.add(Int(10), Int(11), Successors + 0);
-    ProperMapping.add(Int(12), Int(13), Successors + 1);
-    
-    EXPECT_TRUE(ProperMapping.verify(ErrItem));
-    
-    // Test optimization.
-    
-    Mapping ToBeOptimized;
-    
-    for (unsigned i = 0; i < 3; ++i) {
-      ToBeOptimized.add(Int(i * 10), Int(i * 10 + 1), Successors + i);
-      ToBeOptimized.add(Int(i * 10 + 2), Int(i * 10 + 9), Successors + i);
-    }
-    
-    ToBeOptimized.optimize();
-    
-    TheCases.clear();
-    ToBeOptimized.getCases(TheCases);
-    
-    EXPECT_EQ(TheCases.size(), 3ULL);
-    
-    for (unsigned i = 0; i < 3; ++i) {
-      Mapping::Cases::iterator CaseIt = TheCases.begin();
-      std::advance(CaseIt, i);  
-      EXPECT_EQ(CaseIt->first, Successors + i);
-      EXPECT_EQ(CaseIt->second.getNumItems(), 1ULL);
-      EXPECT_EQ(CaseIt->second.getItem(0), Range(Int(i * 10), Int(i * 10 + 9)));
-    }
-  }
-  
-  typedef unsigned unsigned_pair[2];
-  typedef unsigned_pair unsigned_ranges[];
-  
-  void TestDiff(
-      const unsigned_ranges LHS,
-      unsigned LSize,
-      const unsigned_ranges RHS,
-      unsigned RSize,
-      const unsigned_ranges ExcludeRes,
-      unsigned ExcludeResSize,
-      const unsigned_ranges IntersectRes,
-      unsigned IntersectResSize
-      ) {
-    
-    Mapping::RangesCollection Ranges;
-    
-    Mapping LHSMapping;
-    for (unsigned i = 0; i < LSize; ++i)
-      Ranges.push_back(Range(Int(LHS[i][0]), Int(LHS[i][1])));
-    LHSMapping.add(Ranges);
-    
-    Ranges.clear();
-
-    Mapping RHSMapping;
-    for (unsigned i = 0; i < RSize; ++i)
-      Ranges.push_back(Range(Int(RHS[i][0]), Int(RHS[i][1])));
-    RHSMapping.add(Ranges);
-    
-    Mapping LExclude, Intersection;
-    
-    LHSMapping.diff(&LExclude, &Intersection, 0, RHSMapping);
-    
-    if (ExcludeResSize) {
-      EXPECT_EQ(LExclude.size(), ExcludeResSize);
-      
-      unsigned i = 0;
-      for (Mapping::RangeIterator rei = LExclude.begin(),
-           e = LExclude.end(); rei != e; ++rei, ++i)
-        EXPECT_EQ(rei->first, Range(ExcludeRes[i][0], ExcludeRes[i][1]));
-    } else
-      EXPECT_TRUE(LExclude.empty());
-    
-    if (IntersectResSize) {
-      EXPECT_EQ(Intersection.size(), IntersectResSize);
-      
-      unsigned i = 0;
-      for (Mapping::RangeIterator ii = Intersection.begin(),
-           e = Intersection.end(); ii != e; ++ii, ++i)
-        EXPECT_EQ(ii->first, Range(IntersectRes[i][0], IntersectRes[i][1]));
-    } else
-      EXPECT_TRUE(Intersection.empty());
-
-    LExclude.clear();
-    Intersection.clear();
-    RHSMapping.diff(0, &Intersection, &LExclude, LHSMapping);
-    
-    // Check LExclude again.
-    if (ExcludeResSize) {
-      EXPECT_EQ(LExclude.size(), ExcludeResSize);
-      
-      unsigned i = 0;
-      for (Mapping::RangeIterator rei = LExclude.begin(),
-           e = LExclude.end(); rei != e; ++rei, ++i)
-        EXPECT_EQ(rei->first, Range(ExcludeRes[i][0], ExcludeRes[i][1]));
-    } else
-      EXPECT_TRUE(LExclude.empty());    
-  }  
-  
-  TEST(IntegersSubsetTest, DiffTest) {
-    
-    static const unsigned NOT_A_NUMBER = 0xffff;
-    
-    {
-      unsigned_ranges LHS = { { 0, 4 }, { 7, 10 }, { 13, 17 } };
-      unsigned_ranges RHS = { { 3, 14 } };
-      unsigned_ranges ExcludeRes = { { 0, 2 }, { 15, 17 } };
-      unsigned_ranges IntersectRes = { { 3, 4 }, { 7, 10 }, { 13, 14 } };
-
-      TestDiff(LHS, 3, RHS, 1, ExcludeRes, 2, IntersectRes, 3);
-    }
-
-    {
-      unsigned_ranges LHS = { { 0, 4 }, { 7, 10 }, { 13, 17 } };
-      unsigned_ranges RHS = { { 0, 4 }, { 13, 17 } };
-      unsigned_ranges ExcludeRes = { { 7, 10 } };
-      unsigned_ranges IntersectRes = { { 0, 4 }, { 13, 17 } };
-
-      TestDiff(LHS, 3, RHS, 2, ExcludeRes, 1, IntersectRes, 2);
-    }
-
-    {
-      unsigned_ranges LHS = { { 0, 17 } };
-      unsigned_ranges RHS = { { 1, 5 }, { 10, 12 }, { 15, 16 } };
-      unsigned_ranges ExcludeRes =
-          { { 0, 0 }, { 6, 9 }, { 13, 14 }, { 17, 17 } };
-      unsigned_ranges IntersectRes = { { 1, 5 }, { 10, 12 }, { 15, 16 } };
-
-      TestDiff(LHS, 1, RHS, 3, ExcludeRes, 4, IntersectRes, 3);
-    }
-
-    {
-      unsigned_ranges LHS = { { 2, 4 } };
-      unsigned_ranges RHS = { { 0, 5 } };
-      unsigned_ranges ExcludeRes = { {NOT_A_NUMBER, NOT_A_NUMBER} };
-      unsigned_ranges IntersectRes = { { 2, 4 } };
-
-      TestDiff(LHS, 1, RHS, 1, ExcludeRes, 0, IntersectRes, 1);
-    }
-
-    {
-      unsigned_ranges LHS = { { 2, 4 } };
-      unsigned_ranges RHS = { { 7, 8 } };
-      unsigned_ranges ExcludeRes = { { 2, 4 } };
-      unsigned_ranges IntersectRes = { {NOT_A_NUMBER, NOT_A_NUMBER} };
-
-      TestDiff(LHS, 1, RHS, 1, ExcludeRes, 1, IntersectRes, 0);
-    }
-
-    {
-      unsigned_ranges LHS = { { 3, 7 } };
-      unsigned_ranges RHS = { { 1, 4 } };
-      unsigned_ranges ExcludeRes = { { 5, 7 } };
-      unsigned_ranges IntersectRes = { { 3, 4 } };
-
-      TestDiff(LHS, 1, RHS, 1, ExcludeRes, 1, IntersectRes, 1);
-    }
-    
-    {
-      unsigned_ranges LHS = { { 0, 7 } };
-      unsigned_ranges RHS = { { 0, 5 }, { 6, 9 } };
-      unsigned_ranges ExcludeRes = { {NOT_A_NUMBER, NOT_A_NUMBER} };
-      unsigned_ranges IntersectRes = { { 0, 5 }, {6, 7} };
-
-      TestDiff(LHS, 1, RHS, 2, ExcludeRes, 0, IntersectRes, 2);
-    }
-    
-    {
-      unsigned_ranges LHS = { { 17, 17 } };
-      unsigned_ranges RHS = { { 4, 4 } };
-      unsigned_ranges ExcludeRes = { {17, 17} };
-      unsigned_ranges IntersectRes = { { NOT_A_NUMBER, NOT_A_NUMBER } };
-
-      TestDiff(LHS, 1, RHS, 1, ExcludeRes, 1, IntersectRes, 0);
-    }     
-  }   
-}
diff --git a/unittests/Support/LocaleTest.cpp b/unittests/Support/LocaleTest.cpp
deleted file mode 100644
index 3524b4b..0000000
--- a/unittests/Support/LocaleTest.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-//===- unittests/Support/LocaleTest.cpp - Locale.h tests ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/Locale.h"
-#include "gtest/gtest.h"
-
-namespace llvm {
-namespace sys {
-namespace locale {
-namespace {
-
-// FIXME: WIN32 implementation is incorrect. We should consider using the one
-// from LocaleGeneric.inc for WIN32.
-#ifndef _WIN32
-TEST(Locale, columnWidth) {
-  // FIXME: This test fails with MacOSX implementation of columnWidth.
-#ifndef __APPLE__
-  EXPECT_EQ(0, columnWidth(""));
-  EXPECT_EQ(1, columnWidth(" "));
-  EXPECT_EQ(1, columnWidth("a"));
-  EXPECT_EQ(1, columnWidth("~"));
-
-  EXPECT_EQ(6, columnWidth("abcdef"));
-
-  EXPECT_EQ(-1, columnWidth("\x01"));
-  EXPECT_EQ(-1, columnWidth("aaaaaaaaaa\x01"));
-  EXPECT_EQ(-1, columnWidth("\342\200\213")); // 200B ZERO WIDTH SPACE
-
-  EXPECT_EQ(0, columnWidth("\314\200")); // 0300 COMBINING GRAVE ACCENT
-  EXPECT_EQ(1, columnWidth("\340\270\201")); // 0E01 THAI CHARACTER KO KAI
-  EXPECT_EQ(2, columnWidth("\344\270\200")); // CJK UNIFIED IDEOGRAPH-4E00
-
-  EXPECT_EQ(4, columnWidth("\344\270\200\344\270\200"));
-  EXPECT_EQ(3, columnWidth("q\344\270\200"));
-  EXPECT_EQ(3, columnWidth("\314\200\340\270\201\344\270\200"));
-
-  // Invalid UTF-8 strings, columnWidth should error out.
-  EXPECT_EQ(-2, columnWidth("\344"));
-  EXPECT_EQ(-2, columnWidth("\344\270"));
-  EXPECT_EQ(-2, columnWidth("\344\270\033"));
-  EXPECT_EQ(-2, columnWidth("\344\270\300"));
-  EXPECT_EQ(-2, columnWidth("\377\366\355"));
-
-  EXPECT_EQ(-2, columnWidth("qwer\344"));
-  EXPECT_EQ(-2, columnWidth("qwer\344\270"));
-  EXPECT_EQ(-2, columnWidth("qwer\344\270\033"));
-  EXPECT_EQ(-2, columnWidth("qwer\344\270\300"));
-  EXPECT_EQ(-2, columnWidth("qwer\377\366\355"));
-
-  // UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode
-  // characters.
-  EXPECT_EQ(-2, columnWidth("\370\200\200\200\200"));     // U+200000
-  EXPECT_EQ(-2, columnWidth("\374\200\200\200\200\200")); // U+4000000
-#endif // __APPLE__
-}
-
-TEST(Locale, isPrint) {
-  EXPECT_EQ(false, isPrint(0)); // <control-0000>-<control-001F>
-  EXPECT_EQ(false, isPrint(0x01));
-  EXPECT_EQ(false, isPrint(0x1F));
-  EXPECT_EQ(true, isPrint(' '));
-  EXPECT_EQ(true, isPrint('A'));
-  EXPECT_EQ(true, isPrint('~'));
-  EXPECT_EQ(false, isPrint(0x7F)); // <control-007F>..<control-009F>
-  EXPECT_EQ(false, isPrint(0x90));
-  EXPECT_EQ(false, isPrint(0x9F));
-
-  EXPECT_EQ(true, isPrint(0xAC));
-  // FIXME: Figure out if we want to treat SOFT HYPHEN as printable character.
-#ifndef __APPLE__
-  EXPECT_EQ(false, isPrint(0xAD)); // SOFT HYPHEN
-#endif // __APPLE__
-  EXPECT_EQ(true, isPrint(0xAE));
-
-  // MacOS implementation doesn't think it's printable.
-#ifndef __APPLE__
-  EXPECT_EQ(true, isPrint(0x0377)); // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
-#endif // __APPLE__
-  EXPECT_EQ(false, isPrint(0x0378)); // <reserved-0378>..<reserved-0379>
-
-  EXPECT_EQ(false, isPrint(0x0600)); // ARABIC NUMBER SIGN
-
-  EXPECT_EQ(false, isPrint(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF>
-  EXPECT_EQ(true, isPrint(0x20000)); // CJK UNIFIED IDEOGRAPH-20000
-
-  EXPECT_EQ(false, isPrint(0x10FFFF)); // noncharacter
-}
-
-#endif // _WIN32
-
-} // namespace
-} // namespace locale
-} // namespace sys
-} // namespace llvm
diff --git a/unittests/Support/MemoryBufferTest.cpp b/unittests/Support/MemoryBufferTest.cpp
index de1dbb7..2b8806c 100644
--- a/unittests/Support/MemoryBufferTest.cpp
+++ b/unittests/Support/MemoryBufferTest.cpp
@@ -65,6 +65,28 @@ TEST_F(MemoryBufferTest, get) {
   EXPECT_EQ("this is some data", data);
 }
 
+TEST_F(MemoryBufferTest, NullTerminator4K) {
+  // Test that a file with size that is a multiple of the page size can be null
+  // terminated correctly by MemoryBuffer.
+  int TestFD;
+  SmallString<64> TestPath;
+  sys::fs::createTemporaryFile("MemoryBufferTest_NullTerminator4K", "temp",
+                               TestFD, TestPath);
+  raw_fd_ostream OF(TestFD, true, /*unbuffered=*/true);
+  for (unsigned i = 0; i < 4096 / 16; ++i) {
+    OF << "0123456789abcdef";
+  }
+  OF.close();
+
+  OwningPtr<MemoryBuffer> MB;
+  error_code EC = MemoryBuffer::getFile(TestPath.c_str(), MB);
+  ASSERT_FALSE(EC);
+
+  const char *BufData = MB->getBufferStart();
+  EXPECT_EQ('f', BufData[4095]);
+  EXPECT_EQ('\0', BufData[4096]);
+}
+
 TEST_F(MemoryBufferTest, copy) {
   // copy with no name
   OwningBuffer MBC1(MemoryBuffer::getMemBufferCopy(data));
@@ -112,7 +134,7 @@ void MemoryBufferTest::testGetOpenFileSlice(bool Reopen) {
   SmallString<64> TestPath;
   // Create a temporary file and write data into it.
   sys::fs::createTemporaryFile("prefix", "temp", TestFD, TestPath);
-  // OF is responsible for closing the file; If the file is not 
+  // OF is responsible for closing the file; If the file is not
   // reopened, it will be unbuffered so that the results are
   // immediately visible through the fd.
   raw_fd_ostream OF(TestFD, true, !Reopen);
@@ -128,7 +150,7 @@ void MemoryBufferTest::testGetOpenFileSlice(bool Reopen) {
   OwningBuffer Buf;
   error_code EC = MemoryBuffer::getOpenFileSlice(TestFD, TestPath.c_str(), Buf,
                                                  40000, // Size
-                                                 8000   // Offset
+                                                 80000  // Offset
                                                  );
   EXPECT_FALSE(EC);
 
diff --git a/unittests/Support/Path.cpp b/unittests/Support/Path.cpp
index 6f5992b..0316241 100644
--- a/unittests/Support/Path.cpp
+++ b/unittests/Support/Path.cpp
@@ -141,6 +141,75 @@ TEST(Support, Path) {
   }
 }
 
+TEST(Support, RelativePathIterator) {
+  SmallString<64> Path(StringRef("c/d/e/foo.txt"));
+  typedef SmallVector<StringRef, 4> PathComponents;
+  PathComponents ExpectedPathComponents;
+  PathComponents ActualPathComponents;
+
+  StringRef(Path).split(ExpectedPathComponents, "/");
+
+  for (path::const_iterator I = path::begin(Path), E = path::end(Path); I != E;
+       ++I) {
+    ActualPathComponents.push_back(*I);
+  }
+
+  ASSERT_EQ(ExpectedPathComponents.size(), ActualPathComponents.size());
+
+  for (size_t i = 0; i <ExpectedPathComponents.size(); ++i) {
+    EXPECT_EQ(ExpectedPathComponents[i].str(), ActualPathComponents[i].str());
+  }
+}
+
+TEST(Support, AbsolutePathIterator) {
+  SmallString<64> Path(StringRef("/c/d/e/foo.txt"));
+  typedef SmallVector<StringRef, 4> PathComponents;
+  PathComponents ExpectedPathComponents;
+  PathComponents ActualPathComponents;
+
+  StringRef(Path).split(ExpectedPathComponents, "/");
+
+  // The root path will also be a component when iterating
+  ExpectedPathComponents[0] = "/";
+
+  for (path::const_iterator I = path::begin(Path), E = path::end(Path); I != E;
+       ++I) {
+    ActualPathComponents.push_back(*I);
+  }
+
+  ASSERT_EQ(ExpectedPathComponents.size(), ActualPathComponents.size());
+
+  for (size_t i = 0; i <ExpectedPathComponents.size(); ++i) {
+    EXPECT_EQ(ExpectedPathComponents[i].str(), ActualPathComponents[i].str());
+  }
+}
+
+#ifdef LLVM_ON_WIN32
+TEST(Support, AbsolutePathIteratorWin32) {
+  SmallString<64> Path(StringRef("c:\\c\\e\\foo.txt"));
+  typedef SmallVector<StringRef, 4> PathComponents;
+  PathComponents ExpectedPathComponents;
+  PathComponents ActualPathComponents;
+
+  StringRef(Path).split(ExpectedPathComponents, "\\");
+
+  // The root path (which comes after the drive name) will also be a component
+  // when iterating.
+  ExpectedPathComponents.insert(ExpectedPathComponents.begin()+1, "\\");
+
+  for (path::const_iterator I = path::begin(Path), E = path::end(Path); I != E;
+       ++I) {
+    ActualPathComponents.push_back(*I);
+  }
+
+  ASSERT_EQ(ExpectedPathComponents.size(), ActualPathComponents.size());
+
+  for (size_t i = 0; i <ExpectedPathComponents.size(); ++i) {
+    EXPECT_EQ(ExpectedPathComponents[i].str(), ActualPathComponents[i].str());
+  }
+}
+#endif // LLVM_ON_WIN32
+
 class FileSystemTest : public testing::Test {
 protected:
   /// Unique temporary directory in which all created filesystem entities must
@@ -347,7 +416,25 @@ TEST_F(FileSystemTest, DirectoryIteration) {
   ASSERT_LT(z0, za1);
 }
 
-const char elf[] = {0x7f, 'E', 'L', 'F', 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+const char archive[] = "!<arch>\x0A";
+const char bitcode[] = "\xde\xc0\x17\x0b";
+const char coff_object[] = "\x00\x00......";
+const char coff_import_library[] = "\x00\x00\xff\xff....";
+const char elf_relocatable[] = { 0x7f, 'E', 'L', 'F', 1, 2, 1, 0, 0,
+                                 0,    0,   0,   0,   0, 0, 0, 0, 1 };
+const char macho_universal_binary[] = "\xca\xfe\xba\xbe...\0x00";
+const char macho_object[] = "\xfe\xed\xfa\xce..........\x00\x01";
+const char macho_executable[] = "\xfe\xed\xfa\xce..........\x00\x02";
+const char macho_fixed_virtual_memory_shared_lib[] =
+    "\xfe\xed\xfa\xce..........\x00\x03";
+const char macho_core[] = "\xfe\xed\xfa\xce..........\x00\x04";
+const char macho_preload_executable[] = "\xfe\xed\xfa\xce..........\x00\x05";
+const char macho_dynamically_linked_shared_lib[] =
+    "\xfe\xed\xfa\xce..........\x00\x06";
+const char macho_dynamic_linker[] = "\xfe\xed\xfa\xce..........\x00\x07";
+const char macho_bundle[] = "\xfe\xed\xfa\xce..........\x00\x08";
+const char macho_dsym_companion[] = "\xfe\xed\xfa\xce..........\x00\x0a";
+const char windows_resource[] = "\x00\x00\x00\x00\x020\x00\x00\x00\xff";
 
 TEST_F(FileSystemTest, Magic) {
   struct type {
@@ -355,11 +442,27 @@ TEST_F(FileSystemTest, Magic) {
     const char *magic_str;
     size_t magic_str_len;
     fs::file_magic magic;
-  } types [] = {
-    {"magic.archive", "!<arch>\x0A", 8, fs::file_magic::archive},
-    {"magic.elf", elf, sizeof(elf),
-     fs::file_magic::elf_relocatable}
-  };
+  } types[] = {
+#define DEFINE(magic)                                           \
+    { #magic, magic, sizeof(magic), fs::file_magic::magic }
+    DEFINE(archive),
+    DEFINE(bitcode),
+    DEFINE(coff_object),
+    DEFINE(coff_import_library),
+    DEFINE(elf_relocatable),
+    DEFINE(macho_universal_binary),
+    DEFINE(macho_object),
+    DEFINE(macho_executable),
+    DEFINE(macho_fixed_virtual_memory_shared_lib),
+    DEFINE(macho_core),
+    DEFINE(macho_preload_executable),
+    DEFINE(macho_dynamically_linked_shared_lib),
+    DEFINE(macho_dynamic_linker),
+    DEFINE(macho_bundle),
+    DEFINE(macho_dsym_companion),
+    DEFINE(windows_resource)
+#undef DEFINE
+    };
 
   // Create some files filled with magic.
   for (type *i = types, *e = types + (sizeof(types) / sizeof(type)); i != e;
@@ -392,7 +495,7 @@ TEST_F(FileSystemTest, CarriageReturn) {
   }
   {
     OwningPtr<MemoryBuffer> Buf;
-    MemoryBuffer::getFile(FilePathname, Buf);
+    MemoryBuffer::getFile(FilePathname.c_str(), Buf);
     EXPECT_EQ(Buf->getBuffer(), "\r\n");
   }
 
@@ -403,7 +506,7 @@ TEST_F(FileSystemTest, CarriageReturn) {
   }
   {
     OwningPtr<MemoryBuffer> Buf;
-    MemoryBuffer::getFile(FilePathname, Buf);
+    MemoryBuffer::getFile(FilePathname.c_str(), Buf);
     EXPECT_EQ(Buf->getBuffer(), "\n");
   }
 }
@@ -431,7 +534,7 @@ TEST_F(FileSystemTest, FileMapping) {
     mfr.data()[Val.size()] = 0;
     // Unmap temp file
   }
-  
+
   // Map it back in read-only
   fs::mapped_file_region mfr(Twine(TempPath),
                              fs::mapped_file_region::readonly,
@@ -439,10 +542,10 @@ TEST_F(FileSystemTest, FileMapping) {
                              0,
                              EC);
   ASSERT_NO_ERROR(EC);
-  
+
   // Verify content
   EXPECT_EQ(StringRef(mfr.const_data()), Val);
-  
+
   // Unmap temp file
 
 #if LLVM_HAS_RVALUE_REFERENCES
diff --git a/unittests/Support/ProcessTest.cpp b/unittests/Support/ProcessTest.cpp
index eff9c71..27c2318 100644
--- a/unittests/Support/ProcessTest.cpp
+++ b/unittests/Support/ProcessTest.cpp
@@ -39,4 +39,32 @@ TEST(ProcessTest, SelfProcess) {
   EXPECT_GT(TimeValue::MaxTime, process::get_self()->get_wall_time());
 }
 
+#ifdef _MSC_VER
+#define setenv(name, var, ignore) _putenv_s(name, var)
+#endif
+
+#if HAVE_SETENV || _MSC_VER
+TEST(ProcessTest, Basic) {
+  setenv("__LLVM_TEST_ENVIRON_VAR__", "abc", true);
+  Optional<std::string> val(Process::GetEnv("__LLVM_TEST_ENVIRON_VAR__"));
+  EXPECT_TRUE(val.hasValue());
+  EXPECT_STREQ("abc", val->c_str());
+}
+
+TEST(ProcessTest, None) {
+  Optional<std::string> val(
+      Process::GetEnv("__LLVM_TEST_ENVIRON_NO_SUCH_VAR__"));
+  EXPECT_FALSE(val.hasValue());
+}
+#endif
+
+#ifdef LLVM_ON_WIN32
+TEST(ProcessTest, Wchar) {
+  SetEnvironmentVariableW(L"__LLVM_TEST_ENVIRON_VAR__", L"abcdefghijklmnopqrs");
+  Optional<std::string> val(Process::GetEnv("__LLVM_TEST_ENVIRON_VAR__"));
+  EXPECT_TRUE(val.hasValue());
+  EXPECT_STREQ("abcdefghijklmnopqrs", val->c_str());
+}
+#endif
+
 } // end anonymous namespace
diff --git a/unittests/Support/ProgramTest.cpp b/unittests/Support/ProgramTest.cpp
index 6852ca6..6eb990f 100644
--- a/unittests/Support/ProgramTest.cpp
+++ b/unittests/Support/ProgramTest.cpp
@@ -21,6 +21,20 @@
 extern char **environ;
 #endif
 
+#if defined(LLVM_ON_UNIX)
+#include <unistd.h>
+void sleep_for(unsigned int seconds) {
+  sleep(seconds);
+}
+#elif defined(LLVM_ON_WIN32)
+#include <windows.h>
+void sleep_for(unsigned int seconds) {
+  Sleep(seconds * 1000);
+}
+#else
+#error sleep_for is not implemented on your platform.
+#endif
+
 // From TestMain.cpp.
 extern const char *TestMainArgv0;
 
@@ -60,7 +74,7 @@ TEST(ProgramTest, CreateProcessTrailingSlash) {
       sys::fs::getMainExecutable(TestMainArgv0, &ProgramTestStringArg1);
   const char *argv[] = {
     my_exe.c_str(),
-    "--gtest_filter=ProgramTest.CreateProcessTrailingSlashChild",
+    "--gtest_filter=ProgramTest.CreateProcessTrailingSlash",
     "-program-test-string-arg1", "has\\\\ trailing\\",
     "-program-test-string-arg2", "has\\\\ trailing\\",
     0
@@ -88,4 +102,93 @@ TEST(ProgramTest, CreateProcessTrailingSlash) {
   EXPECT_EQ(0, rc);
 }
 
+TEST(ProgramTest, TestExecuteNoWait) {
+  using namespace llvm::sys;
+
+  if (getenv("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT")) {
+    sleep_for(/*seconds*/ 1);
+    exit(0);
+  }
+
+  std::string Executable =
+      sys::fs::getMainExecutable(TestMainArgv0, &ProgramTestStringArg1);
+  const char *argv[] = {
+    Executable.c_str(),
+    "--gtest_filter=ProgramTest.TestExecuteNoWait",
+    0
+  };
+
+  // Add LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT to the environment of the child.
+  std::vector<const char *> envp;
+  CopyEnvironment(envp);
+  envp.push_back("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT=1");
+  envp.push_back(0);
+
+  std::string Error;
+  bool ExecutionFailed;
+  ProcessInfo PI1 =
+      ExecuteNoWait(Executable, argv, &envp[0], 0, 0, &Error, &ExecutionFailed);
+  ASSERT_FALSE(ExecutionFailed) << Error;
+  ASSERT_NE(PI1.Pid, 0) << "Invalid process id";
+
+  unsigned LoopCount = 0;
+
+  // Test that Wait() with WaitUntilTerminates=true works. In this case,
+  // LoopCount should only be incremented once.
+  while (true) {
+    ++LoopCount;
+    ProcessInfo WaitResult = Wait(PI1, 0, true, &Error);
+    ASSERT_TRUE(Error.empty());
+    if (WaitResult.Pid == PI1.Pid)
+      break;
+  }
+
+  EXPECT_EQ(LoopCount, 1u) << "LoopCount should be 1";
+
+  ProcessInfo PI2 =
+      ExecuteNoWait(Executable, argv, &envp[0], 0, 0, &Error, &ExecutionFailed);
+  ASSERT_FALSE(ExecutionFailed) << Error;
+  ASSERT_NE(PI2.Pid, 0) << "Invalid process id";
+
+  // Test that Wait() with SecondsToWait=0 performs a non-blocking wait. In this
+  // cse, LoopCount should be greater than 1 (more than one increment occurs).
+  while (true) {
+    ++LoopCount;
+    ProcessInfo WaitResult = Wait(PI2, 0, false, &Error);
+    ASSERT_TRUE(Error.empty());
+    if (WaitResult.Pid == PI2.Pid)
+      break;
+  }
+
+  ASSERT_GT(LoopCount, 1u) << "LoopCount should be >1";
+}
+
+TEST(ProgramTest, TestExecuteNegative) {
+  std::string Executable = "i_dont_exist";
+  const char *argv[] = { Executable.c_str(), 0 };
+
+  {
+    std::string Error;
+    bool ExecutionFailed;
+    int RetCode =
+        ExecuteAndWait(Executable, argv, 0, 0, 0, 0, &Error, &ExecutionFailed);
+    ASSERT_TRUE(RetCode < 0) << "On error ExecuteAndWait should return 0 or "
+                                "positive value indicating the result code";
+    ASSERT_TRUE(ExecutionFailed);
+    ASSERT_FALSE(Error.empty());
+  }
+
+  {
+    std::string Error;
+    bool ExecutionFailed;
+    ProcessInfo PI =
+        ExecuteNoWait(Executable, argv, 0, 0, 0, &Error, &ExecutionFailed);
+    ASSERT_EQ(PI.Pid, 0)
+        << "On error ExecuteNoWait should return an invalid ProcessInfo";
+    ASSERT_TRUE(ExecutionFailed);
+    ASSERT_FALSE(Error.empty());
+  }
+
+}
+
 } // end anonymous namespace
diff --git a/unittests/Support/RegexTest.cpp b/unittests/Support/RegexTest.cpp
index 02869b3..7b977f7 100644
--- a/unittests/Support/RegexTest.cpp
+++ b/unittests/Support/RegexTest.cpp
@@ -127,4 +127,12 @@ TEST_F(RegexTest, IsLiteralERE) {
   EXPECT_FALSE(Regex::isLiteralERE("abc{1,2}"));
 }
 
+TEST_F(RegexTest, IsValid) {
+  std::string Error;
+  EXPECT_FALSE(Regex("(foo").isValid(Error));
+  EXPECT_EQ("parentheses not balanced", Error);
+  EXPECT_FALSE(Regex("a[b-").isValid(Error));
+  EXPECT_EQ("invalid character range", Error);
+}
+
 }
diff --git a/unittests/Support/SourceMgrTest.cpp b/unittests/Support/SourceMgrTest.cpp
new file mode 100644
index 0000000..2b69fe9
--- /dev/null
+++ b/unittests/Support/SourceMgrTest.cpp
@@ -0,0 +1,174 @@
+//===- unittests/Support/SourceMgrTest.cpp - SourceMgr tests --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+class SourceMgrTest : public testing::Test {
+public:
+  SourceMgr SM;
+  unsigned MainBufferID;
+  std::string Output;
+
+  void setMainBuffer(StringRef Text, StringRef BufferName) {
+    MemoryBuffer *MainBuffer = MemoryBuffer::getMemBuffer(Text, BufferName);
+    MainBufferID = SM.AddNewSourceBuffer(MainBuffer, llvm::SMLoc());
+  }
+
+  SMLoc getLoc(unsigned Offset) {
+    return SMLoc::getFromPointer(
+        SM.getMemoryBuffer(MainBufferID)->getBufferStart() + Offset);
+  }
+
+  SMRange getRange(unsigned Offset, unsigned Length) {
+    return SMRange(getLoc(Offset), getLoc(Offset + Length));
+  }
+
+  void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+                    const Twine &Msg, ArrayRef<SMRange> Ranges,
+                    ArrayRef<SMFixIt> FixIts) {
+    raw_string_ostream OS(Output);
+    SM.PrintMessage(OS, Loc, Kind, Msg, Ranges, FixIts);
+  }
+};
+
+} // unnamed namespace
+
+TEST_F(SourceMgrTest, BasicError) {
+  setMainBuffer("aaa bbb\nccc ddd\n", "file.in");
+  printMessage(getLoc(4), SourceMgr::DK_Error, "message", None, None);
+
+  EXPECT_EQ("file.in:1:5: error: message\n"
+            "aaa bbb\n"
+            "    ^\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, BasicWarning) {
+  setMainBuffer("aaa bbb\nccc ddd\n", "file.in");
+  printMessage(getLoc(4), SourceMgr::DK_Warning, "message", None, None);
+
+  EXPECT_EQ("file.in:1:5: warning: message\n"
+            "aaa bbb\n"
+            "    ^\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, BasicNote) {
+  setMainBuffer("aaa bbb\nccc ddd\n", "file.in");
+  printMessage(getLoc(4), SourceMgr::DK_Note, "message", None, None);
+
+  EXPECT_EQ("file.in:1:5: note: message\n"
+            "aaa bbb\n"
+            "    ^\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, LocationAtEndOfLine) {
+  setMainBuffer("aaa bbb\nccc ddd\n", "file.in");
+  printMessage(getLoc(6), SourceMgr::DK_Error, "message", None, None);
+
+  EXPECT_EQ("file.in:1:7: error: message\n"
+            "aaa bbb\n"
+            "      ^\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, LocationAtNewline) {
+  setMainBuffer("aaa bbb\nccc ddd\n", "file.in");
+  printMessage(getLoc(7), SourceMgr::DK_Error, "message", None, None);
+
+  EXPECT_EQ("file.in:1:8: error: message\n"
+            "aaa bbb\n"
+            "       ^\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, BasicRange) {
+  setMainBuffer("aaa bbb\nccc ddd\n", "file.in");
+  printMessage(getLoc(4), SourceMgr::DK_Error, "message", getRange(4, 3), None);
+
+  EXPECT_EQ("file.in:1:5: error: message\n"
+            "aaa bbb\n"
+            "    ^~~\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, RangeWithTab) {
+  setMainBuffer("aaa\tbbb\nccc ddd\n", "file.in");
+  printMessage(getLoc(4), SourceMgr::DK_Error, "message", getRange(3, 3), None);
+
+  EXPECT_EQ("file.in:1:5: error: message\n"
+            "aaa     bbb\n"
+            "   ~~~~~^~\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, MultiLineRange) {
+  setMainBuffer("aaa bbb\nccc ddd\n", "file.in");
+  printMessage(getLoc(4), SourceMgr::DK_Error, "message", getRange(4, 7), None);
+
+  EXPECT_EQ("file.in:1:5: error: message\n"
+            "aaa bbb\n"
+            "    ^~~\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, MultipleRanges) {
+  setMainBuffer("aaa bbb\nccc ddd\n", "file.in");
+  SMRange Ranges[] = { getRange(0, 3), getRange(4, 3) };
+  printMessage(getLoc(4), SourceMgr::DK_Error, "message", Ranges, None);
+
+  EXPECT_EQ("file.in:1:5: error: message\n"
+            "aaa bbb\n"
+            "~~~ ^~~\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, OverlappingRanges) {
+  setMainBuffer("aaa bbb\nccc ddd\n", "file.in");
+  SMRange Ranges[] = { getRange(0, 3), getRange(2, 4) };
+  printMessage(getLoc(4), SourceMgr::DK_Error, "message", Ranges, None);
+
+  EXPECT_EQ("file.in:1:5: error: message\n"
+            "aaa bbb\n"
+            "~~~~^~\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, BasicFixit) {
+  setMainBuffer("aaa bbb\nccc ddd\n", "file.in");
+  printMessage(getLoc(4), SourceMgr::DK_Error, "message", None,
+               makeArrayRef(SMFixIt(getRange(4, 3), "zzz")));
+
+  EXPECT_EQ("file.in:1:5: error: message\n"
+            "aaa bbb\n"
+            "    ^~~\n"
+            "    zzz\n",
+            Output);
+}
+
+TEST_F(SourceMgrTest, FixitForTab) {
+  setMainBuffer("aaa\tbbb\nccc ddd\n", "file.in");
+  printMessage(getLoc(3), SourceMgr::DK_Error, "message", None,
+               makeArrayRef(SMFixIt(getRange(3, 1), "zzz")));
+
+  EXPECT_EQ("file.in:1:4: error: message\n"
+            "aaa     bbb\n"
+            "   ^^^^^\n"
+            "   zzz\n",
+            Output);
+}
+
diff --git a/unittests/Support/ThreadLocalTest.cpp b/unittests/Support/ThreadLocalTest.cpp
new file mode 100644
index 0000000..dd4d706
--- /dev/null
+++ b/unittests/Support/ThreadLocalTest.cpp
@@ -0,0 +1,38 @@
+//===- llvm/unittest/Support/ThreadLocalTest.cpp - Therad Local tests   ---===//
+//
+//		       The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ThreadLocal.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace sys;
+
+namespace {
+
+class ThreadLocalTest : public ::testing::Test {
+};
+
+struct S {
+  int i;
+};
+
+TEST_F(ThreadLocalTest, Basics) {
+  ThreadLocal<const S> x;
+
+  EXPECT_EQ(0, x.get());
+
+  S s;
+  x.set(&s);
+  EXPECT_EQ(&s, x.get());
+
+  x.erase();
+  EXPECT_EQ(0, x.get());
+}
+
+}
diff --git a/unittests/Support/UnicodeTest.cpp b/unittests/Support/UnicodeTest.cpp
new file mode 100644
index 0000000..0733397
--- /dev/null
+++ b/unittests/Support/UnicodeTest.cpp
@@ -0,0 +1,93 @@
+//===- unittests/Support/UnicodeTest.cpp - Unicode.h tests ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Unicode.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace sys {
+namespace unicode {
+namespace {
+
+TEST(Unicode, columnWidthUTF8) {
+  EXPECT_EQ(0, columnWidthUTF8(""));
+  EXPECT_EQ(1, columnWidthUTF8(" "));
+  EXPECT_EQ(1, columnWidthUTF8("a"));
+  EXPECT_EQ(1, columnWidthUTF8("~"));
+
+  EXPECT_EQ(6, columnWidthUTF8("abcdef"));
+
+  EXPECT_EQ(-1, columnWidthUTF8("\x01"));
+  EXPECT_EQ(-1, columnWidthUTF8("aaaaaaaaaa\x01"));
+  EXPECT_EQ(-1, columnWidthUTF8("\342\200\213")); // 200B ZERO WIDTH SPACE
+
+  // 00AD SOFT HYPHEN is displayed on most terminals as a space or a dash. Some
+  // text editors display it only when a line is broken at it, some use it as a
+  // line-break hint, but don't display. We choose terminal-oriented
+  // interpretation.
+  EXPECT_EQ(1, columnWidthUTF8("\302\255"));
+
+  EXPECT_EQ(0, columnWidthUTF8("\314\200"));     // 0300 COMBINING GRAVE ACCENT
+  EXPECT_EQ(1, columnWidthUTF8("\340\270\201")); // 0E01 THAI CHARACTER KO KAI
+  EXPECT_EQ(2, columnWidthUTF8("\344\270\200")); // CJK UNIFIED IDEOGRAPH-4E00
+
+  EXPECT_EQ(4, columnWidthUTF8("\344\270\200\344\270\200"));
+  EXPECT_EQ(3, columnWidthUTF8("q\344\270\200"));
+  EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200"));
+
+  // Invalid UTF-8 strings, columnWidthUTF8 should error out.
+  EXPECT_EQ(-2, columnWidthUTF8("\344"));
+  EXPECT_EQ(-2, columnWidthUTF8("\344\270"));
+  EXPECT_EQ(-2, columnWidthUTF8("\344\270\033"));
+  EXPECT_EQ(-2, columnWidthUTF8("\344\270\300"));
+  EXPECT_EQ(-2, columnWidthUTF8("\377\366\355"));
+
+  EXPECT_EQ(-2, columnWidthUTF8("qwer\344"));
+  EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270"));
+  EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\033"));
+  EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\300"));
+  EXPECT_EQ(-2, columnWidthUTF8("qwer\377\366\355"));
+
+  // UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode
+  // characters.
+  EXPECT_EQ(-2, columnWidthUTF8("\370\200\200\200\200"));     // U+200000
+  EXPECT_EQ(-2, columnWidthUTF8("\374\200\200\200\200\200")); // U+4000000
+}
+
+TEST(Unicode, isPrintable) {
+  EXPECT_FALSE(isPrintable(0)); // <control-0000>-<control-001F>
+  EXPECT_FALSE(isPrintable(0x01));
+  EXPECT_FALSE(isPrintable(0x1F));
+  EXPECT_TRUE(isPrintable(' '));
+  EXPECT_TRUE(isPrintable('A'));
+  EXPECT_TRUE(isPrintable('~'));
+  EXPECT_FALSE(isPrintable(0x7F)); // <control-007F>..<control-009F>
+  EXPECT_FALSE(isPrintable(0x90));
+  EXPECT_FALSE(isPrintable(0x9F));
+
+  EXPECT_TRUE(isPrintable(0xAC));
+  EXPECT_TRUE(isPrintable(0xAD)); // SOFT HYPHEN is displayed on most terminals
+                                  // as either a space or a dash.
+  EXPECT_TRUE(isPrintable(0xAE));
+
+  EXPECT_TRUE(isPrintable(0x0377));  // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
+  EXPECT_FALSE(isPrintable(0x0378)); // <reserved-0378>..<reserved-0379>
+
+  EXPECT_FALSE(isPrintable(0x0600)); // ARABIC NUMBER SIGN
+
+  EXPECT_FALSE(isPrintable(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF>
+  EXPECT_TRUE(isPrintable(0x20000));  // CJK UNIFIED IDEOGRAPH-20000
+
+  EXPECT_FALSE(isPrintable(0x10FFFF)); // noncharacter
+}
+
+} // namespace
+} // namespace unicode
+} // namespace sys
+} // namespace llvm
diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp
index 0993d8c..6c0b9e6 100644
--- a/unittests/Support/YAMLIOTest.cpp
+++ b/unittests/Support/YAMLIOTest.cpp
@@ -58,14 +58,24 @@ namespace yaml {
 //
 TEST(YAMLIO, TestMapRead) {
   FooBar doc;
-  Input yin("---\nfoo:  3\nbar:  5\n...\n");
-  yin >> doc;
+  {
+    Input yin("---\nfoo:  3\nbar:  5\n...\n");
+    yin >> doc;
 
-  EXPECT_FALSE(yin.error());
-  EXPECT_EQ(doc.foo, 3);
-  EXPECT_EQ(doc.bar,5);
-}
+    EXPECT_FALSE(yin.error());
+    EXPECT_EQ(doc.foo, 3);
+    EXPECT_EQ(doc.bar, 5);
+  }
 
+  {
+    Input yin("{foo: 3, bar: 5}");
+    yin >> doc;
+
+    EXPECT_FALSE(yin.error());
+    EXPECT_EQ(doc.foo, 3);
+    EXPECT_EQ(doc.bar, 5);
+  }
+}
 
 //
 // Test the reading of a yaml sequence of mappings
@@ -273,7 +283,64 @@ TEST(YAMLIO, TestReadWriteBuiltInTypes) {
   }
 }
 
+struct StringTypes {
+  llvm::StringRef str1;
+  llvm::StringRef str2;
+  llvm::StringRef str3;
+  llvm::StringRef str4;
+  llvm::StringRef str5;
+};
+
+namespace llvm {
+namespace yaml {
+  template <>
+  struct MappingTraits<StringTypes> {
+    static void mapping(IO &io, StringTypes& st) {
+      io.mapRequired("str1",      st.str1);
+      io.mapRequired("str2",      st.str2);
+      io.mapRequired("str3",      st.str3);
+      io.mapRequired("str4",      st.str4);
+      io.mapRequired("str5",      st.str5);
+    }
+  };
+}
+}
+
+TEST(YAMLIO, TestReadWriteStringTypes) {
+  std::string intermediate;
+  {
+    StringTypes map;
+    map.str1 = "'aaa";
+    map.str2 = "\"bbb";
+    map.str3 = "`ccc";
+    map.str4 = "@ddd";
+    map.str5 = "";
+
+    llvm::raw_string_ostream ostr(intermediate);
+    Output yout(ostr);
+    yout << map;
+  }
 
+  llvm::StringRef flowOut(intermediate);
+  EXPECT_NE(llvm::StringRef::npos, flowOut.find("'''aaa"));
+  EXPECT_NE(llvm::StringRef::npos, flowOut.find("'\"bbb'"));
+  EXPECT_NE(llvm::StringRef::npos, flowOut.find("'`ccc'"));
+  EXPECT_NE(llvm::StringRef::npos, flowOut.find("'@ddd'"));
+  EXPECT_NE(llvm::StringRef::npos, flowOut.find("''\n"));
+
+  {
+    Input yin(intermediate);
+    StringTypes map;
+    yin >> map;
+
+    EXPECT_FALSE(yin.error());
+    EXPECT_TRUE(map.str1.equals("'aaa"));
+    EXPECT_TRUE(map.str2.equals("\"bbb"));
+    EXPECT_TRUE(map.str3.equals("`ccc"));
+    EXPECT_TRUE(map.str4.equals("@ddd"));
+    EXPECT_TRUE(map.str5.equals(""));
+  }
+}
 
 //===----------------------------------------------------------------------===//
 //  Test ScalarEnumerationTraits
@@ -600,9 +667,9 @@ TEST(YAMLIO, TestReadWriteMyFlowSequence) {
     map.numbers.push_back(1024);
 
     llvm::raw_string_ostream ostr(intermediate);
-    Output yout(ostr); 
+    Output yout(ostr);
     yout << map;
-    
+
     // Verify sequences were written in flow style
     ostr.flush();
     llvm::StringRef flowOut(intermediate);
@@ -932,6 +999,161 @@ TEST(YAMLIO, TestSequenceDocListWriteAndRead) {
   }
 }
 
+//===----------------------------------------------------------------------===//
+//  Test document tags
+//===----------------------------------------------------------------------===//
+
+struct MyDouble {
+  MyDouble() : value(0.0) { }
+  MyDouble(double x) : value(x) { }
+  double value;
+};
+
+LLVM_YAML_IS_DOCUMENT_LIST_VECTOR(MyDouble)
+
+
+namespace llvm {
+namespace yaml {
+  template <>
+  struct MappingTraits<MyDouble> {
+    static void mapping(IO &io, MyDouble &d) {
+      if (io.mapTag("!decimal", true)) {
+        mappingDecimal(io, d);
+      } else if (io.mapTag("!fraction")) {
+        mappingFraction(io, d);
+      }
+    }
+    static void mappingDecimal(IO &io, MyDouble &d) {
+      io.mapRequired("value", d.value);
+    }
+    static void mappingFraction(IO &io, MyDouble &d) {
+        double num, denom;
+        io.mapRequired("numerator",      num);
+        io.mapRequired("denominator",    denom);
+        // convert fraction to double
+        d.value = num/denom;
+    }
+  };
+ }
+}
+
+
+//
+// Test the reading of two different tagged yaml documents.
+//
+TEST(YAMLIO, TestTaggedDocuments) {
+  std::vector<MyDouble> docList;
+  Input yin("--- !decimal\nvalue:  3.0\n"
+            "--- !fraction\nnumerator:  9.0\ndenominator:  2\n...\n");
+  yin >> docList;
+  EXPECT_FALSE(yin.error());
+  EXPECT_EQ(docList.size(), 2UL);
+  EXPECT_EQ(docList[0].value, 3.0);
+  EXPECT_EQ(docList[1].value, 4.5);
+}
+
+
+
+//
+// Test writing then reading back tagged documents
+//
+TEST(YAMLIO, TestTaggedDocumentsWriteAndRead) {
+  std::string intermediate;
+  {
+    MyDouble a(10.25);
+    MyDouble b(-3.75);
+    std::vector<MyDouble> docList;
+    docList.push_back(a);
+    docList.push_back(b);
+
+    llvm::raw_string_ostream ostr(intermediate);
+    Output yout(ostr);
+    yout << docList;
+  }
+
+  {
+    Input yin(intermediate);
+    std::vector<MyDouble> docList2;
+    yin >> docList2;
+
+    EXPECT_FALSE(yin.error());
+    EXPECT_EQ(docList2.size(), 2UL);
+    EXPECT_EQ(docList2[0].value, 10.25);
+    EXPECT_EQ(docList2[1].value, -3.75);
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Test dyn_cast<> on IO object 
+//===----------------------------------------------------------------------===//
+
+struct DynCast {
+  int value;
+};
+typedef std::vector<DynCast> DynCastSequence;
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(DynCast)
+
+namespace llvm {
+namespace yaml {
+  template <>
+  struct MappingTraits<DynCast> {
+    static void mapping(IO &io, DynCast& info) {
+      // Change 10 to 13 when writing yaml.
+      if (Output *output = dyn_cast<Output>(&io)) {
+        (void)output;
+        if (info.value == 10)
+          info.value = 13;
+      }
+      io.mapRequired("value", info.value);
+      // Change 20 to 23 when parsing yaml.
+      if (Input *input = dyn_cast<Input>(&io)) {
+        (void)input;
+        if (info.value == 20)
+          info.value = 23;
+      }
+    }
+  };
+}
+}
+
+//
+// Test writing then reading back a sequence of mappings
+//
+TEST(YAMLIO, TestDynCast) {
+  std::string intermediate;
+  {
+    DynCast entry1;
+    entry1.value = 10;
+    DynCast entry2;
+    entry2.value = 20;
+    DynCast entry3;
+    entry3.value = 30;
+    DynCastSequence seq;
+    seq.push_back(entry1);
+    seq.push_back(entry2);
+    seq.push_back(entry3);
+
+    llvm::raw_string_ostream ostr(intermediate);
+    Output yout(ostr);
+    yout << seq;
+  }
+
+  {
+    Input yin(intermediate);
+    DynCastSequence seq2;
+    yin >> seq2;
+
+    EXPECT_FALSE(yin.error());
+    EXPECT_EQ(seq2.size(), 3UL);
+    EXPECT_EQ(seq2[0].value, 13);   // Verify changed to 13.
+    EXPECT_EQ(seq2[1].value, 23);   // Verify changed to 23.
+    EXPECT_EQ(seq2[2].value, 30);   // Verify stays same.
+  }
+}
+
+
 
 //===----------------------------------------------------------------------===//
 //  Test error handling
@@ -952,8 +1174,9 @@ TEST(YAMLIO, TestColorsReadError) {
             "c1:  blue\n"
             "c2:  purple\n"
             "c3:  green\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> map;
   EXPECT_TRUE(yin.error());
 }
@@ -968,8 +1191,9 @@ TEST(YAMLIO, TestFlagsReadError) {
             "f1:  [ big ]\n"
             "f2:  [ round, hollow ]\n"
             "f3:  []\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> map;
 
   EXPECT_TRUE(yin.error());
@@ -986,8 +1210,9 @@ TEST(YAMLIO, TestReadBuiltInTypesUint8Error) {
             "- 255\n"
             "- 0\n"
             "- 257\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1004,8 +1229,9 @@ TEST(YAMLIO, TestReadBuiltInTypesUint16Error) {
             "- 65535\n"
             "- 0\n"
             "- 66000\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1022,8 +1248,9 @@ TEST(YAMLIO, TestReadBuiltInTypesUint32Error) {
             "- 4000000000\n"
             "- 0\n"
             "- 5000000000\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1040,8 +1267,9 @@ TEST(YAMLIO, TestReadBuiltInTypesUint64Error) {
             "- 18446744073709551615\n"
             "- 0\n"
             "- 19446744073709551615\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1059,8 +1287,9 @@ TEST(YAMLIO, TestReadBuiltInTypesint8OverError) {
             "- 0\n"
             "- 127\n"
             "- 128\n"
-           "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+           "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1076,8 +1305,9 @@ TEST(YAMLIO, TestReadBuiltInTypesint8UnderError) {
             "- 0\n"
             "- 127\n"
             "- -129\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1095,8 +1325,9 @@ TEST(YAMLIO, TestReadBuiltInTypesint16UnderError) {
             "- 0\n"
             "- -32768\n"
             "- -32769\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1113,8 +1344,9 @@ TEST(YAMLIO, TestReadBuiltInTypesint16OverError) {
             "- 0\n"
             "- -32768\n"
             "- 32768\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1132,8 +1364,9 @@ TEST(YAMLIO, TestReadBuiltInTypesint32UnderError) {
             "- 0\n"
             "- -2147483648\n"
             "- -2147483649\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1149,8 +1382,9 @@ TEST(YAMLIO, TestReadBuiltInTypesint32OverError) {
             "- 0\n"
             "- -2147483648\n"
             "- 2147483649\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1168,8 +1402,9 @@ TEST(YAMLIO, TestReadBuiltInTypesint64UnderError) {
             "- 0\n"
             "- 9223372036854775807\n"
             "- -9223372036854775809\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1185,8 +1420,9 @@ TEST(YAMLIO, TestReadBuiltInTypesint64OverError) {
             "- 0\n"
             "- 9223372036854775807\n"
             "- 9223372036854775809\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1203,8 +1439,9 @@ TEST(YAMLIO, TestReadBuiltInTypesFloatError) {
             "- 1000.1\n"
             "- -123.456\n"
             "- 1.2.3\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1221,8 +1458,9 @@ TEST(YAMLIO, TestReadBuiltInTypesDoubleError) {
             "- 1000.1\n"
             "- -123.456\n"
             "- 1.2.3\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1238,8 +1476,9 @@ TEST(YAMLIO, TestReadBuiltInTypesHex8Error) {
             "- 0x12\n"
             "- 0xFE\n"
             "- 0x123\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1256,8 +1495,9 @@ TEST(YAMLIO, TestReadBuiltInTypesHex16Error) {
             "- 0x0012\n"
             "- 0xFEFF\n"
             "- 0x12345\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1273,8 +1513,9 @@ TEST(YAMLIO, TestReadBuiltInTypesHex32Error) {
             "- 0x0012\n"
             "- 0xFEFF0000\n"
             "- 0x1234556789\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
@@ -1290,10 +1531,114 @@ TEST(YAMLIO, TestReadBuiltInTypesHex64Error) {
             "- 0x0012\n"
             "- 0xFFEEDDCCBBAA9988\n"
             "- 0x12345567890ABCDEF0\n"
-            "...\n");
-  yin.setDiagHandler(suppressErrorMessages);
+            "...\n",
+            /*Ctxt=*/NULL,
+            suppressErrorMessages);
   yin >> seq;
 
   EXPECT_TRUE(yin.error());
 }
 
+TEST(YAMLIO, TestMalformedMapFailsGracefully) {
+  FooBar doc;
+  {
+    // We pass the suppressErrorMessages handler to handle the error
+    // message generated in the constructor of Input.
+    Input yin("{foo:3, bar: 5}", /*Ctxt=*/NULL, suppressErrorMessages);
+    yin >> doc;
+    EXPECT_TRUE(yin.error());
+  }
+
+  {
+    Input yin("---\nfoo:3\nbar: 5\n...\n", /*Ctxt=*/NULL, suppressErrorMessages);
+    yin >> doc;
+    EXPECT_TRUE(yin.error());
+  }
+}
+
+struct OptionalTest {
+  std::vector<int> Numbers;
+};
+
+struct OptionalTestSeq {
+  std::vector<OptionalTest> Tests;
+};
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(OptionalTest)
+namespace llvm {
+namespace yaml {
+  template <>
+  struct MappingTraits<OptionalTest> {
+    static void mapping(IO& IO, OptionalTest &OT) {
+      IO.mapOptional("Numbers", OT.Numbers);
+    }
+  };
+
+  template <>
+  struct MappingTraits<OptionalTestSeq> {
+    static void mapping(IO &IO, OptionalTestSeq &OTS) {
+      IO.mapOptional("Tests", OTS.Tests);
+    }
+  };
+}
+}
+
+TEST(YAMLIO, SequenceElideTest) {
+  // Test that writing out a purely optional structure with its fields set to
+  // default followed by other data is properly read back in.
+  OptionalTestSeq Seq;
+  OptionalTest One, Two, Three, Four;
+  int N[] = {1, 2, 3};
+  Three.Numbers.assign(N, N + 3);
+  Seq.Tests.push_back(One);
+  Seq.Tests.push_back(Two);
+  Seq.Tests.push_back(Three);
+  Seq.Tests.push_back(Four);
+
+  std::string intermediate;
+  {
+    llvm::raw_string_ostream ostr(intermediate);
+    Output yout(ostr);
+    yout << Seq;
+  }
+
+  Input yin(intermediate);
+  OptionalTestSeq Seq2;
+  yin >> Seq2;
+
+  EXPECT_FALSE(yin.error());
+
+  EXPECT_EQ(4UL, Seq2.Tests.size());
+
+  EXPECT_TRUE(Seq2.Tests[0].Numbers.empty());
+  EXPECT_TRUE(Seq2.Tests[1].Numbers.empty());
+
+  EXPECT_EQ(1, Seq2.Tests[2].Numbers[0]);
+  EXPECT_EQ(2, Seq2.Tests[2].Numbers[1]);
+  EXPECT_EQ(3, Seq2.Tests[2].Numbers[2]);
+
+  EXPECT_TRUE(Seq2.Tests[3].Numbers.empty());
+}
+
+TEST(YAMLIO, TestEmptyStringFailsForMapWithRequiredFields) {
+  FooBar doc;
+  Input yin("");
+  yin >> doc;
+  EXPECT_TRUE(yin.error());
+}
+
+TEST(YAMLIO, TestEmptyStringSucceedsForMapWithOptionalFields) {
+  OptionalTest doc;
+  Input yin("");
+  yin >> doc;
+  EXPECT_FALSE(yin.error());
+}
+
+TEST(YAMLIO, TestEmptyStringSucceedsForSequence) {
+  std::vector<uint8_t> seq;
+  Input yin("", /*Ctxt=*/NULL, suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_FALSE(yin.error());
+  EXPECT_TRUE(seq.empty());
+}
diff --git a/unittests/Transforms/Utils/SpecialCaseList.cpp b/unittests/Transforms/Utils/SpecialCaseList.cpp
index 9deee3c..07ac908 100644
--- a/unittests/Transforms/Utils/SpecialCaseList.cpp
+++ b/unittests/Transforms/Utils/SpecialCaseList.cpp
@@ -34,9 +34,22 @@ protected:
         M, ST, false, GlobalValue::ExternalLinkage, 0, Name);
   }
 
-  SpecialCaseList *makeSpecialCaseList(StringRef List) {
+  GlobalAlias *makeAlias(StringRef Name, GlobalValue *Aliasee) {
+    return new GlobalAlias(Aliasee->getType(), GlobalValue::ExternalLinkage,
+                           Name, Aliasee, Aliasee->getParent());
+  }
+
+  SpecialCaseList *makeSpecialCaseList(StringRef List, std::string &Error) {
     OwningPtr<MemoryBuffer> MB(MemoryBuffer::getMemBuffer(List));
-    return new SpecialCaseList(MB.get());
+    return SpecialCaseList::create(MB.get(), Error);
+  }
+
+  SpecialCaseList *makeSpecialCaseList(StringRef List) {
+    std::string Error;
+    SpecialCaseList *SCL = makeSpecialCaseList(List, Error);
+    assert(SCL);
+    assert(Error == "");
+    return SCL;
   }
 
   LLVMContext Ctx;
@@ -86,8 +99,6 @@ TEST_F(SpecialCaseListTest, FunctionIsIn) {
   SCL.reset(makeSpecialCaseList("fun:foo=functional\n"));
   EXPECT_TRUE(SCL->isIn(*Foo, "functional"));
   StringRef Category;
-  EXPECT_TRUE(SCL->findCategory(*Foo, Category));
-  EXPECT_EQ("functional", Category);
   EXPECT_FALSE(SCL->isIn(*Bar, "functional"));
 }
 
@@ -139,10 +150,48 @@ TEST_F(SpecialCaseListTest, GlobalIsIn) {
   EXPECT_TRUE(SCL->isIn(*Bar, "init"));
 }
 
+TEST_F(SpecialCaseListTest, AliasIsIn) {
+  Module M("hello", Ctx);
+  Function *Foo = makeFunction("foo", M);
+  GlobalVariable *Bar = makeGlobal("bar", "t", M);
+  GlobalAlias *FooAlias = makeAlias("fooalias", Foo);
+  GlobalAlias *BarAlias = makeAlias("baralias", Bar);
+
+  OwningPtr<SpecialCaseList> SCL(makeSpecialCaseList("fun:foo\n"));
+  EXPECT_FALSE(SCL->isIn(*FooAlias));
+  EXPECT_FALSE(SCL->isIn(*BarAlias));
+
+  SCL.reset(makeSpecialCaseList("global:bar\n"));
+  EXPECT_FALSE(SCL->isIn(*FooAlias));
+  EXPECT_FALSE(SCL->isIn(*BarAlias));
+
+  SCL.reset(makeSpecialCaseList("global:fooalias\n"));
+  EXPECT_FALSE(SCL->isIn(*FooAlias));
+  EXPECT_FALSE(SCL->isIn(*BarAlias));
+
+  SCL.reset(makeSpecialCaseList("fun:fooalias\n"));
+  EXPECT_TRUE(SCL->isIn(*FooAlias));
+  EXPECT_FALSE(SCL->isIn(*BarAlias));
+
+  SCL.reset(makeSpecialCaseList("global:baralias=init\n"));
+  EXPECT_FALSE(SCL->isIn(*FooAlias, "init"));
+  EXPECT_TRUE(SCL->isIn(*BarAlias, "init"));
+
+  SCL.reset(makeSpecialCaseList("type:t=init\n"));
+  EXPECT_FALSE(SCL->isIn(*FooAlias, "init"));
+  EXPECT_TRUE(SCL->isIn(*BarAlias, "init"));
+
+  SCL.reset(makeSpecialCaseList("fun:baralias=init\n"));
+  EXPECT_FALSE(SCL->isIn(*FooAlias, "init"));
+  EXPECT_FALSE(SCL->isIn(*BarAlias, "init"));
+}
+
 TEST_F(SpecialCaseListTest, Substring) {
   Module M("othello", Ctx);
   Function *F = makeFunction("tomfoolery", M);
   GlobalVariable *GV = makeGlobal("bartender", "t", M);
+  GlobalAlias *GA1 = makeAlias("buffoonery", F);
+  GlobalAlias *GA2 = makeAlias("foobar", GV);
 
   OwningPtr<SpecialCaseList> SCL(makeSpecialCaseList("src:hello\n"
                                                      "fun:foo\n"
@@ -150,9 +199,34 @@ TEST_F(SpecialCaseListTest, Substring) {
   EXPECT_FALSE(SCL->isIn(M));
   EXPECT_FALSE(SCL->isIn(*F));
   EXPECT_FALSE(SCL->isIn(*GV));
+  EXPECT_FALSE(SCL->isIn(*GA1));
+  EXPECT_FALSE(SCL->isIn(*GA2));
 
   SCL.reset(makeSpecialCaseList("fun:*foo*\n"));
   EXPECT_TRUE(SCL->isIn(*F));
+  EXPECT_TRUE(SCL->isIn(*GA1));
+}
+
+TEST_F(SpecialCaseListTest, InvalidSpecialCaseList) {
+  std::string Error;
+  EXPECT_EQ(0, makeSpecialCaseList("badline", Error));
+  EXPECT_EQ("Malformed line 1: 'badline'", Error);
+  EXPECT_EQ(0, makeSpecialCaseList("src:bad[a-", Error));
+  EXPECT_EQ("Malformed regex in line 1: 'bad[a-': invalid character range",
+            Error);
+  EXPECT_EQ(0, makeSpecialCaseList("src:a.c\n"
+                                   "fun:fun(a\n",
+                                   Error));
+  EXPECT_EQ("Malformed regex in line 2: 'fun(a': parentheses not balanced",
+            Error);
+  EXPECT_EQ(0, SpecialCaseList::create("unexisting", Error));
+  EXPECT_EQ(0U, Error.find("Can't open file 'unexisting':"));
+}
+
+TEST_F(SpecialCaseListTest, EmptySpecialCaseList) {
+  OwningPtr<SpecialCaseList> SCL(makeSpecialCaseList(""));
+  Module M("foo", Ctx);
+  EXPECT_FALSE(SCL->isIn(M));
 }
 
 }
diff --git a/utils/FileCheck/CMakeLists.txt b/utils/FileCheck/CMakeLists.txt
index fa56f92..d691ceb 100644
--- a/utils/FileCheck/CMakeLists.txt
+++ b/utils/FileCheck/CMakeLists.txt
@@ -4,7 +4,7 @@ add_llvm_utility(FileCheck
 
 target_link_libraries(FileCheck LLVMSupport)
 if( MINGW )
-  target_link_libraries(FileCheck imagehlp psapi)
+  target_link_libraries(FileCheck imagehlp psapi shell32)
 endif( MINGW )
 if( LLVM_ENABLE_THREADS AND HAVE_LIBPTHREAD )
   target_link_libraries(FileCheck pthread)
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index 8d5af58..f2510d7 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/PrettyStackTrace.h"
@@ -29,6 +30,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/system_error.h"
 #include <algorithm>
+#include <cctype>
 #include <map>
 #include <string>
 #include <vector>
@@ -41,30 +43,39 @@ static cl::opt<std::string>
 InputFilename("input-file", cl::desc("File to check (defaults to stdin)"),
               cl::init("-"), cl::value_desc("filename"));
 
-static cl::opt<std::string>
-CheckPrefix("check-prefix", cl::init("CHECK"),
-            cl::desc("Prefix to use from check file (defaults to 'CHECK')"));
+static cl::list<std::string>
+CheckPrefixes("check-prefix",
+              cl::desc("Prefix to use from check file (defaults to 'CHECK')"));
 
 static cl::opt<bool>
 NoCanonicalizeWhiteSpace("strict-whitespace",
               cl::desc("Do not treat all horizontal whitespace as equivalent"));
 
+typedef cl::list<std::string>::const_iterator prefix_iterator;
+
 //===----------------------------------------------------------------------===//
 // Pattern Handling Code.
 //===----------------------------------------------------------------------===//
 
+namespace Check {
+  enum CheckType {
+    CheckNone = 0,
+    CheckPlain,
+    CheckNext,
+    CheckNot,
+    CheckDAG,
+    CheckLabel,
+
+    /// MatchEOF - When set, this pattern only matches the end of file. This is
+    /// used for trailing CHECK-NOTs.
+    CheckEOF
+  };
+}
+
 class Pattern {
   SMLoc PatternLoc;
 
-  /// MatchEOF - When set, this pattern only matches the end of file. This is
-  /// used for trailing CHECK-NOTs.
-  bool MatchEOF;
-
-  /// MatchNot
-  bool MatchNot;
-
-  /// MatchDag
-  bool MatchDag;
+  Check::CheckType CheckTy;
 
   /// FixedStr - If non-empty, this pattern is a fixed string match with the
   /// specified fixed string.
@@ -89,17 +100,21 @@ class Pattern {
 
 public:
 
-  Pattern(bool matchEOF = false)
-    : MatchEOF(matchEOF), MatchNot(false), MatchDag(false) { }
+  Pattern(Check::CheckType Ty)
+    : CheckTy(Ty) { }
 
   /// getLoc - Return the location in source code.
   SMLoc getLoc() const { return PatternLoc; }
 
-  /// ParsePattern - Parse the given string into the Pattern.  SM provides the
-  /// SourceMgr used for error reports, and LineNumber is the line number in
-  /// the input file from which the pattern string was read.
-  /// Returns true in case of an error, false otherwise.
-  bool ParsePattern(StringRef PatternStr, SourceMgr &SM, unsigned LineNumber);
+  /// ParsePattern - Parse the given string into the Pattern. Prefix provides
+  /// which prefix is being matched, SM provides the SourceMgr used for error
+  /// reports, and LineNumber is the line number in the input file from which
+  /// the pattern string was read.  Returns true in case of an error, false
+  /// otherwise.
+  bool ParsePattern(StringRef PatternStr,
+                    StringRef Prefix,
+                    SourceMgr &SM,
+                    unsigned LineNumber);
 
   /// Match - Match the pattern string against the input buffer Buffer.  This
   /// returns the position that is matched or npos if there is no match.  If
@@ -118,11 +133,7 @@ public:
   bool hasVariable() const { return !(VariableUses.empty() &&
                                       VariableDefs.empty()); }
 
-  void setMatchNot(bool Not) { MatchNot = Not; }
-  bool getMatchNot() const { return MatchNot; }
-
-  void setMatchDag(bool Dag) { MatchDag = Dag; }
-  bool getMatchDag() const { return MatchDag; }
+  Check::CheckType getCheckTy() const { return CheckTy; }
 
 private:
   static void AddFixedStringToRegEx(StringRef FixedStr, std::string &TheStr);
@@ -148,7 +159,9 @@ private:
 };
 
 
-bool Pattern::ParsePattern(StringRef PatternStr, SourceMgr &SM,
+bool Pattern::ParsePattern(StringRef PatternStr,
+                           StringRef Prefix,
+                           SourceMgr &SM,
                            unsigned LineNumber) {
   this->LineNumber = LineNumber;
   PatternLoc = SMLoc::getFromPointer(PatternStr.data());
@@ -162,7 +175,7 @@ bool Pattern::ParsePattern(StringRef PatternStr, SourceMgr &SM,
   if (PatternStr.empty()) {
     SM.PrintMessage(PatternLoc, SourceMgr::DK_Error,
                     "found empty check string with prefix '" +
-                    CheckPrefix+":'");
+                    Prefix + ":'");
     return true;
   }
 
@@ -381,7 +394,7 @@ bool Pattern::EvaluateExpression(StringRef Expr, std::string &Value) const {
 size_t Pattern::Match(StringRef Buffer, size_t &MatchLen,
                       StringMap<StringRef> &VariableTable) const {
   // If this is the EOF pattern, match it immediately.
-  if (MatchEOF) {
+  if (CheckTy == Check::CheckEOF) {
     MatchLen = 0;
     return Buffer.size();
   }
@@ -590,27 +603,30 @@ struct CheckString {
   /// Pat - The pattern to match.
   Pattern Pat;
 
+  /// Prefix - Which prefix name this check matched.
+  StringRef Prefix;
+
   /// Loc - The location in the match file that the check string was specified.
   SMLoc Loc;
 
-  /// IsCheckNext - This is true if this is a CHECK-NEXT: directive (as opposed
-  /// to a CHECK: directive.
-  bool IsCheckNext;
-
-  /// IsCheckLabel - This is true if this is a CHECK-LABEL: directive (as
-  /// opposed to a CHECK: directive.
-  bool IsCheckLabel;
+  /// CheckTy - Specify what kind of check this is. e.g. CHECK-NEXT: directive,
+  /// as opposed to a CHECK: directive.
+  Check::CheckType CheckTy;
 
   /// DagNotStrings - These are all of the strings that are disallowed from
   /// occurring between this match string and the previous one (or start of
   /// file).
   std::vector<Pattern> DagNotStrings;
 
-  CheckString(const Pattern &P, SMLoc L, bool isCheckNext, bool isCheckLabel)
-    : Pat(P), Loc(L), IsCheckNext(isCheckNext), IsCheckLabel(isCheckLabel) {}
+
+  CheckString(const Pattern &P,
+              StringRef S,
+              SMLoc L,
+              Check::CheckType Ty)
+    : Pat(P), Prefix(S), Loc(L), CheckTy(Ty) {}
 
   /// Check - Match check string and its "not strings" and/or "dag strings".
-  size_t Check(const SourceMgr &SM, StringRef Buffer, bool IsLabel,
+  size_t Check(const SourceMgr &SM, StringRef Buffer, bool IsLabelScanMode,
                size_t &MatchLen, StringMap<StringRef> &VariableTable) const;
 
   /// CheckNext - Verify there is a single line in the given buffer.
@@ -666,6 +682,161 @@ static MemoryBuffer *CanonicalizeInputFile(MemoryBuffer *MB,
   return MB2;
 }
 
+static bool IsPartOfWord(char c) {
+  return (isalnum(c) || c == '-' || c == '_');
+}
+
+// Get the size of the prefix extension.
+static size_t CheckTypeSize(Check::CheckType Ty) {
+  switch (Ty) {
+  case Check::CheckNone:
+    return 0;
+
+  case Check::CheckPlain:
+    return sizeof(":") - 1;
+
+  case Check::CheckNext:
+    return sizeof("-NEXT:") - 1;
+
+  case Check::CheckNot:
+    return sizeof("-NOT:") - 1;
+
+  case Check::CheckDAG:
+    return sizeof("-DAG:") - 1;
+
+  case Check::CheckLabel:
+    return sizeof("-LABEL:") - 1;
+
+  case Check::CheckEOF:
+    llvm_unreachable("Should not be using EOF size");
+  }
+
+  llvm_unreachable("Bad check type");
+}
+
+static Check::CheckType FindCheckType(StringRef Buffer, StringRef Prefix) {
+  char NextChar = Buffer[Prefix.size()];
+
+  // Verify that the : is present after the prefix.
+  if (NextChar == ':')
+    return Check::CheckPlain;
+
+  if (NextChar != '-')
+    return Check::CheckNone;
+
+  StringRef Rest = Buffer.drop_front(Prefix.size() + 1);
+  if (Rest.startswith("NEXT:"))
+    return Check::CheckNext;
+
+  if (Rest.startswith("NOT:"))
+    return Check::CheckNot;
+
+  if (Rest.startswith("DAG:"))
+    return Check::CheckDAG;
+
+  if (Rest.startswith("LABEL:"))
+    return Check::CheckLabel;
+
+  return Check::CheckNone;
+}
+
+// From the given position, find the next character after the word.
+static size_t SkipWord(StringRef Str, size_t Loc) {
+  while (Loc < Str.size() && IsPartOfWord(Str[Loc]))
+    ++Loc;
+  return Loc;
+}
+
+// Try to find the first match in buffer for any prefix. If a valid match is
+// found, return that prefix and set its type and location.  If there are almost
+// matches (e.g. the actual prefix string is found, but is not an actual check
+// string), but no valid match, return an empty string and set the position to
+// resume searching from. If no partial matches are found, return an empty
+// string and the location will be StringRef::npos. If one prefix is a substring
+// of another, the maximal match should be found. e.g. if "A" and "AA" are
+// prefixes then AA-CHECK: should match the second one.
+static StringRef FindFirstCandidateMatch(StringRef &Buffer,
+                                         Check::CheckType &CheckTy,
+                                         size_t &CheckLoc) {
+  StringRef FirstPrefix;
+  size_t FirstLoc = StringRef::npos;
+  size_t SearchLoc = StringRef::npos;
+  Check::CheckType FirstTy = Check::CheckNone;
+
+  CheckTy = Check::CheckNone;
+  CheckLoc = StringRef::npos;
+
+  for (prefix_iterator I = CheckPrefixes.begin(), E = CheckPrefixes.end();
+       I != E; ++I) {
+    StringRef Prefix(*I);
+    size_t PrefixLoc = Buffer.find(Prefix);
+
+    if (PrefixLoc == StringRef::npos)
+      continue;
+
+    // Track where we are searching for invalid prefixes that look almost right.
+    // We need to only advance to the first partial match on the next attempt
+    // since a partial match could be a substring of a later, valid prefix.
+    // Need to skip to the end of the word, otherwise we could end up
+    // matching a prefix in a substring later.
+    if (PrefixLoc < SearchLoc)
+      SearchLoc = SkipWord(Buffer, PrefixLoc);
+
+    // We only want to find the first match to avoid skipping some.
+    if (PrefixLoc > FirstLoc)
+      continue;
+    // If one matching check-prefix is a prefix of another, choose the
+    // longer one.
+    if (PrefixLoc == FirstLoc && Prefix.size() < FirstPrefix.size())
+      continue;
+
+    StringRef Rest = Buffer.drop_front(PrefixLoc);
+    // Make sure we have actually found the prefix, and not a word containing
+    // it. This should also prevent matching the wrong prefix when one is a
+    // substring of another.
+    if (PrefixLoc != 0 && IsPartOfWord(Buffer[PrefixLoc - 1]))
+      continue;
+
+    FirstLoc = PrefixLoc;
+    FirstTy = FindCheckType(Rest, Prefix);
+    FirstPrefix = Prefix;
+  }
+
+  // If the first prefix is invalid, we should continue the search after it.
+  if (FirstTy == Check::CheckNone) {
+    CheckLoc = SearchLoc;
+    return "";
+  }
+
+  CheckTy = FirstTy;
+  CheckLoc = FirstLoc;
+  return FirstPrefix;
+}
+
+static StringRef FindFirstMatchingPrefix(StringRef &Buffer,
+                                         unsigned &LineNumber,
+                                         Check::CheckType &CheckTy,
+                                         size_t &CheckLoc) {
+  while (!Buffer.empty()) {
+    StringRef Prefix = FindFirstCandidateMatch(Buffer, CheckTy, CheckLoc);
+    // If we found a real match, we are done.
+    if (!Prefix.empty()) {
+      LineNumber += Buffer.substr(0, CheckLoc).count('\n');
+      return Prefix;
+    }
+
+    // We didn't find any almost matches either, we are also done.
+    if (CheckLoc == StringRef::npos)
+      return StringRef();
+
+    LineNumber += Buffer.substr(0, CheckLoc + 1).count('\n');
+
+    // Advance to the last possible match we found and try again.
+    Buffer = Buffer.drop_front(CheckLoc + 1);
+  }
+
+  return StringRef();
+}
 
 /// ReadCheckFile - Read the check file, which specifies the sequence of
 /// expected strings.  The strings are added to the CheckStrings vector.
@@ -696,49 +867,27 @@ static bool ReadCheckFile(SourceMgr &SM,
   unsigned LineNumber = 1;
 
   while (1) {
-    // See if Prefix occurs in the memory buffer.
-    size_t PrefixLoc = Buffer.find(CheckPrefix);
-    // If we didn't find a match, we're done.
-    if (PrefixLoc == StringRef::npos)
+    Check::CheckType CheckTy;
+    size_t PrefixLoc;
+
+    // See if a prefix occurs in the memory buffer.
+    StringRef UsedPrefix = FindFirstMatchingPrefix(Buffer,
+                                                   LineNumber,
+                                                   CheckTy,
+                                                   PrefixLoc);
+    if (UsedPrefix.empty())
       break;
 
-    LineNumber += Buffer.substr(0, PrefixLoc).count('\n');
-
-    Buffer = Buffer.substr(PrefixLoc);
-
-    const char *CheckPrefixStart = Buffer.data();
-
-    // When we find a check prefix, keep track of whether we find CHECK: or
-    // CHECK-NEXT:
-    bool IsCheckNext = false, IsCheckNot = false, IsCheckDag = false,
-         IsCheckLabel = false;
-
-    // Verify that the : is present after the prefix.
-    if (Buffer[CheckPrefix.size()] == ':') {
-      Buffer = Buffer.substr(CheckPrefix.size()+1);
-    } else if (Buffer.size() > CheckPrefix.size()+6 &&
-               memcmp(Buffer.data()+CheckPrefix.size(), "-NEXT:", 6) == 0) {
-      Buffer = Buffer.substr(CheckPrefix.size()+6);
-      IsCheckNext = true;
-    } else if (Buffer.size() > CheckPrefix.size()+5 &&
-               memcmp(Buffer.data()+CheckPrefix.size(), "-NOT:", 5) == 0) {
-      Buffer = Buffer.substr(CheckPrefix.size()+5);
-      IsCheckNot = true;
-    } else if (Buffer.size() > CheckPrefix.size()+5 &&
-               memcmp(Buffer.data()+CheckPrefix.size(), "-DAG:", 5) == 0) {
-      Buffer = Buffer.substr(CheckPrefix.size()+5);
-      IsCheckDag = true;
-    } else if (Buffer.size() > CheckPrefix.size()+7 &&
-               memcmp(Buffer.data()+CheckPrefix.size(), "-LABEL:", 7) == 0) {
-      Buffer = Buffer.substr(CheckPrefix.size()+7);
-      IsCheckLabel = true;
-    } else {
-      Buffer = Buffer.substr(1);
-      continue;
-    }
+    Buffer = Buffer.drop_front(PrefixLoc);
+
+    // Location to use for error messages.
+    const char *UsedPrefixStart = Buffer.data() + (PrefixLoc == 0 ? 0 : 1);
 
-    // Okay, we found the prefix, yay.  Remember the rest of the line, but
-    // ignore leading and trailing whitespace.
+    // PrefixLoc is to the start of the prefix. Skip to the end.
+    Buffer = Buffer.drop_front(UsedPrefix.size() + CheckTypeSize(CheckTy));
+
+    // Okay, we found the prefix, yay. Remember the rest of the line, but ignore
+    // leading and trailing whitespace.
     Buffer = Buffer.substr(Buffer.find_first_not_of(" \t"));
 
     // Scan ahead to the end of line.
@@ -748,59 +897,65 @@ static bool ReadCheckFile(SourceMgr &SM,
     SMLoc PatternLoc = SMLoc::getFromPointer(Buffer.data());
 
     // Parse the pattern.
-    Pattern P;
-    if (P.ParsePattern(Buffer.substr(0, EOL), SM, LineNumber))
+    Pattern P(CheckTy);
+    if (P.ParsePattern(Buffer.substr(0, EOL), UsedPrefix, SM, LineNumber))
       return true;
 
     // Verify that CHECK-LABEL lines do not define or use variables
-    if (IsCheckLabel && P.hasVariable()) {
-      SM.PrintMessage(SMLoc::getFromPointer(CheckPrefixStart),
+    if ((CheckTy == Check::CheckLabel) && P.hasVariable()) {
+      SM.PrintMessage(SMLoc::getFromPointer(UsedPrefixStart),
                       SourceMgr::DK_Error,
-                      "found '"+CheckPrefix+"-LABEL:' with variable definition"
-                      " or use'");
+                      "found '" + UsedPrefix + "-LABEL:'"
+                      " with variable definition or use");
       return true;
     }
 
-    P.setMatchNot(IsCheckNot);
-    P.setMatchDag(IsCheckDag);
-
     Buffer = Buffer.substr(EOL);
 
     // Verify that CHECK-NEXT lines have at least one CHECK line before them.
-    if (IsCheckNext && CheckStrings.empty()) {
-      SM.PrintMessage(SMLoc::getFromPointer(CheckPrefixStart),
+    if ((CheckTy == Check::CheckNext) && CheckStrings.empty()) {
+      SM.PrintMessage(SMLoc::getFromPointer(UsedPrefixStart),
                       SourceMgr::DK_Error,
-                      "found '"+CheckPrefix+"-NEXT:' without previous '"+
-                      CheckPrefix+ ": line");
+                      "found '" + UsedPrefix + "-NEXT:' without previous '"
+                      + UsedPrefix + ": line");
       return true;
     }
 
     // Handle CHECK-DAG/-NOT.
-    if (IsCheckDag || IsCheckNot) {
+    if (CheckTy == Check::CheckDAG || CheckTy == Check::CheckNot) {
       DagNotMatches.push_back(P);
       continue;
     }
 
     // Okay, add the string we captured to the output vector and move on.
     CheckStrings.push_back(CheckString(P,
+                                       UsedPrefix,
                                        PatternLoc,
-                                       IsCheckNext,
-                                       IsCheckLabel));
+                                       CheckTy));
     std::swap(DagNotMatches, CheckStrings.back().DagNotStrings);
   }
 
-  // Add an EOF pattern for any trailing CHECK-DAG/-NOTs.
+  // Add an EOF pattern for any trailing CHECK-DAG/-NOTs, and use the first
+  // prefix as a filler for the error message.
   if (!DagNotMatches.empty()) {
-    CheckStrings.push_back(CheckString(Pattern(true),
+    CheckStrings.push_back(CheckString(Pattern(Check::CheckEOF),
+                                       CheckPrefixes[0],
                                        SMLoc::getFromPointer(Buffer.data()),
-                                       false,
-                                       false));
+                                       Check::CheckEOF));
     std::swap(DagNotMatches, CheckStrings.back().DagNotStrings);
   }
 
   if (CheckStrings.empty()) {
-    errs() << "error: no check strings found with prefix '" << CheckPrefix
-           << ":'\n";
+    errs() << "error: no check strings found with prefix"
+           << (CheckPrefixes.size() > 1 ? "es " : " ");
+    for (size_t I = 0, N = CheckPrefixes.size(); I != N; ++I) {
+      StringRef Prefix(CheckPrefixes[I]);
+      errs() << '\'' << Prefix << ":'";
+      if (I != N - 1)
+        errs() << ", ";
+    }
+
+    errs() << '\n';
     return true;
   }
 
@@ -852,12 +1007,16 @@ static unsigned CountNumNewlinesBetween(StringRef Range) {
 }
 
 size_t CheckString::Check(const SourceMgr &SM, StringRef Buffer,
-                          bool IsLabel, size_t &MatchLen,
+                          bool IsLabelScanMode, size_t &MatchLen,
                           StringMap<StringRef> &VariableTable) const {
   size_t LastPos = 0;
   std::vector<const Pattern *> NotStrings;
 
-  if (!IsLabel) {
+  // IsLabelScanMode is true when we are scanning forward to find CHECK-LABEL
+  // bounds; we have not processed variable definitions within the bounded block
+  // yet so cannot handle any final CHECK-DAG yet; this is handled when going
+  // over the block again (including the last CHECK-LABEL) in normal mode.
+  if (!IsLabelScanMode) {
     // Match "dag strings" (with mixed "not strings" if any).
     LastPos = CheckDag(SM, Buffer, NotStrings, VariableTable);
     if (LastPos == StringRef::npos)
@@ -873,7 +1032,9 @@ size_t CheckString::Check(const SourceMgr &SM, StringRef Buffer,
   }
   MatchPos += LastPos;
 
-  if (!IsLabel) {
+  // Similar to the above, in "label-scan mode" we can't yet handle CHECK-NEXT
+  // or CHECK-NOT
+  if (!IsLabelScanMode) {
     StringRef SkippedRegion = Buffer.substr(LastPos, MatchPos);
 
     // If this check is a "CHECK-NEXT", verify that the previous match was on
@@ -891,7 +1052,7 @@ size_t CheckString::Check(const SourceMgr &SM, StringRef Buffer,
 }
 
 bool CheckString::CheckNext(const SourceMgr &SM, StringRef Buffer) const {
-  if (!IsCheckNext)
+  if (CheckTy != Check::CheckNext)
     return false;
 
   // Count the number of newlines between the previous match and this one.
@@ -904,7 +1065,7 @@ bool CheckString::CheckNext(const SourceMgr &SM, StringRef Buffer) const {
   unsigned NumNewLines = CountNumNewlinesBetween(Buffer);
 
   if (NumNewLines == 0) {
-    SM.PrintMessage(Loc, SourceMgr::DK_Error, CheckPrefix+
+    SM.PrintMessage(Loc, SourceMgr::DK_Error, Prefix +
                     "-NEXT: is on the same line as previous match");
     SM.PrintMessage(SMLoc::getFromPointer(Buffer.end()),
                     SourceMgr::DK_Note, "'next' match was here");
@@ -914,7 +1075,7 @@ bool CheckString::CheckNext(const SourceMgr &SM, StringRef Buffer) const {
   }
 
   if (NumNewLines != 1) {
-    SM.PrintMessage(Loc, SourceMgr::DK_Error, CheckPrefix+
+    SM.PrintMessage(Loc, SourceMgr::DK_Error, Prefix +
                     "-NEXT: is not on the line after the previous match");
     SM.PrintMessage(SMLoc::getFromPointer(Buffer.end()),
                     SourceMgr::DK_Note, "'next' match was here");
@@ -932,7 +1093,7 @@ bool CheckString::CheckNot(const SourceMgr &SM, StringRef Buffer,
   for (unsigned ChunkNo = 0, e = NotStrings.size();
        ChunkNo != e; ++ChunkNo) {
     const Pattern *Pat = NotStrings[ChunkNo];
-    assert(Pat->getMatchNot() && "Expect CHECK-NOT!");
+    assert((Pat->getCheckTy() == Check::CheckNot) && "Expect CHECK-NOT!");
 
     size_t MatchLen = 0;
     size_t Pos = Pat->Match(Buffer, MatchLen, VariableTable);
@@ -941,9 +1102,9 @@ bool CheckString::CheckNot(const SourceMgr &SM, StringRef Buffer,
 
     SM.PrintMessage(SMLoc::getFromPointer(Buffer.data()+Pos),
                     SourceMgr::DK_Error,
-                    CheckPrefix+"-NOT: string occurred!");
+                    Prefix + "-NOT: string occurred!");
     SM.PrintMessage(Pat->getLoc(), SourceMgr::DK_Note,
-                    CheckPrefix+"-NOT: pattern specified here");
+                    Prefix + "-NOT: pattern specified here");
     return true;
   }
 
@@ -963,15 +1124,16 @@ size_t CheckString::CheckDag(const SourceMgr &SM, StringRef Buffer,
        ChunkNo != e; ++ChunkNo) {
     const Pattern &Pat = DagNotStrings[ChunkNo];
 
-    assert((Pat.getMatchDag() ^ Pat.getMatchNot()) &&
+    assert((Pat.getCheckTy() == Check::CheckDAG ||
+            Pat.getCheckTy() == Check::CheckNot) &&
            "Invalid CHECK-DAG or CHECK-NOT!");
 
-    if (Pat.getMatchNot()) {
+    if (Pat.getCheckTy() == Check::CheckNot) {
       NotStrings.push_back(&Pat);
       continue;
     }
 
-    assert(Pat.getMatchDag() && "Expect CHECK-DAG!");
+    assert((Pat.getCheckTy() == Check::CheckDAG) && "Expect CHECK-DAG!");
 
     size_t MatchLen = 0, MatchPos;
 
@@ -992,17 +1154,17 @@ size_t CheckString::CheckDag(const SourceMgr &SM, StringRef Buffer,
         // Reordered?
         SM.PrintMessage(SMLoc::getFromPointer(Buffer.data() + MatchPos),
                         SourceMgr::DK_Error,
-                        CheckPrefix+"-DAG: found a match of CHECK-DAG"
+                        Prefix + "-DAG: found a match of CHECK-DAG"
                         " reordering across a CHECK-NOT");
         SM.PrintMessage(SMLoc::getFromPointer(Buffer.data() + LastPos),
                         SourceMgr::DK_Note,
-                        CheckPrefix+"-DAG: the farthest match of CHECK-DAG"
+                        Prefix + "-DAG: the farthest match of CHECK-DAG"
                         " is found here");
         SM.PrintMessage(NotStrings[0]->getLoc(), SourceMgr::DK_Note,
-                        CheckPrefix+"-NOT: the crossed pattern specified"
+                        Prefix + "-NOT: the crossed pattern specified"
                         " here");
         SM.PrintMessage(Pat.getLoc(), SourceMgr::DK_Note,
-                        CheckPrefix+"-DAG: the reordered pattern specified"
+                        Prefix + "-DAG: the reordered pattern specified"
                         " here");
         return StringRef::npos;
       }
@@ -1026,11 +1188,50 @@ size_t CheckString::CheckDag(const SourceMgr &SM, StringRef Buffer,
   return LastPos;
 }
 
+// A check prefix must contain only alphanumeric, hyphens and underscores.
+static bool ValidateCheckPrefix(StringRef CheckPrefix) {
+  Regex Validator("^[a-zA-Z0-9_-]*$");
+  return Validator.match(CheckPrefix);
+}
+
+static bool ValidateCheckPrefixes() {
+  StringSet<> PrefixSet;
+
+  for (prefix_iterator I = CheckPrefixes.begin(), E = CheckPrefixes.end();
+       I != E; ++I) {
+    StringRef Prefix(*I);
+
+    if (!PrefixSet.insert(Prefix))
+      return false;
+
+    if (!ValidateCheckPrefix(Prefix))
+      return false;
+  }
+
+  return true;
+}
+
+// I don't think there's a way to specify an initial value for cl::list,
+// so if nothing was specified, add the default
+static void AddCheckPrefixIfNeeded() {
+  if (CheckPrefixes.empty())
+    CheckPrefixes.push_back("CHECK");
+}
+
 int main(int argc, char **argv) {
   sys::PrintStackTraceOnErrorSignal();
   PrettyStackTraceProgram X(argc, argv);
   cl::ParseCommandLineOptions(argc, argv);
 
+  if (!ValidateCheckPrefixes()) {
+    errs() << "Supplied check-prefix is invalid! Prefixes must be unique and "
+              "start with a letter and contain only alphanumeric characters, "
+              "hyphens and underscores\n";
+    return 2;
+  }
+
+  AddCheckPrefixIfNeeded();
+
   SourceMgr SM;
 
   // Read the expected strings from the check file.
@@ -1076,7 +1277,7 @@ int main(int argc, char **argv) {
       CheckRegion = Buffer;
     } else {
       const CheckString &CheckLabelStr = CheckStrings[j];
-      if (!CheckLabelStr.IsCheckLabel) {
+      if (CheckLabelStr.CheckTy != Check::CheckLabel) {
         ++j;
         continue;
       }
diff --git a/utils/FileUpdate/CMakeLists.txt b/utils/FileUpdate/CMakeLists.txt
index 655aaec..0114e50 100644
--- a/utils/FileUpdate/CMakeLists.txt
+++ b/utils/FileUpdate/CMakeLists.txt
@@ -4,7 +4,7 @@ add_llvm_utility(FileUpdate
 
 target_link_libraries(FileUpdate LLVMSupport)
 if( MINGW )
-  target_link_libraries(FileUpdate imagehlp psapi)
+  target_link_libraries(FileUpdate imagehlp psapi shell32)
 endif( MINGW )
 if( LLVM_ENABLE_THREADS AND HAVE_LIBPTHREAD )
   target_link_libraries(FileUpdate pthread)
diff --git a/utils/GetRepositoryPath b/utils/GetRepositoryPath
index f3b0cc5..2d1122e 100755
--- a/utils/GetRepositoryPath
+++ b/utils/GetRepositoryPath
@@ -15,11 +15,11 @@ fi
 
 cd $1
 if [ -d .svn ]; then
-  svn info | grep 'URL:' | cut -d: -f2-
+  svn info | grep '^URL:' | cut -d: -f2-
 elif [ -f .git/svn/.metadata ]; then
   git svn info | grep 'URL:' | cut -d: -f2-
 elif [ -d .git ]; then
-  git remote -v | grep 'fetch' | awk '{ print $2 }'
+  git remote -v | grep 'fetch' | awk '{ print $2 }' | head -n1
 else
   exit 1;
 fi
diff --git a/utils/Misc/mergefunctions.clang.svn.patch b/utils/Misc/mergefunctions.clang.svn.patch
new file mode 100644
index 0000000..6e2f0f5
--- /dev/null
+++ b/utils/Misc/mergefunctions.clang.svn.patch
@@ -0,0 +1,14 @@
+Index: lib/CodeGen/BackendUtil.cpp
+===================================================================
+--- lib/CodeGen/BackendUtil.cpp	(revision 191330)
++++ lib/CodeGen/BackendUtil.cpp	(working copy)
+@@ -336,6 +336,9 @@
+       MPM->add(createStripSymbolsPass(true));
+   }
+ 
++  // Force MergeFunctions pass.
++  MPM->add(createMergeFunctionsPass());
++
+   PMBuilder.populateModulePassManager(*MPM);
+ }
+ 
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index 468ce1c..de24cde 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -97,7 +97,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodeGenTarget.h"
-#include "StringToOffsetTable.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/STLExtras.h"
@@ -110,8 +109,10 @@
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/StringMatcher.h"
+#include "llvm/TableGen/StringToOffsetTable.h"
 #include "llvm/TableGen/TableGenBackend.h"
 #include <cassert>
+#include <cctype>
 #include <map>
 #include <set>
 #include <sstream>
@@ -125,6 +126,13 @@ namespace {
 class AsmMatcherInfo;
 struct SubtargetFeatureInfo;
 
+// Register sets are used as keys in some second-order sets TableGen creates
+// when generating its data structures. This means that the order of two
+// RegisterSets can be seen in the outputted AsmMatcher tables occasionally, and
+// can even affect compiler output (at least seen in diagnostics produced when
+// all matches fail). So we use a type that sorts them consistently.
+typedef std::set<Record*, LessRecordByID> RegisterSet;
+
 class AsmMatcherEmitter {
   RecordKeeper &Records;
 public:
@@ -185,7 +193,7 @@ struct ClassInfo {
   std::string ParserMethod;
 
   /// For register classes, the records for all the registers in this class.
-  std::set<Record*> Registers;
+  RegisterSet Registers;
 
   /// For custom match classes, he diagnostic kind for when the predicate fails.
   std::string DiagnosticType;
@@ -213,11 +221,11 @@ public:
       if (!isRegisterClass() || !RHS.isRegisterClass())
         return false;
 
-      std::set<Record*> Tmp;
-      std::insert_iterator< std::set<Record*> > II(Tmp, Tmp.begin());
+      RegisterSet Tmp;
+      std::insert_iterator<RegisterSet> II(Tmp, Tmp.begin());
       std::set_intersection(Registers.begin(), Registers.end(),
                             RHS.Registers.begin(), RHS.Registers.end(),
-                            II);
+                            II, LessRecordByID());
 
       return !Tmp.empty();
     }
@@ -430,6 +438,9 @@ struct MatchableInfo {
   /// function.
   std::string ConversionFnKind;
 
+  /// If this instruction is deprecated in some form.
+  bool HasDeprecation;
+
   MatchableInfo(const CodeGenInstruction &CGI)
     : AsmVariantID(0), TheDef(CGI.TheDef), DefRec(&CGI),
       AsmString(CGI.AsmString) {
@@ -613,7 +624,7 @@ public:
   RegisterClassesTy RegisterClasses;
 
   /// Map of Predicate records to their subtarget information.
-  std::map<Record*, SubtargetFeatureInfo*> SubtargetFeatures;
+  std::map<Record*, SubtargetFeatureInfo*, LessRecordByID> SubtargetFeatures;
 
   /// Map of AsmOperandClass records to their class information.
   std::map<Record*, ClassInfo*> AsmOperandClasses;
@@ -663,7 +674,7 @@ public:
   /// given operand.
   SubtargetFeatureInfo *getSubtargetFeature(Record *Def) const {
     assert(Def->isSubClassOf("Predicate") && "Invalid predicate type!");
-    std::map<Record*, SubtargetFeatureInfo*>::const_iterator I =
+    std::map<Record*, SubtargetFeatureInfo*, LessRecordByID>::const_iterator I =
       SubtargetFeatures.find(Def);
     return I == SubtargetFeatures.end() ? 0 : I->second;
   }
@@ -779,6 +790,13 @@ void MatchableInfo::initialize(const AsmMatcherInfo &Info,
     if (Record *Reg = AsmOperands[i].SingletonReg)
       SingletonRegisters.insert(Reg);
   }
+
+  const RecordVal *DepMask = TheDef->getValue("DeprecatedFeatureMask");
+  if (!DepMask)
+    DepMask = TheDef->getValue("ComplexDeprecationPredicate");
+
+  HasDeprecation =
+      DepMask ? !DepMask->getValue()->getAsUnquotedString().empty() : false;
 }
 
 /// tokenizeAsmString - Tokenize a simplified assembly string.
@@ -1047,6 +1065,18 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
   PrintFatalError(Rec->getLoc(), "operand has no match class!");
 }
 
+struct LessRegisterSet {
+  bool operator() (const RegisterSet &LHS, const RegisterSet & RHS) const {
+    // std::set<T> defines its own compariso "operator<", but it
+    // performs a lexicographical comparison by T's innate comparison
+    // for some reason. We don't want non-deterministic pointer
+    // comparisons so use this instead.
+    return std::lexicographical_compare(LHS.begin(), LHS.end(),
+                                        RHS.begin(), RHS.end(),
+                                        LessRecordByID());
+  }
+};
+
 void AsmMatcherInfo::
 buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
   const std::vector<CodeGenRegister*> &Registers =
@@ -1054,33 +1084,35 @@ buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
   ArrayRef<CodeGenRegisterClass*> RegClassList =
     Target.getRegBank().getRegClasses();
 
+  typedef std::set<RegisterSet, LessRegisterSet> RegisterSetSet;
+
   // The register sets used for matching.
-  std::set< std::set<Record*> > RegisterSets;
+  RegisterSetSet RegisterSets;
 
   // Gather the defined sets.
   for (ArrayRef<CodeGenRegisterClass*>::const_iterator it =
-       RegClassList.begin(), ie = RegClassList.end(); it != ie; ++it)
-    RegisterSets.insert(std::set<Record*>(
+         RegClassList.begin(), ie = RegClassList.end(); it != ie; ++it)
+    RegisterSets.insert(RegisterSet(
         (*it)->getOrder().begin(), (*it)->getOrder().end()));
 
   // Add any required singleton sets.
   for (SmallPtrSet<Record*, 16>::iterator it = SingletonRegisters.begin(),
        ie = SingletonRegisters.end(); it != ie; ++it) {
     Record *Rec = *it;
-    RegisterSets.insert(std::set<Record*>(&Rec, &Rec + 1));
+    RegisterSets.insert(RegisterSet(&Rec, &Rec + 1));
   }
 
   // Introduce derived sets where necessary (when a register does not determine
   // a unique register set class), and build the mapping of registers to the set
   // they should classify to.
-  std::map<Record*, std::set<Record*> > RegisterMap;
+  std::map<Record*, RegisterSet> RegisterMap;
   for (std::vector<CodeGenRegister*>::const_iterator it = Registers.begin(),
          ie = Registers.end(); it != ie; ++it) {
     const CodeGenRegister &CGR = **it;
     // Compute the intersection of all sets containing this register.
-    std::set<Record*> ContainingSet;
+    RegisterSet ContainingSet;
 
-    for (std::set< std::set<Record*> >::iterator it = RegisterSets.begin(),
+    for (RegisterSetSet::iterator it = RegisterSets.begin(),
            ie = RegisterSets.end(); it != ie; ++it) {
       if (!it->count(CGR.TheDef))
         continue;
@@ -1090,11 +1122,12 @@ buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
         continue;
       }
 
-      std::set<Record*> Tmp;
+      RegisterSet Tmp;
       std::swap(Tmp, ContainingSet);
-      std::insert_iterator< std::set<Record*> > II(ContainingSet,
-                                                   ContainingSet.begin());
-      std::set_intersection(Tmp.begin(), Tmp.end(), it->begin(), it->end(), II);
+      std::insert_iterator<RegisterSet> II(ContainingSet,
+                                           ContainingSet.begin());
+      std::set_intersection(Tmp.begin(), Tmp.end(), it->begin(), it->end(), II,
+                            LessRecordByID());
     }
 
     if (!ContainingSet.empty()) {
@@ -1104,9 +1137,9 @@ buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
   }
 
   // Construct the register classes.
-  std::map<std::set<Record*>, ClassInfo*> RegisterSetClasses;
+  std::map<RegisterSet, ClassInfo*, LessRegisterSet> RegisterSetClasses;
   unsigned Index = 0;
-  for (std::set< std::set<Record*> >::iterator it = RegisterSets.begin(),
+  for (RegisterSetSet::iterator it = RegisterSets.begin(),
          ie = RegisterSets.end(); it != ie; ++it, ++Index) {
     ClassInfo *CI = new ClassInfo();
     CI->Kind = ClassInfo::RegisterClass0 + Index;
@@ -1124,13 +1157,14 @@ buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
 
   // Find the superclasses; we could compute only the subgroup lattice edges,
   // but there isn't really a point.
-  for (std::set< std::set<Record*> >::iterator it = RegisterSets.begin(),
+  for (RegisterSetSet::iterator it = RegisterSets.begin(),
          ie = RegisterSets.end(); it != ie; ++it) {
     ClassInfo *CI = RegisterSetClasses[*it];
-    for (std::set< std::set<Record*> >::iterator it2 = RegisterSets.begin(),
+    for (RegisterSetSet::iterator it2 = RegisterSets.begin(),
            ie2 = RegisterSets.end(); it2 != ie2; ++it2)
       if (*it != *it2 &&
-          std::includes(it2->begin(), it2->end(), it->begin(), it->end()))
+          std::includes(it2->begin(), it2->end(), it->begin(), it->end(),
+                        LessRecordByID()))
         CI->SuperClasses.push_back(RegisterSetClasses[*it2]);
   }
 
@@ -1142,8 +1176,8 @@ buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
     Record *Def = RC.getDef();
     if (!Def)
       continue;
-    ClassInfo *CI = RegisterSetClasses[std::set<Record*>(RC.getOrder().begin(),
-                                                         RC.getOrder().end())];
+    ClassInfo *CI = RegisterSetClasses[RegisterSet(RC.getOrder().begin(),
+                                                   RC.getOrder().end())];
     if (CI->ValueName.empty()) {
       CI->ClassName = RC.getName();
       CI->Name = "MCK_" + RC.getName();
@@ -1155,7 +1189,7 @@ buildRegisterClasses(SmallPtrSet<Record*, 16> &SingletonRegisters) {
   }
 
   // Populate the map for individual registers.
-  for (std::map<Record*, std::set<Record*> >::iterator it = RegisterMap.begin(),
+  for (std::map<Record*, RegisterSet>::iterator it = RegisterMap.begin(),
          ie = RegisterMap.end(); it != ie; ++it)
     RegisterClasses[it->first] = RegisterSetClasses[it->second];
 
@@ -2179,7 +2213,7 @@ static void emitSubtargetFeatureFlagEnumeration(AsmMatcherInfo &Info,
   OS << "// Flags for subtarget features that participate in "
      << "instruction matching.\n";
   OS << "enum SubtargetFeatureFlag {\n";
-  for (std::map<Record*, SubtargetFeatureInfo*>::const_iterator
+  for (std::map<Record*, SubtargetFeatureInfo*, LessRecordByID>::const_iterator
          it = Info.SubtargetFeatures.begin(),
          ie = Info.SubtargetFeatures.end(); it != ie; ++it) {
     SubtargetFeatureInfo &SFI = *it->second;
@@ -2217,9 +2251,9 @@ static void emitGetSubtargetFeatureName(AsmMatcherInfo &Info, raw_ostream &OS) {
      << "static const char *getSubtargetFeatureName(unsigned Val) {\n";
   if (!Info.SubtargetFeatures.empty()) {
     OS << "  switch(Val) {\n";
-    for (std::map<Record*, SubtargetFeatureInfo*>::const_iterator
-           it = Info.SubtargetFeatures.begin(),
-           ie = Info.SubtargetFeatures.end(); it != ie; ++it) {
+    typedef std::map<Record*, SubtargetFeatureInfo*, LessRecordByID> RecFeatMap;
+    for (RecFeatMap::const_iterator it = Info.SubtargetFeatures.begin(),
+             ie = Info.SubtargetFeatures.end(); it != ie; ++it) {
       SubtargetFeatureInfo &SFI = *it->second;
       // FIXME: Totally just a placeholder name to get the algorithm working.
       OS << "  case " << SFI.getEnumName() << ": return \""
@@ -2244,7 +2278,7 @@ static void emitComputeAvailableFeatures(AsmMatcherInfo &Info,
   OS << "unsigned " << Info.Target.getName() << ClassName << "::\n"
      << "ComputeAvailableFeatures(uint64_t FB) const {\n";
   OS << "  unsigned Features = 0;\n";
-  for (std::map<Record*, SubtargetFeatureInfo*>::const_iterator
+  for (std::map<Record*, SubtargetFeatureInfo*, LessRecordByID>::const_iterator
          it = Info.SubtargetFeatures.begin(),
          ie = Info.SubtargetFeatures.end(); it != ie; ++it) {
     SubtargetFeatureInfo &SFI = *it->second;
@@ -2743,11 +2777,13 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
 
   size_t MaxNumOperands = 0;
   unsigned MaxMnemonicIndex = 0;
+  bool HasDeprecation = false;
   for (std::vector<MatchableInfo*>::const_iterator it =
          Info.Matchables.begin(), ie = Info.Matchables.end();
        it != ie; ++it) {
     MatchableInfo &II = **it;
     MaxNumOperands = std::max(MaxNumOperands, II.AsmOperands.size());
+    HasDeprecation |= II.HasDeprecation;
 
     // Store a pascal-style length byte in the mnemonic.
     std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.str();
@@ -2804,9 +2840,6 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   unsigned VariantCount = Target.getAsmParserVariantCount();
   for (unsigned VC = 0; VC != VariantCount; ++VC) {
     Record *AsmVariant = Target.getAsmParserVariant(VC);
-    std::string CommentDelimiter =
-      AsmVariant->getValueAsString("CommentDelimiter");
-    std::string RegisterPrefix = AsmVariant->getValueAsString("RegisterPrefix");
     int AsmVariantNo = AsmVariant->getValueAsInt("Variant");
 
     OS << "static const MatchEntry MatchTable" << VC << "[] = {\n";
@@ -2857,9 +2890,6 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   OS << "  default: // unreachable\n";
   for (unsigned VC = 0; VC != VariantCount; ++VC) {
     Record *AsmVariant = Target.getAsmParserVariant(VC);
-    std::string CommentDelimiter =
-      AsmVariant->getValueAsString("CommentDelimiter");
-    std::string RegisterPrefix = AsmVariant->getValueAsString("RegisterPrefix");
     int AsmVariantNo = AsmVariant->getValueAsInt("Variant");
     OS << "  case " << AsmVariantNo << ": Start = MatchTable" << VC
        << "; End = array_endof(MatchTable" << VC << "); break;\n";
@@ -2915,9 +2945,6 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   OS << "  default: // unreachable\n";
   for (unsigned VC = 0; VC != VariantCount; ++VC) {
     Record *AsmVariant = Target.getAsmParserVariant(VC);
-    std::string CommentDelimiter =
-      AsmVariant->getValueAsString("CommentDelimiter");
-    std::string RegisterPrefix = AsmVariant->getValueAsString("RegisterPrefix");
     int AsmVariantNo = AsmVariant->getValueAsInt("Variant");
     OS << "  case " << AsmVariantNo << ": Start = MatchTable" << VC
        << "; End = array_endof(MatchTable" << VC << "); break;\n";
@@ -3018,6 +3045,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   if (!InsnCleanupFn.empty())
     OS << "    " << InsnCleanupFn << "(Inst);\n";
 
+  if (HasDeprecation) {
+    OS << "    std::string Info;\n";
+    OS << "    if (MII.get(Inst.getOpcode()).getDeprecatedInfo(Inst, STI, Info)) {\n";
+    OS << "      SMLoc Loc = ((" << Target.getName() << "Operand*)Operands[0])->getStartLoc();\n";
+    OS << "      Parser.Warning(Loc, Info, None);\n";
+    OS << "    }\n";
+  }
+
   OS << "    return Match_Success;\n";
   OS << "  }\n\n";
 
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index ac8d896..a18b6b5 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
@@ -31,10 +32,12 @@ using namespace llvm;
 namespace {
 class AsmWriterEmitter {
   RecordKeeper &Records;
+  CodeGenTarget Target;
   std::map<const CodeGenInstruction*, AsmWriterInst*> CGIAWIMap;
   std::vector<const CodeGenInstruction*> NumberedInstructions;
+  std::vector<AsmWriterInst> Instructions;
 public:
-  AsmWriterEmitter(RecordKeeper &R) : Records(R) {}
+  AsmWriterEmitter(RecordKeeper &R);
 
   void run(raw_ostream &o);
 
@@ -272,9 +275,9 @@ static void UnescapeString(std::string &Str) {
 }
 
 /// EmitPrintInstruction - Generate the code for the "printInstruction" method
-/// implementation.
+/// implementation. Destroys all instances of AsmWriterInst information, by
+/// clearing the Instructions vector.
 void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
-  CodeGenTarget Target(Records);
   Record *AsmWriter = Target.getAsmWriter();
   std::string ClassName = AsmWriter->getValueAsString("AsmWriterClassName");
   bool isMC = AsmWriter->getValueAsBit("isMCAsmWriter");
@@ -287,27 +290,6 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
             << "::printInstruction(const " << MachineInstrClassName
             << " *MI, raw_ostream &O) {\n";
 
-  std::vector<AsmWriterInst> Instructions;
-
-  for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
-         E = Target.inst_end(); I != E; ++I)
-    if (!(*I)->AsmString.empty() &&
-        (*I)->TheDef->getName() != "PHI")
-      Instructions.push_back(
-        AsmWriterInst(**I,
-                      AsmWriter->getValueAsInt("Variant"),
-                      AsmWriter->getValueAsInt("FirstOperandColumn"),
-                      AsmWriter->getValueAsInt("OperandSpacing")));
-
-  // Get the instruction numbering.
-  NumberedInstructions = Target.getInstructionsByEnumValue();
-
-  // Compute the CodeGenInstruction -> AsmWriterInst mapping.  Note that not
-  // all machine instructions are necessarily being printed, so there may be
-  // target instructions not in this map.
-  for (unsigned i = 0, e = Instructions.size(); i != e; ++i)
-    CGIAWIMap.insert(std::make_pair(Instructions[i].CGI, &Instructions[i]));
-
   // Build an aggregate string, and build a table of offsets into it.
   SequenceToOffsetTable<std::string> StringTable;
 
@@ -591,7 +573,6 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName,
 }
 
 void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) {
-  CodeGenTarget Target(Records);
   Record *AsmWriter = Target.getAsmWriter();
   std::string ClassName = AsmWriter->getValueAsString("AsmWriterClassName");
   const std::vector<CodeGenRegister*> &Registers =
@@ -657,7 +638,10 @@ public:
 
   void addCond(const std::string &C) { Conds.push_back(C); }
 
-  void addOperand(StringRef Op, unsigned Idx) { OpMap[Op] = Idx; }
+  void addOperand(StringRef Op, unsigned Idx) {
+    assert(Idx < 0xFF && "Index too large!");
+    OpMap[Op] = Idx;
+  }
   unsigned getOpIndex(StringRef Op) { return OpMap[Op]; }
   bool isOpMapped(StringRef Op) { return OpMap.find(Op) != OpMap.end(); }
 
@@ -681,12 +665,35 @@ public:
 
     O << ") {\n";
     O.indent(6) << "// " << Result << "\n";
-    O.indent(6) << "AsmString = \"" << AsmString << "\";\n";
 
-    for (std::map<StringRef, unsigned>::iterator
-           I = OpMap.begin(), E = OpMap.end(); I != E; ++I)
-      O.indent(6) << "OpMap.push_back(std::make_pair(\"" << I->first << "\", "
-                  << I->second << "));\n";
+    // Directly mangle mapped operands into the string. Each operand is
+    // identified by a '$' sign followed by a byte identifying the number of the
+    // operand. We add one to the index to avoid zero bytes.
+    std::pair<StringRef, StringRef> ASM = StringRef(AsmString).split(' ');
+    SmallString<128> OutString = ASM.first;
+    if (!ASM.second.empty()) {
+      raw_svector_ostream OS(OutString);
+      OS << ' ';
+      for (StringRef::iterator I = ASM.second.begin(), E = ASM.second.end();
+           I != E;) {
+        OS << *I;
+        if (*I == '$') {
+          StringRef::iterator Start = ++I;
+          while (I != E &&
+                 ((*I >= 'a' && *I <= 'z') || (*I >= 'A' && *I <= 'Z') ||
+                  (*I >= '0' && *I <= '9') || *I == '_'))
+            ++I;
+          StringRef Name(Start, I - Start);
+          assert(isOpMapped(Name) && "Unmapped operand!");
+          OS << format("\\x%02X", (unsigned char)getOpIndex(Name) + 1);
+        } else {
+          ++I;
+        }
+      }
+    }
+
+    // Emit the string.
+    O.indent(6) << "AsmString = \"" << OutString.str() << "\";\n";
 
     O.indent(6) << "break;\n";
     O.indent(4) << '}';
@@ -721,19 +728,6 @@ public:
 
 } // end anonymous namespace
 
-static void EmitGetMapOperandNumber(raw_ostream &O) {
-  O << "static unsigned getMapOperandNumber("
-    << "const SmallVectorImpl<std::pair<StringRef, unsigned> > &OpMap,\n";
-  O << "                                    StringRef Name) {\n";
-  O << "  for (SmallVectorImpl<std::pair<StringRef, unsigned> >::"
-    << "const_iterator\n";
-  O << "         I = OpMap.begin(), E = OpMap.end(); I != E; ++I)\n";
-  O << "    if (I->first == Name)\n";
-  O << "      return I->second;\n";
-  O << "  llvm_unreachable(\"Operand not in map!\");\n";
-  O << "}\n\n";
-}
-
 static unsigned CountNumOperands(StringRef AsmString) {
   unsigned NumOps = 0;
   std::pair<StringRef, StringRef> ASM = AsmString.split(' ');
@@ -768,7 +762,6 @@ static unsigned CountResultNumOperands(StringRef AsmString) {
 }
 
 void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
-  CodeGenTarget Target(Records);
   Record *AsmWriter = Target.getAsmWriter();
 
   if (!AsmWriter->getValueAsBit("isMCAsmWriter"))
@@ -822,7 +815,6 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
       Cond = std::string("MI->getNumOperands() == ") + llvm::utostr(LastOpNo);
       IAP->addCond(Cond);
 
-      std::map<StringRef, unsigned> OpMap;
       bool CantHandle = false;
 
       for (unsigned i = 0, e = LastOpNo; i != e; ++i) {
@@ -955,11 +947,8 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
     return;
   }
 
-  EmitGetMapOperandNumber(O);
-
   O << HeaderO.str();
-  O.indent(2) << "StringRef AsmString;\n";
-  O.indent(2) << "SmallVector<std::pair<StringRef, unsigned>, 4> OpMap;\n";
+  O.indent(2) << "const char *AsmString;\n";
   O.indent(2) << "switch (MI->getOpcode()) {\n";
   O.indent(2) << "default: return false;\n";
   O << CasesO.str();
@@ -967,27 +956,21 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
 
   // Code that prints the alias, replacing the operands with the ones from the
   // MCInst.
-  O << "  std::pair<StringRef, StringRef> ASM = AsmString.split(' ');\n";
-  O << "  OS << '\\t' << ASM.first;\n";
+  O << "  unsigned I = 0;\n";
+  O << "  while (AsmString[I] != ' ' && AsmString[I] != '\\0')\n";
+  O << "    ++I;\n";
+  O << "  OS << '\\t' << StringRef(AsmString, I);\n";
 
-  O << "  if (!ASM.second.empty()) {\n";
+  O << "  if (AsmString[I] != '\\0') {\n";
   O << "    OS << '\\t';\n";
-  O << "    for (StringRef::iterator\n";
-  O << "         I = ASM.second.begin(), E = ASM.second.end(); I != E; ) {\n";
-  O << "      if (*I == '$') {\n";
-  O << "        StringRef::iterator Start = ++I;\n";
-  O << "        while (I != E &&\n";
-  O << "               ((*I >= 'a' && *I <= 'z') ||\n";
-  O << "                (*I >= 'A' && *I <= 'Z') ||\n";
-  O << "                (*I >= '0' && *I <= '9') ||\n";
-  O << "                *I == '_'))\n";
-  O << "          ++I;\n";
-  O << "        StringRef Name(Start, I - Start);\n";
-  O << "        printOperand(MI, getMapOperandNumber(OpMap, Name), OS);\n";
+  O << "    do {\n";
+  O << "      if (AsmString[I] == '$') {\n";
+  O << "        ++I;\n";
+  O << "        printOperand(MI, unsigned(AsmString[I++]) - 1, OS);\n";
   O << "      } else {\n";
-  O << "        OS << *I++;\n";
+  O << "        OS << AsmString[I++];\n";
   O << "      }\n";
-  O << "    }\n";
+  O << "    } while (AsmString[I] != '\\0');\n";
   O << "  }\n\n";
 
   O << "  return true;\n";
@@ -996,6 +979,27 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
   O << "#endif // PRINT_ALIAS_INSTR\n";
 }
 
+AsmWriterEmitter::AsmWriterEmitter(RecordKeeper &R) : Records(R), Target(R) {
+  Record *AsmWriter = Target.getAsmWriter();
+  for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
+                                    E = Target.inst_end();
+       I != E; ++I)
+    if (!(*I)->AsmString.empty() && (*I)->TheDef->getName() != "PHI")
+      Instructions.push_back(
+          AsmWriterInst(**I, AsmWriter->getValueAsInt("Variant"),
+                        AsmWriter->getValueAsInt("FirstOperandColumn"),
+                        AsmWriter->getValueAsInt("OperandSpacing")));
+
+  // Get the instruction numbering.
+  NumberedInstructions = Target.getInstructionsByEnumValue();
+
+  // Compute the CodeGenInstruction -> AsmWriterInst mapping.  Note that not
+  // all machine instructions are necessarily being printed, so there may be
+  // target instructions not in this map.
+  for (unsigned i = 0, e = Instructions.size(); i != e; ++i)
+    CGIAWIMap.insert(std::make_pair(Instructions[i].CGI, &Instructions[i]));
+}
+
 void AsmWriterEmitter::run(raw_ostream &O) {
   EmitPrintInstruction(O);
   EmitGetRegisterName(O);
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index ee025a0..717090a 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -30,16 +30,16 @@ using namespace llvm;
 //===----------------------------------------------------------------------===//
 
 static inline bool isInteger(MVT::SimpleValueType VT) {
-  return EVT(VT).isInteger();
+  return MVT(VT).isInteger();
 }
 static inline bool isFloatingPoint(MVT::SimpleValueType VT) {
-  return EVT(VT).isFloatingPoint();
+  return MVT(VT).isFloatingPoint();
 }
 static inline bool isVector(MVT::SimpleValueType VT) {
-  return EVT(VT).isVector();
+  return MVT(VT).isVector();
 }
 static inline bool isScalar(MVT::SimpleValueType VT) {
-  return !EVT(VT).isVector();
+  return !MVT(VT).isVector();
 }
 
 EEVT::TypeSet::TypeSet(MVT::SimpleValueType VT, TreePattern &TP) {
@@ -385,8 +385,8 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
     // Otherwise, if these are both vector types, either this vector
     // must have a larger bitsize than the other, or this element type
     // must be larger than the other.
-    EVT Type(TypeVec[0]);
-    EVT OtherType(Other.TypeVec[0]);
+    MVT Type(TypeVec[0]);
+    MVT OtherType(Other.TypeVec[0]);
 
     if (hasVectorTypes() && Other.hasVectorTypes()) {
       if (Type.getSizeInBits() >= OtherType.getSizeInBits())
@@ -397,8 +397,7 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
                    Other.getName() +"'!");
           return false;
         }
-    }
-    else
+    } else
       // For scalar types, the bitsize of this type must be larger
       // than that of the other.
       if (Type.getSizeInBits() >= OtherType.getSizeInBits()) {
@@ -450,8 +449,7 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
         MadeChange = true;
         continue;
       }
-    }
-    else if (isFloatingPoint(*TVI)) {
+    } else if (isFloatingPoint(*TVI)) {
       ++OtherFPSize;
       if (*TVI == SmallestFP) {
         TVI = Other.TypeVec.erase(TVI);
@@ -465,8 +463,8 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
 
   // If this is the only type in the large set, the constraint can never be
   // satisfied.
-  if ((Other.hasIntegerTypes() && OtherIntSize == 0)
-      || (Other.hasFloatingPointTypes() && OtherFPSize == 0)) {
+  if ((Other.hasIntegerTypes() && OtherIntSize == 0) ||
+      (Other.hasFloatingPointTypes() && OtherFPSize == 0)) {
     TP.error("Type inference contradiction found, '" +
              Other.getName() + "' has nothing larger than '" + getName() +"'!");
     return false;
@@ -508,8 +506,7 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
         MadeChange = true;
         continue;
       }
-    }
-    else if (isFloatingPoint(*TVI)) {
+    } else if (isFloatingPoint(*TVI)) {
       ++FPSize;
       if (*TVI == LargestFP) {
         TVI = TypeVec.erase(TVI);
@@ -523,8 +520,8 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
 
   // If this is the only type in the small set, the constraint can never be
   // satisfied.
-  if ((hasIntegerTypes() && IntSize == 0)
-      || (hasFloatingPointTypes() && FPSize == 0)) {
+  if ((hasIntegerTypes() && IntSize == 0) ||
+      (hasFloatingPointTypes() && FPSize == 0)) {
     TP.error("Type inference contradiction found, '" +
              getName() + "' has nothing smaller than '" + Other.getName()+"'!");
     return false;
@@ -547,10 +544,10 @@ bool EEVT::TypeSet::EnforceVectorEltTypeIs(EEVT::TypeSet &VTOperand,
 
   // If we know the vector type, it forces the scalar to agree.
   if (isConcrete()) {
-    EVT IVT = getConcrete();
+    MVT IVT = getConcrete();
     IVT = IVT.getVectorElementType();
     return MadeChange |
-      VTOperand.MergeInTypeInfo(IVT.getSimpleVT().SimpleTy, TP);
+      VTOperand.MergeInTypeInfo(IVT.SimpleTy, TP);
   }
 
   // If the scalar type is known, filter out vector types whose element types
@@ -565,7 +562,7 @@ bool EEVT::TypeSet::EnforceVectorEltTypeIs(EEVT::TypeSet &VTOperand,
   // Filter out all the types which don't have the right element type.
   for (unsigned i = 0; i != TypeVec.size(); ++i) {
     assert(isVector(TypeVec[i]) && "EnforceVector didn't work");
-    if (EVT(TypeVec[i]).getVectorElementType().getSimpleVT().SimpleTy != VT) {
+    if (MVT(TypeVec[i]).getVectorElementType().SimpleTy != VT) {
       TypeVec.erase(TypeVec.begin()+i--);
       MadeChange = true;
     }
@@ -593,16 +590,16 @@ bool EEVT::TypeSet::EnforceVectorSubVectorTypeIs(EEVT::TypeSet &VTOperand,
 
   // If we know the vector type, it forces the scalar types to agree.
   if (isConcrete()) {
-    EVT IVT = getConcrete();
+    MVT IVT = getConcrete();
     IVT = IVT.getVectorElementType();
 
-    EEVT::TypeSet EltTypeSet(IVT.getSimpleVT().SimpleTy, TP);
+    EEVT::TypeSet EltTypeSet(IVT.SimpleTy, TP);
     MadeChange |= VTOperand.EnforceVectorEltTypeIs(EltTypeSet, TP);
   } else if (VTOperand.isConcrete()) {
-    EVT IVT = VTOperand.getConcrete();
+    MVT IVT = VTOperand.getConcrete();
     IVT = IVT.getVectorElementType();
 
-    EEVT::TypeSet EltTypeSet(IVT.getSimpleVT().SimpleTy, TP);
+    EEVT::TypeSet EltTypeSet(IVT.SimpleTy, TP);
     MadeChange |= EnforceVectorEltTypeIs(EltTypeSet, TP);
   }
 
@@ -1522,7 +1519,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
       if (VT == MVT::iPTR || VT == MVT::iPTRAny)
         return MadeChange;
 
-      unsigned Size = EVT(VT).getSizeInBits();
+      unsigned Size = MVT(VT).getSizeInBits();
       // Make sure that the value is representable for this type.
       if (Size >= 32) return MadeChange;
 
@@ -2678,54 +2675,13 @@ static bool checkOperandClass(CGIOperandList::OperandInfo &OI,
   return false;
 }
 
-/// ParseInstructions - Parse all of the instructions, inlining and resolving
-/// any fragments involved.  This populates the Instructions list with fully
-/// resolved instructions.
-void CodeGenDAGPatterns::ParseInstructions() {
-  std::vector<Record*> Instrs = Records.getAllDerivedDefinitions("Instruction");
-
-  for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
-    ListInit *LI = 0;
+const DAGInstruction &CodeGenDAGPatterns::parseInstructionPattern(
+    CodeGenInstruction &CGI, ListInit *Pat, DAGInstMap &DAGInsts) {
 
-    if (isa<ListInit>(Instrs[i]->getValueInit("Pattern")))
-      LI = Instrs[i]->getValueAsListInit("Pattern");
-
-    // If there is no pattern, only collect minimal information about the
-    // instruction for its operand list.  We have to assume that there is one
-    // result, as we have no detailed info. A pattern which references the
-    // null_frag operator is as-if no pattern were specified. Normally this
-    // is from a multiclass expansion w/ a SDPatternOperator passed in as
-    // null_frag.
-    if (!LI || LI->getSize() == 0 || hasNullFragReference(LI)) {
-      std::vector<Record*> Results;
-      std::vector<Record*> Operands;
-
-      CodeGenInstruction &InstInfo = Target.getInstruction(Instrs[i]);
-
-      if (InstInfo.Operands.size() != 0) {
-        if (InstInfo.Operands.NumDefs == 0) {
-          // These produce no results
-          for (unsigned j = 0, e = InstInfo.Operands.size(); j < e; ++j)
-            Operands.push_back(InstInfo.Operands[j].Rec);
-        } else {
-          // Assume the first operand is the result.
-          Results.push_back(InstInfo.Operands[0].Rec);
-
-          // The rest are inputs.
-          for (unsigned j = 1, e = InstInfo.Operands.size(); j < e; ++j)
-            Operands.push_back(InstInfo.Operands[j].Rec);
-        }
-      }
-
-      // Create and insert the instruction.
-      std::vector<Record*> ImpResults;
-      Instructions.insert(std::make_pair(Instrs[i],
-                          DAGInstruction(0, Results, Operands, ImpResults)));
-      continue;  // no pattern.
-    }
+    assert(!DAGInsts.count(CGI.TheDef) && "Instruction already parsed!");
 
     // Parse the instruction.
-    TreePattern *I = new TreePattern(Instrs[i], LI, true, *this);
+    TreePattern *I = new TreePattern(CGI.TheDef, Pat, true, *this);
     // Inline pattern fragments into it.
     I->InlinePatternFragments();
 
@@ -2764,7 +2720,6 @@ void CodeGenDAGPatterns::ParseInstructions() {
 
     // Parse the operands list from the (ops) list, validating it.
     assert(I->getArgList().empty() && "Args list should still be empty here!");
-    CodeGenInstruction &CGI = Target.getInstruction(Instrs[i]);
 
     // Check that all of the results occur first in the list.
     std::vector<Record*> Results;
@@ -2863,18 +2818,71 @@ void CodeGenDAGPatterns::ParseInstructions() {
     // Create and insert the instruction.
     // FIXME: InstImpResults should not be part of DAGInstruction.
     DAGInstruction TheInst(I, Results, Operands, InstImpResults);
-    Instructions.insert(std::make_pair(I->getRecord(), TheInst));
+    DAGInsts.insert(std::make_pair(I->getRecord(), TheInst));
 
     // Use a temporary tree pattern to infer all types and make sure that the
     // constructed result is correct.  This depends on the instruction already
-    // being inserted into the Instructions map.
+    // being inserted into the DAGInsts map.
     TreePattern Temp(I->getRecord(), ResultPattern, false, *this);
     Temp.InferAllTypes(&I->getNamedNodesMap());
 
-    DAGInstruction &TheInsertedInst = Instructions.find(I->getRecord())->second;
+    DAGInstruction &TheInsertedInst = DAGInsts.find(I->getRecord())->second;
     TheInsertedInst.setResultPattern(Temp.getOnlyTree());
 
-    DEBUG(I->dump());
+    return TheInsertedInst;
+  }
+
+/// ParseInstructions - Parse all of the instructions, inlining and resolving
+/// any fragments involved.  This populates the Instructions list with fully
+/// resolved instructions.
+void CodeGenDAGPatterns::ParseInstructions() {
+  std::vector<Record*> Instrs = Records.getAllDerivedDefinitions("Instruction");
+
+  for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+    ListInit *LI = 0;
+
+    if (isa<ListInit>(Instrs[i]->getValueInit("Pattern")))
+      LI = Instrs[i]->getValueAsListInit("Pattern");
+
+    // If there is no pattern, only collect minimal information about the
+    // instruction for its operand list.  We have to assume that there is one
+    // result, as we have no detailed info. A pattern which references the
+    // null_frag operator is as-if no pattern were specified. Normally this
+    // is from a multiclass expansion w/ a SDPatternOperator passed in as
+    // null_frag.
+    if (!LI || LI->getSize() == 0 || hasNullFragReference(LI)) {
+      std::vector<Record*> Results;
+      std::vector<Record*> Operands;
+
+      CodeGenInstruction &InstInfo = Target.getInstruction(Instrs[i]);
+
+      if (InstInfo.Operands.size() != 0) {
+        if (InstInfo.Operands.NumDefs == 0) {
+          // These produce no results
+          for (unsigned j = 0, e = InstInfo.Operands.size(); j < e; ++j)
+            Operands.push_back(InstInfo.Operands[j].Rec);
+        } else {
+          // Assume the first operand is the result.
+          Results.push_back(InstInfo.Operands[0].Rec);
+
+          // The rest are inputs.
+          for (unsigned j = 1, e = InstInfo.Operands.size(); j < e; ++j)
+            Operands.push_back(InstInfo.Operands[j].Rec);
+        }
+      }
+
+      // Create and insert the instruction.
+      std::vector<Record*> ImpResults;
+      Instructions.insert(std::make_pair(Instrs[i],
+                          DAGInstruction(0, Results, Operands, ImpResults)));
+      continue;  // no pattern.
+    }
+
+    CodeGenInstruction &CGI = Target.getInstruction(Instrs[i]);
+    const DAGInstruction &DI = parseInstructionPattern(CGI, LI, Instructions);
+
+    (void)DI;
+    DEBUG(DI.getPattern()->dump());
   }
 
   // If we can, convert the instructions to be patterns that are matched!
diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h
index 7c2fa36..6fbdd4f 100644
--- a/utils/TableGen/CodeGenDAGPatterns.h
+++ b/utils/TableGen/CodeGenDAGPatterns.h
@@ -778,7 +778,11 @@ public:
   ptm_iterator ptm_begin() const { return PatternsToMatch.begin(); }
   ptm_iterator ptm_end() const { return PatternsToMatch.end(); }
 
-
+  /// Parse the Pattern for an instruction, and insert the result in DAGInsts.
+  typedef std::map<Record*, DAGInstruction, LessRecordByID> DAGInstMap;
+  const DAGInstruction &parseInstructionPattern(
+      CodeGenInstruction &CGI, ListInit *Pattern,
+      DAGInstMap &DAGInsts);
 
   const DAGInstruction &getInstruction(Record *R) const {
     assert(Instructions.count(R) && "Unknown instruction!");
diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp
index 3673204..576388b 100644
--- a/utils/TableGen/CodeGenInstruction.cpp
+++ b/utils/TableGen/CodeGenInstruction.cpp
@@ -90,7 +90,7 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
       if (unsigned NumArgs = MIOpInfo->getNumArgs())
         NumOps = NumArgs;
 
-      if (Rec->isSubClassOf("PredicateOperand"))
+      if (Rec->isSubClassOf("PredicateOp"))
         isPredicable = true;
       else if (Rec->isSubClassOf("OptionalDefOperand"))
         hasOptionalDef = true;
@@ -192,6 +192,7 @@ CGIOperandList::ParseOperandName(const std::string &Op, bool AllowWholeOp) {
 
   // Otherwise, didn't find it!
   PrintFatalError(TheDef->getName() + ": unknown suboperand name in '" + Op + "'");
+  return std::make_pair(0U, 0U);
 }
 
 static void ParseConstraint(const std::string &CStr, CGIOperandList &Ops) {
@@ -336,6 +337,20 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
 
   // Parse the DisableEncoding field.
   Operands.ProcessDisableEncoding(R->getValueAsString("DisableEncoding"));
+
+  // First check for a ComplexDeprecationPredicate.
+  if (R->getValue("ComplexDeprecationPredicate")) {
+    HasComplexDeprecationPredicate = true;
+    DeprecatedReason = R->getValueAsString("ComplexDeprecationPredicate");
+  } else if (RecordVal *Dep = R->getValue("DeprecatedFeatureMask")) {
+    // Check if we have a Subtarget feature mask.
+    HasComplexDeprecationPredicate = false;
+    DeprecatedReason = Dep->getValue()->getAsString();
+  } else {
+    // This instruction isn't deprecated.
+    HasComplexDeprecationPredicate = false;
+    DeprecatedReason = "";
+  }
 }
 
 /// HasOneImplicitDefWithKnownVT - If the instruction has at least one
diff --git a/utils/TableGen/CodeGenInstruction.h b/utils/TableGen/CodeGenInstruction.h
index d1e1153..6004f66 100644
--- a/utils/TableGen/CodeGenInstruction.h
+++ b/utils/TableGen/CodeGenInstruction.h
@@ -248,6 +248,9 @@ namespace llvm {
     bool isCodeGenOnly;
     bool isPseudo;
 
+    std::string DeprecatedReason;
+    bool HasComplexDeprecationPredicate;
+
     /// Are there any undefined flags?
     bool hasUndefFlags() const {
       return mayLoad_Unset || mayStore_Unset || hasSideEffects_Unset;
diff --git a/utils/TableGen/CodeGenMapTable.cpp b/utils/TableGen/CodeGenMapTable.cpp
index ee32aa1..cb7ec3e 100644
--- a/utils/TableGen/CodeGenMapTable.cpp
+++ b/utils/TableGen/CodeGenMapTable.cpp
@@ -240,7 +240,6 @@ public:
 
 void MapTableEmitter::buildRowInstrMap() {
   for (unsigned i = 0, e = InstrDefs.size(); i < e; i++) {
-    std::vector<Record*> InstrList;
     Record *CurInstr = InstrDefs[i];
     std::vector<Init*> KeyValue;
     ListInit *RowFields = InstrMapDesc.getRowFields();
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
index 43de2be..f2eef4f 100644
--- a/utils/TableGen/CodeGenRegisters.cpp
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -813,9 +813,10 @@ static bool testSubClass(const CodeGenRegisterClass *A,
 /// Register classes with the same registers, spill size, and alignment form a
 /// clique.  They will be ordered alphabetically.
 ///
-static int TopoOrderRC(const void *PA, const void *PB) {
-  const CodeGenRegisterClass *A = *(const CodeGenRegisterClass* const*)PA;
-  const CodeGenRegisterClass *B = *(const CodeGenRegisterClass* const*)PB;
+static int TopoOrderRC(CodeGenRegisterClass *const *PA,
+                       CodeGenRegisterClass *const *PB) {
+  const CodeGenRegisterClass *A = *PA;
+  const CodeGenRegisterClass *B = *PB;
   if (A == B)
     return 0;
 
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index 001e97d..dd06433 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -36,10 +36,11 @@ static void dumpIdxVec(const SmallVectorImpl<unsigned> &V) {
 }
 #endif
 
+namespace {
 // (instrs a, b, ...) Evaluate and union all arguments. Identical to AddOp.
 struct InstrsOp : public SetTheory::Operator {
-  void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
-             ArrayRef<SMLoc> Loc) {
+  virtual void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
+                     ArrayRef<SMLoc> Loc) {
     ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts, Loc);
   }
 };
@@ -84,6 +85,7 @@ struct InstRegexOp : public SetTheory::Operator {
     DeleteContainerPointers(RegexList);
   }
 };
+} // end anonymous namespace
 
 /// CodeGenModels ctor interprets machine model records and populates maps.
 CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK,
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index b2c883d..dd17059 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -75,6 +75,7 @@ std::string llvm::getEnumName(MVT::SimpleValueType T) {
   case MVT::v16i1:    return "MVT::v16i1";
   case MVT::v32i1:    return "MVT::v32i1";
   case MVT::v64i1:    return "MVT::v64i1";
+  case MVT::v1i8:     return "MVT::v1i8";
   case MVT::v2i8:     return "MVT::v2i8";
   case MVT::v4i8:     return "MVT::v4i8";
   case MVT::v8i8:     return "MVT::v8i8";
@@ -98,10 +99,14 @@ std::string llvm::getEnumName(MVT::SimpleValueType T) {
   case MVT::v8i64:    return "MVT::v8i64";
   case MVT::v16i64:   return "MVT::v16i64";
   case MVT::v2f16:    return "MVT::v2f16";
+  case MVT::v4f16:    return "MVT::v4f16";
+  case MVT::v8f16:    return "MVT::v8f16";
+  case MVT::v1f32:    return "MVT::v1f32";
   case MVT::v2f32:    return "MVT::v2f32";
   case MVT::v4f32:    return "MVT::v4f32";
   case MVT::v8f32:    return "MVT::v8f32";
   case MVT::v16f32:   return "MVT::v16f32";
+  case MVT::v1f64:    return "MVT::v1f64";
   case MVT::v2f64:    return "MVT::v2f64";
   case MVT::v4f64:    return "MVT::v4f64";
   case MVT::v8f64:    return "MVT::v8f64";
@@ -316,6 +321,8 @@ void CodeGenTarget::ComputeInstrsByEnum() const {
     "BUNDLE",
     "LIFETIME_START",
     "LIFETIME_END",
+    "STACKMAP",
+    "PATCHPOINT",
     0
   };
   const DenseMap<const Record*, CodeGenInstruction*> &Insts = getInstructions();
diff --git a/utils/TableGen/DAGISelEmitter.cpp b/utils/TableGen/DAGISelEmitter.cpp
index b47dd71..a76ea32 100644
--- a/utils/TableGen/DAGISelEmitter.cpp
+++ b/utils/TableGen/DAGISelEmitter.cpp
@@ -81,15 +81,13 @@ struct PatternSortingPredicate {
     const TreePatternNode *LHSSrc = LHS->getSrcPattern();
     const TreePatternNode *RHSSrc = RHS->getSrcPattern();
 
-    if (LHSSrc->getNumTypes() != 0 && RHSSrc->getNumTypes() != 0 &&
-        LHSSrc->getType(0) != RHSSrc->getType(0)) {
-      MVT::SimpleValueType V1 = LHSSrc->getType(0), V2 = RHSSrc->getType(0);
-      if (MVT(V1).isVector() != MVT(V2).isVector())
-        return MVT(V2).isVector();
-
-      if (MVT(V1).isFloatingPoint() != MVT(V2).isFloatingPoint())
-        return MVT(V2).isFloatingPoint();
-    }
+    MVT LHSVT = (LHSSrc->getNumTypes() != 0 ? LHSSrc->getType(0) : MVT::Other);
+    MVT RHSVT = (RHSSrc->getNumTypes() != 0 ? RHSSrc->getType(0) : MVT::Other);
+    if (LHSVT.isVector() != RHSVT.isVector())
+      return RHSVT.isVector();
+
+    if (LHSVT.isFloatingPoint() != RHSVT.isFloatingPoint())
+      return RHSVT.isFloatingPoint();
 
     // Otherwise, if the patterns might both match, sort based on complexity,
     // which means that we prefer to match patterns that cover more nodes in the
diff --git a/utils/TableGen/DAGISelMatcher.cpp b/utils/TableGen/DAGISelMatcher.cpp
index d173cf0..5d6a11a 100644
--- a/utils/TableGen/DAGISelMatcher.cpp
+++ b/utils/TableGen/DAGISelMatcher.cpp
@@ -63,7 +63,7 @@ bool Matcher::canMoveBefore(const Matcher *Other) const {
   }
 }
 
-/// canMoveBefore - Return true if it is safe to move the current matcher
+/// canMoveBeforeNode - Return true if it is safe to move the current matcher
 /// across the specified one.
 bool Matcher::canMoveBeforeNode(const Matcher *Other) const {
   // We can move simple predicates before record nodes.
@@ -134,6 +134,10 @@ void CheckSameMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
   OS.indent(indent) << "CheckSame " << MatchNumber << '\n';
 }
 
+void CheckChildSameMatcher::printImpl(raw_ostream &OS, unsigned indent) const {
+  OS.indent(indent) << "CheckChild" << ChildNo << "Same\n";
+}
+
 void CheckPatternPredicateMatcher::
 printImpl(raw_ostream &OS, unsigned indent) const {
   OS.indent(indent) << "CheckPatternPredicate " << Predicate << '\n';
diff --git a/utils/TableGen/DAGISelMatcher.h b/utils/TableGen/DAGISelMatcher.h
index f978188..70031fa 100644
--- a/utils/TableGen/DAGISelMatcher.h
+++ b/utils/TableGen/DAGISelMatcher.h
@@ -55,6 +55,7 @@ public:
 
     // Predicate checking.
     CheckSame,            // Fail if not same as prev match.
+    CheckChildSame,       // Fail if child not same as prev match.
     CheckPatternPredicate,
     CheckPredicate,       // Fail if node predicate fails.
     CheckOpcode,          // Fail if not opcode.
@@ -122,6 +123,7 @@ public:
     switch (getKind()) {
     default: return false;
     case CheckSame:
+    case CheckChildSame:
     case CheckPatternPredicate:
     case CheckPredicate:
     case CheckOpcode:
@@ -154,7 +156,7 @@ public:
   /// node.  Other must be equal to or before this.
   bool canMoveBefore(const Matcher *Other) const;
 
-  /// canMoveBefore - Return true if it is safe to move the current matcher
+  /// canMoveBeforeNode - Return true if it is safe to move the current matcher
   /// across the specified one.
   bool canMoveBeforeNode(const Matcher *Other) const;
 
@@ -392,6 +394,34 @@ private:
   virtual unsigned getHashImpl() const { return getMatchNumber(); }
 };
 
+/// CheckChildSameMatcher - This checks to see if child node is exactly the same
+/// node as the specified match that was recorded with 'Record'.  This is used
+/// when patterns have the same name in them, like '(mul GPR:$in, GPR:$in)'.
+class CheckChildSameMatcher : public Matcher {
+  unsigned ChildNo;
+  unsigned MatchNumber;
+public:
+  CheckChildSameMatcher(unsigned childno, unsigned matchnumber)
+    : Matcher(CheckChildSame), ChildNo(childno), MatchNumber(matchnumber) {}
+
+  unsigned getChildNo() const { return ChildNo; }
+  unsigned getMatchNumber() const { return MatchNumber; }
+
+  static inline bool classof(const Matcher *N) {
+    return N->getKind() == CheckChildSame;
+  }
+
+  virtual bool isSafeToReorderWithPatternPredicate() const { return true; }
+
+private:
+  virtual void printImpl(raw_ostream &OS, unsigned indent) const;
+  virtual bool isEqualImpl(const Matcher *M) const {
+    return cast<CheckChildSameMatcher>(M)->ChildNo == ChildNo &&
+           cast<CheckChildSameMatcher>(M)->MatchNumber == MatchNumber;
+  }
+  virtual unsigned getHashImpl() const { return (MatchNumber << 2) | ChildNo; }
+};
+
 /// CheckPatternPredicateMatcher - This checks the target-specific predicate
 /// to see if the entire pattern is capable of matching.  This predicate does
 /// not take a node as input.  This is used for subtarget feature checks etc.
diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp
index 93f84ce..04fe0d1 100644
--- a/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -242,6 +242,12 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
        << cast<CheckSameMatcher>(N)->getMatchNumber() << ",\n";
     return 2;
 
+  case Matcher::CheckChildSame:
+    OS << "OPC_CheckChild"
+       << cast<CheckChildSameMatcher>(N)->getChildNo() << "Same, "
+       << cast<CheckChildSameMatcher>(N)->getMatchNumber() << ",\n";
+    return 2;
+
   case Matcher::CheckPatternPredicate: {
     StringRef Pred =cast<CheckPatternPredicateMatcher>(N)->getPredicate();
     OS << "OPC_CheckPatternPredicate, " << getPatternPredicate(Pred) << ',';
@@ -315,10 +321,12 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
       assert(ChildSize != 0 && "Should not have a zero-sized child!");
 
       if (i != 0) {
+        if (!OmitComments)
+          OS << "/*" << CurrentIdx << "*/";
         OS.PadToColumn(Indent*2);
         if (!OmitComments)
-        OS << (isa<SwitchOpcodeMatcher>(N) ?
-                   "/*SwitchOpcode*/ " : "/*SwitchType*/ ");
+          OS << (isa<SwitchOpcodeMatcher>(N) ?
+                     "/*SwitchOpcode*/ " : "/*SwitchType*/ ");
       }
 
       // Emit the VBR.
@@ -340,6 +348,8 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
     }
 
     // Emit the final zero to terminate the switch.
+    if (!OmitComments)
+      OS << "/*" << CurrentIdx << "*/";
     OS.PadToColumn(Indent*2) << "0, ";
     if (!OmitComments)
       OS << (isa<SwitchOpcodeMatcher>(N) ?
@@ -749,6 +759,7 @@ void MatcherTableEmitter::EmitHistogram(const Matcher *M,
     case Matcher::MoveChild: OS << "OPC_MoveChild"; break;
     case Matcher::MoveParent: OS << "OPC_MoveParent"; break;
     case Matcher::CheckSame: OS << "OPC_CheckSame"; break;
+    case Matcher::CheckChildSame: OS << "OPC_CheckChildSame"; break;
     case Matcher::CheckPatternPredicate:
       OS << "OPC_CheckPatternPredicate"; break;
     case Matcher::CheckPredicate: OS << "OPC_CheckPredicate"; break;
diff --git a/utils/TableGen/DAGISelMatcherOpt.cpp b/utils/TableGen/DAGISelMatcherOpt.cpp
index f996422..82e5d63 100644
--- a/utils/TableGen/DAGISelMatcherOpt.cpp
+++ b/utils/TableGen/DAGISelMatcherOpt.cpp
@@ -51,7 +51,11 @@ static void ContractNodes(OwningPtr<Matcher> &MatcherPtr,
       if (MC->getChildNo() < 8 &&  // Only have CheckChildType0...7
           CT->getResNo() == 0)     // CheckChildType checks res #0
         New = new CheckChildTypeMatcher(MC->getChildNo(), CT->getType());
-    
+
+    if (CheckSameMatcher *CS = dyn_cast<CheckSameMatcher>(MC->getNext()))
+      if (MC->getChildNo() < 4)  // Only have CheckChildSame0...3
+        New = new CheckChildSameMatcher(MC->getChildNo(), CS->getMatchNumber());
+
     if (New) {
       // Insert the new node.
       New->setNext(MatcherPtr.take());
diff --git a/utils/TableGen/FastISelEmitter.cpp b/utils/TableGen/FastISelEmitter.cpp
index ce7f8c8..e35cf0f 100644
--- a/utils/TableGen/FastISelEmitter.cpp
+++ b/utils/TableGen/FastISelEmitter.cpp
@@ -46,7 +46,7 @@ class ImmPredicateSet {
   DenseMap<TreePattern *, unsigned> ImmIDs;
   std::vector<TreePredicateFn> PredsByName;
 public:
-  
+
   unsigned getIDFor(TreePredicateFn Pred) {
     unsigned &Entry = ImmIDs[Pred.getOrigPatFragRecord()];
     if (Entry == 0) {
@@ -55,16 +55,16 @@ public:
     }
     return Entry-1;
   }
-  
+
   const TreePredicateFn &getPredicate(unsigned i) {
     assert(i < PredsByName.size());
     return PredsByName[i];
   }
-  
+
   typedef std::vector<TreePredicateFn>::const_iterator iterator;
   iterator begin() const { return PredsByName.begin(); }
   iterator end() const { return PredsByName.end(); }
-  
+
 };
 } // End anonymous namespace
 
@@ -77,9 +77,9 @@ struct OperandsSignature {
     enum { OK_Reg, OK_FP, OK_Imm, OK_Invalid = -1 };
     char Repr;
   public:
-    
+
     OpKind() : Repr(OK_Invalid) {}
-    
+
     bool operator<(OpKind RHS) const { return Repr < RHS.Repr; }
     bool operator==(OpKind RHS) const { return Repr == RHS.Repr; }
 
@@ -90,13 +90,13 @@ struct OperandsSignature {
              "Too many integer predicates for the 'Repr' char");
       OpKind K; K.Repr = OK_Imm+V; return K;
     }
-    
+
     bool isReg() const { return Repr == OK_Reg; }
     bool isFP() const  { return Repr == OK_FP; }
     bool isImm() const { return Repr >= OK_Imm; }
-    
+
     unsigned getImmCode() const { assert(isImm()); return Repr-OK_Imm; }
-    
+
     void printManglingSuffix(raw_ostream &OS, ImmPredicateSet &ImmPredicates,
                              bool StripImmCodes) const {
       if (isReg())
@@ -111,8 +111,8 @@ struct OperandsSignature {
       }
     }
   };
-  
-  
+
+
   SmallVector<OpKind, 3> Operands;
 
   bool operator<(const OperandsSignature &O) const {
@@ -130,7 +130,7 @@ struct OperandsSignature {
         return true;
     return false;
   }
-  
+
   /// getWithoutImmCodes - Return a copy of this with any immediate codes forced
   /// to zero.
   OperandsSignature getWithoutImmCodes() const {
@@ -142,31 +142,31 @@ struct OperandsSignature {
         Result.Operands.push_back(OpKind::getImm(0));
     return Result;
   }
-  
+
   void emitImmediatePredicate(raw_ostream &OS, ImmPredicateSet &ImmPredicates) {
     bool EmittedAnything = false;
     for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
       if (!Operands[i].isImm()) continue;
-      
+
       unsigned Code = Operands[i].getImmCode();
       if (Code == 0) continue;
-      
+
       if (EmittedAnything)
         OS << " &&\n        ";
-      
+
       TreePredicateFn PredFn = ImmPredicates.getPredicate(Code-1);
-      
+
       // Emit the type check.
       OS << "VT == "
          << getEnumName(PredFn.getOrigPatFragRecord()->getTree(0)->getType(0))
          << " && ";
-      
-      
+
+
       OS << PredFn.getFnName() << "(imm" << i <<')';
       EmittedAnything = true;
     }
   }
-  
+
   /// initialize - Examine the given pattern and initialize the contents
   /// of the Operands array accordingly. Return true if all the operands
   /// are supported, false otherwise.
@@ -177,12 +177,12 @@ struct OperandsSignature {
                   const CodeGenRegisterClass *OrigDstRC) {
     if (InstPatNode->isLeaf())
       return false;
-    
+
     if (InstPatNode->getOperator()->getName() == "imm") {
       Operands.push_back(OpKind::getImm(0));
       return true;
     }
-    
+
     if (InstPatNode->getOperator()->getName() == "fpimm") {
       Operands.push_back(OpKind::getFP());
       return true;
@@ -211,19 +211,19 @@ struct OperandsSignature {
           Record *Rec = PredFn.getOrigPatFragRecord()->getRecord();
           if (Rec->getValueAsBit("FastIselShouldIgnore"))
             return false;
-        
+
           PredNo = ImmediatePredicates.getIDFor(PredFn)+1;
         }
-        
+
         // Handle unmatched immediate sizes here.
         //if (Op->getType(0) != VT)
         //  return false;
-        
+
         Operands.push_back(OpKind::getImm(PredNo));
         continue;
       }
 
-      
+
       // For now, filter out any operand with a predicate.
       // For now, filter out any operand with multiple values.
       if (!Op->getPredicateFns().empty() || Op->getNumTypes() != 1)
@@ -237,7 +237,7 @@ struct OperandsSignature {
         // For now, ignore other non-leaf nodes.
         return false;
       }
-      
+
       assert(Op->hasTypeSet(0) && "Type infererence not done?");
 
       // For now, all the operands must have the same type (if they aren't
@@ -250,7 +250,7 @@ struct OperandsSignature {
       if (!OpDI)
         return false;
       Record *OpLeafRec = OpDI->getDef();
-      
+
       // For now, the only other thing we accept is register operands.
       const CodeGenRegisterClass *RC = 0;
       if (OpLeafRec->isSubClassOf("RegisterOperand"))
@@ -375,7 +375,7 @@ class FastISelMap {
 
   std::map<OperandsSignature, std::vector<OperandsSignature> >
     SignaturesWithConstantForms;
-  
+
   std::string InstNS;
   ImmPredicateSet ImmediatePredicates;
 public:
@@ -551,13 +551,13 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
       SubRegNo,
       PhysRegInputs
     };
-    
+
     if (SimplePatterns[Operands][OpcodeName][VT][RetVT].count(PredicateCheck))
       PrintFatalError(Pattern.getSrcRecord()->getLoc(),
                     "Duplicate record in FastISel table!");
 
     SimplePatterns[Operands][OpcodeName][VT][RetVT][PredicateCheck] = Memo;
-    
+
     // If any of the operands were immediates with predicates on them, strip
     // them down to a signature that doesn't have predicates so that we can
     // associate them with the stripped predicate version.
@@ -571,14 +571,14 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
 void FastISelMap::printImmediatePredicates(raw_ostream &OS) {
   if (ImmediatePredicates.begin() == ImmediatePredicates.end())
     return;
-  
+
   OS << "\n// FastEmit Immediate Predicate functions.\n";
   for (ImmPredicateSet::iterator I = ImmediatePredicates.begin(),
        E = ImmediatePredicates.end(); I != E; ++I) {
     OS << "static bool " << I->getFnName() << "(int64_t Imm) {\n";
     OS << I->getImmediatePredicateCode() << "\n}\n";
   }
-  
+
   OS << "\n\n";
 }
 
@@ -804,11 +804,11 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
       OS << ", ";
     Operands.PrintParameters(OS);
     OS << ") {\n";
-    
-    // If there are any forms of this signature available that operand on
-    // constrained forms of the immediate (e.g. 32-bit sext immediate in a
+
+    // If there are any forms of this signature available that operate on
+    // constrained forms of the immediate (e.g., 32-bit sext immediate in a
     // 64-bit operand), check them first.
-    
+
     std::map<OperandsSignature, std::vector<OperandsSignature> >::iterator MI
       = SignaturesWithConstantForms.find(Operands);
     if (MI != SignaturesWithConstantForms.end()) {
@@ -816,7 +816,7 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
       std::sort(MI->second.begin(), MI->second.end());
       MI->second.erase(std::unique(MI->second.begin(), MI->second.end()),
                        MI->second.end());
-      
+
       // Check each in order it was seen.  It would be nice to have a good
       // relative ordering between them, but we're not going for optimality
       // here.
@@ -831,11 +831,11 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
         MI->second[i].PrintArguments(OS);
         OS << "))\n      return Reg;\n\n";
       }
-      
+
       // Done with this, remove it.
       SignaturesWithConstantForms.erase(MI);
     }
-    
+
     OS << "  switch (Opcode) {\n";
     for (OpcodeTypeRetPredMap::const_iterator I = OTM.begin(), E = OTM.end();
          I != E; ++I) {
@@ -855,7 +855,7 @@ void FastISelMap::printFunctionDefinitions(raw_ostream &OS) {
     OS << "}\n";
     OS << "\n";
   }
-  
+
   // TODO: SignaturesWithConstantForms should be empty here.
 }
 
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index 6e45240..87d18cd 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -2038,7 +2038,6 @@ void FixedLenDecoderEmitter::run(raw_ostream &o) {
   }
 
   DecoderTableInfo TableInfo;
-  std::set<unsigned> Sizes;
   for (std::map<std::pair<std::string, unsigned>,
                 std::vector<unsigned> >::const_iterator
        I = OpcMap.begin(), E = OpcMap.end(); I != E; ++I) {
diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp
index c8304de..d3d9cc1 100644
--- a/utils/TableGen/InstrInfoEmitter.cpp
+++ b/utils/TableGen/InstrInfoEmitter.cpp
@@ -57,6 +57,7 @@ private:
                   std::map<std::vector<Record*>, unsigned> &EL,
                   const OperandInfoMapTy &OpInfo,
                   raw_ostream &OS);
+  void emitOperandTypesEnum(raw_ostream &OS, const CodeGenTarget &Target);
   void initOperandMapData(
              const std::vector<const CodeGenInstruction *> NumberedInstructions,
              const std::string &Namespace,
@@ -132,8 +133,8 @@ InstrInfoEmitter::GetOperandInfo(const CodeGenInstruction &Inst) {
         Res += "|(1<<MCOI::LookupPtrRegClass)";
 
       // Predicate operands.  Check to see if the original unexpanded operand
-      // was of type PredicateOperand.
-      if (Inst.Operands[i].Rec->isSubClassOf("PredicateOperand"))
+      // was of type PredicateOp.
+      if (Inst.Operands[i].Rec->isSubClassOf("PredicateOp"))
         Res += "|(1<<MCOI::Predicate)";
 
       // Optional def operands.  Check to see if the original unexpanded operand
@@ -311,6 +312,34 @@ void InstrInfoEmitter::emitOperandNameMappings(raw_ostream &OS,
 
 }
 
+/// Generate an enum for all the operand types for this target, under the
+/// llvm::TargetNamespace::OpTypes namespace.
+/// Operand types are all definitions derived of the Operand Target.td class.
+void InstrInfoEmitter::emitOperandTypesEnum(raw_ostream &OS,
+                                            const CodeGenTarget &Target) {
+
+  const std::string &Namespace = Target.getInstNamespace();
+  std::vector<Record *> Operands = Records.getAllDerivedDefinitions("Operand");
+
+  OS << "\n#ifdef GET_INSTRINFO_OPERAND_TYPES_ENUM\n";
+  OS << "#undef GET_INSTRINFO_OPERAND_TYPES_ENUM\n";
+  OS << "namespace llvm {";
+  OS << "namespace " << Namespace << " {\n";
+  OS << "namespace OpTypes { \n";
+  OS << "enum OperandType {\n";
+
+  for (unsigned oi = 0, oe = Operands.size(); oi != oe; ++oi) {
+    if (!Operands[oi]->isAnonymous())
+      OS << "  " << Operands[oi]->getName() << " = " << oi << ",\n";
+  }
+
+  OS << "  OPERAND_TYPE_LIST_END" << "\n};\n";
+  OS << "} // End namespace OpTypes\n";
+  OS << "} // End namespace " << Namespace << "\n";
+  OS << "} // End namespace llvm\n";
+  OS << "#endif // GET_INSTRINFO_OPERAND_TYPES_ENUM\n";
+}
+
 //===----------------------------------------------------------------------===//
 // Main Output.
 //===----------------------------------------------------------------------===//
@@ -408,13 +437,14 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
   OS << "namespace llvm {\n";
   OS << "struct " << ClassName << " : public TargetInstrInfo {\n"
      << "  explicit " << ClassName << "(int SO = -1, int DO = -1);\n"
+     << "  virtual ~" << ClassName << "();\n"
      << "};\n";
   OS << "} // End llvm namespace \n";
 
   OS << "#endif // GET_INSTRINFO_HEADER\n\n";
 
-  OS << "\n#ifdef GET_INSTRINFO_CTOR\n";
-  OS << "#undef GET_INSTRINFO_CTOR\n";
+  OS << "\n#ifdef GET_INSTRINFO_CTOR_DTOR\n";
+  OS << "#undef GET_INSTRINFO_CTOR_DTOR\n";
 
   OS << "namespace llvm {\n";
   OS << "extern const MCInstrDesc " << TargetName << "Insts[];\n";
@@ -424,12 +454,15 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
      << "  : TargetInstrInfo(SO, DO) {\n"
      << "  InitMCInstrInfo(" << TargetName << "Insts, "
      << TargetName << "InstrNameIndices, " << TargetName << "InstrNameData, "
-     << NumberedInstructions.size() << ");\n}\n";
+     << NumberedInstructions.size() << ");\n}\n"
+     << ClassName << "::~" << ClassName << "() {}\n";
   OS << "} // End llvm namespace \n";
 
-  OS << "#endif // GET_INSTRINFO_CTOR\n\n";
+  OS << "#endif // GET_INSTRINFO_CTOR_DTOR\n\n";
 
   emitOperandNameMappings(OS, Target, NumberedInstructions);
+
+  emitOperandTypesEnum(OS, Target);
 }
 
 void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
@@ -514,6 +547,19 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
   else
     OS << "OperandInfo" << OpInfo.find(OperandInfo)->second;
 
+  CodeGenTarget &Target = CDP.getTargetInfo();
+  if (Inst.HasComplexDeprecationPredicate)
+    // Emit a function pointer to the complex predicate method.
+    OS << ",0"
+       << ",&get" << Inst.DeprecatedReason << "DeprecationInfo";
+  else if (!Inst.DeprecatedReason.empty())
+    // Emit the Subtarget feature.
+    OS << "," << Target.getInstNamespace() << "::" << Inst.DeprecatedReason
+       << ",0";
+  else
+    // Instruction isn't deprecated.
+    OS << ",0,0";
+
   OS << " },  // Inst #" << Num << " = " << Inst.TheDef->getName() << "\n";
 }
 
@@ -545,7 +591,15 @@ void InstrInfoEmitter::emitEnums(raw_ostream &OS) {
        << "\t= " << i << ",\n";
   }
   OS << "    INSTRUCTION_LIST_END = " << NumberedInstructions.size() << "\n";
-  OS << "  };\n}\n";
+  OS << "  };\n";
+  OS << "namespace Sched {\n";
+  OS << "  enum {\n";
+  for (unsigned i = 0, e = SchedModels.numInstrSchedClasses(); i != e; ++i) {
+    OS << "    " << SchedModels.getSchedClass(i).Name
+       << "\t= " << i << ",\n";
+  }
+  OS << "    SCHED_LIST_END = " << SchedModels.numInstrSchedClasses() << "\n";
+  OS << "  };\n}\n}\n";
   OS << "} // End llvm namespace \n";
 
   OS << "#endif // GET_INSTRINFO_ENUM\n\n";
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index c508795..8f137f8 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -84,10 +84,10 @@ void IntrinsicEmitter::run(raw_ostream &OS) {
 
   // Emit the function name recognizer.
   EmitFnNameRecognizer(Ints, OS);
-  
+
   // Emit the intrinsic declaration generator.
   EmitGenerator(Ints, OS);
-  
+
   // Emit the intrinsic parameter attributes.
   EmitAttributes(Ints, OS);
 
@@ -125,7 +125,7 @@ void IntrinsicEmitter::EmitEnumInfo(const std::vector<CodeGenIntrinsic> &Ints,
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
     OS << "    " << Ints[i].EnumName;
     OS << ((i != e-1) ? ", " : "  ");
-    OS << std::string(40-Ints[i].EnumName.size(), ' ') 
+    OS << std::string(40-Ints[i].EnumName.size(), ' ')
       << "// " << Ints[i].Name << "\n";
   }
   OS << "#endif\n\n";
@@ -146,13 +146,13 @@ private:
 };
 
 void IntrinsicEmitter::
-EmitFnNameRecognizer(const std::vector<CodeGenIntrinsic> &Ints, 
+EmitFnNameRecognizer(const std::vector<CodeGenIntrinsic> &Ints,
                      raw_ostream &OS) {
   // Build a 'first character of function name' -> intrinsic # mapping.
   std::map<char, std::vector<unsigned> > IntMapping;
   for (unsigned i = 0, e = Ints.size(); i != e; ++i)
     IntMapping[Ints[i].Name[5]].push_back(i);
-  
+
   OS << "// Function name -> enum value recognizer code.\n";
   OS << "#ifdef GET_FUNCTION_RECOGNIZER\n";
   OS << "  StringRef NameR(Name+6, Len-6);   // Skip over 'llvm.'\n";
@@ -171,7 +171,7 @@ EmitFnNameRecognizer(const std::vector<CodeGenIntrinsic> &Ints,
     // Emit all the overloaded intrinsics first, build a table of the
     // non-overloaded ones.
     std::vector<StringMatcher::StringPair> MatchTable;
-    
+
     for (unsigned i = 0, e = IntList.size(); i != e; ++i) {
       unsigned IntNo = IntList[i];
       std::string Result = "return " + TargetPrefix + "Intrinsic::" +
@@ -188,18 +188,18 @@ EmitFnNameRecognizer(const std::vector<CodeGenIntrinsic> &Ints,
       OS << "    if (NameR.startswith(\"" << TheStr << "\")) "
          << Result << '\n';
     }
-    
+
     // Emit the matcher logic for the fixed length strings.
     StringMatcher("NameR", MatchTable, OS).Emit(1);
     OS << "    break;  // end of '" << I->first << "' case.\n";
   }
-  
+
   OS << "  }\n";
   OS << "#endif\n\n";
 }
 
 void IntrinsicEmitter::
-EmitIntrinsicToNameTable(const std::vector<CodeGenIntrinsic> &Ints, 
+EmitIntrinsicToNameTable(const std::vector<CodeGenIntrinsic> &Ints,
                          raw_ostream &OS) {
   OS << "// Intrinsic ID to name table\n";
   OS << "#ifdef GET_INTRINSIC_NAME_TABLE\n";
@@ -210,7 +210,7 @@ EmitIntrinsicToNameTable(const std::vector<CodeGenIntrinsic> &Ints,
 }
 
 void IntrinsicEmitter::
-EmitIntrinsicToOverloadTable(const std::vector<CodeGenIntrinsic> &Ints, 
+EmitIntrinsicToOverloadTable(const std::vector<CodeGenIntrinsic> &Ints,
                          raw_ostream &OS) {
   OS << "// Intrinsic ID to overload bitset\n";
   OS << "#ifdef GET_INTRINSIC_OVERLOAD_TABLE\n";
@@ -260,7 +260,9 @@ enum IIT_Info {
   IIT_STRUCT5 = 22,
   IIT_EXTEND_VEC_ARG = 23,
   IIT_TRUNC_VEC_ARG = 24,
-  IIT_ANYPTR = 25
+  IIT_ANYPTR = 25,
+  IIT_V1   = 26,
+  IIT_VARARG = 27
 };
 
 
@@ -277,7 +279,7 @@ static void EncodeFixedValueType(MVT::SimpleValueType VT,
     case 64: return Sig.push_back(IIT_I64);
     }
   }
-  
+
   switch (VT) {
   default: PrintFatalError("unhandled MVT in intrinsic!");
   case MVT::f16: return Sig.push_back(IIT_F16);
@@ -287,16 +289,18 @@ static void EncodeFixedValueType(MVT::SimpleValueType VT,
   case MVT::x86mmx: return Sig.push_back(IIT_MMX);
   // MVT::OtherVT is used to mean the empty struct type here.
   case MVT::Other: return Sig.push_back(IIT_EMPTYSTRUCT);
+  // MVT::isVoid is used to represent varargs here.
+  case MVT::isVoid: return Sig.push_back(IIT_VARARG);
   }
 }
 
 #ifdef _MSC_VER
 #pragma optimize("",off) // MSVC 2010 optimizer can't deal with this function.
-#endif 
+#endif
 
 static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
                             std::vector<unsigned char> &Sig) {
-  
+
   if (R->isSubClassOf("LLVMMatchType")) {
     unsigned Number = R->getValueAsInt("Number");
     assert(Number < ArgCodes.size() && "Invalid matching number!");
@@ -308,7 +312,7 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
       Sig.push_back(IIT_ARG);
     return Sig.push_back((Number << 2) | ArgCodes[Number]);
   }
-  
+
   MVT::SimpleValueType VT = getValueType(R->getValueAsDef("VT"));
 
   unsigned Tmp = 0;
@@ -319,17 +323,17 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
   case MVT::fAny: ++Tmp; // FALL THROUGH.
   case MVT::iAny: {
     // If this is an "any" valuetype, then the type is the type of the next
-    // type in the list specified to getIntrinsic().  
+    // type in the list specified to getIntrinsic().
     Sig.push_back(IIT_ARG);
-    
+
     // Figure out what arg # this is consuming, and remember what kind it was.
     unsigned ArgNo = ArgCodes.size();
     ArgCodes.push_back(Tmp);
-    
+
     // Encode what sort of argument it must be in the low 2 bits of the ArgNo.
     return Sig.push_back((ArgNo << 2) | Tmp);
   }
-  
+
   case MVT::iPTR: {
     unsigned AddrSpace = 0;
     if (R->isSubClassOf("LLVMQualPointerType")) {
@@ -345,18 +349,19 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
     return EncodeFixedType(R->getValueAsDef("ElTy"), ArgCodes, Sig);
   }
   }
-  
+
   if (EVT(VT).isVector()) {
     EVT VVT = VT;
     switch (VVT.getVectorNumElements()) {
     default: PrintFatalError("unhandled vector type width in intrinsic!");
+    case 1: Sig.push_back(IIT_V1); break;
     case 2: Sig.push_back(IIT_V2); break;
     case 4: Sig.push_back(IIT_V4); break;
     case 8: Sig.push_back(IIT_V8); break;
     case 16: Sig.push_back(IIT_V16); break;
     case 32: Sig.push_back(IIT_V32); break;
     }
-    
+
     return EncodeFixedValueType(VVT.getVectorElementType().
                                 getSimpleVT().SimpleTy, Sig);
   }
@@ -373,7 +378,7 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
 static void ComputeFixedEncoding(const CodeGenIntrinsic &Int,
                                  std::vector<unsigned char> &TypeSig) {
   std::vector<unsigned char> ArgCodes;
-  
+
   if (Int.IS.RetVTs.empty())
     TypeSig.push_back(IIT_Done);
   else if (Int.IS.RetVTs.size() == 1 &&
@@ -388,11 +393,11 @@ static void ComputeFixedEncoding(const CodeGenIntrinsic &Int,
       case 5: TypeSig.push_back(IIT_STRUCT5); break;
       default: assert(0 && "Unhandled case in struct");
     }
-    
+
     for (unsigned i = 0, e = Int.IS.RetVTs.size(); i != e; ++i)
       EncodeFixedType(Int.IS.RetTypeDefs[i], ArgCodes, TypeSig);
   }
-  
+
   for (unsigned i = 0, e = Int.IS.ParamTypeDefs.size(); i != e; ++i)
     EncodeFixedType(Int.IS.ParamTypeDefs[i], ArgCodes, TypeSig);
 }
@@ -401,16 +406,16 @@ static void printIITEntry(raw_ostream &OS, unsigned char X) {
   OS << (unsigned)X;
 }
 
-void IntrinsicEmitter::EmitGenerator(const std::vector<CodeGenIntrinsic> &Ints, 
+void IntrinsicEmitter::EmitGenerator(const std::vector<CodeGenIntrinsic> &Ints,
                                      raw_ostream &OS) {
   // If we can compute a 32-bit fixed encoding for this intrinsic, do so and
   // capture it in this vector, otherwise store a ~0U.
   std::vector<unsigned> FixedEncodings;
-  
+
   SequenceToOffsetTable<std::vector<unsigned char> > LongEncodingTable;
-  
+
   std::vector<unsigned char> TypeSig;
-  
+
   // Compute the unique argument type info.
   for (unsigned i = 0, e = Ints.size(); i != e; ++i) {
     // Get the signature for the intrinsic.
@@ -430,7 +435,7 @@ void IntrinsicEmitter::EmitGenerator(const std::vector<CodeGenIntrinsic> &Ints,
         }
         Result = (Result << 4) | TypeSig[e-i-1];
       }
-      
+
       // If this could be encoded into a 31-bit word, return it.
       if (!Failed && (Result >> 31) == 0) {
         FixedEncodings.push_back(Result);
@@ -441,45 +446,45 @@ void IntrinsicEmitter::EmitGenerator(const std::vector<CodeGenIntrinsic> &Ints,
     // Otherwise, we're going to unique the sequence into the
     // LongEncodingTable, and use its offset in the 32-bit table instead.
     LongEncodingTable.add(TypeSig);
-      
+
     // This is a placehold that we'll replace after the table is laid out.
     FixedEncodings.push_back(~0U);
   }
-  
+
   LongEncodingTable.layout();
-  
+
   OS << "// Global intrinsic function declaration type table.\n";
   OS << "#ifdef GET_INTRINSIC_GENERATOR_GLOBAL\n";
 
   OS << "static const unsigned IIT_Table[] = {\n  ";
-  
+
   for (unsigned i = 0, e = FixedEncodings.size(); i != e; ++i) {
     if ((i & 7) == 7)
       OS << "\n  ";
-    
+
     // If the entry fit in the table, just emit it.
     if (FixedEncodings[i] != ~0U) {
       OS << "0x" << utohexstr(FixedEncodings[i]) << ", ";
       continue;
     }
-    
+
     TypeSig.clear();
     ComputeFixedEncoding(Ints[i], TypeSig);
 
-    
+
     // Otherwise, emit the offset into the long encoding table.  We emit it this
     // way so that it is easier to read the offset in the .def file.
     OS << "(1U<<31) | " << LongEncodingTable.get(TypeSig) << ", ";
   }
-  
+
   OS << "0\n};\n\n";
-  
+
   // Emit the shared table of register lists.
   OS << "static const unsigned char IIT_LongEncodingTable[] = {\n";
   if (!LongEncodingTable.empty())
     LongEncodingTable.emit(OS, printIITEntry);
   OS << "  255\n};\n\n";
-  
+
   OS << "#endif\n\n";  // End of GET_INTRINSIC_GENERATOR_GLOBAL
 }
 
@@ -567,7 +572,6 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
   OS << "  AttributeSet AS[" << maxArgAttrs+1 << "];\n";
   OS << "  unsigned NumAttrs = 0;\n";
   OS << "  if (id != 0) {\n";
-  OS << "    SmallVector<Attribute::AttrKind, 8> AttrVec;\n";
   OS << "    switch(IntrinsicsToAttributesMap[id - ";
   if (TargetOnly)
     OS << "Intrinsic::num_intrinsics";
@@ -577,7 +581,7 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
   OS << "    default: llvm_unreachable(\"Invalid attribute number\");\n";
   for (UniqAttrMapTy::const_iterator I = UniqAttributes.begin(),
        E = UniqAttributes.end(); I != E; ++I) {
-    OS << "    case " << I->second << ":\n";
+    OS << "    case " << I->second << ": {\n";
 
     const CodeGenIntrinsic &intrinsic = *(I->first);
 
@@ -590,60 +594,82 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
       while (ai != ae) {
         unsigned argNo = intrinsic.ArgumentAttributes[ai].first;
 
-        OS << "      AttrVec.clear();\n";
+        OS <<  "      const Attribute::AttrKind AttrParam" << argNo + 1 <<"[]= {";
+        bool addComma = false;
 
         do {
           switch (intrinsic.ArgumentAttributes[ai].second) {
           case CodeGenIntrinsic::NoCapture:
-            OS << "      AttrVec.push_back(Attribute::NoCapture);\n";
+            if (addComma)
+              OS << ",";
+            OS << "Attribute::NoCapture";
+            addComma = true;
             break;
           case CodeGenIntrinsic::ReadOnly:
-            OS << "      AttrVec.push_back(Attribute::ReadOnly);\n";
+            if (addComma)
+              OS << ",";
+            OS << "Attribute::ReadOnly";
+            addComma = true;
             break;
           case CodeGenIntrinsic::ReadNone:
-            OS << "      AttrVec.push_back(Attribute::ReadNone);\n";
+            if (addComma)
+              OS << ",";
+            OS << "Attributes::ReadNone";
+            addComma = true;
             break;
           }
 
           ++ai;
         } while (ai != ae && intrinsic.ArgumentAttributes[ai].first == argNo);
-
+        OS << "};\n";
         OS << "      AS[" << numAttrs++ << "] = AttributeSet::get(C, "
-           << argNo+1 << ", AttrVec);\n";
+           << argNo+1 << ", AttrParam" << argNo +1 << ");\n";
       }
     }
 
     ModRefKind modRef = getModRefKind(intrinsic);
 
     if (!intrinsic.canThrow || modRef || intrinsic.isNoReturn) {
-      OS << "      AttrVec.clear();\n";
-
-      if (!intrinsic.canThrow)
-        OS << "      AttrVec.push_back(Attribute::NoUnwind);\n";
-      if (intrinsic.isNoReturn)
-        OS << "      AttrVec.push_back(Attribute::NoReturn);\n";
+      OS << "      const Attribute::AttrKind Atts[] = {";
+      bool addComma = false;
+      if (!intrinsic.canThrow) {
+        OS << "Attribute::NoUnwind";
+        addComma = true;
+      }
+      if (intrinsic.isNoReturn) {
+        if (addComma)
+          OS << ",";
+        OS << "Attribute::NoReturn";
+        addComma = true;
+      }
 
       switch (modRef) {
       case MRK_none: break;
       case MRK_readonly:
-        OS << "      AttrVec.push_back(Attribute::ReadOnly);\n";
+        if (addComma)
+          OS << ",";
+        OS << "Attribute::ReadOnly";
         break;
       case MRK_readnone:
-        OS << "      AttrVec.push_back(Attribute::ReadNone);\n"; 
+        if (addComma)
+          OS << ",";
+        OS << "Attribute::ReadNone";
         break;
       }
+      OS << "};\n";
       OS << "      AS[" << numAttrs++ << "] = AttributeSet::get(C, "
-         << "AttributeSet::FunctionIndex, AttrVec);\n";
+         << "AttributeSet::FunctionIndex, Atts);\n";
     }
 
     if (numAttrs) {
       OS << "      NumAttrs = " << numAttrs << ";\n";
       OS << "      break;\n";
+      OS << "      }\n";
     } else {
       OS << "      return AttributeSet();\n";
     }
   }
-  
+
   OS << "    }\n";
   OS << "  }\n";
   OS << "  return AttributeSet::get(C, ArrayRef<AttributeSet>(AS, "
@@ -692,9 +718,9 @@ EmitModRefBehavior(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS){
 static void EmitTargetBuiltins(const std::map<std::string, std::string> &BIM,
                                const std::string &TargetPrefix,
                                raw_ostream &OS) {
-  
+
   std::vector<StringMatcher::StringPair> Results;
-  
+
   for (std::map<std::string, std::string>::const_iterator I = BIM.begin(),
        E = BIM.end(); I != E; ++I) {
     std::string ResultCode =
@@ -705,9 +731,9 @@ static void EmitTargetBuiltins(const std::map<std::string, std::string> &BIM,
   StringMatcher("BuiltinName", Results, OS).Emit();
 }
 
-        
+
 void IntrinsicEmitter::
-EmitIntrinsicToGCCBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints, 
+EmitIntrinsicToGCCBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
                              raw_ostream &OS) {
   typedef std::map<std::string, std::map<std::string, std::string> > BIMTy;
   BIMTy BuiltinMap;
@@ -715,20 +741,20 @@ EmitIntrinsicToGCCBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
     if (!Ints[i].GCCBuiltinName.empty()) {
       // Get the map for this target prefix.
       std::map<std::string, std::string> &BIM =BuiltinMap[Ints[i].TargetPrefix];
-      
+
       if (!BIM.insert(std::make_pair(Ints[i].GCCBuiltinName,
                                      Ints[i].EnumName)).second)
         PrintFatalError("Intrinsic '" + Ints[i].TheDef->getName() +
               "': duplicate GCC builtin name!");
     }
   }
-  
+
   OS << "// Get the LLVM intrinsic that corresponds to a GCC builtin.\n";
   OS << "// This is used by the C front-end.  The GCC builtin name is passed\n";
   OS << "// in as BuiltinName, and a target prefix (e.g. 'ppc') is passed\n";
   OS << "// in as TargetPrefix.  The result is assigned to 'IntrinsicID'.\n";
   OS << "#ifdef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN\n";
-  
+
   if (TargetOnly) {
     OS << "static " << TargetPrefix << "Intrinsic::ID "
        << "getIntrinsicForGCCBuiltin(const char "
@@ -737,10 +763,10 @@ EmitIntrinsicToGCCBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
     OS << "Intrinsic::ID Intrinsic::getIntrinsicForGCCBuiltin(const char "
        << "*TargetPrefixStr, const char *BuiltinNameStr) {\n";
   }
-  
+
   OS << "  StringRef BuiltinName(BuiltinNameStr);\n";
   OS << "  StringRef TargetPrefix(TargetPrefixStr);\n\n";
-  
+
   // Note: this could emit significantly better code if we cared.
   for (BIMTy::iterator I = BuiltinMap.begin(), E = BuiltinMap.end();I != E;++I){
     OS << "  ";
diff --git a/utils/TableGen/OptParserEmitter.cpp b/utils/TableGen/OptParserEmitter.cpp
index 86328bf..9c4daaf 100644
--- a/utils/TableGen/OptParserEmitter.cpp
+++ b/utils/TableGen/OptParserEmitter.cpp
@@ -13,18 +13,23 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
+#include <cstring>
+#include <cctype>
 #include <map>
 
 using namespace llvm;
 
+// Ordering on Info. The logic should match with the consumer-side function in
+// llvm/Option/OptTable.h.
 static int StrCmpOptionName(const char *A, const char *B) {
-  char a = *A, b = *B;
+  const char *X = A, *Y = B;
+  char a = tolower(*A), b = tolower(*B);
   while (a == b) {
     if (a == '\0')
-      return 0;
+      return strcmp(A, B);
 
-    a = *++A;
-    b = *++B;
+    a = tolower(*++X);
+    b = tolower(*++Y);
   }
 
   if (a == '\0') // A is a prefix of B.
@@ -36,9 +41,9 @@ static int StrCmpOptionName(const char *A, const char *B) {
   return (a < b) ? -1 : 1;
 }
 
-static int CompareOptionRecords(const void *Av, const void *Bv) {
-  const Record *A = *(const Record*const*) Av;
-  const Record *B = *(const Record*const*) Bv;
+static int CompareOptionRecords(Record *const *Av, Record *const *Bv) {
+  const Record *A = *Av;
+  const Record *B = *Bv;
 
   // Sentinel options precede all others and are only ordered by precedence.
   bool ASent = A->getValueAsDef("Kind")->getValueAsBit("Sentinel");
@@ -50,7 +55,7 @@ static int CompareOptionRecords(const void *Av, const void *Bv) {
   if (!ASent)
     if (int Cmp = StrCmpOptionName(A->getValueAsString("Name").c_str(),
                                    B->getValueAsString("Name").c_str()))
-    return Cmp;
+      return Cmp;
 
   if (!ASent) {
     std::vector<std::string> APrefixes = A->getValueAsListOfStrings("Prefixes");
@@ -74,7 +79,7 @@ static int CompareOptionRecords(const void *Av, const void *Bv) {
   if (APrec == BPrec &&
       A->getValueAsListOfStrings("Prefixes") ==
       B->getValueAsListOfStrings("Prefixes")) {
-    PrintError(A->getLoc(), Twine("Option is equivilent to"));
+    PrintError(A->getLoc(), Twine("Option is equivalent to"));
     PrintError(B->getLoc(), Twine("Other defined here"));
     PrintFatalError("Equivalent Options found.");
   }
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 731dccf..cc08df9 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -121,7 +121,7 @@ void RegisterInfoEmitter::runEnums(raw_ostream &OS,
       OS << "}\n";
   }
 
-  const std::vector<Record*> RegAltNameIndices = Target.getRegAltNameIndices();
+  const std::vector<Record*> &RegAltNameIndices = Target.getRegAltNameIndices();
   // If the only definition is the default NoRegAltName, we don't need to
   // emit anything.
   if (RegAltNameIndices.size() > 1) {
@@ -722,7 +722,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
   // Keep track of sub-register names as well. These are not differentially
   // encoded.
   typedef SmallVector<const CodeGenSubRegIndex*, 4> SubRegIdxVec;
-  SequenceToOffsetTable<SubRegIdxVec> SubRegIdxSeqs;
+  SequenceToOffsetTable<SubRegIdxVec, CodeGenSubRegIndex::Less> SubRegIdxSeqs;
   SmallVector<SubRegIdxVec, 4> SubRegIdxLists(Regs.size());
 
   SequenceToOffsetTable<std::string> RegStrings;
@@ -1090,7 +1090,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
     // Compress the sub-reg index lists.
     typedef std::vector<const CodeGenSubRegIndex*> IdxList;
     SmallVector<IdxList, 8> SuperRegIdxLists(RegisterClasses.size());
-    SequenceToOffsetTable<IdxList> SuperRegIdxSeqs;
+    SequenceToOffsetTable<IdxList, CodeGenSubRegIndex::Less> SuperRegIdxSeqs;
     BitVector MaskBV(RegisterClasses.size());
 
     for (unsigned rc = 0, e = RegisterClasses.size(); rc != e; ++rc) {
@@ -1314,9 +1314,21 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
     OS << "0 };\n";
 
     // Emit the *_RegMask bit mask of call-preserved registers.
+    BitVector Covered = RegBank.computeCoveredRegisters(*Regs);
+
+    // Check for an optional OtherPreserved set.
+    // Add those registers to RegMask, but not to SaveList.
+    if (DagInit *OPDag =
+        dyn_cast<DagInit>(CSRSet->getValueInit("OtherPreserved"))) {
+      SetTheory::RecSet OPSet;
+      RegBank.getSets().evaluate(OPDag, OPSet, CSRSet->getLoc());
+      Covered |= RegBank.computeCoveredRegisters(
+        ArrayRef<Record*>(OPSet.begin(), OPSet.end()));
+    }
+
     OS << "static const uint32_t " << CSRSet->getName()
        << "_RegMask[] = { ";
-    printBitVectorAsHex(OS, RegBank.computeCoveredRegisters(*Regs), 32);
+    printBitVectorAsHex(OS, Covered, 32);
     OS << "};\n";
   }
   OS << "\n\n";
diff --git a/utils/TableGen/SequenceToOffsetTable.h b/utils/TableGen/SequenceToOffsetTable.h
index fcda233..e6ab664 100644
--- a/utils/TableGen/SequenceToOffsetTable.h
+++ b/utils/TableGen/SequenceToOffsetTable.h
@@ -21,6 +21,7 @@
 #include <cassert>
 #include <cctype>
 #include <functional>
+#include <map>
 #include <vector>
 
 namespace llvm {
diff --git a/utils/TableGen/SetTheory.cpp b/utils/TableGen/SetTheory.cpp
index 3e5c38c..ad3d7c7 100644
--- a/utils/TableGen/SetTheory.cpp
+++ b/utils/TableGen/SetTheory.cpp
@@ -27,14 +27,16 @@ typedef SetTheory::RecVec RecVec;
 
 // (add a, b, ...) Evaluate and union all arguments.
 struct AddOp : public SetTheory::Operator {
-  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
+  virtual void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts,
+                     ArrayRef<SMLoc> Loc) {
     ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts, Loc);
   }
 };
 
 // (sub Add, Sub, ...) Set difference.
 struct SubOp : public SetTheory::Operator {
-  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
+  virtual void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts,
+                     ArrayRef<SMLoc> Loc) {
     if (Expr->arg_size() < 2)
       PrintFatalError(Loc, "Set difference needs at least two arguments: " +
         Expr->getAsString());
@@ -49,7 +51,8 @@ struct SubOp : public SetTheory::Operator {
 
 // (and S1, S2) Set intersection.
 struct AndOp : public SetTheory::Operator {
-  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
+  virtual void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts,
+                     ArrayRef<SMLoc> Loc) {
     if (Expr->arg_size() != 2)
       PrintFatalError(Loc, "Set intersection requires two arguments: " +
         Expr->getAsString());
@@ -68,7 +71,8 @@ struct SetIntBinOp : public SetTheory::Operator {
                      RecSet &Set, int64_t N,
                      RecSet &Elts, ArrayRef<SMLoc> Loc) =0;
 
-  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
+  virtual void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts,
+                     ArrayRef<SMLoc> Loc) {
     if (Expr->arg_size() != 2)
       PrintFatalError(Loc, "Operator requires (Op Set, Int) arguments: " +
         Expr->getAsString());
@@ -84,9 +88,9 @@ struct SetIntBinOp : public SetTheory::Operator {
 
 // (shl S, N) Shift left, remove the first N elements.
 struct ShlOp : public SetIntBinOp {
-  void apply2(SetTheory &ST, DagInit *Expr,
-             RecSet &Set, int64_t N,
-             RecSet &Elts, ArrayRef<SMLoc> Loc) {
+  virtual void apply2(SetTheory &ST, DagInit *Expr,
+                      RecSet &Set, int64_t N,
+                      RecSet &Elts, ArrayRef<SMLoc> Loc) {
     if (N < 0)
       PrintFatalError(Loc, "Positive shift required: " +
         Expr->getAsString());
@@ -97,9 +101,9 @@ struct ShlOp : public SetIntBinOp {
 
 // (trunc S, N) Truncate after the first N elements.
 struct TruncOp : public SetIntBinOp {
-  void apply2(SetTheory &ST, DagInit *Expr,
-             RecSet &Set, int64_t N,
-             RecSet &Elts, ArrayRef<SMLoc> Loc) {
+  virtual void apply2(SetTheory &ST, DagInit *Expr,
+                      RecSet &Set, int64_t N,
+                      RecSet &Elts, ArrayRef<SMLoc> Loc) {
     if (N < 0)
       PrintFatalError(Loc, "Positive length required: " +
         Expr->getAsString());
@@ -115,9 +119,9 @@ struct RotOp : public SetIntBinOp {
 
   RotOp(bool Rev) : Reverse(Rev) {}
 
-  void apply2(SetTheory &ST, DagInit *Expr,
-             RecSet &Set, int64_t N,
-             RecSet &Elts, ArrayRef<SMLoc> Loc) {
+  virtual void apply2(SetTheory &ST, DagInit *Expr,
+                      RecSet &Set, int64_t N,
+                      RecSet &Elts, ArrayRef<SMLoc> Loc) {
     if (Reverse)
       N = -N;
     // N > 0 -> rotate left, N < 0 -> rotate right.
@@ -134,9 +138,9 @@ struct RotOp : public SetIntBinOp {
 
 // (decimate S, N) Pick every N'th element of S.
 struct DecimateOp : public SetIntBinOp {
-  void apply2(SetTheory &ST, DagInit *Expr,
-             RecSet &Set, int64_t N,
-             RecSet &Elts, ArrayRef<SMLoc> Loc) {
+  virtual void apply2(SetTheory &ST, DagInit *Expr,
+                      RecSet &Set, int64_t N,
+                      RecSet &Elts, ArrayRef<SMLoc> Loc) {
     if (N <= 0)
       PrintFatalError(Loc, "Positive stride required: " +
         Expr->getAsString());
@@ -147,7 +151,8 @@ struct DecimateOp : public SetIntBinOp {
 
 // (interleave S1, S2, ...) Interleave elements of the arguments.
 struct InterleaveOp : public SetTheory::Operator {
-  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
+  virtual void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts,
+                     ArrayRef<SMLoc> Loc) {
     // Evaluate the arguments individually.
     SmallVector<RecSet, 4> Args(Expr->getNumArgs());
     unsigned MaxSize = 0;
@@ -165,7 +170,8 @@ struct InterleaveOp : public SetTheory::Operator {
 
 // (sequence "Format", From, To) Generate a sequence of records by name.
 struct SequenceOp : public SetTheory::Operator {
-  void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
+  virtual void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts,
+                     ArrayRef<SMLoc> Loc) {
     int Step = 1;
     if (Expr->arg_size() > 4)
       PrintFatalError(Loc, "Bad args to (sequence \"Format\", From, To): " +
@@ -232,15 +238,16 @@ struct FieldExpander : public SetTheory::Expander {
 
   FieldExpander(StringRef fn) : FieldName(fn) {}
 
-  void expand(SetTheory &ST, Record *Def, RecSet &Elts) {
+  virtual void expand(SetTheory &ST, Record *Def, RecSet &Elts) {
     ST.evaluate(Def->getValueInit(FieldName), Elts, Def->getLoc());
   }
 };
 } // end anonymous namespace
 
-void SetTheory::Operator::anchor() { }
+// Pin the vtables to this file.
+void SetTheory::Operator::anchor() {}
+void SetTheory::Expander::anchor() {}
 
-void SetTheory::Expander::anchor() { }
 
 SetTheory::SetTheory() {
   addOperator("add", new AddOp);
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 81bb6f8..b9f9d06 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -1198,6 +1198,11 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
     EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
     EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
     EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
+
+    OS << "  " << (bool)(PI->ModelDef ?
+                         PI->ModelDef->getValueAsBit("CompleteModel") : 0)
+       << ", // " << "CompleteModel\n";
+
     OS << "  " << PI->Index << ", // Processor ID\n";
     if (PI->hasInstrSchedModel())
       OS << "  " << PI->ModelName << "ProcResources" << ",\n"
diff --git a/utils/TableGen/TGValueTypes.cpp b/utils/TableGen/TGValueTypes.cpp
index 3ac71a4..f4893f5 100644
--- a/utils/TableGen/TGValueTypes.cpp
+++ b/utils/TableGen/TGValueTypes.cpp
@@ -35,11 +35,15 @@ public:
   }
   Type(TypeKind K) : Kind(K) {}
   virtual unsigned getSizeInBits() const = 0;
-  virtual ~Type() {}
+  virtual ~Type();
 };
 
+// Provide out-of-line definition to prevent weak vtable.
+Type::~Type() {}
+
 }
 
+namespace {
 class ExtendedIntegerType : public Type {
   unsigned BitWidth;
 public:
@@ -48,7 +52,7 @@ public:
   static bool classof(const Type *T) {
     return T->getKind() == TK_ExtendedIntegerType;
   }
-  unsigned getSizeInBits() const {
+  virtual unsigned getSizeInBits() const {
     return getBitWidth();
   }
   unsigned getBitWidth() const {
@@ -65,7 +69,7 @@ public:
   static bool classof(const Type *T) {
     return T->getKind() == TK_ExtendedVectorType;
   }
-  unsigned getSizeInBits() const {
+  virtual unsigned getSizeInBits() const {
     return getNumElements() * getElementType().getSizeInBits();
   }
   EVT getElementType() const {
@@ -75,6 +79,7 @@ public:
     return NumElements;
   }
 };
+} // end anonymous namespace
 
 static std::map<unsigned, const Type *>
   ExtendedIntegerTypeMap;
diff --git a/utils/TableGen/X86DisassemblerTables.cpp b/utils/TableGen/X86DisassemblerTables.cpp
index 0838ac4..bdb4793 100644
--- a/utils/TableGen/X86DisassemblerTables.cpp
+++ b/utils/TableGen/X86DisassemblerTables.cpp
@@ -81,31 +81,37 @@ static inline bool inheritsFrom(InstructionContext child,
   case IC_64BIT_REXW_OPSIZE:
     return false;
   case IC_VEX:
-    return inheritsFrom(child, IC_VEX_L_W) ||
+    return (VEX_LIG && inheritsFrom(child, IC_VEX_L_W)) ||
            inheritsFrom(child, IC_VEX_W) ||
            (VEX_LIG && inheritsFrom(child, IC_VEX_L));
   case IC_VEX_XS:
-    return inheritsFrom(child, IC_VEX_L_W_XS) ||
+    return (VEX_LIG && inheritsFrom(child, IC_VEX_L_W_XS)) ||
            inheritsFrom(child, IC_VEX_W_XS) ||
            (VEX_LIG && inheritsFrom(child, IC_VEX_L_XS));
   case IC_VEX_XD:
-    return inheritsFrom(child, IC_VEX_L_W_XD) ||
+    return (VEX_LIG && inheritsFrom(child, IC_VEX_L_W_XD)) ||
            inheritsFrom(child, IC_VEX_W_XD) ||
            (VEX_LIG && inheritsFrom(child, IC_VEX_L_XD));
   case IC_VEX_OPSIZE:
-    return inheritsFrom(child, IC_VEX_L_W_OPSIZE) ||
+    return (VEX_LIG && inheritsFrom(child, IC_VEX_L_W_OPSIZE)) ||
            inheritsFrom(child, IC_VEX_W_OPSIZE) ||
            (VEX_LIG && inheritsFrom(child, IC_VEX_L_OPSIZE));
   case IC_VEX_W:
+    return VEX_LIG && inheritsFrom(child, IC_VEX_L_W);
   case IC_VEX_W_XS:
+    return VEX_LIG && inheritsFrom(child, IC_VEX_L_W_XS);
   case IC_VEX_W_XD:
+    return VEX_LIG && inheritsFrom(child, IC_VEX_L_W_XD);
   case IC_VEX_W_OPSIZE:
-    return false;
+    return VEX_LIG && inheritsFrom(child, IC_VEX_L_W_OPSIZE);
   case IC_VEX_L:
+    return inheritsFrom(child, IC_VEX_L_W);
   case IC_VEX_L_XS:
+    return inheritsFrom(child, IC_VEX_L_W_XS);
   case IC_VEX_L_XD:
+    return inheritsFrom(child, IC_VEX_L_W_XD);
   case IC_VEX_L_OPSIZE:
-    return false;
+    return inheritsFrom(child, IC_VEX_L_W_OPSIZE);
   case IC_VEX_L_W:
   case IC_VEX_L_W_XS:
   case IC_VEX_L_W_XD:
@@ -122,7 +128,7 @@ static inline bool inheritsFrom(InstructionContext child,
            inheritsFrom(child, IC_EVEX_L_W_XD);
   case IC_EVEX_OPSIZE:
     return inheritsFrom(child, IC_EVEX_W_OPSIZE) ||
-           inheritsFrom(child, IC_EVEX_W_OPSIZE);
+           inheritsFrom(child, IC_EVEX_L_W_OPSIZE);
   case IC_EVEX_W:
   case IC_EVEX_W_XS:
   case IC_EVEX_W_XD:
@@ -170,10 +176,24 @@ static inline bool inheritsFrom(InstructionContext child,
   case IC_EVEX_L_XD_K:
   case IC_EVEX_L_OPSIZE_K:
     return false;
+  case IC_EVEX_W_KZ:
+  case IC_EVEX_W_XS_KZ:
+  case IC_EVEX_W_XD_KZ:
+  case IC_EVEX_W_OPSIZE_KZ:
+    return false;
+  case IC_EVEX_L_KZ:
+  case IC_EVEX_L_XS_KZ:
+  case IC_EVEX_L_XD_KZ:
+  case IC_EVEX_L_OPSIZE_KZ:
+    return false;
   case IC_EVEX_L_W_K:
   case IC_EVEX_L_W_XS_K:
   case IC_EVEX_L_W_XD_K:
   case IC_EVEX_L_W_OPSIZE_K:
+  case IC_EVEX_L_W_KZ:
+  case IC_EVEX_L_W_XS_KZ:
+  case IC_EVEX_L_W_XD_KZ:
+  case IC_EVEX_L_W_OPSIZE_KZ:
     return false;
   case IC_EVEX_L2_K:
   case IC_EVEX_L2_B:
@@ -181,12 +201,25 @@ static inline bool inheritsFrom(InstructionContext child,
   case IC_EVEX_L2_XD_K:
   case IC_EVEX_L2_OPSIZE_K:
   case IC_EVEX_L2_OPSIZE_B:
+  case IC_EVEX_L2_OPSIZE_K_B:
+  case IC_EVEX_L2_KZ:
+  case IC_EVEX_L2_XS_KZ:
+  case IC_EVEX_L2_XD_KZ:
+  case IC_EVEX_L2_OPSIZE_KZ:
+  case IC_EVEX_L2_OPSIZE_KZ_B:
     return false;
   case IC_EVEX_L2_W_K:
+  case IC_EVEX_L2_W_B:
   case IC_EVEX_L2_W_XS_K:
   case IC_EVEX_L2_W_XD_K:
   case IC_EVEX_L2_W_OPSIZE_K:
   case IC_EVEX_L2_W_OPSIZE_B:
+  case IC_EVEX_L2_W_OPSIZE_K_B:
+  case IC_EVEX_L2_W_KZ:
+  case IC_EVEX_L2_W_XS_KZ:
+  case IC_EVEX_L2_W_XD_KZ:
+  case IC_EVEX_L2_W_OPSIZE_KZ:
+  case IC_EVEX_L2_W_OPSIZE_KZ_B:
     return false;
   default:
     llvm_unreachable("Unknown instruction class");
@@ -207,7 +240,8 @@ static inline bool outranks(InstructionContext upper,
 
 #define ENUM_ENTRY(n, r, d) r,
 #define ENUM_ENTRY_K_B(n, r, d) ENUM_ENTRY(n, r, d) \
-  ENUM_ENTRY(n##_K_B, r, d) ENUM_ENTRY(n##_K, r, d) ENUM_ENTRY(n##_B, r, d)
+  ENUM_ENTRY(n##_K_B, r, d) ENUM_ENTRY(n##_KZ_B, r, d) \
+  ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) ENUM_ENTRY(n##_B, r, d)
   static int ranks[IC_max] = {
     INSTRUCTION_CONTEXTS
   };
@@ -229,7 +263,8 @@ static inline const char* stringForContext(InstructionContext insnContext) {
     llvm_unreachable("Unhandled instruction class");
 #define ENUM_ENTRY(n, r, d)   case n: return #n; break;
 #define ENUM_ENTRY_K_B(n, r, d) ENUM_ENTRY(n, r, d) ENUM_ENTRY(n##_K_B, r, d)\
-        ENUM_ENTRY(n##_K, r, d) ENUM_ENTRY(n##_B, r, d)
+        ENUM_ENTRY(n##_KZ, r, d) ENUM_ENTRY(n##_K, r, d) ENUM_ENTRY(n##_B, r, d)\
+        ENUM_ENTRY(n##_KZ_B, r, d)
   INSTRUCTION_CONTEXTS
 #undef ENUM_ENTRY
 #undef ENUM_ENTRY_K_B
@@ -259,35 +294,6 @@ static inline const char* stringForOperandEncoding(OperandEncoding encoding) {
   }
 }
 
-void DisassemblerTables::emitOneID(raw_ostream &o, unsigned &i, InstrUID id,
-                                   bool addComma) const {
-  if (id)
-    o.indent(i * 2) << format("0x%hx", id);
-  else
-    o.indent(i * 2) << 0;
-
-  if (addComma)
-    o << ", ";
-  else
-    o << "  ";
-
-  o << "/* ";
-  o << InstructionSpecifiers[id].name;
-  o << "*/";
-
-  o << "\n";
-}
-
-/// emitEmptyTable - Emits the modRMEmptyTable, which is used as a ID table by
-///   all ModR/M decisions for instructions that are invalid for all possible
-///   ModR/M byte values.
-///
-/// @param o        - The output stream on which to emit the table.
-/// @param i        - The indentation level for that output stream.
-static void emitEmptyTable(raw_ostream &o, unsigned &i) {
-  o.indent(i * 2) << "0x0, /* EmptyTable */\n";
-}
-
 /// getDecisionType - Determines whether a ModRM decision with 255 entries can
 ///   be compacted by eliminating redundant information.
 ///
@@ -387,6 +393,7 @@ DisassemblerTables::~DisassemblerTables() {
 
 void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
                                            unsigned &i1, unsigned &i2,
+                                           unsigned &ModRMTableNum,
                                            ModRMDecision &decision) const {
   static uint32_t sTableNumber = 0;
   static uint32_t sEntryNumber = 1;
@@ -405,44 +412,56 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
     return;
   }
 
-  o1 << "/* Table" << sTableNumber << " */\n";
-  i1++;
+  std::vector<unsigned> ModRMDecision;
 
   switch (dt) {
     default:
       llvm_unreachable("Unknown decision type");
     case MODRM_ONEENTRY:
-      emitOneID(o1, i1, decision.instructionIDs[0], true);
+      ModRMDecision.push_back(decision.instructionIDs[0]);
       break;
     case MODRM_SPLITRM:
-      emitOneID(o1, i1, decision.instructionIDs[0x00], true); // mod = 0b00
-      emitOneID(o1, i1, decision.instructionIDs[0xc0], true); // mod = 0b11
+      ModRMDecision.push_back(decision.instructionIDs[0x00]);
+      ModRMDecision.push_back(decision.instructionIDs[0xc0]);
       break;
     case MODRM_SPLITREG:
       for (unsigned index = 0; index < 64; index += 8)
-        emitOneID(o1, i1, decision.instructionIDs[index], true);
+        ModRMDecision.push_back(decision.instructionIDs[index]);
       for (unsigned index = 0xc0; index < 256; index += 8)
-        emitOneID(o1, i1, decision.instructionIDs[index], true);
+        ModRMDecision.push_back(decision.instructionIDs[index]);
       break;
     case MODRM_SPLITMISC:
       for (unsigned index = 0; index < 64; index += 8)
-        emitOneID(o1, i1, decision.instructionIDs[index], true);
+        ModRMDecision.push_back(decision.instructionIDs[index]);
       for (unsigned index = 0xc0; index < 256; ++index)
-        emitOneID(o1, i1, decision.instructionIDs[index], true);
+        ModRMDecision.push_back(decision.instructionIDs[index]);
       break;
     case MODRM_FULL:
       for (unsigned index = 0; index < 256; ++index)
-        emitOneID(o1, i1, decision.instructionIDs[index], true);
+        ModRMDecision.push_back(decision.instructionIDs[index]);
       break;
   }
 
-  i1--;
+  unsigned &EntryNumber = ModRMTable[ModRMDecision];
+  if (EntryNumber == 0) {
+    EntryNumber = ModRMTableNum;
+
+    ModRMTableNum += ModRMDecision.size();
+    o1 << "/* Table" << EntryNumber << " */\n";
+    i1++;
+    for (std::vector<unsigned>::const_iterator I = ModRMDecision.begin(),
+           E = ModRMDecision.end(); I != E; ++I) {
+      o1.indent(i1 * 2) << format("0x%hx", *I) << ", /* "
+                        << InstructionSpecifiers[*I].name << " */\n";
+    }
+    i1--;
+  }
 
   o2.indent(i2) << "{ /* struct ModRMDecision */" << "\n";
   i2++;
 
   o2.indent(i2) << stringForDecisionType(dt) << "," << "\n";
-  o2.indent(i2) << sEntryNumber << " /* Table" << sTableNumber << " */\n";
+  o2.indent(i2) << EntryNumber << " /* Table" << EntryNumber << " */\n";
 
   i2--;
   o2.indent(i2) << "}";
@@ -476,6 +495,7 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
 
 void DisassemblerTables::emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2,
                                             unsigned &i1, unsigned &i2,
+                                            unsigned &ModRMTableNum,
                                             OpcodeDecision &decision) const {
   o2.indent(i2) << "{ /* struct OpcodeDecision */" << "\n";
   i2++;
@@ -487,7 +507,8 @@ void DisassemblerTables::emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2,
 
     o2 << "/* 0x" << format("%02hhx", index) << " */" << "\n";
 
-    emitModRMDecision(o1, o2, i1, i2, decision.modRMDecisions[index]);
+    emitModRMDecision(o1, o2, i1, i2, ModRMTableNum,
+                      decision.modRMDecisions[index]);
 
     if (index <  255)
       o2 << ",";
@@ -503,6 +524,7 @@ void DisassemblerTables::emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2,
 
 void DisassemblerTables::emitContextDecision(raw_ostream &o1, raw_ostream &o2,
                                              unsigned &i1, unsigned &i2,
+                                             unsigned &ModRMTableNum,
                                              ContextDecision &decision,
                                              const char* name) const {
   o2.indent(i2) << "static const struct ContextDecision " << name << " = {\n";
@@ -516,7 +538,8 @@ void DisassemblerTables::emitContextDecision(raw_ostream &o1, raw_ostream &o2,
     o2 << " */";
     o2 << "\n";
 
-    emitOpcodeDecision(o1, o2, i1, i2, decision.opcodeDecisions[index]);
+    emitOpcodeDecision(o1, o2, i1, i2, ModRMTableNum,
+                       decision.opcodeDecisions[index]);
 
     if (index + 1 < IC_max)
       o2 << ", ";
@@ -622,6 +645,12 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
 
     if ((index & ATTR_VEXL) && (index & ATTR_REXW) && (index & ATTR_OPSIZE))
       o << "IC_VEX_L_W_OPSIZE";
+    else if ((index & ATTR_VEXL) && (index & ATTR_REXW) && (index & ATTR_XD))
+      o << "IC_VEX_L_W_XD";
+    else if ((index & ATTR_VEXL) && (index & ATTR_REXW) && (index & ATTR_XS))
+      o << "IC_VEX_L_W_XS";
+    else if ((index & ATTR_VEXL) && (index & ATTR_REXW))
+      o << "IC_VEX_L_W";
     else if ((index & ATTR_VEXL) && (index & ATTR_OPSIZE))
       o << "IC_VEX_L_OPSIZE";
     else if ((index & ATTR_VEXL) && (index & ATTR_XD))
@@ -699,13 +728,17 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
 }
 
 void DisassemblerTables::emitContextDecisions(raw_ostream &o1, raw_ostream &o2,
-                                             unsigned &i1, unsigned &i2) const {
-  emitContextDecision(o1, o2, i1, i2, *Tables[0], ONEBYTE_STR);
-  emitContextDecision(o1, o2, i1, i2, *Tables[1], TWOBYTE_STR);
-  emitContextDecision(o1, o2, i1, i2, *Tables[2], THREEBYTE38_STR);
-  emitContextDecision(o1, o2, i1, i2, *Tables[3], THREEBYTE3A_STR);
-  emitContextDecision(o1, o2, i1, i2, *Tables[4], THREEBYTEA6_STR);
-  emitContextDecision(o1, o2, i1, i2, *Tables[5], THREEBYTEA7_STR);
+                                              unsigned &i1, unsigned &i2,
+                                              unsigned &ModRMTableNum) const {
+  emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[0], ONEBYTE_STR);
+  emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[1], TWOBYTE_STR);
+  emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[2], THREEBYTE38_STR);
+  emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[3], THREEBYTE3A_STR);
+  emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[4], THREEBYTEA6_STR);
+  emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[5], THREEBYTEA7_STR);
+  emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[6], XOP8_MAP_STR);
+  emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[7], XOP9_MAP_STR);
+  emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[8], XOPA_MAP_STR);
 }
 
 void DisassemblerTables::emit(raw_ostream &o) const {
@@ -724,11 +757,17 @@ void DisassemblerTables::emit(raw_ostream &o) const {
   emitContextTable(o, i2);
   o << "\n";
 
+  unsigned ModRMTableNum = 0;
+
   o << "static const InstrUID modRMTable[] = {\n";
   i1++;
-  emitEmptyTable(o1, i1);
+  std::vector<unsigned> EmptyTable(1, 0);
+  ModRMTable[EmptyTable] = ModRMTableNum;
+  ModRMTableNum += EmptyTable.size();
+  o1 << "/* EmptyTable */\n";
+  o1.indent(i1 * 2) << "0x0,\n";
   i1--;
-  emitContextDecisions(o1, o2, i1, i2);
+  emitContextDecisions(o1, o2, i1, i2, ModRMTableNum);
 
   o << o1.str();
   o << "  0x0\n";
diff --git a/utils/TableGen/X86DisassemblerTables.h b/utils/TableGen/X86DisassemblerTables.h
index 01aeaaf..bf8b127 100644
--- a/utils/TableGen/X86DisassemblerTables.h
+++ b/utils/TableGen/X86DisassemblerTables.h
@@ -20,6 +20,7 @@
 #include "X86DisassemblerShared.h"
 #include "X86ModRMFilters.h"
 #include "llvm/Support/raw_ostream.h"
+#include <map>
 #include <vector>
 
 namespace llvm {
@@ -39,7 +40,14 @@ private:
   /// [3] three-byte opcodes of the form 0f 3a __
   /// [4] three-byte opcodes of the form 0f a6 __
   /// [5] three-byte opcodes of the form 0f a7 __
-  ContextDecision* Tables[6];
+  /// [6] XOP8 map opcode
+  /// [7] XOP9 map opcode
+  /// [8] XOPA map opcode
+  ContextDecision* Tables[9];
+
+  // Table of ModRM encodings.
+  typedef std::map<std::vector<unsigned>, unsigned> ModRMMapTy;
+  mutable ModRMMapTy ModRMTable;
 
   /// The instruction information table
   std::vector<InstructionSpecifier> InstructionSpecifiers;
@@ -47,22 +55,6 @@ private:
   /// True if there are primary decode conflicts in the instruction set
   bool HasConflicts;
 
-  /// emitOneID - Emits a table entry for a single instruction entry, at the
-  ///   innermost level of the structure hierarchy.  The entry is printed out
-  ///   in the format "nnnn, /* MNEMONIC */" where nnnn is the ID in decimal,
-  ///   the comma is printed if addComma is true, and the menonic is the name
-  ///   of the instruction as listed in the LLVM tables.
-  ///
-  /// @param o        - The output stream to print the entry on.
-  /// @param i        - The indentation level for o.
-  /// @param id       - The unique ID of the instruction to print.
-  /// @param addComma - Whether or not to print a comma after the ID.  True if
-  ///                    additional items will follow.
-  void emitOneID(raw_ostream &o,
-                 uint32_t &i,
-                 InstrUID id,
-                 bool addComma) const;
-
   /// emitModRMDecision - Emits a table of entries corresponding to a single
   ///   ModR/M decision.  Compacts the ModR/M decision if possible.  ModR/M
   ///   decisions are printed as:
@@ -91,12 +83,11 @@ private:
   /// @param o2       - The output stream to print the decision structure to.
   /// @param i1       - The indentation level to use with stream o1.
   /// @param i2       - The indentation level to use with stream o2.
+  /// @param ModRMTableNum - next table number for adding to ModRMTable.
   /// @param decision - The ModR/M decision to emit.  This decision has 256
   ///                   entries - emitModRMDecision decides how to compact it.
-  void emitModRMDecision(raw_ostream &o1,
-                         raw_ostream &o2,
-                         uint32_t &i1,
-                         uint32_t &i2,
+  void emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
+                         unsigned &i1, unsigned &i2, unsigned &ModRMTableNum,
                          ModRMDecision &decision) const;
 
   /// emitOpcodeDecision - Emits an OpcodeDecision and all its subsidiary ModR/M
@@ -120,12 +111,11 @@ private:
   /// @param o2       - The output stream for the decision structure itself.
   /// @param i1       - The indent level to use with stream o1.
   /// @param i2       - The indent level to use with stream o2.
+  /// @param ModRMTableNum - next table number for adding to ModRMTable.
   /// @param decision - The OpcodeDecision to emit along with its subsidiary
   ///                    structures.
-  void emitOpcodeDecision(raw_ostream &o1,
-                          raw_ostream &o2,
-                          uint32_t &i1,
-                          uint32_t &i2,
+  void emitOpcodeDecision(raw_ostream &o1, raw_ostream &o2,
+                          unsigned &i1, unsigned &i2, unsigned &ModRMTableNum,
                           OpcodeDecision &decision) const;
 
   /// emitContextDecision - Emits a ContextDecision and all its subsidiary
@@ -155,15 +145,13 @@ private:
   /// @param o2       - The output stream to print the decision structure to.
   /// @param i1       - The indent level to use with stream o1.
   /// @param i2       - The indent level to use with stream o2.
+  /// @param ModRMTableNum - next table number for adding to ModRMTable.
   /// @param decision - The ContextDecision to emit along with its subsidiary
   ///                   structures.
   /// @param name     - The name for the ContextDecision.
-  void emitContextDecision(raw_ostream &o1,
-                           raw_ostream &o2,
-                           uint32_t &i1,
-                           uint32_t &i2,
-                           ContextDecision &decision,
-                           const char* name) const;
+  void emitContextDecision(raw_ostream &o1, raw_ostream &o2,
+                           unsigned &i1, unsigned &i2, unsigned &ModRMTableNum,
+                           ContextDecision &decision, const char* name) const;
 
   /// emitInstructionInfo - Prints the instruction specifier table, which has
   ///   one entry for each instruction, and contains name and operand
@@ -194,7 +182,7 @@ private:
   /// @param o  - The output stream to which the instruction table should be
   ///             written.
   /// @param i  - The indent level for use with the stream.
-  void emitInstructionInfo(raw_ostream &o, uint32_t &i) const;
+  void emitInstructionInfo(raw_ostream &o, unsigned &i) const;
 
   /// emitContextTable - Prints the table that is used to translate from an
   ///   instruction attribute mask to an instruction context.  This table is
@@ -220,10 +208,10 @@ private:
   /// @param o2 - The output stream to print the decision structures to.
   /// @param i1 - The indent level to use with stream o1.
   /// @param i2 - The indent level to use with stream o2.
-  void emitContextDecisions(raw_ostream &o1,
-                            raw_ostream &o2,
-                            uint32_t &i1,
-                            uint32_t &i2) const;
+  /// @param ModRMTableNum - next table number for adding to ModRMTable.
+  void emitContextDecisions(raw_ostream &o1, raw_ostream &o2,
+                            unsigned &i1, unsigned &i2,
+                            unsigned &ModRMTableNum) const;
 
   /// setTableFields - Uses a ModRMFilter to set the appropriate entries in a
   ///   ModRMDecision to refer to a particular instruction ID.
diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index 7962f9b..708e72d 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp
@@ -79,7 +79,8 @@ namespace X86Local {
     DC = 7, DD = 8, DE = 9, DF = 10,
     XD = 11,  XS = 12,
     T8 = 13,  P_TA = 14,
-    A6 = 15,  A7 = 16, T8XD = 17, T8XS = 18, TAXD = 19
+    A6 = 15,  A7 = 16, T8XD = 17, T8XS = 18, TAXD = 19,
+    XOP8 = 20, XOP9 = 21, XOPA = 22
   };
 }
 
@@ -134,6 +135,10 @@ namespace X86Local {
 #define THREE_BYTE_38_EXTENSION_TABLES \
   EXTENSION_TABLE(F3)
 
+#define XOP9_MAP_EXTENSION_TABLES \
+  EXTENSION_TABLE(01)             \
+  EXTENSION_TABLE(02)
+
 using namespace X86Disassembler;
 
 /// needsModRMForDecode - Indicates whether a particular instruction requires a
@@ -239,6 +244,7 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
   HasEVEXPrefix    = Rec->getValueAsBit("hasEVEXPrefix");
   HasEVEX_L2Prefix = Rec->getValueAsBit("hasEVEX_L2");
   HasEVEX_K        = Rec->getValueAsBit("hasEVEX_K");
+  HasEVEX_KZ       = Rec->getValueAsBit("hasEVEX_Z");
   HasEVEX_B        = Rec->getValueAsBit("hasEVEX_B");
   HasLockPrefix    = Rec->getValueAsBit("hasLockPrefix");
   IsCodeGenOnly    = Rec->getValueAsBit("isCodeGenOnly");
@@ -299,8 +305,10 @@ void RecognizableInstr::processInstr(DisassemblerTables &tables,
     recogInstr.emitDecodePath(tables);
 }
 
-#define EVEX_KB(n) (HasEVEX_K && HasEVEX_B? n##_K_B : \
-                    (HasEVEX_K? n##_K : (HasEVEX_B ? n##_B : n)))
+#define EVEX_KB(n) (HasEVEX_KZ && HasEVEX_B ? n##_KZ_B : \
+                    (HasEVEX_K && HasEVEX_B ? n##_K_B : \
+                    (HasEVEX_KZ ? n##_KZ : \
+                    (HasEVEX_K? n##_K : (HasEVEX_B ? n##_B : n)))))
 
 InstructionContext RecognizableInstr::insnContext() const {
   InstructionContext insnContext;
@@ -486,7 +494,8 @@ RecognizableInstr::filter_ret RecognizableInstr::filter() const {
   assert(Rec->isSubClassOf("X86Inst") && "Can only filter X86 instructions");
 
   if (Form == X86Local::Pseudo ||
-      (IsCodeGenOnly && Name.find("_REV") == Name.npos))
+      (IsCodeGenOnly && Name.find("_REV") == Name.npos &&
+       Name.find("INC32") == Name.npos && Name.find("DEC32") == Name.npos))
     return FILTER_STRONG;
 
 
@@ -516,35 +525,17 @@ RecognizableInstr::filter_ret RecognizableInstr::filter() const {
 
   // Filter out alternate forms of AVX instructions
   if (Name.find("_alt") != Name.npos ||
-      Name.find("XrYr") != Name.npos ||
-      (Name.find("r64r") != Name.npos && Name.find("r64r64") == Name.npos) ||
+      (Name.find("r64r") != Name.npos && Name.find("r64r64") == Name.npos && Name.find("r64r8") == Name.npos) ||
       Name.find("_64mr") != Name.npos ||
-      Name.find("Xrr") != Name.npos ||
       Name.find("rr64") != Name.npos)
     return FILTER_WEAK;
 
   // Special cases.
 
-  if (Name.find("PCMPISTRI") != Name.npos && Name != "PCMPISTRI")
-    return FILTER_WEAK;
-  if (Name.find("PCMPESTRI") != Name.npos && Name != "PCMPESTRI")
-    return FILTER_WEAK;
-
-  if (Name.find("MOV") != Name.npos && Name.find("r0") != Name.npos)
-    return FILTER_WEAK;
-  if (Name.find("MOVZ") != Name.npos && Name.find("MOVZX") == Name.npos)
-    return FILTER_WEAK;
-  if (Name.find("Fs") != Name.npos)
-    return FILTER_WEAK;
   if (Name == "PUSH64i16"         ||
       Name == "MOVPQI2QImr"       ||
       Name == "VMOVPQI2QImr"      ||
-      Name == "MMX_MOVD64rrv164"  ||
-      Name == "MOV64ri64i32"      ||
-      Name == "VMASKMOVDQU64"     ||
-      Name == "VEXTRACTPSrr64"    ||
-      Name == "VMOVQd64rr"        ||
-      Name == "VMOVQs64rr")
+      Name == "VMASKMOVDQU64")
     return FILTER_WEAK;
 
   // XACQUIRE and XRELEASE reuse REPNE and REP respectively.
@@ -553,11 +544,6 @@ RecognizableInstr::filter_ret RecognizableInstr::filter() const {
       Name == "XRELEASE_PREFIX")
     return FILTER_WEAK;
 
-  if (HasFROperands && Name.find("MOV") != Name.npos &&
-     ((Name.find("2") != Name.npos && Name.find("32") == Name.npos) ||
-      (Name.find("to") != Name.npos)))
-    return FILTER_STRONG;
-
   return FILTER_NORMAL;
 }
 
@@ -818,17 +804,20 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
   case X86Local::MRM5r:
   case X86Local::MRM6r:
   case X86Local::MRM7r:
-    // Operand 1 is a register operand in the R/M field.
-    // Operand 2 (optional) is an immediate or relocation.
-    // Operand 3 (optional) is an immediate.
-    if (HasVEX_4VPrefix)
-      assert(numPhysicalOperands <= 3 &&
-             "Unexpected number of operands for MRMnRFrm with VEX_4V");
-    else
-      assert(numPhysicalOperands <= 3 &&
-             "Unexpected number of operands for MRMnRFrm");
+    {
+      // Operand 1 is a register operand in the R/M field.
+      // Operand 2 (optional) is an immediate or relocation.
+      // Operand 3 (optional) is an immediate.
+      unsigned kOp = (HasEVEX_K) ? 1:0;
+      unsigned Op4v = (HasVEX_4VPrefix) ? 1:0;
+      if (numPhysicalOperands > 3 + kOp + Op4v)
+        llvm_unreachable("Unexpected number of operands for MRMnr");
+    }
     if (HasVEX_4VPrefix)
       HANDLE_OPERAND(vvvvRegister)
+
+    if (HasEVEX_K)
+      HANDLE_OPERAND(writemaskRegister)
     HANDLE_OPTIONAL(rmRegister)
     HANDLE_OPTIONAL(relocation)
     HANDLE_OPTIONAL(immediate)
@@ -841,16 +830,19 @@ void RecognizableInstr::emitInstructionSpecifier(DisassemblerTables &tables) {
   case X86Local::MRM5m:
   case X86Local::MRM6m:
   case X86Local::MRM7m:
-    // Operand 1 is a memory operand (possibly SIB-extended)
-    // Operand 2 (optional) is an immediate or relocation.
-    if (HasVEX_4VPrefix)
-      assert(numPhysicalOperands >= 2 && numPhysicalOperands <= 3 &&
-             "Unexpected number of operands for MRMnMFrm");
-    else
-      assert(numPhysicalOperands >= 1 && numPhysicalOperands <= 2 &&
-             "Unexpected number of operands for MRMnMFrm");
+    {
+      // Operand 1 is a memory operand (possibly SIB-extended)
+      // Operand 2 (optional) is an immediate or relocation.
+      unsigned kOp = (HasEVEX_K) ? 1:0;
+      unsigned Op4v = (HasVEX_4VPrefix) ? 1:0;
+      if (numPhysicalOperands < 1 + kOp + Op4v ||
+          numPhysicalOperands > 2 + kOp + Op4v)
+        llvm_unreachable("Unexpected number of operands for MRMnm");
+    }
     if (HasVEX_4VPrefix)
       HANDLE_OPERAND(vvvvRegister)
+    if (HasEVEX_K)
+      HANDLE_OPERAND(writemaskRegister)
     HANDLE_OPERAND(memory)
     HANDLE_OPTIONAL(relocation)
     break;
@@ -902,6 +894,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
   uint8_t       opcodeToSet = 0;
 
   switch (Prefix) {
+  default: llvm_unreachable("Invalid prefix!");
   // Extended two-byte opcodes can start with f2 0f, f3 0f, or 0f
   case X86Local::XD:
   case X86Local::XS:
@@ -1015,6 +1008,63 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
       filter = new DumbFilter();
     opcodeToSet = Opcode;
     break;
+  case X86Local::XOP8:
+    opcodeType = XOP8_MAP;
+    if (needsModRMForDecode(Form))
+      filter = new ModFilter(isRegFormat(Form));
+    else
+      filter = new DumbFilter();
+    opcodeToSet = Opcode;
+    break;
+  case X86Local::XOP9:
+    opcodeType = XOP9_MAP;
+    switch (Opcode) {
+    default:
+      if (needsModRMForDecode(Form))
+        filter = new ModFilter(isRegFormat(Form));
+      else
+        filter = new DumbFilter();
+      break;
+#define EXTENSION_TABLE(n) case 0x##n:
+    XOP9_MAP_EXTENSION_TABLES
+#undef EXTENSION_TABLE
+      switch (Form) {
+      default:
+        llvm_unreachable("Unhandled XOP9 extended opcode");
+      case X86Local::MRM0r:
+      case X86Local::MRM1r:
+      case X86Local::MRM2r:
+      case X86Local::MRM3r:
+      case X86Local::MRM4r:
+      case X86Local::MRM5r:
+      case X86Local::MRM6r:
+      case X86Local::MRM7r:
+        filter = new ExtendedFilter(true, Form - X86Local::MRM0r);
+        break;
+      case X86Local::MRM0m:
+      case X86Local::MRM1m:
+      case X86Local::MRM2m:
+      case X86Local::MRM3m:
+      case X86Local::MRM4m:
+      case X86Local::MRM5m:
+      case X86Local::MRM6m:
+      case X86Local::MRM7m:
+        filter = new ExtendedFilter(false, Form - X86Local::MRM0m);
+        break;
+      MRM_MAPPING
+      } // switch (Form)
+      break;
+    } // switch (Opcode)
+    opcodeToSet = Opcode;
+    break;
+  case X86Local::XOPA:
+    opcodeType = XOPA_MAP;
+    if (needsModRMForDecode(Form))
+      filter = new ModFilter(isRegFormat(Form));
+    else
+      filter = new DumbFilter();
+    opcodeToSet = Opcode;
+    break;
   case X86Local::D8:
   case X86Local::D9:
   case X86Local::DA:
@@ -1035,7 +1085,7 @@ void RecognizableInstr::emitDecodePath(DisassemblerTables &tables) const {
     opcodeToSet = 0xd8 + (Prefix - X86Local::D8);
     break;
   case X86Local::REP:
-  default:
+  case 0:
     opcodeType = ONEBYTE;
     switch (Opcode) {
 #define EXTENSION_TABLE(n) case 0x##n:
@@ -1166,6 +1216,7 @@ OperandType RecognizableInstr::typeFromString(const std::string &s,
   TYPE("i32i8imm",            TYPE_IMM32)
   TYPE("u32u8imm",            TYPE_IMM32)
   TYPE("GR32",                TYPE_Rv)
+  TYPE("GR32orGR64",          TYPE_R32)
   TYPE("i64mem",              TYPE_Mv)
   TYPE("i64i32imm",           TYPE_IMM64)
   TYPE("i64i8imm",            TYPE_IMM64)
@@ -1276,6 +1327,7 @@ OperandEncoding RecognizableInstr::rmRegisterEncodingFromString
    bool hasOpSizePrefix) {
   ENCODING("GR16",            ENCODING_RM)
   ENCODING("GR32",            ENCODING_RM)
+  ENCODING("GR32orGR64",      ENCODING_RM)
   ENCODING("GR64",            ENCODING_RM)
   ENCODING("GR8",             ENCODING_RM)
   ENCODING("VR128",           ENCODING_RM)
@@ -1299,6 +1351,7 @@ OperandEncoding RecognizableInstr::roRegisterEncodingFromString
    bool hasOpSizePrefix) {
   ENCODING("GR16",            ENCODING_REG)
   ENCODING("GR32",            ENCODING_REG)
+  ENCODING("GR32orGR64",      ENCODING_REG)
   ENCODING("GR64",            ENCODING_REG)
   ENCODING("GR8",             ENCODING_REG)
   ENCODING("VR128",           ENCODING_REG)
diff --git a/utils/TableGen/X86RecognizableInstr.h b/utils/TableGen/X86RecognizableInstr.h
index 7e1d362..4d4686e 100644
--- a/utils/TableGen/X86RecognizableInstr.h
+++ b/utils/TableGen/X86RecognizableInstr.h
@@ -72,6 +72,8 @@ private:
   bool HasEVEX_L2Prefix;
   /// The hasEVEX_K field from the record
   bool HasEVEX_K;
+  /// The hasEVEX_KZ field from the record
+  bool HasEVEX_KZ;
   /// The hasEVEX_B field from the record
   bool HasEVEX_B;
   /// The hasLockPrefix field from the record
diff --git a/utils/buildit/build_llvm b/utils/buildit/build_llvm
index 4ed25f2..9e93216 100755
--- a/utils/buildit/build_llvm
+++ b/utils/buildit/build_llvm
@@ -104,7 +104,8 @@ COMMON_CONFIGURE_OPTS="\
   --enable-assertions=$LLVM_ASSERTIONS \
   --enable-optimized=$LLVM_OPTIMIZED \
   --disable-bindings \
-  --disable-zlib"
+  --disable-zlib \
+  --enable-terminfo=no"
 
 COMMON_MAKEFLAGS="\
   UNIVERSAL=1 \
diff --git a/utils/emacs/llvm-mode.el b/utils/emacs/llvm-mode.el
index 25d9742..99d3294 100644
--- a/utils/emacs/llvm-mode.el
+++ b/utils/emacs/llvm-mode.el
@@ -32,16 +32,23 @@
                     "null" "undef" "to" "except" "not" "target" "endian" "little" "big"
                     "pointersize" "volatile" "fastcc" "coldcc" "cc") 'words) . font-lock-keyword-face)
    ;; Arithmetic and Logical Operators
-   `(,(regexp-opt '("add" "sub" "mul" "div" "rem" "and" "or" "xor"
+   `(,(regexp-opt '("add" "sub" "mul" "sdiv" "udiv" "urem" "srem" "and" "or" "xor"
                     "setne" "seteq" "setlt" "setgt" "setle" "setge") 'words) . font-lock-keyword-face)
    ;; Floating-point operators
    `(,(regexp-opt '("fadd" "fsub" "fmul" "fdiv" "frem") 'words) . font-lock-keyword-face)
    ;; Special instructions
-   `(,(regexp-opt '("phi" "tail" "call" "cast" "select" "to" "shl" "shr" "fcmp" "icmp" "vaarg" "vanext") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("phi" "tail" "call" "select" "to" "shl" "lshr" "ashr" "fcmp" "icmp" "va_arg" "landingpad") 'words) . font-lock-keyword-face)
    ;; Control instructions
-   `(,(regexp-opt '("ret" "br" "switch" "invoke" "unwind" "unreachable") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("ret" "br" "switch" "invoke" "resume" "unwind" "unreachable" "indirectbr") 'words) . font-lock-keyword-face)
    ;; Memory operators
-   `(,(regexp-opt '("malloc" "alloca" "free" "load" "store" "getelementptr") 'words) . font-lock-keyword-face)
+   `(,(regexp-opt '("malloc" "alloca" "free" "load" "store" "getelementptr" "fence" "cmpxchg" "atomicrmw") 'words) . font-lock-keyword-face)
+   ;; Casts
+   `(,(regexp-opt '("bitcast" "inttoptr" "ptrtoint" "trunc" "zext" "sext" "fptrunc" "fpext" "fptoui" "fptosi" "uitofp" "sitofp" "addrspacecast") 'words) . font-lock-keyword-face)
+   ;; Vector ops
+   `(,(regexp-opt '("extractelement" "insertelement" "shufflevector") 'words) . font-lock-keyword-face)
+   ;; Aggregate ops
+   `(,(regexp-opt '("extractvalue" "insertvalue") 'words) . font-lock-keyword-face)
+
    )
   "Syntax highlighting for LLVM"
   )
diff --git a/utils/kate/llvm.xml b/utils/kate/llvm.xml
index 1778cfc..dfacfb5 100644
--- a/utils/kate/llvm.xml
+++ b/utils/kate/llvm.xml
@@ -41,7 +41,6 @@
       <item> private </item>
       <item> linker_private </item>
       <item> linker_private_weak </item>
-      <item> linker_private_weak_def_auto </item>
       <item> internal </item>
       <item> available_externally </item>
       <item> linkonce </item>
@@ -85,6 +84,7 @@
       <item> noredzone </item>
       <item> noreturn </item>
       <item> nounwind </item>
+      <item> optnone </item>
       <item> optsize </item>
       <item> readnone </item>
       <item> readonly </item>
@@ -157,6 +157,7 @@
       <item> ptrtoint </item>
       <item> inttoptr </item>
       <item> bitcast </item>
+      <item> addrspacecast </item>
       <item> icmp </item>
       <item> fcmp </item>
       <item> phi </item>
diff --git a/utils/lit/TODO b/utils/lit/TODO
index d2ff842..c1a60c6 100644
--- a/utils/lit/TODO
+++ b/utils/lit/TODO
@@ -1,26 +1,166 @@
- - Move temp directory name into local test config.
+================
+ lit TODO Items
+================
 
- - Add --show-unsupported, don't show by default?
+Infrastructure
+==============
 
- - Optionally use multiprocessing.
+1. Change to always load suites, then resolve command line arguments?
 
- - Support valgrind in all configs, and LLVM style valgrind.
+  Currently we expect each input argument to be a path on disk; we do a
+  recursive search to find the test suite for each item, but then we only do a
+  local search based at the input path to find tests. Additionally, for any path
+  that matches a file on disk we explicitly construct a test instance (bypassing
+  the formats on discovery implementation).
 
- - Support a timeout / ulimit.
+  This has a couple problems:
 
- - Rename 'lit' injected variable for config to be lit_config.
+  * The test format doesn't have control over the test instances that result
+    from file paths.
 
- - Allow import of 'lit' in test suite definitions.
+  * It isn't possible to specify virtual tests as inputs. For example, it is not
+    possible to specify an individual subtest to run with the googletest format.
 
- - Create an explicit test suite object (instead of using the top-level
-   TestingConfig object).
+  * The test format doesn't have full control over the discovery of tests in
+    subdirectories.
 
- - Allow 'lit' driver to cooperate with test suites to add options (or at least
-   sanitize accepted params).
+  Instead, we should move to a model whereby first all of the input specifiers
+  are resolved to test suites, and then the resolution of the input specifier is
+  delegated to each test suite. This could take a couple forms:
 
- - Consider move to identifying all tests by path-to-test-suite and then path to
+  * We could resolve to test suites, then fully load each test suite, then have
+    a fixed process to map input specifiers to tests in the test suite
+    (presumably based on path-in-suite derivations). This has the benefit of
+    being consistent across all test formats, but the downside of requiring
+    loading the entire test suite.
+
+  * We could delegate all of the resolution of specifiers to the test
+    suite. This would allow formats that anticipate large test suites to manage
+    their own resolution for better performance. We could provide a default
+    resolution strategy that was similar to what we do now (start at subpaths
+    for directories, but allow the test format control over what happens for
+    individual tests).
+
+2. Consider move to identifying all tests by path-to-test-suite and then path to
    subtest, and don't use test suite names.
 
- - Consider move to change workflow to always load suites, then resolve command
-   line arguments.
+  Currently the test suite name is presented as part of test names, but it has
+  no other useful function, and it is something that has to be skipped over to
+  cut-and-paste a name to subsequently use to rerun a test. If we just
+  represented each test suite by the path to its suite, then it would allow more
+  easy cut-and-paste of the test output lines. This has the downside that the
+  lines might get rather long.
+
+3. Allow 'lit' driver to cooperate with test formats and suites to add options
+   (or at least sanitize accepted params).
+
+  We have started to use the --params method more and more extensively, and it is
+  cumbersome and error prone. Additionally, there are currently various options
+  ``lit`` honors that should more correctly be specified as belonging to the
+  ShTest test format.
+  
+  It would be really nice if we could allow test formats and test suites to add
+  their own options to be parsed. The difficulty here, of course, is that we
+  don't know what test formats or test suites are in use until we have parsed the
+  input specifiers. For test formats we could ostensibly require all the possible
+  formats to be registered in order to have options, but for test suites we would
+  certainly have to load the suite before we can query it for what options it
+  understands.
+
+  That leaves us with the following options:
+
+  * Currently we could almost get away with parsing the input specifiers without
+    having done option parsing first (the exception is ``--config-prefix``) but
+    that isn't a very extensible design.
+
+  * We could make a distinction in the command line syntax for test format and
+    test suite options. For example, we could require something like::
+
+      lit -j 1 -sv input-specifier -- --some-format-option
+
+    which would be relatively easy to implement with optparser (I think).
+
+  * We could allow fully interspersed arguments by first extracting the options
+    lit knows about and parsing them, then dispatching the remainder to the
+    formats. This seems the most convenient for users, who are unlikely to care
+    about (or even be aware of) the distinction between the generic lit
+    infrastructure and format or suite specific options.
+
+4. Eliminate duplicate execution models for ShTest tests.
+
+  Currently, the ShTest format uses tests written with shell-script like syntax,
+  and executes them in one of two ways. The first way is by converting them into
+  a bash script and literally executing externally them using bash. The second
+  way is through the use of an internal shell parser and shell execution code
+  (built on the subprocess module). The external execution mode is used on most
+  Unix systems that have bash, the internal execution mode is used on Windows.
+
+  Having two ways to do the same thing is error prone and leads to unnecessary
+  complexity in the testing environment. Additionally, because the mode that
+  converts scripts to bash doesn't try and validate the syntax, it is possible
+  to write tests that use bash shell features unsupported by the internal
+  shell. Such tests won't work on Windows but this may not be obvious to the
+  developer writing the test.
+
+  Another limitation is that when executing the scripts externally, the ShTest
+  format has no idea which commands fail, or what output comes from which
+  commands, so this limits how convenient the output of ShTest failures can be
+  and limits other features (for example, knowing what temporary files were
+  written).
+
+  We should eliminate having two ways of executing the same tests to reduce
+  platform differences and make it easier to develop new features in the ShTest
+  module. This is currently blocked on:
+
+  * The external execution mode is faster in some situations, because it avoids
+    being bottlenecked on the GIL. This can hopefully be obviated simply by
+    using --use-processes.
+
+  * Some tests in LLVM/Clang are explicitly disabled with the internal shell
+    (because they use features specific to bash). We would need to rewrite these
+    tests, or add additional features to the internal shell handling to allow
+    them to pass.
+
+5. Consider changing core to support setup vs. execute distinction.
+
+  Many of the existing test formats are cleanly divided into two phases, once
+  parses the test format and extracts XFAIL and REQUIRES information, etc., and
+  the other code actually executes the test.
+
+  We could make this distinction part of the core infrastructure and that would
+  enable a couple things:
+
+  * The REQUIREs handling could be lifted to the core, which is nice.
+
+  * This would provide a clear place to insert subtest support, because the
+    setup phase could be responsible for providing subtests back to the
+    core. That would provide part of the infrastructure to parallelize them, for
+    example, and would probably interact well with other possible features like
+    parameterized tests.
+
+  * This affords a clean implementation of --no-execute.
+
+  * One possible downside could be for test formats that cannot determine their
+    subtests without having executed the test. Supporting such formats would
+    either force the test to actually be executed in the setup stage (which
+    might be ok, as long as the API was explicitly phrased to support that), or
+    would mean we are forced into supporting subtests as return values from the
+    execute phase.
+
+  Any format can just keep all of its code in execute, presumably, so the only
+  cost of implementing this is its impact on the API and futures changes.
+
+
+Miscellaneous
+=============
+
+* Move temp directory name into local test config.
+
+* Add --show-unsupported, don't show by default?
+
+* Support valgrind in all configs, and LLVM style valgrind.
+
+* Support a timeout / ulimit.
 
+* Create an explicit test suite object (instead of using the top-level
+  TestingConfig object).
diff --git a/utils/lit/examples/README.txt b/utils/lit/examples/README.txt
new file mode 100644
index 0000000..a59daa8
--- /dev/null
+++ b/utils/lit/examples/README.txt
@@ -0,0 +1,7 @@
+==============
+ lit Examples
+==============
+
+This directory contains examples of 'lit' test suite configurations. The test
+suites they define can be run with 'lit examples/example-name', for more details
+see the README in each example.
diff --git a/utils/lit/examples/many-tests/README.txt b/utils/lit/examples/many-tests/README.txt
new file mode 100644
index 0000000..6bffff1
--- /dev/null
+++ b/utils/lit/examples/many-tests/README.txt
@@ -0,0 +1,10 @@
+========================
+ Many Tests lit Example
+========================
+
+This directory contains a trivial lit test suite configuration that defines a
+custom test format which just generates a large (N=10000) number of tests that
+do a small amount of work in the Python test execution code.
+
+This test suite is useful for testing the performance of lit on large numbers of
+tests.
diff --git a/utils/lit/examples/many-tests/lit.cfg b/utils/lit/examples/many-tests/lit.cfg
new file mode 100644
index 0000000..8f7b940
--- /dev/null
+++ b/utils/lit/examples/many-tests/lit.cfg
@@ -0,0 +1,23 @@
+# -*- Python -*-
+
+from lit import Test
+
+class ManyTests(object):
+    def __init__(self, N=10000):
+        self.N = N
+
+    def getTestsInDirectory(self, testSuite, path_in_suite,
+                            litConfig, localConfig):
+        for i in range(self.N):
+            test_name = 'test-%04d' % (i,)
+            yield Test.Test(testSuite, path_in_suite + (test_name,),
+                            localConfig)
+
+    def execute(self, test, litConfig):
+        # Do a "non-trivial" amount of Python work.
+        sum = 0
+        for i in range(10000):
+            sum += i
+        return Test.PASS,''
+
+config.test_format = ManyTests()
diff --git a/utils/lit/lit/ExampleTests/Clang/fsyntax-only.c b/utils/lit/lit/ExampleTests/Clang/fsyntax-only.c
deleted file mode 100644
index a4a064b..0000000
--- a/utils/lit/lit/ExampleTests/Clang/fsyntax-only.c
+++ /dev/null
@@ -1,4 +0,0 @@
-// RUN: clang -fsyntax-only -Xclang -verify %s
-
-int f0(void) {} // expected-warning {{control reaches end of non-void function}}
-
diff --git a/utils/lit/lit/ExampleTests/Clang/lit.cfg b/utils/lit/lit/ExampleTests/Clang/lit.cfg
deleted file mode 100644
index 9295bd9..0000000
--- a/utils/lit/lit/ExampleTests/Clang/lit.cfg
+++ /dev/null
@@ -1,47 +0,0 @@
-# -*- Python -*-
-
-# Configuration file for the 'lit' test runner.
-
-# name: The name of this test suite.
-config.name = 'Clang'
-
-# testFormat: The test format to use to interpret tests.
-#
-# For now we require '&&' between commands, until they get globally killed and
-# the test runner updated.
-config.test_format = lit.formats.ShTest(execute_external = True)
-
-# suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.c', '.cpp', '.m', '.mm']
-
-# target_triple: Used by ShTest format for XFAIL checks.
-config.target_triple = 'foo'
-
-###
-
-# Discover the 'clang' and 'clangcc' to use.
-
-import os
-
-def inferClang(PATH):
-    # Determine which clang to use.
-    clang = os.getenv('CLANG')
-
-    # If the user set clang in the environment, definitely use that and don't
-    # try to validate.
-    if clang:
-        return clang
-
-    # Otherwise look in the path.
-    clang = lit.util.which('clang', PATH)
-
-    if not clang:
-        lit.fatal("couldn't find 'clang' program, try setting "
-                  "CLANG in your environment")
-
-    return clang
-
-clang = inferClang(config.environment['PATH'])
-if not lit.quiet:
-    lit.note('using clang: %r' % clang)
-config.substitutions.append( (' clang ', ' ' + clang + ' ') )
diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/data.txt b/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/data.txt
deleted file mode 100644
index 45b983b..0000000
--- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/data.txt
+++ /dev/null
@@ -1 +0,0 @@
-hi
diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/pct-S.ll b/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/pct-S.ll
deleted file mode 100644
index 3ff3633..0000000
--- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/pct-S.ll
+++ /dev/null
@@ -1 +0,0 @@
-; RUN: grep "hi" %S/data.txt
diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg
deleted file mode 100644
index 533c445..0000000
--- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg
+++ /dev/null
@@ -1,66 +0,0 @@
-# -*- Python -*-
-
-# Configuration file for the 'lit' test runner.
-
-import os
-
-# name: The name of this test suite.
-config.name = 'LLVM'
-
-# testFormat: The test format to use to interpret tests.
-config.test_format = lit.formats.ShTest()
-
-# suffixes: A list of file extensions to treat as test files, this is actually
-# set by on_clone().
-config.suffixes = [ '.ll' ]
-
-# test_source_root: The root path where tests are located.
-config.test_source_root = os.path.dirname(__file__)
-
-# test_exec_root: The root path where tests should be run.
-llvm_obj_root = getattr(config, 'llvm_obj_root', None)
-if llvm_obj_root is not None:
-    config.test_exec_root = os.path.join(llvm_obj_root, 'test')
-
-###
-
-import os
-
-# Check that the object root is known.
-if config.test_exec_root is None:
-    # Otherwise, we haven't loaded the site specific configuration (the user is
-    # probably trying to run on a test file directly, and either the site
-    # configuration hasn't been created by the build system, or we are in an
-    # out-of-tree build situation).
-
-    # Try to detect the situation where we are using an out-of-tree build by
-    # looking for 'llvm-config'.
-    #
-    # FIXME: I debated (i.e., wrote and threw away) adding logic to
-    # automagically generate the lit.site.cfg if we are in some kind of fresh
-    # build situation. This means knowing how to invoke the build system
-    # though, and I decided it was too much magic.
-
-    llvm_config = lit.util.which('llvm-config', config.environment['PATH'])
-    if not llvm_config:
-        lit.fatal('No site specific configuration available!')
-
-    # Get the source and object roots.
-    llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip()
-    llvm_obj_root = lit.util.capture(['llvm-config', '--obj-root']).strip()
-
-    # Validate that we got a tree which points to here.
-    this_src_root = os.path.dirname(config.test_source_root)
-    if os.path.realpath(llvm_src_root) != os.path.realpath(this_src_root):
-        lit.fatal('No site specific configuration available!')
-
-    # Check that the site specific configuration exists.
-    site_cfg = os.path.join(llvm_obj_root, 'test', 'lit.site.cfg')
-    if not os.path.exists(site_cfg):
-        lit.fatal('No site specific configuration available!')
-
-    # Okay, that worked. Notify the user of the automagic, and reconfigure.
-    lit.note('using out-of-tree build at %r' % llvm_obj_root)
-    lit.load_config(config, site_cfg)
-    raise SystemExit
-
diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.site.cfg b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.site.cfg
deleted file mode 100644
index d45f3ac..0000000
--- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.site.cfg
+++ /dev/null
@@ -1,7 +0,0 @@
-# -*- Python -*-
-
-# Preserve some key paths for use by main LLVM test suite config.
-config.llvm_obj_root = os.path.dirname(os.path.dirname(__file__))
-
-# Let the main config do the real work.
-lit.load_config(config, os.path.join(config.llvm_obj_root, 'test/lit.cfg'))
diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/lit.local.cfg b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/lit.local.cfg
deleted file mode 100644
index 80d0c7e..0000000
--- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.excludes = ['src']
diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/Foo/lit.local.cfg b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/Foo/lit.local.cfg
deleted file mode 100644
index e69de29..0000000
--- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/Foo/lit.local.cfg
+++ /dev/null
diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/lit.site.cfg b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/lit.site.cfg
deleted file mode 100644
index 94a02d8..0000000
--- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/obj/test/lit.site.cfg
+++ /dev/null
@@ -1,8 +0,0 @@
-# -*- Python -*-
-
-# Preserve some key paths for use by main LLVM test suite config.
-config.llvm_obj_root = os.path.dirname(os.path.dirname(__file__))
-
-# Let the main config do the real work.
-lit.load_config(config, os.path.join(config.llvm_obj_root,
-                                     '../src/test/lit.cfg'))
diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/Foo/data.txt b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/Foo/data.txt
deleted file mode 100644
index 45b983b..0000000
--- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/Foo/data.txt
+++ /dev/null
@@ -1 +0,0 @@
-hi
diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/Foo/pct-S.ll b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/Foo/pct-S.ll
deleted file mode 100644
index 3ff3633..0000000
--- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/Foo/pct-S.ll
+++ /dev/null
@@ -1 +0,0 @@
-; RUN: grep "hi" %S/data.txt
diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg
deleted file mode 100644
index 533c445..0000000
--- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg
+++ /dev/null
@@ -1,66 +0,0 @@
-# -*- Python -*-
-
-# Configuration file for the 'lit' test runner.
-
-import os
-
-# name: The name of this test suite.
-config.name = 'LLVM'
-
-# testFormat: The test format to use to interpret tests.
-config.test_format = lit.formats.ShTest()
-
-# suffixes: A list of file extensions to treat as test files, this is actually
-# set by on_clone().
-config.suffixes = [ '.ll' ]
-
-# test_source_root: The root path where tests are located.
-config.test_source_root = os.path.dirname(__file__)
-
-# test_exec_root: The root path where tests should be run.
-llvm_obj_root = getattr(config, 'llvm_obj_root', None)
-if llvm_obj_root is not None:
-    config.test_exec_root = os.path.join(llvm_obj_root, 'test')
-
-###
-
-import os
-
-# Check that the object root is known.
-if config.test_exec_root is None:
-    # Otherwise, we haven't loaded the site specific configuration (the user is
-    # probably trying to run on a test file directly, and either the site
-    # configuration hasn't been created by the build system, or we are in an
-    # out-of-tree build situation).
-
-    # Try to detect the situation where we are using an out-of-tree build by
-    # looking for 'llvm-config'.
-    #
-    # FIXME: I debated (i.e., wrote and threw away) adding logic to
-    # automagically generate the lit.site.cfg if we are in some kind of fresh
-    # build situation. This means knowing how to invoke the build system
-    # though, and I decided it was too much magic.
-
-    llvm_config = lit.util.which('llvm-config', config.environment['PATH'])
-    if not llvm_config:
-        lit.fatal('No site specific configuration available!')
-
-    # Get the source and object roots.
-    llvm_src_root = lit.util.capture(['llvm-config', '--src-root']).strip()
-    llvm_obj_root = lit.util.capture(['llvm-config', '--obj-root']).strip()
-
-    # Validate that we got a tree which points to here.
-    this_src_root = os.path.dirname(config.test_source_root)
-    if os.path.realpath(llvm_src_root) != os.path.realpath(this_src_root):
-        lit.fatal('No site specific configuration available!')
-
-    # Check that the site specific configuration exists.
-    site_cfg = os.path.join(llvm_obj_root, 'test', 'lit.site.cfg')
-    if not os.path.exists(site_cfg):
-        lit.fatal('No site specific configuration available!')
-
-    # Okay, that worked. Notify the user of the automagic, and reconfigure.
-    lit.note('using out-of-tree build at %r' % llvm_obj_root)
-    lit.load_config(config, site_cfg)
-    raise SystemExit
-
diff --git a/utils/lit/lit/ExampleTests/ManyTests/lit.local.cfg b/utils/lit/lit/ExampleTests/ManyTests/lit.local.cfg
deleted file mode 100644
index 6cc4752..0000000
--- a/utils/lit/lit/ExampleTests/ManyTests/lit.local.cfg
+++ /dev/null
@@ -1,23 +0,0 @@
-# -*- Python -*-
-
-Test = lit.Test
-
-class ManyTests(object):
-    def __init__(self, N=10000):
-        self.N = N
-
-    def getTestsInDirectory(self, testSuite, path_in_suite,
-                            litConfig, localConfig):
-        for i in range(self.N):
-            test_name = 'test-%04d' % (i,)
-            yield Test.Test(testSuite, path_in_suite + (test_name,),
-                            localConfig)
-
-    def execute(self, test, litConfig):
-        # Do a "non-trivial" amount of Python work.
-        sum = 0
-        for i in range(10000):
-            sum += i
-        return Test.PASS,''
-
-config.test_format = ManyTests()
diff --git a/utils/lit/lit/ExampleTests/ShExternal/lit.local.cfg b/utils/lit/lit/ExampleTests/ShExternal/lit.local.cfg
deleted file mode 100644
index 1061da6..0000000
--- a/utils/lit/lit/ExampleTests/ShExternal/lit.local.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-# -*- Python -*-
-
-config.test_format = lit.formats.ShTest(execute_external = True)
-
-config.suffixes = ['.c']
-
diff --git a/utils/lit/lit/ExampleTests/ShInternal/lit.local.cfg b/utils/lit/lit/ExampleTests/ShInternal/lit.local.cfg
deleted file mode 100644
index 448eaa4..0000000
--- a/utils/lit/lit/ExampleTests/ShInternal/lit.local.cfg
+++ /dev/null
@@ -1,6 +0,0 @@
-# -*- Python -*-
-
-config.test_format = lit.formats.ShTest(execute_external = False)
-
-config.suffixes = ['.c']
-
diff --git a/utils/lit/lit/ExampleTests/fail.c b/utils/lit/lit/ExampleTests/fail.c
deleted file mode 100644
index 84db41a..0000000
--- a/utils/lit/lit/ExampleTests/fail.c
+++ /dev/null
@@ -1,2 +0,0 @@
-// RUN: echo 'I am some stdout'
-// RUN: false
diff --git a/utils/lit/lit/ExampleTests/lit.cfg b/utils/lit/lit/ExampleTests/lit.cfg
deleted file mode 100644
index 164daba..0000000
--- a/utils/lit/lit/ExampleTests/lit.cfg
+++ /dev/null
@@ -1,26 +0,0 @@
-# -*- Python -*-
-
-# Configuration file for the 'lit' test runner.
-
-# name: The name of this test suite.
-config.name = 'Examples'
-
-# suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.c', '.cpp', '.m', '.mm', '.ll']
-
-# testFormat: The test format to use to interpret tests.
-config.test_format = lit.formats.ShTest()
-
-# test_source_root: The path where tests are located (default is the test suite
-# root).
-config.test_source_root = None
-
-# test_exec_root: The path where tests are located (default is the test suite
-# root).
-config.test_exec_root = None
-
-# target_triple: Used by ShTest format for XFAIL checks.
-config.target_triple = 'foo'
-
-# available_features: Used by ShTest format for REQUIRES checks.
-config.available_features.add('some-feature-name')
diff --git a/utils/lit/lit/ExampleTests/pass.c b/utils/lit/lit/ExampleTests/pass.c
deleted file mode 100644
index 5c1031c..0000000
--- a/utils/lit/lit/ExampleTests/pass.c
+++ /dev/null
@@ -1 +0,0 @@
-// RUN: true
diff --git a/utils/lit/lit/ExampleTests/required-and-missing.c b/utils/lit/lit/ExampleTests/required-and-missing.c
deleted file mode 100644
index 47ba72e..0000000
--- a/utils/lit/lit/ExampleTests/required-and-missing.c
+++ /dev/null
@@ -1,4 +0,0 @@
-// This test shouldn't be run, the required feature is missing.
-//
-// RUN: false
-// REQUIRES: some-missing-feature-name
diff --git a/utils/lit/lit/ExampleTests/required-and-present.c b/utils/lit/lit/ExampleTests/required-and-present.c
deleted file mode 100644
index 2a09e08..0000000
--- a/utils/lit/lit/ExampleTests/required-and-present.c
+++ /dev/null
@@ -1,2 +0,0 @@
-// RUN: true
-// REQUIRES: some-feature-name
diff --git a/utils/lit/lit/ExampleTests/vg-fail.c b/utils/lit/lit/ExampleTests/vg-fail.c
deleted file mode 100644
index e3339ff..0000000
--- a/utils/lit/lit/ExampleTests/vg-fail.c
+++ /dev/null
@@ -1,4 +0,0 @@
-// This test should XPASS, when run without valgrind.
-
-// RUN: true
-// XFAIL: valgrind
diff --git a/utils/lit/lit/ExampleTests/xfail-feature.c b/utils/lit/lit/ExampleTests/xfail-feature.c
deleted file mode 100644
index 3444bf8..0000000
--- a/utils/lit/lit/ExampleTests/xfail-feature.c
+++ /dev/null
@@ -1,4 +0,0 @@
-// This test should XPASS.
-
-// RUN: true
-// XFAIL: some-feature-name
diff --git a/utils/lit/lit/ExampleTests/xfail.c b/utils/lit/lit/ExampleTests/xfail.c
deleted file mode 100644
index b36cd99..0000000
--- a/utils/lit/lit/ExampleTests/xfail.c
+++ /dev/null
@@ -1,2 +0,0 @@
-// RUN: false
-// XFAIL: *
diff --git a/utils/lit/lit/ExampleTests/xpass.c b/utils/lit/lit/ExampleTests/xpass.c
deleted file mode 100644
index ad84990..0000000
--- a/utils/lit/lit/ExampleTests/xpass.c
+++ /dev/null
@@ -1,2 +0,0 @@
-// RUN: true
-// XFAIL
diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py
index bd7a603..b0dde5d 100644
--- a/utils/lit/lit/LitConfig.py
+++ b/utils/lit/lit/LitConfig.py
@@ -1,10 +1,12 @@
 from __future__ import absolute_import
+import inspect
 import os
+import sys
 
 import lit.Test
-import lit.TestFormats
+import lit.formats
 import lit.TestingConfig
-import lit.Util
+import lit.util
 
 class LitConfig:
     """LitConfig - Configuration data for a 'lit' test runner instance, shared
@@ -16,15 +18,6 @@ class LitConfig:
     easily.
     """
 
-    # Provide access to Test module.
-    Test = lit.Test
-
-    # Provide access to built-in formats.
-    formats = lit.TestFormats
-
-    # Provide access to built-in utility functions.
-    util = lit.Util
-
     def __init__(self, progname, path, quiet,
                  useValgrind, valgrindLeakCheck, valgrindArgs,
                  noExecute, debug, isWindows,
@@ -32,7 +25,7 @@ class LitConfig:
         # The name of the test runner.
         self.progname = progname
         # The items to add to the PATH environment variable.
-        self.path = list(map(str, path))
+        self.path = [str(p) for p in path]
         self.quiet = bool(quiet)
         self.useValgrind = bool(useValgrind)
         self.valgrindLeakCheck = bool(valgrindLeakCheck)
@@ -70,23 +63,17 @@ class LitConfig:
         path."""
         if self.debug:
             self.note('load_config from %r' % path)
-        return lit.TestingConfig.TestingConfig.frompath(
-            path, config.parent, self, mustExist = True, config = config)
+        config.load_from_path(path, self)
+        return config
 
     def getBashPath(self):
         """getBashPath - Get the path to 'bash'"""
-        import os
-
         if self.bashPath is not None:
             return self.bashPath
 
-        self.bashPath = lit.Util.which('bash', os.pathsep.join(self.path))
+        self.bashPath = lit.util.which('bash', os.pathsep.join(self.path))
         if self.bashPath is None:
-            # Check some known paths.
-            for path in ('/bin/bash', '/usr/bin/bash', '/usr/local/bin/bash'):
-                if os.path.exists(path):
-                    self.bashPath = path
-                    break
+            self.bashPath = lit.util.which('bash')
 
         if self.bashPath is None:
             self.warning("Unable to find 'bash'.")
@@ -96,13 +83,13 @@ class LitConfig:
 
     def getToolsPath(self, dir, paths, tools):
         if dir is not None and os.path.isabs(dir) and os.path.isdir(dir):
-            if not lit.Util.checkToolsPath(dir, tools):
+            if not lit.util.checkToolsPath(dir, tools):
                 return None
         else:
-            dir = lit.Util.whichTools(tools, paths)
+            dir = lit.util.whichTools(tools, paths)
 
         # bash
-        self.bashPath = lit.Util.which('bash', dir)
+        self.bashPath = lit.util.which('bash', dir)
         if self.bashPath is None:
             self.note("Unable to find 'bash.exe'.")
             self.bashPath = ''
@@ -110,8 +97,6 @@ class LitConfig:
         return dir
 
     def _write_message(self, kind, message):
-        import inspect, os, sys
-
         # Get the file/line where this message was generated.
         f = inspect.currentframe()
         # Step out of _write_message, and then out of wrapper.
@@ -134,6 +119,5 @@ class LitConfig:
         self.numErrors += 1
 
     def fatal(self, message):
-        import sys
         self._write_message('fatal', message)
         sys.exit(2)
diff --git a/utils/lit/lit/LitTestCase.py b/utils/lit/lit/LitTestCase.py
index 8951185..e04846c 100644
--- a/utils/lit/lit/LitTestCase.py
+++ b/utils/lit/lit/LitTestCase.py
@@ -1,5 +1,7 @@
+from __future__ import absolute_import
 import unittest
-import Test
+
+import lit.Test
 
 """
 TestCase adaptor for providing a 'unittest' compatible interface to 'lit' tests.
@@ -9,10 +11,10 @@ class UnresolvedError(RuntimeError):
     pass
         
 class LitTestCase(unittest.TestCase):
-    def __init__(self, test, lit_config):
+    def __init__(self, test, run):
         unittest.TestCase.__init__(self)
         self._test = test
-        self._lit_config = lit_config
+        self._run = run
 
     def id(self):
         return self._test.getFullName()
@@ -21,10 +23,12 @@ class LitTestCase(unittest.TestCase):
         return self._test.getFullName()
 
     def runTest(self):
-        tr, output = self._test.config.test_format.execute(
-            self._test, self._lit_config)
+        # Run the test.
+        self._run.execute_test(self._test)
 
-        if tr is Test.UNRESOLVED:
-            raise UnresolvedError(output)
-        elif tr.isFailure:
-            self.fail(output)
+        # Adapt the result to unittest.
+        result = self._test.result
+        if result.code is lit.Test.UNRESOLVED:
+            raise UnresolvedError(result.output)
+        elif result.code.isFailure:
+            self.fail(result.output)
diff --git a/utils/lit/lit/ProgressBar.py b/utils/lit/lit/ProgressBar.py
index 0454ba2..e3644f1 100644
--- a/utils/lit/lit/ProgressBar.py
+++ b/utils/lit/lit/ProgressBar.py
@@ -5,6 +5,10 @@
 
 import sys, re, time
 
+def to_bytes(str):
+    # Encode to Latin1 to get binary data.
+    return str.encode('ISO-8859-1')
+
 class TerminalController:
     """
     A class that can be used to portably generate formatted output to
@@ -116,26 +120,34 @@ class TerminalController:
         set_fg = self._tigetstr('setf')
         if set_fg:
             for i,color in zip(range(len(self._COLORS)), self._COLORS):
-                setattr(self, color, curses.tparm(set_fg, i) or '')
+                setattr(self, color, self._tparm(set_fg, i))
         set_fg_ansi = self._tigetstr('setaf')
         if set_fg_ansi:
             for i,color in zip(range(len(self._ANSICOLORS)), self._ANSICOLORS):
-                setattr(self, color, curses.tparm(set_fg_ansi, i) or '')
+                setattr(self, color, self._tparm(set_fg_ansi, i))
         set_bg = self._tigetstr('setb')
         if set_bg:
             for i,color in zip(range(len(self._COLORS)), self._COLORS):
-                setattr(self, 'BG_'+color, curses.tparm(set_bg, i) or '')
+                setattr(self, 'BG_'+color, self._tparm(set_bg, i))
         set_bg_ansi = self._tigetstr('setab')
         if set_bg_ansi:
             for i,color in zip(range(len(self._ANSICOLORS)), self._ANSICOLORS):
-                setattr(self, 'BG_'+color, curses.tparm(set_bg_ansi, i) or '')
+                setattr(self, 'BG_'+color, self._tparm(set_bg_ansi, i))
+
+    def _tparm(self, arg, index):
+        import curses
+        return curses.tparm(to_bytes(arg), index).decode('ascii') or ''
 
     def _tigetstr(self, cap_name):
         # String capabilities can include "delays" of the form "$<2>".
         # For any modern terminal, we should be able to just ignore
         # these, so strip them out.
         import curses
-        cap = curses.tigetstr(cap_name) or ''
+        cap = curses.tigetstr(cap_name)
+        if cap is None:
+            cap = ''
+        else:
+            cap = cap.decode('ascii')
         return re.sub(r'\$<\d+>[/*]?', '', cap)
 
     def render(self, template):
@@ -269,7 +281,6 @@ class ProgressBar:
             self.cleared = 1
 
 def test():
-    import time
     tc = TerminalController()
     p = ProgressBar(tc, 'Tests')
     for i in range(101):
diff --git a/utils/lit/lit/ShUtil.py b/utils/lit/lit/ShUtil.py
index fb0689b..1945ba7 100644
--- a/utils/lit/lit/ShUtil.py
+++ b/utils/lit/lit/ShUtil.py
@@ -1,7 +1,7 @@
 from __future__ import absolute_import
 import itertools
 
-import lit.Util
+import lit.util
 from lit.ShCommands import Command, Pipeline, Seq
 
 class ShLexer:
@@ -75,7 +75,7 @@ class ShLexer:
                 # Outside of a string, '\\' escapes everything.
                 self.eat()
                 if self.pos == self.end:
-                    lit.Util.warning(
+                    lit.util.warning(
                         "escape at end of quoted argument in: %r" % self.data)
                     return str
                 str += self.eat()
@@ -93,7 +93,7 @@ class ShLexer:
                 # Inside a '"' quoted string, '\\' only escapes the quote
                 # character and backslash, otherwise it is preserved.
                 if self.pos == self.end:
-                    lit.Util.warning(
+                    lit.util.warning(
                         "escape at end of quoted argument in: %r" % self.data)
                     return str
                 c = self.eat()
@@ -105,7 +105,7 @@ class ShLexer:
                     str += '\\' + c
             else:
                 str += c
-        lit.Util.warning("missing quote character in %r" % self.data)
+        lit.util.warning("missing quote character in %r" % self.data)
         return str
     
     def lex_arg_checked(self, c):
diff --git a/utils/lit/lit/Test.py b/utils/lit/lit/Test.py
index 9471e3a..b4988f5 100644
--- a/utils/lit/lit/Test.py
+++ b/utils/lit/lit/Test.py
@@ -1,8 +1,21 @@
 import os
 
-# Test results.
+# Test result codes.
+
+class ResultCode(object):
+    """Test result codes."""
+
+    # We override __new__ and __getnewargs__ to ensure that pickling still
+    # provides unique ResultCode objects in any particular instance.
+    _instances = {}
+    def __new__(cls, name, isFailure):
+        res = cls._instances.get(name)
+        if res is None:
+            cls._instances[name] = res = super(ResultCode, cls).__new__(cls)
+        return res
+    def __getnewargs__(self):
+        return (self.name, self.isFailure)
 
-class TestResult:
     def __init__(self, name, isFailure):
         self.name = name
         self.isFailure = isFailure
@@ -11,20 +24,87 @@ class TestResult:
         return '%s%r' % (self.__class__.__name__,
                          (self.name, self.isFailure))
 
-PASS        = TestResult('PASS', False)
-XFAIL       = TestResult('XFAIL', False)
-FAIL        = TestResult('FAIL', True)
-XPASS       = TestResult('XPASS', True)
-UNRESOLVED  = TestResult('UNRESOLVED', True)
-UNSUPPORTED = TestResult('UNSUPPORTED', False)
+PASS        = ResultCode('PASS', False)
+XFAIL       = ResultCode('XFAIL', False)
+FAIL        = ResultCode('FAIL', True)
+XPASS       = ResultCode('XPASS', True)
+UNRESOLVED  = ResultCode('UNRESOLVED', True)
+UNSUPPORTED = ResultCode('UNSUPPORTED', False)
 
-# Test classes.
+# Test metric values.
 
-class TestFormat:
-    """TestFormat - Test information provider."""
+class MetricValue(object):
+    def format(self):
+        """
+        format() -> str
 
-    def __init__(self, name):
-        self.name = name
+        Convert this metric to a string suitable for displaying as part of the
+        console output.
+        """
+        raise RuntimeError("abstract method")
+
+    def todata(self):
+        """
+        todata() -> json-serializable data
+
+        Convert this metric to content suitable for serializing in the JSON test
+        output.
+        """
+        raise RuntimeError("abstract method")
+
+class IntMetricValue(MetricValue):
+    def __init__(self, value):
+        self.value = value
+
+    def format(self):
+        return str(self.value)
+
+    def todata(self):
+        return self.value
+
+class RealMetricValue(MetricValue):
+    def __init__(self, value):
+        self.value = value
+
+    def format(self):
+        return '%.4f' % self.value
+
+    def todata(self):
+        return self.value
+
+# Test results.
+
+class Result(object):
+    """Wrapper for the results of executing an individual test."""
+
+    def __init__(self, code, output='', elapsed=None):
+        # The result code.
+        self.code = code
+        # The test output.
+        self.output = output
+        # The wall timing to execute the test, if timing.
+        self.elapsed = elapsed
+        # The metrics reported by this test.
+        self.metrics = {}
+
+    def addMetric(self, name, value):
+        """
+        addMetric(name, value)
+
+        Attach a test metric to the test result, with the given name and list of
+        values. It is an error to attempt to attach the metrics with the same
+        name multiple times.
+
+        Each value must be an instance of a MetricValue subclass.
+        """
+        if name in self.metrics:
+            raise ValueError("result already includes metrics for %r" % (
+                    name,))
+        if not isinstance(value, MetricValue):
+            raise TypeError("unexpected metric value: %r" % (value,))
+        self.metrics[name] = value
+
+# Test classes.
 
 class TestSuite:
     """TestSuite - Information on a group of tests.
@@ -52,27 +132,28 @@ class Test:
         self.suite = suite
         self.path_in_suite = path_in_suite
         self.config = config
-        # The test result code, once complete.
+        # A list of conditions under which this test is expected to fail. These
+        # can optionally be provided by test format handlers, and will be
+        # honored when the test result is supplied.
+        self.xfails = []
+        # The test result, once complete.
         self.result = None
-        # Any additional output from the test, once complete.
-        self.output = None
-        # The wall time to execute this test, if timing and once complete.
-        self.elapsed = None
-        # The repeat index of this test, or None.
-        self.index = None
-
-    def copyWithIndex(self, index):
-        import copy
-        res = copy.copy(self)
-        res.index = index
-        return res
 
-    def setResult(self, result, output, elapsed):
-        assert self.result is None, "Test result already set!"
+    def setResult(self, result):
+        if self.result is not None:
+            raise ArgumentError("test result already set")
+        if not isinstance(result, Result):
+            raise ArgumentError("unexpected result type")
+
         self.result = result
-        self.output = output
-        self.elapsed = elapsed
 
+        # Apply the XFAIL handling to resolve the result exit code.
+        if self.isExpectedToFail():
+            if self.result.code == PASS:
+                self.result.code = XPASS
+            elif self.result.code == FAIL:
+                self.result.code = XFAIL
+        
     def getFullName(self):
         return self.suite.config.name + ' :: ' + '/'.join(self.path_in_suite)
 
@@ -81,3 +162,29 @@ class Test:
 
     def getExecPath(self):
         return self.suite.getExecPath(self.path_in_suite)
+
+    def isExpectedToFail(self):
+        """
+        isExpectedToFail() -> bool
+
+        Check whether this test is expected to fail in the current
+        configuration. This check relies on the test xfails property which by
+        some test formats may not be computed until the test has first been
+        executed.
+        """
+
+        # Check if any of the xfails match an available feature or the target.
+        for item in self.xfails:
+            # If this is the wildcard, it always fails.
+            if item == '*':
+                return True
+
+            # If this is an exact match for one of the features, it fails.
+            if item in self.config.available_features:
+                return True
+
+            # If this is a part of the target triple, it fails.
+            if item in self.suite.config.target_triple:
+                return True
+
+        return False
diff --git a/utils/lit/lit/TestFormats.py b/utils/lit/lit/TestFormats.py
deleted file mode 100644
index 9e0c7a0..0000000
--- a/utils/lit/lit/TestFormats.py
+++ /dev/null
@@ -1,229 +0,0 @@
-from __future__ import absolute_import
-import os
-import sys
-
-import lit.Test
-import lit.TestRunner
-import lit.Util
-
-kIsWindows = sys.platform in ['win32', 'cygwin']
-
-class GoogleTest(object):
-    def __init__(self, test_sub_dir, test_suffix):
-        self.test_sub_dir = os.path.normcase(str(test_sub_dir)).split(';')
-        self.test_suffix = str(test_suffix)
-
-        # On Windows, assume tests will also end in '.exe'.
-        if kIsWindows:
-            self.test_suffix += '.exe'
-
-    def getGTestTests(self, path, litConfig, localConfig):
-        """getGTestTests(path) - [name]
-
-        Return the tests available in gtest executable.
-
-        Args:
-          path: String path to a gtest executable
-          litConfig: LitConfig instance
-          localConfig: TestingConfig instance"""
-
-        try:
-            lines = lit.Util.capture([path, '--gtest_list_tests'],
-                                     env=localConfig.environment)
-            if kIsWindows:
-              lines = lines.replace('\r', '')
-            lines = lines.split('\n')
-        except:
-            litConfig.error("unable to discover google-tests in %r" % path)
-            raise StopIteration
-
-        nested_tests = []
-        for ln in lines:
-            if not ln.strip():
-                continue
-
-            prefix = ''
-            index = 0
-            while ln[index*2:index*2+2] == '  ':
-                index += 1
-            while len(nested_tests) > index:
-                nested_tests.pop()
-
-            ln = ln[index*2:]
-            if ln.endswith('.'):
-                nested_tests.append(ln)
-            else:
-                yield ''.join(nested_tests) + ln
-
-    # Note: path_in_suite should not include the executable name.
-    def getTestsInExecutable(self, testSuite, path_in_suite, execpath,
-                             litConfig, localConfig):
-        if not execpath.endswith(self.test_suffix):
-            return
-        (dirname, basename) = os.path.split(execpath)
-        # Discover the tests in this executable.
-        for testname in self.getGTestTests(execpath, litConfig, localConfig):
-            testPath = path_in_suite + (basename, testname)
-            yield lit.Test.Test(testSuite, testPath, localConfig)
-
-    def getTestsInDirectory(self, testSuite, path_in_suite,
-                            litConfig, localConfig):
-        source_path = testSuite.getSourcePath(path_in_suite)
-        for filename in os.listdir(source_path):
-            filepath = os.path.join(source_path, filename)
-            if os.path.isdir(filepath):
-                # Iterate over executables in a directory.
-                if not os.path.normcase(filename) in self.test_sub_dir:
-                    continue
-                dirpath_in_suite = path_in_suite + (filename, )
-                for subfilename in os.listdir(filepath):
-                    execpath = os.path.join(filepath, subfilename)
-                    for test in self.getTestsInExecutable(
-                            testSuite, dirpath_in_suite, execpath,
-                            litConfig, localConfig):
-                      yield test
-            elif ('.' in self.test_sub_dir):
-                for test in self.getTestsInExecutable(
-                        testSuite, path_in_suite, filepath,
-                        litConfig, localConfig):
-                    yield test
-
-    def execute(self, test, litConfig):
-        testPath,testName = os.path.split(test.getSourcePath())
-        while not os.path.exists(testPath):
-            # Handle GTest parametrized and typed tests, whose name includes
-            # some '/'s.
-            testPath, namePrefix = os.path.split(testPath)
-            testName = os.path.join(namePrefix, testName)
-
-        cmd = [testPath, '--gtest_filter=' + testName]
-        if litConfig.useValgrind:
-            cmd = litConfig.valgrindArgs + cmd
-
-        if litConfig.noExecute:
-            return lit.Test.PASS, ''
-
-        out, err, exitCode = lit.TestRunner.executeCommand(
-            cmd, env=test.config.environment)
-
-        if not exitCode:
-            return lit.Test.PASS,''
-
-        return lit.Test.FAIL, out + err
-
-###
-
-class FileBasedTest(object):
-    def getTestsInDirectory(self, testSuite, path_in_suite,
-                            litConfig, localConfig):
-        source_path = testSuite.getSourcePath(path_in_suite)
-        for filename in os.listdir(source_path):
-            # Ignore dot files and excluded tests.
-            if (filename.startswith('.') or
-                filename in localConfig.excludes):
-                continue
-
-            filepath = os.path.join(source_path, filename)
-            if not os.path.isdir(filepath):
-                base,ext = os.path.splitext(filename)
-                if ext in localConfig.suffixes:
-                    yield lit.Test.Test(testSuite, path_in_suite + (filename,),
-                                        localConfig)
-
-class ShTest(FileBasedTest):
-    def __init__(self, execute_external = False):
-        self.execute_external = execute_external
-
-    def execute(self, test, litConfig):
-        return lit.TestRunner.executeShTest(test, litConfig,
-                                            self.execute_external)
-
-###
-
-import re
-import tempfile
-
-class OneCommandPerFileTest:
-    # FIXME: Refactor into generic test for running some command on a directory
-    # of inputs.
-
-    def __init__(self, command, dir, recursive=False,
-                 pattern=".*", useTempInput=False):
-        if isinstance(command, str):
-            self.command = [command]
-        else:
-            self.command = list(command)
-        if dir is not None:
-            dir = str(dir)
-        self.dir = dir
-        self.recursive = bool(recursive)
-        self.pattern = re.compile(pattern)
-        self.useTempInput = useTempInput
-
-    def getTestsInDirectory(self, testSuite, path_in_suite,
-                            litConfig, localConfig):
-        dir = self.dir
-        if dir is None:
-            dir = testSuite.getSourcePath(path_in_suite)
-
-        for dirname,subdirs,filenames in os.walk(dir):
-            if not self.recursive:
-                subdirs[:] = []
-
-            subdirs[:] = [d for d in subdirs
-                          if (d != '.svn' and
-                              d not in localConfig.excludes)]
-
-            for filename in filenames:
-                if (filename.startswith('.') or
-                    not self.pattern.match(filename) or
-                    filename in localConfig.excludes):
-                    continue
-
-                path = os.path.join(dirname,filename)
-                suffix = path[len(dir):]
-                if suffix.startswith(os.sep):
-                    suffix = suffix[1:]
-                test = lit.Test.Test(
-                    testSuite, path_in_suite + tuple(suffix.split(os.sep)),
-                    localConfig)
-                # FIXME: Hack?
-                test.source_path = path
-                yield test
-
-    def createTempInput(self, tmp, test):
-        abstract
-
-    def execute(self, test, litConfig):
-        if test.config.unsupported:
-            return (lit.Test.UNSUPPORTED, 'Test is unsupported')
-
-        cmd = list(self.command)
-
-        # If using temp input, create a temporary file and hand it to the
-        # subclass.
-        if self.useTempInput:
-            tmp = tempfile.NamedTemporaryFile(suffix='.cpp')
-            self.createTempInput(tmp, test)
-            tmp.flush()
-            cmd.append(tmp.name)
-        elif hasattr(test, 'source_path'):
-            cmd.append(test.source_path)
-        else:
-            cmd.append(test.getSourcePath())
-
-        out, err, exitCode = lit.TestRunner.executeCommand(cmd)
-
-        diags = out + err
-        if not exitCode and not diags.strip():
-            return lit.Test.PASS,''
-
-        # Try to include some useful information.
-        report = """Command: %s\n""" % ' '.join(["'%s'" % a
-                                                 for a in cmd])
-        if self.useTempInput:
-            report += """Temporary File: %s\n""" % tmp.name
-            report += "--\n%s--\n""" % open(tmp.name).read()
-        report += """Output:\n--\n%s--""" % diags
-
-        return lit.Test.FAIL, report
diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 989a992..9752417 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -3,14 +3,10 @@ import os, signal, subprocess, sys
 import re
 import platform
 import tempfile
-try:
-    from io import StringIO
-except ImportError:
-    from StringIO import StringIO
 
 import lit.ShUtil as ShUtil
 import lit.Test as Test
-import lit.Util as Util
+import lit.util
 
 class InternalShellError(Exception):
     def __init__(self, command, message):
@@ -25,25 +21,6 @@ kUseCloseFDs = not kIsWindows
 # Use temporary files to replace /dev/null on Windows.
 kAvoidDevNull = kIsWindows
 
-def executeCommand(command, cwd=None, env=None):
-    # Close extra file handles on UNIX (on Windows this cannot be done while
-    # also redirecting input).
-    close_fds = not kIsWindows
-
-    p = subprocess.Popen(command, cwd=cwd,
-                         stdin=subprocess.PIPE,
-                         stdout=subprocess.PIPE,
-                         stderr=subprocess.PIPE,
-                         env=env, close_fds=close_fds)
-    out,err = p.communicate()
-    exitCode = p.wait()
-
-    # Detect Ctrl-C in subprocess.
-    if exitCode == -signal.SIGINT:
-        raise KeyboardInterrupt
-
-    return out, err, exitCode
-
 def executeShCmd(cmd, cfg, cwd, results):
     if isinstance(cmd, ShUtil.Seq):
         if cmd.op == ';':
@@ -154,8 +131,8 @@ def executeShCmd(cmd, cfg, cwd, results):
 
         # Resolve the executable path ourselves.
         args = list(j.args)
-        args[0] = Util.which(args[0], cfg.environment['PATH'])
-        if not args[0]:
+        executable = lit.util.which(args[0], cfg.environment['PATH'])
+        if not executable:
             raise InternalShellError(j, '%r: command not found' % j.args[0])
 
         # Replace uses of /dev/null with temporary files.
@@ -168,6 +145,7 @@ def executeShCmd(cmd, cfg, cwd, results):
                     args[i] = f.name
 
         procs.append(subprocess.Popen(args, cwd=cwd,
+                                      executable = executable,
                                       stdin = stdin,
                                       stdout = stdout,
                                       stderr = stderr,
@@ -221,10 +199,22 @@ def executeShCmd(cmd, cfg, cwd, results):
         if res == -signal.SIGINT:
             raise KeyboardInterrupt
 
+        # Ensure the resulting output is always of string type.
+        try:
+            out = str(out.decode('ascii'))
+        except:
+            out = str(out)
+        try:
+            err = str(err.decode('ascii'))
+        except:
+            err = str(err)
+
         results.append((cmd.commands[i], out, err, res))
         if cmd.pipe_err:
             # Python treats the exit code as a signed char.
-            if res < 0:
+            if exitCode is None:
+                exitCode = res
+            elif res < 0:
                 exitCode = min(exitCode, res)
             else:
                 exitCode = max(exitCode, res)
@@ -250,7 +240,7 @@ def executeScriptInternal(test, litConfig, tmpBase, commands, cwd):
             cmds.append(ShUtil.ShParser(ln, litConfig.isWindows,
                                         test.config.pipefail).parse())
         except:
-            return (Test.FAIL, "shell parser error on: %r" % ln)
+            return lit.Test.Result(Test.FAIL, "shell parser error on: %r" % ln)
 
     cmd = cmds[0]
     for c in cmds[1:]:
@@ -306,24 +296,60 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
             # run on clang with no real loss.
             command = litConfig.valgrindArgs + command
 
-    return executeCommand(command, cwd=cwd, env=test.config.environment)
+    return lit.util.executeCommand(command, cwd=cwd,
+                                   env=test.config.environment)
 
-def isExpectedFail(test, xfails):
-    # Check if any of the xfails match an available feature or the target.
-    for item in xfails:
-        # If this is the wildcard, it always fails.
-        if item == '*':
-            return True
-
-        # If this is an exact match for one of the features, it fails.
-        if item in test.config.available_features:
-            return True
+def parseIntegratedTestScriptCommands(source_path):
+    """
+    parseIntegratedTestScriptCommands(source_path) -> commands
 
-        # If this is a part of the target triple, it fails.
-        if item in test.suite.config.target_triple:
-            return True
+    Parse the commands in an integrated test script file into a list of
+    (line_number, command_type, line).
+    """
 
-    return False
+    # This code is carefully written to be dual compatible with Python 2.5+ and
+    # Python 3 without requiring input files to always have valid codings. The
+    # trick we use is to open the file in binary mode and use the regular
+    # expression library to find the commands, with it scanning strings in
+    # Python2 and bytes in Python3.
+    #
+    # Once we find a match, we do require each script line to be decodable to
+    # ascii, so we convert the outputs to ascii before returning. This way the
+    # remaining code can work with "strings" agnostic of the executing Python
+    # version.
+    
+    def to_bytes(str):
+        # Encode to Latin1 to get binary data.
+        return str.encode('ISO-8859-1')
+    keywords = ('RUN:', 'XFAIL:', 'REQUIRES:', 'END.')
+    keywords_re = re.compile(
+        to_bytes("(%s)(.*)\n" % ("|".join(k for k in keywords),)))
+
+    f = open(source_path, 'rb')
+    try:
+        # Read the entire file contents.
+        data = f.read()
+
+        # Iterate over the matches.
+        line_number = 1
+        last_match_position = 0
+        for match in keywords_re.finditer(data):
+            # Compute the updated line number by counting the intervening
+            # newlines.
+            match_position = match.start()
+            line_number += data.count(to_bytes('\n'), last_match_position,
+                                      match_position)
+            last_match_position = match_position
+
+            # Convert the keyword and line to ascii strings and yield the
+            # command. Note that we take care to return regular strings in
+            # Python 2, to avoid other code having to differentiate between the
+            # str and unicode types.
+            keyword,ln = match.groups()
+            yield (line_number, str(keyword[:-1].decode('ascii')),
+                   str(ln.decode('ascii')))
+    finally:
+        f.close()
 
 def parseIntegratedTestScript(test, normalize_slashes=False,
                               extra_substitutions=[]):
@@ -342,8 +368,6 @@ def parseIntegratedTestScript(test, normalize_slashes=False,
     execdir,execbase = os.path.split(execpath)
     tmpDir = os.path.join(execdir, 'Output')
     tmpBase = os.path.join(tmpDir, execbase)
-    if test.index is not None:
-        tmpBase += '_%d' % test.index
 
     # Normalize slashes, if requested.
     if normalize_slashes:
@@ -364,18 +388,21 @@ def parseIntegratedTestScript(test, normalize_slashes=False,
                           ('%T', tmpDir),
                           ('#_MARKER_#', '%')])
 
+    # "%/[STpst]" should be normalized.
+    substitutions.extend([
+            ('%/s', sourcepath.replace('\\', '/')),
+            ('%/S', sourcedir.replace('\\', '/')),
+            ('%/p', sourcedir.replace('\\', '/')),
+            ('%/t', tmpBase.replace('\\', '/') + '.tmp'),
+            ('%/T', tmpDir.replace('\\', '/')),
+            ])
+
     # Collect the test lines from the script.
     script = []
-    xfails = []
     requires = []
-    line_number = 0
-    for ln in open(sourcepath):
-        line_number += 1
-        if 'RUN:' in ln:
-            # Isolate the command to run.
-            index = ln.index('RUN:')
-            ln = ln[index+4:]
-
+    for line_number, command_type, ln in \
+            parseIntegratedTestScriptCommands(sourcepath):
+        if command_type == 'RUN':
             # Trim trailing whitespace.
             ln = ln.rstrip()
 
@@ -393,16 +420,17 @@ def parseIntegratedTestScript(test, normalize_slashes=False,
                 script[-1] = script[-1][:-1] + ln
             else:
                 script.append(ln)
-        elif 'XFAIL:' in ln:
-            items = ln[ln.index('XFAIL:') + 6:].split(',')
-            xfails.extend([s.strip() for s in items])
-        elif 'REQUIRES:' in ln:
-            items = ln[ln.index('REQUIRES:') + 9:].split(',')
-            requires.extend([s.strip() for s in items])
-        elif 'END.' in ln:
-            # Check for END. lines.
-            if ln[ln.index('END.'):].strip() == 'END.':
+        elif command_type == 'XFAIL':
+            test.xfails.extend([s.strip() for s in ln.split(',')])
+        elif command_type == 'REQUIRES':
+            requires.extend([s.strip() for s in ln.split(',')])
+        elif command_type == 'END':
+            # END commands are only honored if the rest of the line is empty.
+            if not ln.strip():
                 break
+        else:
+            raise ValueError("unknown script command type: %r" % (
+                    command_type,))
 
     # Apply substitutions to the script.  Allow full regular
     # expression syntax.  Replace each matching occurrence of regular
@@ -416,45 +444,27 @@ def parseIntegratedTestScript(test, normalize_slashes=False,
 
         # Strip the trailing newline and any extra whitespace.
         return ln.strip()
-    script = map(processLine, script)
+    script = [processLine(ln)
+              for ln in script]
 
     # Verify the script contains a run line.
     if not script:
-        return (Test.UNRESOLVED, "Test has no run line!")
+        return lit.Test.Result(Test.UNRESOLVED, "Test has no run line!")
 
     # Check for unterminated run lines.
     if script[-1][-1] == '\\':
-        return (Test.UNRESOLVED, "Test has unterminated run lines (with '\\')")
+        return lit.Test.Result(Test.UNRESOLVED,
+                               "Test has unterminated run lines (with '\\')")
 
     # Check that we have the required features:
     missing_required_features = [f for f in requires
                                  if f not in test.config.available_features]
     if missing_required_features:
         msg = ', '.join(missing_required_features)
-        return (Test.UNSUPPORTED,
-                "Test requires the following features: %s" % msg)
-
-    isXFail = isExpectedFail(test, xfails)
-    return script,isXFail,tmpBase,execdir
-
-def formatTestOutput(status, out, err, exitCode, script):
-    output = StringIO()
-    output.write(u"Script:\n")
-    output.write(u"--\n")
-    output.write(u'\n'.join(script))
-    output.write(u"\n--\n")
-    output.write(u"Exit Code: %r\n\n" % exitCode)
-    if out:
-        output.write(u"Command Output (stdout):\n")
-        output.write(u"--\n")
-        output.write(unicode(out))
-        output.write(u"--\n")
-    if err:
-        output.write(u"Command Output (stderr):\n")
-        output.write(u"--\n")
-        output.write(unicode(err))
-        output.write(u"--\n")
-    return (status, output.getvalue())
+        return lit.Test.Result(Test.UNSUPPORTED,
+                               "Test requires the following features: %s" % msg)
+
+    return script,tmpBase,execdir
 
 def executeShTest(test, litConfig, useExternalSh,
                   extra_substitutions=[]):
@@ -462,39 +472,37 @@ def executeShTest(test, litConfig, useExternalSh,
         return (Test.UNSUPPORTED, 'Test is unsupported')
 
     res = parseIntegratedTestScript(test, useExternalSh, extra_substitutions)
-    if len(res) == 2:
+    if isinstance(res, lit.Test.Result):
         return res
-
-    script, isXFail, tmpBase, execdir = res
-
     if litConfig.noExecute:
-        return (Test.PASS, '')
+        return lit.Test.Result(Test.PASS)
+
+    script, tmpBase, execdir = res
 
     # Create the output directory if it does not already exist.
-    Util.mkdir_p(os.path.dirname(tmpBase))
+    lit.util.mkdir_p(os.path.dirname(tmpBase))
 
     if useExternalSh:
         res = executeScript(test, litConfig, tmpBase, script, execdir)
     else:
         res = executeScriptInternal(test, litConfig, tmpBase, script, execdir)
-    if len(res) == 2:
+    if isinstance(res, lit.Test.Result):
         return res
 
     out,err,exitCode = res
-    if isXFail:
-        ok = exitCode != 0
-        if ok:
-            status = Test.XFAIL
-        else:
-            status = Test.XPASS
+    if exitCode == 0:
+        status = Test.PASS
     else:
-        ok = exitCode == 0
-        if ok:
-            status = Test.PASS
-        else:
-            status = Test.FAIL
+        status = Test.FAIL
 
-    if ok:
-        return (status,'')
+    # Form the output log.
+    output = """Script:\n--\n%s\n--\nExit Code: %d\n\n""" % (
+        '\n'.join(script), exitCode)
+
+    # Append the outputs, if present.
+    if out:
+        output += """Command Output (stdout):\n--\n%s\n--\n""" % (out,)
+    if err:
+        output += """Command Output (stderr):\n--\n%s\n--\n""" % (err,)
 
-    return formatTestOutput(status, out, err, exitCode, script)
+    return lit.Test.Result(status, output)
diff --git a/utils/lit/lit/TestingConfig.py b/utils/lit/lit/TestingConfig.py
index f4ff89f..4a34b77 100644
--- a/utils/lit/lit/TestingConfig.py
+++ b/utils/lit/lit/TestingConfig.py
@@ -9,83 +9,106 @@ class TestingConfig:
     """
 
     @staticmethod
-    def frompath(path, parent, litConfig, mustExist, config = None):
-        if config is None:
-            # Set the environment based on the command line arguments.
-            environment = {
-                'LIBRARY_PATH' : os.environ.get('LIBRARY_PATH',''),
-                'LD_LIBRARY_PATH' : os.environ.get('LD_LIBRARY_PATH',''),
-                'PATH' : os.pathsep.join(litConfig.path +
-                                         [os.environ.get('PATH','')]),
-                'SYSTEMROOT' : os.environ.get('SYSTEMROOT',''),
-                'TERM' : os.environ.get('TERM',''),
-                'LLVM_DISABLE_CRASH_REPORT' : '1',
-                }
-
-            if sys.platform == 'win32':
-                environment.update({
-                        'INCLUDE' : os.environ.get('INCLUDE',''),
-                        'PATHEXT' : os.environ.get('PATHEXT',''),
-                        'PYTHONUNBUFFERED' : '1',
-                        'TEMP' : os.environ.get('TEMP',''),
-                        'TMP' : os.environ.get('TMP',''),
-                        })
-
-            # Set the default available features based on the LitConfig.
-            available_features = []
-            if litConfig.useValgrind:
-                available_features.append('valgrind')
-                if litConfig.valgrindLeakCheck:
-                    available_features.append('vg_leak')
-
-            config = TestingConfig(parent,
-                                   name = '<unnamed>',
-                                   suffixes = set(),
-                                   test_format = None,
-                                   environment = environment,
-                                   substitutions = [],
-                                   unsupported = False,
-                                   on_clone = None,
-                                   test_exec_root = None,
-                                   test_source_root = None,
-                                   excludes = [],
-                                   available_features = available_features,
-                                   pipefail = True)
-
-        if os.path.exists(path):
-            # FIXME: Improve detection and error reporting of errors in the
-            # config file.
-            f = open(path)
-            cfg_globals = dict(globals())
-            cfg_globals['config'] = config
-            cfg_globals['lit'] = litConfig
-            cfg_globals['__file__'] = path
-            try:
-                data = f.read()
-                if PY2:
-                    exec("exec data in cfg_globals")
-                else:
-                    exec(data, cfg_globals)
-                if litConfig.debug:
-                    litConfig.note('... loaded config %r' % path)
-            except SystemExit:
-                e = sys.exc_info()[1]
-                # We allow normal system exit inside a config file to just
-                # return control without error.
-                if e.args:
-                    raise
-            f.close()
-        else:
-            if mustExist:
-                litConfig.fatal('unable to load config from %r ' % path)
-            elif litConfig.debug:
-                litConfig.note('... config not found  - %r' %path)
+    def fromdefaults(litConfig):
+        """
+        fromdefaults(litConfig) -> TestingConfig
+
+        Create a TestingConfig object with default values.
+        """
+        # Set the environment based on the command line arguments.
+        environment = {
+            'LIBRARY_PATH' : os.environ.get('LIBRARY_PATH',''),
+            'LD_LIBRARY_PATH' : os.environ.get('LD_LIBRARY_PATH',''),
+            'PATH' : os.pathsep.join(litConfig.path +
+                                     [os.environ.get('PATH','')]),
+            'SYSTEMROOT' : os.environ.get('SYSTEMROOT',''),
+            'TERM' : os.environ.get('TERM',''),
+            'LLVM_DISABLE_CRASH_REPORT' : '1',
+            }
+
+        if sys.platform == 'win32':
+            environment.update({
+                    'INCLUDE' : os.environ.get('INCLUDE',''),
+                    'PATHEXT' : os.environ.get('PATHEXT',''),
+                    'PYTHONUNBUFFERED' : '1',
+                    'TEMP' : os.environ.get('TEMP',''),
+                    'TMP' : os.environ.get('TMP',''),
+                    })
+
+        # The option to preserve TEMP, TMP, and TMPDIR.
+        # This is intended to check how many temporary files would be generated
+        # (and be not cleaned up) in automated builders.
+        if os.environ.has_key('LIT_PRESERVES_TMP'):
+            environment.update({
+                    'TEMP' : os.environ.get('TEMP',''),
+                    'TMP' : os.environ.get('TMP',''),
+                    'TMPDIR' : os.environ.get('TMPDIR',''),
+                    })
+
+        # Set the default available features based on the LitConfig.
+        available_features = []
+        if litConfig.useValgrind:
+            available_features.append('valgrind')
+            if litConfig.valgrindLeakCheck:
+                available_features.append('vg_leak')
+
+        return TestingConfig(None,
+                             name = '<unnamed>',
+                             suffixes = set(),
+                             test_format = None,
+                             environment = environment,
+                             substitutions = [],
+                             unsupported = False,
+                             test_exec_root = None,
+                             test_source_root = None,
+                             excludes = [],
+                             available_features = available_features,
+                             pipefail = True)
 
-        config.finish(litConfig)
-        return config
+    def load_from_path(self, path, litConfig):
+        """
+        load_from_path(path, litConfig)
+
+        Load the configuration module at the provided path into the given config
+        object.
+        """
+
+        # Load the config script data.
+        f = open(path)
+        try:
+            data = f.read()
+        except:
+            litConfig.fatal('unable to load config file: %r' % (path,))
+        f.close()
+
+        # Execute the config script to initialize the object.
+        cfg_globals = dict(globals())
+        cfg_globals['config'] = self
+        cfg_globals['lit_config'] = litConfig
+        cfg_globals['__file__'] = path
+        try:
+            if PY2:
+                exec("exec data in cfg_globals")
+            else:
+                exec(data, cfg_globals)
+            if litConfig.debug:
+                litConfig.note('... loaded config %r' % path)
+        except SystemExit:
+            e = sys.exc_info()[1]
+            # We allow normal system exit inside a config file to just
+            # return control without error.
+            if e.args:
+                raise
+        except:
+            import traceback
+            litConfig.fatal(
+                'unable to parse config file %r, traceback: %s' % (
+                    path, traceback.format_exc()))
+
+        self.finish(litConfig)
 
     def __init__(self, parent, name, suffixes, test_format,
-                 environment, substitutions, unsupported, on_clone,
+                 environment, substitutions, unsupported,
                  test_exec_root, test_source_root, excludes,
                  available_features, pipefail):
         self.parent = parent
@@ -95,27 +118,12 @@ class TestingConfig:
         self.environment = dict(environment)
         self.substitutions = list(substitutions)
         self.unsupported = unsupported
-        self.on_clone = on_clone
         self.test_exec_root = test_exec_root
         self.test_source_root = test_source_root
         self.excludes = set(excludes)
         self.available_features = set(available_features)
         self.pipefail = pipefail
 
-    def clone(self, path):
-        # FIXME: Chain implementations?
-        #
-        # FIXME: Allow extra parameters?
-        cfg = TestingConfig(self, self.name, self.suffixes, self.test_format,
-                            self.environment, self.substitutions,
-                            self.unsupported, self.on_clone,
-                            self.test_exec_root, self.test_source_root,
-                            self.excludes, self.available_features,
-                            self.pipefail)
-        if cfg.on_clone:
-            cfg.on_clone(self, cfg, path)
-        return cfg
-
     def finish(self, litConfig):
         """finish() - Finish this config object, after loading is complete."""
 
diff --git a/utils/lit/lit/Util.py b/utils/lit/lit/Util.py
deleted file mode 100644
index 6f09eed..0000000
--- a/utils/lit/lit/Util.py
+++ /dev/null
@@ -1,140 +0,0 @@
-import os, sys
-
-def detectCPUs():
-    """
-    Detects the number of CPUs on a system. Cribbed from pp.
-    """
-    # Linux, Unix and MacOS:
-    if hasattr(os, "sysconf"):
-        if os.sysconf_names.has_key("SC_NPROCESSORS_ONLN"):
-            # Linux & Unix:
-            ncpus = os.sysconf("SC_NPROCESSORS_ONLN")
-            if isinstance(ncpus, int) and ncpus > 0:
-                return ncpus
-        else: # OSX:
-            return int(capture(['sysctl', '-n', 'hw.ncpu']))
-    # Windows:
-    if os.environ.has_key("NUMBER_OF_PROCESSORS"):
-        ncpus = int(os.environ["NUMBER_OF_PROCESSORS"])
-        if ncpus > 0:
-            return ncpus
-    return 1 # Default
-
-def mkdir_p(path):
-    """mkdir_p(path) - Make the "path" directory, if it does not exist; this
-    will also make directories for any missing parent directories."""
-    import errno
-
-    if not path or os.path.exists(path):
-        return
-
-    parent = os.path.dirname(path) 
-    if parent != path:
-        mkdir_p(parent)
-
-    try:
-        os.mkdir(path)
-    except OSError:
-        e = sys.exc_info()[1]
-        # Ignore EEXIST, which may occur during a race condition.
-        if e.errno != errno.EEXIST:
-            raise
-
-def capture(args, env=None):
-    import subprocess
-    """capture(command) - Run the given command (or argv list) in a shell and
-    return the standard output."""
-    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-                         env=env)
-    out,_ = p.communicate()
-    return out
-
-def which(command, paths = None):
-    """which(command, [paths]) - Look up the given command in the paths string
-    (or the PATH environment variable, if unspecified)."""
-
-    if paths is None:
-        paths = os.environ.get('PATH','')
-
-    # Check for absolute match first.
-    if os.path.isfile(command):
-        return command
-
-    # Would be nice if Python had a lib function for this.
-    if not paths:
-        paths = os.defpath
-
-    # Get suffixes to search.
-    # On Cygwin, 'PATHEXT' may exist but it should not be used.
-    if os.pathsep == ';':
-        pathext = os.environ.get('PATHEXT', '').split(';')
-    else:
-        pathext = ['']
-
-    # Search the paths...
-    for path in paths.split(os.pathsep):
-        for ext in pathext:
-            p = os.path.join(path, command + ext)
-            if os.path.exists(p):
-                return p
-
-    return None
-
-def checkToolsPath(dir, tools):
-    for tool in tools:
-        if not os.path.exists(os.path.join(dir, tool)):
-            return False;
-    return True;
-
-def whichTools(tools, paths):
-    for path in paths.split(os.pathsep):
-        if checkToolsPath(path, tools):
-            return path
-    return None
-
-def printHistogram(items, title = 'Items'):
-    import itertools, math
-
-    items.sort(key = lambda item: item[1])
-
-    maxValue = max([v for _,v in items])
-
-    # Select first "nice" bar height that produces more than 10 bars.
-    power = int(math.ceil(math.log(maxValue, 10)))
-    for inc in itertools.cycle((5, 2, 2.5, 1)):
-        barH = inc * 10**power
-        N = int(math.ceil(maxValue / barH))
-        if N > 10:
-            break
-        elif inc == 1:
-            power -= 1
-
-    histo = [set() for i in range(N)]
-    for name,v in items:
-        bin = min(int(N * v/maxValue), N-1)
-        histo[bin].add(name)
-
-    barW = 40
-    hr = '-' * (barW + 34)
-    print('\nSlowest %s:' % title)
-    print(hr)
-    for name,value in items[-20:]:
-        print('%.2fs: %s' % (value, name))
-    print('\n%s Times:' % title)
-    print(hr)
-    pDigits = int(math.ceil(math.log(maxValue, 10)))
-    pfDigits = max(0, 3-pDigits)
-    if pfDigits:
-        pDigits += pfDigits + 1
-    cDigits = int(math.ceil(math.log(len(items), 10)))
-    print("[%s] :: [%s] :: [%s]" % ('Range'.center((pDigits+1)*2 + 3),
-                                    'Percentage'.center(barW),
-                                    'Count'.center(cDigits*2 + 1)))
-    print(hr)
-    for i,row in enumerate(histo):
-        pct = float(len(row)) / len(items)
-        w = int(barW * pct)
-        print("[%*.*fs,%*.*fs) :: [%s%s] :: [%*d/%*d]" % (
-            pDigits, pfDigits, i*barH, pDigits, pfDigits, (i+1)*barH,
-            '*'*w, ' '*(barW-w), cDigits, len(row), cDigits, len(items)))
-
diff --git a/utils/lit/lit/__init__.py b/utils/lit/lit/__init__.py
index b9f573d..3967fdd 100644
--- a/utils/lit/lit/__init__.py
+++ b/utils/lit/lit/__init__.py
@@ -6,6 +6,6 @@ from .main import main
 __author__ = 'Daniel Dunbar'
 __email__ = 'daniel@zuster.org'
 __versioninfo__ = (0, 3, 0)
-__version__ = '.'.join(map(str, __versioninfo__)) + 'dev'
+__version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev'
 
 __all__ = []
diff --git a/utils/lit/lit/discovery.py b/utils/lit/lit/discovery.py
index f76bd22..c3c0f28 100644
--- a/utils/lit/lit/discovery.py
+++ b/utils/lit/lit/discovery.py
@@ -2,9 +2,11 @@
 Test discovery functions.
 """
 
+import copy
 import os
 import sys
 
+import lit.run
 from lit.TestingConfig import TestingConfig
 from lit import LitConfig, Test
 
@@ -38,11 +40,12 @@ def getTestSuite(item, litConfig, cache):
             ts, relative = search(parent)
             return (ts, relative + (base,))
 
-        # We found a config file, load it.
+        # We found a test suite, create a new config for it and load it.
         if litConfig.debug:
             litConfig.note('loading suite config %r' % cfgpath)
 
-        cfg = TestingConfig.frompath(cfgpath, None, litConfig, mustExist = True)
+        cfg = TestingConfig.fromdefaults(litConfig)
+        cfg.load_from_path(cfgpath, litConfig)
         source_root = os.path.realpath(cfg.test_source_root or path)
         exec_root = os.path.realpath(cfg.test_exec_root or path)
         return Test.TestSuite(cfg.name, source_root, exec_root, cfg), ()
@@ -78,14 +81,21 @@ def getLocalConfig(ts, path_in_suite, litConfig, cache):
         else:
             parent = search(path_in_suite[:-1])
 
-        # Load the local configuration.
+        # Check if there is a local configuration file.
         source_path = ts.getSourcePath(path_in_suite)
         cfgpath = os.path.join(source_path, litConfig.local_config_name)
+
+        # If not, just reuse the parent config.
+        if not os.path.exists(cfgpath):
+            return parent
+
+        # Otherwise, copy the current config and load the local configuration
+        # file into it.
+        config = copy.copy(parent)
         if litConfig.debug:
             litConfig.note('loading local config %r' % cfgpath)
-        return TestingConfig.frompath(cfgpath, parent, litConfig,
-                                    mustExist = False,
-                                    config = parent.clone(cfgpath))
+        config.load_from_path(cfgpath, litConfig)
+        return config
 
     def search(path_in_suite):
         key = (ts, path_in_suite)
@@ -237,7 +247,9 @@ def load_test_suite(inputs):
                                     isWindows = (platform.system()=='Windows'),
                                     params = {})
 
-    tests = find_tests_for_inputs(litConfig, inputs)
+    # Perform test discovery.
+    run = lit.run.Run(litConfig, find_tests_for_inputs(litConfig, inputs))
 
     # Return a unittest test suite which just runs the tests in order.
-    return unittest.TestSuite([LitTestCase(test, litConfig) for test in tests])
+    return unittest.TestSuite([LitTestCase(test, run)
+                               for test in run.tests])
diff --git a/utils/lit/lit/formats/__init__.py b/utils/lit/lit/formats/__init__.py
new file mode 100644
index 0000000..6862708
--- /dev/null
+++ b/utils/lit/lit/formats/__init__.py
@@ -0,0 +1,4 @@
+from __future__ import absolute_import
+from lit.formats.base import TestFormat, FileBasedTest, OneCommandPerFileTest
+from lit.formats.googletest import GoogleTest
+from lit.formats.shtest import ShTest
diff --git a/utils/lit/lit/formats/base.py b/utils/lit/lit/formats/base.py
new file mode 100644
index 0000000..9e5420b
--- /dev/null
+++ b/utils/lit/lit/formats/base.py
@@ -0,0 +1,118 @@
+from __future__ import absolute_import
+import os
+import sys
+
+import lit.Test
+import lit.util
+
+class TestFormat(object):
+    pass
+
+###
+
+class FileBasedTest(TestFormat):
+    def getTestsInDirectory(self, testSuite, path_in_suite,
+                            litConfig, localConfig):
+        source_path = testSuite.getSourcePath(path_in_suite)
+        for filename in os.listdir(source_path):
+            # Ignore dot files and excluded tests.
+            if (filename.startswith('.') or
+                filename in localConfig.excludes):
+                continue
+
+            filepath = os.path.join(source_path, filename)
+            if not os.path.isdir(filepath):
+                base,ext = os.path.splitext(filename)
+                if ext in localConfig.suffixes:
+                    yield lit.Test.Test(testSuite, path_in_suite + (filename,),
+                                        localConfig)
+
+###
+
+import re
+import tempfile
+
+class OneCommandPerFileTest(TestFormat):
+    # FIXME: Refactor into generic test for running some command on a directory
+    # of inputs.
+
+    def __init__(self, command, dir, recursive=False,
+                 pattern=".*", useTempInput=False):
+        if isinstance(command, str):
+            self.command = [command]
+        else:
+            self.command = list(command)
+        if dir is not None:
+            dir = str(dir)
+        self.dir = dir
+        self.recursive = bool(recursive)
+        self.pattern = re.compile(pattern)
+        self.useTempInput = useTempInput
+
+    def getTestsInDirectory(self, testSuite, path_in_suite,
+                            litConfig, localConfig):
+        dir = self.dir
+        if dir is None:
+            dir = testSuite.getSourcePath(path_in_suite)
+
+        for dirname,subdirs,filenames in os.walk(dir):
+            if not self.recursive:
+                subdirs[:] = []
+
+            subdirs[:] = [d for d in subdirs
+                          if (d != '.svn' and
+                              d not in localConfig.excludes)]
+
+            for filename in filenames:
+                if (filename.startswith('.') or
+                    not self.pattern.match(filename) or
+                    filename in localConfig.excludes):
+                    continue
+
+                path = os.path.join(dirname,filename)
+                suffix = path[len(dir):]
+                if suffix.startswith(os.sep):
+                    suffix = suffix[1:]
+                test = lit.Test.Test(
+                    testSuite, path_in_suite + tuple(suffix.split(os.sep)),
+                    localConfig)
+                # FIXME: Hack?
+                test.source_path = path
+                yield test
+
+    def createTempInput(self, tmp, test):
+        abstract
+
+    def execute(self, test, litConfig):
+        if test.config.unsupported:
+            return (lit.Test.UNSUPPORTED, 'Test is unsupported')
+
+        cmd = list(self.command)
+
+        # If using temp input, create a temporary file and hand it to the
+        # subclass.
+        if self.useTempInput:
+            tmp = tempfile.NamedTemporaryFile(suffix='.cpp')
+            self.createTempInput(tmp, test)
+            tmp.flush()
+            cmd.append(tmp.name)
+        elif hasattr(test, 'source_path'):
+            cmd.append(test.source_path)
+        else:
+            cmd.append(test.getSourcePath())
+
+        out, err, exitCode = lit.util.executeCommand(cmd)
+
+        diags = out + err
+        if not exitCode and not diags.strip():
+            return lit.Test.PASS,''
+
+        # Try to include some useful information.
+        report = """Command: %s\n""" % ' '.join(["'%s'" % a
+                                                 for a in cmd])
+        if self.useTempInput:
+            report += """Temporary File: %s\n""" % tmp.name
+            report += "--\n%s--\n""" % open(tmp.name).read()
+        report += """Output:\n--\n%s--""" % diags
+
+        return lit.Test.FAIL, report
diff --git a/utils/lit/lit/formats/googletest.py b/utils/lit/lit/formats/googletest.py
new file mode 100644
index 0000000..b77e184
--- /dev/null
+++ b/utils/lit/lit/formats/googletest.py
@@ -0,0 +1,114 @@
+from __future__ import absolute_import
+import os
+import sys
+
+import lit.Test
+import lit.TestRunner
+import lit.util
+from .base import TestFormat
+
+kIsWindows = sys.platform in ['win32', 'cygwin']
+
+class GoogleTest(TestFormat):
+    def __init__(self, test_sub_dir, test_suffix):
+        self.test_sub_dir = os.path.normcase(str(test_sub_dir)).split(';')
+        self.test_suffix = str(test_suffix)
+
+        # On Windows, assume tests will also end in '.exe'.
+        if kIsWindows:
+            self.test_suffix += '.exe'
+
+    def getGTestTests(self, path, litConfig, localConfig):
+        """getGTestTests(path) - [name]
+
+        Return the tests available in gtest executable.
+
+        Args:
+          path: String path to a gtest executable
+          litConfig: LitConfig instance
+          localConfig: TestingConfig instance"""
+
+        try:
+            lines = lit.util.capture([path, '--gtest_list_tests'],
+                                     env=localConfig.environment)
+            lines = lines.decode('ascii')
+            if kIsWindows:
+              lines = lines.replace('\r', '')
+            lines = lines.split('\n')
+        except:
+            litConfig.error("unable to discover google-tests in %r" % path)
+            raise StopIteration
+
+        nested_tests = []
+        for ln in lines:
+            if not ln.strip():
+                continue
+
+            prefix = ''
+            index = 0
+            while ln[index*2:index*2+2] == '  ':
+                index += 1
+            while len(nested_tests) > index:
+                nested_tests.pop()
+
+            ln = ln[index*2:]
+            if ln.endswith('.'):
+                nested_tests.append(ln)
+            else:
+                yield ''.join(nested_tests) + ln
+
+    # Note: path_in_suite should not include the executable name.
+    def getTestsInExecutable(self, testSuite, path_in_suite, execpath,
+                             litConfig, localConfig):
+        if not execpath.endswith(self.test_suffix):
+            return
+        (dirname, basename) = os.path.split(execpath)
+        # Discover the tests in this executable.
+        for testname in self.getGTestTests(execpath, litConfig, localConfig):
+            testPath = path_in_suite + (basename, testname)
+            yield lit.Test.Test(testSuite, testPath, localConfig)
+
+    def getTestsInDirectory(self, testSuite, path_in_suite,
+                            litConfig, localConfig):
+        source_path = testSuite.getSourcePath(path_in_suite)
+        for filename in os.listdir(source_path):
+            filepath = os.path.join(source_path, filename)
+            if os.path.isdir(filepath):
+                # Iterate over executables in a directory.
+                if not os.path.normcase(filename) in self.test_sub_dir:
+                    continue
+                dirpath_in_suite = path_in_suite + (filename, )
+                for subfilename in os.listdir(filepath):
+                    execpath = os.path.join(filepath, subfilename)
+                    for test in self.getTestsInExecutable(
+                            testSuite, dirpath_in_suite, execpath,
+                            litConfig, localConfig):
+                      yield test
+            elif ('.' in self.test_sub_dir):
+                for test in self.getTestsInExecutable(
+                        testSuite, path_in_suite, filepath,
+                        litConfig, localConfig):
+                    yield test
+
+    def execute(self, test, litConfig):
+        testPath,testName = os.path.split(test.getSourcePath())
+        while not os.path.exists(testPath):
+            # Handle GTest parametrized and typed tests, whose name includes
+            # some '/'s.
+            testPath, namePrefix = os.path.split(testPath)
+            testName = os.path.join(namePrefix, testName)
+
+        cmd = [testPath, '--gtest_filter=' + testName]
+        if litConfig.useValgrind:
+            cmd = litConfig.valgrindArgs + cmd
+
+        if litConfig.noExecute:
+            return lit.Test.PASS, ''
+
+        out, err, exitCode = lit.util.executeCommand(
+            cmd, env=test.config.environment)
+
+        if not exitCode:
+            return lit.Test.PASS,''
+
+        return lit.Test.FAIL, out + err
diff --git a/utils/lit/lit/formats/shtest.py b/utils/lit/lit/formats/shtest.py
new file mode 100644
index 0000000..30a6a33
--- /dev/null
+++ b/utils/lit/lit/formats/shtest.py
@@ -0,0 +1,12 @@
+from __future__ import absolute_import
+
+import lit.TestRunner
+from .base import FileBasedTest
+
+class ShTest(FileBasedTest):
+    def __init__(self, execute_external = False):
+        self.execute_external = execute_external
+
+    def execute(self, test, litConfig):
+        return lit.TestRunner.executeShTest(test, litConfig,
+                                            self.execute_external)
diff --git a/utils/lit/lit/main.py b/utils/lit/lit/main.py
index acf6101..6f672a0 100755
--- a/utils/lit/lit/main.py
+++ b/utils/lit/lit/main.py
@@ -7,37 +7,23 @@ See lit.pod for more information.
 """
 
 from __future__ import absolute_import
-import math, os, platform, random, re, sys, time, threading, traceback
+import math, os, platform, random, re, sys, time
 
 import lit.ProgressBar
 import lit.LitConfig
 import lit.Test
-import lit.Util
-
+import lit.run
+import lit.util
 import lit.discovery
 
-class TestingProgressDisplay:
+class TestingProgressDisplay(object):
     def __init__(self, opts, numTests, progressBar=None):
         self.opts = opts
         self.numTests = numTests
         self.current = None
-        self.lock = threading.Lock()
         self.progressBar = progressBar
         self.completed = 0
 
-    def update(self, test):
-        # Avoid locking overhead in quiet mode
-        if self.opts.quiet and not test.result.isFailure:
-            self.completed += 1
-            return
-
-        # Output lock.
-        self.lock.acquire()
-        try:
-            self.handleUpdate(test)
-        finally:
-            self.lock.release()
-
     def finish(self):
         if self.progressBar:
             self.progressBar.clear()
@@ -46,123 +32,86 @@ class TestingProgressDisplay:
         elif self.opts.succinct:
             sys.stdout.write('\n')
 
-    def handleUpdate(self, test):
+    def update(self, test):
         self.completed += 1
         if self.progressBar:
             self.progressBar.update(float(self.completed)/self.numTests,
                                     test.getFullName())
 
-        if self.opts.succinct and not test.result.isFailure:
+        if not test.result.code.isFailure and \
+                (self.opts.quiet or self.opts.succinct):
             return
 
         if self.progressBar:
             self.progressBar.clear()
 
-        print('%s: %s (%d of %d)' % (test.result.name, test.getFullName(),
+        # Show the test result line.
+        test_name = test.getFullName()
+        print('%s: %s (%d of %d)' % (test.result.code.name, test_name,
                                      self.completed, self.numTests))
 
-        if test.result.isFailure and self.opts.showOutput:
+        # Show the test failure output, if requested.
+        if test.result.code.isFailure and self.opts.showOutput:
             print("%s TEST '%s' FAILED %s" % ('*'*20, test.getFullName(),
                                               '*'*20))
-            print(test.output)
+            print(test.result.output)
             print("*" * 20)
 
+        # Report test metrics, if present.
+        if test.result.metrics:
+            print("%s TEST '%s' RESULTS %s" % ('*'*10, test.getFullName(),
+                                               '*'*10))
+            items = sorted(test.result.metrics.items())
+            for metric_name, value in items:
+                print('%s: %s ' % (metric_name, value.format()))
+            print("*" * 10)
+
+        # Ensure the output is flushed.
         sys.stdout.flush()
 
-class TestProvider:
-    def __init__(self, tests, maxTime):
-        self.maxTime = maxTime
-        self.iter = iter(tests)
-        self.lock = threading.Lock()
-        self.startTime = time.time()
-        self.canceled = False
-
-    def cancel(self):
-        self.lock.acquire()
-        self.canceled = True
-        self.lock.release()
-
-    def get(self):
-        # Check if we have run out of time.
-        if self.maxTime is not None:
-            if time.time() - self.startTime > self.maxTime:
-                return None
-
-        # Otherwise take the next test.
-        self.lock.acquire()
-        if self.canceled:
-          self.lock.release()
-          return None
-        for item in self.iter:
-            break
-        else:
-            item = None
-        self.lock.release()
-        return item
-
-class Tester(threading.Thread):
-    def __init__(self, litConfig, provider, display):
-        threading.Thread.__init__(self)
-        self.litConfig = litConfig
-        self.provider = provider
-        self.display = display
-
-    def run(self):
-        while 1:
-            item = self.provider.get()
-            if item is None:
-                break
-            self.runTest(item)
-
-    def runTest(self, test):
-        result = None
-        startTime = time.time()
-        try:
-            result, output = test.config.test_format.execute(test,
-                                                             self.litConfig)
-        except KeyboardInterrupt:
-            # This is a sad hack. Unfortunately subprocess goes
-            # bonkers with ctrl-c and we start forking merrily.
-            print('\nCtrl-C detected, goodbye.')
-            os.kill(0,9)
-        except:
-            if self.litConfig.debug:
-                raise
-            result = lit.Test.UNRESOLVED
-            output = 'Exception during script execution:\n'
-            output += traceback.format_exc()
-            output += '\n'
-        elapsed = time.time() - startTime
-
-        test.setResult(result, output, elapsed)
-        self.display.update(test)
-
-def runTests(numThreads, litConfig, provider, display):
-    # If only using one testing thread, don't use threads at all; this lets us
-    # profile, among other things.
-    if numThreads == 1:
-        t = Tester(litConfig, provider, display)
-        t.run()
-        return
-
-    # Otherwise spin up the testing threads and wait for them to finish.
-    testers = [Tester(litConfig, provider, display)
-               for i in range(numThreads)]
-    for t in testers:
-        t.start()
+def write_test_results(run, lit_config, testing_time, output_path):
     try:
-        for t in testers:
-            t.join()
-    except KeyboardInterrupt:
-        sys.exit(2)
+        import json
+    except ImportError:
+        lit_config.fatal('test output unsupported with Python 2.5')
+
+    # Construct the data we will write.
+    data = {}
+    # Encode the current lit version as a schema version.
+    data['__version__'] = lit.__versioninfo__
+    data['elapsed'] = testing_time
+    # FIXME: Record some information on the lit configuration used?
+    # FIXME: Record information from the individual test suites?
+
+    # Encode the tests.
+    data['tests'] = tests_data = []
+    for test in run.tests:
+        test_data = {
+            'name' : test.getFullName(),
+            'code' : test.result.code.name,
+            'output' : test.result.output,
+            'elapsed' : test.result.elapsed }
+
+        # Add test metrics, if present.
+        if test.result.metrics:
+            test_data['metrics'] = metrics_data = {}
+            for key, value in test.result.metrics.items():
+                metrics_data[key] = value.todata()
+
+        tests_data.append(test_data)
+
+    # Write the output.
+    f = open(output_path, 'w')
+    try:
+        json.dump(data, f, indent=2, sort_keys=True)
+        f.write('\n')
+    finally:
+        f.close()
 
 def main(builtinParameters = {}):
-    # Bump the GIL check interval, its more important to get any one thread to a
-    # blocking operation (hopefully exec) than to try and unblock other threads.
-    #
-    # FIXME: This is a hack.
-    import sys
-    sys.setcheckinterval(1000)
+    # Use processes by default on Unix platforms.
+    isWindows = platform.system() == 'Windows'
+    useProcessesIsDefault = not isWindows
 
     global options
     from optparse import OptionParser, OptionGroup
@@ -191,6 +140,9 @@ def main(builtinParameters = {}):
     group.add_option("-v", "--verbose", dest="showOutput",
                      help="Show all test output",
                      action="store_true", default=False)
+    group.add_option("-o", "--output", dest="output_path",
+                     help="Write test results to the provided path",
+                     action="store", type=str, metavar="PATH")
     group.add_option("", "--no-progress-bar", dest="useProgressBar",
                      help="Do not use curses based progress bar",
                      action="store_false", default=True)
@@ -243,9 +195,12 @@ def main(builtinParameters = {}):
     group.add_option("", "--show-tests", dest="showTests",
                       help="Show all discovered tests",
                       action="store_true", default=False)
-    group.add_option("", "--repeat", dest="repeatTests", metavar="N",
-                      help="Repeat tests N times (for timing)",
-                      action="store", default=None, type=int)
+    group.add_option("", "--use-processes", dest="useProcesses",
+                      help="Run tests in parallel with processes (not threads)",
+                      action="store_true", default=useProcessesIsDefault)
+    group.add_option("", "--use-threads", dest="useProcesses",
+                      help="Run tests in parallel with threads (not processes)",
+                      action="store_false", default=useProcessesIsDefault)
     parser.add_option_group(group)
 
     (opts, args) = parser.parse_args()
@@ -259,7 +214,7 @@ def main(builtinParameters = {}):
 # I haven't seen this bug occur with 2.5.2 and later, so only enable multiple
 # threads by default there.
        if sys.hexversion >= 0x2050200:
-               opts.numThreads = lit.Util.detectCPUs()
+               opts.numThreads = lit.util.detectCPUs()
        else:
                opts.numThreads = 1
 
@@ -284,20 +239,22 @@ def main(builtinParameters = {}):
         valgrindArgs = opts.valgrindArgs,
         noExecute = opts.noExecute,
         debug = opts.debug,
-        isWindows = (platform.system()=='Windows'),
+        isWindows = isWindows,
         params = userParams,
         config_prefix = opts.configPrefix)
 
-    tests = lit.discovery.find_tests_for_inputs(litConfig, inputs)
+    # Perform test discovery.
+    run = lit.run.Run(litConfig,
+                      lit.discovery.find_tests_for_inputs(litConfig, inputs))
 
     if opts.showSuites or opts.showTests:
         # Aggregate the tests by suite.
         suitesAndTests = {}
-        for t in tests:
+        for t in run.tests:
             if t.suite not in suitesAndTests:
                 suitesAndTests[t.suite] = []
             suitesAndTests[t.suite].append(t)
-        suitesAndTests = suitesAndTests.items()
+        suitesAndTests = list(suitesAndTests.items())
         suitesAndTests.sort(key = lambda item: item[0].name)
 
         # Show the suites, if requested.
@@ -315,9 +272,12 @@ def main(builtinParameters = {}):
                 ts_tests.sort(key = lambda test: test.path_in_suite)
                 for test in ts_tests:
                     print('  %s' % (test.getFullName(),))
-        
+
+        # Exit.
+        sys.exit(0)
+
     # Select and order the tests.
-    numTotalTests = len(tests)
+    numTotalTests = len(run.tests)
 
     # First, select based on the filter expression if given.
     if opts.filter:
@@ -326,33 +286,28 @@ def main(builtinParameters = {}):
         except:
             parser.error("invalid regular expression for --filter: %r" % (
                     opts.filter))
-        tests = [t for t in tests
-                 if rex.search(t.getFullName())]
+        run.tests = [t for t in run.tests
+                     if rex.search(t.getFullName())]
 
     # Then select the order.
     if opts.shuffle:
-        random.shuffle(tests)
+        random.shuffle(run.tests)
     else:
-        tests.sort(key = lambda t: t.getFullName())
+        run.tests.sort(key = lambda t: t.getFullName())
 
     # Finally limit the number of tests, if desired.
     if opts.maxTests is not None:
-        tests = tests[:opts.maxTests]
+        run.tests = run.tests[:opts.maxTests]
 
     # Don't create more threads than tests.
-    opts.numThreads = min(len(tests), opts.numThreads)
+    opts.numThreads = min(len(run.tests), opts.numThreads)
 
     extra = ''
-    if len(tests) != numTotalTests:
+    if len(run.tests) != numTotalTests:
         extra = ' of %d' % numTotalTests
-    header = '-- Testing: %d%s tests, %d threads --'%(len(tests),extra,
+    header = '-- Testing: %d%s tests, %d threads --'%(len(run.tests), extra,
                                                       opts.numThreads)
 
-    if opts.repeatTests:
-        tests = [t.copyWithIndex(i)
-                 for t in tests
-                 for i in range(opts.repeatTests)]
-
     progressBar = None
     if not opts.quiet:
         if opts.succinct and opts.useProgressBar:
@@ -366,63 +321,50 @@ def main(builtinParameters = {}):
             print(header)
 
     startTime = time.time()
-    display = TestingProgressDisplay(opts, len(tests), progressBar)
-    provider = TestProvider(tests, opts.maxTime)
-
+    display = TestingProgressDisplay(opts, len(run.tests), progressBar)
     try:
-      import win32api
-    except ImportError:
-      pass
-    else:
-      def console_ctrl_handler(type):
-        provider.cancel()
-        return True
-      win32api.SetConsoleCtrlHandler(console_ctrl_handler, True)
-
-    runTests(opts.numThreads, litConfig, provider, display)
+        run.execute_tests(display, opts.numThreads, opts.maxTime,
+                          opts.useProcesses)
+    except KeyboardInterrupt:
+        sys.exit(2)
     display.finish()
 
+    testing_time = time.time() - startTime
     if not opts.quiet:
-        print('Testing Time: %.2fs'%(time.time() - startTime))
+        print('Testing Time: %.2fs' % (testing_time,))
 
-    # Update results for any tests which weren't run.
-    for t in tests:
-        if t.result is None:
-            t.setResult(lit.Test.UNRESOLVED, '', 0.0)
+    # Write out the test data, if requested.
+    if opts.output_path is not None:
+        write_test_results(run, litConfig, testing_time, opts.output_path)
 
     # List test results organized by kind.
     hasFailures = False
     byCode = {}
-    for t in tests:
-        if t.result not in byCode:
-            byCode[t.result] = []
-        byCode[t.result].append(t)
-        if t.result.isFailure:
+    for test in run.tests:
+        if test.result.code not in byCode:
+            byCode[test.result.code] = []
+        byCode[test.result.code].append(test)
+        if test.result.code.isFailure:
             hasFailures = True
 
-    # FIXME: Show unresolved and (optionally) unsupported tests.
+    # Print each test in any of the failing groups.
     for title,code in (('Unexpected Passing Tests', lit.Test.XPASS),
-                       ('Failing Tests', lit.Test.FAIL)):
+                       ('Failing Tests', lit.Test.FAIL),
+                       ('Unresolved Tests', lit.Test.UNRESOLVED)):
         elts = byCode.get(code)
         if not elts:
             continue
         print('*'*20)
         print('%s (%d):' % (title, len(elts)))
-        for t in elts:
-            print('    %s' % t.getFullName())
+        for test in elts:
+            print('    %s' % test.getFullName())
         sys.stdout.write('\n')
 
-    if opts.timeTests:
-        # Collate, in case we repeated tests.
-        times = {}
-        for t in tests:
-            key = t.getFullName()
-            times[key] = times.get(key, 0.) + t.elapsed
-
-        byTime = list(times.items())
-        byTime.sort(key = lambda item: item[1])
-        if byTime:
-            lit.Util.printHistogram(byTime, title='Tests')
+    if opts.timeTests and run.tests:
+        # Order by time.
+        test_times = [(test.getFullName(), test.result.elapsed)
+                      for test in run.tests]
+        lit.util.printHistogram(test_times, title='Tests')
 
     for name,code in (('Expected Passes    ', lit.Test.PASS),
                       ('Expected Failures  ', lit.Test.XFAIL),
diff --git a/utils/lit/lit/run.py b/utils/lit/lit/run.py
new file mode 100644
index 0000000..27c414d
--- /dev/null
+++ b/utils/lit/lit/run.py
@@ -0,0 +1,277 @@
+import os
+import threading
+import time
+import traceback
+try:
+    import Queue as queue
+except ImportError:
+    import queue
+
+try:
+    import win32api
+except ImportError:
+    win32api = None
+
+try:
+    import multiprocessing
+except ImportError:
+    multiprocessing = None
+
+import lit.Test
+
+###
+# Test Execution Implementation
+
+class LockedValue(object):
+    def __init__(self, value):
+        self.lock = threading.Lock()
+        self._value = value
+
+    def _get_value(self):
+        self.lock.acquire()
+        try:
+            return self._value
+        finally:
+            self.lock.release()
+
+    def _set_value(self, value):
+        self.lock.acquire()
+        try:
+            self._value = value
+        finally:
+            self.lock.release()
+
+    value = property(_get_value, _set_value)
+
+class TestProvider(object):
+    def __init__(self, tests, num_jobs, queue_impl, canceled_flag):
+        self.canceled_flag = canceled_flag
+
+        # Create a shared queue to provide the test indices.
+        self.queue = queue_impl()
+        for i in range(len(tests)):
+            self.queue.put(i)
+        for i in range(num_jobs):
+            self.queue.put(None)
+
+    def cancel(self):
+        self.canceled_flag.value = 1
+
+    def get(self):
+        # Check if we are canceled.
+        if self.canceled_flag.value:
+          return None
+
+        # Otherwise take the next test.
+        return self.queue.get()
+
+class Tester(object):
+    def __init__(self, run_instance, provider, consumer):
+        self.run_instance = run_instance
+        self.provider = provider
+        self.consumer = consumer
+
+    def run(self):
+        while True:
+            item = self.provider.get()
+            if item is None:
+                break
+            self.run_test(item)
+        self.consumer.task_finished()
+
+    def run_test(self, test_index):
+        test = self.run_instance.tests[test_index]
+        try:
+            self.run_instance.execute_test(test)
+        except KeyboardInterrupt:
+            # This is a sad hack. Unfortunately subprocess goes
+            # bonkers with ctrl-c and we start forking merrily.
+            print('\nCtrl-C detected, goodbye.')
+            os.kill(0,9)
+        self.consumer.update(test_index, test)
+
+class ThreadResultsConsumer(object):
+    def __init__(self, display):
+        self.display = display
+        self.lock = threading.Lock()
+
+    def update(self, test_index, test):
+        self.lock.acquire()
+        try:
+            self.display.update(test)
+        finally:
+            self.lock.release()
+
+    def task_finished(self):
+        pass
+
+    def handle_results(self):
+        pass
+
+class MultiprocessResultsConsumer(object):
+    def __init__(self, run, display, num_jobs):
+        self.run = run
+        self.display = display
+        self.num_jobs = num_jobs
+        self.queue = multiprocessing.Queue()
+
+    def update(self, test_index, test):
+        # This method is called in the child processes, and communicates the
+        # results to the actual display implementation via an output queue.
+        self.queue.put((test_index, test.result))
+
+    def task_finished(self):
+        # This method is called in the child processes, and communicates that
+        # individual tasks are complete.
+        self.queue.put(None)
+
+    def handle_results(self):
+        # This method is called in the parent, and consumes the results from the
+        # output queue and dispatches to the actual display. The method will
+        # complete after each of num_jobs tasks has signalled completion.
+        completed = 0
+        while completed != self.num_jobs:
+            # Wait for a result item.
+            item = self.queue.get()
+            if item is None:
+                completed += 1
+                continue
+
+            # Update the test result in the parent process.
+            index,result = item
+            test = self.run.tests[index]
+            test.result = result
+
+            self.display.update(test)
+
+def run_one_tester(run, provider, display):
+    tester = Tester(run, provider, display)
+    tester.run()
+
+###
+
+class Run(object):
+    """
+    This class represents a concrete, configured testing run.
+    """
+
+    def __init__(self, lit_config, tests):
+        self.lit_config = lit_config
+        self.tests = tests
+
+    def execute_test(self, test):
+        result = None
+        start_time = time.time()
+        try:
+            result = test.config.test_format.execute(test, self.lit_config)
+
+            # Support deprecated result from execute() which returned the result
+            # code and additional output as a tuple.
+            if isinstance(result, tuple):
+                code, output = result
+                result = lit.Test.Result(code, output)
+            elif not isinstance(result, lit.Test.Result):
+                raise ValueError("unexpected result from test execution")
+        except KeyboardInterrupt:
+            raise
+        except:
+            if self.lit_config.debug:
+                raise
+            output = 'Exception during script execution:\n'
+            output += traceback.format_exc()
+            output += '\n'
+            result = lit.Test.Result(lit.Test.UNRESOLVED, output)
+        result.elapsed = time.time() - start_time
+
+        test.setResult(result)
+
+    def execute_tests(self, display, jobs, max_time=None,
+                      use_processes=False):
+        """
+        execute_tests(display, jobs, [max_time])
+
+        Execute each of the tests in the run, using up to jobs number of
+        parallel tasks, and inform the display of each individual result. The
+        provided tests should be a subset of the tests available in this run
+        object.
+
+        If max_time is non-None, it should be a time in seconds after which to
+        stop executing tests.
+
+        The display object will have its update method called with each test as
+        it is completed. The calls are guaranteed to be locked with respect to
+        one another, but are *not* guaranteed to be called on the same thread as
+        this method was invoked on.
+
+        Upon completion, each test in the run will have its result
+        computed. Tests which were not actually executed (for any reason) will
+        be given an UNRESOLVED result.
+        """
+
+        # Choose the appropriate parallel execution implementation.
+        consumer = None
+        if jobs != 1 and use_processes and multiprocessing:
+            try:
+                task_impl = multiprocessing.Process
+                queue_impl = multiprocessing.Queue
+                canceled_flag =  multiprocessing.Value('i', 0)
+                consumer = MultiprocessResultsConsumer(self, display, jobs)
+            except:
+                # multiprocessing fails to initialize with certain OpenBSD and
+                # FreeBSD Python versions: http://bugs.python.org/issue3770
+                # Unfortunately the error raised also varies by platform.
+                self.lit_config.note('failed to initialize multiprocessing')
+                consumer = None
+        if not consumer:
+            task_impl = threading.Thread
+            queue_impl = queue.Queue
+            canceled_flag = LockedValue(0)
+            consumer = ThreadResultsConsumer(display)
+
+        # Create the test provider.
+        provider = TestProvider(self.tests, jobs, queue_impl, canceled_flag)
+
+        # Install a console-control signal handler on Windows.
+        if win32api is not None:
+            def console_ctrl_handler(type):
+                provider.cancel()
+                return True
+            win32api.SetConsoleCtrlHandler(console_ctrl_handler, True)
+
+        # Install a timeout handler, if requested.
+        if max_time is not None:
+            def timeout_handler():
+                provider.cancel()
+            timeout_timer = threading.Timer(max_time, timeout_handler)
+            timeout_timer.start()
+
+        # If not using multiple tasks, just run the tests directly.
+        if jobs == 1:
+            run_one_tester(self, provider, consumer)
+        else:
+            # Otherwise, execute the tests in parallel
+            self._execute_tests_in_parallel(task_impl, provider, consumer, jobs)
+
+        # Cancel the timeout handler.
+        if max_time is not None:
+            timeout_timer.cancel()
+
+        # Update results for any tests which weren't run.
+        for test in self.tests:
+            if test.result is None:
+                test.setResult(lit.Test.Result(lit.Test.UNRESOLVED, '', 0.0))
+
+    def _execute_tests_in_parallel(self, task_impl, provider, consumer, jobs):
+        # Start all of the tasks.
+        tasks = [task_impl(target=run_one_tester,
+                           args=(self, provider, consumer))
+                 for i in range(jobs)]
+        for t in tasks:
+            t.start()
+
+        # Allow the consumer to handle results, if necessary.
+        consumer.handle_results()
+
+        # Wait for all the tasks to complete.
+        for t in tasks:
+            t.join()
diff --git a/utils/lit/lit/util.py b/utils/lit/lit/util.py
new file mode 100644
index 0000000..2b1010c
--- /dev/null
+++ b/utils/lit/lit/util.py
@@ -0,0 +1,169 @@
+import errno
+import itertools
+import math
+import os
+import platform
+import signal
+import subprocess
+import sys
+
+def detectCPUs():
+    """
+    Detects the number of CPUs on a system. Cribbed from pp.
+    """
+    # Linux, Unix and MacOS:
+    if hasattr(os, "sysconf"):
+        if "SC_NPROCESSORS_ONLN" in os.sysconf_names:
+            # Linux & Unix:
+            ncpus = os.sysconf("SC_NPROCESSORS_ONLN")
+            if isinstance(ncpus, int) and ncpus > 0:
+                return ncpus
+        else: # OSX:
+            return int(capture(['sysctl', '-n', 'hw.ncpu']))
+    # Windows:
+    if "NUMBER_OF_PROCESSORS" in os.environ:
+        ncpus = int(os.environ["NUMBER_OF_PROCESSORS"])
+        if ncpus > 0:
+            return ncpus
+    return 1 # Default
+
+def mkdir_p(path):
+    """mkdir_p(path) - Make the "path" directory, if it does not exist; this
+    will also make directories for any missing parent directories."""
+    if not path or os.path.exists(path):
+        return
+
+    parent = os.path.dirname(path) 
+    if parent != path:
+        mkdir_p(parent)
+
+    try:
+        os.mkdir(path)
+    except OSError:
+        e = sys.exc_info()[1]
+        # Ignore EEXIST, which may occur during a race condition.
+        if e.errno != errno.EEXIST:
+            raise
+
+def capture(args, env=None):
+    """capture(command) - Run the given command (or argv list) in a shell and
+    return the standard output."""
+    p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                         env=env)
+    out,_ = p.communicate()
+    return out
+
+def which(command, paths = None):
+    """which(command, [paths]) - Look up the given command in the paths string
+    (or the PATH environment variable, if unspecified)."""
+
+    if paths is None:
+        paths = os.environ.get('PATH','')
+
+    # Check for absolute match first.
+    if os.path.isfile(command):
+        return command
+
+    # Would be nice if Python had a lib function for this.
+    if not paths:
+        paths = os.defpath
+
+    # Get suffixes to search.
+    # On Cygwin, 'PATHEXT' may exist but it should not be used.
+    if os.pathsep == ';':
+        pathext = os.environ.get('PATHEXT', '').split(';')
+    else:
+        pathext = ['']
+
+    # Search the paths...
+    for path in paths.split(os.pathsep):
+        for ext in pathext:
+            p = os.path.join(path, command + ext)
+            if os.path.exists(p):
+                return p
+
+    return None
+
+def checkToolsPath(dir, tools):
+    for tool in tools:
+        if not os.path.exists(os.path.join(dir, tool)):
+            return False;
+    return True;
+
+def whichTools(tools, paths):
+    for path in paths.split(os.pathsep):
+        if checkToolsPath(path, tools):
+            return path
+    return None
+
+def printHistogram(items, title = 'Items'):
+    items.sort(key = lambda item: item[1])
+
+    maxValue = max([v for _,v in items])
+
+    # Select first "nice" bar height that produces more than 10 bars.
+    power = int(math.ceil(math.log(maxValue, 10)))
+    for inc in itertools.cycle((5, 2, 2.5, 1)):
+        barH = inc * 10**power
+        N = int(math.ceil(maxValue / barH))
+        if N > 10:
+            break
+        elif inc == 1:
+            power -= 1
+
+    histo = [set() for i in range(N)]
+    for name,v in items:
+        bin = min(int(N * v/maxValue), N-1)
+        histo[bin].add(name)
+
+    barW = 40
+    hr = '-' * (barW + 34)
+    print('\nSlowest %s:' % title)
+    print(hr)
+    for name,value in items[-20:]:
+        print('%.2fs: %s' % (value, name))
+    print('\n%s Times:' % title)
+    print(hr)
+    pDigits = int(math.ceil(math.log(maxValue, 10)))
+    pfDigits = max(0, 3-pDigits)
+    if pfDigits:
+        pDigits += pfDigits + 1
+    cDigits = int(math.ceil(math.log(len(items), 10)))
+    print("[%s] :: [%s] :: [%s]" % ('Range'.center((pDigits+1)*2 + 3),
+                                    'Percentage'.center(barW),
+                                    'Count'.center(cDigits*2 + 1)))
+    print(hr)
+    for i,row in enumerate(histo):
+        pct = float(len(row)) / len(items)
+        w = int(barW * pct)
+        print("[%*.*fs,%*.*fs) :: [%s%s] :: [%*d/%*d]" % (
+            pDigits, pfDigits, i*barH, pDigits, pfDigits, (i+1)*barH,
+            '*'*w, ' '*(barW-w), cDigits, len(row), cDigits, len(items)))
+
+# Close extra file handles on UNIX (on Windows this cannot be done while
+# also redirecting input).
+kUseCloseFDs = not (platform.system() == 'Windows')
+def executeCommand(command, cwd=None, env=None):
+    p = subprocess.Popen(command, cwd=cwd,
+                         stdin=subprocess.PIPE,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE,
+                         env=env, close_fds=kUseCloseFDs)
+    out,err = p.communicate()
+    exitCode = p.wait()
+
+    # Detect Ctrl-C in subprocess.
+    if exitCode == -signal.SIGINT:
+        raise KeyboardInterrupt
+
+    # Ensure the resulting output is always of string type.
+    try:
+        out = str(out.decode('ascii'))
+    except:
+        out = str(out)
+    try:
+        err = str(err.decode('ascii'))
+    except:
+        err = str(err)
+
+    return out, err, exitCode
diff --git a/utils/lit/setup.py b/utils/lit/setup.py
index a94e6ea..10de6bb 100644
--- a/utils/lit/setup.py
+++ b/utils/lit/setup.py
@@ -1,7 +1,14 @@
 import lit
+import os
 
-# FIXME: Support distutils?
 from setuptools import setup, find_packages
+
+# setuptools expects to be invoked from within the directory of setup.py, but it
+# is nice to allow:
+#   python path/to/setup.py install
+# to work (for scripts, etc.)
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+
 setup(
     name = "lit",
     version = lit.__version__,
diff --git a/utils/lit/tests/Inputs/discovery/lit.cfg b/utils/lit/tests/Inputs/discovery/lit.cfg
index 4049ab1..c48ca0b 100644
--- a/utils/lit/tests/Inputs/discovery/lit.cfg
+++ b/utils/lit/tests/Inputs/discovery/lit.cfg
@@ -1,3 +1,4 @@
+import lit.formats
 config.name = 'top-level-suite'
 config.suffixes = ['.txt']
 config.test_format = lit.formats.ShTest()
@@ -8,3 +9,6 @@ config.test_format = lit.formats.ShTest()
 #
 #config.test_source_root = None
 #config.test_exec_root = None
+
+# Check that arbitrary config values are copied (tested by subdir/lit.local.cfg).
+config.an_extra_variable = False
diff --git a/utils/lit/tests/Inputs/discovery/subdir/lit.local.cfg b/utils/lit/tests/Inputs/discovery/subdir/lit.local.cfg
index 5ae6b3c..631cb60 100644
--- a/utils/lit/tests/Inputs/discovery/subdir/lit.local.cfg
+++ b/utils/lit/tests/Inputs/discovery/subdir/lit.local.cfg
@@ -1 +1,4 @@
 config.suffixes = ['.py']
+
+# Check that the arbitrary config values in our parent was inherited.
+assert hasattr(config, 'an_extra_variable')
diff --git a/utils/lit/tests/Inputs/discovery/subsuite/lit.cfg b/utils/lit/tests/Inputs/discovery/subsuite/lit.cfg
index 0c2979d..b49329a 100644
--- a/utils/lit/tests/Inputs/discovery/subsuite/lit.cfg
+++ b/utils/lit/tests/Inputs/discovery/subsuite/lit.cfg
@@ -1,3 +1,4 @@
+import lit.formats
 config.name = 'sub-suite'
 config.suffixes = ['.txt']
 config.test_format = lit.formats.ShTest()
diff --git a/utils/lit/tests/Inputs/exec-discovery-in-tree/lit.cfg b/utils/lit/tests/Inputs/exec-discovery-in-tree/lit.cfg
index 342b2fd..ae25b4f 100644
--- a/utils/lit/tests/Inputs/exec-discovery-in-tree/lit.cfg
+++ b/utils/lit/tests/Inputs/exec-discovery-in-tree/lit.cfg
@@ -1,6 +1,8 @@
+import lit.formats
+
 # Verify that the site configuration was loaded.
 if config.test_source_root is None or config.test_exec_root is None:
-    lit.fatal("No site specific configuration")
+    lit_config.fatal("No site specific configuration")
 
 config.name = 'exec-discovery-in-tree-suite'
 config.suffixes = ['.txt']
diff --git a/utils/lit/tests/Inputs/exec-discovery-in-tree/obj/lit.site.cfg b/utils/lit/tests/Inputs/exec-discovery-in-tree/obj/lit.site.cfg
index de9a3d0..4061c89 100644
--- a/utils/lit/tests/Inputs/exec-discovery-in-tree/obj/lit.site.cfg
+++ b/utils/lit/tests/Inputs/exec-discovery-in-tree/obj/lit.site.cfg
@@ -1,4 +1,4 @@
 import os
 config.test_exec_root = os.path.dirname(__file__)
 config.test_source_root = os.path.dirname(config.test_exec_root)
-lit.load_config(config, os.path.join(config.test_source_root, "lit.cfg"))
-\ No newline at end of file
+lit_config.load_config(config, os.path.join(config.test_source_root, "lit.cfg"))
+\ No newline at end of file
diff --git a/utils/lit/tests/Inputs/exec-discovery/lit.site.cfg b/utils/lit/tests/Inputs/exec-discovery/lit.site.cfg
index 796569a..ac273c7 100644
--- a/utils/lit/tests/Inputs/exec-discovery/lit.site.cfg
+++ b/utils/lit/tests/Inputs/exec-discovery/lit.site.cfg
@@ -2,4 +2,4 @@
 import os
 config.test_exec_root = os.path.dirname(__file__)
 config.test_source_root = os.path.join(os.path.dirname(config.test_exec_root), "discovery")
-lit.load_config(config, os.path.join(config.test_source_root, "lit.cfg"))
+lit_config.load_config(config, os.path.join(config.test_source_root, "lit.cfg"))
diff --git a/utils/lit/tests/Inputs/googletest-format/DummySubDir/OneTest b/utils/lit/tests/Inputs/googletest-format/DummySubDir/OneTest
new file mode 100755
index 0000000..9dff137
--- /dev/null
+++ b/utils/lit/tests/Inputs/googletest-format/DummySubDir/OneTest
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+
+import sys
+
+if len(sys.argv) != 2:
+    raise ValueError("unexpected number of args")
+
+if sys.argv[1] == "--gtest_list_tests":
+    print("""\
+FirstTest.
+  subTestA
+  subTestB
+ParameterizedTest/0.
+  subTest
+ParameterizedTest/1.
+  subTest""")
+    sys.exit(0)
+elif not sys.argv[1].startswith("--gtest_filter="):
+    raise ValueError("unexpected argument: %r" % (sys.argv[1]))
+
+test_name = sys.argv[1].split('=',1)[1]
+if test_name == 'FirstTest.subTestA':
+    print('I am subTest A, I PASS')
+    sys.exit(0)
+elif test_name == 'FirstTest.subTestB':
+    print('I am subTest B, I FAIL')
+    print('And I have two lines of output')
+    sys.exit(1)
+elif test_name in ('ParameterizedTest/0.subTest',
+                   'ParameterizedTest/1.subTest'):
+    print('I am a parameterized test, I also PASS')
+    sys.exit(0)
+else:
+    raise SystemExit("error: invalid test name: %r" % (test_name,))
diff --git a/utils/lit/tests/Inputs/googletest-format/lit.cfg b/utils/lit/tests/Inputs/googletest-format/lit.cfg
new file mode 100644
index 0000000..f2f6cda
--- /dev/null
+++ b/utils/lit/tests/Inputs/googletest-format/lit.cfg
@@ -0,0 +1,3 @@
+import lit.formats
+config.name = 'googletest-format'
+config.test_format = lit.formats.GoogleTest('DummySubDir', 'Test')
diff --git a/utils/lit/tests/Inputs/progress-bar/lit.cfg b/utils/lit/tests/Inputs/progress-bar/lit.cfg
index 4878b65..7f31129 100644
--- a/utils/lit/tests/Inputs/progress-bar/lit.cfg
+++ b/utils/lit/tests/Inputs/progress-bar/lit.cfg
@@ -1,3 +1,4 @@
+import lit.formats
 config.name = 'shtest-shell'
 config.suffixes = ['.txt']
 config.test_format = lit.formats.ShTest()
diff --git a/utils/lit/tests/Inputs/shtest-format/argv0.txt b/utils/lit/tests/Inputs/shtest-format/argv0.txt
new file mode 100644
index 0000000..2ff2890
--- /dev/null
+++ b/utils/lit/tests/Inputs/shtest-format/argv0.txt
@@ -0,0 +1,6 @@
+# Check that we route argv[0] as it was written, instead of the resolved
+# path. This is important for some tools, in particular '[' which at least on OS
+# X only recognizes that it is in '['-mode when its argv[0] is exactly
+# '['. Otherwise it will refuse to accept the trailing closing bracket.
+#
+# RUN: [ "A" = "A" ]
diff --git a/utils/lit/tests/Inputs/shtest-format/external_shell/fail.txt b/utils/lit/tests/Inputs/shtest-format/external_shell/fail.txt
index 1e74be5..069e376 100644
--- a/utils/lit/tests/Inputs/shtest-format/external_shell/fail.txt
+++ b/utils/lit/tests/Inputs/shtest-format/external_shell/fail.txt
@@ -1,3 +1,5 @@
 # Run a command that fails with error on stdout.
 #
+# RUN: echo "line 1: failed test output on stdout"
+# RUN: echo "line 2: failed test output on stdout"
 # RUN: cat "does-not-exist"
diff --git a/utils/lit/tests/Inputs/shtest-format/external_shell/fail_with_bad_encoding.txt b/utils/lit/tests/Inputs/shtest-format/external_shell/fail_with_bad_encoding.txt
new file mode 100644
index 0000000..f6157e6
--- /dev/null
+++ b/utils/lit/tests/Inputs/shtest-format/external_shell/fail_with_bad_encoding.txt
@@ -0,0 +1,5 @@
+# Run a command that fails with error on stdout.
+#
+# RUN: %S/write-bad-encoding.sh
+# RUN: false
+
diff --git a/utils/lit/tests/Inputs/shtest-format/external_shell/lit.local.cfg b/utils/lit/tests/Inputs/shtest-format/external_shell/lit.local.cfg
index d14d147..5e87c72 100644
--- a/utils/lit/tests/Inputs/shtest-format/external_shell/lit.local.cfg
+++ b/utils/lit/tests/Inputs/shtest-format/external_shell/lit.local.cfg
@@ -1 +1,2 @@
+import lit.formats
 config.test_format = lit.formats.ShTest(execute_external=True)
diff --git a/utils/lit/tests/Inputs/shtest-format/external_shell/write-bad-encoding.sh b/utils/lit/tests/Inputs/shtest-format/external_shell/write-bad-encoding.sh
new file mode 100755
index 0000000..6b622cb
--- /dev/null
+++ b/utils/lit/tests/Inputs/shtest-format/external_shell/write-bad-encoding.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+echo "a line with bad encoding: �."
diff --git a/utils/lit/tests/Inputs/shtest-format/fail.txt b/utils/lit/tests/Inputs/shtest-format/fail.txt
index 49932c3..8c305eb 100644
--- a/utils/lit/tests/Inputs/shtest-format/fail.txt
+++ b/utils/lit/tests/Inputs/shtest-format/fail.txt
@@ -1 +1,2 @@
+# RUN: printf "line 1: failed test output on stdout\nline 2: failed test output on stdout"
 # RUN: false
diff --git a/utils/lit/tests/Inputs/shtest-format/lit.cfg b/utils/lit/tests/Inputs/shtest-format/lit.cfg
index 78dd1bf..9b47985 100644
--- a/utils/lit/tests/Inputs/shtest-format/lit.cfg
+++ b/utils/lit/tests/Inputs/shtest-format/lit.cfg
@@ -1,3 +1,4 @@
+import lit.formats
 config.name = 'shtest-format'
 config.suffixes = ['.txt']
 config.test_format = lit.formats.ShTest()
diff --git a/utils/lit/tests/Inputs/shtest-shell/lit.cfg b/utils/lit/tests/Inputs/shtest-shell/lit.cfg
index 4878b65..7f31129 100644
--- a/utils/lit/tests/Inputs/shtest-shell/lit.cfg
+++ b/utils/lit/tests/Inputs/shtest-shell/lit.cfg
@@ -1,3 +1,4 @@
+import lit.formats
 config.name = 'shtest-shell'
 config.suffixes = ['.txt']
 config.test_format = lit.formats.ShTest()
diff --git a/utils/lit/tests/Inputs/test-data/lit.cfg b/utils/lit/tests/Inputs/test-data/lit.cfg
new file mode 100644
index 0000000..f5aba7b
--- /dev/null
+++ b/utils/lit/tests/Inputs/test-data/lit.cfg
@@ -0,0 +1,44 @@
+import os
+try:
+    import ConfigParser
+except ImportError:
+    import configparser as ConfigParser
+
+import lit.formats
+import lit.Test
+
+class DummyFormat(lit.formats.FileBasedTest):
+    def execute(self, test, lit_config):
+        # In this dummy format, expect that each test file is actually just a
+        # .ini format dump of the results to report.
+
+        source_path = test.getSourcePath()
+
+        cfg = ConfigParser.ConfigParser()
+        cfg.read(source_path)
+
+        # Create the basic test result.
+        result_code = cfg.get('global', 'result_code')
+        result_output = cfg.get('global', 'result_output')
+        result = lit.Test.Result(getattr(lit.Test, result_code),
+                                 result_output)
+
+        # Load additional metrics.
+        for key,value_str in cfg.items('results'):
+            value = eval(value_str)
+            if isinstance(value, int):
+                metric = lit.Test.IntMetricValue(value)
+            elif isinstance(value, float):
+                metric = lit.Test.RealMetricValue(value)
+            else:
+                raise RuntimeError("unsupported result type")
+            result.addMetric(key, metric)
+
+        return result
+
+config.name = 'test-data'
+config.suffixes = ['.ini']
+config.test_format = DummyFormat()
+config.test_source_root = None
+config.test_exec_root = None
+config.target_triple = None
diff --git a/utils/lit/tests/Inputs/test-data/metrics.ini b/utils/lit/tests/Inputs/test-data/metrics.ini
new file mode 100644
index 0000000..01b09c5
--- /dev/null
+++ b/utils/lit/tests/Inputs/test-data/metrics.ini
@@ -0,0 +1,7 @@
+[global]
+result_code = PASS
+result_output = Test passed.
+
+[results]
+value0 = 1
+value1 = 2.3456
+\ No newline at end of file
diff --git a/utils/lit/tests/Inputs/unittest-adaptor/lit.cfg b/utils/lit/tests/Inputs/unittest-adaptor/lit.cfg
index 52de709..9e08a86 100644
--- a/utils/lit/tests/Inputs/unittest-adaptor/lit.cfg
+++ b/utils/lit/tests/Inputs/unittest-adaptor/lit.cfg
@@ -1,3 +1,4 @@
+import lit.formats
 config.name = 'unittest-adaptor'
 config.suffixes = ['.txt']
 config.test_format = lit.formats.ShTest()
diff --git a/utils/lit/tests/discovery.py b/utils/lit/tests/discovery.py
index be98c4b..2801089 100644
--- a/utils/lit/tests/discovery.py
+++ b/utils/lit/tests/discovery.py
@@ -1,7 +1,7 @@
 # Check the basic discovery process, including a sub-suite.
 #
 # RUN: %{lit} %{inputs}/discovery \
-# RUN:   -j 1 --debug --show-tests --show-suites --max-tests 0 \
+# RUN:   -j 1 --debug --show-tests --show-suites \
 # RUN:   -v > %t.out 2> %t.err
 # RUN: FileCheck --check-prefix=CHECK-BASIC-OUT < %t.out %s
 # RUN: FileCheck --check-prefix=CHECK-BASIC-ERR < %t.err %s
@@ -24,7 +24,6 @@
 # CHECK-BASIC-OUT: top-level-suite :: subdir/test-three
 # CHECK-BASIC-OUT: top-level-suite :: test-one
 # CHECK-BASIC-OUT: top-level-suite :: test-two
-# CHECK-BASIC-OUT: -- Testing: 0
 
 
 # Check discovery when exact test names are given.
@@ -32,19 +31,18 @@
 # RUN: %{lit} \
 # RUN:     %{inputs}/discovery/subdir/test-three.py \
 # RUN:     %{inputs}/discovery/subsuite/test-one.txt \
-# RUN:   -j 1 --show-tests --show-suites --max-tests 0 -v > %t.out
+# RUN:   -j 1 --show-tests --show-suites -v > %t.out
 # RUN: FileCheck --check-prefix=CHECK-EXACT-TEST < %t.out %s
 #
 # CHECK-EXACT-TEST: -- Available Tests --
 # CHECK-EXACT-TEST: sub-suite :: test-one
 # CHECK-EXACT-TEST: top-level-suite :: subdir/test-three
-# CHECK-EXACT-TEST: -- Testing: 0
 
 
 # Check discovery when using an exec path.
 #
 # RUN: %{lit} %{inputs}/exec-discovery \
-# RUN:   -j 1 --debug --show-tests --show-suites --max-tests 0 \
+# RUN:   -j 1 --debug --show-tests --show-suites \
 # RUN:   -v > %t.out 2> %t.err
 # RUN: FileCheck --check-prefix=CHECK-ASEXEC-OUT < %t.out %s
 # RUN: FileCheck --check-prefix=CHECK-ASEXEC-ERR < %t.err %s
@@ -70,7 +68,6 @@
 # CHECK-ASEXEC-OUT: top-level-suite :: subdir/test-three
 # CHECK-ASEXEC-OUT: top-level-suite :: test-one
 # CHECK-ASEXEC-OUT: top-level-suite :: test-two
-# CHECK-ASEXEC-OUT: -- Testing: 0
 
 # Check discovery when exact test names are given.
 #
@@ -78,12 +75,11 @@
 #
 # RUN: %{lit} \
 # RUN:     %{inputs}/exec-discovery/subdir/test-three.py \
-# RUN:   -j 1 --show-tests --show-suites --max-tests 0 -v > %t.out
+# RUN:   -j 1 --show-tests --show-suites -v > %t.out
 # RUN: FileCheck --check-prefix=CHECK-ASEXEC-EXACT-TEST < %t.out %s
 #
 # CHECK-ASEXEC-EXACT-TEST: -- Available Tests --
 # CHECK-ASEXEC-EXACT-TEST: top-level-suite :: subdir/test-three
-# CHECK-ASEXEC-EXACT-TEST: -- Testing: 0
 
 
 # Check that we don't recurse infinitely when loading an site specific test
@@ -91,7 +87,7 @@
 #
 # RUN: %{lit} \
 # RUN:     %{inputs}/exec-discovery-in-tree/obj/ \
-# RUN:   -j 1 --show-tests --show-suites --max-tests 0 -v > %t.out
+# RUN:   -j 1 --show-tests --show-suites -v > %t.out
 # RUN: FileCheck --check-prefix=CHECK-ASEXEC-INTREE < %t.out %s
 #
 #      CHECK-ASEXEC-INTREE:   exec-discovery-in-tree-suite - 1 tests
@@ -99,4 +95,3 @@
 # CHECK-ASEXEC-INTREE-NEXT:     Exec Root  : {{.*/exec-discovery-in-tree/obj$}}
 # CHECK-ASEXEC-INTREE-NEXT: -- Available Tests --
 # CHECK-ASEXEC-INTREE-NEXT: exec-discovery-in-tree-suite :: test-one
-# CHECK-ASEXEC-INTREE: -- Testing: 0
diff --git a/utils/lit/tests/googletest-format.py b/utils/lit/tests/googletest-format.py
new file mode 100644
index 0000000..a62fd1b
--- /dev/null
+++ b/utils/lit/tests/googletest-format.py
@@ -0,0 +1,20 @@
+# Check the various features of the GoogleTest format.
+#
+# RUN: not %{lit} -j 1 -v %{inputs}/googletest-format > %t.out
+# RUN: FileCheck < %t.out %s
+#
+# END.
+
+# CHECK: -- Testing:
+# CHECK: PASS: googletest-format :: DummySubDir/OneTest/FirstTest.subTestA
+# CHECK: FAIL: googletest-format :: DummySubDir/OneTest/FirstTest.subTestB
+# CHECK-NEXT: *** TEST 'googletest-format :: DummySubDir/OneTest/FirstTest.subTestB' FAILED ***
+# CHECK-NEXT: I am subTest B, I FAIL
+# CHECK-NEXT: And I have two lines of output
+# CHECK: ***
+# CHECK: PASS: googletest-format :: DummySubDir/OneTest/ParameterizedTest/0.subTest
+# CHECK: PASS: googletest-format :: DummySubDir/OneTest/ParameterizedTest/1.subTest
+# CHECK: Failing Tests (1)
+# CHECK: Expected Passes    : 3
+# CHECK: Unexpected Failures: 1
+
diff --git a/utils/lit/tests/lit.cfg b/utils/lit/tests/lit.cfg
index 32760ce..2111b72 100644
--- a/utils/lit/tests/lit.cfg
+++ b/utils/lit/tests/lit.cfg
@@ -1,6 +1,9 @@
 # -*- Python -*-
 
 import os
+import sys
+
+import lit.formats
 
 # Configuration file for the 'lit' test runner.
 
@@ -20,17 +23,23 @@ config.excludes = ['Inputs']
 config.test_source_root = os.path.dirname(__file__)
 config.test_exec_root = config.test_source_root
 
-config.target_triple = None
+config.target_triple = '(unused)'
 
 src_root = os.path.join(config.test_source_root, '..')
 config.environment['PYTHONPATH'] = src_root
 config.substitutions.append(('%{src_root}', src_root))
 config.substitutions.append(('%{inputs}', os.path.join(
             src_root, 'tests', 'Inputs')))
-config.substitutions.append(('%{lit}', os.path.join(src_root, 'lit.py')))
+config.substitutions.append(('%{lit}', "%%{python} %s" % (
+            os.path.join(src_root, 'lit.py'),)))
+config.substitutions.append(('%{python}', sys.executable))
 
 # Enable coverage.py reporting, assuming the coverage module has been installed
 # and sitecustomize.py in the virtualenv has been modified appropriately.
-if lit.params.get('check-coverage', None):
+if lit_config.params.get('check-coverage', None):
     config.environment['COVERAGE_PROCESS_START'] = os.path.join(
         os.path.dirname(__file__), ".coveragerc")
+
+# Add a feature to detect the Python version.
+config.available_features.add("python%d.%d" % (sys.version_info[0],
+                                                  sys.version_info[1]))
diff --git a/utils/lit/tests/shell-parsing.py b/utils/lit/tests/shell-parsing.py
index f644132..a07e988 100644
--- a/utils/lit/tests/shell-parsing.py
+++ b/utils/lit/tests/shell-parsing.py
@@ -1,3 +1,3 @@
 # Just run the ShUtil unit tests.
 #
-# RUN: python -m lit.ShUtil
+# RUN: %{python} -m lit.ShUtil
diff --git a/utils/lit/tests/shtest-encoding.py b/utils/lit/tests/shtest-encoding.py
new file mode 100644
index 0000000..dfc987f
--- /dev/null
+++ b/utils/lit/tests/shtest-encoding.py
@@ -0,0 +1,3 @@
+# RUN: true
+
+# Here is a string that cannot be decoded in line mode: �.
diff --git a/utils/lit/tests/shtest-format.py b/utils/lit/tests/shtest-format.py
index 4b36873..751f0d7 100644
--- a/utils/lit/tests/shtest-format.py
+++ b/utils/lit/tests/shtest-format.py
@@ -7,15 +7,43 @@
 
 # CHECK: -- Testing:
 
+# CHECK: PASS: shtest-format :: argv0.txt
 # CHECK: FAIL: shtest-format :: external_shell/fail.txt
-# CHECK: *** TEST 'shtest-format :: external_shell/fail.txt' FAILED ***
+# CHECK-NEXT: *** TEST 'shtest-format :: external_shell/fail.txt' FAILED ***
+# CHECK: Command Output (stdout):
+# CHECK-NEXT: --
+# CHECK-NEXT: line 1: failed test output on stdout
+# CHECK-NEXT: line 2: failed test output on stdout
 # CHECK: Command Output (stderr):
-# CHECK: cat: does-not-exist: No such file or directory
+# CHECK-NEXT: --
+# CHECK-NEXT: cat: does-not-exist: No such file or directory
+# CHECK: --
+
+# CHECK: FAIL: shtest-format :: external_shell/fail_with_bad_encoding.txt
+# CHECK-NEXT: *** TEST 'shtest-format :: external_shell/fail_with_bad_encoding.txt' FAILED ***
+# CHECK: Command Output (stdout):
+# CHECK-NEXT: --
+# CHECK-NEXT: a line with bad encoding:
 # CHECK: --
 
 # CHECK: PASS: shtest-format :: external_shell/pass.txt
 
 # CHECK: FAIL: shtest-format :: fail.txt
+# CHECK-NEXT: *** TEST 'shtest-format :: fail.txt' FAILED ***
+# CHECK-NEXT: Script:
+# CHECK-NEXT: --
+# CHECK-NEXT: printf "line 1
+# CHECK-NEXT: false
+# CHECK-NEXT: --
+# CHECK-NEXT: Exit Code: 1
+#
+# CHECK: Command Output (stdout):
+# CHECK-NEXT: --
+# CHECK-NEXT: Command 0: "printf"
+# CHECK-NEXT: Command 0 Result: 0
+# CHECK-NEXT: Command 0 Output:
+# CHECK-NEXT: line 1: failed test output on stdout
+# CHECK-NEXT: line 2: failed test output on stdout
 
 # CHECK: UNRESOLVED: shtest-format :: no-test-line.txt
 # CHECK: PASS: shtest-format :: pass.txt
@@ -26,18 +54,24 @@
 # CHECK: XFAIL: shtest-format :: xfail-target.txt
 # CHECK: XFAIL: shtest-format :: xfail.txt
 # CHECK: XPASS: shtest-format :: xpass.txt
+# CHECK-NEXT: *** TEST 'shtest-format :: xpass.txt' FAILED ***
+# CHECK-NEXT: Script
+# CHECK-NEXT: --
+# CHECK-NEXT: true
+# CHECK-NEXT: --
 # CHECK: Testing Time
 
 # CHECK: Unexpected Passing Tests (1)
 # CHECK: shtest-format :: xpass.txt
 
-# CHECK: Failing Tests (2)
+# CHECK: Failing Tests (3)
 # CHECK: shtest-format :: external_shell/fail.txt
+# CHECK: shtest-format :: external_shell/fail_with_bad_encoding.txt
 # CHECK: shtest-format :: fail.txt
 
-# CHECK: Expected Passes    : 3
+# CHECK: Expected Passes    : 4
 # CHECK: Expected Failures  : 3
 # CHECK: Unsupported Tests  : 2
 # CHECK: Unresolved Tests   : 1
 # CHECK: Unexpected Passes  : 1
-# CHECK: Unexpected Failures: 2
+# CHECK: Unexpected Failures: 3
diff --git a/utils/lit/tests/test-data.py b/utils/lit/tests/test-data.py
new file mode 100644
index 0000000..54909d7
--- /dev/null
+++ b/utils/lit/tests/test-data.py
@@ -0,0 +1,12 @@
+# Test features related to formats which support reporting additional test data.
+
+# RUN: %{lit} -j 1 -v %{inputs}/test-data > %t.out
+# RUN: FileCheck < %t.out %s
+
+# CHECK: -- Testing:
+
+# CHECK: PASS: test-data :: metrics.ini
+# CHECK-NEXT: *** TEST 'test-data :: metrics.ini' RESULTS ***
+# CHECK-NEXT: value0: 1
+# CHECK-NEXT: value1: 2.3456
+# CHECK-NEXT: ***
diff --git a/utils/lit/tests/test-output.py b/utils/lit/tests/test-output.py
new file mode 100644
index 0000000..adfbcd8
--- /dev/null
+++ b/utils/lit/tests/test-output.py
@@ -0,0 +1,21 @@
+# XFAIL: python2.5
+
+# RUN: %{lit} -j 1 -v %{inputs}/test-data --output %t.results.out > %t.out
+# RUN: FileCheck < %t.results.out %s
+
+# CHECK: {
+# CHECK: "__version__"
+# CHECK: "elapsed"
+# CHECK-NEXT: "tests": [
+# CHECK-NEXT:   {
+# CHECK-NEXT:     "code": "PASS",
+# CHECK-NEXT:     "elapsed": {{[0-9.]+}},
+# CHECK-NEXT:     "metrics": {
+# CHECK-NEXT:       "value0": 1,
+# CHECK-NEXT:       "value1": 2.3456
+# CHECK-NEXT:     }
+# CHECK-NEXT:     "name": "test-data :: metrics.ini",
+# CHECK-NEXT:     "output": "Test passed."
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+# CHECK-NEXT: }
diff --git a/utils/lit/tests/unittest-adaptor.py b/utils/lit/tests/unittest-adaptor.py
index 243dd41..7435dda 100644
--- a/utils/lit/tests/unittest-adaptor.py
+++ b/utils/lit/tests/unittest-adaptor.py
@@ -1,6 +1,6 @@
 # Check the lit adaption to run under unittest.
 #
-# RUN: python %s %{inputs}/unittest-adaptor 2> %t.err
+# RUN: %{python} %s %{inputs}/unittest-adaptor 2> %t.err
 # RUN: FileCheck < %t.err %s
 #
 # CHECK: unittest-adaptor :: test-one.txt ... ok
diff --git a/utils/llvm-build/llvmbuild/__init__.py b/utils/llvm-build/llvmbuild/__init__.py
index 7760218..eb20d09 100644
--- a/utils/llvm-build/llvmbuild/__init__.py
+++ b/utils/llvm-build/llvmbuild/__init__.py
@@ -1 +1 @@
-from main import main
+from llvmbuild.main import main
diff --git a/utils/llvm-build/llvmbuild/componentinfo.py b/utils/llvm-build/llvmbuild/componentinfo.py
index e684ac2..eda3a48 100644
--- a/utils/llvm-build/llvmbuild/componentinfo.py
+++ b/utils/llvm-build/llvmbuild/componentinfo.py
@@ -2,11 +2,14 @@
 Descriptor objects for entities that are part of the LLVM project.
 """
 
-import ConfigParser
-import StringIO
+from __future__ import absolute_import
+try:
+    import configparser
+except:
+    import ConfigParser as configparser
 import sys
 
-from util import *
+from llvmbuild.util import *
 
 class ParseError(Exception):
     pass
@@ -29,7 +32,7 @@ class ComponentInfo(object):
 
     def __init__(self, subpath, name, dependencies, parent):
         if not subpath.startswith('/'):
-            raise ValueError,"invalid subpath: %r" % subpath
+            raise ValueError("invalid subpath: %r" % subpath)
         self.subpath = subpath
         self.name = name
         self.dependencies = list(dependencies)
@@ -100,11 +103,11 @@ class GroupComponentInfo(ComponentInfo):
         ComponentInfo.__init__(self, subpath, name, [], parent)
 
     def get_llvmbuild_fragment(self):
-        result = StringIO.StringIO()
-        print >>result, 'type = %s' % self.type_name
-        print >>result, 'name = %s' % self.name
-        print >>result, 'parent = %s' % self.parent
-        return result.getvalue()
+        return """\
+type = %s
+name = %s
+parent = %s
+""" % (self.type_name, self.name, self.parent)
 
 class LibraryComponentInfo(ComponentInfo):
     type_name = 'Library'
@@ -152,21 +155,22 @@ class LibraryComponentInfo(ComponentInfo):
             yield ('library group', r)
 
     def get_llvmbuild_fragment(self):
-        result = StringIO.StringIO()
-        print >>result, 'type = %s' % self.type_name
-        print >>result, 'name = %s' % self.name
-        print >>result, 'parent = %s' % self.parent
+        result = """\
+type = %s
+name = %s
+parent = %s
+""" % (self.type_name, self.name, self.parent)
         if self.library_name is not None:
-            print >>result, 'library_name = %s' % self.library_name
+            result += 'library_name = %s\n' % self.library_name
         if self.required_libraries:
-            print >>result, 'required_libraries = %s' % ' '.join(
+            result += 'required_libraries = %s\n' % ' '.join(
                 self.required_libraries)
         if self.add_to_library_groups:
-            print >>result, 'add_to_library_groups = %s' % ' '.join(
+            result += 'add_to_library_groups = %s\n' % ' '.join(
                 self.add_to_library_groups)
         if not self.installed:
-            print >>result, 'installed = 0'
-        return result.getvalue()
+            result += 'installed = 0\n'
+        return result
 
     def get_library_name(self):
         return self.library_name or self.name
@@ -237,17 +241,18 @@ class LibraryGroupComponentInfo(ComponentInfo):
             yield ('library group', r)
 
     def get_llvmbuild_fragment(self):
-        result = StringIO.StringIO()
-        print >>result, 'type = %s' % self.type_name
-        print >>result, 'name = %s' % self.name
-        print >>result, 'parent = %s' % self.parent
+        result = """\
+type = %s
+name = %s
+parent = %s
+""" % (self.type_name, self.name, self.parent)
         if self.required_libraries and not self._is_special_group:
-            print >>result, 'required_libraries = %s' % ' '.join(
+            result += 'required_libraries = %s\n' % ' '.join(
                 self.required_libraries)
         if self.add_to_library_groups:
-            print >>result, 'add_to_library_groups = %s' % ' '.join(
+            result += 'add_to_library_groups = %s\n' % ' '.join(
                 self.add_to_library_groups)
-        return result.getvalue()
+        return result
 
     def get_llvmconfig_component_name(self):
         return self.name.lower()
@@ -309,21 +314,22 @@ class TargetGroupComponentInfo(ComponentInfo):
             yield ('library group', r)
 
     def get_llvmbuild_fragment(self):
-        result = StringIO.StringIO()
-        print >>result, 'type = %s' % self.type_name
-        print >>result, 'name = %s' % self.name
-        print >>result, 'parent = %s' % self.parent
+        result = """\
+type = %s
+name = %s
+parent = %s
+""" % (self.type_name, self.name, self.parent)
         if self.required_libraries:
-            print >>result, 'required_libraries = %s' % ' '.join(
+            result += 'required_libraries = %s\n' % ' '.join(
                 self.required_libraries)
         if self.add_to_library_groups:
-            print >>result, 'add_to_library_groups = %s' % ' '.join(
+            result += 'add_to_library_groups = %s\n' % ' '.join(
                 self.add_to_library_groups)
         for bool_key in ('has_asmparser', 'has_asmprinter', 'has_disassembler',
                          'has_jit'):
             if getattr(self, bool_key):
-                print >>result, '%s = 1' % (bool_key,)
-        return result.getvalue()
+                result += '%s = 1\n' % (bool_key,)
+        return result
 
     def get_llvmconfig_component_name(self):
         return self.name.lower()
@@ -352,13 +358,13 @@ class ToolComponentInfo(ComponentInfo):
             yield ('required library', r)
 
     def get_llvmbuild_fragment(self):
-        result = StringIO.StringIO()
-        print >>result, 'type = %s' % self.type_name
-        print >>result, 'name = %s' % self.name
-        print >>result, 'parent = %s' % self.parent
-        print >>result, 'required_libraries = %s' % ' '.join(
-            self.required_libraries)
-        return result.getvalue()
+        return """\
+type = %s
+name = %s
+parent = %s
+required_libraries = %s
+""" % (self.type_name, self.name, self.parent,
+       ' '.join(self.required_libraries))
 
 class BuildToolComponentInfo(ToolComponentInfo):
     type_name = 'BuildTool'
@@ -418,7 +424,7 @@ _component_type_map = dict(
               TargetGroupComponentInfo, OptionalLibraryComponentInfo))
 def load_from_path(path, subpath):
     # Load the LLVMBuild.txt file as an .ini format file.
-    parser = ConfigParser.RawConfigParser()
+    parser = configparser.RawConfigParser()
     parser.read(path)
 
     # Extract the common section.
@@ -459,8 +465,9 @@ def _read_components_from_parser(parser, path, subpath):
                 section, path, "unable to instantiate: %r" % type_name)
             import traceback
             traceback.print_exc()
-            raise SystemExit, 1
-        except ParseError,e:
+            raise SystemExit(1)
+        except ParseError:
+            e = sys.exc_info()[1]
             fatal("unable to load component %r in %r: %s" % (
                     section, path, e.message))
 
diff --git a/utils/llvm-build/llvmbuild/main.py b/utils/llvm-build/llvmbuild/main.py
index 87e8819..eacefdf 100644
--- a/utils/llvm-build/llvmbuild/main.py
+++ b/utils/llvm-build/llvmbuild/main.py
@@ -1,11 +1,11 @@
-import StringIO
+from __future__ import absolute_import
 import os
 import sys
 
-import componentinfo
-import configutil
+import llvmbuild.componentinfo as componentinfo
+import llvmbuild.configutil as configutil
 
-from util import *
+from llvmbuild.util import *
 
 ###
 
@@ -186,7 +186,7 @@ class LLVMProjectInfo(object):
             set(self.component_infos),
             key = lambda c: c.name)
         while components_to_visit:
-            visit_component_info(iter(components_to_visit).next(), [], set())
+            visit_component_info(components_to_visit[0], [], set())
 
         # Canonicalize children lists.
         for c in self.ordered_component_infos:
@@ -194,7 +194,7 @@ class LLVMProjectInfo(object):
 
     def print_tree(self):
         def visit(node, depth = 0):
-            print '%s%-40s (%s)' % ('  '*depth, node.name, node.type_name)
+            print('%s%-40s (%s)' % ('  '*depth, node.name, node.type_name))
             for c in node.children:
                 visit(c, depth + 1)
         visit(self.component_info_map['$ROOT'])
@@ -283,7 +283,7 @@ subdirectories = %s
             header_name = '.' + os.path.join(subpath, 'LLVMBuild.txt')
             header_pad = '-' * (80 - len(header_fmt % (header_name, '')))
             header_string = header_fmt % (header_name, header_pad)
-            print >>f, """\
+            f.write("""\
 %s
 ;
 ;                     The LLVM Compiler Infrastructure
@@ -300,17 +300,18 @@ subdirectories = %s
 ;   http://llvm.org/docs/LLVMBuild.html
 ;
 ;===------------------------------------------------------------------------===;
-""" % header_string
+
+""" % header_string)
 
             # Write out each fragment.each component fragment.
             for name,fragment in fragments:
                 comment = comments_map.get(name)
                 if comment is not None:
                     f.write(comment)
-                print >>f, "[%s]" % name
+                f.write("[%s]\n" % name)
                 f.write(fragment)
                 if fragment is not fragments[-1][1]:
-                    print >>f
+                    f.write('\n')
 
             f.close()
 
@@ -363,7 +364,7 @@ subdirectories = %s
                                is_installed)
 
         # Convert to a list of entries and sort by name.
-        entries = entries.values()
+        entries = list(entries.values())
 
         # Create an 'all' pseudo component. We keep the dependency list small by
         # only listing entries that have no other dependents.
@@ -382,7 +383,7 @@ subdirectories = %s
         # Write out the library table.
         make_install_dir(os.path.dirname(output_path))
         f = open(output_path, 'w')
-        print >>f, """\
+        f.write("""\
 //===- llvm-build generated file --------------------------------*- C++ -*-===//
 //
 // Component Library Depenedency Table
@@ -390,32 +391,33 @@ subdirectories = %s
 // Automatically generated file, do not edit!
 //
 //===----------------------------------------------------------------------===//
-"""
-        print >>f, 'struct AvailableComponent {'
-        print >>f, '  /// The name of the component.'
-        print >>f, '  const char *Name;'
-        print >>f, ''
-        print >>f, '  /// The name of the library for this component (or NULL).'
-        print >>f, '  const char *Library;'
-        print >>f, ''
-        print >>f, '  /// Whether the component is installed.'
-        print >>f, '  bool IsInstalled;'
-        print >>f, ''
-        print >>f, '\
-  /// The list of libraries required when linking this component.'
-        print >>f, '  const char *RequiredLibraries[%d];' % (
-            max_required_libraries)
-        print >>f, '} AvailableComponents[%d] = {' % len(entries)
+
+""")
+        f.write('struct AvailableComponent {\n')
+        f.write('  /// The name of the component.\n')
+        f.write('  const char *Name;\n')
+        f.write('\n')
+        f.write('  /// The name of the library for this component (or NULL).\n')
+        f.write('  const char *Library;\n')
+        f.write('\n')
+        f.write('  /// Whether the component is installed.\n')
+        f.write('  bool IsInstalled;\n')
+        f.write('\n')
+        f.write('\
+  /// The list of libraries required when linking this component.\n')
+        f.write('  const char *RequiredLibraries[%d];\n' % (
+            max_required_libraries))
+        f.write('} AvailableComponents[%d] = {\n' % len(entries))
         for name,library_name,required_names,is_installed in entries:
             if library_name is None:
                 library_name_as_cstr = '0'
             else:
                 library_name_as_cstr = '"lib%s.a"' % library_name
-            print >>f, '  { "%s", %s, %d, { %s } },' % (
+            f.write('  { "%s", %s, %d, { %s } },\n' % (
                 name, library_name_as_cstr, is_installed,
                 ', '.join('"%s"' % dep
-                          for dep in required_names))
-        print >>f, '};'
+                          for dep in required_names)))
+        f.write('};\n')
         f.close()
 
     def get_required_libraries_for_component(self, ci, traverse_groups = False):
@@ -512,7 +514,7 @@ subdirectories = %s
         header_name = os.path.basename(output_path)
         header_pad = '-' * (80 - len(header_fmt % (header_name, '')))
         header_string = header_fmt % (header_name, header_pad)
-        print >>f, """\
+        f.write("""\
 %s
 #
 #                     The LLVM Compiler Infrastructure
@@ -528,10 +530,11 @@ subdirectories = %s
 # This file is autogenerated by llvm-build, do not edit!
 #
 #===------------------------------------------------------------------------===#
-""" % header_string
+
+""" % header_string)
 
         # Write the dependency information in the best way we can.
-        print >>f, """
+        f.write("""
 # LLVMBuild CMake fragment dependencies.
 #
 # CMake has no builtin way to declare that the configuration depends on
@@ -541,30 +544,32 @@ subdirectories = %s
 # CMake.
 #
 # FIXME: File a CMake RFE to get a properly supported version of this
-# feature."""
+# feature.
+""")
         for dep in dependencies:
-            print >>f, """\
+            f.write("""\
 configure_file(\"%s\"
-               ${CMAKE_CURRENT_BINARY_DIR}/DummyConfigureOutput)""" % (
-                cmake_quote_path(dep),)
+               ${CMAKE_CURRENT_BINARY_DIR}/DummyConfigureOutput)\n""" % (
+                cmake_quote_path(dep),))
 
         # Write the properties we use to encode the required library dependency
         # information in a form CMake can easily use directly.
-        print >>f, """
+        f.write("""
 # Explicit library dependency information.
 #
 # The following property assignments effectively create a map from component
-# names to required libraries, in a way that is easily accessed from CMake."""
+# names to required libraries, in a way that is easily accessed from CMake.
+""")
         for ci in self.ordered_component_infos:
             # We only write the information for libraries currently.
             if ci.type_name != 'Library':
                 continue
 
-            print >>f, """\
-set_property(GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_%s %s)""" % (
+            f.write("""\
+set_property(GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_%s %s)\n""" % (
                 ci.get_prefixed_library_name(), " ".join(sorted(
                      dep.get_prefixed_library_name()
-                     for dep in self.get_required_libraries_for_component(ci))))
+                     for dep in self.get_required_libraries_for_component(ci)))))
 
         f.close()
 
@@ -590,7 +595,7 @@ set_property(GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_%s %s)""" % (
         header_name = os.path.basename(output_path)
         header_pad = '-' * (80 - len(header_fmt % (header_name, '')))
         header_string = header_fmt % (header_name, header_pad)
-        print >>f, """\
+        f.write("""\
 %s
 #
 #                     The LLVM Compiler Infrastructure
@@ -606,30 +611,33 @@ set_property(GLOBAL PROPERTY LLVMBUILD_LIB_DEPS_%s %s)""" % (
 # This file is autogenerated by llvm-build, do not edit!
 #
 #===------------------------------------------------------------------------===#
-""" % header_string
+
+""" % header_string)
 
         # Write the dependencies for the fragment.
         #
         # FIXME: Technically, we need to properly quote for Make here.
-        print >>f, """\
+        f.write("""\
 # Clients must explicitly enable LLVMBUILD_INCLUDE_DEPENDENCIES to get
 # these dependencies. This is a compromise to help improve the
-# performance of recursive Make systems.""" 
-        print >>f, 'ifeq ($(LLVMBUILD_INCLUDE_DEPENDENCIES),1)'
-        print >>f, "# The dependencies for this Makefile fragment itself."
-        print >>f, "%s: \\" % (mk_quote_string_for_target(output_path),)
+# performance of recursive Make systems.
+""")
+        f.write('ifeq ($(LLVMBUILD_INCLUDE_DEPENDENCIES),1)\n')
+        f.write("# The dependencies for this Makefile fragment itself.\n")
+        f.write("%s: \\\n" % (mk_quote_string_for_target(output_path),))
         for dep in dependencies:
-            print >>f, "\t%s \\" % (dep,)
-        print >>f
+            f.write("\t%s \\\n" % (dep,))
+        f.write('\n')
 
         # Generate dummy rules for each of the dependencies, so that things
         # continue to work correctly if any of those files are moved or removed.
-        print >>f, """\
+        f.write("""\
 # The dummy targets to allow proper regeneration even when files are moved or
-# removed."""
+# removed.
+""")
         for dep in dependencies:
-            print >>f, "%s:" % (mk_quote_string_for_target(dep),)
-        print >>f, 'endif'
+            f.write("%s:\n" % (mk_quote_string_for_target(dep),))
+        f.write('endif\n')
 
         f.close()
 
@@ -801,7 +809,7 @@ given by --build-root) at the same SUBPATH""",
                       dest="optional_components", metavar="NAMES",
                       help=("Enable the given space or semi-colon separated "
                             "list of optional components"),
-                      action="store", default=None)
+                      action="store", default="")
     parser.add_option_group(group)
 
     (opts, args) = parser.parse_args()
diff --git a/utils/llvm-build/llvmbuild/util.py b/utils/llvm-build/llvmbuild/util.py
index e581af2..ca021c4 100644
--- a/utils/llvm-build/llvmbuild/util.py
+++ b/utils/llvm-build/llvmbuild/util.py
@@ -3,7 +3,7 @@ import sys
 
 def _write_message(kind, message):
     program = os.path.basename(sys.argv[0])
-    print >>sys.stderr, '%s: %s: %s' % (program, kind, message)
+    sys.stderr.write('%s: %s: %s\n' % (program, kind, message))
 
 note = lambda message: _write_message('note', message)
 warning = lambda message: _write_message('warning', message)
diff --git a/utils/llvm-lit/CMakeLists.txt b/utils/llvm-lit/CMakeLists.txt
index 602cc88..b535eae 100644
--- a/utils/llvm-lit/CMakeLists.txt
+++ b/utils/llvm-lit/CMakeLists.txt
@@ -2,11 +2,3 @@ configure_file(
   llvm-lit.in
   ${LLVM_TOOLS_BINARY_DIR}/llvm-lit
   )
-
-install(FILES
-  ${LLVM_TOOLS_BINARY_DIR}/llvm-lit
-  DESTINATION bin
-  PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE
-              GROUP_READ GROUP_EXECUTE
-              WORLD_READ WORLD_EXECUTE
-  )
diff --git a/utils/not/CMakeLists.txt b/utils/not/CMakeLists.txt
index f4c0229..5ff14d6 100644
--- a/utils/not/CMakeLists.txt
+++ b/utils/not/CMakeLists.txt
@@ -4,7 +4,7 @@ add_llvm_utility(not
 
 target_link_libraries(not LLVMSupport)
 if( MINGW )
-  target_link_libraries(not imagehlp psapi)
+  target_link_libraries(not imagehlp psapi shell32)
 endif( MINGW )
 if( LLVM_ENABLE_THREADS AND HAVE_LIBPTHREAD )
   target_link_libraries(not pthread)
diff --git a/utils/profile.pl b/utils/profile.pl
deleted file mode 100755
index 782e5dc..0000000
--- a/utils/profile.pl
+++ /dev/null
@@ -1,74 +0,0 @@
-#!/usr/bin/perl -w
-#
-# Program:  profile.pl
-#
-# Synopsis: Insert instrumentation code into a program, run it with the JIT,
-#           then print out a profile report.
-#
-# Syntax:   profile.pl [OPTIONS] bitcodefile <arguments>
-#
-# OPTIONS may include one or more of the following:
-#     -block    - Enable basicblock profiling
-#     -edge     - Enable edge profiling
-#     -function - Enable function profiling
-#     -o <filename> - Emit profiling information to the specified file, instead
-#                     of llvmprof.out
-#
-# Any unrecognized options are passed into the invocation of llvm-prof
-#
-
-my $ProfilePass = "-insert-edge-profiling";
-
-my $LLVMProfOpts = "";
-my $ProgramOpts = "";
-my $ProfileFile = "";
-
-# Parse arguments...
-while (scalar(@ARGV) and ($_ = $ARGV[0], /^[-+]/)) {
-  shift;
-  last if /^--$/;  # Stop processing arguments on --
-
-  # List command line options here...
-  if (/^-?-block$/)    { $ProfilePass = "-insert-block-profiling"; next; }
-  if (/^-?-edge$/)     { $ProfilePass = "-insert-edge-profiling"; next; }
-  if (/^-?-function$/) { $ProfilePass = "-insert-function-profiling"; next; }
-  if (/^-?-o$/) {         # Read -o filename...
-    die "-o option requires a filename argument!" if (!scalar(@ARGV));
-    $ProgramOpts .= " -llvmprof-output $ARGV[0]";
-    $ProfileFile = $ARGV[0];
-    shift;
-    next;
-  }
-  if (/^-?-help$/) {
-    print "OVERVIEW: profile.pl - Instrumentation and profile printer.\n\n";
-    print "USAGE: profile.pl [options] program.bc <program args>\n\n";
-    print "OPTIONS:\n";
-    print "  -block    - Enable basicblock profiling\n";
-    print "  -edge     - Enable edge profiling\n";
-    print "  -function - Enable function profiling\n";
-    print "  -o <file> - Specify an output file other than llvm-prof.out.\n";
-    print "  -help     - Print this usage information\n";
-    print "\nAll other options are passed into llvm-prof.\n";
-    exit 1;
-  }
-
-  # Otherwise, pass the option on to llvm-prof
-  $LLVMProfOpts .= " " . $_;
-}
-
-die "Must specify LLVM bitcode file as first argument!" if (@ARGV == 0);
-
-my $BytecodeFile = $ARGV[0];
-
-shift @ARGV;
-
-my $libdir = `llvm-config --libdir`;
-chomp $libdir;
-
-my $LibProfPath = $libdir . "/libprofile_rt.so";
-
-system "opt -q -f $ProfilePass $BytecodeFile -o $BytecodeFile.inst";
-system "lli -fake-argv0 '$BytecodeFile' -load $LibProfPath " .
-       "$BytecodeFile.inst $ProgramOpts " . (join ' ', @ARGV);
-system "rm $BytecodeFile.inst";
-system "llvm-prof $LLVMProfOpts $BytecodeFile $ProfileFile";
diff --git a/utils/release/tag.sh b/utils/release/tag.sh
index 5e3f69a..6c5039d 100755
--- a/utils/release/tag.sh
+++ b/utils/release/tag.sh
@@ -17,7 +17,7 @@ set -e
 release=""
 rc=""
 rebranch="no"
-projects="llvm cfe dragonegg test-suite compiler-rt libcxx clang-tools-extra polly lldb"
+projects="llvm cfe dragonegg test-suite compiler-rt libcxx clang-tools-extra polly lldb lld"
 
 base_url="https://llvm.org/svn/llvm-project"
 
diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh
index 6c17b46..91dac4c 100755
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh
@@ -26,6 +26,8 @@ Base_url="http://llvm.org/svn/llvm-project"
 Release=""
 Release_no_dot=""
 RC=""
+Triple=""
+use_gzip="no"
 do_checkout="yes"
 do_ada="no"
 do_clang="yes"
@@ -44,6 +46,7 @@ function usage() {
     echo " -release X.Y      The release number to test."
     echo " -rc NUM           The pre-release candidate number."
     echo " -final            The final release candidate."
+    echo " -triple TRIPLE    The target triple for this machine."
     echo " -j NUM            Number of compile jobs to run. [default: 3]"
     echo " -build-dir DIR    Directory to perform testing in. [default: pwd]"
     echo " -no-checkout      Don't checkout the sources from SVN."
@@ -56,6 +59,7 @@ function usage() {
     echo " -test-debug       Test the debug build. [default: no]"
     echo " -test-asserts     Test with asserts on. [default: no]"
     echo " -no-compare-files Don't test that phase 2 and 3 files are identical."
+    echo " -use-gzip         Use gzip instead of xz."
 }
 
 while [ $# -gt 0 ]; do
@@ -72,6 +76,10 @@ while [ $# -gt 0 ]; do
         -final | --final )
             RC=final
             ;;
+        -triple | --triple )
+            shift
+            Triple="$1"
+            ;;
         -j* )
             NumJobs="`echo $1 | sed -e 's,-j\([0-9]*\),\1,g'`"
             if [ -z "$NumJobs" ]; then
@@ -113,6 +121,9 @@ while [ $# -gt 0 ]; do
         -no-compare-files | --no-compare-files )
             do_compare="no"
             ;;
+        -use-gzip | --use-gzip )
+            use_gzip="yes"
+            ;;
         -help | --help | -h | --h | -\? )
             usage
             exit 0
@@ -135,6 +146,10 @@ if [ -z "$RC" ]; then
     echo "error: no release candidate number specified"
     exit 1
 fi
+if [ -z "$Triple" ]; then
+    echo "error: no target triple specified"
+    exit 1
+fi
 
 # Figure out how many make processes to run.
 if [ -z "$NumJobs" ]; then
@@ -159,6 +174,13 @@ cd $BuildDir
 LogDir=$BuildDir/logs
 mkdir -p $LogDir
 
+# Final package name.
+Package=clang+llvm-$Release
+if [ $RC != "final" ]; then
+  Package=$Package-$RC
+fi
+Package=$Package-$Triple
+
 # Find compilers.
 if [ "$do_dragonegg" = "yes" ]; then
     gcc_compiler="$GCC"
@@ -180,6 +202,20 @@ if [ "$do_dragonegg" = "yes" ]; then
     fi
 fi
 
+# Make sure that a required program is available
+function check_program_exists() {
+  local program="$1"
+  if ! type -P $program > /dev/null 2>&1 ; then
+    echo "program '$1' not found !"
+    exit 1
+  fi
+}
+
+if [ `uname -s` != "Darwin" ]; then
+  check_program_exists 'chrpath'
+  check_program_exists 'file'
+  check_program_exists 'objdump'
+fi
 
 # Make sure that the URLs are valid.
 function check_valid_urls() {
@@ -328,6 +364,38 @@ function test_llvmCore() {
     cd $BuildDir
 }
 
+# Clean RPATH. Libtool adds the build directory to the search path, which is
+# not necessary --- and even harmful --- for the binary packages we release.
+function clean_RPATH() {
+  if [ `uname -s` = "Darwin" ]; then
+    return
+  fi
+  local InstallPath="$1"
+  for Candidate in `find $InstallPath/{bin,lib} -type f`; do
+    if file $Candidate | grep ELF | egrep 'executable|shared object' > /dev/null 2>&1 ; then
+      rpath=`objdump -x $Candidate | grep 'RPATH' | sed -e's/^ *RPATH *//'`
+      if [ -n "$rpath" ]; then
+        newrpath=`echo $rpath | sed -e's/.*\(\$ORIGIN[^:]*\).*/\1/'`
+        chrpath -r $newrpath $Candidate 2>&1 > /dev/null 2>&1
+      fi
+    fi
+  done
+}
+
+# Create a package of the release binaries.
+function package_release() {
+    cwd=`pwd`
+    cd $BuildDir/Phase3/Release
+    mv llvmCore-$Release-$RC.install $Package
+    if [ "$use_gzip" = "yes" ]; then
+      tar cfz $BuildDir/$Package.tar.gz $Package
+    else
+      tar cfJ $BuildDir/$Package.tar.xz $Package
+    fi
+    mv $Package llvmCore-$Release-$RC.install
+    cd $cwd
+}
+
 set -e                          # Exit if any command fails
 
 if [ "$do_checkout" = "yes" ]; then
@@ -415,6 +483,7 @@ for Flavor in $Flavors ; do
         $llvmCore_phase1_objdir $llvmCore_phase1_installdir
     build_llvmCore 1 $Flavor \
         $llvmCore_phase1_objdir
+    clean_RPATH $llvmCore_phase1_installdir
 
     # Test clang
     if [ "$do_clang" = "yes" ]; then
@@ -427,6 +496,7 @@ for Flavor in $Flavors ; do
             $llvmCore_phase2_objdir $llvmCore_phase2_installdir
         build_llvmCore 2 $Flavor \
             $llvmCore_phase2_objdir
+        clean_RPATH $llvmCore_phase2_installdir
 
         ########################################################################
         # Phase 3: Build llvmCore with newly built clang from phase 2.
@@ -437,6 +507,7 @@ for Flavor in $Flavors ; do
             $llvmCore_phase3_objdir $llvmCore_phase3_installdir
         build_llvmCore 3 $Flavor \
             $llvmCore_phase3_objdir
+        clean_RPATH $llvmCore_phase3_installdir
 
         ########################################################################
         # Testing: Test phase 3
@@ -478,6 +549,7 @@ for Flavor in $Flavors ; do
         build_llvmCore 2 $Flavor \
             $llvmCore_de_phase2_objdir
         build_dragonegg 2 $Flavor $llvmCore_de_phase2_installdir $dragonegg_phase2_objdir
+        clean_RPATH $llvmCore_de_phase2_installdir
 
         ########################################################################
         # Phase 3: Build llvmCore with newly built dragonegg from phase 2.
@@ -489,6 +561,7 @@ for Flavor in $Flavors ; do
         build_llvmCore 3 $Flavor \
             $llvmCore_de_phase3_objdir
         build_dragonegg 3 $Flavor $llvmCore_de_phase3_installdir $dragonegg_phase3_objdir
+        clean_RPATH $llvmCore_de_phase3_installdir
 
         ########################################################################
         # Testing: Test phase 3
@@ -518,9 +591,16 @@ for Flavor in $Flavors ; do
 done
 ) 2>&1 | tee $LogDir/testing.$Release-$RC.log
 
+package_release
+
 set +e
 
 # Woo hoo!
 echo "### Testing Finished ###"
+if [ "$use_gzip" = "yes" ]; then
+  echo "### Package: $Package.tar.gz"
+else
+  echo "### Package: $Package.tar.xz"
+fi
 echo "### Logs: $LogDir"
 exit 0
diff --git a/utils/test_debuginfo.pl b/utils/test_debuginfo.pl
index 0832a33..a6b6137 100755
--- a/utils/test_debuginfo.pl
+++ b/utils/test_debuginfo.pl
@@ -11,8 +11,13 @@
 # and run the script generated from source program comments. Finally,
 # the debugger output is checked, using FileCheck, to validate 
 # debugging information.
+#
+# On Darwin the default is to use the llgdb.py wrapper script which
+# translates gdb commands into their lldb equivalents.
 
 use File::Basename;
+use Config;
+use Cwd;
 
 my $testcase_file = $ARGV[0];
 my $executable_file = $ARGV[1];
@@ -23,6 +28,11 @@ my $output_dir = dirname $executable_file;
 my $debugger_script_file = "$output_dir/$input_filename.debugger.script";
 my $output_file = "$output_dir/$input_filename.gdb.output";
 
+my %cmd_map = ();
+# Assume lldb to be the debugger on Darwin.
+my $use_lldb = 0;
+$use_lldb = 1 if ($Config{osname} eq "darwin");
+
 # Extract debugger commands from testcase. They are marked with DEBUGGER: 
 # at the beginning of a comment line.
 open(INPUT, $testcase_file);
@@ -44,8 +54,15 @@ close(OUTPUT);
 # setup debugger and debugger options to run a script.
 my $my_debugger = $ENV{'DEBUGGER'};
 if (!$my_debugger) {
-    $my_debugger = "gdb";
+    if ($use_lldb) {
+        my $path = dirname(Cwd::abs_path($0));
+        $my_debugger = "/usr/bin/env python $path/../tools/clang/test/debuginfo-tests/llgdb.py";
+    } else {
+        $my_debugger = "gdb";
+    }
 }
+
+# quiet / exit after cmdline / no init file / execute script
 my $debugger_options = "-q -batch -n -x";
 
 # run debugger and capture output.
diff --git a/utils/unittest/CMakeLists.txt b/utils/unittest/CMakeLists.txt
index 8bdfee1..fd1a048 100644
--- a/utils/unittest/CMakeLists.txt
+++ b/utils/unittest/CMakeLists.txt
@@ -14,6 +14,7 @@
 # Where gtest's .h files can be found.
 include_directories(
   googletest/include
+  googletest
   )
 
 if(WIN32)
@@ -38,13 +39,7 @@ if(MSVC AND MSVC_VERSION EQUAL 1700)
 endif ()
 
 add_llvm_library(gtest
-  googletest/gtest.cc
-  googletest/gtest-death-test.cc
-  googletest/gtest-filepath.cc
-  googletest/gtest-port.cc
-  googletest/gtest-printers.cc
-  googletest/gtest-test-part.cc
-  googletest/gtest-typed-test.cc
+  googletest/src/gtest-all.cc
   )
 
 add_llvm_library(gtest_main
diff --git a/utils/unittest/googletest/Makefile b/utils/unittest/googletest/Makefile
index bf73670..3d85e7d 100644
--- a/utils/unittest/googletest/Makefile
+++ b/utils/unittest/googletest/Makefile
@@ -19,6 +19,7 @@ REQUIRES_RTTI = 1
 # unittests/Makefile.unittest and ../UnitTestMain/Makefile; ensure that any
 # changes are made to both.
 CPP.Flags += -I$(LLVM_SRC_ROOT)/utils/unittest/googletest/include
+CPP.Flags += -I$(LLVM_SRC_ROOT)/utils/unittest/googletest
 CPP.Flags += $(NO_MISSING_FIELD_INITIALIZERS) $(NO_VARIADIC_MACROS)
 CPP.Flags += -DGTEST_HAS_RTTI=0
 # libstdc++'s TR1 <tuple> header depends on RTTI and uses C++'0x features not
@@ -36,6 +37,6 @@ endif
 
 NO_INSTALL = 1
 
-SOURCES = $(filter-out gtest-all.cc, $(notdir $(wildcard $(PROJ_SRC_DIR)/*.cc)))
+SOURCES = src/gtest-all.cc
 
 include $(LEVEL)/Makefile.common
diff --git a/utils/unittest/googletest/README.LLVM b/utils/unittest/googletest/README.LLVM
index 3565a32..1a6f0f5 100644
--- a/utils/unittest/googletest/README.LLVM
+++ b/utils/unittest/googletest/README.LLVM
@@ -10,23 +10,11 @@ Cleaned up as follows:
 $ rm -f aclocal* CMakeLists.txt configure* Makefile* CHANGES CONTRIBUTORS README
 $ rm -rf build-aux cmake codegear fused-src m4 make msvc samples scripts test xcode
 $ rm -f `find . -name \*\.pump`
+$ rm -f src/gtest_main.cc
 
-# Move all the source files to the current directory
-$ mv src/* .
-$ rmdir src
-
-# Move extra headers into the already-existing internal headers dir
-$ mv *.h include/gtest/internal/
-
-# Update paths to the included files
-$ perl -pi -e 's|^#include "src/|#include "|' gtest-all.cc
-$ perl -pi -e 's|^#include "src/|#include "gtest/internal/|' *.cc
-
-$ rm -f gtest_main.cc
-
+# Put the license in the consistent place for LLVM.
 $ mv COPYING LICENSE.TXT
 
-
 Modified as follows:
 * To GTestStreamToHelper in include/gtest/internal/gtest-internal.h,
   added the ability to stream with raw_os_ostream.
diff --git a/utils/unittest/googletest/gtest-all.cc b/utils/unittest/googletest/gtest-all.cc
deleted file mode 100644
index 97753e5..0000000
--- a/utils/unittest/googletest/gtest-all.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: mheule@google.com (Markus Heule)
-//
-// Google C++ Testing Framework (Google Test)
-//
-// Sometimes it's desirable to build Google Test by compiling a single file.
-// This file serves this purpose.
-
-// This line ensures that gtest.h can be compiled on its own, even
-// when it's fused.
-#include "gtest/gtest.h"
-
-// The following lines pull in the real gtest *.cc files.
-#include "gtest.cc"
-#include "gtest-death-test.cc"
-#include "gtest-filepath.cc"
-#include "gtest-port.cc"
-#include "gtest-printers.cc"
-#include "gtest-test-part.cc"
-#include "gtest-typed-test.cc"
diff --git a/utils/unittest/googletest/gtest-death-test.cc b/utils/unittest/googletest/gtest-death-test.cc
deleted file mode 100644
index bf7e32c..0000000
--- a/utils/unittest/googletest/gtest-death-test.cc
+++ /dev/null
@@ -1,1233 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev)
-//
-// This file implements death tests.
-
-#include "gtest/gtest-death-test.h"
-#include "gtest/internal/gtest-port.h"
-
-#if GTEST_HAS_DEATH_TEST
-
-# if GTEST_OS_MAC
-#  include <crt_externs.h>
-# endif  // GTEST_OS_MAC
-
-# include <errno.h>
-# include <fcntl.h>
-# include <limits.h>
-# include <stdarg.h>
-
-# if GTEST_OS_WINDOWS
-#  include <windows.h>
-# else
-#  include <sys/mman.h>
-#  include <sys/wait.h>
-# endif  // GTEST_OS_WINDOWS
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-#include "gtest/gtest-message.h"
-#include "gtest/internal/gtest-string.h"
-
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick is to
-// prevent a user from accidentally including gtest-internal-inl.h in
-// his code.
-#define GTEST_IMPLEMENTATION_ 1
-#include "gtest/internal/gtest-internal-inl.h"
-#undef GTEST_IMPLEMENTATION_
-
-namespace testing {
-
-// Constants.
-
-// The default death test style.
-static const char kDefaultDeathTestStyle[] = "fast";
-
-GTEST_DEFINE_string_(
-    death_test_style,
-    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
-    "Indicates how to run a death test in a forked child process: "
-    "\"threadsafe\" (child process re-executes the test binary "
-    "from the beginning, running only the specific death test) or "
-    "\"fast\" (child process runs the death test immediately "
-    "after forking).");
-
-GTEST_DEFINE_bool_(
-    death_test_use_fork,
-    internal::BoolFromGTestEnv("death_test_use_fork", false),
-    "Instructs to use fork()/_exit() instead of clone() in death tests. "
-    "Ignored and always uses fork() on POSIX systems where clone() is not "
-    "implemented. Useful when running under valgrind or similar tools if "
-    "those do not support clone(). Valgrind 3.3.1 will just fail if "
-    "it sees an unsupported combination of clone() flags. "
-    "It is not recommended to use this flag w/o valgrind though it will "
-    "work in 99% of the cases. Once valgrind is fixed, this flag will "
-    "most likely be removed.");
-
-namespace internal {
-GTEST_DEFINE_string_(
-    internal_run_death_test, "",
-    "Indicates the file, line number, temporal index of "
-    "the single death test to run, and a file descriptor to "
-    "which a success code may be sent, all separated by "
-    "colons.  This flag is specified if and only if the current "
-    "process is a sub-process launched for running a thread-safe "
-    "death test.  FOR INTERNAL USE ONLY.");
-}  // namespace internal
-
-#if GTEST_HAS_DEATH_TEST
-
-// ExitedWithCode constructor.
-ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
-}
-
-// ExitedWithCode function-call operator.
-bool ExitedWithCode::operator()(int exit_status) const {
-# if GTEST_OS_WINDOWS
-
-  return exit_status == exit_code_;
-
-# else
-
-  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
-
-# endif  // GTEST_OS_WINDOWS
-}
-
-# if !GTEST_OS_WINDOWS
-// KilledBySignal constructor.
-KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
-}
-
-// KilledBySignal function-call operator.
-bool KilledBySignal::operator()(int exit_status) const {
-  return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
-}
-# endif  // !GTEST_OS_WINDOWS
-
-namespace internal {
-
-// Utilities needed for death tests.
-
-// Generates a textual description of a given exit code, in the format
-// specified by wait(2).
-static String ExitSummary(int exit_code) {
-  Message m;
-
-# if GTEST_OS_WINDOWS
-
-  m << "Exited with exit status " << exit_code;
-
-# else
-
-  if (WIFEXITED(exit_code)) {
-    m << "Exited with exit status " << WEXITSTATUS(exit_code);
-  } else if (WIFSIGNALED(exit_code)) {
-    m << "Terminated by signal " << WTERMSIG(exit_code);
-  }
-#  ifdef WCOREDUMP
-  if (WCOREDUMP(exit_code)) {
-    m << " (core dumped)";
-  }
-#  endif
-# endif  // GTEST_OS_WINDOWS
-
-  return m.GetString();
-}
-
-// Returns true if exit_status describes a process that was terminated
-// by a signal, or exited normally with a nonzero exit code.
-bool ExitedUnsuccessfully(int exit_status) {
-  return !ExitedWithCode(0)(exit_status);
-}
-
-# if !GTEST_OS_WINDOWS
-// Generates a textual failure message when a death test finds more than
-// one thread running, or cannot determine the number of threads, prior
-// to executing the given statement.  It is the responsibility of the
-// caller not to pass a thread_count of 1.
-static String DeathTestThreadWarning(size_t thread_count) {
-  Message msg;
-  msg << "Death tests use fork(), which is unsafe particularly"
-      << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
-  if (thread_count == 0)
-    msg << "couldn't detect the number of threads.";
-  else
-    msg << "detected " << thread_count << " threads.";
-  return msg.GetString();
-}
-# endif  // !GTEST_OS_WINDOWS
-
-// Flag characters for reporting a death test that did not die.
-static const char kDeathTestLived = 'L';
-static const char kDeathTestReturned = 'R';
-static const char kDeathTestThrew = 'T';
-static const char kDeathTestInternalError = 'I';
-
-// An enumeration describing all of the possible ways that a death test can
-// conclude.  DIED means that the process died while executing the test
-// code; LIVED means that process lived beyond the end of the test code;
-// RETURNED means that the test statement attempted to execute a return
-// statement, which is not allowed; THREW means that the test statement
-// returned control by throwing an exception.  IN_PROGRESS means the test
-// has not yet concluded.
-// TODO(vladl@google.com): Unify names and possibly values for
-// AbortReason, DeathTestOutcome, and flag characters above.
-enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
-
-// Routine for aborting the program which is safe to call from an
-// exec-style death test child process, in which case the error
-// message is propagated back to the parent process.  Otherwise, the
-// message is simply printed to stderr.  In either case, the program
-// then exits with status 1.
-void DeathTestAbort(const String& message) {
-  // On a POSIX system, this function may be called from a threadsafe-style
-  // death test child process, which operates on a very small stack.  Use
-  // the heap for any additional non-minuscule memory requirements.
-  const InternalRunDeathTestFlag* const flag =
-      GetUnitTestImpl()->internal_run_death_test_flag();
-  if (flag != NULL) {
-    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
-    fputc(kDeathTestInternalError, parent);
-    fprintf(parent, "%s", message.c_str());
-    fflush(parent);
-    _exit(1);
-  } else {
-    fprintf(stderr, "%s", message.c_str());
-    fflush(stderr);
-    posix::Abort();
-  }
-}
-
-// A replacement for CHECK that calls DeathTestAbort if the assertion
-// fails.
-# define GTEST_DEATH_TEST_CHECK_(expression) \
-  do { \
-    if (!::testing::internal::IsTrue(expression)) { \
-      DeathTestAbort(::testing::internal::String::Format( \
-          "CHECK failed: File %s, line %d: %s", \
-          __FILE__, __LINE__, #expression)); \
-    } \
-  } while (::testing::internal::AlwaysFalse())
-
-// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
-// evaluating any system call that fulfills two conditions: it must return
-// -1 on failure, and set errno to EINTR when it is interrupted and
-// should be tried again.  The macro expands to a loop that repeatedly
-// evaluates the expression as long as it evaluates to -1 and sets
-// errno to EINTR.  If the expression evaluates to -1 but errno is
-// something other than EINTR, DeathTestAbort is called.
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
-  do { \
-    int gtest_retval; \
-    do { \
-      gtest_retval = (expression); \
-    } while (gtest_retval == -1 && errno == EINTR); \
-    if (gtest_retval == -1) { \
-      DeathTestAbort(::testing::internal::String::Format( \
-          "CHECK failed: File %s, line %d: %s != -1", \
-          __FILE__, __LINE__, #expression)); \
-    } \
-  } while (::testing::internal::AlwaysFalse())
-
-// Returns the message describing the last system error in errno.
-String GetLastErrnoDescription() {
-    return String(errno == 0 ? "" : posix::StrError(errno));
-}
-
-// This is called from a death test parent process to read a failure
-// message from the death test child process and log it with the FATAL
-// severity. On Windows, the message is read from a pipe handle. On other
-// platforms, it is read from a file descriptor.
-static void FailFromInternalError(int fd) {
-  Message error;
-  char buffer[256];
-  int num_read;
-
-  do {
-    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
-      buffer[num_read] = '\0';
-      error << buffer;
-    }
-  } while (num_read == -1 && errno == EINTR);
-
-  if (num_read == 0) {
-    GTEST_LOG_(FATAL) << error.GetString();
-  } else {
-    const int last_error = errno;
-    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
-                      << GetLastErrnoDescription() << " [" << last_error << "]";
-  }
-}
-
-// Death test constructor.  Increments the running death test count
-// for the current test.
-DeathTest::DeathTest() {
-  TestInfo* const info = GetUnitTestImpl()->current_test_info();
-  if (info == NULL) {
-    DeathTestAbort("Cannot run a death test outside of a TEST or "
-                   "TEST_F construct");
-  }
-}
-
-// Creates and returns a death test by dispatching to the current
-// death test factory.
-bool DeathTest::Create(const char* statement, const RE* regex,
-                       const char* file, int line, DeathTest** test) {
-  return GetUnitTestImpl()->death_test_factory()->Create(
-      statement, regex, file, line, test);
-}
-
-const char* DeathTest::LastMessage() {
-  return last_death_test_message_.c_str();
-}
-
-void DeathTest::set_last_death_test_message(const String& message) {
-  last_death_test_message_ = message;
-}
-
-String DeathTest::last_death_test_message_;
-
-// Provides cross platform implementation for some death functionality.
-class DeathTestImpl : public DeathTest {
- protected:
-  DeathTestImpl(const char* a_statement, const RE* a_regex)
-      : statement_(a_statement),
-        regex_(a_regex),
-        spawned_(false),
-        status_(-1),
-        outcome_(IN_PROGRESS),
-        read_fd_(-1),
-        write_fd_(-1) {}
-
-  // read_fd_ is expected to be closed and cleared by a derived class.
-  ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
-
-  void Abort(AbortReason reason);
-  virtual bool Passed(bool status_ok);
-
-  const char* statement() const { return statement_; }
-  const RE* regex() const { return regex_; }
-  bool spawned() const { return spawned_; }
-  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
-  int status() const { return status_; }
-  void set_status(int a_status) { status_ = a_status; }
-  DeathTestOutcome outcome() const { return outcome_; }
-  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
-  int read_fd() const { return read_fd_; }
-  void set_read_fd(int fd) { read_fd_ = fd; }
-  int write_fd() const { return write_fd_; }
-  void set_write_fd(int fd) { write_fd_ = fd; }
-
-  // Called in the parent process only. Reads the result code of the death
-  // test child process via a pipe, interprets it to set the outcome_
-  // member, and closes read_fd_.  Outputs diagnostics and terminates in
-  // case of unexpected codes.
-  void ReadAndInterpretStatusByte();
-
- private:
-  // The textual content of the code this object is testing.  This class
-  // doesn't own this string and should not attempt to delete it.
-  const char* const statement_;
-  // The regular expression which test output must match.  DeathTestImpl
-  // doesn't own this object and should not attempt to delete it.
-  const RE* const regex_;
-  // True if the death test child process has been successfully spawned.
-  bool spawned_;
-  // The exit status of the child process.
-  int status_;
-  // How the death test concluded.
-  DeathTestOutcome outcome_;
-  // Descriptor to the read end of the pipe to the child process.  It is
-  // always -1 in the child process.  The child keeps its write end of the
-  // pipe in write_fd_.
-  int read_fd_;
-  // Descriptor to the child's write end of the pipe to the parent process.
-  // It is always -1 in the parent process.  The parent keeps its end of the
-  // pipe in read_fd_.
-  int write_fd_;
-};
-
-// Called in the parent process only. Reads the result code of the death
-// test child process via a pipe, interprets it to set the outcome_
-// member, and closes read_fd_.  Outputs diagnostics and terminates in
-// case of unexpected codes.
-void DeathTestImpl::ReadAndInterpretStatusByte() {
-  char flag;
-  int bytes_read;
-
-  // The read() here blocks until data is available (signifying the
-  // failure of the death test) or until the pipe is closed (signifying
-  // its success), so it's okay to call this in the parent before
-  // the child process has exited.
-  do {
-    bytes_read = posix::Read(read_fd(), &flag, 1);
-  } while (bytes_read == -1 && errno == EINTR);
-
-  if (bytes_read == 0) {
-    set_outcome(DIED);
-  } else if (bytes_read == 1) {
-    switch (flag) {
-      case kDeathTestReturned:
-        set_outcome(RETURNED);
-        break;
-      case kDeathTestThrew:
-        set_outcome(THREW);
-        break;
-      case kDeathTestLived:
-        set_outcome(LIVED);
-        break;
-      case kDeathTestInternalError:
-        FailFromInternalError(read_fd());  // Does not return.
-        break;
-      default:
-        GTEST_LOG_(FATAL) << "Death test child process reported "
-                          << "unexpected status byte ("
-                          << static_cast<unsigned int>(flag) << ")";
-    }
-  } else {
-    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
-                      << GetLastErrnoDescription();
-  }
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
-  set_read_fd(-1);
-}
-
-// Signals that the death test code which should have exited, didn't.
-// Should be called only in a death test child process.
-// Writes a status byte to the child's status file descriptor, then
-// calls _exit(1).
-void DeathTestImpl::Abort(AbortReason reason) {
-  // The parent process considers the death test to be a failure if
-  // it finds any data in our pipe.  So, here we write a single flag byte
-  // to the pipe, then exit.
-  const char status_ch =
-      reason == TEST_DID_NOT_DIE ? kDeathTestLived :
-      reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
-
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
-  // We are leaking the descriptor here because on some platforms (i.e.,
-  // when built as Windows DLL), destructors of global objects will still
-  // run after calling _exit(). On such systems, write_fd_ will be
-  // indirectly closed from the destructor of UnitTestImpl, causing double
-  // close if it is also closed here. On debug configurations, double close
-  // may assert. As there are no in-process buffers to flush here, we are
-  // relying on the OS to close the descriptor after the process terminates
-  // when the destructors are not run.
-  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
-}
-
-// Returns an indented copy of stderr output for a death test.
-// This makes distinguishing death test output lines from regular log lines
-// much easier.
-static ::std::string FormatDeathTestOutput(const ::std::string& output) {
-  ::std::string ret;
-  for (size_t at = 0; ; ) {
-    const size_t line_end = output.find('\n', at);
-    ret += "[  DEATH   ] ";
-    if (line_end == ::std::string::npos) {
-      ret += output.substr(at);
-      break;
-    }
-    ret += output.substr(at, line_end + 1 - at);
-    at = line_end + 1;
-  }
-  return ret;
-}
-
-// Assesses the success or failure of a death test, using both private
-// members which have previously been set, and one argument:
-//
-// Private data members:
-//   outcome:  An enumeration describing how the death test
-//             concluded: DIED, LIVED, THREW, or RETURNED.  The death test
-//             fails in the latter three cases.
-//   status:   The exit status of the child process. On *nix, it is in the
-//             in the format specified by wait(2). On Windows, this is the
-//             value supplied to the ExitProcess() API or a numeric code
-//             of the exception that terminated the program.
-//   regex:    A regular expression object to be applied to
-//             the test's captured standard error output; the death test
-//             fails if it does not match.
-//
-// Argument:
-//   status_ok: true if exit_status is acceptable in the context of
-//              this particular death test, which fails if it is false
-//
-// Returns true iff all of the above conditions are met.  Otherwise, the
-// first failing condition, in the order given above, is the one that is
-// reported. Also sets the last death test message string.
-bool DeathTestImpl::Passed(bool status_ok) {
-  if (!spawned())
-    return false;
-
-  const String error_message = GetCapturedStderr();
-
-  bool success = false;
-  Message buffer;
-
-  buffer << "Death test: " << statement() << "\n";
-  switch (outcome()) {
-    case LIVED:
-      buffer << "    Result: failed to die.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
-      break;
-    case THREW:
-      buffer << "    Result: threw an exception.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
-      break;
-    case RETURNED:
-      buffer << "    Result: illegal return in test statement.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
-      break;
-    case DIED:
-      if (status_ok) {
-        const bool matched = RE::PartialMatch(error_message.c_str(), *regex());
-        if (matched) {
-          success = true;
-        } else {
-          buffer << "    Result: died but not with expected error.\n"
-                 << "  Expected: " << regex()->pattern() << "\n"
-                 << "Actual msg:\n" << FormatDeathTestOutput(error_message);
-        }
-      } else {
-        buffer << "    Result: died but not with expected exit code:\n"
-               << "            " << ExitSummary(status()) << "\n"
-               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
-      }
-      break;
-    case IN_PROGRESS:
-      GTEST_LOG_(FATAL)
-          << "DeathTest::Passed somehow called before conclusion of test";
-  }
-
-  DeathTest::set_last_death_test_message(buffer.GetString());
-  return success;
-}
-
-# if GTEST_OS_WINDOWS
-// WindowsDeathTest implements death tests on Windows. Due to the
-// specifics of starting new processes on Windows, death tests there are
-// always threadsafe, and Google Test considers the
-// --gtest_death_test_style=fast setting to be equivalent to
-// --gtest_death_test_style=threadsafe there.
-//
-// A few implementation notes:  Like the Linux version, the Windows
-// implementation uses pipes for child-to-parent communication. But due to
-// the specifics of pipes on Windows, some extra steps are required:
-//
-// 1. The parent creates a communication pipe and stores handles to both
-//    ends of it.
-// 2. The parent starts the child and provides it with the information
-//    necessary to acquire the handle to the write end of the pipe.
-// 3. The child acquires the write end of the pipe and signals the parent
-//    using a Windows event.
-// 4. Now the parent can release the write end of the pipe on its side. If
-//    this is done before step 3, the object's reference count goes down to
-//    0 and it is destroyed, preventing the child from acquiring it. The
-//    parent now has to release it, or read operations on the read end of
-//    the pipe will not return when the child terminates.
-// 5. The parent reads child's output through the pipe (outcome code and
-//    any possible error messages) from the pipe, and its stderr and then
-//    determines whether to fail the test.
-//
-// Note: to distinguish Win32 API calls from the local method and function
-// calls, the former are explicitly resolved in the global namespace.
-//
-class WindowsDeathTest : public DeathTestImpl {
- public:
-  WindowsDeathTest(const char* a_statement,
-                   const RE* a_regex,
-                   const char* file,
-                   int line)
-      : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
-
-  // All of these virtual functions are inherited from DeathTest.
-  virtual int Wait();
-  virtual TestRole AssumeRole();
-
- private:
-  // The name of the file in which the death test is located.
-  const char* const file_;
-  // The line number on which the death test is located.
-  const int line_;
-  // Handle to the write end of the pipe to the child process.
-  AutoHandle write_handle_;
-  // Child process handle.
-  AutoHandle child_handle_;
-  // Event the child process uses to signal the parent that it has
-  // acquired the handle to the write end of the pipe. After seeing this
-  // event the parent can release its own handles to make sure its
-  // ReadFile() calls return when the child terminates.
-  AutoHandle event_handle_;
-};
-
-// Waits for the child in a death test to exit, returning its exit
-// status, or 0 if no child process exists.  As a side effect, sets the
-// outcome data member.
-int WindowsDeathTest::Wait() {
-  if (!spawned())
-    return 0;
-
-  // Wait until the child either signals that it has acquired the write end
-  // of the pipe or it dies.
-  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
-  switch (::WaitForMultipleObjects(2,
-                                   wait_handles,
-                                   FALSE,  // Waits for any of the handles.
-                                   INFINITE)) {
-    case WAIT_OBJECT_0:
-    case WAIT_OBJECT_0 + 1:
-      break;
-    default:
-      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
-  }
-
-  // The child has acquired the write end of the pipe or exited.
-  // We release the handle on our side and continue.
-  write_handle_.Reset();
-  event_handle_.Reset();
-
-  ReadAndInterpretStatusByte();
-
-  // Waits for the child process to exit if it haven't already. This
-  // returns immediately if the child has already exited, regardless of
-  // whether previous calls to WaitForMultipleObjects synchronized on this
-  // handle or not.
-  GTEST_DEATH_TEST_CHECK_(
-      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
-                                             INFINITE));
-  DWORD status_code;
-  GTEST_DEATH_TEST_CHECK_(
-      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
-  child_handle_.Reset();
-  set_status(static_cast<int>(status_code));
-  return status();
-}
-
-// The AssumeRole process for a Windows death test.  It creates a child
-// process with the same executable as the current process to run the
-// death test.  The child process is given the --gtest_filter and
-// --gtest_internal_run_death_test flags such that it knows to run the
-// current death test only.
-DeathTest::TestRole WindowsDeathTest::AssumeRole() {
-  const UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
-      impl->internal_run_death_test_flag();
-  const TestInfo* const info = impl->current_test_info();
-  const int death_test_index = info->result()->death_test_count();
-
-  if (flag != NULL) {
-    // ParseInternalRunDeathTestFlag() has performed all the necessary
-    // processing.
-    set_write_fd(flag->write_fd());
-    return EXECUTE_TEST;
-  }
-
-  // WindowsDeathTest uses an anonymous pipe to communicate results of
-  // a death test.
-  SECURITY_ATTRIBUTES handles_are_inheritable = {
-    sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
-  HANDLE read_handle, write_handle;
-  GTEST_DEATH_TEST_CHECK_(
-      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
-                   0)  // Default buffer size.
-      != FALSE);
-  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
-                                O_RDONLY));
-  write_handle_.Reset(write_handle);
-  event_handle_.Reset(::CreateEvent(
-      &handles_are_inheritable,
-      TRUE,    // The event will automatically reset to non-signaled state.
-      FALSE,   // The initial state is non-signalled.
-      NULL));  // The even is unnamed.
-  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL);
-  const String filter_flag = String::Format("--%s%s=%s.%s",
-                                            GTEST_FLAG_PREFIX_, kFilterFlag,
-                                            info->test_case_name(),
-                                            info->name());
-  const String internal_flag = String::Format(
-    "--%s%s=%s|%d|%d|%u|%Iu|%Iu",
-      GTEST_FLAG_PREFIX_,
-      kInternalRunDeathTestFlag,
-      file_, line_,
-      death_test_index,
-      static_cast<unsigned int>(::GetCurrentProcessId()),
-      // size_t has the same with as pointers on both 32-bit and 64-bit
-      // Windows platforms.
-      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
-      reinterpret_cast<size_t>(write_handle),
-      reinterpret_cast<size_t>(event_handle_.Get()));
-
-  char executable_path[_MAX_PATH + 1];  // NOLINT
-  GTEST_DEATH_TEST_CHECK_(
-      _MAX_PATH + 1 != ::GetModuleFileNameA(NULL,
-                                            executable_path,
-                                            _MAX_PATH));
-
-  String command_line = String::Format("%s %s \"%s\"",
-                                       ::GetCommandLineA(),
-                                       filter_flag.c_str(),
-                                       internal_flag.c_str());
-
-  DeathTest::set_last_death_test_message("");
-
-  CaptureStderr();
-  // Flush the log buffers since the log streams are shared with the child.
-  FlushInfoLog();
-
-  // The child process will share the standard handles with the parent.
-  STARTUPINFOA startup_info;
-  memset(&startup_info, 0, sizeof(STARTUPINFO));
-  startup_info.dwFlags = STARTF_USESTDHANDLES;
-  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
-  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
-  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
-
-  PROCESS_INFORMATION process_info;
-  GTEST_DEATH_TEST_CHECK_(::CreateProcessA(
-      executable_path,
-      const_cast<char*>(command_line.c_str()),
-      NULL,   // Retuned process handle is not inheritable.
-      NULL,   // Retuned thread handle is not inheritable.
-      TRUE,   // Child inherits all inheritable handles (for write_handle_).
-      0x0,    // Default creation flags.
-      NULL,   // Inherit the parent's environment.
-      UnitTest::GetInstance()->original_working_dir(),
-      &startup_info,
-      &process_info) != FALSE);
-  child_handle_.Reset(process_info.hProcess);
-  ::CloseHandle(process_info.hThread);
-  set_spawned(true);
-  return OVERSEE_TEST;
-}
-# else  // We are not on Windows.
-
-// ForkingDeathTest provides implementations for most of the abstract
-// methods of the DeathTest interface.  Only the AssumeRole method is
-// left undefined.
-class ForkingDeathTest : public DeathTestImpl {
- public:
-  ForkingDeathTest(const char* statement, const RE* regex);
-
-  // All of these virtual functions are inherited from DeathTest.
-  virtual int Wait();
-
- protected:
-  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
-
- private:
-  // PID of child process during death test; 0 in the child process itself.
-  pid_t child_pid_;
-};
-
-// Constructs a ForkingDeathTest.
-ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex)
-    : DeathTestImpl(a_statement, a_regex),
-      child_pid_(-1) {}
-
-// Waits for the child in a death test to exit, returning its exit
-// status, or 0 if no child process exists.  As a side effect, sets the
-// outcome data member.
-int ForkingDeathTest::Wait() {
-  if (!spawned())
-    return 0;
-
-  ReadAndInterpretStatusByte();
-
-  int status_value;
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
-  set_status(status_value);
-  return status_value;
-}
-
-// A concrete death test class that forks, then immediately runs the test
-// in the child process.
-class NoExecDeathTest : public ForkingDeathTest {
- public:
-  NoExecDeathTest(const char* a_statement, const RE* a_regex) :
-      ForkingDeathTest(a_statement, a_regex) { }
-  virtual TestRole AssumeRole();
-};
-
-// The AssumeRole process for a fork-and-run death test.  It implements a
-// straightforward fork, with a simple pipe to transmit the status byte.
-DeathTest::TestRole NoExecDeathTest::AssumeRole() {
-  const size_t thread_count = GetThreadCount();
-  if (thread_count != 1) {
-    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
-  }
-
-  int pipe_fd[2];
-  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
-
-  DeathTest::set_last_death_test_message("");
-  CaptureStderr();
-  // When we fork the process below, the log file buffers are copied, but the
-  // file descriptors are shared.  We flush all log files here so that closing
-  // the file descriptors in the child process doesn't throw off the
-  // synchronization between descriptors and buffers in the parent process.
-  // This is as close to the fork as possible to avoid a race condition in case
-  // there are multiple threads running before the death test, and another
-  // thread writes to the log file.
-  FlushInfoLog();
-
-  const pid_t child_pid = fork();
-  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
-  set_child_pid(child_pid);
-  if (child_pid == 0) {
-    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
-    set_write_fd(pipe_fd[1]);
-    // Redirects all logging to stderr in the child process to prevent
-    // concurrent writes to the log files.  We capture stderr in the parent
-    // process and append the child process' output to a log.
-    LogToStderr();
-    // Event forwarding to the listeners of event listener API mush be shut
-    // down in death test subprocesses.
-    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
-    return EXECUTE_TEST;
-  } else {
-    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
-    set_read_fd(pipe_fd[0]);
-    set_spawned(true);
-    return OVERSEE_TEST;
-  }
-}
-
-// A concrete death test class that forks and re-executes the main
-// program from the beginning, with command-line flags set that cause
-// only this specific death test to be run.
-class ExecDeathTest : public ForkingDeathTest {
- public:
-  ExecDeathTest(const char* a_statement, const RE* a_regex,
-                const char* file, int line) :
-      ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { }
-  virtual TestRole AssumeRole();
- private:
-  // The name of the file in which the death test is located.
-  const char* const file_;
-  // The line number on which the death test is located.
-  const int line_;
-};
-
-// Utility class for accumulating command-line arguments.
-class Arguments {
- public:
-  Arguments() {
-    args_.push_back(NULL);
-  }
-
-  ~Arguments() {
-    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
-         ++i) {
-      free(*i);
-    }
-  }
-  void AddArgument(const char* argument) {
-    args_.insert(args_.end() - 1, posix::StrDup(argument));
-  }
-
-  template <typename Str>
-  void AddArguments(const ::std::vector<Str>& arguments) {
-    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
-         i != arguments.end();
-         ++i) {
-      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
-    }
-  }
-  char* const* Argv() {
-    return &args_[0];
-  }
- private:
-  std::vector<char*> args_;
-};
-
-// A struct that encompasses the arguments to the child process of a
-// threadsafe-style death test process.
-struct ExecDeathTestArgs {
-  char* const* argv;  // Command-line arguments for the child's call to exec
-  int close_fd;       // File descriptor to close; the read end of a pipe
-};
-
-#  if GTEST_OS_MAC
-inline char** GetEnviron() {
-  // When Google Test is built as a framework on MacOS X, the environ variable
-  // is unavailable. Apple's documentation (man environ) recommends using
-  // _NSGetEnviron() instead.
-  return *_NSGetEnviron();
-}
-#  else
-// Some POSIX platforms expect you to declare environ. extern "C" makes
-// it reside in the global namespace.
-extern "C" char** environ;
-inline char** GetEnviron() { return environ; }
-#  endif  // GTEST_OS_MAC
-
-// The main function for a threadsafe-style death test child process.
-// This function is called in a clone()-ed process and thus must avoid
-// any potentially unsafe operations like malloc or libc functions.
-static int ExecDeathTestChildMain(void* child_arg) {
-  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
-
-  // We need to execute the test program in the same environment where
-  // it was originally invoked.  Therefore we change to the original
-  // working directory first.
-  const char* const original_dir =
-      UnitTest::GetInstance()->original_working_dir();
-  // We can safely call chdir() as it's a direct system call.
-  if (chdir(original_dir) != 0) {
-    DeathTestAbort(String::Format("chdir(\"%s\") failed: %s",
-                                  original_dir,
-                                  GetLastErrnoDescription().c_str()));
-    return EXIT_FAILURE;
-  }
-
-  // We can safely call execve() as it's a direct system call.  We
-  // cannot use execvp() as it's a libc function and thus potentially
-  // unsafe.  Since execve() doesn't search the PATH, the user must
-  // invoke the test program via a valid path that contains at least
-  // one path separator.
-  execve(args->argv[0], args->argv, GetEnviron());
-  DeathTestAbort(String::Format("execve(%s, ...) in %s failed: %s",
-                                args->argv[0],
-                                original_dir,
-                                GetLastErrnoDescription().c_str()));
-  return EXIT_FAILURE;
-}
-
-// Two utility routines that together determine the direction the stack
-// grows.
-// This could be accomplished more elegantly by a single recursive
-// function, but we want to guard against the unlikely possibility of
-// a smart compiler optimizing the recursion away.
-//
-// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
-// StackLowerThanAddress into StackGrowsDown, which then doesn't give
-// correct answer.
-bool StackLowerThanAddress(const void* ptr) GTEST_NO_INLINE_;
-bool StackLowerThanAddress(const void* ptr) {
-  int dummy;
-  return &dummy < ptr;
-}
-
-bool StackGrowsDown() {
-  int dummy;
-  return StackLowerThanAddress(&dummy);
-}
-
-// A threadsafe implementation of fork(2) for threadsafe-style death tests
-// that uses clone(2).  It dies with an error message if anything goes
-// wrong.
-static pid_t ExecDeathTestFork(char* const* argv, int close_fd) {
-  ExecDeathTestArgs args = { argv, close_fd };
-  pid_t child_pid = -1;
-
-#  if GTEST_HAS_CLONE
-  const bool use_fork = GTEST_FLAG(death_test_use_fork);
-
-  if (!use_fork) {
-    static const bool stack_grows_down = StackGrowsDown();
-    const size_t stack_size = getpagesize();
-    // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
-    void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
-                             MAP_ANON | MAP_PRIVATE, -1, 0);
-    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
-    void* const stack_top =
-        static_cast<char*>(stack) + (stack_grows_down ? stack_size : 0);
-
-    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
-
-    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
-  }
-#  else
-  const bool use_fork = true;
-#  endif  // GTEST_HAS_CLONE
-
-  if (use_fork && (child_pid = fork()) == 0) {
-      ExecDeathTestChildMain(&args);
-      _exit(0);
-  }
-
-  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
-  return child_pid;
-}
-
-// The AssumeRole process for a fork-and-exec death test.  It re-executes the
-// main program from the beginning, setting the --gtest_filter
-// and --gtest_internal_run_death_test flags to cause only the current
-// death test to be re-run.
-DeathTest::TestRole ExecDeathTest::AssumeRole() {
-  const UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
-      impl->internal_run_death_test_flag();
-  const TestInfo* const info = impl->current_test_info();
-  const int death_test_index = info->result()->death_test_count();
-
-  if (flag != NULL) {
-    set_write_fd(flag->write_fd());
-    return EXECUTE_TEST;
-  }
-
-  int pipe_fd[2];
-  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
-  // Clear the close-on-exec flag on the write end of the pipe, lest
-  // it be closed when the child process does an exec:
-  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
-
-  const String filter_flag =
-      String::Format("--%s%s=%s.%s",
-                     GTEST_FLAG_PREFIX_, kFilterFlag,
-                     info->test_case_name(), info->name());
-  const String internal_flag =
-      String::Format("--%s%s=%s|%d|%d|%d",
-                     GTEST_FLAG_PREFIX_, kInternalRunDeathTestFlag,
-                     file_, line_, death_test_index, pipe_fd[1]);
-  Arguments args;
-  args.AddArguments(GetArgvs());
-  args.AddArgument(filter_flag.c_str());
-  args.AddArgument(internal_flag.c_str());
-
-  DeathTest::set_last_death_test_message("");
-
-  CaptureStderr();
-  // See the comment in NoExecDeathTest::AssumeRole for why the next line
-  // is necessary.
-  FlushInfoLog();
-
-  const pid_t child_pid = ExecDeathTestFork(args.Argv(), pipe_fd[0]);
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
-  set_child_pid(child_pid);
-  set_read_fd(pipe_fd[0]);
-  set_spawned(true);
-  return OVERSEE_TEST;
-}
-
-# endif  // !GTEST_OS_WINDOWS
-
-// Creates a concrete DeathTest-derived class that depends on the
-// --gtest_death_test_style flag, and sets the pointer pointed to
-// by the "test" argument to its address.  If the test should be
-// skipped, sets that pointer to NULL.  Returns true, unless the
-// flag is set to an invalid value.
-bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
-                                     const char* file, int line,
-                                     DeathTest** test) {
-  UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
-      impl->internal_run_death_test_flag();
-  const int death_test_index = impl->current_test_info()
-      ->increment_death_test_count();
-
-  if (flag != NULL) {
-    if (death_test_index > flag->index()) {
-      DeathTest::set_last_death_test_message(String::Format(
-          "Death test count (%d) somehow exceeded expected maximum (%d)",
-          death_test_index, flag->index()));
-      return false;
-    }
-
-    if (!(flag->file() == file && flag->line() == line &&
-          flag->index() == death_test_index)) {
-      *test = NULL;
-      return true;
-    }
-  }
-
-# if GTEST_OS_WINDOWS
-
-  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
-      GTEST_FLAG(death_test_style) == "fast") {
-    *test = new WindowsDeathTest(statement, regex, file, line);
-  }
-
-# else
-
-  if (GTEST_FLAG(death_test_style) == "threadsafe") {
-    *test = new ExecDeathTest(statement, regex, file, line);
-  } else if (GTEST_FLAG(death_test_style) == "fast") {
-    *test = new NoExecDeathTest(statement, regex);
-  }
-
-# endif  // GTEST_OS_WINDOWS
-
-  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
-    DeathTest::set_last_death_test_message(String::Format(
-        "Unknown death test style \"%s\" encountered",
-        GTEST_FLAG(death_test_style).c_str()));
-    return false;
-  }
-
-  return true;
-}
-
-// Splits a given string on a given delimiter, populating a given
-// vector with the fields.  GTEST_HAS_DEATH_TEST implies that we have
-// ::std::string, so we can use it here.
-static void SplitString(const ::std::string& str, char delimiter,
-                        ::std::vector< ::std::string>* dest) {
-  ::std::vector< ::std::string> parsed;
-  ::std::string::size_type pos = 0;
-  while (::testing::internal::AlwaysTrue()) {
-    const ::std::string::size_type colon = str.find(delimiter, pos);
-    if (colon == ::std::string::npos) {
-      parsed.push_back(str.substr(pos));
-      break;
-    } else {
-      parsed.push_back(str.substr(pos, colon - pos));
-      pos = colon + 1;
-    }
-  }
-  dest->swap(parsed);
-}
-
-# if GTEST_OS_WINDOWS
-// Recreates the pipe and event handles from the provided parameters,
-// signals the event, and returns a file descriptor wrapped around the pipe
-// handle. This function is called in the child process only.
-int GetStatusFileDescriptor(unsigned int parent_process_id,
-                            size_t write_handle_as_size_t,
-                            size_t event_handle_as_size_t) {
-  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
-                                                   FALSE,  // Non-inheritable.
-                                                   parent_process_id));
-  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
-    DeathTestAbort(String::Format("Unable to open parent process %u",
-                                  parent_process_id));
-  }
-
-  // TODO(vladl@google.com): Replace the following check with a
-  // compile-time assertion when available.
-  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
-
-  const HANDLE write_handle =
-      reinterpret_cast<HANDLE>(write_handle_as_size_t);
-  HANDLE dup_write_handle;
-
-  // The newly initialized handle is accessible only in in the parent
-  // process. To obtain one accessible within the child, we need to use
-  // DuplicateHandle.
-  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
-                         ::GetCurrentProcess(), &dup_write_handle,
-                         0x0,    // Requested privileges ignored since
-                                 // DUPLICATE_SAME_ACCESS is used.
-                         FALSE,  // Request non-inheritable handler.
-                         DUPLICATE_SAME_ACCESS)) {
-    DeathTestAbort(String::Format(
-        "Unable to duplicate the pipe handle %Iu from the parent process %u",
-        write_handle_as_size_t, parent_process_id));
-  }
-
-  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
-  HANDLE dup_event_handle;
-
-  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
-                         ::GetCurrentProcess(), &dup_event_handle,
-                         0x0,
-                         FALSE,
-                         DUPLICATE_SAME_ACCESS)) {
-    DeathTestAbort(String::Format(
-        "Unable to duplicate the event handle %Iu from the parent process %u",
-        event_handle_as_size_t, parent_process_id));
-  }
-
-  const int write_fd =
-      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
-  if (write_fd == -1) {
-    DeathTestAbort(String::Format(
-        "Unable to convert pipe handle %Iu to a file descriptor",
-        write_handle_as_size_t));
-  }
-
-  // Signals the parent that the write end of the pipe has been acquired
-  // so the parent can release its own write end.
-  ::SetEvent(dup_event_handle);
-
-  return write_fd;
-}
-# endif  // GTEST_OS_WINDOWS
-
-// Returns a newly created InternalRunDeathTestFlag object with fields
-// initialized from the GTEST_FLAG(internal_run_death_test) flag if
-// the flag is specified; otherwise returns NULL.
-InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
-  if (GTEST_FLAG(internal_run_death_test) == "") return NULL;
-
-  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
-  // can use it here.
-  int line = -1;
-  int index = -1;
-  ::std::vector< ::std::string> fields;
-  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
-  int write_fd = -1;
-
-# if GTEST_OS_WINDOWS
-
-  unsigned int parent_process_id = 0;
-  size_t write_handle_as_size_t = 0;
-  size_t event_handle_as_size_t = 0;
-
-  if (fields.size() != 6
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)
-      || !ParseNaturalNumber(fields[3], &parent_process_id)
-      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
-      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
-    DeathTestAbort(String::Format(
-        "Bad --gtest_internal_run_death_test flag: %s",
-        GTEST_FLAG(internal_run_death_test).c_str()));
-  }
-  write_fd = GetStatusFileDescriptor(parent_process_id,
-                                     write_handle_as_size_t,
-                                     event_handle_as_size_t);
-# else
-
-  if (fields.size() != 4
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)
-      || !ParseNaturalNumber(fields[3], &write_fd)) {
-    DeathTestAbort(String::Format(
-        "Bad --gtest_internal_run_death_test flag: %s",
-        GTEST_FLAG(internal_run_death_test).c_str()));
-  }
-
-# endif  // GTEST_OS_WINDOWS
-
-  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
-}
-
-}  // namespace internal
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-}  // namespace testing
diff --git a/utils/unittest/googletest/gtest-port.cc b/utils/unittest/googletest/gtest-port.cc
deleted file mode 100644
index 3c32ff1..0000000
--- a/utils/unittest/googletest/gtest-port.cc
+++ /dev/null
@@ -1,750 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-
-#include "gtest/internal/gtest-port.h"
-
-#include <limits.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#if GTEST_OS_WINDOWS_MOBILE
-# include <windows.h>  // For TerminateProcess()
-#elif GTEST_OS_WINDOWS
-# include <io.h>
-# include <sys/stat.h>
-#else
-# include <unistd.h>
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-#if GTEST_OS_MAC
-# include <mach/mach_init.h>
-# include <mach/task.h>
-# include <mach/vm_map.h>
-#endif  // GTEST_OS_MAC
-
-#include "gtest/gtest-spi.h"
-#include "gtest/gtest-message.h"
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-string.h"
-
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick is to
-// prevent a user from accidentally including gtest-internal-inl.h in
-// his code.
-#define GTEST_IMPLEMENTATION_ 1
-#include "gtest/internal/gtest-internal-inl.h"
-#undef GTEST_IMPLEMENTATION_
-
-namespace testing {
-namespace internal {
-
-#if defined(_MSC_VER) || defined(__BORLANDC__)
-// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
-const int kStdOutFileno = 1;
-const int kStdErrFileno = 2;
-#else
-const int kStdOutFileno = STDOUT_FILENO;
-const int kStdErrFileno = STDERR_FILENO;
-#endif  // _MSC_VER
-
-#if GTEST_OS_MAC
-
-// Returns the number of threads running in the process, or 0 to indicate that
-// we cannot detect it.
-size_t GetThreadCount() {
-  const task_t task = mach_task_self();
-  mach_msg_type_number_t thread_count;
-  thread_act_array_t thread_list;
-  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
-  if (status == KERN_SUCCESS) {
-    // task_threads allocates resources in thread_list and we need to free them
-    // to avoid leaks.
-    vm_deallocate(task,
-                  reinterpret_cast<vm_address_t>(thread_list),
-                  sizeof(thread_t) * thread_count);
-    return static_cast<size_t>(thread_count);
-  } else {
-    return 0;
-  }
-}
-
-#else
-
-size_t GetThreadCount() {
-  // There's no portable way to detect the number of threads, so we just
-  // return 0 to indicate that we cannot detect it.
-  return 0;
-}
-
-#endif  // GTEST_OS_MAC
-
-#if GTEST_USES_POSIX_RE
-
-// Implements RE.  Currently only needed for death tests.
-
-RE::~RE() {
-  if (is_valid_) {
-    // regfree'ing an invalid regex might crash because the content
-    // of the regex is undefined. Since the regex's are essentially
-    // the same, one cannot be valid (or invalid) without the other
-    // being so too.
-    regfree(&partial_regex_);
-    regfree(&full_regex_);
-  }
-  free(const_cast<char*>(pattern_));
-}
-
-// Returns true iff regular expression re matches the entire str.
-bool RE::FullMatch(const char* str, const RE& re) {
-  if (!re.is_valid_) return false;
-
-  regmatch_t match;
-  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
-}
-
-// Returns true iff regular expression re matches a substring of str
-// (including str itself).
-bool RE::PartialMatch(const char* str, const RE& re) {
-  if (!re.is_valid_) return false;
-
-  regmatch_t match;
-  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
-}
-
-// Initializes an RE from its string representation.
-void RE::Init(const char* regex) {
-  pattern_ = posix::StrDup(regex);
-
-  // Reserves enough bytes to hold the regular expression used for a
-  // full match.
-  const size_t full_regex_len = strlen(regex) + 10;
-  char* const full_pattern = new char[full_regex_len];
-
-  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
-  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
-  // We want to call regcomp(&partial_regex_, ...) even if the
-  // previous expression returns false.  Otherwise partial_regex_ may
-  // not be properly initialized can may cause trouble when it's
-  // freed.
-  //
-  // Some implementation of POSIX regex (e.g. on at least some
-  // versions of Cygwin) doesn't accept the empty string as a valid
-  // regex.  We change it to an equivalent form "()" to be safe.
-  if (is_valid_) {
-    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
-    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
-  }
-  EXPECT_TRUE(is_valid_)
-      << "Regular expression \"" << regex
-      << "\" is not a valid POSIX Extended regular expression.";
-
-  delete[] full_pattern;
-}
-
-#elif GTEST_USES_SIMPLE_RE
-
-// Returns true iff ch appears anywhere in str (excluding the
-// terminating '\0' character).
-bool IsInSet(char ch, const char* str) {
-  return ch != '\0' && strchr(str, ch) != NULL;
-}
-
-// Returns true iff ch belongs to the given classification.  Unlike
-// similar functions in <ctype.h>, these aren't affected by the
-// current locale.
-bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
-bool IsAsciiPunct(char ch) {
-  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
-}
-bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
-bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
-bool IsAsciiWordChar(char ch) {
-  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
-      ('0' <= ch && ch <= '9') || ch == '_';
-}
-
-// Returns true iff "\\c" is a supported escape sequence.
-bool IsValidEscape(char c) {
-  return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
-}
-
-// Returns true iff the given atom (specified by escaped and pattern)
-// matches ch.  The result is undefined if the atom is invalid.
-bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
-  if (escaped) {  // "\\p" where p is pattern_char.
-    switch (pattern_char) {
-      case 'd': return IsAsciiDigit(ch);
-      case 'D': return !IsAsciiDigit(ch);
-      case 'f': return ch == '\f';
-      case 'n': return ch == '\n';
-      case 'r': return ch == '\r';
-      case 's': return IsAsciiWhiteSpace(ch);
-      case 'S': return !IsAsciiWhiteSpace(ch);
-      case 't': return ch == '\t';
-      case 'v': return ch == '\v';
-      case 'w': return IsAsciiWordChar(ch);
-      case 'W': return !IsAsciiWordChar(ch);
-    }
-    return IsAsciiPunct(pattern_char) && pattern_char == ch;
-  }
-
-  return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
-}
-
-// Helper function used by ValidateRegex() to format error messages.
-String FormatRegexSyntaxError(const char* regex, int index) {
-  return (Message() << "Syntax error at index " << index
-          << " in simple regular expression \"" << regex << "\": ").GetString();
-}
-
-// Generates non-fatal failures and returns false if regex is invalid;
-// otherwise returns true.
-bool ValidateRegex(const char* regex) {
-  if (regex == NULL) {
-    // TODO(wan@google.com): fix the source file location in the
-    // assertion failures to match where the regex is used in user
-    // code.
-    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
-    return false;
-  }
-
-  bool is_valid = true;
-
-  // True iff ?, *, or + can follow the previous atom.
-  bool prev_repeatable = false;
-  for (int i = 0; regex[i]; i++) {
-    if (regex[i] == '\\') {  // An escape sequence
-      i++;
-      if (regex[i] == '\0') {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
-                      << "'\\' cannot appear at the end.";
-        return false;
-      }
-
-      if (!IsValidEscape(regex[i])) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
-                      << "invalid escape sequence \"\\" << regex[i] << "\".";
-        is_valid = false;
-      }
-      prev_repeatable = true;
-    } else {  // Not an escape sequence.
-      const char ch = regex[i];
-
-      if (ch == '^' && i > 0) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'^' can only appear at the beginning.";
-        is_valid = false;
-      } else if (ch == '$' && regex[i + 1] != '\0') {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'$' can only appear at the end.";
-        is_valid = false;
-      } else if (IsInSet(ch, "()[]{}|")) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'" << ch << "' is unsupported.";
-        is_valid = false;
-      } else if (IsRepeat(ch) && !prev_repeatable) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'" << ch << "' can only follow a repeatable token.";
-        is_valid = false;
-      }
-
-      prev_repeatable = !IsInSet(ch, "^$?*+");
-    }
-  }
-
-  return is_valid;
-}
-
-// Matches a repeated regex atom followed by a valid simple regular
-// expression.  The regex atom is defined as c if escaped is false,
-// or \c otherwise.  repeat is the repetition meta character (?, *,
-// or +).  The behavior is undefined if str contains too many
-// characters to be indexable by size_t, in which case the test will
-// probably time out anyway.  We are fine with this limitation as
-// std::string has it too.
-bool MatchRepetitionAndRegexAtHead(
-    bool escaped, char c, char repeat, const char* regex,
-    const char* str) {
-  const size_t min_count = (repeat == '+') ? 1 : 0;
-  const size_t max_count = (repeat == '?') ? 1 :
-      static_cast<size_t>(-1) - 1;
-  // We cannot call numeric_limits::max() as it conflicts with the
-  // max() macro on Windows.
-
-  for (size_t i = 0; i <= max_count; ++i) {
-    // We know that the atom matches each of the first i characters in str.
-    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
-      // We have enough matches at the head, and the tail matches too.
-      // Since we only care about *whether* the pattern matches str
-      // (as opposed to *how* it matches), there is no need to find a
-      // greedy match.
-      return true;
-    }
-    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
-      return false;
-  }
-  return false;
-}
-
-// Returns true iff regex matches a prefix of str.  regex must be a
-// valid simple regular expression and not start with "^", or the
-// result is undefined.
-bool MatchRegexAtHead(const char* regex, const char* str) {
-  if (*regex == '\0')  // An empty regex matches a prefix of anything.
-    return true;
-
-  // "$" only matches the end of a string.  Note that regex being
-  // valid guarantees that there's nothing after "$" in it.
-  if (*regex == '$')
-    return *str == '\0';
-
-  // Is the first thing in regex an escape sequence?
-  const bool escaped = *regex == '\\';
-  if (escaped)
-    ++regex;
-  if (IsRepeat(regex[1])) {
-    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
-    // here's an indirect recursion.  It terminates as the regex gets
-    // shorter in each recursion.
-    return MatchRepetitionAndRegexAtHead(
-        escaped, regex[0], regex[1], regex + 2, str);
-  } else {
-    // regex isn't empty, isn't "$", and doesn't start with a
-    // repetition.  We match the first atom of regex with the first
-    // character of str and recurse.
-    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
-        MatchRegexAtHead(regex + 1, str + 1);
-  }
-}
-
-// Returns true iff regex matches any substring of str.  regex must be
-// a valid simple regular expression, or the result is undefined.
-//
-// The algorithm is recursive, but the recursion depth doesn't exceed
-// the regex length, so we won't need to worry about running out of
-// stack space normally.  In rare cases the time complexity can be
-// exponential with respect to the regex length + the string length,
-// but usually it's must faster (often close to linear).
-bool MatchRegexAnywhere(const char* regex, const char* str) {
-  if (regex == NULL || str == NULL)
-    return false;
-
-  if (*regex == '^')
-    return MatchRegexAtHead(regex + 1, str);
-
-  // A successful match can be anywhere in str.
-  do {
-    if (MatchRegexAtHead(regex, str))
-      return true;
-  } while (*str++ != '\0');
-  return false;
-}
-
-// Implements the RE class.
-
-RE::~RE() {
-  free(const_cast<char*>(pattern_));
-  free(const_cast<char*>(full_pattern_));
-}
-
-// Returns true iff regular expression re matches the entire str.
-bool RE::FullMatch(const char* str, const RE& re) {
-  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
-}
-
-// Returns true iff regular expression re matches a substring of str
-// (including str itself).
-bool RE::PartialMatch(const char* str, const RE& re) {
-  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
-}
-
-// Initializes an RE from its string representation.
-void RE::Init(const char* regex) {
-  pattern_ = full_pattern_ = NULL;
-  if (regex != NULL) {
-    pattern_ = posix::StrDup(regex);
-  }
-
-  is_valid_ = ValidateRegex(regex);
-  if (!is_valid_) {
-    // No need to calculate the full pattern when the regex is invalid.
-    return;
-  }
-
-  const size_t len = strlen(regex);
-  // Reserves enough bytes to hold the regular expression used for a
-  // full match: we need space to prepend a '^', append a '$', and
-  // terminate the string with '\0'.
-  char* buffer = static_cast<char*>(malloc(len + 3));
-  full_pattern_ = buffer;
-
-  if (*regex != '^')
-    *buffer++ = '^';  // Makes sure full_pattern_ starts with '^'.
-
-  // We don't use snprintf or strncpy, as they trigger a warning when
-  // compiled with VC++ 8.0.
-  memcpy(buffer, regex, len);
-  buffer += len;
-
-  if (len == 0 || regex[len - 1] != '$')
-    *buffer++ = '$';  // Makes sure full_pattern_ ends with '$'.
-
-  *buffer = '\0';
-}
-
-#endif  // GTEST_USES_POSIX_RE
-
-const char kUnknownFile[] = "unknown file";
-
-// Formats a source file path and a line number as they would appear
-// in an error message from the compiler used to compile this code.
-GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
-  const char* const file_name = file == NULL ? kUnknownFile : file;
-
-  if (line < 0) {
-    return String::Format("%s:", file_name).c_str();
-  }
-#ifdef _MSC_VER
-  return String::Format("%s(%d):", file_name, line).c_str();
-#else
-  return String::Format("%s:%d:", file_name, line).c_str();
-#endif  // _MSC_VER
-}
-
-// Formats a file location for compiler-independent XML output.
-// Although this function is not platform dependent, we put it next to
-// FormatFileLocation in order to contrast the two functions.
-// Note that FormatCompilerIndependentFileLocation() does NOT append colon
-// to the file location it produces, unlike FormatFileLocation().
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
-    const char* file, int line) {
-  const char* const file_name = file == NULL ? kUnknownFile : file;
-
-  if (line < 0)
-    return file_name;
-  else
-    return String::Format("%s:%d", file_name, line).c_str();
-}
-
-
-GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
-    : severity_(severity) {
-  const char* const marker =
-      severity == GTEST_INFO ?    "[  INFO ]" :
-      severity == GTEST_WARNING ? "[WARNING]" :
-      severity == GTEST_ERROR ?   "[ ERROR ]" : "[ FATAL ]";
-  GetStream() << ::std::endl << marker << " "
-              << FormatFileLocation(file, line).c_str() << ": ";
-}
-
-// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
-GTestLog::~GTestLog() {
-  GetStream() << ::std::endl;
-  if (severity_ == GTEST_FATAL) {
-    fflush(stderr);
-    posix::Abort();
-  }
-}
-// Disable Microsoft deprecation warnings for POSIX functions called from
-// this class (creat, dup, dup2, and close)
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable: 4996)
-#endif  // _MSC_VER
-
-#if GTEST_HAS_STREAM_REDIRECTION
-
-// Object that captures an output stream (stdout/stderr).
-class CapturedStream {
- public:
-  // The ctor redirects the stream to a temporary file.
-  CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
-
-# if GTEST_OS_WINDOWS
-    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
-    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
-
-    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
-    const UINT success = ::GetTempFileNameA(temp_dir_path,
-                                            "gtest_redir",
-                                            0,  // Generate unique file name.
-                                            temp_file_path);
-    GTEST_CHECK_(success != 0)
-        << "Unable to create a temporary file in " << temp_dir_path;
-    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
-    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
-                                    << temp_file_path;
-    filename_ = temp_file_path;
-#elif GTEST_OS_LINUX_ANDROID
-    char name_template[] = "/sdcard/captured_stderr.XXXXXX";
-    const int captured_fd = mkstemp(name_template);
-    filename_ = name_template;
-# else
-    // There's no guarantee that a test has write access to the
-    // current directory, so we create the temporary file in the /tmp
-    // directory instead.
-    char name_template[] = "/tmp/captured_stream.XXXXXX";
-    const int captured_fd = mkstemp(name_template);
-    filename_ = name_template;
-# endif  // GTEST_OS_WINDOWS
-    fflush(NULL);
-    dup2(captured_fd, fd_);
-    close(captured_fd);
-  }
-
-  ~CapturedStream() {
-    remove(filename_.c_str());
-  }
-
-  String GetCapturedString() {
-    if (uncaptured_fd_ != -1) {
-      // Restores the original stream.
-      fflush(NULL);
-      dup2(uncaptured_fd_, fd_);
-      close(uncaptured_fd_);
-      uncaptured_fd_ = -1;
-    }
-
-    FILE* const file = posix::FOpen(filename_.c_str(), "r");
-    const String content = ReadEntireFile(file);
-    posix::FClose(file);
-    return content;
-  }
-
- private:
-  // Reads the entire content of a file as a String.
-  static String ReadEntireFile(FILE* file);
-
-  // Returns the size (in bytes) of a file.
-  static size_t GetFileSize(FILE* file);
-
-  const int fd_;  // A stream to capture.
-  int uncaptured_fd_;
-  // Name of the temporary file holding the stderr output.
-  ::std::string filename_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
-};
-
-// Returns the size (in bytes) of a file.
-size_t CapturedStream::GetFileSize(FILE* file) {
-  fseek(file, 0, SEEK_END);
-  return static_cast<size_t>(ftell(file));
-}
-
-// Reads the entire content of a file as a string.
-String CapturedStream::ReadEntireFile(FILE* file) {
-  const size_t file_size = GetFileSize(file);
-  char* const buffer = new char[file_size];
-
-  size_t bytes_last_read = 0;  // # of bytes read in the last fread()
-  size_t bytes_read = 0;       // # of bytes read so far
-
-  fseek(file, 0, SEEK_SET);
-
-  // Keeps reading the file until we cannot read further or the
-  // pre-determined file size is reached.
-  do {
-    bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
-    bytes_read += bytes_last_read;
-  } while (bytes_last_read > 0 && bytes_read < file_size);
-
-  const String content(buffer, bytes_read);
-  delete[] buffer;
-
-  return content;
-}
-
-# ifdef _MSC_VER
-#  pragma warning(pop)
-# endif  // _MSC_VER
-
-static CapturedStream* g_captured_stderr = NULL;
-static CapturedStream* g_captured_stdout = NULL;
-
-// Starts capturing an output stream (stdout/stderr).
-void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) {
-  if (*stream != NULL) {
-    GTEST_LOG_(FATAL) << "Only one " << stream_name
-                      << " capturer can exist at a time.";
-  }
-  *stream = new CapturedStream(fd);
-}
-
-// Stops capturing the output stream and returns the captured string.
-String GetCapturedStream(CapturedStream** captured_stream) {
-  const String content = (*captured_stream)->GetCapturedString();
-
-  delete *captured_stream;
-  *captured_stream = NULL;
-
-  return content;
-}
-
-// Starts capturing stdout.
-void CaptureStdout() {
-  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
-}
-
-// Starts capturing stderr.
-void CaptureStderr() {
-  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
-}
-
-// Stops capturing stdout and returns the captured string.
-String GetCapturedStdout() { return GetCapturedStream(&g_captured_stdout); }
-
-// Stops capturing stderr and returns the captured string.
-String GetCapturedStderr() { return GetCapturedStream(&g_captured_stderr); }
-
-#endif  // GTEST_HAS_STREAM_REDIRECTION
-
-#if GTEST_HAS_DEATH_TEST
-
-// A copy of all command line arguments.  Set by InitGoogleTest().
-::std::vector<String> g_argvs;
-
-// Returns the command line as a vector of strings.
-const ::std::vector<String>& GetArgvs() { return g_argvs; }
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-#if GTEST_OS_WINDOWS_MOBILE
-namespace posix {
-void Abort() {
-  DebugBreak();
-  TerminateProcess(GetCurrentProcess(), 1);
-}
-}  // namespace posix
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-// Returns the name of the environment variable corresponding to the
-// given flag.  For example, FlagToEnvVar("foo") will return
-// "GTEST_FOO" in the open-source version.
-static String FlagToEnvVar(const char* flag) {
-  const String full_flag =
-      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
-
-  Message env_var;
-  for (size_t i = 0; i != full_flag.length(); i++) {
-    env_var << ToUpper(full_flag.c_str()[i]);
-  }
-
-  return env_var.GetString();
-}
-
-// Parses 'str' for a 32-bit signed integer.  If successful, writes
-// the result to *value and returns true; otherwise leaves *value
-// unchanged and returns false.
-bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
-  // Parses the environment variable as a decimal integer.
-  char* end = NULL;
-  const long long_value = strtol(str, &end, 10);  // NOLINT
-
-  // Has strtol() consumed all characters in the string?
-  if (*end != '\0') {
-    // No - an invalid character was encountered.
-    Message msg;
-    msg << "WARNING: " << src_text
-        << " is expected to be a 32-bit integer, but actually"
-        << " has value \"" << str << "\".\n";
-    printf("%s", msg.GetString().c_str());
-    fflush(stdout);
-    return false;
-  }
-
-  // Is the parsed value in the range of an Int32?
-  const Int32 result = static_cast<Int32>(long_value);
-  if (long_value == LONG_MAX || long_value == LONG_MIN ||
-      // The parsed value overflows as a long.  (strtol() returns
-      // LONG_MAX or LONG_MIN when the input overflows.)
-      result != long_value
-      // The parsed value overflows as an Int32.
-      ) {
-    Message msg;
-    msg << "WARNING: " << src_text
-        << " is expected to be a 32-bit integer, but actually"
-        << " has value " << str << ", which overflows.\n";
-    printf("%s", msg.GetString().c_str());
-    fflush(stdout);
-    return false;
-  }
-
-  *value = result;
-  return true;
-}
-
-// Reads and returns the Boolean environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-//
-// The value is considered true iff it's not "0".
-bool BoolFromGTestEnv(const char* flag, bool default_value) {
-  const String env_var = FlagToEnvVar(flag);
-  const char* const string_value = posix::GetEnv(env_var.c_str());
-  return string_value == NULL ?
-      default_value : strcmp(string_value, "0") != 0;
-}
-
-// Reads and returns a 32-bit integer stored in the environment
-// variable corresponding to the given flag; if it isn't set or
-// doesn't represent a valid 32-bit integer, returns default_value.
-Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
-  const String env_var = FlagToEnvVar(flag);
-  const char* const string_value = posix::GetEnv(env_var.c_str());
-  if (string_value == NULL) {
-    // The environment variable is not set.
-    return default_value;
-  }
-
-  Int32 result = default_value;
-  if (!ParseInt32(Message() << "Environment variable " << env_var,
-                  string_value, &result)) {
-    printf("The default value %s is used.\n",
-           (Message() << default_value).GetString().c_str());
-    fflush(stdout);
-    return default_value;
-  }
-
-  return result;
-}
-
-// Reads and returns the string environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-const char* StringFromGTestEnv(const char* flag, const char* default_value) {
-  const String env_var = FlagToEnvVar(flag);
-  const char* const value = posix::GetEnv(env_var.c_str());
-  return value == NULL ? default_value : value;
-}
-
-}  // namespace internal
-}  // namespace testing
diff --git a/utils/unittest/googletest/gtest-test-part.cc b/utils/unittest/googletest/gtest-test-part.cc
deleted file mode 100644
index 1612780..0000000
--- a/utils/unittest/googletest/gtest-test-part.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: mheule@google.com (Markus Heule)
-//
-// The Google C++ Testing Framework (Google Test)
-
-#include "gtest/gtest-test-part.h"
-
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick is to
-// prevent a user from accidentally including gtest-internal-inl.h in
-// his code.
-#define GTEST_IMPLEMENTATION_ 1
-#include "gtest/internal/gtest-internal-inl.h"
-#undef GTEST_IMPLEMENTATION_
-
-namespace testing {
-
-using internal::GetUnitTestImpl;
-
-// Gets the summary of the failure message by omitting the stack trace
-// in it.
-internal::String TestPartResult::ExtractSummary(const char* message) {
-  const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
-  return stack_trace == NULL ? internal::String(message) :
-      internal::String(message, stack_trace - message);
-}
-
-// Prints a TestPartResult object.
-std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
-  return os
-      << result.file_name() << ":" << result.line_number() << ": "
-      << (result.type() == TestPartResult::kSuccess ? "Success" :
-          result.type() == TestPartResult::kFatalFailure ? "Fatal failure" :
-          "Non-fatal failure") << ":\n"
-      << result.message() << std::endl;
-}
-
-// Appends a TestPartResult to the array.
-void TestPartResultArray::Append(const TestPartResult& result) {
-  array_.push_back(result);
-}
-
-// Returns the TestPartResult at the given index (0-based).
-const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
-  if (index < 0 || index >= size()) {
-    printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
-    internal::posix::Abort();
-  }
-
-  return array_[index];
-}
-
-// Returns the number of TestPartResult objects in the array.
-int TestPartResultArray::size() const {
-  return static_cast<int>(array_.size());
-}
-
-namespace internal {
-
-HasNewFatalFailureHelper::HasNewFatalFailureHelper()
-    : has_new_fatal_failure_(false),
-      original_reporter_(GetUnitTestImpl()->
-                         GetTestPartResultReporterForCurrentThread()) {
-  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
-}
-
-HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
-  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
-      original_reporter_);
-}
-
-void HasNewFatalFailureHelper::ReportTestPartResult(
-    const TestPartResult& result) {
-  if (result.fatally_failed())
-    has_new_fatal_failure_ = true;
-  original_reporter_->ReportTestPartResult(result);
-}
-
-}  // namespace internal
-
-}  // namespace testing
diff --git a/utils/unittest/googletest/gtest.cc b/utils/unittest/googletest/gtest.cc
deleted file mode 100644
index eb5c68c..0000000
--- a/utils/unittest/googletest/gtest.cc
+++ /dev/null
@@ -1,4866 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// The Google C++ Testing Framework (Google Test)
-
-#include "gtest/gtest.h"
-#include "gtest/gtest-spi.h"
-
-#include <ctype.h>
-#include <math.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <wchar.h>
-#include <wctype.h>
-
-#include <algorithm>
-#include <ostream>  // NOLINT
-#include <sstream>
-#include <vector>
-
-#if GTEST_OS_LINUX
-
-// TODO(kenton@google.com): Use autoconf to detect availability of
-// gettimeofday().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-
-# include <fcntl.h>  // NOLINT
-# include <limits.h>  // NOLINT
-# include <sched.h>  // NOLINT
-// Declares vsnprintf().  This header is not available on Windows.
-# include <strings.h>  // NOLINT
-# include <sys/mman.h>  // NOLINT
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
-# include <string>
-
-#elif GTEST_OS_SYMBIAN
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-# include <sys/time.h>  // NOLINT
-
-#elif GTEST_OS_ZOS
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-# include <sys/time.h>  // NOLINT
-
-// On z/OS we additionally need strings.h for strcasecmp.
-# include <strings.h>  // NOLINT
-
-#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
-
-# include <windows.h>  // NOLINT
-
-#elif GTEST_OS_WINDOWS  // We are on Windows proper.
-
-# include <io.h>  // NOLINT
-# include <sys/timeb.h>  // NOLINT
-# include <sys/types.h>  // NOLINT
-# include <sys/stat.h>  // NOLINT
-
-# if GTEST_OS_WINDOWS_MINGW
-// MinGW has gettimeofday() but not _ftime64().
-// TODO(kenton@google.com): Use autoconf to detect availability of
-//   gettimeofday().
-// TODO(kenton@google.com): There are other ways to get the time on
-//   Windows, like GetTickCount() or GetSystemTimeAsFileTime().  MinGW
-//   supports these.  consider using them instead.
-#  define GTEST_HAS_GETTIMEOFDAY_ 1
-#  include <sys/time.h>  // NOLINT
-# endif  // GTEST_OS_WINDOWS_MINGW
-
-// cpplint thinks that the header is already included, so we want to
-// silence it.
-# include <windows.h>  // NOLINT
-
-#else
-
-// Assume other platforms have gettimeofday().
-// TODO(kenton@google.com): Use autoconf to detect availability of
-//   gettimeofday().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-
-// cpplint thinks that the header is already included, so we want to
-// silence it.
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
-
-#endif  // GTEST_OS_LINUX
-
-#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
-#endif
-
-#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h>  // NOLINT
-# include <netdb.h>  // NOLINT
-#endif
-
-// Indicates that this translation unit is part of Google Test's
-// implementation.  It must come before gtest-internal-inl.h is
-// included, or there will be a compiler error.  This trick is to
-// prevent a user from accidentally including gtest-internal-inl.h in
-// his code.
-#define GTEST_IMPLEMENTATION_ 1
-#include "gtest/internal/gtest-internal-inl.h"
-#undef GTEST_IMPLEMENTATION_
-
-#if GTEST_OS_WINDOWS
-# define vsnprintf _vsnprintf
-#endif  // GTEST_OS_WINDOWS
-
-namespace testing {
-
-using internal::CountIf;
-using internal::ForEach;
-using internal::GetElementOr;
-using internal::Shuffle;
-
-// Constants.
-
-// A test whose test case name or test name matches this filter is
-// disabled and not run.
-static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
-
-// A test case whose name matches this filter is considered a death
-// test case and will be run before test cases whose name doesn't
-// match this filter.
-static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
-
-// A test filter that matches everything.
-static const char kUniversalFilter[] = "*";
-
-// The default output file for XML output.
-static const char kDefaultOutputFile[] = "test_detail.xml";
-
-// The environment variable name for the test shard index.
-static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
-// The environment variable name for the total number of test shards.
-static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
-// The environment variable name for the test shard status file.
-static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
-
-namespace internal {
-
-// The text used in failure messages to indicate the start of the
-// stack trace.
-const char kStackTraceMarker[] = "\nStack trace:\n";
-
-// g_help_flag is true iff the --help flag or an equivalent form is
-// specified on the command line.
-bool g_help_flag = false;
-
-}  // namespace internal
-
-GTEST_DEFINE_bool_(
-    also_run_disabled_tests,
-    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
-    "Run disabled tests too, in addition to the tests normally being run.");
-
-GTEST_DEFINE_bool_(
-    break_on_failure,
-    internal::BoolFromGTestEnv("break_on_failure", false),
-    "True iff a failed assertion should be a debugger break-point.");
-
-GTEST_DEFINE_bool_(
-    catch_exceptions,
-    internal::BoolFromGTestEnv("catch_exceptions", true),
-    "True iff " GTEST_NAME_
-    " should catch exceptions and treat them as test failures.");
-
-GTEST_DEFINE_string_(
-    color,
-    internal::StringFromGTestEnv("color", "auto"),
-    "Whether to use colors in the output.  Valid values: yes, no, "
-    "and auto.  'auto' means to use colors if the output is "
-    "being sent to a terminal and the TERM environment variable "
-    "is set to xterm, xterm-color, xterm-256color, linux or cygwin.");
-
-GTEST_DEFINE_string_(
-    filter,
-    internal::StringFromGTestEnv("filter", kUniversalFilter),
-    "A colon-separated list of glob (not regex) patterns "
-    "for filtering the tests to run, optionally followed by a "
-    "'-' and a : separated list of negative patterns (tests to "
-    "exclude).  A test is run if it matches one of the positive "
-    "patterns and does not match any of the negative patterns.");
-
-GTEST_DEFINE_bool_(list_tests, false,
-                   "List all tests without running them.");
-
-GTEST_DEFINE_string_(
-    output,
-    internal::StringFromGTestEnv("output", ""),
-    "A format (currently must be \"xml\"), optionally followed "
-    "by a colon and an output file name or directory. A directory "
-    "is indicated by a trailing pathname separator. "
-    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
-    "If a directory is specified, output files will be created "
-    "within that directory, with file-names based on the test "
-    "executable's name and, if necessary, made unique by adding "
-    "digits.");
-
-GTEST_DEFINE_bool_(
-    print_time,
-    internal::BoolFromGTestEnv("print_time", true),
-    "True iff " GTEST_NAME_
-    " should display elapsed time in text output.");
-
-GTEST_DEFINE_int32_(
-    random_seed,
-    internal::Int32FromGTestEnv("random_seed", 0),
-    "Random number seed to use when shuffling test orders.  Must be in range "
-    "[1, 99999], or 0 to use a seed based on the current time.");
-
-GTEST_DEFINE_int32_(
-    repeat,
-    internal::Int32FromGTestEnv("repeat", 1),
-    "How many times to repeat each test.  Specify a negative number "
-    "for repeating forever.  Useful for shaking out flaky tests.");
-
-GTEST_DEFINE_bool_(
-    show_internal_stack_frames, false,
-    "True iff " GTEST_NAME_ " should include internal stack frames when "
-    "printing test failure stack traces.");
-
-GTEST_DEFINE_bool_(
-    shuffle,
-    internal::BoolFromGTestEnv("shuffle", false),
-    "True iff " GTEST_NAME_
-    " should randomize tests' order on every run.");
-
-GTEST_DEFINE_int32_(
-    stack_trace_depth,
-    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
-    "The maximum number of stack frames to print when an "
-    "assertion fails.  The valid range is 0 through 100, inclusive.");
-
-GTEST_DEFINE_string_(
-    stream_result_to,
-    internal::StringFromGTestEnv("stream_result_to", ""),
-    "This flag specifies the host name and the port number on which to stream "
-    "test results. Example: \"localhost:555\". The flag is effective only on "
-    "Linux.");
-
-GTEST_DEFINE_bool_(
-    throw_on_failure,
-    internal::BoolFromGTestEnv("throw_on_failure", false),
-    "When this flag is specified, a failed assertion will throw an exception "
-    "if exceptions are enabled or exit the program with a non-zero code "
-    "otherwise.");
-
-namespace internal {
-
-// Generates a random number from [0, range), using a Linear
-// Congruential Generator (LCG).  Crashes if 'range' is 0 or greater
-// than kMaxRange.
-UInt32 Random::Generate(UInt32 range) {
-  // These constants are the same as are used in glibc's rand(3).
-  state_ = (1103515245U*state_ + 12345U) % kMaxRange;
-
-  GTEST_CHECK_(range > 0)
-      << "Cannot generate a number in the range [0, 0).";
-  GTEST_CHECK_(range <= kMaxRange)
-      << "Generation of a number in [0, " << range << ") was requested, "
-      << "but this can only generate numbers in [0, " << kMaxRange << ").";
-
-  // Converting via modulus introduces a bit of downward bias, but
-  // it's simple, and a linear congruential generator isn't too good
-  // to begin with.
-  return state_ % range;
-}
-
-// GTestIsInitialized() returns true iff the user has initialized
-// Google Test.  Useful for catching the user mistake of not initializing
-// Google Test before calling RUN_ALL_TESTS().
-//
-// A user must call testing::InitGoogleTest() to initialize Google
-// Test.  g_init_gtest_count is set to the number of times
-// InitGoogleTest() has been called.  We don't protect this variable
-// under a mutex as it is only accessed in the main thread.
-int g_init_gtest_count = 0;
-static bool GTestIsInitialized() { return g_init_gtest_count != 0; }
-
-// Iterates over a vector of TestCases, keeping a running sum of the
-// results of calling a given int-returning method on each.
-// Returns the sum.
-static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
-                               int (TestCase::*method)() const) {
-  int sum = 0;
-  for (size_t i = 0; i < case_list.size(); i++) {
-    sum += (case_list[i]->*method)();
-  }
-  return sum;
-}
-
-// Returns true iff the test case passed.
-static bool TestCasePassed(const TestCase* test_case) {
-  return test_case->should_run() && test_case->Passed();
-}
-
-// Returns true iff the test case failed.
-static bool TestCaseFailed(const TestCase* test_case) {
-  return test_case->should_run() && test_case->Failed();
-}
-
-// Returns true iff test_case contains at least one test that should
-// run.
-static bool ShouldRunTestCase(const TestCase* test_case) {
-  return test_case->should_run();
-}
-
-// AssertHelper constructor.
-AssertHelper::AssertHelper(TestPartResult::Type type,
-                           const char* file,
-                           int line,
-                           const char* message)
-    : data_(new AssertHelperData(type, file, line, message)) {
-}
-
-AssertHelper::~AssertHelper() {
-  delete data_;
-}
-
-// Message assignment, for assertion streaming support.
-void AssertHelper::operator=(const Message& message) const {
-  UnitTest::GetInstance()->
-    AddTestPartResult(data_->type, data_->file, data_->line,
-                      AppendUserMessage(data_->message, message),
-                      UnitTest::GetInstance()->impl()
-                      ->CurrentOsStackTraceExceptTop(1)
-                      // Skips the stack frame for this function itself.
-                      );  // NOLINT
-}
-
-// Mutex for linked pointers.
-GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
-
-// Application pathname gotten in InitGoogleTest.
-String g_executable_path;
-
-// Returns the current application's name, removing directory path if that
-// is present.
-FilePath GetCurrentExecutableName() {
-  FilePath result;
-
-#if GTEST_OS_WINDOWS
-  result.Set(FilePath(g_executable_path).RemoveExtension("exe"));
-#else
-  result.Set(FilePath(g_executable_path));
-#endif  // GTEST_OS_WINDOWS
-
-  return result.RemoveDirectoryName();
-}
-
-// Functions for processing the gtest_output flag.
-
-// Returns the output format, or "" for normal printed output.
-String UnitTestOptions::GetOutputFormat() {
-  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
-  if (gtest_output_flag == NULL) return String("");
-
-  const char* const colon = strchr(gtest_output_flag, ':');
-  return (colon == NULL) ?
-      String(gtest_output_flag) :
-      String(gtest_output_flag, colon - gtest_output_flag);
-}
-
-// Returns the name of the requested output file, or the default if none
-// was explicitly specified.
-String UnitTestOptions::GetAbsolutePathToOutputFile() {
-  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
-  if (gtest_output_flag == NULL)
-    return String("");
-
-  const char* const colon = strchr(gtest_output_flag, ':');
-  if (colon == NULL)
-    return String(internal::FilePath::ConcatPaths(
-               internal::FilePath(
-                   UnitTest::GetInstance()->original_working_dir()),
-               internal::FilePath(kDefaultOutputFile)).ToString() );
-
-  internal::FilePath output_name(colon + 1);
-  if (!output_name.IsAbsolutePath())
-    // TODO(wan@google.com): on Windows \some\path is not an absolute
-    // path (as its meaning depends on the current drive), yet the
-    // following logic for turning it into an absolute path is wrong.
-    // Fix it.
-    output_name = internal::FilePath::ConcatPaths(
-        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
-        internal::FilePath(colon + 1));
-
-  if (!output_name.IsDirectory())
-    return output_name.ToString();
-
-  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
-      output_name, internal::GetCurrentExecutableName(),
-      GetOutputFormat().c_str()));
-  return result.ToString();
-}
-
-// Returns true iff the wildcard pattern matches the string.  The
-// first ':' or '\0' character in pattern marks the end of it.
-//
-// This recursive algorithm isn't very efficient, but is clear and
-// works well enough for matching test names, which are short.
-bool UnitTestOptions::PatternMatchesString(const char *pattern,
-                                           const char *str) {
-  switch (*pattern) {
-    case '\0':
-    case ':':  // Either ':' or '\0' marks the end of the pattern.
-      return *str == '\0';
-    case '?':  // Matches any single character.
-      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
-    case '*':  // Matches any string (possibly empty) of characters.
-      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
-          PatternMatchesString(pattern + 1, str);
-    default:  // Non-special character.  Matches itself.
-      return *pattern == *str &&
-          PatternMatchesString(pattern + 1, str + 1);
-  }
-}
-
-bool UnitTestOptions::MatchesFilter(const String& name, const char* filter) {
-  const char *cur_pattern = filter;
-  for (;;) {
-    if (PatternMatchesString(cur_pattern, name.c_str())) {
-      return true;
-    }
-
-    // Finds the next pattern in the filter.
-    cur_pattern = strchr(cur_pattern, ':');
-
-    // Returns if no more pattern can be found.
-    if (cur_pattern == NULL) {
-      return false;
-    }
-
-    // Skips the pattern separater (the ':' character).
-    cur_pattern++;
-  }
-}
-
-// TODO(keithray): move String function implementations to gtest-string.cc.
-
-// Returns true iff the user-specified filter matches the test case
-// name and the test name.
-bool UnitTestOptions::FilterMatchesTest(const String &test_case_name,
-                                        const String &test_name) {
-  const String& full_name = String::Format("%s.%s",
-                                           test_case_name.c_str(),
-                                           test_name.c_str());
-
-  // Split --gtest_filter at '-', if there is one, to separate into
-  // positive filter and negative filter portions
-  const char* const p = GTEST_FLAG(filter).c_str();
-  const char* const dash = strchr(p, '-');
-  String positive;
-  String negative;
-  if (dash == NULL) {
-    positive = GTEST_FLAG(filter).c_str();  // Whole string is a positive filter
-    negative = String("");
-  } else {
-    positive = String(p, dash - p);  // Everything up to the dash
-    negative = String(dash+1);       // Everything after the dash
-    if (positive.empty()) {
-      // Treat '-test1' as the same as '*-test1'
-      positive = kUniversalFilter;
-    }
-  }
-
-  // A filter is a colon-separated list of patterns.  It matches a
-  // test if any pattern in it matches the test.
-  return (MatchesFilter(full_name, positive.c_str()) &&
-          !MatchesFilter(full_name, negative.c_str()));
-}
-
-#if GTEST_HAS_SEH
-// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
-// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
-// This function is useful as an __except condition.
-int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
-  // Google Test should handle a SEH exception if:
-  //   1. the user wants it to, AND
-  //   2. this is not a breakpoint exception, AND
-  //   3. this is not a C++ exception (VC++ implements them via SEH,
-  //      apparently).
-  //
-  // SEH exception code for C++ exceptions.
-  // (see http://support.microsoft.com/kb/185294 for more information).
-  const DWORD kCxxExceptionCode = 0xe06d7363;
-
-  bool should_handle = true;
-
-  if (!GTEST_FLAG(catch_exceptions))
-    should_handle = false;
-  else if (exception_code == EXCEPTION_BREAKPOINT)
-    should_handle = false;
-  else if (exception_code == kCxxExceptionCode)
-    should_handle = false;
-
-  return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
-}
-#endif  // GTEST_HAS_SEH
-
-}  // namespace internal
-
-// The c'tor sets this object as the test part result reporter used by
-// Google Test.  The 'result' parameter specifies where to report the
-// results. Intercepts only failures from the current thread.
-ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
-    TestPartResultArray* result)
-    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
-      result_(result) {
-  Init();
-}
-
-// The c'tor sets this object as the test part result reporter used by
-// Google Test.  The 'result' parameter specifies where to report the
-// results.
-ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
-    InterceptMode intercept_mode, TestPartResultArray* result)
-    : intercept_mode_(intercept_mode),
-      result_(result) {
-  Init();
-}
-
-void ScopedFakeTestPartResultReporter::Init() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
-    old_reporter_ = impl->GetGlobalTestPartResultReporter();
-    impl->SetGlobalTestPartResultReporter(this);
-  } else {
-    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
-    impl->SetTestPartResultReporterForCurrentThread(this);
-  }
-}
-
-// The d'tor restores the test part result reporter used by Google Test
-// before.
-ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
-    impl->SetGlobalTestPartResultReporter(old_reporter_);
-  } else {
-    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
-  }
-}
-
-// Increments the test part result count and remembers the result.
-// This method is from the TestPartResultReporterInterface interface.
-void ScopedFakeTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
-  result_->Append(result);
-}
-
-namespace internal {
-
-// Returns the type ID of ::testing::Test.  We should always call this
-// instead of GetTypeId< ::testing::Test>() to get the type ID of
-// testing::Test.  This is to work around a suspected linker bug when
-// using Google Test as a framework on Mac OS X.  The bug causes
-// GetTypeId< ::testing::Test>() to return different values depending
-// on whether the call is from the Google Test framework itself or
-// from user test code.  GetTestTypeId() is guaranteed to always
-// return the same value, as it always calls GetTypeId<>() from the
-// gtest.cc, which is within the Google Test framework.
-TypeId GetTestTypeId() {
-  return GetTypeId<Test>();
-}
-
-// The value of GetTestTypeId() as seen from within the Google Test
-// library.  This is solely for testing GetTestTypeId().
-const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
-
-// This predicate-formatter checks that 'results' contains a test part
-// failure of the given type and that the failure message contains the
-// given substring.
-AssertionResult HasOneFailure(const char* /* results_expr */,
-                              const char* /* type_expr */,
-                              const char* /* substr_expr */,
-                              const TestPartResultArray& results,
-                              TestPartResult::Type type,
-                              const string& substr) {
-  const String expected(type == TestPartResult::kFatalFailure ?
-                        "1 fatal failure" :
-                        "1 non-fatal failure");
-  Message msg;
-  if (results.size() != 1) {
-    msg << "Expected: " << expected << "\n"
-        << "  Actual: " << results.size() << " failures";
-    for (int i = 0; i < results.size(); i++) {
-      msg << "\n" << results.GetTestPartResult(i);
-    }
-    return AssertionFailure() << msg;
-  }
-
-  const TestPartResult& r = results.GetTestPartResult(0);
-  if (r.type() != type) {
-    return AssertionFailure() << "Expected: " << expected << "\n"
-                              << "  Actual:\n"
-                              << r;
-  }
-
-  if (strstr(r.message(), substr.c_str()) == NULL) {
-    return AssertionFailure() << "Expected: " << expected << " containing \""
-                              << substr << "\"\n"
-                              << "  Actual:\n"
-                              << r;
-  }
-
-  return AssertionSuccess();
-}
-
-// The constructor of SingleFailureChecker remembers where to look up
-// test part results, what type of failure we expect, and what
-// substring the failure message should contain.
-SingleFailureChecker:: SingleFailureChecker(
-    const TestPartResultArray* results,
-    TestPartResult::Type type,
-    const string& substr)
-    : results_(results),
-      type_(type),
-      substr_(substr) {}
-
-// The destructor of SingleFailureChecker verifies that the given
-// TestPartResultArray contains exactly one failure that has the given
-// type and contains the given substring.  If that's not the case, a
-// non-fatal failure will be generated.
-SingleFailureChecker::~SingleFailureChecker() {
-  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
-}
-
-DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
-    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
-
-void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
-  unit_test_->current_test_result()->AddTestPartResult(result);
-  unit_test_->listeners()->repeater()->OnTestPartResult(result);
-}
-
-DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
-    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
-
-void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
-  unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
-}
-
-// Returns the global test part result reporter.
-TestPartResultReporterInterface*
-UnitTestImpl::GetGlobalTestPartResultReporter() {
-  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
-  return global_test_part_result_repoter_;
-}
-
-// Sets the global test part result reporter.
-void UnitTestImpl::SetGlobalTestPartResultReporter(
-    TestPartResultReporterInterface* reporter) {
-  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
-  global_test_part_result_repoter_ = reporter;
-}
-
-// Returns the test part result reporter for the current thread.
-TestPartResultReporterInterface*
-UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
-  return per_thread_test_part_result_reporter_.get();
-}
-
-// Sets the test part result reporter for the current thread.
-void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
-    TestPartResultReporterInterface* reporter) {
-  per_thread_test_part_result_reporter_.set(reporter);
-}
-
-// Gets the number of successful test cases.
-int UnitTestImpl::successful_test_case_count() const {
-  return CountIf(test_cases_, TestCasePassed);
-}
-
-// Gets the number of failed test cases.
-int UnitTestImpl::failed_test_case_count() const {
-  return CountIf(test_cases_, TestCaseFailed);
-}
-
-// Gets the number of all test cases.
-int UnitTestImpl::total_test_case_count() const {
-  return static_cast<int>(test_cases_.size());
-}
-
-// Gets the number of all test cases that contain at least one test
-// that should run.
-int UnitTestImpl::test_case_to_run_count() const {
-  return CountIf(test_cases_, ShouldRunTestCase);
-}
-
-// Gets the number of successful tests.
-int UnitTestImpl::successful_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
-}
-
-// Gets the number of failed tests.
-int UnitTestImpl::failed_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
-}
-
-// Gets the number of disabled tests.
-int UnitTestImpl::disabled_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
-}
-
-// Gets the number of all tests.
-int UnitTestImpl::total_test_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
-}
-
-// Gets the number of tests that should run.
-int UnitTestImpl::test_to_run_count() const {
-  return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
-}
-
-// Returns the current OS stack trace as a String.
-//
-// The maximum number of stack frames to be included is specified by
-// the gtest_stack_trace_depth flag.  The skip_count parameter
-// specifies the number of top frames to be skipped, which doesn't
-// count against the number of frames to be included.
-//
-// For example, if Foo() calls Bar(), which in turn calls
-// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
-// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
-String UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
-  (void)skip_count;
-  return String("");
-}
-
-// Returns the current time in milliseconds.
-TimeInMillis GetTimeInMillis() {
-#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
-  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
-  // http://analogous.blogspot.com/2005/04/epoch.html
-  const TimeInMillis kJavaEpochToWinFileTimeDelta =
-    static_cast<TimeInMillis>(116444736UL) * 100000UL;
-  const DWORD kTenthMicrosInMilliSecond = 10000;
-
-  SYSTEMTIME now_systime;
-  FILETIME now_filetime;
-  ULARGE_INTEGER now_int64;
-  // TODO(kenton@google.com): Shouldn't this just use
-  //   GetSystemTimeAsFileTime()?
-  GetSystemTime(&now_systime);
-  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
-    now_int64.LowPart = now_filetime.dwLowDateTime;
-    now_int64.HighPart = now_filetime.dwHighDateTime;
-    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
-      kJavaEpochToWinFileTimeDelta;
-    return now_int64.QuadPart;
-  }
-  return 0;
-#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
-  __timeb64 now;
-
-# ifdef _MSC_VER
-
-  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
-  // (deprecated function) there.
-  // TODO(kenton@google.com): Use GetTickCount()?  Or use
-  //   SystemTimeToFileTime()
-#  pragma warning(push)          // Saves the current warning state.
-#  pragma warning(disable:4996)  // Temporarily disables warning 4996.
-  _ftime64(&now);
-#  pragma warning(pop)           // Restores the warning state.
-# else
-
-  _ftime64(&now);
-
-# endif  // _MSC_VER
-
-  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
-#elif GTEST_HAS_GETTIMEOFDAY_
-  struct timeval now;
-  gettimeofday(&now, NULL);
-  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
-#else
-# error "Don't know how to get the current time on your system."
-#endif
-}
-
-// Utilities
-
-// class String
-
-// Returns the input enclosed in double quotes if it's not NULL;
-// otherwise returns "(null)".  For example, "\"Hello\"" is returned
-// for input "Hello".
-//
-// This is useful for printing a C string in the syntax of a literal.
-//
-// Known issue: escape sequences are not handled yet.
-String String::ShowCStringQuoted(const char* c_str) {
-  return c_str ? String::Format("\"%s\"", c_str) : String("(null)");
-}
-
-// Copies at most length characters from str into a newly-allocated
-// piece of memory of size length+1.  The memory is allocated with new[].
-// A terminating null byte is written to the memory, and a pointer to it
-// is returned.  If str is NULL, NULL is returned.
-static char* CloneString(const char* str, size_t length) {
-  if (str == NULL) {
-    return NULL;
-  } else {
-    char* const clone = new char[length + 1];
-    posix::StrNCpy(clone, str, length);
-    clone[length] = '\0';
-    return clone;
-  }
-}
-
-// Clones a 0-terminated C string, allocating memory using new.  The
-// caller is responsible for deleting[] the return value.  Returns the
-// cloned string, or NULL if the input is NULL.
-const char * String::CloneCString(const char* c_str) {
-  return (c_str == NULL) ?
-                    NULL : CloneString(c_str, strlen(c_str));
-}
-
-#if GTEST_OS_WINDOWS_MOBILE
-// Creates a UTF-16 wide string from the given ANSI string, allocating
-// memory using new. The caller is responsible for deleting the return
-// value using delete[]. Returns the wide string, or NULL if the
-// input is NULL.
-LPCWSTR String::AnsiToUtf16(const char* ansi) {
-  if (!ansi) return NULL;
-  const int length = strlen(ansi);
-  const int unicode_length =
-      MultiByteToWideChar(CP_ACP, 0, ansi, length,
-                          NULL, 0);
-  WCHAR* unicode = new WCHAR[unicode_length + 1];
-  MultiByteToWideChar(CP_ACP, 0, ansi, length,
-                      unicode, unicode_length);
-  unicode[unicode_length] = 0;
-  return unicode;
-}
-
-// Creates an ANSI string from the given wide string, allocating
-// memory using new. The caller is responsible for deleting the return
-// value using delete[]. Returns the ANSI string, or NULL if the
-// input is NULL.
-const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
-  if (!utf16_str) return NULL;
-  const int ansi_length =
-      WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
-                          NULL, 0, NULL, NULL);
-  char* ansi = new char[ansi_length + 1];
-  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
-                      ansi, ansi_length, NULL, NULL);
-  ansi[ansi_length] = 0;
-  return ansi;
-}
-
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-// Compares two C strings.  Returns true iff they have the same content.
-//
-// Unlike strcmp(), this function can handle NULL argument(s).  A NULL
-// C string is considered different to any non-NULL C string,
-// including the empty string.
-bool String::CStringEquals(const char * lhs, const char * rhs) {
-  if ( lhs == NULL ) return rhs == NULL;
-
-  if ( rhs == NULL ) return false;
-
-  return strcmp(lhs, rhs) == 0;
-}
-
-#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
-
-// Converts an array of wide chars to a narrow string using the UTF-8
-// encoding, and streams the result to the given Message object.
-static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
-                                     Message* msg) {
-  // TODO(wan): consider allowing a testing::String object to
-  // contain '\0'.  This will make it behave more like std::string,
-  // and will allow ToUtf8String() to return the correct encoding
-  // for '\0' s.t. we can get rid of the conditional here (and in
-  // several other places).
-  for (size_t i = 0; i != length; ) {  // NOLINT
-    if (wstr[i] != L'\0') {
-      *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
-      while (i != length && wstr[i] != L'\0')
-        i++;
-    } else {
-      *msg << '\0';
-      i++;
-    }
-  }
-}
-
-#endif  // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
-
-}  // namespace internal
-
-#if GTEST_HAS_STD_WSTRING
-// Converts the given wide string to a narrow string using the UTF-8
-// encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::std::wstring& wstr) {
-  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
-  return *this;
-}
-#endif  // GTEST_HAS_STD_WSTRING
-
-#if GTEST_HAS_GLOBAL_WSTRING
-// Converts the given wide string to a narrow string using the UTF-8
-// encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::wstring& wstr) {
-  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
-  return *this;
-}
-#endif  // GTEST_HAS_GLOBAL_WSTRING
-
-// AssertionResult constructors.
-// Used in EXPECT_TRUE/FALSE(assertion_result).
-AssertionResult::AssertionResult(const AssertionResult& other)
-    : success_(other.success_),
-      message_(other.message_.get() != NULL ?
-               new ::std::string(*other.message_) :
-               static_cast< ::std::string*>(NULL)) {
-}
-
-// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-AssertionResult AssertionResult::operator!() const {
-  AssertionResult negation(!success_);
-  if (message_.get() != NULL)
-    negation << *message_;
-  return negation;
-}
-
-// Makes a successful assertion result.
-AssertionResult AssertionSuccess() {
-  return AssertionResult(true);
-}
-
-// Makes a failed assertion result.
-AssertionResult AssertionFailure() {
-  return AssertionResult(false);
-}
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << message.
-AssertionResult AssertionFailure(const Message& message) {
-  return AssertionFailure() << message;
-}
-
-namespace internal {
-
-// Constructs and returns the message for an equality assertion
-// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
-//
-// The first four parameters are the expressions used in the assertion
-// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
-// where foo is 5 and bar is 6, we have:
-//
-//   expected_expression: "foo"
-//   actual_expression:   "bar"
-//   expected_value:      "5"
-//   actual_value:        "6"
-//
-// The ignoring_case parameter is true iff the assertion is a
-// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
-// be inserted into the message.
-AssertionResult EqFailure(const char* expected_expression,
-                          const char* actual_expression,
-                          const String& expected_value,
-                          const String& actual_value,
-                          bool ignoring_case) {
-  Message msg;
-  msg << "Value of: " << actual_expression;
-  if (actual_value != actual_expression) {
-    msg << "\n  Actual: " << actual_value;
-  }
-
-  msg << "\nExpected: " << expected_expression;
-  if (ignoring_case) {
-    msg << " (ignoring case)";
-  }
-  if (expected_value != expected_expression) {
-    msg << "\nWhich is: " << expected_value;
-  }
-
-  return AssertionFailure() << msg;
-}
-
-// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
-String GetBoolAssertionFailureMessage(const AssertionResult& assertion_result,
-                                      const char* expression_text,
-                                      const char* actual_predicate_value,
-                                      const char* expected_predicate_value) {
-  const char* actual_message = assertion_result.message();
-  Message msg;
-  msg << "Value of: " << expression_text
-      << "\n  Actual: " << actual_predicate_value;
-  if (actual_message[0] != '\0')
-    msg << " (" << actual_message << ")";
-  msg << "\nExpected: " << expected_predicate_value;
-  return msg.GetString();
-}
-
-// Helper function for implementing ASSERT_NEAR.
-AssertionResult DoubleNearPredFormat(const char* expr1,
-                                     const char* expr2,
-                                     const char* abs_error_expr,
-                                     double val1,
-                                     double val2,
-                                     double abs_error) {
-  const double diff = fabs(val1 - val2);
-  if (diff <= abs_error) return AssertionSuccess();
-
-  // TODO(wan): do not print the value of an expression if it's
-  // already a literal.
-  return AssertionFailure()
-      << "The difference between " << expr1 << " and " << expr2
-      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
-      << expr1 << " evaluates to " << val1 << ",\n"
-      << expr2 << " evaluates to " << val2 << ", and\n"
-      << abs_error_expr << " evaluates to " << abs_error << ".";
-}
-
-
-// Helper template for implementing FloatLE() and DoubleLE().
-template <typename RawType>
-AssertionResult FloatingPointLE(const char* expr1,
-                                const char* expr2,
-                                RawType val1,
-                                RawType val2) {
-  // Returns success if val1 is less than val2,
-  if (val1 < val2) {
-    return AssertionSuccess();
-  }
-
-  // or if val1 is almost equal to val2.
-  const FloatingPoint<RawType> lhs(val1), rhs(val2);
-  if (lhs.AlmostEquals(rhs)) {
-    return AssertionSuccess();
-  }
-
-  // Note that the above two checks will both fail if either val1 or
-  // val2 is NaN, as the IEEE floating-point standard requires that
-  // any predicate involving a NaN must return false.
-
-  ::std::stringstream val1_ss;
-  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-          << val1;
-
-  ::std::stringstream val2_ss;
-  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-          << val2;
-
-  return AssertionFailure()
-      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
-      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
-      << StringStreamToString(&val2_ss);
-}
-
-}  // namespace internal
-
-// Asserts that val1 is less than, or almost equal to, val2.  Fails
-// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char* expr1, const char* expr2,
-                        float val1, float val2) {
-  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
-}
-
-// Asserts that val1 is less than, or almost equal to, val2.  Fails
-// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char* expr1, const char* expr2,
-                         double val1, double val2) {
-  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
-}
-
-namespace internal {
-
-// The helper function for {ASSERT|EXPECT}_EQ with int or enum
-// arguments.
-AssertionResult CmpHelperEQ(const char* expected_expression,
-                            const char* actual_expression,
-                            BiggestInt expected,
-                            BiggestInt actual) {
-  if (expected == actual) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   FormatForComparisonFailureMessage(expected, actual),
-                   FormatForComparisonFailureMessage(actual, expected),
-                   false);
-}
-
-// A macro for implementing the helper functions needed to implement
-// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
-// just to avoid copy-and-paste of similar code.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   BiggestInt val1, BiggestInt val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return AssertionFailure() \
-        << "Expected: (" << expr1 << ") " #op " (" << expr2\
-        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
-        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
-  }\
-}
-
-// Implements the helper function for {ASSERT|EXPECT}_NE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(NE, !=)
-// Implements the helper function for {ASSERT|EXPECT}_LE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LE, <=)
-// Implements the helper function for {ASSERT|EXPECT}_LT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LT, < )
-// Implements the helper function for {ASSERT|EXPECT}_GE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GE, >=)
-// Implements the helper function for {ASSERT|EXPECT}_GT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GT, > )
-
-#undef GTEST_IMPL_CMP_HELPER_
-
-// The helper function for {ASSERT|EXPECT}_STREQ.
-AssertionResult CmpHelperSTREQ(const char* expected_expression,
-                               const char* actual_expression,
-                               const char* expected,
-                               const char* actual) {
-  if (String::CStringEquals(expected, actual)) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   String::ShowCStringQuoted(expected),
-                   String::ShowCStringQuoted(actual),
-                   false);
-}
-
-// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
-AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
-                                   const char* actual_expression,
-                                   const char* expected,
-                                   const char* actual) {
-  if (String::CaseInsensitiveCStringEquals(expected, actual)) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   String::ShowCStringQuoted(expected),
-                   String::ShowCStringQuoted(actual),
-                   true);
-}
-
-// The helper function for {ASSERT|EXPECT}_STRNE.
-AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const char* s1,
-                               const char* s2) {
-  if (!String::CStringEquals(s1, s2)) {
-    return AssertionSuccess();
-  } else {
-    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                              << s2_expression << "), actual: \""
-                              << s1 << "\" vs \"" << s2 << "\"";
-  }
-}
-
-// The helper function for {ASSERT|EXPECT}_STRCASENE.
-AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
-                                   const char* s2_expression,
-                                   const char* s1,
-                                   const char* s2) {
-  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
-    return AssertionSuccess();
-  } else {
-    return AssertionFailure()
-        << "Expected: (" << s1_expression << ") != ("
-        << s2_expression << ") (ignoring case), actual: \""
-        << s1 << "\" vs \"" << s2 << "\"";
-  }
-}
-
-}  // namespace internal
-
-namespace {
-
-// Helper functions for implementing IsSubString() and IsNotSubstring().
-
-// This group of overloaded functions return true iff needle is a
-// substring of haystack.  NULL is considered a substring of itself
-// only.
-
-bool IsSubstringPred(const char* needle, const char* haystack) {
-  if (needle == NULL || haystack == NULL)
-    return needle == haystack;
-
-  return strstr(haystack, needle) != NULL;
-}
-
-bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
-  if (needle == NULL || haystack == NULL)
-    return needle == haystack;
-
-  return wcsstr(haystack, needle) != NULL;
-}
-
-// StringType here can be either ::std::string or ::std::wstring.
-template <typename StringType>
-bool IsSubstringPred(const StringType& needle,
-                     const StringType& haystack) {
-  return haystack.find(needle) != StringType::npos;
-}
-
-// This function implements either IsSubstring() or IsNotSubstring(),
-// depending on the value of the expected_to_be_substring parameter.
-// StringType here can be const char*, const wchar_t*, ::std::string,
-// or ::std::wstring.
-template <typename StringType>
-AssertionResult IsSubstringImpl(
-    bool expected_to_be_substring,
-    const char* needle_expr, const char* haystack_expr,
-    const StringType& needle, const StringType& haystack) {
-  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
-    return AssertionSuccess();
-
-  const bool is_wide_string = sizeof(needle[0]) > 1;
-  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
-  return AssertionFailure()
-      << "Value of: " << needle_expr << "\n"
-      << "  Actual: " << begin_string_quote << needle << "\"\n"
-      << "Expected: " << (expected_to_be_substring ? "" : "not ")
-      << "a substring of " << haystack_expr << "\n"
-      << "Which is: " << begin_string_quote << haystack << "\"";
-}
-
-}  // namespace
-
-// IsSubstring() and IsNotSubstring() check whether needle is a
-// substring of haystack (NULL is considered a substring of itself
-// only), and return an appropriate error message when they fail.
-
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-
-#if GTEST_HAS_STD_WSTRING
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-#endif  // GTEST_HAS_STD_WSTRING
-
-namespace internal {
-
-#if GTEST_OS_WINDOWS
-
-namespace {
-
-// Helper function for IsHRESULT{SuccessFailure} predicates
-AssertionResult HRESULTFailureHelper(const char* expr,
-                                     const char* expected,
-                                     long hr) {  // NOLINT
-# if GTEST_OS_WINDOWS_MOBILE
-
-  // Windows CE doesn't support FormatMessage.
-  const char error_text[] = "";
-
-# else
-
-  // Looks up the human-readable system message for the HRESULT code
-  // and since we're not passing any params to FormatMessage, we don't
-  // want inserts expanded.
-  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
-                       FORMAT_MESSAGE_IGNORE_INSERTS;
-  const DWORD kBufSize = 4096;  // String::Format can't exceed this length.
-  // Gets the system's human readable message string for this HRESULT.
-  char error_text[kBufSize] = { '\0' };
-  DWORD message_length = ::FormatMessageA(kFlags,
-                                          0,  // no source, we're asking system
-                                          hr,  // the error
-                                          0,  // no line width restrictions
-                                          error_text,  // output buffer
-                                          kBufSize,  // buf size
-                                          NULL);  // no arguments for inserts
-  // Trims tailing white space (FormatMessage leaves a trailing cr-lf)
-  for (; message_length && IsSpace(error_text[message_length - 1]);
-          --message_length) {
-    error_text[message_length - 1] = '\0';
-  }
-
-# endif  // GTEST_OS_WINDOWS_MOBILE
-
-  const String error_hex(String::Format("0x%08X ", hr));
-  return ::testing::AssertionFailure()
-      << "Expected: " << expr << " " << expected << ".\n"
-      << "  Actual: " << error_hex << error_text << "\n";
-}
-
-}  // namespace
-
-AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
-  if (SUCCEEDED(hr)) {
-    return AssertionSuccess();
-  }
-  return HRESULTFailureHelper(expr, "succeeds", hr);
-}
-
-AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
-  if (FAILED(hr)) {
-    return AssertionSuccess();
-  }
-  return HRESULTFailureHelper(expr, "fails", hr);
-}
-
-#endif  // GTEST_OS_WINDOWS
-
-// Utility functions for encoding Unicode text (wide strings) in
-// UTF-8.
-
-// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
-// like this:
-//
-// Code-point length   Encoding
-//   0 -  7 bits       0xxxxxxx
-//   8 - 11 bits       110xxxxx 10xxxxxx
-//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
-//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-
-// The maximum code-point a one-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) <<  7) - 1;
-
-// The maximum code-point a two-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
-
-// The maximum code-point a three-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
-
-// The maximum code-point a four-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
-
-// Chops off the n lowest bits from a bit pattern.  Returns the n
-// lowest bits.  As a side effect, the original bit pattern will be
-// shifted to the right by n bits.
-inline UInt32 ChopLowBits(UInt32* bits, int n) {
-  const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
-  *bits >>= n;
-  return low_bits;
-}
-
-// Converts a Unicode code point to a narrow string in UTF-8 encoding.
-// code_point parameter is of type UInt32 because wchar_t may not be
-// wide enough to contain a code point.
-// The output buffer str must containt at least 32 characters.
-// The function returns the address of the output buffer.
-// If the code_point is not a valid Unicode code point
-// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output
-// as '(Invalid Unicode 0xXXXXXXXX)'.
-char* CodePointToUtf8(UInt32 code_point, char* str) {
-  if (code_point <= kMaxCodePoint1) {
-    str[1] = '\0';
-    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
-  } else if (code_point <= kMaxCodePoint2) {
-    str[2] = '\0';
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx
-  } else if (code_point <= kMaxCodePoint3) {
-    str[3] = '\0';
-    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx
-  } else if (code_point <= kMaxCodePoint4) {
-    str[4] = '\0';
-    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx
-  } else {
-    // The longest string String::Format can produce when invoked
-    // with these parameters is 28 character long (not including
-    // the terminating nul character). We are asking for 32 character
-    // buffer just in case. This is also enough for strncpy to
-    // null-terminate the destination string.
-    posix::StrNCpy(
-        str, String::Format("(Invalid Unicode 0x%X)", code_point).c_str(), 32);
-    str[31] = '\0';  // Makes sure no change in the format to strncpy leaves
-                     // the result unterminated.
-  }
-  return str;
-}
-
-// The following two functions only make sense if the system
-// uses UTF-16 for wide string encoding. All supported systems
-// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
-
-// Determines if the arguments constitute UTF-16 surrogate pair
-// and thus should be combined into a single Unicode code point
-// using CreateCodePointFromUtf16SurrogatePair.
-inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
-  return sizeof(wchar_t) == 2 &&
-      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
-}
-
-// Creates a Unicode code point from UTF16 surrogate pair.
-inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
-                                                    wchar_t second) {
-  const UInt32 mask = (1 << 10) - 1;
-  return (sizeof(wchar_t) == 2) ?
-      (((first & mask) << 10) | (second & mask)) + 0x10000 :
-      // This function should not be called when the condition is
-      // false, but we provide a sensible default in case it is.
-      static_cast<UInt32>(first);
-}
-
-// Converts a wide string to a narrow string in UTF-8 encoding.
-// The wide string is assumed to have the following encoding:
-//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
-//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
-// Parameter str points to a null-terminated wide string.
-// Parameter num_chars may additionally limit the number
-// of wchar_t characters processed. -1 is used when the entire string
-// should be processed.
-// If the string contains code points that are not valid Unicode code points
-// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
-// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
-// and contains invalid UTF-16 surrogate pairs, values in those pairs
-// will be encoded as individual Unicode characters from Basic Normal Plane.
-String WideStringToUtf8(const wchar_t* str, int num_chars) {
-  if (num_chars == -1)
-    num_chars = static_cast<int>(wcslen(str));
-
-  ::std::stringstream stream;
-  for (int i = 0; i < num_chars; ++i) {
-    UInt32 unicode_code_point;
-
-    if (str[i] == L'\0') {
-      break;
-    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
-      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
-                                                                 str[i + 1]);
-      i++;
-    } else {
-      unicode_code_point = static_cast<UInt32>(str[i]);
-    }
-
-    char buffer[32];  // CodePointToUtf8 requires a buffer this big.
-    stream << CodePointToUtf8(unicode_code_point, buffer);
-  }
-  return StringStreamToString(&stream);
-}
-
-// Converts a wide C string to a String using the UTF-8 encoding.
-// NULL will be converted to "(null)".
-String String::ShowWideCString(const wchar_t * wide_c_str) {
-  if (wide_c_str == NULL) return String("(null)");
-
-  return String(internal::WideStringToUtf8(wide_c_str, -1).c_str());
-}
-
-// Similar to ShowWideCString(), except that this function encloses
-// the converted string in double quotes.
-String String::ShowWideCStringQuoted(const wchar_t* wide_c_str) {
-  if (wide_c_str == NULL) return String("(null)");
-
-  return String::Format("L\"%s\"",
-                        String::ShowWideCString(wide_c_str).c_str());
-}
-
-// Compares two wide C strings.  Returns true iff they have the same
-// content.
-//
-// Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
-// C string is considered different to any non-NULL C string,
-// including the empty string.
-bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
-  if (lhs == NULL) return rhs == NULL;
-
-  if (rhs == NULL) return false;
-
-  return wcscmp(lhs, rhs) == 0;
-}
-
-// Helper function for *_STREQ on wide strings.
-AssertionResult CmpHelperSTREQ(const char* expected_expression,
-                               const char* actual_expression,
-                               const wchar_t* expected,
-                               const wchar_t* actual) {
-  if (String::WideCStringEquals(expected, actual)) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   String::ShowWideCStringQuoted(expected),
-                   String::ShowWideCStringQuoted(actual),
-                   false);
-}
-
-// Helper function for *_STRNE on wide strings.
-AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const wchar_t* s1,
-                               const wchar_t* s2) {
-  if (!String::WideCStringEquals(s1, s2)) {
-    return AssertionSuccess();
-  }
-
-  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                            << s2_expression << "), actual: "
-                            << String::ShowWideCStringQuoted(s1)
-                            << " vs " << String::ShowWideCStringQuoted(s2);
-}
-
-// Compares two C strings, ignoring case.  Returns true iff they have
-// the same content.
-//
-// Unlike strcasecmp(), this function can handle NULL argument(s).  A
-// NULL C string is considered different to any non-NULL C string,
-// including the empty string.
-bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
-  if (lhs == NULL)
-    return rhs == NULL;
-  if (rhs == NULL)
-    return false;
-  return posix::StrCaseCmp(lhs, rhs) == 0;
-}
-
-  // Compares two wide C strings, ignoring case.  Returns true iff they
-  // have the same content.
-  //
-  // Unlike wcscasecmp(), this function can handle NULL argument(s).
-  // A NULL C string is considered different to any non-NULL wide C string,
-  // including the empty string.
-  // NB: The implementations on different platforms slightly differ.
-  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
-  // environment variable. On GNU platform this method uses wcscasecmp
-  // which compares according to LC_CTYPE category of the current locale.
-  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
-  // current locale.
-bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
-                                              const wchar_t* rhs) {
-  if (lhs == NULL) return rhs == NULL;
-
-  if (rhs == NULL) return false;
-
-#if GTEST_OS_WINDOWS
-  return _wcsicmp(lhs, rhs) == 0;
-#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
-  return wcscasecmp(lhs, rhs) == 0;
-#else
-  // Android, Mac OS X and Cygwin don't define wcscasecmp.
-  // Other unknown OSes may not define it either.
-  wint_t left, right;
-  do {
-    left = towlower(*lhs++);
-    right = towlower(*rhs++);
-  } while (left && left == right);
-  return left == right;
-#endif  // OS selector
-}
-
-// Compares this with another String.
-// Returns < 0 if this is less than rhs, 0 if this is equal to rhs, or > 0
-// if this is greater than rhs.
-int String::Compare(const String & rhs) const {
-  const char* const lhs_c_str = c_str();
-  const char* const rhs_c_str = rhs.c_str();
-
-  if (lhs_c_str == NULL) {
-    return rhs_c_str == NULL ? 0 : -1;  // NULL < anything except NULL
-  } else if (rhs_c_str == NULL) {
-    return 1;
-  }
-
-  const size_t shorter_str_len =
-      length() <= rhs.length() ? length() : rhs.length();
-  for (size_t i = 0; i != shorter_str_len; i++) {
-    if (lhs_c_str[i] < rhs_c_str[i]) {
-      return -1;
-    } else if (lhs_c_str[i] > rhs_c_str[i]) {
-      return 1;
-    }
-  }
-  return (length() < rhs.length()) ? -1 :
-      (length() > rhs.length()) ? 1 : 0;
-}
-
-// Returns true iff this String ends with the given suffix.  *Any*
-// String is considered to end with a NULL or empty suffix.
-bool String::EndsWith(const char* suffix) const {
-  if (suffix == NULL || CStringEquals(suffix, "")) return true;
-
-  if (c_str() == NULL) return false;
-
-  const size_t this_len = strlen(c_str());
-  const size_t suffix_len = strlen(suffix);
-  return (this_len >= suffix_len) &&
-         CStringEquals(c_str() + this_len - suffix_len, suffix);
-}
-
-// Returns true iff this String ends with the given suffix, ignoring case.
-// Any String is considered to end with a NULL or empty suffix.
-bool String::EndsWithCaseInsensitive(const char* suffix) const {
-  if (suffix == NULL || CStringEquals(suffix, "")) return true;
-
-  if (c_str() == NULL) return false;
-
-  const size_t this_len = strlen(c_str());
-  const size_t suffix_len = strlen(suffix);
-  return (this_len >= suffix_len) &&
-         CaseInsensitiveCStringEquals(c_str() + this_len - suffix_len, suffix);
-}
-
-// Formats a list of arguments to a String, using the same format
-// spec string as for printf.
-//
-// We do not use the StringPrintf class as it is not universally
-// available.
-//
-// The result is limited to 4096 characters (including the tailing 0).
-// If 4096 characters are not enough to format the input, or if
-// there's an error, "<formatting error or buffer exceeded>" is
-// returned.
-String String::Format(const char * format, ...) {
-  va_list args;
-  va_start(args, format);
-
-  char buffer[4096];
-  const int kBufferSize = sizeof(buffer)/sizeof(buffer[0]);
-
-  // MSVC 8 deprecates vsnprintf(), so we want to suppress warning
-  // 4996 (deprecated function) there.
-#ifdef _MSC_VER  // We are using MSVC.
-# pragma warning(push)          // Saves the current warning state.
-# pragma warning(disable:4996)  // Temporarily disables warning 4996.
-
-  const int size = vsnprintf(buffer, kBufferSize, format, args);
-
-# pragma warning(pop)           // Restores the warning state.
-#else  // We are not using MSVC.
-  const int size = vsnprintf(buffer, kBufferSize, format, args);
-#endif  // _MSC_VER
-  va_end(args);
-
-  // vsnprintf()'s behavior is not portable.  When the buffer is not
-  // big enough, it returns a negative value in MSVC, and returns the
-  // needed buffer size on Linux.  When there is an output error, it
-  // always returns a negative value.  For simplicity, we lump the two
-  // error cases together.
-  if (size < 0 || size >= kBufferSize) {
-    return String("<formatting error or buffer exceeded>");
-  } else {
-    return String(buffer, size);
-  }
-}
-
-// Converts the buffer in a stringstream to a String, converting NUL
-// bytes to "\\0" along the way.
-String StringStreamToString(::std::stringstream* ss) {
-  const ::std::string& str = ss->str();
-  const char* const start = str.c_str();
-  const char* const end = start + str.length();
-
-  // We need to use a helper stringstream to do this transformation
-  // because String doesn't support push_back().
-  ::std::stringstream helper;
-  for (const char* ch = start; ch != end; ++ch) {
-    if (*ch == '\0') {
-      helper << "\\0";  // Replaces NUL with "\\0";
-    } else {
-      helper.put(*ch);
-    }
-  }
-
-  return String(helper.str().c_str());
-}
-
-// Appends the user-supplied message to the Google-Test-generated message.
-String AppendUserMessage(const String& gtest_msg,
-                         const Message& user_msg) {
-  // Appends the user message if it's non-empty.
-  const String user_msg_string = user_msg.GetString();
-  if (user_msg_string.empty()) {
-    return gtest_msg;
-  }
-
-  Message msg;
-  msg << gtest_msg << "\n" << user_msg_string;
-
-  return msg.GetString();
-}
-
-}  // namespace internal
-
-// class TestResult
-
-// Creates an empty TestResult.
-TestResult::TestResult()
-    : death_test_count_(0),
-      elapsed_time_(0) {
-}
-
-// D'tor.
-TestResult::~TestResult() {
-}
-
-// Returns the i-th test part result among all the results. i can
-// range from 0 to total_part_count() - 1. If i is not in that range,
-// aborts the program.
-const TestPartResult& TestResult::GetTestPartResult(int i) const {
-  if (i < 0 || i >= total_part_count())
-    internal::posix::Abort();
-  return test_part_results_.at(i);
-}
-
-// Returns the i-th test property. i can range from 0 to
-// test_property_count() - 1. If i is not in that range, aborts the
-// program.
-const TestProperty& TestResult::GetTestProperty(int i) const {
-  if (i < 0 || i >= test_property_count())
-    internal::posix::Abort();
-  return test_properties_.at(i);
-}
-
-// Clears the test part results.
-void TestResult::ClearTestPartResults() {
-  test_part_results_.clear();
-}
-
-// Adds a test part result to the list.
-void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
-  test_part_results_.push_back(test_part_result);
-}
-
-// Adds a test property to the list. If a property with the same key as the
-// supplied property is already represented, the value of this test_property
-// replaces the old value for that key.
-void TestResult::RecordProperty(const TestProperty& test_property) {
-  if (!ValidateTestProperty(test_property)) {
-    return;
-  }
-  internal::MutexLock lock(&test_properites_mutex_);
-  const std::vector<TestProperty>::iterator property_with_matching_key =
-      std::find_if(test_properties_.begin(), test_properties_.end(),
-                   internal::TestPropertyKeyIs(test_property.key()));
-  if (property_with_matching_key == test_properties_.end()) {
-    test_properties_.push_back(test_property);
-    return;
-  }
-  property_with_matching_key->SetValue(test_property.value());
-}
-
-// Adds a failure if the key is a reserved attribute of Google Test
-// testcase tags.  Returns true if the property is valid.
-bool TestResult::ValidateTestProperty(const TestProperty& test_property) {
-  internal::String key(test_property.key());
-  if (key == "name" || key == "status" || key == "time" || key == "classname") {
-    ADD_FAILURE()
-        << "Reserved key used in RecordProperty(): "
-        << key
-        << " ('name', 'status', 'time', and 'classname' are reserved by "
-        << GTEST_NAME_ << ")";
-    return false;
-  }
-  return true;
-}
-
-// Clears the object.
-void TestResult::Clear() {
-  test_part_results_.clear();
-  test_properties_.clear();
-  death_test_count_ = 0;
-  elapsed_time_ = 0;
-}
-
-// Returns true iff the test failed.
-bool TestResult::Failed() const {
-  for (int i = 0; i < total_part_count(); ++i) {
-    if (GetTestPartResult(i).failed())
-      return true;
-  }
-  return false;
-}
-
-// Returns true iff the test part fatally failed.
-static bool TestPartFatallyFailed(const TestPartResult& result) {
-  return result.fatally_failed();
-}
-
-// Returns true iff the test fatally failed.
-bool TestResult::HasFatalFailure() const {
-  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
-}
-
-// Returns true iff the test part non-fatally failed.
-static bool TestPartNonfatallyFailed(const TestPartResult& result) {
-  return result.nonfatally_failed();
-}
-
-// Returns true iff the test has a non-fatal failure.
-bool TestResult::HasNonfatalFailure() const {
-  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
-}
-
-// Gets the number of all test parts.  This is the sum of the number
-// of successful test parts and the number of failed test parts.
-int TestResult::total_part_count() const {
-  return static_cast<int>(test_part_results_.size());
-}
-
-// Returns the number of the test properties.
-int TestResult::test_property_count() const {
-  return static_cast<int>(test_properties_.size());
-}
-
-// class Test
-
-// Creates a Test object.
-
-// The c'tor saves the values of all Google Test flags.
-Test::Test()
-    : gtest_flag_saver_(new internal::GTestFlagSaver) {
-}
-
-// The d'tor restores the values of all Google Test flags.
-Test::~Test() {
-  delete gtest_flag_saver_;
-}
-
-// Sets up the test fixture.
-//
-// A sub-class may override this.
-void Test::SetUp() {
-}
-
-// Tears down the test fixture.
-//
-// A sub-class may override this.
-void Test::TearDown() {
-}
-
-// Allows user supplied key value pairs to be recorded for later output.
-void Test::RecordProperty(const char* key, const char* value) {
-  UnitTest::GetInstance()->RecordPropertyForCurrentTest(key, value);
-}
-
-// Allows user supplied key value pairs to be recorded for later output.
-void Test::RecordProperty(const char* key, int value) {
-  Message value_message;
-  value_message << value;
-  RecordProperty(key, value_message.GetString().c_str());
-}
-
-namespace internal {
-
-void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
-                                    const String& message) {
-  // This function is a friend of UnitTest and as such has access to
-  // AddTestPartResult.
-  UnitTest::GetInstance()->AddTestPartResult(
-      result_type,
-      NULL,  // No info about the source file where the exception occurred.
-      -1,    // We have no info on which line caused the exception.
-      message,
-      String());  // No stack trace, either.
-}
-
-}  // namespace internal
-
-// Google Test requires all tests in the same test case to use the same test
-// fixture class.  This function checks if the current test has the
-// same fixture class as the first test in the current test case.  If
-// yes, it returns true; otherwise it generates a Google Test failure and
-// returns false.
-bool Test::HasSameFixtureClass() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  const TestCase* const test_case = impl->current_test_case();
-
-  // Info about the first test in the current test case.
-  const TestInfo* const first_test_info = test_case->test_info_list()[0];
-  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
-  const char* const first_test_name = first_test_info->name();
-
-  // Info about the current test.
-  const TestInfo* const this_test_info = impl->current_test_info();
-  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
-  const char* const this_test_name = this_test_info->name();
-
-  if (this_fixture_id != first_fixture_id) {
-    // Is the first test defined using TEST?
-    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
-    // Is this test defined using TEST?
-    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
-
-    if (first_is_TEST || this_is_TEST) {
-      // The user mixed TEST and TEST_F in this test case - we'll tell
-      // him/her how to fix it.
-
-      // Gets the name of the TEST and the name of the TEST_F.  Note
-      // that first_is_TEST and this_is_TEST cannot both be true, as
-      // the fixture IDs are different for the two tests.
-      const char* const TEST_name =
-          first_is_TEST ? first_test_name : this_test_name;
-      const char* const TEST_F_name =
-          first_is_TEST ? this_test_name : first_test_name;
-
-      ADD_FAILURE()
-          << "All tests in the same test case must use the same test fixture\n"
-          << "class, so mixing TEST_F and TEST in the same test case is\n"
-          << "illegal.  In test case " << this_test_info->test_case_name()
-          << ",\n"
-          << "test " << TEST_F_name << " is defined using TEST_F but\n"
-          << "test " << TEST_name << " is defined using TEST.  You probably\n"
-          << "want to change the TEST to TEST_F or move it to another test\n"
-          << "case.";
-    } else {
-      // The user defined two fixture classes with the same name in
-      // two namespaces - we'll tell him/her how to fix it.
-      ADD_FAILURE()
-          << "All tests in the same test case must use the same test fixture\n"
-          << "class.  However, in test case "
-          << this_test_info->test_case_name() << ",\n"
-          << "you defined test " << first_test_name
-          << " and test " << this_test_name << "\n"
-          << "using two different test fixture classes.  This can happen if\n"
-          << "the two classes are from different namespaces or translation\n"
-          << "units and have the same name.  You should probably rename one\n"
-          << "of the classes to put the tests into different test cases.";
-    }
-    return false;
-  }
-
-  return true;
-}
-
-#if GTEST_HAS_SEH
-
-// Adds an "exception thrown" fatal failure to the current test.  This
-// function returns its result via an output parameter pointer because VC++
-// prohibits creation of objects with destructors on stack in functions
-// using __try (see error C2712).
-static internal::String* FormatSehExceptionMessage(DWORD exception_code,
-                                                   const char* location) {
-  Message message;
-  message << "SEH exception with code 0x" << std::setbase(16) <<
-    exception_code << std::setbase(10) << " thrown in " << location << ".";
-
-  return new internal::String(message.GetString());
-}
-
-#endif  // GTEST_HAS_SEH
-
-#if GTEST_HAS_EXCEPTIONS
-
-// Adds an "exception thrown" fatal failure to the current test.
-static internal::String FormatCxxExceptionMessage(const char* description,
-                                                  const char* location) {
-  Message message;
-  if (description != NULL) {
-    message << "C++ exception with description \"" << description << "\"";
-  } else {
-    message << "Unknown C++ exception";
-  }
-  message << " thrown in " << location << ".";
-
-  return message.GetString();
-}
-
-static internal::String PrintTestPartResultToString(
-    const TestPartResult& test_part_result);
-
-// A failed Google Test assertion will throw an exception of this type when
-// GTEST_FLAG(throw_on_failure) is true (if exceptions are enabled).  We
-// derive it from std::runtime_error, which is for errors presumably
-// detectable only at run time.  Since std::runtime_error inherits from
-// std::exception, many testing frameworks know how to extract and print the
-// message inside it.
-class GoogleTestFailureException : public ::std::runtime_error {
- public:
-  explicit GoogleTestFailureException(const TestPartResult& failure)
-      : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
-};
-#endif  // GTEST_HAS_EXCEPTIONS
-
-namespace internal {
-// We put these helper functions in the internal namespace as IBM's xlC
-// compiler rejects the code if they were declared static.
-
-// Runs the given method and handles SEH exceptions it throws, when
-// SEH is supported; returns the 0-value for type Result in case of an
-// SEH exception.  (Microsoft compilers cannot handle SEH and C++
-// exceptions in the same function.  Therefore, we provide a separate
-// wrapper function for handling SEH exceptions.)
-template <class T, typename Result>
-Result HandleSehExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
-#if GTEST_HAS_SEH
-  __try {
-    return (object->*method)();
-  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
-      GetExceptionCode())) {
-    // We create the exception message on the heap because VC++ prohibits
-    // creation of objects with destructors on stack in functions using __try
-    // (see error C2712).
-    internal::String* exception_message = FormatSehExceptionMessage(
-        GetExceptionCode(), location);
-    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
-                                             *exception_message);
-    delete exception_message;
-    return static_cast<Result>(0);
-  }
-#else
-  (void)location;
-  return (object->*method)();
-#endif  // GTEST_HAS_SEH
-}
-
-// Runs the given method and catches and reports C++ and/or SEH-style
-// exceptions, if they are supported; returns the 0-value for type
-// Result in case of an SEH exception.
-template <class T, typename Result>
-Result HandleExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
-  // NOTE: The user code can affect the way in which Google Test handles
-  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
-  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
-  // after the exception is caught and either report or re-throw the
-  // exception based on the flag's value:
-  //
-  // try {
-  //   // Perform the test method.
-  // } catch (...) {
-  //   if (GTEST_FLAG(catch_exceptions))
-  //     // Report the exception as failure.
-  //   else
-  //     throw;  // Re-throws the original exception.
-  // }
-  //
-  // However, the purpose of this flag is to allow the program to drop into
-  // the debugger when the exception is thrown. On most platforms, once the
-  // control enters the catch block, the exception origin information is
-  // lost and the debugger will stop the program at the point of the
-  // re-throw in this function -- instead of at the point of the original
-  // throw statement in the code under test.  For this reason, we perform
-  // the check early, sacrificing the ability to affect Google Test's
-  // exception handling in the method where the exception is thrown.
-  if (internal::GetUnitTestImpl()->catch_exceptions()) {
-#if GTEST_HAS_EXCEPTIONS
-    try {
-      return HandleSehExceptionsInMethodIfSupported(object, method, location);
-    } catch (const GoogleTestFailureException&) {  // NOLINT
-      // This exception doesn't originate in code under test. It makes no
-      // sense to report it as a test failure.
-      throw;
-    } catch (const std::exception& e) {  // NOLINT
-      internal::ReportFailureInUnknownLocation(
-          TestPartResult::kFatalFailure,
-          FormatCxxExceptionMessage(e.what(), location));
-    } catch (...) {  // NOLINT
-      internal::ReportFailureInUnknownLocation(
-          TestPartResult::kFatalFailure,
-          FormatCxxExceptionMessage(NULL, location));
-    }
-    return static_cast<Result>(0);
-#else
-    return HandleSehExceptionsInMethodIfSupported(object, method, location);
-#endif  // GTEST_HAS_EXCEPTIONS
-  } else {
-    return (object->*method)();
-  }
-}
-
-}  // namespace internal
-
-// Runs the test and updates the test result.
-void Test::Run() {
-  if (!HasSameFixtureClass()) return;
-
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
-  // We will run the test only if SetUp() was successful.
-  if (!HasFatalFailure()) {
-    impl->os_stack_trace_getter()->UponLeavingGTest();
-    internal::HandleExceptionsInMethodIfSupported(
-        this, &Test::TestBody, "the test body");
-  }
-
-  // However, we want to clean up as much as possible.  Hence we will
-  // always call TearDown(), even if SetUp() or the test body has
-  // failed.
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &Test::TearDown, "TearDown()");
-}
-
-// Returns true iff the current test has a fatal failure.
-bool Test::HasFatalFailure() {
-  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
-}
-
-// Returns true iff the current test has a non-fatal failure.
-bool Test::HasNonfatalFailure() {
-  return internal::GetUnitTestImpl()->current_test_result()->
-      HasNonfatalFailure();
-}
-
-// class TestInfo
-
-// Constructs a TestInfo object. It assumes ownership of the test factory
-// object.
-// TODO(vladl@google.com): Make a_test_case_name and a_name const string&'s
-// to signify they cannot be NULLs.
-TestInfo::TestInfo(const char* a_test_case_name,
-                   const char* a_name,
-                   const char* a_type_param,
-                   const char* a_value_param,
-                   internal::TypeId fixture_class_id,
-                   internal::TestFactoryBase* factory)
-    : test_case_name_(a_test_case_name),
-      name_(a_name),
-      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
-      value_param_(a_value_param ? new std::string(a_value_param) : NULL),
-      fixture_class_id_(fixture_class_id),
-      should_run_(false),
-      is_disabled_(false),
-      matches_filter_(false),
-      factory_(factory),
-      result_() {}
-
-// Destructs a TestInfo object.
-TestInfo::~TestInfo() { delete factory_; }
-
-namespace internal {
-
-// Creates a new TestInfo object and registers it with Google Test;
-// returns the created object.
-//
-// Arguments:
-//
-//   test_case_name:   name of the test case
-//   name:             name of the test
-//   type_param:       the name of the test's type parameter, or NULL if
-//                     this is not a typed or a type-parameterized test.
-//   value_param:      text representation of the test's value parameter,
-//                     or NULL if this is not a value-parameterized test.
-//   fixture_class_id: ID of the test fixture class
-//   set_up_tc:        pointer to the function that sets up the test case
-//   tear_down_tc:     pointer to the function that tears down the test case
-//   factory:          pointer to the factory that creates a test object.
-//                     The newly created TestInfo instance will assume
-//                     ownership of the factory object.
-TestInfo* MakeAndRegisterTestInfo(
-    const char* test_case_name, const char* name,
-    const char* type_param,
-    const char* value_param,
-    TypeId fixture_class_id,
-    SetUpTestCaseFunc set_up_tc,
-    TearDownTestCaseFunc tear_down_tc,
-    TestFactoryBase* factory) {
-  TestInfo* const test_info =
-      new TestInfo(test_case_name, name, type_param, value_param,
-                   fixture_class_id, factory);
-  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
-  return test_info;
-}
-
-#if GTEST_HAS_PARAM_TEST
-void ReportInvalidTestCaseType(const char* test_case_name,
-                               const char* file, int line) {
-  Message errors;
-  errors
-      << "Attempted redefinition of test case " << test_case_name << ".\n"
-      << "All tests in the same test case must use the same test fixture\n"
-      << "class.  However, in test case " << test_case_name << ", you tried\n"
-      << "to define a test using a fixture class different from the one\n"
-      << "used earlier. This can happen if the two fixture classes are\n"
-      << "from different namespaces and have the same name. You should\n"
-      << "probably rename one of the classes to put the tests into different\n"
-      << "test cases.";
-
-  fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
-          errors.GetString().c_str());
-}
-#endif  // GTEST_HAS_PARAM_TEST
-
-}  // namespace internal
-
-namespace internal {
-
-// This method expands all parameterized tests registered with macros TEST_P
-// and INSTANTIATE_TEST_CASE_P into regular tests and registers those.
-// This will be done just once during the program runtime.
-void UnitTestImpl::RegisterParameterizedTests() {
-#if GTEST_HAS_PARAM_TEST
-  if (!parameterized_tests_registered_) {
-    parameterized_test_registry_.RegisterTests();
-    parameterized_tests_registered_ = true;
-  }
-#endif
-}
-
-}  // namespace internal
-
-// Creates the test object, runs it, records its result, and then
-// deletes it.
-void TestInfo::Run() {
-  if (!should_run_) return;
-
-  // Tells UnitTest where to store test result.
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  impl->set_current_test_info(this);
-
-  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
-  // Notifies the unit test event listeners that a test is about to start.
-  repeater->OnTestStart(*this);
-
-  const TimeInMillis start = internal::GetTimeInMillis();
-
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-
-  // Creates the test object.
-  Test* const test = internal::HandleExceptionsInMethodIfSupported(
-      factory_, &internal::TestFactoryBase::CreateTest,
-      "the test fixture's constructor");
-
-  // Runs the test only if the test object was created and its
-  // constructor didn't generate a fatal failure.
-  if ((test != NULL) && !Test::HasFatalFailure()) {
-    // This doesn't throw as all user code that can throw are wrapped into
-    // exception handling code.
-    test->Run();
-  }
-
-  // Deletes the test object.
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      test, &Test::DeleteSelf_, "the test fixture's destructor");
-
-  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
-
-  // Notifies the unit test event listener that a test has just finished.
-  repeater->OnTestEnd(*this);
-
-  // Tells UnitTest to stop associating assertion results to this
-  // test.
-  impl->set_current_test_info(NULL);
-}
-
-// class TestCase
-
-// Gets the number of successful tests in this test case.
-int TestCase::successful_test_count() const {
-  return CountIf(test_info_list_, TestPassed);
-}
-
-// Gets the number of failed tests in this test case.
-int TestCase::failed_test_count() const {
-  return CountIf(test_info_list_, TestFailed);
-}
-
-int TestCase::disabled_test_count() const {
-  return CountIf(test_info_list_, TestDisabled);
-}
-
-// Get the number of tests in this test case that should run.
-int TestCase::test_to_run_count() const {
-  return CountIf(test_info_list_, ShouldRunTest);
-}
-
-// Gets the number of all tests.
-int TestCase::total_test_count() const {
-  return static_cast<int>(test_info_list_.size());
-}
-
-// Creates a TestCase with the given name.
-//
-// Arguments:
-//
-//   name:         name of the test case
-//   a_type_param: the name of the test case's type parameter, or NULL if
-//                 this is not a typed or a type-parameterized test case.
-//   set_up_tc:    pointer to the function that sets up the test case
-//   tear_down_tc: pointer to the function that tears down the test case
-TestCase::TestCase(const char* a_name, const char* a_type_param,
-                   Test::SetUpTestCaseFunc set_up_tc,
-                   Test::TearDownTestCaseFunc tear_down_tc)
-    : name_(a_name),
-      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
-      set_up_tc_(set_up_tc),
-      tear_down_tc_(tear_down_tc),
-      should_run_(false),
-      elapsed_time_(0) {
-}
-
-// Destructor of TestCase.
-TestCase::~TestCase() {
-  // Deletes every Test in the collection.
-  ForEach(test_info_list_, internal::Delete<TestInfo>);
-}
-
-// Returns the i-th test among all the tests. i can range from 0 to
-// total_test_count() - 1. If i is not in that range, returns NULL.
-const TestInfo* TestCase::GetTestInfo(int i) const {
-  const int index = GetElementOr(test_indices_, i, -1);
-  return index < 0 ? NULL : test_info_list_[index];
-}
-
-// Returns the i-th test among all the tests. i can range from 0 to
-// total_test_count() - 1. If i is not in that range, returns NULL.
-TestInfo* TestCase::GetMutableTestInfo(int i) {
-  const int index = GetElementOr(test_indices_, i, -1);
-  return index < 0 ? NULL : test_info_list_[index];
-}
-
-// Adds a test to this test case.  Will delete the test upon
-// destruction of the TestCase object.
-void TestCase::AddTestInfo(TestInfo * test_info) {
-  test_info_list_.push_back(test_info);
-  test_indices_.push_back(static_cast<int>(test_indices_.size()));
-}
-
-// Runs every test in this TestCase.
-void TestCase::Run() {
-  if (!should_run_) return;
-
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  impl->set_current_test_case(this);
-
-  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
-  repeater->OnTestCaseStart(*this);
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &TestCase::RunSetUpTestCase, "SetUpTestCase()");
-
-  const internal::TimeInMillis start = internal::GetTimeInMillis();
-  for (int i = 0; i < total_test_count(); i++) {
-    GetMutableTestInfo(i)->Run();
-  }
-  elapsed_time_ = internal::GetTimeInMillis() - start;
-
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &TestCase::RunTearDownTestCase, "TearDownTestCase()");
-
-  repeater->OnTestCaseEnd(*this);
-  impl->set_current_test_case(NULL);
-}
-
-// Clears the results of all tests in this test case.
-void TestCase::ClearResult() {
-  ForEach(test_info_list_, TestInfo::ClearTestResult);
-}
-
-// Shuffles the tests in this test case.
-void TestCase::ShuffleTests(internal::Random* random) {
-  Shuffle(random, &test_indices_);
-}
-
-// Restores the test order to before the first shuffle.
-void TestCase::UnshuffleTests() {
-  for (size_t i = 0; i < test_indices_.size(); i++) {
-    test_indices_[i] = static_cast<int>(i);
-  }
-}
-
-// Formats a countable noun.  Depending on its quantity, either the
-// singular form or the plural form is used. e.g.
-//
-// FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
-// FormatCountableNoun(5, "book", "books") returns "5 books".
-static internal::String FormatCountableNoun(int count,
-                                            const char * singular_form,
-                                            const char * plural_form) {
-  return internal::String::Format("%d %s", count,
-                                  count == 1 ? singular_form : plural_form);
-}
-
-// Formats the count of tests.
-static internal::String FormatTestCount(int test_count) {
-  return FormatCountableNoun(test_count, "test", "tests");
-}
-
-// Formats the count of test cases.
-static internal::String FormatTestCaseCount(int test_case_count) {
-  return FormatCountableNoun(test_case_count, "test case", "test cases");
-}
-
-// Converts a TestPartResult::Type enum to human-friendly string
-// representation.  Both kNonFatalFailure and kFatalFailure are translated
-// to "Failure", as the user usually doesn't care about the difference
-// between the two when viewing the test result.
-static const char * TestPartResultTypeToString(TestPartResult::Type type) {
-  switch (type) {
-    case TestPartResult::kSuccess:
-      return "Success";
-
-    case TestPartResult::kNonFatalFailure:
-    case TestPartResult::kFatalFailure:
-#ifdef _MSC_VER
-      return "error: ";
-#else
-      return "Failure\n";
-#endif
-  }
-
-  // All cases return, so this is unreachable but GCC doesn't know it
-  abort();
-}
-
-// Prints a TestPartResult to a String.
-static internal::String PrintTestPartResultToString(
-    const TestPartResult& test_part_result) {
-  return (Message()
-          << internal::FormatFileLocation(test_part_result.file_name(),
-                                          test_part_result.line_number())
-          << " " << TestPartResultTypeToString(test_part_result.type())
-          << test_part_result.message()).GetString();
-}
-
-// Prints a TestPartResult.
-static void PrintTestPartResult(const TestPartResult& test_part_result) {
-  const internal::String& result =
-      PrintTestPartResultToString(test_part_result);
-  printf("%s\n", result.c_str());
-  fflush(stdout);
-  // If the test program runs in Visual Studio or a debugger, the
-  // following statements add the test part result message to the Output
-  // window such that the user can double-click on it to jump to the
-  // corresponding source code location; otherwise they do nothing.
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-  // We don't call OutputDebugString*() on Windows Mobile, as printing
-  // to stdout is done by OutputDebugString() there already - we don't
-  // want the same message printed twice.
-  ::OutputDebugStringA(result.c_str());
-  ::OutputDebugStringA("\n");
-#endif
-}
-
-// class PrettyUnitTestResultPrinter
-
-namespace internal {
-
-enum GTestColor {
-  COLOR_DEFAULT,
-  COLOR_RED,
-  COLOR_GREEN,
-  COLOR_YELLOW
-};
-
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-
-// Returns the character attribute for the given color.
-WORD GetColorAttribute(GTestColor color) {
-  switch (color) {
-    case COLOR_RED:    return FOREGROUND_RED;
-    case COLOR_GREEN:  return FOREGROUND_GREEN;
-    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
-    default:           return 0;
-  }
-}
-
-#else
-
-// Returns the ANSI color code for the given color.  COLOR_DEFAULT is
-// an invalid input.
-const char* GetAnsiColorCode(GTestColor color) {
-  switch (color) {
-    case COLOR_RED:     return "1";
-    case COLOR_GREEN:   return "2";
-    case COLOR_YELLOW:  return "3";
-    default:            return NULL;
-  };
-}
-
-#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-
-// Returns true iff Google Test should use colors in the output.
-bool ShouldUseColor(bool stdout_is_tty) {
-  const char* const gtest_color = GTEST_FLAG(color).c_str();
-
-  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
-#if GTEST_OS_WINDOWS
-    // On Windows the TERM variable is usually not set, but the
-    // console there does support colors.
-    return stdout_is_tty;
-#else
-    // On non-Windows platforms, we rely on the TERM variable.
-    const char* const term = posix::GetEnv("TERM");
-    const bool term_supports_color =
-        String::CStringEquals(term, "xterm") ||
-        String::CStringEquals(term, "xterm-color") ||
-        String::CStringEquals(term, "xterm-256color") ||
-        String::CStringEquals(term, "screen") ||
-        String::CStringEquals(term, "linux") ||
-        String::CStringEquals(term, "cygwin");
-    return stdout_is_tty && term_supports_color;
-#endif  // GTEST_OS_WINDOWS
-  }
-
-  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
-      String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
-      String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
-      String::CStringEquals(gtest_color, "1");
-  // We take "yes", "true", "t", and "1" as meaning "yes".  If the
-  // value is neither one of these nor "auto", we treat it as "no" to
-  // be conservative.
-}
-
-// Helpers for printing colored strings to stdout. Note that on Windows, we
-// cannot simply emit special characters and have the terminal change colors.
-// This routine must actually emit the characters rather than return a string
-// that would be colored when printed, as can be done on Linux.
-void ColoredPrintf(GTestColor color, const char* fmt, ...) {
-  va_list args;
-  va_start(args, fmt);
-
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
-  const bool use_color = false;
-#else
-  static const bool in_color_mode =
-      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
-  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
-#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
-  // The '!= 0' comparison is necessary to satisfy MSVC 7.1.
-
-  if (!use_color) {
-    vprintf(fmt, args);
-    va_end(args);
-    return;
-  }
-
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
-
-  // Gets the current text color.
-  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
-  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
-  const WORD old_color_attrs = buffer_info.wAttributes;
-
-  // We need to flush the stream buffers into the console before each
-  // SetConsoleTextAttribute call lest it affect the text that is already
-  // printed but has not yet reached the console.
-  fflush(stdout);
-  SetConsoleTextAttribute(stdout_handle,
-                          GetColorAttribute(color) | FOREGROUND_INTENSITY);
-  vprintf(fmt, args);
-
-  fflush(stdout);
-  // Restores the text color.
-  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
-#else
-  printf("\033[0;3%sm", GetAnsiColorCode(color));
-  vprintf(fmt, args);
-  printf("\033[m");  // Resets the terminal to default.
-#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-  va_end(args);
-}
-
-void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
-  const char* const type_param = test_info.type_param();
-  const char* const value_param = test_info.value_param();
-
-  if (type_param != NULL || value_param != NULL) {
-    printf(", where ");
-    if (type_param != NULL) {
-      printf("TypeParam = %s", type_param);
-      if (value_param != NULL)
-        printf(" and ");
-    }
-    if (value_param != NULL) {
-      printf("GetParam() = %s", value_param);
-    }
-  }
-}
-
-// This class implements the TestEventListener interface.
-//
-// Class PrettyUnitTestResultPrinter is copyable.
-class PrettyUnitTestResultPrinter : public TestEventListener {
- public:
-  PrettyUnitTestResultPrinter() {}
-  static void PrintTestName(const char * test_case, const char * test) {
-    printf("%s.%s", test_case, test);
-  }
-
-  // The following methods override what's in the TestEventListener class.
-  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
-  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestCaseStart(const TestCase& test_case);
-  virtual void OnTestStart(const TestInfo& test_info);
-  virtual void OnTestPartResult(const TestPartResult& result);
-  virtual void OnTestEnd(const TestInfo& test_info);
-  virtual void OnTestCaseEnd(const TestCase& test_case);
-  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
-  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
-  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
-
- private:
-  static void PrintFailedTests(const UnitTest& unit_test);
-
-  internal::String test_case_name_;
-};
-
-  // Fired before each iteration of tests starts.
-void PrettyUnitTestResultPrinter::OnTestIterationStart(
-    const UnitTest& unit_test, int iteration) {
-  if (GTEST_FLAG(repeat) != 1)
-    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
-
-  const char* const filter = GTEST_FLAG(filter).c_str();
-
-  // Prints the filter if it's not *.  This reminds the user that some
-  // tests may be skipped.
-  if (!internal::String::CStringEquals(filter, kUniversalFilter)) {
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
-  }
-
-  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
-    const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: This is test shard %d of %s.\n",
-                  static_cast<int>(shard_index) + 1,
-                  internal::posix::GetEnv(kTestTotalShards));
-  }
-
-  if (GTEST_FLAG(shuffle)) {
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: Randomizing tests' orders with a seed of %d .\n",
-                  unit_test.random_seed());
-  }
-
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
-  printf("Running %s from %s.\n",
-         FormatTestCount(unit_test.test_to_run_count()).c_str(),
-         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
-    const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
-  printf("Global test environment set-up.\n");
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
-  test_case_name_ = test_case.name();
-  const internal::String counts =
-      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
-  printf("%s from %s", counts.c_str(), test_case_name_.c_str());
-  if (test_case.type_param() == NULL) {
-    printf("\n");
-  } else {
-    printf(", where TypeParam = %s\n", test_case.type_param());
-  }
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
-  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
-  PrintTestName(test_case_name_.c_str(), test_info.name());
-  printf("\n");
-  fflush(stdout);
-}
-
-// Called after an assertion failure.
-void PrettyUnitTestResultPrinter::OnTestPartResult(
-    const TestPartResult& result) {
-  // If the test part succeeded, we don't need to do anything.
-  if (result.type() == TestPartResult::kSuccess)
-    return;
-
-  // Print failure message from the assertion (e.g. expected this and got that).
-  PrintTestPartResult(result);
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
-  if (test_info.result()->Passed()) {
-    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
-  } else {
-    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
-  }
-  PrintTestName(test_case_name_.c_str(), test_info.name());
-  if (test_info.result()->Failed())
-    PrintFullTestCommentIfPresent(test_info);
-
-  if (GTEST_FLAG(print_time)) {
-    printf(" (%s ms)\n", internal::StreamableToString(
-           test_info.result()->elapsed_time()).c_str());
-  } else {
-    printf("\n");
-  }
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
-  if (!GTEST_FLAG(print_time)) return;
-
-  test_case_name_ = test_case.name();
-  const internal::String counts =
-      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
-  printf("%s from %s (%s ms total)\n\n",
-         counts.c_str(), test_case_name_.c_str(),
-         internal::StreamableToString(test_case.elapsed_time()).c_str());
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
-    const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
-  printf("Global test environment tear-down\n");
-  fflush(stdout);
-}
-
-// Internal helper for printing the list of failed tests.
-void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
-  const int failed_test_count = unit_test.failed_test_count();
-  if (failed_test_count == 0) {
-    return;
-  }
-
-  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
-    const TestCase& test_case = *unit_test.GetTestCase(i);
-    if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
-      continue;
-    }
-    for (int j = 0; j < test_case.total_test_count(); ++j) {
-      const TestInfo& test_info = *test_case.GetTestInfo(j);
-      if (!test_info.should_run() || test_info.result()->Passed()) {
-        continue;
-      }
-      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
-      printf("%s.%s", test_case.name(), test_info.name());
-      PrintFullTestCommentIfPresent(test_info);
-      printf("\n");
-    }
-  }
-}
-
-void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
-                                                     int /*iteration*/) {
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
-  printf("%s from %s ran.",
-         FormatTestCount(unit_test.test_to_run_count()).c_str(),
-         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
-  if (GTEST_FLAG(print_time)) {
-    printf(" (%s ms total)",
-           internal::StreamableToString(unit_test.elapsed_time()).c_str());
-  }
-  printf("\n");
-  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
-  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
-
-  int num_failures = unit_test.failed_test_count();
-  if (!unit_test.Passed()) {
-    const int failed_test_count = unit_test.failed_test_count();
-    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
-    printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
-    PrintFailedTests(unit_test);
-    printf("\n%2d FAILED %s\n", num_failures,
-                        num_failures == 1 ? "TEST" : "TESTS");
-  }
-
-  int num_disabled = unit_test.disabled_test_count();
-  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
-    if (!num_failures) {
-      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
-    }
-    ColoredPrintf(COLOR_YELLOW,
-                  "  YOU HAVE %d DISABLED %s\n\n",
-                  num_disabled,
-                  num_disabled == 1 ? "TEST" : "TESTS");
-  }
-  // Ensure that Google Test output is printed before, e.g., heapchecker output.
-  fflush(stdout);
-}
-
-// End PrettyUnitTestResultPrinter
-
-// class TestEventRepeater
-//
-// This class forwards events to other event listeners.
-class TestEventRepeater : public TestEventListener {
- public:
-  TestEventRepeater() : forwarding_enabled_(true) {}
-  virtual ~TestEventRepeater();
-  void Append(TestEventListener *listener);
-  TestEventListener* Release(TestEventListener* listener);
-
-  // Controls whether events will be forwarded to listeners_. Set to false
-  // in death test child processes.
-  bool forwarding_enabled() const { return forwarding_enabled_; }
-  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
-
-  virtual void OnTestProgramStart(const UnitTest& unit_test);
-  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
-  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
-  virtual void OnTestCaseStart(const TestCase& test_case);
-  virtual void OnTestStart(const TestInfo& test_info);
-  virtual void OnTestPartResult(const TestPartResult& result);
-  virtual void OnTestEnd(const TestInfo& test_info);
-  virtual void OnTestCaseEnd(const TestCase& test_case);
-  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
-  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
-  virtual void OnTestProgramEnd(const UnitTest& unit_test);
-
- private:
-  // Controls whether events will be forwarded to listeners_. Set to false
-  // in death test child processes.
-  bool forwarding_enabled_;
-  // The list of listeners that receive events.
-  std::vector<TestEventListener*> listeners_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
-};
-
-TestEventRepeater::~TestEventRepeater() {
-  ForEach(listeners_, Delete<TestEventListener>);
-}
-
-void TestEventRepeater::Append(TestEventListener *listener) {
-  listeners_.push_back(listener);
-}
-
-// TODO(vladl@google.com): Factor the search functionality into Vector::Find.
-TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
-  for (size_t i = 0; i < listeners_.size(); ++i) {
-    if (listeners_[i] == listener) {
-      listeners_.erase(listeners_.begin() + i);
-      return listener;
-    }
-  }
-
-  return NULL;
-}
-
-// Since most methods are very similar, use macros to reduce boilerplate.
-// This defines a member that forwards the call to all listeners.
-#define GTEST_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
-  if (forwarding_enabled_) { \
-    for (size_t i = 0; i < listeners_.size(); i++) { \
-      listeners_[i]->Name(parameter); \
-    } \
-  } \
-}
-// This defines a member that forwards the call to all listeners in reverse
-// order.
-#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
-  if (forwarding_enabled_) { \
-    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
-      listeners_[i]->Name(parameter); \
-    } \
-  } \
-}
-
-GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
-GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
-GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
-GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
-GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
-GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
-GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
-GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
-GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
-GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
-GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
-
-#undef GTEST_REPEATER_METHOD_
-#undef GTEST_REVERSE_REPEATER_METHOD_
-
-void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
-                                             int iteration) {
-  if (forwarding_enabled_) {
-    for (size_t i = 0; i < listeners_.size(); i++) {
-      listeners_[i]->OnTestIterationStart(unit_test, iteration);
-    }
-  }
-}
-
-void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
-                                           int iteration) {
-  if (forwarding_enabled_) {
-    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) {
-      listeners_[i]->OnTestIterationEnd(unit_test, iteration);
-    }
-  }
-}
-
-// End TestEventRepeater
-
-// This class generates an XML output file.
-class XmlUnitTestResultPrinter : public EmptyTestEventListener {
- public:
-  explicit XmlUnitTestResultPrinter(const char* output_file);
-
-  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
-
- private:
-  // Is c a whitespace character that is normalized to a space character
-  // when it appears in an XML attribute value?
-  static bool IsNormalizableWhitespace(char c) {
-    return c == 0x9 || c == 0xA || c == 0xD;
-  }
-
-  // May c appear in a well-formed XML document?
-  static bool IsValidXmlCharacter(char c) {
-    return IsNormalizableWhitespace(c) || c >= 0x20;
-  }
-
-  // Returns an XML-escaped copy of the input string str.  If
-  // is_attribute is true, the text is meant to appear as an attribute
-  // value, and normalizable whitespace is preserved by replacing it
-  // with character references.
-  static String EscapeXml(const char* str, bool is_attribute);
-
-  // Returns the given string with all characters invalid in XML removed.
-  static string RemoveInvalidXmlCharacters(const string& str);
-
-  // Convenience wrapper around EscapeXml when str is an attribute value.
-  static String EscapeXmlAttribute(const char* str) {
-    return EscapeXml(str, true);
-  }
-
-  // Convenience wrapper around EscapeXml when str is not an attribute value.
-  static String EscapeXmlText(const char* str) { return EscapeXml(str, false); }
-
-  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
-
-  // Streams an XML representation of a TestInfo object.
-  static void OutputXmlTestInfo(::std::ostream* stream,
-                                const char* test_case_name,
-                                const TestInfo& test_info);
-
-  // Prints an XML representation of a TestCase object
-  static void PrintXmlTestCase(FILE* out, const TestCase& test_case);
-
-  // Prints an XML summary of unit_test to output stream out.
-  static void PrintXmlUnitTest(FILE* out, const UnitTest& unit_test);
-
-  // Produces a string representing the test properties in a result as space
-  // delimited XML attributes based on the property key="value" pairs.
-  // When the String is not empty, it includes a space at the beginning,
-  // to delimit this attribute from prior attributes.
-  static String TestPropertiesAsXmlAttributes(const TestResult& result);
-
-  // The output file.
-  const String output_file_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
-};
-
-// Creates a new XmlUnitTestResultPrinter.
-XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
-    : output_file_(output_file) {
-  if (output_file_.c_str() == NULL || output_file_.empty()) {
-    fprintf(stderr, "XML output file may not be null\n");
-    fflush(stderr);
-    exit(EXIT_FAILURE);
-  }
-}
-
-// Called after the unit test ends.
-void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
-                                                  int /*iteration*/) {
-  FILE* xmlout = NULL;
-  FilePath output_file(output_file_);
-  FilePath output_dir(output_file.RemoveFileName());
-
-  if (output_dir.CreateDirectoriesRecursively()) {
-    xmlout = posix::FOpen(output_file_.c_str(), "w");
-  }
-  if (xmlout == NULL) {
-    // TODO(wan): report the reason of the failure.
-    //
-    // We don't do it for now as:
-    //
-    //   1. There is no urgent need for it.
-    //   2. It's a bit involved to make the errno variable thread-safe on
-    //      all three operating systems (Linux, Windows, and Mac OS).
-    //   3. To interpret the meaning of errno in a thread-safe way,
-    //      we need the strerror_r() function, which is not available on
-    //      Windows.
-    fprintf(stderr,
-            "Unable to open file \"%s\"\n",
-            output_file_.c_str());
-    fflush(stderr);
-    exit(EXIT_FAILURE);
-  }
-  PrintXmlUnitTest(xmlout, unit_test);
-  fclose(xmlout);
-}
-
-// Returns an XML-escaped copy of the input string str.  If is_attribute
-// is true, the text is meant to appear as an attribute value, and
-// normalizable whitespace is preserved by replacing it with character
-// references.
-//
-// Invalid XML characters in str, if any, are stripped from the output.
-// It is expected that most, if not all, of the text processed by this
-// module will consist of ordinary English text.
-// If this module is ever modified to produce version 1.1 XML output,
-// most invalid characters can be retained using character references.
-// TODO(wan): It might be nice to have a minimally invasive, human-readable
-// escaping scheme for invalid characters, rather than dropping them.
-String XmlUnitTestResultPrinter::EscapeXml(const char* str, bool is_attribute) {
-  Message m;
-
-  if (str != NULL) {
-    for (const char* src = str; *src; ++src) {
-      switch (*src) {
-        case '<':
-          m << "&lt;";
-          break;
-        case '>':
-          m << "&gt;";
-          break;
-        case '&':
-          m << "&amp;";
-          break;
-        case '\'':
-          if (is_attribute)
-            m << "&apos;";
-          else
-            m << '\'';
-          break;
-        case '"':
-          if (is_attribute)
-            m << "&quot;";
-          else
-            m << '"';
-          break;
-        default:
-          if (IsValidXmlCharacter(*src)) {
-            if (is_attribute && IsNormalizableWhitespace(*src))
-              m << String::Format("&#x%02X;", unsigned(*src));
-            else
-              m << *src;
-          }
-          break;
-      }
-    }
-  }
-
-  return m.GetString();
-}
-
-// Returns the given string with all characters invalid in XML removed.
-// Currently invalid characters are dropped from the string. An
-// alternative is to replace them with certain characters such as . or ?.
-string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(const string& str) {
-  string output;
-  output.reserve(str.size());
-  for (string::const_iterator it = str.begin(); it != str.end(); ++it)
-    if (IsValidXmlCharacter(*it))
-      output.push_back(*it);
-
-  return output;
-}
-
-// The following routines generate an XML representation of a UnitTest
-// object.
-//
-// This is how Google Test concepts map to the DTD:
-//
-// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
-//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
-//     <testcase name="test-name">     <-- corresponds to a TestInfo object
-//       <failure message="...">...</failure>
-//       <failure message="...">...</failure>
-//       <failure message="...">...</failure>
-//                                     <-- individual assertion failures
-//     </testcase>
-//   </testsuite>
-// </testsuites>
-
-// Formats the given time in milliseconds as seconds.
-std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
-  ::std::stringstream ss;
-  ss << ms/1000.0;
-  return ss.str();
-}
-
-// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
-                                                     const char* data) {
-  const char* segment = data;
-  *stream << "<![CDATA[";
-  for (;;) {
-    const char* const next_segment = strstr(segment, "]]>");
-    if (next_segment != NULL) {
-      stream->write(
-          segment, static_cast<std::streamsize>(next_segment - segment));
-      *stream << "]]>]]&gt;<![CDATA[";
-      segment = next_segment + strlen("]]>");
-    } else {
-      *stream << segment;
-      break;
-    }
-  }
-  *stream << "]]>";
-}
-
-// Prints an XML representation of a TestInfo object.
-// TODO(wan): There is also value in printing properties with the plain printer.
-void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
-                                                 const char* test_case_name,
-                                                 const TestInfo& test_info) {
-  const TestResult& result = *test_info.result();
-  *stream << "    <testcase name=\""
-          << EscapeXmlAttribute(test_info.name()).c_str() << "\"";
-
-  if (test_info.value_param() != NULL) {
-    *stream << " value_param=\"" << EscapeXmlAttribute(test_info.value_param())
-            << "\"";
-  }
-  if (test_info.type_param() != NULL) {
-    *stream << " type_param=\"" << EscapeXmlAttribute(test_info.type_param())
-            << "\"";
-  }
-
-  *stream << " status=\""
-          << (test_info.should_run() ? "run" : "notrun")
-          << "\" time=\""
-          << FormatTimeInMillisAsSeconds(result.elapsed_time())
-          << "\" classname=\"" << EscapeXmlAttribute(test_case_name).c_str()
-          << "\"" << TestPropertiesAsXmlAttributes(result).c_str();
-
-  int failures = 0;
-  for (int i = 0; i < result.total_part_count(); ++i) {
-    const TestPartResult& part = result.GetTestPartResult(i);
-    if (part.failed()) {
-      if (++failures == 1)
-        *stream << ">\n";
-      *stream << "      <failure message=\""
-              << EscapeXmlAttribute(part.summary()).c_str()
-              << "\" type=\"\">";
-      const string location = internal::FormatCompilerIndependentFileLocation(
-          part.file_name(), part.line_number());
-      const string message = location + "\n" + part.message();
-      OutputXmlCDataSection(stream,
-                            RemoveInvalidXmlCharacters(message).c_str());
-      *stream << "</failure>\n";
-    }
-  }
-
-  if (failures == 0)
-    *stream << " />\n";
-  else
-    *stream << "    </testcase>\n";
-}
-
-// Prints an XML representation of a TestCase object
-void XmlUnitTestResultPrinter::PrintXmlTestCase(FILE* out,
-                                                const TestCase& test_case) {
-  fprintf(out,
-          "  <testsuite name=\"%s\" tests=\"%d\" failures=\"%d\" "
-          "disabled=\"%d\" ",
-          EscapeXmlAttribute(test_case.name()).c_str(),
-          test_case.total_test_count(),
-          test_case.failed_test_count(),
-          test_case.disabled_test_count());
-  fprintf(out,
-          "errors=\"0\" time=\"%s\">\n",
-          FormatTimeInMillisAsSeconds(test_case.elapsed_time()).c_str());
-  for (int i = 0; i < test_case.total_test_count(); ++i) {
-    ::std::stringstream stream;
-    OutputXmlTestInfo(&stream, test_case.name(), *test_case.GetTestInfo(i));
-    fprintf(out, "%s", StringStreamToString(&stream).c_str());
-  }
-  fprintf(out, "  </testsuite>\n");
-}
-
-// Prints an XML summary of unit_test to output stream out.
-void XmlUnitTestResultPrinter::PrintXmlUnitTest(FILE* out,
-                                                const UnitTest& unit_test) {
-  fprintf(out, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
-  fprintf(out,
-          "<testsuites tests=\"%d\" failures=\"%d\" disabled=\"%d\" "
-          "errors=\"0\" time=\"%s\" ",
-          unit_test.total_test_count(),
-          unit_test.failed_test_count(),
-          unit_test.disabled_test_count(),
-          FormatTimeInMillisAsSeconds(unit_test.elapsed_time()).c_str());
-  if (GTEST_FLAG(shuffle)) {
-    fprintf(out, "random_seed=\"%d\" ", unit_test.random_seed());
-  }
-  fprintf(out, "name=\"AllTests\">\n");
-  for (int i = 0; i < unit_test.total_test_case_count(); ++i)
-    PrintXmlTestCase(out, *unit_test.GetTestCase(i));
-  fprintf(out, "</testsuites>\n");
-}
-
-// Produces a string representing the test properties in a result as space
-// delimited XML attributes based on the property key="value" pairs.
-String XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
-    const TestResult& result) {
-  Message attributes;
-  for (int i = 0; i < result.test_property_count(); ++i) {
-    const TestProperty& property = result.GetTestProperty(i);
-    attributes << " " << property.key() << "="
-        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
-  }
-  return attributes.GetString();
-}
-
-// End XmlUnitTestResultPrinter
-
-#if GTEST_CAN_STREAM_RESULTS_
-
-// Streams test results to the given port on the given host machine.
-class StreamingListener : public EmptyTestEventListener {
- public:
-  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
-  static string UrlEncode(const char* str);
-
-  StreamingListener(const string& host, const string& port)
-      : sockfd_(-1), host_name_(host), port_num_(port) {
-    MakeConnection();
-    Send("gtest_streaming_protocol_version=1.0\n");
-  }
-
-  virtual ~StreamingListener() {
-    if (sockfd_ != -1)
-      CloseConnection();
-  }
-
-  void OnTestProgramStart(const UnitTest& /* unit_test */) {
-    Send("event=TestProgramStart\n");
-  }
-
-  void OnTestProgramEnd(const UnitTest& unit_test) {
-    // Note that Google Test current only report elapsed time for each
-    // test iteration, not for the entire test program.
-    Send(String::Format("event=TestProgramEnd&passed=%d\n",
-                        unit_test.Passed()));
-
-    // Notify the streaming server to stop.
-    CloseConnection();
-  }
-
-  void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
-    Send(String::Format("event=TestIterationStart&iteration=%d\n",
-                        iteration));
-  }
-
-  void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
-    Send(String::Format("event=TestIterationEnd&passed=%d&elapsed_time=%sms\n",
-                        unit_test.Passed(),
-                        StreamableToString(unit_test.elapsed_time()).c_str()));
-  }
-
-  void OnTestCaseStart(const TestCase& test_case) {
-    Send(String::Format("event=TestCaseStart&name=%s\n", test_case.name()));
-  }
-
-  void OnTestCaseEnd(const TestCase& test_case) {
-    Send(String::Format("event=TestCaseEnd&passed=%d&elapsed_time=%sms\n",
-                        test_case.Passed(),
-                        StreamableToString(test_case.elapsed_time()).c_str()));
-  }
-
-  void OnTestStart(const TestInfo& test_info) {
-    Send(String::Format("event=TestStart&name=%s\n", test_info.name()));
-  }
-
-  void OnTestEnd(const TestInfo& test_info) {
-    Send(String::Format(
-        "event=TestEnd&passed=%d&elapsed_time=%sms\n",
-        (test_info.result())->Passed(),
-        StreamableToString((test_info.result())->elapsed_time()).c_str()));
-  }
-
-  void OnTestPartResult(const TestPartResult& test_part_result) {
-    const char* file_name = test_part_result.file_name();
-    if (file_name == NULL)
-      file_name = "";
-    Send(String::Format("event=TestPartResult&file=%s&line=%d&message=",
-                        UrlEncode(file_name).c_str(),
-                        test_part_result.line_number()));
-    Send(UrlEncode(test_part_result.message()) + "\n");
-  }
-
- private:
-  // Creates a client socket and connects to the server.
-  void MakeConnection();
-
-  // Closes the socket.
-  void CloseConnection() {
-    GTEST_CHECK_(sockfd_ != -1)
-        << "CloseConnection() can be called only when there is a connection.";
-
-    close(sockfd_);
-    sockfd_ = -1;
-  }
-
-  // Sends a string to the socket.
-  void Send(const string& message) {
-    GTEST_CHECK_(sockfd_ != -1)
-        << "Send() can be called only when there is a connection.";
-
-    const int len = static_cast<int>(message.length());
-    if (write(sockfd_, message.c_str(), len) != len) {
-      GTEST_LOG_(WARNING)
-          << "stream_result_to: failed to stream to "
-          << host_name_ << ":" << port_num_;
-    }
-  }
-
-  int sockfd_;   // socket file descriptor
-  const string host_name_;
-  const string port_num_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
-};  // class StreamingListener
-
-// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
-// replaces them by "%xx" where xx is their hexadecimal value. For
-// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
-// in both time and space -- important as the input str may contain an
-// arbitrarily long test failure message and stack trace.
-string StreamingListener::UrlEncode(const char* str) {
-  string result;
-  result.reserve(strlen(str) + 1);
-  for (char ch = *str; ch != '\0'; ch = *++str) {
-    switch (ch) {
-      case '%':
-      case '=':
-      case '&':
-      case '\n':
-        result.append(String::Format("%%%02x", static_cast<unsigned char>(ch)));
-        break;
-      default:
-        result.push_back(ch);
-        break;
-    }
-  }
-  return result;
-}
-
-void StreamingListener::MakeConnection() {
-  GTEST_CHECK_(sockfd_ == -1)
-      << "MakeConnection() can't be called when there is already a connection.";
-
-  addrinfo hints;
-  memset(&hints, 0, sizeof(hints));
-  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
-  hints.ai_socktype = SOCK_STREAM;
-  addrinfo* servinfo = NULL;
-
-  // Use the getaddrinfo() to get a linked list of IP addresses for
-  // the given host name.
-  const int error_num = getaddrinfo(
-      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
-  if (error_num != 0) {
-    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
-                        << gai_strerror(error_num);
-  }
-
-  // Loop through all the results and connect to the first we can.
-  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL;
-       cur_addr = cur_addr->ai_next) {
-    sockfd_ = socket(
-        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
-    if (sockfd_ != -1) {
-      // Connect the client socket to the server socket.
-      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
-        close(sockfd_);
-        sockfd_ = -1;
-      }
-    }
-  }
-
-  freeaddrinfo(servinfo);  // all done with this structure
-
-  if (sockfd_ == -1) {
-    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
-                        << host_name_ << ":" << port_num_;
-  }
-}
-
-// End of class Streaming Listener
-#endif  // GTEST_CAN_STREAM_RESULTS__
-
-// Class ScopedTrace
-
-// Pushes the given source file location and message onto a per-thread
-// trace stack maintained by Google Test.
-// L < UnitTest::mutex_
-ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) {
-  TraceInfo trace;
-  trace.file = file;
-  trace.line = line;
-  trace.message = message.GetString();
-
-  UnitTest::GetInstance()->PushGTestTrace(trace);
-}
-
-// Pops the info pushed by the c'tor.
-// L < UnitTest::mutex_
-ScopedTrace::~ScopedTrace() {
-  UnitTest::GetInstance()->PopGTestTrace();
-}
-
-
-// class OsStackTraceGetter
-
-// Returns the current OS stack trace as a String.  Parameters:
-//
-//   max_depth  - the maximum number of stack frames to be included
-//                in the trace.
-//   skip_count - the number of top frames to be skipped; doesn't count
-//                against max_depth.
-//
-// L < mutex_
-// We use "L < mutex_" to denote that the function may acquire mutex_.
-String OsStackTraceGetter::CurrentStackTrace(int, int) {
-  return String("");
-}
-
-// L < mutex_
-void OsStackTraceGetter::UponLeavingGTest() {
-}
-
-const char* const
-OsStackTraceGetter::kElidedFramesMarker =
-    "... " GTEST_NAME_ " internal frames ...";
-
-}  // namespace internal
-
-// class TestEventListeners
-
-TestEventListeners::TestEventListeners()
-    : repeater_(new internal::TestEventRepeater()),
-      default_result_printer_(NULL),
-      default_xml_generator_(NULL) {
-}
-
-TestEventListeners::~TestEventListeners() { delete repeater_; }
-
-// Returns the standard listener responsible for the default console
-// output.  Can be removed from the listeners list to shut down default
-// console output.  Note that removing this object from the listener list
-// with Release transfers its ownership to the user.
-void TestEventListeners::Append(TestEventListener* listener) {
-  repeater_->Append(listener);
-}
-
-// Removes the given event listener from the list and returns it.  It then
-// becomes the caller's responsibility to delete the listener. Returns
-// NULL if the listener is not found in the list.
-TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
-  if (listener == default_result_printer_)
-    default_result_printer_ = NULL;
-  else if (listener == default_xml_generator_)
-    default_xml_generator_ = NULL;
-  return repeater_->Release(listener);
-}
-
-// Returns repeater that broadcasts the TestEventListener events to all
-// subscribers.
-TestEventListener* TestEventListeners::repeater() { return repeater_; }
-
-// Sets the default_result_printer attribute to the provided listener.
-// The listener is also added to the listener list and previous
-// default_result_printer is removed from it and deleted. The listener can
-// also be NULL in which case it will not be added to the list. Does
-// nothing if the previous and the current listener objects are the same.
-void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
-  if (default_result_printer_ != listener) {
-    // It is an error to pass this method a listener that is already in the
-    // list.
-    delete Release(default_result_printer_);
-    default_result_printer_ = listener;
-    if (listener != NULL)
-      Append(listener);
-  }
-}
-
-// Sets the default_xml_generator attribute to the provided listener.  The
-// listener is also added to the listener list and previous
-// default_xml_generator is removed from it and deleted. The listener can
-// also be NULL in which case it will not be added to the list. Does
-// nothing if the previous and the current listener objects are the same.
-void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
-  if (default_xml_generator_ != listener) {
-    // It is an error to pass this method a listener that is already in the
-    // list.
-    delete Release(default_xml_generator_);
-    default_xml_generator_ = listener;
-    if (listener != NULL)
-      Append(listener);
-  }
-}
-
-// Controls whether events will be forwarded by the repeater to the
-// listeners in the list.
-bool TestEventListeners::EventForwardingEnabled() const {
-  return repeater_->forwarding_enabled();
-}
-
-void TestEventListeners::SuppressEventForwarding() {
-  repeater_->set_forwarding_enabled(false);
-}
-
-// class UnitTest
-
-// Gets the singleton UnitTest object.  The first time this method is
-// called, a UnitTest object is constructed and returned.  Consecutive
-// calls will return the same object.
-//
-// We don't protect this under mutex_ as a user is not supposed to
-// call this before main() starts, from which point on the return
-// value will never change.
-UnitTest * UnitTest::GetInstance() {
-  // When compiled with MSVC 7.1 in optimized mode, destroying the
-  // UnitTest object upon exiting the program messes up the exit code,
-  // causing successful tests to appear failed.  We have to use a
-  // different implementation in this case to bypass the compiler bug.
-  // This implementation makes the compiler happy, at the cost of
-  // leaking the UnitTest object.
-
-  // CodeGear C++Builder insists on a public destructor for the
-  // default implementation.  Use this implementation to keep good OO
-  // design with private destructor.
-
-#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
-  static UnitTest* const instance = new UnitTest;
-  return instance;
-#else
-  static UnitTest instance;
-  return &instance;
-#endif  // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
-}
-
-// Gets the number of successful test cases.
-int UnitTest::successful_test_case_count() const {
-  return impl()->successful_test_case_count();
-}
-
-// Gets the number of failed test cases.
-int UnitTest::failed_test_case_count() const {
-  return impl()->failed_test_case_count();
-}
-
-// Gets the number of all test cases.
-int UnitTest::total_test_case_count() const {
-  return impl()->total_test_case_count();
-}
-
-// Gets the number of all test cases that contain at least one test
-// that should run.
-int UnitTest::test_case_to_run_count() const {
-  return impl()->test_case_to_run_count();
-}
-
-// Gets the number of successful tests.
-int UnitTest::successful_test_count() const {
-  return impl()->successful_test_count();
-}
-
-// Gets the number of failed tests.
-int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
-
-// Gets the number of disabled tests.
-int UnitTest::disabled_test_count() const {
-  return impl()->disabled_test_count();
-}
-
-// Gets the number of all tests.
-int UnitTest::total_test_count() const { return impl()->total_test_count(); }
-
-// Gets the number of tests that should run.
-int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
-
-// Gets the elapsed time, in milliseconds.
-internal::TimeInMillis UnitTest::elapsed_time() const {
-  return impl()->elapsed_time();
-}
-
-// Returns true iff the unit test passed (i.e. all test cases passed).
-bool UnitTest::Passed() const { return impl()->Passed(); }
-
-// Returns true iff the unit test failed (i.e. some test case failed
-// or something outside of all tests failed).
-bool UnitTest::Failed() const { return impl()->Failed(); }
-
-// Gets the i-th test case among all the test cases. i can range from 0 to
-// total_test_case_count() - 1. If i is not in that range, returns NULL.
-const TestCase* UnitTest::GetTestCase(int i) const {
-  return impl()->GetTestCase(i);
-}
-
-// Gets the i-th test case among all the test cases. i can range from 0 to
-// total_test_case_count() - 1. If i is not in that range, returns NULL.
-TestCase* UnitTest::GetMutableTestCase(int i) {
-  return impl()->GetMutableTestCase(i);
-}
-
-// Returns the list of event listeners that can be used to track events
-// inside Google Test.
-TestEventListeners& UnitTest::listeners() {
-  return *impl()->listeners();
-}
-
-// Registers and returns a global test environment.  When a test
-// program is run, all global test environments will be set-up in the
-// order they were registered.  After all tests in the program have
-// finished, all global test environments will be torn-down in the
-// *reverse* order they were registered.
-//
-// The UnitTest object takes ownership of the given environment.
-//
-// We don't protect this under mutex_, as we only support calling it
-// from the main thread.
-Environment* UnitTest::AddEnvironment(Environment* env) {
-  if (env == NULL) {
-    return NULL;
-  }
-
-  impl_->environments().push_back(env);
-  return env;
-}
-
-// Adds a TestPartResult to the current TestResult object.  All Google Test
-// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
-// this to report their results.  The user code should use the
-// assertion macros instead of calling this directly.
-// L < mutex_
-void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
-                                 const char* file_name,
-                                 int line_number,
-                                 const internal::String& message,
-                                 const internal::String& os_stack_trace) {
-  Message msg;
-  msg << message;
-
-  internal::MutexLock lock(&mutex_);
-  if (impl_->gtest_trace_stack().size() > 0) {
-    msg << "\n" << GTEST_NAME_ << " trace:";
-
-    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
-         i > 0; --i) {
-      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
-      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
-          << " " << trace.message;
-    }
-  }
-
-  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
-    msg << internal::kStackTraceMarker << os_stack_trace;
-  }
-
-  const TestPartResult result =
-    TestPartResult(result_type, file_name, line_number,
-                   msg.GetString().c_str());
-  impl_->GetTestPartResultReporterForCurrentThread()->
-      ReportTestPartResult(result);
-
-  if (result_type != TestPartResult::kSuccess) {
-    // gtest_break_on_failure takes precedence over
-    // gtest_throw_on_failure.  This allows a user to set the latter
-    // in the code (perhaps in order to use Google Test assertions
-    // with another testing framework) and specify the former on the
-    // command line for debugging.
-    if (GTEST_FLAG(break_on_failure)) {
-#if GTEST_OS_WINDOWS
-      // Using DebugBreak on Windows allows gtest to still break into a debugger
-      // when a failure happens and both the --gtest_break_on_failure and
-      // the --gtest_catch_exceptions flags are specified.
-      DebugBreak();
-#else
-      abort();
-#endif  // GTEST_OS_WINDOWS
-    } else if (GTEST_FLAG(throw_on_failure)) {
-#if GTEST_HAS_EXCEPTIONS
-      throw GoogleTestFailureException(result);
-#else
-      // We cannot call abort() as it generates a pop-up in debug mode
-      // that cannot be suppressed in VC 7.1 or below.
-      exit(1);
-#endif
-    }
-  }
-}
-
-// Creates and adds a property to the current TestResult. If a property matching
-// the supplied value already exists, updates its value instead.
-void UnitTest::RecordPropertyForCurrentTest(const char* key,
-                                            const char* value) {
-  const TestProperty test_property(key, value);
-  impl_->current_test_result()->RecordProperty(test_property);
-}
-
-// Runs all tests in this UnitTest object and prints the result.
-// Returns 0 if successful, or 1 otherwise.
-//
-// We don't protect this under mutex_, as we only support calling it
-// from the main thread.
-int UnitTest::Run() {
-  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
-  // used for the duration of the program.
-  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
-
-#if GTEST_HAS_SEH
-  const bool in_death_test_child_process =
-      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
-
-  // Either the user wants Google Test to catch exceptions thrown by the
-  // tests or this is executing in the context of death test child
-  // process. In either case the user does not want to see pop-up dialogs
-  // about crashes - they are expected.
-  if (impl()->catch_exceptions() || in_death_test_child_process) {
-
-# if !GTEST_OS_WINDOWS_MOBILE
-    // SetErrorMode doesn't exist on CE.
-    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
-                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
-# endif  // !GTEST_OS_WINDOWS_MOBILE
-
-# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
-    // Death test children can be terminated with _abort().  On Windows,
-    // _abort() can show a dialog with a warning message.  This forces the
-    // abort message to go to stderr instead.
-    _set_error_mode(_OUT_TO_STDERR);
-# endif
-
-# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
-    // In the debug version, Visual Studio pops up a separate dialog
-    // offering a choice to debug the aborted program. We need to suppress
-    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
-    // executed. Google Test will notify the user of any unexpected
-    // failure via stderr.
-    //
-    // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
-    // Users of prior VC versions shall suffer the agony and pain of
-    // clicking through the countless debug dialogs.
-    // TODO(vladl@google.com): find a way to suppress the abort dialog() in the
-    // debug mode when compiled with VC 7.1 or lower.
-    if (!GTEST_FLAG(break_on_failure))
-      _set_abort_behavior(
-          0x0,                                    // Clear the following flags:
-          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
-# endif
-
-  }
-#endif  // GTEST_HAS_SEH
-
-  return internal::HandleExceptionsInMethodIfSupported(
-      impl(),
-      &internal::UnitTestImpl::RunAllTests,
-      "auxiliary test code (environments or event listeners)") ? 0 : 1;
-}
-
-// Returns the working directory when the first TEST() or TEST_F() was
-// executed.
-const char* UnitTest::original_working_dir() const {
-  return impl_->original_working_dir_.c_str();
-}
-
-// Returns the TestCase object for the test that's currently running,
-// or NULL if no test is running.
-// L < mutex_
-const TestCase* UnitTest::current_test_case() const {
-  internal::MutexLock lock(&mutex_);
-  return impl_->current_test_case();
-}
-
-// Returns the TestInfo object for the test that's currently running,
-// or NULL if no test is running.
-// L < mutex_
-const TestInfo* UnitTest::current_test_info() const {
-  internal::MutexLock lock(&mutex_);
-  return impl_->current_test_info();
-}
-
-// Returns the random seed used at the start of the current test run.
-int UnitTest::random_seed() const { return impl_->random_seed(); }
-
-#if GTEST_HAS_PARAM_TEST
-// Returns ParameterizedTestCaseRegistry object used to keep track of
-// value-parameterized tests and instantiate and register them.
-// L < mutex_
-internal::ParameterizedTestCaseRegistry&
-    UnitTest::parameterized_test_registry() {
-  return impl_->parameterized_test_registry();
-}
-#endif  // GTEST_HAS_PARAM_TEST
-
-// Creates an empty UnitTest.
-UnitTest::UnitTest() {
-  impl_ = new internal::UnitTestImpl(this);
-}
-
-// Destructor of UnitTest.
-UnitTest::~UnitTest() {
-  delete impl_;
-}
-
-// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
-// Google Test trace stack.
-// L < mutex_
-void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) {
-  internal::MutexLock lock(&mutex_);
-  impl_->gtest_trace_stack().push_back(trace);
-}
-
-// Pops a trace from the per-thread Google Test trace stack.
-// L < mutex_
-void UnitTest::PopGTestTrace() {
-  internal::MutexLock lock(&mutex_);
-  impl_->gtest_trace_stack().pop_back();
-}
-
-namespace internal {
-
-UnitTestImpl::UnitTestImpl(UnitTest* parent)
-    : parent_(parent),
-#ifdef _MSC_VER
-# pragma warning(push)                    // Saves the current warning state.
-# pragma warning(disable:4355)            // Temporarily disables warning 4355
-                                         // (using this in initializer).
-      default_global_test_part_result_reporter_(this),
-      default_per_thread_test_part_result_reporter_(this),
-# pragma warning(pop)                     // Restores the warning state again.
-#else
-      default_global_test_part_result_reporter_(this),
-      default_per_thread_test_part_result_reporter_(this),
-#endif  // _MSC_VER
-      global_test_part_result_repoter_(
-          &default_global_test_part_result_reporter_),
-      per_thread_test_part_result_reporter_(
-          &default_per_thread_test_part_result_reporter_),
-#if GTEST_HAS_PARAM_TEST
-      parameterized_test_registry_(),
-      parameterized_tests_registered_(false),
-#endif  // GTEST_HAS_PARAM_TEST
-      last_death_test_case_(-1),
-      current_test_case_(NULL),
-      current_test_info_(NULL),
-      ad_hoc_test_result_(),
-      os_stack_trace_getter_(NULL),
-      post_flag_parse_init_performed_(false),
-      random_seed_(0),  // Will be overridden by the flag before first use.
-      random_(0),  // Will be reseeded before first use.
-      elapsed_time_(0),
-#if GTEST_HAS_DEATH_TEST
-      internal_run_death_test_flag_(NULL),
-      death_test_factory_(new DefaultDeathTestFactory),
-#endif
-      // Will be overridden by the flag before first use.
-      catch_exceptions_(false) {
-  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
-}
-
-UnitTestImpl::~UnitTestImpl() {
-  // Deletes every TestCase.
-  ForEach(test_cases_, internal::Delete<TestCase>);
-
-  // Deletes every Environment.
-  ForEach(environments_, internal::Delete<Environment>);
-
-  delete os_stack_trace_getter_;
-}
-
-#if GTEST_HAS_DEATH_TEST
-// Disables event forwarding if the control is currently in a death test
-// subprocess. Must not be called before InitGoogleTest.
-void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
-  if (internal_run_death_test_flag_.get() != NULL)
-    listeners()->SuppressEventForwarding();
-}
-#endif  // GTEST_HAS_DEATH_TEST
-
-// Initializes event listeners performing XML output as specified by
-// UnitTestOptions. Must not be called before InitGoogleTest.
-void UnitTestImpl::ConfigureXmlOutput() {
-  const String& output_format = UnitTestOptions::GetOutputFormat();
-  if (output_format == "xml") {
-    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
-        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
-  } else if (output_format != "") {
-    printf("WARNING: unrecognized output format \"%s\" ignored.\n",
-           output_format.c_str());
-    fflush(stdout);
-  }
-}
-
-#if GTEST_CAN_STREAM_RESULTS_
-// Initializes event listeners for streaming test results in String form.
-// Must not be called before InitGoogleTest.
-void UnitTestImpl::ConfigureStreamingOutput() {
-  const string& target = GTEST_FLAG(stream_result_to);
-  if (!target.empty()) {
-    const size_t pos = target.find(':');
-    if (pos != string::npos) {
-      listeners()->Append(new StreamingListener(target.substr(0, pos),
-                                                target.substr(pos+1)));
-    } else {
-      printf("WARNING: unrecognized streaming target \"%s\" ignored.\n",
-             target.c_str());
-      fflush(stdout);
-    }
-  }
-}
-#endif  // GTEST_CAN_STREAM_RESULTS_
-
-// Performs initialization dependent upon flag values obtained in
-// ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
-// ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
-// this function is also called from RunAllTests.  Since this function can be
-// called more than once, it has to be idempotent.
-void UnitTestImpl::PostFlagParsingInit() {
-  // Ensures that this function does not execute more than once.
-  if (!post_flag_parse_init_performed_) {
-    post_flag_parse_init_performed_ = true;
-
-#if GTEST_HAS_DEATH_TEST
-    InitDeathTestSubprocessControlInfo();
-    SuppressTestEventsIfInSubprocess();
-#endif  // GTEST_HAS_DEATH_TEST
-
-    // Registers parameterized tests. This makes parameterized tests
-    // available to the UnitTest reflection API without running
-    // RUN_ALL_TESTS.
-    RegisterParameterizedTests();
-
-    // Configures listeners for XML output. This makes it possible for users
-    // to shut down the default XML output before invoking RUN_ALL_TESTS.
-    ConfigureXmlOutput();
-
-#if GTEST_CAN_STREAM_RESULTS_
-    // Configures listeners for streaming test results to the specified server.
-    ConfigureStreamingOutput();
-#endif  // GTEST_CAN_STREAM_RESULTS_
-  }
-}
-
-// A predicate that checks the name of a TestCase against a known
-// value.
-//
-// This is used for implementation of the UnitTest class only.  We put
-// it in the anonymous namespace to prevent polluting the outer
-// namespace.
-//
-// TestCaseNameIs is copyable.
-class TestCaseNameIs {
- public:
-  // Constructor.
-  explicit TestCaseNameIs(const String& name)
-      : name_(name) {}
-
-  // Returns true iff the name of test_case matches name_.
-  bool operator()(const TestCase* test_case) const {
-    return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
-  }
-
- private:
-  String name_;
-};
-
-// Finds and returns a TestCase with the given name.  If one doesn't
-// exist, creates one and returns it.  It's the CALLER'S
-// RESPONSIBILITY to ensure that this function is only called WHEN THE
-// TESTS ARE NOT SHUFFLED.
-//
-// Arguments:
-//
-//   test_case_name: name of the test case
-//   type_param:     the name of the test case's type parameter, or NULL if
-//                   this is not a typed or a type-parameterized test case.
-//   set_up_tc:      pointer to the function that sets up the test case
-//   tear_down_tc:   pointer to the function that tears down the test case
-TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
-                                    const char* type_param,
-                                    Test::SetUpTestCaseFunc set_up_tc,
-                                    Test::TearDownTestCaseFunc tear_down_tc) {
-  // Can we find a TestCase with the given name?
-  const std::vector<TestCase*>::const_iterator test_case =
-      std::find_if(test_cases_.begin(), test_cases_.end(),
-                   TestCaseNameIs(test_case_name));
-
-  if (test_case != test_cases_.end())
-    return *test_case;
-
-  // No.  Let's create one.
-  TestCase* const new_test_case =
-      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
-
-  // Is this a death test case?
-  if (internal::UnitTestOptions::MatchesFilter(String(test_case_name),
-                                               kDeathTestCaseFilter)) {
-    // Yes.  Inserts the test case after the last death test case
-    // defined so far.  This only works when the test cases haven't
-    // been shuffled.  Otherwise we may end up running a death test
-    // after a non-death test.
-    ++last_death_test_case_;
-    test_cases_.insert(test_cases_.begin() + last_death_test_case_,
-                       new_test_case);
-  } else {
-    // No.  Appends to the end of the list.
-    test_cases_.push_back(new_test_case);
-  }
-
-  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
-  return new_test_case;
-}
-
-// Helpers for setting up / tearing down the given environment.  They
-// are for use in the ForEach() function.
-static void SetUpEnvironment(Environment* env) { env->SetUp(); }
-static void TearDownEnvironment(Environment* env) { env->TearDown(); }
-
-// Runs all tests in this UnitTest object, prints the result, and
-// returns true if all tests are successful.  If any exception is
-// thrown during a test, the test is considered to be failed, but the
-// rest of the tests will still be run.
-//
-// When parameterized tests are enabled, it expands and registers
-// parameterized tests first in RegisterParameterizedTests().
-// All other functions called from RunAllTests() may safely assume that
-// parameterized tests are ready to be counted and run.
-bool UnitTestImpl::RunAllTests() {
-  // Makes sure InitGoogleTest() was called.
-  if (!GTestIsInitialized()) {
-    printf("%s",
-           "\nThis test program did NOT call ::testing::InitGoogleTest "
-           "before calling RUN_ALL_TESTS().  Please fix it.\n");
-    return false;
-  }
-
-  // Do not run any test if the --help flag was specified.
-  if (g_help_flag)
-    return true;
-
-  // Repeats the call to the post-flag parsing initialization in case the
-  // user didn't call InitGoogleTest.
-  PostFlagParsingInit();
-
-  // Even if sharding is not on, test runners may want to use the
-  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
-  // protocol.
-  internal::WriteToShardStatusFileIfNeeded();
-
-  // True iff we are in a subprocess for running a thread-safe-style
-  // death test.
-  bool in_subprocess_for_death_test = false;
-
-#if GTEST_HAS_DEATH_TEST
-  in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL);
-#endif  // GTEST_HAS_DEATH_TEST
-
-  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
-                                        in_subprocess_for_death_test);
-
-  // Compares the full test names with the filter to decide which
-  // tests to run.
-  const bool has_tests_to_run = FilterTests(should_shard
-                                              ? HONOR_SHARDING_PROTOCOL
-                                              : IGNORE_SHARDING_PROTOCOL) > 0;
-
-  // Lists the tests and exits if the --gtest_list_tests flag was specified.
-  if (GTEST_FLAG(list_tests)) {
-    // This must be called *after* FilterTests() has been called.
-    ListTestsMatchingFilter();
-    return true;
-  }
-
-  random_seed_ = GTEST_FLAG(shuffle) ?
-      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
-
-  // True iff at least one test has failed.
-  bool failed = false;
-
-  TestEventListener* repeater = listeners()->repeater();
-
-  repeater->OnTestProgramStart(*parent_);
-
-  // How many times to repeat the tests?  We don't want to repeat them
-  // when we are inside the subprocess of a death test.
-  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
-  // Repeats forever if the repeat count is negative.
-  const bool forever = repeat < 0;
-  for (int i = 0; forever || i != repeat; i++) {
-    // We want to preserve failures generated by ad-hoc test
-    // assertions executed before RUN_ALL_TESTS().
-    ClearNonAdHocTestResult();
-
-    const TimeInMillis start = GetTimeInMillis();
-
-    // Shuffles test cases and tests if requested.
-    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
-      random()->Reseed(random_seed_);
-      // This should be done before calling OnTestIterationStart(),
-      // such that a test event listener can see the actual test order
-      // in the event.
-      ShuffleTests();
-    }
-
-    // Tells the unit test event listeners that the tests are about to start.
-    repeater->OnTestIterationStart(*parent_, i);
-
-    // Runs each test case if there is at least one test to run.
-    if (has_tests_to_run) {
-      // Sets up all environments beforehand.
-      repeater->OnEnvironmentsSetUpStart(*parent_);
-      ForEach(environments_, SetUpEnvironment);
-      repeater->OnEnvironmentsSetUpEnd(*parent_);
-
-      // Runs the tests only if there was no fatal failure during global
-      // set-up.
-      if (!Test::HasFatalFailure()) {
-        for (int test_index = 0; test_index < total_test_case_count();
-             test_index++) {
-          GetMutableTestCase(test_index)->Run();
-        }
-      }
-
-      // Tears down all environments in reverse order afterwards.
-      repeater->OnEnvironmentsTearDownStart(*parent_);
-      std::for_each(environments_.rbegin(), environments_.rend(),
-                    TearDownEnvironment);
-      repeater->OnEnvironmentsTearDownEnd(*parent_);
-    }
-
-    elapsed_time_ = GetTimeInMillis() - start;
-
-    // Tells the unit test event listener that the tests have just finished.
-    repeater->OnTestIterationEnd(*parent_, i);
-
-    // Gets the result and clears it.
-    if (!Passed()) {
-      failed = true;
-    }
-
-    // Restores the original test order after the iteration.  This
-    // allows the user to quickly repro a failure that happens in the
-    // N-th iteration without repeating the first (N - 1) iterations.
-    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
-    // case the user somehow changes the value of the flag somewhere
-    // (it's always safe to unshuffle the tests).
-    UnshuffleTests();
-
-    if (GTEST_FLAG(shuffle)) {
-      // Picks a new random seed for each iteration.
-      random_seed_ = GetNextRandomSeed(random_seed_);
-    }
-  }
-
-  repeater->OnTestProgramEnd(*parent_);
-
-  return !failed;
-}
-
-// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
-// if the variable is present. If a file already exists at this location, this
-// function will write over it. If the variable is present, but the file cannot
-// be created, prints an error and exits.
-void WriteToShardStatusFileIfNeeded() {
-  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
-  if (test_shard_file != NULL) {
-    FILE* const file = posix::FOpen(test_shard_file, "w");
-    if (file == NULL) {
-      ColoredPrintf(COLOR_RED,
-                    "Could not write to the test shard status file \"%s\" "
-                    "specified by the %s environment variable.\n",
-                    test_shard_file, kTestShardStatusFile);
-      fflush(stdout);
-      exit(EXIT_FAILURE);
-    }
-    fclose(file);
-  }
-}
-
-// Checks whether sharding is enabled by examining the relevant
-// environment variable values. If the variables are present,
-// but inconsistent (i.e., shard_index >= total_shards), prints
-// an error and exits. If in_subprocess_for_death_test, sharding is
-// disabled because it must only be applied to the original test
-// process. Otherwise, we could filter out death tests we intended to execute.
-bool ShouldShard(const char* total_shards_env,
-                 const char* shard_index_env,
-                 bool in_subprocess_for_death_test) {
-  if (in_subprocess_for_death_test) {
-    return false;
-  }
-
-  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
-  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
-
-  if (total_shards == -1 && shard_index == -1) {
-    return false;
-  } else if (total_shards == -1 && shard_index != -1) {
-    const Message msg = Message()
-      << "Invalid environment variables: you have "
-      << kTestShardIndex << " = " << shard_index
-      << ", but have left " << kTestTotalShards << " unset.\n";
-    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
-    fflush(stdout);
-    exit(EXIT_FAILURE);
-  } else if (total_shards != -1 && shard_index == -1) {
-    const Message msg = Message()
-      << "Invalid environment variables: you have "
-      << kTestTotalShards << " = " << total_shards
-      << ", but have left " << kTestShardIndex << " unset.\n";
-    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
-    fflush(stdout);
-    exit(EXIT_FAILURE);
-  } else if (shard_index < 0 || shard_index >= total_shards) {
-    const Message msg = Message()
-      << "Invalid environment variables: we require 0 <= "
-      << kTestShardIndex << " < " << kTestTotalShards
-      << ", but you have " << kTestShardIndex << "=" << shard_index
-      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
-    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
-    fflush(stdout);
-    exit(EXIT_FAILURE);
-  }
-
-  return total_shards > 1;
-}
-
-// Parses the environment variable var as an Int32. If it is unset,
-// returns default_val. If it is not an Int32, prints an error
-// and aborts.
-Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
-  const char* str_val = posix::GetEnv(var);
-  if (str_val == NULL) {
-    return default_val;
-  }
-
-  Int32 result;
-  if (!ParseInt32(Message() << "The value of environment variable " << var,
-                  str_val, &result)) {
-    exit(EXIT_FAILURE);
-  }
-  return result;
-}
-
-// Given the total number of shards, the shard index, and the test id,
-// returns true iff the test should be run on this shard. The test id is
-// some arbitrary but unique non-negative integer assigned to each test
-// method. Assumes that 0 <= shard_index < total_shards.
-bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
-  return (test_id % total_shards) == shard_index;
-}
-
-// Compares the name of each test with the user-specified filter to
-// decide whether the test should be run, then records the result in
-// each TestCase and TestInfo object.
-// If shard_tests == true, further filters tests based on sharding
-// variables in the environment - see
-// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide.
-// Returns the number of tests that should run.
-int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
-  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
-      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
-  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
-      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
-
-  // num_runnable_tests are the number of tests that will
-  // run across all shards (i.e., match filter and are not disabled).
-  // num_selected_tests are the number of tests to be run on
-  // this shard.
-  int num_runnable_tests = 0;
-  int num_selected_tests = 0;
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    TestCase* const test_case = test_cases_[i];
-    const String &test_case_name = test_case->name();
-    test_case->set_should_run(false);
-
-    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
-      TestInfo* const test_info = test_case->test_info_list()[j];
-      const String test_name(test_info->name());
-      // A test is disabled if test case name or test name matches
-      // kDisableTestFilter.
-      const bool is_disabled =
-          internal::UnitTestOptions::MatchesFilter(test_case_name,
-                                                   kDisableTestFilter) ||
-          internal::UnitTestOptions::MatchesFilter(test_name,
-                                                   kDisableTestFilter);
-      test_info->is_disabled_ = is_disabled;
-
-      const bool matches_filter =
-          internal::UnitTestOptions::FilterMatchesTest(test_case_name,
-                                                       test_name);
-      test_info->matches_filter_ = matches_filter;
-
-      const bool is_runnable =
-          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
-          matches_filter;
-
-      const bool is_selected = is_runnable &&
-          (shard_tests == IGNORE_SHARDING_PROTOCOL ||
-           ShouldRunTestOnShard(total_shards, shard_index,
-                                num_runnable_tests));
-
-      num_runnable_tests += is_runnable;
-      num_selected_tests += is_selected;
-
-      test_info->should_run_ = is_selected;
-      test_case->set_should_run(test_case->should_run() || is_selected);
-    }
-  }
-  return num_selected_tests;
-}
-
-// Prints the names of the tests matching the user-specified filter flag.
-void UnitTestImpl::ListTestsMatchingFilter() {
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    const TestCase* const test_case = test_cases_[i];
-    bool printed_test_case_name = false;
-
-    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
-      const TestInfo* const test_info =
-          test_case->test_info_list()[j];
-      if (test_info->matches_filter_) {
-        if (!printed_test_case_name) {
-          printed_test_case_name = true;
-          printf("%s.\n", test_case->name());
-        }
-        printf("  %s\n", test_info->name());
-      }
-    }
-  }
-  fflush(stdout);
-}
-
-// Sets the OS stack trace getter.
-//
-// Does nothing if the input and the current OS stack trace getter are
-// the same; otherwise, deletes the old getter and makes the input the
-// current getter.
-void UnitTestImpl::set_os_stack_trace_getter(
-    OsStackTraceGetterInterface* getter) {
-  if (os_stack_trace_getter_ != getter) {
-    delete os_stack_trace_getter_;
-    os_stack_trace_getter_ = getter;
-  }
-}
-
-// Returns the current OS stack trace getter if it is not NULL;
-// otherwise, creates an OsStackTraceGetter, makes it the current
-// getter, and returns it.
-OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
-  if (os_stack_trace_getter_ == NULL) {
-    os_stack_trace_getter_ = new OsStackTraceGetter;
-  }
-
-  return os_stack_trace_getter_;
-}
-
-// Returns the TestResult for the test that's currently running, or
-// the TestResult for the ad hoc test if no test is running.
-TestResult* UnitTestImpl::current_test_result() {
-  return current_test_info_ ?
-      &(current_test_info_->result_) : &ad_hoc_test_result_;
-}
-
-// Shuffles all test cases, and the tests within each test case,
-// making sure that death tests are still run first.
-void UnitTestImpl::ShuffleTests() {
-  // Shuffles the death test cases.
-  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
-
-  // Shuffles the non-death test cases.
-  ShuffleRange(random(), last_death_test_case_ + 1,
-               static_cast<int>(test_cases_.size()), &test_case_indices_);
-
-  // Shuffles the tests inside each test case.
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    test_cases_[i]->ShuffleTests(random());
-  }
-}
-
-// Restores the test cases and tests to their order before the first shuffle.
-void UnitTestImpl::UnshuffleTests() {
-  for (size_t i = 0; i < test_cases_.size(); i++) {
-    // Unshuffles the tests in each test case.
-    test_cases_[i]->UnshuffleTests();
-    // Resets the index of each test case.
-    test_case_indices_[i] = static_cast<int>(i);
-  }
-}
-
-// Returns the current OS stack trace as a String.
-//
-// The maximum number of stack frames to be included is specified by
-// the gtest_stack_trace_depth flag.  The skip_count parameter
-// specifies the number of top frames to be skipped, which doesn't
-// count against the number of frames to be included.
-//
-// For example, if Foo() calls Bar(), which in turn calls
-// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
-// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-String GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
-                                       int skip_count) {
-  // We pass skip_count + 1 to skip this wrapper function in addition
-  // to what the user really wants to skip.
-  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
-}
-
-// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
-// suppress unreachable code warnings.
-namespace {
-class ClassUniqueToAlwaysTrue {};
-}
-
-bool IsTrue(bool condition) { return condition; }
-
-bool AlwaysTrue() {
-#if GTEST_HAS_EXCEPTIONS
-  // This condition is always false so AlwaysTrue() never actually throws,
-  // but it makes the compiler think that it may throw.
-  if (IsTrue(false))
-    throw ClassUniqueToAlwaysTrue();
-#endif  // GTEST_HAS_EXCEPTIONS
-  return true;
-}
-
-// If *pstr starts with the given prefix, modifies *pstr to be right
-// past the prefix and returns true; otherwise leaves *pstr unchanged
-// and returns false.  None of pstr, *pstr, and prefix can be NULL.
-bool SkipPrefix(const char* prefix, const char** pstr) {
-  const size_t prefix_len = strlen(prefix);
-  if (strncmp(*pstr, prefix, prefix_len) == 0) {
-    *pstr += prefix_len;
-    return true;
-  }
-  return false;
-}
-
-// Parses a string as a command line flag.  The string should have
-// the format "--flag=value".  When def_optional is true, the "=value"
-// part can be omitted.
-//
-// Returns the value of the flag, or NULL if the parsing failed.
-const char* ParseFlagValue(const char* str,
-                           const char* flag,
-                           bool def_optional) {
-  // str and flag must not be NULL.
-  if (str == NULL || flag == NULL) return NULL;
-
-  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
-  const String flag_str = String::Format("--%s%s", GTEST_FLAG_PREFIX_, flag);
-  const size_t flag_len = flag_str.length();
-  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL;
-
-  // Skips the flag name.
-  const char* flag_end = str + flag_len;
-
-  // When def_optional is true, it's OK to not have a "=value" part.
-  if (def_optional && (flag_end[0] == '\0')) {
-    return flag_end;
-  }
-
-  // If def_optional is true and there are more characters after the
-  // flag name, or if def_optional is false, there must be a '=' after
-  // the flag name.
-  if (flag_end[0] != '=') return NULL;
-
-  // Returns the string after "=".
-  return flag_end + 1;
-}
-
-// Parses a string for a bool flag, in the form of either
-// "--flag=value" or "--flag".
-//
-// In the former case, the value is taken as true as long as it does
-// not start with '0', 'f', or 'F'.
-//
-// In the latter case, the value is taken as true.
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
-  // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, true);
-
-  // Aborts if the parsing failed.
-  if (value_str == NULL) return false;
-
-  // Converts the string value to a bool.
-  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
-  return true;
-}
-
-// Parses a string for an Int32 flag, in the form of
-// "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
-  // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, false);
-
-  // Aborts if the parsing failed.
-  if (value_str == NULL) return false;
-
-  // Sets *value to the value of the flag.
-  return ParseInt32(Message() << "The value of flag --" << flag,
-                    value_str, value);
-}
-
-// Parses a string for a string flag, in the form of
-// "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseStringFlag(const char* str, const char* flag, String* value) {
-  // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, false);
-
-  // Aborts if the parsing failed.
-  if (value_str == NULL) return false;
-
-  // Sets *value to the value of the flag.
-  *value = value_str;
-  return true;
-}
-
-// Determines whether a string has a prefix that Google Test uses for its
-// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
-// If Google Test detects that a command line flag has its prefix but is not
-// recognized, it will print its help message. Flags starting with
-// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
-// internal flags and do not trigger the help message.
-static bool HasGoogleTestFlagPrefix(const char* str) {
-  return (SkipPrefix("--", &str) ||
-          SkipPrefix("-", &str) ||
-          SkipPrefix("/", &str)) &&
-         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
-         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
-          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
-}
-
-// Prints a string containing code-encoded text.  The following escape
-// sequences can be used in the string to control the text color:
-//
-//   @@    prints a single '@' character.
-//   @R    changes the color to red.
-//   @G    changes the color to green.
-//   @Y    changes the color to yellow.
-//   @D    changes to the default terminal text color.
-//
-// TODO(wan@google.com): Write tests for this once we add stdout
-// capturing to Google Test.
-static void PrintColorEncoded(const char* str) {
-  GTestColor color = COLOR_DEFAULT;  // The current color.
-
-  // Conceptually, we split the string into segments divided by escape
-  // sequences.  Then we print one segment at a time.  At the end of
-  // each iteration, the str pointer advances to the beginning of the
-  // next segment.
-  for (;;) {
-    const char* p = strchr(str, '@');
-    if (p == NULL) {
-      ColoredPrintf(color, "%s", str);
-      return;
-    }
-
-    ColoredPrintf(color, "%s", String(str, p - str).c_str());
-
-    const char ch = p[1];
-    str = p + 2;
-    if (ch == '@') {
-      ColoredPrintf(color, "@");
-    } else if (ch == 'D') {
-      color = COLOR_DEFAULT;
-    } else if (ch == 'R') {
-      color = COLOR_RED;
-    } else if (ch == 'G') {
-      color = COLOR_GREEN;
-    } else if (ch == 'Y') {
-      color = COLOR_YELLOW;
-    } else {
-      --str;
-    }
-  }
-}
-
-static const char kColorEncodedHelpMessage[] =
-"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
-"following command line flags to control its behavior:\n"
-"\n"
-"Test Selection:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
-"      List the names of all tests instead of running them. The name of\n"
-"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
-"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
-    "[@G-@YNEGATIVE_PATTERNS]@D\n"
-"      Run only the tests whose name matches one of the positive patterns but\n"
-"      none of the negative patterns. '?' matches any single character; '*'\n"
-"      matches any substring; ':' separates two patterns.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
-"      Run all disabled tests too.\n"
-"\n"
-"Test Execution:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
-"      Run the tests repeatedly; use a negative count to repeat forever.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
-"      Randomize tests' orders on every iteration.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
-"      Random number seed to use for shuffling test orders (between 1 and\n"
-"      99999, or 0 to use a seed based on the current time).\n"
-"\n"
-"Test Output:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
-"      Enable/disable colored output. The default is @Gauto@D.\n"
-"  -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
-"      Don't print the elapsed time of each test.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G"
-    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
-"      Generate an XML report in the given directory or with the given file\n"
-"      name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
-#if GTEST_CAN_STREAM_RESULTS_
-"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
-"      Stream test results to the given server.\n"
-#endif  // GTEST_CAN_STREAM_RESULTS_
-"\n"
-"Assertion Behavior:\n"
-#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
-"      Set the default death test style.\n"
-#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
-"      Turn assertion failures into debugger break-points.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
-"      Turn assertion failures into C++ exceptions.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
-"      Do not report exceptions as test failures. Instead, allow them\n"
-"      to crash the program or throw a pop-up (on Windows).\n"
-"\n"
-"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
-    "the corresponding\n"
-"environment variable of a flag (all letters in upper-case). For example, to\n"
-"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
-    "color=no@D or set\n"
-"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
-"\n"
-"For more information, please read the " GTEST_NAME_ " documentation at\n"
-"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
-"(not one in your own code or tests), please report it to\n"
-"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test.  The type parameter CharType can be
-// instantiated to either char or wchar_t.
-template <typename CharType>
-void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
-  for (int i = 1; i < *argc; i++) {
-    const String arg_string = StreamableToString(argv[i]);
-    const char* const arg = arg_string.c_str();
-
-    using internal::ParseBoolFlag;
-    using internal::ParseInt32Flag;
-    using internal::ParseStringFlag;
-
-    // Do we see a Google Test flag?
-    if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
-                      &GTEST_FLAG(also_run_disabled_tests)) ||
-        ParseBoolFlag(arg, kBreakOnFailureFlag,
-                      &GTEST_FLAG(break_on_failure)) ||
-        ParseBoolFlag(arg, kCatchExceptionsFlag,
-                      &GTEST_FLAG(catch_exceptions)) ||
-        ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
-        ParseStringFlag(arg, kDeathTestStyleFlag,
-                        &GTEST_FLAG(death_test_style)) ||
-        ParseBoolFlag(arg, kDeathTestUseFork,
-                      &GTEST_FLAG(death_test_use_fork)) ||
-        ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
-        ParseStringFlag(arg, kInternalRunDeathTestFlag,
-                        &GTEST_FLAG(internal_run_death_test)) ||
-        ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
-        ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
-        ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
-        ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
-        ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
-        ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
-        ParseInt32Flag(arg, kStackTraceDepthFlag,
-                       &GTEST_FLAG(stack_trace_depth)) ||
-        ParseStringFlag(arg, kStreamResultToFlag,
-                        &GTEST_FLAG(stream_result_to)) ||
-        ParseBoolFlag(arg, kThrowOnFailureFlag,
-                      &GTEST_FLAG(throw_on_failure))
-        ) {
-      // Yes.  Shift the remainder of the argv list left by one.  Note
-      // that argv has (*argc + 1) elements, the last one always being
-      // NULL.  The following loop moves the trailing NULL element as
-      // well.
-      for (int j = i; j != *argc; j++) {
-        argv[j] = argv[j + 1];
-      }
-
-      // Decrements the argument count.
-      (*argc)--;
-
-      // We also need to decrement the iterator as we just removed
-      // an element.
-      i--;
-    } else if (arg_string == "--help" || arg_string == "-h" ||
-               arg_string == "-?" || arg_string == "/?" ||
-               HasGoogleTestFlagPrefix(arg)) {
-      // Both help flag and unrecognized Google Test flags (excluding
-      // internal ones) trigger help display.
-      g_help_flag = true;
-    }
-  }
-
-  if (g_help_flag) {
-    // We print the help here instead of in RUN_ALL_TESTS(), as the
-    // latter may not be called at all if the user is using Google
-    // Test with another testing framework.
-    PrintColorEncoded(kColorEncodedHelpMessage);
-  }
-}
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test.
-void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
-  ParseGoogleTestFlagsOnlyImpl(argc, argv);
-}
-void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
-  ParseGoogleTestFlagsOnlyImpl(argc, argv);
-}
-
-// The internal implementation of InitGoogleTest().
-//
-// The type parameter CharType can be instantiated to either char or
-// wchar_t.
-template <typename CharType>
-void InitGoogleTestImpl(int* argc, CharType** argv) {
-  g_init_gtest_count++;
-
-  // We don't want to run the initialization code twice.
-  if (g_init_gtest_count != 1) return;
-
-  if (*argc <= 0) return;
-
-  internal::g_executable_path = internal::StreamableToString(argv[0]);
-
-#if GTEST_HAS_DEATH_TEST
-
-  g_argvs.clear();
-  for (int i = 0; i != *argc; i++) {
-    g_argvs.push_back(StreamableToString(argv[i]));
-  }
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-  ParseGoogleTestFlagsOnly(argc, argv);
-  GetUnitTestImpl()->PostFlagParsingInit();
-}
-
-}  // namespace internal
-
-// Initializes Google Test.  This must be called before calling
-// RUN_ALL_TESTS().  In particular, it parses a command line for the
-// flags that Google Test recognizes.  Whenever a Google Test flag is
-// seen, it is removed from argv, and *argc is decremented.
-//
-// No value is returned.  Instead, the Google Test flag variables are
-// updated.
-//
-// Calling the function for the second time has no user-visible effect.
-void InitGoogleTest(int* argc, char** argv) {
-  internal::InitGoogleTestImpl(argc, argv);
-}
-
-// This overloaded version can be used in Windows programs compiled in
-// UNICODE mode.
-void InitGoogleTest(int* argc, wchar_t** argv) {
-  internal::InitGoogleTestImpl(argc, argv);
-}
-
-}  // namespace testing
diff --git a/utils/unittest/googletest/include/gtest/gtest-test-part.h b/utils/unittest/googletest/include/gtest/gtest-test-part.h
index 8aeea14..98e8b84 100644
--- a/utils/unittest/googletest/include/gtest/gtest-test-part.h
+++ b/utils/unittest/googletest/include/gtest/gtest-test-part.h
@@ -142,7 +142,7 @@ class GTEST_API_ TestPartResultArray {
 // This interface knows how to report a test part result.
 class TestPartResultReporterInterface {
  public:
-  virtual ~TestPartResultReporterInterface() {}
+  virtual ~TestPartResultReporterInterface();
 
   virtual void ReportTestPartResult(const TestPartResult& result) = 0;
 };
diff --git a/utils/unittest/googletest/include/gtest/gtest.h b/utils/unittest/googletest/include/gtest/gtest.h
index 1734c44..07ed92b 100644
--- a/utils/unittest/googletest/include/gtest/gtest.h
+++ b/utils/unittest/googletest/include/gtest/gtest.h
@@ -910,7 +910,7 @@ class GTEST_API_ TestCase {
 class Environment {
  public:
   // The d'tor is virtual as we need to subclass Environment.
-  virtual ~Environment() {}
+  virtual ~Environment();
 
   // Override this to define how to set up the environment.
   virtual void SetUp() {}
@@ -928,7 +928,7 @@ class Environment {
 // the order the corresponding events are fired.
 class TestEventListener {
  public:
-  virtual ~TestEventListener() {}
+  virtual ~TestEventListener();
 
   // Fired before any test activity starts.
   virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
@@ -980,6 +980,7 @@ class TestEventListener {
 // comments about each method please see the definition of TestEventListener
 // above.
 class EmptyTestEventListener : public TestEventListener {
+  virtual void anchor();
  public:
   virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
   virtual void OnTestIterationStart(const UnitTest& /*unit_test*/,
diff --git a/utils/unittest/googletest/include/gtest/internal/gtest-death-test-internal.h b/utils/unittest/googletest/include/gtest/internal/gtest-death-test-internal.h
index 7bac2bd..8d53c45 100644
--- a/utils/unittest/googletest/include/gtest/internal/gtest-death-test-internal.h
+++ b/utils/unittest/googletest/include/gtest/internal/gtest-death-test-internal.h
@@ -79,7 +79,7 @@ class GTEST_API_ DeathTest {
   static bool Create(const char* statement, const RE* regex,
                      const char* file, int line, DeathTest** test);
   DeathTest();
-  virtual ~DeathTest() { }
+  virtual ~DeathTest();
 
   // A helper class that aborts a death test when it's deleted.
   class ReturnSentinel {
@@ -139,7 +139,7 @@ class GTEST_API_ DeathTest {
 // Factory interface for death tests.  May be mocked out for testing.
 class DeathTestFactory {
  public:
-  virtual ~DeathTestFactory() { }
+  virtual ~DeathTestFactory();
   virtual bool Create(const char* statement, const RE* regex,
                       const char* file, int line, DeathTest** test) = 0;
 };
diff --git a/utils/unittest/googletest/include/gtest/internal/gtest-internal-inl.h b/utils/unittest/googletest/include/gtest/internal/gtest-internal-inl.h
deleted file mode 100644
index 6554cfc..0000000
--- a/utils/unittest/googletest/include/gtest/internal/gtest-internal-inl.h
+++ /dev/null
@@ -1,1037 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Utility functions and classes used by the Google C++ testing framework.
-//
-// Author: wan@google.com (Zhanyong Wan)
-//
-// This file contains purely Google Test's internal implementation.  Please
-// DO NOT #INCLUDE IT IN A USER PROGRAM.
-
-#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
-#define GTEST_SRC_GTEST_INTERNAL_INL_H_
-
-// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is
-// part of Google Test's implementation; otherwise it's undefined.
-#if !GTEST_IMPLEMENTATION_
-// A user is trying to include this from his code - just say no.
-# error "gtest-internal-inl.h is part of Google Test's internal implementation."
-# error "It must not be included except by Google Test itself."
-#endif  // GTEST_IMPLEMENTATION_
-
-#ifndef _WIN32_WCE
-# include <errno.h>
-#endif  // !_WIN32_WCE
-#include <stddef.h>
-#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
-#include <string.h>  // For memmove.
-
-#include <algorithm>
-#include <string>
-#include <vector>
-
-#include "gtest/internal/gtest-port.h"
-
-#if GTEST_OS_WINDOWS
-# include <windows.h>  // NOLINT
-#endif  // GTEST_OS_WINDOWS
-
-#include "gtest/gtest.h"  // NOLINT
-#include "gtest/gtest-spi.h"
-
-namespace testing {
-
-// Declares the flags.
-//
-// We don't want the users to modify this flag in the code, but want
-// Google Test's own unit tests to be able to access it. Therefore we
-// declare it here as opposed to in gtest.h.
-GTEST_DECLARE_bool_(death_test_use_fork);
-
-namespace internal {
-
-// The value of GetTestTypeId() as seen from within the Google Test
-// library.  This is solely for testing GetTestTypeId().
-GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
-
-// Names of the flags (needed for parsing Google Test flags).
-const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
-const char kBreakOnFailureFlag[] = "break_on_failure";
-const char kCatchExceptionsFlag[] = "catch_exceptions";
-const char kColorFlag[] = "color";
-const char kFilterFlag[] = "filter";
-const char kListTestsFlag[] = "list_tests";
-const char kOutputFlag[] = "output";
-const char kPrintTimeFlag[] = "print_time";
-const char kRandomSeedFlag[] = "random_seed";
-const char kRepeatFlag[] = "repeat";
-const char kShuffleFlag[] = "shuffle";
-const char kStackTraceDepthFlag[] = "stack_trace_depth";
-const char kStreamResultToFlag[] = "stream_result_to";
-const char kThrowOnFailureFlag[] = "throw_on_failure";
-
-// A valid random seed must be in [1, kMaxRandomSeed].
-const int kMaxRandomSeed = 99999;
-
-// g_help_flag is true iff the --help flag or an equivalent form is
-// specified on the command line.
-GTEST_API_ extern bool g_help_flag;
-
-// Returns the current time in milliseconds.
-GTEST_API_ TimeInMillis GetTimeInMillis();
-
-// Returns true iff Google Test should use colors in the output.
-GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
-
-// Formats the given time in milliseconds as seconds.
-GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
-
-// Parses a string for an Int32 flag, in the form of "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-GTEST_API_ bool ParseInt32Flag(
-    const char* str, const char* flag, Int32* value);
-
-// Returns a random seed in range [1, kMaxRandomSeed] based on the
-// given --gtest_random_seed flag value.
-inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
-  const unsigned int raw_seed = (random_seed_flag == 0) ?
-      static_cast<unsigned int>(GetTimeInMillis()) :
-      static_cast<unsigned int>(random_seed_flag);
-
-  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
-  // it's easy to type.
-  const int normalized_seed =
-      static_cast<int>((raw_seed - 1U) %
-                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;
-  return normalized_seed;
-}
-
-// Returns the first valid random seed after 'seed'.  The behavior is
-// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
-// considered to be 1.
-inline int GetNextRandomSeed(int seed) {
-  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
-      << "Invalid random seed " << seed << " - must be in [1, "
-      << kMaxRandomSeed << "].";
-  const int next_seed = seed + 1;
-  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
-}
-
-// This class saves the values of all Google Test flags in its c'tor, and
-// restores them in its d'tor.
-class GTestFlagSaver {
- public:
-  // The c'tor.
-  GTestFlagSaver() {
-    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
-    break_on_failure_ = GTEST_FLAG(break_on_failure);
-    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
-    color_ = GTEST_FLAG(color);
-    death_test_style_ = GTEST_FLAG(death_test_style);
-    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
-    filter_ = GTEST_FLAG(filter);
-    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
-    list_tests_ = GTEST_FLAG(list_tests);
-    output_ = GTEST_FLAG(output);
-    print_time_ = GTEST_FLAG(print_time);
-    random_seed_ = GTEST_FLAG(random_seed);
-    repeat_ = GTEST_FLAG(repeat);
-    shuffle_ = GTEST_FLAG(shuffle);
-    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
-    stream_result_to_ = GTEST_FLAG(stream_result_to);
-    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
-  }
-
-  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
-  ~GTestFlagSaver() {
-    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
-    GTEST_FLAG(break_on_failure) = break_on_failure_;
-    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
-    GTEST_FLAG(color) = color_;
-    GTEST_FLAG(death_test_style) = death_test_style_;
-    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
-    GTEST_FLAG(filter) = filter_;
-    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
-    GTEST_FLAG(list_tests) = list_tests_;
-    GTEST_FLAG(output) = output_;
-    GTEST_FLAG(print_time) = print_time_;
-    GTEST_FLAG(random_seed) = random_seed_;
-    GTEST_FLAG(repeat) = repeat_;
-    GTEST_FLAG(shuffle) = shuffle_;
-    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
-    GTEST_FLAG(stream_result_to) = stream_result_to_;
-    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
-  }
- private:
-  // Fields for saving the original values of flags.
-  bool also_run_disabled_tests_;
-  bool break_on_failure_;
-  bool catch_exceptions_;
-  String color_;
-  String death_test_style_;
-  bool death_test_use_fork_;
-  String filter_;
-  String internal_run_death_test_;
-  bool list_tests_;
-  String output_;
-  bool print_time_;
-  internal::Int32 random_seed_;
-  internal::Int32 repeat_;
-  bool shuffle_;
-  internal::Int32 stack_trace_depth_;
-  String stream_result_to_;
-  bool throw_on_failure_;
-} GTEST_ATTRIBUTE_UNUSED_;
-
-// Converts a Unicode code point to a narrow string in UTF-8 encoding.
-// code_point parameter is of type UInt32 because wchar_t may not be
-// wide enough to contain a code point.
-// The output buffer str must containt at least 32 characters.
-// The function returns the address of the output buffer.
-// If the code_point is not a valid Unicode code point
-// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output
-// as '(Invalid Unicode 0xXXXXXXXX)'.
-GTEST_API_ char* CodePointToUtf8(UInt32 code_point, char* str);
-
-// Converts a wide string to a narrow string in UTF-8 encoding.
-// The wide string is assumed to have the following encoding:
-//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
-//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
-// Parameter str points to a null-terminated wide string.
-// Parameter num_chars may additionally limit the number
-// of wchar_t characters processed. -1 is used when the entire string
-// should be processed.
-// If the string contains code points that are not valid Unicode code points
-// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
-// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
-// and contains invalid UTF-16 surrogate pairs, values in those pairs
-// will be encoded as individual Unicode characters from Basic Normal Plane.
-GTEST_API_ String WideStringToUtf8(const wchar_t* str, int num_chars);
-
-// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
-// if the variable is present. If a file already exists at this location, this
-// function will write over it. If the variable is present, but the file cannot
-// be created, prints an error and exits.
-void WriteToShardStatusFileIfNeeded();
-
-// Checks whether sharding is enabled by examining the relevant
-// environment variable values. If the variables are present,
-// but inconsistent (e.g., shard_index >= total_shards), prints
-// an error and exits. If in_subprocess_for_death_test, sharding is
-// disabled because it must only be applied to the original test
-// process. Otherwise, we could filter out death tests we intended to execute.
-GTEST_API_ bool ShouldShard(const char* total_shards_str,
-                            const char* shard_index_str,
-                            bool in_subprocess_for_death_test);
-
-// Parses the environment variable var as an Int32. If it is unset,
-// returns default_val. If it is not an Int32, prints an error and
-// and aborts.
-GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
-
-// Given the total number of shards, the shard index, and the test id,
-// returns true iff the test should be run on this shard. The test id is
-// some arbitrary but unique non-negative integer assigned to each test
-// method. Assumes that 0 <= shard_index < total_shards.
-GTEST_API_ bool ShouldRunTestOnShard(
-    int total_shards, int shard_index, int test_id);
-
-// STL container utilities.
-
-// Returns the number of elements in the given container that satisfy
-// the given predicate.
-template <class Container, typename Predicate>
-inline int CountIf(const Container& c, Predicate predicate) {
-  // Implemented as an explicit loop since std::count_if() in libCstd on
-  // Solaris has a non-standard signature.
-  int count = 0;
-  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
-    if (predicate(*it))
-      ++count;
-  }
-  return count;
-}
-
-// Applies a function/functor to each element in the container.
-template <class Container, typename Functor>
-void ForEach(const Container& c, Functor functor) {
-  std::for_each(c.begin(), c.end(), functor);
-}
-
-// Returns the i-th element of the vector, or default_value if i is not
-// in range [0, v.size()).
-template <typename E>
-inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
-  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
-}
-
-// Performs an in-place shuffle of a range of the vector's elements.
-// 'begin' and 'end' are element indices as an STL-style range;
-// i.e. [begin, end) are shuffled, where 'end' == size() means to
-// shuffle to the end of the vector.
-template <typename E>
-void ShuffleRange(internal::Random* random, int begin, int end,
-                  std::vector<E>* v) {
-  const int size = static_cast<int>(v->size());
-  GTEST_CHECK_(0 <= begin && begin <= size)
-      << "Invalid shuffle range start " << begin << ": must be in range [0, "
-      << size << "].";
-  GTEST_CHECK_(begin <= end && end <= size)
-      << "Invalid shuffle range finish " << end << ": must be in range ["
-      << begin << ", " << size << "].";
-
-  // Fisher-Yates shuffle, from
-  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
-  for (int range_width = end - begin; range_width >= 2; range_width--) {
-    const int last_in_range = begin + range_width - 1;
-    const int selected = begin + random->Generate(range_width);
-    std::swap((*v)[selected], (*v)[last_in_range]);
-  }
-}
-
-// Performs an in-place shuffle of the vector's elements.
-template <typename E>
-inline void Shuffle(internal::Random* random, std::vector<E>* v) {
-  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
-}
-
-// A function for deleting an object.  Handy for being used as a
-// functor.
-template <typename T>
-static void Delete(T* x) {
-  delete x;
-}
-
-// A predicate that checks the key of a TestProperty against a known key.
-//
-// TestPropertyKeyIs is copyable.
-class TestPropertyKeyIs {
- public:
-  // Constructor.
-  //
-  // TestPropertyKeyIs has NO default constructor.
-  explicit TestPropertyKeyIs(const char* key)
-      : key_(key) {}
-
-  // Returns true iff the test name of test property matches on key_.
-  bool operator()(const TestProperty& test_property) const {
-    return String(test_property.key()).Compare(key_) == 0;
-  }
-
- private:
-  String key_;
-};
-
-// Class UnitTestOptions.
-//
-// This class contains functions for processing options the user
-// specifies when running the tests.  It has only static members.
-//
-// In most cases, the user can specify an option using either an
-// environment variable or a command line flag.  E.g. you can set the
-// test filter using either GTEST_FILTER or --gtest_filter.  If both
-// the variable and the flag are present, the latter overrides the
-// former.
-class GTEST_API_ UnitTestOptions {
- public:
-  // Functions for processing the gtest_output flag.
-
-  // Returns the output format, or "" for normal printed output.
-  static String GetOutputFormat();
-
-  // Returns the absolute path of the requested output file, or the
-  // default (test_detail.xml in the original working directory) if
-  // none was explicitly specified.
-  static String GetAbsolutePathToOutputFile();
-
-  // Functions for processing the gtest_filter flag.
-
-  // Returns true iff the wildcard pattern matches the string.  The
-  // first ':' or '\0' character in pattern marks the end of it.
-  //
-  // This recursive algorithm isn't very efficient, but is clear and
-  // works well enough for matching test names, which are short.
-  static bool PatternMatchesString(const char *pattern, const char *str);
-
-  // Returns true iff the user-specified filter matches the test case
-  // name and the test name.
-  static bool FilterMatchesTest(const String &test_case_name,
-                                const String &test_name);
-
-#if GTEST_OS_WINDOWS
-  // Function for supporting the gtest_catch_exception flag.
-
-  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
-  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
-  // This function is useful as an __except condition.
-  static int GTestShouldProcessSEH(DWORD exception_code);
-#endif  // GTEST_OS_WINDOWS
-
-  // Returns true if "name" matches the ':' separated list of glob-style
-  // filters in "filter".
-  static bool MatchesFilter(const String& name, const char* filter);
-};
-
-// Returns the current application's name, removing directory path if that
-// is present.  Used by UnitTestOptions::GetOutputFile.
-GTEST_API_ FilePath GetCurrentExecutableName();
-
-// The role interface for getting the OS stack trace as a string.
-class OsStackTraceGetterInterface {
- public:
-  OsStackTraceGetterInterface() {}
-  virtual ~OsStackTraceGetterInterface() {}
-
-  // Returns the current OS stack trace as a String.  Parameters:
-  //
-  //   max_depth  - the maximum number of stack frames to be included
-  //                in the trace.
-  //   skip_count - the number of top frames to be skipped; doesn't count
-  //                against max_depth.
-  virtual String CurrentStackTrace(int max_depth, int skip_count) = 0;
-
-  // UponLeavingGTest() should be called immediately before Google Test calls
-  // user code. It saves some information about the current stack that
-  // CurrentStackTrace() will use to find and hide Google Test stack frames.
-  virtual void UponLeavingGTest() = 0;
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
-};
-
-// A working implementation of the OsStackTraceGetterInterface interface.
-class OsStackTraceGetter : public OsStackTraceGetterInterface {
- public:
-  OsStackTraceGetter() : caller_frame_(NULL) {}
-  virtual String CurrentStackTrace(int max_depth, int skip_count);
-  virtual void UponLeavingGTest();
-
-  // This string is inserted in place of stack frames that are part of
-  // Google Test's implementation.
-  static const char* const kElidedFramesMarker;
-
- private:
-  Mutex mutex_;  // protects all internal state
-
-  // We save the stack frame below the frame that calls user code.
-  // We do this because the address of the frame immediately below
-  // the user code changes between the call to UponLeavingGTest()
-  // and any calls to CurrentStackTrace() from within the user code.
-  void* caller_frame_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
-};
-
-// Information about a Google Test trace point.
-struct TraceInfo {
-  const char* file;
-  int line;
-  String message;
-};
-
-// This is the default global test part result reporter used in UnitTestImpl.
-// This class should only be used by UnitTestImpl.
-class DefaultGlobalTestPartResultReporter
-  : public TestPartResultReporterInterface {
- public:
-  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
-  // Implements the TestPartResultReporterInterface. Reports the test part
-  // result in the current test.
-  virtual void ReportTestPartResult(const TestPartResult& result);
-
- private:
-  UnitTestImpl* const unit_test_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
-};
-
-// This is the default per thread test part result reporter used in
-// UnitTestImpl. This class should only be used by UnitTestImpl.
-class DefaultPerThreadTestPartResultReporter
-    : public TestPartResultReporterInterface {
- public:
-  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
-  // Implements the TestPartResultReporterInterface. The implementation just
-  // delegates to the current global test part result reporter of *unit_test_.
-  virtual void ReportTestPartResult(const TestPartResult& result);
-
- private:
-  UnitTestImpl* const unit_test_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
-};
-
-// The private implementation of the UnitTest class.  We don't protect
-// the methods under a mutex, as this class is not accessible by a
-// user and the UnitTest class that delegates work to this class does
-// proper locking.
-class GTEST_API_ UnitTestImpl {
- public:
-  explicit UnitTestImpl(UnitTest* parent);
-  virtual ~UnitTestImpl();
-
-  // There are two different ways to register your own TestPartResultReporter.
-  // You can register your own repoter to listen either only for test results
-  // from the current thread or for results from all threads.
-  // By default, each per-thread test result repoter just passes a new
-  // TestPartResult to the global test result reporter, which registers the
-  // test part result for the currently running test.
-
-  // Returns the global test part result reporter.
-  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
-
-  // Sets the global test part result reporter.
-  void SetGlobalTestPartResultReporter(
-      TestPartResultReporterInterface* reporter);
-
-  // Returns the test part result reporter for the current thread.
-  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
-
-  // Sets the test part result reporter for the current thread.
-  void SetTestPartResultReporterForCurrentThread(
-      TestPartResultReporterInterface* reporter);
-
-  // Gets the number of successful test cases.
-  int successful_test_case_count() const;
-
-  // Gets the number of failed test cases.
-  int failed_test_case_count() const;
-
-  // Gets the number of all test cases.
-  int total_test_case_count() const;
-
-  // Gets the number of all test cases that contain at least one test
-  // that should run.
-  int test_case_to_run_count() const;
-
-  // Gets the number of successful tests.
-  int successful_test_count() const;
-
-  // Gets the number of failed tests.
-  int failed_test_count() const;
-
-  // Gets the number of disabled tests.
-  int disabled_test_count() const;
-
-  // Gets the number of all tests.
-  int total_test_count() const;
-
-  // Gets the number of tests that should run.
-  int test_to_run_count() const;
-
-  // Gets the elapsed time, in milliseconds.
-  TimeInMillis elapsed_time() const { return elapsed_time_; }
-
-  // Returns true iff the unit test passed (i.e. all test cases passed).
-  bool Passed() const { return !Failed(); }
-
-  // Returns true iff the unit test failed (i.e. some test case failed
-  // or something outside of all tests failed).
-  bool Failed() const {
-    return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed();
-  }
-
-  // Gets the i-th test case among all the test cases. i can range from 0 to
-  // total_test_case_count() - 1. If i is not in that range, returns NULL.
-  const TestCase* GetTestCase(int i) const {
-    const int index = GetElementOr(test_case_indices_, i, -1);
-    return index < 0 ? NULL : test_cases_[i];
-  }
-
-  // Gets the i-th test case among all the test cases. i can range from 0 to
-  // total_test_case_count() - 1. If i is not in that range, returns NULL.
-  TestCase* GetMutableTestCase(int i) {
-    const int index = GetElementOr(test_case_indices_, i, -1);
-    return index < 0 ? NULL : test_cases_[index];
-  }
-
-  // Provides access to the event listener list.
-  TestEventListeners* listeners() { return &listeners_; }
-
-  // Returns the TestResult for the test that's currently running, or
-  // the TestResult for the ad hoc test if no test is running.
-  TestResult* current_test_result();
-
-  // Returns the TestResult for the ad hoc test.
-  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
-
-  // Sets the OS stack trace getter.
-  //
-  // Does nothing if the input and the current OS stack trace getter
-  // are the same; otherwise, deletes the old getter and makes the
-  // input the current getter.
-  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
-
-  // Returns the current OS stack trace getter if it is not NULL;
-  // otherwise, creates an OsStackTraceGetter, makes it the current
-  // getter, and returns it.
-  OsStackTraceGetterInterface* os_stack_trace_getter();
-
-  // Returns the current OS stack trace as a String.
-  //
-  // The maximum number of stack frames to be included is specified by
-  // the gtest_stack_trace_depth flag.  The skip_count parameter
-  // specifies the number of top frames to be skipped, which doesn't
-  // count against the number of frames to be included.
-  //
-  // For example, if Foo() calls Bar(), which in turn calls
-  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
-  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
-  String CurrentOsStackTraceExceptTop(int skip_count);
-
-  // Finds and returns a TestCase with the given name.  If one doesn't
-  // exist, creates one and returns it.
-  //
-  // Arguments:
-  //
-  //   test_case_name: name of the test case
-  //   type_param:     the name of the test's type parameter, or NULL if
-  //                   this is not a typed or a type-parameterized test.
-  //   set_up_tc:      pointer to the function that sets up the test case
-  //   tear_down_tc:   pointer to the function that tears down the test case
-  TestCase* GetTestCase(const char* test_case_name,
-                        const char* type_param,
-                        Test::SetUpTestCaseFunc set_up_tc,
-                        Test::TearDownTestCaseFunc tear_down_tc);
-
-  // Adds a TestInfo to the unit test.
-  //
-  // Arguments:
-  //
-  //   set_up_tc:    pointer to the function that sets up the test case
-  //   tear_down_tc: pointer to the function that tears down the test case
-  //   test_info:    the TestInfo object
-  void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
-                   Test::TearDownTestCaseFunc tear_down_tc,
-                   TestInfo* test_info) {
-    // In order to support thread-safe death tests, we need to
-    // remember the original working directory when the test program
-    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
-    // the user may have changed the current directory before calling
-    // RUN_ALL_TESTS().  Therefore we capture the current directory in
-    // AddTestInfo(), which is called to register a TEST or TEST_F
-    // before main() is reached.
-    if (original_working_dir_.IsEmpty()) {
-      original_working_dir_.Set(FilePath::GetCurrentDir());
-      GTEST_CHECK_(!original_working_dir_.IsEmpty())
-          << "Failed to get the current working directory.";
-    }
-
-    GetTestCase(test_info->test_case_name(),
-                test_info->type_param(),
-                set_up_tc,
-                tear_down_tc)->AddTestInfo(test_info);
-  }
-
-#if GTEST_HAS_PARAM_TEST
-  // Returns ParameterizedTestCaseRegistry object used to keep track of
-  // value-parameterized tests and instantiate and register them.
-  internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
-    return parameterized_test_registry_;
-  }
-#endif  // GTEST_HAS_PARAM_TEST
-
-  // Sets the TestCase object for the test that's currently running.
-  void set_current_test_case(TestCase* a_current_test_case) {
-    current_test_case_ = a_current_test_case;
-  }
-
-  // Sets the TestInfo object for the test that's currently running.  If
-  // current_test_info is NULL, the assertion results will be stored in
-  // ad_hoc_test_result_.
-  void set_current_test_info(TestInfo* a_current_test_info) {
-    current_test_info_ = a_current_test_info;
-  }
-
-  // Registers all parameterized tests defined using TEST_P and
-  // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
-  // combination. This method can be called more then once; it has guards
-  // protecting from registering the tests more then once.  If
-  // value-parameterized tests are disabled, RegisterParameterizedTests is
-  // present but does nothing.
-  void RegisterParameterizedTests();
-
-  // Runs all tests in this UnitTest object, prints the result, and
-  // returns true if all tests are successful.  If any exception is
-  // thrown during a test, this test is considered to be failed, but
-  // the rest of the tests will still be run.
-  bool RunAllTests();
-
-  // Clears the results of all tests, except the ad hoc tests.
-  void ClearNonAdHocTestResult() {
-    ForEach(test_cases_, TestCase::ClearTestCaseResult);
-  }
-
-  // Clears the results of ad-hoc test assertions.
-  void ClearAdHocTestResult() {
-    ad_hoc_test_result_.Clear();
-  }
-
-  enum ReactionToSharding {
-    HONOR_SHARDING_PROTOCOL,
-    IGNORE_SHARDING_PROTOCOL
-  };
-
-  // Matches the full name of each test against the user-specified
-  // filter to decide whether the test should run, then records the
-  // result in each TestCase and TestInfo object.
-  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
-  // based on sharding variables in the environment.
-  // Returns the number of tests that should run.
-  int FilterTests(ReactionToSharding shard_tests);
-
-  // Prints the names of the tests matching the user-specified filter flag.
-  void ListTestsMatchingFilter();
-
-  const TestCase* current_test_case() const { return current_test_case_; }
-  TestInfo* current_test_info() { return current_test_info_; }
-  const TestInfo* current_test_info() const { return current_test_info_; }
-
-  // Returns the vector of environments that need to be set-up/torn-down
-  // before/after the tests are run.
-  std::vector<Environment*>& environments() { return environments_; }
-
-  // Getters for the per-thread Google Test trace stack.
-  std::vector<TraceInfo>& gtest_trace_stack() {
-    return *(gtest_trace_stack_.pointer());
-  }
-  const std::vector<TraceInfo>& gtest_trace_stack() const {
-    return gtest_trace_stack_.get();
-  }
-
-#if GTEST_HAS_DEATH_TEST
-  void InitDeathTestSubprocessControlInfo() {
-    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
-  }
-  // Returns a pointer to the parsed --gtest_internal_run_death_test
-  // flag, or NULL if that flag was not specified.
-  // This information is useful only in a death test child process.
-  // Must not be called before a call to InitGoogleTest.
-  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
-    return internal_run_death_test_flag_.get();
-  }
-
-  // Returns a pointer to the current death test factory.
-  internal::DeathTestFactory* death_test_factory() {
-    return death_test_factory_.get();
-  }
-
-  void SuppressTestEventsIfInSubprocess();
-
-  friend class ReplaceDeathTestFactory;
-#endif  // GTEST_HAS_DEATH_TEST
-
-  // Initializes the event listener performing XML output as specified by
-  // UnitTestOptions. Must not be called before InitGoogleTest.
-  void ConfigureXmlOutput();
-
-#if GTEST_CAN_STREAM_RESULTS_
-  // Initializes the event listener for streaming test results to a socket.
-  // Must not be called before InitGoogleTest.
-  void ConfigureStreamingOutput();
-#endif
-
-  // Performs initialization dependent upon flag values obtained in
-  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
-  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
-  // this function is also called from RunAllTests.  Since this function can be
-  // called more than once, it has to be idempotent.
-  void PostFlagParsingInit();
-
-  // Gets the random seed used at the start of the current test iteration.
-  int random_seed() const { return random_seed_; }
-
-  // Gets the random number generator.
-  internal::Random* random() { return &random_; }
-
-  // Shuffles all test cases, and the tests within each test case,
-  // making sure that death tests are still run first.
-  void ShuffleTests();
-
-  // Restores the test cases and tests to their order before the first shuffle.
-  void UnshuffleTests();
-
-  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
-  // UnitTest::Run() starts.
-  bool catch_exceptions() const { return catch_exceptions_; }
-
- private:
-  friend class ::testing::UnitTest;
-
-  // Used by UnitTest::Run() to capture the state of
-  // GTEST_FLAG(catch_exceptions) at the moment it starts.
-  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
-
-  // The UnitTest object that owns this implementation object.
-  UnitTest* const parent_;
-
-  // The working directory when the first TEST() or TEST_F() was
-  // executed.
-  internal::FilePath original_working_dir_;
-
-  // The default test part result reporters.
-  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
-  DefaultPerThreadTestPartResultReporter
-      default_per_thread_test_part_result_reporter_;
-
-  // Points to (but doesn't own) the global test part result reporter.
-  TestPartResultReporterInterface* global_test_part_result_repoter_;
-
-  // Protects read and write access to global_test_part_result_reporter_.
-  internal::Mutex global_test_part_result_reporter_mutex_;
-
-  // Points to (but doesn't own) the per-thread test part result reporter.
-  internal::ThreadLocal<TestPartResultReporterInterface*>
-      per_thread_test_part_result_reporter_;
-
-  // The vector of environments that need to be set-up/torn-down
-  // before/after the tests are run.
-  std::vector<Environment*> environments_;
-
-  // The vector of TestCases in their original order.  It owns the
-  // elements in the vector.
-  std::vector<TestCase*> test_cases_;
-
-  // Provides a level of indirection for the test case list to allow
-  // easy shuffling and restoring the test case order.  The i-th
-  // element of this vector is the index of the i-th test case in the
-  // shuffled order.
-  std::vector<int> test_case_indices_;
-
-#if GTEST_HAS_PARAM_TEST
-  // ParameterizedTestRegistry object used to register value-parameterized
-  // tests.
-  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
-
-  // Indicates whether RegisterParameterizedTests() has been called already.
-  bool parameterized_tests_registered_;
-#endif  // GTEST_HAS_PARAM_TEST
-
-  // Index of the last death test case registered.  Initially -1.
-  int last_death_test_case_;
-
-  // This points to the TestCase for the currently running test.  It
-  // changes as Google Test goes through one test case after another.
-  // When no test is running, this is set to NULL and Google Test
-  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
-  TestCase* current_test_case_;
-
-  // This points to the TestInfo for the currently running test.  It
-  // changes as Google Test goes through one test after another.  When
-  // no test is running, this is set to NULL and Google Test stores
-  // assertion results in ad_hoc_test_result_.  Initially NULL.
-  TestInfo* current_test_info_;
-
-  // Normally, a user only writes assertions inside a TEST or TEST_F,
-  // or inside a function called by a TEST or TEST_F.  Since Google
-  // Test keeps track of which test is current running, it can
-  // associate such an assertion with the test it belongs to.
-  //
-  // If an assertion is encountered when no TEST or TEST_F is running,
-  // Google Test attributes the assertion result to an imaginary "ad hoc"
-  // test, and records the result in ad_hoc_test_result_.
-  TestResult ad_hoc_test_result_;
-
-  // The list of event listeners that can be used to track events inside
-  // Google Test.
-  TestEventListeners listeners_;
-
-  // The OS stack trace getter.  Will be deleted when the UnitTest
-  // object is destructed.  By default, an OsStackTraceGetter is used,
-  // but the user can set this field to use a custom getter if that is
-  // desired.
-  OsStackTraceGetterInterface* os_stack_trace_getter_;
-
-  // True iff PostFlagParsingInit() has been called.
-  bool post_flag_parse_init_performed_;
-
-  // The random number seed used at the beginning of the test run.
-  int random_seed_;
-
-  // Our random number generator.
-  internal::Random random_;
-
-  // How long the test took to run, in milliseconds.
-  TimeInMillis elapsed_time_;
-
-#if GTEST_HAS_DEATH_TEST
-  // The decomposed components of the gtest_internal_run_death_test flag,
-  // parsed when RUN_ALL_TESTS is called.
-  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
-  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
-#endif  // GTEST_HAS_DEATH_TEST
-
-  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
-  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
-
-  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
-  // starts.
-  bool catch_exceptions_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
-};  // class UnitTestImpl
-
-// Convenience function for accessing the global UnitTest
-// implementation object.
-inline UnitTestImpl* GetUnitTestImpl() {
-  return UnitTest::GetInstance()->impl();
-}
-
-#if GTEST_USES_SIMPLE_RE
-
-// Internal helper functions for implementing the simple regular
-// expression matcher.
-GTEST_API_ bool IsInSet(char ch, const char* str);
-GTEST_API_ bool IsAsciiDigit(char ch);
-GTEST_API_ bool IsAsciiPunct(char ch);
-GTEST_API_ bool IsRepeat(char ch);
-GTEST_API_ bool IsAsciiWhiteSpace(char ch);
-GTEST_API_ bool IsAsciiWordChar(char ch);
-GTEST_API_ bool IsValidEscape(char ch);
-GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
-GTEST_API_ bool ValidateRegex(const char* regex);
-GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
-GTEST_API_ bool MatchRepetitionAndRegexAtHead(
-    bool escaped, char ch, char repeat, const char* regex, const char* str);
-GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
-
-#endif  // GTEST_USES_SIMPLE_RE
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test.
-GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
-GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
-
-#if GTEST_HAS_DEATH_TEST
-
-// Returns the message describing the last system error, regardless of the
-// platform.
-GTEST_API_ String GetLastErrnoDescription();
-
-# if GTEST_OS_WINDOWS
-// Provides leak-safe Windows kernel handle ownership.
-class AutoHandle {
- public:
-  AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
-  explicit AutoHandle(HANDLE handle) : handle_(handle) {}
-
-  ~AutoHandle() { Reset(); }
-
-  HANDLE Get() const { return handle_; }
-  void Reset() { Reset(INVALID_HANDLE_VALUE); }
-  void Reset(HANDLE handle) {
-    if (handle != handle_) {
-      if (handle_ != INVALID_HANDLE_VALUE)
-        ::CloseHandle(handle_);
-      handle_ = handle;
-    }
-  }
-
- private:
-  HANDLE handle_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
-};
-# endif  // GTEST_OS_WINDOWS
-
-// Attempts to parse a string into a positive integer pointed to by the
-// number parameter.  Returns true if that is possible.
-// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
-// it here.
-template <typename Integer>
-bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
-  // Fail fast if the given string does not begin with a digit;
-  // this bypasses strtoXXX's "optional leading whitespace and plus
-  // or minus sign" semantics, which are undesirable here.
-  if (str.empty() || !IsDigit(str[0])) {
-    return false;
-  }
-  errno = 0;
-
-  char* end;
-  // BiggestConvertible is the largest integer type that system-provided
-  // string-to-number conversion routines can return.
-
-# if GTEST_OS_WINDOWS && !defined(__GNUC__)
-
-  // MSVC and C++ Builder define __int64 instead of the standard long long.
-  typedef unsigned __int64 BiggestConvertible;
-  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
-
-# else
-
-  typedef unsigned long long BiggestConvertible;  // NOLINT
-  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
-
-# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
-
-  const bool parse_success = *end == '\0' && errno == 0;
-
-  // TODO(vladl@google.com): Convert this to compile time assertion when it is
-  // available.
-  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
-
-  const Integer result = static_cast<Integer>(parsed);
-  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
-    *number = result;
-    return true;
-  }
-  return false;
-}
-#endif  // GTEST_HAS_DEATH_TEST
-
-// TestResult contains some private methods that should be hidden from
-// Google Test user but are required for testing. This class allow our tests
-// to access them.
-//
-// This class is supplied only for the purpose of testing Google Test's own
-// constructs. Do not use it in user tests, either directly or indirectly.
-class TestResultAccessor {
- public:
-  static void RecordProperty(TestResult* test_result,
-                             const TestProperty& property) {
-    test_result->RecordProperty(property);
-  }
-
-  static void ClearTestPartResults(TestResult* test_result) {
-    test_result->ClearTestPartResults();
-  }
-
-  static const std::vector<testing::TestPartResult>& test_part_results(
-      const TestResult& test_result) {
-    return test_result.test_part_results();
-  }
-};
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/utils/unittest/googletest/include/gtest/internal/gtest-internal.h b/utils/unittest/googletest/include/gtest/internal/gtest-internal.h
index a94bf28..63f72ac 100644
--- a/utils/unittest/googletest/include/gtest/internal/gtest-internal.h
+++ b/utils/unittest/googletest/include/gtest/internal/gtest-internal.h
@@ -105,6 +105,7 @@
 #if !GTEST_NO_LLVM_RAW_OSTREAM
 namespace llvm {
 class convertible_fwd_ostream : public std::ostream {
+  virtual void anchor();
   raw_os_ostream ros_;
 
 public:
@@ -536,7 +537,7 @@ GTEST_API_ TypeId GetTestTypeId();
 // of a Test object.
 class TestFactoryBase {
  public:
-  virtual ~TestFactoryBase() {}
+  virtual ~TestFactoryBase();
 
   // Creates a test instance to run. The instance is both created and destroyed
   // within TestInfoImpl::Run()
diff --git a/utils/unittest/googletest/include/gtest/internal/gtest-param-util.h b/utils/unittest/googletest/include/gtest/internal/gtest-param-util.h
index 0ef9718..3bb2ffb 100644
--- a/utils/unittest/googletest/include/gtest/internal/gtest-param-util.h
+++ b/utils/unittest/googletest/include/gtest/internal/gtest-param-util.h
@@ -414,7 +414,7 @@ class TestMetaFactory
 // and calls RegisterTests() on each of them when asked.
 class ParameterizedTestCaseInfoBase {
  public:
-  virtual ~ParameterizedTestCaseInfoBase() {}
+  virtual ~ParameterizedTestCaseInfoBase();
 
   // Base part of test case name for display purposes.
   virtual const string& GetTestCaseName() const = 0;
diff --git a/utils/unittest/googletest/include/gtest/internal/gtest-port.h b/utils/unittest/googletest/include/gtest/internal/gtest-port.h
index 58f6caf..32fd9c6 100644
--- a/utils/unittest/googletest/include/gtest/internal/gtest-port.h
+++ b/utils/unittest/googletest/include/gtest/internal/gtest-port.h
@@ -1116,7 +1116,7 @@ class Notification {
 // problem.
 class ThreadWithParamBase {
  public:
-  virtual ~ThreadWithParamBase() {}
+  virtual ~ThreadWithParamBase();
   virtual void Run() = 0;
 };
 
@@ -1290,7 +1290,7 @@ typedef GTestMutexLock MutexLock;
 // ThreadLocalValueHolderBase.
 class ThreadLocalValueHolderBase {
  public:
-  virtual ~ThreadLocalValueHolderBase() {}
+  virtual ~ThreadLocalValueHolderBase();
 };
 
 // Called by pthread to delete thread-local data stored by
diff --git a/utils/unittest/googletest/src/gtest-all.cc b/utils/unittest/googletest/src/gtest-all.cc
new file mode 100644
index 0000000..0a9cee5
--- /dev/null
+++ b/utils/unittest/googletest/src/gtest-all.cc
@@ -0,0 +1,48 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// Google C++ Testing Framework (Google Test)
+//
+// Sometimes it's desirable to build Google Test by compiling a single file.
+// This file serves this purpose.
+
+// This line ensures that gtest.h can be compiled on its own, even
+// when it's fused.
+#include "gtest/gtest.h"
+
+// The following lines pull in the real gtest *.cc files.
+#include "src/gtest.cc"
+#include "src/gtest-death-test.cc"
+#include "src/gtest-filepath.cc"
+#include "src/gtest-port.cc"
+#include "src/gtest-printers.cc"
+#include "src/gtest-test-part.cc"
+#include "src/gtest-typed-test.cc"
diff --git a/utils/unittest/googletest/src/gtest-death-test.cc b/utils/unittest/googletest/src/gtest-death-test.cc
new file mode 100644
index 0000000..314dba2
--- /dev/null
+++ b/utils/unittest/googletest/src/gtest-death-test.cc
@@ -0,0 +1,1239 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev)
+//
+// This file implements death tests.
+
+#include "gtest/gtest-death-test.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_HAS_DEATH_TEST
+
+# if GTEST_OS_MAC
+#  include <crt_externs.h>
+# endif  // GTEST_OS_MAC
+
+# include <errno.h>
+# include <fcntl.h>
+# include <limits.h>
+# include <stdarg.h>
+
+# if GTEST_OS_WINDOWS
+#  include <windows.h>
+# else
+#  include <sys/mman.h>
+#  include <sys/wait.h>
+# endif  // GTEST_OS_WINDOWS
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-string.h"
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+static const char kDefaultDeathTestStyle[] = "fast";
+
+GTEST_DEFINE_string_(
+    death_test_style,
+    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+    "Indicates how to run a death test in a forked child process: "
+    "\"threadsafe\" (child process re-executes the test binary "
+    "from the beginning, running only the specific death test) or "
+    "\"fast\" (child process runs the death test immediately "
+    "after forking).");
+
+GTEST_DEFINE_bool_(
+    death_test_use_fork,
+    internal::BoolFromGTestEnv("death_test_use_fork", false),
+    "Instructs to use fork()/_exit() instead of clone() in death tests. "
+    "Ignored and always uses fork() on POSIX systems where clone() is not "
+    "implemented. Useful when running under valgrind or similar tools if "
+    "those do not support clone(). Valgrind 3.3.1 will just fail if "
+    "it sees an unsupported combination of clone() flags. "
+    "It is not recommended to use this flag w/o valgrind though it will "
+    "work in 99% of the cases. Once valgrind is fixed, this flag will "
+    "most likely be removed.");
+
+namespace internal {
+GTEST_DEFINE_string_(
+    internal_run_death_test, "",
+    "Indicates the file, line number, temporal index of "
+    "the single death test to run, and a file descriptor to "
+    "which a success code may be sent, all separated by "
+    "colons.  This flag is specified if and only if the current "
+    "process is a sub-process launched for running a thread-safe "
+    "death test.  FOR INTERNAL USE ONLY.");
+}  // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+// ExitedWithCode constructor.
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
+}
+
+// ExitedWithCode function-call operator.
+bool ExitedWithCode::operator()(int exit_status) const {
+# if GTEST_OS_WINDOWS
+
+  return exit_status == exit_code_;
+
+# else
+
+  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
+
+# endif  // GTEST_OS_WINDOWS
+}
+
+# if !GTEST_OS_WINDOWS
+// KilledBySignal constructor.
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
+}
+
+// KilledBySignal function-call operator.
+bool KilledBySignal::operator()(int exit_status) const {
+  return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
+}
+# endif  // !GTEST_OS_WINDOWS
+
+namespace internal {
+
+// Utilities needed for death tests.
+
+// Generates a textual description of a given exit code, in the format
+// specified by wait(2).
+static String ExitSummary(int exit_code) {
+  Message m;
+
+# if GTEST_OS_WINDOWS
+
+  m << "Exited with exit status " << exit_code;
+
+# else
+
+  if (WIFEXITED(exit_code)) {
+    m << "Exited with exit status " << WEXITSTATUS(exit_code);
+  } else if (WIFSIGNALED(exit_code)) {
+    m << "Terminated by signal " << WTERMSIG(exit_code);
+  }
+#  ifdef WCOREDUMP
+  if (WCOREDUMP(exit_code)) {
+    m << " (core dumped)";
+  }
+#  endif
+# endif  // GTEST_OS_WINDOWS
+
+  return m.GetString();
+}
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+bool ExitedUnsuccessfully(int exit_status) {
+  return !ExitedWithCode(0)(exit_status);
+}
+
+# if !GTEST_OS_WINDOWS
+// Generates a textual failure message when a death test finds more than
+// one thread running, or cannot determine the number of threads, prior
+// to executing the given statement.  It is the responsibility of the
+// caller not to pass a thread_count of 1.
+static String DeathTestThreadWarning(size_t thread_count) {
+  Message msg;
+  msg << "Death tests use fork(), which is unsafe particularly"
+      << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
+  if (thread_count == 0)
+    msg << "couldn't detect the number of threads.";
+  else
+    msg << "detected " << thread_count << " threads.";
+  return msg.GetString();
+}
+# endif  // !GTEST_OS_WINDOWS
+
+// Flag characters for reporting a death test that did not die.
+static const char kDeathTestLived = 'L';
+static const char kDeathTestReturned = 'R';
+static const char kDeathTestThrew = 'T';
+static const char kDeathTestInternalError = 'I';
+
+// An enumeration describing all of the possible ways that a death test can
+// conclude.  DIED means that the process died while executing the test
+// code; LIVED means that process lived beyond the end of the test code;
+// RETURNED means that the test statement attempted to execute a return
+// statement, which is not allowed; THREW means that the test statement
+// returned control by throwing an exception.  IN_PROGRESS means the test
+// has not yet concluded.
+// TODO(vladl@google.com): Unify names and possibly values for
+// AbortReason, DeathTestOutcome, and flag characters above.
+enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
+
+// Routine for aborting the program which is safe to call from an
+// exec-style death test child process, in which case the error
+// message is propagated back to the parent process.  Otherwise, the
+// message is simply printed to stderr.  In either case, the program
+// then exits with status 1.
+void DeathTestAbort(const String& message) {
+  // On a POSIX system, this function may be called from a threadsafe-style
+  // death test child process, which operates on a very small stack.  Use
+  // the heap for any additional non-minuscule memory requirements.
+  const InternalRunDeathTestFlag* const flag =
+      GetUnitTestImpl()->internal_run_death_test_flag();
+  if (flag != NULL) {
+    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
+    fputc(kDeathTestInternalError, parent);
+    fprintf(parent, "%s", message.c_str());
+    fflush(parent);
+    _exit(1);
+  } else {
+    fprintf(stderr, "%s", message.c_str());
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// A replacement for CHECK that calls DeathTestAbort if the assertion
+// fails.
+# define GTEST_DEATH_TEST_CHECK_(expression) \
+  do { \
+    if (!::testing::internal::IsTrue(expression)) { \
+      DeathTestAbort(::testing::internal::String::Format( \
+          "CHECK failed: File %s, line %d: %s", \
+          __FILE__, __LINE__, #expression)); \
+    } \
+  } while (::testing::internal::AlwaysFalse())
+
+// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
+// evaluating any system call that fulfills two conditions: it must return
+// -1 on failure, and set errno to EINTR when it is interrupted and
+// should be tried again.  The macro expands to a loop that repeatedly
+// evaluates the expression as long as it evaluates to -1 and sets
+// errno to EINTR.  If the expression evaluates to -1 but errno is
+// something other than EINTR, DeathTestAbort is called.
+# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+  do { \
+    int gtest_retval; \
+    do { \
+      gtest_retval = (expression); \
+    } while (gtest_retval == -1 && errno == EINTR); \
+    if (gtest_retval == -1) { \
+      DeathTestAbort(::testing::internal::String::Format( \
+          "CHECK failed: File %s, line %d: %s != -1", \
+          __FILE__, __LINE__, #expression)); \
+    } \
+  } while (::testing::internal::AlwaysFalse())
+
+// Returns the message describing the last system error in errno.
+String GetLastErrnoDescription() {
+    return String(errno == 0 ? "" : posix::StrError(errno));
+}
+
+// This is called from a death test parent process to read a failure
+// message from the death test child process and log it with the FATAL
+// severity. On Windows, the message is read from a pipe handle. On other
+// platforms, it is read from a file descriptor.
+static void FailFromInternalError(int fd) {
+  Message error;
+  char buffer[256];
+  int num_read;
+
+  do {
+    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
+      buffer[num_read] = '\0';
+      error << buffer;
+    }
+  } while (num_read == -1 && errno == EINTR);
+
+  if (num_read == 0) {
+    GTEST_LOG_(FATAL) << error.GetString();
+  } else {
+    const int last_error = errno;
+    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
+                      << GetLastErrnoDescription() << " [" << last_error << "]";
+  }
+}
+
+// Death test constructor.  Increments the running death test count
+// for the current test.
+DeathTest::DeathTest() {
+  TestInfo* const info = GetUnitTestImpl()->current_test_info();
+  if (info == NULL) {
+    DeathTestAbort("Cannot run a death test outside of a TEST or "
+                   "TEST_F construct");
+  }
+}
+
+// Pin the vtable to this file.
+DeathTest::~DeathTest() {}
+
+// Creates and returns a death test by dispatching to the current
+// death test factory.
+bool DeathTest::Create(const char* statement, const RE* regex,
+                       const char* file, int line, DeathTest** test) {
+  return GetUnitTestImpl()->death_test_factory()->Create(
+      statement, regex, file, line, test);
+}
+
+const char* DeathTest::LastMessage() {
+  return last_death_test_message_.c_str();
+}
+
+void DeathTest::set_last_death_test_message(const String& message) {
+  last_death_test_message_ = message;
+}
+
+String DeathTest::last_death_test_message_;
+
+// Provides cross platform implementation for some death functionality.
+class DeathTestImpl : public DeathTest {
+ protected:
+  DeathTestImpl(const char* a_statement, const RE* a_regex)
+      : statement_(a_statement),
+        regex_(a_regex),
+        spawned_(false),
+        status_(-1),
+        outcome_(IN_PROGRESS),
+        read_fd_(-1),
+        write_fd_(-1) {}
+
+  // read_fd_ is expected to be closed and cleared by a derived class.
+  ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
+
+  void Abort(AbortReason reason);
+  virtual bool Passed(bool status_ok);
+
+  const char* statement() const { return statement_; }
+  const RE* regex() const { return regex_; }
+  bool spawned() const { return spawned_; }
+  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
+  int status() const { return status_; }
+  void set_status(int a_status) { status_ = a_status; }
+  DeathTestOutcome outcome() const { return outcome_; }
+  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
+  int read_fd() const { return read_fd_; }
+  void set_read_fd(int fd) { read_fd_ = fd; }
+  int write_fd() const { return write_fd_; }
+  void set_write_fd(int fd) { write_fd_ = fd; }
+
+  // Called in the parent process only. Reads the result code of the death
+  // test child process via a pipe, interprets it to set the outcome_
+  // member, and closes read_fd_.  Outputs diagnostics and terminates in
+  // case of unexpected codes.
+  void ReadAndInterpretStatusByte();
+
+ private:
+  // The textual content of the code this object is testing.  This class
+  // doesn't own this string and should not attempt to delete it.
+  const char* const statement_;
+  // The regular expression which test output must match.  DeathTestImpl
+  // doesn't own this object and should not attempt to delete it.
+  const RE* const regex_;
+  // True if the death test child process has been successfully spawned.
+  bool spawned_;
+  // The exit status of the child process.
+  int status_;
+  // How the death test concluded.
+  DeathTestOutcome outcome_;
+  // Descriptor to the read end of the pipe to the child process.  It is
+  // always -1 in the child process.  The child keeps its write end of the
+  // pipe in write_fd_.
+  int read_fd_;
+  // Descriptor to the child's write end of the pipe to the parent process.
+  // It is always -1 in the parent process.  The parent keeps its end of the
+  // pipe in read_fd_.
+  int write_fd_;
+};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_.  Outputs diagnostics and terminates in
+// case of unexpected codes.
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+  char flag;
+  int bytes_read;
+
+  // The read() here blocks until data is available (signifying the
+  // failure of the death test) or until the pipe is closed (signifying
+  // its success), so it's okay to call this in the parent before
+  // the child process has exited.
+  do {
+    bytes_read = posix::Read(read_fd(), &flag, 1);
+  } while (bytes_read == -1 && errno == EINTR);
+
+  if (bytes_read == 0) {
+    set_outcome(DIED);
+  } else if (bytes_read == 1) {
+    switch (flag) {
+      case kDeathTestReturned:
+        set_outcome(RETURNED);
+        break;
+      case kDeathTestThrew:
+        set_outcome(THREW);
+        break;
+      case kDeathTestLived:
+        set_outcome(LIVED);
+        break;
+      case kDeathTestInternalError:
+        FailFromInternalError(read_fd());  // Does not return.
+        break;
+      default:
+        GTEST_LOG_(FATAL) << "Death test child process reported "
+                          << "unexpected status byte ("
+                          << static_cast<unsigned int>(flag) << ")";
+    }
+  } else {
+    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+                      << GetLastErrnoDescription();
+  }
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+  set_read_fd(-1);
+}
+
+// Signals that the death test code which should have exited, didn't.
+// Should be called only in a death test child process.
+// Writes a status byte to the child's status file descriptor, then
+// calls _exit(1).
+void DeathTestImpl::Abort(AbortReason reason) {
+  // The parent process considers the death test to be a failure if
+  // it finds any data in our pipe.  So, here we write a single flag byte
+  // to the pipe, then exit.
+  const char status_ch =
+      reason == TEST_DID_NOT_DIE ? kDeathTestLived :
+      reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
+
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+  // We are leaking the descriptor here because on some platforms (i.e.,
+  // when built as Windows DLL), destructors of global objects will still
+  // run after calling _exit(). On such systems, write_fd_ will be
+  // indirectly closed from the destructor of UnitTestImpl, causing double
+  // close if it is also closed here. On debug configurations, double close
+  // may assert. As there are no in-process buffers to flush here, we are
+  // relying on the OS to close the descriptor after the process terminates
+  // when the destructors are not run.
+  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
+}
+
+// Returns an indented copy of stderr output for a death test.
+// This makes distinguishing death test output lines from regular log lines
+// much easier.
+static ::std::string FormatDeathTestOutput(const ::std::string& output) {
+  ::std::string ret;
+  for (size_t at = 0; ; ) {
+    const size_t line_end = output.find('\n', at);
+    ret += "[  DEATH   ] ";
+    if (line_end == ::std::string::npos) {
+      ret += output.substr(at);
+      break;
+    }
+    ret += output.substr(at, line_end + 1 - at);
+    at = line_end + 1;
+  }
+  return ret;
+}
+
+// Assesses the success or failure of a death test, using both private
+// members which have previously been set, and one argument:
+//
+// Private data members:
+//   outcome:  An enumeration describing how the death test
+//             concluded: DIED, LIVED, THREW, or RETURNED.  The death test
+//             fails in the latter three cases.
+//   status:   The exit status of the child process. On *nix, it is in the
+//             in the format specified by wait(2). On Windows, this is the
+//             value supplied to the ExitProcess() API or a numeric code
+//             of the exception that terminated the program.
+//   regex:    A regular expression object to be applied to
+//             the test's captured standard error output; the death test
+//             fails if it does not match.
+//
+// Argument:
+//   status_ok: true if exit_status is acceptable in the context of
+//              this particular death test, which fails if it is false
+//
+// Returns true iff all of the above conditions are met.  Otherwise, the
+// first failing condition, in the order given above, is the one that is
+// reported. Also sets the last death test message string.
+bool DeathTestImpl::Passed(bool status_ok) {
+  if (!spawned())
+    return false;
+
+  const String error_message = GetCapturedStderr();
+
+  bool success = false;
+  Message buffer;
+
+  buffer << "Death test: " << statement() << "\n";
+  switch (outcome()) {
+    case LIVED:
+      buffer << "    Result: failed to die.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case THREW:
+      buffer << "    Result: threw an exception.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case RETURNED:
+      buffer << "    Result: illegal return in test statement.\n"
+             << " Error msg:\n" << FormatDeathTestOutput(error_message);
+      break;
+    case DIED:
+      if (status_ok) {
+        const bool matched = RE::PartialMatch(error_message.c_str(), *regex());
+        if (matched) {
+          success = true;
+        } else {
+          buffer << "    Result: died but not with expected error.\n"
+                 << "  Expected: " << regex()->pattern() << "\n"
+                 << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+        }
+      } else {
+        buffer << "    Result: died but not with expected exit code:\n"
+               << "            " << ExitSummary(status()) << "\n"
+               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+      }
+      break;
+    case IN_PROGRESS:
+      GTEST_LOG_(FATAL)
+          << "DeathTest::Passed somehow called before conclusion of test";
+  }
+
+  DeathTest::set_last_death_test_message(buffer.GetString());
+  return success;
+}
+
+# if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes:  Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+//    ends of it.
+// 2. The parent starts the child and provides it with the information
+//    necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+//    using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+//    this is done before step 3, the object's reference count goes down to
+//    0 and it is destroyed, preventing the child from acquiring it. The
+//    parent now has to release it, or read operations on the read end of
+//    the pipe will not return when the child terminates.
+// 5. The parent reads child's output through the pipe (outcome code and
+//    any possible error messages) from the pipe, and its stderr and then
+//    determines whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+  WindowsDeathTest(const char* a_statement,
+                   const RE* a_regex,
+                   const char* file,
+                   int line)
+      : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+  virtual TestRole AssumeRole();
+
+ private:
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // Handle to the write end of the pipe to the child process.
+  AutoHandle write_handle_;
+  // Child process handle.
+  AutoHandle child_handle_;
+  // Event the child process uses to signal the parent that it has
+  // acquired the handle to the write end of the pipe. After seeing this
+  // event the parent can release its own handles to make sure its
+  // ReadFile() calls return when the child terminates.
+  AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int WindowsDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  // Wait until the child either signals that it has acquired the write end
+  // of the pipe or it dies.
+  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
+  switch (::WaitForMultipleObjects(2,
+                                   wait_handles,
+                                   FALSE,  // Waits for any of the handles.
+                                   INFINITE)) {
+    case WAIT_OBJECT_0:
+    case WAIT_OBJECT_0 + 1:
+      break;
+    default:
+      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+  }
+
+  // The child has acquired the write end of the pipe or exited.
+  // We release the handle on our side and continue.
+  write_handle_.Reset();
+  event_handle_.Reset();
+
+  ReadAndInterpretStatusByte();
+
+  // Waits for the child process to exit if it haven't already. This
+  // returns immediately if the child has already exited, regardless of
+  // whether previous calls to WaitForMultipleObjects synchronized on this
+  // handle or not.
+  GTEST_DEATH_TEST_CHECK_(
+      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
+                                             INFINITE));
+  DWORD status_code;
+  GTEST_DEATH_TEST_CHECK_(
+      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+  child_handle_.Reset();
+  set_status(static_cast<int>(status_code));
+  return status();
+}
+
+// The AssumeRole process for a Windows death test.  It creates a child
+// process with the same executable as the current process to run the
+// death test.  The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  // WindowsDeathTest uses an anonymous pipe to communicate results of
+  // a death test.
+  SECURITY_ATTRIBUTES handles_are_inheritable = {
+    sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
+  HANDLE read_handle, write_handle;
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
+                   0)  // Default buffer size.
+      != FALSE);
+  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
+                                O_RDONLY));
+  write_handle_.Reset(write_handle);
+  event_handle_.Reset(::CreateEvent(
+      &handles_are_inheritable,
+      TRUE,    // The event will automatically reset to non-signaled state.
+      FALSE,   // The initial state is non-signalled.
+      NULL));  // The even is unnamed.
+  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL);
+  const String filter_flag = String::Format("--%s%s=%s.%s",
+                                            GTEST_FLAG_PREFIX_, kFilterFlag,
+                                            info->test_case_name(),
+                                            info->name());
+  const String internal_flag = String::Format(
+    "--%s%s=%s|%d|%d|%u|%Iu|%Iu",
+      GTEST_FLAG_PREFIX_,
+      kInternalRunDeathTestFlag,
+      file_, line_,
+      death_test_index,
+      static_cast<unsigned int>(::GetCurrentProcessId()),
+      // size_t has the same with as pointers on both 32-bit and 64-bit
+      // Windows platforms.
+      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+      reinterpret_cast<size_t>(write_handle),
+      reinterpret_cast<size_t>(event_handle_.Get()));
+
+  char executable_path[_MAX_PATH + 1];  // NOLINT
+  GTEST_DEATH_TEST_CHECK_(
+      _MAX_PATH + 1 != ::GetModuleFileNameA(NULL,
+                                            executable_path,
+                                            _MAX_PATH));
+
+  String command_line = String::Format("%s %s \"%s\"",
+                                       ::GetCommandLineA(),
+                                       filter_flag.c_str(),
+                                       internal_flag.c_str());
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // The child process will share the standard handles with the parent.
+  STARTUPINFOA startup_info;
+  memset(&startup_info, 0, sizeof(STARTUPINFO));
+  startup_info.dwFlags = STARTF_USESTDHANDLES;
+  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+  PROCESS_INFORMATION process_info;
+  GTEST_DEATH_TEST_CHECK_(::CreateProcessA(
+      executable_path,
+      const_cast<char*>(command_line.c_str()),
+      NULL,   // Retuned process handle is not inheritable.
+      NULL,   // Retuned thread handle is not inheritable.
+      TRUE,   // Child inherits all inheritable handles (for write_handle_).
+      0x0,    // Default creation flags.
+      NULL,   // Inherit the parent's environment.
+      UnitTest::GetInstance()->original_working_dir(),
+      &startup_info,
+      &process_info) != FALSE);
+  child_handle_.Reset(process_info.hProcess);
+  ::CloseHandle(process_info.hThread);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+# else  // We are not on Windows.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface.  Only the AssumeRole method is
+// left undefined.
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+  ForkingDeathTest(const char* statement, const RE* regex);
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+
+ protected:
+  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
+
+ private:
+  // PID of child process during death test; 0 in the child process itself.
+  pid_t child_pid_;
+};
+
+// Constructs a ForkingDeathTest.
+ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex)
+    : DeathTestImpl(a_statement, a_regex),
+      child_pid_(-1) {}
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists.  As a side effect, sets the
+// outcome data member.
+int ForkingDeathTest::Wait() {
+  if (!spawned())
+    return 0;
+
+  ReadAndInterpretStatusByte();
+
+  int status_value;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+  set_status(status_value);
+  return status_value;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process.
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+  NoExecDeathTest(const char* a_statement, const RE* a_regex) :
+      ForkingDeathTest(a_statement, a_regex) { }
+  virtual TestRole AssumeRole();
+};
+
+// The AssumeRole process for a fork-and-run death test.  It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+  const size_t thread_count = GetThreadCount();
+  if (thread_count != 1) {
+    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+  DeathTest::set_last_death_test_message("");
+  CaptureStderr();
+  // When we fork the process below, the log file buffers are copied, but the
+  // file descriptors are shared.  We flush all log files here so that closing
+  // the file descriptors in the child process doesn't throw off the
+  // synchronization between descriptors and buffers in the parent process.
+  // This is as close to the fork as possible to avoid a race condition in case
+  // there are multiple threads running before the death test, and another
+  // thread writes to the log file.
+  FlushInfoLog();
+
+  const pid_t child_pid = fork();
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  set_child_pid(child_pid);
+  if (child_pid == 0) {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
+    set_write_fd(pipe_fd[1]);
+    // Redirects all logging to stderr in the child process to prevent
+    // concurrent writes to the log files.  We capture stderr in the parent
+    // process and append the child process' output to a log.
+    LogToStderr();
+    // Event forwarding to the listeners of event listener API mush be shut
+    // down in death test subprocesses.
+    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+    return EXECUTE_TEST;
+  } else {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+    set_read_fd(pipe_fd[0]);
+    set_spawned(true);
+    return OVERSEE_TEST;
+  }
+}
+
+// A concrete death test class that forks and re-executes the main
+// program from the beginning, with command-line flags set that cause
+// only this specific death test to be run.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+  ExecDeathTest(const char* a_statement, const RE* a_regex,
+                const char* file, int line) :
+      ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { }
+  virtual TestRole AssumeRole();
+ private:
+  // The name of the file in which the death test is located.
+  const char* const file_;
+  // The line number on which the death test is located.
+  const int line_;
+};
+
+// Utility class for accumulating command-line arguments.
+class Arguments {
+ public:
+  Arguments() {
+    args_.push_back(NULL);
+  }
+
+  ~Arguments() {
+    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char* argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str>& arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end();
+         ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char* const* Argv() {
+    return &args_[0];
+  }
+ private:
+  std::vector<char*> args_;
+};
+
+// A struct that encompasses the arguments to the child process of a
+// threadsafe-style death test process.
+struct ExecDeathTestArgs {
+  char* const* argv;  // Command-line arguments for the child's call to exec
+  int close_fd;       // File descriptor to close; the read end of a pipe
+};
+
+#  if GTEST_OS_MAC
+inline char** GetEnviron() {
+  // When Google Test is built as a framework on MacOS X, the environ variable
+  // is unavailable. Apple's documentation (man environ) recommends using
+  // _NSGetEnviron() instead.
+  return *_NSGetEnviron();
+}
+#  else
+// Some POSIX platforms expect you to declare environ. extern "C" makes
+// it reside in the global namespace.
+extern "C" char** environ;
+inline char** GetEnviron() { return environ; }
+#  endif  // GTEST_OS_MAC
+
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions.
+static int ExecDeathTestChildMain(void* child_arg) {
+  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+  // We need to execute the test program in the same environment where
+  // it was originally invoked.  Therefore we change to the original
+  // working directory first.
+  const char* const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(String::Format("chdir(\"%s\") failed: %s",
+                                  original_dir,
+                                  GetLastErrnoDescription().c_str()));
+    return EXIT_FAILURE;
+  }
+
+  // We can safely call execve() as it's a direct system call.  We
+  // cannot use execvp() as it's a libc function and thus potentially
+  // unsafe.  Since execve() doesn't search the PATH, the user must
+  // invoke the test program via a valid path that contains at least
+  // one path separator.
+  execve(args->argv[0], args->argv, GetEnviron());
+  DeathTestAbort(String::Format("execve(%s, ...) in %s failed: %s",
+                                args->argv[0],
+                                original_dir,
+                                GetLastErrnoDescription().c_str()));
+  return EXIT_FAILURE;
+}
+
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give
+// correct answer.
+bool StackLowerThanAddress(const void* ptr) GTEST_NO_INLINE_;
+bool StackLowerThanAddress(const void* ptr) {
+  int dummy;
+  return &dummy < ptr;
+}
+
+bool StackGrowsDown() {
+  int dummy;
+  return StackLowerThanAddress(&dummy);
+}
+
+// A threadsafe implementation of fork(2) for threadsafe-style death tests
+// that uses clone(2).  It dies with an error message if anything goes
+// wrong.
+static pid_t ExecDeathTestFork(char* const* argv, int close_fd) {
+  ExecDeathTestArgs args = { argv, close_fd };
+  pid_t child_pid = -1;
+
+#  if GTEST_HAS_CLONE
+  const bool use_fork = GTEST_FLAG(death_test_use_fork);
+
+  if (!use_fork) {
+    static const bool stack_grows_down = StackGrowsDown();
+    const size_t stack_size = getpagesize();
+    // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+    void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+                             MAP_ANON | MAP_PRIVATE, -1, 0);
+    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+    void* const stack_top =
+        static_cast<char*>(stack) + (stack_grows_down ? stack_size : 0);
+
+    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+
+    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
+  }
+#  else
+  const bool use_fork = true;
+#  endif  // GTEST_HAS_CLONE
+
+  if (use_fork && (child_pid = fork()) == 0) {
+      ExecDeathTestChildMain(&args);
+      _exit(0);
+  }
+
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test.  It re-executes the
+// main program from the beginning, setting the --gtest_filter
+// and --gtest_internal_run_death_test flags to cause only the current
+// death test to be re-run.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+  const UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo* const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != NULL) {
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+  // Clear the close-on-exec flag on the write end of the pipe, lest
+  // it be closed when the child process does an exec:
+  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+  const String filter_flag =
+      String::Format("--%s%s=%s.%s",
+                     GTEST_FLAG_PREFIX_, kFilterFlag,
+                     info->test_case_name(), info->name());
+  const String internal_flag =
+      String::Format("--%s%s=%s|%d|%d|%d",
+                     GTEST_FLAG_PREFIX_, kInternalRunDeathTestFlag,
+                     file_, line_, death_test_index, pipe_fd[1]);
+  Arguments args;
+  args.AddArguments(GetArgvs());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // See the comment in NoExecDeathTest::AssumeRole for why the next line
+  // is necessary.
+  FlushInfoLog();
+
+  const pid_t child_pid = ExecDeathTestFork(args.Argv(), pipe_fd[0]);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+  set_child_pid(child_pid);
+  set_read_fd(pipe_fd[0]);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+# endif  // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address.  If the test should be
+// skipped, sets that pointer to NULL.  Returns true, unless the
+// flag is set to an invalid value.
+bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
+                                     const char* file, int line,
+                                     DeathTest** test) {
+  UnitTestImpl* const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag* const flag =
+      impl->internal_run_death_test_flag();
+  const int death_test_index = impl->current_test_info()
+      ->increment_death_test_count();
+
+  if (flag != NULL) {
+    if (death_test_index > flag->index()) {
+      DeathTest::set_last_death_test_message(String::Format(
+          "Death test count (%d) somehow exceeded expected maximum (%d)",
+          death_test_index, flag->index()));
+      return false;
+    }
+
+    if (!(flag->file() == file && flag->line() == line &&
+          flag->index() == death_test_index)) {
+      *test = NULL;
+      return true;
+    }
+  }
+
+# if GTEST_OS_WINDOWS
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new WindowsDeathTest(statement, regex, file, line);
+  }
+
+# else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe") {
+    *test = new ExecDeathTest(statement, regex, file, line);
+  } else if (GTEST_FLAG(death_test_style) == "fast") {
+    *test = new NoExecDeathTest(statement, regex);
+  }
+
+# endif  // GTEST_OS_WINDOWS
+
+  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
+    DeathTest::set_last_death_test_message(String::Format(
+        "Unknown death test style \"%s\" encountered",
+        GTEST_FLAG(death_test_style).c_str()));
+    return false;
+  }
+
+  return true;
+}
+
+// Pin the vtable to this file.
+DeathTestFactory::~DeathTestFactory() {}
+
+// Splits a given string on a given delimiter, populating a given
+// vector with the fields.  GTEST_HAS_DEATH_TEST implies that we have
+// ::std::string, so we can use it here.
+static void SplitString(const ::std::string& str, char delimiter,
+                        ::std::vector< ::std::string>* dest) {
+  ::std::vector< ::std::string> parsed;
+  ::std::string::size_type pos = 0;
+  while (::testing::internal::AlwaysTrue()) {
+    const ::std::string::size_type colon = str.find(delimiter, pos);
+    if (colon == ::std::string::npos) {
+      parsed.push_back(str.substr(pos));
+      break;
+    } else {
+      parsed.push_back(str.substr(pos, colon - pos));
+      pos = colon + 1;
+    }
+  }
+  dest->swap(parsed);
+}
+
+# if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+int GetStatusFileDescriptor(unsigned int parent_process_id,
+                            size_t write_handle_as_size_t,
+                            size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                   FALSE,  // Non-inheritable.
+                                                   parent_process_id));
+  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort(String::Format("Unable to open parent process %u",
+                                  parent_process_id));
+  }
+
+  // TODO(vladl@google.com): Replace the following check with a
+  // compile-time assertion when available.
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle =
+      reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The newly initialized handle is accessible only in in the parent
+  // process. To obtain one accessible within the child, we need to use
+  // DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handler.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort(String::Format(
+        "Unable to duplicate the pipe handle %Iu from the parent process %u",
+        write_handle_as_size_t, parent_process_id));
+  }
+
+  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
+  HANDLE dup_event_handle;
+
+  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+                         ::GetCurrentProcess(), &dup_event_handle,
+                         0x0,
+                         FALSE,
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort(String::Format(
+        "Unable to duplicate the event handle %Iu from the parent process %u",
+        event_handle_as_size_t, parent_process_id));
+  }
+
+  const int write_fd =
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+  if (write_fd == -1) {
+    DeathTestAbort(String::Format(
+        "Unable to convert pipe handle %Iu to a file descriptor",
+        write_handle_as_size_t));
+  }
+
+  // Signals the parent that the write end of the pipe has been acquired
+  // so the parent can release its own write end.
+  ::SetEvent(dup_event_handle);
+
+  return write_fd;
+}
+# endif  // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
+  if (GTEST_FLAG(internal_run_death_test) == "") return NULL;
+
+  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
+  // can use it here.
+  int line = -1;
+  int index = -1;
+  ::std::vector< ::std::string> fields;
+  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+  int write_fd = -1;
+
+# if GTEST_OS_WINDOWS
+
+  unsigned int parent_process_id = 0;
+  size_t write_handle_as_size_t = 0;
+  size_t event_handle_as_size_t = 0;
+
+  if (fields.size() != 6
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &parent_process_id)
+      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
+      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+    DeathTestAbort(String::Format(
+        "Bad --gtest_internal_run_death_test flag: %s",
+        GTEST_FLAG(internal_run_death_test).c_str()));
+  }
+  write_fd = GetStatusFileDescriptor(parent_process_id,
+                                     write_handle_as_size_t,
+                                     event_handle_as_size_t);
+# else
+
+  if (fields.size() != 4
+      || !ParseNaturalNumber(fields[1], &line)
+      || !ParseNaturalNumber(fields[2], &index)
+      || !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort(String::Format(
+        "Bad --gtest_internal_run_death_test flag: %s",
+        GTEST_FLAG(internal_run_death_test).c_str()));
+  }
+
+# endif  // GTEST_OS_WINDOWS
+
+  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
+
+}  // namespace internal
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace testing
diff --git a/utils/unittest/googletest/gtest-filepath.cc b/utils/unittest/googletest/src/gtest-filepath.cc
index ad1bab8..ad1bab8 100644
--- a/utils/unittest/googletest/gtest-filepath.cc
+++ b/utils/unittest/googletest/src/gtest-filepath.cc
diff --git a/utils/unittest/googletest/src/gtest-internal-inl.h b/utils/unittest/googletest/src/gtest-internal-inl.h
new file mode 100644
index 0000000..1bae630
--- /dev/null
+++ b/utils/unittest/googletest/src/gtest-internal-inl.h
@@ -0,0 +1,1037 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// This file contains purely Google Test's internal implementation.  Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
+#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is
+// part of Google Test's implementation; otherwise it's undefined.
+#if !GTEST_IMPLEMENTATION_
+// A user is trying to include this from his code - just say no.
+# error "gtest-internal-inl.h is part of Google Test's internal implementation."
+# error "It must not be included except by Google Test itself."
+#endif  // GTEST_IMPLEMENTATION_
+
+#ifndef _WIN32_WCE
+# include <errno.h>
+#endif  // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
+#include <string.h>  // For memmove.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+#include "gtest/gtest.h"  // NOLINT
+#include "gtest/gtest-spi.h"
+
+namespace testing {
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it. Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
+const char kBreakOnFailureFlag[] = "break_on_failure";
+const char kCatchExceptionsFlag[] = "catch_exceptions";
+const char kColorFlag[] = "color";
+const char kFilterFlag[] = "filter";
+const char kListTestsFlag[] = "list_tests";
+const char kOutputFlag[] = "output";
+const char kPrintTimeFlag[] = "print_time";
+const char kRandomSeedFlag[] = "random_seed";
+const char kRepeatFlag[] = "repeat";
+const char kShuffleFlag[] = "shuffle";
+const char kStackTraceDepthFlag[] = "stack_trace_depth";
+const char kStreamResultToFlag[] = "stream_result_to";
+const char kThrowOnFailureFlag[] = "throw_on_failure";
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true iff Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+GTEST_API_ bool ParseInt32Flag(
+    const char* str, const char* flag, Int32* value);
+
+// Returns a random seed in range [1, kMaxRandomSeed] based on the
+// given --gtest_random_seed flag value.
+inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
+  const unsigned int raw_seed = (random_seed_flag == 0) ?
+      static_cast<unsigned int>(GetTimeInMillis()) :
+      static_cast<unsigned int>(random_seed_flag);
+
+  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
+  // it's easy to type.
+  const int normalized_seed =
+      static_cast<int>((raw_seed - 1U) %
+                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;
+  return normalized_seed;
+}
+
+// Returns the first valid random seed after 'seed'.  The behavior is
+// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
+// considered to be 1.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  const int next_seed = seed + 1;
+  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
+}
+
+// This class saves the values of all Google Test flags in its c'tor, and
+// restores them in its d'tor.
+class GTestFlagSaver {
+ public:
+  // The c'tor.
+  GTestFlagSaver() {
+    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
+    break_on_failure_ = GTEST_FLAG(break_on_failure);
+    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
+    color_ = GTEST_FLAG(color);
+    death_test_style_ = GTEST_FLAG(death_test_style);
+    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+    filter_ = GTEST_FLAG(filter);
+    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
+    list_tests_ = GTEST_FLAG(list_tests);
+    output_ = GTEST_FLAG(output);
+    print_time_ = GTEST_FLAG(print_time);
+    random_seed_ = GTEST_FLAG(random_seed);
+    repeat_ = GTEST_FLAG(repeat);
+    shuffle_ = GTEST_FLAG(shuffle);
+    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
+    stream_result_to_ = GTEST_FLAG(stream_result_to);
+    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+  }
+
+  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
+  ~GTestFlagSaver() {
+    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
+    GTEST_FLAG(break_on_failure) = break_on_failure_;
+    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
+    GTEST_FLAG(color) = color_;
+    GTEST_FLAG(death_test_style) = death_test_style_;
+    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
+    GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
+    GTEST_FLAG(list_tests) = list_tests_;
+    GTEST_FLAG(output) = output_;
+    GTEST_FLAG(print_time) = print_time_;
+    GTEST_FLAG(random_seed) = random_seed_;
+    GTEST_FLAG(repeat) = repeat_;
+    GTEST_FLAG(shuffle) = shuffle_;
+    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
+    GTEST_FLAG(stream_result_to) = stream_result_to_;
+    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+  }
+ private:
+  // Fields for saving the original values of flags.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  String color_;
+  String death_test_style_;
+  bool death_test_use_fork_;
+  String filter_;
+  String internal_run_death_test_;
+  bool list_tests_;
+  String output_;
+  bool print_time_;
+  internal::Int32 random_seed_;
+  internal::Int32 repeat_;
+  bool shuffle_;
+  internal::Int32 stack_trace_depth_;
+  String stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// The output buffer str must containt at least 32 characters.
+// The function returns the address of the output buffer.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'.
+GTEST_API_ char* CodePointToUtf8(UInt32 code_point, char* str);
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+GTEST_API_ String WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error and
+// and aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;
+  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
+    if (predicate(*it))
+      ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
+}
+
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle, from
+  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+  for (int range_width = end - begin; range_width >= 2; range_width--) {
+    const int last_in_range = begin + range_width - 1;
+    const int selected = begin + random->Generate(range_width);
+    std::swap((*v)[selected], (*v)[last_in_range]);
+  }
+}
+
+// Performs an in-place shuffle of the vector's elements.
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
+}
+
+// A function for deleting an object.  Handy for being used as a
+// functor.
+template <typename T>
+static void Delete(T* x) {
+  delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+  // Constructor.
+  //
+  // TestPropertyKeyIs has NO default constructor.
+  explicit TestPropertyKeyIs(const char* key)
+      : key_(key) {}
+
+  // Returns true iff the test name of test property matches on key_.
+  bool operator()(const TestProperty& test_property) const {
+    return String(test_property.key()).Compare(key_) == 0;
+  }
+
+ private:
+  String key_;
+};
+
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests.  It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag.  E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter.  If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+  // Functions for processing the gtest_output flag.
+
+  // Returns the output format, or "" for normal printed output.
+  static String GetOutputFormat();
+
+  // Returns the absolute path of the requested output file, or the
+  // default (test_detail.xml in the original working directory) if
+  // none was explicitly specified.
+  static String GetAbsolutePathToOutputFile();
+
+  // Functions for processing the gtest_filter flag.
+
+  // Returns true iff the wildcard pattern matches the string.  The
+  // first ':' or '\0' character in pattern marks the end of it.
+  //
+  // This recursive algorithm isn't very efficient, but is clear and
+  // works well enough for matching test names, which are short.
+  static bool PatternMatchesString(const char *pattern, const char *str);
+
+  // Returns true iff the user-specified filter matches the test case
+  // name and the test name.
+  static bool FilterMatchesTest(const String &test_case_name,
+                                const String &test_name);
+
+#if GTEST_OS_WINDOWS
+  // Function for supporting the gtest_catch_exception flag.
+
+  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+  // This function is useful as an __except condition.
+  static int GTestShouldProcessSEH(DWORD exception_code);
+#endif  // GTEST_OS_WINDOWS
+
+  // Returns true if "name" matches the ':' separated list of glob-style
+  // filters in "filter".
+  static bool MatchesFilter(const String& name, const char* filter);
+};
+
+// Returns the current application's name, removing directory path if that
+// is present.  Used by UnitTestOptions::GetOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetterInterface() {}
+  virtual ~OsStackTraceGetterInterface();
+
+  // Returns the current OS stack trace as a String.  Parameters:
+  //
+  //   max_depth  - the maximum number of stack frames to be included
+  //                in the trace.
+  //   skip_count - the number of top frames to be skipped; doesn't count
+  //                against max_depth.
+  virtual String CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+  // UponLeavingGTest() should be called immediately before Google Test calls
+  // user code. It saves some information about the current stack that
+  // CurrentStackTrace() will use to find and hide Google Test stack frames.
+  virtual void UponLeavingGTest() = 0;
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetter() : caller_frame_(NULL) {}
+  virtual String CurrentStackTrace(int max_depth, int skip_count);
+  virtual void UponLeavingGTest();
+
+  // This string is inserted in place of stack frames that are part of
+  // Google Test's implementation.
+  static const char* const kElidedFramesMarker;
+
+ private:
+  Mutex mutex_;  // protects all internal state
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to CurrentStackTrace() from within the user code.
+  void* caller_frame_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+};
+
+// Information about a Google Test trace point.
+struct TraceInfo {
+  const char* file;
+  int line;
+  String message;
+};
+
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+  : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. Reports the test part
+  // result in the current test.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl. This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+  // Implements the TestPartResultReporterInterface. The implementation just
+  // delegates to the current global test part result reporter of *unit_test_.
+  virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+  UnitTestImpl* const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+};
+
+// The private implementation of the UnitTest class.  We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+  explicit UnitTestImpl(UnitTest* parent);
+  virtual ~UnitTestImpl();
+
+  // There are two different ways to register your own TestPartResultReporter.
+  // You can register your own repoter to listen either only for test results
+  // from the current thread or for results from all threads.
+  // By default, each per-thread test result repoter just passes a new
+  // TestPartResult to the global test result reporter, which registers the
+  // test part result for the currently running test.
+
+  // Returns the global test part result reporter.
+  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+
+  // Sets the global test part result reporter.
+  void SetGlobalTestPartResultReporter(
+      TestPartResultReporterInterface* reporter);
+
+  // Returns the test part result reporter for the current thread.
+  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+
+  // Sets the test part result reporter for the current thread.
+  void SetTestPartResultReporterForCurrentThread(
+      TestPartResultReporterInterface* reporter);
+
+  // Gets the number of successful test cases.
+  int successful_test_case_count() const;
+
+  // Gets the number of failed test cases.
+  int failed_test_case_count() const;
+
+  // Gets the number of all test cases.
+  int total_test_case_count() const;
+
+  // Gets the number of all test cases that contain at least one test
+  // that should run.
+  int test_case_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns true iff the unit test passed (i.e. all test cases passed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true iff the unit test failed (i.e. some test case failed
+  // or something outside of all tests failed).
+  bool Failed() const {
+    return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed();
+  }
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  const TestCase* GetTestCase(int i) const {
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    return index < 0 ? NULL : test_cases_[i];
+  }
+
+  // Gets the i-th test case among all the test cases. i can range from 0 to
+  // total_test_case_count() - 1. If i is not in that range, returns NULL.
+  TestCase* GetMutableTestCase(int i) {
+    const int index = GetElementOr(test_case_indices_, i, -1);
+    return index < 0 ? NULL : test_cases_[index];
+  }
+
+  // Provides access to the event listener list.
+  TestEventListeners* listeners() { return &listeners_; }
+
+  // Returns the TestResult for the test that's currently running, or
+  // the TestResult for the ad hoc test if no test is running.
+  TestResult* current_test_result();
+
+  // Returns the TestResult for the ad hoc test.
+  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+  // Sets the OS stack trace getter.
+  //
+  // Does nothing if the input and the current OS stack trace getter
+  // are the same; otherwise, deletes the old getter and makes the
+  // input the current getter.
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+
+  // Returns the current OS stack trace getter if it is not NULL;
+  // otherwise, creates an OsStackTraceGetter, makes it the current
+  // getter, and returns it.
+  OsStackTraceGetterInterface* os_stack_trace_getter();
+
+  // Returns the current OS stack trace as a String.
+  //
+  // The maximum number of stack frames to be included is specified by
+  // the gtest_stack_trace_depth flag.  The skip_count parameter
+  // specifies the number of top frames to be skipped, which doesn't
+  // count against the number of frames to be included.
+  //
+  // For example, if Foo() calls Bar(), which in turn calls
+  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+  String CurrentOsStackTraceExceptTop(int skip_count);
+
+  // Finds and returns a TestCase with the given name.  If one doesn't
+  // exist, creates one and returns it.
+  //
+  // Arguments:
+  //
+  //   test_case_name: name of the test case
+  //   type_param:     the name of the test's type parameter, or NULL if
+  //                   this is not a typed or a type-parameterized test.
+  //   set_up_tc:      pointer to the function that sets up the test case
+  //   tear_down_tc:   pointer to the function that tears down the test case
+  TestCase* GetTestCase(const char* test_case_name,
+                        const char* type_param,
+                        Test::SetUpTestCaseFunc set_up_tc,
+                        Test::TearDownTestCaseFunc tear_down_tc);
+
+  // Adds a TestInfo to the unit test.
+  //
+  // Arguments:
+  //
+  //   set_up_tc:    pointer to the function that sets up the test case
+  //   tear_down_tc: pointer to the function that tears down the test case
+  //   test_info:    the TestInfo object
+  void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc,
+                   TestInfo* test_info) {
+    // In order to support thread-safe death tests, we need to
+    // remember the original working directory when the test program
+    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
+    // the user may have changed the current directory before calling
+    // RUN_ALL_TESTS().  Therefore we capture the current directory in
+    // AddTestInfo(), which is called to register a TEST or TEST_F
+    // before main() is reached.
+    if (original_working_dir_.IsEmpty()) {
+      original_working_dir_.Set(FilePath::GetCurrentDir());
+      GTEST_CHECK_(!original_working_dir_.IsEmpty())
+          << "Failed to get the current working directory.";
+    }
+
+    GetTestCase(test_info->test_case_name(),
+                test_info->type_param(),
+                set_up_tc,
+                tear_down_tc)->AddTestInfo(test_info);
+  }
+
+#if GTEST_HAS_PARAM_TEST
+  // Returns ParameterizedTestCaseRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
+    return parameterized_test_registry_;
+  }
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Sets the TestCase object for the test that's currently running.
+  void set_current_test_case(TestCase* a_current_test_case) {
+    current_test_case_ = a_current_test_case;
+  }
+
+  // Sets the TestInfo object for the test that's currently running.  If
+  // current_test_info is NULL, the assertion results will be stored in
+  // ad_hoc_test_result_.
+  void set_current_test_info(TestInfo* a_current_test_info) {
+    current_test_info_ = a_current_test_info;
+  }
+
+  // Registers all parameterized tests defined using TEST_P and
+  // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
+  // combination. This method can be called more then once; it has guards
+  // protecting from registering the tests more then once.  If
+  // value-parameterized tests are disabled, RegisterParameterizedTests is
+  // present but does nothing.
+  void RegisterParameterizedTests();
+
+  // Runs all tests in this UnitTest object, prints the result, and
+  // returns true if all tests are successful.  If any exception is
+  // thrown during a test, this test is considered to be failed, but
+  // the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_cases_, TestCase::ClearTestCaseResult);
+  }
+
+  // Clears the results of ad-hoc test assertions.
+  void ClearAdHocTestResult() {
+    ad_hoc_test_result_.Clear();
+  }
+
+  enum ReactionToSharding {
+    HONOR_SHARDING_PROTOCOL,
+    IGNORE_SHARDING_PROTOCOL
+  };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestCase and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestCase* current_test_case() const { return current_test_case_; }
+  TestInfo* current_test_info() { return current_test_info_; }
+  const TestInfo* current_test_info() const { return current_test_info_; }
+
+  // Returns the vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*>& environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack.
+  std::vector<TraceInfo>& gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());
+  }
+  const std::vector<TraceInfo>& gtest_trace_stack() const {
+    return gtest_trace_stack_.get();
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+  // Must not be called before a call to InitGoogleTest.
+  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+    return internal_run_death_test_flag_.get();
+  }
+
+  // Returns a pointer to the current death test factory.
+  internal::DeathTestFactory* death_test_factory() {
+    return death_test_factory_.get();
+  }
+
+  void SuppressTestEventsIfInSubprocess();
+
+  friend class ReplaceDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Initializes the event listener performing XML output as specified by
+  // UnitTestOptions. Must not be called before InitGoogleTest.
+  void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Initializes the event listener for streaming test results to a socket.
+  // Must not be called before InitGoogleTest.
+  void ConfigureStreamingOutput();
+#endif
+
+  // Performs initialization dependent upon flag values obtained in
+  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+  // this function is also called from RunAllTests.  Since this function can be
+  // called more than once, it has to be idempotent.
+  void PostFlagParsingInit();
+
+  // Gets the random seed used at the start of the current test iteration.
+  int random_seed() const { return random_seed_; }
+
+  // Gets the random number generator.
+  internal::Random* random() { return &random_; }
+
+  // Shuffles all test cases, and the tests within each test case,
+  // making sure that death tests are still run first.
+  void ShuffleTests();
+
+  // Restores the test cases and tests to their order before the first shuffle.
+  void UnshuffleTests();
+
+  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+  // UnitTest::Run() starts.
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest* const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_reporter_.
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface*>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*> environments_;
+
+  // The vector of TestCases in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestCase*> test_cases_;
+
+  // Provides a level of indirection for the test case list to allow
+  // easy shuffling and restoring the test case order.  The i-th
+  // element of this vector is the index of the i-th test case in the
+  // shuffled order.
+  std::vector<int> test_case_indices_;
+
+#if GTEST_HAS_PARAM_TEST
+  // ParameterizedTestRegistry object used to register value-parameterized
+  // tests.
+  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+  bool parameterized_tests_registered_;
+#endif  // GTEST_HAS_PARAM_TEST
+
+  // Index of the last death test case registered.  Initially -1.
+  int last_death_test_case_;
+
+  // This points to the TestCase for the currently running test.  It
+  // changes as Google Test goes through one test case after another.
+  // When no test is running, this is set to NULL and Google Test
+  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestCase* current_test_case_;
+
+  // This points to the TestInfo for the currently running test.  It
+  // changes as Google Test goes through one test after another.  When
+  // no test is running, this is set to NULL and Google Test stores
+  // assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestInfo* current_test_info_;
+
+  // Normally, a user only writes assertions inside a TEST or TEST_F,
+  // or inside a function called by a TEST or TEST_F.  Since Google
+  // Test keeps track of which test is current running, it can
+  // associate such an assertion with the test it belongs to.
+  //
+  // If an assertion is encountered when no TEST or TEST_F is running,
+  // Google Test attributes the assertion result to an imaginary "ad hoc"
+  // test, and records the result in ad_hoc_test_result_.
+  TestResult ad_hoc_test_result_;
+
+  // The list of event listeners that can be used to track events inside
+  // Google Test.
+  TestEventListeners listeners_;
+
+  // The OS stack trace getter.  Will be deleted when the UnitTest
+  // object is destructed.  By default, an OsStackTraceGetter is used,
+  // but the user can set this field to use a custom getter if that is
+  // desired.
+  OsStackTraceGetterInterface* os_stack_trace_getter_;
+
+  // True iff PostFlagParsingInit() has been called.
+  bool post_flag_parse_init_performed_;
+
+  // The random number seed used at the beginning of the test run.
+  int random_seed_;
+
+  // Our random number generator.
+  internal::Random random_;
+
+  // How long the test took to run, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+  // starts.
+  bool catch_exceptions_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+};  // class UnitTestImpl
+
+// Convenience function for accessing the global UnitTest
+// implementation object.
+inline UnitTestImpl* GetUnitTestImpl() {
+  return UnitTest::GetInstance()->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ String GetLastErrnoDescription();
+
+# if GTEST_OS_WINDOWS
+// Provides leak-safe Windows kernel handle ownership.
+class AutoHandle {
+ public:
+  AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+  explicit AutoHandle(HANDLE handle) : handle_(handle) {}
+
+  ~AutoHandle() { Reset(); }
+
+  HANDLE Get() const { return handle_; }
+  void Reset() { Reset(INVALID_HANDLE_VALUE); }
+  void Reset(HANDLE handle) {
+    if (handle != handle_) {
+      if (handle_ != INVALID_HANDLE_VALUE)
+        ::CloseHandle(handle_);
+      handle_ = handle;
+    }
+  }
+
+ private:
+  HANDLE handle_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+};
+# endif  // GTEST_OS_WINDOWS
+
+// Attempts to parse a string into a positive integer pointed to by the
+// number parameter.  Returns true if that is possible.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Fail fast if the given string does not begin with a digit;
+  // this bypasses strtoXXX's "optional leading whitespace and plus
+  // or minus sign" semantics, which are undesirable here.
+  if (str.empty() || !IsDigit(str[0])) {
+    return false;
+  }
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // MSVC and C++ Builder define __int64 instead of the standard long long.
+  typedef unsigned __int64 BiggestConvertible;
+  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
+
+# else
+
+  typedef unsigned long long BiggestConvertible;  // NOLINT
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
+
+# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  // TODO(vladl@google.com): Convert this to compile time assertion when it is
+  // available.
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test user but are required for testing. This class allow our tests
+// to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+  static void RecordProperty(TestResult* test_result,
+                             const TestProperty& property) {
+    test_result->RecordProperty(property);
+  }
+
+  static void ClearTestPartResults(TestResult* test_result) {
+    test_result->ClearTestPartResults();
+  }
+
+  static const std::vector<testing::TestPartResult>& test_part_results(
+      const TestResult& test_result) {
+    return test_result.test_part_results();
+  }
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/utils/unittest/googletest/src/gtest-port.cc b/utils/unittest/googletest/src/gtest-port.cc
new file mode 100644
index 0000000..94fc57f
--- /dev/null
+++ b/utils/unittest/googletest/src/gtest-port.cc
@@ -0,0 +1,764 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+#include "gtest/internal/gtest-port.h"
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#if GTEST_OS_WINDOWS_MOBILE
+# include <windows.h>  // For TerminateProcess()
+#elif GTEST_OS_WINDOWS
+# include <io.h>
+# include <sys/stat.h>
+#else
+# include <unistd.h>
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#if GTEST_OS_MAC
+# include <mach/mach_init.h>
+# include <mach/task.h>
+# include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#include "gtest/gtest-spi.h"
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // _MSC_VER
+
+#if GTEST_OS_MAC
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;
+  thread_act_array_t thread_list;
+  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+  if (status == KERN_SUCCESS) {
+    // task_threads allocates resources in thread_list and we need to free them
+    // to avoid leaks.
+    vm_deallocate(task,
+                  reinterpret_cast<vm_address_t>(thread_list),
+                  sizeof(thread_t) * thread_count);
+    return static_cast<size_t>(thread_count);
+  } else {
+    return 0;
+  }
+}
+
+#else
+
+size_t GetThreadCount() {
+  // There's no portable way to detect the number of threads, so we just
+  // return 0 to indicate that we cannot detect it.
+  return 0;
+}
+
+#endif  // GTEST_OS_MAC
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE.  Currently only needed for death tests.
+
+RE::~RE() {
+  if (is_valid_) {
+    // regfree'ing an invalid regex might crash because the content
+    // of the regex is undefined. Since the regex's are essentially
+    // the same, one cannot be valid (or invalid) without the other
+    // being so too.
+    regfree(&partial_regex_);
+    regfree(&full_regex_);
+  }
+  free(const_cast<char*>(pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char* const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // We want to call regcomp(&partial_regex_, ...) even if the
+  // previous expression returns false.  Otherwise partial_regex_ may
+  // not be properly initialized can may cause trouble when it's
+  // freed.
+  //
+  // Some implementation of POSIX regex (e.g. on at least some
+  // versions of Cygwin) doesn't accept the empty string as a valid
+  // regex.  We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
+    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+  }
+  EXPECT_TRUE(is_valid_)
+      << "Regular expression \"" << regex
+      << "\" is not a valid POSIX Extended regular expression.";
+
+  delete[] full_pattern;
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Returns true iff ch appears anywhere in str (excluding the
+// terminating '\0' character).
+bool IsInSet(char ch, const char* str) {
+  return ch != '\0' && strchr(str, ch) != NULL;
+}
+
+// Returns true iff ch belongs to the given classification.  Unlike
+// similar functions in <ctype.h>, these aren't affected by the
+// current locale.
+bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
+}
+bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
+bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
+bool IsAsciiWordChar(char ch) {
+  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
+      ('0' <= ch && ch <= '9') || ch == '_';
+}
+
+// Returns true iff "\\c" is a supported escape sequence.
+bool IsValidEscape(char c) {
+  return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
+}
+
+// Returns true iff the given atom (specified by escaped and pattern)
+// matches ch.  The result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+  if (escaped) {  // "\\p" where p is pattern_char.
+    switch (pattern_char) {
+      case 'd': return IsAsciiDigit(ch);
+      case 'D': return !IsAsciiDigit(ch);
+      case 'f': return ch == '\f';
+      case 'n': return ch == '\n';
+      case 'r': return ch == '\r';
+      case 's': return IsAsciiWhiteSpace(ch);
+      case 'S': return !IsAsciiWhiteSpace(ch);
+      case 't': return ch == '\t';
+      case 'v': return ch == '\v';
+      case 'w': return IsAsciiWordChar(ch);
+      case 'W': return !IsAsciiWordChar(ch);
+    }
+    return IsAsciiPunct(pattern_char) && pattern_char == ch;
+  }
+
+  return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
+}
+
+// Helper function used by ValidateRegex() to format error messages.
+String FormatRegexSyntaxError(const char* regex, int index) {
+  return (Message() << "Syntax error at index " << index
+          << " in simple regular expression \"" << regex << "\": ").GetString();
+}
+
+// Generates non-fatal failures and returns false if regex is invalid;
+// otherwise returns true.
+bool ValidateRegex(const char* regex) {
+  if (regex == NULL) {
+    // TODO(wan@google.com): fix the source file location in the
+    // assertion failures to match where the regex is used in user
+    // code.
+    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+    return false;
+  }
+
+  bool is_valid = true;
+
+  // True iff ?, *, or + can follow the previous atom.
+  bool prev_repeatable = false;
+  for (int i = 0; regex[i]; i++) {
+    if (regex[i] == '\\') {  // An escape sequence
+      i++;
+      if (regex[i] == '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "'\\' cannot appear at the end.";
+        return false;
+      }
+
+      if (!IsValidEscape(regex[i])) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "invalid escape sequence \"\\" << regex[i] << "\".";
+        is_valid = false;
+      }
+      prev_repeatable = true;
+    } else {  // Not an escape sequence.
+      const char ch = regex[i];
+
+      if (ch == '^' && i > 0) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'^' can only appear at the beginning.";
+        is_valid = false;
+      } else if (ch == '$' && regex[i + 1] != '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'$' can only appear at the end.";
+        is_valid = false;
+      } else if (IsInSet(ch, "()[]{}|")) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' is unsupported.";
+        is_valid = false;
+      } else if (IsRepeat(ch) && !prev_repeatable) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'" << ch << "' can only follow a repeatable token.";
+        is_valid = false;
+      }
+
+      prev_repeatable = !IsInSet(ch, "^$?*+");
+    }
+  }
+
+  return is_valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression.  The regex atom is defined as c if escaped is false,
+// or \c otherwise.  repeat is the repetition meta character (?, *,
+// or +).  The behavior is undefined if str contains too many
+// characters to be indexable by size_t, in which case the test will
+// probably time out anyway.  We are fine with this limitation as
+// std::string has it too.
+bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char c, char repeat, const char* regex,
+    const char* str) {
+  const size_t min_count = (repeat == '+') ? 1 : 0;
+  const size_t max_count = (repeat == '?') ? 1 :
+      static_cast<size_t>(-1) - 1;
+  // We cannot call numeric_limits::max() as it conflicts with the
+  // max() macro on Windows.
+
+  for (size_t i = 0; i <= max_count; ++i) {
+    // We know that the atom matches each of the first i characters in str.
+    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
+      // We have enough matches at the head, and the tail matches too.
+      // Since we only care about *whether* the pattern matches str
+      // (as opposed to *how* it matches), there is no need to find a
+      // greedy match.
+      return true;
+    }
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
+      return false;
+  }
+  return false;
+}
+
+// Returns true iff regex matches a prefix of str.  regex must be a
+// valid simple regular expression and not start with "^", or the
+// result is undefined.
+bool MatchRegexAtHead(const char* regex, const char* str) {
+  if (*regex == '\0')  // An empty regex matches a prefix of anything.
+    return true;
+
+  // "$" only matches the end of a string.  Note that regex being
+  // valid guarantees that there's nothing after "$" in it.
+  if (*regex == '$')
+    return *str == '\0';
+
+  // Is the first thing in regex an escape sequence?
+  const bool escaped = *regex == '\\';
+  if (escaped)
+    ++regex;
+  if (IsRepeat(regex[1])) {
+    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
+    // here's an indirect recursion.  It terminates as the regex gets
+    // shorter in each recursion.
+    return MatchRepetitionAndRegexAtHead(
+        escaped, regex[0], regex[1], regex + 2, str);
+  } else {
+    // regex isn't empty, isn't "$", and doesn't start with a
+    // repetition.  We match the first atom of regex with the first
+    // character of str and recurse.
+    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
+        MatchRegexAtHead(regex + 1, str + 1);
+  }
+}
+
+// Returns true iff regex matches any substring of str.  regex must be
+// a valid simple regular expression, or the result is undefined.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally.  In rare cases the time complexity can be
+// exponential with respect to the regex length + the string length,
+// but usually it's must faster (often close to linear).
+bool MatchRegexAnywhere(const char* regex, const char* str) {
+  if (regex == NULL || str == NULL)
+    return false;
+
+  if (*regex == '^')
+    return MatchRegexAtHead(regex + 1, str);
+
+  // A successful match can be anywhere in str.
+  do {
+    if (MatchRegexAtHead(regex, str))
+      return true;
+  } while (*str++ != '\0');
+  return false;
+}
+
+// Implements the RE class.
+
+RE::~RE() {
+  free(const_cast<char*>(pattern_));
+  free(const_cast<char*>(full_pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+  pattern_ = full_pattern_ = NULL;
+  if (regex != NULL) {
+    pattern_ = posix::StrDup(regex);
+  }
+
+  is_valid_ = ValidateRegex(regex);
+  if (!is_valid_) {
+    // No need to calculate the full pattern when the regex is invalid.
+    return;
+  }
+
+  const size_t len = strlen(regex);
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match: we need space to prepend a '^', append a '$', and
+  // terminate the string with '\0'.
+  char* buffer = static_cast<char*>(malloc(len + 3));
+  full_pattern_ = buffer;
+
+  if (*regex != '^')
+    *buffer++ = '^';  // Makes sure full_pattern_ starts with '^'.
+
+  // We don't use snprintf or strncpy, as they trigger a warning when
+  // compiled with VC++ 8.0.
+  memcpy(buffer, regex, len);
+  buffer += len;
+
+  if (len == 0 || regex[len - 1] != '$')
+    *buffer++ = '$';  // Makes sure full_pattern_ ends with '$'.
+
+  *buffer = '\0';
+}
+
+#endif  // GTEST_USES_POSIX_RE
+
+const char kUnknownFile[] = "unknown file";
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
+  const char* const file_name = file == NULL ? kUnknownFile : file;
+
+  if (line < 0) {
+    return String::Format("%s:", file_name).c_str();
+  }
+#ifdef _MSC_VER
+  return String::Format("%s(%d):", file_name, line).c_str();
+#else
+  return String::Format("%s:%d:", file_name, line).c_str();
+#endif  // _MSC_VER
+}
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+// Note that FormatCompilerIndependentFileLocation() does NOT append colon
+// to the file location it produces, unlike FormatFileLocation().
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
+    const char* file, int line) {
+  const char* const file_name = file == NULL ? kUnknownFile : file;
+
+  if (line < 0)
+    return file_name;
+  else
+    return String::Format("%s:%d", file_name, line).c_str();
+}
+
+
+GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
+    : severity_(severity) {
+  const char* const marker =
+      severity == GTEST_INFO ?    "[  INFO ]" :
+      severity == GTEST_WARNING ? "[WARNING]" :
+      severity == GTEST_ERROR ?   "[ ERROR ]" : "[ FATAL ]";
+  GetStream() << ::std::endl << marker << " "
+              << FormatFileLocation(file, line).c_str() << ": ";
+}
+
+// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+GTestLog::~GTestLog() {
+  GetStream() << ::std::endl;
+  if (severity_ == GTEST_FATAL) {
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+#ifdef _MSC_VER
+# pragma warning(push)
+# pragma warning(disable: 4996)
+#endif  // _MSC_VER
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr).
+class CapturedStream {
+ public:
+  // The ctor redirects the stream to a temporary file.
+  CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+
+# if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+
+    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+    const UINT success = ::GetTempFileNameA(temp_dir_path,
+                                            "gtest_redir",
+                                            0,  // Generate unique file name.
+                                            temp_file_path);
+    GTEST_CHECK_(success != 0)
+        << "Unable to create a temporary file in " << temp_dir_path;
+    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
+                                    << temp_file_path;
+    filename_ = temp_file_path;
+#elif GTEST_OS_LINUX_ANDROID
+    char name_template[] = "/sdcard/captured_stderr.XXXXXX";
+    const int captured_fd = mkstemp(name_template);
+    filename_ = name_template;
+# else
+    // There's no guarantee that a test has write access to the
+    // current directory, so we create the temporary file in the /tmp
+    // directory instead.
+    char name_template[] = "/tmp/captured_stream.XXXXXX";
+    const int captured_fd = mkstemp(name_template);
+    filename_ = name_template;
+# endif  // GTEST_OS_WINDOWS
+    fflush(NULL);
+    dup2(captured_fd, fd_);
+    close(captured_fd);
+  }
+
+  ~CapturedStream() {
+    remove(filename_.c_str());
+  }
+
+  String GetCapturedString() {
+    if (uncaptured_fd_ != -1) {
+      // Restores the original stream.
+      fflush(NULL);
+      dup2(uncaptured_fd_, fd_);
+      close(uncaptured_fd_);
+      uncaptured_fd_ = -1;
+    }
+
+    FILE* const file = posix::FOpen(filename_.c_str(), "r");
+    const String content = ReadEntireFile(file);
+    posix::FClose(file);
+    return content;
+  }
+
+ private:
+  // Reads the entire content of a file as a String.
+  static String ReadEntireFile(FILE* file);
+
+  // Returns the size (in bytes) of a file.
+  static size_t GetFileSize(FILE* file);
+
+  const int fd_;  // A stream to capture.
+  int uncaptured_fd_;
+  // Name of the temporary file holding the stderr output.
+  ::std::string filename_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+};
+
+// Returns the size (in bytes) of a file.
+size_t CapturedStream::GetFileSize(FILE* file) {
+  fseek(file, 0, SEEK_END);
+  return static_cast<size_t>(ftell(file));
+}
+
+// Reads the entire content of a file as a string.
+String CapturedStream::ReadEntireFile(FILE* file) {
+  const size_t file_size = GetFileSize(file);
+  char* const buffer = new char[file_size];
+
+  size_t bytes_last_read = 0;  // # of bytes read in the last fread()
+  size_t bytes_read = 0;       // # of bytes read so far
+
+  fseek(file, 0, SEEK_SET);
+
+  // Keeps reading the file until we cannot read further or the
+  // pre-determined file size is reached.
+  do {
+    bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
+    bytes_read += bytes_last_read;
+  } while (bytes_last_read > 0 && bytes_read < file_size);
+
+  const String content(buffer, bytes_read);
+  delete[] buffer;
+
+  return content;
+}
+
+# ifdef _MSC_VER
+#  pragma warning(pop)
+# endif  // _MSC_VER
+
+static CapturedStream* g_captured_stderr = NULL;
+static CapturedStream* g_captured_stdout = NULL;
+
+// Starts capturing an output stream (stdout/stderr).
+void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) {
+  if (*stream != NULL) {
+    GTEST_LOG_(FATAL) << "Only one " << stream_name
+                      << " capturer can exist at a time.";
+  }
+  *stream = new CapturedStream(fd);
+}
+
+// Stops capturing the output stream and returns the captured string.
+String GetCapturedStream(CapturedStream** captured_stream) {
+  const String content = (*captured_stream)->GetCapturedString();
+
+  delete *captured_stream;
+  *captured_stream = NULL;
+
+  return content;
+}
+
+// Starts capturing stdout.
+void CaptureStdout() {
+  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr.
+void CaptureStderr() {
+  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout and returns the captured string.
+String GetCapturedStdout() { return GetCapturedStream(&g_captured_stdout); }
+
+// Stops capturing stderr and returns the captured string.
+String GetCapturedStderr() { return GetCapturedStream(&g_captured_stderr); }
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+#if GTEST_HAS_DEATH_TEST
+
+// A copy of all command line arguments.  Set by InitGoogleTest().
+::std::vector<String> g_argvs;
+
+// Returns the command line as a vector of strings.
+const ::std::vector<String>& GetArgvs() { return g_argvs; }
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+void Abort() {
+  DebugBreak();
+  TerminateProcess(GetCurrentProcess(), 1);
+}
+}  // namespace posix
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Returns the name of the environment variable corresponding to the
+// given flag.  For example, FlagToEnvVar("foo") will return
+// "GTEST_FOO" in the open-source version.
+static String FlagToEnvVar(const char* flag) {
+  const String full_flag =
+      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
+
+  Message env_var;
+  for (size_t i = 0; i != full_flag.length(); i++) {
+    env_var << ToUpper(full_flag.c_str()[i]);
+  }
+
+  return env_var.GetString();
+}
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes
+// the result to *value and returns true; otherwise leaves *value
+// unchanged and returns false.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
+  // Parses the environment variable as a decimal integer.
+  char* end = NULL;
+  const long long_value = strtol(str, &end, 10);  // NOLINT
+
+  // Has strtol() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value \"" << str << "\".\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  // Is the parsed value in the range of an Int32?
+  const Int32 result = static_cast<Int32>(long_value);
+  if (long_value == LONG_MAX || long_value == LONG_MIN ||
+      // The parsed value overflows as a long.  (strtol() returns
+      // LONG_MAX or LONG_MIN when the input overflows.)
+      result != long_value
+      // The parsed value overflows as an Int32.
+      ) {
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value " << str << ", which overflows.\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  *value = result;
+  return true;
+}
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// The value is considered true iff it's not "0".
+bool BoolFromGTestEnv(const char* flag, bool default_value) {
+  const String env_var = FlagToEnvVar(flag);
+  const char* const string_value = posix::GetEnv(env_var.c_str());
+  return string_value == NULL ?
+      default_value : strcmp(string_value, "0") != 0;
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
+  const String env_var = FlagToEnvVar(flag);
+  const char* const string_value = posix::GetEnv(env_var.c_str());
+  if (string_value == NULL) {
+    // The environment variable is not set.
+    return default_value;
+  }
+
+  Int32 result = default_value;
+  if (!ParseInt32(Message() << "Environment variable " << env_var,
+                  string_value, &result)) {
+    printf("The default value %s is used.\n",
+           (Message() << default_value).GetString().c_str());
+    fflush(stdout);
+    return default_value;
+  }
+
+  return result;
+}
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+const char* StringFromGTestEnv(const char* flag, const char* default_value) {
+  const String env_var = FlagToEnvVar(flag);
+  const char* const value = posix::GetEnv(env_var.c_str());
+  return value == NULL ? default_value : value;
+}
+
+// Pin the vtables to this file.
+#if GTEST_HAS_PTHREAD
+ThreadWithParamBase::~ThreadWithParamBase() {}
+ThreadLocalValueHolderBase::~ThreadLocalValueHolderBase() {}
+#endif
+TestFactoryBase::~TestFactoryBase() {}
+
+}  // namespace internal
+}  // namespace testing
+
+// Pin the vtable to this file.
+#if !GTEST_NO_LLVM_RAW_OSTREAM
+namespace llvm {
+void convertible_fwd_ostream::anchor() {}
+}
+#endif
diff --git a/utils/unittest/googletest/gtest-printers.cc b/utils/unittest/googletest/src/gtest-printers.cc
index 205a394..205a394 100644
--- a/utils/unittest/googletest/gtest-printers.cc
+++ b/utils/unittest/googletest/src/gtest-printers.cc
diff --git a/utils/unittest/googletest/src/gtest-test-part.cc b/utils/unittest/googletest/src/gtest-test-part.cc
new file mode 100644
index 0000000..5ddc67c
--- /dev/null
+++ b/utils/unittest/googletest/src/gtest-test-part.cc
@@ -0,0 +1,110 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// The Google C++ Testing Framework (Google Test)
+
+#include "gtest/gtest-test-part.h"
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+using internal::GetUnitTestImpl;
+
+// Gets the summary of the failure message by omitting the stack trace
+// in it.
+internal::String TestPartResult::ExtractSummary(const char* message) {
+  const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
+  return stack_trace == NULL ? internal::String(message) :
+      internal::String(message, stack_trace - message);
+}
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
+  return os
+      << result.file_name() << ":" << result.line_number() << ": "
+      << (result.type() == TestPartResult::kSuccess ? "Success" :
+          result.type() == TestPartResult::kFatalFailure ? "Fatal failure" :
+          "Non-fatal failure") << ":\n"
+      << result.message() << std::endl;
+}
+
+// Appends a TestPartResult to the array.
+void TestPartResultArray::Append(const TestPartResult& result) {
+  array_.push_back(result);
+}
+
+// Returns the TestPartResult at the given index (0-based).
+const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
+  if (index < 0 || index >= size()) {
+    printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
+    internal::posix::Abort();
+  }
+
+  return array_[index];
+}
+
+// Returns the number of TestPartResult objects in the array.
+int TestPartResultArray::size() const {
+  return static_cast<int>(array_.size());
+}
+
+namespace internal {
+
+HasNewFatalFailureHelper::HasNewFatalFailureHelper()
+    : has_new_fatal_failure_(false),
+      original_reporter_(GetUnitTestImpl()->
+                         GetTestPartResultReporterForCurrentThread()) {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
+}
+
+HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
+  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
+      original_reporter_);
+}
+
+void HasNewFatalFailureHelper::ReportTestPartResult(
+    const TestPartResult& result) {
+  if (result.fatally_failed())
+    has_new_fatal_failure_ = true;
+  original_reporter_->ReportTestPartResult(result);
+}
+
+}  // namespace internal
+
+}  // namespace testing
diff --git a/utils/unittest/googletest/gtest-typed-test.cc b/utils/unittest/googletest/src/gtest-typed-test.cc
index a5cc88f..a5cc88f 100644
--- a/utils/unittest/googletest/gtest-typed-test.cc
+++ b/utils/unittest/googletest/src/gtest-typed-test.cc
diff --git a/utils/unittest/googletest/src/gtest.cc b/utils/unittest/googletest/src/gtest.cc
new file mode 100644
index 0000000..bf850c6
--- /dev/null
+++ b/utils/unittest/googletest/src/gtest.cc
@@ -0,0 +1,4876 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+
+#include "gtest/gtest.h"
+#include "gtest/gtest-spi.h"
+
+#include <ctype.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <ostream>  // NOLINT
+#include <sstream>
+#include <vector>
+
+#if GTEST_OS_LINUX
+
+// TODO(kenton@google.com): Use autoconf to detect availability of
+// gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+# include <fcntl.h>  // NOLINT
+# include <limits.h>  // NOLINT
+# include <sched.h>  // NOLINT
+// Declares vsnprintf().  This header is not available on Windows.
+# include <strings.h>  // NOLINT
+# include <sys/mman.h>  // NOLINT
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+# include <string>
+
+#elif GTEST_OS_SYMBIAN
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+#elif GTEST_OS_ZOS
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h>  // NOLINT
+
+// On z/OS we additionally need strings.h for strcasecmp.
+# include <strings.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
+
+# include <windows.h>  // NOLINT
+
+#elif GTEST_OS_WINDOWS  // We are on Windows proper.
+
+# include <io.h>  // NOLINT
+# include <sys/timeb.h>  // NOLINT
+# include <sys/types.h>  // NOLINT
+# include <sys/stat.h>  // NOLINT
+
+# if GTEST_OS_WINDOWS_MINGW
+// MinGW has gettimeofday() but not _ftime64().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+//   gettimeofday().
+// TODO(kenton@google.com): There are other ways to get the time on
+//   Windows, like GetTickCount() or GetSystemTimeAsFileTime().  MinGW
+//   supports these.  consider using them instead.
+#  define GTEST_HAS_GETTIMEOFDAY_ 1
+#  include <sys/time.h>  // NOLINT
+# endif  // GTEST_OS_WINDOWS_MINGW
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <windows.h>  // NOLINT
+
+#else
+
+// Assume other platforms have gettimeofday().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+//   gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <sys/time.h>  // NOLINT
+# include <unistd.h>  // NOLINT
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h>  // NOLINT
+# include <netdb.h>  // NOLINT
+#endif
+
+// Indicates that this translation unit is part of Google Test's
+// implementation.  It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error.  This trick is to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// his code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+#if GTEST_OS_WINDOWS
+# define vsnprintf _vsnprintf
+#endif  // GTEST_OS_WINDOWS
+
+namespace testing {
+
+using internal::CountIf;
+using internal::ForEach;
+using internal::GetElementOr;
+using internal::Shuffle;
+
+// Constants.
+
+// A test whose test case name or test name matches this filter is
+// disabled and not run.
+static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
+
+// A test case whose name matches this filter is considered a death
+// test case and will be run before test cases whose name doesn't
+// match this filter.
+static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
+
+// A test filter that matches everything.
+static const char kUniversalFilter[] = "*";
+
+// The default output file for XML output.
+static const char kDefaultOutputFile[] = "test_detail.xml";
+
+// The environment variable name for the test shard index.
+static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
+// The environment variable name for the total number of test shards.
+static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
+// The environment variable name for the test shard status file.
+static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
+
+namespace internal {
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+const char kStackTraceMarker[] = "\nStack trace:\n";
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+bool g_help_flag = false;
+
+}  // namespace internal
+
+GTEST_DEFINE_bool_(
+    also_run_disabled_tests,
+    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+    "Run disabled tests too, in addition to the tests normally being run.");
+
+GTEST_DEFINE_bool_(
+    break_on_failure,
+    internal::BoolFromGTestEnv("break_on_failure", false),
+    "True iff a failed assertion should be a debugger break-point.");
+
+GTEST_DEFINE_bool_(
+    catch_exceptions,
+    internal::BoolFromGTestEnv("catch_exceptions", true),
+    "True iff " GTEST_NAME_
+    " should catch exceptions and treat them as test failures.");
+
+GTEST_DEFINE_string_(
+    color,
+    internal::StringFromGTestEnv("color", "auto"),
+    "Whether to use colors in the output.  Valid values: yes, no, "
+    "and auto.  'auto' means to use colors if the output is "
+    "being sent to a terminal and the TERM environment variable "
+    "is set to xterm, xterm-color, xterm-256color, linux or cygwin.");
+
+GTEST_DEFINE_string_(
+    filter,
+    internal::StringFromGTestEnv("filter", kUniversalFilter),
+    "A colon-separated list of glob (not regex) patterns "
+    "for filtering the tests to run, optionally followed by a "
+    "'-' and a : separated list of negative patterns (tests to "
+    "exclude).  A test is run if it matches one of the positive "
+    "patterns and does not match any of the negative patterns.");
+
+GTEST_DEFINE_bool_(list_tests, false,
+                   "List all tests without running them.");
+
+GTEST_DEFINE_string_(
+    output,
+    internal::StringFromGTestEnv("output", ""),
+    "A format (currently must be \"xml\"), optionally followed "
+    "by a colon and an output file name or directory. A directory "
+    "is indicated by a trailing pathname separator. "
+    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
+    "If a directory is specified, output files will be created "
+    "within that directory, with file-names based on the test "
+    "executable's name and, if necessary, made unique by adding "
+    "digits.");
+
+GTEST_DEFINE_bool_(
+    print_time,
+    internal::BoolFromGTestEnv("print_time", true),
+    "True iff " GTEST_NAME_
+    " should display elapsed time in text output.");
+
+GTEST_DEFINE_int32_(
+    random_seed,
+    internal::Int32FromGTestEnv("random_seed", 0),
+    "Random number seed to use when shuffling test orders.  Must be in range "
+    "[1, 99999], or 0 to use a seed based on the current time.");
+
+GTEST_DEFINE_int32_(
+    repeat,
+    internal::Int32FromGTestEnv("repeat", 1),
+    "How many times to repeat each test.  Specify a negative number "
+    "for repeating forever.  Useful for shaking out flaky tests.");
+
+GTEST_DEFINE_bool_(
+    show_internal_stack_frames, false,
+    "True iff " GTEST_NAME_ " should include internal stack frames when "
+    "printing test failure stack traces.");
+
+GTEST_DEFINE_bool_(
+    shuffle,
+    internal::BoolFromGTestEnv("shuffle", false),
+    "True iff " GTEST_NAME_
+    " should randomize tests' order on every run.");
+
+GTEST_DEFINE_int32_(
+    stack_trace_depth,
+    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+    "The maximum number of stack frames to print when an "
+    "assertion fails.  The valid range is 0 through 100, inclusive.");
+
+GTEST_DEFINE_string_(
+    stream_result_to,
+    internal::StringFromGTestEnv("stream_result_to", ""),
+    "This flag specifies the host name and the port number on which to stream "
+    "test results. Example: \"localhost:555\". The flag is effective only on "
+    "Linux.");
+
+GTEST_DEFINE_bool_(
+    throw_on_failure,
+    internal::BoolFromGTestEnv("throw_on_failure", false),
+    "When this flag is specified, a failed assertion will throw an exception "
+    "if exceptions are enabled or exit the program with a non-zero code "
+    "otherwise.");
+
+namespace internal {
+
+// Generates a random number from [0, range), using a Linear
+// Congruential Generator (LCG).  Crashes if 'range' is 0 or greater
+// than kMaxRange.
+UInt32 Random::Generate(UInt32 range) {
+  // These constants are the same as are used in glibc's rand(3).
+  state_ = (1103515245U*state_ + 12345U) % kMaxRange;
+
+  GTEST_CHECK_(range > 0)
+      << "Cannot generate a number in the range [0, 0).";
+  GTEST_CHECK_(range <= kMaxRange)
+      << "Generation of a number in [0, " << range << ") was requested, "
+      << "but this can only generate numbers in [0, " << kMaxRange << ").";
+
+  // Converting via modulus introduces a bit of downward bias, but
+  // it's simple, and a linear congruential generator isn't too good
+  // to begin with.
+  return state_ % range;
+}
+
+// GTestIsInitialized() returns true iff the user has initialized
+// Google Test.  Useful for catching the user mistake of not initializing
+// Google Test before calling RUN_ALL_TESTS().
+//
+// A user must call testing::InitGoogleTest() to initialize Google
+// Test.  g_init_gtest_count is set to the number of times
+// InitGoogleTest() has been called.  We don't protect this variable
+// under a mutex as it is only accessed in the main thread.
+int g_init_gtest_count = 0;
+static bool GTestIsInitialized() { return g_init_gtest_count != 0; }
+
+// Iterates over a vector of TestCases, keeping a running sum of the
+// results of calling a given int-returning method on each.
+// Returns the sum.
+static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
+                               int (TestCase::*method)() const) {
+  int sum = 0;
+  for (size_t i = 0; i < case_list.size(); i++) {
+    sum += (case_list[i]->*method)();
+  }
+  return sum;
+}
+
+// Returns true iff the test case passed.
+static bool TestCasePassed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Passed();
+}
+
+// Returns true iff the test case failed.
+static bool TestCaseFailed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Failed();
+}
+
+// Returns true iff test_case contains at least one test that should
+// run.
+static bool ShouldRunTestCase(const TestCase* test_case) {
+  return test_case->should_run();
+}
+
+// AssertHelper constructor.
+AssertHelper::AssertHelper(TestPartResult::Type type,
+                           const char* file,
+                           int line,
+                           const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {
+}
+
+AssertHelper::~AssertHelper() {
+  delete data_;
+}
+
+// Message assignment, for assertion streaming support.
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest::GetInstance()->
+    AddTestPartResult(data_->type, data_->file, data_->line,
+                      AppendUserMessage(data_->message, message),
+                      UnitTest::GetInstance()->impl()
+                      ->CurrentOsStackTraceExceptTop(1)
+                      // Skips the stack frame for this function itself.
+                      );  // NOLINT
+}
+
+// Mutex for linked pointers.
+GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// Application pathname gotten in InitGoogleTest.
+String g_executable_path;
+
+// Returns the current application's name, removing directory path if that
+// is present.
+FilePath GetCurrentExecutableName() {
+  FilePath result;
+
+#if GTEST_OS_WINDOWS
+  result.Set(FilePath(g_executable_path).RemoveExtension("exe"));
+#else
+  result.Set(FilePath(g_executable_path));
+#endif  // GTEST_OS_WINDOWS
+
+  return result.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format, or "" for normal printed output.
+String UnitTestOptions::GetOutputFormat() {
+  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+  if (gtest_output_flag == NULL) return String("");
+
+  const char* const colon = strchr(gtest_output_flag, ':');
+  return (colon == NULL) ?
+      String(gtest_output_flag) :
+      String(gtest_output_flag, colon - gtest_output_flag);
+}
+
+// Returns the name of the requested output file, or the default if none
+// was explicitly specified.
+String UnitTestOptions::GetAbsolutePathToOutputFile() {
+  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+  if (gtest_output_flag == NULL)
+    return String("");
+
+  const char* const colon = strchr(gtest_output_flag, ':');
+  if (colon == NULL)
+    return String(internal::FilePath::ConcatPaths(
+               internal::FilePath(
+                   UnitTest::GetInstance()->original_working_dir()),
+               internal::FilePath(kDefaultOutputFile)).ToString() );
+
+  internal::FilePath output_name(colon + 1);
+  if (!output_name.IsAbsolutePath())
+    // TODO(wan@google.com): on Windows \some\path is not an absolute
+    // path (as its meaning depends on the current drive), yet the
+    // following logic for turning it into an absolute path is wrong.
+    // Fix it.
+    output_name = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(colon + 1));
+
+  if (!output_name.IsDirectory())
+    return output_name.ToString();
+
+  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
+      output_name, internal::GetCurrentExecutableName(),
+      GetOutputFormat().c_str()));
+  return result.ToString();
+}
+
+// Returns true iff the wildcard pattern matches the string.  The
+// first ':' or '\0' character in pattern marks the end of it.
+//
+// This recursive algorithm isn't very efficient, but is clear and
+// works well enough for matching test names, which are short.
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+                                           const char *str) {
+  switch (*pattern) {
+    case '\0':
+    case ':':  // Either ':' or '\0' marks the end of the pattern.
+      return *str == '\0';
+    case '?':  // Matches any single character.
+      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
+    case '*':  // Matches any string (possibly empty) of characters.
+      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
+          PatternMatchesString(pattern + 1, str);
+    default:  // Non-special character.  Matches itself.
+      return *pattern == *str &&
+          PatternMatchesString(pattern + 1, str + 1);
+  }
+}
+
+bool UnitTestOptions::MatchesFilter(const String& name, const char* filter) {
+  const char *cur_pattern = filter;
+  for (;;) {
+    if (PatternMatchesString(cur_pattern, name.c_str())) {
+      return true;
+    }
+
+    // Finds the next pattern in the filter.
+    cur_pattern = strchr(cur_pattern, ':');
+
+    // Returns if no more pattern can be found.
+    if (cur_pattern == NULL) {
+      return false;
+    }
+
+    // Skips the pattern separater (the ':' character).
+    cur_pattern++;
+  }
+}
+
+// TODO(keithray): move String function implementations to gtest-string.cc.
+
+// Returns true iff the user-specified filter matches the test case
+// name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const String &test_case_name,
+                                        const String &test_name) {
+  const String& full_name = String::Format("%s.%s",
+                                           test_case_name.c_str(),
+                                           test_name.c_str());
+
+  // Split --gtest_filter at '-', if there is one, to separate into
+  // positive filter and negative filter portions
+  const char* const p = GTEST_FLAG(filter).c_str();
+  const char* const dash = strchr(p, '-');
+  String positive;
+  String negative;
+  if (dash == NULL) {
+    positive = GTEST_FLAG(filter).c_str();  // Whole string is a positive filter
+    negative = String("");
+  } else {
+    positive = String(p, dash - p);  // Everything up to the dash
+    negative = String(dash+1);       // Everything after the dash
+    if (positive.empty()) {
+      // Treat '-test1' as the same as '*-test1'
+      positive = kUniversalFilter;
+    }
+  }
+
+  // A filter is a colon-separated list of patterns.  It matches a
+  // test if any pattern in it matches the test.
+  return (MatchesFilter(full_name, positive.c_str()) &&
+          !MatchesFilter(full_name, negative.c_str()));
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+// This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // Google Test should handle a SEH exception if:
+  //   1. the user wants it to, AND
+  //   2. this is not a breakpoint exception, AND
+  //   3. this is not a C++ exception (VC++ implements them via SEH,
+  //      apparently).
+  //
+  // SEH exception code for C++ exceptions.
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  bool should_handle = true;
+
+  if (!GTEST_FLAG(catch_exceptions))
+    should_handle = false;
+  else if (exception_code == EXCEPTION_BREAKPOINT)
+    should_handle = false;
+  else if (exception_code == kCxxExceptionCode)
+    should_handle = false;
+
+  return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
+}
+#endif  // GTEST_HAS_SEH
+
+}  // namespace internal
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test.  The 'result' parameter specifies where to report the
+// results. Intercepts only failures from the current thread.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    TestPartResultArray* result)
+    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
+      result_(result) {
+  Init();
+}
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test.  The 'result' parameter specifies where to report the
+// results.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+    InterceptMode intercept_mode, TestPartResultArray* result)
+    : intercept_mode_(intercept_mode),
+      result_(result) {
+  Init();
+}
+
+void ScopedFakeTestPartResultReporter::Init() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+    old_reporter_ = impl->GetGlobalTestPartResultReporter();
+    impl->SetGlobalTestPartResultReporter(this);
+  } else {
+    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
+    impl->SetTestPartResultReporterForCurrentThread(this);
+  }
+}
+
+// The d'tor restores the test part result reporter used by Google Test
+// before.
+ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+    impl->SetGlobalTestPartResultReporter(old_reporter_);
+  } else {
+    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
+  }
+}
+
+// Increments the test part result count and remembers the result.
+// This method is from the TestPartResultReporterInterface interface.
+void ScopedFakeTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  result_->Append(result);
+}
+
+namespace internal {
+
+// Returns the type ID of ::testing::Test.  We should always call this
+// instead of GetTypeId< ::testing::Test>() to get the type ID of
+// testing::Test.  This is to work around a suspected linker bug when
+// using Google Test as a framework on Mac OS X.  The bug causes
+// GetTypeId< ::testing::Test>() to return different values depending
+// on whether the call is from the Google Test framework itself or
+// from user test code.  GetTestTypeId() is guaranteed to always
+// return the same value, as it always calls GetTypeId<>() from the
+// gtest.cc, which is within the Google Test framework.
+TypeId GetTestTypeId() {
+  return GetTypeId<Test>();
+}
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// This predicate-formatter checks that 'results' contains a test part
+// failure of the given type and that the failure message contains the
+// given substring.
+AssertionResult HasOneFailure(const char* /* results_expr */,
+                              const char* /* type_expr */,
+                              const char* /* substr_expr */,
+                              const TestPartResultArray& results,
+                              TestPartResult::Type type,
+                              const string& substr) {
+  const String expected(type == TestPartResult::kFatalFailure ?
+                        "1 fatal failure" :
+                        "1 non-fatal failure");
+  Message msg;
+  if (results.size() != 1) {
+    msg << "Expected: " << expected << "\n"
+        << "  Actual: " << results.size() << " failures";
+    for (int i = 0; i < results.size(); i++) {
+      msg << "\n" << results.GetTestPartResult(i);
+    }
+    return AssertionFailure() << msg;
+  }
+
+  const TestPartResult& r = results.GetTestPartResult(0);
+  if (r.type() != type) {
+    return AssertionFailure() << "Expected: " << expected << "\n"
+                              << "  Actual:\n"
+                              << r;
+  }
+
+  if (strstr(r.message(), substr.c_str()) == NULL) {
+    return AssertionFailure() << "Expected: " << expected << " containing \""
+                              << substr << "\"\n"
+                              << "  Actual:\n"
+                              << r;
+  }
+
+  return AssertionSuccess();
+}
+
+// The constructor of SingleFailureChecker remembers where to look up
+// test part results, what type of failure we expect, and what
+// substring the failure message should contain.
+SingleFailureChecker:: SingleFailureChecker(
+    const TestPartResultArray* results,
+    TestPartResult::Type type,
+    const string& substr)
+    : results_(results),
+      type_(type),
+      substr_(substr) {}
+
+// The destructor of SingleFailureChecker verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring.  If that's not the case, a
+// non-fatal failure will be generated.
+SingleFailureChecker::~SingleFailureChecker() {
+  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
+}
+
+DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+
+void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  unit_test_->current_test_result()->AddTestPartResult(result);
+  unit_test_->listeners()->repeater()->OnTestPartResult(result);
+}
+
+DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
+    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+
+void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
+}
+
+// Returns the global test part result reporter.
+TestPartResultReporterInterface*
+UnitTestImpl::GetGlobalTestPartResultReporter() {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  return global_test_part_result_repoter_;
+}
+
+// Sets the global test part result reporter.
+void UnitTestImpl::SetGlobalTestPartResultReporter(
+    TestPartResultReporterInterface* reporter) {
+  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+  global_test_part_result_repoter_ = reporter;
+}
+
+// Returns the test part result reporter for the current thread.
+TestPartResultReporterInterface*
+UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
+  return per_thread_test_part_result_reporter_.get();
+}
+
+// Sets the test part result reporter for the current thread.
+void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
+    TestPartResultReporterInterface* reporter) {
+  per_thread_test_part_result_reporter_.set(reporter);
+}
+
+// Gets the number of successful test cases.
+int UnitTestImpl::successful_test_case_count() const {
+  return CountIf(test_cases_, TestCasePassed);
+}
+
+// Gets the number of failed test cases.
+int UnitTestImpl::failed_test_case_count() const {
+  return CountIf(test_cases_, TestCaseFailed);
+}
+
+// Gets the number of all test cases.
+int UnitTestImpl::total_test_case_count() const {
+  return static_cast<int>(test_cases_.size());
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTestImpl::test_case_to_run_count() const {
+  return CountIf(test_cases_, ShouldRunTestCase);
+}
+
+// Gets the number of successful tests.
+int UnitTestImpl::successful_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
+}
+
+// Gets the number of failed tests.
+int UnitTestImpl::failed_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
+}
+
+// Gets the number of disabled tests.
+int UnitTestImpl::disabled_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
+}
+
+// Gets the number of all tests.
+int UnitTestImpl::total_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
+}
+
+// Gets the number of tests that should run.
+int UnitTestImpl::test_to_run_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
+}
+
+// Returns the current OS stack trace as a String.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+String UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
+  (void)skip_count;
+  return String("");
+}
+
+// Returns the current time in milliseconds.
+TimeInMillis GetTimeInMillis() {
+#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
+  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
+  // http://analogous.blogspot.com/2005/04/epoch.html
+  const TimeInMillis kJavaEpochToWinFileTimeDelta =
+    static_cast<TimeInMillis>(116444736UL) * 100000UL;
+  const DWORD kTenthMicrosInMilliSecond = 10000;
+
+  SYSTEMTIME now_systime;
+  FILETIME now_filetime;
+  ULARGE_INTEGER now_int64;
+  // TODO(kenton@google.com): Shouldn't this just use
+  //   GetSystemTimeAsFileTime()?
+  GetSystemTime(&now_systime);
+  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
+    now_int64.LowPart = now_filetime.dwLowDateTime;
+    now_int64.HighPart = now_filetime.dwHighDateTime;
+    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
+      kJavaEpochToWinFileTimeDelta;
+    return now_int64.QuadPart;
+  }
+  return 0;
+#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
+  __timeb64 now;
+
+# ifdef _MSC_VER
+
+  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
+  // (deprecated function) there.
+  // TODO(kenton@google.com): Use GetTickCount()?  Or use
+  //   SystemTimeToFileTime()
+#  pragma warning(push)          // Saves the current warning state.
+#  pragma warning(disable:4996)  // Temporarily disables warning 4996.
+  _ftime64(&now);
+#  pragma warning(pop)           // Restores the warning state.
+# else
+
+  _ftime64(&now);
+
+# endif  // _MSC_VER
+
+  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
+#elif GTEST_HAS_GETTIMEOFDAY_
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
+#else
+# error "Don't know how to get the current time on your system."
+#endif
+}
+
+// Utilities
+
+// class String
+
+// Returns the input enclosed in double quotes if it's not NULL;
+// otherwise returns "(null)".  For example, "\"Hello\"" is returned
+// for input "Hello".
+//
+// This is useful for printing a C string in the syntax of a literal.
+//
+// Known issue: escape sequences are not handled yet.
+String String::ShowCStringQuoted(const char* c_str) {
+  return c_str ? String::Format("\"%s\"", c_str) : String("(null)");
+}
+
+// Copies at most length characters from str into a newly-allocated
+// piece of memory of size length+1.  The memory is allocated with new[].
+// A terminating null byte is written to the memory, and a pointer to it
+// is returned.  If str is NULL, NULL is returned.
+static char* CloneString(const char* str, size_t length) {
+  if (str == NULL) {
+    return NULL;
+  } else {
+    char* const clone = new char[length + 1];
+    posix::StrNCpy(clone, str, length);
+    clone[length] = '\0';
+    return clone;
+  }
+}
+
+// Clones a 0-terminated C string, allocating memory using new.  The
+// caller is responsible for deleting[] the return value.  Returns the
+// cloned string, or NULL if the input is NULL.
+const char * String::CloneCString(const char* c_str) {
+  return (c_str == NULL) ?
+                    NULL : CloneString(c_str, strlen(c_str));
+}
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Creates a UTF-16 wide string from the given ANSI string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the wide string, or NULL if the
+// input is NULL.
+LPCWSTR String::AnsiToUtf16(const char* ansi) {
+  if (!ansi) return NULL;
+  const int length = strlen(ansi);
+  const int unicode_length =
+      MultiByteToWideChar(CP_ACP, 0, ansi, length,
+                          NULL, 0);
+  WCHAR* unicode = new WCHAR[unicode_length + 1];
+  MultiByteToWideChar(CP_ACP, 0, ansi, length,
+                      unicode, unicode_length);
+  unicode[unicode_length] = 0;
+  return unicode;
+}
+
+// Creates an ANSI string from the given wide string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the ANSI string, or NULL if the
+// input is NULL.
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
+  if (!utf16_str) return NULL;
+  const int ansi_length =
+      WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
+                          NULL, 0, NULL, NULL);
+  char* ansi = new char[ansi_length + 1];
+  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
+                      ansi, ansi_length, NULL, NULL);
+  ansi[ansi_length] = 0;
+  return ansi;
+}
+
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Compares two C strings.  Returns true iff they have the same content.
+//
+// Unlike strcmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CStringEquals(const char * lhs, const char * rhs) {
+  if ( lhs == NULL ) return rhs == NULL;
+
+  if ( rhs == NULL ) return false;
+
+  return strcmp(lhs, rhs) == 0;
+}
+
+#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+// Converts an array of wide chars to a narrow string using the UTF-8
+// encoding, and streams the result to the given Message object.
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
+                                     Message* msg) {
+  // TODO(wan): consider allowing a testing::String object to
+  // contain '\0'.  This will make it behave more like std::string,
+  // and will allow ToUtf8String() to return the correct encoding
+  // for '\0' s.t. we can get rid of the conditional here (and in
+  // several other places).
+  for (size_t i = 0; i != length; ) {  // NOLINT
+    if (wstr[i] != L'\0') {
+      *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
+      while (i != length && wstr[i] != L'\0')
+        i++;
+    } else {
+      *msg << '\0';
+      i++;
+    }
+  }
+}
+
+#endif  // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+}  // namespace internal
+
+#if GTEST_HAS_STD_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator <<(const ::std::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator <<(const ::wstring& wstr) {
+  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+  return *this;
+}
+#endif  // GTEST_HAS_GLOBAL_WSTRING
+
+// AssertionResult constructors.
+// Used in EXPECT_TRUE/FALSE(assertion_result).
+AssertionResult::AssertionResult(const AssertionResult& other)
+    : success_(other.success_),
+      message_(other.message_.get() != NULL ?
+               new ::std::string(*other.message_) :
+               static_cast< ::std::string*>(NULL)) {
+}
+
+// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+  AssertionResult negation(!success_);
+  if (message_.get() != NULL)
+    negation << *message_;
+  return negation;
+}
+
+// Makes a successful assertion result.
+AssertionResult AssertionSuccess() {
+  return AssertionResult(true);
+}
+
+// Makes a failed assertion result.
+AssertionResult AssertionFailure() {
+  return AssertionResult(false);
+}
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+  return AssertionFailure() << message;
+}
+
+namespace internal {
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   expected_expression: "foo"
+//   actual_expression:   "bar"
+//   expected_value:      "5"
+//   actual_value:        "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+AssertionResult EqFailure(const char* expected_expression,
+                          const char* actual_expression,
+                          const String& expected_value,
+                          const String& actual_value,
+                          bool ignoring_case) {
+  Message msg;
+  msg << "Value of: " << actual_expression;
+  if (actual_value != actual_expression) {
+    msg << "\n  Actual: " << actual_value;
+  }
+
+  msg << "\nExpected: " << expected_expression;
+  if (ignoring_case) {
+    msg << " (ignoring case)";
+  }
+  if (expected_value != expected_expression) {
+    msg << "\nWhich is: " << expected_value;
+  }
+
+  return AssertionFailure() << msg;
+}
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+String GetBoolAssertionFailureMessage(const AssertionResult& assertion_result,
+                                      const char* expression_text,
+                                      const char* actual_predicate_value,
+                                      const char* expected_predicate_value) {
+  const char* actual_message = assertion_result.message();
+  Message msg;
+  msg << "Value of: " << expression_text
+      << "\n  Actual: " << actual_predicate_value;
+  if (actual_message[0] != '\0')
+    msg << " (" << actual_message << ")";
+  msg << "\nExpected: " << expected_predicate_value;
+  return msg.GetString();
+}
+
+// Helper function for implementing ASSERT_NEAR.
+AssertionResult DoubleNearPredFormat(const char* expr1,
+                                     const char* expr2,
+                                     const char* abs_error_expr,
+                                     double val1,
+                                     double val2,
+                                     double abs_error) {
+  const double diff = fabs(val1 - val2);
+  if (diff <= abs_error) return AssertionSuccess();
+
+  // TODO(wan): do not print the value of an expression if it's
+  // already a literal.
+  return AssertionFailure()
+      << "The difference between " << expr1 << " and " << expr2
+      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
+      << expr1 << " evaluates to " << val1 << ",\n"
+      << expr2 << " evaluates to " << val2 << ", and\n"
+      << abs_error_expr << " evaluates to " << abs_error << ".";
+}
+
+
+// Helper template for implementing FloatLE() and DoubleLE().
+template <typename RawType>
+AssertionResult FloatingPointLE(const char* expr1,
+                                const char* expr2,
+                                RawType val1,
+                                RawType val2) {
+  // Returns success if val1 is less than val2,
+  if (val1 < val2) {
+    return AssertionSuccess();
+  }
+
+  // or if val1 is almost equal to val2.
+  const FloatingPoint<RawType> lhs(val1), rhs(val2);
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  // Note that the above two checks will both fail if either val1 or
+  // val2 is NaN, as the IEEE floating-point standard requires that
+  // any predicate involving a NaN must return false.
+
+  ::std::stringstream val1_ss;
+  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val1;
+
+  ::std::stringstream val2_ss;
+  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val2;
+
+  return AssertionFailure()
+      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
+      << StringStreamToString(&val2_ss);
+}
+
+}  // namespace internal
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+AssertionResult FloatLE(const char* expr1, const char* expr2,
+                        float val1, float val2) {
+  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+}
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+AssertionResult DoubleLE(const char* expr1, const char* expr2,
+                         double val1, double val2) {
+  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+}
+
+namespace internal {
+
+// The helper function for {ASSERT|EXPECT}_EQ with int or enum
+// arguments.
+AssertionResult CmpHelperEQ(const char* expected_expression,
+                            const char* actual_expression,
+                            BiggestInt expected,
+                            BiggestInt actual) {
+  if (expected == actual) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   FormatForComparisonFailureMessage(expected, actual),
+                   FormatForComparisonFailureMessage(actual, expected),
+                   false);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
+// just to avoid copy-and-paste of similar code.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+                                   BiggestInt val1, BiggestInt val2) {\
+  if (val1 op val2) {\
+    return AssertionSuccess();\
+  } else {\
+    return AssertionFailure() \
+        << "Expected: (" << expr1 << ") " #op " (" << expr2\
+        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
+  }\
+}
+
+// Implements the helper function for {ASSERT|EXPECT}_NE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(NE, !=)
+// Implements the helper function for {ASSERT|EXPECT}_LE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LE, <=)
+// Implements the helper function for {ASSERT|EXPECT}_LT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LT, < )
+// Implements the helper function for {ASSERT|EXPECT}_GE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GE, >=)
+// Implements the helper function for {ASSERT|EXPECT}_GT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GT, > )
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                               const char* actual_expression,
+                               const char* expected,
+                               const char* actual) {
+  if (String::CStringEquals(expected, actual)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   String::ShowCStringQuoted(expected),
+                   String::ShowCStringQuoted(actual),
+                   false);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression,
+                                   const char* actual_expression,
+                                   const char* expected,
+                                   const char* actual) {
+  if (String::CaseInsensitiveCStringEquals(expected, actual)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   String::ShowCStringQuoted(expected),
+                   String::ShowCStringQuoted(actual),
+                   true);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const char* s1,
+                               const char* s2) {
+  if (!String::CStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                              << s2_expression << "), actual: \""
+                              << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+                                   const char* s2_expression,
+                                   const char* s1,
+                                   const char* s2) {
+  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure()
+        << "Expected: (" << s1_expression << ") != ("
+        << s2_expression << ") (ignoring case), actual: \""
+        << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+}  // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubString() and IsNotSubstring().
+
+// This group of overloaded functions return true iff needle is a
+// substring of haystack.  NULL is considered a substring of itself
+// only.
+
+bool IsSubstringPred(const char* needle, const char* haystack) {
+  if (needle == NULL || haystack == NULL)
+    return needle == haystack;
+
+  return strstr(haystack, needle) != NULL;
+}
+
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
+  if (needle == NULL || haystack == NULL)
+    return needle == haystack;
+
+  return wcsstr(haystack, needle) != NULL;
+}
+
+// StringType here can be either ::std::string or ::std::wstring.
+template <typename StringType>
+bool IsSubstringPred(const StringType& needle,
+                     const StringType& haystack) {
+  return haystack.find(needle) != StringType::npos;
+}
+
+// This function implements either IsSubstring() or IsNotSubstring(),
+// depending on the value of the expected_to_be_substring parameter.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(
+    bool expected_to_be_substring,
+    const char* needle_expr, const char* haystack_expr,
+    const StringType& needle, const StringType& haystack) {
+  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
+    return AssertionSuccess();
+
+  const bool is_wide_string = sizeof(needle[0]) > 1;
+  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
+  return AssertionFailure()
+      << "Value of: " << needle_expr << "\n"
+      << "  Actual: " << begin_string_quote << needle << "\"\n"
+      << "Expected: " << (expected_to_be_substring ? "" : "not ")
+      << "a substring of " << haystack_expr << "\n"
+      << "Which is: " << begin_string_quote << haystack << "\"";
+}
+
+}  // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail.
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const char* needle, const char* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const wchar_t* needle, const wchar_t* haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::string& needle, const ::std::string& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+AssertionResult IsSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+    const char* needle_expr, const char* haystack_expr,
+    const ::std::wstring& needle, const ::std::wstring& haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Helper function for IsHRESULT{SuccessFailure} predicates
+AssertionResult HRESULTFailureHelper(const char* expr,
+                                     const char* expected,
+                                     long hr) {  // NOLINT
+# if GTEST_OS_WINDOWS_MOBILE
+
+  // Windows CE doesn't support FormatMessage.
+  const char error_text[] = "";
+
+# else
+
+  // Looks up the human-readable system message for the HRESULT code
+  // and since we're not passing any params to FormatMessage, we don't
+  // want inserts expanded.
+  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
+                       FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kBufSize = 4096;  // String::Format can't exceed this length.
+  // Gets the system's human readable message string for this HRESULT.
+  char error_text[kBufSize] = { '\0' };
+  DWORD message_length = ::FormatMessageA(kFlags,
+                                          0,  // no source, we're asking system
+                                          hr,  // the error
+                                          0,  // no line width restrictions
+                                          error_text,  // output buffer
+                                          kBufSize,  // buf size
+                                          NULL);  // no arguments for inserts
+  // Trims tailing white space (FormatMessage leaves a trailing cr-lf)
+  for (; message_length && IsSpace(error_text[message_length - 1]);
+          --message_length) {
+    error_text[message_length - 1] = '\0';
+  }
+
+# endif  // GTEST_OS_WINDOWS_MOBILE
+
+  const String error_hex(String::Format("0x%08X ", hr));
+  return ::testing::AssertionFailure()
+      << "Expected: " << expr << " " << expected << ".\n"
+      << "  Actual: " << error_hex << error_text << "\n";
+}
+
+}  // namespace
+
+AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
+  if (SUCCEEDED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "succeeds", hr);
+}
+
+AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
+  if (FAILED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "fails", hr);
+}
+
+#endif  // GTEST_OS_WINDOWS
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length   Encoding
+//   0 -  7 bits       0xxxxxxx
+//   8 - 11 bits       110xxxxx 10xxxxxx
+//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
+//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) <<  7) - 1;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
+
+// Chops off the n lowest bits from a bit pattern.  Returns the n
+// lowest bits.  As a side effect, the original bit pattern will be
+// shifted to the right by n bits.
+inline UInt32 ChopLowBits(UInt32* bits, int n) {
+  const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
+  *bits >>= n;
+  return low_bits;
+}
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// The output buffer str must containt at least 32 characters.
+// The function returns the address of the output buffer.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'.
+char* CodePointToUtf8(UInt32 code_point, char* str) {
+  if (code_point <= kMaxCodePoint1) {
+    str[1] = '\0';
+    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    str[2] = '\0';
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    str[3] = '\0';
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx
+  } else if (code_point <= kMaxCodePoint4) {
+    str[4] = '\0';
+    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx
+  } else {
+    // The longest string String::Format can produce when invoked
+    // with these parameters is 28 character long (not including
+    // the terminating nul character). We are asking for 32 character
+    // buffer just in case. This is also enough for strncpy to
+    // null-terminate the destination string.
+    posix::StrNCpy(
+        str, String::Format("(Invalid Unicode 0x%X)", code_point).c_str(), 32);
+    str[31] = '\0';  // Makes sure no change in the format to strncpy leaves
+                     // the result unterminated.
+  }
+  return str;
+}
+
+// The following two functions only make sense if the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
+
+// Determines if the arguments constitute UTF-16 surrogate pair
+// and thus should be combined into a single Unicode code point
+// using CreateCodePointFromUtf16SurrogatePair.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  return sizeof(wchar_t) == 2 &&
+      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+}
+
+// Creates a Unicode code point from UTF16 surrogate pair.
+inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                    wchar_t second) {
+  const UInt32 mask = (1 << 10) - 1;
+  return (sizeof(wchar_t) == 2) ?
+      (((first & mask) << 10) | (second & mask)) + 0x10000 :
+      // This function should not be called when the condition is
+      // false, but we provide a sensible default in case it is.
+      static_cast<UInt32>(first);
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+String WideStringToUtf8(const wchar_t* str, int num_chars) {
+  if (num_chars == -1)
+    num_chars = static_cast<int>(wcslen(str));
+
+  ::std::stringstream stream;
+  for (int i = 0; i < num_chars; ++i) {
+    UInt32 unicode_code_point;
+
+    if (str[i] == L'\0') {
+      break;
+    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
+      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
+                                                                 str[i + 1]);
+      i++;
+    } else {
+      unicode_code_point = static_cast<UInt32>(str[i]);
+    }
+
+    char buffer[32];  // CodePointToUtf8 requires a buffer this big.
+    stream << CodePointToUtf8(unicode_code_point, buffer);
+  }
+  return StringStreamToString(&stream);
+}
+
+// Converts a wide C string to a String using the UTF-8 encoding.
+// NULL will be converted to "(null)".
+String String::ShowWideCString(const wchar_t * wide_c_str) {
+  if (wide_c_str == NULL) return String("(null)");
+
+  return String(internal::WideStringToUtf8(wide_c_str, -1).c_str());
+}
+
+// Similar to ShowWideCString(), except that this function encloses
+// the converted string in double quotes.
+String String::ShowWideCStringQuoted(const wchar_t* wide_c_str) {
+  if (wide_c_str == NULL) return String("(null)");
+
+  return String::Format("L\"%s\"",
+                        String::ShowWideCString(wide_c_str).c_str());
+}
+
+// Compares two wide C strings.  Returns true iff they have the same
+// content.
+//
+// Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+  if (lhs == NULL) return rhs == NULL;
+
+  if (rhs == NULL) return false;
+
+  return wcscmp(lhs, rhs) == 0;
+}
+
+// Helper function for *_STREQ on wide strings.
+AssertionResult CmpHelperSTREQ(const char* expected_expression,
+                               const char* actual_expression,
+                               const wchar_t* expected,
+                               const wchar_t* actual) {
+  if (String::WideCStringEquals(expected, actual)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(expected_expression,
+                   actual_expression,
+                   String::ShowWideCStringQuoted(expected),
+                   String::ShowWideCStringQuoted(actual),
+                   false);
+}
+
+// Helper function for *_STRNE on wide strings.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+                               const char* s2_expression,
+                               const wchar_t* s1,
+                               const wchar_t* s2) {
+  if (!String::WideCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  }
+
+  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+                            << s2_expression << "), actual: "
+                            << String::ShowWideCStringQuoted(s1)
+                            << " vs " << String::ShowWideCStringQuoted(s2);
+}
+
+// Compares two C strings, ignoring case.  Returns true iff they have
+// the same content.
+//
+// Unlike strcasecmp(), this function can handle NULL argument(s).  A
+// NULL C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+  if (lhs == NULL)
+    return rhs == NULL;
+  if (rhs == NULL)
+    return false;
+  return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+  // Compares two wide C strings, ignoring case.  Returns true iff they
+  // have the same content.
+  //
+  // Unlike wcscasecmp(), this function can handle NULL argument(s).
+  // A NULL C string is considered different to any non-NULL wide C string,
+  // including the empty string.
+  // NB: The implementations on different platforms slightly differ.
+  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
+  // environment variable. On GNU platform this method uses wcscasecmp
+  // which compares according to LC_CTYPE category of the current locale.
+  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+  // current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+                                              const wchar_t* rhs) {
+  if (lhs == NULL) return rhs == NULL;
+
+  if (rhs == NULL) return false;
+
+#if GTEST_OS_WINDOWS
+  return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+  return wcscasecmp(lhs, rhs) == 0;
+#else
+  // Android, Mac OS X and Cygwin don't define wcscasecmp.
+  // Other unknown OSes may not define it either.
+  wint_t left, right;
+  do {
+    left = towlower(*lhs++);
+    right = towlower(*rhs++);
+  } while (left && left == right);
+  return left == right;
+#endif  // OS selector
+}
+
+// Compares this with another String.
+// Returns < 0 if this is less than rhs, 0 if this is equal to rhs, or > 0
+// if this is greater than rhs.
+int String::Compare(const String & rhs) const {
+  const char* const lhs_c_str = c_str();
+  const char* const rhs_c_str = rhs.c_str();
+
+  if (lhs_c_str == NULL) {
+    return rhs_c_str == NULL ? 0 : -1;  // NULL < anything except NULL
+  } else if (rhs_c_str == NULL) {
+    return 1;
+  }
+
+  const size_t shorter_str_len =
+      length() <= rhs.length() ? length() : rhs.length();
+  for (size_t i = 0; i != shorter_str_len; i++) {
+    if (lhs_c_str[i] < rhs_c_str[i]) {
+      return -1;
+    } else if (lhs_c_str[i] > rhs_c_str[i]) {
+      return 1;
+    }
+  }
+  return (length() < rhs.length()) ? -1 :
+      (length() > rhs.length()) ? 1 : 0;
+}
+
+// Returns true iff this String ends with the given suffix.  *Any*
+// String is considered to end with a NULL or empty suffix.
+bool String::EndsWith(const char* suffix) const {
+  if (suffix == NULL || CStringEquals(suffix, "")) return true;
+
+  if (c_str() == NULL) return false;
+
+  const size_t this_len = strlen(c_str());
+  const size_t suffix_len = strlen(suffix);
+  return (this_len >= suffix_len) &&
+         CStringEquals(c_str() + this_len - suffix_len, suffix);
+}
+
+// Returns true iff this String ends with the given suffix, ignoring case.
+// Any String is considered to end with a NULL or empty suffix.
+bool String::EndsWithCaseInsensitive(const char* suffix) const {
+  if (suffix == NULL || CStringEquals(suffix, "")) return true;
+
+  if (c_str() == NULL) return false;
+
+  const size_t this_len = strlen(c_str());
+  const size_t suffix_len = strlen(suffix);
+  return (this_len >= suffix_len) &&
+         CaseInsensitiveCStringEquals(c_str() + this_len - suffix_len, suffix);
+}
+
+// Formats a list of arguments to a String, using the same format
+// spec string as for printf.
+//
+// We do not use the StringPrintf class as it is not universally
+// available.
+//
+// The result is limited to 4096 characters (including the tailing 0).
+// If 4096 characters are not enough to format the input, or if
+// there's an error, "<formatting error or buffer exceeded>" is
+// returned.
+String String::Format(const char * format, ...) {
+  va_list args;
+  va_start(args, format);
+
+  char buffer[4096];
+  const int kBufferSize = sizeof(buffer)/sizeof(buffer[0]);
+
+  // MSVC 8 deprecates vsnprintf(), so we want to suppress warning
+  // 4996 (deprecated function) there.
+#ifdef _MSC_VER  // We are using MSVC.
+# pragma warning(push)          // Saves the current warning state.
+# pragma warning(disable:4996)  // Temporarily disables warning 4996.
+
+  const int size = vsnprintf(buffer, kBufferSize, format, args);
+
+# pragma warning(pop)           // Restores the warning state.
+#else  // We are not using MSVC.
+  const int size = vsnprintf(buffer, kBufferSize, format, args);
+#endif  // _MSC_VER
+  va_end(args);
+
+  // vsnprintf()'s behavior is not portable.  When the buffer is not
+  // big enough, it returns a negative value in MSVC, and returns the
+  // needed buffer size on Linux.  When there is an output error, it
+  // always returns a negative value.  For simplicity, we lump the two
+  // error cases together.
+  if (size < 0 || size >= kBufferSize) {
+    return String("<formatting error or buffer exceeded>");
+  } else {
+    return String(buffer, size);
+  }
+}
+
+// Converts the buffer in a stringstream to a String, converting NUL
+// bytes to "\\0" along the way.
+String StringStreamToString(::std::stringstream* ss) {
+  const ::std::string& str = ss->str();
+  const char* const start = str.c_str();
+  const char* const end = start + str.length();
+
+  // We need to use a helper stringstream to do this transformation
+  // because String doesn't support push_back().
+  ::std::stringstream helper;
+  for (const char* ch = start; ch != end; ++ch) {
+    if (*ch == '\0') {
+      helper << "\\0";  // Replaces NUL with "\\0";
+    } else {
+      helper.put(*ch);
+    }
+  }
+
+  return String(helper.str().c_str());
+}
+
+// Appends the user-supplied message to the Google-Test-generated message.
+String AppendUserMessage(const String& gtest_msg,
+                         const Message& user_msg) {
+  // Appends the user message if it's non-empty.
+  const String user_msg_string = user_msg.GetString();
+  if (user_msg_string.empty()) {
+    return gtest_msg;
+  }
+
+  Message msg;
+  msg << gtest_msg << "\n" << user_msg_string;
+
+  return msg.GetString();
+}
+
+}  // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult.
+TestResult::TestResult()
+    : death_test_count_(0),
+      elapsed_time_(0) {
+}
+
+// D'tor.
+TestResult::~TestResult() {
+}
+
+// Returns the i-th test part result among all the results. i can
+// range from 0 to total_part_count() - 1. If i is not in that range,
+// aborts the program.
+const TestPartResult& TestResult::GetTestPartResult(int i) const {
+  if (i < 0 || i >= total_part_count())
+    internal::posix::Abort();
+  return test_part_results_.at(i);
+}
+
+// Returns the i-th test property. i can range from 0 to
+// test_property_count() - 1. If i is not in that range, aborts the
+// program.
+const TestProperty& TestResult::GetTestProperty(int i) const {
+  if (i < 0 || i >= test_property_count())
+    internal::posix::Abort();
+  return test_properties_.at(i);
+}
+
+// Clears the test part results.
+void TestResult::ClearTestPartResults() {
+  test_part_results_.clear();
+}
+
+// Adds a test part result to the list.
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
+  test_part_results_.push_back(test_part_result);
+}
+
+// Adds a test property to the list. If a property with the same key as the
+// supplied property is already represented, the value of this test_property
+// replaces the old value for that key.
+void TestResult::RecordProperty(const TestProperty& test_property) {
+  if (!ValidateTestProperty(test_property)) {
+    return;
+  }
+  internal::MutexLock lock(&test_properites_mutex_);
+  const std::vector<TestProperty>::iterator property_with_matching_key =
+      std::find_if(test_properties_.begin(), test_properties_.end(),
+                   internal::TestPropertyKeyIs(test_property.key()));
+  if (property_with_matching_key == test_properties_.end()) {
+    test_properties_.push_back(test_property);
+    return;
+  }
+  property_with_matching_key->SetValue(test_property.value());
+}
+
+// Adds a failure if the key is a reserved attribute of Google Test
+// testcase tags.  Returns true if the property is valid.
+bool TestResult::ValidateTestProperty(const TestProperty& test_property) {
+  internal::String key(test_property.key());
+  if (key == "name" || key == "status" || key == "time" || key == "classname") {
+    ADD_FAILURE()
+        << "Reserved key used in RecordProperty(): "
+        << key
+        << " ('name', 'status', 'time', and 'classname' are reserved by "
+        << GTEST_NAME_ << ")";
+    return false;
+  }
+  return true;
+}
+
+// Clears the object.
+void TestResult::Clear() {
+  test_part_results_.clear();
+  test_properties_.clear();
+  death_test_count_ = 0;
+  elapsed_time_ = 0;
+}
+
+// Returns true iff the test failed.
+bool TestResult::Failed() const {
+  for (int i = 0; i < total_part_count(); ++i) {
+    if (GetTestPartResult(i).failed())
+      return true;
+  }
+  return false;
+}
+
+// Returns true iff the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult& result) {
+  return result.fatally_failed();
+}
+
+// Returns true iff the test fatally failed.
+bool TestResult::HasFatalFailure() const {
+  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
+}
+
+// Returns true iff the test part non-fatally failed.
+static bool TestPartNonfatallyFailed(const TestPartResult& result) {
+  return result.nonfatally_failed();
+}
+
+// Returns true iff the test has a non-fatal failure.
+bool TestResult::HasNonfatalFailure() const {
+  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
+}
+
+// Gets the number of all test parts.  This is the sum of the number
+// of successful test parts and the number of failed test parts.
+int TestResult::total_part_count() const {
+  return static_cast<int>(test_part_results_.size());
+}
+
+// Returns the number of the test properties.
+int TestResult::test_property_count() const {
+  return static_cast<int>(test_properties_.size());
+}
+
+// class Test
+
+// Creates a Test object.
+
+// The c'tor saves the values of all Google Test flags.
+Test::Test()
+    : gtest_flag_saver_(new internal::GTestFlagSaver) {
+}
+
+// The d'tor restores the values of all Google Test flags.
+Test::~Test() {
+  delete gtest_flag_saver_;
+}
+
+// Sets up the test fixture.
+//
+// A sub-class may override this.
+void Test::SetUp() {
+}
+
+// Tears down the test fixture.
+//
+// A sub-class may override this.
+void Test::TearDown() {
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const char* key, const char* value) {
+  UnitTest::GetInstance()->RecordPropertyForCurrentTest(key, value);
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const char* key, int value) {
+  Message value_message;
+  value_message << value;
+  RecordProperty(key, value_message.GetString().c_str());
+}
+
+namespace internal {
+
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const String& message) {
+  // This function is a friend of UnitTest and as such has access to
+  // AddTestPartResult.
+  UnitTest::GetInstance()->AddTestPartResult(
+      result_type,
+      NULL,  // No info about the source file where the exception occurred.
+      -1,    // We have no info on which line caused the exception.
+      message,
+      String());  // No stack trace, either.
+}
+
+}  // namespace internal
+
+// Google Test requires all tests in the same test case to use the same test
+// fixture class.  This function checks if the current test has the
+// same fixture class as the first test in the current test case.  If
+// yes, it returns true; otherwise it generates a Google Test failure and
+// returns false.
+bool Test::HasSameFixtureClass() {
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  const TestCase* const test_case = impl->current_test_case();
+
+  // Info about the first test in the current test case.
+  const TestInfo* const first_test_info = test_case->test_info_list()[0];
+  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
+  const char* const first_test_name = first_test_info->name();
+
+  // Info about the current test.
+  const TestInfo* const this_test_info = impl->current_test_info();
+  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
+  const char* const this_test_name = this_test_info->name();
+
+  if (this_fixture_id != first_fixture_id) {
+    // Is the first test defined using TEST?
+    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
+    // Is this test defined using TEST?
+    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
+
+    if (first_is_TEST || this_is_TEST) {
+      // The user mixed TEST and TEST_F in this test case - we'll tell
+      // him/her how to fix it.
+
+      // Gets the name of the TEST and the name of the TEST_F.  Note
+      // that first_is_TEST and this_is_TEST cannot both be true, as
+      // the fixture IDs are different for the two tests.
+      const char* const TEST_name =
+          first_is_TEST ? first_test_name : this_test_name;
+      const char* const TEST_F_name =
+          first_is_TEST ? this_test_name : first_test_name;
+
+      ADD_FAILURE()
+          << "All tests in the same test case must use the same test fixture\n"
+          << "class, so mixing TEST_F and TEST in the same test case is\n"
+          << "illegal.  In test case " << this_test_info->test_case_name()
+          << ",\n"
+          << "test " << TEST_F_name << " is defined using TEST_F but\n"
+          << "test " << TEST_name << " is defined using TEST.  You probably\n"
+          << "want to change the TEST to TEST_F or move it to another test\n"
+          << "case.";
+    } else {
+      // The user defined two fixture classes with the same name in
+      // two namespaces - we'll tell him/her how to fix it.
+      ADD_FAILURE()
+          << "All tests in the same test case must use the same test fixture\n"
+          << "class.  However, in test case "
+          << this_test_info->test_case_name() << ",\n"
+          << "you defined test " << first_test_name
+          << " and test " << this_test_name << "\n"
+          << "using two different test fixture classes.  This can happen if\n"
+          << "the two classes are from different namespaces or translation\n"
+          << "units and have the same name.  You should probably rename one\n"
+          << "of the classes to put the tests into different test cases.";
+    }
+    return false;
+  }
+
+  return true;
+}
+
+#if GTEST_HAS_SEH
+
+// Adds an "exception thrown" fatal failure to the current test.  This
+// function returns its result via an output parameter pointer because VC++
+// prohibits creation of objects with destructors on stack in functions
+// using __try (see error C2712).
+static internal::String* FormatSehExceptionMessage(DWORD exception_code,
+                                                   const char* location) {
+  Message message;
+  message << "SEH exception with code 0x" << std::setbase(16) <<
+    exception_code << std::setbase(10) << " thrown in " << location << ".";
+
+  return new internal::String(message.GetString());
+}
+
+#endif  // GTEST_HAS_SEH
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Adds an "exception thrown" fatal failure to the current test.
+static internal::String FormatCxxExceptionMessage(const char* description,
+                                                  const char* location) {
+  Message message;
+  if (description != NULL) {
+    message << "C++ exception with description \"" << description << "\"";
+  } else {
+    message << "Unknown C++ exception";
+  }
+  message << " thrown in " << location << ".";
+
+  return message.GetString();
+}
+
+static internal::String PrintTestPartResultToString(
+    const TestPartResult& test_part_result);
+
+// A failed Google Test assertion will throw an exception of this type when
+// GTEST_FLAG(throw_on_failure) is true (if exceptions are enabled).  We
+// derive it from std::runtime_error, which is for errors presumably
+// detectable only at run time.  Since std::runtime_error inherits from
+// std::exception, many testing frameworks know how to extract and print the
+// message inside it.
+class GoogleTestFailureException : public ::std::runtime_error {
+ public:
+  explicit GoogleTestFailureException(const TestPartResult& failure)
+      : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+};
+#endif  // GTEST_HAS_EXCEPTIONS
+
+namespace internal {
+// We put these helper functions in the internal namespace as IBM's xlC
+// compiler rejects the code if they were declared static.
+
+// Runs the given method and handles SEH exceptions it throws, when
+// SEH is supported; returns the 0-value for type Result in case of an
+// SEH exception.  (Microsoft compilers cannot handle SEH and C++
+// exceptions in the same function.  Therefore, we provide a separate
+// wrapper function for handling SEH exceptions.)
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+#if GTEST_HAS_SEH
+  __try {
+    return (object->*method)();
+  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
+      GetExceptionCode())) {
+    // We create the exception message on the heap because VC++ prohibits
+    // creation of objects with destructors on stack in functions using __try
+    // (see error C2712).
+    internal::String* exception_message = FormatSehExceptionMessage(
+        GetExceptionCode(), location);
+    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+                                             *exception_message);
+    delete exception_message;
+    return static_cast<Result>(0);
+  }
+#else
+  (void)location;
+  return (object->*method)();
+#endif  // GTEST_HAS_SEH
+}
+
+// Runs the given method and catches and reports C++ and/or SEH-style
+// exceptions, if they are supported; returns the 0-value for type
+// Result in case of an SEH exception.
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(
+    T* object, Result (T::*method)(), const char* location) {
+  // NOTE: The user code can affect the way in which Google Test handles
+  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+  // after the exception is caught and either report or re-throw the
+  // exception based on the flag's value:
+  //
+  // try {
+  //   // Perform the test method.
+  // } catch (...) {
+  //   if (GTEST_FLAG(catch_exceptions))
+  //     // Report the exception as failure.
+  //   else
+  //     throw;  // Re-throws the original exception.
+  // }
+  //
+  // However, the purpose of this flag is to allow the program to drop into
+  // the debugger when the exception is thrown. On most platforms, once the
+  // control enters the catch block, the exception origin information is
+  // lost and the debugger will stop the program at the point of the
+  // re-throw in this function -- instead of at the point of the original
+  // throw statement in the code under test.  For this reason, we perform
+  // the check early, sacrificing the ability to affect Google Test's
+  // exception handling in the method where the exception is thrown.
+  if (internal::GetUnitTestImpl()->catch_exceptions()) {
+#if GTEST_HAS_EXCEPTIONS
+    try {
+      return HandleSehExceptionsInMethodIfSupported(object, method, location);
+    } catch (const GoogleTestFailureException&) {  // NOLINT
+      // This exception doesn't originate in code under test. It makes no
+      // sense to report it as a test failure.
+      throw;
+    } catch (const std::exception& e) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(e.what(), location));
+    } catch (...) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(NULL, location));
+    }
+    return static_cast<Result>(0);
+#else
+    return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif  // GTEST_HAS_EXCEPTIONS
+  } else {
+    return (object->*method)();
+  }
+}
+
+}  // namespace internal
+
+// Runs the test and updates the test result.
+void Test::Run() {
+  if (!HasSameFixtureClass()) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+  // We will run the test only if SetUp() was successful.
+  if (!HasFatalFailure()) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        this, &Test::TestBody, "the test body");
+  }
+
+  // However, we want to clean up as much as possible.  Hence we will
+  // always call TearDown(), even if SetUp() or the test body has
+  // failed.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &Test::TearDown, "TearDown()");
+}
+
+// Returns true iff the current test has a fatal failure.
+bool Test::HasFatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
+}
+
+// Returns true iff the current test has a non-fatal failure.
+bool Test::HasNonfatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->
+      HasNonfatalFailure();
+}
+
+// class TestInfo
+
+// Constructs a TestInfo object. It assumes ownership of the test factory
+// object.
+// TODO(vladl@google.com): Make a_test_case_name and a_name const string&'s
+// to signify they cannot be NULLs.
+TestInfo::TestInfo(const char* a_test_case_name,
+                   const char* a_name,
+                   const char* a_type_param,
+                   const char* a_value_param,
+                   internal::TypeId fixture_class_id,
+                   internal::TestFactoryBase* factory)
+    : test_case_name_(a_test_case_name),
+      name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+      value_param_(a_value_param ? new std::string(a_value_param) : NULL),
+      fixture_class_id_(fixture_class_id),
+      should_run_(false),
+      is_disabled_(false),
+      matches_filter_(false),
+      factory_(factory),
+      result_() {}
+
+// Destructs a TestInfo object.
+TestInfo::~TestInfo() { delete factory_; }
+
+namespace internal {
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_case_name:   name of the test case
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test case
+//   tear_down_tc:     pointer to the function that tears down the test case
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+TestInfo* MakeAndRegisterTestInfo(
+    const char* test_case_name, const char* name,
+    const char* type_param,
+    const char* value_param,
+    TypeId fixture_class_id,
+    SetUpTestCaseFunc set_up_tc,
+    TearDownTestCaseFunc tear_down_tc,
+    TestFactoryBase* factory) {
+  TestInfo* const test_info =
+      new TestInfo(test_case_name, name, type_param, value_param,
+                   fixture_class_id, factory);
+  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
+  return test_info;
+}
+
+#if GTEST_HAS_PARAM_TEST
+void ReportInvalidTestCaseType(const char* test_case_name,
+                               const char* file, int line) {
+  Message errors;
+  errors
+      << "Attempted redefinition of test case " << test_case_name << ".\n"
+      << "All tests in the same test case must use the same test fixture\n"
+      << "class.  However, in test case " << test_case_name << ", you tried\n"
+      << "to define a test using a fixture class different from the one\n"
+      << "used earlier. This can happen if the two fixture classes are\n"
+      << "from different namespaces and have the same name. You should\n"
+      << "probably rename one of the classes to put the tests into different\n"
+      << "test cases.";
+
+  fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+          errors.GetString().c_str());
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+}  // namespace internal
+
+namespace internal {
+
+// This method expands all parameterized tests registered with macros TEST_P
+// and INSTANTIATE_TEST_CASE_P into regular tests and registers those.
+// This will be done just once during the program runtime.
+void UnitTestImpl::RegisterParameterizedTests() {
+#if GTEST_HAS_PARAM_TEST
+  if (!parameterized_tests_registered_) {
+    parameterized_test_registry_.RegisterTests();
+    parameterized_tests_registered_ = true;
+  }
+#endif
+}
+
+}  // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+void TestInfo::Run() {
+  if (!should_run_) return;
+
+  // Tells UnitTest where to store test result.
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TimeInMillis start = internal::GetTimeInMillis();
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+
+  // Creates the test object.
+  Test* const test = internal::HandleExceptionsInMethodIfSupported(
+      factory_, &internal::TestFactoryBase::CreateTest,
+      "the test fixture's constructor");
+
+  // Runs the test only if the test object was created and its
+  // constructor didn't generate a fatal failure.
+  if ((test != NULL) && !Test::HasFatalFailure()) {
+    // This doesn't throw as all user code that can throw are wrapped into
+    // exception handling code.
+    test->Run();
+  }
+
+  // Deletes the test object.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      test, &Test::DeleteSelf_, "the test fixture's destructor");
+
+  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+
+  // Tells UnitTest to stop associating assertion results to this
+  // test.
+  impl->set_current_test_info(NULL);
+}
+
+// class TestCase
+
+// Gets the number of successful tests in this test case.
+int TestCase::successful_test_count() const {
+  return CountIf(test_info_list_, TestPassed);
+}
+
+// Gets the number of failed tests in this test case.
+int TestCase::failed_test_count() const {
+  return CountIf(test_info_list_, TestFailed);
+}
+
+int TestCase::disabled_test_count() const {
+  return CountIf(test_info_list_, TestDisabled);
+}
+
+// Get the number of tests in this test case that should run.
+int TestCase::test_to_run_count() const {
+  return CountIf(test_info_list_, ShouldRunTest);
+}
+
+// Gets the number of all tests.
+int TestCase::total_test_count() const {
+  return static_cast<int>(test_info_list_.size());
+}
+
+// Creates a TestCase with the given name.
+//
+// Arguments:
+//
+//   name:         name of the test case
+//   a_type_param: the name of the test case's type parameter, or NULL if
+//                 this is not a typed or a type-parameterized test case.
+//   set_up_tc:    pointer to the function that sets up the test case
+//   tear_down_tc: pointer to the function that tears down the test case
+TestCase::TestCase(const char* a_name, const char* a_type_param,
+                   Test::SetUpTestCaseFunc set_up_tc,
+                   Test::TearDownTestCaseFunc tear_down_tc)
+    : name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+      set_up_tc_(set_up_tc),
+      tear_down_tc_(tear_down_tc),
+      should_run_(false),
+      elapsed_time_(0) {
+}
+
+// Destructor of TestCase.
+TestCase::~TestCase() {
+  // Deletes every Test in the collection.
+  ForEach(test_info_list_, internal::Delete<TestInfo>);
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+const TestInfo* TestCase::GetTestInfo(int i) const {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? NULL : test_info_list_[index];
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+TestInfo* TestCase::GetMutableTestInfo(int i) {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? NULL : test_info_list_[index];
+}
+
+// Adds a test to this test case.  Will delete the test upon
+// destruction of the TestCase object.
+void TestCase::AddTestInfo(TestInfo * test_info) {
+  test_info_list_.push_back(test_info);
+  test_indices_.push_back(static_cast<int>(test_indices_.size()));
+}
+
+// Runs every test in this TestCase.
+void TestCase::Run() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_case(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  repeater->OnTestCaseStart(*this);
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunSetUpTestCase, "SetUpTestCase()");
+
+  const internal::TimeInMillis start = internal::GetTimeInMillis();
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Run();
+  }
+  elapsed_time_ = internal::GetTimeInMillis() - start;
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestCase::RunTearDownTestCase, "TearDownTestCase()");
+
+  repeater->OnTestCaseEnd(*this);
+  impl->set_current_test_case(NULL);
+}
+
+// Clears the results of all tests in this test case.
+void TestCase::ClearResult() {
+  ForEach(test_info_list_, TestInfo::ClearTestResult);
+}
+
+// Shuffles the tests in this test case.
+void TestCase::ShuffleTests(internal::Random* random) {
+  Shuffle(random, &test_indices_);
+}
+
+// Restores the test order to before the first shuffle.
+void TestCase::UnshuffleTests() {
+  for (size_t i = 0; i < test_indices_.size(); i++) {
+    test_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Formats a countable noun.  Depending on its quantity, either the
+// singular form or the plural form is used. e.g.
+//
+// FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
+// FormatCountableNoun(5, "book", "books") returns "5 books".
+static internal::String FormatCountableNoun(int count,
+                                            const char * singular_form,
+                                            const char * plural_form) {
+  return internal::String::Format("%d %s", count,
+                                  count == 1 ? singular_form : plural_form);
+}
+
+// Formats the count of tests.
+static internal::String FormatTestCount(int test_count) {
+  return FormatCountableNoun(test_count, "test", "tests");
+}
+
+// Formats the count of test cases.
+static internal::String FormatTestCaseCount(int test_case_count) {
+  return FormatCountableNoun(test_case_count, "test case", "test cases");
+}
+
+// Converts a TestPartResult::Type enum to human-friendly string
+// representation.  Both kNonFatalFailure and kFatalFailure are translated
+// to "Failure", as the user usually doesn't care about the difference
+// between the two when viewing the test result.
+static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+  switch (type) {
+    case TestPartResult::kSuccess:
+      return "Success";
+
+    case TestPartResult::kNonFatalFailure:
+    case TestPartResult::kFatalFailure:
+#ifdef _MSC_VER
+      return "error: ";
+#else
+      return "Failure\n";
+#endif
+  }
+
+  // All cases return, so this is unreachable but GCC doesn't know it
+  abort();
+}
+
+// Prints a TestPartResult to a String.
+static internal::String PrintTestPartResultToString(
+    const TestPartResult& test_part_result) {
+  return (Message()
+          << internal::FormatFileLocation(test_part_result.file_name(),
+                                          test_part_result.line_number())
+          << " " << TestPartResultTypeToString(test_part_result.type())
+          << test_part_result.message()).GetString();
+}
+
+// Prints a TestPartResult.
+static void PrintTestPartResult(const TestPartResult& test_part_result) {
+  const internal::String& result =
+      PrintTestPartResultToString(test_part_result);
+  printf("%s\n", result.c_str());
+  fflush(stdout);
+  // If the test program runs in Visual Studio or a debugger, the
+  // following statements add the test part result message to the Output
+  // window such that the user can double-click on it to jump to the
+  // corresponding source code location; otherwise they do nothing.
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  // We don't call OutputDebugString*() on Windows Mobile, as printing
+  // to stdout is done by OutputDebugString() there already - we don't
+  // want the same message printed twice.
+  ::OutputDebugStringA(result.c_str());
+  ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+
+namespace internal {
+
+enum GTestColor {
+  COLOR_DEFAULT,
+  COLOR_RED,
+  COLOR_GREEN,
+  COLOR_YELLOW
+};
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns the character attribute for the given color.
+WORD GetColorAttribute(GTestColor color) {
+  switch (color) {
+    case COLOR_RED:    return FOREGROUND_RED;
+    case COLOR_GREEN:  return FOREGROUND_GREEN;
+    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
+    default:           return 0;
+  }
+}
+
+#else
+
+// Returns the ANSI color code for the given color.  COLOR_DEFAULT is
+// an invalid input.
+const char* GetAnsiColorCode(GTestColor color) {
+  switch (color) {
+    case COLOR_RED:     return "1";
+    case COLOR_GREEN:   return "2";
+    case COLOR_YELLOW:  return "3";
+    default:            return NULL;
+  };
+}
+
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns true iff Google Test should use colors in the output.
+bool ShouldUseColor(bool stdout_is_tty) {
+  const char* const gtest_color = GTEST_FLAG(color).c_str();
+
+  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
+#if GTEST_OS_WINDOWS
+    // On Windows the TERM variable is usually not set, but the
+    // console there does support colors.
+    return stdout_is_tty;
+#else
+    // On non-Windows platforms, we rely on the TERM variable.
+    const char* const term = posix::GetEnv("TERM");
+    const bool term_supports_color =
+        String::CStringEquals(term, "xterm") ||
+        String::CStringEquals(term, "xterm-color") ||
+        String::CStringEquals(term, "xterm-256color") ||
+        String::CStringEquals(term, "screen") ||
+        String::CStringEquals(term, "linux") ||
+        String::CStringEquals(term, "cygwin");
+    return stdout_is_tty && term_supports_color;
+#endif  // GTEST_OS_WINDOWS
+  }
+
+  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
+      String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+      String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+      String::CStringEquals(gtest_color, "1");
+  // We take "yes", "true", "t", and "1" as meaning "yes".  If the
+  // value is neither one of these nor "auto", we treat it as "no" to
+  // be conservative.
+}
+
+// Helpers for printing colored strings to stdout. Note that on Windows, we
+// cannot simply emit special characters and have the terminal change colors.
+// This routine must actually emit the characters rather than return a string
+// that would be colored when printed, as can be done on Linux.
+void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
+  const bool use_color = false;
+#else
+  static const bool in_color_mode =
+      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
+  // The '!= 0' comparison is necessary to satisfy MSVC 7.1.
+
+  if (!use_color) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle,
+                          GetColorAttribute(color) | FOREGROUND_INTENSITY);
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  printf("\033[0;3%sm", GetAnsiColorCode(color));
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  va_end(args);
+}
+
+void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
+  const char* const type_param = test_info.type_param();
+  const char* const value_param = test_info.value_param();
+
+  if (type_param != NULL || value_param != NULL) {
+    printf(", where ");
+    if (type_param != NULL) {
+      printf("TypeParam = %s", type_param);
+      if (value_param != NULL)
+        printf(" and ");
+    }
+    if (value_param != NULL) {
+      printf("GetParam() = %s", value_param);
+    }
+  }
+}
+
+// This class implements the TestEventListener interface.
+//
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+  PrettyUnitTestResultPrinter() {}
+  static void PrintTestName(const char * test_case, const char * test) {
+    printf("%s.%s", test_case, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+
+ private:
+  static void PrintFailedTests(const UnitTest& unit_test);
+
+  internal::String test_case_name_;
+};
+
+  // Fired before each iteration of tests starts.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+    const UnitTest& unit_test, int iteration) {
+  if (GTEST_FLAG(repeat) != 1)
+    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+
+  const char* const filter = GTEST_FLAG(filter).c_str();
+
+  // Prints the filter if it's not *.  This reminds the user that some
+  // tests may be skipped.
+  if (!internal::String::CStringEquals(filter, kUniversalFilter)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
+  }
+
+  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+    const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: This is test shard %d of %s.\n",
+                  static_cast<int>(shard_index) + 1,
+                  internal::posix::GetEnv(kTestTotalShards));
+  }
+
+  if (GTEST_FLAG(shuffle)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: Randomizing tests' orders with a seed of %d .\n",
+                  unit_test.random_seed());
+  }
+
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("Running %s from %s.\n",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  printf("Global test environment set-up.\n");
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
+  test_case_name_ = test_case.name();
+  const internal::String counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_case_name_.c_str());
+  if (test_case.type_param() == NULL) {
+    printf("\n");
+  } else {
+    printf(", where TypeParam = %s\n", test_case.type_param());
+  }
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
+  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
+  PrintTestName(test_case_name_.c_str(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+// Called after an assertion failure.
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  // If the test part succeeded, we don't need to do anything.
+  if (result.type() == TestPartResult::kSuccess)
+    return;
+
+  // Print failure message from the assertion (e.g. expected this and got that).
+  PrintTestPartResult(result);
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  if (test_info.result()->Passed()) {
+    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+  } else {
+    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  }
+  PrintTestName(test_case_name_.c_str(), test_info.name());
+  if (test_info.result()->Failed())
+    PrintFullTestCommentIfPresent(test_info);
+
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms)\n", internal::StreamableToString(
+           test_info.result()->elapsed_time()).c_str());
+  } else {
+    printf("\n");
+  }
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
+  if (!GTEST_FLAG(print_time)) return;
+
+  test_case_name_ = test_case.name();
+  const internal::String counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n",
+         counts.c_str(), test_case_name_.c_str(),
+         internal::StreamableToString(test_case.elapsed_time()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+    const UnitTest& /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  printf("Global test environment tear-down\n");
+  fflush(stdout);
+}
+
+// Internal helper for printing the list of failed tests.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
+  const int failed_test_count = unit_test.failed_test_count();
+  if (failed_test_count == 0) {
+    return;
+  }
+
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    const TestCase& test_case = *unit_test.GetTestCase(i);
+    if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_case.total_test_count(); ++j) {
+      const TestInfo& test_info = *test_case.GetTestInfo(j);
+      if (!test_info.should_run() || test_info.result()->Passed()) {
+        continue;
+      }
+      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      printf("%s.%s", test_case.name(), test_info.name());
+      PrintFullTestCommentIfPresent(test_info);
+      printf("\n");
+    }
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                     int /*iteration*/) {
+  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  int num_failures = unit_test.failed_test_count();
+  if (!unit_test.Passed()) {
+    const int failed_test_count = unit_test.failed_test_count();
+    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
+    printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
+    PrintFailedTests(unit_test);
+    printf("\n%2d FAILED %s\n", num_failures,
+                        num_failures == 1 ? "TEST" : "TESTS");
+  }
+
+  int num_disabled = unit_test.disabled_test_count();
+  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+    if (!num_failures) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(COLOR_YELLOW,
+                  "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled,
+                  num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  virtual ~TestEventRepeater();
+  void Append(TestEventListener *listener);
+  TestEventListener* Release(TestEventListener* listener);
+
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  virtual void OnTestProgramStart(const UnitTest& unit_test);
+  virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
+  virtual void OnTestCaseStart(const TestCase& test_case);
+  virtual void OnTestStart(const TestInfo& test_info);
+  virtual void OnTestPartResult(const TestPartResult& result);
+  virtual void OnTestEnd(const TestInfo& test_info);
+  virtual void OnTestCaseEnd(const TestCase& test_case);
+  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+  virtual void OnTestProgramEnd(const UnitTest& unit_test);
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled_;
+  // The list of listeners that receive events.
+  std::vector<TestEventListener*> listeners_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+TestEventRepeater::~TestEventRepeater() {
+  ForEach(listeners_, Delete<TestEventListener>);
+}
+
+void TestEventRepeater::Append(TestEventListener *listener) {
+  listeners_.push_back(listener);
+}
+
+// TODO(vladl@google.com): Factor the search functionality into Vector::Find.
+TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+  for (size_t i = 0; i < listeners_.size(); ++i) {
+    if (listeners_[i] == listener) {
+      listeners_.erase(listeners_.begin() + i);
+      return listener;
+    }
+  }
+
+  return NULL;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners.
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (size_t i = 0; i < listeners_.size(); i++) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+// This defines a member that forwards the call to all listeners in reverse
+// order.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+  if (forwarding_enabled_) { \
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
+      listeners_[i]->Name(parameter); \
+    } \
+  } \
+}
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+                                             int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = 0; i < listeners_.size(); i++) {
+      listeners_[i]->OnTestIterationStart(unit_test, iteration);
+    }
+  }
+}
+
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+                                           int iteration) {
+  if (forwarding_enabled_) {
+    for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) {
+      listeners_[i]->OnTestIterationEnd(unit_test, iteration);
+    }
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit XmlUnitTestResultPrinter(const char* output_file);
+
+  virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+
+ private:
+  // Is c a whitespace character that is normalized to a space character
+  // when it appears in an XML attribute value?
+  static bool IsNormalizableWhitespace(char c) {
+    return c == 0x9 || c == 0xA || c == 0xD;
+  }
+
+  // May c appear in a well-formed XML document?
+  static bool IsValidXmlCharacter(char c) {
+    return IsNormalizableWhitespace(c) || c >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str.  If
+  // is_attribute is true, the text is meant to appear as an attribute
+  // value, and normalizable whitespace is preserved by replacing it
+  // with character references.
+  static String EscapeXml(const char* str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static string RemoveInvalidXmlCharacters(const string& str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static String EscapeXmlAttribute(const char* str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static String EscapeXmlText(const char* str) { return EscapeXml(str, false); }
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+  // Streams an XML representation of a TestInfo object.
+  static void OutputXmlTestInfo(::std::ostream* stream,
+                                const char* test_case_name,
+                                const TestInfo& test_info);
+
+  // Prints an XML representation of a TestCase object
+  static void PrintXmlTestCase(FILE* out, const TestCase& test_case);
+
+  // Prints an XML summary of unit_test to output stream out.
+  static void PrintXmlUnitTest(FILE* out, const UnitTest& unit_test);
+
+  // Produces a string representing the test properties in a result as space
+  // delimited XML attributes based on the property key="value" pairs.
+  // When the String is not empty, it includes a space at the beginning,
+  // to delimit this attribute from prior attributes.
+  static String TestPropertiesAsXmlAttributes(const TestResult& result);
+
+  // The output file.
+  const String output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+};
+
+// Creates a new XmlUnitTestResultPrinter.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
+    : output_file_(output_file) {
+  if (output_file_.c_str() == NULL || output_file_.empty()) {
+    fprintf(stderr, "XML output file may not be null\n");
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Called after the unit test ends.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                  int /*iteration*/) {
+  FILE* xmlout = NULL;
+  FilePath output_file(output_file_);
+  FilePath output_dir(output_file.RemoveFileName());
+
+  if (output_dir.CreateDirectoriesRecursively()) {
+    xmlout = posix::FOpen(output_file_.c_str(), "w");
+  }
+  if (xmlout == NULL) {
+    // TODO(wan): report the reason of the failure.
+    //
+    // We don't do it for now as:
+    //
+    //   1. There is no urgent need for it.
+    //   2. It's a bit involved to make the errno variable thread-safe on
+    //      all three operating systems (Linux, Windows, and Mac OS).
+    //   3. To interpret the meaning of errno in a thread-safe way,
+    //      we need the strerror_r() function, which is not available on
+    //      Windows.
+    fprintf(stderr,
+            "Unable to open file \"%s\"\n",
+            output_file_.c_str());
+    fflush(stderr);
+    exit(EXIT_FAILURE);
+  }
+  PrintXmlUnitTest(xmlout, unit_test);
+  fclose(xmlout);
+}
+
+// Returns an XML-escaped copy of the input string str.  If is_attribute
+// is true, the text is meant to appear as an attribute value, and
+// normalizable whitespace is preserved by replacing it with character
+// references.
+//
+// Invalid XML characters in str, if any, are stripped from the output.
+// It is expected that most, if not all, of the text processed by this
+// module will consist of ordinary English text.
+// If this module is ever modified to produce version 1.1 XML output,
+// most invalid characters can be retained using character references.
+// TODO(wan): It might be nice to have a minimally invasive, human-readable
+// escaping scheme for invalid characters, rather than dropping them.
+String XmlUnitTestResultPrinter::EscapeXml(const char* str, bool is_attribute) {
+  Message m;
+
+  if (str != NULL) {
+    for (const char* src = str; *src; ++src) {
+      switch (*src) {
+        case '<':
+          m << "&lt;";
+          break;
+        case '>':
+          m << "&gt;";
+          break;
+        case '&':
+          m << "&amp;";
+          break;
+        case '\'':
+          if (is_attribute)
+            m << "&apos;";
+          else
+            m << '\'';
+          break;
+        case '"':
+          if (is_attribute)
+            m << "&quot;";
+          else
+            m << '"';
+          break;
+        default:
+          if (IsValidXmlCharacter(*src)) {
+            if (is_attribute && IsNormalizableWhitespace(*src))
+              m << String::Format("&#x%02X;", unsigned(*src));
+            else
+              m << *src;
+          }
+          break;
+      }
+    }
+  }
+
+  return m.GetString();
+}
+
+// Returns the given string with all characters invalid in XML removed.
+// Currently invalid characters are dropped from the string. An
+// alternative is to replace them with certain characters such as . or ?.
+string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(const string& str) {
+  string output;
+  output.reserve(str.size());
+  for (string::const_iterator it = str.begin(); it != str.end(); ++it)
+    if (IsValidXmlCharacter(*it))
+      output.push_back(*it);
+
+  return output;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << ms/1000.0;
+  return ss.str();
+}
+
+// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  const char* segment = data;
+  *stream << "<![CDATA[";
+  for (;;) {
+    const char* const next_segment = strstr(segment, "]]>");
+    if (next_segment != NULL) {
+      stream->write(
+          segment, static_cast<std::streamsize>(next_segment - segment));
+      *stream << "]]>]]&gt;<![CDATA[";
+      segment = next_segment + strlen("]]>");
+    } else {
+      *stream << segment;
+      break;
+    }
+  }
+  *stream << "]]>";
+}
+
+// Prints an XML representation of a TestInfo object.
+// TODO(wan): There is also value in printing properties with the plain printer.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_case_name,
+                                                 const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  *stream << "    <testcase name=\""
+          << EscapeXmlAttribute(test_info.name()).c_str() << "\"";
+
+  if (test_info.value_param() != NULL) {
+    *stream << " value_param=\"" << EscapeXmlAttribute(test_info.value_param())
+            << "\"";
+  }
+  if (test_info.type_param() != NULL) {
+    *stream << " type_param=\"" << EscapeXmlAttribute(test_info.type_param())
+            << "\"";
+  }
+
+  *stream << " status=\""
+          << (test_info.should_run() ? "run" : "notrun")
+          << "\" time=\""
+          << FormatTimeInMillisAsSeconds(result.elapsed_time())
+          << "\" classname=\"" << EscapeXmlAttribute(test_case_name).c_str()
+          << "\"" << TestPropertiesAsXmlAttributes(result).c_str();
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      if (++failures == 1)
+        *stream << ">\n";
+      *stream << "      <failure message=\""
+              << EscapeXmlAttribute(part.summary()).c_str()
+              << "\" type=\"\">";
+      const string location = internal::FormatCompilerIndependentFileLocation(
+          part.file_name(), part.line_number());
+      const string message = location + "\n" + part.message();
+      OutputXmlCDataSection(stream,
+                            RemoveInvalidXmlCharacters(message).c_str());
+      *stream << "</failure>\n";
+    }
+  }
+
+  if (failures == 0)
+    *stream << " />\n";
+  else
+    *stream << "    </testcase>\n";
+}
+
+// Prints an XML representation of a TestCase object
+void XmlUnitTestResultPrinter::PrintXmlTestCase(FILE* out,
+                                                const TestCase& test_case) {
+  fprintf(out,
+          "  <testsuite name=\"%s\" tests=\"%d\" failures=\"%d\" "
+          "disabled=\"%d\" ",
+          EscapeXmlAttribute(test_case.name()).c_str(),
+          test_case.total_test_count(),
+          test_case.failed_test_count(),
+          test_case.disabled_test_count());
+  fprintf(out,
+          "errors=\"0\" time=\"%s\">\n",
+          FormatTimeInMillisAsSeconds(test_case.elapsed_time()).c_str());
+  for (int i = 0; i < test_case.total_test_count(); ++i) {
+    ::std::stringstream stream;
+    OutputXmlTestInfo(&stream, test_case.name(), *test_case.GetTestInfo(i));
+    fprintf(out, "%s", StringStreamToString(&stream).c_str());
+  }
+  fprintf(out, "  </testsuite>\n");
+}
+
+// Prints an XML summary of unit_test to output stream out.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(FILE* out,
+                                                const UnitTest& unit_test) {
+  fprintf(out, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
+  fprintf(out,
+          "<testsuites tests=\"%d\" failures=\"%d\" disabled=\"%d\" "
+          "errors=\"0\" time=\"%s\" ",
+          unit_test.total_test_count(),
+          unit_test.failed_test_count(),
+          unit_test.disabled_test_count(),
+          FormatTimeInMillisAsSeconds(unit_test.elapsed_time()).c_str());
+  if (GTEST_FLAG(shuffle)) {
+    fprintf(out, "random_seed=\"%d\" ", unit_test.random_seed());
+  }
+  fprintf(out, "name=\"AllTests\">\n");
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i)
+    PrintXmlTestCase(out, *unit_test.GetTestCase(i));
+  fprintf(out, "</testsuites>\n");
+}
+
+// Produces a string representing the test properties in a result as space
+// delimited XML attributes based on the property key="value" pairs.
+String XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult& result) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << " " << property.key() << "="
+        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End XmlUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class StreamingListener : public EmptyTestEventListener {
+ public:
+  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
+  static string UrlEncode(const char* str);
+
+  StreamingListener(const string& host, const string& port)
+      : sockfd_(-1), host_name_(host), port_num_(port) {
+    MakeConnection();
+    Send("gtest_streaming_protocol_version=1.0\n");
+  }
+
+  virtual ~StreamingListener() {
+    if (sockfd_ != -1)
+      CloseConnection();
+  }
+
+  void OnTestProgramStart(const UnitTest& /* unit_test */) {
+    Send("event=TestProgramStart\n");
+  }
+
+  void OnTestProgramEnd(const UnitTest& unit_test) {
+    // Note that Google Test current only report elapsed time for each
+    // test iteration, not for the entire test program.
+    Send(String::Format("event=TestProgramEnd&passed=%d\n",
+                        unit_test.Passed()));
+
+    // Notify the streaming server to stop.
+    CloseConnection();
+  }
+
+  void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
+    Send(String::Format("event=TestIterationStart&iteration=%d\n",
+                        iteration));
+  }
+
+  void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
+    Send(String::Format("event=TestIterationEnd&passed=%d&elapsed_time=%sms\n",
+                        unit_test.Passed(),
+                        StreamableToString(unit_test.elapsed_time()).c_str()));
+  }
+
+  void OnTestCaseStart(const TestCase& test_case) {
+    Send(String::Format("event=TestCaseStart&name=%s\n", test_case.name()));
+  }
+
+  void OnTestCaseEnd(const TestCase& test_case) {
+    Send(String::Format("event=TestCaseEnd&passed=%d&elapsed_time=%sms\n",
+                        test_case.Passed(),
+                        StreamableToString(test_case.elapsed_time()).c_str()));
+  }
+
+  void OnTestStart(const TestInfo& test_info) {
+    Send(String::Format("event=TestStart&name=%s\n", test_info.name()));
+  }
+
+  void OnTestEnd(const TestInfo& test_info) {
+    Send(String::Format(
+        "event=TestEnd&passed=%d&elapsed_time=%sms\n",
+        (test_info.result())->Passed(),
+        StreamableToString((test_info.result())->elapsed_time()).c_str()));
+  }
+
+  void OnTestPartResult(const TestPartResult& test_part_result) {
+    const char* file_name = test_part_result.file_name();
+    if (file_name == NULL)
+      file_name = "";
+    Send(String::Format("event=TestPartResult&file=%s&line=%d&message=",
+                        UrlEncode(file_name).c_str(),
+                        test_part_result.line_number()));
+    Send(UrlEncode(test_part_result.message()) + "\n");
+  }
+
+ private:
+  // Creates a client socket and connects to the server.
+  void MakeConnection();
+
+  // Closes the socket.
+  void CloseConnection() {
+    GTEST_CHECK_(sockfd_ != -1)
+        << "CloseConnection() can be called only when there is a connection.";
+
+    close(sockfd_);
+    sockfd_ = -1;
+  }
+
+  // Sends a string to the socket.
+  void Send(const string& message) {
+    GTEST_CHECK_(sockfd_ != -1)
+        << "Send() can be called only when there is a connection.";
+
+    const int len = static_cast<int>(message.length());
+    if (write(sockfd_, message.c_str(), len) != len) {
+      GTEST_LOG_(WARNING)
+          << "stream_result_to: failed to stream to "
+          << host_name_ << ":" << port_num_;
+    }
+  }
+
+  int sockfd_;   // socket file descriptor
+  const string host_name_;
+  const string port_num_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+};  // class StreamingListener
+
+// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
+// replaces them by "%xx" where xx is their hexadecimal value. For
+// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
+string StreamingListener::UrlEncode(const char* str) {
+  string result;
+  result.reserve(strlen(str) + 1);
+  for (char ch = *str; ch != '\0'; ch = *++str) {
+    switch (ch) {
+      case '%':
+      case '=':
+      case '&':
+      case '\n':
+        result.append(String::Format("%%%02x", static_cast<unsigned char>(ch)));
+        break;
+      default:
+        result.push_back(ch);
+        break;
+    }
+  }
+  return result;
+}
+
+void StreamingListener::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* servinfo = NULL;
+
+  // Use the getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num = getaddrinfo(
+      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL;
+       cur_addr = cur_addr->ai_next) {
+    sockfd_ = socket(
+        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+    if (sockfd_ != -1) {
+      // Connect the client socket to the server socket.
+      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+        close(sockfd_);
+        sockfd_ = -1;
+      }
+    }
+  }
+
+  freeaddrinfo(servinfo);  // all done with this structure
+
+  if (sockfd_ == -1) {
+    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+                        << host_name_ << ":" << port_num_;
+  }
+}
+
+// End of class Streaming Listener
+#endif  // GTEST_CAN_STREAM_RESULTS__
+
+// Class ScopedTrace
+
+// Pushes the given source file location and message onto a per-thread
+// trace stack maintained by Google Test.
+// L < UnitTest::mutex_
+ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) {
+  TraceInfo trace;
+  trace.file = file;
+  trace.line = line;
+  trace.message = message.GetString();
+
+  UnitTest::GetInstance()->PushGTestTrace(trace);
+}
+
+// Pops the info pushed by the c'tor.
+// L < UnitTest::mutex_
+ScopedTrace::~ScopedTrace() {
+  UnitTest::GetInstance()->PopGTestTrace();
+}
+
+
+// class OsStackTraceGetter
+
+// Returns the current OS stack trace as a String.  Parameters:
+//
+//   max_depth  - the maximum number of stack frames to be included
+//                in the trace.
+//   skip_count - the number of top frames to be skipped; doesn't count
+//                against max_depth.
+//
+// L < mutex_
+// We use "L < mutex_" to denote that the function may acquire mutex_.
+String OsStackTraceGetter::CurrentStackTrace(int, int) {
+  return String("");
+}
+
+// L < mutex_
+void OsStackTraceGetter::UponLeavingGTest() {
+}
+
+const char* const
+OsStackTraceGetter::kElidedFramesMarker =
+    "... " GTEST_NAME_ " internal frames ...";
+
+}  // namespace internal
+
+// class TestEventListeners
+
+TestEventListeners::TestEventListeners()
+    : repeater_(new internal::TestEventRepeater()),
+      default_result_printer_(NULL),
+      default_xml_generator_(NULL) {
+}
+
+TestEventListeners::~TestEventListeners() { delete repeater_; }
+
+// Returns the standard listener responsible for the default console
+// output.  Can be removed from the listeners list to shut down default
+// console output.  Note that removing this object from the listener list
+// with Release transfers its ownership to the user.
+void TestEventListeners::Append(TestEventListener* listener) {
+  repeater_->Append(listener);
+}
+
+// Removes the given event listener from the list and returns it.  It then
+// becomes the caller's responsibility to delete the listener. Returns
+// NULL if the listener is not found in the list.
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
+  if (listener == default_result_printer_)
+    default_result_printer_ = NULL;
+  else if (listener == default_xml_generator_)
+    default_xml_generator_ = NULL;
+  return repeater_->Release(listener);
+}
+
+// Returns repeater that broadcasts the TestEventListener events to all
+// subscribers.
+TestEventListener* TestEventListeners::repeater() { return repeater_; }
+
+// Sets the default_result_printer attribute to the provided listener.
+// The listener is also added to the listener list and previous
+// default_result_printer is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
+  if (default_result_printer_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_result_printer_);
+    default_result_printer_ = listener;
+    if (listener != NULL)
+      Append(listener);
+  }
+}
+
+// Sets the default_xml_generator attribute to the provided listener.  The
+// listener is also added to the listener list and previous
+// default_xml_generator is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
+  if (default_xml_generator_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_xml_generator_);
+    default_xml_generator_ = listener;
+    if (listener != NULL)
+      Append(listener);
+  }
+}
+
+// Controls whether events will be forwarded by the repeater to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+  return repeater_->forwarding_enabled();
+}
+
+void TestEventListeners::SuppressEventForwarding() {
+  repeater_->set_forwarding_enabled(false);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object.  The first time this method is
+// called, a UnitTest object is constructed and returned.  Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest * UnitTest::GetInstance() {
+  // When compiled with MSVC 7.1 in optimized mode, destroying the
+  // UnitTest object upon exiting the program messes up the exit code,
+  // causing successful tests to appear failed.  We have to use a
+  // different implementation in this case to bypass the compiler bug.
+  // This implementation makes the compiler happy, at the cost of
+  // leaking the UnitTest object.
+
+  // CodeGear C++Builder insists on a public destructor for the
+  // default implementation.  Use this implementation to keep good OO
+  // design with private destructor.
+
+#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+  static UnitTest* const instance = new UnitTest;
+  return instance;
+#else
+  static UnitTest instance;
+  return &instance;
+#endif  // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+}
+
+// Gets the number of successful test cases.
+int UnitTest::successful_test_case_count() const {
+  return impl()->successful_test_case_count();
+}
+
+// Gets the number of failed test cases.
+int UnitTest::failed_test_case_count() const {
+  return impl()->failed_test_case_count();
+}
+
+// Gets the number of all test cases.
+int UnitTest::total_test_case_count() const {
+  return impl()->total_test_case_count();
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTest::test_case_to_run_count() const {
+  return impl()->test_case_to_run_count();
+}
+
+// Gets the number of successful tests.
+int UnitTest::successful_test_count() const {
+  return impl()->successful_test_count();
+}
+
+// Gets the number of failed tests.
+int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
+
+// Gets the number of disabled tests.
+int UnitTest::disabled_test_count() const {
+  return impl()->disabled_test_count();
+}
+
+// Gets the number of all tests.
+int UnitTest::total_test_count() const { return impl()->total_test_count(); }
+
+// Gets the number of tests that should run.
+int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
+
+// Gets the elapsed time, in milliseconds.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+  return impl()->elapsed_time();
+}
+
+// Returns true iff the unit test passed (i.e. all test cases passed).
+bool UnitTest::Passed() const { return impl()->Passed(); }
+
+// Returns true iff the unit test failed (i.e. some test case failed
+// or something outside of all tests failed).
+bool UnitTest::Failed() const { return impl()->Failed(); }
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+const TestCase* UnitTest::GetTestCase(int i) const {
+  return impl()->GetTestCase(i);
+}
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+TestCase* UnitTest::GetMutableTestCase(int i) {
+  return impl()->GetMutableTestCase(i);
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.
+TestEventListeners& UnitTest::listeners() {
+  return *impl()->listeners();
+}
+
+// Registers and returns a global test environment.  When a test
+// program is run, all global test environments will be set-up in the
+// order they were registered.  After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  if (env == NULL) {
+    return NULL;
+  }
+
+  impl_->environments().push_back(env);
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+// L < mutex_
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+                                 const char* file_name,
+                                 int line_number,
+                                 const internal::String& message,
+                                 const internal::String& os_stack_trace) {
+  Message msg;
+  msg << message;
+
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
+         i > 0; --i) {
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+          << " " << trace.message;
+    }
+  }
+
+  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result =
+    TestPartResult(result_type, file_name, line_number,
+                   msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->
+      ReportTestPartResult(result);
+
+  if (result_type != TestPartResult::kSuccess) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#else
+      abort();
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Creates and adds a property to the current TestResult. If a property matching
+// the supplied value already exists, updates its value instead.
+void UnitTest::RecordPropertyForCurrentTest(const char* key,
+                                            const char* value) {
+  const TestProperty test_property(key, value);
+  impl_->current_test_result()->RecordProperty(test_property);
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() {
+  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
+  // used for the duration of the program.
+  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+
+#if GTEST_HAS_SEH
+  const bool in_death_test_child_process =
+      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+
+  // Either the user wants Google Test to catch exceptions thrown by the
+  // tests or this is executing in the context of death test child
+  // process. In either case the user does not want to see pop-up dialogs
+  // about crashes - they are expected.
+  if (impl()->catch_exceptions() || in_death_test_child_process) {
+
+# if !GTEST_OS_WINDOWS_MOBILE
+    // SetErrorMode doesn't exist on CE.
+    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+# endif  // !GTEST_OS_WINDOWS_MOBILE
+
+# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with _abort().  On Windows,
+    // _abort() can show a dialog with a warning message.  This forces the
+    // abort message to go to stderr instead.
+    _set_error_mode(_OUT_TO_STDERR);
+# endif
+
+# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+    // In the debug version, Visual Studio pops up a separate dialog
+    // offering a choice to debug the aborted program. We need to suppress
+    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+    // executed. Google Test will notify the user of any unexpected
+    // failure via stderr.
+    //
+    // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
+    // Users of prior VC versions shall suffer the agony and pain of
+    // clicking through the countless debug dialogs.
+    // TODO(vladl@google.com): find a way to suppress the abort dialog() in the
+    // debug mode when compiled with VC 7.1 or lower.
+    if (!GTEST_FLAG(break_on_failure))
+      _set_abort_behavior(
+          0x0,                                    // Clear the following flags:
+          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
+# endif
+
+  }
+#endif  // GTEST_HAS_SEH
+
+  return internal::HandleExceptionsInMethodIfSupported(
+      impl(),
+      &internal::UnitTestImpl::RunAllTests,
+      "auxiliary test code (environments or event listeners)") ? 0 : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.
+const char* UnitTest::original_working_dir() const {
+  return impl_->original_working_dir_.c_str();
+}
+
+// Returns the TestCase object for the test that's currently running,
+// or NULL if no test is running.
+// L < mutex_
+const TestCase* UnitTest::current_test_case() const {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_case();
+}
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.
+// L < mutex_
+const TestInfo* UnitTest::current_test_info() const {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_info();
+}
+
+// Returns the random seed used at the start of the current test run.
+int UnitTest::random_seed() const { return impl_->random_seed(); }
+
+#if GTEST_HAS_PARAM_TEST
+// Returns ParameterizedTestCaseRegistry object used to keep track of
+// value-parameterized tests and instantiate and register them.
+// L < mutex_
+internal::ParameterizedTestCaseRegistry&
+    UnitTest::parameterized_test_registry() {
+  return impl_->parameterized_test_registry();
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+// Creates an empty UnitTest.
+UnitTest::UnitTest() {
+  impl_ = new internal::UnitTestImpl(this);
+}
+
+// Destructor of UnitTest.
+UnitTest::~UnitTest() {
+  delete impl_;
+}
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.
+// L < mutex_
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Pops a trace from the per-thread Google Test trace stack.
+// L < mutex_
+void UnitTest::PopGTestTrace() {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().pop_back();
+}
+
+namespace internal {
+
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
+    : parent_(parent),
+#ifdef _MSC_VER
+# pragma warning(push)                    // Saves the current warning state.
+# pragma warning(disable:4355)            // Temporarily disables warning 4355
+                                         // (using this in initializer).
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+# pragma warning(pop)                     // Restores the warning state again.
+#else
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+#endif  // _MSC_VER
+      global_test_part_result_repoter_(
+          &default_global_test_part_result_reporter_),
+      per_thread_test_part_result_reporter_(
+          &default_per_thread_test_part_result_reporter_),
+#if GTEST_HAS_PARAM_TEST
+      parameterized_test_registry_(),
+      parameterized_tests_registered_(false),
+#endif  // GTEST_HAS_PARAM_TEST
+      last_death_test_case_(-1),
+      current_test_case_(NULL),
+      current_test_info_(NULL),
+      ad_hoc_test_result_(),
+      os_stack_trace_getter_(NULL),
+      post_flag_parse_init_performed_(false),
+      random_seed_(0),  // Will be overridden by the flag before first use.
+      random_(0),  // Will be reseeded before first use.
+      elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+      internal_run_death_test_flag_(NULL),
+      death_test_factory_(new DefaultDeathTestFactory),
+#endif
+      // Will be overridden by the flag before first use.
+      catch_exceptions_(false) {
+  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+  // Deletes every TestCase.
+  ForEach(test_cases_, internal::Delete<TestCase>);
+
+  // Deletes every Environment.
+  ForEach(environments_, internal::Delete<Environment>);
+
+  delete os_stack_trace_getter_;
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Disables event forwarding if the control is currently in a death test
+// subprocess. Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  if (internal_run_death_test_flag_.get() != NULL)
+    listeners()->SuppressEventForwarding();
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Initializes event listeners performing XML output as specified by
+// UnitTestOptions. Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+  const String& output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml") {
+    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format != "") {
+    printf("WARNING: unrecognized output format \"%s\" ignored.\n",
+           output_format.c_str());
+    fflush(stdout);
+  }
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Initializes event listeners for streaming test results in String form.
+// Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+  const string& target = GTEST_FLAG(stream_result_to);
+  if (!target.empty()) {
+    const size_t pos = target.find(':');
+    if (pos != string::npos) {
+      listeners()->Append(new StreamingListener(target.substr(0, pos),
+                                                target.substr(pos+1)));
+    } else {
+      printf("WARNING: unrecognized streaming target \"%s\" ignored.\n",
+             target.c_str());
+      fflush(stdout);
+    }
+  }
+}
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Performs initialization dependent upon flag values obtained in
+// ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+// ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+// this function is also called from RunAllTests.  Since this function can be
+// called more than once, it has to be idempotent.
+void UnitTestImpl::PostFlagParsingInit() {
+  // Ensures that this function does not execute more than once.
+  if (!post_flag_parse_init_performed_) {
+    post_flag_parse_init_performed_ = true;
+
+#if GTEST_HAS_DEATH_TEST
+    InitDeathTestSubprocessControlInfo();
+    SuppressTestEventsIfInSubprocess();
+#endif  // GTEST_HAS_DEATH_TEST
+
+    // Registers parameterized tests. This makes parameterized tests
+    // available to the UnitTest reflection API without running
+    // RUN_ALL_TESTS.
+    RegisterParameterizedTests();
+
+    // Configures listeners for XML output. This makes it possible for users
+    // to shut down the default XML output before invoking RUN_ALL_TESTS.
+    ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+    // Configures listeners for streaming test results to the specified server.
+    ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+  }
+}
+
+// A predicate that checks the name of a TestCase against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestCaseNameIs is copyable.
+class TestCaseNameIs {
+ public:
+  // Constructor.
+  explicit TestCaseNameIs(const String& name)
+      : name_(name) {}
+
+  // Returns true iff the name of test_case matches name_.
+  bool operator()(const TestCase* test_case) const {
+    return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  String name_;
+};
+
+// Finds and returns a TestCase with the given name.  If one doesn't
+// exist, creates one and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_case_name: name of the test case
+//   type_param:     the name of the test case's type parameter, or NULL if
+//                   this is not a typed or a type-parameterized test case.
+//   set_up_tc:      pointer to the function that sets up the test case
+//   tear_down_tc:   pointer to the function that tears down the test case
+TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
+                                    const char* type_param,
+                                    Test::SetUpTestCaseFunc set_up_tc,
+                                    Test::TearDownTestCaseFunc tear_down_tc) {
+  // Can we find a TestCase with the given name?
+  const std::vector<TestCase*>::const_iterator test_case =
+      std::find_if(test_cases_.begin(), test_cases_.end(),
+                   TestCaseNameIs(test_case_name));
+
+  if (test_case != test_cases_.end())
+    return *test_case;
+
+  // No.  Let's create one.
+  TestCase* const new_test_case =
+      new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
+
+  // Is this a death test case?
+  if (internal::UnitTestOptions::MatchesFilter(String(test_case_name),
+                                               kDeathTestCaseFilter)) {
+    // Yes.  Inserts the test case after the last death test case
+    // defined so far.  This only works when the test cases haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_case_;
+    test_cases_.insert(test_cases_.begin() + last_death_test_case_,
+                       new_test_case);
+  } else {
+    // No.  Appends to the end of the list.
+    test_cases_.push_back(new_test_case);
+  }
+
+  test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
+  return new_test_case;
+}
+
+// Helpers for setting up / tearing down the given environment.  They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment* env) { env->SetUp(); }
+static void TearDownEnvironment(Environment* env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+  // Makes sure InitGoogleTest() was called.
+  if (!GTestIsInitialized()) {
+    printf("%s",
+           "\nThis test program did NOT call ::testing::InitGoogleTest "
+           "before calling RUN_ALL_TESTS().  Please fix it.\n");
+    return false;
+  }
+
+  // Do not run any test if the --help flag was specified.
+  if (g_help_flag)
+    return true;
+
+  // Repeats the call to the post-flag parsing initialization in case the
+  // user didn't call InitGoogleTest.
+  PostFlagParsingInit();
+
+  // Even if sharding is not on, test runners may want to use the
+  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+  // protocol.
+  internal::WriteToShardStatusFileIfNeeded();
+
+  // True iff we are in a subprocess for running a thread-safe-style
+  // death test.
+  bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+  in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL);
+#endif  // GTEST_HAS_DEATH_TEST
+
+  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+                                        in_subprocess_for_death_test);
+
+  // Compares the full test names with the filter to decide which
+  // tests to run.
+  const bool has_tests_to_run = FilterTests(should_shard
+                                              ? HONOR_SHARDING_PROTOCOL
+                                              : IGNORE_SHARDING_PROTOCOL) > 0;
+
+  // Lists the tests and exits if the --gtest_list_tests flag was specified.
+  if (GTEST_FLAG(list_tests)) {
+    // This must be called *after* FilterTests() has been called.
+    ListTestsMatchingFilter();
+    return true;
+  }
+
+  random_seed_ = GTEST_FLAG(shuffle) ?
+      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+
+  // True iff at least one test has failed.
+  bool failed = false;
+
+  TestEventListener* repeater = listeners()->repeater();
+
+  repeater->OnTestProgramStart(*parent_);
+
+  // How many times to repeat the tests?  We don't want to repeat them
+  // when we are inside the subprocess of a death test.
+  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+  // Repeats forever if the repeat count is negative.
+  const bool forever = repeat < 0;
+  for (int i = 0; forever || i != repeat; i++) {
+    // We want to preserve failures generated by ad-hoc test
+    // assertions executed before RUN_ALL_TESTS().
+    ClearNonAdHocTestResult();
+
+    const TimeInMillis start = GetTimeInMillis();
+
+    // Shuffles test cases and tests if requested.
+    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+      random()->Reseed(random_seed_);
+      // This should be done before calling OnTestIterationStart(),
+      // such that a test event listener can see the actual test order
+      // in the event.
+      ShuffleTests();
+    }
+
+    // Tells the unit test event listeners that the tests are about to start.
+    repeater->OnTestIterationStart(*parent_, i);
+
+    // Runs each test case if there is at least one test to run.
+    if (has_tests_to_run) {
+      // Sets up all environments beforehand.
+      repeater->OnEnvironmentsSetUpStart(*parent_);
+      ForEach(environments_, SetUpEnvironment);
+      repeater->OnEnvironmentsSetUpEnd(*parent_);
+
+      // Runs the tests only if there was no fatal failure during global
+      // set-up.
+      if (!Test::HasFatalFailure()) {
+        for (int test_index = 0; test_index < total_test_case_count();
+             test_index++) {
+          GetMutableTestCase(test_index)->Run();
+        }
+      }
+
+      // Tears down all environments in reverse order afterwards.
+      repeater->OnEnvironmentsTearDownStart(*parent_);
+      std::for_each(environments_.rbegin(), environments_.rend(),
+                    TearDownEnvironment);
+      repeater->OnEnvironmentsTearDownEnd(*parent_);
+    }
+
+    elapsed_time_ = GetTimeInMillis() - start;
+
+    // Tells the unit test event listener that the tests have just finished.
+    repeater->OnTestIterationEnd(*parent_, i);
+
+    // Gets the result and clears it.
+    if (!Passed()) {
+      failed = true;
+    }
+
+    // Restores the original test order after the iteration.  This
+    // allows the user to quickly repro a failure that happens in the
+    // N-th iteration without repeating the first (N - 1) iterations.
+    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+    // case the user somehow changes the value of the flag somewhere
+    // (it's always safe to unshuffle the tests).
+    UnshuffleTests();
+
+    if (GTEST_FLAG(shuffle)) {
+      // Picks a new random seed for each iteration.
+      random_seed_ = GetNextRandomSeed(random_seed_);
+    }
+  }
+
+  repeater->OnTestProgramEnd(*parent_);
+
+  return !failed;
+}
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded() {
+  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
+  if (test_shard_file != NULL) {
+    FILE* const file = posix::FOpen(test_shard_file, "w");
+    if (file == NULL) {
+      ColoredPrintf(COLOR_RED,
+                    "Could not write to the test shard status file \"%s\" "
+                    "specified by the %s environment variable.\n",
+                    test_shard_file, kTestShardStatusFile);
+      fflush(stdout);
+      exit(EXIT_FAILURE);
+    }
+    fclose(file);
+  }
+}
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (i.e., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+bool ShouldShard(const char* total_shards_env,
+                 const char* shard_index_env,
+                 bool in_subprocess_for_death_test) {
+  if (in_subprocess_for_death_test) {
+    return false;
+  }
+
+  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+
+  if (total_shards == -1 && shard_index == -1) {
+    return false;
+  } else if (total_shards == -1 && shard_index != -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestShardIndex << " = " << shard_index
+      << ", but have left " << kTestTotalShards << " unset.\n";
+    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (total_shards != -1 && shard_index == -1) {
+    const Message msg = Message()
+      << "Invalid environment variables: you have "
+      << kTestTotalShards << " = " << total_shards
+      << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (shard_index < 0 || shard_index >= total_shards) {
+    const Message msg = Message()
+      << "Invalid environment variables: we require 0 <= "
+      << kTestShardIndex << " < " << kTestTotalShards
+      << ", but you have " << kTestShardIndex << "=" << shard_index
+      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  }
+
+  return total_shards > 1;
+}
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
+  const char* str_val = posix::GetEnv(var);
+  if (str_val == NULL) {
+    return default_val;
+  }
+
+  Int32 result;
+  if (!ParseInt32(Message() << "The value of environment variable " << var,
+                  str_val, &result)) {
+    exit(EXIT_FAILURE);
+  }
+  return result;
+}
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+  return (test_id % total_shards) == shard_index;
+}
+
+// Compares the name of each test with the user-specified filter to
+// decide whether the test should be run, then records the result in
+// each TestCase and TestInfo object.
+// If shard_tests == true, further filters tests based on sharding
+// variables in the environment - see
+// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide.
+// Returns the number of tests that should run.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
+  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
+      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
+
+  // num_runnable_tests are the number of tests that will
+  // run across all shards (i.e., match filter and are not disabled).
+  // num_selected_tests are the number of tests to be run on
+  // this shard.
+  int num_runnable_tests = 0;
+  int num_selected_tests = 0;
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    TestCase* const test_case = test_cases_[i];
+    const String &test_case_name = test_case->name();
+    test_case->set_should_run(false);
+
+    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+      TestInfo* const test_info = test_case->test_info_list()[j];
+      const String test_name(test_info->name());
+      // A test is disabled if test case name or test name matches
+      // kDisableTestFilter.
+      const bool is_disabled =
+          internal::UnitTestOptions::MatchesFilter(test_case_name,
+                                                   kDisableTestFilter) ||
+          internal::UnitTestOptions::MatchesFilter(test_name,
+                                                   kDisableTestFilter);
+      test_info->is_disabled_ = is_disabled;
+
+      const bool matches_filter =
+          internal::UnitTestOptions::FilterMatchesTest(test_case_name,
+                                                       test_name);
+      test_info->matches_filter_ = matches_filter;
+
+      const bool is_runnable =
+          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+          matches_filter;
+
+      const bool is_selected = is_runnable &&
+          (shard_tests == IGNORE_SHARDING_PROTOCOL ||
+           ShouldRunTestOnShard(total_shards, shard_index,
+                                num_runnable_tests));
+
+      num_runnable_tests += is_runnable;
+      num_selected_tests += is_selected;
+
+      test_info->should_run_ = is_selected;
+      test_case->set_should_run(test_case->should_run() || is_selected);
+    }
+  }
+  return num_selected_tests;
+}
+
+// Prints the names of the tests matching the user-specified filter flag.
+void UnitTestImpl::ListTestsMatchingFilter() {
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    const TestCase* const test_case = test_cases_[i];
+    bool printed_test_case_name = false;
+
+    for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+      const TestInfo* const test_info =
+          test_case->test_info_list()[j];
+      if (test_info->matches_filter_) {
+        if (!printed_test_case_name) {
+          printed_test_case_name = true;
+          printf("%s.\n", test_case->name());
+        }
+        printf("  %s\n", test_info->name());
+      }
+    }
+  }
+  fflush(stdout);
+}
+
+// Sets the OS stack trace getter.
+//
+// Does nothing if the input and the current OS stack trace getter are
+// the same; otherwise, deletes the old getter and makes the input the
+// current getter.
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface* getter) {
+  if (os_stack_trace_getter_ != getter) {
+    delete os_stack_trace_getter_;
+    os_stack_trace_getter_ = getter;
+  }
+}
+
+// Returns the current OS stack trace getter if it is not NULL;
+// otherwise, creates an OsStackTraceGetter, makes it the current
+// getter, and returns it.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ == NULL) {
+    os_stack_trace_getter_ = new OsStackTraceGetter;
+  }
+
+  return os_stack_trace_getter_;
+}
+
+// Returns the TestResult for the test that's currently running, or
+// the TestResult for the ad hoc test if no test is running.
+TestResult* UnitTestImpl::current_test_result() {
+  return current_test_info_ ?
+      &(current_test_info_->result_) : &ad_hoc_test_result_;
+}
+
+// Shuffles all test cases, and the tests within each test case,
+// making sure that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  // Shuffles the death test cases.
+  ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
+
+  // Shuffles the non-death test cases.
+  ShuffleRange(random(), last_death_test_case_ + 1,
+               static_cast<int>(test_cases_.size()), &test_case_indices_);
+
+  // Shuffles the tests inside each test case.
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    test_cases_[i]->ShuffleTests(random());
+  }
+}
+
+// Restores the test cases and tests to their order before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t i = 0; i < test_cases_.size(); i++) {
+    // Unshuffles the tests in each test case.
+    test_cases_[i]->UnshuffleTests();
+    // Resets the index of each test case.
+    test_case_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Returns the current OS stack trace as a String.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+String GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
+                                       int skip_count) {
+  // We pass skip_count + 1 to skip this wrapper function in addition
+  // to what the user really wants to skip.
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+class ClassUniqueToAlwaysTrue {};
+}
+
+bool IsTrue(bool condition) { return condition; }
+
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  // This condition is always false so AlwaysTrue() never actually throws,
+  // but it makes the compiler think that it may throw.
+  if (IsTrue(false))
+    throw ClassUniqueToAlwaysTrue();
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char* prefix, const char** pstr) {
+  const size_t prefix_len = strlen(prefix);
+  if (strncmp(*pstr, prefix, prefix_len) == 0) {
+    *pstr += prefix_len;
+    return true;
+  }
+  return false;
+}
+
+// Parses a string as a command line flag.  The string should have
+// the format "--flag=value".  When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns the value of the flag, or NULL if the parsing failed.
+const char* ParseFlagValue(const char* str,
+                           const char* flag,
+                           bool def_optional) {
+  // str and flag must not be NULL.
+  if (str == NULL || flag == NULL) return NULL;
+
+  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
+  const String flag_str = String::Format("--%s%s", GTEST_FLAG_PREFIX_, flag);
+  const size_t flag_len = flag_str.length();
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL;
+
+  // Skips the flag name.
+  const char* flag_end = str + flag_len;
+
+  // When def_optional is true, it's OK to not have a "=value" part.
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }
+
+  // If def_optional is true and there are more characters after the
+  // flag name, or if def_optional is false, there must be a '=' after
+  // the flag name.
+  if (flag_end[0] != '=') return NULL;
+
+  // Returns the string after "=".
+  return flag_end + 1;
+}
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true as long as it does
+// not start with '0', 'f', or 'F'.
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Converts the string value to a bool.
+  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  return true;
+}
+
+// Parses a string for an Int32 flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(Message() << "The value of flag --" << flag,
+                    value_str, value);
+}
+
+// Parses a string for a string flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseStringFlag(const char* str, const char* flag, String* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == NULL) return false;
+
+  // Sets *value to the value of the flag.
+  *value = value_str;
+  return true;
+}
+
+// Determines whether a string has a prefix that Google Test uses for its
+// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
+// If Google Test detects that a command line flag has its prefix but is not
+// recognized, it will print its help message. Flags starting with
+// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
+// internal flags and do not trigger the help message.
+static bool HasGoogleTestFlagPrefix(const char* str) {
+  return (SkipPrefix("--", &str) ||
+          SkipPrefix("-", &str) ||
+          SkipPrefix("/", &str)) &&
+         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
+         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
+}
+
+// Prints a string containing code-encoded text.  The following escape
+// sequences can be used in the string to control the text color:
+//
+//   @@    prints a single '@' character.
+//   @R    changes the color to red.
+//   @G    changes the color to green.
+//   @Y    changes the color to yellow.
+//   @D    changes to the default terminal text color.
+//
+// TODO(wan@google.com): Write tests for this once we add stdout
+// capturing to Google Test.
+static void PrintColorEncoded(const char* str) {
+  GTestColor color = COLOR_DEFAULT;  // The current color.
+
+  // Conceptually, we split the string into segments divided by escape
+  // sequences.  Then we print one segment at a time.  At the end of
+  // each iteration, the str pointer advances to the beginning of the
+  // next segment.
+  for (;;) {
+    const char* p = strchr(str, '@');
+    if (p == NULL) {
+      ColoredPrintf(color, "%s", str);
+      return;
+    }
+
+    ColoredPrintf(color, "%s", String(str, p - str).c_str());
+
+    const char ch = p[1];
+    str = p + 2;
+    if (ch == '@') {
+      ColoredPrintf(color, "@");
+    } else if (ch == 'D') {
+      color = COLOR_DEFAULT;
+    } else if (ch == 'R') {
+      color = COLOR_RED;
+    } else if (ch == 'G') {
+      color = COLOR_GREEN;
+    } else if (ch == 'Y') {
+      color = COLOR_YELLOW;
+    } else {
+      --str;
+    }
+  }
+}
+
+static const char kColorEncodedHelpMessage[] =
+"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
+"following command line flags to control its behavior:\n"
+"\n"
+"Test Selection:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
+"      List the names of all tests instead of running them. The name of\n"
+"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
+    "[@G-@YNEGATIVE_PATTERNS]@D\n"
+"      Run only the tests whose name matches one of the positive patterns but\n"
+"      none of the negative patterns. '?' matches any single character; '*'\n"
+"      matches any substring; ':' separates two patterns.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
+"      Run all disabled tests too.\n"
+"\n"
+"Test Execution:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
+"      Run the tests repeatedly; use a negative count to repeat forever.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
+"      Randomize tests' orders on every iteration.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
+"      Random number seed to use for shuffling test orders (between 1 and\n"
+"      99999, or 0 to use a seed based on the current time).\n"
+"\n"
+"Test Output:\n"
+"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+"      Enable/disable colored output. The default is @Gauto@D.\n"
+"  -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
+"      Don't print the elapsed time of each test.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G"
+    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
+"      Generate an XML report in the given directory or with the given file\n"
+"      name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
+"      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+"\n"
+"Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+"      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
+"      Turn assertion failures into debugger break-points.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
+"      Turn assertion failures into C++ exceptions.\n"
+"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
+"      Do not report exceptions as test failures. Instead, allow them\n"
+"      to crash the program or throw a pop-up (on Windows).\n"
+"\n"
+"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+    "the corresponding\n"
+"environment variable of a flag (all letters in upper-case). For example, to\n"
+"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "color=no@D or set\n"
+"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
+"\n"
+"For more information, please read the " GTEST_NAME_ " documentation at\n"
+"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+"(not one in your own code or tests), please report it to\n"
+"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  The type parameter CharType can be
+// instantiated to either char or wchar_t.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+  for (int i = 1; i < *argc; i++) {
+    const String arg_string = StreamableToString(argv[i]);
+    const char* const arg = arg_string.c_str();
+
+    using internal::ParseBoolFlag;
+    using internal::ParseInt32Flag;
+    using internal::ParseStringFlag;
+
+    // Do we see a Google Test flag?
+    if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
+                      &GTEST_FLAG(also_run_disabled_tests)) ||
+        ParseBoolFlag(arg, kBreakOnFailureFlag,
+                      &GTEST_FLAG(break_on_failure)) ||
+        ParseBoolFlag(arg, kCatchExceptionsFlag,
+                      &GTEST_FLAG(catch_exceptions)) ||
+        ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+        ParseStringFlag(arg, kDeathTestStyleFlag,
+                        &GTEST_FLAG(death_test_style)) ||
+        ParseBoolFlag(arg, kDeathTestUseFork,
+                      &GTEST_FLAG(death_test_use_fork)) ||
+        ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+        ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                        &GTEST_FLAG(internal_run_death_test)) ||
+        ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+        ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+        ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+        ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+        ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+        ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+        ParseInt32Flag(arg, kStackTraceDepthFlag,
+                       &GTEST_FLAG(stack_trace_depth)) ||
+        ParseStringFlag(arg, kStreamResultToFlag,
+                        &GTEST_FLAG(stream_result_to)) ||
+        ParseBoolFlag(arg, kThrowOnFailureFlag,
+                      &GTEST_FLAG(throw_on_failure))
+        ) {
+      // Yes.  Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    } else if (arg_string == "--help" || arg_string == "-h" ||
+               arg_string == "-?" || arg_string == "/?" ||
+               HasGoogleTestFlagPrefix(arg)) {
+      // Both help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger help display.
+      g_help_flag = true;
+    }
+  }
+
+  if (g_help_flag) {
+    // We print the help here instead of in RUN_ALL_TESTS(), as the
+    // latter may not be called at all if the user is using Google
+    // Test with another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int* argc, CharType** argv) {
+  g_init_gtest_count++;
+
+  // We don't want to run the initialization code twice.
+  if (g_init_gtest_count != 1) return;
+
+  if (*argc <= 0) return;
+
+  internal::g_executable_path = internal::StreamableToString(argv[0]);
+
+#if GTEST_HAS_DEATH_TEST
+
+  g_argvs.clear();
+  for (int i = 0; i != *argc; i++) {
+    g_argvs.push_back(StreamableToString(argv[i]));
+  }
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+}  // namespace internal
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+void InitGoogleTest(int* argc, char** argv) {
+  internal::InitGoogleTestImpl(argc, argv);
+}
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+void InitGoogleTest(int* argc, wchar_t** argv) {
+  internal::InitGoogleTestImpl(argc, argv);
+}
+
+// Pin the vtables to this file.
+Environment::~Environment() {}
+TestPartResultReporterInterface::~TestPartResultReporterInterface() {}
+TestEventListener::~TestEventListener() {}
+void EmptyTestEventListener::anchor() {}
+namespace internal {
+OsStackTraceGetterInterface::~OsStackTraceGetterInterface() {}
+ParameterizedTestCaseInfoBase::~ParameterizedTestCaseInfoBase() {}
+}
+
+}  // namespace testing
diff --git a/utils/vim/llvm.vim b/utils/vim/llvm.vim
index 6c87cff..c2ec57c 100644
--- a/utils/vim/llvm.vim
+++ b/utils/vim/llvm.vim
@@ -22,17 +22,18 @@ syn match   llvmType /\<i\d\+\>/
 " Instructions.
 " The true and false tokens can be used for comparison opcodes, but it's
 " much more common for these tokens to be used for boolean constants.
-syn keyword llvmStatement add alloca and arcp ashr atomicrmw bitcast br call
-syn keyword llvmStatement cmpxchg eq exact extractelement extractvalue fadd fast
-syn keyword llvmStatement fcmp fdiv fence fmul fpext fptosi fptoui fptrunc free
-syn keyword llvmStatement frem fsub getelementptr icmp inbounds indirectbr
-syn keyword llvmStatement insertelement insertvalue inttoptr invoke landingpad
-syn keyword llvmStatement load lshr malloc max min mul nand ne ninf nnan nsw nsz
-syn keyword llvmStatement nuw oeq oge ogt ole olt one or ord phi ptrtoint resume
-syn keyword llvmStatement ret sdiv select sext sge sgt shl shufflevector sitofp
-syn keyword llvmStatement sle slt srem store sub switch trunc udiv ueq uge ugt
-syn keyword llvmStatement uitofp ule ult umax umin une uno unreachable unwind
-syn keyword llvmStatement urem va_arg xchg xor zext
+syn keyword llvmStatement add addrspacecast alloca and arcp ashr atomicrmw
+syn keyword llvmStatement bitcast br call cmpxchg eq exact extractelement
+syn keyword llvmStatement extractvalue fadd fast fcmp fdiv fence fmul fpext
+syn keyword llvmStatement fptosi fptoui fptrunc free frem fsub getelementptr
+syn keyword llvmStatement icmp inbounds indirectbr insertelement insertvalue
+syn keyword llvmStatement inttoptr invoke landingpad load lshr malloc max min
+syn keyword llvmStatement mul nand ne ninf nnan nsw nsz nuw oeq oge ogt ole
+syn keyword llvmStatement olt one or ord phi ptrtoint resume ret sdiv select
+syn keyword llvmStatement sext sge sgt shl shufflevector sitofp sle slt srem
+syn keyword llvmStatement store sub switch trunc udiv ueq uge ugt uitofp ule ult
+syn keyword llvmStatement umax umin une uno unreachable unwind urem va_arg
+syn keyword llvmStatement xchg xor zext
 
 " Keywords.
 syn keyword llvmKeyword acq_rel acquire sanitize_address addrspace alias align
@@ -43,12 +44,12 @@ syn keyword llvmKeyword constant datalayout declare default define deplibs
 syn keyword llvmKeyword dllexport dllimport except extern_weak external fastcc
 syn keyword llvmKeyword filter gc global hidden initialexec inlinehint inreg
 syn keyword llvmKeyword intel_ocl_bicc inteldialect internal linker_private
-syn keyword llvmKeyword linker_private_weak linker_private_weak_def_auto
-syn keyword llvmKeyword linkonce linkonce_odr linkonce_odr_auto_hide
+syn keyword llvmKeyword linker_private_weak
+syn keyword llvmKeyword linkonce linkonce_odr
 syn keyword llvmKeyword localdynamic localexec minsize module monotonic
 syn keyword llvmKeyword msp430_intrcc naked nest noalias nocapture
 syn keyword llvmKeyword noimplicitfloat noinline nonlazybind noredzone noreturn
-syn keyword llvmKeyword nounwind optsize personality private protected
+syn keyword llvmKeyword nounwind optnone optsize personality private protected
 syn keyword llvmKeyword ptx_device ptx_kernel readnone readonly release
 syn keyword llvmKeyword returns_twice sanitize_thread sanitize_memory
 syn keyword llvmKeyword section seq_cst sideeffect signext singlethread
diff --git a/utils/yaml-bench/YAMLBench.cpp b/utils/yaml-bench/YAMLBench.cpp
index eef4a72..f335605 100644
--- a/utils/yaml-bench/YAMLBench.cpp
+++ b/utils/yaml-bench/YAMLBench.cpp
@@ -63,6 +63,20 @@ static raw_ostream &operator <<(raw_ostream &os, const indent &in) {
   return os;
 }
 
+/// \brief Pretty print a tag by replacing tag:yaml.org,2002: with !!.
+static std::string prettyTag(yaml::Node *N) {
+  std::string Tag = N->getVerbatimTag();
+  if (StringRef(Tag).startswith("tag:yaml.org,2002:")) {
+    std::string Ret = "!!";
+    Ret += StringRef(Tag).substr(18);
+    return llvm_move(Ret);
+  }
+  std::string Ret = "!<";
+  Ret += Tag;
+  Ret += ">";
+  return Ret;
+}
+
 static void dumpNode( yaml::Node *n
                     , unsigned Indent = 0
                     , bool SuppressFirstIndent = false) {
@@ -76,9 +90,9 @@ static void dumpNode( yaml::Node *n
   if (yaml::ScalarNode *sn = dyn_cast<yaml::ScalarNode>(n)) {
     SmallString<32> Storage;
     StringRef Val = sn->getValue(Storage);
-    outs() << "!!str \"" << yaml::escape(Val) << "\"";
+    outs() << prettyTag(n) << " \"" << yaml::escape(Val) << "\"";
   } else if (yaml::SequenceNode *sn = dyn_cast<yaml::SequenceNode>(n)) {
-    outs() << "!!seq [\n";
+    outs() << prettyTag(n) << " [\n";
     ++Indent;
     for (yaml::SequenceNode::iterator i = sn->begin(), e = sn->end();
                                       i != e; ++i) {
@@ -88,7 +102,7 @@ static void dumpNode( yaml::Node *n
     --Indent;
     outs() << indent(Indent) << "]";
   } else if (yaml::MappingNode *mn = dyn_cast<yaml::MappingNode>(n)) {
-    outs() << "!!map {\n";
+    outs() << prettyTag(n) << " {\n";
     ++Indent;
     for (yaml::MappingNode::iterator i = mn->begin(), e = mn->end();
                                      i != e; ++i) {
@@ -104,7 +118,7 @@ static void dumpNode( yaml::Node *n
   } else if (yaml::AliasNode *an = dyn_cast<yaml::AliasNode>(n)){
     outs() << "*" << an->getName();
   } else if (dyn_cast<yaml::NullNode>(n)) {
-    outs() << "!!null null";
+    outs() << prettyTag(n) << " null";
   }
 }